From 16a63c5d49e12a3b9976e27fc30e353385b04d14 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 17 Nov 2018 08:27:27 +0100 Subject: [PATCH 001/494] first commit --- README.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 README.md diff --git a/README.md b/README.md new file mode 100644 index 000000000..1a34b0d0e --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +# tdl-examples From bd5b213921a0addd6d3c99505b36129ff8d367c5 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 24 Nov 2018 10:14:26 +0100 Subject: [PATCH 002/494] Initial commit --- CMakeLists.txt | 21 +++ conv.cpp | 399 +++++++++++++++++++++++++++++++++++++++++++++++++ gemm.cpp | 342 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 762 insertions(+) create mode 100644 CMakeLists.txt create mode 100644 conv.cpp create mode 100644 gemm.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000..3f1650aca --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,21 @@ +cmake_minimum_required(VERSION 2.8.7) +project(TDL) + +# LLVM +include(cmake/FindLLVM.cmake) + +# Link directories +link_directories(/home/philippe/Development/llvm-tlvm/build/lib) +# Include directories +include_directories(/home/philippe/Development/llvm-tlvm/include) +include_directories(/home/philippe/Development/llvm-tlvm/build/include) + +# Flags +# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Wextra -pedantic -Wno-strict-aliasing") + +# Executables +foreach(PROG gemm conv) + add_executable(${PROG} ${PROG}.cpp) + set_target_properties(${PROG} PROPERTIES OUTPUT_NAME ${PROG}) + target_link_libraries(${PROG} ${LLVM_LIBRARIES}) +endforeach() diff --git a/conv.cpp b/conv.cpp new file mode 100644 index 000000000..0132b3fe2 --- /dev/null +++ b/conv.cpp @@ -0,0 +1,399 @@ +#include + +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ExecutionEngine/ExecutionEngine.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Verifier.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Host.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/Cloning.h" + + + +void autotune(llvm::TargetMachine *machine, llvm::Module &module){ + // Target parameters + std::vector ranges = { + // asm + 2, 16, 1, 64, + // bsn + 2, 16, 1, 64, + // pa + 1, 2, 4, 8, + // pb + 1, 2, 4, + // sm + 2, 1, 16, 2, 2, 2 + }; + + // Function + llvm::Function *F = module.getFunction("kernel"); + + // Auto-tuning + llvm::legacy::PassManager pass; + llvm::TargetPassConfig *pass_config = static_cast(machine)->createPassConfig(pass); + llvm::FunctionPass *tuning_params = pass_config->createTargetTuningParameters(); + tuning_params->runOnFunction(*F); + + + // Gather all parameters + llvm::DenseSet unique; + llvm::SmallVector params; + for(llvm::BasicBlock &bb: *F) + for(llvm::Instruction &instr: bb){ + // Get tuning parameters for this particular instruction + std::vector tuning_params; + 
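[Annotation, not part of the patch: the gathering loop around this point deduplicates parameter slots by pointer identity, relying on the .second flag returned by DenseSet::insert, which is true only the first time a slot is seen. A minimal standalone sketch of the same pattern in plain C++, with a hypothetical Param struct standing in for llvm::TargetTuner::ParamType:

#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

struct Param { std::string name; unsigned *value; };   // hypothetical stand-in

int main() {
  unsigned slot_asm = 0, slot_bsn = 0;
  // Several instructions may report the same underlying slot; keep each once.
  std::vector<Param> reported = {{"asm", &slot_asm}, {"bsn", &slot_bsn}, {"asm", &slot_asm}};
  std::unordered_set<unsigned*> unique;
  std::vector<unsigned*> params;
  for (const Param &p : reported)
    if (unique.insert(p.value).second) {                // true only on first insertion
      std::cout << p.name << std::endl;
      params.push_back(p.value);
    }
  std::cout << params.size() << " unique parameters" << std::endl;  // prints 2
  return 0;
}
]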
machine->getTargetTuner().getParams(&instr, tuning_params); + for(llvm::TargetTuner::ParamType ¶m: tuning_params){ + // This parameter has not been seen before + if(unique.insert(param.Value).second){ + std::cout << instr.getName().data() << " " << param.Name << std::endl; + params.push_back(param.Value); + } + } + } + + // Gather all constraints + std::vector> constraints; + for(llvm::BasicBlock &bb: *F) + for(llvm::Instruction &instr: bb) + machine->getTargetTuner().getConstraints(&instr, constraints); + + // Assign parameters + std::cout << params.size() << " " << ranges.size() << std::endl; + for(unsigned i = 0; i < params.size(); i++) + *params[i] = ranges[i]; + + // Verify constraints + bool valid = true; + for(auto &constraint: constraints){ + valid = valid & constraint(); + } + + if(!valid){ + printf("Invalid kernel parameters\n"); + exit(EXIT_FAILURE); + } +} + +int main(){ +// llvm::DebugFlag = true; + + std::string error; + + llvm::InitializeAllTargetInfos(); + llvm::InitializeAllTargets(); + llvm::InitializeAllTargetMCs(); + llvm::InitializeAllAsmParsers(); + llvm::InitializeAllAsmPrinters(); + + // Module + llvm::LLVMContext context; + std::unique_ptr module = llvm::make_unique("TLVM toy example", context); + llvm::IRBuilder<> builder(context); + + unsigned RR = 3, SS = 3; + unsigned Nfilt = RR * SS; + unsigned block = 8; + unsigned nlut = (block + Nfilt - 1)/Nfilt * Nfilt; + + // Globals + llvm::Type* bool_t = llvm::Type::getInt1Ty(context); + llvm::Type* mask_tile_t = llvm::TileType::get(bool_t, 2); + llvm::Type* numeric_t = llvm::Type::getFloatTy(context); + llvm::PointerType* numeric_ptr_t = llvm::PointerType::get(numeric_t, 0); + llvm::IntegerType* int32_t = llvm::Type::getInt32Ty(context); + llvm::PointerType* lut_ptr_t = llvm::PointerType::get(int32_t, 4); + llvm::IntegerType* int1_t = llvm::Type::getInt1Ty(context); + + llvm::Type* tile_t = llvm::TileType::get(numeric_t, 2); + llvm::Type* int32_slice_t = llvm::TileType::get(int32_t, 1); + llvm::Type* int32_tile_t = llvm::TileType::get(int32_t, 2); + llvm::Type* int1_slice_t = llvm::TileType::get(int1_t, 1); + llvm::Type* int1_tile_t = llvm::TileType::get(int1_t, 2); + + llvm::PointerType* tile_ptr_t = llvm::PointerType::get(tile_t, 0); + llvm::Function* read_slice_x = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_read_slice_x, {int32_slice_t}); + llvm::Function* read_slice_y = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_read_slice_y, {int32_slice_t}); + llvm::Function* range = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_range, {int32_slice_t}); + llvm::Function* gtp_1d = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_gtp_1d, {int32_slice_t->getPointerTo(4), int32_t->getPointerTo(4), int32_slice_t}); + llvm::Function* stp_1d = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_stp_1d, {int32_slice_t->getPointerTo(4), int32_slice_t}); + + llvm::Function* gtp_2d = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_gtp_2d, {tile_ptr_t, numeric_ptr_t, int32_tile_t}); + llvm::Function* stp_2d = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_stp_2d, {tile_ptr_t, int32_tile_t}); + llvm::Intrinsic::ID mma_id = llvm::Intrinsic::tlvm_mma_nt; + llvm::Function* outer_add = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_outer_add, {int32_tile_t, int32_slice_t, int32_slice_t}); + llvm::Function* outer_and = llvm::Intrinsic::getDeclaration(module.get(), 
llvm::Intrinsic::tlvm_outer_and, {int1_tile_t, int1_slice_t, int1_slice_t}); + llvm::Function* mma = llvm::Intrinsic::getDeclaration(module.get(), mma_id, {tile_t}); + llvm::Function* reshape = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_reshape_2d, {tile_t}); + llvm::Function* splat_2d = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_splat_2d, {mask_tile_t, tile_t, bool_t}); + llvm::Function* splat_1d = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_splat_1d, {int32_slice_t, int32_slice_t, int32_t}); + llvm::Function* masked_load = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_masked_load, {tile_t, tile_ptr_t, mask_tile_t}); + llvm::Function* masked_store = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_masked_store, {tile_t, tile_ptr_t, mask_tile_t}); + + // Hyperparameters + llvm::Hyperparameter *bm = llvm::Hyperparameter::get(int32_t, 0); + llvm::Hyperparameter *bn = llvm::Hyperparameter::get(int32_t, 1); + llvm::Hyperparameter *bk = llvm::Hyperparameter::get(int32_t, 2); + + // Constants + llvm::Constant *_s0 = llvm::ConstantInt::get(int32_t, 0); + llvm::Constant *_f0 = llvm::ConstantFP::get(numeric_t, 0); + llvm::Constant *_0 = llvm::ConstantTile::get(_f0, {bm, bn}); + + // LUT + llvm::GlobalVariable *lut_array = + new llvm::GlobalVariable(*module, llvm::ArrayType::get(int32_t, nlut), false, llvm::GlobalVariable::InternalLinkage, + nullptr, "lut_array", nullptr, llvm::GlobalVariable::NotThreadLocal, 4); + llvm::Value *lut_ptr = builder.CreateBitCast(lut_array, lut_ptr_t); + + + // Function + llvm::FunctionType* prototype = llvm::FunctionType::get(llvm::Type::getVoidTy(context), std::vector{numeric_ptr_t, numeric_ptr_t, numeric_ptr_t, int32_t, int32_t, int32_t, int32_t, int32_t}, false); + llvm::Function* F = llvm::Function::Create(prototype, llvm::Function::ExternalLinkage, "kernel", module.get()); + std::vector args; + F->addAttribute(1, llvm::Attribute::ReadOnly); + F->addAttribute(1, llvm::Attribute::NoAlias); + F->addAttribute(2, llvm::Attribute::ReadOnly); + F->addAttribute(2, llvm::Attribute::NoAlias); + std::transform(F->arg_begin(), F->arg_end(), std::back_inserter(args), [&](llvm::Argument& x){ return &x;}); + llvm::Value *base_o_ptr = args[0], *base_i_ptr = args[1], *base_f_ptr = args[2]; + llvm::Value *C = args[3], *H = args[4], *W = args[5], *N = args[6], *K = args[7]; + llvm::Value *R = builder.getInt32(RR), *S = builder.getInt32(SS); + + // All basic blocks + llvm::BasicBlock* PrologBB = llvm::BasicBlock::Create(context, "prologue", F); + llvm::BasicBlock* LoopBB = llvm::BasicBlock::Create(context, "loop", F); + llvm::BasicBlock* EarlyExitBB = llvm::BasicBlock::Create(context, "early_exit", F); + llvm::BasicBlock* LastIterBB = llvm::BasicBlock::Create(context, "last_iter", F); + llvm::BasicBlock* EpilogueBB = llvm::BasicBlock::Create(context, "epilogue", F); + + + // First basic block + builder.SetInsertPoint(PrologBB); + llvm::Value* sa0 = builder.CreateCall(read_slice_x, {bm}, "i_slice_pqn"); + llvm::Value* sb0 = builder.CreateCall(read_slice_y, {bn}, "f_slice_k"); + llvm::Value* sa1 = builder.CreateCall(range, {builder.getInt32(0), bk}, "i_slice_crs"); + llvm::Value* sb1 = builder.CreateCall(range, {builder.getInt32(0), bk}, "f_slice_crs"); + + llvm::Value* lda_w = builder.getInt32(1); + llvm::Value* lda_h = builder.CreateMul(lda_w, W); + llvm::Value* lda_c = builder.CreateMul(lda_h, H); + llvm::Value* lda_n = builder.CreateMul(lda_c, C); + + llvm::Value* 
ldb_s = builder.getInt32(1); + llvm::Value* ldb_r = builder.CreateMul(ldb_s, S); + llvm::Value* ldb_c = builder.CreateMul(ldb_r, R); + llvm::Value* ldb_k = builder.CreateMul(ldb_c, C); + + llvm::Value* CRS = builder.CreateMul(C, builder.CreateMul(R, S)); + llvm::Value* PQN = builder.CreateMul(H, builder.CreateMul(W, N)); + + // Images HWN offset + llvm::Value* sa_hw = builder.CreateUDiv(sa0, builder.CreateCall(splat_1d, {sa0, N})); + llvm::Value* sa_n = builder.CreateURem(sa0, builder.CreateCall(splat_1d, {sa0, N})); + llvm::Value* sa_h = builder.CreateUDiv(sa_hw, builder.CreateCall(splat_1d, {sa0, W})); + llvm::Value* sa_w = builder.CreateURem(sa_hw, builder.CreateCall(splat_1d, {sa0, W})); + llvm::Value* offa_0 = builder.CreateMul(sa_n, builder.CreateCall(splat_1d, {sa0, lda_n})); + offa_0 = builder.CreateAdd(offa_0, builder.CreateMul(sa_h, builder.CreateCall(splat_1d, {sa0, lda_h}))); + offa_0 = builder.CreateAdd(offa_0, builder.CreateMul(sa_w, builder.CreateCall(splat_1d, {sa0, lda_w}))); + // Images CRS offset + llvm::Value* sa_cr = builder.CreateUDiv(sa1, builder.CreateCall(splat_1d, {sa1, S})); + llvm::Value* sa_s = builder.CreateURem(sa1, builder.CreateCall(splat_1d, {sa1, S})); + llvm::Value* sa_c = builder.CreateUDiv(sa_cr, builder.CreateCall(splat_1d, {sa1, R})); + llvm::Value* sa_r = builder.CreateURem(sa_cr, builder.CreateCall(splat_1d, {sa1, R})); + llvm::Value* offa_1 = builder.CreateMul(sa_c, builder.CreateCall(splat_1d, {sa1, lda_c})); + offa_1 = builder.CreateAdd(offa_1, builder.CreateMul(sa_r, builder.CreateCall(splat_1d, {sa1, lda_h}))); + offa_1 = builder.CreateAdd(offa_1, builder.CreateMul(sa_s, builder.CreateCall(splat_1d, {sa1, lda_w}))); + // Images pointer + llvm::Value* off_a = builder.CreateCall(outer_add, {offa_0, offa_1}); + llvm::Value* start_pa = builder.CreateCall(gtp_2d, {base_i_ptr, off_a}, "start_i_ptr"); + llvm::LoadInst* start_aa = builder.CreateLoad(start_pa, false, "start_i_val"); + llvm::Value* start_a = builder.CreateCall(reshape, {start_aa, bm, bk}, "start_i"); + // Filters pointer + llvm::Value* tldb_s = builder.CreateCall(splat_1d, {sb1, K}); + llvm::Value* off_b = builder.CreateCall(outer_add, {sb0, builder.CreateMul(sb1, tldb_s)}, "off_f"); + llvm::Value* start_pb = builder.CreateCall(gtp_2d, {base_f_ptr, off_b}, "start_f_ptr"); + llvm::Value* start_bb = builder.CreateLoad(start_pb, false, "start_f_val"); + llvm::Value* start_b = builder.CreateCall(reshape, {start_bb, bn, bk}, "start_f"); + // Filters increment + llvm::Value* inc_b_0 = builder.CreateCall(splat_1d, {sb0, _s0}, "inc_f_0"); + llvm::Value* inc_b_1 = builder.CreateCall(splat_1d, {sb1, builder.CreateMul(bk, ldb_k)}, "inc_f_1"); + llvm::Value* inc_b = builder.CreateCall(outer_add, {inc_b_0, inc_b_1}, "inc_f"); + // Delta pointers + llvm::Value* base_incdelta = lut_ptr; + llvm::Value* start_pincdelta = builder.CreateCall(gtp_1d, {base_incdelta, sa1}, "start_pincdelta"); + llvm::Value* base_delta = builder.CreateGEP(lut_ptr, builder.getInt32(nlut)); + llvm::Value* start_pdelta = builder.CreateCall(gtp_1d, {base_delta, builder.CreateCall(splat_1d, {sa1, _s0})}, "start_pdelta"); + // Masks + llvm::Value* _1 = builder.CreateCall(splat_1d, {sb1, builder.getInt32(1)}); + llvm::Value* mask_a_1 = builder.CreateShl(_1, sb1); + llvm::Value* base_incmask = builder.CreateGEP(lut_ptr, builder.getInt32(2*nlut)); + llvm::Value* start_pincmask = builder.CreateCall(gtp_1d, {base_incmask, sb1}); + llvm::Value* base_mask = builder.CreateGEP(lut_ptr, builder.getInt32(3*nlut)); + llvm::Value* start_pmask 
= builder.CreateCall(gtp_1d, {base_mask, sb1}); + // Enter loop + builder.CreateBr(LoopBB); + builder.SetInsertPoint(LoopBB); + // PHI nodes + llvm::PHINode* c = builder.CreatePHI(_0->getType(), 3, "c"); + llvm::PHINode* crs = builder.CreatePHI(int32_t, 3, "crs"); + llvm::PHINode* pa = builder.CreatePHI(start_pa->getType(), 3, "pa"); + llvm::PHINode* pb = builder.CreatePHI(start_pb->getType(), 3, "pb"); + llvm::PHINode *a = builder.CreatePHI(start_a->getType(), 3, "a"); + llvm::PHINode *b = builder.CreatePHI(start_b->getType(), 3, "b"); + llvm::PHINode *pdelta = builder.CreatePHI(start_pdelta->getType(), 3); + llvm::PHINode *pincdelta = builder.CreatePHI(start_pincdelta->getType(), 3); + llvm::PHINode *pmasks = builder.CreatePHI(start_pmask->getType(), 3); + llvm::PHINode *pincmasks = builder.CreatePHI(start_pincmask->getType(), 3); + llvm::Value* next_c = builder.CreateCall(mma, {a, b, c}, "next_c"); + c->addIncoming(_0, PrologBB); + c->addIncoming(next_c, LoopBB); + // Induction variable + llvm::Value *next_crs = builder.CreateSub(crs, bk); + crs->addIncoming(CRS, PrologBB); + crs->addIncoming(next_crs, LoopBB); + // Update pointer + llvm::Value *inc_delta = builder.CreateLoad(pincdelta); + llvm::Value *inc_mask = builder.CreateLoad(pincmasks); + llvm::Value *inc_a_1 = builder.CreateLoad(pdelta); + llvm::Value *inc_a_0 = builder.CreateCall(splat_1d, {sa0, builder.getInt32(0)}); + llvm::Value *inc_a = builder.CreateCall(outer_add, {inc_a_0, inc_a_1}); + llvm::Value *next_pa = builder.CreateCall(stp_2d, {pa, inc_a}, "next_i_ptr"); + llvm::Value *next_pb = builder.CreateCall(stp_2d, {pb, inc_b}, "next_f_ptr"); + llvm::Value *next_pdelta = builder.CreateCall(stp_1d, {pdelta, inc_delta}); + llvm::Value *next_pincdelta = builder.CreateCall(stp_1d, {pincdelta, inc_delta}); + llvm::Value *next_pmask = builder.CreateCall(stp_1d, {pmasks, inc_mask}); + llvm::Value *next_pincmask = builder.CreateCall(stp_1d, {pincmasks, inc_mask}); + pdelta->addIncoming(start_pdelta, PrologBB); + pdelta->addIncoming(next_pdelta, LoopBB); + pincdelta->addIncoming(start_pincdelta, PrologBB); + pincdelta->addIncoming(next_pincdelta, LoopBB); + pmasks->addIncoming(start_pmask, PrologBB); + pmasks->addIncoming(next_pmask, LoopBB); + pincmasks->addIncoming(start_pincmask, PrologBB); + pincmasks->addIncoming(next_pincmask, LoopBB); + pa->addIncoming(start_pa, PrologBB); + pa->addIncoming(next_pa, LoopBB); + pb->addIncoming(start_pb, PrologBB); + pb->addIncoming(next_pb, LoopBB); + // End condition + llvm::Value* no_bounds_check = builder.CreateICmpSGT(next_crs, builder.getInt32(0)); + // Masks + llvm::Value* mask_a_0 = builder.CreateLoad(pdelta); + llvm::Value* mask_a = builder.CreateCall(outer_and, {mask_a_0, mask_a_1}); + llvm::Value* mask_b = builder.CreateCall(splat_2d, {start_bb, no_bounds_check}, "mask_b"); + // Pre-fetch + llvm::Value* next_aa = builder.CreateCall(masked_load, {next_pa, mask_a}, "next_aa"); + llvm::Value* next_bb = builder.CreateCall(masked_load, {next_pb, mask_b}, "next_bb"); + llvm::Value* next_a = builder.CreateCall(reshape, {next_aa, bm, bk}, "next_a"); + llvm::Value* next_b = builder.CreateCall(reshape, {next_bb, bn, bk}, "next_b"); + a->addIncoming(start_a, PrologBB); + a->addIncoming(next_a, LoopBB); + b->addIncoming(start_b, PrologBB); + b->addIncoming(next_b, LoopBB); + // End condition + builder.CreateCondBr(no_bounds_check, LoopBB, EarlyExitBB); + // Early exit + builder.SetInsertPoint(EarlyExitBB); + llvm::Value* exit = builder.CreateICmpSLE(next_crs, _s0); + builder.CreateCondBr(exit, 
EpilogueBB, LastIterBB); + // Last Iteration + builder.SetInsertPoint(LastIterBB); + llvm::Value* in_bounds_b0 = builder.CreateICmpSLT(sb0, builder.CreateCall(splat_1d, {sb0, K})); + llvm::Value* in_bounds_b1 = builder.CreateICmpSLT(sb1, builder.CreateCall(splat_1d, {sb1, bk})); + llvm::Value* last_maskb = builder.CreateCall(outer_and, {in_bounds_b0, in_bounds_b1}, "last_maskb"); + llvm::Value* last_bb = builder.CreateCall(masked_load, {next_pb, last_maskb}, "last_bb"); + llvm::Value* last_b = builder.CreateCall(reshape, {last_bb, bn, bk}, "last_b"); + llvm::Value* loop = builder.CreateICmpSGT(next_crs, _s0); + a->addIncoming(next_a, LastIterBB); + b->addIncoming(last_b, LastIterBB); + c->addIncoming(next_c, LastIterBB); + crs->addIncoming(next_crs, LastIterBB); + pa->addIncoming(next_pa, LastIterBB); + pb->addIncoming(next_pb, LastIterBB); + pdelta->addIncoming(next_pdelta, LastIterBB); + pincdelta->addIncoming(next_pincdelta, LastIterBB); + pmasks->addIncoming(next_pmask, LastIterBB); + pincmasks->addIncoming(next_pincmask, LastIterBB); + builder.CreateCondBr(loop, LoopBB, EpilogueBB); + + // Epilogue + builder.SetInsertPoint(EpilogueBB); + llvm::Value* sc_pqn = builder.CreateCall(read_slice_x, {bm}, "o_slice_pqn"); + llvm::Value* sc_k = builder.CreateCall(read_slice_y, {bn}, "o_slice_k"); + // Output strides + llvm::Value* ldc_q = builder.getInt32(1); + llvm::Value* ldc_p = builder.CreateMul(lda_w, W); + llvm::Value* ldc_k = builder.CreateMul(lda_h, H); + llvm::Value* ldb_n = builder.CreateMul(lda_c, K); + // Output PQN offset + llvm::Value* sc_pq = builder.CreateUDiv(sc_pqn, builder.CreateCall(splat_1d, {sc_pqn, N})); + llvm::Value* sc_n = builder.CreateURem(sc_pqn, builder.CreateCall(splat_1d, {sc_pqn, N})); + llvm::Value* sc_p = builder.CreateUDiv(sc_pq, builder.CreateCall(splat_1d, {sc_pqn, W})); + llvm::Value* sc_q = builder.CreateURem(sc_pq, builder.CreateCall(splat_1d, {sc_pqn, W})); + llvm::Value* offc0 = builder.CreateMul(sc_n, builder.CreateCall(splat_1d, {sc_pqn, ldb_n})); + offc0 = builder.CreateAdd(offc0, builder.CreateMul(sc_p, builder.CreateCall(splat_1d, {sc_pqn, ldc_p}))); + offc0 = builder.CreateAdd(offc0, builder.CreateMul(sc_q, builder.CreateCall(splat_1d, {sc_pqn, ldc_q}))); + // Output K offset + llvm::Value* offc1 = builder.CreateMul(sc_k, builder.CreateCall(splat_1d, {sc_k, ldc_k})); + // Output pointer + llvm::Value* offc = builder.CreateCall(outer_add, {offc0, offc1}); + llvm::Value* pc = builder.CreateCall(gtp_2d, {base_o_ptr, offc}); + // Output masks + llvm::Value* in_bounds_c0 = builder.CreateICmpSLT(sc_pqn, builder.CreateCall(splat_1d, {sc_pqn, PQN})); + llvm::Value* in_bounds_c1 = builder.CreateICmpSLT(sc_k, builder.CreateCall(splat_1d, {sc_k, K})); + llvm::Value* maskc = builder.CreateCall(outer_and, {in_bounds_c0, in_bounds_c1}); + builder.CreateCall(masked_store, {next_c, pc, maskc}); + builder.CreateRet(NULL); + + + // Set metadata + llvm::Metadata *md_args[] = { + llvm::ValueAsMetadata::get(F), + llvm::MDString::get(context, "kernel"), + llvm::ValueAsMetadata::get(llvm::ConstantInt::get(llvm::Type::getInt32Ty(context), 1)) + }; + module->getOrInsertNamedMetadata("nvvm.annotations")->addOperand(llvm::MDNode::get(context, md_args)); + + // Machine + module->setTargetTriple("nvptx64-nvidia-cuda"); + auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error); + + llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), "sm_52", "", + llvm::TargetOptions(), llvm::Reloc::Model(), + 
llvm::CodeModel::Model(), llvm::CodeGenOpt::Aggressive); + module->setDataLayout(machine->createDataLayout()); + + // Auto-tuning + autotune(machine, *module); + + // Emit + llvm::legacy::PassManager pass; + llvm::SmallVector buffer; + llvm::raw_svector_ostream stream(buffer); + machine->addPassesToEmitFile(pass, stream, nullptr, llvm::TargetMachine::CGFT_AssemblyFile); + pass.run(*module); + std::string src(buffer.begin(), buffer.end()); + + // Execute + std::cout << src << std::endl; +} diff --git a/gemm.cpp b/gemm.cpp new file mode 100644 index 000000000..6a63327b9 --- /dev/null +++ b/gemm.cpp @@ -0,0 +1,342 @@ +#include + +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ExecutionEngine/ExecutionEngine.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Verifier.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Host.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/Cloning.h" + + +bool AT = false; +bool BT = true; + + +void autotune(llvm::TargetMachine *machine, llvm::Module &module){ + // Target parameters + std::vector ranges = { + // asm + 2, 16, 1, 64, + // bsn + 2, 16, 1, 64, + // pa + 1, 2, 4, 8, + // pb + 1, 2, 4, + // sm + 2, 1, 16, 2, 2, 2 + }; + + // Function + llvm::Function *F = module.getFunction("kernel"); + + // Auto-tuning + llvm::legacy::PassManager pass; + llvm::TargetPassConfig *pass_config = static_cast(machine)->createPassConfig(pass); + llvm::FunctionPass *tuning_params = pass_config->createTargetTuningParameters(); + tuning_params->runOnFunction(*F); + + + // Gather all parameters + llvm::DenseSet unique; + llvm::SmallVector params; + for(llvm::BasicBlock &bb: *F) + for(llvm::Instruction &instr: bb){ + // Get tuning parameters for this particular instruction + std::vector tuning_params; + machine->getTargetTuner().getParams(&instr, tuning_params); + for(llvm::TargetTuner::ParamType ¶m: tuning_params){ + // This parameter has not been seen before + if(unique.insert(param.Value).second){ + std::cout << instr.getName().data() << " " << param.Name << std::endl; + params.push_back(param.Value); + } + } + } + + // Gather all constraints + std::vector> constraints; + for(llvm::BasicBlock &bb: *F) + for(llvm::Instruction &instr: bb) + machine->getTargetTuner().getConstraints(&instr, constraints); + + // Assign parameters + std::cout << params.size() << " " << ranges.size() << std::endl; + for(unsigned i = 0; i < params.size(); i++) + *params[i] = ranges[i]; + + // Verify constraints + bool valid = true; + for(auto &constraint: constraints){ + valid = valid & constraint(); + } + + if(!valid){ + printf("Invalid kernel parameters\n"); + exit(EXIT_FAILURE); + } +} + +int main(){ +// llvm::DebugFlag = true; + + std::string error; + + llvm::InitializeAllTargetInfos(); + llvm::InitializeAllTargets(); + llvm::InitializeAllTargetMCs(); + llvm::InitializeAllAsmParsers(); + llvm::InitializeAllAsmPrinters(); + + // Module + 
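[Annotation, not part of the patch: in autotune() above, constraints arrive as bool-returning closures over the parameter slots, so they can be re-checked after every assignment; the non-short-circuiting & means every constraint is evaluated. A standalone sketch with hypothetical tile parameters and constraints:

#include <functional>
#include <iostream>
#include <vector>

int main() {
  int bm = 16, bn = 16, bk = 64;                      // hypothetical tile sizes
  std::vector<std::function<bool()>> constraints = {
    [&]{ return bm * bn <= 4096; },                   // e.g. an accumulator-tile budget
    [&]{ return bk % 2 == 0; },                       // e.g. an alignment requirement
  };
  bool valid = true;
  for (auto &constraint : constraints)
    valid = valid & constraint();                     // evaluate all, as in the patch
  std::cout << (valid ? "ok" : "Invalid kernel parameters") << std::endl;
  return 0;
}
]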
llvm::LLVMContext context; + std::unique_ptr module = llvm::make_unique("TLVM toy example", context); + llvm::IRBuilder<> builder(context); + + // Globals + llvm::Type* bool_t = llvm::Type::getInt1Ty(context); + llvm::Type* mask_tile_t = llvm::TileType::get(bool_t, 2); + llvm::Type* numeric_t = llvm::Type::getFloatTy(context); + llvm::PointerType* numeric_ptr_t = llvm::PointerType::get(numeric_t, 0); + llvm::IntegerType* int32_t = llvm::Type::getInt32Ty(context); + llvm::IntegerType* int1_t = llvm::Type::getInt1Ty(context); + + llvm::Type* tile_t = llvm::TileType::get(numeric_t, 2); + llvm::Type* int32_slice_t = llvm::TileType::get(int32_t, 1); + llvm::Type* int32_tile_t = llvm::TileType::get(int32_t, 2); + llvm::Type* int1_slice_t = llvm::TileType::get(int1_t, 1); + llvm::Type* int1_tile_t = llvm::TileType::get(int1_t, 2); + + llvm::PointerType* tile_ptr_t = llvm::PointerType::get(tile_t, 0); + llvm::Function* read_slice_x = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_read_slice_x, {int32_slice_t}); + llvm::Function* read_slice_y = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_read_slice_y, {int32_slice_t}); + llvm::Function* range = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_range, {int32_slice_t}); + llvm::Function* gtp = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_gtp_2d, {tile_ptr_t, numeric_ptr_t, int32_tile_t}); + llvm::Function* stp = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_stp_2d, {tile_ptr_t, int32_tile_t}); + llvm::Intrinsic::ID mma_id; + if(!AT && !BT) mma_id = llvm::Intrinsic::tlvm_mma_nn; + if(!AT && BT) mma_id = llvm::Intrinsic::tlvm_mma_nt; + if(AT && !BT) mma_id = llvm::Intrinsic::tlvm_mma_tn; + if(AT && BT) mma_id = llvm::Intrinsic::tlvm_mma_tt; + llvm::Function* broadcast_int32 = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_broadcast_1d, {int32_tile_t, int32_slice_t}); + llvm::Function* broadcast_int1 = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_broadcast_1d, {int1_tile_t, int1_slice_t}); + llvm::Function* outer_add = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_outer_add, {int32_tile_t, int32_slice_t, int32_slice_t}); + llvm::Function* outer_and = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_outer_and, {int1_tile_t, int1_slice_t, int1_slice_t}); + llvm::Function* mma = llvm::Intrinsic::getDeclaration(module.get(), mma_id, {tile_t}); + llvm::Function* reshape = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_reshape_2d, {tile_t}); + llvm::Function* splat_2d = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_splat_2d, {mask_tile_t, tile_t, bool_t}); + llvm::Function* splat_1d = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_splat_1d, {int32_slice_t, int32_slice_t, int32_t}); + llvm::Function* masked_load = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_masked_load, {tile_t, tile_ptr_t, mask_tile_t}); + llvm::Function* masked_store = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_masked_store, {tile_t, tile_ptr_t, mask_tile_t}); + + // Hyperparameters + llvm::Hyperparameter *bm = llvm::Hyperparameter::get(int32_t, 0); + llvm::Hyperparameter *bn = llvm::Hyperparameter::get(int32_t, 1); + llvm::Hyperparameter *bk = llvm::Hyperparameter::get(int32_t, 2); + + // Constants + llvm::Constant *_s0 = llvm::ConstantInt::get(int32_t, 0); + llvm::Constant *_f0 = 
llvm::ConstantFP::get(numeric_t, 0); + llvm::Constant *_0 = llvm::ConstantTile::get(_f0, {bm, bn}); + + // Function + llvm::FunctionType* prototype = llvm::FunctionType::get(llvm::Type::getVoidTy(context), std::vector{numeric_ptr_t, numeric_ptr_t, numeric_ptr_t, int32_t, int32_t, int32_t, int32_t}, false); + llvm::Function* F = llvm::Function::Create(prototype, llvm::Function::ExternalLinkage, "kernel", module.get()); + std::vector arguments; + F->addAttribute(1, llvm::Attribute::ReadOnly); + F->addAttribute(1, llvm::Attribute::NoAlias); + F->addAttribute(2, llvm::Attribute::ReadOnly); + F->addAttribute(2, llvm::Attribute::NoAlias); + std::transform(F->arg_begin(), F->arg_end(), std::back_inserter(arguments), [&](llvm::Argument& x){ return &x;}); + arguments[0]->setName("pa"); + arguments[1]->setName("pb"); + arguments[2]->setName("pc"); + arguments[3]->setName("M"); + arguments[4]->setName("N"); + arguments[5]->setName("K"); + arguments[6]->setName("bound"); + + // All basic blocks + llvm::BasicBlock* PrologBB = llvm::BasicBlock::Create(context, "prologue", F); + llvm::BasicBlock* LoopBB = llvm::BasicBlock::Create(context, "loop", F); + llvm::BasicBlock* EarlyExitBB = llvm::BasicBlock::Create(context, "early_exit", F); + llvm::BasicBlock* LastIterBB = llvm::BasicBlock::Create(context, "last_iter", F); + llvm::BasicBlock* EpilogueBB = llvm::BasicBlock::Create(context, "epilogue", F); + + + // First basic block + builder.SetInsertPoint(PrologBB); + + llvm::CallInst* aasm = builder.CreateCall(read_slice_x, {bm}, "asm"); + llvm::CallInst* bbsn = builder.CreateCall(read_slice_y, {bn}, "bsn"); + llvm::CallInst* ask = builder.CreateCall(range, {builder.getInt32(0), bk}, "ask"); + llvm::CallInst* bsk = builder.CreateCall(range, {builder.getInt32(0), bk}, "bsk"); + + llvm::Value *M = arguments[3], *N = arguments[4], *K = arguments[5]; + llvm::Value *bound = arguments[6]; + llvm::Value *AS0 = M, *AS1 = K; + llvm::Value *sa0 = aasm, *sa1 = ask; + llvm::Value *ba0 = bm, *ba1 = bk; + llvm::Value *inca0 = _s0, *inca1 = bk; + if(AT){ + std::swap(AS0, AS1); + std::swap(sa0, sa1); + std::swap(ba0, ba1); + std::swap(inca0, inca1); + } + llvm::Value *BS0 = K, *BS1 = N; + llvm::Value *sb0 = bsk, *sb1 = bbsn; + llvm::Value *bb0 = bk, *bb1 = bn; + llvm::Value *incb0 = bk, *incb1 = _s0; + if(BT){ + std::swap(BS0, BS1); + std::swap(sb0, sb1); + std::swap(bb0, bb1); + std::swap(incb0, incb1); + } + + llvm::CallInst* tlda = builder.CreateCall(splat_1d, {sa1, AS0}, "lda"); + llvm::CallInst* tldb = builder.CreateCall(splat_1d, {sb1, BS1}, "ldb"); + llvm::CallInst* offa = builder.CreateCall(outer_add, {sa0, builder.CreateMul(sa1, tlda)}, "offa"); + llvm::CallInst* offb = builder.CreateCall(outer_add, {sb0, builder.CreateMul(sb1, tldb)}, "offb"); + llvm::CallInst* startpa = builder.CreateCall(gtp, {arguments[0], offa}, "startpa"); + llvm::CallInst* startpb = builder.CreateCall(gtp, {arguments[1], offb}, "startpb"); + llvm::LoadInst* startfa = builder.CreateLoad(startpa, "startfa"); + llvm::LoadInst* startfb = builder.CreateLoad(startpb, "startfb"); + llvm::Value* starta = builder.CreateCall(reshape, {startfa, ba0, ba1}, "starta"); + llvm::Value* startb = builder.CreateCall(reshape, {startfb, bb0, bb1}, "startb"); + llvm::Value* tinca0 = builder.CreateCall(splat_1d, {sa0, builder.CreateMul(inca0, AS0)}); + llvm::Value* tinca1 = builder.CreateCall(splat_1d, {sa1, builder.CreateMul(inca1, AS1)}); + llvm::Value* tincb0 = builder.CreateCall(splat_1d, {sb0, builder.CreateMul(incb0, BS0)}); + llvm::Value* tincb1 = 
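[Annotation, not part of the patch: the std::swap block above is how a single code path serves all four layouts NN/NT/TN/TT: shapes, slices and increments are exchanged once up front instead of branching inside the loop. A scalar model with hypothetical dimensions:

#include <algorithm>
#include <cstdio>

int main() {
  const bool AT = false, BT = true;                   // same defaults as the file
  int AS0 = 1024, AS1 = 256;                          // A is M x K, hypothetical
  int inca0 = 0, inca1 = 16;                          // step bk = 16 along K only
  if (AT) { std::swap(AS0, AS1); std::swap(inca0, inca1); }
  int BS0 = 256, BS1 = 512;                           // B is K x N, hypothetical
  int incb0 = 16, incb1 = 0;
  if (BT) { std::swap(BS0, BS1); std::swap(incb0, incb1); }
  std::printf("A: %dx%d, step (%d,%d); B: %dx%d, step (%d,%d)\n",
              AS0, AS1, inca0, inca1, BS0, BS1, incb0, incb1);
  return 0;
}
]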
builder.CreateCall(splat_1d, {sb1, builder.CreateMul(incb1, BS1)}); + llvm::Value* inca = builder.CreateCall(outer_add, {tinca0, tinca1}, "inca"); + llvm::Value* incb = builder.CreateCall(outer_add, {tincb0, tincb1}, "incb"); + // Enter loop + builder.CreateBr(LoopBB); + builder.SetInsertPoint(LoopBB); + // PHI nodes + llvm::PHINode* c = builder.CreatePHI(_0->getType(), 2, "c"); + llvm::PHINode* k = builder.CreatePHI(int32_t, 2, "k"); + llvm::PHINode* pa = builder.CreatePHI(startpa->getType(), 2, "pa"); + llvm::PHINode* pb = builder.CreatePHI(startpb->getType(), 2, "pb"); + llvm::PHINode *a = builder.CreatePHI(starta->getType(), 2, "a"); + llvm::PHINode *b = builder.CreatePHI(startb->getType(), 2, "b"); + llvm::Value* nextc = builder.CreateCall(mma, {a, b, c}, "nextc"); + c->addIncoming(_0, PrologBB); + c->addIncoming(nextc, LoopBB); + // Induction variable + llvm::Value *nextk = builder.CreateSub(k, bk); + k->addIncoming(K, PrologBB); + k->addIncoming(nextk, LoopBB); + // Update pointer + llvm::Value *nextpa = builder.CreateCall(stp, {pa, inca}, "nextpa"); + llvm::Value *nextpb = builder.CreateCall(stp, {pb, incb}, "nextpb"); + pa->addIncoming(startpa, PrologBB); + pa->addIncoming(nextpa, LoopBB); + pb->addIncoming(startpb, PrologBB); + pb->addIncoming(nextpb, LoopBB); + // End condition + llvm::Value* no_bounds_check = builder.CreateICmpSGT(nextk, bound); + // Masks + llvm::Value* maska = builder.CreateCall(splat_2d, {startfa, no_bounds_check}, "maska"); + llvm::Value* maskb = builder.CreateCall(splat_2d, {startfb, no_bounds_check}, "maskb"); + // Pre-fetch + llvm::Value* nextfa = builder.CreateCall(masked_load, {nextpa, maska}, "nextfa"); + llvm::Value* nextfb = builder.CreateCall(masked_load, {nextpb, maskb}, "nextfb"); + llvm::Value* nexta = builder.CreateCall(reshape, {nextfa, ba0, ba1}, "nexta"); + llvm::Value* nextb = builder.CreateCall(reshape, {nextfb, bb0, bb1}, "nextb"); + a->addIncoming(starta, PrologBB); + a->addIncoming(nexta, LoopBB); + b->addIncoming(startb, PrologBB); + b->addIncoming(nextb, LoopBB); + // End condition + builder.CreateCondBr(no_bounds_check, LoopBB, EarlyExitBB); + // Early exit + builder.SetInsertPoint(EarlyExitBB); + llvm::Value* exit = builder.CreateICmpSLE(nextk, _s0); + builder.CreateCondBr(exit, EpilogueBB, LastIterBB); + // Last Iteration + builder.SetInsertPoint(LastIterBB); + llvm::Value* in_bounds_a0 = builder.CreateICmpSLT(aasm, builder.CreateCall(splat_1d, {aasm, M})); + llvm::Value* in_bounds_a1 = builder.CreateICmpSLT(ask, builder.CreateCall(splat_1d, {ask, bk})); + llvm::Value* in_bounds_b0 = builder.CreateICmpSLT(bbsn, builder.CreateCall(splat_1d, {bbsn, N})); + llvm::Value* in_bounds_b1 = builder.CreateICmpSLT(bsk, builder.CreateCall(splat_1d, {bsk, bk})); + llvm::Value* lastmaska = builder.CreateCall(outer_and, {in_bounds_a0, in_bounds_a1}, "lastmaska"); + llvm::Value* lastmaskb = builder.CreateCall(outer_and, {in_bounds_b0, in_bounds_b1}, "lastmaskb"); + llvm::Value* lastfa = builder.CreateCall(masked_load, {nextpa, lastmaska}, "lastfa"); + llvm::Value* lastfb = builder.CreateCall(masked_load, {nextpb, lastmaskb}, "lastfb"); + llvm::Value* lasta = builder.CreateCall(reshape, {lastfa, ba0, ba1}, "lasta"); + llvm::Value* lastb = builder.CreateCall(reshape, {lastfb, bb0, bb1}, "lastb"); + llvm::Value* loop = builder.CreateICmpSGT(nextk, _s0); + a->addIncoming(lasta, LastIterBB); + b->addIncoming(lastb, LastIterBB); + c->addIncoming(nextc, LastIterBB); + k->addIncoming(nextk, LastIterBB); + pa->addIncoming(nextpa, LastIterBB); + 
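[Annotation, not part of the patch: the blocks above implement a count-down loop: k starts at K, nextk = k - bk, and the signed greater-than test against bound selects a check-free fast path, with the last (possibly partial) tile re-issued under a mask from LastIterBB. A loose scalar analogue that ignores the prefetching:

#include <cstdio>
#include <vector>

int main() {
  const int K = 10, BK = 4;                           // hypothetical sizes
  std::vector<float> a(K, 1.0f), b(K, 2.0f);
  float c = 0.0f;
  int k = K;
  for (;;) {
    int nextk = k - BK;                               // induction step
    bool no_bounds_check = nextk > 0;                 // a full tile still remains
    int live = no_bounds_check ? BK : k;              // tail: mask off lanes >= k
    for (int i = 0; i < live; ++i)                    // masked_load + mma stand-in
      c += a[K - k + i] * b[K - k + i];
    if (!no_bounds_check) break;                      // fall through to the epilogue
    k = nextk;
  }
  std::printf("c = %f\n", c);                         // 10 * (1 * 2) = 20
  return 0;
}
]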
pb->addIncoming(nextpb, LastIterBB); + builder.CreateCondBr(loop, LoopBB, EpilogueBB); + // Epilogue + builder.SetInsertPoint(EpilogueBB); + llvm::CallInst* sm = builder.CreateCall(read_slice_x, {bm}, "sm"); + llvm::CallInst* sn = builder.CreateCall(read_slice_y, {bn}, "sn"); + llvm::CallInst* ldc = builder.CreateCall(splat_1d, {sn, M}, "lda"); + llvm::CallInst* offc = builder.CreateCall(outer_add, {sm, builder.CreateMul(sn, ldc)}, "offc"); + llvm::CallInst* pc = builder.CreateCall(gtp, {arguments[2], offc}, "pc"); + llvm::Value* in_bounds_c0 = builder.CreateICmpSLT(sm, builder.CreateCall(splat_1d, {sm, M})); + llvm::Value* in_bounds_c1 = builder.CreateICmpSLT(sn, builder.CreateCall(splat_1d, {sn, N})); + llvm::Value* maskc = builder.CreateCall(outer_and, {in_bounds_c0, in_bounds_c1}, "maskc"); + builder.CreateCall(masked_store, {nextc, pc, maskc}); + builder.CreateRet(NULL); + + + // Set metadata + llvm::Metadata *md_args[] = { + llvm::ValueAsMetadata::get(F), + llvm::MDString::get(context, "kernel"), + llvm::ValueAsMetadata::get(llvm::ConstantInt::get(llvm::Type::getInt32Ty(context), 1)) + }; + module->getOrInsertNamedMetadata("nvvm.annotations")->addOperand(llvm::MDNode::get(context, md_args)); + + // Machine + module->setTargetTriple("nvptx64-nvidia-cuda"); + auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error); + + llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), "sm_52", "", + llvm::TargetOptions(), llvm::Reloc::Model(), + llvm::CodeModel::Model(), llvm::CodeGenOpt::Aggressive); + module->setDataLayout(machine->createDataLayout()); + + // Auto-tuning + autotune(machine, *module); + + // Emit + llvm::legacy::PassManager pass; + llvm::SmallVector buffer; + llvm::raw_svector_ostream stream(buffer); + machine->addPassesToEmitFile(pass, stream, nullptr, llvm::TargetMachine::CGFT_AssemblyFile); + pass.run(*module); + std::string src(buffer.begin(), buffer.end()); + + // Execute + std::cout << src << std::endl; +} From e0cd621bb867018a8aea22b6081a7018033bfcad Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 27 Nov 2018 09:39:56 +0100 Subject: [PATCH 003/494] more tinkering --- conv.cpp | 113 ++++++++++++++++++++++++++++--------------------------- gemm.cpp | 36 +++++++++--------- 2 files changed, 75 insertions(+), 74 deletions(-) diff --git a/conv.cpp b/conv.cpp index 0132b3fe2..d806b87c7 100644 --- a/conv.cpp +++ b/conv.cpp @@ -65,7 +65,7 @@ void autotune(llvm::TargetMachine *machine, llvm::Module &module){ for(llvm::TargetTuner::ParamType ¶m: tuning_params){ // This parameter has not been seen before if(unique.insert(param.Value).second){ - std::cout << instr.getName().data() << " " << param.Name << std::endl; + std::cout << "PARAM: " << instr.getName().data() << " " << param.Name << std::endl; params.push_back(param.Value); } } @@ -142,10 +142,12 @@ int main(){ llvm::Intrinsic::ID mma_id = llvm::Intrinsic::tlvm_mma_nt; llvm::Function* outer_add = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_outer_add, {int32_tile_t, int32_slice_t, int32_slice_t}); llvm::Function* outer_and = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_outer_and, {int1_tile_t, int1_slice_t, int1_slice_t}); + llvm::Function* outer_and_int32 = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_outer_and, {int1_tile_t, int32_slice_t, int32_slice_t}); llvm::Function* mma = llvm::Intrinsic::getDeclaration(module.get(), mma_id, {tile_t}); llvm::Function* reshape = 
llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_reshape_2d, {tile_t}); - llvm::Function* splat_2d = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_splat_2d, {mask_tile_t, tile_t, bool_t}); - llvm::Function* splat_1d = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_splat_1d, {int32_slice_t, int32_slice_t, int32_t}); + llvm::Function* splat_2d = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_splat_2d, {mask_tile_t, bool_t}); + llvm::Function* splat_1d = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_splat_1d, {int32_slice_t, int32_t}); + llvm::Function* masked_load = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_masked_load, {tile_t, tile_ptr_t, mask_tile_t}); llvm::Function* masked_store = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_masked_store, {tile_t, tile_ptr_t, mask_tile_t}); @@ -208,48 +210,48 @@ int main(){ llvm::Value* PQN = builder.CreateMul(H, builder.CreateMul(W, N)); // Images HWN offset - llvm::Value* sa_hw = builder.CreateUDiv(sa0, builder.CreateCall(splat_1d, {sa0, N})); - llvm::Value* sa_n = builder.CreateURem(sa0, builder.CreateCall(splat_1d, {sa0, N})); - llvm::Value* sa_h = builder.CreateUDiv(sa_hw, builder.CreateCall(splat_1d, {sa0, W})); - llvm::Value* sa_w = builder.CreateURem(sa_hw, builder.CreateCall(splat_1d, {sa0, W})); - llvm::Value* offa_0 = builder.CreateMul(sa_n, builder.CreateCall(splat_1d, {sa0, lda_n})); - offa_0 = builder.CreateAdd(offa_0, builder.CreateMul(sa_h, builder.CreateCall(splat_1d, {sa0, lda_h}))); - offa_0 = builder.CreateAdd(offa_0, builder.CreateMul(sa_w, builder.CreateCall(splat_1d, {sa0, lda_w}))); + llvm::Value* sa_hw = builder.CreateUDiv(sa0, builder.CreateCall(splat_1d, {bm, N})); + llvm::Value* sa_n = builder.CreateURem(sa0, builder.CreateCall(splat_1d, {bm, N})); + llvm::Value* sa_h = builder.CreateUDiv(sa_hw, builder.CreateCall(splat_1d, {bm, W})); + llvm::Value* sa_w = builder.CreateURem(sa_hw, builder.CreateCall(splat_1d, {bm, W})); + llvm::Value* offa_0 = builder.CreateMul(sa_n, builder.CreateCall(splat_1d, {bm, lda_n})); + offa_0 = builder.CreateAdd(offa_0, builder.CreateMul(sa_h, builder.CreateCall(splat_1d, {bm, lda_h}))); + offa_0 = builder.CreateAdd(offa_0, builder.CreateMul(sa_w, builder.CreateCall(splat_1d, {bm, lda_w}))); // Images CRS offset - llvm::Value* sa_cr = builder.CreateUDiv(sa1, builder.CreateCall(splat_1d, {sa1, S})); - llvm::Value* sa_s = builder.CreateURem(sa1, builder.CreateCall(splat_1d, {sa1, S})); - llvm::Value* sa_c = builder.CreateUDiv(sa_cr, builder.CreateCall(splat_1d, {sa1, R})); - llvm::Value* sa_r = builder.CreateURem(sa_cr, builder.CreateCall(splat_1d, {sa1, R})); - llvm::Value* offa_1 = builder.CreateMul(sa_c, builder.CreateCall(splat_1d, {sa1, lda_c})); - offa_1 = builder.CreateAdd(offa_1, builder.CreateMul(sa_r, builder.CreateCall(splat_1d, {sa1, lda_h}))); - offa_1 = builder.CreateAdd(offa_1, builder.CreateMul(sa_s, builder.CreateCall(splat_1d, {sa1, lda_w}))); + llvm::Value* sa_cr = builder.CreateUDiv(sa1, builder.CreateCall(splat_1d, {bk, S})); + llvm::Value* sa_s = builder.CreateURem(sa1, builder.CreateCall(splat_1d, {bk, S})); + llvm::Value* sa_c = builder.CreateUDiv(sa_cr, builder.CreateCall(splat_1d, {bk, R})); + llvm::Value* sa_r = builder.CreateURem(sa_cr, builder.CreateCall(splat_1d, {bk, R})); + llvm::Value* offa_1 = builder.CreateMul(sa_c, builder.CreateCall(splat_1d, {bk, lda_c})); + offa_1 = builder.CreateAdd(offa_1, 
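[Annotation, not part of the patch: note the signature change this patch makes: tlvm_splat_1d and tlvm_splat_2d are now overloaded on the destination shape plus a scalar, rather than on an existing slice or tile, so call sites pass hyperparameters such as {bm, N}. A scalar model of the new splat:

#include <cstdio>
#include <vector>

// Builds a 1-D tile whose every lane holds the same scalar (model of splat_1d).
std::vector<int> splat_1d(int shape, int value) {
  return std::vector<int>(static_cast<size_t>(shape), value);
}

int main() {
  auto lanes = splat_1d(8, 42);                       // like CreateCall(splat_1d, {bm, N})
  std::printf("%zu lanes, each %d\n", lanes.size(), lanes[0]);
  return 0;
}
]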
builder.CreateMul(sa_r, builder.CreateCall(splat_1d, {bk, lda_h}))); + offa_1 = builder.CreateAdd(offa_1, builder.CreateMul(sa_s, builder.CreateCall(splat_1d, {bk, lda_w}))); // Images pointer llvm::Value* off_a = builder.CreateCall(outer_add, {offa_0, offa_1}); llvm::Value* start_pa = builder.CreateCall(gtp_2d, {base_i_ptr, off_a}, "start_i_ptr"); llvm::LoadInst* start_aa = builder.CreateLoad(start_pa, false, "start_i_val"); llvm::Value* start_a = builder.CreateCall(reshape, {start_aa, bm, bk}, "start_i"); // Filters pointer - llvm::Value* tldb_s = builder.CreateCall(splat_1d, {sb1, K}); + llvm::Value* tldb_s = builder.CreateCall(splat_1d, {bk, K}); llvm::Value* off_b = builder.CreateCall(outer_add, {sb0, builder.CreateMul(sb1, tldb_s)}, "off_f"); llvm::Value* start_pb = builder.CreateCall(gtp_2d, {base_f_ptr, off_b}, "start_f_ptr"); llvm::Value* start_bb = builder.CreateLoad(start_pb, false, "start_f_val"); llvm::Value* start_b = builder.CreateCall(reshape, {start_bb, bn, bk}, "start_f"); // Filters increment - llvm::Value* inc_b_0 = builder.CreateCall(splat_1d, {sb0, _s0}, "inc_f_0"); - llvm::Value* inc_b_1 = builder.CreateCall(splat_1d, {sb1, builder.CreateMul(bk, ldb_k)}, "inc_f_1"); + llvm::Value* inc_b_0 = builder.CreateCall(splat_1d, {bn, _s0}, "inc_f_0"); + llvm::Value* inc_b_1 = builder.CreateCall(splat_1d, {bk, builder.CreateMul(bk, ldb_k)}, "inc_f_1"); llvm::Value* inc_b = builder.CreateCall(outer_add, {inc_b_0, inc_b_1}, "inc_f"); // Delta pointers llvm::Value* base_incdelta = lut_ptr; - llvm::Value* start_pincdelta = builder.CreateCall(gtp_1d, {base_incdelta, sa1}, "start_pincdelta"); + llvm::Value* start_pincdelta = builder.CreateCall(gtp_1d, {base_incdelta, sa0}, "start_pincdelta"); llvm::Value* base_delta = builder.CreateGEP(lut_ptr, builder.getInt32(nlut)); - llvm::Value* start_pdelta = builder.CreateCall(gtp_1d, {base_delta, builder.CreateCall(splat_1d, {sa1, _s0})}, "start_pdelta"); + llvm::Value* start_pdelta = builder.CreateCall(gtp_1d, {base_delta, builder.CreateCall(splat_1d, {bk, _s0})}, "start_pdelta"); // Masks - llvm::Value* _1 = builder.CreateCall(splat_1d, {sb1, builder.getInt32(1)}); - llvm::Value* mask_a_1 = builder.CreateShl(_1, sb1); - llvm::Value* base_incmask = builder.CreateGEP(lut_ptr, builder.getInt32(2*nlut)); - llvm::Value* start_pincmask = builder.CreateCall(gtp_1d, {base_incmask, sb1}); - llvm::Value* base_mask = builder.CreateGEP(lut_ptr, builder.getInt32(3*nlut)); - llvm::Value* start_pmask = builder.CreateCall(gtp_1d, {base_mask, sb1}); +// llvm::Value* _1 = builder.CreateCall(splat_1d, {bk, builder.getInt32(1)}); +// llvm::Value* mask_a_1 = builder.CreateShl(_1, sa1); +// llvm::Value* base_incmask = builder.CreateGEP(lut_ptr, builder.getInt32(2*nlut)); +// llvm::Value* start_pincmask = builder.CreateCall(gtp_1d, {base_incmask, sa0}); +// llvm::Value* base_mask = builder.CreateGEP(lut_ptr, builder.getInt32(3*nlut)); +// llvm::Value* start_pmask = builder.CreateCall(gtp_1d, {base_mask, sa0}); // Enter loop builder.CreateBr(LoopBB); builder.SetInsertPoint(LoopBB); @@ -262,8 +264,8 @@ int main(){ llvm::PHINode *b = builder.CreatePHI(start_b->getType(), 3, "b"); llvm::PHINode *pdelta = builder.CreatePHI(start_pdelta->getType(), 3); llvm::PHINode *pincdelta = builder.CreatePHI(start_pincdelta->getType(), 3); - llvm::PHINode *pmasks = builder.CreatePHI(start_pmask->getType(), 3); - llvm::PHINode *pincmasks = builder.CreatePHI(start_pincmask->getType(), 3); +// llvm::PHINode *pmasks = builder.CreatePHI(start_pmask->getType(), 3); +// llvm::PHINode 
*pincmasks = builder.CreatePHI(start_pincmask->getType(), 3); llvm::Value* next_c = builder.CreateCall(mma, {a, b, c}, "next_c"); c->addIncoming(_0, PrologBB); c->addIncoming(next_c, LoopBB); @@ -273,24 +275,24 @@ int main(){ crs->addIncoming(next_crs, LoopBB); // Update pointer llvm::Value *inc_delta = builder.CreateLoad(pincdelta); - llvm::Value *inc_mask = builder.CreateLoad(pincmasks); +// llvm::Value *inc_mask = builder.CreateLoad(pincmasks); llvm::Value *inc_a_1 = builder.CreateLoad(pdelta); - llvm::Value *inc_a_0 = builder.CreateCall(splat_1d, {sa0, builder.getInt32(0)}); + llvm::Value *inc_a_0 = builder.CreateCall(splat_1d, {bm, builder.getInt32(0)}); llvm::Value *inc_a = builder.CreateCall(outer_add, {inc_a_0, inc_a_1}); llvm::Value *next_pa = builder.CreateCall(stp_2d, {pa, inc_a}, "next_i_ptr"); llvm::Value *next_pb = builder.CreateCall(stp_2d, {pb, inc_b}, "next_f_ptr"); llvm::Value *next_pdelta = builder.CreateCall(stp_1d, {pdelta, inc_delta}); llvm::Value *next_pincdelta = builder.CreateCall(stp_1d, {pincdelta, inc_delta}); - llvm::Value *next_pmask = builder.CreateCall(stp_1d, {pmasks, inc_mask}); - llvm::Value *next_pincmask = builder.CreateCall(stp_1d, {pincmasks, inc_mask}); +// llvm::Value *next_pmask = builder.CreateCall(stp_1d, {pmasks, inc_mask}); +// llvm::Value *next_pincmask = builder.CreateCall(stp_1d, {pincmasks, inc_mask}); pdelta->addIncoming(start_pdelta, PrologBB); pdelta->addIncoming(next_pdelta, LoopBB); pincdelta->addIncoming(start_pincdelta, PrologBB); pincdelta->addIncoming(next_pincdelta, LoopBB); - pmasks->addIncoming(start_pmask, PrologBB); - pmasks->addIncoming(next_pmask, LoopBB); - pincmasks->addIncoming(start_pincmask, PrologBB); - pincmasks->addIncoming(next_pincmask, LoopBB); +// pmasks->addIncoming(start_pmask, PrologBB); +// pmasks->addIncoming(next_pmask, LoopBB); +// pincmasks->addIncoming(start_pincmask, PrologBB); +// pincmasks->addIncoming(next_pincmask, LoopBB); pa->addIncoming(start_pa, PrologBB); pa->addIncoming(next_pa, LoopBB); pb->addIncoming(start_pb, PrologBB); @@ -298,9 +300,10 @@ int main(){ // End condition llvm::Value* no_bounds_check = builder.CreateICmpSGT(next_crs, builder.getInt32(0)); // Masks - llvm::Value* mask_a_0 = builder.CreateLoad(pdelta); - llvm::Value* mask_a = builder.CreateCall(outer_and, {mask_a_0, mask_a_1}); - llvm::Value* mask_b = builder.CreateCall(splat_2d, {start_bb, no_bounds_check}, "mask_b"); +// llvm::Value* mask_a_0 = builder.CreateLoad(pmasks); +// llvm::Value* mask_a = builder.CreateCall(outer_and_int32, {mask_a_0, mask_a_1}); + llvm::Value* mask_a = builder.CreateCall(splat_2d, {bm, bk, no_bounds_check}, "mask_a"); + llvm::Value* mask_b = builder.CreateCall(splat_2d, {bn, bk, no_bounds_check}, "mask_b"); // Pre-fetch llvm::Value* next_aa = builder.CreateCall(masked_load, {next_pa, mask_a}, "next_aa"); llvm::Value* next_bb = builder.CreateCall(masked_load, {next_pb, mask_b}, "next_bb"); @@ -318,8 +321,8 @@ int main(){ builder.CreateCondBr(exit, EpilogueBB, LastIterBB); // Last Iteration builder.SetInsertPoint(LastIterBB); - llvm::Value* in_bounds_b0 = builder.CreateICmpSLT(sb0, builder.CreateCall(splat_1d, {sb0, K})); - llvm::Value* in_bounds_b1 = builder.CreateICmpSLT(sb1, builder.CreateCall(splat_1d, {sb1, bk})); + llvm::Value* in_bounds_b0 = builder.CreateICmpSLT(sb0, builder.CreateCall(splat_1d, {bn, K})); + llvm::Value* in_bounds_b1 = builder.CreateICmpSLT(sb1, builder.CreateCall(splat_1d, {bk, next_crs})); llvm::Value* last_maskb = builder.CreateCall(outer_and, {in_bounds_b0, in_bounds_b1}, 
"last_maskb"); llvm::Value* last_bb = builder.CreateCall(masked_load, {next_pb, last_maskb}, "last_bb"); llvm::Value* last_b = builder.CreateCall(reshape, {last_bb, bn, bk}, "last_b"); @@ -332,8 +335,8 @@ int main(){ pb->addIncoming(next_pb, LastIterBB); pdelta->addIncoming(next_pdelta, LastIterBB); pincdelta->addIncoming(next_pincdelta, LastIterBB); - pmasks->addIncoming(next_pmask, LastIterBB); - pincmasks->addIncoming(next_pincmask, LastIterBB); +// pmasks->addIncoming(next_pmask, LastIterBB); +// pincmasks->addIncoming(next_pincmask, LastIterBB); builder.CreateCondBr(loop, LoopBB, EpilogueBB); // Epilogue @@ -346,21 +349,21 @@ int main(){ llvm::Value* ldc_k = builder.CreateMul(lda_h, H); llvm::Value* ldb_n = builder.CreateMul(lda_c, K); // Output PQN offset - llvm::Value* sc_pq = builder.CreateUDiv(sc_pqn, builder.CreateCall(splat_1d, {sc_pqn, N})); - llvm::Value* sc_n = builder.CreateURem(sc_pqn, builder.CreateCall(splat_1d, {sc_pqn, N})); - llvm::Value* sc_p = builder.CreateUDiv(sc_pq, builder.CreateCall(splat_1d, {sc_pqn, W})); - llvm::Value* sc_q = builder.CreateURem(sc_pq, builder.CreateCall(splat_1d, {sc_pqn, W})); - llvm::Value* offc0 = builder.CreateMul(sc_n, builder.CreateCall(splat_1d, {sc_pqn, ldb_n})); - offc0 = builder.CreateAdd(offc0, builder.CreateMul(sc_p, builder.CreateCall(splat_1d, {sc_pqn, ldc_p}))); - offc0 = builder.CreateAdd(offc0, builder.CreateMul(sc_q, builder.CreateCall(splat_1d, {sc_pqn, ldc_q}))); + llvm::Value* sc_pq = builder.CreateUDiv(sc_pqn, builder.CreateCall(splat_1d, {bm, N})); + llvm::Value* sc_n = builder.CreateURem(sc_pqn, builder.CreateCall(splat_1d, {bm, N})); + llvm::Value* sc_p = builder.CreateUDiv(sc_pq, builder.CreateCall(splat_1d, {bm, W})); + llvm::Value* sc_q = builder.CreateURem(sc_pq, builder.CreateCall(splat_1d, {bm, W})); + llvm::Value* offc0 = builder.CreateMul(sc_n, builder.CreateCall(splat_1d, {bm, ldb_n})); + offc0 = builder.CreateAdd(offc0, builder.CreateMul(sc_p, builder.CreateCall(splat_1d, {bm, ldc_p}))); + offc0 = builder.CreateAdd(offc0, builder.CreateMul(sc_q, builder.CreateCall(splat_1d, {bm, ldc_q}))); // Output K offset - llvm::Value* offc1 = builder.CreateMul(sc_k, builder.CreateCall(splat_1d, {sc_k, ldc_k})); + llvm::Value* offc1 = builder.CreateMul(sc_k, builder.CreateCall(splat_1d, {bn, ldc_k})); // Output pointer llvm::Value* offc = builder.CreateCall(outer_add, {offc0, offc1}); llvm::Value* pc = builder.CreateCall(gtp_2d, {base_o_ptr, offc}); // Output masks - llvm::Value* in_bounds_c0 = builder.CreateICmpSLT(sc_pqn, builder.CreateCall(splat_1d, {sc_pqn, PQN})); - llvm::Value* in_bounds_c1 = builder.CreateICmpSLT(sc_k, builder.CreateCall(splat_1d, {sc_k, K})); + llvm::Value* in_bounds_c0 = builder.CreateICmpSLT(sc_pqn, builder.CreateCall(splat_1d, {bm, PQN})); + llvm::Value* in_bounds_c1 = builder.CreateICmpSLT(sc_k, builder.CreateCall(splat_1d, {bn, K})); llvm::Value* maskc = builder.CreateCall(outer_and, {in_bounds_c0, in_bounds_c1}); builder.CreateCall(masked_store, {next_c, pc, maskc}); builder.CreateRet(NULL); diff --git a/gemm.cpp b/gemm.cpp index 6a63327b9..5adc3ebbd 100644 --- a/gemm.cpp +++ b/gemm.cpp @@ -138,14 +138,12 @@ int main(){ if(!AT && BT) mma_id = llvm::Intrinsic::tlvm_mma_nt; if(AT && !BT) mma_id = llvm::Intrinsic::tlvm_mma_tn; if(AT && BT) mma_id = llvm::Intrinsic::tlvm_mma_tt; - llvm::Function* broadcast_int32 = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_broadcast_1d, {int32_tile_t, int32_slice_t}); - llvm::Function* broadcast_int1 = 
llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_broadcast_1d, {int1_tile_t, int1_slice_t}); llvm::Function* outer_add = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_outer_add, {int32_tile_t, int32_slice_t, int32_slice_t}); llvm::Function* outer_and = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_outer_and, {int1_tile_t, int1_slice_t, int1_slice_t}); llvm::Function* mma = llvm::Intrinsic::getDeclaration(module.get(), mma_id, {tile_t}); llvm::Function* reshape = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_reshape_2d, {tile_t}); - llvm::Function* splat_2d = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_splat_2d, {mask_tile_t, tile_t, bool_t}); - llvm::Function* splat_1d = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_splat_1d, {int32_slice_t, int32_slice_t, int32_t}); + llvm::Function* splat_2d = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_splat_2d, {mask_tile_t, bool_t}); + llvm::Function* splat_1d = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_splat_1d, {int32_slice_t, int32_t}); llvm::Function* masked_load = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_masked_load, {tile_t, tile_ptr_t, mask_tile_t}); llvm::Function* masked_store = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_masked_store, {tile_t, tile_ptr_t, mask_tile_t}); @@ -215,8 +213,8 @@ int main(){ std::swap(incb0, incb1); } - llvm::CallInst* tlda = builder.CreateCall(splat_1d, {sa1, AS0}, "lda"); - llvm::CallInst* tldb = builder.CreateCall(splat_1d, {sb1, BS1}, "ldb"); + llvm::CallInst* tlda = builder.CreateCall(splat_1d, {ba1, AS0}, "lda"); + llvm::CallInst* tldb = builder.CreateCall(splat_1d, {bb1, BS1}, "ldb"); llvm::CallInst* offa = builder.CreateCall(outer_add, {sa0, builder.CreateMul(sa1, tlda)}, "offa"); llvm::CallInst* offb = builder.CreateCall(outer_add, {sb0, builder.CreateMul(sb1, tldb)}, "offb"); llvm::CallInst* startpa = builder.CreateCall(gtp, {arguments[0], offa}, "startpa"); @@ -225,10 +223,10 @@ int main(){ llvm::LoadInst* startfb = builder.CreateLoad(startpb, "startfb"); llvm::Value* starta = builder.CreateCall(reshape, {startfa, ba0, ba1}, "starta"); llvm::Value* startb = builder.CreateCall(reshape, {startfb, bb0, bb1}, "startb"); - llvm::Value* tinca0 = builder.CreateCall(splat_1d, {sa0, builder.CreateMul(inca0, AS0)}); - llvm::Value* tinca1 = builder.CreateCall(splat_1d, {sa1, builder.CreateMul(inca1, AS1)}); - llvm::Value* tincb0 = builder.CreateCall(splat_1d, {sb0, builder.CreateMul(incb0, BS0)}); - llvm::Value* tincb1 = builder.CreateCall(splat_1d, {sb1, builder.CreateMul(incb1, BS1)}); + llvm::Value* tinca0 = builder.CreateCall(splat_1d, {ba0, builder.CreateMul(inca0, AS0)}, "tinca0"); + llvm::Value* tinca1 = builder.CreateCall(splat_1d, {ba1, builder.CreateMul(inca1, AS1)}); + llvm::Value* tincb0 = builder.CreateCall(splat_1d, {bb0, builder.CreateMul(incb0, BS0)}); + llvm::Value* tincb1 = builder.CreateCall(splat_1d, {bb1, builder.CreateMul(incb1, BS1)}); llvm::Value* inca = builder.CreateCall(outer_add, {tinca0, tinca1}, "inca"); llvm::Value* incb = builder.CreateCall(outer_add, {tincb0, tincb1}, "incb"); // Enter loop @@ -258,8 +256,8 @@ int main(){ // End condition llvm::Value* no_bounds_check = builder.CreateICmpSGT(nextk, bound); // Masks - llvm::Value* maska = builder.CreateCall(splat_2d, {startfa, no_bounds_check}, "maska"); - llvm::Value* maskb = builder.CreateCall(splat_2d, 
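[Annotation, not part of the patch: the tail masks are built by comparing each 1-D slice against a splat of its limit and combining the two predicates with an outer AND, so lane (i, j) survives only if both its row and its column are in range. A scalar model with hypothetical extents:

#include <cstdio>

int main() {
  const int BM = 4, BK = 4;                           // tile shape
  const int M = 3, K = 2;                             // remaining valid extents
  for (int i = 0; i < BM; ++i) {
    for (int j = 0; j < BK; ++j) {
      bool in_bounds_0 = i < M;                       // ICmpSLT(slice, splat(M))
      bool in_bounds_1 = j < K;                       // ICmpSLT(slice, splat(K))
      std::printf("%d", in_bounds_0 && in_bounds_1);  // outer_and
    }
    std::printf("\n");
  }
  return 0;
}
]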
{startfb, no_bounds_check}, "maskb"); + llvm::Value* maska = builder.CreateCall(splat_2d, {ba0, ba1, no_bounds_check}, "maska"); + llvm::Value* maskb = builder.CreateCall(splat_2d, {bb0, bb1, no_bounds_check}, "maskb"); // Pre-fetch llvm::Value* nextfa = builder.CreateCall(masked_load, {nextpa, maska}, "nextfa"); llvm::Value* nextfb = builder.CreateCall(masked_load, {nextpb, maskb}, "nextfb"); @@ -277,10 +275,10 @@ int main(){ builder.CreateCondBr(exit, EpilogueBB, LastIterBB); // Last Iteration builder.SetInsertPoint(LastIterBB); - llvm::Value* in_bounds_a0 = builder.CreateICmpSLT(aasm, builder.CreateCall(splat_1d, {aasm, M})); - llvm::Value* in_bounds_a1 = builder.CreateICmpSLT(ask, builder.CreateCall(splat_1d, {ask, bk})); - llvm::Value* in_bounds_b0 = builder.CreateICmpSLT(bbsn, builder.CreateCall(splat_1d, {bbsn, N})); - llvm::Value* in_bounds_b1 = builder.CreateICmpSLT(bsk, builder.CreateCall(splat_1d, {bsk, bk})); + llvm::Value* in_bounds_a0 = builder.CreateICmpSLT(aasm, builder.CreateCall(splat_1d, {ba0, M})); + llvm::Value* in_bounds_a1 = builder.CreateICmpSLT(ask, builder.CreateCall(splat_1d, {ba1, bk})); + llvm::Value* in_bounds_b0 = builder.CreateICmpSLT(bbsn, builder.CreateCall(splat_1d, {bb0, N})); + llvm::Value* in_bounds_b1 = builder.CreateICmpSLT(bsk, builder.CreateCall(splat_1d, {bb1, bk})); llvm::Value* lastmaska = builder.CreateCall(outer_and, {in_bounds_a0, in_bounds_a1}, "lastmaska"); llvm::Value* lastmaskb = builder.CreateCall(outer_and, {in_bounds_b0, in_bounds_b1}, "lastmaskb"); llvm::Value* lastfa = builder.CreateCall(masked_load, {nextpa, lastmaska}, "lastfa"); @@ -299,11 +297,11 @@ int main(){ builder.SetInsertPoint(EpilogueBB); llvm::CallInst* sm = builder.CreateCall(read_slice_x, {bm}, "sm"); llvm::CallInst* sn = builder.CreateCall(read_slice_y, {bn}, "sn"); - llvm::CallInst* ldc = builder.CreateCall(splat_1d, {sn, M}, "lda"); + llvm::CallInst* ldc = builder.CreateCall(splat_1d, {bn, M}, "lda"); llvm::CallInst* offc = builder.CreateCall(outer_add, {sm, builder.CreateMul(sn, ldc)}, "offc"); llvm::CallInst* pc = builder.CreateCall(gtp, {arguments[2], offc}, "pc"); - llvm::Value* in_bounds_c0 = builder.CreateICmpSLT(sm, builder.CreateCall(splat_1d, {sm, M})); - llvm::Value* in_bounds_c1 = builder.CreateICmpSLT(sn, builder.CreateCall(splat_1d, {sn, N})); + llvm::Value* in_bounds_c0 = builder.CreateICmpSLT(sm, builder.CreateCall(splat_1d, {bm, M})); + llvm::Value* in_bounds_c1 = builder.CreateICmpSLT(sn, builder.CreateCall(splat_1d, {bn, N})); llvm::Value* maskc = builder.CreateCall(outer_and, {in_bounds_c0, in_bounds_c1}, "maskc"); builder.CreateCall(masked_store, {nextc, pc, maskc}); builder.CreateRet(NULL); From 68c8de88f56d1cd3fe7a9a933c64f369b4d20ed2 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 27 Nov 2018 12:20:51 +0100 Subject: [PATCH 004/494] More cleaning of masks --- conv.cpp | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/conv.cpp b/conv.cpp index d806b87c7..7959a612d 100644 --- a/conv.cpp +++ b/conv.cpp @@ -242,16 +242,16 @@ int main(){ llvm::Value* inc_b = builder.CreateCall(outer_add, {inc_b_0, inc_b_1}, "inc_f"); // Delta pointers llvm::Value* base_incdelta = lut_ptr; - llvm::Value* start_pincdelta = builder.CreateCall(gtp_1d, {base_incdelta, sa0}, "start_pincdelta"); + llvm::Value* start_pincdelta = builder.CreateCall(gtp_1d, {base_incdelta, sa1}, "start_pincdelta"); llvm::Value* base_delta = builder.CreateGEP(lut_ptr, builder.getInt32(nlut)); llvm::Value* 
start_pdelta = builder.CreateCall(gtp_1d, {base_delta, builder.CreateCall(splat_1d, {bk, _s0})}, "start_pdelta"); // Masks -// llvm::Value* _1 = builder.CreateCall(splat_1d, {bk, builder.getInt32(1)}); -// llvm::Value* mask_a_1 = builder.CreateShl(_1, sa1); -// llvm::Value* base_incmask = builder.CreateGEP(lut_ptr, builder.getInt32(2*nlut)); -// llvm::Value* start_pincmask = builder.CreateCall(gtp_1d, {base_incmask, sa0}); -// llvm::Value* base_mask = builder.CreateGEP(lut_ptr, builder.getInt32(3*nlut)); -// llvm::Value* start_pmask = builder.CreateCall(gtp_1d, {base_mask, sa0}); + llvm::Value* _1 = builder.CreateCall(splat_1d, {bk, builder.getInt32(1)}); + llvm::Value* mask_a_1 = builder.CreateShl(_1, sa1); + llvm::Value* base_incmask = builder.CreateGEP(lut_ptr, builder.getInt32(2*nlut), "base_incmask"); + llvm::Value* start_pincmask = builder.CreateCall(gtp_1d, {base_incmask, sa0}, "start_pincmask"); + llvm::Value* base_mask = builder.CreateGEP(lut_ptr, builder.getInt32(3*nlut), "base_mask"); + llvm::Value* start_pmask = builder.CreateCall(gtp_1d, {base_mask, sa0}, "start_pmask"); // Enter loop builder.CreateBr(LoopBB); builder.SetInsertPoint(LoopBB); @@ -264,8 +264,8 @@ int main(){ llvm::PHINode *b = builder.CreatePHI(start_b->getType(), 3, "b"); llvm::PHINode *pdelta = builder.CreatePHI(start_pdelta->getType(), 3); llvm::PHINode *pincdelta = builder.CreatePHI(start_pincdelta->getType(), 3); -// llvm::PHINode *pmasks = builder.CreatePHI(start_pmask->getType(), 3); -// llvm::PHINode *pincmasks = builder.CreatePHI(start_pincmask->getType(), 3); + llvm::PHINode *pmasks = builder.CreatePHI(start_pmask->getType(), 3); + llvm::PHINode *pincmasks = builder.CreatePHI(start_pincmask->getType(), 3); llvm::Value* next_c = builder.CreateCall(mma, {a, b, c}, "next_c"); c->addIncoming(_0, PrologBB); c->addIncoming(next_c, LoopBB); @@ -275,34 +275,34 @@ int main(){ crs->addIncoming(next_crs, LoopBB); // Update pointer llvm::Value *inc_delta = builder.CreateLoad(pincdelta); -// llvm::Value *inc_mask = builder.CreateLoad(pincmasks); + llvm::Value *inc_mask = builder.CreateLoad(pincmasks); llvm::Value *inc_a_1 = builder.CreateLoad(pdelta); llvm::Value *inc_a_0 = builder.CreateCall(splat_1d, {bm, builder.getInt32(0)}); llvm::Value *inc_a = builder.CreateCall(outer_add, {inc_a_0, inc_a_1}); - llvm::Value *next_pa = builder.CreateCall(stp_2d, {pa, inc_a}, "next_i_ptr"); - llvm::Value *next_pb = builder.CreateCall(stp_2d, {pb, inc_b}, "next_f_ptr"); - llvm::Value *next_pdelta = builder.CreateCall(stp_1d, {pdelta, inc_delta}); - llvm::Value *next_pincdelta = builder.CreateCall(stp_1d, {pincdelta, inc_delta}); -// llvm::Value *next_pmask = builder.CreateCall(stp_1d, {pmasks, inc_mask}); -// llvm::Value *next_pincmask = builder.CreateCall(stp_1d, {pincmasks, inc_mask}); + llvm::Value *next_pa = builder.CreateCall(stp_2d, {pa, inc_a}, "next_pa"); + llvm::Value *next_pb = builder.CreateCall(stp_2d, {pb, inc_b}, "next_pb"); + llvm::Value *next_pdelta = builder.CreateCall(stp_1d, {pdelta, inc_delta}, "next_pdelta"); + llvm::Value *next_pincdelta = builder.CreateCall(stp_1d, {pincdelta, inc_delta}, "next_pincdelta"); + llvm::Value *next_pmask = builder.CreateCall(stp_1d, {pmasks, inc_mask}, "next_pmask"); + llvm::Value *next_pincmask = builder.CreateCall(stp_1d, {pincmasks, inc_mask}, "next_pincmask"); pdelta->addIncoming(start_pdelta, PrologBB); pdelta->addIncoming(next_pdelta, LoopBB); pincdelta->addIncoming(start_pincdelta, PrologBB); pincdelta->addIncoming(next_pincdelta, LoopBB); -// 
pmasks->addIncoming(start_pmask, PrologBB); -// pmasks->addIncoming(next_pmask, LoopBB); -// pincmasks->addIncoming(start_pincmask, PrologBB); -// pincmasks->addIncoming(next_pincmask, LoopBB); + pmasks->addIncoming(start_pmask, PrologBB); + pmasks->addIncoming(next_pmask, LoopBB); + pincmasks->addIncoming(start_pincmask, PrologBB); + pincmasks->addIncoming(next_pincmask, LoopBB); pa->addIncoming(start_pa, PrologBB); pa->addIncoming(next_pa, LoopBB); pb->addIncoming(start_pb, PrologBB); pb->addIncoming(next_pb, LoopBB); // End condition - llvm::Value* no_bounds_check = builder.CreateICmpSGT(next_crs, builder.getInt32(0)); + llvm::Value* no_bounds_check = builder.CreateICmpSGT(next_crs, builder.getInt32(0), "no_bounds_check"); // Masks -// llvm::Value* mask_a_0 = builder.CreateLoad(pmasks); -// llvm::Value* mask_a = builder.CreateCall(outer_and_int32, {mask_a_0, mask_a_1}); - llvm::Value* mask_a = builder.CreateCall(splat_2d, {bm, bk, no_bounds_check}, "mask_a"); + llvm::Value* mask_a_0 = builder.CreateLoad(pmasks, "mask_a_0"); + llvm::Value* mask_a_i32 = builder.CreateCall(outer_and_int32, {mask_a_0, mask_a_1}, "mask_a_i32"); + llvm::Value* mask_a = builder.CreateICmpNE(mask_a_i32, llvm::ConstantTile::get(_s0, {bm, bk}), "mask_a"); llvm::Value* mask_b = builder.CreateCall(splat_2d, {bn, bk, no_bounds_check}, "mask_b"); // Pre-fetch llvm::Value* next_aa = builder.CreateCall(masked_load, {next_pa, mask_a}, "next_aa"); @@ -335,8 +335,8 @@ int main(){ pb->addIncoming(next_pb, LastIterBB); pdelta->addIncoming(next_pdelta, LastIterBB); pincdelta->addIncoming(next_pincdelta, LastIterBB); -// pmasks->addIncoming(next_pmask, LastIterBB); -// pincmasks->addIncoming(next_pincmask, LastIterBB); + pmasks->addIncoming(next_pmask, LastIterBB); + pincmasks->addIncoming(next_pincmask, LastIterBB); builder.CreateCondBr(loop, LoopBB, EpilogueBB); // Epilogue From 8b040b46454ffec90ef637bf1011fa9e278d5da5 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 3 Dec 2018 07:42:05 -0500 Subject: [PATCH 005/494] updates --- conv.cpp | 108 +++++++++++++++++++++++++++++++++++++++++-------------- gemm.cpp | 30 ++++++++-------- 2 files changed, 97 insertions(+), 41 deletions(-) diff --git a/conv.cpp b/conv.cpp index 7959a612d..fa99d301e 100644 --- a/conv.cpp +++ b/conv.cpp @@ -27,7 +27,57 @@ #include "llvm/Support/Debug.h" #include "llvm/Transforms/Utils/Cloning.h" +// Index computation +inline int32_t idx(int32_t x, int32_t y, int32_t z, int32_t w, int32_t u, + int32_t /*s0*/, int32_t s1, int32_t s2, int32_t s3, int32_t s4) +{ return u + w*s4 + z*s4*s3 + y*s4*s3*s2 + x*s4*s3*s2*s1; } +template +void cpp_conv_nchw(int32_t C, int32_t N, int32_t K, + int32_t D, int32_t H, int32_t W, + int32_t T, int32_t R, int32_t S, + int32_t pad_d, int32_t pad_h, int32_t pad_w, + int32_t stride_d, int32_t stride_h, int32_t stride_w, + int32_t M, int32_t P, int32_t Q, + std::vector>& O, IN_DTYPE* I, IN_DTYPE* F) +{ + size_t num_outputs = O.size(); + static const int PACK_IN = 1; + static const int PACK_OUT = 1; + if(C % PACK_IN != 0) throw std::runtime_error("Number of input channels must be a multiple of 4"); + if(K % PACK_OUT != 0) throw std::runtime_error("Number of output channels must be a multiple of 4"); + C /= PACK_IN; + K /= PACK_OUT; + int32_t Kout = K; + IN_DTYPE accs[PACK_OUT]; + for(size_t o = 0; o < num_outputs; o++) + for(int32_t m = 0 ; m < M; ++m) + for(int32_t p = 0 ; p < P; ++p) + for(int32_t q = 0; q < Q; ++q) + for(int32_t n = 0; n < N; ++n) + for(int32_t k = 0; k < Kout ; ++k) + { + for(int32_t i = 0 ; i < 
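// The idx() helper above linearizes a 5-D coordinate with the last axis
// contiguous. Worked example for the image tensor read below:
//   I[idx(n, c, d, h, w, N, C, D, H, W)]
//     = I[w + h*W + d*H*W + c*D*H*W + n*C*D*H*W]
// which follows directly from u + w*s4 + z*s4*s3 + y*s4*s3*s2 + x*s4*s3*s2*s1.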
PACK_OUT; ++i) + accs[i] = 0; + int32_t mm = m*stride_d - pad_d; + int32_t pp = p*stride_h - pad_h; + int32_t qq = q*stride_w - pad_w; + for(int32_t kk = 0; kk < PACK_OUT; ++kk) + for(int32_t c = 0; c < C; ++c) + for(int32_t t = 0; t < T; ++t) + for(int32_t r = 0; r < R; ++r) + for(int32_t s = 0; s < S; ++s){ + int32_t d = mm + t; + int32_t h = pp + r; + int32_t w = qq + s; + bool in_bounds = (d >= 0 && h >= 0 && w >= 0 && d < D && h < H && w < W); + IN_DTYPE i = in_bounds?I[idx(n, c, d, h, w, N, C, D, H, W)]:0; + IN_DTYPE f = F[idx(c, t, r, s, k*PACK_OUT + kk, C, T, R, S, K*PACK_OUT)]; + accs[kk] += i*f; + } + O[o][idx(n, k, m, p, q, N, K, M, P, Q)] = accs[0]; + } +} void autotune(llvm::TargetMachine *machine, llvm::Module &module){ // Target parameters @@ -95,8 +145,6 @@ void autotune(llvm::TargetMachine *machine, llvm::Module &module){ } int main(){ -// llvm::DebugFlag = true; - std::string error; llvm::InitializeAllTargetInfos(); @@ -162,10 +210,15 @@ int main(){ llvm::Constant *_0 = llvm::ConstantTile::get(_f0, {bm, bn}); // LUT + unsigned num_delta = nlut; + unsigned num_inc_delta = nlut; + unsigned num_masks = nlut; + unsigned num_inc_masks = nlut; + unsigned cst_size = num_delta + num_inc_delta + num_masks + num_inc_masks; llvm::GlobalVariable *lut_array = - new llvm::GlobalVariable(*module, llvm::ArrayType::get(int32_t, nlut), false, llvm::GlobalVariable::InternalLinkage, + new llvm::GlobalVariable(*module, llvm::ArrayType::get(int32_t, cst_size), false, llvm::GlobalVariable::InternalLinkage, nullptr, "lut_array", nullptr, llvm::GlobalVariable::NotThreadLocal, 4); - llvm::Value *lut_ptr = builder.CreateBitCast(lut_array, lut_ptr_t); + llvm::Value *cst_ptr = builder.CreateBitCast(lut_array, lut_ptr_t); // Function @@ -177,7 +230,7 @@ int main(){ F->addAttribute(2, llvm::Attribute::ReadOnly); F->addAttribute(2, llvm::Attribute::NoAlias); std::transform(F->arg_begin(), F->arg_end(), std::back_inserter(args), [&](llvm::Argument& x){ return &x;}); - llvm::Value *base_o_ptr = args[0], *base_i_ptr = args[1], *base_f_ptr = args[2]; + llvm::Value *base_pc = args[0], *base_pa = args[1], *base_pb = args[2]; llvm::Value *C = args[3], *H = args[4], *W = args[5], *N = args[6], *K = args[7]; llvm::Value *R = builder.getInt32(RR), *S = builder.getInt32(SS); @@ -191,10 +244,10 @@ int main(){ // First basic block builder.SetInsertPoint(PrologBB); - llvm::Value* sa0 = builder.CreateCall(read_slice_x, {bm}, "i_slice_pqn"); - llvm::Value* sb0 = builder.CreateCall(read_slice_y, {bn}, "f_slice_k"); - llvm::Value* sa1 = builder.CreateCall(range, {builder.getInt32(0), bk}, "i_slice_crs"); - llvm::Value* sb1 = builder.CreateCall(range, {builder.getInt32(0), bk}, "f_slice_crs"); + llvm::Value* sa0 = builder.CreateCall(read_slice_x, {bm}, "sa0"); + llvm::Value* sb0 = builder.CreateCall(read_slice_y, {bn}, "sb0"); + llvm::Value* sa1 = builder.CreateCall(range, {builder.getInt32(0), bk}, "sa1"); + llvm::Value* sb1 = builder.CreateCall(range, {builder.getInt32(0), bk}, "sb1"); llvm::Value* lda_w = builder.getInt32(1); llvm::Value* lda_h = builder.CreateMul(lda_w, W); @@ -227,30 +280,31 @@ int main(){ offa_1 = builder.CreateAdd(offa_1, builder.CreateMul(sa_s, builder.CreateCall(splat_1d, {bk, lda_w}))); // Images pointer llvm::Value* off_a = builder.CreateCall(outer_add, {offa_0, offa_1}); - llvm::Value* start_pa = builder.CreateCall(gtp_2d, {base_i_ptr, off_a}, "start_i_ptr"); - llvm::LoadInst* start_aa = builder.CreateLoad(start_pa, false, "start_i_val"); - llvm::Value* start_a = builder.CreateCall(reshape, 
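// A hedged usage sketch for the cpp_conv_nchw reference above; the buffers
// and sizes are illustrative assumptions, not from this commit (I and F are
// assumed std::vector<float> inputs). With unit strides and no padding the
// output extents are M = D - T + 1, P = H - R + 1, Q = W - S + 1:
//   std::vector<std::vector<float>> O(1, std::vector<float>(N*K*M*P*Q));
//   cpp_conv_nchw<float>(C, N, K, D, H, W, T, R, S,
//                        /*pads*/ 0, 0, 0, /*strides*/ 1, 1, 1,
//                        M, P, Q, O, I.data(), F.data());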
{start_aa, bm, bk}, "start_i"); + llvm::Value* start_pa = builder.CreateCall(gtp_2d, {base_pa, off_a}, "start_pa"); + llvm::LoadInst* start_aa = builder.CreateLoad(start_pa, false, "start_aa"); + llvm::Value* start_a = builder.CreateCall(reshape, {start_aa, bm, bk}, "start_a"); // Filters pointer llvm::Value* tldb_s = builder.CreateCall(splat_1d, {bk, K}); - llvm::Value* off_b = builder.CreateCall(outer_add, {sb0, builder.CreateMul(sb1, tldb_s)}, "off_f"); - llvm::Value* start_pb = builder.CreateCall(gtp_2d, {base_f_ptr, off_b}, "start_f_ptr"); - llvm::Value* start_bb = builder.CreateLoad(start_pb, false, "start_f_val"); - llvm::Value* start_b = builder.CreateCall(reshape, {start_bb, bn, bk}, "start_f"); + llvm::Value* off_b = builder.CreateCall(outer_add, {sb0, builder.CreateMul(sb1, tldb_s)}, "off_b"); + llvm::Value* start_pb = builder.CreateCall(gtp_2d, {base_pb, off_b}, "start_pb"); + llvm::Value* start_bb = builder.CreateLoad(start_pb, false, "start_bb"); + llvm::Value* start_b = builder.CreateCall(reshape, {start_bb, bn, bk}, "start_b"); // Filters increment - llvm::Value* inc_b_0 = builder.CreateCall(splat_1d, {bn, _s0}, "inc_f_0"); - llvm::Value* inc_b_1 = builder.CreateCall(splat_1d, {bk, builder.CreateMul(bk, ldb_k)}, "inc_f_1"); - llvm::Value* inc_b = builder.CreateCall(outer_add, {inc_b_0, inc_b_1}, "inc_f"); - // Delta pointers - llvm::Value* base_incdelta = lut_ptr; + llvm::Value* inc_b_0 = builder.CreateCall(splat_1d, {bn, _s0}, "inc_b_0"); + llvm::Value* inc_b_1 = builder.CreateCall(splat_1d, {bk, builder.CreateMul(bk, ldb_k)}, "inc_b_1"); + llvm::Value* inc_b = builder.CreateCall(outer_add, {inc_b_0, inc_b_1}, "inc_b"); + // Pointers to constant memory + llvm::Value* base_incdelta = builder.CreateGEP(cst_ptr, builder.getInt32(0)); + llvm::Value* base_delta = builder.CreateGEP(cst_ptr, builder.getInt32(num_inc_delta)); + llvm::Value* base_incmask = builder.CreateGEP(cst_ptr, builder.getInt32(num_delta)); + llvm::Value* base_mask = builder.CreateGEP(cst_ptr, builder.getInt32(num_inc_masks)); + // Delta pointers llvm::Value* start_pincdelta = builder.CreateCall(gtp_1d, {base_incdelta, sa1}, "start_pincdelta"); - llvm::Value* base_delta = builder.CreateGEP(lut_ptr, builder.getInt32(nlut)); llvm::Value* start_pdelta = builder.CreateCall(gtp_1d, {base_delta, builder.CreateCall(splat_1d, {bk, _s0})}, "start_pdelta"); // Masks llvm::Value* _1 = builder.CreateCall(splat_1d, {bk, builder.getInt32(1)}); llvm::Value* mask_a_1 = builder.CreateShl(_1, sa1); - llvm::Value* base_incmask = builder.CreateGEP(lut_ptr, builder.getInt32(2*nlut), "base_incmask"); llvm::Value* start_pincmask = builder.CreateCall(gtp_1d, {base_incmask, sa0}, "start_pincmask"); - llvm::Value* base_mask = builder.CreateGEP(lut_ptr, builder.getInt32(3*nlut), "base_mask"); llvm::Value* start_pmask = builder.CreateCall(gtp_1d, {base_mask, sa0}, "start_pmask"); // Enter loop builder.CreateBr(LoopBB); @@ -341,8 +395,8 @@ int main(){ // Epilogue builder.SetInsertPoint(EpilogueBB); - llvm::Value* sc_pqn = builder.CreateCall(read_slice_x, {bm}, "o_slice_pqn"); - llvm::Value* sc_k = builder.CreateCall(read_slice_y, {bn}, "o_slice_k"); + llvm::Value* sc_pqn = builder.CreateCall(read_slice_x, {bm}, "sc_pqn"); + llvm::Value* sc_k = builder.CreateCall(read_slice_y, {bn}, "sc_k"); // Output strides llvm::Value* ldc_q = builder.getInt32(1); llvm::Value* ldc_p = builder.CreateMul(lda_w, W); @@ -360,7 +414,7 @@ int main(){ llvm::Value* offc1 = builder.CreateMul(sc_k, builder.CreateCall(splat_1d, {bn, ldc_k})); // Output pointer 
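// The constant pool built above packs four nlut-entry tables into a single
// global array (cst_size = num_delta + num_inc_delta + num_masks +
// num_inc_masks). A sketch of one consistent back-to-back layout, matching
// the 0 / nlut / 2*nlut / 3*nlut offsets used before this refactoring:
//   off_incdelta = 0;
//   off_delta    = off_incdelta + num_inc_delta;
//   off_incmask  = off_delta    + num_delta;
//   off_mask     = off_incmask  + num_inc_masks;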
llvm::Value* offc = builder.CreateCall(outer_add, {offc0, offc1});
- llvm::Value* pc = builder.CreateCall(gtp_2d, {base_o_ptr, offc});
+ llvm::Value* pc = builder.CreateCall(gtp_2d, {base_pc, offc});
// Output masks
llvm::Value* in_bounds_c0 = builder.CreateICmpSLT(sc_pqn, builder.CreateCall(splat_1d, {bm, PQN}));
llvm::Value* in_bounds_c1 = builder.CreateICmpSLT(sc_k, builder.CreateCall(splat_1d, {bn, K}));
diff --git a/gemm.cpp b/gemm.cpp
index 5adc3ebbd..5433fd8d9 100644
--- a/gemm.cpp
+++ b/gemm.cpp
@@ -121,18 +121,19 @@ int main(){
llvm::IntegerType* int32_t = llvm::Type::getInt32Ty(context);
llvm::IntegerType* int1_t = llvm::Type::getInt1Ty(context);
- llvm::Type* tile_t = llvm::TileType::get(numeric_t, 2);
+ llvm::Type* tile2d_t = llvm::TileType::get(numeric_t, 2);
+ llvm::Type* tile3d_t = llvm::TileType::get(numeric_t, 3);
llvm::Type* int32_slice_t = llvm::TileType::get(int32_t, 1);
llvm::Type* int32_tile_t = llvm::TileType::get(int32_t, 2);
llvm::Type* int1_slice_t = llvm::TileType::get(int1_t, 1);
llvm::Type* int1_tile_t = llvm::TileType::get(int1_t, 2);
- llvm::PointerType* tile_ptr_t = llvm::PointerType::get(tile_t, 0);
+ llvm::PointerType* tile2d_ptr_t = llvm::PointerType::get(tile2d_t, 0);
llvm::Function* read_slice_x = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_read_slice_x, {int32_slice_t});
llvm::Function* read_slice_y = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_read_slice_y, {int32_slice_t});
llvm::Function* range = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_range, {int32_slice_t});
- llvm::Function* gtp = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_gtp_2d, {tile_ptr_t, numeric_ptr_t, int32_tile_t});
- llvm::Function* stp = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_stp_2d, {tile_ptr_t, int32_tile_t});
+ llvm::Function* gtp = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_gtp_2d, {tile2d_ptr_t, numeric_ptr_t, int32_tile_t});
+ llvm::Function* stp = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_stp_2d, {tile2d_ptr_t, int32_tile_t});
llvm::Intrinsic::ID mma_id;
if(!AT && !BT) mma_id = llvm::Intrinsic::tlvm_mma_nn;
if(!AT && BT) mma_id = llvm::Intrinsic::tlvm_mma_nt;
@@ -140,17 +141,18 @@ int main(){
if(AT && BT) mma_id = llvm::Intrinsic::tlvm_mma_tt;
llvm::Function* outer_add = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_outer_add, {int32_tile_t, int32_slice_t, int32_slice_t});
llvm::Function* outer_and = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_outer_and, {int1_tile_t, int1_slice_t, int1_slice_t});
- llvm::Function* mma = llvm::Intrinsic::getDeclaration(module.get(), mma_id, {tile_t});
- llvm::Function* reshape = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_reshape_2d, {tile_t});
+ llvm::Function* mma = llvm::Intrinsic::getDeclaration(module.get(), mma_id, {tile3d_t});
+ llvm::Function* reshape = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_reshape_3d, {tile3d_t, tile2d_t});
llvm::Function* splat_2d = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_splat_2d, {mask_tile_t, bool_t});
llvm::Function* splat_1d = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_splat_1d, {int32_slice_t, int32_t});
- llvm::Function* masked_load = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_masked_load, {tile_t, tile_ptr_t, mask_tile_t});
- llvm::Function* masked_store = 
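// Assumed per-element contract of the masked tile accesses used throughout
// (a sketch inferred from the call sites, not a verified intrinsic spec):
//   masked_load : out[i][j] = mask[i][j] ? ptr[i][j] : 0;
//   masked_store: if (mask[i][j]) ptr[i][j] = val[i][j];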
llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_masked_store, {tile_t, tile_ptr_t, mask_tile_t}); + llvm::Function* masked_load = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_masked_load, {tile2d_t, tile2d_ptr_t, mask_tile_t}); + llvm::Function* masked_store = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_masked_store, {tile2d_t, tile2d_ptr_t, mask_tile_t}); // Hyperparameters llvm::Hyperparameter *bm = llvm::Hyperparameter::get(int32_t, 0); llvm::Hyperparameter *bn = llvm::Hyperparameter::get(int32_t, 1); llvm::Hyperparameter *bk = llvm::Hyperparameter::get(int32_t, 2); + llvm::Hyperparameter *br = llvm::Hyperparameter::get(int32_t, 3); // Constants llvm::Constant *_s0 = llvm::ConstantInt::get(int32_t, 0); @@ -221,8 +223,8 @@ int main(){ llvm::CallInst* startpb = builder.CreateCall(gtp, {arguments[1], offb}, "startpb"); llvm::LoadInst* startfa = builder.CreateLoad(startpa, "startfa"); llvm::LoadInst* startfb = builder.CreateLoad(startpb, "startfb"); - llvm::Value* starta = builder.CreateCall(reshape, {startfa, ba0, ba1}, "starta"); - llvm::Value* startb = builder.CreateCall(reshape, {startfb, bb0, bb1}, "startb"); + llvm::Value* starta = builder.CreateCall(reshape, {startfa, ba0, ba1, br}, "starta"); + llvm::Value* startb = builder.CreateCall(reshape, {startfb, bb0, bb1, br}, "startb"); llvm::Value* tinca0 = builder.CreateCall(splat_1d, {ba0, builder.CreateMul(inca0, AS0)}, "tinca0"); llvm::Value* tinca1 = builder.CreateCall(splat_1d, {ba1, builder.CreateMul(inca1, AS1)}); llvm::Value* tincb0 = builder.CreateCall(splat_1d, {bb0, builder.CreateMul(incb0, BS0)}); @@ -261,8 +263,8 @@ int main(){ // Pre-fetch llvm::Value* nextfa = builder.CreateCall(masked_load, {nextpa, maska}, "nextfa"); llvm::Value* nextfb = builder.CreateCall(masked_load, {nextpb, maskb}, "nextfb"); - llvm::Value* nexta = builder.CreateCall(reshape, {nextfa, ba0, ba1}, "nexta"); - llvm::Value* nextb = builder.CreateCall(reshape, {nextfb, bb0, bb1}, "nextb"); + llvm::Value* nexta = builder.CreateCall(reshape, {nextfa, ba0, ba1, br}, "nexta"); + llvm::Value* nextb = builder.CreateCall(reshape, {nextfb, bb0, bb1, br}, "nextb"); a->addIncoming(starta, PrologBB); a->addIncoming(nexta, LoopBB); b->addIncoming(startb, PrologBB); @@ -283,8 +285,8 @@ int main(){ llvm::Value* lastmaskb = builder.CreateCall(outer_and, {in_bounds_b0, in_bounds_b1}, "lastmaskb"); llvm::Value* lastfa = builder.CreateCall(masked_load, {nextpa, lastmaska}, "lastfa"); llvm::Value* lastfb = builder.CreateCall(masked_load, {nextpb, lastmaskb}, "lastfb"); - llvm::Value* lasta = builder.CreateCall(reshape, {lastfa, ba0, ba1}, "lasta"); - llvm::Value* lastb = builder.CreateCall(reshape, {lastfb, bb0, bb1}, "lastb"); + llvm::Value* lasta = builder.CreateCall(reshape, {lastfa, ba0, ba1, br}, "lasta"); + llvm::Value* lastb = builder.CreateCall(reshape, {lastfb, bb0, bb1, br}, "lastb"); llvm::Value* loop = builder.CreateICmpSGT(nextk, _s0); a->addIncoming(lasta, LastIterBB); b->addIncoming(lastb, LastIterBB); From a7a3d57f3c9f949968cba4055a36f8f8e4c194fa Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 3 Dec 2018 07:44:45 -0500 Subject: [PATCH 006/494] FindLLVM --- cmake/FindLLVM.cmake | 88 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 cmake/FindLLVM.cmake diff --git a/cmake/FindLLVM.cmake b/cmake/FindLLVM.cmake new file mode 100644 index 000000000..b3196d444 --- /dev/null +++ b/cmake/FindLLVM.cmake @@ -0,0 +1,88 @@ +# - Find LLVM +# This 
module can be used to find LLVM. +# It requires that the llvm-config executable be available on the system path. +# Once found, llvm-config is used for everything else. +# +# Typical usage could be: +# find_package(LLVM QUIET REQUIRED COMPONENTS jit native interpreter) +# +# If the QUIET flag is not set, the specified components and LLVM version are +# outputted. +# +# If the COMPONENTS are not set, the default set of "all" is used. +# +# The following variables are set: +# +# LLVM_FOUND - Set to YES if LLVM is found. +# LLVM_VERSION - Set to the decimal version of the LLVM library. +# LLVM_C_FLAGS - All flags that should be passed to a C compiler. +# LLVM_CXX_FLAGS - All flags that should be passed to a C++ compiler. +# LLVM_CPP_FLAGS - All flags that should be passed to the C pre-processor. +# LLVM_LD_FLAGS - Additional flags to pass to the linker. +# LLVM_LIBRARY_DIRS - A list of directories where the LLVM libraries are located. +# LLVM_INCLUDE_DIRS - A list of directories where the LLVM headers are located. +# LLVM_LIBRARIES - A list of libraries which should be linked against. + +# A macro to run llvm config +macro(_llvm_config _var_name) + # Firstly, locate the LLVM config executable + find_program(_llvm_config_exe + NAMES llvm-config + PATHS /home/philippe/Development/llvm-tlvm/build/bin/ + DOC "llvm-config executable location" + ) + + # If no llvm-config executable was found, set the output variable to not + # found. + if(NOT _llvm_config_exe) + set(${_var_name} "${_var_name}-NOTFOUND") + else(NOT _llvm_config_exe) + # Otherwise, run llvm-config + execute_process( + COMMAND ${_llvm_config_exe} ${ARGN} + OUTPUT_VARIABLE ${_var_name} + RESULT_VARIABLE _llvm_config_retval + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(RESULT_VARIABLE) + message(SEND_ERROR + "Error running llvm-config with arguments: ${ARGN}") + endif(RESULT_VARIABLE) + endif(NOT _llvm_config_exe) +endmacro(_llvm_config) + +# The default set of components +set(_llvm_components all) + +# If components have been specified via find_package, use them +if(LLVM_FIND_COMPONENTS) + set(_llvm_components ${LLVM_FIND_COMPONENTS}) +endif(LLVM_FIND_COMPONENTS) + +if(NOT LLVM_FIND_QUIETLY) + message(STATUS "Looking for LLVM components: ${_llvm_components}") +endif(NOT LLVM_FIND_QUIETLY) + +_llvm_config(LLVM_VERSION --version) +_llvm_config(LLVM_C_FLAGS --cflags) +_llvm_config(LLVM_CXX_FLAGS --cxxflags) +_llvm_config(LLVM_CPP_FLAGS --cppflags) +_llvm_config(LLVM_LD_FLAGS --ldflags) +_llvm_config(LLVM_LIBRARY_DIRS --libdir) +_llvm_config(LLVM_INCLUDE_DIRS --includedir) +_llvm_config(LLVM_LIBRARIES --libs) + +if(NOT LLVM_FIND_QUIETLY) + message(STATUS "Found LLVM version: ${LLVM_VERSION}") +endif(NOT LLVM_FIND_QUIETLY) + +# handle the QUIETLY and REQUIRED arguments and set LLVM_FOUND to TRUE if +# all listed variables are TRUE +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(LLVM + DEFAULT_MSG + LLVM_LIBRARIES + LLVM_INCLUDE_DIRS + LLVM_LIBRARY_DIRS) + +# vim:sw=4:ts=4:autoindent From dc755612b970042e7fe3a57281525b5f09d279ea Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 15 Dec 2018 22:29:36 -0500 Subject: [PATCH 007/494] TDL [Parser]: Initial commit --- CMakeLists.txt | 28 +-- README.md | 1 - ast.h | 295 ++++++++++++++++++++++++++++ cmake/FindLLVM.cmake | 88 --------- conv.cpp | 456 ------------------------------------------- gemm.cpp | 342 -------------------------------- main.cpp | 14 ++ parser.y | 305 +++++++++++++++++++++++++++++ scanner.l | 128 ++++++++++++ 9 files changed, 749 insertions(+), 908 
deletions(-) delete mode 100644 README.md create mode 100644 ast.h delete mode 100644 cmake/FindLLVM.cmake delete mode 100644 conv.cpp delete mode 100644 gemm.cpp create mode 100644 main.cpp create mode 100644 parser.y create mode 100644 scanner.l diff --git a/CMakeLists.txt b/CMakeLists.txt index 3f1650aca..308a86ad1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,21 +1,7 @@ -cmake_minimum_required(VERSION 2.8.7) -project(TDL) - -# LLVM -include(cmake/FindLLVM.cmake) - -# Link directories -link_directories(/home/philippe/Development/llvm-tlvm/build/lib) -# Include directories -include_directories(/home/philippe/Development/llvm-tlvm/include) -include_directories(/home/philippe/Development/llvm-tlvm/build/include) - -# Flags -# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Wextra -pedantic -Wno-strict-aliasing") - -# Executables -foreach(PROG gemm conv) - add_executable(${PROG} ${PROG}.cpp) - set_target_properties(${PROG} PROPERTIES OUTPUT_NAME ${PROG}) - target_link_libraries(${PROG} ${LLVM_LIBRARIES}) -endforeach() +find_package(BISON) +BISON_TARGET(Parser parser.y ${CMAKE_CURRENT_BINARY_DIR}/parser.cpp) +find_package(FLEX) +FLEX_TARGET(Lexer scanner.l ${CMAKE_CURRENT_BINARY_DIR}/scanner.cpp) +get_filename_component(BISON_Parser_INCLUDE_DIRECTORIES ${BISON_Parser_OUTPUT_HEADER} DIRECTORY) +include_directories(${BISON_Parser_INCLUDE_DIRECTORIES}) +add_executable(test main.cpp ${BISON_Parser_OUTPUTS} ${FLEX_Lexer_OUTPUTS}) diff --git a/README.md b/README.md deleted file mode 100644 index 1a34b0d0e..000000000 --- a/README.md +++ /dev/null @@ -1 +0,0 @@ -# tdl-examples diff --git a/ast.h b/ast.h new file mode 100644 index 000000000..6cf22e89b --- /dev/null +++ b/ast.h @@ -0,0 +1,295 @@ +#include "parser.hpp" +#include +#include +#include + +typedef yytokentype token_type; + +namespace ast{ + +class node { }; + +template +class list: public node { +public: + list(const T& x): values_{x} {} + node* append(const T& x) { values_.push_back(x); return this;} + +private: + std::list values_; +}; + +template +node* append_ptr_list(node *result, node *in){ + return static_cast*>(result)->append((T*)in); +} + +class binary_operator: public node{ +public: + enum OP_T{ + MUL, DIV, REM, + ADD, SUB, + LEFT_SHIFT, RIGHT_SHIFT, + LT, GT, + LE, GE, + EQ, NE, + AND, XOR, OR, + LAND, LOR + }; + + static OP_T get_op(token_type token){ + switch(token){ + case LEFT_OP: return LEFT_SHIFT; + case RIGHT_OP: return RIGHT_SHIFT; + case LE_OP: return LE; + case GE_OP: return GE; + case EQ_OP: return EQ; + case NE_OP: return NE; + case AND_OP: return LAND; + case OR_OP: return LOR; + default: assert(false && "unreachable"); throw; + } + } + + static OP_T get_op(char token){ + switch(token){ + case '*': return MUL; + case '/': return DIV; + case '%': return REM; + case '+': return ADD; + case '-': return SUB; + case '<': return LT; + case '>': return GT; + case '&': return AND; + case '^': return XOR; + case '|': return OR; + default: assert(false && "unreachable"); throw; + } + } + +public: + binary_operator(token_type op, node *lhs, node *rhs) + : op_(get_op(op)), lhs_(lhs), rhs_(rhs) { } + binary_operator(char op, node *lhs, node *rhs) + : op_(get_op(op)), lhs_(lhs), rhs_(rhs){ } + +private: + const OP_T op_; + const node *lhs_; + const node *rhs_; +}; + + +class constant: public node{ +public: + constant(int value): value_(value) { } + +private: + const int value_; +}; + +class identifier: public node{ +public: + identifier(char *&name): name_(name) { } + +private: + std::string name_; +}; + +class 
string_literal: public node{ +public: + string_literal(char *&value): value_(value) { } + +public: + std::string value_; +}; + +class unary_operator: public node{ +public: + unary_operator(token_type token, node *arg): token_(token), arg_(arg) { } + +private: + const token_type token_; + const node *arg_; +}; + +class cast_operator: public node{ +public: + cast_operator(token_type type, node *arg): type_(type), arg_(arg) { } + +public: + const token_type type_; + const node *arg_; +}; + +class conditional_expression: public node{ +public: + conditional_expression(node *cond, node *true_value, node *false_value) + : cond_(cond), true_value_(true_value), false_value_(false_value) { } + +public: + const node *cond_; + const node *true_value_; + const node *false_value_; +}; + +class assignment_expression: public node{ + typedef binary_operator::OP_T op_t; + +public: + assignment_expression(node *lvalue, token_type op, node *rvalue) + : lvalue_(lvalue), op_(binary_operator::get_op(op)), rvalue_(rvalue) { } + +public: + op_t op_; + const node *lvalue_; + const node *rvalue_; +}; + +class compound_statement: public node{ +public: + compound_statement() : statements_() {} + compound_statement(node *stmt): statements_{stmt} {} + compound_statement* append(node *stmt) { statements_.push_back(stmt); return this; } + +private: + std::list statements_; +}; + +class selection_statement: public node{ +public: + selection_statement(node *cond, node *if_value, node *else_value = nullptr) + : cond_(cond), if_value_(if_value), else_value_(else_value) { } + +public: + const node *cond_; + const node *if_value_; + const node *else_value_; +}; + +class iteration_statement: public node{ +public: + iteration_statement(node *init, node *stop, node *exec, node *statements) + : init_(init), stop_(stop), exec_(exec), statements_(statements) { } + +private: + const node *init_; + const node *stop_; + const node *exec_; + const node *statements_; +}; + +class no_op: public node { }; + +// Types +class declarator: public node{ + +}; + +class pointer_declarator: public declarator{ +public: + pointer_declarator(unsigned order) + : order_(order) { } + + pointer_declarator *inc(){ + order_ += 1; + return this; + } + +private: + unsigned order_; +}; + +class tile_declarator: public declarator{ +public: + tile_declarator(node *shapes) + : shapes_((list*)(shapes)) { } + +public: + const list* shapes_; +}; + +class parameter: public declarator { +public: + parameter(token_type type, node *decl) + : type_(type), decl_(decl) { } + +public: + const token_type type_; + const node *decl_; +}; + +class function_declarator: public declarator{ +public: + function_declarator(node *args) + : args_((list)args) { } + +public: + const list args_; +}; + +class compound_declarator: public declarator{ +public: + compound_declarator(node *ptr, node *tile) + : ptr_(ptr), tile_(tile) { } + +public: + const node *ptr_; + const node *tile_; +}; + +class init_declarator : public declarator{ +public: + init_declarator(node *decl, node *initializer) + : decl_(decl), initializer_(initializer){ } + +public: + const node *decl_; + const node *initializer_; +}; + +class declaration: public node{ +public: + declaration(node *spec, node *init) + : spec_(spec), init_(init) { } + +public: + const node *spec_; + const node *init_; +}; + +class type: public node{ +public: + type(token_type spec, node * decl) + : spec_(spec), decl_(decl) { } + +public: + const token_type spec_; + const node *decl_; +}; + +class translation_unit: public node{ +public: + 
translation_unit(node *item) + : decls_(item) { } + + translation_unit *add(node *item) { + decls_.append(item); + return this; + } + +private: + list decls_; +}; + +class function_definition: public node{ +public: + function_definition(node *header, node *body) + : header_((declarator *)header), body_((compound_statement*)body) { } + +public: + const declarator *header_; + const compound_statement *body_; +}; + +} diff --git a/cmake/FindLLVM.cmake b/cmake/FindLLVM.cmake deleted file mode 100644 index b3196d444..000000000 --- a/cmake/FindLLVM.cmake +++ /dev/null @@ -1,88 +0,0 @@ -# - Find LLVM -# This module can be used to find LLVM. -# It requires that the llvm-config executable be available on the system path. -# Once found, llvm-config is used for everything else. -# -# Typical usage could be: -# find_package(LLVM QUIET REQUIRED COMPONENTS jit native interpreter) -# -# If the QUIET flag is not set, the specified components and LLVM version are -# outputted. -# -# If the COMPONENTS are not set, the default set of "all" is used. -# -# The following variables are set: -# -# LLVM_FOUND - Set to YES if LLVM is found. -# LLVM_VERSION - Set to the decimal version of the LLVM library. -# LLVM_C_FLAGS - All flags that should be passed to a C compiler. -# LLVM_CXX_FLAGS - All flags that should be passed to a C++ compiler. -# LLVM_CPP_FLAGS - All flags that should be passed to the C pre-processor. -# LLVM_LD_FLAGS - Additional flags to pass to the linker. -# LLVM_LIBRARY_DIRS - A list of directories where the LLVM libraries are located. -# LLVM_INCLUDE_DIRS - A list of directories where the LLVM headers are located. -# LLVM_LIBRARIES - A list of libraries which should be linked against. - -# A macro to run llvm config -macro(_llvm_config _var_name) - # Firstly, locate the LLVM config executable - find_program(_llvm_config_exe - NAMES llvm-config - PATHS /home/philippe/Development/llvm-tlvm/build/bin/ - DOC "llvm-config executable location" - ) - - # If no llvm-config executable was found, set the output variable to not - # found. 
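As a quick illustration of the ast.h nodes introduced above, here is a hedged, standalone sketch of what the Bison actions in parser.y might build for the expression x + 2 * 3; the driver function and variable names are assumptions, not part of this commit:

#include "ast.h"

int ast_example() {
  char name_buf[] = "x";
  char *name = name_buf;                   // identifier(char*&) wants an lvalue
  ast::node *x = new ast::identifier(name);
  ast::node *prod = new ast::binary_operator('*', new ast::constant(2),
                                             new ast::constant(3));   // 2 * 3
  ast::node *sum = new ast::binary_operator('+', x, prod);            // x + (2*3)
  return sum != nullptr;                   // in practice the parser owns the nodes
}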
- if(NOT _llvm_config_exe) - set(${_var_name} "${_var_name}-NOTFOUND") - else(NOT _llvm_config_exe) - # Otherwise, run llvm-config - execute_process( - COMMAND ${_llvm_config_exe} ${ARGN} - OUTPUT_VARIABLE ${_var_name} - RESULT_VARIABLE _llvm_config_retval - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - if(RESULT_VARIABLE) - message(SEND_ERROR - "Error running llvm-config with arguments: ${ARGN}") - endif(RESULT_VARIABLE) - endif(NOT _llvm_config_exe) -endmacro(_llvm_config) - -# The default set of components -set(_llvm_components all) - -# If components have been specified via find_package, use them -if(LLVM_FIND_COMPONENTS) - set(_llvm_components ${LLVM_FIND_COMPONENTS}) -endif(LLVM_FIND_COMPONENTS) - -if(NOT LLVM_FIND_QUIETLY) - message(STATUS "Looking for LLVM components: ${_llvm_components}") -endif(NOT LLVM_FIND_QUIETLY) - -_llvm_config(LLVM_VERSION --version) -_llvm_config(LLVM_C_FLAGS --cflags) -_llvm_config(LLVM_CXX_FLAGS --cxxflags) -_llvm_config(LLVM_CPP_FLAGS --cppflags) -_llvm_config(LLVM_LD_FLAGS --ldflags) -_llvm_config(LLVM_LIBRARY_DIRS --libdir) -_llvm_config(LLVM_INCLUDE_DIRS --includedir) -_llvm_config(LLVM_LIBRARIES --libs) - -if(NOT LLVM_FIND_QUIETLY) - message(STATUS "Found LLVM version: ${LLVM_VERSION}") -endif(NOT LLVM_FIND_QUIETLY) - -# handle the QUIETLY and REQUIRED arguments and set LLVM_FOUND to TRUE if -# all listed variables are TRUE -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(LLVM - DEFAULT_MSG - LLVM_LIBRARIES - LLVM_INCLUDE_DIRS - LLVM_LIBRARY_DIRS) - -# vim:sw=4:ts=4:autoindent diff --git a/conv.cpp b/conv.cpp deleted file mode 100644 index fa99d301e..000000000 --- a/conv.cpp +++ /dev/null @@ -1,456 +0,0 @@ -#include - -#include "llvm/ADT/APFloat.h" -#include "llvm/ADT/Optional.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ExecutionEngine/ExecutionEngine.h" -#include "llvm/IR/BasicBlock.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Type.h" -#include "llvm/IR/Verifier.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/Host.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/TargetSelect.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/Support/Debug.h" -#include "llvm/Transforms/Utils/Cloning.h" - -// Index computation -inline int32_t idx(int32_t x, int32_t y, int32_t z, int32_t w, int32_t u, - int32_t /*s0*/, int32_t s1, int32_t s2, int32_t s3, int32_t s4) -{ return u + w*s4 + z*s4*s3 + y*s4*s3*s2 + x*s4*s3*s2*s1; } - -template -void cpp_conv_nchw(int32_t C, int32_t N, int32_t K, - int32_t D, int32_t H, int32_t W, - int32_t T, int32_t R, int32_t S, - int32_t pad_d, int32_t pad_h, int32_t pad_w, - int32_t stride_d, int32_t stride_h, int32_t stride_w, - int32_t M, int32_t P, int32_t Q, - std::vector>& O, IN_DTYPE* I, IN_DTYPE* F) -{ - size_t num_outputs = O.size(); - static const int PACK_IN = 1; - static const int PACK_OUT = 1; - if(C % PACK_IN != 0) throw std::runtime_error("Number of input channels must be a multiple of 4"); - if(K % PACK_OUT != 0) throw std::runtime_error("Number of output channels must be a multiple of 4"); - C /= PACK_IN; - K /= PACK_OUT; - int32_t Kout = K; - IN_DTYPE 
accs[PACK_OUT]; - for(size_t o = 0; o < num_outputs; o++) - for(int32_t m = 0 ; m < M; ++m) - for(int32_t p = 0 ; p < P; ++p) - for(int32_t q = 0; q < Q; ++q) - for(int32_t n = 0; n < N; ++n) - for(int32_t k = 0; k < Kout ; ++k) - { - for(int32_t i = 0 ; i < PACK_OUT; ++i) - accs[i] = 0; - int32_t mm = m*stride_d - pad_d; - int32_t pp = p*stride_h - pad_h; - int32_t qq = q*stride_w - pad_w; - for(int32_t kk = 0; kk < PACK_OUT; ++kk) - for(int32_t c = 0; c < C; ++c) - for(int32_t t = 0; t < T; ++t) - for(int32_t r = 0; r < R; ++r) - for(int32_t s = 0; s < S; ++s){ - int32_t d = mm + t; - int32_t h = pp + r; - int32_t w = qq + s; - bool in_bounds = (d >= 0 && h >= 0 && w >= 0 && d < D && h < H && w < W); - IN_DTYPE i = in_bounds?I[idx(n, c, d, h, w, N, C, D, H, W)]:0; - IN_DTYPE f = F[idx(c, t, r, s, k*PACK_OUT + kk, C, T, R, S, K*PACK_OUT)]; - accs[kk] += i*f; - } - O[o][idx(n, k, m, p, q, N, K, M, P, Q)] = accs[0]; - } -} - -void autotune(llvm::TargetMachine *machine, llvm::Module &module){ - // Target parameters - std::vector ranges = { - // asm - 2, 16, 1, 64, - // bsn - 2, 16, 1, 64, - // pa - 1, 2, 4, 8, - // pb - 1, 2, 4, - // sm - 2, 1, 16, 2, 2, 2 - }; - - // Function - llvm::Function *F = module.getFunction("kernel"); - - // Auto-tuning - llvm::legacy::PassManager pass; - llvm::TargetPassConfig *pass_config = static_cast(machine)->createPassConfig(pass); - llvm::FunctionPass *tuning_params = pass_config->createTargetTuningParameters(); - tuning_params->runOnFunction(*F); - - - // Gather all parameters - llvm::DenseSet unique; - llvm::SmallVector params; - for(llvm::BasicBlock &bb: *F) - for(llvm::Instruction &instr: bb){ - // Get tuning parameters for this particular instruction - std::vector tuning_params; - machine->getTargetTuner().getParams(&instr, tuning_params); - for(llvm::TargetTuner::ParamType ¶m: tuning_params){ - // This parameter has not been seen before - if(unique.insert(param.Value).second){ - std::cout << "PARAM: " << instr.getName().data() << " " << param.Name << std::endl; - params.push_back(param.Value); - } - } - } - - // Gather all constraints - std::vector> constraints; - for(llvm::BasicBlock &bb: *F) - for(llvm::Instruction &instr: bb) - machine->getTargetTuner().getConstraints(&instr, constraints); - - // Assign parameters - std::cout << params.size() << " " << ranges.size() << std::endl; - for(unsigned i = 0; i < params.size(); i++) - *params[i] = ranges[i]; - - // Verify constraints - bool valid = true; - for(auto &constraint: constraints){ - valid = valid & constraint(); - } - - if(!valid){ - printf("Invalid kernel parameters\n"); - exit(EXIT_FAILURE); - } -} - -int main(){ - std::string error; - - llvm::InitializeAllTargetInfos(); - llvm::InitializeAllTargets(); - llvm::InitializeAllTargetMCs(); - llvm::InitializeAllAsmParsers(); - llvm::InitializeAllAsmPrinters(); - - // Module - llvm::LLVMContext context; - std::unique_ptr module = llvm::make_unique("TLVM toy example", context); - llvm::IRBuilder<> builder(context); - - unsigned RR = 3, SS = 3; - unsigned Nfilt = RR * SS; - unsigned block = 8; - unsigned nlut = (block + Nfilt - 1)/Nfilt * Nfilt; - - // Globals - llvm::Type* bool_t = llvm::Type::getInt1Ty(context); - llvm::Type* mask_tile_t = llvm::TileType::get(bool_t, 2); - llvm::Type* numeric_t = llvm::Type::getFloatTy(context); - llvm::PointerType* numeric_ptr_t = llvm::PointerType::get(numeric_t, 0); - llvm::IntegerType* int32_t = llvm::Type::getInt32Ty(context); - llvm::PointerType* lut_ptr_t = llvm::PointerType::get(int32_t, 4); - 
llvm::IntegerType* int1_t = llvm::Type::getInt1Ty(context); - - llvm::Type* tile_t = llvm::TileType::get(numeric_t, 2); - llvm::Type* int32_slice_t = llvm::TileType::get(int32_t, 1); - llvm::Type* int32_tile_t = llvm::TileType::get(int32_t, 2); - llvm::Type* int1_slice_t = llvm::TileType::get(int1_t, 1); - llvm::Type* int1_tile_t = llvm::TileType::get(int1_t, 2); - - llvm::PointerType* tile_ptr_t = llvm::PointerType::get(tile_t, 0); - llvm::Function* read_slice_x = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_read_slice_x, {int32_slice_t}); - llvm::Function* read_slice_y = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_read_slice_y, {int32_slice_t}); - llvm::Function* range = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_range, {int32_slice_t}); - llvm::Function* gtp_1d = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_gtp_1d, {int32_slice_t->getPointerTo(4), int32_t->getPointerTo(4), int32_slice_t}); - llvm::Function* stp_1d = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_stp_1d, {int32_slice_t->getPointerTo(4), int32_slice_t}); - - llvm::Function* gtp_2d = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_gtp_2d, {tile_ptr_t, numeric_ptr_t, int32_tile_t}); - llvm::Function* stp_2d = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_stp_2d, {tile_ptr_t, int32_tile_t}); - llvm::Intrinsic::ID mma_id = llvm::Intrinsic::tlvm_mma_nt; - llvm::Function* outer_add = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_outer_add, {int32_tile_t, int32_slice_t, int32_slice_t}); - llvm::Function* outer_and = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_outer_and, {int1_tile_t, int1_slice_t, int1_slice_t}); - llvm::Function* outer_and_int32 = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_outer_and, {int1_tile_t, int32_slice_t, int32_slice_t}); - llvm::Function* mma = llvm::Intrinsic::getDeclaration(module.get(), mma_id, {tile_t}); - llvm::Function* reshape = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_reshape_2d, {tile_t}); - llvm::Function* splat_2d = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_splat_2d, {mask_tile_t, bool_t}); - llvm::Function* splat_1d = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_splat_1d, {int32_slice_t, int32_t}); - - llvm::Function* masked_load = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_masked_load, {tile_t, tile_ptr_t, mask_tile_t}); - llvm::Function* masked_store = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_masked_store, {tile_t, tile_ptr_t, mask_tile_t}); - - // Hyperparameters - llvm::Hyperparameter *bm = llvm::Hyperparameter::get(int32_t, 0); - llvm::Hyperparameter *bn = llvm::Hyperparameter::get(int32_t, 1); - llvm::Hyperparameter *bk = llvm::Hyperparameter::get(int32_t, 2); - - // Constants - llvm::Constant *_s0 = llvm::ConstantInt::get(int32_t, 0); - llvm::Constant *_f0 = llvm::ConstantFP::get(numeric_t, 0); - llvm::Constant *_0 = llvm::ConstantTile::get(_f0, {bm, bn}); - - // LUT - unsigned num_delta = nlut; - unsigned num_inc_delta = nlut; - unsigned num_masks = nlut; - unsigned num_inc_masks = nlut; - unsigned cst_size = num_delta + num_inc_delta + num_masks + num_inc_masks; - llvm::GlobalVariable *lut_array = - new llvm::GlobalVariable(*module, llvm::ArrayType::get(int32_t, cst_size), false, llvm::GlobalVariable::InternalLinkage, - nullptr, 
"lut_array", nullptr, llvm::GlobalVariable::NotThreadLocal, 4); - llvm::Value *cst_ptr = builder.CreateBitCast(lut_array, lut_ptr_t); - - - // Function - llvm::FunctionType* prototype = llvm::FunctionType::get(llvm::Type::getVoidTy(context), std::vector{numeric_ptr_t, numeric_ptr_t, numeric_ptr_t, int32_t, int32_t, int32_t, int32_t, int32_t}, false); - llvm::Function* F = llvm::Function::Create(prototype, llvm::Function::ExternalLinkage, "kernel", module.get()); - std::vector args; - F->addAttribute(1, llvm::Attribute::ReadOnly); - F->addAttribute(1, llvm::Attribute::NoAlias); - F->addAttribute(2, llvm::Attribute::ReadOnly); - F->addAttribute(2, llvm::Attribute::NoAlias); - std::transform(F->arg_begin(), F->arg_end(), std::back_inserter(args), [&](llvm::Argument& x){ return &x;}); - llvm::Value *base_pc = args[0], *base_pa = args[1], *base_pb = args[2]; - llvm::Value *C = args[3], *H = args[4], *W = args[5], *N = args[6], *K = args[7]; - llvm::Value *R = builder.getInt32(RR), *S = builder.getInt32(SS); - - // All basic blocks - llvm::BasicBlock* PrologBB = llvm::BasicBlock::Create(context, "prologue", F); - llvm::BasicBlock* LoopBB = llvm::BasicBlock::Create(context, "loop", F); - llvm::BasicBlock* EarlyExitBB = llvm::BasicBlock::Create(context, "early_exit", F); - llvm::BasicBlock* LastIterBB = llvm::BasicBlock::Create(context, "last_iter", F); - llvm::BasicBlock* EpilogueBB = llvm::BasicBlock::Create(context, "epilogue", F); - - - // First basic block - builder.SetInsertPoint(PrologBB); - llvm::Value* sa0 = builder.CreateCall(read_slice_x, {bm}, "sa0"); - llvm::Value* sb0 = builder.CreateCall(read_slice_y, {bn}, "sb0"); - llvm::Value* sa1 = builder.CreateCall(range, {builder.getInt32(0), bk}, "sa1"); - llvm::Value* sb1 = builder.CreateCall(range, {builder.getInt32(0), bk}, "sb1"); - - llvm::Value* lda_w = builder.getInt32(1); - llvm::Value* lda_h = builder.CreateMul(lda_w, W); - llvm::Value* lda_c = builder.CreateMul(lda_h, H); - llvm::Value* lda_n = builder.CreateMul(lda_c, C); - - llvm::Value* ldb_s = builder.getInt32(1); - llvm::Value* ldb_r = builder.CreateMul(ldb_s, S); - llvm::Value* ldb_c = builder.CreateMul(ldb_r, R); - llvm::Value* ldb_k = builder.CreateMul(ldb_c, C); - - llvm::Value* CRS = builder.CreateMul(C, builder.CreateMul(R, S)); - llvm::Value* PQN = builder.CreateMul(H, builder.CreateMul(W, N)); - - // Images HWN offset - llvm::Value* sa_hw = builder.CreateUDiv(sa0, builder.CreateCall(splat_1d, {bm, N})); - llvm::Value* sa_n = builder.CreateURem(sa0, builder.CreateCall(splat_1d, {bm, N})); - llvm::Value* sa_h = builder.CreateUDiv(sa_hw, builder.CreateCall(splat_1d, {bm, W})); - llvm::Value* sa_w = builder.CreateURem(sa_hw, builder.CreateCall(splat_1d, {bm, W})); - llvm::Value* offa_0 = builder.CreateMul(sa_n, builder.CreateCall(splat_1d, {bm, lda_n})); - offa_0 = builder.CreateAdd(offa_0, builder.CreateMul(sa_h, builder.CreateCall(splat_1d, {bm, lda_h}))); - offa_0 = builder.CreateAdd(offa_0, builder.CreateMul(sa_w, builder.CreateCall(splat_1d, {bm, lda_w}))); - // Images CRS offset - llvm::Value* sa_cr = builder.CreateUDiv(sa1, builder.CreateCall(splat_1d, {bk, S})); - llvm::Value* sa_s = builder.CreateURem(sa1, builder.CreateCall(splat_1d, {bk, S})); - llvm::Value* sa_c = builder.CreateUDiv(sa_cr, builder.CreateCall(splat_1d, {bk, R})); - llvm::Value* sa_r = builder.CreateURem(sa_cr, builder.CreateCall(splat_1d, {bk, R})); - llvm::Value* offa_1 = builder.CreateMul(sa_c, builder.CreateCall(splat_1d, {bk, lda_c})); - offa_1 = builder.CreateAdd(offa_1, 
builder.CreateMul(sa_r, builder.CreateCall(splat_1d, {bk, lda_h}))); - offa_1 = builder.CreateAdd(offa_1, builder.CreateMul(sa_s, builder.CreateCall(splat_1d, {bk, lda_w}))); - // Images pointer - llvm::Value* off_a = builder.CreateCall(outer_add, {offa_0, offa_1}); - llvm::Value* start_pa = builder.CreateCall(gtp_2d, {base_pa, off_a}, "start_pa"); - llvm::LoadInst* start_aa = builder.CreateLoad(start_pa, false, "start_aa"); - llvm::Value* start_a = builder.CreateCall(reshape, {start_aa, bm, bk}, "start_a"); - // Filters pointer - llvm::Value* tldb_s = builder.CreateCall(splat_1d, {bk, K}); - llvm::Value* off_b = builder.CreateCall(outer_add, {sb0, builder.CreateMul(sb1, tldb_s)}, "off_b"); - llvm::Value* start_pb = builder.CreateCall(gtp_2d, {base_pb, off_b}, "start_pb"); - llvm::Value* start_bb = builder.CreateLoad(start_pb, false, "start_bb"); - llvm::Value* start_b = builder.CreateCall(reshape, {start_bb, bn, bk}, "start_b"); - // Filters increment - llvm::Value* inc_b_0 = builder.CreateCall(splat_1d, {bn, _s0}, "inc_b_0"); - llvm::Value* inc_b_1 = builder.CreateCall(splat_1d, {bk, builder.CreateMul(bk, ldb_k)}, "inc_b_1"); - llvm::Value* inc_b = builder.CreateCall(outer_add, {inc_b_0, inc_b_1}, "inc_b"); - // Pointers to constant memory - llvm::Value* base_incdelta = builder.CreateGEP(cst_ptr, builder.getInt32(0)); - llvm::Value* base_delta = builder.CreateGEP(cst_ptr, builder.getInt32(num_inc_delta)); - llvm::Value* base_incmask = builder.CreateGEP(cst_ptr, builder.getInt32(num_delta)); - llvm::Value* base_mask = builder.CreateGEP(cst_ptr, builder.getInt32(num_inc_masks)); - // Delta pointers - llvm::Value* start_pincdelta = builder.CreateCall(gtp_1d, {base_incdelta, sa1}, "start_pincdelta"); - llvm::Value* start_pdelta = builder.CreateCall(gtp_1d, {base_delta, builder.CreateCall(splat_1d, {bk, _s0})}, "start_pdelta"); - // Masks - llvm::Value* _1 = builder.CreateCall(splat_1d, {bk, builder.getInt32(1)}); - llvm::Value* mask_a_1 = builder.CreateShl(_1, sa1); - llvm::Value* start_pincmask = builder.CreateCall(gtp_1d, {base_incmask, sa0}, "start_pincmask"); - llvm::Value* start_pmask = builder.CreateCall(gtp_1d, {base_mask, sa0}, "start_pmask"); - // Enter loop - builder.CreateBr(LoopBB); - builder.SetInsertPoint(LoopBB); - // PHI nodes - llvm::PHINode* c = builder.CreatePHI(_0->getType(), 3, "c"); - llvm::PHINode* crs = builder.CreatePHI(int32_t, 3, "crs"); - llvm::PHINode* pa = builder.CreatePHI(start_pa->getType(), 3, "pa"); - llvm::PHINode* pb = builder.CreatePHI(start_pb->getType(), 3, "pb"); - llvm::PHINode *a = builder.CreatePHI(start_a->getType(), 3, "a"); - llvm::PHINode *b = builder.CreatePHI(start_b->getType(), 3, "b"); - llvm::PHINode *pdelta = builder.CreatePHI(start_pdelta->getType(), 3); - llvm::PHINode *pincdelta = builder.CreatePHI(start_pincdelta->getType(), 3); - llvm::PHINode *pmasks = builder.CreatePHI(start_pmask->getType(), 3); - llvm::PHINode *pincmasks = builder.CreatePHI(start_pincmask->getType(), 3); - llvm::Value* next_c = builder.CreateCall(mma, {a, b, c}, "next_c"); - c->addIncoming(_0, PrologBB); - c->addIncoming(next_c, LoopBB); - // Induction variable - llvm::Value *next_crs = builder.CreateSub(crs, bk); - crs->addIncoming(CRS, PrologBB); - crs->addIncoming(next_crs, LoopBB); - // Update pointer - llvm::Value *inc_delta = builder.CreateLoad(pincdelta); - llvm::Value *inc_mask = builder.CreateLoad(pincmasks); - llvm::Value *inc_a_1 = builder.CreateLoad(pdelta); - llvm::Value *inc_a_0 = builder.CreateCall(splat_1d, {bm, builder.getInt32(0)}); - 
llvm::Value *inc_a = builder.CreateCall(outer_add, {inc_a_0, inc_a_1}); - llvm::Value *next_pa = builder.CreateCall(stp_2d, {pa, inc_a}, "next_pa"); - llvm::Value *next_pb = builder.CreateCall(stp_2d, {pb, inc_b}, "next_pb"); - llvm::Value *next_pdelta = builder.CreateCall(stp_1d, {pdelta, inc_delta}, "next_pdelta"); - llvm::Value *next_pincdelta = builder.CreateCall(stp_1d, {pincdelta, inc_delta}, "next_pincdelta"); - llvm::Value *next_pmask = builder.CreateCall(stp_1d, {pmasks, inc_mask}, "next_pmask"); - llvm::Value *next_pincmask = builder.CreateCall(stp_1d, {pincmasks, inc_mask}, "next_pincmask"); - pdelta->addIncoming(start_pdelta, PrologBB); - pdelta->addIncoming(next_pdelta, LoopBB); - pincdelta->addIncoming(start_pincdelta, PrologBB); - pincdelta->addIncoming(next_pincdelta, LoopBB); - pmasks->addIncoming(start_pmask, PrologBB); - pmasks->addIncoming(next_pmask, LoopBB); - pincmasks->addIncoming(start_pincmask, PrologBB); - pincmasks->addIncoming(next_pincmask, LoopBB); - pa->addIncoming(start_pa, PrologBB); - pa->addIncoming(next_pa, LoopBB); - pb->addIncoming(start_pb, PrologBB); - pb->addIncoming(next_pb, LoopBB); - // End condition - llvm::Value* no_bounds_check = builder.CreateICmpSGT(next_crs, builder.getInt32(0), "no_bounds_check"); - // Masks - llvm::Value* mask_a_0 = builder.CreateLoad(pmasks, "mask_a_0"); - llvm::Value* mask_a_i32 = builder.CreateCall(outer_and_int32, {mask_a_0, mask_a_1}, "mask_a_i32"); - llvm::Value* mask_a = builder.CreateICmpNE(mask_a_i32, llvm::ConstantTile::get(_s0, {bm, bk}), "mask_a"); - llvm::Value* mask_b = builder.CreateCall(splat_2d, {bn, bk, no_bounds_check}, "mask_b"); - // Pre-fetch - llvm::Value* next_aa = builder.CreateCall(masked_load, {next_pa, mask_a}, "next_aa"); - llvm::Value* next_bb = builder.CreateCall(masked_load, {next_pb, mask_b}, "next_bb"); - llvm::Value* next_a = builder.CreateCall(reshape, {next_aa, bm, bk}, "next_a"); - llvm::Value* next_b = builder.CreateCall(reshape, {next_bb, bn, bk}, "next_b"); - a->addIncoming(start_a, PrologBB); - a->addIncoming(next_a, LoopBB); - b->addIncoming(start_b, PrologBB); - b->addIncoming(next_b, LoopBB); - // End condition - builder.CreateCondBr(no_bounds_check, LoopBB, EarlyExitBB); - // Early exit - builder.SetInsertPoint(EarlyExitBB); - llvm::Value* exit = builder.CreateICmpSLE(next_crs, _s0); - builder.CreateCondBr(exit, EpilogueBB, LastIterBB); - // Last Iteration - builder.SetInsertPoint(LastIterBB); - llvm::Value* in_bounds_b0 = builder.CreateICmpSLT(sb0, builder.CreateCall(splat_1d, {bn, K})); - llvm::Value* in_bounds_b1 = builder.CreateICmpSLT(sb1, builder.CreateCall(splat_1d, {bk, next_crs})); - llvm::Value* last_maskb = builder.CreateCall(outer_and, {in_bounds_b0, in_bounds_b1}, "last_maskb"); - llvm::Value* last_bb = builder.CreateCall(masked_load, {next_pb, last_maskb}, "last_bb"); - llvm::Value* last_b = builder.CreateCall(reshape, {last_bb, bn, bk}, "last_b"); - llvm::Value* loop = builder.CreateICmpSGT(next_crs, _s0); - a->addIncoming(next_a, LastIterBB); - b->addIncoming(last_b, LastIterBB); - c->addIncoming(next_c, LastIterBB); - crs->addIncoming(next_crs, LastIterBB); - pa->addIncoming(next_pa, LastIterBB); - pb->addIncoming(next_pb, LastIterBB); - pdelta->addIncoming(next_pdelta, LastIterBB); - pincdelta->addIncoming(next_pincdelta, LastIterBB); - pmasks->addIncoming(next_pmask, LastIterBB); - pincmasks->addIncoming(next_pincmask, LastIterBB); - builder.CreateCondBr(loop, LoopBB, EpilogueBB); - - // Epilogue - builder.SetInsertPoint(EpilogueBB); - llvm::Value* sc_pqn = 
builder.CreateCall(read_slice_x, {bm}, "sc_pqn"); - llvm::Value* sc_k = builder.CreateCall(read_slice_y, {bn}, "sc_k"); - // Output strides - llvm::Value* ldc_q = builder.getInt32(1); - llvm::Value* ldc_p = builder.CreateMul(lda_w, W); - llvm::Value* ldc_k = builder.CreateMul(lda_h, H); - llvm::Value* ldb_n = builder.CreateMul(lda_c, K); - // Output PQN offset - llvm::Value* sc_pq = builder.CreateUDiv(sc_pqn, builder.CreateCall(splat_1d, {bm, N})); - llvm::Value* sc_n = builder.CreateURem(sc_pqn, builder.CreateCall(splat_1d, {bm, N})); - llvm::Value* sc_p = builder.CreateUDiv(sc_pq, builder.CreateCall(splat_1d, {bm, W})); - llvm::Value* sc_q = builder.CreateURem(sc_pq, builder.CreateCall(splat_1d, {bm, W})); - llvm::Value* offc0 = builder.CreateMul(sc_n, builder.CreateCall(splat_1d, {bm, ldb_n})); - offc0 = builder.CreateAdd(offc0, builder.CreateMul(sc_p, builder.CreateCall(splat_1d, {bm, ldc_p}))); - offc0 = builder.CreateAdd(offc0, builder.CreateMul(sc_q, builder.CreateCall(splat_1d, {bm, ldc_q}))); - // Output K offset - llvm::Value* offc1 = builder.CreateMul(sc_k, builder.CreateCall(splat_1d, {bn, ldc_k})); - // Output pointer - llvm::Value* offc = builder.CreateCall(outer_add, {offc0, offc1}); - llvm::Value* pc = builder.CreateCall(gtp_2d, {base_pc, offc}); - // Output masks - llvm::Value* in_bounds_c0 = builder.CreateICmpSLT(sc_pqn, builder.CreateCall(splat_1d, {bm, PQN})); - llvm::Value* in_bounds_c1 = builder.CreateICmpSLT(sc_k, builder.CreateCall(splat_1d, {bn, K})); - llvm::Value* maskc = builder.CreateCall(outer_and, {in_bounds_c0, in_bounds_c1}); - builder.CreateCall(masked_store, {next_c, pc, maskc}); - builder.CreateRet(NULL); - - - // Set metadata - llvm::Metadata *md_args[] = { - llvm::ValueAsMetadata::get(F), - llvm::MDString::get(context, "kernel"), - llvm::ValueAsMetadata::get(llvm::ConstantInt::get(llvm::Type::getInt32Ty(context), 1)) - }; - module->getOrInsertNamedMetadata("nvvm.annotations")->addOperand(llvm::MDNode::get(context, md_args)); - - // Machine - module->setTargetTriple("nvptx64-nvidia-cuda"); - auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error); - - llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), "sm_52", "", - llvm::TargetOptions(), llvm::Reloc::Model(), - llvm::CodeModel::Model(), llvm::CodeGenOpt::Aggressive); - module->setDataLayout(machine->createDataLayout()); - - // Auto-tuning - autotune(machine, *module); - - // Emit - llvm::legacy::PassManager pass; - llvm::SmallVector buffer; - llvm::raw_svector_ostream stream(buffer); - machine->addPassesToEmitFile(pass, stream, nullptr, llvm::TargetMachine::CGFT_AssemblyFile); - pass.run(*module); - std::string src(buffer.begin(), buffer.end()); - - // Execute - std::cout << src << std::endl; -} diff --git a/gemm.cpp b/gemm.cpp deleted file mode 100644 index 5433fd8d9..000000000 --- a/gemm.cpp +++ /dev/null @@ -1,342 +0,0 @@ -#include - -#include "llvm/ADT/APFloat.h" -#include "llvm/ADT/Optional.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ExecutionEngine/ExecutionEngine.h" -#include "llvm/IR/BasicBlock.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Type.h" -#include "llvm/IR/Verifier.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/Host.h" 
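// The emission path above stops at printing the PTX string. A hedged sketch
// of how that string could be launched through the CUDA driver API (grid and
// block sizes are placeholders, argument marshalling and error checking are
// omitted; none of this is part of the commit):
#include <cuda.h>
#include <string>

void launch_ptx(const std::string &ptx) {
  CUdevice dev; CUcontext ctx; CUmodule mod; CUfunction fn;
  cuInit(0);
  cuDeviceGet(&dev, 0);
  cuCtxCreate(&ctx, 0, dev);
  cuModuleLoadData(&mod, ptx.c_str());      // JIT-compile the emitted PTX
  cuModuleGetFunction(&fn, mod, "kernel");  // name given to the llvm::Function
  // kernelParams would carry pc/pa/pb and M, N, K, bound; omitted here
  cuLaunchKernel(fn, 1, 1, 1, 1, 1, 1, 0, nullptr, nullptr, nullptr);
  cuCtxSynchronize();
}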
-#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/TargetSelect.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/Support/Debug.h" -#include "llvm/Transforms/Utils/Cloning.h" - - -bool AT = false; -bool BT = true; - - -void autotune(llvm::TargetMachine *machine, llvm::Module &module){ - // Target parameters - std::vector ranges = { - // asm - 2, 16, 1, 64, - // bsn - 2, 16, 1, 64, - // pa - 1, 2, 4, 8, - // pb - 1, 2, 4, - // sm - 2, 1, 16, 2, 2, 2 - }; - - // Function - llvm::Function *F = module.getFunction("kernel"); - - // Auto-tuning - llvm::legacy::PassManager pass; - llvm::TargetPassConfig *pass_config = static_cast(machine)->createPassConfig(pass); - llvm::FunctionPass *tuning_params = pass_config->createTargetTuningParameters(); - tuning_params->runOnFunction(*F); - - - // Gather all parameters - llvm::DenseSet unique; - llvm::SmallVector params; - for(llvm::BasicBlock &bb: *F) - for(llvm::Instruction &instr: bb){ - // Get tuning parameters for this particular instruction - std::vector tuning_params; - machine->getTargetTuner().getParams(&instr, tuning_params); - for(llvm::TargetTuner::ParamType ¶m: tuning_params){ - // This parameter has not been seen before - if(unique.insert(param.Value).second){ - std::cout << instr.getName().data() << " " << param.Name << std::endl; - params.push_back(param.Value); - } - } - } - - // Gather all constraints - std::vector> constraints; - for(llvm::BasicBlock &bb: *F) - for(llvm::Instruction &instr: bb) - machine->getTargetTuner().getConstraints(&instr, constraints); - - // Assign parameters - std::cout << params.size() << " " << ranges.size() << std::endl; - for(unsigned i = 0; i < params.size(); i++) - *params[i] = ranges[i]; - - // Verify constraints - bool valid = true; - for(auto &constraint: constraints){ - valid = valid & constraint(); - } - - if(!valid){ - printf("Invalid kernel parameters\n"); - exit(EXIT_FAILURE); - } -} - -int main(){ -// llvm::DebugFlag = true; - - std::string error; - - llvm::InitializeAllTargetInfos(); - llvm::InitializeAllTargets(); - llvm::InitializeAllTargetMCs(); - llvm::InitializeAllAsmParsers(); - llvm::InitializeAllAsmPrinters(); - - // Module - llvm::LLVMContext context; - std::unique_ptr module = llvm::make_unique("TLVM toy example", context); - llvm::IRBuilder<> builder(context); - - // Globals - llvm::Type* bool_t = llvm::Type::getInt1Ty(context); - llvm::Type* mask_tile_t = llvm::TileType::get(bool_t, 2); - llvm::Type* numeric_t = llvm::Type::getFloatTy(context); - llvm::PointerType* numeric_ptr_t = llvm::PointerType::get(numeric_t, 0); - llvm::IntegerType* int32_t = llvm::Type::getInt32Ty(context); - llvm::IntegerType* int1_t = llvm::Type::getInt1Ty(context); - - llvm::Type* tile2d_t = llvm::TileType::get(numeric_t, 2); - llvm::Type* tile3d_t = llvm::TileType::get(numeric_t, 3); - llvm::Type* int32_slice_t = llvm::TileType::get(int32_t, 1); - llvm::Type* int32_tile_t = llvm::TileType::get(int32_t, 2); - llvm::Type* int1_slice_t = llvm::TileType::get(int1_t, 1); - llvm::Type* int1_tile_t = llvm::TileType::get(int1_t, 2); - - llvm::PointerType* tile2d_ptr_t = llvm::PointerType::get(tile2d_t, 0); - llvm::Function* read_slice_x = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_read_slice_x, {int32_slice_t}); - llvm::Function* read_slice_y = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_read_slice_y, {int32_slice_t}); - 
llvm::Function* range = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_range, {int32_slice_t});
- llvm::Function* gtp = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_gtp_2d, {tile2d_ptr_t, numeric_ptr_t, int32_tile_t});
- llvm::Function* stp = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_stp_2d, {tile2d_ptr_t, int32_tile_t});
- llvm::Intrinsic::ID mma_id;
- if(!AT && !BT) mma_id = llvm::Intrinsic::tlvm_mma_nn;
- if(!AT && BT) mma_id = llvm::Intrinsic::tlvm_mma_nt;
- if(AT && !BT) mma_id = llvm::Intrinsic::tlvm_mma_tn;
- if(AT && BT) mma_id = llvm::Intrinsic::tlvm_mma_tt;
- llvm::Function* outer_add = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_outer_add, {int32_tile_t, int32_slice_t, int32_slice_t});
- llvm::Function* outer_and = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_outer_and, {int1_tile_t, int1_slice_t, int1_slice_t});
- llvm::Function* mma = llvm::Intrinsic::getDeclaration(module.get(), mma_id, {tile3d_t});
- llvm::Function* reshape = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_reshape_3d, {tile3d_t, tile2d_t});
- llvm::Function* splat_2d = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_splat_2d, {mask_tile_t, bool_t});
- llvm::Function* splat_1d = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_splat_1d, {int32_slice_t, int32_t});
- llvm::Function* masked_load = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_masked_load, {tile2d_t, tile2d_ptr_t, mask_tile_t});
- llvm::Function* masked_store = llvm::Intrinsic::getDeclaration(module.get(), llvm::Intrinsic::tlvm_masked_store, {tile2d_t, tile2d_ptr_t, mask_tile_t});
-
- // Hyperparameters
- llvm::Hyperparameter *bm = llvm::Hyperparameter::get(int32_t, 0);
- llvm::Hyperparameter *bn = llvm::Hyperparameter::get(int32_t, 1);
- llvm::Hyperparameter *bk = llvm::Hyperparameter::get(int32_t, 2);
- llvm::Hyperparameter *br = llvm::Hyperparameter::get(int32_t, 3);
-
- // Constants
- llvm::Constant *_s0 = llvm::ConstantInt::get(int32_t, 0);
- llvm::Constant *_f0 = llvm::ConstantFP::get(numeric_t, 0);
- llvm::Constant *_0 = llvm::ConstantTile::get(_f0, {bm, bn});
-
- // Function
- llvm::FunctionType* prototype = llvm::FunctionType::get(llvm::Type::getVoidTy(context), std::vector<llvm::Type*>{numeric_ptr_t, numeric_ptr_t, numeric_ptr_t, int32_t, int32_t, int32_t, int32_t}, false);
- llvm::Function* F = llvm::Function::Create(prototype, llvm::Function::ExternalLinkage, "kernel", module.get());
- std::vector<llvm::Argument*> arguments;
- F->addAttribute(1, llvm::Attribute::ReadOnly);
- F->addAttribute(1, llvm::Attribute::NoAlias);
- F->addAttribute(2, llvm::Attribute::ReadOnly);
- F->addAttribute(2, llvm::Attribute::NoAlias);
- std::transform(F->arg_begin(), F->arg_end(), std::back_inserter(arguments), [&](llvm::Argument& x){ return &x;});
- arguments[0]->setName("pa");
- arguments[1]->setName("pb");
- arguments[2]->setName("pc");
- arguments[3]->setName("M");
- arguments[4]->setName("N");
- arguments[5]->setName("K");
- arguments[6]->setName("bound");
-
- // All basic blocks
- llvm::BasicBlock* PrologBB = llvm::BasicBlock::Create(context, "prologue", F);
- llvm::BasicBlock* LoopBB = llvm::BasicBlock::Create(context, "loop", F);
- llvm::BasicBlock* EarlyExitBB = llvm::BasicBlock::Create(context, "early_exit", F);
- llvm::BasicBlock* LastIterBB = llvm::BasicBlock::Create(context, "last_iter", F);
- llvm::BasicBlock* EpilogueBB = llvm::BasicBlock::Create(context, "epilogue", F);
-
-
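// ---------------------------------------------------------------------------
// Editor's note (illustration, not part of the patch): independently of the
// tiling, the kernel declared above computes C = op(A) * op(B), where op(X)
// transposes when AT/BT is set. A scalar reference in plain C++, assuming
// column-major storage with the natural leading dimensions (this describes
// the math, not the exact offset arithmetic below):
//
//   for (int m = 0; m < M; ++m)
//     for (int n = 0; n < N; ++n) {
//       float acc = 0;
//       for (int k = 0; k < K; ++k)
//         acc += (AT ? A[k + m*K] : A[m + k*M]) *
//                (BT ? B[n + k*N] : B[k + n*K]);
//       C[m + n*M] = acc;
//     }
//
// The five basic blocks just created implement this loop nest, with the m/n
// loops mapped onto the tile axes and the k loop pipelined.
// ---------------------------------------------------------------------------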
// First basic block - builder.SetInsertPoint(PrologBB); - - llvm::CallInst* aasm = builder.CreateCall(read_slice_x, {bm}, "asm"); - llvm::CallInst* bbsn = builder.CreateCall(read_slice_y, {bn}, "bsn"); - llvm::CallInst* ask = builder.CreateCall(range, {builder.getInt32(0), bk}, "ask"); - llvm::CallInst* bsk = builder.CreateCall(range, {builder.getInt32(0), bk}, "bsk"); - - llvm::Value *M = arguments[3], *N = arguments[4], *K = arguments[5]; - llvm::Value *bound = arguments[6]; - llvm::Value *AS0 = M, *AS1 = K; - llvm::Value *sa0 = aasm, *sa1 = ask; - llvm::Value *ba0 = bm, *ba1 = bk; - llvm::Value *inca0 = _s0, *inca1 = bk; - if(AT){ - std::swap(AS0, AS1); - std::swap(sa0, sa1); - std::swap(ba0, ba1); - std::swap(inca0, inca1); - } - llvm::Value *BS0 = K, *BS1 = N; - llvm::Value *sb0 = bsk, *sb1 = bbsn; - llvm::Value *bb0 = bk, *bb1 = bn; - llvm::Value *incb0 = bk, *incb1 = _s0; - if(BT){ - std::swap(BS0, BS1); - std::swap(sb0, sb1); - std::swap(bb0, bb1); - std::swap(incb0, incb1); - } - - llvm::CallInst* tlda = builder.CreateCall(splat_1d, {ba1, AS0}, "lda"); - llvm::CallInst* tldb = builder.CreateCall(splat_1d, {bb1, BS1}, "ldb"); - llvm::CallInst* offa = builder.CreateCall(outer_add, {sa0, builder.CreateMul(sa1, tlda)}, "offa"); - llvm::CallInst* offb = builder.CreateCall(outer_add, {sb0, builder.CreateMul(sb1, tldb)}, "offb"); - llvm::CallInst* startpa = builder.CreateCall(gtp, {arguments[0], offa}, "startpa"); - llvm::CallInst* startpb = builder.CreateCall(gtp, {arguments[1], offb}, "startpb"); - llvm::LoadInst* startfa = builder.CreateLoad(startpa, "startfa"); - llvm::LoadInst* startfb = builder.CreateLoad(startpb, "startfb"); - llvm::Value* starta = builder.CreateCall(reshape, {startfa, ba0, ba1, br}, "starta"); - llvm::Value* startb = builder.CreateCall(reshape, {startfb, bb0, bb1, br}, "startb"); - llvm::Value* tinca0 = builder.CreateCall(splat_1d, {ba0, builder.CreateMul(inca0, AS0)}, "tinca0"); - llvm::Value* tinca1 = builder.CreateCall(splat_1d, {ba1, builder.CreateMul(inca1, AS1)}); - llvm::Value* tincb0 = builder.CreateCall(splat_1d, {bb0, builder.CreateMul(incb0, BS0)}); - llvm::Value* tincb1 = builder.CreateCall(splat_1d, {bb1, builder.CreateMul(incb1, BS1)}); - llvm::Value* inca = builder.CreateCall(outer_add, {tinca0, tinca1}, "inca"); - llvm::Value* incb = builder.CreateCall(outer_add, {tincb0, tincb1}, "incb"); - // Enter loop - builder.CreateBr(LoopBB); - builder.SetInsertPoint(LoopBB); - // PHI nodes - llvm::PHINode* c = builder.CreatePHI(_0->getType(), 2, "c"); - llvm::PHINode* k = builder.CreatePHI(int32_t, 2, "k"); - llvm::PHINode* pa = builder.CreatePHI(startpa->getType(), 2, "pa"); - llvm::PHINode* pb = builder.CreatePHI(startpb->getType(), 2, "pb"); - llvm::PHINode *a = builder.CreatePHI(starta->getType(), 2, "a"); - llvm::PHINode *b = builder.CreatePHI(startb->getType(), 2, "b"); - llvm::Value* nextc = builder.CreateCall(mma, {a, b, c}, "nextc"); - c->addIncoming(_0, PrologBB); - c->addIncoming(nextc, LoopBB); - // Induction variable - llvm::Value *nextk = builder.CreateSub(k, bk); - k->addIncoming(K, PrologBB); - k->addIncoming(nextk, LoopBB); - // Update pointer - llvm::Value *nextpa = builder.CreateCall(stp, {pa, inca}, "nextpa"); - llvm::Value *nextpb = builder.CreateCall(stp, {pb, incb}, "nextpb"); - pa->addIncoming(startpa, PrologBB); - pa->addIncoming(nextpa, LoopBB); - pb->addIncoming(startpb, PrologBB); - pb->addIncoming(nextpb, LoopBB); - // End condition - llvm::Value* no_bounds_check = builder.CreateICmpSGT(nextk, bound); - // Masks - llvm::Value* 
maska = builder.CreateCall(splat_2d, {ba0, ba1, no_bounds_check}, "maska"); - llvm::Value* maskb = builder.CreateCall(splat_2d, {bb0, bb1, no_bounds_check}, "maskb"); - // Pre-fetch - llvm::Value* nextfa = builder.CreateCall(masked_load, {nextpa, maska}, "nextfa"); - llvm::Value* nextfb = builder.CreateCall(masked_load, {nextpb, maskb}, "nextfb"); - llvm::Value* nexta = builder.CreateCall(reshape, {nextfa, ba0, ba1, br}, "nexta"); - llvm::Value* nextb = builder.CreateCall(reshape, {nextfb, bb0, bb1, br}, "nextb"); - a->addIncoming(starta, PrologBB); - a->addIncoming(nexta, LoopBB); - b->addIncoming(startb, PrologBB); - b->addIncoming(nextb, LoopBB); - // End condition - builder.CreateCondBr(no_bounds_check, LoopBB, EarlyExitBB); - // Early exit - builder.SetInsertPoint(EarlyExitBB); - llvm::Value* exit = builder.CreateICmpSLE(nextk, _s0); - builder.CreateCondBr(exit, EpilogueBB, LastIterBB); - // Last Iteration - builder.SetInsertPoint(LastIterBB); - llvm::Value* in_bounds_a0 = builder.CreateICmpSLT(aasm, builder.CreateCall(splat_1d, {ba0, M})); - llvm::Value* in_bounds_a1 = builder.CreateICmpSLT(ask, builder.CreateCall(splat_1d, {ba1, bk})); - llvm::Value* in_bounds_b0 = builder.CreateICmpSLT(bbsn, builder.CreateCall(splat_1d, {bb0, N})); - llvm::Value* in_bounds_b1 = builder.CreateICmpSLT(bsk, builder.CreateCall(splat_1d, {bb1, bk})); - llvm::Value* lastmaska = builder.CreateCall(outer_and, {in_bounds_a0, in_bounds_a1}, "lastmaska"); - llvm::Value* lastmaskb = builder.CreateCall(outer_and, {in_bounds_b0, in_bounds_b1}, "lastmaskb"); - llvm::Value* lastfa = builder.CreateCall(masked_load, {nextpa, lastmaska}, "lastfa"); - llvm::Value* lastfb = builder.CreateCall(masked_load, {nextpb, lastmaskb}, "lastfb"); - llvm::Value* lasta = builder.CreateCall(reshape, {lastfa, ba0, ba1, br}, "lasta"); - llvm::Value* lastb = builder.CreateCall(reshape, {lastfb, bb0, bb1, br}, "lastb"); - llvm::Value* loop = builder.CreateICmpSGT(nextk, _s0); - a->addIncoming(lasta, LastIterBB); - b->addIncoming(lastb, LastIterBB); - c->addIncoming(nextc, LastIterBB); - k->addIncoming(nextk, LastIterBB); - pa->addIncoming(nextpa, LastIterBB); - pb->addIncoming(nextpb, LastIterBB); - builder.CreateCondBr(loop, LoopBB, EpilogueBB); - // Epilogue - builder.SetInsertPoint(EpilogueBB); - llvm::CallInst* sm = builder.CreateCall(read_slice_x, {bm}, "sm"); - llvm::CallInst* sn = builder.CreateCall(read_slice_y, {bn}, "sn"); - llvm::CallInst* ldc = builder.CreateCall(splat_1d, {bn, M}, "lda"); - llvm::CallInst* offc = builder.CreateCall(outer_add, {sm, builder.CreateMul(sn, ldc)}, "offc"); - llvm::CallInst* pc = builder.CreateCall(gtp, {arguments[2], offc}, "pc"); - llvm::Value* in_bounds_c0 = builder.CreateICmpSLT(sm, builder.CreateCall(splat_1d, {bm, M})); - llvm::Value* in_bounds_c1 = builder.CreateICmpSLT(sn, builder.CreateCall(splat_1d, {bn, N})); - llvm::Value* maskc = builder.CreateCall(outer_and, {in_bounds_c0, in_bounds_c1}, "maskc"); - builder.CreateCall(masked_store, {nextc, pc, maskc}); - builder.CreateRet(NULL); - - - // Set metadata - llvm::Metadata *md_args[] = { - llvm::ValueAsMetadata::get(F), - llvm::MDString::get(context, "kernel"), - llvm::ValueAsMetadata::get(llvm::ConstantInt::get(llvm::Type::getInt32Ty(context), 1)) - }; - module->getOrInsertNamedMetadata("nvvm.annotations")->addOperand(llvm::MDNode::get(context, md_args)); - - // Machine - module->setTargetTriple("nvptx64-nvidia-cuda"); - auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error); - - llvm::TargetMachine *machine 
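// ---------------------------------------------------------------------------
// Editor's note: the {F, "kernel", 1} operand added to the nvvm.annotations
// named metadata above is the NVPTX backend's convention for marking a
// function as a kernel entry point; without it, "kernel" would be emitted as
// a plain device function (.func) rather than a launchable .entry.
// ---------------------------------------------------------------------------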
= target->createTargetMachine(module->getTargetTriple(), "sm_52", "", - llvm::TargetOptions(), llvm::Reloc::Model(), - llvm::CodeModel::Model(), llvm::CodeGenOpt::Aggressive); - module->setDataLayout(machine->createDataLayout()); - - // Auto-tuning - autotune(machine, *module); - - // Emit - llvm::legacy::PassManager pass; - llvm::SmallVector buffer; - llvm::raw_svector_ostream stream(buffer); - machine->addPassesToEmitFile(pass, stream, nullptr, llvm::TargetMachine::CGFT_AssemblyFile); - pass.run(*module); - std::string src(buffer.begin(), buffer.end()); - - // Execute - std::cout << src << std::endl; -} diff --git a/main.cpp b/main.cpp new file mode 100644 index 000000000..5a01e7c68 --- /dev/null +++ b/main.cpp @@ -0,0 +1,14 @@ +#include +#include + +typedef struct yy_buffer_state * YY_BUFFER_STATE; +extern int yyparse(); +extern YY_BUFFER_STATE yy_scan_string(const char * str); +extern void yy_delete_buffer(YY_BUFFER_STATE buffer); + +int main() { + char string[] = "void test(int);"; + YY_BUFFER_STATE buffer = yy_scan_string(string); + yy_delete_buffer(buffer); + return 0; +} diff --git a/parser.y b/parser.y new file mode 100644 index 000000000..3501fba0f --- /dev/null +++ b/parser.y @@ -0,0 +1,305 @@ +%{ +namespace ast{ +class node; +} +using namespace ast; +#define YYSTYPE node* +#include "../ast.h" +using namespace ast; + +extern char* yytext; +void yyerror(const char *s); +int yylex(void); + +translation_unit *ast_root; + +%} + +%token IDENTIFIER CONSTANT STRING_LITERAL +%token PTR_OP INC_OP DEC_OP LEFT_OP RIGHT_OP LE_OP GE_OP EQ_OP NE_OP +%token AND_OP OR_OP MUL_ASSIGN DIV_ASSIGN MOD_ASSIGN ADD_ASSIGN +%token SUB_ASSIGN LEFT_ASSIGN RIGHT_ASSIGN AND_ASSIGN +%token XOR_ASSIGN OR_ASSIGN TYPE_NAME +%token VOID UINT8 UINT16 UINT32 UINT64 INT8 INT16 INT32 INT64 FP32 FP64 +%token IF ELSE FOR +%token DEF + +%start translation_unit +%% + + +/* -------------------------- */ +/* Types */ +/* -------------------------- */ + +type_specifier + : VOID + | UINT8 | UINT16 | UINT32 | UINT64 + | INT8 | INT16 | INT32 | INT64 + | FP32 | FP64 + ; + +pointer + : '*' { $$ = new pointer_declarator(1); } + | '*' pointer { $$ = ((pointer_declarator*)$1)->inc(); } + +abstract_declarator + : pointer { $$ = $1; } + | direct_abstract_declarator { $$ = $1; } + | pointer direct_abstract_declarator { $$ = new compound_declarator($1, $2); } + ; + +direct_abstract_declarator + : '[' constant_list ']' { $$ = new tile_declarator($1); } + +constant : + CONSTANT { $$ = new constant(atoi(yytext)); } + ; + +constant_list + : constant { $$ = new list((constant*)$1); } + | constant_list ',' constant { $$ = append_ptr_list($1, $2); } + ; + +type_name + : type_specifier { $$ = new type((yytokentype)(size_t)$1, nullptr); } + | type_specifier abstract_declarator { $$ = new type((yytokentype)(size_t)$1, $2); } + ; + +/* -------------------------- */ +/* Expressions */ +/* -------------------------- */ + +identifier + : IDENTIFIER { $$ = new identifier(yytext); } + ; + +primary_expression + : identifier { $$ = $1; } + | constant { $$ = $1; } + | STRING_LITERAL { $$ = new string_literal(yytext); } + | '(' unary_expression ')' { $$ = $1; } + ; + +unary_expression + : primary_expression { $$ = $1; } + | INC_OP unary_expression { $$ = new unary_operator(INC_OP, $2); } + | DEC_OP unary_expression { $$ = new unary_operator(DEC_OP, $2); } + | unary_operator cast_expression { $$ = new unary_operator((yytokentype)(size_t)$1, $2); } + ; + +unary_operator + : '&' + | '*' + | '+' + | '-' + | '~' + | '!' 
+ ; + +cast_expression + : unary_expression { $$ = $1; } + | '(' type_name ')' cast_expression { $$ = new cast_operator((yytokentype)(size_t)$1, $2); } + ; + +multiplicative_expression + : cast_expression { $$ = $1; } + | multiplicative_expression '*' cast_expression { $$ = new binary_operator('*', $1, $3); } + | multiplicative_expression '/' cast_expression { $$ = new binary_operator('/', $1, $3); } + | multiplicative_expression '%' cast_expression { $$ = new binary_operator('%', $1, $3); } + ; + +additive_expression + : multiplicative_expression { $$ = $1; } + | additive_expression '+' multiplicative_expression { $$ = new binary_operator('+', $1, $3); } + | additive_expression '-' multiplicative_expression { $$ = new binary_operator('-', $1, $3); } + ; + +shift_expression + : additive_expression { $$ = $1; } + | shift_expression LEFT_OP additive_expression { $$ = new binary_operator(LEFT_OP, $1, $3); } + | shift_expression RIGHT_OP additive_expression { $$ = new binary_operator(RIGHT_OP, $1, $3); } + ; + +relational_expression + : shift_expression { $$ = $1; } + | relational_expression '<' shift_expression { $$ = new binary_operator('<', $1, $3); } + | relational_expression '>' shift_expression { $$ = new binary_operator('>', $1, $3); } + | relational_expression LE_OP shift_expression { $$ = new binary_operator(LE_OP, $1, $3); } + | relational_expression GE_OP shift_expression { $$ = new binary_operator(GE_OP, $1, $3); } + ; + +equality_expression + : relational_expression { $$ = $1; } + | equality_expression EQ_OP relational_expression { $$ = new binary_operator(EQ_OP, $1, $3); } + | equality_expression NE_OP relational_expression { $$ = new binary_operator(NE_OP, $1, $3); } + ; + +and_expression + : equality_expression { $$ = $1; } + | and_expression '&' equality_expression { $$ = new binary_operator('&', $1, $3); } + ; + +exclusive_or_expression + : and_expression { $$ = $1; } + | exclusive_or_expression '^' and_expression { $$ = new binary_operator('^', $1, $3); } + ; + +inclusive_or_expression + : exclusive_or_expression { $$ = $1; } + | inclusive_or_expression '|' exclusive_or_expression { $$ = new binary_operator('|', $1, $3); } + ; + +logical_and_expression + : inclusive_or_expression { $$ = $1; } + | logical_and_expression AND_OP inclusive_or_expression { $$ = new binary_operator(AND_OP, $1, $3); } + ; + +logical_or_expression + : logical_and_expression { $$ = $1; } + | logical_or_expression OR_OP logical_and_expression { $$ = new binary_operator(OR_OP, $1, $3); } + ; + +conditional_expression + : logical_or_expression { $$ = $1; } + | logical_or_expression '?' 
conditional_expression ':' conditional_expression { $$ = new conditional_expression($1, $2, $3); } + ; + +assignment_operator + : '=' + | MUL_ASSIGN + | DIV_ASSIGN + | MOD_ASSIGN + | ADD_ASSIGN + | SUB_ASSIGN + | LEFT_ASSIGN + | RIGHT_ASSIGN + | AND_ASSIGN + | XOR_ASSIGN + | OR_ASSIGN + ; + + +assignment_expression + : conditional_expression { $$ = $1; } + | unary_expression assignment_operator assignment_expression { $$ = new assignment_expression($1, (yytokentype)(size_t)$2, $3); } + ; + +expression + : assignment_expression { $$ = $1; } + ; + +/* -------------------------- */ +/* Statements */ +/* -------------------------- */ + +statement + : compound_statement { $$ = $1; } + | expression_statement { $$ = $1; } + | selection_statement { $$ = $1; } + | iteration_statement { $$ = $1; } + ; + +compound_statement + : '{' '}' { $$ = new compound_statement(); } + | '{' statement_list '}' { $$ = $1; } + ; + +statement_list + : statement { $$ = new compound_statement($1); } + | statement_list statement { $$ = append_ptr_list($1, $2); } + ; + +expression_statement + : ';' { $$ = new no_op(); } + | expression ';' { $$ = $1; } + ; + +selection_statement + : IF '(' expression ')' statement { $$ = new selection_statement($1, $2); } + | IF '(' expression ')' statement ELSE statement { $$ = new selection_statement($1, $2, $3); } + ; + +iteration_statement + : FOR '(' expression_statement expression_statement ')' statement { $$ = new iteration_statement($1, $2, NULL, $3); } + | FOR '(' expression_statement expression_statement expression ')' statement { $$ = new iteration_statement($1, $2, $3, $3); } + ; + + +/* -------------------------- */ +/* Declarator */ +/* -------------------------- */ + + +direct_declarator + : identifier { $$ = $1; } + | direct_declarator '[' constant_list ']' { $$ = new tile_declarator($2); } + | direct_declarator '(' parameter_list ')' { $$ = new function_declarator($2); } + | direct_declarator '(' identifier_list ')' { $$ = new function_declarator($2); } + | direct_declarator '(' ')' { $$ = new function_declarator(nullptr); } + ; + +identifier_list + : identifier { $$ = new list((identifier*)$1); } + | identifier_list ',' identifier { $$ = append_ptr_list($1, $2); } + ; + +parameter_list + : parameter_declaration { $$ = new list((parameter*)$1); } + | parameter_list ',' parameter_declaration { $$ = append_ptr_list($1, $2); } + ; + +parameter_declaration + : declaration_specifiers declarator { $$ = new parameter((yytokentype)(size_t)$1, $2); } + | declaration_specifiers abstract_declarator { $$ = new parameter((yytokentype)(size_t)$1, $2); } + | declaration_specifiers { $$ = new parameter((yytokentype)(size_t)$1, nullptr); } + ; + + +declaration_specifiers + : type_specifier { $$ = $1; } + ; + +init_declarator_list + : init_declarator { $$ = new list((init_declarator*)$1); } + | init_declarator_list ',' init_declarator { $$ = append_ptr_list($1, $2); } + ; + +declaration + : declaration_specifiers ';' { $$ = new declaration($1, nullptr); } + | declaration_specifiers init_declarator_list ';' { $$ = new declaration($1, $2); } + ; + +declarator + : pointer direct_declarator { $$ = new compound_declarator($1, $2); } + | direct_declarator { $$ = $1; } + ; + +initializer + : assignment_expression { $$ = $1; } + | '{' constant '}' { $$ = $1; } + ; + +init_declarator + : declarator { $$ = new init_declarator($1, nullptr); } + | declarator '=' initializer { $$ = new init_declarator($1, $2); } + ; + +/* -------------------------- */ +/* Translation Unit */ +/* 
-------------------------- */ + +translation_unit + : external_declaration { $$ = new translation_unit($1); } + | translation_unit external_declaration { $$ = ((translation_unit*)($1))->add($2); } + ; + +external_declaration + : function_definition { $$ = $1; } + | declaration { $$ = $1; } + ; + +function_definition + : declarator compound_statement { $$ = new function_definition($1, $2); } + ; + diff --git a/scanner.l b/scanner.l new file mode 100644 index 000000000..394cca7c4 --- /dev/null +++ b/scanner.l @@ -0,0 +1,128 @@ +D [0-9] +L [a-zA-Z_] +H [a-fA-F0-9] +E [Ee][+-]?{D}+ +FS (f|F|l|L) +IS (u|U|l|L)* + +%{ +#include +#include "parser.hpp" + +void count(); +int check_type(); +int comment(); + +%} + +%% +"def" { count(); return(DEF); } +"if" { count(); return(IF); } +"else" { count(); return(ELSE); } +"for" { count(); return(FOR); } +"void" { count(); return(VOID); } +"uint8" { count(); return(UINT8); } +"uint16" { count(); return(UINT16); } +"uint32" { count(); return(UINT32); } +"uint64" { count(); return(UINT64); } +"int8" { count(); return(INT8); } +"int16" { count(); return(INT16); } +"int32" { count(); return(INT32); } +"int64" { count(); return(INT64); } +"fp32" { count(); return(FP32); } +"fp64" { count(); return(FP64); } + +{L}({L}|{D})* { count(); return(check_type()); } + +0[xX]{H}+{IS}? { count(); return(CONSTANT); } +0{D}+{IS}? { count(); return(CONSTANT); } +{D}+{IS}? { count(); return(CONSTANT); } +L?'(\\.|[^\\'])+' { count(); return(CONSTANT); } + +{D}+{E}{FS}? { count(); return(CONSTANT); } +{D}*"."{D}+({E})?{FS}? { count(); return(CONSTANT); } +{D}+"."{D}*({E})?{FS}? { count(); return(CONSTANT); } + +L?\"(\\.|[^\\"])*\" { count(); return(STRING_LITERAL); } + +">>=" { count(); return(RIGHT_ASSIGN); } +"<<=" { count(); return(LEFT_ASSIGN); } +"+=" { count(); return(ADD_ASSIGN); } +"-=" { count(); return(SUB_ASSIGN); } +"*=" { count(); return(MUL_ASSIGN); } +"/=" { count(); return(DIV_ASSIGN); } +"%=" { count(); return(MOD_ASSIGN); } +"&=" { count(); return(AND_ASSIGN); } +"^=" { count(); return(XOR_ASSIGN); } +"|=" { count(); return(OR_ASSIGN); } +">>" { count(); return(RIGHT_OP); } +"<<" { count(); return(LEFT_OP); } +"++" { count(); return(INC_OP); } +"--" { count(); return(DEC_OP); } +"->" { count(); return(PTR_OP); } +"&&" { count(); return(AND_OP); } +"||" { count(); return(OR_OP); } +"<=" { count(); return(LE_OP); } +">=" { count(); return(GE_OP); } +"==" { count(); return(EQ_OP); } +"!=" { count(); return(NE_OP); } +";" { count(); return(';'); } +("{"|"<%") { count(); return('{'); } +("}"|"%>") { count(); return('}'); } +"," { count(); return(','); } +":" { count(); return(':'); } +"=" { count(); return('='); } +"(" { count(); return('('); } +")" { count(); return(')'); } +("["|"<:") { count(); return('['); } +("]"|":>") { count(); return(']'); } +"." { count(); return('.'); } +"&" { count(); return('&'); } +"!" { count(); return('!'); } +"~" { count(); return('~'); } +"-" { count(); return('-'); } +"+" { count(); return('+'); } +"*" { count(); return('*'); } +"/" { count(); return('/'); } +"%" { count(); return('%'); } +"<" { count(); return('<'); } +">" { count(); return('>'); } +"^" { count(); return('^'); } +"|" { count(); return('|'); } +"?" { count(); return('?'); } + +[ \t\v\n\f] { count(); } +. 
{ /* ignore bad characters */ } + +%% + +int yywrap() +{ return(1); } + + +int column = 0; + +void count() +{ + int i; + + for (i = 0; yytext[i] != '\0'; i++) + if (yytext[i] == '\n') + column = 0; + else if (yytext[i] == '\t') + column += 8 - (column % 8); + else + column++; + + ECHO; +} + +void yyerror (const char *s) /* Called by yyparse on error */ +{ + printf ("Error: %s\n", s); +} + +int check_type() +{ + return(IDENTIFIER); +} From 986b1588333fba2d7ae09ba8579c758ea5903b79 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 16 Dec 2018 12:35:28 -0500 Subject: [PATCH 008/494] TDL [Parser]: better handling of operator/specifier tokens --- ast.h | 184 ++++++++++++++++++++++++++++++------------------------- main.cpp | 13 +++- parser.y | 146 ++++++++++++++++++++++++------------------- 3 files changed, 194 insertions(+), 149 deletions(-) diff --git a/ast.h b/ast.h index 6cf22e89b..da9bee095 100644 --- a/ast.h +++ b/ast.h @@ -3,12 +3,60 @@ #include #include -typedef yytokentype token_type; - namespace ast{ +// Enumerations +enum ASSIGN_OP_T{ + ASSIGN, + INPLACE_MUL, INPLACE_DIV, INPLACE_MOD, + INPLACE_ADD, INPLACE_SUB, + INPLACE_LSHIFT, INPLACE_RSHIFT, + INPLACE_AND, INPLACE_XOR, + INPLACE_OR +}; + +enum BIN_OP_T{ + MUL, DIV, MOD, + ADD, SUB, + LEFT_SHIFT, RIGHT_SHIFT, + LT, GT, + LE, GE, + EQ, NE, + AND, XOR, OR, + LAND, LOR +}; + +enum UNARY_OP_T{ + INC, DEC, + PLUS, MINUS, + ADDR, DEREF, + COMPL, NOT +}; + +enum TYPE_T{ + VOID_T, + UINT8_T, UINT16_T, UINT32_T, UINT64_T, + INT8_T, INT16_T, INT32_T, INT64_T, + FLOAT32_T, FLOAT64_T +}; + +// AST class node { }; +struct token: public node{ + token(ASSIGN_OP_T value): assign_op(value){ } + token(BIN_OP_T value): bin_op(value){ } + token(UNARY_OP_T value): unary_op(value){ } + token(TYPE_T value): type(value){ } + + union { + ASSIGN_OP_T assign_op; + BIN_OP_T bin_op; + UNARY_OP_T unary_op; + TYPE_T type; + }; +}; + template class list: public node { public: @@ -26,55 +74,13 @@ node* append_ptr_list(node *result, node *in){ class binary_operator: public node{ public: - enum OP_T{ - MUL, DIV, REM, - ADD, SUB, - LEFT_SHIFT, RIGHT_SHIFT, - LT, GT, - LE, GE, - EQ, NE, - AND, XOR, OR, - LAND, LOR - }; - - static OP_T get_op(token_type token){ - switch(token){ - case LEFT_OP: return LEFT_SHIFT; - case RIGHT_OP: return RIGHT_SHIFT; - case LE_OP: return LE; - case GE_OP: return GE; - case EQ_OP: return EQ; - case NE_OP: return NE; - case AND_OP: return LAND; - case OR_OP: return LOR; - default: assert(false && "unreachable"); throw; - } - } - - static OP_T get_op(char token){ - switch(token){ - case '*': return MUL; - case '/': return DIV; - case '%': return REM; - case '+': return ADD; - case '-': return SUB; - case '<': return LT; - case '>': return GT; - case '&': return AND; - case '^': return XOR; - case '|': return OR; - default: assert(false && "unreachable"); throw; - } - } - -public: - binary_operator(token_type op, node *lhs, node *rhs) - : op_(get_op(op)), lhs_(lhs), rhs_(rhs) { } - binary_operator(char op, node *lhs, node *rhs) - : op_(get_op(op)), lhs_(lhs), rhs_(rhs){ } + binary_operator(node *op, node *lhs, node *rhs) + : op_(((token*)op)->bin_op), lhs_(lhs), rhs_(rhs) { } + binary_operator(BIN_OP_T op, node *lhs, node *rhs) + : op_(op), lhs_(lhs), rhs_(rhs) { } private: - const OP_T op_; + const BIN_OP_T op_; const node *lhs_; const node *rhs_; }; @@ -106,19 +112,22 @@ public: class unary_operator: public node{ public: - unary_operator(token_type token, node *arg): token_(token), arg_(arg) { } + unary_operator(node *op, node *arg) 
+ : op_(((token*)op)->unary_op), arg_(arg) { } + unary_operator(UNARY_OP_T op, node *arg) + : op_(op), arg_(arg) { } private: - const token_type token_; + const UNARY_OP_T op_; const node *arg_; }; class cast_operator: public node{ public: - cast_operator(token_type type, node *arg): type_(type), arg_(arg) { } + cast_operator(node *type, node *arg): type_(type), arg_(arg) { } public: - const token_type type_; + const node *type_; const node *arg_; }; @@ -134,29 +143,45 @@ public: }; class assignment_expression: public node{ - typedef binary_operator::OP_T op_t; +public: + assignment_expression(node *lvalue, node *op, node *rvalue) + : lvalue_(lvalue), op_(((token*)op)->assign_op), rvalue_(rvalue) { } public: - assignment_expression(node *lvalue, token_type op, node *rvalue) - : lvalue_(lvalue), op_(binary_operator::get_op(op)), rvalue_(rvalue) { } - -public: - op_t op_; + ASSIGN_OP_T op_; const node *lvalue_; const node *rvalue_; }; -class compound_statement: public node{ -public: - compound_statement() : statements_() {} - compound_statement(node *stmt): statements_{stmt} {} - compound_statement* append(node *stmt) { statements_.push_back(stmt); return this; } +class statement: public node{ -private: - std::list statements_; }; -class selection_statement: public node{ +class declaration: public node{ +public: + declaration(node *spec, node *init) + : spec_(spec), init_(init) { } + +public: + const node *spec_; + const node *init_; +}; + + +class compound_statement: public statement{ + typedef list* declarations_t; + typedef list* statements_t; + +public: + compound_statement(node* decls, node* statements) + : decls_((declarations_t)decls), statements_((statements_t)statements) {} + +private: + declarations_t decls_; + statements_t statements_; +}; + +class selection_statement: public statement{ public: selection_statement(node *cond, node *if_value, node *else_value = nullptr) : cond_(cond), if_value_(if_value), else_value_(else_value) { } @@ -167,7 +192,7 @@ public: const node *else_value_; }; -class iteration_statement: public node{ +class iteration_statement: public statement{ public: iteration_statement(node *init, node *stop, node *exec, node *statements) : init_(init), stop_(stop), exec_(exec), statements_(statements) { } @@ -179,7 +204,7 @@ private: const node *statements_; }; -class no_op: public node { }; +class no_op: public statement { }; // Types class declarator: public node{ @@ -211,11 +236,11 @@ public: class parameter: public declarator { public: - parameter(token_type type, node *decl) - : type_(type), decl_(decl) { } + parameter(node *spec, node *decl) + : spec_(((token*)spec)->type), decl_(decl) { } public: - const token_type type_; + const TYPE_T spec_; const node *decl_; }; @@ -248,23 +273,14 @@ public: const node *initializer_; }; -class declaration: public node{ -public: - declaration(node *spec, node *init) - : spec_(spec), init_(init) { } - -public: - const node *spec_; - const node *init_; -}; class type: public node{ public: - type(token_type spec, node * decl) - : spec_(spec), decl_(decl) { } + type(node *spec, node * decl) + : spec_(((token*)spec)->type), decl_(decl) { } public: - const token_type spec_; + const TYPE_T spec_; const node *decl_; }; diff --git a/main.cpp b/main.cpp index 5a01e7c68..f4a67ddb4 100644 --- a/main.cpp +++ b/main.cpp @@ -6,9 +6,18 @@ extern int yyparse(); extern YY_BUFFER_STATE yy_scan_string(const char * str); extern void yy_delete_buffer(YY_BUFFER_STATE buffer); +const char src[] = +"\ +void test(int32 id){\ + fp32 c[16, 16] = {0};\ 
+ int32 i = 0;\ + i += 1;\ +}\ +"; + int main() { - char string[] = "void test(int);"; - YY_BUFFER_STATE buffer = yy_scan_string(string); + YY_BUFFER_STATE buffer = yy_scan_string(src); + yyparse(); yy_delete_buffer(buffer); return 0; } diff --git a/parser.y b/parser.y index 3501fba0f..3fbed1113 100644 --- a/parser.y +++ b/parser.y @@ -33,10 +33,17 @@ translation_unit *ast_root; /* -------------------------- */ type_specifier - : VOID - | UINT8 | UINT16 | UINT32 | UINT64 - | INT8 | INT16 | INT32 | INT64 - | FP32 | FP64 + : VOID { $$ = new token(VOID_T); } + | UINT8 { $$ = new token(UINT8_T); } + | UINT16 { $$ = new token(UINT16_T); } + | UINT32 { $$ = new token(UINT32_T); } + | UINT64 { $$ = new token(UINT64_T); } + | INT8 { $$ = new token(INT8_T); } + | INT16 { $$ = new token(INT16_T); } + | INT32 { $$ = new token(INT32_T); } + | INT64 { $$ = new token(INT64_T); } + | FP32 { $$ = new token(FLOAT32_T); } + | FP64 { $$ = new token(FLOAT64_T); } ; pointer @@ -62,8 +69,8 @@ constant_list ; type_name - : type_specifier { $$ = new type((yytokentype)(size_t)$1, nullptr); } - | type_specifier abstract_declarator { $$ = new type((yytokentype)(size_t)$1, $2); } + : type_specifier { $$ = new type($1, nullptr); } + | type_specifier abstract_declarator { $$ = new type($1, $2); } ; /* -------------------------- */ @@ -83,108 +90,113 @@ primary_expression unary_expression : primary_expression { $$ = $1; } - | INC_OP unary_expression { $$ = new unary_operator(INC_OP, $2); } - | DEC_OP unary_expression { $$ = new unary_operator(DEC_OP, $2); } - | unary_operator cast_expression { $$ = new unary_operator((yytokentype)(size_t)$1, $2); } + | INC_OP unary_expression { $$ = new unary_operator(INC, $2); } + | DEC_OP unary_expression { $$ = new unary_operator(DEC, $2); } + | unary_operator cast_expression { $$ = new unary_operator($1, $2); } ; unary_operator - : '&' - | '*' - | '+' - | '-' - | '~' - | '!' + : '&' { $$ = new token(ADDR); } + | '*' { $$ = new token(DEREF); } + | '+' { $$ = new token(PLUS); } + | '-' { $$ = new token(MINUS); } + | '~' { $$ = new token(COMPL); } + | '!' 
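/* Editor's note (illustration, not part of the patch): since YYSTYPE is
   `node*`, operator and specifier terminals cannot push a bare enum onto the
   parser stack; each action above therefore wraps the enum value in a
   heap-allocated `token` node, which consumers unwrap, e.g.
   `((token*)op)->assign_op` in assignment_expression. Parsing `i += 1;` from
   the test source in main.cpp then builds roughly:

     new assignment_expression(new identifier("i"),
                               new token(INPLACE_ADD),   // from ADD_ASSIGN
                               new constant(1));
*/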
{ $$ = new token(NOT); } ; cast_expression : unary_expression { $$ = $1; } - | '(' type_name ')' cast_expression { $$ = new cast_operator((yytokentype)(size_t)$1, $2); } + | '(' type_name ')' cast_expression { $$ = new cast_operator($1, $2); } ; multiplicative_expression : cast_expression { $$ = $1; } - | multiplicative_expression '*' cast_expression { $$ = new binary_operator('*', $1, $3); } - | multiplicative_expression '/' cast_expression { $$ = new binary_operator('/', $1, $3); } - | multiplicative_expression '%' cast_expression { $$ = new binary_operator('%', $1, $3); } + | multiplicative_expression '*' cast_expression { $$ = new binary_operator(MUL, $1, $3); } + | multiplicative_expression '/' cast_expression { $$ = new binary_operator(DIV, $1, $3); } + | multiplicative_expression '%' cast_expression { $$ = new binary_operator(MOD, $1, $3); } ; additive_expression : multiplicative_expression { $$ = $1; } - | additive_expression '+' multiplicative_expression { $$ = new binary_operator('+', $1, $3); } - | additive_expression '-' multiplicative_expression { $$ = new binary_operator('-', $1, $3); } + | additive_expression '+' multiplicative_expression { $$ = new binary_operator(ADD, $1, $3); } + | additive_expression '-' multiplicative_expression { $$ = new binary_operator(SUB, $1, $3); } ; shift_expression : additive_expression { $$ = $1; } - | shift_expression LEFT_OP additive_expression { $$ = new binary_operator(LEFT_OP, $1, $3); } - | shift_expression RIGHT_OP additive_expression { $$ = new binary_operator(RIGHT_OP, $1, $3); } + | shift_expression LEFT_OP additive_expression { $$ = new binary_operator(LEFT_SHIFT, $1, $3); } + | shift_expression RIGHT_OP additive_expression { $$ = new binary_operator(RIGHT_SHIFT, $1, $3); } ; +/* Comparison */ relational_expression : shift_expression { $$ = $1; } - | relational_expression '<' shift_expression { $$ = new binary_operator('<', $1, $3); } - | relational_expression '>' shift_expression { $$ = new binary_operator('>', $1, $3); } - | relational_expression LE_OP shift_expression { $$ = new binary_operator(LE_OP, $1, $3); } - | relational_expression GE_OP shift_expression { $$ = new binary_operator(GE_OP, $1, $3); } + | relational_expression '<' shift_expression { $$ = new binary_operator(LT, $1, $3); } + | relational_expression '>' shift_expression { $$ = new binary_operator(GT, $1, $3); } + | relational_expression LE_OP shift_expression { $$ = new binary_operator(LE, $1, $3); } + | relational_expression GE_OP shift_expression { $$ = new binary_operator(GE, $1, $3); } ; equality_expression : relational_expression { $$ = $1; } - | equality_expression EQ_OP relational_expression { $$ = new binary_operator(EQ_OP, $1, $3); } - | equality_expression NE_OP relational_expression { $$ = new binary_operator(NE_OP, $1, $3); } + | equality_expression EQ_OP relational_expression { $$ = new binary_operator(EQ, $1, $3); } + | equality_expression NE_OP relational_expression { $$ = new binary_operator(NE, $1, $3); } ; +/* Binary */ and_expression : equality_expression { $$ = $1; } - | and_expression '&' equality_expression { $$ = new binary_operator('&', $1, $3); } + | and_expression '&' equality_expression { $$ = new binary_operator(AND, $1, $3); } ; exclusive_or_expression : and_expression { $$ = $1; } - | exclusive_or_expression '^' and_expression { $$ = new binary_operator('^', $1, $3); } + | exclusive_or_expression '^' and_expression { $$ = new binary_operator(XOR, $1, $3); } ; inclusive_or_expression : exclusive_or_expression { $$ = $1; } - | 
inclusive_or_expression '|' exclusive_or_expression { $$ = new binary_operator('|', $1, $3); } + | inclusive_or_expression '|' exclusive_or_expression { $$ = new binary_operator(OR, $1, $3); } ; +/* Logical */ logical_and_expression : inclusive_or_expression { $$ = $1; } - | logical_and_expression AND_OP inclusive_or_expression { $$ = new binary_operator(AND_OP, $1, $3); } + | logical_and_expression AND_OP inclusive_or_expression { $$ = new binary_operator(LAND, $1, $3); } ; logical_or_expression : logical_and_expression { $$ = $1; } - | logical_or_expression OR_OP logical_and_expression { $$ = new binary_operator(OR_OP, $1, $3); } + | logical_or_expression OR_OP logical_and_expression { $$ = new binary_operator(LOR, $1, $3); } ; +/* Conditional */ conditional_expression : logical_or_expression { $$ = $1; } | logical_or_expression '?' conditional_expression ':' conditional_expression { $$ = new conditional_expression($1, $2, $3); } ; +/* Assignment */ assignment_operator - : '=' - | MUL_ASSIGN - | DIV_ASSIGN - | MOD_ASSIGN - | ADD_ASSIGN - | SUB_ASSIGN - | LEFT_ASSIGN - | RIGHT_ASSIGN - | AND_ASSIGN - | XOR_ASSIGN - | OR_ASSIGN + : '=' { $$ = new token(ASSIGN); } + | MUL_ASSIGN { $$ = new token(INPLACE_MUL); } + | DIV_ASSIGN { $$ = new token(INPLACE_DIV); } + | MOD_ASSIGN { $$ = new token(INPLACE_MOD); } + | ADD_ASSIGN { $$ = new token(INPLACE_ADD); } + | SUB_ASSIGN { $$ = new token(INPLACE_SUB); } + | LEFT_ASSIGN { $$ = new token(INPLACE_LSHIFT); } + | RIGHT_ASSIGN { $$ = new token(INPLACE_RSHIFT); } + | AND_ASSIGN { $$ = new token(INPLACE_AND); } + | XOR_ASSIGN { $$ = new token(INPLACE_XOR); } + | OR_ASSIGN { $$ = new token(INPLACE_OR); } ; - assignment_expression : conditional_expression { $$ = $1; } - | unary_expression assignment_operator assignment_expression { $$ = new assignment_expression($1, (yytokentype)(size_t)$2, $3); } + | unary_expression assignment_operator assignment_expression { $$ = new assignment_expression($1, $2, $3); } ; +/* Expression */ expression : assignment_expression { $$ = $1; } ; @@ -201,13 +213,20 @@ statement ; compound_statement - : '{' '}' { $$ = new compound_statement(); } - | '{' statement_list '}' { $$ = $1; } + : '{' '}' { $$ = new compound_statement(nullptr, nullptr); } + | '{' statement_list '}' { $$ = new compound_statement(nullptr, $1); } + | '{' declaration_list '}' { $$ = new compound_statement($1, nullptr); } + | '{' declaration_list statement_list '}' { $$ = new compound_statement($1, $2);} ; + +declaration_list + : declaration { $$ = new list((declaration*)$1); } + | declaration_list declaration { $$ = append_ptr_list($1, $2); } + statement_list - : statement { $$ = new compound_statement($1); } - | statement_list statement { $$ = append_ptr_list($1, $2); } + : statement { $$ = new list((statement*)$1); } + | statement_list statement { $$ = append_ptr_list($1, $2); } ; expression_statement @@ -232,11 +251,11 @@ iteration_statement direct_declarator - : identifier { $$ = $1; } - | direct_declarator '[' constant_list ']' { $$ = new tile_declarator($2); } - | direct_declarator '(' parameter_list ')' { $$ = new function_declarator($2); } - | direct_declarator '(' identifier_list ')' { $$ = new function_declarator($2); } - | direct_declarator '(' ')' { $$ = new function_declarator(nullptr); } + : identifier { $$ = $1; } + | direct_declarator '[' constant_list ']' { $$ = new tile_declarator($2); } + | direct_declarator '(' parameter_list ')' { $$ = new function_declarator($2); } + | direct_declarator '(' identifier_list ')' { $$ = new 
function_declarator($2); } + | direct_declarator '(' ')' { $$ = new function_declarator(nullptr); } ; identifier_list @@ -250,9 +269,9 @@ parameter_list ; parameter_declaration - : declaration_specifiers declarator { $$ = new parameter((yytokentype)(size_t)$1, $2); } - | declaration_specifiers abstract_declarator { $$ = new parameter((yytokentype)(size_t)$1, $2); } - | declaration_specifiers { $$ = new parameter((yytokentype)(size_t)$1, nullptr); } + : declaration_specifiers declarator { $$ = new parameter($1, $2); } + | declaration_specifiers abstract_declarator { $$ = new parameter($1, $2); } + | declaration_specifiers { $$ = new parameter($1, nullptr); } ; @@ -295,11 +314,12 @@ translation_unit ; external_declaration - : function_definition { $$ = $1; } - | declaration { $$ = $1; } - ; + : function_definition { $$ = $1; } + | declaration { $$ = $1; } + ; function_definition - : declarator compound_statement { $$ = new function_definition($1, $2); } + : declaration_specifiers declarator compound_statement + | declarator compound_statement { $$ = new function_definition($1, $2); } ; From 50573052f7331900f861404278f2e745ef7c1257 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 16 Dec 2018 16:15:40 -0500 Subject: [PATCH 009/494] TDL: restructured project directories --- CMakeLists.txt | 41 +++++++++++++-- cmake/FindLLVM.cmake | 88 +++++++++++++++++++++++++++++++++ examples/CMakeLists.txt | 6 +++ main.cpp => examples/matrix.cpp | 4 ++ ast.h => include/ast.h | 35 +++---------- parser.y => include/parser.y | 42 +++++++++++++--- scanner.l => include/scanner.l | 0 lib/codegen.cpp | 0 8 files changed, 176 insertions(+), 40 deletions(-) create mode 100644 cmake/FindLLVM.cmake create mode 100644 examples/CMakeLists.txt rename main.cpp => examples/matrix.cpp (78%) rename ast.h => include/ast.h (84%) rename parser.y => include/parser.y (88%) rename scanner.l => include/scanner.l (100%) create mode 100644 lib/codegen.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 308a86ad1..256bbcf9c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,42 @@ +cmake_minimum_required(VERSION 2.8.7) +project(TDL) +include(CTest) +include(cmake/FindLLVM.cmake) + +# FLEX/YACC find_package(BISON) -BISON_TARGET(Parser parser.y ${CMAKE_CURRENT_BINARY_DIR}/parser.cpp) find_package(FLEX) -FLEX_TARGET(Lexer scanner.l ${CMAKE_CURRENT_BINARY_DIR}/scanner.cpp) +BISON_TARGET(Parser ${CMAKE_CURRENT_SOURCE_DIR}/include/parser.y ${CMAKE_CURRENT_BINARY_DIR}/parser.cpp) +FLEX_TARGET(Lexer ${CMAKE_CURRENT_SOURCE_DIR}/include/scanner.l ${CMAKE_CURRENT_BINARY_DIR}/scanner.cpp) get_filename_component(BISON_Parser_INCLUDE_DIRECTORIES ${BISON_Parser_OUTPUT_HEADER} DIRECTORY) include_directories(${BISON_Parser_INCLUDE_DIRECTORIES}) -add_executable(test main.cpp ${BISON_Parser_OUTPUTS} ${FLEX_Lexer_OUTPUTS}) + +#Default build type +if(NOT CMAKE_BUILD_TYPE) + message(STATUS "Default build type: Release") + set(CMAKE_BUILD_TYPE "Release") +endif() + +# Gather headers for cmake-based IDEs +file( GLOB_RECURSE ALL_SRC *.cpp *.hpp *.h *.py) +add_custom_target( ALL SOURCES ${ALL_SRC} ) + +# Compiler flags +link_directories(/home/philippe/Development/llvm-tlvm/build/lib) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) +include_directories(/home/philippe/Development/llvm-tlvm/include) +include_directories(/home/philippe/Development/llvm-tlvm/build/include) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + +# Library +file(GLOB_RECURSE LIBTDL_SRC lib/*.cpp) +add_library(tdl SHARED ${LIBTDL_SRC} 
${BISON_Parser_OUTPUTS} ${FLEX_Lexer_OUTPUTS}) +target_link_libraries(tdl "dl" ${LLVM_LIBRARIES}) + +# Examples +add_subdirectory(examples) + + + + + diff --git a/cmake/FindLLVM.cmake b/cmake/FindLLVM.cmake new file mode 100644 index 000000000..b3196d444 --- /dev/null +++ b/cmake/FindLLVM.cmake @@ -0,0 +1,88 @@ +# - Find LLVM +# This module can be used to find LLVM. +# It requires that the llvm-config executable be available on the system path. +# Once found, llvm-config is used for everything else. +# +# Typical usage could be: +# find_package(LLVM QUIET REQUIRED COMPONENTS jit native interpreter) +# +# If the QUIET flag is not set, the specified components and LLVM version are +# outputted. +# +# If the COMPONENTS are not set, the default set of "all" is used. +# +# The following variables are set: +# +# LLVM_FOUND - Set to YES if LLVM is found. +# LLVM_VERSION - Set to the decimal version of the LLVM library. +# LLVM_C_FLAGS - All flags that should be passed to a C compiler. +# LLVM_CXX_FLAGS - All flags that should be passed to a C++ compiler. +# LLVM_CPP_FLAGS - All flags that should be passed to the C pre-processor. +# LLVM_LD_FLAGS - Additional flags to pass to the linker. +# LLVM_LIBRARY_DIRS - A list of directories where the LLVM libraries are located. +# LLVM_INCLUDE_DIRS - A list of directories where the LLVM headers are located. +# LLVM_LIBRARIES - A list of libraries which should be linked against. + +# A macro to run llvm config +macro(_llvm_config _var_name) + # Firstly, locate the LLVM config executable + find_program(_llvm_config_exe + NAMES llvm-config + PATHS /home/philippe/Development/llvm-tlvm/build/bin/ + DOC "llvm-config executable location" + ) + + # If no llvm-config executable was found, set the output variable to not + # found. 
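# -----------------------------------------------------------------------------
# Editor's note (illustration, not part of the patch): a consumer of this
# module only needs to include it and read the variables it exports, e.g.
#
#   include(cmake/FindLLVM.cmake)
#   include_directories(${LLVM_INCLUDE_DIRS})
#   link_directories(${LLVM_LIBRARY_DIRS})
#   target_link_libraries(tdl ${LLVM_LIBRARIES})
#
# which is what the top-level CMakeLists.txt above does, apart from the
# hard-coded /home/philippe/... include and link paths that still bypass it.
# -----------------------------------------------------------------------------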
+ if(NOT _llvm_config_exe) + set(${_var_name} "${_var_name}-NOTFOUND") + else(NOT _llvm_config_exe) + # Otherwise, run llvm-config + execute_process( + COMMAND ${_llvm_config_exe} ${ARGN} + OUTPUT_VARIABLE ${_var_name} + RESULT_VARIABLE _llvm_config_retval + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(RESULT_VARIABLE) + message(SEND_ERROR + "Error running llvm-config with arguments: ${ARGN}") + endif(RESULT_VARIABLE) + endif(NOT _llvm_config_exe) +endmacro(_llvm_config) + +# The default set of components +set(_llvm_components all) + +# If components have been specified via find_package, use them +if(LLVM_FIND_COMPONENTS) + set(_llvm_components ${LLVM_FIND_COMPONENTS}) +endif(LLVM_FIND_COMPONENTS) + +if(NOT LLVM_FIND_QUIETLY) + message(STATUS "Looking for LLVM components: ${_llvm_components}") +endif(NOT LLVM_FIND_QUIETLY) + +_llvm_config(LLVM_VERSION --version) +_llvm_config(LLVM_C_FLAGS --cflags) +_llvm_config(LLVM_CXX_FLAGS --cxxflags) +_llvm_config(LLVM_CPP_FLAGS --cppflags) +_llvm_config(LLVM_LD_FLAGS --ldflags) +_llvm_config(LLVM_LIBRARY_DIRS --libdir) +_llvm_config(LLVM_INCLUDE_DIRS --includedir) +_llvm_config(LLVM_LIBRARIES --libs) + +if(NOT LLVM_FIND_QUIETLY) + message(STATUS "Found LLVM version: ${LLVM_VERSION}") +endif(NOT LLVM_FIND_QUIETLY) + +# handle the QUIETLY and REQUIRED arguments and set LLVM_FOUND to TRUE if +# all listed variables are TRUE +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(LLVM + DEFAULT_MSG + LLVM_LIBRARIES + LLVM_INCLUDE_DIRS + LLVM_LIBRARY_DIRS) + +# vim:sw=4:ts=4:autoindent diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt new file mode 100644 index 000000000..53d780a94 --- /dev/null +++ b/examples/CMakeLists.txt @@ -0,0 +1,6 @@ +foreach(PROG matrix) + add_executable(${PROG} ${PROG}.cpp) + set_target_properties(${PROG} PROPERTIES OUTPUT_NAME ${PROG}) + include_directories(/usr/local/cuda/include/) + target_link_libraries(${PROG} tdl) +endforeach(PROG) diff --git a/main.cpp b/examples/matrix.cpp similarity index 78% rename from main.cpp rename to examples/matrix.cpp index f4a67ddb4..cd46cc008 100644 --- a/main.cpp +++ b/examples/matrix.cpp @@ -1,10 +1,13 @@ #include #include +#include "ast.h" typedef struct yy_buffer_state * YY_BUFFER_STATE; extern int yyparse(); extern YY_BUFFER_STATE yy_scan_string(const char * str); extern void yy_delete_buffer(YY_BUFFER_STATE buffer); +using ast::translation_unit; +extern translation_unit *ast_root; const char src[] = "\ @@ -19,5 +22,6 @@ int main() { YY_BUFFER_STATE buffer = yy_scan_string(src); yyparse(); yy_delete_buffer(buffer); + translation_unit *program = ast_root; return 0; } diff --git a/ast.h b/include/ast.h similarity index 84% rename from ast.h rename to include/ast.h index da9bee095..3e1544264 100644 --- a/ast.h +++ b/include/ast.h @@ -43,20 +43,6 @@ enum TYPE_T{ // AST class node { }; -struct token: public node{ - token(ASSIGN_OP_T value): assign_op(value){ } - token(BIN_OP_T value): bin_op(value){ } - token(UNARY_OP_T value): unary_op(value){ } - token(TYPE_T value): type(value){ } - - union { - ASSIGN_OP_T assign_op; - BIN_OP_T bin_op; - UNARY_OP_T unary_op; - TYPE_T type; - }; -}; - template class list: public node { public: @@ -67,15 +53,8 @@ private: std::list values_; }; -template -node* append_ptr_list(node *result, node *in){ - return static_cast*>(result)->append((T*)in); -} - class binary_operator: public node{ public: - binary_operator(node *op, node *lhs, node *rhs) - : op_(((token*)op)->bin_op), lhs_(lhs), rhs_(rhs) { } binary_operator(BIN_OP_T op, 
node *lhs, node *rhs) : op_(op), lhs_(lhs), rhs_(rhs) { } @@ -112,8 +91,6 @@ public: class unary_operator: public node{ public: - unary_operator(node *op, node *arg) - : op_(((token*)op)->unary_op), arg_(arg) { } unary_operator(UNARY_OP_T op, node *arg) : op_(op), arg_(arg) { } @@ -144,8 +121,8 @@ public: class assignment_expression: public node{ public: - assignment_expression(node *lvalue, node *op, node *rvalue) - : lvalue_(lvalue), op_(((token*)op)->assign_op), rvalue_(rvalue) { } + assignment_expression(node *lvalue, ASSIGN_OP_T op, node *rvalue) + : lvalue_(lvalue), op_(op), rvalue_(rvalue) { } public: ASSIGN_OP_T op_; @@ -236,8 +213,8 @@ public: class parameter: public declarator { public: - parameter(node *spec, node *decl) - : spec_(((token*)spec)->type), decl_(decl) { } + parameter(TYPE_T spec, node *decl) + : spec_(spec), decl_(decl) { } public: const TYPE_T spec_; @@ -276,8 +253,8 @@ public: class type: public node{ public: - type(node *spec, node * decl) - : spec_(((token*)spec)->type), decl_(decl) { } + type(TYPE_T spec, node * decl) + : spec_(spec), decl_(decl) { } public: const TYPE_T spec_; diff --git a/parser.y b/include/parser.y similarity index 88% rename from parser.y rename to include/parser.y index 3fbed1113..40153c9c4 100644 --- a/parser.y +++ b/include/parser.y @@ -4,7 +4,7 @@ class node; } using namespace ast; #define YYSTYPE node* -#include "../ast.h" +#include "../include/ast.h" using namespace ast; extern char* yytext; @@ -13,6 +13,32 @@ int yylex(void); translation_unit *ast_root; +/* wrap token in AST node */ +struct token: public node{ + token(ASSIGN_OP_T value): assign_op(value){ } + token(BIN_OP_T value): bin_op(value){ } + token(UNARY_OP_T value): unary_op(value){ } + token(TYPE_T value): type(value){ } + + union { + ASSIGN_OP_T assign_op; + BIN_OP_T bin_op; + UNARY_OP_T unary_op; + TYPE_T type; + }; +}; + +/* shortcut to append in list */ +template +node* append_ptr_list(node *result, node *in){ + return static_cast*>(result)->append((T*)in); +} + +/* shortcut to access token value */ +ASSIGN_OP_T get_assign_op(node *op) { return ((token*)op)->assign_op; } +UNARY_OP_T get_unary_op(node *op) { return ((token*)op)->unary_op; } +TYPE_T get_type_spec(node *op) { return ((token*)op)->type; } + %} %token IDENTIFIER CONSTANT STRING_LITERAL @@ -69,8 +95,8 @@ constant_list ; type_name - : type_specifier { $$ = new type($1, nullptr); } - | type_specifier abstract_declarator { $$ = new type($1, $2); } + : type_specifier { $$ = new type(get_type_spec($1), nullptr); } + | type_specifier abstract_declarator { $$ = new type(get_type_spec($1), $2); } ; /* -------------------------- */ @@ -92,7 +118,7 @@ unary_expression : primary_expression { $$ = $1; } | INC_OP unary_expression { $$ = new unary_operator(INC, $2); } | DEC_OP unary_expression { $$ = new unary_operator(DEC, $2); } - | unary_operator cast_expression { $$ = new unary_operator($1, $2); } + | unary_operator cast_expression { $$ = new unary_operator(get_unary_op($1), $2); } ; unary_operator @@ -193,7 +219,7 @@ assignment_operator assignment_expression : conditional_expression { $$ = $1; } - | unary_expression assignment_operator assignment_expression { $$ = new assignment_expression($1, $2, $3); } + | unary_expression assignment_operator assignment_expression { $$ = new assignment_expression($1, get_assign_op($2), $3); } ; /* Expression */ @@ -269,9 +295,9 @@ parameter_list ; parameter_declaration - : declaration_specifiers declarator { $$ = new parameter($1, $2); } - | declaration_specifiers 
abstract_declarator { $$ = new parameter($1, $2); } - | declaration_specifiers { $$ = new parameter($1, nullptr); } + : declaration_specifiers declarator { $$ = new parameter(get_type_spec($1), $2); } + | declaration_specifiers abstract_declarator { $$ = new parameter(get_type_spec($1), $2); } + | declaration_specifiers { $$ = new parameter(get_type_spec($1), nullptr); } ; diff --git a/scanner.l b/include/scanner.l similarity index 100% rename from scanner.l rename to include/scanner.l diff --git a/lib/codegen.cpp b/lib/codegen.cpp new file mode 100644 index 000000000..e69de29bb From 9dfa6993fbd364ad9357a7e5b0cd4e88aaeaf1df Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 17 Dec 2018 10:43:49 -0500 Subject: [PATCH 010/494] TDL [codegen]: added basic structure --- CMakeLists.txt | 2 +- examples/matrix.cpp | 2 +- include/ast.h | 58 ++++++++++++++++++++++++++++++--------------- include/parser.y | 7 +++--- include/scanner.l | 2 -- lib/codegen.cpp | 14 +++++++++++ 6 files changed, 59 insertions(+), 26 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 256bbcf9c..b20429946 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,7 +18,7 @@ if(NOT CMAKE_BUILD_TYPE) endif() # Gather headers for cmake-based IDEs -file( GLOB_RECURSE ALL_SRC *.cpp *.hpp *.h *.py) +file( GLOB_RECURSE ALL_SRC *.cpp *.hpp *.h *.py *.y *.l) add_custom_target( ALL SOURCES ${ALL_SRC} ) # Compiler flags diff --git a/examples/matrix.cpp b/examples/matrix.cpp index cd46cc008..8d74b27b6 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -6,7 +6,7 @@ typedef struct yy_buffer_state * YY_BUFFER_STATE; extern int yyparse(); extern YY_BUFFER_STATE yy_scan_string(const char * str); extern void yy_delete_buffer(YY_BUFFER_STATE buffer); -using ast::translation_unit; +using tdl::ast::translation_unit; extern translation_unit *ast_root; const char src[] = diff --git a/include/ast.h b/include/ast.h index 3e1544264..48e2f7852 100644 --- a/include/ast.h +++ b/include/ast.h @@ -1,8 +1,15 @@ +#ifndef TDL_INCLUDE_AST_H +#define TDL_INCLUDE_AST_H + #include "parser.hpp" #include #include #include +namespace tdl{ + +class module; + namespace ast{ // Enumerations @@ -41,13 +48,18 @@ enum TYPE_T{ }; // AST -class node { }; +class node { +public: + virtual void codegen(module*) { } +}; template class list: public node { public: list(const T& x): values_{x} {} node* append(const T& x) { values_.push_back(x); return this;} + void codegen(module* mod) { for(T x: values_){ x->codegen(mod); } } + const std::list &values() const { return values_; } private: std::list values_; @@ -224,10 +236,10 @@ public: class function_declarator: public declarator{ public: function_declarator(node *args) - : args_((list)args) { } + : args_((list*)args) { } public: - const list args_; + const list* args_; }; class compound_declarator: public declarator{ @@ -261,28 +273,36 @@ public: const node *decl_; }; -class translation_unit: public node{ -public: - translation_unit(node *item) - : decls_(item) { } - - translation_unit *add(node *item) { - decls_.append(item); - return this; - } - -private: - list decls_; -}; - +/* Function definition */ class function_definition: public node{ public: function_definition(node *header, node *body) - : header_((declarator *)header), body_((compound_statement*)body) { } + : header_((function_declarator *)header), body_((compound_statement*)body) { } public: - const declarator *header_; + const function_declarator *header_; const compound_statement *body_; }; +/* Translation Unit */ +class 
translation_unit: public node{ +public: + translation_unit(node *item) + : decls_((list*)item) { } + + translation_unit *add(node *item) { + decls_->append(item); + return this; + } + + void codegen(module* mod); + +private: + list* decls_; +}; + } + +} + +#endif diff --git a/include/parser.y b/include/parser.y index 40153c9c4..73ef31ad6 100644 --- a/include/parser.y +++ b/include/parser.y @@ -1,11 +1,12 @@ %{ +namespace tdl{ namespace ast{ class node; } -using namespace ast; +} +using namespace tdl::ast; #define YYSTYPE node* #include "../include/ast.h" -using namespace ast; extern char* yytext; void yyerror(const char *s); @@ -111,7 +112,7 @@ primary_expression : identifier { $$ = $1; } | constant { $$ = $1; } | STRING_LITERAL { $$ = new string_literal(yytext); } - | '(' unary_expression ')' { $$ = $1; } + | '(' expression ')' { $$ = $1; } ; unary_expression diff --git a/include/scanner.l b/include/scanner.l index 394cca7c4..df730aec8 100644 --- a/include/scanner.l +++ b/include/scanner.l @@ -113,8 +113,6 @@ void count() column += 8 - (column % 8); else column++; - - ECHO; } void yyerror (const char *s) /* Called by yyparse on error */ diff --git a/lib/codegen.cpp b/lib/codegen.cpp index e69de29bb..f3d61d59c 100644 --- a/lib/codegen.cpp +++ b/lib/codegen.cpp @@ -0,0 +1,14 @@ +#include "codegen.h" +#include "ast.h" + +namespace tdl{ + +namespace ast{ + +void translation_unit::codegen(module *mod) +{ decls_->codegen(mod); } + + +} + +} From 97acf52dca13b1e40ff8215ea22bbcb1a0730b76 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 17 Dec 2018 18:38:02 -0500 Subject: [PATCH 011/494] TDL [codegen]: improving class structure --- include/ast.h | 55 +++++++++++++++++++++++++++++++++--------------- include/parser.y | 19 ++++++----------- lib/codegen.cpp | 1 - 3 files changed, 44 insertions(+), 31 deletions(-) diff --git a/include/ast.h b/include/ast.h index 48e2f7852..ecabb3dd0 100644 --- a/include/ast.h +++ b/include/ast.h @@ -6,6 +6,12 @@ #include #include +namespace llvm{ + +class LLVMType; + +} + namespace tdl{ class module; @@ -197,7 +203,8 @@ class no_op: public statement { }; // Types class declarator: public node{ - +public: + virtual llvm::LLVMType llvm_type(TYPE_T spec) const = 0; }; class pointer_declarator: public declarator{ @@ -210,35 +217,33 @@ public: return this; } + llvm::LLVMType llvm_type(TYPE_T spec) const; + private: unsigned order_; }; class tile_declarator: public declarator{ public: - tile_declarator(node *shapes) - : shapes_((list*)(shapes)) { } + tile_declarator(node *decl, node *shapes) + : decl_(decl), shapes_((list*)(shapes)) { } + + llvm::LLVMType llvm_type(TYPE_T spec) const; public: + const node* decl_; const list* shapes_; }; -class parameter: public declarator { -public: - parameter(TYPE_T spec, node *decl) - : spec_(spec), decl_(decl) { } - -public: - const TYPE_T spec_; - const node *decl_; -}; - class function_declarator: public declarator{ public: - function_declarator(node *args) - : args_((list*)args) { } + function_declarator(node *decl, node *args) + : decl_(decl), args_((list*)args) { } + + llvm::LLVMType llvm_type(TYPE_T spec) const; public: + const node* decl_; const list* args_; }; @@ -247,6 +252,8 @@ public: compound_declarator(node *ptr, node *tile) : ptr_(ptr), tile_(tile) { } + llvm::LLVMType llvm_type(TYPE_T spec) const; + public: const node *ptr_; const node *tile_; @@ -257,11 +264,24 @@ public: init_declarator(node *decl, node *initializer) : decl_(decl), initializer_(initializer){ } + llvm::LLVMType llvm_type(TYPE_T spec) const; + 
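  // like the other declarator nodes, init_declarator lowers itself to an
  // LLVM type starting from the base type named by the declaration specifier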
public: const node *decl_; const node *initializer_; }; +class parameter: public node { +public: + parameter(TYPE_T spec, node *decl) + : spec_(spec), decl_(decl) { } + + llvm::LLVMType* llvm_type() const; + +public: + const TYPE_T spec_; + const node *decl_; +}; class type: public node{ public: @@ -276,10 +296,11 @@ public: /* Function definition */ class function_definition: public node{ public: - function_definition(node *header, node *body) - : header_((function_declarator *)header), body_((compound_statement*)body) { } + function_definition(TYPE_T spec, node *header, node *body) + : spec_(spec), header_((function_declarator *)header), body_((compound_statement*)body) { } public: + const TYPE_T spec_; const function_declarator *header_; const compound_statement *body_; }; diff --git a/include/parser.y b/include/parser.y index 73ef31ad6..50e89ce3f 100644 --- a/include/parser.y +++ b/include/parser.y @@ -84,7 +84,7 @@ abstract_declarator ; direct_abstract_declarator - : '[' constant_list ']' { $$ = new tile_declarator($1); } + : '[' constant_list ']' { $$ = new tile_declarator(nullptr, $1); } constant : CONSTANT { $$ = new constant(atoi(yytext)); } @@ -279,17 +279,12 @@ iteration_statement direct_declarator : identifier { $$ = $1; } - | direct_declarator '[' constant_list ']' { $$ = new tile_declarator($2); } - | direct_declarator '(' parameter_list ')' { $$ = new function_declarator($2); } - | direct_declarator '(' identifier_list ')' { $$ = new function_declarator($2); } - | direct_declarator '(' ')' { $$ = new function_declarator(nullptr); } - ; - -identifier_list - : identifier { $$ = new list((identifier*)$1); } - | identifier_list ',' identifier { $$ = append_ptr_list($1, $2); } + | identifier '[' constant_list ']' { $$ = new tile_declarator($1, $2); } + | identifier '(' parameter_list ')' { $$ = new function_declarator($1, $2); } + | identifier '(' ')' { $$ = new function_declarator($1, nullptr); } ; + parameter_list : parameter_declaration { $$ = new list((parameter*)$1); } | parameter_list ',' parameter_declaration { $$ = append_ptr_list($1, $2); } @@ -298,7 +293,6 @@ parameter_list parameter_declaration : declaration_specifiers declarator { $$ = new parameter(get_type_spec($1), $2); } | declaration_specifiers abstract_declarator { $$ = new parameter(get_type_spec($1), $2); } - | declaration_specifiers { $$ = new parameter(get_type_spec($1), nullptr); } ; @@ -346,7 +340,6 @@ external_declaration ; function_definition - : declaration_specifiers declarator compound_statement - | declarator compound_statement { $$ = new function_definition($1, $2); } + : type_specifier declarator compound_statement { $$ = new function_definition(get_type_spec($1), $2, $3); } ; diff --git a/lib/codegen.cpp b/lib/codegen.cpp index f3d61d59c..7372cc272 100644 --- a/lib/codegen.cpp +++ b/lib/codegen.cpp @@ -1,4 +1,3 @@ -#include "codegen.h" #include "ast.h" namespace tdl{ From 176a437b2152f19e2c7ecba5afa374230a8a38a7 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 18 Dec 2018 23:02:28 -0500 Subject: [PATCH 012/494] [Code generation] bugfixes in type logic --- examples/matrix.cpp | 6 +- include/ast.h | 173 ++++++++++++++++++++++++++------------------ include/parser.y | 54 +++++++------- lib/codegen.cpp | 102 +++++++++++++++++++++++++- 4 files changed, 236 insertions(+), 99 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 8d74b27b6..a7f0fa7b6 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -1,6 +1,7 @@ #include #include #include "ast.h" +#include 
"codegen.h" typedef struct yy_buffer_state * YY_BUFFER_STATE; extern int yyparse(); @@ -11,7 +12,7 @@ extern translation_unit *ast_root; const char src[] = "\ -void test(int32 id){\ +void test(fp32 *A, fp32 *B, fp32 *C){\ fp32 c[16, 16] = {0};\ int32 i = 0;\ i += 1;\ @@ -23,5 +24,8 @@ int main() { yyparse(); yy_delete_buffer(buffer); translation_unit *program = ast_root; + tdl::context context; + tdl::module module("matrix", &context); + program->codegen(&module); return 0; } diff --git a/include/ast.h b/include/ast.h index ecabb3dd0..a01c20fcc 100644 --- a/include/ast.h +++ b/include/ast.h @@ -8,7 +8,9 @@ namespace llvm{ -class LLVMType; +class Function; +class Value; +class Type; } @@ -63,9 +65,17 @@ template class list: public node { public: list(const T& x): values_{x} {} - node* append(const T& x) { values_.push_back(x); return this;} - void codegen(module* mod) { for(T x: values_){ x->codegen(mod); } } - const std::list &values() const { return values_; } + + node* append(const T& x){ + values_.push_back(x); + return this; + } + + void codegen(module* mod) + { for(T x: values_){ x->codegen(mod); } } + + const std::list &values() const + { return values_; } private: std::list values_; @@ -91,13 +101,6 @@ private: const int value_; }; -class identifier: public node{ -public: - identifier(char *&name): name_(name) { } - -private: - std::string name_; -}; class string_literal: public node{ public: @@ -202,106 +205,138 @@ private: class no_op: public statement { }; // Types -class declarator: public node{ + +class declaration_specifier: public node{ public: - virtual llvm::LLVMType llvm_type(TYPE_T spec) const = 0; + declaration_specifier(TYPE_T spec) + : spec_(spec) { } + + llvm::Type* type(module *mod) const; + +private: + const TYPE_T spec_; }; -class pointer_declarator: public declarator{ +class declarator; +class parameter: public node { public: - pointer_declarator(unsigned order) - : order_(order) { } + parameter(node *spec, node *decl) + : spec_((declaration_specifier*)spec), + decl_((declarator*)decl) { } - pointer_declarator *inc(){ - order_ += 1; + llvm::Type* type(module *mod) const; + +public: + const declaration_specifier *spec_; + const declarator *decl_; +}; + +/* Declarators */ +class pointer; +class identifier; + +class declarator: public node{ + virtual llvm::Type* type_impl(module*mod, llvm::Type *type) const = 0; + +public: + declarator(node *lhs) + : lhs_((declarator*)lhs), ptr_(nullptr){ } + + llvm::Type* type(module*mod, llvm::Type *type) const; + + const identifier* id() const { + return (const identifier*)lhs_; + } + + declarator *set_ptr(node *ptr){ + ptr_ = (pointer*)ptr; return this; } - llvm::LLVMType llvm_type(TYPE_T spec) const; - -private: - unsigned order_; +protected: + declarator *lhs_; + pointer *ptr_; }; -class tile_declarator: public declarator{ -public: - tile_declarator(node *decl, node *shapes) - : decl_(decl), shapes_((list*)(shapes)) { } - - llvm::LLVMType llvm_type(TYPE_T spec) const; +class identifier: public declarator{ + llvm::Type* type_impl(module*mod, llvm::Type *type) const; + +public: + identifier(char *&name): declarator(nullptr), name_(name) { } + const std::string &name() const; + +private: + std::string name_; +}; + +class pointer: public declarator{ +private: + llvm::Type* type_impl(module *mod, llvm::Type *type) const; + +public: + pointer(node *id): declarator(id) { } +}; + +class tile: public declarator{ +private: + llvm::Type* type_impl(module *mod, llvm::Type *type) const; + +public: + tile(node *id, node *shapes) + : 
declarator(id), shapes_((list*)(shapes)) { } public: - const node* decl_; const list* shapes_; }; -class function_declarator: public declarator{ -public: - function_declarator(node *decl, node *args) - : decl_(decl), args_((list*)args) { } - - llvm::LLVMType llvm_type(TYPE_T spec) const; +class function: public declarator{ +private: + llvm::Type* type_impl(module *mod, llvm::Type *type) const; public: - const node* decl_; - const list* args_; + function(node *id, node *args) + : declarator(id), args_((list*)args) { } + +public: + const list* args_; }; -class compound_declarator: public declarator{ -public: - compound_declarator(node *ptr, node *tile) - : ptr_(ptr), tile_(tile) { } - llvm::LLVMType llvm_type(TYPE_T spec) const; +class initializer : public declarator{ +private: + llvm::Type* type_impl(module* mod, llvm::Type *type) const; public: - const node *ptr_; - const node *tile_; -}; + initializer(node *id, node *initializer) + : declarator(id), initializer_(initializer){ } -class init_declarator : public declarator{ -public: - init_declarator(node *decl, node *initializer) - : decl_(decl), initializer_(initializer){ } - - llvm::LLVMType llvm_type(TYPE_T spec) const; public: - const node *decl_; const node *initializer_; }; -class parameter: public node { -public: - parameter(TYPE_T spec, node *decl) - : spec_(spec), decl_(decl) { } - - llvm::LLVMType* llvm_type() const; - -public: - const TYPE_T spec_; - const node *decl_; -}; class type: public node{ public: type(TYPE_T spec, node * decl) - : spec_(spec), decl_(decl) { } + : spec_(spec), decl_((declarator*)decl) { } public: const TYPE_T spec_; - const node *decl_; + const declarator *decl_; }; /* Function definition */ class function_definition: public node{ public: - function_definition(TYPE_T spec, node *header, node *body) - : spec_(spec), header_((function_declarator *)header), body_((compound_statement*)body) { } + function_definition(node *spec, node *header, node *body) + : spec_((declaration_specifier*)spec), header_((function *)header), body_((compound_statement*)body) { } + + void codegen(module* mod); public: - const TYPE_T spec_; - const function_declarator *header_; + const declaration_specifier *spec_; + const function *header_; const compound_statement *body_; }; diff --git a/include/parser.y b/include/parser.y index 50e89ce3f..65329182c 100644 --- a/include/parser.y +++ b/include/parser.y @@ -74,17 +74,17 @@ type_specifier ; pointer - : '*' { $$ = new pointer_declarator(1); } - | '*' pointer { $$ = ((pointer_declarator*)$1)->inc(); } + : '*' { $$ = new pointer(nullptr); } + | '*' pointer { $$ = new pointer($1); } abstract_declarator : pointer { $$ = $1; } + | pointer direct_abstract_declarator { $$ = ((declarator*)$2)->set_ptr($1); } | direct_abstract_declarator { $$ = $1; } - | pointer direct_abstract_declarator { $$ = new compound_declarator($1, $2); } ; direct_abstract_declarator - : '[' constant_list ']' { $$ = new tile_declarator(nullptr, $1); } + : '[' constant_list ']' { $$ = new tile(nullptr, $1); } constant : CONSTANT { $$ = new constant(atoi(yytext)); } @@ -241,9 +241,9 @@ statement compound_statement : '{' '}' { $$ = new compound_statement(nullptr, nullptr); } - | '{' statement_list '}' { $$ = new compound_statement(nullptr, $1); } - | '{' declaration_list '}' { $$ = new compound_statement($1, nullptr); } - | '{' declaration_list statement_list '}' { $$ = new compound_statement($1, $2);} + | '{' statement_list '}' { $$ = new compound_statement(nullptr, $2); } + | '{' declaration_list '}' { $$ = new 
compound_statement($2, nullptr); } + | '{' declaration_list statement_list '}' { $$ = new compound_statement($2, $3);} ; @@ -262,13 +262,13 @@ expression_statement ; selection_statement - : IF '(' expression ')' statement { $$ = new selection_statement($1, $2); } - | IF '(' expression ')' statement ELSE statement { $$ = new selection_statement($1, $2, $3); } + : IF '(' expression ')' statement { $$ = new selection_statement($1, $3); } + | IF '(' expression ')' statement ELSE statement { $$ = new selection_statement($1, $3, $5); } ; iteration_statement - : FOR '(' expression_statement expression_statement ')' statement { $$ = new iteration_statement($1, $2, NULL, $3); } - | FOR '(' expression_statement expression_statement expression ')' statement { $$ = new iteration_statement($1, $2, $3, $3); } + : FOR '(' expression_statement expression_statement ')' statement { $$ = new iteration_statement($1, $3, NULL, $4); } + | FOR '(' expression_statement expression_statement expression ')' statement { $$ = new iteration_statement($1, $3, $4, $5); } ; @@ -279,30 +279,30 @@ iteration_statement direct_declarator : identifier { $$ = $1; } - | identifier '[' constant_list ']' { $$ = new tile_declarator($1, $2); } - | identifier '(' parameter_list ')' { $$ = new function_declarator($1, $2); } - | identifier '(' ')' { $$ = new function_declarator($1, nullptr); } - ; + | identifier '[' constant_list ']' { $$ = new tile($1, $3); } + | identifier '(' parameter_list ')' { $$ = new function($1, $3); } + | identifier '(' ')' { $$ = new function($1, nullptr); } + ; parameter_list : parameter_declaration { $$ = new list((parameter*)$1); } - | parameter_list ',' parameter_declaration { $$ = append_ptr_list($1, $2); } + | parameter_list ',' parameter_declaration { $$ = append_ptr_list($1, $3); } ; parameter_declaration - : declaration_specifiers declarator { $$ = new parameter(get_type_spec($1), $2); } - | declaration_specifiers abstract_declarator { $$ = new parameter(get_type_spec($1), $2); } + : declaration_specifiers declarator { $$ = new parameter($1, $2); } + | declaration_specifiers abstract_declarator { $$ = new parameter($1, $2); } ; declaration_specifiers - : type_specifier { $$ = $1; } + : type_specifier { $$ = new declaration_specifier(get_type_spec($1)); } ; init_declarator_list - : init_declarator { $$ = new list((init_declarator*)$1); } - | init_declarator_list ',' init_declarator { $$ = append_ptr_list($1, $2); } + : init_declarator { $$ = new list((initializer*)$1); } + | init_declarator_list ',' init_declarator { $$ = append_ptr_list($1, $3); } ; declaration @@ -311,18 +311,18 @@ declaration ; declarator - : pointer direct_declarator { $$ = new compound_declarator($1, $2); } + : pointer direct_declarator { $$ = ((declarator*)$2)->set_ptr($1); } | direct_declarator { $$ = $1; } ; initializer : assignment_expression { $$ = $1; } - | '{' constant '}' { $$ = $1; } + | '{' constant '}' { $$ = $2; } ; init_declarator - : declarator { $$ = new init_declarator($1, nullptr); } - | declarator '=' initializer { $$ = new init_declarator($1, $2); } + : declarator { $$ = new initializer($1, nullptr); } + | declarator '=' initializer { $$ = new initializer($1, $3); } ; /* -------------------------- */ @@ -330,7 +330,7 @@ init_declarator /* -------------------------- */ translation_unit - : external_declaration { $$ = new translation_unit($1); } + : external_declaration { ast_root = new translation_unit($1); $$ = ast_root; } | translation_unit external_declaration { $$ = ((translation_unit*)($1))->add($2); } ; @@ 
-340,6 +340,6 @@ external_declaration ; function_definition - : type_specifier declarator compound_statement { $$ = new function_definition(get_type_spec($1), $2, $3); } + : declaration_specifiers declarator compound_statement { $$ = new function_definition($1, $2, $3); } ; diff --git a/lib/codegen.cpp b/lib/codegen.cpp index 7372cc272..e5efd686b 100644 --- a/lib/codegen.cpp +++ b/lib/codegen.cpp @@ -1,12 +1,110 @@ #include "ast.h" +#include "codegen.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Module.h" + +using namespace llvm; namespace tdl{ +/* Context */ +context::context() { } + +LLVMContext *context::handle() { + return &handle_; +} + +/* Module */ +module::module(const std::string &name, context *ctx) + : handle_(name.c_str(), *ctx->handle()), builder_(*ctx->handle()) { +} + +llvm::Module* module::handle() { + return &handle_; +} + +llvm::IRBuilder<>& module::builder() { + return builder_; +} + + namespace ast{ -void translation_unit::codegen(module *mod) -{ decls_->codegen(mod); } +/* Translation unit */ +void translation_unit::codegen(module *mod){ + decls_->codegen(mod); +} +/* Declaration specifier */ +Type* declaration_specifier::type(module *mod) const { + LLVMContext &ctx = mod->handle()->getContext(); + switch (spec_) { + case VOID_T: return Type::getVoidTy(ctx); + case INT8_T: return IntegerType::get(ctx, 8); + case INT16_T: return IntegerType::get(ctx, 16); + case INT32_T: return IntegerType::get(ctx, 32); + case INT64_T: return IntegerType::get(ctx, 64); + case FLOAT32_T: return Type::getFloatTy(ctx); + case FLOAT64_T: return Type::getDoubleTy(ctx); + default: assert(false && "unreachable"); throw; + } +} + +/* Parameter */ +Type* parameter::type(module *mod) const { + return decl_->type(mod, spec_->type(mod)); +} + +/* Declarators */ +Type* declarator::type(module *mod, Type *type) const{ + if(ptr_) + return type_impl(mod, ptr_->type(mod, type)); + return type_impl(mod, type); +} + +// Identifier +Type* identifier::type_impl(module *, Type *type) const{ + return type; +} + +const std::string &identifier::name() const{ + return name_; +} + + +// Tile +Type* tile::type_impl(module*, Type *type) const{ + return TileType::get(type, shapes_->values().size()); +} + +// Initializer +Type* initializer::type_impl(module *, Type *type) const{ + return type; +} + +// Pointer +Type* pointer::type_impl(module*, Type *type) const{ + return PointerType::get(type, 1); +} + +// Function +Type* function::type_impl(module*mod, Type *type) const{ + SmallVector types; + for(parameter* param: args_->values()){ + types.push_back(param->type(mod)); + } + return FunctionType::get(type, types, false); +} + +/* Function definition */ +void function_definition::codegen(module *mod){ + llvm::FunctionType *prototype = (llvm::FunctionType *)header_->type(mod, spec_->type(mod)); + const std::string &name = header_->id()->name(); + llvm::Function *fn = llvm::Function::Create(prototype, llvm::Function::ExternalLinkage, name, mod->handle()); + llvm::BasicBlock::Create(mod->handle()->getContext(), "entry", fn); + mod->builder().SetInsertPoint(); + +} } From 951e9733ea6278f4cd14e49d85c3f9c4c9803294 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 18 Dec 2018 23:04:02 -0500 Subject: [PATCH 013/494] [Code generation] added missing file --- include/codegen.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 include/codegen.h diff --git a/include/codegen.h b/include/codegen.h new file mode 100644 index 000000000..02bb158ed --- /dev/null +++ 
b/include/codegen.h @@ -0,0 +1,29 @@ +#include "ast.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/IRBuilder.h" + +namespace tdl +{ + +class context { +public: + context(); + llvm::LLVMContext* handle(); + +private: + llvm::LLVMContext handle_; +}; + +class module { +public: + module(const std::string &name, context *ctx); + llvm::Module* handle(); + llvm::IRBuilder<>& builder(); + +private: + llvm::Module handle_; + llvm::IRBuilder<> builder_; +}; + + +} From 9247ed3714abe9ec88e211960ad4200f4459d570 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 19 Dec 2018 11:25:29 -0500 Subject: [PATCH 014/494] [Code generation] --- include/ast.h | 26 +++++++++++++++++--------- lib/codegen.cpp | 37 +++++++++++++++++++++++++++---------- 2 files changed, 44 insertions(+), 19 deletions(-) diff --git a/include/ast.h b/include/ast.h index a01c20fcc..da14081ab 100644 --- a/include/ast.h +++ b/include/ast.h @@ -58,7 +58,7 @@ enum TYPE_T{ // AST class node { public: - virtual void codegen(module*) { } + virtual void codegen(module*) const { } }; template @@ -71,7 +71,7 @@ public: return this; } - void codegen(module* mod) + void codegen(module* mod) const { for(T x: values_){ x->codegen(mod); } } const std::list &values() const @@ -155,14 +155,17 @@ class statement: public node{ }; +class initializer; class declaration: public node{ public: declaration(node *spec, node *init) - : spec_(spec), init_(init) { } + : spec_(spec), init_((list*)init) { } + + void codegen(module* mod) const; public: const node *spec_; - const node *init_; + const list *init_; }; @@ -174,6 +177,8 @@ public: compound_statement(node* decls, node* statements) : decls_((declarations_t)decls), statements_((statements_t)statements) {} + virtual void codegen(module* mod) const; + private: declarations_t decls_; statements_t statements_; @@ -307,12 +312,15 @@ private: llvm::Type* type_impl(module* mod, llvm::Type *type) const; public: - initializer(node *id, node *initializer) - : declarator(id), initializer_(initializer){ } + initializer(node *decl, node *init) + : declarator((node*)((declarator*)decl)->id()), + decl_((declarator*)decl), init_(init){ } + void codegen(module *) const; public: - const node *initializer_; + const declarator *decl_; + const node *init_; }; @@ -332,7 +340,7 @@ public: function_definition(node *spec, node *header, node *body) : spec_((declaration_specifier*)spec), header_((function *)header), body_((compound_statement*)body) { } - void codegen(module* mod); + void codegen(module* mod) const; public: const declaration_specifier *spec_; @@ -351,7 +359,7 @@ public: return this; } - void codegen(module* mod); + void codegen(module* mod) const; private: list* decls_; diff --git a/lib/codegen.cpp b/lib/codegen.cpp index e5efd686b..6fd528120 100644 --- a/lib/codegen.cpp +++ b/lib/codegen.cpp @@ -31,7 +31,7 @@ llvm::IRBuilder<>& module::builder() { namespace ast{ /* Translation unit */ -void translation_unit::codegen(module *mod){ +void translation_unit::codegen(module *mod) const{ decls_->codegen(mod); } @@ -78,8 +78,8 @@ Type* tile::type_impl(module*, Type *type) const{ } // Initializer -Type* initializer::type_impl(module *, Type *type) const{ - return type; +Type* initializer::type_impl(module *mod, Type *type) const{ + return decl_->type(mod, type); } // Pointer @@ -97,14 +97,31 @@ Type* function::type_impl(module*mod, Type *type) const{ } /* Function definition */ -void function_definition::codegen(module *mod){ - llvm::FunctionType *prototype = (llvm::FunctionType *)header_->type(mod, 
spec_->type(mod)); +void function_definition::codegen(module *mod) const{ + FunctionType *prototype = (FunctionType *)header_->type(mod, spec_->type(mod)); const std::string &name = header_->id()->name(); - llvm::Function *fn = llvm::Function::Create(prototype, llvm::Function::ExternalLinkage, name, mod->handle()); - llvm::BasicBlock::Create(mod->handle()->getContext(), "entry", fn); - mod->builder().SetInsertPoint(); - -} + Function *fn = Function::Create(prototype, Function::ExternalLinkage, name, mod->handle()); + BasicBlock *entry = BasicBlock::Create(mod->handle()->getContext(), "entry", fn); + mod->builder().SetInsertPoint(entry); + body_->codegen(mod); +} + +/* Statements */ +void compound_statement::codegen(module* mod) const{ + decls_->codegen(mod); + statements_->codegen(mod); +} + +/* Declaration */ +void declaration::codegen(module* mod) const{ + +} + +/* Initializat */ +void initializer::codegen(module *) const{ + +} + } From eab275dc99f361b5b4f96b50fffac5af40e274d6 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 20 Dec 2018 10:32:07 -0500 Subject: [PATCH 015/494] [Code generation] Added skeleton for expressions generation --- examples/matrix.cpp | 2 - include/ast.h | 142 +++++++++++++++-------- include/codegen.h | 4 + include/parser.y | 19 ++-- lib/codegen.cpp | 270 ++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 374 insertions(+), 63 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index a7f0fa7b6..0bc553849 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -13,9 +13,7 @@ extern translation_unit *ast_root; const char src[] = "\ void test(fp32 *A, fp32 *B, fp32 *C){\ - fp32 c[16, 16] = {0};\ int32 i = 0;\ - i += 1;\ }\ "; diff --git a/include/ast.h b/include/ast.h index da14081ab..04590a800 100644 --- a/include/ast.h +++ b/include/ast.h @@ -2,8 +2,9 @@ #define TDL_INCLUDE_AST_H #include "parser.hpp" +#include "llvm/IR/IRBuilder.h" #include -#include +#include #include namespace llvm{ @@ -58,7 +59,7 @@ enum TYPE_T{ // AST class node { public: - virtual void codegen(module*) const { } + virtual llvm::Value* codegen(module*) const { return nullptr; } }; template @@ -71,100 +72,146 @@ public: return this; } - void codegen(module* mod) const - { for(T x: values_){ x->codegen(mod); } } + llvm::Value* codegen(module* mod) const{ + for(T x: values_){ + x->codegen(mod); + } + return nullptr; + } - const std::list &values() const + const std::vector &values() const { return values_; } private: - std::list values_; + std::vector values_; }; -class binary_operator: public node{ +class expression: public node{ +public: + virtual llvm::Value* codegen(module *) const = 0; +}; + +class binary_operator: public expression{ +private: + llvm::Value* llvm_op(llvm::IRBuilder<> &bld, llvm::Value *lhs, llvm::Value *rhs, const std::string &name) const; + public: binary_operator(BIN_OP_T op, node *lhs, node *rhs) - : op_(op), lhs_(lhs), rhs_(rhs) { } + : op_(op), lhs_((expression*)lhs), rhs_((expression*)rhs) { } + llvm::Value* codegen(module *) const; private: const BIN_OP_T op_; - const node *lhs_; - const node *rhs_; + const expression *lhs_; + const expression *rhs_; }; -class constant: public node{ +class constant: public expression{ public: constant(int value): value_(value) { } + llvm::Value* codegen(module *mod) const; private: const int value_; }; -class string_literal: public node{ +class string_literal: public expression{ public: string_literal(char *&value): value_(value) { } + llvm::Value* codegen(module *mod) const; public: 
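  // literal text, captured verbatim from the scanner (yytext)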
std::string value_; }; -class unary_operator: public node{ +class unary_operator: public expression{ +private: + llvm::Value *llvm_op(llvm::IRBuilder<> &builder, llvm::Value *arg, const std::string &name) const; + public: unary_operator(UNARY_OP_T op, node *arg) - : op_(op), arg_(arg) { } + : op_(op), + arg_((expression*)arg) { } + + llvm::Value* codegen(module *mod) const; private: const UNARY_OP_T op_; - const node *arg_; + const expression *arg_; }; -class cast_operator: public node{ -public: - cast_operator(node *type, node *arg): type_(type), arg_(arg) { } +class type_name; +class cast_operator: public expression{ +private: + llvm::Value *llvm_op(llvm::IRBuilder<> &builder, llvm::Type *T, llvm::Value *arg, const std::string &name) const; public: - const node *type_; - const node *arg_; + cast_operator(node *T, node *arg): + T_((type_name*)T), + arg_((expression*)arg) { } + + llvm::Value* codegen(module *mod) const; + +public: + const type_name *T_; + const expression *arg_; }; -class conditional_expression: public node{ +class conditional_expression: public expression{ +private: + llvm::Value *llvm_op(llvm::IRBuilder<> &builder, + llvm::Value *cond, llvm::Value *true_value, llvm::Value *false_value, + const std::string &name) const; + public: conditional_expression(node *cond, node *true_value, node *false_value) - : cond_(cond), true_value_(true_value), false_value_(false_value) { } + : cond_((expression*)cond), + true_value_((expression*)true_value), + false_value_((expression*)false_value) { } + + llvm::Value* codegen(module *mod) const; public: - const node *cond_; - const node *true_value_; - const node *false_value_; + const expression *cond_; + const expression *true_value_; + const expression *false_value_; }; -class assignment_expression: public node{ +class assignment_expression: public expression{ +private: + llvm::Value *llvm_op(llvm::IRBuilder<> &builder, + llvm::Value *lvalue, llvm::Value *rvalue, + const std::string &name) const; + public: assignment_expression(node *lvalue, ASSIGN_OP_T op, node *rvalue) - : lvalue_(lvalue), op_(op), rvalue_(rvalue) { } + : lvalue_((expression*)lvalue), op_(op), rvalue_((expression*)rvalue) { } + + llvm::Value* codegen(module *mod) const; public: ASSIGN_OP_T op_; - const node *lvalue_; - const node *rvalue_; + const expression *lvalue_; + const expression *rvalue_; }; class statement: public node{ - }; class initializer; +class declaration_specifier; + class declaration: public node{ public: declaration(node *spec, node *init) - : spec_(spec), init_((list*)init) { } + : spec_((declaration_specifier*)spec), init_((list*)init) { } - void codegen(module* mod) const; + llvm::Value* codegen(module* mod) const; public: - const node *spec_; + const declaration_specifier *spec_; const list *init_; }; @@ -177,7 +224,7 @@ public: compound_statement(node* decls, node* statements) : decls_((declarations_t)decls), statements_((statements_t)statements) {} - virtual void codegen(module* mod) const; + llvm::Value* codegen(module* mod) const; private: declarations_t decls_; @@ -230,6 +277,7 @@ public: decl_((declarator*)decl) { } llvm::Type* type(module *mod) const; + std::string name() const; public: const declaration_specifier *spec_; @@ -267,7 +315,7 @@ class identifier: public declarator{ llvm::Type* type_impl(module*mod, llvm::Type *type) const; public: - identifier(char *&name): declarator(nullptr), name_(name) { } + identifier(char *&name): declarator(this), name_(name) { } const std::string &name() const; private: @@ -302,6 +350,8 @@ public: 
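  // declarator for a full function prototype: wraps the function name
  // (an identifier) together with its typed parameter list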
function(node *id, node *args) : declarator(id), args_((list*)args) { } + void bind_parameters(module *mod, llvm::Function *fn) const; + public: const list* args_; }; @@ -314,23 +364,27 @@ private: public: initializer(node *decl, node *init) : declarator((node*)((declarator*)decl)->id()), - decl_((declarator*)decl), init_(init){ } + decl_((declarator*)decl), init_((expression*)init){ } - void codegen(module *) const; + void specifier(const declaration_specifier *spec); + llvm::Value* codegen(module *) const; public: + const declaration_specifier *spec_; const declarator *decl_; - const node *init_; + const expression *init_; }; -class type: public node{ +class type_name: public node{ public: - type(TYPE_T spec, node * decl) - : spec_(spec), decl_((declarator*)decl) { } + type_name(node *spec, node * decl) + : spec_((declaration_specifier*)spec), decl_((declarator*)decl) { } + + llvm::Type *type(module *mod) const; public: - const TYPE_T spec_; + const declaration_specifier *spec_; const declarator *decl_; }; @@ -340,7 +394,7 @@ public: function_definition(node *spec, node *header, node *body) : spec_((declaration_specifier*)spec), header_((function *)header), body_((compound_statement*)body) { } - void codegen(module* mod) const; + llvm::Value* codegen(module* mod) const; public: const declaration_specifier *spec_; @@ -359,7 +413,7 @@ public: return this; } - void codegen(module* mod) const; + llvm::Value* codegen(module* mod) const; private: list* decls_; diff --git a/include/codegen.h b/include/codegen.h index 02bb158ed..099ad5787 100644 --- a/include/codegen.h +++ b/include/codegen.h @@ -1,3 +1,4 @@ +#include #include "ast.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/IRBuilder.h" @@ -19,10 +20,13 @@ public: module(const std::string &name, context *ctx); llvm::Module* handle(); llvm::IRBuilder<>& builder(); + void value(ast::node* node, llvm::Value* value); + llvm::Value *value(ast::node* node); private: llvm::Module handle_; llvm::IRBuilder<> builder_; + std::unordered_map values_; }; diff --git a/include/parser.y b/include/parser.y index 65329182c..ab6e1f489 100644 --- a/include/parser.y +++ b/include/parser.y @@ -96,8 +96,8 @@ constant_list ; type_name - : type_specifier { $$ = new type(get_type_spec($1), nullptr); } - | type_specifier abstract_declarator { $$ = new type(get_type_spec($1), $2); } + : declaration_specifiers { $$ = new type_name($1, nullptr); } + | declaration_specifiers abstract_declarator { $$ = new type_name($1, $2); } ; /* -------------------------- */ @@ -228,6 +228,13 @@ expression : assignment_expression { $$ = $1; } ; +/* Initialization */ +initialization_expression + : assignment_expression { $$ = $1; } + | '{' constant '}' { $$ = $2; } + ; + + /* -------------------------- */ /* Statements */ /* -------------------------- */ @@ -315,14 +322,10 @@ declarator | direct_declarator { $$ = $1; } ; -initializer - : assignment_expression { $$ = $1; } - | '{' constant '}' { $$ = $2; } - ; - + init_declarator : declarator { $$ = new initializer($1, nullptr); } - | declarator '=' initializer { $$ = new initializer($1, $3); } + | declarator '=' initialization_expression { $$ = new initializer($1, $3); } ; /* -------------------------- */ diff --git a/lib/codegen.cpp b/lib/codegen.cpp index 6fd528120..676b61f2f 100644 --- a/lib/codegen.cpp +++ b/lib/codegen.cpp @@ -1,3 +1,4 @@ +#include #include "ast.h" #include "codegen.h" #include "llvm/IR/DerivedTypes.h" @@ -27,12 +28,21 @@ llvm::IRBuilder<>& module::builder() { return builder_; } +void module::value(ast::node* 
node, llvm::Value* value){ + values_[node] = value; +} + +llvm::Value *module::value(ast::node* node){ + return values_[node]; +} + namespace ast{ /* Translation unit */ -void translation_unit::codegen(module *mod) const{ +Value* translation_unit::codegen(module *mod) const{ decls_->codegen(mod); + return nullptr; } /* Declaration specifier */ @@ -55,6 +65,12 @@ Type* parameter::type(module *mod) const { return decl_->type(mod, spec_->type(mod)); } +std::string parameter::name() const { + if(auto id = decl_->id()) + return id->name(); + return ""; +} + /* Declarators */ Type* declarator::type(module *mod, Type *type) const{ if(ptr_) @@ -88,6 +104,17 @@ Type* pointer::type_impl(module*, Type *type) const{ } // Function +void function::bind_parameters(module *mod, Function *fn) const{ + std::vector args; + std::transform(fn->arg_begin(), fn->arg_end(), std::back_inserter(args), [&](llvm::Argument& x){ return &x;}); + assert(args.size() == args_->values().size()); + for(size_t i = 0; i < args.size(); i++){ + parameter *param_i = args_->values().at(i); + args[i]->setName(param_i->name()); + mod->value(param_i, args[i]); + } +} + Type* function::type_impl(module*mod, Type *type) const{ SmallVector types; for(parameter* param: args_->values()){ @@ -97,31 +124,256 @@ Type* function::type_impl(module*mod, Type *type) const{ } /* Function definition */ -void function_definition::codegen(module *mod) const{ +Value* function_definition::codegen(module *mod) const{ FunctionType *prototype = (FunctionType *)header_->type(mod, spec_->type(mod)); const std::string &name = header_->id()->name(); Function *fn = Function::Create(prototype, Function::ExternalLinkage, name, mod->handle()); + header_->bind_parameters(mod, fn); BasicBlock *entry = BasicBlock::Create(mod->handle()->getContext(), "entry", fn); mod->builder().SetInsertPoint(entry); body_->codegen(mod); + return nullptr; } /* Statements */ -void compound_statement::codegen(module* mod) const{ +Value* compound_statement::codegen(module* mod) const{ decls_->codegen(mod); - statements_->codegen(mod); +// statements_->codegen(mod); + return nullptr; } /* Declaration */ -void declaration::codegen(module* mod) const{ - +Value* declaration::codegen(module* mod) const{ + for(initializer *init: init_->values()) + init->specifier(spec_); + init_->codegen(mod); + return nullptr; } -/* Initializat */ -void initializer::codegen(module *) const{ - +/* Initializer */ +void initializer::specifier(const declaration_specifier *spec) { + spec_ = spec; } +Value* initializer::codegen(module * mod) const{ + Type *ty = decl_->type(mod, spec_->type(mod)); + std::string name = decl_->id()->name(); + Value *value = llvm::UndefValue::get(ty); + value->setName(name); + return nullptr; +} + +/*------------------*/ +/* Expression */ +/*------------------*/ + +/* Binary operator */ +Value *binary_operator::llvm_op(llvm::IRBuilder<> &builder, Value *lhs, Value *rhs, const std::string &name) const +{ + Type *ltype = lhs->getType(); + Type *rtype = rhs->getType(); + bool is_float = ltype->isFloatingPointTy() || rtype->isFloatingPointTy(); + bool is_ptr = ltype->isPointerTy() || rtype->isPointerTy(); + bool is_int = ltype->isIntegerTy() || rtype->isIntegerTy(); + bool is_signed = false; + // Mul + if(op_==MUL && is_float) + return builder.CreateFMul(lhs, rhs, name); + if(op_==MUL && is_int) + return builder.CreateMul(lhs, rhs, name); + // Div + if(op_==DIV && is_float) + return builder.CreateFDiv(lhs, rhs, name); + if(op_==DIV && is_int && is_signed) + return builder.CreateSDiv(lhs, 
rhs, name); + if(op_==DIV && is_int && !is_signed) + return builder.CreateUDiv(lhs, rhs, name); + // Mod + if(op_==MOD && is_float) + return builder.CreateFRem(lhs, rhs, name); + if(op_==MOD && is_int && is_signed) + return builder.CreateSRem(lhs, rhs, name); + if(op_==MOD && is_int && !is_signed) + return builder.CreateURem(lhs, rhs, name); + // Add + if(op_==ADD && is_float) + return builder.CreateFAdd(lhs, rhs, name); + if(op_==ADD && is_int) + return builder.CreateAdd(lhs, rhs); + if(op_==ADD && is_ptr) + return builder.CreateGEP(lhs, {rhs}); + // Sub + if(op_==SUB && is_float) + return builder.CreateFSub(lhs, rhs, name); + if(op_==SUB && is_int) + return builder.CreateSub(lhs, rhs, name); + if(op_==SUB && is_ptr) + return builder.CreateGEP(lhs, {builder.CreateNeg(rhs)}); + // Left shift + if(op_==LEFT_SHIFT){ + assert(is_int); + return builder.CreateLShr(lhs, rhs, name); + } + // Right shift + if(op_==RIGHT_SHIFT){ + assert(is_int); + return builder.CreateAShr(lhs, rhs, name); + } + // LT + if(op_ == LT && is_float) + return builder.CreateFCmpOLT(lhs, rhs, name); + if(op_ == LT && is_int && is_signed) + return builder.CreateICmpSLT(lhs, rhs, name); + if(op_ == LT && is_int && !is_signed) + return builder.CreateICmpULT(lhs, rhs, name); + // GT + if(op_ == GT && is_float) + return builder.CreateFCmpOGT(lhs, rhs, name); + if(op_ == GT && is_int && is_signed) + return builder.CreateICmpSGT(lhs, rhs, name); + if(op_ == GT && is_int && !is_signed) + return builder.CreateICmpUGT(lhs, rhs, name); + // LE + if(op_ == LE && is_float) + return builder.CreateFCmpOLE(lhs, rhs, name); + if(op_ == LE && is_int && is_signed) + return builder.CreateICmpSLE(lhs, rhs, name); + if(op_ == LE && is_int && !is_signed) + return builder.CreateICmpULE(lhs, rhs, name); + // GE + if(op_ == GE && is_float) + return builder.CreateFCmpOGE(lhs, rhs, name); + if(op_ == GE && is_int && is_signed) + return builder.CreateICmpSGE(lhs, rhs, name); + if(op_ == GE && is_int && !is_signed) + return builder.CreateICmpUGE(lhs, rhs, name); + // EQ + if(op_ == EQ && is_float) + return builder.CreateFCmpOEQ(lhs, rhs, name); + if(op_ == EQ && is_int) + return builder.CreateICmpEQ(lhs, rhs, name); + // NE + if(op_ == NE && is_float) + return builder.CreateFCmpONE(lhs, rhs, name); + if(op_ == NE && is_int) + return builder.CreateICmpNE(lhs, rhs, name); + // AND + if(op_ == AND){ + assert(is_int); + return builder.CreateAnd(lhs, rhs, name); + } + if(op_ == XOR){ + assert(is_int); + return builder.CreateXor(lhs, rhs, name); + } + if(op_ == OR){ + assert(is_int); + return builder.CreateOr(lhs, rhs, name); + } + if(op_ == LAND){ + assert(is_int); + return builder.CreateAnd(lhs, rhs, name); + } + if(op_ == LOR){ + assert(is_int); + return builder.CreateOr(lhs, rhs, name); + } + assert(false && "unreachable"); + throw; +} + +Value* binary_operator::codegen(module *mod) const{ + Value *lhs = lhs_->codegen(mod); + Value *rhs = rhs_->codegen(mod); + Value *result = llvm_op(mod->builder(), lhs, rhs, ""); + return result; +} + +/* Unary operator */ +Value *unary_operator::llvm_op(llvm::IRBuilder<> &builder, Value *arg, const std::string &name) const{ + Type *atype = arg->getType(); + bool is_float = atype->isFloatingPointTy(); + bool is_int = atype->isIntegerTy(); + if(op_ == INC){ + assert(is_int); + return builder.CreateAdd(arg, builder.getInt32(1), name); + } + if(op_ == DEC){ + assert(is_int); + return builder.CreateSub(arg, builder.getInt32(1), name); + } + if(op_ == PLUS) + return arg; + if(op_ == MINUS && is_float) + return 
builder.CreateFNeg(arg, name); + if(op_ == MINUS && is_int) + return builder.CreateNeg(arg, name); + if(op_ == ADDR) + throw std::runtime_error("not supported"); + if(op_ == DEREF) + return builder.CreateLoad(arg, name); + if(op_ == COMPL) + throw std::runtime_error("not supported"); + if(op_ == NOT) + return builder.CreateNot(arg, name); + assert(false && "unrechable"); + throw; +} + +Value* unary_operator::codegen(module *mod) const{ + Value *arg = arg_->codegen(mod); + Value *result = llvm_op(mod->builder(), arg, ""); + return result; +} + +/* Cast operator */ +Value *cast_operator::llvm_op(IRBuilder<> &builder, Type *T, Value *arg, const std::string &name) const{ + return nullptr; +} + +Value* cast_operator::codegen(module *mod) const{ + Value *arg = arg_->codegen(mod); + Type *T = T_->type(mod); + return llvm_op(mod->builder(), T, arg, ""); +} + +/* Conditional expression */ +Value *conditional_expression::llvm_op(IRBuilder<> &builder, Value *cond, Value *true_value, Value *false_value, const std::string &name) const{ + return nullptr; +} + +Value *conditional_expression::codegen(module *mod) const{ + Value *cond = cond_->codegen(mod); + Value *true_value = true_value_->codegen(mod); + Value *false_value = false_value_->codegen(mod); + return llvm_op(mod->builder(), cond, true_value, false_value, ""); +} + +/* Assignment expression */ +Value *assignment_expression::llvm_op(llvm::IRBuilder<> &builder, Value *lvalue, Value *rvalue, const std::string &name) const{ + return nullptr; +} + +Value *assignment_expression::codegen(module *mod) const{ + Value *lvalue = lvalue_->codegen(mod); + Value *rvalue = rvalue_->codegen(mod); + return llvm_op(mod->builder(), lvalue, rvalue, ""); +} + +/* Type name */ +llvm::Type *type_name::type(module *mod) const{ + return decl_->type(mod, spec_->type(mod)); +} + +/* String literal */ +llvm::Value* string_literal::codegen(module *mod) const{ + return ConstantDataArray::getString(mod->handle()->getContext(), value_); +} + +/* Constant */ +llvm::Value* constant::codegen(module *mod) const{ + return mod->builder().getInt32(value_); +} } From d06f0fa593633a69d6b19a5417a8d097567790e3 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 22 Dec 2018 11:55:04 -0500 Subject: [PATCH 016/494] [AST] disambiguate named_expression vs identifier --- include/ast.h | 17 ++++++-- include/codegen.h | 6 +-- include/parser.y | 2 +- lib/codegen.cpp | 98 ++++++++++++++++++++++++++++++++++++++++++----- 4 files changed, 106 insertions(+), 17 deletions(-) diff --git a/include/ast.h b/include/ast.h index 04590a800..2d5e46f98 100644 --- a/include/ast.h +++ b/include/ast.h @@ -56,6 +56,9 @@ enum TYPE_T{ FLOAT32_T, FLOAT64_T }; +class pointer; +class identifier; + // AST class node { public: @@ -91,6 +94,15 @@ public: virtual llvm::Value* codegen(module *) const = 0; }; +class named_expression: public expression { +public: + named_expression(node *id): id_((const identifier*)id){} + llvm::Value* codegen(module* mod) const; + +private: + const identifier *id_; +}; + class binary_operator: public expression{ private: llvm::Value* llvm_op(llvm::IRBuilder<> &bld, llvm::Value *lhs, llvm::Value *rhs, const std::string &name) const; @@ -285,9 +297,6 @@ public: }; /* Declarators */ -class pointer; -class identifier; - class declarator: public node{ virtual llvm::Type* type_impl(module*mod, llvm::Type *type) const = 0; @@ -311,7 +320,7 @@ protected: pointer *ptr_; }; -class identifier: public declarator{ +class identifier: public declarator { llvm::Type* type_impl(module*mod, llvm::Type 
*type) const; public: diff --git a/include/codegen.h b/include/codegen.h index 099ad5787..796845582 100644 --- a/include/codegen.h +++ b/include/codegen.h @@ -20,13 +20,13 @@ public: module(const std::string &name, context *ctx); llvm::Module* handle(); llvm::IRBuilder<>& builder(); - void value(ast::node* node, llvm::Value* value); - llvm::Value *value(ast::node* node); + void value(const ast::node* node, llvm::Value* value); + llvm::Value *value(const ast::node *node); private: llvm::Module handle_; llvm::IRBuilder<> builder_; - std::unordered_map values_; + std::unordered_map values_; }; diff --git a/include/parser.y b/include/parser.y index ab6e1f489..b5f4b56a4 100644 --- a/include/parser.y +++ b/include/parser.y @@ -109,7 +109,7 @@ identifier ; primary_expression - : identifier { $$ = $1; } + : identifier { $$ = new named_expression($1); } | constant { $$ = $1; } | STRING_LITERAL { $$ = new string_literal(yytext); } | '(' expression ')' { $$ = $1; } diff --git a/lib/codegen.cpp b/lib/codegen.cpp index 676b61f2f..5d264f1c1 100644 --- a/lib/codegen.cpp +++ b/lib/codegen.cpp @@ -28,11 +28,11 @@ llvm::IRBuilder<>& module::builder() { return builder_; } -void module::value(ast::node* node, llvm::Value* value){ +void module::value(const ast::node* node, llvm::Value* value){ values_[node] = value; } -llvm::Value *module::value(ast::node* node){ +llvm::Value *module::value(const ast::node* node){ return values_[node]; } @@ -87,7 +87,6 @@ const std::string &identifier::name() const{ return name_; } - // Tile Type* tile::type_impl(module*, Type *type) const{ return TileType::get(type, shapes_->values().size()); @@ -166,16 +165,90 @@ Value* initializer::codegen(module * mod) const{ /*------------------*/ /* Expression */ /*------------------*/ +llvm::Value *llvm_cast(llvm::IRBuilder<> &builder, Value *src, Type *dst_ty){ + Type *src_ty = src->getType(); + bool src_signed = false; + bool dst_signed = false; + if(src_ty == dst_ty) + return src; + else if(src_ty->isIntegerTy() && src_signed && dst_ty->isFloatingPointTy()) + return builder.CreateSIToFP(src, dst_ty); + + else if(src_ty->isIntegerTy() && !src_signed && dst_ty->isFloatingPointTy()) + return builder.CreateUIToFP(src, dst_ty); + + else if(src_ty->isFloatingPointTy() && dst_ty->isIntegerTy() && dst_signed) + return builder.CreateFPToSI(src, dst_ty); + + else if(src_ty->isFloatingPointTy() && dst_ty->isIntegerTy() && !dst_signed) + return builder.CreateFPToUI(src, dst_ty); + + else if(src_ty->isFloatingPointTy() && dst_ty->isFloatingPointTy() && + src_ty->getFPMantissaWidth() < dst_ty->getFPMantissaWidth()) + return builder.CreateFPExt(src, dst_ty); + + else if(src_ty->isFloatingPointTy() && dst_ty->isFloatingPointTy() && + src_ty->getFPMantissaWidth() > dst_ty->getFPMantissaWidth()) + return builder.CreateFPTrunc(src, dst_ty); + + else if(src_ty->isIntegerTy() && dst_ty->isIntegerTy() && + src_ty->getIntegerBitWidth()) + return builder.CreateIntCast(src, dst_ty, dst_signed); + + else{ + assert(false && "unreachable"); + throw; + } +} + +inline void implicit_cast(llvm::IRBuilder<> &builder, Value *&lhs, Value *&rhs, + bool &is_float, bool &is_ptr, bool &is_int, bool &is_signed){ + // Input types + Type *left_ty = lhs->getType(); + Type *right_ty = rhs->getType(); + // One operand is pointer + if(left_ty->isPointerTy()){ + is_ptr = true; + } + // One operand is double + else if(left_ty->isDoubleTy() || right_ty->isDoubleTy()){ + Value *&to_convert = left_ty->isDoubleTy()?rhs:lhs; + to_convert = llvm_cast(builder, to_convert, 
builder.getDoubleTy());
+    is_float = true;
+  }
+  // One operand is float
+  else if(left_ty->isFloatTy() || right_ty->isFloatTy()){
+    Value *&to_convert = left_ty->isFloatTy()?rhs:lhs;
+    to_convert = llvm_cast(builder, to_convert, builder.getFloatTy());
+    is_float = true;
+  }
+  // Both operands are integers
+  else if(left_ty->isIntegerTy() && right_ty->isIntegerTy()){
+    is_int = true;
+    is_signed = false;
+    if(left_ty->getIntegerBitWidth() != right_ty->getIntegerBitWidth()){
+      Value *&to_convert = (left_ty->getIntegerBitWidth() > right_ty->getIntegerBitWidth())?rhs:lhs;
+      Type *dst_ty = (to_convert==lhs)?right_ty:left_ty;
+      to_convert = llvm_cast(builder, to_convert, dst_ty);
+    }
+  }
+  // Not reachable
+  else{
+    assert(false);
+    throw;
+  }
+}
+
+//inline void implicit_broadcast(llvm::IRBuilder<> &builder, Value *&lhs, Value *&rhs){
+//  return;
+//}
 
 /* Binary operator */
 Value *binary_operator::llvm_op(llvm::IRBuilder<> &builder, Value *lhs, Value *rhs, const std::string &name) const
 {
-  Type *ltype = lhs->getType();
-  Type *rtype = rhs->getType();
-  bool is_float = ltype->isFloatingPointTy() || rtype->isFloatingPointTy();
-  bool is_ptr = ltype->isPointerTy() || rtype->isPointerTy();
-  bool is_int = ltype->isIntegerTy() || rtype->isIntegerTy();
-  bool is_signed = false;
+  bool is_float, is_ptr, is_int, is_signed;
+  implicit_cast(builder, lhs, rhs, is_float, is_ptr, is_int, is_signed);
+//  implicit_broadcast(builder, lhs, rhs);
   // Mul
   if(op_==MUL && is_float)
     return builder.CreateFMul(lhs, rhs, name);
@@ -357,6 +430,7 @@ Value *assignment_expression::llvm_op(llvm::IRBuilder<> &builder, Value *lvalue,
 Value *assignment_expression::codegen(module *mod) const{
   Value *lvalue = lvalue_->codegen(mod);
   Value *rvalue = rvalue_->codegen(mod);
+  BasicBlock *block = mod->builder().GetInsertBlock();
   return llvm_op(mod->builder(), lvalue, rvalue, "");
 }
 
@@ -375,6 +449,12 @@ llvm::Value* constant::codegen(module *mod) const{
   return mod->builder().getInt32(value_);
 }
 
+/* Named */
+llvm::Value* named_expression::codegen(module *mod) const{
+  return mod->value(id_);
+}
+
 
 }
 
 }

From 91c9ede0213f52a9d7720665041023808b7f0c64 Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Sat, 22 Dec 2018 18:25:03 -0500
Subject: [PATCH 017/494] [Code generation] Adding functions to construct SSA form

---
 include/ast.h     |  2 +-
 include/codegen.h | 18 ++++++++++++----
 lib/codegen.cpp   | 55 +++++++++++++++++++++++++++++++++++++----------
 3 files changed, 59 insertions(+), 16 deletions(-)

diff --git a/include/ast.h b/include/ast.h
index 2d5e46f98..031c974c1 100644
--- a/include/ast.h
+++ b/include/ast.h
@@ -289,7 +289,7 @@ public:
     decl_((declarator*)decl) { }
 
   llvm::Type* type(module *mod) const;
-  std::string name() const;
+  const identifier* id() const;
 
 public:
   const declaration_specifier *spec_;
diff --git a/include/codegen.h b/include/codegen.h
index 796845582..e81c8cd01 100644
--- a/include/codegen.h
+++ b/include/codegen.h
@@ -1,4 +1,5 @@
-#include <unordered_map>
+#include <map>
+#include <set>
 #include "ast.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/IRBuilder.h"
@@ -16,17 +17,26 @@ private:
 };
 
 class module {
+  typedef std::pair<const ast::node*, llvm::BasicBlock*> val_key_t;
+  llvm::Value *get_value_recursive(const ast::node* node, llvm::BasicBlock *block);
 
 public:
   module(const std::string &name, context *ctx);
   llvm::Module* handle();
   llvm::IRBuilder<>& builder();
-  void value(const ast::node* node, llvm::Value* value);
-  llvm::Value *value(const ast::node *node);
+  // Setters
+  void set_value(const ast::node *node, llvm::BasicBlock* block, llvm::Value *value);
+  void 
set_value(const ast::node* node, llvm::Value* value); + // Getters + llvm::Value *get_value(const ast::node *node, llvm::BasicBlock* block); + llvm::Value *get_value(const ast::node *node); private: llvm::Module handle_; llvm::IRBuilder<> builder_; - std::unordered_map values_; + std::map values_; + std::set sealed_blocks_; + std::map incomplete_phis_; }; diff --git a/lib/codegen.cpp b/lib/codegen.cpp index 5d264f1c1..ac23557f0 100644 --- a/lib/codegen.cpp +++ b/lib/codegen.cpp @@ -3,6 +3,7 @@ #include "codegen.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Module.h" +#include "llvm/IR/CFG.h" using namespace llvm; @@ -28,12 +29,43 @@ llvm::IRBuilder<>& module::builder() { return builder_; } -void module::value(const ast::node* node, llvm::Value* value){ - values_[node] = value; +void module::set_value(const ast::node *node, BasicBlock *block, Value *value){ + values_[val_key_t{node, block}] = value; } -llvm::Value *module::value(const ast::node* node){ - return values_[node]; +void module::set_value(const ast::node* node, llvm::Value* value){ + return set_value(node, builder_.GetInsertBlock(), value); +} + +llvm::Value *module::get_value_recursive(const ast::node* node, BasicBlock *block) { + llvm::Value *result; + if(sealed_blocks_.find(block) == sealed_blocks_.end()){ + result = builder_.CreatePHI(nullptr, 1); + incomplete_phis_[val_key_t(node, block)] = (PHINode*)result; + } + else if(pred_size(block) <= 1){ + result = get_value(node, *pred_begin(block)); + } + else{ + result = builder_.CreatePHI(nullptr, 1); + set_value(node, block, result); + for(BasicBlock *pred: predecessors(block)){ + llvm::Value *value = get_value(node, pred); + static_cast(result)->addIncoming(value, pred); + } + } + set_value(node, block, result); +} + +llvm::Value *module::get_value(const ast::node* node, BasicBlock *block) { + val_key_t key(node, block); + if(values_.find(key) != values_.end()) + return values_.at(key); + return get_value_recursive(node, block); +} + +llvm::Value *module::get_value(const ast::node *node) { + return get_value(node, builder_.GetInsertBlock()); } @@ -65,10 +97,8 @@ Type* parameter::type(module *mod) const { return decl_->type(mod, spec_->type(mod)); } -std::string parameter::name() const { - if(auto id = decl_->id()) - return id->name(); - return ""; +const identifier *parameter::id() const { + return decl_->id(); } /* Declarators */ @@ -109,8 +139,11 @@ void function::bind_parameters(module *mod, Function *fn) const{ assert(args.size() == args_->values().size()); for(size_t i = 0; i < args.size(); i++){ parameter *param_i = args_->values().at(i); - args[i]->setName(param_i->name()); - mod->value(param_i, args[i]); + const identifier *id_i = param_i->id(); + if(id_i){ + args[i]->setName(id_i->name()); + mod->set_value(id_i, nullptr, args[i]); + } } } @@ -451,7 +484,7 @@ llvm::Value* constant::codegen(module *mod) const{ /* Named */ llvm::Value* named_expression::codegen(module *mod) const{ - return mod->value(id_); + return mod->get_value(id_); } From 9d3224754e65ae6d81090f986f21665cc9853557 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 22 Dec 2018 21:45:45 -0500 Subject: [PATCH 018/494] [Code generation] Prototype for phi node --- examples/matrix.cpp | 8 ++++++++ include/ast.h | 13 ++++--------- include/codegen.h | 6 +++++- lib/codegen.cpp | 46 ++++++++++++++++++++++++++++++--------------- 4 files changed, 48 insertions(+), 25 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 0bc553849..9bf15ba00 100644 --- a/examples/matrix.cpp +++ 
b/examples/matrix.cpp @@ -2,6 +2,9 @@ #include #include "ast.h" #include "codegen.h" +#include "llvm/IR/IRPrintingPasses.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/PassManager.h" typedef struct yy_buffer_state * YY_BUFFER_STATE; extern int yyparse(); @@ -14,6 +17,8 @@ const char src[] = "\ void test(fp32 *A, fp32 *B, fp32 *C){\ int32 i = 0;\ + int32 j = 1;\ + i = i + j;\ }\ "; @@ -25,5 +30,8 @@ int main() { tdl::context context; tdl::module module("matrix", &context); program->codegen(&module); + llvm::PrintModulePass print(llvm::outs()); + llvm::AnalysisManager analysis; + print.run(*module.handle(), analysis); return 0; } diff --git a/include/ast.h b/include/ast.h index 031c974c1..1359c761b 100644 --- a/include/ast.h +++ b/include/ast.h @@ -192,25 +192,18 @@ public: }; class assignment_expression: public expression{ -private: - llvm::Value *llvm_op(llvm::IRBuilder<> &builder, - llvm::Value *lvalue, llvm::Value *rvalue, - const std::string &name) const; - public: assignment_expression(node *lvalue, ASSIGN_OP_T op, node *rvalue) - : lvalue_((expression*)lvalue), op_(op), rvalue_((expression*)rvalue) { } + : lvalue_((identifier*)lvalue), op_(op), rvalue_((expression*)rvalue) { } llvm::Value* codegen(module *mod) const; public: ASSIGN_OP_T op_; - const expression *lvalue_; + const identifier *lvalue_; const expression *rvalue_; }; -class statement: public node{ -}; class initializer; class declaration_specifier; @@ -227,6 +220,8 @@ public: const list *init_; }; +class statement: public node{ +}; class compound_statement: public statement{ typedef list* declarations_t; diff --git a/include/codegen.h b/include/codegen.h index e81c8cd01..87a6e0f30 100644 --- a/include/codegen.h +++ b/include/codegen.h @@ -18,6 +18,8 @@ private: class module { typedef std::pair val_key_t; + llvm::PHINode *make_phi(llvm::Type *type, unsigned num_values, llvm::BasicBlock *block); + llvm::Value *add_phi_operands(const ast::node *node, llvm::PHINode *&phi); llvm::Value *get_value_recursive(const ast::node* node, llvm::BasicBlock *block); public: @@ -30,13 +32,15 @@ public: // Getters llvm::Value *get_value(const ast::node *node, llvm::BasicBlock* block); llvm::Value *get_value(const ast::node *node); + // Seal block -- no more predecessors will be added + llvm::Value *seal_block(llvm::BasicBlock *block); private: llvm::Module handle_; llvm::IRBuilder<> builder_; std::map values_; std::set sealed_blocks_; - std::map incomplete_phis_; + std::map> incomplete_phis_; }; diff --git a/lib/codegen.cpp b/lib/codegen.cpp index ac23557f0..9657a2c7a 100644 --- a/lib/codegen.cpp +++ b/lib/codegen.cpp @@ -37,22 +37,37 @@ void module::set_value(const ast::node* node, llvm::Value* value){ return set_value(node, builder_.GetInsertBlock(), value); } +PHINode* module::make_phi(Type *type, unsigned num_values, BasicBlock *block){ + llvm::BasicBlock::iterator save = builder_.GetInsertPoint(); + builder_.SetInsertPoint(&*block->getFirstInsertionPt()); + PHINode *res = builder_.CreatePHI(type, num_values); + builder_.SetInsertPoint(&*save); + return res; +} + +Value *module::add_phi_operands(const ast::node *node, PHINode *&phi){ + BasicBlock *block = phi->getParent(); + for(BasicBlock *pred: predecessors(block)){ + llvm::Value *value = get_value(node, pred); + if(phi->getType()==nullptr){ + phi = make_phi(value->getType(), pred_size(block), block); + } + phi->addIncoming(value, pred); + } +} + llvm::Value *module::get_value_recursive(const ast::node* node, BasicBlock *block) { llvm::Value *result; 
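  // Value lookup below follows the on-the-fly SSA-construction scheme of
  // Braun et al., "Simple and Efficient Construction of Static Single
  // Assignment Form" (CC 2013):
  //  - if the block is not sealed yet (predecessors may still be added,
  //    e.g. a loop header still being generated), create an operandless
  //    phi and park it in incomplete_phis_; seal_block() completes it later;
  //  - if the block has a single predecessor, no phi is needed: recurse
  //    into that predecessor;
  //  - otherwise, create a phi and record it as the value *before* adding
  //    its incoming operands, so that cycles in the CFG terminate.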
if(sealed_blocks_.find(block) == sealed_blocks_.end()){ - result = builder_.CreatePHI(nullptr, 1); - incomplete_phis_[val_key_t(node, block)] = (PHINode*)result; + incomplete_phis_[block][node] = make_phi(nullptr, 1, block); } else if(pred_size(block) <= 1){ result = get_value(node, *pred_begin(block)); } else{ - result = builder_.CreatePHI(nullptr, 1); + result = make_phi(nullptr, 1, block); set_value(node, block, result); - for(BasicBlock *pred: predecessors(block)){ - llvm::Value *value = get_value(node, pred); - static_cast(result)->addIncoming(value, pred); - } + add_phi_operands(node, (PHINode*&)result); } set_value(node, block, result); } @@ -68,6 +83,11 @@ llvm::Value *module::get_value(const ast::node *node) { return get_value(node, builder_.GetInsertBlock()); } +llvm::Value *module::seal_block(BasicBlock *block){ + for(auto &x: incomplete_phis_[block]) + add_phi_operands(x.first, x.second); + sealed_blocks_.insert(block); +} namespace ast{ @@ -170,7 +190,8 @@ Value* function_definition::codegen(module *mod) const{ /* Statements */ Value* compound_statement::codegen(module* mod) const{ decls_->codegen(mod); -// statements_->codegen(mod); + if(statements_) + statements_->codegen(mod); return nullptr; } @@ -456,15 +477,10 @@ Value *conditional_expression::codegen(module *mod) const{ } /* Assignment expression */ -Value *assignment_expression::llvm_op(llvm::IRBuilder<> &builder, Value *lvalue, Value *rvalue, const std::string &name) const{ - return nullptr; -} - Value *assignment_expression::codegen(module *mod) const{ - Value *lvalue = lvalue_->codegen(mod); Value *rvalue = rvalue_->codegen(mod); - BasicBlock *block = mod->builder().GetInsertBlock(); - return llvm_op(mod->builder(), lvalue, rvalue, ""); + mod->set_value(lvalue_, rvalue); + return rvalue; } /* Type name */ From 1b8199b82d0bfecff7a6d52b7d76ddc49343b7cd Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 24 Dec 2018 01:04:55 -0500 Subject: [PATCH 019/494] [Code generation] added support for FOR and IF/THEN/ELSE --- examples/matrix.cpp | 10 +++- include/ast.h | 31 ++++++---- include/codegen.h | 16 +++--- include/parser.y | 7 +-- include/scanner.l | 1 + lib/codegen.cpp | 134 ++++++++++++++++++++++++++++++++++---------- 6 files changed, 144 insertions(+), 55 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 9bf15ba00..8fa19a840 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -15,10 +15,16 @@ extern translation_unit *ast_root; const char src[] = "\ -void test(fp32 *A, fp32 *B, fp32 *C){\ - int32 i = 0;\ +void test(fp32 *A, fp32 *B, fp32 *C, int32 i){\ int32 j = 1;\ + int32 k;\ i = i + j;\ + for(k = 0; k < 10; k = k+1){\ + int32 u = 1;\ + u = u + i;\ + if(k == 0)\ + u = u + 2;\ + }\ }\ "; diff --git a/include/ast.h b/include/ast.h index 1359c761b..6b8e48570 100644 --- a/include/ast.h +++ b/include/ast.h @@ -94,15 +94,21 @@ public: virtual llvm::Value* codegen(module *) const = 0; }; -class named_expression: public expression { +class unary_expression: public node{ public: - named_expression(node *id): id_((const identifier*)id){} - llvm::Value* codegen(module* mod) const; + unary_expression(node *id): id_((const identifier*)id) {} + const identifier *id() const; private: const identifier *id_; }; +class named_expression: public unary_expression { +public: + named_expression(node *id): unary_expression(id){ } + llvm::Value* codegen(module* mod) const; +}; + class binary_operator: public expression{ private: llvm::Value* llvm_op(llvm::IRBuilder<> &bld, llvm::Value *lhs, llvm::Value 
*rhs, const std::string &name) const; @@ -194,13 +200,13 @@ public: class assignment_expression: public expression{ public: assignment_expression(node *lvalue, ASSIGN_OP_T op, node *rvalue) - : lvalue_((identifier*)lvalue), op_(op), rvalue_((expression*)rvalue) { } + : lvalue_((unary_expression*)lvalue), op_(op), rvalue_((expression*)rvalue) { } llvm::Value* codegen(module *mod) const; public: ASSIGN_OP_T op_; - const identifier *lvalue_; + const unary_expression *lvalue_; const expression *rvalue_; }; @@ -241,18 +247,23 @@ private: class selection_statement: public statement{ public: selection_statement(node *cond, node *if_value, node *else_value = nullptr) - : cond_(cond), if_value_(if_value), else_value_(else_value) { } + : cond_(cond), then_value_(if_value), else_value_(else_value) { } + + llvm::Value* codegen(module *mod) const; public: const node *cond_; - const node *if_value_; + const node *then_value_; const node *else_value_; }; class iteration_statement: public statement{ public: iteration_statement(node *init, node *stop, node *exec, node *statements) - : init_(init), stop_(stop), exec_(exec), statements_(statements) { } + : init_(init), stop_(stop), exec_(exec), statements_(statements) + { } + + llvm::Value* codegen(module *mod) const; private: const node *init_; @@ -368,7 +379,7 @@ private: public: initializer(node *decl, node *init) : declarator((node*)((declarator*)decl)->id()), - decl_((declarator*)decl), init_((expression*)init){ } + decl_((declarator*)decl), expr_((expression*)init){ } void specifier(const declaration_specifier *spec); llvm::Value* codegen(module *) const; @@ -376,7 +387,7 @@ public: public: const declaration_specifier *spec_; const declarator *decl_; - const expression *init_; + const expression *expr_; }; diff --git a/include/codegen.h b/include/codegen.h index 87a6e0f30..316e3b859 100644 --- a/include/codegen.h +++ b/include/codegen.h @@ -17,21 +17,21 @@ private: }; class module { - typedef std::pair val_key_t; + typedef std::pair val_key_t; llvm::PHINode *make_phi(llvm::Type *type, unsigned num_values, llvm::BasicBlock *block); - llvm::Value *add_phi_operands(const ast::node *node, llvm::PHINode *&phi); - llvm::Value *get_value_recursive(const ast::node* node, llvm::BasicBlock *block); + llvm::Value *add_phi_operands(const std::string& name, llvm::PHINode *&phi); + llvm::Value *get_value_recursive(const std::string& name, llvm::BasicBlock *block); public: module(const std::string &name, context *ctx); llvm::Module* handle(); llvm::IRBuilder<>& builder(); // Setters - void set_value(const ast::node *node, llvm::BasicBlock* block, llvm::Value *value); - void set_value(const ast::node* node, llvm::Value* value); + void set_value(const std::string& name, llvm::BasicBlock* block, llvm::Value *value); + void set_value(const std::string& name, llvm::Value* value); // Getters - llvm::Value *get_value(const ast::node *node, llvm::BasicBlock* block); - llvm::Value *get_value(const ast::node *node); + llvm::Value *get_value(const std::string& name, llvm::BasicBlock* block); + llvm::Value *get_value(const std::string& name); // Seal block -- no more predecessors will be added llvm::Value *seal_block(llvm::BasicBlock *block); @@ -40,7 +40,7 @@ private: llvm::IRBuilder<> builder_; std::map values_; std::set sealed_blocks_; - std::map> incomplete_phis_; + std::map> incomplete_phis_; }; diff --git a/include/parser.y b/include/parser.y index b5f4b56a4..6d1e81936 100644 --- a/include/parser.y +++ b/include/parser.y @@ -269,13 +269,12 @@ expression_statement ; 
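/* Reminder on Bison positional values for the fix below: $n counts every
   right-hand-side symbol, terminals included.  In
       IF '(' expression ')' statement
   the condition is $3 and the body $5; $1 is the IF token itself, which is
   what the old actions were mistakenly passing to the constructors.  The
   iteration_statement rule in the next hunk gets the same correction. */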
selection_statement - : IF '(' expression ')' statement { $$ = new selection_statement($1, $3); } - | IF '(' expression ')' statement ELSE statement { $$ = new selection_statement($1, $3, $5); } + : IF '(' expression ')' statement { $$ = new selection_statement($3, $5); } + | IF '(' expression ')' statement ELSE statement { $$ = new selection_statement($3, $5, $7); } ; iteration_statement - : FOR '(' expression_statement expression_statement ')' statement { $$ = new iteration_statement($1, $3, NULL, $4); } - | FOR '(' expression_statement expression_statement expression ')' statement { $$ = new iteration_statement($1, $3, $4, $5); } + : FOR '(' expression_statement expression_statement expression ')' statement { $$ = new iteration_statement($3, $4, $5, $7); } ; diff --git a/include/scanner.l b/include/scanner.l index df730aec8..5ecf37b1b 100644 --- a/include/scanner.l +++ b/include/scanner.l @@ -113,6 +113,7 @@ void count() column += 8 - (column % 8); else column++; + //ECHO; } void yyerror (const char *s) /* Called by yyparse on error */ diff --git a/lib/codegen.cpp b/lib/codegen.cpp index 9657a2c7a..3c20c1487 100644 --- a/lib/codegen.cpp +++ b/lib/codegen.cpp @@ -4,6 +4,9 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Module.h" #include "llvm/IR/CFG.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include using namespace llvm; @@ -19,6 +22,7 @@ LLVMContext *context::handle() { /* Module */ module::module(const std::string &name, context *ctx) : handle_(name.c_str(), *ctx->handle()), builder_(*ctx->handle()) { + sealed_blocks_.insert(nullptr); } llvm::Module* module::handle() { @@ -29,58 +33,64 @@ llvm::IRBuilder<>& module::builder() { return builder_; } -void module::set_value(const ast::node *node, BasicBlock *block, Value *value){ - values_[val_key_t{node, block}] = value; +void module::set_value(const std::string& name, BasicBlock *block, Value *value){ + values_[val_key_t{name, block}] = value; } -void module::set_value(const ast::node* node, llvm::Value* value){ - return set_value(node, builder_.GetInsertBlock(), value); +void module::set_value(const std::string& name, llvm::Value* value){ + return set_value(name, builder_.GetInsertBlock(), value); } PHINode* module::make_phi(Type *type, unsigned num_values, BasicBlock *block){ - llvm::BasicBlock::iterator save = builder_.GetInsertPoint(); - builder_.SetInsertPoint(&*block->getFirstInsertionPt()); + Instruction* instr = block->getFirstNonPHIOrDbg(); + if(instr) + builder_.SetInsertPoint(instr); PHINode *res = builder_.CreatePHI(type, num_values); - builder_.SetInsertPoint(&*save); + if(instr) + builder_.SetInsertPoint(block); return res; } -Value *module::add_phi_operands(const ast::node *node, PHINode *&phi){ +Value *module::add_phi_operands(const std::string& name, PHINode *&phi){ BasicBlock *block = phi->getParent(); for(BasicBlock *pred: predecessors(block)){ - llvm::Value *value = get_value(node, pred); - if(phi->getType()==nullptr){ - phi = make_phi(value->getType(), pred_size(block), block); - } + llvm::Value *value = get_value(name, pred); phi->addIncoming(value, pred); } + return phi; } -llvm::Value *module::get_value_recursive(const ast::node* node, BasicBlock *block) { +llvm::Value *module::get_value_recursive(const std::string& name, BasicBlock *block) { llvm::Value *result; if(sealed_blocks_.find(block) == sealed_blocks_.end()){ - incomplete_phis_[block][node] = make_phi(nullptr, 1, block); + llvm::Value *pred = get_value(name, *pred_begin(block)); + 
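  // make_phi() needs a concrete type up front, so the prototype peeks at
  // the value reaching one (arbitrary) predecessor and borrows its type;
  // the phi's incoming values are still deferred until seal_block() calls
  // add_phi_operands() on every entry recorded in incomplete_phis_.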
incomplete_phis_[block][name] = make_phi(pred->getType(), 1, block); + result = (Value*)incomplete_phis_[block][name]; } else if(pred_size(block) <= 1){ - result = get_value(node, *pred_begin(block)); + bool has_pred = pred_size(block); + result = get_value(name, has_pred?*pred_begin(block):nullptr); } else{ - result = make_phi(nullptr, 1, block); - set_value(node, block, result); - add_phi_operands(node, (PHINode*&)result); + llvm::Value *pred = get_value(name, *pred_begin(block)); + result = make_phi(pred->getType(), 1, block); + set_value(name, block, result); + add_phi_operands(name, (PHINode*&)result); } - set_value(node, block, result); + set_value(name, block, result); + return result; } -llvm::Value *module::get_value(const ast::node* node, BasicBlock *block) { - val_key_t key(node, block); - if(values_.find(key) != values_.end()) +llvm::Value *module::get_value(const std::string& name, BasicBlock *block) { + val_key_t key(name, block); + if(values_.find(key) != values_.end()){ return values_.at(key); - return get_value_recursive(node, block); + } + return get_value_recursive(name, block); } -llvm::Value *module::get_value(const ast::node *node) { - return get_value(node, builder_.GetInsertBlock()); +llvm::Value *module::get_value(const std::string& name) { + return get_value(name, builder_.GetInsertBlock()); } llvm::Value *module::seal_block(BasicBlock *block){ @@ -162,7 +172,7 @@ void function::bind_parameters(module *mod, Function *fn) const{ const identifier *id_i = param_i->id(); if(id_i){ args[i]->setName(id_i->name()); - mod->set_value(id_i, nullptr, args[i]); + mod->set_value(id_i->name(), nullptr, args[i]); } } } @@ -182,8 +192,10 @@ Value* function_definition::codegen(module *mod) const{ Function *fn = Function::Create(prototype, Function::ExternalLinkage, name, mod->handle()); header_->bind_parameters(mod, fn); BasicBlock *entry = BasicBlock::Create(mod->handle()->getContext(), "entry", fn); + mod->seal_block(entry); mod->builder().SetInsertPoint(entry); body_->codegen(mod); + mod->builder().CreateRetVoid(); return nullptr; } @@ -195,6 +207,55 @@ Value* compound_statement::codegen(module* mod) const{ return nullptr; } +/* Iteration statement */ +Value* iteration_statement::codegen(module *mod) const{ + IRBuilder<> &builder = mod->builder(); + LLVMContext &ctx = mod->handle()->getContext(); + Function *fn = builder.GetInsertBlock()->getParent(); + BasicBlock *loop_bb = BasicBlock::Create(ctx, "loop", fn); + BasicBlock *next_bb = BasicBlock::Create(ctx, "postloop", fn); + init_->codegen(mod); + builder.CreateBr(loop_bb); + builder.SetInsertPoint(loop_bb); + statements_->codegen(mod); + exec_->codegen(mod); + Value *cond = stop_->codegen(mod); + builder.CreateCondBr(cond, loop_bb, next_bb); + builder.SetInsertPoint(next_bb); + mod->seal_block(loop_bb); + mod->seal_block(next_bb); + return nullptr; +} + +/* Selection statement */ +Value* selection_statement::codegen(module* mod) const{ + IRBuilder<> &builder = mod->builder(); + LLVMContext &ctx = mod->handle()->getContext(); + Function *fn = builder.GetInsertBlock()->getParent(); + Value *cond = cond_->codegen(mod); + BasicBlock *then_bb = BasicBlock::Create(ctx, "then", fn); + BasicBlock *else_bb = else_value_?BasicBlock::Create(ctx, "else", fn):nullptr; + BasicBlock *endif_bb = BasicBlock::Create(ctx, "endif", fn); + // Branch + if(else_value_) + builder.CreateCondBr(cond, then_bb, else_bb); + else + builder.CreateCondBr(cond, then_bb, endif_bb); + // Then + builder.SetInsertPoint(then_bb); + then_value_->codegen(mod); + 
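  // The CFG is a diamond: the conditional branch above enters then_bb and,
  // when no else-clause exists, uses endif_bb directly as its false target.
  // Basic blocks never fall through, so each arm must end with an explicit
  // branch to endif_bb -- note that, as written, this prototype only emits
  // that branch for the then-arm when an else-clause is present.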
if(else_value_) + builder.CreateBr(endif_bb); + // Else + if(else_value_){ + builder.SetInsertPoint(else_bb); + else_value_->codegen(mod); + builder.CreateBr(endif_bb); + } + // Endif + builder.SetInsertPoint(endif_bb); +} + /* Declaration */ Value* declaration::codegen(module* mod) const{ for(initializer *init: init_->values()) @@ -211,9 +272,14 @@ void initializer::specifier(const declaration_specifier *spec) { Value* initializer::codegen(module * mod) const{ Type *ty = decl_->type(mod, spec_->type(mod)); std::string name = decl_->id()->name(); - Value *value = llvm::UndefValue::get(ty); + Value *value; + if(expr_) + value = expr_->codegen(mod); + else + value = llvm::UndefValue::get(ty); value->setName(name); - return nullptr; + mod->set_value(name, value); + return value; } /*------------------*/ @@ -300,7 +366,7 @@ inline void implicit_cast(llvm::IRBuilder<> &builder, Value *&lhs, Value *&rhs, /* Binary operator */ Value *binary_operator::llvm_op(llvm::IRBuilder<> &builder, Value *lhs, Value *rhs, const std::string &name) const { - bool is_float, is_ptr, is_int, is_signed; + bool is_float = false, is_ptr = false, is_int = false, is_signed = false; implicit_cast(builder, lhs, rhs, is_float, is_ptr, is_int, is_signed); // implicit_broadcast(builder, lhs, rhs); // Mul @@ -479,7 +545,7 @@ Value *conditional_expression::codegen(module *mod) const{ /* Assignment expression */ Value *assignment_expression::codegen(module *mod) const{ Value *rvalue = rvalue_->codegen(mod); - mod->set_value(lvalue_, rvalue); + mod->set_value(lvalue_->id()->name(), rvalue); return rvalue; } @@ -498,9 +564,15 @@ llvm::Value* constant::codegen(module *mod) const{ return mod->builder().getInt32(value_); } +/* Unary expression */ +const identifier* unary_expression::id() const{ + return id_; +} + /* Named */ llvm::Value* named_expression::codegen(module *mod) const{ - return mod->get_value(id_); + const std::string &name = id()->name(); + return mod->get_value(name); } From 8f9e6a36555ca6fb6fe7e4c46af0dc59535cab75 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 29 Dec 2018 17:06:48 -0500 Subject: [PATCH 020/494] [AST] Adding indexing operations --- include/ast.h | 34 ++++++++++++++++++++++++++++++++++ include/parser.y | 17 +++++++++++++++-- include/scanner.l | 2 +- lib/codegen.cpp | 5 +++++ 4 files changed, 55 insertions(+), 3 deletions(-) diff --git a/include/ast.h b/include/ast.h index 6b8e48570..072644f1e 100644 --- a/include/ast.h +++ b/include/ast.h @@ -89,11 +89,45 @@ private: std::vector values_; }; +enum range_enum_t{ + ALL, + NEWAXIS +}; + +class range: public node{ +public: + range(range_enum_t type) + : type_(type){} + + range_enum_t type() const{ + return type_; + } + +public: + const range_enum_t type_; +}; + class expression: public node{ public: virtual llvm::Value* codegen(module *) const = 0; }; +class postfix_expression: public expression{ + +}; + +class indexing_expression: public postfix_expression{ +public: + indexing_expression(node *id, node *ranges) + : id_((const identifier*)id), ranges_((const list*)ranges) {} + + llvm::Value* codegen(module *) const; + +private: + const identifier* id_; + const list* ranges_; +}; + class unary_expression: public node{ public: unary_expression(node *id): id_((const identifier*)id) {} diff --git a/include/parser.y b/include/parser.y index 6d1e81936..74b5f8bbd 100644 --- a/include/parser.y +++ b/include/parser.y @@ -49,7 +49,7 @@ TYPE_T get_type_spec(node *op) { return ((token*)op)->type; } %token XOR_ASSIGN OR_ASSIGN TYPE_NAME %token VOID UINT8 
UINT16 UINT32 UINT64 INT8 INT16 INT32 INT64 FP32 FP64 %token IF ELSE FOR -%token DEF +%token NEWAXIS %start translation_unit %% @@ -115,8 +115,21 @@ primary_expression | '(' expression ')' { $$ = $1; } ; +range + : ':' { $$ = new range(tdl::ast::ALL); } + | NEWAXIS { $$ = new range(tdl::ast::NEWAXIS); } + +range_list + : range { $$ = new list((range*)$1); } + | range_list ',' range { $$ = append_ptr_list($1, $2); } + +postfix_expression + : primary_expression { $$ = $1;} + | identifier '[' range_list ']' { $$ = new indexing_expression($1, $2);} + ; + unary_expression - : primary_expression { $$ = $1; } + : postfix_expression { $$ = $1; } | INC_OP unary_expression { $$ = new unary_operator(INC, $2); } | DEC_OP unary_expression { $$ = new unary_operator(DEC, $2); } | unary_operator cast_expression { $$ = new unary_operator(get_unary_op($1), $2); } diff --git a/include/scanner.l b/include/scanner.l index 5ecf37b1b..7d5d5984a 100644 --- a/include/scanner.l +++ b/include/scanner.l @@ -16,7 +16,7 @@ int comment(); %} %% -"def" { count(); return(DEF); } +"newaxis" { count(); return(NEWAXIS); } "if" { count(); return(IF); } "else" { count(); return(ELSE); } "for" { count(); return(FOR); } diff --git a/lib/codegen.cpp b/lib/codegen.cpp index 3c20c1487..97b9407b8 100644 --- a/lib/codegen.cpp +++ b/lib/codegen.cpp @@ -482,6 +482,11 @@ Value* binary_operator::codegen(module *mod) const{ return result; } +/* Postfix expression */ +Value* indexing_expression::codegen(module *mod) const{ + return nullptr; +} + /* Unary operator */ Value *unary_operator::llvm_op(llvm::IRBuilder<> &builder, Value *arg, const std::string &name) const{ Type *atype = arg->getType(); From d260aefbd1f0bef2e8b95c89989f4f645a423a19 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 31 Dec 2018 13:16:25 -0500 Subject: [PATCH 021/494] [Codegen] More debugging --- examples/matrix.cpp | 11 ++--- include/ast.h | 3 +- include/parser.y | 2 +- lib/codegen.cpp | 109 +++++++++++++++++++++++++++++++++++++++----- 4 files changed, 104 insertions(+), 21 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 8fa19a840..aa3b39b9b 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -17,14 +17,11 @@ const char src[] = "\ void test(fp32 *A, fp32 *B, fp32 *C, int32 i){\ int32 j = 1;\ + int32 test[16, 16] = 0;\ + int32 test2[16, 16];\ + int32 test3[16, 16];\ int32 k;\ - i = i + j;\ - for(k = 0; k < 10; k = k+1){\ - int32 u = 1;\ - u = u + i;\ - if(k == 0)\ - u = u + 2;\ - }\ + test = test2 + test3;\ }\ "; diff --git a/include/ast.h b/include/ast.h index 072644f1e..282e421e3 100644 --- a/include/ast.h +++ b/include/ast.h @@ -145,7 +145,7 @@ public: class binary_operator: public expression{ private: - llvm::Value* llvm_op(llvm::IRBuilder<> &bld, llvm::Value *lhs, llvm::Value *rhs, const std::string &name) const; + llvm::Value* llvm_op(module *mod, llvm::IRBuilder<> &bld, llvm::Value *lhs, llvm::Value *rhs, const std::string &name) const; public: binary_operator(BIN_OP_T op, node *lhs, node *rhs) @@ -163,6 +163,7 @@ class constant: public expression{ public: constant(int value): value_(value) { } llvm::Value* codegen(module *mod) const; + int value() const; private: const int value_; diff --git a/include/parser.y b/include/parser.y index 74b5f8bbd..f43b45265 100644 --- a/include/parser.y +++ b/include/parser.y @@ -92,7 +92,7 @@ constant : constant_list : constant { $$ = new list((constant*)$1); } - | constant_list ',' constant { $$ = append_ptr_list($1, $2); } + | constant_list ',' constant { $$ = append_ptr_list($1, 
$3); } ; type_name diff --git a/lib/codegen.cpp b/lib/codegen.cpp index 97b9407b8..f7af12a60 100644 --- a/lib/codegen.cpp +++ b/lib/codegen.cpp @@ -12,6 +12,18 @@ using namespace llvm; namespace tdl{ +/* Nd Array utils */ +inline std::vector array_shapes(Type *array_ty){ + std::vector result; + Type *current = array_ty; + while(isa(current)){ + result.push_back(array_ty->getArrayNumElements()); + current = array_ty->getArrayElementType(); + printf("%d %d\n", current, current->getTypeID()); + }; + return result; +} + /* Context */ context::context() { } @@ -149,13 +161,14 @@ const std::string &identifier::name() const{ // Tile Type* tile::type_impl(module*, Type *type) const{ - return TileType::get(type, shapes_->values().size()); + Type *current = type; + unsigned i = 0; + do{ + current = ArrayType::get(current, shapes_->values()[i++]->value()); + }while(i < shapes_->values().size()); + return current; } -// Initializer -Type* initializer::type_impl(module *mod, Type *type) const{ - return decl_->type(mod, type); -} // Pointer Type* pointer::type_impl(module*, Type *type) const{ @@ -265,6 +278,10 @@ Value* declaration::codegen(module* mod) const{ } /* Initializer */ +Type* initializer::type_impl(module *mod, Type *type) const{ + return decl_->type(mod, type); +} + void initializer::specifier(const declaration_specifier *spec) { spec_ = spec; } @@ -359,16 +376,66 @@ inline void implicit_cast(llvm::IRBuilder<> &builder, Value *&lhs, Value *&rhs, } } -//inline void implicit_broadcast(llvm::IRBuilder<> &builder, Value *&lhs, Value *&rhs){ -// return; -//} +inline void implicit_broadcast(module *mod, llvm::IRBuilder<> &builder, Value *&lhs, Value *&rhs){ + std::vector lhs_shapes = array_shapes(lhs->getType()); + std::vector rhs_shapes = array_shapes(rhs->getType()); + // Both are scalar + if(lhs_shapes.empty() && rhs_shapes.empty()) + return; + // One argument is scalar + if(!lhs_shapes.empty() ^ !rhs_shapes.empty()){ + auto &ref_shapes = lhs_shapes.empty()?rhs_shapes:lhs_shapes; + auto &ref = lhs_shapes.empty()?rhs:lhs; + auto &target = lhs_shapes.empty()?lhs:rhs; + Function *splat_fn = Intrinsic::getDeclaration(mod->handle(), Intrinsic::tlvm_splat_2d, {ref->getType()}); + SmallVector args(1 + ref_shapes.size()); + for(unsigned i = 0; i < ref_shapes.size(); i++) + args[1 + i] = builder.getInt32(ref_shapes[i]); + args[0] = target; + target = builder.CreateCall(splat_fn, args); + return; + } + // Both are arrays + int lhs_dim = lhs_shapes.size(); + int rhs_dim = rhs_shapes.size(); + std::vector &shortest = (lhs_dim < rhs_dim)?lhs_shapes:rhs_shapes; + std::vector &longest = (lhs_dim < rhs_dim)?rhs_shapes:lhs_shapes; + size_t ndim = longest.size(); + int off = longest.size() - shortest.size(); + for(int i = longest.size(); i>= 0; i--){ + if(shortest[off + i] != longest[i]) + throw std::runtime_error("cannot broadcast"); + } + // Pad + for(size_t i = 0; i < off; i++){ + shortest.insert(shortest.begin(), 1); + } + Value *&target = (lhs_dim < rhs_dim)?lhs:rhs; + SmallVector args(1 + ndim); + // Reshape left hand side + for(size_t i = 0; i < ndim; i++) + args[1 + i] = builder.getInt32(shortest[i]); + args[0] = target; + Function *reshape_fn = Intrinsic::getDeclaration(mod->handle(), Intrinsic::tlvm_reshape_2d_1d, {rhs->getType(), lhs->getType()}); + target = builder.CreateCall(reshape_fn, args); + // Broadcast both arguments + for(size_t i = 0; i < ndim; i++) + args[1 + i] = builder.getInt32(std::max(shortest[i], longest[i])); + Function *broadcast_fn = Intrinsic::getDeclaration(mod->handle(), 
Intrinsic::tlvm_broadcast_2d, {target->getType(), target->getType()}); + // Broadcast lhs + args[0] = lhs; + lhs = builder.CreateCall(broadcast_fn, args); + // Broadcast rhs + args[0] = rhs; + rhs = builder.CreateCall(broadcast_fn, args); +} /* Binary operator */ -Value *binary_operator::llvm_op(llvm::IRBuilder<> &builder, Value *lhs, Value *rhs, const std::string &name) const +Value *binary_operator::llvm_op(module *mod, llvm::IRBuilder<> &builder, Value *lhs, Value *rhs, const std::string &name) const { bool is_float = false, is_ptr = false, is_int = false, is_signed = false; - implicit_cast(builder, lhs, rhs, is_float, is_ptr, is_int, is_signed); -// implicit_broadcast(builder, lhs, rhs); +// implicit_cast(builder, lhs, rhs, is_float, is_ptr, is_int, is_signed); +// implicit_broadcast(mod, builder, lhs, rhs); // Mul if(op_==MUL && is_float) return builder.CreateFMul(lhs, rhs, name); @@ -478,12 +545,25 @@ Value *binary_operator::llvm_op(llvm::IRBuilder<> &builder, Value *lhs, Value *r Value* binary_operator::codegen(module *mod) const{ Value *lhs = lhs_->codegen(mod); Value *rhs = rhs_->codegen(mod); - Value *result = llvm_op(mod->builder(), lhs, rhs, ""); + Value *result = llvm_op(mod, mod->builder(), lhs, rhs, ""); return result; } /* Postfix expression */ Value* indexing_expression::codegen(module *mod) const{ + Value *in = mod->get_value(id_->name()); + std::vector ranges; + for(range *x: ranges_->values()) + ranges.push_back(x->type()); + // Type information + Function* reshape; + Type *in_type = in->getType(); + size_t in_dim = in_type->getTileNumDimensions(); + size_t out_dim = ranges.size(); + Type *out_type = TileType::get(in_type->getTileElementType(), out_dim); + // Intrinsic function + Function *reshape_fn = Intrinsic::getDeclaration(mod->handle(), Intrinsic::tlvm_reshape_2d_1d, {out_type, in_type}); + return nullptr; } @@ -569,6 +649,11 @@ llvm::Value* constant::codegen(module *mod) const{ return mod->builder().getInt32(value_); } +int constant::value() const{ + return value_; +} + + /* Unary expression */ const identifier* unary_expression::id() const{ return id_; From e7a4e70e225676cfa2dd6867fa18d2263c12d6db Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 31 Dec 2018 22:47:31 -0500 Subject: [PATCH 022/494] [Intermediate Representation] Added skeleton --- examples/matrix.cpp | 18 +- include/ast.h | 86 +++--- include/codegen.h | 47 --- include/ir/basic_block.h | 29 ++ include/ir/builder.h | 89 ++++++ include/ir/constant.h | 32 ++ include/ir/context.h | 14 + include/ir/function.h | 40 +++ include/ir/instructions.h | 29 ++ include/ir/module.h | 47 +++ include/ir/type.h | 58 ++++ include/ir/value.h | 24 ++ lib/codegen.cpp | 603 ++++++++++++++------------------------ lib/ir/basic_block.cpp | 0 lib/ir/builder.cpp | 0 lib/ir/constant.cpp | 0 lib/ir/context.cpp | 10 + lib/ir/function.cpp | 0 lib/ir/instructions.cpp | 0 lib/ir/ir.cpp | 0 lib/ir/module.cpp | 87 ++++++ lib/ir/type.cpp | 0 22 files changed, 729 insertions(+), 484 deletions(-) delete mode 100644 include/codegen.h create mode 100644 include/ir/basic_block.h create mode 100644 include/ir/builder.h create mode 100644 include/ir/constant.h create mode 100644 include/ir/context.h create mode 100644 include/ir/function.h create mode 100644 include/ir/instructions.h create mode 100644 include/ir/module.h create mode 100644 include/ir/type.h create mode 100644 include/ir/value.h create mode 100644 lib/ir/basic_block.cpp create mode 100644 lib/ir/builder.cpp create mode 100644 lib/ir/constant.cpp create mode 100644 
lib/ir/context.cpp create mode 100644 lib/ir/function.cpp create mode 100644 lib/ir/instructions.cpp create mode 100644 lib/ir/ir.cpp create mode 100644 lib/ir/module.cpp create mode 100644 lib/ir/type.cpp diff --git a/examples/matrix.cpp b/examples/matrix.cpp index aa3b39b9b..2f8194675 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -1,10 +1,6 @@ #include #include #include "ast.h" -#include "codegen.h" -#include "llvm/IR/IRPrintingPasses.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/PassManager.h" typedef struct yy_buffer_state * YY_BUFFER_STATE; extern int yyparse(); @@ -29,12 +25,12 @@ int main() { YY_BUFFER_STATE buffer = yy_scan_string(src); yyparse(); yy_delete_buffer(buffer); - translation_unit *program = ast_root; - tdl::context context; - tdl::module module("matrix", &context); - program->codegen(&module); - llvm::PrintModulePass print(llvm::outs()); - llvm::AnalysisManager analysis; - print.run(*module.handle(), analysis); +// translation_unit *program = ast_root; +// tdl::context context; +// tdl::module module("matrix", &context); +// program->codegen(&module); +// llvm::PrintModulePass print(llvm::outs()); +// llvm::AnalysisManager analysis; +// print.run(*module.handle(), analysis); return 0; } diff --git a/include/ast.h b/include/ast.h index 282e421e3..031d49f74 100644 --- a/include/ast.h +++ b/include/ast.h @@ -2,22 +2,22 @@ #define TDL_INCLUDE_AST_H #include "parser.hpp" -#include "llvm/IR/IRBuilder.h" #include #include #include -namespace llvm{ -class Function; -class Value; -class Type; - -} namespace tdl{ -class module; + +namespace ir{ + class function; + class value; + class type; + class builder; + class module; +} namespace ast{ @@ -62,7 +62,7 @@ class identifier; // AST class node { public: - virtual llvm::Value* codegen(module*) const { return nullptr; } + virtual ir::value* codegen(ir::module *) const { return nullptr; } }; template @@ -75,7 +75,7 @@ public: return this; } - llvm::Value* codegen(module* mod) const{ + ir::value* codegen(ir::module * mod) const{ for(T x: values_){ x->codegen(mod); } @@ -109,7 +109,7 @@ public: class expression: public node{ public: - virtual llvm::Value* codegen(module *) const = 0; + virtual ir::value* codegen(ir::module *) const = 0; }; class postfix_expression: public expression{ @@ -121,7 +121,7 @@ public: indexing_expression(node *id, node *ranges) : id_((const identifier*)id), ranges_((const list*)ranges) {} - llvm::Value* codegen(module *) const; + ir::value* codegen(ir::module *) const; private: const identifier* id_; @@ -140,17 +140,17 @@ private: class named_expression: public unary_expression { public: named_expression(node *id): unary_expression(id){ } - llvm::Value* codegen(module* mod) const; + ir::value* codegen(ir::module * mod) const; }; class binary_operator: public expression{ private: - llvm::Value* llvm_op(module *mod, llvm::IRBuilder<> &bld, llvm::Value *lhs, llvm::Value *rhs, const std::string &name) const; + ir::value* llvm_op(ir::module *mod, ir::builder &bld, ir::value *lhs, ir::value *rhs, const std::string &name) const; public: binary_operator(BIN_OP_T op, node *lhs, node *rhs) : op_(op), lhs_((expression*)lhs), rhs_((expression*)rhs) { } - llvm::Value* codegen(module *) const; + ir::value* codegen(ir::module *) const; private: const BIN_OP_T op_; @@ -162,7 +162,7 @@ private: class constant: public expression{ public: constant(int value): value_(value) { } - llvm::Value* codegen(module *mod) const; + ir::value* codegen(ir::module *mod) const; int value() const; private: @@ 
-173,7 +173,7 @@ private: class string_literal: public expression{ public: string_literal(char *&value): value_(value) { } - llvm::Value* codegen(module *mod) const; + ir::value* codegen(ir::module *mod) const; public: std::string value_; @@ -181,14 +181,14 @@ public: class unary_operator: public expression{ private: - llvm::Value *llvm_op(llvm::IRBuilder<> &builder, llvm::Value *arg, const std::string &name) const; + ir::value *llvm_op(ir::builder &builder, ir::value *arg, const std::string &name) const; public: unary_operator(UNARY_OP_T op, node *arg) : op_(op), arg_((expression*)arg) { } - llvm::Value* codegen(module *mod) const; + ir::value* codegen(ir::module *mod) const; private: const UNARY_OP_T op_; @@ -198,14 +198,14 @@ private: class type_name; class cast_operator: public expression{ private: - llvm::Value *llvm_op(llvm::IRBuilder<> &builder, llvm::Type *T, llvm::Value *arg, const std::string &name) const; + ir::value *llvm_op(ir::builder &builder, ir::type *T, ir::value *arg, const std::string &name) const; public: cast_operator(node *T, node *arg): T_((type_name*)T), arg_((expression*)arg) { } - llvm::Value* codegen(module *mod) const; + ir::value* codegen(ir::module *mod) const; public: const type_name *T_; @@ -214,8 +214,8 @@ public: class conditional_expression: public expression{ private: - llvm::Value *llvm_op(llvm::IRBuilder<> &builder, - llvm::Value *cond, llvm::Value *true_value, llvm::Value *false_value, + ir::value *llvm_op(ir::builder &builder, + ir::value *cond, ir::value *true_value, ir::value *false_value, const std::string &name) const; public: @@ -224,7 +224,7 @@ public: true_value_((expression*)true_value), false_value_((expression*)false_value) { } - llvm::Value* codegen(module *mod) const; + ir::value* codegen(ir::module *mod) const; public: const expression *cond_; @@ -237,7 +237,7 @@ public: assignment_expression(node *lvalue, ASSIGN_OP_T op, node *rvalue) : lvalue_((unary_expression*)lvalue), op_(op), rvalue_((expression*)rvalue) { } - llvm::Value* codegen(module *mod) const; + ir::value* codegen(ir::module *mod) const; public: ASSIGN_OP_T op_; @@ -254,7 +254,7 @@ public: declaration(node *spec, node *init) : spec_((declaration_specifier*)spec), init_((list*)init) { } - llvm::Value* codegen(module* mod) const; + ir::value* codegen(ir::module * mod) const; public: const declaration_specifier *spec_; @@ -272,7 +272,7 @@ public: compound_statement(node* decls, node* statements) : decls_((declarations_t)decls), statements_((statements_t)statements) {} - llvm::Value* codegen(module* mod) const; + ir::value* codegen(ir::module * mod) const; private: declarations_t decls_; @@ -284,7 +284,7 @@ public: selection_statement(node *cond, node *if_value, node *else_value = nullptr) : cond_(cond), then_value_(if_value), else_value_(else_value) { } - llvm::Value* codegen(module *mod) const; + ir::value* codegen(ir::module *mod) const; public: const node *cond_; @@ -298,7 +298,7 @@ public: : init_(init), stop_(stop), exec_(exec), statements_(statements) { } - llvm::Value* codegen(module *mod) const; + ir::value* codegen(ir::module *mod) const; private: const node *init_; @@ -316,7 +316,7 @@ public: declaration_specifier(TYPE_T spec) : spec_(spec) { } - llvm::Type* type(module *mod) const; + ir::type* type(ir::module *mod) const; private: const TYPE_T spec_; @@ -329,7 +329,7 @@ public: : spec_((declaration_specifier*)spec), decl_((declarator*)decl) { } - llvm::Type* type(module *mod) const; + ir::type* type(ir::module *mod) const; const identifier* id() const; public: @@ 
-339,13 +339,13 @@ public: /* Declarators */ class declarator: public node{ - virtual llvm::Type* type_impl(module*mod, llvm::Type *type) const = 0; + virtual ir::type* type_impl(ir::module *mod, ir::type *type) const = 0; public: declarator(node *lhs) : lhs_((declarator*)lhs), ptr_(nullptr){ } - llvm::Type* type(module*mod, llvm::Type *type) const; + ir::type* type(ir::module *mod, ir::type *type) const; const identifier* id() const { return (const identifier*)lhs_; @@ -362,7 +362,7 @@ protected: }; class identifier: public declarator { - llvm::Type* type_impl(module*mod, llvm::Type *type) const; + ir::type* type_impl(ir::module *mod, ir::type *type) const; public: identifier(char *&name): declarator(this), name_(name) { } @@ -374,7 +374,7 @@ private: class pointer: public declarator{ private: - llvm::Type* type_impl(module *mod, llvm::Type *type) const; + ir::type* type_impl(ir::module *mod, ir::type *type) const; public: pointer(node *id): declarator(id) { } @@ -382,7 +382,7 @@ public: class tile: public declarator{ private: - llvm::Type* type_impl(module *mod, llvm::Type *type) const; + ir::type* type_impl(ir::module *mod, ir::type *type) const; public: tile(node *id, node *shapes) @@ -394,13 +394,13 @@ public: class function: public declarator{ private: - llvm::Type* type_impl(module *mod, llvm::Type *type) const; + ir::type* type_impl(ir::module *mod, ir::type *type) const; public: function(node *id, node *args) : declarator(id), args_((list*)args) { } - void bind_parameters(module *mod, llvm::Function *fn) const; + void bind_parameters(ir::module *mod, ir::function *fn) const; public: const list* args_; @@ -409,7 +409,7 @@ public: class initializer : public declarator{ private: - llvm::Type* type_impl(module* mod, llvm::Type *type) const; + ir::type* type_impl(ir::module * mod, ir::type *type) const; public: initializer(node *decl, node *init) @@ -417,7 +417,7 @@ public: decl_((declarator*)decl), expr_((expression*)init){ } void specifier(const declaration_specifier *spec); - llvm::Value* codegen(module *) const; + ir::value* codegen(ir::module *) const; public: const declaration_specifier *spec_; @@ -431,7 +431,7 @@ public: type_name(node *spec, node * decl) : spec_((declaration_specifier*)spec), decl_((declarator*)decl) { } - llvm::Type *type(module *mod) const; + ir::type *type(ir::module *mod) const; public: const declaration_specifier *spec_; @@ -444,7 +444,7 @@ public: function_definition(node *spec, node *header, node *body) : spec_((declaration_specifier*)spec), header_((function *)header), body_((compound_statement*)body) { } - llvm::Value* codegen(module* mod) const; + ir::value* codegen(ir::module * mod) const; public: const declaration_specifier *spec_; @@ -463,7 +463,7 @@ public: return this; } - llvm::Value* codegen(module* mod) const; + ir::value* codegen(ir::module * mod) const; private: list* decls_; diff --git a/include/codegen.h b/include/codegen.h deleted file mode 100644 index 316e3b859..000000000 --- a/include/codegen.h +++ /dev/null @@ -1,47 +0,0 @@ -#include -#include -#include "ast.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/IRBuilder.h" - -namespace tdl -{ - -class context { -public: - context(); - llvm::LLVMContext* handle(); - -private: - llvm::LLVMContext handle_; -}; - -class module { - typedef std::pair val_key_t; - llvm::PHINode *make_phi(llvm::Type *type, unsigned num_values, llvm::BasicBlock *block); - llvm::Value *add_phi_operands(const std::string& name, llvm::PHINode *&phi); - llvm::Value *get_value_recursive(const std::string& name, 
llvm::BasicBlock *block); - -public: - module(const std::string &name, context *ctx); - llvm::Module* handle(); - llvm::IRBuilder<>& builder(); - // Setters - void set_value(const std::string& name, llvm::BasicBlock* block, llvm::Value *value); - void set_value(const std::string& name, llvm::Value* value); - // Getters - llvm::Value *get_value(const std::string& name, llvm::BasicBlock* block); - llvm::Value *get_value(const std::string& name); - // Seal block -- no more predecessors will be added - llvm::Value *seal_block(llvm::BasicBlock *block); - -private: - llvm::Module handle_; - llvm::IRBuilder<> builder_; - std::map values_; - std::set sealed_blocks_; - std::map> incomplete_phis_; -}; - - -} diff --git a/include/ir/basic_block.h b/include/ir/basic_block.h new file mode 100644 index 000000000..c059a2634 --- /dev/null +++ b/include/ir/basic_block.h @@ -0,0 +1,29 @@ +#ifndef TDL_INCLUDE_IR_BASIC_BLOCK_H +#define TDL_INCLUDE_IR_BASIC_BLOCK_H + +#include +#include "value.h" + +namespace tdl{ +namespace ir{ + +class context; +class function; + +/* Basic Block */ +class basic_block: public value{ +public: + function* get_parent(); + // Factory functions + static basic_block* create(context &ctx, const std::string &name, function *parent); + +private: + context &ctx_; + std::string name_; + function *parent_; +}; + +} +} + +#endif diff --git a/include/ir/builder.h b/include/ir/builder.h new file mode 100644 index 000000000..4ce5dedf5 --- /dev/null +++ b/include/ir/builder.h @@ -0,0 +1,89 @@ +#ifndef TDL_INCLUDE_IR_BUILDER_H +#define TDL_INCLUDE_IR_BUILDER_H + +#include +#include + +namespace tdl{ +namespace ir{ + +class basic_block; +class value; +class type; +class constant_int; + +/* Builder */ +class builder{ +public: + void set_insert_point(basic_block* bb); + basic_block* get_insert_block(); + // Constants + value *get_int32(unsigned val); + // Types + type *get_float_ty(); + type *get_double_ty(); + // Branch instructions + value* create_br(basic_block *bb); + value* create_cond_br(value *cond, basic_block* if_bb, basic_block* else_bb); + // Cast instructions + value* create_si_to_fp(value *src, type *dst_ty); + value* create_ui_to_fp(value *src, type *dst_ty); + value* create_fp_to_si(value *src, type *dst_ty); + value* create_fp_to_ui(value *src, type *dst_ty); + value* create_fp_ext(value *src, type *dst_ty); + value* create_fp_trunc(value *src, type *dst_ty); + value* create_int_cast(value *src, type *dst_ty, bool is_signed); + // Call instruction + value* create_call(value *fn, const std::vector &args); + // Binary instructions + value *create_fmul(value *lhs, value *rhs, const std::string &name = ""); + value *create_mul(value *lhs, value *rhs, const std::string &name = ""); + value *create_fdiv(value *lhs, value *rhs, const std::string &name = ""); + value *create_sdiv(value *lhs, value *rhs, const std::string &name = ""); + value *create_udiv(value *lhs, value *rhs, const std::string &name = ""); + value *create_frem(value *lhs, value *rhs, const std::string &name = ""); + value *create_srem(value *lhs, value *rhs, const std::string &name = ""); + value *create_urem(value *lhs, value *rhs, const std::string &name = ""); + value *create_fadd(value *lhs, value *rhs, const std::string &name = ""); + value *create_add(value *lhs, value *rhs, const std::string &name = ""); + value *create_gep(value *lhs, const std::vector &offs, const std::string &name = ""); + value *create_fsub(value *lhs, value *rhs, const std::string &name = ""); + value *create_sub(value *lhs, value *rhs, 
const std::string &name = ""); + value *create_lshr(value *lhs, value *rhs, const std::string &name = ""); + value *create_ashr(value *lhs, value *rhs, const std::string &name = ""); + value *create_fcmpOLT(value *lhs, value *rhs, const std::string &name = ""); + value *create_icmpSLT(value *lhs, value *rhs, const std::string &name = ""); + value *create_icmpULT(value *lhs, value *rhs, const std::string &name = ""); + value *create_fcmpOGT(value *lhs, value *rhs, const std::string &name = ""); + value *create_icmpSGT(value *lhs, value *rhs, const std::string &name = ""); + value *create_icmpUGT(value *lhs, value *rhs, const std::string &name = ""); + value *create_fcmpOLE(value *lhs, value *rhs, const std::string &name = ""); + value *create_icmpSLE(value *lhs, value *rhs, const std::string &name = ""); + value *create_icmpULE(value *lhs, value *rhs, const std::string &name = ""); + value *create_fcmpOGE(value *lhs, value *rhs, const std::string &name = ""); + value *create_icmpSGE(value *lhs, value *rhs, const std::string &name = ""); + value *create_icmpUGE(value *lhs, value *rhs, const std::string &name = ""); + value *create_fcmpOEQ(value *lhs, value *rhs, const std::string &name = ""); + value *create_icmpEQ(value *lhs, value *rhs, const std::string &name = ""); + value *create_fcmpONE(value *lhs, value *rhs, const std::string &name = ""); + value *create_icmpNE(value *lhs, value *rhs, const std::string &name = ""); + value *create_and(value *lhs, value *rhs, const std::string &name = ""); + value *create_xor(value *lhs, value *rhs, const std::string &name = ""); + value *create_or(value *lhs, value *rhs, const std::string &name = ""); + // Side effects + value *create_fneg(value *arg, const std::string &name = ""); + value *create_neg(value *arg, const std::string &name = ""); + value *create_load(value *arg, const std::string &name = ""); + value *create_not(value *arg, const std::string &name = ""); + // Tile instruction + value *create_splat(value *arg, const std::vector &shapes, const std::string &name = ""); + value *create_reshape(value *arg, const std::vector &shapes, const std::string &name = ""); + value *create_broadcast(value *arg, const std::vector &shapes, const std::string &name = ""); + // Terminators + value *create_ret_void(); +}; + +} +} + +#endif diff --git a/include/ir/constant.h b/include/ir/constant.h new file mode 100644 index 000000000..a7790e9d3 --- /dev/null +++ b/include/ir/constant.h @@ -0,0 +1,32 @@ +#ifndef TDL_INCLUDE_IR_CONSTANT_H +#define TDL_INCLUDE_IR_CONSTANT_H + +#include "value.h" + +namespace tdl{ +namespace ir{ + +class type; +class context; + +/* Constant */ +class constant: public value{ + +}; + +/* Undef value */ +class undef_value: public constant{ +public: + static undef_value* get(type* ty); +}; + +/* Data array */ +class constant_data_array: public constant{ +public: + static constant_data_array* get_string(context &ctx, const std::string &str); +}; + +} +} + +#endif diff --git a/include/ir/context.h b/include/ir/context.h new file mode 100644 index 000000000..8b80f7491 --- /dev/null +++ b/include/ir/context.h @@ -0,0 +1,14 @@ +#ifndef TDL_INCLUDE_IR_CONTEXT_H +#define TDL_INCLUDE_IR_CONTEXT_H + +namespace tdl{ +namespace ir{ + +/* Context */ +class context { +}; + +} +} + +#endif diff --git a/include/ir/function.h b/include/ir/function.h new file mode 100644 index 000000000..3b8816b48 --- /dev/null +++ b/include/ir/function.h @@ -0,0 +1,40 @@ +#ifndef TDL_INCLUDE_IR_FUNCTION_H +#define TDL_INCLUDE_IR_FUNCTION_H + +#include +#include 
"value.h" + +namespace tdl{ +namespace ir{ + +class function_type; +class module; + +/* Argument */ +class argument: public value{ + +}; + +/* Function */ +class function: public value{ + using arg_iterator = argument *; + using const_arg_iterator = const argument *; + +public: + arg_iterator arg_begin(); + arg_iterator arg_end(); + const_arg_iterator arg_begin() const; + const_arg_iterator arg_end() const; + // Factory methods + static function *create(function_type *type, const std::string &name, module *mod); + +private: + function_type *type_; + std::string name_; + module *mod_; +}; + +} +} + +#endif diff --git a/include/ir/instructions.h b/include/ir/instructions.h new file mode 100644 index 000000000..782033cb8 --- /dev/null +++ b/include/ir/instructions.h @@ -0,0 +1,29 @@ +#ifndef TDL_INCLUDE_IR_INSTRUCTIONS_H +#define TDL_INCLUDE_IR_INSTRUCTIONS_H + +#include "value.h" + +namespace tdl{ +namespace ir{ + +/* Instructions */ +class instruction: public value{ + +}; + +class phi_node: public instruction{ + +}; + +class binary_operator: public instruction{ + +}; + +class unary_operator: public instruction{ + +}; + +} +} + +#endif diff --git a/include/ir/module.h b/include/ir/module.h new file mode 100644 index 000000000..fb3745caf --- /dev/null +++ b/include/ir/module.h @@ -0,0 +1,47 @@ +#ifndef TDL_INCLUDE_IR_MODULE_H +#define TDL_INCLUDE_IR_MODULE_H + +#include +#include +#include +#include "builder.h" + +namespace tdl{ +namespace ir{ + +class basic_block; +class phi_node; +class value; +class context; + +/* Module */ +class module { + typedef std::pair val_key_t; + phi_node *make_phi(type *ty, unsigned num_values, basic_block *block); + void add_phi_operands(const std::string& name, phi_node *&phi); + value *get_value_recursive(const std::string& name, basic_block *block); + +public: + module(const std::string &name, context *ctx); + context& get_context(); + builder& get_builder(); + // Setters + void set_value(const std::string& name, basic_block* block, value *x); + void set_value(const std::string& name, value* x); + // Getters + value *get_value(const std::string& name, basic_block* block); + value *get_value(const std::string& name); + // Seal block -- no more predecessors will be added + void seal_block(basic_block *block); + +private: + builder builder_; + std::map values_; + std::set sealed_blocks_; + std::map> incomplete_phis_; +}; + +} +} + +#endif diff --git a/include/ir/type.h b/include/ir/type.h new file mode 100644 index 000000000..6a50690ed --- /dev/null +++ b/include/ir/type.h @@ -0,0 +1,58 @@ +#ifndef TDL_INCLUDE_IR_TYPE_H +#define TDL_INCLUDE_IR_TYPE_H + +#include + +namespace tdl{ +namespace ir{ + +class context; + +/* Type */ +class type { +public: + bool is_integer_ty() const; + bool is_pointer_ty() const; + bool is_float_ty() const; + bool is_double_ty() const; + bool is_floating_point_ty() const; + + // type attributes + unsigned get_fp_mantissa_width() const; + unsigned get_integer_bit_width() const; + const std::vector &get_tile_shapes() const; + // Factory methods + static type* get_void_ty(context &ctx); + static type* get_float_ty(context &ctx); + static type* get_double_ty(context &ctx); +}; + +class integer_type: public type { +public: + static integer_type* get(context &ctx, unsigned width); +}; + +class tile_type: public type { +public: + static tile_type* get(type *ty, const std::vector &shapes); +}; + +class pointer_type: public type { +public: + static pointer_type* get(type *ty, unsigned address_space); +}; + +class function_type: public type { 
+public: + static function_type* get(type *ret_ty, const std::vector& param_tys); + +private: + type *return_type_; + std::vector param_types_; +}; + + +} +} + +#endif diff --git a/include/ir/value.h b/include/ir/value.h new file mode 100644 index 000000000..79a409cff --- /dev/null +++ b/include/ir/value.h @@ -0,0 +1,24 @@ +#ifndef TDL_INCLUDE_IR_VALUE_H +#define TDL_INCLUDE_IR_VALUE_H + +#include + +namespace tdl{ +namespace ir{ + +class type; + +/* Value */ +class value { +public: + void set_name(const std::string &name); + type* get_type(); + +private: + std::string name_; +}; + +} +} + +#endif diff --git a/lib/codegen.cpp b/lib/codegen.cpp index f7af12a60..920584edc 100644 --- a/lib/codegen.cpp +++ b/lib/codegen.cpp @@ -1,141 +1,42 @@ #include +#include #include "ast.h" -#include "codegen.h" -#include "llvm/IR/DerivedTypes.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/CFG.h" -#include "llvm/IR/Instructions.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "ir/constant.h" +#include "ir/function.h" +#include "ir/module.h" +#include "ir/basic_block.h" +#include "ir/builder.h" +#include "ir/type.h" #include -using namespace llvm; namespace tdl{ -/* Nd Array utils */ -inline std::vector array_shapes(Type *array_ty){ - std::vector result; - Type *current = array_ty; - while(isa(current)){ - result.push_back(array_ty->getArrayNumElements()); - current = array_ty->getArrayElementType(); - printf("%d %d\n", current, current->getTypeID()); - }; - return result; -} - -/* Context */ -context::context() { } - -LLVMContext *context::handle() { - return &handle_; -} - -/* Module */ -module::module(const std::string &name, context *ctx) - : handle_(name.c_str(), *ctx->handle()), builder_(*ctx->handle()) { - sealed_blocks_.insert(nullptr); -} - -llvm::Module* module::handle() { - return &handle_; -} - -llvm::IRBuilder<>& module::builder() { - return builder_; -} - -void module::set_value(const std::string& name, BasicBlock *block, Value *value){ - values_[val_key_t{name, block}] = value; -} - -void module::set_value(const std::string& name, llvm::Value* value){ - return set_value(name, builder_.GetInsertBlock(), value); -} - -PHINode* module::make_phi(Type *type, unsigned num_values, BasicBlock *block){ - Instruction* instr = block->getFirstNonPHIOrDbg(); - if(instr) - builder_.SetInsertPoint(instr); - PHINode *res = builder_.CreatePHI(type, num_values); - if(instr) - builder_.SetInsertPoint(block); - return res; -} - -Value *module::add_phi_operands(const std::string& name, PHINode *&phi){ - BasicBlock *block = phi->getParent(); - for(BasicBlock *pred: predecessors(block)){ - llvm::Value *value = get_value(name, pred); - phi->addIncoming(value, pred); - } - return phi; -} - -llvm::Value *module::get_value_recursive(const std::string& name, BasicBlock *block) { - llvm::Value *result; - if(sealed_blocks_.find(block) == sealed_blocks_.end()){ - llvm::Value *pred = get_value(name, *pred_begin(block)); - incomplete_phis_[block][name] = make_phi(pred->getType(), 1, block); - result = (Value*)incomplete_phis_[block][name]; - } - else if(pred_size(block) <= 1){ - bool has_pred = pred_size(block); - result = get_value(name, has_pred?*pred_begin(block):nullptr); - } - else{ - llvm::Value *pred = get_value(name, *pred_begin(block)); - result = make_phi(pred->getType(), 1, block); - set_value(name, block, result); - add_phi_operands(name, (PHINode*&)result); - } - set_value(name, block, result); - return result; -} - -llvm::Value *module::get_value(const std::string& name, BasicBlock *block) { 
- val_key_t key(name, block); - if(values_.find(key) != values_.end()){ - return values_.at(key); - } - return get_value_recursive(name, block); -} - -llvm::Value *module::get_value(const std::string& name) { - return get_value(name, builder_.GetInsertBlock()); -} - -llvm::Value *module::seal_block(BasicBlock *block){ - for(auto &x: incomplete_phis_[block]) - add_phi_operands(x.first, x.second); - sealed_blocks_.insert(block); -} - namespace ast{ /* Translation unit */ -Value* translation_unit::codegen(module *mod) const{ +ir::value* translation_unit::codegen(ir::module *mod) const{ decls_->codegen(mod); return nullptr; } /* Declaration specifier */ -Type* declaration_specifier::type(module *mod) const { - LLVMContext &ctx = mod->handle()->getContext(); +ir::type* declaration_specifier::type(ir::module *mod) const { + ir::context &ctx = mod->get_context(); switch (spec_) { - case VOID_T: return Type::getVoidTy(ctx); - case INT8_T: return IntegerType::get(ctx, 8); - case INT16_T: return IntegerType::get(ctx, 16); - case INT32_T: return IntegerType::get(ctx, 32); - case INT64_T: return IntegerType::get(ctx, 64); - case FLOAT32_T: return Type::getFloatTy(ctx); - case FLOAT64_T: return Type::getDoubleTy(ctx); - default: assert(false && "unreachable"); throw; + case VOID_T: return ir::type::get_void_ty(ctx); + case INT8_T: return ir::integer_type::get(ctx, 8); + case INT16_T: return ir::integer_type::get(ctx, 16); + case INT32_T: return ir::integer_type::get(ctx, 32); + case INT64_T: return ir::integer_type::get(ctx, 64); + case FLOAT32_T: return ir::type::get_float_ty(ctx); + case FLOAT64_T: return ir::type::get_double_ty(ctx); + default: throw std::runtime_error("unreachable"); } } /* Parameter */ -Type* parameter::type(module *mod) const { +ir::type* parameter::type(ir::module *mod) const { return decl_->type(mod, spec_->type(mod)); } @@ -144,14 +45,14 @@ const identifier *parameter::id() const { } /* Declarators */ -Type* declarator::type(module *mod, Type *type) const{ +ir::type* declarator::type(ir::module *mod, ir::type *type) const{ if(ptr_) return type_impl(mod, ptr_->type(mod, type)); return type_impl(mod, type); } // Identifier -Type* identifier::type_impl(module *, Type *type) const{ +ir::type* identifier::type_impl(ir::module *, ir::type *type) const{ return type; } @@ -160,60 +61,57 @@ const std::string &identifier::name() const{ } // Tile -Type* tile::type_impl(module*, Type *type) const{ - Type *current = type; - unsigned i = 0; - do{ - current = ArrayType::get(current, shapes_->values()[i++]->value()); - }while(i < shapes_->values().size()); - return current; +ir::type* tile::type_impl(ir::module*, ir::type *type) const{ + std::vector shapes; + for(constant *cst: shapes_->values()) + shapes.push_back(cst->value()); + return ir::tile_type::get(type, shapes); } // Pointer -Type* pointer::type_impl(module*, Type *type) const{ - return PointerType::get(type, 1); +ir::type* pointer::type_impl(ir::module*, ir::type *type) const{ + return ir::pointer_type::get(type, 1); } // Function -void function::bind_parameters(module *mod, Function *fn) const{ - std::vector args; - std::transform(fn->arg_begin(), fn->arg_end(), std::back_inserter(args), [&](llvm::Argument& x){ return &x;}); +void function::bind_parameters(ir::module *mod, ir::function *fn) const{ + std::vector args; + std::transform(fn->arg_begin(), fn->arg_end(), std::back_inserter(args), [&](ir::argument& x){ return &x;}); assert(args.size() == args_->values().size()); for(size_t i = 0; i < args.size(); i++){ parameter *param_i 
= args_->values().at(i); const identifier *id_i = param_i->id(); if(id_i){ - args[i]->setName(id_i->name()); + args[i]->set_name(id_i->name()); mod->set_value(id_i->name(), nullptr, args[i]); } } } -Type* function::type_impl(module*mod, Type *type) const{ - SmallVector types; - for(parameter* param: args_->values()){ +ir::type* function::type_impl(ir::module* mod, ir::type *type) const{ + std::vector types; + for(parameter* param: args_->values()) types.push_back(param->type(mod)); - } - return FunctionType::get(type, types, false); + return ir::function_type::get(type, types); } /* Function definition */ -Value* function_definition::codegen(module *mod) const{ - FunctionType *prototype = (FunctionType *)header_->type(mod, spec_->type(mod)); +ir::value* function_definition::codegen(ir::module *mod) const{ + ir::function_type *prototype = (ir::function_type*)header_->type(mod, spec_->type(mod)); const std::string &name = header_->id()->name(); - Function *fn = Function::Create(prototype, Function::ExternalLinkage, name, mod->handle()); + ir::function *fn = ir::function::create(prototype, name, mod); header_->bind_parameters(mod, fn); - BasicBlock *entry = BasicBlock::Create(mod->handle()->getContext(), "entry", fn); + ir::basic_block *entry = ir::basic_block::create(mod->get_context(), "entry", fn); mod->seal_block(entry); - mod->builder().SetInsertPoint(entry); + mod->get_builder().set_insert_point(entry); body_->codegen(mod); - mod->builder().CreateRetVoid(); + mod->get_builder().create_ret_void(); return nullptr; } /* Statements */ -Value* compound_statement::codegen(module* mod) const{ +ir::value* compound_statement::codegen(ir::module* mod) const{ decls_->codegen(mod); if(statements_) statements_->codegen(mod); @@ -221,56 +119,56 @@ Value* compound_statement::codegen(module* mod) const{ } /* Iteration statement */ -Value* iteration_statement::codegen(module *mod) const{ - IRBuilder<> &builder = mod->builder(); - LLVMContext &ctx = mod->handle()->getContext(); - Function *fn = builder.GetInsertBlock()->getParent(); - BasicBlock *loop_bb = BasicBlock::Create(ctx, "loop", fn); - BasicBlock *next_bb = BasicBlock::Create(ctx, "postloop", fn); +ir::value* iteration_statement::codegen(ir::module *mod) const{ + ir::builder &builder = mod->get_builder(); + ir::context &ctx = mod->get_context(); + ir::function *fn = builder.get_insert_block()->get_parent(); + ir::basic_block *loop_bb = ir::basic_block::create(ctx, "loop", fn); + ir::basic_block *next_bb = ir::basic_block::create(ctx, "postloop", fn); init_->codegen(mod); - builder.CreateBr(loop_bb); - builder.SetInsertPoint(loop_bb); + builder.create_br(loop_bb); + builder.set_insert_point(loop_bb); statements_->codegen(mod); exec_->codegen(mod); - Value *cond = stop_->codegen(mod); - builder.CreateCondBr(cond, loop_bb, next_bb); - builder.SetInsertPoint(next_bb); + ir::value *cond = stop_->codegen(mod); + builder.create_cond_br(cond, loop_bb, next_bb); + builder.set_insert_point(next_bb); mod->seal_block(loop_bb); mod->seal_block(next_bb); return nullptr; } /* Selection statement */ -Value* selection_statement::codegen(module* mod) const{ - IRBuilder<> &builder = mod->builder(); - LLVMContext &ctx = mod->handle()->getContext(); - Function *fn = builder.GetInsertBlock()->getParent(); - Value *cond = cond_->codegen(mod); - BasicBlock *then_bb = BasicBlock::Create(ctx, "then", fn); - BasicBlock *else_bb = else_value_?BasicBlock::Create(ctx, "else", fn):nullptr; - BasicBlock *endif_bb = BasicBlock::Create(ctx, "endif", fn); +ir::value* 
selection_statement::codegen(ir::module* mod) const{ + ir::builder &builder = mod->get_builder(); + ir::context &ctx = mod->get_context(); + ir::function *fn = builder.get_insert_block()->get_parent(); + ir::value *cond = cond_->codegen(mod); + ir::basic_block *then_bb = ir::basic_block::create(ctx, "then", fn); + ir::basic_block *else_bb = else_value_?ir::basic_block::create(ctx, "else", fn):nullptr; + ir::basic_block *endif_bb = ir::basic_block::create(ctx, "endif", fn); // Branch if(else_value_) - builder.CreateCondBr(cond, then_bb, else_bb); + builder.create_cond_br(cond, then_bb, else_bb); else - builder.CreateCondBr(cond, then_bb, endif_bb); + builder.create_cond_br(cond, then_bb, endif_bb); // Then - builder.SetInsertPoint(then_bb); + builder.set_insert_point(then_bb); then_value_->codegen(mod); if(else_value_) - builder.CreateBr(endif_bb); + builder.create_br(endif_bb); // Else if(else_value_){ - builder.SetInsertPoint(else_bb); + builder.set_insert_point(else_bb); else_value_->codegen(mod); - builder.CreateBr(endif_bb); + builder.create_br(endif_bb); } // Endif - builder.SetInsertPoint(endif_bb); + builder.set_insert_point(endif_bb); } /* Declaration */ -Value* declaration::codegen(module* mod) const{ +ir::value* declaration::codegen(ir::module* mod) const{ for(initializer *init: init_->values()) init->specifier(spec_); init_->codegen(mod); @@ -278,7 +176,7 @@ Value* declaration::codegen(module* mod) const{ } /* Initializer */ -Type* initializer::type_impl(module *mod, Type *type) const{ +ir::type* initializer::type_impl(ir::module *mod, ir::type *type) const{ return decl_->type(mod, type); } @@ -286,15 +184,15 @@ void initializer::specifier(const declaration_specifier *spec) { spec_ = spec; } -Value* initializer::codegen(module * mod) const{ - Type *ty = decl_->type(mod, spec_->type(mod)); +ir::value* initializer::codegen(ir::module * mod) const{ + ir::type *ty = decl_->type(mod, spec_->type(mod)); std::string name = decl_->id()->name(); - Value *value; + ir::value *value; if(expr_) value = expr_->codegen(mod); else - value = llvm::UndefValue::get(ty); - value->setName(name); + value = ir::undef_value::get(ty); + value->set_name(name); mod->set_value(name, value); return value; } @@ -302,97 +200,87 @@ Value* initializer::codegen(module * mod) const{ /*------------------*/ /* Expression */ /*------------------*/ -llvm::Value *llvm_cast(llvm::IRBuilder<> &builder, Value *src, Type *dst_ty){ - Type *src_ty = src->getType(); +ir::value *llvm_cast(ir::builder &builder, ir::value *src, ir::type *dst_ty){ + ir::type *src_ty = src->get_type(); bool src_signed = false; bool dst_signed = false; if(src_ty == dst_ty) return src; - else if(src_ty->isIntegerTy() && src_signed && dst_ty->isFloatingPointTy()) - return builder.CreateSIToFP(src, dst_ty); + else if(src_ty->is_integer_ty() && src_signed && dst_ty->is_floating_point_ty()) + return builder.create_si_to_fp(src, dst_ty); - else if(src_ty->isIntegerTy() && !src_signed && dst_ty->isFloatingPointTy()) - return builder.CreateUIToFP(src, dst_ty); + else if(src_ty->is_integer_ty() && !src_signed && dst_ty->is_floating_point_ty()) + return builder.create_ui_to_fp(src, dst_ty); - else if(src_ty->isFloatingPointTy() && dst_ty->isIntegerTy() && dst_signed) - return builder.CreateFPToSI(src, dst_ty); + else if(src_ty->is_floating_point_ty() && dst_ty->is_integer_ty() && dst_signed) + return builder.create_fp_to_si(src, dst_ty); - else if(src_ty->isFloatingPointTy() && dst_ty->isIntegerTy() && !dst_signed) - return builder.CreateFPToUI(src, 
dst_ty); + else if(src_ty->is_floating_point_ty() && dst_ty->is_integer_ty() && !dst_signed) + return builder.create_fp_to_ui(src, dst_ty); - else if(src_ty->isFloatingPointTy() && dst_ty->isFloatingPointTy() && - src_ty->getFPMantissaWidth() < dst_ty->getFPMantissaWidth()) - return builder.CreateFPExt(src, dst_ty); + else if(src_ty->is_floating_point_ty() && dst_ty->is_floating_point_ty() && + src_ty->get_fp_mantissa_width() < dst_ty->get_fp_mantissa_width()) + return builder.create_fp_ext(src, dst_ty); - else if(src_ty->isFloatingPointTy() && dst_ty->isFloatingPointTy() && - src_ty->getFPMantissaWidth() > dst_ty->getFPMantissaWidth()) - return builder.CreateFPTrunc(src, dst_ty); + else if(src_ty->is_floating_point_ty() && dst_ty->is_floating_point_ty() && + src_ty->get_fp_mantissa_width() > dst_ty->get_fp_mantissa_width()) + return builder.create_fp_trunc(src, dst_ty); - else if(src_ty->isIntegerTy() && dst_ty->isIntegerTy() && - src_ty->getIntegerBitWidth()) - return builder.CreateIntCast(src, dst_ty, dst_signed); + else if(src_ty->is_integer_ty() && dst_ty->is_integer_ty() && + src_ty->get_integer_bit_width()) + return builder.create_int_cast(src, dst_ty, dst_signed); - else{ - assert(false && "unreachable"); - throw; - } + else + throw std::runtime_error("unreachable"); } -inline void implicit_cast(llvm::IRBuilder<> &builder, Value *&lhs, Value *&rhs, +inline void implicit_cast(ir::builder &builder, ir::value *&lhs, ir::value *&rhs, bool &is_float, bool &is_ptr, bool &is_int, bool &is_signed){ // Input types - Type *left_ty = lhs->getType(); - Type *right_ty = rhs->getType(); + ir::type *left_ty = lhs->get_type(); + ir::type *right_ty = rhs->get_type(); // One operand is pointer - if(left_ty->isPointerTy()){ + if(left_ty->is_pointer_ty()){ is_ptr = true; } // One operand is double - else if(left_ty->isDoubleTy() || right_ty->isDoubleTy()){ - Value *&to_convert = left_ty->isDoubleTy()?rhs:lhs; - to_convert = llvm_cast(builder, to_convert, builder.getDoubleTy()); + else if(left_ty->is_double_ty() || right_ty->is_double_ty()){ + ir::value *&to_convert = left_ty->is_double_ty()?rhs:lhs; + to_convert = llvm_cast(builder, to_convert, builder.get_double_ty()); is_float = true; } // One operand is float - else if(left_ty->isFloatTy() || right_ty->isFloatTy()){ - Value *&to_convert = left_ty->isFloatTy()?rhs:lhs; - to_convert = llvm_cast(builder, to_convert, builder.getFloatTy()); + else if(left_ty->is_float_ty() || right_ty->is_float_ty()){ + ir::value *&to_convert = left_ty->is_float_ty()?rhs:lhs; + to_convert = llvm_cast(builder, to_convert, builder.get_float_ty()); is_float = true; } // Both operands are integers - else if(left_ty->isIntegerTy() && right_ty->isIntegerTy()){ + else if(left_ty->is_integer_ty() && right_ty->is_integer_ty()){ is_int = true; is_signed = false; - if(left_ty->getIntegerBitWidth() != right_ty->getIntegerBitWidth()){ - Value *&to_convert = (left_ty->getIntegerBitWidth() > right_ty->getIntegerBitWidth())?rhs:lhs; - Type *dst_ty = (to_convert==lhs)?right_ty:left_ty; + if(left_ty->get_integer_bit_width() != right_ty->get_integer_bit_width()){ + ir::value *&to_convert = (left_ty->get_integer_bit_width() > right_ty->get_integer_bit_width())?rhs:lhs; + ir::type *dst_ty = (to_convert==lhs)?right_ty:left_ty; to_convert = llvm_cast(builder, to_convert, dst_ty); } } // Not reachable - else{ - assert(false); - throw; - } + else + throw std::runtime_error("unreachable"); } -inline void implicit_broadcast(module *mod, llvm::IRBuilder<> &builder, Value *&lhs, Value *&rhs){ - 
std::vector lhs_shapes = array_shapes(lhs->getType()); - std::vector rhs_shapes = array_shapes(rhs->getType()); +inline void implicit_broadcast(ir::module *mod, ir::builder &builder, ir::value *&lhs, ir::value *&rhs){ + std::vector lhs_shapes = lhs->get_type()->get_tile_shapes(); + std::vector rhs_shapes = rhs->get_type()->get_tile_shapes(); // Both are scalar if(lhs_shapes.empty() && rhs_shapes.empty()) return; // One argument is scalar if(!lhs_shapes.empty() ^ !rhs_shapes.empty()){ - auto &ref_shapes = lhs_shapes.empty()?rhs_shapes:lhs_shapes; - auto &ref = lhs_shapes.empty()?rhs:lhs; + auto &shapes = lhs_shapes.empty()?rhs_shapes:lhs_shapes; auto &target = lhs_shapes.empty()?lhs:rhs; - Function *splat_fn = Intrinsic::getDeclaration(mod->handle(), Intrinsic::tlvm_splat_2d, {ref->getType()}); - SmallVector args(1 + ref_shapes.size()); - for(unsigned i = 0; i < ref_shapes.size(); i++) - args[1 + i] = builder.getInt32(ref_shapes[i]); - args[0] = target; - target = builder.CreateCall(splat_fn, args); + target = builder.create_splat(target, shapes); return; } // Both are arrays @@ -407,246 +295,195 @@ inline void implicit_broadcast(module *mod, llvm::IRBuilder<> &builder, Value *& throw std::runtime_error("cannot broadcast"); } // Pad - for(size_t i = 0; i < off; i++){ + for(size_t i = 0; i < off; i++) shortest.insert(shortest.begin(), 1); - } - Value *&target = (lhs_dim < rhs_dim)?lhs:rhs; - SmallVector args(1 + ndim); - // Reshape left hand side + ir::value *&target = (lhs_dim < rhs_dim)?lhs:rhs; + target = builder.create_reshape(target, shortest); + // Broadcast + std::vector shapes(ndim); for(size_t i = 0; i < ndim; i++) - args[1 + i] = builder.getInt32(shortest[i]); - args[0] = target; - Function *reshape_fn = Intrinsic::getDeclaration(mod->handle(), Intrinsic::tlvm_reshape_2d_1d, {rhs->getType(), lhs->getType()}); - target = builder.CreateCall(reshape_fn, args); - // Broadcast both arguments - for(size_t i = 0; i < ndim; i++) - args[1 + i] = builder.getInt32(std::max(shortest[i], longest[i])); - Function *broadcast_fn = Intrinsic::getDeclaration(mod->handle(), Intrinsic::tlvm_broadcast_2d, {target->getType(), target->getType()}); - // Broadcast lhs - args[0] = lhs; - lhs = builder.CreateCall(broadcast_fn, args); - // Broadcast rhs - args[0] = rhs; - rhs = builder.CreateCall(broadcast_fn, args); + shapes[i] = std::max(shortest[i], longest[i]); + lhs = builder.create_broadcast(lhs, shapes); + rhs = builder.create_broadcast(rhs, shapes); } /* Binary operator */ -Value *binary_operator::llvm_op(module *mod, llvm::IRBuilder<> &builder, Value *lhs, Value *rhs, const std::string &name) const +ir::value *binary_operator::llvm_op(ir::module *mod, ir::builder &builder, ir::value *lhs, ir::value *rhs, const std::string &name) const { bool is_float = false, is_ptr = false, is_int = false, is_signed = false; // implicit_cast(builder, lhs, rhs, is_float, is_ptr, is_int, is_signed); // implicit_broadcast(mod, builder, lhs, rhs); - // Mul if(op_==MUL && is_float) - return builder.CreateFMul(lhs, rhs, name); + return builder.create_fmul(lhs, rhs, name); if(op_==MUL && is_int) - return builder.CreateMul(lhs, rhs, name); - // Div + return builder.create_mul(lhs, rhs, name); if(op_==DIV && is_float) - return builder.CreateFDiv(lhs, rhs, name); + return builder.create_fdiv(lhs, rhs, name); if(op_==DIV && is_int && is_signed) - return builder.CreateSDiv(lhs, rhs, name); + return builder.create_sdiv(lhs, rhs, name); if(op_==DIV && is_int && !is_signed) - return builder.CreateUDiv(lhs, rhs, name); - // Mod + 
return builder.create_udiv(lhs, rhs, name); if(op_==MOD && is_float) - return builder.CreateFRem(lhs, rhs, name); + return builder.create_frem(lhs, rhs, name); if(op_==MOD && is_int && is_signed) - return builder.CreateSRem(lhs, rhs, name); + return builder.create_srem(lhs, rhs, name); if(op_==MOD && is_int && !is_signed) - return builder.CreateURem(lhs, rhs, name); - // Add + return builder.create_urem(lhs, rhs, name); if(op_==ADD && is_float) - return builder.CreateFAdd(lhs, rhs, name); + return builder.create_fadd(lhs, rhs, name); if(op_==ADD && is_int) - return builder.CreateAdd(lhs, rhs); + return builder.create_add(lhs, rhs); if(op_==ADD && is_ptr) - return builder.CreateGEP(lhs, {rhs}); - // Sub + return builder.create_gep(lhs, {rhs}); if(op_==SUB && is_float) - return builder.CreateFSub(lhs, rhs, name); + return builder.create_fsub(lhs, rhs, name); if(op_==SUB && is_int) - return builder.CreateSub(lhs, rhs, name); + return builder.create_sub(lhs, rhs, name); if(op_==SUB && is_ptr) - return builder.CreateGEP(lhs, {builder.CreateNeg(rhs)}); - // Left shift - if(op_==LEFT_SHIFT){ - assert(is_int); - return builder.CreateLShr(lhs, rhs, name); - } - // Right shift - if(op_==RIGHT_SHIFT){ - assert(is_int); - return builder.CreateAShr(lhs, rhs, name); - } - // LT + return builder.create_gep(lhs, {builder.create_neg(rhs)}); + if(op_==LEFT_SHIFT) + return builder.create_lshr(lhs, rhs, name); + if(op_==RIGHT_SHIFT) + return builder.create_ashr(lhs, rhs, name); if(op_ == LT && is_float) - return builder.CreateFCmpOLT(lhs, rhs, name); + return builder.create_fcmpOLT(lhs, rhs, name); if(op_ == LT && is_int && is_signed) - return builder.CreateICmpSLT(lhs, rhs, name); + return builder.create_icmpSLT(lhs, rhs, name); if(op_ == LT && is_int && !is_signed) - return builder.CreateICmpULT(lhs, rhs, name); - // GT + return builder.create_icmpULT(lhs, rhs, name); if(op_ == GT && is_float) - return builder.CreateFCmpOGT(lhs, rhs, name); + return builder.create_fcmpOGT(lhs, rhs, name); if(op_ == GT && is_int && is_signed) - return builder.CreateICmpSGT(lhs, rhs, name); + return builder.create_icmpSGT(lhs, rhs, name); if(op_ == GT && is_int && !is_signed) - return builder.CreateICmpUGT(lhs, rhs, name); - // LE + return builder.create_icmpUGT(lhs, rhs, name); if(op_ == LE && is_float) - return builder.CreateFCmpOLE(lhs, rhs, name); + return builder.create_fcmpOLE(lhs, rhs, name); if(op_ == LE && is_int && is_signed) - return builder.CreateICmpSLE(lhs, rhs, name); + return builder.create_icmpSLE(lhs, rhs, name); if(op_ == LE && is_int && !is_signed) - return builder.CreateICmpULE(lhs, rhs, name); - // GE + return builder.create_icmpULE(lhs, rhs, name); if(op_ == GE && is_float) - return builder.CreateFCmpOGE(lhs, rhs, name); + return builder.create_fcmpOGE(lhs, rhs, name); if(op_ == GE && is_int && is_signed) - return builder.CreateICmpSGE(lhs, rhs, name); + return builder.create_icmpSGE(lhs, rhs, name); if(op_ == GE && is_int && !is_signed) - return builder.CreateICmpUGE(lhs, rhs, name); - // EQ + return builder.create_icmpUGE(lhs, rhs, name); if(op_ == EQ && is_float) - return builder.CreateFCmpOEQ(lhs, rhs, name); + return builder.create_fcmpOEQ(lhs, rhs, name); if(op_ == EQ && is_int) - return builder.CreateICmpEQ(lhs, rhs, name); - // NE + return builder.create_icmpEQ(lhs, rhs, name); if(op_ == NE && is_float) - return builder.CreateFCmpONE(lhs, rhs, name); + return builder.create_fcmpONE(lhs, rhs, name); if(op_ == NE && is_int) - return builder.CreateICmpNE(lhs, rhs, name); - // AND - if(op_ == AND){ 
- assert(is_int); - return builder.CreateAnd(lhs, rhs, name); - } - if(op_ == XOR){ - assert(is_int); - return builder.CreateXor(lhs, rhs, name); - } - if(op_ == OR){ - assert(is_int); - return builder.CreateOr(lhs, rhs, name); - } - if(op_ == LAND){ - assert(is_int); - return builder.CreateAnd(lhs, rhs, name); - } - if(op_ == LOR){ - assert(is_int); - return builder.CreateOr(lhs, rhs, name); - } - assert(false && "unreachable"); - throw; + return builder.create_icmpNE(lhs, rhs, name); + if(op_ == AND) + return builder.create_and(lhs, rhs, name); + if(op_ == XOR) + return builder.create_xor(lhs, rhs, name); + if(op_ == OR) + return builder.create_or(lhs, rhs, name); + if(op_ == LAND) + return builder.create_and(lhs, rhs, name); + if(op_ == LOR) + return builder.create_or(lhs, rhs, name); + throw std::runtime_error("unreachable"); } -Value* binary_operator::codegen(module *mod) const{ - Value *lhs = lhs_->codegen(mod); - Value *rhs = rhs_->codegen(mod); - Value *result = llvm_op(mod, mod->builder(), lhs, rhs, ""); +ir::value* binary_operator::codegen(ir::module *mod) const{ + ir::value *lhs = lhs_->codegen(mod); + ir::value *rhs = rhs_->codegen(mod); + ir::value *result = llvm_op(mod, mod->get_builder(), lhs, rhs, ""); return result; } /* Postfix expression */ -Value* indexing_expression::codegen(module *mod) const{ - Value *in = mod->get_value(id_->name()); - std::vector ranges; - for(range *x: ranges_->values()) - ranges.push_back(x->type()); - // Type information - Function* reshape; - Type *in_type = in->getType(); - size_t in_dim = in_type->getTileNumDimensions(); - size_t out_dim = ranges.size(); - Type *out_type = TileType::get(in_type->getTileElementType(), out_dim); - // Intrinsic function - Function *reshape_fn = Intrinsic::getDeclaration(mod->handle(), Intrinsic::tlvm_reshape_2d_1d, {out_type, in_type}); - - return nullptr; +ir::value* indexing_expression::codegen(ir::module *mod) const{ + ir::value *in = mod->get_value(id_->name()); + const std::vector &ranges = ranges_->values(); + std::vector in_shapes = in->get_type()->get_tile_shapes(); + std::vector out_shapes(ranges.size()); + size_t current = 0; + for(size_t i = 0; i < out_shapes.size(); i++) + out_shapes[i] = (ranges[i]->type()==NEWAXIS)?1:in_shapes[current++]; + return mod->get_builder().create_reshape(in, out_shapes); } /* Unary operator */ -Value *unary_operator::llvm_op(llvm::IRBuilder<> &builder, Value *arg, const std::string &name) const{ - Type *atype = arg->getType(); - bool is_float = atype->isFloatingPointTy(); - bool is_int = atype->isIntegerTy(); - if(op_ == INC){ - assert(is_int); - return builder.CreateAdd(arg, builder.getInt32(1), name); - } - if(op_ == DEC){ - assert(is_int); - return builder.CreateSub(arg, builder.getInt32(1), name); - } +ir::value *unary_operator::llvm_op(ir::builder &builder, ir::value *arg, const std::string &name) const{ + ir::type *atype = arg->get_type(); + bool is_float = atype->is_floating_point_ty(); + bool is_int = atype->is_integer_ty(); + if(op_ == INC) + return builder.create_add(arg, builder.get_int32(1), name); + if(op_ == DEC) + return builder.create_sub(arg, builder.get_int32(1), name); if(op_ == PLUS) return arg; if(op_ == MINUS && is_float) - return builder.CreateFNeg(arg, name); + return builder.create_fneg(arg, name); if(op_ == MINUS && is_int) - return builder.CreateNeg(arg, name); + return builder.create_neg(arg, name); if(op_ == ADDR) throw std::runtime_error("not supported"); if(op_ == DEREF) - return builder.CreateLoad(arg, name); + return 
builder.create_load(arg, name); if(op_ == COMPL) throw std::runtime_error("not supported"); if(op_ == NOT) - return builder.CreateNot(arg, name); - assert(false && "unrechable"); - throw; + return builder.create_not(arg, name); + throw std::runtime_error("unreachable"); } -Value* unary_operator::codegen(module *mod) const{ - Value *arg = arg_->codegen(mod); - Value *result = llvm_op(mod->builder(), arg, ""); +ir::value* unary_operator::codegen(ir::module *mod) const{ + ir::value *arg = arg_->codegen(mod); + ir::value *result = llvm_op(mod->get_builder(), arg, ""); return result; } /* Cast operator */ -Value *cast_operator::llvm_op(IRBuilder<> &builder, Type *T, Value *arg, const std::string &name) const{ +ir::value *cast_operator::llvm_op(ir::builder &builder, ir::type *T, ir::value *arg, const std::string &name) const{ return nullptr; } -Value* cast_operator::codegen(module *mod) const{ - Value *arg = arg_->codegen(mod); - Type *T = T_->type(mod); - return llvm_op(mod->builder(), T, arg, ""); +ir::value* cast_operator::codegen(ir::module *mod) const{ + ir::value *arg = arg_->codegen(mod); + ir::type *T = T_->type(mod); + return llvm_op(mod->get_builder(), T, arg, ""); } /* Conditional expression */ -Value *conditional_expression::llvm_op(IRBuilder<> &builder, Value *cond, Value *true_value, Value *false_value, const std::string &name) const{ +ir::value *conditional_expression::llvm_op(ir::builder &builder, ir::value *cond, ir::value *true_value, ir::value *false_value, const std::string &name) const{ return nullptr; } -Value *conditional_expression::codegen(module *mod) const{ - Value *cond = cond_->codegen(mod); - Value *true_value = true_value_->codegen(mod); - Value *false_value = false_value_->codegen(mod); - return llvm_op(mod->builder(), cond, true_value, false_value, ""); +ir::value *conditional_expression::codegen(ir::module *mod) const{ + ir::value *cond = cond_->codegen(mod); + ir::value *true_value = true_value_->codegen(mod); + ir::value *false_value = false_value_->codegen(mod); + return llvm_op(mod->get_builder(), cond, true_value, false_value, ""); } /* Assignment expression */ -Value *assignment_expression::codegen(module *mod) const{ - Value *rvalue = rvalue_->codegen(mod); +ir::value *assignment_expression::codegen(ir::module *mod) const{ + ir::value *rvalue = rvalue_->codegen(mod); mod->set_value(lvalue_->id()->name(), rvalue); return rvalue; } /* Type name */ -llvm::Type *type_name::type(module *mod) const{ +ir::type *type_name::type(ir::module *mod) const{ return decl_->type(mod, spec_->type(mod)); } /* String literal */ -llvm::Value* string_literal::codegen(module *mod) const{ - return ConstantDataArray::getString(mod->handle()->getContext(), value_); +ir::value* string_literal::codegen(ir::module *mod) const{ + return ir::constant_data_array::get_string(mod->get_context(), value_); } /* Constant */ -llvm::Value* constant::codegen(module *mod) const{ - return mod->builder().getInt32(value_); +ir::value* constant::codegen(ir::module *mod) const{ + return mod->get_builder().get_int32(value_); } int constant::value() const{ @@ -660,7 +497,7 @@ const identifier* unary_expression::id() const{ } /* Named */ -llvm::Value* named_expression::codegen(module *mod) const{ +ir::value* named_expression::codegen(ir::module *mod) const{ const std::string &name = id()->name(); return mod->get_value(name); } diff --git a/lib/ir/basic_block.cpp b/lib/ir/basic_block.cpp new file mode 100644 index 000000000..e69de29bb diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp new file mode 
100644 index 000000000..e69de29bb diff --git a/lib/ir/constant.cpp b/lib/ir/constant.cpp new file mode 100644 index 000000000..e69de29bb diff --git a/lib/ir/context.cpp b/lib/ir/context.cpp new file mode 100644 index 000000000..f565ac9d1 --- /dev/null +++ b/lib/ir/context.cpp @@ -0,0 +1,10 @@ +#include "ir/context.h" + +namespace tdl{ +namespace ir{ + +/* Context */ +context::context() { } + +} +} diff --git a/lib/ir/function.cpp b/lib/ir/function.cpp new file mode 100644 index 000000000..e69de29bb diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp new file mode 100644 index 000000000..e69de29bb diff --git a/lib/ir/ir.cpp b/lib/ir/ir.cpp new file mode 100644 index 000000000..e69de29bb diff --git a/lib/ir/module.cpp b/lib/ir/module.cpp new file mode 100644 index 000000000..7661c349f --- /dev/null +++ b/lib/ir/module.cpp @@ -0,0 +1,87 @@ +#include "ir/module.h" + +namespace tdl{ +namespace ir{ + +/* Module */ +module::module(const std::string &name, context *ctx) + : handle_(name.c_str(), *ctx->handle()), builder_(*ctx->handle()) { + sealed_blocks_.insert(nullptr); +} + +Module* module::handle() { + return &handle_; +} + +IRBuilder<>& module::builder() { + return builder_; +} + +void module::set_value(const std::string& name, BasicBlock *block, Value *value){ + values_[val_key_t{name, block}] = value; +} + +void module::set_value(const std::string& name, Value* value){ + return set_value(name, builder_.GetInsertBlock(), value); +} + +PHINode* module::make_phi(Type *type, unsigned num_values, BasicBlock *block){ + Instruction* instr = block->getFirstNonPHIOrDbg(); + if(instr) + builder_.SetInsertPoint(instr); + PHINode *res = builder_.CreatePHI(type, num_values); + if(instr) + builder_.SetInsertPoint(block); + return res; +} + +Value *module::add_phi_operands(const std::string& name, PHINode *&phi){ + BasicBlock *block = phi->getParent(); + for(BasicBlock *pred: predecessors(block)){ + Value *value = get_value(name, pred); + phi->addIncoming(value, pred); + } + return phi; +} + +Value *module::get_value_recursive(const std::string& name, BasicBlock *block) { + Value *result; + if(sealed_blocks_.find(block) == sealed_blocks_.end()){ + Value *pred = get_value(name, *pred_begin(block)); + incomplete_phis_[block][name] = make_phi(pred->getType(), 1, block); + result = (Value*)incomplete_phis_[block][name]; + } + else if(pred_size(block) <= 1){ + bool has_pred = pred_size(block); + result = get_value(name, has_pred?*pred_begin(block):nullptr); + } + else{ + Value *pred = get_value(name, *pred_begin(block)); + result = make_phi(pred->getType(), 1, block); + set_value(name, block, result); + add_phi_operands(name, (PHINode*&)result); + } + set_value(name, block, result); + return result; +} + +Value *module::get_value(const std::string& name, BasicBlock *block) { + val_key_t key(name, block); + if(values_.find(key) != values_.end()){ + return values_.at(key); + } + return get_value_recursive(name, block); +} + +Value *module::get_value(const std::string& name) { + return get_value(name, builder_.GetInsertBlock()); +} + +Value *module::seal_block(BasicBlock *block){ + for(auto &x: incomplete_phis_[block]) + add_phi_operands(x.first, x.second); + sealed_blocks_.insert(block); +} + +} +} diff --git a/lib/ir/type.cpp b/lib/ir/type.cpp new file mode 100644 index 000000000..e69de29bb From 24bd2145dfbbf144045a5fbadf85d005a08470e5 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 2 Jan 2019 01:06:43 -0500 Subject: [PATCH 023/494] [intermediate representation] improved skeleton --- 
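This patch fleshes out the standalone IR skeleton: value/use/user mirror
the LLVM class hierarchy, builder links new instructions in at an
insertion point inside a basic_block, and module keeps the
sealed-block / incomplete-phi bookkeeping for on-the-fly SSA
construction, in the style of Braun et al., "Simple and Efficient
Construction of Static Single Assignment Form".

A rough usage sketch of the API as declared below; untested, and the
function fn and the bound argument "x" are assumed to exist (created as
in function_definition::codegen and bind_parameters in lib/codegen.cpp):

    ir::context ctx;
    ir::module mod("example", ctx);
    ir::builder &b = mod.get_builder();
    // fn: an ir::function built from an ir::function_type (assumed)
    ir::basic_block *entry = ir::basic_block::create(ctx, "entry", fn);
    mod.seal_block(entry);               // entry has no predecessors
    b.set_insert_point(entry);
    ir::value *x = mod.get_value("x");   // argument bound via set_value
    ir::value *y = b.create_add(x, b.get_int32(1), "y");
    mod.set_value("y", y);               // visible to later get_value calls
    b.create_ret_void();
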
include/ir/basic_block.h | 10 ++ include/ir/builder.h | 91 ++++++++++------ include/ir/instructions.h | 143 +++++++++++++++++++++++- include/ir/module.h | 6 +- include/ir/value.h | 55 +++++++++- lib/codegen.cpp | 2 +- lib/ir/builder.cpp | 224 ++++++++++++++++++++++++++++++++++++++ lib/ir/context.cpp | 3 - lib/ir/instructions.cpp | 26 +++++ lib/ir/module.cpp | 72 ++++++------ lib/ir/value.cpp | 58 ++++++++++ 11 files changed, 611 insertions(+), 79 deletions(-) create mode 100644 lib/ir/value.cpp diff --git a/include/ir/basic_block.h b/include/ir/basic_block.h index c059a2634..e98a7ddfd 100644 --- a/include/ir/basic_block.h +++ b/include/ir/basic_block.h @@ -9,11 +9,20 @@ namespace ir{ class context; class function; +class instruction; /* Basic Block */ class basic_block: public value{ public: + // Accessors function* get_parent(); + instruction* get_first_non_phi_or_dbg(); + // Iterators + instruction* begin(); + instruction* end(); + // CFG + const std::vector& get_predecessors() const; + void add_predecessor(basic_block* pred); // Factory functions static basic_block* create(context &ctx, const std::string &name, function *parent); @@ -21,6 +30,7 @@ private: context &ctx_; std::string name_; function *parent_; + std::vector preds_; }; } diff --git a/include/ir/builder.h b/include/ir/builder.h index 4ce5dedf5..2e208d62b 100644 --- a/include/ir/builder.h +++ b/include/ir/builder.h @@ -3,6 +3,7 @@ #include #include +#include "instructions.h" namespace tdl{ namespace ir{ @@ -11,62 +12,81 @@ class basic_block; class value; class type; class constant_int; +class instruction; +class context; +class phi_node; /* Builder */ class builder{ public: - void set_insert_point(basic_block* bb); - basic_block* get_insert_block(); + // Constructor + builder(context &ctx); + // Setters + void set_insert_point(instruction* instr); + void set_insert_point(basic_block* block); + basic_block* get_insert_block() { return block_; } + instruction* get_insert_point() { return insert_point_;} // Constants value *get_int32(unsigned val); // Types type *get_float_ty(); type *get_double_ty(); + // Insert + template + InstTy* insert(InstTy *instr, const std::string &name = ""); // Branch instructions - value* create_br(basic_block *bb); - value* create_cond_br(value *cond, basic_block* if_bb, basic_block* else_bb); + value* create_br(basic_block *dest); + value* create_cond_br(value *cond, basic_block* if_dest, basic_block* else_dest); // Cast instructions - value* create_si_to_fp(value *src, type *dst_ty); - value* create_ui_to_fp(value *src, type *dst_ty); - value* create_fp_to_si(value *src, type *dst_ty); - value* create_fp_to_ui(value *src, type *dst_ty); - value* create_fp_ext(value *src, type *dst_ty); - value* create_fp_trunc(value *src, type *dst_ty); - value* create_int_cast(value *src, type *dst_ty, bool is_signed); - // Call instruction - value* create_call(value *fn, const std::vector &args); + value *create_cast(cast_inst::op_t op, value *v, type *dst_ty, const std::string &name = ""); + value* create_si_to_fp(value *src, type *dst_ty, const std::string &name = ""); + value* create_ui_to_fp(value *src, type *dst_ty, const std::string &name = ""); + value* create_fp_to_si(value *src, type *dst_ty, const std::string &name = ""); + value* create_fp_to_ui(value *src, type *dst_ty, const std::string &name = ""); + value* create_fp_ext(value *src, type *dst_ty, const std::string &name = ""); + value* create_fp_trunc(value *src, type *dst_ty, const std::string &name = ""); + value* create_int_cast(value *src, 
type *dst_ty, bool is_signed, const std::string &name = ""); + // Phi instruction + phi_node* create_phi(type *ty, unsigned num_reserved, const std::string &name = ""); // Binary instructions + value *create_insert_nuwnswb_binop(binary_operator::op_t op, value *lhs, value *rhs, const std::string &name, bool has_nuw, bool has_nsw); value *create_fmul(value *lhs, value *rhs, const std::string &name = ""); - value *create_mul(value *lhs, value *rhs, const std::string &name = ""); value *create_fdiv(value *lhs, value *rhs, const std::string &name = ""); + value *create_frem(value *lhs, value *rhs, const std::string &name = ""); + value *create_fadd(value *lhs, value *rhs, const std::string &name = ""); + value *create_fsub(value *lhs, value *rhs, const std::string &name = ""); + value *create_mul(value *lhs, value *rhs, const std::string &name = "", bool has_nuw = false, bool has_nsw = false); value *create_sdiv(value *lhs, value *rhs, const std::string &name = ""); value *create_udiv(value *lhs, value *rhs, const std::string &name = ""); - value *create_frem(value *lhs, value *rhs, const std::string &name = ""); value *create_srem(value *lhs, value *rhs, const std::string &name = ""); value *create_urem(value *lhs, value *rhs, const std::string &name = ""); - value *create_fadd(value *lhs, value *rhs, const std::string &name = ""); - value *create_add(value *lhs, value *rhs, const std::string &name = ""); - value *create_gep(value *lhs, const std::vector &offs, const std::string &name = ""); - value *create_fsub(value *lhs, value *rhs, const std::string &name = ""); - value *create_sub(value *lhs, value *rhs, const std::string &name = ""); - value *create_lshr(value *lhs, value *rhs, const std::string &name = ""); - value *create_ashr(value *lhs, value *rhs, const std::string &name = ""); - value *create_fcmpOLT(value *lhs, value *rhs, const std::string &name = ""); - value *create_icmpSLT(value *lhs, value *rhs, const std::string &name = ""); - value *create_icmpULT(value *lhs, value *rhs, const std::string &name = ""); - value *create_fcmpOGT(value *lhs, value *rhs, const std::string &name = ""); - value *create_icmpSGT(value *lhs, value *rhs, const std::string &name = ""); - value *create_icmpUGT(value *lhs, value *rhs, const std::string &name = ""); - value *create_fcmpOLE(value *lhs, value *rhs, const std::string &name = ""); + value *create_add(value *lhs, value *rhs, const std::string &name = "", bool has_nuw = false, bool has_nsw = false); + value *create_sub(value *lhs, value *rhs, const std::string &name = "", bool has_nuw = false, bool has_nsw = false); + value *create_shl(value *lhs, value *rhs, const std::string &name = "", bool has_nuw = false, bool has_nsw = false); + value *create_ashr(value *lhs, value *rhs, const std::string &name = "", bool has_nuw = false, bool has_nsw = false); + // GEP + value *create_gep(value *ptr, const std::vector& idx_list, const std::string &name = ""); + // Comparison (int) + value *create_icmp(cmp_inst::pred_t pred, value *lhs, value *rhs, const std::string &name = ""); value *create_icmpSLE(value *lhs, value *rhs, const std::string &name = ""); - value *create_icmpULE(value *lhs, value *rhs, const std::string &name = ""); - value *create_fcmpOGE(value *lhs, value *rhs, const std::string &name = ""); + value *create_icmpSLT(value *lhs, value *rhs, const std::string &name = ""); value *create_icmpSGE(value *lhs, value *rhs, const std::string &name = ""); + value *create_icmpSGT(value *lhs, value *rhs, const std::string &name = ""); + value 
*create_icmpULE(value *lhs, value *rhs, const std::string &name = ""); + value *create_icmpULT(value *lhs, value *rhs, const std::string &name = ""); value *create_icmpUGE(value *lhs, value *rhs, const std::string &name = ""); - value *create_fcmpOEQ(value *lhs, value *rhs, const std::string &name = ""); + value *create_icmpUGT(value *lhs, value *rhs, const std::string &name = ""); value *create_icmpEQ(value *lhs, value *rhs, const std::string &name = ""); - value *create_fcmpONE(value *lhs, value *rhs, const std::string &name = ""); value *create_icmpNE(value *lhs, value *rhs, const std::string &name = ""); + // Comparison (float) + value *create_fcmp(cmp_inst::pred_t pred, value *lhs, value *rhs, const std::string &name = ""); + value *create_fcmpOLT(value *lhs, value *rhs, const std::string &name = ""); + value *create_fcmpOGT(value *lhs, value *rhs, const std::string &name = ""); + value *create_fcmpOLE(value *lhs, value *rhs, const std::string &name = ""); + value *create_fcmpOGE(value *lhs, value *rhs, const std::string &name = ""); + value *create_fcmpOEQ(value *lhs, value *rhs, const std::string &name = ""); + value *create_fcmpONE(value *lhs, value *rhs, const std::string &name = ""); + // Logical value *create_and(value *lhs, value *rhs, const std::string &name = ""); value *create_xor(value *lhs, value *rhs, const std::string &name = ""); value *create_or(value *lhs, value *rhs, const std::string &name = ""); @@ -81,6 +101,11 @@ public: value *create_broadcast(value *arg, const std::vector &shapes, const std::string &name = ""); // Terminators value *create_ret_void(); + +private: + context &ctx_; + basic_block *block_; + instruction *insert_point_; }; } diff --git a/include/ir/instructions.h b/include/ir/instructions.h index 782033cb8..05ee57070 100644 --- a/include/ir/instructions.h +++ b/include/ir/instructions.h @@ -1,28 +1,165 @@ #ifndef TDL_INCLUDE_IR_INSTRUCTIONS_H #define TDL_INCLUDE_IR_INSTRUCTIONS_H +#include #include "value.h" +#include "llvm/IR/Instructions.h" namespace tdl{ namespace ir{ -/* Instructions */ -class instruction: public value{ +class basic_block; +//===----------------------------------------------------------------------===// +// instruction classes +//===----------------------------------------------------------------------===// + +class instruction: public user{ +public: + // constructors + instruction(type *ty, unsigned num_used, instruction *next = nullptr); + + // parent + const basic_block *get_parent() const { return parent_;} + basic_block *get_parent() { return parent_; } + +private: + basic_block *parent_; }; +//===----------------------------------------------------------------------===// +// phi_node classes +//===----------------------------------------------------------------------===// + class phi_node: public instruction{ +private: + phi_node(type *ty, unsigned num_reserved); +public: + void add_incoming(value *x, basic_block *bb); + + // Factory methods + static phi_node* create(type *ty, unsigned num_reserved); + +private: + unsigned num_reserved_; }; +//===----------------------------------------------------------------------===// +// binary_operator classes +//===----------------------------------------------------------------------===// + class binary_operator: public instruction{ +public: + typedef llvm::BinaryOperator::BinaryOps op_t; + +protected: + // Constructors + binary_operator(op_t op, value *lhs, value *rhs, type *ty, const std::string &name, instruction *next); + +public: + // Get operand + op_t get_op() const { return 
op_; } + + // Factory methods + static binary_operator *create(op_t op, value *lhs, value *rhs, + const std::string &name = "", instruction *next = nullptr); + static binary_operator *create_fneg(value *arg, const std::string &name = "", instruction *next = nullptr); + static binary_operator *create_neg(value *arg, const std::string &name = "", instruction *next = nullptr); + static binary_operator *create_not(value *arg, const std::string &name = "", instruction *next = nullptr); + +public: + op_t op_; +}; + + +//===----------------------------------------------------------------------===// +// cmp_inst classes +//===----------------------------------------------------------------------===// + +class cmp_inst: public instruction{ +public: + typedef llvm::CmpInst::Predicate pred_t; + +protected: + cmp_inst(pred_t pred, value *lhs, value *rhs, type *ty, + const std::string &name = "", instruction *next = nullptr); + +private: + pred_t pred_; +}; + +class icmp_inst: public cmp_inst{ +public: + static icmp_inst* create(pred_t pred, value *lhs, value *rhs, + const std::string &name = "", instruction *next = nullptr); +}; + +class fcmp_inst: public cmp_inst{ +public: + static fcmp_inst* create(pred_t pred, value *lhs, value *rhs, + const std::string &name = "", instruction *next = nullptr); +}; + +//===----------------------------------------------------------------------===// +// cast_inst classes +//===----------------------------------------------------------------------===// + +class cast_inst: public instruction{ +public: + typedef llvm::CastInst::CastOps op_t; + +protected: + // Constructors + cast_inst(op_t op, value *arg, type *ty, const std::string &name, instruction *next); + +public: + // Factory methods + static cast_inst *create(op_t op, value *arg, type *ty, + const std::string &name = "", instruction *next = nullptr); + static cast_inst *create_integer_cast(value *arg, type *ty, bool is_signed, + const std::string &name = "", instruction *next = nullptr); + + +private: + op_t op_; +}; + +//===----------------------------------------------------------------------===// +// terminator_inst classes +//===----------------------------------------------------------------------===// + +class terminator_inst: public instruction{ +public: +}; + +class return_inst: public instruction{ }; -class unary_operator: public instruction{ +//===----------------------------------------------------------------------===// +// branch_inst classes +//===----------------------------------------------------------------------===// +class branch_inst: public instruction{ +public: + static branch_inst* create(basic_block *dest, + const std::string &name = "", instruction *next = nullptr); + static branch_inst* create(value *cond, basic_block *if_dest, basic_block *else_dest, + const std::string &name = "", instruction *next = nullptr); }; +//===----------------------------------------------------------------------===// +// getelementptr_inst classes +//===----------------------------------------------------------------------===// + +class getelementptr_inst: public instruction{ +public: + static getelementptr_inst* create(value *ptr, const std::vector &idx, + const std::string &name = "", instruction *next = nullptr); +}; + + } } diff --git a/include/ir/module.h b/include/ir/module.h index fb3745caf..3b6536cda 100644 --- a/include/ir/module.h +++ b/include/ir/module.h @@ -18,11 +18,11 @@ class context; class module { typedef std::pair val_key_t; phi_node *make_phi(type *ty, unsigned num_values, basic_block 
*block); - void add_phi_operands(const std::string& name, phi_node *&phi); + value *add_phi_operands(const std::string& name, phi_node *&phi); value *get_value_recursive(const std::string& name, basic_block *block); public: - module(const std::string &name, context *ctx); + module(const std::string &name, context &ctx); context& get_context(); builder& get_builder(); // Setters @@ -35,6 +35,8 @@ public: void seal_block(basic_block *block); private: + std::string name_; + context &context_; builder builder_; std::map values_; std::set sealed_blocks_; diff --git a/include/ir/value.h b/include/ir/value.h index 79a409cff..b7a017200 100644 --- a/include/ir/value.h +++ b/include/ir/value.h @@ -2,22 +2,73 @@ #define TDL_INCLUDE_IR_VALUE_H #include +#include +#include namespace tdl{ namespace ir{ class type; +class use; + +//===----------------------------------------------------------------------===// +// value class +//===----------------------------------------------------------------------===// -/* Value */ class value { public: + // constructor + value(type *ty, const std::string &name = ""); + // uses + void add_use(use *arg); + // name void set_name(const std::string &name); - type* get_type(); + type* get_type() { return ty_; } private: + type *ty_; std::string name_; }; +//===----------------------------------------------------------------------===// +// use class +//===----------------------------------------------------------------------===// + +class use { +public: + // Implicit conversions to/from value + friend class value; + operator value *() const { return val_; } + value *get() const { return val_; } + value *operator->() { return val_; } + const value *operator->() const { return val_; } + inline void set(value *val); + inline value *operator=(value *rhs); + inline const use &operator=(const use &rhs); + +private: + value *val_; +}; + +//===----------------------------------------------------------------------===// +// user class +//===----------------------------------------------------------------------===// + +class user: public value{ +public: + // Constructor + user(type *ty, unsigned num_ops, const std::string &name = "") + : value(ty, name), ops_(num_ops){ } + + // Operands + void set_operand(unsigned i, value *x); + value *get_operand(unsigned i); + unsigned get_num_operands(); + +private: + std::vector ops_; +}; + } } diff --git a/lib/codegen.cpp b/lib/codegen.cpp index 920584edc..1ef5df769 100644 --- a/lib/codegen.cpp +++ b/lib/codegen.cpp @@ -342,7 +342,7 @@ ir::value *binary_operator::llvm_op(ir::module *mod, ir::builder &builder, ir::v if(op_==SUB && is_ptr) return builder.create_gep(lhs, {builder.create_neg(rhs)}); if(op_==LEFT_SHIFT) - return builder.create_lshr(lhs, rhs, name); + return builder.create_shl(lhs, rhs, name); if(op_==RIGHT_SHIFT) return builder.create_ashr(lhs, rhs, name); if(op_ == LT && is_float) diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index e69de29bb..209ab78c5 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -0,0 +1,224 @@ +#include +#include "ir/basic_block.h" +#include "ir/builder.h" +#include "ir/instructions.h" +#include "llvm/IR/Instruction.h" + +namespace tdl{ +namespace ir{ + +builder::builder(context &ctx): + ctx_(ctx){} + +//===----------------------------------------------------------------------===// +// insertion helpers +//===----------------------------------------------------------------------===// + +void builder::set_insert_point(instruction *instr){ + block_ = instr->get_parent(); + insert_point_ = instr; 
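+  // later insert() calls use this as the insertion point for new instructions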
+} + +void builder::set_insert_point(basic_block *block){ + block_ = block; + insert_point_ = block->end(); +} + +//===----------------------------------------------------------------------===// +// branch instructions +//===----------------------------------------------------------------------===// + +value* builder::create_br(basic_block *dest){ + return insert(branch_inst::create(dest)); +} + +value* builder::create_cond_br(value *cond, basic_block *if_dest, basic_block *else_dest){ + return insert(branch_inst::create(cond, if_dest, else_dest)); +} + +//===----------------------------------------------------------------------===// +// cast instructions +//===----------------------------------------------------------------------===// +#define DEFINE_CAST_INSTR(SUFFIX, OPCODE)\ + value *builder::create_ ## SUFFIX(value *src, type *dst_ty, std::string const &name){\ + return create_cast(OPCODE, src, dst_ty, name);\ + } + +DEFINE_CAST_INSTR(si_to_fp, llvm::Instruction::SIToFP) +DEFINE_CAST_INSTR(ui_to_fp, llvm::Instruction::UIToFP) +DEFINE_CAST_INSTR(fp_to_si, llvm::Instruction::FPToSI) +DEFINE_CAST_INSTR(fp_to_ui, llvm::Instruction::FPToUI) +DEFINE_CAST_INSTR(fp_ext, llvm::Instruction::FPExt) +DEFINE_CAST_INSTR(fp_trunc, llvm::Instruction::FPTrunc) + +value* builder::create_cast(cast_inst::op_t op, value *v, type *dst_ty, const std::string &name){ + return insert(cast_inst::create(op, v, dst_ty), name); +} + +value* builder::create_int_cast(value *src, type *dst_ty, bool is_signed, const std::string &name){ + return insert(cast_inst::create_integer_cast(src, dst_ty, is_signed), name); +} + +//===----------------------------------------------------------------------===// +// phi instructions +//===----------------------------------------------------------------------===// + +phi_node* builder::create_phi(type *ty, unsigned num_reserved, const std::string &name){ + return insert(phi_node::create(ty, num_reserved), name); +} + +//===----------------------------------------------------------------------===// +// binary float instructions +//===----------------------------------------------------------------------===// + +#define DEFINE_BINARY_FLOAT(SUFFIX, OPCODE)\ + value *builder::create_ ## SUFFIX(value *lhs, value *rhs, const std::string &name){\ + return insert(binary_operator::create(OPCODE, lhs, rhs), name);\ + } + +#define DEFINE_UNARY_FLOAT(SUFFIX)\ + value *builder::create_ ## SUFFIX(value *arg, const std::string &name){\ + return insert(binary_operator::create_ ## SUFFIX(arg), name);\ + } + +// Binary +DEFINE_BINARY_FLOAT(fmul, llvm::Instruction::FMul) +DEFINE_BINARY_FLOAT(fdiv, llvm::Instruction::FDiv) +DEFINE_BINARY_FLOAT(frem, llvm::Instruction::FRem) +DEFINE_BINARY_FLOAT(fadd, llvm::Instruction::FAdd) +DEFINE_BINARY_FLOAT(fsub, llvm::Instruction::FSub) +// Unary +DEFINE_UNARY_FLOAT(fneg) + + +//===----------------------------------------------------------------------===// +// binary int instructions +//===----------------------------------------------------------------------===// + +#define DEFINE_NOWRAP_BINARY(SUFFIX, OPCODE)\ + value* builder::create_ ## SUFFIX(value *lhs, value *rhs, const std::string &name, bool has_nuw, bool has_nsw){\ + return create_insert_nuwnswb_binop(OPCODE, lhs, rhs, name, has_nuw, has_nsw);\ + }\ + +#define DEFINE_BINARY_INT(SUFFIX, OPCODE)\ + value *builder::create_ ## SUFFIX(value *lhs, value *rhs, const std::string &name){\ + return insert(binary_operator::create(OPCODE, lhs, rhs), name);\ + } + +#define DEFINE_UNARY_INT(SUFFIX)\ + value 
*builder::create_ ## SUFFIX(value *arg, const std::string &name){\ + return insert(binary_operator::create_ ## SUFFIX(arg), name);\ + } + +// Binary +DEFINE_NOWRAP_BINARY(mul, llvm::Instruction::Mul) +DEFINE_NOWRAP_BINARY(add, llvm::Instruction::Add) +DEFINE_NOWRAP_BINARY(sub, llvm::Instruction::Sub) +DEFINE_NOWRAP_BINARY(shl, llvm::Instruction::Shl) +DEFINE_NOWRAP_BINARY(ashr, llvm::Instruction::AShr) +DEFINE_BINARY_INT(sdiv, llvm::Instruction::SDiv) +DEFINE_BINARY_INT(udiv, llvm::Instruction::UDiv) +DEFINE_BINARY_INT(srem, llvm::Instruction::SRem) +DEFINE_BINARY_INT(urem, llvm::Instruction::URem) +DEFINE_BINARY_INT(and, llvm::Instruction::And) +DEFINE_BINARY_INT(or, llvm::Instruction::Or) +DEFINE_BINARY_INT(xor, llvm::Instruction::Xor) +// Unary +DEFINE_UNARY_INT(neg) +DEFINE_UNARY_INT(not) + + +//===----------------------------------------------------------------------===// +// getelementptr instructions +//===----------------------------------------------------------------------===// + +value* builder::create_gep(value *ptr, const std::vector& idx_list, const std::string &name){ + return insert(getelementptr_inst::create(ptr, idx_list), name); +} + +//===----------------------------------------------------------------------===// +// icmp instructions +//===----------------------------------------------------------------------===// + +value *builder::create_icmp(cmp_inst::pred_t pred, value *lhs, value *rhs, const std::string &name){ + return insert(icmp_inst::create(pred, lhs, rhs), name); +} + +#define DEFINE_ICMP_INSTR(SUFFIX, OPCODE)\ + value *builder::create_icmp ## SUFFIX(value *lhs, value *rhs, const std::string &name){\ + return create_icmp(OPCODE, lhs, rhs, name);\ + } + +// Signed +DEFINE_ICMP_INSTR(SLE, llvm::ICmpInst::ICMP_SLE) +DEFINE_ICMP_INSTR(SLT, llvm::ICmpInst::ICMP_SLT) +DEFINE_ICMP_INSTR(SGE, llvm::ICmpInst::ICMP_SGE) +DEFINE_ICMP_INSTR(SGT, llvm::ICmpInst::ICMP_SGT) +// Unsigned +DEFINE_ICMP_INSTR(ULE, llvm::ICmpInst::ICMP_ULE) +DEFINE_ICMP_INSTR(ULT, llvm::ICmpInst::ICMP_ULT) +DEFINE_ICMP_INSTR(UGE, llvm::ICmpInst::ICMP_UGE) +DEFINE_ICMP_INSTR(UGT, llvm::ICmpInst::ICMP_UGT) +// General +DEFINE_ICMP_INSTR(EQ, llvm::ICmpInst::ICMP_EQ) +DEFINE_ICMP_INSTR(NE, llvm::ICmpInst::ICMP_NE) + + +//===----------------------------------------------------------------------===// +// fcmp instructions +//===----------------------------------------------------------------------===// + +value *builder::create_fcmp(cmp_inst::pred_t pred, value *lhs, value *rhs, const std::string &name){ + return insert(fcmp_inst::create(pred, lhs, rhs), name); +} + +#define DEFINE_FCMP_INSTR(SUFFIX, OPCODE)\ + value *builder::create_fcmp ## SUFFIX(value *lhs, value *rhs, const std::string &name){\ + return create_fcmp(OPCODE, lhs, rhs, name);\ + } + +// Ordered +DEFINE_FCMP_INSTR(OLE, llvm::FCmpInst::FCMP_OLE) +DEFINE_FCMP_INSTR(OLT, llvm::FCmpInst::FCMP_OLT) +DEFINE_FCMP_INSTR(OGE, llvm::FCmpInst::FCMP_OGE) +DEFINE_FCMP_INSTR(OGT, llvm::FCmpInst::FCMP_OGT) +DEFINE_FCMP_INSTR(OEQ, llvm::FCmpInst::FCMP_OEQ) +DEFINE_FCMP_INSTR(ONE, llvm::FCmpInst::FCMP_ONE) + + + +//===----------------------------------------------------------------------===// +// load instructions +//===----------------------------------------------------------------------===// + +//value *builder::create_load(value *arg, const std::string &name){ + +//} + +//===----------------------------------------------------------------------===// +// tile instructions 
+//===----------------------------------------------------------------------===// + +//value *create_splat(value *arg, const std::vector &shapes, const std::string &name) { +//} + +//value *create_reshape(value *arg, const std::vector &shapes, const std::string &name) { + +//} + +//value *create_broadcast(value *arg, const std::vector &shapes, const std::string &name) { + +//} + +//===----------------------------------------------------------------------===// +// terminator instructions +//===----------------------------------------------------------------------===// + +//value *create_red_void() { + +//} + + + +} +} diff --git a/lib/ir/context.cpp b/lib/ir/context.cpp index f565ac9d1..8357b0ab1 100644 --- a/lib/ir/context.cpp +++ b/lib/ir/context.cpp @@ -3,8 +3,5 @@ namespace tdl{ namespace ir{ -/* Context */ -context::context() { } - } } diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index e69de29bb..8a7061cc7 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -0,0 +1,26 @@ +#include "ir/basic_block.h" +#include "ir/instructions.h" + +namespace tdl{ +namespace ir{ + + +instruction::instruction(type *ty, unsigned num_ops, instruction *next) + : user(ty, num_ops) { + if(next){ + basic_block *block = next->get_parent(); + assert(block && "Next instruction is not in a basic block!"); + } +} + +// // If requested, insert this instruction into a basic block... +// if (InsertBefore) { +// BasicBlock *BB = InsertBefore->getParent(); +// assert(BB && "Instruction to insert before is not in a basic block!"); +// BB->getInstList().insert(InsertBefore->getIterator(), this); +// } + + + +} +} diff --git a/lib/ir/module.cpp b/lib/ir/module.cpp index 7661c349f..3c6739213 100644 --- a/lib/ir/module.cpp +++ b/lib/ir/module.cpp @@ -1,71 +1,73 @@ +#include "ir/basic_block.h" #include "ir/module.h" namespace tdl{ namespace ir{ /* Module */ -module::module(const std::string &name, context *ctx) - : handle_(name.c_str(), *ctx->handle()), builder_(*ctx->handle()) { +module::module(const std::string &name, context &ctx) + : name_(name), context_(ctx), builder_(ctx) { sealed_blocks_.insert(nullptr); } -Module* module::handle() { - return &handle_; -} - -IRBuilder<>& module::builder() { +ir::builder& module::get_builder() { return builder_; } -void module::set_value(const std::string& name, BasicBlock *block, Value *value){ +ir::context& module::get_context() { + return context_; +} + +void module::set_value(const std::string& name, ir::basic_block *block, ir::value *value){ values_[val_key_t{name, block}] = value; } -void module::set_value(const std::string& name, Value* value){ - return set_value(name, builder_.GetInsertBlock(), value); +void module::set_value(const std::string& name, ir::value *value){ + return set_value(name, builder_.get_insert_block(), value); } -PHINode* module::make_phi(Type *type, unsigned num_values, BasicBlock *block){ - Instruction* instr = block->getFirstNonPHIOrDbg(); +ir::phi_node* module::make_phi(ir::type *ty, unsigned num_values, ir::basic_block *block){ + ir::instruction* instr = block->get_first_non_phi_or_dbg(); if(instr) - builder_.SetInsertPoint(instr); - PHINode *res = builder_.CreatePHI(type, num_values); + builder_.set_insert_point(instr); + ir::phi_node *res = builder_.create_phi(ty, num_values); if(instr) - builder_.SetInsertPoint(block); + builder_.set_insert_point(block); return res; } -Value *module::add_phi_operands(const std::string& name, PHINode *&phi){ - BasicBlock *block = phi->getParent(); - for(BasicBlock *pred: 
predecessors(block)){ - Value *value = get_value(name, pred); - phi->addIncoming(value, pred); +ir::value *module::add_phi_operands(const std::string& name, ir::phi_node *&phi){ + ir::basic_block *block = phi->get_parent(); + for(ir::basic_block *pred: block->get_predecessors()){ + ir::value *value = get_value(name, pred); + phi->add_incoming(value, pred); } return phi; } -Value *module::get_value_recursive(const std::string& name, BasicBlock *block) { - Value *result; +ir::value *module::get_value_recursive(const std::string& name, ir::basic_block *block) { + ir::value *result; + auto &preds = block->get_predecessors(); if(sealed_blocks_.find(block) == sealed_blocks_.end()){ - Value *pred = get_value(name, *pred_begin(block)); - incomplete_phis_[block][name] = make_phi(pred->getType(), 1, block); - result = (Value*)incomplete_phis_[block][name]; + ir::value *pred = get_value(name, preds.front()); + incomplete_phis_[block][name] = make_phi(pred->get_type(), 1, block); + result = (ir::value*)incomplete_phis_[block][name]; } - else if(pred_size(block) <= 1){ - bool has_pred = pred_size(block); - result = get_value(name, has_pred?*pred_begin(block):nullptr); + else if(preds.size() <= 1){ + bool has_pred = preds.size(); + result = get_value(name, has_pred?preds.front():nullptr); } else{ - Value *pred = get_value(name, *pred_begin(block)); - result = make_phi(pred->getType(), 1, block); + ir::value *pred = get_value(name, preds.front()); + result = make_phi(pred->get_type(), 1, block); set_value(name, block, result); - add_phi_operands(name, (PHINode*&)result); + add_phi_operands(name, (ir::phi_node*&)result); } set_value(name, block, result); return result; } -Value *module::get_value(const std::string& name, BasicBlock *block) { +ir::value *module::get_value(const std::string& name, ir::basic_block *block) { val_key_t key(name, block); if(values_.find(key) != values_.end()){ return values_.at(key); @@ -73,11 +75,11 @@ Value *module::get_value(const std::string& name, BasicBlock *block) { return get_value_recursive(name, block); } -Value *module::get_value(const std::string& name) { - return get_value(name, builder_.GetInsertBlock()); +ir::value *module::get_value(const std::string& name) { + return get_value(name, builder_.get_insert_block()); } -Value *module::seal_block(BasicBlock *block){ +void module::seal_block(ir::basic_block *block){ for(auto &x: incomplete_phis_[block]) add_phi_operands(x.first, x.second); sealed_blocks_.insert(block); diff --git a/lib/ir/value.cpp b/lib/ir/value.cpp new file mode 100644 index 000000000..e069f593c --- /dev/null +++ b/lib/ir/value.cpp @@ -0,0 +1,58 @@ +#include "ir/value.h" +#include + +namespace tdl{ +namespace ir{ + +class type; + +//===----------------------------------------------------------------------===// +// value class +//===----------------------------------------------------------------------===// + +value::value(type *ty, const std::string &name): ty_(ty){ + set_name(name); +} + +// TODO: automatic naming scheme + update symbol table +void value::set_name(const std::string &name){ + name_ = name; +} + + +//===----------------------------------------------------------------------===// +// use class +//===----------------------------------------------------------------------===// +void use::set(value *val){ + val_ = val; +} + +value *use::operator=(value *rhs){ + set(rhs); + return rhs; +} + +const use &use::operator=(const use &rhs){ + set(rhs.val_); + return rhs; +} + 
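+// note: use is currently a bare wrapper around a value pointer; it is not
+// yet registered with the pointed-to value, so value::add_use (declared in
+// include/ir/value.h) has no callers here and def-use chains cannot be
+// walked from a value yet.
+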
+//===----------------------------------------------------------------------===// +// user class +//===----------------------------------------------------------------------===// +void user::set_operand(unsigned i, value *x){ + assert(i < ops_.size() && "set_operand() out of range!"); + ops_[i] = x; +} + +value* user::get_operand(unsigned i){ + assert(i < ops_.size() && "get_operand() out of range!"); + return ops_[i]; +} + +unsigned user::get_num_operands(){ + return ops_.size(); +} + +} +} From 0378b9eb4379b51b3337207e79c651fa152a1008 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 2 Jan 2019 14:37:14 -0500 Subject: [PATCH 024/494] [intermediate representation] more implementation --- include/ir/basic_block.h | 51 ++++++++++++++++++++---- include/ir/builder.h | 15 +++++-- include/ir/constant.h | 15 ++++++- include/ir/instructions.h | 23 +++++++++-- lib/ir/builder.cpp | 4 +- lib/ir/instructions.cpp | 84 ++++++++++++++++++++++++++++++++++++--- lib/ir/module.cpp | 8 ++-- 7 files changed, 171 insertions(+), 29 deletions(-) diff --git a/include/ir/basic_block.h b/include/ir/basic_block.h index e98a7ddfd..240123796 100644 --- a/include/ir/basic_block.h +++ b/include/ir/basic_block.h @@ -2,6 +2,7 @@ #define TDL_INCLUDE_IR_BASIC_BLOCK_H #include +#include #include "value.h" namespace tdl{ @@ -14,16 +15,49 @@ class instruction; /* Basic Block */ class basic_block: public value{ public: - // Accessors - function* get_parent(); - instruction* get_first_non_phi_or_dbg(); - // Iterators - instruction* begin(); - instruction* end(); - // CFG + // instruction iterator types + typedef std::list inst_list_t; + typedef inst_list_t::iterator iterator; + typedef inst_list_t::const_iterator const_iterator; + typedef inst_list_t::reverse_iterator reverse_iterator; + typedef inst_list_t::const_reverse_iterator const_reverse_iterator; + + +public: + // parent + function* get_parent() { return parent_; } + + // get iterator to first instruction that is not a phi + iterator get_first_non_phi(); + + // get instruction list + inst_list_t &get_inst_list() { return inst_list_; } + + // instruction iterator functions + inline iterator begin() { return inst_list_.begin(); } + inline const_iterator begin() const { return inst_list_.begin(); } + inline iterator end () { return inst_list_.end(); } + inline const_iterator end () const { return inst_list_.end(); } + + inline reverse_iterator rbegin() { return inst_list_.rbegin(); } + inline const_reverse_iterator rbegin() const { return inst_list_.rbegin(); } + inline reverse_iterator rend () { return inst_list_.rend(); } + inline const_reverse_iterator rend () const { return inst_list_.rend(); } + + inline size_t size() const { return inst_list_.size(); } + inline bool empty() const { return inst_list_.empty(); } + inline const instruction &front() const { return *inst_list_.front(); } + inline instruction &front() { return *inst_list_.front(); } + inline const instruction &back() const { return *inst_list_.back(); } + inline instruction &back() { return *inst_list_.back(); } + + // get predecessors const std::vector& get_predecessors() const; + + // add predecessor void add_predecessor(basic_block* pred); - // Factory functions + + // factory functions static basic_block* create(context &ctx, const std::string &name, function *parent); private: @@ -31,6 +65,7 @@ private: std::string name_; function *parent_; std::vector preds_; + inst_list_t inst_list_; }; } diff --git a/include/ir/builder.h b/include/ir/builder.h index 2e208d62b..254d3bab6 100644 --- 
a/include/ir/builder.h +++ b/include/ir/builder.h @@ -4,6 +4,7 @@ #include #include #include "instructions.h" +#include "basic_block.h" namespace tdl{ namespace ir{ @@ -18,14 +19,16 @@ class phi_node; /* Builder */ class builder{ + typedef basic_block::iterator iterator; + public: // Constructor builder(context &ctx); // Setters - void set_insert_point(instruction* instr); + void set_insert_point(iterator instr); void set_insert_point(basic_block* block); basic_block* get_insert_block() { return block_; } - instruction* get_insert_point() { return insert_point_;} + iterator get_insert_point() { return insert_point_;} // Constants value *get_int32(unsigned val); // Types @@ -33,7 +36,11 @@ public: type *get_double_ty(); // Insert template - InstTy* insert(InstTy *instr, const std::string &name = ""); + InstTy* insert(InstTy *inst, const std::string &name = ""){ + if(block_) + block_->get_inst_list().insert(insert_point_, inst); + inst->set_name(name); + } // Branch instructions value* create_br(basic_block *dest); value* create_cond_br(value *cond, basic_block* if_dest, basic_block* else_dest); @@ -105,7 +112,7 @@ public: private: context &ctx_; basic_block *block_; - instruction *insert_point_; + iterator insert_point_; }; } diff --git a/include/ir/constant.h b/include/ir/constant.h index a7790e9d3..40f3b056c 100644 --- a/include/ir/constant.h +++ b/include/ir/constant.h @@ -11,7 +11,8 @@ class context; /* Constant */ class constant: public value{ - +public: + static constant* get_all_ones_value(type *ty); }; /* Undef value */ @@ -26,6 +27,18 @@ public: static constant_data_array* get_string(context &ctx, const std::string &str); }; +/* Constant int */ +class constant_int: public constant{ + +}; + +/* constant fp */ +class constant_fp: public constant{ +public: + static constant* get_zero_value_for_negation(type *ty); +}; + + } } diff --git a/include/ir/instructions.h b/include/ir/instructions.h index 05ee57070..95c26de1d 100644 --- a/include/ir/instructions.h +++ b/include/ir/instructions.h @@ -15,9 +15,11 @@ class basic_block; //===----------------------------------------------------------------------===// class instruction: public user{ -public: +protected: // constructors - instruction(type *ty, unsigned num_used, instruction *next = nullptr); + instruction(type *ty, unsigned num_ops, instruction *next = nullptr); + +public: // parent const basic_block *get_parent() const { return parent_;} @@ -80,22 +82,35 @@ public: class cmp_inst: public instruction{ public: typedef llvm::CmpInst::Predicate pred_t; + using pcmp = llvm::CmpInst; + +private: + type* make_cmp_result_type(type *ty); protected: - cmp_inst(pred_t pred, value *lhs, value *rhs, type *ty, - const std::string &name = "", instruction *next = nullptr); + cmp_inst(pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next); + + static bool is_fp_predicate(pred_t pred); + static bool is_int_predicate(pred_t pred); + +public: + private: pred_t pred_; }; class icmp_inst: public cmp_inst{ + using cmp_inst::cmp_inst; + public: static icmp_inst* create(pred_t pred, value *lhs, value *rhs, const std::string &name = "", instruction *next = nullptr); }; class fcmp_inst: public cmp_inst{ + using cmp_inst::cmp_inst; + public: static fcmp_inst* create(pred_t pred, value *lhs, value *rhs, const std::string &name = "", instruction *next = nullptr); diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index 209ab78c5..a5d14b913 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -14,8 +14,8 @@ 
builder::builder(context &ctx): // insertion helpers //===----------------------------------------------------------------------===// -void builder::set_insert_point(instruction *instr){ - block_ = instr->get_parent(); +void builder::set_insert_point(basic_block::iterator instr){ + block_ = (*instr)->get_parent(); insert_point_ = instr; } diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 8a7061cc7..383085c10 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -1,26 +1,98 @@ #include "ir/basic_block.h" #include "ir/instructions.h" +#include "ir/constant.h" namespace tdl{ namespace ir{ +//===----------------------------------------------------------------------===// +// instruction classes +//===----------------------------------------------------------------------===// instruction::instruction(type *ty, unsigned num_ops, instruction *next) : user(ty, num_ops) { if(next){ basic_block *block = next->get_parent(); assert(block && "Next instruction is not in a basic block!"); + auto it = std::find(block->begin(), block->end(), next); + block->get_inst_list().insert(it, next); } } -// // If requested, insert this instruction into a basic block... -// if (InsertBefore) { -// BasicBlock *BB = InsertBefore->getParent(); -// assert(BB && "Instruction to insert before is not in a basic block!"); -// BB->getInstList().insert(InsertBefore->getIterator(), this); -// } +//===----------------------------------------------------------------------===// +// phi_node classes +//===----------------------------------------------------------------------===// + +// Add incoming +void phi_node::add_incoming(value *x, basic_block *bb){ + +} + +// Factory methods +phi_node* phi_node::create(type *ty, unsigned num_reserved){ + return new phi_node(ty, num_reserved); +} +//===----------------------------------------------------------------------===// +// binary_operator classes +//===----------------------------------------------------------------------===// + +binary_operator::binary_operator(op_t op, value *lhs, value *rhs, type *ty, const std::string &name, instruction *next) + : instruction(ty, 2, next), op_(op){ + set_operand(0, lhs); + set_operand(1, rhs); +} + +binary_operator *binary_operator::create(op_t op, value *lhs, value *rhs, const std::string &name, instruction *next){ + assert(lhs->get_type() == rhs->get_type() && + "Cannot create binary operator with two operands of differing type!"); + return new binary_operator(op, lhs, rhs, lhs->get_type(), name, next); +} + +binary_operator *binary_operator::create_fneg(value *arg, const std::string &name, instruction *next){ + assert(arg->get_type()->is_floating_point_ty()); + value *zero = constant_fp::get_zero_value_for_negation(arg->get_type()); + return binary_operator::create(llvm::Instruction::FSub, zero, arg, name, next); +} + +binary_operator *binary_operator::create_neg(value *arg, const std::string &name, instruction *next){ + assert(arg->get_type()->is_integer_ty()); + value *zero = constant_fp::get_zero_value_for_negation(arg->get_type()); + return binary_operator::create(llvm::Instruction::Sub, zero, arg, name, next); +} + +binary_operator *binary_operator::create_not(value *arg, const std::string &name, instruction *next){ + assert(arg->get_type()->is_integer_ty()); + constant *mask = constant::get_all_ones_value(arg->get_type()); + return binary_operator::create(llvm::Instruction::Xor, arg, mask, name, next); +} + +//===----------------------------------------------------------------------===// +// cmp_inst 
classes +//===----------------------------------------------------------------------===// + +bool cmp_inst::is_fp_predicate(pred_t pred) { + return pred >= pcmp::FIRST_FCMP_PREDICATE && pred <= pcmp::LAST_FCMP_PREDICATE; +} + +bool cmp_inst::is_int_predicate(pred_t pred) { + return pred >= pcmp::FIRST_ICMP_PREDICATE && pred <= pcmp::LAST_ICMP_PREDICATE; +} + +// icmp_inst + +icmp_inst* icmp_inst::create(pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next){ + assert(is_int_predicate(pred)); + return new icmp_inst(pred, lhs, rhs, name, next); +} + +// fcmp_inst + +fcmp_inst* fcmp_inst::create(pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next){ + assert(is_fp_predicate(pred)); + return new fcmp_inst(pred, lhs, rhs, name, next); +} } } diff --git a/lib/ir/module.cpp b/lib/ir/module.cpp index 3c6739213..c1979881b 100644 --- a/lib/ir/module.cpp +++ b/lib/ir/module.cpp @@ -27,11 +27,11 @@ void module::set_value(const std::string& name, ir::value *value){ } ir::phi_node* module::make_phi(ir::type *ty, unsigned num_values, ir::basic_block *block){ - ir::instruction* instr = block->get_first_non_phi_or_dbg(); - if(instr) - builder_.set_insert_point(instr); + basic_block::iterator insert = block->get_first_non_phi(); + if(insert == block->end()) + builder_.set_insert_point(insert); ir::phi_node *res = builder_.create_phi(ty, num_values); - if(instr) + if(insert == block->end()) builder_.set_insert_point(block); return res; } From 22a83ab526a2545af94f3d4402601ed072666b3f Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 2 Jan 2019 19:29:59 -0500 Subject: [PATCH 025/494] [intermediate representation] added some instruction definitions --- include/ir/basic_block.h | 9 +- include/ir/context.h | 7 ++ include/ir/instructions.h | 99 +++++++++++++++---- include/ir/type.h | 34 ++++++- include/ir/value.h | 3 + lib/ir/instructions.cpp | 198 ++++++++++++++++++++++++++++++++++++-- 6 files changed, 314 insertions(+), 36 deletions(-) diff --git a/include/ir/basic_block.h b/include/ir/basic_block.h index 240123796..3a4989f27 100644 --- a/include/ir/basic_block.h +++ b/include/ir/basic_block.h @@ -24,8 +24,9 @@ public: public: - // parent + // accessors function* get_parent() { return parent_; } + context& get_context() { return ctx_; } // get iterator to first instruction that is not a phi iterator get_first_non_phi(); @@ -51,10 +52,8 @@ public: inline const instruction &back() const { return *inst_list_.back(); } inline instruction &back() { return *inst_list_.back(); } - // get predecessors - const std::vector& get_predecessors() const; - - // add predecessor + // predecessors + const std::vector& get_predecessors() const { return preds_; } void add_predecessor(basic_block* pred); // factory functions diff --git a/include/ir/context.h b/include/ir/context.h index 8b80f7491..35907ede1 100644 --- a/include/ir/context.h +++ b/include/ir/context.h @@ -4,8 +4,15 @@ namespace tdl{ namespace ir{ +class type; + /* Context */ class context { +public: + type *get_void_ty(); + type *get_int1_ty(); + +private: }; } diff --git a/include/ir/instructions.h b/include/ir/instructions.h index 95c26de1d..aee0aa1d0 100644 --- a/include/ir/instructions.h +++ b/include/ir/instructions.h @@ -9,6 +9,7 @@ namespace tdl{ namespace ir{ class basic_block; +class context; //===----------------------------------------------------------------------===// // instruction classes @@ -17,7 +18,7 @@ class basic_block; class instruction: public user{ protected: // constructors - 
instruction(type *ty, unsigned num_ops, instruction *next = nullptr); + instruction(type *ty, unsigned num_ops, const std::string &name = "", instruction *next = nullptr); public: @@ -38,13 +39,17 @@ private: phi_node(type *ty, unsigned num_reserved); public: - void add_incoming(value *x, basic_block *bb); + void set_incoming_value(unsigned i, value *v); + void set_incoming_block(unsigned i, basic_block *block); + + void add_incoming(value *v, basic_block *block); // Factory methods static phi_node* create(type *ty, unsigned num_reserved); private: unsigned num_reserved_; + std::vector blocks_; }; //===----------------------------------------------------------------------===// @@ -84,11 +89,10 @@ public: typedef llvm::CmpInst::Predicate pred_t; using pcmp = llvm::CmpInst; -private: - type* make_cmp_result_type(type *ty); - protected: - cmp_inst(pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next); + cmp_inst(type *ty, pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next); + + static type* make_cmp_result_type(type *ty); static bool is_fp_predicate(pred_t pred); static bool is_int_predicate(pred_t pred); @@ -116,17 +120,29 @@ public: const std::string &name = "", instruction *next = nullptr); }; +//===----------------------------------------------------------------------===// +// unary_inst classes +//===----------------------------------------------------------------------===// + +class unary_inst: public instruction { +protected: + unary_inst(type *Ty, value *v, const std::string &name, instruction *next); +}; + + //===----------------------------------------------------------------------===// // cast_inst classes //===----------------------------------------------------------------------===// -class cast_inst: public instruction{ +class cast_inst: public unary_inst{ + using unary_inst::unary_inst; + using ic = llvm::Instruction::CastOps; + public: typedef llvm::CastInst::CastOps op_t; -protected: - // Constructors - cast_inst(op_t op, value *arg, type *ty, const std::string &name, instruction *next); +private: + bool is_valid(op_t op, value *arg, type *ty); public: // Factory methods @@ -135,33 +151,67 @@ public: static cast_inst *create_integer_cast(value *arg, type *ty, bool is_signed, const std::string &name = "", instruction *next = nullptr); - private: op_t op_; }; +#define TDL_IR_DECLARE_CAST_INST_SIMPLE(name) \ + class name : public cast_inst{ \ + friend class cast_inst; \ + using cast_inst::cast_inst; \ + }; + +TDL_IR_DECLARE_CAST_INST_SIMPLE(trunc_inst) +TDL_IR_DECLARE_CAST_INST_SIMPLE(z_ext_inst) +TDL_IR_DECLARE_CAST_INST_SIMPLE(s_ext_inst) +TDL_IR_DECLARE_CAST_INST_SIMPLE(fp_trunc_inst) +TDL_IR_DECLARE_CAST_INST_SIMPLE(fp_ext_inst) +TDL_IR_DECLARE_CAST_INST_SIMPLE(ui_to_fp_inst) +TDL_IR_DECLARE_CAST_INST_SIMPLE(si_to_fp_inst) +TDL_IR_DECLARE_CAST_INST_SIMPLE(fp_to_ui_inst) +TDL_IR_DECLARE_CAST_INST_SIMPLE(fp_to_si_inst) +TDL_IR_DECLARE_CAST_INST_SIMPLE(ptr_to_int_inst) +TDL_IR_DECLARE_CAST_INST_SIMPLE(int_to_ptr_inst) +TDL_IR_DECLARE_CAST_INST_SIMPLE(bit_cast_inst) +TDL_IR_DECLARE_CAST_INST_SIMPLE(addr_space_cast_inst) + //===----------------------------------------------------------------------===// // terminator_inst classes //===----------------------------------------------------------------------===// class terminator_inst: public instruction{ -public: + using instruction::instruction; }; -class return_inst: public instruction{ +// return instruction +class return_inst: public terminator_inst{ + return_inst(context 
&ctx, value *ret_val, instruction *next); + +public: + // accessors + value *get_return_value() + { return get_num_operands() ? get_operand(0) : nullptr; } + + unsigned get_num_successors() const { return 0; } + + // factory methods + static return_inst* create(context &ctx, value *ret_val = nullptr, instruction *next = nullptr); }; -//===----------------------------------------------------------------------===// -// branch_inst classes -//===----------------------------------------------------------------------===// +// conditional/unconditional branch instruction + +class branch_inst: public terminator_inst{ + branch_inst(basic_block *dst, instruction *next); + branch_inst(basic_block *if_dst, basic_block *else_dst, value *cond, instruction *next); -class branch_inst: public instruction{ public: + + // factory methods static branch_inst* create(basic_block *dest, - const std::string &name = "", instruction *next = nullptr); + instruction *next = nullptr); static branch_inst* create(value *cond, basic_block *if_dest, basic_block *else_dest, - const std::string &name = "", instruction *next = nullptr); + instruction *next = nullptr); }; //===----------------------------------------------------------------------===// @@ -169,9 +219,20 @@ public: //===----------------------------------------------------------------------===// class getelementptr_inst: public instruction{ + getelementptr_inst(type *pointee_ty, value *ptr, const std::vector &idx, const std::string &name, instruction *next); + +private: + static type *get_return_type(type *ty, value *ptr, const std::vector &idx); + static type *get_indexed_type_impl(type *ty, const std::vector &idx); + static type *get_indexed_type(type *ty, const std::vector &idx); + public: static getelementptr_inst* create(value *ptr, const std::vector &idx, const std::string &name = "", instruction *next = nullptr); + +private: + type *source_elt_ty; + type *res_elt_ty; }; diff --git a/include/ir/type.h b/include/ir/type.h index 6a50690ed..874bffcdd 100644 --- a/include/ir/type.h +++ b/include/ir/type.h @@ -7,24 +7,40 @@ namespace tdl{ namespace ir{ class context; +class value; /* Type */ class type { public: - bool is_integer_ty() const; - bool is_pointer_ty() const; - bool is_float_ty() const; - bool is_double_ty() const; - bool is_floating_point_ty() const; + virtual ~type(){} + + // accessors + context &get_context() const; // type attributes unsigned get_fp_mantissa_width() const; unsigned get_integer_bit_width() const; + unsigned get_scalar_bitsize() const; const std::vector &get_tile_shapes() const; + type *get_scalar_ty() const; + unsigned get_pointer_address_space() const; + + // type predicates + bool is_int_or_tileint_ty(); + bool is_integer_ty() const; + bool is_integer_ty(unsigned width) const; + bool is_pointer_ty() const; + bool is_float_ty() const; + bool is_double_ty() const; + bool is_floating_point_ty() const; + bool is_sized() const; + bool is_tile_ty() const; + // Factory methods static type* get_void_ty(context &ctx); static type* get_float_ty(context &ctx); static type* get_double_ty(context &ctx); + }; class integer_type: public type { @@ -32,14 +48,22 @@ public: static integer_type* get(context &ctx, unsigned width); }; +class composite_type: public type{ +public: + bool index_valid(value *idx) const; + type* get_type_at_index(value *idx) const; +}; + class tile_type: public type { public: static tile_type* get(type *ty, const std::vector &shapes); + static tile_type* get_same_shapes(type *ty, type *ref); }; class pointer_type: 
public type { public: static pointer_type* get(type *ty, unsigned address_space); + type *get_element_ty() const; }; class function_type: public type { diff --git a/include/ir/value.h b/include/ir/value.h index b7a017200..effa44014 100644 --- a/include/ir/value.h +++ b/include/ir/value.h @@ -55,6 +55,9 @@ private: //===----------------------------------------------------------------------===// class user: public value{ +protected: + void resize_ops(unsigned n) { ops_.resize(n); } + public: // Constructor user(type *ty, unsigned num_ops, const std::string &name = "") diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 383085c10..b72341d72 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -1,6 +1,8 @@ +#include "ir/context.h" #include "ir/basic_block.h" #include "ir/instructions.h" #include "ir/constant.h" +#include "ir/type.h" namespace tdl{ namespace ir{ @@ -9,8 +11,8 @@ namespace ir{ // instruction classes //===----------------------------------------------------------------------===// -instruction::instruction(type *ty, unsigned num_ops, instruction *next) - : user(ty, num_ops) { +instruction::instruction(type *ty, unsigned num_ops, const std::string &name, instruction *next) + : user(ty, num_ops, name) { if(next){ basic_block *block = next->get_parent(); assert(block && "Next instruction is not in a basic block!"); @@ -23,9 +25,29 @@ instruction::instruction(type *ty, unsigned num_ops, instruction *next) // phi_node classes //===----------------------------------------------------------------------===// -// Add incoming -void phi_node::add_incoming(value *x, basic_block *bb){ +// Set incoming value +void phi_node::set_incoming_value(unsigned i, value *v){ + assert(v && "PHI node got a null value!"); + assert(get_type() == v->get_type() && + "All operands to PHI node must be the same type as the PHI node!"); + set_operand(i, v); +} +// Set incoming block +void phi_node::set_incoming_block(unsigned i, basic_block *block){ + assert(block && "PHI node got a null basic block!"); + blocks_[i] = block; +} + +// Add incoming +void phi_node::add_incoming(value *v, basic_block *block){ + if(get_num_operands()==num_reserved_){ + num_reserved_++; + resize_ops(num_reserved_); + blocks_.resize(num_reserved_); + } + set_incoming_value(get_num_operands() - 1, v); + set_incoming_block(get_num_operands() - 1, block); } // Factory methods @@ -39,7 +61,7 @@ phi_node* phi_node::create(type *ty, unsigned num_reserved){ //===----------------------------------------------------------------------===// binary_operator::binary_operator(op_t op, value *lhs, value *rhs, type *ty, const std::string &name, instruction *next) - : instruction(ty, 2, next), op_(op){ + : instruction(ty, 2, name, next), op_(op){ set_operand(0, lhs); set_operand(1, rhs); } @@ -72,6 +94,24 @@ binary_operator *binary_operator::create_not(value *arg, const std::string &name // cmp_inst classes //===----------------------------------------------------------------------===// +// cmp_inst + +cmp_inst::cmp_inst(type *ty, cmp_inst::pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next) + : instruction(ty, 2, name, next), pred_(pred) { + set_operand(0, lhs); + set_operand(1, rhs); +} + +type* cmp_inst::make_cmp_result_type(type *ty){ + type* int1_ty = ty->get_context().get_int1_ty(); + if (tile_type* tile_ty = dynamic_cast(ty)) + return tile_type::get_same_shapes(int1_ty, tile_ty); + return int1_ty; +} + + + + bool cmp_inst::is_fp_predicate(pred_t pred) { return pred >= 
pcmp::FIRST_FCMP_PREDICATE && pred <= pcmp::LAST_FCMP_PREDICATE; } @@ -84,15 +124,159 @@ bool cmp_inst::is_int_predicate(pred_t pred) { icmp_inst* icmp_inst::create(pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next){ assert(is_int_predicate(pred)); - return new icmp_inst(pred, lhs, rhs, name, next); + type *res_ty = make_cmp_result_type(lhs->get_type()); + return new icmp_inst(res_ty, pred, lhs, rhs, name, next); } // fcmp_inst fcmp_inst* fcmp_inst::create(pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next){ assert(is_fp_predicate(pred)); - return new fcmp_inst(pred, lhs, rhs, name, next); + type *res_ty = make_cmp_result_type(lhs->get_type()); + return new fcmp_inst(res_ty, pred, lhs, rhs, name, next); } +//===----------------------------------------------------------------------===// +// unary_inst classes +//===----------------------------------------------------------------------===// + +unary_inst::unary_inst(type *ty, value *v, const std::string &name, instruction *next) + : instruction(ty, 1, name, next) { + set_operand(0, v); +} + +//===----------------------------------------------------------------------===// +// cast_inst classes +//===----------------------------------------------------------------------===// + +cast_inst *cast_inst::create(op_t op, value *arg, type *ty, const std::string &name, instruction *next){ + assert(is_valid(op, arg, ty) && "Invalid cast!"); + // Construct and return the appropriate CastInst subclass + switch (op) { + case ic::Trunc: return new trunc_inst (ty, arg, name, next); + case ic::ZExt: return new z_ext_inst (ty, arg, name, next); + case ic::SExt: return new s_ext_inst (ty, arg, name, next); + case ic::FPTrunc: return new fp_trunc_inst (ty, arg, name, next); + case ic::FPExt: return new fp_ext_inst (ty, arg, name, next); + case ic::UIToFP: return new ui_to_fp_inst (ty, arg, name, next); + case ic::SIToFP: return new si_to_fp_inst (ty, arg, name, next); + case ic::FPToUI: return new fp_to_ui_inst (ty, arg, name, next); + case ic::FPToSI: return new fp_to_si_inst (ty, arg, name, next); + case ic::PtrToInt: return new ptr_to_int_inst (ty, arg, name, next); + case ic::IntToPtr: return new int_to_ptr_inst (ty, arg, name, next); + case ic::BitCast: return new bit_cast_inst (ty, arg, name, next); + case ic::AddrSpaceCast: return new addr_space_cast_inst (ty, arg, name, next); + default: throw std::runtime_error("unreachable"); + } +} + +cast_inst *cast_inst::create_integer_cast(value *arg, type *ty, bool is_signed, const std::string &name, instruction *next){ + type *arg_ty = arg->get_type(); + assert(arg_ty->is_int_or_tileint_ty() && ty->is_int_or_tileint_ty() && "Invalid integer cast!"); + unsigned arg_bits = arg_ty->get_scalar_bitsize(); + unsigned dst_bits = ty->get_scalar_bitsize(); + op_t op = (arg_bits == dst_bits ? ic::BitCast : + (arg_bits > dst_bits ? ic::Trunc : + (is_signed ? 
ic::SExt : ic::ZExt))); + return create(op, arg, ty, name, next); +} + +//===----------------------------------------------------------------------===// +// terminator_inst classes +//===----------------------------------------------------------------------===// + + +// return_inst + +return_inst::return_inst(context &ctx, value *ret_val, instruction *next) + : terminator_inst(ctx.get_void_ty(), !!ret_val, "", next){ + if(ret_val) + set_operand(0, ret_val); +} + +return_inst *return_inst::create(context &ctx, value *ret_val, instruction *next){ + return new return_inst(ctx, ret_val, next); +} + + +// conditional/unconditional branch + +branch_inst::branch_inst(basic_block *dst, instruction *next) + : terminator_inst(dst->get_context().get_void_ty(), 1, "", next){ + set_operand(0, dst); +} + +branch_inst::branch_inst(basic_block *if_dst, basic_block *else_dst, value *cond, instruction *next) + : terminator_inst(if_dst->get_context().get_void_ty(), 3, "", next){ + assert(cond->get_type()->is_integer_ty(1) && "May only branch on boolean predicates!"); + set_operand(0, if_dst); + set_operand(1, else_dst); + set_operand(2, cond); +} + +branch_inst* branch_inst::create(basic_block *dst, instruction *next) { + assert(dst && "Branch destination may not be null!"); + return new branch_inst(dst, next); +} + +branch_inst* branch_inst::create(value *cond, basic_block *if_dst, basic_block *else_dst, instruction *next) { + assert(cond->get_type()->is_integer_ty(1) && "May only branch on boolean predicates!"); + return new branch_inst(if_dst, else_dst, cond, next); +} + + +//===----------------------------------------------------------------------===// +// getelementptr_inst classes +//===----------------------------------------------------------------------===// + +getelementptr_inst::getelementptr_inst(type *pointee_ty, value *ptr, const std::vector &idx, const std::string &name, instruction *next) + : instruction(get_return_type(pointee_ty, ptr, idx), idx.size(), name, next), + source_elt_ty(pointee_ty), + res_elt_ty(get_indexed_type(pointee_ty, idx)){ + type *expected_ty = ((pointer_type*)(get_type()->get_scalar_ty()))->get_element_ty(); + assert(res_elt_ty == expected_ty); + set_operand(0, ptr); + for(size_t i = 0; i < idx.size(); i++) + set_operand(1 + i, idx[i]); +} + +type *getelementptr_inst::get_return_type(type *elt_ty, value *ptr, const std::vector &idx_list) { + // result pointer type + type *ptr_ty = pointer_type::get(get_indexed_type(elt_ty, idx_list), ptr->get_type()->get_pointer_address_space()); + // Tile GEP + if(ptr->get_type()->is_tile_ty()) + return tile_type::get_same_shapes(ptr_ty, ptr->get_type()); + for(value *idx : idx_list) + if (idx->get_type()->is_tile_ty()) + return tile_type::get_same_shapes(ptr_ty, idx->get_type()); + // Scalar GEP + return ptr_ty; +} + +type *getelementptr_inst::get_indexed_type_impl(type *ty, const std::vector &idx_list) { + if(idx_list.empty()) + return ty; + if(!ty->is_sized()) + return nullptr; + unsigned cur_idx = 1; + for(; cur_idx != idx_list.size(); cur_idx++){ + composite_type *cty = dynamic_cast(ty); + if(!cty || cty->is_pointer_ty()) + break; + value *idx = idx_list[cur_idx]; + if(!cty->index_valid(idx)) + break; + ty = cty->get_type_at_index(idx); + } + return (cur_idx == idx_list.size())? 
ty : nullptr; +} + +type *getelementptr_inst::get_indexed_type(type *ty, const std::vector &idx_list) { + type *result = get_indexed_type_impl(ty, idx_list); + assert(result && "invalid GEP type!"); + return result; +} + + } } From b039498d1544d8433a32635b400d8a1c1e804024 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 3 Jan 2019 00:42:37 -0500 Subject: [PATCH 026/494] [intermediate representation] added subdefinitions in types submodule --- include/ir/context.h | 10 ++- include/ir/instructions.h | 25 +++++- include/ir/type.h | 117 ++++++++++++++++++++++++---- include/ir/value.h | 2 +- lib/codegen.cpp | 14 ++-- lib/ir/context.cpp | 22 ++++++ lib/ir/instructions.cpp | 19 +++-- lib/ir/type.cpp | 156 ++++++++++++++++++++++++++++++++++++++ 8 files changed, 328 insertions(+), 37 deletions(-) diff --git a/include/ir/context.h b/include/ir/context.h index 35907ede1..c7382a0cb 100644 --- a/include/ir/context.h +++ b/include/ir/context.h @@ -1,18 +1,22 @@ #ifndef TDL_INCLUDE_IR_CONTEXT_H #define TDL_INCLUDE_IR_CONTEXT_H +#include +#include "ir/type.h" + namespace tdl{ namespace ir{ class type; +class context_impl; /* Context */ class context { public: - type *get_void_ty(); - type *get_int1_ty(); + context(); -private: +public: + std::shared_ptr p_impl; }; } diff --git a/include/ir/instructions.h b/include/ir/instructions.h index aee0aa1d0..cb673d73e 100644 --- a/include/ir/instructions.h +++ b/include/ir/instructions.h @@ -36,7 +36,7 @@ private: class phi_node: public instruction{ private: - phi_node(type *ty, unsigned num_reserved); + phi_node(type *ty, unsigned num_reserved, const std::string &name, instruction *next); public: void set_incoming_value(unsigned i, value *v); @@ -45,7 +45,7 @@ public: void add_incoming(value *v, basic_block *block); // Factory methods - static phi_node* create(type *ty, unsigned num_reserved); + static phi_node* create(type *ty, unsigned num_reserved, const std::string &name = "", instruction *next = nullptr); private: unsigned num_reserved_; @@ -235,6 +235,27 @@ private: type *res_elt_ty; }; +//===----------------------------------------------------------------------===// +// retile_inst classes +//===----------------------------------------------------------------------===// + +class retile_inst: public instruction{ + +}; + +class reshape_inst: public instruction{ + +}; + +class splat_inst: public instruction{ + +}; + +class broadcast_inst: public instruction{ + +}; + + } } diff --git a/include/ir/type.h b/include/ir/type.h index 874bffcdd..43b6d1c34 100644 --- a/include/ir/type.h +++ b/include/ir/type.h @@ -8,62 +8,147 @@ namespace ir{ class context; class value; +class integer_type; /* Type */ class type { public: + enum id_t { + // primitive types + VoidTyID = 0, ///< 0: type with no size + HalfTyID, ///< 1: 16-bit floating point type + FloatTyID, ///< 2: 32-bit floating point type + DoubleTyID, ///< 3: 64-bit floating point type + LabelTyID, ///< 4: Labels + MetadataTyID, ///< 5: Metadata + TokenTyID, ///< 6: Token + // derived types + IntegerTyID, ///< 7: Arbitrary bit width integers + FunctionTyID, ///< 8: Functions + PointerTyID, ///< 9: Pointers + TileTyID, ///< 10: Tile + }; + +public: + //constructors + type(context &ctx, id_t id) : ctx_(ctx), id_(id) {} + + //destructor virtual ~type(){} // accessors - context &get_context() const; + context &get_context() const { return ctx_; } // type attributes unsigned get_fp_mantissa_width() const; - unsigned get_integer_bit_width() const; - unsigned get_scalar_bitsize() const; - const std::vector 
&get_tile_shapes() const; + unsigned get_integer_bitwidth() const; type *get_scalar_ty() const; + const std::vector &get_tile_shapes() const; + type *get_tile_element_ty() const; unsigned get_pointer_address_space() const; - // type predicates + // primitive predicates + bool is_void_ty() const { return id_ == VoidTyID; } + bool is_half_ty() const { return id_ == HalfTyID; } + bool is_float_ty() const { return id_ == FloatTyID; } + bool is_double_ty() const { return id_ == DoubleTyID; } + bool is_label_ty() const { return id_ == LabelTyID;} + bool is_metadata_ty() const { return id_ == MetadataTyID; } + bool is_token_ty() const { return id_ == TokenTyID; } + bool is_integer_ty() const { return id_ == IntegerTyID; } + bool is_pointer_ty() const { return id_ == PointerTyID; } + bool is_tile_ty() const { return id_ == TileTyID; } + + // Composite predicates bool is_int_or_tileint_ty(); - bool is_integer_ty() const; bool is_integer_ty(unsigned width) const; - bool is_pointer_ty() const; - bool is_float_ty() const; - bool is_double_ty() const; bool is_floating_point_ty() const; - bool is_sized() const; - bool is_tile_ty() const; + bool is_sized() const ; // Factory methods - static type* get_void_ty(context &ctx); - static type* get_float_ty(context &ctx); - static type* get_double_ty(context &ctx); + // primitive types + static type *get_void_ty(context &ctx); + static type *get_label_ty(context &ctx); + // half + static type *get_half_ty(context &ctx); + static type *get_float_ty(context &ctx); + static type *get_double_ty(context &ctx); + // integer types + static integer_type *get_int1_ty(context &ctx); + static integer_type *get_int8_ty(context &ctx); + static integer_type *get_int16_ty(context &ctx); + static integer_type *get_int32_ty(context &ctx); + static integer_type *get_int64_ty(context &ctx); + static integer_type *get_int128_ty(context &ctx); +private: + context &ctx_; + id_t id_; + +protected: + std::vector contained_tys_; }; class integer_type: public type { + friend class context_impl; + +private: + // constructors + integer_type(context &ctx, unsigned bitwidth) + : type(ctx, IntegerTyID), bitwidth_(bitwidth){ } + public: + // accessors + unsigned get_bitwidth() const { return bitwidth_; } + + // factory methods static integer_type* get(context &ctx, unsigned width); + +private: + unsigned bitwidth_; }; class composite_type: public type{ +protected: + using type::type; + public: bool index_valid(value *idx) const; type* get_type_at_index(value *idx) const; }; -class tile_type: public type { +class tile_type: public composite_type { +private: + tile_type(type *ty, const std::vector &shapes); + static bool is_valid_elt_ty(type *ty); + public: + // accessors + const std::vector& get_shapes() const { return shapes_; } + + // factory methods static tile_type* get(type *ty, const std::vector &shapes); static tile_type* get_same_shapes(type *ty, type *ref); + +private: + std::vector shapes_; }; class pointer_type: public type { +private: + pointer_type(type *ty, unsigned address_space); + static bool is_valid_elt_ty(type *ty); + public: + // accessors + unsigned get_address_space() const { return address_space_; } + type *get_element_ty() const { return contained_tys_[0]; } + + // factory methods static pointer_type* get(type *ty, unsigned address_space); - type *get_element_ty() const; + +private: + unsigned address_space_; }; class function_type: public type { diff --git a/include/ir/value.h b/include/ir/value.h index effa44014..1b26391f3 100644 --- a/include/ir/value.h +++ 
b/include/ir/value.h @@ -23,7 +23,7 @@ public: void add_use(use *arg); // name void set_name(const std::string &name); - type* get_type() { return ty_; } + type* get_type() const { return ty_; } private: type *ty_; diff --git a/lib/codegen.cpp b/lib/codegen.cpp index 1ef5df769..9f8ad8420 100644 --- a/lib/codegen.cpp +++ b/lib/codegen.cpp @@ -25,10 +25,10 @@ ir::type* declaration_specifier::type(ir::module *mod) const { ir::context &ctx = mod->get_context(); switch (spec_) { case VOID_T: return ir::type::get_void_ty(ctx); - case INT8_T: return ir::integer_type::get(ctx, 8); - case INT16_T: return ir::integer_type::get(ctx, 16); - case INT32_T: return ir::integer_type::get(ctx, 32); - case INT64_T: return ir::integer_type::get(ctx, 64); + case INT8_T: return ir::type::get_int8_ty(ctx); + case INT16_T: return ir::type::get_int16_ty(ctx); + case INT32_T: return ir::type::get_int32_ty(ctx); + case INT64_T: return ir::type::get_int64_ty(ctx); case FLOAT32_T: return ir::type::get_float_ty(ctx); case FLOAT64_T: return ir::type::get_double_ty(ctx); default: throw std::runtime_error("unreachable"); @@ -227,7 +227,7 @@ ir::value *llvm_cast(ir::builder &builder, ir::value *src, ir::type *dst_ty){ return builder.create_fp_trunc(src, dst_ty); else if(src_ty->is_integer_ty() && dst_ty->is_integer_ty() && - src_ty->get_integer_bit_width()) + src_ty->get_integer_bitwidth()) return builder.create_int_cast(src, dst_ty, dst_signed); else @@ -259,8 +259,8 @@ inline void implicit_cast(ir::builder &builder, ir::value *&lhs, ir::value *&rhs else if(left_ty->is_integer_ty() && right_ty->is_integer_ty()){ is_int = true; is_signed = false; - if(left_ty->get_integer_bit_width() != right_ty->get_integer_bit_width()){ - ir::value *&to_convert = (left_ty->get_integer_bit_width() > right_ty->get_integer_bit_width())?rhs:lhs; + if(left_ty->get_integer_bitwidth() != right_ty->get_integer_bitwidth()){ + ir::value *&to_convert = (left_ty->get_integer_bitwidth() > right_ty->get_integer_bitwidth())?rhs:lhs; ir::type *dst_ty = (to_convert==lhs)?right_ty:left_ty; to_convert = llvm_cast(builder, to_convert, dst_ty); } diff --git a/lib/ir/context.cpp b/lib/ir/context.cpp index 8357b0ab1..56b64b4a3 100644 --- a/lib/ir/context.cpp +++ b/lib/ir/context.cpp @@ -1,7 +1,29 @@ +#include "ir/context_impl.h" #include "ir/context.h" +#include "ir/type.h" namespace tdl{ namespace ir{ +//===----------------------------------------------------------------------===// +// context implementation +//===----------------------------------------------------------------------===// + +context_impl::context_impl(context &ctx) + : void_ty(ctx, type::VoidTyID), + label_ty(ctx, type::LabelTyID), + half_ty(ctx, type::HalfTyID), + float_ty(ctx, type::FloatTyID), + double_ty(ctx, type::DoubleTyID), + int1_ty(ctx, 1), + int8_ty(ctx, 8), + int16_ty(ctx, 16), + int32_ty(ctx, 32), + int64_ty(ctx, 64), + int128_ty(ctx, 128) +{ + +} + } } diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index b72341d72..a42dc0c4b 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -25,6 +25,9 @@ instruction::instruction(type *ty, unsigned num_ops, const std::string &name, in // phi_node classes //===----------------------------------------------------------------------===// +phi_node::phi_node(type *ty, unsigned num_reserved, std::string const &name, instruction *next) + : instruction(ty, num_reserved, name, next){ } + // Set incoming value void phi_node::set_incoming_value(unsigned i, value *v){ assert(v && "PHI node got a null value!"); @@ -51,8 
+54,8 @@ void phi_node::add_incoming(value *v, basic_block *block){ } // Factory methods -phi_node* phi_node::create(type *ty, unsigned num_reserved){ - return new phi_node(ty, num_reserved); +phi_node* phi_node::create(type *ty, unsigned num_reserved, const std::string &name, instruction *next){ + return new phi_node(ty, num_reserved, name, next); } @@ -103,7 +106,7 @@ cmp_inst::cmp_inst(type *ty, cmp_inst::pred_t pred, value *lhs, value *rhs, cons } type* cmp_inst::make_cmp_result_type(type *ty){ - type* int1_ty = ty->get_context().get_int1_ty(); + type* int1_ty = type::get_int1_ty(ty->get_context()); if (tile_type* tile_ty = dynamic_cast(ty)) return tile_type::get_same_shapes(int1_ty, tile_ty); return int1_ty; @@ -173,8 +176,8 @@ cast_inst *cast_inst::create(op_t op, value *arg, type *ty, const std::string &n cast_inst *cast_inst::create_integer_cast(value *arg, type *ty, bool is_signed, const std::string &name, instruction *next){ type *arg_ty = arg->get_type(); assert(arg_ty->is_int_or_tileint_ty() && ty->is_int_or_tileint_ty() && "Invalid integer cast!"); - unsigned arg_bits = arg_ty->get_scalar_bitsize(); - unsigned dst_bits = ty->get_scalar_bitsize(); + unsigned arg_bits = arg_ty->get_integer_bitwidth(); + unsigned dst_bits = ty->get_integer_bitwidth(); op_t op = (arg_bits == dst_bits ? ic::BitCast : (arg_bits > dst_bits ? ic::Trunc : (is_signed ? ic::SExt : ic::ZExt))); @@ -189,7 +192,7 @@ cast_inst *cast_inst::create_integer_cast(value *arg, type *ty, bool is_signed, // return_inst return_inst::return_inst(context &ctx, value *ret_val, instruction *next) - : terminator_inst(ctx.get_void_ty(), !!ret_val, "", next){ + : terminator_inst(type::get_void_ty(ctx), !!ret_val, "", next){ if(ret_val) set_operand(0, ret_val); } @@ -202,12 +205,12 @@ return_inst *return_inst::create(context &ctx, value *ret_val, instruction *next // conditional/unconditional branch branch_inst::branch_inst(basic_block *dst, instruction *next) - : terminator_inst(dst->get_context().get_void_ty(), 1, "", next){ + : terminator_inst(type::get_void_ty(dst->get_context()), 1, "", next){ set_operand(0, dst); } branch_inst::branch_inst(basic_block *if_dst, basic_block *else_dst, value *cond, instruction *next) - : terminator_inst(if_dst->get_context().get_void_ty(), 3, "", next){ + : terminator_inst(type::get_void_ty(if_dst->get_context()), 3, "", next){ assert(cond->get_type()->is_integer_ty(1) && "May only branch on boolean predicates!"); set_operand(0, if_dst); set_operand(1, else_dst); diff --git a/lib/ir/type.cpp b/lib/ir/type.cpp index e69de29bb..bd49100d1 100644 --- a/lib/ir/type.cpp +++ b/lib/ir/type.cpp @@ -0,0 +1,156 @@ +#include +#include "ir/type.h" +#include "ir/context.h" +#include "ir/context_impl.h" +#include "ir/value.h" + +namespace tdl{ +namespace ir{ + +//===----------------------------------------------------------------------===// +// type class +//===----------------------------------------------------------------------===// + +// attributes +type *type::get_scalar_ty() const { + if(is_tile_ty()) + return get_tile_element_ty(); + return const_cast(this); +} + +unsigned type::get_integer_bitwidth() const +{ return ((integer_type*)(this))->get_bitwidth(); } + +unsigned type::get_fp_mantissa_width() const { + id_t id = get_scalar_ty()->id_; + assert(is_floating_point_ty() && "Not a floating point type!"); + if (id == HalfTyID) return 11; + if (id == FloatTyID) return 24; + if (id == DoubleTyID) return 53; + throw std::runtime_error("unreachable"); +} + +type* type::get_tile_element_ty() const { + 
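+  // derived types keep their constituent types in contained_tys_; a tile
+  // stores its element type at index 0, exactly like pointer_type stores
+  // its pointee (see get_element_ty in include/ir/type.h)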
assert(is_tile_ty());
+  return contained_tys_[0];
+}
+
+unsigned type::get_pointer_address_space() const {
+  assert(is_pointer_ty());
+  return ((pointer_type*)this)->get_address_space();
+}
+
+const std::vector<unsigned> &type::get_tile_shapes() const {
+  assert(is_tile_ty());
+  return ((tile_type*)this)->get_shapes();
+}
+
+
+// composite predicates
+bool type::is_int_or_tileint_ty()
+{ return get_scalar_ty()->is_integer_ty(); }
+
+bool type::is_integer_ty(unsigned width) const
+{ return is_integer_ty() && get_integer_bitwidth() == width; }
+
+
+bool type::is_floating_point_ty() const
+{ return is_half_ty() || is_float_ty() || is_double_ty(); }
+
+bool type::is_sized() const {
+  // primitive types are sized
+  if(is_integer_ty() || is_floating_point_ty() ||
+     is_pointer_ty()){
+    return true;
+  }
+  // tile types are sized iff their element type is
+  if(is_tile_ty())
+    return get_scalar_ty()->is_sized();
+  return false;
+}
+
+// primitive types
+type *type::get_void_ty(context &ctx) { return &ctx.p_impl->void_ty; }
+type *type::get_label_ty(context &ctx) { return &ctx.p_impl->label_ty; }
+// half
+type *type::get_half_ty(context &ctx) { return &ctx.p_impl->half_ty; }
+type *type::get_float_ty(context &ctx) { return &ctx.p_impl->float_ty; }
+type *type::get_double_ty(context &ctx) { return &ctx.p_impl->double_ty; }
+// integer types
+integer_type *type::get_int1_ty(context &ctx) { return &ctx.p_impl->int1_ty; }
+integer_type *type::get_int8_ty(context &ctx) { return &ctx.p_impl->int8_ty; }
+integer_type *type::get_int16_ty(context &ctx) { return &ctx.p_impl->int16_ty; }
+integer_type *type::get_int32_ty(context &ctx) { return &ctx.p_impl->int32_ty; }
+integer_type *type::get_int64_ty(context &ctx) { return &ctx.p_impl->int64_ty; }
+integer_type *type::get_int128_ty(context &ctx) { return &ctx.p_impl->int128_ty; }
+
+
+pointer_type::pointer_type(type *ty, unsigned address_space)
+  : type(ty->get_context(), PointerTyID), address_space_(address_space){
+  contained_tys_.push_back(ty);
+}
+
+bool pointer_type::is_valid_elt_ty(type *ty){
+  return !ty->is_void_ty() && !ty->is_label_ty() &&
+         !ty->is_metadata_ty() && !ty->is_token_ty();
+}
+
+pointer_type* pointer_type::get(type *elt_ty, unsigned address_space){
+  assert(elt_ty && "Can't get a pointer to <null> type!");
+  assert(is_valid_elt_ty(elt_ty) && "Invalid type for pointer element!");
+  // look-up
+  context_impl *impl = elt_ty->get_context().p_impl.get();
+  pointer_type *&entry = impl->ptr_tys[std::make_pair(elt_ty, address_space)];
+  if(!entry)
+    entry = new pointer_type(elt_ty, address_space);
+  return entry;
+}
+
+//===----------------------------------------------------------------------===//
+// composite_type class
+//===----------------------------------------------------------------------===//
+
+type* composite_type::get_type_at_index(value *) const{
+  assert(is_tile_ty());
+  return get_scalar_ty();
+}
+
+bool composite_type::index_valid(value *idx) const{
+  assert(is_tile_ty());
+  return idx->get_type()->is_int_or_tileint_ty();
+}
+
+//===----------------------------------------------------------------------===//
+// tile_type class
+//===----------------------------------------------------------------------===//
+
+tile_type::tile_type(type *ty, const std::vector<unsigned> &shapes)
+  : composite_type(ty->get_context(), TileTyID), shapes_(shapes) {
+  contained_tys_.push_back(ty);
+}
+
+bool tile_type::is_valid_elt_ty(type *ty) {
+  return ty->is_pointer_ty() || ty->is_floating_point_ty() || ty->is_integer_ty();
+}
+
+tile_type* tile_type::get(type *elt_ty, const std::vector<unsigned> &shapes) {
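+  // flyweight look-up: at most one tile_type instance exists per
+  // (element type, shape vector) pair, cached in the owning context's
+  // context_impl, mirroring pointer_type::get above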
+ assert(elt_ty && "Can't get a tile of type!"); + assert(shapes.size() && "Can't create a tile with empty shapes!"); + assert(is_valid_elt_ty(elt_ty) && "Invalid type for pointer element!"); + // look-up + context_impl *impl = elt_ty->get_context().p_impl.get(); + tile_type *&entry = impl->tile_tys[std::make_pair(elt_ty, shapes)]; + if(!entry) + entry = new tile_type(elt_ty, shapes); + return entry; +} + +tile_type* tile_type::get_same_shapes(type *ty, type *ref){ + assert(ref->is_tile_ty()); + return get(ty, ref->get_tile_shapes()); +} + + +} +} From 8dbb5652004f9080e6b86aa5ef36a822cb2df4ca Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 3 Jan 2019 00:55:24 -0500 Subject: [PATCH 027/494] [general] added missing file --- include/ir/context_impl.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 include/ir/context_impl.h diff --git a/include/ir/context_impl.h b/include/ir/context_impl.h new file mode 100644 index 000000000..091a563b6 --- /dev/null +++ b/include/ir/context_impl.h @@ -0,0 +1,32 @@ +#ifndef TDL_INCLUDE_IR_CONTEXT_IMPL_H +#define TDL_INCLUDE_IR_CONTEXT_IMPL_H + +#include +#include +#include "ir/type.h" + +namespace tdl{ +namespace ir{ + +class context; + +/* Context impl */ +class context_impl { +public: + // constructors + context_impl(context &ctx); + +public: + // primitive types + type void_ty, label_ty, half_ty, float_ty, double_ty; + // derived types + integer_type int1_ty, int8_ty, int16_ty, int32_ty, int64_ty, int128_ty; + // Pointer types + std::map, pointer_type*> ptr_tys; + std::map>, tile_type*> tile_tys; +}; + +} +} + +#endif From 8f4aafb4ac68835d6c93e31267e780ec6f67a3fc Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 3 Jan 2019 03:42:10 -0500 Subject: [PATCH 028/494] [intermediate representation] improvements on constants --- include/ir/basic_block.h | 3 ++ include/ir/constant.h | 28 +++++++++--- include/ir/context_impl.h | 9 ++++ include/ir/type.h | 19 ++++---- include/ir/value.h | 1 + lib/codegen.cpp | 5 ++- lib/ir/basic_block.cpp | 29 ++++++++++++ lib/ir/constant.cpp | 93 +++++++++++++++++++++++++++++++++++++++ 8 files changed, 170 insertions(+), 17 deletions(-) diff --git a/include/ir/basic_block.h b/include/ir/basic_block.h index 3a4989f27..f6c7897f6 100644 --- a/include/ir/basic_block.h +++ b/include/ir/basic_block.h @@ -22,6 +22,9 @@ public: typedef inst_list_t::reverse_iterator reverse_iterator; typedef inst_list_t::const_reverse_iterator const_reverse_iterator; +private: + // constructors + basic_block(context &ctx, const std::string &name, function *parent); public: // accessors diff --git a/include/ir/constant.h b/include/ir/constant.h index 40f3b056c..c7c53ae2a 100644 --- a/include/ir/constant.h +++ b/include/ir/constant.h @@ -10,32 +10,46 @@ class type; class context; /* Constant */ -class constant: public value{ +class constant: public user{ +protected: + using user::user; + public: static constant* get_all_ones_value(type *ty); + static constant* get_null_value(type *ty); }; /* Undef value */ class undef_value: public constant{ +private: + undef_value(type *ty); + public: static undef_value* get(type* ty); }; -/* Data array */ -class constant_data_array: public constant{ -public: - static constant_data_array* get_string(context &ctx, const std::string &str); -}; - /* Constant int */ class constant_int: public constant{ + constant_int(type *ty, uint64_t value); +public: + static constant *get(type *ty, uint64_t value); + +private: + uint64_t value_; }; /* constant fp */ class constant_fp: 
public constant{ + constant_fp(context &ctx, double value); + public: + static constant* get_negative_zero(type *ty); static constant* get_zero_value_for_negation(type *ty); + static constant *get(context &ctx, double v); + +private: + double value_; }; diff --git a/include/ir/context_impl.h b/include/ir/context_impl.h index 091a563b6..cb3acc186 100644 --- a/include/ir/context_impl.h +++ b/include/ir/context_impl.h @@ -9,6 +9,9 @@ namespace tdl{ namespace ir{ class context; +class constant_int; +class constant_fp; +class undef_value; /* Context impl */ class context_impl { @@ -24,6 +27,12 @@ public: // Pointer types std::map, pointer_type*> ptr_tys; std::map>, tile_type*> tile_tys; + // Int constants + std::map int_constants_; + // Float constants + std::map fp_constants_; + // undef values + std::map uv_constants_; }; } diff --git a/include/ir/type.h b/include/ir/type.h index 43b6d1c34..8ff710373 100644 --- a/include/ir/type.h +++ b/include/ir/type.h @@ -19,14 +19,17 @@ public: HalfTyID, ///< 1: 16-bit floating point type FloatTyID, ///< 2: 32-bit floating point type DoubleTyID, ///< 3: 64-bit floating point type - LabelTyID, ///< 4: Labels - MetadataTyID, ///< 5: Metadata - TokenTyID, ///< 6: Token + X86_FP80TyID, ///< 4: 80-bit floating point type (X87) + FP128TyID, ///< 5: 128-bit floating point type (112-bit mantissa) + PPC_FP128TyID, ///< 6: 128-bit floating point type (two 64-bits, PowerPC) + LabelTyID, ///< 7: Labels + MetadataTyID, ///< 8: Metadata + TokenTyID, ///< 9: Token // derived types - IntegerTyID, ///< 7: Arbitrary bit width integers - FunctionTyID, ///< 8: Functions - PointerTyID, ///< 9: Pointers - TileTyID, ///< 10: Tile + IntegerTyID, ///< 10: Arbitrary bit width integers + FunctionTyID, ///< 11: Functions + PointerTyID, ///< 12: Pointers + TileTyID, ///< 13: Tile }; public: @@ -38,7 +41,7 @@ public: // accessors context &get_context() const { return ctx_; } - + id_t get_type_id() const { return id_; } // type attributes unsigned get_fp_mantissa_width() const; unsigned get_integer_bitwidth() const; diff --git a/include/ir/value.h b/include/ir/value.h index 1b26391f3..bab034603 100644 --- a/include/ir/value.h +++ b/include/ir/value.h @@ -19,6 +19,7 @@ class value { public: // constructor value(type *ty, const std::string &name = ""); + virtual ~value(){ } // uses void add_use(use *arg); // name diff --git a/lib/codegen.cpp b/lib/codegen.cpp index 9f8ad8420..6d4b7038c 100644 --- a/lib/codegen.cpp +++ b/lib/codegen.cpp @@ -477,8 +477,9 @@ ir::type *type_name::type(ir::module *mod) const{ } /* String literal */ -ir::value* string_literal::codegen(ir::module *mod) const{ - return ir::constant_data_array::get_string(mod->get_context(), value_); +ir::value* string_literal::codegen(ir::module *) const{ + throw std::runtime_error("not supported"); +// return ir::constant_data_array::get_string(mod->get_context(), value_); } /* Constant */ diff --git a/lib/ir/basic_block.cpp b/lib/ir/basic_block.cpp index e69de29bb..359c55d0c 100644 --- a/lib/ir/basic_block.cpp +++ b/lib/ir/basic_block.cpp @@ -0,0 +1,29 @@ +#include "ir/basic_block.h" +#include "ir/instructions.h" +#include "ir/type.h" + +namespace tdl { +namespace ir { + +class phi_node; + +basic_block::basic_block(context &ctx, const std::string &name, function *parent): + value(type::get_label_ty(ctx), name), ctx_(ctx), parent_(parent){ + +} + +basic_block* basic_block::create(context &ctx, const std::string &name, function *parent){ + return new basic_block(ctx, name, parent); +} + +basic_block::iterator 
basic_block::get_first_non_phi(){ + auto it = begin(); + for(; it != end(); it++) + if(!dynamic_cast(*it)) + return it; + return it; +} + +} + +} diff --git a/lib/ir/constant.cpp b/lib/ir/constant.cpp index e69de29bb..f2a3bd7e9 100644 --- a/lib/ir/constant.cpp +++ b/lib/ir/constant.cpp @@ -0,0 +1,93 @@ +#include "ir/constant.h" +#include "ir/type.h" +#include "ir/context.h" +#include "ir/context_impl.h" + +namespace tdl{ +namespace ir{ + + +// constant + +constant *constant::get_null_value(type *ty) { + context &ctx = ty->get_context(); + switch (ty->get_type_id()) { + case type::IntegerTyID: + return constant_int::get(ty, 0); + case type::HalfTyID: + return constant_fp::get(ctx, 0); + case type::FloatTyID: + return constant_fp::get(ctx, 0); + case type::DoubleTyID: + return constant_fp::get(ctx, 0); + case type::X86_FP80TyID: + return constant_fp::get(ctx, 0); + case type::FP128TyID: + return constant_fp::get(ctx, 0); + case type::PPC_FP128TyID: + return constant_fp::get(ctx, 0); + default: + throw std::runtime_error("Cannot create a null constant of that type!"); + } +} + +// FIXME + +constant *constant::get_all_ones_value(type *ty) { + if(ty->is_integer_ty()) + return constant_int::get(ty, 0xFFFFFFFF); + if(ty->is_floating_point_ty()) + return constant_fp::get(ty->get_context(), 0xFFFFFFFF); + throw std::runtime_error("Cannot create all ones value for that type!"); +} + +// constant_int +// FIXME use something like APInt + +constant_int::constant_int(type *ty, uint64_t value) + : constant(ty, 0), value_(value){ } + +constant *constant_int::get(type *ty, uint64_t value) { + return new constant_int(ty, value); +} + +// constant_fp +// FIXME use something like APFloat + +constant_fp::constant_fp(context &ctx, double value) + : constant(type::get_float_ty(ctx), 0), value_(value){ } + +constant *constant_fp::get_negative_zero(type *ty){ + double neg_zero = 0; + return get(ty->get_context(), neg_zero); +} + +constant *constant_fp::get_zero_value_for_negation(type *ty) { + if(ty->get_scalar_ty()->is_floating_point_ty()) + return get_negative_zero(ty); + return constant::get_null_value(ty); +} + +constant *constant_fp::get(context &ctx, double v){ + context_impl *impl = ctx.p_impl.get(); + constant_fp *&result = impl->fp_constants_[v]; + if(!result) + result = new constant_fp(ctx, v); + return result; +} + +// undef value +undef_value::undef_value(type *ty) + : constant(ty, 0) { } + +undef_value *undef_value::get(type *ty) { + context_impl *impl = ty->get_context().p_impl.get(); + undef_value *&result = impl->uv_constants_[ty]; + if(!result) + result = new undef_value(ty); + return result; +} + + +} +} From 9a1739957d73d6abcd2972c0c307479c89e12e8e Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 3 Jan 2019 12:44:33 -0500 Subject: [PATCH 029/494] [intermediate representation] added some builder function definitions --- include/ir/builder.h | 5 ++--- include/ir/instructions.h | 31 ++++++++++++++++++++++++++----- include/ir/type.h | 7 +++---- lib/ir/builder.cpp | 33 +++++++++++++++------------------ lib/ir/instructions.cpp | 37 +++++++++++++++++++++++++++++++++++++ lib/ir/type.cpp | 15 +++++++++++++++ 6 files changed, 98 insertions(+), 30 deletions(-) diff --git a/include/ir/builder.h b/include/ir/builder.h index 254d3bab6..4e1fd922c 100644 --- a/include/ir/builder.h +++ b/include/ir/builder.h @@ -41,9 +41,10 @@ public: block_->get_inst_list().insert(insert_point_, inst); inst->set_name(name); } - // Branch instructions + // terminator instructions value* create_br(basic_block *dest); 
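// ---------------------------------------------------------------------------
// [editorial sketch] A hypothetical use of the terminator and phi hooks in
// this hunk, lowering `x = cond ? a : b` into explicit control flow. It only
// relies on factories that appear elsewhere in this series
// (basic_block::create, builder::create_phi, phi_node::add_incoming);
// `ctx`, `fn`, `cond`, `a` and `b` are assumed to be in scope. Illustration
// only, not part of the patch:
//
//   ir::basic_block *then_bb  = ir::basic_block::create(ctx, "then",  fn);
//   ir::basic_block *else_bb  = ir::basic_block::create(ctx, "else",  fn);
//   ir::basic_block *merge_bb = ir::basic_block::create(ctx, "merge", fn);
//   builder.create_cond_br(cond, then_bb, else_bb); // ends the current block
//   builder.set_insert_point(then_bb);
//   builder.create_br(merge_bb);
//   builder.set_insert_point(else_bb);
//   builder.create_br(merge_bb);
//   builder.set_insert_point(merge_bb);
//   ir::phi_node *x = builder.create_phi(a->get_type(), 2);
//   x->add_incoming(a, then_bb);
//   x->add_incoming(b, else_bb);
// ---------------------------------------------------------------------------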
value* create_cond_br(value *cond, basic_block* if_dest, basic_block* else_dest); + value* create_ret_void(); // Cast instructions value *create_cast(cast_inst::op_t op, value *v, type *dst_ty, const std::string &name = ""); value* create_si_to_fp(value *src, type *dst_ty, const std::string &name = ""); @@ -106,8 +107,6 @@ public: value *create_splat(value *arg, const std::vector &shapes, const std::string &name = ""); value *create_reshape(value *arg, const std::vector &shapes, const std::string &name = ""); value *create_broadcast(value *arg, const std::vector &shapes, const std::string &name = ""); - // Terminators - value *create_ret_void(); private: context &ctx_; diff --git a/include/ir/instructions.h b/include/ir/instructions.h index cb673d73e..3c4c408ed 100644 --- a/include/ir/instructions.h +++ b/include/ir/instructions.h @@ -227,7 +227,7 @@ private: static type *get_indexed_type(type *ty, const std::vector &idx); public: - static getelementptr_inst* create(value *ptr, const std::vector &idx, + static getelementptr_inst* create(type *pointee_ty, value *ptr, const std::vector &idx, const std::string &name = "", instruction *next = nullptr); private: @@ -239,20 +239,41 @@ private: // retile_inst classes //===----------------------------------------------------------------------===// -class retile_inst: public instruction{ +// retile +class retile_inst: public instruction { +protected: + retile_inst(value *arg, const std::vector &shapes, const std::string &name, instruction *next); }; -class reshape_inst: public instruction{ +// reshape +class reshape_inst: public retile_inst { + using retile_inst::retile_inst; + +public: + static instruction* create(value *arg, const std::vector &shapes, + const std::string &name = "", instruction *next = nullptr); }; -class splat_inst: public instruction{ +// splat +class splat_inst: public retile_inst { + using retile_inst::retile_inst; + +public: + static instruction* create(value *arg, const std::vector &shapes, + const std::string &name = "", instruction *next = nullptr); }; -class broadcast_inst: public instruction{ +// broadcast +class broadcast_inst: public retile_inst { + using retile_inst::retile_inst; + +public: + static instruction* create(value *arg, const std::vector &shapes, + const std::string &name = "", instruction *next = nullptr); }; diff --git a/include/ir/type.h b/include/ir/type.h index 8ff710373..a82ed8e63 100644 --- a/include/ir/type.h +++ b/include/ir/type.h @@ -155,12 +155,11 @@ private: }; class function_type: public type { +private: + function_type(type *ret_ty, const std::vector ¶m_tys); + public: static function_type* get(type *ret_ty, const std::vector& param_tys); - -private: - type *return_type_; - std::vector param_types_; }; diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index a5d14b913..43cca943d 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -25,7 +25,7 @@ void builder::set_insert_point(basic_block *block){ } //===----------------------------------------------------------------------===// -// branch instructions +// terminator instructions //===----------------------------------------------------------------------===// value* builder::create_br(basic_block *dest){ @@ -36,6 +36,10 @@ value* builder::create_cond_br(value *cond, basic_block *if_dest, basic_block *e return insert(branch_inst::create(cond, if_dest, else_dest)); } +value *builder::create_ret_void() { + return insert(return_inst::create(ctx_)); +} + //===----------------------------------------------------------------------===// // 
cast instructions //===----------------------------------------------------------------------===// @@ -133,7 +137,7 @@ DEFINE_UNARY_INT(not) //===----------------------------------------------------------------------===// value* builder::create_gep(value *ptr, const std::vector& idx_list, const std::string &name){ - return insert(getelementptr_inst::create(ptr, idx_list), name); + return insert(getelementptr_inst::create(nullptr, ptr, idx_list), name); } //===----------------------------------------------------------------------===// @@ -199,24 +203,17 @@ DEFINE_FCMP_INSTR(ONE, llvm::FCmpInst::FCMP_ONE) // tile instructions //===----------------------------------------------------------------------===// -//value *create_splat(value *arg, const std::vector &shapes, const std::string &name) { -//} +value *builder::create_reshape(value *arg, const std::vector &shapes, const std::string &name) { + return insert(reshape_inst::create(arg, shapes, name)); +} -//value *create_reshape(value *arg, const std::vector &shapes, const std::string &name) { +value *builder::create_splat(value *arg, const std::vector &shapes, const std::string &name) { + return insert(splat_inst::create(arg, shapes, name)); +} -//} - -//value *create_broadcast(value *arg, const std::vector &shapes, const std::string &name) { - -//} - -//===----------------------------------------------------------------------===// -// terminator instructions -//===----------------------------------------------------------------------===// - -//value *create_red_void() { - -//} +value *builder::create_broadcast(value *arg, const std::vector &shapes, const std::string &name) { + return insert(broadcast_inst::create(arg, shapes, name)); +} diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index a42dc0c4b..51a778589 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -280,6 +280,43 @@ type *getelementptr_inst::get_indexed_type(type *ty, const std::vector return result; } +getelementptr_inst *getelementptr_inst::create(type *pointee_ty, value *ptr, const std::vector &idx, const std::string &name, instruction *next) { + return new getelementptr_inst(pointee_ty, ptr, idx, name, next); +} + + + +//===----------------------------------------------------------------------===// +// retile_inst classes +//===----------------------------------------------------------------------===// + +retile_inst::retile_inst(value *arg, const std::vector &shapes, + const std::string &name, instruction *next) + : instruction(tile_type::get(arg->get_type()->get_scalar_ty(), shapes), 1, name, next) { + set_operand(0, arg); +} + +// reshape + +instruction* reshape_inst::create(value *arg, const std::vector &shapes, + const std::string &name, instruction *next) { + return new reshape_inst(arg, shapes, name, next); +} + + +// splat + +instruction* splat_inst::create(value *arg, const std::vector &shapes, + const std::string &name, instruction *next) { + return new splat_inst(arg, shapes, name, next); +} + +// broadcast + +instruction* broadcast_inst::create(value *arg, const std::vector &shapes, + const std::string &name, instruction *next) { + return new broadcast_inst(arg, shapes, name, next); +} } } diff --git a/lib/ir/type.cpp b/lib/ir/type.cpp index bd49100d1..b6c9a4dff 100644 --- a/lib/ir/type.cpp +++ b/lib/ir/type.cpp @@ -152,5 +152,20 @@ tile_type* tile_type::get_same_shapes(type *ty, type *ref){ } +//===----------------------------------------------------------------------===// +// function_type class 
+//===----------------------------------------------------------------------===// + +function_type::function_type(type *ret_ty, const std::vector ¶m_tys): + type(ret_ty->get_context(), FunctionTyID) { + contained_tys_.push_back(ret_ty); + for(type *ty: param_tys) + contained_tys_.push_back(ty); +} + +function_type* function_type::get(type *ret_ty, const std::vector ¶m_tys) { + return new function_type(ret_ty, param_tys); +} + } } From c35ca8353e7f802d8ddd5ac2bfc5d8cb8387dd1f Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 3 Jan 2019 15:32:22 -0500 Subject: [PATCH 030/494] [intermediate representation] defined more symbols --- CMakeLists.txt | 3 - cmake/FindLLVM.cmake | 247 ++++++++++++++++++++++++++------------ include/ir/constant.h | 24 ++++ include/ir/function.h | 39 ++++-- include/ir/instructions.h | 21 +++- include/ir/type.h | 5 + lib/codegen.cpp | 5 +- lib/ir/builder.cpp | 32 ++++- lib/ir/constant.cpp | 14 +++ lib/ir/function.cpp | 47 ++++++++ lib/ir/instructions.cpp | 13 +- lib/ir/type.cpp | 6 + 12 files changed, 355 insertions(+), 101 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b20429946..cea5cc99d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,10 +22,7 @@ file( GLOB_RECURSE ALL_SRC *.cpp *.hpp *.h *.py *.y *.l) add_custom_target( ALL SOURCES ${ALL_SRC} ) # Compiler flags -link_directories(/home/philippe/Development/llvm-tlvm/build/lib) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) -include_directories(/home/philippe/Development/llvm-tlvm/include) -include_directories(/home/philippe/Development/llvm-tlvm/build/include) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") # Library diff --git a/cmake/FindLLVM.cmake b/cmake/FindLLVM.cmake index b3196d444..e3e3606df 100644 --- a/cmake/FindLLVM.cmake +++ b/cmake/FindLLVM.cmake @@ -1,88 +1,185 @@ -# - Find LLVM -# This module can be used to find LLVM. -# It requires that the llvm-config executable be available on the system path. -# Once found, llvm-config is used for everything else. +# - Find LLVM headers and libraries. +# This module locates LLVM and adapts the llvm-config output for use with +# CMake. # -# Typical usage could be: -# find_package(LLVM QUIET REQUIRED COMPONENTS jit native interpreter) +# A given list of COMPONENTS is passed to llvm-config. # -# If the QUIET flag is not set, the specified components and LLVM version are -# outputted. +# The following variables are defined: +# LLVM_FOUND - true if LLVM was found +# LLVM_CXXFLAGS - C++ compiler flags for files that include LLVM headers. +# LLVM_HOST_TARGET - Target triple used to configure LLVM. +# LLVM_INCLUDE_DIRS - Directory containing LLVM include files. +# LLVM_LDFLAGS - Linker flags to add when linking against LLVM +# (includes -LLLVM_LIBRARY_DIRS). +# LLVM_LIBRARIES - Full paths to the library files to link against. +# LLVM_LIBRARY_DIRS - Directory containing LLVM libraries. +# LLVM_ROOT_DIR - The root directory of the LLVM installation. +# llvm-config is searched for in ${LLVM_ROOT_DIR}/bin. +# LLVM_VERSION_MAJOR - Major version of LLVM. +# LLVM_VERSION_MINOR - Minor version of LLVM. +# LLVM_VERSION_STRING - Full LLVM version string (e.g. 6.0.0svn). +# LLVM_VERSION_BASE_STRING - Base LLVM version string without git/svn suffix (e.g. 6.0.0). # -# If the COMPONENTS are not set, the default set of "all" is used. -# -# The following variables are set: -# -# LLVM_FOUND - Set to YES if LLVM is found. -# LLVM_VERSION - Set to the decimal version of the LLVM library. 
-# LLVM_C_FLAGS - All flags that should be passed to a C compiler. -# LLVM_CXX_FLAGS - All flags that should be passed to a C++ compiler. -# LLVM_CPP_FLAGS - All flags that should be passed to the C pre-processor. -# LLVM_LD_FLAGS - Additional flags to pass to the linker. -# LLVM_LIBRARY_DIRS - A list of directories where the LLVM libraries are located. -# LLVM_INCLUDE_DIRS - A list of directories where the LLVM headers are located. -# LLVM_LIBRARIES - A list of libraries which should be linked against. +# Note: The variable names were chosen in conformance with the offical CMake +# guidelines, see ${CMAKE_ROOT}/Modules/readme.txt. -# A macro to run llvm config -macro(_llvm_config _var_name) - # Firstly, locate the LLVM config executable - find_program(_llvm_config_exe - NAMES llvm-config - PATHS /home/philippe/Development/llvm-tlvm/build/bin/ - DOC "llvm-config executable location" - ) +# Try suffixed versions to pick up the newest LLVM install available on Debian +# derivatives. +# We also want an user-specified LLVM_ROOT_DIR to take precedence over the +# system default locations such as /usr/local/bin. Executing find_program() +# multiples times is the approach recommended in the docs. +set(llvm_config_names llvm-config-8.0 llvm-config80 + llvm-config-7.0 llvm-config70 + llvm-config-6.0 llvm-config60 + llvm-config-5.0 llvm-config50 + llvm-config-4.0 llvm-config40 + llvm-config-3.9 llvm-config39 + llvm-config) +find_program(LLVM_CONFIG + NAMES ${llvm_config_names} + PATHS ${LLVM_ROOT_DIR}/bin NO_DEFAULT_PATH + DOC "Path to llvm-config tool.") +find_program(LLVM_CONFIG NAMES ${llvm_config_names}) - # If no llvm-config executable was found, set the output variable to not - # found. - if(NOT _llvm_config_exe) - set(${_var_name} "${_var_name}-NOTFOUND") - else(NOT _llvm_config_exe) - # Otherwise, run llvm-config - execute_process( - COMMAND ${_llvm_config_exe} ${ARGN} - OUTPUT_VARIABLE ${_var_name} - RESULT_VARIABLE _llvm_config_retval - OUTPUT_STRIP_TRAILING_WHITESPACE - ) - if(RESULT_VARIABLE) - message(SEND_ERROR - "Error running llvm-config with arguments: ${ARGN}") - endif(RESULT_VARIABLE) - endif(NOT _llvm_config_exe) -endmacro(_llvm_config) +# Prints a warning/failure message depending on the required/quiet flags. Copied +# from FindPackageHandleStandardArgs.cmake because it doesn't seem to be exposed. +macro(_LLVM_FAIL _msg) + if(LLVM_FIND_REQUIRED) + message(FATAL_ERROR "${_msg}") + else() + if(NOT LLVM_FIND_QUIETLY) + message(STATUS "${_msg}") + endif() + endif() +endmacro() -# The default set of components -set(_llvm_components all) -# If components have been specified via find_package, use them -if(LLVM_FIND_COMPONENTS) - set(_llvm_components ${LLVM_FIND_COMPONENTS}) -endif(LLVM_FIND_COMPONENTS) +if(NOT LLVM_CONFIG) + if(NOT LLVM_FIND_QUIETLY) + message(WARNING "Could not find llvm-config (LLVM >= ${LLVM_FIND_VERSION}). 
Try manually setting LLVM_CONFIG to the llvm-config executable of the installation to use.") + endif() +else() + macro(llvm_set var flag) + if(LLVM_FIND_QUIETLY) + set(_quiet_arg ERROR_QUIET) + endif() + set(result_code) + execute_process( + COMMAND ${LLVM_CONFIG} --${flag} + RESULT_VARIABLE result_code + OUTPUT_VARIABLE LLVM_${var} + OUTPUT_STRIP_TRAILING_WHITESPACE + ${_quiet_arg} + ) + if(result_code) + _LLVM_FAIL("Failed to execute llvm-config ('${LLVM_CONFIG}', result code: '${result_code})'") + else() + if(${ARGV2}) + file(TO_CMAKE_PATH "${LLVM_${var}}" LLVM_${var}) + endif() + endif() + endmacro() + macro(llvm_set_libs var flag) + if(LLVM_FIND_QUIETLY) + set(_quiet_arg ERROR_QUIET) + endif() + set(result_code) + execute_process( + COMMAND ${LLVM_CONFIG} --${flag} ${LLVM_FIND_COMPONENTS} + RESULT_VARIABLE result_code + OUTPUT_VARIABLE tmplibs + OUTPUT_STRIP_TRAILING_WHITESPACE + ${_quiet_arg} + ) + if(result_code) + _LLVM_FAIL("Failed to execute llvm-config ('${LLVM_CONFIG}', result code: '${result_code})'") + else() + file(TO_CMAKE_PATH "${tmplibs}" tmplibs) + string(REGEX MATCHALL "${pattern}[^ ]+" LLVM_${var} ${tmplibs}) + endif() + endmacro() -if(NOT LLVM_FIND_QUIETLY) - message(STATUS "Looking for LLVM components: ${_llvm_components}") -endif(NOT LLVM_FIND_QUIETLY) + llvm_set(VERSION_STRING version) + llvm_set(CXXFLAGS cxxflags) + llvm_set(HOST_TARGET host-target) + llvm_set(INCLUDE_DIRS includedir true) + llvm_set(ROOT_DIR prefix true) + llvm_set(ENABLE_ASSERTIONS assertion-mode) -_llvm_config(LLVM_VERSION --version) -_llvm_config(LLVM_C_FLAGS --cflags) -_llvm_config(LLVM_CXX_FLAGS --cxxflags) -_llvm_config(LLVM_CPP_FLAGS --cppflags) -_llvm_config(LLVM_LD_FLAGS --ldflags) -_llvm_config(LLVM_LIBRARY_DIRS --libdir) -_llvm_config(LLVM_INCLUDE_DIRS --includedir) -_llvm_config(LLVM_LIBRARIES --libs) + # The LLVM version string _may_ contain a git/svn suffix, so cut that off + string(SUBSTRING "${LLVM_VERSION_STRING}" 0 5 LLVM_VERSION_BASE_STRING) -if(NOT LLVM_FIND_QUIETLY) - message(STATUS "Found LLVM version: ${LLVM_VERSION}") -endif(NOT LLVM_FIND_QUIETLY) + # Versions below 3.9 do not support components debuginfocodeview, globalisel, ipa + list(REMOVE_ITEM LLVM_FIND_COMPONENTS "debuginfocodeview" index) + list(REMOVE_ITEM LLVM_FIND_COMPONENTS "globalisel" index) + list(REMOVE_ITEM LLVM_FIND_COMPONENTS "ipa" index) + if(${LLVM_VERSION_STRING} MATCHES "^3\\.[0-9][\\.0-9A-Za-z]*") + # Versions below 4.0 do not support component debuginfomsf + list(REMOVE_ITEM LLVM_FIND_COMPONENTS "debuginfomsf" index) + endif() + if(${LLVM_VERSION_STRING} MATCHES "^[3-5]\\..*") + # Versions below 6.0 do not support component windowsmanifest + list(REMOVE_ITEM LLVM_FIND_COMPONENTS "windowsmanifest" index) + endif() -# handle the QUIETLY and REQUIRED arguments and set LLVM_FOUND to TRUE if -# all listed variables are TRUE + llvm_set(LDFLAGS ldflags) + # In LLVM 3.5+, the system library dependencies (e.g. "-lz") are accessed + # using the separate "--system-libs" flag. 
+ llvm_set(SYSTEM_LIBS system-libs) + string(REPLACE "\n" " " LLVM_LDFLAGS "${LLVM_LDFLAGS} ${LLVM_SYSTEM_LIBS}") + llvm_set(LIBRARY_DIRS libdir true) + llvm_set_libs(LIBRARIES libs) + # LLVM bug: llvm-config --libs tablegen returns -lLLVM-3.8.0 + # but code for it is not in shared library + if("${LLVM_FIND_COMPONENTS}" MATCHES "tablegen") + if (NOT "${LLVM_LIBRARIES}" MATCHES "LLVMTableGen") + set(LLVM_LIBRARIES "${LLVM_LIBRARIES};-lLLVMTableGen") + endif() + endif() + + if(${LLVM_VERSION_STRING} MATCHES "^3\\.[0-9][\\.0-9A-Za-z]*") + # Versions below 4.0 do not support llvm-config --cmakedir + set(LLVM_CMAKEDIR ${LLVM_LIBRARY_DIRS}/cmake/llvm) + else() + llvm_set(CMAKEDIR cmakedir) + endif() + + llvm_set(TARGETS_TO_BUILD targets-built) + string(REGEX MATCHALL "${pattern}[^ ]+" LLVM_TARGETS_TO_BUILD ${LLVM_TARGETS_TO_BUILD}) +endif() + +# On CMake builds of LLVM, the output of llvm-config --cxxflags does not +# include -fno-rtti, leading to linker errors. Be sure to add it. +if(NOT MSVC AND (CMAKE_COMPILER_IS_GNUCXX OR (${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang"))) + if(NOT ${LLVM_CXXFLAGS} MATCHES "-fno-rtti") + set(LLVM_CXXFLAGS "${LLVM_CXXFLAGS} -fno-rtti") + endif() +endif() + +# Remove some clang-specific flags for gcc. +if(CMAKE_COMPILER_IS_GNUCXX) + string(REPLACE "-Wcovered-switch-default " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS}) + string(REPLACE "-Wstring-conversion " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS}) + string(REPLACE "-fcolor-diagnostics " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS}) + # this requires more recent gcc versions (not supported by 4.9) + string(REPLACE "-Werror=unguarded-availability-new " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS}) +endif() + +# Remove gcc-specific flags for clang. +if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") + string(REPLACE "-Wno-maybe-uninitialized " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS}) +endif() + +string(REGEX REPLACE "([0-9]+).*" "\\1" LLVM_VERSION_MAJOR "${LLVM_VERSION_STRING}" ) +string(REGEX REPLACE "[0-9]+\\.([0-9]+).*[A-Za-z]*" "\\1" LLVM_VERSION_MINOR "${LLVM_VERSION_STRING}" ) + +if (${LLVM_VERSION_STRING} VERSION_LESS ${LLVM_FIND_VERSION}) + message(FATAL_ERROR "Unsupported LLVM version found ${LLVM_VERSION_STRING}. At least version ${LLVM_FIND_VERSION} is required.") +endif() + +# Use the default CMake facilities for handling QUIET/REQUIRED. 
include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(LLVM - DEFAULT_MSG - LLVM_LIBRARIES - LLVM_INCLUDE_DIRS - LLVM_LIBRARY_DIRS) -# vim:sw=4:ts=4:autoindent +find_package_handle_standard_args(LLVM + REQUIRED_VARS LLVM_ROOT_DIR LLVM_HOST_TARGET + VERSION_VAR LLVM_VERSION_STRING) diff --git a/include/ir/constant.h b/include/ir/constant.h index c7c53ae2a..6dc8ecec8 100644 --- a/include/ir/constant.h +++ b/include/ir/constant.h @@ -52,6 +52,30 @@ private: double value_; }; +/* global value */ +class global_value: public constant { +public: + enum linkage_types_t { + internal + }; + +public: + global_value(type *ty, unsigned num_ops, + linkage_types_t linkage, const std::string &name, + unsigned addr_space); + +private: + linkage_types_t linkage_; +}; + +/* global object */ +class global_object: public global_value { +public: + global_object(type *ty, unsigned num_ops, + linkage_types_t linkage, const std::string &name, + unsigned addr_space = 0); +}; + } } diff --git a/include/ir/function.h b/include/ir/function.h index 3b8816b48..456faf5f3 100644 --- a/include/ir/function.h +++ b/include/ir/function.h @@ -3,35 +3,52 @@ #include #include "value.h" +#include "constant.h" namespace tdl{ namespace ir{ +class function; class function_type; class module; /* Argument */ class argument: public value{ + argument(type *ty, const std::string &name, function *parent, unsigned arg_no); +public: + static argument* create(type *ty, const std::string &name, + function *parent = nullptr, unsigned arg_no = 0); + +private: + function *parent_; + unsigned arg_no_; }; /* Function */ -class function: public value{ - using arg_iterator = argument *; - using const_arg_iterator = const argument *; +class function: public global_object{ + typedef std::vector args_t; + typedef args_t::iterator arg_iterator; + typedef args_t::const_iterator const_arg_iterator; +private: + function(function_type *ty, linkage_types_t linkage, + const std::string &name = "", module *parent = nullptr); public: - arg_iterator arg_begin(); - arg_iterator arg_end(); - const_arg_iterator arg_begin() const; - const_arg_iterator arg_end() const; + arg_iterator arg_begin() { return args_.begin(); } + arg_iterator arg_end() { return args_.end(); } + const_arg_iterator arg_begin() const { return args_.begin(); } + const_arg_iterator arg_end() const { return args_.end(); } + // Accessors + function_type* get_function_ty() const; // Factory methods - static function *create(function_type *type, const std::string &name, module *mod); + static function *create(function_type *ty, linkage_types_t linkage, + const std::string &name, module *mod); private: - function_type *type_; - std::string name_; - module *mod_; + module *parent_; + args_t args_; + bool init_; }; } diff --git a/include/ir/instructions.h b/include/ir/instructions.h index 3c4c408ed..1d7ecffcd 100644 --- a/include/ir/instructions.h +++ b/include/ir/instructions.h @@ -68,6 +68,10 @@ public: // Get operand op_t get_op() const { return op_; } + // Wraps + void set_has_no_unsigned_wrap(bool b = true) { has_no_unsigned_wrap_ = b; } + void set_has_no_signed_wrap(bool b = true) { has_no_signed_wrap_ = b; } + // Factory methods static binary_operator *create(op_t op, value *lhs, value *rhs, const std::string &name = "", instruction *next = nullptr); @@ -77,6 +81,8 @@ public: public: op_t op_; + bool has_no_unsigned_wrap_; + bool has_no_signed_wrap_; }; @@ -235,13 +241,26 @@ private: type *res_elt_ty; }; 
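One remark on the nuw/nsw flags added to binary_operator above: the patch only records them (the two booleans sit in the public section and no constructor shown here initializes them, so a real implementation would want them defaulted to false), and the to-LLVM lowering introduced later in this series does not read them back yet. A hedged sketch of what forwarding them to LLVM could look like; lower_binop is an illustrative name, not a function from the patch:

#include "llvm/IR/Instructions.h"
#include "llvm/IR/Operator.h"

// Illustration only: copy the TDL wrap flags onto a freshly created LLVM binop.
llvm::Instruction *lower_binop(tdl::ir::binary_operator *ii,
                               llvm::Value *lhs, llvm::Value *rhs) {
  llvm::BinaryOperator *res =
      llvm::BinaryOperator::Create(ii->get_op(), lhs, rhs, ii->get_name());
  // setHasNoUnsignedWrap/setHasNoSignedWrap assert unless the opcode can
  // overflow (add/sub/mul/shl), so guard on the operator class first.
  if (llvm::isa<llvm::OverflowingBinaryOperator>(res)) {
    res->setHasNoUnsignedWrap(ii->has_no_unsigned_wrap_);
    res->setHasNoSignedWrap(ii->has_no_signed_wrap_);
  }
  return res;
}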
+//===----------------------------------------------------------------------===// +// load_inst/store_inst classes +//===----------------------------------------------------------------------===// + +class load_inst: public unary_inst{ + load_inst(value *ptr, const std::string &name, instruction *next); + +public: + static load_inst* create(value *ptr, const std::string &name = "", + instruction *next = nullptr); + +}; + //===----------------------------------------------------------------------===// // retile_inst classes //===----------------------------------------------------------------------===// // retile -class retile_inst: public instruction { +class retile_inst: public unary_inst { protected: retile_inst(value *arg, const std::vector &shapes, const std::string &name, instruction *next); }; diff --git a/include/ir/type.h b/include/ir/type.h index a82ed8e63..ebbbe5952 100644 --- a/include/ir/type.h +++ b/include/ir/type.h @@ -49,6 +49,7 @@ public: const std::vector &get_tile_shapes() const; type *get_tile_element_ty() const; unsigned get_pointer_address_space() const; + type *get_pointer_element_ty() const; // primitive predicates bool is_void_ty() const { return id_ == VoidTyID; } @@ -159,6 +160,10 @@ private: function_type(type *ret_ty, const std::vector ¶m_tys); public: + // accessors + unsigned get_num_params() const { return contained_tys_.size() - 1; } + type* get_param_ty(unsigned i) const { return contained_tys_.at(1 + i); } + // factory methods static function_type* get(type *ret_ty, const std::vector& param_tys); }; diff --git a/lib/codegen.cpp b/lib/codegen.cpp index 6d4b7038c..d6f41bba3 100644 --- a/lib/codegen.cpp +++ b/lib/codegen.cpp @@ -76,8 +76,7 @@ ir::type* pointer::type_impl(ir::module*, ir::type *type) const{ // Function void function::bind_parameters(ir::module *mod, ir::function *fn) const{ - std::vector args; - std::transform(fn->arg_begin(), fn->arg_end(), std::back_inserter(args), [&](ir::argument& x){ return &x;}); + std::vector args(fn->arg_begin(), fn->arg_end()); assert(args.size() == args_->values().size()); for(size_t i = 0; i < args.size(); i++){ parameter *param_i = args_->values().at(i); @@ -100,7 +99,7 @@ ir::type* function::type_impl(ir::module* mod, ir::type *type) const{ ir::value* function_definition::codegen(ir::module *mod) const{ ir::function_type *prototype = (ir::function_type*)header_->type(mod, spec_->type(mod)); const std::string &name = header_->id()->name(); - ir::function *fn = ir::function::create(prototype, name, mod); + ir::function *fn = ir::function::create(prototype, ir::function::internal, name, mod); header_->bind_parameters(mod, fn); ir::basic_block *entry = ir::basic_block::create(mod->get_context(), "entry", fn); mod->seal_block(entry); diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index 43cca943d..b26c1946e 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -1,7 +1,9 @@ #include #include "ir/basic_block.h" #include "ir/builder.h" +#include "ir/constant.h" #include "ir/instructions.h" +#include "ir/type.h" #include "llvm/IR/Instruction.h" namespace tdl{ @@ -11,7 +13,7 @@ builder::builder(context &ctx): ctx_(ctx){} //===----------------------------------------------------------------------===// -// insertion helpers +// utilities //===----------------------------------------------------------------------===// void builder::set_insert_point(basic_block::iterator instr){ @@ -24,6 +26,16 @@ void builder::set_insert_point(basic_block *block){ insert_point_ = block->end(); } + 
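Worth spelling out the insertion contract these two set_insert_point overloads establish: the block overload parks the builder at block->end(), so subsequent instructions are appended, while the iterator overload inserts before an existing instruction; module::make_phi later combines it with get_first_non_phi to keep phi nodes grouped at the top of a block. A small usage sketch, assuming an ir::builder builder, a block, an element type ty and a pointer value ptr (create_load and create_phi are defined elsewhere in this series):

builder.set_insert_point(block);                       // append mode
ir::value *x = builder.create_load(ptr, "x");          // lands at the end of block
builder.set_insert_point(block->get_first_non_phi());  // insert-before mode
ir::phi_node *phi = builder.create_phi(ty, 2);         // stays ahead of non-phis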
+//===----------------------------------------------------------------------===// +// convenience functions +//===----------------------------------------------------------------------===// + +value *builder::get_int32(unsigned val) { + return constant_int::get(type::get_int32_ty(ctx_), val); +} + + //===----------------------------------------------------------------------===// // terminator instructions //===----------------------------------------------------------------------===// @@ -99,6 +111,16 @@ DEFINE_UNARY_FLOAT(fneg) // binary int instructions //===----------------------------------------------------------------------===// + +value* builder::create_insert_nuwnswb_binop(binary_operator::op_t op, value *lhs, + value *rhs, const std::string &name, + bool has_nuw, bool has_nsw) { + binary_operator* result = insert(binary_operator::create(op, lhs, rhs), name); + if (has_nuw) result->set_has_no_unsigned_wrap(); + if (has_nsw) result->set_has_no_signed_wrap(); + return result; +} + #define DEFINE_NOWRAP_BINARY(SUFFIX, OPCODE)\ value* builder::create_ ## SUFFIX(value *lhs, value *rhs, const std::string &name, bool has_nuw, bool has_nsw){\ return create_insert_nuwnswb_binop(OPCODE, lhs, rhs, name, has_nuw, has_nsw);\ @@ -192,12 +214,12 @@ DEFINE_FCMP_INSTR(ONE, llvm::FCmpInst::FCMP_ONE) //===----------------------------------------------------------------------===// -// load instructions +// load/store instructions //===----------------------------------------------------------------------===// -//value *builder::create_load(value *arg, const std::string &name){ - -//} +value *builder::create_load(value *arg, const std::string &name){ + return load_inst::create(arg, name); +} //===----------------------------------------------------------------------===// // tile instructions diff --git a/lib/ir/constant.cpp b/lib/ir/constant.cpp index f2a3bd7e9..35e244613 100644 --- a/lib/ir/constant.cpp +++ b/lib/ir/constant.cpp @@ -88,6 +88,20 @@ undef_value *undef_value::get(type *ty) { return result; } +/* global value */ +global_value::global_value(type *ty, unsigned num_ops, + linkage_types_t linkage, + const std::string &name, unsigned addr_space) + : constant(pointer_type::get(ty, addr_space), num_ops, name), + linkage_(linkage) { } + + +/* global object */ +global_object::global_object(type *ty, unsigned num_ops, + linkage_types_t linkage, + const std::string &name, unsigned addr_space) + : global_value(ty, num_ops, linkage, name, addr_space) { } + } } diff --git a/lib/ir/function.cpp b/lib/ir/function.cpp index e69de29bb..b7cc14df5 100644 --- a/lib/ir/function.cpp +++ b/lib/ir/function.cpp @@ -0,0 +1,47 @@ +#include "ir/function.h" +#include "ir/type.h" + +namespace tdl{ +namespace ir{ + + +/* Argument */ + +argument::argument(type *ty, const std::string &name, function *parent, unsigned arg_no) + : value(ty, name), parent_(parent), arg_no_(arg_no) { } + +argument *argument::create(type *ty, const std::string &name, + function *parent, unsigned arg_no) { + return new argument(ty, name, parent, arg_no); +} + +/* function */ +function::function(function_type *ty, linkage_types_t linkage, + const std::string &name, module *parent) + : global_object(ty, 0, linkage, name), parent_(parent) { + // create arguments + function_type *fn_ty = get_function_ty(); + unsigned num_params = fn_ty->get_num_params(); + if(num_params > 0) { + args_.resize(num_params); + for(unsigned i = 0; i < num_params; i++){ + type *param_ty = fn_ty->get_param_ty(i); + args_.push_back(argument::create(param_ty, "", this, i)); 
+ } + } +} + + +function *function::create(function_type *ty, linkage_types_t linkage, + const std::string &name, module *mod){ + return new function(ty, linkage, name, mod); +} + + +function_type* function::get_function_ty() const +{ return static_cast(get_type()); } + + +} +} + diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 51a778589..d330e015d 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -285,6 +285,15 @@ getelementptr_inst *getelementptr_inst::create(type *pointee_ty, value *ptr, con } +//===----------------------------------------------------------------------===// +// load_inst/store_inst classes +//===----------------------------------------------------------------------===// +load_inst::load_inst(value *ptr, const std::string &name, instruction *next) + : unary_inst(ptr->get_type()->get_pointer_element_ty(), ptr, name, next) { } + +load_inst* load_inst::create(value *ptr, const std::string &name, instruction *next) { + return new load_inst(ptr, name, next); +} //===----------------------------------------------------------------------===// // retile_inst classes //===----------------------------------------------------------------------===// retile_inst::retile_inst(value *arg, const std::vector &shapes, const std::string &name, instruction *next) - : instruction(tile_type::get(arg->get_type()->get_scalar_ty(), shapes), 1, name, next) { - set_operand(0, arg); -} + : unary_inst(tile_type::get(arg->get_type()->get_scalar_ty(), shapes), arg, name, next) { } // reshape diff --git a/lib/ir/type.cpp b/lib/ir/type.cpp index b6c9a4dff..50ffa7c23 100644 --- a/lib/ir/type.cpp +++ b/lib/ir/type.cpp @@ -40,6 +40,12 @@ unsigned type::get_pointer_address_space() const { return ((pointer_type*)this)->get_address_space(); } +type * type::get_pointer_element_ty() const { + assert(is_pointer_ty()); + return ((pointer_type*)this)->get_element_ty(); +} + + const std::vector &type::get_tile_shapes() const { assert(is_tile_ty()); return ((tile_type*)this)->get_shapes(); From 88504ca1727c5e9aabaffff757bbcd36d9c361ab Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 3 Jan 2019 17:14:54 -0500 Subject: [PATCH 031/494] [build system] better llvm handling --- CMakeLists.txt | 15 ++-- cmake/FindLLVM.cmake | 185 ------------------------------------------- examples/matrix.cpp | 10 ++- lib/ir/context.cpp | 13 ++- 4 files changed, 27 insertions(+), 196 deletions(-) delete mode 100644 cmake/FindLLVM.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index cea5cc99d..6f97a0ab3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,6 @@ -cmake_minimum_required(VERSION 2.8.7) +cmake_minimum_required(VERSION 2.8) project(TDL) include(CTest) -include(cmake/FindLLVM.cmake) # FLEX/YACC find_package(BISON) find_package(FLEX) @@ -11,6 +10,12 @@ FLEX_TARGET(Lexer ${CMAKE_CURRENT_SOURCE_DIR}/include/scanner.l ${CMAKE_CURRENT_ get_filename_component(BISON_Parser_INCLUDE_DIRECTORIES ${BISON_Parser_OUTPUT_HEADER} DIRECTORY) include_directories(${BISON_Parser_INCLUDE_DIRECTORIES}) +# LLVM +find_package(LLVM REQUIRED CONFIG) +include_directories(${LLVM_INCLUDE_DIRS}) +add_definitions(${LLVM_DEFINITIONS}) +llvm_map_components_to_libnames(llvm_libs support core irreader) + #Default build type if(NOT CMAKE_BUILD_TYPE) message(STATUS "Default build type: Release") @@ -23,12 +28,12 @@ add_custom_target( ALL SOURCES ${ALL_SRC} ) # Compiler flags include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +set(CMAKE_CXX_FLAGS
"${CMAKE_CXX_FLAGS} ${LLVM_CXXFLAGS} -std=c++11") -# Library +# TDL file(GLOB_RECURSE LIBTDL_SRC lib/*.cpp) add_library(tdl SHARED ${LIBTDL_SRC} ${BISON_Parser_OUTPUTS} ${FLEX_Lexer_OUTPUTS}) -target_link_libraries(tdl "dl" ${LLVM_LIBRARIES}) +target_link_libraries(tdl ${llvm_libs}) # Examples add_subdirectory(examples) diff --git a/cmake/FindLLVM.cmake b/cmake/FindLLVM.cmake deleted file mode 100644 index e3e3606df..000000000 --- a/cmake/FindLLVM.cmake +++ /dev/null @@ -1,185 +0,0 @@ -# - Find LLVM headers and libraries. -# This module locates LLVM and adapts the llvm-config output for use with -# CMake. -# -# A given list of COMPONENTS is passed to llvm-config. -# -# The following variables are defined: -# LLVM_FOUND - true if LLVM was found -# LLVM_CXXFLAGS - C++ compiler flags for files that include LLVM headers. -# LLVM_HOST_TARGET - Target triple used to configure LLVM. -# LLVM_INCLUDE_DIRS - Directory containing LLVM include files. -# LLVM_LDFLAGS - Linker flags to add when linking against LLVM -# (includes -LLLVM_LIBRARY_DIRS). -# LLVM_LIBRARIES - Full paths to the library files to link against. -# LLVM_LIBRARY_DIRS - Directory containing LLVM libraries. -# LLVM_ROOT_DIR - The root directory of the LLVM installation. -# llvm-config is searched for in ${LLVM_ROOT_DIR}/bin. -# LLVM_VERSION_MAJOR - Major version of LLVM. -# LLVM_VERSION_MINOR - Minor version of LLVM. -# LLVM_VERSION_STRING - Full LLVM version string (e.g. 6.0.0svn). -# LLVM_VERSION_BASE_STRING - Base LLVM version string without git/svn suffix (e.g. 6.0.0). -# -# Note: The variable names were chosen in conformance with the offical CMake -# guidelines, see ${CMAKE_ROOT}/Modules/readme.txt. - -# Try suffixed versions to pick up the newest LLVM install available on Debian -# derivatives. -# We also want an user-specified LLVM_ROOT_DIR to take precedence over the -# system default locations such as /usr/local/bin. Executing find_program() -# multiples times is the approach recommended in the docs. -set(llvm_config_names llvm-config-8.0 llvm-config80 - llvm-config-7.0 llvm-config70 - llvm-config-6.0 llvm-config60 - llvm-config-5.0 llvm-config50 - llvm-config-4.0 llvm-config40 - llvm-config-3.9 llvm-config39 - llvm-config) -find_program(LLVM_CONFIG - NAMES ${llvm_config_names} - PATHS ${LLVM_ROOT_DIR}/bin NO_DEFAULT_PATH - DOC "Path to llvm-config tool.") -find_program(LLVM_CONFIG NAMES ${llvm_config_names}) - -# Prints a warning/failure message depending on the required/quiet flags. Copied -# from FindPackageHandleStandardArgs.cmake because it doesn't seem to be exposed. -macro(_LLVM_FAIL _msg) - if(LLVM_FIND_REQUIRED) - message(FATAL_ERROR "${_msg}") - else() - if(NOT LLVM_FIND_QUIETLY) - message(STATUS "${_msg}") - endif() - endif() -endmacro() - - -if(NOT LLVM_CONFIG) - if(NOT LLVM_FIND_QUIETLY) - message(WARNING "Could not find llvm-config (LLVM >= ${LLVM_FIND_VERSION}). 
Try manually setting LLVM_CONFIG to the llvm-config executable of the installation to use.") - endif() -else() - macro(llvm_set var flag) - if(LLVM_FIND_QUIETLY) - set(_quiet_arg ERROR_QUIET) - endif() - set(result_code) - execute_process( - COMMAND ${LLVM_CONFIG} --${flag} - RESULT_VARIABLE result_code - OUTPUT_VARIABLE LLVM_${var} - OUTPUT_STRIP_TRAILING_WHITESPACE - ${_quiet_arg} - ) - if(result_code) - _LLVM_FAIL("Failed to execute llvm-config ('${LLVM_CONFIG}', result code: '${result_code})'") - else() - if(${ARGV2}) - file(TO_CMAKE_PATH "${LLVM_${var}}" LLVM_${var}) - endif() - endif() - endmacro() - macro(llvm_set_libs var flag) - if(LLVM_FIND_QUIETLY) - set(_quiet_arg ERROR_QUIET) - endif() - set(result_code) - execute_process( - COMMAND ${LLVM_CONFIG} --${flag} ${LLVM_FIND_COMPONENTS} - RESULT_VARIABLE result_code - OUTPUT_VARIABLE tmplibs - OUTPUT_STRIP_TRAILING_WHITESPACE - ${_quiet_arg} - ) - if(result_code) - _LLVM_FAIL("Failed to execute llvm-config ('${LLVM_CONFIG}', result code: '${result_code})'") - else() - file(TO_CMAKE_PATH "${tmplibs}" tmplibs) - string(REGEX MATCHALL "${pattern}[^ ]+" LLVM_${var} ${tmplibs}) - endif() - endmacro() - - llvm_set(VERSION_STRING version) - llvm_set(CXXFLAGS cxxflags) - llvm_set(HOST_TARGET host-target) - llvm_set(INCLUDE_DIRS includedir true) - llvm_set(ROOT_DIR prefix true) - llvm_set(ENABLE_ASSERTIONS assertion-mode) - - # The LLVM version string _may_ contain a git/svn suffix, so cut that off - string(SUBSTRING "${LLVM_VERSION_STRING}" 0 5 LLVM_VERSION_BASE_STRING) - - # Versions below 3.9 do not support components debuginfocodeview, globalisel, ipa - list(REMOVE_ITEM LLVM_FIND_COMPONENTS "debuginfocodeview" index) - list(REMOVE_ITEM LLVM_FIND_COMPONENTS "globalisel" index) - list(REMOVE_ITEM LLVM_FIND_COMPONENTS "ipa" index) - if(${LLVM_VERSION_STRING} MATCHES "^3\\.[0-9][\\.0-9A-Za-z]*") - # Versions below 4.0 do not support component debuginfomsf - list(REMOVE_ITEM LLVM_FIND_COMPONENTS "debuginfomsf" index) - endif() - if(${LLVM_VERSION_STRING} MATCHES "^[3-5]\\..*") - # Versions below 6.0 do not support component windowsmanifest - list(REMOVE_ITEM LLVM_FIND_COMPONENTS "windowsmanifest" index) - endif() - - llvm_set(LDFLAGS ldflags) - # In LLVM 3.5+, the system library dependencies (e.g. "-lz") are accessed - # using the separate "--system-libs" flag. - llvm_set(SYSTEM_LIBS system-libs) - string(REPLACE "\n" " " LLVM_LDFLAGS "${LLVM_LDFLAGS} ${LLVM_SYSTEM_LIBS}") - llvm_set(LIBRARY_DIRS libdir true) - llvm_set_libs(LIBRARIES libs) - # LLVM bug: llvm-config --libs tablegen returns -lLLVM-3.8.0 - # but code for it is not in shared library - if("${LLVM_FIND_COMPONENTS}" MATCHES "tablegen") - if (NOT "${LLVM_LIBRARIES}" MATCHES "LLVMTableGen") - set(LLVM_LIBRARIES "${LLVM_LIBRARIES};-lLLVMTableGen") - endif() - endif() - - if(${LLVM_VERSION_STRING} MATCHES "^3\\.[0-9][\\.0-9A-Za-z]*") - # Versions below 4.0 do not support llvm-config --cmakedir - set(LLVM_CMAKEDIR ${LLVM_LIBRARY_DIRS}/cmake/llvm) - else() - llvm_set(CMAKEDIR cmakedir) - endif() - - llvm_set(TARGETS_TO_BUILD targets-built) - string(REGEX MATCHALL "${pattern}[^ ]+" LLVM_TARGETS_TO_BUILD ${LLVM_TARGETS_TO_BUILD}) -endif() - -# On CMake builds of LLVM, the output of llvm-config --cxxflags does not -# include -fno-rtti, leading to linker errors. Be sure to add it. 
-if(NOT MSVC AND (CMAKE_COMPILER_IS_GNUCXX OR (${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang"))) - if(NOT ${LLVM_CXXFLAGS} MATCHES "-fno-rtti") - set(LLVM_CXXFLAGS "${LLVM_CXXFLAGS} -fno-rtti") - endif() -endif() - -# Remove some clang-specific flags for gcc. -if(CMAKE_COMPILER_IS_GNUCXX) - string(REPLACE "-Wcovered-switch-default " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS}) - string(REPLACE "-Wstring-conversion " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS}) - string(REPLACE "-fcolor-diagnostics " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS}) - # this requires more recent gcc versions (not supported by 4.9) - string(REPLACE "-Werror=unguarded-availability-new " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS}) -endif() - -# Remove gcc-specific flags for clang. -if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") - string(REPLACE "-Wno-maybe-uninitialized " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS}) -endif() - -string(REGEX REPLACE "([0-9]+).*" "\\1" LLVM_VERSION_MAJOR "${LLVM_VERSION_STRING}" ) -string(REGEX REPLACE "[0-9]+\\.([0-9]+).*[A-Za-z]*" "\\1" LLVM_VERSION_MINOR "${LLVM_VERSION_STRING}" ) - -if (${LLVM_VERSION_STRING} VERSION_LESS ${LLVM_FIND_VERSION}) - message(FATAL_ERROR "Unsupported LLVM version found ${LLVM_VERSION_STRING}. At least version ${LLVM_FIND_VERSION} is required.") -endif() - -# Use the default CMake facilities for handling QUIET/REQUIRED. -include(FindPackageHandleStandardArgs) - -find_package_handle_standard_args(LLVM - REQUIRED_VARS LLVM_ROOT_DIR LLVM_HOST_TARGET - VERSION_VAR LLVM_VERSION_STRING) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 2f8194675..0c42165bc 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -1,6 +1,8 @@ #include #include #include "ast.h" +#include "ir/context.h" +#include "ir/module.h" typedef struct yy_buffer_state * YY_BUFFER_STATE; extern int yyparse(); @@ -25,10 +27,10 @@ int main() { YY_BUFFER_STATE buffer = yy_scan_string(src); yyparse(); yy_delete_buffer(buffer); -// translation_unit *program = ast_root; -// tdl::context context; -// tdl::module module("matrix", &context); -// program->codegen(&module); + translation_unit *program = ast_root; + tdl::ir::context context; + tdl::ir::module module("matrix", context); + program->codegen(&module); // llvm::PrintModulePass print(llvm::outs()); // llvm::AnalysisManager analysis; // print.run(*module.handle(), analysis); diff --git a/lib/ir/context.cpp b/lib/ir/context.cpp index 56b64b4a3..a3fd665e1 100644 --- a/lib/ir/context.cpp +++ b/lib/ir/context.cpp @@ -20,10 +20,19 @@ context_impl::context_impl(context &ctx) int16_ty(ctx, 16), int32_ty(ctx, 32), int64_ty(ctx, 64), - int128_ty(ctx, 128) -{ + int128_ty(ctx, 128){ } +//===----------------------------------------------------------------------===// +// context +//===----------------------------------------------------------------------===// + +context::context(): + p_impl(std::make_shared(*this)) { + +} + + } } From f131ebb0bc099aa011707c4f012e801b15bddb53 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 4 Jan 2019 01:43:02 -0500 Subject: [PATCH 032/494] [intermediate representation] fixed some bugs --- examples/matrix.cpp | 11 +++++++---- include/ir/builder.h | 8 ++++++-- include/ir/function.h | 3 +-- include/ir/instructions.h | 7 ++++--- include/ir/type.h | 3 ++- include/ir/value.h | 1 + lib/codegen.cpp | 2 +- lib/ir/basic_block.cpp | 8 ++++++-- lib/ir/builder.cpp | 9 +++++++++ lib/ir/function.cpp | 22 +++++++++------------- lib/ir/instructions.cpp | 7 ++++++- lib/ir/module.cpp | 5 +++-- 12 files changed, 55 insertions(+), 31 deletions(-) diff --git 
a/examples/matrix.cpp b/examples/matrix.cpp index 0c42165bc..09bbcb593 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -15,11 +15,14 @@ const char src[] = "\ void test(fp32 *A, fp32 *B, fp32 *C, int32 i){\ int32 j = 1;\ - int32 test[16, 16] = 0;\ - int32 test2[16, 16];\ - int32 test3[16, 16];\ int32 k;\ - test = test2 + test3;\ + i = i + j;\ + for(k = 0; k < 10; k = k+1){\ + int32 u = 1;\ + u = u + i;\ + if(k == 0)\ + u = u + 2;\ + }\ }\ "; diff --git a/include/ir/builder.h b/include/ir/builder.h index 4e1fd922c..3e0ba4293 100644 --- a/include/ir/builder.h +++ b/include/ir/builder.h @@ -1,6 +1,7 @@ #ifndef TDL_INCLUDE_IR_BUILDER_H #define TDL_INCLUDE_IR_BUILDER_H +#include #include #include #include "instructions.h" @@ -37,9 +38,12 @@ public: // Insert template InstTy* insert(InstTy *inst, const std::string &name = ""){ - if(block_) - block_->get_inst_list().insert(insert_point_, inst); + assert(block_); + block_->get_inst_list().insert(insert_point_, inst); + inst->set_parent(block_); inst->set_name(name); + insert_point_ = block_->end(); + return inst; } // terminator instructions value* create_br(basic_block *dest); diff --git a/include/ir/function.h b/include/ir/function.h index 456faf5f3..52b1579d7 100644 --- a/include/ir/function.h +++ b/include/ir/function.h @@ -39,8 +39,6 @@ public: arg_iterator arg_end() { return args_.end(); } const_arg_iterator arg_begin() const { return args_.begin(); } const_arg_iterator arg_end() const { return args_.end(); } - // Accessors - function_type* get_function_ty() const; // Factory methods static function *create(function_type *ty, linkage_types_t linkage, const std::string &name, module *mod); @@ -49,6 +47,7 @@ private: module *parent_; args_t args_; bool init_; + function_type *fn_ty_; }; } diff --git a/include/ir/instructions.h b/include/ir/instructions.h index 1d7ecffcd..53d437082 100644 --- a/include/ir/instructions.h +++ b/include/ir/instructions.h @@ -23,8 +23,9 @@ protected: public: // parent - const basic_block *get_parent() const { return parent_;} - basic_block *get_parent() { return parent_; } + void set_parent(basic_block *block) { parent_ = block; } + const basic_block *get_parent() const { return parent_; } + basic_block *get_parent() { return parent_; } private: basic_block *parent_; @@ -148,7 +149,7 @@ public: typedef llvm::CastInst::CastOps op_t; private: - bool is_valid(op_t op, value *arg, type *ty); + static bool is_valid(op_t op, value *arg, type *ty); public: // Factory methods diff --git a/include/ir/type.h b/include/ir/type.h index ebbbe5952..c089df4c2 100644 --- a/include/ir/type.h +++ b/include/ir/type.h @@ -2,6 +2,7 @@ #define TDL_INCLUDE_IR_TYPE_H #include +#include namespace tdl{ namespace ir{ @@ -34,7 +35,7 @@ public: public: //constructors - type(context &ctx, id_t id) : ctx_(ctx), id_(id) {} + type(context &ctx, id_t id) : ctx_(ctx), id_(id) { } //destructor virtual ~type(){} diff --git a/include/ir/value.h b/include/ir/value.h index bab034603..c8b162071 100644 --- a/include/ir/value.h +++ b/include/ir/value.h @@ -24,6 +24,7 @@ public: void add_use(use *arg); // name void set_name(const std::string &name); + const std::string &get_name() const { return name_; } type* get_type() const { return ty_; } private: diff --git a/lib/codegen.cpp b/lib/codegen.cpp index d6f41bba3..12732b1b4 100644 --- a/lib/codegen.cpp +++ b/lib/codegen.cpp @@ -310,7 +310,7 @@ inline void implicit_broadcast(ir::module *mod, ir::builder &builder, ir::value ir::value *binary_operator::llvm_op(ir::module *mod, ir::builder &builder, 
ir::value *lhs, ir::value *rhs, const std::string &name) const { bool is_float = false, is_ptr = false, is_int = false, is_signed = false; -// implicit_cast(builder, lhs, rhs, is_float, is_ptr, is_int, is_signed); + implicit_cast(builder, lhs, rhs, is_float, is_ptr, is_int, is_signed); // implicit_broadcast(mod, builder, lhs, rhs); if(op_==MUL && is_float) return builder.create_fmul(lhs, rhs, name); diff --git a/lib/ir/basic_block.cpp b/lib/ir/basic_block.cpp index 359c55d0c..29361b3b7 100644 --- a/lib/ir/basic_block.cpp +++ b/lib/ir/basic_block.cpp @@ -9,18 +9,22 @@ class phi_node; basic_block::basic_block(context &ctx, const std::string &name, function *parent): value(type::get_label_ty(ctx), name), ctx_(ctx), parent_(parent){ - } basic_block* basic_block::create(context &ctx, const std::string &name, function *parent){ return new basic_block(ctx, name, parent); } +void basic_block::add_predecessor(basic_block *pred) { + preds_.push_back(pred); +} + basic_block::iterator basic_block::get_first_non_phi(){ auto it = begin(); for(; it != end(); it++) - if(!dynamic_cast(*it)) + if(!dynamic_cast(*it)){ return it; + } return it; } diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index b26c1946e..a2c7ef809 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -35,16 +35,25 @@ value *builder::get_int32(unsigned val) { return constant_int::get(type::get_int32_ty(ctx_), val); } +type *builder::get_float_ty() +{ return type::get_float_ty(ctx_); } + +type *builder::get_double_ty() +{ return type::get_double_ty(ctx_); } + //===----------------------------------------------------------------------===// // terminator instructions //===----------------------------------------------------------------------===// value* builder::create_br(basic_block *dest){ + dest->add_predecessor(block_); return insert(branch_inst::create(dest)); } value* builder::create_cond_br(value *cond, basic_block *if_dest, basic_block *else_dest){ + if_dest->add_predecessor(block_); + else_dest->add_predecessor(block_); return insert(branch_inst::create(cond, if_dest, else_dest)); } diff --git a/lib/ir/function.cpp b/lib/ir/function.cpp index b7cc14df5..2def17325 100644 --- a/lib/ir/function.cpp +++ b/lib/ir/function.cpp @@ -18,16 +18,16 @@ argument *argument::create(type *ty, const std::string &name, /* function */ function::function(function_type *ty, linkage_types_t linkage, const std::string &name, module *parent) - : global_object(ty, 0, linkage, name), parent_(parent) { + : global_object(ty, 0, linkage, name), parent_(parent), fn_ty_(ty) { + unsigned num_params = fn_ty_->get_num_params(); + // skip if no parameter + if(num_params == 0) + return; // create arguments - function_type *fn_ty = get_function_ty(); - unsigned num_params = fn_ty->get_num_params(); - if(num_params > 0) { - args_.resize(num_params); - for(unsigned i = 0; i < num_params; i++){ - type *param_ty = fn_ty->get_param_ty(i); - args_.push_back(argument::create(param_ty, "", this, i)); - } + args_.resize(num_params); + for(unsigned i = 0; i < num_params; i++){ + type *param_ty = fn_ty_->get_param_ty(i); + args_[i] = argument::create(param_ty, "", this, i); } } @@ -38,10 +38,6 @@ function *function::create(function_type *ty, linkage_types_t linkage, } -function_type* function::get_function_ty() const -{ return static_cast(get_type()); } - - } } diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index d330e015d..92a42fb00 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -26,7 +26,7 @@ instruction::instruction(type *ty, 
unsigned num_ops, const std::string &name, in //===----------------------------------------------------------------------===// phi_node::phi_node(type *ty, unsigned num_reserved, std::string const &name, instruction *next) - : instruction(ty, num_reserved, name, next){ } + : instruction(ty, num_reserved, name, next), blocks_(num_reserved){ } // Set incoming value void phi_node::set_incoming_value(unsigned i, value *v){ @@ -152,6 +152,11 @@ unary_inst::unary_inst(type *ty, value *v, const std::string &name, instruction // cast_inst classes //===----------------------------------------------------------------------===// +// TODO +bool cast_inst::is_valid(op_t op, value *arg, type *ty) { + return true; +} + cast_inst *cast_inst::create(op_t op, value *arg, type *ty, const std::string &name, instruction *next){ assert(is_valid(op, arg, ty) && "Invalid cast!"); // Construct and return the appropriate CastInst subclass diff --git a/lib/ir/module.cpp b/lib/ir/module.cpp index c1979881b..d5ad53e26 100644 --- a/lib/ir/module.cpp +++ b/lib/ir/module.cpp @@ -1,5 +1,6 @@ #include "ir/basic_block.h" #include "ir/module.h" +#include "ir/type.h" namespace tdl{ namespace ir{ @@ -28,10 +29,10 @@ void module::set_value(const std::string& name, ir::value *value){ ir::phi_node* module::make_phi(ir::type *ty, unsigned num_values, ir::basic_block *block){ basic_block::iterator insert = block->get_first_non_phi(); - if(insert == block->end()) + if(*insert) builder_.set_insert_point(insert); ir::phi_node *res = builder_.create_phi(ty, num_values); - if(insert == block->end()) + if(*insert) builder_.set_insert_point(block); return res; } From ec656af57cb36158525b66025891c3241c3bf79c Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 5 Jan 2019 14:50:31 -0500 Subject: [PATCH 033/494] [code generation] basic to-llvm lowering --- CMakeLists.txt | 4 +- TODO | 4 + examples/matrix.cpp | 3 +- include/{ => ast}/ast.h | 0 include/{ => ast}/parser.y | 2 +- include/{ => ast}/scanner.l | 0 include/codegen/layout.h | 10 +++ include/codegen/liveness.h | 10 +++ include/codegen/lowering.h | 118 ++++++++++++++++++++++++++ include/codegen/storage_alloc.h | 10 +++ include/codegen/tune.h | 10 +++ include/ir/constant.h | 2 +- include/ir/function.h | 23 +++-- include/ir/instructions.h | 57 ++++++++++--- include/ir/module.h | 20 +++++ include/ir/type.h | 16 +++- include/ir/value.h | 13 ++- lib/{codegen.cpp => ast/lowering.cpp} | 6 +- lib/ir/basic_block.cpp | 6 +- lib/ir/function.cpp | 11 ++- lib/ir/instructions.cpp | 38 ++++----- lib/ir/module.cpp | 11 +++ lib/ir/value.cpp | 2 +- 23 files changed, 320 insertions(+), 56 deletions(-) create mode 100644 TODO rename include/{ => ast}/ast.h (100%) rename include/{ => ast}/parser.y (99%) rename include/{ => ast}/scanner.l (100%) create mode 100644 include/codegen/layout.h create mode 100644 include/codegen/liveness.h create mode 100644 include/codegen/lowering.h create mode 100644 include/codegen/storage_alloc.h create mode 100644 include/codegen/tune.h rename lib/{codegen.cpp => ast/lowering.cpp} (98%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6f97a0ab3..9d49af3e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,8 +5,8 @@ include(CTest) # FLEX/YACC find_package(BISON) find_package(FLEX) -BISON_TARGET(Parser ${CMAKE_CURRENT_SOURCE_DIR}/include/parser.y ${CMAKE_CURRENT_BINARY_DIR}/parser.cpp) -FLEX_TARGET(Lexer ${CMAKE_CURRENT_SOURCE_DIR}/include/scanner.l ${CMAKE_CURRENT_BINARY_DIR}/scanner.cpp) +BISON_TARGET(Parser ${CMAKE_CURRENT_SOURCE_DIR}/include/ast/parser.y 
${CMAKE_CURRENT_BINARY_DIR}/parser.cpp) +FLEX_TARGET(Lexer ${CMAKE_CURRENT_SOURCE_DIR}/include/ast/scanner.l ${CMAKE_CURRENT_BINARY_DIR}/scanner.cpp) get_filename_component(BISON_Parser_INCLUDE_DIRECTORIES ${BISON_Parser_OUTPUT_HEADER} DIRECTORY) include_directories(${BISON_Parser_INCLUDE_DIRECTORIES}) diff --git a/TODO b/TODO new file mode 100644 index 000000000..340218d7a --- /dev/null +++ b/TODO @@ -0,0 +1,4 @@ +[Intermediate Representation] + - proper naming scheme + - symbols table + - name conflicts on globals? diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 09bbcb593..41e6120e2 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -1,8 +1,9 @@ #include #include -#include "ast.h" +#include "ast/ast.h" #include "ir/context.h" #include "ir/module.h" +#include "codegen/lowering.h" typedef struct yy_buffer_state * YY_BUFFER_STATE; extern int yyparse(); diff --git a/include/ast.h b/include/ast/ast.h similarity index 100% rename from include/ast.h rename to include/ast/ast.h diff --git a/include/parser.y b/include/ast/parser.y similarity index 99% rename from include/parser.y rename to include/ast/parser.y index f43b45265..bcfd7498d 100644 --- a/include/parser.y +++ b/include/ast/parser.y @@ -6,7 +6,7 @@ class node; } using namespace tdl::ast; #define YYSTYPE node* -#include "../include/ast.h" +#include "../include/ast/ast.h" extern char* yytext; void yyerror(const char *s); diff --git a/include/scanner.l b/include/ast/scanner.l similarity index 100% rename from include/scanner.l rename to include/ast/scanner.l diff --git a/include/codegen/layout.h b/include/codegen/layout.h new file mode 100644 index 000000000..40227001c --- /dev/null +++ b/include/codegen/layout.h @@ -0,0 +1,10 @@ +#ifndef TDL_INCLUDE_IR_CODEGEN_LAYOUT_H +#define TDL_INCLUDE_IR_CODEGEN_LAYOUT_H + +namespace tdl{ +namespace codegen{ + +} +} + +#endif diff --git a/include/codegen/liveness.h b/include/codegen/liveness.h new file mode 100644 index 000000000..9afbc4e46 --- /dev/null +++ b/include/codegen/liveness.h @@ -0,0 +1,10 @@ +#ifndef TDL_INCLUDE_IR_CODEGEN_LIVENESS_H +#define TDL_INCLUDE_IR_CODEGEN_LIVENESS_H + +namespace tdl{ +namespace codegen{ + +} +} + +#endif diff --git a/include/codegen/lowering.h b/include/codegen/lowering.h new file mode 100644 index 000000000..d418c38b7 --- /dev/null +++ b/include/codegen/lowering.h @@ -0,0 +1,118 @@ +#ifndef TDL_INCLUDE_IR_CODEGEN_LOWERING_H +#define TDL_INCLUDE_IR_CODEGEN_LOWERING_H + +#include "llvm/IR/Module.h" +#include "llvm/IR/IRBuilder.h" +#include "ir/context.h" +#include "ir/module.h" +#include "ir/function.h" +#include "ir/type.h" + + +namespace tdl{ +namespace codegen{ + +/* convert ir::type to llvm::Type */ + +llvm::Type *llvm_type(ir::type *ty, llvm::LLVMContext &ctx) { + // function + if(auto* tt = dynamic_cast(ty)){ + llvm::Type *return_ty = llvm_type(tt->get_return_ty(), ctx); + std::vector param_tys; + std::transform(tt->params_begin(), tt->params_end(), std::back_inserter(param_tys), + [&ctx](ir::type* t){ return llvm_type(t, ctx);}); + return llvm::FunctionType::get(return_ty, param_tys, false); + } + // pointer + if(ty->is_pointer_ty()){ + llvm::Type *elt_ty = llvm_type(ty->get_pointer_element_ty(), ctx); + unsigned addr_space = ty->get_pointer_address_space(); + return llvm::PointerType::get(elt_ty, addr_space); + } + // integer + if(ty->is_integer_ty()){ + unsigned bitwidth = ty->get_integer_bitwidth(); + return llvm::IntegerType::get(ctx, bitwidth); + } + // primitive types + switch(ty->get_type_id()){ + case ir::type::VoidTyID: 
return llvm::Type::getVoidTy(ctx); + case ir::type::HalfTyID: return llvm::Type::getHalfTy(ctx); + case ir::type::FloatTyID: return llvm::Type::getFloatTy(ctx); + case ir::type::DoubleTyID: return llvm::Type::getDoubleTy(ctx); + case ir::type::X86_FP80TyID: return llvm::Type::getX86_FP80Ty(ctx); + case ir::type::PPC_FP128TyID: return llvm::Type::getPPC_FP128Ty(ctx); + case ir::type::LabelTyID: return llvm::Type::getLabelTy(ctx); + case ir::type::MetadataTyID: return llvm::Type::getMetadataTy(ctx); + case ir::type::TokenTyID: return llvm::Type::getTokenTy(ctx); + default: break; + } + // unknown type + throw std::runtime_error("unknown conversion from ir::type to llvm::Type"); +} + +/* convert ir::instruction to llvm::Instruction */ +llvm::Instruction *llvm_inst(ir::instruction *inst, llvm::LLVMContext & ctx, + std::map &v, + std::map &b) { + if(auto* ii = dynamic_cast(inst)) + return llvm::BranchInst::Create(b[ii->get_true_dest()], b[ii->get_false_dest()], v[ii->get_cond()]); + if(auto* ii = dynamic_cast(inst)) + return llvm::BranchInst::Create(b[ii->get_dest()]); + if(auto* ii = dynamic_cast(inst)) + return llvm::PHINode::Create(llvm_type(ii->get_type(), ctx), ii->get_num_operands(), ii->get_name()); + if(auto* ii = dynamic_cast(inst)) + return llvm::ReturnInst::Create(ctx, v[ii->get_return_value()]); + if(auto* ii = dynamic_cast(inst)) + return llvm::BinaryOperator::Create(ii->get_op(), v[ii->get_operand(0)], v[ii->get_operand(1)], ii->get_name()); + if(auto* ii = dynamic_cast(inst)) + return llvm::CmpInst::Create(llvm::Instruction::ICmp, ii->get_pred(), v[ii->get_operand(0)], v[ii->get_operand(1)], ii->get_name()); + if(auto* ii = dynamic_cast(inst)) + return llvm::FCmpInst::Create(llvm::Instruction::FCmp, ii->get_pred(), v[ii->get_operand(0)], v[ii->get_operand(1)], ii->get_name()); + if(auto* ii = dynamic_cast(inst)) + return llvm::CastInst::Create(ii->get_op(), v[ii->get_operand(0)], llvm_type(ii->get_type(), ctx), ii->get_name()); + if(auto* ii = dynamic_cast(inst)){ + std::vector idx_vals; + std::transform(ii->idx_begin(), ii->idx_end(), std::back_inserter(idx_vals), + [&v](ir::value* x){ return v[x];}); + return llvm::GetElementPtrInst::Create(llvm_type(ii->get_source_elt_ty(), ctx), v[ii->get_operand(0)], idx_vals, ii->get_name()); + } + if(ir::load_inst* ii = dynamic_cast(inst)) + return new llvm::LoadInst(v[ii->get_pointer_operand()], ii->get_name()); + // unknown instruction + throw std::runtime_error("unknown conversion from ir::type to llvm::Type"); +} + +void lowering(ir::module &src, llvm::Module &dst){ + using namespace llvm; + std::map vmap; + std::map bmap; + LLVMContext &dst_ctx = dst.getContext(); + IRBuilder<> dst_builder(dst_ctx); + // iterate over functions + for(ir::function *fn: src.get_function_list()) { + // create LLVM function + Type *fn_ty = llvm_type(fn->get_type(), dst_ctx); + Function *dst_function = (Function*)dst.getOrInsertFunction(fn->get_name(), fn_ty); + // create blocks + for(ir::basic_block *block: fn->blocks()) { + BasicBlock *dst_block = BasicBlock::Create(dst_ctx, block->get_name(), dst_function); + bmap[block] = dst_block; + } + // iterate through block + for(ir::basic_block *block: fn->blocks()) { + dst_builder.SetInsertPoint(bmap[block]); + for(ir::instruction *inst: block->get_inst_list()) { + Instruction *dst_inst = llvm_inst(inst, dst_ctx, vmap, bmap); + vmap[inst] = dst_inst; + } + } + // add phi operands + } +} + + +} +} + +#endif diff --git a/include/codegen/storage_alloc.h b/include/codegen/storage_alloc.h new file mode 100644 index 
000000000..b2112b20b --- /dev/null +++ b/include/codegen/storage_alloc.h @@ -0,0 +1,10 @@ +#ifndef TDL_INCLUDE_IR_CODEGEN_STORAGE_ALLOC_H +#define TDL_INCLUDE_IR_CODEGEN_STORAGE_ALLOC_H + +namespace tdl{ +namespace codegen{ + +} +} + +#endif diff --git a/include/codegen/tune.h b/include/codegen/tune.h new file mode 100644 index 000000000..f1871167a --- /dev/null +++ b/include/codegen/tune.h @@ -0,0 +1,10 @@ +#ifndef TDL_INCLUDE_IR_CODEGEN_TUNE_H +#define TDL_INCLUDE_IR_CODEGEN_TUNE_H + +namespace tdl{ +namespace codegen{ + +} +} + +#endif diff --git a/include/ir/constant.h b/include/ir/constant.h index 6dc8ecec8..dff6606f8 100644 --- a/include/ir/constant.h +++ b/include/ir/constant.h @@ -56,7 +56,7 @@ private: class global_value: public constant { public: enum linkage_types_t { - internal + external }; public: diff --git a/include/ir/function.h b/include/ir/function.h index 52b1579d7..4becaa606 100644 --- a/include/ir/function.h +++ b/include/ir/function.h @@ -11,6 +11,7 @@ namespace ir{ class function; class function_type; class module; +class basic_block; /* Argument */ class argument: public value{ @@ -25,29 +26,41 @@ private: unsigned arg_no_; }; +/* Attribute */ +class attribute { + +}; + /* Function */ class function: public global_object{ typedef std::vector args_t; typedef args_t::iterator arg_iterator; typedef args_t::const_iterator const_arg_iterator; + + typedef std::vector blocks_t; + typedef blocks_t::iterator block_iterator; + typedef blocks_t::const_iterator const_block_iterator; + private: function(function_type *ty, linkage_types_t linkage, const std::string &name = "", module *parent = nullptr); public: - arg_iterator arg_begin() { return args_.begin(); } - arg_iterator arg_end() { return args_.end(); } - const_arg_iterator arg_begin() const { return args_.begin(); } - const_arg_iterator arg_end() const { return args_.end(); } + // arguments + const args_t &args() { return args_; } // Factory methods static function *create(function_type *ty, linkage_types_t linkage, const std::string &name, module *mod); + // blocks + const blocks_t &blocks() { return blocks_; } + void insert_block(basic_block* block, basic_block *next = nullptr); private: module *parent_; - args_t args_; bool init_; function_type *fn_ty_; + args_t args_; + blocks_t blocks_; }; } diff --git a/include/ir/instructions.h b/include/ir/instructions.h index 53d437082..8ec38cd1e 100644 --- a/include/ir/instructions.h +++ b/include/ir/instructions.h @@ -69,6 +69,13 @@ public: // Get operand op_t get_op() const { return op_; } + // Bool + bool is_terminator() const; + bool is_binary_op() const; + bool is_int_div_rem() const; + bool is_shift() const; + bool is_cast() const; + // Wraps void set_has_no_unsigned_wrap(bool b = true) { has_no_unsigned_wrap_ = b; } void set_has_no_signed_wrap(bool b = true) { has_no_signed_wrap_ = b; } @@ -98,14 +105,12 @@ public: protected: cmp_inst(type *ty, pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next); - - static type* make_cmp_result_type(type *ty); - static bool is_fp_predicate(pred_t pred); static bool is_int_predicate(pred_t pred); + static type* make_cmp_result_type(type *ty); public: - + pred_t get_pred() const { return pred_; } private: pred_t pred_; @@ -152,7 +157,10 @@ private: static bool is_valid(op_t op, value *arg, type *ty); public: - // Factory methods + // accessors + op_t get_op() const { return op_; } + + // factory methods static cast_inst *create(op_t op, value *arg, type *ty, const std::string &name = "", instruction *next = 
nullptr); static cast_inst *create_integer_cast(value *arg, type *ty, bool is_signed, @@ -191,7 +199,6 @@ class terminator_inst: public instruction{ }; // return instruction - class return_inst: public terminator_inst{ return_inst(context &ctx, value *ret_val, instruction *next); @@ -206,26 +213,43 @@ public: static return_inst* create(context &ctx, value *ret_val = nullptr, instruction *next = nullptr); }; -// conditional/unconditional branch instruction - +// base branch instruction class branch_inst: public terminator_inst{ - branch_inst(basic_block *dst, instruction *next); - branch_inst(basic_block *if_dst, basic_block *else_dst, value *cond, instruction *next); +protected: + using terminator_inst::terminator_inst; public: - - // factory methods static branch_inst* create(basic_block *dest, instruction *next = nullptr); static branch_inst* create(value *cond, basic_block *if_dest, basic_block *else_dest, instruction *next = nullptr); }; +// conditional branch +class cond_branch_inst: public branch_inst { + cond_branch_inst(basic_block *if_dst, basic_block *else_dst, value *cond, instruction *next); + friend class branch_inst; + +public: + basic_block *get_true_dest() { return (basic_block*)get_operand(0); } + basic_block *get_false_dest() { return (basic_block*)get_operand(1); } + value *get_cond() { return get_operand(2); } +}; + +// unconditional branch +class uncond_branch_inst: public branch_inst { + friend class branch_inst; + uncond_branch_inst(basic_block *dst, instruction *next); + +public: + basic_block *get_dest() { return (basic_block*)get_operand(0); } +}; //===----------------------------------------------------------------------===// // getelementptr_inst classes //===----------------------------------------------------------------------===// class getelementptr_inst: public instruction{ +private: getelementptr_inst(type *pointee_ty, value *ptr, const std::vector &idx, const std::string &name, instruction *next); private: @@ -234,6 +258,12 @@ private: static type *get_indexed_type(type *ty, const std::vector &idx); public: + // accessors + type *get_source_elt_ty() { return source_elt_ty; } + op_iterator idx_begin() { return op_begin() + 1; } + op_iterator idx_end() { return op_end(); } + + // factory methods static getelementptr_inst* create(type *pointee_ty, value *ptr, const std::vector &idx, const std::string &name = "", instruction *next = nullptr); @@ -250,6 +280,9 @@ class load_inst: public unary_inst{ load_inst(value *ptr, const std::string &name, instruction *next); public: + // accessors + value *get_pointer_operand() { return get_operand(0); } + // factory method static load_inst* create(value *ptr, const std::string &name = "", instruction *next = nullptr); diff --git a/include/ir/module.h b/include/ir/module.h index 3b6536cda..d38fc05d5 100644 --- a/include/ir/module.h +++ b/include/ir/module.h @@ -13,13 +13,26 @@ class basic_block; class phi_node; class value; class context; +class function; +class attribute; +class function_type; +class constant; +class global_value; /* Module */ class module { typedef std::pair val_key_t; + friend class function; + +public: + typedef std::map symbols_map_t; + typedef std::vector functions_list_t; + +private: phi_node *make_phi(type *ty, unsigned num_values, basic_block *block); value *add_phi_operands(const std::string& name, phi_node *&phi); value *get_value_recursive(const std::string& name, basic_block *block); + void push_function(function *fn) { functions_.push_back(fn); } public: module(const std::string &name, 
context &ctx); @@ -33,6 +46,11 @@ public: value *get_value(const std::string& name); // Seal block -- no more predecessors will be added void seal_block(basic_block *block); + // Functions + const functions_list_t &get_function_list() const { return functions_; } + functions_list_t &get_function_list() { return functions_; } + function *get_or_insert_function(const std::string &name, function_type *ty); + private: std::string name_; @@ -41,6 +59,8 @@ private: std::map values_; std::set sealed_blocks_; std::map> incomplete_phis_; + functions_list_t functions_; + symbols_map_t symbols_; }; } diff --git a/include/ir/type.h b/include/ir/type.h index c089df4c2..fb268326a 100644 --- a/include/ir/type.h +++ b/include/ir/type.h @@ -13,6 +13,11 @@ class integer_type; /* Type */ class type { +protected: + typedef std::vector contained_tys_vec_t; + typedef contained_tys_vec_t::iterator ty_iterator; + typedef contained_tys_vec_t::const_iterator const_ty_iterator; + public: enum id_t { // primitive types @@ -91,7 +96,7 @@ private: id_t id_; protected: - std::vector contained_tys_; + contained_tys_vec_t contained_tys_; }; class integer_type: public type { @@ -162,8 +167,13 @@ private: public: // accessors - unsigned get_num_params() const { return contained_tys_.size() - 1; } - type* get_param_ty(unsigned i) const { return contained_tys_.at(1 + i); } + unsigned get_num_params() const { return contained_tys_.size() - 1; } + const_ty_iterator params_begin() const { return contained_tys_.begin() + 1; } + const_ty_iterator params_end() const { return contained_tys_.end(); } + ty_iterator params_begin() { return contained_tys_.begin() + 1; } + ty_iterator params_end() { return contained_tys_.end(); } + type* get_param_ty(unsigned i) const { return contained_tys_.at(1 + i); } + type* get_return_ty() const { return contained_tys_.at(0); } // factory methods static function_type* get(type *ret_ty, const std::vector& param_tys); }; diff --git a/include/ir/value.h b/include/ir/value.h index c8b162071..df2f099de 100644 --- a/include/ir/value.h +++ b/include/ir/value.h @@ -57,6 +57,11 @@ private: //===----------------------------------------------------------------------===// class user: public value{ +public: + typedef std::vector ops_t; + typedef ops_t::iterator op_iterator; + typedef ops_t::const_iterator const_op_iterator; + protected: void resize_ops(unsigned n) { ops_.resize(n); } @@ -66,9 +71,11 @@ public: : value(ty, name), ops_(num_ops){ } // Operands - void set_operand(unsigned i, value *x); - value *get_operand(unsigned i); - unsigned get_num_operands(); + op_iterator op_begin() { return ops_.begin(); } + op_iterator op_end() { return ops_.end(); } + void set_operand(unsigned i, value *x); + value *get_operand(unsigned i); + unsigned get_num_operands() const ; private: std::vector ops_; diff --git a/lib/codegen.cpp b/lib/ast/lowering.cpp similarity index 98% rename from lib/codegen.cpp rename to lib/ast/lowering.cpp index 12732b1b4..9dba1e633 100644 --- a/lib/codegen.cpp +++ b/lib/ast/lowering.cpp @@ -1,6 +1,6 @@ #include #include -#include "ast.h" +#include "ast/ast.h" #include "ir/constant.h" #include "ir/function.h" #include "ir/module.h" @@ -76,7 +76,7 @@ ir::type* pointer::type_impl(ir::module*, ir::type *type) const{ // Function void function::bind_parameters(ir::module *mod, ir::function *fn) const{ - std::vector args(fn->arg_begin(), fn->arg_end()); + std::vector args = fn->args(); assert(args.size() == args_->values().size()); for(size_t i = 0; i < args.size(); i++){ parameter *param_i = 
args_->values().at(i); @@ -99,7 +99,7 @@ ir::type* function::type_impl(ir::module* mod, ir::type *type) const{ ir::value* function_definition::codegen(ir::module *mod) const{ ir::function_type *prototype = (ir::function_type*)header_->type(mod, spec_->type(mod)); const std::string &name = header_->id()->name(); - ir::function *fn = ir::function::create(prototype, ir::function::internal, name, mod); + ir::function *fn = mod->get_or_insert_function(name, prototype); header_->bind_parameters(mod, fn); ir::basic_block *entry = ir::basic_block::create(mod->get_context(), "entry", fn); mod->seal_block(entry); diff --git a/lib/ir/basic_block.cpp b/lib/ir/basic_block.cpp index 29361b3b7..25fb91a95 100644 --- a/lib/ir/basic_block.cpp +++ b/lib/ir/basic_block.cpp @@ -1,14 +1,18 @@ #include "ir/basic_block.h" #include "ir/instructions.h" #include "ir/type.h" +#include "ir/function.h" namespace tdl { namespace ir { class phi_node; + basic_block::basic_block(context &ctx, const std::string &name, function *parent): - value(type::get_label_ty(ctx), name), ctx_(ctx), parent_(parent){ + value(type::get_label_ty(ctx), name), ctx_(ctx), parent_(parent) { + if(parent_) + parent_->insert_block(this); } basic_block* basic_block::create(context &ctx, const std::string &name, function *parent){ diff --git a/lib/ir/function.cpp b/lib/ir/function.cpp index 2def17325..6d3329f21 100644 --- a/lib/ir/function.cpp +++ b/lib/ir/function.cpp @@ -1,5 +1,6 @@ #include "ir/function.h" #include "ir/type.h" +#include "ir/module.h" namespace tdl{ namespace ir{ @@ -29,11 +30,19 @@ function::function(function_type *ty, linkage_types_t linkage, type *param_ty = fn_ty_->get_param_ty(i); args_[i] = argument::create(param_ty, "", this, i); } + if(parent) + parent->push_function(this); +} + +/* basic block */ +void function::insert_block(basic_block *block, basic_block *next) { + auto it = std::find(blocks_.begin(), blocks_.end(), next); + blocks_.insert(it, block); } function *function::create(function_type *ty, linkage_types_t linkage, - const std::string &name, module *mod){ + const std::string &name, module *mod) { return new function(ty, linkage, name, mod); } diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 92a42fb00..03f85bec6 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -98,7 +98,6 @@ binary_operator *binary_operator::create_not(value *arg, const std::string &name //===----------------------------------------------------------------------===// // cmp_inst - cmp_inst::cmp_inst(type *ty, cmp_inst::pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next) : instruction(ty, 2, name, next), pred_(pred) { set_operand(0, lhs); @@ -113,8 +112,6 @@ type* cmp_inst::make_cmp_result_type(type *ty){ } - - bool cmp_inst::is_fp_predicate(pred_t pred) { return pred >= pcmp::FIRST_FCMP_PREDICATE && pred <= pcmp::LAST_FCMP_PREDICATE; } @@ -124,7 +121,6 @@ bool cmp_inst::is_int_predicate(pred_t pred) { } // icmp_inst - icmp_inst* icmp_inst::create(pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next){ assert(is_int_predicate(pred)); type *res_ty = make_cmp_result_type(lhs->get_type()); @@ -132,7 +128,6 @@ icmp_inst* icmp_inst::create(pred_t pred, value *lhs, value *rhs, const std::str } // fcmp_inst - fcmp_inst* fcmp_inst::create(pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next){ assert(is_fp_predicate(pred)); type *res_ty = make_cmp_result_type(lhs->get_type()); @@ -195,7 +190,6 @@ cast_inst 
*cast_inst::create_integer_cast(value *arg, type *ty, bool is_signed, // return_inst - return_inst::return_inst(context &ctx, value *ret_val, instruction *next) : terminator_inst(type::get_void_ty(ctx), !!ret_val, "", next){ if(ret_val) @@ -207,32 +201,32 @@ return_inst *return_inst::create(context &ctx, value *ret_val, instruction *next } -// conditional/unconditional branch +// branch_inst +branch_inst* branch_inst::create(basic_block *dst, instruction *next) { + assert(dst && "Branch destination may not be null!"); + return new uncond_branch_inst(dst, next); +} -branch_inst::branch_inst(basic_block *dst, instruction *next) - : terminator_inst(type::get_void_ty(dst->get_context()), 1, "", next){ +branch_inst* branch_inst::create(value *cond, basic_block *if_dst, basic_block *else_dst, instruction *next) { + assert(cond->get_type()->is_integer_ty(1) && "May only branch on boolean predicates!"); + return new cond_branch_inst(if_dst, else_dst, cond, next); +} + +// uncond_branch_inst +uncond_branch_inst::uncond_branch_inst(basic_block *dst, instruction *next) + : branch_inst(type::get_void_ty(dst->get_context()), 1, "", next){ set_operand(0, dst); } -branch_inst::branch_inst(basic_block *if_dst, basic_block *else_dst, value *cond, instruction *next) - : terminator_inst(type::get_void_ty(if_dst->get_context()), 3, "", next){ +// cond_branch_inst +cond_branch_inst::cond_branch_inst(basic_block *if_dst, basic_block *else_dst, value *cond, instruction *next) + : branch_inst(type::get_void_ty(if_dst->get_context()), 3, "", next){ assert(cond->get_type()->is_integer_ty(1) && "May only branch on boolean predicates!"); set_operand(0, if_dst); set_operand(1, else_dst); set_operand(2, cond); } -branch_inst* branch_inst::create(basic_block *dst, instruction *next) { - assert(dst && "Branch destination may not be null!"); - return new branch_inst(dst, next); -} - -branch_inst* branch_inst::create(value *cond, basic_block *if_dst, basic_block *else_dst, instruction *next) { - assert(cond->get_type()->is_integer_ty(1) && "May only branch on boolean predicates!"); - return new branch_inst(if_dst, else_dst, cond, next); -} - - //===----------------------------------------------------------------------===// // getelementptr_inst classes //===----------------------------------------------------------------------===// diff --git a/lib/ir/module.cpp b/lib/ir/module.cpp index d5ad53e26..6e8fca715 100644 --- a/lib/ir/module.cpp +++ b/lib/ir/module.cpp @@ -1,6 +1,8 @@ #include "ir/basic_block.h" #include "ir/module.h" #include "ir/type.h" +#include "ir/constant.h" +#include "ir/function.h" namespace tdl{ namespace ir{ @@ -86,5 +88,14 @@ void module::seal_block(ir::basic_block *block){ sealed_blocks_.insert(block); } +/* functions */ +function *module::get_or_insert_function(const std::string &name, function_type *ty) { + function *&fn = (function*&)symbols_[name]; + if(fn == nullptr) + return fn = function::create(ty, global_value::external, name, this); + return fn; +} + + } } diff --git a/lib/ir/value.cpp b/lib/ir/value.cpp index e069f593c..c1aaf3d42 100644 --- a/lib/ir/value.cpp +++ b/lib/ir/value.cpp @@ -50,7 +50,7 @@ value* user::get_operand(unsigned i){ return ops_[i]; } -unsigned user::get_num_operands(){ +unsigned user::get_num_operands() const{ return ops_.size(); } From f9ba69f1a4690d7e3c45f2e53276a04d688e0e1d Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 5 Jan 2019 19:23:00 -0500 Subject: [PATCH 034/494] [code generation] some bugfixes --- examples/matrix.cpp | 6 ++ 
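
The rework in this patch converges on a memoized, recursive lowering: each ir::value is first looked up in the value map, its operands are lowered on demand, and the freshly built llvm::Value is cached before returning. The stand-alone sketch below isolates that pattern; src_node and dst_node are hypothetical stand-ins for ir::value and llvm::Value, not types from this tree:

    #include <map>
    #include <memory>
    #include <vector>

    // Hypothetical stand-ins for ir::value and llvm::Value; the real classes
    // also carry a type, a name and an opcode payload.
    struct src_node { std::vector<src_node*> ops; int payload; };
    struct dst_node { std::vector<dst_node*> ops; int payload; };

    // Memoized recursive lowering: consult the value map first, otherwise
    // lower all operands, build the translated node, and cache it.
    dst_node *lower(src_node *v, std::map<src_node*, dst_node*> &vmap,
                    std::vector<std::unique_ptr<dst_node>> &arena) {
      auto it = vmap.find(v);
      if (it != vmap.end())
        return it->second;                     // already lowered
      std::vector<dst_node*> ops;
      for (src_node *op : v->ops)
        ops.push_back(lower(op, vmap, arena)); // operands first
      arena.push_back(std::make_unique<dst_node>());
      arena.back()->ops = std::move(ops);
      arena.back()->payload = v->payload;
      vmap[v] = arena.back().get();            // memoize before returning
      return arena.back().get();
    }

One caveat the real pass must handle: recursing through operands would never terminate on phi nodes, whose operands can reach the phi itself, which is why the lowering creates phis with no incoming edges and wires them up in a separate pass.
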
include/codegen/lowering.h | 181 +++++++++++++++++++++++++++---------- include/ir/constant.h | 2 + include/ir/instructions.h | 4 +- include/ir/value.h | 1 + 5 files changed, 143 insertions(+), 51 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 41e6120e2..bae0ee52b 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -4,6 +4,9 @@ #include "ir/context.h" #include "ir/module.h" #include "codegen/lowering.h" +#include "llvm/IR/IRPrintingPasses.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/LLVMContext.h" typedef struct yy_buffer_state * YY_BUFFER_STATE; extern int yyparse(); @@ -35,6 +38,9 @@ int main() { tdl::ir::context context; tdl::ir::module module("matrix", context); program->codegen(&module); + llvm::LLVMContext llvm_context; + llvm::Module llvm_module("test", llvm_context); + tdl::codegen::lowering(module, llvm_module); // llvm::PrintModulePass print(llvm::outs()); // llvm::AnalysisManager analysis; // print.run(*module.handle(), analysis); diff --git a/include/codegen/lowering.h b/include/codegen/lowering.h index d418c38b7..673bcbf09 100644 --- a/include/codegen/lowering.h +++ b/include/codegen/lowering.h @@ -12,79 +12,146 @@ namespace tdl{ namespace codegen{ -/* convert ir::type to llvm::Type */ +using namespace llvm; -llvm::Type *llvm_type(ir::type *ty, llvm::LLVMContext &ctx) { +/* convert ir::type to Type */ +Type *llvm_type(ir::type *ty, LLVMContext &ctx) { // function if(auto* tt = dynamic_cast(ty)){ - llvm::Type *return_ty = llvm_type(tt->get_return_ty(), ctx); - std::vector param_tys; + Type *return_ty = llvm_type(tt->get_return_ty(), ctx); + std::vector param_tys; std::transform(tt->params_begin(), tt->params_end(), std::back_inserter(param_tys), [&ctx](ir::type* t){ return llvm_type(t, ctx);}); - return llvm::FunctionType::get(return_ty, param_tys, false); + return FunctionType::get(return_ty, param_tys, false); } // pointer if(ty->is_pointer_ty()){ - llvm::Type *elt_ty = llvm_type(ty->get_pointer_element_ty(), ctx); + Type *elt_ty = llvm_type(ty->get_pointer_element_ty(), ctx); unsigned addr_space = ty->get_pointer_address_space(); - return llvm::PointerType::get(elt_ty, addr_space); + return PointerType::get(elt_ty, addr_space); } // integer if(ty->is_integer_ty()){ unsigned bitwidth = ty->get_integer_bitwidth(); - return llvm::IntegerType::get(ctx, bitwidth); + return IntegerType::get(ctx, bitwidth); } // primitive types switch(ty->get_type_id()){ - case ir::type::VoidTyID: return llvm::Type::getVoidTy(ctx); - case ir::type::HalfTyID: return llvm::Type::getHalfTy(ctx); - case ir::type::FloatTyID: return llvm::Type::getFloatTy(ctx); - case ir::type::DoubleTyID: return llvm::Type::getDoubleTy(ctx); - case ir::type::X86_FP80TyID: return llvm::Type::getX86_FP80Ty(ctx); - case ir::type::PPC_FP128TyID: return llvm::Type::getPPC_FP128Ty(ctx); - case ir::type::LabelTyID: return llvm::Type::getLabelTy(ctx); - case ir::type::MetadataTyID: return llvm::Type::getMetadataTy(ctx); - case ir::type::TokenTyID: return llvm::Type::getTokenTy(ctx); + case ir::type::VoidTyID: return Type::getVoidTy(ctx); + case ir::type::HalfTyID: return Type::getHalfTy(ctx); + case ir::type::FloatTyID: return Type::getFloatTy(ctx); + case ir::type::DoubleTyID: return Type::getDoubleTy(ctx); + case ir::type::X86_FP80TyID: return Type::getX86_FP80Ty(ctx); + case ir::type::PPC_FP128TyID: return Type::getPPC_FP128Ty(ctx); + case ir::type::LabelTyID: return Type::getLabelTy(ctx); + case ir::type::MetadataTyID: return Type::getMetadataTy(ctx); + case ir::type::TokenTyID: 
return Type::getTokenTy(ctx); default: break; } // unknown type - throw std::runtime_error("unknown conversion from ir::type to llvm::Type"); + throw std::runtime_error("unknown conversion from ir::type to Type"); } -/* convert ir::instruction to llvm::Instruction */ -llvm::Instruction *llvm_inst(ir::instruction *inst, llvm::LLVMContext & ctx, - std::map &v, - std::map &b) { - if(auto* ii = dynamic_cast(inst)) - return llvm::BranchInst::Create(b[ii->get_true_dest()], b[ii->get_false_dest()], v[ii->get_cond()]); - if(auto* ii = dynamic_cast(inst)) - return llvm::BranchInst::Create(b[ii->get_dest()]); - if(auto* ii = dynamic_cast(inst)) - return llvm::PHINode::Create(llvm_type(ii->get_type(), ctx), ii->get_num_operands(), ii->get_name()); - if(auto* ii = dynamic_cast(inst)) - return llvm::ReturnInst::Create(ctx, v[ii->get_return_value()]); - if(auto* ii = dynamic_cast(inst)) - return llvm::BinaryOperator::Create(ii->get_op(), v[ii->get_operand(0)], v[ii->get_operand(1)], ii->get_name()); - if(auto* ii = dynamic_cast(inst)) - return llvm::CmpInst::Create(llvm::Instruction::ICmp, ii->get_pred(), v[ii->get_operand(0)], v[ii->get_operand(1)], ii->get_name()); - if(auto* ii = dynamic_cast(inst)) - return llvm::FCmpInst::Create(llvm::Instruction::FCmp, ii->get_pred(), v[ii->get_operand(0)], v[ii->get_operand(1)], ii->get_name()); - if(auto* ii = dynamic_cast(inst)) - return llvm::CastInst::Create(ii->get_op(), v[ii->get_operand(0)], llvm_type(ii->get_type(), ctx), ii->get_name()); - if(auto* ii = dynamic_cast(inst)){ - std::vector idx_vals; - std::transform(ii->idx_begin(), ii->idx_end(), std::back_inserter(idx_vals), - [&v](ir::value* x){ return v[x];}); - return llvm::GetElementPtrInst::Create(llvm_type(ii->get_source_elt_ty(), ctx), v[ii->get_operand(0)], idx_vals, ii->get_name()); +Value* llvm_value(ir::value *v, LLVMContext &ctx, + std::map &vmap, + std::map &bmap); + +/* convert ir::constant to Constant */ +Constant *llvm_constant(ir::constant *cst, LLVMContext &ctx) { + Type *dst_ty = llvm_type(cst->get_type(), ctx); + if(auto* cc = dynamic_cast(cst)) + return ConstantInt::get(dst_ty, cc->get_value()); + if(auto* cc = dynamic_cast(cst)) + return ConstantFP::get(dst_ty, cc->get_value()); + // unknown constant + throw std::runtime_error("unknown conversion from ir::constant to Constant"); +} + + +/* convert ir::instruction to Instruction */ +Instruction *llvm_inst(ir::instruction *inst, LLVMContext & ctx, + std::map &vmap, + std::map &bmap) { + auto value = [&](ir::value *x) { return llvm_value(x, ctx, vmap, bmap); }; + auto block = [&](ir::basic_block *x) { return bmap.at(x); }; + auto type = [&](ir::type *x) { return llvm_type(x, ctx); }; + if(auto* ii = dynamic_cast(inst)){ + BasicBlock *true_dest = block(ii->get_true_dest()); + BasicBlock *false_dest = block(ii->get_false_dest()); + Value *cond = value(ii->get_cond()); + return BranchInst::Create(true_dest, false_dest, cond); + } + if(auto* ii = dynamic_cast(inst)){ + BasicBlock *dest = block(ii->get_dest()); + return BranchInst::Create(dest); + } + if(auto* ii = dynamic_cast(inst)){ + Type *ty = type(ii->get_type()); + unsigned num_ops = ii->get_num_operands(); + return PHINode::Create(ty, num_ops, ii->get_name()); + } + if(auto* ii = dynamic_cast(inst)){ + Value *ret_val = value(ii->get_return_value()); + return ReturnInst::Create(ctx, ret_val); + } + if(auto* ii = dynamic_cast(inst)){ + Value *lhs = value(ii->get_operand(0)); + Value *rhs = value(ii->get_operand(1)); + return BinaryOperator::Create(ii->get_op(), lhs, rhs, 
ii->get_name()); + } + if(auto* ii = dynamic_cast(inst)){ + CmpInst::Predicate pred = ii->get_pred(); + Value *lhs = value(ii->get_operand(0)); + Value *rhs = value(ii->get_operand(1)); + return CmpInst::Create(Instruction::ICmp, pred, lhs, rhs, ii->get_name()); + } + if(auto* ii = dynamic_cast(inst)){ + CmpInst::Predicate pred = ii->get_pred(); + Value *lhs = value(ii->get_operand(0)); + Value *rhs = value(ii->get_operand(1)); + return FCmpInst::Create(Instruction::FCmp, pred, lhs, rhs, ii->get_name()); + } + if(auto* ii = dynamic_cast(inst)){ + Value *arg = value(ii->get_operand(0)); + Type *dst_ty = type(ii->get_type()); + return CastInst::Create(ii->get_op(), arg, dst_ty, ii->get_name()); + } + if(auto* ii = dynamic_cast(inst)){ + std::vector idx_vals; + std::transform(ii->idx_begin(), ii->idx_end(), std::back_inserter(idx_vals), + [&value](ir::value* x){ return value(x);}); + Type *source_ty = type(ii->get_source_elt_ty()); + Value *arg = value(ii->get_operand(0)); + return GetElementPtrInst::Create(source_ty, arg, idx_vals, ii->get_name()); + } + if(ir::load_inst* ii = dynamic_cast(inst)){ + Value *ptr = value(ii->get_pointer_operand()); + return new LoadInst(ptr, ii->get_name()); } - if(ir::load_inst* ii = dynamic_cast(inst)) - return new llvm::LoadInst(v[ii->get_pointer_operand()], ii->get_name()); // unknown instruction - throw std::runtime_error("unknown conversion from ir::type to llvm::Type"); + throw std::runtime_error("unknown conversion from ir::type to Type"); } -void lowering(ir::module &src, llvm::Module &dst){ - using namespace llvm; +Value* llvm_value(ir::value *v, LLVMContext &ctx, + std::map &vmap, + std::map &bmap) { + if(vmap.find(v) != vmap.end()) + return vmap.at(v); + // create operands + if(auto *uu = dynamic_cast(v)) + for(ir::use u: uu->ops()) + vmap[u.get()] = llvm_value(u, ctx, vmap, bmap); + // constant + if(auto *cc = dynamic_cast(v)) + return llvm_constant(cc, ctx); + // instruction + if(auto *ii = dynamic_cast(v)) + return llvm_inst(ii, ctx, vmap, bmap); + // unknown value + throw std::runtime_error("unknown conversion from ir::value to Value"); +} + +void lowering(ir::module &src, Module &dst){ std::map vmap; std::map bmap; LLVMContext &dst_ctx = dst.getContext(); @@ -93,10 +160,14 @@ void lowering(ir::module &src, llvm::Module &dst){ for(ir::function *fn: src.get_function_list()) { // create LLVM function Type *fn_ty = llvm_type(fn->get_type(), dst_ctx); - Function *dst_function = (Function*)dst.getOrInsertFunction(fn->get_name(), fn_ty); + Function *dst_fn = (Function*)dst.getOrInsertFunction(fn->get_name(), fn_ty); + // map parameters + for(unsigned i = 0; i < fn->args().size(); i++) { + vmap[fn->args()[i]] = &*(dst_fn->arg_begin() + i); + } // create blocks for(ir::basic_block *block: fn->blocks()) { - BasicBlock *dst_block = BasicBlock::Create(dst_ctx, block->get_name(), dst_function); + BasicBlock *dst_block = BasicBlock::Create(dst_ctx, block->get_name(), dst_fn); bmap[block] = dst_block; } // iterate through block @@ -108,6 +179,16 @@ void lowering(ir::module &src, llvm::Module &dst){ } } // add phi operands + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *inst: block->get_inst_list()) + if(auto *phi = dynamic_cast(inst)){ + PHINode *dst_phi = (PHINode*)vmap.at(phi); + for(unsigned i = 0; i < phi->get_num_incoming(); i++){ + ir::value *inc_val = phi->get_incoming_value(i); + ir::basic_block *inc_block = phi->get_incoming_block(i); + dst_phi->addIncoming(vmap[inc_val], bmap[inc_block]); + } + } } } diff --git 
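
That deferred wiring of phi nodes is the standard two-phase idiom for lowering SSA graphs with cycles: materialize each phi empty while walking the blocks, then add its incoming edges once every block and value exists. A self-contained illustration against the stock LLVM C++ API (independent of the tdl IR; assumes an LLVM build with IRBuilder and verifyModule available):

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Module.h"
    #include "llvm/IR/Verifier.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    // Builds a function that counts from 0 to 10 through a loop-carried phi.
    int main() {
      LLVMContext ctx;
      Module mod("phi-demo", ctx);
      IRBuilder<> b(ctx);
      Type *i32 = b.getInt32Ty();
      Function *fn = Function::Create(FunctionType::get(i32, false),
                                      Function::ExternalLinkage, "count", &mod);
      BasicBlock *entry = BasicBlock::Create(ctx, "entry", fn);
      BasicBlock *loop  = BasicBlock::Create(ctx, "loop", fn);
      BasicBlock *done  = BasicBlock::Create(ctx, "done", fn);
      b.SetInsertPoint(entry);
      b.CreateBr(loop);
      b.SetInsertPoint(loop);
      PHINode *i = b.CreatePHI(i32, 2, "i");   // phase 1: no incoming edges yet
      Value *next = b.CreateAdd(i, b.getInt32(1), "next");
      Value *cond = b.CreateICmpSLT(next, b.getInt32(10), "cond");
      b.CreateCondBr(cond, loop, done);
      b.SetInsertPoint(done);
      b.CreateRet(next);
      // phase 2: all predecessors and values exist, wire the phi
      i->addIncoming(b.getInt32(0), entry);
      i->addIncoming(next, loop);
      verifyModule(mod, &errs());              // a half-wired phi fails here
      mod.print(outs(), nullptr);
      return 0;
    }

Skipping phase 2 leaves a phi with fewer incoming edges than its block has predecessors, which the verifier rejects; that is the class of bug these commits are chasing.
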
a/include/ir/constant.h b/include/ir/constant.h index dff6606f8..57af4ad33 100644 --- a/include/ir/constant.h +++ b/include/ir/constant.h @@ -33,6 +33,7 @@ class constant_int: public constant{ constant_int(type *ty, uint64_t value); public: + uint64_t get_value() const { return value_; } static constant *get(type *ty, uint64_t value); private: @@ -44,6 +45,7 @@ class constant_fp: public constant{ constant_fp(context &ctx, double value); public: + double get_value() { return value_; } static constant* get_negative_zero(type *ty); static constant* get_zero_value_for_negation(type *ty); static constant *get(context &ctx, double v); diff --git a/include/ir/instructions.h b/include/ir/instructions.h index 8ec38cd1e..8632d9098 100644 --- a/include/ir/instructions.h +++ b/include/ir/instructions.h @@ -42,7 +42,9 @@ private: public: void set_incoming_value(unsigned i, value *v); void set_incoming_block(unsigned i, basic_block *block); - + value *get_incoming_value(unsigned i) { return get_operand(i); } + basic_block *get_incoming_block(unsigned i) { return blocks_[i]; } + unsigned get_num_incoming() { return get_num_operands(); } void add_incoming(value *v, basic_block *block); // Factory methods diff --git a/include/ir/value.h b/include/ir/value.h index df2f099de..35c6ca839 100644 --- a/include/ir/value.h +++ b/include/ir/value.h @@ -71,6 +71,7 @@ public: : value(ty, name), ops_(num_ops){ } // Operands + const std::vector& ops() { return ops_; } op_iterator op_begin() { return ops_.begin(); } op_iterator op_end() { return ops_.end(); } void set_operand(unsigned i, value *x); From c12ec9f2143f3b439e4e2490cd592d78060b6d29 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 6 Jan 2019 00:53:11 -0500 Subject: [PATCH 035/494] [code generator] more bugfixes --- examples/matrix.cpp | 10 ++++++---- include/ast/ast.h | 11 +++++++++++ include/ast/parser.y | 2 +- include/codegen/lowering.h | 17 +++++++++-------- include/ir/function.h | 5 +++-- lib/ast/lowering.cpp | 5 +++++ lib/ir/instructions.cpp | 2 +- 7 files changed, 36 insertions(+), 16 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index bae0ee52b..6e6389185 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -7,6 +7,8 @@ #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/Module.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Support/raw_ostream.h" typedef struct yy_buffer_state * YY_BUFFER_STATE; extern int yyparse(); @@ -25,7 +27,7 @@ void test(fp32 *A, fp32 *B, fp32 *C, int32 i){\ int32 u = 1;\ u = u + i;\ if(k == 0)\ - u = u + 2;\ + j = u + 2;\ }\ }\ "; @@ -41,8 +43,8 @@ int main() { llvm::LLVMContext llvm_context; llvm::Module llvm_module("test", llvm_context); tdl::codegen::lowering(module, llvm_module); -// llvm::PrintModulePass print(llvm::outs()); -// llvm::AnalysisManager analysis; -// print.run(*module.handle(), analysis); + llvm::PrintModulePass print(llvm::outs()); + llvm::AnalysisManager analysis; + print.run(llvm_module, analysis); return 0; } diff --git a/include/ast/ast.h b/include/ast/ast.h index 031d49f74..d9b24f8e4 100644 --- a/include/ast/ast.h +++ b/include/ast/ast.h @@ -264,6 +264,17 @@ public: class statement: public node{ }; +class expression_statement: public statement{ +public: + expression_statement(node *expr) + : expr_((expression*)expr){ } + + ir::value* codegen(ir::module * mod) const; + +private: + expression *expr_; +}; + class compound_statement: public statement{ typedef list* declarations_t; typedef list* statements_t; diff --git 
a/include/ast/parser.y b/include/ast/parser.y index bcfd7498d..2806f49ea 100644 --- a/include/ast/parser.y +++ b/include/ast/parser.y @@ -278,7 +278,7 @@ statement_list expression_statement : ';' { $$ = new no_op(); } - | expression ';' { $$ = $1; } + | expression ';' { $$ = new expression_statement($1); } ; selection_statement diff --git a/include/codegen/lowering.h b/include/codegen/lowering.h index 673bcbf09..a660e5706 100644 --- a/include/codegen/lowering.h +++ b/include/codegen/lowering.h @@ -91,8 +91,8 @@ Instruction *llvm_inst(ir::instruction *inst, LLVMContext & ctx, return PHINode::Create(ty, num_ops, ii->get_name()); } if(auto* ii = dynamic_cast(inst)){ - Value *ret_val = value(ii->get_return_value()); - return ReturnInst::Create(ctx, ret_val); + ir::value *ret_val = ii->get_return_value(); + return ReturnInst::Create(ctx, ret_val?value(ret_val):nullptr); } if(auto* ii = dynamic_cast(inst)){ Value *lhs = value(ii->get_operand(0)); @@ -139,9 +139,9 @@ Value* llvm_value(ir::value *v, LLVMContext &ctx, return vmap.at(v); // create operands if(auto *uu = dynamic_cast(v)) - for(ir::use u: uu->ops()) + for(ir::use u: uu->ops()){ vmap[u.get()] = llvm_value(u, ctx, vmap, bmap); - // constant + } if(auto *cc = dynamic_cast(v)) return llvm_constant(cc, ctx); // instruction @@ -159,12 +159,12 @@ void lowering(ir::module &src, Module &dst){ // iterate over functions for(ir::function *fn: src.get_function_list()) { // create LLVM function - Type *fn_ty = llvm_type(fn->get_type(), dst_ctx); - Function *dst_fn = (Function*)dst.getOrInsertFunction(fn->get_name(), fn_ty); + FunctionType *fn_ty = (FunctionType*)llvm_type(fn->get_fn_type(), dst_ctx); + Function *dst_fn = Function::Create(fn_ty, Function::ExternalLinkage, "kernel", &dst); +// std::cout << ((FunctionType*)fn_ty)->getNumParams() << std::endl; // map parameters - for(unsigned i = 0; i < fn->args().size(); i++) { + for(unsigned i = 0; i < fn->args().size(); i++) vmap[fn->args()[i]] = &*(dst_fn->arg_begin() + i); - } // create blocks for(ir::basic_block *block: fn->blocks()) { BasicBlock *dst_block = BasicBlock::Create(dst_ctx, block->get_name(), dst_fn); @@ -176,6 +176,7 @@ void lowering(ir::module &src, Module &dst){ for(ir::instruction *inst: block->get_inst_list()) { Instruction *dst_inst = llvm_inst(inst, dst_ctx, vmap, bmap); vmap[inst] = dst_inst; + dst_builder.Insert(dst_inst); } } // add phi operands diff --git a/include/ir/function.h b/include/ir/function.h index 4becaa606..4f0762067 100644 --- a/include/ir/function.h +++ b/include/ir/function.h @@ -46,9 +46,10 @@ private: const std::string &name = "", module *parent = nullptr); public: - // arguments + // accessors const args_t &args() { return args_; } - // Factory methods + function_type* get_fn_type() { return fn_ty_; } + // factory methods static function *create(function_type *ty, linkage_types_t linkage, const std::string &name, module *mod); // blocks diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 9dba1e633..7d9d79235 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -117,6 +117,11 @@ ir::value* compound_statement::codegen(ir::module* mod) const{ return nullptr; } +/* expression statement */ +ir::value* expression_statement::codegen(ir::module *mod) const{ + return expr_->codegen(mod); +} + /* Iteration statement */ ir::value* iteration_statement::codegen(ir::module *mod) const{ ir::builder &builder = mod->get_builder(); diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 03f85bec6..3dd2ccd44 100644 --- 
a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -191,7 +191,7 @@ cast_inst *cast_inst::create_integer_cast(value *arg, type *ty, bool is_signed, // return_inst return_inst::return_inst(context &ctx, value *ret_val, instruction *next) - : terminator_inst(type::get_void_ty(ctx), !!ret_val, "", next){ + : terminator_inst(type::get_void_ty(ctx), ret_val!=nullptr, "", next){ if(ret_val) set_operand(0, ret_val); } From 6bfceae4a65ca939aa42c60c096b8735dbba616b Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 6 Jan 2019 03:36:56 -0500 Subject: [PATCH 036/494] [code generation] some more bugfixes --- lib/ast/lowering.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 7d9d79235..24d322524 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -105,6 +105,7 @@ ir::value* function_definition::codegen(ir::module *mod) const{ mod->seal_block(entry); mod->get_builder().set_insert_point(entry); body_->codegen(mod); + std::cout << mod->get_builder().get_insert_block()->get_name() << std::endl; mod->get_builder().create_ret_void(); return nullptr; } @@ -128,17 +129,18 @@ ir::value* iteration_statement::codegen(ir::module *mod) const{ ir::context &ctx = mod->get_context(); ir::function *fn = builder.get_insert_block()->get_parent(); ir::basic_block *loop_bb = ir::basic_block::create(ctx, "loop", fn); - ir::basic_block *next_bb = ir::basic_block::create(ctx, "postloop", fn); init_->codegen(mod); builder.create_br(loop_bb); builder.set_insert_point(loop_bb); statements_->codegen(mod); exec_->codegen(mod); ir::value *cond = stop_->codegen(mod); + ir::basic_block *next_bb = ir::basic_block::create(ctx, "postloop", fn); builder.create_cond_br(cond, loop_bb, next_bb); - builder.set_insert_point(next_bb); mod->seal_block(loop_bb); + mod->seal_block(builder.get_insert_block()); mod->seal_block(next_bb); + builder.set_insert_point(next_bb); return nullptr; } @@ -161,11 +163,13 @@ ir::value* selection_statement::codegen(ir::module* mod) const{ then_value_->codegen(mod); if(else_value_) builder.create_br(endif_bb); + mod->seal_block(then_bb); // Else if(else_value_){ builder.set_insert_point(else_bb); else_value_->codegen(mod); builder.create_br(endif_bb); + mod->seal_block(else_bb); } // Endif builder.set_insert_point(endif_bb); From 179890c7ad3df5e0c35e53a7ce4860399b134718 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 6 Jan 2019 15:16:02 -0500 Subject: [PATCH 037/494] [ast] laying down the ground work for on-the-fly phi-node simplification --- examples/matrix.cpp | 2 +- include/codegen/lowering.h | 4 +++- include/ir/module.h | 1 + include/ir/value.h | 11 ++++++++++- lib/ast/lowering.cpp | 1 - lib/ir/instructions.cpp | 11 +++++------ lib/ir/module.cpp | 23 ++++++++++++++++++++++- lib/ir/value.cpp | 21 ++++++++++++++++++--- 8 files changed, 60 insertions(+), 14 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 6e6389185..925c9edd6 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -23,7 +23,7 @@ void test(fp32 *A, fp32 *B, fp32 *C, int32 i){\ int32 j = 1;\ int32 k;\ i = i + j;\ - for(k = 0; k < 10; k = k+1){\ + for(k = 0; k < 10; k = k+5){\ int32 u = 1;\ u = u + i;\ if(k == 0)\ diff --git a/include/codegen/lowering.h b/include/codegen/lowering.h index a660e5706..208f91b40 100644 --- a/include/codegen/lowering.h +++ b/include/codegen/lowering.h @@ -187,7 +187,9 @@ void lowering(ir::module &src, Module &dst){ for(unsigned i = 0; i < phi->get_num_incoming(); i++){ 
ir::value *inc_val = phi->get_incoming_value(i); ir::basic_block *inc_block = phi->get_incoming_block(i); - dst_phi->addIncoming(vmap[inc_val], bmap[inc_block]); + Value *llvm_inc_val = llvm_value(inc_val, dst_ctx, vmap, bmap); + BasicBlock *llvm_block = bmap[inc_block]; + dst_phi->addIncoming(llvm_inc_val, llvm_block); } } } diff --git a/include/ir/module.h b/include/ir/module.h index d38fc05d5..d016b68e3 100644 --- a/include/ir/module.h +++ b/include/ir/module.h @@ -30,6 +30,7 @@ public: private: phi_node *make_phi(type *ty, unsigned num_values, basic_block *block); + value *try_remove_trivial_phis(ir::phi_node *&phi); value *add_phi_operands(const std::string& name, phi_node *&phi); value *get_value_recursive(const std::string& name, basic_block *block); void push_function(function *fn) { functions_.push_back(fn); } diff --git a/include/ir/value.h b/include/ir/value.h index 35c6ca839..a3d71545b 100644 --- a/include/ir/value.h +++ b/include/ir/value.h @@ -10,6 +10,7 @@ namespace ir{ class type; class use; +class user; //===----------------------------------------------------------------------===// // value class @@ -21,7 +22,9 @@ public: value(type *ty, const std::string &name = ""); virtual ~value(){ } // uses - void add_use(use *arg); + void add_use(use arg); + const std::vector &get_uses() { return uses_; } + virtual void replace_all_uses_with(value *target); // name void set_name(const std::string &name); const std::string &get_name() const { return name_; } @@ -30,6 +33,9 @@ public: private: type *ty_; std::string name_; + +protected: + std::vector uses_; }; //===----------------------------------------------------------------------===// @@ -78,6 +84,9 @@ public: value *get_operand(unsigned i); unsigned get_num_operands() const ; + // Utils + void replace_all_uses_with(value *target); + private: std::vector ops_; }; diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 24d322524..79a589776 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -105,7 +105,6 @@ ir::value* function_definition::codegen(ir::module *mod) const{ mod->seal_block(entry); mod->get_builder().set_insert_point(entry); body_->codegen(mod); - std::cout << mod->get_builder().get_insert_block()->get_name() << std::endl; mod->get_builder().create_ret_void(); return nullptr; } diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 3dd2ccd44..3084f6d10 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -26,7 +26,9 @@ instruction::instruction(type *ty, unsigned num_ops, const std::string &name, in //===----------------------------------------------------------------------===// phi_node::phi_node(type *ty, unsigned num_reserved, std::string const &name, instruction *next) - : instruction(ty, num_reserved, name, next), blocks_(num_reserved){ } + : instruction(ty, 0, name, next) { + blocks_.reserve(num_reserved); +} // Set incoming value void phi_node::set_incoming_value(unsigned i, value *v){ @@ -44,11 +46,8 @@ void phi_node::set_incoming_block(unsigned i, basic_block *block){ // Add incoming void phi_node::add_incoming(value *v, basic_block *block){ - if(get_num_operands()==num_reserved_){ - num_reserved_++; - resize_ops(num_reserved_); - blocks_.resize(num_reserved_); - } + resize_ops(get_num_operands() + 1); + blocks_.resize(get_num_operands() + 1); set_incoming_value(get_num_operands() - 1, v); set_incoming_block(get_num_operands() - 1, block); } diff --git a/lib/ir/module.cpp b/lib/ir/module.cpp index 6e8fca715..2963e757f 100644 --- a/lib/ir/module.cpp +++ 
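
The seal_block / add_phi_operands / try_remove_trivial_phis trio taking shape here matches the on-the-fly SSA construction of Braun et al. (CC 2013): a phi whose incoming values are all one value, or the phi itself, is redundant and can be replaced by that value, which may in turn make the phis that used it trivial. A stand-alone sketch with a simplified value type (hypothetical, for illustration; the real pass also erases the phi from its block):

    #include <algorithm>
    #include <set>
    #include <vector>

    struct value {
      std::vector<value*> ops;   // incoming values (empty for non-phis)
      std::set<value*> users;    // every user reading this value
      bool is_phi = false;
    };

    void replace_all_uses_with(value *phi, value *same) {
      for (value *u : phi->users) {
        std::replace(u->ops.begin(), u->ops.end(), phi, same);
        same->users.insert(u);
      }
      phi->users.clear();
    }

    // Returns the value the phi reduces to, or the phi itself if it
    // genuinely merges two distinct values.
    value *try_remove_trivial_phi(value *phi) {
      value *same = nullptr;
      for (value *op : phi->ops) {
        if (op == same || op == phi)
          continue;              // duplicate or self-reference
        if (same)
          return phi;            // merges at least two values: non-trivial
        same = op;
      }
      if (!same)
        return phi;              // unreachable or entry block; real pass asserts
      std::set<value*> users = phi->users;  // copy: the set mutates below
      replace_all_uses_with(phi, same);
      for (value *u : users)
        if (u->is_phi && u != phi)
          try_remove_trivial_phi(u);        // removal can cascade
      return same;
    }
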
b/lib/ir/module.cpp @@ -39,13 +39,34 @@ ir::phi_node* module::make_phi(ir::type *ty, unsigned num_values, ir::basic_bloc return res; } +ir::value *module::try_remove_trivial_phis(ir::phi_node *&phi){ + ir::value *same = nullptr; + for(ir::value *op: phi->ops()){ + // unique value or self-reference + if(op == same || op == phi) + continue; + // the phi-node merges at least two values; non-trivial + if(same) + return phi; + same = op; + } + assert(same && "the phi-node is unreachable or in the start block"); + std::vector uses = phi->get_uses(); + phi->replace_all_uses_with(same); + for(ir::use &u: uses) + if(auto *uphi = dynamic_cast(u.get())) + if(uphi != phi) + try_remove_trivial_phis(uphi); + return same; +} + ir::value *module::add_phi_operands(const std::string& name, ir::phi_node *&phi){ ir::basic_block *block = phi->get_parent(); for(ir::basic_block *pred: block->get_predecessors()){ ir::value *value = get_value(name, pred); phi->add_incoming(value, pred); } - return phi; + return try_remove_trivial_phis(phi); } ir::value *module::get_value_recursive(const std::string& name, ir::basic_block *block) { diff --git a/lib/ir/value.cpp b/lib/ir/value.cpp index c1aaf3d42..9f25e21c0 100644 --- a/lib/ir/value.cpp +++ b/lib/ir/value.cpp @@ -14,17 +14,26 @@ value::value(type *ty, const std::string &name): ty_(ty){ set_name(name); } +void value::add_use(use arg) { + uses_.push_back(arg); +} + // TODO: automatic naming scheme + update symbol table void value::set_name(const std::string &name){ name_ = name; } +void value::replace_all_uses_with(value *target){ + throw std::runtime_error("not implemented"); +} + //===----------------------------------------------------------------------===// // use class //===----------------------------------------------------------------------===// void use::set(value *val){ val_ = val; + val_->add_use(*this); } value *use::operator=(value *rhs){ @@ -40,19 +49,25 @@ const use &use::operator=(const use &rhs){ //===----------------------------------------------------------------------===// // user class //===----------------------------------------------------------------------===// -void user::set_operand(unsigned i, value *x){ +void user::set_operand(unsigned i, value *x) { assert(i < ops_.size() && "set_operand() out of range!"); ops_[i] = x; } -value* user::get_operand(unsigned i){ +value* user::get_operand(unsigned i) { assert(i < ops_.size() && "get_operand() out of range!"); return ops_[i]; } -unsigned user::get_num_operands() const{ +unsigned user::get_num_operands() const { return ops_.size(); } +void user::replace_all_uses_with(value *target) { + for(use &u: uses_){ + u.set(target); + } +} + } } From 0dd4a52ce5d03663104f7b9bd19e6c974fa1cd4c Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 6 Jan 2019 21:33:53 -0500 Subject: [PATCH 038/494] [syntax tree]: debugging phi-nodes simplification --- include/ir/value.h | 1 + lib/ir/value.cpp | 13 +++++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/include/ir/value.h b/include/ir/value.h index a3d71545b..4064e9ba2 100644 --- a/include/ir/value.h +++ b/include/ir/value.h @@ -86,6 +86,7 @@ public: // Utils void replace_all_uses_with(value *target); + void replace_uses_of_with(value *before, value *after); private: std::vector ops_; diff --git a/lib/ir/value.cpp b/lib/ir/value.cpp index 9f25e21c0..ce94ef404 100644 --- a/lib/ir/value.cpp +++ b/lib/ir/value.cpp @@ -1,4 +1,5 @@ #include "ir/value.h" +#include #include namespace tdl{ @@ -64,10 +65,18 @@ unsigned user::get_num_operands() 
const { } void user::replace_all_uses_with(value *target) { - for(use &u: uses_){ - u.set(target); + for(use &u: uses_) + if(auto *usr = dynamic_cast(u.get())){ + std::cout << "replacing " << this << " by " << target << " in " << usr << std::endl; + usr->replace_uses_of_with(this, target); } } +void user::replace_uses_of_with(value *before, value *after) { + for(use &u: ops_) + if(u.get() == before) + u.set(after); +} + } } From ce1c0a62c047175b0b06bb09e8299d4dd0a1efde Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 6 Jan 2019 22:43:18 -0500 Subject: [PATCH 039/494] [syntax tree] trivial phi-node elimination --- include/codegen/lowering.h | 4 ++-- include/ir/basic_block.h | 1 + include/ir/instructions.h | 2 +- include/ir/value.h | 34 ++++++++---------------------- lib/ir/basic_block.cpp | 2 ++ lib/ir/instructions.cpp | 5 +++++ lib/ir/module.cpp | 31 ++++++++++++++-------------- lib/ir/value.cpp | 42 ++++++++++++++------------------------ 8 files changed, 49 insertions(+), 72 deletions(-) diff --git a/include/codegen/lowering.h b/include/codegen/lowering.h index 208f91b40..b3c62c685 100644 --- a/include/codegen/lowering.h +++ b/include/codegen/lowering.h @@ -139,8 +139,8 @@ Value* llvm_value(ir::value *v, LLVMContext &ctx, return vmap.at(v); // create operands if(auto *uu = dynamic_cast(v)) - for(ir::use u: uu->ops()){ - vmap[u.get()] = llvm_value(u, ctx, vmap, bmap); + for(ir::value* u: uu->ops()){ + vmap[u] = llvm_value(u, ctx, vmap, bmap); } if(auto *cc = dynamic_cast(v)) return llvm_constant(cc, ctx); diff --git a/include/ir/basic_block.h b/include/ir/basic_block.h index f6c7897f6..a01cff008 100644 --- a/include/ir/basic_block.h +++ b/include/ir/basic_block.h @@ -36,6 +36,7 @@ public: // get instruction list inst_list_t &get_inst_list() { return inst_list_; } + void erase(instruction *i) { inst_list_.remove(i); } // instruction iterator functions inline iterator begin() { return inst_list_.begin(); } diff --git a/include/ir/instructions.h b/include/ir/instructions.h index 8632d9098..920c47c85 100644 --- a/include/ir/instructions.h +++ b/include/ir/instructions.h @@ -21,11 +21,11 @@ protected: instruction(type *ty, unsigned num_ops, const std::string &name = "", instruction *next = nullptr); public: - // parent void set_parent(basic_block *block) { parent_ = block; } const basic_block *get_parent() const { return parent_; } basic_block *get_parent() { return parent_; } + void erase_from_parent(); private: basic_block *parent_; diff --git a/include/ir/value.h b/include/ir/value.h index 4064e9ba2..cce66949e 100644 --- a/include/ir/value.h +++ b/include/ir/value.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace tdl{ namespace ir{ @@ -22,8 +23,9 @@ public: value(type *ty, const std::string &name = ""); virtual ~value(){ } // uses - void add_use(use arg); - const std::vector &get_uses() { return uses_; } + void add_use(user* arg); + unsigned erase_use(user* arg); + const std::set &get_users() { return users_; } virtual void replace_all_uses_with(value *target); // name void set_name(const std::string &name); @@ -35,27 +37,7 @@ private: std::string name_; protected: - std::vector uses_; -}; - -//===----------------------------------------------------------------------===// -// use class -//===----------------------------------------------------------------------===// - -class use { -public: - // Implicit conversions to/from value - friend class value; - operator value *() const { return val_; } - value *get() const { return val_; } - value *operator->() { return val_; 
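
A recurring hazard in these use-list changes is mutating a container while iterating over it. With std::set, erase invalidates only the iterator being erased, so the canonical pattern, which the replace_all_uses_with rewrite below relies on, post-increments the iterator before erasing:

    #include <set>

    // Remove all even elements; `it++` hands erase the old iterator while
    // `it` has already advanced to the next element.
    void erase_evens(std::set<int> &s) {
      for (auto it = s.begin(); it != s.end(); ) {
        if (*it % 2 == 0)
          s.erase(it++);
        else
          ++it;
      }
    }
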
} - const value *operator->() const { return val_; } - inline void set(value *val); - inline value *operator=(value *rhs); - inline const use &operator=(const use &rhs); - -private: - value *val_; + std::set users_; }; //===----------------------------------------------------------------------===// @@ -64,7 +46,7 @@ private: class user: public value{ public: - typedef std::vector ops_t; + typedef std::vector ops_t; typedef ops_t::iterator op_iterator; typedef ops_t::const_iterator const_op_iterator; @@ -77,7 +59,7 @@ public: : value(ty, name), ops_(num_ops){ } // Operands - const std::vector& ops() { return ops_; } + const ops_t& ops() { return ops_; } op_iterator op_begin() { return ops_.begin(); } op_iterator op_end() { return ops_.end(); } void set_operand(unsigned i, value *x); @@ -89,7 +71,7 @@ public: void replace_uses_of_with(value *before, value *after); private: - std::vector ops_; + ops_t ops_; }; } diff --git a/lib/ir/basic_block.cpp b/lib/ir/basic_block.cpp index 25fb91a95..c7d8493e8 100644 --- a/lib/ir/basic_block.cpp +++ b/lib/ir/basic_block.cpp @@ -23,6 +23,8 @@ void basic_block::add_predecessor(basic_block *pred) { preds_.push_back(pred); } + + basic_block::iterator basic_block::get_first_non_phi(){ auto it = begin(); for(; it != end(); it++) diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 3084f6d10..7f8f9d1c5 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -21,6 +21,11 @@ instruction::instruction(type *ty, unsigned num_ops, const std::string &name, in } } +void instruction::erase_from_parent() { + parent_->erase(this); +} + + //===----------------------------------------------------------------------===// // phi_node classes //===----------------------------------------------------------------------===// diff --git a/lib/ir/module.cpp b/lib/ir/module.cpp index 2963e757f..ce5d478ff 100644 --- a/lib/ir/module.cpp +++ b/lib/ir/module.cpp @@ -40,23 +40,22 @@ ir::phi_node* module::make_phi(ir::type *ty, unsigned num_values, ir::basic_bloc } ir::value *module::try_remove_trivial_phis(ir::phi_node *&phi){ - ir::value *same = nullptr; - for(ir::value *op: phi->ops()){ - // unique value or self-reference - if(op == same || op == phi) - continue; - // the phi-node merges at least two values; non-trivial - if(same) - return phi; - same = op; - } - assert(same && "the phi-node is unreachable or in the start block"); - std::vector uses = phi->get_uses(); + // find non-self references + std::vector non_self_ref; + std::copy_if(phi->ops().begin(), phi->ops().end(), std::back_inserter(non_self_ref), + [phi](ir::value* op){ return op != phi; }); + // non-trivial + if(non_self_ref.size() > 1) + return phi; + // unique value or self-reference + ir::value *same = non_self_ref[0]; + std::set users = phi->get_users(); phi->replace_all_uses_with(same); - for(ir::use &u: uses) - if(auto *uphi = dynamic_cast(u.get())) - if(uphi != phi) - try_remove_trivial_phis(uphi); + phi->erase_from_parent(); + for(ir::user* u: users) + if(auto *uphi = dynamic_cast(u)) + if(uphi != phi) + try_remove_trivial_phis(uphi); return same; } diff --git a/lib/ir/value.cpp b/lib/ir/value.cpp index ce94ef404..23d30caf9 100644 --- a/lib/ir/value.cpp +++ b/lib/ir/value.cpp @@ -1,4 +1,5 @@ #include "ir/value.h" +#include "ir/instructions.h" #include #include @@ -15,8 +16,12 @@ value::value(type *ty, const std::string &name): ty_(ty){ set_name(name); } -void value::add_use(use arg) { - uses_.push_back(arg); +void value::add_use(user *arg) { + users_.insert(arg); +} + +unsigned 
value::erase_use(user *arg){ + return users_.erase(arg); } // TODO: automatic naming scheme + update symbol table @@ -29,30 +34,13 @@ void value::replace_all_uses_with(value *target){ } -//===----------------------------------------------------------------------===// -// use class -//===----------------------------------------------------------------------===// -void use::set(value *val){ - val_ = val; - val_->add_use(*this); -} - -value *use::operator=(value *rhs){ - set(rhs); - return rhs; -} - -const use &use::operator=(const use &rhs){ - set(rhs.val_); - return rhs; -} - //===----------------------------------------------------------------------===// // user class //===----------------------------------------------------------------------===// void user::set_operand(unsigned i, value *x) { assert(i < ops_.size() && "set_operand() out of range!"); ops_[i] = x; + x->add_use(this); } value* user::get_operand(unsigned i) { @@ -65,17 +53,17 @@ unsigned user::get_num_operands() const { } void user::replace_all_uses_with(value *target) { - for(use &u: uses_) - if(auto *usr = dynamic_cast(u.get())){ - std::cout << "replacing " << this << " by " << target << " in " << usr << std::endl; - usr->replace_uses_of_with(this, target); + for(auto it = users_.begin(); it != users_.end();){ + (*it)->replace_uses_of_with(this, target); + target->add_use(*it); + erase_use(*it++); } } void user::replace_uses_of_with(value *before, value *after) { - for(use &u: ops_) - if(u.get() == before) - u.set(after); + for(size_t i = 0; i < ops_.size(); i++) + if(ops_[i] == before) + ops_[i] = after; } } From c48b7fb676cab6b0319d0bfdbb34853012a2790d Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 7 Jan 2019 04:08:55 -0500 Subject: [PATCH 040/494] [intermediate representation] bugfix in getelementptr_inst --- examples/matrix.cpp | 13 +++---------- include/codegen/{storage_alloc.h => allocation.h} | 0 include/codegen/{lowering.h => selection.h} | 9 ++++----- include/ir/instructions.h | 2 +- lib/ast/lowering.cpp | 3 ++- lib/ir/builder.cpp | 2 +- lib/ir/instructions.cpp | 5 +++-- 7 files changed, 14 insertions(+), 20 deletions(-) rename include/codegen/{storage_alloc.h => allocation.h} (100%) rename include/codegen/{lowering.h => selection.h} (97%) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 925c9edd6..89d4aa9ec 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -3,7 +3,7 @@ #include "ast/ast.h" #include "ir/context.h" #include "ir/module.h" -#include "codegen/lowering.h" +#include "codegen/selection.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/Module.h" #include "llvm/IR/LLVMContext.h" @@ -20,15 +20,8 @@ extern translation_unit *ast_root; const char src[] = "\ void test(fp32 *A, fp32 *B, fp32 *C, int32 i){\ - int32 j = 1;\ - int32 k;\ - i = i + j;\ - for(k = 0; k < 10; k = k+5){\ - int32 u = 1;\ - u = u + i;\ - if(k == 0)\ - j = u + 2;\ - }\ + i = 1;\ + A = A + i;\ }\ "; diff --git a/include/codegen/storage_alloc.h b/include/codegen/allocation.h similarity index 100% rename from include/codegen/storage_alloc.h rename to include/codegen/allocation.h diff --git a/include/codegen/lowering.h b/include/codegen/selection.h similarity index 97% rename from include/codegen/lowering.h rename to include/codegen/selection.h index b3c62c685..5c2bff41e 100644 --- a/include/codegen/lowering.h +++ b/include/codegen/selection.h @@ -1,5 +1,5 @@ -#ifndef TDL_INCLUDE_IR_CODEGEN_LOWERING_H -#define TDL_INCLUDE_IR_CODEGEN_LOWERING_H +#ifndef TDL_INCLUDE_CODEGEN_SELECTION_H +#define 
TDL_INCLUDE_CODEGEN_SELECTION_H #include "llvm/IR/Module.h" #include "llvm/IR/IRBuilder.h" @@ -139,9 +139,8 @@ Value* llvm_value(ir::value *v, LLVMContext &ctx, return vmap.at(v); // create operands if(auto *uu = dynamic_cast(v)) - for(ir::value* u: uu->ops()){ - vmap[u] = llvm_value(u, ctx, vmap, bmap); - } + for(ir::value* u: uu->ops()) + vmap[u] = llvm_value(u, ctx, vmap, bmap); if(auto *cc = dynamic_cast(v)) return llvm_constant(cc, ctx); // instruction diff --git a/include/ir/instructions.h b/include/ir/instructions.h index 920c47c85..875b2b0a0 100644 --- a/include/ir/instructions.h +++ b/include/ir/instructions.h @@ -266,7 +266,7 @@ public: op_iterator idx_end() { return op_end(); } // factory methods - static getelementptr_inst* create(type *pointee_ty, value *ptr, const std::vector &idx, + static getelementptr_inst* create(value *ptr, const std::vector &idx, const std::string &name = "", instruction *next = nullptr); private: diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 79a589776..9dcbb2826 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -111,7 +111,8 @@ ir::value* function_definition::codegen(ir::module *mod) const{ /* Statements */ ir::value* compound_statement::codegen(ir::module* mod) const{ - decls_->codegen(mod); + if(decls_) + decls_->codegen(mod); if(statements_) statements_->codegen(mod); return nullptr; diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index a2c7ef809..ccdf49141 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -168,7 +168,7 @@ DEFINE_UNARY_INT(not) //===----------------------------------------------------------------------===// value* builder::create_gep(value *ptr, const std::vector& idx_list, const std::string &name){ - return insert(getelementptr_inst::create(nullptr, ptr, idx_list), name); + return insert(getelementptr_inst::create(ptr, idx_list), name); } //===----------------------------------------------------------------------===// diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 7f8f9d1c5..a46c4f036 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -236,7 +236,7 @@ cond_branch_inst::cond_branch_inst(basic_block *if_dst, basic_block *else_dst, v //===----------------------------------------------------------------------===// getelementptr_inst::getelementptr_inst(type *pointee_ty, value *ptr, const std::vector &idx, const std::string &name, instruction *next) - : instruction(get_return_type(pointee_ty, ptr, idx), idx.size(), name, next), + : instruction(get_return_type(pointee_ty, ptr, idx), 1 + idx.size(), name, next), source_elt_ty(pointee_ty), res_elt_ty(get_indexed_type(pointee_ty, idx)){ type *expected_ty = ((pointer_type*)(get_type()->get_scalar_ty()))->get_element_ty(); @@ -283,7 +283,8 @@ type *getelementptr_inst::get_indexed_type(type *ty, const std::vector return result; } -getelementptr_inst *getelementptr_inst::create(type *pointee_ty, value *ptr, const std::vector &idx, const std::string &name, instruction *next) { +getelementptr_inst *getelementptr_inst::create(value *ptr, const std::vector &idx, const std::string &name, instruction *next) { + type *pointee_ty = ((pointer_type*)(ptr->get_type()->get_scalar_ty()))->get_element_ty(); return new getelementptr_inst(pointee_ty, ptr, idx, name, next); } From a1c0c9762c44574b5b61b4969482ccb81f920968 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 7 Jan 2019 22:44:26 -0500 Subject: [PATCH 041/494] [code generation] added basic structure --- examples/matrix.cpp | 3 +- include/codegen/allocation.h 
| 36 +++++++ include/codegen/layout.h | 37 ++++++- include/codegen/liveness.h | 52 +++++++++ include/codegen/selection.h | 200 ++++------------------------------- include/codegen/tune.h | 60 +++++++++-- include/ir/instructions.h | 5 + include/ir/type.h | 3 + 8 files changed, 209 insertions(+), 187 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 89d4aa9ec..78c8d7194 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -35,7 +35,8 @@ int main() { program->codegen(&module); llvm::LLVMContext llvm_context; llvm::Module llvm_module("test", llvm_context); - tdl::codegen::lowering(module, llvm_module); + tdl::codegen::selection selection; + selection.run(module, llvm_module); llvm::PrintModulePass print(llvm::outs()); llvm::AnalysisManager analysis; print.run(llvm_module, analysis); diff --git a/include/codegen/allocation.h b/include/codegen/allocation.h index b2112b20b..d23b98501 100644 --- a/include/codegen/allocation.h +++ b/include/codegen/allocation.h @@ -1,9 +1,45 @@ #ifndef TDL_INCLUDE_IR_CODEGEN_STORAGE_ALLOC_H #define TDL_INCLUDE_IR_CODEGEN_STORAGE_ALLOC_H +#include +#include + namespace tdl{ + +namespace ir{ + class value; + class function; +} + namespace codegen{ +class layout; +class target_tuner; +class liveness; +class loop_info; + +class allocation { +public: + // accessors + unsigned get_num_bytes(ir::value *x) const; + unsigned get_offset(ir::value *x) const { return offsets_.at(x); } + unsigned get_allocated_size() const { return allocated_size_; } + bool has_double_buffer(ir::value *x) const { return double_buffer_.find(x) != double_buffer_.end(); } + + // run + void run(ir::function &fn); + +private: + std::map offsets_; + std::set double_buffer_; + std::map num_bytes_; + size_t allocated_size_; + // dependences + liveness *liveness_; + layout *layout_; + loop_info *loop_info_; +}; + } } diff --git a/include/codegen/layout.h b/include/codegen/layout.h index 40227001c..1fd6deeda 100644 --- a/include/codegen/layout.h +++ b/include/codegen/layout.h @@ -1,9 +1,44 @@ #ifndef TDL_INCLUDE_IR_CODEGEN_LAYOUT_H #define TDL_INCLUDE_IR_CODEGEN_LAYOUT_H -namespace tdl{ +#include +#include + +namespace tdl { + +namespace ir { + class function; + class instruction; + class value; +} + namespace codegen{ +struct shared_view_info{ + ir::value *usr; + bool has_dedicated_storage; +}; + +class layout { +private: + typedef std::vector shared_view_val_t; + + void add_phi_nodes(ir::value *v); + void add_shared_views(ir::value *v); + +public: + // accessors + unsigned get_num_shared_views(ir::value *v); + shared_view_info get_shared_view(ir::value *v, unsigned idx); + + // run + bool run(ir::function &fn); + +private: + std::map shared_views_; +}; + + } } diff --git a/include/codegen/liveness.h b/include/codegen/liveness.h index 9afbc4e46..8a6806c2e 100644 --- a/include/codegen/liveness.h +++ b/include/codegen/liveness.h @@ -1,9 +1,61 @@ #ifndef TDL_INCLUDE_IR_CODEGEN_LIVENESS_H #define TDL_INCLUDE_IR_CODEGEN_LIVENESS_H +#include + namespace tdl{ + +namespace ir{ + class value; + class function; +} + namespace codegen{ +class layout; + +typedef unsigned slot_index; + +struct segment { + slot_index start; + slot_index end; + + bool contains(slot_index idx) const { + return start <= idx && idx < end; + } + + bool intersect(const segment &Other){ + return contains(Other.start) || Other.contains(start); + } +}; + +class liveness { +private: + typedef std::map indices_map_t; + typedef std::map intervals_map_t; + typedef std::map has_storage_map_t; + +public: + /// 
Intervals iterators... + using iterator = intervals_map_t::iterator; + using const_iterator = intervals_map_t::const_iterator; + +public: + + // accessors + const intervals_map_t& intervals() const { return intervals_; } + segment get_interval(ir::value* v) const { return intervals_.at(v); } + + // run + void run(ir::function *fn); + +private: + has_storage_map_t has_dedicated_storage_; + indices_map_t indices_; + intervals_map_t intervals_; + layout* layouts_; +}; + } } diff --git a/include/codegen/selection.h b/include/codegen/selection.h index 5c2bff41e..b55725c05 100644 --- a/include/codegen/selection.h +++ b/include/codegen/selection.h @@ -9,191 +9,35 @@ #include "ir/type.h" +namespace llvm{ + class Type; + class Value; + class Instruction; + class Constant; + class LLVMContext; +} + namespace tdl{ namespace codegen{ -using namespace llvm; +class selection{ + typedef std::map vmap_t; + typedef std::map bmap_t; -/* convert ir::type to Type */ -Type *llvm_type(ir::type *ty, LLVMContext &ctx) { - // function - if(auto* tt = dynamic_cast(ty)){ - Type *return_ty = llvm_type(tt->get_return_ty(), ctx); - std::vector param_tys; - std::transform(tt->params_begin(), tt->params_end(), std::back_inserter(param_tys), - [&ctx](ir::type* t){ return llvm_type(t, ctx);}); - return FunctionType::get(return_ty, param_tys, false); - } - // pointer - if(ty->is_pointer_ty()){ - Type *elt_ty = llvm_type(ty->get_pointer_element_ty(), ctx); - unsigned addr_space = ty->get_pointer_address_space(); - return PointerType::get(elt_ty, addr_space); - } - // integer - if(ty->is_integer_ty()){ - unsigned bitwidth = ty->get_integer_bitwidth(); - return IntegerType::get(ctx, bitwidth); - } - // primitive types - switch(ty->get_type_id()){ - case ir::type::VoidTyID: return Type::getVoidTy(ctx); - case ir::type::HalfTyID: return Type::getHalfTy(ctx); - case ir::type::FloatTyID: return Type::getFloatTy(ctx); - case ir::type::DoubleTyID: return Type::getDoubleTy(ctx); - case ir::type::X86_FP80TyID: return Type::getX86_FP80Ty(ctx); - case ir::type::PPC_FP128TyID: return Type::getPPC_FP128Ty(ctx); - case ir::type::LabelTyID: return Type::getLabelTy(ctx); - case ir::type::MetadataTyID: return Type::getMetadataTy(ctx); - case ir::type::TokenTyID: return Type::getTokenTy(ctx); - default: break; - } - // unknown type - throw std::runtime_error("unknown conversion from ir::type to Type"); -} +private: + llvm::Type* llvm_type(ir::type *ty, llvm::LLVMContext &ctx); + llvm::Value* llvm_value(ir::value *v,llvm:: LLVMContext &ctx); + llvm::Instruction* llvm_inst(ir::instruction *inst, llvm::LLVMContext &ctx); + llvm::Constant* llvm_constant(ir::constant *cst, llvm::LLVMContext &ctx); -Value* llvm_value(ir::value *v, LLVMContext &ctx, - std::map &vmap, - std::map &bmap); +public: + void run(ir::module &src, llvm::Module &dst); -/* convert ir::constant to Constant */ -Constant *llvm_constant(ir::constant *cst, LLVMContext &ctx) { - Type *dst_ty = llvm_type(cst->get_type(), ctx); - if(auto* cc = dynamic_cast(cst)) - return ConstantInt::get(dst_ty, cc->get_value()); - if(auto* cc = dynamic_cast(cst)) - return ConstantFP::get(dst_ty, cc->get_value()); - // unknown constant - throw std::runtime_error("unknown conversion from ir::constant to Constant"); -} - - -/* convert ir::instruction to Instruction */ -Instruction *llvm_inst(ir::instruction *inst, LLVMContext & ctx, - std::map &vmap, - std::map &bmap) { - auto value = [&](ir::value *x) { return llvm_value(x, ctx, vmap, bmap); }; - auto block = [&](ir::basic_block *x) { return bmap.at(x); 
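
The segment arithmetic above carries both liveness and allocation, so its semantics are worth a standalone check: intervals are half-open, and two segments that merely touch do not intersect. A small self-contained demo of the same struct:

    #include <cassert>

    typedef unsigned slot_index;

    struct segment {
      slot_index start, end;                            // half-open: [start, end)
      bool contains(slot_index i) const { return start <= i && i < end; }
      bool intersect(const segment &o) const { return contains(o.start) || o.contains(start); }
    };

    int main() {
      segment a{0, 5}, b{3, 9}, c{5, 9};
      assert(a.intersect(b) && b.intersect(a));         // overlap on [3, 5)
      assert(!a.intersect(c) && !c.intersect(a));       // merely adjacent
    }
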
}; - auto type = [&](ir::type *x) { return llvm_type(x, ctx); }; - if(auto* ii = dynamic_cast(inst)){ - BasicBlock *true_dest = block(ii->get_true_dest()); - BasicBlock *false_dest = block(ii->get_false_dest()); - Value *cond = value(ii->get_cond()); - return BranchInst::Create(true_dest, false_dest, cond); - } - if(auto* ii = dynamic_cast(inst)){ - BasicBlock *dest = block(ii->get_dest()); - return BranchInst::Create(dest); - } - if(auto* ii = dynamic_cast(inst)){ - Type *ty = type(ii->get_type()); - unsigned num_ops = ii->get_num_operands(); - return PHINode::Create(ty, num_ops, ii->get_name()); - } - if(auto* ii = dynamic_cast(inst)){ - ir::value *ret_val = ii->get_return_value(); - return ReturnInst::Create(ctx, ret_val?value(ret_val):nullptr); - } - if(auto* ii = dynamic_cast(inst)){ - Value *lhs = value(ii->get_operand(0)); - Value *rhs = value(ii->get_operand(1)); - return BinaryOperator::Create(ii->get_op(), lhs, rhs, ii->get_name()); - } - if(auto* ii = dynamic_cast(inst)){ - CmpInst::Predicate pred = ii->get_pred(); - Value *lhs = value(ii->get_operand(0)); - Value *rhs = value(ii->get_operand(1)); - return CmpInst::Create(Instruction::ICmp, pred, lhs, rhs, ii->get_name()); - } - if(auto* ii = dynamic_cast(inst)){ - CmpInst::Predicate pred = ii->get_pred(); - Value *lhs = value(ii->get_operand(0)); - Value *rhs = value(ii->get_operand(1)); - return FCmpInst::Create(Instruction::FCmp, pred, lhs, rhs, ii->get_name()); - } - if(auto* ii = dynamic_cast(inst)){ - Value *arg = value(ii->get_operand(0)); - Type *dst_ty = type(ii->get_type()); - return CastInst::Create(ii->get_op(), arg, dst_ty, ii->get_name()); - } - if(auto* ii = dynamic_cast(inst)){ - std::vector idx_vals; - std::transform(ii->idx_begin(), ii->idx_end(), std::back_inserter(idx_vals), - [&value](ir::value* x){ return value(x);}); - Type *source_ty = type(ii->get_source_elt_ty()); - Value *arg = value(ii->get_operand(0)); - return GetElementPtrInst::Create(source_ty, arg, idx_vals, ii->get_name()); - } - if(ir::load_inst* ii = dynamic_cast(inst)){ - Value *ptr = value(ii->get_pointer_operand()); - return new LoadInst(ptr, ii->get_name()); - } - // unknown instruction - throw std::runtime_error("unknown conversion from ir::type to Type"); -} - -Value* llvm_value(ir::value *v, LLVMContext &ctx, - std::map &vmap, - std::map &bmap) { - if(vmap.find(v) != vmap.end()) - return vmap.at(v); - // create operands - if(auto *uu = dynamic_cast(v)) - for(ir::value* u: uu->ops()) - vmap[u] = llvm_value(u, ctx, vmap, bmap); - if(auto *cc = dynamic_cast(v)) - return llvm_constant(cc, ctx); - // instruction - if(auto *ii = dynamic_cast(v)) - return llvm_inst(ii, ctx, vmap, bmap); - // unknown value - throw std::runtime_error("unknown conversion from ir::value to Value"); -} - -void lowering(ir::module &src, Module &dst){ - std::map vmap; - std::map bmap; - LLVMContext &dst_ctx = dst.getContext(); - IRBuilder<> dst_builder(dst_ctx); - // iterate over functions - for(ir::function *fn: src.get_function_list()) { - // create LLVM function - FunctionType *fn_ty = (FunctionType*)llvm_type(fn->get_fn_type(), dst_ctx); - Function *dst_fn = Function::Create(fn_ty, Function::ExternalLinkage, "kernel", &dst); -// std::cout << ((FunctionType*)fn_ty)->getNumParams() << std::endl; - // map parameters - for(unsigned i = 0; i < fn->args().size(); i++) - vmap[fn->args()[i]] = &*(dst_fn->arg_begin() + i); - // create blocks - for(ir::basic_block *block: fn->blocks()) { - BasicBlock *dst_block = BasicBlock::Create(dst_ctx, block->get_name(), dst_fn); - 
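
Note the ordering in the surrounding lowering loop: all destination blocks are created first, instructions are translated next, and phi operands are wired last, because an incoming value may be defined after the phi in program order. The same create-then-patch pattern in isolation, on a hypothetical toy IR:

    #include <cstdio>
    #include <map>
    #include <vector>

    struct Node { int id; std::vector<Node*> incoming; };

    int main() {
      // Source "IR": node 0 is a phi of nodes 1 and 2; node 2 is defined later.
      std::vector<int> phi_srcs = {1, 2};
      // Pass 1: create every destination node, phi operands left empty.
      std::map<int, Node*> map;
      for (int id = 0; id < 3; id++) map[id] = new Node{id, {}};
      // Pass 2: every target now exists, so the phi can be wired safely.
      for (int src : phi_srcs) map[0]->incoming.push_back(map[src]);
      std::printf("phi has %zu incoming values\n", map[0]->incoming.size());  // 2
      for (auto &kv : map) delete kv.second;
    }
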
bmap[block] = dst_block; - } - // iterate through block - for(ir::basic_block *block: fn->blocks()) { - dst_builder.SetInsertPoint(bmap[block]); - for(ir::instruction *inst: block->get_inst_list()) { - Instruction *dst_inst = llvm_inst(inst, dst_ctx, vmap, bmap); - vmap[inst] = dst_inst; - dst_builder.Insert(dst_inst); - } - } - // add phi operands - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *inst: block->get_inst_list()) - if(auto *phi = dynamic_cast(inst)){ - PHINode *dst_phi = (PHINode*)vmap.at(phi); - for(unsigned i = 0; i < phi->get_num_incoming(); i++){ - ir::value *inc_val = phi->get_incoming_value(i); - ir::basic_block *inc_block = phi->get_incoming_block(i); - Value *llvm_inc_val = llvm_value(inc_val, dst_ctx, vmap, bmap); - BasicBlock *llvm_block = bmap[inc_block]; - dst_phi->addIncoming(llvm_inc_val, llvm_block); - } - } - } -} +private: + vmap_t vmap_; + bmap_t bmap_; +}; } } diff --git a/include/codegen/tune.h b/include/codegen/tune.h index f1871167a..98ba7e327 100644 --- a/include/codegen/tune.h +++ b/include/codegen/tune.h @@ -1,10 +1,56 @@ -#ifndef TDL_INCLUDE_IR_CODEGEN_TUNE_H -#define TDL_INCLUDE_IR_CODEGEN_TUNE_H +//#ifndef TDL_INCLUDE_IR_CODEGEN_TUNE_H +//#define TDL_INCLUDE_IR_CODEGEN_TUNE_H -namespace tdl{ -namespace codegen{ +//namespace tdl{ +//namespace codegen{ -} -} +//// Layout binding pass +//class TLVMAddTunerParams: public FunctionPass { +//private: +// enum CType{ +// Layout = 0, Shape = 1 +// }; +// // Params pool +// SmallVector LParamsPool; +// // Constraints +// typedef std::pair CNodeType; +// typedef DenseMap> CGraphType; +// // Layout constraints +// CGraphType LCGraph; +// DenseSet LCNodes; +// // Shape constraints +// CGraphType SCGraph; +// DenseSet SCNodes; +// // Relational +// std::map, std::function> ExtraParams; +// DenseSet Constants; -#endif +// void addConstraint(CNodeType X, CNodeType Y, CType CT); +// void initCPhi(Instruction *I); +// void initCGraph(Instruction *V); +// void connectedComponents(CNodeType X, ArrayRef Vals, CType CT, DenseSet &Nodes, CGraphType &Graph); + +//public: +// static char ID; +// TLVMAddTunerParams(): FunctionPass(ID){ } + +// void getAnalysisUsage(AnalysisUsage & AU) const override; +// bool runOnFunction(Function &F) override; + +//private: +// std::map, Constant*> KnownParams; +//}; + +//class TLVMAddTunerConstraints: public FunctionPass { +//public: +// static char ID; +// TLVMAddTunerConstraints(): FunctionPass(ID){ } + +// void getAnalysisUsage(AnalysisUsage & AU) const override; +// bool runOnFunction(Function &F) override; +//}; + +//} +//} + +//#endif diff --git a/include/ir/instructions.h b/include/ir/instructions.h index 875b2b0a0..9d390b8cb 100644 --- a/include/ir/instructions.h +++ b/include/ir/instructions.h @@ -332,6 +332,11 @@ public: }; +// matmul + +class matmul_inst: public instruction { + +}; } } diff --git a/include/ir/type.h b/include/ir/type.h index fb268326a..ab7b20ef1 100644 --- a/include/ir/type.h +++ b/include/ir/type.h @@ -51,8 +51,10 @@ public: // type attributes unsigned get_fp_mantissa_width() const; unsigned get_integer_bitwidth() const; + unsigned get_size_in_bits() const; type *get_scalar_ty() const; const std::vector &get_tile_shapes() const; + unsigned get_tile_num_elements() const; type *get_tile_element_ty() const; unsigned get_pointer_address_space() const; type *get_pointer_element_ty() const; @@ -135,6 +137,7 @@ private: public: // accessors const std::vector& get_shapes() const { return shapes_; } + unsigned get_num_elements(); // factory methods 
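
The two accessors added to ir::type here are simple products over the tile shape vector. A worked example under the obvious definitions (assumed here, since the implementation only lands two commits later): a 16x16 tile of fp32 has 256 elements, 8192 bits, 1024 bytes.

    #include <cstdio>
    #include <vector>

    unsigned num_elements(const std::vector<unsigned> &shapes) {
      unsigned res = 1;
      for (unsigned s : shapes) res *= s;               // product over all dimensions
      return res;
    }

    int main() {
      std::vector<unsigned> shapes = {16, 16};
      unsigned elts = num_elements(shapes);             // 256
      unsigned bits = elts * 32;                        // fp32 element: 32 bits
      std::printf("%u elements, %u bits, %u bytes\n", elts, bits, bits / 8);
    }
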
static tile_type* get(type *ty, const std::vector &shapes); From 297d1a99d1efa23ecf72e931358591e81478fcee Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 7 Jan 2019 22:49:37 -0500 Subject: [PATCH 042/494] [code generation] adding missing files --- lib/codegen/allocation.cpp | 129 ++++++++++ lib/codegen/layout.cpp | 55 +++++ lib/codegen/liveness.cpp | 42 ++++ lib/codegen/loop_info.cpp | 0 lib/codegen/selection.cpp | 189 +++++++++++++++ lib/codegen/tune.cpp | 468 +++++++++++++++++++++++++++++++++++++ 6 files changed, 883 insertions(+) create mode 100644 lib/codegen/allocation.cpp create mode 100644 lib/codegen/layout.cpp create mode 100644 lib/codegen/liveness.cpp create mode 100644 lib/codegen/loop_info.cpp create mode 100644 lib/codegen/selection.cpp create mode 100644 lib/codegen/tune.cpp diff --git a/lib/codegen/allocation.cpp b/lib/codegen/allocation.cpp new file mode 100644 index 000000000..2dcc4fbc9 --- /dev/null +++ b/lib/codegen/allocation.cpp @@ -0,0 +1,129 @@ +#include "codegen/allocation.h" +#include "codegen/liveness.h" +#include "codegen/layout.h" +#include "codegen/loop_info.h" +#include "ir/basic_block.h" +#include "ir/type.h" +#include "ir/value.h" +#include "ir/function.h" +#include "ir/instructions.h" + +namespace tdl{ +namespace codegen{ + +unsigned allocation::get_num_bytes(ir::value *x) const { + ir::type *ty = x->get_type(); + unsigned num_elements = ty->get_tile_num_elements(); + if(has_double_buffer(x)) + num_elements *= 2; + return num_elements * ty->get_scalar_ty()->get_size_in_bits(); +} + + +void allocation::run(ir::function &fn){ + using std::max; + using std::min; + typedef std::multimap triples_map_type; + + // Fill double buffering info + for(ir::basic_block *block: fn.blocks()) + for(ir::instruction *v: block->get_inst_list()) + // If requires shared memory + if(layout_->get_num_shared_views(v) && + loop_info_->get_loop_for(block)) + double_buffer_.insert(v); + + std::vector I; + for(auto x: liveness_->intervals()) + I.push_back(x.first); + std::vector J = I; + + triples_map_type H; + H.insert({0, segment{0, 100}}); + + std::vector V; + std::map starts; + while(!J.empty()){ + auto h_it = H.begin(); + unsigned w = h_it->first; + segment xh = h_it->second; + H.erase(h_it); + auto j_it = std::find_if(J.begin(), J.end(), [&](ir::value *JJ){ + segment xj = liveness_->get_interval(JJ); + bool res = xj.intersect(xh); + for(auto val: H) + res = res && !val.second.intersect(xj); + return res; + }); + if(j_it != J.end()){ + unsigned size = get_num_bytes(*j_it); + segment xj = liveness_->get_interval(*j_it); + starts[*j_it] = w; + H.insert({w + size, segment{max(xh.start, xj.start), min(xh.end, xj.end)}}); + if(xh.start < xj.start) + H.insert({w, segment{xh.start, xj.end}}); + if(xj.end < xh.end) + H.insert({w, segment{xj.start, xh.end}}); + V.push_back(*j_it); + J.erase(j_it); + } + } + + + // Build interference graph + std::map> interferences; + for(ir::value *x: V) + for(ir::value *y: V){ + if(x == y) + continue; + unsigned X0 = starts[x], Y0 = starts[y]; + unsigned NX = get_num_bytes(x); + unsigned NY = get_num_bytes(y); + segment XS = {X0, X0 + NX}; + segment YS = {Y0, Y0 + NY}; + if(liveness_->get_interval(x).intersect(liveness_->get_interval(y)) + && XS.intersect(YS)) + interferences[x].insert(y); + } + + // Initialize colors + std::map colors; + for(ir::value *X: V) + colors[X] = (X==V[0])?0:-1; + + // First-fit coloring + std::vector available(V.size()); + for(ir::value *x: V){ + // Non-neighboring colors are available + std::fill(available.begin(), 
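
The coloring logic here is plain first-fit: each value takes the smallest color not already used by an interfering neighbor, which keeps offsets compact without any backtracking. The same loop in isolation, on a hard-coded interference graph (a triangle forces three colors):

    #include <algorithm>
    #include <cstdio>
    #include <map>
    #include <set>
    #include <vector>

    int main() {
      // 0-1, 1-2 and 0-2 interfere (a triangle); 3 interferes with 0 only.
      std::map<int, std::set<int>> interf =
          {{0, {1, 2, 3}}, {1, {0, 2}}, {2, {0, 1}}, {3, {0}}};
      std::map<int, int> color;
      for (auto &kv : interf) {
        std::vector<bool> available(interf.size(), true);
        for (int n : kv.second) {                       // rule out neighbors' colors
          auto it = color.find(n);
          if (it != color.end()) available[it->second] = false;
        }
        // assign the first color still available
        color[kv.first] = std::find(available.begin(), available.end(), true)
                          - available.begin();
      }
      for (auto &kv : color)
        std::printf("value %d -> color %d\n", kv.first, kv.second);  // 0, 1, 2, 1
    }
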
available.end(), true); + for(ir::value *Y: interferences[x]){ + int color = colors[Y]; + if(color >= 0) + available[color] = false; + } + // Assigns first available color + auto It = std::find(available.begin(), available.end(), true); + colors[x] = std::distance(available.begin(), It); + } + + // Finalize allocation + for(ir::value *x: V){ + unsigned Adj = 0; + for(ir::value *y: interferences[x]) + Adj = std::max(Adj, starts[y] + get_num_bytes(y)); + offsets_[x] = starts[x] + colors[x] * Adj; + if(auto *phi = dynamic_cast(x)) + for(ir::value *px: phi->ops()){ + if(offsets_.find(px) == offsets_.end()) + offsets_[px] = offsets_[x]; + } + } + + // Save maximum size of induced memory space + allocated_size_ = 0; + for(auto &x: offsets_) + allocated_size_ = std::max(allocated_size_, x.second + get_num_bytes(x.first)); +} + +} +} diff --git a/lib/codegen/layout.cpp b/lib/codegen/layout.cpp new file mode 100644 index 000000000..cdddb1d17 --- /dev/null +++ b/lib/codegen/layout.cpp @@ -0,0 +1,55 @@ +#include "codegen/layout.h" +#include "ir/function.h" +#include "ir/basic_block.h" +#include "ir/instructions.h" + +namespace tdl{ +namespace codegen{ + + +shared_view_info layout::get_shared_view(ir::value *v, unsigned idx){ + return shared_views_.at(v)[idx]; +} + +unsigned layout::get_num_shared_views(ir::value *v){ + return shared_views_.at(v).size(); +} + +// Phi node +void layout::add_phi_nodes(ir::value *v){ + if(ir::phi_node *phi = dynamic_cast(v)) + if(shared_views_.find(phi) != shared_views_.end()) + for(ir::value *v: phi->ops()){ + shared_views_[v] = shared_views_[phi]; + for(shared_view_info &info: shared_views_[v]) + info.has_dedicated_storage = false; + } +} + +// Memory Layout +void layout::add_shared_views(ir::value *v){ + // GEMM has shared inputs + if(dynamic_cast(v)) + shared_views_[v].push_back({v, true}); + if(dynamic_cast(v)) + shared_views_[v].push_back({v, true}); +} + +// Entry point +bool layout::run(ir::function &fn) { + // Non-phis + for(ir::basic_block *block: fn.blocks()) + for(ir::instruction *instr: block->get_inst_list()) { + add_shared_views(instr); + } + // Phi nodes + for(ir::basic_block *block: fn.blocks()) + for(ir::instruction *instr: block->get_inst_list()) { + add_phi_nodes(instr); + } + // Done + return false; +} + +} +} diff --git a/lib/codegen/liveness.cpp b/lib/codegen/liveness.cpp new file mode 100644 index 000000000..0e56aac03 --- /dev/null +++ b/lib/codegen/liveness.cpp @@ -0,0 +1,42 @@ +#include "codegen/liveness.h" +#include "codegen/layout.h" +#include "ir/basic_block.h" +#include "ir/function.h" +#include "ir/instructions.h" +#include "ir/value.h" + +namespace tdl{ +namespace codegen{ + + +// Entry point +void liveness::run(ir::function *fn) { + + // Assigns index to each instruction + slot_index index = 0; + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *instr: block->get_inst_list()){ + index += 1; + indices_.insert({instr, index}); + } + + // Liveness analysis + // Creates live intervals + for(auto i: indices_){ + ir::value *v = i.first; + if(!layouts_->get_num_shared_views(v)) + continue; + if(!layouts_->get_shared_view(v, 0).has_dedicated_storage) + continue; + unsigned start = i.second; + unsigned end = start; + for(ir::value *u: v->get_users()){ + start = std::min(start, indices_.at(u)); + end = std::max(end, indices_.at(u)); + } + intervals_[v] = segment{start, end}; + } +} + +} +} diff --git a/lib/codegen/loop_info.cpp b/lib/codegen/loop_info.cpp new file mode 100644 index 000000000..e69de29bb diff --git 
a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp new file mode 100644 index 000000000..edf48262c --- /dev/null +++ b/lib/codegen/selection.cpp @@ -0,0 +1,189 @@ +#include "codegen/selection.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/IRBuilder.h" +#include "ir/context.h" +#include "ir/module.h" +#include "ir/function.h" +#include "ir/type.h" + + +namespace tdl{ +namespace codegen{ + +using namespace llvm; + + +/* convert ir::type to Type */ +Type *selection::llvm_type(ir::type *ty, LLVMContext &ctx) { + // function + if(auto* tt = dynamic_cast(ty)){ + Type *return_ty = llvm_type(tt->get_return_ty(), ctx); + std::vector param_tys; + std::transform(tt->params_begin(), tt->params_end(), std::back_inserter(param_tys), + [this,&ctx](ir::type* t){ return llvm_type(t, ctx);}); + return FunctionType::get(return_ty, param_tys, false); + } + // pointer + if(ty->is_pointer_ty()){ + Type *elt_ty = llvm_type(ty->get_pointer_element_ty(), ctx); + unsigned addr_space = ty->get_pointer_address_space(); + return PointerType::get(elt_ty, addr_space); + } + // integer + if(ty->is_integer_ty()){ + unsigned bitwidth = ty->get_integer_bitwidth(); + return IntegerType::get(ctx, bitwidth); + } + // primitive types + switch(ty->get_type_id()){ + case ir::type::VoidTyID: return Type::getVoidTy(ctx); + case ir::type::HalfTyID: return Type::getHalfTy(ctx); + case ir::type::FloatTyID: return Type::getFloatTy(ctx); + case ir::type::DoubleTyID: return Type::getDoubleTy(ctx); + case ir::type::X86_FP80TyID: return Type::getX86_FP80Ty(ctx); + case ir::type::PPC_FP128TyID: return Type::getPPC_FP128Ty(ctx); + case ir::type::LabelTyID: return Type::getLabelTy(ctx); + case ir::type::MetadataTyID: return Type::getMetadataTy(ctx); + case ir::type::TokenTyID: return Type::getTokenTy(ctx); + default: break; + } + // unknown type + throw std::runtime_error("unknown conversion from ir::type to Type"); +} + +/* convert ir::constant to Constant */ +Constant *selection::llvm_constant(ir::constant *cst, LLVMContext &ctx) { + Type *dst_ty = llvm_type(cst->get_type(), ctx); + if(auto* cc = dynamic_cast(cst)) + return ConstantInt::get(dst_ty, cc->get_value()); + if(auto* cc = dynamic_cast(cst)) + return ConstantFP::get(dst_ty, cc->get_value()); + // unknown constant + throw std::runtime_error("unknown conversion from ir::constant to Constant"); +} + + +/* convert ir::instruction to Instruction */ +Instruction *selection::llvm_inst(ir::instruction *inst, LLVMContext & ctx) { + auto value = [&](ir::value *x) { return llvm_value(x, ctx); }; + auto block = [&](ir::basic_block *x) { return bmap_.at(x); }; + auto type = [&](ir::type *x) { return llvm_type(x, ctx); }; + if(auto* ii = dynamic_cast(inst)){ + BasicBlock *true_dest = block(ii->get_true_dest()); + BasicBlock *false_dest = block(ii->get_false_dest()); + Value *cond = value(ii->get_cond()); + return BranchInst::Create(true_dest, false_dest, cond); + } + if(auto* ii = dynamic_cast(inst)){ + BasicBlock *dest = block(ii->get_dest()); + return BranchInst::Create(dest); + } + if(auto* ii = dynamic_cast(inst)){ + Type *ty = type(ii->get_type()); + unsigned num_ops = ii->get_num_operands(); + return PHINode::Create(ty, num_ops, ii->get_name()); + } + if(auto* ii = dynamic_cast(inst)){ + ir::value *ret_val = ii->get_return_value(); + return ReturnInst::Create(ctx, ret_val?value(ret_val):nullptr); + } + if(auto* ii = dynamic_cast(inst)){ + Value *lhs = value(ii->get_operand(0)); + Value *rhs = value(ii->get_operand(1)); + return BinaryOperator::Create(ii->get_op(), lhs, rhs, 
ii->get_name()); + } + if(auto* ii = dynamic_cast(inst)){ + CmpInst::Predicate pred = ii->get_pred(); + Value *lhs = value(ii->get_operand(0)); + Value *rhs = value(ii->get_operand(1)); + return CmpInst::Create(Instruction::ICmp, pred, lhs, rhs, ii->get_name()); + } + if(auto* ii = dynamic_cast(inst)){ + CmpInst::Predicate pred = ii->get_pred(); + Value *lhs = value(ii->get_operand(0)); + Value *rhs = value(ii->get_operand(1)); + return FCmpInst::Create(Instruction::FCmp, pred, lhs, rhs, ii->get_name()); + } + if(auto* ii = dynamic_cast(inst)){ + Value *arg = value(ii->get_operand(0)); + Type *dst_ty = type(ii->get_type()); + return CastInst::Create(ii->get_op(), arg, dst_ty, ii->get_name()); + } + if(auto* ii = dynamic_cast(inst)){ + std::vector idx_vals; + std::transform(ii->idx_begin(), ii->idx_end(), std::back_inserter(idx_vals), + [&value](ir::value* x){ return value(x);}); + Type *source_ty = type(ii->get_source_elt_ty()); + Value *arg = value(ii->get_operand(0)); + return GetElementPtrInst::Create(source_ty, arg, idx_vals, ii->get_name()); + } + if(ir::load_inst* ii = dynamic_cast(inst)){ + Value *ptr = value(ii->get_pointer_operand()); + return new LoadInst(ptr, ii->get_name()); + } + // unknown instruction + throw std::runtime_error("unknown conversion from ir::type to Type"); +} + +Value* selection::llvm_value(ir::value *v, LLVMContext &ctx) { + if(vmap_.find(v) != vmap_.end()) + return vmap_.at(v); + // create operands + if(auto *uu = dynamic_cast(v)) + for(ir::value* u: uu->ops()) + vmap_[u] = llvm_value(u, ctx); + if(auto *cc = dynamic_cast(v)) + return llvm_constant(cc, ctx); + // instruction + if(auto *ii = dynamic_cast(v)) + return llvm_inst(ii, ctx); + // unknown value + throw std::runtime_error("unknown conversion from ir::value to Value"); +} + +void selection::run(ir::module &src, Module &dst){ + vmap_.clear(); + bmap_.clear(); + LLVMContext &dst_ctx = dst.getContext(); + IRBuilder<> dst_builder(dst_ctx); + // iterate over functions + for(ir::function *fn: src.get_function_list()) { + // create LLVM function + FunctionType *fn_ty = (FunctionType*)llvm_type(fn->get_fn_type(), dst_ctx); + Function *dst_fn = Function::Create(fn_ty, Function::ExternalLinkage, "kernel", &dst); + // map parameters + for(unsigned i = 0; i < fn->args().size(); i++) + vmap_[fn->args()[i]] = &*(dst_fn->arg_begin() + i); + // create blocks + for(ir::basic_block *block: fn->blocks()) { + BasicBlock *dst_block = BasicBlock::Create(dst_ctx, block->get_name(), dst_fn); + bmap_[block] = dst_block; + } + // iterate through block + for(ir::basic_block *block: fn->blocks()) { + dst_builder.SetInsertPoint(bmap_[block]); + for(ir::instruction *inst: block->get_inst_list()) { + Instruction *dst_inst = llvm_inst(inst, dst_ctx); + vmap_[inst] = dst_inst; + dst_builder.Insert(dst_inst); + } + } + // add phi operands + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *inst: block->get_inst_list()) + if(auto *phi = dynamic_cast(inst)){ + PHINode *dst_phi = (PHINode*)vmap_.at(phi); + for(unsigned i = 0; i < phi->get_num_incoming(); i++){ + ir::value *inc_val = phi->get_incoming_value(i); + ir::basic_block *inc_block = phi->get_incoming_block(i); + Value *llvm_inc_val = llvm_value(inc_val, dst_ctx); + BasicBlock *llvm_block = bmap_[inc_block]; + dst_phi->addIncoming(llvm_inc_val, llvm_block); + } + } + } +} + + +} +} diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp new file mode 100644 index 000000000..6e646a4e3 --- /dev/null +++ b/lib/codegen/tune.cpp @@ -0,0 +1,468 @@ +//#include 
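
The recursion in llvm_value above is a straightforward memoized graph walk: return the cached translation if one exists, otherwise translate the operands first and record the result. The same skeleton in isolation, with hypothetical Src/Dst nodes standing in for ir::value and llvm::Value:

    #include <cstdio>
    #include <map>
    #include <vector>

    struct Src { int id; std::vector<Src*> ops; };
    struct Dst { int id; };

    std::map<Src*, Dst*> vmap;                          // memoization cache

    Dst *lower(Src *v) {
      auto it = vmap.find(v);
      if (it != vmap.end()) return it->second;          // already translated
      for (Src *op : v->ops) lower(op);                 // operands first
      return vmap[v] = new Dst{v->id};                  // translate and record
    }

    int main() {
      Src a{0}, b{1, {&a}}, c{2, {&a, &b}};
      lower(&c);
      std::printf("translated %zu values\n", vmap.size());  // 3: 'a' visited only once
      for (auto &kv : vmap) delete kv.second;
    }
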
"codegen/tune.h" + +//namespace tdl{ +//namespace codegen{ + + +//// Layout binding pass +//class TLVMAddTunerConstraints: public FunctionPass { +//public: +// static char ID; +// TLVMAddTunerConstraints(): FunctionPass(ID){ } + +// void getAnalysisUsage(AnalysisUsage & AU) const override; +// bool runOnFunction(Function &F) override; +//}; + +//// Initialization +//char TLVMAddTunerConstraints::ID = 0; +//INITIALIZE_PASS_BEGIN(TLVMAddTunerConstraints, "tlvm-add-tuner-constraints", +// "Add Tuner Constraints (TLVM)", false, true) +//INITIALIZE_PASS_END(TLVMAddTunerConstraints, "tlvm-add-tuner-constraints", +// "Add Tuner Constraints (TLVM)", false, true) +//FunctionPass *llvm::createTLVMAddTunerConstraintsPass() { return new TLVMAddTunerConstraints(); } + +//// Analysis usage +//void TLVMAddTunerConstraints::getAnalysisUsage(AnalysisUsage &AU) const { +// AU.setPreservesAll(); +// FunctionPass::getAnalysisUsage(AU); +//} + + +//inline unsigned MDRead(MDNode* Node){ +// Metadata *MD = Node->getOperand(0).get(); +// Constant *Cst = ((ConstantAsMetadata*)MD)->getValue(); +// unsigned Result = Cst->getUniqueInteger().getZExtValue(); +// return Result; +//} + +//inline unsigned getNumGT1Dim(Instruction &I){ +// unsigned Res = 0; +// for(unsigned K = 0; K < I.getType()->getTileNumDimensions(); K++) +// if(MDRead(I.getMetadata("nvvm.param.shape.d" + itostr(K))) > 1) +// Res++; +// return Res; +//} +//// Run +//bool TLVMAddTunerConstraints::runOnFunction(Function &F) { +// LLVMContext &Ctx = F.getContext(); + +// DenseMap Refs; +// for(Function::iterator::value_type &BB: F) +// for(Instruction &I : BB) +// if(isTLVMValue(&I)){ +// SmallVector, 4> MDs; +// I.getAllMetadata(MDs); +// for(auto &X: MDs){ +// if(MDRead(X.second)==1) +// continue; +// Instruction *&Ref = Refs[X.second]; +// if(!Ref || getNumGT1Dim(I) > getNumGT1Dim(*Ref)) +// Ref = &I; +// } +// } +// SmallVector Grids; +// for(auto &R: Refs) +// if(std::find(Grids.begin(), Grids.end(), R.second) == Grids.end()) +// Grids.push_back(R.second); + + +// Instruction *FirstTile = Grids.front(); +// for(Instruction *I: Grids){ +// Type *Ty = I->getType(); +// size_t NumDim = Ty->getTileNumDimensions(); + +// // For each dimension, the product of layout components +// // must divide shape +// for(size_t K = 0; K < NumDim; K++){ +// unsigned Shape = MDRead(I->getMetadata("nvvm.param.shape.d" + itostr(K))); +// unsigned S0 = MDRead(I->getMetadata("nvvm.param.layout.p0.d" + itostr(K))); +// unsigned S1 = MDRead(I->getMetadata("nvvm.param.layout.p1.d" + itostr(K))); +// unsigned S2 = MDRead(I->getMetadata("nvvm.param.layout.p2.d" + itostr(K))); +// bool Constraint = Shape % (S0*S1*S2)== 0; +// Constant *Cst = Constraint?ConstantInt::getTrue(Ctx):ConstantInt::getFalse(Ctx); +// I->setMetadata("nvvm.constraint.shape.d" + itostr(K), MDNode::get(Ctx, ConstantAsMetadata::get(Cst))); +// }; +// // The number of threads per warp is 32 +// { +// int NumThreads = 1; +// for(size_t K = 0; K < NumDim; K++){ +// unsigned PC = MDRead(I->getMetadata("nvvm.param.layout.p1.d" + itostr(K))); +// NumThreads *= PC; +// } +// bool Constraint = NumThreads==32; +// Constant *Cst = Constraint?ConstantInt::getTrue(Ctx):ConstantInt::getFalse(Ctx); +// I->setMetadata("nvvm.constraint.threads", MDNode::get(Ctx, ConstantAsMetadata::get(Cst))); +// } +// // The number of warps required by the layout is the same +// // for all tiles in the function +// { +// int NumWarps = 1; +// int RefNumWarps = 1; +// for(size_t K = 0; K < NumDim; K++){ +// unsigned PC = 
MDRead(I->getMetadata("nvvm.param.layout.p2.d" + itostr(K))); +// unsigned PR = MDRead(FirstTile->getMetadata("nvvm.param.layout.p2.d" + itostr(K))); +// NumWarps *= PC; +// RefNumWarps *= PR; +// } +// bool Constraint = NumWarps==RefNumWarps; +// Constant *Cst = Constraint?ConstantInt::getTrue(Ctx):ConstantInt::getFalse(Ctx); +// I->setMetadata("nvvm.constraint.warps", MDNode::get(Ctx, ConstantAsMetadata::get(Cst))); +// }; +// } +// return true; +//} + + +//// Layout binding pass +//class TLVMAddTunerParams: public FunctionPass { +//private: +// enum CType{ +// Layout = 0, Shape = 1 +// }; +// // Params pool +// SmallVector LParamsPool; +// // Constraints +// typedef std::pair CNodeType; +// typedef DenseMap> CGraphType; +// // Layout constraints +// CGraphType LCGraph; +// DenseSet LCNodes; +// // Shape constraints +// CGraphType SCGraph; +// DenseSet SCNodes; +// // Relational +// std::map, std::function> ExtraParams; +// DenseSet Constants; + +// void addConstraint(CNodeType X, CNodeType Y, CType CT); +// void initCPhi(Instruction *I); +// void initCGraph(Instruction *V); +// void connectedComponents(CNodeType X, ArrayRef Vals, CType CT, DenseSet &Nodes, CGraphType &Graph); + +//public: +// static char ID; +// TLVMAddTunerParams(): FunctionPass(ID){ } + +// void getAnalysisUsage(AnalysisUsage & AU) const override; +// bool runOnFunction(Function &F) override; + +//private: +// std::map, Constant*> KnownParams; +//}; + +//// Initialization +//char TLVMAddTunerParams::ID = 0; +//INITIALIZE_PASS_BEGIN(TLVMAddTunerParams, "tlvm-add-tuner-parameters", +// "Add Tuner Parameters (TLVM)", false, true) +//INITIALIZE_PASS_END(TLVMAddTunerParams, "tlvm-add-tuner-parameters", +// "Add Tuner Parameters (TLVM)", false, true) +//FunctionPass *llvm::createTLVMAddTunerParamsPass() { return new TLVMAddTunerParams(); } + +//// Analysis usage +//void TLVMAddTunerParams::getAnalysisUsage(AnalysisUsage &AU) const { +// AU.setPreservesAll(); +// FunctionPass::getAnalysisUsage(AU); +//} + +//void TLVMAddTunerParams::addConstraint(CNodeType X, CNodeType Y, CType CT){ +// // Layout Constraint +// if(CT == Layout){ +// LCGraph[X].insert(Y); +// LCGraph[Y].insert(X); +// LCNodes.insert(X); +// LCNodes.insert(Y); +// } +// if(CT == Shape || CT == Layout){ +// SCGraph[X].insert(Y); +// SCGraph[Y].insert(X); +// SCNodes.insert(X); +// SCNodes.insert(Y); +// } +//} + +//void TLVMAddTunerParams::initCPhi(Instruction *I){ +// unsigned NumDim = 0; +// // Phi Nodes: all the incoming value share the result layout +// if(PHINode *Phi = dyn_cast(I)){ +// Type *Ty = Phi->getType(); +// NumDim = Ty->getTileNumDimensions(); +// unsigned NumInc = Phi->getNumIncomingValues(); +// for(unsigned PI = 0; PI < NumInc; PI++){ +// Value *Inc = Phi->getIncomingValue(PI); +// for(unsigned K = 0; K < NumDim; K++){ +// CType CT = (LCGraph.find({Inc,K}) != LCGraph.end() || +// LCGraph.find({Phi,K}) != LCGraph.end())?Layout:Shape; +// addConstraint({Phi, K}, {Inc, K}, CT); +// } +// } +// } +//} + +//void TLVMAddTunerParams::initCGraph(Instruction *I) { +// unsigned NumDim = 0; +// LLVMContext &Context = I->getContext(); +// Constant *_1 = ConstantInt::get(Type::getInt32Ty(Context), 1); +// // Function call +// if(CallInst *Call = dyn_cast(I)) +// if(Function *Callee = Call->getCalledFunction()){ +// Intrinsic::ID IntrinsicID = Callee->getIntrinsicID(); +// switch (IntrinsicID) { +// // Outer +// case Intrinsic::tlvm_outer_add: LLVM_FALLTHROUGH; +// case Intrinsic::tlvm_outer_and: { +// addConstraint({Call, 0}, {Call->getOperand(0), 0}, 
Layout); +// addConstraint({Call, 1}, {Call->getOperand(1), 0}, Layout); +// break; +// } +// // Slice +// case Intrinsic::tlvm_read_slice_x: LLVM_FALLTHROUGH; +// case Intrinsic::tlvm_read_slice_y: { +// addConstraint({Call, 0}, {Call->getOperand(0), 0}, Shape); +// break; +// } +// // Range +// case Intrinsic::tlvm_range: { +// addConstraint({Call, 0}, {Call->getOperand(1), 0}, Shape); +// break; +// } +// // GetTilePtr +// case Intrinsic::tlvm_gtp_2d: NumDim++; LLVM_FALLTHROUGH; +// case Intrinsic::tlvm_gtp_1d: NumDim++; { +// Value *Offset = Call->getOperand(1); +// for(unsigned K = 0; K < NumDim; K++){ +// addConstraint({Call, K}, {Offset, K}, Layout); +// } +// break; +// } +// // SlideTilePtr: Pointer shares result layout +// case Intrinsic::tlvm_stp_2d: NumDim++; LLVM_FALLTHROUGH; +// case Intrinsic::tlvm_stp_1d: NumDim++; { +// for(unsigned K = 0; K < NumDim; K++){ +// addConstraint({Call, K}, {Call->getOperand(0), K}, Layout); +// addConstraint({Call, K}, {Call->getOperand(1), K}, Layout); +// } +// break; +// } +// // Transpose +// case Intrinsic::tlvm_transpose_2d: NumDim++; NumDim++; { +// Value *Op = Call->getOperand(0); +// addConstraint({Call, 0}, {Op, 1}, Shape); +// addConstraint({Call, 1}, {Op, 0}, Shape); +// break; +// } +// // Reshape +// case Intrinsic::tlvm_reshape_2d: NumDim++; NumDim++; { +// for(unsigned K = 0; K < NumDim; K++) +// addConstraint({Call, K}, {Call->getOperand(1 + K), 0}, Shape); +// break; +// } +// // Reshape distributed +// case Intrinsic::tlvm_reshape_2d_1d: NumDim++; NumDim++; { +// size_t Current = 0; +// for(unsigned K = 0; K < NumDim; K++){ +// if(Call->getOperand(1 + K) == _1) +// addConstraint({Call, K}, {_1, 0}, Layout); +// else +// addConstraint({Call, K}, {Call->getOperand(0), Current++}, Layout); +// } +// break; +// } +// // Broadcast +// case Intrinsic::tlvm_broadcast_2d: NumDim++; LLVM_FALLTHROUGH; +// case Intrinsic::tlvm_broadcast_1d: NumDim++; { +// for(unsigned K = 0; K < NumDim; K++) +// addConstraint({Call, K}, {Call->getOperand(1 + K), 0}, Shape); +// break; +// } +// // Splat +// case Intrinsic::tlvm_splat_2d: NumDim++; LLVM_FALLTHROUGH; +// case Intrinsic::tlvm_splat_1d: NumDim++; { +// for(unsigned K = 0; K < NumDim; K++) +// addConstraint({Call, K}, {Call->getOperand(K), 0}, Shape); +// break; +// } + +// case Intrinsic::tlvm_load:{ +// NumDim = Call->getType()->getTileNumDimensions(); +// Value *Ptr = Call->getOperand(0); +// for(unsigned K = 0; K < NumDim; K++) +// addConstraint({Call, K}, {Ptr, K}, Layout); +// break; +// } + +// // Masked Load +// case Intrinsic::tlvm_masked_load: { +// NumDim = Call->getType()->getTileNumDimensions(); +// for(unsigned K = 0; K < NumDim; K++){ +// addConstraint({Call, K}, {Call->getOperand(0), K}, Layout); +// addConstraint({Call, K}, {Call->getOperand(1), K}, Layout); +// } +// break; +// } +// // Masked store +// case Intrinsic::tlvm_atomic_load_add_f32: LLVM_FALLTHROUGH; +// case Intrinsic::tlvm_masked_store: { +// Value *Val = Call->getOperand(0); +// Value *Ptr = Call->getOperand(1); +// Value *Mask = Call->getOperand(2); +// NumDim = Val->getType()->getTileNumDimensions(); +// for(unsigned K = 0; K < NumDim; K++){ +// addConstraint({Val, K}, {Ptr, K}, Layout); +// addConstraint({Val, K}, {Mask, K}, Layout); +// } +// break; +// } +// // Set Mask +// case Intrinsic::tlvm_set_mask_2d: NumDim++; NumDim++; { +// for(unsigned K = 0; K < NumDim; K++){ +// Value *Op = Call->getOperand(NumDim + K); +// addConstraint({Call, K}, {Op, 0}, Layout); +// } +// break; +// } +// // MMA +// // 
A shares first axis with C +// // B shares last axis with C +// case Intrinsic::tlvm_mma_nn: +// case Intrinsic::tlvm_mma_nt: +// case Intrinsic::tlvm_mma_tn: +// case Intrinsic::tlvm_mma_tt:{ +// bool AT = IntrinsicID == Intrinsic::tlvm_mma_tn || IntrinsicID == Intrinsic::tlvm_mma_tt; +// bool BT = IntrinsicID == Intrinsic::tlvm_mma_nt || IntrinsicID == Intrinsic::tlvm_mma_tt; +// Value *A = Call->getOperand(0); +// Value *B = Call->getOperand(1); +// Value *D = Call->getOperand(2); +// size_t AOuter = 0, AInner = 1; +// size_t BOuter = 1, BInner = 0; +// if(AT) std::swap(AOuter, AInner); +// if(BT) std::swap(BOuter, BInner); +// addConstraint({Call, 0}, {A, AOuter}, Shape); +// addConstraint({Call, 1}, {B, BOuter}, Shape); +// addConstraint({A, AInner}, {B, BInner}, Shape); +// addConstraint({Call, 0}, {D, 0}, Layout); +// addConstraint({Call, 1}, {D, 1}, Layout); +// break; +// } +// default: +// break; +// } +// } +// // LoadInst: Pointer shares the result layout +// if(LoadInst *Load = dyn_cast(I)){ +// NumDim = Load->getType()->getTileNumDimensions(); +// Value *Ptr = Load->getPointerOperand(); +// for(unsigned K = 0; K < NumDim; K++) +// addConstraint({Load, K}, {Ptr, K}, Layout); +// } +// // StoreInst: Pointer shares the value layout +// if(StoreInst *Store = dyn_cast(I)){ +// Value *Ptr = Store->getPointerOperand(); +// Value *Val = Store->getValueOperand(); +// NumDim = Val->getType()->getTileNumDimensions(); +// for(unsigned K = 0; K < NumDim; K++) +// addConstraint({Ptr, K}, {Val, K}, Layout); +// } +// // SelectInst: Selected tensor share layout +// if(SelectInst *Select = dyn_cast(I)){ +// NumDim = Select->getType()->getTileNumDimensions(); +// for(unsigned K = 0; K < NumDim; K++){ +// addConstraint({Select->getTrueValue(), K}, {Select, K}, Layout); +// addConstraint({Select->getFalseValue(), K}, {Select, K}, Layout); +// } +// } +// if(isa(I)){ +// NumDim = I->getType()->getTileNumDimensions(); +// for(unsigned K = 0; K < NumDim; K++){ +// addConstraint({I->getOperand(0), K}, {I, K}, Layout); +// } +// } +// // Phi Nodes: all the incoming value share the result layout +// if(PHINode *Phi = dyn_cast(I)){ +// Type *Ty = Phi->getType(); +// NumDim = Ty->getTileNumDimensions(); +// unsigned NumInc = Phi->getNumIncomingValues(); +// for(unsigned PI = 0; PI < NumInc; PI++){ +// Value *Inc = Phi->getIncomingValue(PI); +// for(unsigned K = 0; K < NumDim; K++){ +// CType CT = (LCGraph.find({Inc,K}) != LCGraph.end() || +// LCGraph.find({Phi,K}) != LCGraph.end())?Layout:Shape; +// addConstraint({Phi, K}, {Inc, K}, CT); +// } +// } +// } +// // Binary op: All the arguments share the result layout +// Instruction *BinOp = static_cast(I); +// if(isa(BinOp) || isa(BinOp)){ +// NumDim = BinOp->getType()->getTileNumDimensions(); +// Value *A = BinOp->getOperand(0); +// Value *B = BinOp->getOperand(1); +// for(unsigned K = 0; K < NumDim; K++){ +// addConstraint({BinOp, K}, {A, K}, Layout); +// addConstraint({BinOp, K}, {B, K}, Layout); +// } +// } +//} + +//void TLVMAddTunerParams::connectedComponents(CNodeType X, ArrayRef Vals, CType CT, +// DenseSet &Nodes, CGraphType &Graph){ +// if(Nodes.find(X) != Nodes.end()){ +// Nodes.erase(X); +// std::string Suffix = ".d" + itostr(X.second); +// if(Instruction *Instr = dyn_cast(X.first)){ +// if(CT==Shape){ +// Instr->setMetadata("nvvm.param.shape" + Suffix, Vals[0]); +// } +// if(CT==Layout){ +// Instr->setMetadata("nvvm.param.layout.p0" + Suffix, Vals[0]); +// Instr->setMetadata("nvvm.param.layout.p1" + Suffix, Vals[1]); +// 
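
The MMA cases above encode standard GEMM shape algebra: for C = A x B, C's first axis ties to A's outer axis, C's second axis ties to B's outer axis, and the two inner (reduction) axes tie to each other, with the outer/inner roles swapped per transposition flag. The index bookkeeping in isolation (assumed names, not the committed pass):

    #include <cstddef>
    #include <cstdio>
    #include <utility>

    int main() {
      bool a_trans = false, b_trans = true;             // the "nt" variant
      std::size_t a_outer = 0, a_inner = 1;             // A stored (M, K) if untransposed
      std::size_t b_outer = 1, b_inner = 0;             // B stored (K, N) if untransposed
      if (a_trans) std::swap(a_outer, a_inner);
      if (b_trans) std::swap(b_outer, b_inner);
      // C.0 <-> A outer, C.1 <-> B outer, A inner <-> B inner (reduction)
      std::printf("C.0=A.%zu  C.1=B.%zu  A.%zu=B.%zu\n",
                  a_outer, b_outer, a_inner, b_inner);  // C.0=A.0 C.1=B.0 A.1=B.1
    }
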
Instr->setMetadata("nvvm.param.layout.p2" + Suffix, Vals[2]); +// } +// } +// if(ConstantInt *Cst = dyn_cast(X.first)){ +// Metadata *CstMD = ConstantAsMetadata::get(Cst); +// if(CT==Shape){ +// Vals[0]->replaceOperandWith(0, CstMD); +// } +// if(CT==Layout){ +// Vals[0]->replaceOperandWith(0, CstMD); +// Vals[1]->replaceOperandWith(0, CstMD); +// Vals[2]->replaceOperandWith(0, CstMD); +// } +// } +// for(CNodeType &E: Graph[X]) +// connectedComponents(E, Vals, CT, Nodes, Graph); +// } +//} + +//// Run +//bool TLVMAddTunerParams::runOnFunction(Function &F) { +// // Build constraints graph +// for(Function::iterator::value_type &BB: F) +// for(BasicBlock::iterator::value_type &I : BB) +// if(isTLVMValue(&I)) +// initCGraph(&I); +// for(Function::iterator::value_type &BB: F) +// for(BasicBlock::iterator::value_type &I : BB) +// if(isTLVMValue(&I)) +// initCPhi(&I); +// // Add parameters +// LLVMContext &Ctx = F.getContext(); +// Metadata *UndefMD = ConstantAsMetadata::get(UndefValue::get(Type::getInt32Ty(Ctx))); +// // Shape parameters +// while(!SCNodes.empty()){ +// MDNode *V0 = MDNode::getTemporary(Ctx, UndefMD).release(); +// connectedComponents(*SCNodes.begin(), {V0}, Shape, SCNodes, SCGraph); +// } +// // Layout parameters +// while(!LCNodes.empty()){ +// MDNode *V0 = MDNode::getTemporary(Ctx, UndefMD).release(); +// MDNode *V1 = MDNode::getTemporary(Ctx, UndefMD).release(); +// MDNode *V2 = MDNode::getTemporary(Ctx, UndefMD).release(); +// connectedComponents(*LCNodes.begin(), {V0, V1, V2}, Layout, LCNodes, LCGraph); +// } +// return true; +//} + +//} +//} From 58757b8f10b6e7fc236f346ed7bb41f71fbb7733 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 8 Jan 2019 12:39:25 -0500 Subject: [PATCH 043/494] [code generation] some more cleaning --- include/codegen/allocation.h | 3 - include/codegen/tune.h | 81 +++--- include/ir/type.h | 6 +- lib/codegen/allocation.cpp | 19 +- lib/codegen/liveness.cpp | 2 - lib/codegen/tune.cpp | 472 ++++++----------------------------- lib/ir/type.cpp | 28 +++ 7 files changed, 153 insertions(+), 458 deletions(-) diff --git a/include/codegen/allocation.h b/include/codegen/allocation.h index d23b98501..c6783cc5e 100644 --- a/include/codegen/allocation.h +++ b/include/codegen/allocation.h @@ -21,17 +21,14 @@ class loop_info; class allocation { public: // accessors - unsigned get_num_bytes(ir::value *x) const; unsigned get_offset(ir::value *x) const { return offsets_.at(x); } unsigned get_allocated_size() const { return allocated_size_; } - bool has_double_buffer(ir::value *x) const { return double_buffer_.find(x) != double_buffer_.end(); } // run void run(ir::function &fn); private: std::map offsets_; - std::set double_buffer_; std::map num_bytes_; size_t allocated_size_; // dependences diff --git a/include/codegen/tune.h b/include/codegen/tune.h index 98ba7e327..fcdce2266 100644 --- a/include/codegen/tune.h +++ b/include/codegen/tune.h @@ -1,56 +1,45 @@ -//#ifndef TDL_INCLUDE_IR_CODEGEN_TUNE_H -//#define TDL_INCLUDE_IR_CODEGEN_TUNE_H +#ifndef TDL_INCLUDE_IR_CODEGEN_TUNE_H +#define TDL_INCLUDE_IR_CODEGEN_TUNE_H -//namespace tdl{ -//namespace codegen{ +#include +#include +#include -//// Layout binding pass -//class TLVMAddTunerParams: public FunctionPass { -//private: -// enum CType{ -// Layout = 0, Shape = 1 -// }; -// // Params pool -// SmallVector LParamsPool; -// // Constraints -// typedef std::pair CNodeType; -// typedef DenseMap> CGraphType; -// // Layout constraints -// CGraphType LCGraph; -// DenseSet LCNodes; -// // Shape constraints -// 
CGraphType SCGraph; -// DenseSet SCNodes; -// // Relational -// std::map, std::function> ExtraParams; -// DenseSet Constants; +namespace tdl{ -// void addConstraint(CNodeType X, CNodeType Y, CType CT); -// void initCPhi(Instruction *I); -// void initCGraph(Instruction *V); -// void connectedComponents(CNodeType X, ArrayRef Vals, CType CT, DenseSet &Nodes, CGraphType &Graph); +namespace ir{ + class value; + class module; + class instruction; +} -//public: -// static char ID; -// TLVMAddTunerParams(): FunctionPass(ID){ } +namespace codegen{ -// void getAnalysisUsage(AnalysisUsage & AU) const override; -// bool runOnFunction(Function &F) override; +class tune { + typedef std::pair node_t; + typedef std::map > graph_t; -//private: -// std::map, Constant*> KnownParams; -//}; +private: + void add_constraint(node_t x, node_t y); + void init_c_phi(ir::instruction *i); + void init_c_graph(ir::instruction *v); + void connected_components(node_t x, const std::vector vals, std::set &nodes, graph_t &graph); -//class TLVMAddTunerConstraints: public FunctionPass { -//public: -// static char ID; -// TLVMAddTunerConstraints(): FunctionPass(ID){ } -// void getAnalysisUsage(AnalysisUsage & AU) const override; -// bool runOnFunction(Function &F) override; -//}; +public: + unsigned *get_param(ir::value *value); + bool check_constraints(std::map& errors); + void run(ir::module &mod); -//} -//} +private: + std::map> params_; + std::vector pool_; + graph_t dependencies_; + std::set nodes_; +}; -//#endif + +} +} + +#endif diff --git a/include/ir/type.h b/include/ir/type.h index ab7b20ef1..9f29b465b 100644 --- a/include/ir/type.h +++ b/include/ir/type.h @@ -51,7 +51,8 @@ public: // type attributes unsigned get_fp_mantissa_width() const; unsigned get_integer_bitwidth() const; - unsigned get_size_in_bits() const; + unsigned get_tile_bitwidth() const; + unsigned get_primitive_size_in_bits() const; type *get_scalar_ty() const; const std::vector &get_tile_shapes() const; unsigned get_tile_num_elements() const; @@ -137,7 +138,8 @@ private: public: // accessors const std::vector& get_shapes() const { return shapes_; } - unsigned get_num_elements(); + unsigned get_num_elements() const; + unsigned get_bitwidth() const; // factory methods static tile_type* get(type *ty, const std::vector &shapes); diff --git a/lib/codegen/allocation.cpp b/lib/codegen/allocation.cpp index 2dcc4fbc9..f67140bf5 100644 --- a/lib/codegen/allocation.cpp +++ b/lib/codegen/allocation.cpp @@ -1,7 +1,6 @@ #include "codegen/allocation.h" #include "codegen/liveness.h" #include "codegen/layout.h" -#include "codegen/loop_info.h" #include "ir/basic_block.h" #include "ir/type.h" #include "ir/value.h" @@ -11,27 +10,15 @@ namespace tdl{ namespace codegen{ -unsigned allocation::get_num_bytes(ir::value *x) const { - ir::type *ty = x->get_type(); - unsigned num_elements = ty->get_tile_num_elements(); - if(has_double_buffer(x)) - num_elements *= 2; - return num_elements * ty->get_scalar_ty()->get_size_in_bits(); -} - void allocation::run(ir::function &fn){ using std::max; using std::min; typedef std::multimap triples_map_type; - // Fill double buffering info - for(ir::basic_block *block: fn.blocks()) - for(ir::instruction *v: block->get_inst_list()) - // If requires shared memory - if(layout_->get_num_shared_views(v) && - loop_info_->get_loop_for(block)) - double_buffer_.insert(v); + auto get_num_bytes = [&](ir::value *x){ + return x->get_type()->get_tile_bitwidth(); + }; std::vector I; for(auto x: liveness_->intervals()) diff --git a/lib/codegen/liveness.cpp 
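
The tune interface above reduces auto-tuning to a graph problem: nodes are (value, dimension) pairs, an edge means "these two dimensions must share tuning parameters", and every connected component receives one fresh parameter set. The component walk in isolation, with integer nodes standing in for node_t = pair<ir::value*, unsigned>:

    #include <cstdio>
    #include <map>
    #include <set>

    typedef std::map<int, std::set<int>> graph_t;

    void component(int x, int label, std::set<int> &nodes, graph_t &g,
                   std::map<int, int> &out) {
      if (!nodes.count(x)) return;                      // already labeled
      nodes.erase(x);
      out[x] = label;                                   // same label => same parameters
      for (int y : g[x]) component(y, label, nodes, g, out);
    }

    int main() {
      graph_t g = {{0, {1}}, {1, {0, 2}}, {2, {1}}, {3, {}}};
      std::set<int> nodes = {0, 1, 2, 3};
      std::map<int, int> labels;
      int next_label = 0;
      while (!nodes.empty())
        component(*nodes.begin(), next_label++, nodes, g, labels);
      for (auto &kv : labels)
        std::printf("node %d -> parameter set #%d\n", kv.first, kv.second);
    }
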
b/lib/codegen/liveness.cpp index 0e56aac03..9e910b420 100644 --- a/lib/codegen/liveness.cpp +++ b/lib/codegen/liveness.cpp @@ -11,7 +11,6 @@ namespace codegen{ // Entry point void liveness::run(ir::function *fn) { - // Assigns index to each instruction slot_index index = 0; for(ir::basic_block *block: fn->blocks()) @@ -19,7 +18,6 @@ void liveness::run(ir::function *fn) { index += 1; indices_.insert({instr, index}); } - // Liveness analysis // Creates live intervals for(auto i: indices_){ diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 6e646a4e3..a548e0a1d 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -1,49 +1,93 @@ -//#include "codegen/tune.h" +#include "codegen/tune.h" +#include "ir/instructions.h" +#include "ir/type.h" +#include "ir/module.h" +#include "ir/function.h" +#include -//namespace tdl{ -//namespace codegen{ +namespace tdl{ +namespace codegen{ +void tune::add_constraint(node_t x, node_t y) { + dependencies_[x].insert(y); + dependencies_[y].insert(x); + nodes_.insert(x); + nodes_.insert(y); +} -//// Layout binding pass -//class TLVMAddTunerConstraints: public FunctionPass { -//public: -// static char ID; -// TLVMAddTunerConstraints(): FunctionPass(ID){ } +void tune::init_c_phi(ir::instruction *v) { + // Phi Nodes: all the incoming value share the result layout + if(auto *phi = dynamic_cast(v)) + for(ir::value *inc: phi->ops()) + for(unsigned k = 0; k < phi->get_type()->get_tile_shapes().size(); k++) + if(dependencies_.find({inc, k}) != dependencies_.end() + || dependencies_.find({phi, k}) != dependencies_.end()) + add_constraint({phi, k}, {inc, k}); +} -// void getAnalysisUsage(AnalysisUsage & AU) const override; -// bool runOnFunction(Function &F) override; -//}; +void tune::init_c_graph(ir::instruction *v) { + unsigned num_dim = v->get_type()->get_tile_shapes().size(); + if(dynamic_cast(v)){ -//// Initialization -//char TLVMAddTunerConstraints::ID = 0; -//INITIALIZE_PASS_BEGIN(TLVMAddTunerConstraints, "tlvm-add-tuner-constraints", -// "Add Tuner Constraints (TLVM)", false, true) -//INITIALIZE_PASS_END(TLVMAddTunerConstraints, "tlvm-add-tuner-constraints", -// "Add Tuner Constraints (TLVM)", false, true) -//FunctionPass *llvm::createTLVMAddTunerConstraintsPass() { return new TLVMAddTunerConstraints(); } + } + else if(dynamic_cast(v)){ -//// Analysis usage -//void TLVMAddTunerConstraints::getAnalysisUsage(AnalysisUsage &AU) const { -// AU.setPreservesAll(); -// FunctionPass::getAnalysisUsage(AU); -//} + } + else if(dynamic_cast(v)){ + } + else if(auto *ii = dynamic_cast(v)){ + ir::value *D = ii->get_operand(2); + add_constraint({v, 0}, {D, 0}); + add_constraint({v, 1}, {D, 1}); + } + else if(dynamic_cast(v)) + for(unsigned i = 0; i < num_dim; i ++) + for(ir::value* op: v->ops()) + add_constraint({v, i}, {op, i}); +} -//inline unsigned MDRead(MDNode* Node){ -// Metadata *MD = Node->getOperand(0).get(); -// Constant *Cst = ((ConstantAsMetadata*)MD)->getValue(); -// unsigned Result = Cst->getUniqueInteger().getZExtValue(); -// return Result; -//} +void tune::connected_components(node_t x, const std::vector vals, std::set &nodes, graph_t &graph) { + if(nodes.find(x) != nodes.end()){ + nodes.erase(x); + std::string suffix = ".d" + std::to_string(x.second); + if(auto *instr = dynamic_cast(x.first)){ + params_[instr].insert({"p0" + suffix, vals[0]}); + params_[instr].insert({"p1" + suffix, vals[1]}); + params_[instr].insert({"p2" + suffix, vals[2]}); + } + for(const node_t &y: graph[x]) + connected_components(y, vals, nodes, graph); + } +} + +void 
tune::run(ir::module &mod) { + for(ir::function *fn: mod.get_function_list()){ + // Build constraints graph + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *i : block->get_inst_list()) + if(i->get_type()->is_tile_ty()) + init_c_graph(i); + // Build phi constraints + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *i : block->get_inst_list()) + if(i->get_type()->is_tile_ty()) + init_c_phi(i); + // Layout parameters + while(!nodes_.empty()){ + unsigned *v0 = new unsigned(0); + unsigned *v1 = new unsigned(0); + unsigned *v2 = new unsigned(0); + connected_components(*nodes_.begin(), {v0, v1, v2}, nodes_, dependencies_); + } + } +} + +bool tune::check_constraints(std::map &errors) { + + return true; +} -//inline unsigned getNumGT1Dim(Instruction &I){ -// unsigned Res = 0; -// for(unsigned K = 0; K < I.getType()->getTileNumDimensions(); K++) -// if(MDRead(I.getMetadata("nvvm.param.shape.d" + itostr(K))) > 1) -// Res++; -// return Res; -//} -//// Run //bool TLVMAddTunerConstraints::runOnFunction(Function &F) { // LLVMContext &Ctx = F.getContext(); @@ -114,355 +158,5 @@ //} -//// Layout binding pass -//class TLVMAddTunerParams: public FunctionPass { -//private: -// enum CType{ -// Layout = 0, Shape = 1 -// }; -// // Params pool -// SmallVector LParamsPool; -// // Constraints -// typedef std::pair CNodeType; -// typedef DenseMap> CGraphType; -// // Layout constraints -// CGraphType LCGraph; -// DenseSet LCNodes; -// // Shape constraints -// CGraphType SCGraph; -// DenseSet SCNodes; -// // Relational -// std::map, std::function> ExtraParams; -// DenseSet Constants; - -// void addConstraint(CNodeType X, CNodeType Y, CType CT); -// void initCPhi(Instruction *I); -// void initCGraph(Instruction *V); -// void connectedComponents(CNodeType X, ArrayRef Vals, CType CT, DenseSet &Nodes, CGraphType &Graph); - -//public: -// static char ID; -// TLVMAddTunerParams(): FunctionPass(ID){ } - -// void getAnalysisUsage(AnalysisUsage & AU) const override; -// bool runOnFunction(Function &F) override; - -//private: -// std::map, Constant*> KnownParams; -//}; - -//// Initialization -//char TLVMAddTunerParams::ID = 0; -//INITIALIZE_PASS_BEGIN(TLVMAddTunerParams, "tlvm-add-tuner-parameters", -// "Add Tuner Parameters (TLVM)", false, true) -//INITIALIZE_PASS_END(TLVMAddTunerParams, "tlvm-add-tuner-parameters", -// "Add Tuner Parameters (TLVM)", false, true) -//FunctionPass *llvm::createTLVMAddTunerParamsPass() { return new TLVMAddTunerParams(); } - -//// Analysis usage -//void TLVMAddTunerParams::getAnalysisUsage(AnalysisUsage &AU) const { -// AU.setPreservesAll(); -// FunctionPass::getAnalysisUsage(AU); -//} - -//void TLVMAddTunerParams::addConstraint(CNodeType X, CNodeType Y, CType CT){ -// // Layout Constraint -// if(CT == Layout){ -// LCGraph[X].insert(Y); -// LCGraph[Y].insert(X); -// LCNodes.insert(X); -// LCNodes.insert(Y); -// } -// if(CT == Shape || CT == Layout){ -// SCGraph[X].insert(Y); -// SCGraph[Y].insert(X); -// SCNodes.insert(X); -// SCNodes.insert(Y); -// } -//} - -//void TLVMAddTunerParams::initCPhi(Instruction *I){ -// unsigned NumDim = 0; -// // Phi Nodes: all the incoming value share the result layout -// if(PHINode *Phi = dyn_cast(I)){ -// Type *Ty = Phi->getType(); -// NumDim = Ty->getTileNumDimensions(); -// unsigned NumInc = Phi->getNumIncomingValues(); -// for(unsigned PI = 0; PI < NumInc; PI++){ -// Value *Inc = Phi->getIncomingValue(PI); -// for(unsigned K = 0; K < NumDim; K++){ -// CType CT = (LCGraph.find({Inc,K}) != LCGraph.end() || -// 
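
check_constraints is only a stub at this point (the next commit fills it in), but the p0/p1/p2 scheme already fixes the rules it will enforce: per dimension, p0*p1*p2 must divide the tile shape, and the p1 values across dimensions must multiply to the warp size. A worked check under those assumptions:

    #include <cstdio>

    int main() {
      // One 2-D tile of shape 32x32 and a candidate parameter assignment.
      unsigned shape[2] = {32, 32};
      unsigned p0[2] = {2, 1}, p1[2] = {8, 4}, p2[2] = {2, 4};
      bool ok = true;
      for (int d = 0; d < 2; d++) {
        unsigned prod = p0[d] * p1[d] * p2[d];          // layout footprint in dim d
        ok = ok && (shape[d] % prod == 0);              // 32 % 32 == 0, 32 % 16 == 0
      }
      unsigned threads = p1[0] * p1[1];                 // threads per warp
      ok = ok && (threads == 32);
      std::printf("threads/warp=%u valid=%d\n", threads, (int)ok);  // 32, 1
    }
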
LCGraph.find({Phi,K}) != LCGraph.end())?Layout:Shape; -// addConstraint({Phi, K}, {Inc, K}, CT); -// } -// } -// } -//} - -//void TLVMAddTunerParams::initCGraph(Instruction *I) { -// unsigned NumDim = 0; -// LLVMContext &Context = I->getContext(); -// Constant *_1 = ConstantInt::get(Type::getInt32Ty(Context), 1); -// // Function call -// if(CallInst *Call = dyn_cast(I)) -// if(Function *Callee = Call->getCalledFunction()){ -// Intrinsic::ID IntrinsicID = Callee->getIntrinsicID(); -// switch (IntrinsicID) { -// // Outer -// case Intrinsic::tlvm_outer_add: LLVM_FALLTHROUGH; -// case Intrinsic::tlvm_outer_and: { -// addConstraint({Call, 0}, {Call->getOperand(0), 0}, Layout); -// addConstraint({Call, 1}, {Call->getOperand(1), 0}, Layout); -// break; -// } -// // Slice -// case Intrinsic::tlvm_read_slice_x: LLVM_FALLTHROUGH; -// case Intrinsic::tlvm_read_slice_y: { -// addConstraint({Call, 0}, {Call->getOperand(0), 0}, Shape); -// break; -// } -// // Range -// case Intrinsic::tlvm_range: { -// addConstraint({Call, 0}, {Call->getOperand(1), 0}, Shape); -// break; -// } -// // GetTilePtr -// case Intrinsic::tlvm_gtp_2d: NumDim++; LLVM_FALLTHROUGH; -// case Intrinsic::tlvm_gtp_1d: NumDim++; { -// Value *Offset = Call->getOperand(1); -// for(unsigned K = 0; K < NumDim; K++){ -// addConstraint({Call, K}, {Offset, K}, Layout); -// } -// break; -// } -// // SlideTilePtr: Pointer shares result layout -// case Intrinsic::tlvm_stp_2d: NumDim++; LLVM_FALLTHROUGH; -// case Intrinsic::tlvm_stp_1d: NumDim++; { -// for(unsigned K = 0; K < NumDim; K++){ -// addConstraint({Call, K}, {Call->getOperand(0), K}, Layout); -// addConstraint({Call, K}, {Call->getOperand(1), K}, Layout); -// } -// break; -// } -// // Transpose -// case Intrinsic::tlvm_transpose_2d: NumDim++; NumDim++; { -// Value *Op = Call->getOperand(0); -// addConstraint({Call, 0}, {Op, 1}, Shape); -// addConstraint({Call, 1}, {Op, 0}, Shape); -// break; -// } -// // Reshape -// case Intrinsic::tlvm_reshape_2d: NumDim++; NumDim++; { -// for(unsigned K = 0; K < NumDim; K++) -// addConstraint({Call, K}, {Call->getOperand(1 + K), 0}, Shape); -// break; -// } -// // Reshape distributed -// case Intrinsic::tlvm_reshape_2d_1d: NumDim++; NumDim++; { -// size_t Current = 0; -// for(unsigned K = 0; K < NumDim; K++){ -// if(Call->getOperand(1 + K) == _1) -// addConstraint({Call, K}, {_1, 0}, Layout); -// else -// addConstraint({Call, K}, {Call->getOperand(0), Current++}, Layout); -// } -// break; -// } -// // Broadcast -// case Intrinsic::tlvm_broadcast_2d: NumDim++; LLVM_FALLTHROUGH; -// case Intrinsic::tlvm_broadcast_1d: NumDim++; { -// for(unsigned K = 0; K < NumDim; K++) -// addConstraint({Call, K}, {Call->getOperand(1 + K), 0}, Shape); -// break; -// } -// // Splat -// case Intrinsic::tlvm_splat_2d: NumDim++; LLVM_FALLTHROUGH; -// case Intrinsic::tlvm_splat_1d: NumDim++; { -// for(unsigned K = 0; K < NumDim; K++) -// addConstraint({Call, K}, {Call->getOperand(K), 0}, Shape); -// break; -// } - -// case Intrinsic::tlvm_load:{ -// NumDim = Call->getType()->getTileNumDimensions(); -// Value *Ptr = Call->getOperand(0); -// for(unsigned K = 0; K < NumDim; K++) -// addConstraint({Call, K}, {Ptr, K}, Layout); -// break; -// } - -// // Masked Load -// case Intrinsic::tlvm_masked_load: { -// NumDim = Call->getType()->getTileNumDimensions(); -// for(unsigned K = 0; K < NumDim; K++){ -// addConstraint({Call, K}, {Call->getOperand(0), K}, Layout); -// addConstraint({Call, K}, {Call->getOperand(1), K}, Layout); -// } -// break; -// } -// // Masked store -// case 
Intrinsic::tlvm_atomic_load_add_f32: LLVM_FALLTHROUGH; -// case Intrinsic::tlvm_masked_store: { -// Value *Val = Call->getOperand(0); -// Value *Ptr = Call->getOperand(1); -// Value *Mask = Call->getOperand(2); -// NumDim = Val->getType()->getTileNumDimensions(); -// for(unsigned K = 0; K < NumDim; K++){ -// addConstraint({Val, K}, {Ptr, K}, Layout); -// addConstraint({Val, K}, {Mask, K}, Layout); -// } -// break; -// } -// // Set Mask -// case Intrinsic::tlvm_set_mask_2d: NumDim++; NumDim++; { -// for(unsigned K = 0; K < NumDim; K++){ -// Value *Op = Call->getOperand(NumDim + K); -// addConstraint({Call, K}, {Op, 0}, Layout); -// } -// break; -// } -// // MMA -// // A shares first axis with C -// // B shares last axis with C -// case Intrinsic::tlvm_mma_nn: -// case Intrinsic::tlvm_mma_nt: -// case Intrinsic::tlvm_mma_tn: -// case Intrinsic::tlvm_mma_tt:{ -// bool AT = IntrinsicID == Intrinsic::tlvm_mma_tn || IntrinsicID == Intrinsic::tlvm_mma_tt; -// bool BT = IntrinsicID == Intrinsic::tlvm_mma_nt || IntrinsicID == Intrinsic::tlvm_mma_tt; -// Value *A = Call->getOperand(0); -// Value *B = Call->getOperand(1); -// Value *D = Call->getOperand(2); -// size_t AOuter = 0, AInner = 1; -// size_t BOuter = 1, BInner = 0; -// if(AT) std::swap(AOuter, AInner); -// if(BT) std::swap(BOuter, BInner); -// addConstraint({Call, 0}, {A, AOuter}, Shape); -// addConstraint({Call, 1}, {B, BOuter}, Shape); -// addConstraint({A, AInner}, {B, BInner}, Shape); -// addConstraint({Call, 0}, {D, 0}, Layout); -// addConstraint({Call, 1}, {D, 1}, Layout); -// break; -// } -// default: -// break; -// } -// } -// // LoadInst: Pointer shares the result layout -// if(LoadInst *Load = dyn_cast(I)){ -// NumDim = Load->getType()->getTileNumDimensions(); -// Value *Ptr = Load->getPointerOperand(); -// for(unsigned K = 0; K < NumDim; K++) -// addConstraint({Load, K}, {Ptr, K}, Layout); -// } -// // StoreInst: Pointer shares the value layout -// if(StoreInst *Store = dyn_cast(I)){ -// Value *Ptr = Store->getPointerOperand(); -// Value *Val = Store->getValueOperand(); -// NumDim = Val->getType()->getTileNumDimensions(); -// for(unsigned K = 0; K < NumDim; K++) -// addConstraint({Ptr, K}, {Val, K}, Layout); -// } -// // SelectInst: Selected tensor share layout -// if(SelectInst *Select = dyn_cast(I)){ -// NumDim = Select->getType()->getTileNumDimensions(); -// for(unsigned K = 0; K < NumDim; K++){ -// addConstraint({Select->getTrueValue(), K}, {Select, K}, Layout); -// addConstraint({Select->getFalseValue(), K}, {Select, K}, Layout); -// } -// } -// if(isa(I)){ -// NumDim = I->getType()->getTileNumDimensions(); -// for(unsigned K = 0; K < NumDim; K++){ -// addConstraint({I->getOperand(0), K}, {I, K}, Layout); -// } -// } -// // Phi Nodes: all the incoming value share the result layout -// if(PHINode *Phi = dyn_cast(I)){ -// Type *Ty = Phi->getType(); -// NumDim = Ty->getTileNumDimensions(); -// unsigned NumInc = Phi->getNumIncomingValues(); -// for(unsigned PI = 0; PI < NumInc; PI++){ -// Value *Inc = Phi->getIncomingValue(PI); -// for(unsigned K = 0; K < NumDim; K++){ -// CType CT = (LCGraph.find({Inc,K}) != LCGraph.end() || -// LCGraph.find({Phi,K}) != LCGraph.end())?Layout:Shape; -// addConstraint({Phi, K}, {Inc, K}, CT); -// } -// } -// } -// // Binary op: All the arguments share the result layout -// Instruction *BinOp = static_cast(I); -// if(isa(BinOp) || isa(BinOp)){ -// NumDim = BinOp->getType()->getTileNumDimensions(); -// Value *A = BinOp->getOperand(0); -// Value *B = BinOp->getOperand(1); -// for(unsigned K = 0; K 
< NumDim; K++){
-//      addConstraint({BinOp, K}, {A, K}, Layout);
-//      addConstraint({BinOp, K}, {B, K}, Layout);
-//    }
-//  }
-//}
-
-//void TLVMAddTunerParams::connectedComponents(CNodeType X, ArrayRef<MDNode *> Vals, CType CT,
-//                                             DenseSet<CNodeType> &Nodes, CGraphType &Graph){
-//  if(Nodes.find(X) != Nodes.end()){
-//    Nodes.erase(X);
-//    std::string Suffix = ".d" + itostr(X.second);
-//    if(Instruction *Instr = dyn_cast<Instruction>(X.first)){
-//      if(CT==Shape){
-//        Instr->setMetadata("nvvm.param.shape" + Suffix, Vals[0]);
-//      }
-//      if(CT==Layout){
-//        Instr->setMetadata("nvvm.param.layout.p0" + Suffix, Vals[0]);
-//        Instr->setMetadata("nvvm.param.layout.p1" + Suffix, Vals[1]);
-//        Instr->setMetadata("nvvm.param.layout.p2" + Suffix, Vals[2]);
-//      }
-//    }
-//    if(ConstantInt *Cst = dyn_cast<ConstantInt>(X.first)){
-//      Metadata *CstMD = ConstantAsMetadata::get(Cst);
-//      if(CT==Shape){
-//        Vals[0]->replaceOperandWith(0, CstMD);
-//      }
-//      if(CT==Layout){
-//        Vals[0]->replaceOperandWith(0, CstMD);
-//        Vals[1]->replaceOperandWith(0, CstMD);
-//        Vals[2]->replaceOperandWith(0, CstMD);
-//      }
-//    }
-//    for(CNodeType &E: Graph[X])
-//      connectedComponents(E, Vals, CT, Nodes, Graph);
-//  }
-//}
-
-//// Run
-//bool TLVMAddTunerParams::runOnFunction(Function &F) {
-//  // Build constraints graph
-//  for(Function::iterator::value_type &BB: F)
-//  for(BasicBlock::iterator::value_type &I : BB)
-//  if(isTLVMValue(&I))
-//    initCGraph(&I);
-//  for(Function::iterator::value_type &BB: F)
-//  for(BasicBlock::iterator::value_type &I : BB)
-//  if(isTLVMValue(&I))
-//    initCPhi(&I);
-//  // Add parameters
-//  LLVMContext &Ctx = F.getContext();
-//  Metadata *UndefMD = ConstantAsMetadata::get(UndefValue::get(Type::getInt32Ty(Ctx)));
-//  // Shape parameters
-//  while(!SCNodes.empty()){
-//    MDNode *V0 = MDNode::getTemporary(Ctx, UndefMD).release();
-//    connectedComponents(*SCNodes.begin(), {V0}, Shape, SCNodes, SCGraph);
-//  }
-//  // Layout parameters
-//  while(!LCNodes.empty()){
-//    MDNode *V0 = MDNode::getTemporary(Ctx, UndefMD).release();
-//    MDNode *V1 = MDNode::getTemporary(Ctx, UndefMD).release();
-//    MDNode *V2 = MDNode::getTemporary(Ctx, UndefMD).release();
-//    connectedComponents(*LCNodes.begin(), {V0, V1, V2}, Layout, LCNodes, LCGraph);
-//  }
-//  return true;
-//}
-
-//}
-//}
+}
+}
diff --git a/lib/ir/type.cpp b/lib/ir/type.cpp
index 50ffa7c23..98ec78508 100644
--- a/lib/ir/type.cpp
+++ b/lib/ir/type.cpp
@@ -18,9 +18,26 @@ type *type::get_scalar_ty() const {
   return const_cast<type*>(this);
 }
 
+unsigned type::get_primitive_size_in_bits() const {
+  switch (id_) {
+  case HalfTyID: return 16;
+  case FloatTyID: return 32;
+  case DoubleTyID: return 64;
+  case X86_FP80TyID: return 80;
+  case FP128TyID: return 128;
+  case PPC_FP128TyID: return 128;
+  case IntegerTyID: return ((integer_type*)(this))->get_bitwidth();
+  case TileTyID: return ((tile_type*)(this))->get_bitwidth();
+  default: return 0;
+  }
+}
+
 unsigned type::get_integer_bitwidth() const
 { return ((integer_type*)(this))->get_bitwidth(); }
 
+unsigned type::get_tile_bitwidth() const
+{ return ((tile_type*)(this))->get_bitwidth(); }
+
 unsigned type::get_fp_mantissa_width() const {
   id_t id = get_scalar_ty()->id_;
   assert(is_floating_point_ty() && "Not a floating point type!");
@@ -140,6 +157,17 @@ bool tile_type::is_valid_elt_ty(type *ty) {
   return ty->is_pointer_ty() || ty->is_floating_point_ty() || ty->is_integer_ty();
 }
 
+unsigned tile_type::get_num_elements() const {
+  unsigned res = 1;
+  for(unsigned shape: shapes_)
+    res *= shape;
+  return res;
+}
+
+unsigned tile_type::get_bitwidth() const {
+  return
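// Illustration (not part of this patch): for a tile declared as fp32 X[16, 8],
// shapes_ is {16, 8}, so get_num_elements() = 16 * 8 = 128 and the product
// below gives get_bitwidth() = 128 * 32 = 4096 bits.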
get_num_elements() * get_tile_element_ty()->get_primitive_size_in_bits();
+}
+
 tile_type* tile_type::get(type *elt_ty, const std::vector<unsigned> &shapes) {
   assert(elt_ty && "Can't get a tile of <null> type!");
   assert(shapes.size() && "Can't create a tile with empty shapes!");

From 7a14693f5148d3b9b4683af06c6f87f9bdf0718a Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Tue, 8 Jan 2019 15:57:45 -0500
Subject: [PATCH 044/494] [code generation] added constraints checking

---
 include/codegen/tune.h |   2 +-
 lib/codegen/tune.cpp   | 138 ++++++++++++++++++++---------------------
 2 files changed, 67 insertions(+), 73 deletions(-)

diff --git a/include/codegen/tune.h b/include/codegen/tune.h
index fcdce2266..02c1d46b6 100644
--- a/include/codegen/tune.h
+++ b/include/codegen/tune.h
@@ -28,7 +28,7 @@ private:
 
 public:
   unsigned *get_param(ir::value *value);
-  bool check_constraints(std::map& errors);
+  bool check_constraints(ir::module &mod, std::map<ir::value *, std::vector<std::string>> &errors);
   void run(ir::module &mod);
 
 private:
diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp
index a548e0a1d..e77a13773 100644
--- a/lib/codegen/tune.cpp
+++ b/lib/codegen/tune.cpp
@@ -5,6 +5,7 @@
 #include "ir/function.h"
 #include
 
+
 namespace tdl{
 namespace codegen{
 
@@ -83,80 +84,73 @@ void tune::run(ir::module &mod) {
   }
 }
 
-bool tune::check_constraints(std::map &errors) {
+bool tune::check_constraints(ir::module &mod, std::map<ir::value *, std::vector<std::string>> &errors) {
+for(ir::function *fn: mod.get_function_list()){
+  /* grids */
+  auto get_tile_gt1_dim = [&](ir::value *v){
+    unsigned result = 0;
+    for(unsigned shape: v->get_type()->get_tile_shapes()) {
+      result += (shape > 1)?shape:0;
+    }
+    return result;
+  };
+  using std::to_string;
+  std::map<unsigned *, ir::instruction *> references;
+  for(ir::basic_block *block: fn->blocks())
+  for(ir::instruction *i: block->get_inst_list()){
+    if(!i->get_type()->is_tile_ty())
+      continue;
+    for(auto &param: params_.at(i)){
+      if(*param.second == 1)
+        continue;
+      ir::instruction *&r = references[param.second];
+      if(!r || get_tile_gt1_dim(i) > get_tile_gt1_dim(r))
+        r = i;
+    }
+  }
+  // extract unique instructions in order
+  std::vector<ir::instruction *> grids;
+  for(auto &ref: references)
+    if(std::find(grids.begin(), grids.end(), ref.second) == grids.end())
+      grids.push_back(ref.second);
 
-  return true;
+  // number of warps
+  int num_warps = 1;
+  for(size_t k = 0; k < grids.front()->get_type()->get_tile_shapes().size(); k++)
+    num_warps *= *params_[grids.front()]["p2.d" + to_string(k)];
+  // check constraints
+  for(ir::instruction *i: grids){
+    ir::type *ty = i->get_type();
+    const auto &shapes = ty->get_tile_shapes();
+    // for each dimension, the product of layout components
+    // must divide the shape
+    for(size_t k = 0; k < shapes.size(); k++) {
+      std::string strk = to_string(k);
+      unsigned *s0 = params_[i]["p0.d" + strk];
+      unsigned *s1 = params_[i]["p1.d" + strk];
+      unsigned *s2 = params_[i]["p2.d" + strk];
+      unsigned multiple = (*s0)*(*s1)*(*s2);
+      if(shapes[k] % multiple != 0)
+        errors[i].push_back("for dim " + strk + ": shape (" + to_string(shapes[k]) + ")"
+                            " is not a multiple of layout (" + to_string(multiple) + ")");
+    }
+    // the number of threads per warp must be 32
+    int num_threads = 1;
+    for(size_t k = 0; k < shapes.size(); k++)
+      num_threads *= *params_[i]["p1.d" + to_string(k)];
+    if(num_threads != 32)
+      errors[i].push_back("number of threads per warp (" + to_string(num_threads) + ") must be 32");
+    // The number of warps required by the layout is the same
+    // for all tiles in the function
+    int required_num_warps = 1;
+    for(size_t k = 0; k < shapes.size(); k++)
+      required_num_warps *=
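// Illustration (not part of this patch, hypothetical parameter values): for a
// 2D tile of shape [32, 32] with p0.d0=2, p1.d0=8, p2.d0=2 and p0.d1=1,
// p1.d1=4, p2.d1=4, each dimension satisfies shape % (p0*p1*p2) == 0
// (32 % 32 and 32 % 16), the threads-per-warp product is p1.d0*p1.d1 = 8*4 = 32
// as required, and the warp count accumulated below is p2.d0*p2.d1 = 2*4 = 8,
// which must agree across all tiles in the function: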
*params_[i]["p2.d" + to_string(k)]; + if(required_num_warps != num_warps) + errors[i].push_back("number of warps (" + to_string(required_num_warps) + ") must be " + to_string(num_warps)); + } + return errors.empty(); +} } - -//bool TLVMAddTunerConstraints::runOnFunction(Function &F) { -// LLVMContext &Ctx = F.getContext(); - -// DenseMap Refs; -// for(Function::iterator::value_type &BB: F) -// for(Instruction &I : BB) -// if(isTLVMValue(&I)){ -// SmallVector, 4> MDs; -// I.getAllMetadata(MDs); -// for(auto &X: MDs){ -// if(MDRead(X.second)==1) -// continue; -// Instruction *&Ref = Refs[X.second]; -// if(!Ref || getNumGT1Dim(I) > getNumGT1Dim(*Ref)) -// Ref = &I; -// } -// } -// SmallVector Grids; -// for(auto &R: Refs) -// if(std::find(Grids.begin(), Grids.end(), R.second) == Grids.end()) -// Grids.push_back(R.second); - - -// Instruction *FirstTile = Grids.front(); -// for(Instruction *I: Grids){ -// Type *Ty = I->getType(); -// size_t NumDim = Ty->getTileNumDimensions(); - -// // For each dimension, the product of layout components -// // must divide shape -// for(size_t K = 0; K < NumDim; K++){ -// unsigned Shape = MDRead(I->getMetadata("nvvm.param.shape.d" + itostr(K))); -// unsigned S0 = MDRead(I->getMetadata("nvvm.param.layout.p0.d" + itostr(K))); -// unsigned S1 = MDRead(I->getMetadata("nvvm.param.layout.p1.d" + itostr(K))); -// unsigned S2 = MDRead(I->getMetadata("nvvm.param.layout.p2.d" + itostr(K))); -// bool Constraint = Shape % (S0*S1*S2)== 0; -// Constant *Cst = Constraint?ConstantInt::getTrue(Ctx):ConstantInt::getFalse(Ctx); -// I->setMetadata("nvvm.constraint.shape.d" + itostr(K), MDNode::get(Ctx, ConstantAsMetadata::get(Cst))); -// }; -// // The number of threads per warp is 32 -// { -// int NumThreads = 1; -// for(size_t K = 0; K < NumDim; K++){ -// unsigned PC = MDRead(I->getMetadata("nvvm.param.layout.p1.d" + itostr(K))); -// NumThreads *= PC; -// } -// bool Constraint = NumThreads==32; -// Constant *Cst = Constraint?ConstantInt::getTrue(Ctx):ConstantInt::getFalse(Ctx); -// I->setMetadata("nvvm.constraint.threads", MDNode::get(Ctx, ConstantAsMetadata::get(Cst))); -// } -// // The number of warps required by the layout is the same -// // for all tiles in the function -// { -// int NumWarps = 1; -// int RefNumWarps = 1; -// for(size_t K = 0; K < NumDim; K++){ -// unsigned PC = MDRead(I->getMetadata("nvvm.param.layout.p2.d" + itostr(K))); -// unsigned PR = MDRead(FirstTile->getMetadata("nvvm.param.layout.p2.d" + itostr(K))); -// NumWarps *= PC; -// RefNumWarps *= PR; -// } -// bool Constraint = NumWarps==RefNumWarps; -// Constant *Cst = Constraint?ConstantInt::getTrue(Ctx):ConstantInt::getFalse(Ctx); -// I->setMetadata("nvvm.constraint.warps", MDNode::get(Ctx, ConstantAsMetadata::get(Cst))); -// }; -// } -// return true; -//} - } } From 73db84c8bacf7bf5b8ed3d164bbbd292992ed9bd Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 8 Jan 2019 17:44:31 -0500 Subject: [PATCH 045/494] [syntax tree] fixed broadcast semantics lowering --- examples/matrix.cpp | 15 ++- include/ast/ast.h | 5 + include/codegen/allocation.h | 2 +- include/codegen/layout.h | 4 +- include/codegen/liveness.h | 3 +- lib/ast/lowering.cpp | 232 ++++++++++++++++++----------------- lib/codegen/allocation.cpp | 2 +- lib/codegen/layout.cpp | 11 +- lib/codegen/liveness.cpp | 5 +- lib/codegen/tune.cpp | 2 + 10 files changed, 153 insertions(+), 128 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 78c8d7194..3a4877d36 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -4,6 
+4,7 @@ #include "ir/context.h" #include "ir/module.h" #include "codegen/selection.h" +#include "codegen/tune.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/Module.h" #include "llvm/IR/LLVMContext.h" @@ -20,6 +21,8 @@ extern translation_unit *ast_root; const char src[] = "\ void test(fp32 *A, fp32 *B, fp32 *C, int32 i){\ + int32 tile[16, 16] = 0;\ + int32 test[16, 16] = tile + i;\ i = 1;\ A = A + i;\ }\ @@ -35,10 +38,14 @@ int main() { program->codegen(&module); llvm::LLVMContext llvm_context; llvm::Module llvm_module("test", llvm_context); + // lowering passes tdl::codegen::selection selection; - selection.run(module, llvm_module); - llvm::PrintModulePass print(llvm::outs()); - llvm::AnalysisManager analysis; - print.run(llvm_module, analysis); + tdl::codegen::tune tune; + tune.run(module); +// selection.run(module, llvm_module); +// // print LLVM program +// llvm::PrintModulePass print(llvm::outs()); +// llvm::AnalysisManager analysis; +// print.run(llvm_module, analysis); return 0; } diff --git a/include/ast/ast.h b/include/ast/ast.h index d9b24f8e4..2ab5c02cc 100644 --- a/include/ast/ast.h +++ b/include/ast/ast.h @@ -61,6 +61,11 @@ class identifier; // AST class node { +protected: + static ir::value* explicit_cast(ir::builder &builder, ir::value *src, ir::type *dst_ty); + static void implicit_broadcast(ir::module *mod, ir::value *&lhs, ir::value *&rhs); + static void implicit_cast(ir::builder &builder, ir::value *&lhs, ir::value *&rhs, + bool &is_float, bool &is_ptr, bool &is_int, bool &is_signed); public: virtual ir::value* codegen(ir::module *) const { return nullptr; } }; diff --git a/include/codegen/allocation.h b/include/codegen/allocation.h index c6783cc5e..b93322539 100644 --- a/include/codegen/allocation.h +++ b/include/codegen/allocation.h @@ -25,7 +25,7 @@ public: unsigned get_allocated_size() const { return allocated_size_; } // run - void run(ir::function &fn); + void run(); private: std::map offsets_; diff --git a/include/codegen/layout.h b/include/codegen/layout.h index 1fd6deeda..d63a5dbe2 100644 --- a/include/codegen/layout.h +++ b/include/codegen/layout.h @@ -7,7 +7,7 @@ namespace tdl { namespace ir { - class function; + class module; class instruction; class value; } @@ -32,7 +32,7 @@ public: shared_view_info get_shared_view(ir::value *v, unsigned idx); // run - bool run(ir::function &fn); + void run(ir::module &mod); private: std::map shared_views_; diff --git a/include/codegen/liveness.h b/include/codegen/liveness.h index 8a6806c2e..c7cfb5959 100644 --- a/include/codegen/liveness.h +++ b/include/codegen/liveness.h @@ -8,6 +8,7 @@ namespace tdl{ namespace ir{ class value; class function; + class module; } namespace codegen{ @@ -47,7 +48,7 @@ public: segment get_interval(ir::value* v) const { return intervals_.at(v); } // run - void run(ir::function *fn); + void run(ir::module &mod); private: has_storage_map_t has_dedicated_storage_; diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 9dcbb2826..52d7a4a2e 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -14,6 +14,118 @@ namespace tdl{ namespace ast{ +/* node */ +ir::value *node::explicit_cast(ir::builder &builder, ir::value *src, ir::type *dst_ty){ + ir::type *src_ty = src->get_type()->get_scalar_ty(); + bool src_signed = false; + bool dst_signed = false; + if(src_ty == dst_ty) + return src; + else if(src_ty->is_integer_ty() && src_signed && dst_ty->is_floating_point_ty()) + return builder.create_si_to_fp(src, dst_ty); + + else if(src_ty->is_integer_ty() && !src_signed && 
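// Illustration (not part of this patch): this is the branch mixed int/fp
// arithmetic ends up in. implicit_cast() below selects the integer operand of,
// say, an int32 + fp32 addition, and because src_signed is hard-coded to false
// above, the conversion is emitted with create_ui_to_fp rather than
// create_si_to_fp.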
dst_ty->is_floating_point_ty()) + return builder.create_ui_to_fp(src, dst_ty); + + else if(src_ty->is_floating_point_ty() && dst_ty->is_integer_ty() && dst_signed) + return builder.create_fp_to_si(src, dst_ty); + + else if(src_ty->is_floating_point_ty() && dst_ty->is_integer_ty() && !dst_signed) + return builder.create_fp_to_ui(src, dst_ty); + + else if(src_ty->is_floating_point_ty() && dst_ty->is_floating_point_ty() && + src_ty->get_fp_mantissa_width() < dst_ty->get_fp_mantissa_width()) + return builder.create_fp_ext(src, dst_ty); + + else if(src_ty->is_floating_point_ty() && dst_ty->is_floating_point_ty() && + src_ty->get_fp_mantissa_width() > dst_ty->get_fp_mantissa_width()) + return builder.create_fp_trunc(src, dst_ty); + + else if(src_ty->is_integer_ty() && dst_ty->is_integer_ty() && + src_ty->get_integer_bitwidth()) + return builder.create_int_cast(src, dst_ty, dst_signed); + + else + throw std::runtime_error("unreachable"); +} + + +void node::implicit_cast(ir::builder &builder, ir::value *&lhs, ir::value *&rhs, + bool &is_float, bool &is_ptr, bool &is_int, bool &is_signed){ + // Input types + ir::type *left_ty = lhs->get_type()->get_scalar_ty(); + ir::type *right_ty = rhs->get_type()->get_scalar_ty(); + // One operand is pointer + if(left_ty->is_pointer_ty()){ + is_ptr = true; + } + // One operand is double + else if(left_ty->is_double_ty() || right_ty->is_double_ty()){ + ir::value *&to_convert = left_ty->is_double_ty()?rhs:lhs; + to_convert = explicit_cast(builder, to_convert, builder.get_double_ty()); + is_float = true; + } + // One operand is float + else if(left_ty->is_float_ty() || right_ty->is_float_ty()){ + ir::value *&to_convert = left_ty->is_float_ty()?rhs:lhs; + to_convert = explicit_cast(builder, to_convert, builder.get_float_ty()); + is_float = true; + } + // Both operands are integers + else if(left_ty->is_integer_ty() && right_ty->is_integer_ty()){ + is_int = true; + is_signed = false; + if(left_ty->get_integer_bitwidth() != right_ty->get_integer_bitwidth()){ + ir::value *&to_convert = (left_ty->get_integer_bitwidth() > right_ty->get_integer_bitwidth())?rhs:lhs; + ir::type *dst_ty = (to_convert==lhs)?right_ty:left_ty; + to_convert = explicit_cast(builder, to_convert, dst_ty); + } + } + // Not reachable + else + throw std::runtime_error("unreachable"); +} + +void node::implicit_broadcast(ir::module *mod, ir::value *&lhs, ir::value *&rhs){ + ir::builder &builder = mod->get_builder(); + ir::type *lhs_ty = lhs->get_type(); + ir::type *rhs_ty = rhs->get_type(); + // Both are scalar + if(!lhs_ty->is_tile_ty() && !rhs_ty->is_tile_ty()) + return; + // One argument is scalar + if(lhs_ty->is_tile_ty() ^ rhs_ty->is_tile_ty()){ + auto &shapes = lhs_ty->is_tile_ty()?lhs_ty->get_tile_shapes():rhs_ty->get_tile_shapes(); + auto &scalar = lhs_ty->is_tile_ty()?rhs:lhs; + scalar = builder.create_splat(scalar, shapes); + return; + } + // Both are arrays + std::vector lhs_shapes = lhs->get_type()->get_tile_shapes(); + std::vector rhs_shapes = rhs->get_type()->get_tile_shapes(); + int lhs_dim = lhs_shapes.size(); + int rhs_dim = rhs_shapes.size(); + std::vector &shortest = (lhs_dim < rhs_dim)?lhs_shapes:rhs_shapes; + std::vector &longest = (lhs_dim < rhs_dim)?rhs_shapes:lhs_shapes; + size_t ndim = longest.size(); + int off = longest.size() - shortest.size(); + for(int i = longest.size(); i>= 0; i--){ + if(shortest[off + i] != longest[i]) + throw std::runtime_error("cannot broadcast"); + } + // Pad + for(size_t i = 0; i < off; i++) + shortest.insert(shortest.begin(), 1); + ir::value *&target 
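// Illustration (not part of this patch): broadcasting fp32 a[16, 8] against
// fp32 b[8] gives off = 1, so b's shape list is padded to {1, 8}; the reshape
// just below materializes that, and the two create_broadcast calls at the end
// stretch both operands to the common shape {16, 8}.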
= (lhs_dim < rhs_dim)?lhs:rhs; + target = builder.create_reshape(target, shortest); + // Broadcast + std::vector shapes(ndim); + for(size_t i = 0; i < ndim; i++) + shapes[i] = std::max(shortest[i], longest[i]); + lhs = builder.create_broadcast(lhs, shapes); + rhs = builder.create_broadcast(rhs, shapes); +} + /* Translation unit */ ir::value* translation_unit::codegen(ir::module *mod) const{ decls_->codegen(mod); @@ -195,11 +307,12 @@ void initializer::specifier(const declaration_specifier *spec) { ir::value* initializer::codegen(ir::module * mod) const{ ir::type *ty = decl_->type(mod, spec_->type(mod)); std::string name = decl_->id()->name(); - ir::value *value; - if(expr_) - value = expr_->codegen(mod); - else - value = ir::undef_value::get(ty); + ir::value *value = ir::undef_value::get(ty); + if(expr_){ + ir::value* target = expr_->codegen(mod); + explicit_cast(mod->get_builder(), target, ty->get_scalar_ty()); + implicit_broadcast(mod, value, target); + } value->set_name(name); mod->set_value(name, value); return value; @@ -208,119 +321,12 @@ ir::value* initializer::codegen(ir::module * mod) const{ /*------------------*/ /* Expression */ /*------------------*/ -ir::value *llvm_cast(ir::builder &builder, ir::value *src, ir::type *dst_ty){ - ir::type *src_ty = src->get_type(); - bool src_signed = false; - bool dst_signed = false; - if(src_ty == dst_ty) - return src; - else if(src_ty->is_integer_ty() && src_signed && dst_ty->is_floating_point_ty()) - return builder.create_si_to_fp(src, dst_ty); - - else if(src_ty->is_integer_ty() && !src_signed && dst_ty->is_floating_point_ty()) - return builder.create_ui_to_fp(src, dst_ty); - - else if(src_ty->is_floating_point_ty() && dst_ty->is_integer_ty() && dst_signed) - return builder.create_fp_to_si(src, dst_ty); - - else if(src_ty->is_floating_point_ty() && dst_ty->is_integer_ty() && !dst_signed) - return builder.create_fp_to_ui(src, dst_ty); - - else if(src_ty->is_floating_point_ty() && dst_ty->is_floating_point_ty() && - src_ty->get_fp_mantissa_width() < dst_ty->get_fp_mantissa_width()) - return builder.create_fp_ext(src, dst_ty); - - else if(src_ty->is_floating_point_ty() && dst_ty->is_floating_point_ty() && - src_ty->get_fp_mantissa_width() > dst_ty->get_fp_mantissa_width()) - return builder.create_fp_trunc(src, dst_ty); - - else if(src_ty->is_integer_ty() && dst_ty->is_integer_ty() && - src_ty->get_integer_bitwidth()) - return builder.create_int_cast(src, dst_ty, dst_signed); - - else - throw std::runtime_error("unreachable"); -} - -inline void implicit_cast(ir::builder &builder, ir::value *&lhs, ir::value *&rhs, - bool &is_float, bool &is_ptr, bool &is_int, bool &is_signed){ - // Input types - ir::type *left_ty = lhs->get_type(); - ir::type *right_ty = rhs->get_type(); - // One operand is pointer - if(left_ty->is_pointer_ty()){ - is_ptr = true; - } - // One operand is double - else if(left_ty->is_double_ty() || right_ty->is_double_ty()){ - ir::value *&to_convert = left_ty->is_double_ty()?rhs:lhs; - to_convert = llvm_cast(builder, to_convert, builder.get_double_ty()); - is_float = true; - } - // One operand is float - else if(left_ty->is_float_ty() || right_ty->is_float_ty()){ - ir::value *&to_convert = left_ty->is_float_ty()?rhs:lhs; - to_convert = llvm_cast(builder, to_convert, builder.get_float_ty()); - is_float = true; - } - // Both operands are integers - else if(left_ty->is_integer_ty() && right_ty->is_integer_ty()){ - is_int = true; - is_signed = false; - if(left_ty->get_integer_bitwidth() != right_ty->get_integer_bitwidth()){ - 
ir::value *&to_convert = (left_ty->get_integer_bitwidth() > right_ty->get_integer_bitwidth())?rhs:lhs; - ir::type *dst_ty = (to_convert==lhs)?right_ty:left_ty; - to_convert = llvm_cast(builder, to_convert, dst_ty); - } - } - // Not reachable - else - throw std::runtime_error("unreachable"); -} - -inline void implicit_broadcast(ir::module *mod, ir::builder &builder, ir::value *&lhs, ir::value *&rhs){ - std::vector lhs_shapes = lhs->get_type()->get_tile_shapes(); - std::vector rhs_shapes = rhs->get_type()->get_tile_shapes(); - // Both are scalar - if(lhs_shapes.empty() && rhs_shapes.empty()) - return; - // One argument is scalar - if(!lhs_shapes.empty() ^ !rhs_shapes.empty()){ - auto &shapes = lhs_shapes.empty()?rhs_shapes:lhs_shapes; - auto &target = lhs_shapes.empty()?lhs:rhs; - target = builder.create_splat(target, shapes); - return; - } - // Both are arrays - int lhs_dim = lhs_shapes.size(); - int rhs_dim = rhs_shapes.size(); - std::vector &shortest = (lhs_dim < rhs_dim)?lhs_shapes:rhs_shapes; - std::vector &longest = (lhs_dim < rhs_dim)?rhs_shapes:lhs_shapes; - size_t ndim = longest.size(); - int off = longest.size() - shortest.size(); - for(int i = longest.size(); i>= 0; i--){ - if(shortest[off + i] != longest[i]) - throw std::runtime_error("cannot broadcast"); - } - // Pad - for(size_t i = 0; i < off; i++) - shortest.insert(shortest.begin(), 1); - ir::value *&target = (lhs_dim < rhs_dim)?lhs:rhs; - target = builder.create_reshape(target, shortest); - // Broadcast - std::vector shapes(ndim); - for(size_t i = 0; i < ndim; i++) - shapes[i] = std::max(shortest[i], longest[i]); - lhs = builder.create_broadcast(lhs, shapes); - rhs = builder.create_broadcast(rhs, shapes); -} - /* Binary operator */ ir::value *binary_operator::llvm_op(ir::module *mod, ir::builder &builder, ir::value *lhs, ir::value *rhs, const std::string &name) const { bool is_float = false, is_ptr = false, is_int = false, is_signed = false; implicit_cast(builder, lhs, rhs, is_float, is_ptr, is_int, is_signed); -// implicit_broadcast(mod, builder, lhs, rhs); + implicit_broadcast(mod, lhs, rhs); if(op_==MUL && is_float) return builder.create_fmul(lhs, rhs, name); if(op_==MUL && is_int) diff --git a/lib/codegen/allocation.cpp b/lib/codegen/allocation.cpp index f67140bf5..1730371ae 100644 --- a/lib/codegen/allocation.cpp +++ b/lib/codegen/allocation.cpp @@ -11,7 +11,7 @@ namespace tdl{ namespace codegen{ -void allocation::run(ir::function &fn){ +void allocation::run(){ using std::max; using std::min; typedef std::multimap triples_map_type; diff --git a/lib/codegen/layout.cpp b/lib/codegen/layout.cpp index cdddb1d17..58f81227a 100644 --- a/lib/codegen/layout.cpp +++ b/lib/codegen/layout.cpp @@ -1,5 +1,6 @@ #include "codegen/layout.h" #include "ir/function.h" +#include "ir/module.h" #include "ir/basic_block.h" #include "ir/instructions.h" @@ -36,19 +37,19 @@ void layout::add_shared_views(ir::value *v){ } // Entry point -bool layout::run(ir::function &fn) { +void layout::run(ir::module &mod) { +for(ir::function *fn: mod.get_function_list()){ // Non-phis - for(ir::basic_block *block: fn.blocks()) + for(ir::basic_block *block: fn->blocks()) for(ir::instruction *instr: block->get_inst_list()) { add_shared_views(instr); } // Phi nodes - for(ir::basic_block *block: fn.blocks()) + for(ir::basic_block *block: fn->blocks()) for(ir::instruction *instr: block->get_inst_list()) { add_phi_nodes(instr); } - // Done - return false; +} } } diff --git a/lib/codegen/liveness.cpp b/lib/codegen/liveness.cpp index 9e910b420..824c95590 100644 --- 
a/lib/codegen/liveness.cpp +++ b/lib/codegen/liveness.cpp @@ -2,6 +2,7 @@ #include "codegen/layout.h" #include "ir/basic_block.h" #include "ir/function.h" +#include "ir/module.h" #include "ir/instructions.h" #include "ir/value.h" @@ -10,7 +11,8 @@ namespace codegen{ // Entry point -void liveness::run(ir::function *fn) { +void liveness::run(ir::module &mod) { +for(ir::function *fn: mod.get_function_list()){ // Assigns index to each instruction slot_index index = 0; for(ir::basic_block *block: fn->blocks()) @@ -35,6 +37,7 @@ void liveness::run(ir::function *fn) { intervals_[v] = segment{start, end}; } } +} } } diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index e77a13773..670008389 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -108,6 +108,7 @@ for(ir::function *fn: mod.get_function_list()){ r = i; } } + // extract unique instructions in order std::vector grids; for(auto &ref: references) @@ -118,6 +119,7 @@ for(ir::function *fn: mod.get_function_list()){ int num_warps = 1; for(size_t k = 0; k < grids.front()->get_type()->get_tile_shapes().size(); k++) num_warps *= *params_[grids.front()]["p2.d" + to_string(k)]; + // check constraints for(ir::instruction *i: grids){ ir::type *ty = i->get_type(); From 7dfa578c9d4cb42130c75d22fba737e0b39a5c82 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 8 Jan 2019 18:04:19 -0500 Subject: [PATCH 046/494] [syntax tree] fixed bug in pointer arithmetic --- examples/matrix.cpp | 2 +- lib/ast/lowering.cpp | 6 +++++- lib/ir/instructions.cpp | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 3a4877d36..3dd6b0cfa 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -22,7 +22,7 @@ const char src[] = "\ void test(fp32 *A, fp32 *B, fp32 *C, int32 i){\ int32 tile[16, 16] = 0;\ - int32 test[16, 16] = tile + i;\ + fp32 *test[16, 16] = tile + A;\ i = 1;\ A = A + i;\ }\ diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 52d7a4a2e..6137e1022 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -56,7 +56,11 @@ void node::implicit_cast(ir::builder &builder, ir::value *&lhs, ir::value *&rhs, ir::type *left_ty = lhs->get_type()->get_scalar_ty(); ir::type *right_ty = rhs->get_type()->get_scalar_ty(); // One operand is pointer - if(left_ty->is_pointer_ty()){ + if(left_ty->is_pointer_ty() || right_ty->is_pointer_ty()){ + if(left_ty->is_pointer_ty() && right_ty->is_pointer_ty()) + throw std::runtime_error("invalid operands"); + if(right_ty->is_pointer_ty()) + std::swap(lhs, rhs); is_ptr = true; } // One operand is double diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index a46c4f036..627645a09 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -248,7 +248,7 @@ getelementptr_inst::getelementptr_inst(type *pointee_ty, value *ptr, const std:: type *getelementptr_inst::get_return_type(type *elt_ty, value *ptr, const std::vector &idx_list) { // result pointer type - type *ptr_ty = pointer_type::get(get_indexed_type(elt_ty, idx_list), ptr->get_type()->get_pointer_address_space()); + type *ptr_ty = pointer_type::get(get_indexed_type(elt_ty, idx_list), ptr->get_type()->get_scalar_ty()->get_pointer_address_space()); // Tile GEP if(ptr->get_type()->is_tile_ty()) return tile_type::get_same_shapes(ptr_ty, ptr->get_type()); From 4f923accd7990011d53d9041ad94e1471b065028 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 9 Jan 2019 02:07:34 -0500 Subject: [PATCH 047/494] [syntax tree] added basic support for range --- 
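For context, the two range constructs introduced here can be modeled on the host as plain index vectors. The sketch below is illustrative only and not code from this tree: the helper names are invented, and the mapping from a block's position to its global offset is ultimately up to code generation. It spells out the values that `0 ... 8` and `get_global_range[16](axis)` are expected to denote.

#include <cstdio>
#include <vector>

// Hypothetical host-side model: `first ... last` denotes the half-open
// integer range [first, last).
std::vector<int> model_constant_range(int first, int last) {
  std::vector<int> r;
  for (int i = first; i < last; ++i)
    r.push_back(i);
  return r;
}

// Hypothetical host-side model of get_global_range[size](axis): `size`
// consecutive global indices along `axis`, offset by the position of the
// executing block on that axis (assumed mapping).
std::vector<int> model_get_global_range(int block_pos, int size) {
  std::vector<int> r(size);
  for (int i = 0; i < size; ++i)
    r[i] = block_pos * size + i;
  return r;
}

int main() {
  // int32 rk[8] = 0 ... 8;  ->  0 1 2 3 4 5 6 7
  for (int v : model_constant_range(0, 8))
    std::printf("%d ", v);
  std::printf("\n");
  // int32 rx[16] = get_global_range[16](0); for the block at position 2
  // on axis 0  ->  32 33 ... 47
  for (int v : model_get_global_range(2, 16))
    std::printf("%d ", v);
  std::printf("\n");
  return 0;
}

In the IR itself these become constant_range and get_global_range_inst values of tile type; the helper loops above only enumerate the elements those tiles stand for.
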
examples/matrix.cpp | 11 ++++++----- include/ast/ast.h | 29 ++++++++++++++++++++++++++++- include/ast/parser.y | 18 ++++++++++++------ include/ast/scanner.l | 2 ++ include/codegen/tune.h | 1 + include/ir/builder.h | 2 ++ include/ir/constant.h | 12 ++++++++++++ include/ir/instructions.h | 19 ++++++++++++++++++- lib/ast/lowering.cpp | 12 ++++++++++++ lib/codegen/tune.cpp | 11 +++++++++++ lib/ir/builder.cpp | 6 ++++++ lib/ir/constant.cpp | 16 ++++++++++++++++ lib/ir/instructions.cpp | 34 +++++++++++++++++++++++++++++----- 13 files changed, 155 insertions(+), 18 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 3dd6b0cfa..7b0f5b6c0 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -20,11 +20,9 @@ extern translation_unit *ast_root; const char src[] = "\ -void test(fp32 *A, fp32 *B, fp32 *C, int32 i){\ - int32 tile[16, 16] = 0;\ - fp32 *test[16, 16] = tile + A;\ - i = 1;\ - A = A + i;\ +void test(fp32 *A, fp32 *B, fp32 *C, int32 M, int32 N, int32 K){\ + fp32 acc[16, 16] = 0;\ + fp32 *pa[16, 8] = A;\ }\ "; @@ -42,6 +40,9 @@ int main() { tdl::codegen::selection selection; tdl::codegen::tune tune; tune.run(module); + std::vector params; + tune.get_params(module, params); + std::cout << params.size() << std::endl; // selection.run(module, llvm_module); // // print LLVM program // llvm::PrintModulePass print(llvm::outs()); diff --git a/include/ast/ast.h b/include/ast/ast.h index 2ab5c02cc..3cda23fc7 100644 --- a/include/ast/ast.h +++ b/include/ast/ast.h @@ -58,6 +58,7 @@ enum TYPE_T{ class pointer; class identifier; +class constant; // AST class node { @@ -121,6 +122,21 @@ class postfix_expression: public expression{ }; +class builtin_expression: public node{ + +}; + + +class get_global_range: public builtin_expression{ +public: + get_global_range(node *size, node *axis): size_((constant*)size), axis_((constant*)axis) { } + ir::value* codegen(ir::module *) const; + +private: + const constant* size_; + const constant* axis_; +}; + class indexing_expression: public postfix_expression{ public: indexing_expression(node *id, node *ranges) @@ -133,7 +149,7 @@ private: const list* ranges_; }; -class unary_expression: public node{ +class unary_expression: public expression{ public: unary_expression(node *id): id_((const identifier*)id) {} const identifier *id() const; @@ -174,6 +190,17 @@ private: const int value_; }; +class constant_range: public expression { +public: + constant_range(node *first, node *last) + : first_((constant*)first), last_((constant*)last) { } + + ir::value* codegen(ir::module *mod) const; + +private: + constant *first_; + constant *last_; +}; class string_literal: public expression{ public: diff --git a/include/ast/parser.y b/include/ast/parser.y index 2806f49ea..9840b53b5 100644 --- a/include/ast/parser.y +++ b/include/ast/parser.y @@ -49,7 +49,8 @@ TYPE_T get_type_spec(node *op) { return ((token*)op)->type; } %token XOR_ASSIGN OR_ASSIGN TYPE_NAME %token VOID UINT8 UINT16 UINT32 UINT64 INT8 INT16 INT32 INT64 FP32 FP64 %token IF ELSE FOR -%token NEWAXIS +%token NEWAXIS ELLIPSIS +%token GET_GLOBAL_RANGE %start translation_unit %% @@ -87,7 +88,8 @@ direct_abstract_declarator : '[' constant_list ']' { $$ = new tile(nullptr, $1); } constant : - CONSTANT { $$ = new constant(atoi(yytext)); } + CONSTANT { $$ = new constant(atoi(yytext)); } + | constant ELLIPSIS constant { $$ = new constant_range($1, $2); } ; constant_list @@ -107,11 +109,15 @@ type_name identifier : IDENTIFIER { $$ = new identifier(yytext); } ; - + +builtin + : GET_GLOBAL_RANGE '[' 
constant ']' '(' constant ')' { $$ = new get_global_range($3, $6); } + primary_expression - : identifier { $$ = new named_expression($1); } - | constant { $$ = $1; } - | STRING_LITERAL { $$ = new string_literal(yytext); } + : identifier { $$ = new named_expression($1); } + | constant { $$ = $1; } + | builtin { $$ = $1; } + | STRING_LITERAL { $$ = new string_literal(yytext); } | '(' expression ')' { $$ = $1; } ; diff --git a/include/ast/scanner.l b/include/ast/scanner.l index 7d5d5984a..55366859c 100644 --- a/include/ast/scanner.l +++ b/include/ast/scanner.l @@ -31,6 +31,8 @@ int comment(); "int64" { count(); return(INT64); } "fp32" { count(); return(FP32); } "fp64" { count(); return(FP64); } +"..." { count(); return(ELLIPSIS); } +"get_global_range" { count(); return GET_GLOBAL_RANGE; } {L}({L}|{D})* { count(); return(check_type()); } diff --git a/include/codegen/tune.h b/include/codegen/tune.h index 02c1d46b6..f7ce1b10d 100644 --- a/include/codegen/tune.h +++ b/include/codegen/tune.h @@ -27,6 +27,7 @@ private: public: + void get_params(ir::module& mod, std::vector &result); unsigned *get_param(ir::value *value); bool check_constraints(ir::module &fn, std::map> &errors); void run(ir::module &mod); diff --git a/include/ir/builder.h b/include/ir/builder.h index 3e0ba4293..a96408f18 100644 --- a/include/ir/builder.h +++ b/include/ir/builder.h @@ -111,6 +111,8 @@ public: value *create_splat(value *arg, const std::vector &shapes, const std::string &name = ""); value *create_reshape(value *arg, const std::vector &shapes, const std::string &name = ""); value *create_broadcast(value *arg, const std::vector &shapes, const std::string &name = ""); + // Built-in instruction + value *create_get_global_range(unsigned axis, unsigned size, const std::string &name = ""); private: context &ctx_; diff --git a/include/ir/constant.h b/include/ir/constant.h index 57af4ad33..78814283c 100644 --- a/include/ir/constant.h +++ b/include/ir/constant.h @@ -40,6 +40,18 @@ private: uint64_t value_; }; +/* constant range */ +class constant_range: public constant{ + constant_range(type *ty, uint64_t first, uint64_t last); + +public: + static constant *get(constant *first, constant *last); + +private: + uint64_t first_; + uint64_t last_; +}; + /* constant fp */ class constant_fp: public constant{ constant_fp(context &ctx, double value); diff --git a/include/ir/instructions.h b/include/ir/instructions.h index 9d390b8cb..2b412769f 100644 --- a/include/ir/instructions.h +++ b/include/ir/instructions.h @@ -333,11 +333,28 @@ public: // matmul - class matmul_inst: public instruction { }; +// built-in +class builtin_inst: public instruction{ +protected: + using instruction::instruction; +}; + +class get_global_range_inst: public builtin_inst { + get_global_range_inst(type *ty, unsigned axis, const std::string &name, instruction *next); + +public: + static instruction* create(context &ctx, unsigned axis, unsigned size, + const std::string &name = "", + instruction *next = nullptr); + +private: + unsigned axis_; +}; + } } diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 6137e1022..c9ad43733 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -415,6 +415,13 @@ ir::value* binary_operator::codegen(ir::module *mod) const{ return result; } +/* Builtin expression */ +ir::value* get_global_range::codegen(ir::module *mod) const { + ir::builder &builder = mod->get_builder(); + return builder.create_get_global_range(axis_->value(), size_->value()); +} + + /* Postfix expression */ ir::value* 
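// Illustration (not part of this patch): this is what makes rx[:, newaxis]
// work. For rx of shape [16], indexing with (:, newaxis) yields
// out_shapes = {16, 1} and (newaxis, :) yields {1, 16}; either way the result
// is a single reshape of the input tile, with newaxis contributing a 1 and
// ':' consuming the next input dimension.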
indexing_expression::codegen(ir::module *mod) const{ ir::value *in = mod->get_value(id_->name()); @@ -509,6 +516,11 @@ int constant::value() const{ return value_; } +/* Constant range */ +ir::value* constant_range::codegen(ir::module *mod) const{ + return ir::constant_range::get((ir::constant*)first_->codegen(mod), + (ir::constant*)last_->codegen(mod)); +} /* Unary expression */ const identifier* unary_expression::id() const{ diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 670008389..fe935a059 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -62,6 +62,17 @@ void tune::connected_components(node_t x, const std::vector vals, st } } +void tune::get_params(ir::module &mod, std::vector &result) { + result.clear(); + std::set seen; + for(ir::function *fn: mod.get_function_list()) + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *i : block->get_inst_list()) + for(auto &x: params_[i]) + if(seen.insert(x.second).second) + result.push_back(x.second); +} + void tune::run(ir::module &mod) { for(ir::function *fn: mod.get_function_list()){ // Build constraints graph diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index ccdf49141..1cfdeefa3 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -246,7 +246,13 @@ value *builder::create_broadcast(value *arg, const std::vector &shapes return insert(broadcast_inst::create(arg, shapes, name)); } +//===----------------------------------------------------------------------===// +// built-in instructions +//===----------------------------------------------------------------------===// +value *builder::create_get_global_range(unsigned axis, unsigned size, const std::string &name) { + return insert(get_global_range_inst::create(ctx_, axis, size, name)); +} } } diff --git a/lib/ir/constant.cpp b/lib/ir/constant.cpp index 35e244613..f2779b75b 100644 --- a/lib/ir/constant.cpp +++ b/lib/ir/constant.cpp @@ -1,3 +1,4 @@ +#include #include "ir/constant.h" #include "ir/type.h" #include "ir/context.h" @@ -51,6 +52,21 @@ constant *constant_int::get(type *ty, uint64_t value) { return new constant_int(ty, value); } +// constant_range +// FIXME use something like APInt + +constant_range::constant_range(type *ty, uint64_t first, uint64_t last) + : constant(ty, 0), first_(first), last_(last){ } + +constant *constant_range::get(constant *first, constant *last) { + assert(first->get_type()->is_integer_ty()); + assert(first->get_type() == last->get_type()); + uint64_t vfirst = ((constant_int*)first)->get_value(); + uint64_t vlast = ((constant_int*)first)->get_value(); + return new constant_range(first->get_type(), vfirst, vlast); +} + + // constant_fp // FIXME use something like APFloat diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 627645a09..2424ed074 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -246,15 +246,17 @@ getelementptr_inst::getelementptr_inst(type *pointee_ty, value *ptr, const std:: set_operand(1 + i, idx[i]); } -type *getelementptr_inst::get_return_type(type *elt_ty, value *ptr, const std::vector &idx_list) { +type *getelementptr_inst::get_return_type(type *elt_ty, value *x, const std::vector &idx_list) { // result pointer type - type *ptr_ty = pointer_type::get(get_indexed_type(elt_ty, idx_list), ptr->get_type()->get_scalar_ty()->get_pointer_address_space()); + type *ty = x->get_type(); + unsigned addr_space = ty->get_scalar_ty()->get_pointer_address_space(); + type *ptr_ty = pointer_type::get(get_indexed_type(elt_ty, idx_list), addr_space); // Tile GEP - 
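// Illustration (not part of this patch): "Tile GEP" means pointer arithmetic
// distributes over tiles. If the base pointer or any index is a tile, the
// resulting pointer type is a tile of pointers with the same shapes, which is
// what lets
//   fp32* pa[16, 8] = a + rx[:, newaxis] + rka[newaxis, :]*M;
// from the example kernel type-check as a [16, 8] tile of fp32 pointers.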
if(ptr->get_type()->is_tile_ty()) - return tile_type::get_same_shapes(ptr_ty, ptr->get_type()); + if(ty->is_tile_ty()) + return tile_type::get_same_shapes(ptr_ty, ty); for(value *idx : idx_list) if (idx->get_type()->is_tile_ty()) - return tile_type::get_same_shapes(ptr_ty, idx->get_type()); + return tile_type::get_same_shapes(ptr_ty, ty); // Scalar GEP return ptr_ty; } @@ -329,5 +331,27 @@ instruction* broadcast_inst::create(value *arg, const std::vector &sha return new broadcast_inst(arg, shapes, name, next); } + +//===----------------------------------------------------------------------===// +// matmul_inst classes +//===----------------------------------------------------------------------===// + + +//===----------------------------------------------------------------------===// +// builtin instructions +//===----------------------------------------------------------------------===// +get_global_range_inst::get_global_range_inst(type *ty, unsigned axis, + const std::string &name, instruction *next) + : builtin_inst(ty, 0, name, next), axis_(axis) { + +} + +instruction* get_global_range_inst::create(context &ctx, unsigned axis, unsigned size, + const std::string &name, instruction *next) { + type *int_ty = type::get_int32_ty(ctx); + type *tile_ty = tile_type::get(int_ty, {size}); + return new get_global_range_inst(tile_ty, axis, name, next); +} + } } From 63459228f8f64c977cfad847bfacc1f6d1720651 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 9 Jan 2019 13:41:12 -0500 Subject: [PATCH 048/494] [syntax tree] added some slicing/retiling syntax --- examples/matrix.cpp | 6 +++++- include/ast/ast.h | 16 ++++++++-------- include/ast/parser.y | 22 +++++++++++----------- lib/ast/lowering.cpp | 12 +++++++----- 4 files changed, 31 insertions(+), 25 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 7b0f5b6c0..12f98e0ec 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -21,8 +21,12 @@ extern translation_unit *ast_root; const char src[] = "\ void test(fp32 *A, fp32 *B, fp32 *C, int32 M, int32 N, int32 K){\ + int32 rx[16] = get_global_range[16](0);\ + int32 ry[16] = get_global_range[16](1);\ + int32 rk[8] = 0 ... 
8;\ fp32 acc[16, 16] = 0;\ - fp32 *pa[16, 8] = A;\ + fp32 *pa[16, 8] = A + rx[:,newaxis] + rk[newaxis,:]*M;\ + fp32 *pb[16, 8] = B + ry[:,newaxis] + rk[newaxis,:]*K;\ }\ "; diff --git a/include/ast/ast.h b/include/ast/ast.h index 3cda23fc7..e3da73802 100644 --- a/include/ast/ast.h +++ b/include/ast/ast.h @@ -95,22 +95,22 @@ private: std::vector values_; }; -enum range_enum_t{ +enum slice_enum_t{ ALL, NEWAXIS }; -class range: public node{ +class slice: public node{ public: - range(range_enum_t type) + slice(slice_enum_t type) : type_(type){} - range_enum_t type() const{ + slice_enum_t type() const{ return type_; } public: - const range_enum_t type_; + const slice_enum_t type_; }; class expression: public node{ @@ -139,14 +139,14 @@ private: class indexing_expression: public postfix_expression{ public: - indexing_expression(node *id, node *ranges) - : id_((const identifier*)id), ranges_((const list*)ranges) {} + indexing_expression(node *id, node *slices) + : id_((const identifier*)id), slices_((const list*)slices) {} ir::value* codegen(ir::module *) const; private: const identifier* id_; - const list* ranges_; + const list* slices_; }; class unary_expression: public expression{ diff --git a/include/ast/parser.y b/include/ast/parser.y index 9840b53b5..351556ff7 100644 --- a/include/ast/parser.y +++ b/include/ast/parser.y @@ -89,7 +89,6 @@ direct_abstract_declarator constant : CONSTANT { $$ = new constant(atoi(yytext)); } - | constant ELLIPSIS constant { $$ = new constant_range($1, $2); } ; constant_list @@ -116,28 +115,29 @@ builtin primary_expression : identifier { $$ = new named_expression($1); } | constant { $$ = $1; } + | constant ELLIPSIS constant { $$ = new constant_range($1, $3); } | builtin { $$ = $1; } | STRING_LITERAL { $$ = new string_literal(yytext); } | '(' expression ')' { $$ = $1; } ; -range - : ':' { $$ = new range(tdl::ast::ALL); } - | NEWAXIS { $$ = new range(tdl::ast::NEWAXIS); } +slice + : ':' { $$ = new slice(tdl::ast::ALL); } + | NEWAXIS { $$ = new slice(tdl::ast::NEWAXIS); } -range_list - : range { $$ = new list((range*)$1); } - | range_list ',' range { $$ = append_ptr_list($1, $2); } +slice_list + : slice { $$ = new list((slice*)$1); } + | slice_list ',' slice { $$ = append_ptr_list($1, $3); } postfix_expression : primary_expression { $$ = $1;} - | identifier '[' range_list ']' { $$ = new indexing_expression($1, $2);} + | identifier '[' slice_list ']' { $$ = new indexing_expression($1, $3);} ; unary_expression - : postfix_expression { $$ = $1; } - | INC_OP unary_expression { $$ = new unary_operator(INC, $2); } - | DEC_OP unary_expression { $$ = new unary_operator(DEC, $2); } + : postfix_expression { $$ = $1; } + | INC_OP unary_expression { $$ = new unary_operator(INC, $2); } + | DEC_OP unary_expression { $$ = new unary_operator(DEC, $2); } | unary_operator cast_expression { $$ = new unary_operator(get_unary_op($1), $2); } ; diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index c9ad43733..5d660bfad 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -107,14 +107,16 @@ void node::implicit_broadcast(ir::module *mod, ir::value *&lhs, ir::value *&rhs) // Both are arrays std::vector lhs_shapes = lhs->get_type()->get_tile_shapes(); std::vector rhs_shapes = rhs->get_type()->get_tile_shapes(); + if(lhs_shapes == rhs_shapes) + return; int lhs_dim = lhs_shapes.size(); int rhs_dim = rhs_shapes.size(); std::vector &shortest = (lhs_dim < rhs_dim)?lhs_shapes:rhs_shapes; std::vector &longest = (lhs_dim < rhs_dim)?rhs_shapes:lhs_shapes; size_t ndim = 
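// Illustration (not part of this patch): with the relaxed test added just
// below, two dimensions are compatible when they are equal or when either one
// is 1. Shapes [16, 16] and [16, 1] now broadcast to [16, 16], while [16, 8]
// against [16, 2] still throws "cannot broadcast".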
longest.size(); int off = longest.size() - shortest.size(); - for(int i = longest.size(); i>= 0; i--){ - if(shortest[off + i] != longest[i]) + for(int i = longest.size() - 1; i>= 0; i--){ + if(shortest[off + i] != longest[i] && shortest[off + i] != 1 && longest[i] != 1) throw std::runtime_error("cannot broadcast"); } // Pad @@ -425,12 +427,12 @@ ir::value* get_global_range::codegen(ir::module *mod) const { /* Postfix expression */ ir::value* indexing_expression::codegen(ir::module *mod) const{ ir::value *in = mod->get_value(id_->name()); - const std::vector &ranges = ranges_->values(); + const std::vector &slices = slices_->values(); std::vector in_shapes = in->get_type()->get_tile_shapes(); - std::vector out_shapes(ranges.size()); + std::vector out_shapes(slices.size()); size_t current = 0; for(size_t i = 0; i < out_shapes.size(); i++) - out_shapes[i] = (ranges[i]->type()==NEWAXIS)?1:in_shapes[current++]; + out_shapes[i] = (slices[i]->type()==NEWAXIS)?1:in_shapes[current++]; return mod->get_builder().create_reshape(in, out_shapes); } From b5c8c25d4372c16fe116d2df2e055c51c27e1fd9 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 10 Jan 2019 16:50:47 -0500 Subject: [PATCH 049/494] more debugging --- examples/matrix.cpp | 12 ++++++++--- include/ir/instructions.h | 4 ++++ lib/ast/lowering.cpp | 9 +++++--- lib/codegen/tune.cpp | 45 ++++++++++++++++++++++++++++----------- lib/ir/instructions.cpp | 11 +++++++++- lib/ir/module.cpp | 5 +++-- 6 files changed, 65 insertions(+), 21 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 12f98e0ec..e2ea19527 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -24,9 +24,15 @@ void test(fp32 *A, fp32 *B, fp32 *C, int32 M, int32 N, int32 K){\ int32 rx[16] = get_global_range[16](0);\ int32 ry[16] = get_global_range[16](1);\ int32 rk[8] = 0 ... 
8;\ - fp32 acc[16, 16] = 0;\ - fp32 *pa[16, 8] = A + rx[:,newaxis] + rk[newaxis,:]*M;\ - fp32 *pb[16, 8] = B + ry[:,newaxis] + rk[newaxis,:]*K;\ + fp32 c[16, 16] = 0;\ + int32 k;\ + fp32* pa[16, 8] = A + rx[:, newaxis] + rk[newaxis, :]*M;\ + fp32* pb[16, 8] = B + ry[:, newaxis] + rk[newaxis, :]*K;\ + for(k = 0; k < K; k = k + 8){\ + fp32 a[16, 8] = *pa;\ + fp32 b[16, 8] = *pb;\ + pa = pa + 8;\ + }\ }\ "; diff --git a/include/ir/instructions.h b/include/ir/instructions.h index 2b412769f..e700bea04 100644 --- a/include/ir/instructions.h +++ b/include/ir/instructions.h @@ -279,8 +279,12 @@ private: //===----------------------------------------------------------------------===// class load_inst: public unary_inst{ +private: load_inst(value *ptr, const std::string &name, instruction *next); +private: + static type *get_pointee_type(type *ty); + public: // accessors value *get_pointer_operand() { return get_operand(0); } diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 5d660bfad..14a5249cc 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -123,13 +123,16 @@ void node::implicit_broadcast(ir::module *mod, ir::value *&lhs, ir::value *&rhs) for(size_t i = 0; i < off; i++) shortest.insert(shortest.begin(), 1); ir::value *&target = (lhs_dim < rhs_dim)?lhs:rhs; - target = builder.create_reshape(target, shortest); + if(off > 0) + target = builder.create_reshape(target, shortest); // Broadcast std::vector shapes(ndim); for(size_t i = 0; i < ndim; i++) shapes[i] = std::max(shortest[i], longest[i]); - lhs = builder.create_broadcast(lhs, shapes); - rhs = builder.create_broadcast(rhs, shapes); + if(shapes != lhs_shapes) + lhs = builder.create_broadcast(lhs, shapes); + if(shapes != rhs_shapes) + rhs = builder.create_broadcast(rhs, shapes); } /* Translation unit */ diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index fe935a059..fcc5930a5 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -19,33 +19,46 @@ void tune::add_constraint(node_t x, node_t y) { void tune::init_c_phi(ir::instruction *v) { // Phi Nodes: all the incoming value share the result layout if(auto *phi = dynamic_cast(v)) - for(ir::value *inc: phi->ops()) + for(ir::value *op: phi->ops()) for(unsigned k = 0; k < phi->get_type()->get_tile_shapes().size(); k++) - if(dependencies_.find({inc, k}) != dependencies_.end() - || dependencies_.find({phi, k}) != dependencies_.end()) - add_constraint({phi, k}, {inc, k}); + if(dependencies_.find({op, k}) != dependencies_.end() + || dependencies_.find({phi, k}) != dependencies_.end()){ + add_constraint({phi, k}, {op, k}); + } } void tune::init_c_graph(ir::instruction *v) { - unsigned num_dim = v->get_type()->get_tile_shapes().size(); + const auto& shapes = v->get_type()->get_tile_shapes(); if(dynamic_cast(v)){ - + ir::value *op = v->get_operand(0); + unsigned current = 0; + for(unsigned i = 0; i < shapes.size(); i ++) + if(shapes[i] > 1) + add_constraint({v, i}, {op, current++}); } else if(dynamic_cast(v)){ } else if(dynamic_cast(v)){ + ir::value *op = v->get_operand(0); + ir::type *op_ty = op->get_type(); + const auto& op_shapes = op_ty->get_tile_shapes(); + for(unsigned i = 0; i < shapes.size(); i ++){ + if(op_shapes[i] == shapes[i] && v != op) + add_constraint({v, i}, {op, i}); + } } - else if(auto *ii = dynamic_cast(v)){ - ir::value *D = ii->get_operand(2); + else if(dynamic_cast(v)){ + ir::value *D = v->get_operand(2); add_constraint({v, 0}, {D, 0}); add_constraint({v, 1}, {D, 1}); } - else if(dynamic_cast(v)) - for(unsigned i = 0; i < num_dim; i ++) + 
else if(dynamic_cast(v)){ + for(unsigned i = 0; i < shapes.size(); i ++) for(ir::value* op: v->ops()) add_constraint({v, i}, {op, i}); + } } void tune::connected_components(node_t x, const std::vector vals, std::set &nodes, graph_t &graph) { @@ -57,6 +70,11 @@ void tune::connected_components(node_t x, const std::vector vals, st params_[instr].insert({"p1" + suffix, vals[1]}); params_[instr].insert({"p2" + suffix, vals[2]}); } + if(auto *cst = dynamic_cast(x.first)){ + *vals[0] = cst->get_value(); + *vals[1] = cst->get_value(); + *vals[2] = cst->get_value(); + } for(const node_t &y: graph[x]) connected_components(y, vals, nodes, graph); } @@ -69,8 +87,10 @@ void tune::get_params(ir::module &mod, std::vector &result) { for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i : block->get_inst_list()) for(auto &x: params_[i]) - if(seen.insert(x.second).second) + if(seen.insert(x.second).second && *x.second == 0){ + std::cout << typeid(*i).name() << " " << i << std::endl; result.push_back(x.second); + } } void tune::run(ir::module &mod) { @@ -78,8 +98,9 @@ void tune::run(ir::module &mod) { // Build constraints graph for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i : block->get_inst_list()) - if(i->get_type()->is_tile_ty()) + if(i->get_type()->is_tile_ty()){ init_c_graph(i); + } // Build phi constraints for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i : block->get_inst_list()) diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 2424ed074..eb2483132 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -294,8 +294,17 @@ getelementptr_inst *getelementptr_inst::create(value *ptr, const std::vectorget_scalar_ty(); + type *pointee_ty = scalar_ty->get_pointer_element_ty(); + if(ty->is_tile_ty()) + return tile_type::get_same_shapes(pointee_ty, ty); + return pointee_ty; +} + load_inst::load_inst(value *ptr, const std::string &name, instruction *next) - : unary_inst(ptr->get_type()->get_pointer_element_ty(), ptr, name, next) { } + : unary_inst(get_pointee_type(ptr->get_type()), ptr, name, next) { +} load_inst* load_inst::create(value *ptr, const std::string &name, instruction *next) { return new load_inst(ptr, name, next); diff --git a/lib/ir/module.cpp b/lib/ir/module.cpp index ce5d478ff..532f1ec2f 100644 --- a/lib/ir/module.cpp +++ b/lib/ir/module.cpp @@ -31,10 +31,11 @@ void module::set_value(const std::string& name, ir::value *value){ ir::phi_node* module::make_phi(ir::type *ty, unsigned num_values, ir::basic_block *block){ basic_block::iterator insert = block->get_first_non_phi(); - if(*insert) + if(insert != block->end()){ builder_.set_insert_point(insert); + } ir::phi_node *res = builder_.create_phi(ty, num_values); - if(*insert) + if(insert != block->end()) builder_.set_insert_point(block); return res; } From 80d019ec16cc461bf9916d53a5f2b919c5fdf0da Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 10 Jan 2019 23:53:27 -0500 Subject: [PATCH 050/494] [syntax tree] added syntactic support for dereferencing --- examples/matrix.cpp | 22 ++++++++++++++-------- include/ast/ast.h | 32 +++++++++++++++++++++----------- include/ast/parser.y | 3 ++- include/ast/scanner.l | 1 + include/codegen/tune.h | 1 + include/ir/builder.h | 5 ++++- include/ir/instructions.h | 25 ++++++++++++++++++++----- lib/ast/lowering.cpp | 30 +++++++++++++++++++++--------- lib/codegen/tune.cpp | 13 +++++++------ lib/ir/builder.cpp | 10 +++++++++- lib/ir/constant.cpp | 8 +++++--- lib/ir/instructions.cpp | 23 +++++++++++++++++++++++ 12 files changed, 128 
insertions(+), 45 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index e2ea19527..e9d380f39 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -20,19 +20,25 @@ extern translation_unit *ast_root; const char src[] = "\ -void test(fp32 *A, fp32 *B, fp32 *C, int32 M, int32 N, int32 K){\ +void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K){\ int32 rx[16] = get_global_range[16](0);\ int32 ry[16] = get_global_range[16](1);\ - int32 rk[8] = 0 ... 8;\ - fp32 c[16, 16] = 0;\ + int32 rka[8] = 0 ... 8;\ + int32 rkb[8] = 0 ... 8;\ + fp32 C[16, 16] = 0;\ int32 k;\ - fp32* pa[16, 8] = A + rx[:, newaxis] + rk[newaxis, :]*M;\ - fp32* pb[16, 8] = B + ry[:, newaxis] + rk[newaxis, :]*K;\ + fp32* pa[16, 8] = a + rx[:, newaxis] + rka[newaxis, :]*M;\ + fp32* pb[16, 8] = b + ry[:, newaxis] + rkb[newaxis, :]*K;\ + fp32* pc[16, 16];\ for(k = 0; k < K; k = k + 8){\ - fp32 a[16, 8] = *pa;\ - fp32 b[16, 8] = *pb;\ - pa = pa + 8;\ + fp32 A[16, 8] = *pa;\ + fp32 B[16, 8] = *pb;\ + C = dot(A, B, C);\ + pa = pa + 8*M;\ + pb = pb + 8*K;\ }\ + pc = c + rx[:, newaxis] + ry[newaxis, :];\ + *pc = C;\ }\ "; diff --git a/include/ast/ast.h b/include/ast/ast.h index e3da73802..dea0af010 100644 --- a/include/ast/ast.h +++ b/include/ast/ast.h @@ -137,6 +137,19 @@ private: const constant* axis_; }; +class matmul_expression: public builtin_expression{ +public: + matmul_expression(node* A, node *B, node *C): + A_((expression*)A), B_((expression*)B), C_((expression*)C) { } + ir::value* codegen(ir::module *) const; + +private: + const expression *A_; + const expression *B_; + const expression *C_; +}; + + class indexing_expression: public postfix_expression{ public: indexing_expression(node *id, node *slices) @@ -149,21 +162,17 @@ private: const list* slices_; }; -class unary_expression: public expression{ + +class named_expression: public expression { public: - unary_expression(node *id): id_((const identifier*)id) {} - const identifier *id() const; + named_expression(node *id): id_((const identifier*)id) { } + const identifier *id() const { return id_; } + ir::value* codegen(ir::module * mod) const; private: const identifier *id_; }; -class named_expression: public unary_expression { -public: - named_expression(node *id): unary_expression(id){ } - ir::value* codegen(ir::module * mod) const; -}; - class binary_operator: public expression{ private: ir::value* llvm_op(ir::module *mod, ir::builder &bld, ir::value *lhs, ir::value *rhs, const std::string &name) const; @@ -220,6 +229,7 @@ public: : op_(op), arg_((expression*)arg) { } + UNARY_OP_T get_op() const { return op_; } ir::value* codegen(ir::module *mod) const; private: @@ -267,13 +277,13 @@ public: class assignment_expression: public expression{ public: assignment_expression(node *lvalue, ASSIGN_OP_T op, node *rvalue) - : lvalue_((unary_expression*)lvalue), op_(op), rvalue_((expression*)rvalue) { } + : lvalue_((named_expression*)lvalue), op_(op), rvalue_((expression*)rvalue) { } ir::value* codegen(ir::module *mod) const; public: ASSIGN_OP_T op_; - const unary_expression *lvalue_; + const expression *lvalue_; const expression *rvalue_; }; diff --git a/include/ast/parser.y b/include/ast/parser.y index 351556ff7..0b68443ce 100644 --- a/include/ast/parser.y +++ b/include/ast/parser.y @@ -50,7 +50,7 @@ TYPE_T get_type_spec(node *op) { return ((token*)op)->type; } %token VOID UINT8 UINT16 UINT32 UINT64 INT8 INT16 INT32 INT64 FP32 FP64 %token IF ELSE FOR %token NEWAXIS ELLIPSIS -%token GET_GLOBAL_RANGE +%token GET_GLOBAL_RANGE DOT %start 
translation_unit %% @@ -111,6 +111,7 @@ identifier builtin : GET_GLOBAL_RANGE '[' constant ']' '(' constant ')' { $$ = new get_global_range($3, $6); } + | DOT '(' expression ',' expression ',' expression ')' { $$ = new matmul_expression($3, $5, $7); } primary_expression : identifier { $$ = new named_expression($1); } diff --git a/include/ast/scanner.l b/include/ast/scanner.l index 55366859c..6b5ed66b0 100644 --- a/include/ast/scanner.l +++ b/include/ast/scanner.l @@ -33,6 +33,7 @@ int comment(); "fp64" { count(); return(FP64); } "..." { count(); return(ELLIPSIS); } "get_global_range" { count(); return GET_GLOBAL_RANGE; } +"dot" { count(); return DOT;} {L}({L}|{D})* { count(); return(check_type()); } diff --git a/include/codegen/tune.h b/include/codegen/tune.h index f7ce1b10d..5904fddcf 100644 --- a/include/codegen/tune.h +++ b/include/codegen/tune.h @@ -37,6 +37,7 @@ private: std::vector pool_; graph_t dependencies_; std::set nodes_; + std::map static_params_; }; diff --git a/include/ir/builder.h b/include/ir/builder.h index a96408f18..c84ef02d8 100644 --- a/include/ir/builder.h +++ b/include/ir/builder.h @@ -105,14 +105,17 @@ public: // Side effects value *create_fneg(value *arg, const std::string &name = ""); value *create_neg(value *arg, const std::string &name = ""); - value *create_load(value *arg, const std::string &name = ""); value *create_not(value *arg, const std::string &name = ""); + // Input/Output + value *create_load(value *arg, const std::string &name = ""); + value *create_store(value *ptr, value *val, const std::string &name = ""); // Tile instruction value *create_splat(value *arg, const std::vector &shapes, const std::string &name = ""); value *create_reshape(value *arg, const std::vector &shapes, const std::string &name = ""); value *create_broadcast(value *arg, const std::vector &shapes, const std::string &name = ""); // Built-in instruction value *create_get_global_range(unsigned axis, unsigned size, const std::string &name = ""); + value *create_matmul(value *A, value *B, value *C, const std::string &name = ""); private: context &ctx_; diff --git a/include/ir/instructions.h b/include/ir/instructions.h index e700bea04..fa7a0d6e3 100644 --- a/include/ir/instructions.h +++ b/include/ir/instructions.h @@ -291,7 +291,18 @@ public: // factory method static load_inst* create(value *ptr, const std::string &name = "", instruction *next = nullptr); +}; +class store_inst: public instruction{ +private: + store_inst(value *ptr, value *v, const std::string &name, instruction *next); + +public: + value *get_pointer_operand() { return get_operand(0); } + value *get_value_operand() { return get_operand(1); } + // factory method + static store_inst* create(value* ptr, value *v, const std::string &name = "", + instruction *next = nullptr); }; //===----------------------------------------------------------------------===// @@ -336,11 +347,6 @@ public: }; -// matmul -class matmul_inst: public instruction { - -}; - // built-in class builtin_inst: public instruction{ protected: @@ -359,6 +365,15 @@ private: unsigned axis_; }; +class matmul_inst: public builtin_inst { + matmul_inst(value *A, value *B, value *C, const std::string &name, instruction *next); + +public: + static instruction* create(value *A, value *B, value *C, + const std::string &name = "", + instruction *next = nullptr); +}; + } } diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 14a5249cc..fcce76729 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -318,9 +318,9 @@ ir::value* 
initializer::codegen(ir::module * mod) const{ std::string name = decl_->id()->name(); ir::value *value = ir::undef_value::get(ty); if(expr_){ - ir::value* target = expr_->codegen(mod); - explicit_cast(mod->get_builder(), target, ty->get_scalar_ty()); - implicit_broadcast(mod, value, target); + value = expr_->codegen(mod); + explicit_cast(mod->get_builder(), value, ty->get_scalar_ty()); + implicit_broadcast(mod, value, value); } value->set_name(name); mod->set_value(name, value); @@ -421,12 +421,23 @@ ir::value* binary_operator::codegen(ir::module *mod) const{ } /* Builtin expression */ + +// get_global_range ir::value* get_global_range::codegen(ir::module *mod) const { ir::builder &builder = mod->get_builder(); return builder.create_get_global_range(axis_->value(), size_->value()); } +ir::value* matmul_expression::codegen(ir::module *mod) const { + ir::value *A = A_->codegen(mod); + ir::value *B = B_->codegen(mod); + ir::value *C = C_->codegen(mod); + return mod->get_builder().create_matmul(A, B, C); +} + + + /* Postfix expression */ ir::value* indexing_expression::codegen(ir::module *mod) const{ ir::value *in = mod->get_value(id_->name()); @@ -497,7 +508,13 @@ ir::value *conditional_expression::codegen(ir::module *mod) const{ /* Assignment expression */ ir::value *assignment_expression::codegen(ir::module *mod) const{ ir::value *rvalue = rvalue_->codegen(mod); - mod->set_value(lvalue_->id()->name(), rvalue); + if(auto *x = dynamic_cast(lvalue_)) + mod->set_value(x->id()->name(), rvalue); + else if(auto* x = dynamic_cast(lvalue_)){ + assert(x->get_op()==DEREF); + ir::value *ptr = x->codegen(mod); + mod->get_builder().create_store(ptr, rvalue); + } return rvalue; } @@ -527,11 +544,6 @@ ir::value* constant_range::codegen(ir::module *mod) const{ (ir::constant*)last_->codegen(mod)); } -/* Unary expression */ -const identifier* unary_expression::id() const{ - return id_; -} - /* Named */ ir::value* named_expression::codegen(ir::module *mod) const{ const std::string &name = id()->name(); diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index fcc5930a5..deb2f858b 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -33,7 +33,9 @@ void tune::init_c_graph(ir::instruction *v) { ir::value *op = v->get_operand(0); unsigned current = 0; for(unsigned i = 0; i < shapes.size(); i ++) - if(shapes[i] > 1) + if(shapes[i] == 1) + static_params_.insert({{v, i}, 1}); + else add_constraint({v, i}, {op, current++}); } else if(dynamic_cast(v)){ @@ -70,10 +72,10 @@ void tune::connected_components(node_t x, const std::vector vals, st params_[instr].insert({"p1" + suffix, vals[1]}); params_[instr].insert({"p2" + suffix, vals[2]}); } - if(auto *cst = dynamic_cast(x.first)){ - *vals[0] = cst->get_value(); - *vals[1] = cst->get_value(); - *vals[2] = cst->get_value(); + if(static_params_.find(x) != static_params_.end()){ + *vals[0] = static_params_.at(x); + *vals[1] = static_params_.at(x); + *vals[2] = static_params_.at(x); } for(const node_t &y: graph[x]) connected_components(y, vals, nodes, graph); @@ -88,7 +90,6 @@ void tune::get_params(ir::module &mod, std::vector &result) { for(ir::instruction *i : block->get_inst_list()) for(auto &x: params_[i]) if(seen.insert(x.second).second && *x.second == 0){ - std::cout << typeid(*i).name() << " " << i << std::endl; result.push_back(x.second); } } diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index 1cfdeefa3..8d3f58792 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -227,7 +227,11 @@ DEFINE_FCMP_INSTR(ONE, llvm::FCmpInst::FCMP_ONE) 
//===----------------------------------------------------------------------===// value *builder::create_load(value *arg, const std::string &name){ - return load_inst::create(arg, name); + return insert(load_inst::create(arg, name)); +} + +value *builder::create_store(value *ptr, value *val, const std::string &name){ + return insert(store_inst::create(ptr, val, name)); } //===----------------------------------------------------------------------===// @@ -254,5 +258,9 @@ value *builder::create_get_global_range(unsigned axis, unsigned size, const std: return insert(get_global_range_inst::create(ctx_, axis, size, name)); } +value *builder::create_matmul(value *A, value *B, value *C, const std::string &name) { + return insert(matmul_inst::create(A, B, C, name)); +} + } } diff --git a/lib/ir/constant.cpp b/lib/ir/constant.cpp index f2779b75b..58f3b1ab7 100644 --- a/lib/ir/constant.cpp +++ b/lib/ir/constant.cpp @@ -61,9 +61,11 @@ constant_range::constant_range(type *ty, uint64_t first, uint64_t last) constant *constant_range::get(constant *first, constant *last) { assert(first->get_type()->is_integer_ty()); assert(first->get_type() == last->get_type()); - uint64_t vfirst = ((constant_int*)first)->get_value(); - uint64_t vlast = ((constant_int*)first)->get_value(); - return new constant_range(first->get_type(), vfirst, vlast); + unsigned vfirst = ((constant_int*)first)->get_value(); + unsigned vlast = ((constant_int*)last)->get_value(); + assert(vlast > vfirst); + type *ty = tile_type::get(first->get_type(), {vlast - vfirst}); + return new constant_range(ty, vfirst, vlast); } diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index eb2483132..0a62e9a6d 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -310,6 +310,17 @@ load_inst* load_inst::create(value *ptr, const std::string &name, instruction *n return new load_inst(ptr, name, next); } +// store +store_inst::store_inst(value *ptr, value *v, const std::string &name, instruction *next) + : instruction(type::get_void_ty(ptr->get_type()->get_context()), 2, name, next) { + set_operand(0, ptr); + set_operand(1, v); +} + +store_inst* store_inst::create(value *ptr, value *v, const std::string &name, instruction *next) { + return new store_inst(ptr, v, name, next); +} + //===----------------------------------------------------------------------===// // retile_inst classes //===----------------------------------------------------------------------===// @@ -345,6 +356,18 @@ instruction* broadcast_inst::create(value *arg, const std::vector &sha // matmul_inst classes //===----------------------------------------------------------------------===// +matmul_inst::matmul_inst(value *A, value *B, value *C, + const std::string &name, instruction *next) + : builtin_inst(C->get_type(), 3, name, next) { + set_operand(0, A); + set_operand(0, B); + set_operand(0, C); +} + +instruction *matmul_inst::create(value *A, value *B, value *C, + const std::string &name, instruction *next) { + return new matmul_inst(A, B, C, name, next); +} //===----------------------------------------------------------------------===// // builtin instructions From a0ecdba5a220682dd678158d7ef91122259e7fc0 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 12 Jan 2019 23:24:25 -0500 Subject: [PATCH 051/494] [code generation] testing analysis passes --- examples/matrix.cpp | 10 +++++++++- include/codegen/allocation.h | 5 +++-- include/codegen/liveness.h | 3 --- include/codegen/selection.h | 10 ++++++++++ include/ir/builder.h | 5 +++-- 
include/ir/instructions.h | 20 +++++++++++++++++++- lib/codegen/allocation.cpp | 4 ++-- lib/codegen/liveness.cpp | 5 +---- lib/codegen/selection.cpp | 9 ++++++++- lib/ir/builder.cpp | 23 ++++++++++++++++++++--- lib/ir/instructions.cpp | 12 ++++++++++-- lib/ir/type.cpp | 2 +- 12 files changed, 86 insertions(+), 22 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index e9d380f39..426294a63 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -5,6 +5,9 @@ #include "ir/module.h" #include "codegen/selection.h" #include "codegen/tune.h" +#include "codegen/shared_copy.h" +#include "codegen/allocation.h" +#include "codegen/liveness.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/Module.h" #include "llvm/IR/LLVMContext.h" @@ -53,9 +56,14 @@ int main() { llvm::LLVMContext llvm_context; llvm::Module llvm_module("test", llvm_context); // lowering passes - tdl::codegen::selection selection; + tdl::codegen::place_shared_copy shared; tdl::codegen::tune tune; + tdl::codegen::liveness liveness; + tdl::codegen::allocation allocation(&liveness); tune.run(module); + shared.run(module); + liveness.run(module); + allocation.run(); std::vector params; tune.get_params(module, params); std::cout << params.size() << std::endl; diff --git a/include/codegen/allocation.h b/include/codegen/allocation.h index b93322539..4b90cf46a 100644 --- a/include/codegen/allocation.h +++ b/include/codegen/allocation.h @@ -20,6 +20,9 @@ class loop_info; class allocation { public: + allocation(liveness *live) + : liveness_(live){ } + // accessors unsigned get_offset(ir::value *x) const { return offsets_.at(x); } unsigned get_allocated_size() const { return allocated_size_; } @@ -33,8 +36,6 @@ private: size_t allocated_size_; // dependences liveness *liveness_; - layout *layout_; - loop_info *loop_info_; }; } diff --git a/include/codegen/liveness.h b/include/codegen/liveness.h index c7cfb5959..11d377c62 100644 --- a/include/codegen/liveness.h +++ b/include/codegen/liveness.h @@ -13,8 +13,6 @@ namespace ir{ namespace codegen{ -class layout; - typedef unsigned slot_index; struct segment { @@ -54,7 +52,6 @@ private: has_storage_map_t has_dedicated_storage_; indices_map_t indices_; intervals_map_t intervals_; - layout* layouts_; }; } diff --git a/include/codegen/selection.h b/include/codegen/selection.h index b55725c05..a8d4aabce 100644 --- a/include/codegen/selection.h +++ b/include/codegen/selection.h @@ -20,6 +20,14 @@ namespace llvm{ namespace tdl{ namespace codegen{ +class allocation; + +struct distributed_axis { + +}; + + + class selection{ typedef std::map vmap_t; typedef std::map bmap_t; @@ -31,11 +39,13 @@ private: llvm::Constant* llvm_constant(ir::constant *cst, llvm::LLVMContext &ctx); public: + selection(allocation *alloc): alloc_(alloc){ } void run(ir::module &src, llvm::Module &dst); private: vmap_t vmap_; bmap_t bmap_; + allocation *alloc_; }; diff --git a/include/ir/builder.h b/include/ir/builder.h index c84ef02d8..438390940 100644 --- a/include/ir/builder.h +++ b/include/ir/builder.h @@ -27,6 +27,7 @@ public: builder(context &ctx); // Setters void set_insert_point(iterator instr); + void set_insert_point(instruction* i); void set_insert_point(basic_block* block); basic_block* get_insert_block() { return block_; } iterator get_insert_point() { return insert_point_;} @@ -42,7 +43,6 @@ public: block_->get_inst_list().insert(insert_point_, inst); inst->set_parent(block_); inst->set_name(name); - insert_point_ = block_->end(); return inst; } // terminator instructions @@ -116,7 +116,8 @@ 
public: // Built-in instruction value *create_get_global_range(unsigned axis, unsigned size, const std::string &name = ""); value *create_matmul(value *A, value *B, value *C, const std::string &name = ""); - + // Intrinsics + value *create_copy_to_shared(value *arg, const std::string &name = ""); private: context &ctx_; basic_block *block_; diff --git a/include/ir/instructions.h b/include/ir/instructions.h index fa7a0d6e3..0b3295658 100644 --- a/include/ir/instructions.h +++ b/include/ir/instructions.h @@ -347,7 +347,10 @@ public: }; -// built-in +//===----------------------------------------------------------------------===// +// builtin_inst classes +//===----------------------------------------------------------------------===// + class builtin_inst: public instruction{ protected: using instruction::instruction; @@ -374,6 +377,21 @@ public: instruction *next = nullptr); }; + +//===----------------------------------------------------------------------===// +// intrinsics classes +//===----------------------------------------------------------------------===// + + +class copy_to_shared_inst: public unary_inst{ + using unary_inst::unary_inst; + +public: + static copy_to_shared_inst* create(value *arg, const std::string &name = "", + instruction *next = nullptr); +}; + + } } diff --git a/lib/codegen/allocation.cpp b/lib/codegen/allocation.cpp index 1730371ae..7a5154280 100644 --- a/lib/codegen/allocation.cpp +++ b/lib/codegen/allocation.cpp @@ -10,7 +10,6 @@ namespace tdl{ namespace codegen{ - void allocation::run(){ using std::max; using std::min; @@ -108,8 +107,9 @@ void allocation::run(){ // Save maximum size of induced memory space allocated_size_ = 0; - for(auto &x: offsets_) + for(auto &x: offsets_){ allocated_size_ = std::max(allocated_size_, x.second + get_num_bytes(x.first)); + } } } diff --git a/lib/codegen/liveness.cpp b/lib/codegen/liveness.cpp index 824c95590..bf4c99be2 100644 --- a/lib/codegen/liveness.cpp +++ b/lib/codegen/liveness.cpp @@ -1,5 +1,4 @@ #include "codegen/liveness.h" -#include "codegen/layout.h" #include "ir/basic_block.h" #include "ir/function.h" #include "ir/module.h" @@ -24,9 +23,7 @@ for(ir::function *fn: mod.get_function_list()){ // Creates live intervals for(auto i: indices_){ ir::value *v = i.first; - if(!layouts_->get_num_shared_views(v)) - continue; - if(!layouts_->get_shared_view(v, 0).has_dedicated_storage) + if(!dynamic_cast(v)) continue; unsigned start = i.second; unsigned end = start; diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index edf48262c..4ce0d1b38 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -63,7 +63,7 @@ Constant *selection::llvm_constant(ir::constant *cst, LLVMContext &ctx) { } -/* convert ir::instruction to Instruction */ +/* convert ir::instruction to llvm::Instruction */ Instruction *selection::llvm_inst(ir::instruction *inst, LLVMContext & ctx) { auto value = [&](ir::value *x) { return llvm_value(x, ctx); }; auto block = [&](ir::basic_block *x) { return bmap_.at(x); }; @@ -125,7 +125,9 @@ Instruction *selection::llvm_inst(ir::instruction *inst, LLVMContext & ctx) { throw std::runtime_error("unknown conversion from ir::type to Type"); } +/* convert ir::value to llvm::Value */ Value* selection::llvm_value(ir::value *v, LLVMContext &ctx) { + assert(!v->get_type()->is_tile_ty()); if(vmap_.find(v) != vmap_.end()) return vmap_.at(v); // create operands @@ -141,6 +143,11 @@ Value* selection::llvm_value(ir::value *v, LLVMContext &ctx) { throw std::runtime_error("unknown conversion from 
ir::value to Value"); } +/* lower tile to a set of llvm::Value's */ +//void selection::lower_tile(ir::value *v) { + +//} + void selection::run(ir::module &src, Module &dst){ vmap_.clear(); bmap_.clear(); diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index 8d3f58792..96e52c1d8 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -3,6 +3,7 @@ #include "ir/builder.h" #include "ir/constant.h" #include "ir/instructions.h" +#include "ir/intrinsics.h" #include "ir/type.h" #include "llvm/IR/Instruction.h" @@ -16,11 +17,18 @@ builder::builder(context &ctx): // utilities //===----------------------------------------------------------------------===// -void builder::set_insert_point(basic_block::iterator instr){ - block_ = (*instr)->get_parent(); - insert_point_ = instr; +void builder::set_insert_point(basic_block::iterator it){ + block_ = (*it)->get_parent(); + insert_point_ = it; } +void builder::set_insert_point(instruction* i){ + block_ = i->get_parent(); + auto it = std::find(block_->begin(), block_->end(), i); + set_insert_point(it); +} + + void builder::set_insert_point(basic_block *block){ block_ = block; insert_point_ = block->end(); @@ -262,5 +270,14 @@ value *builder::create_matmul(value *A, value *B, value *C, const std::string &n return insert(matmul_inst::create(A, B, C, name)); } +//===----------------------------------------------------------------------===// +// intrinsic instructions +//===----------------------------------------------------------------------===// + + +value *builder::create_copy_to_shared(value *arg, const std::string &name) { + return insert(copy_to_shared_inst::create(arg, name)); +} + } } diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 0a62e9a6d..679ee5bb2 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -360,8 +360,8 @@ matmul_inst::matmul_inst(value *A, value *B, value *C, const std::string &name, instruction *next) : builtin_inst(C->get_type(), 3, name, next) { set_operand(0, A); - set_operand(0, B); - set_operand(0, C); + set_operand(1, B); + set_operand(2, C); } instruction *matmul_inst::create(value *A, value *B, value *C, @@ -385,5 +385,13 @@ instruction* get_global_range_inst::create(context &ctx, unsigned axis, unsigned return new get_global_range_inst(tile_ty, axis, name, next); } +//===----------------------------------------------------------------------===// +// intrinsic instructions +//===----------------------------------------------------------------------===// +copy_to_shared_inst* copy_to_shared_inst::create(value *arg, const std::string &name, + instruction *next) { + return new copy_to_shared_inst(arg->get_type(), arg, name, next); +} + } } diff --git a/lib/ir/type.cpp b/lib/ir/type.cpp index 98ec78508..075bcd88b 100644 --- a/lib/ir/type.cpp +++ b/lib/ir/type.cpp @@ -160,7 +160,7 @@ bool tile_type::is_valid_elt_ty(type *ty) { unsigned tile_type::get_num_elements() const { unsigned res = 1; for(unsigned shape: shapes_) - shape *= res; + res *= shape; return res; } From 7eebdceb6a48032e3b917173b63ed113a6befa90 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 23 Jan 2019 00:11:42 -0500 Subject: [PATCH 052/494] [code generation] fixed bug in on-the-fly AST to IR lowering --- examples/matrix.cpp | 47 +++++++---- include/ast/ast.h | 1 + include/codegen/selection.h | 47 ++++++++++- include/codegen/tune.h | 7 +- include/ir/instructions.h | 42 +++++----- lib/ast/lowering.cpp | 93 ++++++++++++---------- lib/codegen/selection.cpp | 155 ++++++++++++++++++++++++++++++++++-- 
lib/codegen/tune.cpp | 43 +++++++--- lib/ir/builder.cpp | 1 - lib/ir/module.cpp | 10 ++- 10 files changed, 344 insertions(+), 102 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index e9d380f39..bc37b81ae 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -24,21 +24,16 @@ extern translation_unit *ast_root; const char src[] = "\ void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K){\ - int32 rx[16] = get_global_range[16](0);\ - int32 ry[16] = get_global_range[16](1);\ + int32 rx[32] = get_global_range[32](0);\ + int32 ry[32] = get_global_range[32](1);\ int32 rka[8] = 0 ... 8;\ int32 rkb[8] = 0 ... 8;\ - fp32 C[16, 16] = 0;\ + fp32 C[32, 32] = 0;\ int32 k;\ - fp32* pa[16, 8] = a + rx[:, newaxis] + rka[newaxis, :]*M;\ - fp32* pb[16, 8] = b + ry[:, newaxis] + rkb[newaxis, :]*K;\ - fp32* pc[16, 16];\ + fp32* pa[32, 8] = a + rx[:, newaxis] + rka[newaxis, :]*M;\ + fp32* pb[32, 8] = b + ry[:, newaxis] + rkb[newaxis, :]*K;\ + fp32* pc[32, 32];\ for(k = 0; k < K; k = k + 8){\ - fp32 A[16, 8] = *pa;\ - fp32 B[16, 8] = *pb;\ - C = dot(A, B, C);\ - pa = pa + 8*M;\ - pb = pb + 8*K;\ }\ pc = c + rx[:, newaxis] + ry[newaxis, :];\ *pc = C;\ @@ -60,13 +55,37 @@ int main() { tdl::codegen::tune tune; tdl::codegen::liveness liveness; tdl::codegen::allocation allocation(&liveness); + tdl::codegen::selection selection(&allocation, &tune); tune.run(module); + std::vector<unsigned> params = { + // asm + 2, 16, 1, + // bsn + 2, 16, 1, + // pa + 1, 2, 4, + // pb + 1, 2, 4, + // c + 2, 16, 1, 1, 2, 4 + }; + std::map<ir::value*, std::vector<std::string>> errors; + unsigned i = 0; + std::cout << tune.get_params(module).size() << std::endl; + for(unsigned *x: tune.get_params(module)) + *x = params[i++]; + tune.check_constraints(module, errors); +// std::cout << "errors: " << errors.size() << std::endl; +// for(auto &x: errors){ +// for(auto &e: x.second) +// std::cout << e << std::endl; +// } shared.run(module); liveness.run(module); allocation.run(); - std::vector<unsigned*> params; - tune.get_params(module, params); - std::cout << params.size() << std::endl; + selection.run(module, llvm_module); +// std::vector<unsigned*> params = tune.get_params(module); +// std::cout << params.size() << std::endl; // selection.run(module, llvm_module); // // print LLVM program // llvm::PrintModulePass print(llvm::outs()); diff --git a/include/ast/ast.h b/include/ast/ast.h index dea0af010..fed9d6556 100644 --- a/include/ast/ast.h +++ b/include/ast/ast.h @@ -65,6 +65,7 @@ class node { protected: static ir::value* explicit_cast(ir::builder &builder, ir::value *src, ir::type *dst_ty); static void implicit_broadcast(ir::module *mod, ir::value *&lhs, ir::value *&rhs); + static void implicit_broadcast(ir::module *mod, ir::value *&arg, ir::type *ty); static void implicit_cast(ir::builder &builder, ir::value *&lhs, ir::value *&rhs, bool &is_float, bool &is_ptr, bool &is_int, bool &is_signed); public: diff --git a/include/codegen/selection.h b/include/codegen/selection.h index a8d4aabce..92c9f79b5 100644 --- a/include/codegen/selection.h +++ b/include/codegen/selection.h @@ -21,32 +21,75 @@ namespace tdl{ namespace codegen{ class allocation; +class tune; struct distributed_axis { + std::vector<llvm::Value*> values; +}; + +class tile { +protected: + typedef std::vector<unsigned> shapes_t; + +public: + tile(const shapes_t &shapes): shapes_(shapes){ } + +private: + shapes_t shapes_; +}; + +class shared_tile: public tile { +public: + using tile::tile; +}; +class distributed_tile: public tile{ + typedef std::vector<distributed_axis> axes_t; + +public: + distributed_tile(const shapes_t& shapes, const axes_t &axes) + : 
tile(shapes), axes_(axes) {} + +private: + axes_t axes_; +}; class selection{ typedef std::map<ir::value*, llvm::Value*> vmap_t; typedef std::map<ir::basic_block*, llvm::BasicBlock*> bmap_t; + typedef std::map<ir::value*, tile*> tmap_t; private: + // LLVM conversions llvm::Type* llvm_type(ir::type *ty, llvm::LLVMContext &ctx); llvm::Value* llvm_value(ir::value *v, llvm::LLVMContext &ctx); llvm::Instruction* llvm_inst(ir::instruction *inst, llvm::LLVMContext &ctx); llvm::Constant* llvm_constant(ir::constant *cst, llvm::LLVMContext &ctx); + // grid construction + void create_grids(std::vector<ir::instruction*> &grids, + std::map<unsigned*, ir::instruction*> &references, + ir::function *fn); + void init_axes(ir::instruction *i, llvm::IRBuilder<> &builder, llvm::Value *u_thread_id, llvm::Value *u_warp_id); + void init_grids(ir::function *fn, llvm::IRBuilder<> &builder); + + // lowering + void lower_instruction(ir::instruction *src, llvm::IRBuilder<> &builder); + void lower_tile_instruction(ir::instruction *src, llvm::IRBuilder<> &builder); + public: - selection(allocation *alloc): alloc_(alloc){ } + selection(allocation *alloc, tune *params): alloc_(alloc), params_(params){ } void run(ir::module &src, llvm::Module &dst); private: vmap_t vmap_; bmap_t bmap_; + tmap_t tmap_; allocation *alloc_; - + tune *params_; + std::map<ir::instruction*, std::vector<distributed_axis>> axes_; }; } diff --git a/include/codegen/tune.h b/include/codegen/tune.h index 5904fddcf..d1fc67549 100644 --- a/include/codegen/tune.h +++ b/include/codegen/tune.h @@ -11,6 +11,7 @@ namespace ir{ class value; class module; class instruction; + class function; } namespace codegen{ @@ -24,11 +25,13 @@ private: void init_c_phi(ir::instruction *i); void init_c_graph(ir::instruction *v); void connected_components(node_t x, const std::vector<unsigned*> vals, std::set<node_t> &nodes, graph_t &graph); + void create_grids(std::vector<ir::instruction*> &grids, std::map<unsigned*, ir::instruction*> &references, ir::function *fn); public: - void get_params(ir::module& mod, std::vector<unsigned*> &result); - unsigned *get_param(ir::value *value); + std::vector<unsigned*> get_params(ir::module& mod); + std::map<std::string, unsigned*> get_params(ir::instruction* i); + unsigned *get_param(ir::value *value, const std::string &key) { return params_[value][key]; } bool check_constraints(ir::module &fn, std::map<ir::value*, std::vector<std::string>> &errors); void run(ir::module &mod); diff --git a/include/ir/instructions.h b/include/ir/instructions.h index 0b3295658..09c129160 100644 --- a/include/ir/instructions.h +++ b/include/ir/instructions.h @@ -149,12 +149,15 @@ protected: //===----------------------------------------------------------------------===// class cast_inst: public unary_inst{ - using unary_inst::unary_inst; using ic = llvm::Instruction::CastOps; public: typedef llvm::CastInst::CastOps op_t; +protected: + cast_inst(type *ty, value *v, const std::string &name, instruction *next, op_t op) + : unary_inst(ty, v, name, next), op_(op) { } + private: static bool is_valid(op_t op, value *arg, type *ty); @@ -172,25 +175,26 @@ private: op_t op_; }; -#define TDL_IR_DECLARE_CAST_INST_SIMPLE(name) \ - class name : public cast_inst{ \ - friend class cast_inst; \ - using cast_inst::cast_inst; \ - }; +#define TDL_IR_DECLARE_CAST_INST_SIMPLE(name, op) \ +class name : public cast_inst{ \ + friend class cast_inst; \ + name(type *ty, value *v, const std::string &name, instruction *next) \ + : cast_inst(ty, v, name, next, op){ } \ +}; -TDL_IR_DECLARE_CAST_INST_SIMPLE(trunc_inst) -TDL_IR_DECLARE_CAST_INST_SIMPLE(z_ext_inst) -TDL_IR_DECLARE_CAST_INST_SIMPLE(s_ext_inst) -TDL_IR_DECLARE_CAST_INST_SIMPLE(fp_trunc_inst) -TDL_IR_DECLARE_CAST_INST_SIMPLE(fp_ext_inst) -TDL_IR_DECLARE_CAST_INST_SIMPLE(ui_to_fp_inst) -TDL_IR_DECLARE_CAST_INST_SIMPLE(si_to_fp_inst) 
-TDL_IR_DECLARE_CAST_INST_SIMPLE(fp_to_ui_inst) -TDL_IR_DECLARE_CAST_INST_SIMPLE(fp_to_si_inst) -TDL_IR_DECLARE_CAST_INST_SIMPLE(ptr_to_int_inst) -TDL_IR_DECLARE_CAST_INST_SIMPLE(int_to_ptr_inst) -TDL_IR_DECLARE_CAST_INST_SIMPLE(bit_cast_inst) -TDL_IR_DECLARE_CAST_INST_SIMPLE(addr_space_cast_inst) +TDL_IR_DECLARE_CAST_INST_SIMPLE(trunc_inst, llvm::Instruction::CastOps::Trunc) +TDL_IR_DECLARE_CAST_INST_SIMPLE(z_ext_inst, llvm::Instruction::CastOps::ZExt) +TDL_IR_DECLARE_CAST_INST_SIMPLE(s_ext_inst, llvm::Instruction::CastOps::SExt) +TDL_IR_DECLARE_CAST_INST_SIMPLE(fp_trunc_inst, llvm::Instruction::CastOps::FPTrunc) +TDL_IR_DECLARE_CAST_INST_SIMPLE(fp_ext_inst, llvm::Instruction::CastOps::FPExt) +TDL_IR_DECLARE_CAST_INST_SIMPLE(ui_to_fp_inst, llvm::Instruction::CastOps::UIToFP) +TDL_IR_DECLARE_CAST_INST_SIMPLE(si_to_fp_inst, llvm::Instruction::CastOps::SIToFP) +TDL_IR_DECLARE_CAST_INST_SIMPLE(fp_to_ui_inst, llvm::Instruction::CastOps::FPToUI) +TDL_IR_DECLARE_CAST_INST_SIMPLE(fp_to_si_inst, llvm::Instruction::CastOps::FPToSI) +TDL_IR_DECLARE_CAST_INST_SIMPLE(ptr_to_int_inst, llvm::Instruction::CastOps::PtrToInt) +TDL_IR_DECLARE_CAST_INST_SIMPLE(int_to_ptr_inst, llvm::Instruction::CastOps::IntToPtr) +TDL_IR_DECLARE_CAST_INST_SIMPLE(bit_cast_inst, llvm::Instruction::CastOps::BitCast) +TDL_IR_DECLARE_CAST_INST_SIMPLE(addr_space_cast_inst, llvm::Instruction::CastOps::AddrSpaceCast) //===----------------------------------------------------------------------===// // terminator_inst classes diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index fcce76729..490dcdba1 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -90,6 +90,11 @@ void node::implicit_cast(ir::builder &builder, ir::value *&lhs, ir::value *&rhs, throw std::runtime_error("unreachable"); } +void node::implicit_broadcast(ir::module *mod, ir::value *&arg, ir::type *ty) { + ir::value *tmp = ir::undef_value::get(ty); + implicit_broadcast(mod, arg, tmp); +} + void node::implicit_broadcast(ir::module *mod, ir::value *&lhs, ir::value *&rhs){ ir::builder &builder = mod->get_builder(); ir::type *lhs_ty = lhs->get_type(); @@ -320,7 +325,7 @@ ir::value* initializer::codegen(ir::module * mod) const{ if(expr_){ value = expr_->codegen(mod); explicit_cast(mod->get_builder(), value, ty->get_scalar_ty()); - implicit_broadcast(mod, value, value); + implicit_broadcast(mod, value, ty); } value->set_name(name); mod->set_value(name, value); @@ -331,85 +336,85 @@ ir::value* initializer::codegen(ir::module * mod) const{ /* Expression */ /*------------------*/ /* Binary operator */ -ir::value *binary_operator::llvm_op(ir::module *mod, ir::builder &builder, ir::value *lhs, ir::value *rhs, const std::string &name) const +ir::value *binary_operator::llvm_op(ir::module *mod, ir::builder &builder, ir::value *arg, ir::value *rhs, const std::string &name) const { bool is_float = false, is_ptr = false, is_int = false, is_signed = false; - implicit_cast(builder, lhs, rhs, is_float, is_ptr, is_int, is_signed); - implicit_broadcast(mod, lhs, rhs); + implicit_cast(builder, arg, rhs, is_float, is_ptr, is_int, is_signed); + implicit_broadcast(mod, arg, rhs); if(op_==MUL && is_float) - return builder.create_fmul(lhs, rhs, name); + return builder.create_fmul(arg, rhs, name); if(op_==MUL && is_int) - return builder.create_mul(lhs, rhs, name); + return builder.create_mul(arg, rhs, name); if(op_==DIV && is_float) - return builder.create_fdiv(lhs, rhs, name); + return builder.create_fdiv(arg, rhs, name); if(op_==DIV && is_int && is_signed) - return 
builder.create_sdiv(lhs, rhs, name); + return builder.create_sdiv(arg, rhs, name); if(op_==DIV && is_int && !is_signed) - return builder.create_udiv(lhs, rhs, name); + return builder.create_udiv(arg, rhs, name); if(op_==MOD && is_float) - return builder.create_frem(lhs, rhs, name); + return builder.create_frem(arg, rhs, name); if(op_==MOD && is_int && is_signed) - return builder.create_srem(lhs, rhs, name); + return builder.create_srem(arg, rhs, name); if(op_==MOD && is_int && !is_signed) - return builder.create_urem(lhs, rhs, name); + return builder.create_urem(arg, rhs, name); if(op_==ADD && is_float) - return builder.create_fadd(lhs, rhs, name); + return builder.create_fadd(arg, rhs, name); if(op_==ADD && is_int) - return builder.create_add(lhs, rhs); + return builder.create_add(arg, rhs); if(op_==ADD && is_ptr) - return builder.create_gep(lhs, {rhs}); + return builder.create_gep(arg, {rhs}); if(op_==SUB && is_float) - return builder.create_fsub(lhs, rhs, name); + return builder.create_fsub(arg, rhs, name); if(op_==SUB && is_int) - return builder.create_sub(lhs, rhs, name); + return builder.create_sub(arg, rhs, name); if(op_==SUB && is_ptr) - return builder.create_gep(lhs, {builder.create_neg(rhs)}); + return builder.create_gep(arg, {builder.create_neg(rhs)}); if(op_==LEFT_SHIFT) - return builder.create_shl(lhs, rhs, name); + return builder.create_shl(arg, rhs, name); if(op_==RIGHT_SHIFT) - return builder.create_ashr(lhs, rhs, name); + return builder.create_ashr(arg, rhs, name); if(op_ == LT && is_float) - return builder.create_fcmpOLT(lhs, rhs, name); + return builder.create_fcmpOLT(arg, rhs, name); if(op_ == LT && is_int && is_signed) - return builder.create_icmpSLT(lhs, rhs, name); + return builder.create_icmpSLT(arg, rhs, name); if(op_ == LT && is_int && !is_signed) - return builder.create_icmpULT(lhs, rhs, name); + return builder.create_icmpULT(arg, rhs, name); if(op_ == GT && is_float) - return builder.create_fcmpOGT(lhs, rhs, name); + return builder.create_fcmpOGT(arg, rhs, name); if(op_ == GT && is_int && is_signed) - return builder.create_icmpSGT(lhs, rhs, name); + return builder.create_icmpSGT(arg, rhs, name); if(op_ == GT && is_int && !is_signed) - return builder.create_icmpUGT(lhs, rhs, name); + return builder.create_icmpUGT(arg, rhs, name); if(op_ == LE && is_float) - return builder.create_fcmpOLE(lhs, rhs, name); + return builder.create_fcmpOLE(arg, rhs, name); if(op_ == LE && is_int && is_signed) - return builder.create_icmpSLE(lhs, rhs, name); + return builder.create_icmpSLE(arg, rhs, name); if(op_ == LE && is_int && !is_signed) - return builder.create_icmpULE(lhs, rhs, name); + return builder.create_icmpULE(arg, rhs, name); if(op_ == GE && is_float) - return builder.create_fcmpOGE(lhs, rhs, name); + return builder.create_fcmpOGE(arg, rhs, name); if(op_ == GE && is_int && is_signed) - return builder.create_icmpSGE(lhs, rhs, name); + return builder.create_icmpSGE(arg, rhs, name); if(op_ == GE && is_int && !is_signed) - return builder.create_icmpUGE(lhs, rhs, name); + return builder.create_icmpUGE(arg, rhs, name); if(op_ == EQ && is_float) - return builder.create_fcmpOEQ(lhs, rhs, name); + return builder.create_fcmpOEQ(arg, rhs, name); if(op_ == EQ && is_int) - return builder.create_icmpEQ(lhs, rhs, name); + return builder.create_icmpEQ(arg, rhs, name); if(op_ == NE && is_float) - return builder.create_fcmpONE(lhs, rhs, name); + return builder.create_fcmpONE(arg, rhs, name); if(op_ == NE && is_int) - return builder.create_icmpNE(lhs, rhs, name); + return 
builder.create_icmpNE(arg, rhs, name); if(op_ == AND) - return builder.create_and(lhs, rhs, name); + return builder.create_and(arg, rhs, name); if(op_ == XOR) - return builder.create_xor(lhs, rhs, name); + return builder.create_xor(arg, rhs, name); if(op_ == OR) - return builder.create_or(lhs, rhs, name); + return builder.create_or(arg, rhs, name); if(op_ == LAND) - return builder.create_and(lhs, rhs, name); + return builder.create_and(arg, rhs, name); if(op_ == LOR) - return builder.create_or(lhs, rhs, name); + return builder.create_or(arg, rhs, name); throw std::runtime_error("unreachable"); } @@ -433,6 +438,12 @@ ir::value* matmul_expression::codegen(ir::module *mod) const { ir::value *A = A_->codegen(mod); ir::value *B = B_->codegen(mod); ir::value *C = C_->codegen(mod); +// unsigned M = A->get_type()->get_tile_shapes()[0]; +// unsigned N = B->get_type()->get_tile_shapes()[1]; +// ir::type *scalar_ty = A->get_type()->get_scalar_ty(); +// ir::type *tile_ty = ir::tile_type::get(scalar_ty, {M, N}); +// ir::value *tmp = ir::undef_value::get(tile_ty); +// implicit_broadcast(mod, tmp, C); return mod->get_builder().create_matmul(A, B, C); } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 4ce0d1b38..4092be811 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -1,4 +1,5 @@ #include "codegen/selection.h" +#include "codegen/tune.h" #include "llvm/IR/Module.h" #include "llvm/IR/IRBuilder.h" #include "ir/context.h" @@ -143,10 +144,148 @@ Value* selection::llvm_value(ir::value *v, LLVMContext &ctx) { throw std::runtime_error("unknown conversion from ir::value to Value"); } -/* lower tile to a set of llvm::Value's */ -//void selection::lower_tile(ir::value *v) { +// Grid construction +std::vector delinearize(Value *trailing, std::vector &shapes, IRBuilder<> &builder){ + size_t dim = shapes.size(); + std::vector result(dim); + for(unsigned k = 0; k < dim - 1; k++){ + Constant *dim_k = builder.getInt32(shapes[k]); + Value *rem = builder.CreateURem(trailing, dim_k); + trailing = builder.CreateUDiv(trailing, dim_k); + result[k] = rem; + } + result[dim - 1] = trailing; + return result; +} -//} +void selection::init_axes(ir::instruction *instr, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { + const auto& shapes = instr->get_type()->get_tile_shapes(); + size_t dim = shapes.size(); + std::vector contiguous(dim); + std::vector warp_size(dim); + std::vector n_warps(dim); + for(unsigned i = 0; i < shapes.size(); i++){ + std::string str_i = std::to_string(i); + contiguous[i] = *params_->get_param(instr, "p0.d" + str_i); + warp_size[i] = *params_->get_param(instr, "p1.d" + str_i); + n_warps[i] = *params_->get_param(instr, "p2.d" + str_i); + } + std::vector thread_id_in_warp = delinearize(u_thread_id, warp_size, builder); + std::vector warp_id = delinearize(u_warp_id, n_warps, builder); + // Create axes + std::vector axes(dim); + for(unsigned k = 0; k < dim; k++) { + Value *warp_size_k = builder.getInt32(warp_size[k]); + Value *contiguous_k = builder.getInt32(contiguous[k]); + Value *thread_id = builder.CreateAdd(thread_id_in_warp[k], builder.CreateMul(warp_id[k], warp_size_k)); + thread_id = builder.CreateMul(thread_id, contiguous_k); + unsigned per_block = contiguous[k] * warp_size[k] * n_warps[k]; + unsigned per_thread = contiguous[k] * shapes[k] / per_block; + std::vector idx_list(per_thread); + for(unsigned n = 0 ; n < per_thread; n++){ + unsigned offset = n / contiguous[k] * per_block + n % contiguous[k]; + idx_list[n] = 
builder.CreateAdd(thread_id, builder.getInt32(offset)); + } + axes[k] = {idx_list}; + } + // Store axes + axes_[instr] = axes; +} + +void selection::create_grids(std::vector &grids, + std::map &references, + ir::function *fn) { + // get number of dimensions greater than 1 + auto get_tile_gt1_dim = [&](ir::value *v){ + unsigned result = 0; + for(unsigned shape: v->get_type()->get_tile_shapes()) { + result += (shape > 1)?shape:0; + } + return result; + }; + // bind references + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *i: block->get_inst_list()){ + if(!i->get_type()->is_tile_ty()) + continue; + const auto& shapes = i->get_type()->get_tile_shapes(); + bool is_shared = dynamic_cast(i); + if(is_shared) + continue; + for(size_t d = 0; d < shapes.size(); d++){ + if(shapes[d] == 1) + continue; + unsigned *x = params_->get_param(i, "p0.d" + std::to_string(d)); + ir::instruction *&r = references[x]; + if(!r || get_tile_gt1_dim(i) > get_tile_gt1_dim(r)) + r = i; + } + } + // create grid + for(auto &ref: references) + if(std::find(grids.begin(), grids.end(), ref.second) == grids.end()) + grids.push_back(ref.second); +} + +void selection::init_grids(ir::function *fn, IRBuilder<> &builder){ + // fetch linear ID + Module *mod = builder.GetInsertBlock()->getParent()->getParent(); + Function *get_thread_id = Intrinsic::getDeclaration(mod, Intrinsic::nvvm_read_ptx_sreg_tid_x); + Value *warp_size = builder.getInt32(32); + Value *u_thread_id = builder.CreateCall(get_thread_id, {}); + Value *u_thread_warp_id = builder.CreateURem(u_thread_id, warp_size); + Value *u_warp_id = builder.CreateUDiv(u_thread_id, warp_size); + // create grid + std::vector grids; + std::map references; + create_grids(grids, references, fn); + for(ir::instruction* i: grids) + init_axes(i, builder, u_thread_warp_id, u_warp_id); + // create tile + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *i: block->get_inst_list()){ + if(!i->get_type()->is_tile_ty()) + continue; + bool is_shared = dynamic_cast(i); + const auto& shapes = i->get_type()->get_tile_shapes(); + // create shared tile + if(is_shared){ + tmap_.insert({i, new shared_tile(shapes)}); + } + // create distributed tile + else { + const auto &shapes = i->get_type()->get_tile_shapes(); + std::vector axes(shapes.size()); + for(size_t d = 0; d < shapes.size(); d++){ + if(shapes[d] > 1){ + unsigned *x = params_->get_param(i, "p0.d" + std::to_string(d)); + axes[d] = axes_.at(references.at(x))[d]; + } + else + axes[d].values = {builder.getInt32(0)}; + } + tmap_.insert({i, new distributed_tile(shapes, axes)}); + } + } +} + +void selection::lower_tile_instruction(ir::instruction *src, llvm::IRBuilder<> &builder) { + +} + +void selection::lower_instruction(ir::instruction *src, IRBuilder<> &builder) { + LLVMContext &ctx = builder.getContext(); + std::cout << typeid(*src).name() << " " << src->get_type()->get_type_id() << std::endl; + if(src->get_type()->is_tile_ty()) { + std::cout << "tile instruction" << std::endl; + lower_tile_instruction(src, builder); + } + else { + Instruction *i = llvm_inst(src, ctx); + vmap_[src] = i; + builder.Insert(i); + } +} void selection::run(ir::module &src, Module &dst){ vmap_.clear(); @@ -166,14 +305,14 @@ void selection::run(ir::module &src, Module &dst){ BasicBlock *dst_block = BasicBlock::Create(dst_ctx, block->get_name(), dst_fn); bmap_[block] = dst_block; } + // create grids + dst_builder.SetInsertPoint(bmap_[fn->blocks()[0]]); + init_grids(fn, dst_builder); // iterate through block for(ir::basic_block *block: 
fn->blocks()) { dst_builder.SetInsertPoint(bmap_[block]); - for(ir::instruction *inst: block->get_inst_list()) { - Instruction *dst_inst = llvm_inst(inst, dst_ctx); - vmap_[inst] = dst_inst; - dst_builder.Insert(dst_inst); - } + for(ir::instruction *i: block->get_inst_list()) + lower_instruction(i, dst_builder); } // add phi operands for(ir::basic_block *block: fn->blocks()) diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index deb2f858b..7d0a673a2 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -23,6 +23,7 @@ void tune::init_c_phi(ir::instruction *v) { for(unsigned k = 0; k < phi->get_type()->get_tile_shapes().size(); k++) if(dependencies_.find({op, k}) != dependencies_.end() || dependencies_.find({phi, k}) != dependencies_.end()){ + std::cout << typeid(*op).name() << std::endl; add_constraint({phi, k}, {op, k}); } } @@ -32,11 +33,12 @@ void tune::init_c_graph(ir::instruction *v) { if(dynamic_cast(v)){ ir::value *op = v->get_operand(0); unsigned current = 0; - for(unsigned i = 0; i < shapes.size(); i ++) + for(unsigned i = 0; i < shapes.size(); i ++){ if(shapes[i] == 1) static_params_.insert({{v, i}, 1}); else add_constraint({v, i}, {op, current++}); + } } else if(dynamic_cast(v)){ @@ -58,8 +60,9 @@ void tune::init_c_graph(ir::instruction *v) { } else if(dynamic_cast(v)){ for(unsigned i = 0; i < shapes.size(); i ++) - for(ir::value* op: v->ops()) + for(ir::value* op: v->ops()){ add_constraint({v, i}, {op, i}); + } } } @@ -82,8 +85,8 @@ void tune::connected_components(node_t x, const std::vector vals, st } } -void tune::get_params(ir::module &mod, std::vector &result) { - result.clear(); +std::vector tune::get_params(ir::module &mod) { + std::vector result; std::set seen; for(ir::function *fn: mod.get_function_list()) for(ir::basic_block *block: fn->blocks()) @@ -92,6 +95,11 @@ void tune::get_params(ir::module &mod, std::vector &result) { if(seen.insert(x.second).second && *x.second == 0){ result.push_back(x.second); } + return result; +} + +std::map tune::get_params(ir::instruction* i) { + return params_.at(i); } void tune::run(ir::module &mod) { @@ -117,9 +125,10 @@ void tune::run(ir::module &mod) { } } -bool tune::check_constraints(ir::module &mod, std::map> &errors) { -for(ir::function *fn: mod.get_function_list()){ - /* grids */ +void tune::create_grids(std::vector &grids, + std::map &references, + ir::function *fn) { + // get number of dimensions greater than 1 auto get_tile_gt1_dim = [&](ir::value *v){ unsigned result = 0; for(unsigned shape: v->get_type()->get_tile_shapes()) { @@ -127,8 +136,7 @@ for(ir::function *fn: mod.get_function_list()){ } return result; }; - using std::to_string; - std::map references; + // bind references for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i: block->get_inst_list()){ if(!i->get_type()->is_tile_ty()) @@ -137,16 +145,25 @@ for(ir::function *fn: mod.get_function_list()){ if(*param.second == 1) continue; ir::instruction *&r = references[param.second]; - if(!r && get_tile_gt1_dim(i) > get_tile_gt1_dim(r)) + if(!r || get_tile_gt1_dim(i) > get_tile_gt1_dim(r)) r = i; } } - - // extract unique instructions in order - std::vector grids; + // create grid for(auto &ref: references) if(std::find(grids.begin(), grids.end(), ref.second) == grids.end()) grids.push_back(ref.second); +} + + +bool tune::check_constraints(ir::module &mod, std::map> &errors) { +for(ir::function *fn: mod.get_function_list()){ + using std::to_string; + + // initialize grids + std::map references; + std::vector grids; + create_grids(grids, 
references, fn); // number of warps int num_warps = 1; diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index 96e52c1d8..e9625187a 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -16,7 +16,6 @@ builder::builder(context &ctx): //===----------------------------------------------------------------------===// // utilities //===----------------------------------------------------------------------===// - void builder::set_insert_point(basic_block::iterator it){ block_ = (*it)->get_parent(); insert_point_ = it; diff --git a/lib/ir/module.cpp b/lib/ir/module.cpp index 532f1ec2f..d95e21c3b 100644 --- a/lib/ir/module.cpp +++ b/lib/ir/module.cpp @@ -85,18 +85,24 @@ ir::value *module::get_value_recursive(const std::string& name, ir::basic_block ir::value *pred = get_value(name, preds.front()); result = make_phi(pred->get_type(), 1, block); set_value(name, block, result); - add_phi_operands(name, (ir::phi_node*&)result); + result = add_phi_operands(name, (ir::phi_node*&)result); } set_value(name, block, result); return result; } ir::value *module::get_value(const std::string& name, ir::basic_block *block) { + ir::basic_block* save_block = builder_.get_insert_block(); + ir::basic_block::iterator save_pt = builder_.get_insert_point(); val_key_t key(name, block); if(values_.find(key) != values_.end()){ return values_.at(key); } - return get_value_recursive(name, block); + ir::value *result = get_value_recursive(name, block); + builder_.set_insert_point(save_block); + if(save_pt != save_block->end()) + builder_.set_insert_point(save_pt); + return result; } ir::value *module::get_value(const std::string& name) { From e2de27dfe2643df4e8d7e676a445f4d5947fa832 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 23 Jan 2019 02:07:53 -0500 Subject: [PATCH 053/494] [project] added missing files --- include/codegen/shared_copy.h | 21 +++++++++++++++++++++ lib/codegen/shared_copy.cpp | 28 ++++++++++++++++++++++++++++ lib/ir/builder.cpp | 1 - 3 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 include/codegen/shared_copy.h create mode 100644 lib/codegen/shared_copy.cpp diff --git a/include/codegen/shared_copy.h b/include/codegen/shared_copy.h new file mode 100644 index 000000000..46cd8cbc8 --- /dev/null +++ b/include/codegen/shared_copy.h @@ -0,0 +1,21 @@ +#ifndef TDL_INCLUDE_CODEGEN_SHARED_COPY_H +#define TDL_INCLUDE_CODEGEN_SHARED_COPY_H + +namespace tdl { + +namespace ir { + class module; +} + +namespace codegen{ + +class place_shared_copy { +public: + void run(ir::module &mod); +}; + + +} +} + +#endif diff --git a/lib/codegen/shared_copy.cpp b/lib/codegen/shared_copy.cpp new file mode 100644 index 000000000..d08a4a145 --- /dev/null +++ b/lib/codegen/shared_copy.cpp @@ -0,0 +1,28 @@ +#include "codegen/shared_copy.h" +#include "ir/module.h" +#include "ir/function.h" +#include "ir/basic_block.h" +#include "ir/instructions.h" + +namespace tdl { + +namespace codegen{ + +void place_shared_copy::run(ir::module &mod) { + ir::builder &builder = mod.get_builder(); + for(ir::function *fn: mod.get_function_list()) + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *i: block->get_inst_list()) + if(dynamic_cast(i)){ + builder.set_insert_point(i); + ir::value *x = i->get_operand(0); + ir::value *y = i->get_operand(1); + ir::value *rx = builder.create_copy_to_shared(x); + ir::value *ry = builder.create_copy_to_shared(y); + x->replace_all_uses_with(rx); + y->replace_all_uses_with(ry); + } +} + +} +} diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index 
e9625187a..848f668b7 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -3,7 +3,6 @@ #include "ir/builder.h" #include "ir/constant.h" #include "ir/instructions.h" -#include "ir/intrinsics.h" #include "ir/type.h" #include "llvm/IR/Instruction.h" From e522b06be2c8e1d948fef6075846900952d94bb3 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 26 Jan 2019 02:05:56 -0500 Subject: [PATCH 054/494] [code generation]: more progress for instruction selection --- examples/matrix.cpp | 25 +++++++++------------- include/ast/ast.h | 16 +++++++++++---- include/codegen/selection.h | 14 ++++++++++--- include/ir/instructions.h | 2 ++ lib/ast/lowering.cpp | 3 ++- lib/codegen/selection.cpp | 41 ++++++++++++++++++++++++++++++------- lib/codegen/tune.cpp | 18 ++++++++++++---- lib/ir/instructions.cpp | 6 ++++++ 8 files changed, 91 insertions(+), 34 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index bc37b81ae..bc7239038 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -32,10 +32,7 @@ void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K){\ int32 k;\ fp32* pa[32, 8] = a + rx[:, newaxis] + rka[newaxis, :]*M;\ fp32* pb[32, 8] = b + ry[:, newaxis] + rkb[newaxis, :]*K;\ - fp32* pc[32, 32];\ - for(k = 0; k < K; k = k + 8){\ - }\ - pc = c + rx[:, newaxis] + ry[newaxis, :];\ + fp32* pc[32, 32] = c + rx[:, newaxis] + ry[newaxis, :]*M;\ *pc = C;\ }\ "; @@ -59,15 +56,13 @@ int main() { tune.run(module); std::vector params = { // asm - 2, 16, 1, + 2, 8, 1, // bsn - 2, 16, 1, + 4, 4, 1, // pa - 1, 2, 4, + 2, 4, 1, // pb - 1, 2, 4, - // c - 2, 16, 1, 1, 2, 4 + 1, 8, 1, }; std::map> errors; unsigned i = 0; @@ -75,11 +70,11 @@ int main() { for(unsigned *x: tune.get_params(module)) *x = params[i++]; tune.check_constraints(module, errors); -// std::cout << "errors: " << errors.size() << std::endl; -// for(auto &x: errors){ -// for(auto &e: x.second) -// std::cout << e << std::endl; -// } + std::cout << "errors: " << errors.size() << std::endl; + for(auto &x: errors){ + for(auto &e: x.second) + std::cout << e << std::endl; + } shared.run(module); liveness.run(module); allocation.run(); diff --git a/include/ast/ast.h b/include/ast/ast.h index fed9d6556..7a2a62563 100644 --- a/include/ast/ast.h +++ b/include/ast/ast.h @@ -114,9 +114,15 @@ public: const slice_enum_t type_; }; +class named_expression; + class expression: public node{ public: virtual ir::value* codegen(ir::module *) const = 0; + named_expression *lvalue() const { return lvalue_; } + +protected: + named_expression *lvalue_; }; class postfix_expression: public expression{ @@ -163,10 +169,9 @@ private: const list* slices_; }; - class named_expression: public expression { public: - named_expression(node *id): id_((const identifier*)id) { } + named_expression(node *id): id_((const identifier*)id) { lvalue_ = this; } const identifier *id() const { return id_; } ir::value* codegen(ir::module * mod) const; @@ -227,8 +232,11 @@ private: public: unary_operator(UNARY_OP_T op, node *arg) - : op_(op), - arg_((expression*)arg) { } + : op_(op), + arg_((expression*)arg) { + if(op == DEREF) + this->lvalue_ = arg_->lvalue(); + } UNARY_OP_T get_op() const { return op_; } ir::value* codegen(ir::module *mod) const; diff --git a/include/codegen/selection.h b/include/codegen/selection.h index 92c9f79b5..6fa05782e 100644 --- a/include/codegen/selection.h +++ b/include/codegen/selection.h @@ -32,9 +32,10 @@ protected: typedef std::vector shapes_t; public: - tile(const shapes_t &shapes): shapes_(shapes){ } + tile(llvm::Type *ty, const 
shapes_t &shapes): shapes_(shapes){ } private: + llvm::Type *ty_; shapes_t shapes_; }; @@ -46,13 +47,20 @@ public: class distributed_tile: public tile{ typedef std::vector axes_t; + typedef std::vector indices_t; + typedef std::map indices_map_t; + typedef std::vector values_t; + +private: + void init_indices(); public: - distributed_tile(const shapes_t& shapes, const axes_t &axes) - : tile(shapes), axes_(axes) {} + distributed_tile(llvm::Type *ty, const shapes_t& shapes, const axes_t &axes); private: axes_t axes_; + indices_map_t indices_; + values_t values_; }; diff --git a/include/ir/instructions.h b/include/ir/instructions.h index 09c129160..44bafb151 100644 --- a/include/ir/instructions.h +++ b/include/ir/instructions.h @@ -26,6 +26,8 @@ public: const basic_block *get_parent() const { return parent_; } basic_block *get_parent() { return parent_; } void erase_from_parent(); + // helpers + bool has_tile_result_or_op(); private: basic_block *parent_; diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 490dcdba1..9d91e3588 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -523,7 +523,8 @@ ir::value *assignment_expression::codegen(ir::module *mod) const{ mod->set_value(x->id()->name(), rvalue); else if(auto* x = dynamic_cast(lvalue_)){ assert(x->get_op()==DEREF); - ir::value *ptr = x->codegen(mod); + assert(x->lvalue()); + ir::value *ptr = x->lvalue()->codegen(mod); mod->get_builder().create_store(ptr, rvalue); } return rvalue; diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 4092be811..497f1f302 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -13,6 +13,33 @@ namespace codegen{ using namespace llvm; +/* Distributed Tile */ +void distributed_tile::init_indices() { + std::vector id(axes_.size(), 0); + size_t k = 0; + while(true) { + indices_t current; + for(size_t d = 0; d < id.size(); d++) + current.push_back(axes_[d].values[id[d]]); + indices_[current] = indices_.size(); + id[0]++; + while(id[k] == axes_[k].values.size()){ + if(k == id.size() - 1) + return; + id[k++] = 0; + id[k]++; + } + k = 0; + } +} + +distributed_tile::distributed_tile(Type *ty, const shapes_t &shapes, const axes_t &axes) + : tile(ty, shapes), axes_(axes) { + init_indices(); + for(size_t i = 0; i < indices_.size(); i++) + values_.push_back(UndefValue::get(ty_)); +} + /* convert ir::type to Type */ Type *selection::llvm_type(ir::type *ty, LLVMContext &ctx) { @@ -186,7 +213,7 @@ void selection::init_axes(ir::instruction *instr, IRBuilder<> &builder, Value *u unsigned offset = n / contiguous[k] * per_block + n % contiguous[k]; idx_list[n] = builder.CreateAdd(thread_id, builder.getInt32(offset)); } - axes[k] = {idx_list}; + axes[k] = distributed_axis{idx_list}; } // Store axes axes_[instr] = axes; @@ -230,6 +257,7 @@ void selection::create_grids(std::vector &grids, void selection::init_grids(ir::function *fn, IRBuilder<> &builder){ // fetch linear ID Module *mod = builder.GetInsertBlock()->getParent()->getParent(); + LLVMContext &ctx = builder.getContext(); Function *get_thread_id = Intrinsic::getDeclaration(mod, Intrinsic::nvvm_read_ptx_sreg_tid_x); Value *warp_size = builder.getInt32(32); Value *u_thread_id = builder.CreateCall(get_thread_id, {}); @@ -248,9 +276,10 @@ void selection::init_grids(ir::function *fn, IRBuilder<> &builder){ continue; bool is_shared = dynamic_cast(i); const auto& shapes = i->get_type()->get_tile_shapes(); + Type* ty = llvm_type(i->get_type(), ctx); // create shared tile if(is_shared){ - tmap_.insert({i, new 
shared_tile(shapes)}); + tmap_.insert({i, new shared_tile(ty, shapes)}); } // create distributed tile else { @@ -264,20 +293,18 @@ void selection::init_grids(ir::function *fn, IRBuilder<> &builder){ else axes[d].values = {builder.getInt32(0)}; } - tmap_.insert({i, new distributed_tile(shapes, axes)}); + tmap_.insert({i, new distributed_tile(ty, shapes, axes)}); } } } void selection::lower_tile_instruction(ir::instruction *src, llvm::IRBuilder<> &builder) { - + std::cout << typeid(*src).name() << std::endl; } void selection::lower_instruction(ir::instruction *src, IRBuilder<> &builder) { LLVMContext &ctx = builder.getContext(); - std::cout << typeid(*src).name() << " " << src->get_type()->get_type_id() << std::endl; - if(src->get_type()->is_tile_ty()) { - std::cout << "tile instruction" << std::endl; + if(src->has_tile_result_or_op()) { lower_tile_instruction(src, builder); } else { diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 7d0a673a2..63e4f582f 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -29,7 +29,13 @@ void tune::init_c_phi(ir::instruction *v) { } void tune::init_c_graph(ir::instruction *v) { - const auto& shapes = v->get_type()->get_tile_shapes(); + // Reference shape + std::vector shapes; + if(auto *store = dynamic_cast(v)) + shapes = store->get_pointer_operand()->get_type()->get_tile_shapes(); + else + shapes = v->get_type()->get_tile_shapes(); + // Reshape if(dynamic_cast(v)){ ir::value *op = v->get_operand(0); unsigned current = 0; @@ -40,9 +46,11 @@ void tune::init_c_graph(ir::instruction *v) { add_constraint({v, i}, {op, current++}); } } + // Splat else if(dynamic_cast(v)){ } + // Broadcast else if(dynamic_cast(v)){ ir::value *op = v->get_operand(0); ir::type *op_ty = op->get_type(); @@ -51,13 +59,14 @@ void tune::init_c_graph(ir::instruction *v) { if(op_shapes[i] == shapes[i] && v != op) add_constraint({v, i}, {op, i}); } - } + // Matrix multiplication else if(dynamic_cast(v)){ ir::value *D = v->get_operand(2); add_constraint({v, 0}, {D, 0}); add_constraint({v, 1}, {D, 1}); } + // Element-wise else if(dynamic_cast(v)){ for(unsigned i = 0; i < shapes.size(); i ++) for(ir::value* op: v->ops()){ @@ -102,18 +111,19 @@ std::map tune::get_params(ir::instruction* i) { return params_.at(i); } + void tune::run(ir::module &mod) { for(ir::function *fn: mod.get_function_list()){ // Build constraints graph for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i : block->get_inst_list()) - if(i->get_type()->is_tile_ty()){ + if(i->has_tile_result_or_op()){ init_c_graph(i); } // Build phi constraints for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i : block->get_inst_list()) - if(i->get_type()->is_tile_ty()) + if(i->has_tile_result_or_op()) init_c_phi(i); // Layout parameters while(!nodes_.empty()){ diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 679ee5bb2..3cbabd45e 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -25,6 +25,12 @@ void instruction::erase_from_parent() { parent_->erase(this); } +bool instruction::has_tile_result_or_op() { + bool result = get_type()->is_tile_ty(); + for(ir::value *v: ops()) + result |= v->get_type()->is_tile_ty(); + return result; +} //===----------------------------------------------------------------------===// // phi_node classes From 5aec34a0945d0e8430fda6acf4d5cd12daf4251a Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 6 Feb 2019 15:02:01 -0500 Subject: [PATCH 055/494] [code generation] improved handling of constants --- 
include/codegen/selection.h | 21 ++-- include/ir/instructions.h | 2 +- lib/codegen/selection.cpp | 222 +++++++++++++++++++++++++++--------- lib/codegen/tune.cpp | 9 +- 4 files changed, 184 insertions(+), 70 deletions(-) diff --git a/include/codegen/selection.h b/include/codegen/selection.h index 6fa05782e..6767d560d 100644 --- a/include/codegen/selection.h +++ b/include/codegen/selection.h @@ -22,6 +22,7 @@ namespace codegen{ class allocation; class tune; +typedef std::vector indices_t; struct distributed_axis { std::vector values; @@ -33,6 +34,8 @@ protected: public: tile(llvm::Type *ty, const shapes_t &shapes): shapes_(shapes){ } + virtual void set_value(indices_t idx, llvm::Value *v) = 0; + virtual llvm::Value* get_value(indices_t idx) = 0; private: llvm::Type *ty_; @@ -42,12 +45,12 @@ private: class shared_tile: public tile { public: using tile::tile; - + void set_value(indices_t idx, llvm::Value *v) { } + llvm::Value* get_value(indices_t idx) { return nullptr; } }; class distributed_tile: public tile{ typedef std::vector axes_t; - typedef std::vector indices_t; typedef std::map indices_map_t; typedef std::vector values_t; @@ -56,6 +59,9 @@ private: public: distributed_tile(llvm::Type *ty, const shapes_t& shapes, const axes_t &axes); + void set_value(indices_t idx, llvm::Value *v); + llvm::Value* get_value(indices_t idx); + void for_each(std::function fn); private: axes_t axes_; @@ -73,14 +79,15 @@ private: // LLVM conversions llvm::Type* llvm_type(ir::type *ty, llvm::LLVMContext &ctx); llvm::Value* llvm_value(ir::value *v,llvm:: LLVMContext &ctx); - llvm::Instruction* llvm_inst(ir::instruction *inst, llvm::LLVMContext &ctx); + llvm::Instruction* llvm_inst(ir::instruction *inst, std::function value, llvm::LLVMContext &ctx); llvm::Constant* llvm_constant(ir::constant *cst, llvm::LLVMContext &ctx); // grid construction - void create_grids(std::vector &grids, - std::map &references, + void create_grids(std::vector &grids, + std::map &references, ir::function *fn); - void init_axes(ir::instruction *i, llvm::IRBuilder<> &builder, llvm::Value *u_thread_id, llvm::Value *u_warp_id); + void create_tile(ir::value *v, llvm::IRBuilder<> &builder, const std::map &references, std::set &seen); + void init_axes(ir::value *i, llvm::IRBuilder<> &builder, llvm::Value *u_thread_id, llvm::Value *u_warp_id); void init_grids(ir::function *fn, llvm::IRBuilder<> &builder); // lowering @@ -97,7 +104,7 @@ private: tmap_t tmap_; allocation *alloc_; tune *params_; - std::map> axes_; + std::map> axes_; }; } diff --git a/include/ir/instructions.h b/include/ir/instructions.h index 44bafb151..cc694fd7b 100644 --- a/include/ir/instructions.h +++ b/include/ir/instructions.h @@ -369,6 +369,7 @@ public: static instruction* create(context &ctx, unsigned axis, unsigned size, const std::string &name = "", instruction *next = nullptr); + unsigned get_axis() const { return axis_; } private: unsigned axis_; @@ -388,7 +389,6 @@ public: // intrinsics classes //===----------------------------------------------------------------------===// - class copy_to_shared_inst: public unary_inst{ using unary_inst::unary_inst; diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 497f1f302..d27b0b870 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -37,7 +37,20 @@ distributed_tile::distributed_tile(Type *ty, const shapes_t &shapes, const axes_ : tile(ty, shapes), axes_(axes) { init_indices(); for(size_t i = 0; i < indices_.size(); i++) - values_.push_back(UndefValue::get(ty_)); + 
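// The one-character change below works around a latent bug: the tile base
// class constructor never copies its `ty` argument into the ty_ member
// (its initializer list only sets shapes_), so reading ty_ here was
// reading an uninitialized pointer; the constructor parameter is used
// directly instead.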
values_.push_back(UndefValue::get(ty)); +} + +void distributed_tile::set_value(indices_t idx, Value *v) { + values_[indices_[idx]] = v; +} + +Value* distributed_tile::get_value(indices_t idx) { + return values_[indices_[idx]]; +} + +void distributed_tile::for_each(std::function fn) { + for(auto &idx: indices_) + fn(idx.first); } @@ -92,8 +105,7 @@ Constant *selection::llvm_constant(ir::constant *cst, LLVMContext &ctx) { /* convert ir::instruction to llvm::Instruction */ -Instruction *selection::llvm_inst(ir::instruction *inst, LLVMContext & ctx) { - auto value = [&](ir::value *x) { return llvm_value(x, ctx); }; +Instruction *selection::llvm_inst(ir::instruction *inst, std::function value, LLVMContext & ctx) { auto block = [&](ir::basic_block *x) { return bmap_.at(x); }; auto type = [&](ir::type *x) { return llvm_type(x, ctx); }; if(auto* ii = dynamic_cast(inst)){ @@ -107,7 +119,7 @@ Instruction *selection::llvm_inst(ir::instruction *inst, LLVMContext & ctx) { return BranchInst::Create(dest); } if(auto* ii = dynamic_cast(inst)){ - Type *ty = type(ii->get_type()); + Type *ty = type(ii->get_type()->get_scalar_ty()); unsigned num_ops = ii->get_num_operands(); return PHINode::Create(ty, num_ops, ii->get_name()); } @@ -134,14 +146,14 @@ Instruction *selection::llvm_inst(ir::instruction *inst, LLVMContext & ctx) { } if(auto* ii = dynamic_cast(inst)){ Value *arg = value(ii->get_operand(0)); - Type *dst_ty = type(ii->get_type()); + Type *dst_ty = type(ii->get_type()->get_scalar_ty()); return CastInst::Create(ii->get_op(), arg, dst_ty, ii->get_name()); } if(auto* ii = dynamic_cast(inst)){ std::vector idx_vals; std::transform(ii->idx_begin(), ii->idx_end(), std::back_inserter(idx_vals), [&value](ir::value* x){ return value(x);}); - Type *source_ty = type(ii->get_source_elt_ty()); + Type *source_ty = type(ii->get_source_elt_ty()->get_scalar_ty()); Value *arg = value(ii->get_operand(0)); return GetElementPtrInst::Create(source_ty, arg, idx_vals, ii->get_name()); } @@ -165,8 +177,10 @@ Value* selection::llvm_value(ir::value *v, LLVMContext &ctx) { if(auto *cc = dynamic_cast(v)) return llvm_constant(cc, ctx); // instruction - if(auto *ii = dynamic_cast(v)) - return llvm_inst(ii, ctx); + if(auto *ii = dynamic_cast(v)){ + auto value = [&](ir::value *x) { return llvm_value(x, ctx); }; + return llvm_inst(ii, value, ctx); + } // unknown value throw std::runtime_error("unknown conversion from ir::value to Value"); } @@ -185,17 +199,17 @@ std::vector delinearize(Value *trailing, std::vector &shapes, return result; } -void selection::init_axes(ir::instruction *instr, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { - const auto& shapes = instr->get_type()->get_tile_shapes(); +void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { + const auto& shapes = v->get_type()->get_tile_shapes(); size_t dim = shapes.size(); std::vector contiguous(dim); std::vector warp_size(dim); std::vector n_warps(dim); for(unsigned i = 0; i < shapes.size(); i++){ std::string str_i = std::to_string(i); - contiguous[i] = *params_->get_param(instr, "p0.d" + str_i); - warp_size[i] = *params_->get_param(instr, "p1.d" + str_i); - n_warps[i] = *params_->get_param(instr, "p2.d" + str_i); + contiguous[i] = *params_->get_param(v, "p0.d" + str_i); + warp_size[i] = *params_->get_param(v, "p1.d" + str_i); + n_warps[i] = *params_->get_param(v, "p2.d" + str_i); } std::vector thread_id_in_warp = delinearize(u_thread_id, warp_size, builder); std::vector warp_id = delinearize(u_warp_id, n_warps, 
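// delinearize() peels one coordinate per dimension off a linear thread ID:
// coordinate k is (remaining % shape[k]), the remainder is divided by
// shape[k], and the last dimension keeps the final quotient. Worked
// example, assuming a per-dimension warp_size of {4, 8}: lane 13 maps to
// {13 % 4, 13 / 4} = {1, 3}.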
builder); @@ -216,11 +230,11 @@ void selection::init_axes(ir::instruction *instr, IRBuilder<> &builder, Value *u axes[k] = distributed_axis{idx_list}; } // Store axes - axes_[instr] = axes; + axes_[v] = axes; } -void selection::create_grids(std::vector &grids, - std::map &references, +void selection::create_grids(std::vector &grids, + std::map &references, ir::function *fn) { // get number of dimensions greater than 1 auto get_tile_gt1_dim = [&](ir::value *v){ @@ -231,75 +245,171 @@ void selection::create_grids(std::vector &grids, return result; }; // bind references - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *i: block->get_inst_list()){ - if(!i->get_type()->is_tile_ty()) - continue; - const auto& shapes = i->get_type()->get_tile_shapes(); - bool is_shared = dynamic_cast(i); + std::set seen; + std::function bind_references = [&](ir::value *v) + { + // skip + if(!v->get_type()->is_tile_ty() || !seen.insert(v).second) + return; + // recurse + if(auto *user = dynamic_cast(v)) + for(ir::value *op: user->ops()) + bind_references(op); + // bind + const auto& shapes = v->get_type()->get_tile_shapes(); + bool is_shared = dynamic_cast(v); if(is_shared) - continue; + return; for(size_t d = 0; d < shapes.size(); d++){ if(shapes[d] == 1) continue; - unsigned *x = params_->get_param(i, "p0.d" + std::to_string(d)); - ir::instruction *&r = references[x]; - if(!r || get_tile_gt1_dim(i) > get_tile_gt1_dim(r)) - r = i; + unsigned *x = params_->get_param(v, "p0.d" + std::to_string(d)); + ir::value *&r = references[x]; + if(!r || get_tile_gt1_dim(v) > get_tile_gt1_dim(r)) + r = v; } - } + }; + + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *i: block->get_inst_list()) + bind_references(i); + // create grid for(auto &ref: references) if(std::find(grids.begin(), grids.end(), ref.second) == grids.end()) grids.push_back(ref.second); } +void selection::create_tile(ir::value *v, IRBuilder<> &builder, + const std::map& references, + std::set &seen) { + if(!v->get_type()->is_tile_ty() || !seen.insert(v).second) + return; + if(auto *user = dynamic_cast(v)) + for(ir::value *op: user->ops()) + create_tile(op, builder, references, seen); + LLVMContext &ctx = builder.getContext(); + bool is_shared = dynamic_cast(v); + const auto& shapes = v->get_type()->get_tile_shapes(); + Type* ty = llvm_type(v->get_type()->get_scalar_ty(), ctx); + // create shared tile + if(is_shared){ + tmap_.insert({v, new shared_tile(ty, shapes)}); + } + // create distributed tile + else { + const auto &shapes = v->get_type()->get_tile_shapes(); + std::vector axes(shapes.size()); + for(size_t d = 0; d < shapes.size(); d++){ + if(shapes[d] > 1){ + unsigned *x = params_->get_param(v, "p0.d" + std::to_string(d)); + axes[d] = axes_.at(references.at(x))[d]; + } + else + axes[d].values = {builder.getInt32(0)}; + } + tmap_.insert({v, new distributed_tile(ty, shapes, axes)}); + } +} + void selection::init_grids(ir::function *fn, IRBuilder<> &builder){ // fetch linear ID Module *mod = builder.GetInsertBlock()->getParent()->getParent(); - LLVMContext &ctx = builder.getContext(); Function *get_thread_id = Intrinsic::getDeclaration(mod, Intrinsic::nvvm_read_ptx_sreg_tid_x); Value *warp_size = builder.getInt32(32); Value *u_thread_id = builder.CreateCall(get_thread_id, {}); Value *u_thread_warp_id = builder.CreateURem(u_thread_id, warp_size); Value *u_warp_id = builder.CreateUDiv(u_thread_id, warp_size); // create grid - std::vector grids; - std::map references; + std::vector grids; + std::map references; 
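// `references` maps each layout parameter (the p0.d<k> slots owned by the
// tune pass) to the tile value elected to materialize thread indices for
// it; `grids` then keeps one representative value per distinct parameter,
// so init_axes() runs once per layout instead of once per tile.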
create_grids(grids, references, fn); - for(ir::instruction* i: grids) + for(ir::value* i: grids) init_axes(i, builder, u_thread_warp_id, u_warp_id); // create tile + std::set seen; for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i: block->get_inst_list()){ if(!i->get_type()->is_tile_ty()) continue; - bool is_shared = dynamic_cast(i); - const auto& shapes = i->get_type()->get_tile_shapes(); - Type* ty = llvm_type(i->get_type(), ctx); - // create shared tile - if(is_shared){ - tmap_.insert({i, new shared_tile(ty, shapes)}); - } - // create distributed tile - else { - const auto &shapes = i->get_type()->get_tile_shapes(); - std::vector axes(shapes.size()); - for(size_t d = 0; d < shapes.size(); d++){ - if(shapes[d] > 1){ - unsigned *x = params_->get_param(i, "p0.d" + std::to_string(d)); - axes[d] = axes_.at(references.at(x))[d]; - } - else - axes[d].values = {builder.getInt32(0)}; - } - tmap_.insert({i, new distributed_tile(ty, shapes, axes)}); - } + create_tile(i, builder, references, seen); } } -void selection::lower_tile_instruction(ir::instruction *src, llvm::IRBuilder<> &builder) { - std::cout << typeid(*src).name() << std::endl; + +void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> &builder) { + Module *module = builder.GetInsertBlock()->getModule(); + LLVMContext &ctx = builder.getContext(); + tile *ti = tmap_[ins]; + distributed_tile* result = (distributed_tile*)ti; + if(!ins->get_type()->is_tile_ty()) + return; + const auto& shapes = ins->get_type()->get_tile_shapes(); + // global_range + if(auto *x = dynamic_cast(ins)) { + static std::array ctaid = { + Intrinsic::nvvm_read_ptx_sreg_ctaid_x, + Intrinsic::nvvm_read_ptx_sreg_ctaid_y, + Intrinsic::nvvm_read_ptx_sreg_ctaid_z + }; + Function *get_group_id = Intrinsic::getDeclaration(module, ctaid[x->get_axis()]); + Value *group_id = builder.CreateCall(get_group_id, {}); + Value *offset = builder.CreateMul(builder.getInt32(shapes[0]), group_id); + result->for_each([&](indices_t idx){ + BinaryOperator *bin = static_cast(idx[0]); + result->set_value(idx, builder.CreateAdd(bin->getOperand(1), + builder.CreateAdd(bin->getOperand(0), offset))); + }); + } + // reshape + else if(dynamic_cast(ins)) { + ir::value* in = ins->get_operand(0); + distributed_tile *in_tile = (distributed_tile*)tmap_.at(in); + result->for_each([&](indices_t out_idx){ + indices_t in_idx; + for(size_t k = 0; k < shapes.size(); k++){ + if(shapes[k] > 1) + in_idx.push_back(out_idx[k]); + } + result->set_value(out_idx, in_tile->get_value(in_idx)); + }); + } + // splat + else if(dynamic_cast(ins)) { + result->for_each([&](indices_t idx) { + result->set_value(idx, llvm_value(ins->get_operand(0), ctx)); + }); + } + // broadcast + else if(dynamic_cast(ins)) { + ir::value* in = ins->get_operand(0); + const auto& in_shapes = in->get_type()->get_tile_shapes(); + distributed_tile *in_tile = (distributed_tile*)tmap_.at(in); + result->for_each([&](indices_t out_idx){ + indices_t in_idx = out_idx; + for(size_t k = 0; k < in_idx.size(); k++){ + if(in_shapes[k] == 1) + in_idx[k] = builder.getInt32(0); + result->set_value(out_idx, in_tile->get_value(in_idx)); + } + }); + } + // copy to shared + else if(dynamic_cast(ins)) { + + } + // element-wise + else { + result->for_each([&](indices_t idx){ + auto value = [&](ir::value *x) { + if(x->get_type()->is_tile_ty()) + return tmap_.at(x)->get_value(idx); + else + return llvm_value(x, ctx); + }; + result->set_value(idx, llvm_inst(ins, value, ctx)); + }); + } } void 
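// On the element-wise fallback that closes lower_tile_instruction above:
// it re-runs llvm_inst once per tile index with a value callback that
// fetches the per-index scalar from each operand tile, so ordinary IR
// instructions replicate across a distributed tile without needing a
// dedicated lowering case.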
selection::lower_instruction(ir::instruction *src, IRBuilder<> &builder) { @@ -308,7 +418,7 @@ void selection::lower_instruction(ir::instruction *src, IRBuilder<> &builder) { lower_tile_instruction(src, builder); } else { - Instruction *i = llvm_inst(src, ctx); + Instruction *i = (Instruction*)llvm_value(src, ctx); vmap_[src] = i; builder.Insert(i); } diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 63e4f582f..c98b2ae66 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -23,7 +23,6 @@ void tune::init_c_phi(ir::instruction *v) { for(unsigned k = 0; k < phi->get_type()->get_tile_shapes().size(); k++) if(dependencies_.find({op, k}) != dependencies_.end() || dependencies_.find({phi, k}) != dependencies_.end()){ - std::cout << typeid(*op).name() << std::endl; add_constraint({phi, k}, {op, k}); } } @@ -79,11 +78,9 @@ void tune::connected_components(node_t x, const std::vector vals, st if(nodes.find(x) != nodes.end()){ nodes.erase(x); std::string suffix = ".d" + std::to_string(x.second); - if(auto *instr = dynamic_cast(x.first)){ - params_[instr].insert({"p0" + suffix, vals[0]}); - params_[instr].insert({"p1" + suffix, vals[1]}); - params_[instr].insert({"p2" + suffix, vals[2]}); - } + params_[x.first].insert({"p0" + suffix, vals[0]}); + params_[x.first].insert({"p1" + suffix, vals[1]}); + params_[x.first].insert({"p2" + suffix, vals[2]}); if(static_params_.find(x) != static_params_.end()){ *vals[0] = static_params_.at(x); *vals[1] = static_params_.at(x); From 4490061950081316dbda4b5861271eca8fd7c36e Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 6 Feb 2019 17:21:07 -0500 Subject: [PATCH 056/494] test --- examples/matrix.cpp | 8 +- include/codegen/selection.h | 4 +- lib/ast/lowering.cpp | 82 ++++++++-------- lib/codegen/selection.cpp | 185 ++++++++++++++++++++---------------- 4 files changed, 149 insertions(+), 130 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index bc7239038..9599a8456 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -82,9 +82,9 @@ int main() { // std::vector params = tune.get_params(module); // std::cout << params.size() << std::endl; // selection.run(module, llvm_module); -// // print LLVM program -// llvm::PrintModulePass print(llvm::outs()); -// llvm::AnalysisManager analysis; -// print.run(llvm_module, analysis); + // print LLVM program + llvm::PrintModulePass print(llvm::outs()); + llvm::AnalysisManager analysis; + print.run(llvm_module, analysis); return 0; } diff --git a/include/codegen/selection.h b/include/codegen/selection.h index 6767d560d..45d614813 100644 --- a/include/codegen/selection.h +++ b/include/codegen/selection.h @@ -78,8 +78,8 @@ class selection{ private: // LLVM conversions llvm::Type* llvm_type(ir::type *ty, llvm::LLVMContext &ctx); - llvm::Value* llvm_value(ir::value *v,llvm:: LLVMContext &ctx); - llvm::Instruction* llvm_inst(ir::instruction *inst, std::function value, llvm::LLVMContext &ctx); + llvm::Value* llvm_value(ir::value *v, llvm:: LLVMContext &ctx, llvm::IRBuilder<> &builder); + llvm::Instruction* llvm_inst(ir::instruction *inst, std::function value, llvm::LLVMContext &ctx, llvm::IRBuilder<> &builder); llvm::Constant* llvm_constant(ir::constant *cst, llvm::LLVMContext &ctx); // grid construction diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 9d91e3588..5f3b37170 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -324,7 +324,7 @@ ir::value* initializer::codegen(ir::module * mod) const{ ir::value *value = ir::undef_value::get(ty); if(expr_){ 
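// The fix below: explicit_cast() returns the casted value rather than
// mutating its argument in place, so the result must be assigned back to
// `value`; the old call silently dropped the cast and stored the uncast
// initializer.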
value = expr_->codegen(mod); - explicit_cast(mod->get_builder(), value, ty->get_scalar_ty()); + value = explicit_cast(mod->get_builder(), value, ty->get_scalar_ty()); implicit_broadcast(mod, value, ty); } value->set_name(name); @@ -336,85 +336,85 @@ ir::value* initializer::codegen(ir::module * mod) const{ /* Expression */ /*------------------*/ /* Binary operator */ -ir::value *binary_operator::llvm_op(ir::module *mod, ir::builder &builder, ir::value *arg, ir::value *rhs, const std::string &name) const +ir::value *binary_operator::llvm_op(ir::module *mod, ir::builder &builder, ir::value *lhs, ir::value *rhs, const std::string &name) const { bool is_float = false, is_ptr = false, is_int = false, is_signed = false; - implicit_cast(builder, arg, rhs, is_float, is_ptr, is_int, is_signed); - implicit_broadcast(mod, arg, rhs); + implicit_cast(builder, lhs, rhs, is_float, is_ptr, is_int, is_signed); + implicit_broadcast(mod, lhs, rhs); if(op_==MUL && is_float) - return builder.create_fmul(arg, rhs, name); + return builder.create_fmul(lhs, rhs, name); if(op_==MUL && is_int) - return builder.create_mul(arg, rhs, name); + return builder.create_mul(lhs, rhs, name); if(op_==DIV && is_float) - return builder.create_fdiv(arg, rhs, name); + return builder.create_fdiv(lhs, rhs, name); if(op_==DIV && is_int && is_signed) - return builder.create_sdiv(arg, rhs, name); + return builder.create_sdiv(lhs, rhs, name); if(op_==DIV && is_int && !is_signed) - return builder.create_udiv(arg, rhs, name); + return builder.create_udiv(lhs, rhs, name); if(op_==MOD && is_float) - return builder.create_frem(arg, rhs, name); + return builder.create_frem(lhs, rhs, name); if(op_==MOD && is_int && is_signed) - return builder.create_srem(arg, rhs, name); + return builder.create_srem(lhs, rhs, name); if(op_==MOD && is_int && !is_signed) - return builder.create_urem(arg, rhs, name); + return builder.create_urem(lhs, rhs, name); if(op_==ADD && is_float) - return builder.create_fadd(arg, rhs, name); + return builder.create_fadd(lhs, rhs, name); if(op_==ADD && is_int) - return builder.create_add(arg, rhs); + return builder.create_add(lhs, rhs); if(op_==ADD && is_ptr) - return builder.create_gep(arg, {rhs}); + return builder.create_gep(lhs, {rhs}); if(op_==SUB && is_float) - return builder.create_fsub(arg, rhs, name); + return builder.create_fsub(lhs, rhs, name); if(op_==SUB && is_int) - return builder.create_sub(arg, rhs, name); + return builder.create_sub(lhs, rhs, name); if(op_==SUB && is_ptr) - return builder.create_gep(arg, {builder.create_neg(rhs)}); + return builder.create_gep(lhs, {builder.create_neg(rhs)}); if(op_==LEFT_SHIFT) - return builder.create_shl(arg, rhs, name); + return builder.create_shl(lhs, rhs, name); if(op_==RIGHT_SHIFT) - return builder.create_ashr(arg, rhs, name); + return builder.create_ashr(lhs, rhs, name); if(op_ == LT && is_float) - return builder.create_fcmpOLT(arg, rhs, name); + return builder.create_fcmpOLT(lhs, rhs, name); if(op_ == LT && is_int && is_signed) - return builder.create_icmpSLT(arg, rhs, name); + return builder.create_icmpSLT(lhs, rhs, name); if(op_ == LT && is_int && !is_signed) - return builder.create_icmpULT(arg, rhs, name); + return builder.create_icmpULT(lhs, rhs, name); if(op_ == GT && is_float) - return builder.create_fcmpOGT(arg, rhs, name); + return builder.create_fcmpOGT(lhs, rhs, name); if(op_ == GT && is_int && is_signed) - return builder.create_icmpSGT(arg, rhs, name); + return builder.create_icmpSGT(lhs, rhs, name); if(op_ == GT && is_int && !is_signed) - return 
builder.create_icmpUGT(arg, rhs, name); + return builder.create_icmpUGT(lhs, rhs, name); if(op_ == LE && is_float) - return builder.create_fcmpOLE(arg, rhs, name); + return builder.create_fcmpOLE(lhs, rhs, name); if(op_ == LE && is_int && is_signed) - return builder.create_icmpSLE(arg, rhs, name); + return builder.create_icmpSLE(lhs, rhs, name); if(op_ == LE && is_int && !is_signed) - return builder.create_icmpULE(arg, rhs, name); + return builder.create_icmpULE(lhs, rhs, name); if(op_ == GE && is_float) - return builder.create_fcmpOGE(arg, rhs, name); + return builder.create_fcmpOGE(lhs, rhs, name); if(op_ == GE && is_int && is_signed) - return builder.create_icmpSGE(arg, rhs, name); + return builder.create_icmpSGE(lhs, rhs, name); if(op_ == GE && is_int && !is_signed) - return builder.create_icmpUGE(arg, rhs, name); + return builder.create_icmpUGE(lhs, rhs, name); if(op_ == EQ && is_float) - return builder.create_fcmpOEQ(arg, rhs, name); + return builder.create_fcmpOEQ(lhs, rhs, name); if(op_ == EQ && is_int) - return builder.create_icmpEQ(arg, rhs, name); + return builder.create_icmpEQ(lhs, rhs, name); if(op_ == NE && is_float) - return builder.create_fcmpONE(arg, rhs, name); + return builder.create_fcmpONE(lhs, rhs, name); if(op_ == NE && is_int) - return builder.create_icmpNE(arg, rhs, name); + return builder.create_icmpNE(lhs, rhs, name); if(op_ == AND) - return builder.create_and(arg, rhs, name); + return builder.create_and(lhs, rhs, name); if(op_ == XOR) - return builder.create_xor(arg, rhs, name); + return builder.create_xor(lhs, rhs, name); if(op_ == OR) - return builder.create_or(arg, rhs, name); + return builder.create_or(lhs, rhs, name); if(op_ == LAND) - return builder.create_and(arg, rhs, name); + return builder.create_and(lhs, rhs, name); if(op_ == LOR) - return builder.create_or(arg, rhs, name); + return builder.create_or(lhs, rhs, name); throw std::runtime_error("unreachable"); } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index d27b0b870..31aa2a08c 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -105,49 +105,49 @@ Constant *selection::llvm_constant(ir::constant *cst, LLVMContext &ctx) { /* convert ir::instruction to llvm::Instruction */ -Instruction *selection::llvm_inst(ir::instruction *inst, std::function value, LLVMContext & ctx) { +Instruction *selection::llvm_inst(ir::instruction *inst, std::function value, LLVMContext & ctx, IRBuilder<> &builder) { auto block = [&](ir::basic_block *x) { return bmap_.at(x); }; auto type = [&](ir::type *x) { return llvm_type(x, ctx); }; if(auto* ii = dynamic_cast(inst)){ BasicBlock *true_dest = block(ii->get_true_dest()); BasicBlock *false_dest = block(ii->get_false_dest()); Value *cond = value(ii->get_cond()); - return BranchInst::Create(true_dest, false_dest, cond); + return builder.CreateCondBr(cond, true_dest, false_dest); } if(auto* ii = dynamic_cast(inst)){ BasicBlock *dest = block(ii->get_dest()); - return BranchInst::Create(dest); + return builder.CreateBr(dest); } if(auto* ii = dynamic_cast(inst)){ Type *ty = type(ii->get_type()->get_scalar_ty()); unsigned num_ops = ii->get_num_operands(); - return PHINode::Create(ty, num_ops, ii->get_name()); + return builder.CreatePHI(ty, num_ops); } if(auto* ii = dynamic_cast(inst)){ ir::value *ret_val = ii->get_return_value(); - return ReturnInst::Create(ctx, ret_val?value(ret_val):nullptr); + return builder.CreateRet(ret_val?value(ret_val):nullptr); } if(auto* ii = dynamic_cast(inst)){ Value *lhs = value(ii->get_operand(0)); Value *rhs = 
value(ii->get_operand(1)); - return BinaryOperator::Create(ii->get_op(), lhs, rhs, ii->get_name()); + return builder.Insert(BinaryOperator::Create(ii->get_op(), lhs, rhs)); } if(auto* ii = dynamic_cast(inst)){ CmpInst::Predicate pred = ii->get_pred(); Value *lhs = value(ii->get_operand(0)); Value *rhs = value(ii->get_operand(1)); - return CmpInst::Create(Instruction::ICmp, pred, lhs, rhs, ii->get_name()); + return builder.Insert(CmpInst::Create(Instruction::ICmp, pred, lhs, rhs)); } if(auto* ii = dynamic_cast(inst)){ CmpInst::Predicate pred = ii->get_pred(); Value *lhs = value(ii->get_operand(0)); Value *rhs = value(ii->get_operand(1)); - return FCmpInst::Create(Instruction::FCmp, pred, lhs, rhs, ii->get_name()); + return builder.Insert(FCmpInst::Create(Instruction::FCmp, pred, lhs, rhs)); } if(auto* ii = dynamic_cast(inst)){ Value *arg = value(ii->get_operand(0)); Type *dst_ty = type(ii->get_type()->get_scalar_ty()); - return CastInst::Create(ii->get_op(), arg, dst_ty, ii->get_name()); + return builder.Insert(CastInst::Create(ii->get_op(), arg, dst_ty)); } if(auto* ii = dynamic_cast(inst)){ std::vector idx_vals; @@ -155,31 +155,31 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::functionget_source_elt_ty()->get_scalar_ty()); Value *arg = value(ii->get_operand(0)); - return GetElementPtrInst::Create(source_ty, arg, idx_vals, ii->get_name()); + return builder.Insert(GetElementPtrInst::Create(source_ty, arg, idx_vals)); } if(ir::load_inst* ii = dynamic_cast(inst)){ Value *ptr = value(ii->get_pointer_operand()); - return new LoadInst(ptr, ii->get_name()); + return builder.CreateLoad(ptr); } // unknown instruction throw std::runtime_error("unknown conversion from ir::type to Type"); } /* convert ir::value to llvm::Value */ -Value* selection::llvm_value(ir::value *v, LLVMContext &ctx) { +Value* selection::llvm_value(ir::value *v, LLVMContext &ctx, IRBuilder<> &builder) { assert(!v->get_type()->is_tile_ty()); if(vmap_.find(v) != vmap_.end()) return vmap_.at(v); // create operands if(auto *uu = dynamic_cast(v)) for(ir::value* u: uu->ops()) - vmap_[u] = llvm_value(u, ctx); + vmap_.insert({u, llvm_value(u, ctx, builder)}); if(auto *cc = dynamic_cast(v)) return llvm_constant(cc, ctx); // instruction if(auto *ii = dynamic_cast(v)){ - auto value = [&](ir::value *x) { return llvm_value(x, ctx); }; - return llvm_inst(ii, value, ctx); + auto value = [&](ir::value *x) { return llvm_value(x, ctx, builder); }; + return llvm_inst(ii, value, ctx, builder); } // unknown value throw std::runtime_error("unknown conversion from ir::value to Value"); @@ -308,7 +308,14 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, else axes[d].values = {builder.getInt32(0)}; } - tmap_.insert({v, new distributed_tile(ty, shapes, axes)}); + distributed_tile *T = new distributed_tile(ty, shapes, axes); + tmap_.insert({v, T}); + // constant range + if(dynamic_cast(v)) + T->for_each([&](indices_t idx){ + T->set_value(idx, idx[0]); + }); + } } @@ -340,76 +347,88 @@ void selection::init_grids(ir::function *fn, IRBuilder<> &builder){ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> &builder) { Module *module = builder.GetInsertBlock()->getModule(); LLVMContext &ctx = builder.getContext(); - tile *ti = tmap_[ins]; - distributed_tile* result = (distributed_tile*)ti; - if(!ins->get_type()->is_tile_ty()) - return; - const auto& shapes = ins->get_type()->get_tile_shapes(); - // global_range - if(auto *x = dynamic_cast(ins)) { - static std::array ctaid = { - 
Intrinsic::nvvm_read_ptx_sreg_ctaid_x, - Intrinsic::nvvm_read_ptx_sreg_ctaid_y, - Intrinsic::nvvm_read_ptx_sreg_ctaid_z - }; - Function *get_group_id = Intrinsic::getDeclaration(module, ctaid[x->get_axis()]); - Value *group_id = builder.CreateCall(get_group_id, {}); - Value *offset = builder.CreateMul(builder.getInt32(shapes[0]), group_id); - result->for_each([&](indices_t idx){ - BinaryOperator *bin = static_cast(idx[0]); - result->set_value(idx, builder.CreateAdd(bin->getOperand(1), - builder.CreateAdd(bin->getOperand(0), offset))); + // store + if(auto *x = dynamic_cast(ins)) { + distributed_tile* ptr = (distributed_tile*)tmap_.at(x->get_pointer_operand()); + tile *value = tmap_.at(x->get_value_operand()); + ptr->for_each([&](indices_t idx){ + builder.CreateStore(value->get_value(idx), ptr->get_value(idx)); }); } - // reshape - else if(dynamic_cast(ins)) { - ir::value* in = ins->get_operand(0); - distributed_tile *in_tile = (distributed_tile*)tmap_.at(in); - result->for_each([&](indices_t out_idx){ - indices_t in_idx; - for(size_t k = 0; k < shapes.size(); k++){ - if(shapes[k] > 1) - in_idx.push_back(out_idx[k]); - } - result->set_value(out_idx, in_tile->get_value(in_idx)); - }); - } - // splat - else if(dynamic_cast(ins)) { - result->for_each([&](indices_t idx) { - result->set_value(idx, llvm_value(ins->get_operand(0), ctx)); - }); - } - // broadcast - else if(dynamic_cast(ins)) { - ir::value* in = ins->get_operand(0); - const auto& in_shapes = in->get_type()->get_tile_shapes(); - distributed_tile *in_tile = (distributed_tile*)tmap_.at(in); - result->for_each([&](indices_t out_idx){ - indices_t in_idx = out_idx; - for(size_t k = 0; k < in_idx.size(); k++){ - if(in_shapes[k] == 1) - in_idx[k] = builder.getInt32(0); - result->set_value(out_idx, in_tile->get_value(in_idx)); - } - }); - } - // copy to shared - else if(dynamic_cast(ins)) { - - } - // element-wise else { - result->for_each([&](indices_t idx){ - auto value = [&](ir::value *x) { - if(x->get_type()->is_tile_ty()) - return tmap_.at(x)->get_value(idx); - else - return llvm_value(x, ctx); + tile *ti = tmap_[ins]; + distributed_tile* result = (distributed_tile*)ti; + if(!ins->get_type()->is_tile_ty()) + return; + const auto& shapes = ins->get_type()->get_tile_shapes(); + // global_range + if(auto *x = dynamic_cast(ins)) { + static std::array ctaid = { + Intrinsic::nvvm_read_ptx_sreg_ctaid_x, + Intrinsic::nvvm_read_ptx_sreg_ctaid_y, + Intrinsic::nvvm_read_ptx_sreg_ctaid_z }; - result->set_value(idx, llvm_inst(ins, value, ctx)); - }); + Function *get_group_id = Intrinsic::getDeclaration(module, ctaid[x->get_axis()]); + Value *group_id = builder.CreateCall(get_group_id, {}); + Value *offset = builder.CreateMul(builder.getInt32(shapes[0]), group_id); + result->for_each([&](indices_t idx){ + BinaryOperator *bin = static_cast(idx[0]); + result->set_value(idx, builder.CreateAdd(bin->getOperand(1), + builder.CreateAdd(bin->getOperand(0), offset))); + }); + } + // reshape + else if(dynamic_cast(ins)) { + ir::value* in = ins->get_operand(0); + distributed_tile *in_tile = (distributed_tile*)tmap_.at(in); + result->for_each([&](indices_t out_idx){ + indices_t in_idx; + for(size_t k = 0; k < shapes.size(); k++){ + if(shapes[k] > 1) + in_idx.push_back(out_idx[k]); + } + result->set_value(out_idx, in_tile->get_value(in_idx)); + }); + } + // splat + else if(dynamic_cast(ins)) { + result->for_each([&](indices_t idx) { + result->set_value(idx, llvm_value(ins->get_operand(0), ctx, builder)); + }); + } + // broadcast + else if(dynamic_cast(ins)) { + 
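// Broadcast semantics: for each output index, every dimension in which the
// input extent is 1 is clamped to index 0, so e.g. a {1, N} tile feeds an
// {M, N} tile by reading (0, j) for every (i, j). Note that the store in
// the loop below is re-issued once per dimension; only the last iteration,
// with the fully clamped index, carries the intended value.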
ir::value* in = ins->get_operand(0); + const auto& in_shapes = in->get_type()->get_tile_shapes(); + distributed_tile *in_tile = (distributed_tile*)tmap_.at(in); + result->for_each([&](indices_t out_idx){ + indices_t in_idx = out_idx; + for(size_t k = 0; k < in_idx.size(); k++){ + if(in_shapes[k] == 1) + in_idx[k] = builder.getInt32(0); + result->set_value(out_idx, in_tile->get_value(in_idx)); + } + }); + } + // copy to shared + else if(dynamic_cast(ins)) { + + } + // element-wise + else { + result->for_each([&](indices_t idx){ + auto value = [&](ir::value *x) { + if(x->get_type()->is_tile_ty()) + return tmap_.at(x)->get_value(idx); + else + return llvm_value(x, ctx, builder); + }; + result->set_value(idx, llvm_inst(ins, value, ctx, builder)); + }); + } } + + } void selection::lower_instruction(ir::instruction *src, IRBuilder<> &builder) { @@ -418,7 +437,7 @@ void selection::lower_instruction(ir::instruction *src, IRBuilder<> &builder) { lower_tile_instruction(src, builder); } else { - Instruction *i = (Instruction*)llvm_value(src, ctx); + Instruction *i = (Instruction*)llvm_value(src, ctx, builder); vmap_[src] = i; builder.Insert(i); } @@ -459,7 +478,7 @@ void selection::run(ir::module &src, Module &dst){ for(unsigned i = 0; i < phi->get_num_incoming(); i++){ ir::value *inc_val = phi->get_incoming_value(i); ir::basic_block *inc_block = phi->get_incoming_block(i); - Value *llvm_inc_val = llvm_value(inc_val, dst_ctx); + Value *llvm_inc_val = llvm_value(inc_val, dst_ctx, dst_builder); BasicBlock *llvm_block = bmap_[inc_block]; dst_phi->addIncoming(llvm_inc_val, llvm_block); } From 53aca3fa893a65fdb94d0eae63ce7daa86cf6585 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 6 Feb 2019 17:30:33 -0500 Subject: [PATCH 057/494] [code generation] fixed bugs in tile instructions lowering --- include/codegen/selection.h | 4 ++-- lib/codegen/selection.cpp | 34 +++++++++++++++++----------------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/include/codegen/selection.h b/include/codegen/selection.h index 45d614813..9a9e03135 100644 --- a/include/codegen/selection.h +++ b/include/codegen/selection.h @@ -78,8 +78,8 @@ class selection{ private: // LLVM conversions llvm::Type* llvm_type(ir::type *ty, llvm::LLVMContext &ctx); - llvm::Value* llvm_value(ir::value *v, llvm:: LLVMContext &ctx, llvm::IRBuilder<> &builder); - llvm::Instruction* llvm_inst(ir::instruction *inst, std::function value, llvm::LLVMContext &ctx, llvm::IRBuilder<> &builder); + llvm::Value* llvm_value(ir::value *v, llvm::IRBuilder<> &builder); + llvm::Instruction* llvm_inst(ir::instruction *inst, std::function value, llvm::IRBuilder<> &builder); llvm::Constant* llvm_constant(ir::constant *cst, llvm::LLVMContext &ctx); // grid construction diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 31aa2a08c..213ecd6d2 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -105,27 +105,28 @@ Constant *selection::llvm_constant(ir::constant *cst, LLVMContext &ctx) { /* convert ir::instruction to llvm::Instruction */ -Instruction *selection::llvm_inst(ir::instruction *inst, std::function value, LLVMContext & ctx, IRBuilder<> &builder) { +Instruction *selection::llvm_inst(ir::instruction *inst, std::function value, IRBuilder<> &builder) { + LLVMContext & ctx = builder.getContext(); auto block = [&](ir::basic_block *x) { return bmap_.at(x); }; auto type = [&](ir::type *x) { return llvm_type(x, ctx); }; if(auto* ii = dynamic_cast(inst)){ BasicBlock *true_dest = block(ii->get_true_dest()); 
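// From this patch on, every case in llvm_inst funnels through
// builder.Insert(XInst::Create(...)): the instruction is materialized and
// placed at the current insertion point in one step, and the function can
// keep returning Instruction* (IRBuilder's CreateAdd-style helpers may
// constant-fold and hand back a plain Value instead).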
BasicBlock *false_dest = block(ii->get_false_dest()); Value *cond = value(ii->get_cond()); - return builder.CreateCondBr(cond, true_dest, false_dest); + return builder.Insert(BranchInst::Create(true_dest, false_dest, cond)); } if(auto* ii = dynamic_cast(inst)){ BasicBlock *dest = block(ii->get_dest()); - return builder.CreateBr(dest); + return builder.Insert(BranchInst::Create(dest)); } if(auto* ii = dynamic_cast(inst)){ Type *ty = type(ii->get_type()->get_scalar_ty()); unsigned num_ops = ii->get_num_operands(); - return builder.CreatePHI(ty, num_ops); + return builder.Insert(PHINode::Create(ty, num_ops)); } if(auto* ii = dynamic_cast(inst)){ ir::value *ret_val = ii->get_return_value(); - return builder.CreateRet(ret_val?value(ret_val):nullptr); + return builder.Insert(ReturnInst::Create(ctx, ret_val?value(ret_val):nullptr)); } if(auto* ii = dynamic_cast(inst)){ Value *lhs = value(ii->get_operand(0)); @@ -159,27 +160,28 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::function(inst)){ Value *ptr = value(ii->get_pointer_operand()); - return builder.CreateLoad(ptr); + return builder.Insert(new LoadInst(ptr)); } // unknown instruction throw std::runtime_error("unknown conversion from ir::type to Type"); } /* convert ir::value to llvm::Value */ -Value* selection::llvm_value(ir::value *v, LLVMContext &ctx, IRBuilder<> &builder) { +Value* selection::llvm_value(ir::value *v, IRBuilder<> &builder) { assert(!v->get_type()->is_tile_ty()); + LLVMContext &ctx = builder.getContext(); if(vmap_.find(v) != vmap_.end()) return vmap_.at(v); // create operands if(auto *uu = dynamic_cast(v)) for(ir::value* u: uu->ops()) - vmap_.insert({u, llvm_value(u, ctx, builder)}); + vmap_.insert({u, llvm_value(u, builder)}); if(auto *cc = dynamic_cast(v)) return llvm_constant(cc, ctx); // instruction if(auto *ii = dynamic_cast(v)){ - auto value = [&](ir::value *x) { return llvm_value(x, ctx, builder); }; - return llvm_inst(ii, value, ctx, builder); + auto value = [&](ir::value *x) { return llvm_value(x, builder); }; + return llvm_inst(ii, value, builder); } // unknown value throw std::runtime_error("unknown conversion from ir::value to Value"); @@ -393,7 +395,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & // splat else if(dynamic_cast(ins)) { result->for_each([&](indices_t idx) { - result->set_value(idx, llvm_value(ins->get_operand(0), ctx, builder)); + result->set_value(idx, llvm_value(ins->get_operand(0), builder)); }); } // broadcast @@ -421,9 +423,9 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & if(x->get_type()->is_tile_ty()) return tmap_.at(x)->get_value(idx); else - return llvm_value(x, ctx, builder); + return llvm_value(x, builder); }; - result->set_value(idx, llvm_inst(ins, value, ctx, builder)); + result->set_value(idx, llvm_inst(ins, value, builder)); }); } } @@ -432,14 +434,12 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & } void selection::lower_instruction(ir::instruction *src, IRBuilder<> &builder) { - LLVMContext &ctx = builder.getContext(); if(src->has_tile_result_or_op()) { lower_tile_instruction(src, builder); } else { - Instruction *i = (Instruction*)llvm_value(src, ctx, builder); + Instruction *i = (Instruction*)llvm_value(src, builder); vmap_[src] = i; - builder.Insert(i); } } @@ -478,7 +478,7 @@ void selection::run(ir::module &src, Module &dst){ for(unsigned i = 0; i < phi->get_num_incoming(); i++){ ir::value *inc_val = phi->get_incoming_value(i); ir::basic_block 
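// Incoming phi values are wired up in this second pass, after every basic
// block has been lowered, because an incoming value may be defined in a
// block that is only visited later.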
*inc_block = phi->get_incoming_block(i); - Value *llvm_inc_val = llvm_value(inc_val, dst_ctx, dst_builder); + Value *llvm_inc_val = llvm_value(inc_val, dst_builder); BasicBlock *llvm_block = bmap_[inc_block]; dst_phi->addIncoming(llvm_inc_val, llvm_block); } From 5fdb27d9aec401f8a55fac6a3e7a2f6e3da3f4b0 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 6 Feb 2019 23:34:45 -0500 Subject: [PATCH 058/494] [code generation] fixed bug in tile phi nodes --- examples/matrix.cpp | 3 +++ include/codegen/selection.h | 2 -- lib/codegen/selection.cpp | 39 ++++++++++++++++++++++--------------- 3 files changed, 26 insertions(+), 18 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 9599a8456..e5ea0b1ba 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -33,6 +33,9 @@ void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K){\ fp32* pa[32, 8] = a + rx[:, newaxis] + rka[newaxis, :]*M;\ fp32* pb[32, 8] = b + ry[:, newaxis] + rkb[newaxis, :]*K;\ fp32* pc[32, 32] = c + rx[:, newaxis] + ry[newaxis, :]*M;\ + for(k = K; k >= 0; k = k - 8){\ + C = C + 1;\ + }\ *pc = C;\ }\ "; diff --git a/include/codegen/selection.h b/include/codegen/selection.h index 9a9e03135..8b39ee150 100644 --- a/include/codegen/selection.h +++ b/include/codegen/selection.h @@ -72,7 +72,6 @@ private: class selection{ typedef std::map vmap_t; - typedef std::map bmap_t; typedef std::map tmap_t; private: @@ -100,7 +99,6 @@ public: private: vmap_t vmap_; - bmap_t bmap_; tmap_t tmap_; allocation *alloc_; tune *params_; diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 213ecd6d2..bba6e8e43 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -107,7 +107,7 @@ Constant *selection::llvm_constant(ir::constant *cst, LLVMContext &ctx) { /* convert ir::instruction to llvm::Instruction */ Instruction *selection::llvm_inst(ir::instruction *inst, std::function value, IRBuilder<> &builder) { LLVMContext & ctx = builder.getContext(); - auto block = [&](ir::basic_block *x) { return bmap_.at(x); }; + auto block = [&](ir::basic_block *x) { return (BasicBlock*)vmap_.at(x); }; auto type = [&](ir::type *x) { return llvm_type(x, ctx); }; if(auto* ii = dynamic_cast(inst)){ BasicBlock *true_dest = block(ii->get_true_dest()); @@ -163,7 +163,7 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::function &builder) { if(vmap_.find(v) != vmap_.end()) return vmap_.at(v); // create operands - if(auto *uu = dynamic_cast(v)) - for(ir::value* u: uu->ops()) - vmap_.insert({u, llvm_value(u, builder)}); if(auto *cc = dynamic_cast(v)) return llvm_constant(cc, ctx); // instruction @@ -445,7 +442,6 @@ void selection::lower_instruction(ir::instruction *src, IRBuilder<> &builder) { void selection::run(ir::module &src, Module &dst){ vmap_.clear(); - bmap_.clear(); LLVMContext &dst_ctx = dst.getContext(); IRBuilder<> dst_builder(dst_ctx); // iterate over functions @@ -459,14 +455,14 @@ void selection::run(ir::module &src, Module &dst){ // create blocks for(ir::basic_block *block: fn->blocks()) { BasicBlock *dst_block = BasicBlock::Create(dst_ctx, block->get_name(), dst_fn); - bmap_[block] = dst_block; + vmap_[block] = dst_block; } // create grids - dst_builder.SetInsertPoint(bmap_[fn->blocks()[0]]); + dst_builder.SetInsertPoint((BasicBlock*)vmap_[fn->blocks()[0]]); init_grids(fn, dst_builder); // iterate through block for(ir::basic_block *block: fn->blocks()) { - dst_builder.SetInsertPoint(bmap_[block]); + dst_builder.SetInsertPoint((BasicBlock*)vmap_[block]); for(ir::instruction *i: 
block->get_inst_list()) lower_instruction(i, dst_builder); } @@ -474,13 +470,24 @@ void selection::run(ir::module &src, Module &dst){ for(ir::basic_block *block: fn->blocks()) for(ir::instruction *inst: block->get_inst_list()) if(auto *phi = dynamic_cast(inst)){ - PHINode *dst_phi = (PHINode*)vmap_.at(phi); - for(unsigned i = 0; i < phi->get_num_incoming(); i++){ - ir::value *inc_val = phi->get_incoming_value(i); - ir::basic_block *inc_block = phi->get_incoming_block(i); - Value *llvm_inc_val = llvm_value(inc_val, dst_builder); - BasicBlock *llvm_block = bmap_[inc_block]; - dst_phi->addIncoming(llvm_inc_val, llvm_block); + for(unsigned n = 0; n < phi->get_num_incoming(); n++){ + ir::value *inc_val = phi->get_incoming_value(n); + ir::basic_block *inc_block = phi->get_incoming_block(n); + BasicBlock *llvm_inc_block = (BasicBlock*)vmap_[inc_block]; + if(phi->get_type()->is_tile_ty()) { + distributed_tile *phi_tile = (distributed_tile*)tmap_.at(phi); + distributed_tile *inc_tile = (distributed_tile*)tmap_.at(inc_val); + phi_tile->for_each([&](indices_t idx){ + PHINode *llvm_phi = (PHINode*)phi_tile->get_value(idx); + Value *llvm_inc_val = inc_tile->get_value(idx); + llvm_phi->addIncoming(llvm_inc_val, llvm_inc_block); + }); + } + else { + PHINode *llvm_phi = (PHINode*)vmap_.at(phi); + Value *llvm_inc_val = vmap_.at(inc_val); + llvm_phi->addIncoming(llvm_inc_val, llvm_inc_block); + } } } } From 1b9a7a8e972ea0392e7b83a2ff92fabc2e7e1f3f Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 7 Feb 2019 17:03:19 -0500 Subject: [PATCH 059/494] [code generation] added basic shared copy/read --- examples/matrix.cpp | 4 +++ include/codegen/selection.h | 19 +++++++++---- lib/codegen/selection.cpp | 55 +++++++++++++++++++++++++++++++------ 3 files changed, 63 insertions(+), 15 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index e5ea0b1ba..51847bfb2 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -34,7 +34,11 @@ void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K){\ fp32* pb[32, 8] = b + ry[:, newaxis] + rkb[newaxis, :]*K;\ fp32* pc[32, 32] = c + rx[:, newaxis] + ry[newaxis, :]*M;\ for(k = K; k >= 0; k = k - 8){\ + fp32 a[32, 8] = *pa;\ + fp32 b[32, 8] = *pb;\ C = C + 1;\ + pa = pa + 8*M;\ + pb = pb + 8*K;\ }\ *pc = C;\ }\ diff --git a/include/codegen/selection.h b/include/codegen/selection.h index 8b39ee150..5aea5564f 100644 --- a/include/codegen/selection.h +++ b/include/codegen/selection.h @@ -37,16 +37,23 @@ public: virtual void set_value(indices_t idx, llvm::Value *v) = 0; virtual llvm::Value* get_value(indices_t idx) = 0; -private: +protected: llvm::Type *ty_; shapes_t shapes_; }; class shared_tile: public tile { +private: + llvm::Value* shared_offset(indices_t idx); + public: - using tile::tile; - void set_value(indices_t idx, llvm::Value *v) { } - llvm::Value* get_value(indices_t idx) { return nullptr; } + shared_tile(llvm::Type* ty, const shapes_t &shapes, llvm::Value* ptr, llvm::IRBuilder<> &builder); + void set_value(indices_t, llvm::Value *); + llvm::Value* get_value(indices_t idx); + +private: + llvm::Value *ptr_; + llvm::IRBuilder<> &builder_; }; class distributed_tile: public tile{ @@ -85,9 +92,9 @@ private: void create_grids(std::vector &grids, std::map &references, ir::function *fn); - void create_tile(ir::value *v, llvm::IRBuilder<> &builder, const std::map &references, std::set &seen); + void create_tile(ir::value *v, llvm::IRBuilder<> &builder, const std::map &references, std::set &seen, llvm::Value *sh_mem_ptr); void 
init_axes(ir::value *i, llvm::IRBuilder<> &builder, llvm::Value *u_thread_id, llvm::Value *u_warp_id); - void init_grids(ir::function *fn, llvm::IRBuilder<> &builder); + void init_grids(ir::function *fn, llvm::IRBuilder<> &builder, llvm::Value *sh_mem_ptr); // lowering void lower_instruction(ir::instruction *src, llvm::IRBuilder<> &builder); diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index bba6e8e43..473588363 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -1,5 +1,6 @@ #include "codegen/selection.h" #include "codegen/tune.h" +#include "codegen/allocation.h" #include "llvm/IR/Module.h" #include "llvm/IR/IRBuilder.h" #include "ir/context.h" @@ -53,6 +54,28 @@ void distributed_tile::for_each(std::function fn) { fn(idx.first); } +/* Shared Tile */ +Value* shared_tile::shared_offset(indices_t idx) { + Value *result = builder_.getInt32(0); + result = builder_.CreateAdd(result, idx[0]); + for(size_t i = 1; i < idx.size(); i++) + result = builder_.CreateAdd(result, builder_.CreateMul(idx[i], builder_.getInt32(shapes_[i-1]))); + return result; +} + +shared_tile::shared_tile(Type *ty, const shapes_t &shapes, Value *ptr, llvm::IRBuilder<> &builder): tile(ty, shapes), ptr_(ptr), builder_(builder) { + +} + +void shared_tile::set_value(indices_t idx, Value *value) { + Value *ptr = builder_.CreateGEP(ptr_, shared_offset(idx)); + builder_.CreateStore(value, ptr); +} + +Value* shared_tile::get_value(indices_t idx) { + Value *ptr = builder_.CreateGEP(ptr_, shared_offset(idx)); + return builder_.CreateLoad(ptr); +} /* convert ir::type to Type */ Type *selection::llvm_type(ir::type *ty, LLVMContext &ctx) { @@ -281,19 +304,21 @@ void selection::create_grids(std::vector &grids, void selection::create_tile(ir::value *v, IRBuilder<> &builder, const std::map& references, - std::set &seen) { + std::set &seen, Value *sh_mem_ptr) { if(!v->get_type()->is_tile_ty() || !seen.insert(v).second) return; if(auto *user = dynamic_cast(v)) for(ir::value *op: user->ops()) - create_tile(op, builder, references, seen); + create_tile(op, builder, references, seen, sh_mem_ptr); LLVMContext &ctx = builder.getContext(); - bool is_shared = dynamic_cast(v); const auto& shapes = v->get_type()->get_tile_shapes(); Type* ty = llvm_type(v->get_type()->get_scalar_ty(), ctx); // create shared tile + bool is_shared = dynamic_cast(v); if(is_shared){ - tmap_.insert({v, new shared_tile(ty, shapes)}); + size_t offset = alloc_->get_offset(v); + Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset)); + tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)}); } // create distributed tile else { @@ -318,7 +343,7 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, } } -void selection::init_grids(ir::function *fn, IRBuilder<> &builder){ +void selection::init_grids(ir::function *fn, IRBuilder<> &builder, Value *sh_mem_ptr){ // fetch linear ID Module *mod = builder.GetInsertBlock()->getParent()->getParent(); Function *get_thread_id = Intrinsic::getDeclaration(mod, Intrinsic::nvvm_read_ptx_sreg_tid_x); @@ -338,7 +363,7 @@ void selection::init_grids(ir::function *fn, IRBuilder<> &builder){ for(ir::instruction *i: block->get_inst_list()){ if(!i->get_type()->is_tile_ty()) continue; - create_tile(i, builder, references, seen); + create_tile(i, builder, references, seen, sh_mem_ptr); } } @@ -411,7 +436,10 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & } // copy to shared else if(dynamic_cast(ins)) { - + distributed_tile* in = 
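// copy_to_shared lowers to one shared-memory store per distributed index:
// shared_tile::set_value() linearizes the index (for a 2D tile,
// idx[0] + idx[1]*shapes[0], first dimension contiguous) and emits a GEP
// plus store on the tile's shared base pointer.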
(distributed_tile*)tmap_.at(ins->get_operand(0)); + in->for_each([&](indices_t idx){ + ti->set_value(idx, in->get_value(idx)); + }); } // element-wise else { @@ -444,6 +472,7 @@ void selection::run(ir::module &src, Module &dst){ vmap_.clear(); LLVMContext &dst_ctx = dst.getContext(); IRBuilder<> dst_builder(dst_ctx); + // iterate over functions for(ir::function *fn: src.get_function_list()) { // create LLVM function @@ -457,9 +486,17 @@ void selection::run(ir::module &src, Module &dst){ BasicBlock *dst_block = BasicBlock::Create(dst_ctx, block->get_name(), dst_fn); vmap_[block] = dst_block; } - // create grids dst_builder.SetInsertPoint((BasicBlock*)vmap_[fn->blocks()[0]]); - init_grids(fn, dst_builder); + // allocate shared memory + Type *int_8_ty = Type::getInt8Ty(dst_ctx); + ArrayType *array_ty = ArrayType::get(int_8_ty, alloc_->get_allocated_size()); + Type *ptr_ty = PointerType::get(int_8_ty, 3); + GlobalVariable *sh_mem_array = + new GlobalVariable(*dst_fn->getParent(), array_ty, false, GlobalVariable::InternalLinkage, + nullptr, "__shared_ptr", nullptr, GlobalVariable::NotThreadLocal, 3); + Value *sh_mem_ptr = dst_builder.CreateBitCast(sh_mem_array, ptr_ty); + // create grids + init_grids(fn, dst_builder, sh_mem_ptr); // iterate through block for(ir::basic_block *block: fn->blocks()) { dst_builder.SetInsertPoint((BasicBlock*)vmap_[block]); From dd3527785824935b1a008b3036be1d1c1ee1e262 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 7 Feb 2019 22:42:54 -0500 Subject: [PATCH 060/494] [examples] added basic skeleton to generate matrix multiplication PTX --- CMakeLists.txt | 4 ++- examples/matrix.cpp | 57 +++++++++++++++++++++++++++++++------ lib/codegen/selection.cpp | 18 ++++++++++++ lib/codegen/shared_copy.cpp | 6 ++-- 4 files changed, 74 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9d49af3e4..b4c28cebc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,9 +12,10 @@ include_directories(${BISON_Parser_INCLUDE_DIRECTORIES}) # LLVM find_package(LLVM REQUIRED CONFIG) +message(STATUS ${LLVM_INCLUDE_DIRS}) include_directories(${LLVM_INCLUDE_DIRS}) add_definitions(${LLVM_DEFINITIONS}) -llvm_map_components_to_libnames(llvm_libs support core irreader) +llvm_map_components_to_libnames(llvm_libs support core irreader MC NVPTXCodeGen all) #Default build type if(NOT CMAKE_BUILD_TYPE) @@ -33,6 +34,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${LLVM_CXXFLAGS} -std=c++11") # TDL file(GLOB_RECURSE LIBTDL_SRC lib/*.cpp) add_library(tdl SHARED ${LIBTDL_SRC} ${BISON_Parser_OUTPUTS} ${FLEX_Lexer_OUTPUTS}) +message(STATUS ${llvm_libs}) target_link_libraries(tdl ${llvm_libs}) # Examples diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 51847bfb2..97af5f66b 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -13,6 +13,12 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/LegacyPassManager.h" typedef struct yy_buffer_state * YY_BUFFER_STATE; extern int yyparse(); @@ -36,7 +42,7 @@ void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K){\ for(k = K; k >= 0; k = k - 8){\ fp32 a[32, 8] = *pa;\ fp32 b[32, 8] = *pb;\ - C = C + 1;\ + C = dot(a,b,C);\ pa = pa + 8*M;\ pb = pb + 8*K;\ }\ @@ -44,6 +50,16 @@ void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, 
int32 K){\ }\ "; +static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) { + std::string Ret = "e"; + if (!is64Bit) + Ret += "-p:32:32"; + else if (UseShortPointers) + Ret += "-p3:32:32-p4:32:32-p5:32:32"; + Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64"; + return Ret; +} + int main() { YY_BUFFER_STATE buffer = yy_scan_string(src); yyparse(); @@ -86,12 +102,37 @@ int main() { liveness.run(module); allocation.run(); selection.run(module, llvm_module); -// std::vector params = tune.get_params(module); -// std::cout << params.size() << std::endl; -// selection.run(module, llvm_module); - // print LLVM program - llvm::PrintModulePass print(llvm::outs()); - llvm::AnalysisManager analysis; - print.run(llvm_module, analysis); + +// // print LLVM program +// llvm::PrintModulePass print(llvm::outs()); +// llvm::AnalysisManager analysis; +// print.run(llvm_module, analysis); + + // create target machine + { + llvm::InitializeAllTargetInfos(); + llvm::InitializeAllTargets(); + llvm::InitializeAllTargetMCs(); + llvm::InitializeAllAsmParsers(); + llvm::InitializeAllAsmPrinters(); + + llvm_module.setTargetTriple("nvptx64-nvidia-cuda"); + std::string error; + auto target = llvm::TargetRegistry::lookupTarget(llvm_module.getTargetTriple(), error); + llvm::TargetMachine *machine = target->createTargetMachine(llvm_module.getTargetTriple(), "sm_52", "", + llvm::TargetOptions(), llvm::Reloc::Model(), + llvm::None, llvm::CodeGenOpt::Aggressive); + llvm_module.setDataLayout(computeDataLayout(true, true)); + + // emit machine code + llvm::legacy::PassManager pass; + llvm::SmallVector buffer; + llvm::raw_svector_ostream stream(buffer); + machine->addPassesToEmitFile(pass, stream, nullptr, llvm::TargetMachine::CGFT_AssemblyFile); + pass.run(llvm_module); + std::string src(buffer.begin(), buffer.end()); + std::cout << src << std::endl; + } + return 0; } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 473588363..1872ea5e4 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -441,6 +441,24 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & ti->set_value(idx, in->get_value(idx)); }); } + // matrix multiplication + else if(dynamic_cast(ins)) { + ir::value *A = ins->get_operand(0); + ir::value *B = ins->get_operand(1); + ir::value *C = ins->get_operand(2); + result->for_each([&](indices_t idx){ + Value *res = tmap_.at(C)->get_value(idx); + unsigned NK = A->get_type()->get_tile_shapes()[1]; + for(unsigned K = 0; K < NK; ++K){ + indices_t a_idx = {idx[0], builder.getInt32(K)}; + indices_t b_idx = {idx[1], builder.getInt32(K)}; + Value *a = tmap_.at(A)->get_value(a_idx); + Value *b = tmap_.at(B)->get_value(b_idx); + res = builder.CreateAdd(res, builder.CreateMul(a, b)); + } + result->set_value(idx, res); + }); + } // element-wise else { result->for_each([&](indices_t idx){ diff --git a/lib/codegen/shared_copy.cpp b/lib/codegen/shared_copy.cpp index d08a4a145..a6c64e08d 100644 --- a/lib/codegen/shared_copy.cpp +++ b/lib/codegen/shared_copy.cpp @@ -17,10 +17,12 @@ void place_shared_copy::run(ir::module &mod) { builder.set_insert_point(i); ir::value *x = i->get_operand(0); ir::value *y = i->get_operand(1); - ir::value *rx = builder.create_copy_to_shared(x); - ir::value *ry = builder.create_copy_to_shared(y); + ir::instruction *rx = (ir::instruction*)builder.create_copy_to_shared(x); + ir::instruction *ry = (ir::instruction*)builder.create_copy_to_shared(y); x->replace_all_uses_with(rx); y->replace_all_uses_with(ry); + 
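// The two set_operand calls below undo a side effect of the RAUW above:
// replace_all_uses_with also rewrites the operand inside the freshly
// created copy_to_shared instructions, leaving rx defined in terms of
// itself; resetting operand 0 restores rx = copy_to_shared(x).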
rx->set_operand(0, x); + ry->set_operand(0, y); } } From 90c0474974eea62feddbee3623c4209aa9593c62 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 8 Feb 2019 12:54:20 -0500 Subject: [PATCH 061/494] [examples] improved template for testing matrix multiplication --- examples/CMakeLists.txt | 2 +- examples/matrix.cpp | 244 ++++++++++++++++++++++++++------------ lib/ast/lowering.cpp | 2 +- lib/codegen/selection.cpp | 27 +++-- 4 files changed, 190 insertions(+), 85 deletions(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 53d780a94..8419125c2 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -2,5 +2,5 @@ foreach(PROG matrix) add_executable(${PROG} ${PROG}.cpp) set_target_properties(${PROG} PROPERTIES OUTPUT_NAME ${PROG}) include_directories(/usr/local/cuda/include/) - target_link_libraries(${PROG} tdl) + target_link_libraries(${PROG} tdl cuda) endforeach(PROG) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 97af5f66b..c1476f330 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -1,5 +1,6 @@ #include #include +#include "cuda.h" #include "ast/ast.h" #include "ir/context.h" #include "ir/module.h" @@ -42,7 +43,7 @@ void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K){\ for(k = K; k >= 0; k = k - 8){\ fp32 a[32, 8] = *pa;\ fp32 b[32, 8] = *pb;\ - C = dot(a,b,C);\ + C = C + 1;\ pa = pa + 8*M;\ pb = pb + 8*K;\ }\ @@ -50,7 +51,7 @@ void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K){\ }\ "; -static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) { +static std::string compute_data_layout(bool is64Bit, bool UseShortPointers) { std::string Ret = "e"; if (!is64Bit) Ret += "-p:32:32"; @@ -60,79 +61,172 @@ static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) { return Ret; } -int main() { - YY_BUFFER_STATE buffer = yy_scan_string(src); - yyparse(); - yy_delete_buffer(buffer); - translation_unit *program = ast_root; - tdl::ir::context context; - tdl::ir::module module("matrix", context); - program->codegen(&module); - llvm::LLVMContext llvm_context; - llvm::Module llvm_module("test", llvm_context); - // lowering passes - tdl::codegen::place_shared_copy shared; - tdl::codegen::tune tune; - tdl::codegen::liveness liveness; - tdl::codegen::allocation allocation(&liveness); - tdl::codegen::selection selection(&allocation, &tune); - tune.run(module); - std::vector params = { - // asm - 2, 8, 1, - // bsn - 4, 4, 1, - // pa - 2, 4, 1, - // pb - 1, 8, 1, - }; - std::map> errors; - unsigned i = 0; - std::cout << tune.get_params(module).size() << std::endl; - for(unsigned *x: tune.get_params(module)) - *x = params[i++]; - tune.check_constraints(module, errors); - std::cout << "errors: " << errors.size() << std::endl; - for(auto &x: errors){ - for(auto &e: x.second) - std::cout << e << std::endl; - } - shared.run(module); - liveness.run(module); - allocation.run(); - selection.run(module, llvm_module); +static std::string generate_machine_code(llvm::Module &module, const std::string &target_triple, const std::string &data_layout) { + llvm::InitializeAllTargetInfos(); + llvm::InitializeAllTargets(); + llvm::InitializeAllTargetMCs(); + llvm::InitializeAllAsmParsers(); + llvm::InitializeAllAsmPrinters(); -// // print LLVM program -// llvm::PrintModulePass print(llvm::outs()); -// llvm::AnalysisManager analysis; -// print.run(llvm_module, analysis); + module.setTargetTriple(target_triple); + std::string error; + auto target = llvm::TargetRegistry::lookupTarget(module.getTargetTriple(), 
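// lookupTarget() resolves the backend that the InitializeAll* calls above
// registered for this triple; on failure it returns null and writes a
// diagnostic into the `error` string passed as its second argument.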
error); + llvm::TargetMachine *machine = target->createTargetMachine(module.getTargetTriple(), "sm_52", "", + llvm::TargetOptions(), llvm::Reloc::Model(), + llvm::None, llvm::CodeGenOpt::Aggressive); + module.setDataLayout(data_layout); - // create target machine - { - llvm::InitializeAllTargetInfos(); - llvm::InitializeAllTargets(); - llvm::InitializeAllTargetMCs(); - llvm::InitializeAllAsmParsers(); - llvm::InitializeAllAsmPrinters(); - - llvm_module.setTargetTriple("nvptx64-nvidia-cuda"); - std::string error; - auto target = llvm::TargetRegistry::lookupTarget(llvm_module.getTargetTriple(), error); - llvm::TargetMachine *machine = target->createTargetMachine(llvm_module.getTargetTriple(), "sm_52", "", - llvm::TargetOptions(), llvm::Reloc::Model(), - llvm::None, llvm::CodeGenOpt::Aggressive); - llvm_module.setDataLayout(computeDataLayout(true, true)); - - // emit machine code - llvm::legacy::PassManager pass; - llvm::SmallVector buffer; - llvm::raw_svector_ostream stream(buffer); - machine->addPassesToEmitFile(pass, stream, nullptr, llvm::TargetMachine::CGFT_AssemblyFile); - pass.run(llvm_module); - std::string src(buffer.begin(), buffer.end()); - std::cout << src << std::endl; - } - - return 0; + // emit machine code + llvm::legacy::PassManager pass; + llvm::SmallVector buffer; + llvm::raw_svector_ostream stream(buffer); + machine->addPassesToEmitFile(pass, stream, nullptr, llvm::TargetMachine::CGFT_AssemblyFile); + pass.run(module); + std::string src(buffer.begin(), buffer.end()); + return src; +} + +static void __checkCudaErrors( CUresult err, const char *file, const int line ) +{ + if( CUDA_SUCCESS != err) { + fprintf(stderr, + "CUDA Driver API error = %04d from file <%s>, line %i.\n", + err, file, line ); + exit(-1); + } +} +#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__) + +static void compile_machine_code(CUdevice &device, CUcontext &context, CUmodule &module, + CUfunction &function, CUstream &stream, int &major, int &minor, + const std::string &src, const std::string &name) { + int numDevices; + + // Initialize + checkCudaErrors(cuInit(0)); + checkCudaErrors(cuDeviceGetCount(&numDevices)); + checkCudaErrors(cuDeviceGet(&device, 0)); + checkCudaErrors(cuDeviceComputeCapability(&major, &minor, device)); + checkCudaErrors(cuCtxCreate(&context, 0, device)); + checkCudaErrors(cuStreamCreate(&stream, 0)); + + // Compile program + CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; + unsigned int errbufsize = 8096; + std::string errbuf(errbufsize, 0); + const void *cpterr = static_cast(errbuf.data()); + void *pterr = const_cast(cpterr); + void* optval[] = {(void*)(uintptr_t)errbufsize, pterr}; + int err = cuModuleLoadDataEx(&module, src.data(), 2, opt, optval); + if(err != CUDA_SUCCESS){ + std::cerr << "Compilation Failed! 
Log: " << std::endl; + std::cerr << errbuf << std::endl; + } + + // Get function + checkCudaErrors(cuModuleGetFunction(&function, module, name.c_str())); +} + +int main() { + // create AST from Triton-C source + YY_BUFFER_STATE buffer = yy_scan_string(src); + yyparse(); + yy_delete_buffer(buffer); + translation_unit *program = ast_root; + + // create Triton-IR from AST + tdl::ir::context context; + tdl::ir::module module("matrix", context); + program->codegen(&module); + llvm::LLVMContext llvm_context; + llvm::Module llvm_module("test", llvm_context); + + // create passes + tdl::codegen::place_shared_copy shared; + tdl::codegen::tune tune; + tdl::codegen::liveness liveness; + tdl::codegen::allocation allocation(&liveness); + tdl::codegen::selection selection(&allocation, &tune); + + // tuning parameters + tune.run(module); + std::vector params = { + // asm + 2, 8, 1, + // bsn + 4, 4, 1, + // pa + 2, 4, 1, + // pb + 1, 8, 1, + }; + std::map> errors; + unsigned i = 0; + std::cout << tune.get_params(module).size() << std::endl; + for(unsigned *x: tune.get_params(module)) + *x = params[i++]; + tune.check_constraints(module, errors); + std::cout << "errors: " << errors.size() << std::endl; + for(auto &x: errors){ + for(auto &e: x.second) + std::cout << e << std::endl; + } + + // run passes + shared.run(module); + liveness.run(module); + allocation.run(); + selection.run(module, llvm_module); + // llvm source + llvm::PrintModulePass print(llvm::outs()); + llvm::AnalysisManager analysis; + print.run(llvm_module, analysis); + + // generate machine code + std::string src = generate_machine_code(llvm_module, "nvptx64-nvidia-cuda", compute_data_layout(true, true)); + std::cout << src << std::endl; + + // compile machine code + CUdevice cu_device; + CUcontext cu_context; + CUmodule cu_module; + CUfunction cu_kernel; + CUstream cu_stream; + int major, minor; + compile_machine_code(cu_device, cu_context, cu_module, cu_kernel, cu_stream, major, minor, src, "test"); + + // execute machine code + // Allocate buffers + typedef float numeric_t; + size_t M = 256, N = 256, K = 256; + std::vector c(M*N); + std::vector a(M*K); + std::vector b(K*N); + for(size_t i = 0; i < a.size(); i++) + a[i] = (float)rand() / RAND_MAX; + for(size_t i = 0; i < b.size(); i++) + b[i] = (float)rand() / RAND_MAX; + for(size_t i = 0; i < c.size(); i++) + c[i] = 0; + CUdeviceptr d_a, d_b, d_c; + checkCudaErrors(cuMemAlloc(&d_a, sizeof(numeric_t) * a.size())); + checkCudaErrors(cuMemAlloc(&d_b, sizeof(numeric_t) * b.size())); + checkCudaErrors(cuMemAlloc(&d_c, sizeof(numeric_t) * c.size())); + // Copy buffers + checkCudaErrors(cuMemcpyHtoD(d_a, a.data(), sizeof(numeric_t) * a.size())); + checkCudaErrors(cuMemcpyHtoD(d_b, b.data(), sizeof(numeric_t) * b.size())); + checkCudaErrors(cuMemcpyHtoD(d_c, c.data(), sizeof(numeric_t) * c.size())); + // Launch kernel + void *args[] = { &d_a, &d_b, &d_c, &M, &N, &K}; + int num_regs; + cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, cu_kernel); + unsigned TM = params[0]*params[1]; + unsigned TN = params[3]*params[4]; + unsigned nthreads = params[1]*params[2]*params[7]*params[8]; + checkCudaErrors(cuLaunchKernel(cu_kernel, M/TM, N/TN, 1, nthreads, 1, 1, 0, cu_stream, args, NULL)); + checkCudaErrors(cuStreamSynchronize(cu_stream)); + // Write back + checkCudaErrors(cuMemcpyDtoH(c.data(), d_c, sizeof(numeric_t) * c.size())); + + return 0; } diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 5f3b37170..c9d8c6ff8 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ 
-78,7 +78,7 @@ void node::implicit_cast(ir::builder &builder, ir::value *&lhs, ir::value *&rhs, // Both operands are integers else if(left_ty->is_integer_ty() && right_ty->is_integer_ty()){ is_int = true; - is_signed = false; + is_signed = true; // always signed for now if(left_ty->get_integer_bitwidth() != right_ty->get_integer_bitwidth()){ ir::value *&to_convert = (left_ty->get_integer_bitwidth() > right_ty->get_integer_bitwidth())?rhs:lhs; ir::type *dst_ty = (to_convert==lhs)?right_ty:left_ty; diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 1872ea5e4..cb22d972c 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -495,7 +495,15 @@ void selection::run(ir::module &src, Module &dst){ for(ir::function *fn: src.get_function_list()) { // create LLVM function FunctionType *fn_ty = (FunctionType*)llvm_type(fn->get_fn_type(), dst_ctx); - Function *dst_fn = Function::Create(fn_ty, Function::ExternalLinkage, "kernel", &dst); + Function *dst_fn = Function::Create(fn_ty, Function::ExternalLinkage, fn->get_name(), &dst); + // Set metadata + llvm::Metadata *md_args[] = { + llvm::ValueAsMetadata::get(dst_fn), + llvm::MDString::get(dst_ctx, "kernel"), + llvm::ValueAsMetadata::get(dst_builder.getInt32(1)) + }; + dst.getOrInsertNamedMetadata("nvvm.annotations")->addOperand(llvm::MDNode::get(dst_ctx, md_args)); + // map parameters for(unsigned i = 0; i < fn->args().size(); i++) vmap_[fn->args()[i]] = &*(dst_fn->arg_begin() + i); @@ -506,13 +514,16 @@ void selection::run(ir::module &src, Module &dst){ } dst_builder.SetInsertPoint((BasicBlock*)vmap_[fn->blocks()[0]]); // allocate shared memory - Type *int_8_ty = Type::getInt8Ty(dst_ctx); - ArrayType *array_ty = ArrayType::get(int_8_ty, alloc_->get_allocated_size()); - Type *ptr_ty = PointerType::get(int_8_ty, 3); - GlobalVariable *sh_mem_array = - new GlobalVariable(*dst_fn->getParent(), array_ty, false, GlobalVariable::InternalLinkage, - nullptr, "__shared_ptr", nullptr, GlobalVariable::NotThreadLocal, 3); - Value *sh_mem_ptr = dst_builder.CreateBitCast(sh_mem_array, ptr_ty); + Value *sh_mem_ptr = nullptr; + if(unsigned alloc_size = alloc_->get_allocated_size()){ + Type *int_8_ty = Type::getInt8Ty(dst_ctx); + ArrayType *array_ty = ArrayType::get(int_8_ty, alloc_size); + Type *ptr_ty = PointerType::get(int_8_ty, 3); + GlobalVariable *sh_mem_array = + new GlobalVariable(*dst_fn->getParent(), array_ty, false, GlobalVariable::InternalLinkage, + nullptr, "__shared_ptr", nullptr, GlobalVariable::NotThreadLocal, 3); + sh_mem_ptr = dst_builder.CreateBitCast(sh_mem_array, ptr_ty); + } // create grids init_grids(fn, dst_builder, sh_mem_ptr); // iterate through block From 937bc464a32e8bd573feea4aef165807ca7edcb3 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 8 Feb 2019 13:15:04 -0500 Subject: [PATCH 062/494] [examples] debugging matrix multiplication code --- examples/matrix.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index c1476f330..1283e651d 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -40,7 +40,7 @@ void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K){\ fp32* pa[32, 8] = a + rx[:, newaxis] + rka[newaxis, :]*M;\ fp32* pb[32, 8] = b + ry[:, newaxis] + rkb[newaxis, :]*K;\ fp32* pc[32, 32] = c + rx[:, newaxis] + ry[newaxis, :]*M;\ - for(k = K; k >= 0; k = k - 8){\ + for(k = K; k > 0; k = k - 8){\ fp32 a[32, 8] = *pa;\ fp32 b[32, 8] = *pb;\ C = C + 1;\ @@ -228,5 +228,8 @@ int main() { // Write back 
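+  // with K = 256 and a step of 8, the kernel's inner loop runs 32 times,
+  // and since the loop body currently computes C = C + 1 per iteration,
+  // every element copied back into c should equal 32; a fuller check
+  // (a sketch only, assuming a hypothetical host-side reference ref) would be:
+  //   for(size_t i = 0; i < M*N; i++) assert(std::abs(c[i] - ref[i]) < 1e-4);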
checkCudaErrors(cuMemcpyDtoH(c.data(), d_c, sizeof(numeric_t) * c.size())); + for(size_t i = 0; i < M*N; i++) + if(c[i] == 32) + std::cout << i << " " << "success" << std::endl; return 0; } From a9d219cdf5689f95819c29b28eea7aef2a097575 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 8 Feb 2019 14:47:56 -0500 Subject: [PATCH 063/494] [driver] added driver source code from isaac repository --- include/driver/backend.h | 116 + include/driver/buffer.h | 54 + include/driver/context.h | 66 + include/driver/cublas.h | 229 + include/driver/device.h | 98 + include/driver/dispatch.h | 258 + include/driver/error.h | 228 + include/driver/event.h | 49 + include/driver/handle.h | 82 + include/driver/kernel.h | 68 + include/driver/module.h | 61 + include/driver/platform.h | 54 + include/driver/stream.h | 82 + include/external/CUDA/builtin_types.h | 64 + include/external/CUDA/channel_descriptor.h | 412 + include/external/CUDA/crt/host_config.h | 266 + include/external/CUDA/crt/host_defines.h | 216 + include/external/CUDA/cuComplex.h | 338 + include/external/CUDA/cublas.h | 565 + include/external/CUDA/cublas_api.h | 2977 ++++ include/external/CUDA/cublas_v2.h | 274 + include/external/CUDA/cuda.h | 12185 ++++++++++++++++ .../external/CUDA/cuda_device_runtime_api.h | 248 + include/external/CUDA/cuda_fp16.h | 1969 +++ include/external/CUDA/cuda_fp16.hpp | 1797 +++ include/external/CUDA/cuda_runtime.h | 2040 +++ include/external/CUDA/cuda_runtime_api.h | 7422 ++++++++++ include/external/CUDA/cudnn.h | 1805 +++ include/external/CUDA/cusparse.h | 6257 ++++++++ include/external/CUDA/device_types.h | 69 + include/external/CUDA/driver_functions.h | 145 + include/external/CUDA/driver_types.h | 1610 ++ include/external/CUDA/host_config.h | 50 + include/external/CUDA/host_defines.h | 50 + include/external/CUDA/library_types.h | 80 + include/external/CUDA/nvml.h | 5628 +++++++ include/external/CUDA/nvrtc.h | 525 + include/external/CUDA/surface_types.h | 119 + include/external/CUDA/texture_types.h | 217 + include/external/CUDA/vector_functions.h | 177 + include/external/CUDA/vector_functions.hpp | 318 + include/external/CUDA/vector_types.h | 425 + include/tools/sys/getenv.hpp | 56 + include/tools/sys/mkdir.hpp | 68 + lib/driver/backend.cpp | 196 + lib/driver/buffer.cpp | 60 + lib/driver/context.cpp | 99 + lib/driver/device.cpp | 197 + lib/driver/dispatch.cpp | 363 + lib/driver/error.cpp | 155 + lib/driver/event.cpp | 40 + lib/driver/handle.cpp | 66 + lib/driver/kernel.cpp | 67 + lib/driver/module.cpp | 118 + lib/driver/platform.cpp | 56 + lib/driver/stream.cpp | 95 + 56 files changed, 51329 insertions(+) create mode 100755 include/driver/backend.h create mode 100755 include/driver/buffer.h create mode 100755 include/driver/context.h create mode 100755 include/driver/cublas.h create mode 100755 include/driver/device.h create mode 100755 include/driver/dispatch.h create mode 100755 include/driver/error.h create mode 100755 include/driver/event.h create mode 100755 include/driver/handle.h create mode 100755 include/driver/kernel.h create mode 100755 include/driver/module.h create mode 100755 include/driver/platform.h create mode 100755 include/driver/stream.h create mode 100755 include/external/CUDA/builtin_types.h create mode 100755 include/external/CUDA/channel_descriptor.h create mode 100644 include/external/CUDA/crt/host_config.h create mode 100644 include/external/CUDA/crt/host_defines.h create mode 100755 include/external/CUDA/cuComplex.h create mode 100755 include/external/CUDA/cublas.h create mode 100755 
include/external/CUDA/cublas_api.h create mode 100644 include/external/CUDA/cublas_v2.h create mode 100755 include/external/CUDA/cuda.h create mode 100755 include/external/CUDA/cuda_device_runtime_api.h create mode 100755 include/external/CUDA/cuda_fp16.h create mode 100755 include/external/CUDA/cuda_fp16.hpp create mode 100755 include/external/CUDA/cuda_runtime.h create mode 100755 include/external/CUDA/cuda_runtime_api.h create mode 100755 include/external/CUDA/cudnn.h create mode 100644 include/external/CUDA/cusparse.h create mode 100755 include/external/CUDA/device_types.h create mode 100755 include/external/CUDA/driver_functions.h create mode 100755 include/external/CUDA/driver_types.h create mode 100755 include/external/CUDA/host_config.h create mode 100755 include/external/CUDA/host_defines.h create mode 100755 include/external/CUDA/library_types.h create mode 100755 include/external/CUDA/nvml.h create mode 100755 include/external/CUDA/nvrtc.h create mode 100755 include/external/CUDA/surface_types.h create mode 100755 include/external/CUDA/texture_types.h create mode 100755 include/external/CUDA/vector_functions.h create mode 100755 include/external/CUDA/vector_functions.hpp create mode 100755 include/external/CUDA/vector_types.h create mode 100755 include/tools/sys/getenv.hpp create mode 100755 include/tools/sys/mkdir.hpp create mode 100755 lib/driver/backend.cpp create mode 100755 lib/driver/buffer.cpp create mode 100755 lib/driver/context.cpp create mode 100755 lib/driver/device.cpp create mode 100755 lib/driver/dispatch.cpp create mode 100755 lib/driver/error.cpp create mode 100755 lib/driver/event.cpp create mode 100755 lib/driver/handle.cpp create mode 100755 lib/driver/kernel.cpp create mode 100755 lib/driver/module.cpp create mode 100755 lib/driver/platform.cpp create mode 100755 lib/driver/stream.cpp diff --git a/include/driver/backend.h b/include/driver/backend.h new file mode 100755 index 000000000..f71e2b424 --- /dev/null +++ b/include/driver/backend.h @@ -0,0 +1,116 @@ +/* Copyright 2015-2017 Philippe Tillet +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, +* subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +#ifndef TDL_INCLUDE_DRIVER_BACKEND_H +#define TDL_INCLUDE_DRIVER_BACKEND_H + +#include +#include +#include + + +namespace tdl +{ +namespace driver +{ + +class Buffer; +class Stream; +class Device; +class Context; +class Platform; +class Module; +class Kernel; + +struct backend +{ + + class modules + { + friend class backend; + public: + static void release(); + static Module& get(Stream const & stream, std::string const & name, std::string const &src); + private: + static std::map, Module * > cache_; + }; + + class kernels + { + friend class backend; + public: + static void release(); + static Kernel & get(Module const & program, std::string const & name); + private: + static std::map, Kernel * > cache_; + }; + + class contexts + { + friend class backend; + private: + static void init(std::vector const &); + static void release(); + public: + static Context const & get_default(); + template + static Context const & import(T context) + { + for(driver::Context const * x: cache_) + if((T)*x==context) + return *x; + cache_.emplace_back(new Context(context, false)); + return *cache_.back(); + } + static void get(std::list &); + private: + static std::list cache_; + }; + + class streams + { + friend class backend; + private: + static void init(std::list const &); + static void release(); + public: + static void get(Context const &, std::vector &streams); + static Stream & get(Context const &, unsigned int id = 0); + static Stream & get_default(); + private: + static std::map< Context, std::vector > cache_; + }; + + static void init(); + static void release(); + + static std::vector devices(); + static std::vector platforms(); + static void synchronize(Context const &); + + static unsigned int default_device; +}; + +} +} + +#endif diff --git a/include/driver/buffer.h b/include/driver/buffer.h new file mode 100755 index 000000000..475cf2273 --- /dev/null +++ b/include/driver/buffer.h @@ -0,0 +1,54 @@ +/* Copyright 2015-2017 Philippe Tillet +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, +* subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +#ifndef TDL_INCLUDE_DRIVER_BUFFER_H +#define TDL_INCLUDE_DRIVER_BUFFER_H + +#include "driver/handle.h" +#include "driver/context.h" + +namespace tdl +{ +namespace driver +{ + +class Stream; + +// Buffer +class Buffer: public HandleInterface +{ +public: + Buffer(Context const & context, size_t size); + Buffer(Context const & context, CUdeviceptr cu, bool take_ownership); + void set_zero(Stream const & queue, size_t size); + Handle const & cu() const; + Handle & cu(); + +private: + Context context_; + Handle cu_; +}; + +} +} + +#endif diff --git a/include/driver/context.h b/include/driver/context.h new file mode 100755 index 000000000..bd98faded --- /dev/null +++ b/include/driver/context.h @@ -0,0 +1,66 @@ +/* Copyright 2015-2017 Philippe Tillet +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, +* subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +#ifndef TDL_INCLUDE_DRIVER_CONTEXT_H +#define TDL_INCLUDE_DRIVER_CONTEXT_H + +#include "driver/device.h" +#include "driver/handle.h" + +namespace tdl +{ +namespace driver +{ + +class Context: public HandleInterface +{ +private: + static std::string get_cache_path(); + static CUdevice device(CUcontext); + +public: + //Constructors + explicit Context(CUcontext context, bool take_ownership = true); + explicit Context(Device const & device); + //Accessors + Device const & device() const; + std::string const & cache_path() const; + Handle const & cu() const; + +private: + Handle cu_; + Device device_; + std::string cache_path_; +}; + +class ContextSwitcher{ +public: + ContextSwitcher(Context const & ctx); + ~ContextSwitcher(); +private: + Context const & ctx_; +}; + +} +} + +#endif diff --git a/include/driver/cublas.h b/include/driver/cublas.h new file mode 100755 index 000000000..9e1688a97 --- /dev/null +++ b/include/driver/cublas.h @@ -0,0 +1,229 @@ +/* Copyright 2015-2017 Philippe Tillet +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, +* subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +#ifndef TDL_INCLUDE_DRIVER_CUBLAS_H +#define TDL_INCLUDE_DRIVER_CUBLAS_H + +#include "isaac/templates/common.hpp" +#include "driver/dispatch.h" +#include "driver/buffer.h" +#include "driver/stream.h" +#include "driver/backend.h" +#include "driver/error.h" +#include "tools/bench.hpp" +#include "tools/collections.hpp" + +namespace tdl +{ +namespace driver +{ + +enum cublasStrategy_t{ + CUBLAS_PREFER_FASTEST, + CUBLAS_HEURISTICS +}; + + +static const std::vector cublasAlgorithms = { + CUBLAS_GEMM_DFALT, CUBLAS_GEMM_ALGO0, CUBLAS_GEMM_ALGO1, CUBLAS_GEMM_ALGO2, CUBLAS_GEMM_ALGO3, + CUBLAS_GEMM_ALGO4, CUBLAS_GEMM_ALGO5, CUBLAS_GEMM_ALGO6, CUBLAS_GEMM_ALGO7 +}; + +static const std::map cudtype = {{FLOAT_TYPE, CUDA_R_32F}, {DOUBLE_TYPE,CUDA_R_64F}}; +static const std::map cuop = {{'N', CUBLAS_OP_N}, {'T', CUBLAS_OP_T}}; + +inline cublasGemmAlgo_t cublasGemmFastest(Stream& stream, cublasHandle_t handle, cudaDataType cudt, cublasOperation_t AT, cublasOperation_t BT, int32_t M, int32_t N, int32_t K, + void* alpha, CUdeviceptr A, int32_t lda, CUdeviceptr B, int32_t ldb, + void* beta, CUdeviceptr C, int32_t ldc){ + + typedef std::tuple key_t; + // Benchmark fastest algorithm in cublasGemmEx + auto benchmark_fastest = [&](key_t const &){ + std::vector times; + for(cublasGemmAlgo_t a: cublasAlgorithms){ + try{ + times.push_back(bench([&](){ dispatch::cublasGemmEx(handle, AT, BT, M, N, K, alpha, (const void*)A, cudt, lda, (const void*)B, cudt, ldb, beta, (void*)C, cudt, ldc, cudt, a); }, + [&](){ stream.synchronize(); }, + stream.context().device())); + }catch(driver::exception::cublas::base const &){ + times.push_back(INFINITY); + } + } + size_t argmin = std::min_element(times.begin(), times.end()) - times.begin(); + return cublasAlgorithms[argmin]; + }; + // Cache result + static cpp::CachedMap cache(benchmark_fastest); + return cache.get(std::make_tuple(cudt, AT, BT, M, N, K)); +} + +/* Wrapper for cublasGemmEx */ +inline void cublasGemmEx(cublasHandle_t handle, cudaDataType cudt, cublasOperation_t AT, cublasOperation_t BT, int32_t M, int32_t N, int32_t K, + void* alpha, CUdeviceptr A, int32_t lda, CUdeviceptr B, int32_t ldb, + void* beta, CUdeviceptr C, int32_t ldc, cublasGemmAlgo_t algo) +{ dispatch::cublasGemmEx(handle, AT, BT, M, N, K, alpha, (const void*)A, cudt, lda, (const void*)B, cudt, ldb, beta, (void*)C, cudt, ldc, cudt, algo); } + + +/* Simplified API for default GEMM */ +inline void cublasGemm(DType dtype, Stream& stream, char cAT, char cBT, int32_t M, int32_t N, int32_t K, scalar alpha, Buffer const & A, int32_t lda, Buffer const & B, int32_t ldb, scalar beta, Buffer& C, int32_t ldc, cublasGemmAlgo_t* fastest = NULL, cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT){ + ContextSwitcher ctx_switch(stream.context()); + cublasHandle_t handle = dispatch::cublasHandle(stream.context()); + dispatch::cublasSetStream_v2(handle, (CUstream)stream); + if(fastest) + *fastest = cublasGemmFastest(stream, handle, cudtype.at(dtype), cuop.at(cAT), cuop.at(cBT), M, N, K, alpha.data(), A, lda, B, ldb, beta.data(), C, ldc); + else + 
cublasGemmEx(handle, cudtype.at(dtype), cuop.at(cAT), cuop.at(cBT), M, N, K, alpha.data(), A, lda, B, ldb, beta.data(), C, ldc, algo); +} + +inline cudnnDataType_t cudnnDtype(DType dtype){ + switch(dtype){ + case INT8X4_TYPE: return CUDNN_DATA_INT8x4; + case INT32_TYPE: return CUDNN_DATA_INT32; + case FLOAT_TYPE: return CUDNN_DATA_FLOAT; + case DOUBLE_TYPE: return CUDNN_DATA_DOUBLE; + } + throw; +} + +inline cudnnTensorFormat_t format(cudnnDataType_t cutype){ + switch(cutype){ + case CUDNN_DATA_INT8x4: return CUDNN_TENSOR_NCHW_VECT_C; + default: return CUDNN_TENSOR_NCHW; + } +} + +inline void cudnnConv(DType dtype, Stream& stream, int32_t D, int32_t H, int32_t W, int32_t N, int32_t K, int32_t M, int32_t P, int32_t Q, int32_t C, int32_t T, int32_t R, int32_t S, + int32_t pad_d, int32_t pad_h, int32_t pad_w, int32_t stride_d, int32_t stride_h, int32_t stride_w, scalar alpha, Buffer const & I, Buffer const & F, scalar beta, Buffer const & O){ + driver::Context const & ctx = stream.context(); + ContextSwitcher switch_ctx(ctx); + + std::vector pad = {pad_d, pad_h, pad_w}; + std::vector stride = {stride_d, stride_h, stride_w}; + std::vector upscale = {1, 1, 1}; + std::vector Oshapes = {N, K, M, P, Q}; + std::vector Fshapes = {K, C, T, R, S}; + std::vector Ishapes = {N, C, D, H, W}; + if(M == 1 && T == 1 && D == 1){ + pad.erase(pad.begin()); + stride.erase(stride.begin()); + upscale.erase(upscale.begin()); + Oshapes.erase(Oshapes.begin() + 2); + Ishapes.erase(Ishapes.begin() + 2); + Fshapes.erase(Fshapes.begin() + 2); + } + + cudnnHandle_t handle = dispatch::cudnnHandle(ctx); + cudnnDataType_t in_cutype = cudnnDtype(dtype); + cudnnDataType_t conv_cutype = (dtype == INT8X4_TYPE)?CUDNN_DATA_INT32:in_cutype; + + dispatch::cudnnSetStream(handle, (CUstream)stream); + cudnnTensorDescriptor_t tO, tI; + cudnnFilterDescriptor_t tF; + cudnnConvolutionDescriptor_t conv; + cudnnConvolutionFwdAlgo_t algo; + dispatch::cudnnCreateTensorDescriptor(&tO); + dispatch::cudnnCreateTensorDescriptor(&tI); + dispatch::cudnnCreateFilterDescriptor(&tF); + + dispatch::cudnnSetTensorNdDescriptorEx(tO, format(in_cutype), in_cutype, Oshapes.size(), Oshapes.data()); + dispatch::cudnnSetFilterNdDescriptor(tF, in_cutype, format(in_cutype), Fshapes.size(), Fshapes.data()); + dispatch::cudnnSetTensorNdDescriptorEx(tI, format(in_cutype), in_cutype, Ishapes.size(), Ishapes.data()); + + dispatch::cudnnCreateConvolutionDescriptor(&conv); + dispatch::cudnnSetConvolutionNdDescriptor(conv, pad.size(), pad.data(), stride.data(), upscale.data(), CUDNN_CROSS_CORRELATION, conv_cutype); + dispatch::cudnnGetConvolutionForwardAlgorithm(handle, tI, tF, conv, tO, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, 1024*1024*64, &algo); + + size_t workspace_size; + dispatch::cudnnGetConvolutionForwardWorkspaceSize(handle, tI, tF, conv, tO, algo, &workspace_size); + static Buffer work(ctx, 1024*1024*64); + CUdeviceptr twork = work; + CUdeviceptr pI = I, pF = F, pO = O; + dispatch::cudnnConvolutionForward(handle, alpha.data(), tI, (void*)pI, tF, (void*)pF, conv, algo, (void*)twork, workspace_size, beta.data(), tO, (void*)pO); +} + + +inline void cudnnPool(DType dtype, Stream& stream, int32_t D, int32_t H, int32_t W, int32_t N, int32_t K, int32_t M, int32_t P, int32_t Q, int32_t T, int32_t R, int32_t S, + int32_t pad_d, int32_t pad_h, int32_t pad_w, int32_t stride_d, int32_t stride_h, int32_t stride_w, scalar alpha, Buffer const & I, scalar beta, Buffer const & O){ + driver::Context const & ctx = stream.context(); + ContextSwitcher switch_ctx(ctx); + + 
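+  // as in cudnnConv above, the 5-D (NCDHW) descriptors are collapsed to
+  // 4-D (NCHW) when the depth dimensions are trivial (M == T == D == 1),
+  // by erasing the depth entry from each shape/window/pad/stride vector: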
std::vector pad = {pad_d, pad_h, pad_w}; + std::vector stride = {stride_d, stride_h, stride_w}; + std::vector upscale = {1, 1, 1}; + std::vector Oshapes = {N, K, M, P, Q}; + std::vector Ishapes = {N, K, D, H, W}; + std::vector window = {T, R, S}; + if(M == 1 && T == 1 && D == 1){ + window.erase(window.begin()); + pad.erase(pad.begin()); + stride.erase(stride.begin()); + upscale.erase(upscale.begin()); + Oshapes.erase(Oshapes.begin() + 2); + Ishapes.erase(Ishapes.begin() + 2); + } + + cudnnHandle_t handle = dispatch::cudnnHandle(ctx); + cudnnDataType_t cutype = cudnnDtype(dtype); + + dispatch::cudnnSetStream(handle, (CUstream)stream); + cudnnTensorDescriptor_t tO, tI; + cudnnPoolingDescriptor_t desc; + dispatch::cudnnCreateTensorDescriptor(&tO); + dispatch::cudnnCreateTensorDescriptor(&tI); + + dispatch::cudnnSetTensorNdDescriptorEx(tO, CUDNN_TENSOR_NCHW, cutype, Oshapes.size(), Oshapes.data()); + dispatch::cudnnSetTensorNdDescriptorEx(tI, CUDNN_TENSOR_NCHW, cutype, Ishapes.size(), Ishapes.data()); + + dispatch::cudnnCreatePoolingDescriptor(&desc); + dispatch::cudnnSetPoolingNdDescriptor(desc, CUDNN_POOLING_MAX, CUDNN_NOT_PROPAGATE_NAN, window.size(), window.data(), pad.data(), stride.data()); + + CUdeviceptr pI = I, pO = O; + dispatch::cudnnPoolingForward(handle, desc, alpha.data(), tI, (void*)pI, beta.data(), tO, (void*)pO); +} + +inline void cudnnTransformTensor(driver::Stream & stream, + DType in_dtype, DType out_dtype, + cudnnTensorFormat_t in_layout, cudnnTensorFormat_t out_layout, + int32_t N, int32_t C, int32_t D, int32_t H, int32_t W, + scalar alpha, driver::Buffer const & I, scalar beta, driver::Buffer& O) +{ + cudnnHandle_t handle = dispatch::cudnnHandle(stream.context()); + dispatch::cudnnSetStream(handle, (CUstream)stream); + + cudnnTensorDescriptor_t tO, tI; + std::vector shapes = {N, C, D, H, W}; + dispatch::cudnnCreateTensorDescriptor(&tI); + dispatch::cudnnSetTensorNdDescriptorEx(tI, in_layout, cudnnDtype(in_dtype), shapes.size(), shapes.data()); + dispatch::cudnnCreateTensorDescriptor(&tO); + dispatch::cudnnSetTensorNdDescriptorEx(tO, out_layout, cudnnDtype(out_dtype), shapes.size(), shapes.data()); + + CUdeviceptr pI = I, pO = O; + dispatch::cudnnTransformTensor(handle, alpha.data(), tI, (void*)pI, beta.data(), tO, (void*)pO); +} + + +} +} + + + +#endif diff --git a/include/driver/device.h b/include/driver/device.h new file mode 100755 index 000000000..cffaf64b2 --- /dev/null +++ b/include/driver/device.h @@ -0,0 +1,98 @@ +/* Copyright 2015-2017 Philippe Tillet +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, +* subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +#ifndef TDL_INCLUDE_DRIVER_DEVICE_H +#define TDL_INCLUDE_DRIVER_DEVICE_H + +#include "driver/platform.h" +#include "driver/handle.h" + +namespace tdl +{ + +namespace driver +{ + +// Device +class Device: public HandleInterface +{ +public: + //Supported architectures + enum class Architecture{ + //NVidia + SM_2_0, + SM_2_1, + SM_3_0, + SM_3_5, + SM_3_7, + SM_5_0, + SM_5_2, + SM_6_0, + SM_6_1, + SM_7_0, + UNKNOWN + }; + +private: + //Metaprogramming helper to get CUDA info from an attribute + template + int cuGetInfo() const; + + inline Architecture nv_arch(std::pair sm) const; + inline nvmlDevice_t nvml_device() const; + +public: + Device(CUdevice cu = CUdevice(), bool take_ownership = true): cu_(cu, take_ownership){} + //Accessors + Architecture architecture() const; + Handle const & cu() const; + //Information + std::string infos() const; + size_t address_bits() const; + driver::Platform platform() const; + std::vector max_block_dim() const; + size_t max_threads_per_block() const; + size_t max_shared_memory() const; + size_t warp_size() const; + //Compute Capability + void interpret_as(std::pair cc); + std::pair compute_capability() const; + //Identifier + std::string name() const; + std::string pci_bus_id() const; + //Clocks + size_t current_sm_clock() const; + size_t current_mem_clock() const; + + size_t max_sm_clock() const; + size_t max_mem_clock() const; + +private: + Handle cu_; + std::shared_ptr> interpreted_as_; +}; + +} + +} + +#endif diff --git a/include/driver/dispatch.h b/include/driver/dispatch.h new file mode 100755 index 000000000..910fdc001 --- /dev/null +++ b/include/driver/dispatch.h @@ -0,0 +1,258 @@ +/* Copyright 2015-2017 Philippe Tillet +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, +* subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/ + +#ifndef TDL_INCLUDE_DRIVER_DISPATCHER_H +#define TDL_INCLUDE_DRIVER_DISPATCHER_H + +#include +#include + +//CUDA Backend +#include "external/CUDA/cuda.h" +#include "external/CUDA/nvrtc.h" +#include "external/CUDA/cublas_v2.h" +#include "external/CUDA/cudnn.h" +#include "external/CUDA/nvml.h" + +//Exceptions +#include +#include + +namespace tdl +{ +namespace driver +{ + +class Context; + +template void check(T){} +void check(nvrtcResult err); +void check(CUresult err); +void check(cublasStatus_t err); +void check(cudnnStatus_t err); + +class dispatch +{ +private: + template + struct return_type; + + template + struct return_type + { typedef R type; }; + + typedef bool (*f_init_t)(); + + template + static typename return_type::type f_impl(void*& lib_h, FunPtrT, void*& cache, const char * name, Args... args) + { + initializer(); + if(cache == nullptr){ + cache = dlsym(lib_h, name); + if(cache == 0) + throw std::runtime_error("dlsym unable to load function"); + } + FunPtrT fptr; + *reinterpret_cast(&fptr) = cache; + typename return_type::type res = (*fptr)(args...); + check(res); + return res; + } + +public: + static bool nvrtcinit(); + static bool nvmlinit(); + static bool cuinit(); + static bool cublasinit(); + static bool cudnninit(); + + static void release(); + + //CUDA + static CUresult cuCtxGetCurrent(CUcontext *pctx); + static CUresult cuCtxSetCurrent(CUcontext ctx); + + static CUresult cuCtxDestroy_v2(CUcontext ctx); + static CUresult cuEventCreate(CUevent *phEvent, unsigned int Flags); + static CUresult cuDeviceGet(CUdevice *device, int ordinal); + static CUresult cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); + static CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags); + static CUresult cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd); + static CUresult cuMemFree_v2(CUdeviceptr dptr); + static CUresult cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); + static CUresult cuDriverGetVersion(int *driverVersion); + static CUresult cuDeviceGetName(char *name, int len, CUdevice dev); + static CUresult cuDeviceGetPCIBusId(char *id, int len, CUdevice dev); + static CUresult cuModuleGetGlobal_v2(CUdeviceptr *dptr, size_t* bytes, CUmodule hmod, const char *name); + + static CUresult cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); + static CUresult cuModuleLoad(CUmodule *module, const char *fname); + static CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); + static CUresult cuModuleUnload(CUmodule hmod); + static CUresult cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); + static CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev); + static CUresult cuDeviceGetCount(int *count); + static CUresult cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); + static CUresult cuInit(unsigned int Flags); + static CUresult cuEventRecord(CUevent hEvent, CUstream hStream); + static CUresult cuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CUdevice dev); + static CUresult cuCtxPushCurrent_v2(CUcontext ctx); + static CUresult cuCtxPopCurrent_v2(CUcontext *pctx); + static CUresult 
cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name); + static CUresult cuStreamSynchronize(CUstream hStream); + static CUresult cuStreamDestroy_v2(CUstream hStream); + static CUresult cuEventDestroy_v2(CUevent hEvent); + static CUresult cuMemAlloc_v2(CUdeviceptr *dptr, size_t bytesize); + static CUresult cuPointerGetAttribute(void * data, CUpointer_attribute attribute, CUdeviceptr ptr); + static CUresult cuCtxGetDevice(CUdevice* result); + static CUresult cuMemsetD8Async(CUdeviceptr dst, unsigned char x, size_t N, CUstream stream); + + static nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2( const char* pciBusId, nvmlDevice_t* device); + static nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); + static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); + + static nvrtcResult nvrtcCompileProgram(nvrtcProgram prog, int numOptions, const char **options); + static nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet); + static nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx); + static nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet); + static nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char **headers, const char **includeNames); + static nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log); + + static cublasHandle_t cublasHandle(Context const & ctx); + static cublasStatus_t cublasCreate_v2(cublasHandle_t* h); + static cublasStatus_t cublasGetStream_v2(cublasHandle_t h, cudaStream_t *streamId); + static cublasStatus_t cublasSetStream_v2(cublasHandle_t h, cudaStream_t streamId); + static cublasStatus_t cublasSgemm_v2 (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc); + static cublasStatus_t cublasDgemm_v2 (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc); + static cublasStatus_t cublasHgemm (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, half* alpha, const half *A, int lda, const half *B, int ldb, half* beta, half *C, int ldc); + static cublasStatus_t cublasGemmEx(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const void *alpha, const void *A, cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb, const void *beta, void *C, cudaDataType Ctype, int ldc, cudaDataType computeType, cublasGemmAlgo_t algo); + + static cudnnHandle_t cudnnHandle(Context const & ctx); + static cudnnStatus_t cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc); + static cudnnStatus_t cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t* convDesc); + static cudnnStatus_t cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc); + static cudnnStatus_t cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc); + static cudnnStatus_t cudnnCreate(cudnnHandle_t *handle); + static cudnnStatus_t cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, cudnnDataType_t dataType, int n, int c, int h, int w); + static cudnnStatus_t cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, cudnnDataType_t dataType, 
cudnnTensorFormat_t format, int k, int c, int h, int w); + static cudnnStatus_t cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, cudnnDataType_t dataType, int nbDims, const int dimA[]); + static cudnnStatus_t cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc, cudnnDataType_t dataType, cudnnTensorFormat_t format, int nbDims, const int filterDimA[]); + static cudnnStatus_t cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t convDesc, int pad_h, int pad_w, int u, int v, int upscalex, int upscaley, cudnnConvolutionMode_t mode); + static cudnnStatus_t cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc, int arrayLength, const int padA[], const int filterStrideA[], const int upscaleA[], cudnnConvolutionMode_t mode, cudnnDataType_t dataType); + static cudnnStatus_t cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc, const cudnnPoolingMode_t mode, const cudnnNanPropagation_t maxpoolingNanOpt, int nbDims, const int windowDimA[], const int paddingA[], const int strideA[]); + static cudnnStatus_t cudnnGetConvolutionForwardAlgorithm(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdPreference_t preference, size_t memoryLimitInBytes, cudnnConvolutionFwdAlgo_t *algo); + static cudnnStatus_t cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo, size_t *sizeInBytes); + static cudnnStatus_t cudnnConvolutionForward(cudnnHandle_t handle, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const cudnnFilterDescriptor_t wDesc, const void *w, const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo, void *workSpace, size_t workSpaceSizeInBytes, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y); + static cudnnStatus_t cudnnPoolingForward(cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y); + static cudnnStatus_t cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId); + static cudnnStatus_t cudnnTransformTensor(cudnnHandle_t handle, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y); + +private: + static void* cuda_; + static void* nvrtc_; + static void* nvml_; + static void* cublas_; + static void* cudnn_; + + //CUDA + static void* cuCtxGetCurrent_; + static void* cuCtxSetCurrent_; + static void* cuCtxDestroy_v2_; + static void* cuEventCreate_; + static void* cuDeviceGet_; + static void* cuMemcpyDtoH_v2_; + static void* cuStreamCreate_; + static void* cuEventElapsedTime_; + static void* cuMemFree_v2_; + static void* cuMemcpyDtoHAsync_v2_; + static void* cuDriverGetVersion_; + static void* cuDeviceGetName_; + static void* cuDeviceGetPCIBusId_; + static void* cuModuleGetGlobal_v2_; + + static void* cuMemcpyHtoDAsync_v2_; + static void* cuModuleLoad_; + static void* cuLaunchKernel_; + static void* cuModuleUnload_; + static void* cuModuleLoadDataEx_; + static void* cuDeviceGetAttribute_; + static void* cuDeviceGetCount_; + static void* cuMemcpyHtoD_v2_; + static void* cuInit_; + static void* 
cuEventRecord_; + static void* cuCtxCreate_v2_; + static void* cuModuleGetFunction_; + static void* cuStreamSynchronize_; + static void* cuStreamDestroy_v2_; + static void* cuEventDestroy_v2_; + static void* cuMemAlloc_v2_; + static void* cuPointerGetAttribute_; + static void* cuCtxGetDevice_; + static void* cuMemsetD8Async_; + static void* cuCtxPushCurrent_v2_; + static void* cuCtxPopCurrent_v2_; + + static void* nvmlInit_v2_; + static void* nvmlDeviceGetHandleByPciBusId_v2_; + static void* nvmlDeviceGetClockInfo_; + static void* nvmlDeviceGetMaxClockInfo_; + + static void* nvrtcCompileProgram_; + static void* nvrtcGetProgramLogSize_; + static void* nvrtcGetPTX_; + static void* nvrtcGetPTXSize_; + static void* nvrtcCreateProgram_; + static void* nvrtcGetProgramLog_; + + static void* cublasCreate_v2_; + static void* cublasGetStream_v2_; + static void* cublasSetStream_v2_; + static void* cublasHgemm_; + static void* cublasSgemm_v2_; + static void* cublasDgemm_v2_; + static void* cublasGemmEx_; + + static void* cudnnCreateConvolutionDescriptor_; + static void* cudnnCreatePoolingDescriptor_; + static void* cudnnCreateTensorDescriptor_; + static void* cudnnCreateFilterDescriptor_; + static void* cudnnCreate_; + static void* cudnnSetTensor4dDescriptor_; + static void* cudnnSetFilter4dDescriptor_; + static void* cudnnSetTensorNdDescriptorEx_; + static void* cudnnSetFilterNdDescriptor_; + static void* cudnnSetConvolution2dDescriptor_; + static void* cudnnSetConvolutionNdDescriptor_; + static void* cudnnSetPoolingNdDescriptor_; + static void* cudnnGetConvolutionForwardAlgorithm_; + static void* cudnnGetConvolutionForwardWorkspaceSize_; + static void* cudnnConvolutionForward_; + static void* cudnnPoolingForward_; + static void* cudnnSetStream_; + static void* cudnnTransformTensor_; +}; + +} +} + + +#endif diff --git a/include/driver/error.h b/include/driver/error.h new file mode 100755 index 000000000..d1589aad5 --- /dev/null +++ b/include/driver/error.h @@ -0,0 +1,228 @@ +/* Copyright 2015-2017 Philippe Tillet +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, +* subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +#ifndef TDL_INCLUDE_DRIVER_ERROR_H +#define TDL_INCLUDE_DRIVER_ERROR_H + +#include +#include "driver/dispatch.h" + + +namespace tdl +{ + + namespace driver + { + + namespace exception + { + + namespace nvrtc + { + +#define ISAAC_CREATE_NVRTC_EXCEPTION(name, msg) class name: public std::exception { public: const char * what() const throw(){ return "NVRTC: Error- " msg; } } + + ISAAC_CREATE_NVRTC_EXCEPTION(out_of_memory ,"out of memory"); + ISAAC_CREATE_NVRTC_EXCEPTION(program_creation_failure ,"program creation failure"); + ISAAC_CREATE_NVRTC_EXCEPTION(invalid_input ,"invalid input"); + ISAAC_CREATE_NVRTC_EXCEPTION(invalid_program ,"invalid program"); + ISAAC_CREATE_NVRTC_EXCEPTION(invalid_option ,"invalid option"); + ISAAC_CREATE_NVRTC_EXCEPTION(compilation ,"compilation"); + ISAAC_CREATE_NVRTC_EXCEPTION(builtin_operation_failure ,"builtin operation failure"); + ISAAC_CREATE_NVRTC_EXCEPTION(unknown_error ,"unknown error"); + +#undef ISAAC_CREATE_NVRTC_EXCEPTION + } + + + namespace cuda + { + class base: public std::exception{}; + +#define ISAAC_CREATE_CUDA_EXCEPTION(name, msg) class name: public base { public:const char * what() const throw(){ return "CUDA: Error- " msg; } } + + + ISAAC_CREATE_CUDA_EXCEPTION(invalid_value ,"invalid value"); + ISAAC_CREATE_CUDA_EXCEPTION(out_of_memory ,"out of memory"); + ISAAC_CREATE_CUDA_EXCEPTION(not_initialized ,"not initialized"); + ISAAC_CREATE_CUDA_EXCEPTION(deinitialized ,"deinitialized"); + ISAAC_CREATE_CUDA_EXCEPTION(profiler_disabled ,"profiler disabled"); + ISAAC_CREATE_CUDA_EXCEPTION(profiler_not_initialized ,"profiler not initialized"); + ISAAC_CREATE_CUDA_EXCEPTION(profiler_already_started ,"profiler already started"); + ISAAC_CREATE_CUDA_EXCEPTION(profiler_already_stopped ,"profiler already stopped"); + ISAAC_CREATE_CUDA_EXCEPTION(no_device ,"no device"); + ISAAC_CREATE_CUDA_EXCEPTION(invalid_device ,"invalid device"); + ISAAC_CREATE_CUDA_EXCEPTION(invalid_image ,"invalid image"); + ISAAC_CREATE_CUDA_EXCEPTION(invalid_context ,"invalid context"); + ISAAC_CREATE_CUDA_EXCEPTION(context_already_current ,"context already current"); + ISAAC_CREATE_CUDA_EXCEPTION(map_failed ,"map failed"); + ISAAC_CREATE_CUDA_EXCEPTION(unmap_failed ,"unmap failed"); + ISAAC_CREATE_CUDA_EXCEPTION(array_is_mapped ,"array is mapped"); + ISAAC_CREATE_CUDA_EXCEPTION(already_mapped ,"already mapped"); + ISAAC_CREATE_CUDA_EXCEPTION(no_binary_for_gpu ,"no binary for gpu"); + ISAAC_CREATE_CUDA_EXCEPTION(already_acquired ,"already acquired"); + ISAAC_CREATE_CUDA_EXCEPTION(not_mapped ,"not mapped"); + ISAAC_CREATE_CUDA_EXCEPTION(not_mapped_as_array ,"not mapped as array"); + ISAAC_CREATE_CUDA_EXCEPTION(not_mapped_as_pointer ,"not mapped as pointer"); + ISAAC_CREATE_CUDA_EXCEPTION(ecc_uncorrectable ,"ecc uncorrectable"); + ISAAC_CREATE_CUDA_EXCEPTION(unsupported_limit ,"unsupported limit"); + ISAAC_CREATE_CUDA_EXCEPTION(context_already_in_use ,"context already in use"); + ISAAC_CREATE_CUDA_EXCEPTION(peer_access_unsupported ,"peer access unsupported"); + ISAAC_CREATE_CUDA_EXCEPTION(invalid_ptx ,"invalid ptx"); + ISAAC_CREATE_CUDA_EXCEPTION(invalid_graphics_context ,"invalid graphics context"); + ISAAC_CREATE_CUDA_EXCEPTION(invalid_source ,"invalid source"); + ISAAC_CREATE_CUDA_EXCEPTION(file_not_found ,"file not found"); + ISAAC_CREATE_CUDA_EXCEPTION(shared_object_symbol_not_found ,"shared object symbol not found"); + ISAAC_CREATE_CUDA_EXCEPTION(shared_object_init_failed ,"shared object init failed"); + ISAAC_CREATE_CUDA_EXCEPTION(operating_system ,"operating system"); 
+ ISAAC_CREATE_CUDA_EXCEPTION(invalid_handle ,"invalid handle"); + ISAAC_CREATE_CUDA_EXCEPTION(not_found ,"not found"); + ISAAC_CREATE_CUDA_EXCEPTION(not_ready ,"not ready"); + ISAAC_CREATE_CUDA_EXCEPTION(illegal_address ,"illegal address"); + ISAAC_CREATE_CUDA_EXCEPTION(launch_out_of_resources ,"launch out of resources"); + ISAAC_CREATE_CUDA_EXCEPTION(launch_timeout ,"launch timeout"); + ISAAC_CREATE_CUDA_EXCEPTION(launch_incompatible_texturing ,"launch incompatible texturing"); + ISAAC_CREATE_CUDA_EXCEPTION(peer_access_already_enabled ,"peer access already enabled"); + ISAAC_CREATE_CUDA_EXCEPTION(peer_access_not_enabled ,"peer access not enabled"); + ISAAC_CREATE_CUDA_EXCEPTION(primary_context_active ,"primary context active"); + ISAAC_CREATE_CUDA_EXCEPTION(context_is_destroyed ,"context is destroyed"); + ISAAC_CREATE_CUDA_EXCEPTION(assert_error ,"assert"); + ISAAC_CREATE_CUDA_EXCEPTION(too_many_peers ,"too many peers"); + ISAAC_CREATE_CUDA_EXCEPTION(host_memory_already_registered ,"host memory already registered"); + ISAAC_CREATE_CUDA_EXCEPTION(host_memory_not_registered ,"host memory not registered"); + ISAAC_CREATE_CUDA_EXCEPTION(hardware_stack_error ,"hardware stack error"); + ISAAC_CREATE_CUDA_EXCEPTION(illegal_instruction ,"illegal instruction"); + ISAAC_CREATE_CUDA_EXCEPTION(misaligned_address ,"misaligned address"); + ISAAC_CREATE_CUDA_EXCEPTION(invalid_address_space ,"invalid address space"); + ISAAC_CREATE_CUDA_EXCEPTION(invalid_pc ,"invalid pc"); + ISAAC_CREATE_CUDA_EXCEPTION(launch_failed ,"launch failed"); + ISAAC_CREATE_CUDA_EXCEPTION(not_permitted ,"not permitted"); + ISAAC_CREATE_CUDA_EXCEPTION(not_supported ,"not supported"); + ISAAC_CREATE_CUDA_EXCEPTION(unknown ,"unknown"); + +#undef ISAAC_CREATE_CUDA_EXCEPTION + } + + namespace cublas + { + class base: public std::exception{}; + +#define ISAAC_CREATE_CUBLAS_EXCEPTION(name, msg) class name: public base { public: const char * what() const throw(){ return "CUBLAS: Error- " msg; } } + + ISAAC_CREATE_CUBLAS_EXCEPTION(not_initialized ,"not initialized"); + ISAAC_CREATE_CUBLAS_EXCEPTION(alloc_failed ,"alloc failed"); + ISAAC_CREATE_CUBLAS_EXCEPTION(invalid_value ,"invalid value"); + ISAAC_CREATE_CUBLAS_EXCEPTION(arch_mismatch ,"arch mismatch"); + ISAAC_CREATE_CUBLAS_EXCEPTION(mapping_error ,"mapping error"); + ISAAC_CREATE_CUBLAS_EXCEPTION(execution_failed ,"execution failed"); + ISAAC_CREATE_CUBLAS_EXCEPTION(internal_error ,"internal error"); + ISAAC_CREATE_CUBLAS_EXCEPTION(not_supported ,"not supported"); + ISAAC_CREATE_CUBLAS_EXCEPTION(license_error ,"license error"); + ISAAC_CREATE_CUBLAS_EXCEPTION(unknown ,"unknown"); + +#undef ISAAC_CREATE_CUBLAS_EXCEPTION + } + + namespace cudnn + { +#define ISAAC_CREATE_CUDNN_EXCEPTION(name, msg) class name: public std::exception { public: const char * what() const throw(){ return "CUDNN: Error- " msg; } } + + ISAAC_CREATE_CUDNN_EXCEPTION(not_initialized ,"not initialized"); + ISAAC_CREATE_CUDNN_EXCEPTION(alloc_failed ,"allocation failed"); + ISAAC_CREATE_CUDNN_EXCEPTION(bad_param ,"bad param"); + ISAAC_CREATE_CUDNN_EXCEPTION(internal_error ,"internal error"); + ISAAC_CREATE_CUDNN_EXCEPTION(invalid_value ,"invalid value"); + ISAAC_CREATE_CUDNN_EXCEPTION(arch_mismatch ,"arch mismatch"); + ISAAC_CREATE_CUDNN_EXCEPTION(mapping_error ,"mapping error"); + ISAAC_CREATE_CUDNN_EXCEPTION(execution_failed ,"execution failed"); + ISAAC_CREATE_CUDNN_EXCEPTION(not_supported ,"not supported"); + ISAAC_CREATE_CUDNN_EXCEPTION(license_error ,"license error"); +
+        ISAAC_CREATE_CUDNN_EXCEPTION(runtime_prerequisite_missing ,"prerequisite missing");
+        ISAAC_CREATE_CUDNN_EXCEPTION(runtime_in_progress ,"runtime in progress");
+        ISAAC_CREATE_CUDNN_EXCEPTION(runtime_fp_overflow ,"runtime fp overflow");
+
+#undef ISAAC_CREATE_CUDNN_EXCEPTION
+      }
+
+      namespace ocl
+      {
+
+        class base: public std::exception{};
+
+#define ISAAC_CREATE_CL_EXCEPTION(name, msg) class name: public base { public: const char * what() const throw(){ return "OpenCL: Error- " msg; } }
+
+
+        ISAAC_CREATE_CL_EXCEPTION(device_not_found, "device not found");
+        ISAAC_CREATE_CL_EXCEPTION(device_not_available, "device not available");
+        ISAAC_CREATE_CL_EXCEPTION(compiler_not_available, "compiler not available");
+        ISAAC_CREATE_CL_EXCEPTION(mem_object_allocation_failure, "mem object allocation failure");
+        ISAAC_CREATE_CL_EXCEPTION(out_of_resources, "launch out of resources");
+        ISAAC_CREATE_CL_EXCEPTION(out_of_host_memory, "out of host memory");
+        ISAAC_CREATE_CL_EXCEPTION(profiling_info_not_available, "profiling info not available");
+        ISAAC_CREATE_CL_EXCEPTION(mem_copy_overlap, "mem copy overlap");
+        ISAAC_CREATE_CL_EXCEPTION(image_format_mismatch, "image format mismatch");
+        ISAAC_CREATE_CL_EXCEPTION(image_format_not_supported, "image format not supported");
+        ISAAC_CREATE_CL_EXCEPTION(build_program_failure, "build program failure");
+        ISAAC_CREATE_CL_EXCEPTION(map_failure, "map failure");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_value, "invalid value");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_device_type, "invalid device type");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_platform, "invalid platform");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_device, "invalid device");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_context, "invalid context");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_queue_properties, "invalid queue properties");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_command_queue, "invalid command queue");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_host_ptr, "invalid host pointer");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_mem_object, "invalid mem object");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_image_format_descriptor, "invalid image format descriptor");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_image_size, "invalid image size");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_sampler, "invalid sampler");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_binary, "invalid binary");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_build_options, "invalid build options");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_program, "invalid program");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_program_executable, "invalid program executable");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_name, "invalid kernel name");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_definition, "invalid kernel definition");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_kernel, "invalid kernel");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_arg_index, "invalid arg index");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_arg_value, "invalid arg value");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_arg_size, "invalid arg size");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_kernel_args, "invalid kernel args");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_work_dimension, "invalid work dimension");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_work_group_size, "invalid work group size");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_work_item_size, "invalid work item size");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_global_offset, "invalid global offset");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_event_wait_list, "invalid event wait list");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_event, "invalid event");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_operation, "invalid operation");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_gl_object, "invalid GL object");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_buffer_size, "invalid buffer size");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_mip_level, "invalid MIP level");
+        ISAAC_CREATE_CL_EXCEPTION(invalid_global_work_size, "invalid global work size");
+#ifdef CL_INVALID_PROPERTY
+        ISAAC_CREATE_CL_EXCEPTION(invalid_property, "invalid property");
+#endif
+
+#undef ISAAC_CREATE_CL_EXCEPTION
+      }
+
+
+    }
+  }
+}
+
+#endif
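
These ISAAC-style macros only declare the exception types; the mapping from raw driver status codes to throws lives in the dispatch layer, which is not part of this hunk. A minimal sketch of what such a mapping could look like, assuming the dispatch wrappers surface plain CUresult codes (the `check` helper and its placement are hypothetical, and only a few representative codes are shown):

    #include <cuda.h>
    #include "driver/error.h"

    namespace tdl { namespace driver {

    // Hypothetical helper: translate a CUresult into the hierarchy above.
    inline void check(CUresult status) {
      using namespace exception::cuda;
      switch (status) {
        case CUDA_SUCCESS:               return;                  // nothing to do
        case CUDA_ERROR_INVALID_VALUE:   throw invalid_value();
        case CUDA_ERROR_OUT_OF_MEMORY:   throw out_of_memory();
        case CUDA_ERROR_NOT_INITIALIZED: throw not_initialized();
        default:                         throw unknown();
      }
    }

    } }
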
diff --git a/include/driver/event.h b/include/driver/event.h
new file mode 100755
index 000000000..23f2c557f
--- /dev/null
+++ b/include/driver/event.h
@@ -0,0 +1,49 @@
+/* Copyright 2015-2017 Philippe Tillet
+*
+* Permission is hereby granted, free of charge, to any person obtaining
+* a copy of this software and associated documentation files
+* (the "Software"), to deal in the Software without restriction,
+* including without limitation the rights to use, copy, modify, merge,
+* publish, distribute, sublicense, and/or sell copies of the Software,
+* and to permit persons to whom the Software is furnished to do so,
+* subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#ifndef TDL_INCLUDE_DRIVER_EVENT_H
+#define TDL_INCLUDE_DRIVER_EVENT_H
+
+#include "driver/handle.h"
+
+namespace tdl
+{
+
+namespace driver
+{
+
+// Event
+class Event: public HandleInterface<Event, cu_event_t>
+{
+public:
+  float elapsed_time() const;
+  Handle<cu_event_t> const & cu() const;
+
+private:
+  Handle<cu_event_t> cu_;
+};
+
+}
+
+}
+
+#endif
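
Event stores a pair of CUevents (see cu_event_t in handle.h below), which suggests elapsed_time() returns the span between the two records, in the style of cuEventElapsedTime. A usage sketch under that assumption; it further assumes Stream::enqueue, declared later in this commit, records the pair around the launch when an Event pointer is passed:

    #include <array>
    #include "driver/event.h"
    #include "driver/kernel.h"
    #include "driver/stream.h"

    // Hypothetical timing helper built on the declarations in this commit.
    float time_launch(tdl::driver::Stream & stream, tdl::driver::Kernel const & kernel,
                      std::array<size_t, 3> grid, std::array<size_t, 3> block) {
      tdl::driver::Event event;
      stream.enqueue(kernel, grid, block, NULL, &event);  // record start/end around the launch
      stream.synchronize();                               // wait until both events have completed
      return event.elapsed_time();                        // span between the paired CUevents
    }
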
diff --git a/include/driver/handle.h b/include/driver/handle.h
new file mode 100755
index 000000000..eb7c90705
--- /dev/null
+++ b/include/driver/handle.h
@@ -0,0 +1,82 @@
+/* Copyright 2015-2017 Philippe Tillet
+*
+* Permission is hereby granted, free of charge, to any person obtaining
+* a copy of this software and associated documentation files
+* (the "Software"), to deal in the Software without restriction,
+* including without limitation the rights to use, copy, modify, merge,
+* publish, distribute, sublicense, and/or sell copies of the Software,
+* and to permit persons to whom the Software is furnished to do so,
+* subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#ifndef TDL_INCLUDE_DRIVER_HANDLE_H
+#define TDL_INCLUDE_DRIVER_HANDLE_H
+
+#include <memory>
+#include <iostream>
+#include <functional>
+#include <type_traits>
+#include "driver/dispatch.h"
+
+namespace tdl
+{
+
+namespace driver
+{
+
+struct cu_event_t{
+  operator bool() const { return first && second; }
+  CUevent first;
+  CUevent second;
+};
+
+struct cu_platform{
+  cu_platform() : status_(dispatch::cuInit(0)) { }
+  operator bool() const { return status_; }
+private:
+  CUresult status_;
+};
+
+template<class T, class CUType>
+class HandleInterface{
+public:
+    //Accessors
+    operator CUType() const { return *(((T*)this)->cu().h_); }
+    //Comparison
+    bool operator==(HandleInterface const & y) { return (CUType)(*this) == (CUType)(y); }
+    bool operator!=(HandleInterface const & y) { return (CUType)(*this) != (CUType)(y); }
+    bool operator<(HandleInterface const & y) { return (CUType)(*this) < (CUType)(y); }
+};
+
+template<class CUType>
+class Handle{
+public:
+  template<class, class> friend class HandleInterface;
+public:
+  //Constructors
+  Handle(CUType cu = CUType(), bool take_ownership = true);
+  ~Handle();
+  CUType& operator*() { return *h_; }
+  CUType const & operator*() const { return *h_; }
+  CUType* operator->() const { return h_.get(); }
+
+protected:
+  std::shared_ptr<CUType> h_;
+  bool has_ownership_;
+};
+
+}
+}
+
+#endif
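
Handle pairs a shared_ptr with a has_ownership_ flag: a reference-counted wrapper that destroys the underlying CUDA object only when it owns it. A sketch of the intended usage, assuming the constructor declared above copies the raw object and the destructor releases it only when take_ownership was true:

    #include "driver/handle.h"

    void ownership_example(CUstream raw) {
      // Borrow: wraps a stream created elsewhere; conversion and comparison
      // work, but the wrapper never destroys the stream.
      tdl::driver::Handle<CUstream> borrowed(raw, false);
      // Own: default-constructed handle (CUType(), take_ownership = true);
      // the wrapped object is released when the last copy goes away.
      tdl::driver::Handle<CUstream> owned;
      (void)borrowed;
      (void)owned;
    }
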
diff --git a/include/driver/kernel.h b/include/driver/kernel.h
new file mode 100755
index 000000000..60d4dc108
--- /dev/null
+++ b/include/driver/kernel.h
@@ -0,0 +1,68 @@
+/* Copyright 2015-2017 Philippe Tillet
+*
+* Permission is hereby granted, free of charge, to any person obtaining
+* a copy of this software and associated documentation files
+* (the "Software"), to deal in the Software without restriction,
+* including without limitation the rights to use, copy, modify, merge,
+* publish, distribute, sublicense, and/or sell copies of the Software,
+* and to permit persons to whom the Software is furnished to do so,
+* subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#ifndef TDL_INCLUDE_DRIVER_KERNEL_H
+#define TDL_INCLUDE_DRIVER_KERNEL_H
+
+#include "driver/module.h"
+#include "driver/handle.h"
+
+#include <memory>
+
+namespace tdl
+{
+
+namespace driver
+{
+
+class Buffer;
+
+// Kernel
+class Kernel: public HandleInterface<Kernel, CUfunction>
+{
+public:
+  //Constructors
+  Kernel(Module const & program, const char * name);
+  //Accessors
+  Handle<CUfunction> const & cu() const;
+  Module const & module() const;
+  //Arguments setters
+  void setArg(unsigned int index, std::size_t size, void* ptr);
+  void setArg(unsigned int index, Buffer const &);
+  template<class T> void setArg(unsigned int index, T value) { setArg(index, sizeof(T), (void*)&value); }
+  //Arguments getters
+  void* const* cu_params() const;
+
+private:
+  Handle<CUfunction> cu_;
+  Module program_;
+  unsigned int address_bits_;
+  std::vector<std::shared_ptr<void> > cu_params_store_;
+  std::vector<void*> cu_params_;
+};
+
+}
+
+}
+
+#endif
+
diff --git a/include/driver/module.h b/include/driver/module.h
new file mode 100755
index 000000000..2a1093233
--- /dev/null
+++ b/include/driver/module.h
@@ -0,0 +1,61 @@
+/* Copyright 2015-2017 Philippe Tillet
+*
+* Permission is hereby granted, free of charge, to any person obtaining
+* a copy of this software and associated documentation files
+* (the "Software"), to deal in the Software without restriction,
+* including without limitation the rights to use, copy, modify, merge,
+* publish, distribute, sublicense, and/or sell copies of the Software,
+* and to permit persons to whom the Software is furnished to do so,
+* subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#ifndef TDL_INCLUDE_DRIVER_MODULE_H
+#define TDL_INCLUDE_DRIVER_MODULE_H
+
+#include <string>
+#include "driver/handle.h"
+#include "driver/context.h"
+#include "driver/buffer.h"
+
+namespace tdl
+{
+
+namespace driver
+{
+
+class Context;
+class Device;
+
+class Module: public HandleInterface<Module, CUmodule>
+{
+  static std::string header(Device const & device);
+
+public:
+  Module(Context const & context, std::string const & source);
+  Context const & context() const;
+  Handle<CUmodule> const & cu() const;
+  Buffer symbol(const char * name) const;
+
+private:
+  Handle<CUmodule> cu_;
+  Context context_;
+  std::string source_;
+};
+
+
+}
+
+}
+
+#endif
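
Module JIT-compiles a source string within a Context, and Kernel looks up an entry point by name, so the two compose into the usual driver-style launch sequence. A sketch under stated assumptions: a valid Context and Buffer, a Stream (declared just below in this commit), and a placeholder entry point named "kernel":

    #include <string>
    #include "driver/module.h"
    #include "driver/kernel.h"
    #include "driver/stream.h"

    // Hypothetical end-to-end flow; names and launch sizes are placeholders.
    void launch(tdl::driver::Context const & context, tdl::driver::Stream & stream,
                std::string const & source, tdl::driver::Buffer const & buffer, int n) {
      tdl::driver::Module module(context, source);   // JIT the source for this context
      tdl::driver::Kernel kernel(module, "kernel");  // fetch the entry point by name
      kernel.setArg(0, buffer);                      // Buffer overload
      kernel.setArg(1, n);                           // templated by-value overload
      stream.enqueue(kernel, {64, 1, 1}, {128, 1, 1});
      stream.synchronize();
    }
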
diff --git a/include/driver/platform.h b/include/driver/platform.h
new file mode 100755
index 000000000..2a3b8fcdb
--- /dev/null
+++ b/include/driver/platform.h
@@ -0,0 +1,54 @@
+/* Copyright 2015-2017 Philippe Tillet
+*
+* Permission is hereby granted, free of charge, to any person obtaining
+* a copy of this software and associated documentation files
+* (the "Software"), to deal in the Software without restriction,
+* including without limitation the rights to use, copy, modify, merge,
+* publish, distribute, sublicense, and/or sell copies of the Software,
+* and to permit persons to whom the Software is furnished to do so,
+* subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#ifndef TDL_INCLUDE_DRIVER_PLATFORM_H
+#define TDL_INCLUDE_DRIVER_PLATFORM_H
+
+#include <vector>
+#include <string>
+
+#include "driver/handle.h"
+
+namespace tdl
+{
+
+namespace driver
+{
+
+class Device;
+
+class Platform
+{
+public:
+  //Accessors
+  std::string name() const;
+  std::string version() const;
+  std::vector<Device> devices() const;
+private:
+  Handle cu_;
+};
+
+}
+
+}
+
+#endif
diff --git a/include/driver/stream.h b/include/driver/stream.h
new file mode 100755
index 000000000..5ff59356c
--- /dev/null
+++ b/include/driver/stream.h
@@ -0,0 +1,82 @@
+/* Copyright 2015-2017 Philippe Tillet
+*
+* Permission is hereby granted, free of charge, to any person obtaining
+* a copy of this software and associated documentation files
+* (the "Software"), to deal in the Software without restriction,
+* including without limitation the rights to use, copy, modify, merge,
+* publish, distribute, sublicense, and/or sell copies of the Software,
+* and to permit persons to whom the Software is furnished to do so,
+* subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#ifndef TDL_INCLUDE_DRIVER_STREAM_H
+#define TDL_INCLUDE_DRIVER_STREAM_H
+
+#include <array>
+#include "driver/context.h"
+#include "driver/device.h"
+#include "driver/handle.h"
+#include "driver/buffer.h"
+
+namespace tdl
+{
+
+namespace driver
+{
+
+class Kernel;
+class Event;
+class Range;
+class Buffer;
+
+// Command Queue
+class Stream: public HandleInterface<Stream, CUstream>
+{
+public:
+  //Constructors
+  Stream(CUstream stream, bool take_ownership);
+  Stream(Context const & context);
+
+  //Accessors
+  Handle<CUstream> const & cu() const;
+  Context const & context() const;
+
+  //Synchronize
+  void synchronize();
+
+  //Enqueue
+  void enqueue(Kernel const & kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, std::vector<Event> const * = NULL, Event *event = NULL);
+
+  // Write
+  void write(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr);
+
+  template<class T> void write(Buffer const & buffer, bool blocking, std::size_t offset, std::vector<T> const & x)
+  { write(buffer, blocking, offset, x.size()*sizeof(T), x.data()); }
+
+  // Read
+  void read(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr);
+
+  template<class T> void read(Buffer const & buffer, bool blocking, std::size_t offset, std::vector<T>& x)
+  { read(buffer, blocking, offset, x.size()*sizeof(T), x.data()); }
+private:
+  Context context_;
+  Handle<CUstream> cu_;
+};
+
+
+}
+
+}
+
+#endif
diff --git a/include/external/CUDA/builtin_types.h b/include/external/CUDA/builtin_types.h
new file mode 100755
index 000000000..5247c4080
--- /dev/null
+++ b/include/external/CUDA/builtin_types.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee. Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "device_types.h" +#if !defined(__CUDACC_RTC__) +#define EXCLUDE_FROM_RTC +#include "driver_types.h" +#undef EXCLUDE_FROM_RTC +#endif /* !__CUDACC_RTC__ */ +#include "surface_types.h" +#include "texture_types.h" +#include "vector_types.h" diff --git a/include/external/CUDA/channel_descriptor.h b/include/external/CUDA/channel_descriptor.h new file mode 100755 index 000000000..150f93bde --- /dev/null +++ b/include/external/CUDA/channel_descriptor.h @@ -0,0 +1,412 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__CHANNEL_DESCRIPTOR_H__) +#define __CHANNEL_DESCRIPTOR_H__ + +#if defined(__cplusplus) + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "driver_types.h" +#include "cuda_runtime_api.h" +#include "host_defines.h" +#include "vector_types.h" + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +/** + * \addtogroup CUDART_HIGHLEVEL + * + * @{ + */ + +/** + * \brief \hl Returns a channel descriptor using the specified format + * + * Returns a channel descriptor with format \p f and number of bits of each + * component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is + * defined as: + * \code + struct cudaChannelFormatDesc { + int x, y, z, w; + enum cudaChannelFormatKind f; + }; + * \endcode + * + * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned, + * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat. 
+ * + * \return + * Channel descriptor with format \p f + * + * \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (High level)", + * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (High level, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (High level)", + * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (High level)", + * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (High level, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (High level)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (High level)" + */ +template __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone); +} + +static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat); +} + +static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat); +} + +static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat); +} + +static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(char) * 8; + +#if defined(_CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__) + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +#else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */ + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +#endif /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */ +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(signed char) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned char) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(signed char) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned char) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, 
cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(signed char) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned char) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(signed char) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned char) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(short) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(short) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(short) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(short) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(int) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned int) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(int) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned int) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(int) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ 
cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned int) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(int) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned int) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned); +} + +#if !defined(__LP64__) + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(long) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned long) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(long) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned long) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(long) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned long) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(long) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(unsigned long) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned); +} + +#endif /* !__LP64__ */ + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(float) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(float) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(float) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + int e = (int)sizeof(float) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat); +} + +#endif /* __cplusplus */ + +/** @} */ +/** @} */ /* END CUDART_TEXTURE_HL */ + +#endif /* !__CHANNEL_DESCRIPTOR_H__ */ diff --git a/include/external/CUDA/crt/host_config.h b/include/external/CUDA/crt/host_config.h new file mode 100644 index 000000000..8b023b528 --- /dev/null +++ b/include/external/CUDA/crt/host_config.h @@ -0,0 +1,266 @@ +/* + * Copyright 1993-2016 NVIDIA Corporation. All rights reserved. 
+ * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__HOST_CONFIG_H__) +#define __HOST_CONFIG_H__ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#if defined(__CUDACC__) + +#if defined(__CUDACC_RTC__) + +#define _CRTIMP +#define __THROW + +#else /* __CUDACC_RTC__ */ + +/* check for host compilers that are compatible with nvcc */ +#if !defined(__GNUC__) && !defined(_WIN32) + +#error --- !!! UNSUPPORTED COMPILER !!! --- + +#endif /* !__GNUC__ && !_WIN32 */ + +#if defined(__ICC) + +#if (__ICC != 1500 && __ICC != 1600 && __ICC != 1700) || !defined(__GNUC__) || !defined(__LP64__) + +#error -- unsupported ICC configuration! Only ICC 15.0, ICC 16.0, and ICC 17.0 on Linux x86_64 are supported! 
+ +#endif /* (__ICC != 1500 && __ICC != 1600 && __ICC != 17.0) || !__GNUC__ || !__LP64__ */ + +#endif /* __ICC */ + +#if defined(__PGIC__) + +#if (!(__PGIC__ == 17) && \ + !(__PGIC__ == 99 && __PGIC_MINOR__ == 99)) || \ + !defined(__GNUC__) || !defined(__LP64__) + +#error -- unsupported pgc++ configuration! Only pgc++ 17 on Linux x86_64 is supported! + +#endif /* (!(__PGIC__ == 17) && + !(__PGIC__ == 99 && __PGIC_MINOR__ == 99 )) || + !__GNUC__ || !__LP64__ */ + +#endif /* __PGIC__ */ + +#if defined(__powerpc__) + +#if !defined(__powerpc64__) || !defined(__LITTLE_ENDIAN__) + +#error -- unsupported PPC platform! Only 64-bit little endian PPC is supported! + +#endif /* !__powerpc64__ || !__LITTLE_ENDIAN__ */ + +#if defined(__ibmxl_vrm__) && (__ibmxl_vrm__ < 0x0d010000 && __ibmxl_vrm__ >= 0x0d020000) + +#error -- unsupported xlC version! only xlC 13.1 is supported + +#endif /* __ibmxl_vrm__ && (__ibmxl_vrm__ < 0x0d010000 && __ibmxl_vrm__ >= 0x0d020000) */ + +#endif /* __powerpc__ */ + +#if defined(__GNUC__) + +#if __GNUC__ > 6 + +#error -- unsupported GNU version! gcc versions later than 6 are not supported! + +#endif /* __GNUC__ > 6 */ + +#if defined(__APPLE__) && defined(__MACH__) && !defined(__clang__) +#error -- clang and clang++ are the only supported host compilers on Mac OS X! +#endif /* __APPLE__ && __MACH__ && !__clang__ */ + +#endif /* __GNUC__ */ + +#if defined(_WIN32) + +#if _MSC_VER < 1600 || _MSC_VER > 1911 + +#error -- unsupported Microsoft Visual Studio version! Only the versions 2012, 2013, 2015 and 2017 are supported! + +#elif _MSC_VER == 1600 /* _MSC_VERION == 1600 */ + +#pragma message("support for Microsoft Visual Studio 2010 has been deprecated!") + +#endif /* _MSC_VER < 1600 || _MSC_VER > 1800 || _MSC_VERSION == 1600 */ + +#endif /* _WIN32 */ + +/* configure host compiler */ +#if defined(__APPLE__) + +#define _CRTIMP +#define _ACRTIMP +#define __THROW + +#if defined(__BLOCKS__) /* nvcc does not support closures */ + +#undef __BLOCKS__ + +#endif /* __BLOCKS__ */ + +#elif defined(__ANDROID__) + +#define _CRTIMP +#define _ACRTIMP +#define __THROW + +#elif defined(__QNX__) + +#define _CRTIMP +#define _ACRTIMP +#define __THROW + +#elif defined(__HORIZON__) + +#define _CRTIMP +#define _ACRTIMP +#define __THROW + +#elif defined(__GNUC__) + +#define _CRTIMP +#define _ACRTIMP + +#include /* for __THROW */ + +#elif defined(_WIN32) + +#if _MSC_VER >= 1500 + +#undef _USE_DECLSPECS_FOR_SAL +#define _USE_DECLSPECS_FOR_SAL \ + 1 + +#endif /* _MSC_VER >= 1500 */ + +#if !defined(_CRT_NONSTDC_NO_WARNINGS) + +#define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */ + +#endif /* !_CRT_NONSTDC_NO_WARNINGS */ + +#if !defined(_CRT_SECURE_NO_WARNINGS) + +#define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */ + +#endif /* !_CRT_SECURE_NO_WARNINGS */ + +#if !defined(NOMINMAX) + +#define NOMINMAX /* min and max are part of cuda runtime */ + +#endif /* !NOMINMAX */ + +#include /* for _CRTIMP */ +#if _MSC_VER >= 1900 +#include /* for _ACRTIMP */ +#endif /* _MSC_VER >= 1900 */ + +#define __THROW + +#endif /* __APPLE__ */ + +#endif /* __CUDACC_RTC__ */ + + +#if defined(__cplusplus) && defined(__CUDA_ARCH__) && (defined(__PGIC__) || defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER))) + +#if __CUDACC_RTC__ +typedef char *va_list; +#else /* !__CUDACC_RTC__ */ +#include +#endif /* __CUDACC_RTC__ */ + + +#undef va_start +#undef va_end +#undef va_arg + +#ifdef __PGIC__ + +#undef __builtin_va_end + +#define va_start(v,l) __builtin_alt_va_start(v,l) +#define va_end(v) 
__builtin_va_end(v) +#define va_arg(v,l) __builtin_alt_va_arg(v,l) + +#if (__cplusplus >= 201103L) +#undef va_copy +#define va_copy(d,s) __builtin_va_copy(d,s) +#endif + +#else /* !__PGIC__ */ + + +#define va_start(ap, x) (__cu_va_start(&ap, x)) +#define va_end(ap) (__cu_va_end(&ap)) +#define va_arg(ap, t) (*((t *)__cu_va_arg(&ap, (t *)0))) + +#if (_MSC_VER >= 1800) || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L)) +#undef va_copy +#define va_copy(apd, aps) (__cu_va_copy(&(apd), &(aps))) +#endif /* (_MSC_VER >= 1800) || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L)) */ +#endif /* __PGIC__ */ + +#endif /* defined(__cplusplus) && (defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER))) */ + + + +#endif /* __CUDACC__ */ + +#endif /* !__HOST_CONFIG_H__ */ diff --git a/include/external/CUDA/crt/host_defines.h b/include/external/CUDA/crt/host_defines.h new file mode 100644 index 000000000..556d2e5e1 --- /dev/null +++ b/include/external/CUDA/crt/host_defines.h @@ -0,0 +1,216 @@ +/* + * Copyright 1993-2017 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__HOST_DEFINES_H__) +#define __HOST_DEFINES_H__ + +/* CUDA JIT mode (__CUDACC_RTC__) also uses GNU style attributes */ +#if defined(__GNUC__) || defined(__CUDA_LIBDEVICE__) || defined(__CUDACC_RTC__) + +#if defined(__CUDACC_RTC__) +#define __volatile__ volatile +#endif /* __CUDACC_RTC__ */ + +#define __no_return__ \ + __attribute__((noreturn)) + +#if defined(__CUDACC__) || defined(__CUDA_ARCH__) || defined(__CUDA_LIBDEVICE__) +/* gcc allows users to define attributes with underscores, + e.g., __attribute__((__noinline__)). + Consider a non-CUDA source file (e.g. .cpp) that has the + above attribute specification, and includes this header file. In that case, + defining __noinline__ as below would cause a gcc compilation error. + Hence, only define __noinline__ when the code is being processed + by a CUDA compiler component. +*/ +#define __noinline__ \ + __attribute__((noinline)) +#endif /* __CUDACC__ || __CUDA_ARCH__ || __CUDA_LIBDEVICE__ */ + +#define __forceinline__ \ + __inline__ __attribute__((always_inline)) +#define __align__(n) \ + __attribute__((aligned(n))) +#define __thread__ \ + __thread +#define __import__ +#define __export__ +#define __cdecl +#define __annotate__(a) \ + __attribute__((a)) +#define __location__(a) \ + __annotate__(a) +#define CUDARTAPI + +#elif defined(_MSC_VER) + +#if _MSC_VER >= 1400 + +#define __restrict__ \ + __restrict + +#else /* _MSC_VER >= 1400 */ + +#define __restrict__ + +#endif /* _MSC_VER >= 1400 */ + +#define __inline__ \ + __inline +#define __no_return__ \ + __declspec(noreturn) +#define __noinline__ \ + __declspec(noinline) +#define __forceinline__ \ + __forceinline +#define __align__(n) \ + __declspec(align(n)) +#define __thread__ \ + __declspec(thread) +#define __import__ \ + __declspec(dllimport) +#define __export__ \ + __declspec(dllexport) +#define __annotate__(a) \ + __declspec(a) +#define __location__(a) \ + __annotate__(__##a##__) +#define CUDARTAPI \ + __stdcall + +#else /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */ + +#define __inline__ + +#if !defined(__align__) + +#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for '__align__' !!! --- + +#endif /* !__align__ */ + +#if !defined(CUDARTAPI) + +#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for 'CUDARTAPI' !!! --- + +#endif /* !CUDARTAPI */ + +#endif /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */ + +#if (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !defined(__clang__)))) || \ + (defined(_MSC_VER) && _MSC_VER < 1900) || \ + (!defined(__GNUC__) && !defined(_MSC_VER)) + +#define __specialization_static \ + static + +#else /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) || + (_MSC_VER && _MSC_VER < 1900) || + (!__GNUC__ && !_MSC_VER) */ + +#define __specialization_static + +#endif /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) || + (_MSC_VER && _MSC_VER < 1900) || + (!__GNUC__ && !_MSC_VER) */ + +#if !defined(__CUDACC__) && !defined(__CUDA_LIBDEVICE__) + +#undef __annotate__ +#define __annotate__(a) + +#else /* !__CUDACC__ && !__CUDA_LIBDEVICE__ */ + +#define __launch_bounds__(...) 
\ + __annotate__(launch_bounds(__VA_ARGS__)) + +#endif /* !__CUDACC__ && !__CUDA_LIBDEVICE__ */ + +#if defined(__CUDACC__) || defined(__CUDA_LIBDEVICE__) || \ + defined(__GNUC__) || defined(_WIN64) + +#define __builtin_align__(a) \ + __align__(a) + +#else /* __CUDACC__ || __CUDA_LIBDEVICE__ || __GNUC__ || _WIN64 */ + +#define __builtin_align__(a) + +#endif /* __CUDACC__ || __CUDA_LIBDEVICE__ || __GNUC__ || _WIN64 */ + +#define __host__ \ + __location__(host) +#define __device__ \ + __location__(device) +#define __global__ \ + __location__(global) +#define __shared__ \ + __location__(shared) +#define __constant__ \ + __location__(constant) +#define __managed__ \ + __location__(managed) + +#if !defined(__CUDACC__) +#define __device_builtin__ +#define __device_builtin_texture_type__ +#define __device_builtin_surface_type__ +#define __cudart_builtin__ +#else /* defined(__CUDACC__) */ +#define __device_builtin__ \ + __location__(device_builtin) +#define __device_builtin_texture_type__ \ + __location__(device_builtin_texture_type) +#define __device_builtin_surface_type__ \ + __location__(device_builtin_surface_type) +#define __cudart_builtin__ \ + __location__(cudart_builtin) +#endif /* !defined(__CUDACC__) */ + + +#endif /* !__HOST_DEFINES_H__ */ diff --git a/include/external/CUDA/cuComplex.h b/include/external/CUDA/cuComplex.h new file mode 100755 index 000000000..78bc90353 --- /dev/null +++ b/include/external/CUDA/cuComplex.h @@ -0,0 +1,338 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. 
Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(CU_COMPLEX_H_) +#define CU_COMPLEX_H_ + +/* When trying to include C header file in C++ Code extern "C" is required + * But the Standard QNX headers already have ifdef extern in them when compiling C++ Code + * extern "C" cannot be nested + * Hence keep the header out of extern "C" block + */ + +#include /* import fabsf, sqrt */ + +#if defined(__cplusplus) +extern "C" { +#endif /* __cplusplus */ + +#include "vector_types.h" + +typedef float2 cuFloatComplex; + +__host__ __device__ static __inline__ float cuCrealf (cuFloatComplex x) +{ + return x.x; +} + +__host__ __device__ static __inline__ float cuCimagf (cuFloatComplex x) +{ + return x.y; +} + +__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex + (float r, float i) +{ + cuFloatComplex res; + res.x = r; + res.y = i; + return res; +} + +__host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x) +{ + return make_cuFloatComplex (cuCrealf(x), -cuCimagf(x)); +} +__host__ __device__ static __inline__ cuFloatComplex cuCaddf (cuFloatComplex x, + cuFloatComplex y) +{ + return make_cuFloatComplex (cuCrealf(x) + cuCrealf(y), + cuCimagf(x) + cuCimagf(y)); +} + +__host__ __device__ static __inline__ cuFloatComplex cuCsubf (cuFloatComplex x, + cuFloatComplex y) +{ + return make_cuFloatComplex (cuCrealf(x) - cuCrealf(y), + cuCimagf(x) - cuCimagf(y)); +} + +/* This implementation could suffer from intermediate overflow even though + * the final result would be in range. However, various implementations do + * not guard against this (presumably to avoid losing performance), so we + * don't do it either to stay competitive. + */ +__host__ __device__ static __inline__ cuFloatComplex cuCmulf (cuFloatComplex x, + cuFloatComplex y) +{ + cuFloatComplex prod; + prod = make_cuFloatComplex ((cuCrealf(x) * cuCrealf(y)) - + (cuCimagf(x) * cuCimagf(y)), + (cuCrealf(x) * cuCimagf(y)) + + (cuCimagf(x) * cuCrealf(y))); + return prod; +} + +/* This implementation guards against intermediate underflow and overflow + * by scaling. Such guarded implementations are usually the default for + * complex library implementations, with some also offering an unguarded, + * faster version. + */ +__host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComplex x, + cuFloatComplex y) +{ + cuFloatComplex quot; + float s = fabsf(cuCrealf(y)) + fabsf(cuCimagf(y)); + float oos = 1.0f / s; + float ars = cuCrealf(x) * oos; + float ais = cuCimagf(x) * oos; + float brs = cuCrealf(y) * oos; + float bis = cuCimagf(y) * oos; + s = (brs * brs) + (bis * bis); + oos = 1.0f / s; + quot = make_cuFloatComplex (((ars * brs) + (ais * bis)) * oos, + ((ais * brs) - (ars * bis)) * oos); + return quot; +} + +/* + * We would like to call hypotf(), but it's not available on all platforms. + * This discrete implementation guards against intermediate underflow and + * overflow by scaling. Otherwise we would lose half the exponent range. + * There are various ways of doing guarded computation. 
For now chose the + * simplest and fastest solution, however this may suffer from inaccuracies + * if sqrt and division are not IEEE compliant. + */ +__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x) +{ + float a = cuCrealf(x); + float b = cuCimagf(x); + float v, w, t; + a = fabsf(a); + b = fabsf(b); + if (a > b) { + v = a; + w = b; + } else { + v = b; + w = a; + } + t = w / v; + t = 1.0f + t * t; + t = v * sqrtf(t); + if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) { + t = v + w; + } + return t; +} + +/* Double precision */ +typedef double2 cuDoubleComplex; + +__host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x) +{ + return x.x; +} + +__host__ __device__ static __inline__ double cuCimag (cuDoubleComplex x) +{ + return x.y; +} + +__host__ __device__ static __inline__ cuDoubleComplex make_cuDoubleComplex + (double r, double i) +{ + cuDoubleComplex res; + res.x = r; + res.y = i; + return res; +} + +__host__ __device__ static __inline__ cuDoubleComplex cuConj(cuDoubleComplex x) +{ + return make_cuDoubleComplex (cuCreal(x), -cuCimag(x)); +} + +__host__ __device__ static __inline__ cuDoubleComplex cuCadd(cuDoubleComplex x, + cuDoubleComplex y) +{ + return make_cuDoubleComplex (cuCreal(x) + cuCreal(y), + cuCimag(x) + cuCimag(y)); +} + +__host__ __device__ static __inline__ cuDoubleComplex cuCsub(cuDoubleComplex x, + cuDoubleComplex y) +{ + return make_cuDoubleComplex (cuCreal(x) - cuCreal(y), + cuCimag(x) - cuCimag(y)); +} + +/* This implementation could suffer from intermediate overflow even though + * the final result would be in range. However, various implementations do + * not guard against this (presumably to avoid losing performance), so we + * don't do it either to stay competitive. + */ +__host__ __device__ static __inline__ cuDoubleComplex cuCmul(cuDoubleComplex x, + cuDoubleComplex y) +{ + cuDoubleComplex prod; + prod = make_cuDoubleComplex ((cuCreal(x) * cuCreal(y)) - + (cuCimag(x) * cuCimag(y)), + (cuCreal(x) * cuCimag(y)) + + (cuCimag(x) * cuCreal(y))); + return prod; +} + +/* This implementation guards against intermediate underflow and overflow + * by scaling. Such guarded implementations are usually the default for + * complex library implementations, with some also offering an unguarded, + * faster version. + */ +__host__ __device__ static __inline__ cuDoubleComplex cuCdiv(cuDoubleComplex x, + cuDoubleComplex y) +{ + cuDoubleComplex quot; + double s = (fabs(cuCreal(y))) + (fabs(cuCimag(y))); + double oos = 1.0 / s; + double ars = cuCreal(x) * oos; + double ais = cuCimag(x) * oos; + double brs = cuCreal(y) * oos; + double bis = cuCimag(y) * oos; + s = (brs * brs) + (bis * bis); + oos = 1.0 / s; + quot = make_cuDoubleComplex (((ars * brs) + (ais * bis)) * oos, + ((ais * brs) - (ars * bis)) * oos); + return quot; +} + +/* This implementation guards against intermediate underflow and overflow + * by scaling. Otherwise we would lose half the exponent range. There are + * various ways of doing guarded computation. For now chose the simplest + * and fastest solution, however this may suffer from inaccuracies if sqrt + * and division are not IEEE compliant. 
+ */ +__host__ __device__ static __inline__ double cuCabs (cuDoubleComplex x) +{ + double a = cuCreal(x); + double b = cuCimag(x); + double v, w, t; + a = fabs(a); + b = fabs(b); + if (a > b) { + v = a; + w = b; + } else { + v = b; + w = a; + } + t = w / v; + t = 1.0 + t * t; + t = v * sqrt(t); + if ((v == 0.0) || + (v > 1.79769313486231570e+308) || (w > 1.79769313486231570e+308)) { + t = v + w; + } + return t; +} + +#if defined(__cplusplus) +} +#endif /* __cplusplus */ + +/* aliases */ +typedef cuFloatComplex cuComplex; +__host__ __device__ static __inline__ cuComplex make_cuComplex (float x, + float y) +{ + return make_cuFloatComplex (x, y); +} + +/* float-to-double promotion */ +__host__ __device__ static __inline__ cuDoubleComplex cuComplexFloatToDouble + (cuFloatComplex c) +{ + return make_cuDoubleComplex ((double)cuCrealf(c), (double)cuCimagf(c)); +} + +__host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat +(cuDoubleComplex c) +{ + return make_cuFloatComplex ((float)cuCreal(c), (float)cuCimag(c)); +} + + +__host__ __device__ static __inline__ cuComplex cuCfmaf( cuComplex x, cuComplex y, cuComplex d) +{ + float real_res; + float imag_res; + + real_res = (cuCrealf(x) * cuCrealf(y)) + cuCrealf(d); + imag_res = (cuCrealf(x) * cuCimagf(y)) + cuCimagf(d); + + real_res = -(cuCimagf(x) * cuCimagf(y)) + real_res; + imag_res = (cuCimagf(x) * cuCrealf(y)) + imag_res; + + return make_cuComplex(real_res, imag_res); +} + +__host__ __device__ static __inline__ cuDoubleComplex cuCfma( cuDoubleComplex x, cuDoubleComplex y, cuDoubleComplex d) +{ + double real_res; + double imag_res; + + real_res = (cuCreal(x) * cuCreal(y)) + cuCreal(d); + imag_res = (cuCreal(x) * cuCimag(y)) + cuCimag(d); + + real_res = -(cuCimag(x) * cuCimag(y)) + real_res; + imag_res = (cuCimag(x) * cuCreal(y)) + imag_res; + + return make_cuDoubleComplex(real_res, imag_res); +} + +#endif /* !defined(CU_COMPLEX_H_) */ diff --git a/include/external/CUDA/cublas.h b/include/external/CUDA/cublas.h new file mode 100755 index 000000000..34521c06c --- /dev/null +++ b/include/external/CUDA/cublas.h @@ -0,0 +1,565 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* + * This is the public header file for the CUBLAS library, defining the API + * + * CUBLAS is an implementation of BLAS (Basic Linear Algebra Subroutines) + * on top of the CUDA runtime. + */ + +#if !defined(CUBLAS_H_) +#define CUBLAS_H_ + +#include + +#ifndef CUBLASWINAPI +#ifdef _WIN32 +#define CUBLASWINAPI __stdcall +#else +#define CUBLASWINAPI +#endif +#endif + +#undef CUBLASAPI +#ifdef __CUDACC__ +#define CUBLASAPI __host__ +#else +#define CUBLASAPI +#endif + +#include "cublas_api.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +/* CUBLAS data types */ +#define cublasStatus cublasStatus_t + +cublasStatus CUBLASWINAPI cublasInit (void); +cublasStatus CUBLASWINAPI cublasShutdown (void); +cublasStatus CUBLASWINAPI cublasGetError (void); + +cublasStatus CUBLASWINAPI cublasGetVersion(int *version); +cublasStatus CUBLASWINAPI cublasAlloc (int n, int elemSize, void **devicePtr); + +cublasStatus CUBLASWINAPI cublasFree (void *devicePtr); + + +cublasStatus CUBLASWINAPI cublasSetKernelStream (cudaStream_t stream); + + + +/* ---------------- CUBLAS BLAS1 functions ---------------- */ +/* NRM2 */ +float CUBLASWINAPI cublasSnrm2 (int n, const float *x, int incx); +double CUBLASWINAPI cublasDnrm2 (int n, const double *x, int incx); +float CUBLASWINAPI cublasScnrm2 (int n, const cuComplex *x, int incx); +double CUBLASWINAPI cublasDznrm2 (int n, const cuDoubleComplex *x, int incx); +/*------------------------------------------------------------------------*/ +/* DOT */ +float CUBLASWINAPI cublasSdot (int n, const float *x, int incx, const float *y, + int incy); +double CUBLASWINAPI cublasDdot (int n, const double *x, int incx, const double *y, + int incy); +cuComplex CUBLASWINAPI cublasCdotu (int n, const cuComplex *x, int incx, const cuComplex *y, + int incy); +cuComplex CUBLASWINAPI cublasCdotc (int n, const cuComplex *x, int incx, const cuComplex *y, + int incy); +cuDoubleComplex CUBLASWINAPI cublasZdotu (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, + int incy); +cuDoubleComplex CUBLASWINAPI cublasZdotc (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, + int incy); +/*------------------------------------------------------------------------*/ +/* SCAL */ +void CUBLASWINAPI cublasSscal (int n, float 
alpha, float *x, int incx); +void CUBLASWINAPI cublasDscal (int n, double alpha, double *x, int incx); +void CUBLASWINAPI cublasCscal (int n, cuComplex alpha, cuComplex *x, int incx); +void CUBLASWINAPI cublasZscal (int n, cuDoubleComplex alpha, cuDoubleComplex *x, int incx); + +void CUBLASWINAPI cublasCsscal (int n, float alpha, cuComplex *x, int incx); +void CUBLASWINAPI cublasZdscal (int n, double alpha, cuDoubleComplex *x, int incx); +/*------------------------------------------------------------------------*/ +/* AXPY */ +void CUBLASWINAPI cublasSaxpy (int n, float alpha, const float *x, int incx, + float *y, int incy); +void CUBLASWINAPI cublasDaxpy (int n, double alpha, const double *x, + int incx, double *y, int incy); +void CUBLASWINAPI cublasCaxpy (int n, cuComplex alpha, const cuComplex *x, + int incx, cuComplex *y, int incy); +void CUBLASWINAPI cublasZaxpy (int n, cuDoubleComplex alpha, const cuDoubleComplex *x, + int incx, cuDoubleComplex *y, int incy); +/*------------------------------------------------------------------------*/ +/* COPY */ +void CUBLASWINAPI cublasScopy (int n, const float *x, int incx, float *y, + int incy); +void CUBLASWINAPI cublasDcopy (int n, const double *x, int incx, double *y, + int incy); +void CUBLASWINAPI cublasCcopy (int n, const cuComplex *x, int incx, cuComplex *y, + int incy); +void CUBLASWINAPI cublasZcopy (int n, const cuDoubleComplex *x, int incx, cuDoubleComplex *y, + int incy); +/*------------------------------------------------------------------------*/ +/* SWAP */ +void CUBLASWINAPI cublasSswap (int n, float *x, int incx, float *y, int incy); +void CUBLASWINAPI cublasDswap (int n, double *x, int incx, double *y, int incy); +void CUBLASWINAPI cublasCswap (int n, cuComplex *x, int incx, cuComplex *y, int incy); +void CUBLASWINAPI cublasZswap (int n, cuDoubleComplex *x, int incx, cuDoubleComplex *y, int incy); +/*------------------------------------------------------------------------*/ +/* AMAX */ +int CUBLASWINAPI cublasIsamax (int n, const float *x, int incx); +int CUBLASWINAPI cublasIdamax (int n, const double *x, int incx); +int CUBLASWINAPI cublasIcamax (int n, const cuComplex *x, int incx); +int CUBLASWINAPI cublasIzamax (int n, const cuDoubleComplex *x, int incx); +/*------------------------------------------------------------------------*/ +/* AMIN */ +int CUBLASWINAPI cublasIsamin (int n, const float *x, int incx); +int CUBLASWINAPI cublasIdamin (int n, const double *x, int incx); + +int CUBLASWINAPI cublasIcamin (int n, const cuComplex *x, int incx); +int CUBLASWINAPI cublasIzamin (int n, const cuDoubleComplex *x, int incx); +/*------------------------------------------------------------------------*/ +/* ASUM */ +float CUBLASWINAPI cublasSasum (int n, const float *x, int incx); +double CUBLASWINAPI cublasDasum (int n, const double *x, int incx); +float CUBLASWINAPI cublasScasum (int n, const cuComplex *x, int incx); +double CUBLASWINAPI cublasDzasum (int n, const cuDoubleComplex *x, int incx); +/*------------------------------------------------------------------------*/ +/* ROT */ +void CUBLASWINAPI cublasSrot (int n, float *x, int incx, float *y, int incy, + float sc, float ss); +void CUBLASWINAPI cublasDrot (int n, double *x, int incx, double *y, int incy, + double sc, double ss); +void CUBLASWINAPI cublasCrot (int n, cuComplex *x, int incx, cuComplex *y, + int incy, float c, cuComplex s); +void CUBLASWINAPI cublasZrot (int n, cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy, double sc, + cuDoubleComplex cs); 
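
The ROT declarations above round out the BLAS1 portion of the legacy (v1) API, in which a single implicit context is created by cublasInit() and device storage is managed with cublasAlloc()/cublasFree(). A minimal sketch of how these entry points combine, using cublasSaxpy() as declared above (error handling reduced to a single cublasGetError() check; N is an arbitrary illustrative size):

#include <stdio.h>
#include "cublas.h"

int main(void)
{
    const int N = 1024;
    float hx[1024], hy[1024];
    float *dx, *dy;
    int i;

    for (i = 0; i < N; ++i) { hx[i] = 1.0f; hy[i] = 2.0f; }

    cublasInit();                                      /* create the implicit context */
    cublasAlloc(N, sizeof(float), (void **)&dx);
    cublasAlloc(N, sizeof(float), (void **)&dy);
    cublasSetVector(N, sizeof(float), hx, 1, dx, 1);   /* host -> device */
    cublasSetVector(N, sizeof(float), hy, 1, dy, 1);

    cublasSaxpy(N, 3.0f, dx, 1, dy, 1);                /* y = 3*x + y */
    if (cublasGetError() != CUBLAS_STATUS_SUCCESS)
        fprintf(stderr, "SAXPY failed\n");

    cublasGetVector(N, sizeof(float), dy, 1, hy, 1);   /* device -> host */
    printf("y[0] = %f\n", hy[0]);                      /* expect 5.0 */

    cublasFree(dx);
    cublasFree(dy);
    cublasShutdown();
    return 0;
}

Note that the v1 BLAS1 calls return no status directly; errors are retrieved after the fact with cublasGetError(), which is one reason the handle-based _v2 API in cublas_api.h supersedes this interface.
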
+void CUBLASWINAPI cublasCsrot (int n, cuComplex *x, int incx, cuComplex *y, + int incy, float c, float s); +void CUBLASWINAPI cublasZdrot (int n, cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy, double c, double s); +/*------------------------------------------------------------------------*/ +/* ROTG */ +void CUBLASWINAPI cublasSrotg (float *sa, float *sb, float *sc, float *ss); +void CUBLASWINAPI cublasDrotg (double *sa, double *sb, double *sc, double *ss); +void CUBLASWINAPI cublasCrotg (cuComplex *ca, cuComplex cb, float *sc, + cuComplex *cs); +void CUBLASWINAPI cublasZrotg (cuDoubleComplex *ca, cuDoubleComplex cb, double *sc, + cuDoubleComplex *cs); +/*------------------------------------------------------------------------*/ +/* ROTM */ +void CUBLASWINAPI cublasSrotm(int n, float *x, int incx, float *y, int incy, + const float* sparam); +void CUBLASWINAPI cublasDrotm(int n, double *x, int incx, double *y, int incy, + const double* sparam); +/*------------------------------------------------------------------------*/ +/* ROTMG */ +void CUBLASWINAPI cublasSrotmg (float *sd1, float *sd2, float *sx1, + const float *sy1, float* sparam); +void CUBLASWINAPI cublasDrotmg (double *sd1, double *sd2, double *sx1, + const double *sy1, double* sparam); + +/* --------------- CUBLAS BLAS2 functions ---------------- */ +/* GEMV */ +void CUBLASWINAPI cublasSgemv (char trans, int m, int n, float alpha, + const float *A, int lda, const float *x, int incx, + float beta, float *y, int incy); +void CUBLASWINAPI cublasDgemv (char trans, int m, int n, double alpha, + const double *A, int lda, const double *x, int incx, + double beta, double *y, int incy); +void CUBLASWINAPI cublasCgemv (char trans, int m, int n, cuComplex alpha, + const cuComplex *A, int lda, const cuComplex *x, int incx, + cuComplex beta, cuComplex *y, int incy); +void CUBLASWINAPI cublasZgemv (char trans, int m, int n, cuDoubleComplex alpha, + const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, + cuDoubleComplex beta, cuDoubleComplex *y, int incy); +/*------------------------------------------------------------------------*/ +/* GBMV */ +void CUBLASWINAPI cublasSgbmv (char trans, int m, int n, int kl, int ku, + float alpha, const float *A, int lda, + const float *x, int incx, float beta, float *y, + int incy); +void CUBLASWINAPI cublasDgbmv (char trans, int m, int n, int kl, int ku, + double alpha, const double *A, int lda, + const double *x, int incx, double beta, double *y, + int incy); +void CUBLASWINAPI cublasCgbmv (char trans, int m, int n, int kl, int ku, + cuComplex alpha, const cuComplex *A, int lda, + const cuComplex *x, int incx, cuComplex beta, cuComplex *y, + int incy); +void CUBLASWINAPI cublasZgbmv (char trans, int m, int n, int kl, int ku, + cuDoubleComplex alpha, const cuDoubleComplex *A, int lda, + const cuDoubleComplex *x, int incx, cuDoubleComplex beta, cuDoubleComplex *y, + int incy); +/*------------------------------------------------------------------------*/ +/* TRMV */ +void CUBLASWINAPI cublasStrmv (char uplo, char trans, char diag, int n, + const float *A, int lda, float *x, int incx); +void CUBLASWINAPI cublasDtrmv (char uplo, char trans, char diag, int n, + const double *A, int lda, double *x, int incx); +void CUBLASWINAPI cublasCtrmv (char uplo, char trans, char diag, int n, + const cuComplex *A, int lda, cuComplex *x, int incx); +void CUBLASWINAPI cublasZtrmv (char uplo, char trans, char diag, int n, + const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx); 
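
The BLAS2 routines in this header follow the same Fortran conventions as the rest of the v1 API: matrices are stored column-major and options such as trans, uplo, and diag are passed as single characters. As a hedged sketch of the cublasSgemv() declaration above, assuming dA, dx, and dy are device pointers obtained via cublasAlloc() and already populated:

/* y = alpha*A*x + beta*y for a column-major M-by-N matrix A (lda == M). */
int M = 64, N = 32;
float *dA, *dx, *dy;   /* device buffers: M*N, N, and M elements respectively */
/* 'n' selects no transpose; 't' would compute y = alpha*A^T*x + beta*y. */
cublasSgemv('n', M, N, 1.0f, dA, M, dx, 1, 0.0f, dy, 1);
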
+/*------------------------------------------------------------------------*/ +/* TBMV */ +void CUBLASWINAPI cublasStbmv (char uplo, char trans, char diag, int n, int k, + const float *A, int lda, float *x, int incx); +void CUBLASWINAPI cublasDtbmv (char uplo, char trans, char diag, int n, int k, + const double *A, int lda, double *x, int incx); +void CUBLASWINAPI cublasCtbmv (char uplo, char trans, char diag, int n, int k, + const cuComplex *A, int lda, cuComplex *x, int incx); +void CUBLASWINAPI cublasZtbmv (char uplo, char trans, char diag, int n, int k, + const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx); +/*------------------------------------------------------------------------*/ +/* TPMV */ +void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, const float *AP, float *x, int incx); + +void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx); + +void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx); + +void CUBLASWINAPI cublasZtpmv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP, cuDoubleComplex *x, int incx); +/*------------------------------------------------------------------------*/ +/* TRSV */ +void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, const float *A, int lda, float *x, int incx); + +void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, const double *A, int lda, double *x, int incx); + +void CUBLASWINAPI cublasCtrsv(char uplo, char trans, char diag, int n, const cuComplex *A, int lda, cuComplex *x, int incx); + +void CUBLASWINAPI cublasZtrsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *A, int lda, + cuDoubleComplex *x, int incx); +/*------------------------------------------------------------------------*/ +/* TPSV */ +void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, const float *AP, + float *x, int incx); + +void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx); + +void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx); + +void CUBLASWINAPI cublasZtpsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP, + cuDoubleComplex *x, int incx); +/*------------------------------------------------------------------------*/ +/* TBSV */ +void CUBLASWINAPI cublasStbsv(char uplo, char trans, + char diag, int n, int k, const float *A, + int lda, float *x, int incx); + +void CUBLASWINAPI cublasDtbsv(char uplo, char trans, + char diag, int n, int k, const double *A, + int lda, double *x, int incx); +void CUBLASWINAPI cublasCtbsv(char uplo, char trans, + char diag, int n, int k, const cuComplex *A, + int lda, cuComplex *x, int incx); + +void CUBLASWINAPI cublasZtbsv(char uplo, char trans, + char diag, int n, int k, const cuDoubleComplex *A, + int lda, cuDoubleComplex *x, int incx); +/*------------------------------------------------------------------------*/ +/* SYMV/HEMV */ +void CUBLASWINAPI cublasSsymv (char uplo, int n, float alpha, const float *A, + int lda, const float *x, int incx, float beta, + float *y, int incy); +void CUBLASWINAPI cublasDsymv (char uplo, int n, double alpha, const double *A, + int lda, const double *x, int incx, double beta, + double *y, int incy); +void CUBLASWINAPI cublasChemv (char uplo, int n, cuComplex alpha, const cuComplex *A, + int lda, const cuComplex *x, int incx, cuComplex beta, 
+ cuComplex *y, int incy); +void CUBLASWINAPI cublasZhemv (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *A, + int lda, const cuDoubleComplex *x, int incx, cuDoubleComplex beta, + cuDoubleComplex *y, int incy); +/*------------------------------------------------------------------------*/ +/* SBMV/HBMV */ +void CUBLASWINAPI cublasSsbmv (char uplo, int n, int k, float alpha, + const float *A, int lda, const float *x, int incx, + float beta, float *y, int incy); +void CUBLASWINAPI cublasDsbmv (char uplo, int n, int k, double alpha, + const double *A, int lda, const double *x, int incx, + double beta, double *y, int incy); +void CUBLASWINAPI cublasChbmv (char uplo, int n, int k, cuComplex alpha, + const cuComplex *A, int lda, const cuComplex *x, int incx, + cuComplex beta, cuComplex *y, int incy); +void CUBLASWINAPI cublasZhbmv (char uplo, int n, int k, cuDoubleComplex alpha, + const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, + cuDoubleComplex beta, cuDoubleComplex *y, int incy); +/*------------------------------------------------------------------------*/ +/* SPMV/HPMV */ +void CUBLASWINAPI cublasSspmv(char uplo, int n, float alpha, + const float *AP, const float *x, + int incx, float beta, float *y, int incy); +void CUBLASWINAPI cublasDspmv(char uplo, int n, double alpha, + const double *AP, const double *x, + int incx, double beta, double *y, int incy); +void CUBLASWINAPI cublasChpmv(char uplo, int n, cuComplex alpha, + const cuComplex *AP, const cuComplex *x, + int incx, cuComplex beta, cuComplex *y, int incy); +void CUBLASWINAPI cublasZhpmv(char uplo, int n, cuDoubleComplex alpha, + const cuDoubleComplex *AP, const cuDoubleComplex *x, + int incx, cuDoubleComplex beta, cuDoubleComplex *y, int incy); + +/*------------------------------------------------------------------------*/ +/* GER */ +void CUBLASWINAPI cublasSger (int m, int n, float alpha, const float *x, int incx, + const float *y, int incy, float *A, int lda); +void CUBLASWINAPI cublasDger (int m, int n, double alpha, const double *x, int incx, + const double *y, int incy, double *A, int lda); + +void CUBLASWINAPI cublasCgeru (int m, int n, cuComplex alpha, const cuComplex *x, + int incx, const cuComplex *y, int incy, + cuComplex *A, int lda); +void CUBLASWINAPI cublasCgerc (int m, int n, cuComplex alpha, const cuComplex *x, + int incx, const cuComplex *y, int incy, + cuComplex *A, int lda); +void CUBLASWINAPI cublasZgeru (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x, + int incx, const cuDoubleComplex *y, int incy, + cuDoubleComplex *A, int lda); +void CUBLASWINAPI cublasZgerc (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x, + int incx, const cuDoubleComplex *y, int incy, + cuDoubleComplex *A, int lda); +/*------------------------------------------------------------------------*/ +/* SYR/HER */ +void CUBLASWINAPI cublasSsyr (char uplo, int n, float alpha, const float *x, + int incx, float *A, int lda); +void CUBLASWINAPI cublasDsyr (char uplo, int n, double alpha, const double *x, + int incx, double *A, int lda); + +void CUBLASWINAPI cublasCher (char uplo, int n, float alpha, + const cuComplex *x, int incx, cuComplex *A, int lda); +void CUBLASWINAPI cublasZher (char uplo, int n, double alpha, + const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda); + +/*------------------------------------------------------------------------*/ +/* SPR/HPR */ +void CUBLASWINAPI cublasSspr (char uplo, int n, float alpha, const float *x, + int incx, float *AP); +void 
CUBLASWINAPI cublasDspr (char uplo, int n, double alpha, const double *x, + int incx, double *AP); +void CUBLASWINAPI cublasChpr (char uplo, int n, float alpha, const cuComplex *x, + int incx, cuComplex *AP); +void CUBLASWINAPI cublasZhpr (char uplo, int n, double alpha, const cuDoubleComplex *x, + int incx, cuDoubleComplex *AP); +/*------------------------------------------------------------------------*/ +/* SYR2/HER2 */ +void CUBLASWINAPI cublasSsyr2 (char uplo, int n, float alpha, const float *x, + int incx, const float *y, int incy, float *A, + int lda); +void CUBLASWINAPI cublasDsyr2 (char uplo, int n, double alpha, const double *x, + int incx, const double *y, int incy, double *A, + int lda); +void CUBLASWINAPI cublasCher2 (char uplo, int n, cuComplex alpha, const cuComplex *x, + int incx, const cuComplex *y, int incy, cuComplex *A, + int lda); +void CUBLASWINAPI cublasZher2 (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *x, + int incx, const cuDoubleComplex *y, int incy, cuDoubleComplex *A, + int lda); + +/*------------------------------------------------------------------------*/ +/* SPR2/HPR2 */ +void CUBLASWINAPI cublasSspr2 (char uplo, int n, float alpha, const float *x, + int incx, const float *y, int incy, float *AP); +void CUBLASWINAPI cublasDspr2 (char uplo, int n, double alpha, + const double *x, int incx, const double *y, + int incy, double *AP); +void CUBLASWINAPI cublasChpr2 (char uplo, int n, cuComplex alpha, + const cuComplex *x, int incx, const cuComplex *y, + int incy, cuComplex *AP); +void CUBLASWINAPI cublasZhpr2 (char uplo, int n, cuDoubleComplex alpha, + const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, + int incy, cuDoubleComplex *AP); +/* ------------------------BLAS3 Functions ------------------------------- */ +/* GEMM */ +void CUBLASWINAPI cublasSgemm (char transa, char transb, int m, int n, int k, + float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, + int ldc); +void CUBLASWINAPI cublasDgemm (char transa, char transb, int m, int n, int k, + double alpha, const double *A, int lda, + const double *B, int ldb, double beta, double *C, + int ldc); +void CUBLASWINAPI cublasCgemm (char transa, char transb, int m, int n, int k, + cuComplex alpha, const cuComplex *A, int lda, + const cuComplex *B, int ldb, cuComplex beta, + cuComplex *C, int ldc); +void CUBLASWINAPI cublasZgemm (char transa, char transb, int m, int n, + int k, cuDoubleComplex alpha, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *B, int ldb, + cuDoubleComplex beta, cuDoubleComplex *C, + int ldc); +/* -------------------------------------------------------*/ +/* SYRK */ +void CUBLASWINAPI cublasSsyrk (char uplo, char trans, int n, int k, float alpha, + const float *A, int lda, float beta, float *C, + int ldc); +void CUBLASWINAPI cublasDsyrk (char uplo, char trans, int n, int k, + double alpha, const double *A, int lda, + double beta, double *C, int ldc); + +void CUBLASWINAPI cublasCsyrk (char uplo, char trans, int n, int k, + cuComplex alpha, const cuComplex *A, int lda, + cuComplex beta, cuComplex *C, int ldc); +void CUBLASWINAPI cublasZsyrk (char uplo, char trans, int n, int k, + cuDoubleComplex alpha, + const cuDoubleComplex *A, int lda, + cuDoubleComplex beta, + cuDoubleComplex *C, int ldc); +/* ------------------------------------------------------- */ +/* HERK */ +void CUBLASWINAPI cublasCherk (char uplo, char trans, int n, int k, + float alpha, const cuComplex *A, int lda, + float beta, cuComplex *C, int ldc); 
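
cublasSgemm() above computes C = alpha*op(A)*op(B) + beta*C, with op() chosen independently for A and B by the transa/transb characters. A short sketch under the usual assumptions (device pointers, column-major storage, leading dimensions equal to the row counts of the stored matrices):

/* C (MxN) = 1.0 * A (MxK) * B (KxN) + 0.0 * C, no transposition. */
int M = 128, N = 128, K = 128;
float *dA, *dB, *dC;   /* device buffers of M*K, K*N, and M*N floats */
cublasSgemm('n', 'n', M, N, K, 1.0f, dA, M, dB, K, 0.0f, dC, M);

With beta == 0.0f the prior contents of C are ignored, so dC need not be initialized before the call.
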
+void CUBLASWINAPI cublasZherk (char uplo, char trans, int n, int k, + double alpha, + const cuDoubleComplex *A, int lda, + double beta, + cuDoubleComplex *C, int ldc); +/* ------------------------------------------------------- */ +/* SYR2K */ +void CUBLASWINAPI cublasSsyr2k (char uplo, char trans, int n, int k, float alpha, + const float *A, int lda, const float *B, int ldb, + float beta, float *C, int ldc); + +void CUBLASWINAPI cublasDsyr2k (char uplo, char trans, int n, int k, + double alpha, const double *A, int lda, + const double *B, int ldb, double beta, + double *C, int ldc); +void CUBLASWINAPI cublasCsyr2k (char uplo, char trans, int n, int k, + cuComplex alpha, const cuComplex *A, int lda, + const cuComplex *B, int ldb, cuComplex beta, + cuComplex *C, int ldc); + +void CUBLASWINAPI cublasZsyr2k (char uplo, char trans, int n, int k, + cuDoubleComplex alpha, const cuDoubleComplex *A, int lda, + const cuDoubleComplex *B, int ldb, cuDoubleComplex beta, + cuDoubleComplex *C, int ldc); +/* ------------------------------------------------------- */ +/* HER2K */ +void CUBLASWINAPI cublasCher2k (char uplo, char trans, int n, int k, + cuComplex alpha, const cuComplex *A, int lda, + const cuComplex *B, int ldb, float beta, + cuComplex *C, int ldc); + +void CUBLASWINAPI cublasZher2k (char uplo, char trans, int n, int k, + cuDoubleComplex alpha, const cuDoubleComplex *A, int lda, + const cuDoubleComplex *B, int ldb, double beta, + cuDoubleComplex *C, int ldc); + +/*------------------------------------------------------------------------*/ +/* SYMM*/ +void CUBLASWINAPI cublasSsymm (char side, char uplo, int m, int n, float alpha, + const float *A, int lda, const float *B, int ldb, + float beta, float *C, int ldc); +void CUBLASWINAPI cublasDsymm (char side, char uplo, int m, int n, double alpha, + const double *A, int lda, const double *B, int ldb, + double beta, double *C, int ldc); + +void CUBLASWINAPI cublasCsymm (char side, char uplo, int m, int n, cuComplex alpha, + const cuComplex *A, int lda, const cuComplex *B, int ldb, + cuComplex beta, cuComplex *C, int ldc); + +void CUBLASWINAPI cublasZsymm (char side, char uplo, int m, int n, cuDoubleComplex alpha, + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, + cuDoubleComplex beta, cuDoubleComplex *C, int ldc); +/*------------------------------------------------------------------------*/ +/* HEMM*/ +void CUBLASWINAPI cublasChemm (char side, char uplo, int m, int n, + cuComplex alpha, const cuComplex *A, int lda, + const cuComplex *B, int ldb, cuComplex beta, + cuComplex *C, int ldc); +void CUBLASWINAPI cublasZhemm (char side, char uplo, int m, int n, + cuDoubleComplex alpha, const cuDoubleComplex *A, int lda, + const cuDoubleComplex *B, int ldb, cuDoubleComplex beta, + cuDoubleComplex *C, int ldc); + +/*------------------------------------------------------------------------*/ +/* TRSM*/ +void CUBLASWINAPI cublasStrsm (char side, char uplo, char transa, char diag, + int m, int n, float alpha, const float *A, int lda, + float *B, int ldb); + +void CUBLASWINAPI cublasDtrsm (char side, char uplo, char transa, + char diag, int m, int n, double alpha, + const double *A, int lda, double *B, + int ldb); + +void CUBLASWINAPI cublasCtrsm (char side, char uplo, char transa, char diag, + int m, int n, cuComplex alpha, const cuComplex *A, + int lda, cuComplex *B, int ldb); + +void CUBLASWINAPI cublasZtrsm (char side, char uplo, char transa, + char diag, int m, int n, cuDoubleComplex alpha, + const cuDoubleComplex *A, int lda, + 
cuDoubleComplex *B, int ldb); +/*------------------------------------------------------------------------*/ +/* TRMM*/ +void CUBLASWINAPI cublasStrmm (char side, char uplo, char transa, char diag, + int m, int n, float alpha, const float *A, int lda, + float *B, int ldb); +void CUBLASWINAPI cublasDtrmm (char side, char uplo, char transa, + char diag, int m, int n, double alpha, + const double *A, int lda, double *B, + int ldb); +void CUBLASWINAPI cublasCtrmm (char side, char uplo, char transa, char diag, + int m, int n, cuComplex alpha, const cuComplex *A, + int lda, cuComplex *B, int ldb); +void CUBLASWINAPI cublasZtrmm (char side, char uplo, char transa, + char diag, int m, int n, cuDoubleComplex alpha, + const cuDoubleComplex *A, int lda, cuDoubleComplex *B, + int ldb); + +#if defined(__cplusplus) +} +#endif /* __cplusplus */ + +#endif /* !defined(CUBLAS_H_) */ diff --git a/include/external/CUDA/cublas_api.h b/include/external/CUDA/cublas_api.h new file mode 100755 index 000000000..ff89141d0 --- /dev/null +++ b/include/external/CUDA/cublas_api.h @@ -0,0 +1,2977 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. 
+ * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* + * This is the public header file for the CUBLAS library, defining the API + * + * CUBLAS is an implementation of BLAS (Basic Linear Algebra Subroutines) + * on top of the CUDA runtime. + */ + +#if !defined(CUBLAS_API_H_) +#define CUBLAS_API_H_ + +#ifndef CUBLASWINAPI +#ifdef _WIN32 +#define CUBLASWINAPI __stdcall +#else +#define CUBLASWINAPI +#endif +#endif + +#ifndef CUBLASAPI +#error "This file should not be included without defining CUBLASAPI" +#endif + +#include "driver_types.h" +#include "cuComplex.h" /* import complex data type */ + +#include "cuda_fp16.h" + +#include "library_types.h" + + +#if defined(__cplusplus) +extern "C" { +#endif /* __cplusplus */ + +/* CUBLAS status type returns */ +typedef enum{ + CUBLAS_STATUS_SUCCESS =0, + CUBLAS_STATUS_NOT_INITIALIZED =1, + CUBLAS_STATUS_ALLOC_FAILED =3, + CUBLAS_STATUS_INVALID_VALUE =7, + CUBLAS_STATUS_ARCH_MISMATCH =8, + CUBLAS_STATUS_MAPPING_ERROR =11, + CUBLAS_STATUS_EXECUTION_FAILED=13, + CUBLAS_STATUS_INTERNAL_ERROR =14, + CUBLAS_STATUS_NOT_SUPPORTED =15, + CUBLAS_STATUS_LICENSE_ERROR =16 +} cublasStatus_t; + + +typedef enum { + CUBLAS_FILL_MODE_LOWER=0, + CUBLAS_FILL_MODE_UPPER=1 +} cublasFillMode_t; + +typedef enum { + CUBLAS_DIAG_NON_UNIT=0, + CUBLAS_DIAG_UNIT=1 +} cublasDiagType_t; + +typedef enum { + CUBLAS_SIDE_LEFT =0, + CUBLAS_SIDE_RIGHT=1 +} cublasSideMode_t; + + +typedef enum { + CUBLAS_OP_N=0, + CUBLAS_OP_T=1, + CUBLAS_OP_C=2 +} cublasOperation_t; + + +typedef enum { + CUBLAS_POINTER_MODE_HOST = 0, + CUBLAS_POINTER_MODE_DEVICE = 1 +} cublasPointerMode_t; + +typedef enum { + CUBLAS_ATOMICS_NOT_ALLOWED = 0, + CUBLAS_ATOMICS_ALLOWED = 1 +} cublasAtomicsMode_t; + +/*For different GEMM algorithm */ +typedef enum { + CUBLAS_GEMM_DFALT = -1, + CUBLAS_GEMM_DEFAULT = -1, + CUBLAS_GEMM_ALGO0 = 0, + CUBLAS_GEMM_ALGO1 = 1, + CUBLAS_GEMM_ALGO2 = 2, + CUBLAS_GEMM_ALGO3 = 3, + CUBLAS_GEMM_ALGO4 = 4, + CUBLAS_GEMM_ALGO5 = 5, + CUBLAS_GEMM_ALGO6 = 6, + CUBLAS_GEMM_ALGO7 = 7, + CUBLAS_GEMM_ALGO8 = 8, + CUBLAS_GEMM_ALGO9 = 9, + CUBLAS_GEMM_ALGO10 = 10, + CUBLAS_GEMM_ALGO11 = 11, + CUBLAS_GEMM_ALGO12 = 12, + CUBLAS_GEMM_ALGO13 = 13, + CUBLAS_GEMM_ALGO14 = 14, + CUBLAS_GEMM_ALGO15 = 15, + CUBLAS_GEMM_ALGO16 = 16, + CUBLAS_GEMM_ALGO17 = 17, + CUBLAS_GEMM_DEFAULT_TENSOR_OP = 99, + CUBLAS_GEMM_DFALT_TENSOR_OP = 99, + CUBLAS_GEMM_ALGO0_TENSOR_OP = 100, + CUBLAS_GEMM_ALGO1_TENSOR_OP = 101, + CUBLAS_GEMM_ALGO2_TENSOR_OP = 102, + CUBLAS_GEMM_ALGO3_TENSOR_OP = 103, + CUBLAS_GEMM_ALGO4_TENSOR_OP = 104 +} cublasGemmAlgo_t; + +/*Enum for default math mode/tensor operation*/ +typedef enum { + CUBLAS_DEFAULT_MATH = 0, + CUBLAS_TENSOR_OP_MATH = 1 +} cublasMath_t; + +/* For backward compatibility purposes */ +typedef cudaDataType cublasDataType_t; + +/* Opaque structure holding CUBLAS library context */ +struct cublasContext; +typedef struct cublasContext *cublasHandle_t; + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCreate_v2 (cublasHandle_t *handle); +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDestroy_v2 (cublasHandle_t handle); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetVersion_v2(cublasHandle_t handle, int *version); +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetProperty(libraryPropertyType type, int *value); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetStream_v2 (cublasHandle_t handle, cudaStream_t 
streamId); +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetStream_v2 (cublasHandle_t handle, cudaStream_t *streamId); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetPointerMode_v2 (cublasHandle_t handle, cublasPointerMode_t *mode); +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetPointerMode_v2 (cublasHandle_t handle, cublasPointerMode_t mode); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t *mode); +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t mode); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetMathMode(cublasHandle_t handle, cublasMath_t *mode); +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetMathMode(cublasHandle_t handle, cublasMath_t mode); + +/* + * cublasStatus_t + * cublasSetVector (int n, int elemSize, const void *x, int incx, + * void *y, int incy) + * + * copies n elements from a vector x in CPU memory space to a vector y + * in GPU memory space. Elements in both vectors are assumed to have a + * size of elemSize bytes. Storage spacing between consecutive elements + * is incx for the source vector x and incy for the destination vector + * y. In general, y points to an object, or part of an object, allocated + * via cublasAlloc(). Column major format for two-dimensional matrices + * is assumed throughout CUBLAS. Therefore, if the increment for a vector + * is equal to 1, this access a column vector while using an increment + * equal to the leading dimension of the respective matrix accesses a + * row vector. + * + * Return Values + * ------------- + * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library not been initialized + * CUBLAS_STATUS_INVALID_VALUE if incx, incy, or elemSize <= 0 + * CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory + * CUBLAS_STATUS_SUCCESS if the operation completed successfully + */ +cublasStatus_t CUBLASWINAPI cublasSetVector (int n, int elemSize, const void *x, + int incx, void *devicePtr, int incy); + +/* + * cublasStatus_t + * cublasGetVector (int n, int elemSize, const void *x, int incx, + * void *y, int incy) + * + * copies n elements from a vector x in GPU memory space to a vector y + * in CPU memory space. Elements in both vectors are assumed to have a + * size of elemSize bytes. Storage spacing between consecutive elements + * is incx for the source vector x and incy for the destination vector + * y. In general, x points to an object, or part of an object, allocated + * via cublasAlloc(). Column major format for two-dimensional matrices + * is assumed throughout CUBLAS. Therefore, if the increment for a vector + * is equal to 1, this access a column vector while using an increment + * equal to the leading dimension of the respective matrix accesses a + * row vector. + * + * Return Values + * ------------- + * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library not been initialized + * CUBLAS_STATUS_INVALID_VALUE if incx, incy, or elemSize <= 0 + * CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory + * CUBLAS_STATUS_SUCCESS if the operation completed successfully + */ +cublasStatus_t CUBLASWINAPI cublasGetVector (int n, int elemSize, const void *x, + int incx, void *y, int incy); + +/* + * cublasStatus_t + * cublasSetMatrix (int rows, int cols, int elemSize, const void *A, + * int lda, void *B, int ldb) + * + * copies a tile of rows x cols elements from a matrix A in CPU memory + * space to a matrix B in GPU memory space. Each element requires storage + * of elemSize bytes. 
Both matrices are assumed to be stored in column + * major format, with the leading dimension (i.e. number of rows) of + * source matrix A provided in lda, and the leading dimension of matrix B + * provided in ldb. In general, B points to an object, or part of an + * object, that was allocated via cublasAlloc(). + * + * Return Values + * ------------- + * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized + * CUBLAS_STATUS_INVALID_VALUE if rows or cols < 0, or elemSize, lda, or + * ldb <= 0 + * CUBLAS_STATUS_MAPPING_ERROR if error occurred accessing GPU memory + * CUBLAS_STATUS_SUCCESS if the operation completed successfully + */ +cublasStatus_t CUBLASWINAPI cublasSetMatrix (int rows, int cols, int elemSize, + const void *A, int lda, void *B, + int ldb); + +/* + * cublasStatus_t + * cublasGetMatrix (int rows, int cols, int elemSize, const void *A, + * int lda, void *B, int ldb) + * + * copies a tile of rows x cols elements from a matrix A in GPU memory + * space to a matrix B in CPU memory space. Each element requires storage + * of elemSize bytes. Both matrices are assumed to be stored in column + * major format, with the leading dimension (i.e. number of rows) of + * source matrix A provided in lda, and the leading dimension of matrix B + * provided in ldb. In general, A points to an object, or part of an + * object, that was allocated via cublasAlloc(). + * + * Return Values + * ------------- + * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized + * CUBLAS_STATUS_INVALID_VALUE if rows, cols, eleSize, lda, or ldb <= 0 + * CUBLAS_STATUS_MAPPING_ERROR if error occurred accessing GPU memory + * CUBLAS_STATUS_SUCCESS if the operation completed successfully + */ +cublasStatus_t CUBLASWINAPI cublasGetMatrix (int rows, int cols, int elemSize, + const void *A, int lda, void *B, + int ldb); + +/* + * cublasStatus + * cublasSetVectorAsync ( int n, int elemSize, const void *x, int incx, + * void *y, int incy, cudaStream_t stream ); + * + * cublasSetVectorAsync has the same functionnality as cublasSetVector + * but the transfer is done asynchronously within the CUDA stream passed + * in parameter. + * + * Return Values + * ------------- + * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library not been initialized + * CUBLAS_STATUS_INVALID_VALUE if incx, incy, or elemSize <= 0 + * CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory + * CUBLAS_STATUS_SUCCESS if the operation completed successfully + */ +cublasStatus_t CUBLASWINAPI cublasSetVectorAsync (int n, int elemSize, + const void *hostPtr, int incx, + void *devicePtr, int incy, + cudaStream_t stream); +/* + * cublasStatus + * cublasGetVectorAsync( int n, int elemSize, const void *x, int incx, + * void *y, int incy, cudaStream_t stream) + * + * cublasGetVectorAsync has the same functionnality as cublasGetVector + * but the transfer is done asynchronously within the CUDA stream passed + * in parameter. 
+ * + * Return Values + * ------------- + * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library not been initialized + * CUBLAS_STATUS_INVALID_VALUE if incx, incy, or elemSize <= 0 + * CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory + * CUBLAS_STATUS_SUCCESS if the operation completed successfully + */ +cublasStatus_t CUBLASWINAPI cublasGetVectorAsync (int n, int elemSize, + const void *devicePtr, int incx, + void *hostPtr, int incy, + cudaStream_t stream); + +/* + * cublasStatus_t + * cublasSetMatrixAsync (int rows, int cols, int elemSize, const void *A, + * int lda, void *B, int ldb, cudaStream_t stream) + * + * cublasSetMatrixAsync has the same functionnality as cublasSetMatrix + * but the transfer is done asynchronously within the CUDA stream passed + * in parameter. + * + * Return Values + * ------------- + * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized + * CUBLAS_STATUS_INVALID_VALUE if rows or cols < 0, or elemSize, lda, or + * ldb <= 0 + * CUBLAS_STATUS_MAPPING_ERROR if error occurred accessing GPU memory + * CUBLAS_STATUS_SUCCESS if the operation completed successfully + */ +cublasStatus_t CUBLASWINAPI cublasSetMatrixAsync (int rows, int cols, int elemSize, + const void *A, int lda, void *B, + int ldb, cudaStream_t stream); + +/* + * cublasStatus_t + * cublasGetMatrixAsync (int rows, int cols, int elemSize, const void *A, + * int lda, void *B, int ldb, cudaStream_t stream) + * + * cublasGetMatrixAsync has the same functionnality as cublasGetMatrix + * but the transfer is done asynchronously within the CUDA stream passed + * in parameter. + * + * Return Values + * ------------- + * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized + * CUBLAS_STATUS_INVALID_VALUE if rows, cols, eleSize, lda, or ldb <= 0 + * CUBLAS_STATUS_MAPPING_ERROR if error occurred accessing GPU memory + * CUBLAS_STATUS_SUCCESS if the operation completed successfully + */ +cublasStatus_t CUBLASWINAPI cublasGetMatrixAsync (int rows, int cols, int elemSize, + const void *A, int lda, void *B, + int ldb, cudaStream_t stream); + + +CUBLASAPI void CUBLASWINAPI cublasXerbla (const char *srName, int info); +/* ---------------- CUBLAS BLAS1 functions ---------------- */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasNrm2Ex(cublasHandle_t handle, + int n, + const void *x, + cudaDataType xType, + int incx, + void *result, + cudaDataType resultType, + cudaDataType executionType); /* host or device pointer */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSnrm2_v2(cublasHandle_t handle, + int n, + const float *x, + int incx, + float *result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDnrm2_v2(cublasHandle_t handle, + int n, + const double *x, + int incx, + double *result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasScnrm2_v2(cublasHandle_t handle, + int n, + const cuComplex *x, + int incx, + float *result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDznrm2_v2(cublasHandle_t handle, + int n, + const cuDoubleComplex *x, + int incx, + double *result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDotEx (cublasHandle_t handle, + int n, + const void *x, + cudaDataType xType, + int incx, + const void *y, + cudaDataType yType, + int incy, + void *result, + cudaDataType resultType, + cudaDataType executionType); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDotcEx (cublasHandle_t handle, + int n, + const void *x, + cudaDataType xType, + int 
incx, + const void *y, + cudaDataType yType, + int incy, + void *result, + cudaDataType resultType, + cudaDataType executionType); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSdot_v2 (cublasHandle_t handle, + int n, + const float *x, + int incx, + const float *y, + int incy, + float *result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDdot_v2 (cublasHandle_t handle, + int n, + const double *x, + int incx, + const double *y, + int incy, + double *result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCdotu_v2 (cublasHandle_t handle, + int n, + const cuComplex *x, + int incx, + const cuComplex *y, + int incy, + cuComplex *result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCdotc_v2 (cublasHandle_t handle, + int n, + const cuComplex *x, + int incx, + const cuComplex *y, + int incy, + cuComplex *result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdotu_v2 (cublasHandle_t handle, + int n, + const cuDoubleComplex *x, + int incx, + const cuDoubleComplex *y, + int incy, + cuDoubleComplex *result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdotc_v2 (cublasHandle_t handle, + int n, + const cuDoubleComplex *x, + int incx, + const cuDoubleComplex *y, + int incy, + cuDoubleComplex *result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasScalEx(cublasHandle_t handle, + int n, + const void *alpha, /* host or device pointer */ + cudaDataType alphaType, + void *x, + cudaDataType xType, + int incx, + cudaDataType executionType); +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSscal_v2(cublasHandle_t handle, + int n, + const float *alpha, /* host or device pointer */ + float *x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDscal_v2(cublasHandle_t handle, + int n, + const double *alpha, /* host or device pointer */ + double *x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCscal_v2(cublasHandle_t handle, + int n, + const cuComplex *alpha, /* host or device pointer */ + cuComplex *x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsscal_v2(cublasHandle_t handle, + int n, + const float *alpha, /* host or device pointer */ + cuComplex *x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZscal_v2(cublasHandle_t handle, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + cuDoubleComplex *x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdscal_v2(cublasHandle_t handle, + int n, + const double *alpha, /* host or device pointer */ + cuDoubleComplex *x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasAxpyEx (cublasHandle_t handle, + int n, + const void *alpha, /* host or device pointer */ + cudaDataType alphaType, + const void *x, + cudaDataType xType, + int incx, + void *y, + cudaDataType yType, + int incy, + cudaDataType executiontype); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSaxpy_v2 (cublasHandle_t handle, + int n, + const float *alpha, /* host or device pointer */ + const float *x, + int incx, + float *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDaxpy_v2 (cublasHandle_t handle, + int n, + const double *alpha, /* host or device pointer */ + const double *x, + int incx, + double *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCaxpy_v2 (cublasHandle_t handle, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, + int incx, + cuComplex *y, + int incy); + 
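
Unlike the legacy entry points in cublas.h, every _v2 routine here threads an explicit cublasHandle_t, returns a cublasStatus_t, and takes scalars such as alpha by pointer, so they may reside on the host or the device according to the pointer mode. A minimal sketch of the handle-based flow around cublasSaxpy_v2() as declared above (dx and dy are device vectors prepared elsewhere, e.g. with cublasSetVector()):

cublasHandle_t handle;
float alpha = 3.0f;        /* host-resident scalar */
float *dx, *dy;            /* device vectors of length n, filled elsewhere */
int n = 1024;

cublasCreate_v2(&handle);
/* Host pointer mode: alpha is dereferenced from CPU memory at call time. */
cublasSetPointerMode_v2(handle, CUBLAS_POINTER_MODE_HOST);
cublasSaxpy_v2(handle, n, &alpha, dx, 1, dy, 1);   /* y = alpha*x + y */
cublasDestroy_v2(handle);

In CUBLAS_POINTER_MODE_DEVICE the same call would instead read alpha from GPU memory, which lets the scalar be produced by a preceding kernel or cuBLAS call without a round trip to the host.
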
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZaxpy_v2 (cublasHandle_t handle, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, + int incx, + cuDoubleComplex *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasScopy_v2 (cublasHandle_t handle, + int n, + const float *x, + int incx, + float *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDcopy_v2 (cublasHandle_t handle, + int n, + const double *x, + int incx, + double *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCcopy_v2 (cublasHandle_t handle, + int n, + const cuComplex *x, + int incx, + cuComplex *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZcopy_v2 (cublasHandle_t handle, + int n, + const cuDoubleComplex *x, + int incx, + cuDoubleComplex *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSswap_v2 (cublasHandle_t handle, + int n, + float *x, + int incx, + float *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDswap_v2 (cublasHandle_t handle, + int n, + double *x, + int incx, + double *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCswap_v2 (cublasHandle_t handle, + int n, + cuComplex *x, + int incx, + cuComplex *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZswap_v2 (cublasHandle_t handle, + int n, + cuDoubleComplex *x, + int incx, + cuDoubleComplex *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIsamax_v2(cublasHandle_t handle, + int n, + const float *x, + int incx, + int *result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIdamax_v2(cublasHandle_t handle, + int n, + const double *x, + int incx, + int *result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIcamax_v2(cublasHandle_t handle, + int n, + const cuComplex *x, + int incx, + int *result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIzamax_v2(cublasHandle_t handle, + int n, + const cuDoubleComplex *x, + int incx, + int *result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIsamin_v2(cublasHandle_t handle, + int n, + const float *x, + int incx, + int *result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIdamin_v2(cublasHandle_t handle, + int n, + const double *x, + int incx, + int *result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIcamin_v2(cublasHandle_t handle, + int n, + const cuComplex *x, + int incx, + int *result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIzamin_v2(cublasHandle_t handle, + int n, + const cuDoubleComplex *x, + int incx, + int *result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSasum_v2(cublasHandle_t handle, + int n, + const float *x, + int incx, + float *result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDasum_v2(cublasHandle_t handle, + int n, + const double *x, + int incx, + double *result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasScasum_v2(cublasHandle_t handle, + int n, + const cuComplex *x, + int incx, + float *result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDzasum_v2(cublasHandle_t handle, + int n, + const cuDoubleComplex *x, + int incx, + double *result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSrot_v2 (cublasHandle_t handle, + int n, + float *x, + int incx, + float 
*y, + int incy, + const float *c, /* host or device pointer */ + const float *s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDrot_v2 (cublasHandle_t handle, + int n, + double *x, + int incx, + double *y, + int incy, + const double *c, /* host or device pointer */ + const double *s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCrot_v2 (cublasHandle_t handle, + int n, + cuComplex *x, + int incx, + cuComplex *y, + int incy, + const float *c, /* host or device pointer */ + const cuComplex *s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsrot_v2(cublasHandle_t handle, + int n, + cuComplex *x, + int incx, + cuComplex *y, + int incy, + const float *c, /* host or device pointer */ + const float *s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZrot_v2 (cublasHandle_t handle, + int n, + cuDoubleComplex *x, + int incx, + cuDoubleComplex *y, + int incy, + const double *c, /* host or device pointer */ + const cuDoubleComplex *s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdrot_v2(cublasHandle_t handle, + int n, + cuDoubleComplex *x, + int incx, + cuDoubleComplex *y, + int incy, + const double *c, /* host or device pointer */ + const double *s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSrotg_v2(cublasHandle_t handle, + float *a, /* host or device pointer */ + float *b, /* host or device pointer */ + float *c, /* host or device pointer */ + float *s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDrotg_v2(cublasHandle_t handle, + double *a, /* host or device pointer */ + double *b, /* host or device pointer */ + double *c, /* host or device pointer */ + double *s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCrotg_v2(cublasHandle_t handle, + cuComplex *a, /* host or device pointer */ + cuComplex *b, /* host or device pointer */ + float *c, /* host or device pointer */ + cuComplex *s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZrotg_v2(cublasHandle_t handle, + cuDoubleComplex *a, /* host or device pointer */ + cuDoubleComplex *b, /* host or device pointer */ + double *c, /* host or device pointer */ + cuDoubleComplex *s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSrotm_v2(cublasHandle_t handle, + int n, + float *x, + int incx, + float *y, + int incy, + const float* param); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDrotm_v2(cublasHandle_t handle, + int n, + double *x, + int incx, + double *y, + int incy, + const double* param); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSrotmg_v2(cublasHandle_t handle, + float *d1, /* host or device pointer */ + float *d2, /* host or device pointer */ + float *x1, /* host or device pointer */ + const float *y1, /* host or device pointer */ + float *param); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDrotmg_v2(cublasHandle_t handle, + double *d1, /* host or device pointer */ + double *d2, /* host or device pointer */ + double *x1, /* host or device pointer */ + const double *y1, /* host or device pointer */ + double *param); /* host or device pointer */ + +/* --------------- CUBLAS BLAS2 functions ---------------- */ + +/* GEMV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemv_v2 (cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + 
const float *alpha, /* host or device pointer */ + const float *A, + int lda, + const float *x, + int incx, + const float *beta, /* host or device pointer */ + float *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemv_v2 (cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + const double *x, + int incx, + const double *beta, /* host or device pointer */ + double *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemv_v2 (cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *x, + int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemv_v2 (cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *x, + int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, + int incy); +/* GBMV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgbmv_v2 (cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int kl, + int ku, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + const float *x, + int incx, + const float *beta, /* host or device pointer */ + float *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgbmv_v2 (cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int kl, + int ku, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + const double *x, + int incx, + const double *beta, /* host or device pointer */ + double *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgbmv_v2 (cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int kl, + int ku, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *x, + int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgbmv_v2 (cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int kl, + int ku, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *x, + int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, + int incy); + +/* TRMV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const float *A, + int lda, + float *x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const double *A, + int lda, + double *x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuComplex *A, + int lda, + cuComplex *x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuDoubleComplex *A, + int lda, + cuDoubleComplex *x, + int incx); + +/* TBMV */ +CUBLASAPI cublasStatus_t 
CUBLASWINAPI cublasStbmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const float *A, + int lda, + float *x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtbmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const double *A, + int lda, + double *x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtbmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const cuComplex *A, + int lda, + cuComplex *x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtbmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const cuDoubleComplex *A, + int lda, + cuDoubleComplex *x, + int incx); + +/* TPMV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStpmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const float *AP, + float *x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtpmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const double *AP, + double *x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtpmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuComplex *AP, + cuComplex *x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtpmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuDoubleComplex *AP, + cuDoubleComplex *x, + int incx); + +/* TRSV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrsv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const float *A, + int lda, + float *x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrsv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const double *A, + int lda, + double *x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrsv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuComplex *A, + int lda, + cuComplex *x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrsv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuDoubleComplex *A, + int lda, + cuDoubleComplex *x, + int incx); + +/* TPSV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStpsv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const float *AP, + float *x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtpsv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const double *AP, + double *x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtpsv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuComplex *AP, + cuComplex *x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtpsv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const 
cuDoubleComplex *AP, + cuDoubleComplex *x, + int incx); +/* TBSV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStbsv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const float *A, + int lda, + float *x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtbsv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const double *A, + int lda, + double *x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtbsv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const cuComplex *A, + int lda, + cuComplex *x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtbsv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const cuDoubleComplex *A, + int lda, + cuDoubleComplex *x, + int incx); + +/* SYMV/HEMV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsymv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + const float *x, + int incx, + const float *beta, /* host or device pointer */ + float *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsymv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + const double *x, + int incx, + const double *beta, /* host or device pointer */ + double *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsymv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *x, + int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsymv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *x, + int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChemv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *x, + int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhemv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *x, + int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, + int incy); + +/* SBMV/HBMV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsbmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + int k, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + const float *x, + int incx, + const float *beta, /* host or device pointer */ + float *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsbmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + int k, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + const double *x, + int incx, + const double *beta, /* 
host or device pointer */ + double *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChbmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + int k, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *x, + int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhbmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *x, + int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, + int incy); + +/* SPMV/HPMV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSspmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float *alpha, /* host or device pointer */ + const float *AP, + const float *x, + int incx, + const float *beta, /* host or device pointer */ + float *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDspmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double *alpha, /* host or device pointer */ + const double *AP, + const double *x, + int incx, + const double *beta, /* host or device pointer */ + double *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChpmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *AP, + const cuComplex *x, + int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhpmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *AP, + const cuDoubleComplex *x, + int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, + int incy); + +/* GER */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSger_v2 (cublasHandle_t handle, + int m, + int n, + const float *alpha, /* host or device pointer */ + const float *x, + int incx, + const float *y, + int incy, + float *A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDger_v2 (cublasHandle_t handle, + int m, + int n, + const double *alpha, /* host or device pointer */ + const double *x, + int incx, + const double *y, + int incy, + double *A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgeru_v2 (cublasHandle_t handle, + int m, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, + int incx, + const cuComplex *y, + int incy, + cuComplex *A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgerc_v2 (cublasHandle_t handle, + int m, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, + int incx, + const cuComplex *y, + int incy, + cuComplex *A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgeru_v2 (cublasHandle_t handle, + int m, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, + int incx, + const cuDoubleComplex *y, + int incy, + cuDoubleComplex *A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgerc_v2 (cublasHandle_t handle, + int m, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, + int incx, + const cuDoubleComplex *y, + int incy, + cuDoubleComplex *A, + int lda); + +/* SYR/HER */ +CUBLASAPI cublasStatus_t 
CUBLASWINAPI cublasSsyr_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float *alpha, /* host or device pointer */ + const float *x, + int incx, + float *A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyr_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double *alpha, /* host or device pointer */ + const double *x, + int incx, + double *A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyr_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, + int incx, + cuComplex *A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyr_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, + int incx, + cuDoubleComplex *A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCher_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float *alpha, /* host or device pointer */ + const cuComplex *x, + int incx, + cuComplex *A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZher_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double *alpha, /* host or device pointer */ + const cuDoubleComplex *x, + int incx, + cuDoubleComplex *A, + int lda); + +/* SPR/HPR */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSspr_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float *alpha, /* host or device pointer */ + const float *x, + int incx, + float *AP); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDspr_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double *alpha, /* host or device pointer */ + const double *x, + int incx, + double *AP); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChpr_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float *alpha, /* host or device pointer */ + const cuComplex *x, + int incx, + cuComplex *AP); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhpr_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double *alpha, /* host or device pointer */ + const cuDoubleComplex *x, + int incx, + cuDoubleComplex *AP); + +/* SYR2/HER2 */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyr2_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float *alpha, /* host or device pointer */ + const float *x, + int incx, + const float *y, + int incy, + float *A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyr2_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double *alpha, /* host or device pointer */ + const double *x, + int incx, + const double *y, + int incy, + double *A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyr2_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, + int incx, + const cuComplex *y, + int incy, + cuComplex *A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyr2_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, + int incx, + const cuDoubleComplex *y, + int incy, + cuDoubleComplex *A, + int lda); + + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCher2_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, + int incx, + const cuComplex *y, + int incy, + cuComplex 
*A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZher2_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, + int incx, + const cuDoubleComplex *y, + int incy, + cuDoubleComplex *A, + int lda); + +/* SPR2/HPR2 */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSspr2_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float *alpha, /* host or device pointer */ + const float *x, + int incx, + const float *y, + int incy, + float *AP); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDspr2_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double *alpha, /* host or device pointer */ + const double *x, + int incx, + const double *y, + int incy, + double *AP); + + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChpr2_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, + int incx, + const cuComplex *y, + int incy, + cuComplex *AP); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhpr2_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, + int incx, + const cuDoubleComplex *y, + int incy, + cuDoubleComplex *AP); + +/* ---------------- CUBLAS BLAS3 functions ---------------- */ + +/* GEMM */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemm_v2 (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + const float *B, + int ldb, + const float *beta, /* host or device pointer */ + float *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemm_v2 (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + const double *B, + int ldb, + const double *beta, /* host or device pointer */ + double *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm_v2 (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *B, + int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3m (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *B, + int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, + int ldc); + CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3mEx (cublasHandle_t handle, + cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + const cuComplex *alpha, + const void *A, + cudaDataType Atype, + int lda, + const void *B, + cudaDataType Btype, + int ldb, + const cuComplex *beta, + void *C, + cudaDataType Ctype, + int ldc); + + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemm_v2 (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *B, + int ldb, + const cuDoubleComplex *beta, /* host or device 
pointer */ + cuDoubleComplex *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemm3m (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *B, + int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, + int ldc); + +#if defined(__cplusplus) +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHgemm (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const __half *alpha, /* host or device pointer */ + const __half *A, + int lda, + const __half *B, + int ldb, + const __half *beta, /* host or device pointer */ + __half *C, + int ldc); +#endif +/* IO in FP16/FP32, computation in float */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemmEx (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float *alpha, /* host or device pointer */ + const void *A, + cudaDataType Atype, + int lda, + const void *B, + cudaDataType Btype, + int ldb, + const float *beta, /* host or device pointer */ + void *C, + cudaDataType Ctype, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGemmEx (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const void *alpha, /* host or device pointer */ + const void *A, + cudaDataType Atype, + int lda, + const void *B, + cudaDataType Btype, + int ldb, + const void *beta, /* host or device pointer */ + void *C, + cudaDataType Ctype, + int ldc, + cudaDataType computeType, + cublasGemmAlgo_t algo); + +/* IO in Int8 complex/cuComplex, computation in cuComplex */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemmEx (cublasHandle_t handle, + cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + const cuComplex *alpha, + const void *A, + cudaDataType Atype, + int lda, + const void *B, + cudaDataType Btype, + int ldb, + const cuComplex *beta, + void *C, + cudaDataType Ctype, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasUint8gemmBias (cublasHandle_t handle, + cublasOperation_t transa, cublasOperation_t transb, cublasOperation_t transc, + int m, int n, int k, + const unsigned char *A, int A_bias, int lda, + const unsigned char *B, int B_bias, int ldb, + unsigned char *C, int C_bias, int ldc, + int C_mult, int C_shift); + +/* SYRK */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyrk_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + const float *beta, /* host or device pointer */ + float *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyrk_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + const double *beta, /* host or device pointer */ + double *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrk_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyrk_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + 
cublasOperation_t trans, + int n, + int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, + int ldc); +/* IO in Int8 complex/cuComplex, computation in cuComplex */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrkEx ( cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex *alpha, /* host or device pointer */ + const void *A, + cudaDataType Atype, + int lda, + const cuComplex *beta, /* host or device pointer */ + void *C, + cudaDataType Ctype, + int ldc); + +/* IO in Int8 complex/cuComplex, computation in cuComplex, Gaussian math */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrk3mEx(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex *alpha, + const void *A, + cudaDataType Atype, + int lda, + const cuComplex *beta, + void *C, + cudaDataType Ctype, + int ldc); + +/* HERK */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherk_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const float *beta, /* host or device pointer */ + cuComplex *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZherk_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const double *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const double *beta, /* host or device pointer */ + cuDoubleComplex *C, + int ldc); + +/* IO in Int8 complex/cuComplex, computation in cuComplex */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherkEx (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float *alpha, /* host or device pointer */ + const void *A, + cudaDataType Atype, + int lda, + const float *beta, /* host or device pointer */ + void *C, + cudaDataType Ctype, + int ldc); + +/* IO in Int8 complex/cuComplex, computation in cuComplex, Gaussian math */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherk3mEx (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float *alpha, + const void *A, cudaDataType Atype, + int lda, + const float *beta, + void *C, + cudaDataType Ctype, + int ldc); + + + +/* SYR2K */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyr2k_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + const float *B, + int ldb, + const float *beta, /* host or device pointer */ + float *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyr2k_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + const double *B, + int ldb, + const double *beta, /* host or device pointer */ + double *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyr2k_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *B, + int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyr2k_v2 
(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *B, + int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, + int ldc); +/* HER2K */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCher2k_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *B, + int ldb, + const float *beta, /* host or device pointer */ + cuComplex *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZher2k_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *B, + int ldb, + const double *beta, /* host or device pointer */ + cuDoubleComplex *C, + int ldc); +/* SYRKX : eXtended SYRK*/ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyrkx (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + const float *B, + int ldb, + const float *beta, /* host or device pointer */ + float *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyrkx (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + const double *B, + int ldb, + const double *beta, /* host or device pointer */ + double *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrkx (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *B, + int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyrkx (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *B, + int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, + int ldc); +/* HERKX : eXtended HERK */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherkx (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *B, + int ldb, + const float *beta, /* host or device pointer */ + cuComplex *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZherkx (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *B, + int ldb, + const double *beta, /* host or device pointer */ + cuDoubleComplex *C, + int ldc); +/* SYMM */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsymm_v2 (cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + const float *B, + int ldb, + const float *beta, /* host or device pointer */ + 
float *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsymm_v2 (cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + const double *B, + int ldb, + const double *beta, /* host or device pointer */ + double *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsymm_v2 (cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *B, + int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsymm_v2 (cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *B, + int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, + int ldc); + +/* HEMM */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChemm_v2 (cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *B, + int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhemm_v2 (cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *B, + int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, + int ldc); + +/* TRSM */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrsm_v2 (cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + float *B, + int ldb); + + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrsm_v2 (cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + double *B, + int ldb); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrsm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + cuComplex *B, + int ldb); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrsm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + cuDoubleComplex *B, + int ldb); + + /* TRMM */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrmm_v2 (cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + const float *B, + int ldb, + float *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrmm_v2 (cublasHandle_t handle, + cublasSideMode_t 
side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + const double *B, + int ldb, + double *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrmm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *B, + int ldb, + cuComplex *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrmm_v2(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *B, + int ldb, + cuDoubleComplex *C, + int ldc); +/* BATCH GEMM */ +#if defined(__cplusplus) +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHgemmBatched (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const __half *alpha, /* host or device pointer */ + const __half *Aarray[], + int lda, + const __half *Barray[], + int ldb, + const __half *beta, /* host or device pointer */ + __half *Carray[], + int ldc, + int batchCount); +#endif +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemmBatched (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float *alpha, /* host or device pointer */ + const float *Aarray[], + int lda, + const float *Barray[], + int ldb, + const float *beta, /* host or device pointer */ + float *Carray[], + int ldc, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemmBatched (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const double *alpha, /* host or device pointer */ + const double *Aarray[], + int lda, + const double *Barray[], + int ldb, + const double *beta, /* host or device pointer */ + double *Carray[], + int ldc, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemmBatched (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *Aarray[], + int lda, + const cuComplex *Barray[], + int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *Carray[], + int ldc, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3mBatched (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *Aarray[], + int lda, + const cuComplex *Barray[], + int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *Carray[], + int ldc, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemmBatched (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *Aarray[], + int lda, + const cuDoubleComplex *Barray[], + int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *Carray[], + int ldc, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemmStridedBatched (cublasHandle_t handle, + cublasOperation_t 
transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + long long int strideA, /* purposely signed */ + const float *B, + int ldb, + long long int strideB, + const float *beta, /* host or device pointer */ + float *C, + int ldc, + long long int strideC, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemmStridedBatched (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + long long int strideA, /* purposely signed */ + const double *B, + int ldb, + long long int strideB, + const double *beta, /* host or device pointer */ + double *C, + int ldc, + long long int strideC, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemmStridedBatched (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + long long int strideA, /* purposely signed */ + const cuComplex *B, + int ldb, + long long int strideB, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, + int ldc, + long long int strideC, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3mStridedBatched (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + long long int strideA, /* purposely signed */ + const cuComplex *B, + int ldb, + long long int strideB, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, + int ldc, + long long int strideC, + int batchCount); + + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemmStridedBatched (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + long long int strideA, /* purposely signed */ + const cuDoubleComplex *B, + int ldb, + long long int strideB, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, + int ldc, + long long int strideC, + int batchCount); + +#if defined(__cplusplus) +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHgemmStridedBatched (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const __half *alpha, /* host or device pointer */ + const __half *A, + int lda, + long long int strideA, /* purposely signed */ + const __half *B, + int ldb, + long long int strideB, + const __half *beta, /* host or device pointer */ + __half *C, + int ldc, + long long int strideC, + int batchCount); +#endif +/* ---------------- CUBLAS BLAS-like extension ---------------- */ +/* GEAM */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgeam(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + const float *beta, /* host or device pointer */ + const float *B, + int ldb, + float *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgeam(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + const double *beta, /* host or device pointer 
*/ + const double *B, + int ldb, + double *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgeam(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *beta, /* host or device pointer */ + const cuComplex *B, + int ldb, + cuComplex *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgeam(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *beta, /* host or device pointer */ + const cuDoubleComplex *B, + int ldb, + cuDoubleComplex *C, + int ldc); + +/* Batched LU - GETRF*/ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgetrfBatched(cublasHandle_t handle, + int n, + float *A[], /*Device pointer*/ + int lda, + int *P, /*Device Pointer*/ + int *info, /*Device Pointer*/ + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgetrfBatched(cublasHandle_t handle, + int n, + double *A[], /*Device pointer*/ + int lda, + int *P, /*Device Pointer*/ + int *info, /*Device Pointer*/ + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgetrfBatched(cublasHandle_t handle, + int n, + cuComplex *A[], /*Device pointer*/ + int lda, + int *P, /*Device Pointer*/ + int *info, /*Device Pointer*/ + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgetrfBatched(cublasHandle_t handle, + int n, + cuDoubleComplex *A[], /*Device pointer*/ + int lda, + int *P, /*Device Pointer*/ + int *info, /*Device Pointer*/ + int batchSize); + +/* Batched inversion based on LU factorization from getrf */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgetriBatched(cublasHandle_t handle, + int n, + const float *A[], /*Device pointer*/ + int lda, + const int *P, /*Device pointer*/ + float *C[], /*Device pointer*/ + int ldc, + int *info, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgetriBatched(cublasHandle_t handle, + int n, + const double *A[], /*Device pointer*/ + int lda, + const int *P, /*Device pointer*/ + double *C[], /*Device pointer*/ + int ldc, + int *info, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgetriBatched(cublasHandle_t handle, + int n, + const cuComplex *A[], /*Device pointer*/ + int lda, + const int *P, /*Device pointer*/ + cuComplex *C[], /*Device pointer*/ + int ldc, + int *info, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgetriBatched(cublasHandle_t handle, + int n, + const cuDoubleComplex *A[], /*Device pointer*/ + int lda, + const int *P, /*Device pointer*/ + cuDoubleComplex *C[], /*Device pointer*/ + int ldc, + int *info, + int batchSize); + +/* Batched solver based on LU factorization from getrf */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgetrsBatched( cublasHandle_t handle, + cublasOperation_t trans, + int n, + int nrhs, + const float *Aarray[], + int lda, + const int *devIpiv, + float *Barray[], + int ldb, + int *info, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgetrsBatched( cublasHandle_t handle, + cublasOperation_t trans, + int n, + int nrhs, + const double *Aarray[], + int lda, + const int *devIpiv, + double *Barray[], + int ldb, + int *info, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgetrsBatched( cublasHandle_t handle, + cublasOperation_t trans, + int n, + int nrhs, + const cuComplex *Aarray[], + int lda, + const int 
*devIpiv, + cuComplex *Barray[], + int ldb, + int *info, + int batchSize); + + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgetrsBatched( cublasHandle_t handle, + cublasOperation_t trans, + int n, + int nrhs, + const cuDoubleComplex *Aarray[], + int lda, + const int *devIpiv, + cuDoubleComplex *Barray[], + int ldb, + int *info, + int batchSize); + + + +/* TRSM - Batched Triangular Solver */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrsmBatched( cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const float *alpha, /*Host or Device Pointer*/ + const float *A[], + int lda, + float *B[], + int ldb, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrsmBatched( cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const double *alpha, /*Host or Device Pointer*/ + const double *A[], + int lda, + double *B[], + int ldb, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrsmBatched( cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const cuComplex *alpha, /*Host or Device Pointer*/ + const cuComplex *A[], + int lda, + cuComplex *B[], + int ldb, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrsmBatched( cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const cuDoubleComplex *alpha, /*Host or Device Pointer*/ + const cuDoubleComplex *A[], + int lda, + cuDoubleComplex *B[], + int ldb, + int batchCount); + +/* Batched - MATINV*/ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSmatinvBatched(cublasHandle_t handle, + int n, + const float *A[], /*Device pointer*/ + int lda, + float *Ainv[], /*Device pointer*/ + int lda_inv, + int *info, /*Device Pointer*/ + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDmatinvBatched(cublasHandle_t handle, + int n, + const double *A[], /*Device pointer*/ + int lda, + double *Ainv[], /*Device pointer*/ + int lda_inv, + int *info, /*Device Pointer*/ + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCmatinvBatched(cublasHandle_t handle, + int n, + const cuComplex *A[], /*Device pointer*/ + int lda, + cuComplex *Ainv[], /*Device pointer*/ + int lda_inv, + int *info, /*Device Pointer*/ + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZmatinvBatched(cublasHandle_t handle, + int n, + const cuDoubleComplex *A[], /*Device pointer*/ + int lda, + cuDoubleComplex *Ainv[], /*Device pointer*/ + int lda_inv, + int *info, /*Device Pointer*/ + int batchSize); + +/* Batch QR Factorization */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgeqrfBatched( cublasHandle_t handle, + int m, + int n, + float *Aarray[], /*Device pointer*/ + int lda, + float *TauArray[], /* Device pointer*/ + int *info, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgeqrfBatched( cublasHandle_t handle, + int m, + int n, + double *Aarray[], /*Device pointer*/ + int lda, + double *TauArray[], /* Device pointer*/ + int *info, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgeqrfBatched( cublasHandle_t handle, + int m, + int n, + cuComplex *Aarray[], /*Device pointer*/ + int lda, + cuComplex *TauArray[], /* Device pointer*/ + int *info, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI 
cublasZgeqrfBatched( cublasHandle_t handle, + int m, + int n, + cuDoubleComplex *Aarray[], /*Device pointer*/ + int lda, + cuDoubleComplex *TauArray[], /* Device pointer*/ + int *info, + int batchSize); +/* Least Square Min only m >= n and Non-transpose supported */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgelsBatched( cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int nrhs, + float *Aarray[], /*Device pointer*/ + int lda, + float *Carray[], /* Device pointer*/ + int ldc, + int *info, + int *devInfoArray, /* Device pointer*/ + int batchSize ); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgelsBatched( cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int nrhs, + double *Aarray[], /*Device pointer*/ + int lda, + double *Carray[], /* Device pointer*/ + int ldc, + int *info, + int *devInfoArray, /* Device pointer*/ + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgelsBatched( cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int nrhs, + cuComplex *Aarray[], /*Device pointer*/ + int lda, + cuComplex *Carray[], /* Device pointer*/ + int ldc, + int *info, + int *devInfoArray, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgelsBatched( cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int nrhs, + cuDoubleComplex *Aarray[], /*Device pointer*/ + int lda, + cuDoubleComplex *Carray[], /* Device pointer*/ + int ldc, + int *info, + int *devInfoArray, + int batchSize); +/* DGMM */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSdgmm(cublasHandle_t handle, + cublasSideMode_t mode, + int m, + int n, + const float *A, + int lda, + const float *x, + int incx, + float *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDdgmm(cublasHandle_t handle, + cublasSideMode_t mode, + int m, + int n, + const double *A, + int lda, + const double *x, + int incx, + double *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCdgmm(cublasHandle_t handle, + cublasSideMode_t mode, + int m, + int n, + const cuComplex *A, + int lda, + const cuComplex *x, + int incx, + cuComplex *C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdgmm(cublasHandle_t handle, + cublasSideMode_t mode, + int m, + int n, + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *x, + int incx, + cuDoubleComplex *C, + int ldc); + +/* TPTTR : Triangular Pack format to Triangular format */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStpttr ( cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float *AP, + float *A, + int lda ); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtpttr ( cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double *AP, + double *A, + int lda ); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtpttr ( cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex *AP, + cuComplex *A, + int lda ); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtpttr ( cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex *AP, + cuDoubleComplex *A, + int lda ); + /* TRTTP : Triangular format to Triangular Pack format */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrttp ( cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float *A, + int lda, + float *AP ); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrttp ( cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double *A, + int lda, + double *AP ); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrttp ( cublasHandle_t handle, + 
cublasFillMode_t uplo, + int n, + const cuComplex *A, + int lda, + cuComplex *AP ); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrttp ( cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex *A, + int lda, + cuDoubleComplex *AP ); + +#if defined(__cplusplus) +} +#endif /* __cplusplus */ + +#endif /* !defined(CUBLAS_API_H_) */ diff --git a/include/external/CUDA/cublas_v2.h b/include/external/CUDA/cublas_v2.h new file mode 100644 index 000000000..5b9553a15 --- /dev/null +++ b/include/external/CUDA/cublas_v2.h @@ -0,0 +1,274 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* + * This is the public header file for the new CUBLAS library API; it maps the generic + * cuBLAS function names to the actual _v2 implementations. 
+ */ + +#if !defined(CUBLAS_V2_H_) +#define CUBLAS_V2_H_ + +#undef CUBLASAPI +#ifdef __CUDACC__ +#define CUBLASAPI __host__ __device__ +#else +#define CUBLASAPI +#endif + +#include "cublas_api.h" + +#define cublasCreate cublasCreate_v2 +#define cublasDestroy cublasDestroy_v2 +#define cublasGetVersion cublasGetVersion_v2 +#define cublasSetStream cublasSetStream_v2 +#define cublasGetStream cublasGetStream_v2 +#define cublasGetPointerMode cublasGetPointerMode_v2 +#define cublasSetPointerMode cublasSetPointerMode_v2 + +/* Blas1 Routines */ + +#define cublasSnrm2 cublasSnrm2_v2 +#define cublasDnrm2 cublasDnrm2_v2 +#define cublasScnrm2 cublasScnrm2_v2 +#define cublasDznrm2 cublasDznrm2_v2 + +#define cublasSdot cublasSdot_v2 +#define cublasDdot cublasDdot_v2 +#define cublasCdotu cublasCdotu_v2 +#define cublasCdotc cublasCdotc_v2 +#define cublasZdotu cublasZdotu_v2 +#define cublasZdotc cublasZdotc_v2 + +#define cublasSscal cublasSscal_v2 +#define cublasDscal cublasDscal_v2 +#define cublasCscal cublasCscal_v2 +#define cublasCsscal cublasCsscal_v2 +#define cublasZscal cublasZscal_v2 +#define cublasZdscal cublasZdscal_v2 + +#define cublasSaxpy cublasSaxpy_v2 +#define cublasDaxpy cublasDaxpy_v2 +#define cublasCaxpy cublasCaxpy_v2 +#define cublasZaxpy cublasZaxpy_v2 + +#define cublasScopy cublasScopy_v2 +#define cublasDcopy cublasDcopy_v2 +#define cublasCcopy cublasCcopy_v2 +#define cublasZcopy cublasZcopy_v2 + +#define cublasSswap cublasSswap_v2 +#define cublasDswap cublasDswap_v2 +#define cublasCswap cublasCswap_v2 +#define cublasZswap cublasZswap_v2 + +#define cublasIsamax cublasIsamax_v2 +#define cublasIdamax cublasIdamax_v2 +#define cublasIcamax cublasIcamax_v2 +#define cublasIzamax cublasIzamax_v2 + +#define cublasIsamin cublasIsamin_v2 +#define cublasIdamin cublasIdamin_v2 +#define cublasIcamin cublasIcamin_v2 +#define cublasIzamin cublasIzamin_v2 + +#define cublasSasum cublasSasum_v2 +#define cublasDasum cublasDasum_v2 +#define cublasScasum cublasScasum_v2 +#define cublasDzasum cublasDzasum_v2 + +#define cublasSrot cublasSrot_v2 +#define cublasDrot cublasDrot_v2 +#define cublasCrot cublasCrot_v2 +#define cublasCsrot cublasCsrot_v2 +#define cublasZrot cublasZrot_v2 +#define cublasZdrot cublasZdrot_v2 + +#define cublasSrotg cublasSrotg_v2 +#define cublasDrotg cublasDrotg_v2 +#define cublasCrotg cublasCrotg_v2 +#define cublasZrotg cublasZrotg_v2 + +#define cublasSrotm cublasSrotm_v2 +#define cublasDrotm cublasDrotm_v2 + +#define cublasSrotmg cublasSrotmg_v2 +#define cublasDrotmg cublasDrotmg_v2 + + +/* Blas2 Routines */ + +#define cublasSgemv cublasSgemv_v2 +#define cublasDgemv cublasDgemv_v2 +#define cublasCgemv cublasCgemv_v2 +#define cublasZgemv cublasZgemv_v2 + +#define cublasSgbmv cublasSgbmv_v2 +#define cublasDgbmv cublasDgbmv_v2 +#define cublasCgbmv cublasCgbmv_v2 +#define cublasZgbmv cublasZgbmv_v2 + +#define cublasStrmv cublasStrmv_v2 +#define cublasDtrmv cublasDtrmv_v2 +#define cublasCtrmv cublasCtrmv_v2 +#define cublasZtrmv cublasZtrmv_v2 + +#define cublasStbmv cublasStbmv_v2 +#define cublasDtbmv cublasDtbmv_v2 +#define cublasCtbmv cublasCtbmv_v2 +#define cublasZtbmv cublasZtbmv_v2 + +#define cublasStpmv cublasStpmv_v2 +#define cublasDtpmv cublasDtpmv_v2 +#define cublasCtpmv cublasCtpmv_v2 +#define cublasZtpmv cublasZtpmv_v2 + +#define cublasStrsv cublasStrsv_v2 +#define cublasDtrsv cublasDtrsv_v2 +#define cublasCtrsv cublasCtrsv_v2 +#define cublasZtrsv cublasZtrsv_v2 + +#define cublasStpsv cublasStpsv_v2 +#define cublasDtpsv cublasDtpsv_v2 +#define cublasCtpsv cublasCtpsv_v2 +#define 
cublasZtpsv cublasZtpsv_v2 + +#define cublasStbsv cublasStbsv_v2 +#define cublasDtbsv cublasDtbsv_v2 +#define cublasCtbsv cublasCtbsv_v2 +#define cublasZtbsv cublasZtbsv_v2 + +#define cublasSsymv cublasSsymv_v2 +#define cublasDsymv cublasDsymv_v2 +#define cublasCsymv cublasCsymv_v2 +#define cublasZsymv cublasZsymv_v2 +#define cublasChemv cublasChemv_v2 +#define cublasZhemv cublasZhemv_v2 + +#define cublasSsbmv cublasSsbmv_v2 +#define cublasDsbmv cublasDsbmv_v2 +#define cublasChbmv cublasChbmv_v2 +#define cublasZhbmv cublasZhbmv_v2 + +#define cublasSspmv cublasSspmv_v2 +#define cublasDspmv cublasDspmv_v2 +#define cublasChpmv cublasChpmv_v2 +#define cublasZhpmv cublasZhpmv_v2 + + +#define cublasSger cublasSger_v2 +#define cublasDger cublasDger_v2 +#define cublasCgeru cublasCgeru_v2 +#define cublasCgerc cublasCgerc_v2 +#define cublasZgeru cublasZgeru_v2 +#define cublasZgerc cublasZgerc_v2 + +#define cublasSsyr cublasSsyr_v2 +#define cublasDsyr cublasDsyr_v2 +#define cublasCsyr cublasCsyr_v2 +#define cublasZsyr cublasZsyr_v2 +#define cublasCher cublasCher_v2 +#define cublasZher cublasZher_v2 + +#define cublasSspr cublasSspr_v2 +#define cublasDspr cublasDspr_v2 +#define cublasChpr cublasChpr_v2 +#define cublasZhpr cublasZhpr_v2 + +#define cublasSsyr2 cublasSsyr2_v2 +#define cublasDsyr2 cublasDsyr2_v2 +#define cublasCsyr2 cublasCsyr2_v2 +#define cublasZsyr2 cublasZsyr2_v2 +#define cublasCher2 cublasCher2_v2 +#define cublasZher2 cublasZher2_v2 + +#define cublasSspr2 cublasSspr2_v2 +#define cublasDspr2 cublasDspr2_v2 +#define cublasChpr2 cublasChpr2_v2 +#define cublasZhpr2 cublasZhpr2_v2 + +/* Blas3 Routines */ + +#define cublasSgemm cublasSgemm_v2 +#define cublasDgemm cublasDgemm_v2 +#define cublasCgemm cublasCgemm_v2 +#define cublasZgemm cublasZgemm_v2 + +#define cublasSsyrk cublasSsyrk_v2 +#define cublasDsyrk cublasDsyrk_v2 +#define cublasCsyrk cublasCsyrk_v2 +#define cublasZsyrk cublasZsyrk_v2 +#define cublasCherk cublasCherk_v2 +#define cublasZherk cublasZherk_v2 + +#define cublasSsyr2k cublasSsyr2k_v2 +#define cublasDsyr2k cublasDsyr2k_v2 +#define cublasCsyr2k cublasCsyr2k_v2 +#define cublasZsyr2k cublasZsyr2k_v2 +#define cublasCher2k cublasCher2k_v2 +#define cublasZher2k cublasZher2k_v2 + +#define cublasSsymm cublasSsymm_v2 +#define cublasDsymm cublasDsymm_v2 +#define cublasCsymm cublasCsymm_v2 +#define cublasZsymm cublasZsymm_v2 +#define cublasChemm cublasChemm_v2 +#define cublasZhemm cublasZhemm_v2 + +#define cublasStrsm cublasStrsm_v2 +#define cublasDtrsm cublasDtrsm_v2 +#define cublasCtrsm cublasCtrsm_v2 +#define cublasZtrsm cublasZtrsm_v2 + +#define cublasStrmm cublasStrmm_v2 +#define cublasDtrmm cublasDtrmm_v2 +#define cublasCtrmm cublasCtrmm_v2 +#define cublasZtrmm cublasZtrmm_v2 + +#endif /* !defined(CUBLAS_V2_H_) */ diff --git a/include/external/CUDA/cuda.h b/include/external/CUDA/cuda.h new file mode 100755 index 000000000..175b31703 --- /dev/null +++ b/include/external/CUDA/cuda.h @@ -0,0 +1,12185 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. 
Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */
+
+#ifndef __cuda_cuda_h__
+#define __cuda_cuda_h__
+
+#include <stdlib.h>
+#ifdef _MSC_VER
+typedef unsigned __int32 cuuint32_t;
+typedef unsigned __int64 cuuint64_t;
+#else
+#include <stdint.h>
+typedef uint32_t cuuint32_t;
+typedef uint64_t cuuint64_t;
+#endif
+
+/**
+ * CUDA API versioning support
+ */
+#if defined(CUDA_FORCE_API_VERSION)
+ #if (CUDA_FORCE_API_VERSION == 3010)
+ #define __CUDA_API_VERSION 3010
+ #else
+ #error "Unsupported value of CUDA_FORCE_API_VERSION"
+ #endif
+#else
+ #define __CUDA_API_VERSION 9000
+#endif /* CUDA_FORCE_API_VERSION */
+
+#if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
+ #define __CUDA_API_PER_THREAD_DEFAULT_STREAM
+ #define __CUDA_API_PTDS(api) api ## _ptds
+ #define __CUDA_API_PTSZ(api) api ## _ptsz
+#else
+ #define __CUDA_API_PTDS(api) api
+ #define __CUDA_API_PTSZ(api) api
+#endif
+
+#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 3020
+ #define cuDeviceTotalMem cuDeviceTotalMem_v2
+ #define cuCtxCreate cuCtxCreate_v2
+ #define cuModuleGetGlobal cuModuleGetGlobal_v2
+ #define cuMemGetInfo cuMemGetInfo_v2
+ #define cuMemAlloc cuMemAlloc_v2
+ #define cuMemAllocPitch cuMemAllocPitch_v2
+ #define cuMemFree cuMemFree_v2
+ #define cuMemGetAddressRange cuMemGetAddressRange_v2
+ #define cuMemAllocHost cuMemAllocHost_v2
+ #define cuMemHostGetDevicePointer cuMemHostGetDevicePointer_v2
+ #define cuMemcpyHtoD __CUDA_API_PTDS(cuMemcpyHtoD_v2)
+ #define cuMemcpyDtoH __CUDA_API_PTDS(cuMemcpyDtoH_v2)
+ #define cuMemcpyDtoD __CUDA_API_PTDS(cuMemcpyDtoD_v2)
+ #define cuMemcpyDtoA __CUDA_API_PTDS(cuMemcpyDtoA_v2)
+ #define cuMemcpyAtoD __CUDA_API_PTDS(cuMemcpyAtoD_v2)
+ #define cuMemcpyHtoA __CUDA_API_PTDS(cuMemcpyHtoA_v2)
+ #define cuMemcpyAtoH __CUDA_API_PTDS(cuMemcpyAtoH_v2)
+ #define cuMemcpyAtoA __CUDA_API_PTDS(cuMemcpyAtoA_v2)
+ #define cuMemcpyHtoAAsync __CUDA_API_PTSZ(cuMemcpyHtoAAsync_v2)
+ #define cuMemcpyAtoHAsync __CUDA_API_PTSZ(cuMemcpyAtoHAsync_v2)
+ #define cuMemcpy2D __CUDA_API_PTDS(cuMemcpy2D_v2)
+ #define cuMemcpy2DUnaligned __CUDA_API_PTDS(cuMemcpy2DUnaligned_v2)
+ #define cuMemcpy3D __CUDA_API_PTDS(cuMemcpy3D_v2)
+ #define cuMemcpyHtoDAsync __CUDA_API_PTSZ(cuMemcpyHtoDAsync_v2)
+ #define cuMemcpyDtoHAsync __CUDA_API_PTSZ(cuMemcpyDtoHAsync_v2)
+ #define cuMemcpyDtoDAsync __CUDA_API_PTSZ(cuMemcpyDtoDAsync_v2)
+ #define cuMemcpy2DAsync __CUDA_API_PTSZ(cuMemcpy2DAsync_v2)
+ #define cuMemcpy3DAsync __CUDA_API_PTSZ(cuMemcpy3DAsync_v2)
+ #define cuMemsetD8 __CUDA_API_PTDS(cuMemsetD8_v2)
+ #define cuMemsetD16 __CUDA_API_PTDS(cuMemsetD16_v2)
+ #define cuMemsetD32 __CUDA_API_PTDS(cuMemsetD32_v2)
+ #define cuMemsetD2D8 __CUDA_API_PTDS(cuMemsetD2D8_v2)
+ #define cuMemsetD2D16 __CUDA_API_PTDS(cuMemsetD2D16_v2)
+ #define cuMemsetD2D32 __CUDA_API_PTDS(cuMemsetD2D32_v2)
+ #define cuArrayCreate cuArrayCreate_v2
+ #define cuArrayGetDescriptor cuArrayGetDescriptor_v2
+ #define cuArray3DCreate cuArray3DCreate_v2
+ #define cuArray3DGetDescriptor cuArray3DGetDescriptor_v2
+ #define cuTexRefSetAddress cuTexRefSetAddress_v2
+ #define cuTexRefGetAddress cuTexRefGetAddress_v2
+ #define cuGraphicsResourceGetMappedPointer cuGraphicsResourceGetMappedPointer_v2
+#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 3020 */
+#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 4000
+ #define cuCtxDestroy cuCtxDestroy_v2
+ #define cuCtxPopCurrent cuCtxPopCurrent_v2
+ #define cuCtxPushCurrent cuCtxPushCurrent_v2
+ #define cuStreamDestroy cuStreamDestroy_v2
+ #define cuEventDestroy cuEventDestroy_v2
+#endif /* 
__CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 4000 */ +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 4010 + #define cuTexRefSetAddress2D cuTexRefSetAddress2D_v3 +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 4010 */ +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 6050 + #define cuLinkCreate cuLinkCreate_v2 + #define cuLinkAddData cuLinkAddData_v2 + #define cuLinkAddFile cuLinkAddFile_v2 +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 6050 */ +#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION >= 6050 + #define cuMemHostRegister cuMemHostRegister_v2 + #define cuGraphicsResourceSetMapFlags cuGraphicsResourceSetMapFlags_v2 +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION >= 6050 */ + +#if !defined(__CUDA_API_VERSION_INTERNAL) +#if defined(__CUDA_API_VERSION) && __CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010 + #define cuTexRefSetAddress2D cuTexRefSetAddress2D_v2 +#endif /* __CUDA_API_VERSION && __CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010 */ +#endif /* __CUDA_API_VERSION_INTERNAL */ + +#if defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM) + #define cuMemcpy __CUDA_API_PTDS(cuMemcpy) + #define cuMemcpyAsync __CUDA_API_PTSZ(cuMemcpyAsync) + #define cuMemcpyPeer __CUDA_API_PTDS(cuMemcpyPeer) + #define cuMemcpyPeerAsync __CUDA_API_PTSZ(cuMemcpyPeerAsync) + #define cuMemcpy3DPeer __CUDA_API_PTDS(cuMemcpy3DPeer) + #define cuMemcpy3DPeerAsync __CUDA_API_PTSZ(cuMemcpy3DPeerAsync) + #define cuMemPrefetchAsync __CUDA_API_PTSZ(cuMemPrefetchAsync) + + #define cuMemsetD8Async __CUDA_API_PTSZ(cuMemsetD8Async) + #define cuMemsetD16Async __CUDA_API_PTSZ(cuMemsetD16Async) + #define cuMemsetD32Async __CUDA_API_PTSZ(cuMemsetD32Async) + #define cuMemsetD2D8Async __CUDA_API_PTSZ(cuMemsetD2D8Async) + #define cuMemsetD2D16Async __CUDA_API_PTSZ(cuMemsetD2D16Async) + #define cuMemsetD2D32Async __CUDA_API_PTSZ(cuMemsetD2D32Async) + + #define cuStreamGetPriority __CUDA_API_PTSZ(cuStreamGetPriority) + #define cuStreamGetFlags __CUDA_API_PTSZ(cuStreamGetFlags) + #define cuStreamWaitEvent __CUDA_API_PTSZ(cuStreamWaitEvent) + #define cuStreamAddCallback __CUDA_API_PTSZ(cuStreamAddCallback) + #define cuStreamAttachMemAsync __CUDA_API_PTSZ(cuStreamAttachMemAsync) + #define cuStreamQuery __CUDA_API_PTSZ(cuStreamQuery) + #define cuStreamSynchronize __CUDA_API_PTSZ(cuStreamSynchronize) + #define cuEventRecord __CUDA_API_PTSZ(cuEventRecord) + #define cuLaunchKernel __CUDA_API_PTSZ(cuLaunchKernel) + #define cuGraphicsMapResources __CUDA_API_PTSZ(cuGraphicsMapResources) + #define cuGraphicsUnmapResources __CUDA_API_PTSZ(cuGraphicsUnmapResources) + + #define cuStreamWriteValue32 __CUDA_API_PTSZ(cuStreamWriteValue32) + #define cuStreamWaitValue32 __CUDA_API_PTSZ(cuStreamWaitValue32) + #define cuStreamWriteValue64 __CUDA_API_PTSZ(cuStreamWriteValue64) + #define cuStreamWaitValue64 __CUDA_API_PTSZ(cuStreamWaitValue64) + #define cuStreamBatchMemOp __CUDA_API_PTSZ(cuStreamBatchMemOp) + + #define cuLaunchCooperativeKernel __CUDA_API_PTSZ(cuLaunchCooperativeKernel) + +#endif + +/** + * \file cuda.h + * \brief Header file for the CUDA Toolkit application programming interface. + * + * \file cudaGL.h + * \brief Header file for the OpenGL interoperability functions of the + * low-level CUDA driver application programming interface. + * + * \file cudaD3D9.h + * \brief Header file for the Direct3D 9 interoperability functions of the + * low-level CUDA driver application programming interface. 
+ */ + +/** + * \defgroup CUDA_TYPES Data types used by CUDA driver + * @{ + */ + +/** + * CUDA API version number + */ +#define CUDA_VERSION 9000 + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * CUDA device pointer + * CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform. + */ +#if __CUDA_API_VERSION >= 3020 + +#if defined(_WIN64) || defined(__LP64__) +typedef unsigned long long CUdeviceptr; +#else +typedef unsigned int CUdeviceptr; +#endif + +#endif /* __CUDA_API_VERSION >= 3020 */ + +typedef int CUdevice; /**< CUDA device */ +typedef struct CUctx_st *CUcontext; /**< CUDA context */ +typedef struct CUmod_st *CUmodule; /**< CUDA module */ +typedef struct CUfunc_st *CUfunction; /**< CUDA function */ +typedef struct CUarray_st *CUarray; /**< CUDA array */ +typedef struct CUmipmappedArray_st *CUmipmappedArray; /**< CUDA mipmapped array */ +typedef struct CUtexref_st *CUtexref; /**< CUDA texture reference */ +typedef struct CUsurfref_st *CUsurfref; /**< CUDA surface reference */ +typedef struct CUevent_st *CUevent; /**< CUDA event */ +typedef struct CUstream_st *CUstream; /**< CUDA stream */ +typedef struct CUgraphicsResource_st *CUgraphicsResource; /**< CUDA graphics interop resource */ +typedef unsigned long long CUtexObject; /**< An opaque value that represents a CUDA texture object */ +typedef unsigned long long CUsurfObject; /**< An opaque value that represents a CUDA surface object */ + +typedef struct CUuuid_st { /**< CUDA definition of UUID */ + char bytes[16]; +} CUuuid; + + +#if __CUDA_API_VERSION >= 4010 + +/** + * CUDA IPC handle size + */ +#define CU_IPC_HANDLE_SIZE 64 + +/** + * CUDA IPC event handle + */ +typedef struct CUipcEventHandle_st { + char reserved[CU_IPC_HANDLE_SIZE]; +} CUipcEventHandle; + +/** + * CUDA IPC mem handle + */ +typedef struct CUipcMemHandle_st { + char reserved[CU_IPC_HANDLE_SIZE]; +} CUipcMemHandle; + +/** + * CUDA Ipc Mem Flags + */ +typedef enum CUipcMem_flags_enum { + CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 /**< Automatically enable peer access between remote devices as needed */ +} CUipcMem_flags; + +#endif + +/** + * CUDA Mem Attach Flags + */ +typedef enum CUmemAttach_flags_enum { + CU_MEM_ATTACH_GLOBAL = 0x1, /**< Memory can be accessed by any stream on any device */ + CU_MEM_ATTACH_HOST = 0x2, /**< Memory cannot be accessed by any stream on any device */ + CU_MEM_ATTACH_SINGLE = 0x4 /**< Memory can only be accessed by a single stream on the associated device */ +} CUmemAttach_flags; + +/** + * Context creation flags + */ +typedef enum CUctx_flags_enum { + CU_CTX_SCHED_AUTO = 0x00, /**< Automatic scheduling */ + CU_CTX_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */ + CU_CTX_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */ + CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */ + CU_CTX_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling + * \deprecated This flag was deprecated as of CUDA 4.0 + * and was replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. 
*/ + CU_CTX_SCHED_MASK = 0x07, + CU_CTX_MAP_HOST = 0x08, /**< Support mapped pinned allocations */ + CU_CTX_LMEM_RESIZE_TO_MAX = 0x10, /**< Keep local memory allocation after launch */ + CU_CTX_FLAGS_MASK = 0x1f +} CUctx_flags; + +/** + * Stream creation flags + */ +typedef enum CUstream_flags_enum { + CU_STREAM_DEFAULT = 0x0, /**< Default stream flag */ + CU_STREAM_NON_BLOCKING = 0x1 /**< Stream does not synchronize with stream 0 (the NULL stream) */ +} CUstream_flags; + +/** + * Legacy stream handle + * + * Stream handle that can be passed as a CUstream to use an implicit stream + * with legacy synchronization behavior. + * + * See details of the \link_sync_behavior + */ +#define CU_STREAM_LEGACY ((CUstream)0x1) + +/** + * Per-thread stream handle + * + * Stream handle that can be passed as a CUstream to use an implicit stream + * with per-thread synchronization behavior. + * + * See details of the \link_sync_behavior + */ +#define CU_STREAM_PER_THREAD ((CUstream)0x2) + +/** + * Event creation flags + */ +typedef enum CUevent_flags_enum { + CU_EVENT_DEFAULT = 0x0, /**< Default event flag */ + CU_EVENT_BLOCKING_SYNC = 0x1, /**< Event uses blocking synchronization */ + CU_EVENT_DISABLE_TIMING = 0x2, /**< Event will not record timing data */ + CU_EVENT_INTERPROCESS = 0x4 /**< Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set */ +} CUevent_flags; + +#if __CUDA_API_VERSION >= 8000 +/** + * Flags for ::cuStreamWaitValue32 and ::cuStreamWaitValue64 + */ +typedef enum CUstreamWaitValue_flags_enum { + CU_STREAM_WAIT_VALUE_GEQ = 0x0, /**< Wait until (int32_t)(*addr - value) >= 0 (or int64_t for 64 bit + values). Note this is a cyclic comparison which ignores wraparound. + (Default behavior.) */ + CU_STREAM_WAIT_VALUE_EQ = 0x1, /**< Wait until *addr == value. */ + CU_STREAM_WAIT_VALUE_AND = 0x2, /**< Wait until (*addr & value) != 0. */ + CU_STREAM_WAIT_VALUE_NOR = 0x3, /**< Wait until ~(*addr | value) != 0. Support for this operation can be + queried with ::cuDeviceGetAttribute() and + ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR. Generally, this + requires compute capability 7.0 or greater. */ + CU_STREAM_WAIT_VALUE_FLUSH = 1<<30 /**< Follow the wait operation with a flush of outstanding remote writes. This + means that, if a remote write operation is guaranteed to have reached the + device before the wait can be satisfied, that write is guaranteed to be + visible to downstream device work. The device is permitted to reorder + remote writes internally. For example, this flag would be required if + two remote writes arrive in a defined order, the wait is satisfied by the + second write, and downstream work needs to observe the first write. */ +} CUstreamWaitValue_flags; + +/** + * Flags for ::cuStreamWriteValue32 + */ +typedef enum CUstreamWriteValue_flags_enum { + CU_STREAM_WRITE_VALUE_DEFAULT = 0x0, /**< Default behavior */ + CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER = 0x1 /**< Permits the write to be reordered with writes which were issued + before it, as a performance optimization. Normally, + ::cuStreamWriteValue32 will provide a memory fence before the + write, which has similar semantics to + __threadfence_system() but is scoped to the stream + rather than a CUDA thread. 
*/ +} CUstreamWriteValue_flags; + +/** + * Operations for ::cuStreamBatchMemOp + */ +typedef enum CUstreamBatchMemOpType_enum { + CU_STREAM_MEM_OP_WAIT_VALUE_32 = 1, /**< Represents a ::cuStreamWaitValue32 operation */ + CU_STREAM_MEM_OP_WRITE_VALUE_32 = 2, /**< Represents a ::cuStreamWriteValue32 operation */ + CU_STREAM_MEM_OP_WAIT_VALUE_64 = 4, /**< Represents a ::cuStreamWaitValue64 operation */ + CU_STREAM_MEM_OP_WRITE_VALUE_64 = 5, /**< Represents a ::cuStreamWriteValue64 operation */ + CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES = 3 /**< This has the same effect as ::CU_STREAM_WAIT_VALUE_FLUSH, but as a + standalone operation. */ +} CUstreamBatchMemOpType; + +/** + * Per-operation parameters for ::cuStreamBatchMemOp + */ +typedef union CUstreamBatchMemOpParams_union { + CUstreamBatchMemOpType operation; + struct CUstreamMemOpWaitValueParams_st { + CUstreamBatchMemOpType operation; + CUdeviceptr address; + union { + cuuint32_t value; + cuuint64_t value64; + }; + unsigned int flags; + CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */ + } waitValue; + struct CUstreamMemOpWriteValueParams_st { + CUstreamBatchMemOpType operation; + CUdeviceptr address; + union { + cuuint32_t value; + cuuint64_t value64; + }; + unsigned int flags; + CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */ + } writeValue; + struct CUstreamMemOpFlushRemoteWritesParams_st { + CUstreamBatchMemOpType operation; + unsigned int flags; + } flushRemoteWrites; + cuuint64_t pad[6]; +} CUstreamBatchMemOpParams; +#endif /* __CUDA_API_VERSION >= 8000 */ + +/** + * Occupancy calculator flag + */ +typedef enum CUoccupancy_flags_enum { + CU_OCCUPANCY_DEFAULT = 0x0, /**< Default behavior */ + CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE = 0x1 /**< Assume global caching is enabled and cannot be automatically turned off */ +} CUoccupancy_flags; + +/** + * Array formats + */ +typedef enum CUarray_format_enum { + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */ + CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */ + CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */ + CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */ + CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */ + CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */ + CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */ + CU_AD_FORMAT_FLOAT = 0x20 /**< 32-bit floating point */ +} CUarray_format; + +/** + * Texture reference addressing modes + */ +typedef enum CUaddress_mode_enum { + CU_TR_ADDRESS_MODE_WRAP = 0, /**< Wrapping address mode */ + CU_TR_ADDRESS_MODE_CLAMP = 1, /**< Clamp to edge address mode */ + CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */ + CU_TR_ADDRESS_MODE_BORDER = 3 /**< Border address mode */ +} CUaddress_mode; + +/** + * Texture reference filtering modes + */ +typedef enum CUfilter_mode_enum { + CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */ + CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */ +} CUfilter_mode; + +/** + * Device properties + */ +typedef enum CUdevice_attribute_enum { + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1, /**< Maximum number of threads per block */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, /**< Maximum block dimension X */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, /**< Maximum block dimension Y */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, /**< Maximum block dimension Z */ + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, /**< Maximum grid dimension X */ + 
CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, /**< Maximum grid dimension Y */ + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, /**< Maximum grid dimension Z */ + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, /**< Maximum shared memory available per block in bytes */ + CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */ + CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */ + CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, /**< Warp size in threads */ + CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, /**< Maximum pitch in bytes allowed by memory copies */ + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, /**< Maximum number of 32-bit registers available per block */ + CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */ + CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, /**< Typical clock frequency in kilohertz */ + CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, /**< Alignment requirement for textures */ + CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. */ + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, /**< Number of multiprocessors on device */ + CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, /**< Specifies whether there is a run time limit on kernels */ + CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, /**< Device is integrated with host memory */ + CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, /**< Device can map host memory into CUDA address space */ + CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, /**< Compute mode (See ::CUcomputemode for details) */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, /**< Maximum 1D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, /**< Maximum 2D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, /**< Maximum 2D texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, /**< Maximum 3D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, /**< Maximum 3D texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, /**< Maximum 3D texture depth */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, /**< Maximum 2D layered texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, /**< Maximum 2D layered texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, /**< Maximum layers in a 2D layered texture */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */ + CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, /**< Alignment requirement for surfaces */ + CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, /**< Device can possibly execute multiple kernels concurrently */ + CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, /**< Device has ECC support enabled */ + CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, /**< PCI bus ID of the device */ + CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, /**< PCI device ID of the device */ + CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, /**< Device is using TCC driver model */ + CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, 
/**< Peak memory clock frequency in kilohertz */ + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, /**< Global memory bus width in bits */ + CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, /**< Size of L2 cache in bytes */ + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, /**< Maximum resident threads per multiprocessor */ + CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, /**< Number of asynchronous engines */ + CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, /**< Device shares a unified address space with the host */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, /**< Maximum 1D layered texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, /**< Maximum layers in a 1D layered texture */ + CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44, /**< Deprecated, do not use. */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, /**< Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, /**< Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, /**< Alternate maximum 3D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48,/**< Alternate maximum 3D texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, /**< Alternate maximum 3D texture depth */ + CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, /**< PCI domain ID of the device */ + CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, /**< Pitch alignment requirement for textures */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, /**< Maximum cubemap texture width/height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, /**< Maximum cubemap layered texture width/height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, /**< Maximum layers in a cubemap layered texture */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, /**< Maximum 1D surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, /**< Maximum 2D surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, /**< Maximum 2D surface height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, /**< Maximum 3D surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, /**< Maximum 3D surface height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, /**< Maximum 3D surface depth */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, /**< Maximum 1D layered surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, /**< Maximum layers in a 1D layered surface */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, /**< Maximum 2D layered surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, /**< Maximum 2D layered surface height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, /**< Maximum layers in a 2D layered surface */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, /**< Maximum cubemap surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, /**< Maximum cubemap layered surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, /**< Maximum layers in a cubemap layered surface */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69, /**< Maximum 1D linear texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, /**< Maximum 2D linear texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, /**< Maximum 2D linear texture 
height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, /**< Maximum 2D linear texture pitch in bytes */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, /**< Maximum mipmapped 2D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74,/**< Maximum mipmapped 2D texture height */ + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, /**< Major compute capability version number */ + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, /**< Minor compute capability version number */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, /**< Maximum mipmapped 1D texture width */ + CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, /**< Device supports stream priorities */ + CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79, /**< Device supports caching globals in L1 */ + CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80, /**< Device supports caching locals in L1 */ + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, /**< Maximum shared memory available per multiprocessor in bytes */ + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, /**< Maximum number of 32-bit registers available per multiprocessor */ + CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, /**< Device can allocate managed memory on this system */ + CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, /**< Device is on a multi-GPU board */ + CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, /**< Unique id for a group of devices on the same multi-GPU board */ + CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86, /**< Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/ + CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87, /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */ + CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88, /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */ + CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, /**< Device can coherently access managed memory concurrently with the CPU */ + CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, /**< Device supports compute preemption. */ + CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, /**< Device can access host registered memory at the same virtual address as the CPU */ + CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92, /**< ::cuStreamBatchMemOp and related APIs are supported. */ + CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93, /**< 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs. */ + CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94, /**< ::CU_STREAM_WAIT_VALUE_NOR is supported. 
*/ + CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95, /**< Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel */ + CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96, /**< Device can participate in cooperative kernels launched via ::cuLaunchCooperativeKernelMultiDevice */ + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, /**< Maximum optin shared memory per block */ + CU_DEVICE_ATTRIBUTE_MAX +} CUdevice_attribute; + +/** + * Legacy device properties + */ +typedef struct CUdevprop_st { + int maxThreadsPerBlock; /**< Maximum number of threads per block */ + int maxThreadsDim[3]; /**< Maximum size of each dimension of a block */ + int maxGridSize[3]; /**< Maximum size of each dimension of a grid */ + int sharedMemPerBlock; /**< Shared memory available per block in bytes */ + int totalConstantMemory; /**< Constant memory available on device in bytes */ + int SIMDWidth; /**< Warp size in threads */ + int memPitch; /**< Maximum pitch in bytes allowed by memory copies */ + int regsPerBlock; /**< 32-bit registers available per block */ + int clockRate; /**< Clock frequency in kilohertz */ + int textureAlign; /**< Alignment requirement for textures */ +} CUdevprop; + +/** + * Pointer information + */ +typedef enum CUpointer_attribute_enum { + CU_POINTER_ATTRIBUTE_CONTEXT = 1, /**< The ::CUcontext on which a pointer was allocated or registered */ + CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2, /**< The ::CUmemorytype describing the physical location of a pointer */ + CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3, /**< The address at which a pointer's memory may be accessed on the device */ + CU_POINTER_ATTRIBUTE_HOST_POINTER = 4, /**< The address at which a pointer's memory may be accessed on the host */ + CU_POINTER_ATTRIBUTE_P2P_TOKENS = 5, /**< A pair of tokens for use with the nv-p2p.h Linux kernel interface */ + CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6, /**< Synchronize every synchronous memory operation initiated on this region */ + CU_POINTER_ATTRIBUTE_BUFFER_ID = 7, /**< A process-wide unique ID for an allocated memory region*/ + CU_POINTER_ATTRIBUTE_IS_MANAGED = 8 /**< Indicates if the pointer points to managed memory */ +} CUpointer_attribute; + +/** + * Function properties + */ +typedef enum CUfunction_attribute_enum { + /** + * The maximum number of threads per block, beyond which a launch of the + * function would fail. This number depends on both the function and the + * device on which the function is currently loaded. + */ + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, + + /** + * The size in bytes of statically-allocated shared memory required by + * this function. This does not include dynamically-allocated shared + * memory requested by the user at runtime. + */ + CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1, + + /** + * The size in bytes of user-allocated constant memory required by this + * function. + */ + CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2, + + /** + * The size in bytes of local memory used by each thread of this function. + */ + CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3, + + /** + * The number of registers used by each thread of this function. + */ + CU_FUNC_ATTRIBUTE_NUM_REGS = 4, + + /** + * The PTX virtual architecture version for which the function was + * compiled. This value is the major PTX version * 10 + the minor PTX + * version, so a PTX version 1.3 function would return the value 13. + * Note that this may return the undefined value of 0 for cubins + * compiled prior to CUDA 3.0. 
+ */ + CU_FUNC_ATTRIBUTE_PTX_VERSION = 5, + + /** + * The binary architecture version for which the function was compiled. + * This value is the major binary version * 10 + the minor binary version, + * so a binary version 1.3 function would return the value 13. Note that + * this will return a value of 10 for legacy cubins that do not have a + * properly-encoded binary architecture version. + */ + CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6, + + /** + * The attribute to indicate whether the function has been compiled with + * user specified option "-Xptxas --dlcm=ca" set . + */ + CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7, + + /** + * The maximum size in bytes of dynamically-allocated shared memory that can be used by + * this function. If the user-specified dynamic shared memory size is larger than this + * value, the launch will fail. + */ + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8, + + /** + * On devices where the L1 cache and shared memory use the same hardware resources, + * this sets the shared memory carveout preference, in percent of the total resources. + * This is only a hint, and the driver can choose a different ratio if required to execute the function. + */ + CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9, + + CU_FUNC_ATTRIBUTE_MAX +} CUfunction_attribute; + +/** + * Function cache configurations + */ +typedef enum CUfunc_cache_enum { + CU_FUNC_CACHE_PREFER_NONE = 0x00, /**< no preference for shared memory or L1 (default) */ + CU_FUNC_CACHE_PREFER_SHARED = 0x01, /**< prefer larger shared memory and smaller L1 cache */ + CU_FUNC_CACHE_PREFER_L1 = 0x02, /**< prefer larger L1 cache and smaller shared memory */ + CU_FUNC_CACHE_PREFER_EQUAL = 0x03 /**< prefer equal sized L1 cache and shared memory */ +} CUfunc_cache; + +/** + * Shared memory configurations + */ +typedef enum CUsharedconfig_enum { + CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE = 0x00, /**< set default shared memory bank size */ + CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE = 0x01, /**< set shared memory bank width to four bytes */ + CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02 /**< set shared memory bank width to eight bytes */ +} CUsharedconfig; + +/** + * Shared memory carveout configurations + */ +typedef enum CUshared_carveout_enum { + CU_SHAREDMEM_CARVEOUT_DEFAULT = -1, /** < no preference for shared memory or L1 (default) */ + CU_SHAREDMEM_CARVEOUT_MAX_SHARED = 100, /** < prefer maximum available shared memory, minimum L1 cache */ + CU_SHAREDMEM_CARVEOUT_MAX_L1 = 0 /** < prefer maximum available L1 cache, minimum shared memory */ +} CUshared_carveout; + +/** + * Memory types + */ +typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, /**< Host memory */ + CU_MEMORYTYPE_DEVICE = 0x02, /**< Device memory */ + CU_MEMORYTYPE_ARRAY = 0x03, /**< Array memory */ + CU_MEMORYTYPE_UNIFIED = 0x04 /**< Unified device or host memory */ +} CUmemorytype; + +/** + * Compute Modes + */ +typedef enum CUcomputemode_enum { + CU_COMPUTEMODE_DEFAULT = 0, /**< Default compute mode (Multiple contexts allowed per device) */ + CU_COMPUTEMODE_PROHIBITED = 2, /**< Compute-prohibited mode (No contexts can be created on this device at this time) */ + CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 /**< Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */ +} CUcomputemode; + +/** + * Memory advise values + */ +typedef enum CUmem_advise_enum { + CU_MEM_ADVISE_SET_READ_MOSTLY = 1, /**< Data will mostly be read and only occassionally be written to */ + 
CU_MEM_ADVISE_UNSET_READ_MOSTLY = 2, /**< Undo the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY */ + CU_MEM_ADVISE_SET_PREFERRED_LOCATION = 3, /**< Set the preferred location for the data as the specified device */ + CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4, /**< Clear the preferred location for the data */ + CU_MEM_ADVISE_SET_ACCESSED_BY = 5, /**< Data will be accessed by the specified device, so prevent page faults as much as possible */ + CU_MEM_ADVISE_UNSET_ACCESSED_BY = 6 /**< Let the Unified Memory subsystem decide on the page faulting policy for the specified device */ +} CUmem_advise; + +typedef enum CUmem_range_attribute_enum { + CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY = 1, /**< Whether the range will mostly be read and only occassionally be written to */ + CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION = 2, /**< The preferred location of the range */ + CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY = 3, /**< Memory range has ::CU_MEM_ADVISE_SET_ACCESSED_BY set for specified device */ + CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4 /**< The last location to which the range was prefetched */ +} CUmem_range_attribute; + +/** + * Online compiler and linker options + */ +typedef enum CUjit_option_enum +{ + /** + * Max number of registers that a thread may use.\n + * Option type: unsigned int\n + * Applies to: compiler only + */ + CU_JIT_MAX_REGISTERS = 0, + + /** + * IN: Specifies minimum number of threads per block to target compilation + * for\n + * OUT: Returns the number of threads the compiler actually targeted. + * This restricts the resource utilization fo the compiler (e.g. max + * registers) such that a block with the given number of threads should be + * able to launch based on register limitations. Note, this option does not + * currently take into account any other resource limitations, such as + * shared memory utilization.\n + * Cannot be combined with ::CU_JIT_TARGET.\n + * Option type: unsigned int\n + * Applies to: compiler only + */ + CU_JIT_THREADS_PER_BLOCK, + + /** + * Overwrites the option value with the total wall clock time, in + * milliseconds, spent in the compiler and linker\n + * Option type: float\n + * Applies to: compiler and linker + */ + CU_JIT_WALL_TIME, + + /** + * Pointer to a buffer in which to print any log messages + * that are informational in nature (the buffer size is specified via + * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)\n + * Option type: char *\n + * Applies to: compiler and linker + */ + CU_JIT_INFO_LOG_BUFFER, + + /** + * IN: Log buffer size in bytes. Log messages will be capped at this size + * (including null terminator)\n + * OUT: Amount of log buffer filled with messages\n + * Option type: unsigned int\n + * Applies to: compiler and linker + */ + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + + /** + * Pointer to a buffer in which to print any log messages that + * reflect errors (the buffer size is specified via option + * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n + * Option type: char *\n + * Applies to: compiler and linker + */ + CU_JIT_ERROR_LOG_BUFFER, + + /** + * IN: Log buffer size in bytes. 
Log messages will be capped at this size + * (including null terminator)\n + * OUT: Amount of log buffer filled with messages\n + * Option type: unsigned int\n + * Applies to: compiler and linker + */ + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + + /** + * Level of optimizations to apply to generated code (0 - 4), with 4 + * being the default and highest level of optimizations.\n + * Option type: unsigned int\n + * Applies to: compiler only + */ + CU_JIT_OPTIMIZATION_LEVEL, + + /** + * No option value required. Determines the target based on the current + * attached context (default)\n + * Option type: No option value needed\n + * Applies to: compiler and linker + */ + CU_JIT_TARGET_FROM_CUCONTEXT, + + /** + * Target is chosen based on supplied ::CUjit_target. Cannot be + * combined with ::CU_JIT_THREADS_PER_BLOCK.\n + * Option type: unsigned int for enumerated type ::CUjit_target\n + * Applies to: compiler and linker + */ + CU_JIT_TARGET, + + /** + * Specifies choice of fallback strategy if matching cubin is not found. + * Choice is based on supplied ::CUjit_fallback. This option cannot be + * used with cuLink* APIs as the linker requires exact matches.\n + * Option type: unsigned int for enumerated type ::CUjit_fallback\n + * Applies to: compiler only + */ + CU_JIT_FALLBACK_STRATEGY, + + /** + * Specifies whether to create debug information in output (-g) + * (0: false, default)\n + * Option type: int\n + * Applies to: compiler and linker + */ + CU_JIT_GENERATE_DEBUG_INFO, + + /** + * Generate verbose log messages (0: false, default)\n + * Option type: int\n + * Applies to: compiler and linker + */ + CU_JIT_LOG_VERBOSE, + + /** + * Generate line number information (-lineinfo) (0: false, default)\n + * Option type: int\n + * Applies to: compiler only + */ + CU_JIT_GENERATE_LINE_INFO, + + /** + * Specifies whether to enable caching explicitly (-dlcm) \n + * Choice is based on supplied ::CUjit_cacheMode_enum.\n + * Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum\n + * Applies to: compiler only + */ + CU_JIT_CACHE_MODE, + + /** + * The below jit options are used for internal purposes only, in this version of CUDA + */ + CU_JIT_NEW_SM3X_OPT, + CU_JIT_FAST_COMPILE, + + CU_JIT_NUM_OPTIONS + +} CUjit_option; + +/** + * Online compilation targets + */ +typedef enum CUjit_target_enum +{ + CU_TARGET_COMPUTE_20 = 20, /**< Compute device class 2.0 */ + CU_TARGET_COMPUTE_21 = 21, /**< Compute device class 2.1 */ + CU_TARGET_COMPUTE_30 = 30, /**< Compute device class 3.0 */ + CU_TARGET_COMPUTE_32 = 32, /**< Compute device class 3.2 */ + CU_TARGET_COMPUTE_35 = 35, /**< Compute device class 3.5 */ + CU_TARGET_COMPUTE_37 = 37, /**< Compute device class 3.7 */ + CU_TARGET_COMPUTE_50 = 50, /**< Compute device class 5.0 */ + CU_TARGET_COMPUTE_52 = 52, /**< Compute device class 5.2 */ + CU_TARGET_COMPUTE_53 = 53, /**< Compute device class 5.3 */ + CU_TARGET_COMPUTE_60 = 60, /**< Compute device class 6.0.*/ + CU_TARGET_COMPUTE_61 = 61, /**< Compute device class 6.1.*/ + CU_TARGET_COMPUTE_62 = 62, /**< Compute device class 6.2.*/ + CU_TARGET_COMPUTE_70 = 70 /**< Compute device class 7.0.*/ +} CUjit_target; + +/** + * Cubin matching fallback strategies + */ +typedef enum CUjit_fallback_enum +{ + CU_PREFER_PTX = 0, /**< Prefer to compile ptx if exact binary match not found */ + + CU_PREFER_BINARY /**< Prefer to fall back to compatible binary code if exact match not found */ + +} CUjit_fallback; + +/** + * Caching modes for dlcm + */ +typedef enum CUjit_cacheMode_enum +{ + CU_JIT_CACHE_OPTION_NONE = 
0, /**< Compile with no -dlcm flag specified */ + CU_JIT_CACHE_OPTION_CG, /**< Compile with L1 cache disabled */ + CU_JIT_CACHE_OPTION_CA /**< Compile with L1 cache enabled */ +} CUjit_cacheMode; + +/** + * Device code formats + */ +typedef enum CUjitInputType_enum +{ + /** + * Compiled device-class-specific device code\n + * Applicable options: none + */ + CU_JIT_INPUT_CUBIN = 0, + + /** + * PTX source code\n + * Applicable options: PTX compiler options + */ + CU_JIT_INPUT_PTX, + + /** + * Bundle of multiple cubins and/or PTX of some device code\n + * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY + */ + CU_JIT_INPUT_FATBINARY, + + /** + * Host object with embedded device code\n + * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY + */ + CU_JIT_INPUT_OBJECT, + + /** + * Archive of host objects with embedded device code\n + * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY + */ + CU_JIT_INPUT_LIBRARY, + + CU_JIT_NUM_INPUT_TYPES +} CUjitInputType; + +#if __CUDA_API_VERSION >= 5050 +typedef struct CUlinkState_st *CUlinkState; +#endif /* __CUDA_API_VERSION >= 5050 */ + +/** + * Flags to register a graphics resource + */ +typedef enum CUgraphicsRegisterFlags_enum { + CU_GRAPHICS_REGISTER_FLAGS_NONE = 0x00, + CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 0x01, + CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02, + CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST = 0x04, + CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER = 0x08 +} CUgraphicsRegisterFlags; + +/** + * Flags for mapping and unmapping interop resources + */ +typedef enum CUgraphicsMapResourceFlags_enum { + CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00, + CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01, + CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02 +} CUgraphicsMapResourceFlags; + +/** + * Array indices for cube faces + */ +typedef enum CUarray_cubemap_face_enum { + CU_CUBEMAP_FACE_POSITIVE_X = 0x00, /**< Positive X face of cubemap */ + CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, /**< Negative X face of cubemap */ + CU_CUBEMAP_FACE_POSITIVE_Y = 0x02, /**< Positive Y face of cubemap */ + CU_CUBEMAP_FACE_NEGATIVE_Y = 0x03, /**< Negative Y face of cubemap */ + CU_CUBEMAP_FACE_POSITIVE_Z = 0x04, /**< Positive Z face of cubemap */ + CU_CUBEMAP_FACE_NEGATIVE_Z = 0x05 /**< Negative Z face of cubemap */ +} CUarray_cubemap_face; + +/** + * Limits + */ +typedef enum CUlimit_enum { + CU_LIMIT_STACK_SIZE = 0x00, /**< GPU thread stack size */ + CU_LIMIT_PRINTF_FIFO_SIZE = 0x01, /**< GPU printf FIFO size */ + CU_LIMIT_MALLOC_HEAP_SIZE = 0x02, /**< GPU malloc heap size */ + CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH = 0x03, /**< GPU device runtime launch synchronize depth */ + CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04, /**< GPU device runtime pending launch count */ + CU_LIMIT_MAX +} CUlimit; + +/** + * Resource types + */ +typedef enum CUresourcetype_enum { + CU_RESOURCE_TYPE_ARRAY = 0x00, /**< Array resoure */ + CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */ + CU_RESOURCE_TYPE_LINEAR = 0x02, /**< Linear resource */ + CU_RESOURCE_TYPE_PITCH2D = 0x03 /**< Pitch 2D resource */ +} CUresourcetype; + +/** + * Error codes + */ +typedef enum cudaError_enum { + /** + * The API call returned with no errors. In the case of query calls, this + * can also mean that the operation being queried is complete (see + * ::cuEventQuery() and ::cuStreamQuery()). 
+ */ + CUDA_SUCCESS = 0, + + /** + * This indicates that one or more of the parameters passed to the API call + * is not within an acceptable range of values. + */ + CUDA_ERROR_INVALID_VALUE = 1, + + /** + * The API call failed because it was unable to allocate enough memory to + * perform the requested operation. + */ + CUDA_ERROR_OUT_OF_MEMORY = 2, + + /** + * This indicates that the CUDA driver has not been initialized with + * ::cuInit() or that initialization has failed. + */ + CUDA_ERROR_NOT_INITIALIZED = 3, + + /** + * This indicates that the CUDA driver is in the process of shutting down. + */ + CUDA_ERROR_DEINITIALIZED = 4, + + /** + * This indicates profiler is not initialized for this run. This can + * happen when the application is running with external profiling tools + * like visual profiler. + */ + CUDA_ERROR_PROFILER_DISABLED = 5, + + /** + * \deprecated + * This error return is deprecated as of CUDA 5.0. It is no longer an error + * to attempt to enable/disable the profiling via ::cuProfilerStart or + * ::cuProfilerStop without initialization. + */ + CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6, + + /** + * \deprecated + * This error return is deprecated as of CUDA 5.0. It is no longer an error + * to call cuProfilerStart() when profiling is already enabled. + */ + CUDA_ERROR_PROFILER_ALREADY_STARTED = 7, + + /** + * \deprecated + * This error return is deprecated as of CUDA 5.0. It is no longer an error + * to call cuProfilerStop() when profiling is already disabled. + */ + CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8, + + /** + * This indicates that no CUDA-capable devices were detected by the installed + * CUDA driver. + */ + CUDA_ERROR_NO_DEVICE = 100, + + /** + * This indicates that the device ordinal supplied by the user does not + * correspond to a valid CUDA device. + */ + CUDA_ERROR_INVALID_DEVICE = 101, + + + /** + * This indicates that the device kernel image is invalid. This can also + * indicate an invalid CUDA module. + */ + CUDA_ERROR_INVALID_IMAGE = 200, + + /** + * This most frequently indicates that there is no context bound to the + * current thread. This can also be returned if the context passed to an + * API call is not a valid handle (such as a context that has had + * ::cuCtxDestroy() invoked on it). This can also be returned if a user + * mixes different API versions (i.e. 3010 context with 3020 API calls). + * See ::cuCtxGetApiVersion() for more details. + */ + CUDA_ERROR_INVALID_CONTEXT = 201, + + /** + * This indicated that the context being supplied as a parameter to the + * API call was already the active context. + * \deprecated + * This error return is deprecated as of CUDA 3.2. It is no longer an + * error to attempt to push the active context via ::cuCtxPushCurrent(). + */ + CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202, + + /** + * This indicates that a map or register operation has failed. + */ + CUDA_ERROR_MAP_FAILED = 205, + + /** + * This indicates that an unmap or unregister operation has failed. + */ + CUDA_ERROR_UNMAP_FAILED = 206, + + /** + * This indicates that the specified array is currently mapped and thus + * cannot be destroyed. + */ + CUDA_ERROR_ARRAY_IS_MAPPED = 207, + + /** + * This indicates that the resource is already mapped. + */ + CUDA_ERROR_ALREADY_MAPPED = 208, + + /** + * This indicates that there is no kernel image available that is suitable + * for the device. This can occur when a user specifies code generation + * options for a particular CUDA source file that do not include the + * corresponding device configuration. 
+ */ + CUDA_ERROR_NO_BINARY_FOR_GPU = 209, + + /** + * This indicates that a resource has already been acquired. + */ + CUDA_ERROR_ALREADY_ACQUIRED = 210, + + /** + * This indicates that a resource is not mapped. + */ + CUDA_ERROR_NOT_MAPPED = 211, + + /** + * This indicates that a mapped resource is not available for access as an + * array. + */ + CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212, + + /** + * This indicates that a mapped resource is not available for access as a + * pointer. + */ + CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213, + + /** + * This indicates that an uncorrectable ECC error was detected during + * execution. + */ + CUDA_ERROR_ECC_UNCORRECTABLE = 214, + + /** + * This indicates that the ::CUlimit passed to the API call is not + * supported by the active device. + */ + CUDA_ERROR_UNSUPPORTED_LIMIT = 215, + + /** + * This indicates that the ::CUcontext passed to the API call can + * only be bound to a single CPU thread at a time but is already + * bound to a CPU thread. + */ + CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216, + + /** + * This indicates that peer access is not supported across the given + * devices. + */ + CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217, + + /** + * This indicates that a PTX JIT compilation failed. + */ + CUDA_ERROR_INVALID_PTX = 218, + + /** + * This indicates an error with OpenGL or DirectX context. + */ + CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219, + + /** + * This indicates that an uncorrectable NVLink error was detected during the + * execution. + */ + CUDA_ERROR_NVLINK_UNCORRECTABLE = 220, + + /** + * This indicates that the PTX JIT compiler library was not found. + */ + CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221, + + /** + * This indicates that the device kernel source is invalid. + */ + CUDA_ERROR_INVALID_SOURCE = 300, + + /** + * This indicates that the file specified was not found. + */ + CUDA_ERROR_FILE_NOT_FOUND = 301, + + /** + * This indicates that a link to a shared object failed to resolve. + */ + CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302, + + /** + * This indicates that initialization of a shared object failed. + */ + CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303, + + /** + * This indicates that an OS call failed. + */ + CUDA_ERROR_OPERATING_SYSTEM = 304, + + /** + * This indicates that a resource handle passed to the API call was not + * valid. Resource handles are opaque types like ::CUstream and ::CUevent. + */ + CUDA_ERROR_INVALID_HANDLE = 400, + + /** + * This indicates that a named symbol was not found. Examples of symbols + * are global/constant variable names, texture names, and surface names. + */ + CUDA_ERROR_NOT_FOUND = 500, + + /** + * This indicates that asynchronous operations issued previously have not + * completed yet. This result is not actually an error, but must be indicated + * differently than ::CUDA_SUCCESS (which indicates completion). Calls that + * may return this value include ::cuEventQuery() and ::cuStreamQuery(). + */ + CUDA_ERROR_NOT_READY = 600, + + /** + * While executing a kernel, the device encountered a + * load or store instruction on an invalid memory address. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_ILLEGAL_ADDRESS = 700, + + /** + * This indicates that a launch did not occur because it did not have + * appropriate resources. 
This error usually indicates that the user has + * attempted to pass too many arguments to the device kernel, or the + * kernel launch specifies too many threads for the kernel's register + * count. Passing arguments of the wrong size (i.e. a 64-bit pointer + * when a 32-bit int is expected) is equivalent to passing too many + * arguments and can also result in this error. + */ + CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, + + /** + * This indicates that the device kernel took too long to execute. This can + * only occur if timeouts are enabled - see the device attribute + * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_LAUNCH_TIMEOUT = 702, + + /** + * This error indicates a kernel launch that uses an incompatible texturing + * mode. + */ + CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, + + /** + * This error indicates that a call to ::cuCtxEnablePeerAccess() is + * trying to re-enable peer access to a context which has already + * had peer access to it enabled. + */ + CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704, + + /** + * This error indicates that ::cuCtxDisablePeerAccess() is + * trying to disable peer access which has not been enabled yet + * via ::cuCtxEnablePeerAccess(). + */ + CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705, + + /** + * This error indicates that the primary context for the specified device + * has already been initialized. + */ + CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708, + + /** + * This error indicates that the context current to the calling thread + * has been destroyed using ::cuCtxDestroy, or is a primary context which + * has not yet been initialized. + */ + CUDA_ERROR_CONTEXT_IS_DESTROYED = 709, + + /** + * A device-side assert triggered during kernel execution. The context + * cannot be used anymore, and must be destroyed. All existing device + * memory allocations from this context are invalid and must be + * reconstructed if the program is to continue using CUDA. + */ + CUDA_ERROR_ASSERT = 710, + + /** + * This error indicates that the hardware resources required to enable + * peer access have been exhausted for one or more of the devices + * passed to ::cuCtxEnablePeerAccess(). + */ + CUDA_ERROR_TOO_MANY_PEERS = 711, + + /** + * This error indicates that the memory range passed to ::cuMemHostRegister() + * has already been registered. + */ + CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712, + + /** + * This error indicates that the pointer passed to ::cuMemHostUnregister() + * does not correspond to any currently registered memory region. + */ + CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713, + + /** + * While executing a kernel, the device encountered a stack error. + * This can be due to stack corruption or exceeding the stack size limit. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_HARDWARE_STACK_ERROR = 714, + + /** + * While executing a kernel, the device encountered an illegal instruction. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. 
+ */ + CUDA_ERROR_ILLEGAL_INSTRUCTION = 715, + + /** + * While executing a kernel, the device encountered a load or store instruction + * on a memory address which is not aligned. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_MISALIGNED_ADDRESS = 716, + + /** + * While executing a kernel, the device encountered an instruction + * which can only operate on memory locations in certain address spaces + * (global, shared, or local), but was supplied a memory address not + * belonging to an allowed address space. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_INVALID_ADDRESS_SPACE = 717, + + /** + * While executing a kernel, the device program counter wrapped its address space. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_INVALID_PC = 718, + + /** + * An exception occurred on the device while executing a kernel. Common + * causes include dereferencing an invalid device pointer and accessing + * out of bounds shared memory. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_LAUNCH_FAILED = 719, + + /** + * This error indicates that the number of blocks launched per grid for a kernel that was + * launched via either ::cuLaunchCooperativeKernel or ::cuLaunchCooperativeKernelMultiDevice + * exceeds the maximum number of blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor + * or ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors + * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. + */ + CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720, + + /** + * This error indicates that the attempted operation is not permitted. + */ + CUDA_ERROR_NOT_PERMITTED = 800, + + /** + * This error indicates that the attempted operation is not supported + * on the current system or device. + */ + CUDA_ERROR_NOT_SUPPORTED = 801, + + /** + * This indicates that an unknown internal error has occurred. + */ + CUDA_ERROR_UNKNOWN = 999 +} CUresult; + +/** + * P2P Attributes + */ +typedef enum CUdevice_P2PAttribute_enum { + CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01, /**< A relative value indicating the performance of the link between two devices */ + CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02, /**< P2P Access is enable */ + CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03 /**< Atomic operation over the link supported */ +} CUdevice_P2PAttribute; + +#ifdef _WIN32 +#define CUDA_CB __stdcall +#else +#define CUDA_CB +#endif + +/** + * CUDA stream callback + * \param hStream The stream the callback was added to, as passed to ::cuStreamAddCallback. May be NULL. + * \param status ::CUDA_SUCCESS or any persistent error on the stream. + * \param userData User parameter provided at registration. + */ +typedef void (CUDA_CB *CUstreamCallback)(CUstream hStream, CUresult status, void *userData); + +/** + * Block size to per-block dynamic shared memory mapping for a certain + * kernel \param blockSize Block size of the kernel. 
+ * + * \return The dynamic shared memory needed by a block. + */ +typedef size_t (CUDA_CB *CUoccupancyB2DSize)(int blockSize); + +/** + * If set, host memory is portable between CUDA contexts. + * Flag for ::cuMemHostAlloc() + */ +#define CU_MEMHOSTALLOC_PORTABLE 0x01 + +/** + * If set, host memory is mapped into CUDA address space and + * ::cuMemHostGetDevicePointer() may be called on the host pointer. + * Flag for ::cuMemHostAlloc() + */ +#define CU_MEMHOSTALLOC_DEVICEMAP 0x02 + +/** + * If set, host memory is allocated as write-combined - fast to write, + * faster to DMA, slow to read except via SSE4 streaming load instruction + * (MOVNTDQA). + * Flag for ::cuMemHostAlloc() + */ +#define CU_MEMHOSTALLOC_WRITECOMBINED 0x04 + +/** + * If set, host memory is portable between CUDA contexts. + * Flag for ::cuMemHostRegister() + */ +#define CU_MEMHOSTREGISTER_PORTABLE 0x01 + +/** + * If set, host memory is mapped into CUDA address space and + * ::cuMemHostGetDevicePointer() may be called on the host pointer. + * Flag for ::cuMemHostRegister() + */ +#define CU_MEMHOSTREGISTER_DEVICEMAP 0x02 + +/** + * If set, the passed memory pointer is treated as pointing to some + * memory-mapped I/O space, e.g. belonging to a third-party PCIe device. + * On Windows the flag is a no-op. + * On Linux that memory is marked as non cache-coherent for the GPU and + * is expected to be physically contiguous. It may return + * CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user, + * CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions. + * On all other platforms, it is not supported and CUDA_ERROR_NOT_SUPPORTED + * is returned. + * Flag for ::cuMemHostRegister() + */ +#define CU_MEMHOSTREGISTER_IOMEMORY 0x04 + +#if __CUDA_API_VERSION >= 3020 + +/** + * 2D memory copy parameters + */ +typedef struct CUDA_MEMCPY2D_st { + size_t srcXInBytes; /**< Source X in bytes */ + size_t srcY; /**< Source Y */ + + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ + const void *srcHost; /**< Source host pointer */ + CUdeviceptr srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + size_t srcPitch; /**< Source pitch (ignored when src is array) */ + + size_t dstXInBytes; /**< Destination X in bytes */ + size_t dstY; /**< Destination Y */ + + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ + + size_t WidthInBytes; /**< Width of 2D memory copy in bytes */ + size_t Height; /**< Height of 2D memory copy */ +} CUDA_MEMCPY2D; + +/** + * 3D memory copy parameters + */ +typedef struct CUDA_MEMCPY3D_st { + size_t srcXInBytes; /**< Source X in bytes */ + size_t srcY; /**< Source Y */ + size_t srcZ; /**< Source Z */ + size_t srcLOD; /**< Source LOD */ + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ + const void *srcHost; /**< Source host pointer */ + CUdeviceptr srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + void *reserved0; /**< Must be NULL */ + size_t srcPitch; /**< Source pitch (ignored when src is array) */ + size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ + + size_t dstXInBytes; /**< Destination X in bytes */ + size_t dstY; /**< Destination Y */ + size_t dstZ; /**< Destination 
Z */
+    size_t dstLOD;              /**< Destination LOD */
+    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+    void *dstHost;              /**< Destination host pointer */
+    CUdeviceptr dstDevice;      /**< Destination device pointer */
+    CUarray dstArray;           /**< Destination array reference */
+    void *reserved1;            /**< Must be NULL */
+    size_t dstPitch;            /**< Destination pitch (ignored when dst is array) */
+    size_t dstHeight;           /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
+
+    size_t WidthInBytes;        /**< Width of 3D memory copy in bytes */
+    size_t Height;              /**< Height of 3D memory copy */
+    size_t Depth;               /**< Depth of 3D memory copy */
+} CUDA_MEMCPY3D;
+
+/**
+ * 3D memory cross-context copy parameters
+ */
+typedef struct CUDA_MEMCPY3D_PEER_st {
+    size_t srcXInBytes;         /**< Source X in bytes */
+    size_t srcY;                /**< Source Y */
+    size_t srcZ;                /**< Source Z */
+    size_t srcLOD;              /**< Source LOD */
+    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+    const void *srcHost;        /**< Source host pointer */
+    CUdeviceptr srcDevice;      /**< Source device pointer */
+    CUarray srcArray;           /**< Source array reference */
+    CUcontext srcContext;       /**< Source context (ignored when srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */
+    size_t srcPitch;            /**< Source pitch (ignored when src is array) */
+    size_t srcHeight;           /**< Source height (ignored when src is array; may be 0 if Depth==1) */
+
+    size_t dstXInBytes;         /**< Destination X in bytes */
+    size_t dstY;                /**< Destination Y */
+    size_t dstZ;                /**< Destination Z */
+    size_t dstLOD;              /**< Destination LOD */
+    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+    void *dstHost;              /**< Destination host pointer */
+    CUdeviceptr dstDevice;      /**< Destination device pointer */
+    CUarray dstArray;           /**< Destination array reference */
+    CUcontext dstContext;       /**< Destination context (ignored when dstMemoryType is ::CU_MEMORYTYPE_ARRAY) */
+    size_t dstPitch;            /**< Destination pitch (ignored when dst is array) */
+    size_t dstHeight;           /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
+
+    size_t WidthInBytes;        /**< Width of 3D memory copy in bytes */
+    size_t Height;              /**< Height of 3D memory copy */
+    size_t Depth;               /**< Depth of 3D memory copy */
+} CUDA_MEMCPY3D_PEER;
+
+/**
+ * Array descriptor
+ */
+typedef struct CUDA_ARRAY_DESCRIPTOR_st
+{
+    size_t Width;             /**< Width of array */
+    size_t Height;            /**< Height of array */
+
+    CUarray_format Format;    /**< Array format */
+    unsigned int NumChannels; /**< Channels per array element */
+} CUDA_ARRAY_DESCRIPTOR;
+
+/**
+ * 3D array descriptor
+ */
+typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
+{
+    size_t Width;             /**< Width of 3D array */
+    size_t Height;            /**< Height of 3D array */
+    size_t Depth;             /**< Depth of 3D array */
+
+    CUarray_format Format;    /**< Array format */
+    unsigned int NumChannels; /**< Channels per array element */
+    unsigned int Flags;       /**< Flags */
+} CUDA_ARRAY3D_DESCRIPTOR;
+
+#endif /* __CUDA_API_VERSION >= 3020 */
+
+#if __CUDA_API_VERSION >= 5000
+
+/**
+ * CUDA Resource descriptor
+ */
+typedef struct CUDA_RESOURCE_DESC_st
+{
+    CUresourcetype resType;               /**< Resource type */
+
+    union {
+        struct {
+            CUarray hArray;               /**< CUDA array */
+        } array;
+        struct {
+            CUmipmappedArray hMipmappedArray; /**< CUDA mipmapped array */
+        } mipmap;
+        struct {
+            CUdeviceptr devPtr;           /**< Device pointer */
+            CUarray_format format;        /**< Array format */
+            unsigned int numChannels;     /**< Channels per array element */
+            size_t sizeInBytes;           /**< Size
in bytes */ + } linear; + struct { + CUdeviceptr devPtr; /**< Device pointer */ + CUarray_format format; /**< Array format */ + unsigned int numChannels; /**< Channels per array element */ + size_t width; /**< Width of the array in elements */ + size_t height; /**< Height of the array in elements */ + size_t pitchInBytes; /**< Pitch between two rows in bytes */ + } pitch2D; + struct { + int reserved[32]; + } reserved; + } res; + + unsigned int flags; /**< Flags (must be zero) */ +} CUDA_RESOURCE_DESC; + +/** + * Texture descriptor + */ +typedef struct CUDA_TEXTURE_DESC_st { + CUaddress_mode addressMode[3]; /**< Address modes */ + CUfilter_mode filterMode; /**< Filter mode */ + unsigned int flags; /**< Flags */ + unsigned int maxAnisotropy; /**< Maximum anisotropy ratio */ + CUfilter_mode mipmapFilterMode; /**< Mipmap filter mode */ + float mipmapLevelBias; /**< Mipmap level bias */ + float minMipmapLevelClamp; /**< Mipmap minimum level clamp */ + float maxMipmapLevelClamp; /**< Mipmap maximum level clamp */ + float borderColor[4]; /**< Border Color */ + int reserved[12]; +} CUDA_TEXTURE_DESC; + +/** + * Resource view format + */ +typedef enum CUresourceViewFormat_enum +{ + CU_RES_VIEW_FORMAT_NONE = 0x00, /**< No resource view format (use underlying resource format) */ + CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */ + CU_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */ + CU_RES_VIEW_FORMAT_UINT_4X8 = 0x03, /**< 4 channel unsigned 8-bit integers */ + CU_RES_VIEW_FORMAT_SINT_1X8 = 0x04, /**< 1 channel signed 8-bit integers */ + CU_RES_VIEW_FORMAT_SINT_2X8 = 0x05, /**< 2 channel signed 8-bit integers */ + CU_RES_VIEW_FORMAT_SINT_4X8 = 0x06, /**< 4 channel signed 8-bit integers */ + CU_RES_VIEW_FORMAT_UINT_1X16 = 0x07, /**< 1 channel unsigned 16-bit integers */ + CU_RES_VIEW_FORMAT_UINT_2X16 = 0x08, /**< 2 channel unsigned 16-bit integers */ + CU_RES_VIEW_FORMAT_UINT_4X16 = 0x09, /**< 4 channel unsigned 16-bit integers */ + CU_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, /**< 1 channel signed 16-bit integers */ + CU_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, /**< 2 channel signed 16-bit integers */ + CU_RES_VIEW_FORMAT_SINT_4X16 = 0x0c, /**< 4 channel signed 16-bit integers */ + CU_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, /**< 1 channel unsigned 32-bit integers */ + CU_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, /**< 2 channel unsigned 32-bit integers */ + CU_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, /**< 4 channel unsigned 32-bit integers */ + CU_RES_VIEW_FORMAT_SINT_1X32 = 0x10, /**< 1 channel signed 32-bit integers */ + CU_RES_VIEW_FORMAT_SINT_2X32 = 0x11, /**< 2 channel signed 32-bit integers */ + CU_RES_VIEW_FORMAT_SINT_4X32 = 0x12, /**< 4 channel signed 32-bit integers */ + CU_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13, /**< 1 channel 16-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14, /**< 2 channel 16-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15, /**< 4 channel 16-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16, /**< 1 channel 32-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17, /**< 2 channel 32-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18, /**< 4 channel 32-bit floating point */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19, /**< Block compressed 1 */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a, /**< Block compressed 2 */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b, /**< Block compressed 3 */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c, /**< Block compressed 4 unsigned */ + CU_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d, /**< Block 
compressed 4 signed */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e, /**< Block compressed 5 unsigned */ + CU_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f, /**< Block compressed 5 signed */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */ + CU_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21, /**< Block compressed 6 signed half-float */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22 /**< Block compressed 7 */ +} CUresourceViewFormat; + +/** + * Resource view descriptor + */ +typedef struct CUDA_RESOURCE_VIEW_DESC_st +{ + CUresourceViewFormat format; /**< Resource view format */ + size_t width; /**< Width of the resource view */ + size_t height; /**< Height of the resource view */ + size_t depth; /**< Depth of the resource view */ + unsigned int firstMipmapLevel; /**< First defined mipmap level */ + unsigned int lastMipmapLevel; /**< Last defined mipmap level */ + unsigned int firstLayer; /**< First layer index */ + unsigned int lastLayer; /**< Last layer index */ + unsigned int reserved[16]; +} CUDA_RESOURCE_VIEW_DESC; + +/** + * GPU Direct v3 tokens + */ +typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st { + unsigned long long p2pToken; + unsigned int vaSpaceToken; +} CUDA_POINTER_ATTRIBUTE_P2P_TOKENS; + +#endif /* __CUDA_API_VERSION >= 5000 */ + +#if __CUDA_API_VERSION >= 9000 + +/** + * Kernel launch parameters + */ +typedef struct CUDA_LAUNCH_PARAMS_st { + CUfunction function; /**< Kernel to launch */ + unsigned int gridDimX; /**< Width of grid in blocks */ + unsigned int gridDimY; /**< Height of grid in blocks */ + unsigned int gridDimZ; /**< Depth of grid in blocks */ + unsigned int blockDimX; /**< X dimension of each thread block */ + unsigned int blockDimY; /**< Y dimension of each thread block */ + unsigned int blockDimZ; /**< Z dimension of each thread block */ + unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ + CUstream hStream; /**< Stream identifier */ + void **kernelParams; /**< Array of pointers to kernel parameters */ +} CUDA_LAUNCH_PARAMS; + +#endif /* __CUDA_API_VERSION >= 9000 */ + +/** + * If set, each kernel launched as part of ::cuLaunchCooperativeKernelMultiDevice only + * waits for prior work in the stream corresponding to that GPU to complete before the + * kernel begins execution. + */ +#define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC 0x01 + +/** + * If set, any subsequent work pushed in a stream that participated in a call to + * ::cuLaunchCooperativeKernelMultiDevice will only wait for the kernel launched on + * the GPU corresponding to that stream to complete before it begins execution. + */ +#define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC 0x02 + +/** + * If set, the CUDA array is a collection of layers, where each layer is either a 1D + * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number + * of layers, not the depth of a 3D array. + */ +#define CUDA_ARRAY3D_LAYERED 0x01 + +/** + * Deprecated, use CUDA_ARRAY3D_LAYERED + */ +#define CUDA_ARRAY3D_2DARRAY 0x01 + +/** + * This flag must be set in order to bind a surface reference + * to the CUDA array + */ +#define CUDA_ARRAY3D_SURFACE_LDST 0x02 + +/** + * If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The + * width of such a CUDA array must be equal to its height, and Depth must be six. + * If ::CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps + * and Depth must be a multiple of six. 
+ */ +#define CUDA_ARRAY3D_CUBEMAP 0x04 + +/** + * This flag must be set in order to perform texture gather operations + * on a CUDA array. + */ +#define CUDA_ARRAY3D_TEXTURE_GATHER 0x08 + +/** + * This flag if set indicates that the CUDA + * array is a DEPTH_TEXTURE. +*/ +#define CUDA_ARRAY3D_DEPTH_TEXTURE 0x10 + +/** + * Override the texref format with a format inferred from the array. + * Flag for ::cuTexRefSetArray() + */ +#define CU_TRSA_OVERRIDE_FORMAT 0x01 + +/** + * Read the texture as integers rather than promoting the values to floats + * in the range [0,1]. + * Flag for ::cuTexRefSetFlags() + */ +#define CU_TRSF_READ_AS_INTEGER 0x01 + +/** + * Use normalized texture coordinates in the range [0,1) instead of [0,dim). + * Flag for ::cuTexRefSetFlags() + */ +#define CU_TRSF_NORMALIZED_COORDINATES 0x02 + +/** + * Perform sRGB->linear conversion during texture read. + * Flag for ::cuTexRefSetFlags() + */ +#define CU_TRSF_SRGB 0x10 + +/** + * End of array terminator for the \p extra parameter to + * ::cuLaunchKernel + */ +#define CU_LAUNCH_PARAM_END ((void*)0x00) + +/** + * Indicator that the next value in the \p extra parameter to + * ::cuLaunchKernel will be a pointer to a buffer containing all kernel + * parameters used for launching kernel \p f. This buffer needs to + * honor all alignment/padding requirements of the individual parameters. + * If ::CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the + * \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no + * effect. + */ +#define CU_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01) + +/** + * Indicator that the next value in the \p extra parameter to + * ::cuLaunchKernel will be a pointer to a size_t which contains the + * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER. + * It is required that ::CU_LAUNCH_PARAM_BUFFER_POINTER also be specified + * in the \p extra array if the value associated with + * ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero. + */ +#define CU_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02) + +/** + * For texture references loaded into the module, use default texunit from + * texture reference. + */ +#define CU_PARAM_TR_DEFAULT -1 + +/** + * Device that represents the CPU + */ +#define CU_DEVICE_CPU ((CUdevice)-1) + +/** + * Device that represents an invalid device + */ +#define CU_DEVICE_INVALID ((CUdevice)-2) + +/** @} */ /* END CUDA_TYPES */ + +#ifdef _WIN32 +#define CUDAAPI __stdcall +#else +#define CUDAAPI +#endif + +/** + * \defgroup CUDA_ERROR Error Handling + * + * ___MANBRIEF___ error handling functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the error handling functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Gets the string description of an error code + * + * Sets \p *pStr to the address of a NULL-terminated string description + * of the error code \p error. + * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE + * will be returned and \p *pStr will be set to the NULL address. + * + * \param error - Error code to convert to string + * \param pStr - Address of the string pointer. 
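+ *
+ * As an illustrative sketch (not part of the original header), given some
+ * ::CUresult value \c err:
+ * \code
+     const char *msg = NULL;
+     if (cuGetErrorString(err, &msg) == CUDA_SUCCESS)
+         printf("CUDA error: %s\n", msg);   /* msg points to a driver-owned string */
+ * \endcode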
+ * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::CUresult, + * ::cudaGetErrorString + */ +CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr); + +/** + * \brief Gets the string representation of an error code enum name + * + * Sets \p *pStr to the address of a NULL-terminated string representation + * of the name of the enum error code \p error. + * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE + * will be returned and \p *pStr will be set to the NULL address. + * + * \param error - Error code to convert to string + * \param pStr - Address of the string pointer. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::CUresult, + * ::cudaGetErrorName + */ +CUresult CUDAAPI cuGetErrorName(CUresult error, const char **pStr); + +/** @} */ /* END CUDA_ERROR */ + +/** + * \defgroup CUDA_INITIALIZE Initialization + * + * ___MANBRIEF___ initialization functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the initialization functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Initialize the CUDA driver API + * + * Initializes the driver API and must be called before any other function from + * the driver API. Currently, the \p Flags parameter must be 0. If ::cuInit() + * has not been called, any function from the driver API will return + * ::CUDA_ERROR_NOT_INITIALIZED. + * + * \param Flags - Initialization flag for CUDA. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + */ +CUresult CUDAAPI cuInit(unsigned int Flags); + +/** @} */ /* END CUDA_INITIALIZE */ + +/** + * \defgroup CUDA_VERSION Version Management + * + * ___MANBRIEF___ version management functions of the low-level CUDA driver + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the version management functions of the low-level + * CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns the CUDA driver version + * + * Returns in \p *driverVersion the version number of the installed CUDA + * driver. This function automatically returns ::CUDA_ERROR_INVALID_VALUE if + * the \p driverVersion argument is NULL. + * + * \param driverVersion - Returns the CUDA driver version + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cudaDriverGetVersion, + * ::cudaRuntimeGetVersion + */ +CUresult CUDAAPI cuDriverGetVersion(int *driverVersion); + +/** @} */ /* END CUDA_VERSION */ + +/** + * \defgroup CUDA_DEVICE Device Management + * + * ___MANBRIEF___ device management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the device management functions of the low-level + * CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns a handle to a compute device + * + * Returns in \p *device a device handle given an ordinal in the range [0, + * ::cuDeviceGetCount()-1]. 
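+ *
+ * For illustration (not part of the original header), a minimal sequence
+ * combining ::cuInit() with this call:
+ * \code
+     CUdevice dev;
+     cuInit(0);              /* must precede any other driver API call */
+     cuDeviceGet(&dev, 0);   /* handle for the first device */
+ * \endcode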
+ *
+ * \param device - Returned device handle
+ * \param ordinal - Device number to get handle for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceTotalMem
+ */
+CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal);
+
+/**
+ * \brief Returns the number of compute-capable devices
+ *
+ * Returns in \p *count the number of devices with compute capability greater
+ * than or equal to 2.0 that are available for execution. If there is no such
+ * device, ::cuDeviceGetCount() returns 0.
+ *
+ * \param count - Returned number of compute-capable devices
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem,
+ * ::cudaGetDeviceCount
+ */
+CUresult CUDAAPI cuDeviceGetCount(int *count);
+
+/**
+ * \brief Returns an identifier string for the device
+ *
+ * Returns an ASCII string identifying the device \p dev in the NULL-terminated
+ * string pointed to by \p name. \p len specifies the maximum length of the
+ * string that may be returned.
+ *
+ * \param name - Returned identifier string for the device
+ * \param len - Maximum length of string to store in \p name
+ * \param dev - Device to get identifier string for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem,
+ * ::cudaGetDeviceProperties
+ */
+CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);
+
+#if __CUDA_API_VERSION >= 3020
+/**
+ * \brief Returns the total amount of memory on the device
+ *
+ * Returns in \p *bytes the total amount of memory available on the device
+ * \p dev in bytes.
+ *
+ * \param bytes - Returned memory available on device in bytes
+ * \param dev - Device handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGet,
+ * ::cudaMemGetInfo
+ */
+CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev);
+#endif /* __CUDA_API_VERSION >= 3020 */
+
+/**
+ * \brief Returns information about the device
+ *
+ * Returns in \p *pi the integer value of the attribute \p attrib on device
+ * \p dev.
The supported attributes are: + * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads per + * block; + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block; + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block; + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block; + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid; + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid; + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid; + * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of + * shared memory available to a thread block in bytes; + * - ::CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device for + * __constant__ variables in a CUDA C kernel in bytes; + * - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads; + * - ::CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the + * memory copy functions that involve memory regions allocated through + * ::cuMemAllocPitch(); + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: Maximum 1D + * texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: Maximum width + * for a 1D texture bound to linear memory; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: Maximum + * mipmapped 1D texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: Maximum 2D + * texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: Maximum 2D + * texture height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: Maximum width + * for a 2D texture bound to linear memory; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: Maximum height + * for a 2D texture bound to linear memory; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: Maximum pitch + * in bytes for a 2D texture bound to linear memory; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: Maximum + * mipmapped 2D texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT: Maximum + * mipmapped 2D texture height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: Maximum 3D + * texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: Maximum 3D + * texture height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: Maximum 3D + * texture depth; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE: + * Alternate maximum 3D texture width, 0 if no alternate + * maximum 3D texture size is supported; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE: + * Alternate maximum 3D texture height, 0 if no alternate + * maximum 3D texture size is supported; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE: + * Alternate maximum 3D texture depth, 0 if no alternate + * maximum 3D texture size is supported; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH: + * Maximum cubemap texture width or height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH: + * Maximum 1D layered texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS: + * Maximum layers in a 1D layered texture; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH: + * Maximum 2D layered texture width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT: + * Maximum 2D layered texture height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS: + * Maximum layers in a 2D layered texture; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH: + 
* Maximum cubemap layered texture width or height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS: + * Maximum layers in a cubemap layered texture; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH: + * Maximum 1D surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH: + * Maximum 2D surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT: + * Maximum 2D surface height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH: + * Maximum 3D surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT: + * Maximum 3D surface height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH: + * Maximum 3D surface depth; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH: + * Maximum 1D layered surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS: + * Maximum layers in a 1D layered surface; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH: + * Maximum 2D layered surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT: + * Maximum 2D layered surface height; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS: + * Maximum layers in a 2D layered surface; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH: + * Maximum cubemap surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH: + * Maximum cubemap layered surface width; + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS: + * Maximum layers in a cubemap layered surface; + * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit + * registers available to a thread block; + * - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: The typical clock frequency in kilohertz; + * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: Alignment requirement; texture + * base addresses aligned to ::textureAlign bytes do not need an offset + * applied to texture fetches; + * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: Pitch alignment requirement + * for 2D texture references bound to pitched memory; + * - ::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy + * memory between host and device while executing a kernel, or 0 if not; + * - ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on + * the device; + * - ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit + * for kernels executed on the device, or 0 if not; + * - ::CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the + * memory subsystem, or 0 if not; + * - ::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host + * memory into the CUDA address space, or 0 if not; + * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently + * in. Available modes are as follows: + * - ::CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted and + * can have multiple CUDA contexts present at a single time. + * - ::CU_COMPUTEMODE_PROHIBITED: Compute-prohibited mode - Device is + * prohibited from creating new CUDA contexts. + * - ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS: Compute-exclusive-process mode - Device + * can have only one context used by a single process at a time. + * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS: 1 if the device supports + * executing multiple kernels within the same context simultaneously, or 0 if + * not. 
It is not guaranteed that multiple kernels will be resident
+ * on the device concurrently so this feature should not be relied upon for
+ * correctness;
+ * - ::CU_DEVICE_ATTRIBUTE_ECC_ENABLED: 1 if error correction is enabled on the
+ *   device, 0 if error correction is disabled or not supported by the device;
+ * - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device;
+ * - ::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: PCI device (also known as slot) identifier
+ *   of the device;
+ * - ::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID: PCI domain identifier of the device;
+ * - ::CU_DEVICE_ATTRIBUTE_TCC_DRIVER: 1 if the device is using a TCC driver. TCC
+ *   is only available on Tesla hardware running Windows Vista or later;
+ * - ::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: Peak memory clock frequency in kilohertz;
+ * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in bits;
+ * - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the device doesn't have L2 cache;
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident threads per multiprocessor;
+ * - ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: 1 if the device shares a unified address space with
+ *   the host, or 0 if not;
+ * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: Major compute capability version number;
+ * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: Minor compute capability version number;
+ * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: 1 if device supports caching globals
+ *   in L1 cache, 0 if caching globals in L1 cache is not supported by the device;
+ * - ::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: 1 if device supports caching locals
+ *   in L1 cache, 0 if caching locals in L1 cache is not supported by the device;
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: Maximum amount of
+ *   shared memory available to a multiprocessor in bytes; this amount is shared
+ *   by all thread blocks simultaneously resident on a multiprocessor;
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR: Maximum number of 32-bit
+ *   registers available to a multiprocessor; this number is shared by all thread
+ *   blocks simultaneously resident on a multiprocessor;
+ * - ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY: 1 if device supports allocating managed memory
+ *   on this system, 0 if allocating managed memory is not supported by the device on this system.
+ * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD: 1 if device is on a multi-GPU board, 0 if not.
+ * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID: Unique identifier for a group of devices
+ *   associated with the same board. Devices on the same multi-GPU board will share the same identifier.
+ * - ::CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED: 1 if the link between the device and the host
+ *   supports native atomic operations.
+ * - ::CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO: Ratio of single precision performance
+ *   (in floating-point operations per second) to double precision performance.
+ * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS: Device supports coherently accessing
+ *   pageable memory without calling cudaHostRegister on it.
+ * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS: Device can coherently access managed memory
+ *   concurrently with the CPU.
+ * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED: Device supports Compute Preemption.
+ * - ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM: Device can access host registered
+ *   memory at the same virtual address as the CPU.
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN: The maximum per block shared memory size
+ *   supported on this device. This is the maximum value that can be opted into when using the cuFuncSetAttribute() call.
+ *   For more details see ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES.
+ *
+ * \param pi - Returned device attribute value
+ * \param attrib - Device attribute to query
+ * \param dev - Device handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem,
+ * ::cudaDeviceGetAttribute,
+ * ::cudaGetDeviceProperties
+ */
+CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
+
+/** @} */ /* END CUDA_DEVICE */
+
+/**
+ * \defgroup CUDA_DEVICE_DEPRECATED Device Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated device management functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the device management functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns properties for a selected device
+ *
+ * \deprecated
+ *
+ * This function was deprecated as of CUDA 5.0 and replaced by ::cuDeviceGetAttribute().
+ *
+ * Returns in \p *prop the properties of device \p dev. The ::CUdevprop
+ * structure is defined as:
+ *
+ * \code
+     typedef struct CUdevprop_st {
+        int maxThreadsPerBlock;
+        int maxThreadsDim[3];
+        int maxGridSize[3];
+        int sharedMemPerBlock;
+        int totalConstantMemory;
+        int SIMDWidth;
+        int memPitch;
+        int regsPerBlock;
+        int clockRate;
+        int textureAlign;
+     } CUdevprop;
+ * \endcode
+ * where:
+ *
+ * - ::maxThreadsPerBlock is the maximum number of threads per block;
+ * - ::maxThreadsDim[3] is the maximum size of each dimension of a block;
+ * - ::maxGridSize[3] is the maximum size of each dimension of a grid;
+ * - ::sharedMemPerBlock is the total amount of shared memory available per
+ *   block in bytes;
+ * - ::totalConstantMemory is the total amount of constant memory available on
+ *   the device in bytes;
+ * - ::SIMDWidth is the warp size;
+ * - ::memPitch is the maximum pitch allowed by the memory copy functions that
+ *   involve memory regions allocated through ::cuMemAllocPitch();
+ * - ::regsPerBlock is the total number of registers available per block;
+ * - ::clockRate is the clock frequency in kilohertz;
+ * - ::textureAlign is the alignment requirement; texture base addresses that
+ *   are aligned to ::textureAlign bytes do not need an offset applied to
+ *   texture fetches.
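+ *
+ * Since this function is deprecated, the same information is better obtained
+ * through ::cuDeviceGetAttribute(); an illustrative sketch (not part of the
+ * original header):
+ * \code
+     int maxThreads = 0;
+     cuDeviceGetAttribute(&maxThreads, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
+ * \endcode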
+ *
+ * \param prop - Returned properties of device
+ * \param dev - Device to get properties for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem
+ */
+CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, CUdevice dev);
+
+/**
+ * \brief Returns the compute capability of the device
+ *
+ * \deprecated
+ *
+ * This function was deprecated as of CUDA 5.0 and its functionality superseded
+ * by ::cuDeviceGetAttribute().
+ *
+ * Returns in \p *major and \p *minor the major and minor revision numbers that
+ * define the compute capability of the device \p dev.
+ *
+ * \param major - Major revision number
+ * \param minor - Minor revision number
+ * \param dev - Device handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem
+ */
+CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
+
+/** @} */ /* END CUDA_DEVICE_DEPRECATED */
+
+/**
+ * \defgroup CUDA_PRIMARY_CTX Primary Context Management
+ *
+ * ___MANBRIEF___ primary context management functions of the low-level CUDA driver
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the primary context management functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * The primary context is unique per device and shared with the CUDA runtime API.
+ * These functions allow integration with other libraries using CUDA.
+ *
+ * @{
+ */
+
+#if __CUDA_API_VERSION >= 7000
+
+/**
+ * \brief Retain the primary context on the GPU
+ *
+ * Retains the primary context on the device, creating it if necessary,
+ * increasing its usage count. The caller must call
+ * ::cuDevicePrimaryCtxRelease() when done using the context.
+ * Unlike ::cuCtxCreate(), the newly created context is not pushed onto the stack.
+ *
+ * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
+ * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute()
+ * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the compute mode
+ * of the device.
+ * The nvidia-smi tool can be used to set the compute mode for
+ * devices. Documentation for nvidia-smi can be obtained by passing a
+ * -h option to it.
+ *
+ * Please note that the primary context always supports pinned allocations. Other
+ * flags can be specified by ::cuDevicePrimaryCtxSetFlags().
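+ *
+ * As an illustrative sketch (not part of the original header), the usual
+ * retain/release pairing looks like:
+ * \code
+     CUcontext ctx;
+     cuDevicePrimaryCtxRetain(&ctx, dev);   /* usage count +1; creates if needed */
+     /* ... use ctx ... */
+     cuDevicePrimaryCtxRelease(dev);        /* usage count -1 */
+ * \endcode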
+ *
+ * \param pctx - Returned context handle of the new context
+ * \param dev - Device for which primary context is requested
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuDevicePrimaryCtxRelease,
+ * ::cuDevicePrimaryCtxSetFlags,
+ * ::cuCtxCreate,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev);
+
+/**
+ * \brief Release the primary context on the GPU
+ *
+ * Releases the primary context on the device by decreasing the usage
+ * count by 1. If the usage count drops to 0, the primary context of device \p dev
+ * will be destroyed regardless of how many threads it is current to.
+ *
+ * Please note that unlike ::cuCtxDestroy() this method does not pop the context
+ * from the stack in any circumstances.
+ *
+ * \param dev - Device for which the primary context is released
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa ::cuDevicePrimaryCtxRetain,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev);
+
+/**
+ * \brief Set flags for the primary context
+ *
+ * Sets the flags for the primary context on the device, overwriting previously
+ * set ones. If the primary context is already created,
+ * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE is returned.
+ *
+ * The three LSBs of the \p flags parameter can be used to control how the OS
+ * thread, which owns the CUDA context at the time of an API call, interacts
+ * with the OS scheduler when waiting for results from the GPU. Only one of
+ * the scheduling flags can be set when creating a context.
+ *
+ * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
+ * results from the GPU. This can decrease latency when waiting for the GPU,
+ * but may lower the performance of CPU threads if they are performing work in
+ * parallel with the CUDA thread.
+ *
+ * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
+ * results from the GPU. This can increase latency when waiting for the GPU,
+ * but can increase the performance of CPU threads performing work in parallel
+ * with the GPU.
+ *
+ * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work.
+ *
+ * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work.
+ * Deprecated: This flag was deprecated as of CUDA 4.0 and was
+ * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
+ *
+ * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
+ * uses a heuristic based on the number of active CUDA contexts in the
+ * process \e C and the number of logical processors in the system \e P. If
+ * \e C > \e P, then CUDA will yield to other OS threads when waiting for
+ * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
+ * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
+ * However, on low power devices like Tegra, it always defaults to
+ * ::CU_CTX_SCHED_BLOCKING_SYNC.
+ *
+ * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
+ * after resizing local memory for a kernel. This can prevent thrashing by
+ * local memory allocations when launching many kernels with high local
+ * memory usage at the cost of potentially increased memory usage.
+ *
+ * \param dev - Device for which the primary context flags are set
+ * \param flags - New flags for the device
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE
+ * \notefnerr
+ *
+ * \sa ::cuDevicePrimaryCtxRetain,
+ * ::cuDevicePrimaryCtxGetState,
+ * ::cuCtxCreate,
+ * ::cuCtxGetFlags,
+ * ::cudaSetDeviceFlags
+ */
+CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags);
+
+/**
+ * \brief Get the state of the primary context
+ *
+ * Returns in \p *flags the flags for the primary context of \p dev, and in
+ * \p *active whether it is active. See ::cuDevicePrimaryCtxSetFlags for flag
+ * values.
+ *
+ * \param dev - Device to get primary context flags for
+ * \param flags - Pointer to store flags
+ * \param active - Pointer to store context state; 0 = inactive, 1 = active
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDevicePrimaryCtxSetFlags,
+ * ::cuCtxGetFlags,
+ * ::cudaGetDeviceFlags
+ */
+CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags, int *active);
+
+/**
+ * \brief Destroy all allocations and reset all state on the primary context
+ *
+ * Explicitly destroys and cleans up all resources associated with the current
+ * device in the current process.
+ *
+ * Note that it is the responsibility of the calling function to ensure that no
+ * other module in the process is using the device any more. For that reason
+ * it is recommended to use ::cuDevicePrimaryCtxRelease() in most cases.
+ * However, it is safe for other modules to call ::cuDevicePrimaryCtxRelease()
+ * even after resetting the device.
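+ *
+ * For illustration (not part of the original header), the flag and state
+ * functions documented above can be combined as follows:
+ * \code
+     unsigned int flags;
+     int active;
+     cuDevicePrimaryCtxSetFlags(dev, CU_CTX_SCHED_BLOCKING_SYNC);
+     cuDevicePrimaryCtxGetState(dev, &flags, &active);
+ * \endcode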
+ *
+ * \param dev - Device for which primary context is destroyed
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE
+ * \notefnerr
+ *
+ * \sa ::cuDevicePrimaryCtxRetain,
+ * ::cuDevicePrimaryCtxRelease,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cudaDeviceReset
+ */
+CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev);
+
+#endif /* __CUDA_API_VERSION >= 7000 */
+
+/** @} */ /* END CUDA_PRIMARY_CTX */
+
+
+/**
+ * \defgroup CUDA_CTX Context Management
+ *
+ * ___MANBRIEF___ context management functions of the low-level CUDA driver
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the context management functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+#if __CUDA_API_VERSION >= 3020
+/**
+ * \brief Create a CUDA context
+ *
+ * Creates a new CUDA context and associates it with the calling thread. The
+ * \p flags parameter is described below. The context is created with a usage
+ * count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy()
+ * when done using the context. If a context is already current to the thread,
+ * it is supplanted by the newly created context and may be restored by a subsequent
+ * call to ::cuCtxPopCurrent().
+ *
+ * The three LSBs of the \p flags parameter can be used to control how the OS
+ * thread, which owns the CUDA context at the time of an API call, interacts
+ * with the OS scheduler when waiting for results from the GPU. Only one of
+ * the scheduling flags can be set when creating a context.
+ *
+ * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
+ * results from the GPU. This can decrease latency when waiting for the GPU,
+ * but may lower the performance of CPU threads if they are performing work in
+ * parallel with the CUDA thread.
+ *
+ * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
+ * results from the GPU. This can increase latency when waiting for the GPU,
+ * but can increase the performance of CPU threads performing work in parallel
+ * with the GPU.
+ *
+ * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work.
+ *
+ * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work.
+ * Deprecated: This flag was deprecated as of CUDA 4.0 and was
+ * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
+ *
+ * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
+ * uses a heuristic based on the number of active CUDA contexts in the
+ * process \e C and the number of logical processors in the system \e P. If
+ * \e C > \e P, then CUDA will yield to other OS threads when waiting for
+ * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
+ * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
+ * However, on low power devices like Tegra, it always defaults to
+ * ::CU_CTX_SCHED_BLOCKING_SYNC.
+ *
+ * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations.
+ * This flag must be set in order to allocate pinned host memory that is
+ * accessible to the GPU.
+ *
+ * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
+ * after resizing local memory for a kernel. This can prevent thrashing by
+ * local memory allocations when launching many kernels with high local
+ * memory usage at the cost of potentially increased memory usage.
+ *
+ * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
+ * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute()
+ * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the
+ * compute mode of the device. The nvidia-smi tool can be used to set
+ * the compute mode for devices.
+ * Documentation for nvidia-smi can be obtained by passing a
+ * -h option to it.
+ *
+ * \param pctx - Returned context handle of the new context
+ * \param flags - Context creation flags
+ * \param dev - Device to create context on
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
+#endif /* __CUDA_API_VERSION >= 3020 */
+
+#if __CUDA_API_VERSION >= 4000
+/**
+ * \brief Destroy a CUDA context
+ *
+ * Destroys the CUDA context specified by \p ctx. The context \p ctx will be
+ * destroyed regardless of how many threads it is current to.
+ * It is the responsibility of the calling function to ensure that no API
+ * calls are issued using \p ctx while ::cuCtxDestroy() is executing.
+ *
+ * If \p ctx is current to the calling thread then \p ctx will also be
+ * popped from the current thread's context stack (as though ::cuCtxPopCurrent()
+ * were called). If \p ctx is current to other threads, then \p ctx will
+ * remain current to those threads, and attempting to access \p ctx from
+ * those threads will result in the error ::CUDA_ERROR_CONTEXT_IS_DESTROYED.
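+ *
+ * As an illustrative sketch (not part of the original header):
+ * \code
+     CUcontext ctx;
+     cuCtxCreate(&ctx, 0, dev);   /* flags = 0 selects ::CU_CTX_SCHED_AUTO */
+     /* ... work within ctx ... */
+     cuCtxDestroy(ctx);
+ * \endcode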
+ * + * \param ctx - Context to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); +#endif /* __CUDA_API_VERSION >= 4000 */ + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Pushes a context on the current CPU thread + * + * Pushes the given context \p ctx onto the CPU thread's stack of current + * contexts. The specified context becomes the CPU thread's current context, so + * all CUDA functions that operate on the current context are affected. + * + * The previous current context may be made current again by calling + * ::cuCtxDestroy() or ::cuCtxPopCurrent(). + * + * \param ctx - Context to push + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); + +/** + * \brief Pops the current CUDA context from the current CPU thread. + * + * Pops the current CUDA context from the CPU thread and passes back the + * old context handle in \p *pctx. That context may then be made current + * to a different CPU thread by calling ::cuCtxPushCurrent(). + * + * If a context was current to the CPU thread before ::cuCtxCreate() or + * ::cuCtxPushCurrent() was called, this function makes that context current to + * the CPU thread again. + * + * \param pctx - Returned new context handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx); + +/** + * \brief Binds the specified CUDA context to the calling CPU thread + * + * Binds the specified CUDA context to the calling CPU thread. + * If \p ctx is NULL then the CUDA context previously bound to the + * calling CPU thread is unbound and ::CUDA_SUCCESS is returned. + * + * If there exists a CUDA context stack on the calling CPU thread, this + * will replace the top of that stack with \p ctx. + * If \p ctx is NULL then this will be equivalent to popping the top + * of the calling CPU thread's CUDA context stack (or a no-op if the + * calling CPU thread's CUDA context stack is empty). 
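+ *
+ * For illustration (not part of the original header), the stack-based calls
+ * documented above are typically paired:
+ * \code
+     CUcontext popped;
+     cuCtxPushCurrent(ctx);      /* ctx becomes current on this thread */
+     /* ... */
+     cuCtxPopCurrent(&popped);   /* pops ctx; the prior context is current again */
+ * \endcode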
+ * + * \param ctx - Context to bind to the calling CPU thread + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa + * ::cuCtxGetCurrent, + * ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cudaSetDevice + */ +CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx); + +/** + * \brief Returns the CUDA context bound to the calling CPU thread. + * + * Returns in \p *pctx the CUDA context bound to the calling CPU thread. + * If no context is bound to the calling CPU thread then \p *pctx is + * set to NULL and ::CUDA_SUCCESS is returned. + * + * \param pctx - Returned context handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * \notefnerr + * + * \sa + * ::cuCtxSetCurrent, + * ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cudaGetDevice + */ +CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx); +#endif /* __CUDA_API_VERSION >= 4000 */ + +/** + * \brief Returns the device ID for the current context + * + * Returns in \p *device the ordinal of the current context's device. + * + * \param device - Returned device ID for the current context + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cudaGetDevice + */ +CUresult CUDAAPI cuCtxGetDevice(CUdevice *device); + +#if __CUDA_API_VERSION >= 7000 +/** + * \brief Returns the flags for the current context + * + * Returns in \p *flags the flags of the current context. See ::cuCtxCreate + * for flag values. + * + * \param flags - Pointer to store flags of current context + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetCurrent, + * ::cuCtxGetDevice + * ::cuCtxGetLimit, + * ::cuCtxGetSharedMemConfig, + * ::cuCtxGetStreamPriorityRange, + * ::cudaGetDeviceFlags + */ +CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags); +#endif /* __CUDA_API_VERSION >= 7000 */ + +/** + * \brief Block for a context's tasks to complete + * + * Blocks until the device has completed all preceding requested tasks. + * ::cuCtxSynchronize() returns an error if one of the preceding tasks failed. + * If the context was created with the ::CU_CTX_SCHED_BLOCKING_SYNC flag, the + * CPU thread will block until the GPU context has finished its work. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cudaDeviceSynchronize + */ +CUresult CUDAAPI cuCtxSynchronize(void); + +/** + * \brief Set resource limits + * + * Setting \p limit to \p value is a request by the application to update + * the current limit maintained by the context. 
The driver is free to + * modify the requested value to meet h/w requirements (this could be + * clamping to minimum or maximum values, rounding up to nearest element + * size, etc). The application can use ::cuCtxGetLimit() to find out exactly + * what the limit has been set to. + * + * Setting each ::CUlimit has its own specific restrictions, so each is + * discussed here. + * + * - ::CU_LIMIT_STACK_SIZE controls the stack size in bytes of each GPU thread. + * + * - ::CU_LIMIT_PRINTF_FIFO_SIZE controls the size in bytes of the FIFO used + * by the ::printf() device system call. Setting ::CU_LIMIT_PRINTF_FIFO_SIZE + * must be performed before launching any kernel that uses the ::printf() + * device system call, otherwise ::CUDA_ERROR_INVALID_VALUE will be returned. + * + * - ::CU_LIMIT_MALLOC_HEAP_SIZE controls the size in bytes of the heap used + * by the ::malloc() and ::free() device system calls. Setting + * ::CU_LIMIT_MALLOC_HEAP_SIZE must be performed before launching any kernel + * that uses the ::malloc() or ::free() device system calls, otherwise + * ::CUDA_ERROR_INVALID_VALUE will be returned. + * + * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH controls the maximum nesting depth of + * a grid at which a thread can safely call ::cudaDeviceSynchronize(). Setting + * this limit must be performed before any launch of a kernel that uses the + * device runtime and calls ::cudaDeviceSynchronize() above the default sync + * depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail + * with error code ::cudaErrorSyncDepthExceeded if the limitation is + * violated. This limit can be set smaller than the default or up to the maximum + * launch depth of 24. When setting this limit, keep in mind that additional + * levels of sync depth require the driver to reserve large amounts of device + * memory which can no longer be used for user allocations. If these + * reservations of device memory fail, ::cuCtxSetLimit will return + * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value. + * This limit is only applicable to devices of compute capability 3.5 and + * higher. Attempting to set this limit on devices of compute capability less + * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being + * returned. + * + * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT controls the maximum number of + * outstanding device runtime launches that can be made from the current + * context. A grid is outstanding from the point of launch up until the grid + * is known to have been completed. Device runtime launches which violate + * this limitation fail and return ::cudaErrorLaunchPendingCountExceeded when + * ::cudaGetLastError() is called after launch. If more pending launches than + * the default (2048 launches) are needed for a module using the device + * runtime, this limit can be increased. Keep in mind that being able to + * sustain additional pending launches will require the driver to reserve + * larger amounts of device memory upfront which can no longer be used for + * allocations. If these reservations fail, ::cuCtxSetLimit will return + * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value. + * This limit is only applicable to devices of compute capability 3.5 and + * higher. Attempting to set this limit on devices of compute capability less + * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being + * returned.
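+ *
+ * An illustrative sketch (the 4 MiB figure is arbitrary); since the driver
+ * may adjust the requested value, read the limit back to see what was set:
+ * \code
+ size_t fifo_size;
+ cuCtxSetLimit(CU_LIMIT_PRINTF_FIFO_SIZE, 4 * 1024 * 1024); // request 4 MiB
+ cuCtxGetLimit(&fifo_size, CU_LIMIT_PRINTF_FIFO_SIZE);      // actual value
+ * \endcode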
+ * + * \param limit - Limit to set + * \param value - Size of limit + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNSUPPORTED_LIMIT, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSynchronize, + * ::cudaDeviceSetLimit + */ +CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value); + +/** + * \brief Returns resource limits + * + * Returns in \p *pvalue the current size of \p limit. The supported + * ::CUlimit values are: + * - ::CU_LIMIT_STACK_SIZE: stack size in bytes of each GPU thread. + * - ::CU_LIMIT_PRINTF_FIFO_SIZE: size in bytes of the FIFO used by the + * ::printf() device system call. + * - ::CU_LIMIT_MALLOC_HEAP_SIZE: size in bytes of the heap used by the + * ::malloc() and ::free() device system calls. + * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH: maximum grid depth at which a thread + * can issue the device runtime call ::cudaDeviceSynchronize() to wait on + * child grid launches to complete. + * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT: maximum number of outstanding + * device runtime launches that can be made from this context. + * + * \param limit - Limit to query + * \param pvalue - Returned size of limit + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNSUPPORTED_LIMIT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cudaDeviceGetLimit + */ +CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit); + +/** + * \brief Returns the preferred cache configuration for the current context. + * + * On devices where the L1 cache and shared memory use the same hardware + * resources, this function returns through \p pconfig the preferred cache configuration + * for the current context. This is only a preference. The driver will use + * the requested configuration if possible, but it is free to choose a different + * configuration if required to execute functions. + * + * This will return a \p pconfig of ::CU_FUNC_CACHE_PREFER_NONE on devices + * where the size of the L1 cache and shared memory are fixed. 
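+ *
+ * For example:
+ * \code
+ CUfunc_cache cfg;
+ cuCtxGetCacheConfig(&cfg); // query the context-wide preference
+ * \endcode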
+ * + * The supported cache configurations are: + * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) + * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache + * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory + * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory + * + * \param pconfig - Returned cache configuration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cuFuncSetCacheConfig, + * ::cudaDeviceGetCacheConfig + */ +CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig); + +/** + * \brief Sets the preferred cache configuration for the current context. + * + * On devices where the L1 cache and shared memory use the same hardware + * resources, this sets through \p config the preferred cache configuration for + * the current context. This is only a preference. The driver will use + * the requested configuration if possible, but it is free to choose a different + * configuration if required to execute the function. Any function preference + * set via ::cuFuncSetCacheConfig() will be preferred over this context-wide + * setting. Setting the context-wide cache configuration to + * ::CU_FUNC_CACHE_PREFER_NONE will cause subsequent kernel launches to prefer + * to not change the cache configuration unless required to launch the kernel. + * + * This setting does nothing on devices where the size of the L1 cache and + * shared memory are fixed. + * + * Launching a kernel with a different preference than the most recent + * preference setting may insert a device-side synchronization point. + * + * The supported cache configurations are: + * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) + * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache + * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory + * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory + * + * \param config - Requested cache configuration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cuFuncSetCacheConfig, + * ::cudaDeviceSetCacheConfig + */ +CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config); + +#if __CUDA_API_VERSION >= 4020 +/** + * \brief Returns the current shared memory configuration for the current context. + * + * This function will return in \p pConfig the current size of shared memory banks + * in the current context. On devices with configurable shared memory banks, + * ::cuCtxSetSharedMemConfig can be used to change this setting, so that all + * subsequent kernel launches will by default use the new bank size. 
When + * ::cuCtxGetSharedMemConfig is called on devices without configurable shared + * memory, it will return the fixed bank size of the hardware. + * + * The returned bank configurations can be either: + * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: shared memory bank width is + * four bytes. + * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: shared memory bank width is + * eight bytes. + * + * \param pConfig - returned shared memory configuration + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cuCtxGetSharedMemConfig, + * ::cuFuncSetCacheConfig, + * ::cudaDeviceGetSharedMemConfig + */ +CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig); + +/** + * \brief Sets the shared memory configuration for the current context. + * + * On devices with configurable shared memory banks, this function will set + * the context's shared memory bank size which is used for subsequent kernel + * launches. + * + * Changing the shared memory configuration between launches may insert a device + * side synchronization point between those launches. + * + * Changing the shared memory bank size will not increase shared memory usage + * or affect occupancy of kernels, but may have major effects on performance. + * Larger bank sizes will allow for greater potential bandwidth to shared memory, + * but will change what kinds of accesses to shared memory will result in bank + * conflicts. + * + * This function will do nothing on devices with fixed shared memory bank size. + * + * The supported bank configurations are: + * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: set bank width to the default initial + * setting (currently, four bytes). + * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to + * be natively four bytes. + * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to + * be natively eight bytes. + * + * \param config - requested shared memory configuration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cuCtxGetSharedMemConfig, + * ::cuFuncSetCacheConfig, + * ::cudaDeviceSetSharedMemConfig + */ +CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config); +#endif + +/** + * \brief Gets the context's API version. + * + * Returns a version number in \p version corresponding to the capabilities of + * the context (e.g. 3010 or 3020), which library developers can use to direct + * callers to a specific API version. If \p ctx is NULL, returns the API version + * used to create the currently bound context. + * + * Note that new API versions are only introduced when context capabilities are + * changed that break binary compatibility, so the API version and driver version + * may be different.
For example, it is valid for the API version to be 3020 while + * the driver version is 4020. + * + * \param ctx - Context to check + * \param version - Pointer to version + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version); + +/** + * \brief Returns numerical values that correspond to the least and + * greatest stream priorities. + * + * Returns in \p *leastPriority and \p *greatestPriority the numerical values that correspond + * to the least and greatest stream priorities respectively. Stream priorities + * follow a convention where lower numbers imply greater priorities. The range of + * meaningful stream priorities is given by [\p *greatestPriority, \p *leastPriority]. + * If the user attempts to create a stream with a priority value that is + * outside the meaningful range as specified by this API, the priority is + * automatically clamped down or up to either \p *leastPriority or \p *greatestPriority + * respectively. See ::cuStreamCreateWithPriority for details on creating a + * priority stream. + * A NULL may be passed in for \p *leastPriority or \p *greatestPriority if the value + * is not desired. + * + * This function will return '0' in both \p *leastPriority and \p *greatestPriority if + * the current context's device does not support stream priorities + * (see ::cuDeviceGetAttribute). + * + * \param leastPriority - Pointer to an int in which the numerical value for least + * stream priority is returned + * \param greatestPriority - Pointer to an int in which the numerical value for greatest + * stream priority is returned + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + * \sa ::cuStreamCreateWithPriority, + * ::cuStreamGetPriority, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cudaDeviceGetStreamPriorityRange + */ +CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority, int *greatestPriority); + +/** @} */ /* END CUDA_CTX */ + +/** + * \defgroup CUDA_CTX_DEPRECATED Context Management [DEPRECATED] + * + * ___MANBRIEF___ deprecated context management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the deprecated context management functions of the low-level + * CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Increment a context's usage-count + * + * \deprecated + * + * Note that this function is deprecated and should not be used. + * + * Increments the usage count of the context and passes back a context handle + * in \p *pctx that must be passed to ::cuCtxDetach() when the application is + * done with the context. ::cuCtxAttach() fails if there is no context current + * to the thread. + * + * Currently, the \p flags parameter must be 0. 
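+ *
+ * For reference only, since this API is deprecated, the historical pattern
+ * paired each attach with a matching detach:
+ * \code
+ CUcontext ctx;
+ cuCtxAttach(&ctx, 0); // flags must be 0
+ // ... use the context ...
+ cuCtxDetach(ctx);
+ * \endcode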
+ * + * \param pctx - Returned context handle of the current context + * \param flags - Context attach flags (must be 0) + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxDetach, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxAttach(CUcontext *pctx, unsigned int flags); + +/** + * \brief Decrement a context's usage-count + * + * \deprecated + * + * Note that this function is deprecated and should not be used. + * + * Decrements the usage count of the context \p ctx, and destroys the context + * if the usage count goes to 0. The context must be a handle that was passed + * back by ::cuCtxCreate() or ::cuCtxAttach(), and must be current to the + * calling thread. + * + * \param ctx - Context to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxDetach(CUcontext ctx); + +/** @} */ /* END CUDA_CTX_DEPRECATED */ + + +/** + * \defgroup CUDA_MODULE Module Management + * + * ___MANBRIEF___ module management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the module management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Loads a compute module + * + * Takes a filename \p fname and loads the corresponding module \p module into + * the current context. The CUDA driver API does not attempt to lazily + * allocate the resources needed by a module; if the memory for functions and + * data (constant and global) needed by the module cannot be allocated, + * ::cuModuleLoad() fails. The file should be a \e cubin file as output by + * \b nvcc, or a \e PTX file either as output by \b nvcc or handwritten, or + * a \e fatbin file as output by \b nvcc from toolchain 4.0 or later. + * + * \param module - Returned module + * \param fname - Filename of module to load + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_NOT_FOUND, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_FILE_NOT_FOUND, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU, + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, + * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname); + +/** + * \brief Load a module's data + * + * Takes a pointer \p image and loads the corresponding module \p module into + * the current context. 
The pointer may be obtained by mapping a \e cubin or + * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file + * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin + * object into the executable resources and using operating system calls such + * as Windows \c FindResource() to obtain the pointer. + * + * \param module - Returned module + * \param image - Module data to load + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU, + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, + * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image); + +/** + * \brief Load a module's data with options + * + * Takes a pointer \p image and loads the corresponding module \p module into + * the current context. The pointer may be obtained by mapping a \e cubin or + * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file + * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin + * object into the executable resources and using operating system calls such + * as Windows \c FindResource() to obtain the pointer. Options are passed as + * an array via \p options and any corresponding parameters are passed in + * \p optionValues. The number of total options is supplied via \p numOptions. + * Any outputs will be returned via \p optionValues. + * + * \param module - Returned module + * \param image - Module data to load + * \param numOptions - Number of options + * \param options - Options for JIT + * \param optionValues - Option values for JIT + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU, + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, + * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); + +/** + * \brief Load a module's data + * + * Takes a pointer \p fatCubin and loads the corresponding module \p module + * into the current context. The pointer represents a fat binary object, + * which is a collection of different \e cubin and/or \e PTX files, all + * representing the same device code, but compiled and optimized for different + * architectures. + * + * Prior to CUDA 4.0, there was no documented API for constructing and using + * fat binary objects by programmers. Starting with CUDA 4.0, fat binary + * objects can be constructed by providing the -fatbin option to \b nvcc. + * More information can be found in the \b nvcc document. 
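+ *
+ * An illustrative sketch, assuming the build embeds a fat binary under the
+ * hypothetical symbol \c myprog_fatbin (e.g. produced with "nvcc -fatbin"):
+ * \code
+ extern const unsigned char myprog_fatbin[];
+ CUmodule mod;
+ cuModuleLoadFatBinary(&mod, myprog_fatbin);
+ * \endcode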
+ * + * \param module - Returned module + * \param fatCubin - Fat binary to load + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_NOT_FOUND, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU, + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, + * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin); + +/** + * \brief Unloads a module + * + * Unloads a module \p hmod from the current context. + * + * \param hmod - Module to unload + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary + */ +CUresult CUDAAPI cuModuleUnload(CUmodule hmod); + +/** + * \brief Returns a function handle + * + * Returns in \p *hfunc the handle of the function of name \p name located in + * module \p hmod. If no function of that name exists, ::cuModuleGetFunction() + * returns ::CUDA_ERROR_NOT_FOUND. + * + * \param hfunc - Returned function handle + * \param hmod - Module to retrieve function from + * \param name - Name of function to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name); + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Returns a global pointer from a module + * + * Returns in \p *dptr and \p *bytes the base pointer and size of the + * global of name \p name located in module \p hmod. If no variable of that name + * exists, ::cuModuleGetGlobal() returns ::CUDA_ERROR_NOT_FOUND. Both + * parameters \p dptr and \p bytes are optional. If one of them is + * NULL, it is ignored. 
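+ *
+ * A short sketch of the typical pattern (assumes \p hmod defines a
+ * hypothetical device variable named "scale"):
+ * \code
+ CUdeviceptr dptr;
+ size_t bytes;
+ cuModuleGetGlobal(&dptr, &bytes, hmod, "scale");
+ float value = 2.0f;
+ cuMemcpyHtoD(dptr, &value, sizeof(value)); // overwrite the device global
+ * \endcode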
+ * + * \param dptr - Returned global device pointer + * \param bytes - Returned global size in bytes + * \param hmod - Module to retrieve global from + * \param name - Name of global to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload, + * ::cudaGetSymbolAddress, + * ::cudaGetSymbolSize + */ +CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name); +#endif /* __CUDA_API_VERSION >= 3020 */ + +/** + * \brief Returns a handle to a texture reference + * + * Returns in \p *pTexRef the handle of the texture reference of name \p name + * in the module \p hmod. If no texture reference of that name exists, + * ::cuModuleGetTexRef() returns ::CUDA_ERROR_NOT_FOUND. This texture reference + * handle should not be destroyed, since it will be destroyed when the module + * is unloaded. + * + * \param pTexRef - Returned texture reference + * \param hmod - Module to retrieve texture reference from + * \param name - Name of texture reference to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetSurfRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload, + * ::cudaGetTextureReference + */ +CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name); + +/** + * \brief Returns a handle to a surface reference + * + * Returns in \p *pSurfRef the handle of the surface reference of name \p name + * in the module \p hmod. If no surface reference of that name exists, + * ::cuModuleGetSurfRef() returns ::CUDA_ERROR_NOT_FOUND. + * + * \param pSurfRef - Returned surface reference + * \param hmod - Module to retrieve surface reference from + * \param name - Name of surface reference to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload, + * ::cudaGetSurfaceReference + */ +CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name); + +#if __CUDA_API_VERSION >= 5050 + +/** + * \brief Creates a pending JIT linker invocation. + * + * If the call is successful, the caller owns the returned CUlinkState, which + * should eventually be destroyed with ::cuLinkDestroy. The + * device code machine size (32 or 64 bit) will match the calling application. + * + * Both linker and compiler options may be specified. Compiler options will + * be applied to inputs to this linker action which must be compiled from PTX. + * The options ::CU_JIT_WALL_TIME, + * ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, and ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES + * will accumulate data until the CUlinkState is destroyed. 
+ * + * \p optionValues must remain valid for the life of the CUlinkState if output + * options are used. No other references to inputs are maintained after this + * call returns. + * + * \param numOptions Size of options arrays + * \param options Array of linker and compiler options + * \param optionValues Array of option values, each cast to void * + * \param stateOut On success, this will contain a CUlinkState to specify + * and complete this action + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + * \notefnerr + * + * \sa ::cuLinkAddData, + * ::cuLinkAddFile, + * ::cuLinkComplete, + * ::cuLinkDestroy + */ +CUresult CUDAAPI +cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut); + +/** + * \brief Add an input to a pending linker invocation + * + * Ownership of \p data is retained by the caller. No reference is retained to any + * inputs after this call returns. + * + * This method accepts only compiler options, which are used if the data must + * be compiled from PTX, and does not accept any of + * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, + * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. + * + * \param state A pending linker action. + * \param type The type of the input data. + * \param data The input data. PTX must be NULL-terminated. + * \param size The length of the input data. + * \param name An optional name for this input in log messages. + * \param numOptions Size of options. + * \param options Options to be applied only for this input (overrides options from ::cuLinkCreate). + * \param optionValues Array of option values, each cast to void *. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU + * + * \sa ::cuLinkCreate, + * ::cuLinkAddFile, + * ::cuLinkComplete, + * ::cuLinkDestroy + */ +CUresult CUDAAPI +cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, + unsigned int numOptions, CUjit_option *options, void **optionValues); + +/** + * \brief Add a file input to a pending linker invocation + * + * No reference is retained to any inputs after this call returns. + * + * This method accepts only compiler options, which are used if the input + * must be compiled from PTX, and does not accept any of + * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, + * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. + * + * This method is equivalent to invoking ::cuLinkAddData on the contents + * of the file. 
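+ *
+ * For example, a minimal end-to-end link action ("kernels.ptx" is a
+ * hypothetical input file; error checking omitted for brevity):
+ * \code
+ CUlinkState state;
+ void *cubin;
+ size_t size;
+ CUmodule mod;
+ cuLinkCreate(0, NULL, NULL, &state);
+ cuLinkAddFile(state, CU_JIT_INPUT_PTX, "kernels.ptx", 0, NULL, NULL);
+ cuLinkComplete(state, &cubin, &size); // cubin is owned by state
+ cuModuleLoadData(&mod, cubin);        // load before destroying state
+ cuLinkDestroy(state);
+ * \endcode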
+ * + * \param state A pending linker action + * \param type The type of the input data + * \param path Path to the input file + * \param numOptions Size of options + * \param options Options to be applied only for this input (overrides options from ::cuLinkCreate) + * \param optionValues Array of option values, each cast to void * + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_FILE_NOT_FOUND + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU + * + * \sa ::cuLinkCreate, + * ::cuLinkAddData, + * ::cuLinkComplete, + * ::cuLinkDestroy + */ +CUresult CUDAAPI +cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path, + unsigned int numOptions, CUjit_option *options, void **optionValues); + +/** + * \brief Complete a pending linker invocation + * + * Completes the pending linker action and returns the cubin image for the linked + * device code, which can be used with ::cuModuleLoadData. The cubin is owned by + * \p state, so it should be loaded before \p state is destroyed via ::cuLinkDestroy. + * This call does not destroy \p state. + * + * \param state A pending linker invocation + * \param cubinOut On success, this will point to the output image + * \param sizeOut Optional parameter to receive the size of the generated image + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa ::cuLinkCreate, + * ::cuLinkAddData, + * ::cuLinkAddFile, + * ::cuLinkDestroy, + * ::cuModuleLoadData + */ +CUresult CUDAAPI +cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut); + +/** + * \brief Destroys state for a JIT linker invocation. + * + * \param state State object for the linker invocation + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE + * + * \sa ::cuLinkCreate + */ +CUresult CUDAAPI +cuLinkDestroy(CUlinkState state); + +#endif /* __CUDA_API_VERSION >= 5050 */ + +/** @} */ /* END CUDA_MODULE */ + + +/** + * \defgroup CUDA_MEM Memory Management + * + * ___MANBRIEF___ memory management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the memory management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Gets free and total memory + * + * Returns in \p *free and \p *total respectively, the free and total amount of + * memory available for allocation by the CUDA context, in bytes. 
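+ *
+ * For example (the 10% threshold is arbitrary, for illustration):
+ * \code
+ size_t free_mem, total_mem;
+ cuMemGetInfo(&free_mem, &total_mem);
+ if (free_mem < total_mem / 10) {
+     // less than 10% free: consider reducing the working set
+ }
+ * \endcode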
+ * + * \param free - Returned free memory in bytes + * \param total - Returned total memory in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemGetInfo + */ +CUresult CUDAAPI cuMemGetInfo(size_t *free, size_t *total); + +/** + * \brief Allocates device memory + * + * Allocates \p bytesize bytes of linear memory on the device and returns in + * \p *dptr a pointer to the allocated memory. The allocated memory is suitably + * aligned for any kind of variable. The memory is not cleared. If \p bytesize + * is 0, ::cuMemAlloc() returns ::CUDA_ERROR_INVALID_VALUE. + * + * \param dptr - Returned device pointer + * \param bytesize - Requested allocation size in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMalloc + */ +CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize); + +/** + * \brief Allocates pitched device memory + * + * Allocates at least \p WidthInBytes * \p Height bytes of linear memory on + * the device and returns in \p *dptr a pointer to the allocated memory. The + * function may pad the allocation to ensure that corresponding pointers in + * any given row will continue to meet the alignment requirements for + * coalescing as the address is updated from row to row. \p ElementSizeBytes + * specifies the size of the largest reads and writes that will be performed + * on the memory range. \p ElementSizeBytes may be 4, 8 or 16 (since coalesced + * memory transactions are not possible on other data sizes). If + * \p ElementSizeBytes is smaller than the actual read/write size of a kernel, + * the kernel will run correctly, but possibly at reduced speed. The pitch + * returned in \p *pPitch by ::cuMemAllocPitch() is the width in bytes of the + * allocation. 
The intended usage of pitch is as a separate parameter of the + * allocation, used to compute addresses within the 2D array. Given the row + * and column of an array element of type \b T, the address is computed as: + * \code + T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column; + * \endcode + * + * The pitch returned by ::cuMemAllocPitch() is guaranteed to work with + * ::cuMemcpy2D() under all circumstances. For allocations of 2D arrays, it is + * recommended that programmers consider performing pitch allocations using + * ::cuMemAllocPitch(). Due to alignment restrictions in the hardware, this is + * especially true if the application will be performing 2D memory copies + * between different regions of device memory (whether linear memory or CUDA + * arrays). + * + * The byte alignment of the pitch returned by ::cuMemAllocPitch() is guaranteed + * to match or exceed the alignment requirement for texture binding with + * ::cuTexRefSetAddress2D(). + * + * \param dptr - Returned device pointer + * \param pPitch - Returned pitch of allocation in bytes + * \param WidthInBytes - Requested allocation width in bytes + * \param Height - Requested allocation height in rows + * \param ElementSizeBytes - Size of largest reads/writes for range + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMallocPitch + */ +CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes); + +/** + * \brief Frees device memory + * + * Frees the memory space pointed to by \p dptr, which must have been returned + * by a previous call to ::cuMemAlloc() or ::cuMemAllocPitch(). 
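+ *
+ * For example, pairing an allocation with its release:
+ * \code
+ CUdeviceptr d_buf;
+ cuMemAlloc(&d_buf, 1024 * sizeof(float)); // size chosen arbitrarily
+ // ... use d_buf ...
+ cuMemFree(d_buf);
+ * \endcode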
+ * + * \param dptr - Pointer to memory to free + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaFree + */ +CUresult CUDAAPI cuMemFree(CUdeviceptr dptr); + +/** + * \brief Get information on memory allocations + * + * Returns the base address in \p *pbase and size in \p *psize of the + * allocation by ::cuMemAlloc() or ::cuMemAllocPitch() that contains the input + * pointer \p dptr. Both parameters \p pbase and \p psize are optional. If one + * of them is NULL, it is ignored. + * + * \param pbase - Returned base address + * \param psize - Returned size of device memory allocation + * \param dptr - Device pointer to query + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_NOT_FOUND, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 + */ +CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr); + +/** + * \brief Allocates page-locked host memory + * + * Allocates \p bytesize bytes of host memory that is page-locked and + * accessible to the device. The driver tracks the virtual memory ranges + * allocated with this function and automatically accelerates calls to + * functions such as ::cuMemcpy(). Since the memory can be accessed directly by + * the device, it can be read or written with much higher bandwidth than + * pageable memory obtained with functions such as ::malloc(). Allocating + * excessive amounts of memory with ::cuMemAllocHost() may degrade system + * performance, since it reduces the amount of memory available to the system + * for paging. As a result, this function is best used sparingly to allocate + * staging areas for data exchange between host and device. 
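+ *
+ * For example (a hypothetical 1 MiB staging buffer):
+ * \code
+ float *h_staging;
+ cuMemAllocHost((void **)&h_staging, 1 << 20);
+ // ... fill h_staging, then copy it to the device at full bandwidth ...
+ cuMemFreeHost(h_staging);
+ * \endcode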
+ * + * Note all host memory allocated using ::cuMemHostAlloc() will automatically + * be immediately accessible to all contexts on all devices which support unified + * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). + * The device pointer that may be used to access this host memory from those + * contexts is always equal to the returned host pointer \p *pp. + * See \ref CUDA_UNIFIED for additional details. + * + * \param pp - Returned host pointer to page-locked memory + * \param bytesize - Requested allocation size in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMallocHost + */ +CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize); +#endif /* __CUDA_API_VERSION >= 3020 */ + +/** + * \brief Frees page-locked host memory + * + * Frees the memory space pointed to by \p p, which must have been returned by + * a previous call to ::cuMemAllocHost(). + * + * \param p - Pointer to memory to free + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaFreeHost + */ +CUresult CUDAAPI cuMemFreeHost(void *p); + +/** + * \brief Allocates page-locked host memory + * + * Allocates \p bytesize bytes of host memory that is page-locked and accessible + * to the device. The driver tracks the virtual memory ranges allocated with + * this function and automatically accelerates calls to functions such as + * ::cuMemcpyHtoD(). Since the memory can be accessed directly by the device, + * it can be read or written with much higher bandwidth than pageable memory + * obtained with functions such as ::malloc(). Allocating excessive amounts of + * pinned memory may degrade system performance, since it reduces the amount + * of memory available to the system for paging. 
As a result, this function is + * best used sparingly to allocate staging areas for data exchange between + * host and device. + * + * The \p Flags parameter enables different options to be specified that + * affect the allocation, as follows. + * + * - ::CU_MEMHOSTALLOC_PORTABLE: The memory returned by this call will be + * considered as pinned memory by all CUDA contexts, not just the one that + * performed the allocation. + * + * - ::CU_MEMHOSTALLOC_DEVICEMAP: Maps the allocation into the CUDA address + * space. The device pointer to the memory may be obtained by calling + * ::cuMemHostGetDevicePointer(). + * + * - ::CU_MEMHOSTALLOC_WRITECOMBINED: Allocates the memory as write-combined + * (WC). WC memory can be transferred across the PCI Express bus more + * quickly on some system configurations, but cannot be read efficiently by + * most CPUs. WC memory is a good option for buffers that will be written by + * the CPU and read by the GPU via mapped pinned memory or host->device + * transfers. + * + * All of these flags are orthogonal to one another: a developer may allocate + * memory that is portable, mapped and/or write-combined with no restrictions. + * + * The CUDA context must have been created with the ::CU_CTX_MAP_HOST flag in + * order for the ::CU_MEMHOSTALLOC_DEVICEMAP flag to have any effect. + * + * The ::CU_MEMHOSTALLOC_DEVICEMAP flag may be specified on CUDA contexts for + * devices that do not support mapped pinned memory. The failure is deferred + * to ::cuMemHostGetDevicePointer() because the memory may be mapped into + * other CUDA contexts via the ::CU_MEMHOSTALLOC_PORTABLE flag. + * + * The memory allocated by this function must be freed with ::cuMemFreeHost(). + * + * Note all host memory allocated using ::cuMemHostAlloc() will automatically + * be immediately accessible to all contexts on all devices which support unified + * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). + * Unless the flag ::CU_MEMHOSTALLOC_WRITECOMBINED is specified, the device pointer + * that may be used to access this host memory from those contexts is always equal + * to the returned host pointer \p *pp. If the flag ::CU_MEMHOSTALLOC_WRITECOMBINED + * is specified, then the function ::cuMemHostGetDevicePointer() must be used + * to query the device pointer, even if the context supports unified addressing. + * See \ref CUDA_UNIFIED for additional details. 
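+ *
+ * An illustrative allocation of portable, mapped pinned memory (the 1 MiB
+ * size is arbitrary; flags are combined with bitwise OR):
+ * \code
+ void *h_buf;
+ cuMemHostAlloc(&h_buf, 1 << 20,
+                CU_MEMHOSTALLOC_PORTABLE | CU_MEMHOSTALLOC_DEVICEMAP);
+ // ... stage data in h_buf ...
+ cuMemFreeHost(h_buf);
+ * \endcode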
+ * + * \param pp - Returned host pointer to page-locked memory + * \param bytesize - Requested allocation size in bytes + * \param Flags - Flags for allocation request + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaHostAlloc + */ +CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags); + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Passes back device pointer of mapped pinned memory + * + * Passes back the device pointer \p pdptr corresponding to the mapped, pinned + * host buffer \p p allocated by ::cuMemHostAlloc. + * + * ::cuMemHostGetDevicePointer() will fail if the ::CU_MEMHOSTALLOC_DEVICEMAP + * flag was not specified at the time the memory was allocated, or if the + * function is called on a GPU that does not support mapped pinned memory. + * + * For devices that have a non-zero value for the device attribute + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory + * can also be accessed from the device using the host pointer \p p. + * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not + * match the original host pointer \p p and depends on the devices visible to the + * application. If all devices visible to the application have a non-zero value for the + * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer() + * will match the original pointer \p p. If any device visible to the application + * has a zero value for the device attribute, the device pointer returned by + * ::cuMemHostGetDevicePointer() will not match the original host pointer \p p, + * but it will be suitable for use on all devices provided Unified Virtual Addressing + * is enabled. In such systems, it is valid to access the memory using either pointer + * on devices that have a non-zero value for the device attribute. Note however that + * such devices should access the memory using only one of the two pointers and not both. + * + * \p Flags provides for future releases. For now, it must be set to 0.
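+ *
+ * A minimal sketch (assumes \c h_buf was allocated by ::cuMemHostAlloc()
+ * with the ::CU_MEMHOSTALLOC_DEVICEMAP flag):
+ * \code
+ CUdeviceptr d_ptr;
+ cuMemHostGetDevicePointer(&d_ptr, h_buf, 0); // Flags must be 0
+ // d_ptr may now be passed to a kernel as a device pointer
+ * \endcode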
+ * + * \param pdptr - Returned device pointer + * \param p - Host pointer + * \param Flags - Options (must be 0) + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaHostGetDevicePointer + */ +CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags); +#endif /* __CUDA_API_VERSION >= 3020 */ + +/** + * \brief Passes back flags that were used for a pinned allocation + * + * Passes back the flags \p pFlags that were specified when allocating + * the pinned host buffer \p p allocated by ::cuMemHostAlloc. + * + * ::cuMemHostGetFlags() will fail if the pointer does not reside in + * an allocation performed by ::cuMemAllocHost() or ::cuMemHostAlloc(). + * + * \param pFlags - Returned flags word + * \param p - Host pointer + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuMemAllocHost, + * ::cuMemHostAlloc, + * ::cudaHostGetFlags + */ +CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p); + +#if __CUDA_API_VERSION >= 6000 + +/** + * \brief Allocates memory that will be automatically managed by the Unified Memory system + * + * Allocates \p bytesize bytes of managed memory on the device and returns in + * \p *dptr a pointer to the allocated memory. If the device doesn't support + * allocating managed memory, ::CUDA_ERROR_NOT_SUPPORTED is returned. Support + * for managed memory can be queried using the device attribute + * ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY. The allocated memory is suitably + * aligned for any kind of variable. The memory is not cleared. If \p bytesize + * is 0, ::cuMemAllocManaged returns ::CUDA_ERROR_INVALID_VALUE. The pointer + * is valid on the CPU and on all GPUs in the system that support managed memory. + * All accesses to this pointer must obey the Unified Memory programming model. + * + * \p flags specifies the default stream association for this allocation. + * \p flags must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST. If + * ::CU_MEM_ATTACH_GLOBAL is specified, then this memory is accessible from + * any stream on any device. If ::CU_MEM_ATTACH_HOST is specified, then the + * allocation should not be accessed from devices that have a zero value for the + * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS; an explicit call to + * ::cuStreamAttachMemAsync will be required to enable access on such devices. + * + * If the association is later changed via ::cuStreamAttachMemAsync to + * a single stream, the default association as specified during ::cuMemAllocManaged + * is restored when that stream is destroyed.
For __managed__ variables, the + * default association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a + * stream is an asynchronous operation, and as a result, the change to default + * association won't happen until all work in the stream has completed. + * + * Memory allocated with ::cuMemAllocManaged should be released with ::cuMemFree. + * + * Device memory oversubscription is possible for GPUs that have a non-zero value for the + * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Managed memory on + * such GPUs may be evicted from device memory to host memory at any time by the Unified + * Memory driver in order to make room for other allocations. + * + * In a multi-GPU system where all GPUs have a non-zero value for the device attribute + * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, managed memory may not be populated when this + * API returns and instead may be populated on access. In such systems, managed memory can + * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to + * maintain data locality and prevent excessive page faults to the extent possible. The application + * can also guide the driver about memory usage patterns via ::cuMemAdvise. The application + * can also explicitly migrate memory to a desired processor's memory via + * ::cuMemPrefetchAsync. + * + * In a multi-GPU system where all of the GPUs have a zero value for the device attribute + * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS and all the GPUs have peer-to-peer support + * with each other, the physical storage for managed memory is created on the GPU which is active + * at the time ::cuMemAllocManaged is called. All other GPUs will reference the data at reduced + * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate + * memory among such GPUs. + * + * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and + * where the value of the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS + * is zero for at least one of those GPUs, the location chosen for physical storage of managed + * memory is system-dependent. + * - On Linux, the location chosen will be device memory as long as the current set of active + * contexts are on devices that either have peer-to-peer support with each other or have a + * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + * If there is an active context on a GPU that does not have a non-zero value for that device + * attribute and it does not have peer-to-peer support with the other devices that have active + * contexts on them, then the location for physical storage will be 'zero-copy' or host memory. + * Note that this means that managed memory that is located in device memory is migrated to + * host memory if a new context is created on a GPU that doesn't have a non-zero value for + * the device attribute and does not support peer-to-peer with at least one of the other devices + * that has an active context. This in turn implies that context creation may fail if there is + * insufficient host memory to migrate all managed allocations. + * - On Windows, the physical storage is always created in 'zero-copy' or host memory. + * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these + * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to + * restrict CUDA to only use those GPUs that have peer-to-peer support. 
+ * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a
+ * non-zero value to force the driver to always use device memory for physical storage.
+ * When this environment variable is set to a non-zero value, all contexts created in
+ * that process on devices that support managed memory have to be peer-to-peer compatible
+ * with each other. Context creation will fail if a context is created on a device that
+ * supports managed memory and is not peer-to-peer compatible with any of the other
+ * managed memory supporting devices on which contexts were previously created, even if
+ * those contexts have been destroyed. These environment variables are described
+ * in the CUDA programming guide under the "CUDA environment variables" section.
+ * - On ARM, managed memory is not available on discrete GPUs with Drive PX-2.
+ *
+ * \param dptr - Returned device pointer
+ * \param bytesize - Requested allocation size in bytes
+ * \param flags - Must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cuDeviceGetAttribute, ::cuStreamAttachMemAsync,
+ * ::cudaMallocManaged
+ */
+CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize, unsigned int flags);
+
+#endif /* __CUDA_API_VERSION >= 6000 */
+
+#if __CUDA_API_VERSION >= 4010
+
+/**
+ * \brief Returns a handle to a compute device
+ *
+ * Returns in \p *dev a device handle given a PCI bus ID string.
+ *
+ * \param dev - Returned device handle
+ *
+ * \param pciBusId - String in one of the following forms:
+ * [domain]:[bus]:[device].[function]
+ * [domain]:[bus]:[device]
+ * [bus]:[device].[function]
+ * where \p domain, \p bus, \p device, and \p function are all hexadecimal values
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGet,
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetPCIBusId,
+ * ::cudaDeviceGetByPCIBusId
+ */
+CUresult CUDAAPI cuDeviceGetByPCIBusId(CUdevice *dev, const char *pciBusId);
+
+/**
+ * \brief Returns a PCI Bus Id string for the device
+ *
+ * Returns an ASCII string identifying the device \p dev in the NULL-terminated
+ * string pointed to by \p pciBusId. \p len specifies the maximum length of the
+ * string that may be returned.
+ *
+ * \param pciBusId - Returned identifier string for the device in the following format
+ * [domain]:[bus]:[device].[function]
+ * where \p domain, \p bus, \p device, and \p function are all hexadecimal values.
+ * pciBusId should be large enough to store 13 characters including the NULL-terminator.
+ *
+ * \param len - Maximum length of string to store in \p pciBusId
+ *
+ * \param dev - Device to get identifier string for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGet,
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetByPCIBusId,
+ * ::cudaDeviceGetPCIBusId
+ */
+CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev);
+
+/**
+ * \brief Gets an interprocess handle for a previously allocated event
+ *
+ * Takes as input a previously allocated event. This event must have been
+ * created with the ::CU_EVENT_INTERPROCESS and ::CU_EVENT_DISABLE_TIMING
+ * flags set. This opaque handle may be copied into other processes and
+ * opened with ::cuIpcOpenEventHandle to allow efficient hardware
+ * synchronization between GPU work in different processes.
+ *
+ * After the event has been opened in the importing process,
+ * ::cuEventRecord, ::cuEventSynchronize, ::cuStreamWaitEvent and
+ * ::cuEventQuery may be used in either process. Performing operations
+ * on the imported event after the exported event has been freed
+ * with ::cuEventDestroy will result in undefined behavior.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux operating systems.
+ *
+ * \param pHandle - Pointer to a user allocated CUipcEventHandle
+ * in which to return the opaque event handle
+ * \param event - Event allocated with ::CU_EVENT_INTERPROCESS and
+ * ::CU_EVENT_DISABLE_TIMING flags.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_MAP_FAILED
+ *
+ * \sa
+ * ::cuEventCreate,
+ * ::cuEventDestroy,
+ * ::cuEventSynchronize,
+ * ::cuEventQuery,
+ * ::cuStreamWaitEvent,
+ * ::cuIpcOpenEventHandle,
+ * ::cuIpcGetMemHandle,
+ * ::cuIpcOpenMemHandle,
+ * ::cuIpcCloseMemHandle,
+ * ::cudaIpcGetEventHandle
+ */
+CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event);
+
+/**
+ * \brief Opens an interprocess event handle for use in the current process
+ *
+ * Opens an interprocess event handle exported from another process with
+ * ::cuIpcGetEventHandle. This function returns a ::CUevent that behaves like
+ * a locally created event with the ::CU_EVENT_DISABLE_TIMING flag specified.
+ * This event must be freed with ::cuEventDestroy.
+ *
+ * Performing operations on the imported event after the exported event has
+ * been freed with ::cuEventDestroy will result in undefined behavior.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux operating systems.
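+ *
+ * \par
+ * An illustrative sketch of the importing side (assumes the exporting
+ * process already called ::cuIpcGetEventHandle and shipped the handle over,
+ * e.g. via a pipe, and that a context is current; error checking elided):
+ * \code
+ CUipcEventHandle ipcHandle; // received from the exporting process
+ CUevent event;
+ cuIpcOpenEventHandle(&event, ipcHandle); // behaves like CU_EVENT_DISABLE_TIMING
+ // ... use cuStreamWaitEvent()/cuEventQuery() as with a local event ...
+ cuEventDestroy(event); // the imported event must be freed locally
+ * \endcode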
+ * + * \param phEvent - Returns the imported event + * \param handle - Interprocess handle to open + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_MAP_FAILED, + * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, + * ::CUDA_ERROR_INVALID_HANDLE + * + * \sa + * ::cuEventCreate, + * ::cuEventDestroy, + * ::cuEventSynchronize, + * ::cuEventQuery, + * ::cuStreamWaitEvent, + * ::cuIpcGetEventHandle, + * ::cuIpcGetMemHandle, + * ::cuIpcOpenMemHandle, + * ::cuIpcCloseMemHandle, + * ::cudaIpcOpenEventHandle + */ +CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle); + +/** + * \brief Gets an interprocess memory handle for an existing device memory + * allocation + * + * Takes a pointer to the base of an existing device memory allocation created + * with ::cuMemAlloc and exports it for use in another process. This is a + * lightweight operation and may be called multiple times on an allocation + * without adverse effects. + * + * If a region of memory is freed with ::cuMemFree and a subsequent call + * to ::cuMemAlloc returns memory with the same device address, + * ::cuIpcGetMemHandle will return a unique handle for the + * new memory. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux operating systems. + * + * \param pHandle - Pointer to user allocated ::CUipcMemHandle to return + * the handle in. + * \param dptr - Base pointer to previously allocated device memory + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_MAP_FAILED, + * + * \sa + * ::cuMemAlloc, + * ::cuMemFree, + * ::cuIpcGetEventHandle, + * ::cuIpcOpenEventHandle, + * ::cuIpcOpenMemHandle, + * ::cuIpcCloseMemHandle, + * ::cudaIpcGetMemHandle + */ +CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr); + +/** + * \brief Opens an interprocess memory handle exported from another process + * and returns a device pointer usable in the local process. + * + * Maps memory exported from another process with ::cuIpcGetMemHandle into + * the current device address space. For contexts on different devices + * ::cuIpcOpenMemHandle can attempt to enable peer access between the + * devices as if the user called ::cuCtxEnablePeerAccess. This behavior is + * controlled by the ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS flag. + * ::cuDeviceCanAccessPeer can determine if a mapping is possible. + * + * Contexts that may open ::CUipcMemHandles are restricted in the following way. + * ::CUipcMemHandles from each ::CUdevice in a given process may only be opened + * by one ::CUcontext per ::CUdevice per other process. + * + * Memory returned from ::cuIpcOpenMemHandle must be freed with + * ::cuIpcCloseMemHandle. + * + * Calling ::cuMemFree on an exported memory region before calling + * ::cuIpcCloseMemHandle in the importing context will result in undefined + * behavior. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux operating systems. + * + * \param pdptr - Returned device pointer + * \param handle - ::CUipcMemHandle to open + * \param Flags - Flags for this operation. Must be specified as ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_MAP_FAILED, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_TOO_MANY_PEERS + * + * \note No guarantees are made about the address returned in \p *pdptr. 
+ * In particular, multiple processes may not receive the same address for the same \p handle.
+ *
+ * \sa
+ * ::cuMemAlloc,
+ * ::cuMemFree,
+ * ::cuIpcGetEventHandle,
+ * ::cuIpcOpenEventHandle,
+ * ::cuIpcGetMemHandle,
+ * ::cuIpcCloseMemHandle,
+ * ::cuCtxEnablePeerAccess,
+ * ::cuDeviceCanAccessPeer,
+ * ::cudaIpcOpenMemHandle
+ */
+CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags);
+
+/**
+ * \brief Close memory mapped with ::cuIpcOpenMemHandle
+ *
+ * Unmaps memory returned by ::cuIpcOpenMemHandle. The original allocation
+ * in the exporting process as well as imported mappings in other processes
+ * will be unaffected.
+ *
+ * Any resources used to enable peer access will be freed if this is the
+ * last mapping using them.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux operating systems.
+ *
+ * \param dptr - Device pointer returned by ::cuIpcOpenMemHandle
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_MAP_FAILED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ *
+ * \sa
+ * ::cuMemAlloc,
+ * ::cuMemFree,
+ * ::cuIpcGetEventHandle,
+ * ::cuIpcOpenEventHandle,
+ * ::cuIpcGetMemHandle,
+ * ::cuIpcOpenMemHandle,
+ * ::cudaIpcCloseMemHandle
+ */
+CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr);
+
+#endif /* __CUDA_API_VERSION >= 4010 */
+
+#if __CUDA_API_VERSION >= 4000
+/**
+ * \brief Registers an existing host memory range for use by CUDA
+ *
+ * Page-locks the memory range specified by \p p and \p bytesize and maps it
+ * for the device(s) as specified by \p Flags. This memory range also is added
+ * to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate
+ * calls to functions such as ::cuMemcpyHtoD(). Since the memory can be accessed
+ * directly by the device, it can be read or written with much higher bandwidth
+ * than pageable memory that has not been registered. Page-locking excessive
+ * amounts of memory may degrade system performance, since it reduces the amount
+ * of memory available to the system for paging. As a result, this function is
+ * best used sparingly to register staging areas for data exchange between
+ * host and device.
+ *
+ * This function has limited support on Mac OS X. OS 10.7 or higher is required.
+ *
+ * The \p Flags parameter enables different options to be specified that
+ * affect the allocation, as follows.
+ *
+ * - ::CU_MEMHOSTREGISTER_PORTABLE: The memory returned by this call will be
+ * considered as pinned memory by all CUDA contexts, not just the one that
+ * performed the allocation.
+ *
+ * - ::CU_MEMHOSTREGISTER_DEVICEMAP: Maps the allocation into the CUDA address
+ * space. The device pointer to the memory may be obtained by calling
+ * ::cuMemHostGetDevicePointer().
+ *
+ * - ::CU_MEMHOSTREGISTER_IOMEMORY: The pointer is treated as pointing to some
+ * I/O memory space, e.g. the PCI Express resource of a 3rd party device.
+ *
+ * All of these flags are orthogonal to one another: a developer may page-lock
+ * memory that is portable or mapped with no restrictions.
+ *
+ * The CUDA context must have been created with the ::CU_CTX_MAP_HOST flag in
+ * order for the ::CU_MEMHOSTREGISTER_DEVICEMAP flag to have any effect.
+ *
+ * The ::CU_MEMHOSTREGISTER_DEVICEMAP flag may be specified on CUDA contexts for
+ * devices that do not support mapped pinned memory.
The failure is deferred
+ * to ::cuMemHostGetDevicePointer() because the memory may be mapped into
+ * other CUDA contexts via the ::CU_MEMHOSTREGISTER_PORTABLE flag.
+ *
+ * For devices that have a non-zero value for the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory
+ * can also be accessed from the device using the host pointer \p p.
+ * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not
+ * match the original host pointer \p p and depends on the devices visible to the
+ * application. If all devices visible to the application have a non-zero value for the
+ * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer()
+ * will match the original pointer \p p. If any device visible to the application
+ * has a zero value for the device attribute, the device pointer returned by
+ * ::cuMemHostGetDevicePointer() will not match the original host pointer \p p,
+ * but it will be suitable for use on all devices provided Unified Virtual Addressing
+ * is enabled. In such systems, it is valid to access the memory using either pointer
+ * on devices that have a non-zero value for the device attribute. Note however that
+ * such devices should access the memory using only one of the two pointers and not both.
+ *
+ * The memory page-locked by this function must be unregistered with
+ * ::cuMemHostUnregister().
+ *
+ * \param p - Host pointer to memory to page-lock
+ * \param bytesize - Size in bytes of the address range to page-lock
+ * \param Flags - Flags for allocation request
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
+ * ::CUDA_ERROR_NOT_PERMITTED,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMemHostUnregister,
+ * ::cuMemHostGetFlags,
+ * ::cuMemHostGetDevicePointer,
+ * ::cudaHostRegister
+ */
+CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
+
+/**
+ * \brief Unregisters a memory range that was registered with cuMemHostRegister.
+ *
+ * Unmaps the memory range whose base address is specified by \p p, and makes
+ * it pageable again.
+ *
+ * The base address must be the same one specified to ::cuMemHostRegister().
+ *
+ * \param p - Host pointer to memory to unregister
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED,
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMemHostRegister,
+ * ::cudaHostUnregister
+ */
+CUresult CUDAAPI cuMemHostUnregister(void *p);
+
+/**
+ * \brief Copies memory
+ *
+ * Copies data between two pointers.
+ * \p dst and \p src are base pointers of the destination and source, respectively.
+ * \p ByteCount specifies the number of bytes to copy.
+ * Note that this function infers the type of the transfer (host to host, host to
+ * device, device to device, or device to host) from the pointer values. This
+ * function is only allowed in contexts which support unified addressing.
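+ *
+ * \par
+ * An illustrative sketch (assumes a context with unified addressing is
+ * current; under UVA a host pointer may be passed by casting it to
+ * ::CUdeviceptr; error checking elided):
+ * \code
+ float hostBuf[256];
+ CUdeviceptr devBuf;
+ cuMemAlloc(&devBuf, sizeof(hostBuf));
+ // Host-to-device: the transfer kind is inferred from the pointer values
+ cuMemcpy(devBuf, (CUdeviceptr)(uintptr_t)hostBuf, sizeof(hostBuf));
+ cuMemFree(devBuf);
+ * \endcode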
+ *
+ * \param dst - Destination unified virtual address space pointer
+ * \param src - Source unified virtual address space pointer
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpy,
+ * ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol
+ */
+CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
+
+/**
+ * \brief Copies device memory between two contexts
+ *
+ * Copies from device memory in one context to device memory in another
+ * context. \p dstDevice is the base device pointer of the destination memory
+ * and \p dstContext is the destination context. \p srcDevice is the base
+ * device pointer of the source memory and \p srcContext is the source context.
+ * \p ByteCount specifies the number of bytes to copy.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param dstContext - Destination context
+ * \param srcDevice - Source device pointer
+ * \param srcContext - Source context
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuMemcpyDtoD, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync,
+ * ::cuMemcpy3DPeerAsync,
+ * ::cudaMemcpyPeer
+ */
+CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount);
+
+#endif /* __CUDA_API_VERSION >= 4000 */
+
+#if __CUDA_API_VERSION >= 3020
+/**
+ * \brief Copies memory from Host to Device
+ *
+ * Copies from host memory to device memory. \p dstDevice and \p srcHost are
+ * the base addresses of the destination and source, respectively. \p ByteCount
+ * specifies the number of bytes to copy.
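+ *
+ * \par
+ * An illustrative sketch (assumes a current context; error checking elided):
+ * \code
+ float h_data[1024];                   // source on the host
+ CUdeviceptr d_data;
+ cuMemAlloc(&d_data, sizeof(h_data)); // destination on the device
+ cuMemcpyHtoD(d_data, h_data, sizeof(h_data));
+ * \endcode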
+ * + * \param dstDevice - Destination device pointer + * \param srcHost - Source host pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy, + * ::cudaMemcpyToSymbol + */ +CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); + +/** + * \brief Copies memory from Device to Host + * + * Copies from device to host memory. \p dstHost and \p srcDevice specify the + * base pointers of the destination and source, respectively. \p ByteCount + * specifies the number of bytes to copy. + * + * \param dstHost - Destination host pointer + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy, + * ::cudaMemcpyFromSymbol + */ +CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); + +/** + * \brief Copies memory from Device to Device + * + * Copies from device memory to device memory. \p dstDevice and \p srcDevice + * are the base pointers of the destination and source, respectively. + * \p ByteCount specifies the number of bytes to copy. 
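+ *
+ * \par
+ * An illustrative sketch (assumes a current context; error checking elided):
+ * \code
+ CUdeviceptr d_src, d_dst;
+ cuMemAlloc(&d_src, 4096);
+ cuMemAlloc(&d_dst, 4096);
+ cuMemcpyDtoD(d_dst, d_src, 4096); // both pointers live in device memory
+ * \endcode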
+ * + * \param dstDevice - Destination device pointer + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy, + * ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol + */ +CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); + +/** + * \brief Copies memory from Device to Array + * + * Copies from device memory to a 1D CUDA array. \p dstArray and \p dstOffset + * specify the CUDA array handle and starting index of the destination data. + * \p srcDevice specifies the base pointer of the source. \p ByteCount + * specifies the number of bytes to copy. + * + * \param dstArray - Destination array + * \param dstOffset - Offset in bytes of destination array + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpyToArray + */ +CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount); + +/** + * \brief Copies memory from Array to Device + * + * Copies from one 1D CUDA array to device memory. \p dstDevice specifies the + * base pointer of the destination and must be naturally aligned with the CUDA + * array elements. \p srcArray and \p srcOffset specify the CUDA array handle + * and the offset in bytes into the array where the copy is to begin. + * \p ByteCount specifies the number of bytes to copy and must be evenly + * divisible by the array element size. 
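+ *
+ * \par
+ * An illustrative sketch (assumes a current context and a 1D float array
+ * created with ::cuArrayCreate; error checking elided):
+ * \code
+ CUDA_ARRAY_DESCRIPTOR desc = {0};
+ desc.Format = CU_AD_FORMAT_FLOAT;
+ desc.NumChannels = 1;
+ desc.Width = 1024;                   // 1D array: Height left as 0
+ CUarray arr;
+ cuArrayCreate(&arr, &desc);
+ CUdeviceptr d_out;
+ cuMemAlloc(&d_out, 1024 * sizeof(float));
+ cuMemcpyAtoD(d_out, arr, 0, 1024 * sizeof(float));
+ * \endcode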
+ * + * \param dstDevice - Destination device pointer + * \param srcArray - Source array + * \param srcOffset - Offset in bytes of source array + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpyFromArray + */ +CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount); + +/** + * \brief Copies memory from Host to Array + * + * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset + * specify the CUDA array handle and starting offset in bytes of the destination + * data. \p pSrc specifies the base address of the source. \p ByteCount specifies + * the number of bytes to copy. + * + * \param dstArray - Destination array + * \param dstOffset - Offset in bytes of destination array + * \param srcHost - Source host pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpyToArray + */ +CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount); + +/** + * \brief Copies memory from Array to Host + * + * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base + * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA + * array handle and starting offset in bytes of the source data. + * \p ByteCount specifies the number of bytes to copy. 
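+ *
+ * \par
+ * An illustrative sketch reading a 1D CUDA array back to the host (assumes
+ * \c arr was created with ::cuArrayCreate as a 1024-element float array and
+ * a context is current; error checking elided):
+ * \code
+ float h_out[1024];
+ cuMemcpyAtoH(h_out, arr, 0, sizeof(h_out)); // offset 0, whole array
+ * \endcode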
+ *
+ * \param dstHost - Destination host pointer
+ * \param srcArray - Source array
+ * \param srcOffset - Offset in bytes of source array
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpyFromArray
+ */
+CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Array to Array
+ *
+ * Copies from one 1D CUDA array to another. \p dstArray and \p srcArray
+ * specify the handles of the destination and source CUDA arrays for the copy,
+ * respectively. \p dstOffset and \p srcOffset specify the destination and
+ * source offsets in bytes into the CUDA arrays. \p ByteCount is the number of
+ * bytes to be copied. The elements of the CUDA arrays need not have the same
+ * format, but they must have the same size, and \p ByteCount must be
+ * evenly divisible by that size.
+ *
+ * \param dstArray - Destination array
+ * \param dstOffset - Offset in bytes of destination array
+ * \param srcArray - Source array
+ * \param srcOffset - Offset in bytes of source array
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpyArrayToArray
+ */
+CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+
+/**
+ * \brief Copies memory for 2D arrays
+ *
+ * Perform a 2D memory copy according to the parameters specified in \p pCopy.
+ * The ::CUDA_MEMCPY2D structure is defined as: + * + * \code + typedef struct CUDA_MEMCPY2D_st { + unsigned int srcXInBytes, srcY; + CUmemorytype srcMemoryType; + const void *srcHost; + CUdeviceptr srcDevice; + CUarray srcArray; + unsigned int srcPitch; + + unsigned int dstXInBytes, dstY; + CUmemorytype dstMemoryType; + void *dstHost; + CUdeviceptr dstDevice; + CUarray dstArray; + unsigned int dstPitch; + + unsigned int WidthInBytes; + unsigned int Height; + } CUDA_MEMCPY2D; + * \endcode + * where: + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the + * source and destination, respectively; ::CUmemorytype_enum is defined as: + * + * \code + typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, + CU_MEMORYTYPE_DEVICE = 0x02, + CU_MEMORYTYPE_ARRAY = 0x03, + CU_MEMORYTYPE_UNIFIED = 0x04 + } CUmemorytype; + * \endcode + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch + * specify the (host) base address of the source data and the bytes per row to + * apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch + * specify the (device) base address of the source data and the bytes per row + * to apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the + * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are + * ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch + * specify the (host) base address of the destination data and the bytes per + * row to apply. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::dstArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch + * specify the (device) base address of the destination data and the bytes per + * row to apply. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the + * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are + * ignored. + * + * - ::srcXInBytes and ::srcY specify the base address of the source data for + * the copy. + * + * \par + * For host pointers, the starting address is + * \code + void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array + * element size. + * + * - ::dstXInBytes and ::dstY specify the base address of the destination data + * for the copy. 
+ * + * \par + * For host pointers, the base address is + * \code + void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array + * element size. + * + * - ::WidthInBytes and ::Height specify the width (in bytes) and height of + * the 2D copy being performed. + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + + * ::srcXInBytes, and ::dstPitch must be greater than or equal to + * ::WidthInBytes + dstXInBytes. + * + * \par + * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back + * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies + * (device to device, CUDA array to device, CUDA array to CUDA array), + * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch(). + * ::cuMemcpy2DUnaligned() does not have this restriction, but may run + * significantly slower in the cases where ::cuMemcpy2D() would have returned + * an error code. + * + * \param pCopy - Parameters for the memory copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, + * ::cudaMemcpy2DFromArray + */ +CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy); + +/** + * \brief Copies memory for 2D arrays + * + * Perform a 2D memory copy according to the parameters specified in \p pCopy. 
+ * The ::CUDA_MEMCPY2D structure is defined as: + * + * \code + typedef struct CUDA_MEMCPY2D_st { + unsigned int srcXInBytes, srcY; + CUmemorytype srcMemoryType; + const void *srcHost; + CUdeviceptr srcDevice; + CUarray srcArray; + unsigned int srcPitch; + unsigned int dstXInBytes, dstY; + CUmemorytype dstMemoryType; + void *dstHost; + CUdeviceptr dstDevice; + CUarray dstArray; + unsigned int dstPitch; + unsigned int WidthInBytes; + unsigned int Height; + } CUDA_MEMCPY2D; + * \endcode + * where: + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the + * source and destination, respectively; ::CUmemorytype_enum is defined as: + * + * \code + typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, + CU_MEMORYTYPE_DEVICE = 0x02, + CU_MEMORYTYPE_ARRAY = 0x03, + CU_MEMORYTYPE_UNIFIED = 0x04 + } CUmemorytype; + * \endcode + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch + * specify the (host) base address of the source data and the bytes per row to + * apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch + * specify the (device) base address of the source data and the bytes per row + * to apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the + * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are + * ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::dstArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch + * specify the (host) base address of the destination data and the bytes per + * row to apply. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch + * specify the (device) base address of the destination data and the bytes per + * row to apply. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the + * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are + * ignored. + * + * - ::srcXInBytes and ::srcY specify the base address of the source data for + * the copy. + * + * \par + * For host pointers, the starting address is + * \code + void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array + * element size. + * + * - ::dstXInBytes and ::dstY specify the base address of the destination data + * for the copy. 
+ * + * \par + * For host pointers, the base address is + * \code + void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array + * element size. + * + * - ::WidthInBytes and ::Height specify the width (in bytes) and height of + * the 2D copy being performed. + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + + * ::srcXInBytes, and ::dstPitch must be greater than or equal to + * ::WidthInBytes + dstXInBytes. + * + * \par + * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back + * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies + * (device to device, CUDA array to device, CUDA array to CUDA array), + * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch(). + * ::cuMemcpy2DUnaligned() does not have this restriction, but may run + * significantly slower in the cases where ::cuMemcpy2D() would have returned + * an error code. + * + * \param pCopy - Parameters for the memory copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, + * ::cudaMemcpy2DFromArray + */ +CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy); + +/** + * \brief Copies memory for 3D arrays + * + * Perform a 3D memory copy according to the parameters specified in + * \p pCopy. 
The ::CUDA_MEMCPY3D structure is defined as:
+ *
+ * \code
+ typedef struct CUDA_MEMCPY3D_st {
+
+ unsigned int srcXInBytes, srcY, srcZ;
+ unsigned int srcLOD;
+ CUmemorytype srcMemoryType;
+ const void *srcHost;
+ CUdeviceptr srcDevice;
+ CUarray srcArray;
+ unsigned int srcPitch; // ignored when src is array
+ unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1
+
+ unsigned int dstXInBytes, dstY, dstZ;
+ unsigned int dstLOD;
+ CUmemorytype dstMemoryType;
+ void *dstHost;
+ CUdeviceptr dstDevice;
+ CUarray dstArray;
+ unsigned int dstPitch; // ignored when dst is array
+ unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1
+
+ unsigned int WidthInBytes;
+ unsigned int Height;
+ unsigned int Depth;
+ } CUDA_MEMCPY3D;
+ * \endcode
+ * where:
+ * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
+ * source and destination, respectively; ::CUmemorytype_enum is defined as:
+ *
+ * \code
+ typedef enum CUmemorytype_enum {
+ CU_MEMORYTYPE_HOST = 0x01,
+ CU_MEMORYTYPE_DEVICE = 0x02,
+ CU_MEMORYTYPE_ARRAY = 0x03,
+ CU_MEMORYTYPE_UNIFIED = 0x04
+ } CUmemorytype;
+ * \endcode
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
+ * specify the (unified virtual address space) base address of the source data
+ * and the bytes per row to apply. ::srcArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ * context.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and
+ * ::srcHeight specify the (host) base address of the source data, the bytes
+ * per row, and the height of each 2D slice of the 3D array. ::srcArray is
+ * ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and
+ * ::srcHeight specify the (device) base address of the source data, the bytes
+ * per row, and the height of each 2D slice of the 3D array. ::srcArray is
+ * ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
+ * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and
+ * ::srcHeight are ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+ * specify the (unified virtual address space) base address of the destination
+ * data and the bytes per row to apply. ::dstArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ * context.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost, ::dstPitch and
+ * ::dstHeight specify the (host) base address of the destination data, the
+ * bytes per row, and the height of each 2D slice of the 3D array. ::dstArray
+ * is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice, ::dstPitch and
+ * ::dstHeight specify the (device) base address of the destination data, the
+ * bytes per row, and the height of each 2D slice of the 3D array. ::dstArray
+ * is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and
+ * ::dstHeight are ignored.
+ *
+ * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source
+ * data for the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
+ void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+ CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::dstXInBytes, ::dstY and ::dstZ specify the base address of the
+ * destination data for the copy.
+ *
+ * \par
+ * For host pointers, the base address is
+ * \code
+ void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+ CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height
+ * and depth of the 3D copy being performed.
+ * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+ * ::srcXInBytes, and ::dstPitch must be greater than or equal to
+ * ::WidthInBytes + ::dstXInBytes.
+ * - If specified, ::srcHeight must be greater than or equal to ::Height +
+ * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY.
+ *
+ * \par
+ * ::cuMemcpy3D() returns an error if any pitch is greater than the maximum
+ * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH).
+ *
+ * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be
+ * set to 0.
+ *
+ * \param pCopy - Parameters for the memory copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpy3D
+ */
+CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy);
+#endif /* __CUDA_API_VERSION >= 3020 */
+
+#if __CUDA_API_VERSION >= 4000
+/**
+ * \brief Copies memory between contexts
+ *
+ * Perform a 3D memory copy according to the parameters specified in
+ * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure
+ * for documentation of its parameters.
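+ *
+ * \par
+ * An illustrative sketch of a linear device-to-device copy between two
+ * contexts (\c srcCtx / \c dstCtx and the two device pointers are assumed
+ * to exist; unset fields are zeroed; error checking elided):
+ * \code
+ CUDA_MEMCPY3D_PEER p = {0};
+ p.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+ p.srcDevice     = d_src;
+ p.srcContext    = srcCtx;
+ p.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+ p.dstDevice     = d_dst;
+ p.dstContext    = dstCtx;
+ p.WidthInBytes  = 4096; // treat the buffer as a 4096 x 1 x 1 region
+ p.Height        = 1;
+ p.Depth         = 1;
+ p.srcPitch      = 4096;
+ p.dstPitch      = 4096;
+ cuMemcpy3DPeer(&p);
+ * \endcode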
+ *
+ * \param pCopy - Parameters for the memory copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync,
+ * ::cuMemcpy3DPeerAsync,
+ * ::cudaMemcpy3DPeer
+ */
+CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy);
+
+/**
+ * \brief Copies memory asynchronously
+ *
+ * Copies data between two pointers.
+ * \p dst and \p src are base pointers of the destination and source, respectively.
+ * \p ByteCount specifies the number of bytes to copy.
+ * Note that this function infers the type of the transfer (host to host, host to
+ * device, device to device, or device to host) from the pointer values. This
+ * function is only allowed in contexts which support unified addressing.
+ *
+ * \param dst - Destination unified virtual address space pointer
+ * \param src - Source unified virtual address space pointer
+ * \param ByteCount - Size of memory copy in bytes
+ * \param hStream - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpyAsync,
+ * ::cudaMemcpyToSymbolAsync,
+ * ::cudaMemcpyFromSymbolAsync
+ */
+CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream);
+
+/**
+ * \brief Copies device memory between two contexts asynchronously.
+ *
+ * Copies from device memory in one context to device memory in another
+ * context. \p dstDevice is the base device pointer of the destination memory
+ * and \p dstContext is the destination context. \p srcDevice is the base
+ * device pointer of the source memory and \p srcContext is the source context.
+ * \p ByteCount specifies the number of bytes to copy.
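+ *
+ * \par
+ * An illustrative sketch (contexts \c ctxA / \c ctxB, the device pointers,
+ * and \c hStream are assumed to have been created already; error checking
+ * elided):
+ * \code
+ cuMemcpyPeerAsync(d_b, ctxB, d_a, ctxA, nbytes, hStream);
+ cuStreamSynchronize(hStream); // wait before touching the destination
+ * \endcode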
+ * + * \param dstDevice - Destination device pointer + * \param dstContext - Destination context + * \param srcDevice - Source device pointer + * \param srcContext - Source context + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, + * ::cuMemcpy3DPeerAsync, + * ::cudaMemcpyPeerAsync + */ +CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream); +#endif /* __CUDA_API_VERSION >= 4000 */ + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Copies memory from Host to Device + * + * Copies from host memory to device memory. \p dstDevice and \p srcHost are + * the base addresses of the destination and source, respectively. \p ByteCount + * specifies the number of bytes to copy. + * + * \param dstDevice - Destination device pointer + * \param srcHost - Source host pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyAsync, + * ::cudaMemcpyToSymbolAsync + */ +CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory from Device to Host + * + * Copies from device to host memory. \p dstHost and \p srcDevice specify the + * base pointers of the destination and source, respectively. \p ByteCount + * specifies the number of bytes to copy. 
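+ *
+ * \par
+ * An illustrative sketch (a current context and \c hStream are assumed; the
+ * host buffer comes from ::cuMemAllocHost so the copy can truly overlap;
+ * error checking elided):
+ * \code
+ void *h_pinned;
+ cuMemAllocHost(&h_pinned, nbytes);   // page-locked host staging buffer
+ cuMemcpyDtoHAsync(h_pinned, d_data, nbytes, hStream);
+ cuStreamSynchronize(hStream);        // h_pinned is valid after this
+ * \endcode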
+ * + * \param dstHost - Destination host pointer + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyAsync, + * ::cudaMemcpyFromSymbolAsync + */ +CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory from Device to Device + * + * Copies from device memory to device memory. \p dstDevice and \p srcDevice + * are the base pointers of the destination and source, respectively. + * \p ByteCount specifies the number of bytes to copy. + * + * \param dstDevice - Destination device pointer + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyAsync, + * ::cudaMemcpyToSymbolAsync, + * ::cudaMemcpyFromSymbolAsync + */ +CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory from Host to Array + * + * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset + * specify the CUDA array handle and starting offset in bytes of the + * destination data. \p srcHost specifies the base address of the source. + * \p ByteCount specifies the number of bytes to copy. 
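+ *
+ * \par
+ * An illustrative sketch (assumes \c arr is a 1D CUDA array of at least
+ * \c nbytes bytes, \c h_src is page-locked host memory, and a context is
+ * current; error checking elided):
+ * \code
+ cuMemcpyHtoAAsync(arr, 0, h_src, nbytes, hStream); // offset 0 into the array
+ cuStreamSynchronize(hStream);
+ * \endcode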
+ * + * \param dstArray - Destination array + * \param dstOffset - Offset in bytes of destination array + * \param srcHost - Source host pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyToArrayAsync + */ +CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory from Array to Host + * + * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base + * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA + * array handle and starting offset in bytes of the source data. + * \p ByteCount specifies the number of bytes to copy. + * + * \param dstHost - Destination pointer + * \param srcArray - Source array + * \param srcOffset - Offset in bytes of source array + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyFromArrayAsync + */ +CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory for 2D arrays + * + * Perform a 2D memory copy according to the parameters specified in \p pCopy. 
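+ *
+ * For orientation, a minimal sketch of a pitched host-to-device copy
+ * (hypothetical buffers and stream; the ::CUDA_MEMCPY2D fields used here
+ * are documented below, and error checking is omitted):
+ * \code
+   CUDA_MEMCPY2D cpy;
+   memset(&cpy, 0, sizeof(cpy));
+   cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
+   cpy.srcHost       = hostSrc;
+   cpy.srcPitch      = width * sizeof(float);
+   cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+   cpy.dstDevice     = devDst;
+   cpy.dstPitch      = devPitch;             // as returned by cuMemAllocPitch()
+   cpy.WidthInBytes  = width * sizeof(float);
+   cpy.Height        = height;
+   cuMemcpy2DAsync(&cpy, hStream);
+ * \endcode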
+ * The ::CUDA_MEMCPY2D structure is defined as:
+ *
+ * \code
+   typedef struct CUDA_MEMCPY2D_st {
+      unsigned int srcXInBytes, srcY;
+      CUmemorytype srcMemoryType;
+      const void *srcHost;
+      CUdeviceptr srcDevice;
+      CUarray srcArray;
+      unsigned int srcPitch;
+      unsigned int dstXInBytes, dstY;
+      CUmemorytype dstMemoryType;
+      void *dstHost;
+      CUdeviceptr dstDevice;
+      CUarray dstArray;
+      unsigned int dstPitch;
+      unsigned int WidthInBytes;
+      unsigned int Height;
+   } CUDA_MEMCPY2D;
+ * \endcode
+ * where:
+ * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
+ * source and destination, respectively; ::CUmemorytype_enum is defined as:
+ *
+ * \code
+   typedef enum CUmemorytype_enum {
+      CU_MEMORYTYPE_HOST = 0x01,
+      CU_MEMORYTYPE_DEVICE = 0x02,
+      CU_MEMORYTYPE_ARRAY = 0x03,
+      CU_MEMORYTYPE_UNIFIED = 0x04
+   } CUmemorytype;
+ * \endcode
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch
+ * specify the (host) base address of the source data and the bytes per row to
+ * apply. ::srcArray is ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
+ * specify the (unified virtual address space) base address of the source data
+ * and the bytes per row to apply. ::srcArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ * context.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch
+ * specify the (device) base address of the source data and the bytes per row
+ * to apply. ::srcArray is ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
+ * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are
+ * ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+ * specify the (unified virtual address space) base address of the destination
+ * data and the bytes per row to apply. ::dstArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ * context.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch
+ * specify the (host) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch
+ * specify the (device) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are
+ * ignored.
+ *
+ * - ::srcXInBytes and ::srcY specify the base address of the source data for
+ * the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
+   void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+   CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::dstXInBytes and ::dstY specify the base address of the destination data
+ * for the copy.
+ *
+ * \par
+ * For host pointers, the base address is
+ * \code
+   void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+   CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::WidthInBytes and ::Height specify the width (in bytes) and height of
+ * the 2D copy being performed.
+ * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+ * ::srcXInBytes, and ::dstPitch must be greater than or equal to
+ * ::WidthInBytes + ::dstXInBytes.
+ *
+ * \par
+ * ::cuMemcpy2DAsync() returns an error if any pitch is greater than the maximum
+ * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
+ * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
+ * (device to device, CUDA array to device, CUDA array to CUDA array),
+ * ::cuMemcpy2DAsync() may fail for pitches not computed by ::cuMemAllocPitch().
+ *
+ * \param pCopy - Parameters for the memory copy
+ * \param hStream - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync
+ */
+CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
+
+/**
+ * \brief Copies memory for 3D arrays
+ *
+ * Perform a 3D memory copy according to the parameters specified in
+ * \p pCopy.
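+ *
+ * For orientation, a minimal sketch of a pitched host-to-device 3D copy
+ * (hypothetical buffers and stream; the structure fields are documented
+ * below, and error checking is omitted):
+ * \code
+   CUDA_MEMCPY3D cpy;
+   memset(&cpy, 0, sizeof(cpy));             // also zeroes srcLOD/dstLOD
+   cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
+   cpy.srcHost       = hostSrc;
+   cpy.srcPitch      = width * sizeof(float);
+   cpy.srcHeight     = height;
+   cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+   cpy.dstDevice     = devDst;
+   cpy.dstPitch      = devPitch;
+   cpy.dstHeight     = height;
+   cpy.WidthInBytes  = width * sizeof(float);
+   cpy.Height        = height;
+   cpy.Depth         = depth;
+   cuMemcpy3DAsync(&cpy, hStream);
+ * \endcode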
+ * The ::CUDA_MEMCPY3D structure is defined as:
+ *
+ * \code
+   typedef struct CUDA_MEMCPY3D_st {
+
+      unsigned int srcXInBytes, srcY, srcZ;
+      unsigned int srcLOD;
+      CUmemorytype srcMemoryType;
+      const void *srcHost;
+      CUdeviceptr srcDevice;
+      CUarray srcArray;
+      unsigned int srcPitch;  // ignored when src is array
+      unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1
+
+      unsigned int dstXInBytes, dstY, dstZ;
+      unsigned int dstLOD;
+      CUmemorytype dstMemoryType;
+      void *dstHost;
+      CUdeviceptr dstDevice;
+      CUarray dstArray;
+      unsigned int dstPitch;  // ignored when dst is array
+      unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1
+
+      unsigned int WidthInBytes;
+      unsigned int Height;
+      unsigned int Depth;
+   } CUDA_MEMCPY3D;
+ * \endcode
+ * where:
+ * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
+ * source and destination, respectively; ::CUmemorytype_enum is defined as:
+ *
+ * \code
+   typedef enum CUmemorytype_enum {
+      CU_MEMORYTYPE_HOST = 0x01,
+      CU_MEMORYTYPE_DEVICE = 0x02,
+      CU_MEMORYTYPE_ARRAY = 0x03,
+      CU_MEMORYTYPE_UNIFIED = 0x04
+   } CUmemorytype;
+ * \endcode
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
+ * specify the (unified virtual address space) base address of the source data
+ * and the bytes per row to apply. ::srcArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ * context.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and
+ * ::srcHeight specify the (host) base address of the source data, the bytes
+ * per row, and the height of each 2D slice of the 3D array. ::srcArray is
+ * ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and
+ * ::srcHeight specify the (device) base address of the source data, the bytes
+ * per row, and the height of each 2D slice of the 3D array. ::srcArray is
+ * ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
+ * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and
+ * ::srcHeight are ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+ * specify the (unified virtual address space) base address of the destination
+ * data and the bytes per row to apply. ::dstArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ * context.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost, ::dstPitch and
+ * ::dstHeight specify the (host) base address of the destination data, the
+ * bytes per row, and the height of each 2D slice of the 3D array. ::dstArray
+ * is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice, ::dstPitch and
+ * ::dstHeight specify the (device) base address of the destination data, the
+ * bytes per row, and the height of each 2D slice of the 3D array. ::dstArray
+ * is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and
+ * ::dstHeight are ignored.
+ *
+ * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source
+ * data for the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
+   void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+   CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::dstXInBytes, ::dstY and ::dstZ specify the base address of the
+ * destination data for the copy.
+ *
+ * \par
+ * For host pointers, the base address is
+ * \code
+   void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+   CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height
+ * and depth of the 3D copy being performed.
+ * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+ * ::srcXInBytes, and ::dstPitch must be greater than or equal to
+ * ::WidthInBytes + ::dstXInBytes.
+ * - If specified, ::srcHeight must be greater than or equal to ::Height +
+ * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY.
+ *
+ * \par
+ * ::cuMemcpy3DAsync() returns an error if any pitch is greater than the maximum
+ * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH).
+ *
+ * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be
+ * set to 0.
+ *
+ * \param pCopy - Parameters for the memory copy
+ * \param hStream - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpy3DAsync
+ */
+CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
+#endif /* __CUDA_API_VERSION >= 3020 */
+
+#if __CUDA_API_VERSION >= 4000
+/**
+ * \brief Copies memory between contexts asynchronously.
+ *
+ * Perform a 3D memory copy according to the parameters specified in
+ * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure
+ * for documentation of its parameters.
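+ *
+ * For illustration, a hypothetical sketch of a linear device-to-device copy
+ * between two contexts, expressed as a WidthInBytes x 1 x 1 copy (the exact
+ * ::CUDA_MEMCPY3D_PEER field set is documented with the structure; error
+ * checking omitted):
+ * \code
+   CUDA_MEMCPY3D_PEER cpy;
+   memset(&cpy, 0, sizeof(cpy));
+   cpy.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+   cpy.srcDevice     = srcPtr;
+   cpy.srcContext    = srcCtx;               // context owning srcPtr
+   cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+   cpy.dstDevice     = dstPtr;
+   cpy.dstContext    = dstCtx;               // context owning dstPtr
+   cpy.WidthInBytes  = bytes;
+   cpy.Height        = 1;
+   cpy.Depth         = 1;
+   cuMemcpy3DPeerAsync(&cpy, hStream);
+ * \endcode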
+ * + * \param pCopy - Parameters for the memory copy + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, + * ::cuMemcpy3DPeerAsync, + * ::cudaMemcpy3DPeerAsync + */ +CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream); +#endif /* __CUDA_API_VERSION >= 4000 */ + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Initializes device memory + * + * Sets the memory range of \p N 8-bit values to the specified value + * \p uc. + * + * \param dstDevice - Destination device pointer + * \param uc - Value to set + * \param N - Number of elements + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset + */ +CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N); + +/** + * \brief Initializes device memory + * + * Sets the memory range of \p N 16-bit values to the specified value + * \p us. The \p dstDevice pointer must be two byte aligned. 
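+ *
+ * For illustration (hypothetical allocation, error checking omitted):
+ * \code
+   CUdeviceptr dptr;
+   cuMemAlloc(&dptr, 1024 * sizeof(unsigned short));
+   cuMemsetD16(dptr, 0xFFFF, 1024);          // set 1024 16-bit elements
+ * \endcode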
+ * + * \param dstDevice - Destination device pointer + * \param us - Value to set + * \param N - Number of elements + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset + */ +CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N); + +/** + * \brief Initializes device memory + * + * Sets the memory range of \p N 32-bit values to the specified value + * \p ui. The \p dstDevice pointer must be four byte aligned. + * + * \param dstDevice - Destination device pointer + * \param ui - Value to set + * \param N - Number of elements + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32Async, + * ::cudaMemset + */ +CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N); + +/** + * \brief Initializes device memory + * + * Sets the 2D memory range of \p Width 8-bit values to the specified value + * \p uc. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). 
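+ *
+ * For illustration, a pitched allocation cleared row by row in one call
+ * (hypothetical sizes, error checking omitted):
+ * \code
+   CUdeviceptr dptr;
+   size_t pitch;
+   cuMemAllocPitch(&dptr, &pitch, width, height, 4);
+   cuMemsetD2D8(dptr, pitch, 0, width, height);  // zero width x height bytes
+ * \endcode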
+ * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer + * \param uc - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2D + */ +CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); + +/** + * \brief Initializes device memory + * + * Sets the 2D memory range of \p Width 16-bit values to the specified value + * \p us. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. The \p dstDevice pointer + * and \p dstPitch offset must be two byte aligned. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). + * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer + * \param us - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2D + */ +CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height); + +/** + * \brief Initializes device memory + * + * Sets the 2D memory range of \p Width 32-bit values to the specified value + * \p ui. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. 
The \p dstDevice pointer + * and \p dstPitch offset must be four byte aligned. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). + * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer + * \param ui - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2D + */ +CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height); + +/** + * \brief Sets device memory + * + * Sets the memory range of \p N 8-bit values to the specified value + * \p uc. + * + * \param dstDevice - Destination device pointer + * \param uc - Value to set + * \param N - Number of elements + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemsetAsync + */ +CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the memory range of \p N 16-bit values to the specified value + * \p us. The \p dstDevice pointer must be two byte aligned. 
+ * + * \param dstDevice - Destination device pointer + * \param us - Value to set + * \param N - Number of elements + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemsetAsync + */ +CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the memory range of \p N 32-bit values to the specified value + * \p ui. The \p dstDevice pointer must be four byte aligned. + * + * \param dstDevice - Destination device pointer + * \param ui - Value to set + * \param N - Number of elements + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, ::cuMemsetD32, + * ::cudaMemsetAsync + */ +CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the 2D memory range of \p Width 8-bit values to the specified value + * \p uc. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). 
+ * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer + * \param uc - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2DAsync + */ +CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the 2D memory range of \p Width 16-bit values to the specified value + * \p us. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. The \p dstDevice pointer + * and \p dstPitch offset must be two byte aligned. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). + * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer + * \param us - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2DAsync + */ +CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the 2D memory range of \p Width 32-bit values to the specified value + * \p ui. 
\p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. The \p dstDevice pointer + * and \p dstPitch offset must be four byte aligned. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). + * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer + * \param ui - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2DAsync + */ +CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream); + +/** + * \brief Creates a 1D or 2D CUDA array + * + * Creates a CUDA array according to the ::CUDA_ARRAY_DESCRIPTOR structure + * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle. 
+ * The ::CUDA_ARRAY_DESCRIPTOR is defined as:
+ *
+ * \code
+   typedef struct {
+      unsigned int Width;
+      unsigned int Height;
+      CUarray_format Format;
+      unsigned int NumChannels;
+   } CUDA_ARRAY_DESCRIPTOR;
+ * \endcode
+ * where:
+ *
+ * - \p Width and \p Height are the width and height of the CUDA array (in
+ * elements); the CUDA array is one-dimensional if height is 0, two-dimensional
+ * otherwise;
+ * - ::Format specifies the format of the elements; ::CUarray_format is
+ * defined as:
+ * \code
+   typedef enum CUarray_format_enum {
+      CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
+      CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
+      CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
+      CU_AD_FORMAT_SIGNED_INT8 = 0x08,
+      CU_AD_FORMAT_SIGNED_INT16 = 0x09,
+      CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
+      CU_AD_FORMAT_HALF = 0x10,
+      CU_AD_FORMAT_FLOAT = 0x20
+   } CUarray_format;
+ * \endcode
+ * - \p NumChannels specifies the number of packed components per CUDA array
+ * element; it may be 1, 2, or 4;
+ *
+ * Here are examples of CUDA array descriptions:
+ *
+ * Description for a CUDA array of 2048 floats:
+ * \code
+   CUDA_ARRAY_DESCRIPTOR desc;
+   desc.Format = CU_AD_FORMAT_FLOAT;
+   desc.NumChannels = 1;
+   desc.Width = 2048;
+   desc.Height = 1;
+ * \endcode
+ *
+ * Description for a 64 x 64 CUDA array of floats:
+ * \code
+   CUDA_ARRAY_DESCRIPTOR desc;
+   desc.Format = CU_AD_FORMAT_FLOAT;
+   desc.NumChannels = 1;
+   desc.Width = 64;
+   desc.Height = 64;
+ * \endcode
+ *
+ * Description for a \p width x \p height CUDA array of 64-bit, 4x16-bit
+ * float16's:
+ * \code
+   CUDA_ARRAY_DESCRIPTOR desc;
+   desc.Format = CU_AD_FORMAT_HALF;
+   desc.NumChannels = 4;
+   desc.Width = width;
+   desc.Height = height;
+ * \endcode
+ *
+ * Description for a \p width x \p height CUDA array of 16-bit elements, each
+ * of which is two 8-bit unsigned chars:
+ * \code
+   CUDA_ARRAY_DESCRIPTOR desc;
+   desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
+   desc.NumChannels = 2;
+   desc.Width = width;
+   desc.Height = height;
+ * \endcode
+ *
+ * \param pHandle - Returned array
+ * \param pAllocateArray - Array descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMallocArray
+ */
+CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray);
+
+/**
+ * \brief Get a 1D or 2D CUDA array descriptor
+ *
+ * Returns in \p *pArrayDescriptor a descriptor containing information on the
+ * format and dimensions of the CUDA array \p hArray. It is useful for
+ * subroutines that have been passed a CUDA array, but need to know the CUDA
+ * array parameters for validation or other purposes.
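+ *
+ * For illustration, querying the descriptor of a freshly created array
+ * (error checking omitted):
+ * \code
+   CUDA_ARRAY_DESCRIPTOR desc, queried;
+   CUarray hArray;
+   desc.Format      = CU_AD_FORMAT_FLOAT;
+   desc.NumChannels = 1;
+   desc.Width       = 64;
+   desc.Height      = 64;
+   cuArrayCreate(&hArray, &desc);
+   cuArrayGetDescriptor(&queried, hArray);   // queried.Width == 64, etc.
+ * \endcode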
+ * + * \param pArrayDescriptor - Returned array descriptor + * \param hArray - Array to get descriptor of + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaArrayGetInfo + */ +CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray); +#endif /* __CUDA_API_VERSION >= 3020 */ + + +/** + * \brief Destroys a CUDA array + * + * Destroys the CUDA array \p hArray. + * + * \param hArray - Array to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ARRAY_IS_MAPPED + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaFreeArray + */ +CUresult CUDAAPI cuArrayDestroy(CUarray hArray); + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Creates a 3D CUDA array + * + * Creates a CUDA array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure + * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle. + * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as: + * + * \code + typedef struct { + unsigned int Width; + unsigned int Height; + unsigned int Depth; + CUarray_format Format; + unsigned int NumChannels; + unsigned int Flags; + } CUDA_ARRAY3D_DESCRIPTOR; + * \endcode + * where: + * + * - \p Width, \p Height, and \p Depth are the width, height, and depth of the + * CUDA array (in elements); the following types of CUDA arrays can be allocated: + * - A 1D array is allocated if \p Height and \p Depth extents are both zero. + * - A 2D array is allocated if only \p Depth extent is zero. + * - A 3D array is allocated if all three extents are non-zero. + * - A 1D layered CUDA array is allocated if only \p Height is zero and the + * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number + * of layers is determined by the depth extent. + * - A 2D layered CUDA array is allocated if all three extents are non-zero and + * the ::CUDA_ARRAY3D_LAYERED flag is set. 
Each layer is a 2D array. The number
+ * of layers is determined by the depth extent.
+ * - A cubemap CUDA array is allocated if all three extents are non-zero and the
+ * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and
+ * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array,
+ * where the six layers represent the six faces of a cube. The order of the six
+ * layers in memory is the same as that listed in ::CUarray_cubemap_face.
+ * - A cubemap layered CUDA array is allocated if all three extents are non-zero,
+ * and both the ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set.
+ * \p Width must be equal to \p Height, and \p Depth must be a multiple of six.
+ * A cubemap layered CUDA array is a special type of 2D layered CUDA array that
+ * consists of a collection of cubemaps. The first six layers represent the first
+ * cubemap, the next six layers form the second cubemap, and so on.
+ *
+ * - ::Format specifies the format of the elements; ::CUarray_format is
+ * defined as:
+ * \code
+   typedef enum CUarray_format_enum {
+      CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
+      CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
+      CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
+      CU_AD_FORMAT_SIGNED_INT8 = 0x08,
+      CU_AD_FORMAT_SIGNED_INT16 = 0x09,
+      CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
+      CU_AD_FORMAT_HALF = 0x10,
+      CU_AD_FORMAT_FLOAT = 0x20
+   } CUarray_format;
+ * \endcode
+ *
+ * - \p NumChannels specifies the number of packed components per CUDA array
+ * element; it may be 1, 2, or 4;
+ *
+ * - ::Flags may be set to
+ *   - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA arrays. If this flag is set,
+ * \p Depth specifies the number of layers, not the depth of a 3D array.
+ *   - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to the CUDA array.
+ * If this flag is not set, ::cuSurfRefSetArray will fail when attempting to bind the CUDA array
+ * to a surface reference.
+ *   - ::CUDA_ARRAY3D_CUBEMAP to enable creation of cubemaps. If this flag is set, \p Width must be
+ * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set,
+ * then \p Depth must be a multiple of six.
+ *   - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA array will be used for texture gather.
+ * Texture gather can only be performed on 2D CUDA arrays.
+ *
+ * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table.
+ * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute
+ * is not specified. For example, TEXTURE1D_WIDTH refers to the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH.
+ *
+ * Note that 2D CUDA arrays have different size requirements if the ::CUDA_ARRAY3D_TEXTURE_GATHER flag
+ * is set. \p Width and \p Height must not be greater than ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH
+ * and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT respectively, in that case.
+ *
+ * <table>
+ * <tr><td><b>CUDA array type</b></td>
+ * <td><b>Valid extents that must always be met<br>{(width range in elements), (height range),
+ * (depth range)}</b></td>
+ * <td><b>Valid extents with CUDA_ARRAY3D_SURFACE_LDST set<br>
+ * {(width range in elements), (height range), (depth range)}</b></td></tr>
+ * <tr><td>1D</td>
+ * <td>{ (1,TEXTURE1D_WIDTH), 0, 0 }</td>
+ * <td>{ (1,SURFACE1D_WIDTH), 0, 0 }</td></tr>
+ * <tr><td>2D</td>
+ * <td>{ (1,TEXTURE2D_WIDTH), (1,TEXTURE2D_HEIGHT), 0 }</td>
+ * <td>{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }</td></tr>
+ * <tr><td>3D</td>
+ * <td>{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
+ * <br>OR<br>{ (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE),
+ * (1,TEXTURE3D_DEPTH_ALTERNATE) }</td>
+ * <td>{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT), (1,SURFACE3D_DEPTH) }</td></tr>
+ * <tr><td>1D Layered</td>
+ * <td>{ (1,TEXTURE1D_LAYERED_WIDTH), 0, (1,TEXTURE1D_LAYERED_LAYERS) }</td>
+ * <td>{ (1,SURFACE1D_LAYERED_WIDTH), 0, (1,SURFACE1D_LAYERED_LAYERS) }</td></tr>
+ * <tr><td>2D Layered</td>
+ * <td>{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT),
+ * (1,TEXTURE2D_LAYERED_LAYERS) }</td>
+ * <td>{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT),
+ * (1,SURFACE2D_LAYERED_LAYERS) }</td></tr>
+ * <tr><td>Cubemap</td>
+ * <td>{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }</td>
+ * <td>{ (1,SURFACECUBEMAP_WIDTH), (1,SURFACECUBEMAP_WIDTH), 6 }</td></tr>
+ * <tr><td>Cubemap Layered</td>
+ * <td>{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH),
+ * (1,TEXTURECUBEMAP_LAYERED_LAYERS) }</td>
+ * <td>{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH),
+ * (1,SURFACECUBEMAP_LAYERED_LAYERS) }</td></tr>
+ * </table>
+ *
+ * Here are examples of CUDA array descriptions:
+ *
+ * Description for a CUDA array of 2048 floats:
+ * \code
+   CUDA_ARRAY3D_DESCRIPTOR desc;
+   desc.Format = CU_AD_FORMAT_FLOAT;
+   desc.NumChannels = 1;
+   desc.Width = 2048;
+   desc.Height = 0;
+   desc.Depth = 0;
+ * \endcode
+ *
+ * Description for a 64 x 64 CUDA array of floats:
+ * \code
+   CUDA_ARRAY3D_DESCRIPTOR desc;
+   desc.Format = CU_AD_FORMAT_FLOAT;
+   desc.NumChannels = 1;
+   desc.Width = 64;
+   desc.Height = 64;
+   desc.Depth = 0;
+ * \endcode
+ *
+ * Description for a \p width x \p height x \p depth CUDA array of 64-bit,
+ * 4x16-bit float16's:
+ * \code
+   CUDA_ARRAY3D_DESCRIPTOR desc;
+   desc.Format = CU_AD_FORMAT_HALF;
+   desc.NumChannels = 4;
+   desc.Width = width;
+   desc.Height = height;
+   desc.Depth = depth;
+ * \endcode
+ *
+ * \param pHandle - Returned array
+ * \param pAllocateArray - 3D array descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMalloc3DArray
+ */
+CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray);
+
+/**
+ * \brief Get a 3D CUDA array descriptor
+ *
+ * Returns in \p *pArrayDescriptor a descriptor containing information on the
+ * format and dimensions of the CUDA array \p hArray. It is useful for
+ * subroutines that have been passed a CUDA array, but need to know the CUDA
+ * array parameters for validation or other purposes.
+ *
+ * This function may be called on 1D and 2D arrays, in which case the \p Height
+ * and/or \p Depth members of the descriptor struct will be set to 0.
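+ *
+ * For illustration (hypothetical 2D array handle, error checking omitted):
+ * \code
+   CUDA_ARRAY3D_DESCRIPTOR query;
+   cuArray3DGetDescriptor(&query, hArray);
+   // for a 64 x 64 2D array: query.Width == 64, query.Height == 64,
+   // and query.Depth == 0
+ * \endcode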
+ * + * \param pArrayDescriptor - Returned 3D array descriptor + * \param hArray - 3D array to get descriptor of + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaArrayGetInfo + */ +CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray); +#endif /* __CUDA_API_VERSION >= 3020 */ + +#if __CUDA_API_VERSION >= 5000 + +/** + * \brief Creates a CUDA mipmapped array + * + * Creates a CUDA mipmapped array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure + * \p pMipmappedArrayDesc and returns a handle to the new CUDA mipmapped array in \p *pHandle. + * \p numMipmapLevels specifies the number of mipmap levels to be allocated. This value is + * clamped to the range [1, 1 + floor(log2(max(width, height, depth)))]. + * + * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as: + * + * \code + typedef struct { + unsigned int Width; + unsigned int Height; + unsigned int Depth; + CUarray_format Format; + unsigned int NumChannels; + unsigned int Flags; + } CUDA_ARRAY3D_DESCRIPTOR; + * \endcode + * where: + * + * - \p Width, \p Height, and \p Depth are the width, height, and depth of the + * CUDA array (in elements); the following types of CUDA arrays can be allocated: + * - A 1D mipmapped array is allocated if \p Height and \p Depth extents are both zero. + * - A 2D mipmapped array is allocated if only \p Depth extent is zero. + * - A 3D mipmapped array is allocated if all three extents are non-zero. + * - A 1D layered CUDA mipmapped array is allocated if only \p Height is zero and the + * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number + * of layers is determined by the depth extent. + * - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and + * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number + * of layers is determined by the depth extent. + * - A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the + * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and + * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, + * where the six layers represent the six faces of a cube. The order of the six + * layers in memory is the same as that listed in ::CUarray_cubemap_face. + * - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero, + * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. + * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. + * A cubemap layered CUDA array is a special type of 2D layered CUDA array that + * consists of a collection of cubemaps. 
The first six layers represent the first + * cubemap, the next six layers form the second cubemap, and so on. + * + * - ::Format specifies the format of the elements; ::CUarray_format is + * defined as: + * \code + typedef enum CUarray_format_enum { + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, + CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, + CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, + CU_AD_FORMAT_SIGNED_INT8 = 0x08, + CU_AD_FORMAT_SIGNED_INT16 = 0x09, + CU_AD_FORMAT_SIGNED_INT32 = 0x0a, + CU_AD_FORMAT_HALF = 0x10, + CU_AD_FORMAT_FLOAT = 0x20 + } CUarray_format; + * \endcode + * + * - \p NumChannels specifies the number of packed components per CUDA array + * element; it may be 1, 2, or 4; + * + * - ::Flags may be set to + * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA mipmapped arrays. If this flag is set, + * \p Depth specifies the number of layers, not the depth of a 3D array. + * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to individual mipmap levels of + * the CUDA mipmapped array. If this flag is not set, ::cuSurfRefSetArray will fail when attempting to + * bind a mipmap level of the CUDA mipmapped array to a surface reference. + * - ::CUDA_ARRAY3D_CUBEMAP to enable creation of mipmapped cubemaps. If this flag is set, \p Width must be + * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set, + * then \p Depth must be a multiple of six. + * - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA mipmapped array will be used for texture gather. + * Texture gather can only be performed on 2D CUDA mipmapped arrays. + * + * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table. + * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute + * is not specified. For ex., TEXTURE1D_MIPMAPPED_WIDTH refers to the device attribute + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH. + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
+ * <table>
+ * <tr><td><b>CUDA array type</b></td>
+ * <td><b>Valid extents that must always be met<br>{(width range in elements), (height range),
+ * (depth range)}</b></td>
+ * <td><b>Valid extents with CUDA_ARRAY3D_SURFACE_LDST set<br>
+ * {(width range in elements), (height range), (depth range)}</b></td></tr>
+ * <tr><td>1D</td>
+ * <td><small>{ (1,TEXTURE1D_MIPMAPPED_WIDTH), 0, 0 }</small></td>
+ * <td><small>{ (1,SURFACE1D_WIDTH), 0, 0 }</small></td></tr>
+ * <tr><td>2D</td>
+ * <td><small>{ (1,TEXTURE2D_MIPMAPPED_WIDTH), (1,TEXTURE2D_MIPMAPPED_HEIGHT), 0 }</small></td>
+ * <td><small>{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }</small></td></tr>
+ * <tr><td>3D</td>
+ * <td><small>{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
+ * <br>OR<br>{ (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE),
+ * (1,TEXTURE3D_DEPTH_ALTERNATE) }</small></td>
+ * <td><small>{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT),
+ * (1,SURFACE3D_DEPTH) }</small></td></tr>
+ * <tr><td>1D Layered</td>
+ * <td><small>{ (1,TEXTURE1D_LAYERED_WIDTH), 0,
+ * (1,TEXTURE1D_LAYERED_LAYERS) }</small></td>
+ * <td><small>{ (1,SURFACE1D_LAYERED_WIDTH), 0,
+ * (1,SURFACE1D_LAYERED_LAYERS) }</small></td></tr>
+ * <tr><td>2D Layered</td>
+ * <td><small>{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT),
+ * (1,TEXTURE2D_LAYERED_LAYERS) }</small></td>
+ * <td><small>{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT),
+ * (1,SURFACE2D_LAYERED_LAYERS) }</small></td></tr>
+ * <tr><td>Cubemap</td>
+ * <td><small>{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }</small></td>
+ * <td><small>{ (1,SURFACECUBEMAP_WIDTH),
+ * (1,SURFACECUBEMAP_WIDTH), 6 }</small></td></tr>
+ * <tr><td>Cubemap Layered</td>
+ * <td><small>{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH),
+ * (1,TEXTURECUBEMAP_LAYERED_LAYERS) }</small></td>
+ * <td><small>{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH),
+ * (1,SURFACECUBEMAP_LAYERED_LAYERS) }</small></td></tr>
+ * </table>
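+ *
+ * As an illustrative sketch (error checking omitted), a 512x512
+ * single-channel float mipmapped array with a full mip chain could be
+ * created and its base level retrieved as follows:
+ * \code
+   CUDA_ARRAY3D_DESCRIPTOR desc = {0};
+   desc.Width       = 512;
+   desc.Height      = 512;
+   desc.Depth       = 0;                  // zero Depth: a 2D mipmapped array
+   desc.Format      = CU_AD_FORMAT_FLOAT;
+   desc.NumChannels = 1;
+   desc.Flags       = 0;
+   CUmipmappedArray mipmap;
+   // 1 + floor(log2(max(512, 512, 0))) = 10 levels
+   cuMipmappedArrayCreate(&mipmap, &desc, 10);
+   CUarray level0;
+   cuMipmappedArrayGetLevel(&level0, mipmap, 0); // most detailed level
+ * \endcode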
+ * + * + * \param pHandle - Returned mipmapped array + * \param pMipmappedArrayDesc - mipmapped array descriptor + * \param numMipmapLevels - Number of mipmap levels + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuMipmappedArrayDestroy, + * ::cuMipmappedArrayGetLevel, + * ::cuArrayCreate, + * ::cudaMallocMipmappedArray + */ +CUresult CUDAAPI cuMipmappedArrayCreate(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc, unsigned int numMipmapLevels); + +/** + * \brief Gets a mipmap level of a CUDA mipmapped array + * + * Returns in \p *pLevelArray a CUDA array that represents a single mipmap level + * of the CUDA mipmapped array \p hMipmappedArray. + * + * If \p level is greater than the maximum number of levels in this mipmapped array, + * ::CUDA_ERROR_INVALID_VALUE is returned. + * + * \param pLevelArray - Returned mipmap level CUDA array + * \param hMipmappedArray - CUDA mipmapped array + * \param level - Mipmap level + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa + * ::cuMipmappedArrayCreate, + * ::cuMipmappedArrayDestroy, + * ::cuArrayCreate, + * ::cudaGetMipmappedArrayLevel + */ +CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level); + +/** + * \brief Destroys a CUDA mipmapped array + * + * Destroys the CUDA mipmapped array \p hMipmappedArray. + * + * \param hMipmappedArray - Mipmapped array to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ARRAY_IS_MAPPED + * \notefnerr + * + * \sa + * ::cuMipmappedArrayCreate, + * ::cuMipmappedArrayGetLevel, + * ::cuArrayCreate, + * ::cudaFreeMipmappedArray + */ +CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray); + +#endif /* __CUDA_API_VERSION >= 5000 */ + +/** @} */ /* END CUDA_MEM */ + +/** + * \defgroup CUDA_UNIFIED Unified Addressing + * + * ___MANBRIEF___ unified addressing functions of the low-level CUDA driver + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the unified addressing functions of the + * low-level CUDA driver application programming interface. + * + * @{ + * + * \section CUDA_UNIFIED_overview Overview + * + * CUDA devices can share a unified address space with the host. + * For these devices there is no distinction between a device + * pointer and a host pointer -- the same pointer value may be + * used to access memory from the host program and from a kernel + * running on the device (with exceptions enumerated below). + * + * \section CUDA_UNIFIED_support Supported Platforms + * + * Whether or not a device supports unified addressing may be + * queried by calling ::cuDeviceGetAttribute() with the device + * attribute ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. + * + * Unified addressing is automatically enabled in 64-bit processes + * + * \section CUDA_UNIFIED_lookup Looking Up Information from Pointer Values + * + * It is possible to look up information about the memory which backs a + * pointer value. 
For instance, one may want to know if a pointer points + * to host or device memory. As another example, in the case of device + * memory, one may want to know on which CUDA device the memory + * resides. These properties may be queried using the function + * ::cuPointerGetAttribute() + * + * Since pointers are unique, it is not necessary to specify information + * about the pointers specified to the various copy functions in the + * CUDA API. The function ::cuMemcpy() may be used to perform a copy + * between two pointers, ignoring whether they point to host or device + * memory (making ::cuMemcpyHtoD(), ::cuMemcpyDtoD(), and ::cuMemcpyDtoH() + * unnecessary for devices supporting unified addressing). For + * multidimensional copies, the memory type ::CU_MEMORYTYPE_UNIFIED may be + * used to specify that the CUDA driver should infer the location of the + * pointer from its value. + * + * \section CUDA_UNIFIED_automaphost Automatic Mapping of Host Allocated Host Memory + * + * All host memory allocated in all contexts using ::cuMemAllocHost() and + * ::cuMemHostAlloc() is always directly accessible from all contexts on + * all devices that support unified addressing. This is the case regardless + * of whether or not the flags ::CU_MEMHOSTALLOC_PORTABLE and + * ::CU_MEMHOSTALLOC_DEVICEMAP are specified. + * + * The pointer value through which allocated host memory may be accessed + * in kernels on all devices that support unified addressing is the same + * as the pointer value through which that memory is accessed on the host, + * so it is not necessary to call ::cuMemHostGetDevicePointer() to get the device + * pointer for these allocations. + * + * Note that this is not the case for memory allocated using the flag + * ::CU_MEMHOSTALLOC_WRITECOMBINED, as discussed below. + * + * \section CUDA_UNIFIED_autopeerregister Automatic Registration of Peer Memory + * + * Upon enabling direct access from a context that supports unified addressing + * to another peer context that supports unified addressing using + * ::cuCtxEnablePeerAccess() all memory allocated in the peer context using + * ::cuMemAlloc() and ::cuMemAllocPitch() will immediately be accessible + * by the current context. The device pointer value through + * which any peer memory may be accessed in the current context + * is the same pointer value through which that memory may be + * accessed in the peer context. + * + * \section CUDA_UNIFIED_exceptions Exceptions, Disjoint Addressing + * + * Not all memory may be accessed on devices through the same pointer + * value through which they are accessed on the host. These exceptions + * are host memory registered using ::cuMemHostRegister() and host memory + * allocated using the flag ::CU_MEMHOSTALLOC_WRITECOMBINED. For these + * exceptions, there exists a distinct host and device address for the + * memory. The device address is guaranteed to not overlap any valid host + * pointer range and is guaranteed to have the same value across all + * contexts that support unified addressing. + * + * This device address may be queried using ::cuMemHostGetDevicePointer() + * when a context using unified addressing is current. Either the host + * or the unified device pointer value may be used to refer to this memory + * through ::cuMemcpy() and similar functions using the + * ::CU_MEMORYTYPE_UNIFIED memory type. 
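+ *
+ * A minimal sketch of the lookup and copy behavior described above,
+ * assuming a CUdeviceptr \c ptr plus \c dst, \c src and \c bytes are in
+ * scope and the current context supports unified addressing:
+ * \code
+   // Where does this pointer live (host or device)?
+   unsigned int memType;
+   cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptr);
+
+   // One copy entry point, independent of direction:
+   cuMemcpy(dst, src, bytes); // stands in for cuMemcpyHtoD/DtoH/DtoD here
+ * \endcode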
+ *
+ */
+
+#if __CUDA_API_VERSION >= 4000
+/**
+ * \brief Returns information about a pointer
+ *
+ * The supported attributes are:
+ *
+ * - ::CU_POINTER_ATTRIBUTE_CONTEXT:
+ *
+ * Returns in \p *data the ::CUcontext in which \p ptr was allocated or
+ * registered.
+ * The type of \p data must be ::CUcontext *.
+ *
+ * If \p ptr was not allocated by, mapped by, or registered with
+ * a ::CUcontext which uses unified virtual addressing then
+ * ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE:
+ *
+ * Returns in \p *data the physical memory type of the memory that
+ * \p ptr addresses as a ::CUmemorytype enumerated value.
+ * The type of \p data must be unsigned int.
+ *
+ * If \p ptr addresses device memory then \p *data is set to
+ * ::CU_MEMORYTYPE_DEVICE. The particular ::CUdevice on which the
+ * memory resides is the ::CUdevice of the ::CUcontext returned by the
+ * ::CU_POINTER_ATTRIBUTE_CONTEXT attribute of \p ptr.
+ *
+ * If \p ptr addresses host memory then \p *data is set to
+ * ::CU_MEMORYTYPE_HOST.
+ *
+ * If \p ptr was not allocated by, mapped by, or registered with
+ * a ::CUcontext which uses unified virtual addressing then
+ * ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * If the current ::CUcontext does not support unified virtual
+ * addressing then ::CUDA_ERROR_INVALID_CONTEXT is returned.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER:
+ *
+ * Returns in \p *data the device pointer value through which
+ * \p ptr may be accessed by kernels running in the current
+ * ::CUcontext.
+ * The type of \p data must be CUdeviceptr *.
+ *
+ * If there exists no device pointer value through which
+ * kernels running in the current ::CUcontext may access
+ * \p ptr then ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * If there is no current ::CUcontext then
+ * ::CUDA_ERROR_INVALID_CONTEXT is returned.
+ *
+ * Except in the exceptional disjoint addressing cases discussed
+ * below, the value returned in \p *data will equal the input
+ * value \p ptr.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER:
+ *
+ * Returns in \p *data the host pointer value through which
+ * \p ptr may be accessed by the host program.
+ * The type of \p data must be void **.
+ * If there exists no host pointer value through which
+ * the host program may directly access \p ptr then
+ * ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * Except in the exceptional disjoint addressing cases discussed
+ * below, the value returned in \p *data will equal the input
+ * value \p ptr.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_P2P_TOKENS:
+ *
+ * Returns in \p *data two tokens for use with the nv-p2p.h Linux
+ * kernel interface. \p data must be a struct of type
+ * CUDA_POINTER_ATTRIBUTE_P2P_TOKENS.
+ *
+ * \p ptr must be a pointer to memory obtained from ::cuMemAlloc().
+ * Note that p2pToken and vaSpaceToken are only valid for the
+ * lifetime of the source allocation. A subsequent allocation at
+ * the same address may return completely different tokens.
+ * Querying this attribute has a side effect of setting the attribute
+ * ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS for the region of memory that
+ * \p ptr points to.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS:
+ *
+ * A boolean attribute which, when set, ensures that synchronous memory operations
+ * initiated on the region of memory that \p ptr points to will always synchronize.
+ * See further documentation in the section titled "API synchronization behavior" + * to learn more about cases when synchronous memory operations can + * exhibit asynchronous behavior. + * + * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID: + * + * Returns in \p *data a buffer ID which is guaranteed to be unique within the process. + * \p data must point to an unsigned long long. + * + * \p ptr must be a pointer to memory obtained from a CUDA memory allocation API. + * Every memory allocation from any of the CUDA memory allocation APIs will + * have a unique ID over a process lifetime. Subsequent allocations do not reuse IDs + * from previous freed allocations. IDs are only unique within a single process. + * + * + * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED: + * + * Returns in \p *data a boolean that indicates whether the pointer points to + * managed memory or not. + * + * \par + * + * Note that for most allocations in the unified virtual address space + * the host and device pointer for accessing the allocation will be the + * same. The exceptions to this are + * - user memory registered using ::cuMemHostRegister + * - host memory allocated using ::cuMemHostAlloc with the + * ::CU_MEMHOSTALLOC_WRITECOMBINED flag + * For these types of allocation there will exist separate, disjoint host + * and device addresses for accessing the allocation. In particular + * - The host address will correspond to an invalid unmapped device address + * (which will result in an exception if accessed from the device) + * - The device address will correspond to an invalid unmapped host address + * (which will result in an exception if accessed from the host). + * For these types of allocations, querying ::CU_POINTER_ATTRIBUTE_HOST_POINTER + * and ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER may be used to retrieve the host + * and device addresses from either address. + * + * \param data - Returned pointer attribute value + * \param attribute - Pointer attribute to query + * \param ptr - Pointer + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuPointerSetAttribute, + * ::cuMemAlloc, + * ::cuMemFree, + * ::cuMemAllocHost, + * ::cuMemFreeHost, + * ::cuMemHostAlloc, + * ::cuMemHostRegister, + * ::cuMemHostUnregister, + * ::cudaPointerGetAttributes + */ +CUresult CUDAAPI cuPointerGetAttribute(void *data, CUpointer_attribute attribute, CUdeviceptr ptr); +#endif /* __CUDA_API_VERSION >= 4000 */ + +#if __CUDA_API_VERSION >= 8000 +/** + * \brief Prefetches memory to the specified destination device + * + * Prefetches memory to the specified destination device. \p devPtr is the + * base device pointer of the memory to be prefetched and \p dstDevice is the + * destination device. \p count specifies the number of bytes to copy. \p hStream + * is the stream in which the operation is enqueued. The memory range must refer + * to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. + * + * Passing in CU_DEVICE_CPU for \p dstDevice will prefetch the data to host memory. If + * \p dstDevice is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS + * must be non-zero. Additionally, \p hStream must be associated with a device that has a + * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. 
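+ *
+ * For example (an illustrative sketch; the device handle \c dev, the stream
+ * \c hStream and the byte count \c N are assumptions, and error checking is
+ * omitted):
+ * \code
+   CUdeviceptr managed;
+   cuMemAllocManaged(&managed, N, CU_MEM_ATTACH_GLOBAL);
+   // Migrate the range to the GPU before kernels in hStream touch it...
+   cuMemPrefetchAsync(managed, N, dev, hStream);
+   // ...and bring it back to the host afterwards.
+   cuMemPrefetchAsync(managed, N, CU_DEVICE_CPU, hStream);
+   cuStreamSynchronize(hStream);
+ * \endcode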
+ *
+ * The start address and end address of the memory range will be rounded down and rounded up
+ * respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ * in the stream.
+ *
+ * If no physical memory has been allocated for this region, then this memory region
+ * will be populated and mapped on the destination device. If there's insufficient
+ * memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ * ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ * allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+ *
+ * By default, any mappings to the previous location of the migrated pages are removed and
+ * mappings for the new location are only set up on \p dstDevice. The exact behavior, however,
+ * also depends on the settings applied to this memory range via ::cuMemAdvise as described
+ * below:
+ *
+ * If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ * then that subset will create a read-only copy of the pages on \p dstDevice.
+ *
+ * If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ * range, then the pages will be migrated to \p dstDevice even if \p dstDevice is not the
+ * preferred location of any pages in the memory range.
+ *
+ * If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ * then mappings to those pages from all the appropriate processors are updated to
+ * refer to the new location if establishing such a mapping is possible. Otherwise,
+ * those mappings are cleared.
+ *
+ * Note that this API is not required for functionality and only serves to improve performance
+ * by allowing the application to migrate data to a suitable location before it is accessed.
+ * Memory accesses to this range are always coherent and are allowed even when the data is
+ * actively being migrated.
+ *
+ * Note that this function is asynchronous with respect to the host and all work
+ * on other devices.
+ *
+ * \param devPtr - Pointer to be prefetched
+ * \param count - Size in bytes
+ * \param dstDevice - Destination device to prefetch to
+ * \param hStream - Stream to enqueue prefetch operation
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync,
+ * ::cuMemcpy3DPeerAsync, ::cuMemAdvise,
+ * ::cudaMemPrefetchAsync
+ */
+CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream);
+
+/**
+ * \brief Advise about the usage of a given memory range
+ *
+ * Advise the Unified Memory subsystem about the usage pattern for the memory range
+ * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ * range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ * advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ * or declared via __managed__ variables.
+ *
+ * The \p advice parameter can take the following values:
+ * - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ * from and only occasionally written to. Any read accesses from any processor to this region will create a
+ * read-only copy of at least the accessed pages in that processor's memory.
Additionally, if ::cuMemPrefetchAsync + * is called on this region, it will create a read-only copy of the data on the destination processor. + * If any processor writes to this region, all copies of the corresponding page will be invalidated + * except for the one where the write occurred. The \p device argument is ignored for this advice. + * Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + * that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + * Also, if a context is created on a device that does not have the device attribute + * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + * all such contexts are destroyed. + * - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + * copies of the data will be collapsed into a single copy. The location for the collapsed + * copy will be the preferred location if the page has a preferred location and one of the read-duplicated + * copies was resident at that location. Otherwise, the location chosen is arbitrary. + * - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + * data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + * preferred location as host memory. If \p device is a GPU, then it must have a non-zero value for the + * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Setting the preferred location + * does not cause data to migrate to that location immediately. Instead, it guides the migration policy + * when a fault occurs on that memory region. If the data is already in its preferred location and the + * faulting processor can establish a mapping without requiring the data to be migrated, then + * data migration will be avoided. On the other hand, if the data is not in its preferred location + * or if a direct mapping cannot be established, then it will be migrated to the processor accessing + * it. It is important to note that setting the preferred location does not prevent data prefetching + * done using ::cuMemPrefetchAsync. + * Having a preferred location can override the page thrash detection and resolution logic in the Unified + * Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device + * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + * if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + * policies associated with that advice will override the policies of this advice. + * - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + * and changes the preferred location to none. + * - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + * Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. If \p device is a GPU, then + * the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + * This advice does not cause data migration and has no impact on the location of the data per se. 
Instead, + * it causes the data to always be mapped in the specified processor's page tables, as long as the + * location of the data permits a mapping to be established. If the data gets migrated for any reason, + * the mappings are updated accordingly. + * This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + * Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + * data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + * over to the other GPUs is not as important because the accesses are infrequent and the overhead of + * migration may be too high. But preventing faults can still help improve performance, and so having + * a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + * to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + * ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + * page in host memory. + * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + * policies associated with that advice will override the policies of this advice. Additionally, if the + * preferred location of this memory region or any subset of it is also \p device, then the policies + * associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + * - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + * the data from \p device may be removed at any time causing accesses to result in non-fatal page faults. + * + * \param devPtr - Pointer to memory to set the advice for + * \param count - Size in bytes of the memory range + * \param advice - Advice to be applied for the specified memory range + * \param device - Device to apply the advice for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync, + * ::cuMemcpy3DPeerAsync, ::cuMemPrefetchAsync, + * ::cudaMemAdvise + */ +CUresult CUDAAPI cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUdevice device); + +/** + * \brief Query an attribute of a given memory range + * + * Query an attribute about the memory range starting at \p devPtr with a size of \p count bytes. The + * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via + * __managed__ variables. + * + * The \p attribute parameter can take the following values: + * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY: If this attribute is specified, \p data will be interpreted + * as a 32-bit integer, and \p dataSize must be 4. The result returned will be 1 if all pages in the given + * memory range have read-duplication enabled, or 0 otherwise. + * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION: If this attribute is specified, \p data will be + * interpreted as a 32-bit integer, and \p dataSize must be 4. 
The result returned will be a GPU device
+ * id if all pages in the memory range have that GPU as their preferred location, or it will be CU_DEVICE_CPU
+ * if all pages in the memory range have the CPU as their preferred location, or it will be CU_DEVICE_INVALID
+ * if either all the pages don't have the same preferred location or some of the pages don't have a
+ * preferred location at all. Note that the actual location of the pages in the memory range at the time of
+ * the query may be different from the preferred location.
+ * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY: If this attribute is specified, \p data will be interpreted
+ * as an array of 32-bit integers, and \p dataSize must be a non-zero multiple of 4. The result returned
+ * will be a list of device ids that had ::CU_MEM_ADVISE_SET_ACCESSED_BY set for that entire memory range.
+ * If any device does not have that advice set for the entire memory range, that device will not be included.
+ * If \p data is larger than the number of devices that have that advice set for that memory range,
+ * CU_DEVICE_INVALID will be returned in all the extra space provided. For ex., if \p dataSize is 12
+ * (i.e. \p data has 3 elements) and only device 0 has the advice set, then the result returned will be
+ * { 0, CU_DEVICE_INVALID, CU_DEVICE_INVALID }. If \p data is smaller than the number of devices that have
+ * that advice set, then only as many devices will be returned as can fit in the array. There is no
+ * guarantee on which specific devices will be returned, however.
+ * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION: If this attribute is specified, \p data will be
+ * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be the last location
+ * to which all pages in the memory range were prefetched explicitly via ::cuMemPrefetchAsync. This will either be
+ * a GPU id or CU_DEVICE_CPU depending on whether the last location for prefetch was a GPU or the CPU
+ * respectively. If any page in the memory range was never explicitly prefetched or if all pages were not
+ * prefetched to the same location, CU_DEVICE_INVALID will be returned. Note that this simply returns the
+ * last location that the application requested to prefetch the memory range to. It gives no indication as to
+ * whether the prefetch operation to that location has completed or even begun.
+ *
+ * \param data - A pointer to a memory location where the result
+ * of the attribute query will be written to.
+ * \param dataSize - Size in bytes of the memory pointed to by \p data
+ * \param attribute - The attribute to query
+ * \param devPtr - Start of the range to query
+ * \param count - Size of the range to query
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuMemRangeGetAttributes, ::cuMemPrefetchAsync,
+ * ::cuMemAdvise,
+ * ::cudaMemRangeGetAttribute
+ */
+CUresult CUDAAPI cuMemRangeGetAttribute(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr devPtr, size_t count);
+
+/**
+ * \brief Query attributes of a given memory range.
+ *
+ * Query attributes of the memory range starting at \p devPtr with a size of \p count bytes. The
+ * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via
+ * __managed__ variables. The \p attributes array will be interpreted to have \p numAttributes
+ * entries. The \p dataSizes array will also be interpreted to have \p numAttributes entries.
+ *
+ * The results of the query will be stored in \p data.
+ *
+ * The list of supported attributes is given below. Please refer to ::cuMemRangeGetAttribute for
+ * attribute descriptions and restrictions.
+ *
+ * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY
+ * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION
+ * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY
+ * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION
+ *
+ * \param data - A two-dimensional array containing pointers to memory
+ * locations where the result of each attribute query will be written to.
+ * \param dataSizes - Array containing the sizes of each result
+ * \param attributes - An array of attributes to query
+ * (numAttributes and the number of attributes in this array should match)
+ * \param numAttributes - Number of attributes to query
+ * \param devPtr - Start of the range to query
+ * \param count - Size of the range to query
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa ::cuMemRangeGetAttribute, ::cuMemAdvise,
+ * ::cuMemPrefetchAsync,
+ * ::cudaMemRangeGetAttributes
+ */
+CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr devPtr, size_t count);
+#endif /* __CUDA_API_VERSION >= 8000 */
+
+#if __CUDA_API_VERSION >= 6000
+/**
+ * \brief Set attributes on a previously allocated memory region
+ *
+ * The supported attributes are:
+ *
+ * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS:
+ *
+ * A boolean attribute that can either be set (1) or unset (0). When set,
+ * the region of memory that \p ptr points to is guaranteed to always synchronize
+ * memory operations that are synchronous. If there are some previously initiated
+ * synchronous memory operations that are pending when this attribute is set, the
+ * function does not return until those memory operations are complete.
+ * See further documentation in the section titled "API synchronization behavior"
+ * to learn more about cases when synchronous memory operations can
+ * exhibit asynchronous behavior.
+ * \p value will be considered as a pointer to an unsigned integer to which this attribute is to be set.
+ *
+ * \param value - Pointer to memory containing the value to be set
+ * \param attribute - Pointer attribute to set
+ * \param ptr - Pointer to a memory region allocated using CUDA memory allocation APIs
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa ::cuPointerGetAttribute,
+ * ::cuPointerGetAttributes,
+ * ::cuMemAlloc,
+ * ::cuMemFree,
+ * ::cuMemAllocHost,
+ * ::cuMemFreeHost,
+ * ::cuMemHostAlloc,
+ * ::cuMemHostRegister,
+ * ::cuMemHostUnregister
+ */
+CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute attribute, CUdeviceptr ptr);
+#endif /* __CUDA_API_VERSION >= 6000 */
+
+#if __CUDA_API_VERSION >= 7000
+/**
+ * \brief Returns information about a pointer.
+ * + * The supported attributes are (refer to ::cuPointerGetAttribute for attribute descriptions and restrictions): + * + * - ::CU_POINTER_ATTRIBUTE_CONTEXT + * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE + * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER + * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER + * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS + * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID + * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED + * + * \param numAttributes - Number of attributes to query + * \param attributes - An array of attributes to query + * (numAttributes and the number of attributes in this array should match) + * \param data - A two-dimensional array containing pointers to memory + * locations where the result of each attribute query will be written to. + * \param ptr - Pointer to query + * + * Unlike ::cuPointerGetAttribute, this function will not return an error when the \p ptr + * encountered is not a valid CUDA pointer. Instead, the attributes are assigned default NULL values + * and CUDA_SUCCESS is returned. + * + * If \p ptr was not allocated by, mapped by, or registered with a ::CUcontext which uses UVA + * (Unified Virtual Addressing), ::CUDA_ERROR_INVALID_CONTEXT is returned. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuPointerGetAttribute, + * ::cuPointerSetAttribute, + * ::cudaPointerGetAttributes + */ +CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr ptr); +#endif /* __CUDA_API_VERSION >= 7000 */ + +/** @} */ /* END CUDA_UNIFIED */ + +/** + * \defgroup CUDA_STREAM Stream Management + * + * ___MANBRIEF___ stream management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the stream management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Create a stream + * + * Creates a stream and returns a handle in \p phStream. The \p Flags argument + * determines behaviors of the stream. Valid values for \p Flags are: + * - ::CU_STREAM_DEFAULT: Default stream creation flag. + * - ::CU_STREAM_NON_BLOCKING: Specifies that work running in the created + * stream may run concurrently with work in stream 0 (the NULL stream), and that + * the created stream should perform no implicit synchronization with stream 0. + * + * \param phStream - Returned newly created stream + * \param Flags - Parameters for stream creation + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreateWithPriority, + * ::cuStreamGetPriority, + * ::cuStreamGetFlags, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cudaStreamCreate, + * ::cudaStreamCreateWithFlags + */ +CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags); + +/** + * \brief Create a stream with the given priority + * + * Creates a stream with the specified priority and returns a handle in \p phStream. + * This API alters the scheduler priority of work in the stream. Work in a higher + * priority stream may preempt work already executing in a low priority stream. 
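+ *
+ * A minimal sketch of the usual pattern (illustrative only; error checking
+ * omitted, and the priority range query is described below):
+ * \code
+   int least, greatest; // lower numbers mean higher priority
+   cuCtxGetStreamPriorityRange(&least, &greatest);
+   CUstream hStream;
+   cuStreamCreateWithPriority(&hStream, CU_STREAM_NON_BLOCKING, greatest);
+ * \endcode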
+ * + * \p priority follows a convention where lower numbers represent higher priorities. + * '0' represents default priority. The range of meaningful numerical priorities can + * be queried using ::cuCtxGetStreamPriorityRange. If the specified priority is + * outside the numerical range returned by ::cuCtxGetStreamPriorityRange, + * it will automatically be clamped to the lowest or the highest number in the range. + * + * \param phStream - Returned newly created stream + * \param flags - Flags for stream creation. See ::cuStreamCreate for a list of + * valid flags + * \param priority - Stream priority. Lower numbers represent higher priorities. + * See ::cuCtxGetStreamPriorityRange for more information about + * meaningful stream priorities that can be passed. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \note Stream priorities are supported only on GPUs + * with compute capability 3.5 or higher. + * + * \note In the current implementation, only compute kernels launched in + * priority streams are affected by the stream's priority. Stream priorities have + * no effect on host-to-device and device-to-host memory operations. + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreate, + * ::cuStreamGetPriority, + * ::cuCtxGetStreamPriorityRange, + * ::cuStreamGetFlags, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cudaStreamCreateWithPriority + */ +CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream, unsigned int flags, int priority); + + +/** + * \brief Query the priority of a given stream + * + * Query the priority of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority + * and return the priority in \p priority. Note that if the stream was created with a + * priority outside the numerical range returned by ::cuCtxGetStreamPriorityRange, + * this function returns the clamped priority. + * See ::cuStreamCreateWithPriority for details about priority clamping. + * + * \param hStream - Handle to the stream to be queried + * \param priority - Pointer to a signed integer in which the stream's priority is returned + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreate, + * ::cuStreamCreateWithPriority, + * ::cuCtxGetStreamPriorityRange, + * ::cuStreamGetFlags, + * ::cudaStreamGetPriority + */ +CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); + +/** + * \brief Query the flags of a given stream + * + * Query the flags of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority + * and return the flags in \p flags. + * + * \param hStream - Handle to the stream to be queried + * \param flags - Pointer to an unsigned integer in which the stream's flags are returned + * The value returned in \p flags is a logical 'OR' of all flags that + * were used while creating this stream. 
See ::cuStreamCreate for the list + * of valid flags + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreate, + * ::cuStreamGetPriority, + * ::cudaStreamGetFlags + */ +CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); + + +/** + * \brief Make a compute stream wait on an event + * + * Makes all future work submitted to \p hStream wait until \p hEvent + * reports completion before beginning execution. This synchronization + * will be performed efficiently on the device. The event \p hEvent may + * be from a different context than \p hStream, in which case this function + * will perform cross-device synchronization. + * + * The stream \p hStream will wait only for the completion of the most recent + * host call to ::cuEventRecord() on \p hEvent. Once this call has returned, + * any functions (including ::cuEventRecord() and ::cuEventDestroy()) may be + * called on \p hEvent again, and subsequent calls will not have any + * effect on \p hStream. + * + * If ::cuEventRecord() has not been called on \p hEvent, this call acts as if + * the record has already completed, and so is a functional no-op. + * + * \param hStream - Stream to wait + * \param hEvent - Event to wait on (may not be NULL) + * \param Flags - Parameters for the operation (must be 0) + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuEventRecord, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cuStreamDestroy, + * ::cudaStreamWaitEvent + */ +CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); + +/** + * \brief Add a callback to a compute stream + * + * Adds a callback to be called on the host after all currently enqueued + * items in the stream have completed. For each + * cuStreamAddCallback call, the callback will be executed exactly once. + * The callback will block later work in the stream until it is finished. + * + * The callback may be passed ::CUDA_SUCCESS or an error code. In the event + * of a device error, all subsequently executed callbacks will receive an + * appropriate ::CUresult. + * + * Callbacks must not make any CUDA API calls. Attempting to use a CUDA API + * will result in ::CUDA_ERROR_NOT_PERMITTED. Callbacks must not perform any + * synchronization that may depend on outstanding device work or other callbacks + * that are not mandated to run earlier. Callbacks without a mandated order + * (in independent streams) execute in undefined order and may be serialized. + * + * For the purposes of Unified Memory, callback execution makes a number of + * guarantees: + *
+ * - The callback stream is considered idle for the duration of the
+ *   callback. Thus, for example, a callback may always use memory attached
+ *   to the callback stream.
+ * - The start of execution of a callback has the same effect as
+ *   synchronizing an event recorded in the same stream immediately prior to
+ *   the callback. It thus synchronizes streams which have been "joined"
+ *   prior to the callback.
+ * - Adding device work to any stream does not have the effect of making
+ *   the stream active until all preceding callbacks have executed. Thus, for
+ *   example, a callback might use global attached memory even if work has
+ *   been added to another stream, if it has been properly ordered with an
+ *   event.
+ * - Completion of a callback does not cause a stream to become
+ *   active except as described above. The callback stream will remain idle
+ *   if no device work follows the callback, and will remain idle across
+ *   consecutive callbacks without device work in between. Thus, for example,
+ *   stream synchronization can be done by signaling from a callback at the
+ *   end of the stream.
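+ *
+ * A minimal host-callback sketch (illustrative; the signature follows
+ * ::CUstreamCallback, and \c hStream is assumed to exist):
+ * \code
+   void CUDA_CB myCallback(CUstream hStream, CUresult status, void *userData)
+   {
+       // No CUDA API calls are permitted in here.
+       if (status == CUDA_SUCCESS) { /* preceding stream work finished */ }
+   }
+   ...
+   cuStreamAddCallback(hStream, myCallback, NULL, 0);
+ * \endcode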
+ * + * \param hStream - Stream to add callback to + * \param callback - The function to call once preceding stream operations are complete + * \param userData - User specified data to be passed to the callback function + * \param flags - Reserved for future use, must be 0 + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamWaitEvent, + * ::cuStreamDestroy, + * ::cuMemAllocManaged, + * ::cuStreamAttachMemAsync, + * ::cudaStreamAddCallback + */ +CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); + +#if __CUDA_API_VERSION >= 6000 + +/** + * \brief Attach memory to a stream asynchronously + * + * Enqueues an operation in \p hStream to specify stream association of + * \p length bytes of memory starting from \p dptr. This function is a + * stream-ordered operation, meaning that it is dependent on, and will + * only take effect when, previous work in stream has completed. Any + * previous association is automatically replaced. + * + * \p dptr must point to an address within managed memory space declared + * using the __managed__ keyword or allocated with ::cuMemAllocManaged. + * + * \p length must be zero, to indicate that the entire allocation's + * stream association is being changed. Currently, it's not possible + * to change stream association for a portion of an allocation. + * + * The stream association is specified using \p flags which must be + * one of ::CUmemAttach_flags. + * If the ::CU_MEM_ATTACH_GLOBAL flag is specified, the memory can be accessed + * by any stream on any device. + * If the ::CU_MEM_ATTACH_HOST flag is specified, the program makes a guarantee + * that it won't access the memory on the device from any stream on a device that + * has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + * If the ::CU_MEM_ATTACH_SINGLE flag is specified and \p hStream is associated with + * a device that has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, + * the program makes a guarantee that it will only access the memory on the device + * from \p hStream. It is illegal to attach singly to the NULL stream, because the + * NULL stream is a virtual global stream and not a specific stream. An error will + * be returned in this case. + * + * When memory is associated with a single stream, the Unified Memory system will + * allow CPU access to this memory region so long as all operations in \p hStream + * have completed, regardless of whether other streams are active. In effect, + * this constrains exclusive ownership of the managed memory region by + * an active GPU to per-stream activity instead of whole-GPU activity. + * + * Accessing memory on the device from streams that are not associated with + * it will produce undefined results. No error checking is performed by the + * Unified Memory system to ensure that kernels launched into other streams + * do not access this region. + * + * It is a program's responsibility to order calls to ::cuStreamAttachMemAsync + * via events, synchronization or other means to ensure legal access to memory + * at all times. 
Data visibility and coherency will be changed appropriately + * for all kernels which follow a stream-association change. + * + * If \p hStream is destroyed while data is associated with it, the association is + * removed and the association reverts to the default visibility of the allocation + * as specified at ::cuMemAllocManaged. For __managed__ variables, the default + * association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a stream is an + * asynchronous operation, and as a result, the change to default association won't + * happen until all work in the stream has completed. + * + * \param hStream - Stream in which to enqueue the attach operation + * \param dptr - Pointer to memory (must be a pointer to managed memory) + * \param length - Length of memory (must be zero) + * \param flags - Must be one of ::CUmemAttach_flags + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamWaitEvent, + * ::cuStreamDestroy, + * ::cuMemAllocManaged, + * ::cudaStreamAttachMemAsync + */ +CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags); + +#endif /* __CUDA_API_VERSION >= 6000 */ + +/** + * \brief Determine status of a compute stream + * + * Returns ::CUDA_SUCCESS if all operations in the stream specified by + * \p hStream have completed, or ::CUDA_ERROR_NOT_READY if not. + * + * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS + * is equivalent to having called ::cuStreamSynchronize(). + * + * \param hStream - Stream to query status of + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_READY + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamWaitEvent, + * ::cuStreamDestroy, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cudaStreamQuery + */ +CUresult CUDAAPI cuStreamQuery(CUstream hStream); + +/** + * \brief Wait until a stream's tasks are completed + * + * Waits until the device has completed all operations in the stream specified + * by \p hStream. If the context was created with the + * ::CU_CTX_SCHED_BLOCKING_SYNC flag, the CPU thread will block until the + * stream is finished with all of its tasks. + * + * \param hStream - Stream to wait for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamDestroy, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamAddCallback, + * ::cudaStreamSynchronize + */ +CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Destroys a stream + * + * Destroys the stream specified by \p hStream. + * + * In case the device is still doing work in the stream \p hStream + * when ::cuStreamDestroy() is called, the function will return immediately + * and the resources associated with \p hStream will be released automatically + * once the device has completed all work in \p hStream. 
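+ *
+ * For example, the following teardown sketch is legal (illustrative only;
+ * error checking omitted):
+ * \code
+   CUstream hStream;
+   cuStreamCreate(&hStream, CU_STREAM_DEFAULT);
+   // ... enqueue kernels or async copies into hStream ...
+   cuStreamDestroy(hStream); // returns at once; freed when the work drains
+ * \endcode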
+ * + * \param hStream - Stream to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cudaStreamDestroy + */ +CUresult CUDAAPI cuStreamDestroy(CUstream hStream); +#endif /* __CUDA_API_VERSION >= 4000 */ + +/** @} */ /* END CUDA_STREAM */ + + +/** + * \defgroup CUDA_EVENT Event Management + * + * ___MANBRIEF___ event management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the event management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Creates an event + * + * Creates an event *phEvent with the flags specified via \p Flags. Valid flags + * include: + * - ::CU_EVENT_DEFAULT: Default event creation flag. + * - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking + * synchronization. A CPU thread that uses ::cuEventSynchronize() to wait on + * an event created with this flag will block until the event has actually + * been recorded. + * - ::CU_EVENT_DISABLE_TIMING: Specifies that the created event does not need + * to record timing data. Events created with this flag specified and + * the ::CU_EVENT_BLOCKING_SYNC flag not specified will provide the best + * performance when used with ::cuStreamWaitEvent() and ::cuEventQuery(). + * - ::CU_EVENT_INTERPROCESS: Specifies that the created event may be used as an + * interprocess event by ::cuIpcGetEventHandle(). ::CU_EVENT_INTERPROCESS must + * be specified along with ::CU_EVENT_DISABLE_TIMING. + * + * \param phEvent - Returns newly created event + * \param Flags - Event creation flags + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa + * ::cuEventRecord, + * ::cuEventQuery, + * ::cuEventSynchronize, + * ::cuEventDestroy, + * ::cuEventElapsedTime, + * ::cudaEventCreate, + * ::cudaEventCreateWithFlags + */ +CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags); + +/** + * \brief Records an event + * + * Records an event. See note on NULL stream behavior. Since operation is + * asynchronous, ::cuEventQuery or ::cuEventSynchronize() must be used + * to determine when the event has actually been recorded. + * + * If ::cuEventRecord() has previously been called on \p hEvent, then this + * call will overwrite any existing state in \p hEvent. Any subsequent calls + * which examine the status of \p hEvent will only examine the completion of + * this most recent call to ::cuEventRecord(). + * + * It is necessary that \p hEvent and \p hStream be created on the same context. 
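+ *
+ * For example (a hedged sketch; \c hStream is assumed to exist in the same
+ * context as the event, and error checking is omitted):
+ * \code
+   CUevent hEvent;
+   cuEventCreate(&hEvent, CU_EVENT_DEFAULT);
+   // ... enqueue work into hStream ...
+   cuEventRecord(hEvent, hStream);
+   cuEventSynchronize(hEvent); // or poll with cuEventQuery(hEvent)
+ * \endcode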
+ * + * \param hEvent - Event to record + * \param hStream - Stream to record event for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \note_null_stream + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventQuery, + * ::cuEventSynchronize, + * ::cuStreamWaitEvent, + * ::cuEventDestroy, + * ::cuEventElapsedTime, + * ::cudaEventRecord + */ +CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); + +/** + * \brief Queries an event's status + * + * Query the status of all device work preceding the most recent + * call to ::cuEventRecord() (in the appropriate compute streams, + * as specified by the arguments to ::cuEventRecord()). + * + * If this work has successfully been completed by the device, or if + * ::cuEventRecord() has not been called on \p hEvent, then ::CUDA_SUCCESS is + * returned. If this work has not yet been completed by the device then + * ::CUDA_ERROR_NOT_READY is returned. + * + * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS + * is equivalent to having called ::cuEventSynchronize(). + * + * \param hEvent - Event to query + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_READY + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventRecord, + * ::cuEventSynchronize, + * ::cuEventDestroy, + * ::cuEventElapsedTime, + * ::cudaEventQuery + */ +CUresult CUDAAPI cuEventQuery(CUevent hEvent); + +/** + * \brief Waits for an event to complete + * + * Wait until the completion of all device work preceding the most recent + * call to ::cuEventRecord() (in the appropriate compute streams, as specified + * by the arguments to ::cuEventRecord()). + * + * If ::cuEventRecord() has not been called on \p hEvent, ::CUDA_SUCCESS is + * returned immediately. + * + * Waiting for an event that was created with the ::CU_EVENT_BLOCKING_SYNC + * flag will cause the calling CPU thread to block until the event has + * been completed by the device. If the ::CU_EVENT_BLOCKING_SYNC flag has + * not been set, then the CPU thread will busy-wait until the event has + * been completed by the device. + * + * \param hEvent - Event to wait for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventRecord, + * ::cuEventQuery, + * ::cuEventDestroy, + * ::cuEventElapsedTime, + * ::cudaEventSynchronize + */ +CUresult CUDAAPI cuEventSynchronize(CUevent hEvent); + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Destroys an event + * + * Destroys the event specified by \p hEvent. + * + * In case \p hEvent has been recorded but has not yet been completed + * when ::cuEventDestroy() is called, the function will return immediately and + * the resources associated with \p hEvent will be released automatically once + * the device has completed \p hEvent. 
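+ *
+ * For example, the following is legal (illustrative sketch):
+ * \code
+   cuEventRecord(hEvent, hStream);
+   cuEventDestroy(hEvent); // returns immediately; released on completion
+ * \endcode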
+ * + * \param hEvent - Event to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventRecord, + * ::cuEventQuery, + * ::cuEventSynchronize, + * ::cuEventElapsedTime, + * ::cudaEventDestroy + */ +CUresult CUDAAPI cuEventDestroy(CUevent hEvent); +#endif /* __CUDA_API_VERSION >= 4000 */ + +/** + * \brief Computes the elapsed time between two events + * + * Computes the elapsed time between two events (in milliseconds with a + * resolution of around 0.5 microseconds). + * + * If either event was last recorded in a non-NULL stream, the resulting time + * may be greater than expected (even if both used the same stream handle). This + * happens because the ::cuEventRecord() operation takes place asynchronously + * and there is no guarantee that the measured latency is actually just between + * the two events. Any number of other different stream operations could execute + * in between the two measured events, thus altering the timing in a significant + * way. + * + * If ::cuEventRecord() has not been called on either event then + * ::CUDA_ERROR_INVALID_HANDLE is returned. If ::cuEventRecord() has been called + * on both events but one or both of them has not yet been completed (that is, + * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY on at least one of the + * events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with + * the ::CU_EVENT_DISABLE_TIMING flag, then this function will return + * ::CUDA_ERROR_INVALID_HANDLE. + * + * \param pMilliseconds - Time between \p hStart and \p hEnd in ms + * \param hStart - Starting event + * \param hEnd - Ending event + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_READY + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventRecord, + * ::cuEventQuery, + * ::cuEventSynchronize, + * ::cuEventDestroy, + * ::cudaEventElapsedTime + */ +CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd); + +#if __CUDA_API_VERSION >= 8000 +/** + * \brief Wait on a memory location + * + * Enqueues a synchronization of the stream on the given memory location. Work + * ordered after the operation will block until the given condition on the + * memory is satisfied. By default, the condition is to wait for + * (int32_t)(*addr - value) >= 0, a cyclic greater-or-equal. + * Other condition types can be specified via \p flags. + * + * If the memory was registered via ::cuMemHostRegister(), the device pointer + * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot + * be used with managed memory (::cuMemAllocManaged). + * + * Support for this can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. The only requirement for basic + * support is that on Windows, a device must be in TCC mode. + * + * \param stream The stream to synchronize on the memory location. + * \param addr The memory location to wait on. + * \param value The value to compare with the memory location. + * \param flags See ::CUstreamWaitValue_flags. 
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuStreamWaitValue64,
+ * ::cuStreamWriteValue32,
+ * ::cuStreamWriteValue64,
+ * ::cuStreamBatchMemOp,
+ * ::cuMemHostRegister,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+
+/**
+ * \brief Wait on a memory location
+ *
+ * Enqueues a synchronization of the stream on the given memory location. Work
+ * ordered after the operation will block until the given condition on the
+ * memory is satisfied. By default, the condition is to wait for
+ * (int64_t)(*addr - value) >= 0, a cyclic greater-or-equal.
+ * Other condition types can be specified via \p flags.
+ *
+ * If the memory was registered via ::cuMemHostRegister(), the device pointer
+ * should be obtained with ::cuMemHostGetDevicePointer().
+ *
+ * Support for this can be queried with ::cuDeviceGetAttribute() and
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. The requirements are
+ * compute capability 7.0 or greater, and on Windows, that the device be in
+ * TCC mode.
+ *
+ * \param stream The stream to synchronize on the memory location.
+ * \param addr The memory location to wait on.
+ * \param value The value to compare with the memory location.
+ * \param flags See ::CUstreamWaitValue_flags.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuStreamWaitValue32,
+ * ::cuStreamWriteValue32,
+ * ::cuStreamWriteValue64,
+ * ::cuStreamBatchMemOp,
+ * ::cuMemHostRegister,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
+
+/**
+ * \brief Write a value to memory
+ *
+ * Write a value to memory. Unless the ::CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER
+ * flag is passed, the write is preceded by a system-wide memory fence,
+ * equivalent to a __threadfence_system() but scoped to the stream
+ * rather than a CUDA thread.
+ *
+ * If the memory was registered via ::cuMemHostRegister(), the device pointer
+ * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot
+ * be used with managed memory (::cuMemAllocManaged).
+ *
+ * Support for this can be queried with ::cuDeviceGetAttribute() and
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. The only requirement for basic
+ * support is that on Windows, a device must be in TCC mode.
+ *
+ * \param stream The stream to do the write in.
+ * \param addr The device address to write to.
+ * \param value The value to write.
+ * \param flags See ::CUstreamWriteValue_flags.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuStreamWriteValue64,
+ * ::cuStreamWaitValue32,
+ * ::cuStreamWaitValue64,
+ * ::cuStreamBatchMemOp,
+ * ::cuMemHostRegister,
+ * ::cuEventRecord
+ */
+CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+
+/**
+ * \brief Write a value to memory
+ *
+ * Write a value to memory. Unless the ::CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER
+ * flag is passed, the write is preceded by a system-wide memory fence,
+ * equivalent to a __threadfence_system() but scoped to the stream
+ * rather than a CUDA thread.
+ * + * If the memory was registered via ::cuMemHostRegister(), the device pointer + * should be obtained with ::cuMemHostGetDevicePointer(). + * + * Support for this can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. The requirements are + * compute capability 7.0 or greater, and on Windows, that the device be in + * TCC mode. + * + * \param stream The stream to do the write in. + * \param addr The device address to write to. + * \param value The value to write. + * \param flags See ::CUstreamWriteValue_flags. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuStreamWriteValue32, + * ::cuStreamWaitValue32, + * ::cuStreamWaitValue64, + * ::cuStreamBatchMemOp, + * ::cuMemHostRegister, + * ::cuEventRecord + */ +CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); + +/** + * \brief Batch operations to synchronize the stream via memory operations + * + * This is a batch version of ::cuStreamWaitValue32() and ::cuStreamWriteValue32(). + * Batching operations may avoid some performance overhead in both the API call + * and the device execution versus adding them to the stream in separate API + * calls. The operations are enqueued in the order they appear in the array. + * + * See ::CUstreamBatchMemOpType for the full set of supported operations, and + * ::cuStreamWaitValue32(), ::cuStreamWaitValue64(), ::cuStreamWriteValue32(), + * and ::cuStreamWriteValue64() for details of specific operations. + * + * Basic support for this can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. See related APIs for details + * on querying support for specific operations. + * + * \param stream The stream to enqueue the operations in. + * \param count The number of operations in the array. Must be less than 256. + * \param paramArray The types and parameters of the individual operations. + * \param flags Reserved for future expansion; must be 0. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuStreamWaitValue32, + * ::cuStreamWaitValue64, + * ::cuStreamWriteValue32, + * ::cuStreamWriteValue64, + * ::cuMemHostRegister + */ +CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); +#endif /* __CUDA_API_VERSION >= 8000 */ + +/** @} */ /* END CUDA_EVENT */ + +/** + * \defgroup CUDA_EXEC Execution Control + * + * ___MANBRIEF___ execution control functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the execution control functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns information about a function + * + * Returns in \p *pi the integer value of the attribute \p attrib on the kernel + * given by \p hfunc. The supported attributes are: + * - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The maximum number of threads + * per block, beyond which a launch of the function would fail. This number + * depends on both the function and the device on which the function is + * currently loaded. + * - ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of + * statically-allocated shared memory per block required by this function. 
+ * This does not include dynamically-allocated shared memory requested by
+ * the user at runtime.
+ * - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of user-allocated
+ * constant memory required by this function.
+ * - ::CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of local memory
+ * used by each thread of this function.
+ * - ::CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each thread
+ * of this function.
+ * - ::CU_FUNC_ATTRIBUTE_PTX_VERSION: The PTX virtual architecture version for
+ * which the function was compiled. This value is the major PTX version * 10
+ * + the minor PTX version, so a PTX version 1.3 function would return the
+ * value 13. Note that this may return the undefined value of 0 for cubins
+ * compiled prior to CUDA 3.0.
+ * - ::CU_FUNC_ATTRIBUTE_BINARY_VERSION: The binary architecture version for
+ * which the function was compiled. This value is the major binary
+ * version * 10 + the minor binary version, so a binary version 1.3 function
+ * would return the value 13. Note that this will return a value of 10 for
+ * legacy cubins that do not have a properly-encoded binary architecture
+ * version.
+ * - ::CU_FUNC_ATTRIBUTE_CACHE_MODE_CA: The attribute to indicate whether the function has
+ * been compiled with the user-specified option "-Xptxas --dlcm=ca" set.
+ * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of
+ * dynamically-allocated shared memory.
+ * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: Preferred shared memory-L1
+ * cache split ratio in percent of shared memory.
+ *
+ * \param pi - Returned attribute value
+ * \param attrib - Attribute requested
+ * \param hfunc - Function to query attribute of
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cuLaunchKernel,
+ * ::cudaFuncGetAttributes,
+ * ::cudaFuncSetAttribute
+ */
+CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
+
+#if __CUDA_API_VERSION >= 9000
+
+/**
+ * \brief Sets information about a function
+ *
+ * This call sets the value of a specified attribute \p attrib on the kernel given
+ * by \p hfunc to an integer value specified by \p value.
+ * This function returns CUDA_SUCCESS if the new value of the attribute could be
+ * successfully set. If the set fails, this call will return an error.
+ * Not all attributes can have values set. Attempting to set a value on a read-only
+ * attribute will result in an error (CUDA_ERROR_INVALID_VALUE).
+ *
+ * Supported attributes for the cuFuncSetAttribute call are:
+ * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of
+ * dynamically-allocated shared memory. The value should contain the requested
+ * maximum size of dynamically-allocated shared memory. The sum of this value and
+ * the function attribute ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES cannot exceed the
+ * device attribute ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN.
+ * The maximal size of requestable dynamic shared memory may differ by GPU
+ * architecture.
+ * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: On devices where the L1
+ * cache and shared memory use the same hardware resources, this sets the shared memory
+ * carveout preference, in percent of the total resources.
This is only a hint, and the
+ * driver can choose a different ratio if required to execute the function.
+ *
+ * \param hfunc - Function to set attribute of
+ * \param attrib - Attribute to set
+ * \param value - The value to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cuLaunchKernel,
+ * ::cudaFuncGetAttributes,
+ * ::cudaFuncSetAttribute
+ */
+CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value);
+#endif // __CUDA_API_VERSION >= 9000
+
+/**
+ * \brief Sets the preferred cache configuration for a device function
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this sets through \p config the preferred cache configuration for
+ * the device function \p hfunc. This is only a preference. The driver will use
+ * the requested configuration if possible, but it is free to choose a different
+ * configuration if required to execute \p hfunc. Any context-wide preference
+ * set via ::cuCtxSetCacheConfig() will be overridden by this per-function
+ * setting unless the per-function setting is ::CU_FUNC_CACHE_PREFER_NONE. In
+ * that case, the current context-wide setting will be used.
+ *
+ * This setting does nothing on devices where the size of the L1 cache and
+ * shared memory are fixed.
+ *
+ * Launching a kernel with a different preference than the most recent
+ * preference setting may insert a device-side synchronization point.
+ *
+ * The supported cache configurations are:
+ * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
+ * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
+ * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
+ * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
+ *
+ * \param hfunc - Kernel to configure cache for
+ * \param config - Requested cache configuration
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuFuncGetAttribute,
+ * ::cuLaunchKernel,
+ * ::cudaFuncSetCacheConfig
+ */
+CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
+
+#if __CUDA_API_VERSION >= 4020
+/**
+ * \brief Sets the shared memory configuration for a device function.
+ *
+ * On devices with configurable shared memory banks, this function will
+ * force all subsequent launches of the specified device function to have
+ * the given shared memory bank size configuration. On any given launch of the
+ * function, the shared memory configuration of the device will be temporarily
+ * changed if needed to suit the function's preferred configuration. Changes in
+ * shared memory configuration between subsequent launches of functions
+ * may introduce a device-side synchronization point.
+ *
+ * Any per-function setting of shared memory bank size set via
+ * ::cuFuncSetSharedMemConfig will override the context-wide setting set with
+ * ::cuCtxSetSharedMemConfig.
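+ *
+ * For example (a sketch, assuming \p kernel is a loaded ::CUfunction that is
+ * dominated by 64-bit shared memory accesses):
+ * \code
+   cuFuncSetSharedMemConfig(kernel, CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE);
+ * \endcode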
+ * + * Changing the shared memory bank size will not increase shared memory usage + * or affect occupancy of kernels, but may have major effects on performance. + * Larger bank sizes will allow for greater potential bandwidth to shared memory, + * but will change what kinds of accesses to shared memory will result in bank + * conflicts. + * + * This function will do nothing on devices with fixed shared memory bank size. + * + * The supported bank configurations are: + * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: use the context's shared memory + * configuration when launching this function. + * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to + * be natively four bytes when launching this function. + * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to + * be natively eight bytes when launching this function. + * + * \param hfunc - kernel to be given a shared memory config + * \param config - requested shared memory configuration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuCtxGetSharedMemConfig, + * ::cuCtxSetSharedMemConfig, + * ::cuFuncGetAttribute, + * ::cuLaunchKernel, + * ::cudaFuncSetSharedMemConfig + */ +CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config); +#endif + +#if __CUDA_API_VERSION >= 4000 +/** + * \brief Launches a CUDA function + * + * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ + * grid of blocks. Each block contains \p blockDimX x \p blockDimY x + * \p blockDimZ threads. + * + * \p sharedMemBytes sets the amount of dynamic shared memory that will be + * available to each thread block. + * + * Kernel parameters to \p f can be specified in one of two ways: + * + * 1) Kernel parameters can be specified via \p kernelParams. If \p f + * has N parameters, then \p kernelParams needs to be an array of N + * pointers. Each of \p kernelParams[0] through \p kernelParams[N-1] + * must point to a region of memory from which the actual kernel + * parameter will be copied. The number of kernel parameters and their + * offsets and sizes do not need to be specified as that information is + * retrieved directly from the kernel's image. + * + * 2) Kernel parameters can also be packaged by the application into + * a single buffer that is passed in via the \p extra parameter. + * This places the burden on the application of knowing each kernel + * parameter's size and alignment/padding within the buffer. Here is + * an example of using the \p extra parameter in this manner: + * \code + size_t argBufferSize; + char argBuffer[256]; + + // populate argBuffer and argBufferSize + + void *config[] = { + CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, + CU_LAUNCH_PARAM_BUFFER_SIZE, &argBufferSize, + CU_LAUNCH_PARAM_END + }; + status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config); + * \endcode + * + * The \p extra parameter exists to allow ::cuLaunchKernel to take + * additional less commonly used arguments. \p extra specifies a list of + * names of extra settings and their corresponding values. Each extra + * setting name is immediately followed by the corresponding value. The + * list must be terminated with either NULL or ::CU_LAUNCH_PARAM_END. 
+ * + * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra + * array; + * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next + * value in \p extra will be a pointer to a buffer containing all + * the kernel parameters for launching kernel \p f; + * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next + * value in \p extra will be a pointer to a size_t containing the + * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER; + * + * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel + * parameters are specified with both \p kernelParams and \p extra + * (i.e. both \p kernelParams and \p extra are non-NULL). + * + * Calling ::cuLaunchKernel() sets persistent function state that is + * the same as function state set through the following deprecated APIs: + * ::cuFuncSetBlockShape(), + * ::cuFuncSetSharedSize(), + * ::cuParamSetSize(), + * ::cuParamSeti(), + * ::cuParamSetf(), + * ::cuParamSetv(). + * + * When the kernel \p f is launched via ::cuLaunchKernel(), the previous + * block shape, shared size and parameter info associated with \p f + * is overwritten. + * + * Note that to use ::cuLaunchKernel(), the kernel \p f must either have + * been compiled with toolchain version 3.2 or later so that it will + * contain kernel parameter information, or have no kernel parameters. + * If either of these conditions is not met, then ::cuLaunchKernel() will + * return ::CUDA_ERROR_INVALID_IMAGE. + * + * \param f - Kernel to launch + * \param gridDimX - Width of grid in blocks + * \param gridDimY - Height of grid in blocks + * \param gridDimZ - Depth of grid in blocks + * \param blockDimX - X dimension of each thread block + * \param blockDimY - Y dimension of each thread block + * \param blockDimZ - Z dimension of each thread block + * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes + * \param hStream - Stream identifier + * \param kernelParams - Array of pointers to kernel parameters + * \param extra - Extra options + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \note_null_stream + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cudaLaunchKernel + */ +CUresult CUDAAPI cuLaunchKernel(CUfunction f, + unsigned int gridDimX, + unsigned int gridDimY, + unsigned int gridDimZ, + unsigned int blockDimX, + unsigned int blockDimY, + unsigned int blockDimZ, + unsigned int sharedMemBytes, + CUstream hStream, + void **kernelParams, + void **extra); +#endif /* __CUDA_API_VERSION >= 4000 */ +#if __CUDA_API_VERSION >= 9000 +/** + * \brief Launches a CUDA function where thread blocks can cooperate and synchronize as they execute + * + * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ + * grid of blocks. Each block contains \p blockDimX x \p blockDimY x + * \p blockDimZ threads. + * + * \p sharedMemBytes sets the amount of dynamic shared memory that will be + * available to each thread block. + * + * The device on which this kernel is invoked must have a non-zero value for + * the device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH. 
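+ *
+ * Support can be checked beforehand, for example (a sketch; \p dev is assumed
+ * to be a valid ::CUdevice):
+ * \code
+   int cooperative = 0;
+   cuDeviceGetAttribute(&cooperative, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, dev);
+   if (!cooperative) {
+       // fall back to ::cuLaunchKernel()
+   }
+ * \endcode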
+ * + * The total number of blocks launched cannot exceed the maximum number of blocks per + * multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or + * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors + * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. + * + * The kernel cannot make use of CUDA dynamic parallelism. + * + * Kernel parameters must be specified via \p kernelParams. If \p f + * has N parameters, then \p kernelParams needs to be an array of N + * pointers. Each of \p kernelParams[0] through \p kernelParams[N-1] + * must point to a region of memory from which the actual kernel + * parameter will be copied. The number of kernel parameters and their + * offsets and sizes do not need to be specified as that information is + * retrieved directly from the kernel's image. + * + * Calling ::cuLaunchCooperativeKernel() sets persistent function state that is + * the same as function state set through ::cuLaunchKernel API + * + * When the kernel \p f is launched via ::cuLaunchCooperativeKernel(), the previous + * block shape, shared size and parameter info associated with \p f + * is overwritten. + * + * Note that to use ::cuLaunchCooperativeKernel(), the kernel \p f must either have + * been compiled with toolchain version 3.2 or later so that it will + * contain kernel parameter information, or have no kernel parameters. + * If either of these conditions is not met, then ::cuLaunchCooperativeKernel() will + * return ::CUDA_ERROR_INVALID_IMAGE. + * + * \param f - Kernel to launch + * \param gridDimX - Width of grid in blocks + * \param gridDimY - Height of grid in blocks + * \param gridDimZ - Depth of grid in blocks + * \param blockDimX - X dimension of each thread block + * \param blockDimY - Y dimension of each thread block + * \param blockDimZ - Z dimension of each thread block + * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes + * \param hStream - Stream identifier + * \param kernelParams - Array of pointers to kernel parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \note_null_stream + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cuLaunchCooperativeKernelMultiDevice, + * ::cudaLaunchCooperativeKernel + */ +CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, + unsigned int gridDimX, + unsigned int gridDimY, + unsigned int gridDimZ, + unsigned int blockDimX, + unsigned int blockDimY, + unsigned int blockDimZ, + unsigned int sharedMemBytes, + CUstream hStream, + void **kernelParams); + +/** + * \brief Launches CUDA functions on multiple devices where thread blocks can cooperate and synchronize as they execute + * + * Invokes kernels as specified in the \p launchParamsList array where each element + * of the array specifies all the parameters required to perform a single kernel launch. + * These kernels can cooperate and synchronize as they execute. The size of the array is + * specified by \p numDevices. 
+ *
+ * No two kernels can be launched on the same device. All the devices targeted by this
+ * multi-device launch must be identical. All devices must have a non-zero value for the
+ * device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH.
+ *
+ * All kernels launched must be identical with respect to the compiled code. Note that
+ * any __device__, __constant__ or __managed__ variables present in the module that owns
+ * the kernel launched on each device are independently instantiated on every device.
+ * It is the application's responsibility to ensure these variables are initialized and
+ * used appropriately.
+ *
+ * The size of the grids as specified in blocks, the size of the blocks themselves
+ * and the amount of shared memory used by each thread block must also match across
+ * all launched kernels.
+ *
+ * The streams used to launch these kernels must have been created via either ::cuStreamCreate
+ * or ::cuStreamCreateWithPriority. The NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD
+ * cannot be used.
+ *
+ * The total number of blocks launched per kernel cannot exceed the maximum number of blocks
+ * per multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or
+ * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors
+ * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. Since the
+ * total number of blocks launched per device has to match across all devices, the maximum
+ * number of blocks that can be launched per device will be limited by the device with the
+ * least number of multiprocessors.
+ *
+ * The kernels cannot make use of CUDA dynamic parallelism.
+ *
+ * The ::CUDA_LAUNCH_PARAMS structure is defined as:
+ * \code
+ typedef struct CUDA_LAUNCH_PARAMS_st
+ {
+ CUfunction function;
+ unsigned int gridDimX;
+ unsigned int gridDimY;
+ unsigned int gridDimZ;
+ unsigned int blockDimX;
+ unsigned int blockDimY;
+ unsigned int blockDimZ;
+ unsigned int sharedMemBytes;
+ CUstream hStream;
+ void **kernelParams;
+ } CUDA_LAUNCH_PARAMS;
+ * \endcode
+ * where:
+ * - ::CUDA_LAUNCH_PARAMS::function specifies the kernel to be launched. All functions must
+ * be identical with respect to the compiled code.
+ * - ::CUDA_LAUNCH_PARAMS::gridDimX is the width of the grid in blocks. This must match across
+ * all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::gridDimY is the height of the grid in blocks. This must match across
+ * all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::gridDimZ is the depth of the grid in blocks. This must match across
+ * all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::blockDimX is the X dimension of each thread block. This must match across
+ * all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::blockDimY is the Y dimension of each thread block. This must match across
+ * all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::blockDimZ is the Z dimension of each thread block. This must match across
+ * all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::sharedMemBytes is the dynamic shared-memory size per thread block in bytes.
+ * This must match across all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::hStream is the handle to the stream to perform the launch in. This cannot
+ * be the NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD. The CUDA context associated
+ * with this stream must match that associated with ::CUDA_LAUNCH_PARAMS::function.
+ * - ::CUDA_LAUNCH_PARAMS::kernelParams is an array of pointers to kernel parameters.
If
+ * ::CUDA_LAUNCH_PARAMS::function has N parameters, then ::CUDA_LAUNCH_PARAMS::kernelParams
+ * needs to be an array of N pointers. Each of ::CUDA_LAUNCH_PARAMS::kernelParams[0] through
+ * ::CUDA_LAUNCH_PARAMS::kernelParams[N-1] must point to a region of memory from which the actual
+ * kernel parameter will be copied. The number of kernel parameters and their offsets and sizes
+ * do not need to be specified as that information is retrieved directly from the kernel's image.
+ *
+ * By default, the kernel won't begin execution on any GPU until all prior work in all the specified
+ * streams has completed. This behavior can be overridden by specifying the flag
+ * ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC. When this flag is specified, each kernel
+ * will only wait for prior work in the stream corresponding to that GPU to complete before it begins
+ * execution.
+ *
+ * Similarly, by default, any subsequent work pushed in any of the specified streams will not begin
+ * execution until the kernels on all GPUs have completed. This behavior can be overridden by specifying
+ * the flag ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC. When this flag is specified,
+ * any subsequent work pushed in any of the specified streams will only wait for the kernel launched
+ * on the GPU corresponding to that stream to complete before it begins execution.
+ *
+ * Calling ::cuLaunchCooperativeKernelMultiDevice() sets persistent function state that is
+ * the same as function state set through ::cuLaunchKernel API when called individually for each
+ * element in \p launchParamsList.
+ *
+ * When kernels are launched via ::cuLaunchCooperativeKernelMultiDevice(), the previous
+ * block shape, shared size and parameter info associated with each ::CUDA_LAUNCH_PARAMS::function
+ * in \p launchParamsList is overwritten.
+ *
+ * Note that to use ::cuLaunchCooperativeKernelMultiDevice(), the kernels must either have
+ * been compiled with toolchain version 3.2 or later so that they will
+ * contain kernel parameter information, or have no kernel parameters.
+ * If either of these conditions is not met, then ::cuLaunchCooperativeKernelMultiDevice() will
+ * return ::CUDA_ERROR_INVALID_IMAGE.
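+ *
+ * As an illustrative sketch (assuming two devices, with per-device kernels,
+ * streams and argument arrays in \p functions, \p streams and \p kernelArgs):
+ * \code
+   CUDA_LAUNCH_PARAMS params[2];
+   for (unsigned int i = 0; i < 2; ++i) {
+       params[i].function       = functions[i];
+       params[i].gridDimX       = 64;
+       params[i].gridDimY       = 1;
+       params[i].gridDimZ       = 1;
+       params[i].blockDimX      = 256;
+       params[i].blockDimY      = 1;
+       params[i].blockDimZ      = 1;
+       params[i].sharedMemBytes = 0;
+       params[i].hStream        = streams[i];
+       params[i].kernelParams   = kernelArgs[i];
+   }
+   cuLaunchCooperativeKernelMultiDevice(params, 2, 0);
+ * \endcode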
+ * + * \param launchParamsList - List of launch parameters, one per device + * \param numDevices - Size of the \p launchParamsList array + * \param flags - Flags to control launch behavior + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \note_null_stream + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cuLaunchCooperativeKernel, + * ::cudaLaunchCooperativeKernelMultiDevice + */ +CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices, unsigned int flags); + +#endif /* __CUDA_API_VERSION >= 9000 */ + +/** @} */ /* END CUDA_EXEC */ + +/** + * \defgroup CUDA_EXEC_DEPRECATED Execution Control [DEPRECATED] + * + * ___MANBRIEF___ deprecated execution control functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the deprecated execution control functions of the + * low-level CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Sets the block-dimensions for the function + * + * \deprecated + * + * Specifies the \p x, \p y, and \p z dimensions of the thread blocks that are + * created when the kernel given by \p hfunc is launched. + * + * \param hfunc - Kernel to specify dimensions of + * \param x - X dimension + * \param y - Y dimension + * \param z - Z dimension + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetSharedSize, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSeti, + * ::cuParamSetf, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z); + +/** + * \brief Sets the dynamic shared-memory size for the function + * + * \deprecated + * + * Sets through \p bytes the amount of dynamic shared memory that will be + * available to each thread block when the kernel given by \p hfunc is launched. + * + * \param hfunc - Kernel to specify dynamic shared-memory size for + * \param bytes - Dynamic shared-memory size per thread in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSeti, + * ::cuParamSetf, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes); + +/** + * \brief Sets the parameter size for the function + * + * \deprecated + * + * Sets through \p numbytes the total size in bytes needed by the function + * parameters of the kernel corresponding to \p hfunc. 
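+ *
+ * In the legacy launch sequence the parameter size is set before the
+ * individual parameters, for example (a sketch using an 8-byte parameter
+ * space; \p hfunc and \p n are assumptions):
+ * \code
+   cuFuncSetBlockShape(hfunc, 256, 1, 1);
+   cuParamSetSize(hfunc, 8);
+   cuParamSeti(hfunc, 0, n);     // integer at byte offset 0
+   cuParamSetf(hfunc, 4, 1.0f);  // float at byte offset 4
+   cuLaunchGrid(hfunc, 64, 1);
+ * \endcode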
+ * + * \param hfunc - Kernel to set parameter size for + * \param numbytes - Size of parameter list in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +CUresult CUDAAPI cuParamSetSize(CUfunction hfunc, unsigned int numbytes); + +/** + * \brief Adds an integer parameter to the function's argument list + * + * \deprecated + * + * Sets an integer parameter that will be specified the next time the + * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset. + * + * \param hfunc - Kernel to add parameter to + * \param offset - Offset to add parameter to argument list + * \param value - Value of parameter + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset, unsigned int value); + +/** + * \brief Adds a floating-point parameter to the function's argument list + * + * \deprecated + * + * Sets a floating-point parameter that will be specified the next time the + * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset. + * + * \param hfunc - Kernel to add parameter to + * \param offset - Offset to add parameter to argument list + * \param value - Value of parameter + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset, float value); + +/** + * \brief Adds arbitrary data to the function's argument list + * + * \deprecated + * + * Copies an arbitrary amount of data (specified in \p numbytes) from \p ptr + * into the parameter space of the kernel corresponding to \p hfunc. \p offset + * is a byte offset. + * + * \param hfunc - Kernel to add data to + * \param offset - Offset to add data to argument list + * \param ptr - Pointer to arbitrary data + * \param numbytes - Size of data to copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes); + +/** + * \brief Launches a CUDA function + * + * \deprecated + * + * Invokes the kernel \p f on a 1 x 1 x 1 grid of blocks. 
The block + * contains the number of threads specified by a previous call to + * ::cuFuncSetBlockShape(). + * + * \param f - Kernel to launch + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +CUresult CUDAAPI cuLaunch(CUfunction f); + +/** + * \brief Launches a CUDA function + * + * \deprecated + * + * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of + * blocks. Each block contains the number of threads specified by a previous + * call to ::cuFuncSetBlockShape(). + * + * \param f - Kernel to launch + * \param grid_width - Width of grid in blocks + * \param grid_height - Height of grid in blocks + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, int grid_height); + +/** + * \brief Launches a CUDA function + * + * \deprecated + * + * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of + * blocks. Each block contains the number of threads specified by a previous + * call to ::cuFuncSetBlockShape(). + * + * \param f - Kernel to launch + * \param grid_width - Width of grid in blocks + * \param grid_height - Height of grid in blocks + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * + * \note In certain cases where cubins are created with no ABI (i.e., using \p ptxas \p --abi-compile \p no), + * this function may serialize kernel launches. In order to force the CUDA driver to retain + * asynchronous behavior, set the ::CU_CTX_LMEM_RESIZE_TO_MAX flag during context creation (see ::cuCtxCreate). 
+ *
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetBlockShape,
+ * ::cuFuncSetSharedSize,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetSize,
+ * ::cuParamSetf,
+ * ::cuParamSeti,
+ * ::cuParamSetv,
+ * ::cuLaunch,
+ * ::cuLaunchGrid,
+ * ::cuLaunchKernel
+ */
+CUresult CUDAAPI cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream);
+
+
+/**
+ * \brief Adds a texture-reference to the function's argument list
+ *
+ * \deprecated
+ *
+ * Makes the CUDA array or linear memory bound to the texture reference
+ * \p hTexRef available to a device program as a texture. In this version of
+ * CUDA, the texture-reference must be obtained via ::cuModuleGetTexRef() and
+ * the \p texunit parameter must be set to ::CU_PARAM_TR_DEFAULT.
+ *
+ * \param hfunc - Kernel to add texture-reference to
+ * \param texunit - Texture unit (must be ::CU_PARAM_TR_DEFAULT)
+ * \param hTexRef - Texture-reference to add to argument list
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ */
+CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef);
+/** @} */ /* END CUDA_EXEC_DEPRECATED */
+
+
+#if __CUDA_API_VERSION >= 6050
+/**
+ * \defgroup CUDA_OCCUPANCY Occupancy
+ *
+ * ___MANBRIEF___ occupancy calculation functions of the low-level CUDA driver
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the occupancy calculation functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns occupancy of a function
+ *
+ * Returns in \p *numBlocks the maximum number of active blocks per
+ * streaming multiprocessor.
+ *
+ * \param numBlocks - Returned occupancy
+ * \param func - Kernel for which occupancy is calculated
+ * \param blockSize - Block size the kernel is intended to be launched with
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ */
+CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize);
+
+/**
+ * \brief Returns occupancy of a function
+ *
+ * Returns in \p *numBlocks the maximum number of active blocks per
+ * streaming multiprocessor.
+ *
+ * The \p Flags parameter controls how special cases are handled. The
+ * valid flags are:
+ *
+ * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as
+ * ::cuOccupancyMaxActiveBlocksPerMultiprocessor;
+ *
+ * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the
+ * default behavior on platforms where global caching affects
+ * occupancy. On such platforms, if caching is enabled, but
+ * per-block SM resource usage would result in zero occupancy, the
+ * occupancy calculator will calculate the occupancy as if caching
+ * is disabled. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE makes
+ * the occupancy calculator return 0 in such cases. More information
+ * about this feature can be found in the "Unified L1/Texture Cache"
+ * section of the Maxwell tuning guide.
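+ *
+ * For example (a sketch, assuming \p kernel is a loaded ::CUfunction):
+ * \code
+   int numBlocks = 0;
+   cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+       &numBlocks, kernel, 256, 0, CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE);
+ * \endcode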
+ * + * \param numBlocks - Returned occupancy + * \param func - Kernel for which occupancy is calculated + * \param blockSize - Block size the kernel is intended to be launched with + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * \param flags - Requested behavior for the occupancy calculator + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + */ +CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags); + +/** + * \brief Suggest a launch configuration with reasonable occupancy + * + * Returns in \p *blockSize a reasonable block size that can achieve + * the maximum occupancy (or, the maximum number of active warps with + * the fewest blocks per multiprocessor), and in \p *minGridSize the + * minimum grid size to achieve the maximum occupancy. + * + * If \p blockSizeLimit is 0, the configurator will use the maximum + * block size permitted by the device / function instead. + * + * If per-block dynamic shared memory allocation is not needed, the + * user should leave both \p blockSizeToDynamicSMemSize and \p + * dynamicSMemSize as 0. + * + * If per-block dynamic shared memory allocation is needed, then if + * the dynamic shared memory size is constant regardless of block + * size, the size should be passed through \p dynamicSMemSize, and \p + * blockSizeToDynamicSMemSize should be NULL. + * + * Otherwise, if the per-block dynamic shared memory size varies with + * different block sizes, the user needs to provide a unary function + * through \p blockSizeToDynamicSMemSize that computes the dynamic + * shared memory needed by \p func for any given block size. \p + * dynamicSMemSize is ignored. An example signature is: + * + * \code + * // Take block size, returns dynamic shared memory needed + * size_t blockToSmem(int blockSize); + * \endcode + * + * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy + * \param blockSize - Returned maximum block size that can achieve the maximum occupancy + * \param func - Kernel for which launch configuration is calculated + * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size + * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes + * \param blockSizeLimit - The maximum block size \p func is designed to handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cudaOccupancyMaxPotentialBlockSize + */ +CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit); + +/** + * \brief Suggest a launch configuration with reasonable occupancy + * + * An extended version of ::cuOccupancyMaxPotentialBlockSize. In + * addition to arguments passed to ::cuOccupancyMaxPotentialBlockSize, + * ::cuOccupancyMaxPotentialBlockSizeWithFlags also takes a \p Flags + * parameter. + * + * The \p Flags parameter controls how special cases are handled. 
The
+ * valid flags are:
+ *
+ * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as
+ * ::cuOccupancyMaxPotentialBlockSize;
+ *
+ * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the
+ * default behavior on platforms where global caching affects
+ * occupancy. On such platforms, the launch configurations that
+ * produce maximal occupancy might not support global
+ * caching. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE
+ * guarantees that the produced launch configuration is global
+ * caching compatible at a potential cost of occupancy. More information
+ * about this feature can be found in the "Unified L1/Texture Cache"
+ * section of the Maxwell tuning guide.
+ *
+ * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy
+ * \param blockSize - Returned maximum block size that can achieve the maximum occupancy
+ * \param func - Kernel for which launch configuration is calculated
+ * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size
+ * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes
+ * \param blockSizeLimit - The maximum block size \p func is designed to handle
+ * \param flags - Options
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaOccupancyMaxPotentialBlockSizeWithFlags
+ */
+CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags);
+
+/** @} */ /* END CUDA_OCCUPANCY */
+#endif /* __CUDA_API_VERSION >= 6050 */
+
+/**
+ * \defgroup CUDA_TEXREF Texture Reference Management
+ *
+ * ___MANBRIEF___ texture reference management functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the texture reference management functions of the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Binds an array as a texture reference
+ *
+ * Binds the CUDA array \p hArray to the texture reference \p hTexRef. Any
+ * previous address or CUDA array state associated with the texture reference
+ * is superseded by this function. \p Flags must be set to
+ * ::CU_TRSA_OVERRIDE_FORMAT. Any CUDA array previously bound to \p hTexRef is
+ * unbound.
+ *
+ * \param hTexRef - Texture reference to bind
+ * \param hArray - Array to bind
+ * \param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT)
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
+ * ::cudaBindTextureToArray
+ */
+CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags);
+
+/**
+ * \brief Binds a mipmapped array to a texture reference
+ *
+ * Binds the CUDA mipmapped array \p hMipmappedArray to the texture reference \p hTexRef.
+ * Any previous address or CUDA array state associated with the texture reference + * is superseded by this function. \p Flags must be set to ::CU_TRSA_OVERRIDE_FORMAT. + * Any CUDA array previously bound to \p hTexRef is unbound. + * + * \param hTexRef - Texture reference to bind + * \param hMipmappedArray - Mipmapped array to bind + * \param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT) + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTextureToMipmappedArray + */ +CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags); + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Binds an address as a texture reference + * + * Binds a linear address range to the texture reference \p hTexRef. Any + * previous address or CUDA array state associated with the texture reference + * is superseded by this function. Any memory previously bound to \p hTexRef + * is unbound. + * + * Since the hardware enforces an alignment requirement on texture base + * addresses, ::cuTexRefSetAddress() passes back a byte offset in + * \p *ByteOffset that must be applied to texture fetches in order to read from + * the desired memory. This offset must be divided by the texel size and + * passed to kernels that read from the texture so they can be applied to the + * ::tex1Dfetch() function. + * + * If the device memory pointer was returned from ::cuMemAlloc(), the offset + * is guaranteed to be 0 and NULL may be passed as the \p ByteOffset parameter. + * + * The total number of elements (or texels) in the linear address range + * cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. + * The number of elements is computed as (\p bytes / bytesPerElement), + * where bytesPerElement is determined from the data format and number of + * components set using ::cuTexRefSetFormat(). + * + * \param ByteOffset - Returned byte offset + * \param hTexRef - Texture reference to bind + * \param dptr - Device pointer to bind + * \param bytes - Size of memory to bind in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTexture + */ +CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes); + +/** + * \brief Binds an address as a 2D texture reference + * + * Binds a linear address range to the texture reference \p hTexRef. Any + * previous address or CUDA array state associated with the texture reference + * is superseded by this function. Any memory previously bound to \p hTexRef + * is unbound. 
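+ *
+ * For example (a sketch; \p texRef, \p width, \p height and a pitched
+ * allocation \p dptr / \p pitch from ::cuMemAllocPitch() are assumed):
+ * \code
+   CUDA_ARRAY_DESCRIPTOR desc;
+   desc.Format      = CU_AD_FORMAT_FLOAT;
+   desc.NumChannels = 1;
+   desc.Width       = width;
+   desc.Height      = height;
+   cuTexRefSetFormat(texRef, CU_AD_FORMAT_FLOAT, 1);
+   cuTexRefSetAddress2D(texRef, &desc, dptr, pitch);
+ * \endcode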
+ * + * Using a ::tex2D() function inside a kernel requires a call to either + * ::cuTexRefSetArray() to bind the corresponding texture reference to an + * array, or ::cuTexRefSetAddress2D() to bind the texture reference to linear + * memory. + * + * Function calls to ::cuTexRefSetFormat() cannot follow calls to + * ::cuTexRefSetAddress2D() for the same texture reference. + * + * It is required that \p dptr be aligned to the appropriate hardware-specific + * texture alignment. You can query this value using the device attribute + * ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. If an unaligned \p dptr is + * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. + * + * \p Pitch has to be aligned to the hardware-specific texture pitch alignment. + * This value can be queried using the device attribute + * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. If an unaligned \p Pitch is + * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. + * + * Width and Height, which are specified in elements (or texels), cannot exceed + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. + * \p Pitch, which is specified in bytes, cannot exceed + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. + * + * \param hTexRef - Texture reference to bind + * \param desc - Descriptor of CUDA array + * \param dptr - Device pointer to bind + * \param Pitch - Line pitch in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTexture2D + */ +CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); +#endif /* __CUDA_API_VERSION >= 3020 */ + +/** + * \brief Sets the format for a texture reference + * + * Specifies the format of the data to be read by the texture reference + * \p hTexRef. \p fmt and \p NumPackedComponents are exactly analogous to the + * ::Format and ::NumChannels members of the ::CUDA_ARRAY_DESCRIPTOR structure: + * They specify the format of each component and the number of components per + * array element. + * + * \param hTexRef - Texture reference + * \param fmt - Format to set + * \param NumPackedComponents - Number of components per array element + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaCreateChannelDesc, + * ::cudaBindTexture, + * ::cudaBindTexture2D, + * ::cudaBindTextureToArray, + * ::cudaBindTextureToMipmappedArray + */ +CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents); + +/** + * \brief Sets the addressing mode for a texture reference + * + * Specifies the addressing mode \p am for the given dimension \p dim of the + * texture reference \p hTexRef. 
If \p dim is zero, the addressing mode is + * applied to the first parameter of the functions used to fetch from the + * texture; if \p dim is 1, the second, and so on. ::CUaddress_mode is defined + * as: + * \code + typedef enum CUaddress_mode_enum { + CU_TR_ADDRESS_MODE_WRAP = 0, + CU_TR_ADDRESS_MODE_CLAMP = 1, + CU_TR_ADDRESS_MODE_MIRROR = 2, + CU_TR_ADDRESS_MODE_BORDER = 3 + } CUaddress_mode; + * \endcode + * + * Note that this call has no effect if \p hTexRef is bound to linear memory. + * Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES, is not set, the only + * supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. + * + * \param hTexRef - Texture reference + * \param dim - Dimension + * \param am - Addressing mode to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTexture, + * ::cudaBindTexture2D, + * ::cudaBindTextureToArray, + * ::cudaBindTextureToMipmappedArray + */ +CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am); + +/** + * \brief Sets the filtering mode for a texture reference + * + * Specifies the filtering mode \p fm to be used when reading memory through + * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: + * + * \code + typedef enum CUfilter_mode_enum { + CU_TR_FILTER_MODE_POINT = 0, + CU_TR_FILTER_MODE_LINEAR = 1 + } CUfilter_mode; + * \endcode + * + * Note that this call has no effect if \p hTexRef is bound to linear memory. + * + * \param hTexRef - Texture reference + * \param fm - Filtering mode to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTextureToArray + */ +CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm); + +/** + * \brief Sets the mipmap filtering mode for a texture reference + * + * Specifies the mipmap filtering mode \p fm to be used when reading memory through + * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: + * + * \code + typedef enum CUfilter_mode_enum { + CU_TR_FILTER_MODE_POINT = 0, + CU_TR_FILTER_MODE_LINEAR = 1 + } CUfilter_mode; + * \endcode + * + * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. 
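+ *
+ * For example, to request trilinear sampling (the texture reference
+ * \p texRef is illustrative and is assumed to be bound to a mipmapped
+ * array):
+ * \code
+   cuTexRefSetFilterMode(texRef, CU_TR_FILTER_MODE_LINEAR);
+   cuTexRefSetMipmapFilterMode(texRef, CU_TR_FILTER_MODE_LINEAR);
+ * \endcode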
+ * + * \param hTexRef - Texture reference + * \param fm - Filtering mode to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTextureToMipmappedArray + */ +CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm); + +/** + * \brief Sets the mipmap level bias for a texture reference + * + * Specifies the mipmap level bias \p bias to be added to the specified mipmap level when + * reading memory through the texture reference \p hTexRef. + * + * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. + * + * \param hTexRef - Texture reference + * \param bias - Mipmap level bias + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTextureToMipmappedArray + */ +CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias); + +/** + * \brief Sets the mipmap min/max mipmap level clamps for a texture reference + * + * Specifies the min/max mipmap level clamps, \p minMipmapLevelClamp and \p maxMipmapLevelClamp + * respectively, to be used when reading memory through the texture reference + * \p hTexRef. + * + * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. + * + * \param hTexRef - Texture reference + * \param minMipmapLevelClamp - Mipmap min level clamp + * \param maxMipmapLevelClamp - Mipmap max level clamp + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTextureToMipmappedArray + */ +CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp); + +/** + * \brief Sets the maximum anisotropy for a texture reference + * + * Specifies the maximum anisotropy \p maxAniso to be used when reading memory through + * the texture reference \p hTexRef. + * + * Note that this call has no effect if \p hTexRef is bound to linear memory. 
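+ *
+ * A short sketch combining the mipmap sampling controls described above
+ * (\p texRef is illustrative and assumed to be bound to a mipmapped array):
+ * \code
+   cuTexRefSetMipmapLevelBias(texRef, 0.5f);        /* sample slightly coarser */
+   cuTexRefSetMipmapLevelClamp(texRef, 0.0f, 4.0f); /* restrict to levels 0..4 */
+   cuTexRefSetMaxAnisotropy(texRef, 16);            /* maximum supported ratio */
+ * \endcode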
+ * + * \param hTexRef - Texture reference + * \param maxAniso - Maximum anisotropy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTextureToArray, + * ::cudaBindTextureToMipmappedArray + */ +CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso); + +/** + * \brief Sets the border color for a texture reference + * + * Specifies the value of the RGBA color via the \p pBorderColor to the texture reference + * \p hTexRef. The color value supports only float type and holds color components in + * the following sequence: + * pBorderColor[0] holds 'R' component + * pBorderColor[1] holds 'G' component + * pBorderColor[2] holds 'B' component + * pBorderColor[3] holds 'A' component + * + * Note that the color values can be set only when the Address mode is set to + * CU_TR_ADDRESS_MODE_BORDER using ::cuTexRefSetAddressMode. + * Applications using integer border color values have to "reinterpret_cast" their values to float. + * + * \param hTexRef - Texture reference + * \param pBorderColor - RGBA color + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddressMode, + * ::cuTexRefGetAddressMode, ::cuTexRefGetBorderColor, + * ::cudaBindTexture, + * ::cudaBindTexture2D, + * ::cudaBindTextureToArray, + * ::cudaBindTextureToMipmappedArray + */ +CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor); + +/** + * \brief Sets the flags for a texture reference + * + * Specifies optional flags via \p Flags to specify the behavior of data + * returned through the texture reference \p hTexRef. The valid flags are: + * + * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of + * having the texture promote integer data to floating point data in the + * range [0, 1]. Note that texture with 32-bit integer format + * would not be promoted, regardless of whether or not this + * flag is specified; + * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the + * default behavior of having the texture coordinates range + * from [0, Dim) where Dim is the width or height of the CUDA + * array. 
Instead, the texture coordinates [0, 1.0) reference + * the entire breadth of the array dimension; + * + * \param hTexRef - Texture reference + * \param Flags - Optional flags to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTexture, + * ::cudaBindTexture2D, + * ::cudaBindTextureToArray, + * ::cudaBindTextureToMipmappedArray + */ +CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags); + +#if __CUDA_API_VERSION >= 3020 +/** + * \brief Gets the address associated with a texture reference + * + * Returns in \p *pdptr the base address bound to the texture reference + * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference + * is not bound to any device memory range. + * + * \param pdptr - Returned device address + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef); +#endif /* __CUDA_API_VERSION >= 3020 */ + +/** + * \brief Gets the array bound to a texture reference + * + * Returns in \p *phArray the CUDA array bound to the texture reference + * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference + * is not bound to any CUDA array. + * + * \param phArray - Returned array + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef); + +/** + * \brief Gets the mipmapped array bound to a texture reference + * + * Returns in \p *phMipmappedArray the CUDA mipmapped array bound to the texture + * reference \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference + * is not bound to any CUDA mipmapped array. 
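+ *
+ * For example (error handling elided; \p texRef is illustrative):
+ * \code
+   CUmipmappedArray hMipmappedArray;
+   if (cuTexRefGetMipmappedArray(&hMipmappedArray, texRef) == CUDA_SUCCESS) {
+       /* texRef is currently bound to a mipmapped array */
+   }
+ * \endcode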
+ *
+ * \param phMipmappedArray - Returned mipmapped array
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+CUresult CUDAAPI cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef);
+
+/**
+ * \brief Gets the addressing mode used by a texture reference
+ *
+ * Returns in \p *pam the addressing mode corresponding to the
+ * dimension \p dim of the texture reference \p hTexRef. Currently, the only
+ * valid values for \p dim are 0 and 1.
+ *
+ * \param pam - Returned addressing mode
+ * \param hTexRef - Texture reference
+ * \param dim - Dimension
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim);
+
+/**
+ * \brief Gets the filter mode used by a texture reference
+ *
+ * Returns in \p *pfm the filtering mode of the texture reference
+ * \p hTexRef.
+ *
+ * \param pfm - Returned filtering mode
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
+
+/**
+ * \brief Gets the format used by a texture reference
+ *
+ * Returns in \p *pFormat and \p *pNumChannels the format and number
+ * of components of the CUDA array bound to the texture reference \p hTexRef.
+ * If \p pFormat or \p pNumChannels is NULL, it will be ignored.
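+ *
+ * For example (\p texRef is illustrative; either output pointer may be
+ * NULL if the corresponding value is not needed):
+ * \code
+   CUarray_format fmt;
+   int numChannels;
+   cuTexRefGetFormat(&fmt, &numChannels, texRef);
+ * \endcode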
+ *
+ * \param pFormat - Returned format
+ * \param pNumChannels - Returned number of components
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags
+ */
+CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
+
+/**
+ * \brief Gets the mipmap filtering mode for a texture reference
+ *
+ * Returns the mipmap filtering mode in \p pfm that is used when reading memory through
+ * the texture reference \p hTexRef.
+ *
+ * \param pfm - Returned mipmap filtering mode
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+CUresult CUDAAPI cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
+
+/**
+ * \brief Gets the mipmap level bias for a texture reference
+ *
+ * Returns the mipmap level bias in \p pbias that is added to the specified mipmap
+ * level when reading memory through the texture reference \p hTexRef.
+ *
+ * \param pbias - Returned mipmap level bias
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef);
+
+/**
+ * \brief Gets the min/max mipmap level clamps for a texture reference
+ *
+ * Returns the min/max mipmap level clamps in \p pminMipmapLevelClamp and \p pmaxMipmapLevelClamp
+ * that are used when reading memory through the texture reference \p hTexRef.
+ *
+ * \param pminMipmapLevelClamp - Returned mipmap min level clamp
+ * \param pmaxMipmapLevelClamp - Returned mipmap max level clamp
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef);
+
+/**
+ * \brief Gets the maximum anisotropy for a texture reference
+ *
+ * Returns the maximum anisotropy in \p pmaxAniso that is used when reading memory through
+ * the texture reference \p hTexRef.
+ *
+ * \param pmaxAniso - Returned maximum anisotropy
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef);
+
+/**
+ * \brief Gets the border color used by a texture reference
+ *
+ * Returns in \p pBorderColor the RGBA color values used by
+ * the texture reference \p hTexRef.
+ * The color values are of type float and are stored in
+ * the following sequence:
+ * pBorderColor[0] holds 'R' component
+ * pBorderColor[1] holds 'G' component
+ * pBorderColor[2] holds 'B' component
+ * pBorderColor[3] holds 'A' component
+ *
+ * \param hTexRef - Texture reference
+ * \param pBorderColor - Returned RGBA color
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddressMode,
+ * ::cuTexRefSetBorderColor
+ */
+CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef);
+
+/**
+ * \brief Gets the flags used by a texture reference
+ *
+ * Returns in \p *pFlags the flags of the texture reference \p hTexRef.
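+ *
+ * For example, to test whether integer reads are in effect (\p texRef is
+ * illustrative):
+ * \code
+   unsigned int flags;
+   cuTexRefGetFlags(&flags, texRef);
+   if (flags & CU_TRSF_READ_AS_INTEGER) {
+       /* texels are returned as integers, not normalized floats */
+   }
+ * \endcode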
+ * + * \param pFlags - Returned flags + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFormat + */ +CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef); + +/** @} */ /* END CUDA_TEXREF */ + +/** + * \defgroup CUDA_TEXREF_DEPRECATED Texture Reference Management [DEPRECATED] + * + * ___MANBRIEF___ deprecated texture reference management functions of the + * low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the deprecated texture reference management + * functions of the low-level CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Creates a texture reference + * + * \deprecated + * + * Creates a texture reference and returns its handle in \p *pTexRef. Once + * created, the application must call ::cuTexRefSetArray() or + * ::cuTexRefSetAddress() to associate the reference with allocated memory. + * Other texture reference functions are used to specify the format and + * interpretation (addressing, filtering, etc.) to be used when the memory is + * read through this texture reference. + * + * \param pTexRef - Returned texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefDestroy + */ +CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef); + +/** + * \brief Destroys a texture reference + * + * \deprecated + * + * Destroys the texture reference specified by \p hTexRef. + * + * \param hTexRef - Texture reference to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefCreate + */ +CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef); + +/** @} */ /* END CUDA_TEXREF_DEPRECATED */ + + +/** + * \defgroup CUDA_SURFREF Surface Reference Management + * + * ___MANBRIEF___ surface reference management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the surface reference management functions of the + * low-level CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Sets the CUDA array for a surface reference. + * + * Sets the CUDA array \p hArray to be read and written by the surface reference + * \p hSurfRef. Any previous CUDA array state associated with the surface + * reference is superseded by this function. \p Flags must be set to 0. + * The ::CUDA_ARRAY3D_SURFACE_LDST flag must have been set for the CUDA array. + * Any CUDA array previously bound to \p hSurfRef is unbound. 
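+ *
+ * For example (assuming \p hArray was created with the
+ * ::CUDA_ARRAY3D_SURFACE_LDST flag and \p hModule defines a surface
+ * named "surf"; both names are illustrative):
+ * \code
+   CUsurfref hSurfRef;
+   cuModuleGetSurfRef(&hSurfRef, hModule, "surf");
+   cuSurfRefSetArray(hSurfRef, hArray, 0);
+ * \endcode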
+ *
+ * \param hSurfRef - Surface reference handle
+ * \param hArray - CUDA array handle
+ * \param Flags - Must be set to 0
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuModuleGetSurfRef,
+ * ::cuSurfRefGetArray,
+ * ::cudaBindSurfaceToArray
+ */
+CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
+
+/**
+ * \brief Passes back the CUDA array bound to a surface reference.
+ *
+ * Returns in \p *phArray the CUDA array bound to the surface reference
+ * \p hSurfRef, or returns ::CUDA_ERROR_INVALID_VALUE if the surface reference
+ * is not bound to any CUDA array.
+ *
+ * \param phArray - Returned CUDA array handle
+ * \param hSurfRef - Surface reference handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuModuleGetSurfRef, ::cuSurfRefSetArray
+ */
+CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef);
+
+/** @} */ /* END CUDA_SURFREF */
+
+#if __CUDA_API_VERSION >= 5000
+/**
+ * \defgroup CUDA_TEXOBJECT Texture Object Management
+ *
+ * ___MANBRIEF___ texture object management functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the texture object management functions of the
+ * low-level CUDA driver application programming interface. The texture
+ * object API is only supported on devices of compute capability 3.0 or higher.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates a texture object
+ *
+ * Creates a texture object and returns it in \p pTexObject. \p pResDesc describes
+ * the data to texture from. \p pTexDesc describes how the data should be sampled.
+ * \p pResViewDesc is an optional argument that specifies an alternate format for
+ * the data described by \p pResDesc, and also describes the subresource region
+ * to restrict access to when texturing. \p pResViewDesc can only be specified if
+ * the type of resource is a CUDA array or a CUDA mipmapped array.
+ *
+ * Texture objects are only supported on devices of compute capability 3.0 or higher.
+ * Additionally, a texture object is an opaque value, and, as such, should only be
+ * accessed through CUDA API calls.
+ *
+ * The ::CUDA_RESOURCE_DESC structure is defined as:
+ * \code
+     typedef struct CUDA_RESOURCE_DESC_st
+     {
+         CUresourcetype resType;
+
+         union {
+             struct {
+                 CUarray hArray;
+             } array;
+             struct {
+                 CUmipmappedArray hMipmappedArray;
+             } mipmap;
+             struct {
+                 CUdeviceptr devPtr;
+                 CUarray_format format;
+                 unsigned int numChannels;
+                 size_t sizeInBytes;
+             } linear;
+             struct {
+                 CUdeviceptr devPtr;
+                 CUarray_format format;
+                 unsigned int numChannels;
+                 size_t width;
+                 size_t height;
+                 size_t pitchInBytes;
+             } pitch2D;
+         } res;
+
+         unsigned int flags;
+     } CUDA_RESOURCE_DESC;
+
+ * \endcode
+ * where:
+ * - ::CUDA_RESOURCE_DESC::resType specifies the type of resource to texture from.
+ * ::CUresourcetype is defined as:
+ * \code
+     typedef enum CUresourcetype_enum {
+         CU_RESOURCE_TYPE_ARRAY           = 0x00,
+         CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01,
+         CU_RESOURCE_TYPE_LINEAR          = 0x02,
+         CU_RESOURCE_TYPE_PITCH2D         = 0x03
+     } CUresourcetype;
+ * \endcode
+ *
+ * \par
+ * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_ARRAY, ::CUDA_RESOURCE_DESC::res::array::hArray
+ * must be set to a valid CUDA array handle.
+ * + * \par + * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY, ::CUDA_RESOURCE_DESC::res::mipmap::hMipmappedArray + * must be set to a valid CUDA mipmapped array handle. + * + * \par + * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_LINEAR, ::CUDA_RESOURCE_DESC::res::linear::devPtr + * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. + * ::CUDA_RESOURCE_DESC::res::linear::format and ::CUDA_RESOURCE_DESC::res::linear::numChannels + * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::linear::sizeInBytes + * specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. The number of elements is computed as (sizeInBytes / (sizeof(format) * numChannels)). + * + * \par + * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_PITCH2D, ::CUDA_RESOURCE_DESC::res::pitch2D::devPtr + * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. + * ::CUDA_RESOURCE_DESC::res::pitch2D::format and ::CUDA_RESOURCE_DESC::res::pitch2D::numChannels + * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::pitch2D::width + * and ::CUDA_RESOURCE_DESC::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. + * ::CUDA_RESOURCE_DESC::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to + * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. Pitch cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. + * + * - ::flags must be set to zero. + * + * + * The ::CUDA_TEXTURE_DESC struct is defined as + * \code + typedef struct CUDA_TEXTURE_DESC_st { + CUaddress_mode addressMode[3]; + CUfilter_mode filterMode; + unsigned int flags; + unsigned int maxAnisotropy; + CUfilter_mode mipmapFilterMode; + float mipmapLevelBias; + float minMipmapLevelClamp; + float maxMipmapLevelClamp; + } CUDA_TEXTURE_DESC; + * \endcode + * where + * - ::CUDA_TEXTURE_DESC::addressMode specifies the addressing mode for each dimension of the texture data. ::CUaddress_mode is defined as: + * \code + typedef enum CUaddress_mode_enum { + CU_TR_ADDRESS_MODE_WRAP = 0, + CU_TR_ADDRESS_MODE_CLAMP = 1, + CU_TR_ADDRESS_MODE_MIRROR = 2, + CU_TR_ADDRESS_MODE_BORDER = 3 + } CUaddress_mode; + * \endcode + * This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES + * is not set, the only supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. + * + * - ::CUDA_TEXTURE_DESC::filterMode specifies the filtering mode to be used when fetching from the texture. CUfilter_mode is defined as: + * \code + typedef enum CUfilter_mode_enum { + CU_TR_FILTER_MODE_POINT = 0, + CU_TR_FILTER_MODE_LINEAR = 1 + } CUfilter_mode; + * \endcode + * This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. + * + * - ::CUDA_TEXTURE_DESC::flags can be any combination of the following: + * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of having the texture promote integer data to floating point data in the + * range [0, 1]. 
Note that textures with a 32-bit integer format are not promoted, regardless of whether or not this flag is specified.
+ * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the default behavior of having the texture coordinates range from [0, Dim) where Dim is
+ * the width or height of the CUDA array. Instead, the texture coordinates [0, 1.0) reference the entire breadth of the array dimension. Note
+ * that for CUDA mipmapped arrays, this flag has to be set.
+ *
+ * - ::CUDA_TEXTURE_DESC::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be
+ * clamped to the range [1,16].
+ *
+ * - ::CUDA_TEXTURE_DESC::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels.
+ *
+ * - ::CUDA_TEXTURE_DESC::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level.
+ *
+ * - ::CUDA_TEXTURE_DESC::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to.
+ *
+ * - ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to.
+ *
+ *
+ * The ::CUDA_RESOURCE_VIEW_DESC struct is defined as
+ * \code
+     typedef struct CUDA_RESOURCE_VIEW_DESC_st
+     {
+         CUresourceViewFormat format;
+         size_t width;
+         size_t height;
+         size_t depth;
+         unsigned int firstMipmapLevel;
+         unsigned int lastMipmapLevel;
+         unsigned int firstLayer;
+         unsigned int lastLayer;
+     } CUDA_RESOURCE_VIEW_DESC;
+ * \endcode
+ * where:
+ * - ::CUDA_RESOURCE_VIEW_DESC::format specifies how the data contained in the CUDA array or CUDA mipmapped array should
+ * be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block
+ * compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a base format of ::CU_AD_FORMAT_UNSIGNED_INT32
+ * with 2 or 4 channels, depending on the block compressed format. For example, BC1 and BC4 require the underlying CUDA array to have
+ * a format of ::CU_AD_FORMAT_UNSIGNED_INT32 with 2 channels. The other BC formats require the underlying resource to have the same base
+ * format but with 4 channels.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::width specifies the new width of the texture data. If the resource view format is a block
+ * compressed format, this value has to be 4 times the original width of the resource. For non-block-compressed formats,
+ * this value has to be equal to that of the original resource.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::height specifies the new height of the texture data. If the resource view format is a block
+ * compressed format, this value has to be 4 times the original height of the resource. For non-block-compressed formats,
+ * this value has to be equal to that of the original resource.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::depth specifies the new depth of the texture data. This value has to be equal to that of the
+ * original resource.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero.
+ * For non-mipmapped resources, this value has to be zero. ::CUDA_TEXTURE_DESC::minMipmapLevelClamp and ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp
+ * will be relative to this value. For example, if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified,
+ * then the actual minimum mipmap level clamp will be 3.2.
+ * + * - ::CUDA_RESOURCE_VIEW_DESC::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value + * has to be zero. + * + * - ::CUDA_RESOURCE_VIEW_DESC::firstLayer specifies the first layer index for layered textures. This will be the new layer zero. + * For non-layered resources, this value has to be zero. + * + * - ::CUDA_RESOURCE_VIEW_DESC::lastLayer specifies the last layer index for layered textures. For non-layered resources, + * this value has to be zero. + * + * + * \param pTexObject - Texture object to create + * \param pResDesc - Resource descriptor + * \param pTexDesc - Texture descriptor + * \param pResViewDesc - Resource view descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexObjectDestroy, + * ::cudaCreateTextureObject + */ +CUresult CUDAAPI cuTexObjectCreate(CUtexObject *pTexObject, const CUDA_RESOURCE_DESC *pResDesc, const CUDA_TEXTURE_DESC *pTexDesc, const CUDA_RESOURCE_VIEW_DESC *pResViewDesc); + +/** + * \brief Destroys a texture object + * + * Destroys the texture object specified by \p texObject. + * + * \param texObject - Texture object to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexObjectCreate, + * ::cudaDestroyTextureObject + */ +CUresult CUDAAPI cuTexObjectDestroy(CUtexObject texObject); + +/** + * \brief Returns a texture object's resource descriptor + * + * Returns the resource descriptor for the texture object specified by \p texObject. + * + * \param pResDesc - Resource descriptor + * \param texObject - Texture object + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexObjectCreate, + * ::cudaGetTextureObjectResourceDesc, + */ +CUresult CUDAAPI cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUtexObject texObject); + +/** + * \brief Returns a texture object's texture descriptor + * + * Returns the texture descriptor for the texture object specified by \p texObject. + * + * \param pTexDesc - Texture descriptor + * \param texObject - Texture object + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexObjectCreate, + * ::cudaGetTextureObjectTextureDesc + */ +CUresult CUDAAPI cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc, CUtexObject texObject); + +/** + * \brief Returns a texture object's resource view descriptor + * + * Returns the resource view descriptor for the texture object specified by \p texObject. + * If no resource view was set for \p texObject, the ::CUDA_ERROR_INVALID_VALUE is returned. 
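+ *
+ * For example (\p texObject is illustrative; error handling is the
+ * caller's responsibility):
+ * \code
+   CUDA_RESOURCE_VIEW_DESC viewDesc;
+   if (cuTexObjectGetResourceViewDesc(&viewDesc, texObject) != CUDA_SUCCESS) {
+       /* no resource view was specified when texObject was created */
+   }
+ * \endcode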
+ * + * \param pResViewDesc - Resource view descriptor + * \param texObject - Texture object + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexObjectCreate, + * ::cudaGetTextureObjectResourceViewDesc + */ +CUresult CUDAAPI cuTexObjectGetResourceViewDesc(CUDA_RESOURCE_VIEW_DESC *pResViewDesc, CUtexObject texObject); + +/** @} */ /* END CUDA_TEXOBJECT */ + +/** + * \defgroup CUDA_SURFOBJECT Surface Object Management + * + * ___MANBRIEF___ surface object management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the surface object management functions of the + * low-level CUDA driver application programming interface. The surface + * object API is only supported on devices of compute capability 3.0 or higher. + * + * @{ + */ + +/** + * \brief Creates a surface object + * + * Creates a surface object and returns it in \p pSurfObject. \p pResDesc describes + * the data to perform surface load/stores on. ::CUDA_RESOURCE_DESC::resType must be + * ::CU_RESOURCE_TYPE_ARRAY and ::CUDA_RESOURCE_DESC::res::array::hArray + * must be set to a valid CUDA array handle. ::CUDA_RESOURCE_DESC::flags must be set to zero. + * + * Surface objects are only supported on devices of compute capability 3.0 or higher. + * Additionally, a surface object is an opaque value, and, as such, should only be + * accessed through CUDA API calls. + * + * \param pSurfObject - Surface object to create + * \param pResDesc - Resource descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuSurfObjectDestroy, + * ::cudaCreateSurfaceObject + */ +CUresult CUDAAPI cuSurfObjectCreate(CUsurfObject *pSurfObject, const CUDA_RESOURCE_DESC *pResDesc); + +/** + * \brief Destroys a surface object + * + * Destroys the surface object specified by \p surfObject. + * + * \param surfObject - Surface object to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuSurfObjectCreate, + * ::cudaDestroySurfaceObject + */ +CUresult CUDAAPI cuSurfObjectDestroy(CUsurfObject surfObject); + +/** + * \brief Returns a surface object's resource descriptor + * + * Returns the resource descriptor for the surface object specified by \p surfObject. + * + * \param pResDesc - Resource descriptor + * \param surfObject - Surface object + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuSurfObjectCreate, + * ::cudaGetSurfaceObjectResourceDesc + */ +CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsurfObject surfObject); + +/** @} */ /* END CUDA_SURFOBJECT */ +#endif /* __CUDA_API_VERSION >= 5000 */ + +/** + * \defgroup CUDA_PEER_ACCESS Peer Context Memory Access + * + * ___MANBRIEF___ direct peer context memory access functions of the low-level + * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the direct peer context memory access functions + * of the low-level CUDA driver application programming interface. 
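+ *
+ * A typical usage sketch (the devices \p dev0 and \p dev1 and the context
+ * \p peerCtx on \p dev1 are illustrative; a context on \p dev0 is assumed
+ * to be current):
+ * \code
+   int canAccessPeer;
+   cuDeviceCanAccessPeer(&canAccessPeer, dev0, dev1);
+   if (canAccessPeer) {
+       cuCtxEnablePeerAccess(peerCtx, 0); /* grants dev0's context access */
+   }
+ * \endcode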
+ * + * @{ + */ + +#if __CUDA_API_VERSION >= 4000 + +/** + * \brief Queries if a device may directly access a peer device's memory. + * + * Returns in \p *canAccessPeer a value of 1 if contexts on \p dev are capable of + * directly accessing memory from contexts on \p peerDev and 0 otherwise. + * If direct access of \p peerDev from \p dev is possible, then access may be + * enabled on two specific contexts by calling ::cuCtxEnablePeerAccess(). + * + * \param canAccessPeer - Returned access capability + * \param dev - Device from which allocations on \p peerDev are to + * be directly accessed. + * \param peerDev - Device on which the allocations to be directly accessed + * by \p dev reside. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuCtxEnablePeerAccess, + * ::cuCtxDisablePeerAccess, + * ::cudaDeviceCanAccessPeer + */ +CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevice peerDev); + +/** + * \brief Enables direct access to memory allocations in a peer context. + * + * If both the current context and \p peerContext are on devices which support unified + * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING) and same + * major compute capability, then on success all allocations from \p peerContext will + * immediately be accessible by the current context. See \ref CUDA_UNIFIED for additional + * details. + * + * Note that access granted by this call is unidirectional and that in order to access + * memory from the current context in \p peerContext, a separate symmetric call + * to ::cuCtxEnablePeerAccess() is required. + * + * There is a system-wide maximum of eight peer connections per device. + * + * Returns ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED if ::cuDeviceCanAccessPeer() indicates + * that the ::CUdevice of the current context cannot directly access memory + * from the ::CUdevice of \p peerContext. + * + * Returns ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED if direct access of + * \p peerContext from the current context has already been enabled. + * + * Returns ::CUDA_ERROR_TOO_MANY_PEERS if direct peer access is not possible + * because hardware resources required for peer access have been exhausted. + * + * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, \p peerContext + * is not a valid context, or if the current context is \p peerContext. + * + * Returns ::CUDA_ERROR_INVALID_VALUE if \p Flags is not 0. + * + * \param peerContext - Peer context to enable direct access to from the current context + * \param Flags - Reserved for future use and must be set to 0 + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, + * ::CUDA_ERROR_TOO_MANY_PEERS, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuDeviceCanAccessPeer, + * ::cuCtxDisablePeerAccess, + * ::cudaDeviceEnablePeerAccess + */ +CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags); + +/** + * \brief Disables direct access to memory allocations in a peer context and + * unregisters any registered allocations. + * + Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has + * not yet been enabled from \p peerContext to the current context. 
+ *
+ * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, or if
+ * \p peerContext is not a valid context.
+ *
+ * \param peerContext - Peer context to disable direct access to
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceCanAccessPeer,
+ * ::cuCtxEnablePeerAccess,
+ * ::cudaDeviceDisablePeerAccess
+ */
+CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext);
+
+#endif /* __CUDA_API_VERSION >= 4000 */
+
+#if __CUDA_API_VERSION >= 8000
+
+/**
+ * \brief Queries attributes of the link between two devices.
+ *
+ * Returns in \p *value the value of the requested attribute \p attrib of the
+ * link between \p srcDevice and \p dstDevice. The supported attributes are:
+ * - ::CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK: A relative value indicating the
+ *   performance of the link between two devices.
+ * - ::CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED: 1 if P2P access is enabled.
+ * - ::CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED: 1 if atomic operations over
+ *   the link are supported.
+ *
+ * Returns ::CUDA_ERROR_INVALID_DEVICE if \p srcDevice or \p dstDevice are not valid
+ * or if they represent the same device.
+ *
+ * Returns ::CUDA_ERROR_INVALID_VALUE if \p attrib is not valid or if \p value is
+ * a null pointer.
+ *
+ * \param value - Returned value of the requested attribute
+ * \param attrib - The requested attribute of the link between \p srcDevice and \p dstDevice.
+ * \param srcDevice - The source device of the target link.
+ * \param dstDevice - The destination device of the target link.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuCtxEnablePeerAccess,
+ * ::cuCtxDisablePeerAccess,
+ * ::cuDeviceCanAccessPeer,
+ * ::cudaDeviceGetP2PAttribute
+ */
+CUresult CUDAAPI cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice);
+
+#endif /* __CUDA_API_VERSION >= 8000 */
+
+/** @} */ /* END CUDA_PEER_ACCESS */
+
+/**
+ * \defgroup CUDA_GRAPHICS Graphics Interoperability
+ *
+ * ___MANBRIEF___ graphics interoperability functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the graphics interoperability functions of the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Unregisters a graphics resource for access by CUDA
+ *
+ * Unregisters the graphics resource \p resource so it is not accessible by
+ * CUDA unless registered again.
+ *
+ * If \p resource is invalid then ::CUDA_ERROR_INVALID_HANDLE is
+ * returned.
+ *
+ * \param resource - Resource to unregister
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsD3D9RegisterResource,
+ * ::cuGraphicsD3D10RegisterResource,
+ * ::cuGraphicsD3D11RegisterResource,
+ * ::cuGraphicsGLRegisterBuffer,
+ * ::cuGraphicsGLRegisterImage,
+ * ::cudaGraphicsUnregisterResource
+ */
+CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource);
+
+/**
+ * \brief Get an array through which to access a subresource of a mapped graphics resource.
+ *
+ * Returns in \p *pArray an array through which the subresource of the mapped
+ * graphics resource \p resource which corresponds to array index \p arrayIndex
+ * and mipmap level \p mipLevel may be accessed. The value set in \p *pArray may
+ * change every time that \p resource is mapped.
+ *
+ * If \p resource is not a texture then it cannot be accessed via an array and
+ * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned.
+ * If \p arrayIndex is not a valid array index for \p resource then
+ * ::CUDA_ERROR_INVALID_VALUE is returned.
+ * If \p mipLevel is not a valid mipmap level for \p resource then
+ * ::CUDA_ERROR_INVALID_VALUE is returned.
+ * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned.
+ *
+ * \param pArray - Returned array through which a subresource of \p resource may be accessed
+ * \param resource - Mapped resource to access
+ * \param arrayIndex - Array index for array textures or cubemap face
+ *                     index as defined by ::CUarray_cubemap_face for
+ *                     cubemap textures for the subresource to access
+ * \param mipLevel - Mipmap level for the subresource to access
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED,
+ * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsResourceGetMappedPointer,
+ * ::cudaGraphicsSubResourceGetMappedArray
+ */
+CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
+
+#if __CUDA_API_VERSION >= 5000
+
+/**
+ * \brief Get a mipmapped array through which to access a mapped graphics resource.
+ *
+ * Returns in \p *pMipmappedArray a mipmapped array through which the mapped graphics
+ * resource \p resource may be accessed. The value set in \p *pMipmappedArray may change every time
+ * that \p resource is mapped.
+ *
+ * If \p resource is not a texture then it cannot be accessed via a mipmapped array and
+ * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned.
+ * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned.
+ *
+ * \param pMipmappedArray - Returned mipmapped array through which \p resource may be accessed
+ * \param resource - Mapped resource to access
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED,
+ * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsResourceGetMappedPointer,
+ * ::cudaGraphicsResourceGetMappedMipmappedArray
+ */
+CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource);
+
+#endif /* __CUDA_API_VERSION >= 5000 */
+
+#if __CUDA_API_VERSION >= 3020
+/**
+ * \brief Get a device pointer through which to access a mapped graphics resource.
+ *
+ * Returns in \p *pDevPtr a pointer through which the mapped graphics resource
+ * \p resource may be accessed.
+ * Returns in \p pSize the size of the memory in bytes which may be accessed from that pointer.
+ * The value set in \p *pDevPtr may change every time that \p resource is mapped.
+ *
+ * If \p resource is not a buffer then it cannot be accessed via a pointer and
+ * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER is returned.
+ * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned.
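+ *
+ * For example (\p resource is illustrative and assumed to be registered
+ * from a buffer and currently mapped):
+ * \code
+   CUdeviceptr dptr;
+   size_t size;
+   cuGraphicsResourceGetMappedPointer(&dptr, &size, resource);
+ * \endcode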
+ *
+ * \param pDevPtr - Returned pointer through which \p resource may be accessed
+ * \param pSize - Returned size of the buffer accessible starting at \p *pDevPtr
+ * \param resource - Mapped resource to access
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED,
+ * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsMapResources,
+ * ::cuGraphicsSubResourceGetMappedArray,
+ * ::cudaGraphicsResourceGetMappedPointer
+ */
+CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource);
+#endif /* __CUDA_API_VERSION >= 3020 */
+
+/**
+ * \brief Sets usage flags for mapping a graphics resource
+ *
+ * Sets \p flags for mapping the graphics resource \p resource.
+ *
+ * Changes to \p flags will take effect the next time \p resource is mapped.
+ * The \p flags argument may be any of the following:
+ *
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA kernels. This is the default value.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READONLY: Specifies that CUDA kernels which
+ *   access this resource will not write to this resource.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITEDISCARD: Specifies that CUDA kernels
+ *   which access this resource will not read from this resource and will
+ *   write over the entire contents of the resource, so none of the data
+ *   previously stored in the resource will be preserved.
+ *
+ * If \p resource is presently mapped for access by CUDA then
+ * ::CUDA_ERROR_ALREADY_MAPPED is returned.
+ * If \p flags is not one of the above values then ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * \param resource - Registered resource to set flags for
+ * \param flags - Parameters for resource mapping
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsMapResources,
+ * ::cudaGraphicsResourceSetMapFlags
+ */
+CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
+
+/**
+ * \brief Map graphics resources for access by CUDA
+ *
+ * Maps the \p count graphics resources in \p resources for access by CUDA.
+ *
+ * The resources in \p resources may be accessed by CUDA until they
+ * are unmapped. The graphics API from which \p resources were registered
+ * should not access any resources while they are mapped by CUDA. If an
+ * application does so, the results are undefined.
+ *
+ * This function provides the synchronization guarantee that any graphics calls
+ * issued before ::cuGraphicsMapResources() will complete before any subsequent CUDA
+ * work issued in \p stream begins.
+ *
+ * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned.
+ * If any of \p resources are presently mapped for access by CUDA then ::CUDA_ERROR_ALREADY_MAPPED is returned.
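+ *
+ * A typical map/access/unmap sequence for a single registered resource
+ * (\p resource and \p hStream are illustrative; \p hStream may be 0 for
+ * the default stream):
+ * \code
+   cuGraphicsMapResources(1, &resource, hStream);
+   /* access the resource, e.g. via cuGraphicsResourceGetMappedPointer() */
+   cuGraphicsUnmapResources(1, &resource, hStream);
+ * \endcode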
+ * + * \param count - Number of resources to map + * \param resources - Resources to map for CUDA usage + * \param hStream - Stream with which to synchronize + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ALREADY_MAPPED, + * ::CUDA_ERROR_UNKNOWN + * \note_null_stream + * \notefnerr + * + * \sa + * ::cuGraphicsResourceGetMappedPointer, + * ::cuGraphicsSubResourceGetMappedArray, + * ::cuGraphicsUnmapResources, + * ::cudaGraphicsMapResources + */ +CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); + +/** + * \brief Unmap graphics resources. + * + * Unmaps the \p count graphics resources in \p resources. + * + * Once unmapped, the resources in \p resources may not be accessed by CUDA + * until they are mapped again. + * + * This function provides the synchronization guarantee that any CUDA work issued + * in \p stream before ::cuGraphicsUnmapResources() will complete before any + * subsequently issued graphics work begins. + * + * + * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned. + * If any of \p resources are not presently mapped for access by CUDA then ::CUDA_ERROR_NOT_MAPPED is returned. + * + * \param count - Number of resources to unmap + * \param resources - Resources to unmap + * \param hStream - Stream with which to synchronize + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED, + * ::CUDA_ERROR_UNKNOWN + * \note_null_stream + * \notefnerr + * + * \sa + * ::cuGraphicsMapResources, + * ::cudaGraphicsUnmapResources + */ +CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); + +/** @} */ /* END CUDA_GRAPHICS */ + +CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId); + + +/** + * CUDA API versioning support + */ +#if defined(__CUDA_API_VERSION_INTERNAL) + #undef cuMemHostRegister + #undef cuGraphicsResourceSetMapFlags + #undef cuLinkCreate + #undef cuLinkAddData + #undef cuLinkAddFile + #undef cuDeviceTotalMem + #undef cuCtxCreate + #undef cuModuleGetGlobal + #undef cuMemGetInfo + #undef cuMemAlloc + #undef cuMemAllocPitch + #undef cuMemFree + #undef cuMemGetAddressRange + #undef cuMemAllocHost + #undef cuMemHostGetDevicePointer + #undef cuMemcpyHtoD + #undef cuMemcpyDtoH + #undef cuMemcpyDtoD + #undef cuMemcpyDtoA + #undef cuMemcpyAtoD + #undef cuMemcpyHtoA + #undef cuMemcpyAtoH + #undef cuMemcpyAtoA + #undef cuMemcpyHtoAAsync + #undef cuMemcpyAtoHAsync + #undef cuMemcpy2D + #undef cuMemcpy2DUnaligned + #undef cuMemcpy3D + #undef cuMemcpyHtoDAsync + #undef cuMemcpyDtoHAsync + #undef cuMemcpyDtoDAsync + #undef cuMemcpy2DAsync + #undef cuMemcpy3DAsync + #undef cuMemsetD8 + #undef cuMemsetD16 + #undef cuMemsetD32 + #undef cuMemsetD2D8 + #undef cuMemsetD2D16 + #undef cuMemsetD2D32 + #undef cuArrayCreate + #undef cuArrayGetDescriptor + #undef cuArray3DCreate + #undef cuArray3DGetDescriptor + #undef cuTexRefSetAddress + #undef cuTexRefSetAddress2D + #undef cuTexRefGetAddress + #undef cuGraphicsResourceGetMappedPointer + #undef cuCtxDestroy + #undef cuCtxPopCurrent + #undef cuCtxPushCurrent + #undef cuStreamDestroy + #undef cuEventDestroy + #undef cuMemcpy + #undef cuMemcpyAsync + #undef cuMemcpyPeer + #undef 
cuMemcpyPeerAsync
+ #undef cuMemcpy3DPeer
+ #undef cuMemcpy3DPeerAsync
+ #undef cuMemsetD8Async
+ #undef cuMemsetD16Async
+ #undef cuMemsetD32Async
+ #undef cuMemsetD2D8Async
+ #undef cuMemsetD2D16Async
+ #undef cuMemsetD2D32Async
+ #undef cuStreamGetPriority
+ #undef cuStreamGetFlags
+ #undef cuStreamWaitEvent
+ #undef cuStreamAddCallback
+ #undef cuStreamAttachMemAsync
+ #undef cuStreamQuery
+ #undef cuStreamSynchronize
+ #undef cuEventRecord
+ #undef cuLaunchKernel
+ #undef cuGraphicsMapResources
+ #undef cuGraphicsUnmapResources
+ #undef cuStreamWriteValue32
+ #undef cuStreamWaitValue32
+ #undef cuStreamWriteValue64
+ #undef cuStreamWaitValue64
+ #undef cuStreamBatchMemOp
+ #undef cuMemPrefetchAsync
+ #undef cuLaunchCooperativeKernel
+#endif /* __CUDA_API_VERSION_INTERNAL */
+
+#if defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 4000 && __CUDA_API_VERSION < 6050)
+CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
+#endif /* defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 4000 && __CUDA_API_VERSION < 6050) */
+
+#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 6050
+CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
+#endif /* defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 6050 */
+
+#if defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 5050 && __CUDA_API_VERSION < 6050)
+CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
+CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name,
+ unsigned int numOptions, CUjit_option *options, void **optionValues);
+CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path,
+ unsigned int numOptions, CUjit_option *options, void **optionValues);
+#endif /* __CUDA_API_VERSION_INTERNAL || (__CUDA_API_VERSION >= 5050 && __CUDA_API_VERSION < 6050) */
+
+#if defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010)
+CUresult CUDAAPI cuTexRefSetAddress2D_v2(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
+#endif /* __CUDA_API_VERSION_INTERNAL || (__CUDA_API_VERSION >= 3020 && __CUDA_API_VERSION < 4010) */
+
+/**
+ * CUDA API made obsolete at API version 3020
+ */
+#if defined(__CUDA_API_VERSION_INTERNAL)
+ #define CUdeviceptr CUdeviceptr_v1
+ #define CUDA_MEMCPY2D_st CUDA_MEMCPY2D_v1_st
+ #define CUDA_MEMCPY2D CUDA_MEMCPY2D_v1
+ #define CUDA_MEMCPY3D_st CUDA_MEMCPY3D_v1_st
+ #define CUDA_MEMCPY3D CUDA_MEMCPY3D_v1
+ #define CUDA_ARRAY_DESCRIPTOR_st CUDA_ARRAY_DESCRIPTOR_v1_st
+ #define CUDA_ARRAY_DESCRIPTOR CUDA_ARRAY_DESCRIPTOR_v1
+ #define CUDA_ARRAY3D_DESCRIPTOR_st CUDA_ARRAY3D_DESCRIPTOR_v1_st
+ #define CUDA_ARRAY3D_DESCRIPTOR CUDA_ARRAY3D_DESCRIPTOR_v1
+#endif /* __CUDA_API_VERSION_INTERNAL */
+
+#if defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 3020
+
+typedef unsigned int CUdeviceptr;
+
+typedef struct CUDA_MEMCPY2D_st
+{
+ unsigned int srcXInBytes; /**< Source X in bytes */
+ unsigned int srcY; /**< Source Y */
+ CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+ const void *srcHost; /**< Source host pointer */
+ CUdeviceptr srcDevice; /**< Source device pointer */
+ CUarray srcArray; /**< Source array reference */
+ unsigned int srcPitch; /**< Source pitch (ignored when src is array) */
+
+ unsigned int dstXInBytes; /**< 
Destination X in bytes */ + unsigned int dstY; /**< Destination Y */ + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ + + unsigned int WidthInBytes; /**< Width of 2D memory copy in bytes */ + unsigned int Height; /**< Height of 2D memory copy */ +} CUDA_MEMCPY2D; + +typedef struct CUDA_MEMCPY3D_st +{ + unsigned int srcXInBytes; /**< Source X in bytes */ + unsigned int srcY; /**< Source Y */ + unsigned int srcZ; /**< Source Z */ + unsigned int srcLOD; /**< Source LOD */ + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ + const void *srcHost; /**< Source host pointer */ + CUdeviceptr srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + void *reserved0; /**< Must be NULL */ + unsigned int srcPitch; /**< Source pitch (ignored when src is array) */ + unsigned int srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ + + unsigned int dstXInBytes; /**< Destination X in bytes */ + unsigned int dstY; /**< Destination Y */ + unsigned int dstZ; /**< Destination Z */ + unsigned int dstLOD; /**< Destination LOD */ + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + void *reserved1; /**< Must be NULL */ + unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ + unsigned int dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ + + unsigned int WidthInBytes; /**< Width of 3D memory copy in bytes */ + unsigned int Height; /**< Height of 3D memory copy */ + unsigned int Depth; /**< Depth of 3D memory copy */ +} CUDA_MEMCPY3D; + +typedef struct CUDA_ARRAY_DESCRIPTOR_st +{ + unsigned int Width; /**< Width of array */ + unsigned int Height; /**< Height of array */ + + CUarray_format Format; /**< Array format */ + unsigned int NumChannels; /**< Channels per array element */ +} CUDA_ARRAY_DESCRIPTOR; + +typedef struct CUDA_ARRAY3D_DESCRIPTOR_st +{ + unsigned int Width; /**< Width of 3D array */ + unsigned int Height; /**< Height of 3D array */ + unsigned int Depth; /**< Depth of 3D array */ + + CUarray_format Format; /**< Array format */ + unsigned int NumChannels; /**< Channels per array element */ + unsigned int Flags; /**< Flags */ +} CUDA_ARRAY3D_DESCRIPTOR; + +CUresult CUDAAPI cuDeviceTotalMem(unsigned int *bytes, CUdevice dev); +CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); +CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, unsigned int *bytes, CUmodule hmod, const char *name); +CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total); +CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, unsigned int bytesize); +CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes); +CUresult CUDAAPI cuMemFree(CUdeviceptr dptr); +CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, unsigned int *psize, CUdeviceptr dptr); +CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize); +CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, 
void *p, unsigned int Flags); +CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr srcDevice, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); +CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream); +CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream); +CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy); +CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy); +CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy); +CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream); +CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream); +CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream); +CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream); +CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream); +CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, unsigned int N); +CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, unsigned int N); +CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, unsigned int N); +CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height); +CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height); +CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height); +CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray); +CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray); +CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray); +CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray); +CUresult CUDAAPI cuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, unsigned int bytes); +CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, unsigned int Pitch); +CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef); +CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, unsigned int *pSize, CUgraphicsResource resource); +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION < 3020 */ +#if 
defined(__CUDA_API_VERSION_INTERNAL) || __CUDA_API_VERSION < 4000 +CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); +CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx); +CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); +CUresult CUDAAPI cuStreamDestroy(CUstream hStream); +CUresult CUDAAPI cuEventDestroy(CUevent hEvent); +#endif /* __CUDA_API_VERSION_INTERNAL || __CUDA_API_VERSION < 4000 */ +#if defined(__CUDA_API_VERSION_INTERNAL) + #undef CUdeviceptr + #undef CUDA_MEMCPY2D_st + #undef CUDA_MEMCPY2D + #undef CUDA_MEMCPY3D_st + #undef CUDA_MEMCPY3D + #undef CUDA_ARRAY_DESCRIPTOR_st + #undef CUDA_ARRAY_DESCRIPTOR + #undef CUDA_ARRAY3D_DESCRIPTOR_st + #undef CUDA_ARRAY3D_DESCRIPTOR +#endif /* __CUDA_API_VERSION_INTERNAL */ + +#if defined(__CUDA_API_VERSION_INTERNAL) + CUresult CUDAAPI cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); + CUresult CUDAAPI cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); + CUresult CUDAAPI cuMemcpyDtoD_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); + CUresult CUDAAPI cuMemcpyDtoA_v2(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount); + CUresult CUDAAPI cuMemcpyAtoD_v2(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount); + CUresult CUDAAPI cuMemcpyHtoA_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount); + CUresult CUDAAPI cuMemcpyAtoH_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount); + CUresult CUDAAPI cuMemcpyAtoA_v2(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount); + CUresult CUDAAPI cuMemcpyHtoAAsync_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyAtoHAsync_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpy2D_v2(const CUDA_MEMCPY2D *pCopy); + CUresult CUDAAPI cuMemcpy2DUnaligned_v2(const CUDA_MEMCPY2D *pCopy); + CUresult CUDAAPI cuMemcpy3D_v2(const CUDA_MEMCPY3D *pCopy); + CUresult CUDAAPI cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyDtoDAsync_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpy2DAsync_v2(const CUDA_MEMCPY2D *pCopy, CUstream hStream); + CUresult CUDAAPI cuMemcpy3DAsync_v2(const CUDA_MEMCPY3D *pCopy, CUstream hStream); + CUresult CUDAAPI cuMemsetD8_v2(CUdeviceptr dstDevice, unsigned char uc, size_t N); + CUresult CUDAAPI cuMemsetD16_v2(CUdeviceptr dstDevice, unsigned short us, size_t N); + CUresult CUDAAPI cuMemsetD32_v2(CUdeviceptr dstDevice, unsigned int ui, size_t N); + CUresult CUDAAPI cuMemsetD2D8_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); + CUresult CUDAAPI cuMemsetD2D16_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height); + CUresult CUDAAPI cuMemsetD2D32_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height); + CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); + CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, 
size_t ByteCount); + CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy); + CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream); + + CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream); + CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream); + CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream); + CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream); + CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream); + CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream); + + CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); + CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); + CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); + CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); + CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags); + CUresult CUDAAPI cuStreamQuery(CUstream hStream); + CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); + CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); + CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); + CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); + CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); + CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); + CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); + CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); + CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); + CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); + CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream); + CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams); +#endif + +#ifdef __cplusplus +} +#endif + +#undef __CUDA_API_VERSION + +#endif /* __cuda_cuda_h__ */ + diff --git a/include/external/CUDA/cuda_device_runtime_api.h b/include/external/CUDA/cuda_device_runtime_api.h new file mode 100755 index 000000000..44de3afc3 --- /dev/null +++ b/include/external/CUDA/cuda_device_runtime_api.h @@ -0,0 
+1,248 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__CUDA_DEVICE_RUNTIME_API_H__) +#define __CUDA_DEVICE_RUNTIME_API_H__ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#if !defined(__CUDACC_RTC__) + +#if (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__) + +#if defined(__cplusplus) +extern "C" { +#endif + +struct cudaFuncAttributes; + +#if defined(_WIN32) +#define __NV_WEAK__ __declspec(nv_weak) +#else +#define __NV_WEAK__ __attribute__((nv_weak)) +#endif + +__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaMalloc(void **p, size_t s) +{ + return cudaErrorUnknown; +} + +__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c) +{ + return cudaErrorUnknown; +} + +__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device) +{ + return cudaErrorUnknown; +} + +__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaGetDevice(int *device) +{ + return cudaErrorUnknown; +} + +__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize) +{ + return cudaErrorUnknown; +} + +__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags) +{ + return cudaErrorUnknown; +} + +#undef __NV_WEAK__ + +#if defined(__cplusplus) +} +#endif + +#endif /* (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__) */ + +#endif /* !defined(__CUDACC_RTC__) */ + +#if defined(__cplusplus) && defined(__CUDACC__) /* Visible to nvcc front-end only */ +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350) // Visible to SM>=3.5 and "__host__ __device__" only + +#include "driver_types.h" +#include "host_defines.h" + +extern "C" +{ +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceSynchronize(void); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void); +extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error); +extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI 
cudaStreamWaitEvent_ptsz(cudaStream_t stream, cudaEvent_t event, unsigned int flags); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord_ptsz(cudaEvent_t event, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync_ptsz(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync_ptsz(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync_ptsz(const struct cudaMemcpy3DParms *p, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync_ptsz(void *devPtr, int value, size_t count, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync_ptsz(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync_ptsz(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion); + +/** + * \ingroup CUDART_EXECUTION + * \brief Obtains a parameter buffer + * + * Obtains a parameter buffer which can be filled with parameters for a kernel launch. + * Parameters passed to ::cudaLaunchDevice must be allocated via this function. + * + * This is a low level API and can only be accessed from Parallel Thread Execution (PTX). + * CUDA user code should use <<< >>> to launch kernels. 
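+ *
+ * As an illustration only (assumed names; in practice the compiler emits
+ * this sequence for a device-side launch such as child<<<grid, block>>>(arg)),
+ * the low-level flow looks roughly like:
+ *
+ * \code
+ * void *buf = cudaGetParameterBuffer(sizeof(int), sizeof(int));
+ * *(int *)buf = arg;   // fill in the single kernel parameter
+ * cudaLaunchDevice((void *)child, buf, grid, block, 0, stream);
+ * \endcode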
+ * + * \param alignment - Specifies alignment requirement of the parameter buffer + * \param size - Specifies size requirement in bytes + * + * \return + * Returns pointer to the allocated parameterBuffer + * \notefnerr + * + * \sa cudaLaunchDevice + */ +extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBuffer(size_t alignment, size_t size); + +/** + * \ingroup CUDART_EXECUTION + * \brief Launches a specified kernel + * + * Launches a specified kernel with the specified parameter buffer. A parameter buffer can be obtained + * by calling ::cudaGetParameterBuffer(). + * + * This is a low level API and can only be accessed from Parallel Thread Execution (PTX). + * CUDA user code should use <<< >>> to launch the kernels. + * + * \param func - Pointer to the kernel to be launched + * \param parameterBuffer - Holds the parameters to the launched kernel. parameterBuffer can be NULL. (Optional) + * \param gridDimension - Specifies grid dimensions + * \param blockDimension - Specifies block dimensions + * \param sharedMemSize - Specifies size of shared memory + * \param stream - Specifies the stream to be used + * + * \return + * ::cudaSuccess, ::cudaErrorInvalidDevice, ::cudaErrorLaunchMaxDepthExceeded, ::cudaErrorInvalidConfiguration, + * ::cudaErrorStartupFailure, ::cudaErrorLaunchPendingCountExceeded, ::cudaErrorLaunchOutOfResources + * \notefnerr + * \n Please refer to Execution Configuration and Parameter Buffer Layout from the CUDA Programming + * Guide for the detailed descriptions of launch configuration and parameter layout respectively. + * + * \sa cudaGetParameterBuffer + */ +extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBufferV2(void *func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice_ptsz(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2_ptsz(void *parameterBuffer, cudaStream_t stream); + +#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) && defined(__CUDA_ARCH__) + // When compiling for the device and per thread default stream is enabled, add + // a static inline redirect to the per thread stream entry points. 
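+ // Illustration (assumed build flag): compiling with nvcc's
+ // --default-stream per-thread option defines CUDA_API_PER_THREAD_DEFAULT_STREAM,
+ // so a call such as cudaLaunchDeviceV2(buf, stream) resolves to
+ // cudaLaunchDeviceV2_ptsz(buf, stream) through the inline wrappers below.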
+ + static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI + cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream) + { + return cudaLaunchDevice_ptsz(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream); + } + + static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI + cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream) + { + return cudaLaunchDeviceV2_ptsz(parameterBuffer, stream); + } +#else + extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream); + extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream); +#endif + +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags); + +extern __device__ __cudart_builtin__ unsigned long long CUDARTAPI cudaCGGetIntrinsicHandle(enum cudaCGScope scope); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGSynchronize(unsigned long long handle, unsigned int flags); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetSize(unsigned int *numThreads, unsigned int *numGrids, unsigned long long handle); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetRank(unsigned int *threadRank, unsigned int *gridRank, unsigned long long handle); +} + +template static __inline__ __device__ __cudart_builtin__ cudaError_t cudaMalloc(T **devPtr, size_t size); +template static __inline__ __device__ __cudart_builtin__ cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *attr, T *entry); +template static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize); +template static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize, unsigned int flags); + + +#endif // !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350) +#endif /* defined(__cplusplus) && defined(__CUDACC__) */ + +#endif /* !__CUDA_DEVICE_RUNTIME_API_H__ */ diff --git a/include/external/CUDA/cuda_fp16.h b/include/external/CUDA/cuda_fp16.h new file mode 100755 index 000000000..b724f1e9d --- /dev/null +++ b/include/external/CUDA/cuda_fp16.h @@ -0,0 +1,1969 @@ +/* +* Copyright 1993-2014 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. +* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. 
Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. +* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +* OF THESE LICENSED DELIVERABLES. +* +* U.S. Government End Users. These Licensed Deliverables are a +* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +* 1995), consisting of "commercial computer software" and "commercial +* computer software documentation" as such terms are used in 48 +* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +* only as a commercial end item. Consistent with 48 C.F.R.12.212 and +* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +* U.S. Government End Users acquire the Licensed Deliverables with +* only those rights set forth herein. +* +* Any use of the Licensed Deliverables in individual and commercial +* software must include, in the user documentation and internal +* comments to the code, the above Disclaimer and U.S. Government End +* Users Notice. +*/ + +/** +* \defgroup CUDA_MATH_INTRINSIC_HALF Half Precision Intrinsics +* This section describes half precision intrinsic functions that are +* only supported in device code. 
+*/ + +/** +* \defgroup CUDA_MATH__HALF_ARITHMETIC Half Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +*/ + +/** +* \defgroup CUDA_MATH__HALF2_ARITHMETIC Half2 Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +*/ + +/** +* \defgroup CUDA_MATH__HALF_COMPARISON Half Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +*/ + +/** +* \defgroup CUDA_MATH__HALF2_COMPARISON Half2 Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +*/ + +/** +* \defgroup CUDA_MATH__HALF_MISC Half Precision Conversion And Data Movement +* \ingroup CUDA_MATH_INTRINSIC_HALF +*/ + +/** +* \defgroup CUDA_MATH__HALF_FUNCTIONS Half Math Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +*/ + +/** +* \defgroup CUDA_MATH__HALF2_FUNCTIONS Half2 Math Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +*/ + +#ifndef __CUDA_FP16_H__ +#define __CUDA_FP16_H__ + +#if defined(__cplusplus) && defined(__CUDACC__) + +#if defined(__CUDACC_RTC__) +#define __CUDA_FP16_DECL__ __host__ __device__ +#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__ +#else /* !__CUDACC_RTC__ */ +#define __CUDA_FP16_DECL__ static __device__ __inline__ +#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__ +#endif /* __CUDACC_RTC__ */ + +#define __CUDA_FP16_TYPES_EXIST__ +/* Forward-declaration of structures defined in "cuda_fp16.hpp" */ +struct __half; +struct __half2; + +/* Vector type creation functions, match vector_functions.h */ +__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y); + +#undef __VECTOR_FUNCTIONS_DECL__ + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-to-nearest-even mode +* and returns \p half with converted value. +* +* Converts float number \p a to half precision in round-to-nearest-even mode. +* +* \return Returns \p half result with converted value. +*/ +__CUDA_FP16_DECL__ __half __float2half(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-to-nearest-even mode +* and returns \p half with converted value. +* +* Converts float number \p a to half precision in round-to-nearest-even mode. +* +* \return Returns \p half result with converted value. +*/ +__CUDA_FP16_DECL__ __half __float2half_rn(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-towards-zero mode +* and returns \p half with converted value. +* +* Converts float number \p a to half precision in round-towards-zero mode. +* +* \return Returns \p half result with converted value. +*/ +__CUDA_FP16_DECL__ __half __float2half_rz(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-down mode +* and returns \p half with converted value. +* +* Converts float number \p a to half precision in round-down mode. +* +* \return Returns \p half result with converted value. +*/ +__CUDA_FP16_DECL__ __half __float2half_rd(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-up mode +* and returns \p half with converted value. +* +* Converts float number \p a to half precision in round-up mode. +* +* \return Returns \p half result with converted value. +*/ +__CUDA_FP16_DECL__ __half __float2half_ru(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts \p half number to float. +* +* Converts half number \p a to float. +* +* \return Returns float result with converted value. 
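+*
+* For example, a round trip through the conversions above (device code;
+* the equality holds because 1.5 is exactly representable in half precision):
+*
+* \code
+* __half h = __float2half(1.5f);   // round-to-nearest-even conversion
+* float f = __half2float(h);       // f == 1.5f
+* \endcode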
+*/ +__CUDA_FP16_DECL__ float __half2float(const __half a); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-to-nearest-even mode. +* +* Convert the half-precision floating point value \p h to a signed integer in +* round-to-nearest-even mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ int __half2int_rn(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-towards-zero mode. +* +* Convert the half-precision floating point value \p h to a signed integer in +* round-towards-zero mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ int __half2int_rz(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-down mode. +* +* Convert the half-precision floating point value \p h to a signed integer in +* round-down mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ int __half2int_rd(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-up mode. +* +* Convert the half-precision floating point value \p h to a signed integer in +* round-up mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ int __half2int_ru(__half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-to-nearest-even mode. +* +* Convert the signed integer value \p i to a half-precision floating point +* value in round-to-nearest-even mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ __half __int2half_rn(int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-towards-zero mode. +* +* Convert the signed integer value \p i to a half-precision floating point +* value in round-towards-zero mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ __half __int2half_rz(int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-down mode. +* +* Convert the signed integer value \p i to a half-precision floating point +* value in round-down mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ __half __int2half_rd(int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-up mode. +* +* Convert the signed integer value \p i to a half-precision floating point +* value in round-up mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ __half __int2half_ru(int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-to-nearest-even +* mode. +* +* Convert the half-precision floating point value \p h to a signed short +* integer in round-to-nearest-even mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ short int __half2short_rn(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-towards-zero mode. +* +* Convert the half-precision floating point value \p h to a signed short +* integer in round-towards-zero mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ short int __half2short_rz(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-down mode. +* +* Convert the half-precision floating point value \p h to a signed short +* integer in round-down mode. +* +* \return Returns converted value. 
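+*
+* For example, the four rounding modes diverge on a value such as 2.5,
+* which half precision represents exactly:
+*
+* \code
+* __half h = __float2half(2.5f);
+* short a = __half2short_rn(h);   // 2, ties round to even
+* short b = __half2short_rz(h);   // 2, toward zero
+* short c = __half2short_rd(h);   // 2, toward negative infinity
+* short d = __half2short_ru(h);   // 3, toward positive infinity
+* \endcode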
+*/ +__CUDA_FP16_DECL__ short int __half2short_rd(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-up mode. +* +* Convert the half-precision floating point value \p h to a signed short +* integer in round-up mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ short int __half2short_ru(__half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-to-nearest-even +* mode. +* +* Convert the signed short integer value \p i to a half-precision floating +* point value in round-to-nearest-even mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ __half __short2half_rn(short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-towards-zero mode. +* +* Convert the signed short integer value \p i to a half-precision floating +* point value in round-towards-zero mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ __half __short2half_rz(short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-down mode. +* +* Convert the signed short integer value \p i to a half-precision floating +* point value in round-down mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ __half __short2half_rd(short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-up mode. +* +* Convert the signed short integer value \p i to a half-precision floating +* point value in round-up mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ __half __short2half_ru(short int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-to-nearest-even mode. +* +* Convert the half-precision floating point value \p h to an unsigned integer +* in round-to-nearest-even mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_rn(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-towards-zero mode. +* +* Convert the half-precision floating point value \p h to an unsigned integer +* in round-towards-zero mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_rz(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-down mode. +* +* Convert the half-precision floating point value \p h to an unsigned integer +* in round-down mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_rd(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-up mode. +* +* Convert the half-precision floating point value \p h to an unsigned integer +* in round-up mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_ru(__half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-to-nearest-even mode. +* +* Convert the unsigned integer value \p i to a half-precision floating point +* value in round-to-nearest-even mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ __half __uint2half_rn(unsigned int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-towards-zero mode. 
+* +* Convert the unsigned integer value \p i to a half-precision floating point +* value in round-towards-zero mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ __half __uint2half_rz(unsigned int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-down mode. +* +* Convert the unsigned integer value \p i to a half-precision floating point +* value in round-down mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ __half __uint2half_rd(unsigned int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-up mode. +* +* Convert the unsigned integer value \p i to a half-precision floating point +* value in round-up mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ __half __uint2half_ru(unsigned int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned short integer in round-to-nearest-even +* mode. +* +* Convert the half-precision floating point value \p h to an unsigned short +* integer in round-to-nearest-even mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned short integer in round-towards-zero +* mode. +* +* Convert the half-precision floating point value \p h to an unsigned short +* integer in round-towards-zero mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rz(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned short integer in round-down mode. +* +* Convert the half-precision floating point value \p h to an unsigned short +* integer in round-down mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned short integer in round-up mode. +* +* Convert the half-precision floating point value \p h to an unsigned short +* integer in round-up mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(__half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned short integer to a half in round-to-nearest-even +* mode. +* +* Convert the unsigned short integer value \p i to a half-precision floating +* point value in round-to-nearest-even mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ __half __ushort2half_rn(unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned short integer to a half in round-towards-zero +* mode. +* +* Convert the unsigned short integer value \p i to a half-precision floating +* point value in round-towards-zero mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ __half __ushort2half_rz(unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned short integer to a half in round-down mode. +* +* Convert the unsigned short integer value \p i to a half-precision floating +* point value in round-down mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ __half __ushort2half_rd(unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned short integer to a half in round-up mode. +* +* Convert the unsigned short integer value \p i to a half-precision floating +* point value in round-up mode. +* +* \return Returns converted value. 
+*/ +__CUDA_FP16_DECL__ __half __ushort2half_ru(unsigned short int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-to-nearest-even +* mode. +* +* Convert the half-precision floating point value \p h to an unsigned 64-bit +* integer in round-to-nearest-even mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-towards-zero +* mode. +* +* Convert the half-precision floating point value \p h to an unsigned 64-bit +* integer in round-towards-zero mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rz(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-down mode. +* +* Convert the half-precision floating point value \p h to an unsigned 64-bit +* integer in round-down mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-up mode. +* +* Convert the half-precision floating point value \p h to an unsigned 64-bit +* integer in round-up mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(__half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-to-nearest-even +* mode. +* +* Convert the unsigned 64-bit integer value \p i to a half-precision floating +* point value in round-to-nearest-even mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ __half __ull2half_rn(unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-towards-zero +* mode. +* +* Convert the unsigned 64-bit integer value \p i to a half-precision floating +* point value in round-towards-zero mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ __half __ull2half_rz(unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-down mode. +* +* Convert the unsigned 64-bit integer value \p i to a half-precision floating +* point value in round-down mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ __half __ull2half_rd(unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-up mode. +* +* Convert the unsigned 64-bit integer value \p i to a half-precision floating +* point value in round-up mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ __half __ull2half_ru(unsigned long long int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-to-nearest-even +* mode. +* +* Convert the half-precision floating point value \p h to a signed 64-bit +* integer in round-to-nearest-even mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ long long int __half2ll_rn(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-towards-zero mode. +* +* Convert the half-precision floating point value \p h to a signed 64-bit +* integer in round-towards-zero mode. +* +* \return Returns converted value. 
+*/ +__CUDA_FP16_DECL__ long long int __half2ll_rz(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-down mode. +* +* Convert the half-precision floating point value \p h to a signed 64-bit +* integer in round-down mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ long long int __half2ll_rd(__half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-up mode. +* +* Convert the half-precision floating point value \p h to a signed 64-bit +* integer in round-up mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ long long int __half2ll_ru(__half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed 64-bit integer to a half in round-to-nearest-even +* mode. +* +* Convert the signed 64-bit integer value \p i to a half-precision floating +* point value in round-to-nearest-even mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ __half __ll2half_rn(long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed 64-bit integer to a half in round-towards-zero mode. +* +* Convert the signed 64-bit integer value \p i to a half-precision floating +* point value in round-towards-zero mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ __half __ll2half_rz(long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed 64-bit integer to a half in round-down mode. +* +* Convert the signed 64-bit integer value \p i to a half-precision floating +* point value in round-down mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ __half __ll2half_rd(long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed 64-bit integer to a half in round-up mode. +* +* Convert the signed 64-bit integer value \p i to a half-precision floating +* point value in round-up mode. +* +* \return Returns converted value. +*/ +__CUDA_FP16_DECL__ __half __ll2half_ru(long long int i); + +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Truncate input argument to the integral part. +* +* Round \p h to the nearest integer value that does not exceed \p h in +* magnitude. +* +* \return Returns truncated integer value. +*/ +__CUDA_FP16_DECL__ __half htrunc(const __half h); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculate ceiling of the input argument. +* +* Compute the smallest integer value not less than \p h. +* +* \return Returns ceiling expressed as a half-precision floating point number. +*/ +__CUDA_FP16_DECL__ __half hceil(const __half h); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculate the largest integer less than or equal to \p h. +* +* Calculate the largest integer value which is less than or equal to \p h. +* +* \return Returns floor expressed as half-precision floating point number. +*/ +__CUDA_FP16_DECL__ __half hfloor(const __half h); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Round input to nearest integer value in half-precision floating point +* number. +* +* Round \p h to the nearest integer value in half-precision floating point +* format, with halfway cases rounded to the nearest even integer value. +* +* \return Returns rounded integer value expressed as half-precision floating +* point number. +*/ +__CUDA_FP16_DECL__ __half hrint(const __half h); + +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Truncate \p half2 vector input argument to the integral part. 
+* +* Round each component of vector \p h to the nearest integer value that does +* not exceed \p h in magnitude. +* +* \return Returns \p half2 vector truncated integer value. +*/ +__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculate \p half2 vector ceiling of the input argument. +* +* For each component of vector \p h compute the smallest integer value not less +* than \p h. +* +* \return Returns \p half2 vector ceiling expressed as a pair of half-precision +* floating point numbers. +*/ +__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculate the largest integer less than or equal to \p h. +* +* For each component of vector \p h calculate the largest integer value which +* is less than or equal to \p h. +* +* \return Returns \p half2 vector floor expressed as a pair of half-precision +* floating point number. +*/ +__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Round input to nearest integer value in half-precision floating point +* number. +* +* Round each component of \p half2 vector \p h to the nearest integer value in +* half-precision floating point format, with halfway cases rounded to the +* nearest even integer value. +* +* \return Returns \p half2 vector of rounded integer values expressed as +* half-precision floating point numbers. +*/ +__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts input to half precision in round-to-nearest-even mode and +* populates both halves of \p half2 with converted value. +* +* Converts input \p a to half precision in round-to-nearest-even mode and +* populates both halves of \p half2 with converted value. +* +* \return Returns \p half2 with both halves equal to the converted half +* precision number. +*/ +__CUDA_FP16_DECL__ __half2 __float2half2_rn(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both input floats to half precision in round-to-nearest-even +* mode and returns \p half2 with converted values. +* +* Converts both input floats to half precision in round-to-nearest-even mode +* and combines the results into one \p half2 number. Low 16 bits of the return +* value correspond to the input \p a, high 16 bits correspond to the input \p +* b. +* +* \return Returns \p half2 which has corresponding halves equal to the +* converted input floats. +*/ +__CUDA_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both components of float2 number to half precision in +* round-to-nearest-even mode and returns \p half2 with converted values. +* +* Converts both components of float2 to half precision in round-to-nearest +* mode and combines the results into one \p half2 number. Low 16 bits of the +* return value correspond to \p a.x and high 16 bits of the return value +* correspond to \p a.y. +* +* \return Returns \p half2 which has corresponding halves equal to the +* converted float2 components. +*/ +__CUDA_FP16_DECL__ __half2 __float22half2_rn(const float2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both halves of \p half2 to float2 and returns the result. +* +* Converts both halves of \p half2 input \p a to float2 and returns the +* result. +* +* \return Returns converted float2. 
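+*
+* For example, packing two floats and unpacking them again (device code;
+* both values are exactly representable in half precision):
+*
+* \code
+* __half2 p = __floats2half2_rn(1.0f, 2.0f);   // low half = 1.0, high half = 2.0
+* float lo = __low2float(p);                   // 1.0f
+* float hi = __high2float(p);                  // 2.0f
+* float2 f2 = __half22float2(p);               // f2.x == 1.0f, f2.y == 2.0f
+* \endcode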
+*/ +__CUDA_FP16_DECL__ float2 __half22float2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts low 16 bits of \p half2 to float and returns the result +* +* Converts low 16 bits of \p half2 input \p a to 32 bit floating point number +* and returns the result. +* +* \return Returns low 16 bits of \p a converted to float. +*/ +__CUDA_FP16_DECL__ float __low2float(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Returns \p half2 with both halves equal to the input value. +* +* Returns \p half2 number with both halves equal to the input \p a \p half +* number. +* +* \return Returns \p half2 with both halves equal to the input \p a. +*/ +__CUDA_FP16_DECL__ __half2 __half2half2(const __half a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts high 16 bits of \p half2 to float and returns the result +* +* Converts high 16 bits of \p half2 input \p a to 32 bit floating point number +* and returns the result. +* +* \return Returns high 16 bits of \p a converted to float. +*/ +__CUDA_FP16_DECL__ float __high2float(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Swaps both halves of the \p half2 input. +* +* Swaps both halves of the \p half2 input and returns a new \p half2 number +* with swapped halves. +* +* \return Returns \p half2 with halves swapped. +*/ +__CUDA_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Extracts low 16 bits from each of the two \p half2 inputs and combines +* into one \p half2 number. +* +* Extracts low 16 bits from each of the two \p half2 inputs and combines into +* one \p half2 number. Low 16 bits from input \p a is stored in low 16 bits of +* the return value, low 16 bits from input \p b is stored in high 16 bits of +* the return value. +* +* \return Returns \p half2 which contains low 16 bits from \p a and \p b. +*/ +__CUDA_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Extracts high 16 bits from each of the two \p half2 inputs and +* combines into one \p half2 number. +* +* Extracts high 16 bits from each of the two \p half2 inputs and combines into +* one \p half2 number. High 16 bits from input \p a is stored in low 16 bits of +* the return value, high 16 bits from input \p b is stored in high 16 bits of +* the return value. +* +* \return Returns \p half2 which contains high 16 bits from \p a and \p b. +*/ +__CUDA_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Returns high 16 bits of \p half2 input. +* +* Returns high 16 bits of \p half2 input \p a. +* +* \return Returns \p half which contains high 16 bits of the input. +*/ +__CUDA_FP16_DECL__ __half __high2half(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Returns low 16 bits of \p half2 input. +* +* Returns low 16 bits of \p half2 input \p a. +* +* \return Returns \p half which contains low 16 bits of the input. +*/ +__CUDA_FP16_DECL__ __half __low2half(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Checks if the input \p half number is infinite. +* +* Checks if the input \p half number \p a is infinite. +* +* \return Returns -1 iff \p a is equal to negative infinity, 1 iff \p a is +* equal to positive infinity and 0 otherwise. +*/ +__CUDA_FP16_DECL__ int __hisinf(const __half a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Combines two \p half numbers into one \p half2 number. 
+* +* Combines two input \p half number \p a and \p b into one \p half2 number. +* Input \p a is stored in low 16 bits of the return value, input \p b is stored +* in high 16 bits of the return value. +* +* \return Returns \p half2 number which has one half equal to \p a and the +* other to \p b. +*/ +__CUDA_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Extracts low 16 bits from \p half2 input. +* +* Extracts low 16 bits from \p half2 input \p a and returns a new \p half2 +* number which has both halves equal to the extracted bits. +* +* \return Returns \p half2 with both halves equal to low 16 bits from the +* input. +*/ +__CUDA_FP16_DECL__ __half2 __low2half2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Extracts high 16 bits from \p half2 input. +* +* Extracts high 16 bits from \p half2 input \p a and returns a new \p half2 +* number which has both halves equal to the extracted bits. +* +* \return Returns \p half2 with both halves equal to high 16 bits from the +* input. +*/ +__CUDA_FP16_DECL__ __half2 __high2half2(const __half2 a); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in a \p half as a signed short integer. +* +* Reinterprets the bits in the half-precision floating point value \p h +* as a signed short integer. +* +* \return Returns reinterpreted value. +*/ +__CUDA_FP16_DECL__ short int __half_as_short(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in a \p half as an unsigned short integer. +* +* Reinterprets the bits in the half-precision floating point value \p h +* as an unsigned short integer. +* +* \return Returns reinterpreted value. +*/ +__CUDA_FP16_DECL__ unsigned short int __half_as_ushort(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in a signed short integer as a \p half. +* +* Reinterprets the bits in the signed short integer value \p i as a +* half-precision floating point value. +* +* \return Returns reinterpreted value. +*/ +__CUDA_FP16_DECL__ __half __short_as_half(const short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in an unsigned short integer as a \p half. +* +* Reinterprets the bits in the unsigned short integer value \p i as a +* half-precision floating point value. +* +* \return Returns reinterpreted value. +*/ +__CUDA_FP16_DECL__ __half __ushort_as_half(const unsigned short int i); + +#if __CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__) +#if !defined warpSize && !defined __local_warpSize +#define warpSize 32 +#define __local_warpSize +#endif + +#if defined(_WIN32) +# define __DEPRECATED__(msg) __declspec(deprecated(msg)) +#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__)))) +# define __DEPRECATED__(msg) __attribute__((deprecated)) +#else +# define __DEPRECATED__(msg) __attribute__((deprecated(msg))) +#endif + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#define __WSB_DEPRECATION_MESSAGE(x) #x"() is not valid on compute_70 and above, and should be replaced with "#x"_sync()." \ + "To continue using "#x"(), specify virtual architecture compute_60 when targeting sm_70 and above, for example, using the pair of compiler options: -arch=compute_60 -code=sm_70." +#else +#define __WSB_DEPRECATION_MESSAGE(x) #x"() is deprecated in favor of "#x"_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)." 
+#endif + +__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) __half2 __shfl(__half2 var, int delta, int width = warpSize); +__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) __half2 __shfl_up(__half2 var, unsigned int delta, int width = warpSize); +__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down))__half2 __shfl_down(__half2 var, unsigned int delta, int width = warpSize); +__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half2 __shfl_xor(__half2 var, int delta, int width = warpSize); +__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) __half __shfl(__half var, int delta, int width = warpSize); +__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) __half __shfl_up(__half var, unsigned int delta, int width = warpSize); +__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) __half __shfl_down(__half var, unsigned int delta, int width = warpSize); +__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half __shfl_xor(__half var, int delta, int width = warpSize); + +__CUDA_FP16_DECL__ __half2 __shfl_sync(unsigned mask, __half2 var, int delta, int width = warpSize); +__CUDA_FP16_DECL__ __half2 __shfl_up_sync(unsigned mask, __half2 var, unsigned int delta, int width = warpSize); +__CUDA_FP16_DECL__ __half2 __shfl_down_sync(unsigned mask, __half2 var, unsigned int delta, int width = warpSize); +__CUDA_FP16_DECL__ __half2 __shfl_xor_sync(unsigned mask, __half2 var, int delta, int width = warpSize); +__CUDA_FP16_DECL__ __half __shfl_sync(unsigned mask, __half var, int delta, int width = warpSize); +__CUDA_FP16_DECL__ __half __shfl_up_sync(unsigned mask, __half var, unsigned int delta, int width = warpSize); +__CUDA_FP16_DECL__ __half __shfl_down_sync(unsigned mask, __half var, unsigned int delta, int width = warpSize); +__CUDA_FP16_DECL__ __half __shfl_xor_sync(unsigned mask, __half var, int delta, int width = warpSize); + +#if defined(__local_warpSize) +#undef warpSize +#undef __local_warpSize +#endif +#endif /*__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__) */ + +#if defined(__cplusplus) && ( __CUDA_ARCH__ >=320 || !defined(__CUDA_ARCH__) ) +__CUDA_FP16_DECL__ __half2 __ldg(const __half2 *ptr); +__CUDA_FP16_DECL__ __half __ldg(const __half *ptr); +__CUDA_FP16_DECL__ __half2 __ldcg(const __half2 *ptr); +__CUDA_FP16_DECL__ __half __ldcg(const __half *ptr); +__CUDA_FP16_DECL__ __half2 __ldca(const __half2 *ptr); +__CUDA_FP16_DECL__ __half __ldca(const __half *ptr); +__CUDA_FP16_DECL__ __half2 __ldcs(const __half2 *ptr); +__CUDA_FP16_DECL__ __half __ldcs(const __half *ptr); +#endif /*defined(__cplusplus) && ( __CUDA_ARCH__ >=320 || !defined(__CUDA_ARCH__) )*/ + +#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs half2 vector if-equal comparison. +* +* Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* +* \return Returns the \p half2 vector result of if-equal comparison of vectors +* \p a and \p b. +*/ +__CUDA_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector not-equal comparison. +* +* Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. 
+* NaN inputs generate false results. +* +* \return Returns the \p half2 vector result of not-equal comparison of vectors +* \p a and \p b. +*/ +__CUDA_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-equal comparison. +* +* Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* +* \return Returns the \p half2 vector result of less-equal comparison of +* vectors \p a and \p b. +*/ +__CUDA_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-equal comparison. +* +* Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* +* \return Returns the \p half2 vector result of greater-equal comparison of +* vectors \p a and \p b. +*/ +__CUDA_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-than comparison. +* +* Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* +* \return Returns the \p half2 vector result of less-than comparison of vectors +* \p a and \p b. +*/ +__CUDA_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-than comparison. +* +* Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* +* \return Returns the half2 vector result of greater-than comparison of vectors +* \p a and \p b. +*/ +__CUDA_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered if-equal comparison. +* +* Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* +* \return Returns the \p half2 vector result of unordered if-equal comparison +* of vectors \p a and \p b. +*/ +__CUDA_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered not-equal comparison. +* +* Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* +* \return Returns the \p half2 vector result of unordered not-equal comparison +* of vectors \p a and \p b. +*/ +__CUDA_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-equal comparison. +* +* Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* +* \return Returns the \p half2 vector result of unordered less-equal comparison +* of vectors \p a and \p b. 
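+*
+* \par
+* Sketch of the ordered/unordered distinction (illustrative; \p nan2 is a
+* hypothetical \p half2 with a NaN in one lane):
+* \code
+* __half2 r1 = __hle2(nan2, nan2);  // NaN lane: ordered compare   -> 0.0
+* __half2 r2 = __hleu2(nan2, nan2); // NaN lane: unordered compare -> 1.0
+* \endcode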
+*/ +__CUDA_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-equal comparison. +* +* Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* +* \return Returns the \p half2 vector result of unordered greater-equal +* comparison of vectors \p a and \p b. +*/ +__CUDA_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-than comparison. +* +* Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* +* \return Returns the \p half2 vector result of unordered less-than comparison +* of vectors \p a and \p b. +*/ +__CUDA_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-than comparison. +* +* Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* +* \return Returns the \p half2 vector result of unordered greater-than +* comparison of vectors \p a and \p b. +*/ +__CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Determine whether \p half2 argument is a NaN. +* +* Determine whether each half of input \p half2 number \p a is a NaN. +* +* \return Returns \p half2 which has the corresponding \p half results set to +* 1.0 for true, or 0.0 for false. +*/ +__CUDA_FP16_DECL__ __half2 __hisnan2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector addition in round-to-nearest-even mode. +* +* Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest +* mode. +* +* \return Returns the \p half2 vector result of adding vectors \p a and \p b. +*/ +__CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode. +* +* Subtracts \p half2 input vector \p b from input vector \p a in +* round-to-nearest-even mode. +* +* \return Returns the \p half2 vector result of subtraction vector \p b from \p +* a. +*/ +__CUDA_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode. +* +* Performs \p half2 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode. +* +* \return Returns the \p half2 vector result of multiplying vectors \p a and \p +* b. +*/ +__CUDA_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half2 vector division in round-to-nearest-even mode. +* +* Divides \p half2 input vector \p a by input vector \p b in round-to-nearest +* mode. +* +* \return Returns the \p half2 vector result of division \p a by \p b. 
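+*
+* \par
+* Minimal sketch (illustrative, sm_53+ device code); both lanes divide
+* independently, and the chosen values are exact in half precision:
+* \code
+* __half2 n = __floats2half2_rn(1.0f, 2.0f);
+* __half2 d = __floats2half2_rn(4.0f, 8.0f);
+* __half2 q = __h2div(n, d); // (0.25, 0.25) in the (low, high) lanes
+* \endcode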
+*/ +__CUDA_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector addition in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest +* mode, and clamps the results to range [0.0, 1.0]. NaN results are flushed to +* +0.0. +* +* \return Returns the \p half2 vector result of adding vectors \p a and \p b +* with saturation. +*/ +__CUDA_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* Subtracts \p half2 input vector \p b from input vector \p a in +* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN +* results are flushed to +0.0. +* +* \return Returns the \p half2 vector result of subtraction vector \p b from \p +* a with saturation. +*/ +__CUDA_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* Performs \p half2 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN +* results are flushed to +0.0. +* +* \return Returns the \p half2 vector result of multiplying vectors \p a and \p +* b with saturation. +*/ +__CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even +* mode. +* +* Performs \p half2 vector multiply on inputs \p a and \p b, +* then performs a \p half2 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* +* \return Returns the \p half2 vector result of the fused multiply-add +* operation on vectors \p a, \p b, and \p c. +*/ +__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even +* mode, with saturation to [0.0, 1.0]. +* +* Performs \p half2 vector multiply on inputs \p a and \p b, +* then performs a \p half2 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the +* results to range [0.0, 1.0]. NaN results are flushed to +0.0. +* +* \return Returns the \p half2 vector result of the fused multiply-add +* operation on vectors \p a, \p b, and \p c with saturation. +*/ +__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Negates both halves of the input \p half2 number and returns the +* result. +* +* Negates both halves of the input \p half2 number \p a and returns the result. +* +* \return Returns \p half2 number with both halves negated. +*/ +__CUDA_FP16_DECL__ __half2 __hneg2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half addition in round-to-nearest-even mode. +* +* Performs \p half addition of inputs \p a and \p b, in round-to-nearest-even +* mode. +* +* \return Returns the \p half result of adding \p a and \p b. 
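+*
+* \par
+* Minimal sketch (illustrative, sm_53+ device code); operands chosen to be
+* exactly representable in half precision:
+* \code
+* __half a = __float2half(1.5f);
+* __half b = __float2half(0.25f);
+* __half s = __hadd(a, b); // 1.75, computed natively in fp16
+* \endcode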
+*/ +__CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half subtraction in round-to-nearest-even mode. +* +* Subtracts \p half input \p b from input \p a in round-to-nearest +* mode. +* +* \return Returns the \p half result of subtraction \p b from \p a. +*/ +__CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half multiplication in round-to-nearest-even mode. +* +* Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest +* mode. +* +* \return Returns the \p half result of multiplying \p a and \p b. +*/ +__CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half division in round-to-nearest-even mode. +* +* Divides \p half input \p a by input \p b in round-to-nearest +* mode. +* +* \return Returns the \p half result of division \p a by \p b. +*/ +__CUDA_FP16_DECL__ __half __hdiv(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half addition in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* Performs \p half add of inputs \p a and \p b, in round-to-nearest-even mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* +* \return Returns the \p half result of adding \p a and \p b with saturation. +*/ +__CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half subtraction in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* Subtracts \p half input \p b from input \p a in round-to-nearest +* mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* +* \return Returns the \p half result of subtraction \p b from \p a +* with saturation. +*/ +__CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half multiplication in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest +* mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to +* +0.0. +* +* \return Returns the \p half result of multiplying \p a and \p b with +* saturation. +*/ +__CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half fused multiply-add in round-to-nearest-even mode. +* +* Performs \p half multiply on inputs \p a and \p b, +* then performs a \p half add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* +* \return Returns the \p half result of the fused multiply-add operation on \p +* a, \p b, and \p c. +*/ +__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half fused multiply-add in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* Performs \p half multiply on inputs \p a and \p b, +* then performs a \p half add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the result +* to range [0.0, 1.0]. NaN results are flushed to +0.0. +* +* \return Returns the \p half result of the fused multiply-add operation on \p +* a, \p b, and \p c with saturation. 
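+*
+* \par
+* Minimal sketch (illustrative, sm_53+ device code): the saturating form is
+* convenient when a result must stay inside [0.0, 1.0]:
+* \code
+* __half w = __float2half(0.75f);
+* __half x = __float2half(0.5f);
+* __half c = __float2half(0.9f);
+* __half r = __hfma_sat(w, x, c); // 0.75*0.5 + 0.9 = 1.275 -> clamped to 1.0
+* \endcode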
+*/ +__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Negates input \p half number and returns the result. +* +* Negates input \p half number and returns the result. +* +* \return Returns negated \p half input \p a. +*/ +__CUDA_FP16_DECL__ __half __hneg(const __half a); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector if-equal comparison, and returns boolean true +* iff both \p half results are true, boolean false otherwise. +* +* Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* +* \return Returns boolean true if both \p half results of if-equal comparison +* of vectors \p a and \p b are true, boolean false otherwise. +*/ +__CUDA_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector not-equal comparison, and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* +* \return Returns boolean true if both \p half results of not-equal comparison +* of vectors \p a and \p b are true, boolean false otherwise. +*/ +__CUDA_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-equal comparison, and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* +* \return Returns boolean true if both \p half results of less-equal comparison +* of vectors \p a and \p b are true, boolean false otherwise. +*/ +__CUDA_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-equal comparison, and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* +* \return Returns boolean true if both \p half results of greater-equal +* comparison of vectors \p a and \p b are true, boolean false otherwise. +*/ +__CUDA_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-than comparison, and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* +* \return Returns boolean true if both \p half results of less-than comparison +* of vectors \p a and \p b are true, boolean false otherwise. 
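+*
+* \par
+* Minimal sketch (illustrative; \p err2 and \p tol2 are hypothetical \p half2
+* values): the boolean form folds both lane comparisons into one branch.
+* \code
+* if (__hblt2(err2, tol2)) {
+*     done = true; // taken only when err < tol holds in BOTH lanes
+* }
+* \endcode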
+*/ +__CUDA_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-than comparison, and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* +* \return Returns boolean true if both \p half results of greater-than +* comparison of vectors \p a and \p b are true, boolean false otherwise. +*/ +__CUDA_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered if-equal comparison, and returns +* boolean true iff both \p half results are true, boolean false otherwise. +* +* Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* +* \return Returns boolean true if both \p half results of unordered if-equal +* comparison of vectors \p a and \p b are true, boolean false otherwise. +*/ +__CUDA_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered not-equal comparison, and returns +* boolean true iff both \p half results are true, boolean false otherwise. +* +* Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* +* \return Returns boolean true if both \p half results of unordered not-equal +* comparison of vectors \p a and \p b are true, boolean false otherwise. +*/ +__CUDA_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-equal comparison, and returns +* boolean true iff both \p half results are true, boolean false otherwise. +* +* Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* +* \return Returns boolean true if both \p half results of unordered less-equal +* comparison of vectors \p a and \p b are true, boolean false otherwise. +*/ +__CUDA_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-equal comparison, and +* returns boolean true iff both \p half results are true, boolean false +* otherwise. +* +* Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* +* \return Returns boolean true if both \p half results of unordered +* greater-equal comparison of vectors \p a and \p b are true, boolean false +* otherwise. 
+*/ +__CUDA_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-than comparison, and returns +* boolean true iff both \p half results are true, boolean false otherwise. +* +* Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* +* \return Returns boolean true if both \p half results of unordered less-than +* comparison of vectors \p a and \p b are true, boolean false otherwise. +*/ +__CUDA_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-than comparison, and +* returns boolean true iff both \p half results are true, boolean false +* otherwise. +* +* Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* +* \return Returns boolean true if both \p half results of unordered +* greater-than comparison of vectors \p a and \p b are true, boolean false +* otherwise. +*/ +__CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half if-equal comparison. +* +* Performs \p half if-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* +* \return Returns boolean result of if-equal comparison of \p a and \p b. +*/ +__CUDA_FP16_DECL__ bool __heq(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half not-equal comparison. +* +* Performs \p half not-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* +* \return Returns boolean result of not-equal comparison of \p a and \p b. +*/ +__CUDA_FP16_DECL__ bool __hne(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half less-equal comparison. +* +* Performs \p half less-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* +* \return Returns boolean result of less-equal comparison of \p a and \p b. +*/ +__CUDA_FP16_DECL__ bool __hle(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half greater-equal comparison. +* +* Performs \p half greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* +* \return Returns boolean result of greater-equal comparison of \p a and \p b. +*/ +__CUDA_FP16_DECL__ bool __hge(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half less-than comparison. +* +* Performs \p half less-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* +* \return Returns boolean result of less-than comparison of \p a and \p b. +*/ +__CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half greater-than comparison. +* +* Performs \p half greater-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* +* \return Returns boolean result of greater-than comparison of \p a and \p b. 
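+*
+* \par
+* Minimal sketch (illustrative): a hypothetical maximum helper built on
+* __hgt, not part of this header. Because NaN compares false, a NaN in
+* \p a selects \p b here.
+* \code
+* __device__ __half hmax_sketch(const __half a, const __half b)
+* {
+*     return __hgt(a, b) ? a : b;
+* }
+* \endcode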
+*/ +__CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered if-equal comparison. +* +* Performs \p half if-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* +* \return Returns boolean result of unordered if-equal comparison of \p a and +* \p b. +*/ +__CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered not-equal comparison. +* +* Performs \p half not-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* +* \return Returns boolean result of unordered not-equal comparison of \p a and +* \p b. +*/ +__CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered less-equal comparison. +* +* Performs \p half less-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* +* \return Returns boolean result of unordered less-equal comparison of \p a and +* \p b. +*/ +__CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered greater-equal comparison. +* +* Performs \p half greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* +* \return Returns boolean result of unordered greater-equal comparison of \p a +* and \p b. +*/ +__CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered less-than comparison. +* +* Performs \p half less-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* +* \return Returns boolean result of unordered less-than comparison of \p a and +* \p b. +*/ +__CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered greater-than comparison. +* +* Performs \p half greater-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* +* \return Returns boolean result of unordered greater-than comparison of \p a +* and \p b. +*/ +__CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Determine whether \p half argument is a NaN. +* +* Determine whether \p half value \p a is a NaN. +* +* \return Returns boolean true iff argument is a NaN, boolean false otherwise. +*/ +__CUDA_FP16_DECL__ bool __hisnan(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half square root in round-to-nearest-even mode. +* +* Calculates \p half square root of input \p a in round-to-nearest-even mode. +* +* \return Returns \p half square root of \p a. +*/ +__CUDA_FP16_DECL__ __half hsqrt(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half reciprocal square root in round-to-nearest-even +* mode. +* +* Calculates \p half reciprocal square root of input \p a in round-to-nearest +* mode. +* +* \return Returns \p half reciprocal square root of \p a. +*/ +__CUDA_FP16_DECL__ __half hrsqrt(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half reciprocal in round-to-nearest-even mode. +* +* Calculates \p half reciprocal of input \p a in round-to-nearest-even mode. +* +* \return Returns \p half reciprocal of \p a. 
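+*
+* \par
+* Minimal sketch (illustrative, sm_53+ device code; \p x and \p y are
+* hypothetical \p half coordinates): normalizing with hsqrt and hrcp instead
+* of a divide.
+* \code
+* __half len = hsqrt(__hadd(__hmul(x, x), __hmul(y, y)));
+* __half inv = hrcp(len);      // 1/len in fp16
+* __half xn  = __hmul(x, inv); // x / len without __hdiv
+* \endcode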
+*/ +__CUDA_FP16_DECL__ __half hrcp(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half natural logarithm in round-to-nearest-even mode. +* +* Calculates \p half natural logarithm of input \p a in round-to-nearest-even +* mode. +* +* \return Returns \p half natural logarithm of \p a. +*/ +__CUDA_FP16_DECL__ __half hlog(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half binary logarithm in round-to-nearest-even mode. +* +* Calculates \p half binary logarithm of input \p a in round-to-nearest-even +* mode. +* +* \return Returns \p half binary logarithm of \p a. +*/ +__CUDA_FP16_DECL__ __half hlog2(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half decimal logarithm in round-to-nearest-even mode. +* +* Calculates \p half decimal logarithm of input \p a in round-to-nearest-even +* mode. +* +* \return Returns \p half decimal logarithm of \p a. +*/ +__CUDA_FP16_DECL__ __half hlog10(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half natural exponential function in round-to-nearest +* mode. +* +* Calculates \p half natural exponential function of input \p a in +* round-to-nearest-even mode. +* +* \return Returns \p half natural exponential function of \p a. +*/ +__CUDA_FP16_DECL__ __half hexp(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half binary exponential function in round-to-nearest +* mode. +* +* Calculates \p half binary exponential function of input \p a in +* round-to-nearest-even mode. +* +* \return Returns \p half binary exponential function of \p a. +*/ +__CUDA_FP16_DECL__ __half hexp2(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half decimal exponential function in round-to-nearest +* mode. +* +* Calculates \p half decimal exponential function of input \p a in +* round-to-nearest-even mode. +* +* \return Returns \p half decimal exponential function of \p a. +*/ +__CUDA_FP16_DECL__ __half hexp10(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half cosine in round-to-nearest-even mode. +* +* Calculates \p half cosine of input \p a in round-to-nearest-even mode. +* +* \return Returns \p half cosine of \p a. +*/ +__CUDA_FP16_DECL__ __half hcos(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half sine in round-to-nearest-even mode. +* +* Calculates \p half sine of input \p a in round-to-nearest-even mode. +* +* \return Returns \p half sine of \p a. +*/ +__CUDA_FP16_DECL__ __half hsin(const __half a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector square root in round-to-nearest-even mode. +* +* Calculates \p half2 square root of input vector \p a in round-to-nearest +* mode. +* +* \return Returns \p half2 square root of vector \p a. +*/ +__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector reciprocal square root in round-to-nearest +* mode. +* +* Calculates \p half2 reciprocal square root of input vector \p a in +* round-to-nearest-even mode. +* +* \return Returns \p half2 reciprocal square root of vector \p a. +*/ +__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector reciprocal in round-to-nearest-even mode. +* +* Calculates \p half2 reciprocal of input vector \p a in round-to-nearest-even +* mode. 
+* +* \return Returns \p half2 reciprocal of vector \p a. +*/ +__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector natural logarithm in round-to-nearest-even +* mode. +* +* Calculates \p half2 natural logarithm of input vector \p a in +* round-to-nearest-even mode. +* +* \return Returns \p half2 natural logarithm of vector \p a. +*/ +__CUDA_FP16_DECL__ __half2 h2log(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector binary logarithm in round-to-nearest-even +* mode. +* +* Calculates \p half2 binary logarithm of input vector \p a in round-to-nearest +* mode. +* +* \return Returns \p half2 binary logarithm of vector \p a. +*/ +__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector decimal logarithm in round-to-nearest-even +* mode. +* +* Calculates \p half2 decimal logarithm of input vector \p a in +* round-to-nearest-even mode. +* +* \return Returns \p half2 decimal logarithm of vector \p a. +*/ +__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector exponential function in round-to-nearest +* mode. +* +* Calculates \p half2 exponential function of input vector \p a in +* round-to-nearest-even mode. +* +* \return Returns \p half2 exponential function of vector \p a. +*/ +__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector binary exponential function in +* round-to-nearest-even mode. +* +* Calculates \p half2 binary exponential function of input vector \p a in +* round-to-nearest-even mode. +* +* \return Returns \p half2 binary exponential function of vector \p a. +*/ +__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector decimal exponential function in +* round-to-nearest-even mode. +* +* Calculates \p half2 decimal exponential function of input vector \p a in +* round-to-nearest-even mode. +* +* \return Returns \p half2 decimal exponential function of vector \p a. +*/ +__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector cosine in round-to-nearest-even mode. +* +* Calculates \p half2 cosine of input vector \p a in round-to-nearest-even +* mode. +* +* \return Returns \p half2 cosine of vector \p a. +*/ +__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector sine in round-to-nearest-even mode. +* +* Calculates \p half2 sine of input vector \p a in round-to-nearest-even mode. +* +* \return Returns \p half2 sine of vector \p a. +*/ +__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a); + +#endif /*if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ + +#undef __CUDA_FP16_DECL__ + +#endif /* defined(__cplusplus) && defined(__CUDACC__) */ + +/* Note the .hpp file is included even for host-side compilation, to capture the "half" & "half2" definitions */ +#include "cuda_fp16.hpp" + +#endif /* end of include guard: __CUDA_FP16_H__ */ diff --git a/include/external/CUDA/cuda_fp16.hpp b/include/external/CUDA/cuda_fp16.hpp new file mode 100755 index 000000000..dcbab74ae --- /dev/null +++ b/include/external/CUDA/cuda_fp16.hpp @@ -0,0 +1,1797 @@ +/* +* Copyright 1993-2014 NVIDIA Corporation. All rights reserved. 
+* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. +* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. +* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +* OF THESE LICENSED DELIVERABLES. +* +* U.S. Government End Users. These Licensed Deliverables are a +* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +* 1995), consisting of "commercial computer software" and "commercial +* computer software documentation" as such terms are used in 48 +* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +* only as a commercial end item. Consistent with 48 C.F.R.12.212 and +* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +* U.S. Government End Users acquire the Licensed Deliverables with +* only those rights set forth herein. +* +* Any use of the Licensed Deliverables in individual and commercial +* software must include, in the user documentation and internal +* comments to the code, the above Disclaimer and U.S. Government End +* Users Notice. +*/ + +#if !defined(__CUDA_FP16_HPP__) +#define __CUDA_FP16_HPP__ + +/* C++11 header for std::move */ +#if __cplusplus >= 201103L +#include +#endif /* __cplusplus >= 201103L */ + +/* Set up function decorations */ +#if defined(__CUDACC_RTC__) +#define __CUDA_FP16_DECL__ __host__ __device__ +#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__ +#define __CUDA_HOSTDEVICE__ __host__ __device__ +#elif defined(__CUDACC__) /* !__CUDACC_RTC__ but yes __CUDACC__ */ +#define __CUDA_FP16_DECL__ static __device__ __inline__ +#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__ +#define __CUDA_HOSTDEVICE__ __host__ __device__ +#else /* !__CUDACC_RTC and !__CUDACC__ (i.e. 
host non-nvcc compiler */ +#define __CUDA_HOSTDEVICE__ +#endif /* __CUDACC_RTC__ and __CUDACC__ */ + +/* Set up structure-alignment attribute */ +#if defined(__CUDACC__) +#define __CUDA_ALIGN__(align) __align__(align) +#else +/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */ +#if __cplusplus >= 201103L +#define __CUDA_ALIGN__(n) alignas(n) /* C++11 kindly gives us a keyword for this */ +#else /* !(__cplusplus >= 201103L)*/ +#if defined(__GNUC__) /* || defined(__IBMC__) || defined(__clang__) || defined(__PGI) */ +#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n))) +#elif defined(_MSC_VER) /* || defined(__ICC) */ +#define __CUDA_ALIGN__(n) __declspec(align(n)) +#else +#define __CUDA_ALIGN__(n) +#endif /* defined(__GNUC__) */ +#endif /* __cplusplus >= 201103L */ +#endif /* defined(__CUDACC__) */ + + +/* Macros to allow half & half2 to be used by inline assembly */ +#define __HALF_TO_US(var) *(reinterpret_cast(&(var))) +#define __HALF_TO_CUS(var) *(reinterpret_cast(&(var))) +#define __HALF2_TO_UI(var) *(reinterpret_cast(&(var))) +#define __HALF2_TO_CUI(var) *(reinterpret_cast(&(var))) + + +/** +* Types which allow static initialization of "half" and "half2" until +* these become an actual builtin. Note this initialization is as a +* bitfield representation of "half", and not a conversion from short->half. +* Such a representation will be deprecated in a future version of CUDA. +* (Note these are visible to non-nvcc compilers, including C-only compilation) +*/ +typedef struct __CUDA_ALIGN__(2) { + unsigned short x; +} __half_raw; + +typedef struct __CUDA_ALIGN__(4) { + unsigned short x, y; +} __half2_raw; + +/* All other definitions in this file are only visible to C++ compilers */ +#if defined(__cplusplus) + +/* Hide GCC member initialization list warnings because of host/device in-function init requirement */ +#if defined(__GNUC__) +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Weffc++" +#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ +#endif /* defined(__GNUC__) */ + +struct __CUDA_ALIGN__(2) __half { +protected: + unsigned short __x; + +public: +#if __cplusplus >= 201103L + __half() = default; +#else + __CUDA_HOSTDEVICE__ __half() { } +#endif /* __cplusplus >= 201103L */ + + /* Convert to/from __half_raw */ + __CUDA_HOSTDEVICE__ __half(const __half_raw &hr) : __x(hr.x) { } + __CUDA_HOSTDEVICE__ __half &operator=(const __half_raw &hr) { __x = hr.x; return *this; } + __CUDA_HOSTDEVICE__ operator __half_raw() const { __half_raw ret; ret.x = __x; return ret; } + +/* Member functions are only available to nvcc compilation */ +#if defined(__CUDACC__) +#if !defined(__CUDA_NO_HALF_CONVERSIONS__) + /* Allow automatic construction from types supported natively in hardware */ + /* Note we do avoid constructor init-list because of special host/device compilation rules */ + __device__ __half(float f) { __x = __float2half(f).__x; } + __device__ __half(double f) { __x = __float2half((float)f).__x; } + __device__ __half(short val) { __x = __short2half_rn(val).__x; } + __device__ __half(unsigned short val) { __x = __ushort2half_rn(val).__x; } + __device__ __half(int val) { __x = __int2half_rn(val).__x; } + __device__ __half(unsigned int val) { __x = __uint2half_rn(val).__x; } + __device__ __half(long long val) { __x = __ll2half_rn(val).__x; } + __device__ __half(unsigned long long val) { __x = 
__ull2half_rn(val).__x; } + + /* Allow automatic casts to supported builtin types, matching all that are permitted with float */ + __device__ operator float() const { return __half2float(*this); } + __device__ __half &operator=(float f) { __x = __float2half(f).__x; return *this; } + + /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */ + __device__ __half &operator=(double f) { __x = __float2half((float)f).__x; return *this; } + + __device__ operator short() const { return __half2short_rn(*this); } + __device__ __half &operator=(short val) { __x = __short2half_rn(val).__x; return *this; } + + __device__ operator unsigned short() const { return __half2ushort_rn(*this); } + __device__ __half &operator=(unsigned short val) { __x = __ushort2half_rn(val).__x; return *this; } + + __device__ operator int() const { return __half2int_rn(*this); } + __device__ __half &operator=(int val) { __x = __int2half_rn(val).__x; return *this; } + + __device__ operator unsigned int() const { return __half2uint_rn(*this); } + __device__ __half &operator=(unsigned int val) { __x = __uint2half_rn(val).__x; return *this; } + + __device__ operator long long() const { return __half2ll_rn(*this); } + __device__ __half &operator=(long long val) { __x = __ll2half_rn(val).__x; return *this; } + + __device__ operator unsigned long long() const { return __half2ull_rn(*this); } + __device__ __half &operator=(unsigned long long val) { __x = __ull2half_rn(val).__x; return *this; } + + /* Boolean conversion - note both 0 and -0 must return false */ + __device__ operator bool() const { return (__x & 0x7FFF) != 0; } +#endif /* !defined(__CUDA_NO_HALF_CONVERSIONS__) */ +#endif /* defined(__CUDACC__) */ +}; + +/* Global-space operator functions are only available to nvcc compilation */ +#if defined(__CUDACC__) + +/* Arithmetic FP16 operations only supported on arch >= 5.3 */ +#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) +#if !defined(__CUDA_NO_HALF_OPERATORS__) +/* Some basic arithmetic operations expected of a builtin */ +__device__ __forceinline__ __half operator+(const __half &lh, const __half &rh) { return __hadd(lh, rh); } +__device__ __forceinline__ __half operator-(const __half &lh, const __half &rh) { return __hsub(lh, rh); } +__device__ __forceinline__ __half operator*(const __half &lh, const __half &rh) { return __hmul(lh, rh); } +__device__ __forceinline__ __half operator/(const __half &lh, const __half &rh) { return __hdiv(lh, rh); } + +__device__ __forceinline__ __half &operator+=(__half &lh, const __half &rh) { lh = __hadd(lh, rh); return lh; } +__device__ __forceinline__ __half &operator-=(__half &lh, const __half &rh) { lh = __hsub(lh, rh); return lh; } +__device__ __forceinline__ __half &operator*=(__half &lh, const __half &rh) { lh = __hmul(lh, rh); return lh; } +__device__ __forceinline__ __half &operator/=(__half &lh, const __half &rh) { lh = __hdiv(lh, rh); return lh; } + +/* Note for increment and decrement we use the raw value 0x3C00 equating to half(1.0f), to avoid the extra conversion */ +__device__ __forceinline__ __half &operator++(__half &h) { __half_raw one; one.x = 0x3C00; h += one; return h; } +__device__ __forceinline__ __half &operator--(__half &h) { __half_raw one; one.x = 0x3C00; h -= one; return h; } +__device__ __forceinline__ __half operator++(__half &h, int) { __half ret = h; __half_raw one; one.x = 0x3C00; h += one; return ret; } +__device__ __forceinline__ __half operator--(__half &h, int) { __half ret = h; __half_raw one; one.x = 0x3C00; h -= one; return 
ret; } + +/* Unary plus and inverse operators */ +__device__ __forceinline__ __half operator+(const __half &h) { return h; } +__device__ __forceinline__ __half operator-(const __half &h) { return __hneg(h); } + +/* Some basic comparison operations to make it look like a builtin */ +__device__ __forceinline__ bool operator==(const __half &lh, const __half &rh) { return __heq(lh, rh); } +__device__ __forceinline__ bool operator!=(const __half &lh, const __half &rh) { return __hne(lh, rh); } +__device__ __forceinline__ bool operator> (const __half &lh, const __half &rh) { return __hgt(lh, rh); } +__device__ __forceinline__ bool operator< (const __half &lh, const __half &rh) { return __hlt(lh, rh); } +__device__ __forceinline__ bool operator>=(const __half &lh, const __half &rh) { return __hge(lh, rh); } +__device__ __forceinline__ bool operator<=(const __half &lh, const __half &rh) { return __hle(lh, rh); } +#endif /* !defined(__CUDA_NO_HALF_OPERATORS__) */ +#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) */ +#endif /* defined(__CUDACC__) */ + +/* __half2 is visible to non-nvcc host compilers */ +struct __CUDA_ALIGN__(4) __half2 { + __half x, y; + + // All construct/copy/assign/move +public: +#if __cplusplus >= 201103L + __half2() = default; + __CUDA_HOSTDEVICE__ __half2(__half2 &&src) { __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src)); } + __CUDA_HOSTDEVICE__ __half2 &operator=(__half2 &&src) { __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src)); return *this; } +#else + __CUDA_HOSTDEVICE__ __half2() { } +#endif /* __cplusplus >= 201103L */ + __CUDA_HOSTDEVICE__ __half2(const __half &a, const __half &b) : x(a), y(b) { } + __CUDA_HOSTDEVICE__ __half2(const __half2 &src) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); } + __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &src) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); return *this; } + + /* Convert to/from __half2_raw */ + __CUDA_HOSTDEVICE__ __half2(const __half2_raw &h2r ) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); } + __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2_raw &h2r) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); return *this; } + __CUDA_HOSTDEVICE__ operator __half2_raw() const { __half2_raw ret; __HALF2_TO_UI(ret) = __HALF2_TO_CUI(*this); return ret; } +}; + +/* Restore -Weffc++ warnings from here on */ +#if defined(__GNUC__) +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#pragma GCC diagnostic pop +#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ +#endif /* defined(__GNUC__) */ + +#undef __CUDA_HOSTDEVICE__ +#undef __CUDA_ALIGN__ + +/* All intrinsic functions are only available to nvcc compilers */ +#if defined(__CUDACC__) + +/* CUDA vector-types compatible vector creation function (note returns __half2, not half2) */ +__VECTOR_FUNCTIONS_DECL__ __half2 make_half2(__half x, __half y) +{ + __half2 t; t.x = x; t.y = y; return t; +} +#undef __VECTOR_FUNCTIONS_DECL__ + + +/* Definitions of intrinsics */ +__CUDA_FP16_DECL__ int __half2int_rn(__half h) +{ + int i; + asm("cvt.rni.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ int __half2int_rz(__half h) +{ + int i; + asm("cvt.rzi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ int __half2int_rd(__half h) +{ + int i; + asm("cvt.rmi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ int __half2int_ru(__half h) +{ + int i; + asm("cvt.rpi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); + return i; +} 
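+/* Illustrative note (a sketch, not part of the original header): the
+   _rn/_rz/_rd/_ru suffixes map one-to-one onto the PTX "cvt" rounding
+   modifiers rni/rzi/rmi/rpi used above. For example, for h holding 2.5:
+     __half2int_rn(h) -> 2   round to nearest, ties to even
+     __half2int_rz(h) -> 2   truncate toward zero
+     __half2int_rd(h) -> 2   round down (toward -inf)
+     __half2int_ru(h) -> 3   round up (toward +inf)   */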
+__CUDA_FP16_DECL__ __half __int2half_rn(int i) +{ + __half h; + asm("cvt.rn.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __int2half_rz(int i) +{ + __half h; + asm("cvt.rz.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __int2half_rd(int i) +{ + __half h; + asm("cvt.rm.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __int2half_ru(int i) +{ + __half h; + asm("cvt.rp.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} + +__CUDA_FP16_DECL__ short int __half2short_rn(__half h) +{ + short int i; + asm("cvt.rni.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ short int __half2short_rz(__half h) +{ + short int i; + asm("cvt.rzi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ short int __half2short_rd(__half h) +{ + short int i; + asm("cvt.rmi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ short int __half2short_ru(__half h) +{ + short int i; + asm("cvt.rpi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ __half __short2half_rn(short int i) +{ + __half h; + asm("cvt.rn.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __short2half_rz(short int i) +{ + __half h; + asm("cvt.rz.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __short2half_rd(short int i) +{ + __half h; + asm("cvt.rm.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __short2half_ru(short int i) +{ + __half h; + asm("cvt.rp.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} + +__CUDA_FP16_DECL__ unsigned int __half2uint_rn(__half h) +{ + unsigned int i; + asm("cvt.rni.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned int __half2uint_rz(__half h) +{ + unsigned int i; + asm("cvt.rzi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned int __half2uint_rd(__half h) +{ + unsigned int i; + asm("cvt.rmi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned int __half2uint_ru(__half h) +{ + unsigned int i; + asm("cvt.rpi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ __half __uint2half_rn(unsigned int i) +{ + __half h; + asm("cvt.rn.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __uint2half_rz(unsigned int i) +{ + __half h; + asm("cvt.rz.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __uint2half_rd(unsigned int i) +{ + __half h; + asm("cvt.rm.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __uint2half_ru(unsigned int i) +{ + __half h; + asm("cvt.rp.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} + +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(__half h) +{ + unsigned short int i; + asm("cvt.rni.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rz(__half h) +{ + unsigned short int i; + asm("cvt.rzi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(__half h) +{ + unsigned short int i; + asm("cvt.rmi.u16.f16 %0, %1;" : 
"=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(__half h) +{ + unsigned short int i; + asm("cvt.rpi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ __half __ushort2half_rn(unsigned short int i) +{ + __half h; + asm("cvt.rn.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ushort2half_rz(unsigned short int i) +{ + __half h; + asm("cvt.rz.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ushort2half_rd(unsigned short int i) +{ + __half h; + asm("cvt.rm.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ushort2half_ru(unsigned short int i) +{ + __half h; + asm("cvt.rp.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} + +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(__half h) +{ + unsigned long long int i; + asm("cvt.rni.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rz(__half h) +{ + unsigned long long int i; + asm("cvt.rzi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(__half h) +{ + unsigned long long int i; + asm("cvt.rmi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(__half h) +{ + unsigned long long int i; + asm("cvt.rpi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ __half __ull2half_rn(unsigned long long int i) +{ + __half h; + asm("cvt.rn.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ull2half_rz(unsigned long long int i) +{ + __half h; + asm("cvt.rz.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ull2half_rd(unsigned long long int i) +{ + __half h; + asm("cvt.rm.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ull2half_ru(unsigned long long int i) +{ + __half h; + asm("cvt.rp.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} + +__CUDA_FP16_DECL__ long long int __half2ll_rn(__half h) +{ + long long int i; + asm("cvt.rni.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ long long int __half2ll_rz(__half h) +{ + long long int i; + asm("cvt.rzi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ long long int __half2ll_rd(__half h) +{ + long long int i; + asm("cvt.rmi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ long long int __half2ll_ru(__half h) +{ + long long int i; + asm("cvt.rpi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); + return i; +} +__CUDA_FP16_DECL__ __half __ll2half_rn(long long int i) +{ + __half h; + asm("cvt.rn.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ll2half_rz(long long int i) +{ + __half h; + asm("cvt.rz.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ll2half_rd(long long int i) +{ + __half h; + asm("cvt.rm.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ll2half_ru(long long int i) +{ + __half h; + asm("cvt.rp.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} + +__CUDA_FP16_DECL__ __half htrunc(const __half h) 
+{ + __half r; + asm("cvt.rzi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} +__CUDA_FP16_DECL__ __half hceil(const __half h) +{ + __half r; + asm("cvt.rpi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} +__CUDA_FP16_DECL__ __half hfloor(const __half h) +{ + __half r; + asm("cvt.rmi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} +__CUDA_FP16_DECL__ __half hrint(const __half h) +{ + __half r; + asm("cvt.rni.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} + +__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " cvt.rzi.f16.f16 low, low;\n" + " cvt.rzi.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " cvt.rpi.f16.f16 low, low;\n" + " cvt.rpi.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " cvt.rmi.f16.f16 low, low;\n" + " cvt.rmi.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " cvt.rni.f16.f16 low, low;\n" + " cvt.rni.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} + +__CUDA_FP16_DECL__ float2 __half22float2(const __half2 l) +{ + float hi_float; + float lo_float; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high},%1;\n" + " cvt.f32.f16 %0, low;}\n" : "=f"(lo_float) : "r"(__HALF2_TO_CUI(l))); + + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high},%1;\n" + " cvt.f32.f16 %0, high;}\n" : "=f"(hi_float) : "r"(__HALF2_TO_CUI(l))); + + return make_float2(lo_float, hi_float); +} +__CUDA_FP16_DECL__ __half __float2half(const float f) +{ + __half val; + asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(f)); + return val; +} +__CUDA_FP16_DECL__ __half __float2half_rn(const float f) +{ + __half val; + asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(f)); + return val; +} +__CUDA_FP16_DECL__ __half __float2half_rz(const float f) +{ + __half val; + asm("{ cvt.rz.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(f)); + return val; +} +__CUDA_FP16_DECL__ __half __float2half_rd(const float f) +{ + __half val; + asm("{ cvt.rm.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(f)); + return val; +} +__CUDA_FP16_DECL__ __half __float2half_ru(const float f) +{ + __half val; + asm("{ cvt.rp.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(f)); + return val; +} +__CUDA_FP16_DECL__ float __half2float(const __half h) +{ + float val; + asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 __float2half2_rn(const float f) +{ + __half2 val; + asm("{.reg .f16 low;\n" + " cvt.rn.f16.f32 low, %1;\n" + " mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(f)); + return val; +} +__CUDA_FP16_DECL__ __half2 __floats2half2_rn(const float f1, const float f2) +{ + __half2 val; + asm("{.reg .f16 
low,high;\n" + " cvt.rn.f16.f32 low, %1;\n" + " cvt.rn.f16.f32 high, %2;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(f1), "f"(f2)); + return val; +} +__CUDA_FP16_DECL__ __half2 __float22half2_rn(const float2 f) +{ + __half2 val = __floats2half2_rn(f.x, f.y); + return val; +} +__CUDA_FP16_DECL__ float __low2float(const __half2 l) +{ + float val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high},%1;\n" + " cvt.f32.f16 %0, low;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(l))); + return val; +} +__CUDA_FP16_DECL__ float __high2float(const __half2 l) +{ + float val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high},%1;\n" + " cvt.f32.f16 %0, high;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(l))); + return val; +} +__CUDA_FP16_DECL__ __half2 __lows2half2(const __half2 l, const __half2 h) +{ + __half2 val; + asm("{.reg .f16 alow,ahigh,blow,bhigh;\n" + " mov.b32 {alow,ahigh}, %1;\n" + " mov.b32 {blow,bhigh}, %2;\n" + " mov.b32 %0, {alow,blow};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(l)), "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 __highs2half2(const __half2 l, const __half2 h) +{ + __half2 val; + asm("{.reg .f16 alow,ahigh,blow,bhigh;\n" + " mov.b32 {alow,ahigh}, %1;\n" + " mov.b32 {blow,bhigh}, %2;\n" + " mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(l)), "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half __low2half(const __half2 h) +{ + __half ret; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b16 %0, low;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(h))); + return ret; +} +__CUDA_FP16_DECL__ int __hisinf(const __half a) +{ + if (__HALF_TO_CUS(a) == 0xFC00) + return -1; + if (__HALF_TO_CUS(a) == 0x7C00) + return 1; + return 0; +} +__CUDA_FP16_DECL__ __half2 __low2half2(const __half2 l) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(l))); + return val; +} +__CUDA_FP16_DECL__ __half2 __high2half2(const __half2 l) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {high,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(l))); + return val; +} +__CUDA_FP16_DECL__ __half __high2half(const __half2 h) +{ + __half ret; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b16 %0, high;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(h))); + return ret; +} +__CUDA_FP16_DECL__ __half2 __halves2half2(const __half l, const __half h) +{ + __half2 val; + asm("{ mov.b32 %0, {%1,%2};}\n" + : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(l)), "h"(__HALF_TO_CUS(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 __half2half2(const __half lh) +{ + __half2 val; + asm("{ mov.b32 %0, {%1,%1};}\n" + : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(lh))); + return val; +} +__CUDA_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 lh) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {high,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(lh))); + return val; +} +__CUDA_FP16_DECL__ short int __half_as_short(const __half h) +{ + return (short int)__HALF_TO_CUS(h); +} +__CUDA_FP16_DECL__ unsigned short int __half_as_ushort(const __half h) +{ + return __HALF_TO_CUS(h); +} +__CUDA_FP16_DECL__ __half __short_as_half(const short int i) +{ + __half h; + __HALF_TO_US(h) = (unsigned short int)i; + return h; +} +__CUDA_FP16_DECL__ __half __ushort_as_half(const 
unsigned short int i) +{ + __half h; + __HALF_TO_US(h) = i; + return h; +} + +#if __CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__) +/****************************************************************************** +* __half, __half2 warp shuffle * +******************************************************************************/ +#define __SHUFFLE_HALF2_MACRO(name) do {\ + __half2 r; \ + asm("{"#name" %0,%1,%2,%3;\n}" \ + :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c)); \ + return r; \ +} while(0); + +#define __SHUFFLE_SYNC_HALF2_MACRO(name) do {\ + __half2 r; \ + asm("{"#name" %0,%1,%2,%3,%4;\n}" \ + :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \ + return r; \ +} while(0); + +__CUDA_FP16_DECL__ __half2 __shfl(__half2 var, int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); + int c = ((warpSize - width) << 8) | 0x1f; + __SHUFFLE_HALF2_MACRO(shfl.idx.b32); +} +__CUDA_FP16_DECL__ __half2 __shfl_up(__half2 var, unsigned int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); + int c = (warpSize - width) << 8; + __SHUFFLE_HALF2_MACRO(shfl.up.b32); +} +__CUDA_FP16_DECL__ __half2 __shfl_down(__half2 var, unsigned int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); + int c = ((warpSize - width) << 8) | 0x1f; + __SHUFFLE_HALF2_MACRO(shfl.down.b32); +} +__CUDA_FP16_DECL__ __half2 __shfl_xor(__half2 var, int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); + int c = ((warpSize - width) << 8) | 0x1f; + __SHUFFLE_HALF2_MACRO(shfl.bfly.b32); +} + +__CUDA_FP16_DECL__ __half2 __shfl_sync(unsigned mask, __half2 var, int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); + int c = ((warpSize - width) << 8) | 0x1f; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.idx.b32); +} +__CUDA_FP16_DECL__ __half2 __shfl_up_sync(unsigned mask, __half2 var, unsigned int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); + int c = (warpSize - width) << 8; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.up.b32); +} +__CUDA_FP16_DECL__ __half2 __shfl_down_sync(unsigned mask, __half2 var, unsigned int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); + int c = ((warpSize - width) << 8) | 0x1f; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.down.b32); +} +__CUDA_FP16_DECL__ __half2 __shfl_xor_sync(unsigned mask, __half2 var, int delta, int width) +{ + int warpSize; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); + int c = ((warpSize - width) << 8) | 0x1f; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.bfly.b32); +} + +#undef __SHUFFLE_HALF2_MACRO +#undef __SHUFFLE_SYNC_HALF2_MACRO + +__CUDA_FP16_DECL__ __half __shfl(__half var, int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl(temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_up(__half var, unsigned int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl_up(temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_down(__half var, unsigned int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl_down(temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_xor(__half var, int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = 
__shfl_xor(temp1, delta, width); + return __low2half(temp2); +} + +__CUDA_FP16_DECL__ __half __shfl_sync(unsigned mask, __half var, int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl_sync(mask, temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_up_sync(unsigned mask, __half var, unsigned int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl_up_sync(mask, temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_down_sync(unsigned mask, __half var, unsigned int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl_down_sync(mask, temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_xor_sync(unsigned mask, __half var, int delta, int width) +{ + __half2 temp1 = __halves2half2(var, var); + __half2 temp2 = __shfl_xor_sync(mask, temp1, delta, width); + return __low2half(temp2); +} + +#endif /*__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__)*/ +/****************************************************************************** +* __half and __half2 __ldg,__ldcg,__ldca,__ldcs * +******************************************************************************/ + +#if defined(__cplusplus) && (__CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__)) +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ +__CUDA_FP16_DECL__ __half2 __ldg(const __half2 *ptr) +{ + __half2 ret; + asm ("ld.global.nc.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldg(const __half *ptr) +{ + __half ret; + asm ("ld.global.nc.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcg(const __half2 *ptr) +{ + __half2 ret; + asm ("ld.global.cg.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcg(const __half *ptr) +{ + __half ret; + asm ("ld.global.cg.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldca(const __half2 *ptr) +{ + __half2 ret; + asm ("ld.global.ca.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldca(const __half *ptr) +{ + __half ret; + asm ("ld.global.ca.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcs(const __half2 *ptr) +{ + __half2 ret; + asm ("ld.global.cs.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcs(const __half *ptr) +{ + __half ret; + asm ("ld.global.cs.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +#undef __LDG_PTR +#endif /*defined(__cplusplus) && (__CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__))*/ +#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) +/****************************************************************************** +* __half2 comparison * +******************************************************************************/ +#define __COMPARISON_OP_HALF2_MACRO(name) do {\ + __half2 val; \ + asm( "{ "#name".f16x2.f16x2 %0,%1,%2;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ + return val; \ +} while(0); +__CUDA_FP16_DECL__ __half2 
__heq2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.eq); +} +__CUDA_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.ne); +} +__CUDA_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.le); +} +__CUDA_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.ge); +} +__CUDA_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.lt); +} +__CUDA_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.gt); +} +__CUDA_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.equ); +} +__CUDA_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.neu); +} +__CUDA_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.leu); +} +__CUDA_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.geu); +} +__CUDA_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.ltu); +} +__CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.gtu); +} +#undef __COMPARISON_OP_HALF2_MACRO +#define __BOOL_COMPARISON_OP_HALF2_MACRO(name) do {\ + __half2 val; \ + asm( "{ "#name".f16x2.f16x2 %0,%1,%2;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ + if (__HALF2_TO_CUI(val) == 0x3C003C00) \ + return true; \ + else \ + return false; \ +} while(0); +__CUDA_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.eq); +} +__CUDA_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.ne); +} +__CUDA_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.le); +} +__CUDA_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.ge); +} +__CUDA_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.lt); +} +__CUDA_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.gt); +} +__CUDA_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.equ); +} +__CUDA_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.neu); +} +__CUDA_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.leu); +} +__CUDA_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.geu); +} +__CUDA_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.ltu); +} +__CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.gtu); +} +#undef __BOOL_COMPARISON_OP_HALF2_MACRO +/****************************************************************************** +* __half comparison * +******************************************************************************/ +#define __COMPARISON_OP_HALF_MACRO(name) do {\ + unsigned short val; \ + asm( "{ .reg .pred __$temp3;\n" \ + " setp."#name".f16 __$temp3, %1, %2;\n" \ + " selp.u16 %0, 1, 0, __$temp3;}" \ + : "=h"(val) : "h"(__HALF_TO_CUS(a)), 
"h"(__HALF_TO_CUS(b))); \ + return val ? true : false; \ +} while(0); +__CUDA_FP16_DECL__ bool __heq(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(eq); +} +__CUDA_FP16_DECL__ bool __hne(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(ne); +} +__CUDA_FP16_DECL__ bool __hle(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(le); +} +__CUDA_FP16_DECL__ bool __hge(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(ge); +} +__CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(lt); +} +__CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(gt); +} +__CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(equ); +} +__CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(neu); +} +__CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(leu); +} +__CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(geu); +} +__CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(ltu); +} +__CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(gtu); +} +#undef __COMPARISON_OP_HALF_MACRO +/****************************************************************************** +* __half2 arithmetic * +******************************************************************************/ +#define __BINARY_OP_HALF2_MACRO(name) do {\ + __half2 val; \ + asm( "{"#name".f16x2 %0,%1,%2;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ + return val; \ +} while(0); + +__CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(add); +} +__CUDA_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(sub); +} +__CUDA_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(mul); +} +__CUDA_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(add.sat); +} +__CUDA_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(sub.sat); +} +__CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(mul.sat); +} +#undef __BINARY_OP_HALF2_MACRO +#define __TERNARY_OP_HALF2_MACRO(name) do {\ + __half2 val; \ + asm( "{"#name".f16x2 %0,%1,%2,%3;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b)),"r"(__HALF2_TO_CUI(c))); \ + return val; \ +} while(0); +__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c) +{ + __TERNARY_OP_HALF2_MACRO(fma.rn); +} +__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c) +{ + __TERNARY_OP_HALF2_MACRO(fma.rn.sat); +} +#undef __TERNARY_OP_HALF2_MACRO +__CUDA_FP16_DECL__ __half2 __h2div(__half2 a, __half2 b) { + __half ha, hb; + + ha = __low2half(a); + hb = __low2half(b); + + __half v1 = __hdiv(ha, hb); + + ha = __high2half(a); + hb = __high2half(b); + + __half v2 = __hdiv(ha, hb); + + return __halves2half2(v1, v2); +} +/****************************************************************************** +* __half arithmetic * +******************************************************************************/ +#define __BINARY_OP_HALF_MACRO(name) do {\ + __half val; \ + asm( "{"#name".f16 %0,%1,%2;\n}" \ + 
:"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b))); \ + return val; \ +} while(0); +__CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(add); +} +__CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(sub); +} +__CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(mul); +} +__CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(add.sat); +} +__CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(sub.sat); +} +__CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(mul.sat); +} +#undef __BINARY_OP_HALF_MACRO +#define __TERNARY_OP_HALF_MACRO(name) do {\ + __half val; \ + asm( "{"#name".f16 %0,%1,%2,%3;\n}" \ + :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b)),"h"(__HALF_TO_CUS(c))); \ + return val; \ +} while(0); +__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn); +} +__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn.sat); +} +#undef __TERNARY_OP_HALF2_MACRO +__CUDA_FP16_DECL__ __half __hdiv(__half a, __half b) { + __half v, abs, den; + __HALF_TO_US(den) = 0x008F; + float fa, fb, fv, rcp; + + fa = __half2float(a); + fb = __half2float(b); + + asm("{rcp.approx.f32 %0, %1;\n}" :"=f"(rcp) : "f"(fb)); + + fv = rcp * fa; + + v = __float2half(fv); + __HALF_TO_US(abs) = (unsigned short)(((unsigned int)__HALF_TO_CUS(v)) & 0x00007FFF); + if (__hlt(abs, den) && (!(__HALF_TO_CUS(abs) == 0x0000))) { + float err = __fmaf_rn(-fb, fv, fa); + fv = __fmaf_rn(rcp, err, fv); + v = __float2half(fv); + } + return v; +} + +/****************************************************************************** +* __half2 functions * +******************************************************************************/ +#define __SPEC_CASE2(i,r, spc, ulp) \ + "{.reg.b32 spc, ulp, p;\n"\ + " mov.b32 spc,"#spc";\n"\ + " mov.b32 ulp,"#ulp";\n"\ + " set.eq.f16x2.f16x2 p,"#i", spc;\n"\ + " fma.rn.f16x2 "#r",p,ulp,"#r";\n}\n" +#define __SPEC_CASE(i,r, spc, ulp) \ + "{.reg.b16 spc, ulp, p;\n"\ + " mov.b16 spc,"#spc";\n"\ + " mov.b16 ulp,"#ulp";\n"\ + " set.eq.f16.f16 p,"#i", spc;\n"\ + " fma.rn.f16 "#r",p,ulp,"#r";\n}\n" +#define __APPROX_FCAST(fun) do {\ + __half val;\ + asm("{.reg.b32 f; \n"\ + " .reg.b16 r; \n"\ + " mov.b16 r,%1; \n"\ + " cvt.f32.f16 f,r; \n"\ + " "#fun".approx.f32 f,f; \n"\ + " cvt.rn.f16.f32 r,f; \n"\ + " mov.b16 %0,r; \n"\ + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));\ + return val;\ +} while(0); +#define __APPROX_FCAST2(fun) do {\ + __half2 val;\ + asm("{.reg.b16 hl, hu; \n"\ + " .reg.b32 fl, fu; \n"\ + " mov.b32 {hl, hu}, %1; \n"\ + " cvt.f32.f16 fl, hl; \n"\ + " cvt.f32.f16 fu, hu; \n"\ + " "#fun".approx.f32 fl, fl; \n"\ + " "#fun".approx.f32 fu, fu; \n"\ + " cvt.rn.f16.f32 hl, fl; \n"\ + " cvt.rn.f16.f32 hu, fu; \n"\ + " mov.b32 %0, {hl, hu}; \n"\ + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); \ + return val;\ +} while(0); +static __device__ __forceinline__ float __float_simpl_sinf(float); +static __device__ __forceinline__ float __float_simpl_cosf(float); +__CUDA_FP16_DECL__ __half __hsin_internal(const __half a) { + float f = __half2float(a); + f = __float_simpl_sinf(f); + return __float2half_rn(f); +} +__CUDA_FP16_DECL__ __half hsin(const __half a) { + __half r = 
__hsin_internal(a); + asm("{\n\t" + " .reg.b16 i,r,t; \n\t" + " mov.b16 r, %0; \n\t" + " mov.b16 i, %1; \n\t" + " mov.b16 t, 0x8000; \n\t" + " and.b16 t,r,t; \n\t" + __SPEC_CASE(i, r, 0X32B3, 0x0800) + __SPEC_CASE(i, r, 0X5CB0, 0x1000) + __SPEC_CASE(i, r, 0XB2B3, 0x8800) + __SPEC_CASE(i, r, 0XDCB0, 0x9000) + " or.b16 r,r,t; \n\t" + " mov.b16 %0, r; \n" + "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} +__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a) { + __half l = __low2half(a); + __half h = __high2half(a); + __half2 r = __halves2half2(__hsin_internal(l), __hsin_internal(h)); + asm("{\n\t" + " .reg.b32 i,r,t; \n\t" + " mov.b32 r, %0; \n\t" + " mov.b32 i, %1; \n\t" + " and.b32 t, r, 0x80008000; \n\t" + __SPEC_CASE2(i, r, 0X32B332B3, 0x08000800) + __SPEC_CASE2(i, r, 0X5CB05CB0, 0x10001000) + __SPEC_CASE2(i, r, 0XB2B3B2B3, 0x88008800) + __SPEC_CASE2(i, r, 0XDCB0DCB0, 0x90009000) + " or.b32 r, r, t; \n\t" + " mov.b32 %0, r; \n" + "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +__CUDA_FP16_DECL__ __half __hcos_internal(const __half a) { + float f = __half2float(a); + f = __float_simpl_cosf(f); + return __float2half_rn(f); +} +__CUDA_FP16_DECL__ __half hcos(const __half a) { + __half r = __hcos_internal(a); + asm("{\n\t" + " .reg.b16 i,r; \n\t" + " mov.b16 r, %0; \n\t" + " mov.b16 i, %1; \n\t" + __SPEC_CASE(i, r, 0X2B7C, 0x1000) + __SPEC_CASE(i, r, 0XAB7C, 0x1000) + " mov.b16 %0, r; \n" + "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} +__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a) { + __half l = __low2half(a); + __half h = __high2half(a); + __half2 r = __halves2half2(__hcos_internal(l), __hcos_internal(h)); + asm("{\n\t" + " .reg.b32 i,r; \n\t" + " mov.b32 r, %0; \n\t" + " mov.b32 i, %1; \n\t" + __SPEC_CASE2(i, r, 0X2B7C2B7C, 0x10001000) + __SPEC_CASE2(i, r, 0XAB7CAB7C, 0x10001000) + " mov.b32 %0, r; \n" + "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +static __device__ __forceinline__ float __internal_trig_reduction_kernel(float a, int *quadrant) +{ + float j, t; + int q; + q = __float2int_rn(a * 0.636619772f); + j = (float)q; + t = __fmaf_rn(-j, 1.5707962512969971e+000f, a); + t = __fmaf_rn(-j, 7.5497894158615964e-008f, t); + *quadrant = q; + return t; +} +static __device__ __forceinline__ float __internal_sin_cos_kernel(float x, int i) +{ + float x2, z; + x2 = x*x; + + if (i & 1) { + z = 2.44331571e-5f; + z = __fmaf_rn(z, x2, -1.38873163e-3f); + } + else { + z = -1.95152959e-4f; + z = __fmaf_rn(z, x2, 8.33216087e-3f); + } + if (i & 1) { + z = __fmaf_rn(z, x2, 4.16666457e-2f); + z = __fmaf_rn(z, x2, -5.00000000e-1f); + } + else { + z = __fmaf_rn(z, x2, -1.66666546e-1f); + z = __fmaf_rn(z, x2, 0.0f); + } + x = __fmaf_rn(z, x, x); + if (i & 1) x = __fmaf_rn(z, x2, 1.0f); + if (i & 2) x = __fmaf_rn(x, -1.0f, 0.0f); + return x; +} +static __device__ __forceinline__ float __float_simpl_sinf(float a) +{ + float z; + int i; + if (isinf(a)) { + a = a * 0.0f; + } + a = __internal_trig_reduction_kernel(a, &i); + z = __internal_sin_cos_kernel(a, i); + return z; +} +static __device__ __forceinline__ float __float_simpl_cosf(float a) +{ + float z; + int i; + if (isinf(a)) { + a = a * 0.0f; + } + a = __internal_trig_reduction_kernel(a, &i); + i++; + z = __internal_sin_cos_kernel(a, i); + return z; +} +__CUDA_FP16_DECL__ __half hexp(const __half a) { + __half val; + asm("{.reg.b32 f, C; \n" + " .reg.b16 h,r; \n" + " mov.b16 h,%1; \n" + " cvt.f32.f16 f,h; \n" + " mov.b32 C, 0x3fb8aa3b; \n" + " mul.f32 
f,f,C; \n" + " ex2.approx.f32 f,f; \n" + " cvt.rn.f16.f32 r,f; \n" + __SPEC_CASE(h, r, 0X1F79, 0x9400) + __SPEC_CASE(h, r, 0X25CF, 0x9400) + __SPEC_CASE(h, r, 0XC13B, 0x0400) + __SPEC_CASE(h, r, 0XC1EF, 0x0200) + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu, C; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " mov.b32 C, 0x3fb8aa3b; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " ex2.approx.f32 fl, fl; \n" + " ex2.approx.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0X1F791F79, 0x94009400) + __SPEC_CASE2(h, r, 0X25CF25CF, 0x94009400) + __SPEC_CASE2(h, r, 0XC13BC13B, 0x04000400) + __SPEC_CASE2(h, r, 0XC1EFC1EF, 0x02000200) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hexp2(const __half a) { + __half val; + asm("{.reg.b32 f, ULP; \n" + " .reg.b16 r; \n" + " mov.b16 r,%1; \n" + " cvt.f32.f16 f,r; \n" + " ex2.approx.f32 f,f; \n" + " mov.b32 ULP, 0x33800000;\n" + " fma.rn.f32 f,f,ULP,f; \n" + " cvt.rn.f16.f32 r,f; \n" + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 fl, fu, ULP; \n" + " mov.b32 {hl, hu}, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " ex2.approx.f32 fl, fl; \n" + " ex2.approx.f32 fu, fu; \n" + " mov.b32 ULP, 0x33800000;\n" + " fma.rn.f32 fl,fl,ULP,fl; \n" + " fma.rn.f32 fu,fu,ULP,fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 %0, {hl, hu}; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hexp10(const __half a) { + __half val; + asm("{.reg.b16 h,r; \n" + " .reg.b32 f, C; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " mov.b32 C, 0x40549A78; \n" + " mul.f32 f,f,C; \n" + " ex2.approx.f32 f, f; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(h, r, 0x34DE, 0x9800) + __SPEC_CASE(h, r, 0x9766, 0x9000) + __SPEC_CASE(h, r, 0x9972, 0x1000) + __SPEC_CASE(h, r, 0xA5C4, 0x1000) + __SPEC_CASE(h, r, 0xBF0A, 0x8100) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu, C; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " mov.b32 C, 0x40549A78; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " ex2.approx.f32 fl, fl; \n" + " ex2.approx.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0x34DE34DE, 0x98009800) + __SPEC_CASE2(h, r, 0x97669766, 0x90009000) + __SPEC_CASE2(h, r, 0x99729972, 0x10001000) + __SPEC_CASE2(h, r, 0xA5C4A5C4, 0x10001000) + __SPEC_CASE2(h, r, 0xBF0ABF0A, 0x81008100) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog2(const __half a) { + __half val; + asm("{.reg.b16 h, r; \n" + " .reg.b32 f; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " lg2.approx.f32 f, f; \n" + " cvt.rn.f16.f32 r, f; \n" + 
__SPEC_CASE(r, r, 0xA2E2, 0x8080) + __SPEC_CASE(r, r, 0xBF46, 0x9400) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 fl, fu, r, p; \n" + " mov.b32 {hl, hu}, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.f32 fl, fl; \n" + " lg2.approx.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(r, r, 0xA2E2A2E2, 0x80808080) + __SPEC_CASE2(r, r, 0xBF46BF46, 0x94009400) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog(const __half a) { + __half val; + asm("{.reg.b32 f, C; \n" + " .reg.b16 r,h; \n" + " mov.b16 h,%1; \n" + " cvt.f32.f16 f,h; \n" + " lg2.approx.f32 f,f; \n" + " mov.b32 C, 0x3f317218; \n" + " mul.f32 f,f,C; \n" + " cvt.rn.f16.f32 r,f; \n" + __SPEC_CASE(h, r, 0X160D, 0x9C00) + __SPEC_CASE(h, r, 0X3BFE, 0x8010) + __SPEC_CASE(h, r, 0X3C0B, 0x8080) + __SPEC_CASE(h, r, 0X6051, 0x1C00) + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2log(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.f32 fl, fl; \n" + " lg2.approx.f32 fu, fu; \n" + " mov.b32 C, 0x3f317218; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0X160D160D, 0x9C009C00) + __SPEC_CASE2(h, r, 0X3BFE3BFE, 0x80108010) + __SPEC_CASE2(h, r, 0X3C0B3C0B, 0x80808080) + __SPEC_CASE2(h, r, 0X60516051, 0x1C001C00) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog10(const __half a) { + __half val; + asm("{.reg.b16 h, r; \n" + " .reg.b32 f, C; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " lg2.approx.f32 f, f; \n" + " mov.b32 C, 0x3E9A209B; \n" + " mul.f32 f,f,C; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(h, r, 0x338F, 0x1000) + __SPEC_CASE(h, r, 0x33F8, 0x9000) + __SPEC_CASE(h, r, 0x57E1, 0x9800) + __SPEC_CASE(h, r, 0x719D, 0x9C00) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.f32 fl, fl; \n" + " lg2.approx.f32 fu, fu; \n" + " mov.b32 C, 0x3E9A209B; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0x338F338F, 0x10001000) + __SPEC_CASE2(h, r, 0x33F833F8, 0x90009000) + __SPEC_CASE2(h, r, 0x57E157E1, 0x98009800) + __SPEC_CASE2(h, r, 0x719D719D, 0x9C009C00) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +#undef __SPEC_CASE2 +#undef __SPEC_CASE +__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a) { + __APPROX_FCAST2(rcp); +} +__CUDA_FP16_DECL__ __half hrcp(const __half a) { + __APPROX_FCAST(rcp); +} +__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a) { + __APPROX_FCAST2(rsqrt); +} 
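+/* Editor's note: an illustrative sketch, not part of the original header and
+ * compiled out via #if 0, of the packed __half2 math functions above. Each
+ * call processes two half values at once, so a buffer of n halves needs only
+ * n/2 threads. The kernel name is an assumption. */
+#if 0
+__global__ void exp_half2_inplace(__half2 *v, int n2)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n2)
+        v[i] = h2exp(v[i]); /* two f16 exponentials via ex2.approx.f32 */
+}
+#endif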
+__CUDA_FP16_DECL__ __half hrsqrt(const __half a) { + __APPROX_FCAST(rsqrt); +} +__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a) { + __APPROX_FCAST2(sqrt); +} +__CUDA_FP16_DECL__ __half hsqrt(const __half a) { + __APPROX_FCAST(sqrt); +} +#undef __APPROX_FCAST +#undef __APPROX_FCAST2 +__CUDA_FP16_DECL__ __half2 __hisnan2(const __half2 a) +{ + __half2 r; + asm("{set.nan.f16x2.f16x2 %0,%1,%2;\n}" + :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(a))); + return r; +} +__CUDA_FP16_DECL__ bool __hisnan(const __half a) +{ + __half r; + asm("{set.nan.f16.f16 %0,%1,%2;\n}" + :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(a))); + if (__HALF_TO_CUS(r) == 0) + return false; + else return true; +} +__CUDA_FP16_DECL__ __half2 __hneg2(const __half2 a) +{ + __half2 zero = __float2half2_rn(0.0); + return __hsub2(zero, a); +} +__CUDA_FP16_DECL__ __half __hneg(const __half a) +{ + __half zero; + zero = __float2half(0.0); + return __hsub(zero, a); +} +#endif /*__CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ + +#undef __CUDA_FP16_DECL__ +#endif /* defined(__CUDACC__) */ +#endif /* defined(__cplusplus) */ + +#undef __HALF_TO_US +#undef __HALF_TO_CUS +#undef __HALF2_TO_UI +#undef __HALF2_TO_CUI + + +/* Define first-class types "half" and "half2", unless user specifies otherwise via "#define CUDA_NO_HALF" */ +/* C cannot ever have these types defined here, because __half and __half2 are C++ classes */ +#if defined(__cplusplus) && !defined(CUDA_NO_HALF) +typedef __half half; +typedef __half2 half2; +#endif /* defined(__cplusplus) && !defined(CUDA_NO_HALF) */ + +#endif /* end of include guard: __CUDA_FP16_HPP__ */ diff --git a/include/external/CUDA/cuda_runtime.h b/include/external/CUDA/cuda_runtime.h new file mode 100755 index 000000000..ce880e055 --- /dev/null +++ b/include/external/CUDA/cuda_runtime.h @@ -0,0 +1,2040 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__CUDA_RUNTIME_H__) +#define __CUDA_RUNTIME_H__ + +#if !defined(__CUDACC_RTC__) +#if defined(__GNUC__) +#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))) +#pragma GCC diagnostic push +#endif +#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))) +#pragma GCC diagnostic ignored "-Wunused-function" +#endif +#elif defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable: 4820) +#endif +#endif + +#ifdef __QNX__ +#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 7) +typedef unsigned size_t; +#endif +#endif +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "host_config.h" + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "builtin_types.h" +#include "library_types.h" +#if !defined(__CUDACC_RTC__) +#define EXCLUDE_FROM_RTC +#include "channel_descriptor.h" +#include "cuda_runtime_api.h" +#include "driver_functions.h" +#undef EXCLUDE_FROM_RTC +#endif /* !__CUDACC_RTC__ */ +#include "host_defines.h" +#include "vector_functions.h" + +#if defined(__CUDACC__) + +#if defined(__CUDACC_RTC__) +#include "nvrtc_device_runtime.h" +#include "device_functions.h" + +extern __host__ __device__ unsigned cudaConfigureCall(dim3 gridDim, + dim3 blockDim, + size_t sharedMem = 0, + void *stream = 0); +#include "common_functions.h" +#include "cuda_surface_types.h" +#include "cuda_texture_types.h" +#include "device_launch_parameters.h" + +#else /* !__CUDACC_RTC__ */ +#define EXCLUDE_FROM_RTC +#include "common_functions.h" +#include "cuda_surface_types.h" +#include "cuda_texture_types.h" +#include "device_functions.h" +#include "device_launch_parameters.h" + +#if defined(__CUDACC_EXTENDED_LAMBDA__) +#include <functional> +#include <utility> +struct __device_builtin__ __nv_lambda_preheader_injection { }; +#endif /* defined(__CUDACC_EXTENDED_LAMBDA__) */ + +#undef EXCLUDE_FROM_RTC +#endif /* __CUDACC_RTC__ */ + +#endif /* __CUDACC__ */ + +#if defined(__cplusplus) && !defined(__CUDACC_RTC__) +
+/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +/** + * \addtogroup CUDART_HIGHLEVEL + * @{ + */ + +/** + * \brief Launches a device function + * + * The function invokes the kernel \p func on a \p gridDim (\p gridDim.x × \p gridDim.y + * × \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x × + * \p blockDim.y × \p blockDim.z) threads. + * + * If the kernel has N parameters, \p args should point to an array of N pointers. + * Each pointer, from args[0] to args[N - 1], points to the region + * of memory from which the actual parameter will be copied. + * + * \p sharedMem sets the amount of dynamic shared memory that will be available to + * each thread block. + * + * \p stream specifies the stream the invocation is associated with. + * + * \param func - Device function symbol + * \param gridDim - Grid dimensions + * \param blockDim - Block dimensions + * \param args - Arguments + * \param sharedMem - Shared memory (defaults to 0) + * \param stream - Stream identifier (defaults to NULL) + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidConfiguration, + * ::cudaErrorLaunchFailure, + * ::cudaErrorLaunchTimeout, + * ::cudaErrorLaunchOutOfResources, + * ::cudaErrorSharedObjectInitFailed, + * ::cudaErrorInvalidPtx, + * ::cudaErrorNoKernelImageForDevice, + * ::cudaErrorJitCompilerNotFound + * \notefnerr + * \note_async + * \note_null_stream + * + * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)" + */ +template <typename T> +static __inline__ __host__ cudaError_t cudaLaunchKernel( + const T *func, + dim3 gridDim, + dim3 blockDim, + void **args, + size_t sharedMem = 0, + cudaStream_t stream = 0 +) +{ + return ::cudaLaunchKernel((const void *)func, gridDim, blockDim, args, sharedMem, stream); +}
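+/* Editor's note: a hedged host-side sketch, not part of the original header
+ * and compiled out via #if 0, showing the templated wrapper above. The
+ * kernel "axpy", its parameters, and the launch geometry are illustrative
+ * assumptions; each entry of args points at the storage of one parameter. */
+#if 0
+__global__ void axpy(float a, const float *x, float *y, int n);
+
+static cudaError_t launch_axpy(float a, const float *x, float *y, int n)
+{
+    void *args[] = { &a, &x, &y, &n }; /* one pointer per kernel parameter */
+    dim3 block(256);
+    dim3 grid((n + block.x - 1) / block.x);
+    return cudaLaunchKernel(axpy, grid, block, args, 0, 0);
+}
+#endif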
+ +/** + * \brief Launches a device function + * + * The function invokes the kernel \p func on a \p gridDim (\p gridDim.x × \p gridDim.y + * × \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x × + * \p blockDim.y × \p blockDim.z) threads. + * + * The device on which this kernel is invoked must have a non-zero value for + * the device attribute ::cudaDevAttrCooperativeLaunch. + * + * The total number of blocks launched cannot exceed the maximum number of blocks per + * multiprocessor as returned by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor (or + * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors + * as specified by the device attribute ::cudaDevAttrMultiProcessorCount. + * + * The kernel cannot make use of CUDA dynamic parallelism. + * + * If the kernel has N parameters, \p args should point to an array of N pointers. + * Each pointer, from args[0] to args[N - 1], points to the region + * of memory from which the actual parameter will be copied. + * + * \p sharedMem sets the amount of dynamic shared memory that will be available to + * each thread block. + * + * \p stream specifies the stream the invocation is associated with. + * + * \param func - Device function symbol + * \param gridDim - Grid dimensions + * \param blockDim - Block dimensions + * \param args - Arguments + * \param sharedMem - Shared memory (defaults to 0) + * \param stream - Stream identifier (defaults to NULL) + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidConfiguration, + * ::cudaErrorLaunchFailure, + * ::cudaErrorLaunchTimeout, + * ::cudaErrorLaunchOutOfResources, + * ::cudaErrorSharedObjectInitFailed + * \notefnerr + * \note_async + * \note_null_stream + * + * \ref ::cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchCooperativeKernel (C API)" + */ +template <typename T> +static __inline__ __host__ cudaError_t cudaLaunchCooperativeKernel( + const T *func, + dim3 gridDim, + dim3 blockDim, + void **args, + size_t sharedMem = 0, + cudaStream_t stream = 0 +) +{ + return ::cudaLaunchCooperativeKernel((const void *)func, gridDim, blockDim, args, sharedMem, stream); +} + +/** + * \brief \hl Configure a device launch + * + * \deprecated This function is deprecated as of CUDA 7.0 + * + * Pushes \p size bytes of the argument pointed to by \p arg at \p offset + * bytes from the start of the parameter passing area, which starts at + * offset 0. The arguments are stored in the top of the execution stack. + * \ref ::cudaSetupArgument(T, size_t) "cudaSetupArgument()" must be preceded + * by a call to ::cudaConfigureCall(). + * + * \param arg - Argument to push for a kernel launch + * \param offset - Offset in argument stack to push new arg + * + * \return + * ::cudaSuccess + * \notefnerr + * + * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)", + * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)", + * \ref ::cudaLaunch(T*) "cudaLaunch (C++ API)", + * ::cudaSetDoubleForDevice, + * ::cudaSetDoubleForHost, + * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)" + */ +template <typename T> +static __inline__ __host__ cudaError_t cudaSetupArgument( + T arg, + size_t offset +) +{ + return ::cudaSetupArgument((const void*)&arg, sizeof(T), offset); +} + +/** + * \brief \hl Creates an event object with the specified flags + * + * Creates an event object with the specified flags. Valid flags include: + * - ::cudaEventDefault: Default event creation flag. + * - ::cudaEventBlockingSync: Specifies that event should use blocking + * synchronization. A host thread that uses ::cudaEventSynchronize() to wait + * on an event created with this flag will block until the event actually + * completes. + * - ::cudaEventDisableTiming: Specifies that the created event does not need + * to record timing data. Events created with this flag specified and + * the ::cudaEventBlockingSync flag not specified will provide the best + * performance when used with ::cudaStreamWaitEvent() and ::cudaEventQuery().
+ * + * \param event - Newly created event + * \param flags - Flags for new event + * + * \return + * ::cudaSuccess, + * ::cudaErrorInitializationError, + * ::cudaErrorInvalidValue, + * ::cudaErrorLaunchFailure, + * ::cudaErrorMemoryAllocation + * \notefnerr + * + * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)", + * ::cudaEventCreateWithFlags, ::cudaEventRecord, ::cudaEventQuery, + * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime, + * ::cudaStreamWaitEvent + */ +static __inline__ __host__ cudaError_t cudaEventCreate( + cudaEvent_t *event, + unsigned int flags +) +{ + return ::cudaEventCreateWithFlags(event, flags); +} + +/** + * \brief \hl Allocates page-locked memory on the host + * + * Allocates \p size bytes of host memory that is page-locked and accessible + * to the device. The driver tracks the virtual memory ranges allocated with + * this function and automatically accelerates calls to functions such as + * ::cudaMemcpy(). Since the memory can be accessed directly by the device, it + * can be read or written with much higher bandwidth than pageable memory + * obtained with functions such as ::malloc(). Allocating excessive amounts of + * pinned memory may degrade system performance, since it reduces the amount + * of memory available to the system for paging. As a result, this function is + * best used sparingly to allocate staging areas for data exchange between host + * and device. + * + * The \p flags parameter enables different options to be specified that affect + * the allocation, as follows. + * - ::cudaHostAllocDefault: This flag's value is defined to be 0. + * - ::cudaHostAllocPortable: The memory returned by this call will be + * considered as pinned memory by all CUDA contexts, not just the one that + * performed the allocation. + * - ::cudaHostAllocMapped: Maps the allocation into the CUDA address space. + * The device pointer to the memory may be obtained by calling + * ::cudaHostGetDevicePointer(). + * - ::cudaHostAllocWriteCombined: Allocates the memory as write-combined (WC). + * WC memory can be transferred across the PCI Express bus more quickly on some + * system configurations, but cannot be read efficiently by most CPUs. WC + * memory is a good option for buffers that will be written by the CPU and read + * by the device via mapped pinned memory or host->device transfers. + * + * All of these flags are orthogonal to one another: a developer may allocate + * memory that is portable, mapped and/or write-combined with no restrictions. + * + * ::cudaSetDeviceFlags() must have been called with the ::cudaDeviceMapHost + * flag in order for the ::cudaHostAllocMapped flag to have any effect. + * + * The ::cudaHostAllocMapped flag may be specified on CUDA contexts for devices + * that do not support mapped pinned memory. The failure is deferred to + * ::cudaHostGetDevicePointer() because the memory may be mapped into other + * CUDA contexts via the ::cudaHostAllocPortable flag. + * + * Memory allocated by this function must be freed with ::cudaFreeHost(). 
+ *
+ * \param ptr - Pointer to allocated host memory
+ * \param size - Requested allocation size in bytes
+ * \param flags - Requested properties of allocated memory
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ *
+ * \sa ::cudaSetDeviceFlags,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc
+ */
+static __inline__ __host__ cudaError_t cudaMallocHost(
+    void **ptr,
+    size_t size,
+    unsigned int flags
+)
+{
+    return ::cudaHostAlloc(ptr, size, flags);
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaHostAlloc(
+    T **ptr,
+    size_t size,
+    unsigned int flags
+)
+{
+    return ::cudaHostAlloc((void**)(void*)ptr, size, flags);
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaHostGetDevicePointer(
+    T **pDevice,
+    void *pHost,
+    unsigned int flags
+)
+{
+    return ::cudaHostGetDevicePointer((void**)(void*)pDevice, pHost, flags);
+}
+
+/**
+ * \brief Allocates memory that will be automatically managed by the Unified Memory system
+ *
+ * Allocates \p size bytes of managed memory on the device and returns in
+ * \p *devPtr a pointer to the allocated memory. If the device doesn't support
+ * allocating managed memory, ::cudaErrorNotSupported is returned. Support
+ * for managed memory can be queried using the device attribute
+ * ::cudaDevAttrManagedMemory. The allocated memory is suitably
+ * aligned for any kind of variable. The memory is not cleared. If \p size
+ * is 0, ::cudaMallocManaged returns ::cudaErrorInvalidValue. The pointer
+ * is valid on the CPU and on all GPUs in the system that support managed memory.
+ * All accesses to this pointer must obey the Unified Memory programming model.
+ *
+ * \p flags specifies the default stream association for this allocation.
+ * \p flags must be one of ::cudaMemAttachGlobal or ::cudaMemAttachHost. The
+ * default value for \p flags is ::cudaMemAttachGlobal.
+ * If ::cudaMemAttachGlobal is specified, then this memory is accessible from
+ * any stream on any device. If ::cudaMemAttachHost is specified, then the
+ * allocation should not be accessed from devices that have a zero value for the
+ * device attribute ::cudaDevAttrConcurrentManagedAccess; an explicit call to
+ * ::cudaStreamAttachMemAsync will be required to enable access on such devices.
+ *
+ * If the association is later changed via ::cudaStreamAttachMemAsync to
+ * a single stream, the default association, as specified during ::cudaMallocManaged,
+ * is restored when that stream is destroyed. For __managed__ variables, the
+ * default association is always ::cudaMemAttachGlobal. Note that destroying a
+ * stream is an asynchronous operation, and as a result, the change to default
+ * association won't happen until all work in the stream has completed.
+ *
+ * Memory allocated with ::cudaMallocManaged should be released with ::cudaFree.
+ *
+ * Device memory oversubscription is possible for GPUs that have a non-zero value for the
+ * device attribute ::cudaDevAttrConcurrentManagedAccess. Managed memory on
+ * such GPUs may be evicted from device memory to host memory at any time by the Unified
+ * Memory driver in order to make room for other allocations.
+ *
+ * In a multi-GPU system where all GPUs have a non-zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess, managed memory may not be populated when this
+ * API returns and instead may be populated on access. In such systems, managed memory can
+ * migrate to any processor's memory at any time.
+ * The Unified Memory driver will employ heuristics to
+ * maintain data locality and prevent excessive page faults to the extent possible. The application
+ * can also guide the driver about memory usage patterns via ::cudaMemAdvise. The application
+ * can also explicitly migrate memory to a desired processor's memory via
+ * ::cudaMemPrefetchAsync.
+ *
+ * In a multi-GPU system where all of the GPUs have a zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess and all the GPUs have peer-to-peer support
+ * with each other, the physical storage for managed memory is created on the GPU which is active
+ * at the time ::cudaMallocManaged is called. All other GPUs will reference the data at reduced
+ * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate
+ * memory among such GPUs.
+ *
+ * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and
+ * where the value of the device attribute ::cudaDevAttrConcurrentManagedAccess
+ * is zero for at least one of those GPUs, the location chosen for physical storage of managed
+ * memory is system-dependent.
+ * - On Linux, the location chosen will be device memory as long as the current set of active
+ * contexts is on devices that either have peer-to-peer support with each other or have a
+ * non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ * If there is an active context on a GPU that does not have a non-zero value for that device
+ * attribute and it does not have peer-to-peer support with the other devices that have active
+ * contexts on them, then the location for physical storage will be 'zero-copy' or host memory.
+ * Note that this means that managed memory that is located in device memory is migrated to
+ * host memory if a new context is created on a GPU that doesn't have a non-zero value for
+ * the device attribute and does not support peer-to-peer with at least one of the other devices
+ * that has an active context. This in turn implies that context creation may fail if there is
+ * insufficient host memory to migrate all managed allocations.
+ * - On Windows, the physical storage is always created in 'zero-copy' or host memory.
+ * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these
+ * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to
+ * restrict CUDA to only use those GPUs that have peer-to-peer support.
+ * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a non-zero
+ * value to force the driver to always use device memory for physical storage.
+ * When this environment variable is set to a non-zero value, all devices used in
+ * that process that support managed memory have to be peer-to-peer compatible
+ * with each other. The error ::cudaErrorInvalidDevice will be returned if a device
+ * that supports managed memory is used and it is not peer-to-peer compatible with
+ * any of the other managed memory supporting devices that were previously used in
+ * that process, even if ::cudaDeviceReset has been called on those devices. These
+ * environment variables are described in the CUDA programming guide under the
+ * "CUDA environment variables" section.
+ * - On ARM, managed memory is not available on discrete GPUs with Drive PX-2.
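+ *
+ * A minimal sketch of the default (::cudaMemAttachGlobal) usage; the variable
+ * name and element count are placeholders:
+ * \code
+ * int *data = 0;
+ * if (cudaMallocManaged(&data, 256 * sizeof(int)) == cudaSuccess) {
+ *     for (int i = 0; i < 256; ++i) data[i] = i;  // the CPU writes the pointer directly
+ *     // kernels may dereference the same pointer, subject to the rules above
+ *     cudaFree(data);
+ * }
+ * \endcode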
+ *
+ * \param devPtr - Pointer to allocated device memory
+ * \param size - Requested allocation size in bytes
+ * \param flags - Must be either ::cudaMemAttachGlobal or ::cudaMemAttachHost (defaults to ::cudaMemAttachGlobal)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorMemoryAllocation,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray,
+ * ::cudaMalloc3D, ::cudaMalloc3DArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc, ::cudaDeviceGetAttribute, ::cudaStreamAttachMemAsync
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaMallocManaged(
+    T **devPtr,
+    size_t size,
+    unsigned int flags = cudaMemAttachGlobal
+)
+{
+    return ::cudaMallocManaged((void**)(void*)devPtr, size, flags);
+}
+
+/**
+ * \brief Attach memory to a stream asynchronously
+ *
+ * Enqueues an operation in \p stream to specify stream association of
+ * \p length bytes of memory starting from \p devPtr. This function is a
+ * stream-ordered operation, meaning that it is dependent on, and will
+ * only take effect when, previous work in \p stream has completed. Any
+ * previous association is automatically replaced.
+ *
+ * \p devPtr must point to an address within managed memory space declared
+ * using the __managed__ keyword or allocated with ::cudaMallocManaged.
+ *
+ * \p length must be zero to indicate that the entire allocation's
+ * stream association is being changed. Currently, it's not possible
+ * to change stream association for a portion of an allocation. The default
+ * value for \p length is zero.
+ *
+ * The stream association is specified using \p flags which must be
+ * one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle.
+ * The default value for \p flags is ::cudaMemAttachSingle.
+ * If the ::cudaMemAttachGlobal flag is specified, the memory can be accessed
+ * by any stream on any device.
+ * If the ::cudaMemAttachHost flag is specified, the program makes a guarantee
+ * that it won't access the memory on the device from any stream on a device that
+ * has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ * If the ::cudaMemAttachSingle flag is specified and \p stream is associated with
+ * a device that has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess,
+ * the program makes a guarantee that it will only access the memory on the device
+ * from \p stream. It is illegal to attach singly to the NULL stream, because the
+ * NULL stream is a virtual global stream and not a specific stream. An error will
+ * be returned in this case.
+ *
+ * When memory is associated with a single stream, the Unified Memory system will
+ * allow CPU access to this memory region so long as all operations in \p stream
+ * have completed, regardless of whether other streams are active. In effect,
+ * this constrains exclusive ownership of the managed memory region by
+ * an active GPU to per-stream activity instead of whole-GPU activity.
+ *
+ * Accessing memory on the device from streams that are not associated with
+ * it will produce undefined results. No error checking is performed by the
+ * Unified Memory system to ensure that kernels launched into other streams
+ * do not access this region.
+ *
+ * It is a program's responsibility to order calls to ::cudaStreamAttachMemAsync
+ * via events, synchronization or other means to ensure legal access to memory
+ * at all times.
Data visibility and coherency will be changed appropriately + * for all kernels which follow a stream-association change. + * + * If \p stream is destroyed while data is associated with it, the association is + * removed and the association reverts to the default visibility of the allocation + * as specified at ::cudaMallocManaged. For __managed__ variables, the default + * association is always ::cudaMemAttachGlobal. Note that destroying a stream is an + * asynchronous operation, and as a result, the change to default association won't + * happen until all work in the stream has completed. + * + * \param stream - Stream in which to enqueue the attach operation + * \param devPtr - Pointer to memory (must be a pointer to managed memory) + * \param length - Length of memory (must be zero, defaults to zero) + * \param flags - Must be one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle (defaults to ::cudaMemAttachSingle) + * + * \return + * ::cudaSuccess, + * ::cudaErrorNotReady, + * ::cudaErrorInvalidValue + * ::cudaErrorInvalidResourceHandle + * \notefnerr + * + * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy, ::cudaMallocManaged + */ +template +static __inline__ __host__ cudaError_t cudaStreamAttachMemAsync( + cudaStream_t stream, + T *devPtr, + size_t length = 0, + unsigned int flags = cudaMemAttachSingle +) +{ + return ::cudaStreamAttachMemAsync(stream, (void*)devPtr, length, flags); +} + +template +static __inline__ __host__ cudaError_t cudaMalloc( + T **devPtr, + size_t size +) +{ + return ::cudaMalloc((void**)(void*)devPtr, size); +} + +template +static __inline__ __host__ cudaError_t cudaMallocHost( + T **ptr, + size_t size, + unsigned int flags = 0 +) +{ + return cudaMallocHost((void**)(void*)ptr, size, flags); +} + +template +static __inline__ __host__ cudaError_t cudaMallocPitch( + T **devPtr, + size_t *pitch, + size_t width, + size_t height +) +{ + return ::cudaMallocPitch((void**)(void*)devPtr, pitch, width, height); +} + +#if defined(__CUDACC__) + +/** + * \brief \hl Copies data to the given symbol on the device + * + * Copies \p count bytes from the memory area pointed to by \p src + * to the memory area \p offset bytes from the start of symbol + * \p symbol. The memory areas may not overlap. \p symbol is a variable that + * resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToDevice. 
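+ *
+ * A brief sketch, assuming a device symbol defined elsewhere in the
+ * translation unit (the symbol name is a placeholder):
+ * \code
+ * __constant__ float coeffs[16];  // device symbol in constant memory
+ *
+ * float hostCoeffs[16] = { 0 };
+ * cudaMemcpyToSymbol(coeffs, hostCoeffs, sizeof(hostCoeffs));
+ * // offset defaults to 0 and kind to cudaMemcpyHostToDevice
+ * \endcode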
+ * + * \param symbol - Device symbol reference + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_sync + * \note_string_api_deprecation + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, + * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync + */ +template +static __inline__ __host__ cudaError_t cudaMemcpyToSymbol( + const T &symbol, + const void *src, + size_t count, + size_t offset = 0, + enum cudaMemcpyKind kind = cudaMemcpyHostToDevice +) +{ + return ::cudaMemcpyToSymbol((const void*)&symbol, src, count, offset, kind); +} + +/** + * \brief \hl Copies data to the given symbol on the device + * + * Copies \p count bytes from the memory area pointed to by \p src + * to the memory area \p offset bytes from the start of symbol + * \p symbol. The memory areas may not overlap. \p symbol is a variable that + * resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToDevice. + * + * ::cudaMemcpyToSymbolAsync() is asynchronous with respect to the host, so + * the call may return before the copy is complete. The copy can optionally + * be associated to a stream by passing a non-zero \p stream argument. If + * \p kind is ::cudaMemcpyHostToDevice and \p stream is non-zero, the copy + * may overlap with operations in other streams. + * + * \param symbol - Device symbol reference + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_async + * \note_string_api_deprecation + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, + * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyFromSymbolAsync + */ +template +static __inline__ __host__ cudaError_t cudaMemcpyToSymbolAsync( + const T &symbol, + const void *src, + size_t count, + size_t offset = 0, + enum cudaMemcpyKind kind = cudaMemcpyHostToDevice, + cudaStream_t stream = 0 +) +{ + return ::cudaMemcpyToSymbolAsync((const void*)&symbol, src, count, offset, kind, stream); +} + +/** + * \brief \hl Copies data from the given symbol on the device + * + * Copies \p count bytes from the memory area \p offset bytes + * from the start of symbol \p symbol to the memory area pointed to by \p dst. + * The memory areas may not overlap. 
\p symbol is a variable that + * resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyDeviceToHost or ::cudaMemcpyDeviceToDevice. + * + * \param dst - Destination memory address + * \param symbol - Device symbol reference + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_sync + * \note_string_api_deprecation + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, + * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync + */ +template +static __inline__ __host__ cudaError_t cudaMemcpyFromSymbol( + void *dst, + const T &symbol, + size_t count, + size_t offset = 0, + enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost +) +{ + return ::cudaMemcpyFromSymbol(dst, (const void*)&symbol, count, offset, kind); +} + +/** + * \brief \hl Copies data from the given symbol on the device + * + * Copies \p count bytes from the memory area \p offset bytes + * from the start of symbol \p symbol to the memory area pointed to by \p dst. + * The memory areas may not overlap. \p symbol is a variable that resides in + * global or constant memory space. \p kind can be either + * ::cudaMemcpyDeviceToHost or ::cudaMemcpyDeviceToDevice. + * + * ::cudaMemcpyFromSymbolAsync() is asynchronous with respect to the host, so + * the call may return before the copy is complete. The copy can optionally be + * associated to a stream by passing a non-zero \p stream argument. If \p kind + * is ::cudaMemcpyDeviceToHost and \p stream is non-zero, the copy may overlap + * with operations in other streams. + * + * \param dst - Destination memory address + * \param symbol - Device symbol reference + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_async + * \note_string_api_deprecation + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, + * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync + */ +template +static __inline__ __host__ cudaError_t cudaMemcpyFromSymbolAsync( + void *dst, + const T &symbol, + size_t count, + size_t offset = 0, + enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost, + cudaStream_t stream = 0 +) +{ + return ::cudaMemcpyFromSymbolAsync(dst, (const void*)&symbol, count, offset, kind, stream); +} + +/** + * \brief \hl Finds the address associated with a CUDA symbol + * + * Returns in \p *devPtr the address of symbol \p symbol on the device. 
+ * \p symbol can either be a variable that resides in global or constant memory space. + * If \p symbol cannot be found, or if \p symbol is not declared + * in the global or constant memory space, \p *devPtr is unchanged and the error + * ::cudaErrorInvalidSymbol is returned. + * + * \param devPtr - Return device pointer associated with symbol + * \param symbol - Device symbol reference + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * + * \sa \ref ::cudaGetSymbolAddress(void**, const void*) "cudaGetSymbolAddress (C API)", + * \ref ::cudaGetSymbolSize(size_t*, const T&) "cudaGetSymbolSize (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaGetSymbolAddress( + void **devPtr, + const T &symbol +) +{ + return ::cudaGetSymbolAddress(devPtr, (const void*)&symbol); +} + +/** + * \brief \hl Finds the size of the object associated with a CUDA symbol + * + * Returns in \p *size the size of symbol \p symbol. \p symbol must be a + * variable that resides in global or constant memory space. + * If \p symbol cannot be found, or if \p symbol is not declared + * in global or constant memory space, \p *size is unchanged and the error + * ::cudaErrorInvalidSymbol is returned. + * + * \param size - Size of object associated with symbol + * \param symbol - Device symbol reference + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * + * \sa \ref ::cudaGetSymbolAddress(void**, const T&) "cudaGetSymbolAddress (C++ API)", + * \ref ::cudaGetSymbolSize(size_t*, const void*) "cudaGetSymbolSize (C API)" + */ +template +static __inline__ __host__ cudaError_t cudaGetSymbolSize( + size_t *size, + const T &symbol +) +{ + return ::cudaGetSymbolSize(size, (const void*)&symbol); +} + +/** + * \brief \hl Binds a memory area to a texture + * + * Binds \p size bytes of the memory area pointed to by \p devPtr to texture + * reference \p tex. \p desc describes how the memory is interpreted when + * fetching values from the texture. The \p offset parameter is an optional + * byte offset as with the low-level + * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture()" + * function. Any memory previously bound to \p tex is unbound. 
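+ *
+ * A minimal sketch of binding linear device memory (the texture reference and
+ * buffer names are placeholders):
+ * \code
+ * texture<float, 1, cudaReadModeElementType> texRef;  // file-scope texture reference
+ *
+ * float *devBuf;
+ * cudaMalloc((void**)&devBuf, 1024 * sizeof(float));
+ * cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
+ * size_t offset = 0;
+ * cudaBindTexture(&offset, texRef, devBuf, desc, 1024 * sizeof(float));
+ * \endcode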
+ * + * \param offset - Offset in bytes + * \param tex - Texture to bind + * \param devPtr - Memory area on device + * \param desc - Channel format + * \param size - Size of the memory area pointed to by devPtr + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaBindTexture( + size_t *offset, + const struct texture &tex, + const void *devPtr, + const struct cudaChannelFormatDesc &desc, + size_t size = UINT_MAX +) +{ + return ::cudaBindTexture(offset, &tex, devPtr, &desc, size); +} + +/** + * \brief \hl Binds a memory area to a texture + * + * Binds \p size bytes of the memory area pointed to by \p devPtr to texture + * reference \p tex. The channel descriptor is inherited from the texture + * reference type. The \p offset parameter is an optional byte offset as with + * the low-level + * ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) + * function. Any memory previously bound to \p tex is unbound. 
+ * + * \param offset - Offset in bytes + * \param tex - Texture to bind + * \param devPtr - Memory area on device + * \param size - Size of the memory area pointed to by devPtr + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaBindTexture( + size_t *offset, + const struct texture &tex, + const void *devPtr, + size_t size = UINT_MAX +) +{ + return cudaBindTexture(offset, tex, devPtr, tex.channelDesc, size); +} + +/** + * \brief \hl Binds a 2D memory area to a texture + * + * Binds the 2D memory area pointed to by \p devPtr to the + * texture reference \p tex. The size of the area is constrained by + * \p width in texel units, \p height in texel units, and \p pitch in byte + * units. \p desc describes how the memory is interpreted when fetching values + * from the texture. Any memory previously bound to \p tex is unbound. + * + * Since the hardware enforces an alignment requirement on texture base + * addresses, + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D()" + * returns in \p *offset a byte offset that + * must be applied to texture fetches in order to read from the desired memory. + * This offset must be divided by the texel size and passed to kernels that + * read from the texture so they can be applied to the ::tex2D() function. + * If the device memory pointer was returned from ::cudaMalloc(), the offset is + * guaranteed to be 0 and NULL may be passed as the \p offset parameter. 
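+ *
+ * A short sketch with pitched device memory (dimensions and names are
+ * illustrative only):
+ * \code
+ * texture<float, 2, cudaReadModeElementType> texRef2D;
+ *
+ * float *devPtr;
+ * size_t pitch;
+ * cudaMallocPitch((void**)&devPtr, &pitch, 640 * sizeof(float), 480);
+ * cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
+ * cudaBindTexture2D(NULL, texRef2D, devPtr, desc, 640, 480, pitch);
+ * // NULL is allowed for the offset because cudaMallocPitch() returns
+ * // a suitably aligned pointer
+ * \endcode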
+ * + * \param offset - Offset in bytes + * \param tex - Texture reference to bind + * \param devPtr - 2D memory area on device + * \param desc - Channel format + * \param width - Width in texel units + * \param height - Height in texel units + * \param pitch - Pitch in bytes + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaBindTexture2D( + size_t *offset, + const struct texture &tex, + const void *devPtr, + const struct cudaChannelFormatDesc &desc, + size_t width, + size_t height, + size_t pitch +) +{ + return ::cudaBindTexture2D(offset, &tex, devPtr, &desc, width, height, pitch); +} + +/** + * \brief \hl Binds a 2D memory area to a texture + * + * Binds the 2D memory area pointed to by \p devPtr to the + * texture reference \p tex. The size of the area is constrained by + * \p width in texel units, \p height in texel units, and \p pitch in byte + * units. The channel descriptor is inherited from the texture reference + * type. Any memory previously bound to \p tex is unbound. + * + * Since the hardware enforces an alignment requirement on texture base + * addresses, + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D()" + * returns in \p *offset a byte offset that + * must be applied to texture fetches in order to read from the desired memory. + * This offset must be divided by the texel size and passed to kernels that + * read from the texture so they can be applied to the ::tex2D() function. + * If the device memory pointer was returned from ::cudaMalloc(), the offset is + * guaranteed to be 0 and NULL may be passed as the \p offset parameter. 
+ * + * \param offset - Offset in bytes + * \param tex - Texture reference to bind + * \param devPtr - 2D memory area on device + * \param width - Width in texel units + * \param height - Height in texel units + * \param pitch - Pitch in bytes + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaBindTexture2D( + size_t *offset, + const struct texture &tex, + const void *devPtr, + size_t width, + size_t height, + size_t pitch +) +{ + return ::cudaBindTexture2D(offset, &tex, devPtr, &tex.channelDesc, width, height, pitch); +} + +/** + * \brief \hl Binds an array to a texture + * + * Binds the CUDA array \p array to the texture reference \p tex. + * \p desc describes how the memory is interpreted when fetching values from + * the texture. Any CUDA array previously bound to \p tex is unbound. 
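+ *
+ * A minimal sketch (array dimensions and names are placeholders):
+ * \code
+ * texture<float, 2, cudaReadModeElementType> texRef;
+ *
+ * cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
+ * cudaArray_t arr;
+ * cudaMallocArray(&arr, &desc, 64, 64);
+ * // ... copy texel data into arr, e.g. with cudaMemcpyToArray() ...
+ * cudaBindTextureToArray(texRef, arr, desc);
+ * \endcode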
+ * + * \param tex - Texture to bind + * \param array - Memory array on device + * \param desc - Channel format + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaBindTextureToArray( + const struct texture &tex, + cudaArray_const_t array, + const struct cudaChannelFormatDesc &desc +) +{ + return ::cudaBindTextureToArray(&tex, array, &desc); +} + +/** + * \brief \hl Binds an array to a texture + * + * Binds the CUDA array \p array to the texture reference \p tex. + * The channel descriptor is inherited from the CUDA array. Any CUDA array + * previously bound to \p tex is unbound. 
+ * + * \param tex - Texture to bind + * \param array - Memory array on device + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaBindTextureToArray( + const struct texture &tex, + cudaArray_const_t array +) +{ + struct cudaChannelFormatDesc desc; + cudaError_t err = ::cudaGetChannelDesc(&desc, array); + + return err == cudaSuccess ? cudaBindTextureToArray(tex, array, desc) : err; +} + +/** + * \brief \hl Binds a mipmapped array to a texture + * + * Binds the CUDA mipmapped array \p mipmappedArray to the texture reference \p tex. + * \p desc describes how the memory is interpreted when fetching values from + * the texture. Any CUDA mipmapped array previously bound to \p tex is unbound. 
+ * + * \param tex - Texture to bind + * \param mipmappedArray - Memory mipmapped array on device + * \param desc - Channel format + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaBindTextureToMipmappedArray( + const struct texture &tex, + cudaMipmappedArray_const_t mipmappedArray, + const struct cudaChannelFormatDesc &desc +) +{ + return ::cudaBindTextureToMipmappedArray(&tex, mipmappedArray, &desc); +} + +/** + * \brief \hl Binds a mipmapped array to a texture + * + * Binds the CUDA mipmapped array \p mipmappedArray to the texture reference \p tex. + * The channel descriptor is inherited from the CUDA array. Any CUDA mipmapped array + * previously bound to \p tex is unbound. 
+ * + * \param tex - Texture to bind + * \param mipmappedArray - Memory mipmapped array on device + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaBindTextureToMipmappedArray( + const struct texture &tex, + cudaMipmappedArray_const_t mipmappedArray +) +{ + struct cudaChannelFormatDesc desc; + cudaArray_t levelArray; + cudaError_t err = ::cudaGetMipmappedArrayLevel(&levelArray, mipmappedArray, 0); + + if (err != cudaSuccess) { + return err; + } + err = ::cudaGetChannelDesc(&desc, levelArray); + + return err == cudaSuccess ? cudaBindTextureToMipmappedArray(tex, mipmappedArray, desc) : err; +} + +/** + * \brief \hl Unbinds a texture + * + * Unbinds the texture bound to \p tex. 
+ * + * \param tex - Texture to unbind + * + * \return ::cudaSuccess + * \notefnerr + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaUnbindTexture( + const struct texture &tex +) +{ + return ::cudaUnbindTexture(&tex); +} + +/** + * \brief \hl Get the alignment offset of a texture + * + * Returns in \p *offset the offset that was returned when texture reference + * \p tex was bound. + * + * \param offset - Offset of texture reference in bytes + * \param tex - Texture to get offset of + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidTexture, + * ::cudaErrorInvalidTextureBinding + * \notefnerr + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)" + */ +template +static __inline__ __host__ cudaError_t cudaGetTextureAlignmentOffset( + size_t *offset, + const struct texture &tex +) +{ + return ::cudaGetTextureAlignmentOffset(offset, &tex); +} + +/** + * \brief \hl Sets the preferred cache configuration for a device function + * + * On devices where the L1 cache and shared memory use the same hardware + * resources, this sets through \p cacheConfig the preferred cache configuration + * for the 
function specified via \p func. This is only a preference. The + * runtime will use the requested configuration if possible, but it is free to + * choose a different configuration if required to execute \p func. + * + * \p func must be a pointer to a function that executes on the device. + * The parameter specified by \p func must be declared as a \p __global__ + * function. If the specified function does not exist, + * then ::cudaErrorInvalidDeviceFunction is returned. + * + * This setting does nothing on devices where the size of the L1 cache and + * shared memory are fixed. + * + * Launching a kernel with a different preference than the most recent + * preference setting may insert a device-side synchronization point. + * + * The supported cache configurations are: + * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default) + * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache + * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory + * + * \param func - device function pointer + * \param cacheConfig - Requested cache configuration + * + * \return + * ::cudaSuccess, + * ::cudaErrorInitializationError, + * ::cudaErrorInvalidDeviceFunction + * \notefnerr + * + * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)", + * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)", + * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)", + * ::cudaSetDoubleForDevice, + * ::cudaSetDoubleForHost, + * \ref ::cudaSetupArgument(T, size_t) "cudaSetupArgument (C++ API)", + * ::cudaThreadGetCacheConfig, + * ::cudaThreadSetCacheConfig + */ +template +static __inline__ __host__ cudaError_t cudaFuncSetCacheConfig( + T *func, + enum cudaFuncCache cacheConfig +) +{ + return ::cudaFuncSetCacheConfig((const void*)func, cacheConfig); +} + +template +static __inline__ __host__ cudaError_t cudaFuncSetSharedMemConfig( + T *func, + enum cudaSharedMemConfig config +) +{ + return ::cudaFuncSetSharedMemConfig((const void*)func, config); +} + +/** + * \brief Returns occupancy for a device function + * + * Returns in \p *numBlocks the maximum number of active blocks per + * streaming multiprocessor for the device function. 
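+ *
+ * A short sketch, assuming a __global__ kernel MyKernel defined elsewhere
+ * (the kernel name, block size and shared-memory figure are placeholders):
+ * \code
+ * int numBlocks = 0;
+ * cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, MyKernel, 256, 0);
+ * // numBlocks now holds the maximum resident blocks per multiprocessor for
+ * // 256-thread blocks that use no dynamic shared memory
+ * \endcode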
+ *
+ * \param numBlocks - Returned occupancy
+ * \param func - Kernel function for which occupancy is calculated
+ * \param blockSize - Block size the kernel is intended to be launched with
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorCudartUnloading,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ *
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSize
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+    int *numBlocks,
+    T func,
+    int blockSize,
+    size_t dynamicSMemSize)
+{
+    return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, (const void*)func, blockSize, dynamicSMemSize, cudaOccupancyDefault);
+}
+
+/**
+ * \brief Returns occupancy for a device function with the specified flags
+ *
+ * Returns in \p *numBlocks the maximum number of active blocks per
+ * streaming multiprocessor for the device function.
+ *
+ * The \p flags parameter controls how special cases are handled. Valid flags include:
+ *
+ * - ::cudaOccupancyDefault: keeps the default behavior as
+ * ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ *
+ * - ::cudaOccupancyDisableCachingOverride: suppresses the default behavior
+ * on platforms where global caching affects occupancy. On such platforms, if caching
+ * is enabled, but per-block SM resource usage would result in zero occupancy, the
+ * occupancy calculator will calculate the occupancy as if caching is disabled.
+ * Setting this flag causes the occupancy calculator to return 0 in such cases.
+ * More information can be found about this feature in the "Unified L1/Texture Cache"
+ * section of the Maxwell tuning guide.
+ *
+ * \param numBlocks - Returned occupancy
+ * \param func - Kernel function for which occupancy is calculated
+ * \param blockSize - Block size the kernel is intended to be launched with
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ * \param flags - Requested behavior for the occupancy calculator
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorCudartUnloading,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ *
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ * \sa ::cudaOccupancyMaxPotentialBlockSize
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+    int *numBlocks,
+    T func,
+    int blockSize,
+    size_t dynamicSMemSize,
+    unsigned int flags)
+{
+    return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, (const void*)func, blockSize, dynamicSMemSize, flags);
+}
+
+/**
+ * Helper functor for cudaOccupancyMaxPotentialBlockSize
+ */
+class __cudaOccupancyB2DHelper {
+    size_t n;
+public:
+    inline __host__ CUDART_DEVICE __cudaOccupancyB2DHelper(size_t n_) : n(n_) {}
+    inline __host__ CUDART_DEVICE size_t operator()(int)
+    {
+        return n;
+    }
+};
+
+/**
+ * \brief Returns grid and block size that achieves maximum potential occupancy for a device function
+ *
+ * Returns in \p *minGridSize and \p *blockSize a suggested grid /
+ * block size pair that achieves the best potential occupancy
+ * (i.e. the maximum number of active warps with the smallest number
+ * of blocks).
+ *
+ * The \p flags parameter controls how special cases are handled. Valid flags include:
+ *
+ * - ::cudaOccupancyDefault: keeps the default behavior as
+ * ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
+ *
+ * - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior
+ * on platforms where global caching affects occupancy. On such platforms, if caching
+ * is enabled, but per-block SM resource usage would result in zero occupancy, the
+ * occupancy calculator will calculate the occupancy as if caching is disabled.
+ * Setting this flag causes the occupancy calculator to return 0 in such cases.
+ * More information can be found about this feature in the "Unified L1/Texture Cache"
+ * section of the Maxwell tuning guide.
+ *
+ * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy
+ * \param blockSize - Returned block size
+ * \param func - Device function symbol
+ * \param blockSizeToDynamicSMemSize - A unary function / functor that takes block size, and returns the size, in bytes, of dynamic shared memory needed for a block
+ * \param blockSizeLimit - The maximum block size \p func is designed to work with. 0 means no limit.
+ * \param flags - Requested behavior for the occupancy calculator + * + * \return + * ::cudaSuccess, + * ::cudaErrorCudartUnloading, + * ::cudaErrorInitializationError, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSize + * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags + */ + +template +static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags( + int *minGridSize, + int *blockSize, + T func, + UnaryFunction blockSizeToDynamicSMemSize, + int blockSizeLimit = 0, + unsigned int flags = 0) +{ + cudaError_t status; + + // Device and function properties + int device; + struct cudaFuncAttributes attr; + + // Limits + int maxThreadsPerMultiProcessor; + int warpSize; + int devMaxThreadsPerBlock; + int multiProcessorCount; + int funcMaxThreadsPerBlock; + int occupancyLimit; + int granularity; + + // Recorded maximum + int maxBlockSize = 0; + int numBlocks = 0; + int maxOccupancy = 0; + + // Temporary + int blockSizeToTryAligned; + int blockSizeToTry; + int blockSizeLimitAligned; + int occupancyInBlocks; + int occupancyInThreads; + size_t dynamicSMemSize; + + /////////////////////////// + // Check user input + /////////////////////////// + + if (!minGridSize || !blockSize || !func) { + return cudaErrorInvalidValue; + } + + ////////////////////////////////////////////// + // Obtain device and function properties + ////////////////////////////////////////////// + + status = ::cudaGetDevice(&device); + if (status != cudaSuccess) { + return status; + } + + status = cudaDeviceGetAttribute( + &maxThreadsPerMultiProcessor, + cudaDevAttrMaxThreadsPerMultiProcessor, + device); + if (status != cudaSuccess) { + return status; + } + + status = cudaDeviceGetAttribute( + &warpSize, + cudaDevAttrWarpSize, + device); + if (status != cudaSuccess) { + return status; + } + + status = cudaDeviceGetAttribute( + &devMaxThreadsPerBlock, + cudaDevAttrMaxThreadsPerBlock, + device); + if (status != cudaSuccess) { + return status; + } + + status = cudaDeviceGetAttribute( + &multiProcessorCount, + cudaDevAttrMultiProcessorCount, + device); + if (status != cudaSuccess) { + return status; + } + + status = cudaFuncGetAttributes(&attr, func); + if (status != cudaSuccess) { + return status; + } + + funcMaxThreadsPerBlock = attr.maxThreadsPerBlock; + + ///////////////////////////////////////////////////////////////////////////////// + // Try each block size, and pick the block size with maximum occupancy + ///////////////////////////////////////////////////////////////////////////////// + + occupancyLimit = maxThreadsPerMultiProcessor; + granularity = warpSize; + + if (blockSizeLimit == 0) { + blockSizeLimit = devMaxThreadsPerBlock; + } + + if (devMaxThreadsPerBlock < blockSizeLimit) { + blockSizeLimit = devMaxThreadsPerBlock; + } + + if (funcMaxThreadsPerBlock < blockSizeLimit) { + blockSizeLimit = funcMaxThreadsPerBlock; + } + + blockSizeLimitAligned = ((blockSizeLimit + (granularity - 1)) / granularity) * granularity; + + for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) { + // This is needed for the first iteration, because + // blockSizeLimitAligned could be greater than blockSizeLimit + // + if 
(blockSizeLimit < blockSizeToTryAligned) { + blockSizeToTry = blockSizeLimit; + } else { + blockSizeToTry = blockSizeToTryAligned; + } + + dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry); + + status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + &occupancyInBlocks, + func, + blockSizeToTry, + dynamicSMemSize, + flags); + + if (status != cudaSuccess) { + return status; + } + + occupancyInThreads = blockSizeToTry * occupancyInBlocks; + + if (occupancyInThreads > maxOccupancy) { + maxBlockSize = blockSizeToTry; + numBlocks = occupancyInBlocks; + maxOccupancy = occupancyInThreads; + } + + // Early out if we have reached the maximum + // + if (occupancyLimit == maxOccupancy) { + break; + } + } + + /////////////////////////// + // Return best available + /////////////////////////// + + // Suggested min grid size to achieve a full machine launch + // + *minGridSize = numBlocks * multiProcessorCount; + *blockSize = maxBlockSize; + + return status; +} + +/** + * \brief Returns grid and block size that achieves maximum potential occupancy for a device function + * + * Returns in \p *minGridSize and \p *blocksize a suggested grid / + * block size pair that achieves the best potential occupancy + * (i.e. the maximum number of active warps with the smallest number + * of blocks). + * + * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy + * \param blockSize - Returned block size + * \param func - Device function symbol + * \param blockSizeToDynamicSMemSize - A unary function / functor that takes block size, and returns the size, in bytes, of dynamic shared memory needed for a block + * \param blockSizeLimit - The maximum block size \p func is designed to work with. 0 means no limit. + * + * \return + * ::cudaSuccess, + * ::cudaErrorCudartUnloading, + * ::cudaErrorInitializationError, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSize + * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags + */ + +template +static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMem( + int *minGridSize, + int *blockSize, + T func, + UnaryFunction blockSizeToDynamicSMemSize, + int blockSizeLimit = 0) +{ + return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, blockSizeToDynamicSMemSize, blockSizeLimit, cudaOccupancyDefault); +} + +/** + * \brief Returns grid and block size that achieves maximum potential occupancy for a device function + * + * Returns in \p *minGridSize and \p *blocksize a suggested grid / + * block size pair that achieves the best potential occupancy + * (i.e. the maximum number of active warps with the smallest number + * of blocks). + * + * Use \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem if the + * amount of per-block dynamic shared memory changes with different + * block sizes. + * + * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy + * \param blockSize - Returned block size + * \param func - Device function symbol + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * \param blockSizeLimit - The maximum block size \p func is designed to work with. 
0 means no limit.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorCudartUnloading,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ *
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
+ */
+template<typename T>
+static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSize(
+    int    *minGridSize,
+    int    *blockSize,
+    T       func,
+    size_t  dynamicSMemSize = 0,
+    int     blockSizeLimit = 0)
+{
+    return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit, cudaOccupancyDefault);
+}
+
+/**
+ * \brief Returns grid and block size that achieves maximum potential occupancy for a device function with the specified flags
+ *
+ * Returns in \p *minGridSize and \p *blockSize a suggested grid /
+ * block size pair that achieves the best potential occupancy
+ * (i.e. the maximum number of active warps with the smallest number
+ * of blocks).
+ *
+ * The \p flags parameter controls how special cases are handled. Valid flags include:
+ *
+ * - ::cudaOccupancyDefault: keeps the default behavior as
+ *   ::cudaOccupancyMaxPotentialBlockSize
+ *
+ * - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior
+ *   on platforms where global caching affects occupancy. On such platforms, if caching
+ *   is enabled, but per-block SM resource usage would result in zero occupancy, the
+ *   occupancy calculator will calculate the occupancy as if caching is disabled.
+ *   Setting this flag makes the occupancy calculator return 0 in such cases.
+ *   More information about this feature can be found in the "Unified L1/Texture Cache"
+ *   section of the Maxwell tuning guide.
+ *
+ * Use \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem if the
+ * amount of per-block dynamic shared memory changes with different
+ * block sizes.
+ *
+ * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy
+ * \param blockSize   - Returned block size
+ * \param func        - Device function symbol
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ * \param blockSizeLimit  - The maximum block size \p func is designed to work with. 0 means no limit. 
+ * \param flags - Requested behavior for the occupancy calculator + * + * \return + * ::cudaSuccess, + * ::cudaErrorCudartUnloading, + * ::cudaErrorInitializationError, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * + * \sa ::cudaOccupancyMaxPotentialBlockSize + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + */ +template +static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeWithFlags( + int *minGridSize, + int *blockSize, + T func, + size_t dynamicSMemSize = 0, + int blockSizeLimit = 0, + unsigned int flags = 0) +{ + return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit, flags); +} + +/** + * \brief \hl Launches a device function + * + * \deprecated This function is deprecated as of CUDA 7.0 + * + * Launches the function \p func on the device. The parameter \p func must + * be a function that executes on the device. The parameter specified by \p func + * must be declared as a \p __global__ function. + * \ref ::cudaLaunch(T*) "cudaLaunch()" must be preceded by a call to + * ::cudaConfigureCall() since it pops the data that was pushed by + * ::cudaConfigureCall() from the execution stack. + * + * \param func - Device function pointer + * to execute + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidConfiguration, + * ::cudaErrorLaunchFailure, + * ::cudaErrorLaunchTimeout, + * ::cudaErrorLaunchOutOfResources, + * ::cudaErrorSharedObjectSymbolNotFound, + * ::cudaErrorSharedObjectInitFailed, + * ::cudaErrorInvalidPtx, + * ::cudaErrorNoKernelImageForDevice, + * ::cudaErrorJitCompilerNotFound + * \notefnerr + * + * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)", + * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)", + * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)", + * \ref ::cudaLaunch(const void*) "cudaLaunch (C API)", + * ::cudaSetDoubleForDevice, + * ::cudaSetDoubleForHost, + * \ref ::cudaSetupArgument(T, size_t) "cudaSetupArgument (C++ API)", + * ::cudaThreadGetCacheConfig, + * ::cudaThreadSetCacheConfig + */ +template +static __inline__ __host__ cudaError_t cudaLaunch( + T *func +) +{ + return ::cudaLaunch((const void*)func); +} + +/** + * \brief \hl Find out attributes for a given function + * + * This function obtains the attributes of a function specified via \p entry. + * The parameter \p entry must be a pointer to a function that executes + * on the device. The parameter specified by \p entry must be declared as a \p __global__ + * function. The fetched attributes are placed in \p attr. If the specified + * function does not exist, then ::cudaErrorInvalidDeviceFunction is returned. + * + * Note that some function attributes such as + * \ref ::cudaFuncAttributes::maxThreadsPerBlock "maxThreadsPerBlock" + * may vary based on the device that is currently being used. 
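+ *
+ * As a minimal sketch (the kernel \p myKernel below is a hypothetical
+ * example, not part of this API):
+ * \code
+   __global__ void myKernel(float *data) { }
+
+   void checkAttributes(void)
+   {
+       struct cudaFuncAttributes attr;
+       if (cudaFuncGetAttributes(&attr, myKernel) == cudaSuccess)
+           printf("maxThreadsPerBlock = %d\n", attr.maxThreadsPerBlock);
+   }
+   \endcode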
+ * + * \param attr - Return pointer to function's attributes + * \param entry - Function to get attributes of + * + * \return + * ::cudaSuccess, + * ::cudaErrorInitializationError, + * ::cudaErrorInvalidDeviceFunction + * \notefnerr + * + * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)", + * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)", + * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)", + * ::cudaSetDoubleForDevice, + * ::cudaSetDoubleForHost, + * \ref ::cudaSetupArgument(T, size_t) "cudaSetupArgument (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaFuncGetAttributes( + struct cudaFuncAttributes *attr, + T *entry +) +{ + return ::cudaFuncGetAttributes(attr, (const void*)entry); +} + +/** + * \brief \hl Set attributes for a given function + * + * This function sets the attributes of a function specified via \p entry. + * The parameter \p entry must be a pointer to a function that executes + * on the device. The parameter specified by \p entry must be declared as a \p __global__ + * function. The enumeration defined by \p attr is set to the value defined by \p value. + * If the specified function does not exist, then ::cudaErrorInvalidDeviceFunction is returned. + * If the specified attribute cannot be written, or if the value is incorrect, + * then ::cudaErrorInvalidValue is returned. + * + * Valid values for \p attr are: + * - ::cudaFuncAttributeMaxDynamicSharedMemorySize - Maximum size of dynamic shared memory per block + * - ::cudaFuncAttributePreferredSharedMemoryCarveout - Preferred shared memory-L1 cache split ratio in percent of maximum shared memory. + * + * \param entry - Function to get attributes of + * \param attr - Attribute to set + * \param value - Value to set + * + * \return + * ::cudaSuccess, + * ::cudaErrorInitializationError, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue + * \notefnerr + * + * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)", + * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)", + * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)", + * ::cudaSetDoubleForDevice, + * ::cudaSetDoubleForHost, + * \ref ::cudaSetupArgument(T, size_t) "cudaSetupArgument (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaFuncSetAttribute( + T *entry, + enum cudaFuncAttribute attr, + int value +) +{ + return ::cudaFuncSetAttribute((const void*)entry, attr, value); +} + +/** + * \brief \hl Binds an array to a surface + * + * Binds the CUDA array \p array to the surface reference \p surf. + * \p desc describes how the memory is interpreted when dealing with + * the surface. Any CUDA array previously bound to \p surf is unbound. 
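+ *
+ * As a minimal sketch, assuming a 2D surface reference and a hypothetical,
+ * previously created CUDA array \p cuArray:
+ * \code
+   surface<void, cudaSurfaceType2D> surfRef;
+
+   void bind(cudaArray_t cuArray)
+   {
+       struct cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
+       cudaBindSurfaceToArray(surfRef, cuArray, desc);
+   }
+   \endcode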
+ * + * \param surf - Surface to bind + * \param array - Memory array on device + * \param desc - Channel format + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSurface + * \notefnerr + * + * \sa \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)", + * \ref ::cudaBindSurfaceToArray(const struct surface&, cudaArray_const_t) "cudaBindSurfaceToArray (C++ API, inherited channel descriptor)" + */ +template +static __inline__ __host__ cudaError_t cudaBindSurfaceToArray( + const struct surface &surf, + cudaArray_const_t array, + const struct cudaChannelFormatDesc &desc +) +{ + return ::cudaBindSurfaceToArray(&surf, array, &desc); +} + +/** + * \brief \hl Binds an array to a surface + * + * Binds the CUDA array \p array to the surface reference \p surf. + * The channel descriptor is inherited from the CUDA array. Any CUDA array + * previously bound to \p surf is unbound. + * + * \param surf - Surface to bind + * \param array - Memory array on device + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSurface + * \notefnerr + * + * \sa \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)", + * \ref ::cudaBindSurfaceToArray(const struct surface&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindSurfaceToArray (C++ API)" + */ +template +static __inline__ __host__ cudaError_t cudaBindSurfaceToArray( + const struct surface &surf, + cudaArray_const_t array +) +{ + struct cudaChannelFormatDesc desc; + cudaError_t err = ::cudaGetChannelDesc(&desc, array); + + return err == cudaSuccess ? cudaBindSurfaceToArray(surf, array, desc) : err; +} + +#endif /* __CUDACC__ */ + +/** @} */ /* END CUDART_HIGHLEVEL */ + +#endif /* __cplusplus && !__CUDACC_RTC__ */ + +#if !defined(__CUDACC_RTC__) +#if defined(__GNUC__) +#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))) +#pragma GCC diagnostic pop +#endif +#elif defined(_MSC_VER) +#pragma warning(pop) +#endif +#endif + +#endif /* !__CUDA_RUNTIME_H__ */ diff --git a/include/external/CUDA/cuda_runtime_api.h b/include/external/CUDA/cuda_runtime_api.h new file mode 100755 index 000000000..4f2997cdd --- /dev/null +++ b/include/external/CUDA/cuda_runtime_api.h @@ -0,0 +1,7422 @@ +/* + * Copyright 1993-2017 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__CUDA_RUNTIME_API_H__) +#define __CUDA_RUNTIME_API_H__ + +/** + * \latexonly + * \page sync_async API synchronization behavior + * + * \section memcpy_sync_async_behavior Memcpy + * The API provides memcpy/memset functions in both synchronous and asynchronous forms, + * the latter having an \e "Async" suffix. This is a misnomer as each function + * may exhibit synchronous or asynchronous behavior depending on the arguments + * passed to the function. In the reference documentation, each memcpy function is + * categorized as \e synchronous or \e asynchronous, corresponding to the definitions + * below. + * + * \subsection MemcpySynchronousBehavior Synchronous + * + *
+ * <ol>
+ * <li> For transfers from pageable host memory to device memory, a stream sync is performed
+ * before the copy is initiated. The function will return once the pageable
+ * buffer has been copied to the staging memory for DMA transfer to device memory,
+ * but the DMA to final destination may not have completed.
+ *
+ * <li> For transfers from pinned host memory to device memory, the function is synchronous
+ * with respect to the host.
+ *
+ * <li> For transfers from device to either pageable or pinned host memory, the function returns
+ * only once the copy has completed.
+ *
+ * <li> For transfers from device memory to device memory, no host-side synchronization is
+ * performed.
+ *
+ * <li> For transfers from any host memory to any host memory, the function is fully
+ * synchronous with respect to the host.
+ * </ol>
+ *
+ * \subsection MemcpyAsynchronousBehavior Asynchronous
+ *
+ * <ol>
+ * <li> For transfers from device memory to pageable host memory, the function
+ * will return only once the copy has completed.
+ *
+ * <li> For transfers from any host memory to any host memory, the function is fully
+ * synchronous with respect to the host.
+ *
+ * <li> For all other transfers, the function is fully asynchronous. If pageable
+ * memory must first be staged to pinned memory, this will be handled
+ * asynchronously with a worker thread.
+ * </ol>
+ * + * \section memset_sync_async_behavior Memset + * The cudaMemset functions are asynchronous with respect to the host + * except when the target memory is pinned host memory. The \e Async + * versions are always asynchronous with respect to the host. + * + * \section kernel_launch_details Kernel Launches + * Kernel launches are asynchronous with respect to the host. Details of + * concurrent kernel execution and data transfers can be found in the CUDA + * Programmers Guide. + * + * \endlatexonly + */ + +/** + * There are two levels for the runtime API. + * + * The C API (cuda_runtime_api.h) is + * a C-style interface that does not require compiling with \p nvcc. + * + * The \ref CUDART_HIGHLEVEL "C++ API" (cuda_runtime.h) is a + * C++-style interface built on top of the C API. It wraps some of the + * C API routines, using overloading, references and default arguments. + * These wrappers can be used from C++ code and can be compiled with any C++ + * compiler. The C++ API also has some CUDA-specific wrappers that wrap + * C API routines that deal with symbols, textures, and device functions. + * These wrappers require the use of \p nvcc because they depend on code being + * generated by the compiler. For example, the execution configuration syntax + * to invoke kernels is only available in source code compiled with \p nvcc. + */ + +/** CUDA Runtime API Version */ +#define CUDART_VERSION 9000 + +#include "host_defines.h" +#include "builtin_types.h" + +#include "cuda_device_runtime_api.h" + +#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) || defined(__CUDA_API_VERSION_INTERNAL) + #define __CUDART_API_PER_THREAD_DEFAULT_STREAM + #define __CUDART_API_PTDS(api) api ## _ptds + #define __CUDART_API_PTSZ(api) api ## _ptsz +#else + #define __CUDART_API_PTDS(api) api + #define __CUDART_API_PTSZ(api) api +#endif + +#if defined(__CUDART_API_PER_THREAD_DEFAULT_STREAM) + #define cudaMemcpy __CUDART_API_PTDS(cudaMemcpy) + #define cudaMemcpyToSymbol __CUDART_API_PTDS(cudaMemcpyToSymbol) + #define cudaMemcpyFromSymbol __CUDART_API_PTDS(cudaMemcpyFromSymbol) + #define cudaMemcpy2D __CUDART_API_PTDS(cudaMemcpy2D) + #define cudaMemcpyToArray __CUDART_API_PTDS(cudaMemcpyToArray) + #define cudaMemcpy2DToArray __CUDART_API_PTDS(cudaMemcpy2DToArray) + #define cudaMemcpyFromArray __CUDART_API_PTDS(cudaMemcpyFromArray) + #define cudaMemcpy2DFromArray __CUDART_API_PTDS(cudaMemcpy2DFromArray) + #define cudaMemcpyArrayToArray __CUDART_API_PTDS(cudaMemcpyArrayToArray) + #define cudaMemcpy2DArrayToArray __CUDART_API_PTDS(cudaMemcpy2DArrayToArray) + #define cudaMemcpy3D __CUDART_API_PTDS(cudaMemcpy3D) + #define cudaMemcpy3DPeer __CUDART_API_PTDS(cudaMemcpy3DPeer) + #define cudaMemset __CUDART_API_PTDS(cudaMemset) + #define cudaMemset2D __CUDART_API_PTDS(cudaMemset2D) + #define cudaMemset3D __CUDART_API_PTDS(cudaMemset3D) + #define cudaMemcpyAsync __CUDART_API_PTSZ(cudaMemcpyAsync) + #define cudaMemcpyToSymbolAsync __CUDART_API_PTSZ(cudaMemcpyToSymbolAsync) + #define cudaMemcpyFromSymbolAsync __CUDART_API_PTSZ(cudaMemcpyFromSymbolAsync) + #define cudaMemcpy2DAsync __CUDART_API_PTSZ(cudaMemcpy2DAsync) + #define cudaMemcpyToArrayAsync __CUDART_API_PTSZ(cudaMemcpyToArrayAsync) + #define cudaMemcpy2DToArrayAsync __CUDART_API_PTSZ(cudaMemcpy2DToArrayAsync) + #define cudaMemcpyFromArrayAsync __CUDART_API_PTSZ(cudaMemcpyFromArrayAsync) + #define cudaMemcpy2DFromArrayAsync __CUDART_API_PTSZ(cudaMemcpy2DFromArrayAsync) + #define cudaMemcpy3DAsync __CUDART_API_PTSZ(cudaMemcpy3DAsync) + #define cudaMemcpy3DPeerAsync 
__CUDART_API_PTSZ(cudaMemcpy3DPeerAsync) + #define cudaMemsetAsync __CUDART_API_PTSZ(cudaMemsetAsync) + #define cudaMemset2DAsync __CUDART_API_PTSZ(cudaMemset2DAsync) + #define cudaMemset3DAsync __CUDART_API_PTSZ(cudaMemset3DAsync) + #define cudaStreamQuery __CUDART_API_PTSZ(cudaStreamQuery) + #define cudaStreamGetFlags __CUDART_API_PTSZ(cudaStreamGetFlags) + #define cudaStreamGetPriority __CUDART_API_PTSZ(cudaStreamGetPriority) + #define cudaEventRecord __CUDART_API_PTSZ(cudaEventRecord) + #define cudaStreamWaitEvent __CUDART_API_PTSZ(cudaStreamWaitEvent) + #define cudaStreamAddCallback __CUDART_API_PTSZ(cudaStreamAddCallback) + #define cudaStreamAttachMemAsync __CUDART_API_PTSZ(cudaStreamAttachMemAsync) + #define cudaStreamSynchronize __CUDART_API_PTSZ(cudaStreamSynchronize) + #define cudaLaunch __CUDART_API_PTSZ(cudaLaunch) + #define cudaLaunchKernel __CUDART_API_PTSZ(cudaLaunchKernel) + #define cudaMemPrefetchAsync __CUDART_API_PTSZ(cudaMemPrefetchAsync) + #define cudaLaunchCooperativeKernel __CUDART_API_PTSZ(cudaLaunchCooperativeKernel) +#endif + +/** \cond impl_private */ +#if !defined(__dv) + +#if defined(__cplusplus) + +#define __dv(v) \ + = v + +#else /* __cplusplus */ + +#define __dv(v) + +#endif /* __cplusplus */ + +#endif /* !__dv */ +/** \endcond impl_private */ + +#if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)) /** Visible to SM>=3.5 and "__host__ __device__" only **/ + +#define CUDART_DEVICE __device__ + +#else + +#define CUDART_DEVICE + +#endif /** CUDART_DEVICE */ + +#if defined(__cplusplus) +extern "C" { +#endif /* __cplusplus */ + +/** + * \defgroup CUDART_DEVICE Device Management + * + * ___MANBRIEF___ device management functions of the CUDA runtime API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the device management functions of the CUDA runtime + * application programming interface. + * + * @{ + */ + +/** + * \brief Destroy all allocations and reset all state on the current device + * in the current process. + * + * Explicitly destroys and cleans up all resources associated with the current + * device in the current process. Any subsequent API call to this device will + * reinitialize the device. + * + * Note that this function will reset the device immediately. It is the caller's + * responsibility to ensure that the device is not being accessed by any + * other host threads from the process when this function is called. + * + * \return + * ::cudaSuccess + * \notefnerr + * + * \sa ::cudaDeviceSynchronize + */ +extern __host__ cudaError_t CUDARTAPI cudaDeviceReset(void); + +/** + * \brief Wait for compute device to finish + * + * Blocks until the device has completed all preceding requested tasks. + * ::cudaDeviceSynchronize() returns an error if one of the preceding tasks + * has failed. If the ::cudaDeviceScheduleBlockingSync flag was set for + * this device, the host thread will block until the device has finished + * its work. + * + * \return + * ::cudaSuccess + * \notefnerr + * + * \sa + * ::cudaDeviceReset, + * ::cuCtxSynchronize + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceSynchronize(void); + +/** + * \brief Set resource limits + * + * Setting \p limit to \p value is a request by the application to update + * the current limit maintained by the device. The driver is free to + * modify the requested value to meet h/w requirements (this could be + * clamping to minimum or maximum values, rounding up to nearest element + * size, etc). 
The application can use ::cudaDeviceGetLimit() to find out + * exactly what the limit has been set to. + * + * Setting each ::cudaLimit has its own specific restrictions, so each is + * discussed here. + * + * - ::cudaLimitStackSize controls the stack size in bytes of each GPU thread. + * + * - ::cudaLimitPrintfFifoSize controls the size in bytes of the shared FIFO + * used by the ::printf() and ::fprintf() device system calls. Setting + * ::cudaLimitPrintfFifoSize must not be performed after launching any kernel + * that uses the ::printf() or ::fprintf() device system calls - in such case + * ::cudaErrorInvalidValue will be returned. + * + * - ::cudaLimitMallocHeapSize controls the size in bytes of the heap used by + * the ::malloc() and ::free() device system calls. Setting + * ::cudaLimitMallocHeapSize must not be performed after launching any kernel + * that uses the ::malloc() or ::free() device system calls - in such case + * ::cudaErrorInvalidValue will be returned. + * + * - ::cudaLimitDevRuntimeSyncDepth controls the maximum nesting depth of a + * grid at which a thread can safely call ::cudaDeviceSynchronize(). Setting + * this limit must be performed before any launch of a kernel that uses the + * device runtime and calls ::cudaDeviceSynchronize() above the default sync + * depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail + * with error code ::cudaErrorSyncDepthExceeded if the limitation is + * violated. This limit can be set smaller than the default or up the maximum + * launch depth of 24. When setting this limit, keep in mind that additional + * levels of sync depth require the runtime to reserve large amounts of + * device memory which can no longer be used for user allocations. If these + * reservations of device memory fail, ::cudaDeviceSetLimit will return + * ::cudaErrorMemoryAllocation, and the limit can be reset to a lower value. + * This limit is only applicable to devices of compute capability 3.5 and + * higher. Attempting to set this limit on devices of compute capability less + * than 3.5 will result in the error ::cudaErrorUnsupportedLimit being + * returned. + * + * - ::cudaLimitDevRuntimePendingLaunchCount controls the maximum number of + * outstanding device runtime launches that can be made from the current + * device. A grid is outstanding from the point of launch up until the grid + * is known to have been completed. Device runtime launches which violate + * this limitation fail and return ::cudaErrorLaunchPendingCountExceeded when + * ::cudaGetLastError() is called after launch. If more pending launches than + * the default (2048 launches) are needed for a module using the device + * runtime, this limit can be increased. Keep in mind that being able to + * sustain additional pending launches will require the runtime to reserve + * larger amounts of device memory upfront which can no longer be used for + * allocations. If these reservations fail, ::cudaDeviceSetLimit will return + * ::cudaErrorMemoryAllocation, and the limit can be reset to a lower value. + * This limit is only applicable to devices of compute capability 3.5 and + * higher. Attempting to set this limit on devices of compute capability less + * than 3.5 will result in the error ::cudaErrorUnsupportedLimit being + * returned. 
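+ *
+ * As a minimal sketch, raising the device heap limit before launching a
+ * kernel that calls ::malloc() might look as follows (the 16 MB value is an
+ * arbitrary example, not a recommendation):
+ * \code
+   size_t newHeapSize = 16 * 1024 * 1024;
+   cudaDeviceSetLimit(cudaLimitMallocHeapSize, newHeapSize);
+
+   size_t actualHeapSize = 0;
+   cudaDeviceGetLimit(&actualHeapSize, cudaLimitMallocHeapSize);
+   \endcode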
+ *
+ * \param limit - Limit to set
+ * \param value - Size of limit
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorUnsupportedLimit,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaDeviceGetLimit,
+ * ::cuCtxSetLimit
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit, size_t value);
+
+/**
+ * \brief Returns resource limits
+ *
+ * Returns in \p *pValue the current size of \p limit. The supported
+ * ::cudaLimit values are:
+ * - ::cudaLimitStackSize: stack size in bytes of each GPU thread;
+ * - ::cudaLimitPrintfFifoSize: size in bytes of the shared FIFO used by the
+ *   ::printf() and ::fprintf() device system calls.
+ * - ::cudaLimitMallocHeapSize: size in bytes of the heap used by the
+ *   ::malloc() and ::free() device system calls;
+ * - ::cudaLimitDevRuntimeSyncDepth: maximum grid depth at which a
+ *   thread can issue the device runtime call ::cudaDeviceSynchronize()
+ *   to wait on child grid launches to complete.
+ * - ::cudaLimitDevRuntimePendingLaunchCount: maximum number of outstanding
+ *   device runtime launches.
+ *
+ * \param limit  - Limit to query
+ * \param pValue - Returned size of the limit
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorUnsupportedLimit,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaDeviceSetLimit,
+ * ::cuCtxGetLimit
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit);
+
+/**
+ * \brief Returns the preferred cache configuration for the current device.
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this returns through \p pCacheConfig the preferred cache
+ * configuration for the current device. This is only a preference. The
+ * runtime will use the requested configuration if possible, but it is free to
+ * choose a different configuration if required to execute functions.
+ *
+ * This will return a \p pCacheConfig of ::cudaFuncCachePreferNone on devices
+ * where the size of the L1 cache and shared memory are fixed.
+ *
+ * The supported cache configurations are:
+ * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
+ * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
+ * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
+ * - ::cudaFuncCachePreferEqual: prefer equal size L1 cache and shared memory
+ *
+ * \param pCacheConfig - Returned cache configuration
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInitializationError
+ * \notefnerr
+ *
+ * \sa ::cudaDeviceSetCacheConfig,
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
+ * ::cuCtxGetCacheConfig
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig);
+
+/**
+ * \brief Returns numerical values that correspond to the least and
+ * greatest stream priorities.
+ *
+ * Returns in \p *leastPriority and \p *greatestPriority the numerical values that correspond
+ * to the least and greatest stream priorities respectively. Stream priorities
+ * follow a convention where lower numbers imply greater priorities. The range of
+ * meaningful stream priorities is given by [\p *greatestPriority, \p *leastPriority].
+ * If the user attempts to create a stream with a priority value that is
+ * outside the meaningful range as specified by this API, the priority is
+ * automatically clamped down or up to either \p *leastPriority or \p *greatestPriority
+ * respectively. See ::cudaStreamCreateWithPriority for details on creating a
+ * priority stream.
+ * A NULL may be passed in for \p *leastPriority or \p *greatestPriority if the value
+ * is not desired.
+ *
+ * This function will return '0' in both \p *leastPriority and \p *greatestPriority if
+ * the current context's device does not support stream priorities
+ * (see ::cudaDeviceGetAttribute).
+ *
+ * \param leastPriority    - Pointer to an int in which the numerical value for least
+ * stream priority is returned
+ * \param greatestPriority - Pointer to an int in which the numerical value for greatest
+ * stream priority is returned
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInitializationError
+ * \notefnerr
+ *
+ * \sa ::cudaStreamCreateWithPriority,
+ * ::cudaStreamGetPriority,
+ * ::cuCtxGetStreamPriorityRange
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetStreamPriorityRange(int *leastPriority, int *greatestPriority);
+
+/**
+ * \brief Sets the preferred cache configuration for the current device.
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this sets through \p cacheConfig the preferred cache
+ * configuration for the current device. This is only a preference. The
+ * runtime will use the requested configuration if possible, but it is free to
+ * choose a different configuration if required to execute the function. Any
+ * function preference set via
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)"
+ * or
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)"
+ * will be preferred over this device-wide setting. Setting the device-wide
+ * cache configuration to ::cudaFuncCachePreferNone will cause subsequent
+ * kernel launches to prefer to not change the cache configuration unless
+ * required to launch the kernel.
+ *
+ * This setting does nothing on devices where the size of the L1 cache and
+ * shared memory are fixed.
+ *
+ * Launching a kernel with a different preference than the most recent
+ * preference setting may insert a device-side synchronization point.
+ *
+ * The supported cache configurations are:
+ * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
+ * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
+ * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
+ * - ::cudaFuncCachePreferEqual: prefer equal size L1 cache and shared memory
+ *
+ * \param cacheConfig - Requested cache configuration
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInitializationError
+ * \notefnerr
+ *
+ * \sa ::cudaDeviceGetCacheConfig,
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
+ * ::cuCtxSetCacheConfig
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceSetCacheConfig(enum cudaFuncCache cacheConfig);
+
+/**
+ * \brief Returns the shared memory configuration for the current device.
+ *
+ * This function will return in \p pConfig the current size of shared memory banks
+ * on the current device. 
On devices with configurable shared memory banks, + * ::cudaDeviceSetSharedMemConfig can be used to change this setting, so that all + * subsequent kernel launches will by default use the new bank size. When + * ::cudaDeviceGetSharedMemConfig is called on devices without configurable shared + * memory, it will return the fixed bank size of the hardware. + * + * The returned bank configurations can be either: + * - ::cudaSharedMemBankSizeFourByte - shared memory bank width is four bytes. + * - ::cudaSharedMemBankSizeEightByte - shared memory bank width is eight bytes. + * + * \param pConfig - Returned cache configuration + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInitializationError + * \notefnerr + * + * \sa ::cudaDeviceSetCacheConfig, + * ::cudaDeviceGetCacheConfig, + * ::cudaDeviceSetSharedMemConfig, + * ::cudaFuncSetCacheConfig, + * ::cuCtxGetSharedMemConfig + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig); + +/** + * \brief Sets the shared memory configuration for the current device. + * + * On devices with configurable shared memory banks, this function will set + * the shared memory bank size which is used for all subsequent kernel launches. + * Any per-function setting of shared memory set via ::cudaFuncSetSharedMemConfig + * will override the device wide setting. + * + * Changing the shared memory configuration between launches may introduce + * a device side synchronization point. + * + * Changing the shared memory bank size will not increase shared memory usage + * or affect occupancy of kernels, but may have major effects on performance. + * Larger bank sizes will allow for greater potential bandwidth to shared memory, + * but will change what kinds of accesses to shared memory will result in bank + * conflicts. + * + * This function will do nothing on devices with fixed shared memory bank size. + * + * The supported bank configurations are: + * - ::cudaSharedMemBankSizeDefault: set bank width the device default (currently, + * four bytes) + * - ::cudaSharedMemBankSizeFourByte: set shared memory bank width to be four bytes + * natively. + * - ::cudaSharedMemBankSizeEightByte: set shared memory bank width to be eight + * bytes natively. + * + * \param config - Requested cache configuration + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInitializationError + * \notefnerr + * + * \sa ::cudaDeviceSetCacheConfig, + * ::cudaDeviceGetCacheConfig, + * ::cudaDeviceGetSharedMemConfig, + * ::cudaFuncSetCacheConfig, + * ::cuCtxSetSharedMemConfig + */ +extern __host__ cudaError_t CUDARTAPI cudaDeviceSetSharedMemConfig(enum cudaSharedMemConfig config); + +/** + * \brief Returns a handle to a compute device + * + * Returns in \p *device a device ordinal given a PCI bus ID string. 
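+ *
+ * As a minimal sketch (the bus ID string below is an arbitrary example):
+ * \code
+   int device;
+   if (cudaDeviceGetByPCIBusId(&device, "0000:01:00.0") == cudaSuccess)
+       cudaSetDevice(device);
+   \endcode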
+ *
+ * \param device   - Returned device ordinal
+ *
+ * \param pciBusId - String in one of the following forms:
+ * [domain]:[bus]:[device].[function]
+ * [domain]:[bus]:[device]
+ * [bus]:[device].[function]
+ * where \p domain, \p bus, \p device, and \p function are all hexadecimal values
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaDeviceGetPCIBusId,
+ * ::cuDeviceGetByPCIBusId
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceGetByPCIBusId(int *device, const char *pciBusId);
+
+/**
+ * \brief Returns a PCI Bus Id string for the device
+ *
+ * Returns an ASCII string identifying the device \p device in the NULL-terminated
+ * string pointed to by \p pciBusId. \p len specifies the maximum length of the
+ * string that may be returned.
+ *
+ * \param pciBusId - Returned identifier string for the device in the following format
+ * [domain]:[bus]:[device].[function]
+ * where \p domain, \p bus, \p device, and \p function are all hexadecimal values.
+ * pciBusId should be large enough to store 13 characters including the NULL-terminator.
+ *
+ * \param len      - Maximum length of string to store in \p pciBusId
+ *
+ * \param device   - Device to get identifier string for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaDeviceGetByPCIBusId,
+ * ::cuDeviceGetPCIBusId
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceGetPCIBusId(char *pciBusId, int len, int device);
+
+/**
+ * \brief Gets an interprocess handle for a previously allocated event
+ *
+ * Takes as input a previously allocated event. This event must have been
+ * created with the ::cudaEventInterprocess and ::cudaEventDisableTiming
+ * flags set. This opaque handle may be copied into other processes and
+ * opened with ::cudaIpcOpenEventHandle to allow efficient hardware
+ * synchronization between GPU work in different processes.
+ *
+ * After the event has been opened in the importing process,
+ * ::cudaEventRecord, ::cudaEventSynchronize, ::cudaStreamWaitEvent and
+ * ::cudaEventQuery may be used in either process. Performing operations
+ * on the imported event after the exported event has been freed
+ * with ::cudaEventDestroy will result in undefined behavior.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux operating systems. IPC functionality is not supported
+ * on Tegra platforms.
+ *
+ * \param handle - Pointer to a user allocated cudaIpcEventHandle
+ *                    in which to return the opaque event handle
+ * \param event   - Event allocated with ::cudaEventInterprocess and
+ *                    ::cudaEventDisableTiming flags.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorMemoryAllocation,
+ * ::cudaErrorMapBufferObjectFailed,
+ * ::cudaErrorNotSupported
+ *
+ * \sa
+ * ::cudaEventCreate,
+ * ::cudaEventDestroy,
+ * ::cudaEventSynchronize,
+ * ::cudaEventQuery,
+ * ::cudaStreamWaitEvent,
+ * ::cudaIpcOpenEventHandle,
+ * ::cudaIpcGetMemHandle,
+ * ::cudaIpcOpenMemHandle,
+ * ::cudaIpcCloseMemHandle,
+ * ::cuIpcGetEventHandle
+ */
+extern __host__ cudaError_t CUDARTAPI cudaIpcGetEventHandle(cudaIpcEventHandle_t *handle, cudaEvent_t event);
+
+/**
+ * \brief Opens an interprocess event handle for use in the current process
+ *
+ * Opens an interprocess event handle exported from another process with
+ * ::cudaIpcGetEventHandle. 
This function returns a ::cudaEvent_t that behaves like + * a locally created event with the ::cudaEventDisableTiming flag specified. + * This event must be freed with ::cudaEventDestroy. + * + * Performing operations on the imported event after the exported event has + * been freed with ::cudaEventDestroy will result in undefined behavior. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux operating systems. IPC functionality is not supported + * on Tegra platforms. + * + * \param event - Returns the imported event + * \param handle - Interprocess handle to open + * + * \returns + * ::cudaSuccess, + * ::cudaErrorMapBufferObjectFailed, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorNotSupported + * + * \sa + * ::cudaEventCreate, + * ::cudaEventDestroy, + * ::cudaEventSynchronize, + * ::cudaEventQuery, + * ::cudaStreamWaitEvent, + * ::cudaIpcGetEventHandle, + * ::cudaIpcGetMemHandle, + * ::cudaIpcOpenMemHandle, + * ::cudaIpcCloseMemHandle, + * ::cuIpcOpenEventHandle + */ +extern __host__ cudaError_t CUDARTAPI cudaIpcOpenEventHandle(cudaEvent_t *event, cudaIpcEventHandle_t handle); + + +/** + * \brief Gets an interprocess memory handle for an existing device memory + * allocation + * + * Takes a pointer to the base of an existing device memory allocation created + * with ::cudaMalloc and exports it for use in another process. This is a + * lightweight operation and may be called multiple times on an allocation + * without adverse effects. + * + * If a region of memory is freed with ::cudaFree and a subsequent call + * to ::cudaMalloc returns memory with the same device address, + * ::cudaIpcGetMemHandle will return a unique handle for the + * new memory. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux operating systems. IPC functionality is not supported + * on Tegra platforms. + * + * \param handle - Pointer to user allocated ::cudaIpcMemHandle to return + * the handle in. + * \param devPtr - Base pointer to previously allocated device memory + * + * \returns + * ::cudaSuccess, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorMemoryAllocation, + * ::cudaErrorMapBufferObjectFailed, + * ::cudaErrorNotSupported + * + * \sa + * ::cudaMalloc, + * ::cudaFree, + * ::cudaIpcGetEventHandle, + * ::cudaIpcOpenEventHandle, + * ::cudaIpcOpenMemHandle, + * ::cudaIpcCloseMemHandle, + * ::cuIpcGetMemHandle + */ +extern __host__ cudaError_t CUDARTAPI cudaIpcGetMemHandle(cudaIpcMemHandle_t *handle, void *devPtr); + +/** + * \brief Opens an interprocess memory handle exported from another process + * and returns a device pointer usable in the local process. + * + * Maps memory exported from another process with ::cudaIpcGetMemHandle into + * the current device address space. For contexts on different devices + * ::cudaIpcOpenMemHandle can attempt to enable peer access between the + * devices as if the user called ::cudaDeviceEnablePeerAccess. This behavior is + * controlled by the ::cudaIpcMemLazyEnablePeerAccess flag. + * ::cudaDeviceCanAccessPeer can determine if a mapping is possible. + * + * Contexts that may open ::cudaIpcMemHandles are restricted in the following way. + * ::cudaIpcMemHandles from each device in a given process may only be opened + * by one context per device per other process. + * + * Memory returned from ::cudaIpcOpenMemHandle must be freed with + * ::cudaIpcCloseMemHandle. 
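+ *
+ * As a minimal sketch of the importing side, assuming \p handle has already
+ * been transported from the exporting process by some IPC mechanism:
+ * \code
+   void *devPtr;
+   cudaIpcOpenMemHandle(&devPtr, handle, cudaIpcMemLazyEnablePeerAccess);
+   /* ... use devPtr ... */
+   cudaIpcCloseMemHandle(devPtr);
+   \endcode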
+ *
+ * Calling ::cudaFree on an exported memory region before calling
+ * ::cudaIpcCloseMemHandle in the importing context will result in undefined
+ * behavior.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux operating systems. IPC functionality is not supported
+ * on Tegra platforms.
+ *
+ * \param devPtr - Returned device pointer
+ * \param handle - ::cudaIpcMemHandle to open
+ * \param flags  - Flags for this operation. Must be specified as ::cudaIpcMemLazyEnablePeerAccess
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorMapBufferObjectFailed,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorTooManyPeers,
+ * ::cudaErrorNotSupported
+ *
+ * \note No guarantees are made about the address returned in \p *devPtr.
+ * In particular, multiple processes may not receive the same address for the same \p handle.
+ *
+ * \sa
+ * ::cudaMalloc,
+ * ::cudaFree,
+ * ::cudaIpcGetEventHandle,
+ * ::cudaIpcOpenEventHandle,
+ * ::cudaIpcGetMemHandle,
+ * ::cudaIpcCloseMemHandle,
+ * ::cudaDeviceEnablePeerAccess,
+ * ::cudaDeviceCanAccessPeer,
+ * ::cuIpcOpenMemHandle
+ */
+extern __host__ cudaError_t CUDARTAPI cudaIpcOpenMemHandle(void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags);
+
+/**
+ * \brief Close memory mapped with cudaIpcOpenMemHandle
+ *
+ * Unmaps memory returned by ::cudaIpcOpenMemHandle. The original allocation
+ * in the exporting process as well as imported mappings in other processes
+ * will be unaffected.
+ *
+ * Any resources used to enable peer access will be freed if this is the
+ * last mapping using them.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux operating systems. IPC functionality is not supported
+ * on Tegra platforms.
+ *
+ * \param devPtr - Device pointer returned by ::cudaIpcOpenMemHandle
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorMapBufferObjectFailed,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorNotSupported
+ *
+ * \sa
+ * ::cudaMalloc,
+ * ::cudaFree,
+ * ::cudaIpcGetEventHandle,
+ * ::cudaIpcOpenEventHandle,
+ * ::cudaIpcGetMemHandle,
+ * ::cudaIpcOpenMemHandle,
+ * ::cuIpcCloseMemHandle
+ */
+extern __host__ cudaError_t CUDARTAPI cudaIpcCloseMemHandle(void *devPtr);
+
+/** @} */ /* END CUDART_DEVICE */
+
+/**
+ * \defgroup CUDART_THREAD_DEPRECATED Thread Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated thread management functions of the CUDA runtime
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes deprecated thread management functions of the CUDA runtime
+ * application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Exit and clean up from CUDA launches
+ *
+ * \deprecated
+ *
+ * Note that this function is deprecated because its name does not
+ * reflect its behavior. Its functionality is identical to the
+ * non-deprecated function ::cudaDeviceReset(), which should be used
+ * instead.
+ *
+ * Explicitly destroys and cleans up all resources associated with the current
+ * device in the current process. Any subsequent API call to this device will
+ * reinitialize the device.
+ *
+ * Note that this function will reset the device immediately. It is the caller's
+ * responsibility to ensure that the device is not being accessed by any
+ * other host threads from the process when this function is called. 
+ * + * \return + * ::cudaSuccess + * \notefnerr + * + * \sa ::cudaDeviceReset + */ +extern __host__ cudaError_t CUDARTAPI cudaThreadExit(void); + +/** + * \brief Wait for compute device to finish + * + * \deprecated + * + * Note that this function is deprecated because its name does not + * reflect its behavior. Its functionality is similar to the + * non-deprecated function ::cudaDeviceSynchronize(), which should be used + * instead. + * + * Blocks until the device has completed all preceding requested tasks. + * ::cudaThreadSynchronize() returns an error if one of the preceding tasks + * has failed. If the ::cudaDeviceScheduleBlockingSync flag was set for + * this device, the host thread will block until the device has finished + * its work. + * + * \return + * ::cudaSuccess + * \notefnerr + * + * \sa ::cudaDeviceSynchronize + */ +extern __host__ cudaError_t CUDARTAPI cudaThreadSynchronize(void); + +/** + * \brief Set resource limits + * + * \deprecated + * + * Note that this function is deprecated because its name does not + * reflect its behavior. Its functionality is identical to the + * non-deprecated function ::cudaDeviceSetLimit(), which should be used + * instead. + * + * Setting \p limit to \p value is a request by the application to update + * the current limit maintained by the device. The driver is free to + * modify the requested value to meet h/w requirements (this could be + * clamping to minimum or maximum values, rounding up to nearest element + * size, etc). The application can use ::cudaThreadGetLimit() to find out + * exactly what the limit has been set to. + * + * Setting each ::cudaLimit has its own specific restrictions, so each is + * discussed here. + * + * - ::cudaLimitStackSize controls the stack size of each GPU thread. + * + * - ::cudaLimitPrintfFifoSize controls the size of the shared FIFO + * used by the ::printf() and ::fprintf() device system calls. + * Setting ::cudaLimitPrintfFifoSize must be performed before + * launching any kernel that uses the ::printf() or ::fprintf() device + * system calls, otherwise ::cudaErrorInvalidValue will be returned. + * + * - ::cudaLimitMallocHeapSize controls the size of the heap used + * by the ::malloc() and ::free() device system calls. Setting + * ::cudaLimitMallocHeapSize must be performed before launching + * any kernel that uses the ::malloc() or ::free() device system calls, + * otherwise ::cudaErrorInvalidValue will be returned. + * + * \param limit - Limit to set + * \param value - Size in bytes of limit + * + * \return + * ::cudaSuccess, + * ::cudaErrorUnsupportedLimit, + * ::cudaErrorInvalidValue + * \notefnerr + * + * \sa ::cudaDeviceSetLimit + */ +extern __host__ cudaError_t CUDARTAPI cudaThreadSetLimit(enum cudaLimit limit, size_t value); + +/** + * \brief Returns resource limits + * + * \deprecated + * + * Note that this function is deprecated because its name does not + * reflect its behavior. Its functionality is identical to the + * non-deprecated function ::cudaDeviceGetLimit(), which should be used + * instead. + * + * Returns in \p *pValue the current size of \p limit. The supported + * ::cudaLimit values are: + * - ::cudaLimitStackSize: stack size of each GPU thread; + * - ::cudaLimitPrintfFifoSize: size of the shared FIFO used by the + * ::printf() and ::fprintf() device system calls. 
+ * - ::cudaLimitMallocHeapSize: size of the heap used by the + * ::malloc() and ::free() device system calls; + * + * \param limit - Limit to query + * \param pValue - Returned size in bytes of limit + * + * \return + * ::cudaSuccess, + * ::cudaErrorUnsupportedLimit, + * ::cudaErrorInvalidValue + * \notefnerr + * + * \sa ::cudaDeviceGetLimit + */ +extern __host__ cudaError_t CUDARTAPI cudaThreadGetLimit(size_t *pValue, enum cudaLimit limit); + +/** + * \brief Returns the preferred cache configuration for the current device. + * + * \deprecated + * + * Note that this function is deprecated because its name does not + * reflect its behavior. Its functionality is identical to the + * non-deprecated function ::cudaDeviceGetCacheConfig(), which should be + * used instead. + * + * On devices where the L1 cache and shared memory use the same hardware + * resources, this returns through \p pCacheConfig the preferred cache + * configuration for the current device. This is only a preference. The + * runtime will use the requested configuration if possible, but it is free to + * choose a different configuration if required to execute functions. + * + * This will return a \p pCacheConfig of ::cudaFuncCachePreferNone on devices + * where the size of the L1 cache and shared memory are fixed. + * + * The supported cache configurations are: + * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default) + * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache + * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory + * + * \param pCacheConfig - Returned cache configuration + * + * \return + * ::cudaSuccess, + * ::cudaErrorInitializationError + * \notefnerr + * + * \sa ::cudaDeviceGetCacheConfig + */ +extern __host__ cudaError_t CUDARTAPI cudaThreadGetCacheConfig(enum cudaFuncCache *pCacheConfig); + +/** + * \brief Sets the preferred cache configuration for the current device. + * + * \deprecated + * + * Note that this function is deprecated because its name does not + * reflect its behavior. Its functionality is identical to the + * non-deprecated function ::cudaDeviceSetCacheConfig(), which should be + * used instead. + * + * On devices where the L1 cache and shared memory use the same hardware + * resources, this sets through \p cacheConfig the preferred cache + * configuration for the current device. This is only a preference. The + * runtime will use the requested configuration if possible, but it is free to + * choose a different configuration if required to execute the function. Any + * function preference set via + * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)" + * or + * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)" + * will be preferred over this device-wide setting. Setting the device-wide + * cache configuration to ::cudaFuncCachePreferNone will cause subsequent + * kernel launches to prefer to not change the cache configuration unless + * required to launch the kernel. + * + * This setting does nothing on devices where the size of the L1 cache and + * shared memory are fixed. + * + * Launching a kernel with a different preference than the most recent + * preference setting may insert a device-side synchronization point. 
+ * + * The supported cache configurations are: + * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default) + * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache + * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory + * + * \param cacheConfig - Requested cache configuration + * + * \return + * ::cudaSuccess, + * ::cudaErrorInitializationError + * \notefnerr + * + * \sa ::cudaDeviceSetCacheConfig + */ +extern __host__ cudaError_t CUDARTAPI cudaThreadSetCacheConfig(enum cudaFuncCache cacheConfig); + +/** @} */ /* END CUDART_THREAD_DEPRECATED */ + +/** + * \defgroup CUDART_ERROR Error Handling + * + * ___MANBRIEF___ error handling functions of the CUDA runtime API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the error handling functions of the CUDA runtime + * application programming interface. + * + * @{ + */ + +/** + * \brief Returns the last error from a runtime call + * + * Returns the last error that has been produced by any of the runtime calls + * in the same host thread and resets it to ::cudaSuccess. + * + * \return + * ::cudaSuccess, + * ::cudaErrorMissingConfiguration, + * ::cudaErrorMemoryAllocation, + * ::cudaErrorInitializationError, + * ::cudaErrorLaunchFailure, + * ::cudaErrorLaunchTimeout, + * ::cudaErrorLaunchOutOfResources, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidConfiguration, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidPitchValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorUnmapBufferObjectFailed, + * ::cudaErrorInvalidDevicePointer, + * ::cudaErrorInvalidTexture, + * ::cudaErrorInvalidTextureBinding, + * ::cudaErrorInvalidChannelDescriptor, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorInvalidFilterSetting, + * ::cudaErrorInvalidNormSetting, + * ::cudaErrorUnknown, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorInsufficientDriver, + * ::cudaErrorSetOnActiveProcess, + * ::cudaErrorStartupFailure, + * ::cudaErrorInvalidPtx, + * ::cudaErrorNoKernelImageForDevice, + * ::cudaErrorJitCompilerNotFound + * \notefnerr + * + * \sa ::cudaPeekAtLastError, ::cudaGetErrorName, ::cudaGetErrorString, ::cudaError + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void); + +/** + * \brief Returns the last error from a runtime call + * + * Returns the last error that has been produced by any of the runtime calls + * in the same host thread. Note that this call does not reset the error to + * ::cudaSuccess like ::cudaGetLastError(). 
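+ *
+ * As a minimal sketch of checking a kernel launch without clearing the
+ * error state (the kernel \p k and its launch configuration are
+ * hypothetical):
+ * \code
+   k<<<1, 128>>>();
+   cudaError_t err = cudaPeekAtLastError();
+   if (err != cudaSuccess)
+       printf("launch failed: %s\n", cudaGetErrorString(err));
+   \endcode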
+ * + * \return + * ::cudaSuccess, + * ::cudaErrorMissingConfiguration, + * ::cudaErrorMemoryAllocation, + * ::cudaErrorInitializationError, + * ::cudaErrorLaunchFailure, + * ::cudaErrorLaunchTimeout, + * ::cudaErrorLaunchOutOfResources, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidConfiguration, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidPitchValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorUnmapBufferObjectFailed, + * ::cudaErrorInvalidDevicePointer, + * ::cudaErrorInvalidTexture, + * ::cudaErrorInvalidTextureBinding, + * ::cudaErrorInvalidChannelDescriptor, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorInvalidFilterSetting, + * ::cudaErrorInvalidNormSetting, + * ::cudaErrorUnknown, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorInsufficientDriver, + * ::cudaErrorSetOnActiveProcess, + * ::cudaErrorStartupFailure, + * ::cudaErrorInvalidPtx, + * ::cudaErrorNoKernelImageForDevice, + * ::cudaErrorJitCompilerNotFound + * \notefnerr + * + * \sa ::cudaGetLastError, ::cudaGetErrorName, ::cudaGetErrorString, ::cudaError + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void); + +/** + * \brief Returns the string representation of an error code enum name + * + * Returns a string containing the name of an error code in the enum. If the error + * code is not recognized, "unrecognized error code" is returned. + * + * \param error - Error code to convert to string + * + * \return + * \p char* pointer to a NULL-terminated string + * + * \sa ::cudaGetErrorString, ::cudaGetLastError, ::cudaPeekAtLastError, ::cudaError, + * ::cuGetErrorName + */ +extern __host__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error); + +/** + * \brief Returns the description string for an error code + * + * Returns the description string for an error code. If the error + * code is not recognized, "unrecognized error code" is returned. + * + * \param error - Error code to convert to string + * + * \return + * \p char* pointer to a NULL-terminated string + * + * \sa ::cudaGetErrorName, ::cudaGetLastError, ::cudaPeekAtLastError, ::cudaError, + * ::cuGetErrorString + */ +extern __host__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error); +/** @} */ /* END CUDART_ERROR */ + +/** + * \addtogroup CUDART_DEVICE + * + * @{ + */ + +/** + * \brief Returns the number of compute-capable devices + * + * Returns in \p *count the number of devices with compute capability greater + * or equal to 2.0 that are available for execution. If there is no such + * device then ::cudaGetDeviceCount() will return ::cudaErrorNoDevice. + * If no driver can be loaded to determine if any such devices exist then + * ::cudaGetDeviceCount() will return ::cudaErrorInsufficientDriver. + * + * \param count - Returns the number of devices with compute capability + * greater or equal to 2.0 + * + * \return + * ::cudaSuccess, + * ::cudaErrorNoDevice, + * ::cudaErrorInsufficientDriver + * \notefnerr + * + * \sa ::cudaGetDevice, ::cudaSetDevice, ::cudaGetDeviceProperties, + * ::cudaChooseDevice, + * ::cuDeviceGetCount + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count); + +/** + * \brief Returns information about the compute-device + * + * Returns in \p *prop the properties of device \p dev. 
The ::cudaDeviceProp
+ * structure is defined as:
+ * \code
+    struct cudaDeviceProp {
+        char name[256];
+        size_t totalGlobalMem;
+        size_t sharedMemPerBlock;
+        int regsPerBlock;
+        int warpSize;
+        size_t memPitch;
+        int maxThreadsPerBlock;
+        int maxThreadsDim[3];
+        int maxGridSize[3];
+        int clockRate;
+        size_t totalConstMem;
+        int major;
+        int minor;
+        size_t textureAlignment;
+        size_t texturePitchAlignment;
+        int deviceOverlap;
+        int multiProcessorCount;
+        int kernelExecTimeoutEnabled;
+        int integrated;
+        int canMapHostMemory;
+        int computeMode;
+        int maxTexture1D;
+        int maxTexture1DMipmap;
+        int maxTexture1DLinear;
+        int maxTexture2D[2];
+        int maxTexture2DMipmap[2];
+        int maxTexture2DLinear[3];
+        int maxTexture2DGather[2];
+        int maxTexture3D[3];
+        int maxTexture3DAlt[3];
+        int maxTextureCubemap;
+        int maxTexture1DLayered[2];
+        int maxTexture2DLayered[3];
+        int maxTextureCubemapLayered[2];
+        int maxSurface1D;
+        int maxSurface2D[2];
+        int maxSurface3D[3];
+        int maxSurface1DLayered[2];
+        int maxSurface2DLayered[3];
+        int maxSurfaceCubemap;
+        int maxSurfaceCubemapLayered[2];
+        size_t surfaceAlignment;
+        int concurrentKernels;
+        int ECCEnabled;
+        int pciBusID;
+        int pciDeviceID;
+        int pciDomainID;
+        int tccDriver;
+        int asyncEngineCount;
+        int unifiedAddressing;
+        int memoryClockRate;
+        int memoryBusWidth;
+        int l2CacheSize;
+        int maxThreadsPerMultiProcessor;
+        int streamPrioritiesSupported;
+        int globalL1CacheSupported;
+        int localL1CacheSupported;
+        size_t sharedMemPerMultiprocessor;
+        int regsPerMultiprocessor;
+        int managedMemory;
+        int isMultiGpuBoard;
+        int multiGpuBoardGroupID;
+        int singleToDoublePrecisionPerfRatio;
+        int pageableMemoryAccess;
+        int concurrentManagedAccess;
+        int computePreemptionSupported;
+        int canUseHostPointerForRegisteredMem;
+        int cooperativeLaunch;
+        int cooperativeMultiDeviceLaunch;
+    }
+ \endcode
+ * where:
+ * - \ref ::cudaDeviceProp::name "name[256]" is an ASCII string identifying
+ * the device;
+ * - \ref ::cudaDeviceProp::totalGlobalMem "totalGlobalMem" is the total
+ * amount of global memory available on the device in bytes;
+ * - \ref ::cudaDeviceProp::sharedMemPerBlock "sharedMemPerBlock" is the
+ * maximum amount of shared memory available to a thread block in bytes;
+ * - \ref ::cudaDeviceProp::regsPerBlock "regsPerBlock" is the maximum number
+ * of 32-bit registers available to a thread block;
+ * - \ref ::cudaDeviceProp::warpSize "warpSize" is the warp size in threads;
+ * - \ref ::cudaDeviceProp::memPitch "memPitch" is the maximum pitch in
+ * bytes allowed by the memory copy functions that involve memory regions
+ * allocated through ::cudaMallocPitch();
+ * - \ref ::cudaDeviceProp::maxThreadsPerBlock "maxThreadsPerBlock" is the
+ * maximum number of threads per block;
+ * - \ref ::cudaDeviceProp::maxThreadsDim "maxThreadsDim[3]" contains the
+ * maximum size of each dimension of a block;
+ * - \ref ::cudaDeviceProp::maxGridSize "maxGridSize[3]" contains the
+ * maximum size of each dimension of a grid;
+ * - \ref ::cudaDeviceProp::clockRate "clockRate" is the clock frequency in
+ * kilohertz;
+ * - \ref ::cudaDeviceProp::totalConstMem "totalConstMem" is the total amount
+ * of constant memory available on the device in bytes;
+ * - \ref ::cudaDeviceProp::major "major",
+ * \ref ::cudaDeviceProp::minor "minor" are the major and minor revision
+ * numbers defining the device's compute capability;
+ * - \ref ::cudaDeviceProp::textureAlignment "textureAlignment" is the
+ * alignment requirement; texture base addresses that are aligned to
+ * \ref ::cudaDeviceProp::textureAlignment "textureAlignment" bytes do not
+ * need an offset applied to texture fetches;
+ * - \ref ::cudaDeviceProp::texturePitchAlignment "texturePitchAlignment" is the
+ * pitch alignment requirement for 2D texture references that are bound to
+ * pitched memory;
+ * - \ref ::cudaDeviceProp::deviceOverlap "deviceOverlap" is 1 if the device
+ * can concurrently copy memory between host and device while executing a
+ * kernel, or 0 if not. Deprecated; use asyncEngineCount instead.
+ * - \ref ::cudaDeviceProp::multiProcessorCount "multiProcessorCount" is the
+ * number of multiprocessors on the device;
+ * - \ref ::cudaDeviceProp::kernelExecTimeoutEnabled "kernelExecTimeoutEnabled"
+ * is 1 if there is a run time limit for kernels executed on the device, or
+ * 0 if not.
+ * - \ref ::cudaDeviceProp::integrated "integrated" is 1 if the device is an
+ * integrated (motherboard) GPU and 0 if it is a discrete (card) component.
+ * - \ref ::cudaDeviceProp::canMapHostMemory "canMapHostMemory" is 1 if the
+ * device can map host memory into the CUDA address space for use with
+ * ::cudaHostAlloc()/::cudaHostGetDevicePointer(), or 0 if not;
+ * - \ref ::cudaDeviceProp::computeMode "computeMode" is the compute mode
+ * that the device is currently in. Available modes are as follows:
+ *   - ::cudaComputeModeDefault: Default mode - Device is not restricted and
+ *     multiple threads can use ::cudaSetDevice() with this device.
+ *   - ::cudaComputeModeExclusive: Compute-exclusive mode - Only one thread will
+ *     be able to use ::cudaSetDevice() with this device.
+ *   - ::cudaComputeModeProhibited: Compute-prohibited mode - No threads can use
+ *     ::cudaSetDevice() with this device.
+ *   - ::cudaComputeModeExclusiveProcess: Compute-exclusive-process mode - Many
+ *     threads in one process will be able to use ::cudaSetDevice() with this device.
+ * If ::cudaSetDevice() is called on an already occupied \p device with
+ * computeMode ::cudaComputeModeExclusive, ::cudaErrorDeviceAlreadyInUse
+ * will be immediately returned indicating the device cannot be used.
+ * When an occupied exclusive mode device is chosen with ::cudaSetDevice,
+ * all subsequent non-device management runtime functions will return
+ * ::cudaErrorDevicesUnavailable.
+ * - \ref ::cudaDeviceProp::maxTexture1D "maxTexture1D" is the maximum 1D
+ * texture size.
+ * - \ref ::cudaDeviceProp::maxTexture1DMipmap "maxTexture1DMipmap" is the maximum
+ * 1D mipmapped texture size.
+ * - \ref ::cudaDeviceProp::maxTexture1DLinear "maxTexture1DLinear" is the maximum
+ * 1D texture size for textures bound to linear memory.
+ * - \ref ::cudaDeviceProp::maxTexture2D "maxTexture2D[2]" contains the maximum
+ * 2D texture dimensions.
+ * - \ref ::cudaDeviceProp::maxTexture2DMipmap "maxTexture2DMipmap[2]" contains the
+ * maximum 2D mipmapped texture dimensions.
+ * - \ref ::cudaDeviceProp::maxTexture2DLinear "maxTexture2DLinear[3]" contains the
+ * maximum 2D texture dimensions for 2D textures bound to pitch linear memory.
+ * - \ref ::cudaDeviceProp::maxTexture2DGather "maxTexture2DGather[2]" contains the
+ * maximum 2D texture dimensions if texture gather operations have to be performed.
+ * - \ref ::cudaDeviceProp::maxTexture3D "maxTexture3D[3]" contains the maximum
+ * 3D texture dimensions.
+ * - \ref ::cudaDeviceProp::maxTexture3DAlt "maxTexture3DAlt[3]"
+ * contains the maximum alternate 3D texture dimensions.
+ * - \ref ::cudaDeviceProp::maxTextureCubemap "maxTextureCubemap" is the
+ * maximum cubemap texture width or height.
+ * - \ref ::cudaDeviceProp::maxTexture1DLayered "maxTexture1DLayered[2]" contains
+ * the maximum 1D layered texture dimensions.
+ * - \ref ::cudaDeviceProp::maxTexture2DLayered "maxTexture2DLayered[3]" contains
+ * the maximum 2D layered texture dimensions.
+ * - \ref ::cudaDeviceProp::maxTextureCubemapLayered "maxTextureCubemapLayered[2]"
+ * contains the maximum cubemap layered texture dimensions.
+ * - \ref ::cudaDeviceProp::maxSurface1D "maxSurface1D" is the maximum 1D
+ * surface size.
+ * - \ref ::cudaDeviceProp::maxSurface2D "maxSurface2D[2]" contains the maximum
+ * 2D surface dimensions.
+ * - \ref ::cudaDeviceProp::maxSurface3D "maxSurface3D[3]" contains the maximum
+ * 3D surface dimensions.
+ * - \ref ::cudaDeviceProp::maxSurface1DLayered "maxSurface1DLayered[2]" contains
+ * the maximum 1D layered surface dimensions.
+ * - \ref ::cudaDeviceProp::maxSurface2DLayered "maxSurface2DLayered[3]" contains
+ * the maximum 2D layered surface dimensions.
+ * - \ref ::cudaDeviceProp::maxSurfaceCubemap "maxSurfaceCubemap" is the maximum
+ * cubemap surface width or height.
+ * - \ref ::cudaDeviceProp::maxSurfaceCubemapLayered "maxSurfaceCubemapLayered[2]"
+ * contains the maximum cubemap layered surface dimensions.
+ * - \ref ::cudaDeviceProp::surfaceAlignment "surfaceAlignment" specifies the
+ * alignment requirements for surfaces.
+ * - \ref ::cudaDeviceProp::concurrentKernels "concurrentKernels" is 1 if the
+ * device supports executing multiple kernels within the same context
+ * simultaneously, or 0 if not. It is not guaranteed that multiple kernels
+ * will be resident on the device concurrently so this feature should not be
+ * relied upon for correctness;
+ * - \ref ::cudaDeviceProp::ECCEnabled "ECCEnabled" is 1 if the device has ECC
+ * support turned on, or 0 if not.
+ * - \ref ::cudaDeviceProp::pciBusID "pciBusID" is the PCI bus identifier of + * the device. + * - \ref ::cudaDeviceProp::pciDeviceID "pciDeviceID" is the PCI device + * (sometimes called slot) identifier of the device. + * - \ref ::cudaDeviceProp::pciDomainID "pciDomainID" is the PCI domain identifier + * of the device. + * - \ref ::cudaDeviceProp::tccDriver "tccDriver" is 1 if the device is using a + * TCC driver or 0 if not. + * - \ref ::cudaDeviceProp::asyncEngineCount "asyncEngineCount" is 1 when the + * device can concurrently copy memory between host and device while executing + * a kernel. It is 2 when the device can concurrently copy memory between host + * and device in both directions and execute a kernel at the same time. It is + * 0 if neither of these is supported. + * - \ref ::cudaDeviceProp::unifiedAddressing "unifiedAddressing" is 1 if the device + * shares a unified address space with the host and 0 otherwise. + * - \ref ::cudaDeviceProp::memoryClockRate "memoryClockRate" is the peak memory + * clock frequency in kilohertz. + * - \ref ::cudaDeviceProp::memoryBusWidth "memoryBusWidth" is the memory bus width + * in bits. + * - \ref ::cudaDeviceProp::l2CacheSize "l2CacheSize" is L2 cache size in bytes. + * - \ref ::cudaDeviceProp::maxThreadsPerMultiProcessor "maxThreadsPerMultiProcessor" + * is the number of maximum resident threads per multiprocessor. + * - \ref ::cudaDeviceProp::streamPrioritiesSupported "streamPrioritiesSupported" + * is 1 if the device supports stream priorities, or 0 if it is not supported. + * - \ref ::cudaDeviceProp::globalL1CacheSupported "globalL1CacheSupported" + * is 1 if the device supports caching of globals in L1 cache, or 0 if it is not supported. + * - \ref ::cudaDeviceProp::localL1CacheSupported "localL1CacheSupported" + * is 1 if the device supports caching of locals in L1 cache, or 0 if it is not supported. + * - \ref ::cudaDeviceProp::sharedMemPerMultiprocessor "sharedMemPerMultiprocessor" is the + * maximum amount of shared memory available to a multiprocessor in bytes; this amount is + * shared by all thread blocks simultaneously resident on a multiprocessor; + * - \ref ::cudaDeviceProp::regsPerMultiprocessor "regsPerMultiprocessor" is the maximum number + * of 32-bit registers available to a multiprocessor; this number is shared + * by all thread blocks simultaneously resident on a multiprocessor; + * - \ref ::cudaDeviceProp::managedMemory "managedMemory" + * is 1 if the device supports allocating managed memory on this system, or 0 if it is not supported. + * - \ref ::cudaDeviceProp::isMultiGpuBoard "isMultiGpuBoard" + * is 1 if the device is on a multi-GPU board (e.g. Gemini cards), and 0 if not; + * - \ref ::cudaDeviceProp::multiGpuBoardGroupID "multiGpuBoardGroupID" is a unique identifier + * for a group of devices associated with the same board. + * Devices on the same multi-GPU board will share the same identifier; + * - \ref ::cudaDeviceProp::singleToDoublePrecisionPerfRatio "singleToDoublePrecisionPerfRatio" + * is the ratio of single precision performance (in floating-point operations per second) + * to double precision performance. + * - \ref ::cudaDeviceProp::pageableMemoryAccess "pageableMemoryAccess" is 1 if the device supports + * coherently accessing pageable memory without calling cudaHostRegister on it, and 0 otherwise. + * - \ref ::cudaDeviceProp::concurrentManagedAccess "concurrentManagedAccess" is 1 if the device can + * coherently access managed memory concurrently with the CPU, and 0 otherwise. 
+ * - \ref ::cudaDeviceProp::computePreemptionSupported "computePreemptionSupported" is 1 if the device + * supports Compute Preemption, and 0 otherwise. + * - \ref ::cudaDeviceProp::canUseHostPointerForRegisteredMem "canUseHostPointerForRegisteredMem" is 1 if + * the device can access host registered memory at the same virtual address as the CPU, and 0 otherwise. + * - \ref ::cudaDeviceProp::cooperativeLaunch "cooperativeLaunch" is 1 if the device supports launching + * cooperative kernels via ::cudaLaunchCooperativeKernel, and 0 otherwise. + * - \ref ::cudaDeviceProp::cooperativeMultiDeviceLaunch "cooperativeMultiDeviceLaunch" is 1 if the device + * supports launching cooperative kernels via ::cudaLaunchCooperativeKernelMultiDevice, and 0 otherwise. + * + * \param prop - Properties for the specified device + * \param device - Device number to get properties for + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice + * + * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice, ::cudaChooseDevice, + * ::cudaDeviceGetAttribute, + * ::cuDeviceGetAttribute, + * ::cuDeviceGetName + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device); + +/** + * \brief Returns information about the device + * + * Returns in \p *value the integer value of the attribute \p attr on device + * \p device. The supported attributes are: + * - ::cudaDevAttrMaxThreadsPerBlock: Maximum number of threads per block; + * - ::cudaDevAttrMaxBlockDimX: Maximum x-dimension of a block; + * - ::cudaDevAttrMaxBlockDimY: Maximum y-dimension of a block; + * - ::cudaDevAttrMaxBlockDimZ: Maximum z-dimension of a block; + * - ::cudaDevAttrMaxGridDimX: Maximum x-dimension of a grid; + * - ::cudaDevAttrMaxGridDimY: Maximum y-dimension of a grid; + * - ::cudaDevAttrMaxGridDimZ: Maximum z-dimension of a grid; + * - ::cudaDevAttrMaxSharedMemoryPerBlock: Maximum amount of shared memory + * available to a thread block in bytes; + * - ::cudaDevAttrTotalConstantMemory: Memory available on device for + * __constant__ variables in a CUDA C kernel in bytes; + * - ::cudaDevAttrWarpSize: Warp size in threads; + * - ::cudaDevAttrMaxPitch: Maximum pitch in bytes allowed by the memory copy + * functions that involve memory regions allocated through ::cudaMallocPitch(); + * - ::cudaDevAttrMaxTexture1DWidth: Maximum 1D texture width; + * - ::cudaDevAttrMaxTexture1DLinearWidth: Maximum width for a 1D texture bound + * to linear memory; + * - ::cudaDevAttrMaxTexture1DMipmappedWidth: Maximum mipmapped 1D texture width; + * - ::cudaDevAttrMaxTexture2DWidth: Maximum 2D texture width; + * - ::cudaDevAttrMaxTexture2DHeight: Maximum 2D texture height; + * - ::cudaDevAttrMaxTexture2DLinearWidth: Maximum width for a 2D texture + * bound to linear memory; + * - ::cudaDevAttrMaxTexture2DLinearHeight: Maximum height for a 2D texture + * bound to linear memory; + * - ::cudaDevAttrMaxTexture2DLinearPitch: Maximum pitch in bytes for a 2D + * texture bound to linear memory; + * - ::cudaDevAttrMaxTexture2DMipmappedWidth: Maximum mipmapped 2D texture + * width; + * - ::cudaDevAttrMaxTexture2DMipmappedHeight: Maximum mipmapped 2D texture + * height; + * - ::cudaDevAttrMaxTexture3DWidth: Maximum 3D texture width; + * - ::cudaDevAttrMaxTexture3DHeight: Maximum 3D texture height; + * - ::cudaDevAttrMaxTexture3DDepth: Maximum 3D texture depth; + * - ::cudaDevAttrMaxTexture3DWidthAlt: Alternate maximum 3D texture width, + * 0 if no alternate maximum 3D texture size is supported; + * - 
::cudaDevAttrMaxTexture3DHeightAlt: Alternate maximum 3D texture height, + * 0 if no alternate maximum 3D texture size is supported; + * - ::cudaDevAttrMaxTexture3DDepthAlt: Alternate maximum 3D texture depth, + * 0 if no alternate maximum 3D texture size is supported; + * - ::cudaDevAttrMaxTextureCubemapWidth: Maximum cubemap texture width or + * height; + * - ::cudaDevAttrMaxTexture1DLayeredWidth: Maximum 1D layered texture width; + * - ::cudaDevAttrMaxTexture1DLayeredLayers: Maximum layers in a 1D layered + * texture; + * - ::cudaDevAttrMaxTexture2DLayeredWidth: Maximum 2D layered texture width; + * - ::cudaDevAttrMaxTexture2DLayeredHeight: Maximum 2D layered texture height; + * - ::cudaDevAttrMaxTexture2DLayeredLayers: Maximum layers in a 2D layered + * texture; + * - ::cudaDevAttrMaxTextureCubemapLayeredWidth: Maximum cubemap layered + * texture width or height; + * - ::cudaDevAttrMaxTextureCubemapLayeredLayers: Maximum layers in a cubemap + * layered texture; + * - ::cudaDevAttrMaxSurface1DWidth: Maximum 1D surface width; + * - ::cudaDevAttrMaxSurface2DWidth: Maximum 2D surface width; + * - ::cudaDevAttrMaxSurface2DHeight: Maximum 2D surface height; + * - ::cudaDevAttrMaxSurface3DWidth: Maximum 3D surface width; + * - ::cudaDevAttrMaxSurface3DHeight: Maximum 3D surface height; + * - ::cudaDevAttrMaxSurface3DDepth: Maximum 3D surface depth; + * - ::cudaDevAttrMaxSurface1DLayeredWidth: Maximum 1D layered surface width; + * - ::cudaDevAttrMaxSurface1DLayeredLayers: Maximum layers in a 1D layered + * surface; + * - ::cudaDevAttrMaxSurface2DLayeredWidth: Maximum 2D layered surface width; + * - ::cudaDevAttrMaxSurface2DLayeredHeight: Maximum 2D layered surface height; + * - ::cudaDevAttrMaxSurface2DLayeredLayers: Maximum layers in a 2D layered + * surface; + * - ::cudaDevAttrMaxSurfaceCubemapWidth: Maximum cubemap surface width; + * - ::cudaDevAttrMaxSurfaceCubemapLayeredWidth: Maximum cubemap layered + * surface width; + * - ::cudaDevAttrMaxSurfaceCubemapLayeredLayers: Maximum layers in a cubemap + * layered surface; + * - ::cudaDevAttrMaxRegistersPerBlock: Maximum number of 32-bit registers + * available to a thread block; + * - ::cudaDevAttrClockRate: Peak clock frequency in kilohertz; + * - ::cudaDevAttrTextureAlignment: Alignment requirement; texture base + * addresses aligned to ::textureAlign bytes do not need an offset applied + * to texture fetches; + * - ::cudaDevAttrTexturePitchAlignment: Pitch alignment requirement for 2D + * texture references bound to pitched memory; + * - ::cudaDevAttrGpuOverlap: 1 if the device can concurrently copy memory + * between host and device while executing a kernel, or 0 if not; + * - ::cudaDevAttrMultiProcessorCount: Number of multiprocessors on the device; + * - ::cudaDevAttrKernelExecTimeout: 1 if there is a run time limit for kernels + * executed on the device, or 0 if not; + * - ::cudaDevAttrIntegrated: 1 if the device is integrated with the memory + * subsystem, or 0 if not; + * - ::cudaDevAttrCanMapHostMemory: 1 if the device can map host memory into + * the CUDA address space, or 0 if not; + * - ::cudaDevAttrComputeMode: Compute mode is the compute mode that the device + * is currently in. Available modes are as follows: + * - ::cudaComputeModeDefault: Default mode - Device is not restricted and + * multiple threads can use ::cudaSetDevice() with this device. + * - ::cudaComputeModeExclusive: Compute-exclusive mode - Only one thread will + * be able to use ::cudaSetDevice() with this device. 
+ * - ::cudaComputeModeProhibited: Compute-prohibited mode - No threads can use
+ * ::cudaSetDevice() with this device.
+ * - ::cudaComputeModeExclusiveProcess: Compute-exclusive-process mode - Many
+ * threads in one process will be able to use ::cudaSetDevice() with this
+ * device.
+ * - ::cudaDevAttrConcurrentKernels: 1 if the device supports executing
+ * multiple kernels within the same context simultaneously, or 0 if
+ * not. It is not guaranteed that multiple kernels will be resident on the
+ * device concurrently so this feature should not be relied upon for
+ * correctness;
+ * - ::cudaDevAttrEccEnabled: 1 if error correction is enabled on the device,
+ * 0 if error correction is disabled or not supported by the device;
+ * - ::cudaDevAttrPciBusId: PCI bus identifier of the device;
+ * - ::cudaDevAttrPciDeviceId: PCI device (also known as slot) identifier of
+ * the device;
+ * - ::cudaDevAttrTccDriver: 1 if the device is using a TCC driver. TCC is only
+ * available on Tesla hardware running Windows Vista or later;
+ * - ::cudaDevAttrMemoryClockRate: Peak memory clock frequency in kilohertz;
+ * - ::cudaDevAttrGlobalMemoryBusWidth: Global memory bus width in bits;
+ * - ::cudaDevAttrL2CacheSize: Size of L2 cache in bytes. 0 if the device
+ * doesn't have L2 cache;
+ * - ::cudaDevAttrMaxThreadsPerMultiProcessor: Maximum resident threads per
+ * multiprocessor;
+ * - ::cudaDevAttrUnifiedAddressing: 1 if the device shares a unified address
+ * space with the host, or 0 if not;
+ * - ::cudaDevAttrComputeCapabilityMajor: Major compute capability version
+ * number;
+ * - ::cudaDevAttrComputeCapabilityMinor: Minor compute capability version
+ * number;
+ * - ::cudaDevAttrStreamPrioritiesSupported: 1 if the device supports stream
+ * priorities, or 0 if not;
+ * - ::cudaDevAttrGlobalL1CacheSupported: 1 if device supports caching globals
+ * in L1 cache, 0 if not;
+ * - ::cudaDevAttrLocalL1CacheSupported: 1 if device supports caching locals
+ * in L1 cache, 0 if not;
+ * - ::cudaDevAttrMaxSharedMemoryPerMultiprocessor: Maximum amount of shared memory
+ * available to a multiprocessor in bytes; this amount is shared by all
+ * thread blocks simultaneously resident on a multiprocessor;
+ * - ::cudaDevAttrMaxRegistersPerMultiprocessor: Maximum number of 32-bit registers
+ * available to a multiprocessor; this number is shared by all thread blocks
+ * simultaneously resident on a multiprocessor;
+ * - ::cudaDevAttrManagedMemory: 1 if device supports allocating
+ * managed memory, 0 if not;
+ * - ::cudaDevAttrIsMultiGpuBoard: 1 if device is on a multi-GPU board, 0 if not;
+ * - ::cudaDevAttrMultiGpuBoardGroupID: Unique identifier for a group of devices on the
+ * same multi-GPU board;
+ * - ::cudaDevAttrHostNativeAtomicSupported: 1 if the link between the device and the
+ * host supports native atomic operations;
+ * - ::cudaDevAttrSingleToDoublePrecisionPerfRatio: Ratio of single precision performance
+ * (in floating-point operations per second) to double precision performance;
+ * - ::cudaDevAttrPageableMemoryAccess: 1 if the device supports coherently accessing
+ * pageable memory without calling cudaHostRegister on it, and 0 otherwise.
+ * - ::cudaDevAttrConcurrentManagedAccess: 1 if the device can coherently access managed
+ * memory concurrently with the CPU, and 0 otherwise.
+ * - ::cudaDevAttrComputePreemptionSupported: 1 if the device supports
+ * Compute Preemption, 0 if not.
+ * - ::cudaDevAttrCanUseHostPointerForRegisteredMem: 1 if the device can access host
+ * registered memory at the same virtual address as the CPU, and 0 otherwise.
+ * - ::cudaDevAttrCooperativeLaunch: 1 if the device supports launching cooperative kernels
+ * via ::cudaLaunchCooperativeKernel, and 0 otherwise.
+ * - ::cudaDevAttrCooperativeMultiDeviceLaunch: 1 if the device supports launching cooperative
+ * kernels via ::cudaLaunchCooperativeKernelMultiDevice, and 0 otherwise.
+ *
+ * \param value  - Returned device attribute value
+ * \param attr   - Device attribute to query
+ * \param device - Device number to query
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice, ::cudaChooseDevice,
+ * ::cudaGetDeviceProperties,
+ * ::cuDeviceGetAttribute
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device);
+
+/**
+ * \brief Queries attributes of the link between two devices.
+ *
+ * Returns in \p *value the value of the requested attribute \p attr of the
+ * link between \p srcDevice and \p dstDevice. The supported attributes are:
+ * - ::cudaDevP2PAttrPerformanceRank: A relative value indicating the
+ *   performance of the link between two devices. Lower value means better
+ *   performance (0 being the value used for most performant link).
+ * - ::cudaDevP2PAttrAccessSupported: 1 if peer access is enabled.
+ * - ::cudaDevP2PAttrNativeAtomicSupported: 1 if native atomic operations over
+ *   the link are supported.
+ *
+ * Returns ::cudaErrorInvalidDevice if \p srcDevice or \p dstDevice are not valid
+ * or if they represent the same device.
+ *
+ * Returns ::cudaErrorInvalidValue if \p attr is not valid or if \p value is
+ * a null pointer.
+ *
+ * \param value     - Returned value of the requested attribute
+ * \param attr      - The requested attribute of the link between \p srcDevice and \p dstDevice.
+ * \param srcDevice - The source device of the target link.
+ * \param dstDevice - The destination device of the target link.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \sa ::cudaDeviceEnablePeerAccess,
+ * ::cudaDeviceDisablePeerAccess,
+ * ::cudaDeviceCanAccessPeer,
+ * ::cuDeviceGetP2PAttribute
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr, int srcDevice, int dstDevice);
+
+/**
+ * \brief Select compute-device which best matches criteria
+ *
+ * Returns in \p *device the device which has properties that best match
+ * \p *prop.
+ *
+ * \param device - Device with best match
+ * \param prop   - Desired device properties
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice,
+ * ::cudaGetDeviceProperties
+ */
+extern __host__ cudaError_t CUDARTAPI cudaChooseDevice(int *device, const struct cudaDeviceProp *prop);
+
+/**
+ * \brief Set device to be used for GPU executions
+ *
+ * Sets \p device as the current device for the calling host thread.
+ * Valid device IDs are 0 to (::cudaGetDeviceCount() - 1).
+ *
+ * Any device memory subsequently allocated from this host thread
+ * using ::cudaMalloc(), ::cudaMallocPitch() or ::cudaMallocArray()
+ * will be physically resident on \p device.
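+ *
+ * As a purely illustrative device-selection sketch using the calls above
+ * (the compute-capability threshold is an arbitrary example):
+ * \code
+    int count = 0;
+    cudaGetDeviceCount(&count);
+    for (int d = 0; d < count; ++d) {
+        struct cudaDeviceProp prop;
+        cudaGetDeviceProperties(&prop, d);
+        if (prop.major >= 6) {        // pick the first cc >= 6.0 device
+            cudaSetDevice(d);
+            break;
+        }
+    }
+ \endcode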
+ * Any host memory allocated from this host thread using ::cudaMallocHost(),
+ * ::cudaHostAlloc() or ::cudaHostRegister() will have its lifetime associated with
+ * \p device. Any streams or events created from this host thread will
+ * be associated with \p device. Any kernels launched from this host
+ * thread using the <<<>>> operator or ::cudaLaunchKernel() will be executed
+ * on \p device.
+ *
+ * This call may be made from any host thread, to any device, and at
+ * any time. This function performs no synchronization with the previous
+ * or new device, and should be considered a very low overhead call.
+ *
+ * \param device - Device on which the active host thread should execute the
+ * device code.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorDeviceAlreadyInUse
+ * \notefnerr
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaGetDeviceProperties,
+ * ::cudaChooseDevice,
+ * ::cuCtxSetCurrent
+ */
+extern __host__ cudaError_t CUDARTAPI cudaSetDevice(int device);
+
+/**
+ * \brief Returns which device is currently being used
+ *
+ * Returns in \p *device the current device for the calling host thread.
+ *
+ * \param device - Returns the device on which the active host thread
+ * executes the device code.
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaSetDevice, ::cudaGetDeviceProperties,
+ * ::cudaChooseDevice,
+ * ::cuCtxGetCurrent
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device);
+
+/**
+ * \brief Set a list of devices that can be used for CUDA
+ *
+ * Sets a list of devices for CUDA execution in priority order using
+ * \p device_arr. The parameter \p len specifies the number of elements in the
+ * list. CUDA will try devices from the list sequentially until it finds one
+ * that works. If this function is not called, or if it is called with a \p len
+ * of 0, then CUDA will go back to its default behavior of trying devices
+ * sequentially from a default list containing all of the available CUDA
+ * devices in the system. If a specified device ID in the list does not exist,
+ * this function will return ::cudaErrorInvalidDevice. If \p len is not 0 and
+ * \p device_arr is NULL or if \p len exceeds the number of devices in
+ * the system, then ::cudaErrorInvalidValue is returned.
+ *
+ * \param device_arr - List of devices to try
+ * \param len        - Number of devices in specified list
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaSetDevice, ::cudaGetDeviceProperties,
+ * ::cudaSetDeviceFlags,
+ * ::cudaChooseDevice
+ */
+extern __host__ cudaError_t CUDARTAPI cudaSetValidDevices(int *device_arr, int len);
+
+/**
+ * \brief Sets flags to be used for device executions
+ *
+ * Records \p flags as the flags to use when initializing the current
+ * device. If no device has been made current to the calling thread,
+ * then \p flags will be applied to the initialization of any device
+ * initialized by the calling host thread, unless that device has had
+ * its initialization flags set explicitly by this or any host thread.
+ *
+ * If the current device has been set and that device has already been
+ * initialized then this call will fail with the error
+ * ::cudaErrorSetOnActiveProcess. In this case it is necessary
+ * to reset \p device using ::cudaDeviceReset() before the device's
+ * initialization flags may be set.
+ * + * The two LSBs of the \p flags parameter can be used to control how the CPU + * thread interacts with the OS scheduler when waiting for results from the + * device. + * + * - ::cudaDeviceScheduleAuto: The default value if the \p flags parameter is + * zero, uses a heuristic based on the number of active CUDA contexts in the + * process \p C and the number of logical processors in the system \p P. If + * \p C \> \p P, then CUDA will yield to other OS threads when waiting for the + * device, otherwise CUDA will not yield while waiting for results and + * actively spin on the processor. + * - ::cudaDeviceScheduleSpin: Instruct CUDA to actively spin when waiting for + * results from the device. This can decrease latency when waiting for the + * device, but may lower the performance of CPU threads if they are performing + * work in parallel with the CUDA thread. + * - ::cudaDeviceScheduleYield: Instruct CUDA to yield its thread when waiting + * for results from the device. This can increase latency when waiting for the + * device, but can increase the performance of CPU threads performing work in + * parallel with the device. + * - ::cudaDeviceScheduleBlockingSync: Instruct CUDA to block the CPU thread + * on a synchronization primitive when waiting for the device to finish work. + * - ::cudaDeviceBlockingSync: Instruct CUDA to block the CPU thread on a + * synchronization primitive when waiting for the device to finish work.
+ * \ref deprecated "Deprecated:" This flag was deprecated as of CUDA 4.0 and
+ * replaced with ::cudaDeviceScheduleBlockingSync.
+ * - ::cudaDeviceMapHost: This flag enables allocating pinned
+ * host memory that is accessible to the device. It is implicit for the
+ * runtime but may be absent if a context is created using the driver API.
+ * If this flag is not set, ::cudaHostGetDevicePointer() will always return
+ * a failure code.
+ * - ::cudaDeviceLmemResizeToMax: Instruct CUDA to not reduce local memory
+ * after resizing local memory for a kernel. This can prevent thrashing by
+ * local memory allocations when launching many kernels with high local
+ * memory usage at the cost of potentially increased memory usage.
+ *
+ * \param flags - Parameters for device operation
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorSetOnActiveProcess
+ *
+ * \sa ::cudaGetDeviceFlags, ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaGetDeviceProperties,
+ * ::cudaSetDevice, ::cudaSetValidDevices,
+ * ::cudaChooseDevice,
+ * ::cuDevicePrimaryCtxSetFlags
+ */
+extern __host__ cudaError_t CUDARTAPI cudaSetDeviceFlags( unsigned int flags );
+
+/**
+ * \brief Gets the flags for the current device
+ *
+ * Returns in \p flags the flags for the current device. If there is a
+ * current device for the calling thread, and the device has been initialized
+ * or flags have been set on that device specifically, the flags for the
+ * device are returned. If there is no current device, but flags have been
+ * set for the thread with ::cudaSetDeviceFlags, the thread flags are returned.
+ * Finally, if there is no current device and no thread flags, the flags for
+ * the first device are returned, which may be the default flags. Compare
+ * to the behavior of ::cudaSetDeviceFlags.
+ *
+ * Typically, the flags returned should match the behavior that will be seen
+ * if the calling thread uses a device after this call, without any change to
+ * the flags or current device in between by this or another thread. Note that
+ * if the device is not initialized, it is possible for another thread to
+ * change the flags for the current device before it is initialized.
+ * Additionally, when using exclusive mode, if this thread has not requested a
+ * specific device, it may use a device other than the first device, contrary
+ * to the assumption made by this function.
+ *
+ * If a context has been created via the driver API and is current to the
+ * calling thread, the flags for that context are always returned.
+ *
+ * Flags returned by this function may specifically include ::cudaDeviceMapHost
+ * even though it is not accepted by ::cudaSetDeviceFlags because it is
+ * implicit in runtime API flags. The reason for this is that the current
+ * context may have been created via the driver API in which case the flag is
+ * not implicit and may be unset.
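+ *
+ * A minimal sketch of the set/get round trip described above (the flag
+ * choice is illustrative only):
+ * \code
+    unsigned int flags = 0;
+    cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); // before first use of the device
+    cudaGetDeviceFlags(&flags);  // may also report the implicit cudaDeviceMapHost
+ \endcode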
+ * + * \param flags - Pointer to store the device flags + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice + * + * \sa ::cudaGetDevice, ::cudaGetDeviceProperties, + * ::cudaSetDevice, ::cudaSetDeviceFlags, + * ::cuCtxGetFlags, + * ::cuDevicePrimaryCtxGetState + */ +extern __host__ cudaError_t CUDARTAPI cudaGetDeviceFlags( unsigned int *flags ); +/** @} */ /* END CUDART_DEVICE */ + +/** + * \defgroup CUDART_STREAM Stream Management + * + * ___MANBRIEF___ stream management functions of the CUDA runtime API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the stream management functions of the CUDA runtime + * application programming interface. + * + * @{ + */ + +/** + * \brief Create an asynchronous stream + * + * Creates a new asynchronous stream. + * + * \param pStream - Pointer to new stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \notefnerr + * + * \sa ::cudaStreamCreateWithPriority, + * ::cudaStreamCreateWithFlags, + * ::cudaStreamGetPriority, + * ::cudaStreamGetFlags, + * ::cudaStreamQuery, + * ::cudaStreamSynchronize, + * ::cudaStreamWaitEvent, + * ::cudaStreamAddCallback, + * ::cudaStreamDestroy, + * ::cuStreamCreate + */ +extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream); + +/** + * \brief Create an asynchronous stream + * + * Creates a new asynchronous stream. The \p flags argument determines the + * behaviors of the stream. Valid values for \p flags are + * - ::cudaStreamDefault: Default stream creation flag. + * - ::cudaStreamNonBlocking: Specifies that work running in the created + * stream may run concurrently with work in stream 0 (the NULL stream), and that + * the created stream should perform no implicit synchronization with stream 0. + * + * \param pStream - Pointer to new stream identifier + * \param flags - Parameters for stream creation + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \notefnerr + * + * \sa ::cudaStreamCreate, + * ::cudaStreamCreateWithPriority, + * ::cudaStreamGetFlags, + * ::cudaStreamQuery, + * ::cudaStreamSynchronize, + * ::cudaStreamWaitEvent, + * ::cudaStreamAddCallback, + * ::cudaStreamDestroy, + * ::cuStreamCreate + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags); + +/** + * \brief Create an asynchronous stream with the specified priority + * + * Creates a stream with the specified priority and returns a handle in \p pStream. + * This API alters the scheduler priority of work in the stream. Work in a higher + * priority stream may preempt work already executing in a low priority stream. + * + * \p priority follows a convention where lower numbers represent higher priorities. + * '0' represents default priority. The range of meaningful numerical priorities can + * be queried using ::cudaDeviceGetStreamPriorityRange. If the specified priority is + * outside the numerical range returned by ::cudaDeviceGetStreamPriorityRange, + * it will automatically be clamped to the lowest or the highest number in the range. + * + * \param pStream - Pointer to new stream identifier + * \param flags - Flags for stream creation. See ::cudaStreamCreateWithFlags for a list of valid flags that can be passed + * \param priority - Priority of the stream. Lower numbers represent higher priorities. + * See ::cudaDeviceGetStreamPriorityRange for more information about + * the meaningful stream priorities that can be passed. 
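+ *
+ * For illustration, a minimal sketch that queries the meaningful range and
+ * creates a stream at the greatest priority (the flag choice is illustrative):
+ * \code
+    int least = 0, greatest = 0;
+    cudaDeviceGetStreamPriorityRange(&least, &greatest); // greatest is numerically lowest
+    cudaStream_t stream;
+    cudaStreamCreateWithPriority(&stream, cudaStreamNonBlocking, greatest);
+ \endcode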
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \note Stream priorities are supported only on GPUs
+ * with compute capability 3.5 or higher.
+ *
+ * \note In the current implementation, only compute kernels launched in
+ * priority streams are affected by the stream's priority. Stream priorities have
+ * no effect on host-to-device and device-to-host memory operations.
+ *
+ * \sa ::cudaStreamCreate,
+ * ::cudaStreamCreateWithFlags,
+ * ::cudaDeviceGetStreamPriorityRange,
+ * ::cudaStreamGetPriority,
+ * ::cudaStreamQuery,
+ * ::cudaStreamWaitEvent,
+ * ::cudaStreamAddCallback,
+ * ::cudaStreamSynchronize,
+ * ::cudaStreamDestroy,
+ * ::cuStreamCreateWithPriority
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags, int priority);
+
+/**
+ * \brief Query the priority of a stream
+ *
+ * Query the priority of a stream. The priority is returned in \p priority.
+ * Note that if the stream was created with a priority outside the meaningful
+ * numerical range returned by ::cudaDeviceGetStreamPriorityRange,
+ * this function returns the clamped priority.
+ * See ::cudaStreamCreateWithPriority for details about priority clamping.
+ *
+ * \param hStream  - Handle to the stream to be queried
+ * \param priority - Pointer to a signed integer in which the stream's priority is returned
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ *
+ * \sa ::cudaStreamCreateWithPriority,
+ * ::cudaDeviceGetStreamPriorityRange,
+ * ::cudaStreamGetFlags,
+ * ::cuStreamGetPriority
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetPriority(cudaStream_t hStream, int *priority);
+
+/**
+ * \brief Query the flags of a stream
+ *
+ * Query the flags of a stream. The flags are returned in \p flags.
+ * See ::cudaStreamCreateWithFlags for a list of valid flags.
+ *
+ * \param hStream - Handle to the stream to be queried
+ * \param flags   - Pointer to an unsigned integer in which the stream's flags are returned
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cudaStreamCreateWithPriority,
+ * ::cudaStreamCreateWithFlags,
+ * ::cudaStreamGetPriority,
+ * ::cuStreamGetFlags
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags);
+
+/**
+ * \brief Destroys and cleans up an asynchronous stream
+ *
+ * Destroys and cleans up the asynchronous stream specified by \p stream.
+ *
+ * In case the device is still doing work in the stream \p stream
+ * when ::cudaStreamDestroy() is called, the function will return immediately
+ * and the resources associated with \p stream will be released automatically
+ * once the device has completed all work in \p stream.
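+ *
+ * For illustration, a minimal create/use/destroy sketch (\c dst, \c src and
+ * \c nbytes are assumed to exist):
+ * \code
+    cudaStream_t stream;
+    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
+    cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyHostToDevice, stream);
+    cudaStreamDestroy(stream);  // returns at once; release happens after the copy drains
+ \endcode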
+ *
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cudaStreamCreate,
+ * ::cudaStreamCreateWithFlags,
+ * ::cudaStreamQuery,
+ * ::cudaStreamWaitEvent,
+ * ::cudaStreamSynchronize,
+ * ::cudaStreamAddCallback,
+ * ::cuStreamDestroy
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream);
+
+/**
+ * \brief Make a compute stream wait on an event
+ *
+ * Makes all future work submitted to \p stream wait until \p event reports
+ * completion before beginning execution. This synchronization will be
+ * performed efficiently on the device. The event \p event may
+ * be from a different context than \p stream, in which case this function
+ * will perform cross-device synchronization.
+ *
+ * The stream \p stream will wait only for the completion of the most recent
+ * host call to ::cudaEventRecord() on \p event. Once this call has returned,
+ * any functions (including ::cudaEventRecord() and ::cudaEventDestroy()) may be
+ * called on \p event again, and the subsequent calls will not have any effect
+ * on \p stream.
+ *
+ * If ::cudaEventRecord() has not been called on \p event, this call acts as if
+ * the record has already completed, and so is a functional no-op.
+ *
+ * \param stream - Stream to wait
+ * \param event  - Event to wait on
+ * \param flags  - Parameters for the operation (must be 0)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamQuery, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy,
+ * ::cuStreamWaitEvent
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
+
+#ifdef _WIN32
+#define CUDART_CB __stdcall
+#else
+#define CUDART_CB
+#endif
+
+/**
+ * Type of stream callback functions.
+ * \param stream The stream as passed to ::cudaStreamAddCallback, may be NULL.
+ * \param status ::cudaSuccess or any persistent error on the stream.
+ * \param userData User parameter provided at registration.
+ */
+typedef void (CUDART_CB *cudaStreamCallback_t)(cudaStream_t stream, cudaError_t status, void *userData);
+
+/**
+ * \brief Add a callback to a compute stream
+ *
+ * Adds a callback to be called on the host after all currently enqueued
+ * items in the stream have completed. For each
+ * cudaStreamAddCallback call, a callback will be executed exactly once.
+ * The callback will block later work in the stream until it is finished.
+ *
+ * The callback may be passed ::cudaSuccess or an error code. In the event
+ * of a device error, all subsequently executed callbacks will receive an
+ * appropriate ::cudaError_t.
+ *
+ * Callbacks must not make any CUDA API calls. Attempting to use CUDA APIs
+ * will result in ::cudaErrorNotPermitted. Callbacks must not perform any
+ * synchronization that may depend on outstanding device work or other callbacks
+ * that are not mandated to run earlier. Callbacks without a mandated order
+ * (in independent streams) execute in undefined order and may be serialized.
+ *
+ * For the purposes of Unified Memory, callback execution makes a number of
+ * guarantees (see also the sketch after this list):
+ * - The callback stream is considered idle for the duration of the
+ *   callback. Thus, for example, a callback may always use memory attached
+ *   to the callback stream.
+ * - The start of execution of a callback has the same effect as
+ *   synchronizing an event recorded in the same stream immediately prior to
+ *   the callback. It thus synchronizes streams which have been "joined"
+ *   prior to the callback.
+ * - Adding device work to any stream does not have the effect of making
+ *   the stream active until all preceding callbacks have executed. Thus, for
+ *   example, a callback might use global attached memory even if work has
+ *   been added to another stream, if it has been properly ordered with an
+ *   event.
+ * - Completion of a callback does not cause a stream to become
+ *   active except as described above. The callback stream will remain idle
+ *   if no device work follows the callback, and will remain idle across
+ *   consecutive callbacks without device work in between. Thus, for example,
+ *   stream synchronization can be done by signaling from a callback at the
+ *   end of the stream.
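+ *
+ * For illustration, a minimal callback sketch (the callback name and message
+ * are arbitrary; note that the callback itself makes no CUDA API calls):
+ * \code
+    void CUDART_CB myCallback(cudaStream_t stream, cudaError_t status, void *userData)
+    {
+        printf("stream reached callback, status = %d\n", (int)status);
+    }
+    ...
+    cudaStreamAddCallback(stream, myCallback, NULL, 0);
+ \endcode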
+ *
+ * \param stream   - Stream to add callback to
+ * \param callback - The function to call once preceding stream operations are complete
+ * \param userData - User specified data to be passed to the callback function
+ * \param flags    - Reserved for future use, must be 0
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorNotSupported
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamQuery, ::cudaStreamSynchronize, ::cudaStreamWaitEvent, ::cudaStreamDestroy, ::cudaMallocManaged, ::cudaStreamAttachMemAsync,
+ * ::cuStreamAddCallback
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamAddCallback(cudaStream_t stream,
+        cudaStreamCallback_t callback, void *userData, unsigned int flags);
+
+/**
+ * \brief Waits for stream tasks to complete
+ *
+ * Blocks until \p stream has completed all operations. If the
+ * ::cudaDeviceScheduleBlockingSync flag was set for this device,
+ * the host thread will block until the stream is finished with
+ * all of its tasks.
+ *
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamQuery, ::cudaStreamWaitEvent, ::cudaStreamAddCallback, ::cudaStreamDestroy,
+ * ::cuStreamSynchronize
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamSynchronize(cudaStream_t stream);
+
+/**
+ * \brief Queries an asynchronous stream for completion status
+ *
+ * Returns ::cudaSuccess if all operations in \p stream have
+ * completed, or ::cudaErrorNotReady if not.
+ *
+ * For the purposes of Unified Memory, a return value of ::cudaSuccess
+ * is equivalent to having called ::cudaStreamSynchronize().
+ *
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotReady,
+ * ::cudaErrorInvalidResourceHandle
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy,
+ * ::cuStreamQuery
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream);
+
+/**
+ * \brief Attach memory to a stream asynchronously
+ *
+ * Enqueues an operation in \p stream to specify stream association of
+ * \p length bytes of memory starting from \p devPtr. This function is a
+ * stream-ordered operation, meaning that it is dependent on, and will
+ * only take effect when, previous work in \p stream has completed. Any
+ * previous association is automatically replaced.
+ *
+ * \p devPtr must point to an address within managed memory space declared
+ * using the __managed__ keyword or allocated with ::cudaMallocManaged.
+ *
+ * \p length must be zero, to indicate that the entire allocation's
+ * stream association is being changed. Currently, it's not possible
+ * to change stream association for a portion of an allocation. The default
+ * value for \p length is zero.
+ *
+ * The stream association is specified using \p flags which must be
+ * one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle.
+ * The default value for \p flags is ::cudaMemAttachSingle.
+ * If the ::cudaMemAttachGlobal flag is specified, the memory can be accessed
+ * by any stream on any device.
+ * If the ::cudaMemAttachHost flag is specified, the program makes a guarantee
+ * that it won't access the memory on the device from any stream on a device that
+ * has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ * If the ::cudaMemAttachSingle flag is specified and \p stream is associated with
+ * a device that has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess,
+ * the program makes a guarantee that it will only access the memory on the device
+ * from \p stream. It is illegal to attach singly to the NULL stream, because the
+ * NULL stream is a virtual global stream and not a specific stream. An error will
+ * be returned in this case.
+ *
+ * When memory is associated with a single stream, the Unified Memory system will
+ * allow CPU access to this memory region so long as all operations in \p stream
+ * have completed, regardless of whether other streams are active. In effect,
+ * this constrains exclusive ownership of the managed memory region by
+ * an active GPU to per-stream activity instead of whole-GPU activity.
+ *
+ * Accessing memory on the device from streams that are not associated with
+ * it will produce undefined results. No error checking is performed by the
+ * Unified Memory system to ensure that kernels launched into other streams
+ * do not access this region.
+ *
+ * It is a program's responsibility to order calls to ::cudaStreamAttachMemAsync
+ * via events, synchronization or other means to ensure legal access to memory
+ * at all times. Data visibility and coherency will be changed appropriately
+ * for all kernels which follow a stream-association change.
+ *
+ * If \p stream is destroyed while data is associated with it, the association is
+ * removed and the association reverts to the default visibility of the allocation
+ * as specified at ::cudaMallocManaged. For __managed__ variables, the default
+ * association is always ::cudaMemAttachGlobal. Note that destroying a stream is an
+ * asynchronous operation, and as a result, the change to default association won't
+ * happen until all work in the stream has completed.
+ *
+ * \param stream - Stream in which to enqueue the attach operation
+ * \param devPtr - Pointer to memory (must be a pointer to managed memory)
+ * \param length - Length of memory (must be zero, defaults to zero)
+ * \param flags  - Must be one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle (defaults to ::cudaMemAttachSingle)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotReady,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ *
+ * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy, ::cudaMallocManaged,
+ * ::cuStreamAttachMemAsync
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length __dv(0), unsigned int flags __dv(cudaMemAttachSingle));
+
+/** @} */ /* END CUDART_STREAM */
+
+/**
+ * \defgroup CUDART_EVENT Event Management
+ *
+ * ___MANBRIEF___ event management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the event management functions of the CUDA runtime
+ * application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates an event object
+ *
+ * Creates an event object using ::cudaEventDefault.
+ *
+ * \param event - Newly created event
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*, unsigned int) "cudaEventCreate (C++ API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventRecord, ::cudaEventQuery,
+ * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
+ * ::cudaStreamWaitEvent,
+ * ::cuEventCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event);
+
+/**
+ * \brief Creates an event object with the specified flags
+ *
+ * Creates an event object with the specified flags. Valid flags include:
+ * - ::cudaEventDefault: Default event creation flag.
+ * - ::cudaEventBlockingSync: Specifies that event should use blocking
+ * synchronization. A host thread that uses ::cudaEventSynchronize() to wait
+ * on an event created with this flag will block until the event actually
+ * completes.
+ * - ::cudaEventDisableTiming: Specifies that the created event does not need
+ * to record timing data. Events created with this flag specified and
+ * the ::cudaEventBlockingSync flag not specified will provide the best
+ * performance when used with ::cudaStreamWaitEvent() and ::cudaEventQuery().
+ * - ::cudaEventInterprocess: Specifies that the created event may be used as an
+ * interprocess event by ::cudaIpcGetEventHandle(). ::cudaEventInterprocess must
+ * be specified along with ::cudaEventDisableTiming.
+ *
+ * \param event - Newly created event
+ * \param flags - Flags for new event
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
+ * ::cudaStreamWaitEvent,
+ * ::cuEventCreate
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);
+
+/**
+ * \brief Records an event
+ *
+ * Records an event. See note about NULL stream behavior. Since this operation
+ * is asynchronous, ::cudaEventQuery() or ::cudaEventSynchronize() must
+ * be used to determine when the event has actually been recorded.
+ *
+ * If ::cudaEventRecord() has previously been called on \p event, then this
+ * call will overwrite any existing state in \p event. Any subsequent calls
+ * which examine the status of \p event will only examine the completion of
+ * this most recent call to ::cudaEventRecord().
+ * + * \param event - Event to record + * \param stream - Stream in which to record event + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInitializationError, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorLaunchFailure + * \note_null_stream + * \notefnerr + * + * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)", + * ::cudaEventCreateWithFlags, ::cudaEventQuery, + * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime, + * ::cudaStreamWaitEvent, + * ::cuEventRecord + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0)); + +/** + * \brief Queries an event's status + * + * Query the status of all device work preceding the most recent call to + * ::cudaEventRecord() (in the appropriate compute streams, as specified by the + * arguments to ::cudaEventRecord()). + * + * If this work has successfully been completed by the device, or if + * ::cudaEventRecord() has not been called on \p event, then ::cudaSuccess is + * returned. If this work has not yet been completed by the device then + * ::cudaErrorNotReady is returned. + * + * For the purposes of Unified Memory, a return value of ::cudaSuccess + * is equivalent to having called ::cudaEventSynchronize(). + * + * \param event - Event to query + * + * \return + * ::cudaSuccess, + * ::cudaErrorNotReady, + * ::cudaErrorInitializationError, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorLaunchFailure + * \notefnerr + * + * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)", + * ::cudaEventCreateWithFlags, ::cudaEventRecord, + * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime, + * ::cuEventQuery + */ +extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event); + +/** + * \brief Waits for an event to complete + * + * Wait until the completion of all device work preceding the most recent + * call to ::cudaEventRecord() (in the appropriate compute streams, as specified + * by the arguments to ::cudaEventRecord()). + * + * If ::cudaEventRecord() has not been called on \p event, ::cudaSuccess is + * returned immediately. + * + * Waiting for an event that was created with the ::cudaEventBlockingSync + * flag will cause the calling CPU thread to block until the event has + * been completed by the device. If the ::cudaEventBlockingSync flag has + * not been set, then the CPU thread will busy-wait until the event has + * been completed by the device. + * + * \param event - Event to wait for + * + * \return + * ::cudaSuccess, + * ::cudaErrorInitializationError, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorLaunchFailure + * \notefnerr + * + * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)", + * ::cudaEventCreateWithFlags, ::cudaEventRecord, + * ::cudaEventQuery, ::cudaEventDestroy, ::cudaEventElapsedTime, + * ::cuEventSynchronize + */ +extern __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event); + +/** + * \brief Destroys an event object + * + * Destroys the event specified by \p event. + * + * In case \p event has been recorded but has not yet been completed + * when ::cudaEventDestroy() is called, the function will return immediately and + * the resources associated with \p event will be released automatically once + * the device has completed \p event. 
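+ *
+ * For illustration, a typical create/record/synchronize/destroy timing
+ * sequence (\c stream and the intervening work are assumed):
+ * \code
+    cudaEvent_t start, stop;
+    float ms = 0.0f;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+    cudaEventRecord(start, stream);
+    /* ... asynchronous work issued to stream ... */
+    cudaEventRecord(stop, stream);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&ms, start, stop);
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+ \endcode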
+ *
+ * \param event - Event to destroy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorLaunchFailure
+ * \notefnerr
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventQuery,
+ * ::cudaEventSynchronize, ::cudaEventRecord, ::cudaEventElapsedTime,
+ * ::cuEventDestroy
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event);
+
+/**
+ * \brief Computes the elapsed time between events
+ *
+ * Computes the elapsed time between two events (in milliseconds with a
+ * resolution of around 0.5 microseconds).
+ *
+ * If either event was last recorded in a non-NULL stream, the resulting time
+ * may be greater than expected (even if both used the same stream handle). This
+ * happens because the ::cudaEventRecord() operation takes place asynchronously
+ * and there is no guarantee that the measured latency is actually just between
+ * the two events. Any number of other stream operations could execute
+ * in between the two measured events, thus altering the timing in a significant
+ * way.
+ *
+ * If ::cudaEventRecord() has not been called on either event, then
+ * ::cudaErrorInvalidResourceHandle is returned. If ::cudaEventRecord() has been
+ * called on both events but one or both of them has not yet been completed
+ * (that is, ::cudaEventQuery() would return ::cudaErrorNotReady on at least one
+ * of the events), ::cudaErrorNotReady is returned. If either event was created
+ * with the ::cudaEventDisableTiming flag, then this function will return
+ * ::cudaErrorInvalidResourceHandle.
+ *
+ * \param ms - Time between \p start and \p end in ms
+ * \param start - Starting event
+ * \param end - Ending event
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotReady,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorLaunchFailure
+ * \notefnerr
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventQuery,
+ * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventRecord,
+ * ::cuEventElapsedTime
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms, cudaEvent_t start, cudaEvent_t end);
+
+/** @} */ /* END CUDART_EVENT */
+
+/**
+ * \defgroup CUDART_EXECUTION Execution Control
+ *
+ * ___MANBRIEF___ execution control functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the execution control functions of the CUDA runtime
+ * application programming interface.
+ *
+ * Some functions have overloaded C++ API template versions documented separately in the
+ * \ref CUDART_HIGHLEVEL "C++ API Routines" module.
+ *
+ * @{
+ */
+
+/**
+ * \brief Launches a device function
+ *
+ * The function invokes the kernel \p func on a \p gridDim (\p gridDim.x × \p gridDim.y
+ * × \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x ×
+ * \p blockDim.y × \p blockDim.z) threads.
+ *
+ * If the kernel has N parameters, \p args should point to an array of N pointers.
+ * Each pointer, from args[0] to args[N - 1], points to the region
+ * of memory from which the actual parameter will be copied.
+ *
+ * For templated functions, pass the function symbol as follows:
+ * func_name<template_arg_0,...,template_arg_N>
+ *
+ * \p sharedMem sets the amount of dynamic shared memory that will be available to
+ * each thread block.
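+ *
+ * As an illustrative sketch (using a hypothetical kernel \c vecScale and
+ * omitting error checking), the arguments are marshalled as follows:
+ * \code
+ // __global__ void vecScale(float *v, int n);
+ float *d_v;                    // device pointer obtained from cudaMalloc()
+ int n = 1024;
+ void *args[] = { &d_v, &n };   // one pointer per kernel parameter
+ cudaLaunchKernel((const void*)vecScale, dim3(n / 256), dim3(256), args, 0, 0);
+ * \endcode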
+ *
+ * \p stream specifies the stream to which the invocation is associated.
+ *
+ * \param func - Device function symbol
+ * \param gridDim - Grid dimensions
+ * \param blockDim - Block dimensions
+ * \param args - Arguments
+ * \param sharedMem - Shared memory
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorSharedObjectInitFailed,
+ * ::cudaErrorInvalidPtx,
+ * ::cudaErrorNoKernelImageForDevice,
+ * ::cudaErrorJitCompilerNotFound
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa
+ * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)",
+ * ::cuLaunchKernel
+ */
+extern __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream);
+
+/**
+ * \brief Launches a device function where thread blocks can cooperate and synchronize as they execute
+ *
+ * The function invokes the kernel \p func on a \p gridDim (\p gridDim.x × \p gridDim.y
+ * × \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x ×
+ * \p blockDim.y × \p blockDim.z) threads.
+ *
+ * The device on which this kernel is invoked must have a non-zero value for
+ * the device attribute ::cudaDevAttrCooperativeLaunch.
+ *
+ * The total number of blocks launched cannot exceed the maximum number of blocks per
+ * multiprocessor as returned by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor (or
+ * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors
+ * as specified by the device attribute ::cudaDevAttrMultiProcessorCount.
+ *
+ * The kernel cannot make use of CUDA dynamic parallelism.
+ *
+ * If the kernel has N parameters, \p args should point to an array of N pointers.
+ * Each pointer, from args[0] to args[N - 1], points to the region
+ * of memory from which the actual parameter will be copied.
+ *
+ * For templated functions, pass the function symbol as follows:
+ * func_name<template_arg_0,...,template_arg_N>
+ *
+ * \p sharedMem sets the amount of dynamic shared memory that will be available to
+ * each thread block.
+ *
+ * \p stream specifies the stream to which the invocation is associated.
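+ *
+ * A possible way to stay within the block-count limit described above
+ * (a sketch with a hypothetical kernel \c coopKernel; error checking omitted):
+ * \code
+ // __global__ void coopKernel(int *data);
+ int *d_data;               // device pointer obtained from cudaMalloc()
+ int numBlocksPerSm = 0, numSms = 0;
+ cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm, (const void*)coopKernel, 256, 0);
+ cudaDeviceGetAttribute(&numSms, cudaDevAttrMultiProcessorCount, 0);
+ void *args[] = { &d_data };
+ cudaLaunchCooperativeKernel((const void*)coopKernel, dim3(numBlocksPerSm * numSms), dim3(256), args, 0, 0);
+ * \endcode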
+ *
+ * \param func - Device function symbol
+ * \param gridDim - Grid dimensions
+ * \param blockDim - Block dimensions
+ * \param args - Arguments
+ * \param sharedMem - Shared memory
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorCooperativeLaunchTooLarge,
+ * ::cudaErrorSharedObjectInitFailed
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa
+ * \ref ::cudaLaunchCooperativeKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchCooperativeKernel (C++ API)",
+ * ::cudaLaunchCooperativeKernelMultiDevice,
+ * ::cuLaunchCooperativeKernel
+ */
+extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream);
+
+/**
+ * \brief Launches device functions on multiple devices where thread blocks can cooperate and synchronize as they execute
+ *
+ * Invokes kernels as specified in the \p launchParamsList array where each element
+ * of the array specifies all the parameters required to perform a single kernel launch.
+ * These kernels can cooperate and synchronize as they execute. The size of the array is
+ * specified by \p numDevices.
+ *
+ * No two kernels can be launched on the same device. All the devices targeted by this
+ * multi-device launch must be identical. All devices must have a non-zero value for the
+ * device attribute ::cudaDevAttrCooperativeLaunch.
+ *
+ * The same kernel must be launched on all devices. Note that any __device__ or __constant__
+ * variables are independently instantiated on every device. It is the application's
+ * responsibility to ensure these variables are initialized and used appropriately.
+ *
+ * The size of the grids as specified in blocks, the size of the blocks themselves and the
+ * amount of shared memory used by each thread block must also match across all launched kernels.
+ *
+ * The streams used to launch these kernels must have been created via either ::cudaStreamCreate
+ * or ::cudaStreamCreateWithPriority. The NULL stream or
+ * ::cudaStreamLegacy or ::cudaStreamPerThread cannot be used.
+ *
+ * The total number of blocks launched per kernel cannot exceed the maximum number of blocks
+ * per multiprocessor as returned by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor (or
+ * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors
+ * as specified by the device attribute ::cudaDevAttrMultiProcessorCount. Since the
+ * total number of blocks launched per device has to match across all devices, the maximum
+ * number of blocks that can be launched per device will be limited by the device with the
+ * least number of multiprocessors.
+ *
+ * The kernel cannot make use of CUDA dynamic parallelism.
+ *
+ * The ::cudaLaunchParams structure is defined as:
+ * \code
+ struct cudaLaunchParams
+ {
+     void *func;
+     dim3 gridDim;
+     dim3 blockDim;
+     void **args;
+     size_t sharedMem;
+     cudaStream_t stream;
+ };
+ * \endcode
+ * where:
+ * - ::cudaLaunchParams::func specifies the kernel to be launched. The same function must
+ * be launched on all devices. For templated functions, pass the function symbol as follows:
+ * func_name<template_arg_0,...,template_arg_N>
+ * - ::cudaLaunchParams::gridDim specifies the width, height and depth of the grid in blocks.
+ * This must match across all kernels launched.
+ * - ::cudaLaunchParams::blockDim is the width, height and depth of each thread block. This
+ * must match across all kernels launched.
+ * - ::cudaLaunchParams::args specifies the arguments to the kernel. If the kernel has
+ * N parameters then ::cudaLaunchParams::args should point to an array of N pointers. Each
+ * pointer, from ::cudaLaunchParams::args[0] to ::cudaLaunchParams::args[N - 1],
+ * points to the region of memory from which the actual parameter will be copied.
+ * - ::cudaLaunchParams::sharedMem is the dynamic shared-memory size per thread block in bytes.
+ * This must match across all kernels launched.
+ * - ::cudaLaunchParams::stream is the handle to the stream to perform the launch in. This cannot
+ * be the NULL stream or ::cudaStreamLegacy or ::cudaStreamPerThread.
+ *
+ * By default, the kernel won't begin execution on any GPU until all prior work in all the specified
+ * streams has completed. This behavior can be overridden by specifying the flag
+ * ::cudaCooperativeLaunchMultiDeviceNoPreSync. When this flag is specified, each kernel
+ * will only wait for prior work in the stream corresponding to that GPU to complete before it begins
+ * execution.
+ *
+ * Similarly, by default, any subsequent work pushed in any of the specified streams will not begin
+ * execution until the kernels on all GPUs have completed. This behavior can be overridden by specifying
+ * the flag ::cudaCooperativeLaunchMultiDeviceNoPostSync. When this flag is specified,
+ * any subsequent work pushed in any of the specified streams will only wait for the kernel launched
+ * on the GPU corresponding to that stream to complete before it begins execution.
+ *
+ * \param launchParamsList - List of launch parameters, one per device
+ * \param numDevices - Size of the \p launchParamsList array
+ * \param flags - Flags to control launch behavior
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorCooperativeLaunchTooLarge,
+ * ::cudaErrorSharedObjectInitFailed
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa
+ * \ref ::cudaLaunchCooperativeKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchCooperativeKernel (C++ API)",
+ * ::cudaLaunchCooperativeKernel,
+ * ::cuLaunchCooperativeKernelMultiDevice
+ */
+extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *launchParamsList, unsigned int numDevices, unsigned int flags __dv(0));
+
+/**
+ * \brief Sets the preferred cache configuration for a device function
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this sets through \p cacheConfig the preferred cache configuration
+ * for the function specified via \p func. This is only a preference. The
+ * runtime will use the requested configuration if possible, but it is free to
+ * choose a different configuration if required to execute \p func.
+ *
+ * \p func is a device function symbol and must be declared as a
+ * \c __global__ function. If the specified function does not exist,
+ * then ::cudaErrorInvalidDeviceFunction is returned. For templated functions,
+ * pass the function symbol as follows: func_name<template_arg_0,...,template_arg_N>
+ *
+ * This setting does nothing on devices where the size of the L1 cache and
+ * shared memory are fixed.
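+ *
+ * For example, to request a larger shared memory partition for a
+ * (hypothetical) kernel \c myKernel:
+ * \code
+ cudaFuncSetCacheConfig((const void*)myKernel, cudaFuncCachePreferShared);
+ * \endcode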
+ *
+ * Launching a kernel with a different preference than the most recent
+ * preference setting may insert a device-side synchronization point.
+ *
+ * The supported cache configurations are:
+ * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
+ * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
+ * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
+ * - ::cudaFuncCachePreferEqual: prefer equal size L1 cache and shared memory
+ *
+ * \param func - Device function symbol
+ * \param cacheConfig - Requested cache configuration
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorInvalidDeviceFunction
+ * \notefnerr
+ * \note_string_api_deprecation2
+ *
+ * \sa ::cudaConfigureCall,
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
+ * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)",
+ * ::cudaSetDoubleForDevice,
+ * ::cudaSetDoubleForHost,
+ * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)",
+ * ::cudaThreadGetCacheConfig,
+ * ::cudaThreadSetCacheConfig,
+ * ::cuFuncSetCacheConfig
+ */
+extern __host__ cudaError_t CUDARTAPI cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig);
+
+/**
+ * \brief Sets the shared memory configuration for a device function
+ *
+ * On devices with configurable shared memory banks, this function will
+ * force all subsequent launches of the specified device function to have
+ * the given shared memory bank size configuration. On any given launch of the
+ * function, the shared memory configuration of the device will be temporarily
+ * changed if needed to suit the function's preferred configuration. Changes in
+ * shared memory configuration between subsequent launches of functions
+ * may introduce a device-side synchronization point.
+ *
+ * Any per-function setting of shared memory bank size set via
+ * ::cudaFuncSetSharedMemConfig will override the device-wide setting set by
+ * ::cudaDeviceSetSharedMemConfig.
+ *
+ * Changing the shared memory bank size will not increase shared memory usage
+ * or affect occupancy of kernels, but may have major effects on performance.
+ * Larger bank sizes will allow for greater potential bandwidth to shared memory,
+ * but will change what kinds of accesses to shared memory will result in bank
+ * conflicts.
+ *
+ * This function will do nothing on devices with fixed shared memory bank size.
+ *
+ * For templated functions, pass the function symbol as follows:
+ * func_name<template_arg_0,...,template_arg_N>
+ *
+ * The supported bank configurations are:
+ * - ::cudaSharedMemBankSizeDefault: use the device's shared memory configuration
+ * when launching this function.
+ * - ::cudaSharedMemBankSizeFourByte: set shared memory bank width to be
+ * four bytes natively when launching this function.
+ * - ::cudaSharedMemBankSizeEightByte: set shared memory bank width to be eight
+ * bytes natively when launching this function.
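+ *
+ * For example, to request eight-byte banks for a (hypothetical) kernel
+ * \c myKernel that mostly accesses \c double values in shared memory:
+ * \code
+ cudaFuncSetSharedMemConfig((const void*)myKernel, cudaSharedMemBankSizeEightByte);
+ * \endcode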
+ * + * \param func - Device function symbol + * \param config - Requested shared memory configuration + * + * \return + * ::cudaSuccess, + * ::cudaErrorInitializationError, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * \notefnerr + * \note_string_api_deprecation2 + * + * \sa ::cudaConfigureCall, + * ::cudaDeviceSetSharedMemConfig, + * ::cudaDeviceGetSharedMemConfig, + * ::cudaDeviceSetCacheConfig, + * ::cudaDeviceGetCacheConfig, + * ::cudaFuncSetCacheConfig, + * ::cuFuncSetSharedMemConfig + */ +extern __host__ cudaError_t CUDARTAPI cudaFuncSetSharedMemConfig(const void *func, enum cudaSharedMemConfig config); + +/** + * \brief Find out attributes for a given function + * + * This function obtains the attributes of a function specified via \p func. + * \p func is a device function symbol and must be declared as a + * \c __global__ function. The fetched attributes are placed in \p attr. + * If the specified function does not exist, then + * ::cudaErrorInvalidDeviceFunction is returned. For templated functions, pass + * the function symbol as follows: func_name + * + * Note that some function attributes such as + * \ref ::cudaFuncAttributes::maxThreadsPerBlock "maxThreadsPerBlock" + * may vary based on the device that is currently being used. + * + * \param attr - Return pointer to function's attributes + * \param func - Device function symbol + * + * \return + * ::cudaSuccess, + * ::cudaErrorInitializationError, + * ::cudaErrorInvalidDeviceFunction + * \notefnerr + * \note_string_api_deprecation2 + * + * \sa ::cudaConfigureCall, + * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)", + * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)", + * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)", + * ::cudaSetDoubleForDevice, + * ::cudaSetDoubleForHost, + * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)", + * ::cuFuncGetAttribute + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func); + + +/** + * \brief Set attributes for a given function + * + * This function sets the attributes of a function specified via \p func. + * The parameter \p func must be a pointer to a function that executes + * on the device. The parameter specified by \p func must be declared as a \p __global__ + * function. The enumeration defined by \p attr is set to the value defined by \p value. + * If the specified function does not exist, then ::cudaErrorInvalidDeviceFunction is returned. + * If the specified attribute cannot be written, or if the value is incorrect, + * then ::cudaErrorInvalidValue is returned. 
+ *
+ * Valid values for \p attr are:
+ * - ::cudaFuncAttributeMaxDynamicSharedMemorySize - Maximum size of dynamic shared memory per block
+ * - ::cudaFuncAttributePreferredSharedMemoryCarveout - Preferred shared memory-L1 cache split ratio, in percent of maximum shared memory
+ *
+ * \param func - Function to set the attribute for
+ * \param attr - Attribute to set
+ * \param value - Value to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \sa
+ * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)",
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
+ * ::cudaSetDoubleForDevice,
+ * ::cudaSetDoubleForHost,
+ * \ref ::cudaSetupArgument(T, size_t) "cudaSetupArgument (C++ API)"
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncSetAttribute(const void *func, enum cudaFuncAttribute attr, int value);
+
+/**
+ * \brief Converts a double argument to be executed on a device
+ *
+ * \deprecated This function is deprecated as of CUDA 7.5
+ *
+ * Converts the double value of \p d to an internal float representation if
+ * the device does not support double arithmetic. If the device does natively
+ * support doubles, then this function does nothing.
+ *
+ * \param d - Double to convert
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ *
+ * \sa
+ * \ref ::cudaLaunch(const void*) "cudaLaunch (C API)",
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
+ * ::cudaSetDoubleForHost,
+ * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)"
+ */
+extern __host__ cudaError_t CUDARTAPI cudaSetDoubleForDevice(double *d);
+
+/**
+ * \brief Converts a double argument after execution on a device
+ *
+ * \deprecated This function is deprecated as of CUDA 7.5
+ *
+ * Converts the double value of \p d from a potentially internal float
+ * representation if the device does not support double arithmetic. If the
+ * device does natively support doubles, then this function does nothing.
+ *
+ * \param d - Double to convert
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ *
+ * \sa
+ * \ref ::cudaLaunch(const void*) "cudaLaunch (C API)",
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
+ * ::cudaSetDoubleForDevice,
+ * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)"
+ */
+extern __host__ cudaError_t CUDARTAPI cudaSetDoubleForHost(double *d);
+
+/** @} */ /* END CUDART_EXECUTION */
+
+/**
+ * \defgroup CUDART_OCCUPANCY Occupancy
+ *
+ * ___MANBRIEF___ occupancy calculation functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the occupancy calculation functions of the CUDA runtime
+ * application programming interface.
+ *
+ * Besides the occupancy calculator functions
+ * (\ref ::cudaOccupancyMaxActiveBlocksPerMultiprocessor and \ref ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags),
+ * there are also C++-only occupancy-based launch configuration functions documented in the
+ * \ref CUDART_HIGHLEVEL "C++ API Routines" module.
+ *
+ * See
+ * \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSize (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)"
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns occupancy for a device function
+ *
+ * Returns in \p *numBlocks the maximum number of active blocks per
+ * streaming multiprocessor for the device function.
+ *
+ * \param numBlocks - Returned occupancy
+ * \param func - Kernel function for which occupancy is calculated
+ * \param blockSize - Block size the kernel is intended to be launched with
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorCudartUnloading,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags,
+ * \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API)",
+ * ::cuOccupancyMaxActiveBlocksPerMultiprocessor
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize);
+
+/**
+ * \brief Returns occupancy for a device function with the specified flags
+ *
+ * Returns in \p *numBlocks the maximum number of active blocks per
+ * streaming multiprocessor for the device function.
+ *
+ * The \p flags parameter controls how special cases are handled. Valid flags include:
+ *
+ * - ::cudaOccupancyDefault: keeps the default behavior as
+ * ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ *
+ * - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior
+ * on platforms where global caching affects occupancy. On such platforms, if caching
+ * is enabled, but per-block SM resource usage would result in zero occupancy, the
+ * occupancy calculator will calculate the occupancy as if caching is disabled.
+ * Setting this flag causes the occupancy calculator to return 0 in such cases.
+ * More information about this feature can be found in the "Unified L1/Texture Cache"
+ * section of the Maxwell tuning guide.
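+ *
+ * For illustration, a sketch querying occupancy for a (hypothetical) kernel
+ * \c myKernel at 256 threads per block with no dynamic shared memory, keeping
+ * the default caching behavior:
+ * \code
+ int numBlocks = 0;
+ cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(&numBlocks, (const void*)myKernel, 256, 0, cudaOccupancyDefault);
+ * \endcode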
+ * + * \param numBlocks - Returned occupancy + * \param func - Kernel function for which occupancy is calculated + * \param blockSize - Block size the kernel is intended to be launched with + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * \param flags - Requested behavior for the occupancy calculator + * + * \return + * ::cudaSuccess, + * ::cudaErrorCudartUnloading, + * ::cudaErrorInitializationError, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor, + * \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)", + * \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API)", + * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)", + * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API)", + * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize, unsigned int flags); + +/** @} */ /* END CUDA_OCCUPANCY */ + +/** + * \defgroup CUDART_EXECUTION_DEPRECATED Execution Control [DEPRECATED] + * + * ___MANBRIEF___ deprecated execution control functions of the CUDA runtime API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the deprecated execution control functions of the CUDA runtime + * application programming interface. + * + * Some functions have overloaded C++ API template versions documented separately in the + * \ref CUDART_HIGHLEVEL "C++ API Routines" module. + * + * @{ + */ + +/** + * \brief Configure a device-launch + * + * \deprecated This function is deprecated as of CUDA 7.0 + * + * Specifies the grid and block dimensions for the device call to be executed + * similar to the execution configuration syntax. ::cudaConfigureCall() is + * stack based. Each call pushes data on top of an execution stack. This data + * contains the dimension for the grid and thread blocks, together with any + * arguments for the call. 
+ * + * \param gridDim - Grid dimensions + * \param blockDim - Block dimensions + * \param sharedMem - Shared memory + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidConfiguration + * \note_null_stream + * \notefnerr + * + * \sa + * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)", + * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)", + * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)", + * \ref ::cudaLaunch(const void*) "cudaLaunch (C API)", + * ::cudaSetDoubleForDevice, + * ::cudaSetDoubleForHost, + * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)", + */ +extern __host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem __dv(0), cudaStream_t stream __dv(0)); + +/** + * \brief Configure a device launch + * + * \deprecated This function is deprecated as of CUDA 7.0 + * + * Pushes \p size bytes of the argument pointed to by \p arg at \p offset + * bytes from the start of the parameter passing area, which starts at + * offset 0. The arguments are stored in the top of the execution stack. + * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument()" + * must be preceded by a call to ::cudaConfigureCall(). + * + * \param arg - Argument to push for a kernel launch + * \param size - Size of argument + * \param offset - Offset in argument stack to push new arg + * + * \return + * ::cudaSuccess + * \notefnerr + * + * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)", + * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)", + * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)", + * \ref ::cudaLaunch(const void*) "cudaLaunch (C API)", + * ::cudaSetDoubleForDevice, + * ::cudaSetDoubleForHost, + * \ref ::cudaSetupArgument(T, size_t) "cudaSetupArgument (C++ API)", + */ +extern __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg, size_t size, size_t offset); + +/** + * \brief Launches a device function + * + * \deprecated This function is deprecated as of CUDA 7.0 + * + * Launches the function \p func on the device. The parameter \p func must + * be a device function symbol. The parameter specified by \p func must be + * declared as a \p __global__ function. For templated functions, pass the + * function symbol as follows: func_name + * \ref ::cudaLaunch(const void*) "cudaLaunch()" must be preceded by a call to + * ::cudaConfigureCall() since it pops the data that was pushed by + * ::cudaConfigureCall() from the execution stack. 
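+ *
+ * For reference, the deprecated sequence described above looks like the
+ * following sketch (hypothetical kernel \c myKernel taking one \c int;
+ * error checking omitted):
+ * \code
+ int n = 42;
+ cudaConfigureCall(dim3(1), dim3(128));   // push the launch configuration
+ cudaSetupArgument(&n, sizeof(n), 0);     // push the argument at offset 0
+ cudaLaunch((const void*)myKernel);       // pop the configuration and launch
+ * \endcode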
+ *
+ * \param func - Device function symbol
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorSharedObjectInitFailed,
+ * ::cudaErrorInvalidPtx,
+ * ::cudaErrorNoKernelImageForDevice,
+ * ::cudaErrorJitCompilerNotFound
+ * \notefnerr
+ * \note_string_api_deprecation_50
+ *
+ * \sa
+ * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)",
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
+ * \ref ::cudaLaunch(T*) "cudaLaunch (C++ API)",
+ * ::cudaSetDoubleForDevice,
+ * ::cudaSetDoubleForHost,
+ * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)",
+ * ::cudaThreadGetCacheConfig,
+ * ::cudaThreadSetCacheConfig
+ */
+extern __host__ cudaError_t CUDARTAPI cudaLaunch(const void *func);
+
+
+/** @} */ /* END CUDART_EXECUTION_DEPRECATED */
+
+
+/**
+ * \defgroup CUDART_MEMORY Memory Management
+ *
+ * ___MANBRIEF___ memory management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the memory management functions of the CUDA runtime
+ * application programming interface.
+ *
+ * Some functions have overloaded C++ API template versions documented separately in the
+ * \ref CUDART_HIGHLEVEL "C++ API Routines" module.
+ *
+ * @{
+ */
+
+/**
+ * \brief Allocates memory that will be automatically managed by the Unified Memory system
+ *
+ * Allocates \p size bytes of managed memory on the device and returns in
+ * \p *devPtr a pointer to the allocated memory. If the device doesn't support
+ * allocating managed memory, ::cudaErrorNotSupported is returned. Support
+ * for managed memory can be queried using the device attribute
+ * ::cudaDevAttrManagedMemory. The allocated memory is suitably
+ * aligned for any kind of variable. The memory is not cleared. If \p size
+ * is 0, ::cudaMallocManaged returns ::cudaErrorInvalidValue. The pointer
+ * is valid on the CPU and on all GPUs in the system that support managed memory.
+ * All accesses to this pointer must obey the Unified Memory programming model.
+ *
+ * \p flags specifies the default stream association for this allocation.
+ * \p flags must be one of ::cudaMemAttachGlobal or ::cudaMemAttachHost. The
+ * default value for \p flags is ::cudaMemAttachGlobal.
+ * If ::cudaMemAttachGlobal is specified, then this memory is accessible from
+ * any stream on any device. If ::cudaMemAttachHost is specified, then the
+ * allocation should not be accessed from devices that have a zero value for the
+ * device attribute ::cudaDevAttrConcurrentManagedAccess; an explicit call to
+ * ::cudaStreamAttachMemAsync will be required to enable access on such devices.
+ *
+ * If the association is later changed via ::cudaStreamAttachMemAsync to
+ * a single stream, the default association, as specified during ::cudaMallocManaged,
+ * is restored when that stream is destroyed. For __managed__ variables, the
+ * default association is always ::cudaMemAttachGlobal. Note that destroying a
+ * stream is an asynchronous operation, and as a result, the change to default
+ * association won't happen until all work in the stream has completed.
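+ *
+ * A minimal usage sketch (hypothetical kernel \c increment; error checking
+ * omitted):
+ * \code
+ int *data;
+ cudaMallocManaged((void**)&data, 256 * sizeof(int), cudaMemAttachGlobal);
+ data[0] = 1;                    // valid on the host
+ increment<<<1, 256>>>(data);    // valid on the device
+ cudaDeviceSynchronize();        // synchronize before touching the data on the host again
+ cudaFree(data);
+ * \endcode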
+ * + * Memory allocated with ::cudaMallocManaged should be released with ::cudaFree. + * + * Device memory oversubscription is possible for GPUs that have a non-zero value for the + * device attribute ::cudaDevAttrConcurrentManagedAccess. Managed memory on + * such GPUs may be evicted from device memory to host memory at any time by the Unified + * Memory driver in order to make room for other allocations. + * + * In a multi-GPU system where all GPUs have a non-zero value for the device attribute + * ::cudaDevAttrConcurrentManagedAccess, managed memory may not be populated when this + * API returns and instead may be populated on access. In such systems, managed memory can + * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to + * maintain data locality and prevent excessive page faults to the extent possible. The application + * can also guide the driver about memory usage patterns via ::cudaMemAdvise. The application + * can also explicitly migrate memory to a desired processor's memory via + * ::cudaMemPrefetchAsync. + * + * In a multi-GPU system where all of the GPUs have a zero value for the device attribute + * ::cudaDevAttrConcurrentManagedAccess and all the GPUs have peer-to-peer support + * with each other, the physical storage for managed memory is created on the GPU which is active + * at the time ::cudaMallocManaged is called. All other GPUs will reference the data at reduced + * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate + * memory among such GPUs. + * + * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and + * where the value of the device attribute ::cudaDevAttrConcurrentManagedAccess + * is zero for at least one of those GPUs, the location chosen for physical storage of managed + * memory is system-dependent. + * - On Linux, the location chosen will be device memory as long as the current set of active + * contexts are on devices that either have peer-to-peer support with each other or have a + * non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess. + * If there is an active context on a GPU that does not have a non-zero value for that device + * attribute and it does not have peer-to-peer support with the other devices that have active + * contexts on them, then the location for physical storage will be 'zero-copy' or host memory. + * Note that this means that managed memory that is located in device memory is migrated to + * host memory if a new context is created on a GPU that doesn't have a non-zero value for + * the device attribute and does not support peer-to-peer with at least one of the other devices + * that has an active context. This in turn implies that context creation may fail if there is + * insufficient host memory to migrate all managed allocations. + * - On Windows, the physical storage is always created in 'zero-copy' or host memory. + * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these + * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to + * restrict CUDA to only use those GPUs that have peer-to-peer support. + * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a non-zero + * value to force the driver to always use device memory for physical storage. 
+ * When this environment variable is set to a non-zero value, all devices used in + * that process that support managed memory have to be peer-to-peer compatible + * with each other. The error ::cudaErrorInvalidDevice will be returned if a device + * that supports managed memory is used and it is not peer-to-peer compatible with + * any of the other managed memory supporting devices that were previously used in + * that process, even if ::cudaDeviceReset has been called on those devices. These + * environment variables are described in the CUDA programming guide under the + * "CUDA environment variables" section. + * + * \param devPtr - Pointer to allocated device memory + * \param size - Requested allocation size in bytes + * \param flags - Must be either ::cudaMemAttachGlobal or ::cudaMemAttachHost (defaults to ::cudaMemAttachGlobal) + * + * \return + * ::cudaSuccess, + * ::cudaErrorMemoryAllocation, + * ::cudaErrorNotSupported, + * ::cudaErrorInvalidValue + * + * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray, + * ::cudaMalloc3D, ::cudaMalloc3DArray, + * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", + * ::cudaFreeHost, ::cudaHostAlloc, ::cudaDeviceGetAttribute, ::cudaStreamAttachMemAsync, + * ::cuMemAllocManaged + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMallocManaged(void **devPtr, size_t size, unsigned int flags __dv(cudaMemAttachGlobal)); + + +/** + * \brief Allocate memory on the device + * + * Allocates \p size bytes of linear memory on the device and returns in + * \p *devPtr a pointer to the allocated memory. The allocated memory is + * suitably aligned for any kind of variable. The memory is not cleared. + * ::cudaMalloc() returns ::cudaErrorMemoryAllocation in case of failure. + * + * The device version of ::cudaFree cannot be used with a \p *devPtr + * allocated using the host API, and vice versa. + * + * \param devPtr - Pointer to allocated device memory + * \param size - Requested allocation size in bytes + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorMemoryAllocation + * + * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray, + * ::cudaMalloc3D, ::cudaMalloc3DArray, + * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", + * ::cudaFreeHost, ::cudaHostAlloc, + * ::cuMemAlloc + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size); + +/** + * \brief Allocates page-locked memory on the host + * + * Allocates \p size bytes of host memory that is page-locked and accessible + * to the device. The driver tracks the virtual memory ranges allocated with + * this function and automatically accelerates calls to functions such as + * ::cudaMemcpy*(). Since the memory can be accessed directly by the device, + * it can be read or written with much higher bandwidth than pageable memory + * obtained with functions such as ::malloc(). Allocating excessive amounts of + * memory with ::cudaMallocHost() may degrade system performance, since it + * reduces the amount of memory available to the system for paging. As a + * result, this function is best used sparingly to allocate staging areas for + * data exchange between host and device. 
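+ *
+ * A typical staging pattern (sketch, error checking omitted):
+ * \code
+ float *h_staging, *d_buf;
+ cudaMallocHost((void**)&h_staging, 1024 * sizeof(float));   // page-locked host memory
+ cudaMalloc((void**)&d_buf, 1024 * sizeof(float));
+ // ... fill h_staging ...
+ cudaMemcpy(d_buf, h_staging, 1024 * sizeof(float), cudaMemcpyHostToDevice);   // accelerated copy
+ cudaFreeHost(h_staging);
+ cudaFree(d_buf);
+ * \endcode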
+ * + * \param ptr - Pointer to allocated host memory + * \param size - Requested allocation size in bytes + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorMemoryAllocation + * \notefnerr + * + * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaMallocArray, ::cudaMalloc3D, + * ::cudaMalloc3DArray, ::cudaHostAlloc, ::cudaFree, ::cudaFreeArray, + * \ref ::cudaMallocHost(void**, size_t, unsigned int) "cudaMallocHost (C++ API)", + * ::cudaFreeHost, ::cudaHostAlloc, + * ::cuMemAllocHost + */ +extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t size); + +/** + * \brief Allocates pitched memory on the device + * + * Allocates at least \p width (in bytes) * \p height bytes of linear memory + * on the device and returns in \p *devPtr a pointer to the allocated memory. + * The function may pad the allocation to ensure that corresponding pointers + * in any given row will continue to meet the alignment requirements for + * coalescing as the address is updated from row to row. The pitch returned in + * \p *pitch by ::cudaMallocPitch() is the width in bytes of the allocation. + * The intended usage of \p pitch is as a separate parameter of the allocation, + * used to compute addresses within the 2D array. Given the row and column of + * an array element of type \p T, the address is computed as: + * \code + T* pElement = (T*)((char*)BaseAddress + Row * pitch) + Column; + \endcode + * + * For allocations of 2D arrays, it is recommended that programmers consider + * performing pitch allocations using ::cudaMallocPitch(). Due to pitch + * alignment restrictions in the hardware, this is especially true if the + * application will be performing 2D memory copies between different regions + * of device memory (whether linear memory or CUDA arrays). + * + * \param devPtr - Pointer to allocated pitched device memory + * \param pitch - Pitch for allocation + * \param width - Requested pitched allocation width (in bytes) + * \param height - Requested pitched allocation height + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorMemoryAllocation + * \notefnerr + * + * \sa ::cudaMalloc, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray, + * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", + * ::cudaFreeHost, ::cudaMalloc3D, ::cudaMalloc3DArray, + * ::cudaHostAlloc, + * ::cuMemAllocPitch + */ +extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height); + +/** + * \brief Allocate an array on the device + * + * Allocates a CUDA array according to the ::cudaChannelFormatDesc structure + * \p desc and returns a handle to the new CUDA array in \p *array. + * + * The ::cudaChannelFormatDesc is defined as: + * \code + struct cudaChannelFormatDesc { + int x, y, z, w; + enum cudaChannelFormatKind f; + }; + \endcode + * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned, + * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat. + * + * The \p flags parameter enables different options to be specified that affect + * the allocation, as follows. + * - ::cudaArrayDefault: This flag's value is defined to be 0 and provides default array allocation + * - ::cudaArraySurfaceLoadStore: Allocates an array that can be read from or written to using a surface reference + * - ::cudaArrayTextureGather: This flag indicates that texture gather operations will be performed on the array. + * + * \p width and \p height must meet certain size requirements. 
+ * See ::cudaMalloc3DArray() for more details.
+ *
+ * \param array - Pointer to allocated array in device memory
+ * \param desc - Requested channel format
+ * \param width - Requested array allocation width
+ * \param height - Requested array allocation height
+ * \param flags - Requested properties of allocated array
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ *
+ * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaMalloc3D, ::cudaMalloc3DArray,
+ * ::cudaHostAlloc,
+ * ::cuArrayCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMallocArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height __dv(0), unsigned int flags __dv(0));
+
+/**
+ * \brief Frees memory on the device
+ *
+ * Frees the memory space pointed to by \p devPtr, which must have been
+ * returned by a previous call to ::cudaMalloc() or ::cudaMallocPitch().
+ * Otherwise, or if ::cudaFree(\p devPtr) has already been called before,
+ * an error is returned. If \p devPtr is 0, no operation is performed.
+ * ::cudaFree() returns ::cudaErrorInvalidDevicePointer in case of failure.
+ *
+ * The device version of ::cudaFree cannot be used with a \p *devPtr
+ * allocated using the host API, and vice versa.
+ *
+ * \param devPtr - Device pointer to memory to free
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevicePointer,
+ * ::cudaErrorInitializationError
+ * \notefnerr
+ *
+ * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaMallocArray, ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaMalloc3D, ::cudaMalloc3DArray,
+ * ::cudaHostAlloc,
+ * ::cuMemFree
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr);
+
+/**
+ * \brief Frees page-locked memory
+ *
+ * Frees the memory space pointed to by \p ptr, which must have been
+ * returned by a previous call to ::cudaMallocHost() or ::cudaHostAlloc().
+ *
+ * \param ptr - Pointer to memory to free
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInitializationError
+ * \notefnerr
+ *
+ * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray,
+ * ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaHostAlloc,
+ * ::cuMemFreeHost
+ */
+extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr);
+
+/**
+ * \brief Frees an array on the device
+ *
+ * Frees the CUDA array \p array, which must have been returned by a
+ * previous call to ::cudaMallocArray(). If ::cudaFreeArray(\p array) has
+ * already been called before, ::cudaErrorInvalidValue is returned. If
+ * \p array is 0, no operation is performed.
+ *
+ * \param array - Pointer to array to free
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInitializationError
+ * \notefnerr
+ *
+ * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc,
+ * ::cuArrayDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaFreeArray(cudaArray_t array);
+
+/**
+ * \brief Frees a mipmapped array on the device
+ *
+ * Frees the CUDA mipmapped array \p mipmappedArray, which must have been
+ * returned by a previous call to ::cudaMallocMipmappedArray().
+ * If ::cudaFreeMipmappedArray(\p mipmappedArray) has already been called before, + * ::cudaErrorInvalidValue is returned. + * + * \param mipmappedArray - Pointer to mipmapped array to free + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInitializationError + * \notefnerr + * + * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, + * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", + * ::cudaFreeHost, ::cudaHostAlloc, + * ::cuMipmappedArrayDestroy + */ +extern __host__ cudaError_t CUDARTAPI cudaFreeMipmappedArray(cudaMipmappedArray_t mipmappedArray); + + +/** + * \brief Allocates page-locked memory on the host + * + * Allocates \p size bytes of host memory that is page-locked and accessible + * to the device. The driver tracks the virtual memory ranges allocated with + * this function and automatically accelerates calls to functions such as + * ::cudaMemcpy(). Since the memory can be accessed directly by the device, it + * can be read or written with much higher bandwidth than pageable memory + * obtained with functions such as ::malloc(). Allocating excessive amounts of + * pinned memory may degrade system performance, since it reduces the amount + * of memory available to the system for paging. As a result, this function is + * best used sparingly to allocate staging areas for data exchange between host + * and device. + * + * The \p flags parameter enables different options to be specified that affect + * the allocation, as follows. + * - ::cudaHostAllocDefault: This flag's value is defined to be 0 and causes + * ::cudaHostAlloc() to emulate ::cudaMallocHost(). + * - ::cudaHostAllocPortable: The memory returned by this call will be + * considered as pinned memory by all CUDA contexts, not just the one that + * performed the allocation. + * - ::cudaHostAllocMapped: Maps the allocation into the CUDA address space. + * The device pointer to the memory may be obtained by calling + * ::cudaHostGetDevicePointer(). + * - ::cudaHostAllocWriteCombined: Allocates the memory as write-combined (WC). + * WC memory can be transferred across the PCI Express bus more quickly on some + * system configurations, but cannot be read efficiently by most CPUs. WC + * memory is a good option for buffers that will be written by the CPU and read + * by the device via mapped pinned memory or host->device transfers. + * + * All of these flags are orthogonal to one another: a developer may allocate + * memory that is portable, mapped and/or write-combined with no restrictions. + * + * ::cudaSetDeviceFlags() must have been called with the ::cudaDeviceMapHost + * flag in order for the ::cudaHostAllocMapped flag to have any effect. + * + * The ::cudaHostAllocMapped flag may be specified on CUDA contexts for devices + * that do not support mapped pinned memory. The failure is deferred to + * ::cudaHostGetDevicePointer() because the memory may be mapped into other + * CUDA contexts via the ::cudaHostAllocPortable flag. + * + * Memory allocated by this function must be freed with ::cudaFreeHost(). 
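+ *
+ * For illustration, a mapped allocation might be obtained as follows
+ * (sketch, error checking omitted):
+ * \code
+ float *h_buf, *d_alias;
+ cudaSetDeviceFlags(cudaDeviceMapHost);   // must happen before the CUDA context is created
+ cudaHostAlloc((void**)&h_buf, 1024 * sizeof(float), cudaHostAllocMapped);
+ cudaHostGetDevicePointer((void**)&d_alias, h_buf, 0);   // device-side alias of h_buf
+ cudaFreeHost(h_buf);
+ * \endcode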
+ *
+ * \param pHost - Host pointer to allocated memory
+ * \param size - Requested allocation size in bytes
+ * \param flags - Requested properties of allocated memory
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ *
+ * \sa ::cudaSetDeviceFlags,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost,
+ * ::cuMemHostAlloc
+ */
+extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t size, unsigned int flags);
+
+/**
+ * \brief Registers an existing host memory range for use by CUDA
+ *
+ * Page-locks the memory range specified by \p ptr and \p size and maps it
+ * for the device(s) as specified by \p flags. This memory range also is added
+ * to the same tracking mechanism as ::cudaHostAlloc() to automatically accelerate
+ * calls to functions such as ::cudaMemcpy(). Since the memory can be accessed
+ * directly by the device, it can be read or written with much higher bandwidth
+ * than pageable memory that has not been registered. Page-locking excessive
+ * amounts of memory may degrade system performance, since it reduces the amount
+ * of memory available to the system for paging. As a result, this function is
+ * best used sparingly to register staging areas for data exchange between
+ * host and device.
+ *
+ * ::cudaHostRegister is not supported on non I/O coherent devices.
+ *
+ * The \p flags parameter enables different options to be specified that
+ * affect the allocation, as follows.
+ *
+ * - ::cudaHostRegisterDefault: On a system with unified virtual addressing,
+ * the memory will be both mapped and portable. On a system with no unified
+ * virtual addressing, the memory will be neither mapped nor portable.
+ *
+ * - ::cudaHostRegisterPortable: The memory returned by this call will be
+ * considered as pinned memory by all CUDA contexts, not just the one that
+ * performed the allocation.
+ *
+ * - ::cudaHostRegisterMapped: Maps the allocation into the CUDA address
+ * space. The device pointer to the memory may be obtained by calling
+ * ::cudaHostGetDevicePointer().
+ *
+ * - ::cudaHostRegisterIoMemory: The passed memory pointer is treated as
+ * pointing to some memory-mapped I/O space, e.g. belonging to a
+ * third-party PCIe device, and it will be marked as non-cache-coherent and
+ * contiguous.
+ *
+ * All of these flags are orthogonal to one another: a developer may page-lock
+ * memory that is portable or mapped with no restrictions.
+ *
+ * The CUDA context must have been created with the ::cudaMapHost flag in
+ * order for the ::cudaHostRegisterMapped flag to have any effect.
+ *
+ * The ::cudaHostRegisterMapped flag may be specified on CUDA contexts for
+ * devices that do not support mapped pinned memory. The failure is deferred
+ * to ::cudaHostGetDevicePointer() because the memory may be mapped into
+ * other CUDA contexts via the ::cudaHostRegisterPortable flag.
+ *
+ * For devices that have a non-zero value for the device attribute
+ * ::cudaDevAttrCanUseHostPointerForRegisteredMem, the memory
+ * can also be accessed from the device using the host pointer \p ptr.
+ * The device pointer returned by ::cudaHostGetDevicePointer() may or may not
+ * match the original host pointer \p ptr and depends on the devices visible to the
+ * application. If all devices visible to the application have a non-zero value for the
+ * device attribute, the device pointer returned by ::cudaHostGetDevicePointer()
+ * will match the original pointer \p ptr. If any device visible to the application
+ * has a zero value for the device attribute, the device pointer returned by
+ * ::cudaHostGetDevicePointer() will not match the original host pointer \p ptr,
+ * but it will be suitable for use on all devices provided Unified Virtual Addressing
+ * is enabled. In such systems, it is valid to access the memory using either pointer
+ * on devices that have a non-zero value for the device attribute. Note however that
+ * such devices should access the memory using only one of the two pointers and not both.
+ *
+ * The memory page-locked by this function must be unregistered with ::cudaHostUnregister().
+ *
+ * \param ptr - Host pointer to memory to page-lock
+ * \param size - Size in bytes of the address range to page-lock
+ * \param flags - Flags for allocation request
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation,
+ * ::cudaErrorHostMemoryAlreadyRegistered,
+ * ::cudaErrorNotSupported
+ * \notefnerr
+ *
+ * \sa ::cudaHostUnregister, ::cudaHostGetFlags, ::cudaHostGetDevicePointer,
+ * ::cuMemHostRegister
+ */
+extern __host__ cudaError_t CUDARTAPI cudaHostRegister(void *ptr, size_t size, unsigned int flags);
+
+/**
+ * \brief Unregisters a memory range that was registered with cudaHostRegister
+ *
+ * Unmaps the memory range whose base address is specified by \p ptr, and makes
+ * it pageable again.
+ *
+ * The base address must be the same one specified to ::cudaHostRegister().
+ *
+ * \param ptr - Host pointer to memory to unregister
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorHostMemoryNotRegistered
+ * \notefnerr
+ *
+ * \sa ::cudaHostRegister,
+ * ::cuMemHostUnregister
+ */
+extern __host__ cudaError_t CUDARTAPI cudaHostUnregister(void *ptr);
+
+/**
+ * \brief Passes back device pointer of mapped host memory allocated by
+ * cudaHostAlloc or registered by cudaHostRegister
+ *
+ * Passes back the device pointer corresponding to the mapped, pinned host
+ * buffer allocated by ::cudaHostAlloc() or registered by ::cudaHostRegister().
+ *
+ * ::cudaHostGetDevicePointer() will fail if the ::cudaDeviceMapHost flag was
+ * not specified before deferred context creation occurred, or if called on a
+ * device that does not support mapped, pinned memory.
+ *
+ * For devices that have a non-zero value for the device attribute
+ * ::cudaDevAttrCanUseHostPointerForRegisteredMem, the memory
+ * can also be accessed from the device using the host pointer \p pHost.
+ * The device pointer returned by ::cudaHostGetDevicePointer() may or may not
+ * match the original host pointer \p pHost and depends on the devices visible to the
+ * application. If all devices visible to the application have a non-zero value for the
+ * device attribute, the device pointer returned by ::cudaHostGetDevicePointer()
+ * will match the original pointer \p pHost. If any device visible to the application
+ * has a zero value for the device attribute, the device pointer returned by
+ * ::cudaHostGetDevicePointer() will not match the original host pointer \p pHost,
+ * but it will be suitable for use on all devices provided Unified Virtual Addressing
+ * is enabled. In such systems, it is valid to access the memory using either pointer
+ * on devices that have a non-zero value for the device attribute. Note however that
+ * such devices should access the memory using only one of the two pointers and not both.
+ *
+ * \p flags is provided for future releases. For now, it must be set to 0.
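+ *
+ * A sketch combining this call with ::cudaHostRegister() (error checking
+ * omitted):
+ * \code
+ void *devPtr;
+ size_t bytes = 1048576;
+ void *hostPtr = malloc(bytes);
+ cudaHostRegister(hostPtr, bytes, cudaHostRegisterMapped);
+ cudaHostGetDevicePointer(&devPtr, hostPtr, 0);   // flags must be 0
+ // ... use devPtr in kernels ...
+ cudaHostUnregister(hostPtr);
+ free(hostPtr);
+ * \endcode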
+ * + * \param pDevice - Returned device pointer for mapped memory + * \param pHost - Requested host pointer mapping + * \param flags - Flags for extensions (must be 0 for now) + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorMemoryAllocation + * \notefnerr + * + * \sa ::cudaSetDeviceFlags, ::cudaHostAlloc, + * ::cuMemHostGetDevicePointer + */ +extern __host__ cudaError_t CUDARTAPI cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags); + +/** + * \brief Passes back flags used to allocate pinned host memory allocated by + * cudaHostAlloc + * + * ::cudaHostGetFlags() will fail if the input pointer does not + * reside in an address range allocated by ::cudaHostAlloc(). + * + * \param pFlags - Returned flags word + * \param pHost - Host pointer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \notefnerr + * + * \sa ::cudaHostAlloc, + * ::cuMemHostGetFlags + */ +extern __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags, void *pHost); + +/** + * \brief Allocates logical 1D, 2D, or 3D memory objects on the device + * + * Allocates at least \p width * \p height * \p depth bytes of linear memory + * on the device and returns a ::cudaPitchedPtr in which \p ptr is a pointer + * to the allocated memory. The function may pad the allocation to ensure + * hardware alignment requirements are met. The pitch returned in the \p pitch + * field of \p pitchedDevPtr is the width in bytes of the allocation. + * + * The returned ::cudaPitchedPtr contains additional fields \p xsize and + * \p ysize, the logical width and height of the allocation, which are + * equivalent to the \p width and \p height \p extent parameters provided by + * the programmer during allocation. + * + * For allocations of 2D and 3D objects, it is highly recommended that + * programmers perform allocations using ::cudaMalloc3D() or + * ::cudaMallocPitch(). Due to alignment restrictions in the hardware, this is + * especially true if the application will be performing memory copies + * involving 2D or 3D objects (whether linear memory or CUDA arrays). + * + * \param pitchedDevPtr - Pointer to allocated pitched device memory + * \param extent - Requested allocation size (\p width field in bytes) + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorMemoryAllocation + * \notefnerr + * + * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMemcpy3D, ::cudaMemset3D, + * ::cudaMalloc3DArray, ::cudaMallocArray, ::cudaFreeArray, + * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", + * ::cudaFreeHost, ::cudaHostAlloc, ::make_cudaPitchedPtr, ::make_cudaExtent, + * ::cuMemAllocPitch + */ +extern __host__ cudaError_t CUDARTAPI cudaMalloc3D(struct cudaPitchedPtr* pitchedDevPtr, struct cudaExtent extent); + +/** + * \brief Allocate an array on the device + * + * Allocates a CUDA array according to the ::cudaChannelFormatDesc structure + * \p desc and returns a handle to the new CUDA array in \p *array. + * + * The ::cudaChannelFormatDesc is defined as: + * \code + struct cudaChannelFormatDesc { + int x, y, z, w; + enum cudaChannelFormatKind f; + }; + \endcode + * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned, + * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat. + * + * ::cudaMalloc3DArray() can allocate the following: + * + * - A 1D array is allocated if the height and depth extents are both zero. + * - A 2D array is allocated if only the depth extent is zero. 
+ * - A 3D array is allocated if all three extents are non-zero. + * - A 1D layered CUDA array is allocated if only the height extent is zero and + * the cudaArrayLayered flag is set. Each layer is a 1D array. The number of layers is + * determined by the depth extent. + * - A 2D layered CUDA array is allocated if all three extents are non-zero and + * the cudaArrayLayered flag is set. Each layer is a 2D array. The number of layers is + * determined by the depth extent. + * - A cubemap CUDA array is allocated if all three extents are non-zero and the + * cudaArrayCubemap flag is set. Width must be equal to height, and depth must be six. A cubemap is + * a special type of 2D layered CUDA array, where the six layers represent the six faces of a cube. + * The order of the six layers in memory is the same as that listed in ::cudaGraphicsCubeFace. + * - A cubemap layered CUDA array is allocated if all three extents are non-zero, and both, + * cudaArrayCubemap and cudaArrayLayered flags are set. Width must be equal to height, and depth must be + * a multiple of six. A cubemap layered CUDA array is a special type of 2D layered CUDA array that consists + * of a collection of cubemaps. The first six layers represent the first cubemap, the next six layers form + * the second cubemap, and so on. + * + * + * The \p flags parameter enables different options to be specified that affect + * the allocation, as follows. + * - ::cudaArrayDefault: This flag's value is defined to be 0 and provides default array allocation + * - ::cudaArrayLayered: Allocates a layered CUDA array, with the depth extent indicating the number of layers + * - ::cudaArrayCubemap: Allocates a cubemap CUDA array. Width must be equal to height, and depth must be six. + * If the cudaArrayLayered flag is also set, depth must be a multiple of six. + * - ::cudaArraySurfaceLoadStore: Allocates a CUDA array that could be read from or written to using a surface + * reference. + * - ::cudaArrayTextureGather: This flag indicates that texture gather operations will be performed on the CUDA + * array. Texture gather can only be performed on 2D CUDA arrays. + * + * The width, height and depth extents must meet certain size requirements as listed in the following table. + * All values are specified in elements. + * + * Note that 2D CUDA arrays have different size requirements if the ::cudaArrayTextureGather flag is set. In that + * case, the valid range for (width, height, depth) is ((1,maxTexture2DGather[0]), (1,maxTexture2DGather[1]), 0). 
+ * + * \xmlonly + * + * + * + * + * + * + * + * CUDA array type + * Valid extents that must always be met {(width range in elements), + * (height range), (depth range)} + * Valid extents with cudaArraySurfaceLoadStore set {(width range in + * elements), (height range), (depth range)} + * + * + * + * + * 1D + * { (1,maxTexture1D), 0, 0 } + * { (1,maxSurface1D), 0, 0 } + * + * + * 2D + * { (1,maxTexture2D[0]), (1,maxTexture2D[1]), 0 } + * { (1,maxSurface2D[0]), (1,maxSurface2D[1]), 0 } + * + * + * 3D + * { (1,maxTexture3D[0]), (1,maxTexture3D[1]), (1,maxTexture3D[2]) } + * OR { (1,maxTexture3DAlt[0]), (1,maxTexture3DAlt[1]), + * (1,maxTexture3DAlt[2]) } + * { (1,maxSurface3D[0]), (1,maxSurface3D[1]), (1,maxSurface3D[2]) } + * + * + * 1D Layered + * { (1,maxTexture1DLayered[0]), 0, (1,maxTexture1DLayered[1]) } + * { (1,maxSurface1DLayered[0]), 0, (1,maxSurface1DLayered[1]) } + * + * + * 2D Layered + * { (1,maxTexture2DLayered[0]), (1,maxTexture2DLayered[1]), + * (1,maxTexture2DLayered[2]) } + * { (1,maxSurface2DLayered[0]), (1,maxSurface2DLayered[1]), + * (1,maxSurface2DLayered[2]) } + * + * + * Cubemap + * { (1,maxTextureCubemap), (1,maxTextureCubemap), 6 } + * { (1,maxSurfaceCubemap), (1,maxSurfaceCubemap), 6 } + * + * + * Cubemap Layered + * { (1,maxTextureCubemapLayered[0]), (1,maxTextureCubemapLayered[0]), + * (1,maxTextureCubemapLayered[1]) } + * { (1,maxSurfaceCubemapLayered[0]), (1,maxSurfaceCubemapLayered[0]), + * (1,maxSurfaceCubemapLayered[1]) } + * + * + * + *
+ * \endxmlonly + * + * \param array - Pointer to allocated array in device memory + * \param desc - Requested channel format + * \param extent - Requested allocation size (\p width field in elements) + * \param flags - Flags for extensions + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorMemoryAllocation + * \notefnerr + * + * \sa ::cudaMalloc3D, ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, + * ::cudaFreeArray, + * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", + * ::cudaFreeHost, ::cudaHostAlloc, + * ::make_cudaExtent, + * ::cuArray3DCreate + */ +extern __host__ cudaError_t CUDARTAPI cudaMalloc3DArray(cudaArray_t *array, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int flags __dv(0)); + +/** + * \brief Allocate a mipmapped array on the device + * + * Allocates a CUDA mipmapped array according to the ::cudaChannelFormatDesc structure + * \p desc and returns a handle to the new CUDA mipmapped array in \p *mipmappedArray. + * \p numLevels specifies the number of mipmap levels to be allocated. This value is + * clamped to the range [1, 1 + floor(log2(max(width, height, depth)))]. + * + * The ::cudaChannelFormatDesc is defined as: + * \code + struct cudaChannelFormatDesc { + int x, y, z, w; + enum cudaChannelFormatKind f; + }; + \endcode + * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned, + * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat. + * + * ::cudaMallocMipmappedArray() can allocate the following: + * + * - A 1D mipmapped array is allocated if the height and depth extents are both zero. + * - A 2D mipmapped array is allocated if only the depth extent is zero. + * - A 3D mipmapped array is allocated if all three extents are non-zero. + * - A 1D layered CUDA mipmapped array is allocated if only the height extent is zero and + * the cudaArrayLayered flag is set. Each layer is a 1D mipmapped array. The number of layers is + * determined by the depth extent. + * - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and + * the cudaArrayLayered flag is set. Each layer is a 2D mipmapped array. The number of layers is + * determined by the depth extent. + * - A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the + * cudaArrayCubemap flag is set. Width must be equal to height, and depth must be six. + * The order of the six layers in memory is the same as that listed in ::cudaGraphicsCubeFace. + * - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero, and both, + * cudaArrayCubemap and cudaArrayLayered flags are set. Width must be equal to height, and depth must be + * a multiple of six. A cubemap layered CUDA mipmapped array is a special type of 2D layered CUDA mipmapped + * array that consists of a collection of cubemap mipmapped arrays. The first six layers represent the + * first cubemap mipmapped array, the next six layers form the second cubemap mipmapped array, and so on. + * + * + * The \p flags parameter enables different options to be specified that affect + * the allocation, as follows. + * - ::cudaArrayDefault: This flag's value is defined to be 0 and provides default mipmapped array allocation + * - ::cudaArrayLayered: Allocates a layered CUDA mipmapped array, with the depth extent indicating the number of layers + * - ::cudaArrayCubemap: Allocates a cubemap CUDA mipmapped array. Width must be equal to height, and depth must be six. 
+ * If the cudaArrayLayered flag is also set, depth must be a multiple of six. + * - ::cudaArraySurfaceLoadStore: This flag indicates that individual mipmap levels of the CUDA mipmapped array + * will be read from or written to using a surface reference. + * - ::cudaArrayTextureGather: This flag indicates that texture gather operations will be performed on the CUDA + * array. Texture gather can only be performed on 2D CUDA mipmapped arrays, and the gather operations are + * performed only on the most detailed mipmap level. + * + * The width, height and depth extents must meet certain size requirements as listed in the following table. + * All values are specified in elements. + * + * \xmlonly + * + * + * + * + * + * + * + * CUDA array type + * Valid extents that must always be met {(width range in elements), + * (height range), (depth range)} + * Valid extents with cudaArraySurfaceLoadStore set {(width range in + * elements), (height range), (depth range)} + * + * + * + * + * 1D + * { (1,maxTexture1DMipmap), 0, 0 } + * { (1,maxSurface1D), 0, 0 } + * + * + * 2D + * { (1,maxTexture2DMipmap[0]), (1,maxTexture2DMipmap[1]), 0 } + * { (1,maxSurface2D[0]), (1,maxSurface2D[1]), 0 } + * + * + * 3D + * { (1,maxTexture3D[0]), (1,maxTexture3D[1]), (1,maxTexture3D[2]) } + * OR { (1,maxTexture3DAlt[0]), (1,maxTexture3DAlt[1]), + * (1,maxTexture3DAlt[2]) } + * { (1,maxSurface3D[0]), (1,maxSurface3D[1]), (1,maxSurface3D[2]) } + * + * + * 1D Layered + * { (1,maxTexture1DLayered[0]), 0, (1,maxTexture1DLayered[1]) } + * { (1,maxSurface1DLayered[0]), 0, (1,maxSurface1DLayered[1]) } + * + * + * 2D Layered + * { (1,maxTexture2DLayered[0]), (1,maxTexture2DLayered[1]), + * (1,maxTexture2DLayered[2]) } + * { (1,maxSurface2DLayered[0]), (1,maxSurface2DLayered[1]), + * (1,maxSurface2DLayered[2]) } + * + * + * Cubemap + * { (1,maxTextureCubemap), (1,maxTextureCubemap), 6 } + * { (1,maxSurfaceCubemap), (1,maxSurfaceCubemap), 6 } + * + * + * Cubemap Layered + * { (1,maxTextureCubemapLayered[0]), (1,maxTextureCubemapLayered[0]), + * (1,maxTextureCubemapLayered[1]) } + * { (1,maxSurfaceCubemapLayered[0]), (1,maxSurfaceCubemapLayered[0]), + * (1,maxSurfaceCubemapLayered[1]) } + * + * + * + *
+ * \endxmlonly
+ *
+ * \param mipmappedArray - Pointer to allocated mipmapped array in device memory
+ * \param desc - Requested channel format
+ * \param extent - Requested allocation size (\p width field in elements)
+ * \param numLevels - Number of mipmap levels to allocate
+ * \param flags - Flags for extensions
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ *
+ * \sa ::cudaMalloc3D, ::cudaMalloc, ::cudaMallocPitch, ::cudaFree,
+ * ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc,
+ * ::make_cudaExtent,
+ * ::cuMipmappedArrayCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMallocMipmappedArray(cudaMipmappedArray_t *mipmappedArray, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int numLevels, unsigned int flags __dv(0));
+
+/**
+ * \brief Gets a mipmap level of a CUDA mipmapped array
+ *
+ * Returns in \p *levelArray a CUDA array that represents a single mipmap level
+ * of the CUDA mipmapped array \p mipmappedArray.
+ *
+ * If \p level is greater than the maximum number of levels in this mipmapped array,
+ * ::cudaErrorInvalidValue is returned.
+ *
+ * \param levelArray - Returned mipmap level CUDA array
+ * \param mipmappedArray - CUDA mipmapped array
+ * \param level - Mipmap level
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \sa ::cudaMalloc3D, ::cudaMalloc, ::cudaMallocPitch, ::cudaFree,
+ * ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc,
+ * ::make_cudaExtent,
+ * ::cuMipmappedArrayGetLevel
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetMipmappedArrayLevel(cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray, unsigned int level);
+
+/**
+ * \brief Copies data between 3D objects
+ *
+\code
+struct cudaExtent {
+ size_t width;
+ size_t height;
+ size_t depth;
+};
+struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d);
+
+struct cudaPos {
+ size_t x;
+ size_t y;
+ size_t z;
+};
+struct cudaPos make_cudaPos(size_t x, size_t y, size_t z);
+
+struct cudaMemcpy3DParms {
+ cudaArray_t srcArray;
+ struct cudaPos srcPos;
+ struct cudaPitchedPtr srcPtr;
+ cudaArray_t dstArray;
+ struct cudaPos dstPos;
+ struct cudaPitchedPtr dstPtr;
+ struct cudaExtent extent;
+ enum cudaMemcpyKind kind;
+};
+\endcode
+ *
+ * ::cudaMemcpy3D() copies data between two 3D objects. The source and
+ * destination objects may be in either host memory, device memory, or a CUDA
+ * array. The source, destination, extent, and kind of copy performed are
+ * specified by the ::cudaMemcpy3DParms struct, which should be initialized to
+ * zero before use:
+\code
+cudaMemcpy3DParms myParms = {0};
+\endcode
+ *
+ * The struct passed to ::cudaMemcpy3D() must specify one of \p srcArray or
+ * \p srcPtr and one of \p dstArray or \p dstPtr. Passing more than one
+ * non-zero source or destination will cause ::cudaMemcpy3D() to return an
+ * error.
+ *
+ * The \p srcPos and \p dstPos fields are optional offsets into the source and
+ * destination objects and are defined in units of each object's elements. The
+ * element for a host or device pointer is assumed to be unsigned char.
+ *
+ * The \p extent field defines the dimensions of the transferred area in
+ * elements. If a CUDA array is participating in the copy, the extent is
+ * defined in terms of that array's elements. 
If no CUDA array is
+ * participating in the copy then the extents are defined in elements of
+ * unsigned char.
+ *
+ * The \p kind field defines the direction of the copy. It must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ * For ::cudaMemcpyHostToHost or ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost
+ * passed as kind and cudaArray type passed as source or destination, if the kind
+ * implies cudaArray type to be present on the host, ::cudaMemcpy3D() will
+ * disregard that implication and silently correct the kind based on the fact that
+ * cudaArray type can only be present on the device.
+ *
+ * If the source and destination are both arrays, ::cudaMemcpy3D() will return
+ * an error if they do not have the same element size.
+ *
+ * The source and destination objects may not overlap. If overlapping source
+ * and destination objects are specified, undefined behavior will result.
+ *
+ * The source object must lie entirely within the region defined by \p srcPos
+ * and \p extent. The destination object must lie entirely within the region
+ * defined by \p dstPos and \p extent.
+ *
+ * ::cudaMemcpy3D() returns an error if the pitch of \p srcPtr or \p dstPtr
+ * exceeds the maximum allowed. The pitch of a ::cudaPitchedPtr allocated
+ * with ::cudaMalloc3D() will always be valid.
+ *
+ * \param p - 3D memory copy parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaMemset3D, ::cudaMemcpy3DAsync,
+ * ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::make_cudaExtent, ::make_cudaPos,
+ * ::cuMemcpy3D
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3DParms *p);
+
+/**
+ * \brief Copies memory between devices
+ *
+ * Performs a 3D memory copy according to the parameters specified in
+ * \p p. See the definition of the ::cudaMemcpy3DPeerParms structure
+ * for documentation of its parameters.
+ *
+ * Note that this function is synchronous with respect to the host only if
+ * the source or destination of the transfer is host memory. Note also
+ * that this copy is serialized with respect to all pending and future
+ * asynchronous work in the current device, the copy's source device,
+ * and the copy's destination device (use ::cudaMemcpy3DPeerAsync to avoid
+ * this synchronization). 
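+ *
+ * A minimal sketch (illustrative only; assumes devices 0 and 1 are visible,
+ * and that \p src0 / \p dst1 are hypothetical pitched allocations on them):
+\code
+cudaMemcpy3DPeerParms p = {0};
+p.srcDevice = 0;
+p.srcPtr    = make_cudaPitchedPtr(src0, pitch0, widthBytes, height);
+p.dstDevice = 1;
+p.dstPtr    = make_cudaPitchedPtr(dst1, pitch1, widthBytes, height);
+// No CUDA array participates, so the extent width is given in bytes.
+p.extent    = make_cudaExtent(widthBytes, height, depth);
+cudaMemcpy3DPeer(&p);
+\endcode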
+ *
+ * \param p - Parameters for the memory copy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync, ::cudaMemcpyPeerAsync,
+ * ::cudaMemcpy3DPeerAsync,
+ * ::cuMemcpy3DPeer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p);
+
+/**
+ * \brief Copies data between 3D objects
+ *
+\code
+struct cudaExtent {
+ size_t width;
+ size_t height;
+ size_t depth;
+};
+struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d);
+
+struct cudaPos {
+ size_t x;
+ size_t y;
+ size_t z;
+};
+struct cudaPos make_cudaPos(size_t x, size_t y, size_t z);
+
+struct cudaMemcpy3DParms {
+ cudaArray_t srcArray;
+ struct cudaPos srcPos;
+ struct cudaPitchedPtr srcPtr;
+ cudaArray_t dstArray;
+ struct cudaPos dstPos;
+ struct cudaPitchedPtr dstPtr;
+ struct cudaExtent extent;
+ enum cudaMemcpyKind kind;
+};
+\endcode
+ *
+ * ::cudaMemcpy3DAsync() copies data between two 3D objects. The source and
+ * destination objects may be in either host memory, device memory, or a CUDA
+ * array. The source, destination, extent, and kind of copy performed are
+ * specified by the ::cudaMemcpy3DParms struct, which should be initialized to
+ * zero before use:
+\code
+cudaMemcpy3DParms myParms = {0};
+\endcode
+ *
+ * The struct passed to ::cudaMemcpy3DAsync() must specify one of \p srcArray
+ * or \p srcPtr and one of \p dstArray or \p dstPtr. Passing more than one
+ * non-zero source or destination will cause ::cudaMemcpy3DAsync() to return an
+ * error.
+ *
+ * The \p srcPos and \p dstPos fields are optional offsets into the source and
+ * destination objects and are defined in units of each object's elements. The
+ * element for a host or device pointer is assumed to be unsigned char.
+ * For CUDA arrays, positions must be in the range [0, 2048) for any
+ * dimension.
+ *
+ * The \p extent field defines the dimensions of the transferred area in
+ * elements. If a CUDA array is participating in the copy, the extent is
+ * defined in terms of that array's elements. If no CUDA array is
+ * participating in the copy then the extents are defined in elements of
+ * unsigned char.
+ *
+ * The \p kind field defines the direction of the copy. It must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ * For ::cudaMemcpyHostToHost or ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost
+ * passed as kind and cudaArray type passed as source or destination, if the kind
+ * implies cudaArray type to be present on the host, ::cudaMemcpy3DAsync() will
+ * disregard that implication and silently correct the kind based on the fact that
+ * cudaArray type can only be present on the device.
+ *
+ * If the source and destination are both arrays, ::cudaMemcpy3DAsync() will
+ * return an error if they do not have the same element size.
+ *
+ * The source and destination objects may not overlap. If overlapping source
+ * and destination objects are specified, undefined behavior will result.
+ *
+ * The source object must lie entirely within the region defined by \p srcPos
+ * and \p extent. 
The destination object must lie entirely within the region + * defined by \p dstPos and \p extent. + * + * ::cudaMemcpy3DAsync() returns an error if the pitch of \p srcPtr or + * \p dstPtr exceeds the maximum allowed. The pitch of a + * ::cudaPitchedPtr allocated with ::cudaMalloc3D() will always be valid. + * + * ::cudaMemcpy3DAsync() is asynchronous with respect to the host, so + * the call may return before the copy is complete. The copy can optionally + * be associated to a stream by passing a non-zero \p stream argument. If + * \p kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream + * is non-zero, the copy may overlap with operations in other streams. + * + * The device version of this function only handles device to device copies and + * cannot be given local or shared pointers. + * + * \param p - 3D memory copy parameters + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidPitchValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaMemset3D, ::cudaMemcpy3D, + * ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, + * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::make_cudaExtent, ::make_cudaPos, + * ::cuMemcpy3DAsync + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0)); + +/** + * \brief Copies memory between devices asynchronously. + * + * Perform a 3D memory copy according to the parameters specified in + * \p p. See the definition of the ::cudaMemcpy3DPeerParms structure + * for documentation of its parameters. + * + * \param p - Parameters for the memory copy + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidDevice + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync, ::cudaMemcpyPeerAsync, + * ::cudaMemcpy3DPeerAsync, + * ::cuMemcpy3DPeerAsync + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream __dv(0)); + +/** + * \brief Gets free and total device memory + * + * Returns in \p *free and \p *total respectively, the free and total amount of + * memory available for allocation by the device in bytes. + * + * \param free - Returned free memory in bytes + * \param total - Returned total memory in bytes + * + * \return + * ::cudaSuccess, + * ::cudaErrorInitializationError, + * ::cudaErrorInvalidValue, + * ::cudaErrorLaunchFailure + * \notefnerr + * + * \sa + * ::cuMemGetInfo + */ +extern __host__ cudaError_t CUDARTAPI cudaMemGetInfo(size_t *free, size_t *total); + +/** + * \brief Gets info about the specified cudaArray + * + * Returns in \p *desc, \p *extent and \p *flags respectively, the type, shape + * and flags of \p array. + * + * Any of \p *desc, \p *extent and \p *flags may be specified as NULL. + * + * \param desc - Returned array type + * \param extent - Returned array shape. 
2D arrays will have depth of zero
+ * \param flags - Returned array flags
+ * \param array - The ::cudaArray to get info for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \sa
+ * ::cuArrayGetDescriptor,
+ * ::cuArray3DGetDescriptor
+ */
+extern __host__ cudaError_t CUDARTAPI cudaArrayGetInfo(struct cudaChannelFormatDesc *desc, struct cudaExtent *extent, unsigned int *flags, cudaArray_t array);
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies \p count bytes from the memory area pointed to by \p src to the
+ * memory area pointed to by \p dst, where \p kind specifies the direction
+ * of the copy, and must be one of ::cudaMemcpyHostToHost,
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing. Calling
+ * ::cudaMemcpy() with \p dst and \p src pointers that do not match the
+ * direction of the copy results in undefined behavior.
+ *
+ * \param dst - Destination memory address
+ * \param src - Source memory address
+ * \param count - Size in bytes to copy
+ * \param kind - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ *
+ * \note_sync
+ *
+ * \sa ::cudaMemcpy2D, ::cudaMemcpyToArray,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpyDtoH,
+ * ::cuMemcpyHtoD,
+ * ::cuMemcpyDtoD,
+ * ::cuMemcpy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind);
+
+/**
+ * \brief Copies memory between two devices
+ *
+ * Copies memory from one device to memory on another device. \p dst is the
+ * base device pointer of the destination memory and \p dstDevice is the
+ * destination device. \p src is the base device pointer of the source memory
+ * and \p srcDevice is the source device. \p count specifies the number of bytes
+ * to copy.
+ *
+ * Note that this function is asynchronous with respect to the host, but
+ * serialized with respect to all pending and future asynchronous work in the
+ * current device, \p srcDevice, and \p dstDevice (use ::cudaMemcpyPeerAsync
+ * to avoid this synchronization). 
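+ *
+ * For example (illustrative; \p d1_buf and \p d0_buf are hypothetical device
+ * allocations made on devices 1 and 0 respectively):
+\code
+// Copy bytes of device 0 memory into device 1 memory.
+cudaMemcpyPeer(d1_buf, 1, d0_buf, 0, bytes);
+\endcode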
+ * + * \param dst - Destination device pointer + * \param dstDevice - Destination device + * \param src - Source device pointer + * \param srcDevice - Source device + * \param count - Size of memory copy in bytes + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidDevice + * \notefnerr + * \note_sync + * + * \sa ::cudaMemcpy, ::cudaMemcpyAsync, ::cudaMemcpyPeerAsync, + * ::cudaMemcpy3DPeerAsync, + * ::cuMemcpyPeer + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpyPeer(void *dst, int dstDevice, const void *src, int srcDevice, size_t count); + +/** + * \brief Copies data between host and device + * + * Copies \p count bytes from the memory area pointed to by \p src to the + * CUDA array \p dst starting at the upper left corner + * (\p wOffset, \p hOffset), where \p kind specifies the direction + * of the copy, and must be one of ::cudaMemcpyHostToHost, + * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. + * + * \param dst - Destination memory address + * \param wOffset - Destination starting X offset + * \param hOffset - Destination starting Y offset + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * \note_sync + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpyHtoA, + * ::cuMemcpyDtoA + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind); + +/** + * \brief Copies data between host and device + * + * Copies \p count bytes from the CUDA array \p src starting at the upper + * left corner (\p wOffset, hOffset) to the memory area pointed to by \p dst, + * where \p kind specifies the direction of the copy, and must be one of + * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. 
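+ *
+ * A short sketch (illustrative; \p arr is a hypothetical CUDA array and
+ * \p h_buf a host buffer of at least \p count bytes):
+\code
+// Read count bytes starting at offset (wOffset, hOffset) of the array.
+cudaMemcpyFromArray(h_buf, arr, wOffset, hOffset, count, cudaMemcpyDeviceToHost);
+\endcode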
+ * + * \param dst - Destination memory address + * \param src - Source memory address + * \param wOffset - Source starting X offset + * \param hOffset - Source starting Y offset + * \param count - Size in bytes to copy + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * \note_sync + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpyAtoH, + * ::cuMemcpyAtoD + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind); + +/** + * \brief Copies data between host and device + * + * Copies \p count bytes from the CUDA array \p src starting at the upper + * left corner (\p wOffsetSrc, \p hOffsetSrc) to the CUDA array \p dst + * starting at the upper left corner (\p wOffsetDst, \p hOffsetDst) where + * \p kind specifies the direction of the copy, and must be one of + * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. + * + * \param dst - Destination memory address + * \param wOffsetDst - Destination starting X offset + * \param hOffsetDst - Destination starting Y offset + * \param src - Source memory address + * \param wOffsetSrc - Source starting X offset + * \param hOffsetSrc - Source starting Y offset + * \param count - Size in bytes to copy + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, + * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpyAtoA + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)); + +/** + * \brief Copies data between host and device + * + * Copies a matrix (\p height rows of \p width bytes each) from the memory + * area pointed to by \p src to the memory area pointed to by \p dst, where + * \p kind specifies the direction of the copy, and must be one of + * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. 
However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. \p dpitch and + * \p spitch are the widths in memory in bytes of the 2D arrays pointed to by + * \p dst and \p src, including any padding added to the end of each row. The + * memory areas may not overlap. \p width must not exceed either \p dpitch or + * \p spitch. Calling ::cudaMemcpy2D() with \p dst and \p src pointers that do + * not match the direction of the copy results in an undefined behavior. + * ::cudaMemcpy2D() returns an error if \p dpitch or \p spitch exceeds + * the maximum allowed. + * + * \param dst - Destination memory address + * \param dpitch - Pitch of destination memory + * \param src - Source memory address + * \param spitch - Pitch of source memory + * \param width - Width of matrix transfer (columns in bytes) + * \param height - Height of matrix transfer (rows) + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidPitchValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * + * \sa ::cudaMemcpy, ::cudaMemcpyToArray, + * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpy2D, + * ::cuMemcpy2DUnaligned + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind); + +/** + * \brief Copies data between host and device + * + * Copies a matrix (\p height rows of \p width bytes each) from the memory + * area pointed to by \p src to the CUDA array \p dst starting at the + * upper left corner (\p wOffset, \p hOffset) where \p kind specifies the + * direction of the copy, and must be one of ::cudaMemcpyHostToHost, + * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. + * \p spitch is the width in memory in bytes of the 2D array pointed to by + * \p src, including any padding added to the end of each row. \p wOffset + + * \p width must not exceed the width of the CUDA array \p dst. \p width must + * not exceed \p spitch. ::cudaMemcpy2DToArray() returns an error if \p spitch + * exceeds the maximum allowed. 
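+ *
+ * For instance (illustrative; \p arr is a hypothetical CUDA array and
+ * \p h_img a densely packed host image of \p height rows of \p widthBytes
+ * bytes each):
+\code
+// The source is densely packed, so spitch equals the row width in bytes.
+cudaMemcpy2DToArray(arr, 0, 0, h_img, widthBytes, widthBytes, height,
+                    cudaMemcpyHostToDevice);
+\endcode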
+ * + * \param dst - Destination memory address + * \param wOffset - Destination starting X offset + * \param hOffset - Destination starting Y offset + * \param src - Source memory address + * \param spitch - Pitch of source memory + * \param width - Width of matrix transfer (columns in bytes) + * \param height - Height of matrix transfer (rows) + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidPitchValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * \note_sync + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, + * ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpy2D, + * ::cuMemcpy2DUnaligned + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind); + +/** + * \brief Copies data between host and device + * + * Copies a matrix (\p height rows of \p width bytes each) from the CUDA + * array \p srcArray starting at the upper left corner + * (\p wOffset, \p hOffset) to the memory area pointed to by \p dst, where + * \p kind specifies the direction of the copy, and must be one of + * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. \p dpitch is the + * width in memory in bytes of the 2D array pointed to by \p dst, including any + * padding added to the end of each row. \p wOffset + \p width must not exceed + * the width of the CUDA array \p src. \p width must not exceed \p dpitch. + * ::cudaMemcpy2DFromArray() returns an error if \p dpitch exceeds the maximum + * allowed. 
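+ *
+ * For instance (illustrative; the mirror image of the ::cudaMemcpy2DToArray
+ * sketch above, again with a densely packed host buffer):
+\code
+cudaMemcpy2DFromArray(h_img, widthBytes, arr, 0, 0, widthBytes, height,
+                      cudaMemcpyDeviceToHost);
+\endcode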
+ * + * \param dst - Destination memory address + * \param dpitch - Pitch of destination memory + * \param src - Source memory address + * \param wOffset - Source starting X offset + * \param hOffset - Source starting Y offset + * \param width - Width of matrix transfer (columns in bytes) + * \param height - Height of matrix transfer (rows) + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidPitchValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * \note_sync + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, + * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, + * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpy2D, + * ::cuMemcpy2DUnaligned + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind); + +/** + * \brief Copies data between host and device + * + * Copies a matrix (\p height rows of \p width bytes each) from the CUDA + * array \p srcArray starting at the upper left corner + * (\p wOffsetSrc, \p hOffsetSrc) to the CUDA array \p dst starting at + * the upper left corner (\p wOffsetDst, \p hOffsetDst), where \p kind + * specifies the direction of the copy, and must be one of + * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. + * \p wOffsetDst + \p width must not exceed the width of the CUDA array \p dst. + * \p wOffsetSrc + \p width must not exceed the width of the CUDA array \p src. 
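+ *
+ * For instance (illustrative; \p srcArr and \p dstArr are hypothetical CUDA
+ * arrays with the same element size):
+\code
+// Copy a widthBytes x height region between the top-left corners.
+cudaMemcpy2DArrayToArray(dstArr, 0, 0, srcArr, 0, 0, widthBytes, height,
+                         cudaMemcpyDeviceToDevice);
+\endcode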
+ * + * \param dst - Destination memory address + * \param wOffsetDst - Destination starting X offset + * \param hOffsetDst - Destination starting Y offset + * \param src - Source memory address + * \param wOffsetSrc - Source starting X offset + * \param hOffsetSrc - Source starting Y offset + * \param width - Width of matrix transfer (columns in bytes) + * \param height - Height of matrix transfer (rows) + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * \note_sync + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, + * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpyArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpy2D, + * ::cuMemcpy2DUnaligned + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)); + +/** + * \brief Copies data to the given symbol on the device + * + * Copies \p count bytes from the memory area pointed to by \p src + * to the memory area pointed to by \p offset bytes from the start of symbol + * \p symbol. The memory areas may not overlap. \p symbol is a variable that + * resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. + * Passing ::cudaMemcpyDefault is recommended, in which case the type of + * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault + * is only allowed on systems that support unified virtual addressing. + * + * \param symbol - Device symbol address + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_sync + * \note_string_api_deprecation + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, + * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpy, + * ::cuMemcpyHtoD, + * ::cuMemcpyDtoD + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(const void *symbol, const void *src, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice)); + +/** + * \brief Copies data from the given symbol on the device + * + * Copies \p count bytes from the memory area pointed to by \p offset bytes + * from the start of symbol \p symbol to the memory area pointed to by \p dst. + * The memory areas may not overlap. \p symbol is a variable that + * resides in global or constant memory space. 
\p kind can be either + * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. + * Passing ::cudaMemcpyDefault is recommended, in which case the type of + * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault + * is only allowed on systems that support unified virtual addressing. + * + * \param dst - Destination memory address + * \param symbol - Device symbol address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_sync + * \note_string_api_deprecation + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, + * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpy, + * ::cuMemcpyDtoH, + * ::cuMemcpyDtoD + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(void *dst, const void *symbol, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost)); + + +/** + * \brief Copies data between host and device + * + * Copies \p count bytes from the memory area pointed to by \p src to the + * memory area pointed to by \p dst, where \p kind specifies the + * direction of the copy, and must be one of ::cudaMemcpyHostToHost, + * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. + * + * The memory areas may not overlap. Calling ::cudaMemcpyAsync() with \p dst and + * \p src pointers that do not match the direction of the copy results in an + * undefined behavior. + * + * ::cudaMemcpyAsync() is asynchronous with respect to the host, so the call + * may return before the copy is complete. The copy can optionally be + * associated to a stream by passing a non-zero \p stream argument. If \p kind + * is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and the \p stream is + * non-zero, the copy may overlap with operations in other streams. + * + * The device version of this function only handles device to device copies and + * cannot be given local or shared pointers. 
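+ *
+ * A typical pattern (illustrative; \p d_buf and \p h_pinned are hypothetical
+ * device and page-locked host buffers, since overlap requires pinned host
+ * memory):
+\code
+cudaStream_t stream;
+cudaStreamCreate(&stream);
+cudaMemcpyAsync(d_buf, h_pinned, bytes, cudaMemcpyHostToDevice, stream);
+// Work issued to the same stream is ordered after the copy, e.g.
+// kernel<<<grid, block, 0, stream>>>(d_buf);
+cudaStreamSynchronize(stream);
+\endcode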
+ * + * \param dst - Destination memory address + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param kind - Type of transfer + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, + * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpy2DAsync, + * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync + * ::cuMemcpyAsync, + * ::cuMemcpyDtoHAsync, + * ::cuMemcpyHtoDAsync, + * ::cuMemcpyDtoDAsync + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + +/** + * \brief Copies memory between two devices asynchronously. + * + * Copies memory from one device to memory on another device. \p dst is the + * base device pointer of the destination memory and \p dstDevice is the + * destination device. \p src is the base device pointer of the source memory + * and \p srcDevice is the source device. \p count specifies the number of bytes + * to copy. + * + * Note that this function is asynchronous with respect to the host and all work + * on other devices. + * + * \param dst - Destination device pointer + * \param dstDevice - Destination device + * \param src - Source device pointer + * \param srcDevice - Source device + * \param count - Size of memory copy in bytes + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidDevice + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync, + * ::cudaMemcpy3DPeerAsync, + * ::cuMemcpyPeerAsync + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice, size_t count, cudaStream_t stream __dv(0)); + +/** + * \brief Copies data between host and device + * + * Copies \p count bytes from the memory area pointed to by \p src to the + * CUDA array \p dst starting at the upper left corner + * (\p wOffset, \p hOffset), where \p kind specifies the + * direction of the copy, and must be one of ::cudaMemcpyHostToHost, + * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. + * + * ::cudaMemcpyToArrayAsync() is asynchronous with respect to the host, so + * the call may return before the copy is complete. The copy can optionally + * be associated to a stream by passing a non-zero \p stream argument. If \p + * kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream + * is non-zero, the copy may overlap with operations in other streams. 
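+ *
+ * For example (illustrative; \p arr, \p h_pinned, and \p stream are
+ * hypothetical and follow the ::cudaMemcpyAsync sketch above):
+\code
+cudaMemcpyToArrayAsync(arr, wOffset, hOffset, h_pinned, count,
+                       cudaMemcpyHostToDevice, stream);
+\endcode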
+ * + * \param dst - Destination memory address + * \param wOffset - Destination starting X offset + * \param hOffset - Destination starting Y offset + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param kind - Type of transfer + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, + * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpyHtoAAsync, + * ::cuMemcpy2DAsync + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + +/** + * \brief Copies data between host and device + * + * Copies \p count bytes from the CUDA array \p src starting at the upper + * left corner (\p wOffset, hOffset) to the memory area pointed to by \p dst, + * where \p kind specifies the direction of the copy, and must be one of + * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. + * + * ::cudaMemcpyFromArrayAsync() is asynchronous with respect to the host, so + * the call may return before the copy is complete. The copy can optionally + * be associated to a stream by passing a non-zero \p stream argument. If \p + * kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream + * is non-zero, the copy may overlap with operations in other streams. 
+ * + * \param dst - Destination memory address + * \param src - Source memory address + * \param wOffset - Source starting X offset + * \param hOffset - Source starting Y offset + * \param count - Size in bytes to copy + * \param kind - Type of transfer + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, + * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpyAtoHAsync, + * ::cuMemcpy2DAsync + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + +/** + * \brief Copies data between host and device + * + * Copies a matrix (\p height rows of \p width bytes each) from the memory + * area pointed to by \p src to the memory area pointed to by \p dst, where + * \p kind specifies the direction of the copy, and must be one of + * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. + * \p dpitch and \p spitch are the widths in memory in bytes of the 2D arrays + * pointed to by \p dst and \p src, including any padding added to the end of + * each row. The memory areas may not overlap. \p width must not exceed either + * \p dpitch or \p spitch. + * + * Calling ::cudaMemcpy2DAsync() with \p dst and \p src pointers that do not + * match the direction of the copy results in an undefined behavior. + * ::cudaMemcpy2DAsync() returns an error if \p dpitch or \p spitch is greater + * than the maximum allowed. + * + * ::cudaMemcpy2DAsync() is asynchronous with respect to the host, so + * the call may return before the copy is complete. The copy can optionally + * be associated to a stream by passing a non-zero \p stream argument. If + * \p kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and + * \p stream is non-zero, the copy may overlap with operations in other + * streams. + * + * The device version of this function only handles device to device copies and + * cannot be given local or shared pointers. 
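+ *
+ * For example (illustrative; \p d_img is a hypothetical pitched device
+ * allocation, e.g. from ::cudaMallocPitch, and \p h_pinned a pinned host
+ * image):
+\code
+cudaMemcpy2DAsync(d_img, dpitch, h_pinned, spitch, widthBytes, height,
+                  cudaMemcpyHostToDevice, stream);
+\endcode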
+ * + * \param dst - Destination memory address + * \param dpitch - Pitch of destination memory + * \param src - Source memory address + * \param spitch - Pitch of source memory + * \param width - Width of matrix transfer (columns in bytes) + * \param height - Height of matrix transfer (rows) + * \param kind - Type of transfer + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidPitchValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, + * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, + * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpy2DAsync + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + +/** + * \brief Copies data between host and device + * + * Copies a matrix (\p height rows of \p width bytes each) from the memory + * area pointed to by \p src to the CUDA array \p dst starting at the + * upper left corner (\p wOffset, \p hOffset) where \p kind specifies the + * direction of the copy, and must be one of ::cudaMemcpyHostToHost, + * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. + * \p spitch is the width in memory in bytes of the 2D array pointed to by + * \p src, including any padding added to the end of each row. \p wOffset + + * \p width must not exceed the width of the CUDA array \p dst. \p width must + * not exceed \p spitch. ::cudaMemcpy2DToArrayAsync() returns an error if + * \p spitch exceeds the maximum allowed. + * + * ::cudaMemcpy2DToArrayAsync() is asynchronous with respect to the host, so + * the call may return before the copy is complete. The copy can optionally + * be associated to a stream by passing a non-zero \p stream argument. If + * \p kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and + * \p stream is non-zero, the copy may overlap with operations in other + * streams. 
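+ *
+ * For illustration (reusing the hypothetical packed host image \p h_img,
+ * its \p width and \p height, and a previously created CUDA array
+ * \p cuArray; error checking omitted), staging the whole image into the
+ * array at offset (0, 0):
+ * \code
+ cudaMemcpy2DToArrayAsync(cuArray, 0, 0, h_img, width, width, height,
+                          cudaMemcpyHostToDevice, stream);
+ * \endcode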
+ *
+ * \param dst - Destination memory address
+ * \param wOffset - Destination starting X offset
+ * \param hOffset - Destination starting Y offset
+ * \param src - Source memory address
+ * \param spitch - Pitch of source memory
+ * \param width - Width of matrix transfer (columns in bytes)
+ * \param height - Height of matrix transfer (rows)
+ * \param kind - Type of transfer
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpyToArrayAsync,
+ * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy2DAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies a matrix (\p height rows of \p width bytes each) from the CUDA
+ * array \p src starting at the upper left corner
+ * (\p wOffset, \p hOffset) to the memory area pointed to by \p dst, where
+ * \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ * \p dpitch is the width in memory in bytes of the 2D
+ * array pointed to by \p dst, including any padding added to the end of each
+ * row. \p wOffset + \p width must not exceed the width of the CUDA array
+ * \p src. \p width must not exceed \p dpitch. ::cudaMemcpy2DFromArrayAsync()
+ * returns an error if \p dpitch exceeds the maximum allowed.
+ *
+ * ::cudaMemcpy2DFromArrayAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally be
+ * associated to a stream by passing a non-zero \p stream argument. If \p kind
+ * is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream is
+ * non-zero, the copy may overlap with operations in other streams.
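+ *
+ * A short sketch of the reverse direction (same hypothetical names as
+ * above; the destination buffer is packed, so \p dpitch equals the row
+ * width in bytes):
+ * \code
+ cudaMemcpy2DFromArrayAsync(h_img, width, cuArray, 0, 0, width, height,
+                            cudaMemcpyDeviceToHost, stream);
+ cudaStreamSynchronize(stream); // wait before using h_img on the host
+ * \endcode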
+ *
+ * \param dst - Destination memory address
+ * \param dpitch - Pitch of destination memory
+ * \param src - Source memory address
+ * \param wOffset - Source starting X offset
+ * \param hOffset - Source starting Y offset
+ * \param width - Width of matrix transfer (columns in bytes)
+ * \param height - Height of matrix transfer (rows)
+ * \param kind - Type of transfer
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpyFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy2DAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Copies data to the given symbol on the device
+ *
+ * Copies \p count bytes from the memory area pointed to by \p src
+ * to the memory area \p offset bytes from the start of symbol
+ * \p symbol. The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer
+ * is inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * ::cudaMemcpyToSymbolAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally
+ * be associated to a stream by passing a non-zero \p stream argument. If
+ * \p kind is ::cudaMemcpyHostToDevice and \p stream is non-zero, the copy
+ * may overlap with operations in other streams.
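+ *
+ * An illustrative sketch (the symbol \p d_coeffs and the host array are
+ * hypothetical; error checking is omitted):
+ * \code
+ __device__ float d_coeffs[16]; // file-scope device symbol (assumed)
+
+ float h_coeffs[16] = {0};
+ cudaMemcpyToSymbolAsync(d_coeffs, h_coeffs, sizeof(h_coeffs), 0,
+                         cudaMemcpyHostToDevice, stream);
+ * \endcode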
+ *
+ * \param symbol - Device symbol address
+ * \param src - Source memory address
+ * \param count - Size in bytes to copy
+ * \param offset - Offset from start of symbol in bytes
+ * \param kind - Type of transfer
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_string_api_deprecation
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpyAsync,
+ * ::cuMemcpyHtoDAsync,
+ * ::cuMemcpyDtoDAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(const void *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Copies data from the given symbol on the device
+ *
+ * Copies \p count bytes from the memory area \p offset bytes
+ * from the start of symbol \p symbol to the memory area pointed to by \p dst.
+ * The memory areas may not overlap. \p symbol is a variable that resides in
+ * global or constant memory space. \p kind can be either
+ * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer
+ * is inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * ::cudaMemcpyFromSymbolAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally be
+ * associated to a stream by passing a non-zero \p stream argument. If \p kind
+ * is ::cudaMemcpyDeviceToHost and \p stream is non-zero, the copy may overlap
+ * with operations in other streams.
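+ *
+ * A matching sketch for the read-back direction (same hypothetical names
+ * as above; error checking omitted):
+ * \code
+ cudaMemcpyFromSymbolAsync(h_coeffs, d_coeffs, sizeof(h_coeffs), 0,
+                           cudaMemcpyDeviceToHost, stream);
+ cudaStreamSynchronize(stream); // wait before reading h_coeffs
+ * \endcode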
+ *
+ * \param dst - Destination memory address
+ * \param symbol - Device symbol address
+ * \param count - Size in bytes to copy
+ * \param offset - Offset from start of symbol in bytes
+ * \param kind - Type of transfer
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_string_api_deprecation
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync,
+ * ::cuMemcpyAsync,
+ * ::cuMemcpyDtoHAsync,
+ * ::cuMemcpyDtoDAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(void *dst, const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+
+
+/**
+ * \brief Initializes or sets device memory to a value
+ *
+ * Fills the first \p count bytes of the memory area pointed to by \p devPtr
+ * with the constant byte value \p value.
+ *
+ * Note that this function is asynchronous with respect to the host unless
+ * \p devPtr refers to pinned host memory.
+ *
+ * \param devPtr - Pointer to device memory
+ * \param value - Value to set for each byte of specified memory
+ * \param count - Size in bytes to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \notefnerr
+ * \note_memset
+ *
+ * \sa
+ * ::cuMemsetD8,
+ * ::cuMemsetD16,
+ * ::cuMemsetD32
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value, size_t count);
+
+/**
+ * \brief Initializes or sets device memory to a value
+ *
+ * Sets to the specified value \p value a matrix (\p height rows of \p width
+ * bytes each) pointed to by \p devPtr. \p pitch is the width in bytes of the
+ * 2D array pointed to by \p devPtr, including any padding added to the end
+ * of each row. This function performs fastest when the pitch is one that has
+ * been passed back by ::cudaMallocPitch().
+ *
+ * Note that this function is asynchronous with respect to the host unless
+ * \p devPtr refers to pinned host memory.
+ *
+ * \param devPtr - Pointer to 2D device memory
+ * \param pitch - Pitch in bytes of 2D device memory
+ * \param value - Value to set for each byte of specified memory
+ * \param width - Width of matrix set (columns in bytes)
+ * \param height - Height of matrix set (rows)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \notefnerr
+ * \note_memset
+ *
+ * \sa ::cudaMemset, ::cudaMemset3D, ::cudaMemsetAsync,
+ * ::cudaMemset2DAsync, ::cudaMemset3DAsync,
+ * ::cuMemsetD2D8,
+ * ::cuMemsetD2D16,
+ * ::cuMemsetD2D32
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch, int value, size_t width, size_t height);
+
+/**
+ * \brief Initializes or sets device memory to a value
+ *
+ * Initializes each element of a 3D array to the specified value \p value.
+ * The object to initialize is defined by \p pitchedDevPtr. The \p pitch field
+ * of \p pitchedDevPtr is the width in memory in bytes of the 3D array pointed
+ * to by \p pitchedDevPtr, including any padding added to the end of each row.
+ * The \p xsize field specifies the logical width of each row in bytes, while
+ * the \p ysize field specifies the height of each 2D slice in rows.
+ *
+ * The extents of the initialized region are specified as a \p width in bytes,
+ * a \p height in rows, and a \p depth in slices.
+ *
+ * Extents with \p width greater than or equal to the \p xsize of
+ * \p pitchedDevPtr may perform significantly faster than extents narrower
+ * than the \p xsize. Secondarily, extents with \p height equal to the
+ * \p ysize of \p pitchedDevPtr will perform faster than when the \p height is
+ * shorter than the \p ysize.
+ *
+ * This function performs fastest when the \p pitchedDevPtr has been allocated
+ * by ::cudaMalloc3D().
+ *
+ * Note that this function is asynchronous with respect to the host unless
+ * \p pitchedDevPtr refers to pinned host memory.
+ *
+ * \param pitchedDevPtr - Pointer to pitched device memory
+ * \param value - Value to set for each byte of specified memory
+ * \param extent - Size parameters for where to set device memory (\p width field in bytes)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \notefnerr
+ * \note_memset
+ *
+ * \sa ::cudaMemset, ::cudaMemset2D,
+ * ::cudaMemsetAsync, ::cudaMemset2DAsync, ::cudaMemset3DAsync,
+ * ::cudaMalloc3D, ::make_cudaPitchedPtr,
+ * ::make_cudaExtent
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent);
+
+/**
+ * \brief Initializes or sets device memory to a value
+ *
+ * Fills the first \p count bytes of the memory area pointed to by \p devPtr
+ * with the constant byte value \p value.
+ *
+ * ::cudaMemsetAsync() is asynchronous with respect to the host, so
+ * the call may return before the memset is complete. The operation can optionally
+ * be associated to a stream by passing a non-zero \p stream argument.
+ * If \p stream is non-zero, the operation may overlap with operations in other streams.
+ *
+ * The device version of this function only handles device memory and
+ * cannot be given local or shared pointers.
+ *
+ * \param devPtr - Pointer to device memory
+ * \param value - Value to set for each byte of specified memory
+ * \param count - Size in bytes to set
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ *
+ * \sa ::cudaMemset, ::cudaMemset2D, ::cudaMemset3D,
+ * ::cudaMemset2DAsync, ::cudaMemset3DAsync,
+ * ::cuMemsetD8Async,
+ * ::cuMemsetD16Async,
+ * ::cuMemsetD32Async
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Initializes or sets device memory to a value
+ *
+ * Sets to the specified value \p value a matrix (\p height rows of \p width
+ * bytes each) pointed to by \p devPtr. \p pitch is the width in bytes of the
+ * 2D array pointed to by \p devPtr, including any padding added to the end
+ * of each row. This function performs fastest when the pitch is one that has
+ * been passed back by ::cudaMallocPitch().
+ *
+ * ::cudaMemset2DAsync() is asynchronous with respect to the host, so
+ * the call may return before the memset is complete. The operation can optionally
+ * be associated to a stream by passing a non-zero \p stream argument.
+ * If \p stream is non-zero, the operation may overlap with operations in other streams.
+ *
+ * The device version of this function only handles device memory and
+ * cannot be given local or shared pointers.
+ *
+ * \param devPtr - Pointer to 2D device memory
+ * \param pitch - Pitch in bytes of 2D device memory
+ * \param value - Value to set for each byte of specified memory
+ * \param width - Width of matrix set (columns in bytes)
+ * \param height - Height of matrix set (rows)
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ *
+ * \sa ::cudaMemset, ::cudaMemset2D, ::cudaMemset3D,
+ * ::cudaMemsetAsync, ::cudaMemset3DAsync,
+ * ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16Async,
+ * ::cuMemsetD2D32Async
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Initializes or sets device memory to a value
+ *
+ * Initializes each element of a 3D array to the specified value \p value.
+ * The object to initialize is defined by \p pitchedDevPtr. The \p pitch field
+ * of \p pitchedDevPtr is the width in memory in bytes of the 3D array pointed
+ * to by \p pitchedDevPtr, including any padding added to the end of each row.
+ * The \p xsize field specifies the logical width of each row in bytes, while
+ * the \p ysize field specifies the height of each 2D slice in rows.
+ *
+ * The extents of the initialized region are specified as a \p width in bytes,
+ * a \p height in rows, and a \p depth in slices.
+ *
+ * Extents with \p width greater than or equal to the \p xsize of
+ * \p pitchedDevPtr may perform significantly faster than extents narrower
+ * than the \p xsize. Secondarily, extents with \p height equal to the
+ * \p ysize of \p pitchedDevPtr will perform faster than when the \p height is
+ * shorter than the \p ysize.
+ *
+ * This function performs fastest when the \p pitchedDevPtr has been allocated
+ * by ::cudaMalloc3D().
+ *
+ * ::cudaMemset3DAsync() is asynchronous with respect to the host, so
+ * the call may return before the memset is complete. The operation can optionally
+ * be associated to a stream by passing a non-zero \p stream argument.
+ * If \p stream is non-zero, the operation may overlap with operations in other streams.
+ *
+ * The device version of this function only handles device memory and
+ * cannot be given local or shared pointers.
+ *
+ * \param pitchedDevPtr - Pointer to pitched device memory
+ * \param value - Value to set for each byte of specified memory
+ * \param extent - Size parameters for where to set device memory (\p width field in bytes)
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ *
+ * \sa ::cudaMemset, ::cudaMemset2D, ::cudaMemset3D,
+ * ::cudaMemsetAsync, ::cudaMemset2DAsync,
+ * ::cudaMalloc3D, ::make_cudaPitchedPtr,
+ * ::make_cudaExtent
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Finds the address associated with a CUDA symbol
+ *
+ * Returns in \p *devPtr the address of symbol \p symbol on the device.
+ * \p symbol is a variable that resides in global or constant memory space.
+ * If \p symbol cannot be found, or if \p symbol is not declared in the + * global or constant memory space, \p *devPtr is unchanged and the error + * ::cudaErrorInvalidSymbol is returned. + * + * \param devPtr - Return device pointer associated with symbol + * \param symbol - Device symbol address + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_string_api_deprecation + * + * \sa + * \ref ::cudaGetSymbolAddress(void**, const T&) "cudaGetSymbolAddress (C++ API)", + * \ref ::cudaGetSymbolSize(size_t*, const void*) "cudaGetSymbolSize (C API)", + * ::cuModuleGetGlobal + */ +extern __host__ cudaError_t CUDARTAPI cudaGetSymbolAddress(void **devPtr, const void *symbol); + +/** + * \brief Finds the size of the object associated with a CUDA symbol + * + * Returns in \p *size the size of symbol \p symbol. \p symbol is a variable that + * resides in global or constant memory space. If \p symbol cannot be found, or + * if \p symbol is not declared in global or constant memory space, \p *size is + * unchanged and the error ::cudaErrorInvalidSymbol is returned. + * + * \param size - Size of object associated with symbol + * \param symbol - Device symbol address + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_string_api_deprecation + * + * \sa + * \ref ::cudaGetSymbolAddress(void**, const void*) "cudaGetSymbolAddress (C API)", + * \ref ::cudaGetSymbolSize(size_t*, const T&) "cudaGetSymbolSize (C++ API)", + * ::cuModuleGetGlobal + */ +extern __host__ cudaError_t CUDARTAPI cudaGetSymbolSize(size_t *size, const void *symbol); + +/** + * \brief Prefetches memory to the specified destination device + * + * Prefetches memory to the specified destination device. \p devPtr is the + * base device pointer of the memory to be prefetched and \p dstDevice is the + * destination device. \p count specifies the number of bytes to copy. \p stream + * is the stream in which the operation is enqueued. The memory range must refer + * to managed memory allocated via ::cudaMallocManaged or declared via __managed__ variables. + * + * Passing in cudaCpuDeviceId for \p dstDevice will prefetch the data to host memory. If + * \p dstDevice is a GPU, then the device attribute ::cudaDevAttrConcurrentManagedAccess + * must be non-zero. Additionally, \p stream must be associated with a device that has a + * non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess. + * + * The start address and end address of the memory range will be rounded down and rounded up + * respectively to be aligned to CPU page size before the prefetch operation is enqueued + * in the stream. + * + * If no physical memory has been allocated for this region, then this memory region + * will be populated and mapped on the destination device. If there's insufficient + * memory to prefetch the desired region, the Unified Memory driver may evict pages from other + * ::cudaMallocManaged allocations to host memory in order to make room. Device memory + * allocated using ::cudaMalloc or ::cudaMallocArray will not be evicted. + * + * By default, any mappings to the previous location of the migrated pages are removed and + * mappings for the new location are only setup on \p dstDevice. 
The exact behavior however + * also depends on the settings applied to this memory range via ::cudaMemAdvise as described + * below: + * + * If ::cudaMemAdviseSetReadMostly was set on any subset of this memory range, + * then that subset will create a read-only copy of the pages on \p dstDevice. + * + * If ::cudaMemAdviseSetPreferredLocation was called on any subset of this memory + * range, then the pages will be migrated to \p dstDevice even if \p dstDevice is not the + * preferred location of any pages in the memory range. + * + * If ::cudaMemAdviseSetAccessedBy was called on any subset of this memory range, + * then mappings to those pages from all the appropriate processors are updated to + * refer to the new location if establishing such a mapping is possible. Otherwise, + * those mappings are cleared. + * + * Note that this API is not required for functionality and only serves to improve performance + * by allowing the application to migrate data to a suitable location before it is accessed. + * Memory accesses to this range are always coherent and are allowed even when the data is + * actively being migrated. + * + * Note that this function is asynchronous with respect to the host and all work + * on other devices. + * + * \param devPtr - Pointer to be prefetched + * \param count - Size in bytes + * \param dstDevice - Destination device to prefetch to + * \param stream - Stream to enqueue prefetch operation + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidDevice + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync, + * ::cudaMemcpy3DPeerAsync, ::cudaMemAdvise, + * ::cuMemPrefetchAsync + */ +extern __host__ cudaError_t CUDARTAPI cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice, cudaStream_t stream __dv(0)); + +/** + * \brief Advise about the usage of a given memory range + * + * Advise the Unified Memory subsystem about the usage pattern for the memory range + * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + * range will be rounded down and rounded up respectively to be aligned to CPU page size before the + * advice is applied. The memory range must refer to managed memory allocated via ::cudaMallocManaged + * or declared via __managed__ variables. + * + * The \p advice parameter can take the following values: + * - ::cudaMemAdviseSetReadMostly: This implies that the data is mostly going to be read + * from and only occasionally written to. Any read accesses from any processor to this region will create a + * read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cudaMemPrefetchAsync + * is called on this region, it will create a read-only copy of the data on the destination processor. + * If any processor writes to this region, all copies of the corresponding page will be invalidated + * except for the one where the write occurred. The \p device argument is ignored for this advice. + * Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + * that has a non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess. + * Also, if a context is created on a device that does not have the device attribute + * ::cudaDevAttrConcurrentManagedAccess set, then read-duplication will not occur until + * all such contexts are destroyed. 
+ * - ::cudaMemAdviseUnsetReadMostly: Undoes the effect of ::cudaMemAdviseSetReadMostly and also prevents the
+ * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ * copies of the data will be collapsed into a single copy. The location for the collapsed
+ * copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ * copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ * - ::cudaMemAdviseSetPreferredLocation: This advice sets the preferred location for the
+ * data to be the memory belonging to \p device. Passing in cudaCpuDeviceId for \p device sets the
+ * preferred location as host memory. If \p device is a GPU, then it must have a non-zero value for the
+ * device attribute ::cudaDevAttrConcurrentManagedAccess. Setting the preferred location
+ * does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ * when a fault occurs on that memory region. If the data is already in its preferred location and the
+ * faulting processor can establish a mapping without requiring the data to be migrated, then
+ * data migration will be avoided. On the other hand, if the data is not in its preferred location
+ * or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ * it. It is important to note that setting the preferred location does not prevent data prefetching
+ * done using ::cudaMemPrefetchAsync.
+ * Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ * Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device
+ * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ * if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ * If ::cudaMemAdviseSetReadMostly is also set on this memory region or any subset of it, then the
+ * policies associated with that advice will override the policies of this advice.
+ * - ::cudaMemAdviseUnsetPreferredLocation: Undoes the effect of ::cudaMemAdviseSetPreferredLocation
+ * and changes the preferred location to none.
+ * - ::cudaMemAdviseSetAccessedBy: This advice implies that the data will be accessed by \p device.
+ * Passing in ::cudaCpuDeviceId for \p device will set the advice for the CPU. If \p device is a GPU, then
+ * the device attribute ::cudaDevAttrConcurrentManagedAccess must be non-zero.
+ * This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ * it causes the data to always be mapped in the specified processor's page tables, as long as the
+ * location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ * the mappings are updated accordingly.
+ * This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ * Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ * data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ * over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ * migration may be too high. But preventing faults can still help improve performance, and so having
+ * a mapping set up in advance is useful.
+ * Note that on CPU access of this data, the data may be migrated
+ * to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ * ::cudaMemAdviseSetAccessedBy flag set for this data will now have its mapping updated to point to the
+ * page in host memory.
+ * If ::cudaMemAdviseSetReadMostly is also set on this memory region or any subset of it, then the
+ * policies associated with that advice will override the policies of this advice. Additionally, if the
+ * preferred location of this memory region or any subset of it is also \p device, then the policies
+ * associated with ::cudaMemAdviseSetPreferredLocation will override the policies of this advice.
+ * - ::cudaMemAdviseUnsetAccessedBy: Undoes the effect of ::cudaMemAdviseSetAccessedBy. Any mappings to
+ * the data from \p device may be removed at any time causing accesses to result in non-fatal page faults.
+ *
+ * \param devPtr - Pointer to memory to set the advice for
+ * \param count - Size in bytes of the memory range
+ * \param advice - Advice to be applied for the specified memory range
+ * \param device - Device to apply the advice for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync,
+ * ::cudaMemcpy3DPeerAsync, ::cudaMemPrefetchAsync,
+ * ::cuMemAdvise
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemAdvise(const void *devPtr, size_t count, enum cudaMemoryAdvise advice, int device);
+
+/**
+ * \brief Query an attribute of a given memory range
+ *
+ * Query an attribute about the memory range starting at \p devPtr with a size of \p count bytes. The
+ * memory range must refer to managed memory allocated via ::cudaMallocManaged or declared via
+ * __managed__ variables.
+ *
+ * The \p attribute parameter can take the following values:
+ * - ::cudaMemRangeAttributeReadMostly: If this attribute is specified, \p data will be interpreted
+ * as a 32-bit integer, and \p dataSize must be 4. The result returned will be 1 if all pages in the given
+ * memory range have read-duplication enabled, or 0 otherwise.
+ * - ::cudaMemRangeAttributePreferredLocation: If this attribute is specified, \p data will be
+ * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be a GPU device
+ * id if all pages in the memory range have that GPU as their preferred location, or it will be cudaCpuDeviceId
+ * if all pages in the memory range have the CPU as their preferred location, or it will be cudaInvalidDeviceId
+ * if either all the pages don't have the same preferred location or some of the pages don't have a
+ * preferred location at all. Note that the actual location of the pages in the memory range at the time of
+ * the query may be different from the preferred location.
+ * - ::cudaMemRangeAttributeAccessedBy: If this attribute is specified, \p data will be interpreted
+ * as an array of 32-bit integers, and \p dataSize must be a non-zero multiple of 4. The result returned
+ * will be a list of device ids that had ::cudaMemAdviseSetAccessedBy set for that entire memory range.
+ * If any device does not have that advice set for the entire memory range, that device will not be included.
+ * If \p data is larger than the number of devices that have that advice set for that memory range,
+ * cudaInvalidDeviceId will be returned in all the extra space provided. For example, if \p dataSize is 12
+ * (i.e. \p data has 3 elements) and only device 0 has the advice set, then the result returned will be
+ * { 0, cudaInvalidDeviceId, cudaInvalidDeviceId }. If \p data is smaller than the number of devices that have
+ * that advice set, then only as many devices will be returned as can fit in the array. There is no
+ * guarantee on which specific devices will be returned, however.
+ * - ::cudaMemRangeAttributeLastPrefetchLocation: If this attribute is specified, \p data will be
+ * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be the last location
+ * to which all pages in the memory range were prefetched explicitly via ::cudaMemPrefetchAsync. This will either be
+ * a GPU id or cudaCpuDeviceId depending on whether the last location for prefetch was a GPU or the CPU
+ * respectively. If any page in the memory range was never explicitly prefetched or if all pages were not
+ * prefetched to the same location, cudaInvalidDeviceId will be returned. Note that this simply returns the
+ * last location that the application requested to prefetch the memory range to. It gives no indication as to
+ * whether the prefetch operation to that location has completed or even begun.
+ *
+ * \param data - A pointer to a memory location where the result
+ * of the attribute query will be written to.
+ * \param dataSize - The size in bytes of \p data
+ * \param attribute - The attribute to query
+ * \param devPtr - Start of the range to query
+ * \param count - Size of the range to query
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cudaMemRangeGetAttributes, ::cudaMemPrefetchAsync,
+ * ::cudaMemAdvise,
+ * ::cuMemRangeGetAttribute
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttribute(void *data, size_t dataSize, enum cudaMemRangeAttribute attribute, const void *devPtr, size_t count);
+
+/**
+ * \brief Query attributes of a given memory range.
+ *
+ * Query attributes of the memory range starting at \p devPtr with a size of \p count bytes. The
+ * memory range must refer to managed memory allocated via ::cudaMallocManaged or declared via
+ * __managed__ variables. The \p attributes array will be interpreted to have \p numAttributes
+ * entries. The \p dataSizes array will also be interpreted to have \p numAttributes entries.
+ * The results of the query will be stored in \p data.
+ *
+ * The list of supported attributes is given below. Please refer to ::cudaMemRangeGetAttribute for
+ * attribute descriptions and restrictions.
+ *
+ * - ::cudaMemRangeAttributeReadMostly
+ * - ::cudaMemRangeAttributePreferredLocation
+ * - ::cudaMemRangeAttributeAccessedBy
+ * - ::cudaMemRangeAttributeLastPrefetchLocation
+ *
+ * \param data - A two-dimensional array containing pointers to memory
+ * locations where the result of each attribute query will be written to.
+ *
+ * \param dataSizes - Array containing the sizes of each result
+ * \param attributes - An array of attributes to query
+ * (numAttributes and the number of attributes in this array should match)
+ * \param numAttributes - Number of attributes to query
+ * \param devPtr - Start of the range to query
+ * \param count - Size of the range to query
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \sa ::cudaMemRangeGetAttribute, ::cudaMemAdvise,
+ * ::cudaMemPrefetchAsync,
+ * ::cuMemRangeGetAttributes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttributes(void **data, size_t *dataSizes, enum cudaMemRangeAttribute *attributes, size_t numAttributes, const void *devPtr, size_t count);
+
+/** @} */ /* END CUDART_MEMORY */
+
+/**
+ * \defgroup CUDART_UNIFIED Unified Addressing
+ *
+ * ___MANBRIEF___ unified addressing functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the unified addressing functions of the CUDA
+ * runtime application programming interface.
+ *
+ * @{
+ *
+ * \section CUDART_UNIFIED_overview Overview
+ *
+ * CUDA devices can share a unified address space with the host.
+ * For these devices there is no distinction between a device
+ * pointer and a host pointer -- the same pointer value may be
+ * used to access memory from the host program and from a kernel
+ * running on the device (with exceptions enumerated below).
+ *
+ * \section CUDART_UNIFIED_support Supported Platforms
+ *
+ * Whether or not a device supports unified addressing may be
+ * queried by calling ::cudaGetDeviceProperties() with the device
+ * property ::cudaDeviceProp::unifiedAddressing.
+ *
+ * Unified addressing is automatically enabled in 64-bit processes.
+ *
+ * Unified addressing is not yet supported on Windows Vista or
+ * Windows 7 for devices that do not use the TCC driver model.
+ *
+ * \section CUDART_UNIFIED_lookup Looking Up Information from Pointer Values
+ *
+ * It is possible to look up information about the memory which backs a
+ * pointer value. For instance, one may want to know if a pointer points
+ * to host or device memory. As another example, in the case of device
+ * memory, one may want to know on which CUDA device the memory
+ * resides. These properties may be queried using the function
+ * ::cudaPointerGetAttributes().
+ *
+ * Since pointers are unique, it is not necessary to specify information
+ * about the pointers specified to ::cudaMemcpy() and other copy functions.
+ * The copy direction ::cudaMemcpyDefault may be used to specify that the
+ * CUDA runtime should infer the location of the pointer from its value.
+ *
+ * \section CUDART_UNIFIED_automaphost Automatic Mapping of Host Allocated Host Memory
+ *
+ * All host memory allocated through all devices using ::cudaMallocHost() and
+ * ::cudaHostAlloc() is always directly accessible from all devices that
+ * support unified addressing. This is the case regardless of whether or
+ * not the flags ::cudaHostAllocPortable and ::cudaHostAllocMapped are
+ * specified.
+ *
+ * The pointer value through which allocated host memory may be accessed
+ * in kernels on all devices that support unified addressing is the same
+ * as the pointer value through which that memory is accessed on the host.
+ * It is not necessary to call ::cudaHostGetDevicePointer() to get the device
+ * pointer for these allocations.
+ *
+ * Note that this is not the case for memory allocated using the flag
+ * ::cudaHostAllocWriteCombined, as discussed below.
+ *
+ * \section CUDART_UNIFIED_autopeerregister Direct Access of Peer Memory
+ *
+ * Upon enabling direct access from a device that supports unified addressing
+ * to another peer device that supports unified addressing using
+ * ::cudaDeviceEnablePeerAccess(), all memory allocated in the peer device using
+ * ::cudaMalloc() and ::cudaMallocPitch() will immediately be accessible
+ * by the current device. The device pointer value through
+ * which any peer's memory may be accessed in the current device
+ * is the same pointer value through which that memory may be
+ * accessed from the peer device.
+ *
+ * \section CUDART_UNIFIED_exceptions Exceptions, Disjoint Addressing
+ *
+ * Not all memory may be accessed on devices through the same pointer
+ * value through which it is accessed on the host. These exceptions
+ * are host memory registered using ::cudaHostRegister() and host memory
+ * allocated using the flag ::cudaHostAllocWriteCombined. For these
+ * exceptions, there exists a distinct host and device address for the
+ * memory. The device address is guaranteed to not overlap any valid host
+ * pointer range and is guaranteed to have the same value across all devices
+ * that support unified addressing.
+ *
+ * This device address may be queried using ::cudaHostGetDevicePointer()
+ * when a device using unified addressing is current. Either the host
+ * or the unified device pointer value may be used to refer to this memory
+ * in ::cudaMemcpy() and similar functions using the ::cudaMemcpyDefault
+ * memory direction.
+ *
+ */
+
+/**
+ * \brief Returns attributes about a specified pointer
+ *
+ * Returns in \p *attributes the attributes of the pointer \p ptr.
+ * If the pointer was not allocated in, mapped by, or registered with a context
+ * supporting unified addressing, ::cudaErrorInvalidValue is returned.
+ *
+ * The ::cudaPointerAttributes structure is defined as:
+ * \code
+ struct cudaPointerAttributes {
+ enum cudaMemoryType memoryType;
+ int device;
+ void *devicePointer;
+ void *hostPointer;
+ int isManaged;
+ };
+ \endcode
+ * In this structure, the individual fields mean
+ *
+ * - \ref ::cudaPointerAttributes::memoryType "memoryType" identifies the physical
+ * location of the memory associated with pointer \p ptr. It can be
+ * ::cudaMemoryTypeHost for host memory or ::cudaMemoryTypeDevice for device
+ * memory.
+ *
+ * - \ref ::cudaPointerAttributes::device "device" is the device against which
+ * \p ptr was allocated. If \p ptr has memory type ::cudaMemoryTypeDevice
+ * then this identifies the device on which the memory referred to by \p ptr
+ * physically resides. If \p ptr has memory type ::cudaMemoryTypeHost then this
+ * identifies the device which was current when the allocation was made
+ * (and if that device is deinitialized then this allocation will vanish
+ * with that device's state).
+ *
+ * - \ref ::cudaPointerAttributes::devicePointer "devicePointer" is
+ * the device pointer alias through which the memory referred to by \p ptr
+ * may be accessed on the current device.
+ * If the memory referred to by \p ptr cannot be accessed directly by the
+ * current device then this is NULL.
+ *
+ * - \ref ::cudaPointerAttributes::hostPointer "hostPointer" is
+ * the host pointer alias through which the memory referred to by \p ptr
+ * may be accessed on the host.
+ * If the memory referred to by \p ptr cannot be accessed directly by the + * host then this is NULL. + * + * - \ref ::cudaPointerAttributes::isManaged "isManaged" indicates if + * the pointer \p ptr points to managed memory or not. + * + * \param attributes - Attributes for the specified pointer + * \param ptr - Pointer to get attributes for + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidValue + * + * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice, + * ::cudaChooseDevice, + * ::cuPointerGetAttributes + */ +extern __host__ cudaError_t CUDARTAPI cudaPointerGetAttributes(struct cudaPointerAttributes *attributes, const void *ptr); + +/** @} */ /* END CUDART_UNIFIED */ + +/** + * \defgroup CUDART_PEER Peer Device Memory Access + * + * ___MANBRIEF___ peer device memory access functions of the CUDA runtime API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the peer device memory access functions of the CUDA runtime + * application programming interface. + * + * @{ + */ + +/** + * \brief Queries if a device may directly access a peer device's memory. + * + * Returns in \p *canAccessPeer a value of 1 if device \p device is capable of + * directly accessing memory from \p peerDevice and 0 otherwise. If direct + * access of \p peerDevice from \p device is possible, then access may be + * enabled by calling ::cudaDeviceEnablePeerAccess(). + * + * \param canAccessPeer - Returned access capability + * \param device - Device from which allocations on \p peerDevice are to + * be directly accessed. + * \param peerDevice - Device on which the allocations to be directly accessed + * by \p device reside. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice + * \notefnerr + * + * \sa ::cudaDeviceEnablePeerAccess, + * ::cudaDeviceDisablePeerAccess, + * ::cuDeviceCanAccessPeer + */ +extern __host__ cudaError_t CUDARTAPI cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice); + +/** + * \brief Enables direct access to memory allocations on a peer device. + * + * On success, all allocations from \p peerDevice will immediately be accessible by + * the current device. They will remain accessible until access is explicitly + * disabled using ::cudaDeviceDisablePeerAccess() or either device is reset using + * ::cudaDeviceReset(). + * + * Note that access granted by this call is unidirectional and that in order to access + * memory on the current device from \p peerDevice, a separate symmetric call + * to ::cudaDeviceEnablePeerAccess() is required. + * + * Each device can support a system-wide maximum of eight peer connections. + * + * Peer access is not supported in 32 bit applications. + * + * Returns ::cudaErrorInvalidDevice if ::cudaDeviceCanAccessPeer() indicates + * that the current device cannot directly access memory from \p peerDevice. + * + * Returns ::cudaErrorPeerAccessAlreadyEnabled if direct access of + * \p peerDevice from the current device has already been enabled. + * + * Returns ::cudaErrorInvalidValue if \p flags is not 0. 
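+ *
+ * A minimal sketch (device ordinals 0 and 1 are hypothetical; error
+ * checking is omitted) of checking for and enabling peer access:
+ * \code
+ int canAccess = 0;
+ cudaDeviceCanAccessPeer(&canAccess, 0, 1); // can device 0 access device 1?
+ if (canAccess) {
+     cudaSetDevice(0);                  // make device 0 current
+     cudaDeviceEnablePeerAccess(1, 0);  // flags must be 0
+ }
+ * \endcode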
+ * + * \param peerDevice - Peer device to enable direct access to from the current device + * \param flags - Reserved for future use and must be set to 0 + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorPeerAccessAlreadyEnabled, + * ::cudaErrorInvalidValue + * \notefnerr + * + * \sa ::cudaDeviceCanAccessPeer, + * ::cudaDeviceDisablePeerAccess, + * ::cuCtxEnablePeerAccess + */ +extern __host__ cudaError_t CUDARTAPI cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags); + +/** + * \brief Disables direct access to memory allocations on a peer device. + * + * Returns ::cudaErrorPeerAccessNotEnabled if direct access to memory on + * \p peerDevice has not yet been enabled from the current device. + * + * \param peerDevice - Peer device to disable direct access to + * + * \return + * ::cudaSuccess, + * ::cudaErrorPeerAccessNotEnabled, + * ::cudaErrorInvalidDevice + * \notefnerr + * + * \sa ::cudaDeviceCanAccessPeer, + * ::cudaDeviceEnablePeerAccess, + * ::cuCtxDisablePeerAccess + */ +extern __host__ cudaError_t CUDARTAPI cudaDeviceDisablePeerAccess(int peerDevice); + +/** @} */ /* END CUDART_PEER */ + +/** \defgroup CUDART_OPENGL OpenGL Interoperability */ + +/** \defgroup CUDART_OPENGL_DEPRECATED OpenGL Interoperability [DEPRECATED] */ + +/** \defgroup CUDART_D3D9 Direct3D 9 Interoperability */ + +/** \defgroup CUDART_D3D9_DEPRECATED Direct3D 9 Interoperability [DEPRECATED] */ + +/** \defgroup CUDART_D3D10 Direct3D 10 Interoperability */ + +/** \defgroup CUDART_D3D10_DEPRECATED Direct3D 10 Interoperability [DEPRECATED] */ + +/** \defgroup CUDART_D3D11 Direct3D 11 Interoperability */ + +/** \defgroup CUDART_D3D11_DEPRECATED Direct3D 11 Interoperability [DEPRECATED] */ + +/** \defgroup CUDART_VDPAU VDPAU Interoperability */ + +/** \defgroup CUDART_EGL EGL Interoperability */ + +/** + * \defgroup CUDART_INTEROP Graphics Interoperability + * + * ___MANBRIEF___ graphics interoperability functions of the CUDA runtime API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the graphics interoperability functions of the CUDA + * runtime application programming interface. + * + * @{ + */ + +/** + * \brief Unregisters a graphics resource for access by CUDA + * + * Unregisters the graphics resource \p resource so it is not accessible by + * CUDA unless registered again. + * + * If \p resource is invalid then ::cudaErrorInvalidResourceHandle is + * returned. + * + * \param resource - Resource to unregister + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa + * ::cudaGraphicsD3D9RegisterResource, + * ::cudaGraphicsD3D10RegisterResource, + * ::cudaGraphicsD3D11RegisterResource, + * ::cudaGraphicsGLRegisterBuffer, + * ::cudaGraphicsGLRegisterImage, + * ::cuGraphicsUnregisterResource + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnregisterResource(cudaGraphicsResource_t resource); + +/** + * \brief Set usage flags for mapping a graphics resource + * + * Set \p flags for mapping the graphics resource \p resource. + * + * Changes to \p flags will take effect the next time \p resource is mapped. + * The \p flags argument may be any of the following: + * - ::cudaGraphicsMapFlagsNone: Specifies no hints about how \p resource will + * be used. It is therefore assumed that CUDA may read from or write to \p resource. + * - ::cudaGraphicsMapFlagsReadOnly: Specifies that CUDA will not write to \p resource. 
+ * - ::cudaGraphicsMapFlagsWriteDiscard: Specifies that CUDA will not read from \p resource and will
+ * write over the entire contents of \p resource, so none of the data
+ * previously stored in \p resource will be preserved.
+ *
+ * If \p resource is presently mapped for access by CUDA then ::cudaErrorUnknown is returned.
+ * If \p flags is not one of the above values then ::cudaErrorInvalidValue is returned.
+ *
+ * \param resource - Registered resource to set flags for
+ * \param flags - Parameters for resource mapping
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaGraphicsMapResources,
+ * ::cuGraphicsResourceSetMapFlags
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceSetMapFlags(cudaGraphicsResource_t resource, unsigned int flags);
+
+/**
+ * \brief Map graphics resources for access by CUDA
+ *
+ * Maps the \p count graphics resources in \p resources for access by CUDA.
+ *
+ * The resources in \p resources may be accessed by CUDA until they
+ * are unmapped. The graphics API from which \p resources were registered
+ * should not access any resources while they are mapped by CUDA. If an
+ * application does so, the results are undefined.
+ *
+ * This function provides the synchronization guarantee that any graphics calls
+ * issued before ::cudaGraphicsMapResources() will complete before any subsequent CUDA
+ * work issued in \p stream begins.
+ *
+ * If \p resources contains any duplicate entries then ::cudaErrorInvalidResourceHandle
+ * is returned. If any of \p resources are presently mapped for access by
+ * CUDA then ::cudaErrorUnknown is returned.
+ *
+ * \param count - Number of resources to map
+ * \param resources - Resources to map for CUDA
+ * \param stream - Stream for synchronization
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cudaGraphicsSubResourceGetMappedArray,
+ * ::cudaGraphicsUnmapResources,
+ * ::cuGraphicsMapResources
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsMapResources(int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Unmap graphics resources.
+ *
+ * Unmaps the \p count graphics resources in \p resources.
+ *
+ * Once unmapped, the resources in \p resources may not be accessed by CUDA
+ * until they are mapped again.
+ *
+ * This function provides the synchronization guarantee that any CUDA work issued
+ * in \p stream before ::cudaGraphicsUnmapResources() will complete before any
+ * subsequently issued graphics work begins.
+ *
+ * If \p resources contains any duplicate entries then ::cudaErrorInvalidResourceHandle
+ * is returned. If any of \p resources are not presently mapped for access by
+ * CUDA then ::cudaErrorUnknown is returned.
+ *
+ * \param count - Number of resources to unmap
+ * \param resources - Resources to unmap
+ * \param stream - Stream for synchronization
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaGraphicsMapResources,
+ * ::cuGraphicsUnmapResources
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnmapResources(int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Get a device pointer through which to access a mapped graphics resource.
+ *
+ * Returns in \p *devPtr a pointer through which the mapped graphics resource
+ * \p resource may be accessed.
+ * Returns in \p *size the size of the memory in bytes which may be accessed from that pointer.
+ * The value set in \p devPtr may change every time that \p resource is mapped.
+ *
+ * If \p resource is not a buffer then it cannot be accessed via a pointer and
+ * ::cudaErrorUnknown is returned.
+ * If \p resource is not mapped then ::cudaErrorUnknown is returned.
+ *
+ * \param devPtr - Returned pointer through which \p resource may be accessed
+ * \param size - Returned size of the buffer accessible starting at \p *devPtr
+ * \param resource - Mapped resource to access
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaGraphicsMapResources,
+ * ::cudaGraphicsSubResourceGetMappedArray,
+ * ::cuGraphicsResourceGetMappedPointer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedPointer(void **devPtr, size_t *size, cudaGraphicsResource_t resource);
+
+/**
+ * \brief Get an array through which to access a subresource of a mapped graphics resource.
+ *
+ * Returns in \p *array an array through which the subresource of the mapped
+ * graphics resource \p resource which corresponds to array index \p arrayIndex
+ * and mipmap level \p mipLevel may be accessed. The value set in \p array may
+ * change every time that \p resource is mapped.
+ *
+ * If \p resource is not a texture then it cannot be accessed via an array and
+ * ::cudaErrorUnknown is returned.
+ * If \p arrayIndex is not a valid array index for \p resource then
+ * ::cudaErrorInvalidValue is returned.
+ * If \p mipLevel is not a valid mipmap level for \p resource then
+ * ::cudaErrorInvalidValue is returned.
+ * If \p resource is not mapped then ::cudaErrorUnknown is returned.
+ *
+ * \param array - Returned array through which a subresource of \p resource may be accessed
+ * \param resource - Mapped resource to access
+ * \param arrayIndex - Array index for array textures or cubemap face
+ * index as defined by ::cudaGraphicsCubeFace for
+ * cubemap textures for the subresource to access
+ * \param mipLevel - Mipmap level for the subresource to access
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cuGraphicsSubResourceGetMappedArray
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsSubResourceGetMappedArray(cudaArray_t *array, cudaGraphicsResource_t resource, unsigned int arrayIndex, unsigned int mipLevel);
+
+/**
+ * \brief Get a mipmapped array through which to access a mapped graphics resource.
+ *
+ * Returns in \p *mipmappedArray a mipmapped array through which the mapped
+ * graphics resource \p resource may be accessed. The value set in \p mipmappedArray may
+ * change every time that \p resource is mapped.
+ *
+ * If \p resource is not a texture then it cannot be accessed via an array and
+ * ::cudaErrorUnknown is returned.
+ * If \p resource is not mapped then ::cudaErrorUnknown is returned.
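+ *
+ * An illustrative sketch (assumes \p resource was registered earlier via
+ * one of the graphics-API-specific registration calls and that the
+ * underlying resource is a mipmapped texture; error checking omitted):
+ * \code
+ cudaGraphicsMapResources(1, &resource, stream);
+ cudaMipmappedArray_t mipArray;
+ cudaGraphicsResourceGetMappedMipmappedArray(&mipArray, resource);
+ // launch CUDA work that reads or writes mipArray here
+ cudaGraphicsUnmapResources(1, &resource, stream);
+ * \endcode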
+ * + * \param mipmappedArray - Returned mipmapped array through which \p resource may be accessed + * \param resource - Mapped resource to access + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa + * ::cudaGraphicsResourceGetMappedPointer, + * ::cuGraphicsResourceGetMappedMipmappedArray + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedMipmappedArray(cudaMipmappedArray_t *mipmappedArray, cudaGraphicsResource_t resource); + +/** @} */ /* END CUDART_INTEROP */ + +/** + * \defgroup CUDART_TEXTURE Texture Reference Management + * + * ___MANBRIEF___ texture reference management functions of the CUDA runtime + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the low level texture reference management functions + * of the CUDA runtime application programming interface. + * + * Some functions have overloaded C++ API template versions documented separately in the + * \ref CUDART_HIGHLEVEL "C++ API Routines" module. + * + * @{ + */ + +/** + * \brief Get the channel descriptor of an array + * + * Returns in \p *desc the channel descriptor of the CUDA array \p array. + * + * \param desc - Channel format + * \param array - Memory array on device + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \notefnerr + * + * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)", + * ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", + * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", + * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", + * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)" + */ +extern __host__ cudaError_t CUDARTAPI cudaGetChannelDesc(struct cudaChannelFormatDesc *desc, cudaArray_const_t array); + +/** + * \brief Returns a channel descriptor using the specified format + * + * Returns a channel descriptor with format \p f and number of bits of each + * component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is + * defined as: + * \code + struct cudaChannelFormatDesc { + int x, y, z, w; + enum cudaChannelFormatKind f; + }; + * \endcode + * + * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned, + * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat. 
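+ *
+ * \par Example:
+ * A short sketch building a descriptor for a single-channel 32-bit float
+ * format:
+ * \code
+    struct cudaChannelFormatDesc desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+ * \endcode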
+ *
+ * \param x - X component
+ * \param y - Y component
+ * \param z - Z component
+ * \param w - W component
+ * \param f - Channel format
+ *
+ * \return
+ * Channel descriptor with format \p f
+ *
+ * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
+ * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
+ * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)",
+ * ::cuTexRefSetFormat
+ */
+extern __host__ struct cudaChannelFormatDesc CUDARTAPI cudaCreateChannelDesc(int x, int y, int z, int w, enum cudaChannelFormatKind f);
+
+
+/**
+ * \brief Binds a memory area to a texture
+ *
+ * Binds \p size bytes of the memory area pointed to by \p devPtr to the
+ * texture reference \p texref. \p desc describes how the memory is interpreted
+ * when fetching values from the texture. Any memory previously bound to
+ * \p texref is unbound.
+ *
+ * Since the hardware enforces an alignment requirement on texture base
+ * addresses,
+ * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture()"
+ * returns in \p *offset a byte offset that
+ * must be applied to texture fetches in order to read from the desired memory.
+ * This offset must be divided by the texel size and passed to kernels that
+ * read from the texture so it can be applied to the ::tex1Dfetch() function.
+ * If the device memory pointer was returned from ::cudaMalloc(), the offset is
+ * guaranteed to be 0 and NULL may be passed as the \p offset parameter.
+ *
+ * The total number of elements (or texels) in the linear address range
+ * cannot exceed ::cudaDeviceProp::maxTexture1DLinear[0].
+ * The number of elements is computed as (\p size / elementSize),
+ * where elementSize is determined from \p desc.
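+ *
+ * \par Example:
+ * A minimal sketch, assuming \c texRef is a file-scope texture reference
+ * (e.g. \c texture<float,1,cudaReadModeElementType> \c texRef) and \p devPtr
+ * holds \c N floats returned by ::cudaMalloc() (so the offset is guaranteed
+ * to be 0 and NULL may be passed for it):
+ * \code
+    struct cudaChannelFormatDesc desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+    const struct textureReference *texRefPtr;
+    cudaGetTextureReference(&texRefPtr, &texRef);
+    cudaBindTexture(NULL, texRefPtr, devPtr, &desc, N * sizeof(float));
+ * \endcode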
+ *
+ * \param offset - Offset in bytes
+ * \param texref - Texture to bind
+ * \param devPtr - Memory area on device
+ * \param desc - Channel format
+ * \param size - Size of the memory area pointed to by devPtr
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ *
+ * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
+ * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
+ * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)",
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddressMode,
+ * ::cuTexRefSetFormat,
+ * ::cuTexRefSetFlags,
+ * ::cuTexRefSetBorderColor
+ */
+extern __host__ cudaError_t CUDARTAPI cudaBindTexture(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t size __dv(UINT_MAX));
+
+/**
+ * \brief Binds a 2D memory area to a texture
+ *
+ * Binds the 2D memory area pointed to by \p devPtr to the
+ * texture reference \p texref. The size of the area is constrained by
+ * \p width in texel units, \p height in texel units, and \p pitch in byte
+ * units. \p desc describes how the memory is interpreted when fetching values
+ * from the texture. Any memory previously bound to \p texref is unbound.
+ *
+ * Since the hardware enforces an alignment requirement on texture base
+ * addresses, ::cudaBindTexture2D() returns in \p *offset a byte offset that
+ * must be applied to texture fetches in order to read from the desired memory.
+ * This offset must be divided by the texel size and passed to kernels that
+ * read from the texture so it can be applied to the ::tex2D() function.
+ * If the device memory pointer was returned from ::cudaMalloc(), the offset is
+ * guaranteed to be 0 and NULL may be passed as the \p offset parameter.
+ *
+ * \p width and \p height, which are specified in elements (or texels), cannot
+ * exceed ::cudaDeviceProp::maxTexture2DLinear[0] and ::cudaDeviceProp::maxTexture2DLinear[1]
+ * respectively. \p pitch, which is specified in bytes, cannot exceed
+ * ::cudaDeviceProp::maxTexture2DLinear[2].
+ *
+ * The driver returns ::cudaErrorInvalidValue if \p pitch is not a multiple of
+ * ::cudaDeviceProp::texturePitchAlignment.
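+ *
+ * \par Example:
+ * A minimal sketch, assuming \c texRef is a file-scope 2D texture reference
+ * and \c width and \c height are the desired extents in texels; the pitch
+ * returned by ::cudaMallocPitch() satisfies the alignment requirement:
+ * \code
+    struct cudaChannelFormatDesc desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+    float *devPtr;
+    size_t pitch;
+    cudaMallocPitch((void**)&devPtr, &pitch, width * sizeof(float), height);
+    const struct textureReference *texRefPtr;
+    cudaGetTextureReference(&texRefPtr, &texRef);
+    cudaBindTexture2D(NULL, texRefPtr, devPtr, &desc, width, height, pitch);
+ * \endcode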
+ *
+ * \param offset - Offset in bytes
+ * \param texref - Texture reference to bind
+ * \param devPtr - 2D memory area on device
+ * \param desc - Channel format
+ * \param width - Width in texel units
+ * \param height - Height in texel units
+ * \param pitch - Pitch in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ *
+ * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
+ * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)",
+ * ::cuTexRefSetAddress2D,
+ * ::cuTexRefSetFormat,
+ * ::cuTexRefSetFlags,
+ * ::cuTexRefSetAddressMode,
+ * ::cuTexRefSetBorderColor
+ */
+extern __host__ cudaError_t CUDARTAPI cudaBindTexture2D(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t width, size_t height, size_t pitch);
+
+/**
+ * \brief Binds an array to a texture
+ *
+ * Binds the CUDA array \p array to the texture reference \p texref.
+ * \p desc describes how the memory is interpreted when fetching values from
+ * the texture. Any CUDA array previously bound to \p texref is unbound.
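+ *
+ * \par Example:
+ * A minimal sketch, assuming \c texRef is a file-scope texture reference and
+ * \c cuArray is an existing CUDA array; the channel format is queried from
+ * the array so that it matches by construction:
+ * \code
+    struct cudaChannelFormatDesc desc;
+    cudaGetChannelDesc(&desc, cuArray);
+    const struct textureReference *texRefPtr;
+    cudaGetTextureReference(&texRefPtr, &texRef);
+    cudaBindTextureToArray(texRefPtr, cuArray, &desc);
+ * \endcode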
+ *
+ * \param texref - Texture to bind
+ * \param array - Memory array on device
+ * \param desc - Channel format
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ *
+ * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
+ * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
+ * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)",
+ * ::cuTexRefSetArray,
+ * ::cuTexRefSetFormat,
+ * ::cuTexRefSetFlags,
+ * ::cuTexRefSetAddressMode,
+ * ::cuTexRefSetFilterMode,
+ * ::cuTexRefSetBorderColor,
+ * ::cuTexRefSetMaxAnisotropy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaBindTextureToArray(const struct textureReference *texref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc);
+
+/**
+ * \brief Binds a mipmapped array to a texture
+ *
+ * Binds the CUDA mipmapped array \p mipmappedArray to the texture reference \p texref.
+ * \p desc describes how the memory is interpreted when fetching values from
+ * the texture. Any CUDA mipmapped array previously bound to \p texref is unbound.
+ *
+ * \param texref - Texture to bind
+ * \param mipmappedArray - Memory mipmapped array on device
+ * \param desc - Channel format
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ *
+ * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
+ * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
+ * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)",
+ * ::cuTexRefSetMipmappedArray,
+ * ::cuTexRefSetMipmapFilterMode,
+ * ::cuTexRefSetMipmapLevelClamp,
+ * ::cuTexRefSetMipmapLevelBias,
+ * ::cuTexRefSetFormat,
+ * ::cuTexRefSetFlags,
+ * ::cuTexRefSetAddressMode,
+ * ::cuTexRefSetBorderColor,
+ * ::cuTexRefSetMaxAnisotropy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaBindTextureToMipmappedArray(const struct textureReference *texref, cudaMipmappedArray_const_t mipmappedArray, const struct cudaChannelFormatDesc *desc);
+
+/**
+ * \brief Unbinds a texture
+ *
+ * Unbinds the texture bound to \p texref.
+ *
+ * \param texref - Texture to unbind
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ *
+ * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
+ * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
+ * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)"
+ */
+extern __host__ cudaError_t CUDARTAPI cudaUnbindTexture(const struct textureReference *texref);
+
+/**
+ * \brief Get the alignment offset of a texture
+ *
+ * Returns in \p *offset the offset that was returned when texture reference
+ * \p texref was bound.
+ *
+ * \param offset - Offset of texture reference in bytes
+ * \param texref - Texture to get offset of
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidTexture,
+ * ::cudaErrorInvalidTextureBinding
+ * \notefnerr
+ *
+ * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
+ * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
+ * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)"
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetTextureAlignmentOffset(size_t *offset, const struct textureReference *texref);
+
+/**
+ * \brief Get the texture reference associated with a symbol
+ *
+ * Returns in \p *texref the structure associated with the texture reference
+ * defined by symbol \p symbol.
+ *
+ * \param texref - Texture reference associated with symbol
+ * \param symbol - Texture to get reference for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ * \note_string_api_deprecation_50
+ *
+ * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
+ * ::cudaGetChannelDesc,
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)",
+ * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
+ * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
+ * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
+ * ::cuModuleGetTexRef
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetTextureReference(const struct textureReference **texref, const void *symbol);
+
+/** @} */ /* END CUDART_TEXTURE */
+
+/**
+ * \defgroup CUDART_SURFACE Surface Reference Management
+ *
+ * ___MANBRIEF___ surface reference management functions of the CUDA runtime
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the low level surface reference management functions
+ * of the CUDA runtime application programming interface.
+ *
+ * Some functions have overloaded C++ API template versions documented separately in the
+ * \ref CUDART_HIGHLEVEL "C++ API Routines" module.
+ *
+ * @{
+ */
+
+/**
+ * \brief Binds an array to a surface
+ *
+ * Binds the CUDA array \p array to the surface reference \p surfref.
+ * \p desc describes how the memory is interpreted when fetching values from
+ * the surface. Any CUDA array previously bound to \p surfref is unbound.
+ *
+ * \param surfref - Surface to bind
+ * \param array - Memory array on device
+ * \param desc - Channel format
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSurface
+ * \notefnerr
+ *
+ * \sa \ref ::cudaBindSurfaceToArray(const struct surface< T, dim>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindSurfaceToArray (C++ API)",
+ * \ref ::cudaBindSurfaceToArray(const struct surface< T, dim>&, cudaArray_const_t) "cudaBindSurfaceToArray (C++ API, inherited channel descriptor)",
+ * ::cudaGetSurfaceReference,
+ * ::cuSurfRefSetArray
+ */
+extern __host__ cudaError_t CUDARTAPI cudaBindSurfaceToArray(const struct surfaceReference *surfref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc);
+
+/**
+ * \brief Get the surface reference associated with a symbol
+ *
+ * Returns in \p *surfref the structure associated with the surface reference
+ * defined by symbol \p symbol.
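+ *
+ * \par Example:
+ * A minimal sketch, assuming \c surfRef is a file-scope surface reference
+ * (e.g. \c surface<void,2> \c surfRef declared in device code):
+ * \code
+    const struct surfaceReference *surfRefPtr;
+    cudaGetSurfaceReference(&surfRefPtr, &surfRef);
+ * \endcode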
+ *
+ * \param surfref - Surface reference associated with symbol
+ * \param symbol - Surface to get reference for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidSurface
+ * \notefnerr
+ * \note_string_api_deprecation_50
+ *
+ * \sa
+ * \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)",
+ * ::cuModuleGetSurfRef
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceReference(const struct surfaceReference **surfref, const void *symbol);
+
+/** @} */ /* END CUDART_SURFACE */
+
+/**
+ * \defgroup CUDART_TEXTURE_OBJECT Texture Object Management
+ *
+ * ___MANBRIEF___ texture object management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the low level texture object management functions
+ * of the CUDA runtime application programming interface. The texture
+ * object API is only supported on devices of compute capability 3.0 or higher.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates a texture object
+ *
+ * Creates a texture object and returns it in \p pTexObject. \p pResDesc describes
+ * the data to texture from. \p pTexDesc describes how the data should be sampled.
+ * \p pResViewDesc is an optional argument that specifies an alternate format for
+ * the data described by \p pResDesc, and also describes the subresource region
+ * to restrict access to when texturing. \p pResViewDesc can only be specified if
+ * the type of resource is a CUDA array or a CUDA mipmapped array.
+ *
+ * Texture objects are only supported on devices of compute capability 3.0 or higher.
+ * Additionally, a texture object is an opaque value, and, as such, should only be
+ * accessed through CUDA API calls.
+ *
+ * The ::cudaResourceDesc structure is defined as:
+ * \code
+        struct cudaResourceDesc {
+            enum cudaResourceType resType;
+
+            union {
+                struct {
+                    cudaArray_t array;
+                } array;
+                struct {
+                    cudaMipmappedArray_t mipmap;
+                } mipmap;
+                struct {
+                    void *devPtr;
+                    struct cudaChannelFormatDesc desc;
+                    size_t sizeInBytes;
+                } linear;
+                struct {
+                    void *devPtr;
+                    struct cudaChannelFormatDesc desc;
+                    size_t width;
+                    size_t height;
+                    size_t pitchInBytes;
+                } pitch2D;
+            } res;
+        };
+ * \endcode
+ * where:
+ * - ::cudaResourceDesc::resType specifies the type of resource to texture from.
+ *   ::cudaResourceType is defined as:
+ * \code
+        enum cudaResourceType {
+            cudaResourceTypeArray          = 0x00,
+            cudaResourceTypeMipmappedArray = 0x01,
+            cudaResourceTypeLinear         = 0x02,
+            cudaResourceTypePitch2D        = 0x03
+        };
+ * \endcode
+ *
+ * \par
+ * If ::cudaResourceDesc::resType is set to ::cudaResourceTypeArray, ::cudaResourceDesc::res::array::array
+ * must be set to a valid CUDA array handle.
+ *
+ * \par
+ * If ::cudaResourceDesc::resType is set to ::cudaResourceTypeMipmappedArray, ::cudaResourceDesc::res::mipmap::mipmap
+ * must be set to a valid CUDA mipmapped array handle and ::cudaTextureDesc::normalizedCoords must be set to true.
+ *
+ * \par
+ * If ::cudaResourceDesc::resType is set to ::cudaResourceTypeLinear, ::cudaResourceDesc::res::linear::devPtr
+ * must be set to a valid device pointer that is aligned to ::cudaDeviceProp::textureAlignment.
+ * ::cudaResourceDesc::res::linear::desc describes the format and the number of components per array element. ::cudaResourceDesc::res::linear::sizeInBytes
+ * specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed
+ * ::cudaDeviceProp::maxTexture1DLinear. The number of elements is computed as (sizeInBytes / sizeof(desc)).
+ *
+ * \par
+ * If ::cudaResourceDesc::resType is set to ::cudaResourceTypePitch2D, ::cudaResourceDesc::res::pitch2D::devPtr
+ * must be set to a valid device pointer that is aligned to ::cudaDeviceProp::textureAlignment.
+ * ::cudaResourceDesc::res::pitch2D::desc describes the format and the number of components per array element. ::cudaResourceDesc::res::pitch2D::width
+ * and ::cudaResourceDesc::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed
+ * ::cudaDeviceProp::maxTexture2DLinear[0] and ::cudaDeviceProp::maxTexture2DLinear[1] respectively.
+ * ::cudaResourceDesc::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to
+ * ::cudaDeviceProp::texturePitchAlignment. Pitch cannot exceed ::cudaDeviceProp::maxTexture2DLinear[2].
+ *
+ *
+ * The ::cudaTextureDesc struct is defined as
+ * \code
+        struct cudaTextureDesc {
+            enum cudaTextureAddressMode addressMode[3];
+            enum cudaTextureFilterMode  filterMode;
+            enum cudaTextureReadMode    readMode;
+            int                         sRGB;
+            float                       borderColor[4];
+            int                         normalizedCoords;
+            unsigned int                maxAnisotropy;
+            enum cudaTextureFilterMode  mipmapFilterMode;
+            float                       mipmapLevelBias;
+            float                       minMipmapLevelClamp;
+            float                       maxMipmapLevelClamp;
+        };
+ * \endcode
+ * where
+ * - ::cudaTextureDesc::addressMode specifies the addressing mode for each dimension of the texture data. ::cudaTextureAddressMode is defined as:
+ * \code
+        enum cudaTextureAddressMode {
+            cudaAddressModeWrap   = 0,
+            cudaAddressModeClamp  = 1,
+            cudaAddressModeMirror = 2,
+            cudaAddressModeBorder = 3
+        };
+ * \endcode
+ *   This is ignored if ::cudaResourceDesc::resType is ::cudaResourceTypeLinear. Also, if ::cudaTextureDesc::normalizedCoords
+ *   is set to zero, ::cudaAddressModeWrap and ::cudaAddressModeMirror won't be supported and will be switched to ::cudaAddressModeClamp.
+ *
+ * - ::cudaTextureDesc::filterMode specifies the filtering mode to be used when fetching from the texture. ::cudaTextureFilterMode is defined as:
+ * \code
+        enum cudaTextureFilterMode {
+            cudaFilterModePoint  = 0,
+            cudaFilterModeLinear = 1
+        };
+ * \endcode
+ *   This is ignored if ::cudaResourceDesc::resType is ::cudaResourceTypeLinear.
+ *
+ * - ::cudaTextureDesc::readMode specifies whether integer data should be converted to floating point or not. ::cudaTextureReadMode is defined as:
+ * \code
+        enum cudaTextureReadMode {
+            cudaReadModeElementType     = 0,
+            cudaReadModeNormalizedFloat = 1
+        };
+ * \endcode
+ *   Note that this applies only to 8-bit and 16-bit integer formats. 32-bit integer formats would not be promoted, regardless of
+ *   whether or not ::cudaReadModeNormalizedFloat is specified for ::cudaTextureDesc::readMode.
+ *
+ * - ::cudaTextureDesc::sRGB specifies whether sRGB to linear conversion should be performed during texture fetch.
+ *
+ * - ::cudaTextureDesc::borderColor specifies the float values of the border color, where:
+ *   ::cudaTextureDesc::borderColor[0] contains the value of 'R',
+ *   ::cudaTextureDesc::borderColor[1] contains the value of 'G',
+ *   ::cudaTextureDesc::borderColor[2] contains the value of 'B', and
+ *   ::cudaTextureDesc::borderColor[3] contains the value of 'A'.
+ *   Note that applications using integer border color values will need to convert these values to float.
+ *   The values are used only when the addressing mode specified by ::cudaTextureDesc::addressMode is ::cudaAddressModeBorder.
+ *
+ * - ::cudaTextureDesc::normalizedCoords specifies whether the texture coordinates will be normalized or not.
+ *
+ * - ::cudaTextureDesc::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be
+ *   clamped to the range [1,16].
+ *
+ * - ::cudaTextureDesc::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels.
+ *
+ * - ::cudaTextureDesc::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level.
+ *
+ * - ::cudaTextureDesc::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to.
+ *
+ * - ::cudaTextureDesc::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to.
+ *
+ *
+ * The ::cudaResourceViewDesc struct is defined as
+ * \code
+        struct cudaResourceViewDesc {
+            enum cudaResourceViewFormat format;
+            size_t                      width;
+            size_t                      height;
+            size_t                      depth;
+            unsigned int                firstMipmapLevel;
+            unsigned int                lastMipmapLevel;
+            unsigned int                firstLayer;
+            unsigned int                lastLayer;
+        };
+ * \endcode
+ * where:
+ * - ::cudaResourceViewDesc::format specifies how the data contained in the CUDA array or CUDA mipmapped array should
+ *   be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block
+ *   compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a 32-bit unsigned integer format
+ *   with 2 or 4 channels, depending on the block compressed format. For example, BC1 and BC4 require the underlying CUDA array to have
+ *   a 32-bit unsigned int with 2 channels. The other BC formats require the underlying resource to have the same 32-bit unsigned int
+ *   format but with 4 channels.
+ *
+ * - ::cudaResourceViewDesc::width specifies the new width of the texture data. If the resource view format is a block
+ *   compressed format, this value has to be 4 times the original width of the resource. For non block compressed formats,
+ *   this value has to be equal to that of the original resource.
+ *
+ * - ::cudaResourceViewDesc::height specifies the new height of the texture data. If the resource view format is a block
+ *   compressed format, this value has to be 4 times the original height of the resource. For non block compressed formats,
+ *   this value has to be equal to that of the original resource.
+ *
+ * - ::cudaResourceViewDesc::depth specifies the new depth of the texture data. This value has to be equal to that of the
+ *   original resource.
+ *
+ * - ::cudaResourceViewDesc::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero.
+ *   For non-mipmapped resources, this value has to be zero. ::cudaTextureDesc::minMipmapLevelClamp and ::cudaTextureDesc::maxMipmapLevelClamp
+ *   will be relative to this value. For example, if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified,
+ *   then the actual minimum mipmap level clamp will be 3.2.
+ *
+ * - ::cudaResourceViewDesc::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value
+ *   has to be zero.
+ *
+ * - ::cudaResourceViewDesc::firstLayer specifies the first layer index for layered textures. This will be the new layer zero.
+ *   For non-layered resources, this value has to be zero.
+ *
+ * - ::cudaResourceViewDesc::lastLayer specifies the last layer index for layered textures. For non-layered resources,
+ *   this value has to be zero.
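+ *
+ * \par Example:
+ * A minimal sketch creating a texture object over linear device memory,
+ * assuming \c devPtr points to \c N floats; the descriptors are
+ * zero-initialized so that unused fields take their default values:
+ * \code
+    struct cudaResourceDesc resDesc;
+    memset(&resDesc, 0, sizeof(resDesc));
+    resDesc.resType = cudaResourceTypeLinear;
+    resDesc.res.linear.devPtr = devPtr;
+    resDesc.res.linear.desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+    resDesc.res.linear.sizeInBytes = N * sizeof(float);
+
+    struct cudaTextureDesc texDesc;
+    memset(&texDesc, 0, sizeof(texDesc));
+    texDesc.readMode = cudaReadModeElementType;
+
+    cudaTextureObject_t texObj = 0;
+    cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
+ * \endcode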
+ *
+ *
+ * \param pTexObject - Texture object to create
+ * \param pResDesc - Resource descriptor
+ * \param pTexDesc - Texture descriptor
+ * \param pResViewDesc - Resource view descriptor
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa
+ * ::cudaDestroyTextureObject,
+ * ::cuTexObjectCreate
+ */
+
+extern __host__ cudaError_t CUDARTAPI cudaCreateTextureObject(cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc, const struct cudaTextureDesc *pTexDesc, const struct cudaResourceViewDesc *pResViewDesc);
+
+/**
+ * \brief Destroys a texture object
+ *
+ * Destroys the texture object specified by \p texObject.
+ *
+ * \param texObject - Texture object to destroy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa
+ * ::cudaCreateTextureObject,
+ * ::cuTexObjectDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDestroyTextureObject(cudaTextureObject_t texObject);
+
+/**
+ * \brief Returns a texture object's resource descriptor
+ *
+ * Returns the resource descriptor for the texture object specified by \p texObject.
+ *
+ * \param pResDesc - Resource descriptor
+ * \param texObject - Texture object
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa
+ * ::cudaCreateTextureObject,
+ * ::cuTexObjectGetResourceDesc
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceDesc(struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject);
+
+/**
+ * \brief Returns a texture object's texture descriptor
+ *
+ * Returns the texture descriptor for the texture object specified by \p texObject.
+ *
+ * \param pTexDesc - Texture descriptor
+ * \param texObject - Texture object
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa
+ * ::cudaCreateTextureObject,
+ * ::cuTexObjectGetTextureDesc
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectTextureDesc(struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject);
+
+/**
+ * \brief Returns a texture object's resource view descriptor
+ *
+ * Returns the resource view descriptor for the texture object specified by \p texObject.
+ * If no resource view was specified, ::cudaErrorInvalidValue is returned.
+ *
+ * \param pResViewDesc - Resource view descriptor
+ * \param texObject - Texture object
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa
+ * ::cudaCreateTextureObject,
+ * ::cuTexObjectGetResourceViewDesc
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceViewDesc(struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject);
+
+/** @} */ /* END CUDART_TEXTURE_OBJECT */
+
+/**
+ * \defgroup CUDART_SURFACE_OBJECT Surface Object Management
+ *
+ * ___MANBRIEF___ surface object management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the low level surface object management functions
+ * of the CUDA runtime application programming interface. The surface object
+ * API is only supported on devices of compute capability 3.0 or higher.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates a surface object
+ *
+ * Creates a surface object and returns it in \p pSurfObject. \p pResDesc describes
+ * the data to perform surface load/stores on. ::cudaResourceDesc::resType must be
+ * ::cudaResourceTypeArray and ::cudaResourceDesc::res::array::array
+ * must be set to a valid CUDA array handle.
+ *
+ * Surface objects are only supported on devices of compute capability 3.0 or higher.
+ * Additionally, a surface object is an opaque value, and, as such, should only be
+ * accessed through CUDA API calls.
+ *
+ * \param pSurfObject - Surface object to create
+ * \param pResDesc - Resource descriptor
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa
+ * ::cudaDestroySurfaceObject,
+ * ::cuSurfObjectCreate
+ */
+
+extern __host__ cudaError_t CUDARTAPI cudaCreateSurfaceObject(cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc);
+
+/**
+ * \brief Destroys a surface object
+ *
+ * Destroys the surface object specified by \p surfObject.
+ *
+ * \param surfObject - Surface object to destroy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa
+ * ::cudaCreateSurfaceObject,
+ * ::cuSurfObjectDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject);
+
+/**
+ * \brief Returns a surface object's resource descriptor
+ *
+ * Returns the resource descriptor for the surface object specified by \p surfObject.
+ *
+ * \param pResDesc - Resource descriptor
+ * \param surfObject - Surface object
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa
+ * ::cudaCreateSurfaceObject,
+ * ::cuSurfObjectGetResourceDesc
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceObjectResourceDesc(struct cudaResourceDesc *pResDesc, cudaSurfaceObject_t surfObject);
+
+/** @} */ /* END CUDART_SURFACE_OBJECT */
+
+/**
+ * \defgroup CUDART__VERSION Version Management
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns the CUDA driver version
+ *
+ * Returns in \p *driverVersion the version number of the installed CUDA
+ * driver. If no driver is installed, then 0 is returned as the driver
+ * version (via \p driverVersion). This function automatically returns
+ * ::cudaErrorInvalidValue if the \p driverVersion argument is NULL.
+ *
+ * \param driverVersion - Returns the CUDA driver version.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaRuntimeGetVersion,
+ * ::cuDriverGetVersion
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion);
+
+/**
+ * \brief Returns the CUDA Runtime version
+ *
+ * Returns in \p *runtimeVersion the version number of the installed CUDA
+ * Runtime. This function automatically returns ::cudaErrorInvalidValue if
+ * the \p runtimeVersion argument is NULL.
+ *
+ * \param runtimeVersion - Returns the CUDA Runtime version.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa
+ * ::cudaDriverGetVersion,
+ * ::cuDriverGetVersion
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);
+
+/** @} */ /* END CUDART__VERSION */
+
+/** \cond impl_private */
+extern __host__ cudaError_t CUDARTAPI cudaGetExportTable(const void **ppExportTable, const cudaUUID_t *pExportTableId);
+/** \endcond impl_private */
+
+/**
+ * \defgroup CUDART_HIGHLEVEL C++ API Routines
+ *
+ * ___MANBRIEF___ C++ high level API functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the C++ high level API functions of the CUDA runtime
+ * application programming interface. To use these functions, your
+ * application needs to be compiled with the \p nvcc compiler.
+ *
+ * \brief C++-style interface built on top of CUDA runtime API
+ */
+
+/**
+ * \defgroup CUDART_DRIVER Interactions with the CUDA Driver API
+ *
+ * ___MANBRIEF___ interactions between CUDA Driver API and CUDA Runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the interactions between the CUDA Driver API and the CUDA Runtime API.
+ *
+ * @{
+ *
+ * \section CUDART_CUDA_primary Primary Contexts
+ *
+ * There exists a one-to-one relationship between CUDA devices in the CUDA Runtime
+ * API and ::CUcontext s in the CUDA Driver API within a process. The specific
+ * context which the CUDA Runtime API uses for a device is called the device's
+ * primary context. From the perspective of the CUDA Runtime API, a device and
+ * its primary context are synonymous.
+ *
+ * \section CUDART_CUDA_init Initialization and Tear-Down
+ *
+ * CUDA Runtime API calls operate on the CUDA Driver API ::CUcontext which is current
+ * to the calling host thread.
+ *
+ * The function ::cudaSetDevice() makes the primary context for the
+ * specified device current to the calling thread by calling ::cuCtxSetCurrent().
+ *
+ * The CUDA Runtime API will automatically initialize the primary context for
+ * a device at the first CUDA Runtime API call which requires an active context.
+ * If no ::CUcontext is current to the calling thread when a CUDA Runtime API call
+ * which requires an active context is made, then the primary context for a device
+ * will be selected, made current to the calling thread, and initialized.
+ *
+ * The context which the CUDA Runtime API initializes will be initialized using
+ * the parameters specified by the CUDA Runtime API functions
+ * ::cudaSetDeviceFlags(),
+ * ::cudaD3D9SetDirect3DDevice(),
+ * ::cudaD3D10SetDirect3DDevice(),
+ * ::cudaD3D11SetDirect3DDevice(),
+ * ::cudaGLSetGLDevice(), and
+ * ::cudaVDPAUSetVDPAUDevice().
+ * Note that these functions will fail with ::cudaErrorSetOnActiveProcess if they are
+ * called when the primary context for the specified device has already been initialized
+ * (or if the current device has already been initialized, in the case of
+ * ::cudaSetDeviceFlags()).
+ *
+ * Primary contexts will remain active until they are explicitly deinitialized
+ * using ::cudaDeviceReset(). The function ::cudaDeviceReset() will deinitialize the
+ * primary context for the calling thread's current device immediately. The context
+ * will remain current to all of the threads that it was current to. The next CUDA
+ * Runtime API call on any thread which requires an active context will trigger the
+ * reinitialization of that device's primary context.
+ *
+ * Note that there is no reference counting of the primary context's lifetime. It is
+ * recommended that the primary context not be deinitialized except just before exit
+ * or to recover from an unspecified launch failure.
+ *
+ * \section CUDART_CUDA_context Context Interoperability
+ *
+ * Note that the use of multiple ::CUcontext s per device within a single process
+ * will substantially degrade performance and is strongly discouraged. Instead,
+ * it is highly recommended that the implicit one-to-one device-to-context mapping
+ * for the process provided by the CUDA Runtime API be used.
+ *
+ * If a non-primary ::CUcontext created by the CUDA Driver API is current to a
+ * thread then the CUDA Runtime API calls to that thread will operate on that
+ * ::CUcontext, with some exceptions listed below. Interoperability between data
+ * types is discussed in the following sections.
+ *
+ * The function ::cudaPointerGetAttributes() will return the error
+ * ::cudaErrorIncompatibleDriverContext if the pointer being queried was allocated by a
+ * non-primary context. The function ::cudaDeviceEnablePeerAccess() and the rest of
+ * the peer access API may not be called when a non-primary ::CUcontext is current.
+ * To use the pointer query and peer access APIs with a context created using the
+ * CUDA Driver API, it is necessary that the CUDA Driver API be used to access
+ * these features.
+ *
+ * All CUDA Runtime API state (e.g., global variables' addresses and values) travels
+ * with its underlying ::CUcontext. In particular, if a ::CUcontext is moved from one
+ * thread to another then all CUDA Runtime API state will move to that thread as well.
+ *
+ * Please note that attaching to legacy contexts (those with a version of 3010 as returned
+ * by ::cuCtxGetApiVersion()) is not possible. The CUDA Runtime will return
+ * ::cudaErrorIncompatibleDriverContext in such cases.
+ *
+ * \section CUDART_CUDA_stream Interactions between CUstream and cudaStream_t
+ *
+ * The types ::CUstream and ::cudaStream_t are identical and may be used interchangeably.
+ *
+ * \section CUDART_CUDA_event Interactions between CUevent and cudaEvent_t
+ *
+ * The types ::CUevent and ::cudaEvent_t are identical and may be used interchangeably.
+ *
+ * \section CUDART_CUDA_array Interactions between CUarray and cudaArray_t
+ *
+ * The types ::CUarray and struct ::cudaArray * represent the same data type and may be used
+ * interchangeably by casting the two types between each other.
+ *
+ * In order to use a ::CUarray in a CUDA Runtime API function which takes a struct ::cudaArray *,
+ * it is necessary to explicitly cast the ::CUarray to a struct ::cudaArray *.
+ *
+ * In order to use a struct ::cudaArray * in a CUDA Driver API function which takes a ::CUarray,
+ * it is necessary to explicitly cast the struct ::cudaArray * to a ::CUarray.
+ *
+ * \section CUDART_CUDA_graphicsResource Interactions between CUgraphicsResource and cudaGraphicsResource_t
+ *
+ * The types ::CUgraphicsResource and ::cudaGraphicsResource_t represent the same data type and may be used
+ * interchangeably by casting the two types between each other.
+ *
+ * In order to use a ::CUgraphicsResource in a CUDA Runtime API function which takes a
+ * ::cudaGraphicsResource_t, it is necessary to explicitly cast the ::CUgraphicsResource
+ * to a ::cudaGraphicsResource_t.
+ *
+ * In order to use a ::cudaGraphicsResource_t in a CUDA Driver API function which takes a
+ * ::CUgraphicsResource, it is necessary to explicitly cast the ::cudaGraphicsResource_t
+ * to a ::CUgraphicsResource.
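+ *
+ * \par Example:
+ * A short sketch of the explicit casts described above, assuming \c cuArr is
+ * a ::CUarray obtained through the CUDA Driver API:
+ * \code
+    cudaArray_t rtArr = (cudaArray_t)cuArr;   // Driver API handle used with the Runtime API
+    CUarray backToCu  = (CUarray)rtArr;       // and cast back for Driver API calls
+ * \endcode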
+ * + * @} + */ + +#if defined(__CUDA_API_VERSION_INTERNAL) + #undef cudaMemcpy + #undef cudaMemcpyToSymbol + #undef cudaMemcpyFromSymbol + #undef cudaMemcpy2D + #undef cudaMemcpyToArray + #undef cudaMemcpy2DToArray + #undef cudaMemcpyFromArray + #undef cudaMemcpy2DFromArray + #undef cudaMemcpyArrayToArray + #undef cudaMemcpy2DArrayToArray + #undef cudaMemcpy3D + #undef cudaMemcpy3DPeer + #undef cudaMemset + #undef cudaMemset2D + #undef cudaMemset3D + #undef cudaMemcpyAsync + #undef cudaMemcpyToSymbolAsync + #undef cudaMemcpyFromSymbolAsync + #undef cudaMemcpy2DAsync + #undef cudaMemcpyToArrayAsync + #undef cudaMemcpy2DToArrayAsync + #undef cudaMemcpyFromArrayAsync + #undef cudaMemcpy2DFromArrayAsync + #undef cudaMemcpy3DAsync + #undef cudaMemcpy3DPeerAsync + #undef cudaMemsetAsync + #undef cudaMemset2DAsync + #undef cudaMemset3DAsync + #undef cudaStreamQuery + #undef cudaStreamGetFlags + #undef cudaStreamGetPriority + #undef cudaEventRecord + #undef cudaStreamWaitEvent + #undef cudaStreamAddCallback + #undef cudaStreamAttachMemAsync + #undef cudaStreamSynchronize + #undef cudaLaunch + #undef cudaLaunchKernel + #undef cudaMemPrefetchAsync + #undef cudaLaunchCooperativeKernel + extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind); + extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(const void *symbol, const void *src, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice)); + extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(void *dst, const void *symbol, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost)); + extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind); + extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind); + extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind); + extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind); + extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind); + extern __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)); + extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)); + extern __host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3DParms *p); + extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p); + extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value, size_t count); + extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch, int value, size_t width, size_t height); + extern __host__ cudaError_t CUDARTAPI 
cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent); + extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(const void *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(void *dst, const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0)); + extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream __dv(0)); + extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream __dv(0)); + extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream __dv(0)); + extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream __dv(0)); + extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream); + extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags); + extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetPriority(cudaStream_t hStream, int *priority); + extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0)); + extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags); + extern __host__ cudaError_t CUDARTAPI cudaStreamAddCallback(cudaStream_t stream, cudaStreamCallback_t callback, void *userData, unsigned int flags); + extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length, unsigned int flags); + extern __host__ cudaError_t CUDARTAPI cudaStreamSynchronize(cudaStream_t stream); + extern __host__ cudaError_t CUDARTAPI cudaLaunch(const void *func); + extern __host__ 
cudaError_t CUDARTAPI cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream); + extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream); + extern __host__ cudaError_t CUDARTAPI cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice, cudaStream_t stream); +#elif defined(__CUDART_API_PER_THREAD_DEFAULT_STREAM) + // nvcc stubs reference the 'cudaLaunch' identifier even if it was defined + // to 'cudaLaunch_ptsz'. Redirect through a static inline function. + #undef cudaLaunch + static __inline__ __host__ cudaError_t cudaLaunch(const void *func) + { + return cudaLaunch_ptsz(func); + } + #define cudaLaunch __CUDART_API_PTSZ(cudaLaunch) +#endif + +#if defined(__cplusplus) +} + +#endif /* __cplusplus */ + +#undef __dv + +#endif /* !__CUDA_RUNTIME_API_H__ */ diff --git a/include/external/CUDA/cudnn.h b/include/external/CUDA/cudnn.h new file mode 100755 index 000000000..b375596c1 --- /dev/null +++ b/include/external/CUDA/cudnn.h @@ -0,0 +1,1805 @@ +/* + * Copyright 1993-2015 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. 
+ * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + + /* cudnn : Neural Networks Library + + */ + +#if !defined(CUDNN_H_) +#define CUDNN_H_ + +#define CUDNN_MAJOR 7 +#define CUDNN_MINOR 0 +#define CUDNN_PATCHLEVEL 2 + +#define CUDNN_VERSION (CUDNN_MAJOR * 1000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL) + +#include "driver_types.h" +#include "cuda_runtime.h" + +#ifndef CUDNNWINAPI +#ifdef _WIN32 +#define CUDNNWINAPI __stdcall +#else +#define CUDNNWINAPI +#endif +#endif + +#if defined (__cplusplus) +extern "C" { +#endif + +struct cudnnContext; +typedef struct cudnnContext *cudnnHandle_t; + +size_t CUDNNWINAPI cudnnGetVersion(void); + +/* Returns CUDA Runtime version statically linked against cudnn */ +size_t CUDNNWINAPI cudnnGetCudartVersion(void); + +/* + * CUDNN return codes + */ +typedef enum +{ + CUDNN_STATUS_SUCCESS = 0, + CUDNN_STATUS_NOT_INITIALIZED = 1, + CUDNN_STATUS_ALLOC_FAILED = 2, + CUDNN_STATUS_BAD_PARAM = 3, + CUDNN_STATUS_INTERNAL_ERROR = 4, + CUDNN_STATUS_INVALID_VALUE = 5, + CUDNN_STATUS_ARCH_MISMATCH = 6, + CUDNN_STATUS_MAPPING_ERROR = 7, + CUDNN_STATUS_EXECUTION_FAILED = 8, + CUDNN_STATUS_NOT_SUPPORTED = 9, + CUDNN_STATUS_LICENSE_ERROR = 10, + CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING = 11, + CUDNN_STATUS_RUNTIME_IN_PROGRESS = 12, + CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 13, +} cudnnStatus_t; + +/* human-readable error messages */ +const char * CUDNNWINAPI cudnnGetErrorString(cudnnStatus_t status); + +/* Forward definition in this version only */ +typedef struct cudnnRuntimeTag_t cudnnRuntimeTag_t; + +typedef enum +{ + CUDNN_ERRQUERY_RAWCODE = 0, + CUDNN_ERRQUERY_NONBLOCKING = 1, + CUDNN_ERRQUERY_BLOCKING = 2, +} cudnnErrQueryMode_t; + +cudnnStatus_t CUDNNWINAPI cudnnQueryRuntimeError( + cudnnHandle_t handle, + cudnnStatus_t *rstatus, + cudnnErrQueryMode_t mode, + cudnnRuntimeTag_t *tag ); + +#ifndef __LIBRARY_TYPES_H__ + +typedef enum libraryPropertyType_t +{ + MAJOR_VERSION, + MINOR_VERSION, + PATCH_LEVEL +} libraryPropertyType; + +#endif + +cudnnStatus_t CUDNNWINAPI cudnnGetProperty(libraryPropertyType type, int *value); + +cudnnStatus_t CUDNNWINAPI cudnnCreate (cudnnHandle_t *handle); +cudnnStatus_t CUDNNWINAPI cudnnDestroy (cudnnHandle_t handle); +cudnnStatus_t CUDNNWINAPI cudnnSetStream (cudnnHandle_t handle, cudaStream_t streamId); +cudnnStatus_t CUDNNWINAPI cudnnGetStream (cudnnHandle_t handle, cudaStream_t *streamId); + +/* Data structures to represent Image/Filter and the Neural Network Layer */ +typedef struct cudnnTensorStruct* cudnnTensorDescriptor_t; +typedef struct cudnnConvolutionStruct* cudnnConvolutionDescriptor_t; +typedef struct cudnnPoolingStruct* cudnnPoolingDescriptor_t; +typedef struct cudnnFilterStruct* cudnnFilterDescriptor_t; +typedef struct cudnnLRNStruct* cudnnLRNDescriptor_t; +typedef struct cudnnActivationStruct* cudnnActivationDescriptor_t; +typedef struct cudnnSpatialTransformerStruct* cudnnSpatialTransformerDescriptor_t; +typedef struct cudnnOpTensorStruct* cudnnOpTensorDescriptor_t; +typedef struct cudnnReduceTensorStruct* cudnnReduceTensorDescriptor_t; +typedef struct cudnnCTCLossStruct* cudnnCTCLossDescriptor_t; +/* +* CUDNN data type +*/ +typedef enum +{ + CUDNN_DATA_FLOAT = 0, + CUDNN_DATA_DOUBLE = 1, + CUDNN_DATA_HALF = 2, + CUDNN_DATA_INT8 = 3, + CUDNN_DATA_INT32 = 4, + CUDNN_DATA_INT8x4 = 5 +} cudnnDataType_t; + +/* +* CUDNN math type +*/ +typedef enum { + 
CUDNN_DEFAULT_MATH = 0, + CUDNN_TENSOR_OP_MATH = 1, +} cudnnMathType_t; + +/* + * CUDNN propagate Nan + */ +typedef enum{ + CUDNN_NOT_PROPAGATE_NAN = 0, + CUDNN_PROPAGATE_NAN = 1, +} cudnnNanPropagation_t; + +/* + * CUDNN Determinism + */ +typedef enum +{ + CUDNN_NON_DETERMINISTIC = 0, + CUDNN_DETERMINISTIC = 1, +} cudnnDeterminism_t; + +/* Maximum supported number of tensor dimensions */ +#define CUDNN_DIM_MAX 8 + +/* Create an instance of a generic Tensor descriptor */ +cudnnStatus_t CUDNNWINAPI cudnnCreateTensorDescriptor( + cudnnTensorDescriptor_t *tensorDesc ); + +typedef enum +{ + CUDNN_TENSOR_NCHW = 0, /* row major (wStride = 1, hStride = w) */ + CUDNN_TENSOR_NHWC = 1, /* feature maps interleaved ( cStride = 1 )*/ + CUDNN_TENSOR_NCHW_VECT_C = 2 /* each image point is vector of element of C : the length of the vector is carried by the data type*/ +} cudnnTensorFormat_t; + +cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptor( + cudnnTensorDescriptor_t tensorDesc, + cudnnTensorFormat_t format, + cudnnDataType_t dataType, /* image data type */ + int n, /* number of inputs (batch size) */ + int c, /* number of input feature maps */ + int h, /* height of input section */ + int w ); /* width of input section */ + +cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptorEx( + cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t dataType, /* image data type */ + int n, /* number of inputs (batch size) */ + int c, /* number of input feature maps */ + int h, /* height of input section */ + int w, /* width of input section */ + int nStride, + int cStride, + int hStride, + int wStride ); + +cudnnStatus_t CUDNNWINAPI cudnnGetTensor4dDescriptor( + const cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t *dataType, /* image data type */ + int *n, /* number of inputs (batch size) */ + int *c, /* number of input feature maps */ + int *h, /* height of input section */ + int *w, /* width of input section */ + int *nStride, + int *cStride, + int *hStride, + int *wStride ); + +cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptor( + cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t dataType, + int nbDims, + const int dimA[], + const int strideA[] ); + +cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptorEx( + cudnnTensorDescriptor_t tensorDesc, + cudnnTensorFormat_t format, + cudnnDataType_t dataType, + int nbDims, + const int dimA[] ); + +cudnnStatus_t CUDNNWINAPI cudnnGetTensorNdDescriptor( + const cudnnTensorDescriptor_t tensorDesc, + int nbDimsRequested, + cudnnDataType_t *dataType, + int *nbDims, + int dimA[], + int strideA[] ); + +cudnnStatus_t CUDNNWINAPI cudnnGetTensorSizeInBytes( + const cudnnTensorDescriptor_t tensorDesc, + size_t *size); + +/* PixelOffset( n, c, h, w ) = n *input_stride + c * feature_stride + h * h_stride + w * w_stride + + 1)Example of all images in row major order one batch of features after the other (with an optional padding on row) + input_stride : c x h x h_stride + feature_stride : h x h_stride + h_stride : >= w ( h_stride = w if no padding) + w_stride : 1 + + + 2)Example of all images in row major with features maps interleaved + input_stride : c x h x h_stride + feature_stride : 1 + h_stride : w x c + w_stride : c + + 3)Example of all images in column major order one batch of features after the other (with optional padding on column) + input_stride : c x w x w_stride + feature_stride : w x w_stride + h_stride : 1 + w_stride : >= h + +*/ + +/* Destroy an instance of Tensor4d descriptor */ +cudnnStatus_t CUDNNWINAPI cudnnDestroyTensorDescriptor( + 
cudnnTensorDescriptor_t tensorDesc ); + + +/* Tensor layout conversion helper (y = alpha * x + beta * y) */ +cudnnStatus_t CUDNNWINAPI cudnnTransformTensor( + cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y ); + + +/* Tensor Bias addition : C = alpha * A + beta * C */ +cudnnStatus_t CUDNNWINAPI cudnnAddTensor( + cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t aDesc, + const void *A, + const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C ); + +/* +* CUDNN OpTensor op type +*/ +typedef enum +{ + CUDNN_OP_TENSOR_ADD = 0, + CUDNN_OP_TENSOR_MUL = 1, + CUDNN_OP_TENSOR_MIN = 2, + CUDNN_OP_TENSOR_MAX = 3, + CUDNN_OP_TENSOR_SQRT = 4, + CUDNN_OP_TENSOR_NOT = 5, +} cudnnOpTensorOp_t; + +cudnnStatus_t CUDNNWINAPI cudnnCreateOpTensorDescriptor( + cudnnOpTensorDescriptor_t *opTensorDesc ); + +cudnnStatus_t CUDNNWINAPI cudnnSetOpTensorDescriptor( + cudnnOpTensorDescriptor_t opTensorDesc, + cudnnOpTensorOp_t opTensorOp, + cudnnDataType_t opTensorCompType, + cudnnNanPropagation_t opTensorNanOpt ); + +cudnnStatus_t CUDNNWINAPI cudnnGetOpTensorDescriptor( + const cudnnOpTensorDescriptor_t opTensorDesc, + cudnnOpTensorOp_t *opTensorOp, + cudnnDataType_t *opTensorCompType, + cudnnNanPropagation_t *opTensorNanOpt ); + +cudnnStatus_t CUDNNWINAPI cudnnDestroyOpTensorDescriptor( + cudnnOpTensorDescriptor_t opTensorDesc ); + +/* Tensor operation : C = op( alpha1 * A, alpha2 * B ) + beta * C */ +/* B tensor is ignored for CUDNN_OP_TENSOR_SQRT, CUDNN_OP_TENSOR_NOT. */ +cudnnStatus_t CUDNNWINAPI cudnnOpTensor( + cudnnHandle_t handle, + const cudnnOpTensorDescriptor_t opTensorDesc, + const void *alpha1, + const cudnnTensorDescriptor_t aDesc, + const void *A, + const void *alpha2, + const cudnnTensorDescriptor_t bDesc, + const void *B, + const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C ); + +/* +* CUDNN ReduceTensor op type +*/ +typedef enum +{ + CUDNN_REDUCE_TENSOR_ADD = 0, + CUDNN_REDUCE_TENSOR_MUL = 1, + CUDNN_REDUCE_TENSOR_MIN = 2, + CUDNN_REDUCE_TENSOR_MAX = 3, + CUDNN_REDUCE_TENSOR_AMAX = 4, + CUDNN_REDUCE_TENSOR_AVG = 5, + CUDNN_REDUCE_TENSOR_NORM1 = 6, + CUDNN_REDUCE_TENSOR_NORM2 = 7, + CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8, +} cudnnReduceTensorOp_t; + +/* +* CUDNN ReduceTensor indices type +*/ +typedef enum +{ + CUDNN_REDUCE_TENSOR_NO_INDICES = 0, + CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1, +} cudnnReduceTensorIndices_t; + +/* +* CUDNN tensor indices type size (all unsigned) +* Currently not supported, default is 32 bit unsigned. 
+*/ +typedef enum +{ + CUDNN_32BIT_INDICES = 0, + CUDNN_64BIT_INDICES = 1, + CUDNN_16BIT_INDICES = 2, + CUDNN_8BIT_INDICES = 3, +} cudnnIndicesType_t; + +cudnnStatus_t CUDNNWINAPI cudnnCreateReduceTensorDescriptor( + cudnnReduceTensorDescriptor_t *reduceTensorDesc ); + +cudnnStatus_t CUDNNWINAPI cudnnSetReduceTensorDescriptor( + cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t reduceTensorOp, + cudnnDataType_t reduceTensorCompType, + cudnnNanPropagation_t reduceTensorNanOpt, + cudnnReduceTensorIndices_t reduceTensorIndices, + cudnnIndicesType_t reduceTensorIndicesType ); + +cudnnStatus_t CUDNNWINAPI cudnnGetReduceTensorDescriptor( + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t *reduceTensorOp, + cudnnDataType_t *reduceTensorCompType, + cudnnNanPropagation_t *reduceTensorNanOpt, + cudnnReduceTensorIndices_t *reduceTensorIndices, + cudnnIndicesType_t *reduceTensorIndicesType ); + +cudnnStatus_t CUDNNWINAPI cudnnDestroyReduceTensorDescriptor( + cudnnReduceTensorDescriptor_t reduceTensorDesc ); + + /* Helper function to return the minimum size of the index space to be passed to the reduction given the input and output tensors */ +cudnnStatus_t CUDNNWINAPI cudnnGetReductionIndicesSize( + cudnnHandle_t handle, + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, + const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes ); + + /* Helper function to return the minimum size of the workspace to be passed to the reduction given the input and output tensors */ +cudnnStatus_t CUDNNWINAPI cudnnGetReductionWorkspaceSize( + cudnnHandle_t handle, + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, + const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes ); + +/* Tensor operation : C = reduce op( alpha * A ) + beta * C */ +/* The NaN propagation enum applies to only the min and max reduce ops; the other reduce ops propagate NaN as usual. */ +/* The indices space is ignored for reduce ops other than min or max. 
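+
+   Illustrative usage sketch (an editorial addition, not part of the original
+   NVIDIA header), assuming `handle`, a configured `reduceDesc`, and valid
+   `aDesc`/`A` and `cDesc`/`C` already exist; float compute type is assumed,
+   and the descriptor is assumed to request CUDNN_REDUCE_TENSOR_NO_INDICES
+   so the indices arguments may be NULL/0:
+
+       size_t wsSize = 0;
+       void *ws = NULL;
+       cudnnGetReductionWorkspaceSize(handle, reduceDesc, aDesc, cDesc, &wsSize);
+       cudaMalloc(&ws, wsSize);
+       float alpha = 1.0f, beta = 0.0f;
+       cudnnReduceTensor(handle, reduceDesc, NULL, 0, ws, wsSize,
+                         &alpha, aDesc, A, &beta, cDesc, C);
+       cudaFree(ws);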
*/ +cudnnStatus_t CUDNNWINAPI cudnnReduceTensor( + cudnnHandle_t handle, + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + void *indices, + size_t indicesSizeInBytes, + void *workspace, + size_t workspaceSizeInBytes, + const void *alpha, + const cudnnTensorDescriptor_t aDesc, + const void *A, + const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C ); + +/* Set all values of a tensor to a given value : y[i] = value[0] */ +cudnnStatus_t CUDNNWINAPI cudnnSetTensor( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t yDesc, + void *y, + const void *valuePtr ); + +/* Scale all values of a tensor by a given factor : y[i] = alpha * y[i] */ +cudnnStatus_t CUDNNWINAPI cudnnScaleTensor( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t yDesc, + void *y, + const void *alpha ); + +/* + * convolution mode + */ +typedef enum +{ + CUDNN_CONVOLUTION = 0, + CUDNN_CROSS_CORRELATION = 1 +} cudnnConvolutionMode_t; + + +/* Create an instance of FilterStruct */ +cudnnStatus_t CUDNNWINAPI cudnnCreateFilterDescriptor( + cudnnFilterDescriptor_t *filterDesc ); + + +cudnnStatus_t CUDNNWINAPI cudnnSetFilter4dDescriptor( + cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, /* image data type */ + cudnnTensorFormat_t format, + int k, /* number of output feature maps */ + int c, /* number of input feature maps */ + int h, /* height of each input filter */ + int w ); /* width of each input filter */ + + +cudnnStatus_t CUDNNWINAPI cudnnGetFilter4dDescriptor( + const cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t *dataType, /* image data type */ + cudnnTensorFormat_t *format, + int *k, /* number of output feature maps */ + int *c, /* number of input feature maps */ + int *h, /* height of each input filter */ + int *w ); /* width of each input filter */ + + +cudnnStatus_t CUDNNWINAPI cudnnSetFilterNdDescriptor( + cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, /* image data type */ + cudnnTensorFormat_t format, + int nbDims, + const int filterDimA[] ); + +cudnnStatus_t CUDNNWINAPI cudnnGetFilterNdDescriptor( + const cudnnFilterDescriptor_t filterDesc, + int nbDimsRequested, + cudnnDataType_t *dataType, /* image data type */ + cudnnTensorFormat_t *format, + int *nbDims, + int filterDimA[] ); + + +cudnnStatus_t CUDNNWINAPI cudnnDestroyFilterDescriptor( + cudnnFilterDescriptor_t filterDesc ); + +/* Create an instance of convolution descriptor */ +cudnnStatus_t CUDNNWINAPI cudnnCreateConvolutionDescriptor( + cudnnConvolutionDescriptor_t *convDesc ); + +cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionMathType( cudnnConvolutionDescriptor_t convDesc, + cudnnMathType_t mathType ); + +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionMathType( cudnnConvolutionDescriptor_t convDesc, + cudnnMathType_t *mathType ); + +cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionGroupCount( cudnnConvolutionDescriptor_t convDesc, + int groupCount ); + +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionGroupCount( cudnnConvolutionDescriptor_t convDesc, + int *groupCount ); + +cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor( cudnnConvolutionDescriptor_t convDesc, + int pad_h, /* zero-padding height */ + int pad_w, /* zero-padding width */ + int u, /* vertical filter stride */ + int v, /* horizontal filter stride */ + int dilation_h, /* filter dilation in the vertical dimension */ + int dilation_w, /* filter dilation in the horizontal dimension */ + cudnnConvolutionMode_t mode, + cudnnDataType_t computeType + ); + +cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor( const 
cudnnConvolutionDescriptor_t convDesc, + int* pad_h, /* zero-padding height */ + int* pad_w, /* zero-padding width */ + int* u, /* vertical filter stride */ + int* v, /* horizontal filter stride */ + int* dilation_h, /* filter dilation in the vertical dimension */ + int* dilation_w, /* filter dilation in the horizontal dimension */ + cudnnConvolutionMode_t* mode, + cudnnDataType_t *computeType + ); + +/* Helper function to return the dimensions of the output tensor given a convolution descriptor */ +cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dForwardOutputDim( + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t filterDesc, + int *n, + int *c, + int *h, + int *w ); + + +cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionNdDescriptor( + cudnnConvolutionDescriptor_t convDesc, + int arrayLength, /* nbDims-2 size */ + const int padA[], + const int filterStrideA[], + const int dilationA[], + cudnnConvolutionMode_t mode, + cudnnDataType_t computeType ); /* convolution data type */ + +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdDescriptor( + const cudnnConvolutionDescriptor_t convDesc, + int arrayLengthRequested, + int *arrayLength, + int padA[], + int strideA[], + int dilationA[], + cudnnConvolutionMode_t *mode, + cudnnDataType_t *computeType ); /* convolution data type */ + + +/* Helper function to return the dimensions of the output tensor given a convolution descriptor */ +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdForwardOutputDim( + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t filterDesc, + int nbDims, + int tensorOuputDimA[] ); + +/* Destroy an instance of convolution descriptor */ +cudnnStatus_t CUDNNWINAPI cudnnDestroyConvolutionDescriptor( + cudnnConvolutionDescriptor_t convDesc ); + + +/* helper function to provide the convolution algo that fit best the requirement */ +typedef enum +{ + CUDNN_CONVOLUTION_FWD_NO_WORKSPACE = 0, + CUDNN_CONVOLUTION_FWD_PREFER_FASTEST = 1, + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT = 2, +} cudnnConvolutionFwdPreference_t; + + +typedef enum +{ + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 0, + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1, + CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2, + CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 3, + CUDNN_CONVOLUTION_FWD_ALGO_FFT = 4, + CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 6, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 7, + CUDNN_CONVOLUTION_FWD_ALGO_COUNT = 8 +} cudnnConvolutionFwdAlgo_t; + +typedef struct { + cudnnConvolutionFwdAlgo_t algo; + cudnnStatus_t status; + float time; + size_t memory; + cudnnDeterminism_t determinism; + cudnnMathType_t mathType; + int reserved[3]; +} cudnnConvolutionFwdAlgoPerf_t; + +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithmMaxCount( cudnnHandle_t handle, + int *count); + +cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithm( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults ); + +cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithmEx( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnConvolutionDescriptor_t convDesc, 
+ const cudnnTensorDescriptor_t yDesc, + void *y, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes ); + + +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + cudnnConvolutionFwdPreference_t preference, + size_t memoryLimitInBytes, + cudnnConvolutionFwdAlgo_t *algo ); + + +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm_v7( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t srcDesc, + const cudnnFilterDescriptor_t filterDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t destDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults); + +/* + * convolution algorithm (which requires potentially some workspace) + */ + + /* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/ +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardWorkspaceSize( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + cudnnConvolutionFwdAlgo_t algo, + size_t *sizeInBytes ); + + +/* Convolution functions: All of the form "output = alpha * Op(inputs) + beta * output" */ + +/* Function to perform the forward pass for batch convolution */ +cudnnStatus_t CUDNNWINAPI cudnnConvolutionForward( + cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionFwdAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y ); + +/* Fused conv/bias/activation operation : y = Act( alpha1 * conv(x) + alpha2 * z + bias ) */ +cudnnStatus_t CUDNNWINAPI cudnnConvolutionBiasActivationForward( + cudnnHandle_t handle, + const void *alpha1, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionFwdAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *alpha2, + const cudnnTensorDescriptor_t zDesc, + const void *z, + const cudnnTensorDescriptor_t biasDesc, + const void *bias, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t yDesc, + void *y ); + +/* Function to compute the bias gradient for batch convolution */ +cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardBias( + cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const void *beta, + const cudnnTensorDescriptor_t dbDesc, + void *db ); + + +/* helper function to provide the convolution algo that fit best the requirement */ +typedef enum +{ + CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE = 0, + CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST = 1, + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT = 2, +} cudnnConvolutionBwdFilterPreference_t; + +typedef enum +{ + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0, /* non-deterministic */ + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 1, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 2, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 
3, /* non-deterministic */ + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 4, /* not implemented */ + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING = 6, + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT = 7 +} cudnnConvolutionBwdFilterAlgo_t; + + +typedef struct { + cudnnConvolutionBwdFilterAlgo_t algo; + cudnnStatus_t status; + float time; + size_t memory; + cudnnDeterminism_t determinism; + cudnnMathType_t mathType; + int reserved[3]; +} cudnnConvolutionBwdFilterAlgoPerf_t; + +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( cudnnHandle_t handle, + int *count); + +cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithm( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults ); + +cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithmEx( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnTensorDescriptor_t dyDesc, + const void *y, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, + void *dw, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes ); + +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, + cudnnConvolutionBwdFilterPreference_t preference, + size_t memoryLimitInBytes, + cudnnConvolutionBwdFilterAlgo_t *algo ); + +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm_v7( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t srcDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t gradDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults); + +/* + * convolution algorithm (which requires potentially some workspace) + */ + + /* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/ +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterWorkspaceSize( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t gradDesc, + cudnnConvolutionBwdFilterAlgo_t algo, + size_t *sizeInBytes ); + +cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardFilter( + cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdFilterAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *beta, + const cudnnFilterDescriptor_t dwDesc, + void *dw ); + +/*********************************************************/ +/* helper function to provide the convolution algo that fit best the requirement */ +typedef enum +{ + CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE = 0, + CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST = 1, + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT = 2, +} 
cudnnConvolutionBwdDataPreference_t; + +typedef enum +{ + CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0, /* non-deterministic */ + CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 1, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 2, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5, + CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT = 6 +} cudnnConvolutionBwdDataAlgo_t; + +typedef struct { + cudnnConvolutionBwdDataAlgo_t algo; + cudnnStatus_t status; + float time; + size_t memory; + cudnnDeterminism_t determinism; + cudnnMathType_t mathType; + int reserved[3]; +} cudnnConvolutionBwdDataAlgoPerf_t; + +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithmMaxCount( cudnnHandle_t handle, + int *count); + +cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithm( + cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults ); + +cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithmEx( + cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes ); + +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm( + cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + cudnnConvolutionBwdDataPreference_t preference, + size_t memoryLimitInBytes, + cudnnConvolutionBwdDataAlgo_t *algo ); + +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm_v7( + cudnnHandle_t handle, + const cudnnFilterDescriptor_t filterDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t gradDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults); + + /* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/ +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataWorkspaceSize( + cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + cudnnConvolutionBwdDataAlgo_t algo, + size_t *sizeInBytes ); + + +cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardData( + cudnnHandle_t handle, + const void *alpha, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdDataAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx ); + + +cudnnStatus_t CUDNNWINAPI cudnnIm2Col( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + void *colBuffer ); + + +/* + * softmax algorithm + */ +typedef enum +{ + 
CUDNN_SOFTMAX_FAST = 0, /* straightforward implementation */ + CUDNN_SOFTMAX_ACCURATE = 1, /* subtract max from every point to avoid overflow */ + CUDNN_SOFTMAX_LOG = 2 +} cudnnSoftmaxAlgorithm_t; + +typedef enum +{ + CUDNN_SOFTMAX_MODE_INSTANCE = 0, /* compute the softmax over all C, H, W for each N */ + CUDNN_SOFTMAX_MODE_CHANNEL = 1 /* compute the softmax over all C for each H, W, N */ +} cudnnSoftmaxMode_t; + +/* Softmax functions: All of the form "output = alpha * Op(inputs) + beta * output" */ + +/* Function to perform forward softmax */ +cudnnStatus_t CUDNNWINAPI cudnnSoftmaxForward( + cudnnHandle_t handle, + cudnnSoftmaxAlgorithm_t algo, + cudnnSoftmaxMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y ); + +/* Function to perform backward softmax */ +cudnnStatus_t CUDNNWINAPI cudnnSoftmaxBackward( + cudnnHandle_t handle, + cudnnSoftmaxAlgorithm_t algo, + cudnnSoftmaxMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx ); + +/* + * pooling mode + */ +typedef enum +{ + CUDNN_POOLING_MAX = 0, + CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1, /* count for average includes padded values */ + CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2, /* count for average does not include padded values */ + CUDNN_POOLING_MAX_DETERMINISTIC = 3 +} cudnnPoolingMode_t; + +/* Create an instance of pooling descriptor */ +cudnnStatus_t CUDNNWINAPI cudnnCreatePoolingDescriptor( + cudnnPoolingDescriptor_t *poolingDesc ); + +cudnnStatus_t CUDNNWINAPI cudnnSetPooling2dDescriptor( + cudnnPoolingDescriptor_t poolingDesc, + cudnnPoolingMode_t mode, + cudnnNanPropagation_t maxpoolingNanOpt, + int windowHeight, + int windowWidth, + int verticalPadding, + int horizontalPadding, + int verticalStride, + int horizontalStride ); + +cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dDescriptor( + const cudnnPoolingDescriptor_t poolingDesc, + cudnnPoolingMode_t *mode, + cudnnNanPropagation_t *maxpoolingNanOpt, + int *windowHeight, + int *windowWidth, + int *verticalPadding, + int *horizontalPadding, + int *verticalStride, + int *horizontalStride ); + +cudnnStatus_t CUDNNWINAPI cudnnSetPoolingNdDescriptor( + cudnnPoolingDescriptor_t poolingDesc, + const cudnnPoolingMode_t mode, + const cudnnNanPropagation_t maxpoolingNanOpt, + int nbDims, + const int windowDimA[], + const int paddingA[], + const int strideA[] ); + +cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdDescriptor( + const cudnnPoolingDescriptor_t poolingDesc, + int nbDimsRequested, + cudnnPoolingMode_t *mode, + cudnnNanPropagation_t *maxpoolingNanOpt, + int *nbDims, + int windowDimA[], + int paddingA[], + int strideA[] ); + +cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdForwardOutputDim( + const cudnnPoolingDescriptor_t poolingDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + int nbDims, + int outputTensorDimA[] ); + +cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dForwardOutputDim( + const cudnnPoolingDescriptor_t poolingDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + int *n, + int *c, + int *h, + int *w ); + + +/* Destroy an instance of pooling descriptor */ +cudnnStatus_t CUDNNWINAPI cudnnDestroyPoolingDescriptor( + cudnnPoolingDescriptor_t poolingDesc ); + +/* Pooling functions: All of the form "output = alpha * Op(inputs) + beta * output" */ + +/* Function to perform forward pooling */ 
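+/* Illustrative usage sketch (an editorial addition, not part of the original
+   NVIDIA header): a typical 2x2, stride-2 max-pooling call sequence, assuming
+   `handle`, `xDesc`/`x` and `yDesc`/`y` already exist. The declaration of
+   cudnnPoolingForward follows below.
+
+       cudnnPoolingDescriptor_t pool;
+       cudnnCreatePoolingDescriptor(&pool);
+       cudnnSetPooling2dDescriptor(pool, CUDNN_POOLING_MAX,
+                                   CUDNN_NOT_PROPAGATE_NAN,
+                                   2, 2,    // window height, width
+                                   0, 0,    // vertical, horizontal padding
+                                   2, 2);   // vertical, horizontal stride
+       float alpha = 1.0f, beta = 0.0f;
+       cudnnPoolingForward(handle, pool, &alpha, xDesc, x, &beta, yDesc, y);
+       cudnnDestroyPoolingDescriptor(pool);
+*/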
+cudnnStatus_t CUDNNWINAPI cudnnPoolingForward( + cudnnHandle_t handle, + const cudnnPoolingDescriptor_t poolingDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y ); + +/* Function to perform backward pooling */ +cudnnStatus_t CUDNNWINAPI cudnnPoolingBackward( + cudnnHandle_t handle, + const cudnnPoolingDescriptor_t poolingDesc, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx ); + +/* + * activation mode + */ +typedef enum +{ + CUDNN_ACTIVATION_SIGMOID = 0, + CUDNN_ACTIVATION_RELU = 1, + CUDNN_ACTIVATION_TANH = 2, + CUDNN_ACTIVATION_CLIPPED_RELU = 3, + CUDNN_ACTIVATION_ELU = 4 +} cudnnActivationMode_t; + +/* Activation functions: All of the form "output = alpha * Op(inputs) + beta * output" */ +cudnnStatus_t CUDNNWINAPI cudnnCreateActivationDescriptor( + cudnnActivationDescriptor_t *activationDesc); + +cudnnStatus_t CUDNNWINAPI cudnnSetActivationDescriptor( + cudnnActivationDescriptor_t activationDesc, + cudnnActivationMode_t mode, + cudnnNanPropagation_t reluNanOpt, + double coef ); /* ceiling for clipped RELU, alpha for ELU */ + +cudnnStatus_t CUDNNWINAPI cudnnGetActivationDescriptor( + const cudnnActivationDescriptor_t activationDesc, + cudnnActivationMode_t *mode, + cudnnNanPropagation_t *reluNanOpt, + double* coef ); /* ceiling for clipped RELU, alpha for ELU */ + +cudnnStatus_t CUDNNWINAPI cudnnDestroyActivationDescriptor( + cudnnActivationDescriptor_t activationDesc); + +/* Function to perform forward activation */ +cudnnStatus_t CUDNNWINAPI cudnnActivationForward( + cudnnHandle_t handle, + cudnnActivationDescriptor_t activationDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y ); + +/* Function to perform backward activation */ +cudnnStatus_t CUDNNWINAPI cudnnActivationBackward( + cudnnHandle_t handle, + cudnnActivationDescriptor_t activationDesc, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx ); + +/* +* Create an instance of LRN (Local Response Normalization) descriptor +* Uses lrnN=5, lrnAlpha=1e-4, lrnBeta=0.75, lrnK=2.0 as defaults from Krizhevsky'12 ImageNet paper +*/ +cudnnStatus_t CUDNNWINAPI cudnnCreateLRNDescriptor( + cudnnLRNDescriptor_t *normDesc ); + +#define CUDNN_LRN_MIN_N 1 /* minimum allowed lrnN */ +#define CUDNN_LRN_MAX_N 16 /* maximum allowed lrnN */ +#define CUDNN_LRN_MIN_K 1e-5 /* minimum allowed lrnK */ +#define CUDNN_LRN_MIN_BETA 0.01 /* minimum allowed lrnBeta */ + +/* LRN layer mode */ +typedef enum +{ + CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0,/* Normalize across tensor's dimA[1] dimension */ +} cudnnLRNMode_t; + +/* +* Uses a window [center-lookBehind, center+lookAhead], where +* lookBehind = floor( (lrnN-1)/2 ), lookAhead = lrnN-lookBehind-1. +* Values of double parameters cast to tensor data type. 
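+*
+* Worked example (an editorial addition, not in the original header): with the
+* default lrnN = 5, lookBehind = floor((5-1)/2) = 2 and lookAhead = 5-2-1 = 2,
+* so each element is normalized over the window [center-2, center+2].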
+*/ +cudnnStatus_t CUDNNWINAPI cudnnSetLRNDescriptor( + cudnnLRNDescriptor_t normDesc, + unsigned lrnN, + double lrnAlpha, + double lrnBeta, + double lrnK ); +/* +* Retrieve the settings currently stored in an LRN layer descriptor +* Any of the provided pointers can be NULL (no corresponding value will be returned) +*/ +cudnnStatus_t CUDNNWINAPI cudnnGetLRNDescriptor( + cudnnLRNDescriptor_t normDesc, + unsigned* lrnN, + double* lrnAlpha, + double* lrnBeta, + double* lrnK ); + +/* Destroy an instance of LRN descriptor */ +cudnnStatus_t CUDNNWINAPI cudnnDestroyLRNDescriptor( cudnnLRNDescriptor_t lrnDesc ); + +/* LRN functions: output = alpha * normalize(x) + beta * old_y */ + +/* LRN cross-channel forward computation. Double parameters cast to tensor data type */ +cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelForward( + cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnLRNMode_t lrnMode, + const void* alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y ); + +/* LRN cross-channel backward computation. Double parameters cast to tensor data type */ +cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelBackward( + cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnLRNMode_t lrnMode, + const void* alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx); + +typedef enum +{ + CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0, +} cudnnDivNormMode_t; + +/* LCN/divisive normalization functions: y = alpha * normalize(x) + beta * y */ +cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationForward( + cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */ + const void *x, + const void *means, /* if NULL, means are assumed to be zero */ + void *temp, + void *temp2, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y ); + +cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationBackward( + cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, /* same desc for x, means, dy, temp, temp2 */ + const void *x, + const void *means, /* if NULL, means are assumed to be zero */ + const void *dy, + void *temp, + void *temp2, + const void *beta, + const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */ + void *dx, /* output x differential */ + void *dMeans ); /* output means differential, can be NULL */ + +typedef enum +{ + /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */ + CUDNN_BATCHNORM_PER_ACTIVATION = 0, + + /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */ + CUDNN_BATCHNORM_SPATIAL = 1, + + /* + * bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors). + * May be faster than CUDNN_BATCHNORM_SPATIAL but imposes some limits on the range of values + */ + CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2, +} cudnnBatchNormMode_t; + +#define CUDNN_BN_MIN_EPSILON 1e-5 /* Minimum epsilon allowed to be used in the Batch Normalization formula */ + +/* +* Derives a tensor descriptor from layer data descriptor for BatchNormalization +* scale, invVariance, bnBias, bnScale tensors. 
Use this tensor desc for +* bnScaleBiasMeanVarDesc and bnScaleBiasDiffDesc in Batch Normalization forward and backward functions. +*/ +cudnnStatus_t CUDNNWINAPI cudnnDeriveBNTensorDescriptor( + cudnnTensorDescriptor_t derivedBnDesc, + const cudnnTensorDescriptor_t xDesc, + cudnnBatchNormMode_t mode ); + +/* Computes y = BN(x). Also accumulates moving averages of mean and inverse variances */ +cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardTraining( + cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + + const cudnnTensorDescriptor_t xDesc, + const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t yDesc, + void *y, /* NxCxHxW */ + + /* Shared desc for the next 6 tensors in the argument list. + Data type to be set as follows: + type = (typeOf(x) == double) ? double : float + Dimensions for this descriptor depend on normalization mode + - Spatial Normalization : tensors are expected to have dims 1xCx1x1 + (normalization is performed across NxHxW) + - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW + (normalization is performed across N) */ + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + + /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation */ + const void *bnScale, + const void *bnBias, + + /* MUST use factor=1 in the very first call of a complete training cycle. + Use a factor=1/(1+n) at N-th call to the function to get + Cumulative Moving Average (CMA) behavior + CMA[n] = (x[1]+...+x[n])/n + Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) = + ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) = + CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */ + double exponentialAverageFactor, + + /* Used in Training phase only. + runningMean = newMean*factor + runningMean*(1-factor) */ + void *resultRunningMean, + /* Output in training mode, input in inference. Is the moving average + of variance[x] (factor is applied in the same way as for runningMean) */ + void *resultRunningVariance, + + /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */ + double epsilon, + + /* Optionally save intermediate results from the forward pass here + - can be reused to speed up backward pass. NULL if unused */ + void *resultSaveMean, + void *resultSaveInvVariance ); + +/* +* Performs Batch Normalization during Inference: +* y[i] = bnScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + bnBias[k] +* with bnScale, bnBias, runningMean, runningInvVariance tensors indexed +* according to spatial or per-activation mode. Refer to cudnnBatchNormalizationForwardTraining +* above for notes on function arguments. +*/ +cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardInference( + cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + const cudnnTensorDescriptor_t xDesc, + const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t yDesc, + void *y, /* NxCxHxW */ + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + const void *bnScale, + const void *bnBias, + const void *estimatedMean, + const void *estimatedVariance, + double epsilon ); + +/* Performs backward pass of Batch Normalization layer. 
Returns x gradient, +* bnScale gradient and bnBias gradient */ +cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationBackward( + cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + const void *alphaDataDiff, + const void *betaDataDiff, + const void *alphaParamDiff, + const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */ + const void *x, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + /* Shared tensor desc for the 4 tensors below */ + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const void *bnScale, /* bnBias doesn't affect backpropagation */ + /* scale and bias diff are not backpropagated below this layer */ + void *dBnScaleResult, + void *dBnBiasResult, + /* Same epsilon as forward pass */ + double epsilon, + + /* Optionally cached intermediate results from + forward pass */ + const void *savedMean, + const void *savedInvVariance ); + + +/* APIs for spatial transformer network*/ +typedef enum { + CUDNN_SAMPLER_BILINEAR=0, +} cudnnSamplerType_t; + +cudnnStatus_t CUDNNWINAPI cudnnCreateSpatialTransformerDescriptor( + cudnnSpatialTransformerDescriptor_t *stDesc); + +cudnnStatus_t CUDNNWINAPI cudnnSetSpatialTransformerNdDescriptor( + cudnnSpatialTransformerDescriptor_t stDesc, + cudnnSamplerType_t samplerType, + cudnnDataType_t dataType, + const int nbDims, + const int dimA[]); + +cudnnStatus_t CUDNNWINAPI cudnnDestroySpatialTransformerDescriptor( + cudnnSpatialTransformerDescriptor_t stDesc); + +cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorForward( + cudnnHandle_t handle, + const cudnnSpatialTransformerDescriptor_t stDesc, + const void *theta, + void *grid); + +cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorBackward( + cudnnHandle_t handle, + const cudnnSpatialTransformerDescriptor_t stDesc, + const void *dgrid, + void *dtheta); + +cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerForward( + cudnnHandle_t handle, + cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *grid, + const void *beta, + cudnnTensorDescriptor_t yDesc, + void *y); + +cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerBackward( + cudnnHandle_t handle, + cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + const void *alphaDgrid, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const void *grid, + const void *betaDgrid, + void *dgrid); + +typedef struct cudnnDropoutStruct * cudnnDropoutDescriptor_t; + +cudnnStatus_t CUDNNWINAPI cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t * dropoutDesc); + +cudnnStatus_t CUDNNWINAPI cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc); + +/*helper function to determine size of the states to be passed to cudnnSetDropoutDescriptor */ +cudnnStatus_t CUDNNWINAPI cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t * sizeInBytes); + +/*helper function to determine size of the reserve space to be passed to dropout forward/backward calls */ +cudnnStatus_t CUDNNWINAPI cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t * sizeInBytes); + +cudnnStatus_t CUDNNWINAPI cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, + cudnnHandle_t handle, + float dropout, + void * states, + size_t stateSizeInBytes, + unsigned long long seed); + +// Restores the dropout descriptor to a previously saved-off state 
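+//
+// Illustrative save/restore sketch (an editorial addition, not part of the
+// original NVIDIA header): the same `states` buffer, size and `seed` passed to
+// cudnnSetDropoutDescriptor are replayed here to reproduce the RNG state;
+// `handle` and `drop` are assumed to exist. The declaration follows below.
+//
+//     size_t stateSize = 0;
+//     void *states = NULL;
+//     cudnnDropoutGetStatesSize(handle, &stateSize);
+//     cudaMalloc(&states, stateSize);
+//     cudnnSetDropoutDescriptor(drop, handle, 0.5f, states, stateSize, 1234ULL);
+//     /* ... checkpoint, then later ... */
+//     cudnnRestoreDropoutDescriptor(drop, handle, 0.5f, states, stateSize, 1234ULL);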
+cudnnStatus_t CUDNNWINAPI cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, + cudnnHandle_t handle, + float dropout, + void * states, + size_t stateSizeInBytes, + unsigned long long seed); + +cudnnStatus_t CUDNNWINAPI cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, + cudnnHandle_t handle, + float * dropout, + void ** states, + unsigned long long * seed); + +cudnnStatus_t CUDNNWINAPI cudnnDropoutForward(cudnnHandle_t handle, + const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t xdesc, + const void * x, + const cudnnTensorDescriptor_t ydesc, + void * y, + void * reserveSpace, + size_t reserveSpaceSizeInBytes); + +cudnnStatus_t CUDNNWINAPI cudnnDropoutBackward(cudnnHandle_t handle, + const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t dydesc, + const void * dy, + const cudnnTensorDescriptor_t dxdesc, + void * dx, + void * reserveSpace, + size_t reserveSpaceSizeInBytes); + +/* RNN API */ +typedef enum + { + CUDNN_RNN_RELU = 0, /* Stock RNN with ReLu activation */ + CUDNN_RNN_TANH = 1, /* Stock RNN with tanh activation */ + CUDNN_LSTM = 2, /* LSTM with no peephole connections */ + CUDNN_GRU = 3 /* Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1); */ + } cudnnRNNMode_t; + +typedef enum + { + CUDNN_UNIDIRECTIONAL = 0, + CUDNN_BIDIRECTIONAL = 1 /* Using output concatination at each step. Do we also want to support output sum? */ + } cudnnDirectionMode_t; + +typedef enum + { + CUDNN_LINEAR_INPUT = 0, + CUDNN_SKIP_INPUT = 1 + } cudnnRNNInputMode_t; + + +typedef enum + { + CUDNN_RNN_ALGO_STANDARD = 0, + CUDNN_RNN_ALGO_PERSIST_STATIC = 1, + CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2 + } cudnnRNNAlgo_t; + +struct cudnnRNNStruct; +typedef struct cudnnRNNStruct* cudnnRNNDescriptor_t; + +cudnnStatus_t CUDNNWINAPI cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t * rnnDesc); +cudnnStatus_t CUDNNWINAPI cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc); + +struct cudnnPersistentRNNPlan; +typedef struct cudnnPersistentRNNPlan *cudnnPersistentRNNPlan_t; + + +/* Expensive. Creates the plan for the specific settings. */ +cudnnStatus_t CUDNNWINAPI cudnnCreatePersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, + const int minibatch, + const cudnnDataType_t dataType, + cudnnPersistentRNNPlan_t * plan); + +/* Attaches the plan to the descriptor. */ +cudnnStatus_t CUDNNWINAPI cudnnSetPersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, + cudnnPersistentRNNPlan_t plan); + +cudnnStatus_t CUDNNWINAPI cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan); + +cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + const int hiddenSize, + const int numLayers, + cudnnDropoutDescriptor_t dropoutDesc, /* Between layers, not between recurrent steps. 
*/ + cudnnRNNInputMode_t inputMode, + cudnnDirectionMode_t direction, + cudnnRNNMode_t mode, + cudnnRNNAlgo_t algo, + cudnnDataType_t dataType); + +cudnnStatus_t CUDNNWINAPI cudnnGetRNNDescriptor(cudnnHandle_t cudnnHandle, + cudnnRNNDescriptor_t rnnDesc, + int * hiddenSize, + int * numLayers, + cudnnDropoutDescriptor_t * dropoutDesc, + cudnnRNNInputMode_t * inputMode, + cudnnDirectionMode_t * direction, + cudnnRNNMode_t * mode, + cudnnRNNAlgo_t * algo, + cudnnDataType_t * dataType); + +cudnnStatus_t CUDNNWINAPI cudnnSetRNNMatrixMathType (cudnnRNNDescriptor_t desc, cudnnMathType_t math); + +/* dataType in the RNN descriptor is used to determine math precision */ +/* dataType in weight descriptors and input descriptors is used to describe storage */ +cudnnStatus_t CUDNNWINAPI cudnnGetRNNWorkspaceSize( cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, + const cudnnTensorDescriptor_t *xDesc, + size_t *sizeInBytes); + +cudnnStatus_t CUDNNWINAPI cudnnGetRNNTrainingReserveSize( cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, + const cudnnTensorDescriptor_t *xDesc, + size_t *sizeInBytes); + + +cudnnStatus_t CUDNNWINAPI cudnnGetRNNParamsSize( cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const cudnnTensorDescriptor_t xDesc, + size_t *sizeInBytes, + cudnnDataType_t dataType); + +cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerMatrixParams( cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int layer, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const void * w, + const int linLayerID, + cudnnFilterDescriptor_t linLayerMatDesc, + void ** linLayerMat); + +cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerBiasParams( cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int layer, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const void * w, + const int linLayerID, + cudnnFilterDescriptor_t linLayerBiasDesc, + void ** linLayerBias); + +cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInference( cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, + const cudnnTensorDescriptor_t * xDesc, + const void * x, + const cudnnTensorDescriptor_t hxDesc, + const void * hx, + const cudnnTensorDescriptor_t cxDesc, + const void * cx, + const cudnnFilterDescriptor_t wDesc, + const void * w, + const cudnnTensorDescriptor_t *yDesc, + void * y, + const cudnnTensorDescriptor_t hyDesc, + void * hy, + const cudnnTensorDescriptor_t cyDesc, + void * cy, + void * workspace, + size_t workSpaceSizeInBytes); + +cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTraining( cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, + const cudnnTensorDescriptor_t *xDesc, + const void * x, + const cudnnTensorDescriptor_t hxDesc, + const void * hx, + const cudnnTensorDescriptor_t cxDesc, + const void * cx, + const cudnnFilterDescriptor_t wDesc, + const void * w, + const cudnnTensorDescriptor_t *yDesc, + void * y, + const cudnnTensorDescriptor_t hyDesc, + void * hy, + const cudnnTensorDescriptor_t cyDesc, + void * cy, + void * workspace, + size_t workSpaceSizeInBytes, + void * reserveSpace, + size_t reserveSpaceSizeInBytes); + +cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardData( cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, + const cudnnTensorDescriptor_t * yDesc, + const void * y, + const cudnnTensorDescriptor_t * dyDesc, + const void * dy, + const cudnnTensorDescriptor_t dhyDesc, + const void 
* dhy, + const cudnnTensorDescriptor_t dcyDesc, + const void * dcy, + const cudnnFilterDescriptor_t wDesc, + const void * w, + const cudnnTensorDescriptor_t hxDesc, + const void * hx, + const cudnnTensorDescriptor_t cxDesc, + const void * cx, + const cudnnTensorDescriptor_t * dxDesc, + void * dx, + const cudnnTensorDescriptor_t dhxDesc, + void * dhx, + const cudnnTensorDescriptor_t dcxDesc, + void * dcx, + void * workspace, + size_t workSpaceSizeInBytes, + void * reserveSpace, + size_t reserveSpaceSizeInBytes ); + + +cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeights( cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, + const cudnnTensorDescriptor_t * xDesc, + const void * x, + const cudnnTensorDescriptor_t hxDesc, + const void * hx, + const cudnnTensorDescriptor_t * yDesc, + const void * y, + const void * workspace, + size_t workSpaceSizeInBytes, + const cudnnFilterDescriptor_t dwDesc, + void * dw, + const void * reserveSpace, + size_t reserveSpaceSizeInBytes ); + +typedef enum +{ + CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0, + CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 +}cudnnCTCLossAlgo_t; + +/* +* Create an instance of a CTC (Connectionist Temporal Classification) loss descriptor +*/ +cudnnStatus_t CUDNNWINAPI cudnnCreateCTCLossDescriptor( cudnnCTCLossDescriptor_t* ctcLossDesc ); + +cudnnStatus_t CUDNNWINAPI cudnnSetCTCLossDescriptor( + cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t compType ); + +cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossDescriptor( + cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t* compType ); + +cudnnStatus_t CUDNNWINAPI cudnnDestroyCTCLossDescriptor( cudnnCTCLossDescriptor_t ctcLossDesc ); + +/* return the ctc costs and gradients, given the probabilities and labels */ +cudnnStatus_t CUDNNWINAPI cudnnCTCLoss( cudnnHandle_t handle, + const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the mini batch size, A is the alphabet size) */ + const void * probs, /* probabilities after softmax, in GPU memory */ + const int * labels, /* labels, in CPU memory */ + const int * labelLengths, /* the length of each label, in CPU memory */ + const int * inputLengths, /* the lengths of timing steps in each batch, in CPU memory */ + void * costs, /* the returned costs of CTC, in GPU memory */ + const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */ + const void * gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */ + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + void * workspace, /* pointer to the workspace, in GPU memory */ + size_t workSpaceSizeInBytes); /* the workspace size needed */ + +/* return the workspace size needed for ctc */ +cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossWorkspaceSize( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the mini batch size, A is the alphabet size) */ + const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A. 
To compute costs only, set it to NULL */ + const int * labels, /* labels, in CPU memory */ + const int * labelLengths, /* the length of each label, in CPU memory */ + const int * inputLengths, /* the lengths of timing steps in each batch, in CPU memory */ + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + size_t *sizeInBytes ); /* pointer to the returned workspace size */ + + +/* DEPRECATED routines to be removed next release : + User should use the non-suffixed version (which has the API and functionality of _v6 version) + Routines with _v5 suffix has the functionality of the non-suffixed routines in the CUDNN V6 + */ + +cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v6(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + const int hiddenSize, + const int numLayers, + cudnnDropoutDescriptor_t dropoutDesc, /* Between layers, not between recurrent steps. */ + cudnnRNNInputMode_t inputMode, + cudnnDirectionMode_t direction, + cudnnRNNMode_t mode, + cudnnRNNAlgo_t algo, + cudnnDataType_t dataType); + +cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v5(cudnnRNNDescriptor_t rnnDesc, + int hiddenSize, + int numLayers, + cudnnDropoutDescriptor_t dropoutDesc, /* Between layers, not between recurrent steps. */ + cudnnRNNInputMode_t inputMode, + cudnnDirectionMode_t direction, + cudnnRNNMode_t mode, + cudnnDataType_t dataType); +#if defined (__cplusplus) +} +#endif + +#endif /* CUDNN_H_ */ + diff --git a/include/external/CUDA/cusparse.h b/include/external/CUDA/cusparse.h new file mode 100644 index 000000000..0381c2b0b --- /dev/null +++ b/include/external/CUDA/cusparse.h @@ -0,0 +1,6257 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 
2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(CUSPARSE_H_) +#define CUSPARSE_H_ + + +#ifndef CUSPARSEAPI +#ifdef _WIN32 +#define CUSPARSEAPI __stdcall +#else +#define CUSPARSEAPI +#endif +#endif + +#include "driver_types.h" +#include "cuComplex.h" /* import complex data type */ + +#include "cuda_fp16.h" + +#include "library_types.h" + +#if defined(__cplusplus) +extern "C" { +#endif /* __cplusplus */ + +/* CUSPARSE status type returns */ +typedef enum{ + CUSPARSE_STATUS_SUCCESS=0, + CUSPARSE_STATUS_NOT_INITIALIZED=1, + CUSPARSE_STATUS_ALLOC_FAILED=2, + CUSPARSE_STATUS_INVALID_VALUE=3, + CUSPARSE_STATUS_ARCH_MISMATCH=4, + CUSPARSE_STATUS_MAPPING_ERROR=5, + CUSPARSE_STATUS_EXECUTION_FAILED=6, + CUSPARSE_STATUS_INTERNAL_ERROR=7, + CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED=8, + CUSPARSE_STATUS_ZERO_PIVOT=9 +} cusparseStatus_t; + +/* Opaque structure holding CUSPARSE library context */ +struct cusparseContext; +typedef struct cusparseContext *cusparseHandle_t; + +/* Opaque structure holding the matrix descriptor */ +struct cusparseMatDescr; +typedef struct cusparseMatDescr *cusparseMatDescr_t; + +/* Opaque structure holding the sparse triangular solve information */ +struct cusparseSolveAnalysisInfo; +typedef struct cusparseSolveAnalysisInfo *cusparseSolveAnalysisInfo_t; + +/* Opaque structures holding the sparse triangular solve information */ +struct csrsv2Info; +typedef struct csrsv2Info *csrsv2Info_t; + +struct bsrsv2Info; +typedef struct bsrsv2Info *bsrsv2Info_t; + +struct bsrsm2Info; +typedef struct bsrsm2Info *bsrsm2Info_t; + +/* Opaque structures holding incomplete Cholesky information */ +struct csric02Info; +typedef struct csric02Info *csric02Info_t; + +struct bsric02Info; +typedef struct bsric02Info *bsric02Info_t; + +/* Opaque structures holding incomplete LU information */ +struct csrilu02Info; +typedef struct csrilu02Info *csrilu02Info_t; + +struct bsrilu02Info; +typedef struct bsrilu02Info *bsrilu02Info_t; + +/* Opaque structures holding the hybrid (HYB) storage information */ +struct cusparseHybMat; +typedef struct cusparseHybMat *cusparseHybMat_t; + +/* Opaque structures holding sparse gemm information */ +struct csrgemm2Info; +typedef struct csrgemm2Info *csrgemm2Info_t; + +/* Opaque structure holding the sorting information */ +struct csru2csrInfo; +typedef struct csru2csrInfo *csru2csrInfo_t; + +/* Opaque structure holding the coloring information */ +struct cusparseColorInfo; +typedef struct cusparseColorInfo *cusparseColorInfo_t; + +/* Opaque structure holding the prune information */ +struct pruneInfo; +typedef struct pruneInfo *pruneInfo_t; + +/* Types definitions */ +typedef enum { + CUSPARSE_POINTER_MODE_HOST = 0, + CUSPARSE_POINTER_MODE_DEVICE = 1 +} cusparsePointerMode_t; + +typedef enum { + CUSPARSE_ACTION_SYMBOLIC = 0, + CUSPARSE_ACTION_NUMERIC = 1 +} cusparseAction_t; + +typedef enum { + 
CUSPARSE_MATRIX_TYPE_GENERAL = 0,
+    CUSPARSE_MATRIX_TYPE_SYMMETRIC = 1,
+    CUSPARSE_MATRIX_TYPE_HERMITIAN = 2,
+    CUSPARSE_MATRIX_TYPE_TRIANGULAR = 3
+} cusparseMatrixType_t;
+
+typedef enum {
+    CUSPARSE_FILL_MODE_LOWER = 0,
+    CUSPARSE_FILL_MODE_UPPER = 1
+} cusparseFillMode_t;
+
+typedef enum {
+    CUSPARSE_DIAG_TYPE_NON_UNIT = 0,
+    CUSPARSE_DIAG_TYPE_UNIT = 1
+} cusparseDiagType_t;
+
+typedef enum {
+    CUSPARSE_INDEX_BASE_ZERO = 0,
+    CUSPARSE_INDEX_BASE_ONE = 1
+} cusparseIndexBase_t;
+
+typedef enum {
+    CUSPARSE_OPERATION_NON_TRANSPOSE = 0,
+    CUSPARSE_OPERATION_TRANSPOSE = 1,
+    CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2
+} cusparseOperation_t;
+
+typedef enum {
+    CUSPARSE_DIRECTION_ROW = 0,
+    CUSPARSE_DIRECTION_COLUMN = 1
+} cusparseDirection_t;
+
+typedef enum {
+    CUSPARSE_HYB_PARTITION_AUTO = 0,  // automatically decide how to split the data into regular/irregular part
+    CUSPARSE_HYB_PARTITION_USER = 1,  // store data into regular part up to a user specified threshold
+    CUSPARSE_HYB_PARTITION_MAX = 2    // store all data in the regular part
+} cusparseHybPartition_t;
+
+// used in csrsv2, csric02, and csrilu02
+typedef enum {
+    CUSPARSE_SOLVE_POLICY_NO_LEVEL = 0,  // no level information is generated, only reports structural zero.
+    CUSPARSE_SOLVE_POLICY_USE_LEVEL = 1
+} cusparseSolvePolicy_t;
+
+typedef enum {
+    CUSPARSE_SIDE_LEFT = 0,
+    CUSPARSE_SIDE_RIGHT = 1
+} cusparseSideMode_t;
+
+typedef enum {
+    CUSPARSE_COLOR_ALG0 = 0,  // default
+    CUSPARSE_COLOR_ALG1 = 1
+} cusparseColorAlg_t;
+
+typedef enum {
+    CUSPARSE_ALG0 = 0,  // default, naive
+    CUSPARSE_ALG1 = 1   // merge path
+} cusparseAlgMode_t;
+
+/* CUSPARSE initialization and management routines */
+cusparseStatus_t CUSPARSEAPI cusparseCreate(cusparseHandle_t *handle);
+cusparseStatus_t CUSPARSEAPI cusparseDestroy(cusparseHandle_t handle);
+cusparseStatus_t CUSPARSEAPI cusparseGetVersion(cusparseHandle_t handle, int *version);
+cusparseStatus_t CUSPARSEAPI cusparseGetProperty(libraryPropertyType type, int *value);
+cusparseStatus_t CUSPARSEAPI cusparseSetStream(cusparseHandle_t handle, cudaStream_t streamId);
+cusparseStatus_t CUSPARSEAPI cusparseGetStream(cusparseHandle_t handle, cudaStream_t *streamId);
+
+
+/* CUSPARSE type creation, destruction, set and get routines */
+cusparseStatus_t CUSPARSEAPI cusparseGetPointerMode(cusparseHandle_t handle, cusparsePointerMode_t *mode);
+cusparseStatus_t CUSPARSEAPI cusparseSetPointerMode(cusparseHandle_t handle, cusparsePointerMode_t mode);
+
+/* sparse matrix descriptor */
+/* When the matrix descriptor is created, its fields are initialized to:
+   CUSPARSE_MATRIX_TYPE_GENERAL
+   CUSPARSE_INDEX_BASE_ZERO
+   All other fields are uninitialized
+*/
+cusparseStatus_t CUSPARSEAPI cusparseCreateMatDescr(cusparseMatDescr_t *descrA);
+cusparseStatus_t CUSPARSEAPI cusparseDestroyMatDescr(cusparseMatDescr_t descrA);
+
+cusparseStatus_t CUSPARSEAPI cusparseCopyMatDescr(cusparseMatDescr_t dest, const cusparseMatDescr_t src);
+
+cusparseStatus_t CUSPARSEAPI cusparseSetMatType(cusparseMatDescr_t descrA, cusparseMatrixType_t type);
+cusparseMatrixType_t CUSPARSEAPI cusparseGetMatType(const cusparseMatDescr_t descrA);
+
+cusparseStatus_t CUSPARSEAPI cusparseSetMatFillMode(cusparseMatDescr_t descrA, cusparseFillMode_t fillMode);
+cusparseFillMode_t CUSPARSEAPI cusparseGetMatFillMode(const cusparseMatDescr_t descrA);
+
+cusparseStatus_t CUSPARSEAPI cusparseSetMatDiagType(cusparseMatDescr_t descrA, cusparseDiagType_t diagType);
+cusparseDiagType_t CUSPARSEAPI cusparseGetMatDiagType(const cusparseMatDescr_t descrA);
+
+cusparseStatus_t CUSPARSEAPI cusparseSetMatIndexBase(cusparseMatDescr_t descrA, cusparseIndexBase_t base); +cusparseIndexBase_t CUSPARSEAPI cusparseGetMatIndexBase(const cusparseMatDescr_t descrA); + +/* sparse triangular solve and incomplete-LU and Cholesky (algorithm 1) */ +cusparseStatus_t CUSPARSEAPI cusparseCreateSolveAnalysisInfo(cusparseSolveAnalysisInfo_t *info); +cusparseStatus_t CUSPARSEAPI cusparseDestroySolveAnalysisInfo(cusparseSolveAnalysisInfo_t info); +cusparseStatus_t CUSPARSEAPI cusparseGetLevelInfo(cusparseHandle_t handle, + cusparseSolveAnalysisInfo_t info, + int *nlevels, + int **levelPtr, + int **levelInd); + +/* sparse triangular solve (algorithm 2) */ +cusparseStatus_t CUSPARSEAPI cusparseCreateCsrsv2Info(csrsv2Info_t *info); +cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrsv2Info(csrsv2Info_t info); + +/* incomplete Cholesky (algorithm 2)*/ +cusparseStatus_t CUSPARSEAPI cusparseCreateCsric02Info(csric02Info_t *info); +cusparseStatus_t CUSPARSEAPI cusparseDestroyCsric02Info(csric02Info_t info); + +cusparseStatus_t CUSPARSEAPI cusparseCreateBsric02Info(bsric02Info_t *info); +cusparseStatus_t CUSPARSEAPI cusparseDestroyBsric02Info(bsric02Info_t info); + +/* incomplete LU (algorithm 2) */ +cusparseStatus_t CUSPARSEAPI cusparseCreateCsrilu02Info(csrilu02Info_t *info); +cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrilu02Info(csrilu02Info_t info); + +cusparseStatus_t CUSPARSEAPI cusparseCreateBsrilu02Info(bsrilu02Info_t *info); +cusparseStatus_t CUSPARSEAPI cusparseDestroyBsrilu02Info(bsrilu02Info_t info); + +/* block-CSR triangular solve (algorithm 2) */ +cusparseStatus_t CUSPARSEAPI cusparseCreateBsrsv2Info(bsrsv2Info_t *info); +cusparseStatus_t CUSPARSEAPI cusparseDestroyBsrsv2Info(bsrsv2Info_t info); + +cusparseStatus_t CUSPARSEAPI cusparseCreateBsrsm2Info(bsrsm2Info_t *info); +cusparseStatus_t CUSPARSEAPI cusparseDestroyBsrsm2Info(bsrsm2Info_t info); + +/* hybrid (HYB) format */ +cusparseStatus_t CUSPARSEAPI cusparseCreateHybMat(cusparseHybMat_t *hybA); +cusparseStatus_t CUSPARSEAPI cusparseDestroyHybMat(cusparseHybMat_t hybA); + +/* sorting information */ +cusparseStatus_t CUSPARSEAPI cusparseCreateCsru2csrInfo(csru2csrInfo_t *info); +cusparseStatus_t CUSPARSEAPI cusparseDestroyCsru2csrInfo(csru2csrInfo_t info); + +/* coloring info */ +cusparseStatus_t CUSPARSEAPI cusparseCreateColorInfo(cusparseColorInfo_t *info); +cusparseStatus_t CUSPARSEAPI cusparseDestroyColorInfo(cusparseColorInfo_t info); + +cusparseStatus_t CUSPARSEAPI cusparseSetColorAlgs(cusparseColorInfo_t info, cusparseColorAlg_t alg); +cusparseStatus_t CUSPARSEAPI cusparseGetColorAlgs(cusparseColorInfo_t info, cusparseColorAlg_t *alg); + +/* prune information */ +cusparseStatus_t CUSPARSEAPI cusparseCreatePruneInfo(pruneInfo_t *info); + +cusparseStatus_t CUSPARSEAPI cusparseDestroyPruneInfo(pruneInfo_t info); + + +/* --- Sparse Level 1 routines --- */ + +/* Description: Addition of a scalar multiple of a sparse vector x + and a dense vector y. 
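+
+   As a minimal illustrative sketch (not part of the official API text):
+   assuming a handle created with cusparseCreate and device arrays xVal,
+   xInd and y already populated by the caller, a single-precision call
+   could look like
+
+       float alpha = 2.0f;                       // y := alpha * x + y
+       cusparseStatus_t status =
+           cusparseSaxpyi(handle, nnz, &alpha, xVal, xInd, y,
+                          CUSPARSE_INDEX_BASE_ZERO);
+       // status should be checked against CUSPARSE_STATUS_SUCCESS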
*/
+cusparseStatus_t CUSPARSEAPI cusparseSaxpyi(cusparseHandle_t handle,
+                                            int nnz,
+                                            const float *alpha,
+                                            const float *xVal,
+                                            const int *xInd,
+                                            float *y,
+                                            cusparseIndexBase_t idxBase);
+
+cusparseStatus_t CUSPARSEAPI cusparseDaxpyi(cusparseHandle_t handle,
+                                            int nnz,
+                                            const double *alpha,
+                                            const double *xVal,
+                                            const int *xInd,
+                                            double *y,
+                                            cusparseIndexBase_t idxBase);
+
+cusparseStatus_t CUSPARSEAPI cusparseCaxpyi(cusparseHandle_t handle,
+                                            int nnz,
+                                            const cuComplex *alpha,
+                                            const cuComplex *xVal,
+                                            const int *xInd,
+                                            cuComplex *y,
+                                            cusparseIndexBase_t idxBase);
+
+cusparseStatus_t CUSPARSEAPI cusparseZaxpyi(cusparseHandle_t handle,
+                                            int nnz,
+                                            const cuDoubleComplex *alpha,
+                                            const cuDoubleComplex *xVal,
+                                            const int *xInd,
+                                            cuDoubleComplex *y,
+                                            cusparseIndexBase_t idxBase);
+
+/* Description: dot product of a sparse vector x and a dense vector y. */
+cusparseStatus_t CUSPARSEAPI cusparseSdoti(cusparseHandle_t handle,
+                                           int nnz,
+                                           const float *xVal,
+                                           const int *xInd,
+                                           const float *y,
+                                           float *resultDevHostPtr,
+                                           cusparseIndexBase_t idxBase);
+
+cusparseStatus_t CUSPARSEAPI cusparseDdoti(cusparseHandle_t handle,
+                                           int nnz,
+                                           const double *xVal,
+                                           const int *xInd,
+                                           const double *y,
+                                           double *resultDevHostPtr,
+                                           cusparseIndexBase_t idxBase);
+
+cusparseStatus_t CUSPARSEAPI cusparseCdoti(cusparseHandle_t handle,
+                                           int nnz,
+                                           const cuComplex *xVal,
+                                           const int *xInd,
+                                           const cuComplex *y,
+                                           cuComplex *resultDevHostPtr,
+                                           cusparseIndexBase_t idxBase);
+
+cusparseStatus_t CUSPARSEAPI cusparseZdoti(cusparseHandle_t handle,
+                                           int nnz,
+                                           const cuDoubleComplex *xVal,
+                                           const int *xInd,
+                                           const cuDoubleComplex *y,
+                                           cuDoubleComplex *resultDevHostPtr,
+                                           cusparseIndexBase_t idxBase);
+
+/* Description: dot product of complex conjugate of a sparse vector x
+   and a dense vector y. */
+cusparseStatus_t CUSPARSEAPI cusparseCdotci(cusparseHandle_t handle,
+                                            int nnz,
+                                            const cuComplex *xVal,
+                                            const int *xInd,
+                                            const cuComplex *y,
+                                            cuComplex *resultDevHostPtr,
+                                            cusparseIndexBase_t idxBase);
+
+cusparseStatus_t CUSPARSEAPI cusparseZdotci(cusparseHandle_t handle,
+                                            int nnz,
+                                            const cuDoubleComplex *xVal,
+                                            const int *xInd,
+                                            const cuDoubleComplex *y,
+                                            cuDoubleComplex *resultDevHostPtr,
+                                            cusparseIndexBase_t idxBase);
+
+
+/* Description: Gather of non-zero elements from dense vector y into
+   sparse vector x. */
+cusparseStatus_t CUSPARSEAPI cusparseSgthr(cusparseHandle_t handle,
+                                           int nnz,
+                                           const float *y,
+                                           float *xVal,
+                                           const int *xInd,
+                                           cusparseIndexBase_t idxBase);
+
+cusparseStatus_t CUSPARSEAPI cusparseDgthr(cusparseHandle_t handle,
+                                           int nnz,
+                                           const double *y,
+                                           double *xVal,
+                                           const int *xInd,
+                                           cusparseIndexBase_t idxBase);
+
+cusparseStatus_t CUSPARSEAPI cusparseCgthr(cusparseHandle_t handle,
+                                           int nnz,
+                                           const cuComplex *y,
+                                           cuComplex *xVal,
+                                           const int *xInd,
+                                           cusparseIndexBase_t idxBase);
+
+cusparseStatus_t CUSPARSEAPI cusparseZgthr(cusparseHandle_t handle,
+                                           int nnz,
+                                           const cuDoubleComplex *y,
+                                           cuDoubleComplex *xVal,
+                                           const int *xInd,
+                                           cusparseIndexBase_t idxBase);
+
+/* Description: Gather of non-zero elements from dense vector y into
+   sparse vector x (also replacing these elements in y by zeros).
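+
+   Illustrative sketch only (handle, nnz and the device arrays y, xVal,
+   xInd are assumed to be set up by the caller): gathering the entries
+   listed in xInd out of y while zeroing them in y,
+
+       cusparseStatus_t status =
+           cusparseSgthrz(handle, nnz, y, xVal, xInd,
+                          CUSPARSE_INDEX_BASE_ZERO);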
*/ +cusparseStatus_t CUSPARSEAPI cusparseSgthrz(cusparseHandle_t handle, + int nnz, + float *y, + float *xVal, + const int *xInd, + cusparseIndexBase_t idxBase); + +cusparseStatus_t CUSPARSEAPI cusparseDgthrz(cusparseHandle_t handle, + int nnz, + double *y, + double *xVal, + const int *xInd, + cusparseIndexBase_t idxBase); + +cusparseStatus_t CUSPARSEAPI cusparseCgthrz(cusparseHandle_t handle, + int nnz, + cuComplex *y, + cuComplex *xVal, + const int *xInd, + cusparseIndexBase_t idxBase); + +cusparseStatus_t CUSPARSEAPI cusparseZgthrz(cusparseHandle_t handle, + int nnz, + cuDoubleComplex *y, + cuDoubleComplex *xVal, + const int *xInd, + cusparseIndexBase_t idxBase); + +/* Description: Scatter of elements of the sparse vector x into + dense vector y. */ +cusparseStatus_t CUSPARSEAPI cusparseSsctr(cusparseHandle_t handle, + int nnz, + const float *xVal, + const int *xInd, + float *y, + cusparseIndexBase_t idxBase); + +cusparseStatus_t CUSPARSEAPI cusparseDsctr(cusparseHandle_t handle, + int nnz, + const double *xVal, + const int *xInd, + double *y, + cusparseIndexBase_t idxBase); + +cusparseStatus_t CUSPARSEAPI cusparseCsctr(cusparseHandle_t handle, + int nnz, + const cuComplex *xVal, + const int *xInd, + cuComplex *y, + cusparseIndexBase_t idxBase); + +cusparseStatus_t CUSPARSEAPI cusparseZsctr(cusparseHandle_t handle, + int nnz, + const cuDoubleComplex *xVal, + const int *xInd, + cuDoubleComplex *y, + cusparseIndexBase_t idxBase); + +/* Description: Givens rotation, where c and s are cosine and sine, + x and y are sparse and dense vectors, respectively. */ +cusparseStatus_t CUSPARSEAPI cusparseSroti(cusparseHandle_t handle, + int nnz, + float *xVal, + const int *xInd, + float *y, + const float *c, + const float *s, + cusparseIndexBase_t idxBase); + +cusparseStatus_t CUSPARSEAPI cusparseDroti(cusparseHandle_t handle, + int nnz, + double *xVal, + const int *xInd, + double *y, + const double *c, + const double *s, + cusparseIndexBase_t idxBase); + + +/* --- Sparse Level 2 routines --- */ + +cusparseStatus_t CUSPARSEAPI cusparseSgemvi(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + int nnz, + const float *xVal, + const int *xInd, + const float *beta, /* host or device pointer */ + float *y, + cusparseIndexBase_t idxBase, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseSgemvi_bufferSize( cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + int nnz, + int *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseDgemvi(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + int nnz, + const double *xVal, + const int *xInd, + const double *beta, /* host or device pointer */ + double *y, + cusparseIndexBase_t idxBase, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDgemvi_bufferSize( cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + int nnz, + int *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseCgemvi(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + int nnz, + const cuComplex *xVal, + const int *xInd, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, + cusparseIndexBase_t idxBase, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCgemvi_bufferSize( cusparseHandle_t handle, + 
cusparseOperation_t transA, + int m, + int n, + int nnz, + int *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseZgemvi(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + int nnz, + const cuDoubleComplex *xVal, + const int *xInd, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, + cusparseIndexBase_t idxBase, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZgemvi_bufferSize( cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + int nnz, + int *pBufferSize); + + +/* Description: Matrix-vector multiplication y = alpha * op(A) * x + beta * y, + where A is a sparse matrix in CSR storage format, x and y are dense vectors. */ +cusparseStatus_t CUSPARSEAPI cusparseScsrmv(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const float *alpha, + const cusparseMatDescr_t descrA, + const float *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const float *x, + const float *beta, + float *y); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrmv(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const double *alpha, + const cusparseMatDescr_t descrA, + const double *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const double *x, + const double *beta, + double *y); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrmv(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const cuComplex *alpha, + const cusparseMatDescr_t descrA, + const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const cuComplex *x, + const cuComplex *beta, + cuComplex *y); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrmv(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const cuDoubleComplex *alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const cuDoubleComplex *x, + const cuDoubleComplex *beta, + cuDoubleComplex *y); + +//Returns number of bytes +cusparseStatus_t CUSPARSEAPI cusparseCsrmvEx_bufferSize(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const void *alpha, + cudaDataType alphatype, + const cusparseMatDescr_t descrA, + const void *csrValA, + cudaDataType csrValAtype, + const int *csrRowPtrA, + const int *csrColIndA, + const void *x, + cudaDataType xtype, + const void *beta, + cudaDataType betatype, + void *y, + cudaDataType ytype, + cudaDataType executiontype, + size_t *bufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseCsrmvEx(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const void *alpha, + cudaDataType alphatype, + const cusparseMatDescr_t descrA, + const void *csrValA, + cudaDataType csrValAtype, + const int *csrRowPtrA, + const int *csrColIndA, + const void *x, + cudaDataType xtype, + const void *beta, + cudaDataType betatype, + void *y, + cudaDataType ytype, + cudaDataType executiontype, + void* buffer); + +/* Description: Matrix-vector multiplication y = alpha * op(A) * x + beta * y, + where A is a sparse matrix in CSR storage format, x and y are dense vectors + using a Merge Path load-balancing implementation. 
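+
+   A hedged usage sketch for this CSR SpMV family (the _mp variants
+   declared below share the signature of the csrmv routines above; all
+   array arguments are assumed to be device pointers prepared by the
+   caller):
+
+       cusparseMatDescr_t descrA;
+       cusparseCreateMatDescr(&descrA);  // defaults: GENERAL, index base 0
+       float alpha = 1.0f, beta = 0.0f;  // y := A * x
+       cusparseScsrmv_mp(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                         m, n, nnz, &alpha, descrA,
+                         csrSortedValA, csrSortedRowPtrA, csrSortedColIndA,
+                         x, &beta, y);
+       cusparseDestroyMatDescr(descrA);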
*/ + cusparseStatus_t CUSPARSEAPI cusparseScsrmv_mp(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const float *alpha, + const cusparseMatDescr_t descrA, + const float *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const float *x, + const float *beta, + float *y); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrmv_mp(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const double *alpha, + const cusparseMatDescr_t descrA, + const double *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const double *x, + const double *beta, + double *y); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrmv_mp(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const cuComplex *alpha, + const cusparseMatDescr_t descrA, + const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const cuComplex *x, + const cuComplex *beta, + cuComplex *y); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrmv_mp(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const cuDoubleComplex *alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const cuDoubleComplex *x, + const cuDoubleComplex *beta, + cuDoubleComplex *y); + + +/* Description: Matrix-vector multiplication y = alpha * op(A) * x + beta * y, + where A is a sparse matrix in HYB storage format, x and y are dense vectors. */ +cusparseStatus_t CUSPARSEAPI cusparseShybmv(cusparseHandle_t handle, + cusparseOperation_t transA, + const float *alpha, + const cusparseMatDescr_t descrA, + const cusparseHybMat_t hybA, + const float *x, + const float *beta, + float *y); + +cusparseStatus_t CUSPARSEAPI cusparseDhybmv(cusparseHandle_t handle, + cusparseOperation_t transA, + const double *alpha, + const cusparseMatDescr_t descrA, + const cusparseHybMat_t hybA, + const double *x, + const double *beta, + double *y); + +cusparseStatus_t CUSPARSEAPI cusparseChybmv(cusparseHandle_t handle, + cusparseOperation_t transA, + const cuComplex *alpha, + const cusparseMatDescr_t descrA, + const cusparseHybMat_t hybA, + const cuComplex *x, + const cuComplex *beta, + cuComplex *y); + +cusparseStatus_t CUSPARSEAPI cusparseZhybmv(cusparseHandle_t handle, + cusparseOperation_t transA, + const cuDoubleComplex *alpha, + const cusparseMatDescr_t descrA, + const cusparseHybMat_t hybA, + const cuDoubleComplex *x, + const cuDoubleComplex *beta, + cuDoubleComplex *y); + +/* Description: Matrix-vector multiplication y = alpha * op(A) * x + beta * y, + where A is a sparse matrix in BSR storage format, x and y are dense vectors. 
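+
+   Illustrative sketch only (mb/nb are block-row/block-column counts,
+   nnzb the number of non-zero blocks, blockDim the dimension of each
+   dense block; descrA and the device arrays are assumed to be prepared
+   by the caller):
+
+       float alpha = 1.0f, beta = 1.0f;  // y := A * x + y
+       cusparseSbsrmv(handle, CUSPARSE_DIRECTION_ROW,
+                      CUSPARSE_OPERATION_NON_TRANSPOSE,
+                      mb, nb, nnzb, &alpha, descrA,
+                      bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA,
+                      blockDim, x, &beta, y);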
*/ +cusparseStatus_t CUSPARSEAPI cusparseSbsrmv(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nb, + int nnzb, + const float *alpha, + const cusparseMatDescr_t descrA, + const float *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int blockDim, + const float *x, + const float *beta, + float *y); + +cusparseStatus_t CUSPARSEAPI cusparseDbsrmv(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nb, + int nnzb, + const double *alpha, + const cusparseMatDescr_t descrA, + const double *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int blockDim, + const double *x, + const double *beta, + double *y); + +cusparseStatus_t CUSPARSEAPI cusparseCbsrmv(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nb, + int nnzb, + const cuComplex *alpha, + const cusparseMatDescr_t descrA, + const cuComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int blockDim, + const cuComplex *x, + const cuComplex *beta, + cuComplex *y); + +cusparseStatus_t CUSPARSEAPI cusparseZbsrmv(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nb, + int nnzb, + const cuDoubleComplex *alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int blockDim, + const cuDoubleComplex *x, + const cuDoubleComplex *beta, + cuDoubleComplex *y); + +/* Description: Matrix-vector multiplication y = alpha * op(A) * x + beta * y, + where A is a sparse matrix in extended BSR storage format, x and y are dense + vectors. */ +cusparseStatus_t CUSPARSEAPI cusparseSbsrxmv(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int sizeOfMask, + int mb, + int nb, + int nnzb, + const float *alpha, + const cusparseMatDescr_t descrA, + const float *bsrSortedValA, + const int *bsrSortedMaskPtrA, + const int *bsrSortedRowPtrA, + const int *bsrSortedEndPtrA, + const int *bsrSortedColIndA, + int blockDim, + const float *x, + const float *beta, + float *y); + + +cusparseStatus_t CUSPARSEAPI cusparseDbsrxmv(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int sizeOfMask, + int mb, + int nb, + int nnzb, + const double *alpha, + const cusparseMatDescr_t descrA, + const double *bsrSortedValA, + const int *bsrSortedMaskPtrA, + const int *bsrSortedRowPtrA, + const int *bsrSortedEndPtrA, + const int *bsrSortedColIndA, + int blockDim, + const double *x, + const double *beta, + double *y); + +cusparseStatus_t CUSPARSEAPI cusparseCbsrxmv(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int sizeOfMask, + int mb, + int nb, + int nnzb, + const cuComplex *alpha, + const cusparseMatDescr_t descrA, + const cuComplex *bsrSortedValA, + const int *bsrSortedMaskPtrA, + const int *bsrSortedRowPtrA, + const int *bsrSortedEndPtrA, + const int *bsrSortedColIndA, + int blockDim, + const cuComplex *x, + const cuComplex *beta, + cuComplex *y); + + +cusparseStatus_t CUSPARSEAPI cusparseZbsrxmv(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int sizeOfMask, + int mb, + int nb, + int nnzb, + const cuDoubleComplex *alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *bsrSortedValA, + const int *bsrSortedMaskPtrA, + const int *bsrSortedRowPtrA, + const int 
*bsrSortedEndPtrA, + const int *bsrSortedColIndA, + int blockDim, + const cuDoubleComplex *x, + const cuDoubleComplex *beta, + cuDoubleComplex *y); + +/* Description: Solution of triangular linear system op(A) * x = alpha * f, + where A is a sparse matrix in CSR storage format, rhs f and solution x + are dense vectors. This routine implements algorithm 1 for the solve. */ +cusparseStatus_t CUSPARSEAPI cusparseCsrsv_analysisEx(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const void *csrSortedValA, + cudaDataType csrSortedValAtype, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseSolveAnalysisInfo_t info, + cudaDataType executiontype); + +cusparseStatus_t CUSPARSEAPI cusparseScsrsv_analysis(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const float *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseSolveAnalysisInfo_t info); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrsv_analysis(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const double *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseSolveAnalysisInfo_t info); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrsv_analysis(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseSolveAnalysisInfo_t info); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrsv_analysis(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseSolveAnalysisInfo_t info); + +cusparseStatus_t CUSPARSEAPI cusparseCsrsv_solveEx(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + const void *alpha, + cudaDataType alphatype, + const cusparseMatDescr_t descrA, + const void *csrSortedValA, + cudaDataType csrSortedValAtype, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseSolveAnalysisInfo_t info, + const void *f, + cudaDataType ftype, + void *x, + cudaDataType xtype, + cudaDataType executiontype); + +cusparseStatus_t CUSPARSEAPI cusparseScsrsv_solve(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + const float *alpha, + const cusparseMatDescr_t descrA, + const float *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseSolveAnalysisInfo_t info, + const float *f, + float *x); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrsv_solve(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + const double *alpha, + const cusparseMatDescr_t descrA, + const double *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseSolveAnalysisInfo_t info, + const double *f, + double *x); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrsv_solve(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + const cuComplex *alpha, + const cusparseMatDescr_t descrA, + const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseSolveAnalysisInfo_t info, + const cuComplex *f, + cuComplex *x); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrsv_solve(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + const cuDoubleComplex 
*alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseSolveAnalysisInfo_t info, + const cuDoubleComplex *f, + cuDoubleComplex *x); + +/* Description: Solution of triangular linear system op(A) * x = alpha * f, + where A is a sparse matrix in CSR storage format, rhs f and solution y + are dense vectors. This routine implements algorithm 1 for this problem. + Also, it provides a utility function to query size of buffer used. */ +cusparseStatus_t CUSPARSEAPI cusparseXcsrsv2_zeroPivot(cusparseHandle_t handle, + csrsv2Info_t info, + int *position); + +cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_bufferSize(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + float *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csrsv2Info_t info, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_bufferSize(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csrsv2Info_t info, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_bufferSize(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csrsv2Info_t info, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_bufferSize(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csrsv2Info_t info, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_bufferSizeExt(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + float *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csrsv2Info_t info, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_bufferSizeExt(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csrsv2Info_t info, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_bufferSizeExt(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csrsv2Info_t info, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_bufferSizeExt(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csrsv2Info_t info, + size_t *pBufferSize); + + +cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_analysis(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const float *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csrsv2Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_analysis(cusparseHandle_t handle, + cusparseOperation_t transA, + 
int m, + int nnz, + const cusparseMatDescr_t descrA, + const double *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csrsv2Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_analysis(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csrsv2Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_analysis(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csrsv2Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_solve(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const float *alpha, + const cusparseMatDescr_t descrA, + const float *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csrsv2Info_t info, + const float *f, + float *x, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_solve(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const double *alpha, + const cusparseMatDescr_t descrA, + const double *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csrsv2Info_t info, + const double *f, + double *x, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_solve(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cuComplex *alpha, + const cusparseMatDescr_t descrA, + const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csrsv2Info_t info, + const cuComplex *f, + cuComplex *x, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_solve(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cuDoubleComplex *alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csrsv2Info_t info, + const cuDoubleComplex *f, + cuDoubleComplex *x, + cusparseSolvePolicy_t policy, + void *pBuffer); + +/* Description: Solution of triangular linear system op(A) * x = alpha * f, + where A is a sparse matrix in block-CSR storage format, rhs f and solution y + are dense vectors. This routine implements algorithm 2 for this problem. + Also, it provides a utility function to query size of buffer used. 
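+
+   The *sv2 routines follow a query/analyse/solve pattern. A hedged
+   single-precision sketch (info from cusparseCreateBsrsv2Info, pBuffer
+   allocated with cudaMalloc once the size query below returns, and all
+   matrix/vector arrays assumed to be device pointers):
+
+       int bufSize, position;
+       cusparseSbsrsv2_bufferSize(handle, dirA, transA, mb, nnzb, descrA,
+                                  bsrSortedValA, bsrSortedRowPtrA,
+                                  bsrSortedColIndA, blockDim, info,
+                                  &bufSize);
+       cusparseSbsrsv2_analysis(handle, dirA, transA, mb, nnzb, descrA,
+                                bsrSortedValA, bsrSortedRowPtrA,
+                                bsrSortedColIndA, blockDim, info,
+                                CUSPARSE_SOLVE_POLICY_USE_LEVEL, pBuffer);
+       cusparseSbsrsv2_solve(handle, dirA, transA, mb, nnzb, &alpha,
+                             descrA, bsrSortedValA, bsrSortedRowPtrA,
+                             bsrSortedColIndA, blockDim, info, f, x,
+                             CUSPARSE_SOLVE_POLICY_USE_LEVEL, pBuffer);
+       cusparseXbsrsv2_zeroPivot(handle, info, &position);  // structural zero check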
*/ +cusparseStatus_t CUSPARSEAPI cusparseXbsrsv2_zeroPivot(cusparseHandle_t handle, + bsrsv2Info_t info, + int *position); + + +cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + float *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + double *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + float *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int blockSize, + bsrsv2Info_t info, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + double *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int blockSize, + bsrsv2Info_t info, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int blockSize, + bsrsv2Info_t info, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int blockSize, + bsrsv2Info_t info, + size_t *pBufferSize); + + +cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + const float *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + const double 
*bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + const cuComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + + +cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_solve(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const float *alpha, + const cusparseMatDescr_t descrA, + const float *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + const float *f, + float *x, + cusparseSolvePolicy_t policy, + void *pBuffer); + + +cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_solve(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const double *alpha, + const cusparseMatDescr_t descrA, + const double *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + const double *f, + double *x, + cusparseSolvePolicy_t policy, + void *pBuffer); + + +cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_solve(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cuComplex *alpha, + const cusparseMatDescr_t descrA, + const cuComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + const cuComplex *f, + cuComplex *x, + cusparseSolvePolicy_t policy, + void *pBuffer); + + +cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_solve(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cuDoubleComplex *alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + const cuDoubleComplex *f, + cuDoubleComplex *x, + cusparseSolvePolicy_t policy, + void *pBuffer); + +/* Description: Solution of triangular linear system op(A) * x = alpha * f, + where A is a sparse matrix in HYB storage format, rhs f and solution x + are dense vectors. 
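+
+   Sketch of the two-phase HYB triangular solve, assuming hybA already
+   holds a triangular matrix, descrA is configured accordingly and info
+   comes from cusparseCreateSolveAnalysisInfo:
+
+       cusparseShybsv_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                               descrA, hybA, info);
+       cusparseShybsv_solve(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                            &alpha, descrA, hybA, info, f, x);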
*/ +cusparseStatus_t CUSPARSEAPI cusparseShybsv_analysis(cusparseHandle_t handle, + cusparseOperation_t transA, + const cusparseMatDescr_t descrA, + cusparseHybMat_t hybA, + cusparseSolveAnalysisInfo_t info); + +cusparseStatus_t CUSPARSEAPI cusparseDhybsv_analysis(cusparseHandle_t handle, + cusparseOperation_t transA, + const cusparseMatDescr_t descrA, + cusparseHybMat_t hybA, + cusparseSolveAnalysisInfo_t info); + +cusparseStatus_t CUSPARSEAPI cusparseChybsv_analysis(cusparseHandle_t handle, + cusparseOperation_t transA, + const cusparseMatDescr_t descrA, + cusparseHybMat_t hybA, + cusparseSolveAnalysisInfo_t info); + +cusparseStatus_t CUSPARSEAPI cusparseZhybsv_analysis(cusparseHandle_t handle, + cusparseOperation_t transA, + const cusparseMatDescr_t descrA, + cusparseHybMat_t hybA, + cusparseSolveAnalysisInfo_t info); + +cusparseStatus_t CUSPARSEAPI cusparseShybsv_solve(cusparseHandle_t handle, + cusparseOperation_t trans, + const float *alpha, + const cusparseMatDescr_t descra, + const cusparseHybMat_t hybA, + cusparseSolveAnalysisInfo_t info, + const float *f, + float *x); + +cusparseStatus_t CUSPARSEAPI cusparseChybsv_solve(cusparseHandle_t handle, + cusparseOperation_t trans, + const cuComplex *alpha, + const cusparseMatDescr_t descra, + const cusparseHybMat_t hybA, + cusparseSolveAnalysisInfo_t info, + const cuComplex *f, + cuComplex *x); + +cusparseStatus_t CUSPARSEAPI cusparseDhybsv_solve(cusparseHandle_t handle, + cusparseOperation_t trans, + const double *alpha, + const cusparseMatDescr_t descra, + const cusparseHybMat_t hybA, + cusparseSolveAnalysisInfo_t info, + const double *f, + double *x); + +cusparseStatus_t CUSPARSEAPI cusparseZhybsv_solve(cusparseHandle_t handle, + cusparseOperation_t trans, + const cuDoubleComplex *alpha, + const cusparseMatDescr_t descra, + const cusparseHybMat_t hybA, + cusparseSolveAnalysisInfo_t info, + const cuDoubleComplex *f, + cuDoubleComplex *x); + + +/* --- Sparse Level 3 routines --- */ + +/* Description: sparse - dense matrix multiplication C = alpha * op(A) * B + beta * C, + where A is a sparse matrix in CSR format, B and C are dense tall matrices. 
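+
+   Illustrative call only (C is m x n column-major with leading
+   dimension ldc, B is k x n with leading dimension ldb; descrA and the
+   device arrays are assumed to be prepared by the caller):
+
+       float alpha = 1.0f, beta = 0.0f;  // C := A * B
+       cusparseScsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                      m, n, k, nnz, &alpha, descrA,
+                      csrSortedValA, csrSortedRowPtrA, csrSortedColIndA,
+                      B, ldb, &beta, C, ldc);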
*/ +cusparseStatus_t CUSPARSEAPI cusparseScsrmm(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + int k, + int nnz, + const float *alpha, + const cusparseMatDescr_t descrA, + const float *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const float *B, + int ldb, + const float *beta, + float *C, + int ldc); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrmm(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + int k, + int nnz, + const double *alpha, + const cusparseMatDescr_t descrA, + const double *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const double *B, + int ldb, + const double *beta, + double *C, + int ldc); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrmm(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + int k, + int nnz, + const cuComplex *alpha, + const cusparseMatDescr_t descrA, + const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const cuComplex *B, + int ldb, + const cuComplex *beta, + cuComplex *C, + int ldc); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrmm(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + int k, + int nnz, + const cuDoubleComplex *alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const cuDoubleComplex *B, + int ldb, + const cuDoubleComplex *beta, + cuDoubleComplex *C, + int ldc); + +/* Description: sparse - dense matrix multiplication C = alpha * op(A) * B + beta * C, + where A is a sparse matrix in CSR format, B and C are dense tall matrices. + This routine allows transposition of matrix B, which may improve performance. */ +cusparseStatus_t CUSPARSEAPI cusparseScsrmm2(cusparseHandle_t handle, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int n, + int k, + int nnz, + const float *alpha, + const cusparseMatDescr_t descrA, + const float *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const float *B, + int ldb, + const float *beta, + float *C, + int ldc); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrmm2(cusparseHandle_t handle, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int n, + int k, + int nnz, + const double *alpha, + const cusparseMatDescr_t descrA, + const double *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const double *B, + int ldb, + const double *beta, + double *C, + int ldc); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrmm2(cusparseHandle_t handle, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int n, + int k, + int nnz, + const cuComplex *alpha, + const cusparseMatDescr_t descrA, + const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const cuComplex *B, + int ldb, + const cuComplex *beta, + cuComplex *C, + int ldc); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrmm2(cusparseHandle_t handle, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int n, + int k, + int nnz, + const cuDoubleComplex *alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const cuDoubleComplex *B, + int ldb, + const cuDoubleComplex *beta, + cuDoubleComplex *C, + int ldc); + +/* Description: sparse - dense matrix multiplication C = alpha * op(A) * B + beta * C, + where A is a sparse matrix in 
block-CSR format, B and C are dense tall matrices. + This routine allows transposition of matrix B, which may improve performance. */ +cusparseStatus_t CUSPARSEAPI cusparseSbsrmm(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transB, + int mb, + int n, + int kb, + int nnzb, + const float *alpha, + const cusparseMatDescr_t descrA, + const float *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + const int blockSize, + const float *B, + const int ldb, + const float *beta, + float *C, + int ldc); + +cusparseStatus_t CUSPARSEAPI cusparseDbsrmm(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transB, + int mb, + int n, + int kb, + int nnzb, + const double *alpha, + const cusparseMatDescr_t descrA, + const double *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + const int blockSize, + const double *B, + const int ldb, + const double *beta, + double *C, + int ldc); + +cusparseStatus_t CUSPARSEAPI cusparseCbsrmm(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transB, + int mb, + int n, + int kb, + int nnzb, + const cuComplex *alpha, + const cusparseMatDescr_t descrA, + const cuComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + const int blockSize, + const cuComplex *B, + const int ldb, + const cuComplex *beta, + cuComplex *C, + int ldc); + +cusparseStatus_t CUSPARSEAPI cusparseZbsrmm(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transB, + int mb, + int n, + int kb, + int nnzb, + const cuDoubleComplex *alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + const int blockSize, + const cuDoubleComplex *B, + const int ldb, + const cuDoubleComplex *beta, + cuDoubleComplex *C, + int ldc); + + +/* Description: dense - sparse matrix multiplication C = alpha * A * B + beta * C, + where A is column-major dense matrix, B is a sparse matrix in CSC format, + and C is column-major dense matrix. 
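+
+   Illustrative call only (A is m x k dense, B is k x n sparse in CSC
+   with nnz non-zeros, C is m x n dense; lda/ldc are leading dimensions
+   and the arrays are assumed to be device pointers):
+
+       float alpha = 1.0f, beta = 0.0f;  // C := A * B
+       cusparseSgemmi(handle, m, n, k, nnz, &alpha, A, lda,
+                      cscValB, cscColPtrB, cscRowIndB, &beta, C, ldc);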
*/ +cusparseStatus_t CUSPARSEAPI cusparseSgemmi(cusparseHandle_t handle, + int m, + int n, + int k, + int nnz, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + const float *cscValB, + const int *cscColPtrB, + const int *cscRowIndB, + const float *beta, /* host or device pointer */ + float *C, + int ldc); + +cusparseStatus_t CUSPARSEAPI cusparseDgemmi(cusparseHandle_t handle, + int m, + int n, + int k, + int nnz, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + const double *cscValB, + const int *cscColPtrB, + const int *cscRowIndB, + const double *beta, /* host or device pointer */ + double *C, + int ldc); + +cusparseStatus_t CUSPARSEAPI cusparseCgemmi(cusparseHandle_t handle, + int m, + int n, + int k, + int nnz, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *cscValB, + const int *cscColPtrB, + const int *cscRowIndB, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, + int ldc); + +cusparseStatus_t CUSPARSEAPI cusparseZgemmi(cusparseHandle_t handle, + int m, + int n, + int k, + int nnz, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *cscValB, + const int *cscColPtrB, + const int *cscRowIndB, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, + int ldc); + + +/* Description: Solution of triangular linear system op(A) * X = alpha * F, + with multiple right-hand-sides, where A is a sparse matrix in CSR storage + format, rhs F and solution X are dense tall matrices. + This routine implements algorithm 1 for this problem. */ +cusparseStatus_t CUSPARSEAPI cusparseScsrsm_analysis(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const float *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseSolveAnalysisInfo_t info); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrsm_analysis(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const double *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseSolveAnalysisInfo_t info); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrsm_analysis(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseSolveAnalysisInfo_t info); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrsm_analysis(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseSolveAnalysisInfo_t info); + + +cusparseStatus_t CUSPARSEAPI cusparseScsrsm_solve(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + const float *alpha, + const cusparseMatDescr_t descrA, + const float *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseSolveAnalysisInfo_t info, + const float *F, + int ldf, + float *X, + int ldx); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrsm_solve(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + const double *alpha, + const cusparseMatDescr_t descrA, + const double *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + 
cusparseSolveAnalysisInfo_t info, + const double *F, + int ldf, + double *X, + int ldx); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrsm_solve(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + const cuComplex *alpha, + const cusparseMatDescr_t descrA, + const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseSolveAnalysisInfo_t info, + const cuComplex *F, + int ldf, + cuComplex *X, + int ldx); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrsm_solve(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + const cuDoubleComplex *alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseSolveAnalysisInfo_t info, + const cuDoubleComplex *F, + int ldf, + cuDoubleComplex *X, + int ldx); + +/* Description: Solution of triangular linear system op(A) * X = alpha * F, + with multiple right-hand-sides, where A is a sparse matrix in CSR storage + format, rhs F and solution X are dense tall matrices. + This routine implements algorithm 2 for this problem. */ +cusparseStatus_t CUSPARSEAPI cusparseXbsrsm2_zeroPivot(cusparseHandle_t handle, + bsrsm2Info_t info, + int *position); + +cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + float *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + double *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + int *pBufferSizeInBytes); + + +cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transB, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + float *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transB, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + double *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int 
blockSize, + bsrsm2Info_t info, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transB, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transB, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + size_t *pBufferSize); + + +cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + const float *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + const double *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + const cuComplex *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + + +cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_solve(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const float *alpha, + const cusparseMatDescr_t descrA, + const float *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + const float *F, + int ldf, + float *X, + int ldx, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_solve(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const double *alpha, + const cusparseMatDescr_t descrA, + const double *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + const double *F, + int ldf, + double *X, + int ldx, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI 
cusparseCbsrsm2_solve(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cuComplex *alpha, + const cusparseMatDescr_t descrA, + const cuComplex *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + const cuComplex *F, + int ldf, + cuComplex *X, + int ldx, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_solve(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cuDoubleComplex *alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + const cuDoubleComplex *F, + int ldf, + cuDoubleComplex *X, + int ldx, + cusparseSolvePolicy_t policy, + void *pBuffer); + + +/* --- Preconditioners --- */ + +/* Description: Compute the incomplete-LU factorization with 0 fill-in (ILU0) + of the matrix A stored in CSR format based on the information in the opaque + structure info that was obtained from the analysis phase (csrsv_analysis). + This routine implements algorithm 1 for this problem. */ +cusparseStatus_t CUSPARSEAPI cusparseCsrilu0Ex(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + const cusparseMatDescr_t descrA, + void *csrSortedValA_ValM, + cudaDataType csrSortedValA_ValMtype, + /* matrix A values are updated inplace + to be the preconditioner M values */ + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseSolveAnalysisInfo_t info, + cudaDataType executiontype); + +cusparseStatus_t CUSPARSEAPI cusparseScsrilu0(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + const cusparseMatDescr_t descrA, + float *csrSortedValA_ValM, + /* matrix A values are updated inplace + to be the preconditioner M values */ + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseSolveAnalysisInfo_t info); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrilu0(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + const cusparseMatDescr_t descrA, + double *csrSortedValA_ValM, + /* matrix A values are updated inplace + to be the preconditioner M values */ + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseSolveAnalysisInfo_t info); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrilu0(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + const cusparseMatDescr_t descrA, + cuComplex *csrSortedValA_ValM, + /* matrix A values are updated inplace + to be the preconditioner M values */ + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseSolveAnalysisInfo_t info); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrilu0(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + const cusparseMatDescr_t descrA, + cuDoubleComplex *csrSortedValA_ValM, + /* matrix A values are updated inplace + to be the preconditioner M values */ + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseSolveAnalysisInfo_t info); + +/* Description: Compute the incomplete-LU factorization with 0 fill-in (ILU0) + of the matrix A stored in CSR format based on the information in the opaque + structure info that was obtained from the analysis phase (csrsv2_analysis). + This routine implements algorithm 2 for this problem. 
*/ +cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_numericBoost(cusparseHandle_t handle, + csrilu02Info_t info, + int enable_boost, + double *tol, + float *boost_val); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_numericBoost(cusparseHandle_t handle, + csrilu02Info_t info, + int enable_boost, + double *tol, + double *boost_val); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_numericBoost(cusparseHandle_t handle, + csrilu02Info_t info, + int enable_boost, + double *tol, + cuComplex *boost_val); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_numericBoost(cusparseHandle_t handle, + csrilu02Info_t info, + int enable_boost, + double *tol, + cuDoubleComplex *boost_val); + +cusparseStatus_t CUSPARSEAPI cusparseXcsrilu02_zeroPivot(cusparseHandle_t handle, + csrilu02Info_t info, + int *position); + +cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_bufferSize(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + float *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csrilu02Info_t info, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_bufferSize(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csrilu02Info_t info, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_bufferSize(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csrilu02Info_t info, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_bufferSize(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csrilu02Info_t info, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_bufferSizeExt(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + float *csrSortedVal, + const int *csrSortedRowPtr, + const int *csrSortedColInd, + csrilu02Info_t info, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_bufferSizeExt(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double *csrSortedVal, + const int *csrSortedRowPtr, + const int *csrSortedColInd, + csrilu02Info_t info, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_bufferSizeExt(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex *csrSortedVal, + const int *csrSortedRowPtr, + const int *csrSortedColInd, + csrilu02Info_t info, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_bufferSizeExt(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex *csrSortedVal, + const int *csrSortedRowPtr, + const int *csrSortedColInd, + csrilu02Info_t info, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_analysis(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const float *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csrilu02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_analysis(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const double *csrSortedValA, + const int *csrSortedRowPtrA, + 
const int *csrSortedColIndA, + csrilu02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_analysis(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csrilu02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_analysis(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csrilu02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseScsrilu02(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + float *csrSortedValA_valM, + /* matrix A values are updated inplace + to be the preconditioner M values */ + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csrilu02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double *csrSortedValA_valM, + /* matrix A values are updated inplace + to be the preconditioner M values */ + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csrilu02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex *csrSortedValA_valM, + /* matrix A values are updated inplace + to be the preconditioner M values */ + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csrilu02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex *csrSortedValA_valM, + /* matrix A values are updated inplace + to be the preconditioner M values */ + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csrilu02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +/* Description: Compute the incomplete-LU factorization with 0 fill-in (ILU0) + of the matrix A stored in block-CSR format based on the information in the opaque + structure info that was obtained from the analysis phase (bsrsv2_analysis). + This routine implements algorithm 2 for this problem. 
*/ +cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_numericBoost(cusparseHandle_t handle, + bsrilu02Info_t info, + int enable_boost, + double *tol, + float *boost_val); + +cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_numericBoost(cusparseHandle_t handle, + bsrilu02Info_t info, + int enable_boost, + double *tol, + double *boost_val); + +cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_numericBoost(cusparseHandle_t handle, + bsrilu02Info_t info, + int enable_boost, + double *tol, + cuComplex *boost_val); + +cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_numericBoost(cusparseHandle_t handle, + bsrilu02Info_t info, + int enable_boost, + double *tol, + cuDoubleComplex *boost_val); + +cusparseStatus_t CUSPARSEAPI cusparseXbsrilu02_zeroPivot(cusparseHandle_t handle, + bsrilu02Info_t info, + int *position); + +cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + float *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + double *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + float *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockSize, + bsrilu02Info_t info, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + double *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockSize, + bsrilu02Info_t info, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockSize, + bsrilu02Info_t info, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockSize, + bsrilu02Info_t info, + size_t *pBufferSize); + + +cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const 
cusparseMatDescr_t descrA, + float *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + double *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + + +cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descra, + float *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descra, + double *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descra, + cuComplex *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descra, + cuDoubleComplex *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +/* Description: Compute the incomplete-Cholesky factorization with 0 fill-in (IC0) + of the matrix A stored in CSR format based on the information in the opaque + structure info that was obtained from the analysis phase (csrsv_analysis). + This routine implements algorithm 1 for this problem. 
*/ +cusparseStatus_t CUSPARSEAPI cusparseScsric0(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + const cusparseMatDescr_t descrA, + float *csrSortedValA_ValM, + /* matrix A values are updated inplace + to be the preconditioner M values */ + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseSolveAnalysisInfo_t info); + +cusparseStatus_t CUSPARSEAPI cusparseDcsric0(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + const cusparseMatDescr_t descrA, + double *csrSortedValA_ValM, + /* matrix A values are updated inplace + to be the preconditioner M values */ + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseSolveAnalysisInfo_t info); + +cusparseStatus_t CUSPARSEAPI cusparseCcsric0(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + const cusparseMatDescr_t descrA, + cuComplex *csrSortedValA_ValM, + /* matrix A values are updated inplace + to be the preconditioner M values */ + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseSolveAnalysisInfo_t info); + +cusparseStatus_t CUSPARSEAPI cusparseZcsric0(cusparseHandle_t handle, + cusparseOperation_t trans, + int m, + const cusparseMatDescr_t descrA, + cuDoubleComplex *csrSortedValA_ValM, + /* matrix A values are updated inplace + to be the preconditioner M values */ + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseSolveAnalysisInfo_t info); + +/* Description: Compute the incomplete-Cholesky factorization with 0 fill-in (IC0) + of the matrix A stored in CSR format based on the information in the opaque + structure info that was obtained from the analysis phase (csrsv2_analysis). + This routine implements algorithm 2 for this problem. */ +cusparseStatus_t CUSPARSEAPI cusparseXcsric02_zeroPivot(cusparseHandle_t handle, + csric02Info_t info, + int *position); + +cusparseStatus_t CUSPARSEAPI cusparseScsric02_bufferSize(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + float *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csric02Info_t info, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseDcsric02_bufferSize(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csric02Info_t info, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseCcsric02_bufferSize(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csric02Info_t info, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseZcsric02_bufferSize(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csric02Info_t info, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseScsric02_bufferSizeExt(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + float *csrSortedVal, + const int *csrSortedRowPtr, + const int *csrSortedColInd, + csric02Info_t info, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseDcsric02_bufferSizeExt(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double *csrSortedVal, + const int *csrSortedRowPtr, + const int *csrSortedColInd, + csric02Info_t info, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI 
cusparseCcsric02_bufferSizeExt(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex *csrSortedVal, + const int *csrSortedRowPtr, + const int *csrSortedColInd, + csric02Info_t info, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseZcsric02_bufferSizeExt(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex *csrSortedVal, + const int *csrSortedRowPtr, + const int *csrSortedColInd, + csric02Info_t info, + size_t *pBufferSize); + + +cusparseStatus_t CUSPARSEAPI cusparseScsric02_analysis(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const float *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + + +cusparseStatus_t CUSPARSEAPI cusparseDcsric02_analysis(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const double *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCcsric02_analysis(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZcsric02_analysis(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseScsric02(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + float *csrSortedValA_valM, + /* matrix A values are updated inplace + to be the preconditioner M values */ + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDcsric02(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double *csrSortedValA_valM, + /* matrix A values are updated inplace + to be the preconditioner M values */ + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCcsric02(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex *csrSortedValA_valM, + /* matrix A values are updated inplace + to be the preconditioner M values */ + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZcsric02(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex *csrSortedValA_valM, + /* matrix A values are updated inplace + to be the preconditioner M values */ + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void *pBuffer); + +/* Description: Compute the incomplete-Cholesky factorization with 0 fill-in (IC0) + of the matrix A stored in block-CSR format based on the information in the opaque + structure info that was obtained from the analysis phase (bsrsv2_analysis). 
+   This routine implements algorithm 2 for this problem. */
+cusparseStatus_t CUSPARSEAPI cusparseXbsric02_zeroPivot(cusparseHandle_t handle,
+                                                        bsric02Info_t info,
+                                                        int *position);
+
+cusparseStatus_t CUSPARSEAPI cusparseSbsric02_bufferSize(cusparseHandle_t handle,
+                                                         cusparseDirection_t dirA,
+                                                         int mb,
+                                                         int nnzb,
+                                                         const cusparseMatDescr_t descrA,
+                                                         float *bsrSortedVal,
+                                                         const int *bsrSortedRowPtr,
+                                                         const int *bsrSortedColInd,
+                                                         int blockDim,
+                                                         bsric02Info_t info,
+                                                         int *pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI cusparseDbsric02_bufferSize(cusparseHandle_t handle,
+                                                         cusparseDirection_t dirA,
+                                                         int mb,
+                                                         int nnzb,
+                                                         const cusparseMatDescr_t descrA,
+                                                         double *bsrSortedVal,
+                                                         const int *bsrSortedRowPtr,
+                                                         const int *bsrSortedColInd,
+                                                         int blockDim,
+                                                         bsric02Info_t info,
+                                                         int *pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI cusparseCbsric02_bufferSize(cusparseHandle_t handle,
+                                                         cusparseDirection_t dirA,
+                                                         int mb,
+                                                         int nnzb,
+                                                         const cusparseMatDescr_t descrA,
+                                                         cuComplex *bsrSortedVal,
+                                                         const int *bsrSortedRowPtr,
+                                                         const int *bsrSortedColInd,
+                                                         int blockDim,
+                                                         bsric02Info_t info,
+                                                         int *pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI cusparseZbsric02_bufferSize(cusparseHandle_t handle,
+                                                         cusparseDirection_t dirA,
+                                                         int mb,
+                                                         int nnzb,
+                                                         const cusparseMatDescr_t descrA,
+                                                         cuDoubleComplex *bsrSortedVal,
+                                                         const int *bsrSortedRowPtr,
+                                                         const int *bsrSortedColInd,
+                                                         int blockDim,
+                                                         bsric02Info_t info,
+                                                         int *pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI cusparseSbsric02_bufferSizeExt(cusparseHandle_t handle,
+                                                            cusparseDirection_t dirA,
+                                                            int mb,
+                                                            int nnzb,
+                                                            const cusparseMatDescr_t descrA,
+                                                            float *bsrSortedVal,
+                                                            const int *bsrSortedRowPtr,
+                                                            const int *bsrSortedColInd,
+                                                            int blockSize,
+                                                            bsric02Info_t info,
+                                                            size_t *pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI cusparseDbsric02_bufferSizeExt(cusparseHandle_t handle,
+                                                            cusparseDirection_t dirA,
+                                                            int mb,
+                                                            int nnzb,
+                                                            const cusparseMatDescr_t descrA,
+                                                            double *bsrSortedVal,
+                                                            const int *bsrSortedRowPtr,
+                                                            const int *bsrSortedColInd,
+                                                            int blockSize,
+                                                            bsric02Info_t info,
+                                                            size_t *pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI cusparseCbsric02_bufferSizeExt(cusparseHandle_t handle,
+                                                            cusparseDirection_t dirA,
+                                                            int mb,
+                                                            int nnzb,
+                                                            const cusparseMatDescr_t descrA,
+                                                            cuComplex *bsrSortedVal,
+                                                            const int *bsrSortedRowPtr,
+                                                            const int *bsrSortedColInd,
+                                                            int blockSize,
+                                                            bsric02Info_t info,
+                                                            size_t *pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI cusparseZbsric02_bufferSizeExt(cusparseHandle_t handle,
+                                                            cusparseDirection_t dirA,
+                                                            int mb,
+                                                            int nnzb,
+                                                            const cusparseMatDescr_t descrA,
+                                                            cuDoubleComplex *bsrSortedVal,
+                                                            const int *bsrSortedRowPtr,
+                                                            const int *bsrSortedColInd,
+                                                            int blockSize,
+                                                            bsric02Info_t info,
+                                                            size_t *pBufferSize);
+
+
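The *_bufferSize / *_bufferSizeExt pairs above follow the usual two-phase cuSPARSE pattern: query the scratch-space size first, then allocate it once and reuse it for the analysis and factorization calls below. A minimal sketch for the float path; the variable names (bsrVal, bsrRowPtr, bsrColInd, etc.) are hypothetical and assumed to be set up already.

/* illustrative sketch, not part of the original header */
int bufferSize = 0;
void *pBuffer = NULL;
/* query the scratch size required by the analysis and factorization phases */
cusparseSbsric02_bufferSize(handle, dirA, mb, nnzb, descrA,
                            bsrVal, bsrRowPtr, bsrColInd,
                            blockDim, info, &bufferSize);
cudaMalloc(&pBuffer, bufferSize);   /* one allocation, reused by both phases */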
+cusparseStatus_t CUSPARSEAPI cusparseSbsric02_analysis(cusparseHandle_t handle,
+                                                       cusparseDirection_t dirA,
+                                                       int mb,
+                                                       int nnzb,
+                                                       const cusparseMatDescr_t descrA,
+                                                       const float *bsrSortedVal,
+                                                       const int *bsrSortedRowPtr,
+                                                       const int *bsrSortedColInd,
+                                                       int blockDim,
+                                                       bsric02Info_t info,
+                                                       cusparseSolvePolicy_t policy,
+                                                       void *pInputBuffer);
+
+cusparseStatus_t CUSPARSEAPI cusparseDbsric02_analysis(cusparseHandle_t handle,
+                                                       cusparseDirection_t dirA,
+                                                       int mb,
+                                                       int nnzb,
+                                                       const cusparseMatDescr_t descrA,
+                                                       const double *bsrSortedVal,
+                                                       const int *bsrSortedRowPtr,
+                                                       const int *bsrSortedColInd,
+                                                       int blockDim,
+                                                       bsric02Info_t info,
+                                                       cusparseSolvePolicy_t policy,
+                                                       void *pInputBuffer);
+
+cusparseStatus_t CUSPARSEAPI cusparseCbsric02_analysis(cusparseHandle_t handle,
+                                                       cusparseDirection_t dirA,
+                                                       int mb,
+                                                       int nnzb,
+                                                       const cusparseMatDescr_t descrA,
+                                                       const cuComplex *bsrSortedVal,
+                                                       const int *bsrSortedRowPtr,
+                                                       const int *bsrSortedColInd,
+                                                       int blockDim,
+                                                       bsric02Info_t info,
+                                                       cusparseSolvePolicy_t policy,
+                                                       void *pInputBuffer);
+
+cusparseStatus_t CUSPARSEAPI cusparseZbsric02_analysis(cusparseHandle_t handle,
+                                                       cusparseDirection_t dirA,
+                                                       int mb,
+                                                       int nnzb,
+                                                       const cusparseMatDescr_t descrA,
+                                                       const cuDoubleComplex *bsrSortedVal,
+                                                       const int *bsrSortedRowPtr,
+                                                       const int *bsrSortedColInd,
+                                                       int blockDim,
+                                                       bsric02Info_t info,
+                                                       cusparseSolvePolicy_t policy,
+                                                       void *pInputBuffer);
+
+cusparseStatus_t CUSPARSEAPI cusparseSbsric02(cusparseHandle_t handle,
+                                              cusparseDirection_t dirA,
+                                              int mb,
+                                              int nnzb,
+                                              const cusparseMatDescr_t descrA,
+                                              float *bsrSortedVal,
+                                              const int *bsrSortedRowPtr,
+                                              const int *bsrSortedColInd,
+                                              int blockDim,
+                                              bsric02Info_t info,
+                                              cusparseSolvePolicy_t policy,
+                                              void *pBuffer);
+
+cusparseStatus_t CUSPARSEAPI cusparseDbsric02(cusparseHandle_t handle,
+                                              cusparseDirection_t dirA,
+                                              int mb,
+                                              int nnzb,
+                                              const cusparseMatDescr_t descrA,
+                                              double *bsrSortedVal,
+                                              const int *bsrSortedRowPtr,
+                                              const int *bsrSortedColInd,
+                                              int blockDim,
+                                              bsric02Info_t info,
+                                              cusparseSolvePolicy_t policy,
+                                              void *pBuffer);
+
+cusparseStatus_t CUSPARSEAPI cusparseCbsric02(cusparseHandle_t handle,
+                                              cusparseDirection_t dirA,
+                                              int mb,
+                                              int nnzb,
+                                              const cusparseMatDescr_t descrA,
+                                              cuComplex *bsrSortedVal,
+                                              const int *bsrSortedRowPtr,
+                                              const int *bsrSortedColInd,
+                                              int blockDim,
+                                              bsric02Info_t info,
+                                              cusparseSolvePolicy_t policy,
+                                              void *pBuffer);
+
+cusparseStatus_t CUSPARSEAPI cusparseZbsric02(cusparseHandle_t handle,
+                                              cusparseDirection_t dirA,
+                                              int mb,
+                                              int nnzb,
+                                              const cusparseMatDescr_t descrA,
+                                              cuDoubleComplex *bsrSortedVal,
+                                              const int *bsrSortedRowPtr,
+                                              const int *bsrSortedColInd,
+                                              int blockDim,
+                                              bsric02Info_t info,
+                                              cusparseSolvePolicy_t policy,
+                                              void *pBuffer);
+
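Taken together, the bsric02 routines form a three-phase workflow: analysis, zero-pivot check, then the in-place IC0 factorization. A hedged sketch for the float path, continuing the buffer set-up above (same hypothetical variable names; the solve-policy and status enums are the standard cuSPARSE values):

/* illustrative sketch, not part of the original header; assumes <stdio.h> */
int position;
cusparseSbsric02_analysis(handle, dirA, mb, nnzb, descrA,
                          bsrVal, bsrRowPtr, bsrColInd, blockDim,
                          info, CUSPARSE_SOLVE_POLICY_USE_LEVEL, pBuffer);
if (cusparseXbsric02_zeroPivot(handle, info, &position) == CUSPARSE_STATUS_ZERO_PIVOT)
    printf("A(%d,%d) is missing: structural zero pivot\n", position, position);
/* bsrVal is overwritten in place with the incomplete-Cholesky factor */
cusparseSbsric02(handle, dirA, mb, nnzb, descrA,
                 bsrVal, bsrRowPtr, bsrColInd, blockDim,
                 info, CUSPARSE_SOLVE_POLICY_USE_LEVEL, pBuffer);
if (cusparseXbsric02_zeroPivot(handle, info, &position) == CUSPARSE_STATUS_ZERO_PIVOT)
    printf("L(%d,%d) is zero: numerical zero pivot\n", position, position);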
+/* Description: Solution of tridiagonal linear system A * X = F,
+   with multiple right-hand-sides. The coefficient matrix A is
+   composed of lower (dl), main (d) and upper (du) diagonals, and
+   the right-hand-sides F are overwritten with the solution X.
+   These routines use pivoting. */
+cusparseStatus_t CUSPARSEAPI cusparseSgtsv(
+    cusparseHandle_t handle,
+    int m,
+    int n,
+    const float *dl,
+    const float *d,
+    const float *du,
+    float *B,
+    int ldb);
+
+cusparseStatus_t CUSPARSEAPI cusparseDgtsv(
+    cusparseHandle_t handle,
+    int m,
+    int n,
+    const double *dl,
+    const double *d,
+    const double *du,
+    double *B,
+    int ldb);
+
+cusparseStatus_t CUSPARSEAPI cusparseCgtsv(
+    cusparseHandle_t handle,
+    int m,
+    int n,
+    const cuComplex *dl,
+    const cuComplex *d,
+    const cuComplex *du,
+    cuComplex *B,
+    int ldb);
+
+cusparseStatus_t CUSPARSEAPI cusparseZgtsv(
+    cusparseHandle_t handle,
+    int m,
+    int n,
+    const cuDoubleComplex *dl,
+    const cuDoubleComplex *d,
+    const cuDoubleComplex *du,
+    cuDoubleComplex *B,
+    int ldb);
+
+
+cusparseStatus_t CUSPARSEAPI cusparseSgtsv2_bufferSizeExt(
+    cusparseHandle_t handle,
+    int m,
+    int n,
+    const float *dl,
+    const float *d,
+    const float *du,
+    const float *B,
+    int ldb,
+    size_t *bufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI cusparseDgtsv2_bufferSizeExt(
+    cusparseHandle_t handle,
+    int m,
+    int n,
+    const double *dl,
+    const double *d,
+    const double *du,
+    const double *B,
+    int ldb,
+    size_t *bufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI cusparseCgtsv2_bufferSizeExt(
+    cusparseHandle_t handle,
+    int m,
+    int n,
+    const cuComplex *dl,
+    const cuComplex *d,
+    const cuComplex *du,
+    const cuComplex *B,
+    int ldb,
+    size_t *bufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI cusparseZgtsv2_bufferSizeExt(
+    cusparseHandle_t handle,
+    int m,
+    int n,
+    const cuDoubleComplex *dl,
+    const cuDoubleComplex *d,
+    const cuDoubleComplex *du,
+    const cuDoubleComplex *B,
+    int ldb,
+    size_t *bufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI cusparseSgtsv2(
+    cusparseHandle_t handle,
+    int m,
+    int n,
+    const float *dl,
+    const float *d,
+    const float *du,
+    float *B,
+    int ldb,
+    void* pBuffer);
+
+cusparseStatus_t CUSPARSEAPI cusparseDgtsv2(
+    cusparseHandle_t handle,
+    int m,
+    int n,
+    const double *dl,
+    const double *d,
+    const double *du,
+    double *B,
+    int ldb,
+    void* pBuffer);
+
+cusparseStatus_t CUSPARSEAPI cusparseCgtsv2(
+    cusparseHandle_t handle,
+    int m,
+    int n,
+    const cuComplex *dl,
+    const cuComplex *d,
+    const cuComplex *du,
+    cuComplex *B,
+    int ldb,
+    void* pBuffer);
+
+cusparseStatus_t CUSPARSEAPI cusparseZgtsv2(
+    cusparseHandle_t handle,
+    int m,
+    int n,
+    const cuDoubleComplex *dl,
+    const cuDoubleComplex *d,
+    const cuDoubleComplex *du,
+    cuDoubleComplex *B,
+    int ldb,
+    void* pBuffer);
+
+
+/* Description: Solution of tridiagonal linear system A * X = F,
+   with multiple right-hand-sides. The coefficient matrix A is
+   composed of lower (dl), main (d) and upper (du) diagonals, and
+   the right-hand-sides F are overwritten with the solution X.
+   These routines do not use pivoting.
*/ +cusparseStatus_t CUSPARSEAPI cusparseSgtsv_nopivot( + cusparseHandle_t handle, + int m, + int n, + const float *dl, + const float *d, + const float *du, + float *B, + int ldb); + +cusparseStatus_t CUSPARSEAPI cusparseDgtsv_nopivot( + cusparseHandle_t handle, + int m, + int n, + const double *dl, + const double *d, + const double *du, + double *B, + int ldb); + +cusparseStatus_t CUSPARSEAPI cusparseCgtsv_nopivot( + cusparseHandle_t handle, + int m, + int n, + const cuComplex *dl, + const cuComplex *d, + const cuComplex *du, + cuComplex *B, + int ldb); + +cusparseStatus_t CUSPARSEAPI cusparseZgtsv_nopivot( + cusparseHandle_t handle, + int m, + int n, + const cuDoubleComplex *dl, + const cuDoubleComplex *d, + const cuDoubleComplex *du, + cuDoubleComplex *B, + int ldb); + + +cusparseStatus_t CUSPARSEAPI cusparseSgtsv2_nopivot_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + const float *dl, + const float *d, + const float *du, + const float *B, + int ldb, + size_t *bufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseDgtsv2_nopivot_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + const double *dl, + const double *d, + const double *du, + const double *B, + int ldb, + size_t *bufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseCgtsv2_nopivot_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + const cuComplex *dl, + const cuComplex *d, + const cuComplex *du, + const cuComplex *B, + int ldb, + size_t *bufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseZgtsv2_nopivot_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + const cuDoubleComplex *dl, + const cuDoubleComplex *d, + const cuDoubleComplex *du, + const cuDoubleComplex *B, + int ldb, + size_t *bufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseSgtsv2_nopivot( + cusparseHandle_t handle, + int m, + int n, + const float *dl, + const float *d, + const float *du, + float *B, + int ldb, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDgtsv2_nopivot( + cusparseHandle_t handle, + int m, + int n, + const double *dl, + const double *d, + const double *du, + double *B, + int ldb, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCgtsv2_nopivot( + cusparseHandle_t handle, + int m, + int n, + const cuComplex *dl, + const cuComplex *d, + const cuComplex *du, + cuComplex *B, + int ldb, + void* pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZgtsv2_nopivot( + cusparseHandle_t handle, + int m, + int n, + const cuDoubleComplex *dl, + const cuDoubleComplex *d, + const cuDoubleComplex *du, + cuDoubleComplex *B, + int ldb, + void* pBuffer); + +/* Description: Solution of a set of tridiagonal linear systems + A_{i} * x_{i} = f_{i} for i=1,...,batchCount. The coefficient + matrices A_{i} are composed of lower (dl), main (d) and upper (du) + diagonals and stored separated by a batchStride. Also, the + right-hand-sides/solutions f_{i}/x_{i} are separated by a batchStride. 
*/ +cusparseStatus_t CUSPARSEAPI cusparseSgtsvStridedBatch( + cusparseHandle_t handle, + int m, + const float *dl, + const float *d, + const float *du, + float *x, + int batchCount, + int batchStride); + +cusparseStatus_t CUSPARSEAPI cusparseDgtsvStridedBatch( + cusparseHandle_t handle, + int m, + const double *dl, + const double *d, + const double *du, + double *x, + int batchCount, + int batchStride); + +cusparseStatus_t CUSPARSEAPI cusparseCgtsvStridedBatch( + cusparseHandle_t handle, + int m, + const cuComplex *dl, + const cuComplex *d, + const cuComplex *du, + cuComplex *x, + int batchCount, + int batchStride); + +cusparseStatus_t CUSPARSEAPI cusparseZgtsvStridedBatch( + cusparseHandle_t handle, + int m, + const cuDoubleComplex *dl, + const cuDoubleComplex *d, + const cuDoubleComplex *du, + cuDoubleComplex *x, + int batchCount, + int batchStride); + + +cusparseStatus_t CUSPARSEAPI cusparseSgtsv2StridedBatch_bufferSizeExt( + cusparseHandle_t handle, + int m, + const float *dl, + const float *d, + const float *du, + const float *x, + int batchCount, + int batchStride, + size_t *bufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseDgtsv2StridedBatch_bufferSizeExt( + cusparseHandle_t handle, + int m, + const double *dl, + const double *d, + const double *du, + const double *x, + int batchCount, + int batchStride, + size_t *bufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseCgtsv2StridedBatch_bufferSizeExt( + cusparseHandle_t handle, + int m, + const cuComplex *dl, + const cuComplex *d, + const cuComplex *du, + const cuComplex *x, + int batchCount, + int batchStride, + size_t *bufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseZgtsv2StridedBatch_bufferSizeExt( + cusparseHandle_t handle, + int m, + const cuDoubleComplex *dl, + const cuDoubleComplex *d, + const cuDoubleComplex *du, + const cuDoubleComplex *x, + int batchCount, + int batchStride, + size_t *bufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseSgtsv2StridedBatch( + cusparseHandle_t handle, + int m, + const float *dl, + const float *d, + const float *du, + float *x, + int batchCount, + int batchStride, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDgtsv2StridedBatch( + cusparseHandle_t handle, + int m, + const double *dl, + const double *d, + const double *du, + double *x, + int batchCount, + int batchStride, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCgtsv2StridedBatch( + cusparseHandle_t handle, + int m, + const cuComplex *dl, + const cuComplex *d, + const cuComplex *du, + cuComplex *x, + int batchCount, + int batchStride, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZgtsv2StridedBatch( + cusparseHandle_t handle, + int m, + const cuDoubleComplex *dl, + const cuDoubleComplex *d, + const cuDoubleComplex *du, + cuDoubleComplex *x, + int batchCount, + int batchStride, + void *pBuffer); + +/* --- Sparse Level 4 routines --- */ + +/* Description: Compute sparse - sparse matrix multiplication for matrices + stored in CSR format. 
+ */
+cusparseStatus_t CUSPARSEAPI cusparseXcsrgemmNnz(cusparseHandle_t handle,
+                                                 cusparseOperation_t transA,
+                                                 cusparseOperation_t transB,
+                                                 int m,
+                                                 int n,
+                                                 int k,
+                                                 const cusparseMatDescr_t descrA,
+                                                 const int nnzA,
+                                                 const int *csrSortedRowPtrA,
+                                                 const int *csrSortedColIndA,
+                                                 const cusparseMatDescr_t descrB,
+                                                 const int nnzB,
+                                                 const int *csrSortedRowPtrB,
+                                                 const int *csrSortedColIndB,
+                                                 const cusparseMatDescr_t descrC,
+                                                 int *csrSortedRowPtrC,
+                                                 int *nnzTotalDevHostPtr);
+
+cusparseStatus_t CUSPARSEAPI cusparseScsrgemm(cusparseHandle_t handle,
+                                              cusparseOperation_t transA,
+                                              cusparseOperation_t transB,
+                                              int m,
+                                              int n,
+                                              int k,
+                                              const cusparseMatDescr_t descrA,
+                                              const int nnzA,
+                                              const float *csrSortedValA,
+                                              const int *csrSortedRowPtrA,
+                                              const int *csrSortedColIndA,
+                                              const cusparseMatDescr_t descrB,
+                                              const int nnzB,
+                                              const float *csrSortedValB,
+                                              const int *csrSortedRowPtrB,
+                                              const int *csrSortedColIndB,
+                                              const cusparseMatDescr_t descrC,
+                                              float *csrSortedValC,
+                                              const int *csrSortedRowPtrC,
+                                              int *csrSortedColIndC);
+
+cusparseStatus_t CUSPARSEAPI cusparseDcsrgemm(cusparseHandle_t handle,
+                                              cusparseOperation_t transA,
+                                              cusparseOperation_t transB,
+                                              int m,
+                                              int n,
+                                              int k,
+                                              const cusparseMatDescr_t descrA,
+                                              int nnzA,
+                                              const double *csrSortedValA,
+                                              const int *csrSortedRowPtrA,
+                                              const int *csrSortedColIndA,
+                                              const cusparseMatDescr_t descrB,
+                                              int nnzB,
+                                              const double *csrSortedValB,
+                                              const int *csrSortedRowPtrB,
+                                              const int *csrSortedColIndB,
+                                              const cusparseMatDescr_t descrC,
+                                              double *csrSortedValC,
+                                              const int *csrSortedRowPtrC,
+                                              int *csrSortedColIndC);
+
+cusparseStatus_t CUSPARSEAPI cusparseCcsrgemm(cusparseHandle_t handle,
+                                              cusparseOperation_t transA,
+                                              cusparseOperation_t transB,
+                                              int m,
+                                              int n,
+                                              int k,
+                                              const cusparseMatDescr_t descrA,
+                                              int nnzA,
+                                              const cuComplex *csrSortedValA,
+                                              const int *csrSortedRowPtrA,
+                                              const int *csrSortedColIndA,
+                                              const cusparseMatDescr_t descrB,
+                                              int nnzB,
+                                              const cuComplex *csrSortedValB,
+                                              const int *csrSortedRowPtrB,
+                                              const int *csrSortedColIndB,
+                                              const cusparseMatDescr_t descrC,
+                                              cuComplex *csrSortedValC,
+                                              const int *csrSortedRowPtrC,
+                                              int *csrSortedColIndC);
+
+cusparseStatus_t CUSPARSEAPI cusparseZcsrgemm(cusparseHandle_t handle,
+                                              cusparseOperation_t transA,
+                                              cusparseOperation_t transB,
+                                              int m,
+                                              int n,
+                                              int k,
+                                              const cusparseMatDescr_t descrA,
+                                              int nnzA,
+                                              const cuDoubleComplex *csrSortedValA,
+                                              const int *csrSortedRowPtrA,
+                                              const int *csrSortedColIndA,
+                                              const cusparseMatDescr_t descrB,
+                                              int nnzB,
+                                              const cuDoubleComplex *csrSortedValB,
+                                              const int *csrSortedRowPtrB,
+                                              const int *csrSortedColIndB,
+                                              const cusparseMatDescr_t descrC,
+                                              cuDoubleComplex *csrSortedValC,
+                                              const int *csrSortedRowPtrC,
+                                              int *csrSortedColIndC);
+
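Because the sparsity pattern of C is not known in advance, csrgemm is also two-phase: cusparseXcsrgemmNnz computes csrSortedRowPtrC and the total number of non-zeros of C, after which the caller sizes the value and column-index arrays and runs the numeric multiply. A sketch for the float path; array names are hypothetical, and host pointer mode is assumed so nnzC is returned directly:

/* illustrative sketch, not part of the original header */
int nnzC = 0;
int *csrRowPtrC, *csrColIndC;
float *csrValC;
cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST);
cudaMalloc((void **)&csrRowPtrC, sizeof(int) * (m + 1));
/* phase 1: row pointer of C and total nnz */
cusparseXcsrgemmNnz(handle, transA, transB, m, n, k,
                    descrA, nnzA, csrRowPtrA, csrColIndA,
                    descrB, nnzB, csrRowPtrB, csrColIndB,
                    descrC, csrRowPtrC, &nnzC);
cudaMalloc((void **)&csrColIndC, sizeof(int) * nnzC);
cudaMalloc((void **)&csrValC, sizeof(float) * nnzC);
/* phase 2: numeric C = op(A) * op(B) */
cusparseScsrgemm(handle, transA, transB, m, n, k,
                 descrA, nnzA, csrValA, csrRowPtrA, csrColIndA,
                 descrB, nnzB, csrValB, csrRowPtrB, csrColIndB,
                 descrC, csrValC, csrRowPtrC, csrColIndC);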
+/* Description: Compute sparse - sparse matrix multiplication with a sparse
+   update term, C = alpha * A * B + beta * D, for matrices stored in CSR
+   format. */
+
+cusparseStatus_t CUSPARSEAPI cusparseCreateCsrgemm2Info(csrgemm2Info_t *info);
+
+cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrgemm2Info(csrgemm2Info_t info);
+
+cusparseStatus_t CUSPARSEAPI cusparseScsrgemm2_bufferSizeExt(cusparseHandle_t handle,
+                                                             int m,
+                                                             int n,
+                                                             int k,
+                                                             const float *alpha,
+                                                             const cusparseMatDescr_t descrA,
+                                                             int nnzA,
+                                                             const int *csrSortedRowPtrA,
+                                                             const int *csrSortedColIndA,
+                                                             const cusparseMatDescr_t descrB,
+                                                             int nnzB,
+                                                             const int *csrSortedRowPtrB,
+                                                             const int *csrSortedColIndB,
+                                                             const float *beta,
+                                                             const cusparseMatDescr_t descrD,
+                                                             int nnzD,
+                                                             const int *csrSortedRowPtrD,
+                                                             const int *csrSortedColIndD,
+                                                             csrgemm2Info_t info,
+                                                             size_t *pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI cusparseDcsrgemm2_bufferSizeExt(cusparseHandle_t handle,
+                                                             int m,
+                                                             int n,
+                                                             int k,
+                                                             const double *alpha,
+                                                             const cusparseMatDescr_t descrA,
+                                                             int nnzA,
+                                                             const int *csrSortedRowPtrA,
+                                                             const int *csrSortedColIndA,
+                                                             const cusparseMatDescr_t descrB,
+                                                             int nnzB,
+                                                             const int *csrSortedRowPtrB,
+                                                             const int *csrSortedColIndB,
+                                                             const double *beta,
+                                                             const cusparseMatDescr_t descrD,
+                                                             int nnzD,
+                                                             const int *csrSortedRowPtrD,
+                                                             const int *csrSortedColIndD,
+                                                             csrgemm2Info_t info,
+                                                             size_t *pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI cusparseCcsrgemm2_bufferSizeExt(cusparseHandle_t handle,
+                                                             int m,
+                                                             int n,
+                                                             int k,
+                                                             const cuComplex *alpha,
+                                                             const cusparseMatDescr_t descrA,
+                                                             int nnzA,
+                                                             const int *csrSortedRowPtrA,
+                                                             const int *csrSortedColIndA,
+                                                             const cusparseMatDescr_t descrB,
+                                                             int nnzB,
+                                                             const int *csrSortedRowPtrB,
+                                                             const int *csrSortedColIndB,
+                                                             const cuComplex *beta,
+                                                             const cusparseMatDescr_t descrD,
+                                                             int nnzD,
+                                                             const int *csrSortedRowPtrD,
+                                                             const int *csrSortedColIndD,
+                                                             csrgemm2Info_t info,
+                                                             size_t *pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI cusparseZcsrgemm2_bufferSizeExt(cusparseHandle_t handle,
+                                                             int m,
+                                                             int n,
+                                                             int k,
+                                                             const cuDoubleComplex *alpha,
+                                                             const cusparseMatDescr_t descrA,
+                                                             int nnzA,
+                                                             const int *csrSortedRowPtrA,
+                                                             const int *csrSortedColIndA,
+                                                             const cusparseMatDescr_t descrB,
+                                                             int nnzB,
+                                                             const int *csrSortedRowPtrB,
+                                                             const int *csrSortedColIndB,
+                                                             const cuDoubleComplex *beta,
+                                                             const cusparseMatDescr_t descrD,
+                                                             int nnzD,
+                                                             const int *csrSortedRowPtrD,
+                                                             const int *csrSortedColIndD,
+                                                             csrgemm2Info_t info,
+                                                             size_t *pBufferSizeInBytes);
+
+
+cusparseStatus_t CUSPARSEAPI cusparseXcsrgemm2Nnz(cusparseHandle_t handle,
+                                                  int m,
+                                                  int n,
+                                                  int k,
+                                                  const cusparseMatDescr_t descrA,
+                                                  int nnzA,
+                                                  const int *csrSortedRowPtrA,
+                                                  const int *csrSortedColIndA,
+                                                  const cusparseMatDescr_t descrB,
+                                                  int nnzB,
+                                                  const int *csrSortedRowPtrB,
+                                                  const int *csrSortedColIndB,
+                                                  const cusparseMatDescr_t descrD,
+                                                  int nnzD,
+                                                  const int *csrSortedRowPtrD,
+                                                  const int *csrSortedColIndD,
+                                                  const cusparseMatDescr_t descrC,
+                                                  int *csrSortedRowPtrC,
+                                                  int *nnzTotalDevHostPtr,
+                                                  const csrgemm2Info_t info,
+                                                  void *pBuffer);
+
+
+cusparseStatus_t CUSPARSEAPI cusparseScsrgemm2(cusparseHandle_t handle,
+                                               int m,
+                                               int n,
+                                               int k,
+                                               const float *alpha,
+                                               const cusparseMatDescr_t descrA,
+                                               int nnzA,
+                                               const float *csrSortedValA,
+                                               const int *csrSortedRowPtrA,
+                                               const int *csrSortedColIndA,
+                                               const cusparseMatDescr_t descrB,
+                                               int nnzB,
+                                               const float *csrSortedValB,
+                                               const int *csrSortedRowPtrB,
+                                               const int *csrSortedColIndB,
+                                               const float *beta,
+                                               const cusparseMatDescr_t descrD,
+                                               int nnzD,
+                                               const float *csrSortedValD,
+                                               const int *csrSortedRowPtrD,
+                                               const int *csrSortedColIndD,
+                                               const cusparseMatDescr_t descrC,
+                                               float *csrSortedValC,
const int *csrSortedRowPtrC, + int *csrSortedColIndC, + const csrgemm2Info_t info, + void *pBuffer ); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrgemm2(cusparseHandle_t handle, + int m, + int n, + int k, + const double *alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const double *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const double *csrSortedValB, + const int *csrSortedRowPtrB, + const int *csrSortedColIndB, + const double *beta, + const cusparseMatDescr_t descrD, + int nnzD, + const double *csrSortedValD, + const int *csrSortedRowPtrD, + const int *csrSortedColIndD, + const cusparseMatDescr_t descrC, + double *csrSortedValC, + const int *csrSortedRowPtrC, + int *csrSortedColIndC, + const csrgemm2Info_t info, + void *pBuffer ); + + +cusparseStatus_t CUSPARSEAPI cusparseCcsrgemm2(cusparseHandle_t handle, + int m, + int n, + int k, + const cuComplex *alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const cuComplex *csrSortedValB, + const int *csrSortedRowPtrB, + const int *csrSortedColIndB, + const cuComplex *beta, + const cusparseMatDescr_t descrD, + int nnzD, + const cuComplex *csrSortedValD, + const int *csrSortedRowPtrD, + const int *csrSortedColIndD, + const cusparseMatDescr_t descrC, + cuComplex *csrSortedValC, + const int *csrSortedRowPtrC, + int *csrSortedColIndC, + const csrgemm2Info_t info, + void *pBuffer ); + + +cusparseStatus_t CUSPARSEAPI cusparseZcsrgemm2(cusparseHandle_t handle, + int m, + int n, + int k, + const cuDoubleComplex *alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const cuDoubleComplex *csrSortedValB, + const int *csrSortedRowPtrB, + const int *csrSortedColIndB, + const cuDoubleComplex *beta, + const cusparseMatDescr_t descrD, + int nnzD, + const cuDoubleComplex *csrSortedValD, + const int *csrSortedRowPtrD, + const int *csrSortedColIndD, + const cusparseMatDescr_t descrC, + cuDoubleComplex *csrSortedValC, + const int *csrSortedRowPtrC, + int *csrSortedColIndC, + const csrgemm2Info_t info, + void *pBuffer ); + + +/* Description: Compute sparse - sparse matrix addition of matrices + stored in CSR format */ +cusparseStatus_t CUSPARSEAPI cusparseXcsrgeamNnz(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + int nnzA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const int *csrSortedRowPtrB, + const int *csrSortedColIndB, + const cusparseMatDescr_t descrC, + int *csrSortedRowPtrC, + int *nnzTotalDevHostPtr); + +cusparseStatus_t CUSPARSEAPI cusparseScsrgeam(cusparseHandle_t handle, + int m, + int n, + const float *alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const float *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const float *beta, + const cusparseMatDescr_t descrB, + int nnzB, + const float *csrSortedValB, + const int *csrSortedRowPtrB, + const int *csrSortedColIndB, + const cusparseMatDescr_t descrC, + float *csrSortedValC, + int *csrSortedRowPtrC, + int *csrSortedColIndC); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrgeam(cusparseHandle_t handle, + int m, + int n, + const double *alpha, + const cusparseMatDescr_t descrA, + int nnzA, 
+ const double *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const double *beta, + const cusparseMatDescr_t descrB, + int nnzB, + const double *csrSortedValB, + const int *csrSortedRowPtrB, + const int *csrSortedColIndB, + const cusparseMatDescr_t descrC, + double *csrSortedValC, + int *csrSortedRowPtrC, + int *csrSortedColIndC); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrgeam(cusparseHandle_t handle, + int m, + int n, + const cuComplex *alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const cuComplex *beta, + const cusparseMatDescr_t descrB, + int nnzB, + const cuComplex *csrSortedValB, + const int *csrSortedRowPtrB, + const int *csrSortedColIndB, + const cusparseMatDescr_t descrC, + cuComplex *csrSortedValC, + int *csrSortedRowPtrC, + int *csrSortedColIndC); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrgeam(cusparseHandle_t handle, + int m, + int n, + const cuDoubleComplex *alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const cuDoubleComplex *beta, + const cusparseMatDescr_t descrB, + int nnzB, + const cuDoubleComplex *csrSortedValB, + const int *csrSortedRowPtrB, + const int *csrSortedColIndB, + const cusparseMatDescr_t descrC, + cuDoubleComplex *csrSortedValC, + int *csrSortedRowPtrC, + int *csrSortedColIndC); + + +/* --- Sparse Matrix Reorderings --- */ + +/* Description: Find an approximate coloring of a matrix stored in CSR format. */ +cusparseStatus_t CUSPARSEAPI cusparseScsrcolor(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const float *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const float *fractionToColor, + int *ncolors, + int *coloring, + int *reordering, + const cusparseColorInfo_t info); + +cusparseStatus_t CUSPARSEAPI cusparseDcsrcolor(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const double *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const double *fractionToColor, + int *ncolors, + int *coloring, + int *reordering, + const cusparseColorInfo_t info); + +cusparseStatus_t CUSPARSEAPI cusparseCcsrcolor(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const float *fractionToColor, + int *ncolors, + int *coloring, + int *reordering, + const cusparseColorInfo_t info); + +cusparseStatus_t CUSPARSEAPI cusparseZcsrcolor(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const double *fractionToColor, + int *ncolors, + int *coloring, + int *reordering, + const cusparseColorInfo_t info); + +/* --- Sparse Format Conversion --- */ + +/* Description: This routine finds the total number of non-zero elements and + the number of non-zero elements per row or column in the dense matrix A. 
*/ +cusparseStatus_t CUSPARSEAPI cusparseSnnz(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const float *A, + int lda, + int *nnzPerRowCol, + int *nnzTotalDevHostPtr); + +cusparseStatus_t CUSPARSEAPI cusparseDnnz(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const double *A, + int lda, + int *nnzPerRowCol, + int *nnzTotalDevHostPtr); + +cusparseStatus_t CUSPARSEAPI cusparseCnnz(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuComplex *A, + int lda, + int *nnzPerRowCol, + int *nnzTotalDevHostPtr); + +cusparseStatus_t CUSPARSEAPI cusparseZnnz(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *A, + int lda, + int *nnzPerRowCol, + int *nnzTotalDevHostPtr); + +/* --- Sparse Format Conversion --- */ + +/* Description: This routine finds the total number of non-zero elements and + the number of non-zero elements per row in a noncompressed csr matrix A. */ +cusparseStatus_t CUSPARSEAPI cusparseSnnz_compress(cusparseHandle_t handle, + int m, + const cusparseMatDescr_t descr, + const float *values, + const int *rowPtr, + int *nnzPerRow, + int *nnzTotal, + float tol); + +cusparseStatus_t CUSPARSEAPI cusparseDnnz_compress(cusparseHandle_t handle, + int m, + const cusparseMatDescr_t descr, + const double *values, + const int *rowPtr, + int *nnzPerRow, + int *nnzTotal, + double tol); + +cusparseStatus_t CUSPARSEAPI cusparseCnnz_compress(cusparseHandle_t handle, + int m, + const cusparseMatDescr_t descr, + const cuComplex *values, + const int *rowPtr, + int *nnzPerRow, + int *nnzTotal, + cuComplex tol); + +cusparseStatus_t CUSPARSEAPI cusparseZnnz_compress(cusparseHandle_t handle, + int m, + const cusparseMatDescr_t descr, + const cuDoubleComplex *values, + const int *rowPtr, + int *nnzPerRow, + int *nnzTotal, + cuDoubleComplex tol); +/* Description: This routine takes as input a csr form where the values may have 0 elements + and compresses it to return a csr form with no zeros. 
+ */
+
+cusparseStatus_t CUSPARSEAPI cusparseScsr2csr_compress(cusparseHandle_t handle,
+                                                       int m,
+                                                       int n,
+                                                       const cusparseMatDescr_t descra,
+                                                       const float *inVal,
+                                                       const int *inColInd,
+                                                       const int *inRowPtr,
+                                                       int inNnz,
+                                                       int *nnzPerRow,
+                                                       float *outVal,
+                                                       int *outColInd,
+                                                       int *outRowPtr,
+                                                       float tol);
+
+cusparseStatus_t CUSPARSEAPI cusparseDcsr2csr_compress(cusparseHandle_t handle,
+                                                       int m, //number of rows
+                                                       int n,
+                                                       const cusparseMatDescr_t descra,
+                                                       const double *inVal, //csr values array - the elements which are below a certain tolerance will be removed
+                                                       const int *inColInd,
+                                                       const int *inRowPtr, //corresponding input noncompressed row pointer
+                                                       int inNnz,
+                                                       int *nnzPerRow, //output: returns number of nonzeros per row
+                                                       double *outVal,
+                                                       int *outColInd,
+                                                       int *outRowPtr,
+                                                       double tol);
+
+cusparseStatus_t CUSPARSEAPI cusparseCcsr2csr_compress(cusparseHandle_t handle,
+                                                       int m, //number of rows
+                                                       int n,
+                                                       const cusparseMatDescr_t descra,
+                                                       const cuComplex *inVal, //csr values array - the elements which are below a certain tolerance will be removed
+                                                       const int *inColInd,
+                                                       const int *inRowPtr, //corresponding input noncompressed row pointer
+                                                       int inNnz,
+                                                       int *nnzPerRow, //output: returns number of nonzeros per row
+                                                       cuComplex *outVal,
+                                                       int *outColInd,
+                                                       int *outRowPtr,
+                                                       cuComplex tol);
+
+cusparseStatus_t CUSPARSEAPI cusparseZcsr2csr_compress(cusparseHandle_t handle,
+                                                       int m, //number of rows
+                                                       int n,
+                                                       const cusparseMatDescr_t descra,
+                                                       const cuDoubleComplex *inVal, //csr values array - the elements which are below a certain tolerance will be removed
+                                                       const int *inColInd,
+                                                       const int *inRowPtr, //corresponding input noncompressed row pointer
+                                                       int inNnz,
+                                                       int *nnzPerRow, //output: returns number of nonzeros per row
+                                                       cuDoubleComplex *outVal,
+                                                       int *outColInd,
+                                                       int *outRowPtr,
+                                                       cuDoubleComplex tol);
+
+/* Description: This routine converts a dense matrix to a sparse matrix
+   in the CSR storage format, using the information computed by the
+   nnz routine. */
+cusparseStatus_t CUSPARSEAPI cusparseSdense2csr(cusparseHandle_t handle,
+                                                int m,
+                                                int n,
+                                                const cusparseMatDescr_t descrA,
+                                                const float *A,
+                                                int lda,
+                                                const int *nnzPerRow,
+                                                float *csrSortedValA,
+                                                int *csrSortedRowPtrA,
+                                                int *csrSortedColIndA);
+
+cusparseStatus_t CUSPARSEAPI cusparseDdense2csr(cusparseHandle_t handle,
+                                                int m,
+                                                int n,
+                                                const cusparseMatDescr_t descrA,
+                                                const double *A,
+                                                int lda,
+                                                const int *nnzPerRow,
+                                                double *csrSortedValA,
+                                                int *csrSortedRowPtrA,
+                                                int *csrSortedColIndA);
+
+cusparseStatus_t CUSPARSEAPI cusparseCdense2csr(cusparseHandle_t handle,
+                                                int m,
+                                                int n,
+                                                const cusparseMatDescr_t descrA,
+                                                const cuComplex *A,
+                                                int lda,
+                                                const int *nnzPerRow,
+                                                cuComplex *csrSortedValA,
+                                                int *csrSortedRowPtrA,
+                                                int *csrSortedColIndA);
+
+cusparseStatus_t CUSPARSEAPI cusparseZdense2csr(cusparseHandle_t handle,
+                                                int m,
+                                                int n,
+                                                const cusparseMatDescr_t descrA,
+                                                const cuDoubleComplex *A,
+                                                int lda,
+                                                const int *nnzPerRow,
+                                                cuDoubleComplex *csrSortedValA,
+                                                int *csrSortedRowPtrA,
+                                                int *csrSortedColIndA);
+
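A dense-to-CSR conversion therefore pairs cusparseSnnz (from the section above) with cusparseSdense2csr: the per-row counts returned by the former size the CSR arrays consumed by the latter. A minimal float sketch with hypothetical names, host pointer mode assumed:

/* illustrative sketch, not part of the original header */
int nnzTotal = 0;
int *nnzPerRow, *csrRowPtr, *csrColInd;
float *csrVal;
cudaMalloc((void **)&nnzPerRow, sizeof(int) * m);
/* count non-zeros per row of the dense m x n matrix A (leading dimension lda) */
cusparseSnnz(handle, CUSPARSE_DIRECTION_ROW, m, n, descrA,
             A, lda, nnzPerRow, &nnzTotal);
cudaMalloc((void **)&csrRowPtr, sizeof(int) * (m + 1));
cudaMalloc((void **)&csrColInd, sizeof(int) * nnzTotal);
cudaMalloc((void **)&csrVal, sizeof(float) * nnzTotal);
cusparseSdense2csr(handle, m, n, descrA, A, lda, nnzPerRow,
                   csrVal, csrRowPtr, csrColInd);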
+/* Description: This routine converts a sparse matrix in CSR storage format
+   to a dense matrix. */
+cusparseStatus_t CUSPARSEAPI cusparseScsr2dense(cusparseHandle_t handle,
+                                                int m,
+                                                int n,
+                                                const cusparseMatDescr_t descrA,
+                                                const float *csrSortedValA,
+                                                const int *csrSortedRowPtrA,
+                                                const int *csrSortedColIndA,
+                                                float *A,
+                                                int lda);
+
+cusparseStatus_t CUSPARSEAPI cusparseDcsr2dense(cusparseHandle_t handle,
+                                                int m,
+                                                int n,
+                                                const cusparseMatDescr_t descrA,
+                                                const double *csrSortedValA,
+                                                const int *csrSortedRowPtrA,
+                                                const int *csrSortedColIndA,
+                                                double *A,
+                                                int lda);
+
+cusparseStatus_t CUSPARSEAPI cusparseCcsr2dense(cusparseHandle_t handle,
+                                                int m,
+                                                int n,
+                                                const cusparseMatDescr_t descrA,
+                                                const cuComplex *csrSortedValA,
+                                                const int *csrSortedRowPtrA,
+                                                const int *csrSortedColIndA,
+                                                cuComplex *A,
+                                                int lda);
+
+cusparseStatus_t CUSPARSEAPI cusparseZcsr2dense(cusparseHandle_t handle,
+                                                int m,
+                                                int n,
+                                                const cusparseMatDescr_t descrA,
+                                                const cuDoubleComplex *csrSortedValA,
+                                                const int *csrSortedRowPtrA,
+                                                const int *csrSortedColIndA,
+                                                cuDoubleComplex *A,
+                                                int lda);
+
+/* Description: This routine converts a dense matrix to a sparse matrix
+   in the CSC storage format, using the information computed by the
+   nnz routine. */
+cusparseStatus_t CUSPARSEAPI cusparseSdense2csc(cusparseHandle_t handle,
+                                                int m,
+                                                int n,
+                                                const cusparseMatDescr_t descrA,
+                                                const float *A,
+                                                int lda,
+                                                const int *nnzPerCol,
+                                                float *cscSortedValA,
+                                                int *cscSortedRowIndA,
+                                                int *cscSortedColPtrA);
+
+cusparseStatus_t CUSPARSEAPI cusparseDdense2csc(cusparseHandle_t handle,
+                                                int m,
+                                                int n,
+                                                const cusparseMatDescr_t descrA,
+                                                const double *A,
+                                                int lda,
+                                                const int *nnzPerCol,
+                                                double *cscSortedValA,
+                                                int *cscSortedRowIndA,
+                                                int *cscSortedColPtrA);
+
+cusparseStatus_t CUSPARSEAPI cusparseCdense2csc(cusparseHandle_t handle,
+                                                int m,
+                                                int n,
+                                                const cusparseMatDescr_t descrA,
+                                                const cuComplex *A,
+                                                int lda,
+                                                const int *nnzPerCol,
+                                                cuComplex *cscSortedValA,
+                                                int *cscSortedRowIndA,
+                                                int *cscSortedColPtrA);
+
+cusparseStatus_t CUSPARSEAPI cusparseZdense2csc(cusparseHandle_t handle,
+                                                int m,
+                                                int n,
+                                                const cusparseMatDescr_t descrA,
+                                                const cuDoubleComplex *A,
+                                                int lda,
+                                                const int *nnzPerCol,
+                                                cuDoubleComplex *cscSortedValA,
+                                                int *cscSortedRowIndA,
+                                                int *cscSortedColPtrA);
+
+/* Description: This routine converts a sparse matrix in CSC storage format
+   to a dense matrix. */
+cusparseStatus_t CUSPARSEAPI cusparseScsc2dense(cusparseHandle_t handle,
+                                                int m,
+                                                int n,
+                                                const cusparseMatDescr_t descrA,
+                                                const float *cscSortedValA,
+                                                const int *cscSortedRowIndA,
+                                                const int *cscSortedColPtrA,
+                                                float *A,
+                                                int lda);
+
+cusparseStatus_t CUSPARSEAPI cusparseDcsc2dense(cusparseHandle_t handle,
+                                                int m,
+                                                int n,
+                                                const cusparseMatDescr_t descrA,
+                                                const double *cscSortedValA,
+                                                const int *cscSortedRowIndA,
+                                                const int *cscSortedColPtrA,
+                                                double *A,
+                                                int lda);
+
+cusparseStatus_t CUSPARSEAPI cusparseCcsc2dense(cusparseHandle_t handle,
+                                                int m,
+                                                int n,
+                                                const cusparseMatDescr_t descrA,
+                                                const cuComplex *cscSortedValA,
+                                                const int *cscSortedRowIndA,
+                                                const int *cscSortedColPtrA,
+                                                cuComplex *A,
+                                                int lda);
+
+cusparseStatus_t CUSPARSEAPI cusparseZcsc2dense(cusparseHandle_t handle,
+                                                int m,
+                                                int n,
+                                                const cusparseMatDescr_t descrA,
+                                                const cuDoubleComplex *cscSortedValA,
+                                                const int *cscSortedRowIndA,
+                                                const int *cscSortedColPtrA,
+                                                cuDoubleComplex *A,
+                                                int lda);
+
+/* Description: This routine compresses the indices of rows or columns.
+   It can be interpreted as a conversion from COO to CSR sparse storage
+   format. */
+cusparseStatus_t CUSPARSEAPI cusparseXcoo2csr(cusparseHandle_t handle,
+                                              const int *cooRowInd,
+                                              int nnz,
+                                              int m,
+                                              int *csrSortedRowPtr,
+                                              cusparseIndexBase_t idxBase);
+
+/* Description: This routine uncompresses the indices of rows or columns.
+   It can be interpreted as a conversion from CSR to COO sparse storage
+   format. */
+cusparseStatus_t CUSPARSEAPI cusparseXcsr2coo(cusparseHandle_t handle,
+                                              const int *csrSortedRowPtr,
+                                              int nnz,
+                                              int m,
+                                              int *cooRowInd,
+                                              cusparseIndexBase_t idxBase);
+
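Since only the row indices differ between the two formats, the COO/CSR conversion is cheap: coo2csr compresses a row-sorted COO row-index array into an (m + 1)-entry row pointer, and csr2coo expands it back; values and column indices are shared untouched. A short sketch with hypothetical names:

/* illustrative sketch, not part of the original header */
int *csrRowPtr;   /* m + 1 entries */
cudaMalloc((void **)&csrRowPtr, sizeof(int) * (m + 1));
/* cooRowInd must be sorted by row for the compression to be valid */
cusparseXcoo2csr(handle, cooRowInd, nnz, m, csrRowPtr, CUSPARSE_INDEX_BASE_ZERO);
/* and the inverse direction re-expands the row pointer: */
cusparseXcsr2coo(handle, csrRowPtr, nnz, m, cooRowInd, CUSPARSE_INDEX_BASE_ZERO);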
*/
+cusparseStatus_t CUSPARSEAPI cusparseXcoo2csr(cusparseHandle_t handle,
+                                              const int *cooRowInd,
+                                              int nnz,
+                                              int m,
+                                              int *csrSortedRowPtr,
+                                              cusparseIndexBase_t idxBase);
+
+/* Description: This routine uncompresses the indices of rows or columns.
+   It can be interpreted as a conversion from CSR to COO sparse storage
+   format. */
+cusparseStatus_t CUSPARSEAPI cusparseXcsr2coo(cusparseHandle_t handle,
+                                              const int *csrSortedRowPtr,
+                                              int nnz,
+                                              int m,
+                                              int *cooRowInd,
+                                              cusparseIndexBase_t idxBase);
+
+/* Description: This routine converts a matrix from CSR to CSC sparse
+   storage format. The resulting matrix can be re-interpreted as a
+   transpose of the original matrix in CSR storage format. */
+cusparseStatus_t CUSPARSEAPI cusparseCsr2cscEx(cusparseHandle_t handle,
+                                               int m,
+                                               int n,
+                                               int nnz,
+                                               const void *csrSortedVal,
+                                               cudaDataType csrSortedValtype,
+                                               const int *csrSortedRowPtr,
+                                               const int *csrSortedColInd,
+                                               void *cscSortedVal,
+                                               cudaDataType cscSortedValtype,
+                                               int *cscSortedRowInd,
+                                               int *cscSortedColPtr,
+                                               cusparseAction_t copyValues,
+                                               cusparseIndexBase_t idxBase,
+                                               cudaDataType executiontype);
+
+cusparseStatus_t CUSPARSEAPI cusparseScsr2csc(cusparseHandle_t handle,
+                                              int m,
+                                              int n,
+                                              int nnz,
+                                              const float *csrSortedVal,
+                                              const int *csrSortedRowPtr,
+                                              const int *csrSortedColInd,
+                                              float *cscSortedVal,
+                                              int *cscSortedRowInd,
+                                              int *cscSortedColPtr,
+                                              cusparseAction_t copyValues,
+                                              cusparseIndexBase_t idxBase);
+
+cusparseStatus_t CUSPARSEAPI cusparseDcsr2csc(cusparseHandle_t handle,
+                                              int m,
+                                              int n,
+                                              int nnz,
+                                              const double *csrSortedVal,
+                                              const int *csrSortedRowPtr,
+                                              const int *csrSortedColInd,
+                                              double *cscSortedVal,
+                                              int *cscSortedRowInd,
+                                              int *cscSortedColPtr,
+                                              cusparseAction_t copyValues,
+                                              cusparseIndexBase_t idxBase);
+
+cusparseStatus_t CUSPARSEAPI cusparseCcsr2csc(cusparseHandle_t handle,
+                                              int m,
+                                              int n,
+                                              int nnz,
+                                              const cuComplex *csrSortedVal,
+                                              const int *csrSortedRowPtr,
+                                              const int *csrSortedColInd,
+                                              cuComplex *cscSortedVal,
+                                              int *cscSortedRowInd,
+                                              int *cscSortedColPtr,
+                                              cusparseAction_t copyValues,
+                                              cusparseIndexBase_t idxBase);
+
+cusparseStatus_t CUSPARSEAPI cusparseZcsr2csc(cusparseHandle_t handle,
+                                              int m,
+                                              int n,
+                                              int nnz,
+                                              const cuDoubleComplex *csrSortedVal,
+                                              const int *csrSortedRowPtr,
+                                              const int *csrSortedColInd,
+                                              cuDoubleComplex *cscSortedVal,
+                                              int *cscSortedRowInd,
+                                              int *cscSortedColPtr,
+                                              cusparseAction_t copyValues,
+                                              cusparseIndexBase_t idxBase);
+
+/* Description: This routine converts a dense matrix to a sparse matrix
+   in HYB storage format.
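+
+   A minimal sketch (assuming nnzPerRow was filled by the nnz routine and
+   hybA was created with cusparseCreateHybMat; error checking omitted):
+
+     cusparseSdense2hyb(handle, m, n, descrA, A, lda, nnzPerRow,
+                        hybA,
+                        0,                 // userEllWidth, unused with AUTO partitioning
+                        CUSPARSE_HYB_PARTITION_AUTO);
+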
*/ +cusparseStatus_t CUSPARSEAPI cusparseSdense2hyb(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const float *A, + int lda, + const int *nnzPerRow, + cusparseHybMat_t hybA, + int userEllWidth, + cusparseHybPartition_t partitionType); + +cusparseStatus_t CUSPARSEAPI cusparseDdense2hyb(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const double *A, + int lda, + const int *nnzPerRow, + cusparseHybMat_t hybA, + int userEllWidth, + cusparseHybPartition_t partitionType); + +cusparseStatus_t CUSPARSEAPI cusparseCdense2hyb(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuComplex *A, + int lda, + const int *nnzPerRow, + cusparseHybMat_t hybA, + int userEllWidth, + cusparseHybPartition_t partitionType); + +cusparseStatus_t CUSPARSEAPI cusparseZdense2hyb(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *A, + int lda, + const int *nnzPerRow, + cusparseHybMat_t hybA, + int userEllWidth, + cusparseHybPartition_t partitionType); + +/* Description: This routine converts a sparse matrix in HYB storage format + to a dense matrix. */ +cusparseStatus_t CUSPARSEAPI cusparseShyb2dense(cusparseHandle_t handle, + const cusparseMatDescr_t descrA, + const cusparseHybMat_t hybA, + float *A, + int lda); + +cusparseStatus_t CUSPARSEAPI cusparseDhyb2dense(cusparseHandle_t handle, + const cusparseMatDescr_t descrA, + const cusparseHybMat_t hybA, + double *A, + int lda); + +cusparseStatus_t CUSPARSEAPI cusparseChyb2dense(cusparseHandle_t handle, + const cusparseMatDescr_t descrA, + const cusparseHybMat_t hybA, + cuComplex *A, + int lda); + +cusparseStatus_t CUSPARSEAPI cusparseZhyb2dense(cusparseHandle_t handle, + const cusparseMatDescr_t descrA, + const cusparseHybMat_t hybA, + cuDoubleComplex *A, + int lda); + +/* Description: This routine converts a sparse matrix in CSR storage format + to a sparse matrix in HYB storage format. */ +cusparseStatus_t CUSPARSEAPI cusparseScsr2hyb(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const float *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseHybMat_t hybA, + int userEllWidth, + cusparseHybPartition_t partitionType); + +cusparseStatus_t CUSPARSEAPI cusparseDcsr2hyb(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const double *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseHybMat_t hybA, + int userEllWidth, + cusparseHybPartition_t partitionType); + +cusparseStatus_t CUSPARSEAPI cusparseCcsr2hyb(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseHybMat_t hybA, + int userEllWidth, + cusparseHybPartition_t partitionType); + +cusparseStatus_t CUSPARSEAPI cusparseZcsr2hyb(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + cusparseHybMat_t hybA, + int userEllWidth, + cusparseHybPartition_t partitionType); + +/* Description: This routine converts a sparse matrix in HYB storage format + to a sparse matrix in CSR storage format. 
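+
+   A minimal sketch (assuming hybA was populated earlier, e.g. by
+   cusparseScsr2hyb below, and the CSR output arrays are preallocated to
+   the sizes of the original matrix; error checking omitted):
+
+     cusparseShyb2csr(handle, descrA, hybA,
+                      csrValA, csrRowPtrA, csrColIndA);
+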
*/ +cusparseStatus_t CUSPARSEAPI cusparseShyb2csr(cusparseHandle_t handle, + const cusparseMatDescr_t descrA, + const cusparseHybMat_t hybA, + float *csrSortedValA, + int *csrSortedRowPtrA, + int *csrSortedColIndA); + +cusparseStatus_t CUSPARSEAPI cusparseDhyb2csr(cusparseHandle_t handle, + const cusparseMatDescr_t descrA, + const cusparseHybMat_t hybA, + double *csrSortedValA, + int *csrSortedRowPtrA, + int *csrSortedColIndA); + +cusparseStatus_t CUSPARSEAPI cusparseChyb2csr(cusparseHandle_t handle, + const cusparseMatDescr_t descrA, + const cusparseHybMat_t hybA, + cuComplex *csrSortedValA, + int *csrSortedRowPtrA, + int *csrSortedColIndA); + +cusparseStatus_t CUSPARSEAPI cusparseZhyb2csr(cusparseHandle_t handle, + const cusparseMatDescr_t descrA, + const cusparseHybMat_t hybA, + cuDoubleComplex *csrSortedValA, + int *csrSortedRowPtrA, + int *csrSortedColIndA); + +/* Description: This routine converts a sparse matrix in CSC storage format + to a sparse matrix in HYB storage format. */ +cusparseStatus_t CUSPARSEAPI cusparseScsc2hyb(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const float *cscSortedValA, + const int *cscSortedRowIndA, + const int *cscSortedColPtrA, + cusparseHybMat_t hybA, + int userEllWidth, + cusparseHybPartition_t partitionType); + +cusparseStatus_t CUSPARSEAPI cusparseDcsc2hyb(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const double *cscSortedValA, + const int *cscSortedRowIndA, + const int *cscSortedColPtrA, + cusparseHybMat_t hybA, + int userEllWidth, + cusparseHybPartition_t partitionType); + +cusparseStatus_t CUSPARSEAPI cusparseCcsc2hyb(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuComplex *cscSortedValA, + const int *cscSortedRowIndA, + const int *cscSortedColPtrA, + cusparseHybMat_t hybA, + int userEllWidth, + cusparseHybPartition_t partitionType); + +cusparseStatus_t CUSPARSEAPI cusparseZcsc2hyb(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *cscSortedValA, + const int *cscSortedRowIndA, + const int *cscSortedColPtrA, + cusparseHybMat_t hybA, + int userEllWidth, + cusparseHybPartition_t partitionType); + +/* Description: This routine converts a sparse matrix in HYB storage format + to a sparse matrix in CSC storage format. */ +cusparseStatus_t CUSPARSEAPI cusparseShyb2csc(cusparseHandle_t handle, + const cusparseMatDescr_t descrA, + const cusparseHybMat_t hybA, + float *cscSortedVal, + int *cscSortedRowInd, + int *cscSortedColPtr); + +cusparseStatus_t CUSPARSEAPI cusparseDhyb2csc(cusparseHandle_t handle, + const cusparseMatDescr_t descrA, + const cusparseHybMat_t hybA, + double *cscSortedVal, + int *cscSortedRowInd, + int *cscSortedColPtr); + +cusparseStatus_t CUSPARSEAPI cusparseChyb2csc(cusparseHandle_t handle, + const cusparseMatDescr_t descrA, + const cusparseHybMat_t hybA, + cuComplex *cscSortedVal, + int *cscSortedRowInd, + int *cscSortedColPtr); + +cusparseStatus_t CUSPARSEAPI cusparseZhyb2csc(cusparseHandle_t handle, + const cusparseMatDescr_t descrA, + const cusparseHybMat_t hybA, + cuDoubleComplex *cscSortedVal, + int *cscSortedRowInd, + int *cscSortedColPtr); + +/* Description: This routine converts a sparse matrix in CSR storage format + to a sparse matrix in block-CSR storage format. 
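+
+   Conversion follows the usual two-phase pattern: first query the number
+   of nonzero blocks, then allocate and convert. A sketch (device arrays
+   and valid handle/descriptors assumed; error checking omitted):
+
+     int nnzb;
+     int mb = (m + blockDim - 1) / blockDim;     // number of block rows
+     int *bsrRowPtrC, *bsrColIndC;
+     float *bsrValC;
+     cudaMalloc((void **)&bsrRowPtrC, sizeof(int) * (mb + 1));
+     cusparseXcsr2bsrNnz(handle, dirA, m, n, descrA,
+                         csrRowPtrA, csrColIndA, blockDim,
+                         descrC, bsrRowPtrC, &nnzb);
+     cudaMalloc((void **)&bsrColIndC, sizeof(int) * nnzb);
+     cudaMalloc((void **)&bsrValC, sizeof(float) * nnzb * blockDim * blockDim);
+     cusparseScsr2bsr(handle, dirA, m, n, descrA,
+                      csrValA, csrRowPtrA, csrColIndA, blockDim,
+                      descrC, bsrValC, bsrRowPtrC, bsrColIndC);
+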
*/ +cusparseStatus_t CUSPARSEAPI cusparseXcsr2bsrNnz(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + int blockDim, + const cusparseMatDescr_t descrC, + int *bsrSortedRowPtrC, + int *nnzTotalDevHostPtr); + +cusparseStatus_t CUSPARSEAPI cusparseScsr2bsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const float *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + int blockDim, + const cusparseMatDescr_t descrC, + float *bsrSortedValC, + int *bsrSortedRowPtrC, + int *bsrSortedColIndC); + +cusparseStatus_t CUSPARSEAPI cusparseDcsr2bsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const double *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + int blockDim, + const cusparseMatDescr_t descrC, + double *bsrSortedValC, + int *bsrSortedRowPtrC, + int *bsrSortedColIndC); + +cusparseStatus_t CUSPARSEAPI cusparseCcsr2bsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + int blockDim, + const cusparseMatDescr_t descrC, + cuComplex *bsrSortedValC, + int *bsrSortedRowPtrC, + int *bsrSortedColIndC); + +cusparseStatus_t CUSPARSEAPI cusparseZcsr2bsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + int blockDim, + const cusparseMatDescr_t descrC, + cuDoubleComplex *bsrSortedValC, + int *bsrSortedRowPtrC, + int *bsrSortedColIndC); + +/* Description: This routine converts a sparse matrix in block-CSR storage format + to a sparse matrix in CSR storage format. 
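+
+   A minimal sketch of the reverse direction (mb and nb are the block-row
+   and block-column counts; the resulting CSR matrix has mb*blockDim rows;
+   error checking omitted):
+
+     cusparseSbsr2csr(handle, dirA, mb, nb, descrA,
+                      bsrValA, bsrRowPtrA, bsrColIndA, blockDim,
+                      descrC, csrValC, csrRowPtrC, csrColIndC);
+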
*/ +cusparseStatus_t CUSPARSEAPI cusparseSbsr2csr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + const cusparseMatDescr_t descrA, + const float *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int blockDim, + const cusparseMatDescr_t descrC, + float *csrSortedValC, + int *csrSortedRowPtrC, + int *csrSortedColIndC); + +cusparseStatus_t CUSPARSEAPI cusparseDbsr2csr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + const cusparseMatDescr_t descrA, + const double *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int blockDim, + const cusparseMatDescr_t descrC, + double *csrSortedValC, + int *csrSortedRowPtrC, + int *csrSortedColIndC); + +cusparseStatus_t CUSPARSEAPI cusparseCbsr2csr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + const cusparseMatDescr_t descrA, + const cuComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int blockDim, + const cusparseMatDescr_t descrC, + cuComplex *csrSortedValC, + int *csrSortedRowPtrC, + int *csrSortedColIndC); + +cusparseStatus_t CUSPARSEAPI cusparseZbsr2csr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int blockDim, + const cusparseMatDescr_t descrC, + cuDoubleComplex *csrSortedValC, + int *csrSortedRowPtrC, + int *csrSortedColIndC); + +/* Description: This routine converts a sparse matrix in general block-CSR storage format + to a sparse matrix in general block-CSC storage format. */ +cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsc_bufferSize(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const float *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsc_bufferSize(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const double *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsc_bufferSize(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const cuComplex *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsc_bufferSize(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const cuDoubleComplex *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsc_bufferSizeExt(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const float *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsc_bufferSizeExt(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const double *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsc_bufferSizeExt(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const cuComplex *bsrSortedVal, + const int *bsrSortedRowPtr, + const int 
*bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsc_bufferSizeExt(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const cuDoubleComplex *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + size_t *pBufferSize); + + +cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsc(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const float *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + float *bscVal, + int *bscRowInd, + int *bscColPtr, + cusparseAction_t copyValues, + cusparseIndexBase_t baseIdx, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsc(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const double *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + double *bscVal, + int *bscRowInd, + int *bscColPtr, + cusparseAction_t copyValues, + cusparseIndexBase_t baseIdx, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsc(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const cuComplex *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + cuComplex *bscVal, + int *bscRowInd, + int *bscColPtr, + cusparseAction_t copyValues, + cusparseIndexBase_t baseIdx, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsc(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const cuDoubleComplex *bsrSortedVal, + const int *bsrSortedRowPtr, + const int *bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + cuDoubleComplex *bscVal, + int *bscRowInd, + int *bscColPtr, + cusparseAction_t copyValues, + cusparseIndexBase_t baseIdx, + void *pBuffer); + +/* Description: This routine converts a sparse matrix in general block-CSR storage format + to a sparse matrix in CSR storage format. 
*/ +cusparseStatus_t CUSPARSEAPI cusparseXgebsr2csr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + const cusparseMatDescr_t descrA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int rowBlockDim, + int colBlockDim, + const cusparseMatDescr_t descrC, + int *csrSortedRowPtrC, + int *csrSortedColIndC ); + +cusparseStatus_t CUSPARSEAPI cusparseSgebsr2csr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + const cusparseMatDescr_t descrA, + const float *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int rowBlockDim, + int colBlockDim, + const cusparseMatDescr_t descrC, + float *csrSortedValC, + int *csrSortedRowPtrC, + int *csrSortedColIndC ); + + +cusparseStatus_t CUSPARSEAPI cusparseDgebsr2csr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + const cusparseMatDescr_t descrA, + const double *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int rowBlockDim, + int colBlockDim, + const cusparseMatDescr_t descrC, + double *csrSortedValC, + int *csrSortedRowPtrC, + int *csrSortedColIndC ); + + +cusparseStatus_t CUSPARSEAPI cusparseCgebsr2csr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + const cusparseMatDescr_t descrA, + const cuComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int rowBlockDim, + int colBlockDim, + const cusparseMatDescr_t descrC, + cuComplex *csrSortedValC, + int *csrSortedRowPtrC, + int *csrSortedColIndC ); + + +cusparseStatus_t CUSPARSEAPI cusparseZgebsr2csr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int rowBlockDim, + int colBlockDim, + const cusparseMatDescr_t descrC, + cuDoubleComplex *csrSortedValC, + int *csrSortedRowPtrC, + int *csrSortedColIndC ); + +/* Description: This routine converts a sparse matrix in CSR storage format + to a sparse matrix in general block-CSR storage format. 
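+
+   This conversion additionally needs a temporary work buffer, so the
+   pattern becomes three-phase: size the buffer, count nonzero blocks,
+   then convert. A sketch (assumes bsrRowPtrC is preallocated with mb+1
+   entries, where mb is the number of block rows; error checking omitted):
+
+     int bufferSize, nnzb;
+     void *pBuffer;
+     cusparseScsr2gebsr_bufferSize(handle, dirA, m, n, descrA,
+                                   csrValA, csrRowPtrA, csrColIndA,
+                                   rowBlockDim, colBlockDim, &bufferSize);
+     cudaMalloc(&pBuffer, bufferSize);
+     cusparseXcsr2gebsrNnz(handle, dirA, m, n, descrA,
+                           csrRowPtrA, csrColIndA,
+                           descrC, bsrRowPtrC, rowBlockDim, colBlockDim,
+                           &nnzb, pBuffer);
+     // ...allocate bsrValC and bsrColIndC from nnzb, then call
+     // cusparseScsr2gebsr with the same buffer...
+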
*/ +cusparseStatus_t CUSPARSEAPI cusparseScsr2gebsr_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const float *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + int rowBlockDim, + int colBlockDim, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseDcsr2gebsr_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const double *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + int rowBlockDim, + int colBlockDim, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseCcsr2gebsr_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + int rowBlockDim, + int colBlockDim, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseZcsr2gebsr_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + int rowBlockDim, + int colBlockDim, + int *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseScsr2gebsr_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const float *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + int rowBlockDim, + int colBlockDim, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseDcsr2gebsr_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const double *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + int rowBlockDim, + int colBlockDim, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseCcsr2gebsr_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + int rowBlockDim, + int colBlockDim, + size_t *pBufferSize); + +cusparseStatus_t CUSPARSEAPI cusparseZcsr2gebsr_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + int rowBlockDim, + int colBlockDim, + size_t *pBufferSize); + + + +cusparseStatus_t CUSPARSEAPI cusparseXcsr2gebsrNnz(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const cusparseMatDescr_t descrC, + int *bsrSortedRowPtrC, + int rowBlockDim, + int colBlockDim, + int *nnzTotalDevHostPtr, + void *pBuffer ); + +cusparseStatus_t CUSPARSEAPI cusparseScsr2gebsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const float *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const cusparseMatDescr_t descrC, + float *bsrSortedValC, + int *bsrSortedRowPtrC, + int *bsrSortedColIndC, + int rowBlockDim, + int colBlockDim, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDcsr2gebsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const 
cusparseMatDescr_t descrA, + const double *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const cusparseMatDescr_t descrC, + double *bsrSortedValC, + int *bsrSortedRowPtrC, + int *bsrSortedColIndC, + int rowBlockDim, + int colBlockDim, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCcsr2gebsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const cusparseMatDescr_t descrC, + cuComplex *bsrSortedValC, + int *bsrSortedRowPtrC, + int *bsrSortedColIndC, + int rowBlockDim, + int colBlockDim, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZcsr2gebsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, + const int *csrSortedColIndA, + const cusparseMatDescr_t descrC, + cuDoubleComplex *bsrSortedValC, + int *bsrSortedRowPtrC, + int *bsrSortedColIndC, + int rowBlockDim, + int colBlockDim, + void *pBuffer); + +/* Description: This routine converts a sparse matrix in general block-CSR storage format + to a sparse matrix in general block-CSR storage format with different block size. */ +cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsr_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const float *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + int rowBlockDimC, + int colBlockDimC, + int *pBufferSizeInBytes ); + +cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsr_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const double *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + int rowBlockDimC, + int colBlockDimC, + int *pBufferSizeInBytes ); + +cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsr_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const cuComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + int rowBlockDimC, + int colBlockDimC, + int *pBufferSizeInBytes ); + +cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsr_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const cuDoubleComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + int rowBlockDimC, + int colBlockDimC, + int *pBufferSizeInBytes ); + + +cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsr_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const float *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + int rowBlockDimC, + int colBlockDimC, + size_t *pBufferSize ); + +cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsr_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const double *bsrSortedValA, + const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, + int 
rowBlockDimA,
+                                                                int colBlockDimA,
+                                                                int rowBlockDimC,
+                                                                int colBlockDimC,
+                                                                size_t *pBufferSize );
+
+cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsr_bufferSizeExt(cusparseHandle_t handle,
+                                                                cusparseDirection_t dirA,
+                                                                int mb,
+                                                                int nb,
+                                                                int nnzb,
+                                                                const cusparseMatDescr_t descrA,
+                                                                const cuComplex *bsrSortedValA,
+                                                                const int *bsrSortedRowPtrA,
+                                                                const int *bsrSortedColIndA,
+                                                                int rowBlockDimA,
+                                                                int colBlockDimA,
+                                                                int rowBlockDimC,
+                                                                int colBlockDimC,
+                                                                size_t *pBufferSize );
+
+cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsr_bufferSizeExt(cusparseHandle_t handle,
+                                                                cusparseDirection_t dirA,
+                                                                int mb,
+                                                                int nb,
+                                                                int nnzb,
+                                                                const cusparseMatDescr_t descrA,
+                                                                const cuDoubleComplex *bsrSortedValA,
+                                                                const int *bsrSortedRowPtrA,
+                                                                const int *bsrSortedColIndA,
+                                                                int rowBlockDimA,
+                                                                int colBlockDimA,
+                                                                int rowBlockDimC,
+                                                                int colBlockDimC,
+                                                                size_t *pBufferSize );
+
+
+cusparseStatus_t CUSPARSEAPI cusparseXgebsr2gebsrNnz(cusparseHandle_t handle,
+                                                     cusparseDirection_t dirA,
+                                                     int mb,
+                                                     int nb,
+                                                     int nnzb,
+                                                     const cusparseMatDescr_t descrA,
+                                                     const int *bsrSortedRowPtrA,
+                                                     const int *bsrSortedColIndA,
+                                                     int rowBlockDimA,
+                                                     int colBlockDimA,
+                                                     const cusparseMatDescr_t descrC,
+                                                     int *bsrSortedRowPtrC,
+                                                     int rowBlockDimC,
+                                                     int colBlockDimC,
+                                                     int *nnzTotalDevHostPtr,
+                                                     void *pBuffer);
+
+cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsr(cusparseHandle_t handle,
+                                                  cusparseDirection_t dirA,
+                                                  int mb,
+                                                  int nb,
+                                                  int nnzb,
+                                                  const cusparseMatDescr_t descrA,
+                                                  const float *bsrSortedValA,
+                                                  const int *bsrSortedRowPtrA,
+                                                  const int *bsrSortedColIndA,
+                                                  int rowBlockDimA,
+                                                  int colBlockDimA,
+                                                  const cusparseMatDescr_t descrC,
+                                                  float *bsrSortedValC,
+                                                  int *bsrSortedRowPtrC,
+                                                  int *bsrSortedColIndC,
+                                                  int rowBlockDimC,
+                                                  int colBlockDimC,
+                                                  void *pBuffer);
+
+cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsr(cusparseHandle_t handle,
+                                                  cusparseDirection_t dirA,
+                                                  int mb,
+                                                  int nb,
+                                                  int nnzb,
+                                                  const cusparseMatDescr_t descrA,
+                                                  const double *bsrSortedValA,
+                                                  const int *bsrSortedRowPtrA,
+                                                  const int *bsrSortedColIndA,
+                                                  int rowBlockDimA,
+                                                  int colBlockDimA,
+                                                  const cusparseMatDescr_t descrC,
+                                                  double *bsrSortedValC,
+                                                  int *bsrSortedRowPtrC,
+                                                  int *bsrSortedColIndC,
+                                                  int rowBlockDimC,
+                                                  int colBlockDimC,
+                                                  void *pBuffer);
+
+cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsr(cusparseHandle_t handle,
+                                                  cusparseDirection_t dirA,
+                                                  int mb,
+                                                  int nb,
+                                                  int nnzb,
+                                                  const cusparseMatDescr_t descrA,
+                                                  const cuComplex *bsrSortedValA,
+                                                  const int *bsrSortedRowPtrA,
+                                                  const int *bsrSortedColIndA,
+                                                  int rowBlockDimA,
+                                                  int colBlockDimA,
+                                                  const cusparseMatDescr_t descrC,
+                                                  cuComplex *bsrSortedValC,
+                                                  int *bsrSortedRowPtrC,
+                                                  int *bsrSortedColIndC,
+                                                  int rowBlockDimC,
+                                                  int colBlockDimC,
+                                                  void *pBuffer);
+
+cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsr(cusparseHandle_t handle,
+                                                  cusparseDirection_t dirA,
+                                                  int mb,
+                                                  int nb,
+                                                  int nnzb,
+                                                  const cusparseMatDescr_t descrA,
+                                                  const cuDoubleComplex *bsrSortedValA,
+                                                  const int *bsrSortedRowPtrA,
+                                                  const int *bsrSortedColIndA,
+                                                  int rowBlockDimA,
+                                                  int colBlockDimA,
+                                                  const cusparseMatDescr_t descrC,
+                                                  cuDoubleComplex *bsrSortedValC,
+                                                  int *bsrSortedRowPtrC,
+                                                  int *bsrSortedColIndC,
+                                                  int rowBlockDimC,
+                                                  int colBlockDimC,
+                                                  void *pBuffer);
+
+/* --- Sparse Matrix Sorting --- */
+
+/* Description: Create an identity sequence p=[0,1,...,n-1].
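+
+   The permutation is typically used together with the sort routines
+   below, e.g. sorting a COO matrix by row and then gathering its values
+   (a sketch that assumes the level-1 gather routine cusparseSgthr and
+   device arrays; error checking omitted):
+
+     size_t bufferSize;
+     void *pBuffer;
+     int *P;
+     cusparseXcoosort_bufferSizeExt(handle, m, n, nnz, cooRows, cooCols,
+                                    &bufferSize);
+     cudaMalloc(&pBuffer, bufferSize);
+     cudaMalloc((void **)&P, sizeof(int) * nnz);
+     cusparseCreateIdentityPermutation(handle, nnz, P);
+     cusparseXcoosortByRow(handle, m, n, nnz, cooRows, cooCols, P, pBuffer);
+     cusparseSgthr(handle, nnz, cooValsUnsorted, cooValsSorted, P,
+                   CUSPARSE_INDEX_BASE_ZERO);    // values into sorted order
+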
*/ +cusparseStatus_t CUSPARSEAPI cusparseCreateIdentityPermutation(cusparseHandle_t handle, + int n, + int *p); + +/* Description: Sort sparse matrix stored in COO format */ +cusparseStatus_t CUSPARSEAPI cusparseXcoosort_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int nnz, + const int *cooRowsA, + const int *cooColsA, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseXcoosortByRow(cusparseHandle_t handle, + int m, + int n, + int nnz, + int *cooRowsA, + int *cooColsA, + int *P, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseXcoosortByColumn(cusparseHandle_t handle, + int m, + int n, + int nnz, + int *cooRowsA, + int *cooColsA, + int *P, + void *pBuffer); + +/* Description: Sort sparse matrix stored in CSR format */ +cusparseStatus_t CUSPARSEAPI cusparseXcsrsort_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int nnz, + const int *csrRowPtrA, + const int *csrColIndA, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseXcsrsort(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + const int *csrRowPtrA, + int *csrColIndA, + int *P, + void *pBuffer); + +/* Description: Sort sparse matrix stored in CSC format */ +cusparseStatus_t CUSPARSEAPI cusparseXcscsort_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int nnz, + const int *cscColPtrA, + const int *cscRowIndA, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseXcscsort(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + const int *cscColPtrA, + int *cscRowIndA, + int *P, + void *pBuffer); + +/* Description: Wrapper that sorts sparse matrix stored in CSR format + (without exposing the permutation). */ +cusparseStatus_t CUSPARSEAPI cusparseScsru2csr_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int nnz, + float *csrVal, + const int *csrRowPtr, + int *csrColInd, + csru2csrInfo_t info, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseDcsru2csr_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int nnz, + double *csrVal, + const int *csrRowPtr, + int *csrColInd, + csru2csrInfo_t info, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseCcsru2csr_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int nnz, + cuComplex *csrVal, + const int *csrRowPtr, + int *csrColInd, + csru2csrInfo_t info, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseZcsru2csr_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int nnz, + cuDoubleComplex *csrVal, + const int *csrRowPtr, + int *csrColInd, + csru2csrInfo_t info, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseScsru2csr(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + float *csrVal, + const int *csrRowPtr, + int *csrColInd, + csru2csrInfo_t info, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDcsru2csr(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + double *csrVal, + const int *csrRowPtr, + int *csrColInd, + csru2csrInfo_t info, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCcsru2csr(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex *csrVal, + const int *csrRowPtr, + int *csrColInd, + csru2csrInfo_t info, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZcsru2csr(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t 
descrA, + cuDoubleComplex *csrVal, + const int *csrRowPtr, + int *csrColInd, + csru2csrInfo_t info, + void *pBuffer); + +/* Description: Wrapper that un-sorts sparse matrix stored in CSR format + (without exposing the permutation). */ +cusparseStatus_t CUSPARSEAPI cusparseScsr2csru(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + float *csrVal, + const int *csrRowPtr, + int *csrColInd, + csru2csrInfo_t info, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDcsr2csru(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + double *csrVal, + const int *csrRowPtr, + int *csrColInd, + csru2csrInfo_t info, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseCcsr2csru(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex *csrVal, + const int *csrRowPtr, + int *csrColInd, + csru2csrInfo_t info, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseZcsr2csru(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex *csrVal, + const int *csrRowPtr, + int *csrColInd, + csru2csrInfo_t info, + void *pBuffer); + +/* Description: prune dense matrix to a sparse matrix with CSR format */ +#if defined(__cplusplus) +cusparseStatus_t CUSPARSEAPI cusparseHpruneDense2csr_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + const __half *A, + int lda, + const __half *threshold, + const cusparseMatDescr_t descrC, + const __half *csrValC, + const int *csrRowPtrC, + const int *csrColIndC, + size_t *pBufferSizeInBytes); +#endif + +cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csr_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + const float *A, + int lda, + const float *threshold, + const cusparseMatDescr_t descrC, + const float *csrValC, + const int *csrRowPtrC, + const int *csrColIndC, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csr_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + const double *A, + int lda, + const double *threshold, + const cusparseMatDescr_t descrC, + const double *csrValC, + const int *csrRowPtrC, + const int *csrColIndC, + size_t *pBufferSizeInBytes); + +#if defined(__cplusplus) +cusparseStatus_t CUSPARSEAPI cusparseHpruneDense2csrNnz( + cusparseHandle_t handle, + int m, + int n, + const __half *A, + int lda, + const __half *threshold, + const cusparseMatDescr_t descrC, + int *csrRowPtrC, + int *nnzTotalDevHostPtr, + void *pBuffer); +#endif + +cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrNnz( + cusparseHandle_t handle, + int m, + int n, + const float *A, + int lda, + const float *threshold, + const cusparseMatDescr_t descrC, + int *csrRowPtrC, + int *nnzTotalDevHostPtr, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrNnz( + cusparseHandle_t handle, + int m, + int n, + const double *A, + int lda, + const double *threshold, + const cusparseMatDescr_t descrC, + int *csrRowPtrC, + int *nnzTotalDevHostPtr, + void *pBuffer); + +#if defined(__cplusplus) +cusparseStatus_t CUSPARSEAPI cusparseHpruneDense2csr( + cusparseHandle_t handle, + int m, + int n, + const __half *A, + int lda, + const __half *threshold, + const cusparseMatDescr_t descrC, + __half *csrValC, + const int *csrRowPtrC, + int *csrColIndC, + void *pBuffer); +#endif + +cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csr( + cusparseHandle_t handle, + int m, + int n, + const float *A, + int lda, + const float *threshold, + const 
cusparseMatDescr_t descrC, + float *csrValC, + const int *csrRowPtrC, + int *csrColIndC, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csr( + cusparseHandle_t handle, + int m, + int n, + const double *A, + int lda, + const double *threshold, + const cusparseMatDescr_t descrC, + double *csrValC, + const int *csrRowPtrC, + int *csrColIndC, + void *pBuffer); + +/* Description: prune sparse matrix with CSR format to another sparse matrix with CSR format */ +#if defined(__cplusplus) +cusparseStatus_t CUSPARSEAPI cusparseHpruneCsr2csr_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const __half *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const __half *threshold, + const cusparseMatDescr_t descrC, + const __half *csrValC, + const int *csrRowPtrC, + const int *csrColIndC, + size_t *pBufferSizeInBytes); +#endif + +cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csr_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const float *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const float *threshold, + const cusparseMatDescr_t descrC, + const float *csrValC, + const int *csrRowPtrC, + const int *csrColIndC, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csr_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const double *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const double *threshold, + const cusparseMatDescr_t descrC, + const double *csrValC, + const int *csrRowPtrC, + const int *csrColIndC, + size_t *pBufferSizeInBytes); + +#if defined(__cplusplus) +cusparseStatus_t CUSPARSEAPI cusparseHpruneCsr2csrNnz( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const __half *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const __half *threshold, + const cusparseMatDescr_t descrC, + int *csrRowPtrC, + int *nnzTotalDevHostPtr, /* can be on host or device */ + void *pBuffer); +#endif + +cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrNnz( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const float *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const float *threshold, + const cusparseMatDescr_t descrC, + int *csrRowPtrC, + int *nnzTotalDevHostPtr, /* can be on host or device */ + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrNnz( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const double *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const double *threshold, + const cusparseMatDescr_t descrC, + int *csrRowPtrC, + int *nnzTotalDevHostPtr, /* can be on host or device */ + void *pBuffer); + +#if defined(__cplusplus) +cusparseStatus_t CUSPARSEAPI cusparseHpruneCsr2csr( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const __half *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const __half *threshold, + const cusparseMatDescr_t descrC, + __half *csrValC, + const int *csrRowPtrC, + int *csrColIndC, + void *pBuffer); +#endif + +cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csr( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const float *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const float *threshold, + const 
cusparseMatDescr_t descrC, + float *csrValC, + const int *csrRowPtrC, + int *csrColIndC, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csr( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const double *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + const double *threshold, + const cusparseMatDescr_t descrC, + double *csrValC, + const int *csrRowPtrC, + int *csrColIndC, + void *pBuffer); + +/* Description: prune dense matrix to a sparse matrix with CSR format by percentage */ +#if defined(__cplusplus) +cusparseStatus_t CUSPARSEAPI cusparseHpruneDense2csrByPercentage_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + const __half *A, + int lda, + float percentage, /* between 0 to 100 */ + const cusparseMatDescr_t descrC, + const __half *csrValC, + const int *csrRowPtrC, + const int *csrColIndC, + pruneInfo_t info, + size_t *pBufferSizeInBytes); +#endif + +cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrByPercentage_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + const float *A, + int lda, + float percentage, /* between 0 to 100 */ + const cusparseMatDescr_t descrC, + const float *csrValC, + const int *csrRowPtrC, + const int *csrColIndC, + pruneInfo_t info, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrByPercentage_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + const double *A, + int lda, + float percentage, /* between 0 to 100 */ + const cusparseMatDescr_t descrC, + const double *csrValC, + const int *csrRowPtrC, + const int *csrColIndC, + pruneInfo_t info, + size_t *pBufferSizeInBytes); + +#if defined(__cplusplus) +cusparseStatus_t CUSPARSEAPI cusparseHpruneDense2csrNnzByPercentage( + cusparseHandle_t handle, + int m, + int n, + const __half *A, + int lda, + float percentage, /* between 0 to 100 */ + const cusparseMatDescr_t descrC, + int *csrRowPtrC, + int *nnzTotalDevHostPtr, /* can be on host or device */ + pruneInfo_t info, + void *pBuffer); +#endif + +cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrNnzByPercentage( + cusparseHandle_t handle, + int m, + int n, + const float *A, + int lda, + float percentage, /* between 0 to 100 */ + const cusparseMatDescr_t descrC, + int *csrRowPtrC, + int *nnzTotalDevHostPtr, /* can be on host or device */ + pruneInfo_t info, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrNnzByPercentage( + cusparseHandle_t handle, + int m, + int n, + const double *A, + int lda, + float percentage, /* between 0 to 100 */ + const cusparseMatDescr_t descrC, + int *csrRowPtrC, + int *nnzTotalDevHostPtr, /* can be on host or device */ + pruneInfo_t info, + void *pBuffer); + +#if defined(__cplusplus) +cusparseStatus_t CUSPARSEAPI cusparseHpruneDense2csrByPercentage( + cusparseHandle_t handle, + int m, + int n, + const __half *A, + int lda, + float percentage, /* between 0 to 100 */ + const cusparseMatDescr_t descrC, + __half *csrValC, + const int *csrRowPtrC, + int *csrColIndC, + pruneInfo_t info, + void *pBuffer); +#endif + +cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrByPercentage( + cusparseHandle_t handle, + int m, + int n, + const float *A, + int lda, + float percentage, /* between 0 to 100 */ + const cusparseMatDescr_t descrC, + float *csrValC, + const int *csrRowPtrC, + int *csrColIndC, + pruneInfo_t info, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrByPercentage( + cusparseHandle_t handle, + int m, + int n, + const double *A, + int lda, + 
float percentage, /* between 0 to 100 */ + const cusparseMatDescr_t descrC, + double *csrValC, + const int *csrRowPtrC, + int *csrColIndC, + pruneInfo_t info, + void *pBuffer); + + +/* Description: prune sparse matrix to a sparse matrix with CSR format by percentage*/ +#if defined(__cplusplus) +cusparseStatus_t CUSPARSEAPI cusparseHpruneCsr2csrByPercentage_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const __half *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + float percentage, /* between 0 to 100 */ + const cusparseMatDescr_t descrC, + const __half *csrValC, + const int *csrRowPtrC, + const int *csrColIndC, + pruneInfo_t info, + size_t *pBufferSizeInBytes); +#endif + +cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrByPercentage_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const float *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + float percentage, /* between 0 to 100 */ + const cusparseMatDescr_t descrC, + const float *csrValC, + const int *csrRowPtrC, + const int *csrColIndC, + pruneInfo_t info, + size_t *pBufferSizeInBytes); + +cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrByPercentage_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const double *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + float percentage, /* between 0 to 100 */ + const cusparseMatDescr_t descrC, + const double *csrValC, + const int *csrRowPtrC, + const int *csrColIndC, + pruneInfo_t info, + size_t *pBufferSizeInBytes); + +#if defined(__cplusplus) +cusparseStatus_t CUSPARSEAPI cusparseHpruneCsr2csrNnzByPercentage( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const __half *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + float percentage, /* between 0 to 100 */ + const cusparseMatDescr_t descrC, + int *csrRowPtrC, + int *nnzTotalDevHostPtr, /* can be on host or device */ + pruneInfo_t info, + void *pBuffer); +#endif + +cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrNnzByPercentage( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const float *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + float percentage, /* between 0 to 100 */ + const cusparseMatDescr_t descrC, + int *csrRowPtrC, + int *nnzTotalDevHostPtr, /* can be on host or device */ + pruneInfo_t info, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrNnzByPercentage( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const double *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + float percentage, /* between 0 to 100 */ + const cusparseMatDescr_t descrC, + int *csrRowPtrC, + int *nnzTotalDevHostPtr, /* can be on host or device */ + pruneInfo_t info, + void *pBuffer); + +#if defined(__cplusplus) +cusparseStatus_t CUSPARSEAPI cusparseHpruneCsr2csrByPercentage( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const __half *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + float percentage, /* between 0 to 100 */ + const cusparseMatDescr_t descrC, + __half *csrValC, + const int *csrRowPtrC, + int *csrColIndC, + pruneInfo_t info, + void *pBuffer); +#endif + +cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrByPercentage( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + 
const cusparseMatDescr_t descrA, + const float *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + float percentage, /* between 0 to 100 */ + const cusparseMatDescr_t descrC, + float *csrValC, + const int *csrRowPtrC, + int *csrColIndC, + pruneInfo_t info, + void *pBuffer); + +cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrByPercentage( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const double *csrValA, + const int *csrRowPtrA, + const int *csrColIndA, + float percentage, /* between 0 to 100 */ + const cusparseMatDescr_t descrC, + double *csrValC, + const int *csrRowPtrC, + int *csrColIndC, + pruneInfo_t info, + void *pBuffer); + + + + +#if defined(__cplusplus) +} +#endif /* __cplusplus */ + +#endif /* !defined(CUSPARSE_H_) */ + diff --git a/include/external/CUDA/device_types.h b/include/external/CUDA/device_types.h new file mode 100755 index 000000000..1eab7bd3b --- /dev/null +++ b/include/external/CUDA/device_types.h @@ -0,0 +1,69 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__DEVICE_TYPES_H__) +#define __DEVICE_TYPES_H__ + +#include "host_defines.h" + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +enum __device_builtin__ cudaRoundMode +{ + cudaRoundNearest, + cudaRoundZero, + cudaRoundPosInf, + cudaRoundMinInf +}; + +#endif /* !__DEVICE_TYPES_H__ */ diff --git a/include/external/CUDA/driver_functions.h b/include/external/CUDA/driver_functions.h new file mode 100755 index 000000000..7ea235c1e --- /dev/null +++ b/include/external/CUDA/driver_functions.h @@ -0,0 +1,145 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__DRIVER_FUNCTIONS_H__) +#define __DRIVER_FUNCTIONS_H__ + +#include "builtin_types.h" +#include "host_defines.h" +#include "driver_types.h" + +/** + * \addtogroup CUDART_MEMORY + * + * @{ + */ + +/** + * \brief Returns a cudaPitchedPtr based on input parameters + * + * Returns a ::cudaPitchedPtr based on the specified input parameters \p d, + * \p p, \p xsz, and \p ysz. + * + * \param d - Pointer to allocated memory + * \param p - Pitch of allocated memory in bytes + * \param xsz - Logical width of allocation in elements + * \param ysz - Logical height of allocation in elements + * + * \return + * ::cudaPitchedPtr specified by \p d, \p p, \p xsz, and \p ysz + * + * \sa make_cudaExtent, make_cudaPos + */ +static __inline__ __host__ struct cudaPitchedPtr make_cudaPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz) +{ + struct cudaPitchedPtr s; + + s.ptr = d; + s.pitch = p; + s.xsize = xsz; + s.ysize = ysz; + + return s; +} + +/** + * \brief Returns a cudaPos based on input parameters + * + * Returns a ::cudaPos based on the specified input parameters \p x, + * \p y, and \p z. + * + * \param x - X position + * \param y - Y position + * \param z - Z position + * + * \return + * ::cudaPos specified by \p x, \p y, and \p z + * + * \sa make_cudaExtent, make_cudaPitchedPtr + */ +static __inline__ __host__ struct cudaPos make_cudaPos(size_t x, size_t y, size_t z) +{ + struct cudaPos p; + + p.x = x; + p.y = y; + p.z = z; + + return p; +} + +/** + * \brief Returns a cudaExtent based on input parameters + * + * Returns a ::cudaExtent based on the specified input parameters \p w, + * \p h, and \p d. + * + * \param w - Width in elements when referring to array memory, in bytes when referring to linear memory + * \param h - Height in elements + * \param d - Depth in elements + * + * \return + * ::cudaExtent specified by \p w, \p h, and \p d + * + * \sa make_cudaPitchedPtr, make_cudaPos + */ +static __inline__ __host__ struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d) +{ + struct cudaExtent e; + + e.width = w; + e.height = h; + e.depth = d; + + return e; +} + +/** @} */ /* END CUDART_MEMORY */ + +#endif /* !__DRIVER_FUNCTIONS_H__ */ diff --git a/include/external/CUDA/driver_types.h b/include/external/CUDA/driver_types.h new file mode 100755 index 000000000..fd11843b5 --- /dev/null +++ b/include/external/CUDA/driver_types.h @@ -0,0 +1,1610 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__DRIVER_TYPES_H__) +#define __DRIVER_TYPES_H__ + +#include "host_defines.h" +#include "vector_types.h" + +/** + * \defgroup CUDART_TYPES Data types used by CUDA Runtime + * \ingroup CUDART + * + * @{ + */ + +/******************************************************************************* +* * +* TYPE DEFINITIONS USED BY RUNTIME API * +* * +*******************************************************************************/ + +#if !defined(__CUDA_INTERNAL_COMPILATION__) + +#if !defined(__CUDACC_RTC__) +#include +#include +#endif /* !defined(__CUDACC_RTC__) */ + +#define cudaHostAllocDefault 0x00 /**< Default page-locked allocation flag */ +#define cudaHostAllocPortable 0x01 /**< Pinned memory accessible by all CUDA contexts */ +#define cudaHostAllocMapped 0x02 /**< Map allocation into device space */ +#define cudaHostAllocWriteCombined 0x04 /**< Write-combined memory */ + +#define cudaHostRegisterDefault 0x00 /**< Default host memory registration flag */ +#define cudaHostRegisterPortable 0x01 /**< Pinned memory accessible by all CUDA contexts */ +#define cudaHostRegisterMapped 0x02 /**< Map registered memory into device space */ +#define cudaHostRegisterIoMemory 0x04 /**< Memory-mapped I/O space */ + +#define cudaPeerAccessDefault 0x00 /**< Default peer addressing enable flag */ + +#define cudaStreamDefault 0x00 /**< Default stream flag */ +#define cudaStreamNonBlocking 0x01 /**< Stream does not synchronize with stream 0 (the NULL stream) */ + + /** + * Legacy stream handle + * + * Stream handle that can be passed as a cudaStream_t to use an implicit stream + * with legacy synchronization behavior. + * + * See details of the \link_sync_behavior + */ +#define cudaStreamLegacy ((cudaStream_t)0x1) + +/** + * Per-thread stream handle + * + * Stream handle that can be passed as a cudaStream_t to use an implicit stream + * with per-thread synchronization behavior. 
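+ * As a brief, hedged illustration (the kernel `myKernel`, its launch
+ * configuration `grid`/`block`, and `data` are placeholders, not part of
+ * this header):
+ * \code
+ * // Work is issued to the calling thread's implicit stream rather than
+ * // the globally synchronizing NULL stream.
+ * myKernel<<<grid, block, 0, cudaStreamPerThread>>>(data);
+ * cudaStreamSynchronize(cudaStreamPerThread);
+ * \endcode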
+ * + * See details of the \link_sync_behavior + */ +#define cudaStreamPerThread ((cudaStream_t)0x2) + +#define cudaEventDefault 0x00 /**< Default event flag */ +#define cudaEventBlockingSync 0x01 /**< Event uses blocking synchronization */ +#define cudaEventDisableTiming 0x02 /**< Event will not record timing data */ +#define cudaEventInterprocess 0x04 /**< Event is suitable for interprocess use. cudaEventDisableTiming must be set */ + +#define cudaDeviceScheduleAuto 0x00 /**< Device flag - Automatic scheduling */ +#define cudaDeviceScheduleSpin 0x01 /**< Device flag - Spin default scheduling */ +#define cudaDeviceScheduleYield 0x02 /**< Device flag - Yield default scheduling */ +#define cudaDeviceScheduleBlockingSync 0x04 /**< Device flag - Use blocking synchronization */ +#define cudaDeviceBlockingSync 0x04 /**< Device flag - Use blocking synchronization + * \deprecated This flag was deprecated as of CUDA 4.0 and + * replaced with ::cudaDeviceScheduleBlockingSync. */ +#define cudaDeviceScheduleMask 0x07 /**< Device schedule flags mask */ +#define cudaDeviceMapHost 0x08 /**< Device flag - Support mapped pinned allocations */ +#define cudaDeviceLmemResizeToMax 0x10 /**< Device flag - Keep local memory allocation after launch */ +#define cudaDeviceMask 0x1f /**< Device flags mask */ + +#define cudaArrayDefault 0x00 /**< Default CUDA array allocation flag */ +#define cudaArrayLayered 0x01 /**< Must be set in cudaMalloc3DArray to create a layered CUDA array */ +#define cudaArraySurfaceLoadStore 0x02 /**< Must be set in cudaMallocArray or cudaMalloc3DArray in order to bind surfaces to the CUDA array */ +#define cudaArrayCubemap 0x04 /**< Must be set in cudaMalloc3DArray to create a cubemap CUDA array */ +#define cudaArrayTextureGather 0x08 /**< Must be set in cudaMallocArray or cudaMalloc3DArray in order to perform texture gather operations on the CUDA array */ + +#define cudaIpcMemLazyEnablePeerAccess 0x01 /**< Automatically enable peer access between remote devices as needed */ + +#define cudaMemAttachGlobal 0x01 /**< Memory can be accessed by any stream on any device*/ +#define cudaMemAttachHost 0x02 /**< Memory cannot be accessed by any stream on any device */ +#define cudaMemAttachSingle 0x04 /**< Memory can only be accessed by a single stream on the associated device */ + +#define cudaOccupancyDefault 0x00 /**< Default behavior */ +#define cudaOccupancyDisableCachingOverride 0x01 /**< Assume global caching is enabled and cannot be automatically turned off */ + +#define cudaCpuDeviceId ((int)-1) /**< Device id that represents the CPU */ +#define cudaInvalidDeviceId ((int)-2) /**< Device id that represents an invalid device */ + +/** + * If set, each kernel launched as part of ::cudaLaunchCooperativeKernelMultiDevice only + * waits for prior work in the stream corresponding to that GPU to complete before the + * kernel begins execution. + */ +#define cudaCooperativeLaunchMultiDeviceNoPreSync 0x01 + +/** + * If set, any subsequent work pushed in a stream that participated in a call to + * ::cudaLaunchCooperativeKernelMultiDevice will only wait for the kernel launched on + * the GPU corresponding to that stream to complete before it begins execution. 
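+ *
+ * A minimal sketch of how the two flags combine (assumes `paramsList` has
+ * been filled with one ::cudaLaunchParams entry per participating device,
+ * and `numDevices` counts them):
+ * \code
+ * cudaLaunchCooperativeKernelMultiDevice(paramsList, numDevices,
+ *     cudaCooperativeLaunchMultiDeviceNoPreSync |
+ *     cudaCooperativeLaunchMultiDeviceNoPostSync);
+ * \endcode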
+ */ +#define cudaCooperativeLaunchMultiDeviceNoPostSync 0x02 + +#endif /* !__CUDA_INTERNAL_COMPILATION__ */ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +/** + * CUDA error types + */ +enum __device_builtin__ cudaError +{ + /** + * The API call returned with no errors. In the case of query calls, this + * can also mean that the operation being queried is complete (see + * ::cudaEventQuery() and ::cudaStreamQuery()). + */ + cudaSuccess = 0, + + /** + * The device function being invoked (usually via ::cudaLaunchKernel()) was not + * previously configured via the ::cudaConfigureCall() function. + */ + cudaErrorMissingConfiguration = 1, + + /** + * The API call failed because it was unable to allocate enough memory to + * perform the requested operation. + */ + cudaErrorMemoryAllocation = 2, + + /** + * The API call failed because the CUDA driver and runtime could not be + * initialized. + */ + cudaErrorInitializationError = 3, + + /** + * An exception occurred on the device while executing a kernel. Common + * causes include dereferencing an invalid device pointer and accessing + * out of bounds shared memory. The device cannot be used until + * ::cudaThreadExit() is called. All existing device memory allocations + * are invalid and must be reconstructed if the program is to continue + * using CUDA. + */ + cudaErrorLaunchFailure = 4, + + /** + * This indicated that a previous kernel launch failed. This was previously + * used for device emulation of kernel launches. + * \deprecated + * This error return is deprecated as of CUDA 3.1. Device emulation mode was + * removed with the CUDA 3.1 release. + */ + cudaErrorPriorLaunchFailure = 5, + + /** + * This indicates that the device kernel took too long to execute. This can + * only occur if timeouts are enabled - see the device property + * \ref ::cudaDeviceProp::kernelExecTimeoutEnabled "kernelExecTimeoutEnabled" + * for more information. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + cudaErrorLaunchTimeout = 6, + + /** + * This indicates that a launch did not occur because it did not have + * appropriate resources. Although this error is similar to + * ::cudaErrorInvalidConfiguration, this error usually indicates that the + * user has attempted to pass too many arguments to the device kernel, or the + * kernel launch specifies too many threads for the kernel's register count. + */ + cudaErrorLaunchOutOfResources = 7, + + /** + * The requested device function does not exist or is not compiled for the + * proper device architecture. + */ + cudaErrorInvalidDeviceFunction = 8, + + /** + * This indicates that a kernel launch is requesting resources that can + * never be satisfied by the current device. Requesting more shared memory + * per block than the device supports will trigger this error, as will + * requesting too many threads or blocks. See ::cudaDeviceProp for more + * device limitations. + */ + cudaErrorInvalidConfiguration = 9, + + /** + * This indicates that the device ordinal supplied by the user does not + * correspond to a valid CUDA device. + */ + cudaErrorInvalidDevice = 10, + + /** + * This indicates that one or more of the parameters passed to the API call + * is not within an acceptable range of values. 
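+ *
+ * Errors of this kind are typically caught with the usual checking pattern
+ * (a sketch; `devPtr` and `bytes` are placeholders):
+ * \code
+ * cudaError_t err = cudaMemset(devPtr, 0, bytes);
+ * if (err != cudaSuccess)
+ *     fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
+ * \endcode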
+ */ + cudaErrorInvalidValue = 11, + + /** + * This indicates that one or more of the pitch-related parameters passed + * to the API call is not within the acceptable range for pitch. + */ + cudaErrorInvalidPitchValue = 12, + + /** + * This indicates that the symbol name/identifier passed to the API call + * is not a valid name or identifier. + */ + cudaErrorInvalidSymbol = 13, + + /** + * This indicates that the buffer object could not be mapped. + */ + cudaErrorMapBufferObjectFailed = 14, + + /** + * This indicates that the buffer object could not be unmapped. + */ + cudaErrorUnmapBufferObjectFailed = 15, + + /** + * This indicates that at least one host pointer passed to the API call is + * not a valid host pointer. + */ + cudaErrorInvalidHostPointer = 16, + + /** + * This indicates that at least one device pointer passed to the API call is + * not a valid device pointer. + */ + cudaErrorInvalidDevicePointer = 17, + + /** + * This indicates that the texture passed to the API call is not a valid + * texture. + */ + cudaErrorInvalidTexture = 18, + + /** + * This indicates that the texture binding is not valid. This occurs if you + * call ::cudaGetTextureAlignmentOffset() with an unbound texture. + */ + cudaErrorInvalidTextureBinding = 19, + + /** + * This indicates that the channel descriptor passed to the API call is not + * valid. This occurs if the format is not one of the formats specified by + * ::cudaChannelFormatKind, or if one of the dimensions is invalid. + */ + cudaErrorInvalidChannelDescriptor = 20, + + /** + * This indicates that the direction of the memcpy passed to the API call is + * not one of the types specified by ::cudaMemcpyKind. + */ + cudaErrorInvalidMemcpyDirection = 21, + + /** + * This indicated that the user has taken the address of a constant variable, + * which was forbidden up until the CUDA 3.1 release. + * \deprecated + * This error return is deprecated as of CUDA 3.1. Variables in constant + * memory may now have their address taken by the runtime via + * ::cudaGetSymbolAddress(). + */ + cudaErrorAddressOfConstant = 22, + + /** + * This indicated that a texture fetch was not able to be performed. + * This was previously used for device emulation of texture operations. + * \deprecated + * This error return is deprecated as of CUDA 3.1. Device emulation mode was + * removed with the CUDA 3.1 release. + */ + cudaErrorTextureFetchFailed = 23, + + /** + * This indicated that a texture was not bound for access. + * This was previously used for device emulation of texture operations. + * \deprecated + * This error return is deprecated as of CUDA 3.1. Device emulation mode was + * removed with the CUDA 3.1 release. + */ + cudaErrorTextureNotBound = 24, + + /** + * This indicated that a synchronization operation had failed. + * This was previously used for some device emulation functions. + * \deprecated + * This error return is deprecated as of CUDA 3.1. Device emulation mode was + * removed with the CUDA 3.1 release. + */ + cudaErrorSynchronizationError = 25, + + /** + * This indicates that a non-float texture was being accessed with linear + * filtering. This is not supported by CUDA. + */ + cudaErrorInvalidFilterSetting = 26, + + /** + * This indicates that an attempt was made to read a non-float texture as a + * normalized float. This is not supported by CUDA. + */ + cudaErrorInvalidNormSetting = 27, + + /** + * Mixing of device and device emulation code was not allowed. + * \deprecated + * This error return is deprecated as of CUDA 3.1. 
Device emulation mode was + * removed with the CUDA 3.1 release. + */ + cudaErrorMixedDeviceExecution = 28, + + /** + * This indicates that a CUDA Runtime API call cannot be executed because + * it is being called during process shut down, at a point in time after + * CUDA driver has been unloaded. + */ + cudaErrorCudartUnloading = 29, + + /** + * This indicates that an unknown internal error has occurred. + */ + cudaErrorUnknown = 30, + + /** + * This indicates that the API call is not yet implemented. Production + * releases of CUDA will never return this error. + * \deprecated + * This error return is deprecated as of CUDA 4.1. + */ + cudaErrorNotYetImplemented = 31, + + /** + * This indicated that an emulated device pointer exceeded the 32-bit address + * range. + * \deprecated + * This error return is deprecated as of CUDA 3.1. Device emulation mode was + * removed with the CUDA 3.1 release. + */ + cudaErrorMemoryValueTooLarge = 32, + + /** + * This indicates that a resource handle passed to the API call was not + * valid. Resource handles are opaque types like ::cudaStream_t and + * ::cudaEvent_t. + */ + cudaErrorInvalidResourceHandle = 33, + + /** + * This indicates that asynchronous operations issued previously have not + * completed yet. This result is not actually an error, but must be indicated + * differently than ::cudaSuccess (which indicates completion). Calls that + * may return this value include ::cudaEventQuery() and ::cudaStreamQuery(). + */ + cudaErrorNotReady = 34, + + /** + * This indicates that the installed NVIDIA CUDA driver is older than the + * CUDA runtime library. This is not a supported configuration. Users should + * install an updated NVIDIA display driver to allow the application to run. + */ + cudaErrorInsufficientDriver = 35, + + /** + * This indicates that the user has called ::cudaSetValidDevices(), + * ::cudaSetDeviceFlags(), ::cudaD3D9SetDirect3DDevice(), + * ::cudaD3D10SetDirect3DDevice, ::cudaD3D11SetDirect3DDevice(), or + * ::cudaVDPAUSetVDPAUDevice() after initializing the CUDA runtime by + * calling non-device management operations (allocating memory and + * launching kernels are examples of non-device management operations). + * This error can also be returned if using runtime/driver + * interoperability and there is an existing ::CUcontext active on the + * host thread. + */ + cudaErrorSetOnActiveProcess = 36, + + /** + * This indicates that the surface passed to the API call is not a valid + * surface. + */ + cudaErrorInvalidSurface = 37, + + /** + * This indicates that no CUDA-capable devices were detected by the installed + * CUDA driver. + */ + cudaErrorNoDevice = 38, + + /** + * This indicates that an uncorrectable ECC error was detected during + * execution. + */ + cudaErrorECCUncorrectable = 39, + + /** + * This indicates that a link to a shared object failed to resolve. + */ + cudaErrorSharedObjectSymbolNotFound = 40, + + /** + * This indicates that initialization of a shared object failed. + */ + cudaErrorSharedObjectInitFailed = 41, + + /** + * This indicates that the ::cudaLimit passed to the API call is not + * supported by the active device. + */ + cudaErrorUnsupportedLimit = 42, + + /** + * This indicates that multiple global or constant variables (across separate + * CUDA source files in the application) share the same string name. + */ + cudaErrorDuplicateVariableName = 43, + + /** + * This indicates that multiple textures (across separate CUDA source + * files in the application) share the same string name. 
+ */
+    cudaErrorDuplicateTextureName = 44,
+
+    /**
+     * This indicates that multiple surfaces (across separate CUDA source
+     * files in the application) share the same string name.
+     */
+    cudaErrorDuplicateSurfaceName = 45,
+
+    /**
+     * This indicates that all CUDA devices are busy or unavailable at the current
+     * time. Devices are often busy/unavailable due to use of
+     * ::cudaComputeModeExclusive, ::cudaComputeModeProhibited or when long
+     * running CUDA kernels have filled up the GPU and are blocking new work
+     * from starting. They can also be unavailable due to memory constraints
+     * on a device that already has active CUDA work being performed.
+     */
+    cudaErrorDevicesUnavailable = 46,
+
+    /**
+     * This indicates that the device kernel image is invalid.
+     */
+    cudaErrorInvalidKernelImage = 47,
+
+    /**
+     * This indicates that there is no kernel image available that is suitable
+     * for the device. This can occur when a user specifies code generation
+     * options for a particular CUDA source file that do not include the
+     * corresponding device configuration.
+     */
+    cudaErrorNoKernelImageForDevice = 48,
+
+    /**
+     * This indicates that the current context is not compatible with
+     * the CUDA Runtime. This can only occur if you are using CUDA
+     * Runtime/Driver interoperability and have created an existing Driver
+     * context using the driver API. The Driver context may be incompatible
+     * either because the Driver context was created using an older version
+     * of the API, because the Runtime API call expects a primary driver
+     * context and the Driver context is not primary, or because the Driver
+     * context has been destroyed. Please see \ref CUDART_DRIVER "Interactions
+     * with the CUDA Driver API" for more information.
+     */
+    cudaErrorIncompatibleDriverContext = 49,
+
+    /**
+     * This error indicates that a call to ::cudaDeviceEnablePeerAccess() is
+     * trying to re-enable peer addressing from a context which has already
+     * had peer addressing enabled.
+     */
+    cudaErrorPeerAccessAlreadyEnabled = 50,
+
+    /**
+     * This error indicates that ::cudaDeviceDisablePeerAccess() is trying to
+     * disable peer addressing which has not been enabled yet via
+     * ::cudaDeviceEnablePeerAccess().
+     */
+    cudaErrorPeerAccessNotEnabled = 51,
+
+    /**
+     * This indicates that a call tried to access an exclusive-thread device that
+     * is already in use by a different thread.
+     */
+    cudaErrorDeviceAlreadyInUse = 54,
+
+    /**
+     * This indicates that the profiler is not initialized for this run. This can
+     * happen when the application is running with external profiling tools
+     * like the visual profiler.
+     */
+    cudaErrorProfilerDisabled = 55,
+
+    /**
+     * \deprecated
+     * This error return is deprecated as of CUDA 5.0. It is no longer an error
+     * to attempt to enable/disable the profiling via ::cudaProfilerStart or
+     * ::cudaProfilerStop without initialization.
+     */
+    cudaErrorProfilerNotInitialized = 56,
+
+    /**
+     * \deprecated
+     * This error return is deprecated as of CUDA 5.0. It is no longer an error
+     * to call cudaProfilerStart() when profiling is already enabled.
+     */
+    cudaErrorProfilerAlreadyStarted = 57,
+
+    /**
+     * \deprecated
+     * This error return is deprecated as of CUDA 5.0. It is no longer an error
+     * to call cudaProfilerStop() when profiling is already disabled.
+     */
+    cudaErrorProfilerAlreadyStopped = 58,
+
+    /**
+     * An assert triggered in device code during kernel execution. The device
+     * cannot be used again until ::cudaThreadExit() is called. All existing
+     * allocations are invalid and must be reconstructed if the program is to
+     * continue using CUDA.
+     */
+    cudaErrorAssert = 59,
+
+    /**
+     * This error indicates that the hardware resources required to enable
+     * peer access have been exhausted for one or more of the devices
+     * passed to ::cudaEnablePeerAccess().
+     */
+    cudaErrorTooManyPeers = 60,
+
+    /**
+     * This error indicates that the memory range passed to ::cudaHostRegister()
+     * has already been registered.
+     */
+    cudaErrorHostMemoryAlreadyRegistered = 61,
+
+    /**
+     * This error indicates that the pointer passed to ::cudaHostUnregister()
+     * does not correspond to any currently registered memory region.
+     */
+    cudaErrorHostMemoryNotRegistered = 62,
+
+    /**
+     * This error indicates that an OS call failed.
+     */
+    cudaErrorOperatingSystem = 63,
+
+    /**
+     * This error indicates that P2P access is not supported across the given
+     * devices.
+     */
+    cudaErrorPeerAccessUnsupported = 64,
+
+    /**
+     * This error indicates that a device runtime grid launch did not occur
+     * because the depth of the child grid would exceed the maximum supported
+     * number of nested grid launches.
+     */
+    cudaErrorLaunchMaxDepthExceeded = 65,
+
+    /**
+     * This error indicates that a grid launch did not occur because the kernel
+     * uses file-scoped textures which are unsupported by the device runtime.
+     * Kernels launched via the device runtime only support textures created with
+     * the Texture Object APIs.
+     */
+    cudaErrorLaunchFileScopedTex = 66,
+
+    /**
+     * This error indicates that a grid launch did not occur because the kernel
+     * uses file-scoped surfaces which are unsupported by the device runtime.
+     * Kernels launched via the device runtime only support surfaces created with
+     * the Surface Object APIs.
+     */
+    cudaErrorLaunchFileScopedSurf = 67,
+
+    /**
+     * This error indicates that a call to ::cudaDeviceSynchronize made from
+     * the device runtime failed because the call was made at grid depth greater
+     * than either the default (2 levels of grids) or user-specified device
+     * limit ::cudaLimitDevRuntimeSyncDepth. To be able to synchronize on
+     * launched grids at a greater depth successfully, the maximum nested
+     * depth at which ::cudaDeviceSynchronize will be called must be specified
+     * with the ::cudaLimitDevRuntimeSyncDepth limit to the ::cudaDeviceSetLimit
+     * API before the host-side launch of a kernel using the device runtime.
+     * Keep in mind that additional levels of sync depth require the runtime
+     * to reserve large amounts of device memory that cannot be used for
+     * user allocations.
+     */
+    cudaErrorSyncDepthExceeded = 68,
+
+    /**
+     * This error indicates that a device runtime grid launch failed because
+     * the launch would exceed the limit ::cudaLimitDevRuntimePendingLaunchCount.
+     * For this launch to proceed successfully, ::cudaDeviceSetLimit must be
+     * called to set the ::cudaLimitDevRuntimePendingLaunchCount to be higher
+     * than the upper bound of outstanding launches that can be issued to the
+     * device runtime. Keep in mind that raising the limit of pending device
+     * runtime launches will require the runtime to reserve device memory that
+     * cannot be used for user allocations.
+     */
+    cudaErrorLaunchPendingCountExceeded = 69,
+
+    /**
+     * This error indicates the attempted operation is not permitted.
+     */
+    cudaErrorNotPermitted = 70,
+
+    /**
+     * This error indicates the attempted operation is not supported
+     * on the current system or device.
+     */
+    cudaErrorNotSupported = 71,
+
+    /**
+     * Device encountered an error in the call stack during kernel execution,
+     * possibly due to stack corruption or exceeding the stack size limit.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorHardwareStackError = 72,
+
+    /**
+     * The device encountered an illegal instruction during kernel execution.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorIllegalInstruction = 73,
+
+    /**
+     * The device encountered a load or store instruction
+     * on a memory address which is not aligned.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorMisalignedAddress = 74,
+
+    /**
+     * While executing a kernel, the device encountered an instruction
+     * which can only operate on memory locations in certain address spaces
+     * (global, shared, or local), but was supplied a memory address not
+     * belonging to an allowed address space.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorInvalidAddressSpace = 75,
+
+    /**
+     * The device encountered an invalid program counter.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorInvalidPc = 76,
+
+    /**
+     * The device encountered a load or store instruction on an invalid memory address.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorIllegalAddress = 77,
+
+    /**
+     * A PTX compilation failed. The runtime may fall back to compiling PTX if
+     * an application does not contain a suitable binary for the current device.
+     */
+    cudaErrorInvalidPtx = 78,
+
+    /**
+     * This indicates an error with the OpenGL or DirectX context.
+     */
+    cudaErrorInvalidGraphicsContext = 79,
+
+    /**
+     * This indicates that an uncorrectable NVLink error was detected during the
+     * execution.
+     */
+    cudaErrorNvlinkUncorrectable = 80,
+
+    /**
+     * This indicates that the PTX JIT compiler library was not found. The JIT Compiler
+     * library is used for PTX compilation. The runtime may fall back to compiling PTX
+     * if an application does not contain a suitable binary for the current device.
+     */
+    cudaErrorJitCompilerNotFound = 81,
+
+    /**
+     * This error indicates that the number of blocks launched per grid for a kernel that was
+     * launched via either ::cudaLaunchCooperativeKernel or ::cudaLaunchCooperativeKernelMultiDevice
+     * exceeds the maximum number of blocks as allowed by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+     * or ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors
+     * as specified by the device attribute ::cudaDevAttrMultiProcessorCount.
+     */
+    cudaErrorCooperativeLaunchTooLarge = 82,
+
+    /**
+     * This indicates an internal startup failure in the CUDA runtime.
+ */ + cudaErrorStartupFailure = 0x7f, + + /** + * Any unhandled CUDA driver error is added to this value and returned via + * the runtime. Production releases of CUDA should not return such errors. + * \deprecated + * This error return is deprecated as of CUDA 4.1. + */ + cudaErrorApiFailureBase = 10000 +}; + +/** + * Channel format kind + */ +enum __device_builtin__ cudaChannelFormatKind +{ + cudaChannelFormatKindSigned = 0, /**< Signed channel format */ + cudaChannelFormatKindUnsigned = 1, /**< Unsigned channel format */ + cudaChannelFormatKindFloat = 2, /**< Float channel format */ + cudaChannelFormatKindNone = 3 /**< No channel format */ +}; + +/** + * CUDA Channel format descriptor + */ +struct __device_builtin__ cudaChannelFormatDesc +{ + int x; /**< x */ + int y; /**< y */ + int z; /**< z */ + int w; /**< w */ + enum cudaChannelFormatKind f; /**< Channel format kind */ +}; + +/** + * CUDA array + */ +typedef struct cudaArray *cudaArray_t; + +/** + * CUDA array (as source copy argument) + */ +typedef const struct cudaArray *cudaArray_const_t; + +struct cudaArray; + +/** + * CUDA mipmapped array + */ +typedef struct cudaMipmappedArray *cudaMipmappedArray_t; + +/** + * CUDA mipmapped array (as source argument) + */ +typedef const struct cudaMipmappedArray *cudaMipmappedArray_const_t; + +struct cudaMipmappedArray; + +/** + * CUDA memory types + */ +enum __device_builtin__ cudaMemoryType +{ + cudaMemoryTypeHost = 1, /**< Host memory */ + cudaMemoryTypeDevice = 2 /**< Device memory */ +}; + +/** + * CUDA memory copy types + */ +enum __device_builtin__ cudaMemcpyKind +{ + cudaMemcpyHostToHost = 0, /**< Host -> Host */ + cudaMemcpyHostToDevice = 1, /**< Host -> Device */ + cudaMemcpyDeviceToHost = 2, /**< Device -> Host */ + cudaMemcpyDeviceToDevice = 3, /**< Device -> Device */ + cudaMemcpyDefault = 4 /**< Direction of the transfer is inferred from the pointer values. 
Requires unified virtual addressing */ +}; + +/** + * CUDA Pitched memory pointer + * + * \sa ::make_cudaPitchedPtr + */ +struct __device_builtin__ cudaPitchedPtr +{ + void *ptr; /**< Pointer to allocated memory */ + size_t pitch; /**< Pitch of allocated memory in bytes */ + size_t xsize; /**< Logical width of allocation in elements */ + size_t ysize; /**< Logical height of allocation in elements */ +}; + +/** + * CUDA extent + * + * \sa ::make_cudaExtent + */ +struct __device_builtin__ cudaExtent +{ + size_t width; /**< Width in elements when referring to array memory, in bytes when referring to linear memory */ + size_t height; /**< Height in elements */ + size_t depth; /**< Depth in elements */ +}; + +/** + * CUDA 3D position + * + * \sa ::make_cudaPos + */ +struct __device_builtin__ cudaPos +{ + size_t x; /**< x */ + size_t y; /**< y */ + size_t z; /**< z */ +}; + +/** + * CUDA 3D memory copying parameters + */ +struct __device_builtin__ cudaMemcpy3DParms +{ + cudaArray_t srcArray; /**< Source memory address */ + struct cudaPos srcPos; /**< Source position offset */ + struct cudaPitchedPtr srcPtr; /**< Pitched source memory address */ + + cudaArray_t dstArray; /**< Destination memory address */ + struct cudaPos dstPos; /**< Destination position offset */ + struct cudaPitchedPtr dstPtr; /**< Pitched destination memory address */ + + struct cudaExtent extent; /**< Requested memory copy size */ + enum cudaMemcpyKind kind; /**< Type of transfer */ +}; + +/** + * CUDA 3D cross-device memory copying parameters + */ +struct __device_builtin__ cudaMemcpy3DPeerParms +{ + cudaArray_t srcArray; /**< Source memory address */ + struct cudaPos srcPos; /**< Source position offset */ + struct cudaPitchedPtr srcPtr; /**< Pitched source memory address */ + int srcDevice; /**< Source device */ + + cudaArray_t dstArray; /**< Destination memory address */ + struct cudaPos dstPos; /**< Destination position offset */ + struct cudaPitchedPtr dstPtr; /**< Pitched destination memory address */ + int dstDevice; /**< Destination device */ + + struct cudaExtent extent; /**< Requested memory copy size */ +}; + +/** + * CUDA graphics interop resource + */ +struct cudaGraphicsResource; + +/** + * CUDA graphics interop register flags + */ +enum __device_builtin__ cudaGraphicsRegisterFlags +{ + cudaGraphicsRegisterFlagsNone = 0, /**< Default */ + cudaGraphicsRegisterFlagsReadOnly = 1, /**< CUDA will not write to this resource */ + cudaGraphicsRegisterFlagsWriteDiscard = 2, /**< CUDA will only write to and will not read from this resource */ + cudaGraphicsRegisterFlagsSurfaceLoadStore = 4, /**< CUDA will bind this resource to a surface reference */ + cudaGraphicsRegisterFlagsTextureGather = 8 /**< CUDA will perform texture gather operations on this resource */ +}; + +/** + * CUDA graphics interop map flags + */ +enum __device_builtin__ cudaGraphicsMapFlags +{ + cudaGraphicsMapFlagsNone = 0, /**< Default; Assume resource can be read/written */ + cudaGraphicsMapFlagsReadOnly = 1, /**< CUDA will not write to this resource */ + cudaGraphicsMapFlagsWriteDiscard = 2 /**< CUDA will only write to and will not read from this resource */ +}; + +/** + * CUDA graphics interop array indices for cube maps + */ +enum __device_builtin__ cudaGraphicsCubeFace +{ + cudaGraphicsCubeFacePositiveX = 0x00, /**< Positive X face of cubemap */ + cudaGraphicsCubeFaceNegativeX = 0x01, /**< Negative X face of cubemap */ + cudaGraphicsCubeFacePositiveY = 0x02, /**< Positive Y face of cubemap */ + cudaGraphicsCubeFaceNegativeY = 0x03, /**< Negative Y 
face of cubemap */ + cudaGraphicsCubeFacePositiveZ = 0x04, /**< Positive Z face of cubemap */ + cudaGraphicsCubeFaceNegativeZ = 0x05 /**< Negative Z face of cubemap */ +}; + +/** + * CUDA resource types + */ +enum __device_builtin__ cudaResourceType +{ + cudaResourceTypeArray = 0x00, /**< Array resource */ + cudaResourceTypeMipmappedArray = 0x01, /**< Mipmapped array resource */ + cudaResourceTypeLinear = 0x02, /**< Linear resource */ + cudaResourceTypePitch2D = 0x03 /**< Pitch 2D resource */ +}; + +/** + * CUDA texture resource view formats + */ +enum __device_builtin__ cudaResourceViewFormat +{ + cudaResViewFormatNone = 0x00, /**< No resource view format (use underlying resource format) */ + cudaResViewFormatUnsignedChar1 = 0x01, /**< 1 channel unsigned 8-bit integers */ + cudaResViewFormatUnsignedChar2 = 0x02, /**< 2 channel unsigned 8-bit integers */ + cudaResViewFormatUnsignedChar4 = 0x03, /**< 4 channel unsigned 8-bit integers */ + cudaResViewFormatSignedChar1 = 0x04, /**< 1 channel signed 8-bit integers */ + cudaResViewFormatSignedChar2 = 0x05, /**< 2 channel signed 8-bit integers */ + cudaResViewFormatSignedChar4 = 0x06, /**< 4 channel signed 8-bit integers */ + cudaResViewFormatUnsignedShort1 = 0x07, /**< 1 channel unsigned 16-bit integers */ + cudaResViewFormatUnsignedShort2 = 0x08, /**< 2 channel unsigned 16-bit integers */ + cudaResViewFormatUnsignedShort4 = 0x09, /**< 4 channel unsigned 16-bit integers */ + cudaResViewFormatSignedShort1 = 0x0a, /**< 1 channel signed 16-bit integers */ + cudaResViewFormatSignedShort2 = 0x0b, /**< 2 channel signed 16-bit integers */ + cudaResViewFormatSignedShort4 = 0x0c, /**< 4 channel signed 16-bit integers */ + cudaResViewFormatUnsignedInt1 = 0x0d, /**< 1 channel unsigned 32-bit integers */ + cudaResViewFormatUnsignedInt2 = 0x0e, /**< 2 channel unsigned 32-bit integers */ + cudaResViewFormatUnsignedInt4 = 0x0f, /**< 4 channel unsigned 32-bit integers */ + cudaResViewFormatSignedInt1 = 0x10, /**< 1 channel signed 32-bit integers */ + cudaResViewFormatSignedInt2 = 0x11, /**< 2 channel signed 32-bit integers */ + cudaResViewFormatSignedInt4 = 0x12, /**< 4 channel signed 32-bit integers */ + cudaResViewFormatHalf1 = 0x13, /**< 1 channel 16-bit floating point */ + cudaResViewFormatHalf2 = 0x14, /**< 2 channel 16-bit floating point */ + cudaResViewFormatHalf4 = 0x15, /**< 4 channel 16-bit floating point */ + cudaResViewFormatFloat1 = 0x16, /**< 1 channel 32-bit floating point */ + cudaResViewFormatFloat2 = 0x17, /**< 2 channel 32-bit floating point */ + cudaResViewFormatFloat4 = 0x18, /**< 4 channel 32-bit floating point */ + cudaResViewFormatUnsignedBlockCompressed1 = 0x19, /**< Block compressed 1 */ + cudaResViewFormatUnsignedBlockCompressed2 = 0x1a, /**< Block compressed 2 */ + cudaResViewFormatUnsignedBlockCompressed3 = 0x1b, /**< Block compressed 3 */ + cudaResViewFormatUnsignedBlockCompressed4 = 0x1c, /**< Block compressed 4 unsigned */ + cudaResViewFormatSignedBlockCompressed4 = 0x1d, /**< Block compressed 4 signed */ + cudaResViewFormatUnsignedBlockCompressed5 = 0x1e, /**< Block compressed 5 unsigned */ + cudaResViewFormatSignedBlockCompressed5 = 0x1f, /**< Block compressed 5 signed */ + cudaResViewFormatUnsignedBlockCompressed6H = 0x20, /**< Block compressed 6 unsigned half-float */ + cudaResViewFormatSignedBlockCompressed6H = 0x21, /**< Block compressed 6 signed half-float */ + cudaResViewFormatUnsignedBlockCompressed7 = 0x22 /**< Block compressed 7 */ +}; + +/** + * CUDA resource descriptor + */ +struct __device_builtin__ 
cudaResourceDesc { + enum cudaResourceType resType; /**< Resource type */ + + union { + struct { + cudaArray_t array; /**< CUDA array */ + } array; + struct { + cudaMipmappedArray_t mipmap; /**< CUDA mipmapped array */ + } mipmap; + struct { + void *devPtr; /**< Device pointer */ + struct cudaChannelFormatDesc desc; /**< Channel descriptor */ + size_t sizeInBytes; /**< Size in bytes */ + } linear; + struct { + void *devPtr; /**< Device pointer */ + struct cudaChannelFormatDesc desc; /**< Channel descriptor */ + size_t width; /**< Width of the array in elements */ + size_t height; /**< Height of the array in elements */ + size_t pitchInBytes; /**< Pitch between two rows in bytes */ + } pitch2D; + } res; +}; + +/** + * CUDA resource view descriptor + */ +struct __device_builtin__ cudaResourceViewDesc +{ + enum cudaResourceViewFormat format; /**< Resource view format */ + size_t width; /**< Width of the resource view */ + size_t height; /**< Height of the resource view */ + size_t depth; /**< Depth of the resource view */ + unsigned int firstMipmapLevel; /**< First defined mipmap level */ + unsigned int lastMipmapLevel; /**< Last defined mipmap level */ + unsigned int firstLayer; /**< First layer index */ + unsigned int lastLayer; /**< Last layer index */ +}; + +/** + * CUDA pointer attributes + */ +struct __device_builtin__ cudaPointerAttributes +{ + /** + * The physical location of the memory, ::cudaMemoryTypeHost or + * ::cudaMemoryTypeDevice. + */ + enum cudaMemoryType memoryType; + + /** + * The device against which the memory was allocated or registered. + * If the memory type is ::cudaMemoryTypeDevice then this identifies + * the device on which the memory referred physically resides. If + * the memory type is ::cudaMemoryTypeHost then this identifies the + * device which was current when the memory was allocated or registered + * (and if that device is deinitialized then this allocation will vanish + * with that device's state). + */ + int device; + + /** + * The address which may be dereferenced on the current device to access + * the memory or NULL if no such address exists. + */ + void *devicePointer; + + /** + * The address which may be dereferenced on the host to access the + * memory or NULL if no such address exists. + */ + void *hostPointer; + + /** + * Indicates if this pointer points to managed memory + */ + int isManaged; +}; + +/** + * CUDA function attributes + */ +struct __device_builtin__ cudaFuncAttributes +{ + /** + * The size in bytes of statically-allocated shared memory per block + * required by this function. This does not include dynamically-allocated + * shared memory requested by the user at runtime. + */ + size_t sharedSizeBytes; + + /** + * The size in bytes of user-allocated constant memory required by this + * function. + */ + size_t constSizeBytes; + + /** + * The size in bytes of local memory used by each thread of this function. + */ + size_t localSizeBytes; + + /** + * The maximum number of threads per block, beyond which a launch of the + * function would fail. This number depends on both the function and the + * device on which the function is currently loaded. + */ + int maxThreadsPerBlock; + + /** + * The number of registers used by each thread of this function. + */ + int numRegs; + + /** + * The PTX virtual architecture version for which the function was + * compiled. This value is the major PTX version * 10 + the minor PTX + * version, so a PTX version 1.3 function would return the value 13. 
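+ *
+ * For example, the encoded value can be queried and split back into its
+ * components (a sketch; `myKernel` is a placeholder):
+ * \code
+ * struct cudaFuncAttributes attr;
+ * cudaFuncGetAttributes(&attr, (const void*)myKernel);
+ * int ptxMajor = attr.ptxVersion / 10; // 13 -> 1
+ * int ptxMinor = attr.ptxVersion % 10; // 13 -> 3
+ * \endcode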
+ */
+    int ptxVersion;
+
+    /**
+     * The binary architecture version for which the function was compiled.
+     * This value is the major binary version * 10 + the minor binary version,
+     * so a binary version 1.3 function would return the value 13.
+     */
+    int binaryVersion;
+
+    /**
+     * The attribute to indicate whether the function has been compiled with
+     * the user-specified option "-Xptxas --dlcm=ca" set.
+     */
+    int cacheModeCA;
+
+    /**
+     * The maximum size in bytes of dynamic shared memory per block for
+     * this function. Any launch must have a dynamic shared memory size
+     * smaller than this value.
+     */
+    int maxDynamicSharedSizeBytes;
+
+    /**
+     * On devices where the L1 cache and shared memory use the same hardware resources,
+     * this sets the shared memory carveout preference, in percent of the maximum shared memory.
+     * This is only a hint, and the driver can choose a different ratio if required to execute the function.
+     */
+    int preferredShmemCarveout;
+};
+
+/**
+ * CUDA function attributes that can be set using cudaFuncSetAttribute
+ */
+enum __device_builtin__ cudaFuncAttribute
+{
+    cudaFuncAttributeMaxDynamicSharedMemorySize = 8, /**< Maximum dynamic shared memory size */
+    cudaFuncAttributePreferredSharedMemoryCarveout = 9, /**< Preferred shared memory-L1 cache split ratio */
+    cudaFuncAttributeMax
+};
+
+/**
+ * CUDA function cache configurations
+ */
+enum __device_builtin__ cudaFuncCache
+{
+    cudaFuncCachePreferNone = 0, /**< Default function cache configuration, no preference */
+    cudaFuncCachePreferShared = 1, /**< Prefer larger shared memory and smaller L1 cache */
+    cudaFuncCachePreferL1 = 2, /**< Prefer larger L1 cache and smaller shared memory */
+    cudaFuncCachePreferEqual = 3 /**< Prefer equal size L1 cache and shared memory */
+};
+
+/**
+ * CUDA shared memory configuration
+ */
+
+enum __device_builtin__ cudaSharedMemConfig
+{
+    cudaSharedMemBankSizeDefault = 0,
+    cudaSharedMemBankSizeFourByte = 1,
+    cudaSharedMemBankSizeEightByte = 2
+};
+
+/**
+ * Shared memory carveout configurations
+ */
+enum __device_builtin__ cudaSharedCarveout {
+    cudaSharedmemCarveoutDefault = -1, /**< no preference for shared memory or L1 (default) */
+    cudaSharedmemCarveoutMaxShared = 100, /**< prefer maximum available shared memory, minimum L1 cache */
+    cudaSharedmemCarveoutMaxL1 = 0 /**< prefer maximum available L1 cache, minimum shared memory */
+};
+
+/**
+ * CUDA device compute modes
+ */
+enum __device_builtin__ cudaComputeMode
+{
+    cudaComputeModeDefault = 0, /**< Default compute mode (Multiple threads can use ::cudaSetDevice() with this device) */
+    cudaComputeModeExclusive = 1, /**< Compute-exclusive-thread mode (Only one thread in one process will be able to use ::cudaSetDevice() with this device) */
+    cudaComputeModeProhibited = 2, /**< Compute-prohibited mode (No threads can use ::cudaSetDevice() with this device) */
+    cudaComputeModeExclusiveProcess = 3 /**< Compute-exclusive-process mode (Many threads in one process will be able to use ::cudaSetDevice() with this device) */
+};
+
+/**
+ * CUDA Limits
+ */
+enum __device_builtin__ cudaLimit
+{
+    cudaLimitStackSize = 0x00, /**< GPU thread stack size */
+    cudaLimitPrintfFifoSize = 0x01, /**< GPU printf/fprintf FIFO size */
+    cudaLimitMallocHeapSize = 0x02, /**< GPU malloc heap size */
+    cudaLimitDevRuntimeSyncDepth = 0x03, /**< GPU device runtime synchronize depth */
+    cudaLimitDevRuntimePendingLaunchCount = 0x04 /**< GPU device runtime pending launch count */
+};
+
+/**
+ * CUDA Memory Advise values
+ */
+enum __device_builtin__ cudaMemoryAdvise
+{
+    cudaMemAdviseSetReadMostly = 1, /**< Data will mostly be read and only occasionally be written to */
+    cudaMemAdviseUnsetReadMostly = 2, /**< Undo the effect of ::cudaMemAdviseSetReadMostly */
+    cudaMemAdviseSetPreferredLocation = 3, /**< Set the preferred location for the data as the specified device */
+    cudaMemAdviseUnsetPreferredLocation = 4, /**< Clear the preferred location for the data */
+    cudaMemAdviseSetAccessedBy = 5, /**< Data will be accessed by the specified device, so prevent page faults as much as possible */
+    cudaMemAdviseUnsetAccessedBy = 6 /**< Let the Unified Memory subsystem decide on the page faulting policy for the specified device */
+};
+
+/**
+ * CUDA range attributes
+ */
+enum __device_builtin__ cudaMemRangeAttribute
+{
+    cudaMemRangeAttributeReadMostly = 1, /**< Whether the range will mostly be read and only occasionally be written to */
+    cudaMemRangeAttributePreferredLocation = 2, /**< The preferred location of the range */
+    cudaMemRangeAttributeAccessedBy = 3, /**< Memory range has ::cudaMemAdviseSetAccessedBy set for specified device */
+    cudaMemRangeAttributeLastPrefetchLocation = 4 /**< The last location to which the range was prefetched */
+};
+
+/**
+ * CUDA Profiler Output modes
+ */
+enum __device_builtin__ cudaOutputMode
+{
+    cudaKeyValuePair = 0x00, /**< Output mode Key-Value pair format. */
+    cudaCSV = 0x01 /**< Output mode Comma separated values format. */
+};
+
+/**
+ * CUDA device attributes
+ */
+enum __device_builtin__ cudaDeviceAttr
+{
+    cudaDevAttrMaxThreadsPerBlock = 1, /**< Maximum number of threads per block */
+    cudaDevAttrMaxBlockDimX = 2, /**< Maximum block dimension X */
+    cudaDevAttrMaxBlockDimY = 3, /**< Maximum block dimension Y */
+    cudaDevAttrMaxBlockDimZ = 4, /**< Maximum block dimension Z */
+    cudaDevAttrMaxGridDimX = 5, /**< Maximum grid dimension X */
+    cudaDevAttrMaxGridDimY = 6, /**< Maximum grid dimension Y */
+    cudaDevAttrMaxGridDimZ = 7, /**< Maximum grid dimension Z */
+    cudaDevAttrMaxSharedMemoryPerBlock = 8, /**< Maximum shared memory available per block in bytes */
+    cudaDevAttrTotalConstantMemory = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
+    cudaDevAttrWarpSize = 10, /**< Warp size in threads */
+    cudaDevAttrMaxPitch = 11, /**< Maximum pitch in bytes allowed by memory copies */
+    cudaDevAttrMaxRegistersPerBlock = 12, /**< Maximum number of 32-bit registers available per block */
+    cudaDevAttrClockRate = 13, /**< Peak clock frequency in kilohertz */
+    cudaDevAttrTextureAlignment = 14, /**< Alignment requirement for textures */
+    cudaDevAttrGpuOverlap = 15, /**< Device can possibly copy memory and execute a kernel concurrently */
+    cudaDevAttrMultiProcessorCount = 16, /**< Number of multiprocessors on device */
+    cudaDevAttrKernelExecTimeout = 17, /**< Specifies whether there is a run time limit on kernels */
+    cudaDevAttrIntegrated = 18, /**< Device is integrated with host memory */
+    cudaDevAttrCanMapHostMemory = 19, /**< Device can map host memory into CUDA address space */
+    cudaDevAttrComputeMode = 20, /**< Compute mode (See ::cudaComputeMode for details) */
+    cudaDevAttrMaxTexture1DWidth = 21, /**< Maximum 1D texture width */
+    cudaDevAttrMaxTexture2DWidth = 22, /**< Maximum 2D texture width */
+    cudaDevAttrMaxTexture2DHeight = 23, /**< Maximum 2D texture height */
+    cudaDevAttrMaxTexture3DWidth = 24, /**< Maximum 3D texture width */
+    cudaDevAttrMaxTexture3DHeight = 25, /**< Maximum 3D texture height */
+
cudaDevAttrMaxTexture3DDepth = 26, /**< Maximum 3D texture depth */ + cudaDevAttrMaxTexture2DLayeredWidth = 27, /**< Maximum 2D layered texture width */ + cudaDevAttrMaxTexture2DLayeredHeight = 28, /**< Maximum 2D layered texture height */ + cudaDevAttrMaxTexture2DLayeredLayers = 29, /**< Maximum layers in a 2D layered texture */ + cudaDevAttrSurfaceAlignment = 30, /**< Alignment requirement for surfaces */ + cudaDevAttrConcurrentKernels = 31, /**< Device can possibly execute multiple kernels concurrently */ + cudaDevAttrEccEnabled = 32, /**< Device has ECC support enabled */ + cudaDevAttrPciBusId = 33, /**< PCI bus ID of the device */ + cudaDevAttrPciDeviceId = 34, /**< PCI device ID of the device */ + cudaDevAttrTccDriver = 35, /**< Device is using TCC driver model */ + cudaDevAttrMemoryClockRate = 36, /**< Peak memory clock frequency in kilohertz */ + cudaDevAttrGlobalMemoryBusWidth = 37, /**< Global memory bus width in bits */ + cudaDevAttrL2CacheSize = 38, /**< Size of L2 cache in bytes */ + cudaDevAttrMaxThreadsPerMultiProcessor = 39, /**< Maximum resident threads per multiprocessor */ + cudaDevAttrAsyncEngineCount = 40, /**< Number of asynchronous engines */ + cudaDevAttrUnifiedAddressing = 41, /**< Device shares a unified address space with the host */ + cudaDevAttrMaxTexture1DLayeredWidth = 42, /**< Maximum 1D layered texture width */ + cudaDevAttrMaxTexture1DLayeredLayers = 43, /**< Maximum layers in a 1D layered texture */ + cudaDevAttrMaxTexture2DGatherWidth = 45, /**< Maximum 2D texture width if cudaArrayTextureGather is set */ + cudaDevAttrMaxTexture2DGatherHeight = 46, /**< Maximum 2D texture height if cudaArrayTextureGather is set */ + cudaDevAttrMaxTexture3DWidthAlt = 47, /**< Alternate maximum 3D texture width */ + cudaDevAttrMaxTexture3DHeightAlt = 48, /**< Alternate maximum 3D texture height */ + cudaDevAttrMaxTexture3DDepthAlt = 49, /**< Alternate maximum 3D texture depth */ + cudaDevAttrPciDomainId = 50, /**< PCI domain ID of the device */ + cudaDevAttrTexturePitchAlignment = 51, /**< Pitch alignment requirement for textures */ + cudaDevAttrMaxTextureCubemapWidth = 52, /**< Maximum cubemap texture width/height */ + cudaDevAttrMaxTextureCubemapLayeredWidth = 53, /**< Maximum cubemap layered texture width/height */ + cudaDevAttrMaxTextureCubemapLayeredLayers = 54, /**< Maximum layers in a cubemap layered texture */ + cudaDevAttrMaxSurface1DWidth = 55, /**< Maximum 1D surface width */ + cudaDevAttrMaxSurface2DWidth = 56, /**< Maximum 2D surface width */ + cudaDevAttrMaxSurface2DHeight = 57, /**< Maximum 2D surface height */ + cudaDevAttrMaxSurface3DWidth = 58, /**< Maximum 3D surface width */ + cudaDevAttrMaxSurface3DHeight = 59, /**< Maximum 3D surface height */ + cudaDevAttrMaxSurface3DDepth = 60, /**< Maximum 3D surface depth */ + cudaDevAttrMaxSurface1DLayeredWidth = 61, /**< Maximum 1D layered surface width */ + cudaDevAttrMaxSurface1DLayeredLayers = 62, /**< Maximum layers in a 1D layered surface */ + cudaDevAttrMaxSurface2DLayeredWidth = 63, /**< Maximum 2D layered surface width */ + cudaDevAttrMaxSurface2DLayeredHeight = 64, /**< Maximum 2D layered surface height */ + cudaDevAttrMaxSurface2DLayeredLayers = 65, /**< Maximum layers in a 2D layered surface */ + cudaDevAttrMaxSurfaceCubemapWidth = 66, /**< Maximum cubemap surface width */ + cudaDevAttrMaxSurfaceCubemapLayeredWidth = 67, /**< Maximum cubemap layered surface width */ + cudaDevAttrMaxSurfaceCubemapLayeredLayers = 68, /**< Maximum layers in a cubemap layered surface */ + 
cudaDevAttrMaxTexture1DLinearWidth = 69, /**< Maximum 1D linear texture width */ + cudaDevAttrMaxTexture2DLinearWidth = 70, /**< Maximum 2D linear texture width */ + cudaDevAttrMaxTexture2DLinearHeight = 71, /**< Maximum 2D linear texture height */ + cudaDevAttrMaxTexture2DLinearPitch = 72, /**< Maximum 2D linear texture pitch in bytes */ + cudaDevAttrMaxTexture2DMipmappedWidth = 73, /**< Maximum mipmapped 2D texture width */ + cudaDevAttrMaxTexture2DMipmappedHeight = 74, /**< Maximum mipmapped 2D texture height */ + cudaDevAttrComputeCapabilityMajor = 75, /**< Major compute capability version number */ + cudaDevAttrComputeCapabilityMinor = 76, /**< Minor compute capability version number */ + cudaDevAttrMaxTexture1DMipmappedWidth = 77, /**< Maximum mipmapped 1D texture width */ + cudaDevAttrStreamPrioritiesSupported = 78, /**< Device supports stream priorities */ + cudaDevAttrGlobalL1CacheSupported = 79, /**< Device supports caching globals in L1 */ + cudaDevAttrLocalL1CacheSupported = 80, /**< Device supports caching locals in L1 */ + cudaDevAttrMaxSharedMemoryPerMultiprocessor = 81, /**< Maximum shared memory available per multiprocessor in bytes */ + cudaDevAttrMaxRegistersPerMultiprocessor = 82, /**< Maximum number of 32-bit registers available per multiprocessor */ + cudaDevAttrManagedMemory = 83, /**< Device can allocate managed memory on this system */ + cudaDevAttrIsMultiGpuBoard = 84, /**< Device is on a multi-GPU board */ + cudaDevAttrMultiGpuBoardGroupID = 85, /**< Unique identifier for a group of devices on the same multi-GPU board */ + cudaDevAttrHostNativeAtomicSupported = 86, /**< Link between the device and the host supports native atomic operations */ + cudaDevAttrSingleToDoublePrecisionPerfRatio = 87, /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */ + cudaDevAttrPageableMemoryAccess = 88, /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */ + cudaDevAttrConcurrentManagedAccess = 89, /**< Device can coherently access managed memory concurrently with the CPU */ + cudaDevAttrComputePreemptionSupported = 90, /**< Device supports Compute Preemption */ + cudaDevAttrCanUseHostPointerForRegisteredMem = 91, /**< Device can access host registered memory at the same virtual address as the CPU */ + cudaDevAttrReserved92 = 92, + cudaDevAttrReserved93 = 93, + cudaDevAttrReserved94 = 94, + cudaDevAttrCooperativeLaunch = 95, /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel*/ + cudaDevAttrCooperativeMultiDeviceLaunch = 96, /**< Device can participate in cooperative kernels launched via ::cudaLaunchCooperativeKernelMultiDevice */ + cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 /**< The maximum optin shared memory per block. This value may vary by chip. 
See ::cudaFuncSetAttribute */
+};
+
+/**
+ * CUDA device P2P attributes
+ */
+
+enum __device_builtin__ cudaDeviceP2PAttr {
+    cudaDevP2PAttrPerformanceRank = 1, /**< A relative value indicating the performance of the link between two devices */
+    cudaDevP2PAttrAccessSupported = 2, /**< Peer access is enabled */
+    cudaDevP2PAttrNativeAtomicSupported = 3 /**< Native atomic operation over the link supported */
+};
+/**
+ * CUDA device properties
+ */
+struct __device_builtin__ cudaDeviceProp
+{
+    char name[256]; /**< ASCII string identifying device */
+    size_t totalGlobalMem; /**< Global memory available on device in bytes */
+    size_t sharedMemPerBlock; /**< Shared memory available per block in bytes */
+    int regsPerBlock; /**< 32-bit registers available per block */
+    int warpSize; /**< Warp size in threads */
+    size_t memPitch; /**< Maximum pitch in bytes allowed by memory copies */
+    int maxThreadsPerBlock; /**< Maximum number of threads per block */
+    int maxThreadsDim[3]; /**< Maximum size of each dimension of a block */
+    int maxGridSize[3]; /**< Maximum size of each dimension of a grid */
+    int clockRate; /**< Clock frequency in kilohertz */
+    size_t totalConstMem; /**< Constant memory available on device in bytes */
+    int major; /**< Major compute capability */
+    int minor; /**< Minor compute capability */
+    size_t textureAlignment; /**< Alignment requirement for textures */
+    size_t texturePitchAlignment; /**< Pitch alignment requirement for texture references bound to pitched memory */
+    int deviceOverlap; /**< Device can concurrently copy memory and execute a kernel. Deprecated. Use asyncEngineCount instead. */
+    int multiProcessorCount; /**< Number of multiprocessors on device */
+    int kernelExecTimeoutEnabled; /**< Specifies whether there is a run time limit on kernels */
+    int integrated; /**< Device is integrated as opposed to discrete */
+    int canMapHostMemory; /**< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer */
+    int computeMode; /**< Compute mode (See ::cudaComputeMode) */
+    int maxTexture1D; /**< Maximum 1D texture size */
+    int maxTexture1DMipmap; /**< Maximum 1D mipmapped texture size */
+    int maxTexture1DLinear; /**< Maximum size for 1D textures bound to linear memory */
+    int maxTexture2D[2]; /**< Maximum 2D texture dimensions */
+    int maxTexture2DMipmap[2]; /**< Maximum 2D mipmapped texture dimensions */
+    int maxTexture2DLinear[3]; /**< Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory */
+    int maxTexture2DGather[2]; /**< Maximum 2D texture dimensions if texture gather operations have to be performed */
+    int maxTexture3D[3]; /**< Maximum 3D texture dimensions */
+    int maxTexture3DAlt[3]; /**< Maximum alternate 3D texture dimensions */
+    int maxTextureCubemap; /**< Maximum Cubemap texture dimensions */
+    int maxTexture1DLayered[2]; /**< Maximum 1D layered texture dimensions */
+    int maxTexture2DLayered[3]; /**< Maximum 2D layered texture dimensions */
+    int maxTextureCubemapLayered[2];/**< Maximum Cubemap layered texture dimensions */
+    int maxSurface1D; /**< Maximum 1D surface size */
+    int maxSurface2D[2]; /**< Maximum 2D surface dimensions */
+    int maxSurface3D[3]; /**< Maximum 3D surface dimensions */
+    int maxSurface1DLayered[2]; /**< Maximum 1D layered surface dimensions */
+    int maxSurface2DLayered[3]; /**< Maximum 2D layered surface dimensions */
+    int maxSurfaceCubemap; /**< Maximum Cubemap surface dimensions */
+    int maxSurfaceCubemapLayered[2];/**< Maximum Cubemap layered surface dimensions */
+
size_t surfaceAlignment; /**< Alignment requirements for surfaces */ + int concurrentKernels; /**< Device can possibly execute multiple kernels concurrently */ + int ECCEnabled; /**< Device has ECC support enabled */ + int pciBusID; /**< PCI bus ID of the device */ + int pciDeviceID; /**< PCI device ID of the device */ + int pciDomainID; /**< PCI domain ID of the device */ + int tccDriver; /**< 1 if device is a Tesla device using TCC driver, 0 otherwise */ + int asyncEngineCount; /**< Number of asynchronous engines */ + int unifiedAddressing; /**< Device shares a unified address space with the host */ + int memoryClockRate; /**< Peak memory clock frequency in kilohertz */ + int memoryBusWidth; /**< Global memory bus width in bits */ + int l2CacheSize; /**< Size of L2 cache in bytes */ + int maxThreadsPerMultiProcessor;/**< Maximum resident threads per multiprocessor */ + int streamPrioritiesSupported; /**< Device supports stream priorities */ + int globalL1CacheSupported; /**< Device supports caching globals in L1 */ + int localL1CacheSupported; /**< Device supports caching locals in L1 */ + size_t sharedMemPerMultiprocessor; /**< Shared memory available per multiprocessor in bytes */ + int regsPerMultiprocessor; /**< 32-bit registers available per multiprocessor */ + int managedMemory; /**< Device supports allocating managed memory on this system */ + int isMultiGpuBoard; /**< Device is on a multi-GPU board */ + int multiGpuBoardGroupID; /**< Unique identifier for a group of devices on the same multi-GPU board */ + int hostNativeAtomicSupported; /**< Link between the device and the host supports native atomic operations */ + int singleToDoublePrecisionPerfRatio; /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */ + int pageableMemoryAccess; /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */ + int concurrentManagedAccess; /**< Device can coherently access managed memory concurrently with the CPU */ + int computePreemptionSupported; /**< Device supports Compute Preemption */ + int canUseHostPointerForRegisteredMem; /**< Device can access host registered memory at the same virtual address as the CPU */ + int cooperativeLaunch; /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel */ + int cooperativeMultiDeviceLaunch; /**< Device can participate in cooperative kernels launched via ::cudaLaunchCooperativeKernelMultiDevice */ + size_t sharedMemPerBlockOptin; /**< Per device maximum shared memory per block usable by special opt in */ +}; + +#define cudaDevicePropDontCare \ + { \ + {'\0'}, /* char name[256]; */ \ + 0, /* size_t totalGlobalMem; */ \ + 0, /* size_t sharedMemPerBlock; */ \ + 0, /* int regsPerBlock; */ \ + 0, /* int warpSize; */ \ + 0, /* size_t memPitch; */ \ + 0, /* int maxThreadsPerBlock; */ \ + {0, 0, 0}, /* int maxThreadsDim[3]; */ \ + {0, 0, 0}, /* int maxGridSize[3]; */ \ + 0, /* int clockRate; */ \ + 0, /* size_t totalConstMem; */ \ + -1, /* int major; */ \ + -1, /* int minor; */ \ + 0, /* size_t textureAlignment; */ \ + 0, /* size_t texturePitchAlignment */ \ + -1, /* int deviceOverlap; */ \ + 0, /* int multiProcessorCount; */ \ + 0, /* int kernelExecTimeoutEnabled */ \ + 0, /* int integrated */ \ + 0, /* int canMapHostMemory */ \ + 0, /* int computeMode */ \ + 0, /* int maxTexture1D */ \ + 0, /* int maxTexture1DMipmap */ \ + 0, /* int maxTexture1DLinear */ \ + {0, 0}, /* int maxTexture2D[2] */ \ + {0, 0}, /* int 
maxTexture2DMipmap[2] */ \ + {0, 0, 0}, /* int maxTexture2DLinear[3] */ \ + {0, 0}, /* int maxTexture2DGather[2] */ \ + {0, 0, 0}, /* int maxTexture3D[3] */ \ + {0, 0, 0}, /* int maxTexture3DAlt[3] */ \ + 0, /* int maxTextureCubemap */ \ + {0, 0}, /* int maxTexture1DLayered[2] */ \ + {0, 0, 0}, /* int maxTexture2DLayered[3] */ \ + {0, 0}, /* int maxTextureCubemapLayered[2] */ \ + 0, /* int maxSurface1D */ \ + {0, 0}, /* int maxSurface2D[2] */ \ + {0, 0, 0}, /* int maxSurface3D[3] */ \ + {0, 0}, /* int maxSurface1DLayered[2] */ \ + {0, 0, 0}, /* int maxSurface2DLayered[3] */ \ + 0, /* int maxSurfaceCubemap */ \ + {0, 0}, /* int maxSurfaceCubemapLayered[2] */ \ + 0, /* size_t surfaceAlignment */ \ + 0, /* int concurrentKernels */ \ + 0, /* int ECCEnabled */ \ + 0, /* int pciBusID */ \ + 0, /* int pciDeviceID */ \ + 0, /* int pciDomainID */ \ + 0, /* int tccDriver */ \ + 0, /* int asyncEngineCount */ \ + 0, /* int unifiedAddressing */ \ + 0, /* int memoryClockRate */ \ + 0, /* int memoryBusWidth */ \ + 0, /* int l2CacheSize */ \ + 0, /* int maxThreadsPerMultiProcessor */ \ + 0, /* int streamPrioritiesSupported */ \ + 0, /* int globalL1CacheSupported */ \ + 0, /* int localL1CacheSupported */ \ + 0, /* size_t sharedMemPerMultiprocessor; */ \ + 0, /* int regsPerMultiprocessor; */ \ + 0, /* int managedMemory */ \ + 0, /* int isMultiGpuBoard */ \ + 0, /* int multiGpuBoardGroupID */ \ + 0, /* int hostNativeAtomicSupported */ \ + 0, /* int singleToDoublePrecisionPerfRatio */ \ + 0, /* int pageableMemoryAccess */ \ + 0, /* int concurrentManagedAccess */ \ + 0, /* int computePreemptionSupported */ \ + 0, /* int canUseHostPointerForRegisteredMem */ \ + 0, /* int cooperativeLaunch */ \ + 0, /* int cooperativeMultiDeviceLaunch */ \ + 0, /* size_t sharedMemPerBlockOptin */ \ + } /**< Empty device properties */ + +/** + * CUDA IPC Handle Size + */ +#define CUDA_IPC_HANDLE_SIZE 64 + +/** + * CUDA IPC event handle + */ +typedef __device_builtin__ struct __device_builtin__ cudaIpcEventHandle_st +{ + char reserved[CUDA_IPC_HANDLE_SIZE]; +}cudaIpcEventHandle_t; + +/** + * CUDA IPC memory handle + */ +typedef __device_builtin__ struct __device_builtin__ cudaIpcMemHandle_st +{ + char reserved[CUDA_IPC_HANDLE_SIZE]; +}cudaIpcMemHandle_t; + +/******************************************************************************* +* * +* SHORTHAND TYPE DEFINITION USED BY RUNTIME API * +* * +*******************************************************************************/ + +/** + * CUDA Error types + */ +typedef __device_builtin__ enum cudaError cudaError_t; + +/** + * CUDA stream + */ +typedef __device_builtin__ struct CUstream_st *cudaStream_t; + +/** + * CUDA event types + */ +typedef __device_builtin__ struct CUevent_st *cudaEvent_t; + +/** + * CUDA graphics resource types + */ +typedef __device_builtin__ struct cudaGraphicsResource *cudaGraphicsResource_t; + +/** + * CUDA UUID types + */ +typedef __device_builtin__ struct CUuuid_st cudaUUID_t; + +/** + * CUDA output file modes + */ +typedef __device_builtin__ enum cudaOutputMode cudaOutputMode_t; + +/** + * CUDA cooperative group scope + */ +enum __device_builtin__ cudaCGScope { + cudaCGScopeInvalid = 0, /**< Invalid cooperative group scope */ + cudaCGScopeGrid = 1, /**< Scope represented by a grid_group */ + cudaCGScopeMultiGrid = 2 /**< Scope represented by a multi_grid_group */ +}; + +/** + * CUDA launch parameters + */ +struct __device_builtin__ cudaLaunchParams +{ + void *func; /**< Device function symbol */ + dim3 gridDim; /**< Grid dimensions */ + dim3 blockDim;
/**< Block dimensions */ + void **args; /**< Arguments */ + size_t sharedMem; /**< Shared memory */ + cudaStream_t stream; /**< Stream identifier */ +}; + +/** @} */ +/** @} */ /* END CUDART_TYPES */ + +#endif /* !__DRIVER_TYPES_H__ */ diff --git a/include/external/CUDA/host_config.h b/include/external/CUDA/host_config.h new file mode 100755 index 000000000..1bd79e551 --- /dev/null +++ b/include/external/CUDA/host_config.h @@ -0,0 +1,50 @@ +/* + * Copyright 1993-2017 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#include "crt/host_config.h" diff --git a/include/external/CUDA/host_defines.h b/include/external/CUDA/host_defines.h new file mode 100755 index 000000000..33507ae51 --- /dev/null +++ b/include/external/CUDA/host_defines.h @@ -0,0 +1,50 @@ +/* + * Copyright 1993-2017 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws.
+ * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#include "crt/host_defines.h" diff --git a/include/external/CUDA/library_types.h b/include/external/CUDA/library_types.h new file mode 100755 index 000000000..c36f0d18c --- /dev/null +++ b/include/external/CUDA/library_types.h @@ -0,0 +1,80 @@ +/* + * Copyright 1993-2015 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. 
IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__LIBRARY_TYPES_H__) +#define __LIBRARY_TYPES_H__ + + +typedef enum cudaDataType_t +{ + CUDA_R_16F= 2, /* real as a half */ + CUDA_C_16F= 6, /* complex as a pair of half numbers */ + CUDA_R_32F= 0, /* real as a float */ + CUDA_C_32F= 4, /* complex as a pair of float numbers */ + CUDA_R_64F= 1, /* real as a double */ + CUDA_C_64F= 5, /* complex as a pair of double numbers */ + CUDA_R_8I = 3, /* real as a signed char */ + CUDA_C_8I = 7, /* complex as a pair of signed char numbers */ + CUDA_R_8U = 8, /* real as a unsigned char */ + CUDA_C_8U = 9, /* complex as a pair of unsigned char numbers */ + CUDA_R_32I= 10, /* real as a signed int */ + CUDA_C_32I= 11, /* complex as a pair of signed int numbers */ + CUDA_R_32U= 12, /* real as a unsigned int */ + CUDA_C_32U= 13 /* complex as a pair of unsigned int numbers */ +} cudaDataType; + + +typedef enum libraryPropertyType_t +{ + MAJOR_VERSION, + MINOR_VERSION, + PATCH_LEVEL +} libraryPropertyType; + +#endif /* !__LIBRARY_TYPES_H__ */ diff --git a/include/external/CUDA/nvml.h b/include/external/CUDA/nvml.h new file mode 100755 index 000000000..0790b3aad --- /dev/null +++ b/include/external/CUDA/nvml.h @@ -0,0 +1,5628 @@ +/* + * Copyright 1993-2017 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO USER: + * + * This source code is subject to NVIDIA ownership rights under U.S. and + * international Copyright laws. Users and possessors of this source code + * are hereby granted a nonexclusive, royalty-free license to use this code + * in individual and commercial software. + * + * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE + * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR + * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * + * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL, + * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE + * OR PERFORMANCE OF THIS SOURCE CODE. + * + * U.S. Government End Users. This source code is a "commercial item" as + * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of + * "commercial computer software" and "commercial computer software + * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995) + * and is provided to the U.S. Government only as a commercial end item. + * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through + * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the + * source code with only those rights set forth herein. + * + * Any use of this source code in individual and commercial software must + * include, in the user documentation and internal comments to the code, + * the above Disclaimer and U.S. Government End Users Notice. + */ + +/* +NVML API Reference + +The NVIDIA Management Library (NVML) is a C-based programmatic interface for monitoring and +managing various states within NVIDIA Tesla &tm; GPUs. It is intended to be a platform for building +3rd party applications, and is also the underlying library for the NVIDIA-supported nvidia-smi +tool. NVML is thread-safe so it is safe to make simultaneous NVML calls from multiple threads. + +API Documentation + +Supported platforms: +- Windows: Windows Server 2008 R2 64bit, Windows Server 2012 R2 64bit, Windows 7 64bit, Windows 8 64bit, Windows 10 64bit +- Linux: 32-bit and 64-bit +- Hypervisors: Windows Server 2008R2/2012 Hyper-V 64bit, Citrix XenServer 6.2 SP1+, VMware ESX 5.1/5.5 + +Supported products: +- Full Support + - All Tesla products, starting with the Fermi architecture + - All Quadro products, starting with the Fermi architecture + - All GRID products, starting with the Kepler architecture + - Selected GeForce Titan products +- Limited Support + - All Geforce products, starting with the Fermi architecture + +The NVML library can be found at \%ProgramW6432\%\\"NVIDIA Corporation"\\NVSMI\\ on Windows. It is +not added to the system path by default. To dynamically link to NVML, add this path to the PATH +environment variable. To dynamically load NVML, call LoadLibrary with this path. + +On Linux the NVML library will be found on the standard library path. For 64 bit Linux, both the 32 bit +and 64 bit NVML libraries will be installed.
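+
+A minimal calling sequence (editor's illustrative sketch, not part of the original NVIDIA
+header text; it assumes the application links against the NVML library described above):
+
+    nvmlReturn_t st = nvmlInit();      // resolved to nvmlInit_v2 by the macros further down
+    if (st == NVML_SUCCESS) {
+        unsigned int n = 0;
+        nvmlDeviceGetCount(&n);        // number of accessible GPUs
+        nvmlShutdown();                // release NVML resources
+    }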
+ +Online documentation for this library is available at http://docs.nvidia.com/deploy/nvml-api/index.html +*/ + +#ifndef __nvml_nvml_h__ +#define __nvml_nvml_h__ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * On Windows, set up methods for DLL export + * define NVML_STATIC_IMPORT when using nvml_loader library + */ +#if defined _WINDOWS + #if !defined NVML_STATIC_IMPORT + #if defined NVML_LIB_EXPORT + #define DECLDIR __declspec(dllexport) + #else + #define DECLDIR __declspec(dllimport) + #endif + #else + #define DECLDIR + #endif +#else + #define DECLDIR +#endif + +/** + * NVML API versioning support + */ +#define NVML_API_VERSION 9 +#define NVML_API_VERSION_STR "9" +#define nvmlInit nvmlInit_v2 +#define nvmlDeviceGetPciInfo nvmlDeviceGetPciInfo_v3 +#define nvmlDeviceGetCount nvmlDeviceGetCount_v2 +#define nvmlDeviceGetHandleByIndex nvmlDeviceGetHandleByIndex_v2 +#define nvmlDeviceGetHandleByPciBusId nvmlDeviceGetHandleByPciBusId_v2 +#define nvmlDeviceGetNvLinkRemotePciInfo nvmlDeviceGetNvLinkRemotePciInfo_v2 + +/***************************************************************************************************/ +/** @defgroup nvmlDeviceStructs Device Structs + * @{ + */ +/***************************************************************************************************/ + +/** + * Special constant that some fields take when they are not available. + * Used when only part of the struct is not available. + * + * Each structure explicitly states when to check for this value. + */ +#define NVML_VALUE_NOT_AVAILABLE (-1) + +typedef struct nvmlDevice_st* nvmlDevice_t; + +/** + * Buffer size guaranteed to be large enough for pci bus id + */ +#define NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE 32 + +/** + * Buffer size guaranteed to be large enough for pci bus id for ::busIdLegacy + */ +#define NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE 16 + +/** + * PCI information about a GPU device. + */ +typedef struct nvmlPciInfo_st +{ + char busIdLegacy[NVML_DEVICE_PCI_BUS_ID_BUFFER_V2_SIZE]; //!< The legacy tuple domain:bus:device.function PCI identifier (& NULL terminator) + unsigned int domain; //!< The PCI domain on which the device's bus resides, 0 to 0xffffffff + unsigned int bus; //!< The bus on which the device resides, 0 to 0xff + unsigned int device; //!< The device's id on the bus, 0 to 31 + unsigned int pciDeviceId; //!< The combined 16-bit device id and 16-bit vendor id + + // Added in NVML 2.285 API + unsigned int pciSubSystemId; //!< The 32-bit Sub System Device ID + + char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator) +} nvmlPciInfo_t; + +/** + * Detailed ECC error counts for a device. + * + * @deprecated Different GPU families can have different memory error counters + * See \ref nvmlDeviceGetMemoryErrorCounter + */ +typedef struct nvmlEccErrorCounts_st +{ + unsigned long long l1Cache; //!< L1 cache errors + unsigned long long l2Cache; //!< L2 cache errors + unsigned long long deviceMemory; //!< Device memory errors + unsigned long long registerFile; //!< Register file errors +} nvmlEccErrorCounts_t; + +/** + * Utilization information for a device. + * Each sample period may be between 1 second and 1/6 second, depending on the product being queried. 
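+ *
+ * Reading these counters (editor's illustrative sketch, not part of the original header;
+ * it assumes a valid \a device handle and uses nvmlDeviceGetUtilizationRates(), the device
+ * query that fills this struct):
+ * \code
+ * nvmlUtilization_t util;
+ * if (nvmlDeviceGetUtilizationRates(device, &util) == NVML_SUCCESS)
+ *     printf("gpu: %u%%, memory: %u%%\n", util.gpu, util.memory);
+ * \endcode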
+ */ +typedef struct nvmlUtilization_st +{ + unsigned int gpu; //!< Percent of time over the past sample period during which one or more kernels was executing on the GPU + unsigned int memory; //!< Percent of time over the past sample period during which global (device) memory was being read or written +} nvmlUtilization_t; + +/** + * Memory allocation information for a device. + */ +typedef struct nvmlMemory_st +{ + unsigned long long total; //!< Total installed FB memory (in bytes) + unsigned long long free; //!< Unallocated FB memory (in bytes) + unsigned long long used; //!< Allocated FB memory (in bytes). Note that the driver/GPU always sets aside a small amount of memory for bookkeeping +} nvmlMemory_t; + +/** + * BAR1 Memory allocation Information for a device + */ +typedef struct nvmlBAR1Memory_st +{ + unsigned long long bar1Total; //!< Total BAR1 Memory (in bytes) + unsigned long long bar1Free; //!< Unallocated BAR1 Memory (in bytes) + unsigned long long bar1Used; //!< Allocated Used Memory (in bytes) +}nvmlBAR1Memory_t; + +/** + * Information about running compute processes on the GPU + */ +typedef struct nvmlProcessInfo_st +{ + unsigned int pid; //!< Process ID + unsigned long long usedGpuMemory; //!< Amount of used GPU memory in bytes. + //! Under WDDM, \ref NVML_VALUE_NOT_AVAILABLE is always reported + //! because Windows KMD manages all the memory and not the NVIDIA driver +} nvmlProcessInfo_t; + +/** + * Enum to represent type of bridge chip + */ +typedef enum nvmlBridgeChipType_enum +{ + NVML_BRIDGE_CHIP_PLX = 0, + NVML_BRIDGE_CHIP_BRO4 = 1 +}nvmlBridgeChipType_t; + +/** + * Maximum number of NvLink links supported + */ +#define NVML_NVLINK_MAX_LINKS 6 + +/** + * Enum to represent the NvLink utilization counter packet units + */ +typedef enum nvmlNvLinkUtilizationCountUnits_enum +{ + NVML_NVLINK_COUNTER_UNIT_CYCLES = 0, // count by cycles + NVML_NVLINK_COUNTER_UNIT_PACKETS = 1, // count by packets + NVML_NVLINK_COUNTER_UNIT_BYTES = 2, // count by bytes + + // this must be last + NVML_NVLINK_COUNTER_UNIT_COUNT +} nvmlNvLinkUtilizationCountUnits_t; + +/** + * Enum to represent the NvLink utilization counter packet types to count + * ** this is ONLY applicable with the units as packets or bytes + * ** as specified in \a nvmlNvLinkUtilizationCountUnits_t + * ** all packet filter descriptions are target GPU centric + * ** these can be "OR'd" together + */ +typedef enum nvmlNvLinkUtilizationCountPktTypes_enum +{ + NVML_NVLINK_COUNTER_PKTFILTER_NOP = 0x1, // no operation packets + NVML_NVLINK_COUNTER_PKTFILTER_READ = 0x2, // read packets + NVML_NVLINK_COUNTER_PKTFILTER_WRITE = 0x4, // write packets + NVML_NVLINK_COUNTER_PKTFILTER_RATOM = 0x8, // reduction atomic requests + NVML_NVLINK_COUNTER_PKTFILTER_NRATOM = 0x10, // non-reduction atomic requests + NVML_NVLINK_COUNTER_PKTFILTER_FLUSH = 0x20, // flush requests + NVML_NVLINK_COUNTER_PKTFILTER_RESPDATA = 0x40, // responses with data + NVML_NVLINK_COUNTER_PKTFILTER_RESPNODATA = 0x80, // responses without data + NVML_NVLINK_COUNTER_PKTFILTER_ALL = 0xFF // all packets +} nvmlNvLinkUtilizationCountPktTypes_t; + +/** + * Struct to define the NVLINK counter controls + */ +typedef struct nvmlNvLinkUtilizationControl_st +{ + nvmlNvLinkUtilizationCountUnits_t units; + nvmlNvLinkUtilizationCountPktTypes_t pktfilter; +} nvmlNvLinkUtilizationControl_t; + +/** + * Enum to represent NvLink queryable capabilities + */ +typedef enum nvmlNvLinkCapability_enum +{ + NVML_NVLINK_CAP_P2P_SUPPORTED = 0, // P2P over NVLink is supported + 
NVML_NVLINK_CAP_SYSMEM_ACCESS = 1, // Access to system memory is supported + NVML_NVLINK_CAP_P2P_ATOMICS = 2, // P2P atomics are supported + NVML_NVLINK_CAP_SYSMEM_ATOMICS= 3, // System memory atomics are supported + NVML_NVLINK_CAP_SLI_BRIDGE = 4, // SLI is supported over this link + NVML_NVLINK_CAP_VALID = 5, // Link is supported on this device + // should be last + NVML_NVLINK_CAP_COUNT +} nvmlNvLinkCapability_t; + +/** + * Enum to represent NvLink queryable error counters + */ +typedef enum nvmlNvLinkErrorCounter_enum +{ + NVML_NVLINK_ERROR_DL_REPLAY = 0, // Data link transmit replay error counter + NVML_NVLINK_ERROR_DL_RECOVERY = 1, // Data link transmit recovery error counter + NVML_NVLINK_ERROR_DL_CRC_FLIT = 2, // Data link receive flow control digit CRC error counter + NVML_NVLINK_ERROR_DL_CRC_DATA = 3, // Data link receive data CRC error counter + + // this must be last + NVML_NVLINK_ERROR_COUNT +} nvmlNvLinkErrorCounter_t; + +/** + * Represents level relationships within a system between two GPUs + * The enums are spaced to allow for future relationships + */ +typedef enum nvmlGpuLevel_enum +{ + NVML_TOPOLOGY_INTERNAL = 0, // e.g. Tesla K80 + NVML_TOPOLOGY_SINGLE = 10, // all devices that only need traverse a single PCIe switch + NVML_TOPOLOGY_MULTIPLE = 20, // all devices that need not traverse a host bridge + NVML_TOPOLOGY_HOSTBRIDGE = 30, // all devices that are connected to the same host bridge + NVML_TOPOLOGY_CPU = 40, // all devices that are connected to the same CPU but possibly multiple host bridges + NVML_TOPOLOGY_SYSTEM = 50, // all devices in the system + + // there is purposefully no COUNT here because of the need for spacing above +} nvmlGpuTopologyLevel_t; + +/* P2P Capability Index Status*/ +typedef enum nvmlGpuP2PStatus_enum +{ + NVML_P2P_STATUS_OK = 0, + NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED, + NVML_P2P_STATUS_GPU_NOT_SUPPORTED, + NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED, + NVML_P2P_STATUS_DISABLED_BY_REGKEY, + NVML_P2P_STATUS_NOT_SUPPORTED, + NVML_P2P_STATUS_UNKNOWN + +} nvmlGpuP2PStatus_t; + +/* P2P Capability Index*/ +typedef enum nvmlGpuP2PCapsIndex_enum +{ + NVML_P2P_CAPS_INDEX_READ = 0, + NVML_P2P_CAPS_INDEX_WRITE, + NVML_P2P_CAPS_INDEX_NVLINK, + NVML_P2P_CAPS_INDEX_ATOMICS, + NVML_P2P_CAPS_INDEX_PROP, + NVML_P2P_CAPS_INDEX_UNKNOWN +}nvmlGpuP2PCapsIndex_t; + +/** + * Maximum limit on Physical Bridges per Board + */ +#define NVML_MAX_PHYSICAL_BRIDGE (128) + +/** + * Information about the Bridge Chip Firmware + */ +typedef struct nvmlBridgeChipInfo_st +{ + nvmlBridgeChipType_t type; //!< Type of Bridge Chip + unsigned int fwVersion; //!< Firmware Version. 0=Version is unavailable +}nvmlBridgeChipInfo_t; + +/** + * This structure stores the complete Hierarchy of the Bridge Chip within the board. The immediate + * bridge is stored at index 0 of bridgeInfoList, parent to immediate bridge is at index 1 and so forth. 
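+ *
+ * Walking the hierarchy (editor's illustrative sketch, not part of the original header;
+ * it assumes a valid \a device handle and nvmlDeviceGetBridgeChipInfo(), the device query
+ * that fills this struct):
+ * \code
+ * nvmlBridgeChipHierarchy_t hier;
+ * if (nvmlDeviceGetBridgeChipInfo(device, &hier) == NVML_SUCCESS)
+ *     for (unsigned int i = 0; i < hier.bridgeCount; ++i)   // index 0 is the immediate bridge
+ *         printf("bridge %u: fw=%u\n", i, hier.bridgeChipInfo[i].fwVersion);
+ * \endcode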
+ */ +typedef struct nvmlBridgeChipHierarchy_st +{ + unsigned char bridgeCount; //!< Number of Bridge Chips on the Board + nvmlBridgeChipInfo_t bridgeChipInfo[NVML_MAX_PHYSICAL_BRIDGE]; //!< Hierarchy of Bridge Chips on the board +}nvmlBridgeChipHierarchy_t; + +/** + * Represents Type of Sampling Event + */ +typedef enum nvmlSamplingType_enum +{ + NVML_TOTAL_POWER_SAMPLES = 0, //!< To represent total power drawn by GPU + NVML_GPU_UTILIZATION_SAMPLES = 1, //!< To represent percent of time during which one or more kernels was executing on the GPU + NVML_MEMORY_UTILIZATION_SAMPLES = 2, //!< To represent percent of time during which global (device) memory was being read or written + NVML_ENC_UTILIZATION_SAMPLES = 3, //!< To represent percent of time during which NVENC remains busy + NVML_DEC_UTILIZATION_SAMPLES = 4, //!< To represent percent of time during which NVDEC remains busy + NVML_PROCESSOR_CLK_SAMPLES = 5, //!< To represent processor clock samples + NVML_MEMORY_CLK_SAMPLES = 6, //!< To represent memory clock samples + + // Keep this last + NVML_SAMPLINGTYPE_COUNT +}nvmlSamplingType_t; + +/** + * Represents the queryable PCIe utilization counters + */ +typedef enum nvmlPcieUtilCounter_enum +{ + NVML_PCIE_UTIL_TX_BYTES = 0, // 1KB granularity + NVML_PCIE_UTIL_RX_BYTES = 1, // 1KB granularity + + // Keep this last + NVML_PCIE_UTIL_COUNT +} nvmlPcieUtilCounter_t; + +/** + * Represents the type for sample value returned + */ +typedef enum nvmlValueType_enum +{ + NVML_VALUE_TYPE_DOUBLE = 0, + NVML_VALUE_TYPE_UNSIGNED_INT = 1, + NVML_VALUE_TYPE_UNSIGNED_LONG = 2, + NVML_VALUE_TYPE_UNSIGNED_LONG_LONG = 3, + NVML_VALUE_TYPE_SIGNED_LONG_LONG = 4, + + // Keep this last + NVML_VALUE_TYPE_COUNT +}nvmlValueType_t; + + +/** + * Union to represent different types of Value + */ +typedef union nvmlValue_st +{ + double dVal; //!< If the value is double + unsigned int uiVal; //!< If the value is unsigned int + unsigned long ulVal; //!< If the value is unsigned long + unsigned long long ullVal; //!< If the value is unsigned long long + signed long long sllVal; //!< If the value is signed long long +}nvmlValue_t; + +/** + * Information for Sample + */ +typedef struct nvmlSample_st +{ + unsigned long long timeStamp; //!< CPU Timestamp in microseconds + nvmlValue_t sampleValue; //!< Sample Value +}nvmlSample_t; + +/** + * Represents type of perf policy for which violation times can be queried + */ +typedef enum nvmlPerfPolicyType_enum +{ + NVML_PERF_POLICY_POWER = 0, //!< How long did power violations cause the GPU to be below application clocks + NVML_PERF_POLICY_THERMAL = 1, //!< How long did thermal violations cause the GPU to be below application clocks + NVML_PERF_POLICY_SYNC_BOOST = 2, //!< How long did sync boost cause the GPU to be below application clocks + NVML_PERF_POLICY_BOARD_LIMIT = 3, //!< How long did the board limit cause the GPU to be below application clocks + NVML_PERF_POLICY_LOW_UTILIZATION = 4, //!< How long did low utilization cause the GPU to be below application clocks + NVML_PERF_POLICY_RELIABILITY = 5, //!< How long did the board reliability limit cause the GPU to be below application clocks + + NVML_PERF_POLICY_TOTAL_APP_CLOCKS = 10, //!< Total time the GPU was held below application clocks by any limiter (0 - 5 above) + NVML_PERF_POLICY_TOTAL_BASE_CLOCKS = 11, //!< Total time the GPU was held below base clocks + + // Keep this last + NVML_PERF_POLICY_COUNT +}nvmlPerfPolicyType_t; + +/** + * Struct to hold perf policy violation status data + */ +typedef struct nvmlViolationTime_st 
+{ + unsigned long long referenceTime; //!< referenceTime represents CPU timestamp in microseconds + unsigned long long violationTime; //!< violationTime in Nanoseconds +}nvmlViolationTime_t; + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlDeviceEnumvs Device Enums + * @{ + */ +/***************************************************************************************************/ + +/** + * Generic enable/disable enum. + */ +typedef enum nvmlEnableState_enum +{ + NVML_FEATURE_DISABLED = 0, //!< Feature disabled + NVML_FEATURE_ENABLED = 1 //!< Feature enabled +} nvmlEnableState_t; + +//! Generic flag used to specify the default behavior of some functions. See description of particular functions for details. +#define nvmlFlagDefault 0x00 +//! Generic flag used to force some behavior. See description of particular functions for details. +#define nvmlFlagForce 0x01 + +/** + * The Brand of the GPU + */ +typedef enum nvmlBrandType_enum +{ + NVML_BRAND_UNKNOWN = 0, + NVML_BRAND_QUADRO = 1, + NVML_BRAND_TESLA = 2, + NVML_BRAND_NVS = 3, + NVML_BRAND_GRID = 4, + NVML_BRAND_GEFORCE = 5, + + // Keep this last + NVML_BRAND_COUNT +} nvmlBrandType_t; + +/** + * Temperature thresholds. + */ +typedef enum nvmlTemperatureThresholds_enum +{ + NVML_TEMPERATURE_THRESHOLD_SHUTDOWN = 0, // Temperature at which the GPU will shut down + // for HW protection + NVML_TEMPERATURE_THRESHOLD_SLOWDOWN = 1, // Temperature at which the GPU will begin HW slowdown + NVML_TEMPERATURE_THRESHOLD_MEM_MAX = 2, // Memory Temperature at which the GPU will begin SW slowdown + NVML_TEMPERATURE_THRESHOLD_GPU_MAX = 3, // GPU Temperature at which the GPU can be throttled below base clock + // Keep this last + NVML_TEMPERATURE_THRESHOLD_COUNT +} nvmlTemperatureThresholds_t; + +/** + * Temperature sensors. + */ +typedef enum nvmlTemperatureSensors_enum +{ + NVML_TEMPERATURE_GPU = 0, //!< Temperature sensor for the GPU die + + // Keep this last + NVML_TEMPERATURE_COUNT +} nvmlTemperatureSensors_t; + +/** + * Compute mode. + * + * NVML_COMPUTEMODE_EXCLUSIVE_PROCESS was added in CUDA 4.0. + * Earlier CUDA versions supported a single exclusive mode, + * which is equivalent to NVML_COMPUTEMODE_EXCLUSIVE_THREAD in CUDA 4.0 and beyond. + */ +typedef enum nvmlComputeMode_enum +{ + NVML_COMPUTEMODE_DEFAULT = 0, //!< Default compute mode -- multiple contexts per device + NVML_COMPUTEMODE_EXCLUSIVE_THREAD = 1, //!< Support Removed + NVML_COMPUTEMODE_PROHIBITED = 2, //!< Compute-prohibited mode -- no contexts per device + NVML_COMPUTEMODE_EXCLUSIVE_PROCESS = 3, //!< Compute-exclusive-process mode -- only one context per device, usable from multiple threads at a time + + // Keep this last + NVML_COMPUTEMODE_COUNT +} nvmlComputeMode_t; + +/** + * ECC bit types.
+ * + * @deprecated See \ref nvmlMemoryErrorType_t for a more flexible type + */ +#define nvmlEccBitType_t nvmlMemoryErrorType_t + +/** + * Single bit ECC errors + * + * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_CORRECTED + */ +#define NVML_SINGLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_CORRECTED + +/** + * Double bit ECC errors + * + * @deprecated Mapped to \ref NVML_MEMORY_ERROR_TYPE_UNCORRECTED + */ +#define NVML_DOUBLE_BIT_ECC NVML_MEMORY_ERROR_TYPE_UNCORRECTED + +/** + * Memory error types + */ +typedef enum nvmlMemoryErrorType_enum +{ + /** + * A memory error that was corrected + * + * For ECC errors, these are single bit errors + * For Texture memory, these are errors fixed by resend + */ + NVML_MEMORY_ERROR_TYPE_CORRECTED = 0, + /** + * A memory error that was not corrected + * + * For ECC errors, these are double bit errors + * For Texture memory, these are errors where the resend fails + */ + NVML_MEMORY_ERROR_TYPE_UNCORRECTED = 1, + + + // Keep this last + NVML_MEMORY_ERROR_TYPE_COUNT //!< Count of memory error types + +} nvmlMemoryErrorType_t; + +/** + * ECC counter types. + * + * Note: Volatile counts are reset each time the driver loads. On Windows this is once per boot. On Linux this can be more frequent. + * On Linux the driver unloads when no active clients exist. If persistence mode is enabled or there is always a driver + * client active (e.g. X11), then Linux also sees per-boot behavior. If not, volatile counts are reset each time a compute app + * is run. + */ +typedef enum nvmlEccCounterType_enum +{ + NVML_VOLATILE_ECC = 0, //!< Volatile counts are reset each time the driver loads. + NVML_AGGREGATE_ECC = 1, //!< Aggregate counts persist across reboots (i.e. for the lifetime of the device) + + // Keep this last + NVML_ECC_COUNTER_TYPE_COUNT //!< Count of memory counter types +} nvmlEccCounterType_t; + +/** + * Clock types. + * + * All speeds are in Mhz. + */ +typedef enum nvmlClockType_enum +{ + NVML_CLOCK_GRAPHICS = 0, //!< Graphics clock domain + NVML_CLOCK_SM = 1, //!< SM clock domain + NVML_CLOCK_MEM = 2, //!< Memory clock domain + NVML_CLOCK_VIDEO = 3, //!< Video encoder/decoder clock domain + + // Keep this last + NVML_CLOCK_COUNT //usedGpuMemory is not supported + + + unsigned long long time; //!< Amount of time in ms during which the compute context was active. 
The time is reported as 0 if + //!< the process is not terminated + + unsigned long long startTime; //!< CPU Timestamp in usec representing start time for the process + + unsigned int isRunning; //!< Flag to represent if the process is running (1 for running, 0 for terminated) + + unsigned int reserved[5]; //!< Reserved for future use +} nvmlAccountingStats_t; + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlVgpuConstants Vgpu Constants + * @{ + */ +/***************************************************************************************************/ + +/** + * Buffer size guaranteed to be large enough for \ref nvmlVgpuTypeGetLicense + */ +#define NVML_GRID_LICENSE_BUFFER_SIZE 128 + +#define NVML_VGPU_NAME_BUFFER_SIZE 64 + +#define NVML_MAX_VGPU_TYPES_PER_PGPU 17 + +#define NVML_MAX_VGPU_INSTANCES_PER_PGPU 24 + +#define NVML_GRID_LICENSE_FEATURE_MAX_COUNT 3 + +#define NVML_GRID_LICENSE_INFO_MAX_LENGTH 128 + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlVgpuEnum Vgpu Enum + * @{ + */ +/***************************************************************************************************/ + +/*! + * Types of VM identifiers + */ +typedef enum nvmlVgpuVmIdType { + NVML_VGPU_VM_ID_DOMAIN_ID = 0, //!< VM ID represents DOMAIN ID + NVML_VGPU_VM_ID_UUID = 1, //!< VM ID represents UUID +} nvmlVgpuVmIdType_t; + +// vGPU GUEST info state. +typedef enum nvmlVgpuGuestInfoState_enum +{ + NVML_VGPU_INSTANCE_GUEST_INFO_STATE_UNINITIALIZED = 0, //= 0 and < \a unitCount + * @param unit Reference in which to return the unit handle + * + * @return + * - \ref NVML_SUCCESS if \a unit has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a unit is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlUnitGetHandleByIndex(unsigned int index, nvmlUnit_t *unit); + +/** + * Retrieves the static information associated with a unit. + * + * For S-class products. + * + * See \ref nvmlUnitInfo_t for details on available unit info. + * + * @param unit The identifier of the target unit + * @param info Reference in which to return the unit information + * + * @return + * - \ref NVML_SUCCESS if \a info has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a info is NULL + */ +nvmlReturn_t DECLDIR nvmlUnitGetUnitInfo(nvmlUnit_t unit, nvmlUnitInfo_t *info); + +/** + * Retrieves the LED state associated with this unit. + * + * For S-class products. + * + * See \ref nvmlLedState_t for details on allowed states. + * + * @param unit The identifier of the target unit + * @param state Reference in which to return the current LED state + * + * @return + * - \ref NVML_SUCCESS if \a state has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a state is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlUnitSetLedState() + */ +nvmlReturn_t DECLDIR nvmlUnitGetLedState(nvmlUnit_t unit, nvmlLedState_t *state); + +/** + * Retrieves the PSU stats for the unit. + * + * For S-class products. 
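+ *
+ * (Editor's illustrative sketch, not part of the original header; the unit handle comes
+ * from \ref nvmlUnitGetHandleByIndex, documented above.)
+ * \code
+ * nvmlUnit_t unit;
+ * nvmlPSUInfo_t psu;
+ * if (nvmlUnitGetHandleByIndex(0, &unit) == NVML_SUCCESS &&
+ *     nvmlUnitGetPsuInfo(unit, &psu) == NVML_SUCCESS)
+ *     ;  // inspect the returned PSU information
+ * \endcode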
+ * + * See \ref nvmlPSUInfo_t for details on available PSU info. + * + * @param unit The identifier of the target unit + * @param psu Reference in which to return the PSU information + * + * @return + * - \ref NVML_SUCCESS if \a psu has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a psu is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlUnitGetPsuInfo(nvmlUnit_t unit, nvmlPSUInfo_t *psu); + +/** + * Retrieves the temperature readings for the unit, in degrees C. + * + * For S-class products. + * + * Depending on the product, readings may be available for intake (type=0), + * exhaust (type=1) and board (type=2). + * + * @param unit The identifier of the target unit + * @param type The type of reading to take + * @param temp Reference in which to return the intake temperature + * + * @return + * - \ref NVML_SUCCESS if \a temp has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a type is invalid or \a temp is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlUnitGetTemperature(nvmlUnit_t unit, unsigned int type, unsigned int *temp); + +/** + * Retrieves the fan speed readings for the unit. + * + * For S-class products. + * + * See \ref nvmlUnitFanSpeeds_t for details on available fan speed info. + * + * @param unit The identifier of the target unit + * @param fanSpeeds Reference in which to return the fan speed information + * + * @return + * - \ref NVML_SUCCESS if \a fanSpeeds has been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid or \a fanSpeeds is NULL + * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlUnitGetFanSpeedInfo(nvmlUnit_t unit, nvmlUnitFanSpeeds_t *fanSpeeds); + +/** + * Retrieves the set of GPU devices that are attached to the specified unit. + * + * For S-class products. + * + * The \a deviceCount argument is expected to be set to the size of the input \a devices array. + * + * @param unit The identifier of the target unit + * @param deviceCount Reference in which to provide the \a devices array size, and + * to return the number of attached GPU devices + * @param devices Reference in which to return the references to the attached GPU devices + * + * @return + * - \ref NVML_SUCCESS if \a deviceCount and \a devices have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a deviceCount indicates that the \a devices array is too small + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit is invalid, either of \a deviceCount or \a devices is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlUnitGetDevices(nvmlUnit_t unit, unsigned int *deviceCount, nvmlDevice_t *devices); + +/** + * Retrieves the IDs and firmware versions for any Host Interface Cards (HICs) in the system. + * + * For S-class products. 
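+ *
+ * Typical sizing pattern (editor's illustrative sketch, not part of the original header;
+ * the capacity of 16 is an arbitrary assumption):
+ * \code
+ * nvmlHwbcEntry_t entries[16];
+ * unsigned int count = 16;   // in: array capacity, out: number of HICs
+ * if (nvmlSystemGetHicVersion(&count, entries) == NVML_ERROR_INSUFFICIENT_SIZE)
+ *     ;  // retry with a larger array
+ * \endcode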
+ * + * The \a hwbcCount argument is expected to be set to the size of the input \a hwbcEntries array. + * The HIC must be connected to an S-class system for it to be reported by this function. + * + * @param hwbcCount Size of hwbcEntries array + * @param hwbcEntries Array holding information about hwbc + * + * @return + * - \ref NVML_SUCCESS if \a hwbcCount and \a hwbcEntries have been populated + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if either \a hwbcCount or \a hwbcEntries is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a hwbcCount indicates that the \a hwbcEntries array is too small + */ +nvmlReturn_t DECLDIR nvmlSystemGetHicVersion(unsigned int *hwbcCount, nvmlHwbcEntry_t *hwbcEntries); +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlDeviceQueries Device Queries + * This chapter describes the queries that NVML can perform against each device. + * In each case the device is identified with an nvmlDevice_t handle. This handle is obtained by + * calling one of \ref nvmlDeviceGetHandleByIndex(), \ref nvmlDeviceGetHandleBySerial(), + * \ref nvmlDeviceGetHandleByPciBusId(), or \ref nvmlDeviceGetHandleByUUID(). + * @{ + */ +/***************************************************************************************************/ + + /** + * Retrieves the number of compute devices in the system. A compute device is a single GPU. + * + * For all products. + * + * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system + * even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device. + * Update your code to handle this error, or use NVML 4.304 or older nvml header file. + * For backward binary compatibility reasons _v1 version of the API is still present in the shared + * library. + * Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to. + * + * @param deviceCount Reference in which to return the number of accessible devices + * + * @return + * - \ref NVML_SUCCESS if \a deviceCount has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a deviceCount is NULL + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetCount(unsigned int *deviceCount); + +/** + * Acquire the handle for a particular device, based on its index. + * + * For all products. + * + * Valid indices are derived from the \a accessibleDevices count returned by + * \ref nvmlDeviceGetCount(). For example, if \a accessibleDevices is 2 the valid indices + * are 0 and 1, corresponding to GPU 0 and GPU 1. + * + * The order in which NVML enumerates devices has no guarantees of consistency between reboots. For that reason it + * is recommended that devices be looked up by their PCI ids or UUID. See + * \ref nvmlDeviceGetHandleByUUID() and \ref nvmlDeviceGetHandleByPciBusId(). + * + * Note: The NVML index may not correlate with other APIs, such as the CUDA device index. + * + * Starting from NVML 5, this API causes NVML to initialize the target GPU + * NVML may initialize additional GPUs if: + * - The target GPU is an SLI slave + * + * Note: New nvmlDeviceGetCount_v2 (default in NVML 5.319) returns count of all devices in the system + * even if nvmlDeviceGetHandleByIndex_v2 returns NVML_ERROR_NO_PERMISSION for such device.
+ * Update your code to handle this error, or use NVML 4.304 or older nvml header file. + * For backward binary compatibility reasons _v1 version of the API is still present in the shared + * library. + * Old _v1 version of nvmlDeviceGetCount doesn't count devices that NVML has no permission to talk to. + * + * This means that nvmlDeviceGetHandleByIndex_v2 and _v1 can return different devices for the same index. + * If you don't touch macros that map old (_v1) versions to _v2 versions at the top of the file you don't + * need to worry about that. + * + * @param index The index of the target GPU, >= 0 and < \a accessibleDevices + * @param device Reference in which to return the device handle + * + * @return + * - \ref NVML_SUCCESS if \a device has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a device is NULL + * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to talk to this device + * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetIndex + * @see nvmlDeviceGetCount + */ +nvmlReturn_t DECLDIR nvmlDeviceGetHandleByIndex(unsigned int index, nvmlDevice_t *device); + +/** + * Acquire the handle for a particular device, based on its board serial number. + * + * For Fermi &tm; or newer fully supported devices. + * + * This number corresponds to the value printed directly on the board, and to the value returned by + * \ref nvmlDeviceGetSerial(). + * + * @deprecated Since more than one GPU can exist on a single board this function is deprecated in favor + * of \ref nvmlDeviceGetHandleByUUID. + * For dual GPU boards this function will return NVML_ERROR_INVALID_ARGUMENT. + * + * Starting from NVML 5, this API causes NVML to initialize the target GPU + * NVML may initialize additional GPUs as it searches for the target GPU + * + * @param serial The board serial number of the target GPU + * @param device Reference in which to return the device handle + * + * @return + * - \ref NVML_SUCCESS if \a device has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a serial is invalid, \a device is NULL or more than one + * device has the same serial (dual GPU boards) + * - \ref NVML_ERROR_NOT_FOUND if \a serial does not match a valid device on the system + * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables + * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs + * - \ref NVML_ERROR_GPU_IS_LOST if any GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetSerial + * @see nvmlDeviceGetHandleByUUID + */ +nvmlReturn_t DECLDIR nvmlDeviceGetHandleBySerial(const char *serial, nvmlDevice_t *device); + +/** + * Acquire the handle for a particular device, based on its globally unique immutable UUID associated with each device. + * + * For all products. 
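+ *
+ * (Editor's illustrative sketch, not part of the original header; the UUID string is a
+ * placeholder and would normally come from an earlier \ref nvmlDeviceGetUUID call.)
+ * \code
+ * nvmlDevice_t device;
+ * nvmlReturn_t st = nvmlDeviceGetHandleByUUID("GPU-...", &device);
+ * if (st == NVML_ERROR_NOT_FOUND)
+ *     ;  // no device with this UUID on the system
+ * \endcode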
+ * + * @param uuid The UUID of the target GPU + * @param device Reference in which to return the device handle + * + * Starting from NVML 5, this API causes NVML to initialize the target GPU + * NVML may initialize additional GPUs as it searches for the target GPU + * + * @return + * - \ref NVML_SUCCESS if \a device has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a uuid is invalid or \a device is null + * - \ref NVML_ERROR_NOT_FOUND if \a uuid does not match a valid device on the system + * - \ref NVML_ERROR_INSUFFICIENT_POWER if any attached devices have improperly attached external power cables + * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs + * - \ref NVML_ERROR_GPU_IS_LOST if any GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetUUID + */ +nvmlReturn_t DECLDIR nvmlDeviceGetHandleByUUID(const char *uuid, nvmlDevice_t *device); + +/** + * Acquire the handle for a particular device, based on its PCI bus id. + * + * For all products. + * + * This value corresponds to the nvmlPciInfo_t::busId returned by \ref nvmlDeviceGetPciInfo(). + * + * Starting from NVML 5, this API causes NVML to initialize the target GPU + * NVML may initialize additional GPUs if: + * - The target GPU is an SLI slave + * + * \note NVML 4.304 and older version of nvmlDeviceGetHandleByPciBusId"_v1" returns NVML_ERROR_NOT_FOUND + * instead of NVML_ERROR_NO_PERMISSION. + * + * @param pciBusId The PCI bus id of the target GPU + * @param device Reference in which to return the device handle + * + * @return + * - \ref NVML_SUCCESS if \a device has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a pciBusId is invalid or \a device is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a pciBusId does not match a valid device on the system + * - \ref NVML_ERROR_INSUFFICIENT_POWER if the attached device has improperly attached external power cables + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to talk to this device + * - \ref NVML_ERROR_IRQ_ISSUE if NVIDIA kernel detected an interrupt issue with the attached GPUs + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetHandleByPciBusId(const char *pciBusId, nvmlDevice_t *device); + +/** + * Retrieves the name of this device. + * + * For all products. + * + * The name is an alphanumeric string that denotes a particular product, e.g. Tesla &tm; C2070. It will not + * exceed 64 characters in length (including the NULL terminator). See \ref + * nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE. 
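+ *
+ * (Editor's illustrative sketch, not part of the original header; assumes a valid
+ * \a device handle.)
+ * \code
+ * char name[NVML_DEVICE_NAME_BUFFER_SIZE];
+ * if (nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE) == NVML_SUCCESS)
+ *     printf("%s\n", name);
+ * \endcode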
+ * + * @param device The identifier of the target device + * @param name Reference in which to return the product name + * @param length The maximum allowed length of the string returned in \a name + * + * @return + * - \ref NVML_SUCCESS if \a name has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a name is NULL + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetName(nvmlDevice_t device, char *name, unsigned int length); + +/** + * Retrieves the brand of this device. + * + * For all products. + * + * The type is a member of \ref nvmlBrandType_t defined above. + * + * @param device The identifier of the target device + * @param type Reference in which to return the product brand type + * + * @return + * - \ref NVML_SUCCESS if \a name has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a type is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceGetBrand(nvmlDevice_t device, nvmlBrandType_t *type); + +/** + * Retrieves the NVML index of this device. + * + * For all products. + * + * Valid indices are derived from the \a accessibleDevices count returned by + * \ref nvmlDeviceGetCount(). For example, if \a accessibleDevices is 2 the valid indices + * are 0 and 1, corresponding to GPU 0 and GPU 1. + * + * The order in which NVML enumerates devices has no guarantees of consistency between reboots. For that reason it + * is recommended that devices be looked up by their PCI ids or GPU UUID. See + * \ref nvmlDeviceGetHandleByPciBusId() and \ref nvmlDeviceGetHandleByUUID(). + * + * Note: The NVML index may not correlate with other APIs, such as the CUDA device index. + * + * @param device The identifier of the target device + * @param index Reference in which to return the NVML index of the device + * + * @return + * - \ref NVML_SUCCESS if \a index has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a index is NULL + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetHandleByIndex() + * @see nvmlDeviceGetCount() + */ +nvmlReturn_t DECLDIR nvmlDeviceGetIndex(nvmlDevice_t device, unsigned int *index); + +/** + * Retrieves the globally unique board serial number associated with this device's board. + * + * For all products with an inforom. + * + * The serial number is an alphanumeric string that will not exceed 30 characters (including the NULL terminator). + * This number matches the serial number tag that is physically attached to the board. See \ref + * nvmlConstants::NVML_DEVICE_SERIAL_BUFFER_SIZE. 
+ *
+ * @param device The identifier of the target device
+ * @param serial Reference in which to return the board/module serial number
+ * @param length The maximum allowed length of the string returned in \a serial
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a serial has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a serial is NULL
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetSerial(nvmlDevice_t device, char *serial, unsigned int length);
+
+/**
+ * Retrieves an array of unsigned ints (sized to cpuSetSize) of bitmasks with the ideal CPU affinity for the device.
+ * For example, if processors 0, 1, 32, and 33 are ideal for the device and cpuSetSize == 2,
+ * result[0] = 0x3, result[1] = 0x3
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ * Supported on Linux only.
+ *
+ * @param device The identifier of the target device
+ * @param cpuSetSize The size of the cpuSet array that is safe to access
+ * @param cpuSet Array reference in which to return a bitmask of CPUs, 64 CPUs per
+ * unsigned long on 64-bit machines, 32 on 32-bit machines
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a cpuSet has been filled
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, cpuSetSize == 0, or cpuSet is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetCpuAffinity(nvmlDevice_t device, unsigned int cpuSetSize, unsigned long *cpuSet);
+
+/**
+ * Sets the ideal affinity for the calling thread and device using the guidelines
+ * given in nvmlDeviceGetCpuAffinity(). Note, this is a change as of version 8.0.
+ * Older versions set the affinity for a calling process and all children.
+ * Currently supports up to 64 processors.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ * Supported on Linux only.
+ *
+ * @param device The identifier of the target device
+ *
+ * @return
+ * - \ref NVML_SUCCESS if the calling process has been successfully bound
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetCpuAffinity(nvmlDevice_t device);
+
+/**
+ * Clear all affinity bindings for the calling thread. Note, this is a change as of version
+ * 8.0 as older versions cleared the affinity for a calling process and all children.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ * Supported on Linux only.
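+ *
+ * A typical flow pins the calling thread before launching GPU work and clears
+ * the binding afterwards (illustrative sketch; error handling elided and the
+ * workload itself is a placeholder):
+ * \code
+ * if (nvmlDeviceSetCpuAffinity(device) == NVML_SUCCESS) {
+ *     // ... run latency-sensitive work against this GPU ...
+ *     nvmlDeviceClearCpuAffinity(device);
+ * }
+ * \endcode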
+ *
+ * @param device The identifier of the target device
+ *
+ * @return
+ * - \ref NVML_SUCCESS if the calling process has been successfully unbound
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceClearCpuAffinity(nvmlDevice_t device);
+
+/**
+ * Retrieve the common ancestor for two devices.
+ * For all products.
+ * Supported on Linux only.
+ *
+ * @param device1 The identifier of the first device
+ * @param device2 The identifier of the second device
+ * @param pathInfo A \ref nvmlGpuTopologyLevel_t that gives the path type
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a pathInfo has been set
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1, or \a device2 is invalid, or \a pathInfo is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature
+ * - \ref NVML_ERROR_UNKNOWN if an error occurred in underlying topology discovery
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetTopologyCommonAncestor(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuTopologyLevel_t *pathInfo);
+
+/**
+ * Retrieve the set of GPUs that are nearest to a given device at a specific interconnectivity level.
+ * For all products.
+ * Supported on Linux only.
+ *
+ * @param device The identifier of the first device
+ * @param level The \ref nvmlGpuTopologyLevel_t level to search for other GPUs
+ * @param count When zero, is set to the number of matching GPUs such that \a deviceArray
+ * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count
+ * number of device handles.
+ * @param deviceArray An array of device handles for GPUs found at \a level
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device, \a level, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature
+ * - \ref NVML_ERROR_UNKNOWN if an error occurred in underlying topology discovery
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetTopologyNearestGpus(nvmlDevice_t device, nvmlGpuTopologyLevel_t level, unsigned int *count, nvmlDevice_t *deviceArray);
+
+/**
+ * Retrieve the set of GPUs that have a CPU affinity with the given CPU number.
+ * For all products.
+ * Supported on Linux only.
+ *
+ * @param cpuNumber The CPU number
+ * @param count When zero, is set to the number of matching GPUs such that \a deviceArray
+ * can be malloc'd. When non-zero, \a deviceArray will be filled with \a count
+ * number of device handles.
+ * @param deviceArray An array of device handles for GPUs found with affinity to \a cpuNumber
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a deviceArray or \a count (if initially zero) has been set
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a cpuNumber, or \a count is invalid, or \a deviceArray is NULL with a non-zero \a count
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device or OS does not support this feature
+ * - \ref NVML_ERROR_UNKNOWN if an error occurred in underlying topology discovery
+ */
+nvmlReturn_t DECLDIR nvmlSystemGetTopologyGpuSet(unsigned int cpuNumber, unsigned int *count, nvmlDevice_t *deviceArray);
+
+/**
+ * Retrieve the status for a given p2p capability index between a given pair of GPUs.
+ *
+ * @param device1 The first device
+ * @param device2 The second device
+ * @param p2pIndex p2p Capability Index being looked for between \a device1 and \a device2
+ * @param p2pStatus Reference in which to return the status of the \a p2pIndex
+ * between \a device1 and \a device2
+ * @return
+ * - \ref NVML_SUCCESS if \a p2pStatus has been populated
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device1 or \a device2 or \a p2pIndex is invalid or \a p2pStatus is NULL
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetP2PStatus(nvmlDevice_t device1, nvmlDevice_t device2, nvmlGpuP2PCapsIndex_t p2pIndex, nvmlGpuP2PStatus_t *p2pStatus);
+
+/**
+ * Retrieves the globally unique immutable UUID associated with this device, as a 5-part hexadecimal string,
+ * that augments the immutable, board serial identifier.
+ *
+ * For all products.
+ *
+ * The UUID is a globally unique identifier. It is the only available identifier for pre-Fermi-architecture products.
+ * It does NOT correspond to any identifier printed on the board. It will not exceed 80 characters in length
+ * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE.
+ *
+ * @param device The identifier of the target device
+ * @param uuid Reference in which to return the GPU UUID
+ * @param length The maximum allowed length of the string returned in \a uuid
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a uuid has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a uuid is NULL
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetUUID(nvmlDevice_t device, char *uuid, unsigned int length);
+
+/**
+ * Retrieves the minor number for the device. The minor number for the device is such that the Nvidia device node file for
+ * each GPU will have the form /dev/nvidia[minor number].
+ *
+ * For all products.
+ * Supported only for Linux.
+ *
+ * @param device The identifier of the target device
+ * @param minorNumber Reference in which to return the minor number for the device
+ * @return
+ * - \ref NVML_SUCCESS if the minor number is successfully retrieved
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minorNumber is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if this query is not supported by the device
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetMinorNumber(nvmlDevice_t device, unsigned int *minorNumber);
+
+/**
+ * Retrieves the device board part number which is programmed into the board's InfoROM.
+ *
+ * For all products.
+ *
+ * @param device Identifier of the target device
+ * @param partNumber Reference in which to return the board part number
+ * @param length Length of the \a partNumber buffer
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a partNumber has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the needed VBIOS fields have not been filled
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a partNumber is NULL
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetBoardPartNumber(nvmlDevice_t device, char* partNumber, unsigned int length);
+
+/**
+ * Retrieves the version information for the device's infoROM object.
+ *
+ * For all products with an inforom.
+ *
+ * Fermi and higher parts have non-volatile on-board memory for persisting device info, such as aggregate
+ * ECC counts. The version of the data structures in this memory may change from time to time. It will not
+ * exceed 16 characters in length (including the NULL terminator).
+ * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE.
+ *
+ * See \ref nvmlInforomObject_t for details on the available infoROM objects.
+ *
+ * @param device The identifier of the target device
+ * @param object The target infoROM object
+ * @param version Reference in which to return the infoROM version
+ * @param length The maximum allowed length of the string returned in \a version
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a version has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceGetInforomImageVersion
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetInforomVersion(nvmlDevice_t device, nvmlInforomObject_t object, char *version, unsigned int length);
+
+/**
+ * Retrieves the global infoROM image version.
+ *
+ * For all products with an inforom.
+ *
+ * Image version, just like VBIOS version, uniquely describes the exact version of the infoROM flashed on the board,
+ * in contrast to infoROM object version, which is only an indicator of supported features.
+ * Version string will not exceed 16 characters in length (including the NULL terminator).
+ * See \ref nvmlConstants::NVML_DEVICE_INFOROM_VERSION_BUFFER_SIZE.
+ *
+ * @param device The identifier of the target device
+ * @param version Reference in which to return the infoROM image version
+ * @param length The maximum allowed length of the string returned in \a version
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a version has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a version is NULL
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have an infoROM
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceGetInforomVersion
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetInforomImageVersion(nvmlDevice_t device, char *version, unsigned int length);
+
+/**
+ * Retrieves the checksum of the configuration stored in the device's infoROM.
+ *
+ * For all products with an inforom.
+ *
+ * Can be used to make sure that two GPUs have the exact same configuration.
+ * The current checksum takes into account configuration stored in PWR and ECC infoROM objects.
+ * The checksum can change between driver releases or when the user changes configuration (e.g. disables/enables ECC).
+ *
+ * @param device The identifier of the target device
+ * @param checksum Reference in which to return the infoROM configuration checksum
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a checksum has been set
+ * - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's checksum couldn't be retrieved due to infoROM corruption
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a checksum is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetInforomConfigurationChecksum(nvmlDevice_t device, unsigned int *checksum);
+
+/**
+ * Reads the infoROM from the flash and verifies the checksums.
+ *
+ * For all products with an inforom.
+ *
+ * @param device The identifier of the target device
+ *
+ * @return
+ * - \ref NVML_SUCCESS if infoROM is not corrupted
+ * - \ref NVML_ERROR_CORRUPTED_INFOROM if the device's infoROM is corrupted
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceValidateInforom(nvmlDevice_t device);
+
+/**
+ * Retrieves the display mode for the device.
+ *
+ * For all products.
+ *
+ * This method indicates whether a physical display (e.g. monitor) is currently connected to
+ * any of the device's connectors.
+ *
+ * See \ref nvmlEnableState_t for details on allowed modes.
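+ *
+ * A minimal usage sketch (illustrative; assumes a valid \a device handle and
+ * that NVML_FEATURE_ENABLED is the \ref nvmlEnableState_t "enabled" value):
+ * \code
+ * nvmlEnableState_t display;
+ * if (nvmlDeviceGetDisplayMode(device, &display) == NVML_SUCCESS)
+ *     printf("Physical display attached: %s\n",
+ *            display == NVML_FEATURE_ENABLED ? "yes" : "no");
+ * \endcode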
+ *
+ * @param device The identifier of the target device
+ * @param display Reference in which to return the display mode
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a display has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a display is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetDisplayMode(nvmlDevice_t device, nvmlEnableState_t *display);
+
+/**
+ * Retrieves the display active state for the device.
+ *
+ * For all products.
+ *
+ * This method indicates whether a display is initialized on the device.
+ * For example, whether an X Server is attached to this device and has allocated memory for the screen.
+ *
+ * Display can be active even when no monitor is physically attached.
+ *
+ * See \ref nvmlEnableState_t for details on allowed modes.
+ *
+ * @param device The identifier of the target device
+ * @param isActive Reference in which to return the display active state
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a isActive has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isActive is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetDisplayActive(nvmlDevice_t device, nvmlEnableState_t *isActive);
+
+/**
+ * Retrieves the persistence mode associated with this device.
+ *
+ * For all products.
+ * For Linux only.
+ *
+ * When driver persistence mode is enabled the driver software state is not torn down when the last
+ * client disconnects. By default this feature is disabled.
+ *
+ * See \ref nvmlEnableState_t for details on allowed modes.
+ *
+ * @param device The identifier of the target device
+ * @param mode Reference in which to return the current driver persistence mode
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a mode has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceSetPersistenceMode()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t *mode);
+
+/**
+ * Retrieves the PCI attributes of this device.
+ *
+ * For all products.
+ *
+ * See \ref nvmlPciInfo_t for details on the available PCI info.
+ *
+ * NOTE: If you are linking against a driver earlier than r384.40, then nvmlDeviceGetPciInfo_v2 must be used. That
+ * API does not populate pci->busId; pci->busIdLegacy will be populated for both nvmlDeviceGetPciInfo and
+ * nvmlDeviceGetPciInfo_v2.
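+ *
+ * A minimal usage sketch (illustrative; assumes a valid \a device handle):
+ * \code
+ * nvmlPciInfo_t pci;
+ * // busId is the string form also accepted by nvmlDeviceGetHandleByPciBusId()
+ * if (nvmlDeviceGetPciInfo(device, &pci) == NVML_SUCCESS)
+ *     printf("PCI bus id: %s\n", pci.busId);
+ * \endcode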
+ *
+ * @param device The identifier of the target device
+ * @param pci Reference in which to return the PCI info
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a pci has been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pci is NULL
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t *pci);
+nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo_v2(nvmlDevice_t device, nvmlPciInfo_t *pci);
+
+/**
+ * Retrieves the maximum PCIe link generation possible with this device and system.
+ *
+ * I.e., for a generation 2 PCIe device attached to a generation 1 PCIe bus, the max link generation this function will
+ * report is generation 1.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param maxLinkGen Reference in which to return the max PCIe link generation
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a maxLinkGen has been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkGen is null
+ * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkGeneration(nvmlDevice_t device, unsigned int *maxLinkGen);
+
+/**
+ * Retrieves the maximum PCIe link width possible with this device and system.
+ *
+ * I.e., for a device with a 16x PCIe bus width attached to an 8x PCIe system bus, this function will report
+ * a max link width of 8.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param maxLinkWidth Reference in which to return the max PCIe link width
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a maxLinkWidth has been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a maxLinkWidth is null
+ * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetMaxPcieLinkWidth(nvmlDevice_t device, unsigned int *maxLinkWidth);
+
+/**
+ * Retrieves the current PCIe link generation.
+ *
+ * For Fermi &tm; or newer fully supported devices.
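+ *
+ * The current generation and width are often read together (illustrative
+ * sketch; \ref nvmlDeviceGetCurrPcieLinkWidth is declared just below):
+ * \code
+ * unsigned int gen = 0, width = 0;
+ * if (nvmlDeviceGetCurrPcieLinkGeneration(device, &gen) == NVML_SUCCESS &&
+ *     nvmlDeviceGetCurrPcieLinkWidth(device, &width) == NVML_SUCCESS)
+ *     printf("PCIe link: gen %u, x%u\n", gen, width);
+ * \endcode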
+ *
+ * @param device The identifier of the target device
+ * @param currLinkGen Reference in which to return the current PCIe link generation
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a currLinkGen has been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkGen is null
+ * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkGeneration(nvmlDevice_t device, unsigned int *currLinkGen);
+
+/**
+ * Retrieves the current PCIe link width.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param currLinkWidth Reference in which to return the current PCIe link width
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a currLinkWidth has been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a currLinkWidth is null
+ * - \ref NVML_ERROR_NOT_SUPPORTED if PCIe link information is not available
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetCurrPcieLinkWidth(nvmlDevice_t device, unsigned int *currLinkWidth);
+
+/**
+ * Retrieve PCIe utilization information.
+ * This function queries a byte counter over a 20ms interval, so the value returned is the PCIe
+ * throughput over that interval.
+ *
+ * For Maxwell &tm; or newer fully supported devices.
+ *
+ * This method is not supported in virtual machines running virtual GPU (vGPU).
+ *
+ * @param device The identifier of the target device
+ * @param counter The specific counter that should be queried; see \ref nvmlPcieUtilCounter_t
+ * @param value Reference in which to return throughput in KB/s
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a value has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device or \a counter is invalid, or \a value is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPcieThroughput(nvmlDevice_t device, nvmlPcieUtilCounter_t counter, unsigned int *value);
+
+/**
+ * Retrieve the PCIe replay counter.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param value Reference in which to return the counter's value
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a value has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a value is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPcieReplayCounter(nvmlDevice_t device, unsigned int *value);
+
+/**
+ * Retrieves the current clock speeds for the device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * See \ref nvmlClockType_t for details on available clock information.
+ *
+ * @param device The identifier of the target device
+ * @param type Identify which clock domain to query
+ * @param clock Reference in which to return the clock speed in MHz
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a clock has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device cannot report the specified clock
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
+
+/**
+ * Retrieves the maximum clock speeds for the device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * See \ref nvmlClockType_t for details on available clock information.
+ *
+ * \note On GPUs from the Fermi family, current P0 clocks (reported by \ref nvmlDeviceGetClockInfo) can differ from max clocks
+ * by a few MHz.
+ *
+ * @param device The identifier of the target device
+ * @param type Identify which clock domain to query
+ * @param clock Reference in which to return the clock speed in MHz
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a clock has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device cannot report the specified clock
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
+
+/**
+ * Retrieves the current setting of a clock that applications will use unless an overspec situation occurs.
+ * Can be changed using \ref nvmlDeviceSetApplicationsClocks.
+ *
+ * For Kepler &tm; or newer fully supported devices.
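+ *
+ * A minimal usage sketch (illustrative; NVML_CLOCK_GRAPHICS is a member of
+ * \ref nvmlClockType_t and \a device is assumed to be a valid handle):
+ * \code
+ * unsigned int appGfxMHz = 0;
+ * // Query the applications clock for the graphics domain
+ * if (nvmlDeviceGetApplicationsClock(device, NVML_CLOCK_GRAPHICS, &appGfxMHz) == NVML_SUCCESS)
+ *     printf("Applications graphics clock: %u MHz\n", appGfxMHz);
+ * \endcode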
+ *
+ * @param device The identifier of the target device
+ * @param clockType Identify which clock domain to query
+ * @param clockMHz Reference in which to return the clock in MHz
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a clockMHz has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz);
+
+/**
+ * Retrieves the default applications clock that GPU boots with or
+ * defaults to after \ref nvmlDeviceResetApplicationsClocks call.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param clockType Identify which clock domain to query
+ * @param clockMHz Reference in which to return the default clock in MHz
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a clockMHz has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * \see nvmlDeviceGetApplicationsClock
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetDefaultApplicationsClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz);
+
+/**
+ * Resets the application clock to the default value.
+ *
+ * This is the applications clock that will be used after system reboot or driver reload.
+ * The default value is constant, but the current value can be changed using \ref nvmlDeviceSetApplicationsClocks.
+ *
+ * On Pascal and newer hardware, if clocks were previously locked with \ref nvmlDeviceSetApplicationsClocks,
+ * this call will unlock clocks. This returns clocks to their default behavior of automatically boosting above
+ * base clocks as thermal limits allow.
+ *
+ * @see nvmlDeviceGetApplicationsClock
+ * @see nvmlDeviceSetApplicationsClocks
+ *
+ * For Fermi &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices.
+ *
+ * @param device The identifier of the target device
+ *
+ * @return
+ * - \ref NVML_SUCCESS if new settings were successfully set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceResetApplicationsClocks(nvmlDevice_t device);
+
+/**
+ * Retrieves the clock speed for the clock specified by the clock type and clock ID.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param clockType Identify which clock domain to query
+ * @param clockId Identify which clock in the domain to query
+ * @param clockMHz Reference in which to return the clock in MHz
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a clockMHz has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetClock(nvmlDevice_t device, nvmlClockType_t clockType, nvmlClockId_t clockId, unsigned int *clockMHz);
+
+/**
+ * Retrieves the customer defined maximum boost clock speed specified by the given clock type.
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param clockType Identify which clock domain to query
+ * @param clockMHz Reference in which to return the clock in MHz
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a clockMHz has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clockMHz is NULL or \a clockType is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device or the \a clockType on this device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetMaxCustomerBoostClock(nvmlDevice_t device, nvmlClockType_t clockType, unsigned int *clockMHz);
+
+/**
+ * Retrieves the list of possible memory clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param count Reference in which to provide the \a clocksMHz array size, and
+ * to return the number of elements
+ * @param clocksMHz Reference in which to return the clocks in MHz
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a count is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to the number of
+ * required elements)
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceSetApplicationsClocks
+ * @see nvmlDeviceGetSupportedGraphicsClocks
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetSupportedMemoryClocks(nvmlDevice_t device, unsigned int *count, unsigned int *clocksMHz);
+
+/**
+ * Retrieves the list of possible graphics clocks that can be used as an argument for \ref nvmlDeviceSetApplicationsClocks.
+ *
+ * For Kepler &tm; or newer fully supported devices.
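+ *
+ * Both enumeration calls share the in/out \a count pattern (illustrative
+ * sketch; the fixed-size buffers and their sizes are assumptions, real code
+ * may size an allocation from a first call that returns
+ * NVML_ERROR_INSUFFICIENT_SIZE):
+ * \code
+ * unsigned int memCount = 32, gfxCount = 128;
+ * unsigned int memMHz[32], gfxMHz[128];
+ * // Enumerate memory clocks, then graphics clocks valid for the first one
+ * if (nvmlDeviceGetSupportedMemoryClocks(device, &memCount, memMHz) == NVML_SUCCESS &&
+ *     memCount > 0 &&
+ *     nvmlDeviceGetSupportedGraphicsClocks(device, memMHz[0], &gfxCount, gfxMHz) == NVML_SUCCESS)
+ *     printf("%u graphics clocks available at %u MHz memory\n", gfxCount, memMHz[0]);
+ * \endcode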
+ *
+ * @param device The identifier of the target device
+ * @param memoryClockMHz Memory clock for which to return possible graphics clocks
+ * @param count Reference in which to provide the \a clocksMHz array size, and
+ * to return the number of elements
+ * @param clocksMHz Reference in which to return the clocks in MHz
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a count and \a clocksMHz have been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_NOT_FOUND if the specified \a memoryClockMHz is not a supported frequency
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clock is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceSetApplicationsClocks
+ * @see nvmlDeviceGetSupportedMemoryClocks
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetSupportedGraphicsClocks(nvmlDevice_t device, unsigned int memoryClockMHz, unsigned int *count, unsigned int *clocksMHz);
+
+/**
+ * Retrieve the current state of Auto Boosted clocks on a device and store it in \a isEnabled.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates
+ * to maximize performance as thermal limits allow.
+ *
+ * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks.
+ * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost
+ * behavior.
+ *
+ * @param device The identifier of the target device
+ * @param isEnabled Where to store the current state of Auto Boosted clocks of the target device
+ * @param defaultIsEnabled Where to store the default Auto Boosted clocks behavior of the target device that the device will
+ * revert to when no applications are using the GPU
+ *
+ * @return
+ * - \ref NVML_SUCCESS If \a isEnabled has been set with the Auto Boosted clocks state of \a device
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a isEnabled is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t *isEnabled, nvmlEnableState_t *defaultIsEnabled);
+
+/**
+ * Try to set the current state of Auto Boosted clocks on a device.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates
+ * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock
+ * rates are desired.
+ *
+ * Non-root users may use this API by default but can be restricted by root from using this API by calling
+ * \ref nvmlDeviceSetAPIRestriction with apiType=NVML_RESTRICTED_API_SET_AUTO_BOOSTED_CLOCKS.
+ * Note: Persistence Mode is required to modify current Auto Boost settings, therefore, it must be enabled.
+ *
+ * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks.
+ * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost
+ * behavior.
+ *
+ * @param device The identifier of the target device
+ * @param enabled What state to try to set Auto Boosted clocks of the target device to
+ *
+ * @return
+ * - \ref NVML_SUCCESS If the Auto Boosted clocks were successfully set to the state specified by \a enabled
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled);
+
+/**
+ * Try to set the default state of Auto Boosted clocks on a device. This is the default state that Auto Boosted clocks will
+ * return to when no compute processes (e.g. CUDA applications which have an active context) are running.
+ *
+ * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices.
+ * Requires root/admin permissions.
+ *
+ * Auto Boosted clocks are enabled by default on some hardware, allowing the GPU to run at higher clock rates
+ * to maximize performance as thermal limits allow. Auto Boosted clocks should be disabled if fixed clock
+ * rates are desired.
+ *
+ * On Pascal and newer hardware, Auto Boosted clocks are controlled through application clocks.
+ * Use \ref nvmlDeviceSetApplicationsClocks and \ref nvmlDeviceResetApplicationsClocks to control Auto Boost
+ * behavior.
+ *
+ * @param device The identifier of the target device
+ * @param enabled What state to try to set default Auto Boosted clocks of the target device to
+ * @param flags Flags that change the default behavior. Currently unused.
+ *
+ * @return
+ * - \ref NVML_SUCCESS If the Auto Boosted clock's default state was successfully set to the state specified by \a enabled
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_NO_PERMISSION If the calling user does not have permission to change Auto Boosted clock's default state.
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support Auto Boosted clocks
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetDefaultAutoBoostedClocksEnabled(nvmlDevice_t device, nvmlEnableState_t enabled, unsigned int flags);
+
+
+/**
+ * Retrieves the intended operating speed of the device's fan.
+ *
+ * Note: The reported speed is the intended fan speed. If the fan is physically blocked and unable to spin, the
+ * output will not match the actual fan speed.
+ *
+ * For all discrete products with dedicated fans.
+ *
+ * The fan speed is expressed as a percent of the maximum, i.e. full speed is 100%.
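+ *
+ * A minimal usage sketch (illustrative; assumes a valid \a device handle):
+ * \code
+ * unsigned int speed = 0;
+ * // Intended fan speed as a percentage of the maximum
+ * if (nvmlDeviceGetFanSpeed(device, &speed) == NVML_SUCCESS)
+ *     printf("Fan: %u%%\n", speed);
+ * \endcode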
+ *
+ * @param device The identifier of the target device
+ * @param speed Reference in which to return the fan speed percentage
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a speed has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a speed is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a fan
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetFanSpeed(nvmlDevice_t device, unsigned int *speed);
+
+/**
+ * Retrieves the current temperature readings for the device, in degrees C.
+ *
+ * For all products.
+ *
+ * See \ref nvmlTemperatureSensors_t for details on available temperature sensors.
+ *
+ * @param device The identifier of the target device
+ * @param sensorType Flag that indicates which sensor reading to retrieve
+ * @param temp Reference in which to return the temperature reading
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a temp has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a sensorType is invalid or \a temp is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have the specified sensor
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetTemperature(nvmlDevice_t device, nvmlTemperatureSensors_t sensorType, unsigned int *temp);
+
+/**
+ * Retrieves the temperature threshold for the GPU with the specified threshold type in degrees C.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * See \ref nvmlTemperatureThresholds_t for details on available temperature thresholds.
+ *
+ * @param device The identifier of the target device
+ * @param thresholdType The type of threshold value queried
+ * @param temp Reference in which to return the temperature reading
+ * @return
+ * - \ref NVML_SUCCESS if \a temp has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a thresholdType is invalid or \a temp is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not have a temperature sensor or is unsupported
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetTemperatureThreshold(nvmlDevice_t device, nvmlTemperatureThresholds_t thresholdType, unsigned int *temp);
+
+/**
+ * Retrieves the current performance state for the device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * See \ref nvmlPstates_t for details on allowed performance states.
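+ *
+ * A minimal usage sketch (illustrative; \ref nvmlPstates_t values map to
+ * P0..P15, so the cast below prints the P-number directly):
+ * \code
+ * nvmlPstates_t pState;
+ * if (nvmlDeviceGetPerformanceState(device, &pState) == NVML_SUCCESS)
+ *     printf("Performance state: P%d\n", (int)pState);
+ * \endcode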
+ *
+ * @param device The identifier of the target device
+ * @param pState Reference in which to return the performance state reading
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a pState has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pState is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPerformanceState(nvmlDevice_t device, nvmlPstates_t *pState);
+
+/**
+ * Retrieves current clocks throttling reasons.
+ *
+ * For all fully supported products.
+ *
+ * \note More than one bit can be enabled at the same time. Multiple reasons can be affecting clocks at once.
+ *
+ * @param device The identifier of the target device
+ * @param clocksThrottleReasons Reference in which to return bitmask of active clocks throttle
+ * reasons
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a clocksThrottleReasons has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a clocksThrottleReasons is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlClocksThrottleReasons
+ * @see nvmlDeviceGetSupportedClocksThrottleReasons
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetCurrentClocksThrottleReasons(nvmlDevice_t device, unsigned long long *clocksThrottleReasons);
+
+/**
+ * Retrieves bitmask of supported clocks throttle reasons that can be returned by
+ * \ref nvmlDeviceGetCurrentClocksThrottleReasons
+ *
+ * For all fully supported products.
+ *
+ * This method is not supported in virtual machines running virtual GPU (vGPU).
+ *
+ * @param device The identifier of the target device
+ * @param supportedClocksThrottleReasons Reference in which to return bitmask of supported
+ * clocks throttle reasons
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a supportedClocksThrottleReasons has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a supportedClocksThrottleReasons is NULL
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlClocksThrottleReasons
+ * @see nvmlDeviceGetCurrentClocksThrottleReasons
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetSupportedClocksThrottleReasons(nvmlDevice_t device, unsigned long long *supportedClocksThrottleReasons);
+
+/**
+ * Deprecated: Use \ref nvmlDeviceGetPerformanceState. This function exposes an incorrect generalization.
+ *
+ * Retrieve the current performance state for the device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * See \ref nvmlPstates_t for details on allowed performance states.
+ *
+ * @param device The identifier of the target device
+ * @param pState Reference in which to return the performance state reading
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a pState has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a pState is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPowerState(nvmlDevice_t device, nvmlPstates_t *pState);
+
+/**
+ * This API has been deprecated.
+ *
+ * Retrieves the power management mode associated with this device.
+ *
+ * For products from the Fermi family.
+ * - Requires \a NVML_INFOROM_POWER version 3.0 or higher.
+ *
+ * For products from the Kepler or newer families.
+ * - Does not require \a NVML_INFOROM_POWER object.
+ *
+ * This flag indicates whether any power management algorithm is currently active on the device. An
+ * enabled state does not necessarily mean the device is being actively throttled -- only that
+ * the driver will do so if the appropriate conditions are met.
+ *
+ * See \ref nvmlEnableState_t for details on allowed modes.
+ *
+ * @param device The identifier of the target device
+ * @param mode Reference in which to return the current power management mode
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a mode has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementMode(nvmlDevice_t device, nvmlEnableState_t *mode);
+
+/**
+ * Retrieves the power management limit associated with this device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * The power limit defines the upper boundary for the card's power draw. If
+ * the card's total power draw reaches this limit the power management algorithm kicks in.
+ *
+ * This reading is only available if power management mode is supported.
+ * See \ref nvmlDeviceGetPowerManagementMode.
+ *
+ * @param device The identifier of the target device
+ * @param limit Reference in which to return the power management limit in milliwatts
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a limit has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimit(nvmlDevice_t device, unsigned int *limit);
+
+/**
+ * Retrieves information about possible values of power management limits on this device.
+ *
+ * For Kepler &tm; or newer fully supported devices.
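+ *
+ * A minimal usage sketch (illustrative; both values are reported in
+ * milliwatts, matching the parameter documentation below):
+ * \code
+ * unsigned int minLimit = 0, maxLimit = 0;
+ * if (nvmlDeviceGetPowerManagementLimitConstraints(device, &minLimit, &maxLimit) == NVML_SUCCESS)
+ *     printf("Valid power limits: %u mW .. %u mW\n", minLimit, maxLimit);
+ * \endcode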
+ *
+ * @param device The identifier of the target device
+ * @param minLimit Reference in which to return the minimum power management limit in milliwatts
+ * @param maxLimit Reference in which to return the maximum power management limit in milliwatts
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a minLimit and \a maxLimit have been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minLimit or \a maxLimit is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlDeviceSetPowerManagementLimit
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementLimitConstraints(nvmlDevice_t device, unsigned int *minLimit, unsigned int *maxLimit);
+
+/**
+ * Retrieves default power management limit on this device, in milliwatts.
+ * Default power management limit is a power management limit that the device boots with.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param defaultLimit Reference in which to return the default power management limit in milliwatts
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a defaultLimit has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a defaultLimit is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPowerManagementDefaultLimit(nvmlDevice_t device, unsigned int *defaultLimit);
+
+/**
+ * Retrieves power usage for this GPU and its associated circuitry (e.g. memory), in milliwatts.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * On Fermi and Kepler GPUs the reading is accurate to within +/- 5% of current power draw.
+ *
+ * It is only available if power management mode is supported. See \ref nvmlDeviceGetPowerManagementMode.
+ *
+ * @param device The identifier of the target device
+ * @param power Reference in which to return the power usage information
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a power has been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a power is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support power readings
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetPowerUsage(nvmlDevice_t device, unsigned int *power);
+
+/**
+ * Retrieves total energy consumption for this GPU in millijoules (mJ) since the driver was last reloaded.
+ *
+ * For newer than Pascal &tm; fully supported devices.
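+ *
+ * Because the counter is cumulative, the energy used by a workload can be
+ * estimated by sampling it twice (illustrative sketch; error handling elided
+ * and the workload itself is a placeholder):
+ * \code
+ * unsigned long long beforeMj = 0, afterMj = 0;
+ * nvmlDeviceGetTotalEnergyConsumption(device, &beforeMj);
+ * // ... run the workload ...
+ * nvmlDeviceGetTotalEnergyConsumption(device, &afterMj);
+ * printf("Workload consumed ~%llu mJ\n", afterMj - beforeMj);
+ * \endcode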
+ *
+ * @param device The identifier of the target device
+ * @param energy Reference in which to return the energy consumption information
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a energy has been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a energy is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support energy readings
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetTotalEnergyConsumption(nvmlDevice_t device, unsigned long long *energy);
+
+/**
+ * Get the effective power limit that the driver enforces after taking into account all limiters.
+ *
+ * Note: This can be different from the \ref nvmlDeviceGetPowerManagementLimit if other limits are set elsewhere.
+ * This includes the out of band power limit interface.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device The device to communicate with
+ * @param limit Reference in which to return the power management limit in milliwatts
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a limit has been set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a limit is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetEnforcedPowerLimit(nvmlDevice_t device, unsigned int *limit);
+
+/**
+ * Retrieves the current GOM and pending GOM (the one that the GPU will switch to after reboot).
+ *
+ * For GK110 M-class and X-class Tesla &tm; products from the Kepler family.
+ * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products.
+ * Not supported on Quadro &reg; and Tesla &tm; C-class products.
+ *
+ * @param device The identifier of the target device
+ * @param current Reference in which to return the current GOM
+ * @param pending Reference in which to return the pending GOM
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a current and \a pending have been populated
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a current or \a pending is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ *
+ * @see nvmlGpuOperationMode_t
+ * @see nvmlDeviceSetGpuOperationMode
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t *current, nvmlGpuOperationMode_t *pending);
+
+/**
+ * Retrieves the amount of used, free and total memory available on the device, in bytes.
+ *
+ * For all products.
+ *
+ * Enabling ECC reduces the amount of total available memory, due to the extra required parity bits.
+ * Under WDDM most device memory is allocated and managed on startup by Windows.
+ *
+ * Under Linux and Windows TCC, the reported amount of used memory is equal to the sum of memory allocated
+ * by all active channels on the device.
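+ *
+ * A minimal usage sketch (illustrative; the \ref nvmlMemory_t fields used
+ * below are the byte counts this function documents):
+ * \code
+ * nvmlMemory_t memory;
+ * if (nvmlDeviceGetMemoryInfo(device, &memory) == NVML_SUCCESS)
+ *     printf("Used %llu of %llu bytes\n", memory.used, memory.total);
+ * \endcode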
+ *
+ * See \ref nvmlMemory_t for details on available memory info.
+ *
+ * @param device                               The identifier of the target device
+ * @param memory                               Reference in which to return the memory information
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a memory has been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a memory is NULL
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetMemoryInfo(nvmlDevice_t device, nvmlMemory_t *memory);
+
+/**
+ * Retrieves the current compute mode for the device.
+ *
+ * For all products.
+ *
+ * See \ref nvmlComputeMode_t for details on allowed compute modes.
+ *
+ * @param device                               The identifier of the target device
+ * @param mode                                 Reference in which to return the current compute mode
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a mode has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a mode is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceSetComputeMode()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetComputeMode(nvmlDevice_t device, nvmlComputeMode_t *mode);
+
+/**
+ * Retrieves the CUDA compute capability of the device.
+ *
+ * For all products.
+ *
+ * Returns the major and minor compute capability version numbers of the
+ * device. The major and minor versions are equivalent to the
+ * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR and
+ * CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR attributes that would be
+ * returned by CUDA's cuDeviceGetAttribute().
+ *
+ * @param device                               The identifier of the target device
+ * @param major                                Reference in which to return the major CUDA compute capability
+ * @param minor                                Reference in which to return the minor CUDA compute capability
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a major and \a minor have been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a major or \a minor is NULL
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetCudaComputeCapability(nvmlDevice_t device, int *major, int *minor);
+
+/**
+ * Retrieves the current and pending ECC modes for the device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ * Only applicable to devices with ECC.
+ * Requires \a NVML_INFOROM_ECC version 1.0 or higher.
+ *
+ * Changing ECC modes requires a reboot. The "pending" ECC mode refers to the target mode following
+ * the next reboot.
+ *
+ * See \ref nvmlEnableState_t for details on allowed modes.
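+ *
+ * A minimal sketch of detecting a pending ECC mode change (illustrative only;
+ * assumes a valid handle \a dev to an ECC-capable device):
+ * @code
+ * nvmlEnableState_t current, pending;
+ * if (nvmlDeviceGetEccMode(dev, &current, &pending) == NVML_SUCCESS
+ *     && current != pending)
+ *     printf("ECC mode will change on the next reboot\n");
+ * @endcode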
+ *
+ * @param device                               The identifier of the target device
+ * @param current                              Reference in which to return the current ECC mode
+ * @param pending                              Reference in which to return the pending ECC mode
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a current and \a pending have been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or either \a current or \a pending is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceSetEccMode()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetEccMode(nvmlDevice_t device, nvmlEnableState_t *current, nvmlEnableState_t *pending);
+
+/**
+ * Retrieves the device boardId, ranging from 0 to N.
+ * Devices with the same boardId indicate GPUs connected to the same PLX. Use in conjunction with
+ * \ref nvmlDeviceGetMultiGpuBoard() to decide if they are on the same board as well.
+ * The boardId returned is a unique ID for the current configuration. Uniqueness and ordering across
+ * reboots and system configurations is not guaranteed (i.e. if a Tesla K40c returns 0x100 and
+ * the two GPUs on a Tesla K10 in the same system return 0x200, it is not guaranteed they will
+ * always return those values, but they will always be different from each other).
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param boardId                              Reference in which to return the device's board ID
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a boardId has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a boardId is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetBoardId(nvmlDevice_t device, unsigned int *boardId);
+
+/**
+ * Retrieves whether the device is on a Multi-GPU board.
+ * Devices that are on multi-GPU boards will set \a multiGpuBool to a non-zero value.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param multiGpuBool                         Reference in which to return a zero or non-zero value
+ *                                             to indicate whether the device is on a multi GPU board
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a multiGpuBool has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a multiGpuBool is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetMultiGpuBoard(nvmlDevice_t device, unsigned int *multiGpuBool);
+
+/**
+ * Retrieves the total ECC error counts for the device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ * Only applicable to devices with ECC.
+ * Requires \a NVML_INFOROM_ECC version 1.0 or higher.
+ * Requires ECC Mode to be enabled.
+ *
+ * The total error count is the sum of errors across each of the separate memory systems, i.e. the total set of
+ * errors across the entire device.
+ *
+ * See \ref nvmlMemoryErrorType_t for a description of available error types.\n
+ * See \ref nvmlEccCounterType_t for a description of available counter types.
+ *
+ * @param device                               The identifier of the target device
+ * @param errorType                            Flag that specifies the type of the errors.
+ * @param counterType                          Flag that specifies the counter-type of the errors.
+ * @param eccCounts                            Reference in which to return the specified ECC errors
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a eccCounts has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceClearEccErrorCounts()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetTotalEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, unsigned long long *eccCounts);
+
+/**
+ * Retrieves the detailed ECC error counts for the device.
+ *
+ * @deprecated This API supports only a fixed set of ECC error locations.
+ *             On different GPU architectures different locations are supported.
+ *             See \ref nvmlDeviceGetMemoryErrorCounter.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ * Only applicable to devices with ECC.
+ * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based ECC counts.
+ * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other ECC counts.
+ * Requires ECC Mode to be enabled.
+ *
+ * Detailed errors provide separate ECC counts for specific parts of the memory system.
+ *
+ * Reports zero for unsupported ECC error counters when a subset of ECC error counters are supported.
+ *
+ * See \ref nvmlMemoryErrorType_t for a description of available memory error types.\n
+ * See \ref nvmlEccCounterType_t for a description of available counter types.\n
+ * See \ref nvmlEccErrorCounts_t for a description of provided detailed ECC counts.
+ *
+ * @param device                               The identifier of the target device
+ * @param errorType                            Flag that specifies the type of the errors.
+ * @param counterType                          Flag that specifies the counter-type of the errors.
+ * @param eccCounts                            Reference in which to return the specified ECC errors
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a eccCounts has been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a errorType or \a counterType is invalid, or \a eccCounts is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceClearEccErrorCounts()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetDetailedEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, nvmlEccErrorCounts_t *eccCounts);
+
+/**
+ * Retrieves the requested memory error counter for the device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ * Requires \a NVML_INFOROM_ECC version 2.0 or higher to report aggregate location-based memory error counts.
+ * Requires \a NVML_INFOROM_ECC version 1.0 or higher to report all other memory error counts.
+ *
+ * Only applicable to devices with ECC.
+ *
+ * Requires ECC Mode to be enabled.
+ *
+ * See \ref nvmlMemoryErrorType_t for a description of available memory error types.\n
+ * See \ref nvmlEccCounterType_t for a description of available counter types.\n
+ * See \ref nvmlMemoryLocation_t for a description of available counter locations.\n
+ *
+ * @param device                               The identifier of the target device
+ * @param errorType                            Flag that specifies the type of error.
+ * @param counterType                          Flag that specifies the counter-type of the errors.
+ * @param locationType                         Specifies the location of the counter.
+ * @param count                                Reference in which to return the ECC counter
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a count has been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a errorType, \a counterType or \a locationType is
+ *                                             invalid, or \a count is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support ECC error reporting in the specified memory
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetMemoryErrorCounter(nvmlDevice_t device, nvmlMemoryErrorType_t errorType,
+                                                     nvmlEccCounterType_t counterType,
+                                                     nvmlMemoryLocation_t locationType, unsigned long long *count);
+
+/**
+ * Retrieves the current utilization rates for the device's major subsystems.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * See \ref nvmlUtilization_t for details on available utilization rates.
+ *
+ * \note During driver initialization when ECC is enabled one can see high GPU and Memory Utilization readings.
+ *       This is caused by the ECC memory scrubbing mechanism that is performed during driver initialization.
+ *
+ * @param device                               The identifier of the target device
+ * @param utilization                          Reference in which to return the utilization information
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a utilization has been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a utilization is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtilization_t *utilization);
+
+/**
+ * Retrieves the current utilization and sampling period in microseconds for the Encoder
+ *
+ * For Kepler &tm; or newer fully supported devices.
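+ *
+ * A minimal query sketch (illustrative only; assumes a valid device handle
+ * \a dev on a device that exposes an encoder):
+ * @code
+ * unsigned int util, periodUs;
+ * if (nvmlDeviceGetEncoderUtilization(dev, &util, &periodUs) == NVML_SUCCESS)
+ *     printf("encoder: %u%% (sampled every %u us)\n", util, periodUs);
+ * @endcode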
+ *
+ * @param device                               The identifier of the target device
+ * @param utilization                          Reference to an unsigned int for encoder utilization info
+ * @param samplingPeriodUs                     Reference to an unsigned int for the sampling period in microseconds
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a utilization has been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetEncoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs);
+
+/**
+ * Retrieves the current capacity of the device's encoder, in macroblocks per second.
+ *
+ * For Maxwell &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param encoderQueryType                     Type of encoder to query
+ * @param encoderCapacity                      Reference to an unsigned int for the encoder capacity
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a encoderCapacity is fetched
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a encoderCapacity is NULL, or \a device or \a encoderQueryType
+ *                                             are invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support the encoder specified in \a encoderQueryType
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetEncoderCapacity (nvmlDevice_t device, nvmlEncoderType_t encoderQueryType, unsigned int *encoderCapacity);
+
+/**
+ * Retrieves the current encoder statistics for a given device.
+ *
+ * For Maxwell &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param sessionCount                         Reference to an unsigned int for the count of active encoder sessions
+ * @param averageFps                           Reference to an unsigned int for the trailing average FPS of all active sessions
+ * @param averageLatency                       Reference to an unsigned int for encode latency in microseconds
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a sessionCount, \a averageFps and \a averageLatency are fetched
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a sessionCount, \a averageFps,
+ *                                             or \a averageLatency is NULL
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetEncoderStats (nvmlDevice_t device, unsigned int *sessionCount,
+                                                unsigned int *averageFps, unsigned int *averageLatency);
+
+/**
+ * Retrieves information about active encoder sessions on a target device.
+ *
+ * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfos. The
+ * array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions
+ * written to the buffer.
+ *
+ * If the supplied buffer is not large enough to accommodate the active session array, the function returns
+ * NVML_ERROR_INSUFFICIENT_SIZE, with the required element count of the nvmlEncoderSessionInfo_t array in \a sessionCount.
+ * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return
+ * NVML_SUCCESS with the number of active encoder sessions updated in *sessionCount.
+ *
+ * For Maxwell &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param sessionCount                         Reference to the caller-supplied array size, and returns the number of sessions.
+ * @param sessionInfos                         Reference in which to return the session information
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a sessionInfos is fetched
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is returned in \a sessionCount
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a sessionCount is NULL.
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetEncoderSessions(nvmlDevice_t device, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfos);
+
+/**
+ * Retrieves the current utilization and sampling period in microseconds for the Decoder
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param utilization                          Reference to an unsigned int for decoder utilization info
+ * @param samplingPeriodUs                     Reference to an unsigned int for the sampling period in microseconds
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a utilization has been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a utilization is NULL, or \a samplingPeriodUs is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetDecoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs);
+
+/**
+ * Retrieves the current and pending driver model for the device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ * For windows only.
+ *
+ * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached
+ * to the device it must run in WDDM mode. TCC mode is preferred if a display is not attached.
+ *
+ * See \ref nvmlDriverModel_t for details on available driver models.
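+ *
+ * A minimal query sketch (illustrative only; assumes a valid device handle
+ * \a dev on a windows system):
+ * @code
+ * nvmlDriverModel_t current, pending;
+ * if (nvmlDeviceGetDriverModel(dev, &current, &pending) == NVML_SUCCESS)
+ *     printf("driver model: %s now, %s after reboot\n",
+ *            current == NVML_DRIVER_WDDM ? "WDDM" : "WDM/TCC",
+ *            pending == NVML_DRIVER_WDDM ? "WDDM" : "WDM/TCC");
+ * @endcode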
+ *
+ * @param device                               The identifier of the target device
+ * @param current                              Reference in which to return the current driver model
+ * @param pending                              Reference in which to return the pending driver model
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a current and/or \a pending have been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or both \a current and \a pending are NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the platform is not windows
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceSetDriverModel()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetDriverModel(nvmlDevice_t device, nvmlDriverModel_t *current, nvmlDriverModel_t *pending);
+
+/**
+ * Get VBIOS version of the device.
+ *
+ * For all products.
+ *
+ * The VBIOS version may change from time to time. It will not exceed 32 characters in length
+ * (including the NULL terminator). See \ref nvmlConstants::NVML_DEVICE_VBIOS_VERSION_BUFFER_SIZE.
+ *
+ * @param device                               The identifier of the target device
+ * @param version                              Reference in which to return the VBIOS version
+ * @param length                               The maximum allowed length of the string returned in \a version
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a version has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a version is NULL
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetVbiosVersion(nvmlDevice_t device, char *version, unsigned int length);
+
+/**
+ * Get Bridge Chip Information for all the bridge chips on the board.
+ *
+ * For all fully supported products.
+ * Only applicable to multi-GPU products.
+ *
+ * @param device                               The identifier of the target device
+ * @param bridgeHierarchy                      Reference to the returned bridge chip hierarchy
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if the bridge chip exists
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a bridgeHierarchy is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the bridge chip is not supported on the device
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetBridgeChipInfo(nvmlDevice_t device, nvmlBridgeChipHierarchy_t *bridgeHierarchy);
+
+/**
+ * Get information about processes with a compute context on a device
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * This function returns information only about compute running processes (e.g. CUDA applications which have
+ * an active context). Any graphics applications (e.g. using OpenGL, DirectX) won't be listed by this function.
+ *
+ * To query the current number of running compute processes, call this function with *infoCount = 0. The
+ * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call
+ * \a infos is allowed to be NULL.
+ *
+ * The usedGpuMemory field returned is all of the memory used by the application.
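+ *
+ * A minimal sketch of the two-call size-query pattern described above
+ * (illustrative only; assumes a valid device handle \a dev, ignores the race
+ * where new processes appear between the two calls, and omits headers):
+ * @code
+ * unsigned int count = 0;
+ * if (nvmlDeviceGetComputeRunningProcesses(dev, &count, NULL)
+ *         == NVML_ERROR_INSUFFICIENT_SIZE) {
+ *     nvmlProcessInfo_t *infos = malloc(count * sizeof(*infos));
+ *     if (nvmlDeviceGetComputeRunningProcesses(dev, &count, infos) == NVML_SUCCESS)
+ *         for (unsigned int i = 0; i < count; i++)
+ *             printf("pid %u uses %llu bytes\n", infos[i].pid, infos[i].usedGpuMemory);
+ *     free(infos);
+ * }
+ * @endcode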
+ *
+ * Keep in mind that information returned by this call is dynamic and the number of elements might change over
+ * time. Allocate more space for the \a infos table in case new compute processes are spawned.
+ *
+ * @param device                               The identifier of the target device
+ * @param infoCount                            Reference in which to provide the \a infos array size, and
+ *                                             to return the number of returned elements
+ * @param infos                                Reference in which to return the process information
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a infoCount and \a infos have been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small;
+ *                                             \a infoCount will contain the minimal amount of space necessary for
+ *                                             the call to complete
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or either of \a infoCount or \a infos is NULL
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see \ref nvmlSystemGetProcessName
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetComputeRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos);
+
+/**
+ * Get information about processes with a graphics context on a device
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * This function returns information only about graphics-based processes
+ * (e.g. applications using OpenGL, DirectX)
+ *
+ * To query the current number of running graphics processes, call this function with *infoCount = 0. The
+ * return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if none are running. For this call
+ * \a infos is allowed to be NULL.
+ *
+ * The usedGpuMemory field returned is all of the memory used by the application.
+ *
+ * Keep in mind that information returned by this call is dynamic and the number of elements might change over
+ * time. Allocate more space for the \a infos table in case new graphics processes are spawned.
+ *
+ * @param device                               The identifier of the target device
+ * @param infoCount                            Reference in which to provide the \a infos array size, and
+ *                                             to return the number of returned elements
+ * @param infos                                Reference in which to return the process information
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a infoCount and \a infos have been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a infoCount indicates that the \a infos array is too small;
+ *                                             \a infoCount will contain the minimal amount of space necessary for
+ *                                             the call to complete
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or either of \a infoCount or \a infos is NULL
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see \ref nvmlSystemGetProcessName
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetGraphicsRunningProcesses(nvmlDevice_t device, unsigned int *infoCount, nvmlProcessInfo_t *infos);
+
+/**
+ * Check if the GPU devices are on the same physical board.
+ *
+ * For all fully supported products.
+ *
+ * @param device1                              The first GPU device
+ * @param device2                              The second GPU device
+ * @param onSameBoard                          Reference in which to return the status.
+ *                                             Non-zero indicates that the GPUs are on the same board.
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a onSameBoard has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device1 or \a device2 are invalid or \a onSameBoard is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this check is not supported by the device
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if either GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceOnSameBoard(nvmlDevice_t device1, nvmlDevice_t device2, int *onSameBoard);
+
+/**
+ * Retrieves the root/admin permissions on the target API. See \a nvmlRestrictedAPI_t for the list of supported APIs.
+ * If an API is restricted, only root users can call that API. See \a nvmlDeviceSetAPIRestriction to change current permissions.
+ *
+ * For all fully supported products.
+ *
+ * @param device                               The identifier of the target device
+ * @param apiType                              Target API type for this operation
+ * @param isRestricted                         Reference in which to return the current restriction
+ *                                             NVML_FEATURE_ENABLED indicates that the API is root-only
+ *                                             NVML_FEATURE_DISABLED indicates that the API is accessible to all users
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a isRestricted has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a apiType is incorrect or \a isRestricted is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device or the device does not support
+ *                                             the feature that is being queried (e.g. enabling/disabling Auto Boosted clocks
+ *                                             is not supported by the device)
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlRestrictedAPI_t
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t *isRestricted);
+
+/**
+ * Gets recent samples for the GPU.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * Based on type, this method can be used to fetch the power, utilization or clock samples maintained in the buffer by
+ * the driver.
+ *
+ * Power, Utilization and Clock samples are returned as type "unsigned int" for the union nvmlValue_t.
+ *
+ * To get the number of samples that the user needs to allocate, invoke the method with \a samples set to NULL.
+ * The returned \a sampleCount will provide the number of samples that can be queried. The user needs to
+ * allocate the buffer with size as sampleCount * sizeof(nvmlSample_t).
+ *
+ * lastSeenTimeStamp represents the CPU timestamp in microseconds. Set it to 0 to fetch all the samples maintained by the
+ * underlying buffer. Set lastSeenTimeStamp to one of the timeStamps retrieved from the data of the previous query
+ * to get more recent samples.
+ *
+ * This method fetches the number of entries which can be accommodated in the provided samples array, and the
+ * reference \a sampleCount is updated to indicate how many samples were actually retrieved. The advantage of using this
+ * method for samples in contrast to polling via existing methods is to get higher frequency data at a lower polling cost.
+ *
+ * @param device                               The identifier for the target device
+ * @param type                                 Type of sampling event
+ * @param lastSeenTimeStamp                    Return only samples with timestamp greater than lastSeenTimeStamp.
+ * @param sampleValType                        Output parameter to represent the type of sample value as described in nvmlValueType_t
+ * @param sampleCount                          Reference to provide the number of elements which can be queried in samples array
+ * @param samples                              Reference in which samples are returned
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if samples are successfully retrieved
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a sampleCount is NULL or
+ *                                             reference to \a sampleCount is 0 for non-null \a samples
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_NOT_FOUND         if sample entries are not found
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetSamples(nvmlDevice_t device, nvmlSamplingType_t type, unsigned long long lastSeenTimeStamp,
+                                          nvmlValueType_t *sampleValType, unsigned int *sampleCount, nvmlSample_t *samples);
+
+/**
+ * Gets Total, Available and Used size of BAR1 memory.
+ *
+ * BAR1 is used to map the FB (device memory) so that it can be directly accessed by the CPU or by 3rd party
+ * devices (peer-to-peer on the PCIE bus).
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param bar1Memory                           Reference in which BAR1 memory
+ *                                             information is returned.
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if BAR1 memory is successfully retrieved
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a bar1Memory is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetBAR1MemoryInfo(nvmlDevice_t device, nvmlBAR1Memory_t *bar1Memory);
+
+
+/**
+ * Gets the duration of time during which the device was throttled (lower than requested clocks) due to power
+ * or thermal constraints.
+ *
+ * This method is important to users who are trying to understand whether their GPUs throttle at any point while
+ * their applications run. The difference in violation times at two different reference times gives an indication
+ * of a GPU throttling event.
+ *
+ * Violation for thermal capping is not supported at this time.
+ *
+ * For Kepler &tm; or newer fully supported devices.
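+ *
+ * A minimal sketch of the delta-based throttle check described above
+ * (illustrative only; assumes a valid device handle \a dev and that both
+ * calls succeed):
+ * @code
+ * nvmlViolationTime_t t0, t1;
+ * nvmlDeviceGetViolationStatus(dev, NVML_PERF_POLICY_POWER, &t0);
+ * // ... run the workload of interest ...
+ * nvmlDeviceGetViolationStatus(dev, NVML_PERF_POLICY_POWER, &t1);
+ * if (t1.violationTime > t0.violationTime)
+ *     printf("GPU was power-throttled during the run\n");
+ * @endcode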
+ *
+ * @param device                               The identifier of the target device
+ * @param perfPolicyType                       Represents the performance policy which can trigger GPU throttling
+ * @param violTime                             Reference to which violation time related information is returned
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if violation time is successfully retrieved
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a perfPolicyType is invalid, or \a violTime is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if this query is not supported by the device
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetViolationStatus(nvmlDevice_t device, nvmlPerfPolicyType_t perfPolicyType, nvmlViolationTime_t *violTime);
+
+/**
+ * @}
+ */
+
+/** @addtogroup nvmlAccountingStats
+ * @{
+ */
+
+/**
+ * Queries the state of per process accounting mode.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * See \ref nvmlDeviceGetAccountingStats for more details.
+ * See \ref nvmlDeviceSetAccountingMode
+ *
+ * @param device                               The identifier of the target device
+ * @param mode                                 Reference in which to return the current accounting mode
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if the mode has been successfully retrieved
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a mode is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetAccountingMode(nvmlDevice_t device, nvmlEnableState_t *mode);
+
+/**
+ * Queries a process's accounting stats.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * Accounting stats capture GPU utilization and other statistics across the lifetime of a process.
+ * Accounting stats can be queried during the lifetime of the process and after its termination.
+ * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and
+ * updated to the actual running time after its termination.
+ * Accounting stats are kept in a circular buffer; newly created processes overwrite information about old
+ * processes.
+ *
+ * See \ref nvmlAccountingStats_t for description of each returned metric.
+ * The list of processes that can be queried can be retrieved from \ref nvmlDeviceGetAccountingPids.
+ *
+ * @note Accounting Mode needs to be on. See \ref nvmlDeviceGetAccountingMode.
+ * @note Only compute and graphics applications stats can be queried. Monitoring applications stats can't be
+ *       queried since they don't contribute to GPU utilization.
+ * @note In case of PID collision, stats of only the latest process (that terminated last) will be reported
+ *
+ * @warning On Kepler devices per process statistics are accurate only if there's one process running on a GPU.
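+ *
+ * A minimal query sketch (illustrative only; assumes a valid device handle
+ * \a dev, accounting mode enabled, and a \a pid obtained from
+ * \ref nvmlDeviceGetAccountingPids):
+ * @code
+ * nvmlAccountingStats_t stats;
+ * if (nvmlDeviceGetAccountingStats(dev, pid, &stats) == NVML_SUCCESS)
+ *     printf("pid %u: gpu %u%%, mem %u%%, peak %llu bytes\n", pid,
+ *            stats.gpuUtilization, stats.memoryUtilization,
+ *            stats.maxMemoryUsage);
+ * @endcode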
+ *
+ * @param device                               The identifier of the target device
+ * @param pid                                  Process Id of the target process to query stats for
+ * @param stats                                Reference in which to return the process's accounting stats
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if stats have been successfully retrieved
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a stats is NULL
+ *         - \ref NVML_ERROR_NOT_FOUND         if process stats were not found
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature or accounting mode is disabled
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceGetAccountingBufferSize
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetAccountingStats(nvmlDevice_t device, unsigned int pid, nvmlAccountingStats_t *stats);
+
+/**
+ * Queries the list of processes that can be queried for accounting stats. The list of processes returned
+ * can be in running or terminated state.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * To just query the number of processes ready to be queried, call this function with *count = 0 and
+ * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if the list is empty.
+ *
+ * For more details see \ref nvmlDeviceGetAccountingStats.
+ *
+ * @note In case of PID collision some processes might not be accessible before the circular buffer is full.
+ *
+ * @param device                               The identifier of the target device
+ * @param count                                Reference in which to provide the \a pids array size, and
+ *                                             to return the number of elements ready to be queried
+ * @param pids                                 Reference in which to return list of process ids
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if pids were successfully retrieved
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a count is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature or accounting mode is disabled
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to
+ *                                             expected value)
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceGetAccountingBufferSize
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetAccountingPids(nvmlDevice_t device, unsigned int *count, unsigned int *pids);
+
+/**
+ * Returns the number of processes that the circular buffer with accounting pids can hold.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * This is the maximum number of processes that accounting information will be stored for before information
+ * about the oldest processes gets overwritten by information about new processes.
+ *
+ * @param device                               The identifier of the target device
+ * @param bufferSize                           Reference in which to return the size (in number of elements)
+ *                                             of the circular buffer for accounting stats.
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if buffer size was successfully retrieved
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a bufferSize is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature or accounting mode is disabled
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceGetAccountingStats
+ * @see nvmlDeviceGetAccountingPids
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsigned int *bufferSize);
+
+/** @} */
+
+/** @addtogroup nvmlDeviceQueries
+ * @{
+ */
+
+/**
+ * Returns the list of retired pages by source, including pages that are pending retirement.
+ * The address information provided by this API is the hardware address of the page that was retired. Note
+ * that this does not match the virtual address used in CUDA, but will match the address information in XID 63.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param cause                                Filter page addresses by cause of retirement
+ * @param pageCount                            Reference in which to provide the \a addresses buffer size, and
+ *                                             to return the number of retired pages that match \a cause
+ *                                             Set to 0 to query the size without allocating an \a addresses buffer
+ * @param addresses                            Buffer to write the page addresses into
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a pageCount was populated and \a addresses was filled
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the
+ *                                             matching page addresses. \a pageCount is set to the needed size.
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or
+ *                                             \a addresses is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages(nvmlDevice_t device, nvmlPageRetirementCause_t cause,
+                                               unsigned int *pageCount, unsigned long long *addresses);
+
+/**
+ * Check if any pages are pending retirement and need a reboot to fully retire.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param isPending                            Reference in which to return the pending status
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a isPending was populated
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a isPending is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPagesPendingStatus(nvmlDevice_t device, nvmlEnableState_t *isPending);
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlUnitCommands Unit Commands
+ * This chapter describes NVML operations that change the state of the unit. For S-class products.
+ * Each of these requires root/admin access.
Non-admin users will see an NVML_ERROR_NO_PERMISSION + * error code when invoking any of these methods. + * @{ + */ +/***************************************************************************************************/ + +/** + * Set the LED state for the unit. The LED can be either green (0) or amber (1). + * + * For S-class products. + * Requires root/admin permissions. + * + * This operation takes effect immediately. + * + * + * Current S-Class products don't provide unique LEDs for each unit. As such, both front + * and back LEDs will be toggled in unison regardless of which unit is specified with this command. + * + * See \ref nvmlLedColor_t for available colors. + * + * @param unit The identifier of the target unit + * @param color The target LED color + * + * @return + * - \ref NVML_SUCCESS if the LED color has been set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a unit or \a color is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if this is not an S-class product + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlUnitGetLedState() + */ +nvmlReturn_t DECLDIR nvmlUnitSetLedState(nvmlUnit_t unit, nvmlLedColor_t color); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvmlDeviceCommands Device Commands + * This chapter describes NVML operations that change the state of the device. + * Each of these requires root/admin access. Non-admin users will see an NVML_ERROR_NO_PERMISSION + * error code when invoking any of these methods. + * @{ + */ +/***************************************************************************************************/ + +/** + * Set the persistence mode for the device. + * + * For all products. + * For Linux only. + * Requires root/admin permissions. + * + * The persistence mode determines whether the GPU driver software is torn down after the last client + * exits. + * + * This operation takes effect immediately. It is not persistent across reboots. After each reboot the + * persistence mode is reset to "Disabled". + * + * See \ref nvmlEnableState_t for available modes. + * + * @param device The identifier of the target device + * @param mode The target persistence mode + * + * @return + * - \ref NVML_SUCCESS if the persistence mode was set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetPersistenceMode() + */ +nvmlReturn_t DECLDIR nvmlDeviceSetPersistenceMode(nvmlDevice_t device, nvmlEnableState_t mode); + +/** + * Set the compute mode for the device. + * + * For all products. + * Requires root/admin permissions. + * + * The compute mode determines whether a GPU can be used for compute operations and whether it can + * be shared across contexts. + * + * This operation takes effect immediately. Under Linux it is not persistent across reboots and + * always resets to "Default". 
Under windows it is persistent. + * + * Under windows compute mode may only be set to DEFAULT when running in WDDM + * + * See \ref nvmlComputeMode_t for details on available compute modes. + * + * @param device The identifier of the target device + * @param mode The target compute mode + * + * @return + * - \ref NVML_SUCCESS if the compute mode was set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a mode is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetComputeMode() + */ +nvmlReturn_t DECLDIR nvmlDeviceSetComputeMode(nvmlDevice_t device, nvmlComputeMode_t mode); + +/** + * Set the ECC mode for the device. + * + * For Kepler &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher. + * Requires root/admin permissions. + * + * The ECC mode determines whether the GPU enables its ECC support. + * + * This operation takes effect after the next reboot. + * + * See \ref nvmlEnableState_t for details on available modes. + * + * @param device The identifier of the target device + * @param ecc The target ECC mode + * + * @return + * - \ref NVML_SUCCESS if the ECC mode was set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a ecc is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlDeviceGetEccMode() + */ +nvmlReturn_t DECLDIR nvmlDeviceSetEccMode(nvmlDevice_t device, nvmlEnableState_t ecc); + +/** + * Clear the ECC error and other memory error counts for the device. + * + * For Kepler &tm; or newer fully supported devices. + * Only applicable to devices with ECC. + * Requires \a NVML_INFOROM_ECC version 2.0 or higher to clear aggregate location-based ECC counts. + * Requires \a NVML_INFOROM_ECC version 1.0 or higher to clear all other ECC counts. + * Requires root/admin permissions. + * Requires ECC Mode to be enabled. + * + * Sets all of the specified ECC counters to 0, including both detailed and total counts. + * + * This operation takes effect immediately. + * + * See \ref nvmlMemoryErrorType_t for details on available counter types. + * + * @param device The identifier of the target device + * @param counterType Flag that indicates which type of errors should be cleared. 
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if the error counts were cleared
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a counterType is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see
+ *      - nvmlDeviceGetDetailedEccErrors()
+ *      - nvmlDeviceGetTotalEccErrors()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceClearEccErrorCounts(nvmlDevice_t device, nvmlEccCounterType_t counterType);
+
+/**
+ * Set the driver model for the device.
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ * For windows only.
+ * Requires root/admin permissions.
+ *
+ * On Windows platforms the device driver can run in either WDDM or WDM (TCC) mode. If a display is attached
+ * to the device it must run in WDDM mode.
+ *
+ * It is possible to force the change to WDM (TCC) while the display is still attached with a force flag (nvmlFlagForce).
+ * This should only be done if the host is subsequently powered down and the display is detached from the device
+ * before the next reboot.
+ *
+ * This operation takes effect after the next reboot.
+ *
+ * Windows driver model may only be set to WDDM when running in DEFAULT compute mode.
+ *
+ * Changing the driver model to WDDM is not supported when the GPU doesn't support graphics acceleration or
+ * will not support it after reboot. See \ref nvmlDeviceSetGpuOperationMode.
+ *
+ * See \ref nvmlDriverModel_t for details on available driver models.
+ * See \ref nvmlFlagDefault and \ref nvmlFlagForce
+ *
+ * @param device                               The identifier of the target device
+ * @param driverModel                          The target driver model
+ * @param flags                                Flags that change the default behavior
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if the driver model has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a driverModel is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the platform is not windows or the device does not support this feature
+ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceGetDriverModel()
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetDriverModel(nvmlDevice_t device, nvmlDriverModel_t driverModel, unsigned int flags);
+
+/**
+ * Set clocks that applications will lock to.
+ *
+ * Sets the clocks that compute and graphics applications will run at. For example, the CUDA driver requests
+ * these clocks during context creation, which means this property defines the clocks at which CUDA applications
+ * will run unless some overspec event occurs (e.g. over power, over thermal or external HW brake).
+ *
+ * Can be used as a setting to request constant performance.
+ *
+ * On Pascal and newer hardware, this will automatically disable automatic boosting of clocks.
+ *
+ * On K80 and newer Kepler and Maxwell GPUs, users desiring fixed performance should also call
+ * \ref nvmlDeviceSetAutoBoostedClocksEnabled to prevent clocks from automatically boosting
+ * above the clock value being set.
+ *
+ * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices.
+ * Requires root/admin permissions.
+ *
+ * See \ref nvmlDeviceGetSupportedMemoryClocks and \ref nvmlDeviceGetSupportedGraphicsClocks
+ * for details on how to list available clocks combinations.
+ *
+ * After a system reboot or driver reload, applications clocks go back to their default value.
+ * See \ref nvmlDeviceResetApplicationsClocks.
+ *
+ * @param device                               The identifier of the target device
+ * @param memClockMHz                          Requested memory clock in MHz
+ * @param graphicsClockMHz                     Requested graphics clock in MHz
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if new settings were successfully set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a memClockMHz and \a graphicsClockMHz
+ *                                             are not a valid clock combination
+ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int memClockMHz, unsigned int graphicsClockMHz);
+
+/**
+ * Set a new power limit for this device.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ * Requires root/admin permissions.
+ *
+ * See \ref nvmlDeviceGetPowerManagementLimitConstraints to check the allowed ranges of values.
+ *
+ * \note The limit is not persistent across reboots or driver unloads.
+ *       Enable persistence mode to prevent the driver from unloading when no application is using the device.
+ *
+ * @param device                               The identifier of the target device
+ * @param limit                                Power management limit in milliwatts to set
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a limit has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a limit is out of range
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceGetPowerManagementLimitConstraints
+ * @see nvmlDeviceGetPowerManagementDefaultLimit
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetPowerManagementLimit(nvmlDevice_t device, unsigned int limit);
+
+/**
+ * Sets a new GOM. See \a nvmlGpuOperationMode_t for details.
+ *
+ * For GK110 M-class and X-class Tesla &tm; products from the Kepler family.
+ * Modes \ref NVML_GOM_LOW_DP and \ref NVML_GOM_ALL_ON are supported on fully supported GeForce products.
+ * Not supported on Quadro &reg; and Tesla &tm; C-class products.
+ * Requires root/admin permissions.
+ *
+ * Changing GOMs requires a reboot.
+ * The reboot requirement might be removed in the future.
+ *
+ * Compute-only GOMs don't support graphics acceleration. Under windows switching to these GOMs when
+ * the pending driver model is WDDM is not supported. See \ref nvmlDeviceSetDriverModel.
+ *
+ * @param device                               The identifier of the target device
+ * @param mode                                 Target GOM
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a mode has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a mode is incorrect
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support GOM or the specific mode
+ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlGpuOperationMode_t
+ * @see nvmlDeviceGetGpuOperationMode
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetGpuOperationMode(nvmlDevice_t device, nvmlGpuOperationMode_t mode);
+
+/**
+ * Changes the root/admin restrictions on certain APIs. See \a nvmlRestrictedAPI_t for the list of supported APIs.
+ * This method can be used by a root/admin user to give non-root/admin access to certain otherwise-restricted APIs.
+ * The new setting lasts for the lifetime of the NVIDIA driver; it is not persistent. See \a nvmlDeviceGetAPIRestriction
+ * to query the current restriction settings.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ * Requires root/admin permissions.
+ *
+ * @param device                               The identifier of the target device
+ * @param apiType                              Target API type for this operation
+ * @param isRestricted                         The target restriction
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a isRestricted has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a apiType is incorrect
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device does not support changing API restrictions or the device does not support
+ *                                             the feature that API restrictions are being set for (e.g. enabling/disabling Auto
+ *                                             Boosted clocks is not supported by the device)
+ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlRestrictedAPI_t
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetAPIRestriction(nvmlDevice_t device, nvmlRestrictedAPI_t apiType, nvmlEnableState_t isRestricted);
+
+/**
+ * @}
+ */
+
+/** @addtogroup nvmlAccountingStats
+ * @{
+ */
+
+/**
+ * Enables or disables per process accounting.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ * Requires root/admin permissions.
+ *
+ * @note This setting is not persistent and will default to disabled after the driver unloads.
+ *       Enable persistence mode to be sure the setting doesn't revert to disabled.
+ *
+ * @note Enabling accounting mode has no negative impact on GPU performance.
+ *
+ * @note Disabling accounting clears all accounting pids information.
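+ *
+ * A minimal sketch of turning accounting on (illustrative only; assumes a
+ * valid device handle \a dev and a caller with root/admin permissions):
+ * @code
+ * nvmlReturn_t ret = nvmlDeviceSetAccountingMode(dev, NVML_FEATURE_ENABLED);
+ * if (ret == NVML_ERROR_NO_PERMISSION)
+ *     fprintf(stderr, "run as root/admin to change accounting mode\n");
+ * @endcode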
+ *
+ * See \ref nvmlDeviceGetAccountingMode
+ * See \ref nvmlDeviceGetAccountingStats
+ * See \ref nvmlDeviceClearAccountingPids
+ *
+ * @param device                               The identifier of the target device
+ * @param mode                                 The target accounting mode
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if the new mode has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a mode are invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetAccountingMode(nvmlDevice_t device, nvmlEnableState_t mode);
+
+/**
+ * Clears accounting information about all processes that have already terminated.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ * Requires root/admin permissions.
+ *
+ * See \ref nvmlDeviceGetAccountingMode
+ * See \ref nvmlDeviceGetAccountingStats
+ * See \ref nvmlDeviceSetAccountingMode
+ *
+ * @param device                               The identifier of the target device
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if accounting information has been cleared
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_NO_PERMISSION     if the user doesn't have permission to perform this operation
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceClearAccountingPids(nvmlDevice_t device);
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup NvLink NvLink Methods
+ * This chapter describes methods that NVML can perform on NVLINK-enabled devices.
+ *  @{
+ */
+/***************************************************************************************************/
+
+/**
+ * Retrieves the state of the device's NvLink for the link specified
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param link                                 Specifies the NvLink link to be queried
+ * @param isActive                             \a nvmlEnableState_t where NVML_FEATURE_ENABLED indicates that
+ *                                             the link is active and NVML_FEATURE_DISABLED indicates it
+ *                                             is inactive
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a isActive has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a link is invalid or \a isActive is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkState(nvmlDevice_t device, unsigned int link, nvmlEnableState_t *isActive);
+
+/**
+ * Retrieves the version of the device's NvLink for the link specified
+ *
+ * For Pascal &tm; or newer fully supported devices.
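+ *
+ * Illustrative query (an editorial sketch, not part of the original header;
+ * assumes a valid \a device, that link 0 exists, and <stdio.h> is available):
+ * \code
+ * unsigned int version = 0;
+ * if (nvmlDeviceGetNvLinkVersion(device, 0, &version) == NVML_SUCCESS)
+ *     printf("NvLink link 0 version: %u\n", version);
+ * \endcode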
+ *
+ * @param device                               The identifier of the target device
+ * @param link                                 Specifies the NvLink link to be queried
+ * @param version                              Requested NvLink version
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a version has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a link is invalid or \a version is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkVersion(nvmlDevice_t device, unsigned int link, unsigned int *version);
+
+/**
+ * Retrieves the requested capability from the device's NvLink for the link specified.
+ * Please refer to the \a nvmlNvLinkCapability_t structure for the specific caps that can be queried.
+ * The return value should be treated as a boolean.
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param link                                 Specifies the NvLink link to be queried
+ * @param capability                           Specifies the \a nvmlNvLinkCapability_t to be queried
+ * @param capResult                            A boolean for the queried capability indicating that feature is available
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a capResult has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a link, or \a capability is invalid or \a capResult is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkCapability(nvmlDevice_t device, unsigned int link,
+                                                   nvmlNvLinkCapability_t capability, unsigned int *capResult);
+
+/**
+ * Retrieves the PCI information for the remote node on an NvLink link.
+ * Note: pciSubSystemId is not filled in this function and is indeterminate.
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param link                                 Specifies the NvLink link to be queried
+ * @param pci                                  \a nvmlPciInfo_t of the remote node for the specified link
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a pci has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a link is invalid or \a pci is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkRemotePciInfo(nvmlDevice_t device, unsigned int link, nvmlPciInfo_t *pci);
+
+/**
+ * Retrieves the specified error counter value
+ * Please refer to \a nvmlNvLinkErrorCounter_t for error counters that are available
+ *
+ * For Pascal &tm; or newer fully supported devices.
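+ *
+ * Illustrative read (an editorial sketch, not part of the original header;
+ * assumes a valid \a device, an active link 0, and that the replay counter
+ * from \a nvmlNvLinkErrorCounter_t is wanted):
+ * \code
+ * unsigned long long replays = 0;
+ * nvmlReturn_t r = nvmlDeviceGetNvLinkErrorCounter(device, 0, NVML_NVLINK_ERROR_DL_REPLAY, &replays);
+ * \endcode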
+ *
+ * @param device                               The identifier of the target device
+ * @param link                                 Specifies the NvLink link to be queried
+ * @param counter                              Specifies the NvLink counter to be queried
+ * @param counterValue                         Returned counter value
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a counterValue has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a link, or \a counter is invalid or \a counterValue is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkErrorCounter(nvmlDevice_t device, unsigned int link,
+                                                     nvmlNvLinkErrorCounter_t counter, unsigned long long *counterValue);
+
+/**
+ * Resets all error counters to zero
+ * Please refer to \a nvmlNvLinkErrorCounter_t for the list of error counters that are reset
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param link                                 Specifies the NvLink link to be queried
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if the reset is successful
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device or \a link is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkErrorCounters(nvmlDevice_t device, unsigned int link);
+
+/**
+ * Set the NVLINK utilization counter control information for the specified counter, 0 or 1.
+ * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition. Performs a reset
+ * of the counters if the reset parameter is non-zero.
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param counter                              Specifies the counter that should be set (0 or 1).
+ * @param link                                 Specifies the NvLink link to be queried
+ * @param control                              A reference to the \a nvmlNvLinkUtilizationControl_t to set
+ * @param reset                                Resets the counters on set if non-zero
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if the control has been set successfully
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a counter, \a link, or \a control is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter,
+                                                           nvmlNvLinkUtilizationControl_t *control, unsigned int reset);
+
+/**
+ * Get the NVLINK utilization counter control information for the specified counter, 0 or 1.
+ * Please refer to \a nvmlNvLinkUtilizationControl_t for the structure definition
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param counter                              Specifies the counter that should be read (0 or 1).
+ * @param link                                 Specifies the NvLink link to be queried
+ * @param control                              A reference to the \a nvmlNvLinkUtilizationControl_t in which to place information
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if the control has been retrieved successfully
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a counter, \a link, or \a control is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationControl(nvmlDevice_t device, unsigned int link, unsigned int counter,
+                                                           nvmlNvLinkUtilizationControl_t *control);
+
+
+/**
+ * Retrieve the NVLINK utilization counter based on the current control for a specified counter.
+ * In general it is good practice to use \a nvmlDeviceSetNvLinkUtilizationControl
+ * before reading the utilization counters as they have no default state
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param link                                 Specifies the NvLink link to be queried
+ * @param counter                              Specifies the counter that should be read (0 or 1).
+ * @param rxcounter                            Receive counter return value
+ * @param txcounter                            Transmit counter return value
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a rxcounter and \a txcounter have been successfully set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a counter, or \a link is invalid or \a rxcounter or \a txcounter are NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetNvLinkUtilizationCounter(nvmlDevice_t device, unsigned int link, unsigned int counter,
+                                                           unsigned long long *rxcounter, unsigned long long *txcounter);
+
+/**
+ * Freeze the NVLINK utilization counters
+ * Both the receive and transmit counters are operated on by this function
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param link                                 Specifies the NvLink link to be queried
+ * @param counter                              Specifies the counter that should be frozen (0 or 1).
+ * @param freeze                               NVML_FEATURE_ENABLED = freeze the receive and transmit counters
+ *                                             NVML_FEATURE_DISABLED = unfreeze the receive and transmit counters
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if counters were successfully frozen or unfrozen
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a link, \a counter, or \a freeze is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceFreezeNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link,
+                                                               unsigned int counter, nvmlEnableState_t freeze);
+
+/**
+ * Reset the NVLINK utilization counters
+ * Both the receive and transmit counters are operated on by this function
+ *
+ * For Pascal &tm; or newer fully supported devices.
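+ *
+ * Illustrative call (an editorial sketch, not part of the original header;
+ * assumes a valid \a device, link 0 and counter 0):
+ * \code
+ * // Reset utilization counter 0 on link 0 before starting a new measurement.
+ * nvmlReturn_t r = nvmlDeviceResetNvLinkUtilizationCounter(device, 0, 0);
+ * \endcode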
+ *
+ * @param device                               The identifier of the target device
+ * @param link                                 Specifies the NvLink link to be reset
+ * @param counter                              Specifies the counter that should be reset (0 or 1)
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if counters were successfully reset
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device, \a link, or \a counter is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceResetNvLinkUtilizationCounter (nvmlDevice_t device, unsigned int link, unsigned int counter);
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlEvents Event Handling Methods
+ * This chapter describes methods that NVML can perform against each device to register and wait for
+ * some event to occur.
+ *  @{
+ */
+/***************************************************************************************************/
+
+/**
+ * Create an empty set of events.
+ * Event set should be freed by \ref nvmlEventSetFree
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ * @param set                                  Reference in which to return the event handle
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if the event set has been created
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a set is NULL
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlEventSetFree
+ */
+nvmlReturn_t DECLDIR nvmlEventSetCreate(nvmlEventSet_t *set);
+
+/**
+ * Starts recording of events on a specified device and adds the events to the specified \ref nvmlEventSet_t
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ * ECC events are available only on ECC-enabled devices (see \ref nvmlDeviceGetTotalEccErrors)
+ * Power capping events are available only on Power Management enabled devices (see \ref nvmlDeviceGetPowerManagementMode)
+ *
+ * For Linux only.
+ *
+ * \b IMPORTANT: Operations on \a set are not thread safe
+ *
+ * This call starts recording of events on the specified device.
+ * All events that occurred before this call are not recorded.
+ * Checking if some event occurred can be done with \ref nvmlEventSetWait
+ *
+ * If this function reports NVML_ERROR_UNKNOWN, the event set is in an undefined state and should be freed.
+ * If this function reports NVML_ERROR_NOT_SUPPORTED, the event set can still be used. None of the requested eventTypes
+ * are registered in that case.
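+ *
+ * Illustrative flow (an editorial sketch, not part of the original header;
+ * assumes nvmlInit() has succeeded, \a device is a valid handle, and <stdio.h>
+ * is available):
+ * \code
+ * nvmlEventSet_t set;
+ * nvmlEventData_t data;
+ * if (nvmlEventSetCreate(&set) == NVML_SUCCESS) {
+ *     // Record critical Xid errors and wait up to 10 seconds for one.
+ *     if (nvmlDeviceRegisterEvents(device, nvmlEventTypeXidCriticalError, set) == NVML_SUCCESS &&
+ *         nvmlEventSetWait(set, &data, 10000) == NVML_SUCCESS)
+ *         printf("event type: 0x%llx\n", data.eventType);
+ *     nvmlEventSetFree(set);
+ * }
+ * \endcode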
+ *
+ * @param device                               The identifier of the target device
+ * @param eventTypes                           Bitmask of \ref nvmlEventType to record
+ * @param set                                  Set to which add new event types
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if the event has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a eventTypes is invalid or \a set is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the platform does not support this feature or some of the requested event types
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlEventType
+ * @see nvmlDeviceGetSupportedEventTypes
+ * @see nvmlEventSetWait
+ * @see nvmlEventSetFree
+ */
+nvmlReturn_t DECLDIR nvmlDeviceRegisterEvents(nvmlDevice_t device, unsigned long long eventTypes, nvmlEventSet_t set);
+
+/**
+ * Returns information about events supported on device
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * Events are not supported on Windows, so this function returns an empty mask in \a eventTypes on Windows.
+ *
+ * @param device                               The identifier of the target device
+ * @param eventTypes                           Reference in which to return bitmask of supported events
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a eventTypes has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a eventTypes is NULL
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlEventType
+ * @see nvmlDeviceRegisterEvents
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetSupportedEventTypes(nvmlDevice_t device, unsigned long long *eventTypes);
+
+/**
+ * Waits on events and delivers events
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * If some events are ready to be delivered at the time of the call, the function returns immediately.
+ * If there are no events ready to be delivered, the function sleeps until an event arrives
+ * but not longer than the specified timeout. Under certain conditions this function can return before
+ * the specified timeout passes (e.g. when an interrupt arrives)
+ *
+ * In case of an Xid error, the function returns the most recent Xid error type seen by the system. If there are multiple
+ * Xid errors generated before nvmlEventSetWait is invoked then the last seen Xid error type is returned for all
+ * Xid error events.
+ *
+ * @param set                                  Reference to set of events to wait on
+ * @param data                                 Reference in which to return event data
+ * @param timeoutms                            Maximum amount of wait time in milliseconds for registered event
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if the data has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a data is NULL
+ *         - \ref NVML_ERROR_TIMEOUT           if no event arrived within the specified timeout or an interrupt arrived
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if a GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlEventType
+ * @see nvmlDeviceRegisterEvents
+ */
+nvmlReturn_t DECLDIR nvmlEventSetWait(nvmlEventSet_t set, nvmlEventData_t * data, unsigned int timeoutms);
+
+/**
+ * Releases events in the set
+ *
+ * For Fermi &tm; or newer fully supported devices.
+ *
+ * @param set                                  Reference to events to be released
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if the event has been successfully released
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ *
+ * @see nvmlDeviceRegisterEvents
+ */
+nvmlReturn_t DECLDIR nvmlEventSetFree(nvmlEventSet_t set);
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlZPI Drain states
+ * This chapter describes methods that NVML can perform against each device to control their drain state
+ * and recognition by NVML and NVIDIA kernel driver. These methods can be used with out-of-band tools to
+ * power on/off GPUs, enable robust reset scenarios, etc.
+ *  @{
+ */
+/***************************************************************************************************/
+
+/**
+ * Modify the drain state of a GPU. This method forces a GPU to no longer accept new incoming requests.
+ * Any new NVML process will no longer see this GPU. Persistence mode for this GPU must be turned off before
+ * this call is made.
+ * Must be called as administrator.
+ * For Linux only.
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ * Some Kepler devices supported.
+ *
+ * @param pciInfo                              The PCI address of the GPU drain state to be modified
+ * @param newState                             The drain state that should be entered, see \ref nvmlEnableState_t
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if the drain state was successfully modified
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a pciInfo or \a newState is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_NO_PERMISSION     if the calling process has insufficient permissions to perform operation
+ *         - \ref NVML_ERROR_IN_USE            if the device has persistence mode turned on
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceModifyDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t newState);
+
+/**
+ * Query the drain state of a GPU. This method is used to check if a GPU is in a currently draining
+ * state.
+ * For Linux only.
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ * Some Kepler devices supported.
+ *
+ * @param pciInfo                              The PCI address of the GPU drain state to be queried
+ * @param currentState                         The current drain state for this GPU, see \ref nvmlEnableState_t
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if the drain state was successfully queried
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a pciInfo or \a currentState is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceQueryDrainState (nvmlPciInfo_t *pciInfo, nvmlEnableState_t *currentState);
+
+/**
+ * This method will remove the specified GPU from the view of both NVML and the NVIDIA kernel driver
+ * as long as no other processes are attached. If other processes are attached, this call will return
+ * NVML_ERROR_IN_USE and the GPU will be returned to its original "draining" state.
+ * Note: the only situation where a process can still be attached after nvmlDeviceModifyDrainState() is called
+ * to initiate the draining state is if that process was using, and is still using, a GPU before the
+ * call was made. Also note, persistence mode counts as an attachment to the GPU thus it must be disabled
+ * prior to this call.
+ *
+ * For long-running NVML processes please note that this will change the enumeration of current GPUs.
+ * For example, if there are four GPUs present and GPU1 is removed, the new enumeration will be 0-2.
+ * Also, device handles after the removed GPU will not be valid and must be re-established.
+ * Must be run as administrator.
+ * For Linux only.
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ * Some Kepler devices supported.
+ *
+ * @param pciInfo                              The PCI address of the GPU to be removed
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if the GPU was successfully removed
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a pciInfo is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the device doesn't support this feature
+ *         - \ref NVML_ERROR_IN_USE            if the device is still in use and cannot be removed
+ */
+nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu (nvmlPciInfo_t *pciInfo);
+
+/**
+ * Request the OS and the NVIDIA kernel driver to rediscover a portion of the PCI subsystem looking for GPUs that
+ * were previously removed. The portion of the PCI tree can be narrowed by specifying a domain, bus, and device.
+ * If all are zeroes then the entire PCI tree will be searched. Please note that for long-running NVML processes
+ * the enumeration will change based on how many GPUs are discovered and where they are inserted in bus order.
+ *
+ * In addition, all newly discovered GPUs will be initialized and their ECC scrubbed which may take several seconds
+ * per GPU. Also, all device handles are no longer guaranteed to be valid post discovery.
+ *
+ * Must be run as administrator.
+ * For Linux only.
+ *
+ * For Pascal &tm; or newer fully supported devices.
+ * Some Kepler devices supported.
+ *
+ * @param pciInfo                              The PCI tree to be searched. Only the domain, bus, and device
+ *                                             fields are used in this call.
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if GPUs were successfully rediscovered
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a pciInfo is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if the operating system does not support this feature
+ *         - \ref NVML_ERROR_OPERATING_SYSTEM  if the operating system is denying this feature
+ *         - \ref NVML_ERROR_NO_PERMISSION     if the calling process has insufficient permissions to perform operation
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceDiscoverGpus (nvmlPciInfo_t *pciInfo);
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlFieldValueQueries Field Value Queries
+ * This chapter describes NVML operations that are associated with retrieving Field Values from NVML
+ *  @{
+ */
+/***************************************************************************************************/
+
+/**
+ * Request values for a list of fields for a device. This API allows multiple fields to be queried at once.
+ * If any of the underlying fieldIds are populated by the same driver call, the results for those field IDs
+ * will be populated from a single call rather than making a driver call for each fieldId.
+ *
+ * @param device                               The device handle of the GPU to request field values for
+ * @param valuesCount                          Number of entries in values that should be retrieved
+ * @param values                               Array of \a valuesCount structures to hold field values.
+ *                                             Each value's fieldId must be populated prior to this call
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if any values in \a values were populated. Note that you must
+ *                                             check the nvmlReturn field of each value for each individual
+ *                                             status
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a values is NULL
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetFieldValues(nvmlDevice_t device, int valuesCount, nvmlFieldValue_t *values);
+
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlGridQueries Grid Queries
+ * This chapter describes NVML operations that are associated with NVIDIA GRID products.
+ *  @{
+ */
+/***************************************************************************************************/
+
+/**
+ * This method is used to get the virtualization mode corresponding to the GPU.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device                               Identifier of the target device
+ * @param pVirtualMode                         Reference to virtualization mode. One of NVML_GPU_VIRTUALIZATION_?
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a pVirtualMode is fetched
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a pVirtualMode is NULL
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t *pVirtualMode);
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlGridCommands Grid Commands
+ * This chapter describes NVML operations that are associated with NVIDIA GRID products.
+ *  @{
+ */
+/***************************************************************************************************/
+
+/**
+ * This method is used to set the virtualization mode corresponding to the GPU.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device                               Identifier of the target device
+ * @param virtualMode                          Virtualization mode. One of NVML_GPU_VIRTUALIZATION_?
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a virtualMode is set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid or \a virtualMode is invalid
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if setting of virtualization mode is not supported.
+ *         - \ref NVML_ERROR_NO_PERMISSION     if setting of virtualization mode is not allowed for this client.
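+ *
+ * Illustrative call (an editorial sketch, not part of the original header;
+ * assumes a valid \a device, sufficient privileges, and that the pass-through
+ * value of \a nvmlGpuVirtualizationMode_t is wanted; the constant name below
+ * is taken from that enum, not from this documentation):
+ * \code
+ * nvmlReturn_t r = nvmlDeviceSetVirtualizationMode(device, NVML_GPU_VIRTUALIZATION_MODE_PASSTHROUGH);
+ * \endcode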
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetVirtualizationMode(nvmlDevice_t device, nvmlGpuVirtualizationMode_t virtualMode);
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlVgpu vGPU Management
+ * @{
+ *
+ * Set of APIs supporting GRID vGPU
+ */
+/***************************************************************************************************/
+
+/**
+ * Retrieve the supported vGPU types on a physical GPU (device).
+ *
+ * An array of supported vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer
+ * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount
+ * is used to return the number of vGPU types written to the buffer.
+ *
+ * If the supplied buffer is not large enough to accommodate the vGPU type array, the function returns
+ * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount.
+ * To query the number of vGPU types supported for the GPU, call this function with *vgpuCount = 0.
+ * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are supported.
+ *
+ * @param device                               The identifier of the target device
+ * @param vgpuCount                            Pointer to caller-supplied array size, and returns number of vGPU types
+ * @param vgpuTypeIds                          Pointer to caller-supplied array in which to return list of vGPU types
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                      successful completion
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE      \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT       if \a vgpuCount is NULL or \a device is invalid
+ *         - \ref NVML_ERROR_NOT_SUPPORTED          if vGPU is not supported by the device
+ *         - \ref NVML_ERROR_VGPU_ECC_NOT_SUPPORTED if ECC is enabled on the device
+ *         - \ref NVML_ERROR_UNKNOWN                on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetSupportedVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds);
+
+/**
+ * Retrieve the currently creatable vGPU types on a physical GPU (device).
+ *
+ * An array of creatable vGPU types for the physical GPU indicated by \a device is returned in the caller-supplied buffer
+ * pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount
+ * is used to return the number of vGPU types written to the buffer.
+ *
+ * The creatable vGPU types for a device may differ over time, as there may be restrictions on what type of vGPU types
+ * can concurrently run on a device. For example, if only one vGPU type is allowed at a time on a device, then the creatable
+ * list will be restricted to whatever vGPU type is already running on the device.
+ *
+ * If the supplied buffer is not large enough to accommodate the vGPU type array, the function returns
+ * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount.
+ * To query the number of vGPU types creatable for the GPU, call this function with *vgpuCount = 0.
+ * The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are creatable.
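+ *
+ * A possible sizing pattern (an editorial sketch, not part of the original
+ * header; assumes a valid \a device, <stdlib.h>, and that a NULL buffer is
+ * acceptable on the sizing call made with a zero count):
+ * \code
+ * unsigned int count = 0;
+ * if (nvmlDeviceGetCreatableVgpus(device, &count, NULL) == NVML_ERROR_INSUFFICIENT_SIZE && count > 0) {
+ *     nvmlVgpuTypeId_t *ids = malloc(count * sizeof(*ids));
+ *     nvmlReturn_t r = nvmlDeviceGetCreatableVgpus(device, &count, ids);  // second call fills the array
+ *     free(ids);
+ * }
+ * \endcode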
+ *
+ * @param device                               The identifier of the target device
+ * @param vgpuCount                            Pointer to caller-supplied array size, and returns number of vGPU types
+ * @param vgpuTypeIds                          Pointer to caller-supplied array in which to return list of vGPU types
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                      successful completion
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE      \a vgpuTypeIds buffer is too small, array element count is returned in \a vgpuCount
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT       if \a vgpuCount is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED          if vGPU is not supported by the device
+ *         - \ref NVML_ERROR_VGPU_ECC_NOT_SUPPORTED if ECC is enabled on the device
+ *         - \ref NVML_ERROR_UNKNOWN                on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetCreatableVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuTypeId_t *vgpuTypeIds);
+
+/**
+ * Retrieve the class of a vGPU type. It will not exceed 64 characters in length (including the NUL terminator).
+ * See \ref nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuTypeId                           Handle to vGPU type
+ * @param vgpuTypeClass                        Pointer to string array to return class in
+ * @param size                                 Size of string
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 successful completion
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid, or \a vgpuTypeClass is NULL
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuTypeGetClass(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeClass, unsigned int *size);
+
+/**
+ * Retrieve the vGPU type name.
+ *
+ * The name is an alphanumeric string that denotes a particular vGPU, e.g. GRID M60-2Q. It will not
+ * exceed 64 characters in length (including the NUL terminator). See \ref
+ * nvmlConstants::NVML_DEVICE_NAME_BUFFER_SIZE.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuTypeId                           Handle to vGPU type
+ * @param vgpuTypeName                         Pointer to buffer to return name
+ * @param size                                 Size of buffer
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 successful completion
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid, or \a vgpuTypeName is NULL
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuTypeGetName(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeName, unsigned int *size);
+
+/**
+ * Retrieve the device ID of a vGPU type.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuTypeId                           Handle to vGPU type
+ * @param deviceID                             Device ID and vendor ID of the device contained in single 32 bit value
+ * @param subsystemID                          Subsystem ID and subsystem vendor ID of the device contained in single 32 bit value
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 successful completion
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid, or \a deviceID or \a subsystemID are NULL
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuTypeGetDeviceID(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *deviceID, unsigned long long *subsystemID);
+
+/**
+ * Retrieve the vGPU framebuffer size in bytes.
+ *
+ * For Kepler &tm; or newer fully supported devices.
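+ *
+ * Illustrative query (an editorial sketch, not part of the original header;
+ * assumes \a vgpuTypeId was obtained from \ref nvmlDeviceGetSupportedVgpus):
+ * \code
+ * unsigned long long fbSize = 0;
+ * nvmlReturn_t r = nvmlVgpuTypeGetFramebufferSize(vgpuTypeId, &fbSize);
+ * \endcode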
+ *
+ * @param vgpuTypeId                           Handle to vGPU type
+ * @param fbSize                               Pointer to framebuffer size in bytes
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 successful completion
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid, or \a fbSize is NULL
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuTypeGetFramebufferSize(nvmlVgpuTypeId_t vgpuTypeId, unsigned long long *fbSize);
+
+/**
+ * Retrieve count of vGPU's supported display heads.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuTypeId                           Handle to vGPU type
+ * @param numDisplayHeads                      Pointer to number of display heads
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 successful completion
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid, or \a numDisplayHeads is NULL
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuTypeGetNumDisplayHeads(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *numDisplayHeads);
+
+/**
+ * Retrieve vGPU display head's maximum supported resolution.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuTypeId                           Handle to vGPU type
+ * @param displayIndex                         Zero-based index of display head
+ * @param xdim                                 Pointer to maximum number of pixels in X dimension
+ * @param ydim                                 Pointer to maximum number of pixels in Y dimension
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 successful completion
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid, or \a xdim or \a ydim are NULL, or \a displayIndex
+ *                                             is out of range.
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuTypeGetResolution(nvmlVgpuTypeId_t vgpuTypeId, unsigned int displayIndex, unsigned int *xdim, unsigned int *ydim);
+
+/**
+ * Retrieve license requirements for a vGPU type
+ *
+ * The license type and version required to run the specified vGPU type are returned as an alphanumeric string, in the form
+ * "<license name>,<version>", for example "GRID-Virtual-PC,2.0". If a vGPU is runnable with more than one type of license,
+ * the licenses are delimited by a semicolon, for example "GRID-Virtual-PC,2.0;GRID-Virtual-WS,2.0;GRID-Virtual-WS-Ext,2.0".
+ *
+ * The total length of the returned string will not exceed 128 characters, including the NUL terminator.
+ * See \ref nvmlVgpuConstants::NVML_GRID_LICENSE_BUFFER_SIZE.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuTypeId                           Handle to vGPU type
+ * @param vgpuTypeLicenseString                Pointer to buffer to return license info
+ * @param size                                 Size of \a vgpuTypeLicenseString buffer
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 successful completion
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid, or \a vgpuTypeLicenseString is NULL
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuTypeGetLicense(nvmlVgpuTypeId_t vgpuTypeId, char *vgpuTypeLicenseString, unsigned int size);
+
+/**
+ * Retrieve the static frame rate limit value of the vGPU type
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuTypeId                           Handle to vGPU type
+ * @param frameRateLimit                       Reference to return the frame rate limit value
+ * @return
+ *         - \ref NVML_SUCCESS                 successful completion
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if frame rate limiter is turned off for the vGPU type
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid, or \a frameRateLimit is NULL
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuTypeGetFrameRateLimit(nvmlVgpuTypeId_t vgpuTypeId, unsigned int *frameRateLimit);
+
+/**
+ * Retrieve the maximum number of vGPU instances creatable on a device for given vGPU type
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param vgpuTypeId                           Handle to vGPU type
+ * @param vgpuInstanceCount                    Pointer to get the max number of vGPU instances
+ *                                             that can be created on a device for given \a vgpuTypeId
+ * @return
+ *         - \ref NVML_SUCCESS                 successful completion
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuTypeId is invalid or is not supported on target device,
+ *                                             or \a vgpuInstanceCount is NULL
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuTypeGetMaxInstances(nvmlDevice_t device, nvmlVgpuTypeId_t vgpuTypeId, unsigned int *vgpuInstanceCount);
+
+/**
+ * Retrieve the active vGPU instances on a device.
+ *
+ * An array of active vGPU instances is returned in the caller-supplied buffer pointed at by \a vgpuInstances. The
+ * array element count is passed in \a vgpuCount, and \a vgpuCount is used to return the number of vGPU instances
+ * written to the buffer.
+ *
+ * If the supplied buffer is not large enough to accommodate the vGPU instance array, the function returns
+ * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuInstance_t array required in \a vgpuCount.
+ * To query the number of active vGPU instances, call this function with *vgpuCount = 0. The code will return
+ * NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU instances are active.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device                               The identifier of the target device
+ * @param vgpuCount                            Pointer which passes in the array size as well as returns
+ *                                             the number of vGPU instances
+ * @param vgpuInstances                        Pointer to array in which to return list of vGPU instances
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 successful completion
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, or \a vgpuCount is NULL
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a vgpuCount is too small
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if vGPU is not supported by the device
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetActiveVgpus(nvmlDevice_t device, unsigned int *vgpuCount, nvmlVgpuInstance_t *vgpuInstances);
+
+/**
+ * Retrieve the VM ID associated with a vGPU instance.
+ *
+ * The VM ID is returned as a string, not exceeding 80 characters in length (including the NUL terminator).
+ * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE.
+ *
+ * The format of the VM ID varies by platform, and is indicated by the type identifier returned in \a vmIdType.
+ *
+ * For Kepler &tm; or newer fully supported devices.
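+ *
+ * Illustrative call (an editorial sketch, not part of the original header;
+ * assumes \a vgpuInstance was obtained from \ref nvmlDeviceGetActiveVgpus):
+ * \code
+ * char vmId[NVML_DEVICE_UUID_BUFFER_SIZE];
+ * nvmlVgpuVmIdType_t vmIdType;
+ * nvmlReturn_t r = nvmlVgpuInstanceGetVmID(vgpuInstance, vmId, sizeof(vmId), &vmIdType);
+ * \endcode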
+ *
+ * @param vgpuInstance                         Identifier of the target vGPU instance
+ * @param vmId                                 Pointer to caller-supplied buffer to hold VM ID
+ * @param size                                 Size of buffer in bytes
+ * @param vmIdType                             Pointer to hold VM ID type
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 successful completion
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is invalid, or \a vmId or \a vmIdType are NULL
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmID(nvmlVgpuInstance_t vgpuInstance, char *vmId, unsigned int size, nvmlVgpuVmIdType_t *vmIdType);
+
+/**
+ * Retrieve the UUID of a vGPU instance.
+ *
+ * The UUID is a globally unique identifier associated with the vGPU, and is returned as a 5-part hexadecimal string,
+ * not exceeding 80 characters in length (including the NUL terminator).
+ * See \ref nvmlConstants::NVML_DEVICE_UUID_BUFFER_SIZE.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuInstance                         Identifier of the target vGPU instance
+ * @param uuid                                 Pointer to caller-supplied buffer to hold vGPU UUID
+ * @param size                                 Size of buffer in bytes
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 successful completion
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is invalid, or \a uuid is NULL
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetUUID(nvmlVgpuInstance_t vgpuInstance, char *uuid, unsigned int size);
+
+/**
+ * Retrieve the NVIDIA driver version installed in the VM associated with a vGPU.
+ *
+ * The version is returned as an alphanumeric string in the caller-supplied buffer \a version. The length of the version
+ * string will not exceed 80 characters in length (including the NUL terminator).
+ * See \ref nvmlConstants::NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE.
+ *
+ * nvmlVgpuInstanceGetVmDriverVersion() may be called at any time for a vGPU instance. The guest VM driver version is
+ * returned as "Unknown" if no NVIDIA driver is installed in the VM, or the VM has not yet booted to the point where the
+ * NVIDIA driver is loaded and initialized.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuInstance                         Identifier of the target vGPU instance
+ * @param version                              Caller-supplied buffer to return driver version string
+ * @param length                               Size of \a version buffer
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a version has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is invalid
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmDriverVersion(nvmlVgpuInstance_t vgpuInstance, char* version, unsigned int length);
+
+/**
+ * Retrieve the framebuffer usage in bytes.
+ *
+ * Framebuffer usage is the amount of vGPU framebuffer memory that is currently in use by the VM.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuInstance                         The identifier of the target instance
+ * @param fbUsage                              Pointer to framebuffer usage in bytes
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 successful completion
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is invalid, or \a fbUsage is NULL
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFbUsage(nvmlVgpuInstance_t vgpuInstance, unsigned long long *fbUsage);
+
+/**
+ * Retrieve the current licensing state of the vGPU instance.
+ *
+ * If the vGPU is currently licensed, \a licensed is set to 1, otherwise it is set to 0.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuInstance                         Identifier of the target vGPU instance
+ * @param licensed                             Reference to return the licensing status
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a licensed has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is invalid, or \a licensed is NULL
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseStatus(nvmlVgpuInstance_t vgpuInstance, unsigned int *licensed);
+
+/**
+ * Retrieve the vGPU type of a vGPU instance.
+ *
+ * Returns the vGPU type ID of the vGPU assigned to the vGPU instance.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuInstance                         Identifier of the target vGPU instance
+ * @param vgpuTypeId                           Reference to return the vgpuTypeId
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a vgpuTypeId has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is invalid, or \a vgpuTypeId is NULL
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetType(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuTypeId_t *vgpuTypeId);
+
+/**
+ * Retrieve the frame rate limit set for the vGPU instance.
+ *
+ * Returns the value of the frame rate limit set for the vGPU instance
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param vgpuInstance                         Identifier of the target vGPU instance
+ * @param frameRateLimit                       Reference to return the frame rate limit
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a frameRateLimit has been set
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if frame rate limiter is turned off for the vGPU type
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is invalid, or \a frameRateLimit is NULL
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFrameRateLimit(nvmlVgpuInstance_t vgpuInstance, unsigned int *frameRateLimit);
+
+/**
+ * Retrieve the encoder capacity of a vGPU instance, in macroblocks per second.
+ *
+ * For Maxwell &tm; or newer fully supported devices.
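+ *
+ * Illustrative query (an editorial sketch, not part of the original header;
+ * assumes a valid \a vgpuInstance):
+ * \code
+ * unsigned int capacity = 0;
+ * nvmlReturn_t r = nvmlVgpuInstanceGetEncoderCapacity(vgpuInstance, &capacity);
+ * \endcode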
+ *
+ * @param vgpuInstance                         Identifier of the target vGPU instance
+ * @param encoderCapacity                      Reference to an unsigned int for the encoder capacity
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a encoderCapacity has been retrieved
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is invalid, or \a encoderCapacity is NULL
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int *encoderCapacity);
+
+/**
+ * Set the encoder capacity of a vGPU instance, in macroblocks per second.
+ *
+ * For Maxwell &tm; or newer fully supported devices.
+ *
+ * @param vgpuInstance                         Identifier of the target vGPU instance
+ * @param encoderCapacity                      Unsigned int for the encoder capacity value
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a encoderCapacity has been set
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a vgpuInstance is invalid
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceSetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int encoderCapacity);
+
+/**
+ * Retrieves current utilization for vGPUs on a physical GPU (device).
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for vGPU instances running
+ * on a device. Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer
+ * pointed at by \a utilizationSamples. One utilization sample structure is returned per vGPU instance, and includes the
+ * CPU timestamp at which the samples were recorded. Individual utilization values are returned as "unsigned int" values
+ * in nvmlValue_t unions. The function sets the caller-supplied \a sampleValType to NVML_VALUE_TYPE_UNSIGNED_INT to
+ * indicate the returned value type.
+ *
+ * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with
+ * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance
+ * count in \a vgpuInstanceSamplesCount, or NVML_SUCCESS if the current vGPU instance count is zero. The caller should allocate
+ * a buffer of size vgpuInstanceSamplesCount * sizeof(nvmlVgpuInstanceUtilizationSample_t). Invoke the function again with
+ * the allocated buffer passed in \a utilizationSamples, and \a vgpuInstanceSamplesCount set to the number of entries the
+ * buffer is sized for.
+ *
+ * On successful return, the function updates \a vgpuInstanceSamplesCount with the number of vGPU utilization sample
+ * structures that were actually written. This may differ from a previously read value as vGPU instances are created or
+ * destroyed.
+ *
+ * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0
+ * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp
+ * to a timeStamp retrieved from a previous query to read utilization since the previous query.
+ *
+ * @param device                               The identifier for the target device
+ * @param lastSeenTimeStamp                    Return only samples with timestamp greater than lastSeenTimeStamp.
+ * @param sampleValType                        Pointer to caller-supplied buffer to hold the type of returned sample values
+ * @param vgpuInstanceSamplesCount             Pointer to caller-supplied array size, and returns number of vGPU instances
+ * @param utilizationSamples                   Pointer to caller-supplied buffer in which vGPU utilization samples are returned
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if utilization samples are successfully retrieved
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a vgpuInstanceSamplesCount or \a sampleValType is
+ *                                             NULL, or a sample count of 0 is passed with a non-NULL \a utilizationSamples
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuInstanceSamplesCount is too small to return samples for all
+ *                                             vGPU instances currently executing on the device
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if vGPU is not supported by the device
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_NOT_FOUND         if sample entries are not found
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetVgpuUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp,
+                                                  nvmlValueType_t *sampleValType, unsigned int *vgpuInstanceSamplesCount,
+                                                  nvmlVgpuInstanceUtilizationSample_t *utilizationSamples);
+
+/**
+ * Retrieves current utilization for processes running on vGPUs on a physical GPU (device).
+ *
+ * For Maxwell &tm; or newer fully supported devices.
+ *
+ * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running on
+ * vGPU instances active on a device. Utilization values are returned as an array of utilization sample structures in the
+ * caller-supplied buffer pointed at by \a utilizationSamples. One utilization sample structure is returned per process running
+ * on vGPU instances that had some non-zero utilization during the last sample period. It includes the CPU timestamp at which
+ * the samples were recorded. Individual utilization values are returned as "unsigned int" values.
+ *
+ * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with
+ * \a utilizationSamples set to NULL. The function will return NVML_ERROR_INSUFFICIENT_SIZE, with the current vGPU instance
+ * count in \a vgpuProcessSamplesCount. The caller should allocate a buffer of size
+ * vgpuProcessSamplesCount * sizeof(nvmlVgpuProcessUtilizationSample_t). Invoke the function again with
+ * the allocated buffer passed in \a utilizationSamples, and \a vgpuProcessSamplesCount set to the number of entries the
+ * buffer is sized for.
+ *
+ * On successful return, the function updates \a vgpuProcessSamplesCount with the number of vGPU process utilization sample
+ * structures that were actually written. This may differ from a previously read value depending on the number of processes that are active
+ * in any given sample period.
+ *
+ * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0
+ * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp
+ * to a timeStamp retrieved from a previous query to read utilization since the previous query.
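+ *
+ * A possible sizing flow (an editorial sketch, not part of the original
+ * header; assumes a valid \a device and <stdlib.h>):
+ * \code
+ * unsigned int n = 0;
+ * if (nvmlDeviceGetVgpuProcessUtilization(device, 0, &n, NULL) == NVML_ERROR_INSUFFICIENT_SIZE && n > 0) {
+ *     nvmlVgpuProcessUtilizationSample_t *samples = malloc(n * sizeof(*samples));
+ *     nvmlReturn_t r = nvmlDeviceGetVgpuProcessUtilization(device, 0, &n, samples);  // n entries filled on success
+ *     free(samples);
+ * }
+ * \endcode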
+ *
+ * @param device                               The identifier for the target device
+ * @param lastSeenTimeStamp                    Return only samples with timestamp greater than lastSeenTimeStamp.
+ * @param vgpuProcessSamplesCount              Pointer to caller-supplied array size, and returns number of processes running on vGPU instances
+ * @param utilizationSamples                   Pointer to caller-supplied buffer in which vGPU sub process utilization samples are returned
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if utilization samples are successfully retrieved
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a device is invalid, \a vgpuProcessSamplesCount is NULL, or a sample count of 0 is
+ *                                             passed with a non-NULL \a utilizationSamples
+ *         - \ref NVML_ERROR_INSUFFICIENT_SIZE if supplied \a vgpuProcessSamplesCount is too small to return samples for all
+ *                                             vGPU instances currently executing on the device
+ *         - \ref NVML_ERROR_NOT_SUPPORTED     if vGPU is not supported by the device
+ *         - \ref NVML_ERROR_GPU_IS_LOST       if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_NOT_FOUND         if sample entries are not found
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetVgpuProcessUtilization(nvmlDevice_t device, unsigned long long lastSeenTimeStamp,
+                                                         unsigned int *vgpuProcessSamplesCount,
+                                                         nvmlVgpuProcessUtilizationSample_t *utilizationSamples);
+/**
+ * Retrieve the GRID licensable features.
+ *
+ * Identifies whether the system supports GRID Software Licensing. If it does, returns the list of licensable feature(s)
+ * and their current license status.
+ *
+ * @param device                               Identifier of the target device
+ * @param pGridLicensableFeatures              Pointer to structure in which GRID licensable features are returned
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if licensable features are successfully retrieved
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a pGridLicensableFeatures is NULL
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures(nvmlDevice_t device, nvmlGridLicensableFeatures_t *pGridLicensableFeatures);
+
+/**
+ * Retrieves the current encoder statistics of a vGPU Instance
+ *
+ * For Maxwell &tm; or newer fully supported devices.
+ *
+ * @param vgpuInstance                         Identifier of the target vGPU instance
+ * @param sessionCount                         Reference to an unsigned int for count of active encoder sessions
+ * @param averageFps                           Reference to an unsigned int for trailing average FPS of all active sessions
+ * @param averageLatency                       Reference to an unsigned int for encode latency in microseconds
+ *
+ * @return
+ *         - \ref NVML_SUCCESS                 if \a sessionCount, \a averageFps and \a averageLatency are fetched
+ *         - \ref NVML_ERROR_UNINITIALIZED     if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT  if \a sessionCount, \a averageFps, or \a averageLatency is NULL,
+ *                                             or \a vgpuInstance is invalid.
+ *         - \ref NVML_ERROR_UNKNOWN           on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderStats(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount,
+                                                     unsigned int *averageFps, unsigned int *averageLatency);
+
+/**
+ * Retrieves information about all active encoder sessions on a vGPU Instance.
+ *
+ * An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo.
The + * array elememt count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions + * written to the buffer. + * + * If the supplied buffer is not large enough to accomodate the active session array, the function returns + * NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount. + * To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return + * NVML_SUCCESS with number of active encoder sessions updated in *sessionCount. + * + * For Maxwell &tm; or newer fully supported devices. + * + * @param vgpuInstance Identifier of the target vGPU instance + * @param sessionCount Reference to caller supplied array size, and returns + * the number of sessions. + * @param sessionInfo Reference to caller supplied array in which the list + * of session information us returned. + * + * @return + * - \ref NVML_SUCCESS if \a sessionInfo is fetched + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is + returned in \a sessionCount + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL or \a vgpuInstance is invalid.. + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfo); + +/** + * Retrieves the current utilization and process ID + * + * For Maxwell &tm; or newer fully supported devices. + * + * Reads recent utilization of GPU SM (3D/Compute), framebuffer, video encoder, and video decoder for processes running. + * Utilization values are returned as an array of utilization sample structures in the caller-supplied buffer pointed at + * by \a utilization. One utilization sample structure is returned per process running, that had some non-zero utilization + * during the last sample period. It includes the CPU timestamp at which the samples were recorded. Individual utilization values + * are returned as "unsigned int" values. + * + * To read utilization values, first determine the size of buffer required to hold the samples by invoking the function with + * \a utilization set to NULL. The caller should allocate a buffer of size + * processSamplesCount * sizeof(nvmlProcessUtilizationSample_t). Invoke the function again with the allocated buffer passed + * in \a utilization, and \a processSamplesCount set to the number of entries the buffer is sized for. + * + * On successful return, the function updates \a processSamplesCount with the number of process utilization sample + * structures that were actually written. This may differ from a previously read value as instances are created or + * destroyed. + * + * lastSeenTimeStamp represents the CPU timestamp in microseconds at which utilization samples were last read. Set it to 0 + * to read utilization based on all the samples maintained by the driver's internal sample buffer. Set lastSeenTimeStamp + * to a timeStamp retrieved from a previous query to read utilization since the previous query. 
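+ *
+ * A minimal sizing-then-read sketch, analogous to the vGPU variants above (assumes NVML has already
+ * been initialized and \a device is a valid handle; error handling abbreviated). Note that the
+ * argument order differs from nvmlDeviceGetVgpuProcessUtilization:
+ *
+ * \code
+ * unsigned int count = 0;
+ * // First call with a NULL buffer to learn how many samples are available.
+ * nvmlDeviceGetProcessUtilization(device, NULL, &count, 0);
+ * if (count > 0) {
+ *     nvmlProcessUtilizationSample_t *samples =
+ *         (nvmlProcessUtilizationSample_t *)malloc(count * sizeof(*samples));
+ *     // Second call fills the buffer; count is updated with the entries written.
+ *     nvmlDeviceGetProcessUtilization(device, samples, &count, 0);
+ *     free(samples);
+ * }
+ * \endcode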
+ *
+ * @param device              The identifier of the target device
+ * @param utilization         Pointer to caller-supplied buffer in which guest process utilization samples are returned
+ * @param processSamplesCount Pointer to caller-supplied array size, and returns number of processes running
+ * @param lastSeenTimeStamp   Return only samples with timestamp greater than lastSeenTimeStamp.
+ *
+ * @return
+ *         - \ref NVML_SUCCESS if \a utilization has been populated
+ *         - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ *         - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a utilization is NULL, or \a processSamplesCount is NULL
+ *         - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
+ *         - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ *         - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUtilizationSample_t *utilization,
+                                                     unsigned int *processSamplesCount, unsigned long long lastSeenTimeStamp);
+
+/** @} */
+
+/**
+ * NVML API versioning support
+ */
+#if defined(__NVML_API_VERSION_INTERNAL)
+#undef nvmlDeviceGetNvLinkRemotePciInfo
+#undef nvmlDeviceGetPciInfo
+#undef nvmlDeviceGetCount
+#undef nvmlDeviceGetHandleByIndex
+#undef nvmlDeviceGetHandleByPciBusId
+#undef nvmlInit
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/external/CUDA/nvrtc.h b/include/external/CUDA/nvrtc.h
new file mode 100755
index 000000000..1d2acd272
--- /dev/null
+++ b/include/external/CUDA/nvrtc.h
@@ -0,0 +1,525 @@
+//
+// NVIDIA_COPYRIGHT_BEGIN
+//
+// Copyright (c) 2014-2017, NVIDIA CORPORATION. All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto. Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+//
+// NVIDIA_COPYRIGHT_END
+//
+
+#ifndef __NVRTC_H__
+#define __NVRTC_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#include <stdlib.h>
+
+
+/*************************************************************************//**
+ *
+ * \defgroup error Error Handling
+ *
+ * NVRTC defines the following enumeration type and function for API call
+ * error handling.
+ *
+ ****************************************************************************/
+
+
+/**
+ * \ingroup error
+ * \brief The enumerated type nvrtcResult defines API call result codes.
+ *        NVRTC API functions return nvrtcResult to indicate the call
+ *        result.
+ */
+typedef enum {
+  NVRTC_SUCCESS = 0,
+  NVRTC_ERROR_OUT_OF_MEMORY = 1,
+  NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
+  NVRTC_ERROR_INVALID_INPUT = 3,
+  NVRTC_ERROR_INVALID_PROGRAM = 4,
+  NVRTC_ERROR_INVALID_OPTION = 5,
+  NVRTC_ERROR_COMPILATION = 6,
+  NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
+  NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
+  NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
+  NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
+  NVRTC_ERROR_INTERNAL_ERROR = 11
+} nvrtcResult;
+
+
+/**
+ * \ingroup error
+ * \brief nvrtcGetErrorString is a helper function that returns a string
+ *        describing the given nvrtcResult code, e.g., NVRTC_SUCCESS to
+ *        \c "NVRTC_SUCCESS".
+ *        For unrecognized enumeration values, it returns
+ *        \c "NVRTC_ERROR unknown".
+ * + * \param [in] result CUDA Runtime Compilation API result code. + * \return Message string for the given #nvrtcResult code. + */ +const char *nvrtcGetErrorString(nvrtcResult result); + + +/*************************************************************************//** + * + * \defgroup query General Information Query + * + * NVRTC defines the following function for general information query. + * + ****************************************************************************/ + + +/** + * \ingroup query + * \brief nvrtcVersion sets the output parameters \p major and \p minor + * with the CUDA Runtime Compilation version number. + * + * \param [out] major CUDA Runtime Compilation major version number. + * \param [out] minor CUDA Runtime Compilation minor version number. + * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink + * + */ +nvrtcResult nvrtcVersion(int *major, int *minor); + + +/*************************************************************************//** + * + * \defgroup compilation Compilation + * + * NVRTC defines the following type and functions for actual compilation. + * + ****************************************************************************/ + + +/** + * \ingroup compilation + * \brief nvrtcProgram is the unit of compilation, and an opaque handle for + * a program. + * + * To compile a CUDA program string, an instance of nvrtcProgram must be + * created first with ::nvrtcCreateProgram, then compiled with + * ::nvrtcCompileProgram. + */ +typedef struct _nvrtcProgram *nvrtcProgram; + + +/** + * \ingroup compilation + * \brief nvrtcCreateProgram creates an instance of nvrtcProgram with the + * given input parameters, and sets the output parameter \p prog with + * it. + * + * \param [out] prog CUDA Runtime Compilation program. + * \param [in] src CUDA program source. + * \param [in] name CUDA program name.\n + * \p name can be \c NULL; \c "default_program" is + * used when \p name is \c NULL. + * \param [in] numHeaders Number of headers used.\n + * \p numHeaders must be greater than or equal to 0. + * \param [in] headers Sources of the headers.\n + * \p headers can be \c NULL when \p numHeaders is + * 0. + * \param [in] includeNames Name of each header by which they can be + * included in the CUDA program source.\n + * \p includeNames can be \c NULL when \p numHeaders + * is 0. + * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink + * - \link #nvrtcResult NVRTC_ERROR_PROGRAM_CREATION_FAILURE \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink + * + * \see ::nvrtcDestroyProgram + */ +nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, + const char *src, + const char *name, + int numHeaders, + const char * const *headers, + const char * const *includeNames); + + +/** + * \ingroup compilation + * \brief nvrtcDestroyProgram destroys the given program. + * + * \param [in] prog CUDA Runtime Compilation program. + * \return + * - \link #nvrtcResult NVRTC_SUCCESS \endlink + * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink + * + * \see ::nvrtcCreateProgram + */ +nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog); + + +/** + * \ingroup compilation + * \brief nvrtcCompileProgram compiles the given program. + * + * It supports compile options listed in \ref options. 
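+ *
+ * A minimal end-to-end sketch of the compilation workflow (error checks abbreviated; the kernel
+ * source string and names are illustrative only):
+ *
+ * \code
+ * nvrtcProgram prog;
+ * const char *src = "__global__ void scale(float *x) { x[threadIdx.x] *= 2.0f; }";
+ * nvrtcCreateProgram(&prog, src, "scale.cu", 0, NULL, NULL);
+ * const char *opts[] = { "--gpu-architecture=compute_30" };
+ * nvrtcResult res = nvrtcCompileProgram(prog, 1, opts);
+ * if (res != NVRTC_SUCCESS) {
+ *     // On failure, inspect the log via nvrtcGetProgramLogSize / nvrtcGetProgramLog.
+ * }
+ * size_t ptxSize;
+ * nvrtcGetPTXSize(prog, &ptxSize);
+ * char *ptx = (char *)malloc(ptxSize);
+ * nvrtcGetPTX(prog, ptx);   // ptx now holds the generated PTX
+ * nvrtcDestroyProgram(&prog);
+ * \endcode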
+ */
+nvrtcResult nvrtcCompileProgram(nvrtcProgram prog,
+                                int numOptions, const char * const *options);
+
+
+/**
+ * \ingroup compilation
+ * \brief nvrtcGetPTXSize sets \p ptxSizeRet with the size of the PTX
+ *        generated by the previous compilation of \p prog (including the
+ *        trailing \c NULL).
+ *
+ * \param [in] prog CUDA Runtime Compilation program.
+ * \param [out] ptxSizeRet Size of the generated PTX (including the trailing
+ *              \c NULL).
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *
+ * \see ::nvrtcGetPTX
+ */
+nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
+
+
+/**
+ * \ingroup compilation
+ * \brief nvrtcGetPTX stores the PTX generated by the previous compilation
+ *        of \p prog in the memory pointed to by \p ptx.
+ *
+ * \param [in] prog CUDA Runtime Compilation program.
+ * \param [out] ptx Compiled result.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *
+ * \see ::nvrtcGetPTXSize
+ */
+nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
+
+
+/**
+ * \ingroup compilation
+ * \brief nvrtcGetProgramLogSize sets \p logSizeRet with the size of the
+ *        log generated by the previous compilation of \p prog (including the
+ *        trailing \c NULL).
+ *
+ * Note that the compilation log may be generated with warnings and informative
+ * messages, even when the compilation of \p prog succeeds.
+ *
+ * \param [in] prog CUDA Runtime Compilation program.
+ * \param [out] logSizeRet Size of the compilation log
+ *              (including the trailing \c NULL).
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *
+ * \see ::nvrtcGetProgramLog
+ */
+nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet);
+
+
+/**
+ * \ingroup compilation
+ * \brief nvrtcGetProgramLog stores the log generated by the previous
+ *        compilation of \p prog in the memory pointed to by \p log.
+ *
+ * \param [in] prog CUDA Runtime Compilation program.
+ * \param [out] log Compilation log.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *
+ * \see ::nvrtcGetProgramLogSize
+ */
+nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
+
+
+/**
+ * \ingroup compilation
+ * \brief nvrtcAddNameExpression notes the given name expression
+ *        denoting a __global__ function or function template
+ *        instantiation.
+ *
+ * The identical name expression string must be provided on a subsequent
+ * call to nvrtcGetLoweredName to extract the lowered name.
+ * \param [in] prog CUDA Runtime Compilation program.
+ * \param [in] name_expression constant expression denoting a __global__
+ *             function or function template instantiation.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION \endlink
+ *
+ * \see ::nvrtcGetLoweredName
+ */
+nvrtcResult nvrtcAddNameExpression(nvrtcProgram prog,
+                                   const char * const name_expression);
+
+/**
+ * \ingroup compilation
+ * \brief nvrtcGetLoweredName extracts the lowered (mangled) name
+ *        for a __global__ function or function template instantiation,
+ *        and updates *lowered_name to point to it. The memory containing
+ *        the name is released when the NVRTC program is destroyed by
+ *        nvrtcDestroyProgram.
+ *        The identical name expression must have been previously
+ *        provided to nvrtcAddNameExpression.
+ *
+ * \param [in] prog CUDA Runtime Compilation program.
+ * \param [in] name_expression constant expression denoting a __global__
+ *             function or function template instantiation.
+ * \param [out] lowered_name initialized by the function to point to a
+ *              C string containing the lowered (mangled)
+ *              name corresponding to the provided name expression.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID \endlink
+ *
+ * \see ::nvrtcAddNameExpression
+ */
+nvrtcResult nvrtcGetLoweredName(nvrtcProgram prog,
+                                const char *const name_expression,
+                                const char** lowered_name);
+
+
+/**
+ * \defgroup options Supported Compile Options
+ *
+ * NVRTC supports the compile options below.
+ * Option names with two preceding dashes (\c --) are long option names and
+ * option names with one preceding dash (\c -) are short option names.
+ * Short option names can be used instead of long option names.
+ * When a compile option takes an argument, an assignment operator (\c =)
+ * is used to separate the compile option argument from the compile option
+ * name, e.g., \c "--gpu-architecture=compute_30".
+ * Alternatively, the compile option name and the argument can be specified in
+ * separate strings without an assignment operator, e.g.,
+ * \c "--gpu-architecture" \c "compute_30".
+ * Single-character short option names, such as \c -D, \c -U, and \c -I, do
+ * not require an assignment operator, and the compile option name and the
+ * argument can be present in the same string with or without spaces between
+ * them.
+ * For instance, \c "-D=\<def\>", \c "-D\<def\>", and \c "-D \<def\>" are all
+ * supported.
+ *
+ * The valid compiler options are:
+ *
+ *   - Compilation targets
+ *     - \c --gpu-architecture=\<arch\> (\c -arch)\n
+ *       Specify the name of the class of GPU architectures for which the
+ *       input must be compiled.\n
+ *       - Valid \<arch\>s:
+ *         - \c compute_30
+ *         - \c compute_32
+ *         - \c compute_35
+ *         - \c compute_37
+ *         - \c compute_50
+ *         - \c compute_52
+ *         - \c compute_53
+ *         - \c compute_60
+ *         - \c compute_61
+ *         - \c compute_62
+ *         - \c compute_70
+ *         - \c compute_72
+ *       - Default: \c compute_30
+ *   - Separate compilation / whole-program compilation
+ *     - \c --device-c (\c -dc)\n
+ *       Generate relocatable code that can be linked with other relocatable
+ *       device code. It is equivalent to \c --relocatable-device-code=true.
+ *     - \c --device-w (\c -dw)\n
+ *       Generate non-relocatable code. It is equivalent to
+ *       \c --relocatable-device-code=false.
+ *     - \c --relocatable-device-code={true|false} (\c -rdc)\n
+ *       Enable (disable) the generation of relocatable device code.
+ *       - Default: \c false
+ *   - Debugging support
+ *     - \c --device-debug (\c -G)\n
+ *       Generate debug information.
+ *     - \c --generate-line-info (\c -lineinfo)\n
+ *       Generate line-number information.
+ *   - Code generation
+ *     - \c --maxrregcount=\<N\> (\c -maxrregcount)\n
+ *       Specify the maximum number of registers that GPU functions can use.
+ *       Up to a function-specific limit, a higher value will generally
+ *       increase the performance of individual GPU threads that execute this
+ *       function. However, because thread registers are allocated from a
+ *       global register pool on each GPU, a higher value of this option will
+ *       also reduce the maximum thread block size, thereby reducing the amount
+ *       of thread parallelism. Hence, a good maxrregcount value is the result
+ *       of a trade-off. If this option is not specified, then no maximum is
+ *       assumed. Values less than the minimum number of registers required by
+ *       the ABI will be bumped up by the compiler to the ABI minimum.
+ *     - \c --ftz={true|false} (\c -ftz)\n
+ *       When performing single-precision floating-point operations, flush
+ *       denormal values to zero or preserve denormal values.
+ *       \c --use_fast_math implies \c --ftz=true.
+ *       - Default: \c false
+ *     - \c --prec-sqrt={true|false} (\c -prec-sqrt)\n
+ *       For single-precision floating-point square root, use IEEE
+ *       round-to-nearest mode or use a faster approximation.
+ *       \c --use_fast_math implies \c --prec-sqrt=false.
+ *       - Default: \c true
+ *     - \c --prec-div={true|false} (\c -prec-div)\n
+ *       For single-precision floating-point division and reciprocals, use IEEE
+ *       round-to-nearest mode or use a faster approximation.
+ *       \c --use_fast_math implies \c --prec-div=false.
+ *       - Default: \c true
+ *     - \c --fmad={true|false} (\c -fmad)\n
+ *       Enables (disables) the contraction of floating-point multiplies and
+ *       adds/subtracts into floating-point multiply-add operations (FMAD,
+ *       FFMA, or DFMA). \c --use_fast_math implies \c --fmad=true.
+ *       - Default: \c true
+ *     - \c --use_fast_math (\c -use_fast_math)\n
+ *       Make use of fast math operations.
+ *       \c --use_fast_math implies \c --ftz=true \c --prec-div=false
+ *       \c --prec-sqrt=false \c --fmad=true.
+ *   - Preprocessing
+ *     - \c --define-macro=\<def\> (\c -D)\n
+ *       \c \<def\> can be either \c \<name\> or \c \<name=definitions\>.
+ *       - \c \<name\> \n
+ *         Predefine \c \<name\> as a macro with definition \c 1.
+ *       - \c \<name\>=\<definition\> \n
+ *         The contents of \c \<definition\> are tokenized and preprocessed
+ *         as if they appeared during translation phase three in a \c \#define
+ *         directive. In particular, the definition will be truncated by
+ *         embedded new line characters.
+ *     - \c --undefine-macro=\<def\> (\c -U)\n
+ *       Cancel any previous definition of \c \<def\>.
+ *     - \c --include-path=\<dir\> (\c -I)\n
+ *       Add the directory \c \<dir\> to the list of directories to be
+ *       searched for headers. These paths are searched after the list of
+ *       headers given to ::nvrtcCreateProgram.
+ *     - \c --pre-include=\<header\> (\c -include)\n
+ *       Preinclude \c \<header\> during preprocessing.
+ *   - Language Dialect
+ *     - \c --std={c++11|c++14} (\c -std={c++11|c++14})\n
+ *       Set language dialect to C++11 or C++14.
+ *     - \c --builtin-move-forward={true|false} (\c -builtin-move-forward)\n
+ *       Provide builtin definitions of \c std::move and \c std::forward,
+ *       when C++11 language dialect is selected.
+ *       - Default: \c true
+ *     - \c --builtin-initializer-list={true|false}
+ *       (\c -builtin-initializer-list)\n
+ *       Provide builtin definitions of \c std::initializer_list class and
+ *       member functions when C++11 language dialect is selected.
+ *       - Default: \c true
+ *   - Misc.
+ *     - \c --disable-warnings (\c -w)\n
+ *       Inhibit all warning messages.
+ *     - \c --restrict (\c -restrict)\n
+ *       Programmer assertion that all kernel pointer parameters are restrict
+ *       pointers.
+ *     - \c --device-as-default-execution-space
+ *       (\c -default-device)\n
+ *       Treat entities with no execution space annotation as \c __device__
+ *       entities.
+ *
+ * \param [in] prog CUDA Runtime Compilation program.
+ * \param [in] numOptions Number of compiler options passed.
+ * \param [in] options Compiler options in the form of C string array.\n
+ *             \p options can be \c NULL when \p numOptions is 0.
+ *
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_OPTION \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_COMPILATION \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_BUILTIN_OPERATION_FAILURE \endlink
+ */
+
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+
+/* The utility function 'nvrtcGetTypeName' is not available by default. Define
+   the macro 'NVRTC_GET_TYPE_NAME' to a non-zero value to make it available.
+*/
+
+#if NVRTC_GET_TYPE_NAME || __DOXYGEN_ONLY__
+
+#if NVRTC_USE_CXXABI || __clang__ || __GNUC__ || __DOXYGEN_ONLY__
+#include <cxxabi.h>
+#include <cstdlib>
+
+#elif defined(_WIN32)
+#include <Windows.h>
+#include <DbgHelp.h>
+#endif /* NVRTC_USE_CXXABI || __clang__ || __GNUC__ */
+
+
+#include <string>
+#include <typeinfo>
+
+
+/*************************************************************************//**
+ *
+ * \defgroup hosthelper Host Helper
+ *
+ * NVRTC defines the following functions for easier interaction with host code.
+ *
+ ****************************************************************************/
+
+/**
+ * \ingroup hosthelper
+ * \brief nvrtcGetTypeName stores the source level name of the template type argument
+ *        T in the given std::string location.
+ *
+ * This function is only provided when the macro NVRTC_GET_TYPE_NAME is
+ * defined with a non-zero value. It uses abi::__cxa_demangle or UnDecorateSymbolName
+ * function calls to extract the type name, when using gcc/clang or cl.exe compilers,
+ * respectively. If the name extraction fails, it will return NVRTC_ERROR_INTERNAL_ERROR,
+ * otherwise *result is initialized with the extracted name.
+ *
+ * \param [in] result: pointer to std::string in which to store the type name.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INTERNAL_ERROR \endlink
+ *
+ */
+
+template <typename T>
+nvrtcResult nvrtcGetTypeName(std::string *result)
+{
+  const char *name = typeid(T).name();
+
+#if USE_CXXABI || __clang__ || __GNUC__
+  int status;
+  char *undecorated_name = abi::__cxa_demangle(name, 0, 0, &status);
+  if (status == 0) {
+    *result = undecorated_name;
+    free(undecorated_name);
+    return NVRTC_SUCCESS;
+  }
+#elif defined(_WIN32)
+  char undecorated_name[4096];
+  if(UnDecorateSymbolName(name, undecorated_name,
+                          sizeof(undecorated_name) / sizeof(*undecorated_name),
+                          UNDNAME_COMPLETE) ) {
+    *result = undecorated_name;
+    return NVRTC_SUCCESS;
+  }
+#endif /* USE_CXXABI || __clang__ || __GNUC__ */
+  return NVRTC_ERROR_INTERNAL_ERROR;
+}
+#endif /* NVRTC_GET_TYPE_NAME */
+
+#endif /* __NVRTC_H__ */
diff --git a/include/external/CUDA/surface_types.h b/include/external/CUDA/surface_types.h
new file mode 100755
index 000000000..95ff57ca1
--- /dev/null
+++ b/include/external/CUDA/surface_types.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
+ * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__SURFACE_TYPES_H__) +#define __SURFACE_TYPES_H__ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "driver_types.h" + +/** + * \addtogroup CUDART_TYPES + * + * @{ + */ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#define cudaSurfaceType1D 0x01 +#define cudaSurfaceType2D 0x02 +#define cudaSurfaceType3D 0x03 +#define cudaSurfaceTypeCubemap 0x0C +#define cudaSurfaceType1DLayered 0xF1 +#define cudaSurfaceType2DLayered 0xF2 +#define cudaSurfaceTypeCubemapLayered 0xFC + +/** + * CUDA Surface boundary modes + */ +enum __device_builtin__ cudaSurfaceBoundaryMode +{ + cudaBoundaryModeZero = 0, /**< Zero boundary mode */ + cudaBoundaryModeClamp = 1, /**< Clamp boundary mode */ + cudaBoundaryModeTrap = 2 /**< Trap boundary mode */ +}; + +/** + * CUDA Surface format modes + */ +enum __device_builtin__ cudaSurfaceFormatMode +{ + cudaFormatModeForced = 0, /**< Forced format mode */ + cudaFormatModeAuto = 1 /**< Auto format mode */ +}; + +/** + * CUDA Surface reference + */ +struct __device_builtin__ surfaceReference +{ + /** + * Channel descriptor for surface reference + */ + struct cudaChannelFormatDesc channelDesc; +}; + +/** + * An opaque value that represents a CUDA Surface object + */ +typedef __device_builtin__ unsigned long long cudaSurfaceObject_t; + +/** @} */ +/** @} */ /* END CUDART_TYPES */ + +#endif /* !__SURFACE_TYPES_H__ */ diff --git a/include/external/CUDA/texture_types.h b/include/external/CUDA/texture_types.h new file mode 100755 index 000000000..dda31dd72 --- /dev/null +++ b/include/external/CUDA/texture_types.h @@ -0,0 +1,217 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__TEXTURE_TYPES_H__) +#define __TEXTURE_TYPES_H__ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "driver_types.h" + +/** + * \addtogroup CUDART_TYPES + * + * @{ + */ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#define cudaTextureType1D 0x01 +#define cudaTextureType2D 0x02 +#define cudaTextureType3D 0x03 +#define cudaTextureTypeCubemap 0x0C +#define cudaTextureType1DLayered 0xF1 +#define cudaTextureType2DLayered 0xF2 +#define cudaTextureTypeCubemapLayered 0xFC + +/** + * CUDA texture address modes + */ +enum __device_builtin__ cudaTextureAddressMode +{ + cudaAddressModeWrap = 0, /**< Wrapping address mode */ + cudaAddressModeClamp = 1, /**< Clamp to edge address mode */ + cudaAddressModeMirror = 2, /**< Mirror address mode */ + cudaAddressModeBorder = 3 /**< Border address mode */ +}; + +/** + * CUDA texture filter modes + */ +enum __device_builtin__ cudaTextureFilterMode +{ + cudaFilterModePoint = 0, /**< Point filter mode */ + cudaFilterModeLinear = 1 /**< Linear filter mode */ +}; + +/** + * CUDA texture read modes + */ +enum __device_builtin__ cudaTextureReadMode +{ + cudaReadModeElementType = 0, /**< Read texture as specified element type */ + cudaReadModeNormalizedFloat = 1 /**< Read texture as normalized float */ +}; + +/** + * CUDA texture reference + */ +struct __device_builtin__ textureReference +{ + /** + * Indicates whether texture reads are normalized or not + */ + int normalized; + /** + * Texture filter mode + */ + enum cudaTextureFilterMode filterMode; + /** + * Texture address mode for up to 3 dimensions + */ + enum cudaTextureAddressMode addressMode[3]; + /** + * Channel descriptor for the texture reference + */ + struct cudaChannelFormatDesc channelDesc; + /** + * Perform sRGB->linear conversion during texture read + */ + int sRGB; + /** + * Limit to the anisotropy ratio + */ + unsigned int maxAnisotropy; + /** + * Mipmap filter mode + */ + enum cudaTextureFilterMode mipmapFilterMode; + /** + * Offset applied to the supplied mipmap 
level + */ + float mipmapLevelBias; + /** + * Lower end of the mipmap level range to clamp access to + */ + float minMipmapLevelClamp; + /** + * Upper end of the mipmap level range to clamp access to + */ + float maxMipmapLevelClamp; + int __cudaReserved[15]; +}; + +/** + * CUDA texture descriptor + */ +struct __device_builtin__ cudaTextureDesc +{ + /** + * Texture address mode for up to 3 dimensions + */ + enum cudaTextureAddressMode addressMode[3]; + /** + * Texture filter mode + */ + enum cudaTextureFilterMode filterMode; + /** + * Texture read mode + */ + enum cudaTextureReadMode readMode; + /** + * Perform sRGB->linear conversion during texture read + */ + int sRGB; + /** + * Texture Border Color + */ + float borderColor[4]; + /** + * Indicates whether texture reads are normalized or not + */ + int normalizedCoords; + /** + * Limit to the anisotropy ratio + */ + unsigned int maxAnisotropy; + /** + * Mipmap filter mode + */ + enum cudaTextureFilterMode mipmapFilterMode; + /** + * Offset applied to the supplied mipmap level + */ + float mipmapLevelBias; + /** + * Lower end of the mipmap level range to clamp access to + */ + float minMipmapLevelClamp; + /** + * Upper end of the mipmap level range to clamp access to + */ + float maxMipmapLevelClamp; +}; + +/** + * An opaque value that represents a CUDA texture object + */ +typedef __device_builtin__ unsigned long long cudaTextureObject_t; + +/** @} */ +/** @} */ /* END CUDART_TYPES */ + +#endif /* !__TEXTURE_TYPES_H__ */ diff --git a/include/external/CUDA/vector_functions.h b/include/external/CUDA/vector_functions.h new file mode 100755 index 000000000..8ffb37122 --- /dev/null +++ b/include/external/CUDA/vector_functions.h @@ -0,0 +1,177 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. 
These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__VECTOR_FUNCTIONS_H__) +#define __VECTOR_FUNCTIONS_H__ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "builtin_types.h" +#include "host_defines.h" +#include "vector_types.h" + +#if defined(__CUDACC_RTC__) +#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__ +#else /* !__CUDACC_RTC__ */ +#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__ +#endif /* __CUDACC_RTC__ */ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x); + +__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x); + +__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y); + +__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y); + +__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z); + +__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z); + +__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w); + +__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w); + +__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x); + +__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x); + +__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y); + +__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y); + +__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z); + +__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z); + +__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w); + +__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w); + +__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x); + +__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x); + +__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y); + +__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y); + +__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z); + +__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z); + +__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w); + +__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w); + +__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x); + 
+__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x); + +__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y); + +__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y); + +__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z); + +__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z); + +__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w); + +__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w); + +__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x); + +__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y); + +__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z); + +__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w); + +__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x); + +__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x); + +__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y); + +__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y); + +__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z); + +__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z); + +__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w); + +__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w); + +__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x); + +__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y); + +__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z); + +__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w); + +#undef __VECTOR_FUNCTIONS_DECL__ + +#if !defined(__CUDACC_RTC__) +#include "vector_functions.hpp" +#endif /* !__CUDACC_RTC__ */ + +#endif /* !__VECTOR_FUNCTIONS_H__ */ diff --git a/include/external/CUDA/vector_functions.hpp b/include/external/CUDA/vector_functions.hpp new file mode 100755 index 000000000..2ee5d5890 --- /dev/null +++ b/include/external/CUDA/vector_functions.hpp @@ -0,0 +1,318 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. 
IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__VECTOR_FUNCTIONS_HPP__) +#define __VECTOR_FUNCTIONS_HPP__ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "builtin_types.h" +#include "host_defines.h" +#include "vector_types.h" + +#if defined(__CUDACC_RTC__) +#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__ +#else /* !__CUDACC_RTC__ */ +#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__ +#endif /* __CUDACC_RTC__ */ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x) +{ + char1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x) +{ + uchar1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y) +{ + char2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y) +{ + uchar2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z) +{ + char3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z) +{ + uchar3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w) +{ + char4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w) +{ + uchar4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x) +{ + short1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x) +{ + ushort1 t; t.x 
= x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y) +{ + short2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y) +{ + ushort2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z) +{ + short3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z) +{ + ushort3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w) +{ + short4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w) +{ + ushort4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x) +{ + int1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x) +{ + uint1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y) +{ + int2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y) +{ + uint2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z) +{ + int3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z) +{ + uint3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w) +{ + int4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w) +{ + uint4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x) +{ + long1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x) +{ + ulong1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y) +{ + long2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y) +{ + ulong2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z) +{ + long3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z) +{ + ulong3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w) +{ + long4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w) +{ + ulong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x) +{ + float1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y) +{ + float2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z) +{ + float3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w) +{ + float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x) +{ + longlong1 t; t.x = 
x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x) +{ + ulonglong1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y) +{ + longlong2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y) +{ + ulonglong2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z) +{ + longlong3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z) +{ + ulonglong3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w) +{ + longlong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w) +{ + ulonglong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x) +{ + double1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y) +{ + double2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z) +{ + double3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w) +{ + double4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +#undef __VECTOR_FUNCTIONS_DECL__ + +#endif /* !__VECTOR_FUNCTIONS_HPP__ */ + diff --git a/include/external/CUDA/vector_types.h b/include/external/CUDA/vector_types.h new file mode 100755 index 000000000..63d9e680b --- /dev/null +++ b/include/external/CUDA/vector_types.h @@ -0,0 +1,425 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__VECTOR_TYPES_H__) +#define __VECTOR_TYPES_H__ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "host_defines.h" + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#if !defined(__CUDACC__) && !defined(__CUDACC_RTC__) && \ + defined(_WIN32) && !defined(_WIN64) + +#pragma warning(push) +#pragma warning(disable: 4201 4408) + +#define __cuda_builtin_vector_align8(tag, members) \ +struct __device_builtin__ tag \ +{ \ + union \ + { \ + struct { members }; \ + struct { long long int :1,:0; }; \ + }; \ +} + +#else /* !__CUDACC__ && !__CUDACC_RTC__ && _WIN32 && !_WIN64 */ + +#define __cuda_builtin_vector_align8(tag, members) \ +struct __device_builtin__ __align__(8) tag \ +{ \ + members \ +} + +#endif /* !__CUDACC__ && !__CUDACC_RTC__ && _WIN32 && !_WIN64 */ + +struct __device_builtin__ char1 +{ + signed char x; +}; + +struct __device_builtin__ uchar1 +{ + unsigned char x; +}; + + +struct __device_builtin__ __align__(2) char2 +{ + signed char x, y; +}; + +struct __device_builtin__ __align__(2) uchar2 +{ + unsigned char x, y; +}; + +struct __device_builtin__ char3 +{ + signed char x, y, z; +}; + +struct __device_builtin__ uchar3 +{ + unsigned char x, y, z; +}; + +struct __device_builtin__ __align__(4) char4 +{ + signed char x, y, z, w; +}; + +struct __device_builtin__ __align__(4) uchar4 +{ + unsigned char x, y, z, w; +}; + +struct __device_builtin__ short1 +{ + short x; +}; + +struct __device_builtin__ ushort1 +{ + unsigned short x; +}; + +struct __device_builtin__ __align__(4) short2 +{ + short x, y; +}; + +struct __device_builtin__ __align__(4) ushort2 +{ + unsigned short x, y; +}; + +struct __device_builtin__ short3 +{ + short x, y, z; +}; + +struct __device_builtin__ ushort3 +{ + unsigned short x, y, z; +}; + +__cuda_builtin_vector_align8(short4, short x; short y; short z; short w;); +__cuda_builtin_vector_align8(ushort4, unsigned short x; unsigned short y; unsigned short z; unsigned short w;); + +struct __device_builtin__ int1 +{ + int x; +}; + +struct 
__device_builtin__ uint1 +{ + unsigned int x; +}; + +__cuda_builtin_vector_align8(int2, int x; int y;); +__cuda_builtin_vector_align8(uint2, unsigned int x; unsigned int y;); + +struct __device_builtin__ int3 +{ + int x, y, z; +}; + +struct __device_builtin__ uint3 +{ + unsigned int x, y, z; +}; + +struct __device_builtin__ __builtin_align__(16) int4 +{ + int x, y, z, w; +}; + +struct __device_builtin__ __builtin_align__(16) uint4 +{ + unsigned int x, y, z, w; +}; + +struct __device_builtin__ long1 +{ + long int x; +}; + +struct __device_builtin__ ulong1 +{ + unsigned long x; +}; + +#if defined(_WIN32) +__cuda_builtin_vector_align8(long2, long int x; long int y;); +__cuda_builtin_vector_align8(ulong2, unsigned long int x; unsigned long int y;); +#else /* !_WIN32 */ + +struct __device_builtin__ __align__(2*sizeof(long int)) long2 +{ + long int x, y; +}; + +struct __device_builtin__ __align__(2*sizeof(unsigned long int)) ulong2 +{ + unsigned long int x, y; +}; + +#endif /* _WIN32 */ + +struct __device_builtin__ long3 +{ + long int x, y, z; +}; + +struct __device_builtin__ ulong3 +{ + unsigned long int x, y, z; +}; + +struct __device_builtin__ __builtin_align__(16) long4 +{ + long int x, y, z, w; +}; + +struct __device_builtin__ __builtin_align__(16) ulong4 +{ + unsigned long int x, y, z, w; +}; + +struct __device_builtin__ float1 +{ + float x; +}; + +#if !defined(__CUDACC__) && defined(__arm__) && \ + defined(__ARM_PCS_VFP) && __GNUC__ == 4 && __GNUC_MINOR__ == 6 + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-pedantic" + +struct __device_builtin__ __attribute__((aligned(8))) float2 +{ + float x; float y; float __cuda_gnu_arm_ice_workaround[0]; +}; + +#pragma GCC poison __cuda_gnu_arm_ice_workaround +#pragma GCC diagnostic pop + +#else /* !__CUDACC__ && __arm__ && __ARM_PCS_VFP && + __GNUC__ == 4&& __GNUC_MINOR__ == 6 */ + +__cuda_builtin_vector_align8(float2, float x; float y;); + +#endif /* !__CUDACC__ && __arm__ && __ARM_PCS_VFP && + __GNUC__ == 4&& __GNUC_MINOR__ == 6 */ + +struct __device_builtin__ float3 +{ + float x, y, z; +}; + +struct __device_builtin__ __builtin_align__(16) float4 +{ + float x, y, z, w; +}; + +struct __device_builtin__ longlong1 +{ + long long int x; +}; + +struct __device_builtin__ ulonglong1 +{ + unsigned long long int x; +}; + +struct __device_builtin__ __builtin_align__(16) longlong2 +{ + long long int x, y; +}; + +struct __device_builtin__ __builtin_align__(16) ulonglong2 +{ + unsigned long long int x, y; +}; + +struct __device_builtin__ longlong3 +{ + long long int x, y, z; +}; + +struct __device_builtin__ ulonglong3 +{ + unsigned long long int x, y, z; +}; + +struct __device_builtin__ __builtin_align__(16) longlong4 +{ + long long int x, y, z ,w; +}; + +struct __device_builtin__ __builtin_align__(16) ulonglong4 +{ + unsigned long long int x, y, z, w; +}; + +struct __device_builtin__ double1 +{ + double x; +}; + +struct __device_builtin__ __builtin_align__(16) double2 +{ + double x, y; +}; + +struct __device_builtin__ double3 +{ + double x, y, z; +}; + +struct __device_builtin__ __builtin_align__(16) double4 +{ + double x, y, z, w; +}; + +#if !defined(__CUDACC__) && defined(_WIN32) && !defined(_WIN64) + +#pragma warning(pop) + +#endif /* !__CUDACC__ && _WIN32 && !_WIN64 */ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +typedef __device_builtin__ struct char1 char1; +typedef __device_builtin__ struct uchar1 
uchar1; +typedef __device_builtin__ struct char2 char2; +typedef __device_builtin__ struct uchar2 uchar2; +typedef __device_builtin__ struct char3 char3; +typedef __device_builtin__ struct uchar3 uchar3; +typedef __device_builtin__ struct char4 char4; +typedef __device_builtin__ struct uchar4 uchar4; +typedef __device_builtin__ struct short1 short1; +typedef __device_builtin__ struct ushort1 ushort1; +typedef __device_builtin__ struct short2 short2; +typedef __device_builtin__ struct ushort2 ushort2; +typedef __device_builtin__ struct short3 short3; +typedef __device_builtin__ struct ushort3 ushort3; +typedef __device_builtin__ struct short4 short4; +typedef __device_builtin__ struct ushort4 ushort4; +typedef __device_builtin__ struct int1 int1; +typedef __device_builtin__ struct uint1 uint1; +typedef __device_builtin__ struct int2 int2; +typedef __device_builtin__ struct uint2 uint2; +typedef __device_builtin__ struct int3 int3; +typedef __device_builtin__ struct uint3 uint3; +typedef __device_builtin__ struct int4 int4; +typedef __device_builtin__ struct uint4 uint4; +typedef __device_builtin__ struct long1 long1; +typedef __device_builtin__ struct ulong1 ulong1; +typedef __device_builtin__ struct long2 long2; +typedef __device_builtin__ struct ulong2 ulong2; +typedef __device_builtin__ struct long3 long3; +typedef __device_builtin__ struct ulong3 ulong3; +typedef __device_builtin__ struct long4 long4; +typedef __device_builtin__ struct ulong4 ulong4; +typedef __device_builtin__ struct float1 float1; +typedef __device_builtin__ struct float2 float2; +typedef __device_builtin__ struct float3 float3; +typedef __device_builtin__ struct float4 float4; +typedef __device_builtin__ struct longlong1 longlong1; +typedef __device_builtin__ struct ulonglong1 ulonglong1; +typedef __device_builtin__ struct longlong2 longlong2; +typedef __device_builtin__ struct ulonglong2 ulonglong2; +typedef __device_builtin__ struct longlong3 longlong3; +typedef __device_builtin__ struct ulonglong3 ulonglong3; +typedef __device_builtin__ struct longlong4 longlong4; +typedef __device_builtin__ struct ulonglong4 ulonglong4; +typedef __device_builtin__ struct double1 double1; +typedef __device_builtin__ struct double2 double2; +typedef __device_builtin__ struct double3 double3; +typedef __device_builtin__ struct double4 double4; + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +struct __device_builtin__ dim3 +{ + unsigned int x, y, z; +#if defined(__cplusplus) + __host__ __device__ dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {} + __host__ __device__ dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {} + __host__ __device__ operator uint3(void) { uint3 t; t.x = x; t.y = y; t.z = z; return t; } +#endif /* __cplusplus */ +}; + +typedef __device_builtin__ struct dim3 dim3; + +#undef __cuda_builtin_vector_align8 + +#endif /* !__VECTOR_TYPES_H__ */ diff --git a/include/tools/sys/getenv.hpp b/include/tools/sys/getenv.hpp new file mode 100755 index 000000000..e10664b6f --- /dev/null +++ b/include/tools/sys/getenv.hpp @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2015, PHILIPPE TILLET. All rights reserved. + * + * This file is part of ISAAC. 
+ *
+ * ISAAC is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301 USA
+ */
+
+#ifndef TDL_TOOLS_SYS_GETENV_HPP
+#define TDL_TOOLS_SYS_GETENV_HPP
+
+#include <string>
+#include <cstdlib>
+
+namespace tdl
+{
+
+namespace tools
+{
+
+  inline std::string getenv(const char * name)
+  {
+    #ifdef _MSC_VER
+      char* cache_path = 0;
+      std::size_t sz = 0;
+      _dupenv_s(&cache_path, &sz, name);
+    #else
+      const char * cache_path = std::getenv(name);
+    #endif
+    if(!cache_path)
+      return "";
+    std::string result(cache_path);
+    #ifdef _MSC_VER
+      free(cache_path);
+    #endif
+    return result;
+  }
+
+}
+
+}
+
+#endif
diff --git a/include/tools/sys/mkdir.hpp b/include/tools/sys/mkdir.hpp
new file mode 100755
index 000000000..5d82a7535
--- /dev/null
+++ b/include/tools/sys/mkdir.hpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2015, PHILIPPE TILLET. All rights reserved.
+ *
+ * This file is part of ISAAC.
+ *
+ * ISAAC is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301 USA
+ */
+
+#ifndef TDL_TOOLS_SYS_MKDIR_HPP
+#define TDL_TOOLS_SYS_MKDIR_HPP
+
+#include <cerrno>
+#include <cstddef>
+#include <string>
+#include <sys/stat.h>
+#include <sys/types.h>
+#if defined(_WIN32)
+  #include <direct.h>
+#endif
+
+namespace tdl
+{
+
+namespace tools
+{
+
+  inline int mkdir(std::string const & path)
+  {
+    #if defined(_WIN32)
+      return _mkdir(path.c_str());
+    #else
+      return ::mkdir(path.c_str(), 0777);
+    #endif
+  }
+
+  inline int mkpath(std::string const & path)
+  {
+    int status = 0;
+    size_t pp = 0;
+    size_t sp;
+    while ((sp = path.find('/', pp)) != std::string::npos)
+    {
+      if (sp != pp){
+        status = mkdir(path.substr(0, sp));
+      }
+      pp = sp + 1;
+    }
+    return (status==0 || errno==EEXIST)?0:-1;
+  }
+
+}
+
+}
+
+#endif
diff --git a/lib/driver/backend.cpp b/lib/driver/backend.cpp
new file mode 100755
index 000000000..bddb419df
--- /dev/null
+++ b/lib/driver/backend.cpp
@@ -0,0 +1,196 @@
+/* Copyright 2015-2017 Philippe Tillet
+*
+* Permission is hereby granted, free of charge, to any person obtaining
+* a copy of this software and associated documentation files
+* (the "Software"), to deal in the Software without restriction,
+* including without limitation the rights to use, copy, modify, merge,
+* publish, distribute, sublicense, and/or sell copies of the Software,
+* and to permit persons to whom the Software is furnished to do so,
+* subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include "driver/dispatch.h"
+#include "driver/backend.h"
+#include "driver/buffer.h"
+#include "driver/context.h"
+#include "driver/stream.h"
+#include "driver/kernel.h"
+
+#include <list>
+#include <stdexcept>
+#include <vector>
+
+namespace tdl
+{
+
+namespace driver
+{
+
+/*-----------------------------------*/
+//---------- Modules ----------------*/
+/*-----------------------------------*/
+
+void backend::modules::release(){
+  for(auto & x: cache_)
+    delete x.second;
+  cache_.clear();
+}
+
+Module& backend::modules::get(Stream const & stream, std::string const & name, std::string const & src){
+  std::tuple<Stream, std::string> key(stream, name);
+  if(cache_.find(key)==cache_.end())
+    return *cache_.insert(std::make_pair(key, new Module(stream.context(), src))).first->second;
+  return *cache_.at(key);
+}
+
+std::map<std::tuple<Stream, std::string>, Module * > backend::modules::cache_;
+
+/*-----------------------------------*/
+//----------- Kernels --------------*/
+/*-----------------------------------*/
+
+void backend::kernels::release(){
+  for(auto & x: cache_)
+    delete x.second;
+  cache_.clear();
+}
+
+Kernel & backend::kernels::get(Module const & program, std::string const & name){
+  std::tuple<Module, std::string> key(program, name);
+  if(cache_.find(key)==cache_.end())
+    return *cache_.insert(std::make_pair(key, new Kernel(program, name.c_str()))).first->second;
+  return *cache_.at(key);
+}
+
+std::map<std::tuple<Module, std::string>, Kernel * > backend::kernels::cache_;
+
+/*-----------------------------------*/
+//------------ Queues --------------*/
+/*-----------------------------------*/
+
+void backend::streams::init(std::list<Context const *> const & contexts){
+  for(Context const * ctx : contexts)
+    if(cache_.find(*ctx)==cache_.end())
+      cache_.insert(std::make_pair(*ctx, std::vector<Stream *>{new Stream(*ctx)}));
+}
+
+void backend::streams::release(){
+  for(auto & x: cache_)
+    for(auto & y: x.second)
+      delete y;
+  cache_.clear();
+}
+
+Stream & backend::streams::get_default()
+{ return get(contexts::get_default(), 0); }
+
+Stream & backend::streams::get(Context const & context, unsigned int id){
+  init(std::list<Context const *>(1,&context));
+  for(auto & x : cache_)
+    if(x.first==context)
+      return *x.second[id];
+  throw std::runtime_error("driver: no stream found for this context");
+}
+
+void backend::streams::get(Context const & context, std::vector<Stream *> & queues){
+  init(std::list<Context const *>(1,&context));
+  queues = cache_.at(context);
+}
+
+std::map<Context, std::vector<Stream *> > backend::streams::cache_;
+
+/*-----------------------------------*/
+//------------ Contexts ------------*/
+/*-----------------------------------*/
+
+void backend::contexts::init(std::vector<Platform> const & platforms){
+  for(Platform const & platform: platforms){
+    for(Device const & device: platform.devices())
+      cache_.push_back(new Context(device));
+  }
+}
+
+void backend::contexts::release(){
+  for(auto & x: cache_)
+    delete x;
+  cache_.clear();
+}
+
+Context const & backend::contexts::get_default(){
+  backend::init();
+  std::list<Context const *>::const_iterator it = cache_.begin();
+  std::advance(it, default_device);
+  return **it;
+}
+
+void backend::contexts::get(std::list<Context const *> & contexts){
+  backend::init();
+  contexts = cache_;
+}
+
+std::list<Context const *> backend::contexts::cache_;
+
+
+
+/*-----------------------------------*/
+//------------ General -------------*/
+/*-----------------------------------*/
+
+std::vector<Device> backend::devices(){
+  std::vector<Platform> platforms = backend::platforms();
+  std::vector<Device> result;
+  for(Platform const & platform: platforms){
+    auto devices = platform.devices();
+    result.insert(result.end(), devices.begin(), devices.end());
+  }
+  return result;
+}
+
+std::vector<Platform> backend::platforms(){
+  std::vector<Platform> platforms;
+  //if CUDA is here
+  if(dispatch::cuinit())
+ 
platforms.push_back(Platform()); + if(platforms.empty()) + throw std::runtime_error("ISAAC: No backend available. Make sure CUDA is available in your library path"); + return platforms; +} + +void backend::synchronize(Context const & context){ + for(Stream * queue: streams::cache_.at(context)) + queue->synchronize(); +} + + +void backend::release(){ + backend::kernels::release(); +// backend::programs::release(); + backend::streams::release(); + backend::contexts::release(); +} + + +void backend::init(){ + if(!contexts::cache_.empty()) + return; + std::vector platforms = backend::platforms(); + contexts::init(platforms); + streams::init(contexts::cache_); +} + +unsigned int backend::default_device = 0; + +} + +} diff --git a/lib/driver/buffer.cpp b/lib/driver/buffer.cpp new file mode 100755 index 000000000..aa770a05d --- /dev/null +++ b/lib/driver/buffer.cpp @@ -0,0 +1,60 @@ +/* Copyright 2015-2017 Philippe Tillet +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, +* subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +#include +#include "driver/stream.h" +#include "driver/buffer.h" +#include "driver/context.h" +#include "driver/dispatch.h" + + +namespace tdl +{ + +namespace driver +{ + +Buffer::Buffer(Context const & context, size_t size) : context_(context) +{ + ContextSwitcher ctx_switch(context_); + dispatch::cuMemAlloc(&*cu_, size); +} + +Buffer::Buffer(Context const & context, CUdeviceptr cu, bool take_ownership): + context_(context), cu_(cu, take_ownership) +{ } + +void Buffer::set_zero(Stream const & queue, size_t size) +{ + ContextSwitcher ctx_switch(context_); + dispatch::cuMemsetD8Async(*cu_, 0, size, queue); +} + +Handle const & Buffer::cu() const +{ return cu_; } + +Handle & Buffer::cu() +{ return cu_; } + +} + +} diff --git a/lib/driver/context.cpp b/lib/driver/context.cpp new file mode 100755 index 000000000..9da2c6978 --- /dev/null +++ b/lib/driver/context.cpp @@ -0,0 +1,99 @@ +/* Copyright 2015-2017 Philippe Tillet +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, +* subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +#include +#include + +#include "driver/context.h" +#include "driver/module.h" + +#include "tools/sys/getenv.hpp" +#include "tools/sys/mkdir.hpp" + +namespace tdl +{ + +namespace driver +{ + +std::string Context::get_cache_path(){ + //user-specified cache path + std::string result = tools::getenv("ISAAC_CACHE_PATH"); + if(!result.empty()){ + if(tools::mkpath(result)==0) + return result; + } + //create in home + result = tools::getenv("HOME"); + if(!result.empty()) + { + result = result + "/.isaac/cache/"; + if(tools::mkpath(result)==0) + return result; + } + //couldn't find a directory + return ""; +} + +CUdevice Context::device(CUcontext context){ + dispatch::cuCtxPushCurrent_v2(context); + CUdevice res; + dispatch::cuCtxGetDevice(&res); + dispatch::cuCtxPopCurrent_v2(NULL); + return res; +} + +Context::Context(CUcontext context, bool take_ownership): cu_(context, take_ownership), device_(device(context), false), cache_path_(get_cache_path()) +{ } + +Context::Context(Device const & device): device_(device), cache_path_(get_cache_path()) +{ + dispatch::cuCtxCreate(&*cu_, CU_CTX_SCHED_AUTO, (CUdevice)device); + dispatch::cuCtxPopCurrent_v2(NULL); +} + +Device const & Context::device() const +{ return device_; } + +std::string const & Context::cache_path() const +{ return cache_path_; } + +Handle const & Context::cu() const +{ return cu_; } + +/* Context Switcher */ +ContextSwitcher::ContextSwitcher(Context const & ctx): ctx_(ctx) +{ + dispatch::cuCtxPushCurrent_v2(ctx_); +} + +ContextSwitcher::~ContextSwitcher() +{ + CUcontext tmp; + dispatch::cuCtxPopCurrent_v2(&tmp); + assert(tmp==(CUcontext)ctx_ && "Switching back to invalid context!"); +} + + + +} +} diff --git a/lib/driver/device.cpp b/lib/driver/device.cpp new file mode 100755 index 000000000..13f10f6a0 --- /dev/null +++ b/lib/driver/device.cpp @@ -0,0 +1,197 @@ +/* Copyright 2015-2017 Philippe Tillet +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, +* subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/
+
+#include <map>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "driver/device.h"
+
+namespace tdl
+{
+
+namespace driver
+{
+
+/* Architecture [NVidia] */
+Device::Architecture Device::nv_arch(std::pair<unsigned int, unsigned int> sm) const{
+  switch(sm.first)
+  {
+    case 7:
+      switch(sm.second)
+      {
+        case 0: return Architecture::SM_7_0;
+        default: return Architecture::UNKNOWN;
+      }
+
+    case 6:
+      switch(sm.second)
+      {
+        case 0: return Architecture::SM_6_0;
+        case 1: return Architecture::SM_6_1;
+        default: return Architecture::UNKNOWN;
+      }
+
+    case 5:
+      switch(sm.second)
+      {
+        case 0: return Architecture::SM_5_0;
+        case 2: return Architecture::SM_5_2;
+        default: return Architecture::UNKNOWN;
+      }
+
+    case 3:
+      switch(sm.second)
+      {
+        case 0: return Architecture::SM_3_0;
+        case 5: return Architecture::SM_3_5;
+        case 7: return Architecture::SM_3_7;
+        default: return Architecture::UNKNOWN;
+      }
+
+    case 2:
+      switch(sm.second)
+      {
+        case 0: return Architecture::SM_2_0;
+        case 1: return Architecture::SM_2_1;
+        default: return Architecture::UNKNOWN;
+      }
+
+    default: return Architecture::UNKNOWN;
+  }
+}
+
+template<CUdevice_attribute attr>
+int Device::cuGetInfo() const{
+  int res;
+  dispatch::cuDeviceGetAttribute(&res, attr, *cu_);
+  return res;
+}
+
+nvmlDevice_t Device::nvml_device() const{
+  //cache one NVML handle per PCI bus id
+  static std::map<std::string, nvmlDevice_t> map;
+  std::string key = pci_bus_id();
+  if(map.find(key)==map.end()){
+    nvmlDevice_t device;
+    dispatch::nvmlDeviceGetHandleByPciBusId_v2(key.c_str(), &device);
+    return map.insert(std::make_pair(key, device)).first->second;
+  }
+  return map.at(key);
+}
+
+/* Architecture */
+Device::Architecture Device::architecture() const
+{ return nv_arch(compute_capability()); }
+
+/* Attributes */
+size_t Device::address_bits() const
+{ return sizeof(size_t)*8; }
+
+driver::Platform Device::platform() const
+{ return Platform(); }
+
+std::string Device::name() const{
+  char tmp[128];
+  dispatch::cuDeviceGetName(tmp, 128, *cu_);
+  return std::string(tmp);
+}
+
+std::string Device::pci_bus_id() const{
+  char tmp[128];
+  dispatch::cuDeviceGetPCIBusId(tmp, 128, *cu_);
+  return std::string(tmp);
+}
+
+void Device::interpret_as(std::pair<size_t, size_t> cc){
+  interpreted_as_ = std::make_shared<std::pair<size_t, size_t>>(cc);
+}
+
+std::pair<size_t, size_t> Device::compute_capability() const{
+  if(interpreted_as_)
+    return *interpreted_as_;
+  size_t _major = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>();
+  size_t _minor = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>();
+  return std::make_pair(_major, _minor);
+}
+
+size_t Device::max_threads_per_block() const
+{ return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK>(); }
+
+size_t Device::max_shared_memory() const
+{ return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK>(); }
+
+size_t Device::warp_size() const
+{ return cuGetInfo<CU_DEVICE_ATTRIBUTE_WARP_SIZE>(); }
+
+
+std::vector<size_t> Device::max_block_dim() const{
+  std::vector<size_t> result(3);
+  result[0] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X>();
+  result[1] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y>();
+  result[2] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z>();
+  return result;
+}
+
+size_t Device::current_sm_clock() const{
+  unsigned int result;
+  dispatch::nvmlDeviceGetClockInfo(nvml_device(), NVML_CLOCK_SM, &result);
+  return result;
+}
+
+size_t Device::max_sm_clock() const{
+  unsigned int result;
+  dispatch::nvmlDeviceGetMaxClockInfo(nvml_device(), NVML_CLOCK_SM, &result);
+  return result;
+}
+
+
+size_t Device::current_mem_clock() const{
+  unsigned int result;
+  dispatch::nvmlDeviceGetClockInfo(nvml_device(), NVML_CLOCK_MEM, &result);
+  return result;
+}
+
+size_t Device::max_mem_clock() const{
+  unsigned int result;
+  dispatch::nvmlDeviceGetMaxClockInfo(nvml_device(), NVML_CLOCK_MEM, &result);
+  return result;
+}
+
+/* Infos */
+std::string Device::infos() const{
+  std::ostringstream oss;
+  std::vector<size_t> max_wi_sizes = max_block_dim();
+  oss << "Platform: " << platform().name() << std::endl;
+  oss << "Name: " << name() << std::endl;
+  oss << "Maximum total 
work-group size: " << max_threads_per_block() << std::endl;
+  oss << "Maximum individual work-group sizes: " << max_wi_sizes[0] << ", " << max_wi_sizes[1] << ", " << max_wi_sizes[2] << std::endl;
+  oss << "Local memory size: " << max_shared_memory() << std::endl;
+  return oss.str();
+}
+
+Handle<CUdevice> const & Device::cu() const
+{ return cu_; }
+
+}
+
+}
+
diff --git a/lib/driver/dispatch.cpp b/lib/driver/dispatch.cpp
new file mode 100755
index 000000000..4551bf072
--- /dev/null
+++ b/lib/driver/dispatch.cpp
@@ -0,0 +1,363 @@
+/* Copyright 2015-2017 Philippe Tillet
+*
+* Permission is hereby granted, free of charge, to any person obtaining
+* a copy of this software and associated documentation files
+* (the "Software"), to deal in the Software without restriction,
+* including without limitation the rights to use, copy, modify, merge,
+* publish, distribute, sublicense, and/or sell copies of the Software,
+* and to permit persons to whom the Software is furnished to do so,
+* subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include <dlfcn.h>
+#include "driver/dispatch.h"
+#include "driver/context.h"
+
+namespace tdl
+{
+namespace driver
+{
+
+//Helpers for function definition
+#define DEFINE0(init, hlib, ret, fname) ret dispatch::fname()\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname); }
+
+#define DEFINE1(init, hlib, ret, fname, t1) ret dispatch::fname(t1 a)\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a); }
+
+#define DEFINE2(init, hlib, ret, fname, t1, t2) ret dispatch::fname(t1 a, t2 b)\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b); }
+
+#define DEFINE3(init, hlib, ret, fname, t1, t2, t3) ret dispatch::fname(t1 a, t2 b, t3 c)\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c); }
+
+#define DEFINE4(init, hlib, ret, fname, t1, t2, t3, t4) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d)\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d); }
+
+#define DEFINE5(init, hlib, ret, fname, t1, t2, t3, t4, t5) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e)\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e); }
+
+#define DEFINE6(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f)\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f); }
+
+#define DEFINE7(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g)\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g); }
+
+#define DEFINE8(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h)\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h); }
+
+#define DEFINE9(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i)\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i); }
+
+#define 
DEFINE10(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j)\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j); }
+
+#define DEFINE11(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k)\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k); }
+
+#define DEFINE13(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m)\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m); }
+
+#define DEFINE19(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m, t14 n, t15 o, t16 p, t17 q, t18 r, t19 s)\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s); }
+
+//Specialized helpers for CUDA
+#define CUDA_DEFINE1(ret, fname, t1) DEFINE1(cuinit, cuda_, ret, fname, t1)
+#define CUDA_DEFINE2(ret, fname, t1, t2) DEFINE2(cuinit, cuda_, ret, fname, t1, t2)
+#define CUDA_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(cuinit, cuda_, ret, fname, t1, t2, t3)
+#define CUDA_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(cuinit, cuda_, ret, fname, t1, t2, t3, t4)
+#define CUDA_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5)
+#define CUDA_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6)
+#define CUDA_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
+#define CUDA_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
+#define CUDA_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
+#define CUDA_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
+#define CUDA_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
+
+#define NVRTC_DEFINE1(ret, fname, t1) DEFINE1(nvrtcinit, nvrtc_, ret, fname, t1)
+#define NVRTC_DEFINE2(ret, fname, t1, t2) DEFINE2(nvrtcinit, nvrtc_, ret, fname, t1, t2)
+#define NVRTC_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3)
+#define NVRTC_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4)
+#define NVRTC_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5)
+#define NVRTC_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6)
+#define NVRTC_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
+#define NVRTC_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
+#define NVRTC_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
+#define 
NVRTC_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
+#define NVRTC_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
+
+#define NVML_DEFINE0(ret, fname) DEFINE0(nvmlinit, nvml_, ret, fname)
+#define NVML_DEFINE1(ret, fname, t1) DEFINE1(nvmlinit, nvml_, ret, fname, t1)
+#define NVML_DEFINE2(ret, fname, t1, t2) DEFINE2(nvmlinit, nvml_, ret, fname, t1, t2)
+#define NVML_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(nvmlinit, nvml_, ret, fname, t1, t2, t3)
+
+#define CUBLAS_DEFINE1(ret, fname, t1) DEFINE1(cublasinit, cublas_, ret, fname, t1)
+#define CUBLAS_DEFINE13(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) DEFINE13(cublasinit, cublas_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13)
+#define CUBLAS_DEFINE19(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19) DEFINE19(cublasinit, cublas_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19)
+
+#define CUDNN_DEFINE1(ret, fname, t1) DEFINE1(cudnninit, cudnn_, ret, fname, t1)
+#define CUDNN_DEFINE2(ret, fname, t1, t2) DEFINE2(cudnninit, cudnn_, ret, fname, t1, t2)
+#define CUDNN_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(cudnninit, cudnn_, ret, fname, t1, t2, t3)
+#define CUDNN_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(cudnninit, cudnn_, ret, fname, t1, t2, t3, t4, t5)
+#define CUDNN_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(cudnninit, cudnn_, ret, fname, t1, t2, t3, t4, t5, t6)
+#define CUDNN_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(cudnninit, cudnn_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
+#define CUDNN_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(cudnninit, cudnn_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
+#define CUDNN_DEFINE13(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) DEFINE13(cudnninit, cudnn_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13)
+
+
+bool dispatch::cuinit(){
+  if(cuda_==nullptr)
+    cuda_ = dlopen("libcuda.so", RTLD_LAZY);
+  //bail out early if libcuda.so could not be loaded
+  if(cuda_==nullptr)
+    return false;
+  CUresult (*fptr)(unsigned int);
+  cuInit_ = dlsym(cuda_, "cuInit");
+  *reinterpret_cast<void **>(&fptr) = cuInit_;
+  CUresult res = (*fptr)(0);
+  check(res);
+  return true;
+}
+
+bool dispatch::nvrtcinit(){
+  if(nvrtc_==nullptr)
+    nvrtc_ = dlopen("libnvrtc.so", RTLD_LAZY);
+  return nvrtc_ != nullptr;
+}
+
+bool dispatch::nvmlinit(){
+  if(nvml_==nullptr)
+    nvml_ = dlopen("libnvidia-ml.so", RTLD_LAZY);
+  if(nvml_==nullptr)
+    return false;
+  nvmlReturn_t (*fptr)();
+  nvmlInit_v2_ = dlsym(nvml_, "nvmlInit_v2");
+  *reinterpret_cast<void **>(&fptr) = nvmlInit_v2_;
+  nvmlReturn_t res = (*fptr)();
+  check(res);
+  return res == NVML_SUCCESS;
+}
+
+bool dispatch::cublasinit(){
+  if(cublas_==nullptr)
+    cublas_ = dlopen("libcublas.so", RTLD_LAZY);
+  return cublas_ != nullptr;
+}
+
+bool dispatch::cudnninit(){
+  if(cudnn_==nullptr)
+    cudnn_ = dlopen("libcudnn.so", RTLD_LAZY);
+  return cudnn_ != nullptr;
+}
+
+//CUDA
+CUDA_DEFINE1(CUresult, cuCtxDestroy_v2, CUcontext)
+CUDA_DEFINE2(CUresult, cuEventCreate, CUevent *, unsigned int)
+CUDA_DEFINE2(CUresult, cuDeviceGet, CUdevice *, int)
+CUDA_DEFINE3(CUresult, cuMemcpyDtoH_v2, void *, CUdeviceptr, size_t)
+CUDA_DEFINE2(CUresult, cuStreamCreate, CUstream *, unsigned int)
+CUDA_DEFINE3(CUresult, cuEventElapsedTime, float *, CUevent, CUevent)
+CUDA_DEFINE1(CUresult, cuMemFree_v2, CUdeviceptr)
+CUDA_DEFINE4(CUresult, 
cuMemcpyDtoHAsync_v2, void *, CUdeviceptr, size_t, CUstream) +CUDA_DEFINE1(CUresult, cuDriverGetVersion, int *) +CUDA_DEFINE3(CUresult, cuDeviceGetName, char *, int, CUdevice) +CUDA_DEFINE3(CUresult, cuDeviceGetPCIBusId, char *, int, CUdevice) +CUDA_DEFINE4(CUresult, cuModuleGetGlobal_v2, CUdeviceptr*, size_t*, CUmodule, const char*) + +CUDA_DEFINE4(CUresult, cuMemcpyHtoDAsync_v2, CUdeviceptr, const void *, size_t, CUstream) +CUDA_DEFINE2(CUresult, cuModuleLoad, CUmodule *, const char *) +CUDA_DEFINE11(CUresult, cuLaunchKernel, CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **) +CUDA_DEFINE1(CUresult, cuModuleUnload, CUmodule) +CUDA_DEFINE5(CUresult, cuModuleLoadDataEx, CUmodule *, const void *, unsigned int, CUjit_option *, void **) +CUDA_DEFINE3(CUresult, cuDeviceGetAttribute, int *, CUdevice_attribute, CUdevice) +CUDA_DEFINE1(CUresult, cuDeviceGetCount, int *) +CUDA_DEFINE3(CUresult, cuMemcpyHtoD_v2, CUdeviceptr, const void *, size_t ) +CUDA_DEFINE1(CUresult, cuInit, unsigned int) +CUDA_DEFINE2(CUresult, cuEventRecord, CUevent, CUstream) +CUDA_DEFINE3(CUresult, cuCtxCreate_v2, CUcontext *, unsigned int, CUdevice) +CUDA_DEFINE3(CUresult, cuModuleGetFunction, CUfunction *, CUmodule, const char *) +CUDA_DEFINE1(CUresult, cuStreamSynchronize, CUstream) +CUDA_DEFINE1(CUresult, cuStreamDestroy_v2, CUstream) +CUDA_DEFINE1(CUresult, cuEventDestroy_v2, CUevent) +CUDA_DEFINE2(CUresult, cuMemAlloc_v2, CUdeviceptr*, size_t) +CUDA_DEFINE3(CUresult, cuPointerGetAttribute, void*, CUpointer_attribute, CUdeviceptr) +CUDA_DEFINE1(CUresult, cuCtxGetDevice, CUdevice*) +CUDA_DEFINE1(CUresult, cuCtxGetCurrent, CUcontext*) +CUDA_DEFINE1(CUresult, cuCtxSetCurrent, CUcontext) +CUDA_DEFINE4(CUresult, cuMemsetD8Async, CUdeviceptr, unsigned char, size_t, CUstream) +CUDA_DEFINE1(CUresult, cuCtxPushCurrent_v2, CUcontext) +CUDA_DEFINE1(CUresult, cuCtxPopCurrent_v2, CUcontext*) + +NVRTC_DEFINE3(nvrtcResult, nvrtcCompileProgram, nvrtcProgram, int, const char **) +NVRTC_DEFINE2(nvrtcResult, nvrtcGetProgramLogSize, nvrtcProgram, size_t *) +NVRTC_DEFINE2(nvrtcResult, nvrtcGetPTX, nvrtcProgram, char *) +NVRTC_DEFINE2(nvrtcResult, nvrtcGetPTXSize, nvrtcProgram, size_t *) +NVRTC_DEFINE6(nvrtcResult, nvrtcCreateProgram, nvrtcProgram *, const char *, const char *, int, const char **, const char **) +NVRTC_DEFINE2(nvrtcResult, nvrtcGetProgramLog, nvrtcProgram, char *) + +NVML_DEFINE2(nvmlReturn_t, nvmlDeviceGetHandleByPciBusId_v2, const char *, nvmlDevice_t*) +NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*) +NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*) + +cublasHandle_t dispatch::cublasHandle(Context const & ctx){ + static std::map handles; + auto pr = handles.insert({ctx, cublasHandle_t()}); + if(pr.second) + cublasCreate_v2(&pr.first->second); + return pr.first->second; +} + +cudnnHandle_t dispatch::cudnnHandle(Context const & ctx){ + static std::map handles; + auto pr = handles.insert({ctx, cudnnHandle_t()}); + if(pr.second) + cudnnCreate(&pr.first->second); + return pr.first->second; +} + +CUBLAS_DEFINE1(cublasStatus_t, cublasCreate_v2, cublasHandle_t*) +cublasStatus_t dispatch::cublasGetStream_v2(cublasHandle_t h, cudaStream_t *a) +{ return f_impl(cublas_, cublasGetStream_v2, cublasGetStream_v2_, "cublasGetStream_v2", h, a); } +cublasStatus_t dispatch::cublasSetStream_v2(cublasHandle_t h, cudaStream_t a) +{ return f_impl(cublas_, 
cublasSetStream_v2, cublasSetStream_v2_, "cublasSetStream_v2", h, a); } +cublasStatus_t dispatch::cublasSgemm_v2(cublasHandle_t h, cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc) +{ return f_impl(cublas_, cublasSgemm_v2, cublasSgemm_v2_, "cublasSgemm_v2", h, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);} +cublasStatus_t dispatch::cublasDgemm_v2(cublasHandle_t h, cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc) +{ return f_impl(cublas_, cublasDgemm_v2, cublasDgemm_v2_, "cublasDgemm_v2", h, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);} +cublasStatus_t dispatch::cublasHgemm(cublasHandle_t h, cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, half* alpha, const half *A, int lda, const half *B, int ldb, half* beta, half *C, int ldc) +{ return f_impl(cublas_, cublasHgemm, cublasHgemm_, "cublasHgemm", h, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);} +CUBLAS_DEFINE19(cublasStatus_t, cublasGemmEx, cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const void*, const void*, cudaDataType, int, const void*, cudaDataType, int, const void*, void*, cudaDataType, int, cudaDataType, cublasGemmAlgo_t) + +//cuDNN +CUDNN_DEFINE1(cudnnStatus_t, cudnnCreateConvolutionDescriptor, cudnnConvolutionDescriptor_t*) +CUDNN_DEFINE1(cudnnStatus_t, cudnnCreateTensorDescriptor, cudnnTensorDescriptor_t*) +CUDNN_DEFINE1(cudnnStatus_t, cudnnCreateFilterDescriptor, cudnnFilterDescriptor_t*) +CUDNN_DEFINE1(cudnnStatus_t, cudnnCreate, cudnnHandle_t*) +CUDNN_DEFINE7(cudnnStatus_t, cudnnSetTensor4dDescriptor, cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, int, int, int) +CUDNN_DEFINE7(cudnnStatus_t, cudnnSetFilter4dDescriptor, cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, int, int, int) +CUDNN_DEFINE5(cudnnStatus_t, cudnnSetTensorNdDescriptorEx, cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, const int*) +CUDNN_DEFINE5(cudnnStatus_t, cudnnSetFilterNdDescriptor, cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, const int*) +CUDNN_DEFINE1(cudnnStatus_t, cudnnCreatePoolingDescriptor, cudnnPoolingDescriptor_t*) +CUDNN_DEFINE7(cudnnStatus_t, cudnnSetPoolingNdDescriptor, cudnnPoolingDescriptor_t, const cudnnPoolingMode_t, const cudnnNanPropagation_t, int, const int*, const int*, const int*) +CUDNN_DEFINE8(cudnnStatus_t, cudnnPoolingForward, cudnnHandle_t, const cudnnPoolingDescriptor_t, const void*, const cudnnTensorDescriptor_t, const void*, const void*, const cudnnTensorDescriptor_t, void*) + + +CUDNN_DEFINE8(cudnnStatus_t, cudnnSetConvolution2dDescriptor, cudnnConvolutionDescriptor_t, int, int, int, int, int, int, cudnnConvolutionMode_t) +CUDNN_DEFINE7(cudnnStatus_t, cudnnSetConvolutionNdDescriptor, cudnnConvolutionDescriptor_t, int, const int*, const int*, const int*, cudnnConvolutionMode_t, cudnnDataType_t) +CUDNN_DEFINE8(cudnnStatus_t, cudnnGetConvolutionForwardAlgorithm, cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t, cudnnConvolutionFwdAlgo_t *) +CUDNN_DEFINE7(cudnnStatus_t, cudnnGetConvolutionForwardWorkspaceSize, cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, 
const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t*) +CUDNN_DEFINE13(cudnnStatus_t, cudnnConvolutionForward, cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *) +CUDNN_DEFINE2(cudnnStatus_t, cudnnSetStream, cudnnHandle_t, cudaStream_t) +CUDNN_DEFINE7(cudnnStatus_t, cudnnTransformTensor, cudnnHandle_t, const void*, const cudnnTensorDescriptor_t, const void*, const void*, const cudnnTensorDescriptor_t, void*) + + +void dispatch::release(){ + if(cuda_){ + dlclose(cuda_); + cuda_ = nullptr; + } + if(nvrtc_){ + dlclose(nvrtc_); + nvrtc_ = nullptr; + } + if(cublas_){ + dlclose(cublas_); + cublas_ = nullptr; + } + if(cudnn_){ + dlclose(cudnn_); + cudnn_ = nullptr; + } +} + +void* dispatch::cuda_; +void* dispatch::nvrtc_; +void* dispatch::nvml_; +void* dispatch::cublas_; +void* dispatch::cudnn_; + +//CUDA +void* dispatch::cuCtxGetCurrent_; +void* dispatch::cuCtxSetCurrent_; +void* dispatch::cuCtxDestroy_v2_; +void* dispatch::cuEventCreate_; +void* dispatch::cuDeviceGet_; +void* dispatch::cuMemcpyDtoH_v2_; +void* dispatch::cuStreamCreate_; +void* dispatch::cuEventElapsedTime_; +void* dispatch::cuMemFree_v2_; +void* dispatch::cuMemcpyDtoHAsync_v2_; +void* dispatch::cuDriverGetVersion_; +void* dispatch::cuDeviceGetName_; +void* dispatch::cuDeviceGetPCIBusId_; +void* dispatch::cuModuleGetGlobal_v2_; + +void* dispatch::cuMemcpyHtoDAsync_v2_; +void* dispatch::cuModuleLoad_; +void* dispatch::cuLaunchKernel_; +void* dispatch::cuModuleUnload_; +void* dispatch::cuModuleLoadDataEx_; +void* dispatch::cuDeviceGetAttribute_; +void* dispatch::cuDeviceGetCount_; +void* dispatch::cuMemcpyHtoD_v2_; +void* dispatch::cuInit_; +void* dispatch::cuEventRecord_; +void* dispatch::cuCtxCreate_v2_; +void* dispatch::cuModuleGetFunction_; +void* dispatch::cuStreamSynchronize_; +void* dispatch::cuStreamDestroy_v2_; +void* dispatch::cuEventDestroy_v2_; +void* dispatch::cuMemAlloc_v2_; +void* dispatch::cuPointerGetAttribute_; +void* dispatch::cuCtxGetDevice_; +void* dispatch::cuMemsetD8Async_; +void* dispatch::cuCtxPushCurrent_v2_; +void* dispatch::cuCtxPopCurrent_v2_; + +void* dispatch::nvrtcCompileProgram_; +void* dispatch::nvrtcGetProgramLogSize_; +void* dispatch::nvrtcGetPTX_; +void* dispatch::nvrtcGetPTXSize_; +void* dispatch::nvrtcCreateProgram_; +void* dispatch::nvrtcGetProgramLog_; + +void* dispatch::nvmlInit_v2_; +void* dispatch::nvmlDeviceGetHandleByPciBusId_v2_; +void* dispatch::nvmlDeviceGetClockInfo_; +void* dispatch::nvmlDeviceGetMaxClockInfo_; + +void* dispatch::cublasCreate_v2_; +void* dispatch::cublasGetStream_v2_; +void* dispatch::cublasSetStream_v2_; +void* dispatch::cublasHgemm_; +void* dispatch::cublasSgemm_v2_; +void* dispatch::cublasDgemm_v2_; +void* dispatch::cublasGemmEx_; + +void* dispatch::cudnnCreateConvolutionDescriptor_; +void* dispatch::cudnnCreatePoolingDescriptor_; +void* dispatch::cudnnCreateTensorDescriptor_; +void* dispatch::cudnnCreateFilterDescriptor_; +void* dispatch::cudnnCreate_; +void* dispatch::cudnnSetTensor4dDescriptor_; +void* dispatch::cudnnSetFilter4dDescriptor_; +void* dispatch::cudnnSetTensorNdDescriptorEx_; +void* dispatch::cudnnSetFilterNdDescriptor_; +void* dispatch::cudnnSetPoolingNdDescriptor_; +void* dispatch::cudnnSetConvolution2dDescriptor_; +void* dispatch::cudnnSetConvolutionNdDescriptor_; +void* dispatch::cudnnGetConvolutionForwardAlgorithm_; +void* 
dispatch::cudnnGetConvolutionForwardWorkspaceSize_; +void* dispatch::cudnnConvolutionForward_; +void* dispatch::cudnnPoolingForward_; +void* dispatch::cudnnSetStream_; +void* dispatch::cudnnTransformTensor_; + +} +} diff --git a/lib/driver/error.cpp b/lib/driver/error.cpp new file mode 100755 index 000000000..42d3a780d --- /dev/null +++ b/lib/driver/error.cpp @@ -0,0 +1,155 @@ +/* Copyright 2015-2017 Philippe Tillet +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, +* subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +#include "driver/error.h" + +namespace tdl +{ +namespace driver +{ + +void check(CUresult err) +{ + using namespace exception::cuda; + switch(err) + { + case CUDA_SUCCESS : break; + case CUDA_ERROR_INVALID_VALUE : throw invalid_value(); + case CUDA_ERROR_OUT_OF_MEMORY : throw out_of_memory(); + case CUDA_ERROR_NOT_INITIALIZED : throw not_initialized(); + case CUDA_ERROR_DEINITIALIZED : throw deinitialized(); + case CUDA_ERROR_PROFILER_DISABLED : throw profiler_disabled(); + case CUDA_ERROR_PROFILER_NOT_INITIALIZED : throw profiler_not_initialized(); + case CUDA_ERROR_PROFILER_ALREADY_STARTED : throw profiler_already_started(); + case CUDA_ERROR_PROFILER_ALREADY_STOPPED : throw profiler_already_stopped(); + case CUDA_ERROR_NO_DEVICE : throw no_device(); + case CUDA_ERROR_INVALID_DEVICE : throw invalid_device(); + case CUDA_ERROR_INVALID_IMAGE : throw invalid_image(); + case CUDA_ERROR_INVALID_CONTEXT : throw invalid_context(); + case CUDA_ERROR_CONTEXT_ALREADY_CURRENT : throw context_already_current(); + case CUDA_ERROR_MAP_FAILED : throw map_failed(); + case CUDA_ERROR_UNMAP_FAILED : throw unmap_failed(); + case CUDA_ERROR_ARRAY_IS_MAPPED : throw array_is_mapped(); + case CUDA_ERROR_ALREADY_MAPPED : throw already_mapped(); + case CUDA_ERROR_NO_BINARY_FOR_GPU : throw no_binary_for_gpu(); + case CUDA_ERROR_ALREADY_ACQUIRED : throw already_acquired(); + case CUDA_ERROR_NOT_MAPPED : throw not_mapped(); + case CUDA_ERROR_NOT_MAPPED_AS_ARRAY : throw not_mapped_as_array(); + case CUDA_ERROR_NOT_MAPPED_AS_POINTER : throw not_mapped_as_pointer(); + case CUDA_ERROR_ECC_UNCORRECTABLE : throw ecc_uncorrectable(); + case CUDA_ERROR_UNSUPPORTED_LIMIT : throw unsupported_limit(); + case CUDA_ERROR_CONTEXT_ALREADY_IN_USE : throw context_already_in_use(); + case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED : throw peer_access_unsupported(); + case CUDA_ERROR_INVALID_PTX : throw invalid_ptx(); + case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT : throw invalid_graphics_context(); + case CUDA_ERROR_INVALID_SOURCE : throw invalid_source(); + case 
CUDA_ERROR_FILE_NOT_FOUND : throw file_not_found(); + case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND : throw shared_object_symbol_not_found(); + case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED : throw shared_object_init_failed(); + case CUDA_ERROR_OPERATING_SYSTEM : throw operating_system(); + case CUDA_ERROR_INVALID_HANDLE : throw invalid_handle(); + case CUDA_ERROR_NOT_FOUND : throw not_found(); + case CUDA_ERROR_NOT_READY : throw not_ready(); + case CUDA_ERROR_ILLEGAL_ADDRESS : throw illegal_address(); + case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES : throw launch_out_of_resources(); + case CUDA_ERROR_LAUNCH_TIMEOUT : throw launch_timeout(); + case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING : throw launch_incompatible_texturing(); + case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED : throw peer_access_already_enabled(); + case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED : throw peer_access_not_enabled(); + case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE : throw primary_context_active(); + case CUDA_ERROR_CONTEXT_IS_DESTROYED : throw context_is_destroyed(); + case CUDA_ERROR_ASSERT : throw assert_error(); + case CUDA_ERROR_TOO_MANY_PEERS : throw too_many_peers(); + case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED : throw host_memory_already_registered(); + case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED : throw host_memory_not_registered(); + case CUDA_ERROR_HARDWARE_STACK_ERROR : throw hardware_stack_error(); + case CUDA_ERROR_ILLEGAL_INSTRUCTION : throw illegal_instruction(); + case CUDA_ERROR_MISALIGNED_ADDRESS : throw misaligned_address(); + case CUDA_ERROR_INVALID_ADDRESS_SPACE : throw invalid_address_space(); + case CUDA_ERROR_INVALID_PC : throw invalid_pc(); + case CUDA_ERROR_LAUNCH_FAILED : throw launch_failed(); + case CUDA_ERROR_NOT_PERMITTED : throw not_permitted(); + case CUDA_ERROR_NOT_SUPPORTED : throw not_supported(); + case CUDA_ERROR_UNKNOWN : throw unknown(); + default : throw unknown(); + } +} + +void check(nvrtcResult err){ + using namespace exception::nvrtc; + + switch(err) + { + case NVRTC_SUCCESS: break; + case NVRTC_ERROR_OUT_OF_MEMORY: throw out_of_memory(); + case NVRTC_ERROR_PROGRAM_CREATION_FAILURE: throw program_creation_failure(); + case NVRTC_ERROR_INVALID_INPUT: throw invalid_input(); + case NVRTC_ERROR_INVALID_PROGRAM: throw invalid_program(); + case NVRTC_ERROR_INVALID_OPTION: throw invalid_option(); + case NVRTC_ERROR_COMPILATION: throw compilation(); + case NVRTC_ERROR_BUILTIN_OPERATION_FAILURE: throw builtin_operation_failure(); + default: throw unknown_error(); + } +} + +void check(cublasStatus_t err){ + using namespace exception::cublas; + switch(err) + { + case CUBLAS_STATUS_SUCCESS : break; + case CUBLAS_STATUS_NOT_INITIALIZED : throw not_initialized(); + case CUBLAS_STATUS_ALLOC_FAILED : throw alloc_failed(); + case CUBLAS_STATUS_INVALID_VALUE : throw invalid_value(); + case CUBLAS_STATUS_ARCH_MISMATCH : throw arch_mismatch(); + case CUBLAS_STATUS_MAPPING_ERROR : throw mapping_error(); + case CUBLAS_STATUS_EXECUTION_FAILED: throw execution_failed(); + case CUBLAS_STATUS_INTERNAL_ERROR : throw internal_error(); + case CUBLAS_STATUS_NOT_SUPPORTED : throw not_supported(); + case CUBLAS_STATUS_LICENSE_ERROR : throw license_error(); + default : throw unknown(); + } +} + +void check(cudnnStatus_t err){ + using namespace exception::cudnn; + switch(err) + { + case CUDNN_STATUS_SUCCESS: break; + case CUDNN_STATUS_NOT_INITIALIZED: throw not_initialized(); + case CUDNN_STATUS_ALLOC_FAILED: throw alloc_failed(); + case CUDNN_STATUS_BAD_PARAM: throw bad_param(); + case CUDNN_STATUS_INTERNAL_ERROR: throw 
internal_error(); + case CUDNN_STATUS_INVALID_VALUE: throw invalid_value(); + case CUDNN_STATUS_ARCH_MISMATCH: throw arch_mismatch(); + case CUDNN_STATUS_MAPPING_ERROR: throw mapping_error(); + case CUDNN_STATUS_EXECUTION_FAILED: throw execution_failed(); + case CUDNN_STATUS_NOT_SUPPORTED: throw not_supported(); + case CUDNN_STATUS_LICENSE_ERROR: throw license_error(); + case CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING: throw runtime_prerequisite_missing(); + case CUDNN_STATUS_RUNTIME_IN_PROGRESS: throw runtime_in_progress(); + case CUDNN_STATUS_RUNTIME_FP_OVERFLOW: throw runtime_fp_overflow(); + } +} + +} +} + diff --git a/lib/driver/event.cpp b/lib/driver/event.cpp new file mode 100755 index 000000000..dc554d808 --- /dev/null +++ b/lib/driver/event.cpp @@ -0,0 +1,40 @@ +/* Copyright 2015-2017 Philippe Tillet +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, +* subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +#include "driver/event.h" + +namespace tdl +{ +namespace driver +{ + +float Event::elapsed_time() const{ + float time; + dispatch::cuEventElapsedTime(&time, cu_->first, cu_->second); + return time; +} + +Handle const & Event::cu() const +{ return cu_; } + +} +} diff --git a/lib/driver/handle.cpp b/lib/driver/handle.cpp new file mode 100755 index 000000000..a01a099bd --- /dev/null +++ b/lib/driver/handle.cpp @@ -0,0 +1,66 @@ +/* Copyright 2015-2017 Philippe Tillet +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, +* subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/
+
+#include
+#include
+#include "driver/handle.h"
+
+namespace tdl
+{
+
+namespace driver
+{
+
+//CUDA
+inline void _delete(CUcontext x) { dispatch::cuCtxDestroy(x); }
+inline void _delete(CUdeviceptr x) { dispatch::cuMemFree(x); }
+inline void _delete(CUstream x) { dispatch::cuStreamDestroy(x); }
+inline void _delete(CUdevice) { }
+inline void _delete(CUevent x) { dispatch::cuEventDestroy(x); }
+inline void _delete(CUfunction) { }
+inline void _delete(CUmodule x) { dispatch::cuModuleUnload(x); }
+inline void _delete(cu_event_t x) { _delete(x.first); _delete(x.second); }
+inline void _delete(cu_platform){}
+
+//Constructor
+template<class CUType>
+Handle<CUType>::Handle(CUType cu, bool take_ownership): h_(new CUType(cu)), has_ownership_(take_ownership)
+{ }
+
+
+template<class CUType>
+Handle<CUType>::~Handle(){
+  if(has_ownership_ && h_ && h_.unique() && *h_)
+    _delete(*h_);
+}
+
+template class Handle<CUcontext>;
+template class Handle<CUdeviceptr>;
+template class Handle<CUstream>;
+template class Handle<CUdevice>;
+template class Handle<cu_event_t>;
+template class Handle<CUfunction>;
+template class Handle<CUmodule>;
+template class Handle<cu_platform>;
+
+}
+}
diff --git a/lib/driver/kernel.cpp b/lib/driver/kernel.cpp
new file mode 100755
index 000000000..6e536b767
--- /dev/null
+++ b/lib/driver/kernel.cpp
@@ -0,0 +1,67 @@
+/* Copyright 2015-2017 Philippe Tillet
+*
+* Permission is hereby granted, free of charge, to any person obtaining
+* a copy of this software and associated documentation files
+* (the "Software"), to deal in the Software without restriction,
+* including without limitation the rights to use, copy, modify, merge,
+* publish, distribute, sublicense, and/or sell copies of the Software,
+* and to permit persons to whom the Software is furnished to do so,
+* subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/ + +#include +#include + +#include "driver/kernel.h" +#include "driver/buffer.h" + +namespace tdl +{ + +namespace driver +{ + +Kernel::Kernel(Module const & program, const char * name) : program_(program), address_bits_(program.context().device().address_bits()){ + cu_params_store_.reserve(64); + cu_params_.reserve(64); + dispatch::cuModuleGetFunction(&*cu_, program, name); +} + +void Kernel::setArg(unsigned int index, std::size_t size, void* ptr){ + if(index + 1> cu_params_store_.size()){ + cu_params_store_.resize(index+1); + cu_params_.resize(index+1); + } + cu_params_store_[index].reset(malloc(size), free); + memcpy(cu_params_store_[index].get(), ptr, size); + cu_params_[index] = cu_params_store_[index].get(); +} + +void Kernel::setArg(unsigned int index, Buffer const & data) +{ return setArg(index, (CUdeviceptr)data);} + +void* const* Kernel::cu_params() const +{ return cu_params_.data(); } + +Handle const & Kernel::cu() const +{ return cu_; } + +Module const & Kernel::module() const +{ return program_; } + + +} + +} + diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp new file mode 100755 index 000000000..06a6ff437 --- /dev/null +++ b/lib/driver/module.cpp @@ -0,0 +1,118 @@ +/* Copyright 2015-2017 Philippe Tillet +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, +* subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +#include +#include + +#include "driver/module.h" +#include "driver/context.h" +#include "driver/error.h" + +#include "tools/sys/getenv.hpp" + +namespace tdl +{ +namespace driver +{ + +CUjit_target_enum cutarget(Device::Architecture arch){ + switch(arch){ + case Device::Architecture::SM_2_0: return CU_TARGET_COMPUTE_20; + case Device::Architecture::SM_2_1: return CU_TARGET_COMPUTE_21; + case Device::Architecture::SM_3_0: return CU_TARGET_COMPUTE_30; + case Device::Architecture::SM_3_5: return CU_TARGET_COMPUTE_35; + case Device::Architecture::SM_3_7: return CU_TARGET_COMPUTE_37; + case Device::Architecture::SM_5_0: return CU_TARGET_COMPUTE_50; + case Device::Architecture::SM_5_2: return CU_TARGET_COMPUTE_52; + case Device::Architecture::SM_6_0: return CU_TARGET_COMPUTE_60; + case Device::Architecture::SM_6_1: return CU_TARGET_COMPUTE_61; + default: throw; + } +} + +inline std::pair ptx(std::pair sm){ + if(sm.first == 7) return {6, 0}; + if(sm.first == 6) return {5, 0}; + if(sm.first == 5) return {4, 3}; + throw; +} + +std::string Module::header(Device const & device){ + auto cc = device.compute_capability(); + auto vptx = ptx(cc); + std::string header; + header += ".version " + std::to_string(vptx.first) + "." + std::to_string(vptx.second) + "\n"; + header += ".target sm_" + std::to_string(cc.first) + std::to_string(cc.second) + "\n"; + header += ".address_size 64\n"; + return header; +} + +Module::Module(Context const & context, std::string const & source) : context_(context), source_(header(context.device()) + source){ + ContextSwitcher ctx_switch(context_); + + //Path to custom PTX compiler + std::string compiler = tools::getenv("ISAAC_PTXAS"); + if(compiler.size()){ + auto cc = context.device().compute_capability(); + std::string out = context.cache_path() + "tmp.o"; + std::string opt = " --gpu-name sm_" + std::to_string(cc.first) + std::to_string(cc.second) + + " -o " + out + + " -ias \"" + source_ + "\""; + std::string cmd = compiler + opt; + if(std::system(cmd.c_str()) != 0) + throw; + dispatch::cuModuleLoad(&*cu_, out.c_str()); + } + //JIT Compilation + else{ + CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; + unsigned int errbufsize = 8096; + std::string errbuf(errbufsize, 0); + //CUjit_target_enum target = cutarget(context.device().architecture()); + void* optval[] = {(void*)(uintptr_t)errbufsize, (void*)errbuf.data()}; + try{ + dispatch::cuModuleLoadDataEx(&*cu_, source_.data(), 2, opt, optval); + }catch(exception::cuda::base const &){ + std::cerr << "Compilation Failed! 
Log: " << std::endl; + std::cerr << errbuf << std::endl; + throw; + } + } +} + +Context const & Module::context() const +{ return context_; } + +Handle const & Module::cu() const +{ return cu_; } + +Buffer Module::symbol(const char *name) const{ + CUdeviceptr handle; + size_t size; + dispatch::cuModuleGetGlobal_v2(&handle, &size, *cu_, name); + return Buffer(context_, handle, false); +} + + +} +} + diff --git a/lib/driver/platform.cpp b/lib/driver/platform.cpp new file mode 100755 index 000000000..b9722933d --- /dev/null +++ b/lib/driver/platform.cpp @@ -0,0 +1,56 @@ +/* Copyright 2015-2017 Philippe Tillet +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, +* subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + + +#include "driver/platform.h" +#include "driver/device.h" + +#include + +namespace tdl +{ +namespace driver +{ + +std::string Platform::version() const{ + int version; + dispatch::cuDriverGetVersion(&version); + return std::to_string(version); +} + +std::string Platform::name() const +{ return (std::string)"CUDA"; } + +std::vector Platform::devices() const{ + std::vector devices; + int N; + dispatch::cuDeviceGetCount(&N); + for(int i = 0 ; i < N ; ++i){ + CUdevice device; + dispatch::cuDeviceGet(&device, i); + devices.push_back(Device(device)); + } + return devices; +} + +} +} diff --git a/lib/driver/stream.cpp b/lib/driver/stream.cpp new file mode 100755 index 000000000..0296eba40 --- /dev/null +++ b/lib/driver/stream.cpp @@ -0,0 +1,95 @@ +/* Copyright 2015-2017 Philippe Tillet +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, +* subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +#include +#include +#include + +#include "driver/backend.h" +#include "driver/stream.h" +#include "driver/context.h" +#include "driver/device.h" +#include "driver/event.h" +#include "driver/kernel.h" +#include "driver/buffer.h" + +namespace tdl +{ + +namespace driver +{ + +inline CUcontext cucontext(){ + CUcontext result; + dispatch::cuCtxGetCurrent(&result); + return result; +} + +Stream::Stream(CUstream stream, bool take_ownership): context_(cucontext(), take_ownership), cu_(stream, take_ownership) +{} + +Stream::Stream(Context const & context): context_(context), cu_(CUstream(), true) +{ + ContextSwitcher ctx_switch(context_); + dispatch::cuStreamCreate(&*cu_, 0); +} + +void Stream::synchronize() +{ + ContextSwitcher ctx_switch(context_); + dispatch::cuStreamSynchronize(*cu_); +} + +Context const & Stream::context() const +{ return context_; } + +void Stream::enqueue(Kernel const & kernel, std::array grid, std::array block, std::vector const *, Event* event){ + ContextSwitcher ctx_switch(context_); + if(event) + dispatch::cuEventRecord(((cu_event_t)*event).first, *cu_); + dispatch::cuLaunchKernel(kernel, grid[0], grid[1], grid[2], block[0], block[1], block[2], 0, *cu_,(void**)kernel.cu_params(), NULL); + if(event) + dispatch::cuEventRecord(((cu_event_t)*event).second, *cu_); +} + +void Stream::write(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr){ + ContextSwitcher ctx_switch(context_); + if(blocking) + dispatch::cuMemcpyHtoD(buffer + offset, ptr, size); + else + dispatch::cuMemcpyHtoDAsync(buffer + offset, ptr, size, *cu_); +} + +void Stream::read(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr){ + ContextSwitcher ctx_switch(context_); + if(blocking) + dispatch::cuMemcpyDtoH(ptr, buffer + offset, size); + else + dispatch::cuMemcpyDtoHAsync(ptr, buffer + offset, size, *cu_); +} + +Handle const & Stream::cu() const +{ return cu_; } + +} + +} From f697fcb8877f7f66b3d61f634d569060d2a04841 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 8 Feb 2019 18:05:43 -0500 Subject: [PATCH 064/494] [driver] fixed some bugs --- include/driver/platform.h | 2 +- lib/driver/dispatch.cpp | 25 -------------- lib/driver/error.cpp | 17 --------- lib/driver/module.cpp | 73 +++++++-------------------------------- lib/driver/platform.cpp | 3 -- 5 files changed, 13 insertions(+), 107 deletions(-) diff --git a/include/driver/platform.h b/include/driver/platform.h index 2a3b8fcdb..add506e82 100755 --- a/include/driver/platform.h +++ b/include/driver/platform.h @@ -40,7 +40,7 @@ class Platform { public: //Accessors - std::string name() const; + std::string name() const { return "CUDA"; } std::string version() const; std::vector devices() const; private: diff --git a/lib/driver/dispatch.cpp b/lib/driver/dispatch.cpp index 4551bf072..2d0cd5232 100755 --- a/lib/driver/dispatch.cpp +++ b/lib/driver/dispatch.cpp @@ -85,18 +85,6 @@ namespace driver #define CUDA_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) #define CUDA_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) 
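// ---------------------------------------------------------------------------
// [editorial note, not part of the patch] The CUDA_DEFINEn macros above expand
// to lazy-binding wrappers around dlopen()/dlsym(); the exact expansion lives
// in dispatch.h, which this series does not show. A minimal sketch of the
// pattern, with the hypothetical local name `fptr` (cuinit() performs the
// dlopen of libcuda.so, just as the nvrtcinit() removed below did for
// libnvrtc.so):
//
//   CUresult dispatch::cuDeviceGet(CUdevice *dev, int ordinal) {
//     static void *fptr = nullptr;                 // resolved at most once
//     if (!cuinit())                               // ensure libcuda is loaded
//       throw exception::cuda::unknown();
//     if (!fptr)
//       fptr = dlsym(cuda_, "cuDeviceGet");        // bind the driver symbol
//     return reinterpret_cast<CUresult (*)(CUdevice*, int)>(fptr)(dev, ordinal);
//   }
//
// Because no link-time dependency on the library ever exists, the hunk below
// can drop the whole NVRTC family simply by deleting its macro invocations.
// ---------------------------------------------------------------------------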
-#define NVRTC_DEFINE1(ret, fname, t1) DEFINE1(nvrtcinit, nvrtc_, ret, fname, t1) -#define NVRTC_DEFINE2(ret, fname, t1, t2) DEFINE2(nvrtcinit, nvrtc_, ret, fname, t1, t2) -#define NVRTC_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3) -#define NVRTC_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4) -#define NVRTC_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5) -#define NVRTC_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6) -#define NVRTC_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7) -#define NVRTC_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) -#define NVRTC_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) -#define NVRTC_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) -#define NVRTC_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(nvrtcinit, nvrtc_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) - #define NVML_DEFINE0(ret, fname) DEFINE0(nvmlinit, nvml_, ret, fname) #define NVML_DEFINE1(ret, fname, t1) DEFINE1(nvmlinit, nvml_, ret, fname, t1) #define NVML_DEFINE2(ret, fname, t1, t2) DEFINE2(nvmlinit, nvml_, ret, fname, t1, t2) @@ -127,12 +115,6 @@ bool dispatch::cuinit(){ return cuda_ != nullptr; } -bool dispatch::nvrtcinit(){ - if(nvrtc_==nullptr) - nvrtc_ = dlopen("libnvrtc.so", RTLD_LAZY); - return nvrtc_ != nullptr; -} - bool dispatch::nvmlinit(){ if(nvml_==nullptr) nvml_ = dlopen("libnvidia-ml.so", RTLD_LAZY); @@ -194,13 +176,6 @@ CUDA_DEFINE4(CUresult, cuMemsetD8Async, CUdeviceptr, unsigned char, size_t, CUst CUDA_DEFINE1(CUresult, cuCtxPushCurrent_v2, CUcontext) CUDA_DEFINE1(CUresult, cuCtxPopCurrent_v2, CUcontext*) -NVRTC_DEFINE3(nvrtcResult, nvrtcCompileProgram, nvrtcProgram, int, const char **) -NVRTC_DEFINE2(nvrtcResult, nvrtcGetProgramLogSize, nvrtcProgram, size_t *) -NVRTC_DEFINE2(nvrtcResult, nvrtcGetPTX, nvrtcProgram, char *) -NVRTC_DEFINE2(nvrtcResult, nvrtcGetPTXSize, nvrtcProgram, size_t *) -NVRTC_DEFINE6(nvrtcResult, nvrtcCreateProgram, nvrtcProgram *, const char *, const char *, int, const char **, const char **) -NVRTC_DEFINE2(nvrtcResult, nvrtcGetProgramLog, nvrtcProgram, char *) - NVML_DEFINE2(nvmlReturn_t, nvmlDeviceGetHandleByPciBusId_v2, const char *, nvmlDevice_t*) NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*) NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*) diff --git a/lib/driver/error.cpp b/lib/driver/error.cpp index 42d3a780d..7e7dc9d75 100755 --- a/lib/driver/error.cpp +++ b/lib/driver/error.cpp @@ -94,23 +94,6 @@ void check(CUresult err) } } -void check(nvrtcResult err){ - using namespace exception::nvrtc; - - switch(err) - { - case NVRTC_SUCCESS: break; - case NVRTC_ERROR_OUT_OF_MEMORY: throw out_of_memory(); - case NVRTC_ERROR_PROGRAM_CREATION_FAILURE: throw program_creation_failure(); - case NVRTC_ERROR_INVALID_INPUT: throw invalid_input(); - case NVRTC_ERROR_INVALID_PROGRAM: throw invalid_program(); - case NVRTC_ERROR_INVALID_OPTION: throw invalid_option(); - case NVRTC_ERROR_COMPILATION: throw compilation(); - case 
NVRTC_ERROR_BUILTIN_OPERATION_FAILURE: throw builtin_operation_failure(); - default: throw unknown_error(); - } -} - void check(cublasStatus_t err){ using namespace exception::cublas; switch(err) diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 06a6ff437..c61482cbc 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -34,68 +34,19 @@ namespace tdl namespace driver { -CUjit_target_enum cutarget(Device::Architecture arch){ - switch(arch){ - case Device::Architecture::SM_2_0: return CU_TARGET_COMPUTE_20; - case Device::Architecture::SM_2_1: return CU_TARGET_COMPUTE_21; - case Device::Architecture::SM_3_0: return CU_TARGET_COMPUTE_30; - case Device::Architecture::SM_3_5: return CU_TARGET_COMPUTE_35; - case Device::Architecture::SM_3_7: return CU_TARGET_COMPUTE_37; - case Device::Architecture::SM_5_0: return CU_TARGET_COMPUTE_50; - case Device::Architecture::SM_5_2: return CU_TARGET_COMPUTE_52; - case Device::Architecture::SM_6_0: return CU_TARGET_COMPUTE_60; - case Device::Architecture::SM_6_1: return CU_TARGET_COMPUTE_61; - default: throw; - } -} - -inline std::pair ptx(std::pair sm){ - if(sm.first == 7) return {6, 0}; - if(sm.first == 6) return {5, 0}; - if(sm.first == 5) return {4, 3}; - throw; -} - -std::string Module::header(Device const & device){ - auto cc = device.compute_capability(); - auto vptx = ptx(cc); - std::string header; - header += ".version " + std::to_string(vptx.first) + "." + std::to_string(vptx.second) + "\n"; - header += ".target sm_" + std::to_string(cc.first) + std::to_string(cc.second) + "\n"; - header += ".address_size 64\n"; - return header; -} - -Module::Module(Context const & context, std::string const & source) : context_(context), source_(header(context.device()) + source){ +Module::Module(Context const & context, std::string const & source) : context_(context), source_(source){ ContextSwitcher ctx_switch(context_); - - //Path to custom PTX compiler - std::string compiler = tools::getenv("ISAAC_PTXAS"); - if(compiler.size()){ - auto cc = context.device().compute_capability(); - std::string out = context.cache_path() + "tmp.o"; - std::string opt = " --gpu-name sm_" + std::to_string(cc.first) + std::to_string(cc.second) - + " -o " + out - + " -ias \"" + source_ + "\""; - std::string cmd = compiler + opt; - if(std::system(cmd.c_str()) != 0) - throw; - dispatch::cuModuleLoad(&*cu_, out.c_str()); - } - //JIT Compilation - else{ - CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; - unsigned int errbufsize = 8096; - std::string errbuf(errbufsize, 0); - //CUjit_target_enum target = cutarget(context.device().architecture()); - void* optval[] = {(void*)(uintptr_t)errbufsize, (void*)errbuf.data()}; - try{ - dispatch::cuModuleLoadDataEx(&*cu_, source_.data(), 2, opt, optval); - }catch(exception::cuda::base const &){ - std::cerr << "Compilation Failed! Log: " << std::endl; - std::cerr << errbuf << std::endl; - throw; - } + // JIT compile source-code + CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; + unsigned int errbufsize = 8096; + std::string errbuf(errbufsize, 0); + void* optval[] = {(void*)(uintptr_t)errbufsize, (void*)errbuf.data()}; + try{ + dispatch::cuModuleLoadDataEx(&*cu_, source_.data(), 2, opt, optval); + }catch(exception::cuda::base const &){ + std::cerr << "Compilation Failed! 
Log: " << std::endl; + std::cerr << errbuf << std::endl; + throw; } } diff --git a/lib/driver/platform.cpp b/lib/driver/platform.cpp index b9722933d..2fa9933d3 100755 --- a/lib/driver/platform.cpp +++ b/lib/driver/platform.cpp @@ -37,9 +37,6 @@ std::string Platform::version() const{ return std::to_string(version); } -std::string Platform::name() const -{ return (std::string)"CUDA"; } - std::vector Platform::devices() const{ std::vector devices; int N; From 77dd99efe8bc3787cd2b61043d0c0915b67c0364 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 8 Feb 2019 23:32:17 -0500 Subject: [PATCH 065/494] [code generation] bug fixes in grid axes binding --- CMakeLists.txt | 1 - examples/matrix.cpp | 40 +++++++++++-------------------------- include/codegen/selection.h | 2 +- lib/codegen/selection.cpp | 19 +++++++++--------- 4 files changed, 22 insertions(+), 40 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b4c28cebc..2531e84ca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,7 +34,6 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${LLVM_CXXFLAGS} -std=c++11") # TDL file(GLOB_RECURSE LIBTDL_SRC lib/*.cpp) add_library(tdl SHARED ${LIBTDL_SRC} ${BISON_Parser_OUTPUTS} ${FLEX_Lexer_OUTPUTS}) -message(STATUS ${llvm_libs}) target_link_libraries(tdl ${llvm_libs}) # Examples diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 1283e651d..3831f2593 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -31,22 +31,10 @@ extern translation_unit *ast_root; const char src[] = "\ void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K){\ - int32 rx[32] = get_global_range[32](0);\ - int32 ry[32] = get_global_range[32](1);\ - int32 rka[8] = 0 ... 8;\ - int32 rkb[8] = 0 ... 8;\ - fp32 C[32, 32] = 0;\ - int32 k;\ - fp32* pa[32, 8] = a + rx[:, newaxis] + rka[newaxis, :]*M;\ - fp32* pb[32, 8] = b + ry[:, newaxis] + rkb[newaxis, :]*K;\ - fp32* pc[32, 32] = c + rx[:, newaxis] + ry[newaxis, :]*M;\ - for(k = K; k > 0; k = k - 8){\ - fp32 a[32, 8] = *pa;\ - fp32 b[32, 8] = *pb;\ - C = C + 1;\ - pa = pa + 8*M;\ - pb = pb + 8*K;\ - }\ + int32 rx[16] = get_global_range[16](0);\ + int32 ry[16] = get_global_range[16](1);\ + fp32 C[16, 16] = 1;\ + fp32* pc[16, 16] = c + rx[:, newaxis] + ry[newaxis, :]*M;\ *pc = C;\ }\ "; @@ -151,14 +139,10 @@ int main() { // tuning parameters tune.run(module); std::vector params = { - // asm + // c0 2, 8, 1, - // bsn + // c1 4, 4, 1, - // pa - 2, 4, 1, - // pb - 1, 8, 1, }; std::map> errors; unsigned i = 0; @@ -184,7 +168,7 @@ int main() { // generate machine code std::string src = generate_machine_code(llvm_module, "nvptx64-nvidia-cuda", compute_data_layout(true, true)); - std::cout << src << std::endl; +// std::cout << src << std::endl; // compile machine code CUdevice cu_device; @@ -220,16 +204,16 @@ int main() { void *args[] = { &d_a, &d_b, &d_c, &M, &N, &K}; int num_regs; cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, cu_kernel); - unsigned TM = params[0]*params[1]; - unsigned TN = params[3]*params[4]; - unsigned nthreads = params[1]*params[2]*params[7]*params[8]; + unsigned TM = 16; + unsigned TN = 16; + unsigned nthreads = 32; checkCudaErrors(cuLaunchKernel(cu_kernel, M/TM, N/TN, 1, nthreads, 1, 1, 0, cu_stream, args, NULL)); checkCudaErrors(cuStreamSynchronize(cu_stream)); // Write back checkCudaErrors(cuMemcpyDtoH(c.data(), d_c, sizeof(numeric_t) * c.size())); for(size_t i = 0; i < M*N; i++) - if(c[i] == 32) - std::cout << i << " " << "success" << std::endl; + if(c[i] != 1) + std::cout << i << " " << "failure" << std::endl; 
return 0; } diff --git a/include/codegen/selection.h b/include/codegen/selection.h index 5aea5564f..179e664ce 100644 --- a/include/codegen/selection.h +++ b/include/codegen/selection.h @@ -109,7 +109,7 @@ private: tmap_t tmap_; allocation *alloc_; tune *params_; - std::map> axes_; + std::map axes_; }; } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index cb22d972c..d4d6394a1 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -63,7 +63,8 @@ Value* shared_tile::shared_offset(indices_t idx) { return result; } -shared_tile::shared_tile(Type *ty, const shapes_t &shapes, Value *ptr, llvm::IRBuilder<> &builder): tile(ty, shapes), ptr_(ptr), builder_(builder) { +shared_tile::shared_tile(Type *ty, const shapes_t &shapes, Value *ptr, llvm::IRBuilder<> &builder): + tile(ty, shapes), ptr_(ptr), builder_(builder) { } @@ -236,8 +237,8 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id std::vector thread_id_in_warp = delinearize(u_thread_id, warp_size, builder); std::vector warp_id = delinearize(u_warp_id, n_warps, builder); // Create axes - std::vector axes(dim); for(unsigned k = 0; k < dim; k++) { + std::string str_k = std::to_string(k); Value *warp_size_k = builder.getInt32(warp_size[k]); Value *contiguous_k = builder.getInt32(contiguous[k]); Value *thread_id = builder.CreateAdd(thread_id_in_warp[k], builder.CreateMul(warp_id[k], warp_size_k)); @@ -247,12 +248,10 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id std::vector idx_list(per_thread); for(unsigned n = 0 ; n < per_thread; n++){ unsigned offset = n / contiguous[k] * per_block + n % contiguous[k]; - idx_list[n] = builder.CreateAdd(thread_id, builder.getInt32(offset)); + idx_list[n] = builder.CreateAdd(thread_id, builder.getInt32(offset), "idx_" + str_k + "_" + std::to_string(n)); } - axes[k] = distributed_axis{idx_list}; + axes_[params_->get_param(v, "p0.d" + str_k)] = distributed_axis{idx_list}; } - // Store axes - axes_[v] = axes; } void selection::create_grids(std::vector &grids, @@ -327,7 +326,7 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, for(size_t d = 0; d < shapes.size(); d++){ if(shapes[d] > 1){ unsigned *x = params_->get_param(v, "p0.d" + std::to_string(d)); - axes[d] = axes_.at(references.at(x))[d]; + axes[d] = axes_.at(x); } else axes[d].values = {builder.getInt32(0)}; @@ -337,6 +336,7 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, // constant range if(dynamic_cast(v)) T->for_each([&](indices_t idx){ + assert(idx.size() == 1); T->set_value(idx, idx[0]); }); @@ -397,8 +397,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & Value *offset = builder.CreateMul(builder.getInt32(shapes[0]), group_id); result->for_each([&](indices_t idx){ BinaryOperator *bin = static_cast(idx[0]); - result->set_value(idx, builder.CreateAdd(bin->getOperand(1), - builder.CreateAdd(bin->getOperand(0), offset))); + result->set_value(idx, builder.CreateAdd(bin, offset)); }); } // reshape @@ -430,8 +429,8 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & for(size_t k = 0; k < in_idx.size(); k++){ if(in_shapes[k] == 1) in_idx[k] = builder.getInt32(0); - result->set_value(out_idx, in_tile->get_value(in_idx)); } + result->set_value(out_idx, in_tile->get_value(in_idx)); }); } // copy to shared From 4c8dbcccdcc4b5b57918ab5dc621106d26f35b19 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 8 Feb 2019 23:49:18 -0500 Subject: [PATCH 066/494] 
test --- examples/matrix.cpp | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 3831f2593..a6fb5c168 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -33,8 +33,20 @@ const char src[] = void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K){\ int32 rx[16] = get_global_range[16](0);\ int32 ry[16] = get_global_range[16](1);\ - fp32 C[16, 16] = 1;\ + int32 rka[8] = 0 ... 8;\ + int32 rkb[8] = 0 ... 8;\ + fp32 C[16, 16] = 0;\ + int32 k;\ + fp32* pa[16, 8] = a + rx[:, newaxis] + rka[newaxis, :]*M;\ + fp32* pb[16, 8] = b + ry[:, newaxis] + rkb[newaxis, :]*K;\ fp32* pc[16, 16] = c + rx[:, newaxis] + ry[newaxis, :]*M;\ + for(k = K; k > 0; k = k - 8){\ + fp32 a[16, 8] = *pa;\ + fp32 b[16, 8] = *pb;\ + C = C + 1;\ + pa = pa + 8*M;\ + pb = pb + 8*K;\ + }\ *pc = C;\ }\ "; @@ -139,10 +151,14 @@ int main() { // tuning parameters tune.run(module); std::vector params = { - // c0 + // asm 2, 8, 1, - // c1 + // bsn 4, 4, 1, + // pa + 2, 4, 1, + // pb + 1, 8, 1, }; std::map> errors; unsigned i = 0; @@ -206,14 +222,14 @@ int main() { cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, cu_kernel); unsigned TM = 16; unsigned TN = 16; - unsigned nthreads = 32; + unsigned nthreads = params[1]*params[2]*params[7]*params[8]; checkCudaErrors(cuLaunchKernel(cu_kernel, M/TM, N/TN, 1, nthreads, 1, 1, 0, cu_stream, args, NULL)); checkCudaErrors(cuStreamSynchronize(cu_stream)); // Write back checkCudaErrors(cuMemcpyDtoH(c.data(), d_c, sizeof(numeric_t) * c.size())); - for(size_t i = 0; i < M*N; i++) - if(c[i] != 1) - std::cout << i << " " << "failure" << std::endl; - return 0; + std::cout << c[0] << " " << c[1] << " " << c[2] << " " << c[3] << std::endl; +// for(size_t i = 0; i < M*N; i++) +// if(c[i] != 32) +// std::cout << i << " " << "success" << std::endl; } From d39f97ef38073d36b9f54f3485e35eb6dd029347 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 9 Feb 2019 19:20:50 -0500 Subject: [PATCH 067/494] [code generation] simple matrix-multiplication working --- examples/matrix.cpp | 54 ++++++++++++++++++++++++++------------- lib/codegen/selection.cpp | 4 ++- 2 files changed, 39 insertions(+), 19 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index a6fb5c168..03d206ba0 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -31,19 +31,21 @@ extern translation_unit *ast_root; const char src[] = "\ void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K){\ - int32 rx[16] = get_global_range[16](0);\ - int32 ry[16] = get_global_range[16](1);\ + int32 rxa[16] = get_global_range[16](0);\ + int32 ryb[16] = get_global_range[16](1);\ int32 rka[8] = 0 ... 8;\ int32 rkb[8] = 0 ... 
8;\ + int32 rxc[16] = get_global_range[16](0);\ + int32 ryc[16] = get_global_range[16](1);\ fp32 C[16, 16] = 0;\ int32 k;\ - fp32* pa[16, 8] = a + rx[:, newaxis] + rka[newaxis, :]*M;\ - fp32* pb[16, 8] = b + ry[:, newaxis] + rkb[newaxis, :]*K;\ - fp32* pc[16, 16] = c + rx[:, newaxis] + ry[newaxis, :]*M;\ + fp32* pa[16, 8] = a + rxa[:, newaxis] + rka[newaxis, :]*M;\ + fp32* pb[16, 8] = b + ryb[:, newaxis] + rkb[newaxis, :]*K;\ + fp32* pc[16, 16] = c + rxc[:, newaxis] + ryc[newaxis, :]*M;\ for(k = K; k > 0; k = k - 8){\ fp32 a[16, 8] = *pa;\ fp32 b[16, 8] = *pb;\ - C = C + 1;\ + C = dot(a, b, C);\ pa = pa + 8*M;\ pb = pb + 8*K;\ }\ @@ -127,6 +129,17 @@ static void compile_machine_code(CUdevice &device, CUcontext &context, CUmodule checkCudaErrors(cuModuleGetFunction(&function, module, name.c_str())); } +template +void simple_gemm(std::vector &c, const std::vector &a, const std::vector &b, size_t M, size_t N, size_t K){ + for(size_t m = 0; m < M; m++) + for(size_t n = 0; n < N; n++){ + T acc = 0; + for(size_t k = 0; k < K; k++) + acc += a[m + k*M] * b[n + k*N]; + c[m + n*M] = acc; + } +} + int main() { // create AST from Triton-C source YY_BUFFER_STATE buffer = yy_scan_string(src); @@ -151,14 +164,18 @@ int main() { // tuning parameters tune.run(module); std::vector params = { - // asm + // a0 2, 8, 1, - // bsn + // b0 4, 4, 1, - // pa + // c0 + 2, 8, 1, + // c1 + 4, 4, 1, + // a1 2, 4, 1, - // pb - 1, 8, 1, + // b1 + 1, 8, 1 }; std::map> errors; unsigned i = 0; @@ -194,12 +211,14 @@ int main() { CUstream cu_stream; int major, minor; compile_machine_code(cu_device, cu_context, cu_module, cu_kernel, cu_stream, major, minor, src, "test"); +// std::cout << src << std::endl; // execute machine code // Allocate buffers typedef float numeric_t; - size_t M = 256, N = 256, K = 256; + size_t M = 32, N = 32, K = 32; std::vector c(M*N); + std::vector rc(M*N); std::vector a(M*K); std::vector b(K*N); for(size_t i = 0; i < a.size(); i++) @@ -222,14 +241,13 @@ int main() { cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, cu_kernel); unsigned TM = 16; unsigned TN = 16; - unsigned nthreads = params[1]*params[2]*params[7]*params[8]; + unsigned nthreads = 32; checkCudaErrors(cuLaunchKernel(cu_kernel, M/TM, N/TN, 1, nthreads, 1, 1, 0, cu_stream, args, NULL)); checkCudaErrors(cuStreamSynchronize(cu_stream)); // Write back checkCudaErrors(cuMemcpyDtoH(c.data(), d_c, sizeof(numeric_t) * c.size())); - - std::cout << c[0] << " " << c[1] << " " << c[2] << " " << c[3] << std::endl; -// for(size_t i = 0; i < M*N; i++) -// if(c[i] != 32) -// std::cout << i << " " << "success" << std::endl; + simple_gemm(rc, a, b, M, N, K); + for(size_t i = 0; i < M*N; i++) + if(std::abs(c[i] - rc[i])/std::max(c[i], rc[i]) > 1e-4) + std::cout << i << " " << c[i] << " " << rc[i] << std::endl; } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index d4d6394a1..1f29275f9 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -317,6 +317,7 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, if(is_shared){ size_t offset = alloc_->get_offset(v); Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset)); + ptr = builder.CreateBitCast(ptr, ty->getPointerTo(ptr->getType()->getPointerAddressSpace())); tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)}); } // create distributed tile @@ -445,6 +446,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & ir::value *A = ins->get_operand(0); ir::value *B = ins->get_operand(1); ir::value *C = 
ins->get_operand(2); + Function *f_mul_add = Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {llvm_type(C->get_type()->get_scalar_ty(), ctx)}); result->for_each([&](indices_t idx){ Value *res = tmap_.at(C)->get_value(idx); unsigned NK = A->get_type()->get_tile_shapes()[1]; @@ -453,7 +455,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & indices_t b_idx = {idx[1], builder.getInt32(K)}; Value *a = tmap_.at(A)->get_value(a_idx); Value *b = tmap_.at(B)->get_value(b_idx); - res = builder.CreateAdd(res, builder.CreateMul(a, b)); + res = builder.CreateCall(f_mul_add, {a, b, res}); } result->set_value(idx, res); }); From 4a0736ce206ffc3df0589cac20a9c71896b4f4e4 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 9 Feb 2019 23:56:53 -0500 Subject: [PATCH 068/494] [code generation] in-place CSE in shared memory reads --- examples/matrix.cpp | 2 ++ include/codegen/selection.h | 4 +++ lib/codegen/selection.cpp | 55 +++++++++++++++++++++++++++++++++++-- 3 files changed, 58 insertions(+), 3 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 03d206ba0..5783f6f3d 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -20,6 +20,7 @@ #include "llvm/Target/TargetOptions.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/LegacyPassManager.h" +#include "llvm/Transforms/Scalar/EarlyCSE.h" typedef struct yy_buffer_state * YY_BUFFER_STATE; extern int yyparse(); @@ -194,6 +195,7 @@ int main() { liveness.run(module); allocation.run(); selection.run(module, llvm_module); + // llvm source llvm::PrintModulePass print(llvm::outs()); llvm::AnalysisManager analysis; diff --git a/include/codegen/selection.h b/include/codegen/selection.h index 179e664ce..1e413f707 100644 --- a/include/codegen/selection.h +++ b/include/codegen/selection.h @@ -44,6 +44,9 @@ protected: class shared_tile: public tile { private: + void extract_constant(llvm::Value *arg, llvm::Value *&non_cst, llvm::Value *&cst); + void extract_constant(const indices_t &arg_idx, indices_t &non_cst_idx, indices_t &cst_idx); + llvm::Value* shared_offset(indices_t idx); public: @@ -54,6 +57,7 @@ public: private: llvm::Value *ptr_; llvm::IRBuilder<> &builder_; + std::map ptr_cache_; }; class distributed_tile: public tile{ diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 1f29275f9..24f8ea482 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -7,7 +7,7 @@ #include "ir/module.h" #include "ir/function.h" #include "ir/type.h" - +#include "llvm/Transforms/Scalar/EarlyCSE.h" namespace tdl{ namespace codegen{ @@ -55,6 +55,51 @@ void distributed_tile::for_each(std::function fn) { } /* Shared Tile */ +void shared_tile::extract_constant(Value *arg, Value *&non_cst, Value *&cst) { + BinaryOperator *bin_op = dyn_cast(arg); + Constant *_0 = ConstantInt::get(Type::getInt32Ty(arg->getContext()), 0); + if(dyn_cast(arg)){ + cst = arg; + non_cst = _0; + return; + } + if(!bin_op || bin_op->getOpcode() != llvm::BinaryOperator::Add){ + non_cst = arg; + cst = _0; + return; + } + Constant *cst_lhs = dyn_cast(bin_op->getOperand(0)); + Constant *cst_rhs = dyn_cast(bin_op->getOperand(1)); + if(cst_lhs && cst_rhs){ + cst = arg; + non_cst = _0; + } + else if(cst_lhs){ + cst = cst_lhs; + non_cst = bin_op->getOperand(1); + } + else if(cst_rhs){ + cst = cst_rhs; + non_cst = bin_op->getOperand(0); + } + else{ + non_cst = arg; + cst = _0; + } +} + +void shared_tile::extract_constant(const indices_t &arg_idx, indices_t &non_cst_idx, indices_t &cst_idx) { + 
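  // [editorial note, not part of the patch] This overload splits every
  // coordinate of a tile index into idx = non_cst + cst using the scalar
  // extract_constant() defined above. The non-constant halves key the
  // pointer cache in shared_tile::get_value() further down, so reads that
  // differ only by a compile-time offset share a single GEP -- the
  // "in-place CSE" of this commit's title. E.g. unrolled K-loop reads at
  // {i, k}, {i, k+1}, ... all reuse the base pointer computed for {i, k}.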
non_cst_idx.clear(); + cst_idx.clear(); + for(Value *idx: arg_idx){ + Value *non_cst, *cst; + extract_constant(idx, non_cst, cst); + non_cst_idx.push_back(non_cst); + cst_idx.push_back(cst); + } +} + + Value* shared_tile::shared_offset(indices_t idx) { Value *result = builder_.getInt32(0); result = builder_.CreateAdd(result, idx[0]); @@ -65,7 +110,6 @@ Value* shared_tile::shared_offset(indices_t idx) { shared_tile::shared_tile(Type *ty, const shapes_t &shapes, Value *ptr, llvm::IRBuilder<> &builder): tile(ty, shapes), ptr_(ptr), builder_(builder) { - } void shared_tile::set_value(indices_t idx, Value *value) { @@ -74,7 +118,12 @@ void shared_tile::set_value(indices_t idx, Value *value) { } Value* shared_tile::get_value(indices_t idx) { - Value *ptr = builder_.CreateGEP(ptr_, shared_offset(idx)); + indices_t non_cst_idx, cst_idx; + extract_constant(idx, non_cst_idx, cst_idx); + Value *&base_ptr = ptr_cache_[non_cst_idx]; + if(base_ptr == nullptr) + base_ptr = builder_.CreateGEP(ptr_, shared_offset(non_cst_idx)); + Value *ptr = builder_.CreateGEP(base_ptr, shared_offset(cst_idx)); return builder_.CreateLoad(ptr); } From 3d07e909c6ee9183c2ef24e2456c274ce76a1456 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 10 Feb 2019 18:29:25 -0500 Subject: [PATCH 069/494] attempting vectorization --- examples/matrix.cpp | 2 +- include/codegen/selection.h | 13 +++++-- lib/codegen/selection.cpp | 68 +++++++++++++++++++++++++++++++------ 3 files changed, 69 insertions(+), 14 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 5783f6f3d..9f9e2a724 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -213,7 +213,7 @@ int main() { CUstream cu_stream; int major, minor; compile_machine_code(cu_device, cu_context, cu_module, cu_kernel, cu_stream, major, minor, src, "test"); -// std::cout << src << std::endl; + std::cout << src << std::endl; // execute machine code // Allocate buffers diff --git a/include/codegen/selection.h b/include/codegen/selection.h index 1e413f707..73c72f120 100644 --- a/include/codegen/selection.h +++ b/include/codegen/selection.h @@ -25,6 +25,7 @@ class tune; typedef std::vector indices_t; struct distributed_axis { + size_t contiguous; std::vector values; }; @@ -33,7 +34,7 @@ protected: typedef std::vector shapes_t; public: - tile(llvm::Type *ty, const shapes_t &shapes): shapes_(shapes){ } + tile(llvm::Type *ty, const shapes_t &shapes): ty_(ty), shapes_(shapes){ } virtual void set_value(indices_t idx, llvm::Value *v) = 0; virtual llvm::Value* get_value(indices_t idx) = 0; @@ -69,7 +70,9 @@ private: void init_indices(); public: - distributed_tile(llvm::Type *ty, const shapes_t& shapes, const axes_t &axes); + distributed_tile(llvm::Type *ty, const shapes_t& shapes, const axes_t &axes, llvm::IRBuilder<> &builder); + void set_vectorized_iteration() { vectorized_ = true; } + void unset_vectorized_iteration() { vectorized_ = false; } void set_value(indices_t idx, llvm::Value *v); llvm::Value* get_value(indices_t idx); void for_each(std::function fn); @@ -78,6 +81,9 @@ private: axes_t axes_; indices_map_t indices_; values_t values_; + size_t vector_size_; + llvm::IRBuilder<> &builder_; + bool vectorized_; }; @@ -86,6 +92,9 @@ class selection{ typedef std::map tmap_t; private: + // utils + llvm::Type *make_vector_ty(llvm::Type *ty, size_t vector_size); + // LLVM conversions llvm::Type* llvm_type(ir::type *ty, llvm::LLVMContext &ctx); llvm::Value* llvm_value(ir::value *v, llvm::IRBuilder<> &builder); diff --git a/lib/codegen/selection.cpp 
b/lib/codegen/selection.cpp index 24f8ea482..501e25f49 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -34,24 +34,52 @@ void distributed_tile::init_indices() { } } -distributed_tile::distributed_tile(Type *ty, const shapes_t &shapes, const axes_t &axes) - : tile(ty, shapes), axes_(axes) { +distributed_tile::distributed_tile(Type *ty, const shapes_t &shapes, const axes_t &axes, llvm::IRBuilder<> &builder) + : tile(ty, shapes), axes_(axes), builder_(builder), vectorized_(true) { init_indices(); for(size_t i = 0; i < indices_.size(); i++) values_.push_back(UndefValue::get(ty)); + // vectorization + vector_size_ = 1; + if(ty->isVectorTy()) + vector_size_ = ty->getVectorNumElements(); } void distributed_tile::set_value(indices_t idx, Value *v) { - values_[indices_[idx]] = v; + unsigned value_idx = indices_[idx]; + Value *&result = values_[value_idx/vector_size_*vector_size_]; + if(v->getType() == result->getType()) { + assert(value_idx % vector_size_ == 0); + result = v; + } + // insert scalar in vector + else { + assert(vector_size_==1 || result->getType()->isVectorTy()); + assert(v->getType()->getScalarType() == result->getType()->getScalarType()); + result = builder_.CreateInsertElement(result, v, value_idx % vector_size_); + } } Value* distributed_tile::get_value(indices_t idx) { - return values_[indices_[idx]]; + unsigned value_idx = indices_[idx]; + Value *&result = values_[value_idx/vector_size_*vector_size_]; + if(vectorized_ || vector_size_ == 1) { + assert(value_idx % vector_size_ == 0); + return result; + } + // extract scalar from vector + else { + assert(result->getType()->isVectorTy()); + return builder_.CreateExtractElement(result, value_idx % vector_size_); + } + return result; } void distributed_tile::for_each(std::function fn) { - for(auto &idx: indices_) - fn(idx.first); + for(auto &idx: indices_) { + if(!vectorized_ || (idx.second % vector_size_ == 0)) + fn(idx.first); + } } /* Shared Tile */ @@ -121,12 +149,23 @@ Value* shared_tile::get_value(indices_t idx) { indices_t non_cst_idx, cst_idx; extract_constant(idx, non_cst_idx, cst_idx); Value *&base_ptr = ptr_cache_[non_cst_idx]; - if(base_ptr == nullptr) + if(base_ptr == nullptr){ base_ptr = builder_.CreateGEP(ptr_, shared_offset(non_cst_idx)); +// Type *vec_ty = VectorType::get(base_ptr->getType()->getPointerElementType(), vec_); +// Type *vec_ptr_ty = PointerType::get(vec_ty, base_ptr->getType()->getPointerElementType()); +// base_ptr = builder_.CreateBitCast(base_ptr, vec_ptr_ty); + } Value *ptr = builder_.CreateGEP(base_ptr, shared_offset(cst_idx)); return builder_.CreateLoad(ptr); } +/* helper to make vector type */ +llvm::Type *selection::make_vector_ty(llvm::Type *ty, size_t vector_size) { + if(vector_size == 1) + return ty; + return VectorType::get(ty, vector_size); +} + /* convert ir::type to Type */ Type *selection::llvm_type(ir::type *ty, LLVMContext &ctx) { // function @@ -299,7 +338,7 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id unsigned offset = n / contiguous[k] * per_block + n % contiguous[k]; idx_list[n] = builder.CreateAdd(thread_id, builder.getInt32(offset), "idx_" + str_k + "_" + std::to_string(n)); } - axes_[params_->get_param(v, "p0.d" + str_k)] = distributed_axis{idx_list}; + axes_[params_->get_param(v, "p0.d" + str_k)] = distributed_axis{contiguous[k], idx_list}; } } @@ -378,17 +417,22 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, unsigned *x = params_->get_param(v, "p0.d" + std::to_string(d)); axes[d] = axes_.at(x); 
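      // [editorial note, not part of the patch] axes_ is keyed by the tuning
      // parameter pointer (unsigned*) rather than by ir::value, so every
      // value that the tuner placed in the same connected component for
      // dimension d (e.g. rxa and the pointer tile pa derived from it)
      // resolves to the very same distributed_axis, i.e. identical
      // per-thread index Values. Sketch, assuming get_param returns the
      // component's shared parameter:
      //
      //   unsigned *p = params_->get_param(rxa, "p0.d0");
      //   unsigned *q = params_->get_param(pa,  "p0.d0");   // p == q
      //   // => axes_.at(p) and axes_.at(q) are one axis: the load of A is
      //   //    distributed exactly like the range rxa that indexes it.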
} - else + else{ + axes[d].contiguous = 1; axes[d].values = {builder.getInt32(0)}; + } } - distributed_tile *T = new distributed_tile(ty, shapes, axes); + distributed_tile *T = new distributed_tile(make_vector_ty(ty, axes[0].contiguous), shapes, axes, builder); tmap_.insert({v, T}); // constant range - if(dynamic_cast(v)) + if(dynamic_cast(v)){ + T->unset_vectorized_iteration(); T->for_each([&](indices_t idx){ assert(idx.size() == 1); T->set_value(idx, idx[0]); }); + T->set_vectorized_iteration(); + } } } @@ -454,6 +498,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & else if(dynamic_cast(ins)) { ir::value* in = ins->get_operand(0); distributed_tile *in_tile = (distributed_tile*)tmap_.at(in); + in_tile->unset_vectorized_iteration(); result->for_each([&](indices_t out_idx){ indices_t in_idx; for(size_t k = 0; k < shapes.size(); k++){ @@ -462,6 +507,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & } result->set_value(out_idx, in_tile->get_value(in_idx)); }); + in_tile->set_vectorized_iteration(); } // splat else if(dynamic_cast(ins)) { From 8ab5ca3de31fc68559629321697b588398d4f3c4 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 10 Feb 2019 20:41:07 -0500 Subject: [PATCH 070/494] blabla --- include/codegen/selection.h | 37 ++++++++++++++++------ lib/codegen/selection.cpp | 61 ++++++++++++++++++++++++------------- 2 files changed, 67 insertions(+), 31 deletions(-) diff --git a/include/codegen/selection.h b/include/codegen/selection.h index 73c72f120..729c36adb 100644 --- a/include/codegen/selection.h +++ b/include/codegen/selection.h @@ -70,23 +70,40 @@ private: void init_indices(); public: - distributed_tile(llvm::Type *ty, const shapes_t& shapes, const axes_t &axes, llvm::IRBuilder<> &builder); - void set_vectorized_iteration() { vectorized_ = true; } - void unset_vectorized_iteration() { vectorized_ = false; } - void set_value(indices_t idx, llvm::Value *v); + distributed_tile(llvm::Type *ty, const shapes_t& shapes, const axes_t &axes); + virtual void for_each(std::function fn) = 0; + +protected: + axes_t axes_; + indices_map_t indices_; + values_t values_; +}; + +class serialized_distributed_tile: public distributed_tile { +public: + using distributed_tile::distributed_tile; + +public: + void set_value(indices_t, llvm::Value *); + llvm::Value* get_value(indices_t idx); + void for_each(std::function fn); +}; + +class vectorized_distributed_tile: public distributed_tile { +private: + llvm::Type *make_vector_ty(llvm::Type *ty, size_t vector_size); + +public: + vectorized_distributed_tile(llvm::Type *ty, const shapes_t& shapes, const axes_t &axes, llvm::IRBuilder<> &builder); + void set_value(indices_t, llvm::Value *); llvm::Value* get_value(indices_t idx); void for_each(std::function fn); private: - axes_t axes_; - indices_map_t indices_; - values_t values_; - size_t vector_size_; llvm::IRBuilder<> &builder_; - bool vectorized_; + size_t vector_size_; }; - class selection{ typedef std::map vmap_t; typedef std::map tmap_t; diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 501e25f49..2510f89e2 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -34,17 +34,43 @@ void distributed_tile::init_indices() { } } -distributed_tile::distributed_tile(Type *ty, const shapes_t &shapes, const axes_t &axes, llvm::IRBuilder<> &builder) - : tile(ty, shapes), axes_(axes), builder_(builder), vectorized_(true) { + +distributed_tile::distributed_tile(Type *ty, const shapes_t &shapes, 
const axes_t &axes) + : tile(ty, shapes), axes_(axes) { init_indices(); for(size_t i = 0; i < indices_.size(); i++) - values_.push_back(UndefValue::get(ty)); - // vectorization - vector_size_ = 1; - if(ty->isVectorTy()) - vector_size_ = ty->getVectorNumElements(); + values_.push_back(UndefValue::get(ty_)); } +/* Serialized distributed tile */ +void serialized_distributed_tile::set_value(indices_t idx, Value *v) { + values_[indices_[idx]] = v; +} + +void serialized_distributed_tile::get_value(indices_t idx) { + return values_[indices_[idx]]; +} + +void serialized_distributed_tile::for_each(std::function fn) { + for(auto &idx: indices_) + fn(idx.first); +} + +/* Vectorized distributed tile */ +llvm::Type *vectorized_distributed_tile::make_vector_ty(llvm::Type *ty, size_t vector_size) { + if(vector_size == 1) + return ty; + return VectorType::get(ty, vector_size); +} + +vectorized_distributed_tile::vectorized_distributed_tile(Type *ty, const shapes_t &shapes, const axes_t &axes, llvm::IRBuilder<> &builder) + : distributed_tile(make_vector_ty(ty, axes[0].contiguous), shapes), axes_(axes), builder_(builder) { + vector_size_ = 1; + if(ty_->isVectorTy()) + vector_size_ = ty_->getVectorNumElements(); +} + + void distributed_tile::set_value(indices_t idx, Value *v) { unsigned value_idx = indices_[idx]; Value *&result = values_[value_idx/vector_size_*vector_size_]; @@ -54,6 +80,7 @@ void distributed_tile::set_value(indices_t idx, Value *v) { } // insert scalar in vector else { + std::cout << v->getType()->getScalarType()->getTypeID() << " " << result->getType()->getScalarType()->getTypeID() << std::endl; assert(vector_size_==1 || result->getType()->isVectorTy()); assert(v->getType()->getScalarType() == result->getType()->getScalarType()); result = builder_.CreateInsertElement(result, v, value_idx % vector_size_); @@ -63,7 +90,7 @@ void distributed_tile::set_value(indices_t idx, Value *v) { Value* distributed_tile::get_value(indices_t idx) { unsigned value_idx = indices_[idx]; Value *&result = values_[value_idx/vector_size_*vector_size_]; - if(vectorized_ || vector_size_ == 1) { + if(vectorize_ || vector_size_ == 1) { assert(value_idx % vector_size_ == 0); return result; } @@ -77,7 +104,7 @@ Value* distributed_tile::get_value(indices_t idx) { void distributed_tile::for_each(std::function fn) { for(auto &idx: indices_) { - if(!vectorized_ || (idx.second % vector_size_ == 0)) + if(!vectorize_ || (idx.second % vector_size_ == 0)) fn(idx.first); } } @@ -142,6 +169,8 @@ shared_tile::shared_tile(Type *ty, const shapes_t &shapes, Value *ptr, llvm::IRB void shared_tile::set_value(indices_t idx, Value *value) { Value *ptr = builder_.CreateGEP(ptr_, shared_offset(idx)); + unsigned addr_space = ptr->getType()->getPointerAddressSpace(); + ptr = builder_.CreateBitCast(ptr, value->getType()->getPointerTo(addr_space)); builder_.CreateStore(value, ptr); } @@ -159,13 +188,6 @@ Value* shared_tile::get_value(indices_t idx) { return builder_.CreateLoad(ptr); } -/* helper to make vector type */ -llvm::Type *selection::make_vector_ty(llvm::Type *ty, size_t vector_size) { - if(vector_size == 1) - return ty; - return VectorType::get(ty, vector_size); -} - /* convert ir::type to Type */ Type *selection::llvm_type(ir::type *ty, LLVMContext &ctx) { // function @@ -422,16 +444,15 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, axes[d].values = {builder.getInt32(0)}; } } - distributed_tile *T = new distributed_tile(make_vector_ty(ty, axes[0].contiguous), shapes, axes, builder); + bool vectorize = 
dynamic_cast(v); + distributed_tile *T = new distributed_tile(ty, shapes, axes, builder, vectorize); tmap_.insert({v, T}); // constant range if(dynamic_cast(v)){ - T->unset_vectorized_iteration(); T->for_each([&](indices_t idx){ assert(idx.size() == 1); T->set_value(idx, idx[0]); }); - T->set_vectorized_iteration(); } } @@ -498,7 +519,6 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & else if(dynamic_cast(ins)) { ir::value* in = ins->get_operand(0); distributed_tile *in_tile = (distributed_tile*)tmap_.at(in); - in_tile->unset_vectorized_iteration(); result->for_each([&](indices_t out_idx){ indices_t in_idx; for(size_t k = 0; k < shapes.size(); k++){ @@ -507,7 +527,6 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & } result->set_value(out_idx, in_tile->get_value(in_idx)); }); - in_tile->set_vectorized_iteration(); } // splat else if(dynamic_cast(ins)) { From b2e487491f7cdc64a3c67fb028c36432b3c2fa94 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 10 Feb 2019 21:59:41 -0500 Subject: [PATCH 071/494] [code generation] now vectorizing shared memory stores --- examples/matrix.cpp | 3 ++ include/codegen/selection.h | 35 ++++---------- include/codegen/tune.h | 1 + include/ir/builder.h | 1 + include/ir/instructions.h | 6 +++ lib/codegen/selection.cpp | 95 ++++++++++++++----------------------- lib/ir/builder.cpp | 4 ++ lib/ir/instructions.cpp | 4 ++ 8 files changed, 64 insertions(+), 85 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 9f9e2a724..7af5c0fb9 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -9,6 +9,7 @@ #include "codegen/shared_copy.h" #include "codegen/allocation.h" #include "codegen/liveness.h" +#include "codegen/vectorize.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/Module.h" #include "llvm/IR/LLVMContext.h" @@ -160,6 +161,7 @@ int main() { tdl::codegen::tune tune; tdl::codegen::liveness liveness; tdl::codegen::allocation allocation(&liveness); + tdl::codegen::vectorize vectorize(&tune); tdl::codegen::selection selection(&allocation, &tune); // tuning parameters @@ -194,6 +196,7 @@ int main() { shared.run(module); liveness.run(module); allocation.run(); + vectorize.run(module); selection.run(module, llvm_module); // llvm source diff --git a/include/codegen/selection.h b/include/codegen/selection.h index 729c36adb..18b77f42c 100644 --- a/include/codegen/selection.h +++ b/include/codegen/selection.h @@ -68,42 +68,25 @@ class distributed_tile: public tile{ private: void init_indices(); - -public: - distributed_tile(llvm::Type *ty, const shapes_t& shapes, const axes_t &axes); - virtual void for_each(std::function fn) = 0; - -protected: - axes_t axes_; - indices_map_t indices_; - values_t values_; -}; - -class serialized_distributed_tile: public distributed_tile { -public: - using distributed_tile::distributed_tile; - -public: - void set_value(indices_t, llvm::Value *); - llvm::Value* get_value(indices_t idx); - void for_each(std::function fn); -}; - -class vectorized_distributed_tile: public distributed_tile { -private: llvm::Type *make_vector_ty(llvm::Type *ty, size_t vector_size); public: - vectorized_distributed_tile(llvm::Type *ty, const shapes_t& shapes, const axes_t &axes, llvm::IRBuilder<> &builder); - void set_value(indices_t, llvm::Value *); + distributed_tile(llvm::Type *ty, const shapes_t& shapes, const axes_t &axes, llvm::IRBuilder<> &builder, bool vectorize); + void set_value(indices_t idx, llvm::Value *v); llvm::Value* get_value(indices_t idx); 
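// [editorial note, not part of the patch] get_linear_index -- added on the
// next line -- exposes an element's flat position within the per-thread
// value array so that lowering can regroup scalars into packets of
// vector_size lanes. The vectorize lowering in selection.cpp (later in this
// patch) consumes it essentially as follows:
//
//   in->for_each([&](indices_t idx){
//     unsigned linear = in->get_linear_index(idx);
//     unsigned id = linear / vector_size;            // packet number
//     packets[id] = builder.CreateInsertElement(     // fill one lane
//         packets[id], in->get_value(idx), linear % vector_size);
//   });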
+ unsigned get_linear_index(indices_t idx); void for_each(std::function fn); + const distributed_axis &axis(unsigned dim) { return axes_.at(dim); } private: - llvm::IRBuilder<> &builder_; + axes_t axes_; + indices_map_t indices_; + values_t values_; size_t vector_size_; + llvm::IRBuilder<> &builder_; }; + class selection{ typedef std::map vmap_t; typedef std::map tmap_t; diff --git a/include/codegen/tune.h b/include/codegen/tune.h index d1fc67549..dfa1fcc97 100644 --- a/include/codegen/tune.h +++ b/include/codegen/tune.h @@ -32,6 +32,7 @@ public: std::vector get_params(ir::module& mod); std::map get_params(ir::instruction* i); unsigned *get_param(ir::value *value, const std::string &key) { return params_[value][key]; } + void copy(ir::value *dst, ir::value *src) { params_[dst] = params_[src]; } bool check_constraints(ir::module &fn, std::map> &errors); void run(ir::module &mod); diff --git a/include/ir/builder.h b/include/ir/builder.h index 438390940..f0b0d1aa3 100644 --- a/include/ir/builder.h +++ b/include/ir/builder.h @@ -118,6 +118,7 @@ public: value *create_matmul(value *A, value *B, value *C, const std::string &name = ""); // Intrinsics value *create_copy_to_shared(value *arg, const std::string &name = ""); + value *create_vectorize(value *arg, const std::string &name = ""); private: context &ctx_; basic_block *block_; diff --git a/include/ir/instructions.h b/include/ir/instructions.h index cc694fd7b..0745f62c0 100644 --- a/include/ir/instructions.h +++ b/include/ir/instructions.h @@ -397,6 +397,12 @@ public: instruction *next = nullptr); }; +class vectorize_inst: public unary_inst{ + using unary_inst::unary_inst; + +public: + static vectorize_inst* create(value *arg, const std::string &name = "", instruction *next = nullptr); +}; } } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 2510f89e2..d6a4a0bc5 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -34,79 +34,36 @@ void distributed_tile::init_indices() { } } - -distributed_tile::distributed_tile(Type *ty, const shapes_t &shapes, const axes_t &axes) - : tile(ty, shapes), axes_(axes) { - init_indices(); - for(size_t i = 0; i < indices_.size(); i++) - values_.push_back(UndefValue::get(ty_)); -} - -/* Serialized distributed tile */ -void serialized_distributed_tile::set_value(indices_t idx, Value *v) { - values_[indices_[idx]] = v; -} - -void serialized_distributed_tile::get_value(indices_t idx) { - return values_[indices_[idx]]; -} - -void serialized_distributed_tile::for_each(std::function fn) { - for(auto &idx: indices_) - fn(idx.first); -} - -/* Vectorized distributed tile */ -llvm::Type *vectorized_distributed_tile::make_vector_ty(llvm::Type *ty, size_t vector_size) { +llvm::Type *distributed_tile::make_vector_ty(llvm::Type *ty, size_t vector_size) { if(vector_size == 1) return ty; return VectorType::get(ty, vector_size); } -vectorized_distributed_tile::vectorized_distributed_tile(Type *ty, const shapes_t &shapes, const axes_t &axes, llvm::IRBuilder<> &builder) - : distributed_tile(make_vector_ty(ty, axes[0].contiguous), shapes), axes_(axes), builder_(builder) { - vector_size_ = 1; - if(ty_->isVectorTy()) - vector_size_ = ty_->getVectorNumElements(); +distributed_tile::distributed_tile(Type *ty, const shapes_t &shapes, const axes_t &axes, llvm::IRBuilder<> &builder, bool vectorize) + : tile(make_vector_ty(ty, vectorize?axes[0].contiguous:1), shapes), axes_(axes), builder_(builder) { + vector_size_ = vectorize?ty_->getVectorNumElements():1; + init_indices(); + for(size_t i = 0; i < 
indices_.size(); i++) + values_.push_back(UndefValue::get(ty_)); } - void distributed_tile::set_value(indices_t idx, Value *v) { - unsigned value_idx = indices_[idx]; - Value *&result = values_[value_idx/vector_size_*vector_size_]; - if(v->getType() == result->getType()) { - assert(value_idx % vector_size_ == 0); - result = v; - } - // insert scalar in vector - else { - std::cout << v->getType()->getScalarType()->getTypeID() << " " << result->getType()->getScalarType()->getTypeID() << std::endl; - assert(vector_size_==1 || result->getType()->isVectorTy()); - assert(v->getType()->getScalarType() == result->getType()->getScalarType()); - result = builder_.CreateInsertElement(result, v, value_idx % vector_size_); - } + values_[indices_[idx]] = v; } Value* distributed_tile::get_value(indices_t idx) { - unsigned value_idx = indices_[idx]; - Value *&result = values_[value_idx/vector_size_*vector_size_]; - if(vectorize_ || vector_size_ == 1) { - assert(value_idx % vector_size_ == 0); - return result; - } - // extract scalar from vector - else { - assert(result->getType()->isVectorTy()); - return builder_.CreateExtractElement(result, value_idx % vector_size_); - } - return result; + return values_[indices_[idx]]; +} + +unsigned distributed_tile::get_linear_index(indices_t idx) { + return indices_[idx]; } void distributed_tile::for_each(std::function fn) { - for(auto &idx: indices_) { - if(!vectorize_ || (idx.second % vector_size_ == 0)) + for(auto &idx: indices_) + if(idx.second % vector_size_ == 0) fn(idx.first); - } } /* Shared Tile */ @@ -444,7 +401,7 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, axes[d].values = {builder.getInt32(0)}; } } - bool vectorize = dynamic_cast(v); + bool vectorize = dynamic_cast(v); distributed_tile *T = new distributed_tile(ty, shapes, axes, builder, vectorize); tmap_.insert({v, T}); // constant range @@ -548,6 +505,26 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & result->set_value(out_idx, in_tile->get_value(in_idx)); }); } + // vectorize + else if(dynamic_cast(ins)) { + distributed_tile* in = (distributed_tile*)tmap_.at(ins->get_operand(0)); + unsigned vector_size = result->axis(0).contiguous; + std::map packets; + in->for_each([&](indices_t idx){ + unsigned linear = in->get_linear_index(idx); + unsigned id = linear / vector_size; + if(linear % vector_size == 0) + packets[id] = result->get_value(idx); + packets[id] = builder.CreateInsertElement(packets[id], in->get_value(idx), linear % vector_size); + std::cout << linear << std::endl; + }); + result->for_each([&](indices_t idx){ + unsigned linear = in->get_linear_index(idx); + unsigned id = linear / vector_size; + if(linear % vector_size == 0) + result->set_value(idx, packets[id]); + }); + } // copy to shared else if(dynamic_cast(ins)) { distributed_tile* in = (distributed_tile*)tmap_.at(ins->get_operand(0)); diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index 848f668b7..7422a47fc 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -277,5 +277,9 @@ value *builder::create_copy_to_shared(value *arg, const std::string &name) { return insert(copy_to_shared_inst::create(arg, name)); } +value *builder::create_vectorize(value *arg, const std::string &name) { + return insert(vectorize_inst::create(arg, name)); +} + } } diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 3cbabd45e..522f4c029 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -399,5 +399,9 @@ copy_to_shared_inst* copy_to_shared_inst::create(value 
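// ----------------------------------------------------------------------------
// Illustrative sketch, not part of the patch series: the vectorize lowering
// above builds "packets" out of scalar tile elements. The scalar with linear
// index i lands in vector i / vector_size at lane i % vector_size, and
// for_each() then visits only the aligned indices. A self-contained variant of
// that packing against the 2019-era LLVM C++ API (function and variable names
// here are assumptions for illustration):
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include <vector>
static std::vector<llvm::Value*> pack_into_vectors(llvm::IRBuilder<> &b,
                                                   const std::vector<llvm::Value*> &scalars,
                                                   unsigned vector_size) {
  std::vector<llvm::Value*> packets;
  for (unsigned i = 0; i < scalars.size(); i++) {
    if (i % vector_size == 0) // start a fresh packet at every aligned index
      packets.push_back(llvm::UndefValue::get(
          llvm::VectorType::get(scalars[i]->getType(), vector_size)));
    packets.back() = b.CreateInsertElement(packets.back(), scalars[i],
                                           i % vector_size);
  }
  return packets;
}
// ----------------------------------------------------------------------------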
*arg, const std::string & return new copy_to_shared_inst(arg->get_type(), arg, name, next); } +vectorize_inst* vectorize_inst::create(value *arg, const std::string &name, instruction *next) { + return new vectorize_inst(arg->get_type(), arg, name, next); +} + } } From f8e522ada89bef53ef8707568a306845aea3f354 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 11 Feb 2019 17:27:16 -0500 Subject: [PATCH 072/494] [code generation] pipelined tile loads across loop iterations; phi-node support for shared tiles --- examples/matrix.cpp | 16 +++--- include/codegen/selection.h | 2 + include/codegen/shared_copy.h | 5 +++ lib/codegen/selection.cpp | 73 ++++++++++++++++++++++++++++------- lib/codegen/shared_copy.cpp | 28 +++++++++----- lib/ir/module.cpp | 3 ++ 6 files changed, 98 insertions(+), 29 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 7af5c0fb9..8360fcf3c 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -1,6 +1,7 @@ #include #include #include "cuda.h" +#include "llvm/IR/Verifier.h" #include "ast/ast.h" #include "ir/context.h" #include "ir/module.h" @@ -22,6 +23,7 @@ #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/Transforms/Scalar/EarlyCSE.h" +#include "llvm/Analysis/LoopPass.h" typedef struct yy_buffer_state * YY_BUFFER_STATE; extern int yyparse(); @@ -44,12 +46,14 @@ void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K){\ fp32* pa[16, 8] = a + rxa[:, newaxis] + rka[newaxis, :]*M;\ fp32* pb[16, 8] = b + ryb[:, newaxis] + rkb[newaxis, :]*K;\ fp32* pc[16, 16] = c + rxc[:, newaxis] + ryc[newaxis, :]*M;\ + fp32 a[16, 8] = *pa;\ + fp32 b[16, 8] = *pb;\ for(k = K; k > 0; k = k - 8){\ - fp32 a[16, 8] = *pa;\ - fp32 b[16, 8] = *pb;\ C = dot(a, b, C);\ pa = pa + 8*M;\ pb = pb + 8*K;\ + a = *pa;\ + b = *pb;\ }\ *pc = C;\ }\ @@ -200,11 +204,11 @@ int main() { selection.run(module, llvm_module); // llvm source - llvm::PrintModulePass print(llvm::outs()); - llvm::AnalysisManager analysis; - print.run(llvm_module, analysis); + llvm::legacy::PassManager manager; + manager.add(llvm::createPrintModulePass(llvm::outs())); +// manager.add(llvm::createVerifierPass(true)); + manager.run(llvm_module); - // generate machine code std::string src = generate_machine_code(llvm_module, "nvptx64-nvidia-cuda", compute_data_layout(true, true)); // std::cout << src << std::endl; diff --git a/include/codegen/selection.h b/include/codegen/selection.h index 18b77f42c..4dedbd088 100644 --- a/include/codegen/selection.h +++ b/include/codegen/selection.h @@ -54,6 +54,7 @@ public: shared_tile(llvm::Type* ty, const shapes_t &shapes, llvm::Value* ptr, llvm::IRBuilder<> &builder); void set_value(indices_t, llvm::Value *); llvm::Value* get_value(indices_t idx); + llvm::Value* get_pointer() { return ptr_; } private: llvm::Value *ptr_; @@ -102,6 +103,7 @@ private: llvm::Constant* llvm_constant(ir::constant *cst, llvm::LLVMContext &ctx); // grid construction + bool is_shared(ir::value *v); void create_grids(std::vector &grids, std::map &references, ir::function *fn); diff --git a/include/codegen/shared_copy.h b/include/codegen/shared_copy.h index 46cd8cbc8..8512ec358 100644 --- a/include/codegen/shared_copy.h +++ b/include/codegen/shared_copy.h @@ -5,11 +5,16 @@ namespace tdl { namespace ir { class module; + class value; + class builder; } namespace codegen{ class place_shared_copy { +private: + void add(ir::value *x, ir::builder &builder); + public: void run(ir::module &mod); }; diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index d6a4a0bc5..5782212ad 100644 --- a/lib/codegen/selection.cpp +++ 
b/lib/codegen/selection.cpp @@ -8,6 +8,7 @@ #include "ir/function.h" #include "ir/type.h" #include "llvm/Transforms/Scalar/EarlyCSE.h" +#include "llvm/Analysis/LoopInfo.h" namespace tdl{ namespace codegen{ @@ -61,6 +62,7 @@ unsigned distributed_tile::get_linear_index(indices_t idx) { } void distributed_tile::for_each(std::function fn) { + std::cout << "vector size: " << vector_size_ << std::endl; for(auto &idx: indices_) if(idx.second % vector_size_ == 0) fn(idx.first); @@ -345,8 +347,7 @@ void selection::create_grids(std::vector &grids, bind_references(op); // bind const auto& shapes = v->get_type()->get_tile_shapes(); - bool is_shared = dynamic_cast(v); - if(is_shared) + if(is_shared(v)) return; for(size_t d = 0; d < shapes.size(); d++){ if(shapes[d] == 1) @@ -368,6 +369,18 @@ void selection::create_grids(std::vector &grids, grids.push_back(ref.second); } +bool selection::is_shared(ir::value *v) { + if(auto *phi = dynamic_cast(v)){ + bool result = true; + for(ir::value *op: phi->ops()) + result = result && is_shared(op); + return result; + } + else + return (bool)dynamic_cast(v); + +} + void selection::create_tile(ir::value *v, IRBuilder<> &builder, const std::map& references, std::set &seen, Value *sh_mem_ptr) { @@ -380,12 +393,33 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, const auto& shapes = v->get_type()->get_tile_shapes(); Type* ty = llvm_type(v->get_type()->get_scalar_ty(), ctx); // create shared tile - bool is_shared = dynamic_cast(v); - if(is_shared){ - size_t offset = alloc_->get_offset(v); - Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset)); - ptr = builder.CreateBitCast(ptr, ty->getPointerTo(ptr->getType()->getPointerAddressSpace())); - tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)}); + if(is_shared(v)){ + // shared copy + PointerType *ptr_ty = ty->getPointerTo(sh_mem_ptr->getType()->getPointerAddressSpace()); + if(dynamic_cast(v)) { + size_t offset = alloc_->get_offset(v); + Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset)); + ptr = builder.CreateBitCast(ptr, ptr_ty); + tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)}); + } + // phi-node (double-buffering) + else if(auto *phi = dynamic_cast(v)) { + BasicBlock *parent = (BasicBlock*)vmap_[phi->get_parent()]; + builder.SetInsertPoint(&*parent->getFirstInsertionPt()); + PHINode *ptr = builder.CreatePHI(ptr_ty, 2); + for(ir::value *op: phi->ops()){ + ir::instruction *inc_val = dynamic_cast(op); + BasicBlock *inc_block = (BasicBlock*)vmap_[inc_val->get_parent()]; + size_t offset = alloc_->get_offset(inc_val); + builder.SetInsertPoint(inc_block); + Value *inc_ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(0)); + inc_ptr = builder.CreateBitCast(inc_ptr, ptr_ty); + ptr->addIncoming(inc_ptr, inc_block); + } + tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)}); + } + else + throw std::runtime_error("unknown shared memory tile"); } // create distributed tile else { @@ -532,6 +566,8 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & ti->set_value(idx, in->get_value(idx)); }); } + else if(is_shared(ins)) + return; // matrix multiplication else if(dynamic_cast(ins)) { ir::value *A = ins->get_operand(0); @@ -607,14 +643,10 @@ void selection::run(ir::module &src, Module &dst){ dst_builder.SetInsertPoint((BasicBlock*)vmap_[fn->blocks()[0]]); // allocate shared memory Value *sh_mem_ptr = nullptr; - if(unsigned alloc_size = alloc_->get_allocated_size()){ + if(alloc_->get_allocated_size()){ Type *int_8_ty = 
Type::getInt8Ty(dst_ctx); - ArrayType *array_ty = ArrayType::get(int_8_ty, alloc_size); Type *ptr_ty = PointerType::get(int_8_ty, 3); - GlobalVariable *sh_mem_array = - new GlobalVariable(*dst_fn->getParent(), array_ty, false, GlobalVariable::InternalLinkage, - nullptr, "__shared_ptr", nullptr, GlobalVariable::NotThreadLocal, 3); - sh_mem_ptr = dst_builder.CreateBitCast(sh_mem_array, ptr_ty); + sh_mem_ptr = Constant::getNullValue(ptr_ty); } // create grids init_grids(fn, dst_builder, sh_mem_ptr); @@ -628,6 +660,19 @@ void selection::run(ir::module &src, Module &dst){ for(ir::basic_block *block: fn->blocks()) for(ir::instruction *inst: block->get_inst_list()) if(auto *phi = dynamic_cast(inst)){ + if(is_shared(phi)){ +// PHINode *ptr = (PHINode*)(((shared_tile*)tmap_.at(phi))->get_pointer()); +// for(ir::value *op: phi->ops()){ +// ir::instruction *inc_val = dynamic_cast(op); +// BasicBlock *inc_block = (BasicBlock*)vmap_[inc_val->get_parent()]; +// size_t offset = alloc_->get_offset(inc_val); +// dst_builder.SetInsertPoint(inc_block); +// Value *inc_ptr = dst_builder.CreateGEP(sh_mem_ptr, dst_builder.getInt32(offset)); +// inc_ptr = dst_builder.CreateBitCast(inc_ptr, ptr->getType()); +// ptr->addIncoming(inc_ptr, inc_block); +// } + continue; + } for(unsigned n = 0; n < phi->get_num_incoming(); n++){ ir::value *inc_val = phi->get_incoming_value(n); ir::basic_block *inc_block = phi->get_incoming_block(n); diff --git a/lib/codegen/shared_copy.cpp b/lib/codegen/shared_copy.cpp index a6c64e08d..08bac4f9a 100644 --- a/lib/codegen/shared_copy.cpp +++ b/lib/codegen/shared_copy.cpp @@ -8,21 +8,31 @@ namespace tdl { namespace codegen{ +void place_shared_copy::add(ir::value *x, ir::builder &builder) { + if(auto *phi = dynamic_cast(x)) { + for(auto *op: phi->ops()) + add(op, builder); + } + else { + if(auto *i = dynamic_cast(x)){ + ir::basic_block* block = i->get_parent(); + auto it = std::find(block->begin(), block->end(), i); + builder.set_insert_point(++it); + } + ir::instruction *rx = (ir::instruction*)builder.create_copy_to_shared(x); + x->replace_all_uses_with(rx); + rx->set_operand(0, x); + } +} + void place_shared_copy::run(ir::module &mod) { ir::builder &builder = mod.get_builder(); for(ir::function *fn: mod.get_function_list()) for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i: block->get_inst_list()) if(dynamic_cast(i)){ - builder.set_insert_point(i); - ir::value *x = i->get_operand(0); - ir::value *y = i->get_operand(1); - ir::instruction *rx = (ir::instruction*)builder.create_copy_to_shared(x); - ir::instruction *ry = (ir::instruction*)builder.create_copy_to_shared(y); - x->replace_all_uses_with(rx); - y->replace_all_uses_with(ry); - rx->set_operand(0, x); - ry->set_operand(0, y); + add(i->get_operand(0), builder); + add(i->get_operand(1), builder); } } diff --git a/lib/ir/module.cpp b/lib/ir/module.cpp index d95e21c3b..29636657f 100644 --- a/lib/ir/module.cpp +++ b/lib/ir/module.cpp @@ -61,6 +61,9 @@ ir::value *module::try_remove_trivial_phis(ir::phi_node *&phi){ } ir::value *module::add_phi_operands(const std::string& name, ir::phi_node *&phi){ + // already initialized + if(phi->get_num_operands()) + return phi; ir::basic_block *block = phi->get_parent(); for(ir::basic_block *pred: block->get_predecessors()){ ir::value *value = get_value(name, pred); From e45d6bbb60ea433635522d6e6961bafec78e8b18 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 12 Feb 2019 11:00:24 -0500 Subject: [PATCH 073/494] some cleaning --- examples/matrix.cpp | 14 +++--- 
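// ----------------------------------------------------------------------------
// Illustrative sketch, not part of the patch series: the place_shared_copy
// rewrite above recurses through phi operands so that each incoming value gets
// its own shared copy; for a non-phi value it uses the classic
// replace-all-uses dance: redirect every user of x to the new copy first, then
// point the copy's operand back at x so the copy does not read from itself.
// A minimal form using only ir:: calls that appear in these patches:
#include "ir/builder.h"
#include "ir/instructions.h"
static tdl::ir::instruction *route_through_shared_copy(tdl::ir::value *x,
                                                       tdl::ir::builder &builder) {
  tdl::ir::instruction *cp =
      (tdl::ir::instruction*)builder.create_copy_to_shared(x);
  x->replace_all_uses_with(cp); // every user of x now reads the shared copy...
  cp->set_operand(0, x);        // ...except the copy itself, which reads x
  return cp;
}
// ----------------------------------------------------------------------------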
include/codegen/allocation.h | 7 +-- include/codegen/liveness.h | 7 ++- include/codegen/selection.h | 7 ++- lib/codegen/allocation.cpp | 6 ++- lib/codegen/liveness.cpp | 3 +- lib/codegen/selection.cpp | 82 ++++++++++++++++++------------------ 7 files changed, 72 insertions(+), 54 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 8360fcf3c..9fdbcf96a 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -11,6 +11,7 @@ #include "codegen/allocation.h" #include "codegen/liveness.h" #include "codegen/vectorize.h" +#include "codegen/buffer_info.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/Module.h" #include "llvm/IR/LLVMContext.h" @@ -162,11 +163,12 @@ int main() { // create passes tdl::codegen::place_shared_copy shared; + tdl::codegen::buffer_info_pass buffer_info; tdl::codegen::tune tune; - tdl::codegen::liveness liveness; - tdl::codegen::allocation allocation(&liveness); + tdl::codegen::liveness liveness(&buffer_info); + tdl::codegen::allocation allocation(&liveness, &buffer_info); tdl::codegen::vectorize vectorize(&tune); - tdl::codegen::selection selection(&allocation, &tune); + tdl::codegen::selection selection(&allocation, &tune, &buffer_info); // tuning parameters tune.run(module); @@ -186,7 +188,6 @@ int main() { }; std::map> errors; unsigned i = 0; - std::cout << tune.get_params(module).size() << std::endl; for(unsigned *x: tune.get_params(module)) *x = params[i++]; tune.check_constraints(module, errors); @@ -198,6 +199,7 @@ int main() { // run passes shared.run(module); + buffer_info.run(module); liveness.run(module); allocation.run(); vectorize.run(module); @@ -206,7 +208,7 @@ int main() { // llvm source llvm::legacy::PassManager manager; manager.add(llvm::createPrintModulePass(llvm::outs())); -// manager.add(llvm::createVerifierPass(true)); + manager.add(llvm::createVerifierPass(true)); manager.run(llvm_module); std::string src = generate_machine_code(llvm_module, "nvptx64-nvidia-cuda", compute_data_layout(true, true)); @@ -220,7 +222,7 @@ int main() { CUstream cu_stream; int major, minor; compile_machine_code(cu_device, cu_context, cu_module, cu_kernel, cu_stream, major, minor, src, "test"); - std::cout << src << std::endl; +// std::cout << src << std::endl; // execute machine code // Allocate buffers diff --git a/include/codegen/allocation.h b/include/codegen/allocation.h index 4b90cf46a..96366b526 100644 --- a/include/codegen/allocation.h +++ b/include/codegen/allocation.h @@ -16,12 +16,12 @@ namespace codegen{ class layout; class target_tuner; class liveness; -class loop_info; +class buffer_info_pass; class allocation { public: - allocation(liveness *live) - : liveness_(live){ } + allocation(liveness *live, buffer_info_pass *buffer_info) + : liveness_(live), buffer_info_(buffer_info){ } // accessors unsigned get_offset(ir::value *x) const { return offsets_.at(x); } @@ -36,6 +36,7 @@ private: size_t allocated_size_; // dependences liveness *liveness_; + buffer_info_pass *buffer_info_; }; } diff --git a/include/codegen/liveness.h b/include/codegen/liveness.h index 11d377c62..fd4faf2f3 100644 --- a/include/codegen/liveness.h +++ b/include/codegen/liveness.h @@ -15,6 +15,8 @@ namespace codegen{ typedef unsigned slot_index; +class buffer_info_pass; + struct segment { slot_index start; slot_index end; @@ -35,11 +37,13 @@ private: typedef std::map has_storage_map_t; public: - /// Intervals iterators... 
+ // Intervals iterators using iterator = intervals_map_t::iterator; using const_iterator = intervals_map_t::const_iterator; public: + // constructor + liveness(buffer_info_pass *info): info_(info){ } // accessors const intervals_map_t& intervals() const { return intervals_; } @@ -49,6 +53,7 @@ public: void run(ir::module &mod); private: + buffer_info_pass *info_; has_storage_map_t has_dedicated_storage_; indices_map_t indices_; intervals_map_t intervals_; diff --git a/include/codegen/selection.h b/include/codegen/selection.h index 4dedbd088..6580ade98 100644 --- a/include/codegen/selection.h +++ b/include/codegen/selection.h @@ -7,6 +7,7 @@ #include "ir/module.h" #include "ir/function.h" #include "ir/type.h" +#include "codegen/buffer_info.h" namespace llvm{ @@ -22,6 +23,8 @@ namespace codegen{ class allocation; class tune; +class buffer_info_pass; + typedef std::vector indices_t; struct distributed_axis { @@ -103,7 +106,6 @@ private: llvm::Constant* llvm_constant(ir::constant *cst, llvm::LLVMContext &ctx); // grid construction - bool is_shared(ir::value *v); void create_grids(std::vector &grids, std::map &references, ir::function *fn); @@ -116,7 +118,7 @@ private: void lower_tile_instruction(ir::instruction *src, llvm::IRBuilder<> &builder); public: - selection(allocation *alloc, tune *params): alloc_(alloc), params_(params){ } + selection(allocation *alloc, tune *params, buffer_info_pass *buffer_info): alloc_(alloc), params_(params), buffer_info_(buffer_info){ } void run(ir::module &src, llvm::Module &dst); private: @@ -124,6 +126,7 @@ private: tmap_t tmap_; allocation *alloc_; tune *params_; + buffer_info_pass *buffer_info_; std::map axes_; }; diff --git a/lib/codegen/allocation.cpp b/lib/codegen/allocation.cpp index 7a5154280..74c9f4c58 100644 --- a/lib/codegen/allocation.cpp +++ b/lib/codegen/allocation.cpp @@ -1,6 +1,7 @@ #include "codegen/allocation.h" #include "codegen/liveness.h" #include "codegen/layout.h" +#include "codegen/buffer_info.h" #include "ir/basic_block.h" #include "ir/type.h" #include "ir/value.h" @@ -16,7 +17,10 @@ void allocation::run(){ typedef std::multimap triples_map_type; auto get_num_bytes = [&](ir::value *x){ - return x->get_type()->get_tile_bitwidth(); + unsigned result = x->get_type()->get_tile_bitwidth(); + if(buffer_info_->is_double(x)) + result *= 2; + return result; }; std::vector I; diff --git a/lib/codegen/liveness.cpp b/lib/codegen/liveness.cpp index bf4c99be2..05b803f8f 100644 --- a/lib/codegen/liveness.cpp +++ b/lib/codegen/liveness.cpp @@ -1,4 +1,5 @@ #include "codegen/liveness.h" +#include "codegen/buffer_info.h" #include "ir/basic_block.h" #include "ir/function.h" #include "ir/module.h" @@ -23,7 +24,7 @@ for(ir::function *fn: mod.get_function_list()){ // Creates live intervals for(auto i: indices_){ ir::value *v = i.first; - if(!dynamic_cast(v)) + if(!info_->is_shared(v) || info_->get_reference(v)) continue; unsigned start = i.second; unsigned end = start; diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 5782212ad..a743a5162 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -62,7 +62,6 @@ unsigned distributed_tile::get_linear_index(indices_t idx) { } void distributed_tile::for_each(std::function fn) { - std::cout << "vector size: " << vector_size_ << std::endl; for(auto &idx: indices_) if(idx.second % vector_size_ == 0) fn(idx.first); @@ -347,7 +346,7 @@ void selection::create_grids(std::vector &grids, bind_references(op); // bind const auto& shapes = v->get_type()->get_tile_shapes(); - 
if(is_shared(v)) + if(buffer_info_->is_shared(v)) return; for(size_t d = 0; d < shapes.size(); d++){ if(shapes[d] == 1) @@ -369,18 +368,6 @@ void selection::create_grids(std::vector &grids, grids.push_back(ref.second); } -bool selection::is_shared(ir::value *v) { - if(auto *phi = dynamic_cast(v)){ - bool result = true; - for(ir::value *op: phi->ops()) - result = result && is_shared(op); - return result; - } - else - return (bool)dynamic_cast(v); - -} - void selection::create_tile(ir::value *v, IRBuilder<> &builder, const std::map& references, std::set &seen, Value *sh_mem_ptr) { @@ -393,7 +380,7 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, const auto& shapes = v->get_type()->get_tile_shapes(); Type* ty = llvm_type(v->get_type()->get_scalar_ty(), ctx); // create shared tile - if(is_shared(v)){ + if(buffer_info_->is_shared(v)){ // shared copy PointerType *ptr_ty = ty->getPointerTo(sh_mem_ptr->getType()->getPointerAddressSpace()); if(dynamic_cast(v)) { @@ -405,17 +392,8 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, // phi-node (double-buffering) else if(auto *phi = dynamic_cast(v)) { BasicBlock *parent = (BasicBlock*)vmap_[phi->get_parent()]; - builder.SetInsertPoint(&*parent->getFirstInsertionPt()); + builder.SetInsertPoint(parent); PHINode *ptr = builder.CreatePHI(ptr_ty, 2); - for(ir::value *op: phi->ops()){ - ir::instruction *inc_val = dynamic_cast(op); - BasicBlock *inc_block = (BasicBlock*)vmap_[inc_val->get_parent()]; - size_t offset = alloc_->get_offset(inc_val); - builder.SetInsertPoint(inc_block); - Value *inc_ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(0)); - inc_ptr = builder.CreateBitCast(inc_ptr, ptr_ty); - ptr->addIncoming(inc_ptr, inc_block); - } tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)}); } else @@ -550,7 +528,6 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & if(linear % vector_size == 0) packets[id] = result->get_value(idx); packets[id] = builder.CreateInsertElement(packets[id], in->get_value(idx), linear % vector_size); - std::cout << linear << std::endl; }); result->for_each([&](indices_t idx){ unsigned linear = in->get_linear_index(idx); @@ -566,7 +543,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & ti->set_value(idx, in->get_value(idx)); }); } - else if(is_shared(ins)) + else if(buffer_info_->is_shared(ins)) return; // matrix multiplication else if(dynamic_cast(ins)) { @@ -643,10 +620,14 @@ void selection::run(ir::module &src, Module &dst){ dst_builder.SetInsertPoint((BasicBlock*)vmap_[fn->blocks()[0]]); // allocate shared memory Value *sh_mem_ptr = nullptr; - if(alloc_->get_allocated_size()){ + if(unsigned alloc_size = alloc_->get_allocated_size()){ Type *int_8_ty = Type::getInt8Ty(dst_ctx); + ArrayType *array_ty = ArrayType::get(int_8_ty, alloc_size); Type *ptr_ty = PointerType::get(int_8_ty, 3); - sh_mem_ptr = Constant::getNullValue(ptr_ty); + GlobalVariable *sh_mem_array = + new GlobalVariable(*dst_fn->getParent(), array_ty, false, GlobalVariable::ExternalLinkage, + nullptr, "__shared_ptr", nullptr, GlobalVariable::NotThreadLocal, 3); + sh_mem_ptr = dst_builder.CreateBitCast(sh_mem_array, ptr_ty); } // create grids init_grids(fn, dst_builder, sh_mem_ptr); @@ -660,17 +641,38 @@ void selection::run(ir::module &src, Module &dst){ for(ir::basic_block *block: fn->blocks()) for(ir::instruction *inst: block->get_inst_list()) if(auto *phi = dynamic_cast(inst)){ - if(is_shared(phi)){ -// PHINode *ptr = 
(PHINode*)(((shared_tile*)tmap_.at(phi))->get_pointer()); -// for(ir::value *op: phi->ops()){ -// ir::instruction *inc_val = dynamic_cast(op); -// BasicBlock *inc_block = (BasicBlock*)vmap_[inc_val->get_parent()]; -// size_t offset = alloc_->get_offset(inc_val); -// dst_builder.SetInsertPoint(inc_block); -// Value *inc_ptr = dst_builder.CreateGEP(sh_mem_ptr, dst_builder.getInt32(offset)); -// inc_ptr = dst_builder.CreateBitCast(inc_ptr, ptr->getType()); -// ptr->addIncoming(inc_ptr, inc_block); -// } + if(buffer_info_->is_shared(phi)) { + BasicBlock *parent = (BasicBlock*)vmap_[phi->get_parent()]; + unsigned id_pre = 0, id_loop = 1; + if(phi->get_incoming_block(0) == phi->get_parent()) + std::swap(id_pre, id_loop); + ir::value *pre_value = phi->get_incoming_value(id_pre); + ir::value *loop_value = phi->get_incoming_value(id_loop); + BasicBlock *pre_block = (BasicBlock*)vmap_[phi->get_incoming_block(id_pre)]; + BasicBlock *loop_block = (BasicBlock*)vmap_[phi->get_incoming_block(id_loop)]; + int pre_offset = alloc_->get_offset(pre_value); + int loop_offset = alloc_->get_offset(loop_value); + dst_builder.SetInsertPoint(&*parent->getFirstInsertionPt()); + PHINode *ptr = (PHINode*)(((shared_tile*)tmap_.at(phi))->get_pointer()); + // offset + PHINode *offset = dst_builder.CreatePHI(dst_builder.getInt32Ty(), 2); + dst_builder.SetInsertPoint(parent->getFirstNonPHI()); + Value *next_offset = dst_builder.CreateNeg(offset); + offset->addIncoming(dst_builder.getInt32((loop_offset - pre_offset)/4), pre_block); + offset->addIncoming(next_offset, loop_block); + // next pointer + Value *pre_ptr = dst_builder.CreateGEP(sh_mem_ptr, dst_builder.getInt32(pre_offset)); + pre_ptr = dst_builder.CreateBitCast(pre_ptr, ptr->getType()); + Value *next_ptr = dst_builder.CreateGEP(ptr, offset); + ptr->addIncoming(pre_ptr, pre_block); + ptr->addIncoming(next_ptr, loop_block); + // barrier + Function *barrier = Intrinsic::getDeclaration(dst_fn->getParent(), Intrinsic::nvvm_barrier0); + dst_builder.SetInsertPoint(pre_block->getTerminator()); + dst_builder.CreateCall(barrier, {}); + dst_builder.SetInsertPoint(loop_block->getTerminator()); + dst_builder.CreateCall(barrier, {}); + continue; } for(unsigned n = 0; n < phi->get_num_incoming(); n++){ From 41aad4800ca74af1623c4ef750538cd00545a279 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 12 Feb 2019 11:47:52 -0500 Subject: [PATCH 074/494] [code generation] added double-buffering --- examples/matrix.cpp | 4 +- include/codegen/allocation.h | 3 ++ include/codegen/buffer_info.h | 34 ++++++++++++++++ include/codegen/vectorize.h | 27 +++++++++++++ lib/codegen/allocation.cpp | 14 +++---- lib/codegen/buffer_info.cpp | 65 ++++++++++++++++++++++++++++++ lib/codegen/selection.cpp | 76 +++++++++++++++++------------------ lib/codegen/vectorize.cpp | 28 +++++++++++++ 8 files changed, 204 insertions(+), 47 deletions(-) create mode 100644 include/codegen/buffer_info.h create mode 100644 include/codegen/vectorize.h create mode 100644 lib/codegen/buffer_info.cpp create mode 100644 lib/codegen/vectorize.cpp diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 9fdbcf96a..31e4a173e 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -212,7 +212,7 @@ int main() { manager.run(llvm_module); std::string src = generate_machine_code(llvm_module, "nvptx64-nvidia-cuda", compute_data_layout(true, true)); -// std::cout << src << std::endl; + std::cout << src << std::endl; // compile machine code CUdevice cu_device; @@ -222,7 +222,7 @@ int main() { CUstream cu_stream; int 
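// ----------------------------------------------------------------------------
// Illustrative sketch, not part of the patch series: the double-buffering
// emitted above is a sign flip. The offset phi starts at
// (loop_offset - pre_offset)/4 (a byte distance converted to fp32 elements)
// and is negated on every back-edge, so the pointer phi ping-pongs between the
// two buffers. The arithmetic in miniature (buffer size and names made up):
#include <cstdio>
int main() {
  float shmem[512];
  float *pre  = shmem;      // buffer A
  int   delta = 256;        // element distance from buffer A to buffer B
  float *ptr  = pre;        // pointer phi, incoming value from the pre-header
  int   off   = delta;      // offset phi, incoming value from the pre-header
  for (int k = 0; k < 4; k++) {
    std::printf("iteration %d reads buffer %s\n", k, ptr == pre ? "A" : "B");
    ptr += off;             // next_ptr    = GEP(ptr, offset)
    off  = -off;            // next_offset = neg(offset)
  }
  return 0;                 // prints A, B, A, B
}
// ----------------------------------------------------------------------------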
major, minor; compile_machine_code(cu_device, cu_context, cu_module, cu_kernel, cu_stream, major, minor, src, "test"); -// std::cout << src << std::endl; + std::cout << src << std::endl; // execute machine code // Allocate buffers diff --git a/include/codegen/allocation.h b/include/codegen/allocation.h index 96366b526..5bd5e85a2 100644 --- a/include/codegen/allocation.h +++ b/include/codegen/allocation.h @@ -23,6 +23,9 @@ public: allocation(liveness *live, buffer_info_pass *buffer_info) : liveness_(live), buffer_info_(buffer_info){ } + // utilities + unsigned get_num_bytes(ir::value *x); + // accessors unsigned get_offset(ir::value *x) const { return offsets_.at(x); } unsigned get_allocated_size() const { return allocated_size_; } diff --git a/include/codegen/buffer_info.h b/include/codegen/buffer_info.h new file mode 100644 index 000000000..2cce9d829 --- /dev/null +++ b/include/codegen/buffer_info.h @@ -0,0 +1,34 @@ +#ifndef TDL_INCLUDE_CODEGEN_BUFFER_INFO_PASS_H +#define TDL_INCLUDE_CODEGEN_BUFFER_INFO_PASS_H + +#include +#include + +namespace tdl { + +namespace ir { + class module; + class value; +} + +namespace codegen{ + +class buffer_info_pass { +public: + void run(ir::module &mod); + // queries + bool is_double(ir::value *x); + bool is_shared(ir::value *x); + ir::value *get_reference(ir::value *x); + +private: + std::set shared_; + std::set double_; + std::map refs_; +}; + + +} +} + +#endif diff --git a/include/codegen/vectorize.h b/include/codegen/vectorize.h new file mode 100644 index 000000000..c9c28a79c --- /dev/null +++ b/include/codegen/vectorize.h @@ -0,0 +1,27 @@ +#ifndef TDL_INCLUDE_CODEGEN_VECTORIZE_H +#define TDL_INCLUDE_CODEGEN_VECTORIZE_H + +namespace tdl { + +namespace ir { + class module; +} + +namespace codegen{ + +class tune; + +class vectorize { +public: + vectorize(tune *params): params_(params){} + void run(ir::module &mod); + +private: + tune *params_; +}; + + +} +} + +#endif diff --git a/lib/codegen/allocation.cpp b/lib/codegen/allocation.cpp index 74c9f4c58..c4957e477 100644 --- a/lib/codegen/allocation.cpp +++ b/lib/codegen/allocation.cpp @@ -11,18 +11,18 @@ namespace tdl{ namespace codegen{ +unsigned allocation::get_num_bytes(ir::value *x) { + unsigned result = x->get_type()->get_tile_bitwidth(); + if(buffer_info_->is_double(x)) + result *= 2; + return result; +} + void allocation::run(){ using std::max; using std::min; typedef std::multimap triples_map_type; - auto get_num_bytes = [&](ir::value *x){ - unsigned result = x->get_type()->get_tile_bitwidth(); - if(buffer_info_->is_double(x)) - result *= 2; - return result; - }; - std::vector I; for(auto x: liveness_->intervals()) I.push_back(x.first); diff --git a/lib/codegen/buffer_info.cpp b/lib/codegen/buffer_info.cpp new file mode 100644 index 000000000..6be951a22 --- /dev/null +++ b/lib/codegen/buffer_info.cpp @@ -0,0 +1,65 @@ +#include "codegen/buffer_info.h" +#include "ir/module.h" +#include "ir/function.h" +#include "ir/basic_block.h" +#include "ir/instructions.h" +#include "ir/type.h" + +namespace tdl { + +namespace codegen{ + + +// run pass on module +void buffer_info_pass::run(ir::module &mod) { + for(ir::function *fn: mod.get_function_list()) + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *i: block->get_inst_list()) { + if(!i->get_type()->is_tile_ty()) + continue; + // handle phi + if(auto *phi = dynamic_cast(i)){ + // determine if the value is in shared memory + bool is_shared = true; + bool is_double = false; + for(unsigned n = 0; n < phi->get_num_incoming(); n++){ + ir::value 
*inc_val = phi->get_incoming_value(n); + ir::value *inc_block = phi->get_incoming_block(n); + is_shared = is_shared && dynamic_cast(inc_val); + is_double = is_double || inc_block == phi->get_parent(); + } + // add to shared + if(is_shared) + shared_.insert(phi); + // add to double-buffered + if(is_double) + double_.insert(phi); + // set references of input + for(unsigned n = 0; n < phi->get_num_incoming(); n++){ + ir::value *inc_val = phi->get_incoming_value(n); + assert(refs_[inc_val] == nullptr); + refs_[inc_val] = phi; + } + } + // handle shared copy + if(auto *copy = dynamic_cast(i)) + shared_.insert(copy); + } +} + +// query double-buffered status +bool buffer_info_pass::is_double(ir::value *x) +{ return double_.find(x) != double_.end(); } + +// query shared status +bool buffer_info_pass::is_shared(ir::value *x) +{ return shared_.find(x) != shared_.end(); } + +// get reference if any +ir::value *buffer_info_pass::get_reference(ir::value *x) +{ return refs_[x]; } + + + +} +} diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index a743a5162..b9857eb12 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -384,17 +384,42 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, // shared copy PointerType *ptr_ty = ty->getPointerTo(sh_mem_ptr->getType()->getPointerAddressSpace()); if(dynamic_cast(v)) { - size_t offset = alloc_->get_offset(v); - Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset)); - ptr = builder.CreateBitCast(ptr, ptr_ty); - tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)}); + if(buffer_info_->get_reference(v) == nullptr){ + size_t offset = alloc_->get_offset(v); + Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset)); + ptr = builder.CreateBitCast(ptr, ptr_ty); + tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)}); + } } // phi-node (double-buffering) else if(auto *phi = dynamic_cast(v)) { BasicBlock *parent = (BasicBlock*)vmap_[phi->get_parent()]; - builder.SetInsertPoint(parent); + unsigned id_pre = 0, id_loop = 1; + if(phi->get_incoming_block(0) == phi->get_parent()) + std::swap(id_pre, id_loop); + ir::value *pre_value = phi->get_incoming_value(id_pre); + ir::value *loop_value = phi->get_incoming_value(id_loop); + BasicBlock *pre_block = (BasicBlock*)vmap_[phi->get_incoming_block(id_pre)]; + BasicBlock *loop_block = (BasicBlock*)vmap_[phi->get_incoming_block(id_loop)]; + if(parent->empty()) + builder.SetInsertPoint(parent); + else + builder.SetInsertPoint(&*parent->getFirstInsertionPt()); PHINode *ptr = builder.CreatePHI(ptr_ty, 2); + // offset + PHINode *offset = builder.CreatePHI(builder.getInt32Ty(), 2); + Value *next_offset = builder.CreateNeg(offset); + offset->addIncoming(builder.getInt32(alloc_->get_num_bytes(phi) / 2 / 4), pre_block); + offset->addIncoming(next_offset, loop_block); + // next pointer + Value *pre_ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(alloc_->get_offset(phi))); + pre_ptr = builder.CreateBitCast(pre_ptr, ptr->getType()); + Value *next_ptr = builder.CreateGEP(ptr, offset); + ptr->addIncoming(pre_ptr, pre_block); + ptr->addIncoming(next_ptr, loop_block); tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)}); + tmap_.insert({pre_value, new shared_tile(ty, shapes, pre_ptr, builder)}); + tmap_.insert({loop_value, new shared_tile(ty, shapes, next_ptr, builder)}); } else throw std::runtime_error("unknown shared memory tile"); @@ -633,46 +658,21 @@ void selection::run(ir::module &src, Module &dst){ init_grids(fn, dst_builder, 
sh_mem_ptr); // iterate through block for(ir::basic_block *block: fn->blocks()) { - dst_builder.SetInsertPoint((BasicBlock*)vmap_[block]); - for(ir::instruction *i: block->get_inst_list()) + BasicBlock *parent = (BasicBlock*)vmap_[block]; + dst_builder.SetInsertPoint(parent); + for(ir::instruction *i: block->get_inst_list()){ + if(dynamic_cast(i)) + dst_builder.SetInsertPoint(&*parent->getFirstInsertionPt()); lower_instruction(i, dst_builder); + if(dynamic_cast(i)) + dst_builder.SetInsertPoint(parent); + } } // add phi operands for(ir::basic_block *block: fn->blocks()) for(ir::instruction *inst: block->get_inst_list()) if(auto *phi = dynamic_cast(inst)){ if(buffer_info_->is_shared(phi)) { - BasicBlock *parent = (BasicBlock*)vmap_[phi->get_parent()]; - unsigned id_pre = 0, id_loop = 1; - if(phi->get_incoming_block(0) == phi->get_parent()) - std::swap(id_pre, id_loop); - ir::value *pre_value = phi->get_incoming_value(id_pre); - ir::value *loop_value = phi->get_incoming_value(id_loop); - BasicBlock *pre_block = (BasicBlock*)vmap_[phi->get_incoming_block(id_pre)]; - BasicBlock *loop_block = (BasicBlock*)vmap_[phi->get_incoming_block(id_loop)]; - int pre_offset = alloc_->get_offset(pre_value); - int loop_offset = alloc_->get_offset(loop_value); - dst_builder.SetInsertPoint(&*parent->getFirstInsertionPt()); - PHINode *ptr = (PHINode*)(((shared_tile*)tmap_.at(phi))->get_pointer()); - // offset - PHINode *offset = dst_builder.CreatePHI(dst_builder.getInt32Ty(), 2); - dst_builder.SetInsertPoint(parent->getFirstNonPHI()); - Value *next_offset = dst_builder.CreateNeg(offset); - offset->addIncoming(dst_builder.getInt32((loop_offset - pre_offset)/4), pre_block); - offset->addIncoming(next_offset, loop_block); - // next pointer - Value *pre_ptr = dst_builder.CreateGEP(sh_mem_ptr, dst_builder.getInt32(pre_offset)); - pre_ptr = dst_builder.CreateBitCast(pre_ptr, ptr->getType()); - Value *next_ptr = dst_builder.CreateGEP(ptr, offset); - ptr->addIncoming(pre_ptr, pre_block); - ptr->addIncoming(next_ptr, loop_block); - // barrier - Function *barrier = Intrinsic::getDeclaration(dst_fn->getParent(), Intrinsic::nvvm_barrier0); - dst_builder.SetInsertPoint(pre_block->getTerminator()); - dst_builder.CreateCall(barrier, {}); - dst_builder.SetInsertPoint(loop_block->getTerminator()); - dst_builder.CreateCall(barrier, {}); - continue; } for(unsigned n = 0; n < phi->get_num_incoming(); n++){ diff --git a/lib/codegen/vectorize.cpp b/lib/codegen/vectorize.cpp new file mode 100644 index 000000000..41a1afd10 --- /dev/null +++ b/lib/codegen/vectorize.cpp @@ -0,0 +1,28 @@ +#include "codegen/vectorize.h" +#include "codegen/tune.h" +#include "ir/module.h" +#include "ir/function.h" +#include "ir/basic_block.h" +#include "ir/instructions.h" + +namespace tdl { + +namespace codegen{ + +void vectorize::run(ir::module &mod) { + ir::builder &builder = mod.get_builder(); + for(ir::function *fn: mod.get_function_list()) + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *i: block->get_inst_list()) + if(dynamic_cast(i)){ + builder.set_insert_point(i); + ir::value *x = i->get_operand(0); + ir::instruction *rx = (ir::instruction*)builder.create_vectorize(x); + x->replace_all_uses_with(rx); + rx->set_operand(0, x); + params_->copy(rx, x); + } +} + +} +} From 32562677e983bcf7208e32c5cd56ce994a9291c4 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 12 Feb 2019 19:36:16 -0500 Subject: [PATCH 075/494] [code generation] added barriers placement --- examples/matrix.cpp | 8 ++-- include/codegen/allocation.h | 1 + 
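// ----------------------------------------------------------------------------
// Illustrative sketch, not part of the patch series: buffer_info_pass above
// tags a phi as double-buffered exactly when one of its incoming edges is a
// back-edge, i.e. the incoming block is the phi's own parent. That is the
// loop-carried shared-tile pattern:
//
//   loop:  a      = phi [ a_pre, pre-header ], [ a_next, loop ]
//          ...           // consume a while a_next is being filled
//          a_next = copy_to_shared(...)
//
// A standalone check using only accessors that appear in these patches
// (the helper name is an assumption):
#include "ir/instructions.h"
static bool has_back_edge(tdl::ir::phi_node *phi) {
  for (unsigned n = 0; n < phi->get_num_incoming(); n++)
    if (phi->get_incoming_block(n) == phi->get_parent())
      return true; // needs two buffers to overlap the load with the use
  return false;
}
// ----------------------------------------------------------------------------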
include/codegen/barriers.h | 50 ++++++++++++++++++++ include/codegen/shared_copy.h | 11 ++++- include/ir/builder.h | 2 + include/ir/instructions.h | 9 ++++ lib/codegen/barriers.cpp | 89 +++++++++++++++++++++++++++++++++++ lib/codegen/selection.cpp | 5 ++ lib/codegen/shared_copy.cpp | 9 ++-- lib/ir/builder.cpp | 4 ++ lib/ir/instructions.cpp | 8 ++++ 11 files changed, 188 insertions(+), 8 deletions(-) create mode 100644 include/codegen/barriers.h create mode 100644 lib/codegen/barriers.cpp diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 31e4a173e..938a4eddb 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -12,6 +12,7 @@ #include "codegen/liveness.h" #include "codegen/vectorize.h" #include "codegen/buffer_info.h" +#include "codegen/barriers.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/Module.h" #include "llvm/IR/LLVMContext.h" @@ -167,6 +168,7 @@ int main() { tdl::codegen::tune tune; tdl::codegen::liveness liveness(&buffer_info); tdl::codegen::allocation allocation(&liveness, &buffer_info); + tdl::codegen::barriers barriers(&allocation, &buffer_info); tdl::codegen::vectorize vectorize(&tune); tdl::codegen::selection selection(&allocation, &tune, &buffer_info); @@ -202,17 +204,18 @@ int main() { buffer_info.run(module); liveness.run(module); allocation.run(); + barriers.run(module); vectorize.run(module); selection.run(module, llvm_module); // llvm source llvm::legacy::PassManager manager; - manager.add(llvm::createPrintModulePass(llvm::outs())); +// manager.add(llvm::createPrintModulePass(llvm::outs())); manager.add(llvm::createVerifierPass(true)); manager.run(llvm_module); std::string src = generate_machine_code(llvm_module, "nvptx64-nvidia-cuda", compute_data_layout(true, true)); - std::cout << src << std::endl; +// std::cout << src << std::endl; // compile machine code CUdevice cu_device; @@ -222,7 +225,6 @@ int main() { CUstream cu_stream; int major, minor; compile_machine_code(cu_device, cu_context, cu_module, cu_kernel, cu_stream, major, minor, src, "test"); - std::cout << src << std::endl; // execute machine code // Allocate buffers diff --git a/include/codegen/allocation.h b/include/codegen/allocation.h index 5bd5e85a2..ad58ccea7 100644 --- a/include/codegen/allocation.h +++ b/include/codegen/allocation.h @@ -3,6 +3,7 @@ #include #include +#include namespace tdl{ diff --git a/include/codegen/barriers.h b/include/codegen/barriers.h new file mode 100644 index 000000000..9b476ae75 --- /dev/null +++ b/include/codegen/barriers.h @@ -0,0 +1,50 @@ +#ifndef TDL_INCLUDE_CODEGEN_BARRIERS_H +#define TDL_INCLUDE_CODEGEN_BARRIERS_H + +#include +#include +#include + +namespace tdl { + +namespace ir { + class module; + class basic_block; + class instruction; + class value; + class builder; +} + +namespace codegen{ + +class allocation; +class buffer_info_pass; + +class barriers { +private: + typedef std::pair interval_t; + typedef std::vector interval_vec_t; + +private: + void insert_barrier(ir::instruction *instr, ir::builder &builder); + bool intersect(const interval_vec_t &X, interval_t x); + bool intersect(const interval_vec_t &X, const interval_vec_t &Y); + void add_reference(ir::value *v, interval_vec_t &res); + void get_read_intervals(ir::instruction *i, interval_vec_t &res); + void get_written_intervals(ir::instruction *i, interval_vec_t &res); + void add(ir::basic_block *block, interval_vec_t ¬_synced, std::set &insert_pts); + +public: + barriers(allocation *alloc, buffer_info_pass *buffer_info): alloc_(alloc), buffer_info_(buffer_info) {} + void 
run(ir::module &mod); + +private: + allocation *alloc_; + buffer_info_pass *buffer_info_; +}; + + +} +} + +#endif diff --git a/include/codegen/shared_copy.h b/include/codegen/shared_copy.h index 8512ec358..927915805 100644 --- a/include/codegen/shared_copy.h +++ b/include/codegen/shared_copy.h @@ -1,19 +1,28 @@ #ifndef TDL_INCLUDE_CODEGEN_SHARED_COPY_H #define TDL_INCLUDE_CODEGEN_SHARED_COPY_H +#include +#include + namespace tdl { namespace ir { class module; class value; class builder; + class basic_block; } namespace codegen{ class place_shared_copy { private: - void add(ir::value *x, ir::builder &builder); + typedef std::pair interval_t; + typedef std::vector interval_vec_t; + +private: + bool intersect(const interval_vec_t &I, interval_t i); + void add_copies(ir::value *x, ir::builder &builder); public: void run(ir::module &mod); diff --git a/include/ir/builder.h b/include/ir/builder.h index f0b0d1aa3..d7e49cf14 100644 --- a/include/ir/builder.h +++ b/include/ir/builder.h @@ -119,6 +119,8 @@ public: // Intrinsics value *create_copy_to_shared(value *arg, const std::string &name = ""); value *create_vectorize(value *arg, const std::string &name = ""); + value *create_barrier(const std::string &name = ""); + private: context &ctx_; basic_block *block_; diff --git a/include/ir/instructions.h b/include/ir/instructions.h index 0745f62c0..08f472786 100644 --- a/include/ir/instructions.h +++ b/include/ir/instructions.h @@ -397,6 +397,15 @@ public: instruction *next = nullptr); }; +class barrier_inst: public instruction{ +private: + barrier_inst(context &ctx, const std::string &name, instruction *next); + +public: + static barrier_inst* create(context &ctx, const std::string &name = "", + instruction *next = nullptr); +}; + class vectorize_inst: public unary_inst{ using unary_inst::unary_inst; diff --git a/lib/codegen/barriers.cpp b/lib/codegen/barriers.cpp new file mode 100644 index 000000000..f21c1e1d6 --- /dev/null +++ b/lib/codegen/barriers.cpp @@ -0,0 +1,89 @@ +#include +#include "codegen/barriers.h" +#include "codegen/allocation.h" +#include "codegen/buffer_info.h" +#include "ir/module.h" +#include "ir/function.h" +#include "ir/basic_block.h" +#include "ir/instructions.h" + +namespace tdl { + +namespace codegen{ + +bool barriers::intersect(const interval_vec_t &X, interval_t x) { + return std::any_of(X.begin(), X.end(), [&](const interval_t &y){ + bool left_intersect = y.first <= x.first && x.first < y.second; + bool right_intersect = y.first <= x.second && x.second < y.second; + return left_intersect || right_intersect; + }); +} + +bool barriers::intersect(const interval_vec_t &X, const interval_vec_t &Y) { + return std::any_of(Y.begin(), Y.end(), [&](const interval_t &y){ + return intersect(X, y); + }); +} + +void barriers::add_reference(ir::value *v, interval_vec_t &res){ + if(buffer_info_->is_shared(v)){ + unsigned offset = alloc_->get_offset(v); + unsigned num_bytes = alloc_->get_num_bytes(v); + res.push_back(interval_t(offset, offset + num_bytes)); + } +} + +void barriers::get_read_intervals(ir::instruction *i, interval_vec_t &res){ + for(ir::value *op: i->ops()) + add_reference(op, res); +} + +void barriers::get_written_intervals(ir::instruction *i, interval_vec_t &res){ + if(!dynamic_cast(i)) + add_reference(i, res); +} + +void barriers::insert_barrier(ir::instruction *instr, ir::builder &builder) { + if(auto *phi = dynamic_cast(instr)) { + for(unsigned n = 0; n < phi->get_num_incoming(); n++){ + ir::basic_block *block = phi->get_incoming_block(n); + 
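// ----------------------------------------------------------------------------
// Illustrative note, not part of the patch series: the intersect() helpers
// above test endpoint membership of one byte range in another. For half-open
// ranges [a, b) and [c, d), which is what the (offset, offset + num_bytes)
// pairs built by add_reference suggest, the symmetric test below is the
// standard equivalent and also covers the case where one range strictly
// contains the other, a case endpoint-membership tests alone can miss:
static bool ranges_overlap(unsigned a, unsigned b, unsigned c, unsigned d) {
  return a < d && c < b; // [a,b) and [c,d) share at least one byte
}
// e.g. ranges_overlap(0, 16, 8, 32) is true, while ranges_overlap(0, 16, 16, 32)
// is false because the two ranges only touch at the boundary.
// ----------------------------------------------------------------------------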
builder.set_insert_point(block->get_inst_list().back()); + builder.create_barrier(); + } + } + else{ + builder.set_insert_point(instr); + builder.create_barrier(); + } +} + +void barriers::add(ir::basic_block *block, interval_vec_t ¬_synced, std::set &insert_pts) { + for(ir::instruction *i: block->get_inst_list()){ + interval_vec_t read, written; + get_read_intervals(i, read); + get_written_intervals(i, written); + if(intersect(not_synced, read) + || intersect(not_synced, written)) { + not_synced.clear(); + insert_pts.insert(i); + } + std::copy(written.begin(), written.end(), std::back_inserter(not_synced)); + } +} + +void barriers::run(ir::module &mod) { + ir::builder &builder = mod.get_builder(); + for(ir::function *fn: mod.get_function_list()){ + // find barrier location + interval_vec_t not_synced; + std::set insert_pts; + for(ir::basic_block *block: fn->blocks()) + add(block, not_synced, insert_pts); + // insert barrier + for(ir::instruction *i: insert_pts) + insert_barrier(i, builder); + } +} + +} +} diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index b9857eb12..9ef405e06 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -211,6 +211,11 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::functionget_dest()); return builder.Insert(BranchInst::Create(dest)); } + if(dynamic_cast(inst)){ + Module *module = builder.GetInsertBlock()->getModule(); + Function *barrier = Intrinsic::getDeclaration(module, Intrinsic::nvvm_barrier0); + return builder.CreateCall(barrier, {}); + } if(auto* ii = dynamic_cast(inst)){ Type *ty = type(ii->get_type()->get_scalar_ty()); unsigned num_ops = ii->get_num_operands(); diff --git a/lib/codegen/shared_copy.cpp b/lib/codegen/shared_copy.cpp index 08bac4f9a..07d6a5c29 100644 --- a/lib/codegen/shared_copy.cpp +++ b/lib/codegen/shared_copy.cpp @@ -1,3 +1,4 @@ +#include #include "codegen/shared_copy.h" #include "ir/module.h" #include "ir/function.h" @@ -8,10 +9,10 @@ namespace tdl { namespace codegen{ -void place_shared_copy::add(ir::value *x, ir::builder &builder) { +void place_shared_copy::add_copies(ir::value *x, ir::builder &builder) { if(auto *phi = dynamic_cast(x)) { for(auto *op: phi->ops()) - add(op, builder); + add_copies(op, builder); } else { if(auto *i = dynamic_cast(x)){ @@ -31,8 +32,8 @@ void place_shared_copy::run(ir::module &mod) { for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i: block->get_inst_list()) if(dynamic_cast(i)){ - add(i->get_operand(0), builder); - add(i->get_operand(1), builder); + add_copies(i->get_operand(0), builder); + add_copies(i->get_operand(1), builder); } } diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index 7422a47fc..cb5edd2b6 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -281,5 +281,9 @@ value *builder::create_vectorize(value *arg, const std::string &name) { return insert(vectorize_inst::create(arg, name)); } +value *builder::create_barrier(const std::string &name) { + return insert(barrier_inst::create(ctx_, name)); +} + } } diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 522f4c029..f335bbeea 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -403,5 +403,13 @@ vectorize_inst* vectorize_inst::create(value *arg, const std::string &name, inst return new vectorize_inst(arg->get_type(), arg, name, next); } +barrier_inst::barrier_inst(context &ctx, const std::string &name, + instruction *next) + : instruction(type::get_void_ty(ctx), 0, name, next){ } + +barrier_inst* barrier_inst::create(context 
&ctx, const std::string &name, instruction *next) { + return new barrier_inst(ctx, name, next); +} + } } From 896e856b071c73a93a58dd116c173a76fa6f5cad Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 13 Feb 2019 15:41:03 -0500 Subject: [PATCH 076/494] [syntax] added support for predicated expressions --- examples/matrix.cpp | 9 ++++++--- include/ast/ast.h | 12 ++++++++---- include/ast/parser.y | 11 +++++++---- include/ast/scanner.l | 9 ++++++--- include/ir/instructions.h | 1 + lib/ast/lowering.cpp | 1 + 6 files changed, 29 insertions(+), 14 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 938a4eddb..f5334769e 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -50,6 +50,9 @@ void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K){\ fp32* pc[16, 16] = c + rxc[:, newaxis] + ryc[newaxis, :]*M;\ fp32 a[16, 8] = *pa;\ fp32 b[16, 8] = *pb;\ + int1 checkc0[16] = (rxc < M);\ + int1 checkc1[16] = (ryc < N);\ + int1 checkc[16, 16] = checkc0[:, newaxis] && checkc1[newaxis, :];\ for(k = K; k > 0; k = k - 8){\ C = dot(a, b, C);\ pa = pa + 8*M;\ @@ -57,7 +60,7 @@ void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K){\ a = *pa;\ b = *pb;\ }\ - *pc = C;\ + @checkc *pc = C;\ }\ "; @@ -215,7 +218,7 @@ int main() { manager.run(llvm_module); std::string src = generate_machine_code(llvm_module, "nvptx64-nvidia-cuda", compute_data_layout(true, true)); -// std::cout << src << std::endl; + std::cout << src << std::endl; // compile machine code CUdevice cu_device; @@ -229,7 +232,7 @@ int main() { // execute machine code // Allocate buffers typedef float numeric_t; - size_t M = 32, N = 32, K = 32; + size_t M = 128, N = 128, K = 128; std::vector c(M*N); std::vector rc(M*N); std::vector a(M*K); diff --git a/include/ast/ast.h b/include/ast/ast.h index 7a2a62563..6471b2296 100644 --- a/include/ast/ast.h +++ b/include/ast/ast.h @@ -51,8 +51,8 @@ enum UNARY_OP_T{ enum TYPE_T{ VOID_T, - UINT8_T, UINT16_T, UINT32_T, UINT64_T, - INT8_T, INT16_T, INT32_T, INT64_T, + UINT1_T, UINT8_T, UINT16_T, UINT32_T, UINT64_T, + INT1_T, INT8_T, INT16_T, INT32_T, INT64_T, FLOAT32_T, FLOAT64_T }; @@ -313,17 +313,21 @@ public: }; class statement: public node{ + +private: + expression *pred_; }; class expression_statement: public statement{ public: - expression_statement(node *expr) - : expr_((expression*)expr){ } + expression_statement(node *expr, node *pred = nullptr) + : expr_((expression*)expr), pred_((expression*)pred){ } ir::value* codegen(ir::module * mod) const; private: expression *expr_; + expression *pred_; }; class compound_statement: public statement{ diff --git a/include/ast/parser.y b/include/ast/parser.y index 0b68443ce..442bee12e 100644 --- a/include/ast/parser.y +++ b/include/ast/parser.y @@ -47,9 +47,9 @@ TYPE_T get_type_spec(node *op) { return ((token*)op)->type; } %token AND_OP OR_OP MUL_ASSIGN DIV_ASSIGN MOD_ASSIGN ADD_ASSIGN %token SUB_ASSIGN LEFT_ASSIGN RIGHT_ASSIGN AND_ASSIGN %token XOR_ASSIGN OR_ASSIGN TYPE_NAME -%token VOID UINT8 UINT16 UINT32 UINT64 INT8 INT16 INT32 INT64 FP32 FP64 +%token VOID UINT1 UINT8 UINT16 UINT32 UINT64 INT1 INT8 INT16 INT32 INT64 FP32 FP64 %token IF ELSE FOR -%token NEWAXIS ELLIPSIS +%token NEWAXIS ELLIPSIS AT %token GET_GLOBAL_RANGE DOT %start translation_unit @@ -62,10 +62,12 @@ TYPE_T get_type_spec(node *op) { return ((token*)op)->type; } type_specifier : VOID { $$ = new token(VOID_T); } + | UINT1 { $$ = new token(UINT1_T); } | UINT8 { $$ = new token(UINT8_T); } | UINT16 { $$ = new token(UINT16_T); } | UINT32 { $$ = new 
token(UINT32_T); } | UINT64 { $$ = new token(UINT64_T); } + | INT1 { $$ = new token(INT1_T);} | INT8 { $$ = new token(INT8_T); } | INT16 { $$ = new token(INT16_T); } | INT32 { $$ = new token(INT32_T); } @@ -282,11 +284,12 @@ statement_list : statement { $$ = new list((statement*)$1); } | statement_list statement { $$ = append_ptr_list($1, $2); } ; - + expression_statement : ';' { $$ = new no_op(); } | expression ';' { $$ = new expression_statement($1); } - ; + | AT primary_expression expression ';' { $$ = new expression_statement($3, $2); } + ; selection_statement : IF '(' expression ')' statement { $$ = new selection_statement($3, $5); } diff --git a/include/ast/scanner.l b/include/ast/scanner.l index 6b5ed66b0..8e2d89f14 100644 --- a/include/ast/scanner.l +++ b/include/ast/scanner.l @@ -16,15 +16,18 @@ int comment(); %} %% -"newaxis" { count(); return(NEWAXIS); } -"if" { count(); return(IF); } +"@" { count(); return(AT); } +"newaxis" { count(); return(NEWAXIS); } +"if" { count(); return(IF); } "else" { count(); return(ELSE); } -"for" { count(); return(FOR); } +"for" { count(); return(FOR); } "void" { count(); return(VOID); } +"uint1" { count(); return(UINT1); } "uint8" { count(); return(UINT8); } "uint16" { count(); return(UINT16); } "uint32" { count(); return(UINT32); } "uint64" { count(); return(UINT64); } +"int1" { count(); return(INT1); } "int8" { count(); return(INT8); } "int16" { count(); return(INT16); } "int32" { count(); return(INT32); } diff --git a/include/ir/instructions.h b/include/ir/instructions.h index 08f472786..28feeb442 100644 --- a/include/ir/instructions.h +++ b/include/ir/instructions.h @@ -31,6 +31,7 @@ public: private: basic_block *parent_; + value *pred_; }; //===----------------------------------------------------------------------===// diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index c9d8c6ff8..36bd50adb 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -151,6 +151,7 @@ ir::type* declaration_specifier::type(ir::module *mod) const { ir::context &ctx = mod->get_context(); switch (spec_) { case VOID_T: return ir::type::get_void_ty(ctx); + case INT1_T: return ir::type::get_int1_ty(ctx); case INT8_T: return ir::type::get_int8_ty(ctx); case INT16_T: return ir::type::get_int16_ty(ctx); case INT32_T: return ir::type::get_int32_ty(ctx); From 5f5959dc6e763464ba3db251f8671fdcbd89d562 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 15 Feb 2019 11:14:50 -0500 Subject: [PATCH 077/494] [code generation] added masked loads --- CMakeLists.txt | 4 +- examples/matrix.cpp | 21 ++++++---- include/ast/ast.h | 9 +++-- include/ast/parser.y | 4 +- include/codegen/selection.h | 4 +- include/ir/instructions.h | 4 ++ lib/ast/lowering.cpp | 38 +++++++++++------- lib/codegen/selection.cpp | 80 +++++++++++++++++++++++++++++-------- lib/codegen/tune.cpp | 12 +++++- lib/ir/instructions.cpp | 4 +- lib/ir/type.cpp | 2 +- 11 files changed, 128 insertions(+), 54 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2531e84ca..814206cfc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,7 +15,7 @@ find_package(LLVM REQUIRED CONFIG) message(STATUS ${LLVM_INCLUDE_DIRS}) include_directories(${LLVM_INCLUDE_DIRS}) add_definitions(${LLVM_DEFINITIONS}) -llvm_map_components_to_libnames(llvm_libs support core irreader MC NVPTXCodeGen all) +#llvm_map_components_to_libnames(llvm_libs all) #Default build type if(NOT CMAKE_BUILD_TYPE) @@ -34,7 +34,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${LLVM_CXXFLAGS} -std=c++11") # TDL file(GLOB_RECURSE LIBTDL_SRC 
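// ----------------------------------------------------------------------------
// Illustrative note, not part of the patch series: with the AT token and the
// `AT primary_expression expression ';'` production added above, any statement
// can be guarded elementwise by an int1 tile. The matrix example uses this for
// the bounds-checked store (quoted from the matrix.cpp source string in the
// patch above):
//
//   int1 checkc0[16] = (rxc < M);
//   int1 checkc1[16] = (ryc < N);
//   int1 checkc[16, 16] = checkc0[:, newaxis] && checkc1[newaxis, :];
//   @checkc *pc = C;
//
// i.e. only the lanes whose output coordinates fall inside the M x N matrix
// perform the store.
// ----------------------------------------------------------------------------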
lib/*.cpp) add_library(tdl SHARED ${LIBTDL_SRC} ${BISON_Parser_OUTPUTS} ${FLEX_Lexer_OUTPUTS}) -target_link_libraries(tdl ${llvm_libs}) +target_link_libraries(tdl LLVM) # Examples add_subdirectory(examples) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index f5334769e..8145ddb90 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -36,7 +36,7 @@ extern translation_unit *ast_root; const char src[] = "\ -void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K){\ +void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K, int32 bound){\ int32 rxa[16] = get_global_range[16](0);\ int32 ryb[16] = get_global_range[16](1);\ int32 rka[8] = 0 ... 8;\ @@ -50,15 +50,17 @@ void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K){\ fp32* pc[16, 16] = c + rxc[:, newaxis] + ryc[newaxis, :]*M;\ fp32 a[16, 8] = *pa;\ fp32 b[16, 8] = *pb;\ - int1 checkc0[16] = (rxc < M);\ - int1 checkc1[16] = (ryc < N);\ + int1 checkc0[16] = rxc < M;\ + int1 checkc1[16] = ryc < N;\ int1 checkc[16, 16] = checkc0[:, newaxis] && checkc1[newaxis, :];\ for(k = K; k > 0; k = k - 8){\ + int1 sanitya[16, 8] = (k >= bound);\ + int1 sanityb[16, 8] = (k >= bound);\ C = dot(a, b, C);\ pa = pa + 8*M;\ pb = pb + 8*K;\ - a = *pa;\ - b = *pb;\ + @sanitya a = *pa;\ + @sanityb b = *pb;\ }\ @checkc *pc = C;\ }\ @@ -201,6 +203,8 @@ int main() { for(auto &e: x.second) std::cout << e << std::endl; } + if(errors.size()) + exit(EXIT_FAILURE); // run passes shared.run(module); @@ -213,7 +217,7 @@ int main() { // llvm source llvm::legacy::PassManager manager; -// manager.add(llvm::createPrintModulePass(llvm::outs())); + manager.add(llvm::createPrintModulePass(llvm::outs())); manager.add(llvm::createVerifierPass(true)); manager.run(llvm_module); @@ -233,6 +237,7 @@ int main() { // Allocate buffers typedef float numeric_t; size_t M = 128, N = 128, K = 128; + size_t bound = 8; std::vector c(M*N); std::vector rc(M*N); std::vector a(M*K); @@ -252,13 +257,13 @@ int main() { checkCudaErrors(cuMemcpyHtoD(d_b, b.data(), sizeof(numeric_t) * b.size())); checkCudaErrors(cuMemcpyHtoD(d_c, c.data(), sizeof(numeric_t) * c.size())); // Launch kernel - void *args[] = { &d_a, &d_b, &d_c, &M, &N, &K}; + void *args[] = { &d_a, &d_b, &d_c, &M, &N, &K, &bound}; int num_regs; cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, cu_kernel); unsigned TM = 16; unsigned TN = 16; unsigned nthreads = 32; - checkCudaErrors(cuLaunchKernel(cu_kernel, M/TM, N/TN, 1, nthreads, 1, 1, 0, cu_stream, args, NULL)); + checkCudaErrors(cuLaunchKernel(cu_kernel, (M + TM - 1)/TM, (N + TN - 1)/TN, 1, nthreads, 1, 1, 0, cu_stream, args, NULL)); checkCudaErrors(cuStreamSynchronize(cu_stream)); // Write back checkCudaErrors(cuMemcpyDtoH(c.data(), d_c, sizeof(numeric_t) * c.size())); diff --git a/include/ast/ast.h b/include/ast/ast.h index 6471b2296..4a9889093 100644 --- a/include/ast/ast.h +++ b/include/ast/ast.h @@ -185,7 +185,8 @@ private: public: binary_operator(BIN_OP_T op, node *lhs, node *rhs) - : op_(op), lhs_((expression*)lhs), rhs_((expression*)rhs) { } + : op_(op), lhs_((expression*)lhs), rhs_((expression*)rhs) { + } ir::value* codegen(ir::module *) const; private: @@ -320,14 +321,14 @@ private: class expression_statement: public statement{ public: - expression_statement(node *expr, node *pred = nullptr) - : expr_((expression*)expr), pred_((expression*)pred){ } + expression_statement(node *expr, node *mask = nullptr) + : expr_((expression*)expr), mask_((expression*)mask){ } ir::value* codegen(ir::module * mod) const; private: expression *expr_; - 
expression *pred_; + expression *mask_; }; class compound_statement: public statement{ diff --git a/include/ast/parser.y b/include/ast/parser.y index 442bee12e..905541d70 100644 --- a/include/ast/parser.y +++ b/include/ast/parser.y @@ -121,7 +121,7 @@ primary_expression | constant ELLIPSIS constant { $$ = new constant_range($1, $3); } | builtin { $$ = $1; } | STRING_LITERAL { $$ = new string_literal(yytext); } - | '(' expression ')' { $$ = $1; } + | '(' expression ')' { $$ = $2; } ; slice @@ -155,7 +155,7 @@ unary_operator cast_expression : unary_expression { $$ = $1; } - | '(' type_name ')' cast_expression { $$ = new cast_operator($1, $2); } + | '(' type_name ')' cast_expression { $$ = new cast_operator($2, $4); } ; multiplicative_expression diff --git a/include/codegen/selection.h b/include/codegen/selection.h index 6580ade98..2531dc74c 100644 --- a/include/codegen/selection.h +++ b/include/codegen/selection.h @@ -54,13 +54,15 @@ private: llvm::Value* shared_offset(indices_t idx); public: - shared_tile(llvm::Type* ty, const shapes_t &shapes, llvm::Value* ptr, llvm::IRBuilder<> &builder); + shared_tile(llvm::Type* ty, const shapes_t &shapes, llvm::Value* ptr, llvm::IRBuilder<> &builder, llvm::Value* offset = nullptr); void set_value(indices_t, llvm::Value *); llvm::Value* get_value(indices_t idx); llvm::Value* get_pointer() { return ptr_; } + llvm::Value* get_offset() { return offset_; } private: llvm::Value *ptr_; + llvm::Value *offset_; llvm::IRBuilder<> &builder_; std::map ptr_cache_; }; diff --git a/include/ir/instructions.h b/include/ir/instructions.h index 28feeb442..6c835ec2e 100644 --- a/include/ir/instructions.h +++ b/include/ir/instructions.h @@ -26,12 +26,16 @@ public: const basic_block *get_parent() const { return parent_; } basic_block *get_parent() { return parent_; } void erase_from_parent(); + // mask + value* set_mask(value *mask) { mask_ = mask; } + value* get_mask() { return mask_; } // helpers bool has_tile_result_or_op(); private: basic_block *parent_; value *pred_; + value *mask_; }; //===----------------------------------------------------------------------===// diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 36bd50adb..4a9d7ff3e 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -16,33 +16,34 @@ namespace ast{ /* node */ ir::value *node::explicit_cast(ir::builder &builder, ir::value *src, ir::type *dst_ty){ - ir::type *src_ty = src->get_type()->get_scalar_ty(); + ir::type *src_scalar_ty = src->get_type()->get_scalar_ty(); + ir::type *dst_scalar_ty = dst_ty->get_scalar_ty(); bool src_signed = false; bool dst_signed = false; - if(src_ty == dst_ty) + if(src_scalar_ty == dst_scalar_ty) return src; - else if(src_ty->is_integer_ty() && src_signed && dst_ty->is_floating_point_ty()) + else if(src_scalar_ty->is_integer_ty() && src_signed && dst_scalar_ty->is_floating_point_ty()) return builder.create_si_to_fp(src, dst_ty); - else if(src_ty->is_integer_ty() && !src_signed && dst_ty->is_floating_point_ty()) + else if(src_scalar_ty->is_integer_ty() && !src_signed && dst_scalar_ty->is_floating_point_ty()) return builder.create_ui_to_fp(src, dst_ty); - else if(src_ty->is_floating_point_ty() && dst_ty->is_integer_ty() && dst_signed) + else if(src_scalar_ty->is_floating_point_ty() && dst_scalar_ty->is_integer_ty() && dst_signed) return builder.create_fp_to_si(src, dst_ty); - else if(src_ty->is_floating_point_ty() && dst_ty->is_integer_ty() && !dst_signed) + else if(src_scalar_ty->is_floating_point_ty() && dst_scalar_ty->is_integer_ty() && !dst_signed) 
return builder.create_fp_to_ui(src, dst_ty); - else if(src_ty->is_floating_point_ty() && dst_ty->is_floating_point_ty() && - src_ty->get_fp_mantissa_width() < dst_ty->get_fp_mantissa_width()) + else if(src_scalar_ty->is_floating_point_ty() && dst_scalar_ty->is_floating_point_ty() && + src_scalar_ty->get_fp_mantissa_width() < dst_scalar_ty->get_fp_mantissa_width()) return builder.create_fp_ext(src, dst_ty); - else if(src_ty->is_floating_point_ty() && dst_ty->is_floating_point_ty() && - src_ty->get_fp_mantissa_width() > dst_ty->get_fp_mantissa_width()) + else if(src_scalar_ty->is_floating_point_ty() && dst_scalar_ty->is_floating_point_ty() && + src_scalar_ty->get_fp_mantissa_width() > dst_scalar_ty->get_fp_mantissa_width()) return builder.create_fp_trunc(src, dst_ty); - else if(src_ty->is_integer_ty() && dst_ty->is_integer_ty() && - src_ty->get_integer_bitwidth()) + else if(src_scalar_ty->is_integer_ty() && dst_scalar_ty->is_integer_ty() && + src_scalar_ty->get_integer_bitwidth()) return builder.create_int_cast(src, dst_ty, dst_signed); else @@ -247,7 +248,14 @@ ir::value* compound_statement::codegen(ir::module* mod) const{ /* expression statement */ ir::value* expression_statement::codegen(ir::module *mod) const{ - return expr_->codegen(mod); + ir::value *expr = expr_->codegen(mod); + if(mask_) { + ir::instruction *itn = dynamic_cast(expr); + assert(itn); + ir::value *mask = mask_->codegen(mod); + itn->set_mask(mask); + } + return expr; } /* Iteration statement */ @@ -325,7 +333,7 @@ ir::value* initializer::codegen(ir::module * mod) const{ ir::value *value = ir::undef_value::get(ty); if(expr_){ value = expr_->codegen(mod); - value = explicit_cast(mod->get_builder(), value, ty->get_scalar_ty()); + value = explicit_cast(mod->get_builder(), value, ty); implicit_broadcast(mod, value, ty); } value->set_name(name); @@ -526,7 +534,7 @@ ir::value *assignment_expression::codegen(ir::module *mod) const{ assert(x->get_op()==DEREF); assert(x->lvalue()); ir::value *ptr = x->lvalue()->codegen(mod); - mod->get_builder().create_store(ptr, rvalue); + rvalue = mod->get_builder().create_store(ptr, rvalue); } return rvalue; } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 9ef405e06..85fdb2189 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -1,6 +1,7 @@ #include "codegen/selection.h" #include "codegen/tune.h" #include "codegen/allocation.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Module.h" #include "llvm/IR/IRBuilder.h" #include "ir/context.h" @@ -9,6 +10,8 @@ #include "ir/type.h" #include "llvm/Transforms/Scalar/EarlyCSE.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/IR/BasicBlock.h" namespace tdl{ namespace codegen{ @@ -121,8 +124,8 @@ Value* shared_tile::shared_offset(indices_t idx) { return result; } -shared_tile::shared_tile(Type *ty, const shapes_t &shapes, Value *ptr, llvm::IRBuilder<> &builder): - tile(ty, shapes), ptr_(ptr), builder_(builder) { +shared_tile::shared_tile(Type *ty, const shapes_t &shapes, Value *ptr, llvm::IRBuilder<> &builder, Value *offset): + tile(ty, shapes), ptr_(ptr), builder_(builder), offset_(offset) { } void shared_tile::set_value(indices_t idx, Value *value) { @@ -404,25 +407,17 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, std::swap(id_pre, id_loop); ir::value *pre_value = phi->get_incoming_value(id_pre); ir::value *loop_value = phi->get_incoming_value(id_loop); - BasicBlock *pre_block = 
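
Note: the create_tile hunk being rewritten here implements double buffering for loop-carried shared-memory tiles. Each such tile owns a slot of twice its size; a pointer phi selects the half being read while the other half is written, and flipping halves amounts to negating an offset (in 4-byte elements, cf. get_num_bytes(phi)/2/4). A sketch of the idiom in isolation, assuming an llvm::IRBuilder<> `b` at the loop header, known `preheader`/`latch` blocks, a `base_ptr` into shared memory, and the two-operand CreateGEP of the LLVM version used in this series:

    // Sketch: ping-pong between the two halves of a double-buffered slot.
    llvm::PHINode *ptr    = b.CreatePHI(base_ptr->getType(), 2);
    llvm::PHINode *offset = b.CreatePHI(b.getInt32Ty(), 2);
    llvm::Value *next_ptr = b.CreateGEP(ptr, offset);  // jump to other half
    llvm::Value *next_off = b.CreateNeg(offset);       // +S, -S, +S, ...
    ptr->addIncoming(base_ptr, preheader);
    ptr->addIncoming(next_ptr, latch);
    offset->addIncoming(b.getInt32(S), preheader);     // S: half-slot size,
    offset->addIncoming(next_off, latch);              // in 4-byte elements

The incoming edges are filled in later (see the phi-wiring loop in selection::run further on), because at tile-creation time the LLVM blocks for the predecessors may not have been emitted yet.
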
(BasicBlock*)vmap_[phi->get_incoming_block(id_pre)]; - BasicBlock *loop_block = (BasicBlock*)vmap_[phi->get_incoming_block(id_loop)]; if(parent->empty()) builder.SetInsertPoint(parent); else builder.SetInsertPoint(&*parent->getFirstInsertionPt()); PHINode *ptr = builder.CreatePHI(ptr_ty, 2); - // offset PHINode *offset = builder.CreatePHI(builder.getInt32Ty(), 2); - Value *next_offset = builder.CreateNeg(offset); - offset->addIncoming(builder.getInt32(alloc_->get_num_bytes(phi) / 2 / 4), pre_block); - offset->addIncoming(next_offset, loop_block); // next pointer Value *pre_ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(alloc_->get_offset(phi))); pre_ptr = builder.CreateBitCast(pre_ptr, ptr->getType()); Value *next_ptr = builder.CreateGEP(ptr, offset); - ptr->addIncoming(pre_ptr, pre_block); - ptr->addIncoming(next_ptr, loop_block); - tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)}); + tmap_.insert({phi, new shared_tile(ty, shapes, ptr, builder, offset)}); tmap_.insert({pre_value, new shared_tile(ty, shapes, pre_ptr, builder)}); tmap_.insert({loop_value, new shared_tile(ty, shapes, next_ptr, builder)}); } @@ -483,14 +478,43 @@ void selection::init_grids(ir::function *fn, IRBuilder<> &builder, Value *sh_mem void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> &builder) { - Module *module = builder.GetInsertBlock()->getModule(); + BasicBlock *block = builder.GetInsertBlock(); + Module *module = block->getModule(); + Function *function = block->getParent(); + ir::value *mask = ins->get_mask(); LLVMContext &ctx = builder.getContext(); + // helper to handle masks + auto insert_masked = [&](indices_t idx, std::function insert_value) { + BasicBlock *block = builder.GetInsertBlock(); + Value *result; + if(mask){ + Value *llvm_mask = tmap_.at(mask)->get_value(idx); + BasicBlock *then_bb = BasicBlock::Create(ctx, "", function); + BasicBlock *done_bb = BasicBlock::Create(ctx, "", function); + builder.CreateCondBr(llvm_mask, then_bb, done_bb); + builder.SetInsertPoint(then_bb); + result = insert_value(); + builder.CreateBr(done_bb); + builder.SetInsertPoint(done_bb); + if(!ins->get_type()->is_void_ty()){ + Type *ty = result->getType(); + PHINode *phi = builder.CreatePHI(ty, 2); + phi->addIncoming(llvm::UndefValue::get(ty), block); + phi->addIncoming(result, then_bb); + return (Value*)phi; + } + } + else + result = insert_value(); + return result; + }; + // store if(auto *x = dynamic_cast(ins)) { distributed_tile* ptr = (distributed_tile*)tmap_.at(x->get_pointer_operand()); tile *value = tmap_.at(x->get_value_operand()); ptr->for_each([&](indices_t idx){ - builder.CreateStore(value->get_value(idx), ptr->get_value(idx)); + insert_masked(idx, [&]{ return builder.CreateStore(value->get_value(idx), ptr->get_value(idx)); }); }); } else { @@ -511,7 +535,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & Value *offset = builder.CreateMul(builder.getInt32(shapes[0]), group_id); result->for_each([&](indices_t idx){ BinaryOperator *bin = static_cast(idx[0]); - result->set_value(idx, builder.CreateAdd(bin, offset)); + result->set_value(idx, insert_masked(idx, [&]{ return builder.CreateAdd(bin, offset); })); }); } // reshape @@ -530,7 +554,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & // splat else if(dynamic_cast(ins)) { result->for_each([&](indices_t idx) { - result->set_value(idx, llvm_value(ins->get_operand(0), builder)); + result->set_value(idx, insert_masked(idx, [&]{ return 
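
Note: the insert_masked helper defined above is if-conversion in reverse: every per-element operation of a masked instruction gets wrapped in its own conditional. A standalone sketch of the pattern for one element, for the non-void case (void results, e.g. stores, simply skip the merge phi; names are illustrative):

    // Sketch: guard one per-element computation with its mask bit.
    llvm::Value *lower_guarded(llvm::IRBuilder<> &b, llvm::Function *fn,
                               llvm::Value *pred,
                               std::function<llvm::Value*()> emit) {
      llvm::LLVMContext &ctx = b.getContext();
      llvm::BasicBlock *head    = b.GetInsertBlock();
      llvm::BasicBlock *then_bb = llvm::BasicBlock::Create(ctx, "", fn);
      llvm::BasicBlock *done_bb = llvm::BasicBlock::Create(ctx, "", fn);
      b.CreateCondBr(pred, then_bb, done_bb);  // skip work when masked off
      b.SetInsertPoint(then_bb);
      llvm::Value *v = emit();                 // e.g. the scalar load
      b.CreateBr(done_bb);
      b.SetInsertPoint(done_bb);
      llvm::PHINode *phi = b.CreatePHI(v->getType(), 2);
      phi->addIncoming(llvm::UndefValue::get(v->getType()), head);
      phi->addIncoming(v, then_bb);
      return phi;
    }

The undef incoming value on the masked-off path is exactly what the else-value reserved in the next patch is meant to replace.
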
llvm_value(ins->get_operand(0), builder); })); }); } // broadcast @@ -603,7 +627,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & else return llvm_value(x, builder); }; - result->set_value(idx, llvm_inst(ins, value, builder)); + result->set_value(idx, insert_masked(idx, [&]() { return llvm_inst(ins, value, builder); })); }); } } @@ -625,6 +649,7 @@ void selection::run(ir::module &src, Module &dst){ vmap_.clear(); LLVMContext &dst_ctx = dst.getContext(); IRBuilder<> dst_builder(dst_ctx); + std::map block_of; // iterate over functions for(ir::function *fn: src.get_function_list()) { @@ -661,6 +686,7 @@ void selection::run(ir::module &src, Module &dst){ } // create grids init_grids(fn, dst_builder, sh_mem_ptr); + std::map last_block; // iterate through block for(ir::basic_block *block: fn->blocks()) { BasicBlock *parent = (BasicBlock*)vmap_[block]; @@ -671,6 +697,7 @@ void selection::run(ir::module &src, Module &dst){ lower_instruction(i, dst_builder); if(dynamic_cast(i)) dst_builder.SetInsertPoint(parent); + last_block[block] = dst_builder.GetInsertBlock(); } } // add phi operands @@ -678,12 +705,31 @@ void selection::run(ir::module &src, Module &dst){ for(ir::instruction *inst: block->get_inst_list()) if(auto *phi = dynamic_cast(inst)){ if(buffer_info_->is_shared(phi)) { + PHINode *ptr = (PHINode*)((shared_tile*)tmap_.at(phi))->get_pointer(); + PHINode *offset = (PHINode*)((shared_tile*)tmap_.at(phi))->get_offset(); + for(unsigned n = 0; n < phi->get_num_incoming(); n++){ + ir::value *inc_val = phi->get_incoming_value(n); + ir::basic_block *inc_block = phi->get_incoming_block(n); + BasicBlock *llvm_inc_block = last_block.at(inc_block); + shared_tile *inc_shared = (shared_tile*)tmap_.at(inc_val); + GetElementPtrInst *inc_ptr = dyn_cast(inc_shared->get_pointer()); + if(inc_ptr && ptr == inc_ptr->getPointerOperand()){ + dst_builder.SetInsertPoint(llvm_inc_block->getTerminator()); + Value *next_offset = dst_builder.CreateNeg(offset); + offset->addIncoming(next_offset, llvm_inc_block); + } + else { + offset->addIncoming(dst_builder.getInt32(alloc_->get_num_bytes(phi)/(2*4)), llvm_inc_block); + } + ptr->addIncoming(inc_shared->get_pointer(), llvm_inc_block); + } continue; } for(unsigned n = 0; n < phi->get_num_incoming(); n++){ ir::value *inc_val = phi->get_incoming_value(n); ir::basic_block *inc_block = phi->get_incoming_block(n); - BasicBlock *llvm_inc_block = (BasicBlock*)vmap_[inc_block]; + std::cout << typeid(*inc_val).name() << " " << inc_val << " " << inc_block << std::endl; + BasicBlock *llvm_inc_block = last_block.at(inc_block); if(phi->get_type()->is_tile_ty()) { distributed_tile *phi_tile = (distributed_tile*)tmap_.at(phi); distributed_tile *inc_tile = (distributed_tile*)tmap_.at(inc_val); diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index c98b2ae66..924392cab 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -67,10 +67,17 @@ void tune::init_c_graph(ir::instruction *v) { } // Element-wise else if(dynamic_cast(v)){ + std::cout << typeid(*v).name() << std::endl; for(unsigned i = 0; i < shapes.size(); i ++) - for(ir::value* op: v->ops()){ + for(ir::value* op: v->ops()) add_constraint({v, i}, {op, i}); - } + } + + /* Add mask constraints */ + if(ir::value *mask = v->get_mask()){ + std::cout << typeid(*mask).name() << " " << typeid(*v->ops()[0]).name() << std::endl; + for(unsigned i = 0; i < shapes.size(); i++) + add_constraint({v->ops()[0], i}, {mask, i}); } } @@ -99,6 +106,7 @@ std::vector tune::get_params(ir::module &mod) { 
   for(ir::instruction *i : block->get_inst_list())
   for(auto &x: params_[i])
   if(seen.insert(x.second).second && *x.second == 0){
+    std::cout << typeid(*i).name() << std::endl;
     result.push_back(x.second);
   }
   return result;
diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp
index f335bbeea..56f583141 100644
--- a/lib/ir/instructions.cpp
+++ b/lib/ir/instructions.cpp
@@ -186,8 +186,8 @@ cast_inst *cast_inst::create(op_t op, value *arg, type *ty, const std::string &n
 cast_inst *cast_inst::create_integer_cast(value *arg, type *ty, bool is_signed, const std::string &name, instruction *next){
   type *arg_ty = arg->get_type();
   assert(arg_ty->is_int_or_tileint_ty() && ty->is_int_or_tileint_ty() && "Invalid integer cast!");
-  unsigned arg_bits = arg_ty->get_integer_bitwidth();
-  unsigned dst_bits = ty->get_integer_bitwidth();
+  unsigned arg_bits = arg_ty->get_scalar_ty()->get_integer_bitwidth();
+  unsigned dst_bits = ty->get_scalar_ty()->get_integer_bitwidth();
   op_t op = (arg_bits == dst_bits ? ic::BitCast : (arg_bits > dst_bits ? ic::Trunc : (is_signed ? ic::SExt : ic::ZExt)));
diff --git a/lib/ir/type.cpp b/lib/ir/type.cpp
index 075bcd88b..c790120fb 100644
--- a/lib/ir/type.cpp
+++ b/lib/ir/type.cpp
@@ -33,7 +33,7 @@ unsigned type::get_primitive_size_in_bits() const {
 }
 unsigned type::get_integer_bitwidth() const
-{ return ((integer_type*)(this))->get_bitwidth(); }
+{ assert(id_ == IntegerTyID); return ((integer_type*)(this))->get_bitwidth(); }
 unsigned type::get_tile_bitwidth() const
 { return ((tile_type*)(this))->get_bitwidth(); }

From cf1a583dbff2a615c9df0f1ef7f668973f2d9d0c Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Fri, 15 Feb 2019 22:03:09 -0500
Subject: [PATCH 078/494] [ir] optional else-value for instruction masks;
 tile-aware integer casts

---
 examples/matrix.cpp       |  6 +++---
 include/ast/ast.h         |  9 ++++++---
 include/ir/instructions.h | 18 ++++++++++++------
 lib/ast/lowering.cpp      |  8 ++++----
 lib/codegen/selection.cpp | 14 +++++++++-----
 lib/codegen/tune.cpp      |  6 ++----
 6 files changed, 36 insertions(+), 25 deletions(-)

diff --git a/examples/matrix.cpp b/examples/matrix.cpp
index 8145ddb90..b96d32bdb 100644
--- a/examples/matrix.cpp
+++ b/examples/matrix.cpp
@@ -54,8 +54,8 @@ void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K, int32 bound){\
 int1 checkc1[16] = ryc < N;\
 int1 checkc[16, 16] = checkc0[:, newaxis] && checkc1[newaxis, :];\
 for(k = K; k > 0; k = k - 8){\
-int1 sanitya[16, 8] = (k >= bound);\
-int1 sanityb[16, 8] = (k >= bound);\
+int1 sanitya[16, 8] = (k > 16);\
+int1 sanityb[16, 8] = (k > 16);\
 C = dot(a, b, C);\
 pa = pa + 8*M;\
 pb = pb + 8*K;\
@@ -236,7 +236,7 @@ int main() {
   // execute machine code
   // Allocate buffers
   typedef float numeric_t;
-  size_t M = 128, N = 128, K = 128;
+  size_t M = 32, N = 32, K = 32;
   size_t bound = 8;
   std::vector<numeric_t> c(M*N);
   std::vector<numeric_t> rc(M*N);
diff --git a/include/ast/ast.h b/include/ast/ast.h
index 4a9889093..51e3f97f2 100644
--- a/include/ast/ast.h
+++ b/include/ast/ast.h
@@ -287,14 +287,17 @@ public:
 class assignment_expression: public expression{
 public:
   assignment_expression(node *lvalue, ASSIGN_OP_T op, node *rvalue)
-    : lvalue_((named_expression*)lvalue), op_(op), rvalue_((expression*)rvalue) { }
+    : lhs_((named_expression*)lvalue), op_(op), rhs_((expression*)rvalue) { }
+
+  const expression *lhs() const { return lhs_; }
+  const expression *rhs() const { return rhs_; }
   ir::value* codegen(ir::module *mod) const;
 public:
   ASSIGN_OP_T op_;
-  const expression *lvalue_;
-  const expression *rvalue_;
+  const expression *lhs_;
+  const expression *rhs_;
 };
diff --git a/include/ir/instructions.h
b/include/ir/instructions.h index 6c835ec2e..74d5040f3 100644 --- a/include/ir/instructions.h +++ b/include/ir/instructions.h @@ -16,26 +16,32 @@ class context; //===----------------------------------------------------------------------===// class instruction: public user{ +public: + struct mask_info_t { + value *pred; + value *else_value; + }; + protected: // constructors instruction(type *ty, unsigned num_ops, const std::string &name = "", instruction *next = nullptr); public: // parent - void set_parent(basic_block *block) { parent_ = block; } - const basic_block *get_parent() const { return parent_; } - basic_block *get_parent() { return parent_; } + void set_parent(basic_block *block) { parent_ = block; } + const basic_block *get_parent() const { return parent_; } + basic_block *get_parent() { return parent_; } void erase_from_parent(); // mask - value* set_mask(value *mask) { mask_ = mask; } - value* get_mask() { return mask_; } + value* set_mask(value *pred, value *else_value = nullptr) { mask_ = {pred, else_value}; } + const mask_info_t get_mask() const { return mask_; } // helpers bool has_tile_result_or_op(); private: basic_block *parent_; value *pred_; - value *mask_; + mask_info_t mask_; }; //===----------------------------------------------------------------------===// diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 4a9d7ff3e..6f44f24c5 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -527,16 +527,16 @@ ir::value *conditional_expression::codegen(ir::module *mod) const{ /* Assignment expression */ ir::value *assignment_expression::codegen(ir::module *mod) const{ - ir::value *rvalue = rvalue_->codegen(mod); + ir::value *rhs = rhs_->codegen(mod); if(auto *x = dynamic_cast(lvalue_)) - mod->set_value(x->id()->name(), rvalue); + mod->set_value(x->id()->name(), rhs); else if(auto* x = dynamic_cast(lvalue_)){ assert(x->get_op()==DEREF); assert(x->lvalue()); ir::value *ptr = x->lvalue()->codegen(mod); - rvalue = mod->get_builder().create_store(ptr, rvalue); + rhs = mod->get_builder().create_store(ptr, rhs); } - return rvalue; + return rhs; } /* Type name */ diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 85fdb2189..1b9116e51 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -481,14 +481,16 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & BasicBlock *block = builder.GetInsertBlock(); Module *module = block->getModule(); Function *function = block->getParent(); - ir::value *mask = ins->get_mask(); + ir::instruction::mask_info_t mask = ins->get_mask(); LLVMContext &ctx = builder.getContext(); // helper to handle masks auto insert_masked = [&](indices_t idx, std::function insert_value) { BasicBlock *block = builder.GetInsertBlock(); Value *result; - if(mask){ - Value *llvm_mask = tmap_.at(mask)->get_value(idx); + if(mask.pred){ +// if(mask.else_value) +// std::cout << mask.else_value << std::endl; + Value *llvm_mask = tmap_.at(mask.pred)->get_value(idx); BasicBlock *then_bb = BasicBlock::Create(ctx, "", function); BasicBlock *done_bb = BasicBlock::Create(ctx, "", function); builder.CreateCondBr(llvm_mask, then_bb, done_bb); @@ -499,7 +501,10 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & if(!ins->get_type()->is_void_ty()){ Type *ty = result->getType(); PHINode *phi = builder.CreatePHI(ty, 2); - phi->addIncoming(llvm::UndefValue::get(ty), block); +// if(mask.else_value) +// phi->addIncoming(tmap_.at(mask.else_value)->get_value(idx), block); 
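
Note: with a bare undef fallback, a masked-off load feeds undef into whatever consumes it (here, the dot product). The new mask_info_t pairs the predicate with an optional else value so the merge can be fully defined; the commented-out lines in this hunk show the intended wiring. A sketch, with illustrative names:

    // Sketch: merge for one masked element once an else value exists.
    llvm::Value *fallback = else_val ? else_val
                                     : llvm::UndefValue::get(ty);
    llvm::PHINode *phi = b.CreatePHI(ty, 2);
    phi->addIncoming(fallback, head_bb);  // lane masked off
    phi->addIncoming(result, then_bb);    // guarded op executed
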
+// else + phi->addIncoming(llvm::UndefValue::get(ty), block); phi->addIncoming(result, then_bb); return (Value*)phi; } @@ -728,7 +733,6 @@ void selection::run(ir::module &src, Module &dst){ for(unsigned n = 0; n < phi->get_num_incoming(); n++){ ir::value *inc_val = phi->get_incoming_value(n); ir::basic_block *inc_block = phi->get_incoming_block(n); - std::cout << typeid(*inc_val).name() << " " << inc_val << " " << inc_block << std::endl; BasicBlock *llvm_inc_block = last_block.at(inc_block); if(phi->get_type()->is_tile_ty()) { distributed_tile *phi_tile = (distributed_tile*)tmap_.at(phi); diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 924392cab..8919f171b 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -67,17 +67,15 @@ void tune::init_c_graph(ir::instruction *v) { } // Element-wise else if(dynamic_cast(v)){ - std::cout << typeid(*v).name() << std::endl; for(unsigned i = 0; i < shapes.size(); i ++) for(ir::value* op: v->ops()) add_constraint({v, i}, {op, i}); } /* Add mask constraints */ - if(ir::value *mask = v->get_mask()){ - std::cout << typeid(*mask).name() << " " << typeid(*v->ops()[0]).name() << std::endl; + if(ir::value *pred = v->get_mask().pred){ for(unsigned i = 0; i < shapes.size(); i++) - add_constraint({v->ops()[0], i}, {mask, i}); + add_constraint({v->ops()[0], i}, {pred, i}); } } From f3094a512bc641c320b12549268302ccc6094923 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 17 Feb 2019 21:35:03 -0500 Subject: [PATCH 079/494] [syntax tree] fixed bugs in control flow lowering --- examples/matrix.cpp | 13 ++++-- include/ast/ast.h | 9 ++-- include/ir/instructions.h | 56 +++++++++++++++++++++--- include/ir/module.h | 5 +++ include/ir/value.h | 2 +- lib/ast/lowering.cpp | 18 +++++--- lib/ir/instructions.cpp | 90 ++++++++++++++++++++++++++++++++++++++- lib/ir/module.cpp | 43 +++++++++++++++---- 8 files changed, 201 insertions(+), 35 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index b96d32bdb..c275b8a06 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -5,6 +5,7 @@ #include "ast/ast.h" #include "ir/context.h" #include "ir/module.h" +#include "ir/print.h" #include "codegen/selection.h" #include "codegen/tune.h" #include "codegen/shared_copy.h" @@ -54,13 +55,15 @@ void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K, int32 bound){\ int1 checkc1[16] = ryc < N;\ int1 checkc[16, 16] = checkc0[:, newaxis] && checkc1[newaxis, :];\ for(k = K; k > 0; k = k - 8){\ - int1 sanitya[16, 8] = (k > 16);\ - int1 sanityb[16, 8] = (k > 16);\ + int1 checka[16, 8] = (k > 8);\ + int1 checkb[16, 8] = (k > 8);\ C = dot(a, b, C);\ pa = pa + 8*M;\ pb = pb + 8*K;\ - @sanitya a = *pa;\ - @sanityb b = *pb;\ + @checka a = *pa;\ + @checkb b = *pb;\ + if(k <= 8){\ + }\ }\ @checkc *pc = C;\ }\ @@ -167,6 +170,8 @@ int main() { llvm::LLVMContext llvm_context; llvm::Module llvm_module("test", llvm_context); + tdl::ir::print(module, std::cout); + // create passes tdl::codegen::place_shared_copy shared; tdl::codegen::buffer_info_pass buffer_info; diff --git a/include/ast/ast.h b/include/ast/ast.h index 51e3f97f2..4a9889093 100644 --- a/include/ast/ast.h +++ b/include/ast/ast.h @@ -287,17 +287,14 @@ public: class assignment_expression: public expression{ public: assignment_expression(node *lvalue, ASSIGN_OP_T op, node *rvalue) - : lhs_((named_expression*)lvalue), op_(op), rhs_((expression*)rvalue) { } - - const expression *lhs() const { return lhs_; } - const expression *rhs() const { return rhs_; } + : 
lvalue_((named_expression*)lvalue), op_(op), rvalue_((expression*)rvalue) { } ir::value* codegen(ir::module *mod) const; public: ASSIGN_OP_T op_; - const expression *lhs_; - const expression *rhs_; + const expression *lvalue_; + const expression *rvalue_; }; diff --git a/include/ir/instructions.h b/include/ir/instructions.h index 74d5040f3..a18a94e1c 100644 --- a/include/ir/instructions.h +++ b/include/ir/instructions.h @@ -22,6 +22,8 @@ public: value *else_value; }; + virtual std::string repr_impl() const = 0; + protected: // constructors instruction(type *ty, unsigned num_ops, const std::string &name = "", instruction *next = nullptr); @@ -37,6 +39,8 @@ public: const mask_info_t get_mask() const { return mask_; } // helpers bool has_tile_result_or_op(); + // repr + std::string repr() const { return repr_impl(); } private: basic_block *parent_; @@ -51,6 +55,7 @@ private: class phi_node: public instruction{ private: phi_node(type *ty, unsigned num_reserved, const std::string &name, instruction *next); + std::string repr_impl() const { return "phi"; } public: void set_incoming_value(unsigned i, value *v); @@ -60,6 +65,9 @@ public: unsigned get_num_incoming() { return get_num_operands(); } void add_incoming(value *v, basic_block *block); + // Type + void set_type(type *ty) { ty_ = ty; } + // Factory methods static phi_node* create(type *ty, unsigned num_reserved, const std::string &name = "", instruction *next = nullptr); @@ -75,6 +83,10 @@ private: class binary_operator: public instruction{ public: typedef llvm::BinaryOperator::BinaryOps op_t; + using llop = llvm::BinaryOperator::BinaryOps; + +private: + std::string repr_impl() const; protected: // Constructors @@ -116,7 +128,10 @@ public: class cmp_inst: public instruction{ public: typedef llvm::CmpInst::Predicate pred_t; - using pcmp = llvm::CmpInst; + using llop = llvm::CmpInst; + +private: + std::string repr_impl() const; protected: cmp_inst(type *ty, pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next); @@ -164,6 +179,9 @@ protected: class cast_inst: public unary_inst{ using ic = llvm::Instruction::CastOps; +private: + std::string repr_impl() const; + public: typedef llvm::CastInst::CastOps op_t; @@ -219,6 +237,8 @@ class terminator_inst: public instruction{ // return instruction class return_inst: public terminator_inst{ +private: + std::string repr_impl() const { return "ret"; } return_inst(context &ctx, value *ret_val, instruction *next); public: @@ -234,6 +254,9 @@ public: // base branch instruction class branch_inst: public terminator_inst{ +private: + std::string repr_impl() const { return "br"; } + protected: using terminator_inst::terminator_inst; @@ -246,8 +269,9 @@ public: // conditional branch class cond_branch_inst: public branch_inst { - cond_branch_inst(basic_block *if_dst, basic_block *else_dst, value *cond, instruction *next); +private: friend class branch_inst; + cond_branch_inst(basic_block *if_dst, basic_block *else_dst, value *cond, instruction *next); public: basic_block *get_true_dest() { return (basic_block*)get_operand(0); } @@ -257,6 +281,7 @@ public: // unconditional branch class uncond_branch_inst: public branch_inst { +private: friend class branch_inst; uncond_branch_inst(basic_block *dst, instruction *next); @@ -269,6 +294,7 @@ public: class getelementptr_inst: public instruction{ private: + std::string repr_impl() const { return "getelementptr"; } getelementptr_inst(type *pointee_ty, value *ptr, const std::vector &idx, const std::string &name, instruction *next); private: @@ 
-297,6 +323,7 @@ private: class load_inst: public unary_inst{ private: + std::string repr_impl() const { return "load"; } load_inst(value *ptr, const std::string &name, instruction *next); private: @@ -312,6 +339,7 @@ public: class store_inst: public instruction{ private: + std::string repr_impl() const { return "store"; } store_inst(value *ptr, value *v, const std::string &name, instruction *next); public: @@ -330,36 +358,43 @@ public: class retile_inst: public unary_inst { protected: - retile_inst(value *arg, const std::vector &shapes, const std::string &name, instruction *next); + retile_inst(value *arg, const std::vector &shape_suffix, const std::string &name, instruction *next); + static std::string shape_suffix(ir::type* ty); }; // reshape class reshape_inst: public retile_inst { +private: using retile_inst::retile_inst; + std::string repr_impl() const { return "reshape" + shape_suffix(get_type()); } public: - static instruction* create(value *arg, const std::vector &shapes, + static instruction* create(value *arg, const std::vector &shape_suffix, const std::string &name = "", instruction *next = nullptr); }; // splat class splat_inst: public retile_inst { +private: using retile_inst::retile_inst; + std::string repr_impl() const { return "splat" + shape_suffix(get_type()); } public: - static instruction* create(value *arg, const std::vector &shapes, + static instruction* create(value *arg, const std::vector &shape_suffix, const std::string &name = "", instruction *next = nullptr); }; // broadcast class broadcast_inst: public retile_inst { +private: using retile_inst::retile_inst; + std::string repr_impl() const { return "broadcast" + shape_suffix(get_type()); } public: - static instruction* create(value *arg, const std::vector &shapes, + static instruction* create(value *arg, const std::vector &shape_suffix, const std::string &name = "", instruction *next = nullptr); }; @@ -374,7 +409,9 @@ protected: }; class get_global_range_inst: public builtin_inst { +private: get_global_range_inst(type *ty, unsigned axis, const std::string &name, instruction *next); + std::string repr_impl() const { return "get_global_range(" + std::to_string(axis_) + ")"; } public: static instruction* create(context &ctx, unsigned axis, unsigned size, @@ -387,7 +424,9 @@ private: }; class matmul_inst: public builtin_inst { +private: matmul_inst(value *A, value *B, value *C, const std::string &name, instruction *next); + std::string repr_impl() const { return "dot"; } public: static instruction* create(value *A, value *B, value *C, @@ -401,7 +440,9 @@ public: //===----------------------------------------------------------------------===// class copy_to_shared_inst: public unary_inst{ +private: using unary_inst::unary_inst; + std::string repr_impl() const { return "copy_to_shared"; } public: static copy_to_shared_inst* create(value *arg, const std::string &name = "", @@ -411,6 +452,7 @@ public: class barrier_inst: public instruction{ private: barrier_inst(context &ctx, const std::string &name, instruction *next); + std::string repr_impl() const { return "barrier"; } public: static barrier_inst* create(context &ctx, const std::string &name = "", @@ -418,7 +460,9 @@ public: }; class vectorize_inst: public unary_inst{ +private: using unary_inst::unary_inst; + std::string repr_impl() const { return "vectorize"; } public: static vectorize_inst* create(value *arg, const std::string &name = "", instruction *next = nullptr); diff --git a/include/ir/module.h b/include/ir/module.h index d016b68e3..3509210cd 100644 --- 
a/include/ir/module.h +++ b/include/ir/module.h @@ -42,9 +42,13 @@ public: // Setters void set_value(const std::string& name, basic_block* block, value *x); void set_value(const std::string& name, value* x); + void set_type(const std::string& name, basic_block* block, type* x); + void set_type(const std::string& name, type* x); // Getters value *get_value(const std::string& name, basic_block* block); value *get_value(const std::string& name); + type *get_type(const std::string& name, basic_block* block); + type *get_type(const std::string& name); // Seal block -- no more predecessors will be added void seal_block(basic_block *block); // Functions @@ -58,6 +62,7 @@ private: context &context_; builder builder_; std::map values_; + std::map types_; std::set sealed_blocks_; std::map> incomplete_phis_; functions_list_t functions_; diff --git a/include/ir/value.h b/include/ir/value.h index cce66949e..b183a189f 100644 --- a/include/ir/value.h +++ b/include/ir/value.h @@ -33,10 +33,10 @@ public: type* get_type() const { return ty_; } private: - type *ty_; std::string name_; protected: + type *ty_; std::set users_; }; diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 6f44f24c5..c4dec7feb 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -212,6 +212,7 @@ void function::bind_parameters(ir::module *mod, ir::function *fn) const{ if(id_i){ args[i]->set_name(id_i->name()); mod->set_value(id_i->name(), nullptr, args[i]); + mod->set_type(id_i->name(), nullptr, args[i]->get_type()); } } } @@ -262,7 +263,8 @@ ir::value* expression_statement::codegen(ir::module *mod) const{ ir::value* iteration_statement::codegen(ir::module *mod) const{ ir::builder &builder = mod->get_builder(); ir::context &ctx = mod->get_context(); - ir::function *fn = builder.get_insert_block()->get_parent(); + ir::basic_block *current_bb = builder.get_insert_block(); + ir::function *fn = current_bb->get_parent(); ir::basic_block *loop_bb = ir::basic_block::create(ctx, "loop", fn); init_->codegen(mod); builder.create_br(loop_bb); @@ -270,8 +272,10 @@ ir::value* iteration_statement::codegen(ir::module *mod) const{ statements_->codegen(mod); exec_->codegen(mod); ir::value *cond = stop_->codegen(mod); + ir::basic_block *stop_bb = builder.get_insert_block(); ir::basic_block *next_bb = ir::basic_block::create(ctx, "postloop", fn); builder.create_cond_br(cond, loop_bb, next_bb); + mod->seal_block(stop_bb); mod->seal_block(loop_bb); mod->seal_block(builder.get_insert_block()); mod->seal_block(next_bb); @@ -296,8 +300,7 @@ ir::value* selection_statement::codegen(ir::module* mod) const{ // Then builder.set_insert_point(then_bb); then_value_->codegen(mod); - if(else_value_) - builder.create_br(endif_bb); + builder.create_br(endif_bb); mod->seal_block(then_bb); // Else if(else_value_){ @@ -338,6 +341,7 @@ ir::value* initializer::codegen(ir::module * mod) const{ } value->set_name(name); mod->set_value(name, value); + mod->set_type(name, ty); return value; } @@ -527,16 +531,16 @@ ir::value *conditional_expression::codegen(ir::module *mod) const{ /* Assignment expression */ ir::value *assignment_expression::codegen(ir::module *mod) const{ - ir::value *rhs = rhs_->codegen(mod); + ir::value *rvalue = rvalue_->codegen(mod); if(auto *x = dynamic_cast(lvalue_)) - mod->set_value(x->id()->name(), rhs); + mod->set_value(x->id()->name(), rvalue); else if(auto* x = dynamic_cast(lvalue_)){ assert(x->get_op()==DEREF); assert(x->lvalue()); ir::value *ptr = x->lvalue()->codegen(mod); - rhs = mod->get_builder().create_store(ptr, rhs); + 
rvalue = mod->get_builder().create_store(ptr, rvalue); } - return rhs; + return rvalue; } /* Type name */ diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 56f583141..acf0c0329 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -73,6 +73,30 @@ phi_node* phi_node::create(type *ty, unsigned num_reserved, const std::string &n // binary_operator classes //===----------------------------------------------------------------------===// +std::string binary_operator::repr_impl() const { + switch(op_) { + case llop::Add : return "add"; + case llop::FAdd : return "fadd"; + case llop::Sub : return "sub"; + case llop::FSub : return "fsub"; + case llop::Mul : return "mul"; + case llop::FMul : return "fmul"; + case llop::UDiv : return "udiv"; + case llop::SDiv : return "sdiv"; + case llop::FDiv : return "fdiv"; + case llop::URem : return "urem"; + case llop::SRem : return "srem"; + case llop::FRem : return "frem"; + case llop::Shl : return "shl"; + case llop::LShr : return "lshr"; + case llop::AShr : return "ashr"; + case llop::And : return "and"; + case llop::Or : return "or"; + case llop::Xor : return "xor"; + default: throw std::runtime_error("unknown binary operator"); + } +} + binary_operator::binary_operator(op_t op, value *lhs, value *rhs, type *ty, const std::string &name, instruction *next) : instruction(ty, 2, name, next), op_(op){ set_operand(0, lhs); @@ -108,6 +132,38 @@ binary_operator *binary_operator::create_not(value *arg, const std::string &name //===----------------------------------------------------------------------===// // cmp_inst +std::string cmp_inst::repr_impl() const { + switch (pred_) { + case llop::FCMP_FALSE : return "false"; + case llop::FCMP_OEQ : return "fcmp_oeq"; + case llop::FCMP_OGT : return "fcmp_ogt"; + case llop::FCMP_OGE : return "fcmp_oge"; + case llop::FCMP_OLT : return "fcmp_olt"; + case llop::FCMP_OLE : return "fcmp_ole"; + case llop::FCMP_ONE : return "fcmp_one"; + case llop::FCMP_ORD : return "fcmp_ord"; + case llop::FCMP_UNO : return "fcmp_uno"; + case llop::FCMP_UEQ : return "fcmp_ueq"; + case llop::FCMP_UGT : return "fcmp_ugt"; + case llop::FCMP_UGE : return "fcmp_uge"; + case llop::FCMP_ULT : return "fcmp_ult"; + case llop::FCMP_ULE : return "fcmp_ule"; + case llop::FCMP_UNE : return "fcmp_une"; + case llop::FCMP_TRUE : return "true"; + case llop::ICMP_EQ : return "icmp_eq"; + case llop::ICMP_NE : return "icmp_ne"; + case llop::ICMP_UGT : return "icmp_ugt"; + case llop::ICMP_UGE : return "icmp_uge"; + case llop::ICMP_ULT : return "icmp_ult"; + case llop::ICMP_ULE : return "icmp_ule"; + case llop::ICMP_SGT : return "icmp_sgt"; + case llop::ICMP_SGE : return "icmp_sge"; + case llop::ICMP_SLT : return "icmp_slt"; + case llop::ICMP_SLE : return "icmp_sle"; + default: throw std::runtime_error("unreachable"); + } +} + cmp_inst::cmp_inst(type *ty, cmp_inst::pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next) : instruction(ty, 2, name, next), pred_(pred) { set_operand(0, lhs); @@ -123,11 +179,11 @@ type* cmp_inst::make_cmp_result_type(type *ty){ bool cmp_inst::is_fp_predicate(pred_t pred) { - return pred >= pcmp::FIRST_FCMP_PREDICATE && pred <= pcmp::LAST_FCMP_PREDICATE; + return pred >= llop::FIRST_FCMP_PREDICATE && pred <= llop::LAST_FCMP_PREDICATE; } bool cmp_inst::is_int_predicate(pred_t pred) { - return pred >= pcmp::FIRST_ICMP_PREDICATE && pred <= pcmp::LAST_ICMP_PREDICATE; + return pred >= llop::FIRST_ICMP_PREDICATE && pred <= llop::LAST_ICMP_PREDICATE; } // icmp_inst @@ -157,6 +213,24 @@ 
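
Note: the repr_impl overrides spread through include/ir/instructions.h exist to back the textual dump (tdl::ir::print) that matrix.cpp now calls. The printer's own body is not part of this excerpt; a plausible minimal version built on the same hooks, hypothetical apart from the accessors shown elsewhere in these patches, would be:

    // Sketch: dump every instruction of a module via the repr() hooks.
    void dump(tdl::ir::module &mod, std::ostream &os) {
      for (tdl::ir::function *fn : mod.get_function_list())
        for (tdl::ir::basic_block *bb : fn->blocks()) {
          os << bb->get_name() << ":\n";
          for (tdl::ir::instruction *i : bb->get_inst_list())
            os << "  " << i->get_name() << " = " << i->repr() << "\n";
        }
    }
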
unary_inst::unary_inst(type *ty, value *v, const std::string &name, instruction // cast_inst classes //===----------------------------------------------------------------------===// +std::string cast_inst::repr_impl() const { + switch (op_){ + case ic::Trunc: return "trunc"; + case ic::ZExt: return "zext"; + case ic::SExt: return "sext"; + case ic::FPTrunc: return "fp_trunc"; + case ic::FPExt: return "fp_ext"; + case ic::UIToFP: return "ui_to_fp"; + case ic::SIToFP: return "si_to_fp"; + case ic::FPToUI: return "fp_to_ui"; + case ic::FPToSI: return "fp_to_si"; + case ic::PtrToInt: return "ptr_to_int"; + case ic::IntToPtr: return "int_to_ptr"; + case ic::BitCast: return "bitcast"; + case ic::AddrSpaceCast: return "addr_space_cast"; + default: throw std::runtime_error("unreachable"); + } +} // TODO bool cast_inst::is_valid(op_t op, value *arg, type *ty) { return true; @@ -331,6 +405,18 @@ store_inst* store_inst::create(value *ptr, value *v, const std::string &name, in // retile_inst classes //===----------------------------------------------------------------------===// +std::string retile_inst::shape_suffix(ir::type* ty){ + std::string res = "["; + const auto& shapes = ty->get_tile_shapes(); + for(unsigned i = 0; i < shapes.size(); i++){ + res += std::to_string(ty->get_tile_shapes()[i]); + if(i < shapes.size() - 1) + res += ", "; + } + res += "]"; + return res; +} + retile_inst::retile_inst(value *arg, const std::vector &shapes, const std::string &name, instruction *next) : unary_inst(tile_type::get(arg->get_type()->get_scalar_ty(), shapes), arg, name, next) { } diff --git a/lib/ir/module.cpp b/lib/ir/module.cpp index 29636657f..57a143ffe 100644 --- a/lib/ir/module.cpp +++ b/lib/ir/module.cpp @@ -29,6 +29,14 @@ void module::set_value(const std::string& name, ir::value *value){ return set_value(name, builder_.get_insert_block(), value); } +void module::set_type(const std::string& name, ir::basic_block *block, ir::type *type){ + types_[val_key_t{name, block}] = type; +} + +void module::set_type(const std::string& name, ir::type *type){ + return set_type(name, builder_.get_insert_block(), type); +} + ir::phi_node* module::make_phi(ir::type *ty, unsigned num_values, ir::basic_block *block){ basic_block::iterator insert = block->get_first_non_phi(); if(insert != block->end()){ @@ -42,14 +50,14 @@ ir::phi_node* module::make_phi(ir::type *ty, unsigned num_values, ir::basic_bloc ir::value *module::try_remove_trivial_phis(ir::phi_node *&phi){ // find non-self references - std::vector non_self_ref; - std::copy_if(phi->ops().begin(), phi->ops().end(), std::back_inserter(non_self_ref), - [phi](ir::value* op){ return op != phi; }); + std::set non_self_ref; + std::copy_if(phi->ops().begin(), phi->ops().end(), std::inserter(non_self_ref, non_self_ref.begin()), + [phi](ir::value* op){ return op != phi && op; }); // non-trivial - if(non_self_ref.size() > 1) + if(non_self_ref.size() != 1) return phi; // unique value or self-reference - ir::value *same = non_self_ref[0]; + ir::value *same = *non_self_ref.begin(); std::set users = phi->get_users(); phi->replace_all_uses_with(same); phi->erase_from_parent(); @@ -57,9 +65,12 @@ ir::value *module::try_remove_trivial_phis(ir::phi_node *&phi){ if(auto *uphi = dynamic_cast(u)) if(uphi != phi) try_remove_trivial_phis(uphi); + if(auto *new_phi = dynamic_cast(same)) + return try_remove_trivial_phis(new_phi); return same; } + ir::value *module::add_phi_operands(const std::string& name, ir::phi_node *&phi){ // already initialized if(phi->get_num_operands()) @@ -75,9 +86,9 
@@ ir::value *module::add_phi_operands(const std::string& name, ir::phi_node *&phi) ir::value *module::get_value_recursive(const std::string& name, ir::basic_block *block) { ir::value *result; auto &preds = block->get_predecessors(); + if(block) if(sealed_blocks_.find(block) == sealed_blocks_.end()){ - ir::value *pred = get_value(name, preds.front()); - incomplete_phis_[block][name] = make_phi(pred->get_type(), 1, block); + incomplete_phis_[block][name] = make_phi(get_type(name, block), 1, block); result = (ir::value*)incomplete_phis_[block][name]; } else if(preds.size() <= 1){ @@ -85,8 +96,7 @@ ir::value *module::get_value_recursive(const std::string& name, ir::basic_block result = get_value(name, has_pred?preds.front():nullptr); } else{ - ir::value *pred = get_value(name, preds.front()); - result = make_phi(pred->get_type(), 1, block); + result = make_phi(get_type(name, block), 1, block); set_value(name, block, result); result = add_phi_operands(name, (ir::phi_node*&)result); } @@ -112,6 +122,21 @@ ir::value *module::get_value(const std::string& name) { return get_value(name, builder_.get_insert_block()); } +ir::type *module::get_type(const std::string &name, basic_block *block) { + val_key_t key(name, block); + if(types_.find(key) != types_.end()) + return types_.at(key); + assert(block); + const auto& predecessors = block->get_predecessors(); + if(predecessors.empty()) + return get_type(name, nullptr); + return get_type(name, predecessors[0]); +} + +ir::type *module::get_type(const std::string &name) { + return types_.at({name, builder_.get_insert_block()}); +} + void module::seal_block(ir::basic_block *block){ for(auto &x: incomplete_phis_[block]) add_phi_operands(x.first, x.second); From 90ec0ae2c02fd5ac74d96a094d352828ccb65e13 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 18 Feb 2019 22:54:08 -0500 Subject: [PATCH 080/494] [code generation] some more bugfixing with nested control flow --- examples/matrix.cpp | 10 +++++-- include/codegen/shared_copy.h | 8 +++++- include/ir/module.h | 2 +- lib/ast/lowering.cpp | 6 ++-- lib/codegen/allocation.cpp | 12 +++++--- lib/codegen/barriers.cpp | 4 +-- lib/codegen/buffer_info.cpp | 36 ++++++++++++++++-------- lib/codegen/selection.cpp | 53 ++++++++++++++++++++--------------- lib/codegen/shared_copy.cpp | 30 ++++++++------------ lib/ir/module.cpp | 12 ++++---- 10 files changed, 102 insertions(+), 71 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index c275b8a06..5c7aee602 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -63,6 +63,7 @@ void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K, int32 bound){\ @checka a = *pa;\ @checkb b = *pb;\ if(k <= 8){\ + @checka a = *pa;\ }\ }\ @checkc *pc = C;\ @@ -170,11 +171,10 @@ int main() { llvm::LLVMContext llvm_context; llvm::Module llvm_module("test", llvm_context); - tdl::ir::print(module, std::cout); // create passes - tdl::codegen::place_shared_copy shared; tdl::codegen::buffer_info_pass buffer_info; + tdl::codegen::place_shared_copy shared(&buffer_info); tdl::codegen::tune tune; tdl::codegen::liveness liveness(&buffer_info); tdl::codegen::allocation allocation(&liveness, &buffer_info); @@ -211,10 +211,14 @@ int main() { if(errors.size()) exit(EXIT_FAILURE); + // print + // run passes - shared.run(module); + tdl::ir::print(module, std::cout); buffer_info.run(module); + shared.run(module); liveness.run(module); + tdl::ir::print(module, std::cout); allocation.run(); barriers.run(module); vectorize.run(module); diff --git 
a/include/codegen/shared_copy.h b/include/codegen/shared_copy.h index 927915805..be043b18c 100644 --- a/include/codegen/shared_copy.h +++ b/include/codegen/shared_copy.h @@ -15,6 +15,8 @@ namespace ir { namespace codegen{ +class buffer_info_pass; + class place_shared_copy { private: typedef std::pair interval_t; @@ -22,10 +24,14 @@ private: private: bool intersect(const interval_vec_t &I, interval_t i); - void add_copies(ir::value *x, ir::builder &builder); + void add_copy(ir::value *x, ir::builder &builder); public: + place_shared_copy(buffer_info_pass *info): info_(info) { } void run(ir::module &mod); + +private: + buffer_info_pass *info_; }; diff --git a/include/ir/module.h b/include/ir/module.h index 3509210cd..a5769c05f 100644 --- a/include/ir/module.h +++ b/include/ir/module.h @@ -30,7 +30,7 @@ public: private: phi_node *make_phi(type *ty, unsigned num_values, basic_block *block); - value *try_remove_trivial_phis(ir::phi_node *&phi); + value *try_remove_trivial_phis(ir::phi_node *&phi, ir::value** pre_user); value *add_phi_operands(const std::string& name, phi_node *&phi); value *get_value_recursive(const std::string& name, basic_block *block); void push_function(function *fn) { functions_.push_back(fn); } diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index c4dec7feb..231c83930 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -292,6 +292,10 @@ ir::value* selection_statement::codegen(ir::module* mod) const{ ir::basic_block *then_bb = ir::basic_block::create(ctx, "then", fn); ir::basic_block *else_bb = else_value_?ir::basic_block::create(ctx, "else", fn):nullptr; ir::basic_block *endif_bb = ir::basic_block::create(ctx, "endif", fn); + mod->seal_block(then_bb); + if(else_value_) + mod->seal_block(else_bb); + // Branch if(else_value_) builder.create_cond_br(cond, then_bb, else_bb); @@ -301,13 +305,11 @@ ir::value* selection_statement::codegen(ir::module* mod) const{ builder.set_insert_point(then_bb); then_value_->codegen(mod); builder.create_br(endif_bb); - mod->seal_block(then_bb); // Else if(else_value_){ builder.set_insert_point(else_bb); else_value_->codegen(mod); builder.create_br(endif_bb); - mod->seal_block(else_bb); } // Endif builder.set_insert_point(endif_bb); diff --git a/lib/codegen/allocation.cpp b/lib/codegen/allocation.cpp index c4957e477..34ba1e59a 100644 --- a/lib/codegen/allocation.cpp +++ b/lib/codegen/allocation.cpp @@ -102,10 +102,14 @@ void allocation::run(){ for(ir::value *y: interferences[x]) Adj = std::max(Adj, starts[y] + get_num_bytes(y)); offsets_[x] = starts[x] + colors[x] * Adj; - if(auto *phi = dynamic_cast(x)) - for(ir::value *px: phi->ops()){ - if(offsets_.find(px) == offsets_.end()) - offsets_[px] = offsets_[x]; + if(buffer_info_->is_double(x)){ + ir::phi_node *phi = (ir::phi_node*)x; + for(unsigned i = 0; i < phi->get_num_incoming(); i++){ + ir::value *inc_val = phi->get_incoming_value(i); + assert(offsets_.find(inc_val) == offsets_.end()); + offsets_[inc_val] = offsets_[phi]; + std::cout << x->get_name() << " " << inc_val->get_name() << " " << inc_val << std::endl; + } } } diff --git a/lib/codegen/barriers.cpp b/lib/codegen/barriers.cpp index f21c1e1d6..0466d5ef3 100644 --- a/lib/codegen/barriers.cpp +++ b/lib/codegen/barriers.cpp @@ -26,7 +26,7 @@ bool barriers::intersect(const interval_vec_t &X, const interval_vec_t &Y) { } void barriers::add_reference(ir::value *v, interval_vec_t &res){ - if(buffer_info_->is_shared(v)){ + if(dynamic_cast(v)){ unsigned offset = alloc_->get_offset(v); unsigned num_bytes = 
alloc_->get_num_bytes(v); res.push_back(interval_t(offset, offset + num_bytes)); @@ -51,7 +51,7 @@ void barriers::insert_barrier(ir::instruction *instr, ir::builder &builder) { builder.create_barrier(); } } - else{ + else { builder.set_insert_point(instr); builder.create_barrier(); } diff --git a/lib/codegen/buffer_info.cpp b/lib/codegen/buffer_info.cpp index 6be951a22..435f8ea8e 100644 --- a/lib/codegen/buffer_info.cpp +++ b/lib/codegen/buffer_info.cpp @@ -12,25 +12,37 @@ namespace codegen{ // run pass on module void buffer_info_pass::run(ir::module &mod) { + // Find which buffers are shared + for(ir::function *fn: mod.get_function_list()) + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *i: block->get_inst_list()) + if(dynamic_cast(i)){ + shared_.insert(i->get_operand(0)); + shared_.insert(i->get_operand(1)); + } + + // Handles phi nodes for(ir::function *fn: mod.get_function_list()) for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i: block->get_inst_list()) { if(!i->get_type()->is_tile_ty()) continue; // handle phi - if(auto *phi = dynamic_cast(i)){ + if(auto *phi = dynamic_cast(i)) + if(is_shared(phi)){ // determine if the value is in shared memory - bool is_shared = true; bool is_double = false; for(unsigned n = 0; n < phi->get_num_incoming(); n++){ - ir::value *inc_val = phi->get_incoming_value(n); - ir::value *inc_block = phi->get_incoming_block(n); - is_shared = is_shared && dynamic_cast(inc_val); - is_double = is_double || inc_block == phi->get_parent(); + ir::basic_block *inc_block = phi->get_incoming_block(n); + ir::value *terminator = inc_block->get_inst_list().back(); + if(auto *br = dynamic_cast(terminator)) + is_double = is_double || br->get_true_dest() == phi->get_parent() + || br->get_false_dest() == phi->get_parent(); + else if(auto *br = dynamic_cast(terminator)) + is_double = is_double || br->get_dest() == phi->get_parent(); + else + throw std::runtime_error("unreachable"); } - // add to shared - if(is_shared) - shared_.insert(phi); // add to double-buffered if(is_double) double_.insert(phi); @@ -41,10 +53,10 @@ void buffer_info_pass::run(ir::module &mod) { refs_[inc_val] = phi; } } - // handle shared copy - if(auto *copy = dynamic_cast(i)) - shared_.insert(copy); } + + for(auto &ref: refs_) + shared_.insert(ref.first); } // query double-buffered status diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 1b9116e51..b7e3461d8 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -299,6 +299,7 @@ std::vector delinearize(Value *trailing, std::vector &shapes, } void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { + std::cout << "name: " << v->get_name() << std::endl; const auto& shapes = v->get_type()->get_tile_shapes(); size_t dim = shapes.size(); std::vector contiguous(dim); @@ -354,7 +355,7 @@ void selection::create_grids(std::vector &grids, bind_references(op); // bind const auto& shapes = v->get_type()->get_tile_shapes(); - if(buffer_info_->is_shared(v)) + if(dynamic_cast(v) || buffer_info_->is_double(v)) return; for(size_t d = 0; d < shapes.size(); d++){ if(shapes[d] == 1) @@ -388,7 +389,7 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, const auto& shapes = v->get_type()->get_tile_shapes(); Type* ty = llvm_type(v->get_type()->get_scalar_ty(), ctx); // create shared tile - if(buffer_info_->is_shared(v)){ + if(dynamic_cast(v) || (buffer_info_->is_double(v))){ // shared copy PointerType *ptr_ty = 
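
Note: the buffer_info rewrite above changes how double buffering is detected. Instead of asking whether a phi has an incoming value from its own block, it asks whether some predecessor's terminator can branch back to the phi's block, i.e. whether the incoming edge is a loop latch. Restated with the dynamic_cast targets written out (the text above elides them); patch 081 later declares this helper as buffer_info_pass::is_loop_latch:

    // Sketch: is the edge from `terminator`'s block into phi's block
    // a loop latch?
    bool is_loop_latch(ir::phi_node *phi, ir::value *terminator) {
      if (auto *br = dynamic_cast<ir::cond_branch_inst*>(terminator))
        return br->get_true_dest()  == phi->get_parent()
            || br->get_false_dest() == phi->get_parent();
      if (auto *br = dynamic_cast<ir::uncond_branch_inst*>(terminator))
        return br->get_dest() == phi->get_parent();
      throw std::runtime_error("unreachable");
    }
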
ty->getPointerTo(sh_mem_ptr->getType()->getPointerAddressSpace()); if(dynamic_cast(v)) { @@ -478,6 +479,7 @@ void selection::init_grids(ir::function *fn, IRBuilder<> &builder, Value *sh_mem void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> &builder) { + std::cout << "lowering " << ins->get_name() << std::endl; BasicBlock *block = builder.GetInsertBlock(); Module *module = block->getModule(); Function *function = block->getParent(); @@ -602,7 +604,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & ti->set_value(idx, in->get_value(idx)); }); } - else if(buffer_info_->is_shared(ins)) + else if(dynamic_cast(ins) || (buffer_info_->is_double(ins))) return; // matrix multiplication else if(dynamic_cast(ins)) { @@ -694,13 +696,15 @@ void selection::run(ir::module &src, Module &dst){ std::map last_block; // iterate through block for(ir::basic_block *block: fn->blocks()) { + std::cout << "block: " << block->get_name() << std::endl; BasicBlock *parent = (BasicBlock*)vmap_[block]; dst_builder.SetInsertPoint(parent); for(ir::instruction *i: block->get_inst_list()){ - if(dynamic_cast(i)) + if(dynamic_cast(i) && !parent->empty()){ dst_builder.SetInsertPoint(&*parent->getFirstInsertionPt()); + } lower_instruction(i, dst_builder); - if(dynamic_cast(i)) + if(dynamic_cast(i) && !parent->empty()) dst_builder.SetInsertPoint(parent); last_block[block] = dst_builder.GetInsertBlock(); } @@ -709,7 +713,7 @@ void selection::run(ir::module &src, Module &dst){ for(ir::basic_block *block: fn->blocks()) for(ir::instruction *inst: block->get_inst_list()) if(auto *phi = dynamic_cast(inst)){ - if(buffer_info_->is_shared(phi)) { + if(buffer_info_->is_double(phi)) { PHINode *ptr = (PHINode*)((shared_tile*)tmap_.at(phi))->get_pointer(); PHINode *offset = (PHINode*)((shared_tile*)tmap_.at(phi))->get_offset(); for(unsigned n = 0; n < phi->get_num_incoming(); n++){ @@ -728,25 +732,28 @@ void selection::run(ir::module &src, Module &dst){ } ptr->addIncoming(inc_shared->get_pointer(), llvm_inc_block); } - continue; } - for(unsigned n = 0; n < phi->get_num_incoming(); n++){ - ir::value *inc_val = phi->get_incoming_value(n); - ir::basic_block *inc_block = phi->get_incoming_block(n); - BasicBlock *llvm_inc_block = last_block.at(inc_block); - if(phi->get_type()->is_tile_ty()) { - distributed_tile *phi_tile = (distributed_tile*)tmap_.at(phi); - distributed_tile *inc_tile = (distributed_tile*)tmap_.at(inc_val); - phi_tile->for_each([&](indices_t idx){ - PHINode *llvm_phi = (PHINode*)phi_tile->get_value(idx); - Value *llvm_inc_val = inc_tile->get_value(idx); + else { + std::cout << "phi: " << phi->get_name() << std::endl; + for(unsigned n = 0; n < phi->get_num_incoming(); n++){ + ir::value *inc_val = phi->get_incoming_value(n); + ir::basic_block *inc_block = phi->get_incoming_block(n); + BasicBlock *llvm_inc_block = last_block.at(inc_block); + std::cout << "incoming block: " << inc_block->get_name() << " " << llvm_inc_block->getName().str() << std::endl; + if(phi->get_type()->is_tile_ty()) { + distributed_tile *phi_tile = (distributed_tile*)tmap_.at(phi); + distributed_tile *inc_tile = (distributed_tile*)tmap_.at(inc_val); + phi_tile->for_each([&](indices_t idx){ + PHINode *llvm_phi = (PHINode*)phi_tile->get_value(idx); + Value *llvm_inc_val = inc_tile->get_value(idx); + llvm_phi->addIncoming(llvm_inc_val, llvm_inc_block); + }); + } + else { + PHINode *llvm_phi = (PHINode*)vmap_.at(phi); + Value *llvm_inc_val = vmap_.at(inc_val); llvm_phi->addIncoming(llvm_inc_val, 
llvm_inc_block); - }); - } - else { - PHINode *llvm_phi = (PHINode*)vmap_.at(phi); - Value *llvm_inc_val = vmap_.at(inc_val); - llvm_phi->addIncoming(llvm_inc_val, llvm_inc_block); + } } } } diff --git a/lib/codegen/shared_copy.cpp b/lib/codegen/shared_copy.cpp index 07d6a5c29..f759003bd 100644 --- a/lib/codegen/shared_copy.cpp +++ b/lib/codegen/shared_copy.cpp @@ -1,5 +1,6 @@ #include #include "codegen/shared_copy.h" +#include "codegen/buffer_info.h" #include "ir/module.h" #include "ir/function.h" #include "ir/basic_block.h" @@ -9,21 +10,16 @@ namespace tdl { namespace codegen{ -void place_shared_copy::add_copies(ir::value *x, ir::builder &builder) { - if(auto *phi = dynamic_cast(x)) { - for(auto *op: phi->ops()) - add_copies(op, builder); - } - else { - if(auto *i = dynamic_cast(x)){ - ir::basic_block* block = i->get_parent(); - auto it = std::find(block->begin(), block->end(), i); - builder.set_insert_point(++it); - } - ir::instruction *rx = (ir::instruction*)builder.create_copy_to_shared(x); - x->replace_all_uses_with(rx); - rx->set_operand(0, x); +void place_shared_copy::add_copy(ir::value *x, ir::builder &builder) { + if(auto *i = dynamic_cast(x)){ + ir::basic_block* block = i->get_parent(); + std::cout << "adding copy: " << x->get_name() << " " << block->get_name() << std::endl; + auto it = std::find(block->begin(), block->end(), i); + builder.set_insert_point(++it); } + ir::instruction *rx = (ir::instruction*)builder.create_copy_to_shared(x); + x->replace_all_uses_with(rx); + rx->set_operand(0, x); } void place_shared_copy::run(ir::module &mod) { @@ -31,10 +27,8 @@ void place_shared_copy::run(ir::module &mod) { for(ir::function *fn: mod.get_function_list()) for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i: block->get_inst_list()) - if(dynamic_cast(i)){ - add_copies(i->get_operand(0), builder); - add_copies(i->get_operand(1), builder); - } + if(info_->is_shared(i) && !info_->is_double(i)) + add_copy(i, builder); } } diff --git a/lib/ir/module.cpp b/lib/ir/module.cpp index 57a143ffe..fe6f0d48f 100644 --- a/lib/ir/module.cpp +++ b/lib/ir/module.cpp @@ -48,7 +48,7 @@ ir::phi_node* module::make_phi(ir::type *ty, unsigned num_values, ir::basic_bloc return res; } -ir::value *module::try_remove_trivial_phis(ir::phi_node *&phi){ +ir::value *module::try_remove_trivial_phis(ir::phi_node *&phi, ir::value** pre_user){ // find non-self references std::set non_self_ref; std::copy_if(phi->ops().begin(), phi->ops().end(), std::inserter(non_self_ref, non_self_ref.begin()), @@ -61,12 +61,12 @@ ir::value *module::try_remove_trivial_phis(ir::phi_node *&phi){ std::set users = phi->get_users(); phi->replace_all_uses_with(same); phi->erase_from_parent(); + if(pre_user) + *pre_user = same; for(ir::user* u: users) if(auto *uphi = dynamic_cast(u)) if(uphi != phi) - try_remove_trivial_phis(uphi); - if(auto *new_phi = dynamic_cast(same)) - return try_remove_trivial_phis(new_phi); + try_remove_trivial_phis(uphi, &same); return same; } @@ -80,10 +80,11 @@ ir::value *module::add_phi_operands(const std::string& name, ir::phi_node *&phi) ir::value *value = get_value(name, pred); phi->add_incoming(value, pred); } - return try_remove_trivial_phis(phi); + return try_remove_trivial_phis(phi, nullptr); } ir::value *module::get_value_recursive(const std::string& name, ir::basic_block *block) { + std::cout << "getting value " << name << std::endl; ir::value *result; auto &preds = block->get_predecessors(); if(block) @@ -141,6 +142,7 @@ void module::seal_block(ir::basic_block *block){ for(auto &x: 
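
Note: make_phi / add_phi_operands / try_remove_trivial_phis / seal_block are an implementation of on-the-fly SSA construction in the style of Braun et al., "Simple and Efficient Construction of Static Single Assignment Form" (CC 2013). The driving read, compressed into one illustrative function (simplified types; `lookup`/`define` are hypothetical helpers over the module's values_, sealed_blocks_ and incomplete_phis_ maps; the entry block, which has no predecessors, is covered by parameter binding):

    // Sketch: resolve `name` in `bb`, inserting phis on demand.
    value *read_variable(const std::string &name, basic_block *bb) {
      if (value *local = lookup(name, bb))
        return local;                          // local definition wins
      if (!sealed.count(bb)) {                 // preds not final yet:
        phi_node *phi = make_phi(get_type(name, bb), bb);
        incomplete[bb][name] = phi;            // operands added on seal
        return define(name, bb, phi);
      }
      if (bb->get_predecessors().size() == 1)  // single pred: pass through
        return define(name, bb,
                      read_variable(name, bb->get_predecessors()[0]));
      phi_node *phi = make_phi(get_type(name, bb), bb);
      define(name, bb, phi);                   // break cycles first
      for (basic_block *p : bb->get_predecessors())
        phi->add_incoming(read_variable(name, p), p);
      return try_remove_trivial_phis(phi);
    }

The per-name type map added in patch 079 exists for exactly this path: a placeholder phi created in an unsealed block has no operands yet, so its type cannot be derived from an incoming value and must come from get_type(name, block) instead.
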
incomplete_phis_[block]) add_phi_operands(x.first, x.second); sealed_blocks_.insert(block); + incomplete_phis_[block].clear(); } /* functions */ From 5618a15dc10131ca311a4edde9932ea0036bd62f Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 20 Feb 2019 22:55:20 -0500 Subject: [PATCH 081/494] [code generation] more bugfixes in control flow --- examples/matrix.cpp | 5 ++--- include/ast/ast.h | 12 ++++++++++ include/ast/parser.y | 6 ++++- include/ast/scanner.l | 1 + include/codegen/buffer_info.h | 3 +++ include/ir/module.h | 18 ++++++++++++++- lib/ast/lowering.cpp | 41 +++++++++++++++++++++++++---------- lib/codegen/allocation.cpp | 2 -- lib/codegen/buffer_info.cpp | 19 +++++++++------- lib/codegen/selection.cpp | 16 ++++++-------- lib/codegen/shared_copy.cpp | 1 - lib/codegen/tune.cpp | 1 - lib/ir/module.cpp | 20 ++++++++++++----- 13 files changed, 103 insertions(+), 42 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 5c7aee602..df239e6ff 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -62,8 +62,8 @@ void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K, int32 bound){\ pb = pb + 8*K;\ @checka a = *pa;\ @checkb b = *pb;\ - if(k <= 8){\ - @checka a = *pa;\ + if(k > 8){\ + continue;\ }\ }\ @checkc *pc = C;\ @@ -218,7 +218,6 @@ int main() { buffer_info.run(module); shared.run(module); liveness.run(module); - tdl::ir::print(module, std::cout); allocation.run(); barriers.run(module); vectorize.run(module); diff --git a/include/ast/ast.h b/include/ast/ast.h index 4a9889093..529c4b01b 100644 --- a/include/ast/ast.h +++ b/include/ast/ast.h @@ -374,6 +374,18 @@ private: const node *statements_; }; +// Jump + +class jump_statement: public statement{ +public: + using statement::statement; +}; + +class continue_statement: public jump_statement{ +public: + ir::value* codegen(ir::module *mod) const; +}; + class no_op: public statement { }; // Types diff --git a/include/ast/parser.y b/include/ast/parser.y index 905541d70..43c530e12 100644 --- a/include/ast/parser.y +++ b/include/ast/parser.y @@ -48,7 +48,7 @@ TYPE_T get_type_spec(node *op) { return ((token*)op)->type; } %token SUB_ASSIGN LEFT_ASSIGN RIGHT_ASSIGN AND_ASSIGN %token XOR_ASSIGN OR_ASSIGN TYPE_NAME %token VOID UINT1 UINT8 UINT16 UINT32 UINT64 INT1 INT8 INT16 INT32 INT64 FP32 FP64 -%token IF ELSE FOR +%token IF ELSE FOR CONTINUE %token NEWAXIS ELLIPSIS AT %token GET_GLOBAL_RANGE DOT @@ -266,6 +266,7 @@ statement | expression_statement { $$ = $1; } | selection_statement { $$ = $1; } | iteration_statement { $$ = $1; } + | jump_statement { $$ = $1; } ; compound_statement @@ -300,6 +301,9 @@ iteration_statement : FOR '(' expression_statement expression_statement expression ')' statement { $$ = new iteration_statement($3, $4, $5, $7); } ; +jump_statement + : CONTINUE ';' { $$ = new continue_statement(); } +; /* -------------------------- */ /* Declarator */ diff --git a/include/ast/scanner.l b/include/ast/scanner.l index 8e2d89f14..80da95dad 100644 --- a/include/ast/scanner.l +++ b/include/ast/scanner.l @@ -37,6 +37,7 @@ int comment(); "..." 
{ count(); return(ELLIPSIS); } "get_global_range" { count(); return GET_GLOBAL_RANGE; } "dot" { count(); return DOT;} +"continue" { count(); return(CONTINUE); } {L}({L}|{D})* { count(); return(check_type()); } diff --git a/include/codegen/buffer_info.h b/include/codegen/buffer_info.h index 2cce9d829..0d22608c2 100644 --- a/include/codegen/buffer_info.h +++ b/include/codegen/buffer_info.h @@ -9,6 +9,7 @@ namespace tdl { namespace ir { class module; class value; + class phi_node; } namespace codegen{ @@ -19,8 +20,10 @@ public: // queries bool is_double(ir::value *x); bool is_shared(ir::value *x); + bool is_loop_latch(ir::phi_node *phi, ir::value *terminator); ir::value *get_reference(ir::value *x); + private: std::set shared_; std::set double_; diff --git a/include/ir/module.h b/include/ir/module.h index a5769c05f..347178fda 100644 --- a/include/ir/module.h +++ b/include/ir/module.h @@ -4,9 +4,17 @@ #include #include #include +#include #include "builder.h" namespace tdl{ + +namespace ast{ + +class iteration_statement; + +} + namespace ir{ class basic_block; @@ -27,10 +35,14 @@ class module { public: typedef std::map symbols_map_t; typedef std::vector functions_list_t; + struct current_iteration_info_t{ + ast::iteration_statement *statement; + basic_block *block; + }; private: phi_node *make_phi(type *ty, unsigned num_values, basic_block *block); - value *try_remove_trivial_phis(ir::phi_node *&phi, ir::value** pre_user); + value *try_remove_trivial_phis(ir::phi_node *&phi, value **pre_user); value *add_phi_operands(const std::string& name, phi_node *&phi); value *get_value_recursive(const std::string& name, basic_block *block); void push_function(function *fn) { functions_.push_back(fn); } @@ -44,11 +56,13 @@ public: void set_value(const std::string& name, value* x); void set_type(const std::string& name, basic_block* block, type* x); void set_type(const std::string& name, type* x); + void set_continue_fn(std::function fn); // Getters value *get_value(const std::string& name, basic_block* block); value *get_value(const std::string& name); type *get_type(const std::string& name, basic_block* block); type *get_type(const std::string& name); + std::function get_continue_fn(); // Seal block -- no more predecessors will be added void seal_block(basic_block *block); // Functions @@ -67,6 +81,8 @@ private: std::map> incomplete_phis_; functions_list_t functions_; symbols_map_t symbols_; + std::function continue_fn_; + std::map current_phi_; }; } diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 231c83930..f11c40955 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -141,6 +141,11 @@ void node::implicit_broadcast(ir::module *mod, ir::value *&lhs, ir::value *&rhs) rhs = builder.create_broadcast(rhs, shapes); } +/* Helper */ +inline bool is_terminator(ir::value* x) { + return x && dynamic_cast(x); +} + /* Translation unit */ ir::value* translation_unit::codegen(ir::module *mod) const{ decls_->codegen(mod); @@ -242,8 +247,13 @@ ir::value* function_definition::codegen(ir::module *mod) const{ ir::value* compound_statement::codegen(ir::module* mod) const{ if(decls_) decls_->codegen(mod); - if(statements_) - statements_->codegen(mod); + if(statements_){ + for(statement *stmt: statements_->values()){ + ir::value *current = stmt->codegen(mod); + if(is_terminator(current)) + return current; + } + } return nullptr; } @@ -266,15 +276,18 @@ ir::value* iteration_statement::codegen(ir::module *mod) const{ ir::basic_block *current_bb = builder.get_insert_block(); ir::function *fn = 
current_bb->get_parent(); ir::basic_block *loop_bb = ir::basic_block::create(ctx, "loop", fn); + ir::basic_block *next_bb = ir::basic_block::create(ctx, "postloop", fn); + mod->set_continue_fn([&](){ + exec_->codegen(mod); + ir::value *cond = stop_->codegen(mod); + return builder.create_cond_br(cond, loop_bb, next_bb); + }); init_->codegen(mod); builder.create_br(loop_bb); builder.set_insert_point(loop_bb); - statements_->codegen(mod); - exec_->codegen(mod); - ir::value *cond = stop_->codegen(mod); + if(!is_terminator(statements_->codegen(mod))) + mod->get_continue_fn()(); ir::basic_block *stop_bb = builder.get_insert_block(); - ir::basic_block *next_bb = ir::basic_block::create(ctx, "postloop", fn); - builder.create_cond_br(cond, loop_bb, next_bb); mod->seal_block(stop_bb); mod->seal_block(loop_bb); mod->seal_block(builder.get_insert_block()); @@ -303,16 +316,22 @@ ir::value* selection_statement::codegen(ir::module* mod) const{ builder.create_cond_br(cond, then_bb, endif_bb); // Then builder.set_insert_point(then_bb); - then_value_->codegen(mod); - builder.create_br(endif_bb); + if(!is_terminator(then_value_->codegen(mod))) + builder.create_br(endif_bb); // Else if(else_value_){ builder.set_insert_point(else_bb); - else_value_->codegen(mod); - builder.create_br(endif_bb); + if(!is_terminator(else_value_->codegen(mod))) + builder.create_br(endif_bb); } // Endif builder.set_insert_point(endif_bb); + return nullptr; +} + +/* Continue statement */ +ir::value* continue_statement::codegen(ir::module *mod) const{ + return mod->get_continue_fn()(); } /* Declaration */ diff --git a/lib/codegen/allocation.cpp b/lib/codegen/allocation.cpp index 34ba1e59a..696b46cb9 100644 --- a/lib/codegen/allocation.cpp +++ b/lib/codegen/allocation.cpp @@ -106,9 +106,7 @@ void allocation::run(){ ir::phi_node *phi = (ir::phi_node*)x; for(unsigned i = 0; i < phi->get_num_incoming(); i++){ ir::value *inc_val = phi->get_incoming_value(i); - assert(offsets_.find(inc_val) == offsets_.end()); offsets_[inc_val] = offsets_[phi]; - std::cout << x->get_name() << " " << inc_val->get_name() << " " << inc_val << std::endl; } } } diff --git a/lib/codegen/buffer_info.cpp b/lib/codegen/buffer_info.cpp index 435f8ea8e..4d20fa6e9 100644 --- a/lib/codegen/buffer_info.cpp +++ b/lib/codegen/buffer_info.cpp @@ -11,6 +11,16 @@ namespace codegen{ // run pass on module +bool buffer_info_pass::is_loop_latch(ir::phi_node *phi, ir::value *terminator){ + if(auto *br = dynamic_cast(terminator)) + return br->get_true_dest() == phi->get_parent() + || br->get_false_dest() == phi->get_parent(); + else if(auto *br = dynamic_cast(terminator)) + return br->get_dest() == phi->get_parent(); + else + throw std::runtime_error("unreachable"); +} + void buffer_info_pass::run(ir::module &mod) { // Find which buffers are shared for(ir::function *fn: mod.get_function_list()) @@ -35,13 +45,7 @@ void buffer_info_pass::run(ir::module &mod) { for(unsigned n = 0; n < phi->get_num_incoming(); n++){ ir::basic_block *inc_block = phi->get_incoming_block(n); ir::value *terminator = inc_block->get_inst_list().back(); - if(auto *br = dynamic_cast(terminator)) - is_double = is_double || br->get_true_dest() == phi->get_parent() - || br->get_false_dest() == phi->get_parent(); - else if(auto *br = dynamic_cast(terminator)) - is_double = is_double || br->get_dest() == phi->get_parent(); - else - throw std::runtime_error("unreachable"); + is_double = is_double || is_loop_latch(phi, terminator); } // add to double-buffered if(is_double) @@ -49,7 +53,6 @@ void 
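The iteration_statement lowering above registers a callback with the module before emitting the loop body, and continue_statement::codegen simply invokes it, so a continue emits exactly the increment-and-branch tail the loop would otherwise have emitted at its end. A minimal sketch of the pattern, with illustrative names only:

    #include <functional>
    #include <iostream>

    int main() {
      std::function<void()> continue_fn;
      // registered by the loop before its body is lowered
      continue_fn = [] {
        std::cout << "emit: k = k - 8; cond-br k > 0 ? loop : postloop\n";
      };
      // a `continue;` in the body reduces to calling the stored function,
      // so the same tail is emitted whether the body falls through or jumps
      continue_fn();
    }

The design choice here is that the loop owns the exit condition, so any early jump out of the body reuses one lowering path instead of duplicating branch logic per statement.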
buffer_info_pass::run(ir::module &mod) { // set references of input for(unsigned n = 0; n < phi->get_num_incoming(); n++){ ir::value *inc_val = phi->get_incoming_value(n); - assert(refs_[inc_val] == nullptr); refs_[inc_val] = phi; } } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index b7e3461d8..11bcf2738 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -299,7 +299,6 @@ std::vector delinearize(Value *trailing, std::vector &shapes, } void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { - std::cout << "name: " << v->get_name() << std::endl; const auto& shapes = v->get_type()->get_tile_shapes(); size_t dim = shapes.size(); std::vector contiguous(dim); @@ -406,8 +405,6 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, unsigned id_pre = 0, id_loop = 1; if(phi->get_incoming_block(0) == phi->get_parent()) std::swap(id_pre, id_loop); - ir::value *pre_value = phi->get_incoming_value(id_pre); - ir::value *loop_value = phi->get_incoming_value(id_loop); if(parent->empty()) builder.SetInsertPoint(parent); else @@ -419,8 +416,13 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, pre_ptr = builder.CreateBitCast(pre_ptr, ptr->getType()); Value *next_ptr = builder.CreateGEP(ptr, offset); tmap_.insert({phi, new shared_tile(ty, shapes, ptr, builder, offset)}); - tmap_.insert({pre_value, new shared_tile(ty, shapes, pre_ptr, builder)}); - tmap_.insert({loop_value, new shared_tile(ty, shapes, next_ptr, builder)}); + for(unsigned i = 0; i < phi->get_num_incoming(); i++) { + ir::basic_block* inc_block = phi->get_incoming_block(i); + ir::value* inc_value = phi->get_incoming_value(i); + ir::value* terminator = inc_block->get_inst_list().back(); + bool is_loop_latch = buffer_info_->is_loop_latch(phi, terminator); + tmap_.insert({inc_value, new shared_tile(ty, shapes, is_loop_latch?next_ptr:pre_ptr, builder)}); + } } else throw std::runtime_error("unknown shared memory tile"); @@ -479,7 +481,6 @@ void selection::init_grids(ir::function *fn, IRBuilder<> &builder, Value *sh_mem void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> &builder) { - std::cout << "lowering " << ins->get_name() << std::endl; BasicBlock *block = builder.GetInsertBlock(); Module *module = block->getModule(); Function *function = block->getParent(); @@ -696,7 +697,6 @@ void selection::run(ir::module &src, Module &dst){ std::map last_block; // iterate through block for(ir::basic_block *block: fn->blocks()) { - std::cout << "block: " << block->get_name() << std::endl; BasicBlock *parent = (BasicBlock*)vmap_[block]; dst_builder.SetInsertPoint(parent); for(ir::instruction *i: block->get_inst_list()){ @@ -734,12 +734,10 @@ void selection::run(ir::module &src, Module &dst){ } } else { - std::cout << "phi: " << phi->get_name() << std::endl; for(unsigned n = 0; n < phi->get_num_incoming(); n++){ ir::value *inc_val = phi->get_incoming_value(n); ir::basic_block *inc_block = phi->get_incoming_block(n); BasicBlock *llvm_inc_block = last_block.at(inc_block); - std::cout << "incoming block: " << inc_block->get_name() << " " << llvm_inc_block->getName().str() << std::endl; if(phi->get_type()->is_tile_ty()) { distributed_tile *phi_tile = (distributed_tile*)tmap_.at(phi); distributed_tile *inc_tile = (distributed_tile*)tmap_.at(inc_val); diff --git a/lib/codegen/shared_copy.cpp b/lib/codegen/shared_copy.cpp index f759003bd..60c31199f 100644 --- a/lib/codegen/shared_copy.cpp +++ b/lib/codegen/shared_copy.cpp @@ 
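The create_tile change above wires double buffering for shared-memory tiles: incoming values that reach the phi through a loop latch are materialized into the next-iteration buffer (next_ptr), while preheader values go to the initial buffer (pre_ptr). A toy illustration of the resulting pointer swap, with made-up buffer sizes:

    #include <cstdio>
    #include <utility>

    int main() {
      float buffer[2][256];
      float *pre = buffer[0];   // filled from the preheader (pre_ptr)
      float *next = buffer[1];  // filled by the loop latch (next_ptr)
      for (int k = 0; k < 4; ++k) {
        std::printf("iter %d: compute reads %p, prefetch writes %p\n",
                    k, (void*)pre, (void*)next);
        std::swap(pre, next);   // the role the shared-tile phi plays
      }
    }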
-13,7 +13,6 @@ namespace codegen{ void place_shared_copy::add_copy(ir::value *x, ir::builder &builder) { if(auto *i = dynamic_cast(x)){ ir::basic_block* block = i->get_parent(); - std::cout << "adding copy: " << x->get_name() << " " << block->get_name() << std::endl; auto it = std::find(block->begin(), block->end(), i); builder.set_insert_point(++it); } diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 8919f171b..9d4a08f2e 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -104,7 +104,6 @@ std::vector tune::get_params(ir::module &mod) { for(ir::instruction *i : block->get_inst_list()) for(auto &x: params_[i]) if(seen.insert(x.second).second && *x.second == 0){ - std::cout << typeid(*i).name() << std::endl; result.push_back(x.second); } return result; diff --git a/lib/ir/module.cpp b/lib/ir/module.cpp index fe6f0d48f..a8a11ff1c 100644 --- a/lib/ir/module.cpp +++ b/lib/ir/module.cpp @@ -37,6 +37,14 @@ void module::set_type(const std::string& name, ir::type *type){ return set_type(name, builder_.get_insert_block(), type); } +void module::set_continue_fn(std::function fn) { + continue_fn_ = fn; +} + +std::function module::get_continue_fn() { + return continue_fn_; +} + ir::phi_node* module::make_phi(ir::type *ty, unsigned num_values, ir::basic_block *block){ basic_block::iterator insert = block->get_first_non_phi(); if(insert != block->end()){ @@ -61,8 +69,6 @@ ir::value *module::try_remove_trivial_phis(ir::phi_node *&phi, ir::value** pre_u std::set users = phi->get_users(); phi->replace_all_uses_with(same); phi->erase_from_parent(); - if(pre_user) - *pre_user = same; for(ir::user* u: users) if(auto *uphi = dynamic_cast(u)) if(uphi != phi) @@ -80,11 +86,10 @@ ir::value *module::add_phi_operands(const std::string& name, ir::phi_node *&phi) ir::value *value = get_value(name, pred); phi->add_incoming(value, pred); } - return try_remove_trivial_phis(phi, nullptr); + return phi; } ir::value *module::get_value_recursive(const std::string& name, ir::basic_block *block) { - std::cout << "getting value " << name << std::endl; ir::value *result; auto &preds = block->get_predecessors(); if(block) @@ -101,6 +106,8 @@ ir::value *module::get_value_recursive(const std::string& name, ir::basic_block set_value(name, block, result); result = add_phi_operands(name, (ir::phi_node*&)result); } + if(auto *phi = dynamic_cast(result)) + result = try_remove_trivial_phis(phi, nullptr); set_value(name, block, result); return result; } @@ -138,9 +145,12 @@ ir::type *module::get_type(const std::string &name) { return types_.at({name, builder_.get_insert_block()}); } + void module::seal_block(ir::basic_block *block){ - for(auto &x: incomplete_phis_[block]) + for(auto &x: incomplete_phis_[block]){ add_phi_operands(x.first, x.second); + try_remove_trivial_phis(x.second, nullptr); + } sealed_blocks_.insert(block); incomplete_phis_[block].clear(); } From 7cda55df164f2a45cdb35a1b621e54943e555e7b Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 21 Feb 2019 18:00:27 -0500 Subject: [PATCH 082/494] [code generation] implements hidden operands in user (e.g., mask) --- examples/matrix.cpp | 16 +++++++++++++--- include/ir/instructions.h | 6 +++--- include/ir/value.h | 11 ++++++++--- lib/ast/lowering.cpp | 2 +- lib/codegen/selection.cpp | 6 +++--- lib/codegen/tune.cpp | 2 +- lib/ir/value.cpp | 8 ++++++-- 7 files changed, 35 insertions(+), 16 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index df239e6ff..d54f19e38 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ 
-57,14 +57,25 @@ void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K, int32 bound){\ for(k = K; k > 0; k = k - 8){\ int1 checka[16, 8] = (k > 8);\ int1 checkb[16, 8] = (k > 8);\ + int1 checka0[16];\ + int1 checka1[8];\ + int1 checkb0[16];\ + int1 checkb1[8];\ C = dot(a, b, C);\ pa = pa + 8*M;\ pb = pb + 8*K;\ @checka a = *pa;\ @checkb b = *pb;\ - if(k > 8){\ + if(k > 8)\ continue;\ - }\ + checka0 = rxa < M;\ + checka1 = rka < k;\ + checkb0 = ryb < N;\ + checkb1 = rkb < k;\ + checka = checka0[:, newaxis] && checka1[newaxis, :];\ + checkb = checkb0[:, newaxis] && checkb1[newaxis, :];\ + @checka a = *pa;\ + @checkb b = *pb;\ }\ @checkc *pc = C;\ }\ @@ -211,7 +222,6 @@ int main() { if(errors.size()) exit(EXIT_FAILURE); - // print // run passes tdl::ir::print(module, std::cout); diff --git a/include/ir/instructions.h b/include/ir/instructions.h index a18a94e1c..047126cf2 100644 --- a/include/ir/instructions.h +++ b/include/ir/instructions.h @@ -35,8 +35,8 @@ public: basic_block *get_parent() { return parent_; } void erase_from_parent(); // mask - value* set_mask(value *pred, value *else_value = nullptr) { mask_ = {pred, else_value}; } - const mask_info_t get_mask() const { return mask_; } + void set_mask_pred(value *pred) { resize_hidden(1); set_operand(get_num_operands(), pred); } + value* get_mask_pred() const { if(get_num_hidden() == 0) return nullptr; return get_operand(get_num_operands()); } // helpers bool has_tile_result_or_op(); // repr @@ -45,7 +45,7 @@ public: private: basic_block *parent_; value *pred_; - mask_info_t mask_; + value *mask_pred_; }; //===----------------------------------------------------------------------===// diff --git a/include/ir/value.h b/include/ir/value.h index b183a189f..4db869f52 100644 --- a/include/ir/value.h +++ b/include/ir/value.h @@ -51,20 +51,23 @@ public: typedef ops_t::const_iterator const_op_iterator; protected: - void resize_ops(unsigned n) { ops_.resize(n); } + void resize_ops(unsigned num_ops) { ops_.resize(num_ops + num_hidden_); num_ops_ = num_ops; } + void resize_hidden(unsigned num_hidden) { ops_.resize(num_ops_ + num_hidden); num_hidden_ = num_hidden; } public: // Constructor user(type *ty, unsigned num_ops, const std::string &name = "") - : value(ty, name), ops_(num_ops){ } + : value(ty, name), ops_(num_ops), num_ops_(num_ops), num_hidden_(0){ + } // Operands const ops_t& ops() { return ops_; } op_iterator op_begin() { return ops_.begin(); } op_iterator op_end() { return ops_.end(); } void set_operand(unsigned i, value *x); - value *get_operand(unsigned i); + value *get_operand(unsigned i) const; unsigned get_num_operands() const ; + unsigned get_num_hidden() const; // Utils void replace_all_uses_with(value *target); @@ -72,6 +75,8 @@ public: private: ops_t ops_; + unsigned num_ops_; + unsigned num_hidden_; }; } diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index f11c40955..dee27d139 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -264,7 +264,7 @@ ir::value* expression_statement::codegen(ir::module *mod) const{ ir::instruction *itn = dynamic_cast(expr); assert(itn); ir::value *mask = mask_->codegen(mod); - itn->set_mask(mask); + itn->set_mask_pred(mask); } return expr; } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 11bcf2738..ed17b2fcb 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -484,16 +484,16 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & BasicBlock *block = builder.GetInsertBlock(); Module *module = 
block->getModule(); Function *function = block->getParent(); - ir::instruction::mask_info_t mask = ins->get_mask(); + ir::value* mask_pred = ins->get_mask_pred(); LLVMContext &ctx = builder.getContext(); // helper to handle masks auto insert_masked = [&](indices_t idx, std::function insert_value) { BasicBlock *block = builder.GetInsertBlock(); Value *result; - if(mask.pred){ + if(mask_pred){ // if(mask.else_value) // std::cout << mask.else_value << std::endl; - Value *llvm_mask = tmap_.at(mask.pred)->get_value(idx); + Value *llvm_mask = tmap_.at(mask_pred)->get_value(idx); BasicBlock *then_bb = BasicBlock::Create(ctx, "", function); BasicBlock *done_bb = BasicBlock::Create(ctx, "", function); builder.CreateCondBr(llvm_mask, then_bb, done_bb); diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 9d4a08f2e..5de551924 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -73,7 +73,7 @@ void tune::init_c_graph(ir::instruction *v) { } /* Add mask constraints */ - if(ir::value *pred = v->get_mask().pred){ + if(ir::value *pred = v->get_mask_pred()){ for(unsigned i = 0; i < shapes.size(); i++) add_constraint({v->ops()[0], i}, {pred, i}); } diff --git a/lib/ir/value.cpp b/lib/ir/value.cpp index 23d30caf9..2d43f8e9d 100644 --- a/lib/ir/value.cpp +++ b/lib/ir/value.cpp @@ -43,13 +43,17 @@ void user::set_operand(unsigned i, value *x) { x->add_use(this); } -value* user::get_operand(unsigned i) { +value* user::get_operand(unsigned i) const { assert(i < ops_.size() && "get_operand() out of range!"); return ops_[i]; } unsigned user::get_num_operands() const { - return ops_.size(); + return num_ops_; +} + +unsigned user::get_num_hidden() const { + return num_hidden_; } void user::replace_all_uses_with(value *target) { From 8f4798b81aa9df97d032c8a8ee79eda891ff3f45 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 23 Feb 2019 11:37:01 -0500 Subject: [PATCH 083/494] [intermediate representation] transitioning towards more flexible tile shapes --- TODO | 1 + examples/matrix.cpp | 4 +++ include/ast/ast.h | 40 ++++++++++++++++------ include/ast/parser.y | 30 +++++++++++----- include/ast/scanner.l | 4 +++ include/codegen/barriers.h | 2 +- include/codegen/selection.h | 1 + include/ir/builder.h | 9 ++--- include/ir/constant.h | 28 +++++++++++---- include/ir/context_impl.h | 7 ++-- include/ir/instructions.h | 11 +++--- include/ir/module.h | 7 ++++ include/ir/type.h | 34 ++++++++++++++++--- lib/ast/lowering.cpp | 68 ++++++++++++++++++++++++------------- lib/codegen/barriers.cpp | 27 ++++++++------- lib/codegen/selection.cpp | 28 ++++++++++----- lib/codegen/tune.cpp | 17 ++++++---- lib/ir/builder.cpp | 8 ++--- lib/ir/constant.cpp | 30 +++++++++++----- lib/ir/instructions.cpp | 12 +++---- lib/ir/type.cpp | 15 +++++--- 21 files changed, 268 insertions(+), 115 deletions(-) diff --git a/TODO b/TODO index 340218d7a..72f439161 100644 --- a/TODO +++ b/TODO @@ -2,3 +2,4 @@ - proper naming scheme - symbols table - name conflicts on globals? 
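The user/value change above stores hidden operands (such as a mask predicate) behind the visible ones in the same operand vector, so def-use bookkeeping covers them while get_num_operands() keeps reporting only the visible count. A compilable sketch of that layout, using simplified stand-in types rather than the project's classes:

    #include <cassert>
    #include <vector>

    struct value {};

    struct user {
      std::vector<value*> ops;   // visible operands first, hidden after
      unsigned num_ops = 0, num_hidden = 0;

      explicit user(unsigned n) : ops(n), num_ops(n) {}
      void resize_hidden(unsigned n) { ops.resize(num_ops + n); num_hidden = n; }
      void set_mask_pred(value *p) { resize_hidden(1); ops[num_ops] = p; }
      value *get_mask_pred() const { return num_hidden ? ops[num_ops] : nullptr; }
      unsigned get_num_operands() const { return num_ops; }
    };

    int main() {
      value pred;
      user u(2);                 // two visible operands
      u.set_mask_pred(&pred);    // one hidden operand appended
      assert(u.get_num_operands() == 2 && u.get_mask_pred() == &pred);
    }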
+ - separate header for typedef (e.g., type::tile_shapes_t) to reduce compilation time diff --git a/examples/matrix.cpp b/examples/matrix.cpp index d54f19e38..86ec4f249 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -6,6 +6,7 @@ #include "ir/context.h" #include "ir/module.h" #include "ir/print.h" +#include "ir/context_impl.h" #include "codegen/selection.h" #include "codegen/tune.h" #include "codegen/shared_copy.h" @@ -182,6 +183,9 @@ int main() { llvm::LLVMContext llvm_context; llvm::Module llvm_module("test", llvm_context); +// context.p_impl->mp_constants_[0]->set_value(16); +// context.p_impl->mp_constants_[1]->set_value(16); +// context.p_impl->mp_constants_[2]->set_value(8); // create passes tdl::codegen::buffer_info_pass buffer_info; diff --git a/include/ast/ast.h b/include/ast/ast.h index 529c4b01b..a24d0480f 100644 --- a/include/ast/ast.h +++ b/include/ast/ast.h @@ -5,7 +5,7 @@ #include #include #include - +#include namespace tdl{ @@ -56,6 +56,12 @@ enum TYPE_T{ FLOAT32_T, FLOAT64_T }; +enum STORAGE_SPEC_T{ + TUNABLE_T, + KERNEL_T, + READONLY_T, WRITEONLY_T, +}; + class pointer; class identifier; class constant; @@ -75,7 +81,7 @@ public: template class list: public node { public: - list(const T& x): values_{x} {} + list(const T& x): values_(1, x) {} node* append(const T& x){ values_.push_back(x); @@ -389,16 +395,30 @@ public: class no_op: public statement { }; // Types - class declaration_specifier: public node{ public: - declaration_specifier(TYPE_T spec) - : spec_(spec) { } + using node::node; + virtual ir::type* type(ir::module *mod) const = 0; +}; +class typed_declaration_specifier: public declaration_specifier { +public: + typed_declaration_specifier(TYPE_T ty): ty_(ty){ } ir::type* type(ir::module *mod) const; private: - const TYPE_T spec_; + const TYPE_T ty_; +}; + +class storage_declaration_specifier: public declaration_specifier { +public: + storage_declaration_specifier(STORAGE_SPEC_T storage_spec, node *decl_spec) + : storage_spec_(storage_spec), decl_spec_((declaration_specifier*)decl_spec) {} + ir::type* type(ir::module *mod) const; + +private: + const STORAGE_SPEC_T storage_spec_; + const declaration_specifier* decl_spec_; }; class declarator; @@ -495,7 +515,7 @@ public: : declarator((node*)((declarator*)decl)->id()), decl_((declarator*)decl), expr_((expression*)init){ } - void specifier(const declaration_specifier *spec); + void set_specifier(const declaration_specifier *spec); ir::value* codegen(ir::module *) const; public: @@ -535,17 +555,17 @@ public: class translation_unit: public node{ public: translation_unit(node *item) - : decls_((list*)item) { } + : decls_(item) { } translation_unit *add(node *item) { - decls_->append(item); + decls_.append(item); return this; } ir::value* codegen(ir::module * mod) const; private: - list* decls_; + list decls_; }; } diff --git a/include/ast/parser.y b/include/ast/parser.y index 43c530e12..826204f8a 100644 --- a/include/ast/parser.y +++ b/include/ast/parser.y @@ -20,12 +20,14 @@ struct token: public node{ token(BIN_OP_T value): bin_op(value){ } token(UNARY_OP_T value): unary_op(value){ } token(TYPE_T value): type(value){ } + token(STORAGE_SPEC_T value): storage_spec(value){ } union { ASSIGN_OP_T assign_op; BIN_OP_T bin_op; UNARY_OP_T unary_op; TYPE_T type; + STORAGE_SPEC_T storage_spec; }; }; @@ -39,10 +41,12 @@ node* append_ptr_list(node *result, node *in){ ASSIGN_OP_T get_assign_op(node *op) { return ((token*)op)->assign_op; } UNARY_OP_T get_unary_op(node *op) { return ((token*)op)->unary_op; } TYPE_T 
get_type_spec(node *op) { return ((token*)op)->type; } +STORAGE_SPEC_T get_storage_spec(node *op) { return ((token*)op)->storage_spec;} %} %token IDENTIFIER CONSTANT STRING_LITERAL +%token TUNABLE KERNEL READONLY WRITEONLY %token PTR_OP INC_OP DEC_OP LEFT_OP RIGHT_OP LE_OP GE_OP EQ_OP NE_OP %token AND_OP OR_OP MUL_ASSIGN DIV_ASSIGN MOD_ASSIGN ADD_ASSIGN %token SUB_ASSIGN LEFT_ASSIGN RIGHT_ASSIGN AND_ASSIGN @@ -87,17 +91,12 @@ abstract_declarator ; direct_abstract_declarator - : '[' constant_list ']' { $$ = new tile(nullptr, $1); } + : '[' primary_expression_list ']' { $$ = new tile(nullptr, $1); } constant : CONSTANT { $$ = new constant(atoi(yytext)); } ; -constant_list - : constant { $$ = new list((constant*)$1); } - | constant_list ',' constant { $$ = append_ptr_list($1, $3); } - ; - type_name : declaration_specifiers { $$ = new type_name($1, nullptr); } | declaration_specifiers abstract_declarator { $$ = new type_name($1, $2); } @@ -112,7 +111,7 @@ identifier ; builtin - : GET_GLOBAL_RANGE '[' constant ']' '(' constant ')' { $$ = new get_global_range($3, $6); } + : GET_GLOBAL_RANGE '[' primary_expression ']' '(' constant ')' { $$ = new get_global_range($3, $6); } | DOT '(' expression ',' expression ',' expression ')' { $$ = new matmul_expression($3, $5, $7); } primary_expression @@ -124,6 +123,11 @@ primary_expression | '(' expression ')' { $$ = $2; } ; +primary_expression_list + : primary_expression { $$ = new list((expression*)$1); } + | primary_expression_list ',' primary_expression { $$ = append_ptr_list($1, $3); } + ; + slice : ':' { $$ = new slice(tdl::ast::ALL); } | NEWAXIS { $$ = new slice(tdl::ast::NEWAXIS); } @@ -312,7 +316,7 @@ jump_statement direct_declarator : identifier { $$ = $1; } - | identifier '[' constant_list ']' { $$ = new tile($1, $3); } + | identifier '[' primary_expression_list ']' { $$ = new tile($1, $3); } | identifier '(' parameter_list ')' { $$ = new function($1, $3); } | identifier '(' ')' { $$ = new function($1, nullptr); } ; @@ -330,7 +334,8 @@ parameter_declaration declaration_specifiers - : type_specifier { $$ = new declaration_specifier(get_type_spec($1)); } + : type_specifier { $$ = new typed_declaration_specifier(get_type_spec($1)); } + | storage_class_specifier declaration_specifiers { $$ = new storage_declaration_specifier(get_storage_spec($1), $2); } ; init_declarator_list @@ -354,6 +359,13 @@ init_declarator | declarator '=' initialization_expression { $$ = new initializer($1, $3); } ; +storage_class_specifier + : TUNABLE { $$ = new token(TUNABLE_T); } + | KERNEL { $$ = new token(KERNEL_T); } + | READONLY { $$ = new token(READONLY_T); } + | WRITEONLY { $$ = new token(WRITEONLY_T); } +; + /* -------------------------- */ /* Translation Unit */ /* -------------------------- */ diff --git a/include/ast/scanner.l b/include/ast/scanner.l index 80da95dad..885404ca3 100644 --- a/include/ast/scanner.l +++ b/include/ast/scanner.l @@ -16,6 +16,10 @@ int comment(); %} %% +"tunable" { count(); return(TUNABLE); } +"kernel" { count(); return(KERNEL); } +"readonly" { count(); return(READONLY); } +"writeonly" { count(); return(WRITEONLY); } "@" { count(); return(AT); } "newaxis" { count(); return(NEWAXIS); } "if" { count(); return(IF); } diff --git a/include/codegen/barriers.h b/include/codegen/barriers.h index 9b476ae75..5199f94ad 100644 --- a/include/codegen/barriers.h +++ b/include/codegen/barriers.h @@ -32,7 +32,7 @@ private: void add_reference(ir::value *v, interval_vec_t &res); void get_read_intervals(ir::instruction *i, interval_vec_t &res); void 
get_written_intervals(ir::instruction *i, interval_vec_t &res); - void add(ir::basic_block *block, interval_vec_t &not_synced, std::set &insert_pts); + void add(ir::basic_block *block, interval_vec_t &not_synced, ir::builder &builder); public: barriers(allocation *alloc, buffer_info_pass *buffer_info): alloc_(alloc), buffer_info_(buffer_info) {} diff --git a/include/codegen/selection.h b/include/codegen/selection.h index 2531dc74c..ec733ed57 100644 --- a/include/codegen/selection.h +++ b/include/codegen/selection.h @@ -100,6 +100,7 @@ class selection{ private: // utils llvm::Type *make_vector_ty(llvm::Type *ty, size_t vector_size); + std::vector extract_shapes(ir::value *v); // LLVM conversions llvm::Type* llvm_type(ir::type *ty, llvm::LLVMContext &ctx); diff --git a/include/ir/builder.h b/include/ir/builder.h index d7e49cf14..a6c0013fd 100644 --- a/include/ir/builder.h +++ b/include/ir/builder.h @@ -6,6 +6,7 @@ #include #include "instructions.h" #include "basic_block.h" +#include "type.h" namespace tdl{ namespace ir{ @@ -110,11 +111,11 @@ public: value *create_load(value *arg, const std::string &name = ""); value *create_store(value *ptr, value *val, const std::string &name = ""); // Tile instruction - value *create_splat(value *arg, const std::vector &shapes, const std::string &name = ""); - value *create_reshape(value *arg, const std::vector &shapes, const std::string &name = ""); - value *create_broadcast(value *arg, const std::vector &shapes, const std::string &name = ""); + value *create_splat(value *arg, const type::tile_shapes_t &shapes, const std::string &name = ""); + value *create_reshape(value *arg, const type::tile_shapes_t &shapes, const std::string &name = ""); + value *create_broadcast(value *arg, const type::tile_shapes_t &shapes, const std::string &name = ""); // Built-in instruction - value *create_get_global_range(unsigned axis, unsigned size, const std::string &name = ""); + value *create_get_global_range(unsigned axis, type::tile_shapes_t::value_type size, const std::string &name = ""); value *create_matmul(value *A, value *B, value *C, const std::string &name = ""); // Intrinsics value *create_copy_to_shared(value *arg, const std::string &name = ""); diff --git a/include/ir/constant.h b/include/ir/constant.h index 78814283c..132902d3a 100644 --- a/include/ir/constant.h +++ b/include/ir/constant.h @@ -2,6 +2,7 @@ #define TDL_INCLUDE_IR_CONSTANT_H #include "value.h" +#include namespace tdl{ namespace ir{ @@ -28,28 +29,43 @@ public: static undef_value* get(type* ty); }; + /* Constant int */ class constant_int: public constant{ +protected: constant_int(type *ty, uint64_t value); public: uint64_t get_value() const { return value_; } - static constant *get(type *ty, uint64_t value); + static constant_int *get(type *ty, uint64_t value); + +protected: + uint64_t value_; +}; + +/* Metaparameter int */ +class metaparameter: public constant_int{ + metaparameter(type *ty, unsigned lo, unsigned hi); + +public: + static metaparameter *create(context &ctx, type *ty, unsigned lo, unsigned hi); + void set_value(uint64_t value) { value_ = value; } private: - uint64_t value_; + unsigned lo_; + unsigned hi_; }; /* constant range */ class constant_range: public constant{ - constant_range(type *ty, uint64_t first, uint64_t last); + constant_range(type *ty, constant_int* first, constant_int* last); public: - static constant *get(constant *first, constant *last); + static constant *get(constant_int *first, constant_int *last); private: - uint64_t first_; - uint64_t last_; + constant_int* first_;
+ constant_int* last_; }; /* constant fp */ diff --git a/include/ir/context_impl.h b/include/ir/context_impl.h index cb3acc186..b9017b39c 100644 --- a/include/ir/context_impl.h +++ b/include/ir/context_impl.h @@ -12,6 +12,7 @@ class context; class constant_int; class constant_fp; class undef_value; +class metaparameter; /* Context impl */ class context_impl { @@ -26,13 +27,15 @@ public: integer_type int1_ty, int8_ty, int16_ty, int32_ty, int64_ty, int128_ty; // Pointer types std::map, pointer_type*> ptr_tys; - std::map>, tile_type*> tile_tys; + std::map, tile_type*> tile_tys; // Int constants - std::map int_constants_; + std::map, constant_int*> int_constants_; // Float constants std::map fp_constants_; // undef values std::map uv_constants_; + // Metaparameters + std::vector mp_constants_; }; } diff --git a/include/ir/instructions.h b/include/ir/instructions.h index 047126cf2..ae752f78e 100644 --- a/include/ir/instructions.h +++ b/include/ir/instructions.h @@ -3,6 +3,7 @@ #include #include "value.h" +#include "ir/type.h" #include "llvm/IR/Instructions.h" namespace tdl{ @@ -358,7 +359,7 @@ public: class retile_inst: public unary_inst { protected: - retile_inst(value *arg, const std::vector &shape_suffix, const std::string &name, instruction *next); + retile_inst(value *arg, const type::tile_shapes_t &shapes, const std::string &name, instruction *next); static std::string shape_suffix(ir::type* ty); }; @@ -370,7 +371,7 @@ private: std::string repr_impl() const { return "reshape" + shape_suffix(get_type()); } public: - static instruction* create(value *arg, const std::vector &shape_suffix, + static instruction* create(value *arg, const type::tile_shapes_t &shape_suffix, const std::string &name = "", instruction *next = nullptr); }; @@ -382,7 +383,7 @@ private: std::string repr_impl() const { return "splat" + shape_suffix(get_type()); } public: - static instruction* create(value *arg, const std::vector &shape_suffix, + static instruction* create(value *arg, const type::tile_shapes_t &shape_suffix, const std::string &name = "", instruction *next = nullptr); }; @@ -394,7 +395,7 @@ private: std::string repr_impl() const { return "broadcast" + shape_suffix(get_type()); } public: - static instruction* create(value *arg, const std::vector &shape_suffix, + static instruction* create(value *arg, const type::tile_shapes_t &shape_suffix, const std::string &name = "", instruction *next = nullptr); }; @@ -414,7 +415,7 @@ private: std::string repr_impl() const { return "get_global_range(" + std::to_string(axis_) + ")"; } public: - static instruction* create(context &ctx, unsigned axis, unsigned size, + static instruction* create(context &ctx, unsigned axis, type::tile_shapes_t::value_type size, const std::string &name = "", instruction *next = nullptr); unsigned get_axis() const { return axis_; } diff --git a/include/ir/module.h b/include/ir/module.h index 347178fda..26b6c6769 100644 --- a/include/ir/module.h +++ b/include/ir/module.h @@ -3,6 +3,7 @@ #include #include +#include #include #include #include "builder.h" @@ -12,6 +13,7 @@ namespace tdl{ namespace ast{ class iteration_statement; +class compound_statement; } @@ -69,6 +71,10 @@ public: const functions_list_t &get_function_list() const { return functions_; } functions_list_t &get_function_list() { return functions_; } function *get_or_insert_function(const std::string &name, function_type *ty); + // Scope + void push_scope(const ast::compound_statement* scope) { scopes_.push(scope); } + void pop_scope() { scopes_.pop(); } + const 
ast::compound_statement* get_scope() { return scopes_.top(); } private: @@ -83,6 +89,7 @@ private: symbols_map_t symbols_; std::function continue_fn_; std::map current_phi_; + std::stack scopes_; }; } diff --git a/include/ir/type.h b/include/ir/type.h index 9f29b465b..6e2049ddd 100644 --- a/include/ir/type.h +++ b/include/ir/type.h @@ -3,6 +3,7 @@ #include #include +#include namespace tdl{ namespace ir{ @@ -10,9 +11,13 @@ namespace ir{ class context; class value; class integer_type; +class constant_int; /* Type */ class type { +public: + typedef std::vector tile_shapes_t; + protected: typedef std::vector contained_tys_vec_t; typedef contained_tys_vec_t::iterator ty_iterator; @@ -54,7 +59,7 @@ public: unsigned get_tile_bitwidth() const; unsigned get_primitive_size_in_bits() const; type *get_scalar_ty() const; - const std::vector &get_tile_shapes() const; + const tile_shapes_t& get_tile_shapes() const; unsigned get_tile_num_elements() const; type *get_tile_element_ty() const; unsigned get_pointer_address_space() const; @@ -94,9 +99,25 @@ public: static integer_type *get_int64_ty(context &ctx); static integer_type *get_int128_ty(context &ctx); + // Attributes + type* set_tunable() { is_tunable_ = true; return this; } + type* set_readonly() { is_readonly_ = true; return this; } + type* set_writeonly() { is_writeonly_ = true; return this; } + type* set_kernel() { is_kernel_ = true; return this; } + + bool get_tunable() { return is_tunable_; } + bool get_readonly() { return is_readonly_; } + bool get_writeonly() { return is_writeonly_; } + bool get_kernel() { return is_kernel_; } + private: context &ctx_; id_t id_; + // attributes + bool is_tunable_; + bool is_readonly_; + bool is_writeonly_; + bool is_kernel_; protected: contained_tys_vec_t contained_tys_; @@ -132,21 +153,24 @@ public: class tile_type: public composite_type { private: - tile_type(type *ty, const std::vector &shapes); + tile_type(type *ty, const tile_shapes_t &shapes); static bool is_valid_elt_ty(type *ty); public: // accessors - const std::vector& get_shapes() const { return shapes_; } + const tile_shapes_t& get_shapes() const { return shapes_; } unsigned get_num_elements() const; unsigned get_bitwidth() const; // factory methods - static tile_type* get(type *ty, const std::vector &shapes); + static tile_type* get(type *ty, const tile_shapes_t &shapes); static tile_type* get_same_shapes(type *ty, type *ref); + // shortcut to get a 1 element in the shape + static tile_shapes_t::value_type make_one(context &ctx); + private: - std::vector shapes_; + tile_shapes_t shapes_; }; class pointer_type: public type { diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index dee27d139..993856b2d 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -100,6 +100,7 @@ void node::implicit_broadcast(ir::module *mod, ir::value *&lhs, ir::value *&rhs) ir::builder &builder = mod->get_builder(); ir::type *lhs_ty = lhs->get_type(); ir::type *rhs_ty = rhs->get_type(); + ir::type::tile_shapes_t::value_type one = ir::tile_type::make_one(mod->get_context()); // Both are scalar if(!lhs_ty->is_tile_ty() && !rhs_ty->is_tile_ty()) return; @@ -111,30 +112,30 @@ void node::implicit_broadcast(ir::module *mod, ir::value *&lhs, ir::value *&rhs) return; } // Both are arrays - std::vector lhs_shapes = lhs->get_type()->get_tile_shapes(); - std::vector rhs_shapes = rhs->get_type()->get_tile_shapes(); + auto lhs_shapes = lhs->get_type()->get_tile_shapes(); + auto rhs_shapes = rhs->get_type()->get_tile_shapes(); if(lhs_shapes == rhs_shapes) return; int 
lhs_dim = lhs_shapes.size(); int rhs_dim = rhs_shapes.size(); - std::vector &shortest = (lhs_dim < rhs_dim)?lhs_shapes:rhs_shapes; - std::vector &longest = (lhs_dim < rhs_dim)?rhs_shapes:lhs_shapes; + auto &shortest = (lhs_dim < rhs_dim)?lhs_shapes:rhs_shapes; + auto &longest = (lhs_dim < rhs_dim)?rhs_shapes:lhs_shapes; size_t ndim = longest.size(); int off = longest.size() - shortest.size(); for(int i = longest.size() - 1; i>= 0; i--){ - if(shortest[off + i] != longest[i] && shortest[off + i] != 1 && longest[i] != 1) + if(shortest[off + i] != longest[i] && shortest[off + i] != one && longest[i] != one) throw std::runtime_error("cannot broadcast"); } // Pad for(size_t i = 0; i < off; i++) - shortest.insert(shortest.begin(), 1); + shortest.insert(shortest.begin(), one); ir::value *&target = (lhs_dim < rhs_dim)?lhs:rhs; if(off > 0) target = builder.create_reshape(target, shortest); // Broadcast - std::vector shapes(ndim); + ir::type::tile_shapes_t shapes(ndim); for(size_t i = 0; i < ndim; i++) - shapes[i] = std::max(shortest[i], longest[i]); + shapes[i] = shortest[i]==one?longest[i]:shortest[i]; if(shapes != lhs_shapes) lhs = builder.create_broadcast(lhs, shapes); if(shapes != rhs_shapes) @@ -148,14 +149,15 @@ inline bool is_terminator(ir::value* x) { /* Translation unit */ ir::value* translation_unit::codegen(ir::module *mod) const{ - decls_->codegen(mod); + mod->push_scope(nullptr); + decls_.codegen(mod); return nullptr; } /* Declaration specifier */ -ir::type* declaration_specifier::type(ir::module *mod) const { +ir::type* typed_declaration_specifier::type(ir::module *mod) const { ir::context &ctx = mod->get_context(); - switch (spec_) { + switch (ty_) { case VOID_T: return ir::type::get_void_ty(ctx); case INT1_T: return ir::type::get_int1_ty(ctx); case INT8_T: return ir::type::get_int8_ty(ctx); @@ -164,7 +166,18 @@ ir::type* declaration_specifier::type(ir::module *mod) const { case INT64_T: return ir::type::get_int64_ty(ctx); case FLOAT32_T: return ir::type::get_float_ty(ctx); case FLOAT64_T: return ir::type::get_double_ty(ctx); - default: throw std::runtime_error("unreachable"); + default: throw std::runtime_error("unreachable"); + } +} + +ir::type* storage_declaration_specifier::type(ir::module *mod) const { + ir::type* result = decl_spec_->type(mod); + switch(storage_spec_){ + case TUNABLE_T: return result->set_tunable(); + case KERNEL_T: return result->set_kernel(); + case READONLY_T: return result->set_readonly(); + case WRITEONLY_T: return result->set_writeonly(); + default: throw std::runtime_error("unreachable"); } } @@ -194,10 +207,10 @@ const std::string &identifier::name() const{ } // Tile -ir::type* tile::type_impl(ir::module*, ir::type *type) const{ - std::vector shapes; +ir::type* tile::type_impl(ir::module *mod, ir::type *type) const{ + ir::type::tile_shapes_t shapes; for(constant *cst: shapes_->values()) - shapes.push_back(cst->value()); + shapes.push_back((ir::constant_int*)cst->codegen(mod)); return ir::tile_type::get(type, shapes); } @@ -245,6 +258,7 @@ ir::value* function_definition::codegen(ir::module *mod) const{ /* Statements */ ir::value* compound_statement::codegen(ir::module* mod) const{ + mod->push_scope(this); if(decls_) decls_->codegen(mod); if(statements_){ @@ -254,6 +268,7 @@ ir::value* compound_statement::codegen(ir::module* mod) const{ return current; } } + mod->pop_scope(); return nullptr; } @@ -337,7 +352,7 @@ ir::value* continue_statement::codegen(ir::module *mod) const{ /* Declaration */ ir::value* declaration::codegen(ir::module* mod) const{ 
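The implicit_broadcast logic above applies the usual elementwise broadcasting rule: shapes are right-aligned, the shorter one is padded with leading ones, and each dimension must either match or be one; with symbolic tile shapes, the comparison is against the shared one constant instead of the literal 1. A sketch of the same rule on plain integers:

    #include <stdexcept>
    #include <vector>

    std::vector<int> broadcast(std::vector<int> a, std::vector<int> b) {
      // pad the shorter shape with leading 1s
      if (a.size() < b.size()) a.insert(a.begin(), b.size() - a.size(), 1);
      if (b.size() < a.size()) b.insert(b.begin(), a.size() - b.size(), 1);
      std::vector<int> out(a.size());
      for (size_t i = 0; i < a.size(); ++i) {
        if (a[i] != b[i] && a[i] != 1 && b[i] != 1)
          throw std::runtime_error("cannot broadcast");
        out[i] = (a[i] == 1) ? b[i] : a[i];  // a dim of 1 stretches to match
      }
      return out;
    }

    int main() {
      auto s = broadcast({16, 1}, {8});      // -> {16, 8}
      return (s[0] == 16 && s[1] == 8) ? 0 : 1;
    }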
for(initializer *init: init_->values()) - init->specifier(spec_); + init->set_specifier(spec_); init_->codegen(mod); return nullptr; } @@ -347,7 +362,7 @@ ir::type* initializer::type_impl(ir::module *mod, ir::type *type) const{ return decl_->type(mod, type); } -void initializer::specifier(const declaration_specifier *spec) { +void initializer::set_specifier(const declaration_specifier *spec) { spec_ = spec; } @@ -355,6 +370,11 @@ ir::value* initializer::codegen(ir::module * mod) const{ ir::type *ty = decl_->type(mod, spec_->type(mod)); std::string name = decl_->id()->name(); ir::value *value = ir::undef_value::get(ty); + if(ty->get_tunable()){ + assert(expr_ == nullptr); + //TODO + value = ir::metaparameter::create(mod->get_context(), ty, 4, 8); + } if(expr_){ value = expr_->codegen(mod); value = explicit_cast(mod->get_builder(), value, ty); @@ -464,7 +484,7 @@ ir::value* binary_operator::codegen(ir::module *mod) const{ // get_global_range ir::value* get_global_range::codegen(ir::module *mod) const { ir::builder &builder = mod->get_builder(); - return builder.create_get_global_range(axis_->value(), size_->value()); + return builder.create_get_global_range(axis_->value(), (ir::constant_int*)size_->codegen(mod)); } @@ -487,11 +507,13 @@ ir::value* matmul_expression::codegen(ir::module *mod) const { ir::value* indexing_expression::codegen(ir::module *mod) const{ ir::value *in = mod->get_value(id_->name()); const std::vector &slices = slices_->values(); - std::vector in_shapes = in->get_type()->get_tile_shapes(); - std::vector out_shapes(slices.size()); + auto in_shapes = in->get_type()->get_tile_shapes(); + ir::type::tile_shapes_t::value_type one = ir::tile_type::make_one(mod->get_context()); + ir::type::tile_shapes_t out_shapes(slices.size()); + // create shapes size_t current = 0; for(size_t i = 0; i < out_shapes.size(); i++) - out_shapes[i] = (slices[i]->type()==NEWAXIS)?1:in_shapes[current++]; + out_shapes[i] = (slices[i]->type()==NEWAXIS)?one:in_shapes[current++]; return mod->get_builder().create_reshape(in, out_shapes); } @@ -586,8 +608,8 @@ int constant::value() const{ /* Constant range */ ir::value* constant_range::codegen(ir::module *mod) const{ - return ir::constant_range::get((ir::constant*)first_->codegen(mod), - (ir::constant*)last_->codegen(mod)); + return ir::constant_range::get((ir::constant_int*)first_->codegen(mod), + (ir::constant_int*)last_->codegen(mod)); } /* Named */ diff --git a/lib/codegen/barriers.cpp b/lib/codegen/barriers.cpp index 0466d5ef3..df017931b 100644 --- a/lib/codegen/barriers.cpp +++ b/lib/codegen/barriers.cpp @@ -45,10 +45,15 @@ void barriers::get_written_intervals(ir::instruction *i, interval_vec_t &res){ void barriers::insert_barrier(ir::instruction *instr, ir::builder &builder) { if(auto *phi = dynamic_cast(instr)) { + std::set incoming; for(unsigned n = 0; n < phi->get_num_incoming(); n++){ - ir::basic_block *block = phi->get_incoming_block(n); - builder.set_insert_point(block->get_inst_list().back()); - builder.create_barrier(); + ir::instruction *inc_val = dynamic_cast(phi->get_incoming_value(n)); + assert(inc_val); + if(incoming.insert(inc_val).second){ + ir::basic_block *block = inc_val->get_parent(); + builder.set_insert_point(block->get_inst_list().back()); + builder.create_barrier(); + } } } else { @@ -57,15 +62,15 @@ void barriers::insert_barrier(ir::instruction *instr, ir::builder &builder) { } } -void barriers::add(ir::basic_block *block, interval_vec_t &not_synced, std::set &insert_pts) { - for(ir::instruction *i: block->get_inst_list()){
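The initializer::codegen hunk above gives tunable declarations a metaparameter: an integer constant whose value is left open within a range until autotuning fixes it (the fixed 4..8 bounds sit next to a TODO in the diff, so they look provisional). A rough sketch of that flow, with illustrative names rather than the project's API:

    #include <cstdint>
    #include <memory>
    #include <vector>

    struct metaparameter {
      unsigned lo, hi;           // legal range for the tuner
      uint64_t value = 0;        // fixed later, not at declaration time
      metaparameter(unsigned l, unsigned h) : lo(l), hi(h) {}
      void set_value(uint64_t v) { value = v; }
    };

    int main() {
      // the context owns every metaparameter so a tuner can enumerate them
      std::vector<std::unique_ptr<metaparameter>> mps;
      mps.push_back(std::make_unique<metaparameter>(4, 8));
      for (auto &mp : mps)
        mp->set_value(mp->lo);   // the tuner picks a point in [lo, hi]
      return 0;
    }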
+void barriers::add(ir::basic_block *block, interval_vec_t &not_synced, ir::builder &builder) { + ir::basic_block::inst_list_t instructions = block->get_inst_list(); + for(ir::instruction *i: instructions){ interval_vec_t read, written; get_read_intervals(i, read); get_written_intervals(i, written); - if(intersect(not_synced, read) || intersect(not_synced, written)) { + if(intersect(not_synced, read)) { not_synced.clear(); - insert_pts.insert(i); + insert_barrier(i, builder); } std::copy(written.begin(), written.end(), std::back_inserter(not_synced)); } @@ -76,12 +81,8 @@ void barriers::run(ir::module &mod) { for(ir::function *fn: mod.get_function_list()){ // find barrier location interval_vec_t not_synced; - std::set insert_pts; for(ir::basic_block *block: fn->blocks()) - add(block, not_synced, insert_pts); + add(block, not_synced, builder); // insert barrier - for(ir::instruction *i: insert_pts) - insert_barrier(i, builder); } } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index ed17b2fcb..8665714a2 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -44,6 +44,7 @@ llvm::Type *distributed_tile::make_vector_ty(llvm::Type *ty, size_t vector_size) return VectorType::get(ty, vector_size); } + distributed_tile::distributed_tile(Type *ty, const shapes_t &shapes, const axes_t &axes, llvm::IRBuilder<> &builder, bool vectorize) : tile(make_vector_ty(ty, vectorize?axes[0].contiguous:1), shapes), axes_(axes), builder_(builder) { vector_size_ = vectorize?ty_->getVectorNumElements():1; @@ -149,6 +150,16 @@ Value* shared_tile::get_value(indices_t idx) { return builder_.CreateLoad(ptr); } +/* Utils */ +std::vector selection::extract_shapes(ir::value *v) { + const auto& shapes = v->get_type()->get_tile_shapes(); + std::vector result(shapes.size()); + for(ir::constant_int* cst: shapes) + result.push_back(cst->get_value()); + return result; +} + + /* convert ir::type to Type */ Type *selection::llvm_type(ir::type *ty, LLVMContext &ctx) { // function @@ -299,7 +310,7 @@ std::vector delinearize(Value *trailing, std::vector &shapes, } void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { - const auto& shapes = v->get_type()->get_tile_shapes(); + const auto& shapes = extract_shapes(v); size_t dim = shapes.size(); std::vector contiguous(dim); std::vector warp_size(dim); std::vector n_warps(dim); + std::cout << v->get_name() << " " << typeid(*v).name() << std::endl; for(unsigned i = 0; i < shapes.size(); i++){ std::string str_i = std::to_string(i); contiguous[i] = *params_->get_param(v, "p0.d" + str_i); @@ -336,7 +348,7 @@ void selection::create_grids(std::vector &grids, // get number of dimensions greater than 1 auto get_tile_gt1_dim = [&](ir::value *v){ unsigned result = 0; - for(unsigned shape: v->get_type()->get_tile_shapes()) { + for(unsigned shape: extract_shapes(v)) { result += (shape > 1)?shape:0; } return result; }; @@ -353,7 +365,7 @@ void selection::create_grids(std::vector &grids, for(ir::value *op: user->ops()) bind_references(op); // bind - const auto& shapes = v->get_type()->get_tile_shapes(); + const auto& shapes = extract_shapes(v); if(dynamic_cast(v) || buffer_info_->is_double(v)) return; for(size_t d = 0; d < shapes.size(); d++){ @@ -385,7 +397,7 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, for(ir::value *op: user->ops()) create_tile(op, builder, references, seen, sh_mem_ptr); LLVMContext &ctx = builder.getContext(); - const auto& shapes = v->get_type()->get_tile_shapes(); + const
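The rewritten barriers pass above interleaves hazard detection and insertion in one walk: it accumulates shared-memory intervals written since the last barrier and, when an instruction reads any of them, emits a barrier and clears the set. A self-contained sketch of the interval test, with made-up byte ranges:

    #include <algorithm>
    #include <cstdio>
    #include <utility>
    #include <vector>

    using interval = std::pair<int, int>;  // [begin, end) in shared memory

    bool intersect(const std::vector<interval> &xs,
                   const std::vector<interval> &ys) {
      for (const interval &x : xs)
        for (const interval &y : ys)
          if (std::max(x.first, y.first) < std::min(x.second, y.second))
            return true;                   // overlapping byte ranges
      return false;
    }

    int main() {
      std::vector<interval> not_synced = {{0, 128}};  // written, not synced
      std::vector<interval> reads = {{64, 96}};       // next instruction
      if (intersect(not_synced, reads)) {
        std::puts("emit barrier");                    // then clear the set
        not_synced.clear();
      }
    }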
auto& shapes = extract_shapes(v); Type* ty = llvm_type(v->get_type()->get_scalar_ty(), ctx); // create shared tile if(dynamic_cast(v) || (buffer_info_->is_double(v))){ @@ -429,7 +441,7 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, } // create distributed tile else { - const auto &shapes = v->get_type()->get_tile_shapes(); + const auto &shapes = extract_shapes(v); std::vector axes(shapes.size()); for(size_t d = 0; d < shapes.size(); d++){ if(shapes[d] > 1){ @@ -530,7 +542,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & distributed_tile* result = (distributed_tile*)ti; if(!ins->get_type()->is_tile_ty()) return; - const auto& shapes = ins->get_type()->get_tile_shapes(); + const auto& shapes = extract_shapes(ins); // global_range if(auto *x = dynamic_cast(ins)) { static std::array ctaid = { @@ -568,7 +580,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & // broadcast else if(dynamic_cast(ins)) { ir::value* in = ins->get_operand(0); - const auto& in_shapes = in->get_type()->get_tile_shapes(); + const auto& in_shapes = extract_shapes(in); distributed_tile *in_tile = (distributed_tile*)tmap_.at(in); result->for_each([&](indices_t out_idx){ indices_t in_idx = out_idx; @@ -615,7 +627,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & Function *f_mul_add = Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {llvm_type(C->get_type()->get_scalar_ty(), ctx)}); result->for_each([&](indices_t idx){ Value *res = tmap_.at(C)->get_value(idx); - unsigned NK = A->get_type()->get_tile_shapes()[1]; + unsigned NK = extract_shapes(A)[1]; for(unsigned K = 0; K < NK; ++K){ indices_t a_idx = {idx[0], builder.getInt32(K)}; indices_t b_idx = {idx[1], builder.getInt32(K)}; diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 5de551924..3dc5c4e87 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -3,6 +3,8 @@ #include "ir/type.h" #include "ir/module.h" #include "ir/function.h" +#include "ir/context_impl.h" + #include @@ -29,7 +31,8 @@ void tune::init_c_phi(ir::instruction *v) { void tune::init_c_graph(ir::instruction *v) { // Reference shape - std::vector shapes; + ir::type::tile_shapes_t::value_type one = ir::tile_type::make_one(v->get_parent()->get_context()); + ir::type::tile_shapes_t shapes; if(auto *store = dynamic_cast(v)) shapes = store->get_pointer_operand()->get_type()->get_tile_shapes(); else @@ -39,7 +42,7 @@ void tune::init_c_graph(ir::instruction *v) { ir::value *op = v->get_operand(0); unsigned current = 0; for(unsigned i = 0; i < shapes.size(); i ++){ - if(shapes[i] == 1) + if(shapes[i] == one) static_params_.insert({{v, i}, 1}); else add_constraint({v, i}, {op, current++}); @@ -99,6 +102,7 @@ void tune::connected_components(node_t x, const std::vector vals, st std::vector tune::get_params(ir::module &mod) { std::vector result; std::set seen; + for(ir::function *fn: mod.get_function_list()) for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i : block->get_inst_list()) @@ -143,8 +147,9 @@ void tune::create_grids(std::vector &grids, // get number of dimensions greater than 1 auto get_tile_gt1_dim = [&](ir::value *v){ unsigned result = 0; - for(unsigned shape: v->get_type()->get_tile_shapes()) { - result += (shape > 1)?shape:0; + auto one = ir::tile_type::make_one(fn->get_fn_type()->get_context()); + for(ir::constant_int *shape: v->get_type()->get_tile_shapes()) { + result += (shape != one); } return result; }; @@ -194,8 +199,8 @@ 
for(ir::function *fn: mod.get_function_list()){ unsigned *s1 = params_[i]["p1.d" + strk]; unsigned *s2 = params_[i]["p2.d" + strk]; unsigned multiple = (*s0)*(*s1)*(*s2); - if(shapes[k] % multiple != 0) - errors[i].push_back("for dim " + strk + ": shape (" + to_string(shapes[k]) + ")" + if(shapes[k]->get_value() % multiple != 0) + errors[i].push_back("for dim " + strk + ": shape (" + to_string(shapes[k]->get_value()) + ")" " is not a multiple of layout (" + to_string(multiple) + ")"); } // the number of thread per warp must be 32 diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index cb5edd2b6..6622125b5 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -244,15 +244,15 @@ value *builder::create_store(value *ptr, value *val, const std::string &name){ // tile instructions //===----------------------------------------------------------------------===// -value *builder::create_reshape(value *arg, const std::vector &shapes, const std::string &name) { +value *builder::create_reshape(value *arg, const type::tile_shapes_t &shapes, const std::string &name) { return insert(reshape_inst::create(arg, shapes, name)); } -value *builder::create_splat(value *arg, const std::vector &shapes, const std::string &name) { +value *builder::create_splat(value *arg, const type::tile_shapes_t &shapes, const std::string &name) { return insert(splat_inst::create(arg, shapes, name)); } -value *builder::create_broadcast(value *arg, const std::vector &shapes, const std::string &name) { +value *builder::create_broadcast(value *arg, const type::tile_shapes_t &shapes, const std::string &name) { return insert(broadcast_inst::create(arg, shapes, name)); } @@ -260,7 +260,7 @@ value *builder::create_broadcast(value *arg, const std::vector &shapes // built-in instructions //===----------------------------------------------------------------------===// -value *builder::create_get_global_range(unsigned axis, unsigned size, const std::string &name) { +value *builder::create_get_global_range(unsigned axis, type::tile_shapes_t::value_type size, const std::string &name) { return insert(get_global_range_inst::create(ctx_, axis, size, name)); } diff --git a/lib/ir/constant.cpp b/lib/ir/constant.cpp index 58f3b1ab7..87f669e4d 100644 --- a/lib/ir/constant.cpp +++ b/lib/ir/constant.cpp @@ -48,24 +48,27 @@ constant *constant::get_all_ones_value(type *ty) { constant_int::constant_int(type *ty, uint64_t value) : constant(ty, 0), value_(value){ } -constant *constant_int::get(type *ty, uint64_t value) { - return new constant_int(ty, value); +constant_int *constant_int::get(type *ty, uint64_t value) { + context_impl *impl = ty->get_context().p_impl.get(); + constant_int *& cst = impl->int_constants_[std::make_pair(ty, value)]; + if(cst == nullptr) + cst = new constant_int(ty, value); + return cst; } // constant_range // FIXME use something like APInt -constant_range::constant_range(type *ty, uint64_t first, uint64_t last) +constant_range::constant_range(type *ty, constant_int *first, constant_int *last) : constant(ty, 0), first_(first), last_(last){ } -constant *constant_range::get(constant *first, constant *last) { +constant *constant_range::get(constant_int *first, constant_int *last) { assert(first->get_type()->is_integer_ty()); assert(first->get_type() == last->get_type()); unsigned vfirst = ((constant_int*)first)->get_value(); - unsigned vlast = ((constant_int*)last)->get_value(); - assert(vlast > vfirst); - type *ty = tile_type::get(first->get_type(), {vlast - vfirst}); - return new constant_range(ty, vfirst, vlast); + 
assert(vfirst == 0); + type *ty = tile_type::get(first->get_type(), {last}); + return new constant_range(ty, first, last); } @@ -94,6 +97,17 @@ constant *constant_fp::get(context &ctx, double v){ return result; } +// metaparameter +metaparameter::metaparameter(type *ty, unsigned lo, unsigned hi) + : constant_int(ty, 0), lo_(lo), hi_(hi){ } + +metaparameter* metaparameter::create(context &ctx, type *ty, unsigned lo, unsigned hi) { + context_impl *impl = ctx.p_impl.get(); + metaparameter *result = new metaparameter(ty, lo, hi); + impl->mp_constants_.push_back(result); + return result; +} + // undef value undef_value::undef_value(type *ty) : constant(ty, 0) { } diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index acf0c0329..38adcc377 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -409,7 +409,7 @@ std::string retile_inst::shape_suffix(ir::type* ty){ std::string res = "["; const auto& shapes = ty->get_tile_shapes(); for(unsigned i = 0; i < shapes.size(); i++){ - res += std::to_string(ty->get_tile_shapes()[i]); + res += std::to_string(ty->get_tile_shapes()[i]->get_value()); if(i < shapes.size() - 1) res += ", "; } @@ -417,13 +417,13 @@ std::string retile_inst::shape_suffix(ir::type* ty){ return res; } -retile_inst::retile_inst(value *arg, const std::vector &shapes, +retile_inst::retile_inst(value *arg, const type::tile_shapes_t &shapes, const std::string &name, instruction *next) : unary_inst(tile_type::get(arg->get_type()->get_scalar_ty(), shapes), arg, name, next) { } // reshape -instruction* reshape_inst::create(value *arg, const std::vector &shapes, +instruction* reshape_inst::create(value *arg, const type::tile_shapes_t &shapes, const std::string &name, instruction *next) { return new reshape_inst(arg, shapes, name, next); } @@ -431,14 +431,14 @@ instruction* reshape_inst::create(value *arg, const std::vector &shape // splat -instruction* splat_inst::create(value *arg, const std::vector &shapes, +instruction* splat_inst::create(value *arg, const type::tile_shapes_t &shapes, const std::string &name, instruction *next) { return new splat_inst(arg, shapes, name, next); } // broadcast -instruction* broadcast_inst::create(value *arg, const std::vector &shapes, +instruction* broadcast_inst::create(value *arg, const type::tile_shapes_t &shapes, const std::string &name, instruction *next) { return new broadcast_inst(arg, shapes, name, next); } @@ -470,7 +470,7 @@ get_global_range_inst::get_global_range_inst(type *ty, unsigned axis, } -instruction* get_global_range_inst::create(context &ctx, unsigned axis, unsigned size, +instruction* get_global_range_inst::create(context &ctx, unsigned axis, type::tile_shapes_t::value_type size, const std::string &name, instruction *next) { type *int_ty = type::get_int32_ty(ctx); type *tile_ty = tile_type::get(int_ty, {size}); diff --git a/lib/ir/type.cpp b/lib/ir/type.cpp index c790120fb..5aebd94a5 100644 --- a/lib/ir/type.cpp +++ b/lib/ir/type.cpp @@ -3,6 +3,7 @@ #include "ir/context.h" #include "ir/context_impl.h" #include "ir/value.h" +#include "ir/constant.h" namespace tdl{ namespace ir{ @@ -63,7 +64,7 @@ type * type::get_pointer_element_ty() const { } -const std::vector &type::get_tile_shapes() const { +const type::tile_shapes_t &type::get_tile_shapes() const { assert(is_tile_ty()); return ((tile_type*)this)->get_shapes(); } @@ -148,7 +149,7 @@ bool composite_type::index_valid(value *idx) const{ // tile_type class //===----------------------------------------------------------------------===// -tile_type::tile_type(type 
*ty, const std::vector &shapes) +tile_type::tile_type(type *ty, const tile_shapes_t &shapes) : composite_type(ty->get_context(), TileTyID), shapes_(shapes) { contained_tys_.push_back(ty); } @@ -159,8 +160,8 @@ bool tile_type::is_valid_elt_ty(type *ty) { unsigned tile_type::get_num_elements() const { unsigned res = 1; - for(unsigned shape: shapes_) - res *= shape; + for(auto shape: shapes_) + res *= shape->get_value(); return res; } @@ -168,7 +169,7 @@ unsigned tile_type::get_bitwidth() const { return get_num_elements() * get_tile_element_ty()->get_primitive_size_in_bits(); } -tile_type* tile_type::get(type *elt_ty, const std::vector &shapes) { +tile_type* tile_type::get(type *elt_ty, const tile_shapes_t &shapes) { assert(elt_ty && "Can't get a tile of type!"); assert(shapes.size() && "Can't create a tile with empty shapes!"); assert(is_valid_elt_ty(elt_ty) && "Invalid type for pointer element!"); @@ -185,6 +186,10 @@ tile_type* tile_type::get_same_shapes(type *ty, type *ref){ return get(ty, ref->get_tile_shapes()); } +type::tile_shapes_t::value_type tile_type::make_one(ir::context& ctx){ + return constant_int::get(type::get_int32_ty(ctx), 1); +} + //===----------------------------------------------------------------------===// // function_type class From 1b5f7f21397860a4dd8f3a086a1140e9cd4c5cf6 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 23 Feb 2019 22:24:12 -0500 Subject: [PATCH 084/494] [code generation] basic metaparameter support --- TODO | 4 ++- examples/matrix.cpp | 40 +++++++++++++++-------------- include/ast/ast.h | 9 +++++-- include/ast/parser.y | 5 ++-- include/ast/scanner.l | 1 + include/ir/module.h | 2 ++ include/ir/type.h | 15 ----------- lib/ast/lowering.cpp | 41 ++++++++++++++++++++---------- lib/codegen/selection.cpp | 53 ++++++++++++++++----------------------- lib/ir/builder.cpp | 2 +- lib/ir/module.cpp | 12 ++++++--- 11 files changed, 97 insertions(+), 87 deletions(-) diff --git a/TODO b/TODO index 72f439161..299ceba34 100644 --- a/TODO +++ b/TODO @@ -1,5 +1,7 @@ +[Frontend] + - SCOPES + [Intermediate Representation] - proper naming scheme - symbols table - - name conflicts on globals? - separate header for typedef (e.g., type::tile_shapes_t) to reduce compilation time diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 86ec4f249..9c425c6f3 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -38,29 +38,31 @@ extern translation_unit *ast_root; const char src[] = "\ +const tunable int32 TM;\ +const tunable int32 TN;\ void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K, int32 bound){\ - int32 rxa[16] = get_global_range[16](0);\ - int32 ryb[16] = get_global_range[16](1);\ + int32 rxa[TM] = get_global_range[TM](0);\ + int32 ryb[TN] = get_global_range[TN](1);\ int32 rka[8] = 0 ... 8;\ int32 rkb[8] = 0 ... 
8;\ - int32 rxc[16] = get_global_range[16](0);\ - int32 ryc[16] = get_global_range[16](1);\ - fp32 C[16, 16] = 0;\ + int32 rxc[TM] = get_global_range[TM](0);\ + int32 ryc[TN] = get_global_range[TN](1);\ + fp32 C[TM, TN] = 0;\ int32 k;\ - fp32* pa[16, 8] = a + rxa[:, newaxis] + rka[newaxis, :]*M;\ - fp32* pb[16, 8] = b + ryb[:, newaxis] + rkb[newaxis, :]*K;\ - fp32* pc[16, 16] = c + rxc[:, newaxis] + ryc[newaxis, :]*M;\ - fp32 a[16, 8] = *pa;\ - fp32 b[16, 8] = *pb;\ - int1 checkc0[16] = rxc < M;\ - int1 checkc1[16] = ryc < N;\ - int1 checkc[16, 16] = checkc0[:, newaxis] && checkc1[newaxis, :];\ + fp32* pa[TM, 8] = a + rxa[:, newaxis] + rka[newaxis, :]*M;\ + fp32* pb[TN, 8] = b + ryb[:, newaxis] + rkb[newaxis, :]*K;\ + fp32* pc[TM, TN] = c + rxc[:, newaxis] + ryc[newaxis, :]*M;\ + fp32 a[TM, 8] = *pa;\ + fp32 b[TN, 8] = *pb;\ + int1 checkc0[TM] = rxc < M;\ + int1 checkc1[TN] = ryc < N;\ + int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];\ for(k = K; k > 0; k = k - 8){\ - int1 checka[16, 8] = (k > 8);\ - int1 checkb[16, 8] = (k > 8);\ - int1 checka0[16];\ + int1 checka[TM, 8] = (k > 8);\ + int1 checkb[TN, 8] = (k > 8);\ + int1 checka0[TM];\ int1 checka1[8];\ - int1 checkb0[16];\ + int1 checkb0[TN];\ int1 checkb1[8];\ C = dot(a, b, C);\ pa = pa + 8*M;\ @@ -183,8 +185,8 @@ int main() { llvm::LLVMContext llvm_context; llvm::Module llvm_module("test", llvm_context); -// context.p_impl->mp_constants_[0]->set_value(16); -// context.p_impl->mp_constants_[1]->set_value(16); + context.p_impl->mp_constants_[0]->set_value(16); + context.p_impl->mp_constants_[1]->set_value(16); // context.p_impl->mp_constants_[2]->set_value(8); // create passes diff --git a/include/ast/ast.h b/include/ast/ast.h index a24d0480f..32077f1a2 100644 --- a/include/ast/ast.h +++ b/include/ast/ast.h @@ -57,6 +57,7 @@ enum TYPE_T{ }; enum STORAGE_SPEC_T{ + CONST_T, TUNABLE_T, KERNEL_T, READONLY_T, WRITEONLY_T, @@ -399,12 +400,14 @@ class declaration_specifier: public node{ public: using node::node; virtual ir::type* type(ir::module *mod) const = 0; + virtual std::vector storage() const = 0; }; class typed_declaration_specifier: public declaration_specifier { public: typed_declaration_specifier(TYPE_T ty): ty_(ty){ } ir::type* type(ir::module *mod) const; + std::vector storage() const; private: const TYPE_T ty_; @@ -415,6 +418,7 @@ public: storage_declaration_specifier(STORAGE_SPEC_T storage_spec, node *decl_spec) : storage_spec_(storage_spec), decl_spec_((declaration_specifier*)decl_spec) {} ir::type* type(ir::module *mod) const; + std::vector storage() const; private: const STORAGE_SPEC_T storage_spec_; @@ -429,6 +433,7 @@ public: decl_((declarator*)decl) { } ir::type* type(ir::module *mod) const; + std::vector storage() const; const identifier* id() const; public: @@ -485,10 +490,10 @@ private: public: tile(node *id, node *shapes) - : declarator(id), shapes_((list*)(shapes)) { } + : declarator(id), shapes_((list*)(shapes)) { } public: - const list* shapes_; + const list* shapes_; }; class function: public declarator{ diff --git a/include/ast/parser.y b/include/ast/parser.y index 826204f8a..960ae25a5 100644 --- a/include/ast/parser.y +++ b/include/ast/parser.y @@ -46,7 +46,7 @@ STORAGE_SPEC_T get_storage_spec(node *op) { return ((token*)op)->storage_spec;} %} %token IDENTIFIER CONSTANT STRING_LITERAL -%token TUNABLE KERNEL READONLY WRITEONLY +%token TUNABLE KERNEL READONLY WRITEONLY CONST %token PTR_OP INC_OP DEC_OP LEFT_OP RIGHT_OP LE_OP GE_OP EQ_OP NE_OP %token AND_OP OR_OP MUL_ASSIGN DIV_ASSIGN MOD_ASSIGN 
ADD_ASSIGN %token SUB_ASSIGN LEFT_ASSIGN RIGHT_ASSIGN AND_ASSIGN @@ -360,7 +360,8 @@ init_declarator ; storage_class_specifier - : TUNABLE { $$ = new token(TUNABLE_T); } + : CONST { $$ = new token(CONST_T); } + | TUNABLE { $$ = new token(TUNABLE_T); } | KERNEL { $$ = new token(KERNEL_T); } | READONLY { $$ = new token(READONLY_T); } | WRITEONLY { $$ = new token(WRITEONLY_T); } diff --git a/include/ast/scanner.l b/include/ast/scanner.l index 885404ca3..4f32c1f69 100644 --- a/include/ast/scanner.l +++ b/include/ast/scanner.l @@ -16,6 +16,7 @@ int comment(); %} %% +"const" { count(); return(CONST); } "tunable" { count(); return(TUNABLE); } "kernel" { count(); return(KERNEL); } "readonly" { count(); return(READONLY); } diff --git a/include/ir/module.h b/include/ir/module.h index 26b6c6769..41a729999 100644 --- a/include/ir/module.h +++ b/include/ir/module.h @@ -58,6 +58,7 @@ public: void set_value(const std::string& name, value* x); void set_type(const std::string& name, basic_block* block, type* x); void set_type(const std::string& name, type* x); + void set_const(const std::string& name); void set_continue_fn(std::function fn); // Getters value *get_value(const std::string& name, basic_block* block); @@ -83,6 +84,7 @@ private: builder builder_; std::map values_; std::map types_; + std::set const_; std::set sealed_blocks_; std::map> incomplete_phis_; functions_list_t functions_; diff --git a/include/ir/type.h b/include/ir/type.h index 6e2049ddd..1cd74e259 100644 --- a/include/ir/type.h +++ b/include/ir/type.h @@ -99,25 +99,10 @@ public: static integer_type *get_int64_ty(context &ctx); static integer_type *get_int128_ty(context &ctx); - // Attributes - type* set_tunable() { is_tunable_ = true; return this; } - type* set_readonly() { is_readonly_ = true; return this; } - type* set_writeonly() { is_writeonly_ = true; return this; } - type* set_kernel() { is_kernel_ = true; return this; } - - bool get_tunable() { return is_tunable_; } - bool get_readonly() { return is_readonly_; } - bool get_writeonly() { return is_writeonly_; } - bool get_kernel() { return is_kernel_; } private: context &ctx_; id_t id_; - // attributes - bool is_tunable_; - bool is_readonly_; - bool is_writeonly_; - bool is_kernel_; protected: contained_tys_vec_t contained_tys_; diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 993856b2d..714d8b16f 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -170,22 +170,31 @@ ir::type* typed_declaration_specifier::type(ir::module *mod) const { } } -ir::type* storage_declaration_specifier::type(ir::module *mod) const { - ir::type* result = decl_spec_->type(mod); - switch(storage_spec_){ - case TUNABLE_T: return result->set_tunable(); - case KERNEL_T: return result->set_kernel(); - case READONLY_T: return result->set_readonly(); - case WRITEONLY_T: return result->set_writeonly(); - default: throw std::runtime_error("unreachable"); - } +std::vector typed_declaration_specifier::storage() const { + return {}; } + +ir::type* storage_declaration_specifier::type(ir::module *mod) const { + return decl_spec_->type(mod); +} + +std::vector storage_declaration_specifier::storage() const { + auto result = decl_spec_->storage(); + result.push_back(storage_spec_); + return result; +} + + /* Parameter */ ir::type* parameter::type(ir::module *mod) const { return decl_->type(mod, spec_->type(mod)); } +std::vector parameter::storage() const { + return spec_->storage(); +} + const identifier *parameter::id() const { return decl_->id(); } @@ -209,8 +218,11 @@ const std::string 
&identifier::name() const{ // Tile ir::type* tile::type_impl(ir::module *mod, ir::type *type) const{ ir::type::tile_shapes_t shapes; - for(constant *cst: shapes_->values()) - shapes.push_back((ir::constant_int*)cst->codegen(mod)); + for(expression *expr: shapes_->values()){ + ir::constant_int *shape = dynamic_cast(expr->codegen(mod)); + assert(shape); + shapes.push_back(shape); + } return ir::tile_type::get(type, shapes); } @@ -368,11 +380,12 @@ void initializer::set_specifier(const declaration_specifier *spec) { ir::value* initializer::codegen(ir::module * mod) const{ ir::type *ty = decl_->type(mod, spec_->type(mod)); + std::vector storage = spec_->storage(); std::string name = decl_->id()->name(); ir::value *value = ir::undef_value::get(ty); - if(ty->get_tunable()){ + if(std::find(storage.begin(), storage.end(), TUNABLE_T) != storage.end()){ assert(expr_ == nullptr); - //TODO + //TODO: implement ranges value = ir::metaparameter::create(mod->get_context(), ty, 4, 8); } if(expr_){ @@ -383,6 +396,8 @@ ir::value* initializer::codegen(ir::module * mod) const{ value->set_name(name); mod->set_value(name, value); mod->set_type(name, ty); + if(std::find(storage.begin(), storage.end(), CONST_T) != storage.end()) + mod->set_const(name); return value; } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 8665714a2..a028134fa 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -44,7 +44,6 @@ llvm::Type *distributed_tile::make_vector_ty(llvm::Type *ty, size_t vector_size) return VectorType::get(ty, vector_size); } - distributed_tile::distributed_tile(Type *ty, const shapes_t &shapes, const axes_t &axes, llvm::IRBuilder<> &builder, bool vectorize) : tile(make_vector_ty(ty, vectorize?axes[0].contiguous:1), shapes), axes_(axes), builder_(builder) { vector_size_ = vectorize?ty_->getVectorNumElements():1; @@ -150,16 +149,6 @@ Value* shared_tile::get_value(indices_t idx) { return builder_.CreateLoad(ptr); } -/* Utils */ -std::vector selection::extract_shapes(ir::value *v) { - const auto& shapes = v->get_type()->get_tile_shapes(); - std::vector result(shapes.size()); - for(ir::constant_int* cst: shapes) - result.push_back(cst->get_value()); - return result; -} - - /* convert ir::type to Type */ Type *selection::llvm_type(ir::type *ty, LLVMContext &ctx) { // function @@ -310,12 +299,11 @@ std::vector delinearize(Value *trailing, std::vector &shapes, } void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { - const auto& shapes = extract_shapes(v); + const auto& shapes = v->get_type()->get_tile_shapes(); size_t dim = shapes.size(); std::vector contiguous(dim); std::vector warp_size(dim); std::vector n_warps(dim); - std::cout << v->get_name() << " " << typeid(*v).name() << std::endl; for(unsigned i = 0; i < shapes.size(); i++){ std::string str_i = std::to_string(i); contiguous[i] = *params_->get_param(v, "p0.d" + str_i); @@ -332,7 +320,7 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id Value *thread_id = builder.CreateAdd(thread_id_in_warp[k], builder.CreateMul(warp_id[k], warp_size_k)); thread_id = builder.CreateMul(thread_id, contiguous_k); unsigned per_block = contiguous[k] * warp_size[k] * n_warps[k]; - unsigned per_thread = contiguous[k] * shapes[k] / per_block; + unsigned per_thread = contiguous[k] * shapes[k]->get_value() / per_block; std::vector idx_list(per_thread); for(unsigned n = 0 ; n < per_thread; n++){ unsigned offset = n / contiguous[k] * per_block + n % contiguous[k]; @@ 
-348,8 +336,8 @@ void selection::create_grids(std::vector &grids, // get number of dimensions greater than 1 auto get_tile_gt1_dim = [&](ir::value *v){ unsigned result = 0; - for(unsigned shape: extract_shapes(v)) { - result += (shape > 1)?shape:0; + for(ir::constant_int* shape: v->get_type()->get_tile_shapes()) { + result += (shape->get_value() > 1)?shape->get_value():0; } return result; }; @@ -365,11 +353,11 @@ void selection::create_grids(std::vector &grids, for(ir::value *op: user->ops()) bind_references(op); // bind - const auto& shapes = extract_shapes(v); + const auto& shapes = v->get_type()->get_tile_shapes(); if(dynamic_cast(v) || buffer_info_->is_double(v)) return; for(size_t d = 0; d < shapes.size(); d++){ - if(shapes[d] == 1) + if(shapes[d]->get_value() == 1) continue; unsigned *x = params_->get_param(v, "p0.d" + std::to_string(d)); ir::value *&r = references[x]; @@ -397,7 +385,10 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, for(ir::value *op: user->ops()) create_tile(op, builder, references, seen, sh_mem_ptr); LLVMContext &ctx = builder.getContext(); - const auto& shapes = extract_shapes(v); + const auto& shapes = v->get_type()->get_tile_shapes(); + std::vector shapes2; + for(ir::constant_int* shape: shapes) + shapes2.push_back(shape->get_value()); Type* ty = llvm_type(v->get_type()->get_scalar_ty(), ctx); // create shared tile if(dynamic_cast(v) || (buffer_info_->is_double(v))){ @@ -408,7 +399,7 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, size_t offset = alloc_->get_offset(v); Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset)); ptr = builder.CreateBitCast(ptr, ptr_ty); - tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)}); + tmap_.insert({v, new shared_tile(ty, shapes2, ptr, builder)}); } } // phi-node (double-buffering) @@ -427,13 +418,13 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, Value *pre_ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(alloc_->get_offset(phi))); pre_ptr = builder.CreateBitCast(pre_ptr, ptr->getType()); Value *next_ptr = builder.CreateGEP(ptr, offset); - tmap_.insert({phi, new shared_tile(ty, shapes, ptr, builder, offset)}); + tmap_.insert({phi, new shared_tile(ty, shapes2, ptr, builder, offset)}); for(unsigned i = 0; i < phi->get_num_incoming(); i++) { ir::basic_block* inc_block = phi->get_incoming_block(i); ir::value* inc_value = phi->get_incoming_value(i); ir::value* terminator = inc_block->get_inst_list().back(); bool is_loop_latch = buffer_info_->is_loop_latch(phi, terminator); - tmap_.insert({inc_value, new shared_tile(ty, shapes, is_loop_latch?next_ptr:pre_ptr, builder)}); + tmap_.insert({inc_value, new shared_tile(ty, shapes2, is_loop_latch?next_ptr:pre_ptr, builder)}); } } else @@ -441,10 +432,10 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, } // create distributed tile else { - const auto &shapes = extract_shapes(v); + const auto &shapes = v->get_type()->get_tile_shapes(); std::vector axes(shapes.size()); for(size_t d = 0; d < shapes.size(); d++){ - if(shapes[d] > 1){ + if(shapes[d]->get_value() > 1){ unsigned *x = params_->get_param(v, "p0.d" + std::to_string(d)); axes[d] = axes_.at(x); } @@ -454,7 +445,7 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, } } bool vectorize = dynamic_cast(v); - distributed_tile *T = new distributed_tile(ty, shapes, axes, builder, vectorize); + distributed_tile *T = new distributed_tile(ty, shapes2, axes, builder, vectorize); tmap_.insert({v, T}); // constant range 
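// [editor's note] create_tile above lowers symbolic shapes to concrete
// extents (`shapes2`) by calling get_value() on each constant_int*, which
// assumes every metaparameter was bound via set_value() before selection
// runs. A hedged, standalone sketch of that bind-then-flatten step --
// hypothetical types, not the real ir:: classes:
#include <cassert>
#include <cstdint>
#include <vector>

struct MetaInt {
    uint64_t value = 0;
    bool bound = false;
    void set_value(uint64_t v) { value = v; bound = true; }
    uint64_t get_value() const {
        assert(bound && "tunable used before set_value()");
        return value;
    }
};

// Flatten symbolic shapes into the plain extents the LLVM tiles need.
std::vector<unsigned> flatten(const std::vector<MetaInt*>& shapes) {
    std::vector<unsigned> out;
    out.reserve(shapes.size());
    for (MetaInt* s : shapes)
        out.push_back(static_cast<unsigned>(s->get_value()));
    return out;
}

int main() {
    MetaInt TM, TN;
    TM.set_value(16);  // as matrix.cpp does via mp_constants_[i]->set_value(...)
    TN.set_value(16);
    std::vector<MetaInt*> shapes = {&TM, &TN};
    std::vector<unsigned> extents = flatten(shapes);  // {16, 16}
    return extents.size() == 2 ? 0 : 1;
}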
if(dynamic_cast(v)){ @@ -542,7 +533,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & distributed_tile* result = (distributed_tile*)ti; if(!ins->get_type()->is_tile_ty()) return; - const auto& shapes = extract_shapes(ins); + const auto& shapes = ins->get_type()->get_tile_shapes(); // global_range if(auto *x = dynamic_cast(ins)) { static std::array ctaid = { @@ -552,7 +543,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & }; Function *get_group_id = Intrinsic::getDeclaration(module, ctaid[x->get_axis()]); Value *group_id = builder.CreateCall(get_group_id, {}); - Value *offset = builder.CreateMul(builder.getInt32(shapes[0]), group_id); + Value *offset = builder.CreateMul(builder.getInt32(shapes[0]->get_value()), group_id); result->for_each([&](indices_t idx){ BinaryOperator *bin = static_cast(idx[0]); result->set_value(idx, insert_masked(idx, [&]{ return builder.CreateAdd(bin, offset); })); @@ -565,7 +556,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & result->for_each([&](indices_t out_idx){ indices_t in_idx; for(size_t k = 0; k < shapes.size(); k++){ - if(shapes[k] > 1) + if(shapes[k]->get_value() > 1) in_idx.push_back(out_idx[k]); } result->set_value(out_idx, in_tile->get_value(in_idx)); @@ -580,12 +571,12 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & // broadcast else if(dynamic_cast(ins)) { ir::value* in = ins->get_operand(0); - const auto& in_shapes = extract_shapes(in); + const auto& in_shapes = in->get_type()->get_tile_shapes(); distributed_tile *in_tile = (distributed_tile*)tmap_.at(in); result->for_each([&](indices_t out_idx){ indices_t in_idx = out_idx; for(size_t k = 0; k < in_idx.size(); k++){ - if(in_shapes[k] == 1) + if(in_shapes[k]->get_value() == 1) in_idx[k] = builder.getInt32(0); } result->set_value(out_idx, in_tile->get_value(in_idx)); @@ -627,7 +618,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & Function *f_mul_add = Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {llvm_type(C->get_type()->get_scalar_ty(), ctx)}); result->for_each([&](indices_t idx){ Value *res = tmap_.at(C)->get_value(idx); - unsigned NK = extract_shapes(A)[1]; + unsigned NK = A->get_type()->get_tile_shapes()[1]->get_value(); for(unsigned K = 0; K < NK; ++K){ indices_t a_idx = {idx[0], builder.getInt32(K)}; indices_t b_idx = {idx[1], builder.getInt32(K)}; diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index 6622125b5..c2b338236 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -10,7 +10,7 @@ namespace tdl{ namespace ir{ builder::builder(context &ctx): - ctx_(ctx){} + ctx_(ctx), block_(nullptr), insert_point_(nullptr) {} //===----------------------------------------------------------------------===// // utilities diff --git a/lib/ir/module.cpp b/lib/ir/module.cpp index a8a11ff1c..66a4784cc 100644 --- a/lib/ir/module.cpp +++ b/lib/ir/module.cpp @@ -37,6 +37,10 @@ void module::set_type(const std::string& name, ir::type *type){ return set_type(name, builder_.get_insert_block(), type); } +void module::set_const(const std::string& name){ + const_.insert(name); +} + void module::set_continue_fn(std::function fn) { continue_fn_ = fn; } @@ -91,10 +95,12 @@ ir::value *module::add_phi_operands(const std::string& name, ir::phi_node *&phi) ir::value *module::get_value_recursive(const std::string& name, ir::basic_block *block) { ir::value *result; + bool is_const = const_.find(name) != const_.end(); auto &preds 
= block->get_predecessors(); + ir::type *ty = get_type(name, block); if(block) - if(sealed_blocks_.find(block) == sealed_blocks_.end()){ - incomplete_phis_[block][name] = make_phi(get_type(name, block), 1, block); + if(!is_const && sealed_blocks_.find(block) == sealed_blocks_.end()){ + incomplete_phis_[block][name] = make_phi(ty, 1, block); result = (ir::value*)incomplete_phis_[block][name]; } else if(preds.size() <= 1){ @@ -102,7 +108,7 @@ ir::value *module::get_value_recursive(const std::string& name, ir::basic_block result = get_value(name, has_pred?preds.front():nullptr); } else{ - result = make_phi(get_type(name, block), 1, block); + result = make_phi(ty, 1, block); set_value(name, block, result); result = add_phi_operands(name, (ir::phi_node*&)result); } From 6b49818282fa9fa4ffc5142e01c97039d5802a69 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 24 Feb 2019 14:20:40 -0500 Subject: [PATCH 085/494] [filesystem] rename tdl -> triton --- CMakeLists.txt | 14 +-- examples/CMakeLists.txt | 2 +- examples/matrix.cpp | 97 ++++++++++++------- include/{ => triton}/ast/ast.h | 0 include/{ => triton}/ast/parser.y | 4 +- include/{ => triton}/ast/scanner.l | 0 include/{ => triton}/codegen/allocation.h | 0 include/{ => triton}/codegen/barriers.h | 0 include/{ => triton}/codegen/buffer_info.h | 0 include/{ => triton}/codegen/layout.h | 0 include/{ => triton}/codegen/liveness.h | 0 include/{ => triton}/codegen/selection.h | 10 +- include/{ => triton}/codegen/shared_copy.h | 0 include/{ => triton}/codegen/tune.h | 0 include/{ => triton}/codegen/vectorize.h | 0 include/{ => triton}/driver/backend.h | 0 include/{ => triton}/driver/buffer.h | 4 +- include/{ => triton}/driver/context.h | 4 +- include/{ => triton}/driver/cublas.h | 14 +-- include/{ => triton}/driver/device.h | 4 +- include/{ => triton}/driver/dispatch.h | 10 +- include/{ => triton}/driver/error.h | 2 +- include/{ => triton}/driver/event.h | 2 +- include/{ => triton}/driver/handle.h | 2 +- include/{ => triton}/driver/kernel.h | 4 +- include/{ => triton}/driver/module.h | 6 +- include/{ => triton}/driver/platform.h | 2 +- include/{ => triton}/driver/stream.h | 8 +- .../external/CUDA/builtin_types.h | 0 .../external/CUDA/channel_descriptor.h | 0 .../external/CUDA/crt/host_config.h | 0 .../external/CUDA/crt/host_defines.h | 0 .../{ => triton}/external/CUDA/cuComplex.h | 0 include/{ => triton}/external/CUDA/cublas.h | 0 .../{ => triton}/external/CUDA/cublas_api.h | 0 .../{ => triton}/external/CUDA/cublas_v2.h | 0 include/{ => triton}/external/CUDA/cuda.h | 0 .../external/CUDA/cuda_device_runtime_api.h | 0 .../{ => triton}/external/CUDA/cuda_fp16.h | 0 .../{ => triton}/external/CUDA/cuda_fp16.hpp | 0 .../{ => triton}/external/CUDA/cuda_runtime.h | 0 .../external/CUDA/cuda_runtime_api.h | 0 include/{ => triton}/external/CUDA/cudnn.h | 0 include/{ => triton}/external/CUDA/cusparse.h | 0 .../{ => triton}/external/CUDA/device_types.h | 0 .../external/CUDA/driver_functions.h | 0 .../{ => triton}/external/CUDA/driver_types.h | 0 .../{ => triton}/external/CUDA/host_config.h | 0 .../{ => triton}/external/CUDA/host_defines.h | 0 .../external/CUDA/library_types.h | 0 include/{ => triton}/external/CUDA/nvml.h | 0 include/{ => triton}/external/CUDA/nvrtc.h | 0 .../external/CUDA/surface_types.h | 0 .../external/CUDA/texture_types.h | 0 .../external/CUDA/vector_functions.h | 0 .../external/CUDA/vector_functions.hpp | 0 .../{ => triton}/external/CUDA/vector_types.h | 0 include/{ => triton}/ir/basic_block.h | 0 include/{ => triton}/ir/builder.h | 0 
include/{ => triton}/ir/constant.h | 0 include/{ => triton}/ir/context.h | 2 +- include/{ => triton}/ir/context_impl.h | 2 +- include/{ => triton}/ir/function.h | 0 include/{ => triton}/ir/instructions.h | 2 +- include/{ => triton}/ir/module.h | 0 include/triton/ir/print.h | 17 ++++ include/{ => triton}/ir/type.h | 0 include/{ => triton}/ir/value.h | 0 include/{ => triton}/tools/sys/getenv.hpp | 0 include/{ => triton}/tools/sys/mkdir.hpp | 0 lib/ast/lowering.cpp | 14 +-- lib/codegen/allocation.cpp | 18 ++-- lib/codegen/barriers.cpp | 14 +-- lib/codegen/buffer_info.cpp | 12 +-- lib/codegen/layout.cpp | 10 +- lib/codegen/liveness.cpp | 14 +-- lib/codegen/selection.cpp | 14 +-- lib/codegen/shared_copy.cpp | 12 +-- lib/codegen/tune.cpp | 12 +-- lib/codegen/vectorize.cpp | 12 +-- lib/driver/backend.cpp | 12 +-- lib/driver/buffer.cpp | 8 +- lib/driver/context.cpp | 8 +- lib/driver/device.cpp | 2 +- lib/driver/dispatch.cpp | 4 +- lib/driver/error.cpp | 2 +- lib/driver/event.cpp | 2 +- lib/driver/handle.cpp | 2 +- lib/driver/kernel.cpp | 4 +- lib/driver/module.cpp | 8 +- lib/driver/platform.cpp | 4 +- lib/driver/stream.cpp | 14 +-- lib/ir/basic_block.cpp | 8 +- lib/ir/builder.cpp | 10 +- lib/ir/constant.cpp | 8 +- lib/ir/context.cpp | 6 +- lib/ir/function.cpp | 6 +- lib/ir/instructions.cpp | 10 +- lib/ir/module.cpp | 10 +- lib/ir/print.cpp | 59 +++++++++++ lib/ir/type.cpp | 10 +- lib/ir/value.cpp | 4 +- 102 files changed, 323 insertions(+), 222 deletions(-) rename include/{ => triton}/ast/ast.h (100%) rename include/{ => triton}/ast/parser.y (98%) rename include/{ => triton}/ast/scanner.l (100%) rename include/{ => triton}/codegen/allocation.h (100%) rename include/{ => triton}/codegen/barriers.h (100%) rename include/{ => triton}/codegen/buffer_info.h (100%) rename include/{ => triton}/codegen/layout.h (100%) rename include/{ => triton}/codegen/liveness.h (100%) rename include/{ => triton}/codegen/selection.h (96%) rename include/{ => triton}/codegen/shared_copy.h (100%) rename include/{ => triton}/codegen/tune.h (100%) rename include/{ => triton}/codegen/vectorize.h (100%) rename include/{ => triton}/driver/backend.h (100%) rename include/{ => triton}/driver/buffer.h (95%) rename include/{ => triton}/driver/context.h (96%) rename include/{ => triton}/driver/cublas.h (97%) rename include/{ => triton}/driver/device.h (97%) rename include/{ => triton}/driver/dispatch.h (98%) rename include/{ => triton}/driver/error.h (99%) rename include/{ => triton}/driver/event.h (97%) rename include/{ => triton}/driver/handle.h (98%) rename include/{ => triton}/driver/kernel.h (96%) rename include/{ => triton}/driver/module.h (94%) rename include/{ => triton}/driver/platform.h (97%) rename include/{ => triton}/driver/stream.h (94%) rename include/{ => triton}/external/CUDA/builtin_types.h (100%) rename include/{ => triton}/external/CUDA/channel_descriptor.h (100%) rename include/{ => triton}/external/CUDA/crt/host_config.h (100%) rename include/{ => triton}/external/CUDA/crt/host_defines.h (100%) rename include/{ => triton}/external/CUDA/cuComplex.h (100%) rename include/{ => triton}/external/CUDA/cublas.h (100%) rename include/{ => triton}/external/CUDA/cublas_api.h (100%) rename include/{ => triton}/external/CUDA/cublas_v2.h (100%) rename include/{ => triton}/external/CUDA/cuda.h (100%) rename include/{ => triton}/external/CUDA/cuda_device_runtime_api.h (100%) rename include/{ => triton}/external/CUDA/cuda_fp16.h (100%) rename include/{ => triton}/external/CUDA/cuda_fp16.hpp (100%) rename include/{ => 
triton}/external/CUDA/cuda_runtime.h (100%) rename include/{ => triton}/external/CUDA/cuda_runtime_api.h (100%) rename include/{ => triton}/external/CUDA/cudnn.h (100%) rename include/{ => triton}/external/CUDA/cusparse.h (100%) rename include/{ => triton}/external/CUDA/device_types.h (100%) rename include/{ => triton}/external/CUDA/driver_functions.h (100%) rename include/{ => triton}/external/CUDA/driver_types.h (100%) rename include/{ => triton}/external/CUDA/host_config.h (100%) rename include/{ => triton}/external/CUDA/host_defines.h (100%) rename include/{ => triton}/external/CUDA/library_types.h (100%) rename include/{ => triton}/external/CUDA/nvml.h (100%) rename include/{ => triton}/external/CUDA/nvrtc.h (100%) rename include/{ => triton}/external/CUDA/surface_types.h (100%) rename include/{ => triton}/external/CUDA/texture_types.h (100%) rename include/{ => triton}/external/CUDA/vector_functions.h (100%) rename include/{ => triton}/external/CUDA/vector_functions.hpp (100%) rename include/{ => triton}/external/CUDA/vector_types.h (100%) rename include/{ => triton}/ir/basic_block.h (100%) rename include/{ => triton}/ir/builder.h (100%) rename include/{ => triton}/ir/constant.h (100%) rename include/{ => triton}/ir/context.h (90%) rename include/{ => triton}/ir/context_impl.h (97%) rename include/{ => triton}/ir/function.h (100%) rename include/{ => triton}/ir/instructions.h (99%) rename include/{ => triton}/ir/module.h (100%) create mode 100644 include/triton/ir/print.h rename include/{ => triton}/ir/type.h (100%) rename include/{ => triton}/ir/value.h (100%) rename include/{ => triton}/tools/sys/getenv.hpp (100%) rename include/{ => triton}/tools/sys/mkdir.hpp (100%) create mode 100644 lib/ir/print.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 814206cfc..3326b3ff6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,12 +1,12 @@ cmake_minimum_required(VERSION 2.8) -project(TDL) +project(triton) include(CTest) # FLEX/YACC find_package(BISON) find_package(FLEX) -BISON_TARGET(Parser ${CMAKE_CURRENT_SOURCE_DIR}/include/ast/parser.y ${CMAKE_CURRENT_BINARY_DIR}/parser.cpp) -FLEX_TARGET(Lexer ${CMAKE_CURRENT_SOURCE_DIR}/include/ast/scanner.l ${CMAKE_CURRENT_BINARY_DIR}/scanner.cpp) +BISON_TARGET(Parser ${CMAKE_CURRENT_SOURCE_DIR}/include/triton/ast/parser.y ${CMAKE_CURRENT_BINARY_DIR}/parser.cpp) +FLEX_TARGET(Lexer ${CMAKE_CURRENT_SOURCE_DIR}/include/triton/ast/scanner.l ${CMAKE_CURRENT_BINARY_DIR}/scanner.cpp) get_filename_component(BISON_Parser_INCLUDE_DIRECTORIES ${BISON_Parser_OUTPUT_HEADER} DIRECTORY) include_directories(${BISON_Parser_INCLUDE_DIRECTORIES}) @@ -31,10 +31,10 @@ add_custom_target( ALL SOURCES ${ALL_SRC} ) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${LLVM_CXXFLAGS} -std=c++11") -# TDL -file(GLOB_RECURSE LIBTDL_SRC lib/*.cpp) -add_library(tdl SHARED ${LIBTDL_SRC} ${BISON_Parser_OUTPUTS} ${FLEX_Lexer_OUTPUTS}) -target_link_libraries(tdl LLVM) +# Triton +file(GLOB_RECURSE LIBTRITON_SRC lib/*.cpp) +add_library(triton SHARED ${LIBTRITON_SRC} ${BISON_Parser_OUTPUTS} ${FLEX_Lexer_OUTPUTS}) +target_link_libraries(triton LLVM) # Examples add_subdirectory(examples) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 8419125c2..4a235d45a 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -2,5 +2,5 @@ foreach(PROG matrix) add_executable(${PROG} ${PROG}.cpp) set_target_properties(${PROG} PROPERTIES OUTPUT_NAME ${PROG}) include_directories(/usr/local/cuda/include/) - 
target_link_libraries(${PROG} tdl cuda) + target_link_libraries(${PROG} triton cuda) endforeach(PROG) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 9c425c6f3..e0fd34646 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -2,19 +2,19 @@ #include #include "cuda.h" #include "llvm/IR/Verifier.h" -#include "ast/ast.h" -#include "ir/context.h" -#include "ir/module.h" -#include "ir/print.h" -#include "ir/context_impl.h" -#include "codegen/selection.h" -#include "codegen/tune.h" -#include "codegen/shared_copy.h" -#include "codegen/allocation.h" -#include "codegen/liveness.h" -#include "codegen/vectorize.h" -#include "codegen/buffer_info.h" -#include "codegen/barriers.h" +#include "triton/ast/ast.h" +#include "triton/ir/context.h" +#include "triton/ir/module.h" +#include "triton/ir/print.h" +#include "triton/ir/context_impl.h" +#include "triton/codegen/selection.h" +#include "triton/codegen/tune.h" +#include "triton/codegen/shared_copy.h" +#include "triton/codegen/allocation.h" +#include "triton/codegen/liveness.h" +#include "triton/codegen/vectorize.h" +#include "triton/codegen/buffer_info.h" +#include "triton/codegen/barriers.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/Module.h" #include "llvm/IR/LLVMContext.h" @@ -40,33 +40,35 @@ const char src[] = "\ const tunable int32 TM;\ const tunable int32 TN;\ -void test(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K, int32 bound){\ +const tunable int32 TK;\ +\ +void matmul(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K, int32 bound){\ int32 rxa[TM] = get_global_range[TM](0);\ int32 ryb[TN] = get_global_range[TN](1);\ - int32 rka[8] = 0 ... 8;\ - int32 rkb[8] = 0 ... 8;\ + int32 rka[TK] = 0 ... TK;\ + int32 rkb[TK] = 0 ... TK;\ int32 rxc[TM] = get_global_range[TM](0);\ int32 ryc[TN] = get_global_range[TN](1);\ fp32 C[TM, TN] = 0;\ int32 k;\ - fp32* pa[TM, 8] = a + rxa[:, newaxis] + rka[newaxis, :]*M;\ - fp32* pb[TN, 8] = b + ryb[:, newaxis] + rkb[newaxis, :]*K;\ + fp32* pa[TM, TK] = a + rxa[:, newaxis] + rka[newaxis, :]*M;\ + fp32* pb[TN, TK] = b + ryb[:, newaxis] + rkb[newaxis, :]*K;\ fp32* pc[TM, TN] = c + rxc[:, newaxis] + ryc[newaxis, :]*M;\ - fp32 a[TM, 8] = *pa;\ - fp32 b[TN, 8] = *pb;\ + fp32 a[TM, TK] = *pa;\ + fp32 b[TN, TK] = *pb;\ int1 checkc0[TM] = rxc < M;\ int1 checkc1[TN] = ryc < N;\ int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];\ - for(k = K; k > 0; k = k - 8){\ - int1 checka[TM, 8] = (k > 8);\ - int1 checkb[TN, 8] = (k > 8);\ + for(k = K; k > 0; k = k - TK){\ + int1 checka[TM, TK] = (k > 8);\ + int1 checkb[TN, TK] = (k > 8);\ int1 checka0[TM];\ - int1 checka1[8];\ + int1 checka1[TK];\ int1 checkb0[TN];\ - int1 checkb1[8];\ + int1 checkb1[TK];\ C = dot(a, b, C);\ - pa = pa + 8*M;\ - pb = pb + 8*K;\ + pa = pa + TK*M;\ + pb = pb + TK*K;\ @checka a = *pa;\ @checkb b = *pb;\ if(k > 8)\ @@ -171,6 +173,24 @@ void simple_gemm(std::vector &c, const std::vector &a, const std::vector const & ranges, std::function const &)> const & f){ + size_t D = ranges.size(); + std::vector values(D, 0); + // Start with innermost loop + size_t i = D - 1; + while(true){ + //Execute function + f(values); + //Increment counters + while(values[i]++ == ranges[i] - 1){ + if(i == 0) + return; + values[i--] = 0; + } + i = D - 1; + } +} + int main() { // create AST from Triton-C source YY_BUFFER_STATE buffer = yy_scan_string(src); @@ -183,11 +203,9 @@ int main() { tdl::ir::module module("matrix", context); program->codegen(&module); llvm::LLVMContext llvm_context; - llvm::Module llvm_module("test", 
llvm_context); + llvm::Module llvm_module("matmul", llvm_context); + - context.p_impl->mp_constants_[0]->set_value(16); - context.p_impl->mp_constants_[1]->set_value(16); -// context.p_impl->mp_constants_[2]->set_value(8); // create passes tdl::codegen::buffer_info_pass buffer_info; @@ -202,6 +220,8 @@ int main() { // tuning parameters tune.run(module); std::vector params = { + // shapes + 16, 16, 8, // a0 2, 8, 1, // b0 @@ -215,10 +235,15 @@ int main() { // b1 1, 8, 1 }; - std::map> errors; + // meta-parameters unsigned i = 0; + context.p_impl->mp_constants_[0]->set_value(params[0]); + context.p_impl->mp_constants_[1]->set_value(params[1]); + context.p_impl->mp_constants_[2]->set_value(params[2]); for(unsigned *x: tune.get_params(module)) - *x = params[i++]; + *x = params[3 + i++]; + // constraints + std::map> errors; tune.check_constraints(module, errors); std::cout << "errors: " << errors.size() << std::endl; for(auto &x: errors){ @@ -255,7 +280,7 @@ int main() { CUfunction cu_kernel; CUstream cu_stream; int major, minor; - compile_machine_code(cu_device, cu_context, cu_module, cu_kernel, cu_stream, major, minor, src, "test"); + compile_machine_code(cu_device, cu_context, cu_module, cu_kernel, cu_stream, major, minor, src, "matmul"); // execute machine code // Allocate buffers @@ -284,8 +309,8 @@ int main() { void *args[] = { &d_a, &d_b, &d_c, &M, &N, &K, &bound}; int num_regs; cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, cu_kernel); - unsigned TM = 16; - unsigned TN = 16; + unsigned TM = context.p_impl->mp_constants_[0]->get_value(); + unsigned TN = context.p_impl->mp_constants_[1]->get_value(); unsigned nthreads = 32; checkCudaErrors(cuLaunchKernel(cu_kernel, (M + TM - 1)/TM, (N + TN - 1)/TN, 1, nthreads, 1, 1, 0, cu_stream, args, NULL)); checkCudaErrors(cuStreamSynchronize(cu_stream)); diff --git a/include/ast/ast.h b/include/triton/ast/ast.h similarity index 100% rename from include/ast/ast.h rename to include/triton/ast/ast.h diff --git a/include/ast/parser.y b/include/triton/ast/parser.y similarity index 98% rename from include/ast/parser.y rename to include/triton/ast/parser.y index 960ae25a5..3fdcbd60b 100644 --- a/include/ast/parser.y +++ b/include/triton/ast/parser.y @@ -6,7 +6,7 @@ class node; } using namespace tdl::ast; #define YYSTYPE node* -#include "../include/ast/ast.h" +#include "../include/triton/ast/ast.h" extern char* yytext; void yyerror(const char *s); @@ -117,7 +117,7 @@ builtin primary_expression : identifier { $$ = new named_expression($1); } | constant { $$ = $1; } - | constant ELLIPSIS constant { $$ = new constant_range($1, $3); } + | primary_expression ELLIPSIS primary_expression { $$ = new constant_range($1, $3); } | builtin { $$ = $1; } | STRING_LITERAL { $$ = new string_literal(yytext); } | '(' expression ')' { $$ = $2; } diff --git a/include/ast/scanner.l b/include/triton/ast/scanner.l similarity index 100% rename from include/ast/scanner.l rename to include/triton/ast/scanner.l diff --git a/include/codegen/allocation.h b/include/triton/codegen/allocation.h similarity index 100% rename from include/codegen/allocation.h rename to include/triton/codegen/allocation.h diff --git a/include/codegen/barriers.h b/include/triton/codegen/barriers.h similarity index 100% rename from include/codegen/barriers.h rename to include/triton/codegen/barriers.h diff --git a/include/codegen/buffer_info.h b/include/triton/codegen/buffer_info.h similarity index 100% rename from include/codegen/buffer_info.h rename to include/triton/codegen/buffer_info.h diff --git 
a/include/codegen/layout.h b/include/triton/codegen/layout.h similarity index 100% rename from include/codegen/layout.h rename to include/triton/codegen/layout.h diff --git a/include/codegen/liveness.h b/include/triton/codegen/liveness.h similarity index 100% rename from include/codegen/liveness.h rename to include/triton/codegen/liveness.h diff --git a/include/codegen/selection.h b/include/triton/codegen/selection.h similarity index 96% rename from include/codegen/selection.h rename to include/triton/codegen/selection.h index ec733ed57..3f515ee4d 100644 --- a/include/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -3,11 +3,11 @@ #include "llvm/IR/Module.h" #include "llvm/IR/IRBuilder.h" -#include "ir/context.h" -#include "ir/module.h" -#include "ir/function.h" -#include "ir/type.h" -#include "codegen/buffer_info.h" +#include "triton/ir/context.h" +#include "triton/ir/module.h" +#include "triton/ir/function.h" +#include "triton/ir/type.h" +#include "triton/codegen/buffer_info.h" namespace llvm{ diff --git a/include/codegen/shared_copy.h b/include/triton/codegen/shared_copy.h similarity index 100% rename from include/codegen/shared_copy.h rename to include/triton/codegen/shared_copy.h diff --git a/include/codegen/tune.h b/include/triton/codegen/tune.h similarity index 100% rename from include/codegen/tune.h rename to include/triton/codegen/tune.h diff --git a/include/codegen/vectorize.h b/include/triton/codegen/vectorize.h similarity index 100% rename from include/codegen/vectorize.h rename to include/triton/codegen/vectorize.h diff --git a/include/driver/backend.h b/include/triton/driver/backend.h similarity index 100% rename from include/driver/backend.h rename to include/triton/driver/backend.h diff --git a/include/driver/buffer.h b/include/triton/driver/buffer.h similarity index 95% rename from include/driver/buffer.h rename to include/triton/driver/buffer.h index 475cf2273..1d4130cd0 100755 --- a/include/driver/buffer.h +++ b/include/triton/driver/buffer.h @@ -23,8 +23,8 @@ #ifndef TDL_INCLUDE_DRIVER_BUFFER_H #define TDL_INCLUDE_DRIVER_BUFFER_H -#include "driver/handle.h" -#include "driver/context.h" +#include "triton/driver/handle.h" +#include "triton/driver/context.h" namespace tdl { diff --git a/include/driver/context.h b/include/triton/driver/context.h similarity index 96% rename from include/driver/context.h rename to include/triton/driver/context.h index bd98faded..339a25c72 100755 --- a/include/driver/context.h +++ b/include/triton/driver/context.h @@ -23,8 +23,8 @@ #ifndef TDL_INCLUDE_DRIVER_CONTEXT_H #define TDL_INCLUDE_DRIVER_CONTEXT_H -#include "driver/device.h" -#include "driver/handle.h" +#include "triton/driver/device.h" +#include "triton/driver/handle.h" namespace tdl { diff --git a/include/driver/cublas.h b/include/triton/driver/cublas.h similarity index 97% rename from include/driver/cublas.h rename to include/triton/driver/cublas.h index 9e1688a97..175b7f089 100755 --- a/include/driver/cublas.h +++ b/include/triton/driver/cublas.h @@ -24,13 +24,13 @@ #define TDL_INCLUDE_DRIVER_CUBLAS_H #include "isaac/templates/common.hpp" -#include "driver/dispatch.h" -#include "driver/buffer.h" -#include "driver/stream.h" -#include "driver/backend.h" -#include "driver/error.h" -#include "tools/bench.hpp" -#include "tools/collections.hpp" +#include "triton/driver/dispatch.h" +#include "triton/driver/buffer.h" +#include "triton/driver/stream.h" +#include "triton/driver/backend.h" +#include "triton/driver/error.h" +#include "triton/tools/bench.hpp" +#include 
"triton/tools/collections.hpp" namespace tdl { diff --git a/include/driver/device.h b/include/triton/driver/device.h similarity index 97% rename from include/driver/device.h rename to include/triton/driver/device.h index cffaf64b2..3be7ca04f 100755 --- a/include/driver/device.h +++ b/include/triton/driver/device.h @@ -23,8 +23,8 @@ #ifndef TDL_INCLUDE_DRIVER_DEVICE_H #define TDL_INCLUDE_DRIVER_DEVICE_H -#include "driver/platform.h" -#include "driver/handle.h" +#include "triton/driver/platform.h" +#include "triton/driver/handle.h" namespace tdl { diff --git a/include/driver/dispatch.h b/include/triton/driver/dispatch.h similarity index 98% rename from include/driver/dispatch.h rename to include/triton/driver/dispatch.h index 910fdc001..42ce6729f 100755 --- a/include/driver/dispatch.h +++ b/include/triton/driver/dispatch.h @@ -27,11 +27,11 @@ #include //CUDA Backend -#include "external/CUDA/cuda.h" -#include "external/CUDA/nvrtc.h" -#include "external/CUDA/cublas_v2.h" -#include "external/CUDA/cudnn.h" -#include "external/CUDA/nvml.h" +#include "triton/external/CUDA/cuda.h" +#include "triton/external/CUDA/nvrtc.h" +#include "triton/external/CUDA/cublas_v2.h" +#include "triton/external/CUDA/cudnn.h" +#include "triton/external/CUDA/nvml.h" //Exceptions #include diff --git a/include/driver/error.h b/include/triton/driver/error.h similarity index 99% rename from include/driver/error.h rename to include/triton/driver/error.h index d1589aad5..b837dea92 100755 --- a/include/driver/error.h +++ b/include/triton/driver/error.h @@ -24,7 +24,7 @@ #define TDL_INCLUDE_DRIVER_ERROR_H #include -#include "driver/dispatch.h" +#include "triton/driver/dispatch.h" namespace tdl diff --git a/include/driver/event.h b/include/triton/driver/event.h similarity index 97% rename from include/driver/event.h rename to include/triton/driver/event.h index 23f2c557f..79fbbb56f 100755 --- a/include/driver/event.h +++ b/include/triton/driver/event.h @@ -23,7 +23,7 @@ #ifndef TDL_INCLUDE_DRIVER_EVENT_H #define TDL_INCLUDE_DRIVER_EVENT_H -#include "driver/handle.h" +#include "triton/driver/handle.h" namespace tdl { diff --git a/include/driver/handle.h b/include/triton/driver/handle.h similarity index 98% rename from include/driver/handle.h rename to include/triton/driver/handle.h index eb7c90705..cb8463584 100755 --- a/include/driver/handle.h +++ b/include/triton/driver/handle.h @@ -27,7 +27,7 @@ #include #include #include -#include "driver/dispatch.h" +#include "triton/driver/dispatch.h" namespace tdl { diff --git a/include/driver/kernel.h b/include/triton/driver/kernel.h similarity index 96% rename from include/driver/kernel.h rename to include/triton/driver/kernel.h index 60d4dc108..1fbf7935a 100755 --- a/include/driver/kernel.h +++ b/include/triton/driver/kernel.h @@ -23,8 +23,8 @@ #ifndef TDL_INCLUDE_DRIVER_KERNEL_H #define TDL_INCLUDE_DRIVER_KERNEL_H -#include "driver/module.h" -#include "driver/handle.h" +#include "triton/driver/module.h" +#include "triton/driver/handle.h" #include diff --git a/include/driver/module.h b/include/triton/driver/module.h similarity index 94% rename from include/driver/module.h rename to include/triton/driver/module.h index 2a1093233..913e90853 100755 --- a/include/driver/module.h +++ b/include/triton/driver/module.h @@ -24,9 +24,9 @@ #define TDL_INCLUDE_DRIVER_MODULE_H #include -#include "driver/handle.h" -#include "driver/context.h" -#include "driver/buffer.h" +#include "triton/driver/handle.h" +#include "triton/driver/context.h" +#include "triton/driver/buffer.h" namespace tdl { diff --git 
a/include/driver/platform.h b/include/triton/driver/platform.h similarity index 97% rename from include/driver/platform.h rename to include/triton/driver/platform.h index add506e82..d39c48e72 100755 --- a/include/driver/platform.h +++ b/include/triton/driver/platform.h @@ -26,7 +26,7 @@ #include #include -#include "driver/handle.h" +#include "triton/driver/handle.h" namespace tdl { diff --git a/include/driver/stream.h b/include/triton/driver/stream.h similarity index 94% rename from include/driver/stream.h rename to include/triton/driver/stream.h index 5ff59356c..8e5783892 100755 --- a/include/driver/stream.h +++ b/include/triton/driver/stream.h @@ -24,10 +24,10 @@ #define TDL_INCLUDE_DRIVER_STREAM_H #include -#include "driver/context.h" -#include "driver/device.h" -#include "driver/handle.h" -#include "driver/buffer.h" +#include "triton/driver/context.h" +#include "triton/driver/device.h" +#include "triton/driver/handle.h" +#include "triton/driver/buffer.h" namespace tdl { diff --git a/include/external/CUDA/builtin_types.h b/include/triton/external/CUDA/builtin_types.h similarity index 100% rename from include/external/CUDA/builtin_types.h rename to include/triton/external/CUDA/builtin_types.h diff --git a/include/external/CUDA/channel_descriptor.h b/include/triton/external/CUDA/channel_descriptor.h similarity index 100% rename from include/external/CUDA/channel_descriptor.h rename to include/triton/external/CUDA/channel_descriptor.h diff --git a/include/external/CUDA/crt/host_config.h b/include/triton/external/CUDA/crt/host_config.h similarity index 100% rename from include/external/CUDA/crt/host_config.h rename to include/triton/external/CUDA/crt/host_config.h diff --git a/include/external/CUDA/crt/host_defines.h b/include/triton/external/CUDA/crt/host_defines.h similarity index 100% rename from include/external/CUDA/crt/host_defines.h rename to include/triton/external/CUDA/crt/host_defines.h diff --git a/include/external/CUDA/cuComplex.h b/include/triton/external/CUDA/cuComplex.h similarity index 100% rename from include/external/CUDA/cuComplex.h rename to include/triton/external/CUDA/cuComplex.h diff --git a/include/external/CUDA/cublas.h b/include/triton/external/CUDA/cublas.h similarity index 100% rename from include/external/CUDA/cublas.h rename to include/triton/external/CUDA/cublas.h diff --git a/include/external/CUDA/cublas_api.h b/include/triton/external/CUDA/cublas_api.h similarity index 100% rename from include/external/CUDA/cublas_api.h rename to include/triton/external/CUDA/cublas_api.h diff --git a/include/external/CUDA/cublas_v2.h b/include/triton/external/CUDA/cublas_v2.h similarity index 100% rename from include/external/CUDA/cublas_v2.h rename to include/triton/external/CUDA/cublas_v2.h diff --git a/include/external/CUDA/cuda.h b/include/triton/external/CUDA/cuda.h similarity index 100% rename from include/external/CUDA/cuda.h rename to include/triton/external/CUDA/cuda.h diff --git a/include/external/CUDA/cuda_device_runtime_api.h b/include/triton/external/CUDA/cuda_device_runtime_api.h similarity index 100% rename from include/external/CUDA/cuda_device_runtime_api.h rename to include/triton/external/CUDA/cuda_device_runtime_api.h diff --git a/include/external/CUDA/cuda_fp16.h b/include/triton/external/CUDA/cuda_fp16.h similarity index 100% rename from include/external/CUDA/cuda_fp16.h rename to include/triton/external/CUDA/cuda_fp16.h diff --git a/include/external/CUDA/cuda_fp16.hpp b/include/triton/external/CUDA/cuda_fp16.hpp similarity index 100% rename from 
include/external/CUDA/cuda_fp16.hpp rename to include/triton/external/CUDA/cuda_fp16.hpp diff --git a/include/external/CUDA/cuda_runtime.h b/include/triton/external/CUDA/cuda_runtime.h similarity index 100% rename from include/external/CUDA/cuda_runtime.h rename to include/triton/external/CUDA/cuda_runtime.h diff --git a/include/external/CUDA/cuda_runtime_api.h b/include/triton/external/CUDA/cuda_runtime_api.h similarity index 100% rename from include/external/CUDA/cuda_runtime_api.h rename to include/triton/external/CUDA/cuda_runtime_api.h diff --git a/include/external/CUDA/cudnn.h b/include/triton/external/CUDA/cudnn.h similarity index 100% rename from include/external/CUDA/cudnn.h rename to include/triton/external/CUDA/cudnn.h diff --git a/include/external/CUDA/cusparse.h b/include/triton/external/CUDA/cusparse.h similarity index 100% rename from include/external/CUDA/cusparse.h rename to include/triton/external/CUDA/cusparse.h diff --git a/include/external/CUDA/device_types.h b/include/triton/external/CUDA/device_types.h similarity index 100% rename from include/external/CUDA/device_types.h rename to include/triton/external/CUDA/device_types.h diff --git a/include/external/CUDA/driver_functions.h b/include/triton/external/CUDA/driver_functions.h similarity index 100% rename from include/external/CUDA/driver_functions.h rename to include/triton/external/CUDA/driver_functions.h diff --git a/include/external/CUDA/driver_types.h b/include/triton/external/CUDA/driver_types.h similarity index 100% rename from include/external/CUDA/driver_types.h rename to include/triton/external/CUDA/driver_types.h diff --git a/include/external/CUDA/host_config.h b/include/triton/external/CUDA/host_config.h similarity index 100% rename from include/external/CUDA/host_config.h rename to include/triton/external/CUDA/host_config.h diff --git a/include/external/CUDA/host_defines.h b/include/triton/external/CUDA/host_defines.h similarity index 100% rename from include/external/CUDA/host_defines.h rename to include/triton/external/CUDA/host_defines.h diff --git a/include/external/CUDA/library_types.h b/include/triton/external/CUDA/library_types.h similarity index 100% rename from include/external/CUDA/library_types.h rename to include/triton/external/CUDA/library_types.h diff --git a/include/external/CUDA/nvml.h b/include/triton/external/CUDA/nvml.h similarity index 100% rename from include/external/CUDA/nvml.h rename to include/triton/external/CUDA/nvml.h diff --git a/include/external/CUDA/nvrtc.h b/include/triton/external/CUDA/nvrtc.h similarity index 100% rename from include/external/CUDA/nvrtc.h rename to include/triton/external/CUDA/nvrtc.h diff --git a/include/external/CUDA/surface_types.h b/include/triton/external/CUDA/surface_types.h similarity index 100% rename from include/external/CUDA/surface_types.h rename to include/triton/external/CUDA/surface_types.h diff --git a/include/external/CUDA/texture_types.h b/include/triton/external/CUDA/texture_types.h similarity index 100% rename from include/external/CUDA/texture_types.h rename to include/triton/external/CUDA/texture_types.h diff --git a/include/external/CUDA/vector_functions.h b/include/triton/external/CUDA/vector_functions.h similarity index 100% rename from include/external/CUDA/vector_functions.h rename to include/triton/external/CUDA/vector_functions.h diff --git a/include/external/CUDA/vector_functions.hpp b/include/triton/external/CUDA/vector_functions.hpp similarity index 100% rename from include/external/CUDA/vector_functions.hpp rename to 
include/triton/external/CUDA/vector_functions.hpp diff --git a/include/external/CUDA/vector_types.h b/include/triton/external/CUDA/vector_types.h similarity index 100% rename from include/external/CUDA/vector_types.h rename to include/triton/external/CUDA/vector_types.h diff --git a/include/ir/basic_block.h b/include/triton/ir/basic_block.h similarity index 100% rename from include/ir/basic_block.h rename to include/triton/ir/basic_block.h diff --git a/include/ir/builder.h b/include/triton/ir/builder.h similarity index 100% rename from include/ir/builder.h rename to include/triton/ir/builder.h diff --git a/include/ir/constant.h b/include/triton/ir/constant.h similarity index 100% rename from include/ir/constant.h rename to include/triton/ir/constant.h diff --git a/include/ir/context.h b/include/triton/ir/context.h similarity index 90% rename from include/ir/context.h rename to include/triton/ir/context.h index c7382a0cb..d3018aa6f 100644 --- a/include/ir/context.h +++ b/include/triton/ir/context.h @@ -2,7 +2,7 @@ #define TDL_INCLUDE_IR_CONTEXT_H #include -#include "ir/type.h" +#include "triton/ir/type.h" namespace tdl{ namespace ir{ diff --git a/include/ir/context_impl.h b/include/triton/ir/context_impl.h similarity index 97% rename from include/ir/context_impl.h rename to include/triton/ir/context_impl.h index b9017b39c..623f58e40 100644 --- a/include/ir/context_impl.h +++ b/include/triton/ir/context_impl.h @@ -3,7 +3,7 @@ #include #include -#include "ir/type.h" +#include "triton/ir/type.h" namespace tdl{ namespace ir{ diff --git a/include/ir/function.h b/include/triton/ir/function.h similarity index 100% rename from include/ir/function.h rename to include/triton/ir/function.h diff --git a/include/ir/instructions.h b/include/triton/ir/instructions.h similarity index 99% rename from include/ir/instructions.h rename to include/triton/ir/instructions.h index ae752f78e..93dc0b78e 100644 --- a/include/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -3,7 +3,7 @@ #include #include "value.h" -#include "ir/type.h" +#include "triton/ir/type.h" #include "llvm/IR/Instructions.h" namespace tdl{ diff --git a/include/ir/module.h b/include/triton/ir/module.h similarity index 100% rename from include/ir/module.h rename to include/triton/ir/module.h diff --git a/include/triton/ir/print.h b/include/triton/ir/print.h new file mode 100644 index 000000000..a31929282 --- /dev/null +++ b/include/triton/ir/print.h @@ -0,0 +1,17 @@ +#ifndef TDL_INCLUDE_IR_PRINT_H +#define TDL_INCLUDE_IR_PRINT_H + + +#include "builder.h" + +namespace tdl{ +namespace ir{ + +class module; + +void print(module &mod, std::ostream& os); + +} +} + +#endif diff --git a/include/ir/type.h b/include/triton/ir/type.h similarity index 100% rename from include/ir/type.h rename to include/triton/ir/type.h diff --git a/include/ir/value.h b/include/triton/ir/value.h similarity index 100% rename from include/ir/value.h rename to include/triton/ir/value.h diff --git a/include/tools/sys/getenv.hpp b/include/triton/tools/sys/getenv.hpp similarity index 100% rename from include/tools/sys/getenv.hpp rename to include/triton/tools/sys/getenv.hpp diff --git a/include/tools/sys/mkdir.hpp b/include/triton/tools/sys/mkdir.hpp similarity index 100% rename from include/tools/sys/mkdir.hpp rename to include/triton/tools/sys/mkdir.hpp diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 714d8b16f..4d1fa5048 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -1,12 +1,12 @@ #include #include -#include "ast/ast.h" -#include 
"ir/constant.h" -#include "ir/function.h" -#include "ir/module.h" -#include "ir/basic_block.h" -#include "ir/builder.h" -#include "ir/type.h" +#include "triton/ast/ast.h" +#include "triton/ir/constant.h" +#include "triton/ir/function.h" +#include "triton/ir/module.h" +#include "triton/ir/basic_block.h" +#include "triton/ir/builder.h" +#include "triton/ir/type.h" #include diff --git a/lib/codegen/allocation.cpp b/lib/codegen/allocation.cpp index 696b46cb9..d396ec790 100644 --- a/lib/codegen/allocation.cpp +++ b/lib/codegen/allocation.cpp @@ -1,12 +1,12 @@ -#include "codegen/allocation.h" -#include "codegen/liveness.h" -#include "codegen/layout.h" -#include "codegen/buffer_info.h" -#include "ir/basic_block.h" -#include "ir/type.h" -#include "ir/value.h" -#include "ir/function.h" -#include "ir/instructions.h" +#include "triton/codegen/allocation.h" +#include "triton/codegen/liveness.h" +#include "triton/codegen/layout.h" +#include "triton/codegen/buffer_info.h" +#include "triton/ir/basic_block.h" +#include "triton/ir/type.h" +#include "triton/ir/value.h" +#include "triton/ir/function.h" +#include "triton/ir/instructions.h" namespace tdl{ namespace codegen{ diff --git a/lib/codegen/barriers.cpp b/lib/codegen/barriers.cpp index df017931b..c40c08186 100644 --- a/lib/codegen/barriers.cpp +++ b/lib/codegen/barriers.cpp @@ -1,11 +1,11 @@ #include -#include "codegen/barriers.h" -#include "codegen/allocation.h" -#include "codegen/buffer_info.h" -#include "ir/module.h" -#include "ir/function.h" -#include "ir/basic_block.h" -#include "ir/instructions.h" +#include "triton/codegen/barriers.h" +#include "triton/codegen/allocation.h" +#include "triton/codegen/buffer_info.h" +#include "triton/ir/module.h" +#include "triton/ir/function.h" +#include "triton/ir/basic_block.h" +#include "triton/ir/instructions.h" namespace tdl { diff --git a/lib/codegen/buffer_info.cpp b/lib/codegen/buffer_info.cpp index 4d20fa6e9..c8fe08df6 100644 --- a/lib/codegen/buffer_info.cpp +++ b/lib/codegen/buffer_info.cpp @@ -1,9 +1,9 @@ -#include "codegen/buffer_info.h" -#include "ir/module.h" -#include "ir/function.h" -#include "ir/basic_block.h" -#include "ir/instructions.h" -#include "ir/type.h" +#include "triton/codegen/buffer_info.h" +#include "triton/ir/module.h" +#include "triton/ir/function.h" +#include "triton/ir/basic_block.h" +#include "triton/ir/instructions.h" +#include "triton/ir/type.h" namespace tdl { diff --git a/lib/codegen/layout.cpp b/lib/codegen/layout.cpp index 58f81227a..040954caa 100644 --- a/lib/codegen/layout.cpp +++ b/lib/codegen/layout.cpp @@ -1,8 +1,8 @@ -#include "codegen/layout.h" -#include "ir/function.h" -#include "ir/module.h" -#include "ir/basic_block.h" -#include "ir/instructions.h" +#include "triton/codegen/layout.h" +#include "triton/ir/function.h" +#include "triton/ir/module.h" +#include "triton/ir/basic_block.h" +#include "triton/ir/instructions.h" namespace tdl{ namespace codegen{ diff --git a/lib/codegen/liveness.cpp b/lib/codegen/liveness.cpp index 05b803f8f..f5e5a79c6 100644 --- a/lib/codegen/liveness.cpp +++ b/lib/codegen/liveness.cpp @@ -1,10 +1,10 @@ -#include "codegen/liveness.h" -#include "codegen/buffer_info.h" -#include "ir/basic_block.h" -#include "ir/function.h" -#include "ir/module.h" -#include "ir/instructions.h" -#include "ir/value.h" +#include "triton/codegen/liveness.h" +#include "triton/codegen/buffer_info.h" +#include "triton/ir/basic_block.h" +#include "triton/ir/function.h" +#include "triton/ir/module.h" +#include "triton/ir/instructions.h" +#include "triton/ir/value.h" 
namespace tdl{ namespace codegen{ diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index a028134fa..4cb2cf827 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -1,13 +1,13 @@ -#include "codegen/selection.h" -#include "codegen/tune.h" -#include "codegen/allocation.h" +#include "triton/codegen/selection.h" +#include "triton/codegen/tune.h" +#include "triton/codegen/allocation.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Module.h" #include "llvm/IR/IRBuilder.h" -#include "ir/context.h" -#include "ir/module.h" -#include "ir/function.h" -#include "ir/type.h" +#include "triton/ir/context.h" +#include "triton/ir/module.h" +#include "triton/ir/function.h" +#include "triton/ir/type.h" #include "llvm/Transforms/Scalar/EarlyCSE.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" diff --git a/lib/codegen/shared_copy.cpp b/lib/codegen/shared_copy.cpp index 60c31199f..b7276bbfe 100644 --- a/lib/codegen/shared_copy.cpp +++ b/lib/codegen/shared_copy.cpp @@ -1,10 +1,10 @@ #include -#include "codegen/shared_copy.h" -#include "codegen/buffer_info.h" -#include "ir/module.h" -#include "ir/function.h" -#include "ir/basic_block.h" -#include "ir/instructions.h" +#include "triton/codegen/shared_copy.h" +#include "triton/codegen/buffer_info.h" +#include "triton/ir/module.h" +#include "triton/ir/function.h" +#include "triton/ir/basic_block.h" +#include "triton/ir/instructions.h" namespace tdl { diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 3dc5c4e87..302676697 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -1,9 +1,9 @@ -#include "codegen/tune.h" -#include "ir/instructions.h" -#include "ir/type.h" -#include "ir/module.h" -#include "ir/function.h" -#include "ir/context_impl.h" +#include "triton/codegen/tune.h" +#include "triton/ir/instructions.h" +#include "triton/ir/type.h" +#include "triton/ir/module.h" +#include "triton/ir/function.h" +#include "triton/ir/context_impl.h" #include diff --git a/lib/codegen/vectorize.cpp b/lib/codegen/vectorize.cpp index 41a1afd10..4c45928a2 100644 --- a/lib/codegen/vectorize.cpp +++ b/lib/codegen/vectorize.cpp @@ -1,9 +1,9 @@ -#include "codegen/vectorize.h" -#include "codegen/tune.h" -#include "ir/module.h" -#include "ir/function.h" -#include "ir/basic_block.h" -#include "ir/instructions.h" +#include "triton/codegen/vectorize.h" +#include "triton/codegen/tune.h" +#include "triton/ir/module.h" +#include "triton/ir/function.h" +#include "triton/ir/basic_block.h" +#include "triton/ir/instructions.h" namespace tdl { diff --git a/lib/driver/backend.cpp b/lib/driver/backend.cpp index bddb419df..cdf72027d 100755 --- a/lib/driver/backend.cpp +++ b/lib/driver/backend.cpp @@ -20,12 +20,12 @@ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#include "driver/dispatch.h" -#include "driver/backend.h" -#include "driver/buffer.h" -#include "driver/context.h" -#include "driver/stream.h" -#include "driver/kernel.h" +#include "triton/driver/dispatch.h" +#include "triton/driver/backend.h" +#include "triton/driver/buffer.h" +#include "triton/driver/context.h" +#include "triton/driver/stream.h" +#include "triton/driver/kernel.h" #include #include diff --git a/lib/driver/buffer.cpp b/lib/driver/buffer.cpp index aa770a05d..069e8abe1 100755 --- a/lib/driver/buffer.cpp +++ b/lib/driver/buffer.cpp @@ -21,10 +21,10 @@ */ #include -#include "driver/stream.h" -#include "driver/buffer.h" -#include "driver/context.h" -#include "driver/dispatch.h" +#include "triton/driver/stream.h" +#include "triton/driver/buffer.h" +#include "triton/driver/context.h" +#include "triton/driver/dispatch.h" namespace tdl diff --git a/lib/driver/context.cpp b/lib/driver/context.cpp index 9da2c6978..8e365db81 100755 --- a/lib/driver/context.cpp +++ b/lib/driver/context.cpp @@ -23,11 +23,11 @@ #include #include -#include "driver/context.h" -#include "driver/module.h" +#include "triton/driver/context.h" +#include "triton/driver/module.h" -#include "tools/sys/getenv.hpp" -#include "tools/sys/mkdir.hpp" +#include "triton/tools/sys/getenv.hpp" +#include "triton/tools/sys/mkdir.hpp" namespace tdl { diff --git a/lib/driver/device.cpp b/lib/driver/device.cpp index 13f10f6a0..22f640d7b 100755 --- a/lib/driver/device.cpp +++ b/lib/driver/device.cpp @@ -26,7 +26,7 @@ #include #include -#include "driver/device.h" +#include "triton/driver/device.h" namespace tdl { diff --git a/lib/driver/dispatch.cpp b/lib/driver/dispatch.cpp index 2d0cd5232..d7d19727d 100755 --- a/lib/driver/dispatch.cpp +++ b/lib/driver/dispatch.cpp @@ -21,8 +21,8 @@ */ #include -#include "driver/dispatch.h" -#include "driver/context.h" +#include "triton/driver/dispatch.h" +#include "triton/driver/context.h" namespace tdl { diff --git a/lib/driver/error.cpp b/lib/driver/error.cpp index 7e7dc9d75..9759a1323 100755 --- a/lib/driver/error.cpp +++ b/lib/driver/error.cpp @@ -20,7 +20,7 @@ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "driver/error.h" +#include "triton/driver/error.h" namespace tdl { diff --git a/lib/driver/event.cpp b/lib/driver/event.cpp index dc554d808..b8841dc0f 100755 --- a/lib/driver/event.cpp +++ b/lib/driver/event.cpp @@ -20,7 +20,7 @@ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#include "driver/event.h" +#include "triton/driver/event.h" namespace tdl { diff --git a/lib/driver/handle.cpp b/lib/driver/handle.cpp index a01a099bd..cd7ee4195 100755 --- a/lib/driver/handle.cpp +++ b/lib/driver/handle.cpp @@ -22,7 +22,7 @@ #include #include -#include "driver/handle.h" +#include "triton/driver/handle.h" namespace tdl { diff --git a/lib/driver/kernel.cpp b/lib/driver/kernel.cpp index 6e536b767..180e46cc7 100755 --- a/lib/driver/kernel.cpp +++ b/lib/driver/kernel.cpp @@ -23,8 +23,8 @@ #include #include -#include "driver/kernel.h" -#include "driver/buffer.h" +#include "triton/driver/kernel.h" +#include "triton/driver/buffer.h" namespace tdl { diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index c61482cbc..d897489fe 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -23,11 +23,11 @@ #include #include -#include "driver/module.h" -#include "driver/context.h" -#include "driver/error.h" +#include "triton/driver/module.h" +#include "triton/driver/context.h" +#include "triton/driver/error.h" -#include "tools/sys/getenv.hpp" +#include "triton/tools/sys/getenv.hpp" namespace tdl { diff --git a/lib/driver/platform.cpp b/lib/driver/platform.cpp index 2fa9933d3..ada13de41 100755 --- a/lib/driver/platform.cpp +++ b/lib/driver/platform.cpp @@ -21,8 +21,8 @@ */ -#include "driver/platform.h" -#include "driver/device.h" +#include "triton/driver/platform.h" +#include "triton/driver/device.h" #include diff --git a/lib/driver/stream.cpp b/lib/driver/stream.cpp index 0296eba40..39996d473 100755 --- a/lib/driver/stream.cpp +++ b/lib/driver/stream.cpp @@ -24,13 +24,13 @@ #include #include -#include "driver/backend.h" -#include "driver/stream.h" -#include "driver/context.h" -#include "driver/device.h" -#include "driver/event.h" -#include "driver/kernel.h" -#include "driver/buffer.h" +#include "triton/driver/backend.h" +#include "triton/driver/stream.h" +#include "triton/driver/context.h" +#include "triton/driver/device.h" +#include "triton/driver/event.h" +#include "triton/driver/kernel.h" +#include "triton/driver/buffer.h" namespace tdl { diff --git a/lib/ir/basic_block.cpp b/lib/ir/basic_block.cpp index c7d8493e8..15fa3188c 100644 --- a/lib/ir/basic_block.cpp +++ b/lib/ir/basic_block.cpp @@ -1,7 +1,7 @@ -#include "ir/basic_block.h" -#include "ir/instructions.h" -#include "ir/type.h" -#include "ir/function.h" +#include "triton/ir/basic_block.h" +#include "triton/ir/instructions.h" +#include "triton/ir/type.h" +#include "triton/ir/function.h" namespace tdl { namespace ir { diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index c2b338236..467a7ef71 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -1,9 +1,9 @@ #include -#include "ir/basic_block.h" -#include "ir/builder.h" -#include "ir/constant.h" -#include "ir/instructions.h" -#include "ir/type.h" +#include "triton/ir/basic_block.h" +#include "triton/ir/builder.h" +#include "triton/ir/constant.h" +#include "triton/ir/instructions.h" +#include "triton/ir/type.h" #include "llvm/IR/Instruction.h" namespace tdl{ diff --git a/lib/ir/constant.cpp b/lib/ir/constant.cpp index 87f669e4d..5a6cfbc7a 100644 --- a/lib/ir/constant.cpp +++ b/lib/ir/constant.cpp @@ -1,8 +1,8 @@ #include -#include "ir/constant.h" -#include "ir/type.h" -#include "ir/context.h" -#include "ir/context_impl.h" +#include "triton/ir/constant.h" +#include "triton/ir/type.h" +#include "triton/ir/context.h" +#include "triton/ir/context_impl.h" namespace tdl{ namespace ir{ diff --git a/lib/ir/context.cpp b/lib/ir/context.cpp index 
a3fd665e1..33a852bda 100644
--- a/lib/ir/context.cpp
+++ b/lib/ir/context.cpp
@@ -1,6 +1,6 @@
-#include "ir/context_impl.h"
-#include "ir/context.h"
-#include "ir/type.h"
+#include "triton/ir/context_impl.h"
+#include "triton/ir/context.h"
+#include "triton/ir/type.h"
 
 namespace tdl{
 namespace ir{
diff --git a/lib/ir/function.cpp b/lib/ir/function.cpp
index 6d3329f21..75a465e8f 100644
--- a/lib/ir/function.cpp
+++ b/lib/ir/function.cpp
@@ -1,6 +1,6 @@
-#include "ir/function.h"
-#include "ir/type.h"
-#include "ir/module.h"
+#include "triton/ir/function.h"
+#include "triton/ir/type.h"
+#include "triton/ir/module.h"
 
 namespace tdl{
 namespace ir{
diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp
index 38adcc377..93ba4602a 100644
--- a/lib/ir/instructions.cpp
+++ b/lib/ir/instructions.cpp
@@ -1,8 +1,8 @@
-#include "ir/context.h"
-#include "ir/basic_block.h"
-#include "ir/instructions.h"
-#include "ir/constant.h"
-#include "ir/type.h"
+#include "triton/ir/context.h"
+#include "triton/ir/basic_block.h"
+#include "triton/ir/instructions.h"
+#include "triton/ir/constant.h"
+#include "triton/ir/type.h"
 
 namespace tdl{
 namespace ir{
diff --git a/lib/ir/module.cpp b/lib/ir/module.cpp
index 66a4784cc..28825fe4b 100644
--- a/lib/ir/module.cpp
+++ b/lib/ir/module.cpp
@@ -1,8 +1,8 @@
-#include "ir/basic_block.h"
-#include "ir/module.h"
-#include "ir/type.h"
-#include "ir/constant.h"
-#include "ir/function.h"
+#include "triton/ir/basic_block.h"
+#include "triton/ir/module.h"
+#include "triton/ir/type.h"
+#include "triton/ir/constant.h"
+#include "triton/ir/function.h"
 
 namespace tdl{
 namespace ir{
diff --git a/lib/ir/print.cpp b/lib/ir/print.cpp
new file mode 100644
index 000000000..34829aec0
--- /dev/null
+++ b/lib/ir/print.cpp
@@ -0,0 +1,59 @@
+#include "triton/ir/basic_block.h"
+#include "triton/ir/module.h"
+#include "triton/ir/type.h"
+#include "triton/ir/constant.h"
+#include "triton/ir/function.h"
+#include "triton/ir/instructions.h"
+#include "triton/ir/print.h"
+
+namespace tdl{
+namespace ir{
+
+std::string get_name(ir::value *v, unsigned i) {
+  if(v->get_name().empty()){
+    std::string name = "%" + std::to_string(i);
+    v->set_name(name);
+  }
+  return v->get_name();
+}
+
+
+void print(module &mod, std::ostream& os) {
+  unsigned cnt = 0;
+  for(ir::function *fn: mod.get_function_list()){
+    os << "{" << std::endl;
+    for(ir::basic_block *block: fn->blocks()){
+      // fetch the predecessor list once; it is reused below
+      auto const &predecessors = block->get_predecessors();
+      os << block->get_name() << ":";
+      if(!predecessors.empty()){
+        os << " ";
+        os << "; preds = ";
+        for(ir::basic_block *pred: predecessors)
+          os << pred->get_name() << (pred!=predecessors.back()?", ":"");
+      }
+      os << std::endl;
+      for(ir::instruction *inst: block->get_inst_list()){
+        os << " ";
+        if(ir::value *pred = inst->get_mask_pred())
+          os << "@" << get_name(pred, cnt++) << " ";
+        if(!inst->get_type()->is_void_ty())
+          os << get_name(inst, cnt++) << " = ";
+        os << inst->repr();
+        ir::instruction::ops_t ops = inst->ops();
+        size_t num_ops = inst->get_num_operands();
+        if(num_ops > 0)
+          os << " ";
+        for(unsigned i = 0; i < num_ops; i++)
+          os << get_name(ops[i], cnt++) << (i < num_ops - 1?", ":"");
+        os << ";" << std::endl;
+      }
+      os << std::endl;
+    }
+    os << "}" << std::endl;
+  }
+}
+
+
+}
+}
diff --git a/lib/ir/type.cpp b/lib/ir/type.cpp
index 5aebd94a5..10a8d582e 100644
--- a/lib/ir/type.cpp
+++ b/lib/ir/type.cpp
@@ -1,9 +1,9 @@
 #include 
-#include "ir/type.h"
-#include "ir/context.h"
-#include "ir/context_impl.h"
-#include 
"ir/value.h" -#include "ir/constant.h" +#include "triton/ir/type.h" +#include "triton/ir/context.h" +#include "triton/ir/context_impl.h" +#include "triton/ir/value.h" +#include "triton/ir/constant.h" namespace tdl{ namespace ir{ diff --git a/lib/ir/value.cpp b/lib/ir/value.cpp index 2d43f8e9d..3827af220 100644 --- a/lib/ir/value.cpp +++ b/lib/ir/value.cpp @@ -1,5 +1,5 @@ -#include "ir/value.h" -#include "ir/instructions.h" +#include "triton/ir/value.h" +#include "triton/ir/instructions.h" #include #include From daa828ec18baf3e8a89d2cb8b63257e519eb75da Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 24 Feb 2019 14:22:44 -0500 Subject: [PATCH 086/494] [general] rename namespace tdl -> triton --- examples/matrix.cpp | 26 +++++++++++++------------- include/triton/ast/ast.h | 2 +- include/triton/ast/parser.y | 8 ++++---- include/triton/codegen/allocation.h | 2 +- include/triton/codegen/barriers.h | 2 +- include/triton/codegen/buffer_info.h | 2 +- include/triton/codegen/layout.h | 2 +- include/triton/codegen/liveness.h | 2 +- include/triton/codegen/selection.h | 2 +- include/triton/codegen/shared_copy.h | 2 +- include/triton/codegen/tune.h | 2 +- include/triton/codegen/vectorize.h | 2 +- include/triton/driver/backend.h | 2 +- include/triton/driver/buffer.h | 2 +- include/triton/driver/context.h | 2 +- include/triton/driver/cublas.h | 2 +- include/triton/driver/device.h | 2 +- include/triton/driver/dispatch.h | 2 +- include/triton/driver/error.h | 2 +- include/triton/driver/event.h | 2 +- include/triton/driver/handle.h | 2 +- include/triton/driver/kernel.h | 2 +- include/triton/driver/module.h | 2 +- include/triton/driver/platform.h | 2 +- include/triton/driver/stream.h | 2 +- include/triton/ir/basic_block.h | 2 +- include/triton/ir/builder.h | 2 +- include/triton/ir/constant.h | 2 +- include/triton/ir/context.h | 2 +- include/triton/ir/context_impl.h | 2 +- include/triton/ir/function.h | 2 +- include/triton/ir/instructions.h | 2 +- include/triton/ir/module.h | 2 +- include/triton/ir/print.h | 2 +- include/triton/ir/type.h | 2 +- include/triton/ir/value.h | 2 +- include/triton/tools/sys/getenv.hpp | 2 +- include/triton/tools/sys/mkdir.hpp | 2 +- lib/ast/lowering.cpp | 2 +- lib/codegen/allocation.cpp | 2 +- lib/codegen/barriers.cpp | 2 +- lib/codegen/buffer_info.cpp | 2 +- lib/codegen/layout.cpp | 2 +- lib/codegen/liveness.cpp | 2 +- lib/codegen/selection.cpp | 2 +- lib/codegen/shared_copy.cpp | 2 +- lib/codegen/tune.cpp | 2 +- lib/codegen/vectorize.cpp | 2 +- lib/driver/backend.cpp | 2 +- lib/driver/buffer.cpp | 2 +- lib/driver/context.cpp | 2 +- lib/driver/device.cpp | 2 +- lib/driver/dispatch.cpp | 2 +- lib/driver/error.cpp | 2 +- lib/driver/event.cpp | 2 +- lib/driver/handle.cpp | 2 +- lib/driver/kernel.cpp | 2 +- lib/driver/module.cpp | 2 +- lib/driver/platform.cpp | 2 +- lib/driver/stream.cpp | 2 +- lib/ir/basic_block.cpp | 2 +- lib/ir/builder.cpp | 2 +- lib/ir/constant.cpp | 2 +- lib/ir/context.cpp | 2 +- lib/ir/function.cpp | 2 +- lib/ir/instructions.cpp | 2 +- lib/ir/module.cpp | 2 +- lib/ir/print.cpp | 2 +- lib/ir/type.cpp | 2 +- lib/ir/value.cpp | 2 +- 70 files changed, 85 insertions(+), 85 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index e0fd34646..382f8a324 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -33,7 +33,7 @@ typedef struct yy_buffer_state * YY_BUFFER_STATE; extern int yyparse(); extern YY_BUFFER_STATE yy_scan_string(const char * str); extern void yy_delete_buffer(YY_BUFFER_STATE buffer); -using tdl::ast::translation_unit; 
+using triton::ast::translation_unit; extern translation_unit *ast_root; const char src[] = @@ -199,8 +199,8 @@ int main() { translation_unit *program = ast_root; // create Triton-IR from AST - tdl::ir::context context; - tdl::ir::module module("matrix", context); + triton::ir::context context; + triton::ir::module module("matrix", context); program->codegen(&module); llvm::LLVMContext llvm_context; llvm::Module llvm_module("matmul", llvm_context); @@ -208,14 +208,14 @@ int main() { // create passes - tdl::codegen::buffer_info_pass buffer_info; - tdl::codegen::place_shared_copy shared(&buffer_info); - tdl::codegen::tune tune; - tdl::codegen::liveness liveness(&buffer_info); - tdl::codegen::allocation allocation(&liveness, &buffer_info); - tdl::codegen::barriers barriers(&allocation, &buffer_info); - tdl::codegen::vectorize vectorize(&tune); - tdl::codegen::selection selection(&allocation, &tune, &buffer_info); + triton::codegen::buffer_info_pass buffer_info; + triton::codegen::place_shared_copy shared(&buffer_info); + triton::codegen::tune tune; + triton::codegen::liveness liveness(&buffer_info); + triton::codegen::allocation allocation(&liveness, &buffer_info); + triton::codegen::barriers barriers(&allocation, &buffer_info); + triton::codegen::vectorize vectorize(&tune); + triton::codegen::selection selection(&allocation, &tune, &buffer_info); // tuning parameters tune.run(module); @@ -243,7 +243,7 @@ int main() { for(unsigned *x: tune.get_params(module)) *x = params[3 + i++]; // constraints - std::map> errors; + std::map> errors; tune.check_constraints(module, errors); std::cout << "errors: " << errors.size() << std::endl; for(auto &x: errors){ @@ -255,7 +255,7 @@ int main() { // run passes - tdl::ir::print(module, std::cout); + triton::ir::print(module, std::cout); buffer_info.run(module); shared.run(module); liveness.run(module); diff --git a/include/triton/ast/ast.h b/include/triton/ast/ast.h index 32077f1a2..b2d07ddd2 100644 --- a/include/triton/ast/ast.h +++ b/include/triton/ast/ast.h @@ -8,7 +8,7 @@ #include -namespace tdl{ +namespace triton{ namespace ir{ diff --git a/include/triton/ast/parser.y b/include/triton/ast/parser.y index 3fdcbd60b..cf3c011a2 100644 --- a/include/triton/ast/parser.y +++ b/include/triton/ast/parser.y @@ -1,10 +1,10 @@ %{ -namespace tdl{ +namespace triton{ namespace ast{ class node; } } -using namespace tdl::ast; +using namespace triton::ast; #define YYSTYPE node* #include "../include/triton/ast/ast.h" @@ -129,8 +129,8 @@ primary_expression_list ; slice - : ':' { $$ = new slice(tdl::ast::ALL); } - | NEWAXIS { $$ = new slice(tdl::ast::NEWAXIS); } + : ':' { $$ = new slice(triton::ast::ALL); } + | NEWAXIS { $$ = new slice(triton::ast::NEWAXIS); } slice_list : slice { $$ = new list((slice*)$1); } diff --git a/include/triton/codegen/allocation.h b/include/triton/codegen/allocation.h index ad58ccea7..1f2a7656c 100644 --- a/include/triton/codegen/allocation.h +++ b/include/triton/codegen/allocation.h @@ -5,7 +5,7 @@ #include #include -namespace tdl{ +namespace triton{ namespace ir{ class value; diff --git a/include/triton/codegen/barriers.h b/include/triton/codegen/barriers.h index 5199f94ad..546b36893 100644 --- a/include/triton/codegen/barriers.h +++ b/include/triton/codegen/barriers.h @@ -5,7 +5,7 @@ #include #include -namespace tdl { +namespace triton { namespace ir { class module; diff --git a/include/triton/codegen/buffer_info.h b/include/triton/codegen/buffer_info.h index 0d22608c2..c9b954a58 100644 --- a/include/triton/codegen/buffer_info.h +++ 
b/include/triton/codegen/buffer_info.h @@ -4,7 +4,7 @@ #include #include -namespace tdl { +namespace triton { namespace ir { class module; diff --git a/include/triton/codegen/layout.h b/include/triton/codegen/layout.h index d63a5dbe2..a18f6439f 100644 --- a/include/triton/codegen/layout.h +++ b/include/triton/codegen/layout.h @@ -4,7 +4,7 @@ #include #include -namespace tdl { +namespace triton { namespace ir { class module; diff --git a/include/triton/codegen/liveness.h b/include/triton/codegen/liveness.h index fd4faf2f3..010bb4e2a 100644 --- a/include/triton/codegen/liveness.h +++ b/include/triton/codegen/liveness.h @@ -3,7 +3,7 @@ #include -namespace tdl{ +namespace triton{ namespace ir{ class value; diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index 3f515ee4d..13bc3b6b6 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -18,7 +18,7 @@ namespace llvm{ class LLVMContext; } -namespace tdl{ +namespace triton{ namespace codegen{ class allocation; diff --git a/include/triton/codegen/shared_copy.h b/include/triton/codegen/shared_copy.h index be043b18c..3a3d7363b 100644 --- a/include/triton/codegen/shared_copy.h +++ b/include/triton/codegen/shared_copy.h @@ -4,7 +4,7 @@ #include #include -namespace tdl { +namespace triton { namespace ir { class module; diff --git a/include/triton/codegen/tune.h b/include/triton/codegen/tune.h index dfa1fcc97..cb1d5b509 100644 --- a/include/triton/codegen/tune.h +++ b/include/triton/codegen/tune.h @@ -5,7 +5,7 @@ #include #include -namespace tdl{ +namespace triton{ namespace ir{ class value; diff --git a/include/triton/codegen/vectorize.h b/include/triton/codegen/vectorize.h index c9c28a79c..fe6df9dcf 100644 --- a/include/triton/codegen/vectorize.h +++ b/include/triton/codegen/vectorize.h @@ -1,7 +1,7 @@ #ifndef TDL_INCLUDE_CODEGEN_VECTORIZE_H #define TDL_INCLUDE_CODEGEN_VECTORIZE_H -namespace tdl { +namespace triton { namespace ir { class module; diff --git a/include/triton/driver/backend.h b/include/triton/driver/backend.h index f71e2b424..0af719c29 100755 --- a/include/triton/driver/backend.h +++ b/include/triton/driver/backend.h @@ -28,7 +28,7 @@ #include -namespace tdl +namespace triton { namespace driver { diff --git a/include/triton/driver/buffer.h b/include/triton/driver/buffer.h index 1d4130cd0..21603f9c4 100755 --- a/include/triton/driver/buffer.h +++ b/include/triton/driver/buffer.h @@ -26,7 +26,7 @@ #include "triton/driver/handle.h" #include "triton/driver/context.h" -namespace tdl +namespace triton { namespace driver { diff --git a/include/triton/driver/context.h b/include/triton/driver/context.h index 339a25c72..f1c6bca7a 100755 --- a/include/triton/driver/context.h +++ b/include/triton/driver/context.h @@ -26,7 +26,7 @@ #include "triton/driver/device.h" #include "triton/driver/handle.h" -namespace tdl +namespace triton { namespace driver { diff --git a/include/triton/driver/cublas.h b/include/triton/driver/cublas.h index 175b7f089..857709106 100755 --- a/include/triton/driver/cublas.h +++ b/include/triton/driver/cublas.h @@ -32,7 +32,7 @@ #include "triton/tools/bench.hpp" #include "triton/tools/collections.hpp" -namespace tdl +namespace triton { namespace driver { diff --git a/include/triton/driver/device.h b/include/triton/driver/device.h index 3be7ca04f..2263cffc6 100755 --- a/include/triton/driver/device.h +++ b/include/triton/driver/device.h @@ -26,7 +26,7 @@ #include "triton/driver/platform.h" #include "triton/driver/handle.h" -namespace tdl +namespace triton { 
namespace driver diff --git a/include/triton/driver/dispatch.h b/include/triton/driver/dispatch.h index 42ce6729f..aa1d412de 100755 --- a/include/triton/driver/dispatch.h +++ b/include/triton/driver/dispatch.h @@ -37,7 +37,7 @@ #include #include -namespace tdl +namespace triton { namespace driver { diff --git a/include/triton/driver/error.h b/include/triton/driver/error.h index b837dea92..dd695e8c8 100755 --- a/include/triton/driver/error.h +++ b/include/triton/driver/error.h @@ -27,7 +27,7 @@ #include "triton/driver/dispatch.h" -namespace tdl +namespace triton { namespace driver diff --git a/include/triton/driver/event.h b/include/triton/driver/event.h index 79fbbb56f..65f29beaf 100755 --- a/include/triton/driver/event.h +++ b/include/triton/driver/event.h @@ -25,7 +25,7 @@ #include "triton/driver/handle.h" -namespace tdl +namespace triton { namespace driver diff --git a/include/triton/driver/handle.h b/include/triton/driver/handle.h index cb8463584..19cdf62f8 100755 --- a/include/triton/driver/handle.h +++ b/include/triton/driver/handle.h @@ -29,7 +29,7 @@ #include #include "triton/driver/dispatch.h" -namespace tdl +namespace triton { namespace driver diff --git a/include/triton/driver/kernel.h b/include/triton/driver/kernel.h index 1fbf7935a..b29d7b1a4 100755 --- a/include/triton/driver/kernel.h +++ b/include/triton/driver/kernel.h @@ -28,7 +28,7 @@ #include -namespace tdl +namespace triton { namespace driver diff --git a/include/triton/driver/module.h b/include/triton/driver/module.h index 913e90853..3a964df38 100755 --- a/include/triton/driver/module.h +++ b/include/triton/driver/module.h @@ -28,7 +28,7 @@ #include "triton/driver/context.h" #include "triton/driver/buffer.h" -namespace tdl +namespace triton { namespace driver diff --git a/include/triton/driver/platform.h b/include/triton/driver/platform.h index d39c48e72..514e07625 100755 --- a/include/triton/driver/platform.h +++ b/include/triton/driver/platform.h @@ -28,7 +28,7 @@ #include "triton/driver/handle.h" -namespace tdl +namespace triton { namespace driver diff --git a/include/triton/driver/stream.h b/include/triton/driver/stream.h index 8e5783892..a94a33c54 100755 --- a/include/triton/driver/stream.h +++ b/include/triton/driver/stream.h @@ -29,7 +29,7 @@ #include "triton/driver/handle.h" #include "triton/driver/buffer.h" -namespace tdl +namespace triton { namespace driver diff --git a/include/triton/ir/basic_block.h b/include/triton/ir/basic_block.h index a01cff008..63de2a18b 100644 --- a/include/triton/ir/basic_block.h +++ b/include/triton/ir/basic_block.h @@ -5,7 +5,7 @@ #include #include "value.h" -namespace tdl{ +namespace triton{ namespace ir{ class context; diff --git a/include/triton/ir/builder.h b/include/triton/ir/builder.h index a6c0013fd..577daad4e 100644 --- a/include/triton/ir/builder.h +++ b/include/triton/ir/builder.h @@ -8,7 +8,7 @@ #include "basic_block.h" #include "type.h" -namespace tdl{ +namespace triton{ namespace ir{ class basic_block; diff --git a/include/triton/ir/constant.h b/include/triton/ir/constant.h index 132902d3a..11403c6dd 100644 --- a/include/triton/ir/constant.h +++ b/include/triton/ir/constant.h @@ -4,7 +4,7 @@ #include "value.h" #include -namespace tdl{ +namespace triton{ namespace ir{ class type; diff --git a/include/triton/ir/context.h b/include/triton/ir/context.h index d3018aa6f..1433d741d 100644 --- a/include/triton/ir/context.h +++ b/include/triton/ir/context.h @@ -4,7 +4,7 @@ #include #include "triton/ir/type.h" -namespace tdl{ +namespace triton{ namespace ir{ class type; diff 
--git a/include/triton/ir/context_impl.h b/include/triton/ir/context_impl.h index 623f58e40..54e109862 100644 --- a/include/triton/ir/context_impl.h +++ b/include/triton/ir/context_impl.h @@ -5,7 +5,7 @@ #include #include "triton/ir/type.h" -namespace tdl{ +namespace triton{ namespace ir{ class context; diff --git a/include/triton/ir/function.h b/include/triton/ir/function.h index 4f0762067..9b44d7b1a 100644 --- a/include/triton/ir/function.h +++ b/include/triton/ir/function.h @@ -5,7 +5,7 @@ #include "value.h" #include "constant.h" -namespace tdl{ +namespace triton{ namespace ir{ class function; diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index 93dc0b78e..54f313f88 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -6,7 +6,7 @@ #include "triton/ir/type.h" #include "llvm/IR/Instructions.h" -namespace tdl{ +namespace triton{ namespace ir{ class basic_block; diff --git a/include/triton/ir/module.h b/include/triton/ir/module.h index 41a729999..59c5d0a0a 100644 --- a/include/triton/ir/module.h +++ b/include/triton/ir/module.h @@ -8,7 +8,7 @@ #include #include "builder.h" -namespace tdl{ +namespace triton{ namespace ast{ diff --git a/include/triton/ir/print.h b/include/triton/ir/print.h index a31929282..c5a034ea3 100644 --- a/include/triton/ir/print.h +++ b/include/triton/ir/print.h @@ -4,7 +4,7 @@ #include "builder.h" -namespace tdl{ +namespace triton{ namespace ir{ class module; diff --git a/include/triton/ir/type.h b/include/triton/ir/type.h index 1cd74e259..1977ff47c 100644 --- a/include/triton/ir/type.h +++ b/include/triton/ir/type.h @@ -5,7 +5,7 @@ #include #include -namespace tdl{ +namespace triton{ namespace ir{ class context; diff --git a/include/triton/ir/value.h b/include/triton/ir/value.h index 4db869f52..08b26d715 100644 --- a/include/triton/ir/value.h +++ b/include/triton/ir/value.h @@ -6,7 +6,7 @@ #include #include -namespace tdl{ +namespace triton{ namespace ir{ class type; diff --git a/include/triton/tools/sys/getenv.hpp b/include/triton/tools/sys/getenv.hpp index e10664b6f..6e45ad5f2 100755 --- a/include/triton/tools/sys/getenv.hpp +++ b/include/triton/tools/sys/getenv.hpp @@ -25,7 +25,7 @@ #include #include -namespace tdl +namespace triton { namespace tools diff --git a/include/triton/tools/sys/mkdir.hpp b/include/triton/tools/sys/mkdir.hpp index 5d82a7535..e6c289535 100755 --- a/include/triton/tools/sys/mkdir.hpp +++ b/include/triton/tools/sys/mkdir.hpp @@ -31,7 +31,7 @@ #include #endif -namespace tdl +namespace triton { namespace tools diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 4d1fa5048..bff7e2ed6 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -10,7 +10,7 @@ #include -namespace tdl{ +namespace triton{ namespace ast{ diff --git a/lib/codegen/allocation.cpp b/lib/codegen/allocation.cpp index d396ec790..c4dd538f8 100644 --- a/lib/codegen/allocation.cpp +++ b/lib/codegen/allocation.cpp @@ -8,7 +8,7 @@ #include "triton/ir/function.h" #include "triton/ir/instructions.h" -namespace tdl{ +namespace triton{ namespace codegen{ unsigned allocation::get_num_bytes(ir::value *x) { diff --git a/lib/codegen/barriers.cpp b/lib/codegen/barriers.cpp index c40c08186..b84a945d8 100644 --- a/lib/codegen/barriers.cpp +++ b/lib/codegen/barriers.cpp @@ -7,7 +7,7 @@ #include "triton/ir/basic_block.h" #include "triton/ir/instructions.h" -namespace tdl { +namespace triton { namespace codegen{ diff --git a/lib/codegen/buffer_info.cpp b/lib/codegen/buffer_info.cpp index c8fe08df6..92e27fd23 
100644 --- a/lib/codegen/buffer_info.cpp +++ b/lib/codegen/buffer_info.cpp @@ -5,7 +5,7 @@ #include "triton/ir/instructions.h" #include "triton/ir/type.h" -namespace tdl { +namespace triton { namespace codegen{ diff --git a/lib/codegen/layout.cpp b/lib/codegen/layout.cpp index 040954caa..0722321b8 100644 --- a/lib/codegen/layout.cpp +++ b/lib/codegen/layout.cpp @@ -4,7 +4,7 @@ #include "triton/ir/basic_block.h" #include "triton/ir/instructions.h" -namespace tdl{ +namespace triton{ namespace codegen{ diff --git a/lib/codegen/liveness.cpp b/lib/codegen/liveness.cpp index f5e5a79c6..5e1987b9e 100644 --- a/lib/codegen/liveness.cpp +++ b/lib/codegen/liveness.cpp @@ -6,7 +6,7 @@ #include "triton/ir/instructions.h" #include "triton/ir/value.h" -namespace tdl{ +namespace triton{ namespace codegen{ diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 4cb2cf827..1b09ca4f9 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -13,7 +13,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/IR/BasicBlock.h" -namespace tdl{ +namespace triton{ namespace codegen{ using namespace llvm; diff --git a/lib/codegen/shared_copy.cpp b/lib/codegen/shared_copy.cpp index b7276bbfe..ce6f53fbe 100644 --- a/lib/codegen/shared_copy.cpp +++ b/lib/codegen/shared_copy.cpp @@ -6,7 +6,7 @@ #include "triton/ir/basic_block.h" #include "triton/ir/instructions.h" -namespace tdl { +namespace triton { namespace codegen{ diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 302676697..846cd985e 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -8,7 +8,7 @@ #include -namespace tdl{ +namespace triton{ namespace codegen{ void tune::add_constraint(node_t x, node_t y) { diff --git a/lib/codegen/vectorize.cpp b/lib/codegen/vectorize.cpp index 4c45928a2..c9757c6aa 100644 --- a/lib/codegen/vectorize.cpp +++ b/lib/codegen/vectorize.cpp @@ -5,7 +5,7 @@ #include "triton/ir/basic_block.h" #include "triton/ir/instructions.h" -namespace tdl { +namespace triton { namespace codegen{ diff --git a/lib/driver/backend.cpp b/lib/driver/backend.cpp index cdf72027d..88e8630e9 100755 --- a/lib/driver/backend.cpp +++ b/lib/driver/backend.cpp @@ -31,7 +31,7 @@ #include #include -namespace tdl +namespace triton { namespace driver diff --git a/lib/driver/buffer.cpp b/lib/driver/buffer.cpp index 069e8abe1..1ac650397 100755 --- a/lib/driver/buffer.cpp +++ b/lib/driver/buffer.cpp @@ -27,7 +27,7 @@ #include "triton/driver/dispatch.h" -namespace tdl +namespace triton { namespace driver diff --git a/lib/driver/context.cpp b/lib/driver/context.cpp index 8e365db81..ddaed2b91 100755 --- a/lib/driver/context.cpp +++ b/lib/driver/context.cpp @@ -29,7 +29,7 @@ #include "triton/tools/sys/getenv.hpp" #include "triton/tools/sys/mkdir.hpp" -namespace tdl +namespace triton { namespace driver diff --git a/lib/driver/device.cpp b/lib/driver/device.cpp index 22f640d7b..44f9e29bd 100755 --- a/lib/driver/device.cpp +++ b/lib/driver/device.cpp @@ -28,7 +28,7 @@ #include "triton/driver/device.h" -namespace tdl +namespace triton { namespace driver diff --git a/lib/driver/dispatch.cpp b/lib/driver/dispatch.cpp index d7d19727d..9e7a01330 100755 --- a/lib/driver/dispatch.cpp +++ b/lib/driver/dispatch.cpp @@ -24,7 +24,7 @@ #include "triton/driver/dispatch.h" #include "triton/driver/context.h" -namespace tdl +namespace triton { namespace driver { diff --git a/lib/driver/error.cpp b/lib/driver/error.cpp index 9759a1323..f3cce16d5 100755 --- a/lib/driver/error.cpp +++ b/lib/driver/error.cpp @@ -22,7 +22,7 @@ 
#include "triton/driver/error.h" -namespace tdl +namespace triton { namespace driver { diff --git a/lib/driver/event.cpp b/lib/driver/event.cpp index b8841dc0f..60397882b 100755 --- a/lib/driver/event.cpp +++ b/lib/driver/event.cpp @@ -22,7 +22,7 @@ #include "triton/driver/event.h" -namespace tdl +namespace triton { namespace driver { diff --git a/lib/driver/handle.cpp b/lib/driver/handle.cpp index cd7ee4195..090568919 100755 --- a/lib/driver/handle.cpp +++ b/lib/driver/handle.cpp @@ -24,7 +24,7 @@ #include #include "triton/driver/handle.h" -namespace tdl +namespace triton { namespace driver diff --git a/lib/driver/kernel.cpp b/lib/driver/kernel.cpp index 180e46cc7..6cd6dd2e7 100755 --- a/lib/driver/kernel.cpp +++ b/lib/driver/kernel.cpp @@ -26,7 +26,7 @@ #include "triton/driver/kernel.h" #include "triton/driver/buffer.h" -namespace tdl +namespace triton { namespace driver diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index d897489fe..2748742a7 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -29,7 +29,7 @@ #include "triton/tools/sys/getenv.hpp" -namespace tdl +namespace triton { namespace driver { diff --git a/lib/driver/platform.cpp b/lib/driver/platform.cpp index ada13de41..0fe23ccac 100755 --- a/lib/driver/platform.cpp +++ b/lib/driver/platform.cpp @@ -26,7 +26,7 @@ #include -namespace tdl +namespace triton { namespace driver { diff --git a/lib/driver/stream.cpp b/lib/driver/stream.cpp index 39996d473..d82b0437c 100755 --- a/lib/driver/stream.cpp +++ b/lib/driver/stream.cpp @@ -32,7 +32,7 @@ #include "triton/driver/kernel.h" #include "triton/driver/buffer.h" -namespace tdl +namespace triton { namespace driver diff --git a/lib/ir/basic_block.cpp b/lib/ir/basic_block.cpp index 15fa3188c..456f0f820 100644 --- a/lib/ir/basic_block.cpp +++ b/lib/ir/basic_block.cpp @@ -3,7 +3,7 @@ #include "triton/ir/type.h" #include "triton/ir/function.h" -namespace tdl { +namespace triton { namespace ir { class phi_node; diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index 467a7ef71..5d798acc8 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -6,7 +6,7 @@ #include "triton/ir/type.h" #include "llvm/IR/Instruction.h" -namespace tdl{ +namespace triton{ namespace ir{ builder::builder(context &ctx): diff --git a/lib/ir/constant.cpp b/lib/ir/constant.cpp index 5a6cfbc7a..929af2228 100644 --- a/lib/ir/constant.cpp +++ b/lib/ir/constant.cpp @@ -4,7 +4,7 @@ #include "triton/ir/context.h" #include "triton/ir/context_impl.h" -namespace tdl{ +namespace triton{ namespace ir{ diff --git a/lib/ir/context.cpp b/lib/ir/context.cpp index 33a852bda..e0a6976e0 100644 --- a/lib/ir/context.cpp +++ b/lib/ir/context.cpp @@ -2,7 +2,7 @@ #include "triton/ir/context.h" #include "triton/ir/type.h" -namespace tdl{ +namespace triton{ namespace ir{ //===----------------------------------------------------------------------===// diff --git a/lib/ir/function.cpp b/lib/ir/function.cpp index 75a465e8f..758fd8bc3 100644 --- a/lib/ir/function.cpp +++ b/lib/ir/function.cpp @@ -2,7 +2,7 @@ #include "triton/ir/type.h" #include "triton/ir/module.h" -namespace tdl{ +namespace triton{ namespace ir{ diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 93ba4602a..984ab9b4a 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -4,7 +4,7 @@ #include "triton/ir/constant.h" #include "triton/ir/type.h" -namespace tdl{ +namespace triton{ namespace ir{ //===----------------------------------------------------------------------===// diff --git a/lib/ir/module.cpp 
b/lib/ir/module.cpp index 28825fe4b..6e7ecd464 100644
--- a/lib/ir/module.cpp
+++ b/lib/ir/module.cpp
@@ -4,7 +4,7 @@
 #include "triton/ir/constant.h"
 #include "triton/ir/function.h"
 
-namespace tdl{
+namespace triton{
 namespace ir{
 
 /* Module */
diff --git a/lib/ir/print.cpp b/lib/ir/print.cpp
index 34829aec0..d1313bbcb 100644
--- a/lib/ir/print.cpp
+++ b/lib/ir/print.cpp
@@ -6,7 +6,7 @@
 #include "triton/ir/instructions.h"
 #include "triton/ir/print.h"
 
-namespace tdl{
+namespace triton{
 namespace ir{
 
 std::string get_name(ir::value *v, unsigned i) {
diff --git a/lib/ir/type.cpp b/lib/ir/type.cpp
index 10a8d582e..862039220 100644
--- a/lib/ir/type.cpp
+++ b/lib/ir/type.cpp
@@ -5,7 +5,7 @@
 #include "triton/ir/value.h"
 #include "triton/ir/constant.h"
 
-namespace tdl{
+namespace triton{
 namespace ir{
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/ir/value.cpp b/lib/ir/value.cpp
index 3827af220..b404e5eea 100644
--- a/lib/ir/value.cpp
+++ b/lib/ir/value.cpp
@@ -3,7 +3,7 @@
 #include 
 #include 
 
-namespace tdl{
+namespace triton{
 namespace ir{
 
 class type;

From 6dc88878aceacddc7dbca911b7b404685f63cdeb Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Sun, 24 Feb 2019 23:22:28 -0500
Subject: [PATCH 087/494] [code generation] bugfix in double-buffering

---
 examples/matrix.cpp         | 49 +++++++++++++------------------
 lib/codegen/allocation.cpp  |  2 +-
 lib/codegen/buffer_info.cpp |  2 +-
 lib/codegen/selection.cpp   | 21 +++++++++++-----
 lib/codegen/vectorize.cpp   |  4 ++-
 5 files changed, 37 insertions(+), 41 deletions(-)

diff --git a/examples/matrix.cpp b/examples/matrix.cpp
index 382f8a324..a04c1d95e 100644
--- a/examples/matrix.cpp
+++ b/examples/matrix.cpp
@@ -60,27 +60,11 @@ void matmul(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K, int32 bound){\
 int1 checkc1[TN] = ryc < N;\
 int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];\
 for(k = K; k > 0; k = k - TK){\
- int1 checka[TM, TK] = (k > 8);\
- int1 checkb[TN, TK] = (k > 8);\
- int1 checka0[TM];\
- int1 checka1[TK];\
- int1 checkb0[TN];\
- int1 checkb1[TK];\
 C = dot(a, b, C);\
 pa = pa + TK*M;\
 pb = pb + TK*K;\
- @checka a = *pa;\
- @checkb b = *pb;\
- if(k > 8)\
- continue;\
- checka0 = rxa < M;\
- checka1 = rka < k;\
- checkb0 = ryb < N;\
- checkb1 = rkb < k;\
- checka = checka0[:, newaxis] && checka1[newaxis, :];\
- checkb = checkb0[:, newaxis] && checkb1[newaxis, :];\
- @checka a = *pa;\
- @checkb b = *pb;\
+ a = *pa;\
+ b = *pb;\
 }\
 @checkc *pc = C;\
 }\
@@ -219,21 +203,22 @@ int main() {
 // tuning parameters
 tune.run(module);
+
 std::vector params = {
 // shapes
- 16, 16, 8,
+ 8, 8, 8,
 // a0
- 2, 8, 1,
+ 1, 8, 1,
 // b0
- 4, 4, 1,
+ 1, 8, 1,
 // c0
- 2, 8, 1,
+ 1, 8, 1,
 // c1
- 4, 4, 1,
+ 1, 4, 2,
 // a1
- 2, 4, 1,
+ 1, 4, 2,
 // b1
- 1, 8, 1
+ 1, 4, 2
 };
 
 // meta-parameters
 unsigned i = 0;
@@ -255,23 +240,22 @@ int main() {
 // run passes
- triton::ir::print(module, std::cout);
 buffer_info.run(module);
 shared.run(module);
 liveness.run(module);
 allocation.run();
 barriers.run(module);
+// triton::ir::print(module, std::cout);
 vectorize.run(module);
 selection.run(module, llvm_module);
 
 
 // llvm source
 llvm::legacy::PassManager manager;
- manager.add(llvm::createPrintModulePass(llvm::outs()));
+// manager.add(llvm::createPrintModulePass(llvm::outs()));
 manager.add(llvm::createVerifierPass(true));
 manager.run(llvm_module);
 
 std::string src = generate_machine_code(llvm_module, "nvptx64-nvidia-cuda", compute_data_layout(true, true));
- std::cout << src << std::endl;
 
 // compile machine code
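// [Editor's sketch -- index meanings assumed from the parameter list above]
// The nthreads change below stops hard-coding 32 threads per block and
// instead derives the CUDA block size from the autotuner's per-axis thread
// counts. Conceptually (helper name hypothetical):
#include <initializer_list>
#include <vector>
static unsigned block_size(const std::vector<unsigned> &params,
                           std::initializer_list<size_t> thread_idxs) {
  unsigned n = 1;
  for (size_t i : thread_idxs)
    n *= params[i];              // e.g. {10, 13, 11, 14} for a 2D tile
  return n;
}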
 CUdevice cu_device;
@@ -285,16 +269,17 @@ int main() {
 // execute machine code
 // Allocate buffers
 typedef float numeric_t;
- size_t M = 32, N = 32, K = 32;
+ size_t M = 128, N = 128, K = 128;
 size_t bound = 8;
 std::vector c(M*N);
 std::vector rc(M*N);
 std::vector a(M*K);
 std::vector b(K*N);
+ srand(0);
 for(size_t i = 0; i < a.size(); i++)
- a[i] = (float)rand() / RAND_MAX;
+ a[i] = (float)rand()/RAND_MAX;
 for(size_t i = 0; i < b.size(); i++)
- b[i] = (float)rand() / RAND_MAX;
+ b[i] = (float)rand()/RAND_MAX;
 for(size_t i = 0; i < c.size(); i++)
 c[i] = 0;
 CUdeviceptr d_a, d_b, d_c;
@@ -311,7 +296,7 @@ int main() {
 cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, cu_kernel);
 unsigned TM = context.p_impl->mp_constants_[0]->get_value();
 unsigned TN = context.p_impl->mp_constants_[1]->get_value();
- unsigned nthreads = 32;
+ unsigned nthreads = params[10]*params[13]*params[11]*params[14];
 checkCudaErrors(cuLaunchKernel(cu_kernel, (M + TM - 1)/TM, (N + TN - 1)/TN, 1, nthreads, 1, 1, 0, cu_stream, args, NULL));
 checkCudaErrors(cuStreamSynchronize(cu_stream));
 // Write back
diff --git a/lib/codegen/allocation.cpp b/lib/codegen/allocation.cpp
index c4dd538f8..9a3d5e39d 100644
--- a/lib/codegen/allocation.cpp
+++ b/lib/codegen/allocation.cpp
@@ -12,7 +12,7 @@ namespace triton{
 namespace codegen{
 
 unsigned allocation::get_num_bytes(ir::value *x) {
- unsigned result = x->get_type()->get_tile_bitwidth();
+ unsigned result = x->get_type()->get_tile_bitwidth() / 8;
 if(buffer_info_->is_double(x))
 result *= 2;
 return result;
diff --git a/lib/codegen/buffer_info.cpp b/lib/codegen/buffer_info.cpp
index 92e27fd23..4d2a3c676 100644
--- a/lib/codegen/buffer_info.cpp
+++ b/lib/codegen/buffer_info.cpp
@@ -16,7 +16,7 @@ bool buffer_info_pass::is_loop_latch(ir::phi_node *phi, ir::value *terminator){
 return br->get_true_dest() == phi->get_parent()
 || br->get_false_dest() == phi->get_parent();
 else if(auto *br = dynamic_cast(terminator))
- return br->get_dest() == phi->get_parent();
+ return false;
 else
 throw std::runtime_error("unreachable");
 }
diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp
index 1b09ca4f9..1e7f9c4e4 100644
--- a/lib/codegen/selection.cpp
+++ b/lib/codegen/selection.cpp
@@ -376,6 +376,13 @@ void selection::create_grids(std::vector &grids,
 grids.push_back(ref.second);
 }
 
+bool static inline has_phi_user(ir::value *v) {
+ for(ir::user *usr: v->get_users()){
+ if(dynamic_cast(usr))
+ return true;
+ }
+ return false;
+}
 
 void selection::create_tile(ir::value *v, IRBuilder<> &builder,
 const std::map& references,
 std::set &seen, Value *sh_mem_ptr) {
@@ -394,8 +401,9 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder,
 if(dynamic_cast(v) || (buffer_info_->is_double(v))){
 // shared copy
 PointerType *ptr_ty = ty->getPointerTo(sh_mem_ptr->getType()->getPointerAddressSpace());
+ // TODO - buffer info not up-to-date with references
 if(dynamic_cast(v)) {
- if(buffer_info_->get_reference(v) == nullptr){
+ if(!has_phi_user(v)){
 size_t offset = alloc_->get_offset(v);
 Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset));
 ptr = builder.CreateBitCast(ptr, ptr_ty);
@@ -417,7 +425,7 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder,
 // next pointer
 Value *pre_ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(alloc_->get_offset(phi)));
 pre_ptr = builder.CreateBitCast(pre_ptr, ptr->getType());
- Value *next_ptr = builder.CreateGEP(ptr, offset);
+ Value *next_ptr = builder.CreateGEP(ptr, offset, "next_ptr");
 tmap_.insert({phi, new shared_tile(ty,
shapes2, ptr, builder, offset)}); for(unsigned i = 0; i < phi->get_num_incoming(); i++) { ir::basic_block* inc_block = phi->get_incoming_block(i); @@ -720,12 +728,13 @@ void selection::run(ir::module &src, Module &dst){ PHINode *ptr = (PHINode*)((shared_tile*)tmap_.at(phi))->get_pointer(); PHINode *offset = (PHINode*)((shared_tile*)tmap_.at(phi))->get_offset(); for(unsigned n = 0; n < phi->get_num_incoming(); n++){ - ir::value *inc_val = phi->get_incoming_value(n); - ir::basic_block *inc_block = phi->get_incoming_block(n); + ir::basic_block* inc_block = phi->get_incoming_block(n); + ir::value* inc_val = phi->get_incoming_value(n); + ir::value* terminator = inc_block->get_inst_list().back(); BasicBlock *llvm_inc_block = last_block.at(inc_block); shared_tile *inc_shared = (shared_tile*)tmap_.at(inc_val); - GetElementPtrInst *inc_ptr = dyn_cast(inc_shared->get_pointer()); - if(inc_ptr && ptr == inc_ptr->getPointerOperand()){ + bool is_loop_latch = buffer_info_->is_loop_latch(phi, terminator); + if(is_loop_latch){ dst_builder.SetInsertPoint(llvm_inc_block->getTerminator()); Value *next_offset = dst_builder.CreateNeg(offset); offset->addIncoming(next_offset, llvm_inc_block); diff --git a/lib/codegen/vectorize.cpp b/lib/codegen/vectorize.cpp index c9757c6aa..57c2142c9 100644 --- a/lib/codegen/vectorize.cpp +++ b/lib/codegen/vectorize.cpp @@ -15,8 +15,10 @@ void vectorize::run(ir::module &mod) { for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i: block->get_inst_list()) if(dynamic_cast(i)){ - builder.set_insert_point(i); ir::value *x = i->get_operand(0); + if(*params_->get_param(x, "p0.d0") == 1) + continue; + builder.set_insert_point(i); ir::instruction *rx = (ir::instruction*)builder.create_vectorize(x); x->replace_all_uses_with(rx); rx->set_operand(0, x); From 338f2918351f26a2d0fc05d4e4d0f302c9b4d0f4 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 25 Feb 2019 11:41:45 -0500 Subject: [PATCH 088/494] [code generation] now ordered iterations across distributed tiles --- include/triton/codegen/selection.h | 6 ++++-- lib/codegen/selection.cpp | 16 ++++++++-------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index 13bc3b6b6..c73f65c8e 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -69,8 +69,9 @@ private: class distributed_tile: public tile{ typedef std::vector axes_t; + typedef std::vector ordered_indices_vec_t; typedef std::map indices_map_t; - typedef std::vector values_t; + typedef std::map values_map_t; private: void init_indices(); @@ -87,7 +88,8 @@ public: private: axes_t axes_; indices_map_t indices_; - values_t values_; + values_map_t values_; + ordered_indices_vec_t ordered_indices_; size_t vector_size_; llvm::IRBuilder<> &builder_; }; diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 1e7f9c4e4..031c45548 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -27,6 +27,8 @@ void distributed_tile::init_indices() { for(size_t d = 0; d < id.size(); d++) current.push_back(axes_[d].values[id[d]]); indices_[current] = indices_.size(); + values_[current] = UndefValue::get(ty_); + ordered_indices_.push_back(current); id[0]++; while(id[k] == axes_[k].values.size()){ if(k == id.size() - 1) @@ -48,16 +50,14 @@ distributed_tile::distributed_tile(Type *ty, const shapes_t &shapes, const axes_ : tile(make_vector_ty(ty, vectorize?axes[0].contiguous:1), shapes), axes_(axes), builder_(builder) { vector_size_ 
= vectorize?ty_->getVectorNumElements():1; init_indices(); - for(size_t i = 0; i < indices_.size(); i++) - values_.push_back(UndefValue::get(ty_)); } void distributed_tile::set_value(indices_t idx, Value *v) { - values_[indices_[idx]] = v; + values_[idx] = v; } Value* distributed_tile::get_value(indices_t idx) { - return values_[indices_[idx]]; + return values_[idx]; } unsigned distributed_tile::get_linear_index(indices_t idx) { @@ -65,9 +65,9 @@ unsigned distributed_tile::get_linear_index(indices_t idx) { } void distributed_tile::for_each(std::function fn) { - for(auto &idx: indices_) - if(idx.second % vector_size_ == 0) - fn(idx.first); + for(unsigned i = 0; i < ordered_indices_.size(); i++) + if(i % vector_size_ == 0) + fn(ordered_indices_[i]); } /* Shared Tile */ @@ -600,7 +600,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & unsigned id = linear / vector_size; if(linear % vector_size == 0) packets[id] = result->get_value(idx); - packets[id] = builder.CreateInsertElement(packets[id], in->get_value(idx), linear % vector_size); + packets[id] = builder.CreateInsertElement(packets.at(id), in->get_value(idx), linear % vector_size); }); result->for_each([&](indices_t idx){ unsigned linear = in->get_linear_index(idx); From 68dea75aa04b2a3733d70401679198e69bbd1582 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 26 Feb 2019 12:36:37 -0500 Subject: [PATCH 089/494] [syntax tree] more fixes in lowering phi nodes --- examples/matrix.cpp | 49 +++++++++++++------------ include/triton/ast/ast.h | 6 +++- include/triton/ast/parser.y | 3 +- include/triton/ast/scanner.l | 1 + include/triton/codegen/selection.h | 2 ++ include/triton/ir/function.h | 15 ++++++-- include/triton/ir/module.h | 2 +- lib/ast/lowering.cpp | 15 ++++++++ lib/codegen/selection.cpp | 58 ++++++++++++++++++++++++------ lib/ir/module.cpp | 8 ++--- 10 files changed, 118 insertions(+), 41 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index a04c1d95e..2846ab844 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -42,7 +42,7 @@ const tunable int32 TM;\ const tunable int32 TN;\ const tunable int32 TK;\ \ -void matmul(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K, int32 bound){\ +void matmul(restrict readonly fp32 *a, restrict readonly fp32 *b, fp32 *c, int32 M, int32 N, int32 K, int32 bound){\ int32 rxa[TM] = get_global_range[TM](0);\ int32 ryb[TN] = get_global_range[TN](1);\ int32 rka[TK] = 0 ... 
TK;\ @@ -56,16 +56,19 @@ void matmul(fp32 *a, fp32 *b, fp32 *c, int32 M, int32 N, int32 K, int32 bound){\ fp32* pc[TM, TN] = c + rxc[:, newaxis] + ryc[newaxis, :]*M;\ fp32 a[TM, TK] = *pa;\ fp32 b[TN, TK] = *pb;\ - int1 checkc0[TM] = rxc < M;\ - int1 checkc1[TN] = ryc < N;\ - int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];\ - for(k = K; k > 0; k = k - TK){\ - C = dot(a, b, C);\ - pa = pa + TK*M;\ - pb = pb + TK*K;\ - a = *pa;\ - b = *pb;\ - }\ + int1 checkc0[TM];\ + int1 checkc1[TN];\ + int1 checkc[TM, TN];\ + for(k = K; k > 0; k = k - TK){\ + C = dot(a, b, C);\ + pa = pa + TK*M;\ + pb = pb + TK*K;\ + a = *pa;\ + b = *pb;\ + }\ + checkc0 = rxc < M;\ + checkc1 = ryc < N;\ + checkc = checkc0[:, newaxis] && checkc1[newaxis, :];\ @checkc *pc = C;\ }\ "; @@ -203,23 +206,23 @@ int main() { // tuning parameters tune.run(module); - std::vector params = { // shapes - 8, 8, 8, + 16, 16, 8, // a0 - 1, 8, 1, + 2, 8, 1, // b0 - 1, 8, 1, + 4, 4, 1, // c0 - 1, 8, 1, + 2, 8, 1, // c1 - 1, 4, 2, + 4, 4, 1, // a1 - 1, 4, 2, + 2, 4, 1, // b1 - 1, 4, 2 + 1, 8, 1 }; + // meta-parameters unsigned i = 0; context.p_impl->mp_constants_[0]->set_value(params[0]); @@ -240,12 +243,13 @@ int main() { // run passes + triton::ir::print(module, std::cout); + exit(EXIT_FAILURE); buffer_info.run(module); shared.run(module); liveness.run(module); allocation.run(); barriers.run(module); -// triton::ir::print(module, std::cout); vectorize.run(module); selection.run(module, llvm_module); @@ -256,6 +260,7 @@ int main() { manager.run(llvm_module); std::string src = generate_machine_code(llvm_module, "nvptx64-nvidia-cuda", compute_data_layout(true, true)); + std::cout << src << std::endl; // compile machine code CUdevice cu_device; @@ -277,9 +282,9 @@ int main() { std::vector b(K*N); srand(0); for(size_t i = 0; i < a.size(); i++) - a[i] = (float)rand()/RAND_MAX; + a[i] = 1; for(size_t i = 0; i < b.size(); i++) - b[i] = (float)rand()/RAND_MAX; + b[i] = 1; for(size_t i = 0; i < c.size(); i++) c[i] = 0; CUdeviceptr d_a, d_b, d_c; diff --git a/include/triton/ast/ast.h b/include/triton/ast/ast.h index b2d07ddd2..ab96bfa36 100644 --- a/include/triton/ast/ast.h +++ b/include/triton/ast/ast.h @@ -60,7 +60,9 @@ enum STORAGE_SPEC_T{ CONST_T, TUNABLE_T, KERNEL_T, - READONLY_T, WRITEONLY_T, + RESTRICT_T, + READONLY_T, + WRITEONLY_T }; class pointer; @@ -505,6 +507,8 @@ public: : declarator(id), args_((list*)args) { } void bind_parameters(ir::module *mod, ir::function *fn) const; + unsigned get_num_args() const { return args_->values().size(); } + parameter* get_arg(unsigned i) const { return args_->values().at(i); } public: const list* args_; diff --git a/include/triton/ast/parser.y b/include/triton/ast/parser.y index cf3c011a2..c12c7a662 100644 --- a/include/triton/ast/parser.y +++ b/include/triton/ast/parser.y @@ -46,7 +46,7 @@ STORAGE_SPEC_T get_storage_spec(node *op) { return ((token*)op)->storage_spec;} %} %token IDENTIFIER CONSTANT STRING_LITERAL -%token TUNABLE KERNEL READONLY WRITEONLY CONST +%token TUNABLE KERNEL RESTRICT READONLY WRITEONLY CONST %token PTR_OP INC_OP DEC_OP LEFT_OP RIGHT_OP LE_OP GE_OP EQ_OP NE_OP %token AND_OP OR_OP MUL_ASSIGN DIV_ASSIGN MOD_ASSIGN ADD_ASSIGN %token SUB_ASSIGN LEFT_ASSIGN RIGHT_ASSIGN AND_ASSIGN @@ -363,6 +363,7 @@ storage_class_specifier : CONST { $$ = new token(CONST_T); } | TUNABLE { $$ = new token(TUNABLE_T); } | KERNEL { $$ = new token(KERNEL_T); } + | RESTRICT { $$ = new token(RESTRICT_T); } | READONLY { $$ = new token(READONLY_T); } | WRITEONLY { $$ = new token(WRITEONLY_T); 
} ; diff --git a/include/triton/ast/scanner.l b/include/triton/ast/scanner.l index 4f32c1f69..4c0635dbc 100644 --- a/include/triton/ast/scanner.l +++ b/include/triton/ast/scanner.l @@ -19,6 +19,7 @@ int comment(); "const" { count(); return(CONST); } "tunable" { count(); return(TUNABLE); } "kernel" { count(); return(KERNEL); } +"restrict" { count(); return(RESTRICT); } "readonly" { count(); return(READONLY); } "writeonly" { count(); return(WRITEONLY); } "@" { count(); return(AT); } diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index c73f65c8e..b0ebc8c5e 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -55,6 +55,7 @@ private: public: shared_tile(llvm::Type* ty, const shapes_t &shapes, llvm::Value* ptr, llvm::IRBuilder<> &builder, llvm::Value* offset = nullptr); + void set_vector_size(unsigned vector_size); void set_value(indices_t, llvm::Value *); llvm::Value* get_value(indices_t idx); llvm::Value* get_pointer() { return ptr_; } @@ -65,6 +66,7 @@ private: llvm::Value *offset_; llvm::IRBuilder<> &builder_; std::map ptr_cache_; + unsigned vector_size_; }; class distributed_tile: public tile{ diff --git a/include/triton/ir/function.h b/include/triton/ir/function.h index 9b44d7b1a..cc00b4a92 100644 --- a/include/triton/ir/function.h +++ b/include/triton/ir/function.h @@ -2,6 +2,7 @@ #define TDL_INCLUDE_IR_FUNCTION_H #include +#include #include "value.h" #include "constant.h" @@ -27,8 +28,10 @@ private: }; /* Attribute */ -class attribute { - +enum attribute_t { + readonly, + writeonly, + noalias }; /* Function */ @@ -41,6 +44,8 @@ class function: public global_object{ typedef blocks_t::iterator block_iterator; typedef blocks_t::const_iterator const_block_iterator; + typedef std::map> attr_map_t; + private: function(function_type *ty, linkage_types_t linkage, const std::string &name = "", module *parent = nullptr); @@ -49,6 +54,7 @@ public: // accessors const args_t &args() { return args_; } function_type* get_fn_type() { return fn_ty_; } + // factory methods static function *create(function_type *ty, linkage_types_t linkage, const std::string &name, module *mod); @@ -56,12 +62,17 @@ public: const blocks_t &blocks() { return blocks_; } void insert_block(basic_block* block, basic_block *next = nullptr); + // attributes + void add_attr(unsigned arg_id, attribute_t attr) { attrs_[arg_id].insert(attr); } + const attr_map_t &attrs() { return attrs_; } + private: module *parent_; bool init_; function_type *fn_ty_; args_t args_; blocks_t blocks_; + attr_map_t attrs_; }; } diff --git a/include/triton/ir/module.h b/include/triton/ir/module.h index 59c5d0a0a..633e356f2 100644 --- a/include/triton/ir/module.h +++ b/include/triton/ir/module.h @@ -44,7 +44,7 @@ public: private: phi_node *make_phi(type *ty, unsigned num_values, basic_block *block); - value *try_remove_trivial_phis(ir::phi_node *&phi, value **pre_user); + value *try_remove_trivial_phis(ir::phi_node *&phi); value *add_phi_operands(const std::string& name, phi_node *&phi); value *get_value_recursive(const std::string& name, basic_block *block); void push_function(function *fn) { functions_.push_back(fn); } diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index bff7e2ed6..49d173f1b 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -255,10 +255,25 @@ ir::type* function::type_impl(ir::module* mod, ir::type *type) const{ } /* Function definition */ +ir::attribute_t get_ir_attr(STORAGE_SPEC_T spec){ + switch(spec){ + case RESTRICT_T: return 
ir::noalias; + case READONLY_T: return ir::readonly; + case WRITEONLY_T: return ir::writeonly; + default: throw std::runtime_error("cannot convert storage specifier to IR function attribute"); + } +} + ir::value* function_definition::codegen(ir::module *mod) const{ ir::function_type *prototype = (ir::function_type*)header_->type(mod, spec_->type(mod)); const std::string &name = header_->id()->name(); ir::function *fn = mod->get_or_insert_function(name, prototype); + for(unsigned i = 0; i < header_->get_num_args(); i++){ + parameter *param = header_->get_arg(i); + std::vector storage = param->storage(); + for(STORAGE_SPEC_T spec: storage) + fn->add_attr(1 + i, get_ir_attr(spec)); + } header_->bind_parameters(mod, fn); ir::basic_block *entry = ir::basic_block::create(mod->get_context(), "entry", fn); mod->seal_block(entry); diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 031c45548..3114a05db 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -12,6 +12,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Attributes.h" namespace triton{ namespace codegen{ @@ -125,7 +126,7 @@ Value* shared_tile::shared_offset(indices_t idx) { } shared_tile::shared_tile(Type *ty, const shapes_t &shapes, Value *ptr, llvm::IRBuilder<> &builder, Value *offset): - tile(ty, shapes), ptr_(ptr), builder_(builder), offset_(offset) { + tile(ty, shapes), ptr_(ptr), builder_(builder), offset_(offset), vector_size_(1){ } void shared_tile::set_value(indices_t idx, Value *value) { @@ -135,18 +136,33 @@ void shared_tile::set_value(indices_t idx, Value *value) { builder_.CreateStore(value, ptr); } +void shared_tile::set_vector_size(unsigned vector_size) { + vector_size_ = vector_size; +} + Value* shared_tile::get_value(indices_t idx) { indices_t non_cst_idx, cst_idx; extract_constant(idx, non_cst_idx, cst_idx); Value *&base_ptr = ptr_cache_[non_cst_idx]; if(base_ptr == nullptr){ base_ptr = builder_.CreateGEP(ptr_, shared_offset(non_cst_idx)); -// Type *vec_ty = VectorType::get(base_ptr->getType()->getPointerElementType(), vec_); -// Type *vec_ptr_ty = PointerType::get(vec_ty, base_ptr->getType()->getPointerElementType()); -// base_ptr = builder_.CreateBitCast(base_ptr, vec_ptr_ty); + if(vector_size_ > 1){ + Type *vec_ty = VectorType::get(base_ptr->getType()->getPointerElementType(), vector_size_); + Type *vec_ptr_ty = PointerType::get(vec_ty, base_ptr->getType()->getPointerAddressSpace()); + base_ptr = builder_.CreateBitCast(base_ptr, vec_ptr_ty); + } } - Value *ptr = builder_.CreateGEP(base_ptr, shared_offset(cst_idx)); - return builder_.CreateLoad(ptr); + Value *offset = shared_offset(cst_idx); + Value *div = offset; + if(vector_size_ > 1) + div = builder_.CreateUDiv(offset, builder_.getInt32(vector_size_)); + Value *ptr = builder_.CreateGEP(base_ptr, div); + Value *result = builder_.CreateLoad(ptr); + if(vector_size_ > 1) { + Value *rem = builder_.CreateURem(offset, builder_.getInt32(vector_size_)); + result = builder_.CreateExtractElement(result, rem); + } + return result; } /* convert ir::type to Type */ @@ -623,15 +639,20 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & ir::value *A = ins->get_operand(0); ir::value *B = ins->get_operand(1); ir::value *C = ins->get_operand(2); + shared_tile *TA = (shared_tile*)tmap_.at(A); + shared_tile *TB = (shared_tile*)tmap_.at(B); + distributed_tile *TC = (distributed_tile*)tmap_.at(C); + 
TA->set_vector_size(TC->axis(0).contiguous); + TB->set_vector_size(TC->axis(1).contiguous); Function *f_mul_add = Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {llvm_type(C->get_type()->get_scalar_ty(), ctx)}); result->for_each([&](indices_t idx){ - Value *res = tmap_.at(C)->get_value(idx); + Value *res = TC->get_value(idx); unsigned NK = A->get_type()->get_tile_shapes()[1]->get_value(); for(unsigned K = 0; K < NK; ++K){ indices_t a_idx = {idx[0], builder.getInt32(K)}; indices_t b_idx = {idx[1], builder.getInt32(K)}; - Value *a = tmap_.at(A)->get_value(a_idx); - Value *b = tmap_.at(B)->get_value(b_idx); + Value *a = TA->get_value(a_idx); + Value *b = TB->get_value(b_idx); res = builder.CreateCall(f_mul_add, {a, b, res}); } result->set_value(idx, res); @@ -660,10 +681,20 @@ void selection::lower_instruction(ir::instruction *src, IRBuilder<> &builder) { } else { Instruction *i = (Instruction*)llvm_value(src, builder); + std::cout << "instruction: " << src->get_name() << " " << src->has_tile_result_or_op() << std::endl; vmap_[src] = i; } } +inline llvm::Attribute::AttrKind llvm_attr(ir::attribute_t attr) { + switch(attr){ + case ir::noalias: return llvm::Attribute::NoAlias; + case ir::readonly: return llvm::Attribute::ReadOnly; + case ir::writeonly: return llvm::Attribute::WriteOnly; + default: throw std::runtime_error("cannot convert ir::attribute_t to llvm::Attribute"); + } +} + void selection::run(ir::module &src, Module &dst){ vmap_.clear(); LLVMContext &dst_ctx = dst.getContext(); @@ -675,7 +706,13 @@ void selection::run(ir::module &src, Module &dst){ // create LLVM function FunctionType *fn_ty = (FunctionType*)llvm_type(fn->get_fn_type(), dst_ctx); Function *dst_fn = Function::Create(fn_ty, Function::ExternalLinkage, fn->get_name(), &dst); - // Set metadata + // set attributes + for(auto attr_pair: fn->attrs()){ + unsigned id = attr_pair.first; + for(ir::attribute_t attr: attr_pair.second) + dst_fn->addAttribute(id, llvm_attr(attr)); + } + // set metadata llvm::Metadata *md_args[] = { llvm::ValueAsMetadata::get(dst_fn), llvm::MDString::get(dst_ctx, "kernel"), @@ -760,6 +797,7 @@ void selection::run(ir::module &src, Module &dst){ }); } else { + std::cout << phi->get_name() << " " << inc_val->get_name() << std::endl; PHINode *llvm_phi = (PHINode*)vmap_.at(phi); Value *llvm_inc_val = vmap_.at(inc_val); llvm_phi->addIncoming(llvm_inc_val, llvm_inc_block); diff --git a/lib/ir/module.cpp b/lib/ir/module.cpp index 6e7ecd464..163894890 100644 --- a/lib/ir/module.cpp +++ b/lib/ir/module.cpp @@ -60,7 +60,7 @@ ir::phi_node* module::make_phi(ir::type *ty, unsigned num_values, ir::basic_bloc return res; } -ir::value *module::try_remove_trivial_phis(ir::phi_node *&phi, ir::value** pre_user){ +ir::value *module::try_remove_trivial_phis(ir::phi_node *&phi){ // find non-self references std::set non_self_ref; std::copy_if(phi->ops().begin(), phi->ops().end(), std::inserter(non_self_ref, non_self_ref.begin()), @@ -76,7 +76,7 @@ ir::value *module::try_remove_trivial_phis(ir::phi_node *&phi, ir::value** pre_u for(ir::user* u: users) if(auto *uphi = dynamic_cast(u)) if(uphi != phi) - try_remove_trivial_phis(uphi, &same); + try_remove_trivial_phis(uphi); return same; } @@ -113,7 +113,7 @@ ir::value *module::get_value_recursive(const std::string& name, ir::basic_block result = add_phi_operands(name, (ir::phi_node*&)result); } if(auto *phi = dynamic_cast(result)) - result = try_remove_trivial_phis(phi, nullptr); + result = try_remove_trivial_phis(phi); set_value(name, block, result); return result; } @@ 
-155,7 +155,7 @@ ir::type *module::get_type(const std::string &name) { void module::seal_block(ir::basic_block *block){ for(auto &x: incomplete_phis_[block]){ add_phi_operands(x.first, x.second); - try_remove_trivial_phis(x.second, nullptr); + set_value(x.first, try_remove_trivial_phis(x.second)); } sealed_blocks_.insert(block); incomplete_phis_[block].clear(); From 017702590bcf8cd6e52557849afeb055df6c38d2 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 26 Feb 2019 14:20:58 -0500 Subject: [PATCH 090/494] [intermediate representation] added ternary_inst --- examples/matrix.cpp | 30 ++++++++++++++++++++++++------ include/triton/ast/parser.y | 2 +- include/triton/ir/builder.h | 2 ++ include/triton/ir/instructions.h | 17 +++++++++++++++++ lib/codegen/selection.cpp | 2 -- lib/ir/builder.cpp | 10 ++++++++++ lib/ir/instructions.cpp | 15 +++++++++++++++ lib/ir/module.cpp | 3 ++- 8 files changed, 71 insertions(+), 10 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 2846ab844..4129990f2 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -60,11 +60,27 @@ void matmul(restrict readonly fp32 *a, restrict readonly fp32 *b, fp32 *c, int32 int1 checkc1[TN];\ int1 checkc[TM, TN];\ for(k = K; k > 0; k = k - TK){\ + int1 checka[TM, TK] = (k > bound);\ + int1 checkb[TN, TK] = (k > bound);\ + int1 checka0[TM];\ + int1 checka1[TK];\ + int1 checkb0[TN];\ + int1 checkb1[TK];\ C = dot(a, b, C);\ pa = pa + TK*M;\ pb = pb + TK*K;\ - a = *pa;\ - b = *pb;\ + @checka a = *pa;\ + @checkb b = *pb;\ + if(k > bound)\ + continue;\ + checka0 = rxa < M;\ + checka1 = rka < k;\ + checkb0 = ryb < N;\ + checkb1 = rkb < k;\ + checka = checka0[:, newaxis] && checka1[newaxis, :];\ + checkb = checkb0[:, newaxis] && checkb1[newaxis, :];\ + @checka a = *pa;\ + @checkb b = *pb;\ }\ checkc0 = rxc < M;\ checkc1 = ryc < N;\ @@ -243,14 +259,13 @@ int main() { // run passes - triton::ir::print(module, std::cout); - exit(EXIT_FAILURE); buffer_info.run(module); shared.run(module); liveness.run(module); allocation.run(); barriers.run(module); vectorize.run(module); + triton::ir::print(module, std::cout); selection.run(module, llvm_module); // llvm source @@ -260,7 +275,7 @@ int main() { manager.run(llvm_module); std::string src = generate_machine_code(llvm_module, "nvptx64-nvidia-cuda", compute_data_layout(true, true)); - std::cout << src << std::endl; +// std::cout << src << std::endl; // compile machine code CUdevice cu_device; @@ -308,6 +323,9 @@ int main() { checkCudaErrors(cuMemcpyDtoH(c.data(), d_c, sizeof(numeric_t) * c.size())); simple_gemm(rc, a, b, M, N, K); for(size_t i = 0; i < M*N; i++) - if(std::abs(c[i] - rc[i])/std::max(c[i], rc[i]) > 1e-4) + if(std::abs(c[i] - rc[i])/std::max(c[i], rc[i]) > 1e-4){ std::cout << i << " " << c[i] << " " << rc[i] << std::endl; + exit(EXIT_FAILURE); + } + std::cout << "Pass!" << std::endl; } diff --git a/include/triton/ast/parser.y b/include/triton/ast/parser.y index c12c7a662..a7d46e5a7 100644 --- a/include/triton/ast/parser.y +++ b/include/triton/ast/parser.y @@ -226,7 +226,7 @@ logical_or_expression /* Conditional */ conditional_expression : logical_or_expression { $$ = $1; } - | logical_or_expression '?' conditional_expression ':' conditional_expression { $$ = new conditional_expression($1, $2, $3); } + | logical_or_expression '?' 
conditional_expression ':' conditional_expression { $$ = new conditional_expression($1, $3, $5); } ; /* Assignment */ diff --git a/include/triton/ir/builder.h b/include/triton/ir/builder.h index 577daad4e..08bd1ce12 100644 --- a/include/triton/ir/builder.h +++ b/include/triton/ir/builder.h @@ -50,6 +50,8 @@ public: value* create_br(basic_block *dest); value* create_cond_br(value *cond, basic_block* if_dest, basic_block* else_dest); value* create_ret_void(); + // Tile-level control flow + value *create_ternary(value *cond, value *true_value, value *false_value, const std::string &name = ""); // Cast instructions value *create_cast(cast_inst::op_t op, value *v, type *dst_ty, const std::string &name = ""); value* create_si_to_fp(value *src, type *dst_ty, const std::string &name = ""); diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index 54f313f88..74e71db0a 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -289,6 +289,23 @@ private: public: basic_block *get_dest() { return (basic_block*)get_operand(0); } }; + +// ternary +class ternary_inst: public instruction { +private: + std::string repr_impl() const { return "ternary"; } + ternary_inst(value *cond, value *true_value, value *false_value, + const std::string &name, instruction *next); + +public: + value *get_cond() { return get_operand(0); } + value *get_true_value() { return get_operand(1); } + value *get_false_value() { return get_operand(2); } + static ternary_inst* create(value *cond, value *true_value, value *false_value, + const std::string &name = "", instruction *next = nullptr); + +}; + //===----------------------------------------------------------------------===// // getelementptr_inst classes //===----------------------------------------------------------------------===// diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 3114a05db..b1f88a8b1 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -681,7 +681,6 @@ void selection::lower_instruction(ir::instruction *src, IRBuilder<> &builder) { } else { Instruction *i = (Instruction*)llvm_value(src, builder); - std::cout << "instruction: " << src->get_name() << " " << src->has_tile_result_or_op() << std::endl; vmap_[src] = i; } } @@ -797,7 +796,6 @@ void selection::run(ir::module &src, Module &dst){ }); } else { - std::cout << phi->get_name() << " " << inc_val->get_name() << std::endl; PHINode *llvm_phi = (PHINode*)vmap_.at(phi); Value *llvm_inc_val = vmap_.at(inc_val); llvm_phi->addIncoming(llvm_inc_val, llvm_inc_block); diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index 5d798acc8..023ccb999 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -67,6 +67,16 @@ value *builder::create_ret_void() { return insert(return_inst::create(ctx_)); } + +//===----------------------------------------------------------------------===// +// tile-level control-flow instructions +//===----------------------------------------------------------------------===// + +value *builder::create_ternary(value *cond, value *true_value, value *false_value, const std::string &name){ + return insert(ternary_inst::create(cond, true_value, false_value, name)); +} + + //===----------------------------------------------------------------------===// // cast instructions //===----------------------------------------------------------------------===// diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 984ab9b4a..47273bf63 100644 --- a/lib/ir/instructions.cpp +++ 
b/lib/ir/instructions.cpp @@ -311,6 +311,21 @@ cond_branch_inst::cond_branch_inst(basic_block *if_dst, basic_block *else_dst, v set_operand(2, cond); } +// ternary_inst +ternary_inst::ternary_inst(value *cond, value *true_value, value *false_value, const std::string &name, instruction *next) + : instruction(true_value->get_type(), 3) { + assert(true_value->get_type() == false_value->get_type()); + set_operand(0, cond); + set_operand(1, true_value); + set_operand(2, false_value); +} + +ternary_inst *ternary_inst::create(value *cond, value *true_value, value *false_value, + const std::string &name, instruction *next) { + return new ternary_inst(cond, true_value, false_value, name, next); +} + + //===----------------------------------------------------------------------===// // getelementptr_inst classes //===----------------------------------------------------------------------===// diff --git a/lib/ir/module.cpp b/lib/ir/module.cpp index 163894890..e5764d010 100644 --- a/lib/ir/module.cpp +++ b/lib/ir/module.cpp @@ -155,7 +155,8 @@ ir::type *module::get_type(const std::string &name) { void module::seal_block(ir::basic_block *block){ for(auto &x: incomplete_phis_[block]){ add_phi_operands(x.first, x.second); - set_value(x.first, try_remove_trivial_phis(x.second)); + if(get_value(x.first) == x.second) + set_value(x.first, try_remove_trivial_phis(x.second)); } sealed_blocks_.insert(block); incomplete_phis_[block].clear(); From 36acf22fd3c83dacc760f2a7ade9a9bc8865cf36 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 28 Feb 2019 23:46:11 -0500 Subject: [PATCH 091/494] better masking --- examples/matrix.cpp | 8 ++- include/triton/ir/builder.h | 3 +- include/triton/ir/instructions.h | 45 +++++++++++++- lib/ast/lowering.cpp | 28 +++++++-- lib/codegen/selection.cpp | 102 +++++++++++++++++++------------ lib/codegen/tune.cpp | 13 ++-- lib/ir/builder.cpp | 8 ++- lib/ir/instructions.cpp | 70 ++++++++++++++------- lib/ir/print.cpp | 12 +++- 9 files changed, 203 insertions(+), 86 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 4129990f2..e023dc2ee 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -209,6 +209,7 @@ int main() { llvm::Module llvm_module("matmul", llvm_context); + triton::ir::print(module, std::cout); // create passes triton::codegen::buffer_info_pass buffer_info; @@ -220,6 +221,7 @@ int main() { triton::codegen::vectorize vectorize(&tune); triton::codegen::selection selection(&allocation, &tune, &buffer_info); + // tuning parameters tune.run(module); std::vector params = { @@ -246,6 +248,9 @@ int main() { context.p_impl->mp_constants_[2]->set_value(params[2]); for(unsigned *x: tune.get_params(module)) *x = params[3 + i++]; + + + // constraints std::map> errors; tune.check_constraints(module, errors); @@ -265,12 +270,11 @@ int main() { allocation.run(); barriers.run(module); vectorize.run(module); - triton::ir::print(module, std::cout); selection.run(module, llvm_module); // llvm source llvm::legacy::PassManager manager; -// manager.add(llvm::createPrintModulePass(llvm::outs())); + manager.add(llvm::createPrintModulePass(llvm::outs())); manager.add(llvm::createVerifierPass(true)); manager.run(llvm_module); diff --git a/include/triton/ir/builder.h b/include/triton/ir/builder.h index 08bd1ce12..509ae8e47 100644 --- a/include/triton/ir/builder.h +++ b/include/triton/ir/builder.h @@ -51,7 +51,8 @@ public: value* create_cond_br(value *cond, basic_block* if_dest, basic_block* else_dest); value* create_ret_void(); // Tile-level control flow - value 
*create_ternary(value *cond, value *true_value, value *false_value, const std::string &name = ""); + value *create_mask(value *pred, const std::string &name = ""); + value *create_merge(value *mask_true, value *value_true, value *mask_false, value *value_false, const std::string &name = ""); // Cast instructions value *create_cast(cast_inst::op_t op, value *v, type *dst_ty, const std::string &name = ""); value* create_si_to_fp(value *src, type *dst_ty, const std::string &name = ""); diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index 74e71db0a..2328a4a8f 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -16,6 +16,7 @@ class context; // instruction classes //===----------------------------------------------------------------------===// +class result_reference; class instruction: public user{ public: struct mask_info_t { @@ -27,7 +28,7 @@ public: protected: // constructors - instruction(type *ty, unsigned num_ops, const std::string &name = "", instruction *next = nullptr); + instruction(type *ty, unsigned num_ops, unsigned num_results = 1, const std::string &name = "", instruction *next = nullptr); public: // parent @@ -38,15 +39,33 @@ public: // mask void set_mask_pred(value *pred) { resize_hidden(1); set_operand(get_num_operands(), pred); } value* get_mask_pred() const { if(get_num_hidden() == 0) return nullptr; return get_operand(get_num_operands()); } + void set_mask_else(value *x) { resize_hidden(2); set_operand(get_num_operands() + 1, x); } + value* get_mask_else() const { if(get_num_hidden() < 2) return nullptr; return get_operand(get_num_operands() + 1); } // helpers bool has_tile_result_or_op(); // repr std::string repr() const { return repr_impl(); } + // results + unsigned get_num_results() const { return results_.size(); } + value* get_result(unsigned i) { return results_.at(i); } private: basic_block *parent_; value *pred_; value *mask_pred_; + std::vector results_; +}; + +// result reference +class result_reference: public value { +public: + result_reference(instruction *ref, unsigned arg_id, const std::string &name = ""); + instruction *get_ref(); + unsigned get_arg_id(); + +private: + instruction *ref_; + unsigned arg_id_; }; //===----------------------------------------------------------------------===// @@ -303,6 +322,30 @@ public: value *get_false_value() { return get_operand(2); } static ternary_inst* create(value *cond, value *true_value, value *false_value, const std::string &name = "", instruction *next = nullptr); +}; + +// mask +class mask_inst: public instruction { +private: + std::string repr_impl() const { return "mask"; } + mask_inst(ir::value *pred, const std::string &name, instruction *next); + +public: + static mask_inst* create(ir::value *pred, const std::string &name = "", instruction *next = nullptr); +}; + +// merge +class merge_inst: public instruction { +private: + std::string repr_impl() const { return "merge"; } + merge_inst(ir::value *mask_true, ir::value *value_true, + ir::value *mask_false, ir::value *value_false, + const std::string &name, instruction *next); + +public: + static merge_inst* create(ir::value *mask_true, ir::value *value_true, + ir::value *mask_false, ir::value *value_false, + const std::string &name = "", instruction *next = nullptr); }; diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 49d173f1b..34cc36e2f 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -301,12 +301,20 @@ ir::value* compound_statement::codegen(ir::module* mod) 
const{ /* expression statement */ ir::value* expression_statement::codegen(ir::module *mod) const{ + ir::builder &builder = mod->get_builder(); ir::value *expr = expr_->codegen(mod); if(mask_) { - ir::instruction *itn = dynamic_cast(expr); - assert(itn); - ir::value *mask = mask_->codegen(mod); - itn->set_mask_pred(mask); + ir::value *pred = mask_->codegen(mod); + ir::mask_inst *mask = (ir::mask_inst*)builder.create_mask(pred); + ir::value *true_value = expr_->codegen(mod); + ir::type *ty = true_value->get_type(); + if(auto *itn = dynamic_cast(true_value)) + itn->set_mask_pred(mask->get_result(0)); + if(expr->get_type()->is_void_ty()) + return expr; + ir::merge_inst *merge = (ir::merge_inst*)builder.create_merge(mask->get_result(0), true_value, + mask->get_result(1), ir::undef_value::get(ty)); + return merge; } return expr; } @@ -596,10 +604,18 @@ ir::value *conditional_expression::llvm_op(ir::builder &builder, ir::value *cond } ir::value *conditional_expression::codegen(ir::module *mod) const{ + ir::builder &builder = mod->get_builder(); ir::value *cond = cond_->codegen(mod); - ir::value *true_value = true_value_->codegen(mod); ir::value *false_value = false_value_->codegen(mod); - return llvm_op(mod->get_builder(), cond, true_value, false_value, ""); + ir::value *true_value = true_value_->codegen(mod); + bool is_float, is_ptr, is_int, is_signed; + implicit_cast(builder, true_value, false_value, is_float, is_ptr, is_int, is_signed); + implicit_broadcast(mod, true_value, false_value); + ir::instruction *itn = dynamic_cast(true_value); + assert(itn); + itn->set_mask_pred(cond); + itn->set_mask_else(false_value); + return itn; } /* Assignment expression */ diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index b1f88a8b1..1c8269257 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -472,7 +472,7 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, distributed_tile *T = new distributed_tile(ty, shapes2, axes, builder, vectorize); tmap_.insert({v, T}); // constant range - if(dynamic_cast(v)){ + if(dynamic_cast(v) && !dynamic_cast(v)){ T->for_each([&](indices_t idx){ assert(idx.size() == 1); T->set_value(idx, idx[0]); @@ -494,15 +494,21 @@ void selection::init_grids(ir::function *fn, IRBuilder<> &builder, Value *sh_mem std::vector grids; std::map references; create_grids(grids, references, fn); - for(ir::value* i: grids) - init_axes(i, builder, u_thread_warp_id, u_warp_id); + for(ir::value* i: grids){ + if(auto *instr = dynamic_cast(i)) + for(unsigned r = 0; r < instr->get_num_results(); r++) + init_axes(instr->get_result(r), builder, u_thread_warp_id, u_warp_id); + else + init_axes(i, builder, u_thread_warp_id, u_warp_id); + } // create tile std::set seen; for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i: block->get_inst_list()){ if(!i->get_type()->is_tile_ty()) continue; - create_tile(i, builder, references, seen, sh_mem_ptr); + for(unsigned r = 0; r < i->get_num_results(); r++) + create_tile(i->get_result(r), builder, references, seen, sh_mem_ptr); } } @@ -510,46 +516,43 @@ void selection::init_grids(ir::function *fn, IRBuilder<> &builder, Value *sh_mem void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> &builder) { BasicBlock *block = builder.GetInsertBlock(); Module *module = block->getModule(); - Function *function = block->getParent(); - ir::value* mask_pred = ins->get_mask_pred(); LLVMContext &ctx = builder.getContext(); - // helper to handle masks - auto insert_masked = [&](indices_t idx, 
std::function insert_value) { - BasicBlock *block = builder.GetInsertBlock(); - Value *result; - if(mask_pred){ -// if(mask.else_value) -// std::cout << mask.else_value << std::endl; - Value *llvm_mask = tmap_.at(mask_pred)->get_value(idx); - BasicBlock *then_bb = BasicBlock::Create(ctx, "", function); - BasicBlock *done_bb = BasicBlock::Create(ctx, "", function); - builder.CreateCondBr(llvm_mask, then_bb, done_bb); - builder.SetInsertPoint(then_bb); - result = insert_value(); - builder.CreateBr(done_bb); - builder.SetInsertPoint(done_bb); - if(!ins->get_type()->is_void_ty()){ - Type *ty = result->getType(); - PHINode *phi = builder.CreatePHI(ty, 2); -// if(mask.else_value) -// phi->addIncoming(tmap_.at(mask.else_value)->get_value(idx), block); +// // helper to handle masks +// auto insert_masked = [&](indices_t idx, std::function insert_value) { +// BasicBlock *block = builder.GetInsertBlock(); +// Value *result; +// if(mask_pred){ +// Value *llvm_mask = tmap_.at(mask_pred)->get_value(idx); +// BasicBlock *then_bb = BasicBlock::Create(ctx, "", function); +// BasicBlock *done_bb = BasicBlock::Create(ctx, "", function); +// builder.CreateCondBr(llvm_mask, then_bb, done_bb); +// builder.SetInsertPoint(then_bb); +// result = insert_value(); +// builder.CreateBr(done_bb); +// builder.SetInsertPoint(done_bb); +// if(!ins->get_type()->is_void_ty()){ +// Type *ty = result->getType(); +// PHINode *phi = builder.CreatePHI(ty, 2); +// if(mask_else) +// phi->addIncoming(tmap_.at(mask_else)->get_value(idx), block); // else - phi->addIncoming(llvm::UndefValue::get(ty), block); - phi->addIncoming(result, then_bb); - return (Value*)phi; - } - } - else - result = insert_value(); - return result; - }; +// phi->addIncoming(llvm::UndefValue::get(ty), block); +// phi->addIncoming(result, then_bb); +// return (Value*)phi; +// } +// } +// else +// result = insert_value(); +// return result; +// }; + std::cout << ins->get_name() << " " << typeid(*ins).name() << std::endl; // store if(auto *x = dynamic_cast(ins)) { distributed_tile* ptr = (distributed_tile*)tmap_.at(x->get_pointer_operand()); tile *value = tmap_.at(x->get_value_operand()); ptr->for_each([&](indices_t idx){ - insert_masked(idx, [&]{ return builder.CreateStore(value->get_value(idx), ptr->get_value(idx)); }); + builder.CreateStore(value->get_value(idx), ptr->get_value(idx)); }); } else { @@ -570,9 +573,30 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & Value *offset = builder.CreateMul(builder.getInt32(shapes[0]->get_value()), group_id); result->for_each([&](indices_t idx){ BinaryOperator *bin = static_cast(idx[0]); - result->set_value(idx, insert_masked(idx, [&]{ return builder.CreateAdd(bin, offset); })); + result->set_value(idx, builder.CreateAdd(bin, offset)); }); } + // mask + else if(dynamic_cast(ins)) { +// distributed_tile* pred = (distributed_tile*)ins->get_operand(0); +// BasicBlock *mask_done_bb = BasicBlock::Create(ctx, "mask_done"); +// pred->for_each([&](indices_t idx){ +// BasicBlock *mask_if_bb = BasicBlock::Create(ctx, "mask_if"); +// BasicBlock* mask_else_bb = BasicBlock::Create(ctx, "mask_else"); +// builder.CreateCondBr(pred->get_value(idx), mask_if_bb, mask_else_bb); +// builder.SetInsertPoint(mask_if_bb); +// builder.CreateBr(mask_done_bb); +// builder.SetInsertPoint(mask_else_bb); +// builder.CreateBr(mask_done_bb); +// }); +// builder.SetInsertPoint(mask_done_bb); + } + // merge + else if(dynamic_cast(ins)) { +// result->for_each([&](indices_t idx){ +// std::cout << "merge" << std::endl; 
+// }); + } // reshape else if(dynamic_cast(ins)) { ir::value* in = ins->get_operand(0); @@ -589,7 +613,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & // splat else if(dynamic_cast(ins)) { result->for_each([&](indices_t idx) { - result->set_value(idx, insert_masked(idx, [&]{ return llvm_value(ins->get_operand(0), builder); })); + result->set_value(idx, llvm_value(ins->get_operand(0), builder)); }); } // broadcast @@ -667,7 +691,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & else return llvm_value(x, builder); }; - result->set_value(idx, insert_masked(idx, [&]() { return llvm_inst(ins, value, builder); })); + result->set_value(idx, llvm_inst(ins, value, builder)); }); } } diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 846cd985e..0c64401de 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -70,15 +70,10 @@ void tune::init_c_graph(ir::instruction *v) { } // Element-wise else if(dynamic_cast(v)){ - for(unsigned i = 0; i < shapes.size(); i ++) - for(ir::value* op: v->ops()) - add_constraint({v, i}, {op, i}); - } - - /* Add mask constraints */ - if(ir::value *pred = v->get_mask_pred()){ - for(unsigned i = 0; i < shapes.size(); i++) - add_constraint({v->ops()[0], i}, {pred, i}); + for(unsigned k = 0; k < v->get_num_results(); k++) + for(unsigned i = 0; i < shapes.size(); i ++) + for(ir::value* op: v->ops()) + add_constraint({v->get_result(k), i}, {op, i}); } } diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index 023ccb999..b3c1174ce 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -72,8 +72,12 @@ value *builder::create_ret_void() { // tile-level control-flow instructions //===----------------------------------------------------------------------===// -value *builder::create_ternary(value *cond, value *true_value, value *false_value, const std::string &name){ - return insert(ternary_inst::create(cond, true_value, false_value, name)); +value *builder::create_mask(value *pred, const std::string &name){ + return insert(mask_inst::create(pred, name)); +} + +value *builder::create_merge(value *mask_true, value *value_true, value *mask_false, value *value_false, const std::string &name) { + return insert(merge_inst::create(mask_true, value_true, mask_false, value_false, name)); } diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 47273bf63..94a70e802 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -11,7 +11,7 @@ namespace ir{ // instruction classes //===----------------------------------------------------------------------===// -instruction::instruction(type *ty, unsigned num_ops, const std::string &name, instruction *next) +instruction::instruction(type *ty, unsigned num_ops, unsigned num_results, const std::string &name, instruction *next) : user(ty, num_ops, name) { if(next){ basic_block *block = next->get_parent(); @@ -19,6 +19,11 @@ instruction::instruction(type *ty, unsigned num_ops, const std::string &name, in auto it = std::find(block->begin(), block->end(), next); block->get_inst_list().insert(it, next); } + if(num_results == 1) + results_.push_back(this); + else + for(unsigned i = 0; i < num_results; i++) + results_.push_back(new result_reference(this, i)); } void instruction::erase_from_parent() { @@ -32,12 +37,17 @@ bool instruction::has_tile_result_or_op() { return result; } + +// result reference +result_reference::result_reference(instruction *ref, unsigned arg_id, const std::string &name) + : value(ref->get_type(), name), 
arg_id_(arg_id){ } + //===----------------------------------------------------------------------===// // phi_node classes //===----------------------------------------------------------------------===// phi_node::phi_node(type *ty, unsigned num_reserved, std::string const &name, instruction *next) - : instruction(ty, 0, name, next) { + : instruction(ty, 0, 1, name, next) { blocks_.reserve(num_reserved); } @@ -98,7 +108,7 @@ std::string binary_operator::repr_impl() const { } binary_operator::binary_operator(op_t op, value *lhs, value *rhs, type *ty, const std::string &name, instruction *next) - : instruction(ty, 2, name, next), op_(op){ + : instruction(ty, 2, 1, name, next), op_(op){ set_operand(0, lhs); set_operand(1, rhs); } @@ -165,7 +175,7 @@ std::string cmp_inst::repr_impl() const { } cmp_inst::cmp_inst(type *ty, cmp_inst::pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next) - : instruction(ty, 2, name, next), pred_(pred) { + : instruction(ty, 2, 1, name, next), pred_(pred) { set_operand(0, lhs); set_operand(1, rhs); } @@ -205,7 +215,7 @@ fcmp_inst* fcmp_inst::create(pred_t pred, value *lhs, value *rhs, const std::str //===----------------------------------------------------------------------===// unary_inst::unary_inst(type *ty, value *v, const std::string &name, instruction *next) - : instruction(ty, 1, name, next) { + : instruction(ty, 1, 1, name, next) { set_operand(0, v); } @@ -275,7 +285,7 @@ cast_inst *cast_inst::create_integer_cast(value *arg, type *ty, bool is_signed, // return_inst return_inst::return_inst(context &ctx, value *ret_val, instruction *next) - : terminator_inst(type::get_void_ty(ctx), ret_val!=nullptr, "", next){ + : terminator_inst(type::get_void_ty(ctx), ret_val!=nullptr, 0, "", next){ if(ret_val) set_operand(0, ret_val); } @@ -298,40 +308,54 @@ branch_inst* branch_inst::create(value *cond, basic_block *if_dst, basic_block * // uncond_branch_inst uncond_branch_inst::uncond_branch_inst(basic_block *dst, instruction *next) - : branch_inst(type::get_void_ty(dst->get_context()), 1, "", next){ + : branch_inst(type::get_void_ty(dst->get_context()), 1, 0, "", next){ set_operand(0, dst); } // cond_branch_inst cond_branch_inst::cond_branch_inst(basic_block *if_dst, basic_block *else_dst, value *cond, instruction *next) - : branch_inst(type::get_void_ty(if_dst->get_context()), 3, "", next){ + : branch_inst(type::get_void_ty(if_dst->get_context()), 3, 0, "", next){ assert(cond->get_type()->is_integer_ty(1) && "May only branch on boolean predicates!"); set_operand(0, if_dst); set_operand(1, else_dst); set_operand(2, cond); } -// ternary_inst -ternary_inst::ternary_inst(value *cond, value *true_value, value *false_value, const std::string &name, instruction *next) - : instruction(true_value->get_type(), 3) { - assert(true_value->get_type() == false_value->get_type()); - set_operand(0, cond); - set_operand(1, true_value); - set_operand(2, false_value); +// mask_inst +mask_inst::mask_inst(value *pred, const std::string &name, instruction *next) + : instruction(pred->get_type(), 1, 2, name, next) { + set_operand(0, pred); } -ternary_inst *ternary_inst::create(value *cond, value *true_value, value *false_value, - const std::string &name, instruction *next) { - return new ternary_inst(cond, true_value, false_value, name, next); +mask_inst* mask_inst::create(value *pred, const std::string &name, instruction *next) { + return new mask_inst(pred, name, next); } +// merge_inst +merge_inst::merge_inst(value *mask_true, value *value_true, + value 
*mask_false, value *value_false, + const std::string &name, instruction *next) + : instruction(value_true->get_type(), 4, 1, name, next) { + set_operand(0, mask_true); + set_operand(1, value_true); + set_operand(2, mask_false); + set_operand(3, value_false); +} + +merge_inst* merge_inst::create(value *mask_true, value *value_true, + value *mask_false, value *value_false, + const std::string &name, instruction *next) { + return new merge_inst(mask_true, value_true, mask_false, value_false, name, next); +} + + //===----------------------------------------------------------------------===// // getelementptr_inst classes //===----------------------------------------------------------------------===// getelementptr_inst::getelementptr_inst(type *pointee_ty, value *ptr, const std::vector &idx, const std::string &name, instruction *next) - : instruction(get_return_type(pointee_ty, ptr, idx), 1 + idx.size(), name, next), + : instruction(get_return_type(pointee_ty, ptr, idx), 1 + idx.size(), 1, name, next), source_elt_ty(pointee_ty), res_elt_ty(get_indexed_type(pointee_ty, idx)){ type *expected_ty = ((pointer_type*)(get_type()->get_scalar_ty()))->get_element_ty(); @@ -407,7 +431,7 @@ load_inst* load_inst::create(value *ptr, const std::string &name, instruction *n // store store_inst::store_inst(value *ptr, value *v, const std::string &name, instruction *next) - : instruction(type::get_void_ty(ptr->get_type()->get_context()), 2, name, next) { + : instruction(type::get_void_ty(ptr->get_type()->get_context()), 2, 1, name, next) { set_operand(0, ptr); set_operand(1, v); } @@ -465,7 +489,7 @@ instruction* broadcast_inst::create(value *arg, const type::tile_shapes_t &shape matmul_inst::matmul_inst(value *A, value *B, value *C, const std::string &name, instruction *next) - : builtin_inst(C->get_type(), 3, name, next) { + : builtin_inst(C->get_type(), 3, 0, name, next) { set_operand(0, A); set_operand(1, B); set_operand(2, C); @@ -481,7 +505,7 @@ instruction *matmul_inst::create(value *A, value *B, value *C, //===----------------------------------------------------------------------===// get_global_range_inst::get_global_range_inst(type *ty, unsigned axis, const std::string &name, instruction *next) - : builtin_inst(ty, 0, name, next), axis_(axis) { + : builtin_inst(ty, 0, 1, name, next), axis_(axis) { } @@ -506,7 +530,7 @@ vectorize_inst* vectorize_inst::create(value *arg, const std::string &name, inst barrier_inst::barrier_inst(context &ctx, const std::string &name, instruction *next) - : instruction(type::get_void_ty(ctx), 0, name, next){ } + : instruction(type::get_void_ty(ctx), 0, 1, name, next){ } barrier_inst* barrier_inst::create(context &ctx, const std::string &name, instruction *next) { return new barrier_inst(ctx, name, next); diff --git a/lib/ir/print.cpp b/lib/ir/print.cpp index d1313bbcb..f4f117ff7 100644 --- a/lib/ir/print.cpp +++ b/lib/ir/print.cpp @@ -37,13 +37,19 @@ void print(module &mod, std::ostream& os) { os << " "; if(ir::value *pred = inst->get_mask_pred()) os << "@" << get_name(pred, cnt++) << " "; - if(!inst->get_type()->is_void_ty()) - os << get_name(inst, cnt++) << " = "; + unsigned num_results = inst->get_num_results(); + for(unsigned i = 0; i < num_results; i++){ + os << get_name(inst->get_result(i), cnt++); + if(i < num_results - 1) + os << ", "; + else + os << " = "; + } os << inst->repr(); ir::instruction::ops_t ops = inst->ops(); size_t num_ops = inst->get_num_operands(); if(num_ops > 0) - os << " "; + os << " ";; for(unsigned i = 0; i < num_ops; i++) os << 
get_name(ops[i], cnt++) << (i < num_ops - 1?", ":""); os << ";" << std::endl; From 08fcfbca47b80b733ac721d2ce6050b7f1f76a69 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 1 Mar 2019 14:36:17 -0500 Subject: [PATCH 092/494] [code generation] better predication --- examples/matrix.cpp | 3 +- include/triton/ast/ast.h | 2 + include/triton/codegen/selection.h | 3 + include/triton/ir/instructions.h | 4 ++ lib/ast/lowering.cpp | 12 ++-- lib/codegen/selection.cpp | 111 +++++++++++++++-------------- 6 files changed, 76 insertions(+), 59 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index e023dc2ee..752f60bfe 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -209,7 +209,6 @@ int main() { llvm::Module llvm_module("matmul", llvm_context); - triton::ir::print(module, std::cout); // create passes triton::codegen::buffer_info_pass buffer_info; @@ -264,12 +263,14 @@ int main() { // run passes + triton::ir::print(module, std::cout); buffer_info.run(module); shared.run(module); liveness.run(module); allocation.run(); barriers.run(module); vectorize.run(module); + triton::ir::print(module, std::cout); selection.run(module, llvm_module); // llvm source diff --git a/include/triton/ast/ast.h b/include/triton/ast/ast.h index ab96bfa36..b9ae16ea0 100644 --- a/include/triton/ast/ast.h +++ b/include/triton/ast/ast.h @@ -299,6 +299,8 @@ public: : lvalue_((named_expression*)lvalue), op_(op), rvalue_((expression*)rvalue) { } ir::value* codegen(ir::module *mod) const; + const expression *lvalue() const { return lvalue_; } + const expression *rvalue() const { return rvalue_; } public: ASSIGN_OP_T op_; diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index b0ebc8c5e..c8632262e 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -100,6 +100,7 @@ private: class selection{ typedef std::map vmap_t; typedef std::map tmap_t; + typedef std::map, llvm::BasicBlock*> pmap_t; private: // utils @@ -131,6 +132,8 @@ public: private: vmap_t vmap_; tmap_t tmap_; + pmap_t pmap_; + pmap_t last_block_; allocation *alloc_; tune *params_; buffer_info_pass *buffer_info_; diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index 2328a4a8f..2d8e7d91d 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -346,6 +346,10 @@ public: static merge_inst* create(ir::value *mask_true, ir::value *value_true, ir::value *mask_false, ir::value *value_false, const std::string &name = "", instruction *next = nullptr); + ir::value *get_mask_true() { return get_operand(0); } + ir::value *get_value_true() { return get_operand(1); } + ir::value *get_mask_false() { return get_operand(2); } + ir::value *get_value_false() { return get_operand(3); } }; diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 34cc36e2f..c9ad27a01 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -302,21 +302,25 @@ ir::value* compound_statement::codegen(ir::module* mod) const{ /* expression statement */ ir::value* expression_statement::codegen(ir::module *mod) const{ ir::builder &builder = mod->get_builder(); - ir::value *expr = expr_->codegen(mod); if(mask_) { ir::value *pred = mask_->codegen(mod); ir::mask_inst *mask = (ir::mask_inst*)builder.create_mask(pred); ir::value *true_value = expr_->codegen(mod); + assignment_expression *assignment = dynamic_cast(expr_); + assert(assignment); + ir::type *ty = true_value->get_type(); if(auto *itn = dynamic_cast(true_value)) 
itn->set_mask_pred(mask->get_result(0)); - if(expr->get_type()->is_void_ty()) - return expr; + if(ty->is_void_ty()) + return true_value; ir::merge_inst *merge = (ir::merge_inst*)builder.create_merge(mask->get_result(0), true_value, mask->get_result(1), ir::undef_value::get(ty)); + std::string name = ((named_expression*)assignment->lvalue())->id()->name(); + mod->set_value(name, merge); return merge; } - return expr; + return expr_->codegen(mod); } /* Iteration statement */ diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 1c8269257..52f8011c9 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -517,41 +517,21 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & BasicBlock *block = builder.GetInsertBlock(); Module *module = block->getModule(); LLVMContext &ctx = builder.getContext(); -// // helper to handle masks -// auto insert_masked = [&](indices_t idx, std::function insert_value) { -// BasicBlock *block = builder.GetInsertBlock(); -// Value *result; -// if(mask_pred){ -// Value *llvm_mask = tmap_.at(mask_pred)->get_value(idx); -// BasicBlock *then_bb = BasicBlock::Create(ctx, "", function); -// BasicBlock *done_bb = BasicBlock::Create(ctx, "", function); -// builder.CreateCondBr(llvm_mask, then_bb, done_bb); -// builder.SetInsertPoint(then_bb); -// result = insert_value(); -// builder.CreateBr(done_bb); -// builder.SetInsertPoint(done_bb); -// if(!ins->get_type()->is_void_ty()){ -// Type *ty = result->getType(); -// PHINode *phi = builder.CreatePHI(ty, 2); -// if(mask_else) -// phi->addIncoming(tmap_.at(mask_else)->get_value(idx), block); -// else -// phi->addIncoming(llvm::UndefValue::get(ty), block); -// phi->addIncoming(result, then_bb); -// return (Value*)phi; -// } -// } -// else -// result = insert_value(); -// return result; -// }; - - std::cout << ins->get_name() << " " << typeid(*ins).name() << std::endl; + Function *fn = block->getParent(); + ir::value *mask = ins->get_mask_pred(); + auto set_mask_insert_pt = [&](indices_t idx){ + if(mask){ + distributed_tile *mask_tile = (distributed_tile*)tmap_.at(ins->get_mask_pred()); + BasicBlock *block = pmap_.at({mask_tile, idx}); + builder.SetInsertPoint(block->getTerminator()); + } + }; // store if(auto *x = dynamic_cast(ins)) { distributed_tile* ptr = (distributed_tile*)tmap_.at(x->get_pointer_operand()); tile *value = tmap_.at(x->get_value_operand()); ptr->for_each([&](indices_t idx){ + set_mask_insert_pt(idx); builder.CreateStore(value->get_value(idx), ptr->get_value(idx)); }); } @@ -578,24 +558,46 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & } // mask else if(dynamic_cast(ins)) { -// distributed_tile* pred = (distributed_tile*)ins->get_operand(0); -// BasicBlock *mask_done_bb = BasicBlock::Create(ctx, "mask_done"); -// pred->for_each([&](indices_t idx){ -// BasicBlock *mask_if_bb = BasicBlock::Create(ctx, "mask_if"); -// BasicBlock* mask_else_bb = BasicBlock::Create(ctx, "mask_else"); -// builder.CreateCondBr(pred->get_value(idx), mask_if_bb, mask_else_bb); -// builder.SetInsertPoint(mask_if_bb); -// builder.CreateBr(mask_done_bb); -// builder.SetInsertPoint(mask_else_bb); -// builder.CreateBr(mask_done_bb); -// }); -// builder.SetInsertPoint(mask_done_bb); + distributed_tile* pred = (distributed_tile*)tmap_.at(ins->get_operand(0)); + distributed_tile* mask_tile_true = (distributed_tile*)tmap_.at(ins->get_result(0)); + distributed_tile* mask_tile_false = (distributed_tile*)tmap_.at(ins->get_result(1)); + 
pred->for_each([&](indices_t idx){ + BasicBlock *mask_then_bb = BasicBlock::Create(ctx, "mask_then", fn); + BasicBlock* mask_else_bb = BasicBlock::Create(ctx, "mask_else", fn); + BasicBlock *mask_done_bb = BasicBlock::Create(ctx, "mask_done", fn); + builder.CreateCondBr(pred->get_value(idx), mask_then_bb, mask_else_bb); + builder.SetInsertPoint(mask_then_bb); + builder.CreateBr(mask_done_bb); + builder.SetInsertPoint(mask_else_bb); + builder.CreateBr(mask_done_bb); + builder.SetInsertPoint(mask_done_bb); + pmap_.insert({{mask_tile_true, idx}, mask_then_bb}); + pmap_.insert({{mask_tile_false, idx}, mask_else_bb}); + last_block_.insert({{mask_tile_true, idx}, mask_done_bb}); + last_block_.insert({{mask_tile_false, idx}, mask_done_bb}); + }); } // merge - else if(dynamic_cast(ins)) { -// result->for_each([&](indices_t idx){ -// std::cout << "merge" << std::endl; -// }); + else if(auto *merge = dynamic_cast(ins)) { + distributed_tile* mask_tile_true = (distributed_tile*)tmap_.at(merge->get_mask_true()); + distributed_tile *value_tile_true = (distributed_tile*)tmap_.at(merge->get_value_true()); + distributed_tile* mask_tile_false = (distributed_tile*)tmap_.at(merge->get_mask_false()); + distributed_tile *value_tile_false = (distributed_tile*)tmap_.at(merge->get_value_false()); + result->for_each([&](indices_t idx){ + BasicBlock *block_true = pmap_.at({mask_tile_true, idx}); + Value *value_true = value_tile_true->get_value(idx); + BasicBlock *block_false = pmap_.at({mask_tile_false, idx}); + Value *value_false = value_tile_false->get_value(idx); + BasicBlock *block_done = last_block_.at({mask_tile_true, idx}); + if(block_done->empty()) + builder.SetInsertPoint(block_done); + else + builder.SetInsertPoint(block_done->getTerminator()); + PHINode *phi = builder.CreatePHI(value_true->getType(), 2); + phi->addIncoming(value_true, block_true); + phi->addIncoming(value_false,block_false); + result->set_value(idx, phi); + }); } // reshape else if(dynamic_cast(ins)) { @@ -691,12 +693,13 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & else return llvm_value(x, builder); }; + set_mask_insert_pt(idx); result->set_value(idx, llvm_inst(ins, value, builder)); }); } } - - + if(mask) + builder.SetInsertPoint(block); } void selection::lower_instruction(ir::instruction *src, IRBuilder<> &builder) { @@ -722,7 +725,6 @@ void selection::run(ir::module &src, Module &dst){ vmap_.clear(); LLVMContext &dst_ctx = dst.getContext(); IRBuilder<> dst_builder(dst_ctx); - std::map block_of; // iterate over functions for(ir::function *fn: src.get_function_list()) { @@ -771,12 +773,13 @@ void selection::run(ir::module &src, Module &dst){ BasicBlock *parent = (BasicBlock*)vmap_[block]; dst_builder.SetInsertPoint(parent); for(ir::instruction *i: block->get_inst_list()){ - if(dynamic_cast(i) && !parent->empty()){ - dst_builder.SetInsertPoint(&*parent->getFirstInsertionPt()); - } + BasicBlock *current = dst_builder.GetInsertBlock(); + bool phi_inserted = (dynamic_cast(i) || dynamic_cast(i)) && !current->empty(); + if(phi_inserted) + dst_builder.SetInsertPoint(&*current->getFirstInsertionPt()); lower_instruction(i, dst_builder); - if(dynamic_cast(i) && !parent->empty()) - dst_builder.SetInsertPoint(parent); + if(phi_inserted) + dst_builder.SetInsertPoint(current); last_block[block] = dst_builder.GetInsertBlock(); } } From 2467c5e504b4381dcd2a05ef309a37c2ad9eecb4 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 1 Mar 2019 21:53:35 -0500 Subject: [PATCH 093/494] [code generation] added 
ternary operator --- examples/matrix.cpp | 7 +++--- lib/ast/lowering.cpp | 46 +++++++++++++++++++++++++++++---------- lib/codegen/selection.cpp | 9 ++++---- lib/ir/instructions.cpp | 4 ++-- 4 files changed, 46 insertions(+), 20 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 752f60bfe..369d0ac3e 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -79,8 +79,8 @@ void matmul(restrict readonly fp32 *a, restrict readonly fp32 *b, fp32 *c, int32 checkb1 = rkb < k;\ checka = checka0[:, newaxis] && checka1[newaxis, :];\ checkb = checkb0[:, newaxis] && checkb1[newaxis, :];\ - @checka a = *pa;\ - @checkb b = *pb;\ + a = checka ? *pa : 0;\ + b = checkb ? *pb : 0;\ }\ checkc0 = rxc < M;\ checkc1 = ryc < N;\ @@ -220,6 +220,7 @@ int main() { triton::codegen::vectorize vectorize(&tune); triton::codegen::selection selection(&allocation, &tune, &buffer_info); + triton::ir::print(module, std::cout); // tuning parameters tune.run(module); @@ -280,7 +281,7 @@ int main() { manager.run(llvm_module); std::string src = generate_machine_code(llvm_module, "nvptx64-nvidia-cuda", compute_data_layout(true, true)); -// std::cout << src << std::endl; + std::cout << src << std::endl; // compile machine code CUdevice cu_device; diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index c9ad27a01..9745ff0e2 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -603,23 +603,47 @@ ir::value* cast_operator::codegen(ir::module *mod) const{ } /* Conditional expression */ -ir::value *conditional_expression::llvm_op(ir::builder &builder, ir::value *cond, ir::value *true_value, ir::value *false_value, const std::string &name) const{ - return nullptr; -} - ir::value *conditional_expression::codegen(ir::module *mod) const{ ir::builder &builder = mod->get_builder(); - ir::value *cond = cond_->codegen(mod); - ir::value *false_value = false_value_->codegen(mod); + ir::value *pred = cond_->codegen(mod); + ir::instruction *mask = (ir::instruction*)builder.create_mask(pred); + ir::value *true_mask = mask->get_result(0); + ir::value *false_mask = mask->get_result(1); ir::value *true_value = true_value_->codegen(mod); + ir::value *false_value = false_value_->codegen(mod); + if(auto *itn = dynamic_cast(true_value)) + itn->set_mask_pred(true_mask); + if(auto *itn = dynamic_cast(false_value)) + itn->set_mask_pred(false_mask); bool is_float, is_ptr, is_int, is_signed; + ir::value *uncasted_true_value = true_value; + ir::value *uncasted_false_value = false_value; implicit_cast(builder, true_value, false_value, is_float, is_ptr, is_int, is_signed); implicit_broadcast(mod, true_value, false_value); - ir::instruction *itn = dynamic_cast(true_value); - assert(itn); - itn->set_mask_pred(cond); - itn->set_mask_else(false_value); - return itn; + { + ir::value *current = true_value; + while(current != uncasted_true_value) { + if(auto *itn = dynamic_cast(current)){ + itn->set_mask_pred(true_mask); + current = itn->get_operand(0); + } + else + break; + } + } + { + ir::value *current = false_value; + while(current != uncasted_false_value) { + if(auto *itn = dynamic_cast(current)){ + itn->set_mask_pred(false_mask); + current = itn->get_operand(0); + } + else + break; + } + } + ir::value *result = builder.create_merge(true_mask, true_value, false_mask, false_value); + return result; } /* Assignment expression */ diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 52f8011c9..d0f0a1310 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -589,10 +589,10 @@ void 
selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & BasicBlock *block_false = pmap_.at({mask_tile_false, idx}); Value *value_false = value_tile_false->get_value(idx); BasicBlock *block_done = last_block_.at({mask_tile_true, idx}); - if(block_done->empty()) - builder.SetInsertPoint(block_done); - else + if(block_done->getTerminator()) builder.SetInsertPoint(block_done->getTerminator()); + else + builder.SetInsertPoint(block_done); PHINode *phi = builder.CreatePHI(value_true->getType(), 2); phi->addIncoming(value_true, block_true); phi->addIncoming(value_false,block_false); @@ -615,6 +615,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & // splat else if(dynamic_cast(ins)) { result->for_each([&](indices_t idx) { + set_mask_insert_pt(idx); result->set_value(idx, llvm_value(ins->get_operand(0), builder)); }); } @@ -703,7 +704,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & } void selection::lower_instruction(ir::instruction *src, IRBuilder<> &builder) { - if(src->has_tile_result_or_op()) { + if(src->has_tile_result_or_op() || (src->get_mask_pred() && src->get_mask_pred()->get_type()->is_tile_ty())) { lower_tile_instruction(src, builder); } else { diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 94a70e802..2a44ec4fb 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -32,8 +32,8 @@ void instruction::erase_from_parent() { bool instruction::has_tile_result_or_op() { bool result = get_type()->is_tile_ty(); - for(ir::value *v: ops()) - result |= v->get_type()->is_tile_ty(); + for(unsigned i = 0; i < get_num_operands(); i++) + result |= get_operand(i)->get_type()->is_tile_ty(); return result; } From 1f30e111ecdd14b40171ef42c79ce493a32d4778 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 2 Mar 2019 16:03:26 -0500 Subject: [PATCH 094/494] [code generation] more optimizations --- examples/matrix.cpp | 31 +++++++++++++++++-------------- include/triton/ir/module.h | 14 +++++++------- lib/ast/lowering.cpp | 14 +++++++++----- lib/codegen/selection.cpp | 35 ++++++++++++++++++++++++++++++++++- lib/ir/module.cpp | 25 +------------------------ 5 files changed, 68 insertions(+), 51 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 369d0ac3e..20a911387 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -47,21 +47,21 @@ void matmul(restrict readonly fp32 *a, restrict readonly fp32 *b, fp32 *c, int32 int32 ryb[TN] = get_global_range[TN](1);\ int32 rka[TK] = 0 ... TK;\ int32 rkb[TK] = 0 ... 
TK;\ - int32 rxc[TM] = get_global_range[TM](0);\ - int32 ryc[TN] = get_global_range[TN](1);\ + int32 rxc[TM];\ + int32 ryc[TN];\ fp32 C[TM, TN] = 0;\ int32 k;\ - fp32* pa[TM, TK] = a + rxa[:, newaxis] + rka[newaxis, :]*M;\ - fp32* pb[TN, TK] = b + ryb[:, newaxis] + rkb[newaxis, :]*K;\ - fp32* pc[TM, TN] = c + rxc[:, newaxis] + ryc[newaxis, :]*M;\ + fp32* pa[TM, TK] = a + rka[newaxis, :]*M + rxa[:, newaxis];\ + fp32* pb[TN, TK] = b + rkb[newaxis, :]*K + ryb[:, newaxis];\ + fp32* pc[TM, TN];\ fp32 a[TM, TK] = *pa;\ fp32 b[TN, TK] = *pb;\ int1 checkc0[TM];\ int1 checkc1[TN];\ int1 checkc[TM, TN];\ for(k = K; k > 0; k = k - TK){\ - int1 checka[TM, TK] = (k > bound);\ - int1 checkb[TN, TK] = (k > bound);\ + int1 checka[TM, TK];\ + int1 checkb[TN, TK];\ int1 checka0[TM];\ int1 checka1[TK];\ int1 checkb0[TN];\ @@ -69,6 +69,8 @@ void matmul(restrict readonly fp32 *a, restrict readonly fp32 *b, fp32 *c, int32 C = dot(a, b, C);\ pa = pa + TK*M;\ pb = pb + TK*K;\ + checka = k > bound;\ + checkb = k > bound;\ @checka a = *pa;\ @checkb b = *pb;\ if(k > bound)\ @@ -82,6 +84,9 @@ void matmul(restrict readonly fp32 *a, restrict readonly fp32 *b, fp32 *c, int32 a = checka ? *pa : 0;\ b = checkb ? *pb : 0;\ }\ + rxc = get_global_range[TM](0);\ + ryc = get_global_range[TN](1);\ + pc = c + ryc[newaxis, :]*M + rxc[:, newaxis];\ checkc0 = rxc < M;\ checkc1 = ryc < N;\ checkc = checkc0[:, newaxis] && checkc1[newaxis, :];\ @@ -231,16 +236,15 @@ int main() { 2, 8, 1, // b0 4, 4, 1, - // c0 - 2, 8, 1, - // c1 - 4, 4, 1, + // c + 2, 4, 8, 4, 1, 1, // a1 2, 4, 1, // b1 1, 8, 1 }; + // meta-parameters unsigned i = 0; context.p_impl->mp_constants_[0]->set_value(params[0]); @@ -257,21 +261,20 @@ int main() { std::cout << "errors: " << errors.size() << std::endl; for(auto &x: errors){ for(auto &e: x.second) - std::cout << e << std::endl; + std::cout << x.first->get_name() << " " << e << std::endl; } if(errors.size()) exit(EXIT_FAILURE); + // run passes - triton::ir::print(module, std::cout); buffer_info.run(module); shared.run(module); liveness.run(module); allocation.run(); barriers.run(module); vectorize.run(module); - triton::ir::print(module, std::cout); selection.run(module, llvm_module); // llvm source diff --git a/include/triton/ir/module.h b/include/triton/ir/module.h index 633e356f2..e4026f8b6 100644 --- a/include/triton/ir/module.h +++ b/include/triton/ir/module.h @@ -30,6 +30,10 @@ class constant; class global_value; /* Module */ +struct scope { + std::map types; +}; + class module { typedef std::pair val_key_t; friend class function; @@ -56,15 +60,11 @@ public: // Setters void set_value(const std::string& name, basic_block* block, value *x); void set_value(const std::string& name, value* x); - void set_type(const std::string& name, basic_block* block, type* x); - void set_type(const std::string& name, type* x); void set_const(const std::string& name); void set_continue_fn(std::function fn); // Getters value *get_value(const std::string& name, basic_block* block); value *get_value(const std::string& name); - type *get_type(const std::string& name, basic_block* block); - type *get_type(const std::string& name); std::function get_continue_fn(); // Seal block -- no more predecessors will be added void seal_block(basic_block *block); @@ -73,9 +73,9 @@ public: functions_list_t &get_function_list() { return functions_; } function *get_or_insert_function(const std::string &name, function_type *ty); // Scope - void push_scope(const ast::compound_statement* scope) { scopes_.push(scope); } + void add_new_scope() { 
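// A new scope starts as a copy of the enclosing scope's name-to-type map,
// so names declared outside a compound statement stay visible inside it,
// and inner declarations shadow only the copy until the matching pop_scope().
// Hypothetical usage sketch (int32_ty stands for any ir::type*):
//   mod->add_new_scope();                    // enter "{"
//   mod->get_scope().types["x"] = int32_ty;  // bind x in the inner scope
//   mod->pop_scope();                        // outer bindings are untouched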
if(scopes_.empty()) scopes_.push(scope()); else scopes_.push(scope(get_scope())); } void pop_scope() { scopes_.pop(); } - const ast::compound_statement* get_scope() { return scopes_.top(); } + scope& get_scope() { return scopes_.top(); } private: @@ -91,7 +91,7 @@ private: symbols_map_t symbols_; std::function continue_fn_; std::map current_phi_; - std::stack scopes_; + std::stack scopes_; }; } diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 9745ff0e2..d70955d98 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -149,7 +149,7 @@ inline bool is_terminator(ir::value* x) { /* Translation unit */ ir::value* translation_unit::codegen(ir::module *mod) const{ - mod->push_scope(nullptr); + mod->add_new_scope(); decls_.codegen(mod); return nullptr; } @@ -242,7 +242,7 @@ void function::bind_parameters(ir::module *mod, ir::function *fn) const{ if(id_i){ args[i]->set_name(id_i->name()); mod->set_value(id_i->name(), nullptr, args[i]); - mod->set_type(id_i->name(), nullptr, args[i]->get_type()); + mod->get_scope().types[id_i->name()] = args[i]->get_type(); } } } @@ -285,7 +285,7 @@ ir::value* function_definition::codegen(ir::module *mod) const{ /* Statements */ ir::value* compound_statement::codegen(ir::module* mod) const{ - mod->push_scope(this); + mod->add_new_scope(); if(decls_) decls_->codegen(mod); if(statements_){ @@ -422,7 +422,7 @@ ir::value* initializer::codegen(ir::module * mod) const{ } value->set_name(name); mod->set_value(name, value); - mod->set_type(name, ty); + mod->get_scope().types[name] = ty; if(std::find(storage.begin(), storage.end(), CONST_T) != storage.end()) mod->set_const(name); return value; @@ -649,8 +649,12 @@ ir::value *conditional_expression::codegen(ir::module *mod) const{ /* Assignment expression */ ir::value *assignment_expression::codegen(ir::module *mod) const{ ir::value *rvalue = rvalue_->codegen(mod); - if(auto *x = dynamic_cast(lvalue_)) + if(auto *x = dynamic_cast(lvalue_)){ + ir::type *ty = mod->get_scope().types.at(x->id()->name()); + rvalue = explicit_cast(mod->get_builder(), rvalue, ty); + implicit_broadcast(mod, rvalue, ty); mod->set_value(x->id()->name(), rvalue); + } else if(auto* x = dynamic_cast(lvalue_)){ assert(x->get_op()==DEREF); assert(x->lvalue()); diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index d0f0a1310..32a713428 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -214,6 +214,38 @@ Constant *selection::llvm_constant(ir::constant *cst, LLVMContext &ctx) { throw std::runtime_error("unknown conversion from ir::constant to Constant"); } +inline Value *Reassociate(Value *V, IRBuilder<> &Builder){ + BinaryOperator *BinOp = dyn_cast(V); + if(BinOp) + if(BinOp->getOpcode()==BinaryOperator::BinaryOps::Add){ + Value *LHS = Reassociate(BinOp->getOperand(0), Builder); + Value *RHS = Reassociate(BinOp->getOperand(1), Builder); + if(BinaryOperator *BinLHS = dyn_cast(LHS)) + if(BinLHS->getOpcode()==BinaryOperator::BinaryOps::Add){ + Value *LLHS = BinLHS->getOperand(0); + Value *RLHS = BinLHS->getOperand(1); + // (cst + x) + y -> cst + (x + y) + if(isa(LLHS)) + return Builder.CreateAdd(LLHS, Builder.CreateAdd(RLHS, RHS)); + // (x + cst) + y -> cst + (x + y) + if(isa(RLHS)) + return Builder.CreateAdd(RLHS, Builder.CreateAdd(LLHS, RHS)); + } + if(BinaryOperator *BinRHS = dyn_cast(RHS)) + if(BinRHS->getOpcode()==BinaryOperator::BinaryOps::Add){ + Value *LRHS = BinRHS->getOperand(0); + Value *RRHS = BinRHS->getOperand(1); + // x + (cst + y) -> cst + (x + y) + if(isa(LRHS)) + return 
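// (Both rewrites float the constant addend to the top of the expression
//  tree, so a chain of adds bottoms out as "cst + (x + y)"; the caller
//  feeds the result into an inbounds GEP, presumably so the constant part
//  of the pointer offset can be folded later.)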
Builder.CreateAdd(LRHS, Builder.CreateAdd(RRHS, LHS)); + // x + (cst + y) -> cst + (x + y) + if(isa(LRHS)) + return Builder.CreateAdd(RRHS, Builder.CreateAdd(LRHS, LHS)); + } + return BinOp; + } + return V; +} /* convert ir::instruction to llvm::Instruction */ Instruction *selection::llvm_inst(ir::instruction *inst, std::function value, IRBuilder<> &builder) { @@ -271,8 +303,9 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::functionidx_begin(), ii->idx_end(), std::back_inserter(idx_vals), [&value](ir::value* x){ return value(x);}); Type *source_ty = type(ii->get_source_elt_ty()->get_scalar_ty()); + idx_vals[0] = Reassociate(idx_vals[0], builder); Value *arg = value(ii->get_operand(0)); - return builder.Insert(GetElementPtrInst::Create(source_ty, arg, idx_vals)); + return builder.Insert(GetElementPtrInst::CreateInBounds(source_ty, arg, idx_vals)); } if(ir::load_inst* ii = dynamic_cast(inst)){ Value *ptr = value(ii->get_pointer_operand()); diff --git a/lib/ir/module.cpp b/lib/ir/module.cpp index e5764d010..14f1337e1 100644 --- a/lib/ir/module.cpp +++ b/lib/ir/module.cpp @@ -29,14 +29,6 @@ void module::set_value(const std::string& name, ir::value *value){ return set_value(name, builder_.get_insert_block(), value); } -void module::set_type(const std::string& name, ir::basic_block *block, ir::type *type){ - types_[val_key_t{name, block}] = type; -} - -void module::set_type(const std::string& name, ir::type *type){ - return set_type(name, builder_.get_insert_block(), type); -} - void module::set_const(const std::string& name){ const_.insert(name); } @@ -97,7 +89,7 @@ ir::value *module::get_value_recursive(const std::string& name, ir::basic_block ir::value *result; bool is_const = const_.find(name) != const_.end(); auto &preds = block->get_predecessors(); - ir::type *ty = get_type(name, block); + ir::type *ty = get_scope().types.at(name); if(block) if(!is_const && sealed_blocks_.find(block) == sealed_blocks_.end()){ incomplete_phis_[block][name] = make_phi(ty, 1, block); @@ -136,21 +128,6 @@ ir::value *module::get_value(const std::string& name) { return get_value(name, builder_.get_insert_block()); } -ir::type *module::get_type(const std::string &name, basic_block *block) { - val_key_t key(name, block); - if(types_.find(key) != types_.end()) - return types_.at(key); - assert(block); - const auto& predecessors = block->get_predecessors(); - if(predecessors.empty()) - return get_type(name, nullptr); - return get_type(name, predecessors[0]); -} - -ir::type *module::get_type(const std::string &name) { - return types_.at({name, builder_.get_insert_block()}); -} - void module::seal_block(ir::basic_block *block){ for(auto &x: incomplete_phis_[block]){ From 4189e130bffb894e41e156b764496448cd97b6a9 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 3 Mar 2019 23:16:33 -0500 Subject: [PATCH 095/494] [general] added support for constant memory declaration --- examples/matrix.cpp | 3 + include/triton/ast/ast.h | 37 ++++-- include/triton/ast/parser.y | 10 +- include/triton/ast/scanner.l | 198 +++++++++++++---------------- include/triton/codegen/selection.h | 2 + include/triton/ir/constant.h | 6 + include/triton/ir/module.h | 6 +- include/triton/ir/type.h | 5 +- lib/ast/lowering.cpp | 44 ++++--- lib/codegen/selection.cpp | 37 +++++- lib/ir/constant.cpp | 7 + 11 files changed, 211 insertions(+), 144 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 20a911387..0baf844dc 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -38,6 +38,9 @@ extern 
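// (ast_root is the parser's output: it is presumably assigned by the Bison
//  grammar's top-level translation_unit action during yyparse(), after which
//  codegen walks the tree into a triton::ir::module.)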
translation_unit *ast_root; const char src[] = "\ +__constant__ int32* delta = alloc_const int32[16];\ +__constant__ int32* masks = alloc_const int32[16];\ +\ const tunable int32 TM;\ const tunable int32 TN;\ const tunable int32 TK;\ diff --git a/include/triton/ast/ast.h b/include/triton/ast/ast.h index b9ae16ea0..f511ac132 100644 --- a/include/triton/ast/ast.h +++ b/include/triton/ast/ast.h @@ -62,6 +62,7 @@ enum STORAGE_SPEC_T{ KERNEL_T, RESTRICT_T, READONLY_T, + CONSTANT_SPACE_T, WRITEONLY_T }; @@ -142,6 +143,16 @@ class builtin_expression: public node{ }; +class typed_declaration_specifier; +class alloc_const: public builtin_expression{ +public: + alloc_const(node *spec, node *size): spec_((typed_declaration_specifier*)spec), size_((constant*)size) { } + ir::value* codegen(ir::module *mod) const; + +private: + const typed_declaration_specifier* spec_; + const constant* size_; +}; class get_global_range: public builtin_expression{ public: @@ -447,13 +458,18 @@ public: /* Declarators */ class declarator: public node{ - virtual ir::type* type_impl(ir::module *mod, ir::type *type) const = 0; +protected: + typedef std::vector storage_spec_vec_t; + typedef const storage_spec_vec_t& storage_spec_vec_const_ref_t; + +public: + virtual ir::type* type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const = 0; public: declarator(node *lhs) : lhs_((declarator*)lhs), ptr_(nullptr){ } - ir::type* type(ir::module *mod, ir::type *type) const; + ir::type* type(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const; const identifier* id() const { return (const identifier*)lhs_; @@ -464,13 +480,18 @@ public: return this; } + void set_addr_space(unsigned addr_space){ + addr_space_ = addr_space; + } + protected: declarator *lhs_; pointer *ptr_; + unsigned addr_space_; }; class identifier: public declarator { - ir::type* type_impl(ir::module *mod, ir::type *type) const; + ir::type* type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const; public: identifier(char *&name): declarator(this), name_(name) { } @@ -482,7 +503,7 @@ private: class pointer: public declarator{ private: - ir::type* type_impl(ir::module *mod, ir::type *type) const; + ir::type* type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const; public: pointer(node *id): declarator(id) { } @@ -490,7 +511,7 @@ public: class tile: public declarator{ private: - ir::type* type_impl(ir::module *mod, ir::type *type) const; + ir::type* type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const; public: tile(node *id, node *shapes) @@ -502,7 +523,7 @@ public: class function: public declarator{ private: - ir::type* type_impl(ir::module *mod, ir::type *type) const; + ir::type* type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const; public: function(node *id, node *args) @@ -519,7 +540,7 @@ public: class initializer : public declarator{ private: - ir::type* type_impl(ir::module * mod, ir::type *type) const; + ir::type* type_impl(ir::module * mod, ir::type *type, storage_spec_vec_const_ref_t storage) const; public: initializer(node *decl, node *init) @@ -531,7 +552,7 @@ public: public: const declaration_specifier *spec_; - const declarator *decl_; + declarator *decl_; const expression *expr_; }; diff --git a/include/triton/ast/parser.y b/include/triton/ast/parser.y index a7d46e5a7..acd31d995 100644 --- a/include/triton/ast/parser.y +++ b/include/triton/ast/parser.y @@ -8,6 +8,7 @@ using 
namespace triton::ast; #define YYSTYPE node* #include "../include/triton/ast/ast.h" +#define YYERROR_VERBOSE 1 extern char* yytext; void yyerror(const char *s); int yylex(void); @@ -42,11 +43,10 @@ ASSIGN_OP_T get_assign_op(node *op) { return ((token*)op)->assign_op; } UNARY_OP_T get_unary_op(node *op) { return ((token*)op)->unary_op; } TYPE_T get_type_spec(node *op) { return ((token*)op)->type; } STORAGE_SPEC_T get_storage_spec(node *op) { return ((token*)op)->storage_spec;} - %} %token IDENTIFIER CONSTANT STRING_LITERAL -%token TUNABLE KERNEL RESTRICT READONLY WRITEONLY CONST +%token TUNABLE KERNEL RESTRICT READONLY WRITEONLY CONST CONSTANT_SPACE %token PTR_OP INC_OP DEC_OP LEFT_OP RIGHT_OP LE_OP GE_OP EQ_OP NE_OP %token AND_OP OR_OP MUL_ASSIGN DIV_ASSIGN MOD_ASSIGN ADD_ASSIGN %token SUB_ASSIGN LEFT_ASSIGN RIGHT_ASSIGN AND_ASSIGN @@ -54,7 +54,7 @@ STORAGE_SPEC_T get_storage_spec(node *op) { return ((token*)op)->storage_spec;} %token VOID UINT1 UINT8 UINT16 UINT32 UINT64 INT1 INT8 INT16 INT32 INT64 FP32 FP64 %token IF ELSE FOR CONTINUE %token NEWAXIS ELLIPSIS AT -%token GET_GLOBAL_RANGE DOT +%token GET_GLOBAL_RANGE DOT ALLOC_CONST %start translation_unit %% @@ -112,7 +112,8 @@ identifier builtin : GET_GLOBAL_RANGE '[' primary_expression ']' '(' constant ')' { $$ = new get_global_range($3, $6); } - | DOT '(' expression ',' expression ',' expression ')' { $$ = new matmul_expression($3, $5, $7); } + | DOT '(' expression ',' expression ',' expression ')' { $$ = new matmul_expression($3, $5, $7); } + | ALLOC_CONST type_specifier '[' constant ']' { $$ = new alloc_const(new typed_declaration_specifier(get_type_spec($2)), $4); } primary_expression : identifier { $$ = new named_expression($1); } @@ -366,6 +367,7 @@ storage_class_specifier | RESTRICT { $$ = new token(RESTRICT_T); } | READONLY { $$ = new token(READONLY_T); } | WRITEONLY { $$ = new token(WRITEONLY_T); } + | CONSTANT_SPACE { $$ = new token(CONSTANT_SPACE_T); } ; /* -------------------------- */ diff --git a/include/triton/ast/scanner.l b/include/triton/ast/scanner.l index 4c0635dbc..56cc777a7 100644 --- a/include/triton/ast/scanner.l +++ b/include/triton/ast/scanner.l @@ -8,133 +8,107 @@ IS (u|U|l|L)* %{ #include #include "parser.hpp" - -void count(); -int check_type(); -int comment(); - %} %% -"const" { count(); return(CONST); } -"tunable" { count(); return(TUNABLE); } -"kernel" { count(); return(KERNEL); } -"restrict" { count(); return(RESTRICT); } -"readonly" { count(); return(READONLY); } -"writeonly" { count(); return(WRITEONLY); } -"@" { count(); return(AT); } -"newaxis" { count(); return(NEWAXIS); } -"if" { count(); return(IF); } -"else" { count(); return(ELSE); } -"for" { count(); return(FOR); } -"void" { count(); return(VOID); } -"uint1" { count(); return(UINT1); } -"uint8" { count(); return(UINT8); } -"uint16" { count(); return(UINT16); } -"uint32" { count(); return(UINT32); } -"uint64" { count(); return(UINT64); } -"int1" { count(); return(INT1); } -"int8" { count(); return(INT8); } -"int16" { count(); return(INT16); } -"int32" { count(); return(INT32); } -"int64" { count(); return(INT64); } -"fp32" { count(); return(FP32); } -"fp64" { count(); return(FP64); } -"..." 
{ count(); return(ELLIPSIS); } -"get_global_range" { count(); return GET_GLOBAL_RANGE; } -"dot" { count(); return DOT;} -"continue" { count(); return(CONTINUE); } +"__constant__" { return(CONSTANT_SPACE); } +"const" { return(CONST); } +"tunable" { return(TUNABLE); } +"kernel" { return(KERNEL); } +"restrict" { return(RESTRICT); } +"readonly" { return(READONLY); } +"writeonly" { return(WRITEONLY); } +"@" { return(AT); } +"newaxis" { return(NEWAXIS); } +"if" { return(IF); } +"else" { return(ELSE); } +"for" { return(FOR); } +"void" { return(VOID); } +"uint1" { return(UINT1); } +"uint8" { return(UINT8); } +"uint16" { return(UINT16); } +"uint32" { return(UINT32); } +"uint64" { return(UINT64); } +"int1" { return(INT1); } +"int8" { return(INT8); } +"int16" { return(INT16); } +"int32" { return(INT32); } +"int64" { return(INT64); } +"fp32" { return(FP32); } +"fp64" { return(FP64); } +"..." { return(ELLIPSIS); } +"get_global_range" { return GET_GLOBAL_RANGE; } +"dot" { return DOT;} +"continue" { return(CONTINUE); } +"alloc_const" { return(ALLOC_CONST); } +{L}({L}|{D})* { return(IDENTIFIER); } -{L}({L}|{D})* { count(); return(check_type()); } +0[xX]{H}+{IS}? { return(CONSTANT); } +0{D}+{IS}? { return(CONSTANT); } +{D}+{IS}? { return(CONSTANT); } +L?'(\\.|[^\\'])+' { return(CONSTANT); } -0[xX]{H}+{IS}? { count(); return(CONSTANT); } -0{D}+{IS}? { count(); return(CONSTANT); } -{D}+{IS}? { count(); return(CONSTANT); } -L?'(\\.|[^\\'])+' { count(); return(CONSTANT); } +{D}+{E}{FS}? { return(CONSTANT); } +{D}*"."{D}+({E})?{FS}? { return(CONSTANT); } +{D}+"."{D}*({E})?{FS}? { return(CONSTANT); } -{D}+{E}{FS}? { count(); return(CONSTANT); } -{D}*"."{D}+({E})?{FS}? { count(); return(CONSTANT); } -{D}+"."{D}*({E})?{FS}? { count(); return(CONSTANT); } +L?\"(\\.|[^\\"])*\" { return(STRING_LITERAL); } -L?\"(\\.|[^\\"])*\" { count(); return(STRING_LITERAL); } +">>=" { return(RIGHT_ASSIGN); } +"<<=" { return(LEFT_ASSIGN); } +"+=" { return(ADD_ASSIGN); } +"-=" { return(SUB_ASSIGN); } +"*=" { return(MUL_ASSIGN); } +"/=" { return(DIV_ASSIGN); } +"%=" { return(MOD_ASSIGN); } +"&=" { return(AND_ASSIGN); } +"^=" { return(XOR_ASSIGN); } +"|=" { return(OR_ASSIGN); } +">>" { return(RIGHT_OP); } +"<<" { return(LEFT_OP); } +"++" { return(INC_OP); } +"--" { return(DEC_OP); } +"->" { return(PTR_OP); } +"&&" { return(AND_OP); } +"||" { return(OR_OP); } +"<=" { return(LE_OP); } +">=" { return(GE_OP); } +"==" { return(EQ_OP); } +"!=" { return(NE_OP); } +";" { return(';'); } +("{"|"<%") { return('{'); } +("}"|"%>") { return('}'); } +"," { return(','); } +":" { return(':'); } +"=" { return('='); } +"(" { return('('); } +")" { return(')'); } +("["|"<:") { return('['); } +("]"|":>") { return(']'); } +"." { return('.'); } +"&" { return('&'); } +"!" { return('!'); } +"~" { return('~'); } +"-" { return('-'); } +"+" { return('+'); } +"*" { return('*'); } +"/" { return('/'); } +"%" { return('%'); } +"<" { return('<'); } +">" { return('>'); } +"^" { return('^'); } +"|" { return('|'); } +"?" 
{ return('?'); } -">>=" { count(); return(RIGHT_ASSIGN); } -"<<=" { count(); return(LEFT_ASSIGN); } -"+=" { count(); return(ADD_ASSIGN); } -"-=" { count(); return(SUB_ASSIGN); } -"*=" { count(); return(MUL_ASSIGN); } -"/=" { count(); return(DIV_ASSIGN); } -"%=" { count(); return(MOD_ASSIGN); } -"&=" { count(); return(AND_ASSIGN); } -"^=" { count(); return(XOR_ASSIGN); } -"|=" { count(); return(OR_ASSIGN); } -">>" { count(); return(RIGHT_OP); } -"<<" { count(); return(LEFT_OP); } -"++" { count(); return(INC_OP); } -"--" { count(); return(DEC_OP); } -"->" { count(); return(PTR_OP); } -"&&" { count(); return(AND_OP); } -"||" { count(); return(OR_OP); } -"<=" { count(); return(LE_OP); } -">=" { count(); return(GE_OP); } -"==" { count(); return(EQ_OP); } -"!=" { count(); return(NE_OP); } -";" { count(); return(';'); } -("{"|"<%") { count(); return('{'); } -("}"|"%>") { count(); return('}'); } -"," { count(); return(','); } -":" { count(); return(':'); } -"=" { count(); return('='); } -"(" { count(); return('('); } -")" { count(); return(')'); } -("["|"<:") { count(); return('['); } -("]"|":>") { count(); return(']'); } -"." { count(); return('.'); } -"&" { count(); return('&'); } -"!" { count(); return('!'); } -"~" { count(); return('~'); } -"-" { count(); return('-'); } -"+" { count(); return('+'); } -"*" { count(); return('*'); } -"/" { count(); return('/'); } -"%" { count(); return('%'); } -"<" { count(); return('<'); } -">" { count(); return('>'); } -"^" { count(); return('^'); } -"|" { count(); return('|'); } -"?" { count(); return('?'); } - -[ \t\v\n\f] { count(); } -. { /* ignore bad characters */ } +[ \t\v\n\f] { } +. { /* ignore bad characters */ } %% int yywrap() { return(1); } - -int column = 0; - -void count() -{ - int i; - - for (i = 0; yytext[i] != '\0'; i++) - if (yytext[i] == '\n') - column = 0; - else if (yytext[i] == '\t') - column += 8 - (column % 8); - else - column++; - //ECHO; -} - void yyerror (const char *s) /* Called by yyparse on error */ { printf ("Error: %s\n", s); } - -int check_type() -{ - return(IDENTIFIER); -} diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index c8632262e..5c81ca8a0 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -112,6 +112,8 @@ private: llvm::Value* llvm_value(ir::value *v, llvm::IRBuilder<> &builder); llvm::Instruction* llvm_inst(ir::instruction *inst, std::function value, llvm::IRBuilder<> &builder); llvm::Constant* llvm_constant(ir::constant *cst, llvm::LLVMContext &ctx); + llvm::Value* llvm_alloc_const(ir::alloc_const *v, llvm::Module *module, llvm::IRBuilder<> &builder); + llvm::ArrayType* llvm_linearized_tile_type(ir::type *ty, llvm::LLVMContext &ctx); // grid construction void create_grids(std::vector &grids, diff --git a/include/triton/ir/constant.h b/include/triton/ir/constant.h index 11403c6dd..9f2baf618 100644 --- a/include/triton/ir/constant.h +++ b/include/triton/ir/constant.h @@ -106,6 +106,12 @@ public: unsigned addr_space = 0); }; +/* global variable */ +class alloc_const: public global_object { +public: + alloc_const(type *ty, constant_int *size, + const std::string &name = ""); +}; } } diff --git a/include/triton/ir/module.h b/include/triton/ir/module.h index e4026f8b6..4ec681f67 100644 --- a/include/triton/ir/module.h +++ b/include/triton/ir/module.h @@ -28,6 +28,7 @@ class attribute; class function_type; class constant; class global_value; +class alloc_const; /* Module */ struct scope { @@ -76,7 +77,9 @@ public: void add_new_scope() { 
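// (The add_alloc/allocs members added just below record every alloc_const,
//  so that selection::run can emit one LLVM global per __constant__ buffer
//  before lowering the functions.)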
if(scopes_.empty()) scopes_.push(scope()); else scopes_.push(scope(get_scope())); } void pop_scope() { scopes_.pop(); } scope& get_scope() { return scopes_.top(); } - + // Const allocation + void add_alloc(ir::alloc_const* x) { allocs_.push_back(x); } + const std::vector& allocs() { return allocs_; } private: std::string name_; @@ -92,6 +95,7 @@ private: std::function continue_fn_; std::map current_phi_; std::stack scopes_; + std::vector allocs_; }; } diff --git a/include/triton/ir/type.h b/include/triton/ir/type.h index 1977ff47c..04da05b60 100644 --- a/include/triton/ir/type.h +++ b/include/triton/ir/type.h @@ -165,9 +165,8 @@ private: public: // accessors - unsigned get_address_space() const { return address_space_; } - type *get_element_ty() const { return contained_tys_[0]; } - + unsigned get_address_space() const { return address_space_; } + type *get_element_ty() const { return contained_tys_[0]; } // factory methods static pointer_type* get(type *ty, unsigned address_space); diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index d70955d98..49fe03206 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -188,7 +188,7 @@ std::vector storage_declaration_specifier::storage() const { /* Parameter */ ir::type* parameter::type(ir::module *mod) const { - return decl_->type(mod, spec_->type(mod)); + return decl_->type(mod, spec_->type(mod), {}); } std::vector parameter::storage() const { @@ -200,14 +200,14 @@ const identifier *parameter::id() const { } /* Declarators */ -ir::type* declarator::type(ir::module *mod, ir::type *type) const{ +ir::type* declarator::type(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const{ if(ptr_) - return type_impl(mod, ptr_->type(mod, type)); - return type_impl(mod, type); + return type_impl(mod, ptr_->type(mod, type, storage), storage); + return type_impl(mod, type, storage); } // Identifier -ir::type* identifier::type_impl(ir::module *, ir::type *type) const{ +ir::type* identifier::type_impl(ir::module *, ir::type *type, storage_spec_vec_const_ref_t) const{ return type; } @@ -216,7 +216,7 @@ const std::string &identifier::name() const{ } // Tile -ir::type* tile::type_impl(ir::module *mod, ir::type *type) const{ +ir::type* tile::type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t) const{ ir::type::tile_shapes_t shapes; for(expression *expr: shapes_->values()){ ir::constant_int *shape = dynamic_cast(expr->codegen(mod)); @@ -228,8 +228,9 @@ ir::type* tile::type_impl(ir::module *mod, ir::type *type) const{ // Pointer -ir::type* pointer::type_impl(ir::module*, ir::type *type) const{ - return ir::pointer_type::get(type, 1); +ir::type* pointer::type_impl(ir::module*, ir::type *type, storage_spec_vec_const_ref_t storage) const{ + bool is_ptr_to_const = std::find(storage.begin(), storage.end(), CONSTANT_SPACE_T) != storage.end(); + return ir::pointer_type::get(type, is_ptr_to_const?4:1); } // Function @@ -247,7 +248,7 @@ void function::bind_parameters(ir::module *mod, ir::function *fn) const{ } } -ir::type* function::type_impl(ir::module* mod, ir::type *type) const{ +ir::type* function::type_impl(ir::module* mod, ir::type *type, storage_spec_vec_const_ref_t) const{ std::vector types; for(parameter* param: args_->values()) types.push_back(param->type(mod)); @@ -265,7 +266,7 @@ ir::attribute_t get_ir_attr(STORAGE_SPEC_T spec){ } ir::value* function_definition::codegen(ir::module *mod) const{ - ir::function_type *prototype = (ir::function_type*)header_->type(mod, spec_->type(mod)); + ir::function_type 
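// (The storage specifiers are now threaded through declarator::type so that
//  pointer::type_impl can map CONSTANT_SPACE_T to address space 4, the
//  constant space, instead of the default address space 1.)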
*prototype = (ir::function_type*)header_->type(mod, spec_->type(mod), spec_->storage()); const std::string &name = header_->id()->name(); ir::function *fn = mod->get_or_insert_function(name, prototype); for(unsigned i = 0; i < header_->get_num_args(); i++){ @@ -397,8 +398,8 @@ ir::value* declaration::codegen(ir::module* mod) const{ } /* Initializer */ -ir::type* initializer::type_impl(ir::module *mod, ir::type *type) const{ - return decl_->type(mod, type); +ir::type* initializer::type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const{ + return decl_->type(mod, type, storage); } void initializer::set_specifier(const declaration_specifier *spec) { @@ -406,8 +407,8 @@ void initializer::set_specifier(const declaration_specifier *spec) { } ir::value* initializer::codegen(ir::module * mod) const{ - ir::type *ty = decl_->type(mod, spec_->type(mod)); std::vector storage = spec_->storage(); + ir::type *ty = decl_->type(mod, spec_->type(mod), storage); std::string name = decl_->id()->name(); ir::value *value = ir::undef_value::get(ty); if(std::find(storage.begin(), storage.end(), TUNABLE_T) != storage.end()){ @@ -423,6 +424,8 @@ ir::value* initializer::codegen(ir::module * mod) const{ value->set_name(name); mod->set_value(name, value); mod->get_scope().types[name] = ty; + if(auto *x = dynamic_cast(value)) + mod->add_alloc(x); if(std::find(storage.begin(), storage.end(), CONST_T) != storage.end()) mod->set_const(name); return value; @@ -523,13 +526,21 @@ ir::value* binary_operator::codegen(ir::module *mod) const{ /* Builtin expression */ +// alloc constant +ir::value* alloc_const::codegen(ir::module *mod) const { + ir::type *ty = spec_->type(mod); + ir::constant_int *size = (ir::constant_int*)size_->codegen(mod); + ir::alloc_const *res = new ir::alloc_const(ty, size); + return res; +} + // get_global_range ir::value* get_global_range::codegen(ir::module *mod) const { ir::builder &builder = mod->get_builder(); return builder.create_get_global_range(axis_->value(), (ir::constant_int*)size_->codegen(mod)); } - +// matmul ir::value* matmul_expression::codegen(ir::module *mod) const { ir::value *A = A_->codegen(mod); ir::value *B = B_->codegen(mod); @@ -666,7 +677,7 @@ ir::value *assignment_expression::codegen(ir::module *mod) const{ /* Type name */ ir::type *type_name::type(ir::module *mod) const{ - return decl_->type(mod, spec_->type(mod)); + return decl_->type(mod, spec_->type(mod), {}); } /* String literal */ @@ -693,6 +704,9 @@ ir::value* constant_range::codegen(ir::module *mod) const{ /* Named */ ir::value* named_expression::codegen(ir::module *mod) const{ const std::string &name = id()->name(); + const auto& declarations = mod->get_scope().types; + if(declarations.find(name) == declarations.end()) + throw std::runtime_error("variable " + name + " not declared"); return mod->get_value(name); } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 32a713428..3f79c9375 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -315,6 +315,16 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::function &builder) { + unsigned size = ((ir::constant_int*)v->get_operand(0))->get_value(); + Type *element_ty = llvm_type(v->get_type()->get_pointer_element_ty(), module->getContext()); + Type *array_ty = llvm::ArrayType::get(element_ty, size); + Value *array = new llvm::GlobalVariable(*module, array_ty, false, llvm::GlobalVariable::ExternalLinkage, + nullptr, v->get_name(), nullptr, llvm::GlobalVariable::NotThreadLocal, 4); + return 
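// (The global was created in address space 4, which NVPTX maps to constant
//  memory; the bitcast turns the [size x element_ty] array pointer into a
//  plain element pointer so ordinary pointer arithmetic applies to it.)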
builder.CreateBitCast(array, element_ty->getPointerTo(4)); +} + /* convert ir::value to llvm::Value */ Value* selection::llvm_value(ir::value *v, IRBuilder<> &builder) { assert(!v->get_type()->is_tile_ty()); @@ -324,6 +334,20 @@ Value* selection::llvm_value(ir::value *v, IRBuilder<> &builder) { // create operands if(auto *cc = dynamic_cast(v)) return llvm_constant(cc, ctx); + // alloc const + if(auto *cc = dynamic_cast(v)){ + BasicBlock *block = builder.GetInsertBlock(); + Module *module = block->getModule(); + unsigned size = ((ir::constant_int*)cc->get_operand(0))->get_value(); + Type *element_ty = llvm_type(cc->get_type()->get_pointer_element_ty(), ctx); + Type *array_ty = llvm::ArrayType::get(element_ty, size); + if(vmap_.find(v) == vmap_.end()){ + Value *array = new llvm::GlobalVariable(*module, array_ty, false, llvm::GlobalVariable::ExternalLinkage, + nullptr, cc->get_name(), nullptr, llvm::GlobalVariable::NotThreadLocal, 4); + vmap_[v] = builder.CreateBitCast(array, array->getType()->getArrayElementType()->getPointerTo(4)); + } + return vmap_.at(v); + } // instruction if(auto *ii = dynamic_cast(v)){ auto value = [&](ir::value *x) { return llvm_value(x, builder); }; @@ -755,11 +779,22 @@ inline llvm::Attribute::AttrKind llvm_attr(ir::attribute_t attr) { } } +ArrayType* selection::llvm_linearized_tile_type(ir::type *ty, LLVMContext &ctx) { + unsigned size = 1; + for(ir::constant_int* shape: ty->get_tile_shapes()) + size *= shape->get_value(); + return ArrayType::get(llvm_type(ty->get_scalar_ty(), ctx), size); +} + void selection::run(ir::module &src, Module &dst){ vmap_.clear(); LLVMContext &dst_ctx = dst.getContext(); IRBuilder<> dst_builder(dst_ctx); + for(ir::alloc_const *x: src.allocs()) { + vmap_[x] = llvm_alloc_const(x, &dst, dst_builder); + } + // iterate over functions for(ir::function *fn: src.get_function_list()) { // create LLVM function @@ -795,7 +830,7 @@ void selection::run(ir::module &src, Module &dst){ ArrayType *array_ty = ArrayType::get(int_8_ty, alloc_size); Type *ptr_ty = PointerType::get(int_8_ty, 3); GlobalVariable *sh_mem_array = - new GlobalVariable(*dst_fn->getParent(), array_ty, false, GlobalVariable::ExternalLinkage, + new GlobalVariable(dst, array_ty, false, GlobalVariable::ExternalLinkage, nullptr, "__shared_ptr", nullptr, GlobalVariable::NotThreadLocal, 3); sh_mem_ptr = dst_builder.CreateBitCast(sh_mem_array, ptr_ty); } diff --git a/lib/ir/constant.cpp b/lib/ir/constant.cpp index 929af2228..314714c04 100644 --- a/lib/ir/constant.cpp +++ b/lib/ir/constant.cpp @@ -135,5 +135,12 @@ global_object::global_object(type *ty, unsigned num_ops, : global_value(ty, num_ops, linkage, name, addr_space) { } +/* alloc const */ +alloc_const::alloc_const(type *ty, constant_int *size, const std::string &name) + : global_object(ty, 1, global_value::external, name, 4) { + set_operand(0, size); +} + + } } From 20ff9543acc0849d4ac204fe326e77e22a780024 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 5 Mar 2019 21:03:19 -0500 Subject: [PATCH 096/494] [abstract syntax tree] improved the grammar --- examples/matrix.cpp | 70 +++++++++++++++--------------------- include/triton/ast/ast.h | 19 +++++----- include/triton/ast/parser.y | 23 ++++++------ include/triton/ast/scanner.l | 4 +-- lib/ast/lowering.cpp | 14 +++----- 5 files changed, 53 insertions(+), 77 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 0baf844dc..1a7484ab0 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -38,61 +38,47 @@ extern translation_unit *ast_root; const char 
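// (The rewrite below leans on the grammar changes later in this patch:
//  block_item_list lets declarations and statements mix freely, and the new
//  for-init-declaration rule lets the loop counter live in the for header.)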
src[] = "\ -__constant__ int32* delta = alloc_const int32[16];\ -__constant__ int32* masks = alloc_const int32[16];\ -\ const tunable int32 TM;\ const tunable int32 TN;\ const tunable int32 TK;\ \ -void matmul(restrict readonly fp32 *a, restrict readonly fp32 *b, fp32 *c, int32 M, int32 N, int32 K, int32 bound){\ +void matmul(restrict read_only fp32 *a, restrict read_only fp32 *b, fp32 *c,\ + int32 M, int32 N, int32 K, int32 bound){\ int32 rxa[TM] = get_global_range[TM](0);\ int32 ryb[TN] = get_global_range[TN](1);\ int32 rka[TK] = 0 ... TK;\ int32 rkb[TK] = 0 ... TK;\ - int32 rxc[TM];\ - int32 ryc[TN];\ fp32 C[TM, TN] = 0;\ - int32 k;\ fp32* pa[TM, TK] = a + rka[newaxis, :]*M + rxa[:, newaxis];\ fp32* pb[TN, TK] = b + rkb[newaxis, :]*K + ryb[:, newaxis];\ - fp32* pc[TM, TN];\ fp32 a[TM, TK] = *pa;\ fp32 b[TN, TK] = *pb;\ - int1 checkc0[TM];\ - int1 checkc1[TN];\ - int1 checkc[TM, TN];\ - for(k = K; k > 0; k = k - TK){\ - int1 checka[TM, TK];\ - int1 checkb[TN, TK];\ - int1 checka0[TM];\ - int1 checka1[TK];\ - int1 checkb0[TN];\ - int1 checkb1[TK];\ - C = dot(a, b, C);\ - pa = pa + TK*M;\ - pb = pb + TK*K;\ - checka = k > bound;\ - checkb = k > bound;\ - @checka a = *pa;\ - @checkb b = *pb;\ - if(k > bound)\ - continue;\ - checka0 = rxa < M;\ - checka1 = rka < k;\ - checkb0 = ryb < N;\ - checkb1 = rkb < k;\ - checka = checka0[:, newaxis] && checka1[newaxis, :];\ - checkb = checkb0[:, newaxis] && checkb1[newaxis, :];\ - a = checka ? *pa : 0;\ - b = checkb ? *pb : 0;\ - }\ - rxc = get_global_range[TM](0);\ - ryc = get_global_range[TN](1);\ - pc = c + ryc[newaxis, :]*M + rxc[:, newaxis];\ - checkc0 = rxc < M;\ - checkc1 = ryc < N;\ - checkc = checkc0[:, newaxis] && checkc1[newaxis, :];\ + for(int32 k = K; k > 0;){\ + C = dot(a, b, C);\ + pa = pa + TK*M;\ + pb = pb + TK*K;\ + k = k - TK;\ + int1 checka[TM, TK] = k > bound;\ + int1 checkb[TN, TK] = k > bound;\ + @checka a = *pa;\ + @checkb b = *pb;\ + if(k > bound)\ + continue;\ + int1 checka0[TM] = rxa < M;\ + int1 checka1[TK] = rka < k;\ + int1 checkb0[TN] = ryb < N;\ + int1 checkb1[TK] = rkb < k;\ + checka = checka0[:, newaxis] && checka1[newaxis, :];\ + checkb = checkb0[:, newaxis] && checkb1[newaxis, :];\ + a = checka ? *pa : 0;\ + b = checkb ? 
*pb : 0;\ + }\ + int32 rxc[TM] = get_global_range[TM](0);\ + int32 ryc[TN] = get_global_range[TN](1);\ + fp32* pc[TM, TN] = c + ryc[newaxis, :]*M + rxc[:, newaxis];\ + int1 checkc0[TM] = rxc < M;\ + int1 checkc1[TN] = ryc < N;\ + int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];\ @checkc *pc = C;\ }\ "; diff --git a/include/triton/ast/ast.h b/include/triton/ast/ast.h index f511ac132..cc77f66b5 100644 --- a/include/triton/ast/ast.h +++ b/include/triton/ast/ast.h @@ -323,7 +323,10 @@ public: class initializer; class declaration_specifier; -class declaration: public node{ +class block_item: public node{ +}; + +class declaration: public block_item{ public: declaration(node *spec, node *init) : spec_((declaration_specifier*)spec), init_((list*)init) { } @@ -335,10 +338,7 @@ public: const list *init_; }; -class statement: public node{ - -private: - expression *pred_; +class statement: public block_item{ }; class expression_statement: public statement{ @@ -353,19 +353,19 @@ private: expression *mask_; }; + class compound_statement: public statement{ typedef list* declarations_t; typedef list* statements_t; public: - compound_statement(node* decls, node* statements) - : decls_((declarations_t)decls), statements_((statements_t)statements) {} + compound_statement(node* items) + : items_((list*)items){} ir::value* codegen(ir::module * mod) const; private: - declarations_t decls_; - statements_t statements_; + list* items_; }; class selection_statement: public statement{ @@ -413,7 +413,6 @@ class no_op: public statement { }; // Types class declaration_specifier: public node{ public: - using node::node; virtual ir::type* type(ir::module *mod) const = 0; virtual std::vector storage() const = 0; }; diff --git a/include/triton/ast/parser.y b/include/triton/ast/parser.y index acd31d995..724f4240b 100644 --- a/include/triton/ast/parser.y +++ b/include/triton/ast/parser.y @@ -275,21 +275,16 @@ statement ; compound_statement - : '{' '}' { $$ = new compound_statement(nullptr, nullptr); } - | '{' statement_list '}' { $$ = new compound_statement(nullptr, $2); } - | '{' declaration_list '}' { $$ = new compound_statement($2, nullptr); } - | '{' declaration_list statement_list '}' { $$ = new compound_statement($2, $3);} - ; + : '{' '}' { $$ = new compound_statement(nullptr); } + | '{' block_item_list '}' { $$ = new compound_statement($2); } +block_item_list + : block_item { $$ = new list((block_item*)$1); } + | block_item_list block_item { $$ = append_ptr_list($1, $2); } -declaration_list - : declaration { $$ = new list((declaration*)$1); } - | declaration_list declaration { $$ = append_ptr_list($1, $2); } - -statement_list - : statement { $$ = new list((statement*)$1); } - | statement_list statement { $$ = append_ptr_list($1, $2); } - ; +block_item + : declaration { $$ = $1; } + | statement { $$ = $1; } expression_statement : ';' { $$ = new no_op(); } @@ -304,6 +299,8 @@ selection_statement iteration_statement : FOR '(' expression_statement expression_statement expression ')' statement { $$ = new iteration_statement($3, $4, $5, $7); } + | FOR '(' declaration expression_statement ')' statement { $$ = new iteration_statement($3, $4, nullptr, $6); } + | FOR '(' declaration expression_statement expression ')' statement { $$ = new iteration_statement($3, $4, $5, $7); } ; jump_statement diff --git a/include/triton/ast/scanner.l b/include/triton/ast/scanner.l index 56cc777a7..e3bf32be0 100644 --- a/include/triton/ast/scanner.l +++ b/include/triton/ast/scanner.l @@ -16,8 +16,8 @@ IS (u|U|l|L)* "tunable" { 
return(TUNABLE); } "kernel" { return(KERNEL); } "restrict" { return(RESTRICT); } -"readonly" { return(READONLY); } -"writeonly" { return(WRITEONLY); } +"read_only" { return(READONLY); } +"write_only" { return(WRITEONLY); } "@" { return(AT); } "newaxis" { return(NEWAXIS); } "if" { return(IF); } diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 49fe03206..db9c9ed2a 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -287,15 +287,8 @@ ir::value* function_definition::codegen(ir::module *mod) const{ /* Statements */ ir::value* compound_statement::codegen(ir::module* mod) const{ mod->add_new_scope(); - if(decls_) - decls_->codegen(mod); - if(statements_){ - for(statement *stmt: statements_->values()){ - ir::value *current = stmt->codegen(mod); - if(is_terminator(current)) - return current; - } - } + if(items_) + items_->codegen(mod); mod->pop_scope(); return nullptr; } @@ -333,7 +326,8 @@ ir::value* iteration_statement::codegen(ir::module *mod) const{ ir::basic_block *loop_bb = ir::basic_block::create(ctx, "loop", fn); ir::basic_block *next_bb = ir::basic_block::create(ctx, "postloop", fn); mod->set_continue_fn([&](){ - exec_->codegen(mod); + if(exec_) + exec_->codegen(mod); ir::value *cond = stop_->codegen(mod); return builder.create_cond_br(cond, loop_bb, next_bb); }); From c5073a5af61e463396119d27d1fbbce977e7d6d7 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 5 Mar 2019 23:45:58 -0500 Subject: [PATCH 097/494] [abstract syntax tree] better error messages --- examples/matrix.cpp | 92 +++++++++--------- include/triton/ast/ast.h | 6 ++ include/triton/ast/parser.y | 9 +- include/triton/ast/scanner.l | 179 +++++++++++++++++------------------ lib/ast/lowering.cpp | 40 ++++++++ 5 files changed, 187 insertions(+), 139 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 1a7484ab0..ec818ae58 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -36,52 +36,52 @@ extern void yy_delete_buffer(YY_BUFFER_STATE buffer); using triton::ast::translation_unit; extern translation_unit *ast_root; -const char src[] = -"\ -const tunable int32 TM;\ -const tunable int32 TN;\ -const tunable int32 TK;\ -\ -void matmul(restrict read_only fp32 *a, restrict read_only fp32 *b, fp32 *c,\ - int32 M, int32 N, int32 K, int32 bound){\ - int32 rxa[TM] = get_global_range[TM](0);\ - int32 ryb[TN] = get_global_range[TN](1);\ - int32 rka[TK] = 0 ... TK;\ - int32 rkb[TK] = 0 ... TK;\ - fp32 C[TM, TN] = 0;\ - fp32* pa[TM, TK] = a + rka[newaxis, :]*M + rxa[:, newaxis];\ - fp32* pb[TN, TK] = b + rkb[newaxis, :]*K + ryb[:, newaxis];\ - fp32 a[TM, TK] = *pa;\ - fp32 b[TN, TK] = *pb;\ - for(int32 k = K; k > 0;){\ - C = dot(a, b, C);\ - pa = pa + TK*M;\ - pb = pb + TK*K;\ - k = k - TK;\ - int1 checka[TM, TK] = k > bound;\ - int1 checkb[TN, TK] = k > bound;\ - @checka a = *pa;\ - @checkb b = *pb;\ - if(k > bound)\ - continue;\ - int1 checka0[TM] = rxa < M;\ - int1 checka1[TK] = rka < k;\ - int1 checkb0[TN] = ryb < N;\ - int1 checkb1[TK] = rkb < k;\ - checka = checka0[:, newaxis] && checka1[newaxis, :];\ - checkb = checkb0[:, newaxis] && checkb1[newaxis, :];\ - a = checka ? *pa : 0;\ - b = checkb ? 
*pb : 0;\ - }\ - int32 rxc[TM] = get_global_range[TM](0);\ - int32 ryc[TN] = get_global_range[TN](1);\ - fp32* pc[TM, TN] = c + ryc[newaxis, :]*M + rxc[:, newaxis];\ - int1 checkc0[TM] = rxc < M;\ - int1 checkc1[TN] = ryc < N;\ - int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];\ - @checkc *pc = C;\ -}\ -"; +const char* src = +R"( +const tunable int32 TM; +const tunable int32 TN; +const tunable int32 TK; + +void matmul(restrict read_only fp32 *a, restrict read_only fp32 *b, fp32 *c, + int32 M, int32 N, int32 K, int32 bound){ + int32 rxa[TM] = get_global_range[TM](0) + int32 ryb[TN] = get_global_range[TN](1); + int32 rka[TK] = 0 ... TK; + int32 rkb[TK] = 0 ... TK; + fp32 C[TM, TN] = 0; + fp32* pa[TM, TK] = a + rka[newaxis, :]*M + rxa[:, newaxis]; + fp32* pb[TN, TK] = b + rkb[newaxis, :]*K + ryb[:, newaxis]; + fp32 a[TM, TK] = *pa; + fp32 b[TN, TK] = *pb; + for(int32 k = K; k > 0;){ + C = dot(a, b, C); + pa = pa + TK*M; + pb = pb + TK*K; + k = k - TK; + int1 checka[TM, TK] = k > bound; + int1 checkb[TN, TK] = k > bound; + @checka a = *pa; + @checkb b = *pb; + if(k > bound) + continue; + int1 checka0[TM] = rxa < M; + int1 checka1[TK] = rka < k; + int1 checkb0[TN] = ryb < N; + int1 checkb1[TK] = rkb < k; + checka = checka0[:, newaxis] && checka1[newaxis, :]; + checkb = checkb0[:, newaxis] && checkb1[newaxis, :]; + a = checka ? *pa : 0; + b = checkb ? *pb : 0; + } + int32 rxc[TM] = get_global_range[TM](0); + int32 ryc[TN] = get_global_range[TN](1); + fp32* pc[TM, TN] = c + ryc[newaxis, :]*M + rxc[:, newaxis]; + int1 checkc0[TM] = rxc < M; + int1 checkc1[TN] = ryc < N; + int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; + @checkc *pc = C; +} +)"; static std::string compute_data_layout(bool is64Bit, bool UseShortPointers) { std::string Ret = "e"; diff --git a/include/triton/ast/ast.h b/include/triton/ast/ast.h index cc77f66b5..b286c5a79 100644 --- a/include/triton/ast/ast.h +++ b/include/triton/ast/ast.h @@ -599,6 +599,12 @@ private: list decls_; }; +void update_location(const char *t); +void print_error(const char *error); +char return_impl(char t, const char * yytext); +yytokentype return_impl(yytokentype t, const char * yytext); +void return_void(const char * yytext); + } } diff --git a/include/triton/ast/parser.y b/include/triton/ast/parser.y index 724f4240b..8ce55f372 100644 --- a/include/triton/ast/parser.y +++ b/include/triton/ast/parser.y @@ -1,3 +1,5 @@ +%define parse.error verbose + %{ namespace triton{ namespace ast{ @@ -8,7 +10,6 @@ using namespace triton::ast; #define YYSTYPE node* #include "../include/triton/ast/ast.h" -#define YYERROR_VERBOSE 1 extern char* yytext; void yyerror(const char *s); int yylex(void); @@ -44,7 +45,7 @@ UNARY_OP_T get_unary_op(node *op) { return ((token*)op)->unary_op; } TYPE_T get_type_spec(node *op) { return ((token*)op)->type; } STORAGE_SPEC_T get_storage_spec(node *op) { return ((token*)op)->storage_spec;} %} - + %token IDENTIFIER CONSTANT STRING_LITERAL %token TUNABLE KERNEL RESTRICT READONLY WRITEONLY CONST CONSTANT_SPACE %token PTR_OP INC_OP DEC_OP LEFT_OP RIGHT_OP LE_OP GE_OP EQ_OP NE_OP @@ -385,3 +386,7 @@ function_definition : declaration_specifiers declarator compound_statement { $$ = new function_definition($1, $2, $3); } ; +%% +void yyerror (const char *s){ + print_error(s); +} diff --git a/include/triton/ast/scanner.l b/include/triton/ast/scanner.l index e3bf32be0..91b700655 100644 --- a/include/triton/ast/scanner.l +++ b/include/triton/ast/scanner.l @@ -8,107 +8,104 @@ IS (u|U|l|L)* %{ #include #include 
"parser.hpp" +#include "../include/triton/ast/ast.h" +using triton::ast::return_impl; +using triton::ast::return_void; %} %% -"__constant__" { return(CONSTANT_SPACE); } -"const" { return(CONST); } -"tunable" { return(TUNABLE); } -"kernel" { return(KERNEL); } -"restrict" { return(RESTRICT); } -"read_only" { return(READONLY); } -"write_only" { return(WRITEONLY); } -"@" { return(AT); } -"newaxis" { return(NEWAXIS); } -"if" { return(IF); } -"else" { return(ELSE); } -"for" { return(FOR); } -"void" { return(VOID); } -"uint1" { return(UINT1); } -"uint8" { return(UINT8); } -"uint16" { return(UINT16); } -"uint32" { return(UINT32); } -"uint64" { return(UINT64); } -"int1" { return(INT1); } -"int8" { return(INT8); } -"int16" { return(INT16); } -"int32" { return(INT32); } -"int64" { return(INT64); } -"fp32" { return(FP32); } -"fp64" { return(FP64); } -"..." { return(ELLIPSIS); } -"get_global_range" { return GET_GLOBAL_RANGE; } -"dot" { return DOT;} -"continue" { return(CONTINUE); } -"alloc_const" { return(ALLOC_CONST); } -{L}({L}|{D})* { return(IDENTIFIER); } +"__constant__" { return return_impl(CONSTANT_SPACE, yytext); } +"const" { return return_impl(CONST, yytext); } +"tunable" { return return_impl(TUNABLE, yytext); } +"kernel" { return return_impl(KERNEL, yytext); } +"restrict" { return return_impl(RESTRICT, yytext); } +"read_only" { return return_impl(READONLY, yytext); } +"write_only" { return return_impl(WRITEONLY, yytext); } +"@" { return return_impl(AT, yytext); } +"newaxis" { return return_impl(NEWAXIS, yytext); } +"if" { return return_impl(IF, yytext); } +"else" { return return_impl(ELSE, yytext); } +"for" { return return_impl(FOR, yytext); } +"void" { return return_impl(VOID, yytext); } +"uint1" { return return_impl(UINT1, yytext); } +"uint8" { return return_impl(UINT8, yytext); } +"uint16" { return return_impl(UINT16, yytext); } +"uint32" { return return_impl(UINT32, yytext); } +"uint64" { return return_impl(UINT64, yytext); } +"int1" { return return_impl(INT1, yytext); } +"int8" { return return_impl(INT8, yytext); } +"int16" { return return_impl(INT16, yytext); } +"int32" { return return_impl(INT32, yytext); } +"int64" { return return_impl(INT64, yytext); } +"fp32" { return return_impl(FP32, yytext); } +"fp64" { return return_impl(FP64, yytext); } +"..." { return return_impl(ELLIPSIS, yytext); } +"get_global_range" { return return_impl(GET_GLOBAL_RANGE, yytext); } +"dot" { return return_impl(DOT, yytext); } +"continue" { return return_impl(CONTINUE, yytext); } +"alloc_const" { return return_impl(ALLOC_CONST, yytext); } +{L}({L}|{D})* { return return_impl(IDENTIFIER, yytext); } -0[xX]{H}+{IS}? { return(CONSTANT); } -0{D}+{IS}? { return(CONSTANT); } -{D}+{IS}? { return(CONSTANT); } -L?'(\\.|[^\\'])+' { return(CONSTANT); } +0[xX]{H}+{IS}? { return return_impl(CONSTANT, yytext); } +0{D}+{IS}? { return return_impl(CONSTANT, yytext); } +{D}+{IS}? { return return_impl(CONSTANT, yytext); } +L?'(\\.|[^\\'])+' { return return_impl(CONSTANT, yytext); } -{D}+{E}{FS}? { return(CONSTANT); } -{D}*"."{D}+({E})?{FS}? { return(CONSTANT); } -{D}+"."{D}*({E})?{FS}? { return(CONSTANT); } +{D}+{E}{FS}? { return return_impl(CONSTANT, yytext); } +{D}*"."{D}+({E})?{FS}? { return return_impl(CONSTANT, yytext); } +{D}+"."{D}*({E})?{FS}? 
{ return return_impl(CONSTANT, yytext); } -L?\"(\\.|[^\\"])*\" { return(STRING_LITERAL); } +L?\"(\\.|[^\\"])*\" { return return_impl(STRING_LITERAL, yytext); } -">>=" { return(RIGHT_ASSIGN); } -"<<=" { return(LEFT_ASSIGN); } -"+=" { return(ADD_ASSIGN); } -"-=" { return(SUB_ASSIGN); } -"*=" { return(MUL_ASSIGN); } -"/=" { return(DIV_ASSIGN); } -"%=" { return(MOD_ASSIGN); } -"&=" { return(AND_ASSIGN); } -"^=" { return(XOR_ASSIGN); } -"|=" { return(OR_ASSIGN); } -">>" { return(RIGHT_OP); } -"<<" { return(LEFT_OP); } -"++" { return(INC_OP); } -"--" { return(DEC_OP); } -"->" { return(PTR_OP); } -"&&" { return(AND_OP); } -"||" { return(OR_OP); } -"<=" { return(LE_OP); } -">=" { return(GE_OP); } -"==" { return(EQ_OP); } -"!=" { return(NE_OP); } -";" { return(';'); } -("{"|"<%") { return('{'); } -("}"|"%>") { return('}'); } -"," { return(','); } -":" { return(':'); } -"=" { return('='); } -"(" { return('('); } -")" { return(')'); } -("["|"<:") { return('['); } -("]"|":>") { return(']'); } -"." { return('.'); } -"&" { return('&'); } -"!" { return('!'); } -"~" { return('~'); } -"-" { return('-'); } -"+" { return('+'); } -"*" { return('*'); } -"/" { return('/'); } -"%" { return('%'); } -"<" { return('<'); } -">" { return('>'); } -"^" { return('^'); } -"|" { return('|'); } -"?" { return('?'); } - -[ \t\v\n\f] { } +">>=" { return return_impl(RIGHT_ASSIGN, yytext); } +"<<=" { return return_impl(LEFT_ASSIGN, yytext); } +"+=" { return return_impl(ADD_ASSIGN, yytext); } +"-=" { return return_impl(SUB_ASSIGN, yytext); } +"*=" { return return_impl(MUL_ASSIGN, yytext); } +"/=" { return return_impl(DIV_ASSIGN, yytext); } +"%=" { return return_impl(MOD_ASSIGN, yytext); } +"&=" { return return_impl(AND_ASSIGN, yytext); } +"^=" { return return_impl(XOR_ASSIGN, yytext); } +"|=" { return return_impl(OR_ASSIGN, yytext); } +">>" { return return_impl(RIGHT_OP, yytext); } +"<<" { return return_impl(LEFT_OP, yytext); } +"++" { return return_impl(INC_OP, yytext); } +"--" { return return_impl(DEC_OP, yytext); } +"->" { return return_impl(PTR_OP, yytext); } +"&&" { return return_impl(AND_OP, yytext); } +"||" { return return_impl(OR_OP, yytext); } +"<=" { return return_impl(LE_OP, yytext); } +">=" { return return_impl(GE_OP, yytext); } +"==" { return return_impl(EQ_OP, yytext); } +"!=" { return return_impl(NE_OP, yytext); } +";" { return return_impl(';', yytext); } +("{"|"<%") { return return_impl('{', yytext); } +("}"|"%>") { return return_impl('}', yytext); } +"," { return return_impl(',', yytext); } +":" { return return_impl(':', yytext); } +"=" { return return_impl('=', yytext); } +"(" { return return_impl('(', yytext); } +")" { return return_impl(')', yytext); } +("["|"<:") { return return_impl('[', yytext); } +("]"|":>") { return return_impl(']', yytext); } +"." { return return_impl('.', yytext); } +"&" { return return_impl('&', yytext); } +"!" { return return_impl('!', yytext); } +"~" { return return_impl('~', yytext); } +"-" { return return_impl('-', yytext); } +"+" { return return_impl('+', yytext); } +"*" { return return_impl('*', yytext); } +"/" { return return_impl('/', yytext); } +"%" { return return_impl('%', yytext); } +"<" { return return_impl('<', yytext); } +">" { return return_impl('>', yytext); } +"^" { return return_impl('^', yytext); } +"|" { return return_impl('|', yytext); } +"?" { return return_impl('?', yytext); } +[ \t\v\n\f] { return_void(yytext);} . 
{ /* ignore bad characters */ } %% int yywrap() { return(1); } - -void yyerror (const char *s) /* Called by yyparse on error */ -{ - printf ("Error: %s\n", s); -} diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index db9c9ed2a..cac80b262 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -8,12 +8,16 @@ #include "triton/ir/builder.h" #include "triton/ir/type.h" #include +#include namespace triton{ namespace ast{ +static int current_line = 0; +static int current_column = 0; + /* node */ ir::value *node::explicit_cast(ir::builder &builder, ir::value *src, ir::type *dst_ty){ ir::type *src_scalar_ty = src->get_type()->get_scalar_ty(); @@ -705,6 +709,42 @@ ir::value* named_expression::codegen(ir::module *mod) const{ } +// begin token +void update_location(const char *text) { + for (int i = 0; text[i] != '\0'; i++){ + if (text[i] == '\n'){ + current_column = 0; + current_line++; + } + else if (text[i] == '\t') + current_column += 8 - (current_column % 8); + else + current_column++; + } +} + +void print_error(const char *cerror) { + std::string error(cerror); + auto it = error.find("syntax error,"); + error.replace(it, 13, ""); + std::cerr << "error at line " << current_line << " (column " << current_column << "): " << error << std::endl; + throw std::runtime_error("compilation failed"); +} + +char return_impl(char t, const char * yytext) { + update_location(yytext); + return t; +} + +yytokentype return_impl(yytokentype t, const char * yytext){ + update_location(yytext); + return t; +} + +void return_void(const char * yytext){ + update_location(yytext); +} + } } From d049679aa285e27569718072308d470034f90335 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 8 Mar 2019 23:58:42 -0500 Subject: [PATCH 098/494] [general] added simple jit interface --- examples/matrix.cpp | 284 +++++-------------------------- include/triton/driver/backend.h | 54 +++--- include/triton/driver/buffer.h | 18 +- include/triton/driver/context.h | 18 +- include/triton/driver/cublas.h | 22 +-- include/triton/driver/device.h | 10 +- include/triton/driver/dispatch.h | 6 +- include/triton/driver/event.h | 6 +- include/triton/driver/handle.h | 16 +- include/triton/driver/kernel.h | 16 +- include/triton/driver/module.h | 20 +-- include/triton/driver/platform.h | 8 +- include/triton/driver/stream.h | 28 +-- include/triton/jit.h | 45 +++++ lib/driver/backend.cpp | 70 ++++---- lib/driver/buffer.cpp | 10 +- lib/driver/context.cpp | 18 +- lib/driver/device.cpp | 42 ++--- lib/driver/dispatch.cpp | 8 +- lib/driver/event.cpp | 2 +- lib/driver/handle.cpp | 20 +-- lib/driver/kernel.cpp | 12 +- lib/driver/module.cpp | 10 +- lib/driver/platform.cpp | 12 +- lib/driver/stream.cpp | 16 +- lib/jit.cpp | 151 ++++++++++++++++ 26 files changed, 458 insertions(+), 464 deletions(-) create mode 100644 include/triton/jit.h create mode 100644 lib/jit.cpp diff --git a/examples/matrix.cpp b/examples/matrix.cpp index ec818ae58..70125c90d 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -1,40 +1,8 @@ #include #include -#include "cuda.h" -#include "llvm/IR/Verifier.h" -#include "triton/ast/ast.h" -#include "triton/ir/context.h" -#include "triton/ir/module.h" -#include "triton/ir/print.h" -#include "triton/ir/context_impl.h" -#include "triton/codegen/selection.h" -#include "triton/codegen/tune.h" -#include "triton/codegen/shared_copy.h" -#include "triton/codegen/allocation.h" -#include "triton/codegen/liveness.h" -#include "triton/codegen/vectorize.h" -#include "triton/codegen/buffer_info.h" -#include 
"triton/codegen/barriers.h" -#include "llvm/IR/IRPrintingPasses.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/PassManager.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/TargetSelect.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/Transforms/Scalar/EarlyCSE.h" -#include "llvm/Analysis/LoopPass.h" - -typedef struct yy_buffer_state * YY_BUFFER_STATE; -extern int yyparse(); -extern YY_BUFFER_STATE yy_scan_string(const char * str); -extern void yy_delete_buffer(YY_BUFFER_STATE buffer); -using triton::ast::translation_unit; -extern translation_unit *ast_root; +#include "triton/jit.h" +#include "triton/driver/backend.h" +#include "triton/driver/stream.h" const char* src = R"( @@ -44,7 +12,7 @@ const tunable int32 TK; void matmul(restrict read_only fp32 *a, restrict read_only fp32 *b, fp32 *c, int32 M, int32 N, int32 K, int32 bound){ - int32 rxa[TM] = get_global_range[TM](0) + int32 rxa[TM] = get_global_range[TM](0); int32 ryb[TN] = get_global_range[TN](1); int32 rka[TK] = 0 ... TK; int32 rkb[TK] = 0 ... TK; @@ -83,81 +51,6 @@ void matmul(restrict read_only fp32 *a, restrict read_only fp32 *b, fp32 *c, } )"; -static std::string compute_data_layout(bool is64Bit, bool UseShortPointers) { - std::string Ret = "e"; - if (!is64Bit) - Ret += "-p:32:32"; - else if (UseShortPointers) - Ret += "-p3:32:32-p4:32:32-p5:32:32"; - Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64"; - return Ret; -} - -static std::string generate_machine_code(llvm::Module &module, const std::string &target_triple, const std::string &data_layout) { - llvm::InitializeAllTargetInfos(); - llvm::InitializeAllTargets(); - llvm::InitializeAllTargetMCs(); - llvm::InitializeAllAsmParsers(); - llvm::InitializeAllAsmPrinters(); - - module.setTargetTriple(target_triple); - std::string error; - auto target = llvm::TargetRegistry::lookupTarget(module.getTargetTriple(), error); - llvm::TargetMachine *machine = target->createTargetMachine(module.getTargetTriple(), "sm_52", "", - llvm::TargetOptions(), llvm::Reloc::Model(), - llvm::None, llvm::CodeGenOpt::Aggressive); - module.setDataLayout(data_layout); - - // emit machine code - llvm::legacy::PassManager pass; - llvm::SmallVector buffer; - llvm::raw_svector_ostream stream(buffer); - machine->addPassesToEmitFile(pass, stream, nullptr, llvm::TargetMachine::CGFT_AssemblyFile); - pass.run(module); - std::string src(buffer.begin(), buffer.end()); - return src; -} - -static void __checkCudaErrors( CUresult err, const char *file, const int line ) -{ - if( CUDA_SUCCESS != err) { - fprintf(stderr, - "CUDA Driver API error = %04d from file <%s>, line %i.\n", - err, file, line ); - exit(-1); - } -} -#define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__) - -static void compile_machine_code(CUdevice &device, CUcontext &context, CUmodule &module, - CUfunction &function, CUstream &stream, int &major, int &minor, - const std::string &src, const std::string &name) { - int numDevices; - - // Initialize - checkCudaErrors(cuInit(0)); - checkCudaErrors(cuDeviceGetCount(&numDevices)); - checkCudaErrors(cuDeviceGet(&device, 0)); - checkCudaErrors(cuDeviceComputeCapability(&major, &minor, device)); - checkCudaErrors(cuCtxCreate(&context, 0, device)); - checkCudaErrors(cuStreamCreate(&stream, 0)); - - // Compile program - CUjit_option opt[] = 
{CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; - unsigned int errbufsize = 8096; - std::string errbuf(errbufsize, 0); - const void *cpterr = static_cast(errbuf.data()); - void *pterr = const_cast(cpterr); - void* optval[] = {(void*)(uintptr_t)errbufsize, pterr}; - int err = cuModuleLoadDataEx(&module, src.data(), 2, opt, optval); - if(err != CUDA_SUCCESS){ - std::cerr << "Compilation Failed! Log: " << std::endl; - std::cerr << errbuf << std::endl; - } - - // Get function - checkCudaErrors(cuModuleGetFunction(&function, module, name.c_str())); -} template void simple_gemm(std::vector &c, const std::vector &a, const std::vector &b, size_t M, size_t N, size_t K){ @@ -170,54 +63,7 @@ void simple_gemm(std::vector &c, const std::vector &a, const std::vector const & ranges, std::function const &)> const & f){ - size_t D = ranges.size(); - std::vector values(D, 0); - // Start with innermost loop - size_t i = D - 1; - while(true){ - //Execute function - f(values); - //Increment counters - while(values[i]++ == ranges[i] - 1){ - if(i == 0) - return; - values[i--] = 0; - } - i = D - 1; - } -} - int main() { - // create AST from Triton-C source - YY_BUFFER_STATE buffer = yy_scan_string(src); - yyparse(); - yy_delete_buffer(buffer); - translation_unit *program = ast_root; - - // create Triton-IR from AST - triton::ir::context context; - triton::ir::module module("matrix", context); - program->codegen(&module); - llvm::LLVMContext llvm_context; - llvm::Module llvm_module("matmul", llvm_context); - - - - // create passes - triton::codegen::buffer_info_pass buffer_info; - triton::codegen::place_shared_copy shared(&buffer_info); - triton::codegen::tune tune; - triton::codegen::liveness liveness(&buffer_info); - triton::codegen::allocation allocation(&liveness, &buffer_info); - triton::codegen::barriers barriers(&allocation, &buffer_info); - triton::codegen::vectorize vectorize(&tune); - triton::codegen::selection selection(&allocation, &tune, &buffer_info); - - triton::ir::print(module, std::cout); - - // tuning parameters - tune.run(module); std::vector params = { // shapes 16, 16, 8, @@ -232,97 +78,49 @@ int main() { // b1 1, 8, 1 }; + unsigned TM = params[0]; + unsigned TN = params[1]; + unsigned nthreads = params[10]*params[13]*params[11]*params[14]; + auto context = triton::driver::backend::contexts::get_default(); + triton::jit jit(context); + jit.add_module(src, params); + triton::driver::kernel kernel = jit.get_function("matmul"); - // meta-parameters - unsigned i = 0; - context.p_impl->mp_constants_[0]->set_value(params[0]); - context.p_impl->mp_constants_[1]->set_value(params[1]); - context.p_impl->mp_constants_[2]->set_value(params[2]); - for(unsigned *x: tune.get_params(module)) - *x = params[3 + i++]; - - - - // constraints - std::map> errors; - tune.check_constraints(module, errors); - std::cout << "errors: " << errors.size() << std::endl; - for(auto &x: errors){ - for(auto &e: x.second) - std::cout << x.first->get_name() << " " << e << std::endl; - } - if(errors.size()) - exit(EXIT_FAILURE); - - - - // run passes - buffer_info.run(module); - shared.run(module); - liveness.run(module); - allocation.run(); - barriers.run(module); - vectorize.run(module); - selection.run(module, llvm_module); - - // llvm source - llvm::legacy::PassManager manager; - manager.add(llvm::createPrintModulePass(llvm::outs())); - manager.add(llvm::createVerifierPass(true)); - manager.run(llvm_module); - - std::string src = generate_machine_code(llvm_module, "nvptx64-nvidia-cuda", 
compute_data_layout(true, true)); - std::cout << src << std::endl; - - // compile machine code - CUdevice cu_device; - CUcontext cu_context; - CUmodule cu_module; - CUfunction cu_kernel; - CUstream cu_stream; - int major, minor; - compile_machine_code(cu_device, cu_context, cu_module, cu_kernel, cu_stream, major, minor, src, "matmul"); - - // execute machine code - // Allocate buffers - typedef float numeric_t; size_t M = 128, N = 128, K = 128; size_t bound = 8; - std::vector c(M*N); - std::vector rc(M*N); - std::vector a(M*K); - std::vector b(K*N); + std::vector hc(M*N); + std::vector rc(M*N); + std::vector ha(M*K); + std::vector hb(K*N); srand(0); - for(size_t i = 0; i < a.size(); i++) - a[i] = 1; - for(size_t i = 0; i < b.size(); i++) - b[i] = 1; - for(size_t i = 0; i < c.size(); i++) - c[i] = 0; - CUdeviceptr d_a, d_b, d_c; - checkCudaErrors(cuMemAlloc(&d_a, sizeof(numeric_t) * a.size())); - checkCudaErrors(cuMemAlloc(&d_b, sizeof(numeric_t) * b.size())); - checkCudaErrors(cuMemAlloc(&d_c, sizeof(numeric_t) * c.size())); - // Copy buffers - checkCudaErrors(cuMemcpyHtoD(d_a, a.data(), sizeof(numeric_t) * a.size())); - checkCudaErrors(cuMemcpyHtoD(d_b, b.data(), sizeof(numeric_t) * b.size())); - checkCudaErrors(cuMemcpyHtoD(d_c, c.data(), sizeof(numeric_t) * c.size())); - // Launch kernel - void *args[] = { &d_a, &d_b, &d_c, &M, &N, &K, &bound}; - int num_regs; - cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, cu_kernel); - unsigned TM = context.p_impl->mp_constants_[0]->get_value(); - unsigned TN = context.p_impl->mp_constants_[1]->get_value(); - unsigned nthreads = params[10]*params[13]*params[11]*params[14]; - checkCudaErrors(cuLaunchKernel(cu_kernel, (M + TM - 1)/TM, (N + TN - 1)/TN, 1, nthreads, 1, 1, 0, cu_stream, args, NULL)); - checkCudaErrors(cuStreamSynchronize(cu_stream)); - // Write back - checkCudaErrors(cuMemcpyDtoH(c.data(), d_c, sizeof(numeric_t) * c.size())); - simple_gemm(rc, a, b, M, N, K); + for(size_t i = 0; i < ha.size(); i++) + ha[i] = 1; + for(size_t i = 0; i < hb.size(); i++) + hb[i] = 1; + for(size_t i = 0; i < hc.size(); i++) + hc[i] = 0; + triton::driver::buffer dc(context, hc.size()*4); + triton::driver::buffer da(context, ha.size()*4); + triton::driver::buffer db(context, hb.size()*4); + triton::driver::stream stream(context); + stream.write(da, true, 0, ha); + stream.write(db, true, 0, hb); + stream.write(dc, true, 0, hc); + kernel.setArg(0, da); + kernel.setArg(1, db); + kernel.setArg(2, dc); + kernel.setArg(3, M); + kernel.setArg(4, N); + kernel.setArg(5, K); + kernel.setArg(6, bound); + stream.enqueue(kernel, {(M + TM - 1)/TM, (N + TN - 1)/TN, 1}, {nthreads, 1, 1}); + stream.synchronize(); + stream.read(dc, true, 0, hc); + simple_gemm(rc, ha, hb, M, N, K); for(size_t i = 0; i < M*N; i++) - if(std::abs(c[i] - rc[i])/std::max(c[i], rc[i]) > 1e-4){ - std::cout << i << " " << c[i] << " " << rc[i] << std::endl; + if(std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ + std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; exit(EXIT_FAILURE); } std::cout << "Pass!" 
<< std::endl; diff --git a/include/triton/driver/backend.h b/include/triton/driver/backend.h index 0af719c29..d08422377 100755 --- a/include/triton/driver/backend.h +++ b/include/triton/driver/backend.h @@ -33,13 +33,13 @@ namespace triton namespace driver { -class Buffer; -class Stream; -class Device; -class Context; -class Platform; -class Module; -class Kernel; +class buffer; +class stream; +class device; +class context; +class platform; +class module; +class kernel; struct backend { @@ -49,9 +49,9 @@ struct backend friend class backend; public: static void release(); - static Module& get(Stream const & stream, std::string const & name, std::string const &src); + static module& get(driver::stream const & stream, std::string const & name, std::string const &src); private: - static std::map, Module * > cache_; + static std::map, module * > cache_; }; class kernels @@ -59,53 +59,53 @@ struct backend friend class backend; public: static void release(); - static Kernel & get(Module const & program, std::string const & name); + static kernel & get(driver::module const & program, std::string const & name); private: - static std::map, Kernel * > cache_; + static std::map, kernel * > cache_; }; class contexts { friend class backend; private: - static void init(std::vector const &); + static void init(std::vector const &); static void release(); public: - static Context const & get_default(); + static driver::context const & get_default(); template - static Context const & import(T context) + static driver::context const & import(T ctx) { - for(driver::Context const * x: cache_) - if((T)*x==context) + for(driver::context const * x: cache_) + if((T)*x==ctx) return *x; - cache_.emplace_back(new Context(context, false)); + cache_.emplace_back(new driver::context(ctx, false)); return *cache_.back(); } - static void get(std::list &); + static void get(std::list &); private: - static std::list cache_; + static std::list cache_; }; class streams { friend class backend; private: - static void init(std::list const &); + static void init(std::list const &); static void release(); public: - static void get(Context const &, std::vector &streams); - static Stream & get(Context const &, unsigned int id = 0); - static Stream & get_default(); + static void get(driver::context const &, std::vector &streams); + static stream & get(driver::context const &, unsigned int id = 0); + static stream & get_default(); private: - static std::map< Context, std::vector > cache_; + static std::map< context, std::vector > cache_; }; static void init(); static void release(); - static std::vector devices(); - static std::vector platforms(); - static void synchronize(Context const &); + static std::vector devices(); + static std::vector platforms(); + static void synchronize(driver::context const &); static unsigned int default_device; }; diff --git a/include/triton/driver/buffer.h b/include/triton/driver/buffer.h index 21603f9c4..351a58026 100755 --- a/include/triton/driver/buffer.h +++ b/include/triton/driver/buffer.h @@ -31,21 +31,21 @@ namespace triton namespace driver { -class Stream; +class stream; // Buffer -class Buffer: public HandleInterface +class buffer: public handle_interface { public: - Buffer(Context const & context, size_t size); - Buffer(Context const & context, CUdeviceptr cu, bool take_ownership); - void set_zero(Stream const & queue, size_t size); - Handle const & cu() const; - Handle & cu(); + buffer(driver::context const & context, size_t size); + buffer(driver::context const & context, CUdeviceptr cu, bool 
take_ownership); + void set_zero(stream const & queue, size_t size); + handle const & cu() const; + handle & cu(); private: - Context context_; - Handle cu_; + context context_; + handle cu_; }; } diff --git a/include/triton/driver/context.h b/include/triton/driver/context.h index f1c6bca7a..fbca8c88a 100755 --- a/include/triton/driver/context.h +++ b/include/triton/driver/context.h @@ -31,7 +31,7 @@ namespace triton namespace driver { -class Context: public HandleInterface +class context: public handle_interface { private: static std::string get_cache_path(); @@ -39,25 +39,25 @@ private: public: //Constructors - explicit Context(CUcontext context, bool take_ownership = true); - explicit Context(Device const & device); + explicit context(CUcontext context, bool take_ownership = true); + explicit context(driver::device const & dvc); //Accessors - Device const & device() const; + driver::device const & device() const; std::string const & cache_path() const; - Handle const & cu() const; + handle const & cu() const; private: - Handle cu_; - Device device_; + handle cu_; + driver::device dvc_; std::string cache_path_; }; class ContextSwitcher{ public: - ContextSwitcher(Context const & ctx); + ContextSwitcher(driver::context const & ctx); ~ContextSwitcher(); private: - Context const & ctx_; + driver::context const & ctx_; }; } diff --git a/include/triton/driver/cublas.h b/include/triton/driver/cublas.h index 857709106..b58fa0856 100755 --- a/include/triton/driver/cublas.h +++ b/include/triton/driver/cublas.h @@ -51,7 +51,7 @@ static const std::vector cublasAlgorithms = { static const std::map cudtype = {{FLOAT_TYPE, CUDA_R_32F}, {DOUBLE_TYPE,CUDA_R_64F}}; static const std::map cuop = {{'N', CUBLAS_OP_N}, {'T', CUBLAS_OP_T}}; -inline cublasGemmAlgo_t cublasGemmFastest(Stream& stream, cublasHandle_t handle, cudaDataType cudt, cublasOperation_t AT, cublasOperation_t BT, int32_t M, int32_t N, int32_t K, +inline cublasGemmAlgo_t cublasGemmFastest(stream& stream, cublasHandle_t handle, cudaDataType cudt, cublasOperation_t AT, cublasOperation_t BT, int32_t M, int32_t N, int32_t K, void* alpha, CUdeviceptr A, int32_t lda, CUdeviceptr B, int32_t ldb, void* beta, CUdeviceptr C, int32_t ldc){ @@ -84,7 +84,7 @@ inline void cublasGemmEx(cublasHandle_t handle, cudaDataType cudt, cublasOperati /* Simplified API for default GEMM */ -inline void cublasGemm(DType dtype, Stream& stream, char cAT, char cBT, int32_t M, int32_t N, int32_t K, scalar alpha, Buffer const & A, int32_t lda, Buffer const & B, int32_t ldb, scalar beta, Buffer& C, int32_t ldc, cublasGemmAlgo_t* fastest = NULL, cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT){ +inline void cublasGemm(DType dtype, stream& stream, char cAT, char cBT, int32_t M, int32_t N, int32_t K, scalar alpha, buffer const & A, int32_t lda, buffer const & B, int32_t ldb, scalar beta, buffer& C, int32_t ldc, cublasGemmAlgo_t* fastest = NULL, cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT){ ContextSwitcher ctx_switch(stream.context()); cublasHandle_t handle = dispatch::cublasHandle(stream.context()); dispatch::cublasSetStream_v2(handle, (CUstream)stream); @@ -111,9 +111,9 @@ inline cudnnTensorFormat_t format(cudnnDataType_t cutype){ } } -inline void cudnnConv(DType dtype, Stream& stream, int32_t D, int32_t H, int32_t W, int32_t N, int32_t K, int32_t M, int32_t P, int32_t Q, int32_t C, int32_t T, int32_t R, int32_t S, - int32_t pad_d, int32_t pad_h, int32_t pad_w, int32_t stride_d, int32_t stride_h, int32_t stride_w, scalar alpha, Buffer const & I, Buffer const & F, scalar beta, Buffer 
const & O){ - driver::Context const & ctx = stream.context(); +inline void cudnnConv(DType dtype, stream& stream, int32_t D, int32_t H, int32_t W, int32_t N, int32_t K, int32_t M, int32_t P, int32_t Q, int32_t C, int32_t T, int32_t R, int32_t S, + int32_t pad_d, int32_t pad_h, int32_t pad_w, int32_t stride_d, int32_t stride_h, int32_t stride_w, scalar alpha, buffer const & I, buffer const & F, scalar beta, buffer const & O){ + driver::driver::context const & ctx = stream.context(); ContextSwitcher switch_ctx(ctx); std::vector pad = {pad_d, pad_h, pad_w}; @@ -154,16 +154,16 @@ inline void cudnnConv(DType dtype, Stream& stream, int32_t D, int32_t H, int32_t size_t workspace_size; dispatch::cudnnGetConvolutionForwardWorkspaceSize(handle, tI, tF, conv, tO, algo, &workspace_size); - static Buffer work(ctx, 1024*1024*64); + static buffer work(ctx, 1024*1024*64); CUdeviceptr twork = work; CUdeviceptr pI = I, pF = F, pO = O; dispatch::cudnnConvolutionForward(handle, alpha.data(), tI, (void*)pI, tF, (void*)pF, conv, algo, (void*)twork, workspace_size, beta.data(), tO, (void*)pO); } -inline void cudnnPool(DType dtype, Stream& stream, int32_t D, int32_t H, int32_t W, int32_t N, int32_t K, int32_t M, int32_t P, int32_t Q, int32_t T, int32_t R, int32_t S, - int32_t pad_d, int32_t pad_h, int32_t pad_w, int32_t stride_d, int32_t stride_h, int32_t stride_w, scalar alpha, Buffer const & I, scalar beta, Buffer const & O){ - driver::Context const & ctx = stream.context(); +inline void cudnnPool(DType dtype, stream& stream, int32_t D, int32_t H, int32_t W, int32_t N, int32_t K, int32_t M, int32_t P, int32_t Q, int32_t T, int32_t R, int32_t S, + int32_t pad_d, int32_t pad_h, int32_t pad_w, int32_t stride_d, int32_t stride_h, int32_t stride_w, scalar alpha, buffer const & I, scalar beta, buffer const & O){ + driver::driver::context const & ctx = stream.context(); ContextSwitcher switch_ctx(ctx); std::vector pad = {pad_d, pad_h, pad_w}; @@ -200,11 +200,11 @@ inline void cudnnPool(DType dtype, Stream& stream, int32_t D, int32_t H, int32_t dispatch::cudnnPoolingForward(handle, desc, alpha.data(), tI, (void*)pI, beta.data(), tO, (void*)pO); } -inline void cudnnTransformTensor(driver::Stream & stream, +inline void cudnnTransformTensor(driver::stream & stream, DType in_dtype, DType out_dtype, cudnnTensorFormat_t in_layout, cudnnTensorFormat_t out_layout, int32_t N, int32_t C, int32_t D, int32_t H, int32_t W, - scalar alpha, driver::Buffer const & I, scalar beta, driver::Buffer& O) + scalar alpha, driver::buffer const & I, scalar beta, driver::buffer& O) { cudnnHandle_t handle = dispatch::cudnnHandle(stream.context()); dispatch::cudnnSetStream(handle, (CUstream)stream); diff --git a/include/triton/driver/device.h b/include/triton/driver/device.h index 2263cffc6..7f64b614a 100755 --- a/include/triton/driver/device.h +++ b/include/triton/driver/device.h @@ -33,7 +33,7 @@ namespace driver { // Device -class Device: public HandleInterface +class device: public handle_interface { public: //Supported architectures @@ -61,14 +61,14 @@ private: inline nvmlDevice_t nvml_device() const; public: - Device(CUdevice cu = CUdevice(), bool take_ownership = true): cu_(cu, take_ownership){} + device(CUdevice cu = CUdevice(), bool take_ownership = true): cu_(cu, take_ownership){} //Accessors Architecture architecture() const; - Handle const & cu() const; + handle const & cu() const; //Informations std::string infos() const; size_t address_bits() const; - driver::Platform platform() const; + driver::platform platform() const; std::vector 
max_block_dim() const; size_t max_threads_per_block() const; size_t max_shared_memory() const; @@ -87,7 +87,7 @@ public: size_t max_mem_clock() const; private: - Handle cu_; + handle cu_; std::shared_ptr> interpreted_as_; }; diff --git a/include/triton/driver/dispatch.h b/include/triton/driver/dispatch.h index aa1d412de..2357756d6 100755 --- a/include/triton/driver/dispatch.h +++ b/include/triton/driver/dispatch.h @@ -42,7 +42,7 @@ namespace triton namespace driver { -class Context; +class context; template void check(T){} void check(nvrtcResult err); @@ -137,7 +137,7 @@ public: static nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char **headers, const char **includeNames); static nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log); - static cublasHandle_t cublasHandle(Context const & ctx); + static cublasHandle_t cublasHandle(driver::context const & ctx); static cublasStatus_t cublasCreate_v2(cublasHandle_t* h); static cublasStatus_t cublasGetStream_v2(cublasHandle_t h, cudaStream_t *streamId); static cublasStatus_t cublasSetStream_v2(cublasHandle_t h, cudaStream_t streamId); @@ -146,7 +146,7 @@ public: static cublasStatus_t cublasHgemm (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, half* alpha, const half *A, int lda, const half *B, int ldb, half* beta, half *C, int ldc); static cublasStatus_t cublasGemmEx(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const void *alpha, const void *A, cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb, const void *beta, void *C, cudaDataType Ctype, int ldc, cudaDataType computeType, cublasGemmAlgo_t algo); - static cudnnHandle_t cudnnHandle(Context const & ctx); + static cudnnHandle_t cudnnHandle(driver::context const & ctx); static cudnnStatus_t cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc); static cudnnStatus_t cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t* convDesc); static cudnnStatus_t cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc); diff --git a/include/triton/driver/event.h b/include/triton/driver/event.h index 65f29beaf..41cb4fdf4 100755 --- a/include/triton/driver/event.h +++ b/include/triton/driver/event.h @@ -32,14 +32,14 @@ namespace driver { // Event -class Event: public HandleInterface +class Event: public handle_interface { public: float elapsed_time() const; - Handle const & cu() const; + handle const & cu() const; private: - Handle cu_; + handle cu_; }; } diff --git a/include/triton/driver/handle.h b/include/triton/driver/handle.h index 19cdf62f8..c7241cb41 100755 --- a/include/triton/driver/handle.h +++ b/include/triton/driver/handle.h @@ -49,24 +49,24 @@ private: }; template -class HandleInterface{ +class handle_interface{ public: //Accessors operator CUType() const { return *(((T*)this)->cu().h_); } //Comparison - bool operator==(HandleInterface const & y) { return (CUType)(*this) == (CUType)(y); } - bool operator!=(HandleInterface const & y) { return (CUType)(*this) != (CUType)(y); } - bool operator<(HandleInterface const & y) { return (CUType)(*this) < (CUType)(y); } + bool operator==(handle_interface const & y) { return (CUType)(*this) == (CUType)(y); } + bool operator!=(handle_interface const & y) { return (CUType)(*this) != (CUType)(y); } + bool operator<(handle_interface const & y) { return (CUType)(*this) < (CUType)(y); } }; template -class Handle{ +class handle{ public: - template 
friend class HandleInterface; + template friend class handle_interface; public: //Constructors - Handle(CUType cu = CUType(), bool take_ownership = true); - ~Handle(); + handle(CUType cu = CUType(), bool take_ownership = true); + ~handle(); CUType& operator*() { return *h_; } CUType const & operator*() const { return *h_; } CUType* operator->() const { return h_.get(); } diff --git a/include/triton/driver/kernel.h b/include/triton/driver/kernel.h index b29d7b1a4..c4fc207d4 100755 --- a/include/triton/driver/kernel.h +++ b/include/triton/driver/kernel.h @@ -34,27 +34,27 @@ namespace triton namespace driver { -class Buffer; +class buffer; // Kernel -class Kernel: public HandleInterface +class kernel: public handle_interface { public: //Constructors - Kernel(Module const & program, const char * name); + kernel(driver::module const & program, const char * name); //Accessors - Handle const & cu() const; - Module const & module() const; + handle const & cu() const; + driver::module const & module() const; //Arguments setters void setArg(unsigned int index, std::size_t size, void* ptr); - void setArg(unsigned int index, Buffer const &); + void setArg(unsigned int index, buffer const &); template void setArg(unsigned int index, T value) { setArg(index, sizeof(T), (void*)&value); } //Arguments getters void* const* cu_params() const; private: - Handle cu_; - Module program_; + handle cu_; + driver::module program_; unsigned int address_bits_; std::vector > cu_params_store_; std::vector cu_params_; diff --git a/include/triton/driver/module.h b/include/triton/driver/module.h index 3a964df38..43d9db4ee 100755 --- a/include/triton/driver/module.h +++ b/include/triton/driver/module.h @@ -34,22 +34,22 @@ namespace triton namespace driver { -class Context; -class Device; +class context; +class device; -class Module: public HandleInterface +class module: public handle_interface { - static std::string header(Device const & device); + static std::string header(device const & device); public: - Module(Context const & context, std::string const & source); - Context const & context() const; - Handle const & cu() const; - Buffer symbol(const char * name) const; + module(driver::context const & context, std::string const & source); + driver::context const & context() const; + handle const & cu() const; + buffer symbol(const char * name) const; private: - Handle cu_; - Context context_; + handle cu_; + driver::context context_; std::string source_; }; diff --git a/include/triton/driver/platform.h b/include/triton/driver/platform.h index 514e07625..5ab7d8d28 100755 --- a/include/triton/driver/platform.h +++ b/include/triton/driver/platform.h @@ -34,17 +34,17 @@ namespace triton namespace driver { -class Device; +class device; -class Platform +class platform { public: //Accessors std::string name() const { return "CUDA"; } std::string version() const; - std::vector devices() const; + std::vector devices() const; private: - Handle cu_; + handle cu_; }; } diff --git a/include/triton/driver/stream.h b/include/triton/driver/stream.h index a94a33c54..c420fa45f 100755 --- a/include/triton/driver/stream.h +++ b/include/triton/driver/stream.h @@ -35,43 +35,43 @@ namespace triton namespace driver { -class Kernel; +class kernel; class Event; class Range; -class Buffer; +class buffer; // Command Queue -class Stream: public HandleInterface +class stream: public handle_interface { public: //Constructors - Stream(CUstream stream, bool take_ownership); - Stream(Context const & context); + stream(CUstream stream, bool 
take_ownership); + stream(driver::context const & context); //Accessors - Handle const & cu() const; - Context const & context() const; + handle const & cu() const; + driver::context const & context() const; //Synchronize void synchronize(); //Enqueue - void enqueue(Kernel const & kernel, std::array grid, std::array block, std::vector const * = NULL, Event *event = NULL); + void enqueue(kernel const & kernel, std::array grid, std::array block, std::vector const * = NULL, Event *event = NULL); // Write - void write(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr); + void write(driver::buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr); - template void write(Buffer const & buffer, bool blocking, std::size_t offset, std::vector const & x) + template void write(driver::buffer const & buffer, bool blocking, std::size_t offset, std::vector const & x) { write(buffer, blocking, offset, x.size()*sizeof(T), x.data()); } // Read - void read(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr); + void read(driver::buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr); - template void read(Buffer const & buffer, bool blocking, std::size_t offset, std::vector& x) + template void read(driver::buffer const & buffer, bool blocking, std::size_t offset, std::vector& x) { read(buffer, blocking, offset, x.size()*sizeof(T), x.data()); } private: - Context context_; - Handle cu_; + driver::context context_; + handle cu_; }; diff --git a/include/triton/jit.h b/include/triton/jit.h new file mode 100644 index 000000000..07651c258 --- /dev/null +++ b/include/triton/jit.h @@ -0,0 +1,45 @@ +#ifndef TDL_INCLUDE_JIT_H +#define TDL_INCLUDE_JIT_H + +#include +#include +#include "llvm/IR/LLVMContext.h" +#include "triton/ir/context.h" +#include "triton/driver/module.h" +#include "triton/driver/kernel.h" + +namespace llvm { + class Module; +} + +namespace triton { + +namespace ir { +class module; +class context; +} + +class jit { +private: + void init_llvm(); + std::string compute_data_layout(bool is64Bit = true, bool UseShortPointers = true); + std::unique_ptr make_llvm_module(triton::ir::module &module, const std::vector& params); + std::unique_ptr make_triton_module(const std::string &src); + +public: + jit(driver::context context); + void add_module(ir::module &module, const std::vector& params = {}); + void add_module(const std::string &src, const std::vector& params = {}); + driver::kernel get_function(const std::string &name); + +private: + std::vector modules_; + driver::context driver_context_; + llvm::LLVMContext llvm_context_; + ir::context triton_context_; +}; + + +} + +#endif diff --git a/lib/driver/backend.cpp b/lib/driver/backend.cpp index 88e8630e9..b5094790f 100755 --- a/lib/driver/backend.cpp +++ b/lib/driver/backend.cpp @@ -47,14 +47,14 @@ void backend::modules::release(){ cache_.clear(); } -Module& backend::modules::get(Stream const & stream, std::string const & name, std::string const & src){ - std::tuple key(stream, name); +module& backend::modules::get(driver::stream const & stream, std::string const & name, std::string const & src){ + std::tuple key(stream, name); if(cache_.find(key)==cache_.end()) - return *cache_.insert(std::make_pair(key, new Module(stream.context(), src))).first->second; + return *cache_.insert(std::make_pair(key, new module(stream.context(), src))).first->second; return *cache_.at(key); } -std::map, Module * > 
backend::modules::cache_; +std::map, module * > backend::modules::cache_; /*-----------------------------------*/ //----------- Kernels --------------*/ @@ -66,23 +66,23 @@ void backend::kernels::release(){ cache_.clear(); } -Kernel & backend::kernels::get(Module const & program, std::string const & name){ - std::tuple key(program, name); +kernel & backend::kernels::get(driver::module const & program, std::string const & name){ + std::tuple key(program, name); if(cache_.find(key)==cache_.end()) - return *cache_.insert(std::make_pair(key, new Kernel(program, name.c_str()))).first->second; + return *cache_.insert(std::make_pair(key, new kernel(program, name.c_str()))).first->second; return *cache_.at(key); } -std::map, Kernel * > backend::kernels::cache_; +std::map, kernel * > backend::kernels::cache_; /*-----------------------------------*/ //------------ Queues --------------*/ /*-----------------------------------*/ -void backend::streams::init(std::list const & contexts){ - for(Context const * ctx : contexts) +void backend::streams::init(std::list const & contexts){ + for(context const * ctx : contexts) if(cache_.find(*ctx)==cache_.end()) - cache_.insert(std::make_pair(*ctx, std::vector{new Stream(*ctx)})); + cache_.insert(std::make_pair(*ctx, std::vector{new stream(*ctx)})); } void backend::streams::release(){ @@ -92,32 +92,32 @@ void backend::streams::release(){ cache_.clear(); } -Stream & backend::streams::get_default() +stream & backend::streams::get_default() { return get(contexts::get_default(), 0); } -Stream & backend::streams::get(Context const & context, unsigned int id){ - init(std::list(1,&context)); +stream & backend::streams::get(driver::context const & context, unsigned int id){ + init(std::list(1,&context)); for(auto & x : cache_) if(x.first==context) return *x.second[id]; throw; } -void backend::streams::get(Context const & context, std::vector & queues){ - init(std::list(1,&context)); +void backend::streams::get(driver::context const & context, std::vector & queues){ + init(std::list(1,&context)); queues = cache_.at(context); } -std::map > backend::streams::cache_; +std::map > backend::streams::cache_; /*-----------------------------------*/ //------------ Contexts ------------*/ /*-----------------------------------*/ -void backend::contexts::init(std::vector const & platforms){ - for(Platform const & platform: platforms){ - for(Device const & device: platform.devices()) - cache_.push_back(new Context(device)); +void backend::contexts::init(std::vector const & platforms){ + for(platform const & platform: platforms){ + for(device const & device: platform.devices()) + cache_.push_back(new context(device)); } } @@ -127,19 +127,19 @@ void backend::contexts::release(){ cache_.clear(); } -Context const & backend::contexts::get_default(){ +driver::context const & backend::contexts::get_default(){ backend::init(); - std::list::const_iterator it = cache_.begin(); + std::list::const_iterator it = cache_.begin(); std::advance(it, default_device); return **it; } -void backend::contexts::get(std::list & contexts){ +void backend::contexts::get(std::list & contexts){ backend::init(); contexts = cache_; } -std::list backend::contexts::cache_; +std::list backend::contexts::cache_; @@ -147,28 +147,28 @@ std::list backend::contexts::cache_; //------------ General -------------*/ /*-----------------------------------*/ -std::vector backend::devices(){ - std::vector platforms = backend::platforms(); - std::vector result; - for(Platform const & platform: platforms){ +std::vector 
backend::devices(){ + std::vector platforms = backend::platforms(); + std::vector result; + for(platform const & platform: platforms){ auto devices = platform.devices(); result.insert(result.end(), devices.begin(), devices.end()); } return result; } -std::vector backend::platforms(){ - std::vector platforms; +std::vector backend::platforms(){ + std::vector platforms; //if CUDA is here if(dispatch::cuinit()) - platforms.push_back(Platform()); + platforms.push_back(platform()); if(platforms.empty()) throw std::runtime_error("ISAAC: No backend available. Make sure CUDA is available in your library path"); return platforms; } -void backend::synchronize(Context const & context){ - for(Stream * queue: streams::cache_.at(context)) +void backend::synchronize(driver::context const & context){ + for(stream * queue: streams::cache_.at(context)) queue->synchronize(); } @@ -184,7 +184,7 @@ void backend::release(){ void backend::init(){ if(!contexts::cache_.empty()) return; - std::vector platforms = backend::platforms(); + std::vector platforms = backend::platforms(); contexts::init(platforms); streams::init(contexts::cache_); } diff --git a/lib/driver/buffer.cpp b/lib/driver/buffer.cpp index 1ac650397..129565a27 100755 --- a/lib/driver/buffer.cpp +++ b/lib/driver/buffer.cpp @@ -33,26 +33,26 @@ namespace triton namespace driver { -Buffer::Buffer(Context const & context, size_t size) : context_(context) +buffer::buffer(driver::context const & context, size_t size) : context_(context) { ContextSwitcher ctx_switch(context_); dispatch::cuMemAlloc(&*cu_, size); } -Buffer::Buffer(Context const & context, CUdeviceptr cu, bool take_ownership): +buffer::buffer(driver::context const & context, CUdeviceptr cu, bool take_ownership): context_(context), cu_(cu, take_ownership) { } -void Buffer::set_zero(Stream const & queue, size_t size) +void buffer::set_zero(stream const & queue, size_t size) { ContextSwitcher ctx_switch(context_); dispatch::cuMemsetD8Async(*cu_, 0, size, queue); } -Handle const & Buffer::cu() const +handle const & buffer::cu() const { return cu_; } -Handle & Buffer::cu() +handle & buffer::cu() { return cu_; } } diff --git a/lib/driver/context.cpp b/lib/driver/context.cpp index ddaed2b91..6177749b5 100755 --- a/lib/driver/context.cpp +++ b/lib/driver/context.cpp @@ -35,7 +35,7 @@ namespace triton namespace driver { -std::string Context::get_cache_path(){ +std::string context::get_cache_path(){ //user-specified cache path std::string result = tools::getenv("ISAAC_CACHE_PATH"); if(!result.empty()){ @@ -54,7 +54,7 @@ std::string Context::get_cache_path(){ return ""; } -CUdevice Context::device(CUcontext context){ +CUdevice context::device(CUcontext context){ dispatch::cuCtxPushCurrent_v2(context); CUdevice res; dispatch::cuCtxGetDevice(&res); @@ -62,26 +62,26 @@ CUdevice Context::device(CUcontext context){ return res; } -Context::Context(CUcontext context, bool take_ownership): cu_(context, take_ownership), device_(device(context), false), cache_path_(get_cache_path()) +context::context(CUcontext context, bool take_ownership): cu_(context, take_ownership), dvc_(device(context), false), cache_path_(get_cache_path()) { } -Context::Context(Device const & device): device_(device), cache_path_(get_cache_path()) +context::context(driver::device const & device): dvc_(device), cache_path_(get_cache_path()) { dispatch::cuCtxCreate(&*cu_, CU_CTX_SCHED_AUTO, (CUdevice)device); dispatch::cuCtxPopCurrent_v2(NULL); } -Device const & Context::device() const -{ return device_; } +device const & context::device() 
const +{ return dvc_; } -std::string const & Context::cache_path() const +std::string const & context::cache_path() const { return cache_path_; } -Handle const & Context::cu() const +handle const & context::cu() const { return cu_; } /* Context Switcher */ -ContextSwitcher::ContextSwitcher(Context const & ctx): ctx_(ctx) +ContextSwitcher::ContextSwitcher(driver::context const & ctx): ctx_(ctx) { dispatch::cuCtxPushCurrent_v2(ctx_); } diff --git a/lib/driver/device.cpp b/lib/driver/device.cpp index 44f9e29bd..3f7783fbc 100755 --- a/lib/driver/device.cpp +++ b/lib/driver/device.cpp @@ -35,7 +35,7 @@ namespace driver { /* Architecture [NVidia] */ -Device::Architecture Device::nv_arch(std::pair sm) const{ +device::Architecture device::nv_arch(std::pair sm) const{ switch(sm.first) { case 7: @@ -81,13 +81,13 @@ Device::Architecture Device::nv_arch(std::pair sm) c } template -int Device::cuGetInfo() const{ +int device::cuGetInfo() const{ int res; dispatch::cuDeviceGetAttribute(&res, attr, *cu_); return res; } -nvmlDevice_t Device::nvml_device() const{ +nvmlDevice_t device::nvml_device() const{ std::map map; std::string key = pci_bus_id(); if(map.find(key)==map.end()){ @@ -99,33 +99,33 @@ nvmlDevice_t Device::nvml_device() const{ } /* Architecture */ -Device::Architecture Device::architecture() const +device::Architecture device::architecture() const { return nv_arch(compute_capability()); } /* Attributes */ -size_t Device::address_bits() const +size_t device::address_bits() const { return sizeof(size_t)*8; } -driver::Platform Device::platform() const -{ return Platform(); } +driver::platform device::platform() const +{ return platform(); } -std::string Device::name() const{ +std::string device::name() const{ char tmp[128]; dispatch::cuDeviceGetName(tmp, 128, *cu_); return std::string(tmp); } -std::string Device::pci_bus_id() const{ +std::string device::pci_bus_id() const{ char tmp[128]; dispatch::cuDeviceGetPCIBusId(tmp, 128, *cu_); return std::string(tmp); } -void Device::interpret_as(std::pair cc){ +void device::interpret_as(std::pair cc){ interpreted_as_ = std::make_shared>(cc); } -std::pair Device::compute_capability() const{ +std::pair device::compute_capability() const{ if(interpreted_as_) return *interpreted_as_; size_t _major = cuGetInfo(); @@ -133,17 +133,17 @@ std::pair Device::compute_capability() const{ return std::make_pair(_major, _minor); } -size_t Device::max_threads_per_block() const +size_t device::max_threads_per_block() const { return cuGetInfo(); } -size_t Device::max_shared_memory() const +size_t device::max_shared_memory() const { return cuGetInfo(); } -size_t Device::warp_size() const +size_t device::warp_size() const { return cuGetInfo(); } -std::vector Device::max_block_dim() const{ +std::vector device::max_block_dim() const{ std::vector result(3); result[0] = cuGetInfo(); result[1] = cuGetInfo(); @@ -151,33 +151,33 @@ std::vector Device::max_block_dim() const{ return result; } -size_t Device::current_sm_clock() const{ +size_t device::current_sm_clock() const{ unsigned int result; dispatch::nvmlDeviceGetClockInfo(nvml_device(), NVML_CLOCK_SM, &result); return result; } -size_t Device::max_sm_clock() const{ +size_t device::max_sm_clock() const{ unsigned int result; dispatch::nvmlDeviceGetMaxClockInfo(nvml_device(), NVML_CLOCK_SM, &result); return result; } -size_t Device::current_mem_clock() const{ +size_t device::current_mem_clock() const{ unsigned int result; dispatch::nvmlDeviceGetClockInfo(nvml_device(), NVML_CLOCK_MEM, &result); return result; } -size_t 
Device::max_mem_clock() const{ +size_t device::max_mem_clock() const{ unsigned int result; dispatch::nvmlDeviceGetMaxClockInfo(nvml_device(), NVML_CLOCK_MEM, &result); return result; } /* Infos */ -std::string Device::infos() const{ +std::string device::infos() const{ std::ostringstream oss; std::vector max_wi_sizes = max_block_dim(); oss << "Platform: " << platform().name() << std::endl; @@ -188,7 +188,7 @@ std::string Device::infos() const{ return oss.str(); } -Handle const & Device::cu() const +handle const & device::cu() const { return cu_; } } diff --git a/lib/driver/dispatch.cpp b/lib/driver/dispatch.cpp index 9e7a01330..25e4638f8 100755 --- a/lib/driver/dispatch.cpp +++ b/lib/driver/dispatch.cpp @@ -180,16 +180,16 @@ NVML_DEFINE2(nvmlReturn_t, nvmlDeviceGetHandleByPciBusId_v2, const char *, nvmlD NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*) NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*) -cublasHandle_t dispatch::cublasHandle(Context const & ctx){ - static std::map handles; +cublasHandle_t dispatch::cublasHandle(driver::context const & ctx){ + static std::map handles; auto pr = handles.insert({ctx, cublasHandle_t()}); if(pr.second) cublasCreate_v2(&pr.first->second); return pr.first->second; } -cudnnHandle_t dispatch::cudnnHandle(Context const & ctx){ - static std::map handles; +cudnnHandle_t dispatch::cudnnHandle(driver::context const & ctx){ + static std::map handles; auto pr = handles.insert({ctx, cudnnHandle_t()}); if(pr.second) cudnnCreate(&pr.first->second); diff --git a/lib/driver/event.cpp b/lib/driver/event.cpp index 60397882b..ddd5f3874 100755 --- a/lib/driver/event.cpp +++ b/lib/driver/event.cpp @@ -33,7 +33,7 @@ float Event::elapsed_time() const{ return time; } -Handle const & Event::cu() const +handle const & Event::cu() const { return cu_; } } diff --git a/lib/driver/handle.cpp b/lib/driver/handle.cpp index 090568919..c0144fcfe 100755 --- a/lib/driver/handle.cpp +++ b/lib/driver/handle.cpp @@ -43,24 +43,24 @@ inline void _delete(cu_platform){} //Constructor template -Handle::Handle(CUType cu, bool take_ownership): h_(new CUType(cu)), has_ownership_(take_ownership) +handle::handle(CUType cu, bool take_ownership): h_(new CUType(cu)), has_ownership_(take_ownership) { } template -Handle::~Handle(){ +handle::~handle(){ if(has_ownership_ && h_ && h_.unique() && *h_) _delete(*h_); } -template class Handle; -template class Handle; -template class Handle; -template class Handle; -template class Handle; -template class Handle; -template class Handle; -template class Handle; +template class handle; +template class handle; +template class handle; +template class handle; +template class handle; +template class handle; +template class handle; +template class handle; } } diff --git a/lib/driver/kernel.cpp b/lib/driver/kernel.cpp index 6cd6dd2e7..994bf3cfa 100755 --- a/lib/driver/kernel.cpp +++ b/lib/driver/kernel.cpp @@ -32,13 +32,13 @@ namespace triton namespace driver { -Kernel::Kernel(Module const & program, const char * name) : program_(program), address_bits_(program.context().device().address_bits()){ +kernel::kernel(driver::module const & program, const char * name) : program_(program), address_bits_(program.context().device().address_bits()){ cu_params_store_.reserve(64); cu_params_.reserve(64); dispatch::cuModuleGetFunction(&*cu_, program, name); } -void Kernel::setArg(unsigned int index, std::size_t size, void* ptr){ +void kernel::setArg(unsigned int index, std::size_t 
size, void* ptr){ if(index + 1> cu_params_store_.size()){ cu_params_store_.resize(index+1); cu_params_.resize(index+1); @@ -48,16 +48,16 @@ void Kernel::setArg(unsigned int index, std::size_t size, void* ptr){ cu_params_[index] = cu_params_store_[index].get(); } -void Kernel::setArg(unsigned int index, Buffer const & data) +void kernel::setArg(unsigned int index, buffer const & data) { return setArg(index, (CUdeviceptr)data);} -void* const* Kernel::cu_params() const +void* const* kernel::cu_params() const { return cu_params_.data(); } -Handle const & Kernel::cu() const +handle const & kernel::cu() const { return cu_; } -Module const & Kernel::module() const +driver::module const & kernel::module() const { return program_; } diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 2748742a7..21dee6027 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -34,7 +34,7 @@ namespace triton namespace driver { -Module::Module(Context const & context, std::string const & source) : context_(context), source_(source){ +module::module(driver::context const & context, std::string const & source) : context_(context), source_(source){ ContextSwitcher ctx_switch(context_); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; @@ -50,17 +50,17 @@ Module::Module(Context const & context, std::string const & source) : context_(c } } -Context const & Module::context() const +driver::context const & module::context() const { return context_; } -Handle const & Module::cu() const +handle const & module::cu() const { return cu_; } -Buffer Module::symbol(const char *name) const{ +buffer module::symbol(const char *name) const{ CUdeviceptr handle; size_t size; dispatch::cuModuleGetGlobal_v2(&handle, &size, *cu_, name); - return Buffer(context_, handle, false); + return buffer(context_, handle, false); } diff --git a/lib/driver/platform.cpp b/lib/driver/platform.cpp index 0fe23ccac..b6ff27112 100755 --- a/lib/driver/platform.cpp +++ b/lib/driver/platform.cpp @@ -31,20 +31,20 @@ namespace triton namespace driver { -std::string Platform::version() const{ +std::string platform::version() const{ int version; dispatch::cuDriverGetVersion(&version); return std::to_string(version); } -std::vector Platform::devices() const{ - std::vector devices; +std::vector platform::devices() const{ + std::vector devices; int N; dispatch::cuDeviceGetCount(&N); for(int i = 0 ; i < N ; ++i){ - CUdevice device; - dispatch::cuDeviceGet(&device, i); - devices.push_back(Device(device)); + CUdevice dvc; + dispatch::cuDeviceGet(&dvc, i); + devices.push_back(driver::device(dvc)); } return devices; } diff --git a/lib/driver/stream.cpp b/lib/driver/stream.cpp index d82b0437c..0b318811a 100755 --- a/lib/driver/stream.cpp +++ b/lib/driver/stream.cpp @@ -44,25 +44,25 @@ inline CUcontext cucontext(){ return result; } -Stream::Stream(CUstream stream, bool take_ownership): context_(cucontext(), take_ownership), cu_(stream, take_ownership) +stream::stream(CUstream stream, bool take_ownership): context_(cucontext(), take_ownership), cu_(stream, take_ownership) {} -Stream::Stream(Context const & context): context_(context), cu_(CUstream(), true) +stream::stream(driver::context const & context): context_(context), cu_(CUstream(), true) { ContextSwitcher ctx_switch(context_); dispatch::cuStreamCreate(&*cu_, 0); } -void Stream::synchronize() +void stream::synchronize() { ContextSwitcher ctx_switch(context_); dispatch::cuStreamSynchronize(*cu_); } -Context const & Stream::context() 
const +driver::context const & stream::context() const { return context_; } -void Stream::enqueue(Kernel const & kernel, std::array grid, std::array block, std::vector const *, Event* event){ +void stream::enqueue(kernel const & kernel, std::array grid, std::array block, std::vector const *, Event* event){ ContextSwitcher ctx_switch(context_); if(event) dispatch::cuEventRecord(((cu_event_t)*event).first, *cu_); @@ -71,7 +71,7 @@ void Stream::enqueue(Kernel const & kernel, std::array grid, std::arr dispatch::cuEventRecord(((cu_event_t)*event).second, *cu_); } -void Stream::write(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr){ +void stream::write(buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr){ ContextSwitcher ctx_switch(context_); if(blocking) dispatch::cuMemcpyHtoD(buffer + offset, ptr, size); @@ -79,7 +79,7 @@ void Stream::write(Buffer const & buffer, bool blocking, std::size_t offset, std dispatch::cuMemcpyHtoDAsync(buffer + offset, ptr, size, *cu_); } -void Stream::read(Buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr){ +void stream::read(buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr){ ContextSwitcher ctx_switch(context_); if(blocking) dispatch::cuMemcpyDtoH(ptr, buffer + offset, size); @@ -87,7 +87,7 @@ void Stream::read(Buffer const & buffer, bool blocking, std::size_t offset, std: dispatch::cuMemcpyDtoHAsync(ptr, buffer + offset, size, *cu_); } -Handle const & Stream::cu() const +handle const & stream::cu() const { return cu_; } } diff --git a/lib/jit.cpp b/lib/jit.cpp new file mode 100644 index 000000000..17787c352 --- /dev/null +++ b/lib/jit.cpp @@ -0,0 +1,151 @@ +#include "triton/jit.h" +#include +#include "triton/ast/ast.h" +#include "triton/ir/context.h" +#include "triton/ir/context_impl.h" +#include "triton/codegen/selection.h" +#include "triton/codegen/tune.h" +#include "triton/codegen/shared_copy.h" +#include "triton/codegen/allocation.h" +#include "triton/codegen/liveness.h" +#include "triton/codegen/vectorize.h" +#include "triton/codegen/buffer_info.h" +#include "triton/codegen/barriers.h" +#include "llvm/IR/IRPrintingPasses.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/Transforms/Scalar/EarlyCSE.h" +#include "llvm/Analysis/LoopPass.h" + +typedef struct yy_buffer_state * YY_BUFFER_STATE; +extern int yyparse(); +extern YY_BUFFER_STATE yy_scan_string(const char * str); +extern void yy_delete_buffer(YY_BUFFER_STATE buffer); +using triton::ast::translation_unit; +extern translation_unit *ast_root; + +namespace triton { + +void jit::init_llvm() { + static bool init = false; + if(!init){ + llvm::InitializeAllTargetInfos(); + llvm::InitializeAllTargets(); + llvm::InitializeAllTargetMCs(); + llvm::InitializeAllAsmParsers(); + llvm::InitializeAllAsmPrinters(); + init = true; + } +} + +std::unique_ptr jit::make_llvm_module(ir::module &module, const std::vector& params) { + llvm::Module* result = new llvm::Module("matmul", llvm_context_); + + // create passes + codegen::buffer_info_pass buffer_info; + codegen::place_shared_copy shared(&buffer_info); + codegen::tune tune; 
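  // [editorial annotation, not part of the original patch] The constructor
  // arguments handed to these pass objects (the three created above and the
  // five created just below) encode the dependency graph of the codegen
  // pipeline: shared, liveness, allocation and barriers all consume
  // buffer_info's shared-memory classification, allocation also reads the
  // live ranges computed by liveness, barriers reads the offsets chosen by
  // allocation, and vectorize/selection read the layout parameters owned by
  // tune. The run() sequence later in this function visits the passes in a
  // topological order of that graph.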
+  codegen::liveness liveness(&buffer_info);
+  codegen::allocation allocation(&liveness, &buffer_info);
+  codegen::barriers barriers(&allocation, &buffer_info);
+  codegen::vectorize vectorize(&tune);
+  codegen::selection selection(&allocation, &tune, &buffer_info);
+
+  // tuning parameters
+  tune.run(module);
+  unsigned i = 0;
+  triton_context_.p_impl->mp_constants_[0]->set_value(params[0]);
+  triton_context_.p_impl->mp_constants_[1]->set_value(params[1]);
+  triton_context_.p_impl->mp_constants_[2]->set_value(params[2]);
+  for(unsigned *x: tune.get_params(module))
+    *x = params[3 + i++];
+  // constraints
+  std::map<ir::value*, std::vector<std::string>> errors;
+  tune.check_constraints(module, errors);
+  std::cout << "errors: " << errors.size() << std::endl;
+  for(auto &x: errors){
+    for(auto &e: x.second)
+      std::cout << x.first->get_name() << " " << e << std::endl;
+  }
+  if(errors.size())
+    exit(EXIT_FAILURE);
+
+  // generate ptx
+  buffer_info.run(module);
+  shared.run(module);
+  liveness.run(module);
+  allocation.run();
+  barriers.run(module);
+  vectorize.run(module);
+  selection.run(module, *result);
+
+  return std::unique_ptr<llvm::Module>(result);
+}
+
+std::unique_ptr<ir::module> jit::make_triton_module(const std::string &src) {
+  // create AST from Triton-C source
+  YY_BUFFER_STATE buffer = yy_scan_string(src.c_str());
+  yyparse();
+  yy_delete_buffer(buffer);
+  translation_unit *program = ast_root;
+  // create Triton-IR from AST
+  ir::module* module = new ir::module("matrix", triton_context_);
+  program->codegen(module);
+  return std::unique_ptr<ir::module>(module);
+}
+
+
+jit::jit(driver::context context): driver_context_(context) {
+}
+
+std::string jit::compute_data_layout(bool is_64bit, bool use_short_pointers) {
+  std::string ret = "e";
+  if (!is_64bit)
+    ret += "-p:32:32";
+  else if (use_short_pointers)
+    ret += "-p3:32:32-p4:32:32-p5:32:32";
+  ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";
+  return ret;
+}
+
+void jit::add_module(ir::module &tt_module, const std::vector<unsigned> &params) {
+  init_llvm();
+  auto ll_module = make_llvm_module(tt_module, params);
+  ll_module->setTargetTriple("nvptx64-nvidia-cuda");
+  std::string error;
+  auto target = llvm::TargetRegistry::lookupTarget(ll_module->getTargetTriple(), error);
+  llvm::TargetMachine *machine = target->createTargetMachine(ll_module->getTargetTriple(), "sm_52", "",
+                                 llvm::TargetOptions(), llvm::Reloc::Model(),
+                                 llvm::None, llvm::CodeGenOpt::Aggressive);
+  ll_module->setDataLayout(compute_data_layout());
+
+  // emit machine code
+  llvm::legacy::PassManager pass;
+  llvm::SmallVector<char, 0> buffer;
+  llvm::raw_svector_ostream stream(buffer);
+  machine->addPassesToEmitFile(pass, stream, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
+  pass.run(*ll_module);
+  std::string src(buffer.begin(), buffer.end());
+
+  modules_.push_back(driver::module(driver_context_, src));
+}
+
+void jit::add_module(const std::string &src, const std::vector<unsigned> &params) {
+  auto ptt_module = make_triton_module(src);
+  add_module(*ptt_module, params);
+}
+
+driver::kernel jit::get_function(const std::string &name) {
+  return driver::kernel(modules_.front(), name.c_str());
+}
+
+
+}

From 5f292630442cc2ffdda120eccd31dd3d831ee511 Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Sat, 9 Mar 2019 12:05:12 -0500
Subject: [PATCH 099/494] [code generation] now using ir::metaparameter* for
 all tunable metaparameters

---
 include/triton/codegen/selection.h |  6 ++--
 include/triton/codegen/tune.h      | 17 +++++----
 include/triton/ir/builder.h        |  5 +++
 include/triton/ir/constant.h       |  4 ++-
 lib/codegen/selection.cpp          | 16 ++++-----
 lib/codegen/tune.cpp               | 57 
++++++++++++++++-------------- lib/codegen/vectorize.cpp | 2 +- lib/ir/builder.cpp | 15 ++++++++ lib/ir/constant.cpp | 2 +- lib/jit.cpp | 5 +-- 10 files changed, 80 insertions(+), 49 deletions(-) diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index 5c81ca8a0..291fbf827 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -117,9 +117,9 @@ private: // grid construction void create_grids(std::vector &grids, - std::map &references, + std::map &references, ir::function *fn); - void create_tile(ir::value *v, llvm::IRBuilder<> &builder, const std::map &references, std::set &seen, llvm::Value *sh_mem_ptr); + void create_tile(ir::value *v, llvm::IRBuilder<> &builder, const std::map &references, std::set &seen, llvm::Value *sh_mem_ptr); void init_axes(ir::value *i, llvm::IRBuilder<> &builder, llvm::Value *u_thread_id, llvm::Value *u_warp_id); void init_grids(ir::function *fn, llvm::IRBuilder<> &builder, llvm::Value *sh_mem_ptr); @@ -139,7 +139,7 @@ private: allocation *alloc_; tune *params_; buffer_info_pass *buffer_info_; - std::map axes_; + std::map axes_; }; } diff --git a/include/triton/codegen/tune.h b/include/triton/codegen/tune.h index cb1d5b509..5979290fa 100644 --- a/include/triton/codegen/tune.h +++ b/include/triton/codegen/tune.h @@ -12,6 +12,7 @@ namespace ir{ class module; class instruction; class function; + class metaparameter; } namespace codegen{ @@ -24,24 +25,28 @@ private: void add_constraint(node_t x, node_t y); void init_c_phi(ir::instruction *i); void init_c_graph(ir::instruction *v); - void connected_components(node_t x, const std::vector vals, std::set &nodes, graph_t &graph); - void create_grids(std::vector &grids, std::map &references, ir::function *fn); + void connected_components(node_t x, const std::vector mps, std::set &nodes, graph_t &graph); + void create_grids(std::vector &grids, std::map &references, ir::function *fn); public: - std::vector get_params(ir::module& mod); - std::map get_params(ir::instruction* i); - unsigned *get_param(ir::value *value, const std::string &key) { return params_[value][key]; } + std::vector get_params(ir::module& mod); + std::map get_params(ir::instruction* i); + ir::metaparameter* get_param(ir::value *value, const std::string &key) { return params_[value][key]; } void copy(ir::value *dst, ir::value *src) { params_[dst] = params_[src]; } bool check_constraints(ir::module &fn, std::map> &errors); void run(ir::module &mod); + ir::metaparameter* get_num_threads(); + ir::metaparameter* get_global_range_size(unsigned axis); private: - std::map> params_; std::vector pool_; graph_t dependencies_; std::set nodes_; std::map static_params_; + std::map> params_; + ir::metaparameter *num_threads_; + std::vector global_range_sizes_; }; diff --git a/include/triton/ir/builder.h b/include/triton/ir/builder.h index 509ae8e47..852f55aa9 100644 --- a/include/triton/ir/builder.h +++ b/include/triton/ir/builder.h @@ -35,6 +35,11 @@ public: // Constants value *get_int32(unsigned val); // Types + type *get_int1_ty(); + type *get_int8_ty(); + type *get_int16_ty(); + type *get_int32_ty(); + type *get_int64_ty(); type *get_float_ty(); type *get_double_ty(); // Insert diff --git a/include/triton/ir/constant.h b/include/triton/ir/constant.h index 9f2baf618..e3bd2ab24 100644 --- a/include/triton/ir/constant.h +++ b/include/triton/ir/constant.h @@ -49,11 +49,13 @@ class metaparameter: public constant_int{ public: static metaparameter *create(context &ctx, type *ty, unsigned lo, unsigned hi); 
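// [editorial sketch, not part of the original patch] Intended use of the
// interface in this hunk, as suggested by lib/codegen/tune.cpp in the same
// commit: a metaparameter is created knowing only its legal range, and
// tune::get_params() later returns exactly those parameters for which
// has_value() is still false. Assuming has_value_ starts out false (the hunk
// as shown never initializes it), and with ctx/mod as in tune::run():
//
//   ir::type *ty = mod.get_builder().get_int32_ty();
//   ir::metaparameter *mp = ir::metaparameter::create(ctx, ty, 4, 32);
//   // fresh parameter: range [4, 32] known, no concrete value yet
//   mp->set_value(16);   // e.g. a point chosen by an auto-tuner
//   // mp->has_value() is now true, so tune::get_params() skips it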
- void set_value(uint64_t value) { value_ = value; } + void set_value(uint64_t value) { has_value_ = true; value_ = value; } + bool has_value() { return has_value_; } private: unsigned lo_; unsigned hi_; + bool has_value_; }; /* constant range */ diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 3f79c9375..a9f7e8524 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -379,9 +379,9 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id std::vector n_warps(dim); for(unsigned i = 0; i < shapes.size(); i++){ std::string str_i = std::to_string(i); - contiguous[i] = *params_->get_param(v, "p0.d" + str_i); - warp_size[i] = *params_->get_param(v, "p1.d" + str_i); - n_warps[i] = *params_->get_param(v, "p2.d" + str_i); + contiguous[i] = params_->get_param(v, "p0.d" + str_i)->get_value(); + warp_size[i] = params_->get_param(v, "p1.d" + str_i)->get_value(); + n_warps[i] = params_->get_param(v, "p2.d" + str_i)->get_value(); } std::vector thread_id_in_warp = delinearize(u_thread_id, warp_size, builder); std::vector warp_id = delinearize(u_warp_id, n_warps, builder); @@ -404,7 +404,7 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id } void selection::create_grids(std::vector &grids, - std::map &references, + std::map &references, ir::function *fn) { // get number of dimensions greater than 1 auto get_tile_gt1_dim = [&](ir::value *v){ @@ -432,7 +432,7 @@ void selection::create_grids(std::vector &grids, for(size_t d = 0; d < shapes.size(); d++){ if(shapes[d]->get_value() == 1) continue; - unsigned *x = params_->get_param(v, "p0.d" + std::to_string(d)); + ir::metaparameter *x = params_->get_param(v, "p0.d" + std::to_string(d)); ir::value *&r = references[x]; if(!r || get_tile_gt1_dim(v) > get_tile_gt1_dim(r)) r = v; @@ -457,7 +457,7 @@ bool static inline has_phi_user(ir::value *v) { return false; } void selection::create_tile(ir::value *v, IRBuilder<> &builder, - const std::map& references, + const std::map& references, std::set &seen, Value *sh_mem_ptr) { if(!v->get_type()->is_tile_ty() || !seen.insert(v).second) return; @@ -517,7 +517,7 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, std::vector axes(shapes.size()); for(size_t d = 0; d < shapes.size(); d++){ if(shapes[d]->get_value() > 1){ - unsigned *x = params_->get_param(v, "p0.d" + std::to_string(d)); + ir::metaparameter *x = params_->get_param(v, "p0.d" + std::to_string(d)); axes[d] = axes_.at(x); } else{ @@ -549,7 +549,7 @@ void selection::init_grids(ir::function *fn, IRBuilder<> &builder, Value *sh_mem Value *u_warp_id = builder.CreateUDiv(u_thread_id, warp_size); // create grid std::vector grids; - std::map references; + std::map references; create_grids(grids, references, fn); for(ir::value* i: grids){ if(auto *instr = dynamic_cast(i)) diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 0c64401de..4972abd71 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -4,6 +4,7 @@ #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/context_impl.h" +#include "triton/ir/constant.h" #include @@ -77,43 +78,44 @@ void tune::init_c_graph(ir::instruction *v) { } } -void tune::connected_components(node_t x, const std::vector vals, std::set &nodes, graph_t &graph) { +void tune::connected_components(node_t x, const std::vector mps, std::set &nodes, graph_t &graph) { if(nodes.find(x) != nodes.end()){ nodes.erase(x); std::string suffix = ".d" + std::to_string(x.second); - 
params_[x.first].insert({"p0" + suffix, vals[0]}); - params_[x.first].insert({"p1" + suffix, vals[1]}); - params_[x.first].insert({"p2" + suffix, vals[2]}); + params_[x.first].insert({"p0" + suffix, mps[0]}); + params_[x.first].insert({"p1" + suffix, mps[1]}); + params_[x.first].insert({"p2" + suffix, mps[2]}); if(static_params_.find(x) != static_params_.end()){ - *vals[0] = static_params_.at(x); - *vals[1] = static_params_.at(x); - *vals[2] = static_params_.at(x); + mps[0]->set_value(static_params_.at(x)); + mps[1]->set_value(static_params_.at(x)); + mps[2]->set_value(static_params_.at(x)); } for(const node_t &y: graph[x]) - connected_components(y, vals, nodes, graph); + connected_components(y, mps, nodes, graph); } } -std::vector tune::get_params(ir::module &mod) { - std::vector result; - std::set seen; +std::vector tune::get_params(ir::module &mod) { + std::vector result; + std::set seen; for(ir::function *fn: mod.get_function_list()) for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i : block->get_inst_list()) for(auto &x: params_[i]) - if(seen.insert(x.second).second && *x.second == 0){ + if(seen.insert(x.second).second && !x.second->has_value()){ result.push_back(x.second); } return result; } -std::map tune::get_params(ir::instruction* i) { +std::map tune::get_params(ir::instruction* i) { return params_.at(i); } void tune::run(ir::module &mod) { + ir::context &ctx = mod.get_context(); for(ir::function *fn: mod.get_function_list()){ // Build constraints graph for(ir::basic_block *block: fn->blocks()) @@ -128,16 +130,17 @@ void tune::run(ir::module &mod) { init_c_phi(i); // Layout parameters while(!nodes_.empty()){ - unsigned *v0 = new unsigned(0); - unsigned *v1 = new unsigned(0); - unsigned *v2 = new unsigned(0); - connected_components(*nodes_.begin(), {v0, v1, v2}, nodes_, dependencies_); + ir::type *ty = mod.get_builder().get_int32_ty(); + ir::metaparameter *mp0 = ir::metaparameter::create(ctx, ty, 1, 4); + ir::metaparameter *mp1 = ir::metaparameter::create(ctx, ty, 4, 32); + ir::metaparameter *mp2 = ir::metaparameter::create(ctx, ty, 4, 32); + connected_components(*nodes_.begin(), {mp0, mp1, mp2}, nodes_, dependencies_); } } } void tune::create_grids(std::vector &grids, - std::map &references, + std::map &references, ir::function *fn) { // get number of dimensions greater than 1 auto get_tile_gt1_dim = [&](ir::value *v){ @@ -154,7 +157,7 @@ void tune::create_grids(std::vector &grids, if(!i->get_type()->is_tile_ty()) continue; for(auto ¶m: params_.at(i)){ - if(*param.second == 1) + if(param.second->get_value() == 1) continue; ir::instruction *&r = references[param.second]; if(!r || get_tile_gt1_dim(i) > get_tile_gt1_dim(r)) @@ -173,14 +176,14 @@ for(ir::function *fn: mod.get_function_list()){ using std::to_string; // initialize grids - std::map references; + std::map references; std::vector grids; create_grids(grids, references, fn); // number of warps int num_warps = 1; for(size_t k = 0; k < grids.front()->get_type()->get_tile_shapes().size(); k++) - num_warps *= *params_[grids.front()]["p2.d" + to_string(k)]; + num_warps *= params_[grids.front()]["p2.d" + to_string(k)]->get_value(); // check constraints for(ir::instruction *i: grids){ @@ -190,10 +193,10 @@ for(ir::function *fn: mod.get_function_list()){ // must device the shape for(size_t k = 0; k < shapes.size(); k++) { std::string strk = to_string(k); - unsigned *s0 = params_[i]["p0.d" + strk]; - unsigned *s1 = params_[i]["p1.d" + strk]; - unsigned *s2 = params_[i]["p2.d" + strk]; - unsigned multiple = 
(*s0)*(*s1)*(*s2); + ir::metaparameter *mp0 = params_[i]["p0.d" + strk]; + ir::metaparameter *mp1 = params_[i]["p1.d" + strk]; + ir::metaparameter *mp2 = params_[i]["p2.d" + strk]; + unsigned multiple = mp0->get_value()*mp1->get_value()*mp2->get_value(); if(shapes[k]->get_value() % multiple != 0) errors[i].push_back("for dim " + strk + ": shape (" + to_string(shapes[k]->get_value()) + ")" " is not a multiple of layout (" + to_string(multiple) + ")"); @@ -201,14 +204,14 @@ for(ir::function *fn: mod.get_function_list()){ // the number of thread per warp must be 32 int num_threads = 1; for(size_t k = 0; k < shapes.size(); k++) - num_threads *= *params_[i]["p1.d" + to_string(k)]; + num_threads *= params_[i]["p1.d" + to_string(k)]->get_value(); if(num_threads != 32) errors[i].push_back("number of threads per warp (" + to_string(num_threads) + ") must be 32"); // The number of warps required by the layout is the same // for all tiles in the function int required_num_warps = 1; for(size_t k = 0; k < shapes.size(); k++) - required_num_warps *= *params_[i]["p2.d" + to_string(k)]; + required_num_warps *= params_[i]["p2.d" + to_string(k)]->get_value(); if(required_num_warps != num_warps) errors[i].push_back("number of warps (" + to_string(required_num_warps) + ") must be " + to_string(num_warps)); } diff --git a/lib/codegen/vectorize.cpp b/lib/codegen/vectorize.cpp index 57c2142c9..672e97dc1 100644 --- a/lib/codegen/vectorize.cpp +++ b/lib/codegen/vectorize.cpp @@ -16,7 +16,7 @@ void vectorize::run(ir::module &mod) { for(ir::instruction *i: block->get_inst_list()) if(dynamic_cast(i)){ ir::value *x = i->get_operand(0); - if(*params_->get_param(x, "p0.d0") == 1) + if(params_->get_param(x, "p0.d0")->get_value() == 1) continue; builder.set_insert_point(i); ir::instruction *rx = (ir::instruction*)builder.create_vectorize(x); diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index b3c1174ce..db0ae9e94 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -41,6 +41,21 @@ value *builder::get_int32(unsigned val) { return constant_int::get(type::get_int32_ty(ctx_), val); } +type *builder::get_int1_ty() +{ return type::get_int1_ty(ctx_); } + +type *builder::get_int8_ty() +{ return type::get_int8_ty(ctx_); } + +type *builder::get_int16_ty() +{ return type::get_int16_ty(ctx_); } + +type *builder::get_int32_ty() +{ return type::get_int32_ty(ctx_); } + +type *builder::get_int64_ty() +{ return type::get_int64_ty(ctx_); } + type *builder::get_float_ty() { return type::get_float_ty(ctx_); } diff --git a/lib/ir/constant.cpp b/lib/ir/constant.cpp index 314714c04..bfb6fdb9b 100644 --- a/lib/ir/constant.cpp +++ b/lib/ir/constant.cpp @@ -99,7 +99,7 @@ constant *constant_fp::get(context &ctx, double v){ // metaparameter metaparameter::metaparameter(type *ty, unsigned lo, unsigned hi) - : constant_int(ty, 0), lo_(lo), hi_(hi){ } + : constant_int(ty, 0), lo_(lo), hi_(hi), has_value_(false){ } metaparameter* metaparameter::create(context &ctx, type *ty, unsigned lo, unsigned hi) { context_impl *impl = ctx.p_impl.get(); diff --git a/lib/jit.cpp b/lib/jit.cpp index 17787c352..9db98ca32 100644 --- a/lib/jit.cpp +++ b/lib/jit.cpp @@ -65,8 +65,9 @@ std::unique_ptr jit::make_llvm_module(ir::module &module, const st triton_context_.p_impl->mp_constants_[0]->set_value(params[0]); triton_context_.p_impl->mp_constants_[1]->set_value(params[1]); triton_context_.p_impl->mp_constants_[2]->set_value(params[2]); - for(unsigned *x: tune.get_params(module)) - *x = params[3 + i++]; + for(ir::metaparameter *x: tune.get_params(module)){ 
+ x->set_value(params[3 + i++]); + } // constraints std::map> errors; tune.check_constraints(module, errors); From b7212028126972b296ce7a0eeb8acfdef03c04cd Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 9 Mar 2019 12:31:21 -0500 Subject: [PATCH 100/494] [code generation] uniformized shape and layout metaparameters --- examples/matrix.cpp | 14 ++++++-------- lib/codegen/tune.cpp | 6 ++++++ lib/jit.cpp | 9 +++------ 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 70125c90d..29fc4bf41 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -65,22 +65,20 @@ void simple_gemm(std::vector &c, const std::vector &a, const std::vector params = { - // shapes - 16, 16, 8, // a0 - 2, 8, 1, + 2, 8, 1, 16, // b0 - 4, 4, 1, + 4, 4, 1, 16, // c 2, 4, 8, 4, 1, 1, // a1 - 2, 4, 1, + 2, 4, 1, 8, // b1 1, 8, 1 }; - unsigned TM = params[0]; - unsigned TN = params[1]; - unsigned nthreads = params[10]*params[13]*params[11]*params[14]; + unsigned TM = params[6]; + unsigned TN = params[10]; + unsigned nthreads = params[1]*params[2]*params[15]*params[16]; auto context = triton::driver::backend::contexts::get_default(); triton::jit jit(context); diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 4972abd71..96c07e49c 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -85,6 +85,12 @@ void tune::connected_components(node_t x, const std::vector params_[x.first].insert({"p0" + suffix, mps[0]}); params_[x.first].insert({"p1" + suffix, mps[1]}); params_[x.first].insert({"p2" + suffix, mps[2]}); + ir::type *ty = x.first->get_type(); + if(ty->is_tile_ty()){ + ir::type::tile_shapes_t::value_type shape = ty->get_tile_shapes().at(x.second); + if(auto mp = dynamic_cast(shape)) + params_[x.first].insert({"shape" + suffix, mp}); + } if(static_params_.find(x) != static_params_.end()){ mps[0]->set_value(static_params_.at(x)); mps[1]->set_value(static_params_.at(x)); diff --git a/lib/jit.cpp b/lib/jit.cpp index 9db98ca32..0c44c4e6d 100644 --- a/lib/jit.cpp +++ b/lib/jit.cpp @@ -62,12 +62,9 @@ std::unique_ptr jit::make_llvm_module(ir::module &module, const st // tuning parameters tune.run(module); unsigned i = 0; - triton_context_.p_impl->mp_constants_[0]->set_value(params[0]); - triton_context_.p_impl->mp_constants_[1]->set_value(params[1]); - triton_context_.p_impl->mp_constants_[2]->set_value(params[2]); - for(ir::metaparameter *x: tune.get_params(module)){ - x->set_value(params[3 + i++]); - } + for(ir::metaparameter *x: tune.get_params(module)) + x->set_value(params[i++]); + // constraints std::map> errors; tune.check_constraints(module, errors); From 9a3537662de67b65fe25ee3538159d7291673251 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 9 Mar 2019 14:44:13 -0500 Subject: [PATCH 101/494] [jit] can now infer launch parameters from triton module --- examples/matrix.cpp | 8 ++++--- include/triton/codegen/tune.h | 11 +++++---- include/triton/jit.h | 8 +++++++ lib/codegen/tune.cpp | 43 +++++++++++++++++++++++++++++++++++ lib/jit.cpp | 8 +++++++ 5 files changed, 71 insertions(+), 7 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 29fc4bf41..0548075e7 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -76,14 +76,13 @@ int main() { // b1 1, 8, 1 }; - unsigned TM = params[6]; - unsigned TN = params[10]; - unsigned nthreads = params[1]*params[2]*params[15]*params[16]; + auto context = triton::driver::backend::contexts::get_default(); triton::jit jit(context); jit.add_module(src, params); 
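The point of this patch shows in the hunk just below: rather than recovering tile sizes and the thread count from hard-coded indices into the params vector, the host queries them from the JIT after compilation. A sketch of the resulting call pattern, using only names that appear in this example (jit, stream, kernel, M, N):

  // Sketch of the pattern introduced here; names as in examples/matrix.cpp.
  triton::jit::launch_information info = jit.get_launch_info("matmul");
  unsigned TM = info.global_range_size[0];  // tile size along M
  unsigned TN = info.global_range_size[1];  // tile size along N
  // ceil-division grid; one block of info.num_threads threads per tile
  stream.enqueue(kernel, {(M + TM - 1)/TM, (N + TN - 1)/TN, 1},
                 {info.num_threads, 1, 1});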
triton::driver::kernel kernel = jit.get_function("matmul"); + triton::jit::launch_information info = jit.get_launch_info("matmul"); size_t M = 128, N = 128, K = 128; size_t bound = 8; @@ -112,6 +111,9 @@ int main() { kernel.setArg(4, N); kernel.setArg(5, K); kernel.setArg(6, bound); + unsigned TM = info.global_range_size[0]; + unsigned TN = info.global_range_size[1]; + unsigned nthreads = info.num_threads; stream.enqueue(kernel, {(M + TM - 1)/TM, (N + TN - 1)/TN, 1}, {nthreads, 1, 1}); stream.synchronize(); stream.read(dc, true, 0, hc); diff --git a/include/triton/codegen/tune.h b/include/triton/codegen/tune.h index 5979290fa..e6c427ca9 100644 --- a/include/triton/codegen/tune.h +++ b/include/triton/codegen/tune.h @@ -30,14 +30,16 @@ private: public: + tune(); std::vector get_params(ir::module& mod); std::map get_params(ir::instruction* i); ir::metaparameter* get_param(ir::value *value, const std::string &key) { return params_[value][key]; } void copy(ir::value *dst, ir::value *src) { params_[dst] = params_[src]; } bool check_constraints(ir::module &fn, std::map> &errors); void run(ir::module &mod); - ir::metaparameter* get_num_threads(); - ir::metaparameter* get_global_range_size(unsigned axis); + unsigned get_num_global_range(); + unsigned get_global_range_size(unsigned axis); + unsigned get_num_threads(); private: std::vector pool_; @@ -45,8 +47,9 @@ private: std::set nodes_; std::map static_params_; std::map> params_; - ir::metaparameter *num_threads_; - std::vector global_range_sizes_; + std::vector num_threads_mp_vec_; + std::map global_range_sizes_; + unsigned num_global_ranges_; }; diff --git a/include/triton/jit.h b/include/triton/jit.h index 07651c258..b3055c125 100644 --- a/include/triton/jit.h +++ b/include/triton/jit.h @@ -20,6 +20,12 @@ class context; } class jit { +public: + struct launch_information{ + std::vector global_range_size; + unsigned num_threads; + }; + private: void init_llvm(); std::string compute_data_layout(bool is64Bit = true, bool UseShortPointers = true); @@ -31,12 +37,14 @@ public: void add_module(ir::module &module, const std::vector& params = {}); void add_module(const std::string &src, const std::vector& params = {}); driver::kernel get_function(const std::string &name); + launch_information get_launch_info(const std::string &name); private: std::vector modules_; driver::context driver_context_; llvm::LLVMContext llvm_context_; ir::context triton_context_; + std::map launch_info_map_; }; diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 96c07e49c..09f8e4846 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -12,6 +12,8 @@ namespace triton{ namespace codegen{ +tune::tune(): num_global_ranges_(0){ } + void tune::add_constraint(node_t x, node_t y) { dependencies_[x].insert(y); dependencies_[y].insert(x); @@ -91,6 +93,11 @@ void tune::connected_components(node_t x, const std::vector if(auto mp = dynamic_cast(shape)) params_[x.first].insert({"shape" + suffix, mp}); } + if(auto range = dynamic_cast(x.first)){ + unsigned ax = range->get_axis(); + global_range_sizes_[ax] = params_[x.first].at("shape.d0"); + num_global_ranges_ = std::max(num_global_ranges_, ax + 1); + } if(static_params_.find(x) != static_params_.end()){ mps[0]->set_value(static_params_.at(x)); mps[1]->set_value(static_params_.at(x)); @@ -122,6 +129,7 @@ std::map tune::get_params(ir::instruction* i) void tune::run(ir::module &mod) { ir::context &ctx = mod.get_context(); + // Create metaparameters for(ir::function *fn: mod.get_function_list()){ // Build constraints graph 
for(ir::basic_block *block: fn->blocks()) @@ -143,6 +151,19 @@ void tune::run(ir::module &mod) { connected_components(*nodes_.begin(), {mp0, mp1, mp2}, nodes_, dependencies_); } } + +// // Get launch info +// for(ir::function *fn: mod.get_function_list()){ +// std::map references; +// std::vector grids; +// create_grids(grids, references, fn); +// ir::instruction *first = grids.front(); +// for(unsigned i = 0; i < first->get_type()->get_tile_shapes().size(); i++){ +// std::string suffix = ".d" + std::to_string(i); +// num_threads_mp_vec_.push_back(params_.at(first).at("p1" + suffix)); +// num_threads_mp_vec_.push_back(params_.at(first).at("p2" + suffix)); +// } +// } } void tune::create_grids(std::vector &grids, @@ -186,6 +207,12 @@ for(ir::function *fn: mod.get_function_list()){ std::vector grids; create_grids(grids, references, fn); + for(unsigned i = 0; i < grids.front()->get_type()->get_tile_shapes().size(); i++){ + std::string suffix = ".d" + std::to_string(i); + num_threads_mp_vec_.push_back(params_.at(grids.front()).at("p1" + suffix)); + num_threads_mp_vec_.push_back(params_.at(grids.front()).at("p2" + suffix)); + } + // number of warps int num_warps = 1; for(size_t k = 0; k < grids.front()->get_type()->get_tile_shapes().size(); k++) @@ -225,5 +252,21 @@ for(ir::function *fn: mod.get_function_list()){ } } +unsigned tune::get_num_global_range() { + return num_global_ranges_; +} + +unsigned tune::get_global_range_size(unsigned axis) { + return global_range_sizes_.at(axis)->get_value(); +} + +unsigned tune::get_num_threads() { + unsigned result = 1; + for(ir::metaparameter *mp: num_threads_mp_vec_) + result *= mp->get_value(); + return result; +} + + } } diff --git a/lib/jit.cpp b/lib/jit.cpp index 0c44c4e6d..517c3c882 100644 --- a/lib/jit.cpp +++ b/lib/jit.cpp @@ -85,6 +85,11 @@ std::unique_ptr jit::make_llvm_module(ir::module &module, const st vectorize.run(module); selection.run(module, *result); + // launch information + auto &launch_info_map = launch_info_map_[result->getName()]; + for(unsigned i = 0; i < tune.get_num_global_range(); i++) + launch_info_map.global_range_size.push_back(tune.get_global_range_size(i)); + launch_info_map.num_threads = tune.get_num_threads(); return std::unique_ptr(result); } @@ -145,5 +150,8 @@ driver::kernel jit::get_function(const std::string &name) { return driver::kernel(modules_.front(), name.c_str()); } +jit::launch_information jit::get_launch_info(const std::string &name) { + return launch_info_map_.at(name); +} } From 9e2cfddf4c977a8e34c0165e17b07c7a6d77d6fd Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 9 Mar 2019 17:17:55 -0500 Subject: [PATCH 102/494] [examples] some cleaning --- examples/matrix.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 0548075e7..19cf9d036 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -64,6 +64,11 @@ void simple_gemm(std::vector &c, const std::vector &a, const std::vector params = { // a0 2, 8, 1, 16, @@ -76,13 +81,7 @@ int main() { // b1 1, 8, 1 }; - - - auto context = triton::driver::backend::contexts::get_default(); - triton::jit jit(context); jit.add_module(src, params); - triton::driver::kernel kernel = jit.get_function("matmul"); - triton::jit::launch_information info = jit.get_launch_info("matmul"); size_t M = 128, N = 128, K = 128; size_t bound = 8; @@ -104,6 +103,7 @@ int main() { stream.write(da, true, 0, ha); stream.write(db, true, 0, hb); stream.write(dc, true, 0, hc); + triton::driver::kernel 
kernel = jit.get_function("matmul"); kernel.setArg(0, da); kernel.setArg(1, db); kernel.setArg(2, dc); @@ -111,6 +111,7 @@ int main() { kernel.setArg(4, N); kernel.setArg(5, K); kernel.setArg(6, bound); + triton::jit::launch_information info = jit.get_launch_info("matmul"); unsigned TM = info.global_range_size[0]; unsigned TN = info.global_range_size[1]; unsigned nthreads = info.num_threads; From d2e7d7890d98a1312dc8445416848482c56a3b54 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 10 Mar 2019 00:42:36 -0500 Subject: [PATCH 103/494] [jit] preparing auto-tuning --- examples/matrix.cpp | 69 ++++++++++--------- include/triton/driver/module.h | 10 ++- include/triton/ir/constant.h | 2 + include/triton/jit.h | 14 +++- lib/driver/module.cpp | 60 ++++++++++++++++- lib/jit.cpp | 119 +++++++++++++++++++++------------ 6 files changed, 195 insertions(+), 79 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 19cf9d036..2f54c1085 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -64,25 +64,10 @@ void simple_gemm(std::vector &c, const std::vector &a, const std::vector params = { - // a0 - 2, 8, 1, 16, - // b0 - 4, 4, 1, 16, - // c - 2, 4, 8, 4, 1, 1, - // a1 - 2, 4, 1, 8, - // b1 - 1, 8, 1 - }; - jit.add_module(src, params); + // matrix multiplication parameters size_t M = 128, N = 128, K = 128; size_t bound = 8; std::vector hc(M*N); @@ -103,20 +88,44 @@ int main() { stream.write(da, true, 0, ha); stream.write(db, true, 0, hb); stream.write(dc, true, 0, hc); - triton::driver::kernel kernel = jit.get_function("matmul"); - kernel.setArg(0, da); - kernel.setArg(1, db); - kernel.setArg(2, dc); - kernel.setArg(3, M); - kernel.setArg(4, N); - kernel.setArg(5, K); - kernel.setArg(6, bound); - triton::jit::launch_information info = jit.get_launch_info("matmul"); - unsigned TM = info.global_range_size[0]; - unsigned TN = info.global_range_size[1]; - unsigned nthreads = info.num_threads; - stream.enqueue(kernel, {(M + TM - 1)/TM, (N + TN - 1)/TN, 1}, {nthreads, 1, 1}); stream.synchronize(); + + // benchmark a given matrix multiplication kernel + auto benchmark = [&](triton::driver::kernel kernel, + triton::jit::launch_information info) { + kernel.setArg(0, da); + kernel.setArg(1, db); + kernel.setArg(2, dc); + kernel.setArg(3, M); + kernel.setArg(4, N); + kernel.setArg(5, K); + kernel.setArg(6, bound); + unsigned TM = info.global_range_size[0]; + unsigned TN = info.global_range_size[1]; + unsigned nthreads = info.num_threads; + stream.enqueue(kernel, {(M + TM - 1)/TM, (N + TN - 1)/TN, 1}, {nthreads, 1, 1}); + stream.synchronize(); + return float(0); + }; + + // just-in-time compile source-code + std::vector params = { + // a0 + 2, 8, 1, 16, + // b0 + 4, 4, 1, 16, + // c + 2, 4, 8, 4, 1, 1, + // a1 + 2, 4, 1, 8, + // b1 + 1, 8, 1 + }; + triton::jit jit(context); + jit.add_module(src, params); + triton::driver::kernel kernel = jit.get_function("matmul"); + triton::jit::launch_information info = jit.get_launch_info("matmul"); + benchmark(kernel, info); stream.read(dc, true, 0, hc); simple_gemm(rc, ha, hb, M, N, K); for(size_t i = 0; i < M*N; i++) diff --git a/include/triton/driver/module.h b/include/triton/driver/module.h index 43d9db4ee..f69db71d6 100755 --- a/include/triton/driver/module.h +++ b/include/triton/driver/module.h @@ -28,6 +28,11 @@ #include "triton/driver/context.h" #include "triton/driver/buffer.h" +namespace llvm +{ + class Module; +} + namespace triton { @@ -40,9 +45,12 @@ class device; class module: public handle_interface { static std::string 
header(device const & device); + std::string compile_llvm_module(llvm::Module* module); + void init_llvm(); public: - module(driver::context const & context, std::string const & source); + module(driver::context const & context, llvm::Module *module); + module(driver::context const & context, const std::string& source); driver::context const & context() const; handle const & cu() const; buffer symbol(const char * name) const; diff --git a/include/triton/ir/constant.h b/include/triton/ir/constant.h index e3bd2ab24..317cba2ff 100644 --- a/include/triton/ir/constant.h +++ b/include/triton/ir/constant.h @@ -51,6 +51,8 @@ public: static metaparameter *create(context &ctx, type *ty, unsigned lo, unsigned hi); void set_value(uint64_t value) { has_value_ = true; value_ = value; } bool has_value() { return has_value_; } + unsigned get_lo() { return lo_; } + unsigned get_hi() { return hi_; } private: unsigned lo_; diff --git a/include/triton/jit.h b/include/triton/jit.h index b3055c125..db446a42e 100644 --- a/include/triton/jit.h +++ b/include/triton/jit.h @@ -7,6 +7,7 @@ #include "triton/ir/context.h" #include "triton/driver/module.h" #include "triton/driver/kernel.h" +#include namespace llvm { class Module; @@ -14,9 +15,14 @@ namespace llvm { namespace triton { +namespace codegen{ +class tune; +} + namespace ir { class module; class context; +class metaparameter; } class jit { @@ -25,15 +31,17 @@ public: std::vector global_range_size; unsigned num_threads; }; + typedef std::function benchmark_t; private: - void init_llvm(); - std::string compute_data_layout(bool is64Bit = true, bool UseShortPointers = true); - std::unique_ptr make_llvm_module(triton::ir::module &module, const std::vector& params); + std::string compute_data_layout(bool is_64bit = true, bool use_short_pointers = true); + std::unique_ptr make_llvm_module(triton::ir::module &module, codegen::tune &tune); std::unique_ptr make_triton_module(const std::string &src); public: jit(driver::context context); + void autotune(ir::module &module, benchmark_t benchmark); + void autotune(const std::string &src, benchmark_t benchmark); void add_module(ir::module &module, const std::vector& params = {}); void add_module(const std::string &src, const std::vector& params = {}); driver::kernel get_function(const std::string &name); diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 21dee6027..c482acf08 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -26,14 +26,72 @@ #include "triton/driver/module.h" #include "triton/driver/context.h" #include "triton/driver/error.h" - #include "triton/tools/sys/getenv.hpp" +#include "llvm/IR/IRPrintingPasses.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/Transforms/Scalar/EarlyCSE.h" +#include "llvm/Analysis/LoopPass.h" namespace triton { namespace driver { +std::string module::compile_llvm_module(llvm::Module* module) { + init_llvm(); + + // create machine + module->setTargetTriple("nvptx64-nvidia-cuda"); + std::string error; + auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error); + llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), "sm_52", "", + llvm::TargetOptions(), 
llvm::Reloc::Model(), + llvm::None, llvm::CodeGenOpt::Aggressive); + + // set data layout + std::string layout = "e"; + bool is_64bit = true; + bool use_short_pointers = true; + if (!is_64bit) + layout += "-p:32:32"; + else if (use_short_pointers) + layout += "-p3:32:32-p4:32:32-p5:32:32"; + layout += "-i64:64-i128:128-v16:16-v32:32-n16:32:64"; + module->setDataLayout(layout); + + // emit machine code + llvm::legacy::PassManager pass; + llvm::SmallVector buffer; + llvm::raw_svector_ostream stream(buffer); + machine->addPassesToEmitFile(pass, stream, nullptr, llvm::TargetMachine::CGFT_AssemblyFile); + pass.run(*module); + + return std::string(buffer.begin(), buffer.end()); +} + +void module::init_llvm() { + static bool init = false; + if(!init){ + llvm::InitializeAllTargetInfos(); + llvm::InitializeAllTargets(); + llvm::InitializeAllTargetMCs(); + llvm::InitializeAllAsmParsers(); + llvm::InitializeAllAsmPrinters(); + init = true; + } +} + +module::module(driver::context const & context, llvm::Module* ll_module): module(context, compile_llvm_module(ll_module)){ } + module::module(driver::context const & context, std::string const & source) : context_(context), source_(source){ ContextSwitcher ctx_switch(context_); // JIT compile source-code diff --git a/lib/jit.cpp b/lib/jit.cpp index 517c3c882..8b1ab9886 100644 --- a/lib/jit.cpp +++ b/lib/jit.cpp @@ -34,37 +34,55 @@ extern translation_unit *ast_root; namespace triton { -void jit::init_llvm() { - static bool init = false; - if(!init){ - llvm::InitializeAllTargetInfos(); - llvm::InitializeAllTargets(); - llvm::InitializeAllTargetMCs(); - llvm::InitializeAllAsmParsers(); - llvm::InitializeAllAsmPrinters(); - init = true; +void loop_nest(std::vector const & ranges, std::function const &)> const & f){ + size_t D = ranges.size(); + std::vector values(D, 0); + // Start with innermost loop + size_t i = D - 1; + while(true){ + //Execute function + f(values); + //Increment counters + while(values[i]++ == ranges[i] - 1){ + if(i == 0) + return; + values[i--] = 0; + } + i = D - 1; } } -std::unique_ptr jit::make_llvm_module(ir::module &module, const std::vector& params) { +template +void loop_nest(std::vector> const & iterates, std::function)> const & f){ + //Ranges to iterate over + std::vector ranges; + for(auto const & x: iterates) + ranges.push_back(x.size()); + //Proxy function + auto proxy = [&](std::vector const & idx){ + std::vector x(iterates.size()); + for(size_t i = 0; i < x.size(); ++i) + x[i] = iterates[i][idx[i]]; + f(x); + }; + //Iterate + loop_nest(ranges, proxy); +} + + + +std::unique_ptr jit::make_llvm_module(ir::module &module, codegen::tune & tune) { llvm::Module* result = new llvm::Module("matmul", llvm_context_); // create passes codegen::buffer_info_pass buffer_info; codegen::place_shared_copy shared(&buffer_info); - codegen::tune tune; codegen::liveness liveness(&buffer_info); codegen::allocation allocation(&liveness, &buffer_info); codegen::barriers barriers(&allocation, &buffer_info); codegen::vectorize vectorize(&tune); codegen::selection selection(&allocation, &tune, &buffer_info); - // tuning parameters - tune.run(module); - unsigned i = 0; - for(ir::metaparameter *x: tune.get_params(module)) - x->set_value(params[i++]); - // constraints std::map> errors; tune.check_constraints(module, errors); @@ -109,36 +127,49 @@ std::unique_ptr jit::make_triton_module(const std::string &src) { jit::jit(driver::context context): driver_context_(context) { } -std::string jit::compute_data_layout(bool is_64bit, bool use_short_pointers) { - 
std::string ret = "e"; - if (!is_64bit) - ret += "-p:32:32"; - else if (use_short_pointers) - ret += "-p3:32:32-p4:32:32-p5:32:32"; - ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64"; - return ret; + +void jit::autotune(ir::module &tt_module, benchmark_t benchmark) { + // find metaparameters + codegen::tune tune; + tune.run(tt_module); + auto mps = tune.get_params(tt_module); + // create parameter ranges + std::vector> ranges; + for(ir::metaparameter *mp: mps){ + std::vector current; + for(unsigned x = mp->get_lo(); x <= mp->get_hi(); x*=2) + current.push_back(x); + ranges.push_back(current); + } + // iterate over parameters + loop_nest(ranges, [&](const std::vector params){ + std::map> errors; + unsigned i = 0; + for(ir::metaparameter *mp: mps) + mp->set_value(params[i++]); + tune.check_constraints(tt_module, errors); + if(errors.size()) + return; + std::cout << "valid" << std::endl; + }); +} + +void jit::autotune(const std::string &src, benchmark_t benchmark) { + auto ptt_module = make_triton_module(src); + autotune(*ptt_module, benchmark); } void jit::add_module(ir::module &tt_module, const std::vector ¶ms) { - init_llvm(); - auto ll_module = make_llvm_module(tt_module, params); - ll_module->setTargetTriple("nvptx64-nvidia-cuda"); - std::string error; - auto target = llvm::TargetRegistry::lookupTarget(ll_module->getTargetTriple(), error); - llvm::TargetMachine *machine = target->createTargetMachine(ll_module->getTargetTriple(), "sm_52", "", - llvm::TargetOptions(), llvm::Reloc::Model(), - llvm::None, llvm::CodeGenOpt::Aggressive); - ll_module->setDataLayout(compute_data_layout()); - - // emit machine code - llvm::legacy::PassManager pass; - llvm::SmallVector buffer; - llvm::raw_svector_ostream stream(buffer); - machine->addPassesToEmitFile(pass, stream, nullptr, llvm::TargetMachine::CGFT_AssemblyFile); - pass.run(*ll_module); - std::string src(buffer.begin(), buffer.end()); - - modules_.push_back(driver::module(driver_context_, src)); + // set parameters + codegen::tune tune; + tune.run(tt_module); + unsigned i = 0; + for(ir::metaparameter* mp: tune.get_params(tt_module)) + mp->set_value(params[i++]); + // compiler to llvm + auto ll_module = make_llvm_module(tt_module, tune); + // send llvm module to driver + modules_.push_back(driver::module(driver_context_, &*ll_module)); } void jit::add_module(const std::string &src, const std::vector ¶ms) { From c96a263896d7684e7a66e79a776320ff346c14f6 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 10 Mar 2019 10:45:21 -0400 Subject: [PATCH 104/494] [jit] changed default metaparameter ranges --- examples/matrix.cpp | 1 + lib/ast/lowering.cpp | 2 +- lib/codegen/tune.cpp | 6 +++--- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 2f54c1085..e7177cfb2 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -123,6 +123,7 @@ int main() { }; triton::jit jit(context); jit.add_module(src, params); + jit.autotune(src, benchmark); triton::driver::kernel kernel = jit.get_function("matmul"); triton::jit::launch_information info = jit.get_launch_info("matmul"); benchmark(kernel, info); diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index cac80b262..5dda59ce9 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -412,7 +412,7 @@ ir::value* initializer::codegen(ir::module * mod) const{ if(std::find(storage.begin(), storage.end(), TUNABLE_T) != storage.end()){ assert(expr_ == nullptr); //TODO: implement ranges - value = ir::metaparameter::create(mod->get_context(), 
ty, 4, 8); + value = ir::metaparameter::create(mod->get_context(), ty, 8, 64); } if(expr_){ value = expr_->codegen(mod); diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 09f8e4846..73559b50d 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -145,9 +145,9 @@ void tune::run(ir::module &mod) { // Layout parameters while(!nodes_.empty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - ir::metaparameter *mp0 = ir::metaparameter::create(ctx, ty, 1, 4); - ir::metaparameter *mp1 = ir::metaparameter::create(ctx, ty, 4, 32); - ir::metaparameter *mp2 = ir::metaparameter::create(ctx, ty, 4, 32); + ir::metaparameter *mp0 = ir::metaparameter::create(ctx, ty, 2, 2); + ir::metaparameter *mp1 = ir::metaparameter::create(ctx, ty, 4, 8); + ir::metaparameter *mp2 = ir::metaparameter::create(ctx, ty, 1, 4); connected_components(*nodes_.begin(), {mp0, mp1, mp2}, nodes_, dependencies_); } } From 94e315ea8a0a5080fe1b1dfad088927f7fae8600 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 10 Mar 2019 23:10:17 -0400 Subject: [PATCH 105/494] Reparameterized in terms of micro- and nano- tiles --- examples/matrix.cpp | 24 ++++++++++--- include/triton/codegen/tune.h | 2 +- lib/ast/lowering.cpp | 2 +- lib/codegen/selection.cpp | 12 +++---- lib/codegen/tune.cpp | 64 +++++++++++++++-------------------- lib/codegen/vectorize.cpp | 2 +- lib/jit.cpp | 9 +++-- 7 files changed, 62 insertions(+), 53 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index e7177cfb2..e8a169656 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -108,18 +108,32 @@ int main() { return float(0); }; + +// std::vector params = { +// // a0 +// 2, 8, 1, 16, +// // b0 +// 4, 4, 1, 16, +// // c +// 2, 4, 8, 4, 1, 1, +// // a1 +// 2, 4, 1, 8, +// // b1 +// 1, 8, 1 +// }; + // just-in-time compile source-code std::vector params = { // a0 - 2, 8, 1, 16, + 8, 2, 16, // b0 - 4, 4, 1, 16, + 4, 4, 16, // c - 2, 4, 8, 4, 1, 1, + 8, 4, 2, 4, // a1 - 2, 4, 1, 8, + 4, 2, 8, // b1 - 1, 8, 1 + 8, 1 }; triton::jit jit(context); jit.add_module(src, params); diff --git a/include/triton/codegen/tune.h b/include/triton/codegen/tune.h index e6c427ca9..d84ddfe5e 100644 --- a/include/triton/codegen/tune.h +++ b/include/triton/codegen/tune.h @@ -47,9 +47,9 @@ private: std::set nodes_; std::map static_params_; std::map> params_; - std::vector num_threads_mp_vec_; std::map global_range_sizes_; unsigned num_global_ranges_; + unsigned num_threads_; }; diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 5dda59ce9..3a5b2696e 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -412,7 +412,7 @@ ir::value* initializer::codegen(ir::module * mod) const{ if(std::find(storage.begin(), storage.end(), TUNABLE_T) != storage.end()){ assert(expr_ == nullptr); //TODO: implement ranges - value = ir::metaparameter::create(mod->get_context(), ty, 8, 64); + value = ir::metaparameter::create(mod->get_context(), ty, 8, 128); } if(expr_){ value = expr_->codegen(mod); diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index a9f7e8524..fbdd33162 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -379,9 +379,9 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id std::vector n_warps(dim); for(unsigned i = 0; i < shapes.size(); i++){ std::string str_i = std::to_string(i); - contiguous[i] = params_->get_param(v, "p0.d" + str_i)->get_value(); - warp_size[i] = params_->get_param(v, "p1.d" + str_i)->get_value(); - n_warps[i] = params_->get_param(v, "p2.d" 
+ str_i)->get_value(); + contiguous[i] = params_->get_param(v, "nts.d" + str_i)->get_value(); + warp_size[i] = params_->get_param(v, "mts.d" + str_i)->get_value(); + n_warps[i] = shapes[i]->get_value() / (contiguous[i] * warp_size[i]); } std::vector thread_id_in_warp = delinearize(u_thread_id, warp_size, builder); std::vector warp_id = delinearize(u_warp_id, n_warps, builder); @@ -399,7 +399,7 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id unsigned offset = n / contiguous[k] * per_block + n % contiguous[k]; idx_list[n] = builder.CreateAdd(thread_id, builder.getInt32(offset), "idx_" + str_k + "_" + std::to_string(n)); } - axes_[params_->get_param(v, "p0.d" + str_k)] = distributed_axis{contiguous[k], idx_list}; + axes_[params_->get_param(v, "nts.d" + str_k)] = distributed_axis{contiguous[k], idx_list}; } } @@ -432,7 +432,7 @@ void selection::create_grids(std::vector &grids, for(size_t d = 0; d < shapes.size(); d++){ if(shapes[d]->get_value() == 1) continue; - ir::metaparameter *x = params_->get_param(v, "p0.d" + std::to_string(d)); + ir::metaparameter *x = params_->get_param(v, "nts.d" + std::to_string(d)); ir::value *&r = references[x]; if(!r || get_tile_gt1_dim(v) > get_tile_gt1_dim(r)) r = v; @@ -517,7 +517,7 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, std::vector axes(shapes.size()); for(size_t d = 0; d < shapes.size(); d++){ if(shapes[d]->get_value() > 1){ - ir::metaparameter *x = params_->get_param(v, "p0.d" + std::to_string(d)); + ir::metaparameter *x = params_->get_param(v, "nts.d" + std::to_string(d)); axes[d] = axes_.at(x); } else{ diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 73559b50d..703459952 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -84,9 +84,8 @@ void tune::connected_components(node_t x, const std::vector if(nodes.find(x) != nodes.end()){ nodes.erase(x); std::string suffix = ".d" + std::to_string(x.second); - params_[x.first].insert({"p0" + suffix, mps[0]}); - params_[x.first].insert({"p1" + suffix, mps[1]}); - params_[x.first].insert({"p2" + suffix, mps[2]}); + params_[x.first].insert({"nts" + suffix, mps[0]}); + params_[x.first].insert({"mts" + suffix, mps[1]}); ir::type *ty = x.first->get_type(); if(ty->is_tile_ty()){ ir::type::tile_shapes_t::value_type shape = ty->get_tile_shapes().at(x.second); @@ -101,7 +100,6 @@ void tune::connected_components(node_t x, const std::vector if(static_params_.find(x) != static_params_.end()){ mps[0]->set_value(static_params_.at(x)); mps[1]->set_value(static_params_.at(x)); - mps[2]->set_value(static_params_.at(x)); } for(const node_t &y: graph[x]) connected_components(y, mps, nodes, graph); @@ -145,25 +143,11 @@ void tune::run(ir::module &mod) { // Layout parameters while(!nodes_.empty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - ir::metaparameter *mp0 = ir::metaparameter::create(ctx, ty, 2, 2); - ir::metaparameter *mp1 = ir::metaparameter::create(ctx, ty, 4, 8); - ir::metaparameter *mp2 = ir::metaparameter::create(ctx, ty, 1, 4); - connected_components(*nodes_.begin(), {mp0, mp1, mp2}, nodes_, dependencies_); + ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 2, 2); + ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 4, 8); + connected_components(*nodes_.begin(), {nts, mts}, nodes_, dependencies_); } } - -// // Get launch info -// for(ir::function *fn: mod.get_function_list()){ -// std::map references; -// std::vector grids; -// create_grids(grids, references, fn); -// ir::instruction *first = grids.front(); 
-// for(unsigned i = 0; i < first->get_type()->get_tile_shapes().size(); i++){ -// std::string suffix = ".d" + std::to_string(i); -// num_threads_mp_vec_.push_back(params_.at(first).at("p1" + suffix)); -// num_threads_mp_vec_.push_back(params_.at(first).at("p2" + suffix)); -// } -// } } void tune::create_grids(std::vector &grids, @@ -207,16 +191,26 @@ for(ir::function *fn: mod.get_function_list()){ std::vector grids; create_grids(grids, references, fn); - for(unsigned i = 0; i < grids.front()->get_type()->get_tile_shapes().size(); i++){ - std::string suffix = ".d" + std::to_string(i); - num_threads_mp_vec_.push_back(params_.at(grids.front()).at("p1" + suffix)); - num_threads_mp_vec_.push_back(params_.at(grids.front()).at("p2" + suffix)); + auto get_num_warps = [&](ir::instruction *i, unsigned axis) { + std::string strk = to_string(axis); + unsigned mts = params_[i]["mts.d" + strk]->get_value(); + unsigned nts = params_[i]["nts.d" + strk]->get_value(); + unsigned shape = i->get_type()->get_tile_shapes()[axis]->get_value(); + return shape / (mts * nts); + }; + + num_threads_ = 1; + ir::instruction *first = grids.front(); + for(unsigned k = 0; k < first->get_type()->get_tile_shapes().size(); k++){ + std::string suffix = ".d" + std::to_string(k); + num_threads_ *= params_.at(first).at("mts" + suffix)->get_value(); + num_threads_ *= get_num_warps(first, k); } // number of warps int num_warps = 1; - for(size_t k = 0; k < grids.front()->get_type()->get_tile_shapes().size(); k++) - num_warps *= params_[grids.front()]["p2.d" + to_string(k)]->get_value(); + for(size_t k = 0; k < first->get_type()->get_tile_shapes().size(); k++) + num_warps *= get_num_warps(first, k); // check constraints for(ir::instruction *i: grids){ @@ -226,10 +220,9 @@ for(ir::function *fn: mod.get_function_list()){ // must device the shape for(size_t k = 0; k < shapes.size(); k++) { std::string strk = to_string(k); - ir::metaparameter *mp0 = params_[i]["p0.d" + strk]; - ir::metaparameter *mp1 = params_[i]["p1.d" + strk]; - ir::metaparameter *mp2 = params_[i]["p2.d" + strk]; - unsigned multiple = mp0->get_value()*mp1->get_value()*mp2->get_value(); + ir::metaparameter *mts = params_[i]["mts.d" + strk]; + ir::metaparameter *nts = params_[i]["nts.d" + strk]; + unsigned multiple = mts->get_value()*nts->get_value(); if(shapes[k]->get_value() % multiple != 0) errors[i].push_back("for dim " + strk + ": shape (" + to_string(shapes[k]->get_value()) + ")" " is not a multiple of layout (" + to_string(multiple) + ")"); @@ -237,14 +230,14 @@ for(ir::function *fn: mod.get_function_list()){ // the number of thread per warp must be 32 int num_threads = 1; for(size_t k = 0; k < shapes.size(); k++) - num_threads *= params_[i]["p1.d" + to_string(k)]->get_value(); + num_threads *= params_[i]["mts.d" + to_string(k)]->get_value(); if(num_threads != 32) errors[i].push_back("number of threads per warp (" + to_string(num_threads) + ") must be 32"); // The number of warps required by the layout is the same // for all tiles in the function int required_num_warps = 1; for(size_t k = 0; k < shapes.size(); k++) - required_num_warps *= params_[i]["p2.d" + to_string(k)]->get_value(); + required_num_warps *= get_num_warps(i, k); if(required_num_warps != num_warps) errors[i].push_back("number of warps (" + to_string(required_num_warps) + ") must be " + to_string(num_warps)); } @@ -261,10 +254,7 @@ unsigned tune::get_global_range_size(unsigned axis) { } unsigned tune::get_num_threads() { - unsigned result = 1; - for(ir::metaparameter *mp: num_threads_mp_vec_) - 
result *= mp->get_value(); - return result; + return num_threads_; } diff --git a/lib/codegen/vectorize.cpp b/lib/codegen/vectorize.cpp index 672e97dc1..e1319634b 100644 --- a/lib/codegen/vectorize.cpp +++ b/lib/codegen/vectorize.cpp @@ -16,7 +16,7 @@ void vectorize::run(ir::module &mod) { for(ir::instruction *i: block->get_inst_list()) if(dynamic_cast(i)){ ir::value *x = i->get_operand(0); - if(params_->get_param(x, "p0.d0")->get_value() == 1) + if(params_->get_param(x, "nts.d0")->get_value() == 1) continue; builder.set_insert_point(i); ir::instruction *rx = (ir::instruction*)builder.create_vectorize(x); diff --git a/lib/jit.cpp b/lib/jit.cpp index 8b1ab9886..ba42b22b2 100644 --- a/lib/jit.cpp +++ b/lib/jit.cpp @@ -86,7 +86,6 @@ std::unique_ptr jit::make_llvm_module(ir::module &module, codegen: // constraints std::map> errors; tune.check_constraints(module, errors); - std::cout << "errors: " << errors.size() << std::endl; for(auto &x: errors){ for(auto &e: x.second) std::cout << x.first->get_name() << " " << e << std::endl; @@ -150,7 +149,13 @@ void jit::autotune(ir::module &tt_module, benchmark_t benchmark) { tune.check_constraints(tt_module, errors); if(errors.size()) return; - std::cout << "valid" << std::endl; + ir::module copy(tt_module); + auto ll_module = make_llvm_module(copy, tune); + driver::module module(driver_context_, &*ll_module); + driver::kernel kernel(module, "matmul"); + launch_information info = launch_info_map_.at("matmul"); + benchmark(kernel, info); + std::cout << "benchmarked" << std::endl; }); } From 614f83baee2ae0ff0bec970992800d5967c35881 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 11 Mar 2019 12:00:50 -0400 Subject: [PATCH 106/494] [jit] basic auto-tuning support --- examples/matrix.cpp | 41 +++++++----- include/triton/codegen/tune.h | 6 +- include/triton/jit.h | 38 ++++++++++- lib/codegen/tune.cpp | 47 ++++++++------ lib/jit.cpp | 118 ++++++++++++++++------------------ 5 files changed, 151 insertions(+), 99 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index e8a169656..f08f81eb2 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -63,6 +63,24 @@ void simple_gemm(std::vector &c, const std::vector &a, const std::vector(high_resolution_clock::now() - _start); } + +private: + high_resolution_clock::time_point _start; +}; + int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); @@ -90,6 +108,7 @@ int main() { stream.write(dc, true, 0, hc); stream.synchronize(); + // benchmark a given matrix multiplication kernel auto benchmark = [&](triton::driver::kernel kernel, triton::jit::launch_information info) { @@ -103,25 +122,17 @@ int main() { unsigned TM = info.global_range_size[0]; unsigned TN = info.global_range_size[1]; unsigned nthreads = info.num_threads; + timer t; + t.start(); stream.enqueue(kernel, {(M + TM - 1)/TM, (N + TN - 1)/TN, 1}, {nthreads, 1, 1}); stream.synchronize(); - return float(0); + double ts = t.get().count()*1e-9; + double tflops = 2*M*N*K / ts * 1e-12; + std::cout << tflops << std::endl; + return ts; }; -// std::vector params = { -// // a0 -// 2, 8, 1, 16, -// // b0 -// 4, 4, 1, 16, -// // c -// 2, 4, 8, 4, 1, 1, -// // a1 -// 2, 4, 1, 8, -// // b1 -// 1, 8, 1 -// }; - // just-in-time compile source-code std::vector params = { // a0 @@ -136,8 +147,8 @@ int main() { 8, 1 }; triton::jit jit(context); - jit.add_module(src, params); jit.autotune(src, benchmark); + jit.add_module(src, params); triton::driver::kernel kernel = 
jit.get_function("matmul"); triton::jit::launch_information info = jit.get_launch_info("matmul"); benchmark(kernel, info); diff --git a/include/triton/codegen/tune.h b/include/triton/codegen/tune.h index d84ddfe5e..9fd321572 100644 --- a/include/triton/codegen/tune.h +++ b/include/triton/codegen/tune.h @@ -17,6 +17,8 @@ namespace ir{ namespace codegen{ +class place_shared_copy; + class tune { typedef std::pair node_t; typedef std::map > graph_t; @@ -35,8 +37,9 @@ public: std::map get_params(ir::instruction* i); ir::metaparameter* get_param(ir::value *value, const std::string &key) { return params_[value][key]; } void copy(ir::value *dst, ir::value *src) { params_[dst] = params_[src]; } - bool check_constraints(ir::module &fn, std::map> &errors); + bool check_constraints(std::map> &errors); void run(ir::module &mod); + void init(ir::module &mod); unsigned get_num_global_range(); unsigned get_global_range_size(unsigned axis); unsigned get_num_threads(); @@ -50,6 +53,7 @@ private: std::map global_range_sizes_; unsigned num_global_ranges_; unsigned num_threads_; + std::vector grids_; }; diff --git a/include/triton/jit.h b/include/triton/jit.h index db446a42e..a01c43685 100644 --- a/include/triton/jit.h +++ b/include/triton/jit.h @@ -7,6 +7,14 @@ #include "triton/ir/context.h" #include "triton/driver/module.h" #include "triton/driver/kernel.h" +#include "triton/codegen/selection.h" +#include "triton/codegen/tune.h" +#include "triton/codegen/shared_copy.h" +#include "triton/codegen/allocation.h" +#include "triton/codegen/liveness.h" +#include "triton/codegen/vectorize.h" +#include "triton/codegen/buffer_info.h" +#include "triton/codegen/barriers.h" #include namespace llvm { @@ -33,14 +41,40 @@ public: }; typedef std::function benchmark_t; + struct passes_wrapper { + passes_wrapper(): shared(&buffer_info), liveness(&buffer_info), + allocation(&liveness, &buffer_info), + barriers(&allocation, &buffer_info), + vectorize(&tune), + selection(&allocation, &tune, &buffer_info){ } + + void init(ir::module &module) { + // generate ptx + buffer_info.run(module); + shared.run(module); + liveness.run(module); + allocation.run(); + barriers.run(module); + vectorize.run(module); + } + + codegen::tune tune; + codegen::buffer_info_pass buffer_info; + codegen::place_shared_copy shared; + codegen::liveness liveness; + codegen::allocation allocation; + codegen::barriers barriers; + codegen::vectorize vectorize; + codegen::selection selection; + }; + private: std::string compute_data_layout(bool is_64bit = true, bool use_short_pointers = true); - std::unique_ptr make_llvm_module(triton::ir::module &module, codegen::tune &tune); + std::unique_ptr make_llvm_module(triton::ir::module &module, passes_wrapper &passes); std::unique_ptr make_triton_module(const std::string &src); public: jit(driver::context context); - void autotune(ir::module &module, benchmark_t benchmark); void autotune(const std::string &src, benchmark_t benchmark); void add_module(ir::module &module, const std::vector& params = {}); void add_module(const std::string &src, const std::vector& params = {}); diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 703459952..5bd7b0708 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -1,4 +1,5 @@ #include "triton/codegen/tune.h" +#include "triton/codegen/shared_copy.h" #include "triton/ir/instructions.h" #include "triton/ir/type.h" #include "triton/ir/module.h" @@ -143,13 +144,37 @@ void tune::run(ir::module &mod) { // Layout parameters while(!nodes_.empty()){ ir::type *ty = 
mod.get_builder().get_int32_ty(); - ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 2, 2); + ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 2, 4); ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 4, 8); connected_components(*nodes_.begin(), {nts, mts}, nodes_, dependencies_); } } } +void tune::init(ir::module &mod) { + for(ir::function *fn: mod.get_function_list()){ + // initialize grids + std::map references; + create_grids(grids_, references, fn); + } + // number of warps + auto get_num_warps = [&](ir::instruction *i, unsigned axis) { + std::string strk = std::to_string(axis); + unsigned mts = params_[i]["mts.d" + strk]->get_value(); + unsigned nts = params_[i]["nts.d" + strk]->get_value(); + unsigned shape = i->get_type()->get_tile_shapes()[axis]->get_value(); + return shape / (mts * nts); + }; + // number of threads + num_threads_ = 1; + ir::instruction *first = grids_.front(); + for(unsigned k = 0; k < first->get_type()->get_tile_shapes().size(); k++){ + std::string suffix = ".d" + std::to_string(k); + num_threads_ *= params_.at(first).at("mts" + suffix)->get_value(); + num_threads_ *= get_num_warps(first, k); + } +} + void tune::create_grids(std::vector &grids, std::map &references, ir::function *fn) { @@ -182,15 +207,9 @@ void tune::create_grids(std::vector &grids, } -bool tune::check_constraints(ir::module &mod, std::map> &errors) { -for(ir::function *fn: mod.get_function_list()){ +bool tune::check_constraints(std::map> &errors) { using std::to_string; - // initialize grids - std::map references; - std::vector grids; - create_grids(grids, references, fn); - auto get_num_warps = [&](ir::instruction *i, unsigned axis) { std::string strk = to_string(axis); unsigned mts = params_[i]["mts.d" + strk]->get_value(); @@ -199,21 +218,14 @@ for(ir::function *fn: mod.get_function_list()){ return shape / (mts * nts); }; - num_threads_ = 1; - ir::instruction *first = grids.front(); - for(unsigned k = 0; k < first->get_type()->get_tile_shapes().size(); k++){ - std::string suffix = ".d" + std::to_string(k); - num_threads_ *= params_.at(first).at("mts" + suffix)->get_value(); - num_threads_ *= get_num_warps(first, k); - } - // number of warps + ir::instruction *first = grids_.front(); int num_warps = 1; for(size_t k = 0; k < first->get_type()->get_tile_shapes().size(); k++) num_warps *= get_num_warps(first, k); // check constraints - for(ir::instruction *i: grids){ + for(ir::instruction *i: grids_){ ir::type *ty = i->get_type(); const auto &shapes = ty->get_tile_shapes(); // for each dimension, the product of layout components @@ -243,7 +255,6 @@ for(ir::function *fn: mod.get_function_list()){ } return errors.empty(); } -} unsigned tune::get_num_global_range() { return num_global_ranges_; diff --git a/lib/jit.cpp b/lib/jit.cpp index ba42b22b2..64a73710c 100644 --- a/lib/jit.cpp +++ b/lib/jit.cpp @@ -1,16 +1,9 @@ -#include "triton/jit.h" +#include "triton/jit.h" #include #include "triton/ast/ast.h" #include "triton/ir/context.h" #include "triton/ir/context_impl.h" -#include "triton/codegen/selection.h" -#include "triton/codegen/tune.h" -#include "triton/codegen/shared_copy.h" -#include "triton/codegen/allocation.h" -#include "triton/codegen/liveness.h" -#include "triton/codegen/vectorize.h" -#include "triton/codegen/buffer_info.h" -#include "triton/codegen/barriers.h" +#include "triton/driver/device.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/Module.h" #include "llvm/IR/LLVMContext.h" @@ -71,42 +64,15 @@ void loop_nest(std::vector> const & iterates, 
std::function jit::make_llvm_module(ir::module &module, codegen::tune & tune) { + +std::unique_ptr jit::make_llvm_module(ir::module &module, passes_wrapper &passes) { llvm::Module* result = new llvm::Module("matmul", llvm_context_); - - // create passes - codegen::buffer_info_pass buffer_info; - codegen::place_shared_copy shared(&buffer_info); - codegen::liveness liveness(&buffer_info); - codegen::allocation allocation(&liveness, &buffer_info); - codegen::barriers barriers(&allocation, &buffer_info); - codegen::vectorize vectorize(&tune); - codegen::selection selection(&allocation, &tune, &buffer_info); - - // constraints - std::map> errors; - tune.check_constraints(module, errors); - for(auto &x: errors){ - for(auto &e: x.second) - std::cout << x.first->get_name() << " " << e << std::endl; - } - if(errors.size()) - exit(EXIT_FAILURE); - - // generate ptx - buffer_info.run(module); - shared.run(module); - liveness.run(module); - allocation.run(); - barriers.run(module); - vectorize.run(module); - selection.run(module, *result); - + passes.selection.run(module, *result); // launch information auto &launch_info_map = launch_info_map_[result->getName()]; - for(unsigned i = 0; i < tune.get_num_global_range(); i++) - launch_info_map.global_range_size.push_back(tune.get_global_range_size(i)); - launch_info_map.num_threads = tune.get_num_threads(); + for(unsigned i = 0; i < passes.tune.get_num_global_range(); i++) + launch_info_map.global_range_size.push_back(passes.tune.get_global_range_size(i)); + launch_info_map.num_threads = passes.tune.get_num_threads(); return std::unique_ptr(result); } @@ -127,11 +93,14 @@ jit::jit(driver::context context): driver_context_(context) { } -void jit::autotune(ir::module &tt_module, benchmark_t benchmark) { +void jit::autotune(const std::string &src, benchmark_t benchmark) { // find metaparameters - codegen::tune tune; - tune.run(tt_module); - auto mps = tune.get_params(tt_module); + auto ptt_module = make_triton_module(src); + ir::module &tt_module = *ptt_module; + // set parameters + passes_wrapper passes; + passes.tune.run(tt_module); + auto mps = passes.tune.get_params(tt_module); // create parameter ranges std::vector> ranges; for(ir::metaparameter *mp: mps){ @@ -141,39 +110,62 @@ void jit::autotune(ir::module &tt_module, benchmark_t benchmark) { ranges.push_back(current); } // iterate over parameters + unsigned i; loop_nest(ranges, [&](const std::vector params){ std::map> errors; - unsigned i = 0; + i = 0; for(ir::metaparameter *mp: mps) mp->set_value(params[i++]); - tune.check_constraints(tt_module, errors); - if(errors.size()) + passes.tune.init(tt_module); + if(!passes.tune.check_constraints(errors)) return; - ir::module copy(tt_module); - auto ll_module = make_llvm_module(copy, tune); + // Deep copy of the module and tuner + auto ptt_module = make_triton_module(src); + ir::module &tt_module = *ptt_module; + passes_wrapper passes; + passes.tune.run(tt_module); + i = 0; + for(ir::metaparameter* mp: passes.tune.get_params(tt_module)){ + mp->set_value(params[i++]); + } + passes.tune.init(tt_module); + passes.init(tt_module); + const driver::device &device = driver_context_.device(); + if(passes.allocation.get_allocated_size() > device.max_shared_memory()) + return; + if(passes.tune.get_num_threads() > device.max_threads_per_block()) + return; + // Compile + auto ll_module = make_llvm_module(tt_module, passes); driver::module module(driver_context_, &*ll_module); driver::kernel kernel(module, "matmul"); launch_information info = 
launch_info_map_.at("matmul"); + for(unsigned p: params) + std::cout << p << " " << std::flush; + std::cout << std::endl; benchmark(kernel, info); - std::cout << "benchmarked" << std::endl; }); } -void jit::autotune(const std::string &src, benchmark_t benchmark) { - auto ptt_module = make_triton_module(src); - autotune(*ptt_module, benchmark); -} - void jit::add_module(ir::module &tt_module, const std::vector ¶ms) { // set parameters - codegen::tune tune; - tune.run(tt_module); + passes_wrapper passes; + passes.tune.run(tt_module); unsigned i = 0; - for(ir::metaparameter* mp: tune.get_params(tt_module)) + for(ir::metaparameter* mp: passes.tune.get_params(tt_module)) mp->set_value(params[i++]); - // compiler to llvm - auto ll_module = make_llvm_module(tt_module, tune); - // send llvm module to driver + passes.tune.init(tt_module); + passes.init(tt_module); + // check constraints + std::map> errors; + passes.tune.check_constraints(errors); + if(errors.size()) + throw std::runtime_error("invalid parameters"); + if(passes.allocation.get_allocated_size() > driver_context_.device().max_shared_memory()) + throw std::runtime_error("invalid parameters"); + // triton module -> llvm module + auto ll_module = make_llvm_module(tt_module, passes); + // llvm module -> machine code modules_.push_back(driver::module(driver_context_, &*ll_module)); } From 87c85ed50dec3d5fdeb8ea55ef61ed2b5f355f42 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 11 Mar 2019 19:30:21 -0400 Subject: [PATCH 107/494] [code generation] reparameterization --- examples/matrix.cpp | 2 +- lib/ast/lowering.cpp | 2 +- lib/codegen/selection.cpp | 23 +++++++++++++++++++++-- lib/codegen/tune.cpp | 26 ++++++-------------------- lib/jit.cpp | 1 - 5 files changed, 29 insertions(+), 25 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index f08f81eb2..dfa64e5c3 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -86,7 +86,7 @@ int main() { auto context = triton::driver::backend::contexts::get_default(); // matrix multiplication parameters - size_t M = 128, N = 128, K = 128; + size_t M = 512, N = 512, K = 512; size_t bound = 8; std::vector hc(M*N); std::vector rc(M*N); diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 3a5b2696e..5dda59ce9 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -412,7 +412,7 @@ ir::value* initializer::codegen(ir::module * mod) const{ if(std::find(storage.begin(), storage.end(), TUNABLE_T) != storage.end()){ assert(expr_ == nullptr); //TODO: implement ranges - value = ir::metaparameter::create(mod->get_context(), ty, 8, 128); + value = ir::metaparameter::create(mod->get_context(), ty, 8, 64); } if(expr_){ value = expr_->codegen(mod); diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index fbdd33162..7523665a8 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -371,18 +371,37 @@ std::vector delinearize(Value *trailing, std::vector &shapes, return result; } +inline int32_t ceil(int32_t num, int32_t div){ + return (num + div - 1)/div; +} + +inline void to_warps(const std::vector &bs, std::vector &nw, std::vector &ws){ + static const size_t warp_size = 32; + size_t nthreads = 1, nwarps = 1; + nw.resize(bs.size()); + ws.resize(bs.size()); + for(size_t i = 0; i < bs.size(); ++i){ + nthreads *= bs[i]; + nw[i] = ceil(nthreads, nwarps*warp_size); + nwarps *= nw[i]; + } + for(size_t i = 0; i < bs.size(); ++i) + ws[i] = bs[i] / nw[i]; +} + void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id, Value 
*u_warp_id) { const auto& shapes = v->get_type()->get_tile_shapes(); size_t dim = shapes.size(); std::vector contiguous(dim); + std::vector block_size(dim); std::vector warp_size(dim); std::vector n_warps(dim); for(unsigned i = 0; i < shapes.size(); i++){ std::string str_i = std::to_string(i); contiguous[i] = params_->get_param(v, "nts.d" + str_i)->get_value(); - warp_size[i] = params_->get_param(v, "mts.d" + str_i)->get_value(); - n_warps[i] = shapes[i]->get_value() / (contiguous[i] * warp_size[i]); + block_size[i] = params_->get_param(v, "mts.d" + str_i)->get_value(); } + to_warps(block_size, n_warps, warp_size); std::vector thread_id_in_warp = delinearize(u_thread_id, warp_size, builder); std::vector warp_id = delinearize(u_warp_id, n_warps, builder); // Create axes diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 5bd7b0708..dcf817ec8 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -144,8 +144,8 @@ void tune::run(ir::module &mod) { // Layout parameters while(!nodes_.empty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 2, 4); - ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 4, 8); + ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 2, 2); + ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 4, 32); connected_components(*nodes_.begin(), {nts, mts}, nodes_, dependencies_); } } @@ -157,21 +157,12 @@ void tune::init(ir::module &mod) { std::map references; create_grids(grids_, references, fn); } - // number of warps - auto get_num_warps = [&](ir::instruction *i, unsigned axis) { - std::string strk = std::to_string(axis); - unsigned mts = params_[i]["mts.d" + strk]->get_value(); - unsigned nts = params_[i]["nts.d" + strk]->get_value(); - unsigned shape = i->get_type()->get_tile_shapes()[axis]->get_value(); - return shape / (mts * nts); - }; // number of threads num_threads_ = 1; ir::instruction *first = grids_.front(); for(unsigned k = 0; k < first->get_type()->get_tile_shapes().size(); k++){ std::string suffix = ".d" + std::to_string(k); num_threads_ *= params_.at(first).at("mts" + suffix)->get_value(); - num_threads_ *= get_num_warps(first, k); } } @@ -243,15 +234,10 @@ bool tune::check_constraints(std::map> &er int num_threads = 1; for(size_t k = 0; k < shapes.size(); k++) num_threads *= params_[i]["mts.d" + to_string(k)]->get_value(); - if(num_threads != 32) - errors[i].push_back("number of threads per warp (" + to_string(num_threads) + ") must be 32"); - // The number of warps required by the layout is the same - // for all tiles in the function - int required_num_warps = 1; - for(size_t k = 0; k < shapes.size(); k++) - required_num_warps *= get_num_warps(i, k); - if(required_num_warps != num_warps) - errors[i].push_back("number of warps (" + to_string(required_num_warps) + ") must be " + to_string(num_warps)); + if(num_threads % 32 != 0) + errors[i].push_back("number of threads per block (" + to_string(num_threads) + ") must be multiple of 32"); + if(num_threads != num_threads_) + errors[i].push_back("Number of threads must be the same for all tiles (" + to_string(num_threads_) + ")"); } return errors.empty(); } diff --git a/lib/jit.cpp b/lib/jit.cpp index 64a73710c..150ff40a6 100644 --- a/lib/jit.cpp +++ b/lib/jit.cpp @@ -142,7 +142,6 @@ void jit::autotune(const std::string &src, benchmark_t benchmark) { launch_information info = launch_info_map_.at("matmul"); for(unsigned p: params) std::cout << p << " " << std::flush; - std::cout << std::endl; 
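Aside: a minimal standalone sketch of the to_warps decomposition that patch 107 adds to lib/codegen/selection.cpp, assuming the vectors hold int32_t (the original's template arguments are not visible here). It walks the block shape dimension by dimension, peeling off warps of 32 threads, and shows why tune.cpp can relax its check to "threads per block must be a multiple of 32":

#include <cstdint>
#include <cstdio>
#include <vector>

// Mirrors the ceil() helper from the patch, renamed to avoid shadowing
// the math function.
inline int32_t ceil_div(int32_t num, int32_t div) {
  return (num + div - 1) / div;
}

// Split per-dimension block sizes bs into per-dimension warp counts nw
// and intra-warp thread counts ws, assuming 32-thread warps.
void to_warps(const std::vector<int32_t> &bs,
              std::vector<int32_t> &nw, std::vector<int32_t> &ws) {
  static const int32_t warp_size = 32;
  int32_t nthreads = 1, nwarps = 1;
  nw.resize(bs.size());
  ws.resize(bs.size());
  for (size_t i = 0; i < bs.size(); ++i) {
    nthreads *= bs[i];                            // threads covered so far
    nw[i] = ceil_div(nthreads, nwarps * warp_size);
    nwarps *= nw[i];                              // warps allocated so far
  }
  for (size_t i = 0; i < bs.size(); ++i)
    ws[i] = bs[i] / nw[i];                        // one warp's lanes per dim
}

int main() {
  // mts.d0 = 8, mts.d1 = 16: a 128-thread block, i.e. 4 warps.
  std::vector<int32_t> bs = {8, 16}, nw, ws;
  to_warps(bs, nw, ws);
  for (size_t i = 0; i < bs.size(); ++i)
    std::printf("d%zu: %d = %d warp(s) x %d lane(s)\n",
                i, (int)bs[i], (int)nw[i], (int)ws[i]);
  // d0: 8  = 1 warp(s) x 8 lane(s)
  // d1: 16 = 4 warp(s) x 4 lane(s)
  return 0;
}

Because any block whose thread count is a multiple of 32 decomposes this way, the old per-dimension "threads per warp must be 32" and fixed-warp-count checks that check_constraints used to enforce become unnecessary, which is exactly what the tune.cpp hunk above removes.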
benchmark(kernel, info); }); } From b73c3bdd25064bcd3bb87e2af6e2b89f434d3a6c Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 11 Mar 2019 22:22:43 -0400 Subject: [PATCH 108/494] [examples] removed dependency on isaac for auto-tuning --- examples/matrix.cpp | 80 +++++++++++++++++++++++++++----------- include/triton/ir/module.h | 14 ++++--- include/triton/jit.h | 4 +- lib/ast/lowering.cpp | 3 +- lib/codegen/tune.cpp | 2 +- lib/jit.cpp | 15 ++++++- 6 files changed, 86 insertions(+), 32 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index dfa64e5c3..ef5dbf36d 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -81,13 +81,37 @@ private: high_resolution_clock::time_point _start; }; +template +T min(std::vector x) +{ return *std::min_element(x.begin(), x.end()); } + + +template +double bench(OP const & op, SYNC const & sync, triton::driver::device const & device) +{ + timer tmr; + std::vector times; + double total_time = 0; + op(); + sync(); + while(total_time*1e-9 < 1e-3){ + float norm = (float)device.current_sm_clock()/device.max_sm_clock(); + tmr.start(); + op(); + sync(); + times.push_back(norm*tmr.get().count()); + total_time+=times.back(); + } + return min(times); +} + int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); + triton::jit jit(context); // matrix multiplication parameters size_t M = 512, N = 512, K = 512; - size_t bound = 8; std::vector hc(M*N); std::vector rc(M*N); std::vector ha(M*K); @@ -112,6 +136,22 @@ int main() { // benchmark a given matrix multiplication kernel auto benchmark = [&](triton::driver::kernel kernel, triton::jit::launch_information info) { + // launch info + unsigned TM = info.global_range_size[0]; + unsigned TN = info.global_range_size[1]; + unsigned nthreads = info.num_threads; + std::array grid = {(M + TM - 1)/TM, (N + TN - 1)/TN, 1}; + // fast bounds-checking + unsigned TK = jit.get_int("TK"); + unsigned lasti = (grid[0]*TM - 1)*TM + TM - 1; + unsigned lastj = (grid[1]*TN - 1)*TN + TN - 1; + unsigned lastk = TK - 1; + bool AT = false; + bool BT = true; + unsigned last_safe_a = (AT==false)?(M*K - 1 - lasti)/M - lastk : M*K - 1 - lasti*K - lastk; + unsigned last_safe_b = (BT==true)?(N*K - 1 - lastj)/N - lastk : N*K - 1 - lastj*K - lastk; + int32_t bound = std::max(1, std::max(K - last_safe_a, K - last_safe_b)); + // set argument kernel.setArg(0, da); kernel.setArg(1, db); kernel.setArg(2, dc); @@ -119,39 +159,33 @@ int main() { kernel.setArg(4, N); kernel.setArg(5, K); kernel.setArg(6, bound); - unsigned TM = info.global_range_size[0]; - unsigned TN = info.global_range_size[1]; - unsigned nthreads = info.num_threads; - timer t; - t.start(); - stream.enqueue(kernel, {(M + TM - 1)/TM, (N + TN - 1)/TN, 1}, {nthreads, 1, 1}); + // dry run + stream.enqueue(kernel, grid, {nthreads, 1, 1}); stream.synchronize(); - double ts = t.get().count()*1e-9; + // benchmark + double ts = bench([&](){stream.enqueue(kernel, grid, {nthreads, 1, 1});}, + [&](){ stream.synchronize(); }, + context.device()); + ts = ts * 1e-9; double tflops = 2*M*N*K / ts * 1e-12; - std::cout << tflops << std::endl; - return ts; + return tflops; }; // just-in-time compile source-code std::vector params = { - // a0 - 8, 2, 16, - // b0 - 4, 4, 16, - // c - 8, 4, 2, 4, - // a1 - 4, 2, 8, - // b1 - 8, 1 + 16, 2, 64, + 32, 2, 64, + 16, 8, 2, 2, + 8, 1, 8, + 4, 1 }; - triton::jit jit(context); - jit.autotune(src, benchmark); + +// jit.autotune(src, benchmark); jit.add_module(src, params); 
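Aside: the bench() helper added above scales every timing sample by current_sm_clock()/max_sm_clock(). Since runtime is roughly cycles/clock, multiplying a wall-clock sample by that ratio estimates what the same kernel would take at the maximum SM clock, making samples comparable when the GPU boosts or throttles; taking the minimum then discards samples inflated by interference. A minimal sketch under those assumptions, where clock_ratio is a hypothetical stand-in for the driver::device clock queries:

#include <algorithm>
#include <chrono>
#include <functional>
#include <vector>

// Clock-normalised min-timing loop in the style of bench() above.
// op launches the kernel, sync blocks until it finishes, clock_ratio
// returns current_sm_clock / max_sm_clock (hypothetical stand-in).
double bench_min_ns(const std::function<void()> &op,
                    const std::function<void()> &sync,
                    const std::function<double()> &clock_ratio) {
  using clk = std::chrono::high_resolution_clock;
  std::vector<double> times;
  double total = 0;
  op(); sync();                      // warm-up run, not timed
  while (total * 1e-9 < 1e-3) {      // sample until ~1 ms of kernel time
    double norm = clock_ratio();     // < 1 when the SM clock is throttled
    auto t0 = clk::now();
    op(); sync();
    double ns = std::chrono::duration<double, std::nano>(clk::now() - t0).count();
    times.push_back(norm * ns);      // estimated time at the max SM clock
    total += times.back();
  }
  return *std::min_element(times.begin(), times.end());
}

The caller then converts the returned nanoseconds to seconds and derives throughput as 2*M*N*K / ts * 1e-12 TFLOPS, exactly as the matrix.cpp hunk above does.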
triton::driver::kernel kernel = jit.get_function("matmul"); triton::jit::launch_information info = jit.get_launch_info("matmul"); - benchmark(kernel, info); + std::cout << benchmark(kernel, info) << std::endl; stream.read(dc, true, 0, hc); simple_gemm(rc, ha, hb, M, N, K); for(size_t i = 0; i < M*N; i++) diff --git a/include/triton/ir/module.h b/include/triton/ir/module.h index 4ec681f67..3d2d5afb9 100644 --- a/include/triton/ir/module.h +++ b/include/triton/ir/module.h @@ -74,12 +74,15 @@ public: functions_list_t &get_function_list() { return functions_; } function *get_or_insert_function(const std::string &name, function_type *ty); // Scope - void add_new_scope() { if(scopes_.empty()) scopes_.push(scope()); else scopes_.push(scope(get_scope())); } - void pop_scope() { scopes_.pop(); } - scope& get_scope() { return scopes_.top(); } + void add_new_scope() { if(scopes_.empty()) scopes_.push(scope()); else scopes_.push(scope(get_scope())); } + void pop_scope() { scopes_.pop(); } + scope& get_scope() { return scopes_.top(); } // Const allocation - void add_alloc(ir::alloc_const* x) { allocs_.push_back(x); } - const std::vector& allocs() { return allocs_; } + void add_alloc(ir::alloc_const* x) { allocs_.push_back(x); } + const std::vector& allocs() { return allocs_; } + // Register global + void register_global(const std::string& name, ir::value *x) { globals_[name] = x; } + const std::map& globals() const { return globals_; } private: std::string name_; @@ -96,6 +99,7 @@ private: std::map current_phi_; std::stack scopes_; std::vector allocs_; + std::map globals_; }; } diff --git a/include/triton/jit.h b/include/triton/jit.h index a01c43685..0d90d63b0 100644 --- a/include/triton/jit.h +++ b/include/triton/jit.h @@ -39,7 +39,7 @@ public: std::vector global_range_size; unsigned num_threads; }; - typedef std::function benchmark_t; + typedef std::function benchmark_t; struct passes_wrapper { passes_wrapper(): shared(&buffer_info), liveness(&buffer_info), @@ -80,6 +80,7 @@ public: void add_module(const std::string &src, const std::vector& params = {}); driver::kernel get_function(const std::string &name); launch_information get_launch_info(const std::string &name); + unsigned get_int(const std::string &name); private: std::vector modules_; @@ -87,6 +88,7 @@ private: llvm::LLVMContext llvm_context_; ir::context triton_context_; std::map launch_info_map_; + std::map global_ints_; }; diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 5dda59ce9..04d03aa99 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -412,7 +412,8 @@ ir::value* initializer::codegen(ir::module * mod) const{ if(std::find(storage.begin(), storage.end(), TUNABLE_T) != storage.end()){ assert(expr_ == nullptr); //TODO: implement ranges - value = ir::metaparameter::create(mod->get_context(), ty, 8, 64); + value = ir::metaparameter::create(mod->get_context(), ty, 8, (name=="TK")?8:64); + mod->register_global(name, value); } if(expr_){ value = expr_->codegen(mod); diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index dcf817ec8..f3a9cedfb 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -144,7 +144,7 @@ void tune::run(ir::module &mod) { // Layout parameters while(!nodes_.empty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 2, 2); + ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 1, 2); ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 4, 32); connected_components(*nodes_.begin(), {nts, mts}, nodes_, 
dependencies_); } diff --git a/lib/jit.cpp b/lib/jit.cpp index 150ff40a6..64e0865fa 100644 --- a/lib/jit.cpp +++ b/lib/jit.cpp @@ -111,6 +111,7 @@ void jit::autotune(const std::string &src, benchmark_t benchmark) { } // iterate over parameters unsigned i; + double best = 0; loop_nest(ranges, [&](const std::vector params){ std::map> errors; i = 0; @@ -142,7 +143,12 @@ void jit::autotune(const std::string &src, benchmark_t benchmark) { launch_information info = launch_info_map_.at("matmul"); for(unsigned p: params) std::cout << p << " " << std::flush; - benchmark(kernel, info); + // add globals + for(auto x: tt_module.globals()) + global_ints_[x.first] = ((ir::metaparameter*)x.second)->get_value(); + double perf = benchmark(kernel, info); + best = std::max(perf, best); + std::cout << perf << " [ " << best << " ] " << std::endl; }); } @@ -166,6 +172,9 @@ void jit::add_module(ir::module &tt_module, const std::vector ¶ms) auto ll_module = make_llvm_module(tt_module, passes); // llvm module -> machine code modules_.push_back(driver::module(driver_context_, &*ll_module)); + // add globals + for(auto x: tt_module.globals()) + global_ints_[x.first] = ((ir::metaparameter*)x.second)->get_value(); } void jit::add_module(const std::string &src, const std::vector ¶ms) { @@ -181,4 +190,8 @@ jit::launch_information jit::get_launch_info(const std::string &name) { return launch_info_map_.at(name); } +unsigned jit::get_int(const std::string &name){ + return global_ints_.at(name); +} + } From 02775a226e951e7e3813b30728f9a8d2f1708345 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 18 Mar 2019 23:12:14 -0700 Subject: [PATCH 109/494] [driver] adding opencl in the driver API --- examples/CMakeLists.txt | 2 +- examples/matrix.cpp | 17 +- include/triton/driver/backend.h | 85 +- include/triton/driver/buffer.h | 34 +- include/triton/driver/context.h | 49 +- include/triton/driver/cublas.h | 12 +- include/triton/driver/device.h | 22 +- include/triton/driver/dispatch.h | 110 +- include/triton/driver/event.h | 2 +- include/triton/driver/handle.h | 20 +- include/triton/driver/kernel.h | 32 +- include/triton/driver/module.h | 29 +- include/triton/driver/platform.h | 35 +- include/triton/driver/stream.h | 54 +- include/triton/external/CL/cl.h | 1468 ++ include/triton/external/CL/cl.hpp | 12947 ++++++++++++++++ include/triton/external/CL/cl2.hpp | 9677 ++++++++++++ include/triton/external/CL/cl_d3d10.h | 131 + include/triton/external/CL/cl_d3d11.h | 131 + .../triton/external/CL/cl_dx9_media_sharing.h | 132 + .../external/CL/cl_dx9_media_sharing_intel.h | 182 + include/triton/external/CL/cl_egl.h | 136 + include/triton/external/CL/cl_ext.h | 670 + include/triton/external/CL/cl_ext_intel.h | 429 + include/triton/external/CL/cl_gl.h | 167 + include/triton/external/CL/cl_gl_ext.h | 74 + include/triton/external/CL/cl_platform.h | 1458 ++ .../CL/cl_va_api_media_sharing_intel.h | 172 + include/triton/external/CL/opencl.h | 59 + include/triton/jit.h | 10 +- lib/driver/backend.cpp | 145 +- lib/driver/buffer.cpp | 48 +- lib/driver/context.cpp | 81 +- lib/driver/device.cpp | 105 +- lib/driver/dispatch.cpp | 129 +- lib/driver/handle.cpp | 10 +- lib/driver/kernel.cpp | 48 +- lib/driver/module.cpp | 50 +- lib/driver/platform.cpp | 34 +- lib/driver/stream.cpp | 81 +- lib/jit.cpp | 21 +- 41 files changed, 28700 insertions(+), 398 deletions(-) create mode 100644 include/triton/external/CL/cl.h create mode 100644 include/triton/external/CL/cl.hpp create mode 100644 include/triton/external/CL/cl2.hpp create mode 100644 
include/triton/external/CL/cl_d3d10.h create mode 100644 include/triton/external/CL/cl_d3d11.h create mode 100644 include/triton/external/CL/cl_dx9_media_sharing.h create mode 100644 include/triton/external/CL/cl_dx9_media_sharing_intel.h create mode 100644 include/triton/external/CL/cl_egl.h create mode 100644 include/triton/external/CL/cl_ext.h create mode 100644 include/triton/external/CL/cl_ext_intel.h create mode 100644 include/triton/external/CL/cl_gl.h create mode 100644 include/triton/external/CL/cl_gl_ext.h create mode 100644 include/triton/external/CL/cl_platform.h create mode 100644 include/triton/external/CL/cl_va_api_media_sharing_intel.h create mode 100644 include/triton/external/CL/opencl.h diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 4a235d45a..e577a1d81 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -2,5 +2,5 @@ foreach(PROG matrix) add_executable(${PROG} ${PROG}.cpp) set_target_properties(${PROG} PROPERTIES OUTPUT_NAME ${PROG}) include_directories(/usr/local/cuda/include/) - target_link_libraries(${PROG} triton cuda) + target_link_libraries(${PROG} triton) endforeach(PROG) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index ef5dbf36d..18032f247 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -87,7 +87,7 @@ T min(std::vector x) template -double bench(OP const & op, SYNC const & sync, triton::driver::device const & device) +double bench(OP const & op, SYNC const & sync, triton::driver::cu_device const & device) { timer tmr; std::vector times; @@ -108,6 +108,7 @@ double bench(OP const & op, SYNC const & sync, triton::driver::device const & de int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); + exit(EXIT_SUCCESS); triton::jit jit(context); // matrix multiplication parameters @@ -123,10 +124,10 @@ int main() { hb[i] = 1; for(size_t i = 0; i < hc.size(); i++) hc[i] = 0; - triton::driver::buffer dc(context, hc.size()*4); - triton::driver::buffer da(context, ha.size()*4); - triton::driver::buffer db(context, hb.size()*4); - triton::driver::stream stream(context); + triton::driver::cu_buffer dc(context, hc.size()*4); + triton::driver::cu_buffer da(context, ha.size()*4); + triton::driver::cu_buffer db(context, hb.size()*4); + triton::driver::cu_stream stream(context); stream.write(da, true, 0, ha); stream.write(db, true, 0, hb); stream.write(dc, true, 0, hc); @@ -134,7 +135,7 @@ int main() { // benchmark a given matrix multiplication kernel - auto benchmark = [&](triton::driver::kernel kernel, + auto benchmark = [&](triton::driver::cu_kernel kernel, triton::jit::launch_information info) { // launch info unsigned TM = info.global_range_size[0]; @@ -165,7 +166,7 @@ int main() { // benchmark double ts = bench([&](){stream.enqueue(kernel, grid, {nthreads, 1, 1});}, [&](){ stream.synchronize(); }, - context.device()); + context->device()); ts = ts * 1e-9; double tflops = 2*M*N*K / ts * 1e-12; return tflops; @@ -183,7 +184,7 @@ int main() { // jit.autotune(src, benchmark); jit.add_module(src, params); - triton::driver::kernel kernel = jit.get_function("matmul"); + triton::driver::cu_kernel kernel = jit.get_function("matmul"); triton::jit::launch_information info = jit.get_launch_info("matmul"); std::cout << benchmark(kernel, info) << std::endl; stream.read(dc, true, 0, hc); diff --git a/include/triton/driver/backend.h b/include/triton/driver/backend.h index d08422377..d830df391 100755 --- a/include/triton/driver/backend.h +++ 
b/include/triton/driver/backend.h @@ -26,6 +26,7 @@ #include #include #include +#include "triton/driver/context.h" namespace triton @@ -44,68 +45,104 @@ class kernel; struct backend { + // platforms + class platforms + { + friend class backend; + private: + static void init(); + + public: + static void get(std::vector &results); + + private: + static std::vector cache_; + }; + + // devices + class devices + { + friend class backend; + + private: + static void init(const std::vector &platforms); + + public: + static void get(std::vector& devs); + + private: + static std::vector cache_; + }; + + // modules class modules { friend class backend; + public: static void release(); - static module& get(driver::stream const & stream, std::string const & name, std::string const &src); + static driver::module* get(driver::stream* stream, std::string const & name, std::string const &src); + private: - static std::map, module * > cache_; + static std::map, driver::module*> cache_; }; + // kernels class kernels { friend class backend; public: static void release(); - static kernel & get(driver::module const & program, std::string const & name); + static driver::kernel* get(driver::module* mod, const std::string & name); private: - static std::map, kernel * > cache_; + static std::map, driver::kernel*> cache_; }; + // contexts class contexts { friend class backend; private: - static void init(std::vector const &); + static void init(const std::vector &); static void release(); public: - static driver::context const & get_default(); - template - static driver::context const & import(T ctx) + static driver::context* get_default(); + + static driver::context* import(CUcontext ctx) { - for(driver::context const * x: cache_) - if((T)*x==ctx) - return *x; - cache_.emplace_back(new driver::context(ctx, false)); - return *cache_.back(); + for(driver::context* x: cache_){ + driver::cu_context* cu_x = (driver::cu_context*)x; + if(*cu_x->cu()==ctx) + return x; + } + cache_.emplace_back(new driver::cu_context(ctx, false)); + return cache_.back(); } - static void get(std::list &); + + static void get(std::list &); + private: - static std::list cache_; + static std::list cache_; }; + // streams class streams { friend class backend; private: - static void init(std::list const &); + static void init(std::list const &); static void release(); public: - static void get(driver::context const &, std::vector &streams); - static stream & get(driver::context const &, unsigned int id = 0); - static stream & get_default(); + static void get(driver::context*, std::vector &streams); + static driver::stream* get(driver::context*, unsigned int id = 0); + static driver::stream* get_default(); private: - static std::map< context, std::vector > cache_; + static std::map > cache_; }; static void init(); static void release(); - - static std::vector devices(); - static std::vector platforms(); - static void synchronize(driver::context const &); + static void synchronize(triton::driver::context *); static unsigned int default_device; }; diff --git a/include/triton/driver/buffer.h b/include/triton/driver/buffer.h index 351a58026..c4ca53650 100755 --- a/include/triton/driver/buffer.h +++ b/include/triton/driver/buffer.h @@ -31,21 +31,33 @@ namespace triton namespace driver { -class stream; +class cu_stream; -// Buffer -class buffer: public handle_interface +// Base +class buffer : public polymorphic_resource { +public: + buffer(driver::context* ctx, CUdeviceptr cl, bool take_ownership); + buffer(driver::context* ctx, cl_mem cl, bool 
take_ownership); + driver::context* context(); + +protected: + driver::context* context_; +}; + +// OpenCL +class ocl_buffer: public buffer { public: - buffer(driver::context const & context, size_t size); - buffer(driver::context const & context, CUdeviceptr cu, bool take_ownership); - void set_zero(stream const & queue, size_t size); - handle const & cu() const; - handle & cu(); + ocl_buffer(driver::context* context, size_t size); +}; -private: - context context_; - handle cu_; +// CUDA +class cu_buffer: public buffer +{ +public: + cu_buffer(driver::context* context, size_t size); + cu_buffer(driver::context* context, CUdeviceptr cu, bool take_ownership); + void set_zero(cu_stream const & queue, size_t size); }; } diff --git a/include/triton/driver/context.h b/include/triton/driver/context.h index fbca8c88a..842d0a82c 100755 --- a/include/triton/driver/context.h +++ b/include/triton/driver/context.h @@ -31,35 +31,50 @@ namespace triton namespace driver { -class context: public handle_interface -{ -private: +class context: public polymorphic_resource{ +protected: static std::string get_cache_path(); - static CUdevice device(CUcontext); public: - //Constructors - explicit context(CUcontext context, bool take_ownership = true); - explicit context(driver::device const & dvc); - //Accessors - driver::device const & device() const; + context(driver::device *dev, CUcontext cu, bool take_ownership); + context(driver::device *dev, cl_context cl, bool take_ownership); + driver::device* device() const; std::string const & cache_path() const; - handle const & cu() const; -private: - handle cu_; - driver::device dvc_; +protected: + driver::device* dev_; std::string cache_path_; }; -class ContextSwitcher{ +// CUDA +class cu_context: public context { public: - ContextSwitcher(driver::context const & ctx); - ~ContextSwitcher(); + class context_switcher{ + public: + context_switcher(driver::context const & ctx); + ~context_switcher(); + private: + driver::cu_context const & ctx_; + }; + private: - driver::context const & ctx_; + static CUdevice get_device_of(CUcontext); + +public: + //Constructors + cu_context(CUcontext cu, bool take_ownership = true); + cu_context(driver::device* dev); }; +// OpenCL +class ocl_context: public context { +public: + ocl_context(driver::device* dev); +}; + + + + } } diff --git a/include/triton/driver/cublas.h b/include/triton/driver/cublas.h index b58fa0856..2553dcb89 100755 --- a/include/triton/driver/cublas.h +++ b/include/triton/driver/cublas.h @@ -84,7 +84,7 @@ inline void cublasGemmEx(cublasHandle_t handle, cudaDataType cudt, cublasOperati /* Simplified API for default GEMM */ -inline void cublasGemm(DType dtype, stream& stream, char cAT, char cBT, int32_t M, int32_t N, int32_t K, scalar alpha, buffer const & A, int32_t lda, buffer const & B, int32_t ldb, scalar beta, buffer& C, int32_t ldc, cublasGemmAlgo_t* fastest = NULL, cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT){ +inline void cublasGemm(DType dtype, stream& stream, char cAT, char cBT, int32_t M, int32_t N, int32_t K, scalar alpha, cu_buffer const & A, int32_t lda, cu_buffer const & B, int32_t ldb, scalar beta, cu_buffer& C, int32_t ldc, cublasGemmAlgo_t* fastest = NULL, cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT){ ContextSwitcher ctx_switch(stream.context()); cublasHandle_t handle = dispatch::cublasHandle(stream.context()); dispatch::cublasSetStream_v2(handle, (CUstream)stream); @@ -112,7 +112,7 @@ inline cudnnTensorFormat_t format(cudnnDataType_t cutype){ } inline void cudnnConv(DType dtype, stream& stream, 
int32_t D, int32_t H, int32_t W, int32_t N, int32_t K, int32_t M, int32_t P, int32_t Q, int32_t C, int32_t T, int32_t R, int32_t S, - int32_t pad_d, int32_t pad_h, int32_t pad_w, int32_t stride_d, int32_t stride_h, int32_t stride_w, scalar alpha, buffer const & I, buffer const & F, scalar beta, buffer const & O){ + int32_t pad_d, int32_t pad_h, int32_t pad_w, int32_t stride_d, int32_t stride_h, int32_t stride_w, scalar alpha, cu_buffer const & I, cu_buffer const & F, scalar beta, cu_buffer const & O){ driver::driver::context const & ctx = stream.context(); ContextSwitcher switch_ctx(ctx); @@ -154,7 +154,7 @@ inline void cudnnConv(DType dtype, stream& stream, int32_t D, int32_t H, int32_t size_t workspace_size; dispatch::cudnnGetConvolutionForwardWorkspaceSize(handle, tI, tF, conv, tO, algo, &workspace_size); - static buffer work(ctx, 1024*1024*64); + static cu_buffer work(ctx, 1024*1024*64); CUdeviceptr twork = work; CUdeviceptr pI = I, pF = F, pO = O; dispatch::cudnnConvolutionForward(handle, alpha.data(), tI, (void*)pI, tF, (void*)pF, conv, algo, (void*)twork, workspace_size, beta.data(), tO, (void*)pO); @@ -162,7 +162,7 @@ inline void cudnnConv(DType dtype, stream& stream, int32_t D, int32_t H, int32_t inline void cudnnPool(DType dtype, stream& stream, int32_t D, int32_t H, int32_t W, int32_t N, int32_t K, int32_t M, int32_t P, int32_t Q, int32_t T, int32_t R, int32_t S, - int32_t pad_d, int32_t pad_h, int32_t pad_w, int32_t stride_d, int32_t stride_h, int32_t stride_w, scalar alpha, buffer const & I, scalar beta, buffer const & O){ + int32_t pad_d, int32_t pad_h, int32_t pad_w, int32_t stride_d, int32_t stride_h, int32_t stride_w, scalar alpha, cu_buffer const & I, scalar beta, cu_buffer const & O){ driver::driver::context const & ctx = stream.context(); ContextSwitcher switch_ctx(ctx); @@ -200,11 +200,11 @@ inline void cudnnPool(DType dtype, stream& stream, int32_t D, int32_t H, int32_t dispatch::cudnnPoolingForward(handle, desc, alpha.data(), tI, (void*)pI, beta.data(), tO, (void*)pO); } -inline void cudnnTransformTensor(driver::stream & stream, +inline void cudnnTransformTensor(driver::cu_stream & stream, DType in_dtype, DType out_dtype, cudnnTensorFormat_t in_layout, cudnnTensorFormat_t out_layout, int32_t N, int32_t C, int32_t D, int32_t H, int32_t W, - scalar alpha, driver::buffer const & I, scalar beta, driver::buffer& O) + scalar alpha, driver::cu_buffer const & I, scalar beta, driver::cu_buffer& O) { cudnnHandle_t handle = dispatch::cudnnHandle(stream.context()); dispatch::cudnnSetStream(handle, (CUstream)stream); diff --git a/include/triton/driver/device.h b/include/triton/driver/device.h index 7f64b614a..2945ab766 100755 --- a/include/triton/driver/device.h +++ b/include/triton/driver/device.h @@ -32,9 +32,20 @@ namespace triton namespace driver { -// Device -class device: public handle_interface -{ +// Base device +class device: public polymorphic_resource{ +public: + using polymorphic_resource::polymorphic_resource; +}; + +// OpenCL device +class ocl_device: public device { +public: + ocl_device(cl_device_id cl, bool take_ownership = true): device(cl, take_ownership) { } +}; + +// CUDA device +class cu_device: public device { public: //Supported architectures enum class Architecture{ @@ -61,14 +72,12 @@ private: inline nvmlDevice_t nvml_device() const; public: - device(CUdevice cu = CUdevice(), bool take_ownership = true): cu_(cu, take_ownership){} + cu_device(CUdevice cu = CUdevice(), bool take_ownership = true): device(cu, take_ownership){} //Accessors Architecture 
architecture() const; - handle const & cu() const; //Informations std::string infos() const; size_t address_bits() const; - driver::platform platform() const; std::vector max_block_dim() const; size_t max_threads_per_block() const; size_t max_shared_memory() const; @@ -87,7 +96,6 @@ public: size_t max_mem_clock() const; private: - handle cu_; std::shared_ptr> interpreted_as_; }; diff --git a/include/triton/driver/dispatch.h b/include/triton/driver/dispatch.h index 2357756d6..f579602c4 100755 --- a/include/triton/driver/dispatch.h +++ b/include/triton/driver/dispatch.h @@ -28,10 +28,11 @@ //CUDA Backend #include "triton/external/CUDA/cuda.h" -#include "triton/external/CUDA/nvrtc.h" #include "triton/external/CUDA/cublas_v2.h" #include "triton/external/CUDA/cudnn.h" #include "triton/external/CUDA/nvml.h" +#include "triton/external/CL/cl.h" +#include "triton/external/CL/cl_ext.h" //Exceptions #include @@ -42,10 +43,9 @@ namespace triton namespace driver { -class context; +class cu_context; template void check(T){} -void check(nvrtcResult err); void check(CUresult err); void check(cublasStatus_t err); void check(cudnnStatus_t err); @@ -79,14 +79,48 @@ private: } public: - static bool nvrtcinit(); + static bool clinit(); static bool nvmlinit(); static bool cuinit(); static bool cublasinit(); static bool cudnninit(); - static void release(); + //OpenCL + static cl_int clBuildProgram(cl_program, cl_uint, const cl_device_id *, const char *, void (*)(cl_program, void *), void *); + static cl_int clEnqueueNDRangeKernel(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *); + static cl_int clSetKernelArg(cl_kernel, cl_uint, size_t, const void *); + static cl_int clReleaseMemObject(cl_mem); + static cl_int clFinish(cl_command_queue); + static cl_int clGetMemObjectInfo(cl_mem, cl_mem_info, size_t, void *, size_t *); + static cl_int clGetCommandQueueInfo(cl_command_queue, cl_command_queue_info, size_t, void *, size_t *); + static cl_int clReleaseContext(cl_context); + static cl_int clReleaseEvent(cl_event); + static cl_int clEnqueueWriteBuffer(cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *); + static cl_int clEnqueueReadBuffer(cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *); + static cl_int clGetProgramBuildInfo(cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *); + static cl_int clReleaseDevice(cl_device_id); + static cl_context clCreateContext(const cl_context_properties *, cl_uint, const cl_device_id *, void (*)(const char *, const void *, size_t, void *), void *, cl_int *); + static cl_int clGetDeviceIDs(cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *); + static cl_int clGetContextInfo(cl_context, cl_context_info, size_t, void *, size_t *); + static cl_int clGetDeviceInfo(cl_device_id, cl_device_info, size_t, void *, size_t *); + static cl_int clReleaseCommandQueue(cl_command_queue); + static cl_int clGetPlatformIDs(cl_uint, cl_platform_id *, cl_uint *); + static cl_int clGetPlatformInfo(cl_platform_id, cl_platform_info, size_t, void *, size_t *); + static cl_int clGetEventProfilingInfo(cl_event, cl_profiling_info, size_t, void *, size_t *); + static cl_program clCreateProgramWithBinary(cl_context, cl_uint, const cl_device_id *, const size_t *, const unsigned char **, cl_int *, cl_int *); + static cl_command_queue clCreateCommandQueue(cl_context, cl_device_id, 
cl_command_queue_properties, cl_int *); + static cl_int clRetainEvent(cl_event); + static cl_int clReleaseProgram(cl_program); + static cl_int clFlush(cl_command_queue); + static cl_int clGetProgramInfo(cl_program, cl_program_info, size_t, void *, size_t *); + static cl_int clGetKernelInfo(cl_kernel, cl_kernel_info, size_t, void *, size_t *); + static cl_int clGetKernelWorkGroupInfo(cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *); + static cl_kernel clCreateKernel(cl_program, const char *, cl_int *); + static cl_mem clCreateBuffer(cl_context, cl_mem_flags, size_t, void *, cl_int *); + static cl_program clCreateProgramWithSource(cl_context, cl_uint, const char **, const size_t *, cl_int *); + static cl_int clReleaseKernel(cl_kernel); + //CUDA static CUresult cuCtxGetCurrent(CUcontext *pctx); static CUresult cuCtxSetCurrent(CUcontext ctx); @@ -130,14 +164,7 @@ public: static nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); - static nvrtcResult nvrtcCompileProgram(nvrtcProgram prog, int numOptions, const char **options); - static nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet); - static nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx); - static nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet); - static nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char **headers, const char **includeNames); - static nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log); - - static cublasHandle_t cublasHandle(driver::context const & ctx); + static cublasHandle_t cublasHandle(driver::cu_context const & ctx); static cublasStatus_t cublasCreate_v2(cublasHandle_t* h); static cublasStatus_t cublasGetStream_v2(cublasHandle_t h, cudaStream_t *streamId); static cublasStatus_t cublasSetStream_v2(cublasHandle_t h, cudaStream_t streamId); @@ -146,7 +173,7 @@ public: static cublasStatus_t cublasHgemm (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, half* alpha, const half *A, int lda, const half *B, int ldb, half* beta, half *C, int ldc); static cublasStatus_t cublasGemmEx(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const void *alpha, const void *A, cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb, const void *beta, void *C, cudaDataType Ctype, int ldc, cudaDataType computeType, cublasGemmAlgo_t algo); - static cudnnHandle_t cudnnHandle(driver::context const & ctx); + static cudnnHandle_t cudnnHandle(driver::cu_context const & ctx); static cudnnStatus_t cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc); static cudnnStatus_t cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t* convDesc); static cudnnStatus_t cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc); @@ -167,13 +194,50 @@ public: static cudnnStatus_t cudnnTransformTensor(cudnnHandle_t handle, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y); private: + + // Libraries + static void* opencl_; static void* cuda_; - static void* nvrtc_; static void* nvml_; static void* cublas_; static void* cudnn_; - //CUDA + // OpenCL functions + static void* clBuildProgram_; + static void* clEnqueueNDRangeKernel_; + static void* 
clSetKernelArg_; + static void* clReleaseMemObject_; + static void* clFinish_; + static void* clGetMemObjectInfo_; + static void* clGetCommandQueueInfo_; + static void* clReleaseContext_; + static void* clReleaseEvent_; + static void* clEnqueueWriteBuffer_; + static void* clEnqueueReadBuffer_; + static void* clGetProgramBuildInfo_; + static void* clReleaseDevice_; + static void* clCreateContext_; + static void* clGetDeviceIDs_; + static void* clGetContextInfo_; + static void* clGetDeviceInfo_; + static void* clReleaseCommandQueue_; + static void* clGetPlatformIDs_; + static void* clGetPlatformInfo_; + static void* clGetEventProfilingInfo_; + static void* clCreateProgramWithBinary_; + static void* clCreateCommandQueue_; + static void* clRetainEvent_; + static void* clReleaseProgram_; + static void* clFlush_; + static void* clGetProgramInfo_; + static void* clGetKernelInfo_; + static void* clGetKernelWorkGroupInfo_; + static void* clCreateKernel_; + static void* clCreateBuffer_; + static void* clCreateProgramWithSource_; + static void* clReleaseKernel_; + + // CUDA functions static void* cuCtxGetCurrent_; static void* cuCtxSetCurrent_; static void* cuCtxDestroy_v2_; @@ -188,7 +252,6 @@ private: static void* cuDeviceGetName_; static void* cuDeviceGetPCIBusId_; static void* cuModuleGetGlobal_v2_; - static void* cuMemcpyHtoDAsync_v2_; static void* cuModuleLoad_; static void* cuLaunchKernel_; @@ -210,19 +273,12 @@ private: static void* cuMemsetD8Async_; static void* cuCtxPushCurrent_v2_; static void* cuCtxPopCurrent_v2_; - + // NVML static void* nvmlInit_v2_; static void* nvmlDeviceGetHandleByPciBusId_v2_; static void* nvmlDeviceGetClockInfo_; static void* nvmlDeviceGetMaxClockInfo_; - - static void* nvrtcCompileProgram_; - static void* nvrtcGetProgramLogSize_; - static void* nvrtcGetPTX_; - static void* nvrtcGetPTXSize_; - static void* nvrtcCreateProgram_; - static void* nvrtcGetProgramLog_; - + // cuBLAS static void* cublasCreate_v2_; static void* cublasGetStream_v2_; static void* cublasSetStream_v2_; @@ -230,7 +286,7 @@ private: static void* cublasSgemm_v2_; static void* cublasDgemm_v2_; static void* cublasGemmEx_; - + // cuDNN static void* cudnnCreateConvolutionDescriptor_; static void* cudnnCreatePoolingDescriptor_; static void* cudnnCreateTensorDescriptor_; diff --git a/include/triton/driver/event.h b/include/triton/driver/event.h index 41cb4fdf4..3343ba6e3 100755 --- a/include/triton/driver/event.h +++ b/include/triton/driver/event.h @@ -32,7 +32,7 @@ namespace driver { // Event -class Event: public handle_interface +class Event { public: float elapsed_time() const; diff --git a/include/triton/driver/handle.h b/include/triton/driver/handle.h index c7241cb41..3bffea395 100755 --- a/include/triton/driver/handle.h +++ b/include/triton/driver/handle.h @@ -41,8 +41,8 @@ struct cu_event_t{ CUevent second; }; -struct cu_platform{ - cu_platform() : status_(dispatch::cuInit(0)) { } +struct CUPlatform{ + CUPlatform() : status_(dispatch::cuInit(0)) { } operator bool() const { return status_; } private: CUresult status_; @@ -76,6 +76,22 @@ protected: bool has_ownership_; }; +template +class polymorphic_resource { +public: + polymorphic_resource(CUType cu, bool take_ownership): cu_(cu, take_ownership){} + polymorphic_resource(CLType cl, bool take_ownership): cl_(cl, take_ownership){} + + handle cu() { return cu_; } + handle cl() { return cl_; } + const handle& cu() const { return cu_; } + const handle& cl() const { return cl_; } + +protected: + handle cl_; + handle cu_; +}; + } } diff --git 
a/include/triton/driver/kernel.h b/include/triton/driver/kernel.h index c4fc207d4..6a8f114f4 100755 --- a/include/triton/driver/kernel.h +++ b/include/triton/driver/kernel.h @@ -34,28 +34,38 @@ namespace triton namespace driver { -class buffer; +class cu_buffer; -// Kernel -class kernel: public handle_interface -{ +// Base +class kernel: public polymorphic_resource { +public: + kernel(driver::module* program, CUfunction fn, bool has_ownership); + kernel(driver::module* program, cl_kernel fn, bool has_ownership); + driver::module* module(); + +private: + driver::module* program_; +}; + +// OpenCL +class ocl_kernel: public kernel { +}; + +// CUDA +class cu_kernel: public kernel { public: //Constructors - kernel(driver::module const & program, const char * name); - //Accessors - handle const & cu() const; - driver::module const & module() const; + cu_kernel(driver::module* program, const char * name); //Arguments setters void setArg(unsigned int index, std::size_t size, void* ptr); - void setArg(unsigned int index, buffer const &); + void setArg(unsigned int index, cu_buffer const &); template void setArg(unsigned int index, T value) { setArg(index, sizeof(T), (void*)&value); } //Arguments getters void* const* cu_params() const; private: handle cu_; - driver::module program_; - unsigned int address_bits_; + driver::cu_module* program_; std::vector > cu_params_store_; std::vector cu_params_; }; diff --git a/include/triton/driver/module.h b/include/triton/driver/module.h index f69db71d6..ef45243fd 100755 --- a/include/triton/driver/module.h +++ b/include/triton/driver/module.h @@ -39,25 +39,30 @@ namespace triton namespace driver { -class context; -class device; +class cu_context; +class cu_device; -class module: public handle_interface -{ - static std::string header(device const & device); +class module: public polymorphic_resource { +public: + module(driver::context* ctx, CUmodule mod, bool has_ownership); + module(driver::context* ctx, cl_program mod, bool has_ownership); + driver::context* context() const; + +protected: + driver::context* ctx_; +}; + +class cu_module: public module { + static std::string header(driver::cu_device const & device); std::string compile_llvm_module(llvm::Module* module); void init_llvm(); public: - module(driver::context const & context, llvm::Module *module); - module(driver::context const & context, const std::string& source); - driver::context const & context() const; - handle const & cu() const; - buffer symbol(const char * name) const; + cu_module(driver::context* context, llvm::Module *module); + cu_module(driver::context* context, const std::string& source); + cu_buffer symbol(const char * name) const; private: - handle cu_; - driver::context context_; std::string source_; }; diff --git a/include/triton/driver/platform.h b/include/triton/driver/platform.h index 5ab7d8d28..9ea1e9f3b 100755 --- a/include/triton/driver/platform.h +++ b/include/triton/driver/platform.h @@ -39,12 +39,37 @@ class device; class platform { public: - //Accessors - std::string name() const { return "CUDA"; } - std::string version() const; - std::vector devices() const; + // Constructor + platform(const std::string& name): name_(name){ } + // Accessors + std::string name() const { return name_; } + // Virtual methods + virtual std::string version() const = 0; + virtual void devices(std::vector &devices) const = 0; private: - handle cu_; + std::string name_; +}; + +class cu_platform: public platform +{ +public: + cu_platform(): platform("CUDA") { } + std::string version() const; + 
void devices(std::vector &devices) const; + +private: + handle cu_; +}; + +class cl_platform: public platform +{ +public: + cl_platform(cl_platform_id cl): platform("OpenCL"), cl_(cl) { } + std::string version() const; + void devices(std::vector &devices) const; + +private: + handle cl_; }; } diff --git a/include/triton/driver/stream.h b/include/triton/driver/stream.h index c420fa45f..cb2ae7d4d 100755 --- a/include/triton/driver/stream.h +++ b/include/triton/driver/stream.h @@ -35,43 +35,55 @@ namespace triton namespace driver { -class kernel; +class cu_kernel; class Event; class Range; -class buffer; +class cu_buffer; -// Command Queue -class stream: public handle_interface -{ +// Base +class stream: public polymorphic_resource { +public: + stream(driver::context *ctx, CUstream, bool has_ownership); + stream(driver::context *ctx, cl_command_queue, bool has_ownership); + driver::context* context() const; + virtual void synchronize() = 0; + +protected: + driver::context *ctx_; +}; + +// OpenCL +class cl_stream: public stream { +public: + // Constructors + cl_stream(driver::context *ctx); + + // Synchronize + void synchronize(); +}; + +// CUDA +class cu_stream: public stream { public: //Constructors - stream(CUstream stream, bool take_ownership); - stream(driver::context const & context); - - //Accessors - handle const & cu() const; - driver::context const & context() const; + cu_stream(CUstream str, bool take_ownership); + cu_stream(driver::context* context); //Synchronize void synchronize(); //Enqueue - void enqueue(kernel const & kernel, std::array grid, std::array block, std::vector const * = NULL, Event *event = NULL); + void enqueue(cu_kernel const & cu_kernel, std::array grid, std::array block, std::vector const * = NULL, Event *event = NULL); // Write - void write(driver::buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr); - - template void write(driver::buffer const & buffer, bool blocking, std::size_t offset, std::vector const & x) + void write(driver::cu_buffer const & cu_buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr); + template void write(driver::cu_buffer const & buffer, bool blocking, std::size_t offset, std::vector const & x) { write(buffer, blocking, offset, x.size()*sizeof(T), x.data()); } // Read - void read(driver::buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr); - - template void read(driver::buffer const & buffer, bool blocking, std::size_t offset, std::vector& x) + void read(driver::cu_buffer const & cu_buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr); + template void read(driver::cu_buffer const & buffer, bool blocking, std::size_t offset, std::vector& x) { read(buffer, blocking, offset, x.size()*sizeof(T), x.data()); } -private: - driver::context context_; - handle cu_; }; diff --git a/include/triton/external/CL/cl.h b/include/triton/external/CL/cl.h new file mode 100644 index 000000000..1e164eb83 --- /dev/null +++ b/include/triton/external/CL/cl.h @@ -0,0 +1,1468 @@ +/******************************************************************************* + * Copyright (c) 2008-2015 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +#ifndef __OPENCL_CL_H +#define __OPENCL_CL_H + +#ifdef __APPLE__ +#include +#else +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/******************************************************************************/ + +typedef struct _cl_platform_id * cl_platform_id; +typedef struct _cl_device_id * cl_device_id; +typedef struct _cl_context * cl_context; +typedef struct _cl_command_queue * cl_command_queue; +typedef struct _cl_mem * cl_mem; +typedef struct _cl_program * cl_program; +typedef struct _cl_kernel * cl_kernel; +typedef struct _cl_event * cl_event; +typedef struct _cl_sampler * cl_sampler; + +typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. 
*/ +typedef cl_ulong cl_bitfield; +typedef cl_bitfield cl_device_type; +typedef cl_uint cl_platform_info; +typedef cl_uint cl_device_info; +typedef cl_bitfield cl_device_fp_config; +typedef cl_uint cl_device_mem_cache_type; +typedef cl_uint cl_device_local_mem_type; +typedef cl_bitfield cl_device_exec_capabilities; +typedef cl_bitfield cl_device_svm_capabilities; +typedef cl_bitfield cl_command_queue_properties; +typedef intptr_t cl_device_partition_property; +typedef cl_bitfield cl_device_affinity_domain; + +typedef intptr_t cl_context_properties; +typedef cl_uint cl_context_info; +typedef cl_bitfield cl_queue_properties; +typedef cl_uint cl_command_queue_info; +typedef cl_uint cl_channel_order; +typedef cl_uint cl_channel_type; +typedef cl_bitfield cl_mem_flags; +typedef cl_bitfield cl_svm_mem_flags; +typedef cl_uint cl_mem_object_type; +typedef cl_uint cl_mem_info; +typedef cl_bitfield cl_mem_migration_flags; +typedef cl_uint cl_image_info; +typedef cl_uint cl_buffer_create_type; +typedef cl_uint cl_addressing_mode; +typedef cl_uint cl_filter_mode; +typedef cl_uint cl_sampler_info; +typedef cl_bitfield cl_map_flags; +typedef intptr_t cl_pipe_properties; +typedef cl_uint cl_pipe_info; +typedef cl_uint cl_program_info; +typedef cl_uint cl_program_build_info; +typedef cl_uint cl_program_binary_type; +typedef cl_int cl_build_status; +typedef cl_uint cl_kernel_info; +typedef cl_uint cl_kernel_arg_info; +typedef cl_uint cl_kernel_arg_address_qualifier; +typedef cl_uint cl_kernel_arg_access_qualifier; +typedef cl_bitfield cl_kernel_arg_type_qualifier; +typedef cl_uint cl_kernel_work_group_info; +typedef cl_uint cl_kernel_sub_group_info; +typedef cl_uint cl_event_info; +typedef cl_uint cl_command_type; +typedef cl_uint cl_profiling_info; +typedef cl_bitfield cl_sampler_properties; +typedef cl_uint cl_kernel_exec_info; + +typedef struct _cl_image_format { + cl_channel_order image_channel_order; + cl_channel_type image_channel_data_type; +} cl_image_format; + +typedef struct _cl_image_desc { + cl_mem_object_type image_type; + size_t image_width; + size_t image_height; + size_t image_depth; + size_t image_array_size; + size_t image_row_pitch; + size_t image_slice_pitch; + cl_uint num_mip_levels; + cl_uint num_samples; +#ifdef __GNUC__ + __extension__ /* Prevents warnings about anonymous union in -pedantic builds */ +#endif + union { + cl_mem buffer; + cl_mem mem_object; + }; +} cl_image_desc; + +typedef struct _cl_buffer_region { + size_t origin; + size_t size; +} cl_buffer_region; + + +/******************************************************************************/ + +/* Error Codes */ +#define CL_SUCCESS 0 +#define CL_DEVICE_NOT_FOUND -1 +#define CL_DEVICE_NOT_AVAILABLE -2 +#define CL_COMPILER_NOT_AVAILABLE -3 +#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4 +#define CL_OUT_OF_RESOURCES -5 +#define CL_OUT_OF_HOST_MEMORY -6 +#define CL_PROFILING_INFO_NOT_AVAILABLE -7 +#define CL_MEM_COPY_OVERLAP -8 +#define CL_IMAGE_FORMAT_MISMATCH -9 +#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10 +#define CL_BUILD_PROGRAM_FAILURE -11 +#define CL_MAP_FAILURE -12 +#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13 +#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14 +#define CL_COMPILE_PROGRAM_FAILURE -15 +#define CL_LINKER_NOT_AVAILABLE -16 +#define CL_LINK_PROGRAM_FAILURE -17 +#define CL_DEVICE_PARTITION_FAILED -18 +#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19 + +#define CL_INVALID_VALUE -30 +#define CL_INVALID_DEVICE_TYPE -31 +#define CL_INVALID_PLATFORM -32 +#define CL_INVALID_DEVICE -33 +#define CL_INVALID_CONTEXT -34 
+#define CL_INVALID_QUEUE_PROPERTIES -35 +#define CL_INVALID_COMMAND_QUEUE -36 +#define CL_INVALID_HOST_PTR -37 +#define CL_INVALID_MEM_OBJECT -38 +#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39 +#define CL_INVALID_IMAGE_SIZE -40 +#define CL_INVALID_SAMPLER -41 +#define CL_INVALID_BINARY -42 +#define CL_INVALID_BUILD_OPTIONS -43 +#define CL_INVALID_PROGRAM -44 +#define CL_INVALID_PROGRAM_EXECUTABLE -45 +#define CL_INVALID_KERNEL_NAME -46 +#define CL_INVALID_KERNEL_DEFINITION -47 +#define CL_INVALID_KERNEL -48 +#define CL_INVALID_ARG_INDEX -49 +#define CL_INVALID_ARG_VALUE -50 +#define CL_INVALID_ARG_SIZE -51 +#define CL_INVALID_KERNEL_ARGS -52 +#define CL_INVALID_WORK_DIMENSION -53 +#define CL_INVALID_WORK_GROUP_SIZE -54 +#define CL_INVALID_WORK_ITEM_SIZE -55 +#define CL_INVALID_GLOBAL_OFFSET -56 +#define CL_INVALID_EVENT_WAIT_LIST -57 +#define CL_INVALID_EVENT -58 +#define CL_INVALID_OPERATION -59 +#define CL_INVALID_GL_OBJECT -60 +#define CL_INVALID_BUFFER_SIZE -61 +#define CL_INVALID_MIP_LEVEL -62 +#define CL_INVALID_GLOBAL_WORK_SIZE -63 +#define CL_INVALID_PROPERTY -64 +#define CL_INVALID_IMAGE_DESCRIPTOR -65 +#define CL_INVALID_COMPILER_OPTIONS -66 +#define CL_INVALID_LINKER_OPTIONS -67 +#define CL_INVALID_DEVICE_PARTITION_COUNT -68 +#define CL_INVALID_PIPE_SIZE -69 +#define CL_INVALID_DEVICE_QUEUE -70 +#define CL_INVALID_SPEC_ID -71 +#define CL_MAX_SIZE_RESTRICTION_EXCEEDED -72 + +/* OpenCL Version */ +#define CL_VERSION_1_0 1 +#define CL_VERSION_1_1 1 +#define CL_VERSION_1_2 1 +#define CL_VERSION_2_0 1 +#define CL_VERSION_2_1 1 +#define CL_VERSION_2_2 1 + +/* cl_bool */ +#define CL_FALSE 0 +#define CL_TRUE 1 +#define CL_BLOCKING CL_TRUE +#define CL_NON_BLOCKING CL_FALSE + +/* cl_platform_info */ +#define CL_PLATFORM_PROFILE 0x0900 +#define CL_PLATFORM_VERSION 0x0901 +#define CL_PLATFORM_NAME 0x0902 +#define CL_PLATFORM_VENDOR 0x0903 +#define CL_PLATFORM_EXTENSIONS 0x0904 +#define CL_PLATFORM_HOST_TIMER_RESOLUTION 0x0905 + +/* cl_device_type - bitfield */ +#define CL_DEVICE_TYPE_DEFAULT (1 << 0) +#define CL_DEVICE_TYPE_CPU (1 << 1) +#define CL_DEVICE_TYPE_GPU (1 << 2) +#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3) +#define CL_DEVICE_TYPE_CUSTOM (1 << 4) +#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF + +/* cl_device_info */ +#define CL_DEVICE_TYPE 0x1000 +#define CL_DEVICE_VENDOR_ID 0x1001 +#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002 +#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003 +#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004 +#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B +#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C +#define CL_DEVICE_ADDRESS_BITS 0x100D +#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E +#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F +#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010 +#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011 +#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012 +#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013 +#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014 +#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015 +#define CL_DEVICE_IMAGE_SUPPORT 0x1016 +#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017 +#define CL_DEVICE_MAX_SAMPLERS 0x1018 +#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019 +#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A +#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B 
+#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C +#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D +#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E +#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F +#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020 +#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021 +#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022 +#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023 +#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024 +#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025 +#define CL_DEVICE_ENDIAN_LITTLE 0x1026 +#define CL_DEVICE_AVAILABLE 0x1027 +#define CL_DEVICE_COMPILER_AVAILABLE 0x1028 +#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029 +#define CL_DEVICE_QUEUE_PROPERTIES 0x102A /* deprecated */ +#define CL_DEVICE_QUEUE_ON_HOST_PROPERTIES 0x102A +#define CL_DEVICE_NAME 0x102B +#define CL_DEVICE_VENDOR 0x102C +#define CL_DRIVER_VERSION 0x102D +#define CL_DEVICE_PROFILE 0x102E +#define CL_DEVICE_VERSION 0x102F +#define CL_DEVICE_EXTENSIONS 0x1030 +#define CL_DEVICE_PLATFORM 0x1031 +#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 +#define CL_DEVICE_HALF_FP_CONFIG 0x1033 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034 +#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035 /* deprecated */ +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C +#define CL_DEVICE_OPENCL_C_VERSION 0x103D +#define CL_DEVICE_LINKER_AVAILABLE 0x103E +#define CL_DEVICE_BUILT_IN_KERNELS 0x103F +#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040 +#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041 +#define CL_DEVICE_PARENT_DEVICE 0x1042 +#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043 +#define CL_DEVICE_PARTITION_PROPERTIES 0x1044 +#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045 +#define CL_DEVICE_PARTITION_TYPE 0x1046 +#define CL_DEVICE_REFERENCE_COUNT 0x1047 +#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048 +#define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049 +#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A +#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B +#define CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS 0x104C +#define CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE 0x104D +#define CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES 0x104E +#define CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE 0x104F +#define CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE 0x1050 +#define CL_DEVICE_MAX_ON_DEVICE_QUEUES 0x1051 +#define CL_DEVICE_MAX_ON_DEVICE_EVENTS 0x1052 +#define CL_DEVICE_SVM_CAPABILITIES 0x1053 +#define CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE 0x1054 +#define CL_DEVICE_MAX_PIPE_ARGS 0x1055 +#define CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS 0x1056 +#define CL_DEVICE_PIPE_MAX_PACKET_SIZE 0x1057 +#define CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT 0x1058 +#define CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT 0x1059 +#define CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT 0x105A +#define CL_DEVICE_IL_VERSION 0x105B +#define CL_DEVICE_MAX_NUM_SUB_GROUPS 0x105C +#define CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS 0x105D + +/* cl_device_fp_config - bitfield */ +#define CL_FP_DENORM (1 << 0) +#define CL_FP_INF_NAN (1 << 1) +#define CL_FP_ROUND_TO_NEAREST (1 << 2) +#define CL_FP_ROUND_TO_ZERO (1 << 3) +#define CL_FP_ROUND_TO_INF (1 << 4) +#define CL_FP_FMA (1 << 5) +#define CL_FP_SOFT_FLOAT (1 << 6) +#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT (1 << 7) + +/* cl_device_mem_cache_type */ 
+#define CL_NONE 0x0 +#define CL_READ_ONLY_CACHE 0x1 +#define CL_READ_WRITE_CACHE 0x2 + +/* cl_device_local_mem_type */ +#define CL_LOCAL 0x1 +#define CL_GLOBAL 0x2 + +/* cl_device_exec_capabilities - bitfield */ +#define CL_EXEC_KERNEL (1 << 0) +#define CL_EXEC_NATIVE_KERNEL (1 << 1) + +/* cl_command_queue_properties - bitfield */ +#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0) +#define CL_QUEUE_PROFILING_ENABLE (1 << 1) +#define CL_QUEUE_ON_DEVICE (1 << 2) +#define CL_QUEUE_ON_DEVICE_DEFAULT (1 << 3) + +/* cl_context_info */ +#define CL_CONTEXT_REFERENCE_COUNT 0x1080 +#define CL_CONTEXT_DEVICES 0x1081 +#define CL_CONTEXT_PROPERTIES 0x1082 +#define CL_CONTEXT_NUM_DEVICES 0x1083 + +/* cl_context_properties */ +#define CL_CONTEXT_PLATFORM 0x1084 +#define CL_CONTEXT_INTEROP_USER_SYNC 0x1085 + +/* cl_device_partition_property */ +#define CL_DEVICE_PARTITION_EQUALLY 0x1086 +#define CL_DEVICE_PARTITION_BY_COUNTS 0x1087 +#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END 0x0 +#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN 0x1088 + +/* cl_device_affinity_domain */ +#define CL_DEVICE_AFFINITY_DOMAIN_NUMA (1 << 0) +#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE (1 << 1) +#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE (1 << 2) +#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE (1 << 3) +#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE (1 << 4) +#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5) + +/* cl_device_svm_capabilities */ +#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER (1 << 0) +#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER (1 << 1) +#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM (1 << 2) +#define CL_DEVICE_SVM_ATOMICS (1 << 3) + +/* cl_command_queue_info */ +#define CL_QUEUE_CONTEXT 0x1090 +#define CL_QUEUE_DEVICE 0x1091 +#define CL_QUEUE_REFERENCE_COUNT 0x1092 +#define CL_QUEUE_PROPERTIES 0x1093 +#define CL_QUEUE_SIZE 0x1094 +#define CL_QUEUE_DEVICE_DEFAULT 0x1095 + +/* cl_mem_flags and cl_svm_mem_flags - bitfield */ +#define CL_MEM_READ_WRITE (1 << 0) +#define CL_MEM_WRITE_ONLY (1 << 1) +#define CL_MEM_READ_ONLY (1 << 2) +#define CL_MEM_USE_HOST_PTR (1 << 3) +#define CL_MEM_ALLOC_HOST_PTR (1 << 4) +#define CL_MEM_COPY_HOST_PTR (1 << 5) +/* reserved (1 << 6) */ +#define CL_MEM_HOST_WRITE_ONLY (1 << 7) +#define CL_MEM_HOST_READ_ONLY (1 << 8) +#define CL_MEM_HOST_NO_ACCESS (1 << 9) +#define CL_MEM_SVM_FINE_GRAIN_BUFFER (1 << 10) /* used by cl_svm_mem_flags only */ +#define CL_MEM_SVM_ATOMICS (1 << 11) /* used by cl_svm_mem_flags only */ +#define CL_MEM_KERNEL_READ_AND_WRITE (1 << 12) + +/* cl_mem_migration_flags - bitfield */ +#define CL_MIGRATE_MEM_OBJECT_HOST (1 << 0) +#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED (1 << 1) + +/* cl_channel_order */ +#define CL_R 0x10B0 +#define CL_A 0x10B1 +#define CL_RG 0x10B2 +#define CL_RA 0x10B3 +#define CL_RGB 0x10B4 +#define CL_RGBA 0x10B5 +#define CL_BGRA 0x10B6 +#define CL_ARGB 0x10B7 +#define CL_INTENSITY 0x10B8 +#define CL_LUMINANCE 0x10B9 +#define CL_Rx 0x10BA +#define CL_RGx 0x10BB +#define CL_RGBx 0x10BC +#define CL_DEPTH 0x10BD +#define CL_DEPTH_STENCIL 0x10BE +#define CL_sRGB 0x10BF +#define CL_sRGBx 0x10C0 +#define CL_sRGBA 0x10C1 +#define CL_sBGRA 0x10C2 +#define CL_ABGR 0x10C3 + +/* cl_channel_type */ +#define CL_SNORM_INT8 0x10D0 +#define CL_SNORM_INT16 0x10D1 +#define CL_UNORM_INT8 0x10D2 +#define CL_UNORM_INT16 0x10D3 +#define CL_UNORM_SHORT_565 0x10D4 +#define CL_UNORM_SHORT_555 0x10D5 +#define CL_UNORM_INT_101010 0x10D6 +#define CL_SIGNED_INT8 0x10D7 +#define CL_SIGNED_INT16 0x10D8 +#define CL_SIGNED_INT32 0x10D9 +#define CL_UNSIGNED_INT8 0x10DA +#define 
CL_UNSIGNED_INT16 0x10DB +#define CL_UNSIGNED_INT32 0x10DC +#define CL_HALF_FLOAT 0x10DD +#define CL_FLOAT 0x10DE +#define CL_UNORM_INT24 0x10DF +#define CL_UNORM_INT_101010_2 0x10E0 + +/* cl_mem_object_type */ +#define CL_MEM_OBJECT_BUFFER 0x10F0 +#define CL_MEM_OBJECT_IMAGE2D 0x10F1 +#define CL_MEM_OBJECT_IMAGE3D 0x10F2 +#define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3 +#define CL_MEM_OBJECT_IMAGE1D 0x10F4 +#define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5 +#define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6 +#define CL_MEM_OBJECT_PIPE 0x10F7 + +/* cl_mem_info */ +#define CL_MEM_TYPE 0x1100 +#define CL_MEM_FLAGS 0x1101 +#define CL_MEM_SIZE 0x1102 +#define CL_MEM_HOST_PTR 0x1103 +#define CL_MEM_MAP_COUNT 0x1104 +#define CL_MEM_REFERENCE_COUNT 0x1105 +#define CL_MEM_CONTEXT 0x1106 +#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107 +#define CL_MEM_OFFSET 0x1108 +#define CL_MEM_USES_SVM_POINTER 0x1109 + +/* cl_image_info */ +#define CL_IMAGE_FORMAT 0x1110 +#define CL_IMAGE_ELEMENT_SIZE 0x1111 +#define CL_IMAGE_ROW_PITCH 0x1112 +#define CL_IMAGE_SLICE_PITCH 0x1113 +#define CL_IMAGE_WIDTH 0x1114 +#define CL_IMAGE_HEIGHT 0x1115 +#define CL_IMAGE_DEPTH 0x1116 +#define CL_IMAGE_ARRAY_SIZE 0x1117 +#define CL_IMAGE_BUFFER 0x1118 +#define CL_IMAGE_NUM_MIP_LEVELS 0x1119 +#define CL_IMAGE_NUM_SAMPLES 0x111A + +/* cl_pipe_info */ +#define CL_PIPE_PACKET_SIZE 0x1120 +#define CL_PIPE_MAX_PACKETS 0x1121 + +/* cl_addressing_mode */ +#define CL_ADDRESS_NONE 0x1130 +#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131 +#define CL_ADDRESS_CLAMP 0x1132 +#define CL_ADDRESS_REPEAT 0x1133 +#define CL_ADDRESS_MIRRORED_REPEAT 0x1134 + +/* cl_filter_mode */ +#define CL_FILTER_NEAREST 0x1140 +#define CL_FILTER_LINEAR 0x1141 + +/* cl_sampler_info */ +#define CL_SAMPLER_REFERENCE_COUNT 0x1150 +#define CL_SAMPLER_CONTEXT 0x1151 +#define CL_SAMPLER_NORMALIZED_COORDS 0x1152 +#define CL_SAMPLER_ADDRESSING_MODE 0x1153 +#define CL_SAMPLER_FILTER_MODE 0x1154 +#define CL_SAMPLER_MIP_FILTER_MODE 0x1155 +#define CL_SAMPLER_LOD_MIN 0x1156 +#define CL_SAMPLER_LOD_MAX 0x1157 + +/* cl_map_flags - bitfield */ +#define CL_MAP_READ (1 << 0) +#define CL_MAP_WRITE (1 << 1) +#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2) + +/* cl_program_info */ +#define CL_PROGRAM_REFERENCE_COUNT 0x1160 +#define CL_PROGRAM_CONTEXT 0x1161 +#define CL_PROGRAM_NUM_DEVICES 0x1162 +#define CL_PROGRAM_DEVICES 0x1163 +#define CL_PROGRAM_SOURCE 0x1164 +#define CL_PROGRAM_BINARY_SIZES 0x1165 +#define CL_PROGRAM_BINARIES 0x1166 +#define CL_PROGRAM_NUM_KERNELS 0x1167 +#define CL_PROGRAM_KERNEL_NAMES 0x1168 +#define CL_PROGRAM_IL 0x1169 +#define CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT 0x116A +#define CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT 0x116B + +/* cl_program_build_info */ +#define CL_PROGRAM_BUILD_STATUS 0x1181 +#define CL_PROGRAM_BUILD_OPTIONS 0x1182 +#define CL_PROGRAM_BUILD_LOG 0x1183 +#define CL_PROGRAM_BINARY_TYPE 0x1184 +#define CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE 0x1185 + +/* cl_program_binary_type */ +#define CL_PROGRAM_BINARY_TYPE_NONE 0x0 +#define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT 0x1 +#define CL_PROGRAM_BINARY_TYPE_LIBRARY 0x2 +#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE 0x4 + +/* cl_build_status */ +#define CL_BUILD_SUCCESS 0 +#define CL_BUILD_NONE -1 +#define CL_BUILD_ERROR -2 +#define CL_BUILD_IN_PROGRESS -3 + +/* cl_kernel_info */ +#define CL_KERNEL_FUNCTION_NAME 0x1190 +#define CL_KERNEL_NUM_ARGS 0x1191 +#define CL_KERNEL_REFERENCE_COUNT 0x1192 +#define CL_KERNEL_CONTEXT 0x1193 +#define CL_KERNEL_PROGRAM 0x1194 +#define CL_KERNEL_ATTRIBUTES 0x1195 +#define 
CL_KERNEL_MAX_NUM_SUB_GROUPS 0x11B9 +#define CL_KERNEL_COMPILE_NUM_SUB_GROUPS 0x11BA + +/* cl_kernel_arg_info */ +#define CL_KERNEL_ARG_ADDRESS_QUALIFIER 0x1196 +#define CL_KERNEL_ARG_ACCESS_QUALIFIER 0x1197 +#define CL_KERNEL_ARG_TYPE_NAME 0x1198 +#define CL_KERNEL_ARG_TYPE_QUALIFIER 0x1199 +#define CL_KERNEL_ARG_NAME 0x119A + +/* cl_kernel_arg_address_qualifier */ +#define CL_KERNEL_ARG_ADDRESS_GLOBAL 0x119B +#define CL_KERNEL_ARG_ADDRESS_LOCAL 0x119C +#define CL_KERNEL_ARG_ADDRESS_CONSTANT 0x119D +#define CL_KERNEL_ARG_ADDRESS_PRIVATE 0x119E + +/* cl_kernel_arg_access_qualifier */ +#define CL_KERNEL_ARG_ACCESS_READ_ONLY 0x11A0 +#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY 0x11A1 +#define CL_KERNEL_ARG_ACCESS_READ_WRITE 0x11A2 +#define CL_KERNEL_ARG_ACCESS_NONE 0x11A3 + +/* cl_kernel_arg_type_qualifier */ +#define CL_KERNEL_ARG_TYPE_NONE 0 +#define CL_KERNEL_ARG_TYPE_CONST (1 << 0) +#define CL_KERNEL_ARG_TYPE_RESTRICT (1 << 1) +#define CL_KERNEL_ARG_TYPE_VOLATILE (1 << 2) +#define CL_KERNEL_ARG_TYPE_PIPE (1 << 3) + +/* cl_kernel_work_group_info */ +#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0 +#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1 +#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2 +#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3 +#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4 +#define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5 + +/* cl_kernel_sub_group_info */ +#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE 0x2033 +#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE 0x2034 +#define CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT 0x11B8 + +/* cl_kernel_exec_info */ +#define CL_KERNEL_EXEC_INFO_SVM_PTRS 0x11B6 +#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM 0x11B7 + +/* cl_event_info */ +#define CL_EVENT_COMMAND_QUEUE 0x11D0 +#define CL_EVENT_COMMAND_TYPE 0x11D1 +#define CL_EVENT_REFERENCE_COUNT 0x11D2 +#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3 +#define CL_EVENT_CONTEXT 0x11D4 + +/* cl_command_type */ +#define CL_COMMAND_NDRANGE_KERNEL 0x11F0 +#define CL_COMMAND_TASK 0x11F1 +#define CL_COMMAND_NATIVE_KERNEL 0x11F2 +#define CL_COMMAND_READ_BUFFER 0x11F3 +#define CL_COMMAND_WRITE_BUFFER 0x11F4 +#define CL_COMMAND_COPY_BUFFER 0x11F5 +#define CL_COMMAND_READ_IMAGE 0x11F6 +#define CL_COMMAND_WRITE_IMAGE 0x11F7 +#define CL_COMMAND_COPY_IMAGE 0x11F8 +#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9 +#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA +#define CL_COMMAND_MAP_BUFFER 0x11FB +#define CL_COMMAND_MAP_IMAGE 0x11FC +#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD +#define CL_COMMAND_MARKER 0x11FE +#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF +#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200 +#define CL_COMMAND_READ_BUFFER_RECT 0x1201 +#define CL_COMMAND_WRITE_BUFFER_RECT 0x1202 +#define CL_COMMAND_COPY_BUFFER_RECT 0x1203 +#define CL_COMMAND_USER 0x1204 +#define CL_COMMAND_BARRIER 0x1205 +#define CL_COMMAND_MIGRATE_MEM_OBJECTS 0x1206 +#define CL_COMMAND_FILL_BUFFER 0x1207 +#define CL_COMMAND_FILL_IMAGE 0x1208 +#define CL_COMMAND_SVM_FREE 0x1209 +#define CL_COMMAND_SVM_MEMCPY 0x120A +#define CL_COMMAND_SVM_MEMFILL 0x120B +#define CL_COMMAND_SVM_MAP 0x120C +#define CL_COMMAND_SVM_UNMAP 0x120D + +/* command execution status */ +#define CL_COMPLETE 0x0 +#define CL_RUNNING 0x1 +#define CL_SUBMITTED 0x2 +#define CL_QUEUED 0x3 + +/* cl_buffer_create_type */ +#define CL_BUFFER_CREATE_TYPE_REGION 0x1220 + +/* cl_profiling_info */ +#define CL_PROFILING_COMMAND_QUEUED 0x1280 +#define CL_PROFILING_COMMAND_SUBMIT 0x1281 +#define CL_PROFILING_COMMAND_START 0x1282 +#define CL_PROFILING_COMMAND_END 0x1283 +#define 
CL_PROFILING_COMMAND_COMPLETE 0x1284 + +/********************************************************************************************************/ + +/* Platform API */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformIDs(cl_uint /* num_entries */, + cl_platform_id * /* platforms */, + cl_uint * /* num_platforms */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformInfo(cl_platform_id /* platform */, + cl_platform_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Device APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceIDs(cl_platform_id /* platform */, + cl_device_type /* device_type */, + cl_uint /* num_entries */, + cl_device_id * /* devices */, + cl_uint * /* num_devices */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceInfo(cl_device_id /* device */, + cl_device_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateSubDevices(cl_device_id /* in_device */, + const cl_device_partition_property * /* properties */, + cl_uint /* num_devices */, + cl_device_id * /* out_devices */, + cl_uint * /* num_devices_ret */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseDevice(cl_device_id /* device */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetDefaultDeviceCommandQueue(cl_context /* context */, + cl_device_id /* device */, + cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_2_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceAndHostTimer(cl_device_id /* device */, + cl_ulong* /* device_timestamp */, + cl_ulong* /* host_timestamp */) CL_API_SUFFIX__VERSION_2_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetHostTimer(cl_device_id /* device */, + cl_ulong * /* host_timestamp */) CL_API_SUFFIX__VERSION_2_1; + + +/* Context APIs */ +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContext(const cl_context_properties * /* properties */, + cl_uint /* num_devices */, + const cl_device_id * /* devices */, + void (CL_CALLBACK * /* pfn_notify */)(const char *, const void *, size_t, void *), + void * /* user_data */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContextFromType(const cl_context_properties * /* properties */, + cl_device_type /* device_type */, + void (CL_CALLBACK * /* pfn_notify*/ )(const char *, const void *, size_t, void *), + void * /* user_data */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseContext(cl_context /* context */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetContextInfo(cl_context /* context */, + cl_context_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Command Queue APIs */ +extern CL_API_ENTRY cl_command_queue CL_API_CALL +clCreateCommandQueueWithProperties(cl_context /* context */, + cl_device_id /* device */, + const cl_queue_properties * /* properties */, + 
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseCommandQueue(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetCommandQueueInfo(cl_command_queue /* command_queue */, + cl_command_queue_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Memory Object APIs */ +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateBuffer(cl_context /* context */, + cl_mem_flags /* flags */, + size_t /* size */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateSubBuffer(cl_mem /* buffer */, + cl_mem_flags /* flags */, + cl_buffer_create_type /* buffer_create_type */, + const void * /* buffer_create_info */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateImage(cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + const cl_image_desc * /* image_desc */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreatePipe(cl_context /* context */, + cl_mem_flags /* flags */, + cl_uint /* pipe_packet_size */, + cl_uint /* pipe_max_packets */, + const cl_pipe_properties * /* properties */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseMemObject(cl_mem /* memobj */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSupportedImageFormats(cl_context /* context */, + cl_mem_flags /* flags */, + cl_mem_object_type /* image_type */, + cl_uint /* num_entries */, + cl_image_format * /* image_formats */, + cl_uint * /* num_image_formats */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetMemObjectInfo(cl_mem /* memobj */, + cl_mem_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetImageInfo(cl_mem /* image */, + cl_image_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPipeInfo(cl_mem /* pipe */, + cl_pipe_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_2_0; + + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetMemObjectDestructorCallback(cl_mem /* memobj */, + void (CL_CALLBACK * /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/), + void * /*user_data */ ) CL_API_SUFFIX__VERSION_1_1; + +/* SVM Allocation APIs */ +extern CL_API_ENTRY void * CL_API_CALL +clSVMAlloc(cl_context /* context */, + cl_svm_mem_flags /* flags */, + size_t /* size */, + cl_uint /* alignment */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY void CL_API_CALL +clSVMFree(cl_context /* context */, + void * /* svm_pointer */) CL_API_SUFFIX__VERSION_2_0; + +/* Sampler APIs */ +extern CL_API_ENTRY cl_sampler CL_API_CALL 
+clCreateSamplerWithProperties(cl_context /* context */, + const cl_sampler_properties * /* normalized_coords */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseSampler(cl_sampler /* sampler */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSamplerInfo(cl_sampler /* sampler */, + cl_sampler_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Program Object APIs */ +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithSource(cl_context /* context */, + cl_uint /* count */, + const char ** /* strings */, + const size_t * /* lengths */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithBinary(cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const size_t * /* lengths */, + const unsigned char ** /* binaries */, + cl_int * /* binary_status */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithBuiltInKernels(cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* kernel_names */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithIL(cl_context /* context */, + const void* /* il */, + size_t /* length */, + cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_2_1; + + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseProgram(cl_program /* program */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clBuildProgram(cl_program /* program */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCompileProgram(cl_program /* program */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + cl_uint /* num_input_headers */, + const cl_program * /* input_headers */, + const char ** /* header_include_names */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_program CL_API_CALL +clLinkProgram(cl_context /* context */, + cl_uint /* num_devices */, + const cl_device_id * /* device_list */, + const char * /* options */, + cl_uint /* num_input_programs */, + const cl_program * /* input_programs */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */, + cl_int * /* errcode_ret */ ) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetProgramReleaseCallback(cl_program /* program */, + void (CL_CALLBACK * /* pfn_notify */)(cl_program /* program */, void * /* user_data */), + void * /* user_data */) CL_API_SUFFIX__VERSION_2_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetProgramSpecializationConstant(cl_program /* program */, + cl_uint /* spec_id */, + 
size_t /* spec_size */, + const void* /* spec_value */) CL_API_SUFFIX__VERSION_2_2; + + +extern CL_API_ENTRY cl_int CL_API_CALL +clUnloadPlatformCompiler(cl_platform_id /* platform */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramInfo(cl_program /* program */, + cl_program_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramBuildInfo(cl_program /* program */, + cl_device_id /* device */, + cl_program_build_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Kernel Object APIs */ +extern CL_API_ENTRY cl_kernel CL_API_CALL +clCreateKernel(cl_program /* program */, + const char * /* kernel_name */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateKernelsInProgram(cl_program /* program */, + cl_uint /* num_kernels */, + cl_kernel * /* kernels */, + cl_uint * /* num_kernels_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_kernel CL_API_CALL +clCloneKernel(cl_kernel /* source_kernel */, + cl_int* /* errcode_ret */) CL_API_SUFFIX__VERSION_2_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseKernel(cl_kernel /* kernel */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArg(cl_kernel /* kernel */, + cl_uint /* arg_index */, + size_t /* arg_size */, + const void * /* arg_value */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArgSVMPointer(cl_kernel /* kernel */, + cl_uint /* arg_index */, + const void * /* arg_value */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelExecInfo(cl_kernel /* kernel */, + cl_kernel_exec_info /* param_name */, + size_t /* param_value_size */, + const void * /* param_value */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelInfo(cl_kernel /* kernel */, + cl_kernel_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelArgInfo(cl_kernel /* kernel */, + cl_uint /* arg_indx */, + cl_kernel_arg_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelWorkGroupInfo(cl_kernel /* kernel */, + cl_device_id /* device */, + cl_kernel_work_group_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelSubGroupInfo(cl_kernel /* kernel */, + cl_device_id /* device */, + cl_kernel_sub_group_info /* param_name */, + size_t /* input_value_size */, + const void* /*input_value */, + size_t /* param_value_size */, + void* /* param_value */, + size_t* /* param_value_size_ret */ ) CL_API_SUFFIX__VERSION_2_1; + + +/* Event Object APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clWaitForEvents(cl_uint /* num_events */, + const cl_event * /* event_list */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL 
+clGetEventInfo(cl_event /* event */, + cl_event_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateUserEvent(cl_context /* context */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseEvent(cl_event /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetUserEventStatus(cl_event /* event */, + cl_int /* execution_status */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetEventCallback( cl_event /* event */, + cl_int /* command_exec_callback_type */, + void (CL_CALLBACK * /* pfn_notify */)(cl_event, cl_int, void *), + void * /* user_data */) CL_API_SUFFIX__VERSION_1_1; + +/* Profiling APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventProfilingInfo(cl_event /* event */, + cl_profiling_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +/* Flush and Finish APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clFlush(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clFinish(cl_command_queue /* command_queue */) CL_API_SUFFIX__VERSION_1_0; + +/* Enqueued Commands APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_read */, + size_t /* offset */, + size_t /* size */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBufferRect(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_read */, + const size_t * /* buffer_offset */, + const size_t * /* host_offset */, + const size_t * /* region */, + size_t /* buffer_row_pitch */, + size_t /* buffer_slice_pitch */, + size_t /* host_row_pitch */, + size_t /* host_slice_pitch */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_write */, + size_t /* offset */, + size_t /* size */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBufferRect(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_write */, + const size_t * /* buffer_offset */, + const size_t * /* host_offset */, + const size_t * /* region */, + size_t /* buffer_row_pitch */, + size_t /* buffer_slice_pitch */, + size_t /* host_row_pitch */, + size_t /* host_slice_pitch */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueFillBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + const void * /* pattern */, + size_t /* pattern_size */, + size_t 
/* offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBuffer(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_buffer */, + size_t /* src_offset */, + size_t /* dst_offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBufferRect(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_buffer */, + const size_t * /* src_origin */, + const size_t * /* dst_origin */, + const size_t * /* region */, + size_t /* src_row_pitch */, + size_t /* src_slice_pitch */, + size_t /* dst_row_pitch */, + size_t /* dst_slice_pitch */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_read */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t /* row_pitch */, + size_t /* slice_pitch */, + void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_write */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t /* input_row_pitch */, + size_t /* input_slice_pitch */, + const void * /* ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueFillImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + const void * /* fill_color */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImage(cl_command_queue /* command_queue */, + cl_mem /* src_image */, + cl_mem /* dst_image */, + const size_t * /* src_origin[3] */, + const size_t * /* dst_origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImageToBuffer(cl_command_queue /* command_queue */, + cl_mem /* src_image */, + cl_mem /* dst_buffer */, + const size_t * /* src_origin[3] */, + const size_t * /* region[3] */, + size_t /* dst_offset */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBufferToImage(cl_command_queue /* command_queue */, + cl_mem /* src_buffer */, + cl_mem /* dst_image */, + size_t /* src_offset */, + const size_t * /* dst_origin[3] */, + const size_t * /* region[3] */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY void * CL_API_CALL 
+clEnqueueMapBuffer(cl_command_queue /* command_queue */, + cl_mem /* buffer */, + cl_bool /* blocking_map */, + cl_map_flags /* map_flags */, + size_t /* offset */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY void * CL_API_CALL +clEnqueueMapImage(cl_command_queue /* command_queue */, + cl_mem /* image */, + cl_bool /* blocking_map */, + cl_map_flags /* map_flags */, + const size_t * /* origin[3] */, + const size_t * /* region[3] */, + size_t * /* image_row_pitch */, + size_t * /* image_slice_pitch */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */, + cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueUnmapMemObject(cl_command_queue /* command_queue */, + cl_mem /* memobj */, + void * /* mapped_ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMigrateMemObjects(cl_command_queue /* command_queue */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_objects */, + cl_mem_migration_flags /* flags */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNDRangeKernel(cl_command_queue /* command_queue */, + cl_kernel /* kernel */, + cl_uint /* work_dim */, + const size_t * /* global_work_offset */, + const size_t * /* global_work_size */, + const size_t * /* local_work_size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNativeKernel(cl_command_queue /* command_queue */, + void (CL_CALLBACK * /*user_func*/)(void *), + void * /* args */, + size_t /* cb_args */, + cl_uint /* num_mem_objects */, + const cl_mem * /* mem_list */, + const void ** /* args_mem_loc */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMarkerWithWaitList(cl_command_queue /* command_queue */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueBarrierWithWaitList(cl_command_queue /* command_queue */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMFree(cl_command_queue /* command_queue */, + cl_uint /* num_svm_pointers */, + void *[] /* svm_pointers[] */, + void (CL_CALLBACK * /*pfn_free_func*/)(cl_command_queue /* queue */, + cl_uint /* num_svm_pointers */, + void *[] /* svm_pointers[] */, + void * /* user_data */), + void * /* user_data */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMemcpy(cl_command_queue /* command_queue */, + cl_bool /* blocking_copy */, + void * /* dst_ptr */, + const void * /* src_ptr */, + size_t /* size */, + cl_uint /* 
num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMemFill(cl_command_queue /* command_queue */, + void * /* svm_ptr */, + const void * /* pattern */, + size_t /* pattern_size */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMap(cl_command_queue /* command_queue */, + cl_bool /* blocking_map */, + cl_map_flags /* flags */, + void * /* svm_ptr */, + size_t /* size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMUnmap(cl_command_queue /* command_queue */, + void * /* svm_ptr */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMigrateMem(cl_command_queue /* command_queue */, + cl_uint /* num_svm_pointers */, + const void ** /* svm_pointers */, + const size_t * /* sizes */, + cl_mem_migration_flags /* flags */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_2_1; + + +/* Extension function access + * + * Returns the extension function address for the given function name, + * or NULL if a valid function can not be found. The client must + * check to make sure the address is not NULL, before using or + * calling the returned function address. + */ +extern CL_API_ENTRY void * CL_API_CALL +clGetExtensionFunctionAddressForPlatform(cl_platform_id /* platform */, + const char * /* func_name */) CL_API_SUFFIX__VERSION_1_2; + + +/* Deprecated OpenCL 1.1 APIs */ +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateImage2D(cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + size_t /* image_width */, + size_t /* image_height */, + size_t /* image_row_pitch */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateImage3D(cl_context /* context */, + cl_mem_flags /* flags */, + const cl_image_format * /* image_format */, + size_t /* image_width */, + size_t /* image_height */, + size_t /* image_depth */, + size_t /* image_row_pitch */, + size_t /* image_slice_pitch */, + void * /* host_ptr */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueMarker(cl_command_queue /* command_queue */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueWaitForEvents(cl_command_queue /* command_queue */, + cl_uint /* num_events */, + const cl_event * /* event_list */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueBarrier(cl_command_queue /* command_queue */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY 
CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL +clGetExtensionFunctionAddress(const char * /* func_name */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +/* Deprecated OpenCL 2.0 APIs */ +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_command_queue CL_API_CALL +clCreateCommandQueue(cl_context /* context */, + cl_device_id /* device */, + cl_command_queue_properties /* properties */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED; + + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_sampler CL_API_CALL +clCreateSampler(cl_context /* context */, + cl_bool /* normalized_coords */, + cl_addressing_mode /* addressing_mode */, + cl_filter_mode /* filter_mode */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int CL_API_CALL +clEnqueueTask(cl_command_queue /* command_queue */, + cl_kernel /* kernel */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_H */ + diff --git a/include/triton/external/CL/cl.hpp b/include/triton/external/CL/cl.hpp new file mode 100644 index 000000000..6634f8c76 --- /dev/null +++ b/include/triton/external/CL/cl.hpp @@ -0,0 +1,12947 @@ +/******************************************************************************* + * Copyright (c) 2008-2015 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +/*! \file + * + * \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33) and + * OpenCL 1.2 (rev 15) + * \author Benedict R. Gaster, Laurent Morichetti and Lee Howes + * + * Additions and fixes from: + * Brian Cole, March 3rd 2010 and April 2012 + * Matt Gruenke, April 2012. + * Bruce Merry, February 2013. + * Tom Deakin and Simon McIntosh-Smith, July 2013 + * + * \version 1.2.9 + * \date December 2015 + * + * Optional extension support + * + * cl + * cl_ext_device_fission + * #define USE_CL_DEVICE_FISSION + */ + +/*! 
\mainpage
+ * \section intro Introduction
+ * For many large applications C++ is the language of choice and so it seems
+ * reasonable to define C++ bindings for OpenCL.
+ *
+ *
+ * The interface is contained within a single C++ header file \em cl.hpp and all
+ * definitions are contained within the namespace \em cl. There is no additional
+ * requirement to include \em cl.h and to use either the C++ or original C
+ * bindings it is enough to simply include \em cl.hpp.
+ *
+ * The bindings themselves are lightweight and correspond closely to the
+ * underlying C API. Using the C++ bindings introduces no additional execution
+ * overhead.
+ *
+ * For detailed documentation on the bindings see:
+ *
+ * The OpenCL C++ Wrapper API 1.2 (revision 09)
+ * http://www.khronos.org/registry/cl/specs/opencl-cplusplus-1.2.pdf
+ *
+ * \section example Example
+ *
+ * The following example shows a general use case for the C++
+ * bindings, including support for the optional exception feature and
+ * also the supplied vector and string classes, see following sections for
+ * descriptions of these features.
+ *
+ * \code
+ * #define __CL_ENABLE_EXCEPTIONS
+ *
+ * #if defined(__APPLE__) || defined(__MACOSX)
+ * #include <OpenCL/cl.hpp>
+ * #else
+ * #include <CL/cl.hpp>
+ * #endif
+ * #include <cstdio>
+ * #include <cstdlib>
+ * #include <iostream>
+ *
+ * const char * helloStr  = "__kernel void "
+ *                          "hello(void) "
+ *                          "{ "
+ *                          "  "
+ *                          "} ";
+ *
+ * int
+ * main(void)
+ * {
+ *     cl_int err = CL_SUCCESS;
+ *     try {
+ *
+ *         std::vector<cl::Platform> platforms;
+ *         cl::Platform::get(&platforms);
+ *         if (platforms.size() == 0) {
+ *             std::cout << "Platform size 0\n";
+ *             return -1;
+ *         }
+ *
+ *         cl_context_properties properties[] =
+ *             { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0};
+ *         cl::Context context(CL_DEVICE_TYPE_CPU, properties);
+ *
+ *         std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
+ *
+ *         cl::Program::Sources source(1,
+ *             std::make_pair(helloStr,strlen(helloStr)));
+ *         cl::Program program_ = cl::Program(context, source);
+ *         program_.build(devices);
+ *
+ *         cl::Kernel kernel(program_, "hello", &err);
+ *
+ *         cl::Event event;
+ *         cl::CommandQueue queue(context, devices[0], 0, &err);
+ *         queue.enqueueNDRangeKernel(
+ *             kernel,
+ *             cl::NullRange,
+ *             cl::NDRange(4,4),
+ *             cl::NullRange,
+ *             NULL,
+ *             &event);
+ *
+ *         event.wait();
+ *     }
+ *     catch (cl::Error err) {
+ *         std::cerr
+ *             << "ERROR: "
+ *             << err.what()
+ *             << "("
+ *             << err.err()
+ *             << ")"
+ *             << std::endl;
+ *     }
+ *
+ *     return EXIT_SUCCESS;
+ * }
+ *
+ * \endcode
+ *
+ */
+#ifndef CL_HPP_
+#define CL_HPP_
+
+#ifdef _WIN32
+
+#include <windows.h>
+
+#if defined(USE_DX_INTEROP)
+#include <CL/cl_d3d10.h>
+#include <CL/cl_dx9_media_sharing.h>
+#endif
+#endif // _WIN32
+
+#if defined(_MSC_VER)
+#include <malloc.h>
+#endif // _MSC_VER
+
+//
+#if defined(USE_CL_DEVICE_FISSION)
+#include <CL/cl_ext.h>
+#endif
+
+#if defined(__APPLE__) || defined(__MACOSX)
+#include <OpenCL/opencl.h>
+#else
+#include <CL/opencl.h>
+#endif // !__APPLE__
+
+#if (_MSC_VER >= 1700) || (__cplusplus >= 201103L)
+#define CL_HPP_RVALUE_REFERENCES_SUPPORTED
+#define CL_HPP_CPP11_ATOMICS_SUPPORTED
+#include <atomic>
+#endif
+
+#if (__cplusplus >= 201103L)
+#define CL_HPP_NOEXCEPT noexcept
+#else
+#define CL_HPP_NOEXCEPT
+#endif
+
+
+// To avoid accidentally taking ownership of core OpenCL types
+// such as cl_kernel constructors are made explicit
+// under OpenCL 1.2
+#if defined(CL_VERSION_1_2) && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+#define __CL_EXPLICIT_CONSTRUCTORS explicit
+#else // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+#define __CL_EXPLICIT_CONSTRUCTORS
+#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+
+// Define deprecated prefixes and suffixes to ensure compilation
+// in case they are not pre-defined
+#if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
+#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
+#if !defined(CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED)
+#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
+
+#if !defined(CL_CALLBACK)
+#define CL_CALLBACK
+#endif //CL_CALLBACK
+
+#include <utility>
+#include <limits>
+#include <iterator>
+
+#if defined(__CL_ENABLE_EXCEPTIONS)
+#include <exception>
+#endif // #if defined(__CL_ENABLE_EXCEPTIONS)
+
+#if !defined(__NO_STD_VECTOR)
+#include <vector>
+#endif
+
+#if !defined(__NO_STD_STRING)
+#include <string>
+#endif
+
+#if defined(__ANDROID__) || defined(linux) || defined(__APPLE__) || defined(__MACOSX)
+#include <alloca.h>
+#endif // linux
+
+#include <cstring>
+
+
+/*! \namespace cl
+ *
+ * \brief The OpenCL C++ bindings are defined within this namespace.
+ *
+ */
+namespace cl {
+
+class Memory;
+
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
+#define __INIT_CL_EXT_FCN_PTR(name) \
+    if(!pfn_##name) { \
+        pfn_##name = (PFN_##name) \
+            clGetExtensionFunctionAddress(#name); \
+        if(!pfn_##name) { \
+        } \
+    }
+#endif // #if defined(CL_VERSION_1_1)
+
+#if defined(CL_VERSION_1_2)
+#define __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, name) \
+    if(!pfn_##name) { \
+        pfn_##name = (PFN_##name) \
+            clGetExtensionFunctionAddressForPlatform(platform, #name); \
+        if(!pfn_##name) { \
+        } \
+    }
+#endif // #if defined(CL_VERSION_1_1)
+
+class Program;
+class Device;
+class Context;
+class CommandQueue;
+class Memory;
+class Buffer;
+
+#if defined(__CL_ENABLE_EXCEPTIONS)
+/*! \brief Exception class
+ *
+ *  This may be thrown by API functions when __CL_ENABLE_EXCEPTIONS is defined.
+ */
+class Error : public std::exception
+{
+private:
+    cl_int err_;
+    const char * errStr_;
+public:
+    /*! \brief Create a new CL error exception for a given error code
+     *  and corresponding message.
+     *
+     *  \param err error code value.
+     *
+     *  \param errStr a descriptive string that must remain in scope until
+     *                handling of the exception has concluded. If set, it
+     *                will be returned by what().
+     */
+    Error(cl_int err, const char * errStr = NULL) : err_(err), errStr_(errStr)
+    {}
+
+    ~Error() throw() {}
+
+    /*! \brief Get error string associated with exception
+     *
+     * \return A memory pointer to the error message string.
+     */
+    virtual const char * what() const throw ()
+    {
+        if (errStr_ == NULL) {
+            return "empty";
+        }
+        else {
+            return errStr_;
+        }
+    }
+
+    /*! \brief Get error code associated with exception
+     *
+     *  \return The error code.
+     */
+    cl_int err(void) const { return err_; }
+};
+
+#define __ERR_STR(x) #x
+#else
+#define __ERR_STR(x) NULL
+#endif // __CL_ENABLE_EXCEPTIONS
+
+
+namespace detail
+{
+#if defined(__CL_ENABLE_EXCEPTIONS)
+static inline cl_int errHandler (
+    cl_int err,
+    const char * errStr = NULL)
+{
+    if (err != CL_SUCCESS) {
+        throw Error(err, errStr);
+    }
+    return err;
+}
+#else
+static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
+{
+    (void) errStr; // suppress unused variable warning
+    return err;
+}
+#endif // __CL_ENABLE_EXCEPTIONS
+}
+
+
+
+//!
\cond DOXYGEN_DETAIL +#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS) +#define __GET_DEVICE_INFO_ERR __ERR_STR(clGetDeviceInfo) +#define __GET_PLATFORM_INFO_ERR __ERR_STR(clGetPlatformInfo) +#define __GET_DEVICE_IDS_ERR __ERR_STR(clGetDeviceIDs) +#define __GET_PLATFORM_IDS_ERR __ERR_STR(clGetPlatformIDs) +#define __GET_CONTEXT_INFO_ERR __ERR_STR(clGetContextInfo) +#define __GET_EVENT_INFO_ERR __ERR_STR(clGetEventInfo) +#define __GET_EVENT_PROFILE_INFO_ERR __ERR_STR(clGetEventProfileInfo) +#define __GET_MEM_OBJECT_INFO_ERR __ERR_STR(clGetMemObjectInfo) +#define __GET_IMAGE_INFO_ERR __ERR_STR(clGetImageInfo) +#define __GET_SAMPLER_INFO_ERR __ERR_STR(clGetSamplerInfo) +#define __GET_KERNEL_INFO_ERR __ERR_STR(clGetKernelInfo) +#if defined(CL_VERSION_1_2) +#define __GET_KERNEL_ARG_INFO_ERR __ERR_STR(clGetKernelArgInfo) +#endif // #if defined(CL_VERSION_1_2) +#define __GET_KERNEL_WORK_GROUP_INFO_ERR __ERR_STR(clGetKernelWorkGroupInfo) +#define __GET_PROGRAM_INFO_ERR __ERR_STR(clGetProgramInfo) +#define __GET_PROGRAM_BUILD_INFO_ERR __ERR_STR(clGetProgramBuildInfo) +#define __GET_COMMAND_QUEUE_INFO_ERR __ERR_STR(clGetCommandQueueInfo) + +#define __CREATE_CONTEXT_ERR __ERR_STR(clCreateContext) +#define __CREATE_CONTEXT_FROM_TYPE_ERR __ERR_STR(clCreateContextFromType) +#define __GET_SUPPORTED_IMAGE_FORMATS_ERR __ERR_STR(clGetSupportedImageFormats) + +#define __CREATE_BUFFER_ERR __ERR_STR(clCreateBuffer) +#define __COPY_ERR __ERR_STR(cl::copy) +#define __CREATE_SUBBUFFER_ERR __ERR_STR(clCreateSubBuffer) +#define __CREATE_GL_BUFFER_ERR __ERR_STR(clCreateFromGLBuffer) +#define __CREATE_GL_RENDER_BUFFER_ERR __ERR_STR(clCreateFromGLBuffer) +#define __GET_GL_OBJECT_INFO_ERR __ERR_STR(clGetGLObjectInfo) +#if defined(CL_VERSION_1_2) +#define __CREATE_IMAGE_ERR __ERR_STR(clCreateImage) +#define __CREATE_GL_TEXTURE_ERR __ERR_STR(clCreateFromGLTexture) +#define __IMAGE_DIMENSION_ERR __ERR_STR(Incorrect image dimensions) +#endif // #if defined(CL_VERSION_1_2) +#define __CREATE_SAMPLER_ERR __ERR_STR(clCreateSampler) +#define __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR __ERR_STR(clSetMemObjectDestructorCallback) + +#define __CREATE_USER_EVENT_ERR __ERR_STR(clCreateUserEvent) +#define __SET_USER_EVENT_STATUS_ERR __ERR_STR(clSetUserEventStatus) +#define __SET_EVENT_CALLBACK_ERR __ERR_STR(clSetEventCallback) +#define __WAIT_FOR_EVENTS_ERR __ERR_STR(clWaitForEvents) + +#define __CREATE_KERNEL_ERR __ERR_STR(clCreateKernel) +#define __SET_KERNEL_ARGS_ERR __ERR_STR(clSetKernelArg) +#define __CREATE_PROGRAM_WITH_SOURCE_ERR __ERR_STR(clCreateProgramWithSource) +#define __CREATE_PROGRAM_WITH_BINARY_ERR __ERR_STR(clCreateProgramWithBinary) +#if defined(CL_VERSION_1_2) +#define __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR __ERR_STR(clCreateProgramWithBuiltInKernels) +#endif // #if defined(CL_VERSION_1_2) +#define __BUILD_PROGRAM_ERR __ERR_STR(clBuildProgram) +#if defined(CL_VERSION_1_2) +#define __COMPILE_PROGRAM_ERR __ERR_STR(clCompileProgram) +#define __LINK_PROGRAM_ERR __ERR_STR(clLinkProgram) +#endif // #if defined(CL_VERSION_1_2) +#define __CREATE_KERNELS_IN_PROGRAM_ERR __ERR_STR(clCreateKernelsInProgram) + +#define __CREATE_COMMAND_QUEUE_ERR __ERR_STR(clCreateCommandQueue) +#define __SET_COMMAND_QUEUE_PROPERTY_ERR __ERR_STR(clSetCommandQueueProperty) +#define __ENQUEUE_READ_BUFFER_ERR __ERR_STR(clEnqueueReadBuffer) +#define __ENQUEUE_READ_BUFFER_RECT_ERR __ERR_STR(clEnqueueReadBufferRect) +#define __ENQUEUE_WRITE_BUFFER_ERR __ERR_STR(clEnqueueWriteBuffer) +#define __ENQUEUE_WRITE_BUFFER_RECT_ERR 
__ERR_STR(clEnqueueWriteBufferRect) +#define __ENQEUE_COPY_BUFFER_ERR __ERR_STR(clEnqueueCopyBuffer) +#define __ENQEUE_COPY_BUFFER_RECT_ERR __ERR_STR(clEnqueueCopyBufferRect) +#define __ENQUEUE_FILL_BUFFER_ERR __ERR_STR(clEnqueueFillBuffer) +#define __ENQUEUE_READ_IMAGE_ERR __ERR_STR(clEnqueueReadImage) +#define __ENQUEUE_WRITE_IMAGE_ERR __ERR_STR(clEnqueueWriteImage) +#define __ENQUEUE_COPY_IMAGE_ERR __ERR_STR(clEnqueueCopyImage) +#define __ENQUEUE_FILL_IMAGE_ERR __ERR_STR(clEnqueueFillImage) +#define __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR __ERR_STR(clEnqueueCopyImageToBuffer) +#define __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR __ERR_STR(clEnqueueCopyBufferToImage) +#define __ENQUEUE_MAP_BUFFER_ERR __ERR_STR(clEnqueueMapBuffer) +#define __ENQUEUE_MAP_IMAGE_ERR __ERR_STR(clEnqueueMapImage) +#define __ENQUEUE_UNMAP_MEM_OBJECT_ERR __ERR_STR(clEnqueueUnMapMemObject) +#define __ENQUEUE_NDRANGE_KERNEL_ERR __ERR_STR(clEnqueueNDRangeKernel) +#define __ENQUEUE_TASK_ERR __ERR_STR(clEnqueueTask) +#define __ENQUEUE_NATIVE_KERNEL __ERR_STR(clEnqueueNativeKernel) +#if defined(CL_VERSION_1_2) +#define __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR __ERR_STR(clEnqueueMigrateMemObjects) +#endif // #if defined(CL_VERSION_1_2) + +#define __ENQUEUE_ACQUIRE_GL_ERR __ERR_STR(clEnqueueAcquireGLObjects) +#define __ENQUEUE_RELEASE_GL_ERR __ERR_STR(clEnqueueReleaseGLObjects) + + +#define __RETAIN_ERR __ERR_STR(Retain Object) +#define __RELEASE_ERR __ERR_STR(Release Object) +#define __FLUSH_ERR __ERR_STR(clFlush) +#define __FINISH_ERR __ERR_STR(clFinish) +#define __VECTOR_CAPACITY_ERR __ERR_STR(Vector capacity error) + +/** + * CL 1.2 version that uses device fission. + */ +#if defined(CL_VERSION_1_2) +#define __CREATE_SUB_DEVICES __ERR_STR(clCreateSubDevices) +#else +#define __CREATE_SUB_DEVICES __ERR_STR(clCreateSubDevicesEXT) +#endif // #if defined(CL_VERSION_1_2) + +/** + * Deprecated APIs for 1.2 + */ +#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) +#define __ENQUEUE_MARKER_ERR __ERR_STR(clEnqueueMarker) +#define __ENQUEUE_WAIT_FOR_EVENTS_ERR __ERR_STR(clEnqueueWaitForEvents) +#define __ENQUEUE_BARRIER_ERR __ERR_STR(clEnqueueBarrier) +#define __UNLOAD_COMPILER_ERR __ERR_STR(clUnloadCompiler) +#define __CREATE_GL_TEXTURE_2D_ERR __ERR_STR(clCreateFromGLTexture2D) +#define __CREATE_GL_TEXTURE_3D_ERR __ERR_STR(clCreateFromGLTexture3D) +#define __CREATE_IMAGE2D_ERR __ERR_STR(clCreateImage2D) +#define __CREATE_IMAGE3D_ERR __ERR_STR(clCreateImage3D) +#endif // #if defined(CL_VERSION_1_1) + +#endif // __CL_USER_OVERRIDE_ERROR_STRINGS +//! \endcond + +/** + * CL 1.2 marker and barrier commands + */ +#if defined(CL_VERSION_1_2) +#define __ENQUEUE_MARKER_WAIT_LIST_ERR __ERR_STR(clEnqueueMarkerWithWaitList) +#define __ENQUEUE_BARRIER_WAIT_LIST_ERR __ERR_STR(clEnqueueBarrierWithWaitList) +#endif // #if defined(CL_VERSION_1_2) + +#if !defined(__USE_DEV_STRING) && !defined(__NO_STD_STRING) +typedef std::string STRING_CLASS; +#elif !defined(__USE_DEV_STRING) + +/*! \class string + * \brief Simple string class, that provides a limited subset of std::string + * functionality but avoids many of the issues that come with that class. + + * \note Deprecated. Please use std::string as default or + * re-define the string class to match the std::string + * interface by defining STRING_CLASS + */ +class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED string +{ +private: + ::size_t size_; + char * str_; +public: + //! \brief Constructs an empty string, allocating no memory. 
+ string(void) : size_(0), str_(NULL) + { + } + + /*! \brief Constructs a string populated from an arbitrary value of + * specified size. + * + * An extra '\0' is added, in case none was contained in str. + * + * \param str the initial value of the string instance. Note that '\0' + * characters receive no special treatment. If NULL, + * the string is left empty, with a size of 0. + * + * \param size the number of characters to copy from str. + */ + string(const char * str, ::size_t size) : + size_(size), + str_(NULL) + { + if( size > 0 ) { + str_ = new char[size_+1]; + if (str_ != NULL) { + memcpy(str_, str, size_ * sizeof(char)); + str_[size_] = '\0'; + } + else { + size_ = 0; + } + } + } + + /*! \brief Constructs a string populated from a null-terminated value. + * + * \param str the null-terminated initial value of the string instance. + * If NULL, the string is left empty, with a size of 0. + */ + string(const char * str) : + size_(0), + str_(NULL) + { + if( str ) { + size_= ::strlen(str); + } + if( size_ > 0 ) { + str_ = new char[size_ + 1]; + if (str_ != NULL) { + memcpy(str_, str, (size_ + 1) * sizeof(char)); + } + } + } + + void resize( ::size_t n ) + { + if( size_ == n ) { + return; + } + if (n == 0) { + if( str_ ) { + delete [] str_; + } + str_ = NULL; + size_ = 0; + } + else { + char *newString = new char[n + 1]; + ::size_t copySize = n; + if( size_ < n ) { + copySize = size_; + } + size_ = n; + + if(str_) { + memcpy(newString, str_, (copySize + 1) * sizeof(char)); + } + if( copySize < size_ ) { + memset(newString + copySize, 0, size_ - copySize); + } + newString[size_] = '\0'; + + delete [] str_; + str_ = newString; + } + } + + const char& operator[] ( ::size_t pos ) const + { + return str_[pos]; + } + + char& operator[] ( ::size_t pos ) + { + return str_[pos]; + } + + /*! \brief Copies the value of another string to this one. + * + * \param rhs the string to copy. + * + * \returns a reference to the modified instance. + */ + string& operator=(const string& rhs) + { + if (this == &rhs) { + return *this; + } + + if( str_ != NULL ) { + delete [] str_; + str_ = NULL; + size_ = 0; + } + + if (rhs.size_ == 0 || rhs.str_ == NULL) { + str_ = NULL; + size_ = 0; + } + else { + str_ = new char[rhs.size_ + 1]; + size_ = rhs.size_; + + if (str_ != NULL) { + memcpy(str_, rhs.str_, (size_ + 1) * sizeof(char)); + } + else { + size_ = 0; + } + } + + return *this; + } + + /*! \brief Constructs a string by copying the value of another instance. + * + * \param rhs the string to copy. + */ + string(const string& rhs) : + size_(0), + str_(NULL) + { + *this = rhs; + } + + //! \brief Destructor - frees memory used to hold the current value. + ~string() + { + delete[] str_; + str_ = NULL; + } + + //! \brief Queries the length of the string, excluding any added '\0's. + ::size_t size(void) const { return size_; } + + //! \brief Queries the length of the string, excluding any added '\0's. + ::size_t length(void) const { return size(); } + + /*! \brief Returns a pointer to the private copy held by this instance, + * or "" if empty/unset. + */ + const char * c_str(void) const { return (str_) ? str_ : "";} +} CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; +typedef cl::string STRING_CLASS; +#endif // #elif !defined(__USE_DEV_STRING) + +#if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR) +#define VECTOR_CLASS std::vector +#elif !defined(__USE_DEV_VECTOR) +#define VECTOR_CLASS cl::vector + +#if !defined(__MAX_DEFAULT_VECTOR_SIZE) +#define __MAX_DEFAULT_VECTOR_SIZE 10 +#endif + +/*! 
\class vector
+ * \brief Fixed sized vector implementation that mirrors std::vector
+ *        functionality; compatible with std::vector.
+ *
+ * \note Deprecated. Please use std::vector as default or
+ *       re-define the vector class to match the std::vector
+ *       interface by defining VECTOR_CLASS
+ *
+ * \note Not recommended for use with custom objects as
+ *       current implementation will construct N elements
+ *
+ * \note
+ * This differs from std::vector<> not just in memory allocation,
+ * but also in terms of when members are constructed, destroyed,
+ * and assigned instead of being copy constructed.
+ *
+ * \param T type of element contained in the vector.
+ *
+ * \param N maximum size of the vector.
+ */
+template <typename T, unsigned int N = __MAX_DEFAULT_VECTOR_SIZE>
+class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED vector
+{
+private:
+    T data_[N];
+    unsigned int size_;
+
+public:
+    //! \brief Constructs an empty vector with no memory allocated.
+    vector() :
+        size_(static_cast<unsigned int>(0))
+    {}
+
+    //! \brief Deallocates the vector's memory and destroys all of its elements.
+    ~vector()
+    {
+        clear();
+    }
+
+    //! \brief Returns the number of elements currently contained.
+    unsigned int size(void) const
+    {
+        return size_;
+    }
+
+    /*! \brief Empties the vector of all elements.
+     * \note
+     * This does not deallocate memory but will invoke destructors
+     * on contained elements.
+     */
+    void clear()
+    {
+        while(!empty()) {
+            pop_back();
+        }
+    }
+
+    /*! \brief Appends an element after the last valid element.
+     * Calling this on a vector that has reached capacity will throw an
+     * exception if exceptions are enabled.
+     */
+    void push_back (const T& x)
+    {
+        if (size() < N) {
+            new (&data_[size_]) T(x);
+            size_++;
+        } else {
+            detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR);
+        }
+    }
+
+    /*! \brief Removes the last valid element from the vector.
+     * Calling this on an empty vector will throw an exception
+     * if exceptions are enabled.
+     */
+    void pop_back(void)
+    {
+        if (size_ != 0) {
+            --size_;
+            data_[size_].~T();
+        } else {
+            detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR);
+        }
+    }
+
+    /*! \brief Constructs with a value copied from another.
+     *
+     * \param vec the vector to copy.
+     */
+    vector(const vector& vec) :
+        size_(vec.size_)
+    {
+        if (size_ != 0) {
+            assign(vec.begin(), vec.end());
+        }
+    }
+
+    /*! \brief Constructs with a specified number of initial elements.
+     *
+     * \param size number of initial elements.
+     *
+     * \param val value of initial elements.
+     */
+    vector(unsigned int size, const T& val = T()) :
+        size_(0)
+    {
+        for (unsigned int i = 0; i < size; i++) {
+            push_back(val);
+        }
+    }
+
+    /*! \brief Overwrites the current content with that copied from another
+     *         instance.
+     *
+     * \param rhs vector to copy.
+     *
+     * \returns a reference to this.
+     */
+    vector& operator=(const vector& rhs)
+    {
+        if (this == &rhs) {
+            return *this;
+        }
+
+        if (rhs.size_ != 0) {
+            assign(rhs.begin(), rhs.end());
+        } else {
+            clear();
+        }
+
+        return *this;
+    }
+
+    /*! \brief Tests equality against another instance.
+     *
+     * \param vec the vector against which to compare.
+     */
+    bool operator==(vector &vec)
+    {
+        if (size() != vec.size()) {
+            return false;
+        }
+
+        for( unsigned int i = 0; i < size(); ++i ) {
+            if( operator[](i) != vec[i] ) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    //! \brief Conversion operator to T*.
+    operator T* ()             { return data_; }
+
+    //! \brief Conversion operator to const T*.
+    operator const T* () const { return data_; }
+
+    //! \brief Tests whether this instance has any elements.
+    bool empty (void) const
+    {
+        return size_==0;
+    }
+
+    //! \brief Returns the maximum number of elements this instance can hold.
+    unsigned int max_size (void) const
+    {
+        return N;
+    }
+
+    //! \brief Returns the maximum number of elements this instance can hold.
+    unsigned int capacity () const
+    {
+        return N;
+    }
+
+    //! \brief Resizes the vector to the given size.
+    void resize(unsigned int newSize, T fill = T())
+    {
+        if (newSize > N)
+        {
+            detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR);
+        }
+        else
+        {
+            while (size_ < newSize)
+            {
+                new (&data_[size_]) T(fill);
+                size_++;
+            }
+            while (size_ > newSize)
+            {
+                --size_;
+                data_[size_].~T();
+            }
+        }
+    }
+
+    /*! \brief Returns a reference to a given element.
+     *
+     * \param index which element to access.
+     *
+     * \note
+     * The caller is responsible for ensuring index is >= 0 and < size().
+     */
+    T& operator[](int index)
+    {
+        return data_[index];
+    }
+
+    /*! \brief Returns a const reference to a given element.
+     *
+     * \param index which element to access.
+     *
+     * \note
+     * The caller is responsible for ensuring index is >= 0 and < size().
+     */
+    const T& operator[](int index) const
+    {
+        return data_[index];
+    }
+
+    /*! \brief Assigns elements of the vector based on a source iterator range.
+     *
+     * \param start Beginning iterator of source range
+     * \param end End iterator of source range
+     *
+     * \note
+     * Will throw an exception if exceptions are enabled and size exceeded.
+     */
+    template<typename I>
+    void assign(I start, I end)
+    {
+        clear();
+        while(start != end) {
+            push_back(*start);
+            start++;
+        }
+    }
+
+    /*! \class iterator
+     * \brief Const iterator class for vectors
+     */
+    class iterator
+    {
+    private:
+        const vector *vec_;
+        int index_;
+
+        /**
+         * Internal iterator constructor to capture reference
+         * to the vector it iterates over rather than taking
+         * the vector by copy.
+         */
+        iterator (const vector &vec, int index) :
+            vec_(&vec)
+        {
+            if( !vec.empty() ) {
+                index_ = index;
+            } else {
+                index_ = -1;
+            }
+        }
+
+    public:
+        iterator(void) :
+            vec_(NULL),
+            index_(-1)
+        {
+        }
+
+        iterator(const iterator& rhs) :
+            vec_(rhs.vec_),
+            index_(rhs.index_)
+        {
+        }
+
+        ~iterator(void) {}
+
+        static iterator begin(const cl::vector<T, N> &vec)
+        {
+            iterator i(vec, 0);
+
+            return i;
+        }
+
+        static iterator end(const cl::vector<T, N> &vec)
+        {
+            iterator i(vec, vec.size());
+
+            return i;
+        }
+
+        bool operator==(iterator i)
+        {
+            return ((vec_ == i.vec_) &&
+                    (index_ == i.index_));
+        }
+
+        bool operator!=(iterator i)
+        {
+            return (!(*this==i));
+        }
+
+        iterator& operator++()
+        {
+            ++index_;
+            return *this;
+        }
+
+        iterator operator++(int)
+        {
+            iterator retVal(*this);
+            ++index_;
+            return retVal;
+        }
+
+        iterator& operator--()
+        {
+            --index_;
+            return *this;
+        }
+
+        iterator operator--(int)
+        {
+            iterator retVal(*this);
+            --index_;
+            return retVal;
+        }
+
+        const T& operator *() const
+        {
+            return (*vec_)[index_];
+        }
+    };
+
+    iterator begin(void)
+    {
+        return iterator::begin(*this);
+    }
+
+    iterator begin(void) const
+    {
+        return iterator::begin(*this);
+    }
+
+    iterator end(void)
+    {
+        return iterator::end(*this);
+    }
+
+    iterator end(void) const
+    {
+        return iterator::end(*this);
+    }
+
+    T& front(void)
+    {
+        return data_[0];
+    }
+
+    T& back(void)
+    {
+        // The last valid element lives at size_-1; indexing with size_
+        // would read one past the end.
+        return data_[size_-1];
+    }
+
+    const T& front(void) const
+    {
+        return data_[0];
+    }
+
+    const T& back(void) const
+    {
+        return data_[size_-1];
+    }
+} CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+#endif // #if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR)
+
+
+
+
+
+namespace detail {
+#define __DEFAULT_NOT_INITIALIZED 1
+#define __DEFAULT_BEING_INITIALIZED 2
+#define __DEFAULT_INITIALIZED 4
+
+    /*
+     * Compare and exchange primitives are needed for handling of defaults
+     */
+
+#ifdef CL_HPP_CPP11_ATOMICS_SUPPORTED
+    inline int compare_exchange(std::atomic<int> * dest, int exchange, int comparand)
+#else // !CL_HPP_CPP11_ATOMICS_SUPPORTED
+    inline int compare_exchange(volatile int * dest, int exchange, int comparand)
+#endif // !CL_HPP_CPP11_ATOMICS_SUPPORTED
+    {
+#ifdef CL_HPP_CPP11_ATOMICS_SUPPORTED
+        std::atomic_compare_exchange_strong(dest, &comparand, exchange);
+        return comparand;
+#elif _MSC_VER
+        return (int)(_InterlockedCompareExchange(
+            (volatile long*)dest,
+            (long)exchange,
+            (long)comparand));
+#else // !_MSC_VER && !CL_HPP_CPP11_ATOMICS_SUPPORTED
+        return (__sync_val_compare_and_swap(
+            dest,
+            comparand,
+            exchange));
+#endif // !CL_HPP_CPP11_ATOMICS_SUPPORTED
+    }
+
+    inline void fence() {
+#ifdef CL_HPP_CPP11_ATOMICS_SUPPORTED
+        std::atomic_thread_fence(std::memory_order_seq_cst);
+#elif _MSC_VER // !CL_HPP_CPP11_ATOMICS_SUPPORTED
+        _ReadWriteBarrier();
+#else // !_MSC_VER && !CL_HPP_CPP11_ATOMICS_SUPPORTED
+        __sync_synchronize();
+#endif // !CL_HPP_CPP11_ATOMICS_SUPPORTED
+    }
+} // namespace detail
+
+
+/*! \brief class used to interface between C++ and
+ *  OpenCL C calls that require arrays of size_t values, whose
+ *  size is known statically.
+ */
+template <int N>
+class size_t
+{
+private:
+    ::size_t data_[N];
+
+public:
+    //! \brief Initialize size_t to all 0s
+    size_t()
+    {
+        for( int i = 0; i < N; ++i ) {
+            data_[i] = 0;
+        }
+    }
+
+    ::size_t& operator[](int index)
+    {
+        return data_[index];
+    }
+
+    const ::size_t& operator[](int index) const
+    {
+        return data_[index];
+    }
+
+    //! \brief Conversion operator to ::size_t*.
+    operator ::size_t* ()             { return data_; }
+
+    //! \brief Conversion operator to const ::size_t*.
+    operator const ::size_t* () const { return data_; }
+};
+
+namespace detail {
+
+// Generic getInfoHelper. The final parameter is used to guide overload
+// resolution: the actual parameter passed is an int, which makes this
+// a worse conversion sequence than a specialization that declares the
+// parameter as an int.
+template<typename Functor, typename T>
+inline cl_int getInfoHelper(Functor f, cl_uint name, T* param, long)
+{
+    return f(name, sizeof(T), param, NULL);
+}
+
+// Specialized getInfoHelper for VECTOR_CLASS params
+template <typename Func, typename T>
+inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, long)
+{
+    ::size_t required;
+    cl_int err = f(name, 0, NULL, &required);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    T* value = (T*) alloca(required);
+    err = f(name, required, value, NULL);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    param->assign(&value[0], &value[required/sizeof(T)]);
+    return CL_SUCCESS;
+}
+
+/* Specialization for reference-counted types. This depends on the
+ * existence of Wrapper<T>::cl_type, and none of the other types having the
+ * cl_type member. Note that simply specifying the parameter as Wrapper<T>
+ * does not work, because when using a derived type (e.g. Context) the generic
+ * template will provide a better match.
+ */
+template <typename Func, typename T>
+inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, int, typename T::cl_type = 0)
+{
+    ::size_t required;
+    cl_int err = f(name, 0, NULL, &required);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    typename T::cl_type * value = (typename T::cl_type *) alloca(required);
+    err = f(name, required, value, NULL);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    ::size_t elements = required / sizeof(typename T::cl_type);
+    param->assign(&value[0], &value[elements]);
+    for (::size_t i = 0; i < elements; i++)
+    {
+        if (value[i] != NULL)
+        {
+            err = (*param)[i].retain();
+            if (err != CL_SUCCESS) {
+                return err;
+            }
+        }
+    }
+    return CL_SUCCESS;
+}
+
+// Specialized for getInfo<CL_PROGRAM_BINARIES>
+template <typename Func>
+inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<char *>* param, int)
+{
+    cl_int err = f(name, param->size() * sizeof(char *), &(*param)[0], NULL);
+
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    return CL_SUCCESS;
+}
+
+// Specialized GetInfoHelper for STRING_CLASS params
+template <typename Func>
+inline cl_int getInfoHelper(Func f, cl_uint name, STRING_CLASS* param, long)
+{
+#if defined(__NO_STD_VECTOR) || defined(__NO_STD_STRING)
+    ::size_t required;
+    cl_int err = f(name, 0, NULL, &required);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    char* value = (char*)alloca(required);
+    err = f(name, required, value, NULL);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    *param = value;
+    return CL_SUCCESS;
+#else
+    ::size_t required;
+    cl_int err = f(name, 0, NULL, &required);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    // std::string has a constant data member
+    // a char vector does not
+    VECTOR_CLASS<char> value(required);
+    err = f(name, required, value.data(), NULL);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+    if (param) {
+        param->assign(value.begin(), value.end());
+    }
+#endif
+    return CL_SUCCESS;
+}
+
+// Specialized GetInfoHelper for cl::size_t params
+template <typename Func, ::size_t N>
+inline cl_int getInfoHelper(Func f, cl_uint name, size_t<N>* param, long)
+{
+    ::size_t required;
+    cl_int err = f(name, 0, NULL, &required);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    ::size_t* value = (::size_t*) alloca(required);
+    err = f(name, required, value, NULL);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    // Copy the queried values into the caller's fixed-size wrapper.
+    for(int i = 0; i < N; ++i) {
+        (*param)[i] = value[i];
+    }
+
+    
return CL_SUCCESS;
+}
+
+template <typename T> struct ReferenceHandler;
+
+/* Specialization for reference-counted types. This depends on the
+ * existence of Wrapper<T>::cl_type, and none of the other types having the
+ * cl_type member. Note that simply specifying the parameter as Wrapper<T>
+ * does not work, because when using a derived type (e.g. Context) the generic
+ * template will provide a better match.
+ */
+template<typename Func, typename T>
+inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_type = 0)
+{
+    typename T::cl_type value;
+    cl_int err = f(name, sizeof(value), &value, NULL);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+    *param = value;
+    if (value != NULL)
+    {
+        err = param->retain();
+        if (err != CL_SUCCESS) {
+            return err;
+        }
+    }
+    return CL_SUCCESS;
+}
+
+#define __PARAM_NAME_INFO_1_0(F) \
+    F(cl_platform_info, CL_PLATFORM_PROFILE, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_VERSION, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_NAME, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_VENDOR, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_EXTENSIONS, STRING_CLASS) \
+    \
+    F(cl_device_info, CL_DEVICE_TYPE, cl_device_type) \
+    F(cl_device_info, CL_DEVICE_VENDOR_ID, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_WORK_GROUP_SIZE, ::size_t) \
+    F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_SIZES, VECTOR_CLASS< ::size_t>) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint) \
+    F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_WIDTH, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_HEIGHT, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_WIDTH, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_HEIGHT, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_DEPTH, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE_SUPPORT, cl_bool) \
+    F(cl_device_info, CL_DEVICE_MAX_PARAMETER_SIZE, ::size_t) \
+    F(cl_device_info, CL_DEVICE_MAX_SAMPLERS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, cl_uint) \
+    F(cl_device_info, CL_DEVICE_SINGLE_FP_CONFIG, cl_device_fp_config) \
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, cl_device_mem_cache_type) \
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint) \
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_MAX_CONSTANT_ARGS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_LOCAL_MEM_TYPE, cl_device_local_mem_type) \
+    F(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_ERROR_CORRECTION_SUPPORT, cl_bool) \
+    F(cl_device_info, CL_DEVICE_PROFILING_TIMER_RESOLUTION, 
::size_t) \ + F(cl_device_info, CL_DEVICE_ENDIAN_LITTLE, cl_bool) \ + F(cl_device_info, CL_DEVICE_AVAILABLE, cl_bool) \ + F(cl_device_info, CL_DEVICE_COMPILER_AVAILABLE, cl_bool) \ + F(cl_device_info, CL_DEVICE_EXECUTION_CAPABILITIES, cl_device_exec_capabilities) \ + F(cl_device_info, CL_DEVICE_QUEUE_PROPERTIES, cl_command_queue_properties) \ + F(cl_device_info, CL_DEVICE_PLATFORM, cl_platform_id) \ + F(cl_device_info, CL_DEVICE_NAME, STRING_CLASS) \ + F(cl_device_info, CL_DEVICE_VENDOR, STRING_CLASS) \ + F(cl_device_info, CL_DRIVER_VERSION, STRING_CLASS) \ + F(cl_device_info, CL_DEVICE_PROFILE, STRING_CLASS) \ + F(cl_device_info, CL_DEVICE_VERSION, STRING_CLASS) \ + F(cl_device_info, CL_DEVICE_EXTENSIONS, STRING_CLASS) \ + \ + F(cl_context_info, CL_CONTEXT_REFERENCE_COUNT, cl_uint) \ + F(cl_context_info, CL_CONTEXT_DEVICES, VECTOR_CLASS) \ + F(cl_context_info, CL_CONTEXT_PROPERTIES, VECTOR_CLASS) \ + \ + F(cl_event_info, CL_EVENT_COMMAND_QUEUE, cl::CommandQueue) \ + F(cl_event_info, CL_EVENT_COMMAND_TYPE, cl_command_type) \ + F(cl_event_info, CL_EVENT_REFERENCE_COUNT, cl_uint) \ + F(cl_event_info, CL_EVENT_COMMAND_EXECUTION_STATUS, cl_int) \ + \ + F(cl_profiling_info, CL_PROFILING_COMMAND_QUEUED, cl_ulong) \ + F(cl_profiling_info, CL_PROFILING_COMMAND_SUBMIT, cl_ulong) \ + F(cl_profiling_info, CL_PROFILING_COMMAND_START, cl_ulong) \ + F(cl_profiling_info, CL_PROFILING_COMMAND_END, cl_ulong) \ + \ + F(cl_mem_info, CL_MEM_TYPE, cl_mem_object_type) \ + F(cl_mem_info, CL_MEM_FLAGS, cl_mem_flags) \ + F(cl_mem_info, CL_MEM_SIZE, ::size_t) \ + F(cl_mem_info, CL_MEM_HOST_PTR, void*) \ + F(cl_mem_info, CL_MEM_MAP_COUNT, cl_uint) \ + F(cl_mem_info, CL_MEM_REFERENCE_COUNT, cl_uint) \ + F(cl_mem_info, CL_MEM_CONTEXT, cl::Context) \ + \ + F(cl_image_info, CL_IMAGE_FORMAT, cl_image_format) \ + F(cl_image_info, CL_IMAGE_ELEMENT_SIZE, ::size_t) \ + F(cl_image_info, CL_IMAGE_ROW_PITCH, ::size_t) \ + F(cl_image_info, CL_IMAGE_SLICE_PITCH, ::size_t) \ + F(cl_image_info, CL_IMAGE_WIDTH, ::size_t) \ + F(cl_image_info, CL_IMAGE_HEIGHT, ::size_t) \ + F(cl_image_info, CL_IMAGE_DEPTH, ::size_t) \ + \ + F(cl_sampler_info, CL_SAMPLER_REFERENCE_COUNT, cl_uint) \ + F(cl_sampler_info, CL_SAMPLER_CONTEXT, cl::Context) \ + F(cl_sampler_info, CL_SAMPLER_NORMALIZED_COORDS, cl_bool) \ + F(cl_sampler_info, CL_SAMPLER_ADDRESSING_MODE, cl_addressing_mode) \ + F(cl_sampler_info, CL_SAMPLER_FILTER_MODE, cl_filter_mode) \ + \ + F(cl_program_info, CL_PROGRAM_REFERENCE_COUNT, cl_uint) \ + F(cl_program_info, CL_PROGRAM_CONTEXT, cl::Context) \ + F(cl_program_info, CL_PROGRAM_NUM_DEVICES, cl_uint) \ + F(cl_program_info, CL_PROGRAM_DEVICES, VECTOR_CLASS) \ + F(cl_program_info, CL_PROGRAM_SOURCE, STRING_CLASS) \ + F(cl_program_info, CL_PROGRAM_BINARY_SIZES, VECTOR_CLASS< ::size_t>) \ + F(cl_program_info, CL_PROGRAM_BINARIES, VECTOR_CLASS) \ + \ + F(cl_program_build_info, CL_PROGRAM_BUILD_STATUS, cl_build_status) \ + F(cl_program_build_info, CL_PROGRAM_BUILD_OPTIONS, STRING_CLASS) \ + F(cl_program_build_info, CL_PROGRAM_BUILD_LOG, STRING_CLASS) \ + \ + F(cl_kernel_info, CL_KERNEL_FUNCTION_NAME, STRING_CLASS) \ + F(cl_kernel_info, CL_KERNEL_NUM_ARGS, cl_uint) \ + F(cl_kernel_info, CL_KERNEL_REFERENCE_COUNT, cl_uint) \ + F(cl_kernel_info, CL_KERNEL_CONTEXT, cl::Context) \ + F(cl_kernel_info, CL_KERNEL_PROGRAM, cl::Program) \ + \ + F(cl_kernel_work_group_info, CL_KERNEL_WORK_GROUP_SIZE, ::size_t) \ + F(cl_kernel_work_group_info, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, cl::size_t<3>) \ + F(cl_kernel_work_group_info, CL_KERNEL_LOCAL_MEM_SIZE, 
cl_ulong) \ + \ + F(cl_command_queue_info, CL_QUEUE_CONTEXT, cl::Context) \ + F(cl_command_queue_info, CL_QUEUE_DEVICE, cl::Device) \ + F(cl_command_queue_info, CL_QUEUE_REFERENCE_COUNT, cl_uint) \ + F(cl_command_queue_info, CL_QUEUE_PROPERTIES, cl_command_queue_properties) + +#if defined(CL_VERSION_1_1) +#define __PARAM_NAME_INFO_1_1(F) \ + F(cl_context_info, CL_CONTEXT_NUM_DEVICES, cl_uint)\ + F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, cl_uint) \ + F(cl_device_info, CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config) \ + F(cl_device_info, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config) \ + F(cl_device_info, CL_DEVICE_HOST_UNIFIED_MEMORY, cl_bool) \ + F(cl_device_info, CL_DEVICE_OPENCL_C_VERSION, STRING_CLASS) \ + \ + F(cl_mem_info, CL_MEM_ASSOCIATED_MEMOBJECT, cl::Memory) \ + F(cl_mem_info, CL_MEM_OFFSET, ::size_t) \ + \ + F(cl_kernel_work_group_info, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, ::size_t) \ + F(cl_kernel_work_group_info, CL_KERNEL_PRIVATE_MEM_SIZE, cl_ulong) \ + \ + F(cl_event_info, CL_EVENT_CONTEXT, cl::Context) +#endif // CL_VERSION_1_1 + + +#if defined(CL_VERSION_1_2) +#define __PARAM_NAME_INFO_1_2(F) \ + F(cl_image_info, CL_IMAGE_BUFFER, cl::Buffer) \ + \ + F(cl_program_info, CL_PROGRAM_NUM_KERNELS, ::size_t) \ + F(cl_program_info, CL_PROGRAM_KERNEL_NAMES, STRING_CLASS) \ + \ + F(cl_program_build_info, CL_PROGRAM_BINARY_TYPE, cl_program_binary_type) \ + \ + F(cl_kernel_info, CL_KERNEL_ATTRIBUTES, STRING_CLASS) \ + \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_ADDRESS_QUALIFIER, cl_kernel_arg_address_qualifier) \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_ACCESS_QUALIFIER, cl_kernel_arg_access_qualifier) \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_NAME, STRING_CLASS) \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_NAME, STRING_CLASS) \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_QUALIFIER, cl_kernel_arg_type_qualifier) \ + \ + F(cl_device_info, CL_DEVICE_PARENT_DEVICE, cl_device_id) \ + F(cl_device_info, CL_DEVICE_PARTITION_PROPERTIES, VECTOR_CLASS) \ + F(cl_device_info, CL_DEVICE_PARTITION_TYPE, VECTOR_CLASS) \ + F(cl_device_info, CL_DEVICE_REFERENCE_COUNT, cl_uint) \ + F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, ::size_t) \ + F(cl_device_info, CL_DEVICE_PARTITION_AFFINITY_DOMAIN, cl_device_affinity_domain) \ + F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS, STRING_CLASS) +#endif // #if defined(CL_VERSION_1_2) + +#if defined(USE_CL_DEVICE_FISSION) +#define __PARAM_NAME_DEVICE_FISSION(F) \ + F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl_device_id) \ + F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, VECTOR_CLASS) \ + F(cl_device_info, CL_DEVICE_AFFINITY_DOMAINS_EXT, VECTOR_CLASS) \ + F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \ + F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, VECTOR_CLASS) +#endif // USE_CL_DEVICE_FISSION + +template +struct param_traits {}; + +#define __CL_DECLARE_PARAM_TRAITS(token, param_name, T) \ +struct token; \ +template<> \ +struct param_traits \ +{ \ + enum { value = param_name }; \ + typedef T param_type; \ +}; + 
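+// The invocations below stamp out one param_traits specialization per
+// info token, which is how the typed getInfo<...>() wrappers map an
+// enum value to its result type at compile time. As a rough sketch
+// (illustrative, not part of the original header), the expansion of
+// __CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_NAME, STRING_CLASS)
+// looks like:
+//
+//     struct cl_device_info;
+//     template <>
+//     struct param_traits<detail::cl_device_info, CL_DEVICE_NAME>
+//     {
+//         enum { value = CL_DEVICE_NAME };
+//         typedef STRING_CLASS param_type;
+//     };
+//
+// so device.getInfo<CL_DEVICE_NAME>() can return a STRING_CLASS.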
+__PARAM_NAME_INFO_1_0(__CL_DECLARE_PARAM_TRAITS) +#if defined(CL_VERSION_1_1) +__PARAM_NAME_INFO_1_1(__CL_DECLARE_PARAM_TRAITS) +#endif // CL_VERSION_1_1 +#if defined(CL_VERSION_1_2) +__PARAM_NAME_INFO_1_2(__CL_DECLARE_PARAM_TRAITS) +#endif // CL_VERSION_1_1 + +#if defined(USE_CL_DEVICE_FISSION) +__PARAM_NAME_DEVICE_FISSION(__CL_DECLARE_PARAM_TRAITS); +#endif // USE_CL_DEVICE_FISSION + +#ifdef CL_PLATFORM_ICD_SUFFIX_KHR +__CL_DECLARE_PARAM_TRAITS(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, STRING_CLASS) +#endif + +#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong) +#endif + +#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, VECTOR_CLASS< ::size_t>) +#endif +#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_SIMD_WIDTH_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_WIDTH_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WAVEFRONT_WIDTH_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint) +#endif + +#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint) +#endif +#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, cl_uint) +#endif +#ifdef CL_DEVICE_REGISTERS_PER_BLOCK_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_REGISTERS_PER_BLOCK_NV, cl_uint) +#endif +#ifdef CL_DEVICE_WARP_SIZE_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WARP_SIZE_NV, cl_uint) +#endif +#ifdef CL_DEVICE_GPU_OVERLAP_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GPU_OVERLAP_NV, cl_bool) +#endif +#ifdef CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, cl_bool) +#endif +#ifdef CL_DEVICE_INTEGRATED_MEMORY_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_INTEGRATED_MEMORY_NV, cl_bool) +#endif + +// Convenience functions + +template +inline cl_int +getInfo(Func f, cl_uint name, T* param) +{ + return getInfoHelper(f, name, param, 0); +} + +template +struct GetInfoFunctor0 +{ + Func f_; const Arg0& arg0_; + cl_int operator ()( + cl_uint param, ::size_t size, void* value, ::size_t* size_ret) + { return f_(arg0_, param, size, value, size_ret); } +}; + +template +struct GetInfoFunctor1 +{ + Func f_; const Arg0& arg0_; const Arg1& arg1_; + cl_int operator ()( + cl_uint param, ::size_t size, void* value, ::size_t* size_ret) + { return 
f_(arg0_, arg1_, param, size, value, size_ret); } +}; + +template +inline cl_int +getInfo(Func f, const Arg0& arg0, cl_uint name, T* param) +{ + GetInfoFunctor0 f0 = { f, arg0 }; + return getInfoHelper(f0, name, param, 0); +} + +template +inline cl_int +getInfo(Func f, const Arg0& arg0, const Arg1& arg1, cl_uint name, T* param) +{ + GetInfoFunctor1 f0 = { f, arg0, arg1 }; + return getInfoHelper(f0, name, param, 0); +} + +template +struct ReferenceHandler +{ }; + +#if defined(CL_VERSION_1_2) +/** + * OpenCL 1.2 devices do have retain/release. + */ +template <> +struct ReferenceHandler +{ + /** + * Retain the device. + * \param device A valid device created using createSubDevices + * \return + * CL_SUCCESS if the function executed successfully. + * CL_INVALID_DEVICE if device was not a valid subdevice + * CL_OUT_OF_RESOURCES + * CL_OUT_OF_HOST_MEMORY + */ + static cl_int retain(cl_device_id device) + { return ::clRetainDevice(device); } + /** + * Retain the device. + * \param device A valid device created using createSubDevices + * \return + * CL_SUCCESS if the function executed successfully. + * CL_INVALID_DEVICE if device was not a valid subdevice + * CL_OUT_OF_RESOURCES + * CL_OUT_OF_HOST_MEMORY + */ + static cl_int release(cl_device_id device) + { return ::clReleaseDevice(device); } +}; +#else // #if defined(CL_VERSION_1_2) +/** + * OpenCL 1.1 devices do not have retain/release. + */ +template <> +struct ReferenceHandler +{ + // cl_device_id does not have retain(). + static cl_int retain(cl_device_id) + { return CL_SUCCESS; } + // cl_device_id does not have release(). + static cl_int release(cl_device_id) + { return CL_SUCCESS; } +}; +#endif // #if defined(CL_VERSION_1_2) + +template <> +struct ReferenceHandler +{ + // cl_platform_id does not have retain(). + static cl_int retain(cl_platform_id) + { return CL_SUCCESS; } + // cl_platform_id does not have release(). 
+ static cl_int release(cl_platform_id) + { return CL_SUCCESS; } +}; + +template <> +struct ReferenceHandler +{ + static cl_int retain(cl_context context) + { return ::clRetainContext(context); } + static cl_int release(cl_context context) + { return ::clReleaseContext(context); } +}; + +template <> +struct ReferenceHandler +{ + static cl_int retain(cl_command_queue queue) + { return ::clRetainCommandQueue(queue); } + static cl_int release(cl_command_queue queue) + { return ::clReleaseCommandQueue(queue); } +}; + +template <> +struct ReferenceHandler +{ + static cl_int retain(cl_mem memory) + { return ::clRetainMemObject(memory); } + static cl_int release(cl_mem memory) + { return ::clReleaseMemObject(memory); } +}; + +template <> +struct ReferenceHandler +{ + static cl_int retain(cl_sampler sampler) + { return ::clRetainSampler(sampler); } + static cl_int release(cl_sampler sampler) + { return ::clReleaseSampler(sampler); } +}; + +template <> +struct ReferenceHandler +{ + static cl_int retain(cl_program program) + { return ::clRetainProgram(program); } + static cl_int release(cl_program program) + { return ::clReleaseProgram(program); } +}; + +template <> +struct ReferenceHandler +{ + static cl_int retain(cl_kernel kernel) + { return ::clRetainKernel(kernel); } + static cl_int release(cl_kernel kernel) + { return ::clReleaseKernel(kernel); } +}; + +template <> +struct ReferenceHandler +{ + static cl_int retain(cl_event event) + { return ::clRetainEvent(event); } + static cl_int release(cl_event event) + { return ::clReleaseEvent(event); } +}; + + +// Extracts version number with major in the upper 16 bits, minor in the lower 16 +static cl_uint getVersion(const char *versionInfo) +{ + int highVersion = 0; + int lowVersion = 0; + int index = 7; + while(versionInfo[index] != '.' 
) { + highVersion *= 10; + highVersion += versionInfo[index]-'0'; + ++index; + } + ++index; + while(versionInfo[index] != ' ' && versionInfo[index] != '\0') { + lowVersion *= 10; + lowVersion += versionInfo[index]-'0'; + ++index; + } + return (highVersion << 16) | lowVersion; +} + +static cl_uint getPlatformVersion(cl_platform_id platform) +{ + ::size_t size = 0; + clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, NULL, &size); + char *versionInfo = (char *) alloca(size); + clGetPlatformInfo(platform, CL_PLATFORM_VERSION, size, &versionInfo[0], &size); + return getVersion(versionInfo); +} + +static cl_uint getDevicePlatformVersion(cl_device_id device) +{ + cl_platform_id platform; + clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, NULL); + return getPlatformVersion(platform); +} + +#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) +static cl_uint getContextPlatformVersion(cl_context context) +{ + // The platform cannot be queried directly, so we first have to grab a + // device and obtain its context + ::size_t size = 0; + clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size); + if (size == 0) + return 0; + cl_device_id *devices = (cl_device_id *) alloca(size); + clGetContextInfo(context, CL_CONTEXT_DEVICES, size, devices, NULL); + return getDevicePlatformVersion(devices[0]); +} +#endif // #if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + +template +class Wrapper +{ +public: + typedef T cl_type; + +protected: + cl_type object_; + +public: + Wrapper() : object_(NULL) { } + + Wrapper(const cl_type &obj) : object_(obj) { } + + ~Wrapper() + { + if (object_ != NULL) { release(); } + } + + Wrapper(const Wrapper& rhs) + { + object_ = rhs.object_; + if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } + } + +#if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + Wrapper(Wrapper&& rhs) CL_HPP_NOEXCEPT + { + object_ = rhs.object_; + rhs.object_ = NULL; + } +#endif + + Wrapper& operator = (const Wrapper& rhs) + { + if (this != &rhs) { + if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } + object_ = rhs.object_; + if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } + } + return *this; + } + +#if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + Wrapper& operator = (Wrapper&& rhs) + { + if (this != &rhs) { + if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } + object_ = rhs.object_; + rhs.object_ = NULL; + } + return *this; + } +#endif + + Wrapper& operator = (const cl_type &rhs) + { + if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } + object_ = rhs; + return *this; + } + + cl_type operator ()() const { return object_; } + + cl_type& operator ()() { return object_; } + +protected: + template + friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type); + + cl_int retain() const + { + return ReferenceHandler::retain(object_); + } + + cl_int release() const + { + return ReferenceHandler::release(object_); + } +}; + +template <> +class Wrapper +{ +public: + typedef cl_device_id cl_type; + +protected: + cl_type object_; + bool referenceCountable_; + + static bool isReferenceCountable(cl_device_id device) + { + bool retVal = false; + if (device != NULL) { + int version = getDevicePlatformVersion(device); + if(version > ((1 << 16) + 1)) { + retVal = true; + } + } + return retVal; + } + +public: + Wrapper() : object_(NULL), referenceCountable_(false) + { + } + + Wrapper(const cl_type &obj) : object_(obj), 
referenceCountable_(false) + { + referenceCountable_ = isReferenceCountable(obj); + } + + ~Wrapper() + { + if (object_ != NULL) { release(); } + } + + Wrapper(const Wrapper& rhs) + { + object_ = rhs.object_; + referenceCountable_ = isReferenceCountable(object_); + if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } + } + +#if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + Wrapper(Wrapper&& rhs) CL_HPP_NOEXCEPT + { + object_ = rhs.object_; + referenceCountable_ = rhs.referenceCountable_; + rhs.object_ = NULL; + rhs.referenceCountable_ = false; + } +#endif + + Wrapper& operator = (const Wrapper& rhs) + { + if (this != &rhs) { + if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } + object_ = rhs.object_; + referenceCountable_ = rhs.referenceCountable_; + if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } + } + return *this; + } + +#if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + Wrapper& operator = (Wrapper&& rhs) + { + if (this != &rhs) { + if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } + object_ = rhs.object_; + referenceCountable_ = rhs.referenceCountable_; + rhs.object_ = NULL; + rhs.referenceCountable_ = false; + } + return *this; + } +#endif + + Wrapper& operator = (const cl_type &rhs) + { + if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } + object_ = rhs; + referenceCountable_ = isReferenceCountable(object_); + return *this; + } + + cl_type operator ()() const { return object_; } + + cl_type& operator ()() { return object_; } + +protected: + template + friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type); + + template + friend inline cl_int getInfoHelper(Func, cl_uint, VECTOR_CLASS*, int, typename U::cl_type); + + cl_int retain() const + { + if( referenceCountable_ ) { + return ReferenceHandler::retain(object_); + } + else { + return CL_SUCCESS; + } + } + + cl_int release() const + { + if( referenceCountable_ ) { + return ReferenceHandler::release(object_); + } + else { + return CL_SUCCESS; + } + } +}; + +} // namespace detail +//! \endcond + +/*! \stuct ImageFormat + * \brief Adds constructors and member functions for cl_image_format. + * + * \see cl_image_format + */ +struct ImageFormat : public cl_image_format +{ + //! \brief Default constructor - performs no initialization. + ImageFormat(){} + + //! \brief Initializing constructor. + ImageFormat(cl_channel_order order, cl_channel_type type) + { + image_channel_order = order; + image_channel_data_type = type; + } + + //! \brief Assignment operator. + ImageFormat& operator = (const ImageFormat& rhs) + { + if (this != &rhs) { + this->image_channel_data_type = rhs.image_channel_data_type; + this->image_channel_order = rhs.image_channel_order; + } + return *this; + } +}; + +/*! \brief Class interface for cl_device_id. + * + * \note Copies of these objects are inexpensive, since they don't 'own' + * any underlying resources or data structures. + * + * \see cl_device_id + */ +class Device : public detail::Wrapper +{ +public: + //! \brief Default constructor - initializes to NULL. + Device() : detail::Wrapper() { } + + /*! \brief Constructor from cl_device_id. + * + * This simply copies the device ID value, which is an inexpensive operation. + */ + __CL_EXPLICIT_CONSTRUCTORS Device(const cl_device_id &device) : detail::Wrapper(device) { } + + /*! \brief Returns the first device on the default context. + * + * \see Context::getDefault() + */ + static Device getDefault(cl_int * err = NULL); + + /*! 
\brief Assignment operator from cl_device_id. + * + * This simply copies the device ID value, which is an inexpensive operation. + */ + Device& operator = (const cl_device_id& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Device(const Device& dev) : detail::Wrapper(dev) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Device& operator = (const Device &dev) + { + detail::Wrapper::operator=(dev); + return *this; + } + +#if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Device(Device&& dev) CL_HPP_NOEXCEPT : detail::Wrapper(std::move(dev)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Device& operator = (Device &&dev) + { + detail::Wrapper::operator=(std::move(dev)); + return *this; + } +#endif // #if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + + //! \brief Wrapper for clGetDeviceInfo(). + template + cl_int getInfo(cl_device_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetDeviceInfo, object_, name, param), + __GET_DEVICE_INFO_ERR); + } + + //! \brief Wrapper for clGetDeviceInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_device_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + /** + * CL 1.2 version + */ +#if defined(CL_VERSION_1_2) + //! \brief Wrapper for clCreateSubDevicesEXT(). + cl_int createSubDevices( + const cl_device_partition_property * properties, + VECTOR_CLASS* devices) + { + cl_uint n = 0; + cl_int err = clCreateSubDevices(object_, properties, 0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_SUB_DEVICES); + } + + cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id)); + err = clCreateSubDevices(object_, properties, n, ids, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_SUB_DEVICES); + } + + devices->assign(&ids[0], &ids[n]); + return CL_SUCCESS; + } +#endif // #if defined(CL_VERSION_1_2) + +/** + * CL 1.1 version that uses device fission. 
+ */ +#if defined(CL_VERSION_1_1) +#if defined(USE_CL_DEVICE_FISSION) + cl_int createSubDevices( + const cl_device_partition_property_ext * properties, + VECTOR_CLASS* devices) + { + typedef CL_API_ENTRY cl_int + ( CL_API_CALL * PFN_clCreateSubDevicesEXT)( + cl_device_id /*in_device*/, + const cl_device_partition_property_ext * /* properties */, + cl_uint /*num_entries*/, + cl_device_id * /*out_devices*/, + cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL; + __INIT_CL_EXT_FCN_PTR(clCreateSubDevicesEXT); + + cl_uint n = 0; + cl_int err = pfn_clCreateSubDevicesEXT(object_, properties, 0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_SUB_DEVICES); + } + + cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id)); + err = pfn_clCreateSubDevicesEXT(object_, properties, n, ids, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_SUB_DEVICES); + } + + devices->assign(&ids[0], &ids[n]); + return CL_SUCCESS; + } +#endif // #if defined(USE_CL_DEVICE_FISSION) +#endif // #if defined(CL_VERSION_1_1) +}; + +/*! \brief Class interface for cl_platform_id. + * + * \note Copies of these objects are inexpensive, since they don't 'own' + * any underlying resources or data structures. + * + * \see cl_platform_id + */ +class Platform : public detail::Wrapper +{ +public: + //! \brief Default constructor - initializes to NULL. + Platform() : detail::Wrapper() { } + + /*! \brief Constructor from cl_platform_id. + * + * This simply copies the platform ID value, which is an inexpensive operation. + */ + __CL_EXPLICIT_CONSTRUCTORS Platform(const cl_platform_id &platform) : detail::Wrapper(platform) { } + + /*! \brief Assignment operator from cl_platform_id. + * + * This simply copies the platform ID value, which is an inexpensive operation. + */ + Platform& operator = (const cl_platform_id& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetPlatformInfo(). + cl_int getInfo(cl_platform_info name, STRING_CLASS* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetPlatformInfo, object_, name, param), + __GET_PLATFORM_INFO_ERR); + } + + //! \brief Wrapper for clGetPlatformInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_platform_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + /*! \brief Gets a list of devices for this platform. + * + * Wraps clGetDeviceIDs(). + */ + cl_int getDevices( + cl_device_type type, + VECTOR_CLASS* devices) const + { + cl_uint n = 0; + if( devices == NULL ) { + return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR); + } + cl_int err = ::clGetDeviceIDs(object_, type, 0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id)); + err = ::clGetDeviceIDs(object_, type, n, ids, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + devices->assign(&ids[0], &ids[n]); + return CL_SUCCESS; + } + +#if defined(USE_DX_INTEROP) + /*! \brief Get the list of available D3D10 devices. + * + * \param d3d_device_source. + * + * \param d3d_object. + * + * \param d3d_device_set. 
+ * + * \param devices returns a vector of OpenCL D3D10 devices found. The cl::Device + * values returned in devices can be used to identify a specific OpenCL + * device. If \a devices argument is NULL, this argument is ignored. + * + * \return One of the following values: + * - CL_SUCCESS if the function is executed successfully. + * + * The application can query specific capabilities of the OpenCL device(s) + * returned by cl::getDevices. This can be used by the application to + * determine which device(s) to use. + * + * \note In the case that exceptions are enabled and a return value + * other than CL_SUCCESS is generated, then cl::Error exception is + * generated. + */ + cl_int getDevices( + cl_d3d10_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d10_device_set_khr d3d_device_set, + VECTOR_CLASS* devices) const + { + typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clGetDeviceIDsFromD3D10KHR)( + cl_platform_id platform, + cl_d3d10_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d10_device_set_khr d3d_device_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint* num_devices); + + if( devices == NULL ) { + return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR); + } + + static PFN_clGetDeviceIDsFromD3D10KHR pfn_clGetDeviceIDsFromD3D10KHR = NULL; + __INIT_CL_EXT_FCN_PTR_PLATFORM(object_, clGetDeviceIDsFromD3D10KHR); + + cl_uint n = 0; + cl_int err = pfn_clGetDeviceIDsFromD3D10KHR( + object_, + d3d_device_source, + d3d_object, + d3d_device_set, + 0, + NULL, + &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id)); + err = pfn_clGetDeviceIDsFromD3D10KHR( + object_, + d3d_device_source, + d3d_object, + d3d_device_set, + n, + ids, + NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + devices->assign(&ids[0], &ids[n]); + return CL_SUCCESS; + } +#endif + + /*! \brief Gets a list of available platforms. + * + * Wraps clGetPlatformIDs(). + */ + static cl_int get( + VECTOR_CLASS* platforms) + { + cl_uint n = 0; + + if( platforms == NULL ) { + return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR); + } + + cl_int err = ::clGetPlatformIDs(0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + cl_platform_id* ids = (cl_platform_id*) alloca( + n * sizeof(cl_platform_id)); + err = ::clGetPlatformIDs(n, ids, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + platforms->assign(&ids[0], &ids[n]); + return CL_SUCCESS; + } + + /*! \brief Gets the first available platform. + * + * Wraps clGetPlatformIDs(), returning the first result. + */ + static cl_int get( + Platform * platform) + { + cl_uint n = 0; + + if( platform == NULL ) { + return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR); + } + + cl_int err = ::clGetPlatformIDs(0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + cl_platform_id* ids = (cl_platform_id*) alloca( + n * sizeof(cl_platform_id)); + err = ::clGetPlatformIDs(n, ids, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + *platform = ids[0]; + return CL_SUCCESS; + } + + /*! \brief Gets the first available platform, returning it by value. + * + * Wraps clGetPlatformIDs(), returning the first result. 
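+     *
+     * A minimal usage sketch (illustrative, not from the original
+     * documentation):
+     * \code
+     * cl_int err;
+     * cl::Platform platform = cl::Platform::get(&err);
+     * \endcode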
+ */ + static Platform get( + cl_int * errResult = NULL) + { + Platform platform; + cl_uint n = 0; + cl_int err = ::clGetPlatformIDs(0, NULL, &n); + if (err != CL_SUCCESS) { + detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + if (errResult != NULL) { + *errResult = err; + } + return Platform(); + } + + cl_platform_id* ids = (cl_platform_id*) alloca( + n * sizeof(cl_platform_id)); + err = ::clGetPlatformIDs(n, ids, NULL); + + if (err != CL_SUCCESS) { + detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + if (errResult != NULL) { + *errResult = err; + } + return Platform(); + } + + + return Platform(ids[0]); + } + + static Platform getDefault( + cl_int *errResult = NULL ) + { + return get(errResult); + } + + +#if defined(CL_VERSION_1_2) + //! \brief Wrapper for clUnloadCompiler(). + cl_int + unloadCompiler() + { + return ::clUnloadPlatformCompiler(object_); + } +#endif // #if defined(CL_VERSION_1_2) +}; // class Platform + +/** + * Deprecated APIs for 1.2 + */ +#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) +/** + * Unload the OpenCL compiler. + * \note Deprecated for OpenCL 1.2. Use Platform::unloadCompiler instead. + */ +inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int +UnloadCompiler() CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; +inline cl_int +UnloadCompiler() +{ + return ::clUnloadCompiler(); +} +#endif // #if defined(CL_VERSION_1_1) + +/*! \brief Class interface for cl_context. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_context as the original. For details, see + * clRetainContext() and clReleaseContext(). + * + * \see cl_context + */ +class Context + : public detail::Wrapper +{ +private: + +#ifdef CL_HPP_CPP11_ATOMICS_SUPPORTED + static std::atomic default_initialized_; +#else // !CL_HPP_CPP11_ATOMICS_SUPPORTED + static volatile int default_initialized_; +#endif // !CL_HPP_CPP11_ATOMICS_SUPPORTED + static Context default_; + static volatile cl_int default_error_; +public: + /*! \brief Constructs a context including a list of specified devices. + * + * Wraps clCreateContext(). + */ + Context( + const VECTOR_CLASS& devices, + cl_context_properties* properties = NULL, + void (CL_CALLBACK * notifyFptr)( + const char *, + const void *, + ::size_t, + void *) = NULL, + void* data = NULL, + cl_int* err = NULL) + { + cl_int error; + + ::size_t numDevices = devices.size(); + cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id)); + for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) { + deviceIDs[deviceIndex] = (devices[deviceIndex])(); + } + + object_ = ::clCreateContext( + properties, (cl_uint) numDevices, + deviceIDs, + notifyFptr, data, &error); + + detail::errHandler(error, __CREATE_CONTEXT_ERR); + if (err != NULL) { + *err = error; + } + } + + Context( + const Device& device, + cl_context_properties* properties = NULL, + void (CL_CALLBACK * notifyFptr)( + const char *, + const void *, + ::size_t, + void *) = NULL, + void* data = NULL, + cl_int* err = NULL) + { + cl_int error; + + cl_device_id deviceID = device(); + + object_ = ::clCreateContext( + properties, 1, + &deviceID, + notifyFptr, data, &error); + + detail::errHandler(error, __CREATE_CONTEXT_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! \brief Constructs a context including all or a subset of devices of a specified type. + * + * Wraps clCreateContextFromType(). 
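+     *
+     * Illustrative use (a sketch; error handling omitted):
+     * \code
+     * cl_int err;
+     * cl::Context context(CL_DEVICE_TYPE_GPU, NULL, NULL, NULL, &err);
+     * \endcode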
+ */ + Context( + cl_device_type type, + cl_context_properties* properties = NULL, + void (CL_CALLBACK * notifyFptr)( + const char *, + const void *, + ::size_t, + void *) = NULL, + void* data = NULL, + cl_int* err = NULL) + { + cl_int error; + +#if !defined(__APPLE__) && !defined(__MACOS) + cl_context_properties prop[4] = {CL_CONTEXT_PLATFORM, 0, 0, 0 }; + + if (properties == NULL) { + // Get a valid platform ID as we cannot send in a blank one + VECTOR_CLASS platforms; + error = Platform::get(&platforms); + if (error != CL_SUCCESS) { + detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); + if (err != NULL) { + *err = error; + } + return; + } + + // Check the platforms we found for a device of our specified type + cl_context_properties platform_id = 0; + for (unsigned int i = 0; i < platforms.size(); i++) { + + VECTOR_CLASS devices; + +#if defined(__CL_ENABLE_EXCEPTIONS) + try { +#endif + + error = platforms[i].getDevices(type, &devices); + +#if defined(__CL_ENABLE_EXCEPTIONS) + } catch (Error) {} + // Catch if exceptions are enabled as we don't want to exit if first platform has no devices of type + // We do error checking next anyway, and can throw there if needed +#endif + + // Only squash CL_SUCCESS and CL_DEVICE_NOT_FOUND + if (error != CL_SUCCESS && error != CL_DEVICE_NOT_FOUND) { + detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); + if (err != NULL) { + *err = error; + } + } + + if (devices.size() > 0) { + platform_id = (cl_context_properties)platforms[i](); + break; + } + } + + if (platform_id == 0) { + detail::errHandler(CL_DEVICE_NOT_FOUND, __CREATE_CONTEXT_FROM_TYPE_ERR); + if (err != NULL) { + *err = CL_DEVICE_NOT_FOUND; + } + return; + } + + prop[1] = platform_id; + properties = &prop[0]; + } +#endif + object_ = ::clCreateContextFromType( + properties, type, notifyFptr, data, &error); + + detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Context(const Context& ctx) : detail::Wrapper(ctx) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Context& operator = (const Context &ctx) + { + detail::Wrapper::operator=(ctx); + return *this; + } + +#if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Context(Context&& ctx) CL_HPP_NOEXCEPT : detail::Wrapper(std::move(ctx)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Context& operator = (Context &&ctx) + { + detail::Wrapper::operator=(std::move(ctx)); + return *this; + } +#endif // #if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + + /*! \brief Returns a singleton context including all devices of CL_DEVICE_TYPE_DEFAULT. + * + * \note All calls to this function return the same cl_context as the first. + */ + static Context getDefault(cl_int * err = NULL) + { + int state = detail::compare_exchange( + &default_initialized_, + __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED); + + if (state & __DEFAULT_INITIALIZED) { + if (err != NULL) { + *err = default_error_; + } + return default_; + } + + if (state & __DEFAULT_BEING_INITIALIZED) { + // Assume writes will propagate eventually... 
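+            // Another thread won the compare_exchange above and is running
+            // the initialization; spin until it publishes
+            // __DEFAULT_INITIALIZED. fence() stands in for an acquire load
+            // on pre-C++11 builds.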
+            while(default_initialized_ != __DEFAULT_INITIALIZED) {
+                detail::fence();
+            }
+
+            if (err != NULL) {
+                *err = default_error_;
+            }
+            return default_;
+        }
+
+        cl_int error;
+        default_ = Context(
+            CL_DEVICE_TYPE_DEFAULT,
+            NULL,
+            NULL,
+            NULL,
+            &error);
+
+        detail::fence();
+
+        default_error_ = error;
+        // Assume writes will propagate eventually...
+        default_initialized_ = __DEFAULT_INITIALIZED;
+
+        detail::fence();
+
+        if (err != NULL) {
+            *err = default_error_;
+        }
+        return default_;
+
+    }
+
+    //! \brief Default constructor - initializes to NULL.
+    Context() : detail::Wrapper<cl_type>() { }
+
+    /*! \brief Constructor from cl_context - takes ownership.
+     *
+     *  This effectively transfers ownership of a refcount on the cl_context
+     *  into the new Context object.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Context(const cl_context& context) : detail::Wrapper<cl_type>(context) { }
+
+    /*! \brief Assignment operator from cl_context - takes ownership.
+     *
+     *  This effectively transfers ownership of a refcount on the rhs and calls
+     *  clReleaseContext() on the value previously held by this instance.
+     */
+    Context& operator = (const cl_context& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Wrapper for clGetContextInfo().
+    template <typename T>
+    cl_int getInfo(cl_context_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetContextInfo, object_, name, param),
+            __GET_CONTEXT_INFO_ERR);
+    }
+
+    //! \brief Wrapper for clGetContextInfo() that returns by value.
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_context_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_context_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    /*! \brief Gets a list of supported image formats.
+     *
+     *  Wraps clGetSupportedImageFormats().
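+     *
+     *  Editor's sketch of a typical query ("context" is an assumed valid
+     *  cl::Context):
+     *  \code
+     *  VECTOR_CLASS<cl::ImageFormat> formats;
+     *  context.getSupportedImageFormats(
+     *      CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, &formats);
+     *  \endcode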
+     */
+    cl_int getSupportedImageFormats(
+        cl_mem_flags flags,
+        cl_mem_object_type type,
+        VECTOR_CLASS<ImageFormat>* formats) const
+    {
+        cl_uint numEntries;
+
+        if (!formats) {
+            return CL_SUCCESS;
+        }
+
+        cl_int err = ::clGetSupportedImageFormats(
+           object_,
+           flags,
+           type,
+           0,
+           NULL,
+           &numEntries);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR);
+        }
+
+        if (numEntries > 0) {
+            ImageFormat* value = (ImageFormat*)
+                alloca(numEntries * sizeof(ImageFormat));
+            err = ::clGetSupportedImageFormats(
+                object_,
+                flags,
+                type,
+                numEntries,
+                (cl_image_format*)value,
+                NULL);
+            if (err != CL_SUCCESS) {
+                return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR);
+            }
+
+            formats->assign(&value[0], &value[numEntries]);
+        }
+        else {
+            formats->clear();
+        }
+        return CL_SUCCESS;
+    }
+};
+
+inline Device Device::getDefault(cl_int * err)
+{
+    cl_int error;
+    Device device;
+
+    Context context = Context::getDefault(&error);
+    detail::errHandler(error, __CREATE_CONTEXT_ERR);
+
+    if (error != CL_SUCCESS) {
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+    else {
+        device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
+        if (err != NULL) {
+            *err = CL_SUCCESS;
+        }
+    }
+
+    return device;
+}
+
+
+#ifdef _WIN32
+#ifdef CL_HPP_CPP11_ATOMICS_SUPPORTED
+__declspec(selectany) std::atomic<int> Context::default_initialized_;
+#else // !CL_HPP_CPP11_ATOMICS_SUPPORTED
+__declspec(selectany) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
+#endif // !CL_HPP_CPP11_ATOMICS_SUPPORTED
+__declspec(selectany) Context Context::default_;
+__declspec(selectany) volatile cl_int Context::default_error_ = CL_SUCCESS;
+#else // !_WIN32
+#ifdef CL_HPP_CPP11_ATOMICS_SUPPORTED
+__attribute__((weak)) std::atomic<int> Context::default_initialized_;
+#else // !CL_HPP_CPP11_ATOMICS_SUPPORTED
+__attribute__((weak)) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
+#endif // !CL_HPP_CPP11_ATOMICS_SUPPORTED
+__attribute__((weak)) Context Context::default_;
+__attribute__((weak)) volatile cl_int Context::default_error_ = CL_SUCCESS;
+#endif // !_WIN32
+
+/*! \brief Class interface for cl_event.
+ *
+ * \note Copies of these objects are shallow, meaning that the copy will refer
+ * to the same underlying cl_event as the original.  For details, see
+ * clRetainEvent() and clReleaseEvent().
+ *
+ * \see cl_event
+ */
+class Event : public detail::Wrapper<cl_event>
+{
+public:
+    //! \brief Default constructor - initializes to NULL.
+    Event() : detail::Wrapper<cl_type>() { }
+
+    /*! \brief Constructor from cl_event - takes ownership.
+     *
+     *  This effectively transfers ownership of a refcount on the cl_event
+     *  into the new Event object.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Event(const cl_event& event) : detail::Wrapper<cl_type>(event) { }
+
+    /*! \brief Assignment operator from cl_event - takes ownership.
+     *
+     *  This effectively transfers ownership of a refcount on the rhs and calls
+     *  clReleaseEvent() on the value previously held by this instance.
+     */
+    Event& operator = (const cl_event& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Wrapper for clGetEventInfo().
+    template <typename T>
+    cl_int getInfo(cl_event_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetEventInfo, object_, name, param),
+            __GET_EVENT_INFO_ERR);
+    }
+
+    //! \brief Wrapper for clGetEventInfo() that returns by value.
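+    //!
+    //!  Editor's sketch of a by-value query ("ev" is an assumed valid
+    //!  cl::Event):
+    //!  \code
+    //!  cl_int status = ev.getInfo<CL_EVENT_COMMAND_EXECUTION_STATUS>();
+    //!  \endcode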
+ template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_event_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + //! \brief Wrapper for clGetEventProfilingInfo(). + template + cl_int getProfilingInfo(cl_profiling_info name, T* param) const + { + return detail::errHandler(detail::getInfo( + &::clGetEventProfilingInfo, object_, name, param), + __GET_EVENT_PROFILE_INFO_ERR); + } + + //! \brief Wrapper for clGetEventProfilingInfo() that returns by value. + template typename + detail::param_traits::param_type + getProfilingInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_profiling_info, name>::param_type param; + cl_int result = getProfilingInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + /*! \brief Blocks the calling thread until this event completes. + * + * Wraps clWaitForEvents(). + */ + cl_int wait() const + { + return detail::errHandler( + ::clWaitForEvents(1, &object_), + __WAIT_FOR_EVENTS_ERR); + } + +#if defined(CL_VERSION_1_1) + /*! \brief Registers a user callback function for a specific command execution status. + * + * Wraps clSetEventCallback(). + */ + cl_int setCallback( + cl_int type, + void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *), + void * user_data = NULL) + { + return detail::errHandler( + ::clSetEventCallback( + object_, + type, + pfn_notify, + user_data), + __SET_EVENT_CALLBACK_ERR); + } +#endif + + /*! \brief Blocks the calling thread until every event specified is complete. + * + * Wraps clWaitForEvents(). + */ + static cl_int + waitForEvents(const VECTOR_CLASS& events) + { + return detail::errHandler( + ::clWaitForEvents( + (cl_uint) events.size(), (events.size() > 0) ? (cl_event*)&events.front() : NULL), + __WAIT_FOR_EVENTS_ERR); + } +}; + +#if defined(CL_VERSION_1_1) +/*! \brief Class interface for user events (a subset of cl_event's). + * + * See Event for details about copy semantics, etc. + */ +class UserEvent : public Event +{ +public: + /*! \brief Constructs a user event on a given context. + * + * Wraps clCreateUserEvent(). + */ + UserEvent( + const Context& context, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateUserEvent( + context(), + &error); + + detail::errHandler(error, __CREATE_USER_EVENT_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + UserEvent() : Event() { } + + /*! \brief Sets the execution status of a user event object. + * + * Wraps clSetUserEventStatus(). + */ + cl_int setStatus(cl_int status) + { + return detail::errHandler( + ::clSetUserEventStatus(object_,status), + __SET_USER_EVENT_STATUS_ERR); + } +}; +#endif + +/*! \brief Blocks the calling thread until every event specified is complete. + * + * Wraps clWaitForEvents(). + */ +inline static cl_int +WaitForEvents(const VECTOR_CLASS& events) +{ + return detail::errHandler( + ::clWaitForEvents( + (cl_uint) events.size(), (events.size() > 0) ? (cl_event*)&events.front() : NULL), + __WAIT_FOR_EVENTS_ERR); +} + +/*! \brief Class interface for cl_mem. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_mem as the original. For details, see + * clRetainMemObject() and clReleaseMemObject(). + * + * \see cl_mem + */ +class Memory : public detail::Wrapper +{ +public: + //! \brief Default constructor - initializes to NULL. 
+ Memory() : detail::Wrapper() { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * This effectively transfers ownership of a refcount on the cl_mem + * into the new Memory object. + */ + __CL_EXPLICIT_CONSTRUCTORS Memory(const cl_mem& memory) : detail::Wrapper(memory) { } + + /*! \brief Assignment operator from cl_mem - takes ownership. + * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseMemObject() on the value previously held by this instance. + */ + Memory& operator = (const cl_mem& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Memory(const Memory& mem) : detail::Wrapper(mem) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Memory& operator = (const Memory &mem) + { + detail::Wrapper::operator=(mem); + return *this; + } + +#if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Memory(Memory&& mem) CL_HPP_NOEXCEPT : detail::Wrapper(std::move(mem)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Memory& operator = (Memory &&mem) + { + detail::Wrapper::operator=(std::move(mem)); + return *this; + } +#endif // #if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + + //! \brief Wrapper for clGetMemObjectInfo(). + template + cl_int getInfo(cl_mem_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetMemObjectInfo, object_, name, param), + __GET_MEM_OBJECT_INFO_ERR); + } + + //! \brief Wrapper for clGetMemObjectInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_mem_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + +#if defined(CL_VERSION_1_1) + /*! \brief Registers a callback function to be called when the memory object + * is no longer needed. + * + * Wraps clSetMemObjectDestructorCallback(). + * + * Repeated calls to this function, for a given cl_mem value, will append + * to the list of functions called (in reverse order) when memory object's + * resources are freed and the memory object is deleted. + * + * \note + * The registered callbacks are associated with the underlying cl_mem + * value - not the Memory class instance. + */ + cl_int setDestructorCallback( + void (CL_CALLBACK * pfn_notify)(cl_mem, void *), + void * user_data = NULL) + { + return detail::errHandler( + ::clSetMemObjectDestructorCallback( + object_, + pfn_notify, + user_data), + __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR); + } +#endif + +}; + +// Pre-declare copy functions +class Buffer; +template< typename IteratorType > +cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer ); +template< typename IteratorType > +cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator ); +template< typename IteratorType > +cl_int copy( const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer ); +template< typename IteratorType > +cl_int copy( const CommandQueue &queue, const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator ); + + +/*! \brief Class interface for Buffer Memory Objects. 
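+ *
+ *  Editor's sketch of the iterator-based constructor defined below
+ *  ("data" is an assumed host container; its contents are copied to the
+ *  device through the default context and queue):
+ *  \code
+ *  std::vector<float> data(1024, 0.0f);
+ *  cl::Buffer buf(data.begin(), data.end(), false);  // readOnly = false
+ *  \endcode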
+ * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Buffer : public Memory +{ +public: + + /*! \brief Constructs a Buffer in a specified context. + * + * Wraps clCreateBuffer(). + * + * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was + * specified. Note alignment & exclusivity requirements. + */ + Buffer( + const Context& context, + cl_mem_flags flags, + ::size_t size, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error); + + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! \brief Constructs a Buffer in the default context. + * + * Wraps clCreateBuffer(). + * + * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was + * specified. Note alignment & exclusivity requirements. + * + * \see Context::getDefault() + */ + Buffer( + cl_mem_flags flags, + ::size_t size, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + + Context context = Context::getDefault(err); + + object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error); + + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! + * \brief Construct a Buffer from a host container via iterators. + * IteratorType must be random access. + * If useHostPtr is specified iterators must represent contiguous data. + */ + template< typename IteratorType > + Buffer( + IteratorType startIterator, + IteratorType endIterator, + bool readOnly, + bool useHostPtr = false, + cl_int* err = NULL) + { + typedef typename std::iterator_traits::value_type DataType; + cl_int error; + + cl_mem_flags flags = 0; + if( readOnly ) { + flags |= CL_MEM_READ_ONLY; + } + else { + flags |= CL_MEM_READ_WRITE; + } + if( useHostPtr ) { + flags |= CL_MEM_USE_HOST_PTR; + } + + ::size_t size = sizeof(DataType)*(endIterator - startIterator); + + Context context = Context::getDefault(err); + + if( useHostPtr ) { + object_ = ::clCreateBuffer(context(), flags, size, static_cast(&*startIterator), &error); + } else { + object_ = ::clCreateBuffer(context(), flags, size, 0, &error); + } + + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + + if( !useHostPtr ) { + error = cl::copy(startIterator, endIterator, *this); + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + } + + /*! + * \brief Construct a Buffer from a host container via iterators using a specified context. + * IteratorType must be random access. + * If useHostPtr is specified iterators must represent contiguous data. + */ + template< typename IteratorType > + Buffer(const Context &context, IteratorType startIterator, IteratorType endIterator, + bool readOnly, bool useHostPtr = false, cl_int* err = NULL); + + /*! + * \brief Construct a Buffer from a host container via iterators using a specified queue. + * If useHostPtr is specified iterators must represent contiguous data. + */ + template< typename IteratorType > + Buffer(const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, + bool readOnly, bool useHostPtr = false, cl_int* err = NULL); + + //! \brief Default constructor - initializes to NULL. + Buffer() : Memory() { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Buffer(const cl_mem& buffer) : Memory(buffer) { } + + /*! 
\brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Buffer& operator = (const cl_mem& rhs) + { + Memory::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Buffer(const Buffer& buf) : Memory(buf) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Buffer& operator = (const Buffer &buf) + { + Memory::operator=(buf); + return *this; + } + +#if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Buffer(Buffer&& buf) CL_HPP_NOEXCEPT : Memory(std::move(buf)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Buffer& operator = (Buffer &&buf) + { + Memory::operator=(std::move(buf)); + return *this; + } +#endif // #if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + +#if defined(CL_VERSION_1_1) + /*! \brief Creates a new buffer object from this. + * + * Wraps clCreateSubBuffer(). + */ + Buffer createSubBuffer( + cl_mem_flags flags, + cl_buffer_create_type buffer_create_type, + const void * buffer_create_info, + cl_int * err = NULL) + { + Buffer result; + cl_int error; + result.object_ = ::clCreateSubBuffer( + object_, + flags, + buffer_create_type, + buffer_create_info, + &error); + + detail::errHandler(error, __CREATE_SUBBUFFER_ERR); + if (err != NULL) { + *err = error; + } + + return result; + } +#endif +}; + +#if defined (USE_DX_INTEROP) +/*! \brief Class interface for creating OpenCL buffers from ID3D10Buffer's. + * + * This is provided to facilitate interoperability with Direct3D. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class BufferD3D10 : public Buffer +{ +public: + typedef CL_API_ENTRY cl_mem (CL_API_CALL *PFN_clCreateFromD3D10BufferKHR)( + cl_context context, cl_mem_flags flags, ID3D10Buffer* buffer, + cl_int* errcode_ret); + + /*! \brief Constructs a BufferD3D10, in a specified context, from a + * given ID3D10Buffer. + * + * Wraps clCreateFromD3D10BufferKHR(). + */ + BufferD3D10( + const Context& context, + cl_mem_flags flags, + ID3D10Buffer* bufobj, + cl_int * err = NULL) + { + static PFN_clCreateFromD3D10BufferKHR pfn_clCreateFromD3D10BufferKHR = NULL; + +#if defined(CL_VERSION_1_2) + vector props = context.getInfo(); + cl_platform platform = -1; + for( int i = 0; i < props.size(); ++i ) { + if( props[i] == CL_CONTEXT_PLATFORM ) { + platform = props[i+1]; + } + } + __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clCreateFromD3D10BufferKHR); +#endif +#if defined(CL_VERSION_1_1) + __INIT_CL_EXT_FCN_PTR(clCreateFromD3D10BufferKHR); +#endif + + cl_int error; + object_ = pfn_clCreateFromD3D10BufferKHR( + context(), + flags, + bufobj, + &error); + + detail::errHandler(error, __CREATE_GL_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + BufferD3D10() : Buffer() { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS BufferD3D10(const cl_mem& buffer) : Buffer(buffer) { } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + BufferD3D10& operator = (const cl_mem& rhs) + { + Buffer::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. 
+ * Required for MSVC. + */ + BufferD3D10(const BufferD3D10& buf) : Buffer(buf) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + BufferD3D10& operator = (const BufferD3D10 &buf) + { + Buffer::operator=(buf); + return *this; + } + +#if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + BufferD3D10(BufferD3D10&& buf) CL_HPP_NOEXCEPT : Buffer(std::move(buf)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + BufferD3D10& operator = (BufferD3D10 &&buf) + { + Buffer::operator=(std::move(buf)); + return *this; + } +#endif // #if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) +}; +#endif + +/*! \brief Class interface for GL Buffer Memory Objects. + * + * This is provided to facilitate interoperability with OpenGL. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class BufferGL : public Buffer +{ +public: + /*! \brief Constructs a BufferGL in a specified context, from a given + * GL buffer. + * + * Wraps clCreateFromGLBuffer(). + */ + BufferGL( + const Context& context, + cl_mem_flags flags, + cl_GLuint bufobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLBuffer( + context(), + flags, + bufobj, + &error); + + detail::errHandler(error, __CREATE_GL_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + BufferGL() : Buffer() { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS BufferGL(const cl_mem& buffer) : Buffer(buffer) { } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + BufferGL& operator = (const cl_mem& rhs) + { + Buffer::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + BufferGL(const BufferGL& buf) : Buffer(buf) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + BufferGL& operator = (const BufferGL &buf) + { + Buffer::operator=(buf); + return *this; + } + +#if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + BufferGL(BufferGL&& buf) CL_HPP_NOEXCEPT : Buffer(std::move(buf)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + BufferGL& operator = (BufferGL &&buf) + { + Buffer::operator=(std::move(buf)); + return *this; + } +#endif // #if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + + //! \brief Wrapper for clGetGLObjectInfo(). + cl_int getObjectInfo( + cl_gl_object_type *type, + cl_GLuint * gl_object_name) + { + return detail::errHandler( + ::clGetGLObjectInfo(object_,type,gl_object_name), + __GET_GL_OBJECT_INFO_ERR); + } +}; + +/*! \brief C++ base class for Image Memory objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Image : public Memory +{ +protected: + //! \brief Default constructor - initializes to NULL. + Image() : Memory() { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image(const cl_mem& image) : Memory(image) { } + + /*! 
\brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image& operator = (const cl_mem& rhs) + { + Memory::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image(const Image& img) : Memory(img) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image& operator = (const Image &img) + { + Memory::operator=(img); + return *this; + } + +#if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Image(Image&& img) CL_HPP_NOEXCEPT : Memory(std::move(img)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Image& operator = (Image &&img) + { + Memory::operator=(std::move(img)); + return *this; + } +#endif // #if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + +public: + //! \brief Wrapper for clGetImageInfo(). + template + cl_int getImageInfo(cl_image_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetImageInfo, object_, name, param), + __GET_IMAGE_INFO_ERR); + } + + //! \brief Wrapper for clGetImageInfo() that returns by value. + template typename + detail::param_traits::param_type + getImageInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_image_info, name>::param_type param; + cl_int result = getImageInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } +}; + +#if defined(CL_VERSION_1_2) +/*! \brief Class interface for 1D Image Memory objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Image1D : public Image +{ +public: + /*! \brief Constructs a 1D Image in a specified context. + * + * Wraps clCreateImage(). + */ + Image1D( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t width, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc = + { + CL_MEM_OBJECT_IMAGE1D, + width, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + Image1D() { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image1D(const cl_mem& image1D) : Image(image1D) { } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image1D& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image1D(const Image1D& img) : Image(img) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image1D& operator = (const Image1D &img) + { + Image::operator=(img); + return *this; + } + +#if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Image1D(Image1D&& img) CL_HPP_NOEXCEPT : Image(std::move(img)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. 
+ */ + Image1D& operator = (Image1D &&img) + { + Image::operator=(std::move(img)); + return *this; + } +#endif // #if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) +}; + +/*! \class Image1DBuffer + * \brief Image interface for 1D buffer images. + */ +class Image1DBuffer : public Image +{ +public: + Image1DBuffer( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t width, + const Buffer &buffer, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc = + { + CL_MEM_OBJECT_IMAGE1D_BUFFER, + width, + 0, 0, 0, 0, 0, 0, 0, + buffer() + }; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + NULL, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + Image1DBuffer() { } + + __CL_EXPLICIT_CONSTRUCTORS Image1DBuffer(const cl_mem& image1D) : Image(image1D) { } + + Image1DBuffer& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image1DBuffer(const Image1DBuffer& img) : Image(img) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image1DBuffer& operator = (const Image1DBuffer &img) + { + Image::operator=(img); + return *this; + } + +#if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Image1DBuffer(Image1DBuffer&& img) CL_HPP_NOEXCEPT : Image(std::move(img)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Image1DBuffer& operator = (Image1DBuffer &&img) + { + Image::operator=(std::move(img)); + return *this; + } +#endif // #if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) +}; + +/*! \class Image1DArray + * \brief Image interface for arrays of 1D images. + */ +class Image1DArray : public Image +{ +public: + Image1DArray( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t arraySize, + ::size_t width, + ::size_t rowPitch, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc = + { + CL_MEM_OBJECT_IMAGE1D_ARRAY, + width, + 0, 0, // height, depth (unused) + arraySize, + rowPitch, + 0, 0, 0, 0 + }; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + Image1DArray() { } + + __CL_EXPLICIT_CONSTRUCTORS Image1DArray(const cl_mem& imageArray) : Image(imageArray) { } + + Image1DArray& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image1DArray(const Image1DArray& img) : Image(img) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image1DArray& operator = (const Image1DArray &img) + { + Image::operator=(img); + return *this; + } + +#if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Image1DArray(Image1DArray&& img) CL_HPP_NOEXCEPT : Image(std::move(img)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. 
+ */ + Image1DArray& operator = (Image1DArray &&img) + { + Image::operator=(std::move(img)); + return *this; + } +#endif // #if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) +}; +#endif // #if defined(CL_VERSION_1_2) + + +/*! \brief Class interface for 2D Image Memory objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Image2D : public Image +{ +public: + /*! \brief Constructs a 1D Image in a specified context. + * + * Wraps clCreateImage(). + */ + Image2D( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t width, + ::size_t height, + ::size_t row_pitch = 0, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + bool useCreateImage; + +#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + // Run-time decision based on the actual platform + { + cl_uint version = detail::getContextPlatformVersion(context()); + useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above + } +#elif defined(CL_VERSION_1_2) + useCreateImage = true; +#else + useCreateImage = false; +#endif + +#if defined(CL_VERSION_1_2) + if (useCreateImage) + { + cl_image_desc desc = + { + CL_MEM_OBJECT_IMAGE2D, + width, + height, + 0, 0, // depth, array size (unused) + row_pitch, + 0, 0, 0, 0 + }; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // #if defined(CL_VERSION_1_2) +#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + if (!useCreateImage) + { + object_ = ::clCreateImage2D( + context(), flags,&format, width, height, row_pitch, host_ptr, &error); + + detail::errHandler(error, __CREATE_IMAGE2D_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + } + + //! \brief Default constructor - initializes to NULL. + Image2D() { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image2D(const cl_mem& image2D) : Image(image2D) { } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image2D& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image2D(const Image2D& img) : Image(img) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image2D& operator = (const Image2D &img) + { + Image::operator=(img); + return *this; + } + +#if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Image2D(Image2D&& img) CL_HPP_NOEXCEPT : Image(std::move(img)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Image2D& operator = (Image2D &&img) + { + Image::operator=(std::move(img)); + return *this; + } +#endif // #if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) +}; + + +#if !defined(CL_VERSION_1_2) +/*! \brief Class interface for GL 2D Image Memory objects. + * + * This is provided to facilitate interoperability with OpenGL. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + * \note Deprecated for OpenCL 1.2. Please use ImageGL instead. 
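+ *
+ *  Editor's sketch of the post-1.2 replacement (same texture arguments,
+ *  via ImageGL; assumes the GL headers provide GL_TEXTURE_2D and that
+ *  "context", "texobj", and "err" are valid):
+ *  \code
+ *  cl::ImageGL img(context, CL_MEM_READ_WRITE, GL_TEXTURE_2D, 0, texobj, &err);
+ *  \endcode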
+ */ +class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED : public Image2D +{ +public: + /*! \brief Constructs an Image2DGL in a specified context, from a given + * GL Texture. + * + * Wraps clCreateFromGLTexture2D(). + */ + Image2DGL( + const Context& context, + cl_mem_flags flags, + cl_GLenum target, + cl_GLint miplevel, + cl_GLuint texobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLTexture2D( + context(), + flags, + target, + miplevel, + texobj, + &error); + + detail::errHandler(error, __CREATE_GL_TEXTURE_2D_ERR); + if (err != NULL) { + *err = error; + } + + } + + //! \brief Default constructor - initializes to NULL. + Image2DGL() : Image2D() { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image2DGL(const cl_mem& image) : Image2D(image) { } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image2DGL& operator = (const cl_mem& rhs) + { + Image2D::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image2DGL(const Image2DGL& img) : Image2D(img) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image2DGL& operator = (const Image2DGL &img) + { + Image2D::operator=(img); + return *this; + } + +#if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Image2DGL(Image2DGL&& img) CL_HPP_NOEXCEPT : Image2D(std::move(img)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Image2DGL& operator = (Image2DGL &&img) + { + Image2D::operator=(std::move(img)); + return *this; + } +#endif // #if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) +}; +#endif // #if !defined(CL_VERSION_1_2) + +#if defined(CL_VERSION_1_2) +/*! \class Image2DArray + * \brief Image interface for arrays of 2D images. + */ +class Image2DArray : public Image +{ +public: + Image2DArray( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t arraySize, + ::size_t width, + ::size_t height, + ::size_t rowPitch, + ::size_t slicePitch, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc = + { + CL_MEM_OBJECT_IMAGE2D_ARRAY, + width, + height, + 0, // depth (unused) + arraySize, + rowPitch, + slicePitch, + 0, 0, 0 + }; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + Image2DArray() { } + + __CL_EXPLICIT_CONSTRUCTORS Image2DArray(const cl_mem& imageArray) : Image(imageArray) { } + + Image2DArray& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image2DArray(const Image2DArray& img) : Image(img) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image2DArray& operator = (const Image2DArray &img) + { + Image::operator=(img); + return *this; + } + +#if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. 
+ */ + Image2DArray(Image2DArray&& img) CL_HPP_NOEXCEPT : Image(std::move(img)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Image2DArray& operator = (Image2DArray &&img) + { + Image::operator=(std::move(img)); + return *this; + } +#endif // #if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) +}; +#endif // #if defined(CL_VERSION_1_2) + +/*! \brief Class interface for 3D Image Memory objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Image3D : public Image +{ +public: + /*! \brief Constructs a 3D Image in a specified context. + * + * Wraps clCreateImage(). + */ + Image3D( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t width, + ::size_t height, + ::size_t depth, + ::size_t row_pitch = 0, + ::size_t slice_pitch = 0, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + bool useCreateImage; + +#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + // Run-time decision based on the actual platform + { + cl_uint version = detail::getContextPlatformVersion(context()); + useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above + } +#elif defined(CL_VERSION_1_2) + useCreateImage = true; +#else + useCreateImage = false; +#endif + +#if defined(CL_VERSION_1_2) + if (useCreateImage) + { + cl_image_desc desc = + { + CL_MEM_OBJECT_IMAGE3D, + width, + height, + depth, + 0, // array size (unused) + row_pitch, + slice_pitch, + 0, 0, 0 + }; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // #if defined(CL_VERSION_1_2) +#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + if (!useCreateImage) + { + object_ = ::clCreateImage3D( + context(), flags, &format, width, height, depth, row_pitch, + slice_pitch, host_ptr, &error); + + detail::errHandler(error, __CREATE_IMAGE3D_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + } + + //! \brief Default constructor - initializes to NULL. + Image3D() : Image() { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image3D(const cl_mem& image3D) : Image(image3D) { } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image3D& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image3D(const Image3D& img) : Image(img) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image3D& operator = (const Image3D &img) + { + Image::operator=(img); + return *this; + } + +#if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Image3D(Image3D&& img) CL_HPP_NOEXCEPT : Image(std::move(img)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Image3D& operator = (Image3D &&img) + { + Image::operator=(std::move(img)); + return *this; + } +#endif // #if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) +}; + +#if !defined(CL_VERSION_1_2) +/*! 
\brief Class interface for GL 3D Image Memory objects. + * + * This is provided to facilitate interoperability with OpenGL. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Image3DGL : public Image3D +{ +public: + /*! \brief Constructs an Image3DGL in a specified context, from a given + * GL Texture. + * + * Wraps clCreateFromGLTexture3D(). + */ + Image3DGL( + const Context& context, + cl_mem_flags flags, + cl_GLenum target, + cl_GLint miplevel, + cl_GLuint texobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLTexture3D( + context(), + flags, + target, + miplevel, + texobj, + &error); + + detail::errHandler(error, __CREATE_GL_TEXTURE_3D_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + Image3DGL() : Image3D() { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image3DGL(const cl_mem& image) : Image3D(image) { } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image3DGL& operator = (const cl_mem& rhs) + { + Image3D::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image3DGL(const Image3DGL& img) : Image3D(img) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image3DGL& operator = (const Image3DGL &img) + { + Image3D::operator=(img); + return *this; + } + +#if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Image3DGL(Image3DGL&& img) CL_HPP_NOEXCEPT : Image3D(std::move(img)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Image3DGL& operator = (Image3DGL &&img) + { + Image3D::operator=(std::move(img)); + return *this; + } +#endif // #if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) +}; +#endif // #if !defined(CL_VERSION_1_2) + +#if defined(CL_VERSION_1_2) +/*! \class ImageGL + * \brief general image interface for GL interop. + * We abstract the 2D and 3D GL images into a single instance here + * that wraps all GL sourced images on the grounds that setup information + * was performed by OpenCL anyway. + */ +class ImageGL : public Image +{ +public: + ImageGL( + const Context& context, + cl_mem_flags flags, + cl_GLenum target, + cl_GLint miplevel, + cl_GLuint texobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLTexture( + context(), + flags, + target, + miplevel, + texobj, + &error); + + detail::errHandler(error, __CREATE_GL_TEXTURE_ERR); + if (err != NULL) { + *err = error; + } + } + + ImageGL() : Image() { } + + __CL_EXPLICIT_CONSTRUCTORS ImageGL(const cl_mem& image) : Image(image) { } + + ImageGL& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + ImageGL(const ImageGL& img) : Image(img) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + ImageGL& operator = (const ImageGL &img) + { + Image::operator=(img); + return *this; + } + +#if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. 
+ */ + ImageGL(ImageGL&& img) CL_HPP_NOEXCEPT : Image(std::move(img)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + ImageGL& operator = (ImageGL &&img) + { + Image::operator=(std::move(img)); + return *this; + } +#endif // #if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) +}; +#endif // #if defined(CL_VERSION_1_2) + +/*! \brief Class interface for GL Render Buffer Memory Objects. +* +* This is provided to facilitate interoperability with OpenGL. +* +* See Memory for details about copy semantics, etc. +* +* \see Memory +*/ +class BufferRenderGL : +#if defined(CL_VERSION_1_2) + public ImageGL +#else // #if defined(CL_VERSION_1_2) + public Image2DGL +#endif //#if defined(CL_VERSION_1_2) +{ +public: + /*! \brief Constructs a BufferRenderGL in a specified context, from a given + * GL Renderbuffer. + * + * Wraps clCreateFromGLRenderbuffer(). + */ + BufferRenderGL( + const Context& context, + cl_mem_flags flags, + cl_GLuint bufobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLRenderbuffer( + context(), + flags, + bufobj, + &error); + + detail::errHandler(error, __CREATE_GL_RENDER_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. +#if defined(CL_VERSION_1_2) + BufferRenderGL() : ImageGL() {}; +#else // #if defined(CL_VERSION_1_2) + BufferRenderGL() : Image2DGL() {}; +#endif //#if defined(CL_VERSION_1_2) + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ +#if defined(CL_VERSION_1_2) + __CL_EXPLICIT_CONSTRUCTORS BufferRenderGL(const cl_mem& buffer) : ImageGL(buffer) { } +#else // #if defined(CL_VERSION_1_2) + __CL_EXPLICIT_CONSTRUCTORS BufferRenderGL(const cl_mem& buffer) : Image2DGL(buffer) { } +#endif //#if defined(CL_VERSION_1_2) + + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + BufferRenderGL& operator = (const cl_mem& rhs) + { +#if defined(CL_VERSION_1_2) + ImageGL::operator=(rhs); +#else // #if defined(CL_VERSION_1_2) + Image2DGL::operator=(rhs); +#endif //#if defined(CL_VERSION_1_2) + + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ +#if defined(CL_VERSION_1_2) + BufferRenderGL(const BufferRenderGL& buf) : ImageGL(buf) {} +#else // #if defined(CL_VERSION_1_2) + BufferRenderGL(const BufferRenderGL& buf) : Image2DGL(buf) {} +#endif //#if defined(CL_VERSION_1_2) + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + BufferRenderGL& operator = (const BufferRenderGL &rhs) + { +#if defined(CL_VERSION_1_2) + ImageGL::operator=(rhs); +#else // #if defined(CL_VERSION_1_2) + Image2DGL::operator=(rhs); +#endif //#if defined(CL_VERSION_1_2) + return *this; + } + +#if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ +#if defined(CL_VERSION_1_2) + BufferRenderGL(BufferRenderGL&& buf) CL_HPP_NOEXCEPT : ImageGL(std::move(buf)) {} +#else // #if defined(CL_VERSION_1_2) + BufferRenderGL(BufferRenderGL&& buf) CL_HPP_NOEXCEPT : Image2DGL(std::move(buf)) {} +#endif //#if defined(CL_VERSION_1_2) + + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. 
+ */ + BufferRenderGL& operator = (BufferRenderGL &&buf) + { +#if defined(CL_VERSION_1_2) + ImageGL::operator=(std::move(buf)); +#else // #if defined(CL_VERSION_1_2) + Image2DGL::operator=(std::move(buf)); +#endif //#if defined(CL_VERSION_1_2) + + return *this; + } +#endif // #if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + + //! \brief Wrapper for clGetGLObjectInfo(). + cl_int getObjectInfo( + cl_gl_object_type *type, + cl_GLuint * gl_object_name) + { + return detail::errHandler( + ::clGetGLObjectInfo(object_, type, gl_object_name), + __GET_GL_OBJECT_INFO_ERR); + } +}; + +/*! \brief Class interface for cl_sampler. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_sampler as the original. For details, see + * clRetainSampler() and clReleaseSampler(). + * + * \see cl_sampler + */ +class Sampler : public detail::Wrapper +{ +public: + //! \brief Default constructor - initializes to NULL. + Sampler() { } + + /*! \brief Constructs a Sampler in a specified context. + * + * Wraps clCreateSampler(). + */ + Sampler( + const Context& context, + cl_bool normalized_coords, + cl_addressing_mode addressing_mode, + cl_filter_mode filter_mode, + cl_int* err = NULL) + { + cl_int error; + object_ = ::clCreateSampler( + context(), + normalized_coords, + addressing_mode, + filter_mode, + &error); + + detail::errHandler(error, __CREATE_SAMPLER_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! \brief Constructor from cl_sampler - takes ownership. + * + * This effectively transfers ownership of a refcount on the cl_sampler + * into the new Sampler object. + */ + __CL_EXPLICIT_CONSTRUCTORS Sampler(const cl_sampler& sampler) : detail::Wrapper(sampler) { } + + /*! \brief Assignment operator from cl_sampler - takes ownership. + * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseSampler() on the value previously held by this instance. + */ + Sampler& operator = (const cl_sampler& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Sampler(const Sampler& sam) : detail::Wrapper(sam) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Sampler& operator = (const Sampler &sam) + { + detail::Wrapper::operator=(sam); + return *this; + } + +#if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Sampler(Sampler&& sam) CL_HPP_NOEXCEPT : detail::Wrapper(std::move(sam)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Sampler& operator = (Sampler &&sam) + { + detail::Wrapper::operator=(std::move(sam)); + return *this; + } +#endif // #if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + + //! \brief Wrapper for clGetSamplerInfo(). + template + cl_int getInfo(cl_sampler_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetSamplerInfo, object_, name, param), + __GET_SAMPLER_INFO_ERR); + } + + //! \brief Wrapper for clGetSamplerInfo() that returns by value. 
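+    //!
+    //!  Editor's sketch ("sampler" is an assumed valid cl::Sampler):
+    //!  \code
+    //!  cl_bool normalized = sampler.getInfo<CL_SAMPLER_NORMALIZED_COORDS>();
+    //!  \endcode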
+ template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_sampler_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } +}; + +class Program; +class CommandQueue; +class Kernel; + +//! \brief Class interface for specifying NDRange values. +class NDRange +{ +private: + size_t<3> sizes_; + cl_uint dimensions_; + +public: + //! \brief Default constructor - resulting range has zero dimensions. + NDRange() + : dimensions_(0) + { } + + //! \brief Constructs one-dimensional range. + NDRange(::size_t size0) + : dimensions_(1) + { + sizes_[0] = size0; + } + + //! \brief Constructs two-dimensional range. + NDRange(::size_t size0, ::size_t size1) + : dimensions_(2) + { + sizes_[0] = size0; + sizes_[1] = size1; + } + + //! \brief Constructs three-dimensional range. + NDRange(::size_t size0, ::size_t size1, ::size_t size2) + : dimensions_(3) + { + sizes_[0] = size0; + sizes_[1] = size1; + sizes_[2] = size2; + } + + /*! \brief Conversion operator to const ::size_t *. + * + * \returns a pointer to the size of the first dimension. + */ + operator const ::size_t*() const { + return (const ::size_t*) sizes_; + } + + //! \brief Queries the number of dimensions in the range. + ::size_t dimensions() const { return dimensions_; } +}; + +//! \brief A zero-dimensional range. +static const NDRange NullRange; + +//! \brief Local address wrapper for use with Kernel::setArg +struct LocalSpaceArg +{ + ::size_t size_; +}; + +namespace detail { + +template +struct KernelArgumentHandler +{ + static ::size_t size(const T&) { return sizeof(T); } + static const T* ptr(const T& value) { return &value; } +}; + +template <> +struct KernelArgumentHandler +{ + static ::size_t size(const LocalSpaceArg& value) { return value.size_; } + static const void* ptr(const LocalSpaceArg&) { return NULL; } +}; + +} +//! \endcond + +/*! __local + * \brief Helper function for generating LocalSpaceArg objects. + * Deprecated. Replaced with Local. + */ +inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED LocalSpaceArg +__local(::size_t size) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; +inline LocalSpaceArg +__local(::size_t size) +{ + LocalSpaceArg ret = { size }; + return ret; +} + +/*! Local + * \brief Helper function for generating LocalSpaceArg objects. + */ +inline LocalSpaceArg +Local(::size_t size) +{ + LocalSpaceArg ret = { size }; + return ret; +} + +//class KernelFunctor; + +/*! \brief Class interface for cl_kernel. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_kernel as the original. For details, see + * clRetainKernel() and clReleaseKernel(). + * + * \see cl_kernel + */ +class Kernel : public detail::Wrapper +{ +public: + inline Kernel(const Program& program, const char* name, cl_int* err = NULL); + + //! \brief Default constructor - initializes to NULL. + Kernel() { } + + /*! \brief Constructor from cl_kernel - takes ownership. + * + * This effectively transfers ownership of a refcount on the cl_kernel + * into the new Kernel object. + */ + __CL_EXPLICIT_CONSTRUCTORS Kernel(const cl_kernel& kernel) : detail::Wrapper(kernel) { } + + /*! \brief Assignment operator from cl_kernel - takes ownership. + * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseKernel() on the value previously held by this instance. 
+ */ + Kernel& operator = (const cl_kernel& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Kernel(const Kernel& kernel) : detail::Wrapper(kernel) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Kernel& operator = (const Kernel &kernel) + { + detail::Wrapper::operator=(kernel); + return *this; + } + +#if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Kernel(Kernel&& kernel) CL_HPP_NOEXCEPT : detail::Wrapper(std::move(kernel)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Kernel& operator = (Kernel &&kernel) + { + detail::Wrapper::operator=(std::move(kernel)); + return *this; + } +#endif // #if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED) + + template + cl_int getInfo(cl_kernel_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetKernelInfo, object_, name, param), + __GET_KERNEL_INFO_ERR); + } + + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_kernel_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + +#if defined(CL_VERSION_1_2) + template + cl_int getArgInfo(cl_uint argIndex, cl_kernel_arg_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetKernelArgInfo, object_, argIndex, name, param), + __GET_KERNEL_ARG_INFO_ERR); + } + + template typename + detail::param_traits::param_type + getArgInfo(cl_uint argIndex, cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_kernel_arg_info, name>::param_type param; + cl_int result = getArgInfo(argIndex, name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } +#endif // #if defined(CL_VERSION_1_2) + + template + cl_int getWorkGroupInfo( + const Device& device, cl_kernel_work_group_info name, T* param) const + { + return detail::errHandler( + detail::getInfo( + &::clGetKernelWorkGroupInfo, object_, device(), name, param), + __GET_KERNEL_WORK_GROUP_INFO_ERR); + } + + template typename + detail::param_traits::param_type + getWorkGroupInfo(const Device& device, cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_kernel_work_group_info, name>::param_type param; + cl_int result = getWorkGroupInfo(device, name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + template + cl_int setArg(cl_uint index, const T &value) + { + return detail::errHandler( + ::clSetKernelArg( + object_, + index, + detail::KernelArgumentHandler::size(value), + detail::KernelArgumentHandler::ptr(value)), + __SET_KERNEL_ARGS_ERR); + } + + cl_int setArg(cl_uint index, ::size_t size, const void* argPtr) + { + return detail::errHandler( + ::clSetKernelArg(object_, index, size, argPtr), + __SET_KERNEL_ARGS_ERR); + } +}; + +/*! \class Program + * \brief Program interface that implements cl_program. 
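+ *
+ *  Editor's sketch of the typical build-and-dispatch flow ("src" is an
+ *  assumed OpenCL C source string and "buf" an existing cl::Buffer;
+ *  error handling elided):
+ *  \code
+ *  cl_int err;
+ *  cl::Program program(src, true, &err);           // build on the default context
+ *  cl::Kernel kernel(program, "my_kernel", &err);  // "my_kernel" is illustrative
+ *  kernel.setArg(0, buf);
+ *  kernel.setArg(1, cl::Local(256));               // 256 bytes of __local scratch
+ *  \endcode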
+ */
+class Program : public detail::Wrapper<cl_program>
+{
+public:
+ typedef VECTOR_CLASS<std::pair<const void*, ::size_t> > Binaries;
+ typedef VECTOR_CLASS<std::pair<const char*, ::size_t> > Sources;
+
+ Program(
+ const STRING_CLASS& source,
+ bool build = false,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ const char * strings = source.c_str();
+ const ::size_t length = source.size();
+
+ Context context = Context::getDefault(err);
+
+ object_ = ::clCreateProgramWithSource(
+ context(), (cl_uint)1, &strings, &length, &error);
+
+ detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+
+ if (error == CL_SUCCESS && build) {
+
+ error = ::clBuildProgram(
+ object_,
+ 0,
+ NULL,
+ "",
+ NULL,
+ NULL);
+
+ detail::errHandler(error, __BUILD_PROGRAM_ERR);
+ }
+
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ Program(
+ const Context& context,
+ const STRING_CLASS& source,
+ bool build = false,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ const char * strings = source.c_str();
+ const ::size_t length = source.size();
+
+ object_ = ::clCreateProgramWithSource(
+ context(), (cl_uint)1, &strings, &length, &error);
+
+ detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+
+ if (error == CL_SUCCESS && build) {
+
+ error = ::clBuildProgram(
+ object_,
+ 0,
+ NULL,
+ "",
+ NULL,
+ NULL);
+
+ detail::errHandler(error, __BUILD_PROGRAM_ERR);
+ }
+
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ Program(
+ const Context& context,
+ const Sources& sources,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ const ::size_t n = (::size_t)sources.size();
+ ::size_t* lengths = (::size_t*) alloca(n * sizeof(::size_t));
+ const char** strings = (const char**) alloca(n * sizeof(const char*));
+
+ for (::size_t i = 0; i < n; ++i) {
+ strings[i] = sources[(int)i].first;
+ lengths[i] = sources[(int)i].second;
+ }
+
+ object_ = ::clCreateProgramWithSource(
+ context(), (cl_uint)n, strings, lengths, &error);
+
+ detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ /**
+ * Construct a program object from a list of devices and a per-device list of binaries.
+ * \param context A valid OpenCL context in which to construct the program.
+ * \param devices A vector of OpenCL device objects for which the program will be created.
+ * \param binaries A vector of pairs of a pointer to a binary object and its length.
+ * \param binaryStatus An optional vector that on completion will be resized to
+ * match the size of binaries and filled with values to specify if each binary
+ * was successfully loaded.
+ * Set to CL_SUCCESS if the binary was successfully loaded.
+ * Set to CL_INVALID_VALUE if the length is 0 or the binary pointer is NULL.
+ * Set to CL_INVALID_BINARY if the binary provided is not valid for the matching device.
+ * \param err if non-NULL will be set to CL_SUCCESS on successful operation or one of the following errors:
+ * CL_INVALID_CONTEXT if context is not a valid context.
+ * CL_INVALID_VALUE if the length of devices is zero; or if the length of binaries does not match the length of devices;
+ * or if any entry in binaries is NULL or has length 0.
+ * CL_INVALID_DEVICE if OpenCL devices listed in devices are not in the list of devices associated with context.
+ * CL_INVALID_BINARY if an invalid program binary was encountered for any device. binaryStatus will return specific status for each device.
+ * CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host.
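+ *
+ * For illustration only (blob, blobSize, context, devices and err are
+ * assumed to exist):
+ * \code
+ * cl::Program::Binaries bins;
+ * bins.push_back(std::make_pair(blob, blobSize));
+ * VECTOR_CLASS<cl_int> status;
+ * cl::Program program(context, devices, bins, &status, &err);
+ * \endcode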
+ */
+ Program(
+ const Context& context,
+ const VECTOR_CLASS<Device>& devices,
+ const Binaries& binaries,
+ VECTOR_CLASS<cl_int>* binaryStatus = NULL,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ const ::size_t numDevices = devices.size();
+
+ // Catch size mismatch early and return
+ if(binaries.size() != numDevices) {
+ error = CL_INVALID_VALUE;
+ detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ return;
+ }
+
+ ::size_t* lengths = (::size_t*) alloca(numDevices * sizeof(::size_t));
+ const unsigned char** images = (const unsigned char**) alloca(numDevices * sizeof(const unsigned char**));
+
+ for (::size_t i = 0; i < numDevices; ++i) {
+ images[i] = (const unsigned char*)binaries[i].first;
+ lengths[i] = binaries[(int)i].second;
+ }
+
+ cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+ for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+ deviceIDs[deviceIndex] = (devices[deviceIndex])();
+ }
+
+ if(binaryStatus) {
+ binaryStatus->resize(numDevices);
+ }
+
+ object_ = ::clCreateProgramWithBinary(
+ context(), (cl_uint) devices.size(),
+ deviceIDs,
+ lengths, images, (binaryStatus != NULL && numDevices > 0)
+ ? &binaryStatus->front()
+ : NULL, &error);
+
+ detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+
+#if defined(CL_VERSION_1_2)
+ /**
+ * Create program using builtin kernels.
+ * \param kernelNames Semi-colon separated list of builtin kernel names
+ */
+ Program(
+ const Context& context,
+ const VECTOR_CLASS<Device>& devices,
+ const STRING_CLASS& kernelNames,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+
+ ::size_t numDevices = devices.size();
+ cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+ for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+ deviceIDs[deviceIndex] = (devices[deviceIndex])();
+ }
+
+ object_ = ::clCreateProgramWithBuiltInKernels(
+ context(),
+ (cl_uint) devices.size(),
+ deviceIDs,
+ kernelNames.c_str(),
+ &error);
+
+ detail::errHandler(error, __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+#endif // #if defined(CL_VERSION_1_2)
+
+ Program() { }
+
+ __CL_EXPLICIT_CONSTRUCTORS Program(const cl_program& program) : detail::Wrapper<cl_type>(program) { }
+
+ Program& operator = (const cl_program& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ /*! \brief Copy constructor to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Program(const Program& program) : detail::Wrapper<cl_type>(program) {}
+
+ /*! \brief Copy assignment to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ Program& operator = (const Program &program)
+ {
+ detail::Wrapper<cl_type>::operator=(program);
+ return *this;
+ }
+
+#if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED)
+ /*! \brief Move constructor to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ Program(Program&& program) CL_HPP_NOEXCEPT : detail::Wrapper<cl_type>(std::move(program)) {}
+
+ /*! \brief Move assignment to forward move to the superclass correctly.
+ * Required for MSVC.
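+ *
+ * For example (a sketch; other is an existing Program):
+ * \code
+ * cl::Program p;
+ * p = std::move(other); // p releases its old handle and steals other's
+ * \endcode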
+ */
+ Program& operator = (Program &&program)
+ {
+ detail::Wrapper<cl_type>::operator=(std::move(program));
+ return *this;
+ }
+#endif // #if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED)
+
+ cl_int build(
+ const VECTOR_CLASS<Device>& devices,
+ const char* options = NULL,
+ void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+ void* data = NULL) const
+ {
+ ::size_t numDevices = devices.size();
+ cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+ for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+ deviceIDs[deviceIndex] = (devices[deviceIndex])();
+ }
+
+ return detail::errHandler(
+ ::clBuildProgram(
+ object_,
+ (cl_uint)
+ devices.size(),
+ deviceIDs,
+ options,
+ notifyFptr,
+ data),
+ __BUILD_PROGRAM_ERR);
+ }
+
+ cl_int build(
+ const char* options = NULL,
+ void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+ void* data = NULL) const
+ {
+ return detail::errHandler(
+ ::clBuildProgram(
+ object_,
+ 0,
+ NULL,
+ options,
+ notifyFptr,
+ data),
+ __BUILD_PROGRAM_ERR);
+ }
+
+#if defined(CL_VERSION_1_2)
+ cl_int compile(
+ const char* options = NULL,
+ void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+ void* data = NULL) const
+ {
+ return detail::errHandler(
+ ::clCompileProgram(
+ object_,
+ 0,
+ NULL,
+ options,
+ 0,
+ NULL,
+ NULL,
+ notifyFptr,
+ data),
+ __COMPILE_PROGRAM_ERR);
+ }
+#endif
+
+ template <typename T>
+ cl_int getInfo(cl_program_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(&::clGetProgramInfo, object_, name, param),
+ __GET_PROGRAM_INFO_ERR);
+ }
+
+ template <cl_program_info name> typename
+ detail::param_traits<detail::cl_program_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_program_info, name>::param_type param;
+ cl_int result = getInfo(name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ template <typename T>
+ cl_int getBuildInfo(
+ const Device& device, cl_program_build_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(
+ &::clGetProgramBuildInfo, object_, device(), name, param),
+ __GET_PROGRAM_BUILD_INFO_ERR);
+ }
+
+ template <cl_program_build_info name> typename
+ detail::param_traits<detail::cl_program_build_info, name>::param_type
+ getBuildInfo(const Device& device, cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_program_build_info, name>::param_type param;
+ cl_int result = getBuildInfo(device, name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ cl_int createKernels(VECTOR_CLASS<Kernel>* kernels)
+ {
+ cl_uint numKernels;
+ cl_int err = ::clCreateKernelsInProgram(object_, 0, NULL, &numKernels);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
+ }
+
+ Kernel* value = (Kernel*) alloca(numKernels * sizeof(Kernel));
+ err = ::clCreateKernelsInProgram(
+ object_, numKernels, (cl_kernel*) value, NULL);
+ if (err != CL_SUCCESS) {
+ return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
+ }
+
+ kernels->assign(&value[0], &value[numKernels]);
+ return CL_SUCCESS;
+ }
+};
+
+#if defined(CL_VERSION_1_2)
+inline Program linkProgram(
+ Program input1,
+ Program input2,
+ const char* options = NULL,
+ void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+ void* data = NULL,
+ cl_int* err = NULL)
+{
+ cl_int error_local = CL_SUCCESS;
+
+ cl_program programs[2] = { input1(), input2() };
+
+ Context ctx = input1.getInfo<CL_PROGRAM_CONTEXT>(&error_local);
+ if(error_local!=CL_SUCCESS) {
+ detail::errHandler(error_local, __LINK_PROGRAM_ERR);
+ }
+
+ cl_program prog = ::clLinkProgram(
+ ctx(),
+ 0,
+ NULL,
+ options,
+ 2,
+ programs,
+ notifyFptr,
+ data,
+ &error_local);
+
+ detail::errHandler(error_local,__COMPILE_PROGRAM_ERR);
+ if (err != NULL) {
+ *err = error_local;
+ }
+
+ return Program(prog);
+}
+
+inline Program linkProgram(
+ VECTOR_CLASS<Program> inputPrograms,
+ const char* options = NULL,
+ void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+ void* data = NULL,
+ cl_int* err = NULL)
+{
+ cl_int error_local = CL_SUCCESS;
+
+ cl_program * programs = (cl_program*) alloca(inputPrograms.size() * sizeof(cl_program));
+
+ if (programs != NULL) {
+ for (unsigned int i = 0; i < inputPrograms.size(); i++) {
+ programs[i] = inputPrograms[i]();
+ }
+ }
+
+ Context ctx;
+ if(inputPrograms.size() > 0) {
+ ctx = inputPrograms[0].getInfo<CL_PROGRAM_CONTEXT>(&error_local);
+ if(error_local!=CL_SUCCESS) {
+ detail::errHandler(error_local, __LINK_PROGRAM_ERR);
+ }
+ }
+ cl_program prog = ::clLinkProgram(
+ ctx(),
+ 0,
+ NULL,
+ options,
+ (cl_uint)inputPrograms.size(),
+ programs,
+ notifyFptr,
+ data,
+ &error_local);
+
+ detail::errHandler(error_local,__COMPILE_PROGRAM_ERR);
+ if (err != NULL) {
+ *err = error_local;
+ }
+
+ return Program(prog);
+}
+#endif
+
+template<>
+inline VECTOR_CLASS<char *> cl::Program::getInfo<CL_PROGRAM_BINARIES>(cl_int* err) const
+{
+ VECTOR_CLASS< ::size_t> sizes = getInfo<CL_PROGRAM_BINARY_SIZES>();
+ VECTOR_CLASS<char *> binaries;
+ for (VECTOR_CLASS< ::size_t>::iterator s = sizes.begin(); s != sizes.end(); ++s)
+ {
+ char *ptr = NULL;
+ if (*s != 0)
+ ptr = new char[*s];
+ binaries.push_back(ptr);
+ }
+
+ cl_int result = getInfo(CL_PROGRAM_BINARIES, &binaries);
+ if (err != NULL) {
+ *err = result;
+ }
+ return binaries;
+}
+
+inline Kernel::Kernel(const Program& program, const char* name, cl_int* err)
+{
+ cl_int error;
+
+ object_ = ::clCreateKernel(program(), name, &error);
+ detail::errHandler(error, __CREATE_KERNEL_ERR);
+
+ if (err != NULL) {
+ *err = error;
+ }
+
+}
+
+/*! \class CommandQueue
+ * \brief CommandQueue interface for cl_command_queue.
+ */
+class CommandQueue : public detail::Wrapper<cl_command_queue>
+{
+private:
+#ifdef CL_HPP_CPP11_ATOMICS_SUPPORTED
+ static std::atomic<int> default_initialized_;
+#else // !CL_HPP_CPP11_ATOMICS_SUPPORTED
+ static volatile int default_initialized_;
+#endif // !CL_HPP_CPP11_ATOMICS_SUPPORTED
+ static CommandQueue default_;
+ static volatile cl_int default_error_;
+public:
+ CommandQueue(
+ cl_command_queue_properties properties,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ Context context = Context::getDefault(&error);
+ detail::errHandler(error, __CREATE_CONTEXT_ERR);
+
+ if (error != CL_SUCCESS) {
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+ else {
+ Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
+
+ object_ = ::clCreateCommandQueue(
+ context(), device(), properties, &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+ }
+ /*!
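+ * A queue on the first device of a context can be obtained as in this
+ * sketch (error handling elided):
+ * \code
+ * cl::CommandQueue queue(context);
+ * \endcode
+ *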
+ * \brief Constructs a CommandQueue for an implementation defined device in the given context
+ */
+ explicit CommandQueue(
+ const Context& context,
+ cl_command_queue_properties properties = 0,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ VECTOR_CLASS<Device> devices;
+ error = context.getInfo(CL_CONTEXT_DEVICES, &devices);
+
+ detail::errHandler(error, __CREATE_CONTEXT_ERR);
+
+ if (error != CL_SUCCESS)
+ {
+ if (err != NULL) {
+ *err = error;
+ }
+ return;
+ }
+
+ object_ = ::clCreateCommandQueue(context(), devices[0](), properties, &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+
+ if (err != NULL) {
+ *err = error;
+ }
+
+ }
+
+ CommandQueue(
+ const Context& context,
+ const Device& device,
+ cl_command_queue_properties properties = 0,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+ object_ = ::clCreateCommandQueue(
+ context(), device(), properties, &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ /*! \brief Copy constructor to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ CommandQueue(const CommandQueue& queue) : detail::Wrapper<cl_type>(queue) {}
+
+ /*! \brief Copy assignment to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ CommandQueue& operator = (const CommandQueue &queue)
+ {
+ detail::Wrapper<cl_type>::operator=(queue);
+ return *this;
+ }
+
+#if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED)
+ /*! \brief Move constructor to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ CommandQueue(CommandQueue&& queue) CL_HPP_NOEXCEPT : detail::Wrapper<cl_type>(std::move(queue)) {}
+
+ /*! \brief Move assignment to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ CommandQueue& operator = (CommandQueue &&queue)
+ {
+ detail::Wrapper<cl_type>::operator=(std::move(queue));
+ return *this;
+ }
+#endif // #if defined(CL_HPP_RVALUE_REFERENCES_SUPPORTED)
+
+ static CommandQueue getDefault(cl_int * err = NULL)
+ {
+ int state = detail::compare_exchange(
+ &default_initialized_,
+ __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED);
+
+ if (state & __DEFAULT_INITIALIZED) {
+ if (err != NULL) {
+ *err = default_error_;
+ }
+ return default_;
+ }
+
+ if (state & __DEFAULT_BEING_INITIALIZED) {
+ // Assume writes will propagate eventually...
+ while(default_initialized_ != __DEFAULT_INITIALIZED) {
+ detail::fence();
+ }
+
+ if (err != NULL) {
+ *err = default_error_;
+ }
+ return default_;
+ }
+
+ cl_int error;
+
+ Context context = Context::getDefault(&error);
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+
+ if (error != CL_SUCCESS) {
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+ else {
+ Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
+
+ default_ = CommandQueue(context, device, 0, &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ detail::fence();
+
+ default_error_ = error;
+ // Assume writes will propagate eventually...
+ default_initialized_ = __DEFAULT_INITIALIZED;
+
+ detail::fence();
+
+ if (err != NULL) {
+ *err = default_error_;
+ }
+ return default_;
+
+ }
+
+ CommandQueue() { }
+
+ __CL_EXPLICIT_CONSTRUCTORS CommandQueue(const cl_command_queue& commandQueue) : detail::Wrapper<cl_type>(commandQueue) { }
+
+ CommandQueue& operator = (const cl_command_queue& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ template <typename T>
+ cl_int getInfo(cl_command_queue_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(
+ &::clGetCommandQueueInfo, object_, name, param),
+ __GET_COMMAND_QUEUE_INFO_ERR);
+ }
+
+ template <cl_command_queue_info name> typename
+ detail::param_traits<detail::cl_command_queue_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_command_queue_info, name>::param_type param;
+ cl_int result = getInfo(name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ cl_int enqueueReadBuffer(
+ const Buffer& buffer,
+ cl_bool blocking,
+ ::size_t offset,
+ ::size_t size,
+ void* ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueReadBuffer(
+ object_, buffer(), blocking, offset, size,
+ ptr,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_READ_BUFFER_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueWriteBuffer(
+ const Buffer& buffer,
+ cl_bool blocking,
+ ::size_t offset,
+ ::size_t size,
+ const void* ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueWriteBuffer(
+ object_, buffer(), blocking, offset, size,
+ ptr,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_WRITE_BUFFER_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueCopyBuffer(
+ const Buffer& src,
+ const Buffer& dst,
+ ::size_t src_offset,
+ ::size_t dst_offset,
+ ::size_t size,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueCopyBuffer(
+ object_, src(), dst(), src_offset, dst_offset, size,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQEUE_COPY_BUFFER_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ cl_int enqueueReadBufferRect(
+ const Buffer& buffer,
+ cl_bool blocking,
+ const size_t<3>& buffer_offset,
+ const size_t<3>& host_offset,
+ const size_t<3>& region,
+ ::size_t buffer_row_pitch,
+ ::size_t buffer_slice_pitch,
+ ::size_t host_row_pitch,
+ ::size_t host_slice_pitch,
+ void *ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueReadBufferRect(
+ object_,
+ buffer(),
+ blocking,
+ (const ::size_t *)buffer_offset,
+ (const ::size_t *)host_offset,
+ (const ::size_t *)region,
+ buffer_row_pitch,
+ buffer_slice_pitch,
+ host_row_pitch,
+ host_slice_pitch,
+ ptr,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ?
&tmp : NULL), + __ENQUEUE_READ_BUFFER_RECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueWriteBufferRect( + const Buffer& buffer, + cl_bool blocking, + const size_t<3>& buffer_offset, + const size_t<3>& host_offset, + const size_t<3>& region, + ::size_t buffer_row_pitch, + ::size_t buffer_slice_pitch, + ::size_t host_row_pitch, + ::size_t host_slice_pitch, + const void *ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueWriteBufferRect( + object_, + buffer(), + blocking, + (const ::size_t *)buffer_offset, + (const ::size_t *)host_offset, + (const ::size_t *)region, + buffer_row_pitch, + buffer_slice_pitch, + host_row_pitch, + host_slice_pitch, + ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_WRITE_BUFFER_RECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueCopyBufferRect( + const Buffer& src, + const Buffer& dst, + const size_t<3>& src_origin, + const size_t<3>& dst_origin, + const size_t<3>& region, + ::size_t src_row_pitch, + ::size_t src_slice_pitch, + ::size_t dst_row_pitch, + ::size_t dst_slice_pitch, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyBufferRect( + object_, + src(), + dst(), + (const ::size_t *)src_origin, + (const ::size_t *)dst_origin, + (const ::size_t *)region, + src_row_pitch, + src_slice_pitch, + dst_row_pitch, + dst_slice_pitch, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQEUE_COPY_BUFFER_RECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if defined(CL_VERSION_1_2) + /** + * Enqueue a command to fill a buffer object with a pattern + * of a given size. The pattern is specified a as vector. + * \tparam PatternType The datatype of the pattern field. + * The pattern type must be an accepted OpenCL data type. + */ + template + cl_int enqueueFillBuffer( + const Buffer& buffer, + PatternType pattern, + ::size_t offset, + ::size_t size, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillBuffer( + object_, + buffer(), + static_cast(&pattern), + sizeof(PatternType), + offset, + size, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_FILL_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif // #if defined(CL_VERSION_1_2) + + cl_int enqueueReadImage( + const Image& image, + cl_bool blocking, + const size_t<3>& origin, + const size_t<3>& region, + ::size_t row_pitch, + ::size_t slice_pitch, + void* ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueReadImage( + object_, image(), blocking, (const ::size_t *) origin, + (const ::size_t *) region, row_pitch, slice_pitch, ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? 
&tmp : NULL), + __ENQUEUE_READ_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueWriteImage( + const Image& image, + cl_bool blocking, + const size_t<3>& origin, + const size_t<3>& region, + ::size_t row_pitch, + ::size_t slice_pitch, + const void* ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueWriteImage( + object_, image(), blocking, (const ::size_t *) origin, + (const ::size_t *) region, row_pitch, slice_pitch, ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_WRITE_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueCopyImage( + const Image& src, + const Image& dst, + const size_t<3>& src_origin, + const size_t<3>& dst_origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyImage( + object_, src(), dst(), (const ::size_t *) src_origin, + (const ::size_t *)dst_origin, (const ::size_t *) region, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_COPY_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if defined(CL_VERSION_1_2) + /** + * Enqueue a command to fill an image object with a specified color. + * \param fillColor is the color to use to fill the image. + * This is a four component RGBA floating-point color value if + * the image channel data type is not an unnormalized signed or + * unsigned data type. + */ + cl_int enqueueFillImage( + const Image& image, + cl_float4 fillColor, + const size_t<3>& origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillImage( + object_, + image(), + static_cast(&fillColor), + (const ::size_t *) origin, + (const ::size_t *) region, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_FILL_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * Enqueue a command to fill an image object with a specified color. + * \param fillColor is the color to use to fill the image. + * This is a four component RGBA signed integer color value if + * the image channel data type is an unnormalized signed integer + * type. + */ + cl_int enqueueFillImage( + const Image& image, + cl_int4 fillColor, + const size_t<3>& origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillImage( + object_, + image(), + static_cast(&fillColor), + (const ::size_t *) origin, + (const ::size_t *) region, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_FILL_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * Enqueue a command to fill an image object with a specified color. 
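+ *
+ * For example (a sketch; img, origin and region are assumed, and the
+ * image has unsigned integer channels):
+ * \code
+ * cl_uint4 color = {{ 255u, 0u, 0u, 255u }};
+ * queue.enqueueFillImage(img, color, origin, region);
+ * \endcode
+ *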
+ * \param fillColor is the color to use to fill the image. + * This is a four component RGBA unsigned integer color value if + * the image channel data type is an unnormalized unsigned integer + * type. + */ + cl_int enqueueFillImage( + const Image& image, + cl_uint4 fillColor, + const size_t<3>& origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillImage( + object_, + image(), + static_cast(&fillColor), + (const ::size_t *) origin, + (const ::size_t *) region, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_FILL_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif // #if defined(CL_VERSION_1_2) + + cl_int enqueueCopyImageToBuffer( + const Image& src, + const Buffer& dst, + const size_t<3>& src_origin, + const size_t<3>& region, + ::size_t dst_offset, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyImageToBuffer( + object_, src(), dst(), (const ::size_t *) src_origin, + (const ::size_t *) region, dst_offset, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueCopyBufferToImage( + const Buffer& src, + const Image& dst, + ::size_t src_offset, + const size_t<3>& dst_origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyBufferToImage( + object_, src(), dst(), src_offset, + (const ::size_t *) dst_origin, (const ::size_t *) region, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + void* enqueueMapBuffer( + const Buffer& buffer, + cl_bool blocking, + cl_map_flags flags, + ::size_t offset, + ::size_t size, + const VECTOR_CLASS* events = NULL, + Event* event = NULL, + cl_int* err = NULL) const + { + cl_event tmp; + cl_int error; + void * result = ::clEnqueueMapBuffer( + object_, buffer(), blocking, flags, offset, size, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL, + &error); + + detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + if (event != NULL && error == CL_SUCCESS) + *event = tmp; + + return result; + } + + void* enqueueMapImage( + const Image& buffer, + cl_bool blocking, + cl_map_flags flags, + const size_t<3>& origin, + const size_t<3>& region, + ::size_t * row_pitch, + ::size_t * slice_pitch, + const VECTOR_CLASS* events = NULL, + Event* event = NULL, + cl_int* err = NULL) const + { + cl_event tmp; + cl_int error; + void * result = ::clEnqueueMapImage( + object_, buffer(), blocking, flags, + (const ::size_t *) origin, (const ::size_t *) region, + row_pitch, slice_pitch, + (events != NULL) ? 
(cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL, + &error); + + detail::errHandler(error, __ENQUEUE_MAP_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + if (event != NULL && error == CL_SUCCESS) + *event = tmp; + return result; + } + + cl_int enqueueUnmapMemObject( + const Memory& memory, + void* mapped_ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueUnmapMemObject( + object_, memory(), mapped_ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_UNMAP_MEM_OBJECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if defined(CL_VERSION_1_2) + /** + * Enqueues a marker command which waits for either a list of events to complete, + * or all previously enqueued commands to complete. + * + * Enqueues a marker command which waits for either a list of events to complete, + * or if the list is empty it waits for all commands previously enqueued in command_queue + * to complete before it completes. This command returns an event which can be waited on, + * i.e. this event can be waited on to insure that all events either in the event_wait_list + * or all previously enqueued commands, queued before this command to command_queue, + * have completed. + */ + cl_int enqueueMarkerWithWaitList( + const VECTOR_CLASS *events = 0, + Event *event = 0) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueMarkerWithWaitList( + object_, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_MARKER_WAIT_LIST_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * A synchronization point that enqueues a barrier operation. + * + * Enqueues a barrier command which waits for either a list of events to complete, + * or if the list is empty it waits for all commands previously enqueued in command_queue + * to complete before it completes. This command blocks command execution, that is, any + * following commands enqueued after it do not execute until it completes. This command + * returns an event which can be waited on, i.e. this event can be waited on to insure that + * all events either in the event_wait_list or all previously enqueued commands, queued + * before this command to command_queue, have completed. + */ + cl_int enqueueBarrierWithWaitList( + const VECTOR_CLASS *events = 0, + Event *event = 0) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueBarrierWithWaitList( + object_, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_BARRIER_WAIT_LIST_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * Enqueues a command to indicate with which device a set of memory objects + * should be associated. 
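+ *
+ * A sketch of migrating two existing buffers to the host:
+ * \code
+ * VECTOR_CLASS<cl::Memory> objs;
+ * objs.push_back(bufA);
+ * objs.push_back(bufB);
+ * queue.enqueueMigrateMemObjects(objs, CL_MIGRATE_MEM_OBJECT_HOST);
+ * \endcode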
+ */ + cl_int enqueueMigrateMemObjects( + const VECTOR_CLASS &memObjects, + cl_mem_migration_flags flags, + const VECTOR_CLASS* events = NULL, + Event* event = NULL + ) const + { + cl_event tmp; + + cl_mem* localMemObjects = static_cast(alloca(memObjects.size() * sizeof(cl_mem))); + for( int i = 0; i < (int)memObjects.size(); ++i ) { + localMemObjects[i] = memObjects[i](); + } + + + cl_int err = detail::errHandler( + ::clEnqueueMigrateMemObjects( + object_, + (cl_uint)memObjects.size(), + static_cast(localMemObjects), + flags, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_UNMAP_MEM_OBJECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif // #if defined(CL_VERSION_1_2) + + cl_int enqueueNDRangeKernel( + const Kernel& kernel, + const NDRange& offset, + const NDRange& global, + const NDRange& local = NullRange, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueNDRangeKernel( + object_, kernel(), (cl_uint) global.dimensions(), + offset.dimensions() != 0 ? (const ::size_t*) offset : NULL, + (const ::size_t*) global, + local.dimensions() != 0 ? (const ::size_t*) local : NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_NDRANGE_KERNEL_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueTask( + const Kernel& kernel, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueTask( + object_, kernel(), + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_TASK_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueNativeKernel( + void (CL_CALLBACK *userFptr)(void *), + std::pair args, + const VECTOR_CLASS* mem_objects = NULL, + const VECTOR_CLASS* mem_locs = NULL, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_mem * mems = (mem_objects != NULL && mem_objects->size() > 0) + ? (cl_mem*) alloca(mem_objects->size() * sizeof(cl_mem)) + : NULL; + + if (mems != NULL) { + for (unsigned int i = 0; i < mem_objects->size(); i++) { + mems[i] = ((*mem_objects)[i])(); + } + } + + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueNativeKernel( + object_, userFptr, args.first, args.second, + (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, + mems, + (mem_locs != NULL && mem_locs->size() > 0) ? (const void **) &mem_locs->front() : NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_NATIVE_KERNEL); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +/** + * Deprecated APIs for 1.2 + */ +#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) + CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + cl_int enqueueMarker(Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueMarker( + object_, + (event != NULL) ? 
&tmp : NULL), + __ENQUEUE_MARKER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + cl_int enqueueWaitForEvents(const VECTOR_CLASS& events) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + { + return detail::errHandler( + ::clEnqueueWaitForEvents( + object_, + (cl_uint) events.size(), + events.size() > 0 ? (const cl_event*) &events.front() : NULL), + __ENQUEUE_WAIT_FOR_EVENTS_ERR); + } +#endif // #if defined(CL_VERSION_1_1) + + cl_int enqueueAcquireGLObjects( + const VECTOR_CLASS* mem_objects = NULL, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueAcquireGLObjects( + object_, + (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, + (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_ACQUIRE_GL_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueReleaseGLObjects( + const VECTOR_CLASS* mem_objects = NULL, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueReleaseGLObjects( + object_, + (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, + (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_RELEASE_GL_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if defined (USE_DX_INTEROP) +typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueAcquireD3D10ObjectsKHR)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem* mem_objects, cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, cl_event* event); +typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem* mem_objects, cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, cl_event* event); + + cl_int enqueueAcquireD3D10Objects( + const VECTOR_CLASS* mem_objects = NULL, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = NULL; +#if defined(CL_VERSION_1_2) + cl_context context = getInfo(); + cl::Device device(getInfo()); + cl_platform_id platform = device.getInfo(); + __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueAcquireD3D10ObjectsKHR); +#endif +#if defined(CL_VERSION_1_1) + __INIT_CL_EXT_FCN_PTR(clEnqueueAcquireD3D10ObjectsKHR); +#endif + + cl_event tmp; + cl_int err = detail::errHandler( + pfn_clEnqueueAcquireD3D10ObjectsKHR( + object_, + (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, + (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? 
&tmp : NULL), + __ENQUEUE_ACQUIRE_GL_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueReleaseD3D10Objects( + const VECTOR_CLASS* mem_objects = NULL, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) const + { + static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = NULL; +#if defined(CL_VERSION_1_2) + cl_context context = getInfo(); + cl::Device device(getInfo()); + cl_platform_id platform = device.getInfo(); + __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueReleaseD3D10ObjectsKHR); +#endif // #if defined(CL_VERSION_1_2) +#if defined(CL_VERSION_1_1) + __INIT_CL_EXT_FCN_PTR(clEnqueueReleaseD3D10ObjectsKHR); +#endif // #if defined(CL_VERSION_1_1) + + cl_event tmp; + cl_int err = detail::errHandler( + pfn_clEnqueueReleaseD3D10ObjectsKHR( + object_, + (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, + (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_RELEASE_GL_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif + +/** + * Deprecated APIs for 1.2 + */ +#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) + CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + cl_int enqueueBarrier() const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + { + return detail::errHandler( + ::clEnqueueBarrier(object_), + __ENQUEUE_BARRIER_ERR); + } +#endif // #if defined(CL_VERSION_1_1) + + cl_int flush() const + { + return detail::errHandler(::clFlush(object_), __FLUSH_ERR); + } + + cl_int finish() const + { + return detail::errHandler(::clFinish(object_), __FINISH_ERR); + } +}; + +#ifdef _WIN32 +#ifdef CL_HPP_CPP11_ATOMICS_SUPPORTED +__declspec(selectany) std::atomic CommandQueue::default_initialized_; +#else // !CL_HPP_CPP11_ATOMICS_SUPPORTED +__declspec(selectany) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED; +#endif // !CL_HPP_CPP11_ATOMICS_SUPPORTED +__declspec(selectany) CommandQueue CommandQueue::default_; +__declspec(selectany) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS; +#else // !_WIN32 +#ifdef CL_HPP_CPP11_ATOMICS_SUPPORTED +__attribute__((weak)) std::atomic CommandQueue::default_initialized_; +#else // !CL_HPP_CPP11_ATOMICS_SUPPORTED +__attribute__((weak)) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED; +#endif // !CL_HPP_CPP11_ATOMICS_SUPPORTED +__attribute__((weak)) CommandQueue CommandQueue::default_; +__attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS; +#endif // !_WIN32 + +template< typename IteratorType > +Buffer::Buffer( + const Context &context, + IteratorType startIterator, + IteratorType endIterator, + bool readOnly, + bool useHostPtr, + cl_int* err) +{ + typedef typename std::iterator_traits::value_type DataType; + cl_int error; + + cl_mem_flags flags = 0; + if( readOnly ) { + flags |= CL_MEM_READ_ONLY; + } + else { + flags |= CL_MEM_READ_WRITE; + } + if( useHostPtr ) { + flags |= CL_MEM_USE_HOST_PTR; + } + + ::size_t size = sizeof(DataType)*(endIterator - startIterator); + + if( useHostPtr ) { + object_ = ::clCreateBuffer(context(), flags, size, static_cast(&*startIterator), &error); + } else { + object_ = ::clCreateBuffer(context(), flags, size, 0, &error); + } + + detail::errHandler(error, 
+ __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+
+ if( !useHostPtr ) {
+ CommandQueue queue(context, 0, &error);
+ detail::errHandler(error, __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+
+ error = cl::copy(queue, startIterator, endIterator, *this);
+ detail::errHandler(error, __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+}
+
+template< typename IteratorType >
+Buffer::Buffer(
+ const CommandQueue &queue,
+ IteratorType startIterator,
+ IteratorType endIterator,
+ bool readOnly,
+ bool useHostPtr,
+ cl_int* err)
+{
+ typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+ cl_int error;
+
+ cl_mem_flags flags = 0;
+ if (readOnly) {
+ flags |= CL_MEM_READ_ONLY;
+ }
+ else {
+ flags |= CL_MEM_READ_WRITE;
+ }
+ if (useHostPtr) {
+ flags |= CL_MEM_USE_HOST_PTR;
+ }
+
+ ::size_t size = sizeof(DataType)*(endIterator - startIterator);
+
+ Context context = queue.getInfo<CL_QUEUE_CONTEXT>();
+
+ if (useHostPtr) {
+ object_ = ::clCreateBuffer(context(), flags, size, static_cast<DataType*>(&*startIterator), &error);
+ }
+ else {
+ object_ = ::clCreateBuffer(context(), flags, size, 0, &error);
+ }
+
+ detail::errHandler(error, __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+
+ if (!useHostPtr) {
+ error = cl::copy(queue, startIterator, endIterator, *this);
+ detail::errHandler(error, __CREATE_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+}
+
+inline cl_int enqueueReadBuffer(
+ const Buffer& buffer,
+ cl_bool blocking,
+ ::size_t offset,
+ ::size_t size,
+ void* ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueReadBuffer(buffer, blocking, offset, size, ptr, events, event);
+}
+
+inline cl_int enqueueWriteBuffer(
+ const Buffer& buffer,
+ cl_bool blocking,
+ ::size_t offset,
+ ::size_t size,
+ const void* ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueWriteBuffer(buffer, blocking, offset, size, ptr, events, event);
+}
+
+inline void* enqueueMapBuffer(
+ const Buffer& buffer,
+ cl_bool blocking,
+ cl_map_flags flags,
+ ::size_t offset,
+ ::size_t size,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL,
+ cl_int* err = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+ detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+
+ void * result = ::clEnqueueMapBuffer(
+ queue(), buffer(), blocking, flags, offset, size,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (cl_event*) event,
+ &error);
+
+ detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ return result;
+}
+
+inline cl_int enqueueUnmapMemObject(
+ const Memory& memory,
+ void* mapped_ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+ detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueUnmapMemObject(
+ queue(), memory(), mapped_ptr,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ?
+ (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+}
+
+inline cl_int enqueueCopyBuffer(
+ const Buffer& src,
+ const Buffer& dst,
+ ::size_t src_offset,
+ ::size_t dst_offset,
+ ::size_t size,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueCopyBuffer(src, dst, src_offset, dst_offset, size, events, event);
+}
+
+/**
+ * Blocking copy operation between iterators and a buffer.
+ * Host to Device.
+ * Uses default command queue.
+ */
+template< typename IteratorType >
+inline cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer )
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+ if (error != CL_SUCCESS)
+ return error;
+
+ return cl::copy(queue, startIterator, endIterator, buffer);
+}
+
+/**
+ * Blocking copy operation between iterators and a buffer.
+ * Device to Host.
+ * Uses default command queue.
+ */
+template< typename IteratorType >
+inline cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator )
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+ if (error != CL_SUCCESS)
+ return error;
+
+ return cl::copy(queue, buffer, startIterator, endIterator);
+}
+
+/**
+ * Blocking copy operation between iterators and a buffer.
+ * Host to Device.
+ * Uses specified queue.
+ */
+template< typename IteratorType >
+inline cl_int copy( const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer )
+{
+ typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+ cl_int error;
+
+ ::size_t length = endIterator-startIterator;
+ ::size_t byteLength = length*sizeof(DataType);
+
+ DataType *pointer =
+ static_cast<DataType*>(queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_WRITE, 0, byteLength, 0, 0, &error));
+ // if exceptions enabled, enqueueMapBuffer will throw
+ if( error != CL_SUCCESS ) {
+ return error;
+ }
+#if defined(_MSC_VER)
+ std::copy(
+ startIterator,
+ endIterator,
+ stdext::checked_array_iterator<DataType*>(
+ pointer, length));
+#else
+ std::copy(startIterator, endIterator, pointer);
+#endif
+ Event endEvent;
+ error = queue.enqueueUnmapMemObject(buffer, pointer, 0, &endEvent);
+ // if exceptions enabled, enqueueUnmapMemObject will throw
+ if( error != CL_SUCCESS ) {
+ return error;
+ }
+ endEvent.wait();
+ return CL_SUCCESS;
+}
+
+/**
+ * Blocking copy operation between iterators and a buffer.
+ * Device to Host.
+ * Uses specified queue.
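+ *
+ * For example, reading a buffer back into a std::vector (a sketch; n
+ * and buffer are assumed):
+ * \code
+ * std::vector<float> host(n);
+ * cl_int err = cl::copy(queue, buffer, host.begin(), host.end());
+ * \endcode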
+ */
+template< typename IteratorType >
+inline cl_int copy( const CommandQueue &queue, const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator )
+{
+ typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+ cl_int error;
+
+ ::size_t length = endIterator-startIterator;
+ ::size_t byteLength = length*sizeof(DataType);
+
+ DataType *pointer =
+ static_cast<DataType*>(queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_READ, 0, byteLength, 0, 0, &error));
+ // if exceptions enabled, enqueueMapBuffer will throw
+ if( error != CL_SUCCESS ) {
+ return error;
+ }
+ std::copy(pointer, pointer + length, startIterator);
+ Event endEvent;
+ error = queue.enqueueUnmapMemObject(buffer, pointer, 0, &endEvent);
+ // if exceptions enabled, enqueueUnmapMemObject will throw
+ if( error != CL_SUCCESS ) {
+ return error;
+ }
+ endEvent.wait();
+ return CL_SUCCESS;
+}
+
+#if defined(CL_VERSION_1_1)
+inline cl_int enqueueReadBufferRect(
+ const Buffer& buffer,
+ cl_bool blocking,
+ const size_t<3>& buffer_offset,
+ const size_t<3>& host_offset,
+ const size_t<3>& region,
+ ::size_t buffer_row_pitch,
+ ::size_t buffer_slice_pitch,
+ ::size_t host_row_pitch,
+ ::size_t host_slice_pitch,
+ void *ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueReadBufferRect(
+ buffer,
+ blocking,
+ buffer_offset,
+ host_offset,
+ region,
+ buffer_row_pitch,
+ buffer_slice_pitch,
+ host_row_pitch,
+ host_slice_pitch,
+ ptr,
+ events,
+ event);
+}
+
+inline cl_int enqueueWriteBufferRect(
+ const Buffer& buffer,
+ cl_bool blocking,
+ const size_t<3>& buffer_offset,
+ const size_t<3>& host_offset,
+ const size_t<3>& region,
+ ::size_t buffer_row_pitch,
+ ::size_t buffer_slice_pitch,
+ ::size_t host_row_pitch,
+ ::size_t host_slice_pitch,
+ const void *ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueWriteBufferRect(
+ buffer,
+ blocking,
+ buffer_offset,
+ host_offset,
+ region,
+ buffer_row_pitch,
+ buffer_slice_pitch,
+ host_row_pitch,
+ host_slice_pitch,
+ ptr,
+ events,
+ event);
+}
+
+inline cl_int enqueueCopyBufferRect(
+ const Buffer& src,
+ const Buffer& dst,
+ const size_t<3>& src_origin,
+ const size_t<3>& dst_origin,
+ const size_t<3>& region,
+ ::size_t src_row_pitch,
+ ::size_t src_slice_pitch,
+ ::size_t dst_row_pitch,
+ ::size_t dst_slice_pitch,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueCopyBufferRect(
+ src,
+ dst,
+ src_origin,
+ dst_origin,
+ region,
+ src_row_pitch,
+ src_slice_pitch,
+ dst_row_pitch,
+ dst_slice_pitch,
+ events,
+ event);
+}
+#endif
+
+inline cl_int enqueueReadImage(
+ const Image& image,
+ cl_bool blocking,
+ const size_t<3>& origin,
+ const size_t<3>& region,
+ ::size_t row_pitch,
+ ::size_t slice_pitch,
+ void* ptr,
+ const VECTOR_CLASS<Event>* events = NULL,
+ Event* event = NULL)
+{
+ cl_int error;
+ CommandQueue queue = CommandQueue::getDefault(&error);
+
+ if (error != CL_SUCCESS) {
+ return error;
+ }
+
+ return queue.enqueueReadImage(
+ image,
+ blocking,
+ origin,
+ region,
+ row_pitch,
+ slice_pitch,
+ ptr,
+ events,
+ event);
+}
+
+inline cl_int enqueueWriteImage(
+ const Image& image,
+ cl_bool blocking, + const size_t<3>& origin, + const size_t<3>& region, + ::size_t row_pitch, + ::size_t slice_pitch, + const void* ptr, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueWriteImage( + image, + blocking, + origin, + region, + row_pitch, + slice_pitch, + ptr, + events, + event); +} + +inline cl_int enqueueCopyImage( + const Image& src, + const Image& dst, + const size_t<3>& src_origin, + const size_t<3>& dst_origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueCopyImage( + src, + dst, + src_origin, + dst_origin, + region, + events, + event); +} + +inline cl_int enqueueCopyImageToBuffer( + const Image& src, + const Buffer& dst, + const size_t<3>& src_origin, + const size_t<3>& region, + ::size_t dst_offset, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueCopyImageToBuffer( + src, + dst, + src_origin, + region, + dst_offset, + events, + event); +} + +inline cl_int enqueueCopyBufferToImage( + const Buffer& src, + const Image& dst, + ::size_t src_offset, + const size_t<3>& dst_origin, + const size_t<3>& region, + const VECTOR_CLASS* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueCopyBufferToImage( + src, + dst, + src_offset, + dst_origin, + region, + events, + event); +} + + +inline cl_int flush(void) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.flush(); +} + +inline cl_int finish(void) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + + return queue.finish(); +} + +// Kernel Functor support +// New interface as of September 2011 +// Requires the C++11 std::tr1::function (note do not support TR1) +// Visual Studio 2010 and GCC 4.2 + +struct EnqueueArgs +{ + CommandQueue queue_; + const NDRange offset_; + const NDRange global_; + const NDRange local_; + VECTOR_CLASS events_; + + EnqueueArgs(NDRange global) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(NullRange) + { + + } + + EnqueueArgs(NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(local) + { + + } + + EnqueueArgs(NDRange offset, NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(offset), + global_(global), + local_(local) + { + + } + + EnqueueArgs(Event e, NDRange global) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(NullRange) + { + events_.push_back(e); + } + + EnqueueArgs(Event e, NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(local) + { + events_.push_back(e); + } + + EnqueueArgs(Event e, NDRange offset, NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(offset), + global_(global), + local_(local) + { + 
events_.push_back(e); + } + + EnqueueArgs(const VECTOR_CLASS &events, NDRange global) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(NullRange), + events_(events) + { + + } + + EnqueueArgs(const VECTOR_CLASS &events, NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(local), + events_(events) + { + + } + + EnqueueArgs(const VECTOR_CLASS &events, NDRange offset, NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(offset), + global_(global), + local_(local), + events_(events) + { + + } + + EnqueueArgs(CommandQueue &queue, NDRange global) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(NullRange) + { + + } + + EnqueueArgs(CommandQueue &queue, NDRange global, NDRange local) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(local) + { + + } + + EnqueueArgs(CommandQueue &queue, NDRange offset, NDRange global, NDRange local) : + queue_(queue), + offset_(offset), + global_(global), + local_(local) + { + + } + + EnqueueArgs(CommandQueue &queue, Event e, NDRange global) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(NullRange) + { + events_.push_back(e); + } + + EnqueueArgs(CommandQueue &queue, Event e, NDRange global, NDRange local) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(local) + { + events_.push_back(e); + } + + EnqueueArgs(CommandQueue &queue, Event e, NDRange offset, NDRange global, NDRange local) : + queue_(queue), + offset_(offset), + global_(global), + local_(local) + { + events_.push_back(e); + } + + EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS &events, NDRange global) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(NullRange), + events_(events) + { + + } + + EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS &events, NDRange global, NDRange local) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(local), + events_(events) + { + + } + + EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS &events, NDRange offset, NDRange global, NDRange local) : + queue_(queue), + offset_(offset), + global_(global), + local_(local), + events_(events) + { + + } +}; + +namespace detail { + +class NullType {}; + +template +struct SetArg +{ + static void set (Kernel kernel, T0 arg) + { + kernel.setArg(index, arg); + } +}; + +template +struct SetArg +{ + static void set (Kernel, NullType) + { + } +}; + +template < + typename T0, typename T1, typename T2, typename T3, + typename T4, typename T5, typename T6, typename T7, + typename T8, typename T9, typename T10, typename T11, + typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, + typename T20, typename T21, typename T22, typename T23, + typename T24, typename T25, typename T26, typename T27, + typename T28, typename T29, typename T30, typename T31 + +> +class KernelFunctorGlobal +{ +private: + Kernel kernel_; + +public: + KernelFunctorGlobal( + Kernel kernel) : + kernel_(kernel) + {} + + KernelFunctorGlobal( + const Program& program, + const STRING_CLASS name, + cl_int * err = NULL) : + kernel_(program, name.c_str(), err) + {} + + Event operator() ( + const EnqueueArgs& args, + T0 t0, + T1 t1 = NullType(), + T2 t2 = NullType(), + T3 t3 = NullType(), + T4 t4 = NullType(), + T5 t5 = NullType(), + T6 t6 = NullType(), + T7 t7 = NullType(), + T8 t8 = NullType(), + T9 t9 = NullType(), + T10 t10 = NullType(), + T11 t11 = NullType(), 
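+ // Trailing parameters the caller does not supply default to NullType;
+ // the SetArg<index, NullType> specialization above turns setArg into a
+ // no-op for them, so only real arguments reach the kernel.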
+ T12 t12 = NullType(), + T13 t13 = NullType(), + T14 t14 = NullType(), + T15 t15 = NullType(), + T16 t16 = NullType(), + T17 t17 = NullType(), + T18 t18 = NullType(), + T19 t19 = NullType(), + T20 t20 = NullType(), + T21 t21 = NullType(), + T22 t22 = NullType(), + T23 t23 = NullType(), + T24 t24 = NullType(), + T25 t25 = NullType(), + T26 t26 = NullType(), + T27 t27 = NullType(), + T28 t28 = NullType(), + T29 t29 = NullType(), + T30 t30 = NullType(), + T31 t31 = NullType() + + ) + { + Event event; + SetArg<0, T0>::set(kernel_, t0); + SetArg<1, T1>::set(kernel_, t1); + SetArg<2, T2>::set(kernel_, t2); + SetArg<3, T3>::set(kernel_, t3); + SetArg<4, T4>::set(kernel_, t4); + SetArg<5, T5>::set(kernel_, t5); + SetArg<6, T6>::set(kernel_, t6); + SetArg<7, T7>::set(kernel_, t7); + SetArg<8, T8>::set(kernel_, t8); + SetArg<9, T9>::set(kernel_, t9); + SetArg<10, T10>::set(kernel_, t10); + SetArg<11, T11>::set(kernel_, t11); + SetArg<12, T12>::set(kernel_, t12); + SetArg<13, T13>::set(kernel_, t13); + SetArg<14, T14>::set(kernel_, t14); + SetArg<15, T15>::set(kernel_, t15); + SetArg<16, T16>::set(kernel_, t16); + SetArg<17, T17>::set(kernel_, t17); + SetArg<18, T18>::set(kernel_, t18); + SetArg<19, T19>::set(kernel_, t19); + SetArg<20, T20>::set(kernel_, t20); + SetArg<21, T21>::set(kernel_, t21); + SetArg<22, T22>::set(kernel_, t22); + SetArg<23, T23>::set(kernel_, t23); + SetArg<24, T24>::set(kernel_, t24); + SetArg<25, T25>::set(kernel_, t25); + SetArg<26, T26>::set(kernel_, t26); + SetArg<27, T27>::set(kernel_, t27); + SetArg<28, T28>::set(kernel_, t28); + SetArg<29, T29>::set(kernel_, t29); + SetArg<30, T30>::set(kernel_, t30); + SetArg<31, T31>::set(kernel_, t31); + + + args.queue_.enqueueNDRangeKernel( + kernel_, + args.offset_, + args.global_, + args.local_, + &args.events_, + &event); + + return event; + } + +}; + +//------------------------------------------------------------------------------------------------------ + + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27, + typename T28, + typename T29, + typename T30, + typename T31> +struct functionImplementation_ +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30, + T31> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 32)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
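+ // This primary template handles the full 32-argument case; the partial
+ // specializations that follow each pin one more trailing parameter to
+ // NullType, giving every arity down to zero its own functor signature.
+ //
+ // Minimal usage sketch (assumptions, not part of this header: a built
+ // Program `program` exposing a kernel "vadd" that takes two buffers and
+ // an int; all names hypothetical, and the 28 remaining NullType template
+ // arguments must be spelled out since the primary template has no
+ // defaults):
+ //
+ //   cl::detail::KernelFunctorGlobal<cl::Buffer, cl::Buffer, int,
+ //       cl::detail::NullType /* , ...28 more NullType... */ >
+ //       vadd(program, "vadd");
+ //   cl::Event e = vadd(cl::EnqueueArgs(cl::NDRange(1024)), a, b, 42);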
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30, + T31); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27, + T28 arg28, + T29 arg29, + T30 arg30, + T31 arg31) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27, + arg28, + arg29, + arg30, + arg31); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27, + typename T28, + typename T29, + typename T30> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 31)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27, + T28 arg28, + T29 arg29, + T30 arg30) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27, + arg28, + arg29, + arg30); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27, + typename T28, + typename T29> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 30)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27, + T28 arg28, + T29 arg29) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27, + arg28, + arg29); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27, + typename T28> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 29)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27, + T28 arg28) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27, + arg28); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 28)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 27)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 26)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 25)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 24)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 23)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 22)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 21)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 20)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 19)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 18)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 17)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 16)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 15)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 14)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 13)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 12)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 11)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 10)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 9)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. 
Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 8)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 7)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 6)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! 
\brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 5)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 4)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! 
\brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3); + } + + +}; + +template< + typename T0, + typename T1, + typename T2> +struct functionImplementation_ +< T0, + T1, + T2, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 3)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2); + } + + +}; + +template< + typename T0, + typename T1> +struct functionImplementation_ +< T0, + T1, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 2)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! 
\brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1) + { + return functor_( + enqueueArgs, + arg0, + arg1); + } + + +}; + +template< + typename T0> +struct functionImplementation_ +< T0, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 1)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0) + { + return functor_( + enqueueArgs, + arg0); + } + + +}; + + + + + +} // namespace detail + +//---------------------------------------------------------------------------------------------- + +template < + typename T0, typename T1 = detail::NullType, typename T2 = detail::NullType, + typename T3 = detail::NullType, typename T4 = detail::NullType, + typename T5 = detail::NullType, typename T6 = detail::NullType, + typename T7 = detail::NullType, typename T8 = detail::NullType, + typename T9 = detail::NullType, typename T10 = detail::NullType, + typename T11 = detail::NullType, typename T12 = detail::NullType, + typename T13 = detail::NullType, typename T14 = detail::NullType, + typename T15 = detail::NullType, typename T16 = detail::NullType, + typename T17 = detail::NullType, typename T18 = detail::NullType, + typename T19 = detail::NullType, typename T20 = detail::NullType, + typename T21 = detail::NullType, typename T22 = detail::NullType, + typename T23 = detail::NullType, typename T24 = detail::NullType, + typename T25 = detail::NullType, typename T26 = detail::NullType, + typename T27 = detail::NullType, typename T28 = detail::NullType, + typename T29 = detail::NullType, typename T30 = detail::NullType, + typename T31 = detail::NullType + +> +struct make_kernel : + public detail::functionImplementation_< + T0, T1, T2, T3, + T4, T5, T6, T7, + T8, T9, T10, T11, + T12, T13, T14, T15, + T16, T17, T18, T19, + T20, T21, T22, T23, + T24, T25, T26, T27, + T28, T29, T30, T31 + + > +{ +public: + typedef detail::KernelFunctorGlobal< + T0, T1, T2, T3, + T4, T5, T6, T7, + T8, T9, T10, T11, + T12, T13, T14, T15, + T16, T17, T18, T19, + T20, T21, T22, T23, + T24, T25, 
T26, T27, + T28, T29, T30, T31 + + > FunctorType; + + make_kernel( + const Program& program, + const STRING_CLASS name, + cl_int * err = NULL) : + detail::functionImplementation_< + T0, T1, T2, T3, + T4, T5, T6, T7, + T8, T9, T10, T11, + T12, T13, T14, T15, + T16, T17, T18, T19, + T20, T21, T22, T23, + T24, T25, T26, T27, + T28, T29, T30, T31 + + >( + FunctorType(program, name, err)) + {} + + make_kernel( + const Kernel kernel) : + detail::functionImplementation_< + T0, T1, T2, T3, + T4, T5, T6, T7, + T8, T9, T10, T11, + T12, T13, T14, T15, + T16, T17, T18, T19, + T20, T21, T22, T23, + T24, T25, T26, T27, + T28, T29, T30, T31 + + >( + FunctorType(kernel)) + {} +}; + + +//---------------------------------------------------------------------------------------------------------------------- + +#undef __ERR_STR +#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS) +#undef __GET_DEVICE_INFO_ERR +#undef __GET_PLATFORM_INFO_ERR +#undef __GET_DEVICE_IDS_ERR +#undef __GET_CONTEXT_INFO_ERR +#undef __GET_EVENT_INFO_ERR +#undef __GET_EVENT_PROFILE_INFO_ERR +#undef __GET_MEM_OBJECT_INFO_ERR +#undef __GET_IMAGE_INFO_ERR +#undef __GET_SAMPLER_INFO_ERR +#undef __GET_KERNEL_INFO_ERR +#undef __GET_KERNEL_ARG_INFO_ERR +#undef __GET_KERNEL_WORK_GROUP_INFO_ERR +#undef __GET_PROGRAM_INFO_ERR +#undef __GET_PROGRAM_BUILD_INFO_ERR +#undef __GET_COMMAND_QUEUE_INFO_ERR + +#undef __CREATE_CONTEXT_ERR +#undef __CREATE_CONTEXT_FROM_TYPE_ERR +#undef __GET_SUPPORTED_IMAGE_FORMATS_ERR + +#undef __CREATE_BUFFER_ERR +#undef __CREATE_SUBBUFFER_ERR +#undef __CREATE_IMAGE2D_ERR +#undef __CREATE_IMAGE3D_ERR +#undef __CREATE_SAMPLER_ERR +#undef __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR + +#undef __CREATE_USER_EVENT_ERR +#undef __SET_USER_EVENT_STATUS_ERR +#undef __SET_EVENT_CALLBACK_ERR +#undef __SET_PRINTF_CALLBACK_ERR + +#undef __WAIT_FOR_EVENTS_ERR + +#undef __CREATE_KERNEL_ERR +#undef __SET_KERNEL_ARGS_ERR +#undef __CREATE_PROGRAM_WITH_SOURCE_ERR +#undef __CREATE_PROGRAM_WITH_BINARY_ERR +#undef __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR +#undef __BUILD_PROGRAM_ERR +#undef __CREATE_KERNELS_IN_PROGRAM_ERR + +#undef __CREATE_COMMAND_QUEUE_ERR +#undef __SET_COMMAND_QUEUE_PROPERTY_ERR +#undef __ENQUEUE_READ_BUFFER_ERR +#undef __ENQUEUE_WRITE_BUFFER_ERR +#undef __ENQUEUE_READ_BUFFER_RECT_ERR +#undef __ENQUEUE_WRITE_BUFFER_RECT_ERR +#undef __ENQEUE_COPY_BUFFER_ERR +#undef __ENQEUE_COPY_BUFFER_RECT_ERR +#undef __ENQUEUE_READ_IMAGE_ERR +#undef __ENQUEUE_WRITE_IMAGE_ERR +#undef __ENQUEUE_COPY_IMAGE_ERR +#undef __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR +#undef __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR +#undef __ENQUEUE_MAP_BUFFER_ERR +#undef __ENQUEUE_MAP_IMAGE_ERR +#undef __ENQUEUE_UNMAP_MEM_OBJECT_ERR +#undef __ENQUEUE_NDRANGE_KERNEL_ERR +#undef __ENQUEUE_TASK_ERR +#undef __ENQUEUE_NATIVE_KERNEL + +#undef __CL_EXPLICIT_CONSTRUCTORS + +#undef __UNLOAD_COMPILER_ERR +#endif //__CL_USER_OVERRIDE_ERROR_STRINGS + +#undef __CL_FUNCTION_TYPE + +// Extensions +/** + * Deprecated APIs for 1.2 + */ +#if defined(CL_VERSION_1_1) +#undef __INIT_CL_EXT_FCN_PTR +#endif // #if defined(CL_VERSION_1_1) +#undef __CREATE_SUB_DEVICES + +#if defined(USE_CL_DEVICE_FISSION) +#undef __PARAM_NAME_DEVICE_FISSION +#endif // USE_CL_DEVICE_FISSION + +#undef __DEFAULT_NOT_INITIALIZED +#undef __DEFAULT_BEING_INITIALIZED +#undef __DEFAULT_INITIALIZED + +#undef CL_HPP_RVALUE_REFERENCES_SUPPORTED +#undef CL_HPP_NOEXCEPT + +} // namespace cl + +#endif // CL_HPP_ diff --git a/include/triton/external/CL/cl2.hpp b/include/triton/external/CL/cl2.hpp new file mode 100644 index 
000000000..4ac48227a
--- /dev/null
+++ b/include/triton/external/CL/cl2.hpp
@@ -0,0 +1,9677 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2016 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ * https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/*! \file
+ *
+ * \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33),
+ * OpenCL 1.2 (rev 15) and OpenCL 2.0 (rev 29)
+ * \author Lee Howes and Bruce Merry
+ *
+ * Derived from the OpenCL 1.x C++ bindings written by
+ * Benedict R. Gaster, Laurent Morichetti and Lee Howes
+ * With additions and fixes from:
+ * Brian Cole, March 3rd 2010 and April 2012
+ * Matt Gruenke, April 2012.
+ * Bruce Merry, February 2013.
+ * Tom Deakin and Simon McIntosh-Smith, July 2013
+ * James Price, 2015-
+ *
+ * \version 2.0.10
+ * \date 2016-07-20
+ *
+ * Optional extension support
+ *
+ * cl_ext_device_fission
+ * #define CL_HPP_USE_CL_DEVICE_FISSION
+ * cl_khr_d3d10_sharing
+ * #define CL_HPP_USE_DX_INTEROP
+ * cl_khr_sub_groups
+ * #define CL_HPP_USE_CL_SUB_GROUPS_KHR
+ * cl_khr_image2d_from_buffer
+ * #define CL_HPP_USE_CL_IMAGE2D_FROM_BUFFER_KHR
+ *
+ * Doxygen documentation for this header is available here:
+ *
+ * http://khronosgroup.github.io/OpenCL-CLHPP/
+ *
+ * The latest version of this header can be found on the GitHub releases page:
+ *
+ * https://github.com/KhronosGroup/OpenCL-CLHPP/releases
+ *
+ * Bugs and patches can be submitted to the GitHub repository:
+ *
+ * https://github.com/KhronosGroup/OpenCL-CLHPP
+ */
+
+/*! \mainpage
+ * \section intro Introduction
+ * For many large applications C++ is the language of choice and so it seems
+ * reasonable to define C++ bindings for OpenCL.
+ *
+ * The interface is contained within a single C++ header file \em cl2.hpp and all
+ * definitions are contained within the namespace \em cl. There is no additional
+ * requirement to include \em cl.h and to use either the C++ or original C
+ * bindings; it is enough to simply include \em cl2.hpp.
+ *
+ * The bindings themselves are lightweight and correspond closely to the
+ * underlying C API.
Using the C++ bindings introduces no additional execution
+ * overhead.
+ *
+ * There are numerous compatibility, portability and memory management
+ * fixes in the new header as well as additional OpenCL 2.0 features.
+ * As a result the header is not directly backward compatible and for this
+ * reason we release it as cl2.hpp rather than a new version of cl.hpp.
+ *
+ *
+ * \section compatibility Compatibility
+ * Due to the evolution of the underlying OpenCL API the 2.0 C++ bindings
+ * include an updated approach to defining supported feature versions
+ * and the range of valid underlying OpenCL runtime versions supported.
+ *
+ * The combination of preprocessor macros CL_HPP_TARGET_OPENCL_VERSION and
+ * CL_HPP_MINIMUM_OPENCL_VERSION control this range. These are three digit
+ * decimal values representing OpenCL runtime versions. The default for
+ * the target is 200, representing OpenCL 2.0 and the minimum is also
+ * defined as 200. These settings would use 2.0 API calls only.
+ * If backward compatibility with a 1.2 runtime is required, the minimum
+ * version may be set to 120.
+ *
+ * Note that this is a compile-time setting, and so affects linking against
+ * a particular SDK version rather than the versioning of the loaded runtime.
+ *
+ * The earlier versions of the header included basic vector and string
+ * classes based loosely on STL versions. These were difficult to
+ * maintain and very rarely used. For the 2.0 header we now assume
+ * the presence of the standard library unless requested otherwise.
+ * We use std::array, std::vector, std::shared_ptr and std::string
+ * throughout to safely manage memory and reduce the chance of a
+ * recurrence of earlier memory management bugs.
+ *
+ * These classes are used through typedefs in the cl namespace:
+ * cl::array, cl::vector, cl::pointer and cl::string.
+ * In addition cl::allocate_pointer forwards to std::allocate_shared
+ * by default.
+ * In all cases these standard library classes can be replaced with
+ * custom interface-compatible versions using the CL_HPP_NO_STD_ARRAY,
+ * CL_HPP_NO_STD_VECTOR, CL_HPP_NO_STD_UNIQUE_PTR and
+ * CL_HPP_NO_STD_STRING macros.
+ *
+ * The OpenCL 1.x versions of the C++ bindings included a size_t wrapper
+ * class to interface with kernel enqueue. This caused unpleasant interactions
+ * with the standard size_t declaration and led to namespacing bugs.
+ * In the 2.0 version we have replaced this with a std::array-based interface.
+ * However, the old behaviour can be regained for backward compatibility
+ * using the CL_HPP_ENABLE_SIZE_T_COMPATIBILITY macro.
+ *
+ * Finally, the program construction interface used a clumsy vector-of-pairs
+ * design in the earlier versions. We have replaced that with a cleaner
+ * vector-of-vectors and vector-of-strings design. However, for backward
+ * compatibility old behaviour can be regained with the
+ * CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY macro.
+ *
+ * In OpenCL 2.0 OpenCL C is not entirely backward compatible with
+ * earlier versions. As a result a flag must be passed to the OpenCL C
+ * compiler to request OpenCL 2.0 compilation of kernels with 1.2 as
+ * the default in the absence of the flag.
+ * In some cases the C++ bindings automatically compile code for ease.
+ * For those cases the compilation defaults to OpenCL C 2.0.
+ * If this is not wanted, the CL_HPP_CL_1_2_DEFAULT_BUILD macro may
+ * be specified to assume 1.2 compilation.
+ * If more fine-grained decisions on a per-kernel basis are required
+ * then explicit build operations that take the flag should be used.
+ *
+ *
+ * \section parameterization Parameters
+ * This header may be parameterized by a set of preprocessor macros.
+ *
+ * - CL_HPP_TARGET_OPENCL_VERSION
+ *
+ * Defines the target OpenCL runtime version to build the header
+ * against. Defaults to 200, representing OpenCL 2.0.
+ *
+ * - CL_HPP_NO_STD_STRING
+ *
+ * Do not use the standard library string class. cl::string is not
+ * defined and may be defined by the user before cl2.hpp is
+ * included.
+ *
+ * - CL_HPP_NO_STD_VECTOR
+ *
+ * Do not use the standard library vector class. cl::vector is not
+ * defined and may be defined by the user before cl2.hpp is
+ * included.
+ *
+ * - CL_HPP_NO_STD_ARRAY
+ *
+ * Do not use the standard library array class. cl::array is not
+ * defined and may be defined by the user before cl2.hpp is
+ * included.
+ *
+ * - CL_HPP_NO_STD_UNIQUE_PTR
+ *
+ * Do not use the standard library unique_ptr class. cl::pointer and
+ * the cl::allocate_pointer functions are not defined and may be
+ * defined by the user before cl2.hpp is included.
+ *
+ * - CL_HPP_ENABLE_DEVICE_FISSION
+ *
+ * Enables device fission for OpenCL 1.2 platforms.
+ *
+ * - CL_HPP_ENABLE_EXCEPTIONS
+ *
+ * Enable exceptions for use in the C++ bindings header. This is the
+ * preferred error handling mechanism but is not required.
+ *
+ * - CL_HPP_ENABLE_SIZE_T_COMPATIBILITY
+ *
+ * Backward compatibility option to support cl.hpp-style size_t
+ * class. Replaces the updated std::array derived version and
+ * removal of size_t from the namespace. Note that in this case the
+ * new size_t class is placed in the cl::compatibility namespace and
+ * thus requires an additional using declaration for direct backward
+ * compatibility.
+ *
+ * - CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY
+ *
+ * Enable older vector of pairs interface for construction of
+ * programs.
+ *
+ * - CL_HPP_CL_1_2_DEFAULT_BUILD
+ *
+ * Default to OpenCL C 1.2 compilation rather than OpenCL C 2.0;
+ * applies to use of cl::Program construction and other program
+ * build variants.
+ *
+ *
+ * \section example Example
+ *
+ * The following example shows a general use case for the C++
+ * bindings, including support for the optional exception feature and
+ * also the supplied vector and string classes; see the following sections for
+ * descriptions of these features.
+ * + * \code + #define CL_HPP_ENABLE_EXCEPTIONS + #define CL_HPP_TARGET_OPENCL_VERSION 200 + + #include + #include + #include + #include + #include + + const int numElements = 32; + + int main(void) + { + // Filter for a 2.0 platform and set it as the default + std::vector platforms; + cl::Platform::get(&platforms); + cl::Platform plat; + for (auto &p : platforms) { + std::string platver = p.getInfo(); + if (platver.find("OpenCL 2.") != std::string::npos) { + plat = p; + } + } + if (plat() == 0) { + std::cout << "No OpenCL 2.0 platform found."; + return -1; + } + + cl::Platform newP = cl::Platform::setDefault(plat); + if (newP != plat) { + std::cout << "Error setting default platform."; + return -1; + } + + // Use C++11 raw string literals for kernel source code + std::string kernel1{R"CLC( + global int globalA; + kernel void updateGlobal() + { + globalA = 75; + } + )CLC"}; + std::string kernel2{R"CLC( + typedef struct { global int *bar; } Foo; + kernel void vectorAdd(global const Foo* aNum, global const int *inputA, global const int *inputB, + global int *output, int val, write_only pipe int outPipe, queue_t childQueue) + { + output[get_global_id(0)] = inputA[get_global_id(0)] + inputB[get_global_id(0)] + val + *(aNum->bar); + write_pipe(outPipe, &val); + queue_t default_queue = get_default_queue(); + ndrange_t ndrange = ndrange_1D(get_global_size(0)/2, get_global_size(0)/2); + + // Have a child kernel write into third quarter of output + enqueue_kernel(default_queue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, + ^{ + output[get_global_size(0)*2 + get_global_id(0)] = + inputA[get_global_size(0)*2 + get_global_id(0)] + inputB[get_global_size(0)*2 + get_global_id(0)] + globalA; + }); + + // Have a child kernel write into last quarter of output + enqueue_kernel(childQueue, CLK_ENQUEUE_FLAGS_WAIT_KERNEL, ndrange, + ^{ + output[get_global_size(0)*3 + get_global_id(0)] = + inputA[get_global_size(0)*3 + get_global_id(0)] + inputB[get_global_size(0)*3 + get_global_id(0)] + globalA + 2; + }); + } + )CLC"}; + + // New simpler string interface style + std::vector programStrings {kernel1, kernel2}; + + cl::Program vectorAddProgram(programStrings); + try { + vectorAddProgram.build("-cl-std=CL2.0"); + } + catch (...) 
{ + // Print build info for all devices + cl_int buildErr = CL_SUCCESS; + auto buildInfo = vectorAddProgram.getBuildInfo(&buildErr); + for (auto &pair : buildInfo) { + std::cerr << pair.second << std::endl << std::endl; + } + + return 1; + } + + typedef struct { int *bar; } Foo; + + // Get and run kernel that initializes the program-scope global + // A test for kernels that take no arguments + auto program2Kernel = + cl::KernelFunctor<>(vectorAddProgram, "updateGlobal"); + program2Kernel( + cl::EnqueueArgs( + cl::NDRange(1))); + + ////////////////// + // SVM allocations + + auto anSVMInt = cl::allocate_svm>(); + *anSVMInt = 5; + cl::SVMAllocator>> svmAllocReadOnly; + auto fooPointer = cl::allocate_pointer(svmAllocReadOnly); + fooPointer->bar = anSVMInt.get(); + cl::SVMAllocator> svmAlloc; + std::vector>> inputA(numElements, 1, svmAlloc); + cl::coarse_svm_vector inputB(numElements, 2, svmAlloc); + + // + ////////////// + + // Traditional cl_mem allocations + std::vector output(numElements, 0xdeadbeef); + cl::Buffer outputBuffer(begin(output), end(output), false); + cl::Pipe aPipe(sizeof(cl_int), numElements / 2); + + // Default command queue, also passed in as a parameter + cl::DeviceCommandQueue defaultDeviceQueue = cl::DeviceCommandQueue::makeDefault( + cl::Context::getDefault(), cl::Device::getDefault()); + + auto vectorAddKernel = + cl::KernelFunctor< + decltype(fooPointer)&, + int*, + cl::coarse_svm_vector&, + cl::Buffer, + int, + cl::Pipe&, + cl::DeviceCommandQueue + >(vectorAddProgram, "vectorAdd"); + + // Ensure that the additional SVM pointer is available to the kernel + // This one was not passed as a parameter + vectorAddKernel.setSVMPointers(anSVMInt); + + // Hand control of coarse allocations to runtime + cl::enqueueUnmapSVM(anSVMInt); + cl::enqueueUnmapSVM(fooPointer); + cl::unmapSVM(inputB); + cl::unmapSVM(output2); + + cl_int error; + vectorAddKernel( + cl::EnqueueArgs( + cl::NDRange(numElements/2), + cl::NDRange(numElements/2)), + fooPointer, + inputA.data(), + inputB, + outputBuffer, + 3, + aPipe, + defaultDeviceQueue, + error + ); + + cl::copy(outputBuffer, begin(output), end(output)); + // Grab the SVM output vector using a map + cl::mapSVM(output2); + + cl::Device d = cl::Device::getDefault(); + + std::cout << "Output:\n"; + for (int i = 1; i < numElements; ++i) { + std::cout << "\t" << output[i] << "\n"; + } + std::cout << "\n\n"; + + return 0; + } + * + * \endcode + * + */ +#ifndef CL_HPP_ +#define CL_HPP_ + +/* Handle deprecated preprocessor definitions. In each case, we only check for + * the old name if the new name is not defined, so that user code can define + * both and hence work with either version of the bindings. + */ +#if !defined(CL_HPP_USE_DX_INTEROP) && defined(USE_DX_INTEROP) +# pragma message("cl2.hpp: USE_DX_INTEROP is deprecated. Define CL_HPP_USE_DX_INTEROP instead") +# define CL_HPP_USE_DX_INTEROP +#endif +#if !defined(CL_HPP_USE_CL_DEVICE_FISSION) && defined(USE_CL_DEVICE_FISSION) +# pragma message("cl2.hpp: USE_CL_DEVICE_FISSION is deprecated. Define CL_HPP_USE_CL_DEVICE_FISSION instead") +# define CL_HPP_USE_CL_DEVICE_FISSION +#endif +#if !defined(CL_HPP_ENABLE_EXCEPTIONS) && defined(__CL_ENABLE_EXCEPTIONS) +# pragma message("cl2.hpp: __CL_ENABLE_EXCEPTIONS is deprecated. Define CL_HPP_ENABLE_EXCEPTIONS instead") +# define CL_HPP_ENABLE_EXCEPTIONS +#endif +#if !defined(CL_HPP_NO_STD_VECTOR) && defined(__NO_STD_VECTOR) +# pragma message("cl2.hpp: __NO_STD_VECTOR is deprecated. 
Define CL_HPP_NO_STD_VECTOR instead") +# define CL_HPP_NO_STD_VECTOR +#endif +#if !defined(CL_HPP_NO_STD_STRING) && defined(__NO_STD_STRING) +# pragma message("cl2.hpp: __NO_STD_STRING is deprecated. Define CL_HPP_NO_STD_STRING instead") +# define CL_HPP_NO_STD_STRING +#endif +#if defined(VECTOR_CLASS) +# pragma message("cl2.hpp: VECTOR_CLASS is deprecated. Alias cl::vector instead") +#endif +#if defined(STRING_CLASS) +# pragma message("cl2.hpp: STRING_CLASS is deprecated. Alias cl::string instead.") +#endif +#if !defined(CL_HPP_USER_OVERRIDE_ERROR_STRINGS) && defined(__CL_USER_OVERRIDE_ERROR_STRINGS) +# pragma message("cl2.hpp: __CL_USER_OVERRIDE_ERROR_STRINGS is deprecated. Define CL_HPP_USER_OVERRIDE_ERROR_STRINGS instead") +# define CL_HPP_USER_OVERRIDE_ERROR_STRINGS +#endif + +/* Warn about features that are no longer supported + */ +#if defined(__USE_DEV_VECTOR) +# pragma message("cl2.hpp: __USE_DEV_VECTOR is no longer supported. Expect compilation errors") +#endif +#if defined(__USE_DEV_STRING) +# pragma message("cl2.hpp: __USE_DEV_STRING is no longer supported. Expect compilation errors") +#endif + +/* Detect which version to target */ +#if !defined(CL_HPP_TARGET_OPENCL_VERSION) +# pragma message("cl2.hpp: CL_HPP_TARGET_OPENCL_VERSION is not defined. It will default to 200 (OpenCL 2.0)") +# define CL_HPP_TARGET_OPENCL_VERSION 200 +#endif +#if CL_HPP_TARGET_OPENCL_VERSION != 100 && CL_HPP_TARGET_OPENCL_VERSION != 110 && CL_HPP_TARGET_OPENCL_VERSION != 120 && CL_HPP_TARGET_OPENCL_VERSION != 200 +# pragma message("cl2.hpp: CL_HPP_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120 or 200). It will be set to 200") +# undef CL_HPP_TARGET_OPENCL_VERSION +# define CL_HPP_TARGET_OPENCL_VERSION 200 +#endif + +#if !defined(CL_HPP_MINIMUM_OPENCL_VERSION) +# define CL_HPP_MINIMUM_OPENCL_VERSION 200 +#endif +#if CL_HPP_MINIMUM_OPENCL_VERSION != 100 && CL_HPP_MINIMUM_OPENCL_VERSION != 110 && CL_HPP_MINIMUM_OPENCL_VERSION != 120 && CL_HPP_MINIMUM_OPENCL_VERSION != 200 +# pragma message("cl2.hpp: CL_HPP_MINIMUM_OPENCL_VERSION is not a valid value (100, 110, 120 or 200). 
It will be set to 100") +# undef CL_HPP_MINIMUM_OPENCL_VERSION +# define CL_HPP_MINIMUM_OPENCL_VERSION 100 +#endif +#if CL_HPP_MINIMUM_OPENCL_VERSION > CL_HPP_TARGET_OPENCL_VERSION +# error "CL_HPP_MINIMUM_OPENCL_VERSION must not be greater than CL_HPP_TARGET_OPENCL_VERSION" +#endif + +#if CL_HPP_MINIMUM_OPENCL_VERSION <= 100 && !defined(CL_USE_DEPRECATED_OPENCL_1_0_APIS) +# define CL_USE_DEPRECATED_OPENCL_1_0_APIS +#endif +#if CL_HPP_MINIMUM_OPENCL_VERSION <= 110 && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) +# define CL_USE_DEPRECATED_OPENCL_1_1_APIS +#endif +#if CL_HPP_MINIMUM_OPENCL_VERSION <= 120 && !defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) +# define CL_USE_DEPRECATED_OPENCL_1_2_APIS +#endif +#if CL_HPP_MINIMUM_OPENCL_VERSION <= 200 && !defined(CL_USE_DEPRECATED_OPENCL_2_0_APIS) +# define CL_USE_DEPRECATED_OPENCL_2_0_APIS +#endif + +#ifdef _WIN32 + +#include + +#if defined(CL_HPP_USE_DX_INTEROP) +#include +#include +#endif +#endif // _WIN32 + +#if defined(_MSC_VER) +#include +#endif // _MSC_VER + + // Check for a valid C++ version + +// Need to do both tests here because for some reason __cplusplus is not +// updated in visual studio +#if (!defined(_MSC_VER) && __cplusplus < 201103L) || (defined(_MSC_VER) && _MSC_VER < 1700) +#error Visual studio 2013 or another C++11-supporting compiler required +#endif + +// +#if defined(CL_HPP_USE_CL_DEVICE_FISSION) || defined(CL_HPP_USE_CL_SUB_GROUPS_KHR) +#include +#endif + +#if defined(__APPLE__) || defined(__MACOSX) +#include +#else +#include +#endif // !__APPLE__ + +#if (__cplusplus >= 201103L) +#define CL_HPP_NOEXCEPT_ noexcept +#else +#define CL_HPP_NOEXCEPT_ +#endif + +#if defined(_MSC_VER) +# define CL_HPP_DEFINE_STATIC_MEMBER_ __declspec(selectany) +#else +# define CL_HPP_DEFINE_STATIC_MEMBER_ __attribute__((weak)) +#endif // !_MSC_VER + +// Define deprecated prefixes and suffixes to ensure compilation +// in case they are not pre-defined +#if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED) +#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED +#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED) +#if !defined(CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED) +#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED +#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED) + +#if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED) +#define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED +#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED) +#if !defined(CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED) +#define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED +#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED) + +#if !defined(CL_CALLBACK) +#define CL_CALLBACK +#endif //CL_CALLBACK + +#include +#include +#include +#include +#include +#include + + +// Define a size_type to represent a correctly resolved size_t +#if defined(CL_HPP_ENABLE_SIZE_T_COMPATIBILITY) +namespace cl { + using size_type = ::size_t; +} // namespace cl +#else // #if defined(CL_HPP_ENABLE_SIZE_T_COMPATIBILITY) +namespace cl { + using size_type = size_t; +} // namespace cl +#endif // #if defined(CL_HPP_ENABLE_SIZE_T_COMPATIBILITY) + + +#if defined(CL_HPP_ENABLE_EXCEPTIONS) +#include +#endif // #if defined(CL_HPP_ENABLE_EXCEPTIONS) + +#if !defined(CL_HPP_NO_STD_VECTOR) +#include +namespace cl { + template < class T, class Alloc = std::allocator > + using vector = std::vector; +} // namespace cl +#endif // #if !defined(CL_HPP_NO_STD_VECTOR) + +#if !defined(CL_HPP_NO_STD_STRING) +#include +namespace cl { + using string = std::string; +} // namespace cl +#endif // #if !defined(CL_HPP_NO_STD_STRING) + 
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+
+#if !defined(CL_HPP_NO_STD_UNIQUE_PTR)
+#include <memory>
+namespace cl {
+ // Replace unique_ptr and allocate_pointer for internal use
+ // to allow user to replace them
+ template<class T, class D>
+ using pointer = std::unique_ptr<T, D>;
+} // namespace cl
+#endif
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200
+#if !defined(CL_HPP_NO_STD_ARRAY)
+#include <array>
+namespace cl {
+ template < class T, size_type N >
+ using array = std::array<T, N>;
+} // namespace cl
+#endif // #if !defined(CL_HPP_NO_STD_ARRAY)
+
+// Define size_type appropriately to allow backward-compatibility
+// use of the old size_t interface class
+#if defined(CL_HPP_ENABLE_SIZE_T_COMPATIBILITY)
+namespace cl {
+ namespace compatibility {
+ /*! \brief class used to interface between C++ and
+ * OpenCL C calls that require arrays of size_t values, whose
+ * size is known statically.
+ */
+ template <int N>
+ class size_t
+ {
+ private:
+ size_type data_[N];
+
+ public:
+ //! \brief Initialize size_t to all 0s
+ size_t()
+ {
+ for (int i = 0; i < N; ++i) {
+ data_[i] = 0;
+ }
+ }
+
+ size_t(const array<size_type, N> &rhs)
+ {
+ for (int i = 0; i < N; ++i) {
+ data_[i] = rhs[i];
+ }
+ }
+
+ size_type& operator[](int index)
+ {
+ return data_[index];
+ }
+
+ const size_type& operator[](int index) const
+ {
+ return data_[index];
+ }
+
+ //! \brief Conversion operator to T*.
+ operator size_type* () { return data_; }
+
+ //! \brief Conversion operator to const T*.
+ operator const size_type* () const { return data_; }
+
+ operator array<size_type, N>() const
+ {
+ array<size_type, N> ret;
+
+ for (int i = 0; i < N; ++i) {
+ ret[i] = data_[i];
+ }
+ return ret;
+ }
+ };
+ } // namespace compatibility
+
+ template<int N>
+ using size_t = compatibility::size_t<N>;
+} // namespace cl
+#endif // #if defined(CL_HPP_ENABLE_SIZE_T_COMPATIBILITY)
+
+// Helper alias to avoid confusing the macros
+namespace cl {
+ namespace detail {
+ using size_t_array = array<size_type, 3>;
+ } // namespace detail
+} // namespace cl
+
+
+/*! \namespace cl
+ *
+ * \brief The OpenCL C++ bindings are defined within this namespace.
+ *
+ */
+namespace cl {
+ class Memory;
+
+#define CL_HPP_INIT_CL_EXT_FCN_PTR_(name) \
+ if (!pfn_##name) { \
+ pfn_##name = (PFN_##name) \
+ clGetExtensionFunctionAddress(#name); \
+ if (!pfn_##name) { \
+ } \
+ }
+
+#define CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, name) \
+ if (!pfn_##name) { \
+ pfn_##name = (PFN_##name) \
+ clGetExtensionFunctionAddressForPlatform(platform, #name); \
+ if (!pfn_##name) { \
+ } \
+ }
+
+ class Program;
+ class Device;
+ class Context;
+ class CommandQueue;
+ class DeviceCommandQueue;
+ class Memory;
+ class Buffer;
+ class Pipe;
+
+#if defined(CL_HPP_ENABLE_EXCEPTIONS)
+ /*! \brief Exception class
+ *
+ * This may be thrown by API functions when CL_HPP_ENABLE_EXCEPTIONS is defined.
+ */
+ class Error : public std::exception
+ {
+ private:
+ cl_int err_;
+ const char * errStr_;
+ public:
+ /*! \brief Create a new CL error exception for a given error code
+ * and corresponding message.
+ *
+ * \param err error code value.
+ *
+ * \param errStr a descriptive string that must remain in scope until
+ * handling of the exception has concluded. If set, it
+ * will be returned by what().
+ */
+ Error(cl_int err, const char * errStr = NULL) : err_(err), errStr_(errStr)
+ {}
+
+ ~Error() throw() {}
+
+ /*! \brief Get error string associated with exception
+ *
+ * \return A memory pointer to the error message string.
+ */
+ virtual const char * what() const throw ()
+ {
+ if (errStr_ == NULL) {
+ return "empty";
+ }
+ else {
+ return errStr_;
+ }
+ }
+
+ /*!
\brief Get error code associated with exception + * + * \return The error code. + */ + cl_int err(void) const { return err_; } + }; +#define CL_HPP_ERR_STR_(x) #x +#else +#define CL_HPP_ERR_STR_(x) NULL +#endif // CL_HPP_ENABLE_EXCEPTIONS + + +namespace detail +{ +#if defined(CL_HPP_ENABLE_EXCEPTIONS) +static inline cl_int errHandler ( + cl_int err, + const char * errStr = NULL) +{ + if (err != CL_SUCCESS) { + throw Error(err, errStr); + } + return err; +} +#else +static inline cl_int errHandler (cl_int err, const char * errStr = NULL) +{ + (void) errStr; // suppress unused variable warning + return err; +} +#endif // CL_HPP_ENABLE_EXCEPTIONS +} + + + +//! \cond DOXYGEN_DETAIL +#if !defined(CL_HPP_USER_OVERRIDE_ERROR_STRINGS) +#define __GET_DEVICE_INFO_ERR CL_HPP_ERR_STR_(clGetDeviceInfo) +#define __GET_PLATFORM_INFO_ERR CL_HPP_ERR_STR_(clGetPlatformInfo) +#define __GET_DEVICE_IDS_ERR CL_HPP_ERR_STR_(clGetDeviceIDs) +#define __GET_PLATFORM_IDS_ERR CL_HPP_ERR_STR_(clGetPlatformIDs) +#define __GET_CONTEXT_INFO_ERR CL_HPP_ERR_STR_(clGetContextInfo) +#define __GET_EVENT_INFO_ERR CL_HPP_ERR_STR_(clGetEventInfo) +#define __GET_EVENT_PROFILE_INFO_ERR CL_HPP_ERR_STR_(clGetEventProfileInfo) +#define __GET_MEM_OBJECT_INFO_ERR CL_HPP_ERR_STR_(clGetMemObjectInfo) +#define __GET_IMAGE_INFO_ERR CL_HPP_ERR_STR_(clGetImageInfo) +#define __GET_SAMPLER_INFO_ERR CL_HPP_ERR_STR_(clGetSamplerInfo) +#define __GET_KERNEL_INFO_ERR CL_HPP_ERR_STR_(clGetKernelInfo) +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 +#define __GET_KERNEL_ARG_INFO_ERR CL_HPP_ERR_STR_(clGetKernelArgInfo) +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 +#define __GET_KERNEL_WORK_GROUP_INFO_ERR CL_HPP_ERR_STR_(clGetKernelWorkGroupInfo) +#define __GET_PROGRAM_INFO_ERR CL_HPP_ERR_STR_(clGetProgramInfo) +#define __GET_PROGRAM_BUILD_INFO_ERR CL_HPP_ERR_STR_(clGetProgramBuildInfo) +#define __GET_COMMAND_QUEUE_INFO_ERR CL_HPP_ERR_STR_(clGetCommandQueueInfo) + +#define __CREATE_CONTEXT_ERR CL_HPP_ERR_STR_(clCreateContext) +#define __CREATE_CONTEXT_FROM_TYPE_ERR CL_HPP_ERR_STR_(clCreateContextFromType) +#define __GET_SUPPORTED_IMAGE_FORMATS_ERR CL_HPP_ERR_STR_(clGetSupportedImageFormats) + +#define __CREATE_BUFFER_ERR CL_HPP_ERR_STR_(clCreateBuffer) +#define __COPY_ERR CL_HPP_ERR_STR_(cl::copy) +#define __CREATE_SUBBUFFER_ERR CL_HPP_ERR_STR_(clCreateSubBuffer) +#define __CREATE_GL_BUFFER_ERR CL_HPP_ERR_STR_(clCreateFromGLBuffer) +#define __CREATE_GL_RENDER_BUFFER_ERR CL_HPP_ERR_STR_(clCreateFromGLBuffer) +#define __GET_GL_OBJECT_INFO_ERR CL_HPP_ERR_STR_(clGetGLObjectInfo) +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 +#define __CREATE_IMAGE_ERR CL_HPP_ERR_STR_(clCreateImage) +#define __CREATE_GL_TEXTURE_ERR CL_HPP_ERR_STR_(clCreateFromGLTexture) +#define __IMAGE_DIMENSION_ERR CL_HPP_ERR_STR_(Incorrect image dimensions) +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 +#define __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR CL_HPP_ERR_STR_(clSetMemObjectDestructorCallback) + +#define __CREATE_USER_EVENT_ERR CL_HPP_ERR_STR_(clCreateUserEvent) +#define __SET_USER_EVENT_STATUS_ERR CL_HPP_ERR_STR_(clSetUserEventStatus) +#define __SET_EVENT_CALLBACK_ERR CL_HPP_ERR_STR_(clSetEventCallback) +#define __WAIT_FOR_EVENTS_ERR CL_HPP_ERR_STR_(clWaitForEvents) + +#define __CREATE_KERNEL_ERR CL_HPP_ERR_STR_(clCreateKernel) +#define __SET_KERNEL_ARGS_ERR CL_HPP_ERR_STR_(clSetKernelArg) +#define __CREATE_PROGRAM_WITH_SOURCE_ERR CL_HPP_ERR_STR_(clCreateProgramWithSource) +#define __CREATE_PROGRAM_WITH_BINARY_ERR CL_HPP_ERR_STR_(clCreateProgramWithBinary) +#if CL_HPP_TARGET_OPENCL_VERSION 
>= 120 +#define __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR CL_HPP_ERR_STR_(clCreateProgramWithBuiltInKernels) +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 +#define __BUILD_PROGRAM_ERR CL_HPP_ERR_STR_(clBuildProgram) +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 +#define __COMPILE_PROGRAM_ERR CL_HPP_ERR_STR_(clCompileProgram) +#define __LINK_PROGRAM_ERR CL_HPP_ERR_STR_(clLinkProgram) +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 +#define __CREATE_KERNELS_IN_PROGRAM_ERR CL_HPP_ERR_STR_(clCreateKernelsInProgram) + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 +#define __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR CL_HPP_ERR_STR_(clCreateCommandQueueWithProperties) +#define __CREATE_SAMPLER_WITH_PROPERTIES_ERR CL_HPP_ERR_STR_(clCreateSamplerWithProperties) +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200 +#define __SET_COMMAND_QUEUE_PROPERTY_ERR CL_HPP_ERR_STR_(clSetCommandQueueProperty) +#define __ENQUEUE_READ_BUFFER_ERR CL_HPP_ERR_STR_(clEnqueueReadBuffer) +#define __ENQUEUE_READ_BUFFER_RECT_ERR CL_HPP_ERR_STR_(clEnqueueReadBufferRect) +#define __ENQUEUE_WRITE_BUFFER_ERR CL_HPP_ERR_STR_(clEnqueueWriteBuffer) +#define __ENQUEUE_WRITE_BUFFER_RECT_ERR CL_HPP_ERR_STR_(clEnqueueWriteBufferRect) +#define __ENQEUE_COPY_BUFFER_ERR CL_HPP_ERR_STR_(clEnqueueCopyBuffer) +#define __ENQEUE_COPY_BUFFER_RECT_ERR CL_HPP_ERR_STR_(clEnqueueCopyBufferRect) +#define __ENQUEUE_FILL_BUFFER_ERR CL_HPP_ERR_STR_(clEnqueueFillBuffer) +#define __ENQUEUE_READ_IMAGE_ERR CL_HPP_ERR_STR_(clEnqueueReadImage) +#define __ENQUEUE_WRITE_IMAGE_ERR CL_HPP_ERR_STR_(clEnqueueWriteImage) +#define __ENQUEUE_COPY_IMAGE_ERR CL_HPP_ERR_STR_(clEnqueueCopyImage) +#define __ENQUEUE_FILL_IMAGE_ERR CL_HPP_ERR_STR_(clEnqueueFillImage) +#define __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR CL_HPP_ERR_STR_(clEnqueueCopyImageToBuffer) +#define __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR CL_HPP_ERR_STR_(clEnqueueCopyBufferToImage) +#define __ENQUEUE_MAP_BUFFER_ERR CL_HPP_ERR_STR_(clEnqueueMapBuffer) +#define __ENQUEUE_MAP_IMAGE_ERR CL_HPP_ERR_STR_(clEnqueueMapImage) +#define __ENQUEUE_UNMAP_MEM_OBJECT_ERR CL_HPP_ERR_STR_(clEnqueueUnMapMemObject) +#define __ENQUEUE_NDRANGE_KERNEL_ERR CL_HPP_ERR_STR_(clEnqueueNDRangeKernel) +#define __ENQUEUE_NATIVE_KERNEL CL_HPP_ERR_STR_(clEnqueueNativeKernel) +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 +#define __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR CL_HPP_ERR_STR_(clEnqueueMigrateMemObjects) +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 + +#define __ENQUEUE_ACQUIRE_GL_ERR CL_HPP_ERR_STR_(clEnqueueAcquireGLObjects) +#define __ENQUEUE_RELEASE_GL_ERR CL_HPP_ERR_STR_(clEnqueueReleaseGLObjects) + +#define __CREATE_PIPE_ERR CL_HPP_ERR_STR_(clCreatePipe) +#define __GET_PIPE_INFO_ERR CL_HPP_ERR_STR_(clGetPipeInfo) + + +#define __RETAIN_ERR CL_HPP_ERR_STR_(Retain Object) +#define __RELEASE_ERR CL_HPP_ERR_STR_(Release Object) +#define __FLUSH_ERR CL_HPP_ERR_STR_(clFlush) +#define __FINISH_ERR CL_HPP_ERR_STR_(clFinish) +#define __VECTOR_CAPACITY_ERR CL_HPP_ERR_STR_(Vector capacity error) + +/** + * CL 1.2 version that uses device fission. 
+ */
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+#define __CREATE_SUB_DEVICES_ERR CL_HPP_ERR_STR_(clCreateSubDevices)
+#else
+#define __CREATE_SUB_DEVICES_ERR CL_HPP_ERR_STR_(clCreateSubDevicesEXT)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+#define __ENQUEUE_MARKER_ERR CL_HPP_ERR_STR_(clEnqueueMarker)
+#define __ENQUEUE_WAIT_FOR_EVENTS_ERR CL_HPP_ERR_STR_(clEnqueueWaitForEvents)
+#define __ENQUEUE_BARRIER_ERR CL_HPP_ERR_STR_(clEnqueueBarrier)
+#define __UNLOAD_COMPILER_ERR CL_HPP_ERR_STR_(clUnloadCompiler)
+#define __CREATE_GL_TEXTURE_2D_ERR CL_HPP_ERR_STR_(clCreateFromGLTexture2D)
+#define __CREATE_GL_TEXTURE_3D_ERR CL_HPP_ERR_STR_(clCreateFromGLTexture3D)
+#define __CREATE_IMAGE2D_ERR CL_HPP_ERR_STR_(clCreateImage2D)
+#define __CREATE_IMAGE3D_ERR CL_HPP_ERR_STR_(clCreateImage3D)
+#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+
+/**
+ * Deprecated APIs for 2.0
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
+#define __CREATE_COMMAND_QUEUE_ERR CL_HPP_ERR_STR_(clCreateCommandQueue)
+#define __ENQUEUE_TASK_ERR CL_HPP_ERR_STR_(clEnqueueTask)
+#define __CREATE_SAMPLER_ERR CL_HPP_ERR_STR_(clCreateSampler)
+#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+
+/**
+ * CL 1.2 marker and barrier commands
+ */
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+#define __ENQUEUE_MARKER_WAIT_LIST_ERR CL_HPP_ERR_STR_(clEnqueueMarkerWithWaitList)
+#define __ENQUEUE_BARRIER_WAIT_LIST_ERR CL_HPP_ERR_STR_(clEnqueueBarrierWithWaitList)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+
+#endif // CL_HPP_USER_OVERRIDE_ERROR_STRINGS
+//! \endcond
+
+
+namespace detail {
+
+// Generic getInfoHelper. The final parameter is used to guide overload
+// resolution: the actual parameter passed is an int, which makes this
+// a worse conversion sequence than a specialization that declares the
+// parameter as an int.
+template<typename Functor, typename T>
+inline cl_int getInfoHelper(Functor f, cl_uint name, T* param, long)
+{
+ return f(name, sizeof(T), param, NULL);
+}
+
+// Specialized for getInfo<CL_PROGRAM_BINARIES>
+// Assumes that the output vector was correctly resized on the way in
+template <typename Func>
+inline cl_int getInfoHelper(Func f, cl_uint name, vector<vector<unsigned char>>* param, int)
+{
+ if (name != CL_PROGRAM_BINARIES) {
+ return CL_INVALID_VALUE;
+ }
+ if (param) {
+ // Create array of pointers, calculate total size and pass pointer array in
+ size_type numBinaries = param->size();
+ vector<unsigned char*> binariesPointers(numBinaries);
+
+ for (size_type i = 0; i < numBinaries; ++i)
+ {
+ binariesPointers[i] = (*param)[i].data();
+ }
+
+ cl_int err = f(name, numBinaries * sizeof(unsigned char*), binariesPointers.data(), NULL);
+
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+ }
+
+
+ return CL_SUCCESS;
+}
+
+// Specialized getInfoHelper for vector params
+template <typename Func, typename T>
+inline cl_int getInfoHelper(Func f, cl_uint name, vector<T>* param, long)
+{
+ size_type required;
+ cl_int err = f(name, 0, NULL, &required);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+ const size_type elements = required / sizeof(T);
+
+ // Temporary to avoid changing param on an error
+ vector<T> localData(elements);
+ err = f(name, required, localData.data(), NULL);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+ if (param) {
+ *param = std::move(localData);
+ }
+
+ return CL_SUCCESS;
+}
+
+/* Specialization for reference-counted types. This depends on the
+ * existence of Wrapper<T>::cl_type, and none of the other types having the
+ * cl_type member.
Note that simply specifying the parameter as Wrapper<T>
+ * does not work, because when using a derived type (e.g. Context) the generic
+ * template will provide a better match.
+ */
+template <typename Func, typename T>
+inline cl_int getInfoHelper(
+ Func f, cl_uint name, vector<T>* param, int, typename T::cl_type = 0)
+{
+ size_type required;
+ cl_int err = f(name, 0, NULL, &required);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ const size_type elements = required / sizeof(typename T::cl_type);
+
+ vector<typename T::cl_type> value(elements);
+ err = f(name, required, value.data(), NULL);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ if (param) {
+ // Assign to convert CL type to T for each element
+ param->resize(elements);
+
+ // Assign to param, constructing with retain behaviour
+ // to correctly capture each underlying CL object
+ for (size_type i = 0; i < elements; i++) {
+ (*param)[i] = T(value[i], true);
+ }
+ }
+ return CL_SUCCESS;
+}
+
+// Specialized GetInfoHelper for string params
+template <typename Func>
+inline cl_int getInfoHelper(Func f, cl_uint name, string* param, long)
+{
+ size_type required;
+ cl_int err = f(name, 0, NULL, &required);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ // std::string has a constant data member
+ // a char vector does not
+ if (required > 0) {
+ vector<char> value(required);
+ err = f(name, required, value.data(), NULL);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+ if (param) {
+ param->assign(begin(value), prev(end(value)));
+ }
+ }
+ else if (param) {
+ param->assign("");
+ }
+ return CL_SUCCESS;
+}
+
+// Specialized GetInfoHelper for clsize_t params
+template <typename Func, size_type N>
+inline cl_int getInfoHelper(Func f, cl_uint name, array<size_type, N>* param, long)
+{
+ size_type required;
+ cl_int err = f(name, 0, NULL, &required);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ size_type elements = required / sizeof(size_type);
+ vector<size_type> value(elements, 0);
+
+ err = f(name, required, value.data(), NULL);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+
+ // Bound the copy with N to prevent overruns
+ // if passed N > than the amount copied
+ if (elements > N) {
+ elements = N;
+ }
+ for (size_type i = 0; i < elements; ++i) {
+ (*param)[i] = value[i];
+ }
+
+ return CL_SUCCESS;
+}
+
+template<typename T> struct ReferenceHandler;
+
+/* Specialization for reference-counted types. This depends on the
+ * existence of Wrapper<T>::cl_type, and none of the other types having the
+ * cl_type member. Note that simply specifying the parameter as Wrapper<T>
+ * does not work, because when using a derived type (e.g. Context) the generic
+ * template will provide a better match.
+ */
+template<typename Func, typename T>
+inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_type = 0)
+{
+ typename T::cl_type value;
+ cl_int err = f(name, sizeof(value), &value, NULL);
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+ *param = value;
+ if (value != NULL)
+ {
+ err = param->retain();
+ if (err != CL_SUCCESS) {
+ return err;
+ }
+ }
+ return CL_SUCCESS;
+}
+
+#define CL_HPP_PARAM_NAME_INFO_1_0_(F) \
+ F(cl_platform_info, CL_PLATFORM_PROFILE, string) \
+ F(cl_platform_info, CL_PLATFORM_VERSION, string) \
+ F(cl_platform_info, CL_PLATFORM_NAME, string) \
+ F(cl_platform_info, CL_PLATFORM_VENDOR, string) \
+ F(cl_platform_info, CL_PLATFORM_EXTENSIONS, string) \
+ \
+ F(cl_device_info, CL_DEVICE_TYPE, cl_device_type) \
+ F(cl_device_info, CL_DEVICE_VENDOR_ID, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_WORK_GROUP_SIZE, size_type) \
+ F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_SIZES, cl::vector<size_type>) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, cl_uint) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, cl_uint) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, cl_uint) \
+ F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint) \
+ F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) \
+ F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_WIDTH, size_type) \
+ F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_HEIGHT, size_type) \
+ F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_WIDTH, size_type) \
+ F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_HEIGHT, size_type) \
+ F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_DEPTH, size_type) \
+ F(cl_device_info, CL_DEVICE_IMAGE_SUPPORT, cl_bool) \
+ F(cl_device_info, CL_DEVICE_MAX_PARAMETER_SIZE, size_type) \
+ F(cl_device_info, CL_DEVICE_MAX_SAMPLERS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint) \
+ F(cl_device_info, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, cl_uint) \
+ F(cl_device_info, CL_DEVICE_SINGLE_FP_CONFIG, cl_device_fp_config) \
+ F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, cl_device_mem_cache_type) \
+ F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint)\
+ F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong) \
+ F(cl_device_info, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong) \
+ F(cl_device_info, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong) \
+ F(cl_device_info, CL_DEVICE_MAX_CONSTANT_ARGS, cl_uint) \
+ F(cl_device_info, CL_DEVICE_LOCAL_MEM_TYPE, cl_device_local_mem_type) \
+ F(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong) \
+ F(cl_device_info, CL_DEVICE_ERROR_CORRECTION_SUPPORT, cl_bool) \
+ F(cl_device_info, CL_DEVICE_PROFILING_TIMER_RESOLUTION, size_type) \
+ F(cl_device_info, CL_DEVICE_ENDIAN_LITTLE, cl_bool) \
+ F(cl_device_info, CL_DEVICE_AVAILABLE, cl_bool) \
+ F(cl_device_info, CL_DEVICE_COMPILER_AVAILABLE, cl_bool) \
+ F(cl_device_info, CL_DEVICE_EXECUTION_CAPABILITIES, cl_device_exec_capabilities) \
+ F(cl_device_info, CL_DEVICE_PLATFORM, cl_platform_id) \
+ F(cl_device_info, CL_DEVICE_NAME, string) \
+ F(cl_device_info, CL_DEVICE_VENDOR, string) \
+
F(cl_device_info, CL_DRIVER_VERSION, string) \ + F(cl_device_info, CL_DEVICE_PROFILE, string) \ + F(cl_device_info, CL_DEVICE_VERSION, string) \ + F(cl_device_info, CL_DEVICE_EXTENSIONS, string) \ + \ + F(cl_context_info, CL_CONTEXT_REFERENCE_COUNT, cl_uint) \ + F(cl_context_info, CL_CONTEXT_DEVICES, cl::vector) \ + F(cl_context_info, CL_CONTEXT_PROPERTIES, cl::vector) \ + \ + F(cl_event_info, CL_EVENT_COMMAND_QUEUE, cl::CommandQueue) \ + F(cl_event_info, CL_EVENT_COMMAND_TYPE, cl_command_type) \ + F(cl_event_info, CL_EVENT_REFERENCE_COUNT, cl_uint) \ + F(cl_event_info, CL_EVENT_COMMAND_EXECUTION_STATUS, cl_int) \ + \ + F(cl_profiling_info, CL_PROFILING_COMMAND_QUEUED, cl_ulong) \ + F(cl_profiling_info, CL_PROFILING_COMMAND_SUBMIT, cl_ulong) \ + F(cl_profiling_info, CL_PROFILING_COMMAND_START, cl_ulong) \ + F(cl_profiling_info, CL_PROFILING_COMMAND_END, cl_ulong) \ + \ + F(cl_mem_info, CL_MEM_TYPE, cl_mem_object_type) \ + F(cl_mem_info, CL_MEM_FLAGS, cl_mem_flags) \ + F(cl_mem_info, CL_MEM_SIZE, size_type) \ + F(cl_mem_info, CL_MEM_HOST_PTR, void*) \ + F(cl_mem_info, CL_MEM_MAP_COUNT, cl_uint) \ + F(cl_mem_info, CL_MEM_REFERENCE_COUNT, cl_uint) \ + F(cl_mem_info, CL_MEM_CONTEXT, cl::Context) \ + \ + F(cl_image_info, CL_IMAGE_FORMAT, cl_image_format) \ + F(cl_image_info, CL_IMAGE_ELEMENT_SIZE, size_type) \ + F(cl_image_info, CL_IMAGE_ROW_PITCH, size_type) \ + F(cl_image_info, CL_IMAGE_SLICE_PITCH, size_type) \ + F(cl_image_info, CL_IMAGE_WIDTH, size_type) \ + F(cl_image_info, CL_IMAGE_HEIGHT, size_type) \ + F(cl_image_info, CL_IMAGE_DEPTH, size_type) \ + \ + F(cl_sampler_info, CL_SAMPLER_REFERENCE_COUNT, cl_uint) \ + F(cl_sampler_info, CL_SAMPLER_CONTEXT, cl::Context) \ + F(cl_sampler_info, CL_SAMPLER_NORMALIZED_COORDS, cl_bool) \ + F(cl_sampler_info, CL_SAMPLER_ADDRESSING_MODE, cl_addressing_mode) \ + F(cl_sampler_info, CL_SAMPLER_FILTER_MODE, cl_filter_mode) \ + \ + F(cl_program_info, CL_PROGRAM_REFERENCE_COUNT, cl_uint) \ + F(cl_program_info, CL_PROGRAM_CONTEXT, cl::Context) \ + F(cl_program_info, CL_PROGRAM_NUM_DEVICES, cl_uint) \ + F(cl_program_info, CL_PROGRAM_DEVICES, cl::vector) \ + F(cl_program_info, CL_PROGRAM_SOURCE, string) \ + F(cl_program_info, CL_PROGRAM_BINARY_SIZES, cl::vector) \ + F(cl_program_info, CL_PROGRAM_BINARIES, cl::vector>) \ + \ + F(cl_program_build_info, CL_PROGRAM_BUILD_STATUS, cl_build_status) \ + F(cl_program_build_info, CL_PROGRAM_BUILD_OPTIONS, string) \ + F(cl_program_build_info, CL_PROGRAM_BUILD_LOG, string) \ + \ + F(cl_kernel_info, CL_KERNEL_FUNCTION_NAME, string) \ + F(cl_kernel_info, CL_KERNEL_NUM_ARGS, cl_uint) \ + F(cl_kernel_info, CL_KERNEL_REFERENCE_COUNT, cl_uint) \ + F(cl_kernel_info, CL_KERNEL_CONTEXT, cl::Context) \ + F(cl_kernel_info, CL_KERNEL_PROGRAM, cl::Program) \ + \ + F(cl_kernel_work_group_info, CL_KERNEL_WORK_GROUP_SIZE, size_type) \ + F(cl_kernel_work_group_info, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, cl::detail::size_t_array) \ + F(cl_kernel_work_group_info, CL_KERNEL_LOCAL_MEM_SIZE, cl_ulong) \ + \ + F(cl_command_queue_info, CL_QUEUE_CONTEXT, cl::Context) \ + F(cl_command_queue_info, CL_QUEUE_DEVICE, cl::Device) \ + F(cl_command_queue_info, CL_QUEUE_REFERENCE_COUNT, cl_uint) \ + F(cl_command_queue_info, CL_QUEUE_PROPERTIES, cl_command_queue_properties) + + +#define CL_HPP_PARAM_NAME_INFO_1_1_(F) \ + F(cl_context_info, CL_CONTEXT_NUM_DEVICES, cl_uint)\ + F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, cl_uint) \ + F(cl_device_info, 
CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, cl_uint) \ + F(cl_device_info, CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config) \ + F(cl_device_info, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config) \ + F(cl_device_info, CL_DEVICE_OPENCL_C_VERSION, string) \ + \ + F(cl_mem_info, CL_MEM_ASSOCIATED_MEMOBJECT, cl::Memory) \ + F(cl_mem_info, CL_MEM_OFFSET, size_type) \ + \ + F(cl_kernel_work_group_info, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, size_type) \ + F(cl_kernel_work_group_info, CL_KERNEL_PRIVATE_MEM_SIZE, cl_ulong) \ + \ + F(cl_event_info, CL_EVENT_CONTEXT, cl::Context) + +#define CL_HPP_PARAM_NAME_INFO_1_2_(F) \ + F(cl_program_info, CL_PROGRAM_NUM_KERNELS, size_type) \ + F(cl_program_info, CL_PROGRAM_KERNEL_NAMES, string) \ + \ + F(cl_program_build_info, CL_PROGRAM_BINARY_TYPE, cl_program_binary_type) \ + \ + F(cl_kernel_info, CL_KERNEL_ATTRIBUTES, string) \ + \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_ADDRESS_QUALIFIER, cl_kernel_arg_address_qualifier) \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_ACCESS_QUALIFIER, cl_kernel_arg_access_qualifier) \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_NAME, string) \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_NAME, string) \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_QUALIFIER, cl_kernel_arg_type_qualifier) \ + \ + F(cl_device_info, CL_DEVICE_PARENT_DEVICE, cl::Device) \ + F(cl_device_info, CL_DEVICE_PARTITION_PROPERTIES, cl::vector) \ + F(cl_device_info, CL_DEVICE_PARTITION_TYPE, cl::vector) \ + F(cl_device_info, CL_DEVICE_REFERENCE_COUNT, cl_uint) \ + F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, size_type) \ + F(cl_device_info, CL_DEVICE_PARTITION_AFFINITY_DOMAIN, cl_device_affinity_domain) \ + F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS, string) \ + \ + F(cl_image_info, CL_IMAGE_ARRAY_SIZE, size_type) \ + F(cl_image_info, CL_IMAGE_NUM_MIP_LEVELS, cl_uint) \ + F(cl_image_info, CL_IMAGE_NUM_SAMPLES, cl_uint) + +#define CL_HPP_PARAM_NAME_INFO_2_0_(F) \ + F(cl_device_info, CL_DEVICE_QUEUE_ON_HOST_PROPERTIES, cl_command_queue_properties) \ + F(cl_device_info, CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES, cl_command_queue_properties) \ + F(cl_device_info, CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE, cl_uint) \ + F(cl_device_info, CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE, cl_uint) \ + F(cl_device_info, CL_DEVICE_MAX_ON_DEVICE_QUEUES, cl_uint) \ + F(cl_device_info, CL_DEVICE_MAX_ON_DEVICE_EVENTS, cl_uint) \ + F(cl_device_info, CL_DEVICE_MAX_PIPE_ARGS, cl_uint) \ + F(cl_device_info, CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS, cl_uint) \ + F(cl_device_info, CL_DEVICE_PIPE_MAX_PACKET_SIZE, cl_uint) \ + F(cl_device_info, CL_DEVICE_SVM_CAPABILITIES, cl_device_svm_capabilities) \ + F(cl_device_info, CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT, cl_uint) \ + F(cl_device_info, CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT, cl_uint) \ + F(cl_device_info, CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT, cl_uint) \ + F(cl_command_queue_info, CL_QUEUE_SIZE, cl_uint) \ + F(cl_mem_info, CL_MEM_USES_SVM_POINTER, cl_bool) \ + F(cl_program_build_info, CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE, size_type) \ + F(cl_pipe_info, CL_PIPE_PACKET_SIZE, cl_uint) \ + F(cl_pipe_info, CL_PIPE_MAX_PACKETS, cl_uint) + +#define CL_HPP_PARAM_NAME_DEVICE_FISSION_(F) \ + 
F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl_device_id) \
+    F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, cl::vector<cl_device_partition_property_ext>) \
+    F(cl_device_info, CL_DEVICE_AFFINITY_DOMAINS_EXT, cl::vector<cl_device_partition_property_ext>) \
+    F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, cl::vector<cl_device_partition_property_ext>)
+
+template <typename enum_type, cl_int Name>
+struct param_traits {};
+
+#define CL_HPP_DECLARE_PARAM_TRAITS_(token, param_name, T) \
+struct token;                                        \
+template<>                                           \
+struct param_traits<detail:: token, param_name>      \
+{                                                    \
+    enum { value = param_name };                     \
+    typedef T param_type;                            \
+};
+
+CL_HPP_PARAM_NAME_INFO_1_0_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#if CL_HPP_TARGET_OPENCL_VERSION >= 110
+CL_HPP_PARAM_NAME_INFO_1_1_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+CL_HPP_PARAM_NAME_INFO_1_2_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+#if CL_HPP_TARGET_OPENCL_VERSION >= 200
+CL_HPP_PARAM_NAME_INFO_2_0_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200
+
+
+// Flags deprecated in OpenCL 2.0
+#define CL_HPP_PARAM_NAME_INFO_1_0_DEPRECATED_IN_2_0_(F) \
+    F(cl_device_info, CL_DEVICE_QUEUE_PROPERTIES, cl_command_queue_properties)
+
+#define CL_HPP_PARAM_NAME_INFO_1_1_DEPRECATED_IN_2_0_(F) \
+    F(cl_device_info, CL_DEVICE_HOST_UNIFIED_MEMORY, cl_bool)
+
+#define CL_HPP_PARAM_NAME_INFO_1_2_DEPRECATED_IN_2_0_(F) \
+    F(cl_image_info, CL_IMAGE_BUFFER, cl::Buffer)
+
+// Include deprecated query flags based on versions
+// Only include deprecated 1.0 flags if 2.0 not active as there is an enum clash
+#if CL_HPP_TARGET_OPENCL_VERSION > 100 && CL_HPP_MINIMUM_OPENCL_VERSION < 200 && CL_HPP_TARGET_OPENCL_VERSION < 200
+CL_HPP_PARAM_NAME_INFO_1_0_DEPRECATED_IN_2_0_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // CL_HPP_TARGET_OPENCL_VERSION > 100 && CL_HPP_MINIMUM_OPENCL_VERSION < 200 && CL_HPP_TARGET_OPENCL_VERSION < 200
+#if CL_HPP_TARGET_OPENCL_VERSION > 110 && CL_HPP_MINIMUM_OPENCL_VERSION < 200
+CL_HPP_PARAM_NAME_INFO_1_1_DEPRECATED_IN_2_0_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // CL_HPP_TARGET_OPENCL_VERSION > 110 && CL_HPP_MINIMUM_OPENCL_VERSION < 200
+#if CL_HPP_TARGET_OPENCL_VERSION > 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 200
+CL_HPP_PARAM_NAME_INFO_1_2_DEPRECATED_IN_2_0_(CL_HPP_DECLARE_PARAM_TRAITS_)
+#endif // CL_HPP_TARGET_OPENCL_VERSION > 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 200
+
+#if defined(CL_HPP_USE_CL_DEVICE_FISSION)
+CL_HPP_PARAM_NAME_DEVICE_FISSION_(CL_HPP_DECLARE_PARAM_TRAITS_);
+#endif // CL_HPP_USE_CL_DEVICE_FISSION
+
+#ifdef CL_PLATFORM_ICD_SUFFIX_KHR
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, string)
+#endif
+
+#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong)
+#endif
+
+#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, vector<size_type>)
+#endif
+#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_SIMD_WIDTH_AMD
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SIMD_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_WAVEFRONT_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint)
+#endif
+
+#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_REGISTERS_PER_BLOCK_NV
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_REGISTERS_PER_BLOCK_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_WARP_SIZE_NV
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_WARP_SIZE_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_GPU_OVERLAP_NV
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_GPU_OVERLAP_NV, cl_bool)
+#endif
+#ifdef CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, cl_bool)
+#endif
+#ifdef CL_DEVICE_INTEGRATED_MEMORY_NV
+CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_INTEGRATED_MEMORY_NV, cl_bool)
+#endif
+
+// Convenience functions
+
+template <typename Func, typename T>
+inline cl_int
+getInfo(Func f, cl_uint name, T* param)
+{
+    return getInfoHelper(f, name, param, 0);
+}
+
+template <typename Func, typename Arg0>
+struct GetInfoFunctor0
+{
+    Func f_; const Arg0& arg0_;
+    cl_int operator ()(
+        cl_uint param, size_type size, void* value, size_type* size_ret)
+    { return f_(arg0_, param, size, value, size_ret); }
+};
+
+template <typename Func, typename Arg0, typename Arg1>
+struct GetInfoFunctor1
+{
+    Func f_; const Arg0& arg0_; const Arg1& arg1_;
+    cl_int operator ()(
+        cl_uint param, size_type size, void* value, size_type* size_ret)
+    { return f_(arg0_, arg1_, param, size, value, size_ret); }
+};
+
+template <typename Func, typename Arg0, typename T>
+inline cl_int
+getInfo(Func f, const Arg0& arg0, cl_uint name, T* param)
+{
+    GetInfoFunctor0<Func, Arg0> f0 = { f, arg0 };
+    return getInfoHelper(f0, name, param, 0);
+}
+
+template <typename Func, typename Arg0, typename Arg1, typename T>
+inline cl_int
+getInfo(Func f, const Arg0& arg0, const Arg1& arg1, cl_uint name, T* param)
+{
+    GetInfoFunctor1<Func, Arg0, Arg1> f0 = { f, arg0, arg1 };
+    return getInfoHelper(f0, name, param, 0);
+}
+
+
+template<typename T>
+struct ReferenceHandler
+{ };
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+/**
+ * OpenCL 1.2 devices do have retain/release.
+ */
+template <>
+struct ReferenceHandler<cl_device_id>
+{
+    /**
+     * Retain the device.
+     * \param device A valid device created using createSubDevices
+     * \return
+     *   CL_SUCCESS if the function executed successfully.
+     *   CL_INVALID_DEVICE if device was not a valid subdevice
+     *   CL_OUT_OF_RESOURCES
+     *   CL_OUT_OF_HOST_MEMORY
+     */
+    static cl_int retain(cl_device_id device)
+    { return ::clRetainDevice(device); }
+    /**
+     * Release the device.
+     * \param device A valid device created using createSubDevices
+     * \return
+     *   CL_SUCCESS if the function executed successfully.
+     *   CL_INVALID_DEVICE if device was not a valid subdevice
+     *   CL_OUT_OF_RESOURCES
+     *   CL_OUT_OF_HOST_MEMORY
+     */
+    static cl_int release(cl_device_id device)
+    { return ::clReleaseDevice(device); }
+};
+#else // CL_HPP_TARGET_OPENCL_VERSION >= 120
+/**
+ * OpenCL 1.1 devices do not have retain/release.
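+ * For those, the retain() and release() members below are no-ops that simply
+ * return CL_SUCCESS, so Wrapper<cl_device_id> remains usable on 1.1 platforms.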
+ */
+template <>
+struct ReferenceHandler<cl_device_id>
+{
+    // cl_device_id does not have retain().
+    static cl_int retain(cl_device_id)
+    { return CL_SUCCESS; }
+    // cl_device_id does not have release().
+    static cl_int release(cl_device_id)
+    { return CL_SUCCESS; }
+};
+#endif // ! (CL_HPP_TARGET_OPENCL_VERSION >= 120)
+
+template <>
+struct ReferenceHandler<cl_platform_id>
+{
+    // cl_platform_id does not have retain().
+    static cl_int retain(cl_platform_id)
+    { return CL_SUCCESS; }
+    // cl_platform_id does not have release().
+    static cl_int release(cl_platform_id)
+    { return CL_SUCCESS; }
+};
+
+template <>
+struct ReferenceHandler<cl_context>
+{
+    static cl_int retain(cl_context context)
+    { return ::clRetainContext(context); }
+    static cl_int release(cl_context context)
+    { return ::clReleaseContext(context); }
+};
+
+template <>
+struct ReferenceHandler<cl_command_queue>
+{
+    static cl_int retain(cl_command_queue queue)
+    { return ::clRetainCommandQueue(queue); }
+    static cl_int release(cl_command_queue queue)
+    { return ::clReleaseCommandQueue(queue); }
+};
+
+template <>
+struct ReferenceHandler<cl_mem>
+{
+    static cl_int retain(cl_mem memory)
+    { return ::clRetainMemObject(memory); }
+    static cl_int release(cl_mem memory)
+    { return ::clReleaseMemObject(memory); }
+};
+
+template <>
+struct ReferenceHandler<cl_sampler>
+{
+    static cl_int retain(cl_sampler sampler)
+    { return ::clRetainSampler(sampler); }
+    static cl_int release(cl_sampler sampler)
+    { return ::clReleaseSampler(sampler); }
+};
+
+template <>
+struct ReferenceHandler<cl_program>
+{
+    static cl_int retain(cl_program program)
+    { return ::clRetainProgram(program); }
+    static cl_int release(cl_program program)
+    { return ::clReleaseProgram(program); }
+};
+
+template <>
+struct ReferenceHandler<cl_kernel>
+{
+    static cl_int retain(cl_kernel kernel)
+    { return ::clRetainKernel(kernel); }
+    static cl_int release(cl_kernel kernel)
+    { return ::clReleaseKernel(kernel); }
+};
+
+template <>
+struct ReferenceHandler<cl_event>
+{
+    static cl_int retain(cl_event event)
+    { return ::clRetainEvent(event); }
+    static cl_int release(cl_event event)
+    { return ::clReleaseEvent(event); }
+};
+
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 120
+// Extracts version number with major in the upper 16 bits, minor in the lower 16
+static cl_uint getVersion(const vector<char> &versionInfo)
+{
+    int highVersion = 0;
+    int lowVersion = 0;
+    int index = 7; // skip the "OpenCL " prefix of the version string
+    while(versionInfo[index] != '.' 
) {
+        highVersion *= 10;
+        highVersion += versionInfo[index]-'0';
+        ++index;
+    }
+    ++index;
+    while(versionInfo[index] != ' ' && versionInfo[index] != '\0') {
+        lowVersion *= 10;
+        lowVersion += versionInfo[index]-'0';
+        ++index;
+    }
+    return (highVersion << 16) | lowVersion;
+}
+
+static cl_uint getPlatformVersion(cl_platform_id platform)
+{
+    size_type size = 0;
+    clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, NULL, &size);
+
+    vector<char> versionInfo(size);
+    clGetPlatformInfo(platform, CL_PLATFORM_VERSION, size, versionInfo.data(), &size);
+    return getVersion(versionInfo);
+}
+
+static cl_uint getDevicePlatformVersion(cl_device_id device)
+{
+    cl_platform_id platform;
+    clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, NULL);
+    return getPlatformVersion(platform);
+}
+
+static cl_uint getContextPlatformVersion(cl_context context)
+{
+    // The platform cannot be queried directly, so we first have to grab a
+    // device and obtain its context
+    size_type size = 0;
+    clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size);
+    if (size == 0)
+        return 0;
+    vector<cl_device_id> devices(size/sizeof(cl_device_id));
+    clGetContextInfo(context, CL_CONTEXT_DEVICES, size, devices.data(), NULL);
+    return getDevicePlatformVersion(devices[0]);
+}
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 120
+
+template <typename T>
+class Wrapper
+{
+public:
+    typedef T cl_type;
+
+protected:
+    cl_type object_;
+
+public:
+    Wrapper() : object_(NULL) { }
+
+    Wrapper(const cl_type &obj, bool retainObject) : object_(obj)
+    {
+        if (retainObject) {
+            detail::errHandler(retain(), __RETAIN_ERR);
+        }
+    }
+
+    ~Wrapper()
+    {
+        if (object_ != NULL) { release(); }
+    }
+
+    Wrapper(const Wrapper<cl_type>& rhs)
+    {
+        object_ = rhs.object_;
+        detail::errHandler(retain(), __RETAIN_ERR);
+    }
+
+    Wrapper(Wrapper<cl_type>&& rhs) CL_HPP_NOEXCEPT_
+    {
+        object_ = rhs.object_;
+        rhs.object_ = NULL;
+    }
+
+    Wrapper<cl_type>& operator = (const Wrapper<cl_type>& rhs)
+    {
+        if (this != &rhs) {
+            detail::errHandler(release(), __RELEASE_ERR);
+            object_ = rhs.object_;
+            detail::errHandler(retain(), __RETAIN_ERR);
+        }
+        return *this;
+    }
+
+    Wrapper<cl_type>& operator = (Wrapper<cl_type>&& rhs)
+    {
+        if (this != &rhs) {
+            detail::errHandler(release(), __RELEASE_ERR);
+            object_ = rhs.object_;
+            rhs.object_ = NULL;
+        }
+        return *this;
+    }
+
+    Wrapper<cl_type>& operator = (const cl_type &rhs)
+    {
+        detail::errHandler(release(), __RELEASE_ERR);
+        object_ = rhs;
+        return *this;
+    }
+
+    const cl_type& operator ()() const { return object_; }
+
+    cl_type& operator ()() { return object_; }
+
+    const cl_type get() const { return object_; }
+
+    cl_type get() { return object_; }
+
+
+protected:
+    template<typename Func, typename U>
+    friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type);
+
+    cl_int retain() const
+    {
+        if (object_ != nullptr) {
+            return ReferenceHandler<cl_type>::retain(object_);
+        }
+        else {
+            return CL_SUCCESS;
+        }
+    }
+
+    cl_int release() const
+    {
+        if (object_ != nullptr) {
+            return ReferenceHandler<cl_type>::release(object_);
+        }
+        else {
+            return CL_SUCCESS;
+        }
+    }
+};
+
+template <>
+class Wrapper<cl_device_id>
+{
+public:
+    typedef cl_device_id cl_type;
+
+protected:
+    cl_type object_;
+    bool referenceCountable_;
+
+    static bool isReferenceCountable(cl_device_id device)
+    {
+        bool retVal = false;
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120
+#if CL_HPP_MINIMUM_OPENCL_VERSION < 120
+        if (device != NULL) {
+            int version = getDevicePlatformVersion(device);
+            if(version > ((1 << 16) + 1)) {
+                retVal = true;
+            }
+        }
+#else // CL_HPP_MINIMUM_OPENCL_VERSION < 120
+        retVal = true;
+#endif // 
CL_HPP_MINIMUM_OPENCL_VERSION < 120
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+        return retVal;
+    }
+
+public:
+    Wrapper() : object_(NULL), referenceCountable_(false)
+    {
+    }
+
+    Wrapper(const cl_type &obj, bool retainObject) :
+        object_(obj),
+        referenceCountable_(false)
+    {
+        referenceCountable_ = isReferenceCountable(obj);
+
+        if (retainObject) {
+            detail::errHandler(retain(), __RETAIN_ERR);
+        }
+    }
+
+    ~Wrapper()
+    {
+        release();
+    }
+
+    Wrapper(const Wrapper<cl_type>& rhs)
+    {
+        object_ = rhs.object_;
+        referenceCountable_ = isReferenceCountable(object_);
+        detail::errHandler(retain(), __RETAIN_ERR);
+    }
+
+    Wrapper(Wrapper<cl_type>&& rhs) CL_HPP_NOEXCEPT_
+    {
+        object_ = rhs.object_;
+        referenceCountable_ = rhs.referenceCountable_;
+        rhs.object_ = NULL;
+        rhs.referenceCountable_ = false;
+    }
+
+    Wrapper<cl_type>& operator = (const Wrapper<cl_type>& rhs)
+    {
+        if (this != &rhs) {
+            detail::errHandler(release(), __RELEASE_ERR);
+            object_ = rhs.object_;
+            referenceCountable_ = rhs.referenceCountable_;
+            detail::errHandler(retain(), __RETAIN_ERR);
+        }
+        return *this;
+    }
+
+    Wrapper<cl_type>& operator = (Wrapper<cl_type>&& rhs)
+    {
+        if (this != &rhs) {
+            detail::errHandler(release(), __RELEASE_ERR);
+            object_ = rhs.object_;
+            referenceCountable_ = rhs.referenceCountable_;
+            rhs.object_ = NULL;
+            rhs.referenceCountable_ = false;
+        }
+        return *this;
+    }
+
+    Wrapper<cl_type>& operator = (const cl_type &rhs)
+    {
+        detail::errHandler(release(), __RELEASE_ERR);
+        object_ = rhs;
+        referenceCountable_ = isReferenceCountable(object_);
+        return *this;
+    }
+
+    const cl_type& operator ()() const { return object_; }
+
+    cl_type& operator ()() { return object_; }
+
+    cl_type get() const { return object_; }
+
+protected:
+    template<typename Func, typename U>
+    friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type);
+
+    template<typename Func, typename U>
+    friend inline cl_int getInfoHelper(Func, cl_uint, vector<U>*, int, typename U::cl_type);
+
+    cl_int retain() const
+    {
+        if( object_ != nullptr && referenceCountable_ ) {
+            return ReferenceHandler<cl_type>::retain(object_);
+        }
+        else {
+            return CL_SUCCESS;
+        }
+    }
+
+    cl_int release() const
+    {
+        if (object_ != nullptr && referenceCountable_) {
+            return ReferenceHandler<cl_type>::release(object_);
+        }
+        else {
+            return CL_SUCCESS;
+        }
+    }
+};
+
+template <typename T>
+inline bool operator==(const Wrapper<T> &lhs, const Wrapper<T> &rhs)
+{
+    return lhs() == rhs();
+}
+
+template <typename T>
+inline bool operator!=(const Wrapper<T> &lhs, const Wrapper<T> &rhs)
+{
+    return !operator==(lhs, rhs);
+}
+
+} // namespace detail
+//! \endcond
+
+
+using BuildLogType = vector<std::pair<cl::Device, typename detail::param_traits<detail::cl_program_build_info, CL_PROGRAM_BUILD_LOG>::param_type>>;
+#if defined(CL_HPP_ENABLE_EXCEPTIONS)
+/**
+* Exception class for build errors to carry build info
+*/
+class BuildError : public Error
+{
+private:
+    BuildLogType buildLogs;
+public:
+    BuildError(cl_int err, const char * errStr, const BuildLogType &vec) : Error(err, errStr), buildLogs(vec)
+    {
+    }
+
+    BuildLogType getBuildLog() const
+    {
+        return buildLogs;
+    }
+};
+namespace detail {
+    static inline cl_int buildErrHandler(
+        cl_int err,
+        const char * errStr,
+        const BuildLogType &buildLogs)
+    {
+        if (err != CL_SUCCESS) {
+            throw BuildError(err, errStr, buildLogs);
+        }
+        return err;
+    }
+} // namespace detail
+
+#else
+namespace detail {
+    static inline cl_int buildErrHandler(
+        cl_int err,
+        const char * errStr,
+        const BuildLogType &buildLogs)
+    {
+        (void)buildLogs; // suppress unused variable warning
+        (void)errStr;
+        return err;
+    }
+} // namespace detail
+#endif // #if defined(CL_HPP_ENABLE_EXCEPTIONS)
+
+
+/*! \struct ImageFormat
+ *  \brief Adds constructors and member functions for cl_image_format.
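+ *
+ *  For example (an illustrative sketch using standard channel constants):
+ *  \code
+ *  cl::ImageFormat rgbaFormat(CL_RGBA, CL_UNORM_INT8);
+ *  \endcode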
+ * + * \see cl_image_format + */ +struct ImageFormat : public cl_image_format +{ + //! \brief Default constructor - performs no initialization. + ImageFormat(){} + + //! \brief Initializing constructor. + ImageFormat(cl_channel_order order, cl_channel_type type) + { + image_channel_order = order; + image_channel_data_type = type; + } + + //! \brief Assignment operator. + ImageFormat& operator = (const ImageFormat& rhs) + { + if (this != &rhs) { + this->image_channel_data_type = rhs.image_channel_data_type; + this->image_channel_order = rhs.image_channel_order; + } + return *this; + } +}; + +/*! \brief Class interface for cl_device_id. + * + * \note Copies of these objects are inexpensive, since they don't 'own' + * any underlying resources or data structures. + * + * \see cl_device_id + */ +class Device : public detail::Wrapper +{ +private: + static std::once_flag default_initialized_; + static Device default_; + static cl_int default_error_; + + /*! \brief Create the default context. + * + * This sets @c default_ and @c default_error_. It does not throw + * @c cl::Error. + */ + static void makeDefault(); + + /*! \brief Create the default platform from a provided platform. + * + * This sets @c default_. It does not throw + * @c cl::Error. + */ + static void makeDefaultProvided(const Device &p) { + default_ = p; + } + +public: +#ifdef CL_HPP_UNIT_TEST_ENABLE + /*! \brief Reset the default. + * + * This sets @c default_ to an empty value to support cleanup in + * the unit test framework. + * This function is not thread safe. + */ + static void unitTestClearDefault() { + default_ = Device(); + } +#endif // #ifdef CL_HPP_UNIT_TEST_ENABLE + + //! \brief Default constructor - initializes to NULL. + Device() : detail::Wrapper() { } + + /*! \brief Constructor from cl_device_id. + * + * This simply copies the device ID value, which is an inexpensive operation. + */ + explicit Device(const cl_device_id &device, bool retainObject = false) : + detail::Wrapper(device, retainObject) { } + + /*! \brief Returns the first device on the default context. + * + * \see Context::getDefault() + */ + static Device getDefault( + cl_int *errResult = NULL) + { + std::call_once(default_initialized_, makeDefault); + detail::errHandler(default_error_); + if (errResult != NULL) { + *errResult = default_error_; + } + return default_; + } + + /** + * Modify the default device to be used by + * subsequent operations. + * Will only set the default if no default was previously created. + * @return updated default device. + * Should be compared to the passed value to ensure that it was updated. + */ + static Device setDefault(const Device &default_device) + { + std::call_once(default_initialized_, makeDefaultProvided, std::cref(default_device)); + detail::errHandler(default_error_); + return default_; + } + + /*! \brief Assignment operator from cl_device_id. + * + * This simply copies the device ID value, which is an inexpensive operation. + */ + Device& operator = (const cl_device_id& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Device(const Device& dev) : detail::Wrapper(dev) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Device& operator = (const Device &dev) + { + detail::Wrapper::operator=(dev); + return *this; + } + + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. 
+ */ + Device(Device&& dev) CL_HPP_NOEXCEPT_ : detail::Wrapper(std::move(dev)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Device& operator = (Device &&dev) + { + detail::Wrapper::operator=(std::move(dev)); + return *this; + } + + //! \brief Wrapper for clGetDeviceInfo(). + template + cl_int getInfo(cl_device_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetDeviceInfo, object_, name, param), + __GET_DEVICE_INFO_ERR); + } + + //! \brief Wrapper for clGetDeviceInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_device_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + /** + * CL 1.2 version + */ +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 + //! \brief Wrapper for clCreateSubDevices(). + cl_int createSubDevices( + const cl_device_partition_property * properties, + vector* devices) + { + cl_uint n = 0; + cl_int err = clCreateSubDevices(object_, properties, 0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR); + } + + vector ids(n); + err = clCreateSubDevices(object_, properties, n, ids.data(), NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR); + } + + // Cannot trivially assign because we need to capture intermediates + // with safe construction + if (devices) { + devices->resize(ids.size()); + + // Assign to param, constructing with retain behaviour + // to correctly capture each underlying CL object + for (size_type i = 0; i < ids.size(); i++) { + // We do not need to retain because this device is being created + // by the runtime + (*devices)[i] = Device(ids[i], false); + } + } + + return CL_SUCCESS; + } +#elif defined(CL_HPP_USE_CL_DEVICE_FISSION) + +/** + * CL 1.1 version that uses device fission extension. 
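+ *
+ * A usage sketch (illustrative; the partition constants come from cl_ext.h
+ * and error handling is omitted):
+ * \code
+ * cl_device_partition_property_ext props[] = {
+ *     CL_DEVICE_PARTITION_EQUALLY_EXT, 2, CL_PROPERTIES_LIST_END_EXT };
+ * cl::vector<cl::Device> subDevices;
+ * device.createSubDevices(props, &subDevices);
+ * \endcode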
+ */ + cl_int createSubDevices( + const cl_device_partition_property_ext * properties, + vector* devices) + { + typedef CL_API_ENTRY cl_int + ( CL_API_CALL * PFN_clCreateSubDevicesEXT)( + cl_device_id /*in_device*/, + const cl_device_partition_property_ext * /* properties */, + cl_uint /*num_entries*/, + cl_device_id * /*out_devices*/, + cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL; + CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateSubDevicesEXT); + + cl_uint n = 0; + cl_int err = pfn_clCreateSubDevicesEXT(object_, properties, 0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR); + } + + vector ids(n); + err = pfn_clCreateSubDevicesEXT(object_, properties, n, ids.data(), NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_SUB_DEVICES_ERR); + } + // Cannot trivially assign because we need to capture intermediates + // with safe construction + if (devices) { + devices->resize(ids.size()); + + // Assign to param, constructing with retain behaviour + // to correctly capture each underlying CL object + for (size_type i = 0; i < ids.size(); i++) { + // We do not need to retain because this device is being created + // by the runtime + (*devices)[i] = Device(ids[i], false); + } + } + return CL_SUCCESS; + } +#endif // defined(CL_HPP_USE_CL_DEVICE_FISSION) +}; + +CL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag Device::default_initialized_; +CL_HPP_DEFINE_STATIC_MEMBER_ Device Device::default_; +CL_HPP_DEFINE_STATIC_MEMBER_ cl_int Device::default_error_ = CL_SUCCESS; + +/*! \brief Class interface for cl_platform_id. + * + * \note Copies of these objects are inexpensive, since they don't 'own' + * any underlying resources or data structures. + * + * \see cl_platform_id + */ +class Platform : public detail::Wrapper +{ +private: + static std::once_flag default_initialized_; + static Platform default_; + static cl_int default_error_; + + /*! \brief Create the default context. + * + * This sets @c default_ and @c default_error_. It does not throw + * @c cl::Error. + */ + static void makeDefault() { + /* Throwing an exception from a call_once invocation does not do + * what we wish, so we catch it and save the error. + */ +#if defined(CL_HPP_ENABLE_EXCEPTIONS) + try +#endif + { + // If default wasn't passed ,generate one + // Otherwise set it + cl_uint n = 0; + + cl_int err = ::clGetPlatformIDs(0, NULL, &n); + if (err != CL_SUCCESS) { + default_error_ = err; + return; + } + if (n == 0) { + default_error_ = CL_INVALID_PLATFORM; + return; + } + + vector ids(n); + err = ::clGetPlatformIDs(n, ids.data(), NULL); + if (err != CL_SUCCESS) { + default_error_ = err; + return; + } + + default_ = Platform(ids[0]); + } +#if defined(CL_HPP_ENABLE_EXCEPTIONS) + catch (cl::Error &e) { + default_error_ = e.err(); + } +#endif + } + + /*! \brief Create the default platform from a provided platform. + * + * This sets @c default_. It does not throw + * @c cl::Error. + */ + static void makeDefaultProvided(const Platform &p) { + default_ = p; + } + +public: +#ifdef CL_HPP_UNIT_TEST_ENABLE + /*! \brief Reset the default. + * + * This sets @c default_ to an empty value to support cleanup in + * the unit test framework. + * This function is not thread safe. + */ + static void unitTestClearDefault() { + default_ = Platform(); + } +#endif // #ifdef CL_HPP_UNIT_TEST_ENABLE + + //! \brief Default constructor - initializes to NULL. + Platform() : detail::Wrapper() { } + + /*! 
\brief Constructor from cl_platform_id. + * + * \param retainObject will cause the constructor to retain its cl object. + * Defaults to false to maintain compatibility with + * earlier versions. + * This simply copies the platform ID value, which is an inexpensive operation. + */ + explicit Platform(const cl_platform_id &platform, bool retainObject = false) : + detail::Wrapper(platform, retainObject) { } + + /*! \brief Assignment operator from cl_platform_id. + * + * This simply copies the platform ID value, which is an inexpensive operation. + */ + Platform& operator = (const cl_platform_id& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + static Platform getDefault( + cl_int *errResult = NULL) + { + std::call_once(default_initialized_, makeDefault); + detail::errHandler(default_error_); + if (errResult != NULL) { + *errResult = default_error_; + } + return default_; + } + + /** + * Modify the default platform to be used by + * subsequent operations. + * Will only set the default if no default was previously created. + * @return updated default platform. + * Should be compared to the passed value to ensure that it was updated. + */ + static Platform setDefault(const Platform &default_platform) + { + std::call_once(default_initialized_, makeDefaultProvided, std::cref(default_platform)); + detail::errHandler(default_error_); + return default_; + } + + //! \brief Wrapper for clGetPlatformInfo(). + cl_int getInfo(cl_platform_info name, string* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetPlatformInfo, object_, name, param), + __GET_PLATFORM_INFO_ERR); + } + + //! \brief Wrapper for clGetPlatformInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_platform_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + /*! \brief Gets a list of devices for this platform. + * + * Wraps clGetDeviceIDs(). + */ + cl_int getDevices( + cl_device_type type, + vector* devices) const + { + cl_uint n = 0; + if( devices == NULL ) { + return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR); + } + cl_int err = ::clGetDeviceIDs(object_, type, 0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + vector ids(n); + err = ::clGetDeviceIDs(object_, type, n, ids.data(), NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + // Cannot trivially assign because we need to capture intermediates + // with safe construction + // We must retain things we obtain from the API to avoid releasing + // API-owned objects. + if (devices) { + devices->resize(ids.size()); + + // Assign to param, constructing with retain behaviour + // to correctly capture each underlying CL object + for (size_type i = 0; i < ids.size(); i++) { + (*devices)[i] = Device(ids[i], true); + } + } + return CL_SUCCESS; + } + +#if defined(CL_HPP_USE_DX_INTEROP) + /*! \brief Get the list of available D3D10 devices. + * + * \param d3d_device_source. + * + * \param d3d_object. + * + * \param d3d_device_set. + * + * \param devices returns a vector of OpenCL D3D10 devices found. The cl::Device + * values returned in devices can be used to identify a specific OpenCL + * device. If \a devices argument is NULL, this argument is ignored. 
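+     *
+     * A hypothetical call sketch (d3dDevice is assumed to be a valid
+     * ID3D10Device pointer obtained elsewhere):
+     * \code
+     * cl::vector<cl::Device> clDevices;
+     * platform.getDevices(CL_D3D10_DEVICE_KHR, d3dDevice,
+     *                     CL_ALL_DEVICES_FOR_D3D10_KHR, &clDevices);
+     * \endcode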
+ * + * \return One of the following values: + * - CL_SUCCESS if the function is executed successfully. + * + * The application can query specific capabilities of the OpenCL device(s) + * returned by cl::getDevices. This can be used by the application to + * determine which device(s) to use. + * + * \note In the case that exceptions are enabled and a return value + * other than CL_SUCCESS is generated, then cl::Error exception is + * generated. + */ + cl_int getDevices( + cl_d3d10_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d10_device_set_khr d3d_device_set, + vector* devices) const + { + typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clGetDeviceIDsFromD3D10KHR)( + cl_platform_id platform, + cl_d3d10_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d10_device_set_khr d3d_device_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint* num_devices); + + if( devices == NULL ) { + return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR); + } + + static PFN_clGetDeviceIDsFromD3D10KHR pfn_clGetDeviceIDsFromD3D10KHR = NULL; + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(object_, clGetDeviceIDsFromD3D10KHR); + + cl_uint n = 0; + cl_int err = pfn_clGetDeviceIDsFromD3D10KHR( + object_, + d3d_device_source, + d3d_object, + d3d_device_set, + 0, + NULL, + &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + vector ids(n); + err = pfn_clGetDeviceIDsFromD3D10KHR( + object_, + d3d_device_source, + d3d_object, + d3d_device_set, + n, + ids.data(), + NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + // Cannot trivially assign because we need to capture intermediates + // with safe construction + // We must retain things we obtain from the API to avoid releasing + // API-owned objects. + if (devices) { + devices->resize(ids.size()); + + // Assign to param, constructing with retain behaviour + // to correctly capture each underlying CL object + for (size_type i = 0; i < ids.size(); i++) { + (*devices)[i] = Device(ids[i], true); + } + } + return CL_SUCCESS; + } +#endif + + /*! \brief Gets a list of available platforms. + * + * Wraps clGetPlatformIDs(). + */ + static cl_int get( + vector* platforms) + { + cl_uint n = 0; + + if( platforms == NULL ) { + return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR); + } + + cl_int err = ::clGetPlatformIDs(0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + vector ids(n); + err = ::clGetPlatformIDs(n, ids.data(), NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + if (platforms) { + platforms->resize(ids.size()); + + // Platforms don't reference count + for (size_type i = 0; i < ids.size(); i++) { + (*platforms)[i] = Platform(ids[i]); + } + } + return CL_SUCCESS; + } + + /*! \brief Gets the first available platform. + * + * Wraps clGetPlatformIDs(), returning the first result. + */ + static cl_int get( + Platform * platform) + { + cl_int err; + Platform default_platform = Platform::getDefault(&err); + if (platform) { + *platform = default_platform; + } + return err; + } + + /*! \brief Gets the first available platform, returning it by value. + * + * \return Returns a valid platform if one is available. + * If no platform is available will return a null platform. + * Throws an exception if no platforms are available + * or an error condition occurs. + * Wraps clGetPlatformIDs(), returning the first result. 
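+     *
+     * For example (a minimal sketch):
+     * \code
+     * cl_int err = CL_SUCCESS;
+     * cl::Platform platform = cl::Platform::get(&err);
+     * \endcode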
+ */ + static Platform get( + cl_int * errResult = NULL) + { + cl_int err; + Platform default_platform = Platform::getDefault(&err); + if (errResult) { + *errResult = err; + } + return default_platform; + } + +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 + //! \brief Wrapper for clUnloadCompiler(). + cl_int + unloadCompiler() + { + return ::clUnloadPlatformCompiler(object_); + } +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 +}; // class Platform + +CL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag Platform::default_initialized_; +CL_HPP_DEFINE_STATIC_MEMBER_ Platform Platform::default_; +CL_HPP_DEFINE_STATIC_MEMBER_ cl_int Platform::default_error_ = CL_SUCCESS; + + +/** + * Deprecated APIs for 1.2 + */ +#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) +/** + * Unload the OpenCL compiler. + * \note Deprecated for OpenCL 1.2. Use Platform::unloadCompiler instead. + */ +inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int +UnloadCompiler() CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; +inline cl_int +UnloadCompiler() +{ + return ::clUnloadCompiler(); +} +#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + +/*! \brief Class interface for cl_context. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_context as the original. For details, see + * clRetainContext() and clReleaseContext(). + * + * \see cl_context + */ +class Context + : public detail::Wrapper +{ +private: + static std::once_flag default_initialized_; + static Context default_; + static cl_int default_error_; + + /*! \brief Create the default context from the default device type in the default platform. + * + * This sets @c default_ and @c default_error_. It does not throw + * @c cl::Error. + */ + static void makeDefault() { + /* Throwing an exception from a call_once invocation does not do + * what we wish, so we catch it and save the error. + */ +#if defined(CL_HPP_ENABLE_EXCEPTIONS) + try +#endif + { +#if !defined(__APPLE__) && !defined(__MACOS) + const Platform &p = Platform::getDefault(); + cl_platform_id defaultPlatform = p(); + cl_context_properties properties[3] = { + CL_CONTEXT_PLATFORM, (cl_context_properties)defaultPlatform, 0 + }; +#else // #if !defined(__APPLE__) && !defined(__MACOS) + cl_context_properties *properties = nullptr; +#endif // #if !defined(__APPLE__) && !defined(__MACOS) + + default_ = Context( + CL_DEVICE_TYPE_DEFAULT, + properties, + NULL, + NULL, + &default_error_); + } +#if defined(CL_HPP_ENABLE_EXCEPTIONS) + catch (cl::Error &e) { + default_error_ = e.err(); + } +#endif + } + + + /*! \brief Create the default context from a provided Context. + * + * This sets @c default_. It does not throw + * @c cl::Error. + */ + static void makeDefaultProvided(const Context &c) { + default_ = c; + } + +public: +#ifdef CL_HPP_UNIT_TEST_ENABLE + /*! \brief Reset the default. + * + * This sets @c default_ to an empty value to support cleanup in + * the unit test framework. + * This function is not thread safe. + */ + static void unitTestClearDefault() { + default_ = Context(); + } +#endif // #ifdef CL_HPP_UNIT_TEST_ENABLE + + /*! \brief Constructs a context including a list of specified devices. + * + * Wraps clCreateContext(). 
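+     *
+     * A minimal usage sketch (error handling omitted):
+     * \code
+     * cl::vector<cl::Device> devices;
+     * cl::Platform::getDefault().getDevices(CL_DEVICE_TYPE_GPU, &devices);
+     * cl::Context context(devices);
+     * \endcode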
+ */ + Context( + const vector& devices, + cl_context_properties* properties = NULL, + void (CL_CALLBACK * notifyFptr)( + const char *, + const void *, + size_type, + void *) = NULL, + void* data = NULL, + cl_int* err = NULL) + { + cl_int error; + + size_type numDevices = devices.size(); + vector deviceIDs(numDevices); + + for( size_type deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) { + deviceIDs[deviceIndex] = (devices[deviceIndex])(); + } + + object_ = ::clCreateContext( + properties, (cl_uint) numDevices, + deviceIDs.data(), + notifyFptr, data, &error); + + detail::errHandler(error, __CREATE_CONTEXT_ERR); + if (err != NULL) { + *err = error; + } + } + + Context( + const Device& device, + cl_context_properties* properties = NULL, + void (CL_CALLBACK * notifyFptr)( + const char *, + const void *, + size_type, + void *) = NULL, + void* data = NULL, + cl_int* err = NULL) + { + cl_int error; + + cl_device_id deviceID = device(); + + object_ = ::clCreateContext( + properties, 1, + &deviceID, + notifyFptr, data, &error); + + detail::errHandler(error, __CREATE_CONTEXT_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! \brief Constructs a context including all or a subset of devices of a specified type. + * + * Wraps clCreateContextFromType(). + */ + Context( + cl_device_type type, + cl_context_properties* properties = NULL, + void (CL_CALLBACK * notifyFptr)( + const char *, + const void *, + size_type, + void *) = NULL, + void* data = NULL, + cl_int* err = NULL) + { + cl_int error; + +#if !defined(__APPLE__) && !defined(__MACOS) + cl_context_properties prop[4] = {CL_CONTEXT_PLATFORM, 0, 0, 0 }; + + if (properties == NULL) { + // Get a valid platform ID as we cannot send in a blank one + vector platforms; + error = Platform::get(&platforms); + if (error != CL_SUCCESS) { + detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); + if (err != NULL) { + *err = error; + } + return; + } + + // Check the platforms we found for a device of our specified type + cl_context_properties platform_id = 0; + for (unsigned int i = 0; i < platforms.size(); i++) { + + vector devices; + +#if defined(CL_HPP_ENABLE_EXCEPTIONS) + try { +#endif + + error = platforms[i].getDevices(type, &devices); + +#if defined(CL_HPP_ENABLE_EXCEPTIONS) + } catch (Error) {} + // Catch if exceptions are enabled as we don't want to exit if first platform has no devices of type + // We do error checking next anyway, and can throw there if needed +#endif + + // Only squash CL_SUCCESS and CL_DEVICE_NOT_FOUND + if (error != CL_SUCCESS && error != CL_DEVICE_NOT_FOUND) { + detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); + if (err != NULL) { + *err = error; + } + } + + if (devices.size() > 0) { + platform_id = (cl_context_properties)platforms[i](); + break; + } + } + + if (platform_id == 0) { + detail::errHandler(CL_DEVICE_NOT_FOUND, __CREATE_CONTEXT_FROM_TYPE_ERR); + if (err != NULL) { + *err = CL_DEVICE_NOT_FOUND; + } + return; + } + + prop[1] = platform_id; + properties = &prop[0]; + } +#endif + object_ = ::clCreateContextFromType( + properties, type, notifyFptr, data, &error); + + detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Context(const Context& ctx) : detail::Wrapper(ctx) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. 
+ */ + Context& operator = (const Context &ctx) + { + detail::Wrapper::operator=(ctx); + return *this; + } + + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Context(Context&& ctx) CL_HPP_NOEXCEPT_ : detail::Wrapper(std::move(ctx)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Context& operator = (Context &&ctx) + { + detail::Wrapper::operator=(std::move(ctx)); + return *this; + } + + + /*! \brief Returns a singleton context including all devices of CL_DEVICE_TYPE_DEFAULT. + * + * \note All calls to this function return the same cl_context as the first. + */ + static Context getDefault(cl_int * err = NULL) + { + std::call_once(default_initialized_, makeDefault); + detail::errHandler(default_error_); + if (err != NULL) { + *err = default_error_; + } + return default_; + } + + /** + * Modify the default context to be used by + * subsequent operations. + * Will only set the default if no default was previously created. + * @return updated default context. + * Should be compared to the passed value to ensure that it was updated. + */ + static Context setDefault(const Context &default_context) + { + std::call_once(default_initialized_, makeDefaultProvided, std::cref(default_context)); + detail::errHandler(default_error_); + return default_; + } + + //! \brief Default constructor - initializes to NULL. + Context() : detail::Wrapper() { } + + /*! \brief Constructor from cl_context - takes ownership. + * + * This effectively transfers ownership of a refcount on the cl_context + * into the new Context object. + */ + explicit Context(const cl_context& context, bool retainObject = false) : + detail::Wrapper(context, retainObject) { } + + /*! \brief Assignment operator from cl_context - takes ownership. + * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseContext() on the value previously held by this instance. + */ + Context& operator = (const cl_context& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetContextInfo(). + template + cl_int getInfo(cl_context_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetContextInfo, object_, name, param), + __GET_CONTEXT_INFO_ERR); + } + + //! \brief Wrapper for clGetContextInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_context_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + /*! \brief Gets a list of supported image formats. + * + * Wraps clGetSupportedImageFormats(). 
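+     *
+     * For example, to list the 2D image formats readable by kernels
+     * (a sketch; error handling omitted):
+     * \code
+     * cl::vector<cl::ImageFormat> formats;
+     * context.getSupportedImageFormats(
+     *     CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, &formats);
+     * \endcode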
+ */ + cl_int getSupportedImageFormats( + cl_mem_flags flags, + cl_mem_object_type type, + vector* formats) const + { + cl_uint numEntries; + + if (!formats) { + return CL_SUCCESS; + } + + cl_int err = ::clGetSupportedImageFormats( + object_, + flags, + type, + 0, + NULL, + &numEntries); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR); + } + + if (numEntries > 0) { + vector value(numEntries); + err = ::clGetSupportedImageFormats( + object_, + flags, + type, + numEntries, + (cl_image_format*)value.data(), + NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR); + } + + formats->assign(begin(value), end(value)); + } + else { + // If no values are being returned, ensure an empty vector comes back + formats->clear(); + } + + return CL_SUCCESS; + } +}; + +inline void Device::makeDefault() +{ + /* Throwing an exception from a call_once invocation does not do + * what we wish, so we catch it and save the error. + */ +#if defined(CL_HPP_ENABLE_EXCEPTIONS) + try +#endif + { + cl_int error = 0; + + Context context = Context::getDefault(&error); + detail::errHandler(error, __CREATE_CONTEXT_ERR); + + if (error != CL_SUCCESS) { + default_error_ = error; + } + else { + default_ = context.getInfo()[0]; + default_error_ = CL_SUCCESS; + } + } +#if defined(CL_HPP_ENABLE_EXCEPTIONS) + catch (cl::Error &e) { + default_error_ = e.err(); + } +#endif +} + +CL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag Context::default_initialized_; +CL_HPP_DEFINE_STATIC_MEMBER_ Context Context::default_; +CL_HPP_DEFINE_STATIC_MEMBER_ cl_int Context::default_error_ = CL_SUCCESS; + +/*! \brief Class interface for cl_event. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_event as the original. For details, see + * clRetainEvent() and clReleaseEvent(). + * + * \see cl_event + */ +class Event : public detail::Wrapper +{ +public: + //! \brief Default constructor - initializes to NULL. + Event() : detail::Wrapper() { } + + /*! \brief Constructor from cl_event - takes ownership. + * + * \param retainObject will cause the constructor to retain its cl object. + * Defaults to false to maintain compatibility with + * earlier versions. + * This effectively transfers ownership of a refcount on the cl_event + * into the new Event object. + */ + explicit Event(const cl_event& event, bool retainObject = false) : + detail::Wrapper(event, retainObject) { } + + /*! \brief Assignment operator from cl_event - takes ownership. + * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseEvent() on the value previously held by this instance. + */ + Event& operator = (const cl_event& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetEventInfo(). + template + cl_int getInfo(cl_event_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetEventInfo, object_, name, param), + __GET_EVENT_INFO_ERR); + } + + //! \brief Wrapper for clGetEventInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_event_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + //! \brief Wrapper for clGetEventProfilingInfo(). 
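+    //! For example, using the by-value overload declared further below, the
+    //! elapsed time of a completed event can be computed as follows (a
+    //! sketch; the queue must be created with CL_QUEUE_PROFILING_ENABLE):
+    //! \code
+    //! cl_ulong t0 = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
+    //! cl_ulong t1 = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
+    //! cl_ulong elapsedNs = t1 - t0;
+    //! \endcode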
+ template + cl_int getProfilingInfo(cl_profiling_info name, T* param) const + { + return detail::errHandler(detail::getInfo( + &::clGetEventProfilingInfo, object_, name, param), + __GET_EVENT_PROFILE_INFO_ERR); + } + + //! \brief Wrapper for clGetEventProfilingInfo() that returns by value. + template typename + detail::param_traits::param_type + getProfilingInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_profiling_info, name>::param_type param; + cl_int result = getProfilingInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + /*! \brief Blocks the calling thread until this event completes. + * + * Wraps clWaitForEvents(). + */ + cl_int wait() const + { + return detail::errHandler( + ::clWaitForEvents(1, &object_), + __WAIT_FOR_EVENTS_ERR); + } + +#if CL_HPP_TARGET_OPENCL_VERSION >= 110 + /*! \brief Registers a user callback function for a specific command execution status. + * + * Wraps clSetEventCallback(). + */ + cl_int setCallback( + cl_int type, + void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *), + void * user_data = NULL) + { + return detail::errHandler( + ::clSetEventCallback( + object_, + type, + pfn_notify, + user_data), + __SET_EVENT_CALLBACK_ERR); + } +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110 + + /*! \brief Blocks the calling thread until every event specified is complete. + * + * Wraps clWaitForEvents(). + */ + static cl_int + waitForEvents(const vector& events) + { + return detail::errHandler( + ::clWaitForEvents( + (cl_uint) events.size(), (events.size() > 0) ? (cl_event*)&events.front() : NULL), + __WAIT_FOR_EVENTS_ERR); + } +}; + +#if CL_HPP_TARGET_OPENCL_VERSION >= 110 +/*! \brief Class interface for user events (a subset of cl_event's). + * + * See Event for details about copy semantics, etc. + */ +class UserEvent : public Event +{ +public: + /*! \brief Constructs a user event on a given context. + * + * Wraps clCreateUserEvent(). + */ + UserEvent( + const Context& context, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateUserEvent( + context(), + &error); + + detail::errHandler(error, __CREATE_USER_EVENT_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + UserEvent() : Event() { } + + /*! \brief Sets the execution status of a user event object. + * + * Wraps clSetUserEventStatus(). + */ + cl_int setStatus(cl_int status) + { + return detail::errHandler( + ::clSetUserEventStatus(object_,status), + __SET_USER_EVENT_STATUS_ERR); + } +}; +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110 + +/*! \brief Blocks the calling thread until every event specified is complete. + * + * Wraps clWaitForEvents(). + */ +inline static cl_int +WaitForEvents(const vector& events) +{ + return detail::errHandler( + ::clWaitForEvents( + (cl_uint) events.size(), (events.size() > 0) ? (cl_event*)&events.front() : NULL), + __WAIT_FOR_EVENTS_ERR); +} + +/*! \brief Class interface for cl_mem. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_mem as the original. For details, see + * clRetainMemObject() and clReleaseMemObject(). + * + * \see cl_mem + */ +class Memory : public detail::Wrapper +{ +public: + //! \brief Default constructor - initializes to NULL. + Memory() : detail::Wrapper() { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * Optionally transfer ownership of a refcount on the cl_mem + * into the new Memory object. 
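+     *
+     * For example (an illustrative sketch; rawMem is assumed to be a valid
+     * cl_mem owned elsewhere):
+     * \code
+     * cl::Memory mem(rawMem, true); // retain so both owners stay valid
+     * \endcode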
+ * + * \param retainObject will cause the constructor to retain its cl object. + * Defaults to false to maintain compatibility with + * earlier versions. + * + * See Memory for further details. + */ + explicit Memory(const cl_mem& memory, bool retainObject) : + detail::Wrapper(memory, retainObject) { } + + /*! \brief Assignment operator from cl_mem - takes ownership. + * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseMemObject() on the value previously held by this instance. + */ + Memory& operator = (const cl_mem& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Memory(const Memory& mem) : detail::Wrapper(mem) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Memory& operator = (const Memory &mem) + { + detail::Wrapper::operator=(mem); + return *this; + } + + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Memory(Memory&& mem) CL_HPP_NOEXCEPT_ : detail::Wrapper(std::move(mem)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Memory& operator = (Memory &&mem) + { + detail::Wrapper::operator=(std::move(mem)); + return *this; + } + + + //! \brief Wrapper for clGetMemObjectInfo(). + template + cl_int getInfo(cl_mem_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetMemObjectInfo, object_, name, param), + __GET_MEM_OBJECT_INFO_ERR); + } + + //! \brief Wrapper for clGetMemObjectInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_mem_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + +#if CL_HPP_TARGET_OPENCL_VERSION >= 110 + /*! \brief Registers a callback function to be called when the memory object + * is no longer needed. + * + * Wraps clSetMemObjectDestructorCallback(). + * + * Repeated calls to this function, for a given cl_mem value, will append + * to the list of functions called (in reverse order) when memory object's + * resources are freed and the memory object is deleted. + * + * \note + * The registered callbacks are associated with the underlying cl_mem + * value - not the Memory class instance. 
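+     *
+     * A registration sketch (the callback name and user-data pointer are
+     * illustrative):
+     * \code
+     * void CL_CALLBACK onMemDestroy(cl_mem, void* userData)
+     * {
+     *     // release host-side state associated with the buffer here
+     * }
+     * // ...
+     * memory.setDestructorCallback(onMemDestroy, hostState);
+     * \endcode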
+ */ + cl_int setDestructorCallback( + void (CL_CALLBACK * pfn_notify)(cl_mem, void *), + void * user_data = NULL) + { + return detail::errHandler( + ::clSetMemObjectDestructorCallback( + object_, + pfn_notify, + user_data), + __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR); + } +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110 + +}; + +// Pre-declare copy functions +class Buffer; +template< typename IteratorType > +cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer ); +template< typename IteratorType > +cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator ); +template< typename IteratorType > +cl_int copy( const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer ); +template< typename IteratorType > +cl_int copy( const CommandQueue &queue, const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator ); + + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 +namespace detail +{ + class SVMTraitNull + { + public: + static cl_svm_mem_flags getSVMMemFlags() + { + return 0; + } + }; +} // namespace detail + +template +class SVMTraitReadWrite +{ +public: + static cl_svm_mem_flags getSVMMemFlags() + { + return CL_MEM_READ_WRITE | + Trait::getSVMMemFlags(); + } +}; + +template +class SVMTraitReadOnly +{ +public: + static cl_svm_mem_flags getSVMMemFlags() + { + return CL_MEM_READ_ONLY | + Trait::getSVMMemFlags(); + } +}; + +template +class SVMTraitWriteOnly +{ +public: + static cl_svm_mem_flags getSVMMemFlags() + { + return CL_MEM_WRITE_ONLY | + Trait::getSVMMemFlags(); + } +}; + +template> +class SVMTraitCoarse +{ +public: + static cl_svm_mem_flags getSVMMemFlags() + { + return Trait::getSVMMemFlags(); + } +}; + +template> +class SVMTraitFine +{ +public: + static cl_svm_mem_flags getSVMMemFlags() + { + return CL_MEM_SVM_FINE_GRAIN_BUFFER | + Trait::getSVMMemFlags(); + } +}; + +template> +class SVMTraitAtomic +{ +public: + static cl_svm_mem_flags getSVMMemFlags() + { + return + CL_MEM_SVM_FINE_GRAIN_BUFFER | + CL_MEM_SVM_ATOMICS | + Trait::getSVMMemFlags(); + } +}; + +// Pre-declare SVM map function +template +inline cl_int enqueueMapSVM( + T* ptr, + cl_bool blocking, + cl_map_flags flags, + size_type size, + const vector* events = NULL, + Event* event = NULL); + +/** + * STL-like allocator class for managing SVM objects provided for convenience. + * + * Note that while this behaves like an allocator for the purposes of constructing vectors and similar objects, + * care must be taken when using with smart pointers. + * The allocator should not be used to construct a unique_ptr if we are using coarse-grained SVM mode because + * the coarse-grained management behaviour would behave incorrectly with respect to reference counting. + * + * Instead the allocator embeds a Deleter which may be used with unique_ptr and is used + * with the allocate_shared and allocate_ptr supplied operations. 
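+ *
+ * As a sketch, assuming a default context/queue have already been set up, a
+ * coarse-grained SVM container can be built directly on this allocator:
+ * \code
+ * cl::SVMAllocator<int, cl::SVMTraitCoarse<>> svmAlloc;
+ * std::vector<int, cl::SVMAllocator<int, cl::SVMTraitCoarse<>>> v(1024, 0, svmAlloc);
+ * // v.data() is an SVM pointer usable with Kernel::setArg / setSVMPointers
+ * \endcode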
+ */ +template +class SVMAllocator { +private: + Context context_; + +public: + typedef T value_type; + typedef value_type* pointer; + typedef const value_type* const_pointer; + typedef value_type& reference; + typedef const value_type& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + template + struct rebind + { + typedef SVMAllocator other; + }; + + template + friend class SVMAllocator; + + SVMAllocator() : + context_(Context::getDefault()) + { + } + + explicit SVMAllocator(cl::Context context) : + context_(context) + { + } + + + SVMAllocator(const SVMAllocator &other) : + context_(other.context_) + { + } + + template + SVMAllocator(const SVMAllocator &other) : + context_(other.context_) + { + } + + ~SVMAllocator() + { + } + + pointer address(reference r) CL_HPP_NOEXCEPT_ + { + return std::addressof(r); + } + + const_pointer address(const_reference r) CL_HPP_NOEXCEPT_ + { + return std::addressof(r); + } + + /** + * Allocate an SVM pointer. + * + * If the allocator is coarse-grained, this will take ownership to allow + * containers to correctly construct data in place. + */ + pointer allocate( + size_type size, + typename cl::SVMAllocator::const_pointer = 0) + { + // Allocate memory with default alignment matching the size of the type + void* voidPointer = + clSVMAlloc( + context_(), + SVMTrait::getSVMMemFlags(), + size*sizeof(T), + 0); + pointer retValue = reinterpret_cast( + voidPointer); +#if defined(CL_HPP_ENABLE_EXCEPTIONS) + if (!retValue) { + std::bad_alloc excep; + throw excep; + } +#endif // #if defined(CL_HPP_ENABLE_EXCEPTIONS) + + // If allocation was coarse-grained then map it + if (!(SVMTrait::getSVMMemFlags() & CL_MEM_SVM_FINE_GRAIN_BUFFER)) { + cl_int err = enqueueMapSVM(retValue, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, size*sizeof(T)); + if (err != CL_SUCCESS) { + std::bad_alloc excep; + throw excep; + } + } + + // If exceptions disabled, return null pointer from allocator + return retValue; + } + + void deallocate(pointer p, size_type) + { + clSVMFree(context_(), p); + } + + /** + * Return the maximum possible allocation size. + * This is the minimum of the maximum sizes of all devices in the context. + */ + size_type max_size() const CL_HPP_NOEXCEPT_ + { + size_type maxSize = std::numeric_limits::max() / sizeof(T); + + for (const Device &d : context_.getInfo()) { + maxSize = std::min( + maxSize, + static_cast(d.getInfo())); + } + + return maxSize; + } + + template< class U, class... Args > + void construct(U* p, Args&&... args) + { + new(p)T(args...); + } + + template< class U > + void destroy(U* p) + { + p->~U(); + } + + /** + * Returns true if the contexts match. 
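+     *
+     *  Allocators therefore compare equal exactly when storage allocated by
+     *  one may be deallocated by the other, e.g. (illustrative; ctx assumed):
+     *  \code
+     *  cl::SVMAllocator<int, cl::SVMTraitCoarse<>> a(ctx), b(ctx);
+     *  assert(a == b);  // same context => interchangeable
+     *  \endcode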
+     */
+    inline bool operator==(SVMAllocator const& rhs)
+    {
+        return (context_==rhs.context_);
+    }
+
+    inline bool operator!=(SVMAllocator const& a)
+    {
+        return !operator==(a);
+    }
+}; // class SVMAllocator
+
+
+template<class SVMTrait>
+class SVMAllocator<void, SVMTrait> {
+public:
+    typedef void value_type;
+    typedef value_type* pointer;
+    typedef const value_type* const_pointer;
+
+    template<typename U>
+    struct rebind
+    {
+        typedef SVMAllocator<U, SVMTrait> other;
+    };
+
+    template<typename U, typename V>
+    friend class SVMAllocator;
+};
+
+#if !defined(CL_HPP_NO_STD_UNIQUE_PTR)
+namespace detail
+{
+    template<class Alloc>
+    class Deleter {
+    private:
+        Alloc alloc_;
+        size_type copies_;
+
+    public:
+        typedef typename std::allocator_traits<Alloc>::pointer pointer;
+
+        Deleter(const Alloc &alloc, size_type copies) : alloc_{ alloc }, copies_{ copies }
+        {
+        }
+
+        void operator()(pointer ptr) const {
+            Alloc tmpAlloc{ alloc_ };
+            std::allocator_traits<Alloc>::destroy(tmpAlloc, std::addressof(*ptr));
+            std::allocator_traits<Alloc>::deallocate(tmpAlloc, ptr, copies_);
+        }
+    };
+} // namespace detail
+
+/**
+ * Allocation operation analogous to std::allocate_shared, but returning a
+ * cl::pointer (a std::unique_ptr with an SVM-aware deleter) by default.
+ * Constructing the object through the allocator ensures that it is not
+ * placed in memory inaccessible to the host.
+ */
+template <class T, class Alloc, class... Args>
+cl::pointer<T, detail::Deleter<Alloc>> allocate_pointer(const Alloc &alloc_, Args&&... args)
+{
+    Alloc alloc(alloc_);
+    static const size_type copies = 1;
+
+    // Ensure that creation of the management block and the
+    // object are dealt with separately such that we only provide a deleter
+
+    T* tmp = std::allocator_traits<Alloc>::allocate(alloc, copies);
+    if (!tmp) {
+        std::bad_alloc excep;
+        throw excep;
+    }
+    try {
+        std::allocator_traits<Alloc>::construct(
+            alloc,
+            std::addressof(*tmp),
+            std::forward<Args>(args)...);
+
+        return cl::pointer<T, detail::Deleter<Alloc>>(tmp, detail::Deleter<Alloc>{alloc, copies});
+    }
+    catch (std::bad_alloc&)
+    {
+        std::allocator_traits<Alloc>::deallocate(alloc, tmp, copies);
+        throw;
+    }
+}
+
+template< class T, class SVMTrait, class... Args >
+cl::pointer<T, detail::Deleter<SVMAllocator<T, SVMTrait>>> allocate_svm(Args... args)
+{
+    SVMAllocator<T, SVMTrait> alloc;
+    return cl::allocate_pointer<T>(alloc, args...);
+}
+
+template< class T, class SVMTrait, class... Args >
+cl::pointer<T, detail::Deleter<SVMAllocator<T, SVMTrait>>> allocate_svm(const cl::Context &c, Args... args)
+{
+    SVMAllocator<T, SVMTrait> alloc(c);
+    return cl::allocate_pointer<T>(alloc, args...);
+}
+#endif // #if !defined(CL_HPP_NO_STD_UNIQUE_PTR)
+
+/*! \brief Vector alias to simplify construction of coarse-grained SVM containers.
+ *
+ */
+template < class T >
+using coarse_svm_vector = vector<T, cl::SVMAllocator<T, cl::SVMTraitCoarse<>>>;
+
+/*! \brief Vector alias to simplify construction of fine-grained SVM containers.
+ *
+ */
+template < class T >
+using fine_svm_vector = vector<T, cl::SVMAllocator<T, cl::SVMTraitFine<>>>;
+
+/*! \brief Vector alias to simplify construction of fine-grained SVM containers that support platform atomics.
+ *
+ */
+template < class T >
+using atomic_svm_vector = vector<T, cl::SVMAllocator<T, cl::SVMTraitAtomic<>>>;
+
+#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200
+
+
+/*! \brief Class interface for Buffer Memory Objects.
+ *
+ *  See Memory for details about copy semantics, etc.
+ *
+ *  \see Memory
+ */
+class Buffer : public Memory
+{
+public:
+
+    /*! \brief Constructs a Buffer in a specified context.
+     *
+     *  Wraps clCreateBuffer().
+     *
+     *  \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was
+     *                  specified. Note alignment & exclusivity requirements.
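+     *
+     *  A usage sketch (the context and host data names are assumed for
+     *  illustration):
+     *  \code
+     *  std::vector<float> host(1024, 1.0f);
+     *  // Device-side copy of the host data:
+     *  cl::Buffer devBuf(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+     *                    host.size() * sizeof(float), host.data());
+     *  // Zero-copy view of suitably aligned host storage:
+     *  cl::Buffer hostBuf(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
+     *                     host.size() * sizeof(float), host.data());
+     *  \endcode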
+ */ + Buffer( + const Context& context, + cl_mem_flags flags, + size_type size, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error); + + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! \brief Constructs a Buffer in the default context. + * + * Wraps clCreateBuffer(). + * + * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was + * specified. Note alignment & exclusivity requirements. + * + * \see Context::getDefault() + */ + Buffer( + cl_mem_flags flags, + size_type size, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + + Context context = Context::getDefault(err); + + object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error); + + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! + * \brief Construct a Buffer from a host container via iterators. + * IteratorType must be random access. + * If useHostPtr is specified iterators must represent contiguous data. + */ + template< typename IteratorType > + Buffer( + IteratorType startIterator, + IteratorType endIterator, + bool readOnly, + bool useHostPtr = false, + cl_int* err = NULL) + { + typedef typename std::iterator_traits::value_type DataType; + cl_int error; + + cl_mem_flags flags = 0; + if( readOnly ) { + flags |= CL_MEM_READ_ONLY; + } + else { + flags |= CL_MEM_READ_WRITE; + } + if( useHostPtr ) { + flags |= CL_MEM_USE_HOST_PTR; + } + + size_type size = sizeof(DataType)*(endIterator - startIterator); + + Context context = Context::getDefault(err); + + if( useHostPtr ) { + object_ = ::clCreateBuffer(context(), flags, size, static_cast(&*startIterator), &error); + } else { + object_ = ::clCreateBuffer(context(), flags, size, 0, &error); + } + + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + + if( !useHostPtr ) { + error = cl::copy(startIterator, endIterator, *this); + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + } + + /*! + * \brief Construct a Buffer from a host container via iterators using a specified context. + * IteratorType must be random access. + * If useHostPtr is specified iterators must represent contiguous data. + */ + template< typename IteratorType > + Buffer(const Context &context, IteratorType startIterator, IteratorType endIterator, + bool readOnly, bool useHostPtr = false, cl_int* err = NULL); + + /*! + * \brief Construct a Buffer from a host container via iterators using a specified queue. + * If useHostPtr is specified iterators must be random access. + */ + template< typename IteratorType > + Buffer(const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, + bool readOnly, bool useHostPtr = false, cl_int* err = NULL); + + //! \brief Default constructor - initializes to NULL. + Buffer() : Memory() { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * \param retainObject will cause the constructor to retain its cl object. + * Defaults to false to maintain compatibility with earlier versions. + * + * See Memory for further details. + */ + explicit Buffer(const cl_mem& buffer, bool retainObject = false) : + Memory(buffer, retainObject) { } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Buffer& operator = (const cl_mem& rhs) + { + Memory::operator=(rhs); + return *this; + } + + /*! 
\brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Buffer(const Buffer& buf) : Memory(buf) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Buffer& operator = (const Buffer &buf) + { + Memory::operator=(buf); + return *this; + } + + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Buffer(Buffer&& buf) CL_HPP_NOEXCEPT_ : Memory(std::move(buf)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Buffer& operator = (Buffer &&buf) + { + Memory::operator=(std::move(buf)); + return *this; + } + +#if CL_HPP_TARGET_OPENCL_VERSION >= 110 + /*! \brief Creates a new buffer object from this. + * + * Wraps clCreateSubBuffer(). + */ + Buffer createSubBuffer( + cl_mem_flags flags, + cl_buffer_create_type buffer_create_type, + const void * buffer_create_info, + cl_int * err = NULL) + { + Buffer result; + cl_int error; + result.object_ = ::clCreateSubBuffer( + object_, + flags, + buffer_create_type, + buffer_create_info, + &error); + + detail::errHandler(error, __CREATE_SUBBUFFER_ERR); + if (err != NULL) { + *err = error; + } + + return result; + } +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110 +}; + +#if defined (CL_HPP_USE_DX_INTEROP) +/*! \brief Class interface for creating OpenCL buffers from ID3D10Buffer's. + * + * This is provided to facilitate interoperability with Direct3D. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class BufferD3D10 : public Buffer +{ +public: + + + /*! \brief Constructs a BufferD3D10, in a specified context, from a + * given ID3D10Buffer. + * + * Wraps clCreateFromD3D10BufferKHR(). + */ + BufferD3D10( + const Context& context, + cl_mem_flags flags, + ID3D10Buffer* bufobj, + cl_int * err = NULL) : pfn_clCreateFromD3D10BufferKHR(nullptr) + { + typedef CL_API_ENTRY cl_mem (CL_API_CALL *PFN_clCreateFromD3D10BufferKHR)( + cl_context context, cl_mem_flags flags, ID3D10Buffer* buffer, + cl_int* errcode_ret); + PFN_clCreateFromD3D10BufferKHR pfn_clCreateFromD3D10BufferKHR; +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 + vector props = context.getInfo(); + cl_platform platform = -1; + for( int i = 0; i < props.size(); ++i ) { + if( props[i] == CL_CONTEXT_PLATFORM ) { + platform = props[i+1]; + } + } + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clCreateFromD3D10BufferKHR); +#elif CL_HPP_TARGET_OPENCL_VERSION >= 110 + CL_HPP_INIT_CL_EXT_FCN_PTR_(clCreateFromD3D10BufferKHR); +#endif + + cl_int error; + object_ = pfn_clCreateFromD3D10BufferKHR( + context(), + flags, + bufobj, + &error); + + detail::errHandler(error, __CREATE_GL_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + BufferD3D10() : Buffer() { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * \param retainObject will cause the constructor to retain its cl object. + * Defaults to false to maintain compatibility with + * earlier versions. + * See Memory for further details. + */ + explicit BufferD3D10(const cl_mem& buffer, bool retainObject = false) : + Buffer(buffer, retainObject) { } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + BufferD3D10& operator = (const cl_mem& rhs) + { + Buffer::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. 
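+     *
+     *  (Illustrative aside for Buffer::createSubBuffer() defined above: the
+     *  offset and size below are hypothetical and must respect the device's
+     *  CL_DEVICE_MEM_BASE_ADDR_ALIGN.)
+     *  \code
+     *  cl_buffer_region region = { 0, 256 };  // byte offset, byte size
+     *  cl::Buffer sub = parent.createSubBuffer(CL_MEM_READ_WRITE,
+     *                                          CL_BUFFER_CREATE_TYPE_REGION,
+     *                                          &region);
+     *  \endcode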
+ */ + BufferD3D10(const BufferD3D10& buf) : + Buffer(buf) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + BufferD3D10& operator = (const BufferD3D10 &buf) + { + Buffer::operator=(buf); + return *this; + } + + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + BufferD3D10(BufferD3D10&& buf) CL_HPP_NOEXCEPT_ : Buffer(std::move(buf)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + BufferD3D10& operator = (BufferD3D10 &&buf) + { + Buffer::operator=(std::move(buf)); + return *this; + } +}; +#endif + +/*! \brief Class interface for GL Buffer Memory Objects. + * + * This is provided to facilitate interoperability with OpenGL. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class BufferGL : public Buffer +{ +public: + /*! \brief Constructs a BufferGL in a specified context, from a given + * GL buffer. + * + * Wraps clCreateFromGLBuffer(). + */ + BufferGL( + const Context& context, + cl_mem_flags flags, + cl_GLuint bufobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLBuffer( + context(), + flags, + bufobj, + &error); + + detail::errHandler(error, __CREATE_GL_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + BufferGL() : Buffer() { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * \param retainObject will cause the constructor to retain its cl object. + * Defaults to false to maintain compatibility with + * earlier versions. + * See Memory for further details. + */ + explicit BufferGL(const cl_mem& buffer, bool retainObject = false) : + Buffer(buffer, retainObject) { } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + BufferGL& operator = (const cl_mem& rhs) + { + Buffer::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + BufferGL(const BufferGL& buf) : Buffer(buf) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + BufferGL& operator = (const BufferGL &buf) + { + Buffer::operator=(buf); + return *this; + } + + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + BufferGL(BufferGL&& buf) CL_HPP_NOEXCEPT_ : Buffer(std::move(buf)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + BufferGL& operator = (BufferGL &&buf) + { + Buffer::operator=(std::move(buf)); + return *this; + } + + //! \brief Wrapper for clGetGLObjectInfo(). + cl_int getObjectInfo( + cl_gl_object_type *type, + cl_GLuint * gl_object_name) + { + return detail::errHandler( + ::clGetGLObjectInfo(object_,type,gl_object_name), + __GET_GL_OBJECT_INFO_ERR); + } +}; + +/*! \brief Class interface for GL Render Buffer Memory Objects. + * + * This is provided to facilitate interoperability with OpenGL. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class BufferRenderGL : public Buffer +{ +public: + /*! \brief Constructs a BufferRenderGL in a specified context, from a given + * GL Renderbuffer. + * + * Wraps clCreateFromGLRenderbuffer(). 
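+     *
+     *  A sketch, assuming the context was created with GL-sharing properties
+     *  and rbo is an existing GL renderbuffer name:
+     *  \code
+     *  cl::BufferRenderGL clRbo(context, CL_MEM_READ_WRITE, rbo);
+     *  // Acquire before use and release afterwards via the GL-interop
+     *  // enqueue calls so that GL and CL do not access it concurrently.
+     *  \endcode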
+ */ + BufferRenderGL( + const Context& context, + cl_mem_flags flags, + cl_GLuint bufobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLRenderbuffer( + context(), + flags, + bufobj, + &error); + + detail::errHandler(error, __CREATE_GL_RENDER_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + BufferRenderGL() : Buffer() { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * \param retainObject will cause the constructor to retain its cl object. + * Defaults to false to maintain compatibility with + * earlier versions. + * See Memory for further details. + */ + explicit BufferRenderGL(const cl_mem& buffer, bool retainObject = false) : + Buffer(buffer, retainObject) { } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + BufferRenderGL& operator = (const cl_mem& rhs) + { + Buffer::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + BufferRenderGL(const BufferRenderGL& buf) : Buffer(buf) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + BufferRenderGL& operator = (const BufferRenderGL &buf) + { + Buffer::operator=(buf); + return *this; + } + + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + BufferRenderGL(BufferRenderGL&& buf) CL_HPP_NOEXCEPT_ : Buffer(std::move(buf)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + BufferRenderGL& operator = (BufferRenderGL &&buf) + { + Buffer::operator=(std::move(buf)); + return *this; + } + + //! \brief Wrapper for clGetGLObjectInfo(). + cl_int getObjectInfo( + cl_gl_object_type *type, + cl_GLuint * gl_object_name) + { + return detail::errHandler( + ::clGetGLObjectInfo(object_,type,gl_object_name), + __GET_GL_OBJECT_INFO_ERR); + } +}; + +/*! \brief C++ base class for Image Memory objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Image : public Memory +{ +protected: + //! \brief Default constructor - initializes to NULL. + Image() : Memory() { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * \param retainObject will cause the constructor to retain its cl object. + * Defaults to false to maintain compatibility with + * earlier versions. + * See Memory for further details. + */ + explicit Image(const cl_mem& image, bool retainObject = false) : + Memory(image, retainObject) { } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image& operator = (const cl_mem& rhs) + { + Memory::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image(const Image& img) : Memory(img) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image& operator = (const Image &img) + { + Memory::operator=(img); + return *this; + } + + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Image(Image&& img) CL_HPP_NOEXCEPT_ : Memory(std::move(img)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. 
+ */ + Image& operator = (Image &&img) + { + Memory::operator=(std::move(img)); + return *this; + } + + +public: + //! \brief Wrapper for clGetImageInfo(). + template + cl_int getImageInfo(cl_image_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetImageInfo, object_, name, param), + __GET_IMAGE_INFO_ERR); + } + + //! \brief Wrapper for clGetImageInfo() that returns by value. + template typename + detail::param_traits::param_type + getImageInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_image_info, name>::param_type param; + cl_int result = getImageInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } +}; + +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 +/*! \brief Class interface for 1D Image Memory objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Image1D : public Image +{ +public: + /*! \brief Constructs a 1D Image in a specified context. + * + * Wraps clCreateImage(). + */ + Image1D( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + size_type width, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc = + { + CL_MEM_OBJECT_IMAGE1D, + width, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + Image1D() { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * \param retainObject will cause the constructor to retain its cl object. + * Defaults to false to maintain compatibility with + * earlier versions. + * See Memory for further details. + */ + explicit Image1D(const cl_mem& image1D, bool retainObject = false) : + Image(image1D, retainObject) { } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image1D& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image1D(const Image1D& img) : Image(img) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image1D& operator = (const Image1D &img) + { + Image::operator=(img); + return *this; + } + + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Image1D(Image1D&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Image1D& operator = (Image1D &&img) + { + Image::operator=(std::move(img)); + return *this; + } + +}; + +/*! \class Image1DBuffer + * \brief Image interface for 1D buffer images. + */ +class Image1DBuffer : public Image +{ +public: + Image1DBuffer( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + size_type width, + const Buffer &buffer, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc = + { + CL_MEM_OBJECT_IMAGE1D_BUFFER, + width, + 0, 0, 0, 0, 0, 0, 0, + buffer() + }; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + NULL, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + Image1DBuffer() { } + + /*! \brief Constructor from cl_mem - takes ownership. 
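+     *
+     *  For the buffer-backed constructor above, a sketch (the format, width
+     *  and source buffer are illustrative):
+     *  \code
+     *  cl::ImageFormat fmt(CL_R, CL_FLOAT);
+     *  cl::Image1DBuffer img(context, CL_MEM_READ_ONLY, fmt, 1024, srcBuffer);
+     *  // img shares storage with srcBuffer; no host copy is made
+     *  \endcode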
+ * + * \param retainObject will cause the constructor to retain its cl object. + * Defaults to false to maintain compatibility with + * earlier versions. + * See Memory for further details. + */ + explicit Image1DBuffer(const cl_mem& image1D, bool retainObject = false) : + Image(image1D, retainObject) { } + + Image1DBuffer& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image1DBuffer(const Image1DBuffer& img) : Image(img) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image1DBuffer& operator = (const Image1DBuffer &img) + { + Image::operator=(img); + return *this; + } + + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Image1DBuffer(Image1DBuffer&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Image1DBuffer& operator = (Image1DBuffer &&img) + { + Image::operator=(std::move(img)); + return *this; + } + +}; + +/*! \class Image1DArray + * \brief Image interface for arrays of 1D images. + */ +class Image1DArray : public Image +{ +public: + Image1DArray( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + size_type arraySize, + size_type width, + size_type rowPitch, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc = + { + CL_MEM_OBJECT_IMAGE1D_ARRAY, + width, + 0, 0, // height, depth (unused) + arraySize, + rowPitch, + 0, 0, 0, 0 + }; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + Image1DArray() { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * \param retainObject will cause the constructor to retain its cl object. + * Defaults to false to maintain compatibility with + * earlier versions. + * See Memory for further details. + */ + explicit Image1DArray(const cl_mem& imageArray, bool retainObject = false) : + Image(imageArray, retainObject) { } + + + Image1DArray& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image1DArray(const Image1DArray& img) : Image(img) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image1DArray& operator = (const Image1DArray &img) + { + Image::operator=(img); + return *this; + } + + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Image1DArray(Image1DArray&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Image1DArray& operator = (Image1DArray &&img) + { + Image::operator=(std::move(img)); + return *this; + } + +}; +#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 120 + + +/*! \brief Class interface for 2D Image Memory objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Image2D : public Image +{ +public: + /*! \brief Constructs a 2D Image in a specified context. + * + * Wraps clCreateImage(). 
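+     *
+     *  A creation sketch (RGBA float image; the dimensions are illustrative):
+     *  \code
+     *  cl::ImageFormat fmt(CL_RGBA, CL_FLOAT);
+     *  cl::Image2D img(context, CL_MEM_READ_WRITE, fmt, 640, 480);
+     *  \endcode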
+ */ + Image2D( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + size_type width, + size_type height, + size_type row_pitch = 0, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + bool useCreateImage; + +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 120 + // Run-time decision based on the actual platform + { + cl_uint version = detail::getContextPlatformVersion(context()); + useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above + } +#elif CL_HPP_TARGET_OPENCL_VERSION >= 120 + useCreateImage = true; +#else + useCreateImage = false; +#endif + +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 + if (useCreateImage) + { + cl_image_desc desc = + { + CL_MEM_OBJECT_IMAGE2D, + width, + height, + 0, 0, // depth, array size (unused) + row_pitch, + 0, 0, 0, 0 + }; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 +#if CL_HPP_MINIMUM_OPENCL_VERSION < 120 + if (!useCreateImage) + { + object_ = ::clCreateImage2D( + context(), flags,&format, width, height, row_pitch, host_ptr, &error); + + detail::errHandler(error, __CREATE_IMAGE2D_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 120 + } + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 || defined(CL_HPP_USE_CL_IMAGE2D_FROM_BUFFER_KHR) + /*! \brief Constructs a 2D Image from a buffer. + * \note This will share storage with the underlying buffer. + * + * Wraps clCreateImage(). + */ + Image2D( + const Context& context, + ImageFormat format, + const Buffer &sourceBuffer, + size_type width, + size_type height, + size_type row_pitch = 0, + cl_int* err = nullptr) + { + cl_int error; + + cl_image_desc desc = + { + CL_MEM_OBJECT_IMAGE2D, + width, + height, + 0, 0, // depth, array size (unused) + row_pitch, + 0, 0, 0, + // Use buffer as input to image + sourceBuffer() + }; + object_ = ::clCreateImage( + context(), + 0, // flags inherited from buffer + &format, + &desc, + nullptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != nullptr) { + *err = error; + } + } +#endif //#if CL_HPP_TARGET_OPENCL_VERSION >= 200 || defined(CL_HPP_USE_CL_IMAGE2D_FROM_BUFFER_KHR) + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 + /*! \brief Constructs a 2D Image from an image. + * \note This will share storage with the underlying image but may + * reinterpret the channel order and type. + * + * The image will be created matching with a descriptor matching the source. + * + * \param order is the channel order to reinterpret the image data as. + * The channel order may differ as described in the OpenCL + * 2.0 API specification. + * + * Wraps clCreateImage(). + */ + Image2D( + const Context& context, + cl_channel_order order, + const Image &sourceImage, + cl_int* err = nullptr) + { + cl_int error; + + // Descriptor fields have to match source image + size_type sourceWidth = + sourceImage.getImageInfo(); + size_type sourceHeight = + sourceImage.getImageInfo(); + size_type sourceRowPitch = + sourceImage.getImageInfo(); + cl_uint sourceNumMIPLevels = + sourceImage.getImageInfo(); + cl_uint sourceNumSamples = + sourceImage.getImageInfo(); + cl_image_format sourceFormat = + sourceImage.getImageInfo(); + + // Update only the channel order. + // Channel format inherited from source. 
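+            // Note: the runtime requires every other descriptor field to match
+            // the source image exactly; only image_channel_order may differ.
+            // The mem_object field placed in the descriptor below links the new
+            // image to the source image's storage instead of allocating anew.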
+ sourceFormat.image_channel_order = order; + cl_image_desc desc = + { + CL_MEM_OBJECT_IMAGE2D, + sourceWidth, + sourceHeight, + 0, 0, // depth (unused), array size (unused) + sourceRowPitch, + 0, // slice pitch (unused) + sourceNumMIPLevels, + sourceNumSamples, + // Use buffer as input to image + sourceImage() + }; + object_ = ::clCreateImage( + context(), + 0, // flags should be inherited from mem_object + &sourceFormat, + &desc, + nullptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != nullptr) { + *err = error; + } + } +#endif //#if CL_HPP_TARGET_OPENCL_VERSION >= 200 + + //! \brief Default constructor - initializes to NULL. + Image2D() { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * \param retainObject will cause the constructor to retain its cl object. + * Defaults to false to maintain compatibility with + * earlier versions. + * See Memory for further details. + */ + explicit Image2D(const cl_mem& image2D, bool retainObject = false) : + Image(image2D, retainObject) { } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image2D& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image2D(const Image2D& img) : Image(img) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image2D& operator = (const Image2D &img) + { + Image::operator=(img); + return *this; + } + + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Image2D(Image2D&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Image2D& operator = (Image2D &&img) + { + Image::operator=(std::move(img)); + return *this; + } + +}; + + +#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) +/*! \brief Class interface for GL 2D Image Memory objects. + * + * This is provided to facilitate interoperability with OpenGL. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + * \note Deprecated for OpenCL 1.2. Please use ImageGL instead. + */ +class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL : public Image2D +{ +public: + /*! \brief Constructs an Image2DGL in a specified context, from a given + * GL Texture. + * + * Wraps clCreateFromGLTexture2D(). + */ + Image2DGL( + const Context& context, + cl_mem_flags flags, + cl_GLenum target, + cl_GLint miplevel, + cl_GLuint texobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLTexture2D( + context(), + flags, + target, + miplevel, + texobj, + &error); + + detail::errHandler(error, __CREATE_GL_TEXTURE_2D_ERR); + if (err != NULL) { + *err = error; + } + + } + + //! \brief Default constructor - initializes to NULL. + Image2DGL() : Image2D() { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * \param retainObject will cause the constructor to retain its cl object. + * Defaults to false to maintain compatibility with + * earlier versions. + * See Memory for further details. + */ + explicit Image2DGL(const cl_mem& image, bool retainObject = false) : + Image2D(image, retainObject) { } + + /*! \brief Assignment from cl_mem - performs shallow copy. + *c + * See Memory for further details. + */ + Image2DGL& operator = (const cl_mem& rhs) + { + Image2D::operator=(rhs); + return *this; + } + + /*! 
\brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image2DGL(const Image2DGL& img) : Image2D(img) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image2DGL& operator = (const Image2DGL &img) + { + Image2D::operator=(img); + return *this; + } + + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Image2DGL(Image2DGL&& img) CL_HPP_NOEXCEPT_ : Image2D(std::move(img)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Image2DGL& operator = (Image2DGL &&img) + { + Image2D::operator=(std::move(img)); + return *this; + } + +} CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; +#endif // CL_USE_DEPRECATED_OPENCL_1_1_APIS + +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 +/*! \class Image2DArray + * \brief Image interface for arrays of 2D images. + */ +class Image2DArray : public Image +{ +public: + Image2DArray( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + size_type arraySize, + size_type width, + size_type height, + size_type rowPitch, + size_type slicePitch, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc = + { + CL_MEM_OBJECT_IMAGE2D_ARRAY, + width, + height, + 0, // depth (unused) + arraySize, + rowPitch, + slicePitch, + 0, 0, 0 + }; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + Image2DArray() { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * \param retainObject will cause the constructor to retain its cl object. + * Defaults to false to maintain compatibility with + * earlier versions. + * See Memory for further details. + */ + explicit Image2DArray(const cl_mem& imageArray, bool retainObject = false) : Image(imageArray, retainObject) { } + + Image2DArray& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image2DArray(const Image2DArray& img) : Image(img) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image2DArray& operator = (const Image2DArray &img) + { + Image::operator=(img); + return *this; + } + + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Image2DArray(Image2DArray&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Image2DArray& operator = (Image2DArray &&img) + { + Image::operator=(std::move(img)); + return *this; + } +}; +#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 120 + +/*! \brief Class interface for 3D Image Memory objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Image3D : public Image +{ +public: + /*! \brief Constructs a 3D Image in a specified context. + * + * Wraps clCreateImage(). 
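+     *
+     *  A creation sketch (single-channel float volume; extents illustrative):
+     *  \code
+     *  cl::ImageFormat fmt(CL_R, CL_FLOAT);
+     *  cl::Image3D vol(context, CL_MEM_READ_ONLY, fmt, 128, 128, 64);
+     *  \endcode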
+ */ + Image3D( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + size_type width, + size_type height, + size_type depth, + size_type row_pitch = 0, + size_type slice_pitch = 0, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + bool useCreateImage; + +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 120 + // Run-time decision based on the actual platform + { + cl_uint version = detail::getContextPlatformVersion(context()); + useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above + } +#elif CL_HPP_TARGET_OPENCL_VERSION >= 120 + useCreateImage = true; +#else + useCreateImage = false; +#endif + +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 + if (useCreateImage) + { + cl_image_desc desc = + { + CL_MEM_OBJECT_IMAGE3D, + width, + height, + depth, + 0, // array size (unused) + row_pitch, + slice_pitch, + 0, 0, 0 + }; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 +#if CL_HPP_MINIMUM_OPENCL_VERSION < 120 + if (!useCreateImage) + { + object_ = ::clCreateImage3D( + context(), flags, &format, width, height, depth, row_pitch, + slice_pitch, host_ptr, &error); + + detail::errHandler(error, __CREATE_IMAGE3D_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 120 + } + + //! \brief Default constructor - initializes to NULL. + Image3D() : Image() { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * \param retainObject will cause the constructor to retain its cl object. + * Defaults to false to maintain compatibility with + * earlier versions. + * See Memory for further details. + */ + explicit Image3D(const cl_mem& image3D, bool retainObject = false) : + Image(image3D, retainObject) { } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image3D& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image3D(const Image3D& img) : Image(img) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image3D& operator = (const Image3D &img) + { + Image::operator=(img); + return *this; + } + + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Image3D(Image3D&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Image3D& operator = (Image3D &&img) + { + Image::operator=(std::move(img)); + return *this; + } +}; + +#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) +/*! \brief Class interface for GL 3D Image Memory objects. + * + * This is provided to facilitate interoperability with OpenGL. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Image3DGL : public Image3D +{ +public: + /*! \brief Constructs an Image3DGL in a specified context, from a given + * GL Texture. + * + * Wraps clCreateFromGLTexture3D(). 
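+     *
+     *  A sketch, assuming a GL-sharing context and tex holds a GL_TEXTURE_3D
+     *  texture name:
+     *  \code
+     *  cl::Image3DGL img(context, CL_MEM_READ_ONLY, GL_TEXTURE_3D, 0, tex);
+     *  // On OpenCL 1.2 and later, prefer cl::ImageGL for new code.
+     *  \endcode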
+ */ + Image3DGL( + const Context& context, + cl_mem_flags flags, + cl_GLenum target, + cl_GLint miplevel, + cl_GLuint texobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLTexture3D( + context(), + flags, + target, + miplevel, + texobj, + &error); + + detail::errHandler(error, __CREATE_GL_TEXTURE_3D_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + Image3DGL() : Image3D() { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * \param retainObject will cause the constructor to retain its cl object. + * Defaults to false to maintain compatibility with + * earlier versions. + * See Memory for further details. + */ + explicit Image3DGL(const cl_mem& image, bool retainObject = false) : + Image3D(image, retainObject) { } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image3DGL& operator = (const cl_mem& rhs) + { + Image3D::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image3DGL(const Image3DGL& img) : Image3D(img) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Image3DGL& operator = (const Image3DGL &img) + { + Image3D::operator=(img); + return *this; + } + + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Image3DGL(Image3DGL&& img) CL_HPP_NOEXCEPT_ : Image3D(std::move(img)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Image3DGL& operator = (Image3DGL &&img) + { + Image3D::operator=(std::move(img)); + return *this; + } +}; +#endif // CL_USE_DEPRECATED_OPENCL_1_1_APIS + +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 +/*! \class ImageGL + * \brief general image interface for GL interop. + * We abstract the 2D and 3D GL images into a single instance here + * that wraps all GL sourced images on the grounds that setup information + * was performed by OpenCL anyway. + */ +class ImageGL : public Image +{ +public: + ImageGL( + const Context& context, + cl_mem_flags flags, + cl_GLenum target, + cl_GLint miplevel, + cl_GLuint texobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLTexture( + context(), + flags, + target, + miplevel, + texobj, + &error); + + detail::errHandler(error, __CREATE_GL_TEXTURE_ERR); + if (err != NULL) { + *err = error; + } + } + + ImageGL() : Image() { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * \param retainObject will cause the constructor to retain its cl object. + * Defaults to false to maintain compatibility with + * earlier versions. + * See Memory for further details. + */ + explicit ImageGL(const cl_mem& image, bool retainObject = false) : + Image(image, retainObject) { } + + ImageGL& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + ImageGL(const ImageGL& img) : Image(img) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + ImageGL& operator = (const ImageGL &img) + { + Image::operator=(img); + return *this; + } + + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + ImageGL(ImageGL&& img) CL_HPP_NOEXCEPT_ : Image(std::move(img)) {} + + /*! 
\brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + ImageGL& operator = (ImageGL &&img) + { + Image::operator=(std::move(img)); + return *this; + } +}; +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 + + + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 +/*! \brief Class interface for Pipe Memory Objects. +* +* See Memory for details about copy semantics, etc. +* +* \see Memory +*/ +class Pipe : public Memory +{ +public: + + /*! \brief Constructs a Pipe in a specified context. + * + * Wraps clCreatePipe(). + * @param context Context in which to create the pipe. + * @param flags Bitfield. Only CL_MEM_READ_WRITE and CL_MEM_HOST_NO_ACCESS are valid. + * @param packet_size Size in bytes of a single packet of the pipe. + * @param max_packets Number of packets that may be stored in the pipe. + * + */ + Pipe( + const Context& context, + cl_uint packet_size, + cl_uint max_packets, + cl_int* err = NULL) + { + cl_int error; + + cl_mem_flags flags = CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS; + object_ = ::clCreatePipe(context(), flags, packet_size, max_packets, nullptr, &error); + + detail::errHandler(error, __CREATE_PIPE_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! \brief Constructs a Pipe in a the default context. + * + * Wraps clCreatePipe(). + * @param flags Bitfield. Only CL_MEM_READ_WRITE and CL_MEM_HOST_NO_ACCESS are valid. + * @param packet_size Size in bytes of a single packet of the pipe. + * @param max_packets Number of packets that may be stored in the pipe. + * + */ + Pipe( + cl_uint packet_size, + cl_uint max_packets, + cl_int* err = NULL) + { + cl_int error; + + Context context = Context::getDefault(err); + + cl_mem_flags flags = CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS; + object_ = ::clCreatePipe(context(), flags, packet_size, max_packets, nullptr, &error); + + detail::errHandler(error, __CREATE_PIPE_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + Pipe() : Memory() { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * \param retainObject will cause the constructor to retain its cl object. + * Defaults to false to maintain compatibility with earlier versions. + * + * See Memory for further details. + */ + explicit Pipe(const cl_mem& pipe, bool retainObject = false) : + Memory(pipe, retainObject) { } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Pipe& operator = (const cl_mem& rhs) + { + Memory::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Pipe(const Pipe& pipe) : Memory(pipe) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Pipe& operator = (const Pipe &pipe) + { + Memory::operator=(pipe); + return *this; + } + + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Pipe(Pipe&& pipe) CL_HPP_NOEXCEPT_ : Memory(std::move(pipe)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Pipe& operator = (Pipe &&pipe) + { + Memory::operator=(std::move(pipe)); + return *this; + } + + //! \brief Wrapper for clGetMemObjectInfo(). + template + cl_int getInfo(cl_pipe_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetPipeInfo, object_, name, param), + __GET_PIPE_INFO_ERR); + } + + //! 
\brief Wrapper for clGetMemObjectInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_pipe_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } +}; // class Pipe +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200 + + +/*! \brief Class interface for cl_sampler. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_sampler as the original. For details, see + * clRetainSampler() and clReleaseSampler(). + * + * \see cl_sampler + */ +class Sampler : public detail::Wrapper +{ +public: + //! \brief Default constructor - initializes to NULL. + Sampler() { } + + /*! \brief Constructs a Sampler in a specified context. + * + * Wraps clCreateSampler(). + */ + Sampler( + const Context& context, + cl_bool normalized_coords, + cl_addressing_mode addressing_mode, + cl_filter_mode filter_mode, + cl_int* err = NULL) + { + cl_int error; + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 + cl_sampler_properties sampler_properties[] = { + CL_SAMPLER_NORMALIZED_COORDS, normalized_coords, + CL_SAMPLER_ADDRESSING_MODE, addressing_mode, + CL_SAMPLER_FILTER_MODE, filter_mode, + 0 }; + object_ = ::clCreateSamplerWithProperties( + context(), + sampler_properties, + &error); + + detail::errHandler(error, __CREATE_SAMPLER_WITH_PROPERTIES_ERR); + if (err != NULL) { + *err = error; + } +#else + object_ = ::clCreateSampler( + context(), + normalized_coords, + addressing_mode, + filter_mode, + &error); + + detail::errHandler(error, __CREATE_SAMPLER_ERR); + if (err != NULL) { + *err = error; + } +#endif + } + + /*! \brief Constructor from cl_sampler - takes ownership. + * + * \param retainObject will cause the constructor to retain its cl object. + * Defaults to false to maintain compatibility with + * earlier versions. + * This effectively transfers ownership of a refcount on the cl_sampler + * into the new Sampler object. + */ + explicit Sampler(const cl_sampler& sampler, bool retainObject = false) : + detail::Wrapper(sampler, retainObject) { } + + /*! \brief Assignment operator from cl_sampler - takes ownership. + * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseSampler() on the value previously held by this instance. + */ + Sampler& operator = (const cl_sampler& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Sampler(const Sampler& sam) : detail::Wrapper(sam) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Sampler& operator = (const Sampler &sam) + { + detail::Wrapper::operator=(sam); + return *this; + } + + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Sampler(Sampler&& sam) CL_HPP_NOEXCEPT_ : detail::Wrapper(std::move(sam)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Sampler& operator = (Sampler &&sam) + { + detail::Wrapper::operator=(std::move(sam)); + return *this; + } + + //! \brief Wrapper for clGetSamplerInfo(). + template + cl_int getInfo(cl_sampler_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetSamplerInfo, object_, name, param), + __GET_SAMPLER_INFO_ERR); + } + + //! 
\brief Wrapper for clGetSamplerInfo() that returns by value. + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_sampler_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } +}; + +class Program; +class CommandQueue; +class DeviceCommandQueue; +class Kernel; + +//! \brief Class interface for specifying NDRange values. +class NDRange +{ +private: + size_type sizes_[3]; + cl_uint dimensions_; + +public: + //! \brief Default constructor - resulting range has zero dimensions. + NDRange() + : dimensions_(0) + { + sizes_[0] = 0; + sizes_[1] = 0; + sizes_[2] = 0; + } + + //! \brief Constructs one-dimensional range. + NDRange(size_type size0) + : dimensions_(1) + { + sizes_[0] = size0; + sizes_[1] = 1; + sizes_[2] = 1; + } + + //! \brief Constructs two-dimensional range. + NDRange(size_type size0, size_type size1) + : dimensions_(2) + { + sizes_[0] = size0; + sizes_[1] = size1; + sizes_[2] = 1; + } + + //! \brief Constructs three-dimensional range. + NDRange(size_type size0, size_type size1, size_type size2) + : dimensions_(3) + { + sizes_[0] = size0; + sizes_[1] = size1; + sizes_[2] = size2; + } + + /*! \brief Conversion operator to const size_type *. + * + * \returns a pointer to the size of the first dimension. + */ + operator const size_type*() const { + return sizes_; + } + + //! \brief Queries the number of dimensions in the range. + size_type dimensions() const + { + return dimensions_; + } + + //! \brief Returns the size of the object in bytes based on the + // runtime number of dimensions + size_type size() const + { + return dimensions_*sizeof(size_type); + } + + size_type* get() + { + return sizes_; + } + + const size_type* get() const + { + return sizes_; + } +}; + +//! \brief A zero-dimensional range. +static const NDRange NullRange; + +//! \brief Local address wrapper for use with Kernel::setArg +struct LocalSpaceArg +{ + size_type size_; +}; + +namespace detail { + +template +struct KernelArgumentHandler; + +// Enable for objects that are not subclasses of memory +// Pointers, constants etc +template +struct KernelArgumentHandler::value>::type> +{ + static size_type size(const T&) { return sizeof(T); } + static const T* ptr(const T& value) { return &value; } +}; + +// Enable for subclasses of memory where we want to get a reference to the cl_mem out +// and pass that in for safety +template +struct KernelArgumentHandler::value>::type> +{ + static size_type size(const T&) { return sizeof(cl_mem); } + static const cl_mem* ptr(const T& value) { return &(value()); } +}; + +// Specialization for DeviceCommandQueue defined later + +template <> +struct KernelArgumentHandler +{ + static size_type size(const LocalSpaceArg& value) { return value.size_; } + static const void* ptr(const LocalSpaceArg&) { return NULL; } +}; + +} +//! \endcond + +/*! Local + * \brief Helper function for generating LocalSpaceArg objects. + */ +inline LocalSpaceArg +Local(size_type size) +{ + LocalSpaceArg ret = { size }; + return ret; +} + +/*! \brief Class interface for cl_kernel. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_kernel as the original. For details, see + * clRetainKernel() and clReleaseKernel(). + * + * \see cl_kernel + */ +class Kernel : public detail::Wrapper +{ +public: + inline Kernel(const Program& program, const char* name, cl_int* err = NULL); + + //! 
\brief Default constructor - initializes to NULL. + Kernel() { } + + /*! \brief Constructor from cl_kernel - takes ownership. + * + * \param retainObject will cause the constructor to retain its cl object. + * Defaults to false to maintain compatibility with + * earlier versions. + * This effectively transfers ownership of a refcount on the cl_kernel + * into the new Kernel object. + */ + explicit Kernel(const cl_kernel& kernel, bool retainObject = false) : + detail::Wrapper(kernel, retainObject) { } + + /*! \brief Assignment operator from cl_kernel - takes ownership. + * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseKernel() on the value previously held by this instance. + */ + Kernel& operator = (const cl_kernel& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Kernel(const Kernel& kernel) : detail::Wrapper(kernel) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Kernel& operator = (const Kernel &kernel) + { + detail::Wrapper::operator=(kernel); + return *this; + } + + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Kernel(Kernel&& kernel) CL_HPP_NOEXCEPT_ : detail::Wrapper(std::move(kernel)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + Kernel& operator = (Kernel &&kernel) + { + detail::Wrapper::operator=(std::move(kernel)); + return *this; + } + + template + cl_int getInfo(cl_kernel_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetKernelInfo, object_, name, param), + __GET_KERNEL_INFO_ERR); + } + + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_kernel_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 + template + cl_int getArgInfo(cl_uint argIndex, cl_kernel_arg_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetKernelArgInfo, object_, argIndex, name, param), + __GET_KERNEL_ARG_INFO_ERR); + } + + template typename + detail::param_traits::param_type + getArgInfo(cl_uint argIndex, cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_kernel_arg_info, name>::param_type param; + cl_int result = getArgInfo(argIndex, name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 + + template + cl_int getWorkGroupInfo( + const Device& device, cl_kernel_work_group_info name, T* param) const + { + return detail::errHandler( + detail::getInfo( + &::clGetKernelWorkGroupInfo, object_, device(), name, param), + __GET_KERNEL_WORK_GROUP_INFO_ERR); + } + + template typename + detail::param_traits::param_type + getWorkGroupInfo(const Device& device, cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_kernel_work_group_info, name>::param_type param; + cl_int result = getWorkGroupInfo(device, name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 +#if defined(CL_HPP_USE_CL_SUB_GROUPS_KHR) + cl_int getSubGroupInfo(const cl::Device &dev, cl_kernel_sub_group_info name, const cl::NDRange &range, size_type* param) const + { + 
typedef clGetKernelSubGroupInfoKHR_fn PFN_clGetKernelSubGroupInfoKHR; + static PFN_clGetKernelSubGroupInfoKHR pfn_clGetKernelSubGroupInfoKHR = NULL; + CL_HPP_INIT_CL_EXT_FCN_PTR_(clGetKernelSubGroupInfoKHR); + + return detail::errHandler( + pfn_clGetKernelSubGroupInfoKHR(object_, dev(), name, range.size(), range.get(), sizeof(size_type), param, nullptr), + __GET_KERNEL_ARG_INFO_ERR); + } + + template + size_type getSubGroupInfo(const cl::Device &dev, const cl::NDRange &range, cl_int* err = NULL) const + { + size_type param; + cl_int result = getSubGroupInfo(dev, name, range, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } +#endif // #if defined(CL_HPP_USE_CL_SUB_GROUPS_KHR) +#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 + /*! \brief setArg overload taking a shared_ptr type + */ + template + cl_int setArg(cl_uint index, const cl::pointer &argPtr) + { + return detail::errHandler( + ::clSetKernelArgSVMPointer(object_, index, argPtr.get()), + __SET_KERNEL_ARGS_ERR); + } + + /*! \brief setArg overload taking a vector type. + */ + template + cl_int setArg(cl_uint index, const cl::vector &argPtr) + { + return detail::errHandler( + ::clSetKernelArgSVMPointer(object_, index, argPtr.data()), + __SET_KERNEL_ARGS_ERR); + } + + /*! \brief setArg overload taking a pointer type + */ + template + typename std::enable_if::value, cl_int>::type + setArg(cl_uint index, const T argPtr) + { + return detail::errHandler( + ::clSetKernelArgSVMPointer(object_, index, argPtr), + __SET_KERNEL_ARGS_ERR); + } +#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 + + /*! \brief setArg overload taking a POD type + */ + template + typename std::enable_if::value, cl_int>::type + setArg(cl_uint index, const T &value) + { + return detail::errHandler( + ::clSetKernelArg( + object_, + index, + detail::KernelArgumentHandler::size(value), + detail::KernelArgumentHandler::ptr(value)), + __SET_KERNEL_ARGS_ERR); + } + + cl_int setArg(cl_uint index, size_type size, const void* argPtr) + { + return detail::errHandler( + ::clSetKernelArg(object_, index, size, argPtr), + __SET_KERNEL_ARGS_ERR); + } + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 + /*! + * Specify a vector of SVM pointers that the kernel may access in + * addition to its arguments. + */ + cl_int setSVMPointers(const vector &pointerList) + { + return detail::errHandler( + ::clSetKernelExecInfo( + object_, + CL_KERNEL_EXEC_INFO_SVM_PTRS, + sizeof(void*)*pointerList.size(), + pointerList.data())); + } + + /*! + * Specify a std::array of SVM pointers that the kernel may access in + * addition to its arguments. + */ + template + cl_int setSVMPointers(const std::array &pointerList) + { + return detail::errHandler( + ::clSetKernelExecInfo( + object_, + CL_KERNEL_EXEC_INFO_SVM_PTRS, + sizeof(void*)*pointerList.size(), + pointerList.data())); + } + + /*! \brief Enable fine-grained system SVM. + * + * \note It is only possible to enable fine-grained system SVM if all devices + * in the context associated with kernel support it. + * + * \param svmEnabled True if fine-grained system SVM is requested. False otherwise. + * \return CL_SUCCESS if the function was executed succesfully. CL_INVALID_OPERATION + * if no devices in the context support fine-grained system SVM. + * + * \see clSetKernelExecInfo + */ + cl_int enableFineGrainedSystemSVM(bool svmEnabled) + { + cl_bool svmEnabled_ = svmEnabled ? 
CL_TRUE : CL_FALSE; + return detail::errHandler( + ::clSetKernelExecInfo( + object_, + CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM, + sizeof(cl_bool), + &svmEnabled_ + ) + ); + } + + template + void setSVMPointersHelper(std::array &pointerList, const pointer &t0, const pointer &t1, Ts & ... ts) + { + pointerList[index] = static_cast(t0.get()); + setSVMPointersHelper(pointerList, t1, ts...); + } + + template + typename std::enable_if::value, void>::type + setSVMPointersHelper(std::array &pointerList, T0 t0, T1 t1, Ts... ts) + { + pointerList[index] = static_cast(t0); + setSVMPointersHelper(pointerList, t1, ts...); + } + + template + void setSVMPointersHelper(std::array &pointerList, const pointer &t0) + { + pointerList[index] = static_cast(t0.get()); + } + + + template + typename std::enable_if::value, void>::type + setSVMPointersHelper(std::array &pointerList, T0 t0) + { + pointerList[index] = static_cast(t0); + } + + template + cl_int setSVMPointers(const T0 &t0, Ts & ... ts) + { + std::array pointerList; + + setSVMPointersHelper<0, 1 + sizeof...(Ts)>(pointerList, t0, ts...); + return detail::errHandler( + ::clSetKernelExecInfo( + object_, + CL_KERNEL_EXEC_INFO_SVM_PTRS, + sizeof(void*)*(1 + sizeof...(Ts)), + pointerList.data())); + } +#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 +}; + +/*! \class Program + * \brief Program interface that implements cl_program. + */ +class Program : public detail::Wrapper +{ +public: +#if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY) + typedef vector> Binaries; + typedef vector Sources; +#else // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY) + typedef vector > Binaries; + typedef vector > Sources; +#endif // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY) + + Program( + const string& source, + bool build = false, + cl_int* err = NULL) + { + cl_int error; + + const char * strings = source.c_str(); + const size_type length = source.size(); + + Context context = Context::getDefault(err); + + object_ = ::clCreateProgramWithSource( + context(), (cl_uint)1, &strings, &length, &error); + + detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); + + if (error == CL_SUCCESS && build) { + + error = ::clBuildProgram( + object_, + 0, + NULL, +#if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD) + "-cl-std=CL2.0", +#else + "", +#endif // #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD) + NULL, + NULL); + + detail::buildErrHandler(error, __BUILD_PROGRAM_ERR, getBuildInfo()); + } + + if (err != NULL) { + *err = error; + } + } + + Program( + const Context& context, + const string& source, + bool build = false, + cl_int* err = NULL) + { + cl_int error; + + const char * strings = source.c_str(); + const size_type length = source.size(); + + object_ = ::clCreateProgramWithSource( + context(), (cl_uint)1, &strings, &length, &error); + + detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); + + if (error == CL_SUCCESS && build) { + error = ::clBuildProgram( + object_, + 0, + NULL, +#if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD) + "-cl-std=CL2.0", +#else + "", +#endif // #if !defined(CL_HPP_CL_1_2_DEFAULT_BUILD) + NULL, + NULL); + + detail::buildErrHandler(error, __BUILD_PROGRAM_ERR, getBuildInfo()); + } + + if (err != NULL) { + *err = error; + } + } + + /** + * Create a program from a vector of source strings and the default context. + * Does not compile or link the program. 
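+ *
+ * A minimal usage sketch (illustrative only; the kernel source, the
+ * variable names and the separate build() call are assumptions, not
+ * part of this constructor's contract):
+ * \code
+ * cl::Program::Sources sources;
+ * sources.push_back("kernel void noop() { }");
+ * cl_int err;
+ * cl::Program program(sources, &err);  // uses the default context
+ * if (err == CL_SUCCESS) {
+ *     err = program.build();           // compile and link explicitly
+ * }
+ * \endcode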
+ */ + Program( + const Sources& sources, + cl_int* err = NULL) + { + cl_int error; + Context context = Context::getDefault(err); + + const size_type n = (size_type)sources.size(); + + vector lengths(n); + vector strings(n); + + for (size_type i = 0; i < n; ++i) { +#if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY) + strings[i] = sources[(int)i].data(); + lengths[i] = sources[(int)i].length(); +#else // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY) + strings[i] = sources[(int)i].first; + lengths[i] = sources[(int)i].second; +#endif // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY) + } + + object_ = ::clCreateProgramWithSource( + context(), (cl_uint)n, strings.data(), lengths.data(), &error); + + detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); + if (err != NULL) { + *err = error; + } + } + + /** + * Create a program from a vector of source strings and a provided context. + * Does not compile or link the program. + */ + Program( + const Context& context, + const Sources& sources, + cl_int* err = NULL) + { + cl_int error; + + const size_type n = (size_type)sources.size(); + + vector lengths(n); + vector strings(n); + + for (size_type i = 0; i < n; ++i) { +#if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY) + strings[i] = sources[(int)i].data(); + lengths[i] = sources[(int)i].length(); +#else // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY) + strings[i] = sources[(int)i].first; + lengths[i] = sources[(int)i].second; +#endif // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY) + } + + object_ = ::clCreateProgramWithSource( + context(), (cl_uint)n, strings.data(), lengths.data(), &error); + + detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); + if (err != NULL) { + *err = error; + } + } + + /** + * Construct a program object from a list of devices and a per-device list of binaries. + * \param context A valid OpenCL context in which to construct the program. + * \param devices A vector of OpenCL device objects for which the program will be created. + * \param binaries A vector of pairs of a pointer to a binary object and its length. + * \param binaryStatus An optional vector that on completion will be resized to + * match the size of binaries and filled with values to specify if each binary + * was successfully loaded. + * Set to CL_SUCCESS if the binary was successfully loaded. + * Set to CL_INVALID_VALUE if the length is 0 or the binary pointer is NULL. + * Set to CL_INVALID_BINARY if the binary provided is not valid for the matching device. + * \param err if non-NULL will be set to CL_SUCCESS on successful operation or one of the following errors: + * CL_INVALID_CONTEXT if context is not a valid context. + * CL_INVALID_VALUE if the length of devices is zero; or if the length of binaries does not match the length of devices; + * or if any entry in binaries is NULL or has length 0. + * CL_INVALID_DEVICE if OpenCL devices listed in devices are not in the list of devices associated with context. + * CL_INVALID_BINARY if an invalid program binary was encountered for any device. binaryStatus will return specific status for each device. + * CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host. 
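+ *
+ * A hedged sketch of round-tripping a binary through this constructor
+ * ("context", "devices" and the prebuilt program "built" are assumed
+ * to exist; the names are illustrative):
+ * \code
+ * cl::Program::Binaries bins = built.getInfo<CL_PROGRAM_BINARIES>();
+ * std::vector<cl_int> binaryStatus;
+ * cl_int err;
+ * cl::Program fromBinary(context, devices, bins, &binaryStatus, &err);
+ * \endcode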
+ */ + Program( + const Context& context, + const vector& devices, + const Binaries& binaries, + vector* binaryStatus = NULL, + cl_int* err = NULL) + { + cl_int error; + + const size_type numDevices = devices.size(); + + // Catch size mismatch early and return + if(binaries.size() != numDevices) { + error = CL_INVALID_VALUE; + detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR); + if (err != NULL) { + *err = error; + } + return; + } + + + vector lengths(numDevices); + vector images(numDevices); +#if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY) + for (size_type i = 0; i < numDevices; ++i) { + images[i] = binaries[i].data(); + lengths[i] = binaries[(int)i].size(); + } +#else // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY) + for (size_type i = 0; i < numDevices; ++i) { + images[i] = (const unsigned char*)binaries[i].first; + lengths[i] = binaries[(int)i].second; + } +#endif // #if !defined(CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY) + + vector deviceIDs(numDevices); + for( size_type deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) { + deviceIDs[deviceIndex] = (devices[deviceIndex])(); + } + + if(binaryStatus) { + binaryStatus->resize(numDevices); + } + + object_ = ::clCreateProgramWithBinary( + context(), (cl_uint) devices.size(), + deviceIDs.data(), + lengths.data(), images.data(), (binaryStatus != NULL && numDevices > 0) + ? &binaryStatus->front() + : NULL, &error); + + detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR); + if (err != NULL) { + *err = error; + } + } + + +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 + /** + * Create program using builtin kernels. + * \param kernelNames Semi-colon separated list of builtin kernel names + */ + Program( + const Context& context, + const vector& devices, + const string& kernelNames, + cl_int* err = NULL) + { + cl_int error; + + + size_type numDevices = devices.size(); + vector deviceIDs(numDevices); + for( size_type deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) { + deviceIDs[deviceIndex] = (devices[deviceIndex])(); + } + + object_ = ::clCreateProgramWithBuiltInKernels( + context(), + (cl_uint) devices.size(), + deviceIDs.data(), + kernelNames.c_str(), + &error); + + detail::errHandler(error, __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 + + Program() { } + + + /*! \brief Constructor from cl_mem - takes ownership. + * + * \param retainObject will cause the constructor to retain its cl object. + * Defaults to false to maintain compatibility with + * earlier versions. + */ + explicit Program(const cl_program& program, bool retainObject = false) : + detail::Wrapper(program, retainObject) { } + + Program& operator = (const cl_program& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + Program(const Program& program) : detail::Wrapper(program) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + Program& operator = (const Program &program) + { + detail::Wrapper::operator=(program); + return *this; + } + + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + Program(Program&& program) CL_HPP_NOEXCEPT_ : detail::Wrapper(std::move(program)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. 
+ */ + Program& operator = (Program &&program) + { + detail::Wrapper::operator=(std::move(program)); + return *this; + } + + cl_int build( + const vector& devices, + const char* options = NULL, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, + void* data = NULL) const + { + size_type numDevices = devices.size(); + vector deviceIDs(numDevices); + + for( size_type deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) { + deviceIDs[deviceIndex] = (devices[deviceIndex])(); + } + + cl_int buildError = ::clBuildProgram( + object_, + (cl_uint) + devices.size(), + deviceIDs.data(), + options, + notifyFptr, + data); + + return detail::buildErrHandler(buildError, __BUILD_PROGRAM_ERR, getBuildInfo()); + } + + cl_int build( + const char* options = NULL, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, + void* data = NULL) const + { + cl_int buildError = ::clBuildProgram( + object_, + 0, + NULL, + options, + notifyFptr, + data); + + + return detail::buildErrHandler(buildError, __BUILD_PROGRAM_ERR, getBuildInfo()); + } + +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 + cl_int compile( + const char* options = NULL, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, + void* data = NULL) const + { + cl_int error = ::clCompileProgram( + object_, + 0, + NULL, + options, + 0, + NULL, + NULL, + notifyFptr, + data); + return detail::buildErrHandler(error, __COMPILE_PROGRAM_ERR, getBuildInfo()); + } +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 + + template + cl_int getInfo(cl_program_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetProgramInfo, object_, name, param), + __GET_PROGRAM_INFO_ERR); + } + + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_program_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + template + cl_int getBuildInfo( + const Device& device, cl_program_build_info name, T* param) const + { + return detail::errHandler( + detail::getInfo( + &::clGetProgramBuildInfo, object_, device(), name, param), + __GET_PROGRAM_BUILD_INFO_ERR); + } + + template typename + detail::param_traits::param_type + getBuildInfo(const Device& device, cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_program_build_info, name>::param_type param; + cl_int result = getBuildInfo(device, name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + /** + * Build info function that returns a vector of device/info pairs for the specified + * info type and for all devices in the program. + * On an error reading the info for any device, an empty vector of info will be returned. 
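+ *
+ * For example (an illustrative sketch; "program" is assumed to have
+ * been built already), printing the build log of every device:
+ * \code
+ * cl_int logErr;
+ * auto logs = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(&logErr);
+ * for (const auto &devLog : logs) {
+ *     std::cerr << devLog.second << std::endl;  // one log per device
+ * }
+ * \endcode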
+ */ + template + vector::param_type>> + getBuildInfo(cl_int *err = NULL) const + { + cl_int result = CL_SUCCESS; + + auto devs = getInfo(&result); + vector::param_type>> + devInfo; + + // If there was an initial error from getInfo return the error + if (result != CL_SUCCESS) { + if (err != NULL) { + *err = result; + } + return devInfo; + } + + for (const cl::Device &d : devs) { + typename detail::param_traits< + detail::cl_program_build_info, name>::param_type param; + result = getBuildInfo(d, name, ¶m); + devInfo.push_back( + std::pair::param_type> + (d, param)); + if (result != CL_SUCCESS) { + // On error, leave the loop and return the error code + break; + } + } + if (err != NULL) { + *err = result; + } + if (result != CL_SUCCESS) { + devInfo.clear(); + } + return devInfo; + } + + cl_int createKernels(vector* kernels) + { + cl_uint numKernels; + cl_int err = ::clCreateKernelsInProgram(object_, 0, NULL, &numKernels); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR); + } + + vector value(numKernels); + + err = ::clCreateKernelsInProgram( + object_, numKernels, value.data(), NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR); + } + + if (kernels) { + kernels->resize(value.size()); + + // Assign to param, constructing with retain behaviour + // to correctly capture each underlying CL object + for (size_type i = 0; i < value.size(); i++) { + // We do not need to retain because this kernel is being created + // by the runtime + (*kernels)[i] = Kernel(value[i], false); + } + } + return CL_SUCCESS; + } +}; + +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 +inline Program linkProgram( + Program input1, + Program input2, + const char* options = NULL, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, + void* data = NULL, + cl_int* err = NULL) +{ + cl_int error_local = CL_SUCCESS; + + cl_program programs[2] = { input1(), input2() }; + + Context ctx = input1.getInfo(&error_local); + if(error_local!=CL_SUCCESS) { + detail::errHandler(error_local, __LINK_PROGRAM_ERR); + } + + cl_program prog = ::clLinkProgram( + ctx(), + 0, + NULL, + options, + 2, + programs, + notifyFptr, + data, + &error_local); + + detail::errHandler(error_local,__COMPILE_PROGRAM_ERR); + if (err != NULL) { + *err = error_local; + } + + return Program(prog); +} + +inline Program linkProgram( + vector inputPrograms, + const char* options = NULL, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, + void* data = NULL, + cl_int* err = NULL) +{ + cl_int error_local = CL_SUCCESS; + + vector programs(inputPrograms.size()); + + for (unsigned int i = 0; i < inputPrograms.size(); i++) { + programs[i] = inputPrograms[i](); + } + + Context ctx; + if(inputPrograms.size() > 0) { + ctx = inputPrograms[0].getInfo(&error_local); + if(error_local!=CL_SUCCESS) { + detail::errHandler(error_local, __LINK_PROGRAM_ERR); + } + } + cl_program prog = ::clLinkProgram( + ctx(), + 0, + NULL, + options, + (cl_uint)inputPrograms.size(), + programs.data(), + notifyFptr, + data, + &error_local); + + detail::errHandler(error_local,__COMPILE_PROGRAM_ERR); + if (err != NULL) { + *err = error_local; + } + + return Program(prog, false); +} +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 + +// Template specialization for CL_PROGRAM_BINARIES +template <> +inline cl_int cl::Program::getInfo(cl_program_info name, vector>* param) const +{ + if (name != CL_PROGRAM_BINARIES) { + return CL_INVALID_VALUE; + } + if (param) { + // Resize the parameter array appropriately 
for each allocation + // and pass down to the helper + + vector sizes = getInfo(); + size_type numBinaries = sizes.size(); + + // Resize the parameter array and constituent arrays + param->resize(numBinaries); + for (size_type i = 0; i < numBinaries; ++i) { + (*param)[i].resize(sizes[i]); + } + + return detail::errHandler( + detail::getInfo(&::clGetProgramInfo, object_, name, param), + __GET_PROGRAM_INFO_ERR); + } + + return CL_SUCCESS; +} + +template<> +inline vector> cl::Program::getInfo(cl_int* err) const +{ + vector> binariesVectors; + + cl_int result = getInfo(CL_PROGRAM_BINARIES, &binariesVectors); + if (err != NULL) { + *err = result; + } + return binariesVectors; +} + +inline Kernel::Kernel(const Program& program, const char* name, cl_int* err) +{ + cl_int error; + + object_ = ::clCreateKernel(program(), name, &error); + detail::errHandler(error, __CREATE_KERNEL_ERR); + + if (err != NULL) { + *err = error; + } + +} + +enum class QueueProperties : cl_command_queue_properties +{ + None = 0, + Profiling = CL_QUEUE_PROFILING_ENABLE, + OutOfOrder = CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, +}; + +inline QueueProperties operator|(QueueProperties lhs, QueueProperties rhs) +{ + return static_cast(static_cast(lhs) | static_cast(rhs)); +} + +/*! \class CommandQueue + * \brief CommandQueue interface for cl_command_queue. + */ +class CommandQueue : public detail::Wrapper +{ +private: + static std::once_flag default_initialized_; + static CommandQueue default_; + static cl_int default_error_; + + /*! \brief Create the default command queue returned by @ref getDefault. + * + * It sets default_error_ to indicate success or failure. It does not throw + * @c cl::Error. + */ + static void makeDefault() + { + /* We don't want to throw an error from this function, so we have to + * catch and set the error flag. + */ +#if defined(CL_HPP_ENABLE_EXCEPTIONS) + try +#endif + { + int error; + Context context = Context::getDefault(&error); + + if (error != CL_SUCCESS) { + default_error_ = error; + } + else { + Device device = Device::getDefault(); + default_ = CommandQueue(context, device, 0, &default_error_); + } + } +#if defined(CL_HPP_ENABLE_EXCEPTIONS) + catch (cl::Error &e) { + default_error_ = e.err(); + } +#endif + } + + /*! \brief Create the default command queue. + * + * This sets @c default_. It does not throw + * @c cl::Error. + */ + static void makeDefaultProvided(const CommandQueue &c) { + default_ = c; + } + +public: +#ifdef CL_HPP_UNIT_TEST_ENABLE + /*! \brief Reset the default. + * + * This sets @c default_ to an empty value to support cleanup in + * the unit test framework. + * This function is not thread safe. + */ + static void unitTestClearDefault() { + default_ = CommandQueue(); + } +#endif // #ifdef CL_HPP_UNIT_TEST_ENABLE + + + /*! + * \brief Constructs a CommandQueue based on passed properties. + * Will return an CL_INVALID_QUEUE_PROPERTIES error if CL_QUEUE_ON_DEVICE is specified. 
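+ *
+ * For instance (a sketch; assumes a default context and device are
+ * available and that the device supports profiling):
+ * \code
+ * cl_int qErr;
+ * cl::CommandQueue queue(CL_QUEUE_PROFILING_ENABLE, &qErr);
+ * \endcode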
+ */ + CommandQueue( + cl_command_queue_properties properties, + cl_int* err = NULL) + { + cl_int error; + + Context context = Context::getDefault(&error); + detail::errHandler(error, __CREATE_CONTEXT_ERR); + + if (error != CL_SUCCESS) { + if (err != NULL) { + *err = error; + } + } + else { + Device device = context.getInfo()[0]; + bool useWithProperties; + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 && CL_HPP_MINIMUM_OPENCL_VERSION < 200 + // Run-time decision based on the actual platform + { + cl_uint version = detail::getContextPlatformVersion(context()); + useWithProperties = (version >= 0x20000); // OpenCL 2.0 or above + } +#elif CL_HPP_TARGET_OPENCL_VERSION >= 200 + useWithProperties = true; +#else + useWithProperties = false; +#endif + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 + if (useWithProperties) { + cl_queue_properties queue_properties[] = { + CL_QUEUE_PROPERTIES, properties, 0 }; + if ((properties & CL_QUEUE_ON_DEVICE) == 0) { + object_ = ::clCreateCommandQueueWithProperties( + context(), device(), queue_properties, &error); + } + else { + error = CL_INVALID_QUEUE_PROPERTIES; + } + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200 +#if CL_HPP_MINIMUM_OPENCL_VERSION < 200 + if (!useWithProperties) { + object_ = ::clCreateCommandQueue( + context(), device(), properties, &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200 + } + } + + /*! + * \brief Constructs a CommandQueue based on passed properties. + * Will return an CL_INVALID_QUEUE_PROPERTIES error if CL_QUEUE_ON_DEVICE is specified. + */ + CommandQueue( + QueueProperties properties, + cl_int* err = NULL) + { + cl_int error; + + Context context = Context::getDefault(&error); + detail::errHandler(error, __CREATE_CONTEXT_ERR); + + if (error != CL_SUCCESS) { + if (err != NULL) { + *err = error; + } + } + else { + Device device = context.getInfo()[0]; + bool useWithProperties; + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 && CL_HPP_MINIMUM_OPENCL_VERSION < 200 + // Run-time decision based on the actual platform + { + cl_uint version = detail::getContextPlatformVersion(context()); + useWithProperties = (version >= 0x20000); // OpenCL 2.0 or above + } +#elif CL_HPP_TARGET_OPENCL_VERSION >= 200 + useWithProperties = true; +#else + useWithProperties = false; +#endif + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 + if (useWithProperties) { + cl_queue_properties queue_properties[] = { + CL_QUEUE_PROPERTIES, static_cast(properties), 0 }; + + object_ = ::clCreateCommandQueueWithProperties( + context(), device(), queue_properties, &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200 +#if CL_HPP_MINIMUM_OPENCL_VERSION < 200 + if (!useWithProperties) { + object_ = ::clCreateCommandQueue( + context(), device(), static_cast(properties), &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200 + + } + } + + /*! + * \brief Constructs a CommandQueue for an implementation defined device in the given context + * Will return an CL_INVALID_QUEUE_PROPERTIES error if CL_QUEUE_ON_DEVICE is specified. 
+ */ + explicit CommandQueue( + const Context& context, + cl_command_queue_properties properties = 0, + cl_int* err = NULL) + { + cl_int error; + bool useWithProperties; + vector devices; + error = context.getInfo(CL_CONTEXT_DEVICES, &devices); + + detail::errHandler(error, __CREATE_CONTEXT_ERR); + + if (error != CL_SUCCESS) + { + if (err != NULL) { + *err = error; + } + return; + } + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 && CL_HPP_MINIMUM_OPENCL_VERSION < 200 + // Run-time decision based on the actual platform + { + cl_uint version = detail::getContextPlatformVersion(context()); + useWithProperties = (version >= 0x20000); // OpenCL 2.0 or above + } +#elif CL_HPP_TARGET_OPENCL_VERSION >= 200 + useWithProperties = true; +#else + useWithProperties = false; +#endif + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 + if (useWithProperties) { + cl_queue_properties queue_properties[] = { + CL_QUEUE_PROPERTIES, properties, 0 }; + if ((properties & CL_QUEUE_ON_DEVICE) == 0) { + object_ = ::clCreateCommandQueueWithProperties( + context(), devices[0](), queue_properties, &error); + } + else { + error = CL_INVALID_QUEUE_PROPERTIES; + } + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200 +#if CL_HPP_MINIMUM_OPENCL_VERSION < 200 + if (!useWithProperties) { + object_ = ::clCreateCommandQueue( + context(), devices[0](), properties, &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200 + } + + /*! + * \brief Constructs a CommandQueue for an implementation defined device in the given context + * Will return an CL_INVALID_QUEUE_PROPERTIES error if CL_QUEUE_ON_DEVICE is specified. + */ + explicit CommandQueue( + const Context& context, + QueueProperties properties, + cl_int* err = NULL) + { + cl_int error; + bool useWithProperties; + vector devices; + error = context.getInfo(CL_CONTEXT_DEVICES, &devices); + + detail::errHandler(error, __CREATE_CONTEXT_ERR); + + if (error != CL_SUCCESS) + { + if (err != NULL) { + *err = error; + } + return; + } + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 && CL_HPP_MINIMUM_OPENCL_VERSION < 200 + // Run-time decision based on the actual platform + { + cl_uint version = detail::getContextPlatformVersion(context()); + useWithProperties = (version >= 0x20000); // OpenCL 2.0 or above + } +#elif CL_HPP_TARGET_OPENCL_VERSION >= 200 + useWithProperties = true; +#else + useWithProperties = false; +#endif + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 + if (useWithProperties) { + cl_queue_properties queue_properties[] = { + CL_QUEUE_PROPERTIES, static_cast(properties), 0 }; + object_ = ::clCreateCommandQueueWithProperties( + context(), devices[0](), queue_properties, &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200 +#if CL_HPP_MINIMUM_OPENCL_VERSION < 200 + if (!useWithProperties) { + object_ = ::clCreateCommandQueue( + context(), devices[0](), static_cast(properties), &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200 + } + + /*! + * \brief Constructs a CommandQueue for a passed device and context + * Will return an CL_INVALID_QUEUE_PROPERTIES error if CL_QUEUE_ON_DEVICE is specified. 
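+ *
+ * A typical construction, shown as a sketch ("context" and "device"
+ * are assumed to exist):
+ * \code
+ * cl::CommandQueue queue(context, device);  // in-order queue, no properties
+ * \endcode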
+ */ + CommandQueue( + const Context& context, + const Device& device, + cl_command_queue_properties properties = 0, + cl_int* err = NULL) + { + cl_int error; + bool useWithProperties; + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 && CL_HPP_MINIMUM_OPENCL_VERSION < 200 + // Run-time decision based on the actual platform + { + cl_uint version = detail::getContextPlatformVersion(context()); + useWithProperties = (version >= 0x20000); // OpenCL 2.0 or above + } +#elif CL_HPP_TARGET_OPENCL_VERSION >= 200 + useWithProperties = true; +#else + useWithProperties = false; +#endif + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 + if (useWithProperties) { + cl_queue_properties queue_properties[] = { + CL_QUEUE_PROPERTIES, properties, 0 }; + object_ = ::clCreateCommandQueueWithProperties( + context(), device(), queue_properties, &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200 +#if CL_HPP_MINIMUM_OPENCL_VERSION < 200 + if (!useWithProperties) { + object_ = ::clCreateCommandQueue( + context(), device(), properties, &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200 + } + + /*! + * \brief Constructs a CommandQueue for a passed device and context + * Will return an CL_INVALID_QUEUE_PROPERTIES error if CL_QUEUE_ON_DEVICE is specified. + */ + CommandQueue( + const Context& context, + const Device& device, + QueueProperties properties, + cl_int* err = NULL) + { + cl_int error; + bool useWithProperties; + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 && CL_HPP_MINIMUM_OPENCL_VERSION < 200 + // Run-time decision based on the actual platform + { + cl_uint version = detail::getContextPlatformVersion(context()); + useWithProperties = (version >= 0x20000); // OpenCL 2.0 or above + } +#elif CL_HPP_TARGET_OPENCL_VERSION >= 200 + useWithProperties = true; +#else + useWithProperties = false; +#endif + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 + if (useWithProperties) { + cl_queue_properties queue_properties[] = { + CL_QUEUE_PROPERTIES, static_cast(properties), 0 }; + object_ = ::clCreateCommandQueueWithProperties( + context(), device(), queue_properties, &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200 +#if CL_HPP_MINIMUM_OPENCL_VERSION < 200 + if (!useWithProperties) { + object_ = ::clCreateCommandQueue( + context(), device(), static_cast(properties), &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // CL_HPP_MINIMUM_OPENCL_VERSION < 200 + } + + static CommandQueue getDefault(cl_int * err = NULL) + { + std::call_once(default_initialized_, makeDefault); +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 + detail::errHandler(default_error_, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); +#else // CL_HPP_TARGET_OPENCL_VERSION >= 200 + detail::errHandler(default_error_, __CREATE_COMMAND_QUEUE_ERR); +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 200 + if (err != NULL) { + *err = default_error_; + } + return default_; + } + + /** + * Modify the default command queue to be used by + * subsequent operations. + * Will only set the default if no default was previously created. + * @return updated default command queue. + * Should be compared to the passed value to ensure that it was updated. 
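+ *
+ * For example (a sketch; "context" and "device" are assumptions):
+ * \code
+ * cl::CommandQueue mine(context, device);
+ * cl::CommandQueue actual = cl::CommandQueue::setDefault(mine);
+ * if (actual() != mine()) {
+ *     // a default already existed, so ours was not installed
+ * }
+ * \endcode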
+ */ + static CommandQueue setDefault(const CommandQueue &default_queue) + { + std::call_once(default_initialized_, makeDefaultProvided, std::cref(default_queue)); + detail::errHandler(default_error_); + return default_; + } + + CommandQueue() { } + + + /*! \brief Constructor from cl_mem - takes ownership. + * + * \param retainObject will cause the constructor to retain its cl object. + * Defaults to false to maintain compatibility with + * earlier versions. + */ + explicit CommandQueue(const cl_command_queue& commandQueue, bool retainObject = false) : + detail::Wrapper(commandQueue, retainObject) { } + + CommandQueue& operator = (const cl_command_queue& rhs) + { + detail::Wrapper::operator=(rhs); + return *this; + } + + /*! \brief Copy constructor to forward copy to the superclass correctly. + * Required for MSVC. + */ + CommandQueue(const CommandQueue& queue) : detail::Wrapper(queue) {} + + /*! \brief Copy assignment to forward copy to the superclass correctly. + * Required for MSVC. + */ + CommandQueue& operator = (const CommandQueue &queue) + { + detail::Wrapper::operator=(queue); + return *this; + } + + /*! \brief Move constructor to forward move to the superclass correctly. + * Required for MSVC. + */ + CommandQueue(CommandQueue&& queue) CL_HPP_NOEXCEPT_ : detail::Wrapper(std::move(queue)) {} + + /*! \brief Move assignment to forward move to the superclass correctly. + * Required for MSVC. + */ + CommandQueue& operator = (CommandQueue &&queue) + { + detail::Wrapper::operator=(std::move(queue)); + return *this; + } + + template + cl_int getInfo(cl_command_queue_info name, T* param) const + { + return detail::errHandler( + detail::getInfo( + &::clGetCommandQueueInfo, object_, name, param), + __GET_COMMAND_QUEUE_INFO_ERR); + } + + template typename + detail::param_traits::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_command_queue_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + cl_int enqueueReadBuffer( + const Buffer& buffer, + cl_bool blocking, + size_type offset, + size_type size, + void* ptr, + const vector* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueReadBuffer( + object_, buffer(), blocking, offset, size, + ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_READ_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueWriteBuffer( + const Buffer& buffer, + cl_bool blocking, + size_type offset, + size_type size, + const void* ptr, + const vector* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueWriteBuffer( + object_, buffer(), blocking, offset, size, + ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? 
&tmp : NULL), + __ENQUEUE_WRITE_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueCopyBuffer( + const Buffer& src, + const Buffer& dst, + size_type src_offset, + size_type dst_offset, + size_type size, + const vector* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyBuffer( + object_, src(), dst(), src_offset, dst_offset, size, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQEUE_COPY_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueReadBufferRect( + const Buffer& buffer, + cl_bool blocking, + const array& buffer_offset, + const array& host_offset, + const array& region, + size_type buffer_row_pitch, + size_type buffer_slice_pitch, + size_type host_row_pitch, + size_type host_slice_pitch, + void *ptr, + const vector* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueReadBufferRect( + object_, + buffer(), + blocking, + buffer_offset.data(), + host_offset.data(), + region.data(), + buffer_row_pitch, + buffer_slice_pitch, + host_row_pitch, + host_slice_pitch, + ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_READ_BUFFER_RECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueWriteBufferRect( + const Buffer& buffer, + cl_bool blocking, + const array& buffer_offset, + const array& host_offset, + const array& region, + size_type buffer_row_pitch, + size_type buffer_slice_pitch, + size_type host_row_pitch, + size_type host_slice_pitch, + const void *ptr, + const vector* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueWriteBufferRect( + object_, + buffer(), + blocking, + buffer_offset.data(), + host_offset.data(), + region.data(), + buffer_row_pitch, + buffer_slice_pitch, + host_row_pitch, + host_slice_pitch, + ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_WRITE_BUFFER_RECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueCopyBufferRect( + const Buffer& src, + const Buffer& dst, + const array& src_origin, + const array& dst_origin, + const array& region, + size_type src_row_pitch, + size_type src_slice_pitch, + size_type dst_row_pitch, + size_type dst_slice_pitch, + const vector* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyBufferRect( + object_, + src(), + dst(), + src_origin.data(), + dst_origin.data(), + region.data(), + src_row_pitch, + src_slice_pitch, + dst_row_pitch, + dst_slice_pitch, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQEUE_COPY_BUFFER_RECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 + /** + * Enqueue a command to fill a buffer object with a pattern + * of a given size. 
The pattern is specified as a vector type. + * \tparam PatternType The datatype of the pattern field. + * The pattern type must be an accepted OpenCL data type. + * \tparam offset Is the offset in bytes into the buffer at + * which to start filling. This must be a multiple of + * the pattern size. + * \tparam size Is the size in bytes of the region to fill. + * This must be a multiple of the pattern size. + */ + template + cl_int enqueueFillBuffer( + const Buffer& buffer, + PatternType pattern, + size_type offset, + size_type size, + const vector* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillBuffer( + object_, + buffer(), + static_cast(&pattern), + sizeof(PatternType), + offset, + size, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_FILL_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 + + cl_int enqueueReadImage( + const Image& image, + cl_bool blocking, + const array& origin, + const array& region, + size_type row_pitch, + size_type slice_pitch, + void* ptr, + const vector* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueReadImage( + object_, + image(), + blocking, + origin.data(), + region.data(), + row_pitch, + slice_pitch, + ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_READ_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueWriteImage( + const Image& image, + cl_bool blocking, + const array& origin, + const array& region, + size_type row_pitch, + size_type slice_pitch, + const void* ptr, + const vector* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueWriteImage( + object_, + image(), + blocking, + origin.data(), + region.data(), + row_pitch, + slice_pitch, + ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_WRITE_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueCopyImage( + const Image& src, + const Image& dst, + const array& src_origin, + const array& dst_origin, + const array& region, + const vector* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyImage( + object_, + src(), + dst(), + src_origin.data(), + dst_origin.data(), + region.data(), + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_COPY_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 + /** + * Enqueue a command to fill an image object with a specified color. + * \param fillColor is the color to use to fill the image. + * This is a four component RGBA floating-point color value if + * the image channel data type is not an unnormalized signed or + * unsigned data type. 
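+ *
+ * For example (illustrative; "queue" and a CL_FLOAT "image" of the
+ * given width and height are assumptions):
+ * \code
+ * cl::size_type w = 640, h = 480;
+ * cl_float4 opaqueBlack = {{0.0f, 0.0f, 0.0f, 1.0f}};
+ * queue.enqueueFillImage(image, opaqueBlack,
+ *                        {0, 0, 0},   // origin
+ *                        {w, h, 1});  // region
+ * \endcode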
+ */ + cl_int enqueueFillImage( + const Image& image, + cl_float4 fillColor, + const array& origin, + const array& region, + const vector* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillImage( + object_, + image(), + static_cast(&fillColor), + origin.data(), + region.data(), + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_FILL_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * Enqueue a command to fill an image object with a specified color. + * \param fillColor is the color to use to fill the image. + * This is a four component RGBA signed integer color value if + * the image channel data type is an unnormalized signed integer + * type. + */ + cl_int enqueueFillImage( + const Image& image, + cl_int4 fillColor, + const array& origin, + const array& region, + const vector* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillImage( + object_, + image(), + static_cast(&fillColor), + origin.data(), + region.data(), + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_FILL_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * Enqueue a command to fill an image object with a specified color. + * \param fillColor is the color to use to fill the image. + * This is a four component RGBA unsigned integer color value if + * the image channel data type is an unnormalized unsigned integer + * type. + */ + cl_int enqueueFillImage( + const Image& image, + cl_uint4 fillColor, + const array& origin, + const array& region, + const vector* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillImage( + object_, + image(), + static_cast(&fillColor), + origin.data(), + region.data(), + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_FILL_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 + + cl_int enqueueCopyImageToBuffer( + const Image& src, + const Buffer& dst, + const array& src_origin, + const array& region, + size_type dst_offset, + const vector* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyImageToBuffer( + object_, + src(), + dst(), + src_origin.data(), + region.data(), + dst_offset, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueCopyBufferToImage( + const Buffer& src, + const Image& dst, + size_type src_offset, + const array& dst_origin, + const array& region, + const vector* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyBufferToImage( + object_, + src(), + dst(), + src_offset, + dst_origin.data(), + region.data(), + (events != NULL) ? 
(cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + void* enqueueMapBuffer( + const Buffer& buffer, + cl_bool blocking, + cl_map_flags flags, + size_type offset, + size_type size, + const vector* events = NULL, + Event* event = NULL, + cl_int* err = NULL) const + { + cl_event tmp; + cl_int error; + void * result = ::clEnqueueMapBuffer( + object_, buffer(), blocking, flags, offset, size, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL, + &error); + + detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + if (event != NULL && error == CL_SUCCESS) + *event = tmp; + + return result; + } + + void* enqueueMapImage( + const Image& buffer, + cl_bool blocking, + cl_map_flags flags, + const array& origin, + const array& region, + size_type * row_pitch, + size_type * slice_pitch, + const vector* events = NULL, + Event* event = NULL, + cl_int* err = NULL) const + { + cl_event tmp; + cl_int error; + void * result = ::clEnqueueMapImage( + object_, buffer(), blocking, flags, + origin.data(), + region.data(), + row_pitch, slice_pitch, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL, + &error); + + detail::errHandler(error, __ENQUEUE_MAP_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + if (event != NULL && error == CL_SUCCESS) + *event = tmp; + return result; + } + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 + /** + * Enqueues a command that will allow the host to update a region of a coarse-grained SVM buffer. + * This variant takes a raw SVM pointer. + */ + template + cl_int enqueueMapSVM( + T* ptr, + cl_bool blocking, + cl_map_flags flags, + size_type size, + const vector* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler(::clEnqueueSVMMap( + object_, blocking, flags, static_cast(ptr), size, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_MAP_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + + /** + * Enqueues a command that will allow the host to update a region of a coarse-grained SVM buffer. + * This variant takes a cl::pointer instance. + */ + template + cl_int enqueueMapSVM( + cl::pointer &ptr, + cl_bool blocking, + cl_map_flags flags, + size_type size, + const vector* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler(::clEnqueueSVMMap( + object_, blocking, flags, static_cast(ptr.get()), size, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_MAP_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * Enqueues a command that will allow the host to update a region of a coarse-grained SVM buffer. + * This variant takes a cl::vector instance. 
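+ *
+ * A hedged end-to-end sketch (assumes the default context supports
+ * coarse-grained SVM; the allocator and container choices are
+ * illustrative):
+ * \code
+ * cl::vector<int, cl::SVMAllocator<int, cl::SVMTraitCoarse<>>> data(1024);
+ * queue.enqueueMapSVM(data, CL_TRUE, CL_MAP_WRITE);  // map for host writes
+ * data[0] = 42;                                      // host updates the region
+ * queue.enqueueUnmapSVM(data);                       // hand back to the device
+ * \endcode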
+ */ + template + cl_int enqueueMapSVM( + cl::vector &container, + cl_bool blocking, + cl_map_flags flags, + const vector* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler(::clEnqueueSVMMap( + object_, blocking, flags, static_cast(container.data()), container.size(), + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_MAP_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 + + cl_int enqueueUnmapMemObject( + const Memory& memory, + void* mapped_ptr, + const vector* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueUnmapMemObject( + object_, memory(), mapped_ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_UNMAP_MEM_OBJECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 + /** + * Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime. + * This variant takes a raw SVM pointer. + */ + template + cl_int enqueueUnmapSVM( + T* ptr, + const vector* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueSVMUnmap( + object_, static_cast(ptr), + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_UNMAP_MEM_OBJECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime. + * This variant takes a cl::pointer instance. + */ + template + cl_int enqueueUnmapSVM( + cl::pointer &ptr, + const vector* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueSVMUnmap( + object_, static_cast(ptr.get()), + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_UNMAP_MEM_OBJECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * Enqueues a command that will release a coarse-grained SVM buffer back to the OpenCL runtime. + * This variant takes a cl::vector instance. + */ + template + cl_int enqueueUnmapSVM( + cl::vector &container, + const vector* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueSVMUnmap( + object_, static_cast(container.data()), + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_UNMAP_MEM_OBJECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 + +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 + /** + * Enqueues a marker command which waits for either a list of events to complete, + * or all previously enqueued commands to complete. 
+ *
+ * Enqueues a marker command which waits for either a list of events to complete,
+ * or if the list is empty it waits for all commands previously enqueued in command_queue
+ * to complete before it completes. This command returns an event which can be waited on,
+ * i.e. this event can be waited on to ensure that all events either in the event_wait_list
+ * or all previously enqueued commands, queued before this command to command_queue,
+ * have completed.
+ */
+ cl_int enqueueMarkerWithWaitList(
+ const vector<Event> *events = 0,
+ Event *event = 0) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueMarkerWithWaitList(
+ object_,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_MARKER_WAIT_LIST_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ /**
+ * A synchronization point that enqueues a barrier operation.
+ *
+ * Enqueues a barrier command which waits for either a list of events to complete,
+ * or if the list is empty it waits for all commands previously enqueued in command_queue
+ * to complete before it completes. This command blocks command execution, that is, any
+ * following commands enqueued after it do not execute until it completes. This command
+ * returns an event which can be waited on, i.e. this event can be waited on to ensure that
+ * all events either in the event_wait_list or all previously enqueued commands, queued
+ * before this command to command_queue, have completed.
+ */
+ cl_int enqueueBarrierWithWaitList(
+ const vector<Event> *events = 0,
+ Event *event = 0) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueBarrierWithWaitList(
+ object_,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_BARRIER_WAIT_LIST_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+
+ /**
+ * Enqueues a command to indicate with which device a set of memory objects
+ * should be associated.
+ */
+ cl_int enqueueMigrateMemObjects(
+ const vector<Memory> &memObjects,
+ cl_mem_migration_flags flags,
+ const vector<Event>* events = NULL,
+ Event* event = NULL
+ ) const
+ {
+ cl_event tmp;
+
+ vector<cl_mem> localMemObjects(memObjects.size());
+
+ for( int i = 0; i < (int)memObjects.size(); ++i ) {
+ localMemObjects[i] = memObjects[i]();
+ }
+
+ cl_int err = detail::errHandler(
+ ::clEnqueueMigrateMemObjects(
+ object_,
+ (cl_uint)memObjects.size(),
+ localMemObjects.data(),
+ flags,
+ (events != NULL) ? (cl_uint) events->size() : 0,
+ (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+ (event != NULL) ? &tmp : NULL),
+ __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+
+ if (event != NULL && err == CL_SUCCESS)
+ *event = tmp;
+
+ return err;
+ }
+#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120
+
+ cl_int enqueueNDRangeKernel(
+ const Kernel& kernel,
+ const NDRange& offset,
+ const NDRange& global,
+ const NDRange& local = NullRange,
+ const vector<Event>* events = NULL,
+ Event* event = NULL) const
+ {
+ cl_event tmp;
+ cl_int err = detail::errHandler(
+ ::clEnqueueNDRangeKernel(
+ object_, kernel(), (cl_uint) global.dimensions(),
+ offset.dimensions() != 0 ? (const size_type*) offset : NULL,
+ (const size_type*) global,
+ local.dimensions() != 0 ? (const size_type*) local : NULL,
+ (events != NULL) ? 
(cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_NDRANGE_KERNEL_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) + CL_EXT_PREFIX__VERSION_1_2_DEPRECATED cl_int enqueueTask( + const Kernel& kernel, + const vector* events = NULL, + Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueTask( + object_, kernel(), + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_TASK_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) + + cl_int enqueueNativeKernel( + void (CL_CALLBACK *userFptr)(void *), + std::pair args, + const vector* mem_objects = NULL, + const vector* mem_locs = NULL, + const vector* events = NULL, + Event* event = NULL) const + { + size_type elements = 0; + if (mem_objects != NULL) { + elements = mem_objects->size(); + } + vector mems(elements); + for (unsigned int i = 0; i < elements; i++) { + mems[i] = ((*mem_objects)[i])(); + } + + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueNativeKernel( + object_, userFptr, args.first, args.second, + (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, + mems.data(), + (mem_locs != NULL && mem_locs->size() > 0) ? (const void **) &mem_locs->front() : NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_NATIVE_KERNEL); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +/** + * Deprecated APIs for 1.2 + */ +#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + cl_int enqueueMarker(Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueMarker( + object_, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_MARKER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + cl_int enqueueWaitForEvents(const vector& events) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + { + return detail::errHandler( + ::clEnqueueWaitForEvents( + object_, + (cl_uint) events.size(), + events.size() > 0 ? (const cl_event*) &events.front() : NULL), + __ENQUEUE_WAIT_FOR_EVENTS_ERR); + } +#endif // defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + + cl_int enqueueAcquireGLObjects( + const vector* mem_objects = NULL, + const vector* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueAcquireGLObjects( + object_, + (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, + (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? 
&tmp : NULL), + __ENQUEUE_ACQUIRE_GL_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueReleaseGLObjects( + const vector* mem_objects = NULL, + const vector* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueReleaseGLObjects( + object_, + (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, + (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_RELEASE_GL_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if defined (CL_HPP_USE_DX_INTEROP) +typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueAcquireD3D10ObjectsKHR)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem* mem_objects, cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, cl_event* event); +typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem* mem_objects, cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, cl_event* event); + + cl_int enqueueAcquireD3D10Objects( + const vector* mem_objects = NULL, + const vector* events = NULL, + Event* event = NULL) const + { + static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = NULL; +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 + cl_context context = getInfo(); + cl::Device device(getInfo()); + cl_platform_id platform = device.getInfo(); + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clEnqueueAcquireD3D10ObjectsKHR); +#endif +#if CL_HPP_TARGET_OPENCL_VERSION >= 110 + CL_HPP_INIT_CL_EXT_FCN_PTR_(clEnqueueAcquireD3D10ObjectsKHR); +#endif + + cl_event tmp; + cl_int err = detail::errHandler( + pfn_clEnqueueAcquireD3D10ObjectsKHR( + object_, + (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, + (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_ACQUIRE_GL_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueReleaseD3D10Objects( + const vector* mem_objects = NULL, + const vector* events = NULL, + Event* event = NULL) const + { + static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = NULL; +#if CL_HPP_TARGET_OPENCL_VERSION >= 120 + cl_context context = getInfo(); + cl::Device device(getInfo()); + cl_platform_id platform = device.getInfo(); + CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_(platform, clEnqueueReleaseD3D10ObjectsKHR); +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 120 +#if CL_HPP_TARGET_OPENCL_VERSION >= 110 + CL_HPP_INIT_CL_EXT_FCN_PTR_(clEnqueueReleaseD3D10ObjectsKHR); +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110 + + cl_event tmp; + cl_int err = detail::errHandler( + pfn_clEnqueueReleaseD3D10ObjectsKHR( + object_, + (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, + (mem_objects != NULL && mem_objects->size() > 0) ? (const cl_mem *) &mem_objects->front(): NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? 
&tmp : NULL), + __ENQUEUE_RELEASE_GL_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif + +/** + * Deprecated APIs for 1.2 + */ +#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + cl_int enqueueBarrier() const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + { + return detail::errHandler( + ::clEnqueueBarrier(object_), + __ENQUEUE_BARRIER_ERR); + } +#endif // CL_USE_DEPRECATED_OPENCL_1_1_APIS + + cl_int flush() const + { + return detail::errHandler(::clFlush(object_), __FLUSH_ERR); + } + + cl_int finish() const + { + return detail::errHandler(::clFinish(object_), __FINISH_ERR); + } +}; // CommandQueue + +CL_HPP_DEFINE_STATIC_MEMBER_ std::once_flag CommandQueue::default_initialized_; +CL_HPP_DEFINE_STATIC_MEMBER_ CommandQueue CommandQueue::default_; +CL_HPP_DEFINE_STATIC_MEMBER_ cl_int CommandQueue::default_error_ = CL_SUCCESS; + + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 +enum class DeviceQueueProperties : cl_command_queue_properties +{ + None = 0, + Profiling = CL_QUEUE_PROFILING_ENABLE, +}; + +inline DeviceQueueProperties operator|(DeviceQueueProperties lhs, DeviceQueueProperties rhs) +{ + return static_cast(static_cast(lhs) | static_cast(rhs)); +} + +/*! \class DeviceCommandQueue + * \brief DeviceCommandQueue interface for device cl_command_queues. + */ +class DeviceCommandQueue : public detail::Wrapper +{ +public: + + /*! + * Trivial empty constructor to create a null queue. + */ + DeviceCommandQueue() { } + + /*! + * Default construct device command queue on default context and device + */ + DeviceCommandQueue(DeviceQueueProperties properties, cl_int* err = NULL) + { + cl_int error; + cl::Context context = cl::Context::getDefault(); + cl::Device device = cl::Device::getDefault(); + + cl_command_queue_properties mergedProperties = + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | static_cast(properties); + + cl_queue_properties queue_properties[] = { + CL_QUEUE_PROPERTIES, mergedProperties, 0 }; + object_ = ::clCreateCommandQueueWithProperties( + context(), device(), queue_properties, &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! + * Create a device command queue for a specified device in the passed context. + */ + DeviceCommandQueue( + const Context& context, + const Device& device, + DeviceQueueProperties properties = DeviceQueueProperties::None, + cl_int* err = NULL) + { + cl_int error; + + cl_command_queue_properties mergedProperties = + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | static_cast(properties); + cl_queue_properties queue_properties[] = { + CL_QUEUE_PROPERTIES, mergedProperties, 0 }; + object_ = ::clCreateCommandQueueWithProperties( + context(), device(), queue_properties, &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! + * Create a device command queue for a specified device in the passed context. 
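+ *
+ * Sketch of use (illustrative; `ctx`, `dev`, and the 64 KiB queue size are
+ * assumptions, not part of this header):
+ * \code
+ * cl_int err;
+ * cl::DeviceCommandQueue devQueue(ctx, dev, 65536, // assumed 64 KiB ring
+ *     cl::DeviceQueueProperties::Profiling, &err);
+ * \endcode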
+ */
+ DeviceCommandQueue(
+ const Context& context,
+ const Device& device,
+ cl_uint queueSize,
+ DeviceQueueProperties properties = DeviceQueueProperties::None,
+ cl_int* err = NULL)
+ {
+ cl_int error;
+
+ cl_command_queue_properties mergedProperties =
+ CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | static_cast<cl_command_queue_properties>(properties);
+ cl_queue_properties queue_properties[] = {
+ CL_QUEUE_PROPERTIES, mergedProperties,
+ CL_QUEUE_SIZE, queueSize,
+ 0 };
+ object_ = ::clCreateCommandQueueWithProperties(
+ context(), device(), queue_properties, &error);
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+ }
+
+ /*! \brief Constructor from cl_command_queue - takes ownership.
+ *
+ * \param retainObject will cause the constructor to retain its cl object.
+ * Defaults to false to maintain compatibility with
+ * earlier versions.
+ */
+ explicit DeviceCommandQueue(const cl_command_queue& commandQueue, bool retainObject = false) :
+ detail::Wrapper<cl_type>(commandQueue, retainObject) { }
+
+ DeviceCommandQueue& operator = (const cl_command_queue& rhs)
+ {
+ detail::Wrapper<cl_type>::operator=(rhs);
+ return *this;
+ }
+
+ /*! \brief Copy constructor to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ DeviceCommandQueue(const DeviceCommandQueue& queue) : detail::Wrapper<cl_type>(queue) {}
+
+ /*! \brief Copy assignment to forward copy to the superclass correctly.
+ * Required for MSVC.
+ */
+ DeviceCommandQueue& operator = (const DeviceCommandQueue &queue)
+ {
+ detail::Wrapper<cl_type>::operator=(queue);
+ return *this;
+ }
+
+ /*! \brief Move constructor to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ DeviceCommandQueue(DeviceCommandQueue&& queue) CL_HPP_NOEXCEPT_ : detail::Wrapper<cl_type>(std::move(queue)) {}
+
+ /*! \brief Move assignment to forward move to the superclass correctly.
+ * Required for MSVC.
+ */
+ DeviceCommandQueue& operator = (DeviceCommandQueue &&queue)
+ {
+ detail::Wrapper<cl_type>::operator=(std::move(queue));
+ return *this;
+ }
+
+ template <typename T>
+ cl_int getInfo(cl_command_queue_info name, T* param) const
+ {
+ return detail::errHandler(
+ detail::getInfo(
+ &::clGetCommandQueueInfo, object_, name, param),
+ __GET_COMMAND_QUEUE_INFO_ERR);
+ }
+
+ template <cl_int name> typename
+ detail::param_traits<detail::cl_command_queue_info, name>::param_type
+ getInfo(cl_int* err = NULL) const
+ {
+ typename detail::param_traits<
+ detail::cl_command_queue_info, name>::param_type param;
+ cl_int result = getInfo(name, &param);
+ if (err != NULL) {
+ *err = result;
+ }
+ return param;
+ }
+
+ /*!
+ * Create a new default device command queue for the default device,
+ * in the default context and of the default size.
+ * If there is already a default queue for the specified device this
+ * function will return the pre-existing queue.
+ */
+ static DeviceCommandQueue makeDefault(
+ cl_int *err = nullptr)
+ {
+ cl_int error;
+ cl::Context context = cl::Context::getDefault();
+ cl::Device device = cl::Device::getDefault();
+
+ cl_command_queue_properties properties =
+ CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT;
+ cl_queue_properties queue_properties[] = {
+ CL_QUEUE_PROPERTIES, properties,
+ 0 };
+ DeviceCommandQueue deviceQueue(
+ ::clCreateCommandQueueWithProperties(
+ context(), device(), queue_properties, &error));
+
+ detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR);
+ if (err != NULL) {
+ *err = error;
+ }
+
+ return deviceQueue;
+ }
+
+ /*!
+ * Create a new default device command queue for the specified device + * and of the default size. + * If there is already a default queue for the specified device this + * function will return the pre-existing queue. + */ + static DeviceCommandQueue makeDefault( + const Context &context, const Device &device, cl_int *err = nullptr) + { + cl_int error; + + cl_command_queue_properties properties = + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT; + cl_queue_properties queue_properties[] = { + CL_QUEUE_PROPERTIES, properties, + 0 }; + DeviceCommandQueue deviceQueue( + ::clCreateCommandQueueWithProperties( + context(), device(), queue_properties, &error)); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); + if (err != NULL) { + *err = error; + } + + return deviceQueue; + } + + /*! + * Create a new default device command queue for the specified device + * and of the requested size in bytes. + * If there is already a default queue for the specified device this + * function will return the pre-existing queue. + */ + static DeviceCommandQueue makeDefault( + const Context &context, const Device &device, cl_uint queueSize, cl_int *err = nullptr) + { + cl_int error; + + cl_command_queue_properties properties = + CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_ON_DEVICE | CL_QUEUE_ON_DEVICE_DEFAULT; + cl_queue_properties queue_properties[] = { + CL_QUEUE_PROPERTIES, properties, + CL_QUEUE_SIZE, queueSize, + 0 }; + DeviceCommandQueue deviceQueue( + ::clCreateCommandQueueWithProperties( + context(), device(), queue_properties, &error)); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_WITH_PROPERTIES_ERR); + if (err != NULL) { + *err = error; + } + + return deviceQueue; + } +}; // DeviceCommandQueue + +namespace detail +{ + // Specialization for device command queue + template <> + struct KernelArgumentHandler + { + static size_type size(const cl::DeviceCommandQueue&) { return sizeof(cl_command_queue); } + static const cl_command_queue* ptr(const cl::DeviceCommandQueue& value) { return &(value()); } + }; +} // namespace detail + +#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 + + +template< typename IteratorType > +Buffer::Buffer( + const Context &context, + IteratorType startIterator, + IteratorType endIterator, + bool readOnly, + bool useHostPtr, + cl_int* err) +{ + typedef typename std::iterator_traits::value_type DataType; + cl_int error; + + cl_mem_flags flags = 0; + if( readOnly ) { + flags |= CL_MEM_READ_ONLY; + } + else { + flags |= CL_MEM_READ_WRITE; + } + if( useHostPtr ) { + flags |= CL_MEM_USE_HOST_PTR; + } + + size_type size = sizeof(DataType)*(endIterator - startIterator); + + if( useHostPtr ) { + object_ = ::clCreateBuffer(context(), flags, size, static_cast(&*startIterator), &error); + } else { + object_ = ::clCreateBuffer(context(), flags, size, 0, &error); + } + + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + + if( !useHostPtr ) { + CommandQueue queue(context, 0, &error); + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + + error = cl::copy(queue, startIterator, endIterator, *this); + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } +} + +template< typename IteratorType > +Buffer::Buffer( + const CommandQueue &queue, + IteratorType startIterator, + IteratorType endIterator, + bool readOnly, + bool useHostPtr, + cl_int* err) +{ + typedef typename std::iterator_traits::value_type 
DataType; + cl_int error; + + cl_mem_flags flags = 0; + if (readOnly) { + flags |= CL_MEM_READ_ONLY; + } + else { + flags |= CL_MEM_READ_WRITE; + } + if (useHostPtr) { + flags |= CL_MEM_USE_HOST_PTR; + } + + size_type size = sizeof(DataType)*(endIterator - startIterator); + + Context context = queue.getInfo(); + + if (useHostPtr) { + object_ = ::clCreateBuffer(context(), flags, size, static_cast(&*startIterator), &error); + } + else { + object_ = ::clCreateBuffer(context(), flags, size, 0, &error); + } + + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + + if (!useHostPtr) { + error = cl::copy(queue, startIterator, endIterator, *this); + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } +} + +inline cl_int enqueueReadBuffer( + const Buffer& buffer, + cl_bool blocking, + size_type offset, + size_type size, + void* ptr, + const vector* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueReadBuffer(buffer, blocking, offset, size, ptr, events, event); +} + +inline cl_int enqueueWriteBuffer( + const Buffer& buffer, + cl_bool blocking, + size_type offset, + size_type size, + const void* ptr, + const vector* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueWriteBuffer(buffer, blocking, offset, size, ptr, events, event); +} + +inline void* enqueueMapBuffer( + const Buffer& buffer, + cl_bool blocking, + cl_map_flags flags, + size_type offset, + size_type size, + const vector* events = NULL, + Event* event = NULL, + cl_int* err = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + + void * result = ::clEnqueueMapBuffer( + queue(), buffer(), blocking, flags, offset, size, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (cl_event*) event, + &error); + + detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + return result; +} + + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 +/** + * Enqueues to the default queue a command that will allow the host to + * update a region of a coarse-grained SVM buffer. + * This variant takes a raw SVM pointer. + */ +template +inline cl_int enqueueMapSVM( + T* ptr, + cl_bool blocking, + cl_map_flags flags, + size_type size, + const vector* events, + Event* event) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + if (error != CL_SUCCESS) { + return detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); + } + + return queue.enqueueMapSVM( + ptr, blocking, flags, size, events, event); +} + +/** + * Enqueues to the default queue a command that will allow the host to + * update a region of a coarse-grained SVM buffer. + * This variant takes a cl::pointer instance. 
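+ *
+ * Sketch (assumes `p` is a cl::pointer obtained from a coarse-grained SVM
+ * allocation of `n` floats, and that the default command queue is valid):
+ * \code
+ * cl_int err = cl::enqueueMapSVM(p, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
+ *                                n * sizeof(float)); // n is an assumption
+ * \endcode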
+ */ +template +inline cl_int enqueueMapSVM( + cl::pointer ptr, + cl_bool blocking, + cl_map_flags flags, + size_type size, + const vector* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + if (error != CL_SUCCESS) { + return detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); + } + + return queue.enqueueMapSVM( + ptr, blocking, flags, size, events, event); +} + +/** + * Enqueues to the default queue a command that will allow the host to + * update a region of a coarse-grained SVM buffer. + * This variant takes a cl::vector instance. + */ +template +inline cl_int enqueueMapSVM( + cl::vector container, + cl_bool blocking, + cl_map_flags flags, + const vector* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + if (error != CL_SUCCESS) { + return detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); + } + + return queue.enqueueMapSVM( + container, blocking, flags, events, event); +} + +#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 + +inline cl_int enqueueUnmapMemObject( + const Memory& memory, + void* mapped_ptr, + const vector* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); + if (error != CL_SUCCESS) { + return error; + } + + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueUnmapMemObject( + queue(), memory(), mapped_ptr, + (events != NULL) ? (cl_uint)events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*)&events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_UNMAP_MEM_OBJECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; +} + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 +/** + * Enqueues to the default queue a command that will release a coarse-grained + * SVM buffer back to the OpenCL runtime. + * This variant takes a raw SVM pointer. + */ +template +inline cl_int enqueueUnmapSVM( + T* ptr, + const vector* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + if (error != CL_SUCCESS) { + return detail::errHandler(error, __ENQUEUE_UNMAP_MEM_OBJECT_ERR); + } + + return detail::errHandler(queue.enqueueUnmapSVM(ptr, events, event), + __ENQUEUE_UNMAP_MEM_OBJECT_ERR); + +} + +/** + * Enqueues to the default queue a command that will release a coarse-grained + * SVM buffer back to the OpenCL runtime. + * This variant takes a cl::pointer instance. + */ +template +inline cl_int enqueueUnmapSVM( + cl::pointer &ptr, + const vector* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + if (error != CL_SUCCESS) { + return detail::errHandler(error, __ENQUEUE_UNMAP_MEM_OBJECT_ERR); + } + + return detail::errHandler(queue.enqueueUnmapSVM(ptr, events, event), + __ENQUEUE_UNMAP_MEM_OBJECT_ERR); +} + +/** + * Enqueues to the default queue a command that will release a coarse-grained + * SVM buffer back to the OpenCL runtime. + * This variant takes a cl::vector instance. 
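+ *
+ * Sketch, pairing the map and unmap free functions (`svmVec` is an assumed
+ * SVM-backed cl::vector):
+ * \code
+ * cl::enqueueMapSVM(svmVec, CL_TRUE, CL_MAP_WRITE);
+ * // ... mutate svmVec on the host ...
+ * cl::enqueueUnmapSVM(svmVec); // release the region back to the runtime
+ * \endcode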
+ */ +template +inline cl_int enqueueUnmapSVM( + cl::vector &container, + const vector* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + if (error != CL_SUCCESS) { + return detail::errHandler(error, __ENQUEUE_UNMAP_MEM_OBJECT_ERR); + } + + return detail::errHandler(queue.enqueueUnmapSVM(container, events, event), + __ENQUEUE_UNMAP_MEM_OBJECT_ERR); +} + +#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 + +inline cl_int enqueueCopyBuffer( + const Buffer& src, + const Buffer& dst, + size_type src_offset, + size_type dst_offset, + size_type size, + const vector* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueCopyBuffer(src, dst, src_offset, dst_offset, size, events, event); +} + +/** + * Blocking copy operation between iterators and a buffer. + * Host to Device. + * Uses default command queue. + */ +template< typename IteratorType > +inline cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer ) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + if (error != CL_SUCCESS) + return error; + + return cl::copy(queue, startIterator, endIterator, buffer); +} + +/** + * Blocking copy operation between iterators and a buffer. + * Device to Host. + * Uses default command queue. + */ +template< typename IteratorType > +inline cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator ) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + if (error != CL_SUCCESS) + return error; + + return cl::copy(queue, buffer, startIterator, endIterator); +} + +/** + * Blocking copy operation between iterators and a buffer. + * Host to Device. + * Uses specified queue. + */ +template< typename IteratorType > +inline cl_int copy( const CommandQueue &queue, IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer ) +{ + typedef typename std::iterator_traits::value_type DataType; + cl_int error; + + size_type length = endIterator-startIterator; + size_type byteLength = length*sizeof(DataType); + + DataType *pointer = + static_cast(queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_WRITE, 0, byteLength, 0, 0, &error)); + // if exceptions enabled, enqueueMapBuffer will throw + if( error != CL_SUCCESS ) { + return error; + } +#if defined(_MSC_VER) + std::copy( + startIterator, + endIterator, + stdext::checked_array_iterator( + pointer, length)); +#else + std::copy(startIterator, endIterator, pointer); +#endif + Event endEvent; + error = queue.enqueueUnmapMemObject(buffer, pointer, 0, &endEvent); + // if exceptions enabled, enqueueUnmapMemObject will throw + if( error != CL_SUCCESS ) { + return error; + } + endEvent.wait(); + return CL_SUCCESS; +} + +/** + * Blocking copy operation between iterators and a buffer. + * Device to Host. + * Uses specified queue. 
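+ *
+ * Sketch (`queue`, `buf`, and the element count `n` are assumptions):
+ * \code
+ * std::vector<float> host(n);
+ * cl_int err = cl::copy(queue, buf, host.begin(), host.end()); // blocks
+ * \endcode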
+ */ +template< typename IteratorType > +inline cl_int copy( const CommandQueue &queue, const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator ) +{ + typedef typename std::iterator_traits::value_type DataType; + cl_int error; + + size_type length = endIterator-startIterator; + size_type byteLength = length*sizeof(DataType); + + DataType *pointer = + static_cast(queue.enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_READ, 0, byteLength, 0, 0, &error)); + // if exceptions enabled, enqueueMapBuffer will throw + if( error != CL_SUCCESS ) { + return error; + } + std::copy(pointer, pointer + length, startIterator); + Event endEvent; + error = queue.enqueueUnmapMemObject(buffer, pointer, 0, &endEvent); + // if exceptions enabled, enqueueUnmapMemObject will throw + if( error != CL_SUCCESS ) { + return error; + } + endEvent.wait(); + return CL_SUCCESS; +} + + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 +/** + * Blocking SVM map operation - performs a blocking map underneath. + */ +template +inline cl_int mapSVM(cl::vector &container) +{ + return enqueueMapSVM(container, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE); +} + +/** +* Blocking SVM map operation - performs a blocking map underneath. +*/ +template +inline cl_int unmapSVM(cl::vector &container) +{ + return enqueueUnmapSVM(container); +} + +#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 + +#if CL_HPP_TARGET_OPENCL_VERSION >= 110 +inline cl_int enqueueReadBufferRect( + const Buffer& buffer, + cl_bool blocking, + const array& buffer_offset, + const array& host_offset, + const array& region, + size_type buffer_row_pitch, + size_type buffer_slice_pitch, + size_type host_row_pitch, + size_type host_slice_pitch, + void *ptr, + const vector* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueReadBufferRect( + buffer, + blocking, + buffer_offset, + host_offset, + region, + buffer_row_pitch, + buffer_slice_pitch, + host_row_pitch, + host_slice_pitch, + ptr, + events, + event); +} + +inline cl_int enqueueWriteBufferRect( + const Buffer& buffer, + cl_bool blocking, + const array& buffer_offset, + const array& host_offset, + const array& region, + size_type buffer_row_pitch, + size_type buffer_slice_pitch, + size_type host_row_pitch, + size_type host_slice_pitch, + const void *ptr, + const vector* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueWriteBufferRect( + buffer, + blocking, + buffer_offset, + host_offset, + region, + buffer_row_pitch, + buffer_slice_pitch, + host_row_pitch, + host_slice_pitch, + ptr, + events, + event); +} + +inline cl_int enqueueCopyBufferRect( + const Buffer& src, + const Buffer& dst, + const array& src_origin, + const array& dst_origin, + const array& region, + size_type src_row_pitch, + size_type src_slice_pitch, + size_type dst_row_pitch, + size_type dst_slice_pitch, + const vector* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueCopyBufferRect( + src, + dst, + src_origin, + dst_origin, + region, + src_row_pitch, + src_slice_pitch, + dst_row_pitch, + dst_slice_pitch, + events, + event); +} +#endif // CL_HPP_TARGET_OPENCL_VERSION >= 110 + +inline cl_int enqueueReadImage( + const Image& image, + cl_bool 
blocking, + const array& origin, + const array& region, + size_type row_pitch, + size_type slice_pitch, + void* ptr, + const vector* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueReadImage( + image, + blocking, + origin, + region, + row_pitch, + slice_pitch, + ptr, + events, + event); +} + +inline cl_int enqueueWriteImage( + const Image& image, + cl_bool blocking, + const array& origin, + const array& region, + size_type row_pitch, + size_type slice_pitch, + const void* ptr, + const vector* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueWriteImage( + image, + blocking, + origin, + region, + row_pitch, + slice_pitch, + ptr, + events, + event); +} + +inline cl_int enqueueCopyImage( + const Image& src, + const Image& dst, + const array& src_origin, + const array& dst_origin, + const array& region, + const vector* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueCopyImage( + src, + dst, + src_origin, + dst_origin, + region, + events, + event); +} + +inline cl_int enqueueCopyImageToBuffer( + const Image& src, + const Buffer& dst, + const array& src_origin, + const array& region, + size_type dst_offset, + const vector* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueCopyImageToBuffer( + src, + dst, + src_origin, + region, + dst_offset, + events, + event); +} + +inline cl_int enqueueCopyBufferToImage( + const Buffer& src, + const Image& dst, + size_type src_offset, + const array& dst_origin, + const array& region, + const vector* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueCopyBufferToImage( + src, + dst, + src_offset, + dst_origin, + region, + events, + event); +} + + +inline cl_int flush(void) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.flush(); +} + +inline cl_int finish(void) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + + return queue.finish(); +} + +class EnqueueArgs +{ +private: + CommandQueue queue_; + const NDRange offset_; + const NDRange global_; + const NDRange local_; + vector events_; + + template + friend class KernelFunctor; + +public: + EnqueueArgs(NDRange global) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(NullRange) + { + + } + + EnqueueArgs(NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(local) + { + + } + + EnqueueArgs(NDRange offset, NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(offset), + global_(global), + local_(local) + { + + } + + EnqueueArgs(Event e, NDRange global) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(NullRange) + { + events_.push_back(e); + } + + EnqueueArgs(Event e, NDRange global, 
NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(local) + { + events_.push_back(e); + } + + EnqueueArgs(Event e, NDRange offset, NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(offset), + global_(global), + local_(local) + { + events_.push_back(e); + } + + EnqueueArgs(const vector &events, NDRange global) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(NullRange), + events_(events) + { + + } + + EnqueueArgs(const vector &events, NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(local), + events_(events) + { + + } + + EnqueueArgs(const vector &events, NDRange offset, NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(offset), + global_(global), + local_(local), + events_(events) + { + + } + + EnqueueArgs(CommandQueue &queue, NDRange global) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(NullRange) + { + + } + + EnqueueArgs(CommandQueue &queue, NDRange global, NDRange local) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(local) + { + + } + + EnqueueArgs(CommandQueue &queue, NDRange offset, NDRange global, NDRange local) : + queue_(queue), + offset_(offset), + global_(global), + local_(local) + { + + } + + EnqueueArgs(CommandQueue &queue, Event e, NDRange global) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(NullRange) + { + events_.push_back(e); + } + + EnqueueArgs(CommandQueue &queue, Event e, NDRange global, NDRange local) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(local) + { + events_.push_back(e); + } + + EnqueueArgs(CommandQueue &queue, Event e, NDRange offset, NDRange global, NDRange local) : + queue_(queue), + offset_(offset), + global_(global), + local_(local) + { + events_.push_back(e); + } + + EnqueueArgs(CommandQueue &queue, const vector &events, NDRange global) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(NullRange), + events_(events) + { + + } + + EnqueueArgs(CommandQueue &queue, const vector &events, NDRange global, NDRange local) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(local), + events_(events) + { + + } + + EnqueueArgs(CommandQueue &queue, const vector &events, NDRange offset, NDRange global, NDRange local) : + queue_(queue), + offset_(offset), + global_(global), + local_(local), + events_(events) + { + + } +}; + + +//---------------------------------------------------------------------------------------------- + + +/** + * Type safe kernel functor. + * + */ +template +class KernelFunctor +{ +private: + Kernel kernel_; + + template + void setArgs(T0&& t0, T1s&&... t1s) + { + kernel_.setArg(index, t0); + setArgs(std::forward(t1s)...); + } + + template + void setArgs(T0&& t0) + { + kernel_.setArg(index, t0); + } + + template + void setArgs() + { + } + + +public: + KernelFunctor(Kernel kernel) : kernel_(kernel) + {} + + KernelFunctor( + const Program& program, + const string name, + cl_int * err = NULL) : + kernel_(program, name.c_str(), err) + {} + + //! \brief Return type of the functor + typedef Event result_type; + + /** + * Enqueue kernel. + * @param args Launch parameters of the kernel. + * @param t0... List of kernel arguments based on the template type of the functor. + */ + Event operator() ( + const EnqueueArgs& args, + Ts... 
ts) + { + Event event; + setArgs<0>(std::forward(ts)...); + + args.queue_.enqueueNDRangeKernel( + kernel_, + args.offset_, + args.global_, + args.local_, + &args.events_, + &event); + + return event; + } + + /** + * Enqueue kernel with support for error code. + * @param args Launch parameters of the kernel. + * @param t0... List of kernel arguments based on the template type of the functor. + * @param error Out parameter returning the error code from the execution. + */ + Event operator() ( + const EnqueueArgs& args, + Ts... ts, + cl_int &error) + { + Event event; + setArgs<0>(std::forward(ts)...); + + error = args.queue_.enqueueNDRangeKernel( + kernel_, + args.offset_, + args.global_, + args.local_, + &args.events_, + &event); + + return event; + } + +#if CL_HPP_TARGET_OPENCL_VERSION >= 200 + cl_int setSVMPointers(const vector &pointerList) + { + return kernel_.setSVMPointers(pointerList); + } + + template + cl_int setSVMPointers(const T0 &t0, T1s &... ts) + { + return kernel_.setSVMPointers(t0, ts...); + } +#endif // #if CL_HPP_TARGET_OPENCL_VERSION >= 200 + + Kernel getKernel() + { + return kernel_; + } +}; + +namespace compatibility { + /** + * Backward compatibility class to ensure that cl.hpp code works with cl2.hpp. + * Please use KernelFunctor directly. + */ + template + struct make_kernel + { + typedef KernelFunctor FunctorType; + + FunctorType functor_; + + make_kernel( + const Program& program, + const string name, + cl_int * err = NULL) : + functor_(FunctorType(program, name, err)) + {} + + make_kernel( + const Kernel kernel) : + functor_(FunctorType(kernel)) + {} + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + Ts...); + + Event operator()( + const EnqueueArgs& enqueueArgs, + Ts... 
args) + { + return functor_( + enqueueArgs, args...); + } + }; +} // namespace compatibility + + +//---------------------------------------------------------------------------------------------------------------------- + +#undef CL_HPP_ERR_STR_ +#if !defined(CL_HPP_USER_OVERRIDE_ERROR_STRINGS) +#undef __GET_DEVICE_INFO_ERR +#undef __GET_PLATFORM_INFO_ERR +#undef __GET_DEVICE_IDS_ERR +#undef __GET_CONTEXT_INFO_ERR +#undef __GET_EVENT_INFO_ERR +#undef __GET_EVENT_PROFILE_INFO_ERR +#undef __GET_MEM_OBJECT_INFO_ERR +#undef __GET_IMAGE_INFO_ERR +#undef __GET_SAMPLER_INFO_ERR +#undef __GET_KERNEL_INFO_ERR +#undef __GET_KERNEL_ARG_INFO_ERR +#undef __GET_KERNEL_WORK_GROUP_INFO_ERR +#undef __GET_PROGRAM_INFO_ERR +#undef __GET_PROGRAM_BUILD_INFO_ERR +#undef __GET_COMMAND_QUEUE_INFO_ERR + +#undef __CREATE_CONTEXT_ERR +#undef __CREATE_CONTEXT_FROM_TYPE_ERR +#undef __GET_SUPPORTED_IMAGE_FORMATS_ERR + +#undef __CREATE_BUFFER_ERR +#undef __CREATE_SUBBUFFER_ERR +#undef __CREATE_IMAGE2D_ERR +#undef __CREATE_IMAGE3D_ERR +#undef __CREATE_SAMPLER_ERR +#undef __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR + +#undef __CREATE_USER_EVENT_ERR +#undef __SET_USER_EVENT_STATUS_ERR +#undef __SET_EVENT_CALLBACK_ERR +#undef __SET_PRINTF_CALLBACK_ERR + +#undef __WAIT_FOR_EVENTS_ERR + +#undef __CREATE_KERNEL_ERR +#undef __SET_KERNEL_ARGS_ERR +#undef __CREATE_PROGRAM_WITH_SOURCE_ERR +#undef __CREATE_PROGRAM_WITH_BINARY_ERR +#undef __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR +#undef __BUILD_PROGRAM_ERR +#undef __CREATE_KERNELS_IN_PROGRAM_ERR + +#undef __CREATE_COMMAND_QUEUE_ERR +#undef __SET_COMMAND_QUEUE_PROPERTY_ERR +#undef __ENQUEUE_READ_BUFFER_ERR +#undef __ENQUEUE_WRITE_BUFFER_ERR +#undef __ENQUEUE_READ_BUFFER_RECT_ERR +#undef __ENQUEUE_WRITE_BUFFER_RECT_ERR +#undef __ENQEUE_COPY_BUFFER_ERR +#undef __ENQEUE_COPY_BUFFER_RECT_ERR +#undef __ENQUEUE_READ_IMAGE_ERR +#undef __ENQUEUE_WRITE_IMAGE_ERR +#undef __ENQUEUE_COPY_IMAGE_ERR +#undef __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR +#undef __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR +#undef __ENQUEUE_MAP_BUFFER_ERR +#undef __ENQUEUE_MAP_IMAGE_ERR +#undef __ENQUEUE_UNMAP_MEM_OBJECT_ERR +#undef __ENQUEUE_NDRANGE_KERNEL_ERR +#undef __ENQUEUE_TASK_ERR +#undef __ENQUEUE_NATIVE_KERNEL + +#undef __UNLOAD_COMPILER_ERR +#undef __CREATE_SUB_DEVICES_ERR + +#undef __CREATE_PIPE_ERR +#undef __GET_PIPE_INFO_ERR + +#endif //CL_HPP_USER_OVERRIDE_ERROR_STRINGS + +// Extensions +#undef CL_HPP_INIT_CL_EXT_FCN_PTR_ +#undef CL_HPP_INIT_CL_EXT_FCN_PTR_PLATFORM_ + +#if defined(CL_HPP_USE_CL_DEVICE_FISSION) +#undef CL_HPP_PARAM_NAME_DEVICE_FISSION_ +#endif // CL_HPP_USE_CL_DEVICE_FISSION + +#undef CL_HPP_NOEXCEPT_ +#undef CL_HPP_DEFINE_STATIC_MEMBER_ + +} // namespace cl + +#endif // CL_HPP_ diff --git a/include/triton/external/CL/cl_d3d10.h b/include/triton/external/CL/cl_d3d10.h new file mode 100644 index 000000000..d5960a43f --- /dev/null +++ b/include/triton/external/CL/cl_d3d10.h @@ -0,0 +1,131 @@ +/********************************************************************************** + * Copyright (c) 2008-2015 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +#ifndef __OPENCL_CL_D3D10_H +#define __OPENCL_CL_D3D10_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/****************************************************************************** + * cl_khr_d3d10_sharing */ +#define cl_khr_d3d10_sharing 1 + +typedef cl_uint cl_d3d10_device_source_khr; +typedef cl_uint cl_d3d10_device_set_khr; + +/******************************************************************************/ + +/* Error Codes */ +#define CL_INVALID_D3D10_DEVICE_KHR -1002 +#define CL_INVALID_D3D10_RESOURCE_KHR -1003 +#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004 +#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005 + +/* cl_d3d10_device_source_nv */ +#define CL_D3D10_DEVICE_KHR 0x4010 +#define CL_D3D10_DXGI_ADAPTER_KHR 0x4011 + +/* cl_d3d10_device_set_nv */ +#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012 +#define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013 + +/* cl_context_info */ +#define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014 +#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C + +/* cl_mem_info */ +#define CL_MEM_D3D10_RESOURCE_KHR 0x4015 + +/* cl_image_info */ +#define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016 + +/* cl_command_type */ +#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017 +#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018 + +/******************************************************************************/ + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)( + cl_platform_id platform, + cl_d3d10_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d10_device_set_khr d3d_device_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D10Buffer * resource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)( + cl_context context, + cl_mem_flags flags, + 
ID3D10Texture2D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D10Texture3D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_D3D10_H */ + diff --git a/include/triton/external/CL/cl_d3d11.h b/include/triton/external/CL/cl_d3d11.h new file mode 100644 index 000000000..39f907239 --- /dev/null +++ b/include/triton/external/CL/cl_d3d11.h @@ -0,0 +1,131 @@ +/********************************************************************************** + * Copyright (c) 2008-2015 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ **********************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +#ifndef __OPENCL_CL_D3D11_H +#define __OPENCL_CL_D3D11_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/****************************************************************************** + * cl_khr_d3d11_sharing */ +#define cl_khr_d3d11_sharing 1 + +typedef cl_uint cl_d3d11_device_source_khr; +typedef cl_uint cl_d3d11_device_set_khr; + +/******************************************************************************/ + +/* Error Codes */ +#define CL_INVALID_D3D11_DEVICE_KHR -1006 +#define CL_INVALID_D3D11_RESOURCE_KHR -1007 +#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR -1008 +#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR -1009 + +/* cl_d3d11_device_source */ +#define CL_D3D11_DEVICE_KHR 0x4019 +#define CL_D3D11_DXGI_ADAPTER_KHR 0x401A + +/* cl_d3d11_device_set */ +#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR 0x401B +#define CL_ALL_DEVICES_FOR_D3D11_KHR 0x401C + +/* cl_context_info */ +#define CL_CONTEXT_D3D11_DEVICE_KHR 0x401D +#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D + +/* cl_mem_info */ +#define CL_MEM_D3D11_RESOURCE_KHR 0x401E + +/* cl_image_info */ +#define CL_IMAGE_D3D11_SUBRESOURCE_KHR 0x401F + +/* cl_command_type */ +#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR 0x4020 +#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR 0x4021 + +/******************************************************************************/ + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)( + cl_platform_id platform, + cl_d3d11_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d11_device_set_khr d3d_device_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D11Buffer * resource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D11Texture2D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D11Texture3D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_D3D11_H */ + diff --git a/include/triton/external/CL/cl_dx9_media_sharing.h b/include/triton/external/CL/cl_dx9_media_sharing.h new file mode 100644 index 000000000..2729e8b9e --- /dev/null +++ b/include/triton/external/CL/cl_dx9_media_sharing.h @@ -0,0 +1,132 @@ +/********************************************************************************** + * Copyright (c) 2008-2015 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H +#define __OPENCL_CL_DX9_MEDIA_SHARING_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/******************************************************************************/ +/* cl_khr_dx9_media_sharing */ +#define cl_khr_dx9_media_sharing 1 + +typedef cl_uint cl_dx9_media_adapter_type_khr; +typedef cl_uint cl_dx9_media_adapter_set_khr; + +#if defined(_WIN32) +#include +typedef struct _cl_dx9_surface_info_khr +{ + IDirect3DSurface9 *resource; + HANDLE shared_handle; +} cl_dx9_surface_info_khr; +#endif + + +/******************************************************************************/ + +/* Error Codes */ +#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR -1010 +#define CL_INVALID_DX9_MEDIA_SURFACE_KHR -1011 +#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR -1012 +#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR -1013 + +/* cl_media_adapter_type_khr */ +#define CL_ADAPTER_D3D9_KHR 0x2020 +#define CL_ADAPTER_D3D9EX_KHR 0x2021 +#define CL_ADAPTER_DXVA_KHR 0x2022 + +/* cl_media_adapter_set_khr */ +#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2023 +#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2024 + +/* cl_context_info */ +#define CL_CONTEXT_ADAPTER_D3D9_KHR 0x2025 +#define CL_CONTEXT_ADAPTER_D3D9EX_KHR 0x2026 +#define CL_CONTEXT_ADAPTER_DXVA_KHR 0x2027 + +/* cl_mem_info */ +#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR 0x2028 +#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR 0x2029 + +/* cl_image_info */ +#define CL_IMAGE_DX9_MEDIA_PLANE_KHR 0x202A + +/* cl_command_type */ +#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR 0x202B +#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR 0x202C + +/******************************************************************************/ + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)( + cl_platform_id platform, + cl_uint num_media_adapters, + cl_dx9_media_adapter_type_khr * media_adapter_type, + void * media_adapters, + cl_dx9_media_adapter_set_khr media_adapter_set, + 
+    cl_uint num_entries,
+    cl_device_id * devices,
+    cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)(
+    cl_context context,
+    cl_mem_flags flags,
+    cl_dx9_media_adapter_type_khr adapter_type,
+    void * surface_info,
+    cl_uint plane,
+    cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint num_objects,
+    const cl_mem * mem_objects,
+    cl_uint num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event * event) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint num_objects,
+    const cl_mem * mem_objects,
+    cl_uint num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event * event) CL_API_SUFFIX__VERSION_1_2;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_H */
+
diff --git a/include/triton/external/CL/cl_dx9_media_sharing_intel.h b/include/triton/external/CL/cl_dx9_media_sharing_intel.h
new file mode 100644
index 000000000..331bab97c
--- /dev/null
+++ b/include/triton/external/CL/cl_dx9_media_sharing_intel.h
@@ -0,0 +1,182 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2016 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ * https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+/*****************************************************************************\
+
+Copyright (c) 2013-2016 Intel Corporation All Rights Reserved.
+
+THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
+MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+File Name: cl_dx9_media_sharing_intel.h
+
+Abstract:
+
+Notes:
+
+\*****************************************************************************/
+
+#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H
+#define __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H
+
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+#include <CL/cl_dx9_media_sharing.h>
+#include <d3d9.h>
+#include <dxvahd.h>
+#include <wtypes.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/***************************************
+* cl_intel_dx9_media_sharing extension *
+****************************************/
+
+#define cl_intel_dx9_media_sharing 1
+
+typedef cl_uint cl_dx9_device_source_intel;
+typedef cl_uint cl_dx9_device_set_intel;
+
+/* error codes */
+#define CL_INVALID_DX9_DEVICE_INTEL                   -1010
+#define CL_INVALID_DX9_RESOURCE_INTEL                 -1011
+#define CL_DX9_RESOURCE_ALREADY_ACQUIRED_INTEL        -1012
+#define CL_DX9_RESOURCE_NOT_ACQUIRED_INTEL            -1013
+
+/* cl_dx9_device_source_intel */
+#define CL_D3D9_DEVICE_INTEL                          0x4022
+#define CL_D3D9EX_DEVICE_INTEL                        0x4070
+#define CL_DXVA_DEVICE_INTEL                          0x4071
+
+/* cl_dx9_device_set_intel */
+#define CL_PREFERRED_DEVICES_FOR_DX9_INTEL            0x4024
+#define CL_ALL_DEVICES_FOR_DX9_INTEL                  0x4025
+
+/* cl_context_info */
+#define CL_CONTEXT_D3D9_DEVICE_INTEL                  0x4026
+#define CL_CONTEXT_D3D9EX_DEVICE_INTEL                0x4072
+#define CL_CONTEXT_DXVA_DEVICE_INTEL                  0x4073
+
+/* cl_mem_info */
+#define CL_MEM_DX9_RESOURCE_INTEL                     0x4027
+#define CL_MEM_DX9_SHARED_HANDLE_INTEL                0x4074
+
+/* cl_image_info */
+#define CL_IMAGE_DX9_PLANE_INTEL                      0x4075
+
+/* cl_command_type */
+#define CL_COMMAND_ACQUIRE_DX9_OBJECTS_INTEL          0x402A
+#define CL_COMMAND_RELEASE_DX9_OBJECTS_INTEL          0x402B
+/******************************************************************************/
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceIDsFromDX9INTEL(
+    cl_platform_id /* platform */,
+    cl_dx9_device_source_intel /* dx9_device_source */,
+    void* /* dx9_object */,
+    cl_dx9_device_set_intel /* dx9_device_set */,
+    cl_uint /* num_entries */,
+    cl_device_id* /* devices */,
+    cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_1;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL* clGetDeviceIDsFromDX9INTEL_fn)(
+    cl_platform_id /* platform */,
+    cl_dx9_device_source_intel /* dx9_device_source */,
+    void* /* dx9_object */,
+    cl_dx9_device_set_intel /* dx9_device_set */,
+    cl_uint /* num_entries */,
+    cl_device_id* /* devices */,
+    cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromDX9MediaSurfaceINTEL(
+    cl_context /* context */,
+    cl_mem_flags /* flags */,
+    IDirect3DSurface9* /* resource */,
+    HANDLE /* sharedHandle */,
+    UINT /* plane */,
+    cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceINTEL_fn)(
+    cl_context /* context */,
+    cl_mem_flags /* flags */,
+    IDirect3DSurface9* /* resource */,
+    HANDLE /* sharedHandle */,
+    UINT /* plane */,
+    cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireDX9ObjectsINTEL(
+    cl_command_queue /* command_queue */,
+    cl_uint /* num_objects */,
+    const cl_mem* /* mem_objects */,
+    cl_uint /* num_events_in_wait_list */,
+    const cl_event* /* event_wait_list */,
+    cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9ObjectsINTEL_fn)(
+    cl_command_queue /* command_queue */,
+    cl_uint /* num_objects */,
+    const cl_mem* /* mem_objects */,
+    cl_uint /* num_events_in_wait_list */,
+    const cl_event* /* event_wait_list */,
+    cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseDX9ObjectsINTEL(
+    cl_command_queue /* command_queue */,
+    cl_uint /* num_objects */,
+    cl_mem* /* mem_objects */,
+    cl_uint /* num_events_in_wait_list */,
+    const cl_event* /* event_wait_list */,
+    cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9ObjectsINTEL_fn)(
+    cl_command_queue /* command_queue */,
+    cl_uint /* num_objects */,
+    cl_mem* /* mem_objects */,
+    cl_uint /* num_events_in_wait_list */,
+    const cl_event* /* event_wait_list */,
+    cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H */
+
diff --git a/include/triton/external/CL/cl_egl.h b/include/triton/external/CL/cl_egl.h
new file mode 100644
index 000000000..a765bd526
--- /dev/null
+++ b/include/triton/external/CL/cl_egl.h
@@ -0,0 +1,136 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ * https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+#ifndef __OPENCL_CL_EGL_H
+#define __OPENCL_CL_EGL_H
+
+#ifdef __APPLE__
+
+#else
+#include <CL/cl.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */
+#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR  0x202F
+#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR    0x202D
+#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR    0x202E
+
+/* Error type for clCreateFromEGLImageKHR */
+#define CL_INVALID_EGL_OBJECT_KHR             -1093
+#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR      -1092
+
+/* CLeglImageKHR is an opaque handle to an EGLImage */
+typedef void* CLeglImageKHR;
+
+/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */
+typedef void* CLeglDisplayKHR;
+
+/* CLeglSyncKHR is an opaque handle to an EGLSync object */
+typedef void* CLeglSyncKHR;
+
+/* properties passed to clCreateFromEGLImageKHR */
+typedef intptr_t cl_egl_image_properties_khr;
+
+
+#define cl_khr_egl_image 1
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromEGLImageKHR(cl_context /* context */,
+                        CLeglDisplayKHR /* egldisplay */,
+                        CLeglImageKHR /* eglimage */,
+                        cl_mem_flags /* flags */,
+                        const cl_egl_image_properties_khr * /* properties */,
+                        cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)(
+    cl_context context,
+    CLeglDisplayKHR egldisplay,
+    CLeglImageKHR eglimage,
+    cl_mem_flags flags,
+    const cl_egl_image_properties_khr * properties,
+    cl_int * errcode_ret);
+
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireEGLObjectsKHR(cl_command_queue /* command_queue */,
+                              cl_uint /* num_objects */,
+                              const cl_mem * /* mem_objects */,
+                              cl_uint /* num_events_in_wait_list */,
+                              const cl_event * /* event_wait_list */,
+                              cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint num_objects,
+    const cl_mem * mem_objects,
+    cl_uint num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event * event);
+
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseEGLObjectsKHR(cl_command_queue /* command_queue */,
+                              cl_uint /* num_objects */,
+                              const cl_mem * /* mem_objects */,
+                              cl_uint /* num_events_in_wait_list */,
+                              const cl_event * /* event_wait_list */,
+                              cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint num_objects,
+    const cl_mem * mem_objects,
+    cl_uint num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event * event);
+
+
+#define cl_khr_egl_event 1
+
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateEventFromEGLSyncKHR(cl_context /* context */,
+                            CLeglSyncKHR /* sync */,
+                            CLeglDisplayKHR /* display */,
+                            cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(
+    cl_context context,
+    CLeglSyncKHR sync,
+    CLeglDisplayKHR display,
+    cl_int * errcode_ret);
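+
+/* [Editor's note] Illustrative sketch, not part of the upstream Khronos
+ * header: wrapping an existing EGLImage in a cl_mem. The image must still be
+ * acquired with clEnqueueAcquireEGLObjectsKHR before kernels may access it.
+ * The helper name is hypothetical.
+ */
+static cl_mem
+example_wrap_egl_image(cl_context context, CLeglDisplayKHR display,
+                       CLeglImageKHR image, cl_int *errcode_ret)
+{
+    /* No extra properties are needed here; NULL is a valid properties list. */
+    return clCreateFromEGLImageKHR(context, display, image,
+                                   CL_MEM_READ_ONLY, NULL, errcode_ret);
+}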
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_EGL_H */
diff --git a/include/triton/external/CL/cl_ext.h b/include/triton/external/CL/cl_ext.h
new file mode 100644
index 000000000..b57190d16
--- /dev/null
+++ b/include/triton/external/CL/cl_ext.h
@@ -0,0 +1,670 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ * https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/* $Revision: 11928 $ on $Date: 2010-07-13 09:04:56 -0700 (Tue, 13 Jul 2010) $ */
+
+/* cl_ext.h contains OpenCL extensions which don't have external */
+/* (OpenGL, D3D) dependencies. */
+
+#ifndef __CL_EXT_H
+#define __CL_EXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+    #include <OpenCL/cl.h>
+    #include <AvailabilityMacros.h>
+#else
+    #include <CL/cl.h>
+#endif
+
+/* cl_khr_fp64 extension - no extension #define since it has no functions */
+#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
+
+/* cl_khr_fp16 extension - no extension #define since it has no functions */
+#define CL_DEVICE_HALF_FP_CONFIG 0x1033
+
+/* Memory object destruction
+ *
+ * Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR
+ *
+ * Registers a user callback function that will be called when the memory object is deleted and its resources
+ * freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback
+ * stack associated with memobj. The registered user callback functions are called in the reverse order in
+ * which they were registered. The user callback functions are called and then the memory object is deleted
+ * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be
+ * notified when the memory referenced by host_ptr, specified when the memory object is created and used as
+ * the storage bits for the memory object, can be reused or freed.
+ *
+ * The application may not call CL api's with the cl_mem object passed to the pfn_notify.
+ *
+ * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
+ * before using.
+ */
+#define cl_APPLE_SetMemObjectDestructor 1
+cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem /* memobj */,
+                                        void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
+                                        void * /*user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
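+
+/* [Editor's note] Illustrative sketch, not part of the upstream header:
+ * a destructor callback matching the signature expected above. Assumes the
+ * buffer was created with CL_MEM_USE_HOST_PTR over malloc'd storage and that
+ * <stdlib.h> is available; names are hypothetical.
+ */
+static void example_host_ptr_destructor(cl_mem memobj, void *user_data)
+{
+    (void)memobj;    /* per the note above, no CL API may be called on it */
+    free(user_data); /* user_data is the host_ptr handed to clCreateBuffer */
+}
+/* Registration: clSetMemObjectDestructorAPPLE(buf, example_host_ptr_destructor, host_ptr); */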
+
+
+/* Context Logging Functions
+ *
+ * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
+ * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
+ * before using.
+ *
+ * clLogMessagesToSystemLog forwards on all log messages to the Apple System Logger
+ */
+#define cl_APPLE_ContextLoggingFunctions 1
+extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * /* errstr */,
+                                            const void * /* private_info */,
+                                            size_t /* cb */,
+                                            void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
+
+/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */
+extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * /* errstr */,
+                                          const void * /* private_info */,
+                                          size_t /* cb */,
+                                          void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
+
+/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */
+extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * /* errstr */,
+                                          const void * /* private_info */,
+                                          size_t /* cb */,
+                                          void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
+
+
+/************************
+* cl_khr_icd extension *
+************************/
+#define cl_khr_icd 1
+
+/* cl_platform_info */
+#define CL_PLATFORM_ICD_SUFFIX_KHR                  0x0920
+
+/* Additional Error Codes */
+#define CL_PLATFORM_NOT_FOUND_KHR                   -1001
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clIcdGetPlatformIDsKHR(cl_uint /* num_entries */,
+                       cl_platform_id * /* platforms */,
+                       cl_uint * /* num_platforms */);
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(
+    cl_uint /* num_entries */,
+    cl_platform_id * /* platforms */,
+    cl_uint * /* num_platforms */);
+
+
+/* Extension: cl_khr_image2D_buffer
+ *
+ * This extension allows a 2D image to be created from a cl_mem buffer without a copy.
+ * The type associated with a 2D image created from a buffer in an OpenCL program is image2d_t.
+ * Both the sampler and sampler-less read_image built-in functions are supported for 2D images
+ * and 2D images created from a buffer. Similarly, the write_image built-ins are also supported
+ * for 2D images created from a buffer.
+ *
+ * When the 2D image from buffer is created, the client must specify the width,
+ * height, image format (i.e. channel order and channel data type) and optionally the row pitch
+ *
+ * The pitch specified must be a multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels.
+ * The base address of the buffer must be aligned to CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT pixels.
+ */
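+
+/* [Editor's note] Illustrative sketch, not part of the upstream header:
+ * creating an image2d_t view over an existing buffer via the OpenCL 1.2
+ * clCreateImage entry point (cl_image_desc::buffer ties the two together).
+ * Assumes <string.h> for memset; names are hypothetical.
+ */
+#ifdef CL_VERSION_1_2
+static cl_mem
+example_image2d_from_buffer(cl_context context, cl_mem buffer,
+                            size_t width, size_t height, size_t row_pitch,
+                            cl_int *errcode_ret)
+{
+    cl_image_format format = { CL_RGBA, CL_UNSIGNED_INT8 };
+    cl_image_desc desc;
+    memset(&desc, 0, sizeof(desc));
+    desc.image_type      = CL_MEM_OBJECT_IMAGE2D;
+    desc.image_width     = width;     /* in pixels */
+    desc.image_height    = height;
+    desc.image_row_pitch = row_pitch; /* multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT */
+    desc.buffer          = buffer;    /* reuse the buffer's storage, no copy */
+    return clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc,
+                         NULL, errcode_ret);
+}
+#endif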
+
+/**************************************
+ * cl_khr_initialize_memory extension *
+ **************************************/
+
+#define CL_CONTEXT_MEMORY_INITIALIZE_KHR            0x2030
+
+
+/**************************************
+ * cl_khr_terminate_context extension *
+ **************************************/
+
+#define CL_DEVICE_TERMINATE_CAPABILITY_KHR          0x2031
+#define CL_CONTEXT_TERMINATE_KHR                    0x2032
+
+#define cl_khr_terminate_context 1
+extern CL_API_ENTRY cl_int CL_API_CALL clTerminateContextKHR(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
+
+
+/*
+ * Extension: cl_khr_spir
+ *
+ * This extension adds support to create an OpenCL program object from a
+ * Standard Portable Intermediate Representation (SPIR) instance
+ */
+
+#define CL_DEVICE_SPIR_VERSIONS                     0x40E0
+#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE         0x40E1
+
+
+/*****************************************
+ * cl_khr_create_command_queue extension *
+ *****************************************/
+#define cl_khr_create_command_queue 1
+
+typedef cl_bitfield cl_queue_properties_khr;
+
+extern CL_API_ENTRY cl_command_queue CL_API_CALL
+clCreateCommandQueueWithPropertiesKHR( cl_context /* context */,
+                                       cl_device_id /* device */,
+                                       const cl_queue_properties_khr* /* properties */,
+                                       cl_int* /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
+typedef CL_API_ENTRY cl_command_queue
+(CL_API_CALL *clCreateCommandQueueWithPropertiesKHR_fn)( cl_context /* context */,
+                                                         cl_device_id /* device */,
+                                                         const cl_queue_properties_khr* /* properties */,
+                                                         cl_int* /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
+
+
+/******************************************
+* cl_nv_device_attribute_query extension *
+******************************************/
+/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
+#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV       0x4000
+#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV       0x4001
+#define CL_DEVICE_REGISTERS_PER_BLOCK_NV            0x4002
+#define CL_DEVICE_WARP_SIZE_NV                      0x4003
+#define CL_DEVICE_GPU_OVERLAP_NV                    0x4004
+#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV            0x4005
+#define CL_DEVICE_INTEGRATED_MEMORY_NV              0x4006
+
+/*********************************
+* cl_amd_device_memory_flags *
+*********************************/
+#define cl_amd_device_memory_flags 1
+
+#define CL_MEM_USE_PERSISTENT_MEM_AMD       (1 << 6)    // Alloc from GPU's CPU visible heap
+
+/* cl_device_info */
+#define CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT           0x4032
+
+/*********************************
+* cl_amd_device_attribute_query *
+*********************************/
+#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD        0x4036
+#define CL_DEVICE_TOPOLOGY_AMD                      0x4037
+#define CL_DEVICE_BOARD_NAME_AMD                    0x4038
+#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD            0x4039
+#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD         0x4040
+#define CL_DEVICE_SIMD_WIDTH_AMD                    0x4041
+#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD        0x4042
+#define CL_DEVICE_WAVEFRONT_WIDTH_AMD               0x4043
+#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD           0x4044
+#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD      0x4045
+#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD 0x4046
+#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD 0x4047
+#define CL_DEVICE_LOCAL_MEM_BANKS_AMD               0x4048
+
+typedef union
+{
+    struct { cl_uint type; cl_uint data[5]; } raw;
+    struct { cl_uint type; cl_char unused[17]; cl_char bus; cl_char device; cl_char function; } pcie;
+} cl_device_topology_amd;
+
+#define CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD            1
+
+
+/**************************
+* cl_amd_offline_devices *
+**************************/
+#define CL_CONTEXT_OFFLINE_DEVICES_AMD              0x403F
+
+/*********************************
+* cl_arm_printf extension
+*********************************/
+#define CL_PRINTF_CALLBACK_ARM                      0x40B0
+#define CL_PRINTF_BUFFERSIZE_ARM                    0x40B1
+
+#ifdef CL_VERSION_1_1
+    /***********************************
+     * cl_ext_device_fission extension *
+     ***********************************/
+    #define cl_ext_device_fission 1
+
+    extern CL_API_ENTRY cl_int CL_API_CALL
+    clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+    typedef CL_API_ENTRY cl_int
+    (CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+    extern CL_API_ENTRY cl_int CL_API_CALL
+    clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+    typedef CL_API_ENTRY cl_int
+    (CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+    typedef cl_ulong cl_device_partition_property_ext;
+    extern CL_API_ENTRY cl_int CL_API_CALL
+    clCreateSubDevicesEXT( cl_device_id /*in_device*/,
+                           const cl_device_partition_property_ext * /* properties */,
+                           cl_uint /*num_entries*/,
+                           cl_device_id * /*out_devices*/,
+                           cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+    typedef CL_API_ENTRY cl_int
+    ( CL_API_CALL * clCreateSubDevicesEXT_fn)( cl_device_id /*in_device*/,
+                                               const cl_device_partition_property_ext * /* properties */,
+                                               cl_uint /*num_entries*/,
+                                               cl_device_id * /*out_devices*/,
+                                               cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+    /* cl_device_partition_property_ext */
+    #define CL_DEVICE_PARTITION_EQUALLY_EXT             0x4050
+    #define CL_DEVICE_PARTITION_BY_COUNTS_EXT           0x4051
+    #define CL_DEVICE_PARTITION_BY_NAMES_EXT            0x4052
+    #define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT  0x4053
+
+    /* clDeviceGetInfo selectors */
+    #define CL_DEVICE_PARENT_DEVICE_EXT                 0x4054
+    #define CL_DEVICE_PARTITION_TYPES_EXT               0x4055
+    #define CL_DEVICE_AFFINITY_DOMAINS_EXT              0x4056
+    #define CL_DEVICE_REFERENCE_COUNT_EXT               0x4057
+    #define CL_DEVICE_PARTITION_STYLE_EXT               0x4058
+
+    /* error codes */
+    #define CL_DEVICE_PARTITION_FAILED_EXT              -1057
+    #define CL_INVALID_PARTITION_COUNT_EXT              -1058
+    #define CL_INVALID_PARTITION_NAME_EXT               -1059
+
+    /* CL_AFFINITY_DOMAINs */
+    #define CL_AFFINITY_DOMAIN_L1_CACHE_EXT             0x1
+    #define CL_AFFINITY_DOMAIN_L2_CACHE_EXT             0x2
+    #define CL_AFFINITY_DOMAIN_L3_CACHE_EXT             0x3
+    #define CL_AFFINITY_DOMAIN_L4_CACHE_EXT             0x4
+    #define CL_AFFINITY_DOMAIN_NUMA_EXT                 0x10
+    #define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT     0x100
+
+    /* cl_device_partition_property_ext list terminators */
+    #define CL_PROPERTIES_LIST_END_EXT                  ((cl_device_partition_property_ext) 0)
+    #define CL_PARTITION_BY_COUNTS_LIST_END_EXT         ((cl_device_partition_property_ext) 0)
+    #define CL_PARTITION_BY_NAMES_LIST_END_EXT          ((cl_device_partition_property_ext) 0 - 1)
+
+    /* cl_ext_atomic_counters_32 and cl_ext_atomic_counters_64 extensions
+     * no extension #define since they have no functions
+     */
+    #define CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT           0x4032
+
+/*********************************
+* cl_qcom_ext_host_ptr extension
+*********************************/
+
+#define CL_MEM_EXT_HOST_PTR_QCOM                  (1 << 29)
+
+#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM   0x40A0
+#define CL_DEVICE_PAGE_SIZE_QCOM                  0x40A1
+#define CL_IMAGE_ROW_ALIGNMENT_QCOM               0x40A2
+#define CL_IMAGE_SLICE_ALIGNMENT_QCOM             0x40A3
+#define CL_MEM_HOST_UNCACHED_QCOM                 0x40A4
+#define CL_MEM_HOST_WRITEBACK_QCOM                0x40A5
+#define CL_MEM_HOST_WRITETHROUGH_QCOM             0x40A6
+#define CL_MEM_HOST_WRITE_COMBINING_QCOM          0x40A7
+
+typedef cl_uint cl_image_pitch_info_qcom;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceImageInfoQCOM(cl_device_id device,
+                         size_t image_width,
+                         size_t image_height,
+                         const cl_image_format *image_format,
+                         cl_image_pitch_info_qcom param_name,
+                         size_t param_value_size,
+                         void *param_value,
+                         size_t *param_value_size_ret);
+
+typedef struct _cl_mem_ext_host_ptr
+{
+    /* Type of external memory allocation. */
+    /* Legal values will be defined in layered extensions. */
+    cl_uint allocation_type;
+
+    /* Host cache policy for this external memory allocation. */
+    cl_uint host_cache_policy;
+
+} cl_mem_ext_host_ptr;
+
+/*********************************
+* cl_qcom_ion_host_ptr extension
+*********************************/
+
+#define CL_MEM_ION_HOST_PTR_QCOM                  0x40A8
+
+typedef struct _cl_mem_ion_host_ptr
+{
+    /* Type of external memory allocation. */
+    /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */
+    cl_mem_ext_host_ptr ext_host_ptr;
+
+    /* ION file descriptor */
+    int ion_filedesc;
+
+    /* Host pointer to the ION allocated memory */
+    void* ion_hostptr;
+
+} cl_mem_ion_host_ptr;
+
+#endif /* CL_VERSION_1_1 */
+
+#if defined(CL_VERSION_1_2)
+
+/******************************************
+ * cl_img_yuv_image extension *
+ ******************************************/
+
+/* Image formats used in clCreateImage */
+#define CL_NV21_IMG                                 0x40D0
+#define CL_YV12_IMG                                 0x40D1
+
+/******************************************
+ * cl_img_cached_allocations extension *
+ ******************************************/
+
+/* Flag values used by clCreateBuffer */
+#define CL_MEM_USE_UNCACHED_CPU_MEMORY_IMG          (1 << 26)
+#define CL_MEM_USE_CACHED_CPU_MEMORY_IMG            (1 << 27)
+
+/******************************************
+ * cl_img_use_gralloc_ptr extension *
+ ******************************************/
+
+/* Flag values used by clCreateBuffer */
+#define CL_MEM_USE_GRALLOC_PTR_IMG                  (1 << 28)
+
+/* To be used by clGetEventInfo: */
+#define CL_COMMAND_ACQUIRE_GRALLOC_OBJECTS_IMG      0x40D2
+#define CL_COMMAND_RELEASE_GRALLOC_OBJECTS_IMG      0x40D3
+
+/* Error code from clEnqueueReleaseGrallocObjectsIMG */
+#define CL_GRALLOC_RESOURCE_NOT_ACQUIRED_IMG        0x40D4
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireGrallocObjectsIMG(cl_command_queue /* command_queue */,
+                                  cl_uint /* num_objects */,
+                                  const cl_mem * /* mem_objects */,
+                                  cl_uint /* num_events_in_wait_list */,
+                                  const cl_event * /* event_wait_list */,
+                                  cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseGrallocObjectsIMG(cl_command_queue /* command_queue */,
+                                  cl_uint /* num_objects */,
+                                  const cl_mem * /* mem_objects */,
+                                  cl_uint /* num_events_in_wait_list */,
+                                  const cl_event * /* event_wait_list */,
+                                  cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+
+#endif /* CL_VERSION_1_2 */
+
+#ifdef CL_VERSION_2_0
+/*********************************
+* cl_khr_subgroups extension
+*********************************/
+#define cl_khr_subgroups 1
+
+/* cl_kernel_sub_group_info is declared in CL.h. */
+
+/* cl_kernel_sub_group_info */
+#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR    0x2033
+#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR       0x2034
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelSubGroupInfoKHR(cl_kernel /* in_kernel */,
+                           cl_device_id /*in_device*/,
+                           cl_kernel_sub_group_info /* param_name */,
+                           size_t /*input_value_size*/,
+                           const void * /*input_value*/,
+                           size_t /*param_value_size*/,
+                           void* /*param_value*/,
+                           size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
+
+typedef CL_API_ENTRY cl_int
+( CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel /* in_kernel */,
+                                               cl_device_id /*in_device*/,
+                                               cl_kernel_sub_group_info /* param_name */,
+                                               size_t /*input_value_size*/,
+                                               const void * /*input_value*/,
+                                               size_t /*param_value_size*/,
+                                               void* /*param_value*/,
+                                               size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
+#endif /* CL_VERSION_2_0 */
+
+#ifdef CL_VERSION_2_1
+/*********************************
+* cl_khr_priority_hints extension
+*********************************/
+#define cl_khr_priority_hints 1
+
+typedef cl_uint cl_queue_priority_khr;
+
+/* cl_command_queue_properties */
+#define CL_QUEUE_PRIORITY_KHR 0x1096
+
+/* cl_queue_priority_khr */
+#define CL_QUEUE_PRIORITY_HIGH_KHR (1<<0)
+#define CL_QUEUE_PRIORITY_MED_KHR (1<<1)
+#define CL_QUEUE_PRIORITY_LOW_KHR (1<<2)
+
+#endif /* CL_VERSION_2_1 */
+
+#ifdef CL_VERSION_2_1
+/*********************************
+* cl_khr_throttle_hints extension
+*********************************/
+#define cl_khr_throttle_hints 1
+
+typedef cl_uint cl_queue_throttle_khr;
+
+/* cl_command_queue_properties */
+#define CL_QUEUE_THROTTLE_KHR 0x1097
+
+/* cl_queue_throttle_khr */
+#define CL_QUEUE_THROTTLE_HIGH_KHR (1<<0)
+#define CL_QUEUE_THROTTLE_MED_KHR (1<<1)
+#define CL_QUEUE_THROTTLE_LOW_KHR (1<<2)
+
+#endif /* CL_VERSION_2_1 */
+
+#ifdef CL_VERSION_2_2
+/*********************************
+* cl_khr_subgroup_named_barrier
+*********************************/
+#define cl_khr_subgroup_named_barrier 1
+
+/* cl_device_info */
+#define CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR       0x2035
+
+#endif /* CL_VERSION_2_2 */
+
+/**********************************
+ * cl_arm_import_memory extension *
+ **********************************/
+
+#ifdef CL_VERSION_1_0
+
+typedef intptr_t cl_import_properties_arm;
+
+/* Default and valid properties name for cl_arm_import_memory */
+#define CL_IMPORT_TYPE_ARM                        0x40B2
+
+/* Host process memory type default value for CL_IMPORT_TYPE_ARM property */
+#define CL_IMPORT_TYPE_HOST_ARM                   0x40B3
+
+/* DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */
+#define CL_IMPORT_TYPE_DMA_BUF_ARM                0x40B4
+
+/* Secure DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */
+#define CL_IMPORT_TYPE_SECURE_ARM                 0x40B5
+
+/* This extension adds a new function that allows for direct memory import into
+ * OpenCL via the clImportMemoryARM function.
+ *
+ * Memory imported through this interface will be mapped into the device's page
+ * tables directly, providing zero copy access. It will never fall back to copy
+ * operations and aliased buffers.
+ *
+ * Types of memory supported for import are specified as additional extension
+ * strings.
+ *
+ * This extension produces cl_mem allocations which are compatible with all other
+ * users of cl_mem in the standard API.
+ *
+ * This extension maps pages with the same properties as the normal buffer creation
+ * function clCreateBuffer.
+ */
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clImportMemoryARM( cl_context context,
+                   cl_mem_flags flags,
+                   const cl_import_properties_arm *properties,
+                   void *memory,
+                   size_t size,
+                   cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_0;
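+
+/* [Editor's note] Illustrative sketch, not part of the upstream header:
+ * zero-copy import of a Linux dma_buf. For CL_IMPORT_TYPE_DMA_BUF_ARM the
+ * `memory` argument is understood to be a pointer to the file descriptor --
+ * an assumption worth checking against Arm's documentation. Names are
+ * hypothetical.
+ */
+static cl_mem
+example_import_dma_buf(cl_context context, int dma_buf_fd, size_t size,
+                       cl_int *errcode_ret)
+{
+    const cl_import_properties_arm properties[] = {
+        CL_IMPORT_TYPE_ARM, CL_IMPORT_TYPE_DMA_BUF_ARM, 0
+    };
+    return clImportMemoryARM(context, CL_MEM_READ_WRITE, properties,
+                             &dma_buf_fd, size, errcode_ret);
+}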
+
+
+#endif /* CL_VERSION_1_0 */
+
+/******************************************
+ * cl_arm_shared_virtual_memory extension *
+ ******************************************/
+
+#ifdef CL_VERSION_1_2
+
+/* Used by clGetDeviceInfo */
+#define CL_DEVICE_SVM_CAPABILITIES_ARM                  0x40B6
+
+/* Used by clGetMemObjectInfo */
+#define CL_MEM_USES_SVM_POINTER_ARM                     0x40B7
+
+/* Used by clSetKernelExecInfoARM: */
+#define CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM                0x40B8
+#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM   0x40B9
+
+/* To be used by clGetEventInfo: */
+#define CL_COMMAND_SVM_FREE_ARM                         0x40BA
+#define CL_COMMAND_SVM_MEMCPY_ARM                       0x40BB
+#define CL_COMMAND_SVM_MEMFILL_ARM                      0x40BC
+#define CL_COMMAND_SVM_MAP_ARM                          0x40BD
+#define CL_COMMAND_SVM_UNMAP_ARM                        0x40BE
+
+/* Flag values returned by clGetDeviceInfo with CL_DEVICE_SVM_CAPABILITIES_ARM as the param_name. */
+#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_ARM           (1 << 0)
+#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM             (1 << 1)
+#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_ARM             (1 << 2)
+#define CL_DEVICE_SVM_ATOMICS_ARM                       (1 << 3)
+
+/* Flag values used by clSVMAllocARM: */
+#define CL_MEM_SVM_FINE_GRAIN_BUFFER_ARM                (1 << 10)
+#define CL_MEM_SVM_ATOMICS_ARM                          (1 << 11)
+
+typedef cl_bitfield cl_svm_mem_flags_arm;
+typedef cl_uint cl_kernel_exec_info_arm;
+typedef cl_bitfield cl_device_svm_capabilities_arm;
+
+extern CL_API_ENTRY void * CL_API_CALL
+clSVMAllocARM(cl_context /* context */,
+              cl_svm_mem_flags_arm /* flags */,
+              size_t /* size */,
+              cl_uint /* alignment */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY void CL_API_CALL
+clSVMFreeARM(cl_context /* context */,
+             void * /* svm_pointer */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMFreeARM(cl_command_queue /* command_queue */,
+                    cl_uint /* num_svm_pointers */,
+                    void *[] /* svm_pointers[] */,
+                    void (CL_CALLBACK * /*pfn_free_func*/)(cl_command_queue /* queue */,
+                                                           cl_uint /* num_svm_pointers */,
+                                                           void *[] /* svm_pointers[] */,
+                                                           void * /* user_data */),
+                    void * /* user_data */,
+                    cl_uint /* num_events_in_wait_list */,
+                    const cl_event * /* event_wait_list */,
+                    cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMemcpyARM(cl_command_queue /* command_queue */,
+                      cl_bool /* blocking_copy */,
+                      void * /* dst_ptr */,
+                      const void * /* src_ptr */,
+                      size_t /* size */,
+                      cl_uint /* num_events_in_wait_list */,
+                      const cl_event * /* event_wait_list */,
+                      cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMemFillARM(cl_command_queue /* command_queue */,
+                       void * /* svm_ptr */,
+                       const void * /* pattern */,
+                       size_t /* pattern_size */,
+                       size_t /* size */,
+                       cl_uint /* num_events_in_wait_list */,
+                       const cl_event * /* event_wait_list */,
+                       cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMapARM(cl_command_queue /* command_queue */,
+                   cl_bool /* blocking_map */,
+                   cl_map_flags /* flags */,
+                   void * /* svm_ptr */,
+                   size_t /* size */,
+                   cl_uint /* num_events_in_wait_list */,
+                   const cl_event * /* event_wait_list */,
+                   cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMUnmapARM(cl_command_queue /* command_queue */,
+                     void * /* svm_ptr */,
+                     cl_uint /* num_events_in_wait_list */,
+                     const cl_event * /* event_wait_list */,
+                     cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelArgSVMPointerARM(cl_kernel /* kernel */,
+                            cl_uint /* arg_index */,
+                            const void * /* arg_value */) CL_EXT_SUFFIX__VERSION_1_2;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelExecInfoARM(cl_kernel /* kernel */,
+                       cl_kernel_exec_info_arm /* param_name */,
+                       size_t /* param_value_size */,
+                       const void * /* param_value */) CL_EXT_SUFFIX__VERSION_1_2;
+
+#endif /* CL_VERSION_1_2 */
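+
+/* [Editor's note] Illustrative sketch, not part of the upstream header:
+ * the minimal coarse-grain SVM call sequence, shown purely for the API
+ * shape; a real program would enqueue the kernel before freeing. Names are
+ * hypothetical.
+ */
+#ifdef CL_VERSION_1_2
+static cl_int
+example_svm_bind_arg(cl_context context, cl_kernel kernel, size_t bytes)
+{
+    void *ptr = clSVMAllocARM(context, CL_MEM_READ_WRITE, bytes, 0);
+    cl_int status;
+    if (ptr == NULL)
+        return CL_OUT_OF_RESOURCES;
+    status = clSetKernelArgSVMPointerARM(kernel, 0, ptr);
+    clSVMFreeARM(context, ptr); /* immediate free: demonstration only */
+    return status;
+}
+#endif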
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* __CL_EXT_H */
diff --git a/include/triton/external/CL/cl_ext_intel.h b/include/triton/external/CL/cl_ext_intel.h
new file mode 100644
index 000000000..1c358cfc1
--- /dev/null
+++ b/include/triton/external/CL/cl_ext_intel.h
@@ -0,0 +1,429 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2017 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ * https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+/*****************************************************************************\
+
+Copyright (c) 2013-2017 Intel Corporation All Rights Reserved.
+
+THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
+MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+File Name: cl_ext_intel.h
+
+Abstract:
+
+Notes:
+
+\*****************************************************************************/
+
+#ifndef __CL_EXT_INTEL_H
+#define __CL_EXT_INTEL_H
+
+#ifdef __APPLE__
+    #include <OpenCL/cl.h>
+    #include <OpenCL/cl_platform.h>
+#else
+    #include <CL/cl.h>
+    #include <CL/cl_platform.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/***************************************
+* cl_intel_thread_local_exec extension *
+****************************************/
+
+#define cl_intel_thread_local_exec 1
+
+#define CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL      (((cl_bitfield)1) << 31)
+
+/***********************************************
+* cl_intel_device_partition_by_names extension *
+************************************************/
+
+#define cl_intel_device_partition_by_names 1
+
+#define CL_DEVICE_PARTITION_BY_NAMES_INTEL           0x4052
+#define CL_PARTITION_BY_NAMES_LIST_END_INTEL         -1
+
+/************************************************
+* cl_intel_accelerator extension                *
+* cl_intel_motion_estimation extension          *
+* cl_intel_advanced_motion_estimation extension *
+*************************************************/
+
+#define cl_intel_accelerator 1
+#define cl_intel_motion_estimation 1
+#define cl_intel_advanced_motion_estimation 1
+
+typedef struct _cl_accelerator_intel* cl_accelerator_intel;
+typedef cl_uint cl_accelerator_type_intel;
+typedef cl_uint cl_accelerator_info_intel;
+
+typedef struct _cl_motion_estimation_desc_intel {
+    cl_uint mb_block_type;
+    cl_uint subpixel_mode;
+    cl_uint sad_adjust_mode;
+    cl_uint search_path_type;
+} cl_motion_estimation_desc_intel;
+
+/* error codes */
+#define CL_INVALID_ACCELERATOR_INTEL                 -1094
+#define CL_INVALID_ACCELERATOR_TYPE_INTEL            -1095
+#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL      -1096
+#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL      -1097
+
+/* cl_accelerator_type_intel */
+#define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL  0x0
+
+/* cl_accelerator_info_intel */
+#define CL_ACCELERATOR_DESCRIPTOR_INTEL              0x4090
+#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL         0x4091
+#define CL_ACCELERATOR_CONTEXT_INTEL                 0x4092
+#define CL_ACCELERATOR_TYPE_INTEL                    0x4093
+
+/* cl_motion_detect_desc_intel flags */
+#define CL_ME_MB_TYPE_16x16_INTEL                    0x0
+#define CL_ME_MB_TYPE_8x8_INTEL                      0x1
+#define CL_ME_MB_TYPE_4x4_INTEL                      0x2
+
+#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL            0x0
+#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL               0x1
+#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL               0x2
+
+#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL             0x0
+#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL             0x1
+
+#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL           0x0
+#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL           0x1
+#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL         0x5
+
+#define CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL            0x0
+#define CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL     0x1
+#define CL_ME_LUMA_INTRA_PREDICT_ENABLED_INTEL       0x2
+#define CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL              0x4
+
+#define CL_ME_FORWARD_INPUT_MODE_INTEL               0x1
+#define CL_ME_BACKWARD_INPUT_MODE_INTEL              0x2
+#define CL_ME_BIDIRECTION_INPUT_MODE_INTEL           0x3
+
+#define CL_ME_BIDIR_WEIGHT_QUARTER_INTEL             16
+#define CL_ME_BIDIR_WEIGHT_THIRD_INTEL               21
+#define CL_ME_BIDIR_WEIGHT_HALF_INTEL                32
+#define CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL           43
+#define CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL       48
+
+#define CL_ME_COST_PENALTY_NONE_INTEL                0x0
+#define CL_ME_COST_PENALTY_LOW_INTEL                 0x1
+#define CL_ME_COST_PENALTY_NORMAL_INTEL              0x2
+#define CL_ME_COST_PENALTY_HIGH_INTEL                0x3
+
+#define CL_ME_COST_PRECISION_QPEL_INTEL              0x0
+#define CL_ME_COST_PRECISION_HPEL_INTEL              0x1
+#define CL_ME_COST_PRECISION_PEL_INTEL               0x2
+#define CL_ME_COST_PRECISION_DPEL_INTEL              0x3
+
+#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL            0x0
+#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL          0x1
+#define CL_ME_LUMA_PREDICTOR_MODE_DC_INTEL                  0x2
+#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL  0x3
+
+#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
+#define CL_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL               0x4
+#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL      0x5
+#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL     0x6
+#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL       0x7
+#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL       0x8
+
+#define CL_ME_CHROMA_PREDICTOR_MODE_DC_INTEL                0x0
+#define CL_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL        0x1
+#define CL_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL          0x2
+#define CL_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL             0x3
+
+/* cl_device_info */
+#define CL_DEVICE_ME_VERSION_INTEL                   0x407E
+
+#define CL_ME_VERSION_LEGACY_INTEL                   0x0
+#define CL_ME_VERSION_ADVANCED_VER_1_INTEL           0x1
+#define CL_ME_VERSION_ADVANCED_VER_2_INTEL           0x2
+
+extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL
+clCreateAcceleratorINTEL(
+    cl_context /* context */,
+    cl_accelerator_type_intel /* accelerator_type */,
+    size_t /* descriptor_size */,
+    const void* /* descriptor */,
+    cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_accelerator_intel (CL_API_CALL *clCreateAcceleratorINTEL_fn)(
+    cl_context /* context */,
+    cl_accelerator_type_intel /* accelerator_type */,
+    size_t /* descriptor_size */,
+    const void* /* descriptor */,
+    cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetAcceleratorInfoINTEL(
+    cl_accelerator_intel /* accelerator */,
+    cl_accelerator_info_intel /* param_name */,
+    size_t /* param_value_size */,
+    void* /* param_value */,
+    size_t* /* param_value_size_ret */) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetAcceleratorInfoINTEL_fn)(
+    cl_accelerator_intel /* accelerator */,
+    cl_accelerator_info_intel /* param_name */,
+    size_t /* param_value_size */,
+    void* /* param_value */,
+    size_t* /* param_value_size_ret */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainAcceleratorINTEL(
+    cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clRetainAcceleratorINTEL_fn)(
+    cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseAcceleratorINTEL(
+    cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clReleaseAcceleratorINTEL_fn)(
+    cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
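+
+/* [Editor's note] Illustrative sketch, not part of the upstream header:
+ * filling the motion-estimation descriptor defined above and creating an
+ * accelerator from it. The helper name is hypothetical.
+ */
+static cl_accelerator_intel
+example_create_me_accelerator(cl_context context, cl_int *errcode_ret)
+{
+    cl_motion_estimation_desc_intel desc;
+    desc.mb_block_type    = CL_ME_MB_TYPE_16x16_INTEL;
+    desc.subpixel_mode    = CL_ME_SUBPIXEL_MODE_INTEGER_INTEL;
+    desc.sad_adjust_mode  = CL_ME_SAD_ADJUST_MODE_NONE_INTEL;
+    desc.search_path_type = CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL;
+    return clCreateAcceleratorINTEL(context,
+                                    CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL,
+                                    sizeof(desc), &desc, errcode_ret);
+}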
+
+/******************************************
+* cl_intel_simultaneous_sharing extension *
+*******************************************/
+
+#define cl_intel_simultaneous_sharing 1
+
+#define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL        0x4104
+#define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL    0x4105
+
+/***********************************
+* cl_intel_egl_image_yuv extension *
+************************************/
+
+#define cl_intel_egl_image_yuv 1
+
+#define CL_EGL_YUV_PLANE_INTEL                       0x4107
+
+/********************************
+* cl_intel_packed_yuv extension *
+*********************************/
+
+#define cl_intel_packed_yuv 1
+
+#define CL_YUYV_INTEL                                0x4076
+#define CL_UYVY_INTEL                                0x4077
+#define CL_YVYU_INTEL                                0x4078
+#define CL_VYUY_INTEL                                0x4079
+
+/********************************************
+* cl_intel_required_subgroup_size extension *
+*********************************************/
+
+#define cl_intel_required_subgroup_size 1
+
+#define CL_DEVICE_SUB_GROUP_SIZES_INTEL              0x4108
+#define CL_KERNEL_SPILL_MEM_SIZE_INTEL               0x4109
+#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL       0x410A
+
+/****************************************
+* cl_intel_driver_diagnostics extension *
+*****************************************/
+
+#define cl_intel_driver_diagnostics 1
+
+typedef cl_uint cl_diagnostics_verbose_level;
+
+#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL            0x4106
+
+#define CL_CONTEXT_DIAGNOSTICS_LEVEL_ALL_INTEL       ( 0xff )
+#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL      ( 1 )
+#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL       ( 1 << 1 )
+#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL   ( 1 << 2 )
+
+/********************************
+* cl_intel_planar_yuv extension *
+*********************************/
+
+#define CL_NV12_INTEL                                0x410E
+
+#define CL_MEM_NO_ACCESS_INTEL                       ( 1 << 24 )
+#define CL_MEM_ACCESS_FLAGS_UNRESTRICTED_INTEL       ( 1 << 25 )
+
+#define CL_DEVICE_PLANAR_YUV_MAX_WIDTH_INTEL         0x417E
+#define CL_DEVICE_PLANAR_YUV_MAX_HEIGHT_INTEL        0x417F
+
+/*******************************************************
+* cl_intel_device_side_avc_motion_estimation extension *
+********************************************************/
+
+#define CL_DEVICE_AVC_ME_VERSION_INTEL               0x410B
+#define CL_DEVICE_AVC_ME_SUPPORTS_TEXTURE_SAMPLER_USE_INTEL  0x410C
+#define CL_DEVICE_AVC_ME_SUPPORTS_PREEMPTION_INTEL   0x410D
+
+#define CL_AVC_ME_VERSION_0_INTEL                    0x0  // No support.
+#define CL_AVC_ME_VERSION_1_INTEL                    0x1  // First supported version.
+
+#define CL_AVC_ME_MAJOR_16x16_INTEL                  0x0
+#define CL_AVC_ME_MAJOR_16x8_INTEL                   0x1
+#define CL_AVC_ME_MAJOR_8x16_INTEL                   0x2
+#define CL_AVC_ME_MAJOR_8x8_INTEL                    0x3
+
+#define CL_AVC_ME_MINOR_8x8_INTEL                    0x0
+#define CL_AVC_ME_MINOR_8x4_INTEL                    0x1
+#define CL_AVC_ME_MINOR_4x8_INTEL                    0x2
+#define CL_AVC_ME_MINOR_4x4_INTEL                    0x3
+
+#define CL_AVC_ME_MAJOR_FORWARD_INTEL                0x0
+#define CL_AVC_ME_MAJOR_BACKWARD_INTEL               0x1
+#define CL_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL          0x2
+
+#define CL_AVC_ME_PARTITION_MASK_ALL_INTEL           0x0
+#define CL_AVC_ME_PARTITION_MASK_16x16_INTEL         0x7E
+#define CL_AVC_ME_PARTITION_MASK_16x8_INTEL          0x7D
+#define CL_AVC_ME_PARTITION_MASK_8x16_INTEL          0x7B
+#define CL_AVC_ME_PARTITION_MASK_8x8_INTEL           0x77
+#define CL_AVC_ME_PARTITION_MASK_8x4_INTEL           0x6F
+#define CL_AVC_ME_PARTITION_MASK_4x8_INTEL           0x5F
+#define CL_AVC_ME_PARTITION_MASK_4x4_INTEL           0x3F
+
+#define CL_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL     0x0
+#define CL_AVC_ME_SEARCH_WINDOW_SMALL_INTEL          0x1
+#define CL_AVC_ME_SEARCH_WINDOW_TINY_INTEL           0x2
+#define CL_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL     0x3
+#define CL_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL        0x4
+#define CL_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL  0x5
+#define CL_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL      0x6
+#define CL_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL      0x7
+#define CL_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL         0x8
+#define CL_AVC_ME_SEARCH_WINDOW_16x12_RADIUS_INTEL   0x9
+#define CL_AVC_ME_SEARCH_WINDOW_4x4_RADIUS_INTEL     0x2
+#define CL_AVC_ME_SEARCH_WINDOW_2x2_RADIUS_INTEL     0xa
+
+#define CL_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL         0x0
+#define CL_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL         0x2
+
+#define CL_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL        0x0
+#define CL_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL           0x1
+#define CL_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL           0x3
+
+#define CL_AVC_ME_COST_PRECISION_QPEL_INTEL          0x0
+#define CL_AVC_ME_COST_PRECISION_HPEL_INTEL          0x1
+#define CL_AVC_ME_COST_PRECISION_PEL_INTEL           0x2
+#define CL_AVC_ME_COST_PRECISION_DPEL_INTEL          0x3
+
+#define CL_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL         0x10
+#define CL_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL           0x15
+#define CL_AVC_ME_BIDIR_WEIGHT_HALF_INTEL            0x20
+#define CL_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL       0x2B
+#define CL_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL   0x30
+
+#define CL_AVC_ME_BORDER_REACHED_LEFT_INTEL          0x0
+#define CL_AVC_ME_BORDER_REACHED_RIGHT_INTEL         0x2
+#define CL_AVC_ME_BORDER_REACHED_TOP_INTEL           0x4
+#define CL_AVC_ME_BORDER_REACHED_BOTTOM_INTEL        0x8
+
+#define CL_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL   0x0
+#define CL_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL     0x4000
+
+#define CL_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL   ( 0x1 << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL  ( 0x2 << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL      ( 0x3 << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL     ( 0x55 << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL    ( 0xAA << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL        ( 0xFF << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL   ( 0x1 << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL  ( 0x2 << 24 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL   ( 0x1 << 26 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL  ( 0x2 << 26 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL   ( 0x1 << 28 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL  ( 0x2 << 28 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL   ( 0x1 << 30 )
+#define CL_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL  ( 0x2 << 30 )
+
+#define CL_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL         0x00
+#define CL_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL         0x80
+
+#define CL_AVC_ME_INTRA_16x16_INTEL                  0x0
+#define CL_AVC_ME_INTRA_8x8_INTEL                    0x1
+#define CL_AVC_ME_INTRA_4x4_INTEL                    0x2
+
+#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL   0x6
+#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL     0x5
+#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL     0x3
+
+#define CL_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL        0x60
+#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL       0x10
+#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8
+#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL  0x4
+
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL            0x0
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL          0x1
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL                  0x2
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL  0x3
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL               0x4
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL      0x5
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL     0x6
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL       0x7
+#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL       0x8
+#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL                0x0
+#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL        0x1
+#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL          0x2
+#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL             0x3
+
+#define CL_AVC_ME_FRAME_FORWARD_INTEL                0x1
+#define CL_AVC_ME_FRAME_BACKWARD_INTEL               0x2
+#define CL_AVC_ME_FRAME_DUAL_INTEL                   0x3
+
+#define CL_AVC_ME_SLICE_TYPE_PRED_INTEL              0x0
+#define CL_AVC_ME_SLICE_TYPE_BPRED_INTEL             0x1
+#define CL_AVC_ME_SLICE_TYPE_INTRA_INTEL             0x2
+
+#define CL_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL    0x0
+#define CL_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __CL_EXT_INTEL_H */
+
diff --git a/include/triton/external/CL/cl_gl.h b/include/triton/external/CL/cl_gl.h
new file mode 100644
index 000000000..945daa83d
--- /dev/null
+++ b/include/triton/external/CL/cl_gl.h
@@ -0,0 +1,167 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2015 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ * https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+#ifndef __OPENCL_CL_GL_H
+#define __OPENCL_CL_GL_H
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef cl_uint cl_gl_object_type;
+typedef cl_uint cl_gl_texture_info;
+typedef cl_uint cl_gl_platform_info;
+typedef struct __GLsync *cl_GLsync;
+
+/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */
+#define CL_GL_OBJECT_BUFFER                     0x2000
+#define CL_GL_OBJECT_TEXTURE2D                  0x2001
+#define CL_GL_OBJECT_TEXTURE3D                  0x2002
+#define CL_GL_OBJECT_RENDERBUFFER               0x2003
+#define CL_GL_OBJECT_TEXTURE2D_ARRAY            0x200E
+#define CL_GL_OBJECT_TEXTURE1D                  0x200F
+#define CL_GL_OBJECT_TEXTURE1D_ARRAY            0x2010
+#define CL_GL_OBJECT_TEXTURE_BUFFER             0x2011
+
+/* cl_gl_texture_info */
+#define CL_GL_TEXTURE_TARGET                    0x2004
+#define CL_GL_MIPMAP_LEVEL                      0x2005
+#define CL_GL_NUM_SAMPLES                       0x2012
+
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLBuffer(cl_context /* context */,
+                     cl_mem_flags /* flags */,
+                     cl_GLuint /* bufobj */,
+                     int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLTexture(cl_context /* context */,
+                      cl_mem_flags /* flags */,
+                      cl_GLenum /* target */,
+                      cl_GLint /* miplevel */,
+                      cl_GLuint /* texture */,
+                      cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLRenderbuffer(cl_context /* context */,
+                           cl_mem_flags /* flags */,
+                           cl_GLuint /* renderbuffer */,
+                           cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLObjectInfo(cl_mem /* memobj */,
+                  cl_gl_object_type * /* gl_object_type */,
+                  cl_GLuint * /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0;
+clGetGLTextureInfo(cl_mem /* memobj */, + cl_gl_texture_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireGLObjects(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseGLObjects(cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem * /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0; + + +/* Deprecated OpenCL 1.1 APIs */ +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateFromGLTexture2D(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLenum /* target */, + cl_GLint /* miplevel */, + cl_GLuint /* texture */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateFromGLTexture3D(cl_context /* context */, + cl_mem_flags /* flags */, + cl_GLenum /* target */, + cl_GLint /* miplevel */, + cl_GLuint /* texture */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +/* cl_khr_gl_sharing extension */ + +#define cl_khr_gl_sharing 1 + +typedef cl_uint cl_gl_context_info; + +/* Additional Error Codes */ +#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000 + +/* cl_gl_context_info */ +#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006 +#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007 + +/* Additional cl_context_properties */ +#define CL_GL_CONTEXT_KHR 0x2008 +#define CL_EGL_DISPLAY_KHR 0x2009 +#define CL_GLX_DISPLAY_KHR 0x200A +#define CL_WGL_HDC_KHR 0x200B +#define CL_CGL_SHAREGROUP_KHR 0x200C + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLContextInfoKHR(const cl_context_properties * /* properties */, + cl_gl_context_info /* param_name */, + size_t /* param_value_size */, + void * /* param_value */, + size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)( + const cl_context_properties * properties, + cl_gl_context_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret); + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_GL_H */ diff --git a/include/triton/external/CL/cl_gl_ext.h b/include/triton/external/CL/cl_gl_ext.h new file mode 100644 index 000000000..e3c14c640 --- /dev/null +++ b/include/triton/external/CL/cl_gl_ext.h @@ -0,0 +1,74 @@ +/********************************************************************************** + * Copyright (c) 2008-2015 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +/* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have */ +/* OpenGL dependencies. */ + +#ifndef __OPENCL_CL_GL_EXT_H +#define __OPENCL_CL_GL_EXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __APPLE__ + #include +#else + #include +#endif + +/* + * For each extension, follow this template + * cl_VEN_extname extension */ +/* #define cl_VEN_extname 1 + * ... define new types, if any + * ... define new tokens, if any + * ... define new APIs, if any + * + * If you need GLtypes here, mirror them with a cl_GLtype, rather than including a GL header + * This allows us to avoid having to decide whether to include GL headers or GLES here. + */ + +/* + * cl_khr_gl_event extension + * See section 9.9 in the OpenCL 1.1 spec for more information + */ +#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateEventFromGLsyncKHR(cl_context /* context */, + cl_GLsync /* cl_GLsync */, + cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_GL_EXT_H */ diff --git a/include/triton/external/CL/cl_platform.h b/include/triton/external/CL/cl_platform.h new file mode 100644 index 000000000..33ffb8cdc --- /dev/null +++ b/include/triton/external/CL/cl_platform.h @@ -0,0 +1,1458 @@ +/********************************************************************************** + * Copyright (c) 2008-2015 The Khronos Group Inc. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ + +/* $Revision: 11803 $ on $Date: 2010-06-25 10:02:12 -0700 (Fri, 25 Jun 2010) $ */ + +#ifndef __CL_PLATFORM_H +#define __CL_PLATFORM_H + +#ifdef __APPLE__ + /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */ + #include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_WIN32) + #define CL_API_ENTRY + #define CL_API_CALL __stdcall + #define CL_CALLBACK __stdcall +#else + #define CL_API_ENTRY + #define CL_API_CALL + #define CL_CALLBACK +#endif + +/* + * Deprecation flags refer to the last version of the header in which the + * feature was not deprecated. + * + * E.g. VERSION_1_1_DEPRECATED means the feature is present in 1.1 without + * deprecation but is deprecated in versions later than 1.1. + */ + +#ifdef __APPLE__ + #define CL_EXTENSION_WEAK_LINK __attribute__((weak_import)) + #define CL_API_SUFFIX__VERSION_1_0 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + + #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 + #else + #warning This path should never happen outside of internal operating system development. AvailabilityMacros do not function correctly here! 
+ #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #endif +#else + #define CL_EXTENSION_WEAK_LINK + #define CL_API_SUFFIX__VERSION_1_0 + #define CL_EXT_SUFFIX__VERSION_1_0 + #define CL_API_SUFFIX__VERSION_1_1 + #define CL_EXT_SUFFIX__VERSION_1_1 + #define CL_API_SUFFIX__VERSION_1_2 + #define CL_EXT_SUFFIX__VERSION_1_2 + #define CL_API_SUFFIX__VERSION_2_0 + #define CL_EXT_SUFFIX__VERSION_2_0 + #define CL_API_SUFFIX__VERSION_2_1 + #define CL_EXT_SUFFIX__VERSION_2_1 + #define CL_API_SUFFIX__VERSION_2_2 + #define CL_EXT_SUFFIX__VERSION_2_2 + + #ifdef __GNUC__ + #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS + #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS + #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED + #endif + #elif defined(_WIN32) + #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED __declspec(deprecated) + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED __declspec(deprecated) + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS + #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED __declspec(deprecated) + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED __declspec(deprecated) + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS + #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED + #define 
CL_EXT_PREFIX__VERSION_2_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED __declspec(deprecated) + #endif + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + + #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED + + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED + + #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED + #endif +#endif + +#if (defined (_WIN32) && defined(_MSC_VER)) + +/* scalar types */ +typedef signed __int8 cl_char; +typedef unsigned __int8 cl_uchar; +typedef signed __int16 cl_short; +typedef unsigned __int16 cl_ushort; +typedef signed __int32 cl_int; +typedef unsigned __int32 cl_uint; +typedef signed __int64 cl_long; +typedef unsigned __int64 cl_ulong; + +typedef unsigned __int16 cl_half; +typedef float cl_float; +typedef double cl_double; + +/* Macro names and corresponding values defined by OpenCL */ +#define CL_CHAR_BIT 8 +#define CL_SCHAR_MAX 127 +#define CL_SCHAR_MIN (-127-1) +#define CL_CHAR_MAX CL_SCHAR_MAX +#define CL_CHAR_MIN CL_SCHAR_MIN +#define CL_UCHAR_MAX 255 +#define CL_SHRT_MAX 32767 +#define CL_SHRT_MIN (-32767-1) +#define CL_USHRT_MAX 65535 +#define CL_INT_MAX 2147483647 +#define CL_INT_MIN (-2147483647-1) +#define CL_UINT_MAX 0xffffffffU +#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) +#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) +#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) + +#define CL_FLT_DIG 6 +#define CL_FLT_MANT_DIG 24 +#define CL_FLT_MAX_10_EXP +38 +#define CL_FLT_MAX_EXP +128 +#define CL_FLT_MIN_10_EXP -37 +#define CL_FLT_MIN_EXP -125 +#define CL_FLT_RADIX 2 +#define CL_FLT_MAX 340282346638528859811704183484516925440.0f +#define CL_FLT_MIN 1.175494350822287507969e-38f +#define CL_FLT_EPSILON 1.1920928955078125e-7f + +#define CL_HALF_DIG 3 +#define CL_HALF_MANT_DIG 11 +#define CL_HALF_MAX_10_EXP +4 +#define CL_HALF_MAX_EXP +16 +#define CL_HALF_MIN_10_EXP -4 +#define CL_HALF_MIN_EXP -13 +#define CL_HALF_RADIX 2 +#define CL_HALF_MAX 65504.0f +#define CL_HALF_MIN 6.103515625e-05f +#define CL_HALF_EPSILON 9.765625e-04f + +#define CL_DBL_DIG 15 +#define CL_DBL_MANT_DIG 53 +#define CL_DBL_MAX_10_EXP +308 +#define CL_DBL_MAX_EXP +1024 +#define CL_DBL_MIN_10_EXP -307 +#define CL_DBL_MIN_EXP -1021 +#define CL_DBL_RADIX 2 +#define CL_DBL_MAX 1.7976931348623158e+308 +#define CL_DBL_MIN 2.225073858507201383090e-308 +#define CL_DBL_EPSILON 2.220446049250313080847e-16 + +#define CL_M_E 2.7182818284590452354 +#define CL_M_LOG2E 1.4426950408889634074 +#define CL_M_LOG10E 0.43429448190325182765 +#define CL_M_LN2 0.69314718055994530942 +#define CL_M_LN10 2.30258509299404568402 +#define CL_M_PI 3.14159265358979323846 +#define CL_M_PI_2 1.57079632679489661923 +#define CL_M_PI_4 0.78539816339744830962 +#define CL_M_1_PI 0.31830988618379067154 +#define CL_M_2_PI 0.63661977236758134308 +#define CL_M_2_SQRTPI 1.12837916709551257390 +#define CL_M_SQRT2 1.41421356237309504880 +#define CL_M_SQRT1_2 0.70710678118654752440 + +#define CL_M_E_F 2.718281828f +#define CL_M_LOG2E_F 1.442695041f +#define CL_M_LOG10E_F 0.434294482f +#define CL_M_LN2_F 0.693147181f +#define CL_M_LN10_F 2.302585093f +#define CL_M_PI_F 3.141592654f +#define CL_M_PI_2_F 1.570796327f +#define CL_M_PI_4_F 0.785398163f +#define 
CL_M_1_PI_F 0.318309886f +#define CL_M_2_PI_F 0.636619772f +#define CL_M_2_SQRTPI_F 1.128379167f +#define CL_M_SQRT2_F 1.414213562f +#define CL_M_SQRT1_2_F 0.707106781f + +#define CL_NAN (CL_INFINITY - CL_INFINITY) +#define CL_HUGE_VALF ((cl_float) 1e50) +#define CL_HUGE_VAL ((cl_double) 1e500) +#define CL_MAXFLOAT CL_FLT_MAX +#define CL_INFINITY CL_HUGE_VALF + +#else + +#include + +/* scalar types */ +typedef int8_t cl_char; +typedef uint8_t cl_uchar; +typedef int16_t cl_short __attribute__((aligned(2))); +typedef uint16_t cl_ushort __attribute__((aligned(2))); +typedef int32_t cl_int __attribute__((aligned(4))); +typedef uint32_t cl_uint __attribute__((aligned(4))); +typedef int64_t cl_long __attribute__((aligned(8))); +typedef uint64_t cl_ulong __attribute__((aligned(8))); + +typedef uint16_t cl_half __attribute__((aligned(2))); +typedef float cl_float __attribute__((aligned(4))); +typedef double cl_double __attribute__((aligned(8))); + +/* Macro names and corresponding values defined by OpenCL */ +#define CL_CHAR_BIT 8 +#define CL_SCHAR_MAX 127 +#define CL_SCHAR_MIN (-127-1) +#define CL_CHAR_MAX CL_SCHAR_MAX +#define CL_CHAR_MIN CL_SCHAR_MIN +#define CL_UCHAR_MAX 255 +#define CL_SHRT_MAX 32767 +#define CL_SHRT_MIN (-32767-1) +#define CL_USHRT_MAX 65535 +#define CL_INT_MAX 2147483647 +#define CL_INT_MIN (-2147483647-1) +#define CL_UINT_MAX 0xffffffffU +#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) +#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) +#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) + +#define CL_FLT_DIG 6 +#define CL_FLT_MANT_DIG 24 +#define CL_FLT_MAX_10_EXP +38 +#define CL_FLT_MAX_EXP +128 +#define CL_FLT_MIN_10_EXP -37 +#define CL_FLT_MIN_EXP -125 +#define CL_FLT_RADIX 2 +#define CL_FLT_MAX 340282346638528859811704183484516925440.0f +#define CL_FLT_MIN 1.175494350822287507969e-38f +#define CL_FLT_EPSILON 1.1920928955078125e-7f + +#define CL_HALF_DIG 3 +#define CL_HALF_MANT_DIG 11 +#define CL_HALF_MAX_10_EXP +4 +#define CL_HALF_MAX_EXP +16 +#define CL_HALF_MIN_10_EXP -4 +#define CL_HALF_MIN_EXP -13 +#define CL_HALF_RADIX 2 +#define CL_HALF_MAX 65504.0f +#define CL_HALF_MIN 6.103515625e-05f +#define CL_HALF_EPSILON 9.765625e-04f + +#define CL_DBL_DIG 15 +#define CL_DBL_MANT_DIG 53 +#define CL_DBL_MAX_10_EXP +308 +#define CL_DBL_MAX_EXP +1024 +#define CL_DBL_MIN_10_EXP -307 +#define CL_DBL_MIN_EXP -1021 +#define CL_DBL_RADIX 2 +#define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0 +#define CL_DBL_MIN 2.225073858507201383090e-308 +#define CL_DBL_EPSILON 2.220446049250313080847e-16 + +#define CL_M_E 2.7182818284590452354 +#define CL_M_LOG2E 1.4426950408889634074 +#define CL_M_LOG10E 0.43429448190325182765 +#define CL_M_LN2 0.69314718055994530942 +#define CL_M_LN10 2.30258509299404568402 +#define CL_M_PI 3.14159265358979323846 +#define CL_M_PI_2 1.57079632679489661923 +#define CL_M_PI_4 0.78539816339744830962 +#define CL_M_1_PI 0.31830988618379067154 +#define CL_M_2_PI 0.63661977236758134308 +#define CL_M_2_SQRTPI 1.12837916709551257390 +#define CL_M_SQRT2 1.41421356237309504880 +#define CL_M_SQRT1_2 0.70710678118654752440 + +#define CL_M_E_F 2.718281828f +#define CL_M_LOG2E_F 1.442695041f +#define CL_M_LOG10E_F 0.434294482f +#define CL_M_LN2_F 0.693147181f +#define CL_M_LN10_F 
2.302585093f
+#define CL_M_PI_F           3.141592654f
+#define CL_M_PI_2_F         1.570796327f
+#define CL_M_PI_4_F         0.785398163f
+#define CL_M_1_PI_F         0.318309886f
+#define CL_M_2_PI_F         0.636619772f
+#define CL_M_2_SQRTPI_F     1.128379167f
+#define CL_M_SQRT2_F        1.414213562f
+#define CL_M_SQRT1_2_F      0.707106781f
+
+#if defined( __GNUC__ )
+   #define CL_HUGE_VALF     __builtin_huge_valf()
+   #define CL_HUGE_VAL      __builtin_huge_val()
+   #define CL_NAN           __builtin_nanf( "" )
+#else
+   #define CL_HUGE_VALF     ((cl_float) 1e50)
+   #define CL_HUGE_VAL      ((cl_double) 1e500)
+   float nanf( const char * );
+   #define CL_NAN           nanf( "" )
+#endif
+#define CL_MAXFLOAT CL_FLT_MAX
+#define CL_INFINITY CL_HUGE_VALF
+
+#endif
+
+#include <stddef.h>
+
+/* Mirror types to GL types. Mirror types allow us to avoid deciding which headers to load based on whether we are using GL or GLES here. */
+typedef unsigned int cl_GLuint;
+typedef int          cl_GLint;
+typedef unsigned int cl_GLenum;
+
+/*
+ * Vector types
+ *
+ *  Note:   OpenCL requires that all types be naturally aligned.
+ *          This means that vector types must be naturally aligned.
+ *          For example, a vector of four floats must be aligned to
+ *          a 16 byte boundary (calculated as 4 * the natural 4-byte
+ *          alignment of the float). The alignment qualifiers here
+ *          will only function properly if your compiler supports them
+ *          and if you don't actively work to defeat them. For example,
+ *          in order for a cl_float4 to be 16 byte aligned in a struct,
+ *          the start of the struct must itself be 16-byte aligned.
+ *
+ *          Maintaining proper alignment is the user's responsibility.
+ */
+
+/* Define basic vector types */
+#if defined( __VEC__ )
+   #include <altivec.h>   /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */
+   typedef __vector unsigned char     __cl_uchar16;
+   typedef __vector signed char       __cl_char16;
+   typedef __vector unsigned short    __cl_ushort8;
+   typedef __vector signed short      __cl_short8;
+   typedef __vector unsigned int      __cl_uint4;
+   typedef __vector signed int        __cl_int4;
+   typedef __vector float             __cl_float4;
+   #define  __CL_UCHAR16__  1
+   #define  __CL_CHAR16__   1
+   #define  __CL_USHORT8__  1
+   #define  __CL_SHORT8__   1
+   #define  __CL_UINT4__    1
+   #define  __CL_INT4__     1
+   #define  __CL_FLOAT4__   1
+#endif
+
+#if defined( __SSE__ )
+   #if defined( __MINGW64__ )
+      #include <intrin.h>
+   #else
+      #include <xmmintrin.h>
+   #endif
+   #if defined( __GNUC__ )
+      typedef float __cl_float4   __attribute__((vector_size(16)));
+   #else
+      typedef __m128 __cl_float4;
+   #endif
+   #define __CL_FLOAT4__   1
+#endif
+
+#if defined( __SSE2__ )
+   #if defined( __MINGW64__ )
+      #include <intrin.h>
+   #else
+      #include <emmintrin.h>
+   #endif
+   #if defined( __GNUC__ )
+      typedef cl_uchar    __cl_uchar16    __attribute__((vector_size(16)));
+      typedef cl_char     __cl_char16     __attribute__((vector_size(16)));
+      typedef cl_ushort   __cl_ushort8    __attribute__((vector_size(16)));
+      typedef cl_short    __cl_short8    __attribute__((vector_size(16)));
+      typedef cl_uint     __cl_uint4      __attribute__((vector_size(16)));
+      typedef cl_int      __cl_int4       __attribute__((vector_size(16)));
+      typedef cl_ulong    __cl_ulong2     __attribute__((vector_size(16)));
+      typedef cl_long     __cl_long2      __attribute__((vector_size(16)));
+      typedef cl_double   __cl_double2    __attribute__((vector_size(16)));
+   #else
+      typedef __m128i __cl_uchar16;
+      typedef __m128i __cl_char16;
+      typedef __m128i __cl_ushort8;
+      typedef __m128i __cl_short8;
+      typedef __m128i __cl_uint4;
+      typedef __m128i __cl_int4;
+      typedef __m128i __cl_ulong2;
+      typedef __m128i __cl_long2;
+      typedef __m128d __cl_double2;
+   #endif
+   #define __CL_UCHAR16__  1
+   #define
__CL_CHAR16__ 1 + #define __CL_USHORT8__ 1 + #define __CL_SHORT8__ 1 + #define __CL_INT4__ 1 + #define __CL_UINT4__ 1 + #define __CL_ULONG2__ 1 + #define __CL_LONG2__ 1 + #define __CL_DOUBLE2__ 1 +#endif + +#if defined( __MMX__ ) + #include + #if defined( __GNUC__ ) + typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8))); + typedef cl_char __cl_char8 __attribute__((vector_size(8))); + typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8))); + typedef cl_short __cl_short4 __attribute__((vector_size(8))); + typedef cl_uint __cl_uint2 __attribute__((vector_size(8))); + typedef cl_int __cl_int2 __attribute__((vector_size(8))); + typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8))); + typedef cl_long __cl_long1 __attribute__((vector_size(8))); + typedef cl_float __cl_float2 __attribute__((vector_size(8))); + #else + typedef __m64 __cl_uchar8; + typedef __m64 __cl_char8; + typedef __m64 __cl_ushort4; + typedef __m64 __cl_short4; + typedef __m64 __cl_uint2; + typedef __m64 __cl_int2; + typedef __m64 __cl_ulong1; + typedef __m64 __cl_long1; + typedef __m64 __cl_float2; + #endif + #define __CL_UCHAR8__ 1 + #define __CL_CHAR8__ 1 + #define __CL_USHORT4__ 1 + #define __CL_SHORT4__ 1 + #define __CL_INT2__ 1 + #define __CL_UINT2__ 1 + #define __CL_ULONG1__ 1 + #define __CL_LONG1__ 1 + #define __CL_FLOAT2__ 1 +#endif + +#if defined( __AVX__ ) + #if defined( __MINGW64__ ) + #include + #else + #include + #endif + #if defined( __GNUC__ ) + typedef cl_float __cl_float8 __attribute__((vector_size(32))); + typedef cl_double __cl_double4 __attribute__((vector_size(32))); + #else + typedef __m256 __cl_float8; + typedef __m256d __cl_double4; + #endif + #define __CL_FLOAT8__ 1 + #define __CL_DOUBLE4__ 1 +#endif + +/* Define capabilities for anonymous struct members. */ +#if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L +#define __CL_HAS_ANON_STRUCT__ 1 +#define __CL_ANON_STRUCT__ +#elif defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) +#define __CL_HAS_ANON_STRUCT__ 1 +#define __CL_ANON_STRUCT__ __extension__ +#elif defined( _WIN32) && defined(_MSC_VER) + #if _MSC_VER >= 1500 + /* Microsoft Developer Studio 2008 supports anonymous structs, but + * complains by default. 
*/ + #define __CL_HAS_ANON_STRUCT__ 1 + #define __CL_ANON_STRUCT__ + /* Disable warning C4201: nonstandard extension used : nameless + * struct/union */ + #pragma warning( push ) + #pragma warning( disable : 4201 ) + #endif +#else +#define __CL_HAS_ANON_STRUCT__ 0 +#define __CL_ANON_STRUCT__ +#endif + +/* Define alignment keys */ +#if defined( __GNUC__ ) + #define CL_ALIGNED(_x) __attribute__ ((aligned(_x))) +#elif defined( _WIN32) && (_MSC_VER) + /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */ + /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */ + /* #include */ + /* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */ + #define CL_ALIGNED(_x) +#else + #warning Need to implement some method to align data here + #define CL_ALIGNED(_x) +#endif + +/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */ +#if __CL_HAS_ANON_STRUCT__ + /* .xyzw and .s0123...{f|F} are supported */ + #define CL_HAS_NAMED_VECTOR_FIELDS 1 + /* .hi and .lo are supported */ + #define CL_HAS_HI_LO_VECTOR_FIELDS 1 +#endif + +/* Define cl_vector types */ + +/* ---- cl_charn ---- */ +typedef union +{ + cl_char CL_ALIGNED(2) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_char lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2; +#endif +}cl_char2; + +typedef union +{ + cl_char CL_ALIGNED(4) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[2]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4; +#endif +}cl_char4; + +/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. 
*/ +typedef cl_char4 cl_char3; + +typedef union +{ + cl_char CL_ALIGNED(8) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[4]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4[2]; +#endif +#if defined( __CL_CHAR8__ ) + __cl_char8 v8; +#endif +}cl_char8; + +typedef union +{ + cl_char CL_ALIGNED(16) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[8]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4[4]; +#endif +#if defined( __CL_CHAR8__ ) + __cl_char8 v8[2]; +#endif +#if defined( __CL_CHAR16__ ) + __cl_char16 v16; +#endif +}cl_char16; + + +/* ---- cl_ucharn ---- */ +typedef union +{ + cl_uchar CL_ALIGNED(2) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_uchar lo, hi; }; +#endif +#if defined( __cl_uchar2__) + __cl_uchar2 v2; +#endif +}cl_uchar2; + +typedef union +{ + cl_uchar CL_ALIGNED(4) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[2]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4; +#endif +}cl_uchar4; + +/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. 
*/ +typedef cl_uchar4 cl_uchar3; + +typedef union +{ + cl_uchar CL_ALIGNED(8) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[4]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4[2]; +#endif +#if defined( __CL_UCHAR8__ ) + __cl_uchar8 v8; +#endif +}cl_uchar8; + +typedef union +{ + cl_uchar CL_ALIGNED(16) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[8]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4[4]; +#endif +#if defined( __CL_UCHAR8__ ) + __cl_uchar8 v8[2]; +#endif +#if defined( __CL_UCHAR16__ ) + __cl_uchar16 v16; +#endif +}cl_uchar16; + + +/* ---- cl_shortn ---- */ +typedef union +{ + cl_short CL_ALIGNED(4) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_short lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2; +#endif +}cl_short2; + +typedef union +{ + cl_short CL_ALIGNED(8) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[2]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4; +#endif +}cl_short4; + +/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. 
*/ +typedef cl_short4 cl_short3; + +typedef union +{ + cl_short CL_ALIGNED(16) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[4]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4[2]; +#endif +#if defined( __CL_SHORT8__ ) + __cl_short8 v8; +#endif +}cl_short8; + +typedef union +{ + cl_short CL_ALIGNED(32) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[8]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4[4]; +#endif +#if defined( __CL_SHORT8__ ) + __cl_short8 v8[2]; +#endif +#if defined( __CL_SHORT16__ ) + __cl_short16 v16; +#endif +}cl_short16; + + +/* ---- cl_ushortn ---- */ +typedef union +{ + cl_ushort CL_ALIGNED(4) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_ushort lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2; +#endif +}cl_ushort2; + +typedef union +{ + cl_ushort CL_ALIGNED(8) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[2]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4; +#endif +}cl_ushort4; + +/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. 
*/ +typedef cl_ushort4 cl_ushort3; + +typedef union +{ + cl_ushort CL_ALIGNED(16) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[4]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4[2]; +#endif +#if defined( __CL_USHORT8__ ) + __cl_ushort8 v8; +#endif +}cl_ushort8; + +typedef union +{ + cl_ushort CL_ALIGNED(32) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[8]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4[4]; +#endif +#if defined( __CL_USHORT8__ ) + __cl_ushort8 v8[2]; +#endif +#if defined( __CL_USHORT16__ ) + __cl_ushort16 v16; +#endif +}cl_ushort16; + + +/* ---- cl_halfn ---- */ +typedef union +{ + cl_half CL_ALIGNED(4) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_half lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2; +#endif +}cl_half2; + +typedef union +{ + cl_half CL_ALIGNED(8) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_half2 lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2[2]; +#endif +#if defined( __CL_HALF4__) + __cl_half4 v4; +#endif +}cl_half4; + +/* cl_half3 is identical in size, alignment and behavior to cl_half4. See section 6.1.5. 
*/ +typedef cl_half4 cl_half3; + +typedef union +{ + cl_half CL_ALIGNED(16) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_half4 lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2[4]; +#endif +#if defined( __CL_HALF4__) + __cl_half4 v4[2]; +#endif +#if defined( __CL_HALF8__ ) + __cl_half8 v8; +#endif +}cl_half8; + +typedef union +{ + cl_half CL_ALIGNED(32) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_half8 lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2[8]; +#endif +#if defined( __CL_HALF4__) + __cl_half4 v4[4]; +#endif +#if defined( __CL_HALF8__ ) + __cl_half8 v8[2]; +#endif +#if defined( __CL_HALF16__ ) + __cl_half16 v16; +#endif +}cl_half16; + +/* ---- cl_intn ---- */ +typedef union +{ + cl_int CL_ALIGNED(8) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_int lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2; +#endif +}cl_int2; + +typedef union +{ + cl_int CL_ALIGNED(16) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[2]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4; +#endif +}cl_int4; + +/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. 
*/ +typedef cl_int4 cl_int3; + +typedef union +{ + cl_int CL_ALIGNED(32) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[4]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4[2]; +#endif +#if defined( __CL_INT8__ ) + __cl_int8 v8; +#endif +}cl_int8; + +typedef union +{ + cl_int CL_ALIGNED(64) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[8]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4[4]; +#endif +#if defined( __CL_INT8__ ) + __cl_int8 v8[2]; +#endif +#if defined( __CL_INT16__ ) + __cl_int16 v16; +#endif +}cl_int16; + + +/* ---- cl_uintn ---- */ +typedef union +{ + cl_uint CL_ALIGNED(8) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_uint lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2; +#endif +}cl_uint2; + +typedef union +{ + cl_uint CL_ALIGNED(16) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[2]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4; +#endif +}cl_uint4; + +/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. 
*/ +typedef cl_uint4 cl_uint3; + +typedef union +{ + cl_uint CL_ALIGNED(32) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[4]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4[2]; +#endif +#if defined( __CL_UINT8__ ) + __cl_uint8 v8; +#endif +}cl_uint8; + +typedef union +{ + cl_uint CL_ALIGNED(64) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[8]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4[4]; +#endif +#if defined( __CL_UINT8__ ) + __cl_uint8 v8[2]; +#endif +#if defined( __CL_UINT16__ ) + __cl_uint16 v16; +#endif +}cl_uint16; + +/* ---- cl_longn ---- */ +typedef union +{ + cl_long CL_ALIGNED(16) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_long lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2; +#endif +}cl_long2; + +typedef union +{ + cl_long CL_ALIGNED(32) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[2]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4; +#endif +}cl_long4; + +/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. 
*/ +typedef cl_long4 cl_long3; + +typedef union +{ + cl_long CL_ALIGNED(64) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[4]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4[2]; +#endif +#if defined( __CL_LONG8__ ) + __cl_long8 v8; +#endif +}cl_long8; + +typedef union +{ + cl_long CL_ALIGNED(128) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[8]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4[4]; +#endif +#if defined( __CL_LONG8__ ) + __cl_long8 v8[2]; +#endif +#if defined( __CL_LONG16__ ) + __cl_long16 v16; +#endif +}cl_long16; + + +/* ---- cl_ulongn ---- */ +typedef union +{ + cl_ulong CL_ALIGNED(16) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_ulong lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2; +#endif +}cl_ulong2; + +typedef union +{ + cl_ulong CL_ALIGNED(32) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[2]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4; +#endif +}cl_ulong4; + +/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. 
*/ +typedef cl_ulong4 cl_ulong3; + +typedef union +{ + cl_ulong CL_ALIGNED(64) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[4]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4[2]; +#endif +#if defined( __CL_ULONG8__ ) + __cl_ulong8 v8; +#endif +}cl_ulong8; + +typedef union +{ + cl_ulong CL_ALIGNED(128) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[8]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4[4]; +#endif +#if defined( __CL_ULONG8__ ) + __cl_ulong8 v8[2]; +#endif +#if defined( __CL_ULONG16__ ) + __cl_ulong16 v16; +#endif +}cl_ulong16; + + +/* --- cl_floatn ---- */ + +typedef union +{ + cl_float CL_ALIGNED(8) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_float lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2; +#endif +}cl_float2; + +typedef union +{ + cl_float CL_ALIGNED(16) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_float2 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[2]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4; +#endif +}cl_float4; + +/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. 
*/ +typedef cl_float4 cl_float3; + +typedef union +{ + cl_float CL_ALIGNED(32) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_float4 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[4]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4[2]; +#endif +#if defined( __CL_FLOAT8__ ) + __cl_float8 v8; +#endif +}cl_float8; + +typedef union +{ + cl_float CL_ALIGNED(64) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[8]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4[4]; +#endif +#if defined( __CL_FLOAT8__ ) + __cl_float8 v8[2]; +#endif +#if defined( __CL_FLOAT16__ ) + __cl_float16 v16; +#endif +}cl_float16; + +/* --- cl_doublen ---- */ + +typedef union +{ + cl_double CL_ALIGNED(16) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_double lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2; +#endif +}cl_double2; + +typedef union +{ + cl_double CL_ALIGNED(32) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[2]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4; +#endif +}cl_double4; + +/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */ +typedef cl_double4 cl_double3; + +typedef union +{ + cl_double CL_ALIGNED(64) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[4]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4[2]; +#endif +#if defined( __CL_DOUBLE8__ ) + __cl_double8 v8; +#endif +}cl_double8; + +typedef union +{ + cl_double CL_ALIGNED(128) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[8]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4[4]; +#endif +#if defined( __CL_DOUBLE8__ ) + __cl_double8 v8[2]; +#endif +#if defined( __CL_DOUBLE16__ ) + __cl_double16 v16; +#endif +}cl_double16; + +/* Macro to facilitate debugging + * Usage: + * Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source. 
+ * The first line ends with: CL_PROGRAM_STRING_DEBUG_INFO \" + * Each line thereafter of OpenCL C source must end with: \n\ + * The last line ends in "; + * + * Example: + * + * const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\ + * kernel void foo( int a, float * b ) \n\ + * { \n\ + * // my comment \n\ + * *b[ get_global_id(0)] = a; \n\ + * } \n\ + * "; + * + * This should correctly set up the line, (column) and file information for your source + * string so you can do source level debugging. + */ +#define __CL_STRINGIFY( _x ) # _x +#define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x ) +#define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n" + +#ifdef __cplusplus +} +#endif + +#undef __CL_HAS_ANON_STRUCT__ +#undef __CL_ANON_STRUCT__ +#if defined( _WIN32) && defined(_MSC_VER) + #if _MSC_VER >=1500 + #pragma warning( pop ) + #endif +#endif + +#endif /* __CL_PLATFORM_H */ diff --git a/include/triton/external/CL/cl_va_api_media_sharing_intel.h b/include/triton/external/CL/cl_va_api_media_sharing_intel.h new file mode 100644 index 000000000..284442885 --- /dev/null +++ b/include/triton/external/CL/cl_va_api_media_sharing_intel.h @@ -0,0 +1,172 @@ +/********************************************************************************** + * Copyright (c) 2008-2016 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ +/*****************************************************************************\ + +Copyright (c) 2013-2016 Intel Corporation All Rights Reserved. + +THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL INTEL OR ITS +CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE +MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +File Name: cl_va_api_media_sharing_intel.h + +Abstract: + +Notes: + +\*****************************************************************************/ + + +#ifndef __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H +#define __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/****************************************** +* cl_intel_va_api_media_sharing extension * +*******************************************/ + +#define cl_intel_va_api_media_sharing 1 + +/* error codes */ +#define CL_INVALID_VA_API_MEDIA_ADAPTER_INTEL -1098 +#define CL_INVALID_VA_API_MEDIA_SURFACE_INTEL -1099 +#define CL_VA_API_MEDIA_SURFACE_ALREADY_ACQUIRED_INTEL -1100 +#define CL_VA_API_MEDIA_SURFACE_NOT_ACQUIRED_INTEL -1101 + +/* cl_va_api_device_source_intel */ +#define CL_VA_API_DISPLAY_INTEL 0x4094 + +/* cl_va_api_device_set_intel */ +#define CL_PREFERRED_DEVICES_FOR_VA_API_INTEL 0x4095 +#define CL_ALL_DEVICES_FOR_VA_API_INTEL 0x4096 + +/* cl_context_info */ +#define CL_CONTEXT_VA_API_DISPLAY_INTEL 0x4097 + +/* cl_mem_info */ +#define CL_MEM_VA_API_MEDIA_SURFACE_INTEL 0x4098 + +/* cl_image_info */ +#define CL_IMAGE_VA_API_PLANE_INTEL 0x4099 + +/* cl_command_type */ +#define CL_COMMAND_ACQUIRE_VA_API_MEDIA_SURFACES_INTEL 0x409A +#define CL_COMMAND_RELEASE_VA_API_MEDIA_SURFACES_INTEL 0x409B + +typedef cl_uint cl_va_api_device_source_intel; +typedef cl_uint cl_va_api_device_set_intel; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceIDsFromVA_APIMediaAdapterINTEL( + cl_platform_id /* platform */, + cl_va_api_device_source_intel /* media_adapter_type */, + void* /* media_adapter */, + cl_va_api_device_set_intel /* media_adapter_set */, + cl_uint /* num_entries */, + cl_device_id* /* devices */, + cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL * clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn)( + cl_platform_id /* platform */, + cl_va_api_device_source_intel /* media_adapter_type */, + void* /* media_adapter */, + cl_va_api_device_set_intel /* media_adapter_set */, + cl_uint /* num_entries */, + cl_device_id* /* devices */, + cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromVA_APIMediaSurfaceINTEL( + cl_context /* context */, + cl_mem_flags /* flags */, + VASurfaceID* /* surface */, + cl_uint /* plane */, + cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL * clCreateFromVA_APIMediaSurfaceINTEL_fn)( + cl_context /* context */, + cl_mem_flags /* flags */, + VASurfaceID* /* surface */, + cl_uint /* plane */, + cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireVA_APIMediaSurfacesINTEL( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef 
CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn)( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseVA_APIMediaSurfacesINTEL( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn)( + cl_command_queue /* command_queue */, + cl_uint /* num_objects */, + const cl_mem* /* mem_objects */, + cl_uint /* num_events_in_wait_list */, + const cl_event* /* event_wait_list */, + cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_2; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H */ + diff --git a/include/triton/external/CL/opencl.h b/include/triton/external/CL/opencl.h new file mode 100644 index 000000000..9855cd75e --- /dev/null +++ b/include/triton/external/CL/opencl.h @@ -0,0 +1,59 @@ +/******************************************************************************* + * Copyright (c) 2008-2015 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS + * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS + * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT + * https://www.khronos.org/registry/ + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ ******************************************************************************/ + +/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */ + +#ifndef __OPENCL_H +#define __OPENCL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __APPLE__ + +#include +#include +#include +#include + +#else + +#include +#include +#include +#include + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_H */ + diff --git a/include/triton/jit.h b/include/triton/jit.h index 0d90d63b0..a2c63bbf8 100644 --- a/include/triton/jit.h +++ b/include/triton/jit.h @@ -39,7 +39,7 @@ public: std::vector global_range_size; unsigned num_threads; }; - typedef std::function benchmark_t; + typedef std::function benchmark_t; struct passes_wrapper { passes_wrapper(): shared(&buffer_info), liveness(&buffer_info), @@ -74,17 +74,17 @@ private: std::unique_ptr make_triton_module(const std::string &src); public: - jit(driver::context context); + jit(driver::context* context); void autotune(const std::string &src, benchmark_t benchmark); void add_module(ir::module &module, const std::vector& params = {}); void add_module(const std::string &src, const std::vector& params = {}); - driver::kernel get_function(const std::string &name); + driver::cu_kernel get_function(const std::string &name); launch_information get_launch_info(const std::string &name); unsigned get_int(const std::string &name); private: - std::vector modules_; - driver::context driver_context_; + std::vector modules_; + driver::context* driver_context_; llvm::LLVMContext llvm_context_; ir::context triton_context_; std::map launch_info_map_; diff --git a/lib/driver/backend.cpp b/lib/driver/backend.cpp index b5094790f..726548c1c 100755 --- a/lib/driver/backend.cpp +++ b/lib/driver/backend.cpp @@ -37,6 +37,58 @@ namespace triton namespace driver { +/*-----------------------------------*/ +//----------- Platforms ------------*/ +/*-----------------------------------*/ + +void backend::platforms::init() { + if(!cache_.empty()) + return; + //if CUDA is here + if(dispatch::cuinit()){ + cache_.push_back(new cu_platform()); + } + //if OpenCL is here + if(dispatch::clinit()){ + cl_uint num_platforms; + dispatch::clGetPlatformIDs(0, nullptr, &num_platforms); + std::vector ids(num_platforms); + dispatch::clGetPlatformIDs(num_platforms, ids.data(), nullptr); + for(cl_platform_id id: ids) + cache_.push_back(new cl_platform(id)); + } + if(cache_.empty()) + throw std::runtime_error("ISAAC: No backend available. Make sure CUDA is available in your library path"); +} + +void backend::platforms::get(std::vector &results) { + std::copy(cache_.begin(), cache_.end(), std::back_inserter(results)); +} + +std::vector backend::platforms::cache_; + + +/*-----------------------------------*/ +//----------- Devices --------------*/ +/*-----------------------------------*/ + +void backend::devices::init(std::vector const & platforms) { + if(!cache_.empty()) + return; + for(driver::platform* pf: platforms) + pf->devices(cache_); + if(cache_.empty()) + throw std::runtime_error("ISAAC: No device available. 
Make sure that your platform is configured properly"); +} + +void backend::devices::get(std::vector &devs) { + std::copy(cache_.begin(), cache_.end(), std::back_inserter(devs)); +} + +std::vector backend::devices::cache_; + + + /*-----------------------------------*/ //---------- Modules ----------------*/ /*-----------------------------------*/ @@ -47,14 +99,14 @@ void backend::modules::release(){ cache_.clear(); } -module& backend::modules::get(driver::stream const & stream, std::string const & name, std::string const & src){ - std::tuple key(stream, name); +driver::module* backend::modules::get(driver::stream* stream, std::string const & name, std::string const & src){ + std::tuple key(stream, name); if(cache_.find(key)==cache_.end()) - return *cache_.insert(std::make_pair(key, new module(stream.context(), src))).first->second; - return *cache_.at(key); + return &*cache_.insert(std::make_pair(key, new driver::cu_module(((driver::cu_stream*)stream)->context(), src))).first->second; + return &*cache_.at(key); } -std::map, module * > backend::modules::cache_; +std::map, driver::module*> backend::modules::cache_; /*-----------------------------------*/ //----------- Kernels --------------*/ @@ -66,23 +118,23 @@ void backend::kernels::release(){ cache_.clear(); } -kernel & backend::kernels::get(driver::module const & program, std::string const & name){ - std::tuple key(program, name); +driver::kernel* backend::kernels::get(driver::module *mod, std::string const & name){ + std::tuple key(mod, name); if(cache_.find(key)==cache_.end()) - return *cache_.insert(std::make_pair(key, new kernel(program, name.c_str()))).first->second; - return *cache_.at(key); + return &*cache_.insert(std::make_pair(key, new driver::cu_kernel((driver::cu_module*)mod, name.c_str()))).first->second; + return cache_.at(key); } -std::map, kernel * > backend::kernels::cache_; +std::map, driver::kernel*> backend::kernels::cache_; /*-----------------------------------*/ //------------ Queues --------------*/ /*-----------------------------------*/ -void backend::streams::init(std::list const & contexts){ - for(context const * ctx : contexts) - if(cache_.find(*ctx)==cache_.end()) - cache_.insert(std::make_pair(*ctx, std::vector{new stream(*ctx)})); +void backend::streams::init(std::list const & contexts){ + for(driver::context* ctx : contexts) + if(cache_.find(ctx)==cache_.end()) + cache_.insert(std::make_pair(ctx, std::vector{new driver::cu_stream(ctx)})); } void backend::streams::release(){ @@ -92,33 +144,31 @@ void backend::streams::release(){ cache_.clear(); } -stream & backend::streams::get_default() +driver::stream* backend::streams::get_default() { return get(contexts::get_default(), 0); } -stream & backend::streams::get(driver::context const & context, unsigned int id){ - init(std::list(1,&context)); +driver::stream* backend::streams::get(driver::context* context, unsigned int id){ + init(std::list(1,context)); for(auto & x : cache_) if(x.first==context) - return *x.second[id]; + return x.second[id]; throw; } -void backend::streams::get(driver::context const & context, std::vector & queues){ - init(std::list(1,&context)); +void backend::streams::get(driver::context* context, std::vector & queues){ + init(std::list(1,context)); queues = cache_.at(context); } -std::map > backend::streams::cache_; +std::map> backend::streams::cache_; /*-----------------------------------*/ //------------ Contexts ------------*/ /*-----------------------------------*/ -void backend::contexts::init(std::vector const & platforms){ - 
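/* The cached getters above share one memoization pattern: get() keys a static
   map by (parent object, name) and constructs the resource on first use. A
   minimal call-side sketch, with "gemm" and "kernel" as placeholder names:

     driver::stream* stream = backend::streams::get_default();
     driver::module* module = backend::modules::get(stream, "gemm", src);
     driver::kernel* kernel = backend::kernels::get(module, "kernel");
*/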
for(platform const & platform: platforms){ - for(device const & device: platform.devices()) - cache_.push_back(new context(device)); - } +void backend::contexts::init(std::vector const & devices){ + for(driver::device* dvc: devices) + cache_.push_back(new cu_context(dvc)); } void backend::contexts::release(){ @@ -127,19 +177,19 @@ void backend::contexts::release(){ cache_.clear(); } -driver::context const & backend::contexts::get_default(){ +driver::context* backend::contexts::get_default(){ backend::init(); - std::list::const_iterator it = cache_.begin(); + auto it = cache_.begin(); std::advance(it, default_device); - return **it; + return *it; } -void backend::contexts::get(std::list & contexts){ +void backend::contexts::get(std::list & contexts){ backend::init(); contexts = cache_; } -std::list backend::contexts::cache_; +std::list backend::contexts::cache_; @@ -147,28 +197,8 @@ std::list backend::contexts::cache_; //------------ General -------------*/ /*-----------------------------------*/ -std::vector backend::devices(){ - std::vector platforms = backend::platforms(); - std::vector result; - for(platform const & platform: platforms){ - auto devices = platform.devices(); - result.insert(result.end(), devices.begin(), devices.end()); - } - return result; -} - -std::vector backend::platforms(){ - std::vector platforms; - //if CUDA is here - if(dispatch::cuinit()) - platforms.push_back(platform()); - if(platforms.empty()) - throw std::runtime_error("ISAAC: No backend available. Make sure CUDA is available in your library path"); - return platforms; -} - -void backend::synchronize(driver::context const & context){ - for(stream * queue: streams::cache_.at(context)) +void backend::synchronize(driver::context* context){ + for(driver::stream * queue: streams::cache_.at(context)) queue->synchronize(); } @@ -184,8 +214,13 @@ void backend::release(){ void backend::init(){ if(!contexts::cache_.empty()) return; - std::vector platforms = backend::platforms(); - contexts::init(platforms); + // initialize platforms + backend::platforms::init(); + // initialize devices + backend::devices::init(platforms::cache_); + // initialize contexts + backend::contexts::init(devices::cache_); + // initialize streams streams::init(contexts::cache_); } diff --git a/lib/driver/buffer.cpp b/lib/driver/buffer.cpp index 129565a27..520347c7d 100755 --- a/lib/driver/buffer.cpp +++ b/lib/driver/buffer.cpp @@ -33,28 +33,46 @@ namespace triton namespace driver { -buffer::buffer(driver::context const & context, size_t size) : context_(context) -{ - ContextSwitcher ctx_switch(context_); + +// + +buffer::buffer(driver::context* ctx, CUdeviceptr cu, bool take_ownership) + : polymorphic_resource(cu, take_ownership), context_(ctx) { } + +buffer::buffer(driver::context* ctx, cl_mem cl, bool take_ownership) + : polymorphic_resource(cl, take_ownership), context_(ctx) { } + +driver::context* buffer::context() { + return context_; +} + +// + +ocl_buffer::ocl_buffer(driver::context* context, size_t size) + : buffer(context, cl_mem(), true){ + cl_int err; + dispatch::clCreateBuffer(*context->cl(), CL_MEM_READ_WRITE, size, NULL, &err); +} + + +// + +cu_buffer::cu_buffer(driver::context* context, size_t size) + : buffer(context, CUdeviceptr(), true) { + cu_context::context_switcher ctx_switch(*context_); dispatch::cuMemAlloc(&*cu_, size); } -buffer::buffer(driver::context const & context, CUdeviceptr cu, bool take_ownership): - context_(context), cu_(cu, take_ownership) -{ } +cu_buffer::cu_buffer(driver::context* context, CUdeviceptr cu, 
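/* Note on ocl_buffer above: clCreateBuffer returns the new cl_mem, so the
   result presumably needs to be captured in *cl_ rather than discarded --
   the other OpenCL constructors in this series assign through *cl_. A
   corrected sketch:

     ocl_buffer::ocl_buffer(driver::context* context, size_t size)
       : buffer(context, cl_mem(), true) {
       cl_int err;
       *cl_ = dispatch::clCreateBuffer(*context->cl(), CL_MEM_READ_WRITE,
                                       size, NULL, &err);
     }
*/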
bool take_ownership) + : buffer(context, cu, take_ownership){ +} -void buffer::set_zero(stream const & queue, size_t size) +void cu_buffer::set_zero(cu_stream const & queue, size_t size) { - ContextSwitcher ctx_switch(context_); - dispatch::cuMemsetD8Async(*cu_, 0, size, queue); + cu_context::context_switcher ctx_switch(*context_); + dispatch::cuMemsetD8Async(*cu_, 0, size, *queue.cu()); } -handle const & buffer::cu() const -{ return cu_; } - -handle & buffer::cu() -{ return cu_; } - } } diff --git a/lib/driver/context.cpp b/lib/driver/context.cpp index 6177749b5..56654c19d 100755 --- a/lib/driver/context.cpp +++ b/lib/driver/context.cpp @@ -35,9 +35,28 @@ namespace triton namespace driver { +/* ------------------------ */ +// BASE // +/* ------------------------ */ + +context::context(driver::device *dev, CUcontext cu, bool take_ownership): + polymorphic_resource(cu, take_ownership), + dev_(dev), cache_path_(get_cache_path()) { +} + +context::context(driver::device *dev, cl_context cl, bool take_ownership): + polymorphic_resource(cl, take_ownership), + dev_(dev), cache_path_(get_cache_path()){ + +} + +driver::device* context::device() const { + return dev_; +} + std::string context::get_cache_path(){ //user-specified cache path - std::string result = tools::getenv("ISAAC_CACHE_PATH"); + std::string result = tools::getenv("TRITON_CACHE_PATH"); if(!result.empty()){ if(tools::mkpath(result)==0) return result; @@ -46,7 +65,7 @@ std::string context::get_cache_path(){ result = tools::getenv("HOME"); if(!result.empty()) { - result = result + "/.isaac/cache/"; + result = result + "/.triton/cache/"; if(tools::mkpath(result)==0) return result; } @@ -54,7 +73,28 @@ std::string context::get_cache_path(){ return ""; } -CUdevice context::device(CUcontext context){ +std::string const & context::cache_path() const{ + return cache_path_; +} + + +/* ------------------------ */ +// CUDA // +/* ------------------------ */ + +// RAII context switcher +cu_context::context_switcher::context_switcher(const context &ctx): ctx_((const cu_context&)ctx) { + dispatch::cuCtxPushCurrent_v2(*ctx_.cu()); +} + +cu_context::context_switcher::~context_switcher() { + CUcontext tmp; + dispatch::cuCtxPopCurrent_v2(&tmp); + assert(tmp==(CUcontext)ctx_ && "Switching back to invalid context!"); +} + +// import CUdevice +CUdevice cu_context::get_device_of(CUcontext context){ dispatch::cuCtxPushCurrent_v2(context); CUdevice res; dispatch::cuCtxGetDevice(&res); @@ -62,35 +102,24 @@ CUdevice context::device(CUcontext context){ return res; } -context::context(CUcontext context, bool take_ownership): cu_(context, take_ownership), dvc_(device(context), false), cache_path_(get_cache_path()) -{ } +// wrapper for cuda context +cu_context::cu_context(CUcontext context, bool take_ownership): driver::context(new driver::cu_device(get_device_of(context), false), + context, take_ownership) { +} -context::context(driver::device const & device): dvc_(device), cache_path_(get_cache_path()) -{ - dispatch::cuCtxCreate(&*cu_, CU_CTX_SCHED_AUTO, (CUdevice)device); +cu_context::cu_context(driver::device* device): context(device, CUcontext(), true){ + dispatch::cuCtxCreate(&*cu_, CU_CTX_SCHED_AUTO, *((driver::cu_device*)dev_)->cu()); dispatch::cuCtxPopCurrent_v2(NULL); } -device const & context::device() const -{ return dvc_; } -std::string const & context::cache_path() const -{ return cache_path_; } +/* ------------------------ */ +// OpenCL // +/* ------------------------ */ -handle const & context::cu() const -{ return cu_; } - -/* Context Switcher 
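   (RAII) -- the cu_context::context_switcher introduced above pushes the
   CUDA context in its constructor and pops it in its destructor, so any
   scope touching driver state can be bracketed like this (a sketch; `ctx`
   and `dptr` stand in for whatever the caller holds):

       {
         cu_context::context_switcher guard(*ctx);  // cuCtxPushCurrent_v2
         dispatch::cuMemAlloc(&dptr, size);         // runs with ctx current
       }                                            // cuCtxPopCurrent_v2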
*/ -ContextSwitcher::ContextSwitcher(driver::context const & ctx): ctx_(ctx) -{ - dispatch::cuCtxPushCurrent_v2(ctx_); -} - -ContextSwitcher::~ContextSwitcher() -{ - CUcontext tmp; - dispatch::cuCtxPopCurrent_v2(&tmp); - assert(tmp==(CUcontext)ctx_ && "Switching back to invalid context!"); +ocl_context::ocl_context(driver::device* dev): context(dev, cl_context(), true) { + cl_int err; + *cl_ = dispatch::clCreateContext(nullptr, 1, &*dev->cl(), nullptr, nullptr, &err); } diff --git a/lib/driver/device.cpp b/lib/driver/device.cpp index 3f7783fbc..0b9852e7b 100755 --- a/lib/driver/device.cpp +++ b/lib/driver/device.cpp @@ -34,34 +34,34 @@ namespace triton namespace driver { -/* Architecture [NVidia] */ -device::Architecture device::nv_arch(std::pair sm) const{ - switch(sm.first) - { + +/* ------------------------ */ +// CUDA // +/* ------------------------ */ + +// Architecture +cu_device::Architecture cu_device::nv_arch(std::pair sm) const { + switch(sm.first) { case 7: - switch(sm.second) - { + switch(sm.second){ case 0: return Architecture::SM_7_0; } case 6: - switch(sm.second) - { + switch(sm.second){ case 0: return Architecture::SM_6_0; case 1: return Architecture::SM_6_1; } case 5: - switch(sm.second) - { + switch(sm.second){ case 0: return Architecture::SM_5_0; case 2: return Architecture::SM_5_2; default: return Architecture::UNKNOWN; } case 3: - switch(sm.second) - { + switch(sm.second){ case 0: return Architecture::SM_3_0; case 5: return Architecture::SM_3_5; case 7: return Architecture::SM_3_7; @@ -69,8 +69,7 @@ device::Architecture device::nv_arch(std::pair sm) c } case 2: - switch(sm.second) - { + switch(sm.second){ case 0: return Architecture::SM_2_0; case 1: return Architecture::SM_2_1; default: return Architecture::UNKNOWN; @@ -80,14 +79,16 @@ device::Architecture device::nv_arch(std::pair sm) c } } +// information query template -int device::cuGetInfo() const{ +int cu_device::cuGetInfo() const{ int res; dispatch::cuDeviceGetAttribute(&res, attr, *cu_); return res; } -nvmlDevice_t device::nvml_device() const{ +// convert to nvml +nvmlDevice_t cu_device::nvml_device() const{ std::map map; std::string key = pci_bus_id(); if(map.find(key)==map.end()){ @@ -98,34 +99,37 @@ nvmlDevice_t device::nvml_device() const{ return map.at(key); } -/* Architecture */ -device::Architecture device::architecture() const -{ return nv_arch(compute_capability()); } +// architecture +cu_device::Architecture cu_device::architecture() const{ + return nv_arch(compute_capability()); +} -/* Attributes */ -size_t device::address_bits() const -{ return sizeof(size_t)*8; } +// number of address bits +size_t cu_device::address_bits() const{ + return sizeof(size_t)*8; +} -driver::platform device::platform() const -{ return platform(); } - -std::string device::name() const{ +// name +std::string cu_device::name() const { char tmp[128]; dispatch::cuDeviceGetName(tmp, 128, *cu_); return std::string(tmp); } -std::string device::pci_bus_id() const{ +// PCI bus ID +std::string cu_device::pci_bus_id() const{ char tmp[128]; dispatch::cuDeviceGetPCIBusId(tmp, 128, *cu_); return std::string(tmp); } -void device::interpret_as(std::pair cc){ +// force the device to be interpreted as a particular cc +void cu_device::interpret_as(std::pair cc){ interpreted_as_ = std::make_shared>(cc); } -std::pair device::compute_capability() const{ +// compute capability +std::pair cu_device::compute_capability() const { if(interpreted_as_) return *interpreted_as_; size_t _major = cuGetInfo(); @@ -133,17 +137,24 @@ std::pair 
device::compute_capability() const{ return std::make_pair(_major, _minor); } -size_t device::max_threads_per_block() const -{ return cuGetInfo(); } +// maximum number of threads per block +size_t cu_device::max_threads_per_block() const { + return cuGetInfo(); +} -size_t device::max_shared_memory() const -{ return cuGetInfo(); } +// maximum amount of shared memory per block +size_t cu_device::max_shared_memory() const { + return cuGetInfo(); +} -size_t device::warp_size() const -{ return cuGetInfo(); } +// warp size +size_t cu_device::warp_size() const { + return cuGetInfo(); +} -std::vector device::max_block_dim() const{ +// maximum block dimensions +std::vector cu_device::max_block_dim() const { std::vector result(3); result[0] = cuGetInfo(); result[1] = cuGetInfo(); @@ -151,36 +162,39 @@ std::vector device::max_block_dim() const{ return result; } -size_t device::current_sm_clock() const{ +// current SM clock +size_t cu_device::current_sm_clock() const{ unsigned int result; dispatch::nvmlDeviceGetClockInfo(nvml_device(), NVML_CLOCK_SM, &result); return result; } -size_t device::max_sm_clock() const{ +// max SM clock +size_t cu_device::max_sm_clock() const{ unsigned int result; dispatch::nvmlDeviceGetMaxClockInfo(nvml_device(), NVML_CLOCK_SM, &result); return result; } - -size_t device::current_mem_clock() const{ +// current memory clock +size_t cu_device::current_mem_clock() const{ unsigned int result; dispatch::nvmlDeviceGetClockInfo(nvml_device(), NVML_CLOCK_MEM, &result); return result; } -size_t device::max_mem_clock() const{ +// max memory clock +size_t cu_device::max_mem_clock() const{ unsigned int result; dispatch::nvmlDeviceGetMaxClockInfo(nvml_device(), NVML_CLOCK_MEM, &result); return result; } -/* Infos */ -std::string device::infos() const{ +// print infos +std::string cu_device::infos() const{ std::ostringstream oss; std::vector max_wi_sizes = max_block_dim(); - oss << "Platform: " << platform().name() << std::endl; + oss << "Platform: CUDA" << std::endl; oss << "Name: " << name() << std::endl; oss << "Maximum total work-group size: " << max_threads_per_block() << std::endl; oss << "Maximum individual work-group sizes: " << max_wi_sizes[0] << ", " << max_wi_sizes[1] << ", " << max_wi_sizes[2] << std::endl; @@ -188,9 +202,6 @@ std::string device::infos() const{ return oss.str(); } -handle const & device::cu() const -{ return cu_; } - } } diff --git a/lib/driver/dispatch.cpp b/lib/driver/dispatch.cpp index 25e4638f8..b2e556d8e 100755 --- a/lib/driver/dispatch.cpp +++ b/lib/driver/dispatch.cpp @@ -72,6 +72,17 @@ namespace driver #define DEFINE19(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m, t14 n, t15 o, t16 p, t17 q, t18 r, t19 s)\ {return f_impl(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s); } +//Specialized helpers for OpenCL +#define OCL_DEFINE1(ret, fname, t1) DEFINE1(clinit, opencl_, ret, fname, t1) +#define OCL_DEFINE2(ret, fname, t1, t2) DEFINE2(clinit, opencl_, ret, fname, t1, t2) +#define OCL_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(clinit, opencl_, ret, fname, t1, t2, t3) +#define OCL_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(clinit, opencl_, ret, fname, t1, t2, t3, t4) +#define OCL_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(clinit, opencl_, ret, fname, t1, t2, t3, t4, t5) +#define OCL_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(clinit, opencl_, 
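/* Each CUDA_DEFINEn / NVML_DEFINEn / OCL_DEFINEn above stamps out a
   trampoline of the same shape as the DEFINE19 expansion shown earlier:
   the first call dlsym-resolves the symbol into the cached fname##_ slot,
   and subsequent calls go straight through the stored pointer. For
   instance, OCL_DEFINE1(cl_int, clFinish, cl_command_queue) yields, roughly:

     cl_int dispatch::clFinish(cl_command_queue a)
     { return f_impl(opencl_, clFinish, clFinish_, "clFinish", a); }
*/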
ret, fname, t1, t2, t3, t4, t5, t6) +#define OCL_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(clinit, opencl_, ret, fname, t1, t2, t3, t4, t5, t6, t7) +#define OCL_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(clinit, opencl_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) +#define OCL_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(clinit, opencl_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) + //Specialized helpers for CUDA #define CUDA_DEFINE1(ret, fname, t1) DEFINE1(cuinit, cuda_, ret, fname, t1) #define CUDA_DEFINE2(ret, fname, t1, t2) DEFINE2(cuinit, cuda_, ret, fname, t1, t2) @@ -104,15 +115,24 @@ namespace driver #define CUDNN_DEFINE13(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) DEFINE13(cudnninit, cudnn_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) +bool dispatch::clinit() +{ + if(opencl_==nullptr) + opencl_ = dlopen("libOpenCL.so", RTLD_LAZY); + return opencl_ != nullptr; +} + bool dispatch::cuinit(){ if(cuda_==nullptr) cuda_ = dlopen("libcuda.so", RTLD_LAZY); + if(cuda_ == nullptr) + return false; CUresult (*fptr)(unsigned int); - cuInit_ = dlsym(cuda_, "cuInit"); - *reinterpret_cast(&fptr) = cuInit_; - CUresult res = (*fptr)(0); - check(res); - return cuda_ != nullptr; + cuInit_ = dlsym(cuda_, "cuInit"); + *reinterpret_cast(&fptr) = cuInit_; + CUresult res = (*fptr)(0); + check(res); + return true; } bool dispatch::nvmlinit(){ @@ -180,17 +200,17 @@ NVML_DEFINE2(nvmlReturn_t, nvmlDeviceGetHandleByPciBusId_v2, const char *, nvmlD NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*) NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*) -cublasHandle_t dispatch::cublasHandle(driver::context const & ctx){ - static std::map handles; - auto pr = handles.insert({ctx, cublasHandle_t()}); +cublasHandle_t dispatch::cublasHandle(const cu_context &ctx){ + static std::map handles; + auto pr = handles.insert({*ctx.cu(), cublasHandle_t()}); if(pr.second) cublasCreate_v2(&pr.first->second); return pr.first->second; } -cudnnHandle_t dispatch::cudnnHandle(driver::context const & ctx){ - static std::map handles; - auto pr = handles.insert({ctx, cudnnHandle_t()}); +cudnnHandle_t dispatch::cudnnHandle(driver::cu_context const & ctx){ + static std::map handles; + auto pr = handles.insert({*ctx.cu(), cudnnHandle_t()}); if(pr.second) cudnnCreate(&pr.first->second); return pr.first->second; @@ -231,16 +251,51 @@ CUDNN_DEFINE13(cudnnStatus_t, cudnnConvolutionForward, cudnnHandle_t, const void CUDNN_DEFINE2(cudnnStatus_t, cudnnSetStream, cudnnHandle_t, cudaStream_t) CUDNN_DEFINE7(cudnnStatus_t, cudnnTransformTensor, cudnnHandle_t, const void*, const cudnnTensorDescriptor_t, const void*, const void*, const cudnnTensorDescriptor_t, void*) +// OpenCL +cl_int dispatch::clBuildProgram(cl_program a, cl_uint b, const cl_device_id * c, const char * d, void (*e)(cl_program, void *), void * f) +{ return f_impl(opencl_, clBuildProgram, clBuildProgram_, "clBuildProgram", a, b, c, d, e, f); } +cl_context dispatch::clCreateContext(const cl_context_properties * a, cl_uint b, const cl_device_id * c, void (*d)(const char *, const void *, size_t, void *), void * e, cl_int * f) +{ return f_impl(opencl_, dispatch::clCreateContext, dispatch::clCreateContext_, "clCreateContext", a, b, c, d, e, f); } + +OCL_DEFINE9(cl_int, clEnqueueNDRangeKernel, cl_command_queue, cl_kernel, cl_uint, const size_t*, const size_t*, const size_t*, cl_uint, const 
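/* Loading contract (sketch): clinit()/cuinit() dlopen the vendor runtime
   once and keep the handle in opencl_/cuda_, and every trampoline then
   resolves lazily against that handle. Callers probe availability first,
   as backend::platforms::init() does:

     if(dispatch::cuinit())
       cache_.push_back(new cu_platform());
     if(dispatch::clinit()) {
       cl_uint n;
       dispatch::clGetPlatformIDs(0, nullptr, &n);
     }

   This assumes libcuda.so / libOpenCL.so are on the dynamic loader path.
*/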
cl_event*, cl_event*) +OCL_DEFINE4(cl_int, clSetKernelArg, cl_kernel, cl_uint, size_t, const void *) +OCL_DEFINE1(cl_int, clReleaseMemObject, cl_mem) +OCL_DEFINE1(cl_int, clFinish, cl_command_queue) +OCL_DEFINE5(cl_int, clGetMemObjectInfo, cl_mem, cl_mem_info, size_t, void *, size_t *) +OCL_DEFINE5(cl_int, clGetCommandQueueInfo, cl_command_queue, cl_command_queue_info, size_t, void *, size_t *) +OCL_DEFINE1(cl_int, clReleaseContext, cl_context) +OCL_DEFINE1(cl_int, clReleaseEvent, cl_event) +OCL_DEFINE9(cl_int, clEnqueueWriteBuffer, cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *) +OCL_DEFINE9(cl_int, clEnqueueReadBuffer, cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *) +OCL_DEFINE6(cl_int, clGetProgramBuildInfo, cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *) +OCL_DEFINE1(cl_int, clReleaseDevice, cl_device_id) +OCL_DEFINE5(cl_int, clGetDeviceIDs, cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *) +OCL_DEFINE5(cl_int, clGetContextInfo, cl_context, cl_context_info, size_t, void *, size_t *) +OCL_DEFINE5(cl_int, clGetDeviceInfo, cl_device_id, cl_device_info, size_t, void *, size_t *) +OCL_DEFINE1(cl_int, clReleaseCommandQueue, cl_command_queue) +OCL_DEFINE3(cl_int, clGetPlatformIDs, cl_uint, cl_platform_id *, cl_uint *) +OCL_DEFINE5(cl_int, clGetPlatformInfo, cl_platform_id, cl_platform_info, size_t, void *, size_t *) +OCL_DEFINE5(cl_int, clGetEventProfilingInfo, cl_event, cl_profiling_info, size_t, void *, size_t *) +OCL_DEFINE7(cl_program, clCreateProgramWithBinary, cl_context, cl_uint, const cl_device_id *, const size_t *, const unsigned char **, cl_int *, cl_int *) +OCL_DEFINE4(cl_command_queue, clCreateCommandQueue, cl_context, cl_device_id, cl_command_queue_properties, cl_int *) +OCL_DEFINE1(cl_int, clRetainEvent, cl_event) +OCL_DEFINE1(cl_int, clReleaseProgram, cl_program) +OCL_DEFINE1(cl_int, clFlush, cl_command_queue) +OCL_DEFINE5(cl_int, clGetProgramInfo, cl_program, cl_program_info, size_t, void *, size_t *) +OCL_DEFINE5(cl_int, clGetKernelInfo, cl_kernel, cl_kernel_info, size_t, void *, size_t *) +OCL_DEFINE6(cl_int, clGetKernelWorkGroupInfo, cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *) +OCL_DEFINE3(cl_kernel, clCreateKernel, cl_program, const char *, cl_int *) +OCL_DEFINE5(cl_mem, clCreateBuffer, cl_context, cl_mem_flags, size_t, void *, cl_int *) +OCL_DEFINE5(cl_program, clCreateProgramWithSource, cl_context, cl_uint, const char **, const size_t *, cl_int *) +OCL_DEFINE1(cl_int, clReleaseKernel, cl_kernel) + +// Release void dispatch::release(){ if(cuda_){ dlclose(cuda_); cuda_ = nullptr; } - if(nvrtc_){ - dlclose(nvrtc_); - nvrtc_ = nullptr; - } if(cublas_){ dlclose(cublas_); cublas_ = nullptr; @@ -251,12 +306,47 @@ void dispatch::release(){ } } +void * dispatch::opencl_; void* dispatch::cuda_; -void* dispatch::nvrtc_; void* dispatch::nvml_; void* dispatch::cublas_; void* dispatch::cudnn_; +//OpenCL +void* dispatch::clBuildProgram_; +void* dispatch::clEnqueueNDRangeKernel_; +void* dispatch::clSetKernelArg_; +void* dispatch::clReleaseMemObject_; +void* dispatch::clFinish_; +void* dispatch::clGetMemObjectInfo_; +void* dispatch::clGetCommandQueueInfo_; +void* dispatch::clReleaseContext_; +void* dispatch::clReleaseEvent_; +void* dispatch::clEnqueueWriteBuffer_; +void* dispatch::clEnqueueReadBuffer_; +void* dispatch::clGetProgramBuildInfo_; +void* dispatch::clReleaseDevice_; +void* 
dispatch::clCreateContext_; +void* dispatch::clGetDeviceIDs_; +void* dispatch::clGetContextInfo_; +void* dispatch::clGetDeviceInfo_; +void* dispatch::clReleaseCommandQueue_; +void* dispatch::clGetPlatformIDs_; +void* dispatch::clGetPlatformInfo_; +void* dispatch::clGetEventProfilingInfo_; +void* dispatch::clCreateProgramWithBinary_; +void* dispatch::clCreateCommandQueue_; +void* dispatch::clRetainEvent_; +void* dispatch::clReleaseProgram_; +void* dispatch::clFlush_; +void* dispatch::clGetProgramInfo_; +void* dispatch::clGetKernelInfo_; +void* dispatch::clGetKernelWorkGroupInfo_; +void* dispatch::clCreateKernel_; +void* dispatch::clCreateBuffer_; +void* dispatch::clCreateProgramWithSource_; +void* dispatch::clReleaseKernel_; + //CUDA void* dispatch::cuCtxGetCurrent_; void* dispatch::cuCtxSetCurrent_; @@ -295,13 +385,6 @@ void* dispatch::cuMemsetD8Async_; void* dispatch::cuCtxPushCurrent_v2_; void* dispatch::cuCtxPopCurrent_v2_; -void* dispatch::nvrtcCompileProgram_; -void* dispatch::nvrtcGetProgramLogSize_; -void* dispatch::nvrtcGetPTX_; -void* dispatch::nvrtcGetPTXSize_; -void* dispatch::nvrtcCreateProgram_; -void* dispatch::nvrtcGetProgramLog_; - void* dispatch::nvmlInit_v2_; void* dispatch::nvmlDeviceGetHandleByPciBusId_v2_; void* dispatch::nvmlDeviceGetClockInfo_; diff --git a/lib/driver/handle.cpp b/lib/driver/handle.cpp index c0144fcfe..3396b1b2b 100755 --- a/lib/driver/handle.cpp +++ b/lib/driver/handle.cpp @@ -30,6 +30,9 @@ namespace triton namespace driver { +//OpenCL +inline void _delete(cl_platform_id) { } +inline void _delete(cl_device_id x) { dispatch::clReleaseDevice(x); } //CUDA inline void _delete(CUcontext x) { dispatch::cuCtxDestroy(x); } inline void _delete(CUdeviceptr x) { dispatch::cuMemFree(x); } @@ -39,7 +42,7 @@ inline void _delete(CUevent x) { dispatch::cuEventDestroy(x); } inline void _delete(CUfunction) { } inline void _delete(CUmodule x) { dispatch::cuModuleUnload(x); } inline void _delete(cu_event_t x) { _delete(x.first); _delete(x.second); } -inline void _delete(cu_platform){} +inline void _delete(CUPlatform){} //Constructor template @@ -60,7 +63,10 @@ template class handle; template class handle; template class handle; template class handle; -template class handle; +template class handle; + +template class handle; +template class handle; } } diff --git a/lib/driver/kernel.cpp b/lib/driver/kernel.cpp index 994bf3cfa..d0656a230 100755 --- a/lib/driver/kernel.cpp +++ b/lib/driver/kernel.cpp @@ -32,13 +32,39 @@ namespace triton namespace driver { -kernel::kernel(driver::module const & program, const char * name) : program_(program), address_bits_(program.context().device().address_bits()){ - cu_params_store_.reserve(64); - cu_params_.reserve(64); - dispatch::cuModuleGetFunction(&*cu_, program, name); + +/* ------------------------ */ +// Base // +/* ------------------------ */ + +kernel::kernel(driver::module *program, CUfunction fn, bool has_ownership): + polymorphic_resource(fn, has_ownership), program_(program){ } -void kernel::setArg(unsigned int index, std::size_t size, void* ptr){ +kernel::kernel(driver::module *program, cl_kernel fn, bool has_ownership): + polymorphic_resource(fn, has_ownership), program_(program){ +} + +driver::module* kernel::module() { + return program_; +} + +/* ------------------------ */ +// OpenCL // +/* ------------------------ */ + + +/* ------------------------ */ +// CUDA // +/* ------------------------ */ + +cu_kernel::cu_kernel(driver::module *program, const char * name) : kernel(program, CUfunction(), true) { + 
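/* Argument marshalling sketch: cu_params_store_ owns each argument's bytes
   (smart-pointer managed) and cu_params_ keeps the raw pointers in index
   order -- exactly the void** array cuLaunchKernel expects. A launch then
   reduces to (hypothetical values):

     kernel.setArg(0, da);            // buffer argument
     kernel.setArg(1, (int32_t)N);    // scalar, copied into slot 1
     dispatch::cuLaunchKernel(*kernel.cu(), gx, gy, gz, bx, by, bz,
                              0, stream, (void**)kernel.cu_params(), NULL);
*/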
cu_params_store_.reserve(64); + cu_params_.reserve(64); + dispatch::cuModuleGetFunction(&*cu_, *program->cu(), name); +} + +void cu_kernel::setArg(unsigned int index, std::size_t size, void* ptr){ if(index + 1> cu_params_store_.size()){ cu_params_store_.resize(index+1); cu_params_.resize(index+1); @@ -48,18 +74,12 @@ void kernel::setArg(unsigned int index, std::size_t size, void* ptr){ cu_params_[index] = cu_params_store_[index].get(); } -void kernel::setArg(unsigned int index, buffer const & data) -{ return setArg(index, (CUdeviceptr)data);} +void cu_kernel::setArg(unsigned int index, cu_buffer const & data) +{ return setArg(index, data.cu());} -void* const* kernel::cu_params() const +void* const* cu_kernel::cu_params() const { return cu_params_.data(); } -handle const & kernel::cu() const -{ return cu_; } - -driver::module const & kernel::module() const -{ return program_; } - } diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index c482acf08..98779e918 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -46,9 +46,34 @@ namespace triton namespace driver { -std::string module::compile_llvm_module(llvm::Module* module) { - init_llvm(); +/* ------------------------ */ +// Base // +/* ------------------------ */ +module::module(driver::context* ctx, CUmodule mod, bool has_ownership) + : polymorphic_resource(mod, has_ownership), ctx_(ctx) { +} + +module::module(driver::context* ctx, cl_program mod, bool has_ownership) + : polymorphic_resource(mod, has_ownership), ctx_(ctx) { +} + +driver::context* module::context() const { + return ctx_; +} + + +/* ------------------------ */ +// OpenCL // +/* ------------------------ */ + + +/* ------------------------ */ +// CUDA // +/* ------------------------ */ + +std::string cu_module::compile_llvm_module(llvm::Module* module) { + init_llvm(); // create machine module->setTargetTriple("nvptx64-nvidia-cuda"); std::string error; @@ -67,18 +92,17 @@ std::string module::compile_llvm_module(llvm::Module* module) { layout += "-p3:32:32-p4:32:32-p5:32:32"; layout += "-i64:64-i128:128-v16:16-v32:32-n16:32:64"; module->setDataLayout(layout); - // emit machine code llvm::legacy::PassManager pass; llvm::SmallVector buffer; llvm::raw_svector_ostream stream(buffer); machine->addPassesToEmitFile(pass, stream, nullptr, llvm::TargetMachine::CGFT_AssemblyFile); pass.run(*module); - + // done return std::string(buffer.begin(), buffer.end()); } -void module::init_llvm() { +void cu_module::init_llvm() { static bool init = false; if(!init){ llvm::InitializeAllTargetInfos(); @@ -90,10 +114,10 @@ void module::init_llvm() { } } -module::module(driver::context const & context, llvm::Module* ll_module): module(context, compile_llvm_module(ll_module)){ } +cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } -module::module(driver::context const & context, std::string const & source) : context_(context), source_(source){ - ContextSwitcher ctx_switch(context_); +cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ + cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; unsigned int errbufsize = 8096; @@ -108,17 +132,11 @@ module::module(driver::context const & context, std::string const & source) : co } } -driver::context const & module::context() const -{ return context_; } - -handle const & 
module::cu() const -{ return cu_; } - -buffer module::symbol(const char *name) const{ +cu_buffer cu_module::symbol(const char *name) const{ CUdeviceptr handle; size_t size; dispatch::cuModuleGetGlobal_v2(&handle, &size, *cu_, name); - return buffer(context_, handle, false); + return cu_buffer(ctx_, handle, false); } diff --git a/lib/driver/platform.cpp b/lib/driver/platform.cpp index b6ff27112..60d2bd128 100755 --- a/lib/driver/platform.cpp +++ b/lib/driver/platform.cpp @@ -31,22 +31,46 @@ namespace triton namespace driver { -std::string platform::version() const{ + +/* ------------------------ */ +// CUDA // +/* ------------------------ */ + +std::string cu_platform::version() const{ int version; dispatch::cuDriverGetVersion(&version); return std::to_string(version); } -std::vector platform::devices() const{ - std::vector devices; +void cu_platform::devices(std::vector &devices) const{ int N; dispatch::cuDeviceGetCount(&N); for(int i = 0 ; i < N ; ++i){ CUdevice dvc; dispatch::cuDeviceGet(&dvc, i); - devices.push_back(driver::device(dvc)); + devices.push_back(new driver::cu_device(dvc)); } - return devices; +} + +/* ------------------------ */ +// OpenCL // +/* ------------------------ */ + +std::string cl_platform::version() const { + size_t size; + dispatch::clGetPlatformInfo(*cl_, CL_PLATFORM_VERSION, 0, nullptr, &size); + std::string result(size, 0); + dispatch::clGetPlatformInfo(*cl_, CL_PLATFORM_VERSION, size, (void*)&*result.begin(), nullptr); + return result; +} + +void cl_platform::devices(std::vector &devices) const{ + cl_uint num_devices; + dispatch::clGetDeviceIDs(*cl_, CL_DEVICE_TYPE_GPU, 0, nullptr, &num_devices); + std::vector ids(num_devices); + dispatch::clGetDeviceIDs(*cl_, CL_DEVICE_TYPE_GPU, num_devices, ids.data(), nullptr); + for(cl_device_id id: ids) + devices.push_back(new driver::ocl_device(id)); } } diff --git a/lib/driver/stream.cpp b/lib/driver/stream.cpp index 0b318811a..a8d8f5c43 100755 --- a/lib/driver/stream.cpp +++ b/lib/driver/stream.cpp @@ -38,57 +38,84 @@ namespace triton namespace driver { -inline CUcontext cucontext(){ +/* ------------------------ */ +// Base // +/* ------------------------ */ + +stream::stream(driver::context *ctx, CUstream cu, bool has_ownership) + : polymorphic_resource(cu, has_ownership), ctx_(ctx) { + +} + +stream::stream(driver::context *ctx, cl_command_queue cl, bool has_ownership) + : polymorphic_resource(cl, has_ownership), ctx_(ctx) { + +} + +driver::context* stream::context() const { + return ctx_; +} + + +/* ------------------------ */ +// OpenCL // +/* ------------------------ */ + + +void cl_stream::synchronize() { + dispatch::clFinish(*cl_); +} + + +/* ------------------------ */ +// CUDA // +/* ------------------------ */ + +inline CUcontext get_context() { CUcontext result; dispatch::cuCtxGetCurrent(&result); return result; } -stream::stream(CUstream stream, bool take_ownership): context_(cucontext(), take_ownership), cu_(stream, take_ownership) -{} +cu_stream::cu_stream(CUstream str, bool take_ownership): + stream(backend::contexts::import(get_context()), str, take_ownership) { +} -stream::stream(driver::context const & context): context_(context), cu_(CUstream(), true) -{ - ContextSwitcher ctx_switch(context_); +cu_stream::cu_stream(driver::context *context): stream((driver::cu_context*)context, CUstream(), true) { + cu_context::context_switcher ctx_switch(*ctx_); dispatch::cuStreamCreate(&*cu_, 0); } -void stream::synchronize() -{ - ContextSwitcher ctx_switch(context_); +void cu_stream::synchronize() { + 
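/* Synchronization contract (sketch): an asynchronous enqueue must be paired
   with a synchronize on the same stream before results are read back, as in
   the matrix example from this series:

     stream->enqueue(kernel, grid, {nthreads, 1, 1});
     stream->synchronize();
     stream->read(dc, true, 0, hc);   // host buffer now valid
*/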
cu_context::context_switcher ctx_switch(*ctx_); dispatch::cuStreamSynchronize(*cu_); } -driver::context const & stream::context() const -{ return context_; } - -void stream::enqueue(kernel const & kernel, std::array grid, std::array block, std::vector const *, Event* event){ - ContextSwitcher ctx_switch(context_); +void cu_stream::enqueue(driver::cu_kernel const & kernel, std::array grid, std::array block, std::vector const *, Event* event) { + cu_context::context_switcher ctx_switch(*ctx_); if(event) - dispatch::cuEventRecord(((cu_event_t)*event).first, *cu_); - dispatch::cuLaunchKernel(kernel, grid[0], grid[1], grid[2], block[0], block[1], block[2], 0, *cu_,(void**)kernel.cu_params(), NULL); + dispatch::cuEventRecord(event->cu()->first, *cu_); + dispatch::cuLaunchKernel(*kernel.cu(), grid[0], grid[1], grid[2], block[0], block[1], block[2], 0, *cu_,(void**)kernel.cu_params(), NULL); if(event) - dispatch::cuEventRecord(((cu_event_t)*event).second, *cu_); + dispatch::cuEventRecord(event->cu()->second, *cu_); } -void stream::write(buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr){ - ContextSwitcher ctx_switch(context_); +void cu_stream::write(driver::cu_buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr) { + cu_context::context_switcher ctx_switch(*ctx_); if(blocking) - dispatch::cuMemcpyHtoD(buffer + offset, ptr, size); + dispatch::cuMemcpyHtoD(*buffer.cu() + offset, ptr, size); else - dispatch::cuMemcpyHtoDAsync(buffer + offset, ptr, size, *cu_); + dispatch::cuMemcpyHtoDAsync(*buffer.cu() + offset, ptr, size, *cu_); } -void stream::read(buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr){ - ContextSwitcher ctx_switch(context_); +void cu_stream::read(driver::cu_buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr) { + cu_context::context_switcher ctx_switch(*ctx_); if(blocking) - dispatch::cuMemcpyDtoH(ptr, buffer + offset, size); + dispatch::cuMemcpyDtoH(ptr, *buffer.cu() + offset, size); else - dispatch::cuMemcpyDtoHAsync(ptr, buffer + offset, size, *cu_); + dispatch::cuMemcpyDtoHAsync(ptr, *buffer.cu() + offset, size, *cu_); } -handle const & stream::cu() const -{ return cu_; } } diff --git a/lib/jit.cpp b/lib/jit.cpp index 64e0865fa..b4f93049a 100644 --- a/lib/jit.cpp +++ b/lib/jit.cpp @@ -89,7 +89,7 @@ std::unique_ptr jit::make_triton_module(const std::string &src) { } -jit::jit(driver::context context): driver_context_(context) { +jit::jit(driver::context *context): driver_context_(context) { } @@ -131,15 +131,15 @@ void jit::autotune(const std::string &src, benchmark_t benchmark) { } passes.tune.init(tt_module); passes.init(tt_module); - const driver::device &device = driver_context_.device(); - if(passes.allocation.get_allocated_size() > device.max_shared_memory()) + driver::cu_device* device = (driver::cu_device*)driver_context_->device(); + if(passes.allocation.get_allocated_size() > device->max_shared_memory()) return; - if(passes.tune.get_num_threads() > device.max_threads_per_block()) + if(passes.tune.get_num_threads() > device->max_threads_per_block()) return; // Compile auto ll_module = make_llvm_module(tt_module, passes); - driver::module module(driver_context_, &*ll_module); - driver::kernel kernel(module, "matmul"); + driver::cu_module module(driver_context_, &*ll_module); + driver::cu_kernel kernel(&module, "matmul"); launch_information info = launch_info_map_.at("matmul"); for(unsigned p: params) std::cout << p << " " << 
std::flush; @@ -166,12 +166,13 @@ void jit::add_module(ir::module &tt_module, const std::vector ¶ms) passes.tune.check_constraints(errors); if(errors.size()) throw std::runtime_error("invalid parameters"); - if(passes.allocation.get_allocated_size() > driver_context_.device().max_shared_memory()) + driver::cu_device* device = (driver::cu_device*)driver_context_->device(); + if(passes.allocation.get_allocated_size() > device->max_shared_memory()) throw std::runtime_error("invalid parameters"); // triton module -> llvm module auto ll_module = make_llvm_module(tt_module, passes); // llvm module -> machine code - modules_.push_back(driver::module(driver_context_, &*ll_module)); + modules_.push_back(driver::cu_module(driver_context_, &*ll_module)); // add globals for(auto x: tt_module.globals()) global_ints_[x.first] = ((ir::metaparameter*)x.second)->get_value(); @@ -182,8 +183,8 @@ void jit::add_module(const std::string &src, const std::vector ¶ms add_module(*ptt_module, params); } -driver::kernel jit::get_function(const std::string &name) { - return driver::kernel(modules_.front(), name.c_str()); +driver::cu_kernel jit::get_function(const std::string &name) { + return driver::cu_kernel(&modules_.front(), name.c_str()); } jit::launch_information jit::get_launch_info(const std::string &name) { From b6305f4388c3d341133aadc07bd9aa93dce71e81 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 19 Mar 2019 23:59:48 -0700 Subject: [PATCH 110/494] [driver] added more genericity for opencl support --- examples/matrix.cpp | 20 ++++---- include/triton/driver/backend.h | 6 ++- include/triton/driver/context.h | 2 + include/triton/driver/device.h | 2 + include/triton/driver/handle.h | 1 + include/triton/driver/kernel.h | 20 ++++++-- include/triton/driver/module.h | 20 +++++++- include/triton/driver/stream.h | 7 ++- lib/driver/backend.cpp | 16 ++++--- lib/driver/context.cpp | 9 ++++ lib/driver/device.cpp | 8 +++- lib/driver/handle.cpp | 11 +++++ lib/driver/kernel.cpp | 25 +++++++++- lib/driver/module.cpp | 81 ++++++++++++++++++++++----------- lib/driver/stream.cpp | 18 +++++++- 15 files changed, 188 insertions(+), 58 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 18032f247..60ba87318 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -135,7 +135,7 @@ int main() { // benchmark a given matrix multiplication kernel - auto benchmark = [&](triton::driver::cu_kernel kernel, + auto benchmark = [&](triton::driver::kernel* kernel, triton::jit::launch_information info) { // launch info unsigned TM = info.global_range_size[0]; @@ -153,20 +153,20 @@ int main() { unsigned last_safe_b = (BT==true)?(N*K - 1 - lastj)/N - lastk : N*K - 1 - lastj*K - lastk; int32_t bound = std::max(1, std::max(K - last_safe_a, K - last_safe_b)); // set argument - kernel.setArg(0, da); - kernel.setArg(1, db); - kernel.setArg(2, dc); - kernel.setArg(3, M); - kernel.setArg(4, N); - kernel.setArg(5, K); - kernel.setArg(6, bound); + kernel->setArg(0, da); + kernel->setArg(1, db); + kernel->setArg(2, dc); + kernel->setArg(3, M); + kernel->setArg(4, N); + kernel->setArg(5, K); + kernel->setArg(6, bound); // dry run stream.enqueue(kernel, grid, {nthreads, 1, 1}); stream.synchronize(); // benchmark double ts = bench([&](){stream.enqueue(kernel, grid, {nthreads, 1, 1});}, [&](){ stream.synchronize(); }, - context->device()); + (triton::driver::cu_device&)*context->device()); ts = ts * 1e-9; double tflops = 2*M*N*K / ts * 1e-12; return tflops; @@ -186,7 +186,7 @@ int main() { jit.add_module(src, params); 
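/* End-to-end flow at this point in the series (sketch): triton-ir source
   -> make_triton_module -> tuning/allocation passes -> make_llvm_module
   -> cu_module (PTX JIT via compile_llvm_module) -> kernel, e.g.:

     triton::jit jit(context);
     jit.add_module(src, params);          // compile with fixed parameters
     triton::driver::kernel* kernel = jit.get_function("matmul");
     triton::jit::launch_information info = jit.get_launch_info("matmul");

   autotune() walks the tuning space through the same pipeline and keeps
   the parameters with the best benchmark() score.
*/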
triton::driver::cu_kernel kernel = jit.get_function("matmul"); triton::jit::launch_information info = jit.get_launch_info("matmul"); - std::cout << benchmark(kernel, info) << std::endl; + std::cout << benchmark(&kernel, info) << std::endl; stream.read(dc, true, 0, hc); simple_gemm(rc, ha, hb, M, N, K); for(size_t i = 0; i < M*N; i++) diff --git a/include/triton/driver/backend.h b/include/triton/driver/backend.h index d830df391..a91fa7c7a 100755 --- a/include/triton/driver/backend.h +++ b/include/triton/driver/backend.h @@ -28,6 +28,10 @@ #include #include "triton/driver/context.h" +namespace llvm +{ +class Module; +} namespace triton { @@ -81,7 +85,7 @@ struct backend public: static void release(); - static driver::module* get(driver::stream* stream, std::string const & name, std::string const &src); + static driver::module* get(driver::stream* stream, std::string const & name, llvm::Module *src); private: static std::map, driver::module*> cache_; diff --git a/include/triton/driver/context.h b/include/triton/driver/context.h index 842d0a82c..379fe6962 100755 --- a/include/triton/driver/context.h +++ b/include/triton/driver/context.h @@ -40,6 +40,8 @@ public: context(driver::device *dev, cl_context cl, bool take_ownership); driver::device* device() const; std::string const & cache_path() const; + // factory methods + static context* create(driver::device *dev); protected: driver::device* dev_; diff --git a/include/triton/driver/device.h b/include/triton/driver/device.h index 2945ab766..97071ec27 100755 --- a/include/triton/driver/device.h +++ b/include/triton/driver/device.h @@ -32,6 +32,8 @@ namespace triton namespace driver { +class context; + // Base device class device: public polymorphic_resource{ public: diff --git a/include/triton/driver/handle.h b/include/triton/driver/handle.h index 3bffea395..6de493722 100755 --- a/include/triton/driver/handle.h +++ b/include/triton/driver/handle.h @@ -81,6 +81,7 @@ class polymorphic_resource { public: polymorphic_resource(CUType cu, bool take_ownership): cu_(cu, take_ownership){} polymorphic_resource(CLType cl, bool take_ownership): cl_(cl, take_ownership){} + virtual ~polymorphic_resource() { } handle cu() { return cu_; } handle cl() { return cl_; } diff --git a/include/triton/driver/kernel.h b/include/triton/driver/kernel.h index 6a8f114f4..0657e775f 100755 --- a/include/triton/driver/kernel.h +++ b/include/triton/driver/kernel.h @@ -41,14 +41,27 @@ class kernel: public polymorphic_resource { public: kernel(driver::module* program, CUfunction fn, bool has_ownership); kernel(driver::module* program, cl_kernel fn, bool has_ownership); + // Getters driver::module* module(); - + // Factory methods + static kernel* create(driver::module* program, const char* name); + // Arguments setters + virtual void setArg(unsigned int index, std::size_t size, void* ptr) = 0; + virtual void setArg(unsigned int index, buffer *) = 0; + template void setArg(unsigned int index, T value) { setArg(index, sizeof(T), (void*)&value); } private: driver::module* program_; }; // OpenCL class ocl_kernel: public kernel { +public: + //Constructors + ocl_kernel(driver::module* program, const char* name); + // Arguments setters + void setArg(unsigned int index, std::size_t size, void* ptr); + void setArg(unsigned int index, driver::buffer* buffer); + }; // CUDA @@ -56,10 +69,9 @@ class cu_kernel: public kernel { public: //Constructors cu_kernel(driver::module* program, const char * name); - //Arguments setters + // Arguments setters void setArg(unsigned int index, std::size_t 
size, void* ptr); - void setArg(unsigned int index, cu_buffer const &); - template void setArg(unsigned int index, T value) { setArg(index, sizeof(T), (void*)&value); } + void setArg(unsigned int index, driver::buffer* buffer); //Arguments getters void* const* cu_params() const; diff --git a/include/triton/driver/module.h b/include/triton/driver/module.h index ef45243fd..92a237d3d 100755 --- a/include/triton/driver/module.h +++ b/include/triton/driver/module.h @@ -31,6 +31,8 @@ namespace llvm { class Module; + template + class SmallVectorImpl; } namespace triton @@ -42,20 +44,34 @@ namespace driver class cu_context; class cu_device; +// Base class module: public polymorphic_resource { +protected: + void init_llvm(); + public: module(driver::context* ctx, CUmodule mod, bool has_ownership); module(driver::context* ctx, cl_program mod, bool has_ownership); + static module* create(driver::context* ctx, llvm::Module *src); driver::context* context() const; + void compile_llvm_module(llvm::Module* module, const std::string& triple, + const std::string &proc, std::string layout, + llvm::SmallVectorImpl &buffer); protected: driver::context* ctx_; }; +// OpenCL +class ocl_module: public module{ + +public: + ocl_module(driver::context* context, llvm::Module *module); +}; + +// CUDA class cu_module: public module { - static std::string header(driver::cu_device const & device); std::string compile_llvm_module(llvm::Module* module); - void init_llvm(); public: cu_module(driver::context* context, llvm::Module *module); diff --git a/include/triton/driver/stream.h b/include/triton/driver/stream.h index cb2ae7d4d..723edbc13 100755 --- a/include/triton/driver/stream.h +++ b/include/triton/driver/stream.h @@ -35,7 +35,7 @@ namespace triton namespace driver { -class cu_kernel; +class kernel; class Event; class Range; class cu_buffer; @@ -45,6 +45,9 @@ class stream: public polymorphic_resource { public: stream(driver::context *ctx, CUstream, bool has_ownership); stream(driver::context *ctx, cl_command_queue, bool has_ownership); + // factory + static driver::stream* create(driver::context* ctx); + // accessors driver::context* context() const; virtual void synchronize() = 0; @@ -73,7 +76,7 @@ public: void synchronize(); //Enqueue - void enqueue(cu_kernel const & cu_kernel, std::array grid, std::array block, std::vector const * = NULL, Event *event = NULL); + void enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const * = NULL, Event *event = NULL); // Write void write(driver::cu_buffer const & cu_buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr); diff --git a/lib/driver/backend.cpp b/lib/driver/backend.cpp index 726548c1c..628f0c225 100755 --- a/lib/driver/backend.cpp +++ b/lib/driver/backend.cpp @@ -99,10 +99,11 @@ void backend::modules::release(){ cache_.clear(); } -driver::module* backend::modules::get(driver::stream* stream, std::string const & name, std::string const & src){ +driver::module* backend::modules::get(driver::stream* stream, std::string const & name, llvm::Module* src){ std::tuple key(stream, name); - if(cache_.find(key)==cache_.end()) - return &*cache_.insert(std::make_pair(key, new driver::cu_module(((driver::cu_stream*)stream)->context(), src))).first->second; + if(cache_.find(key)==cache_.end()){ + return &*cache_.insert({key, driver::module::create(stream->context(), src)}).first->second; + } return &*cache_.at(key); } @@ -120,8 +121,9 @@ void backend::kernels::release(){ driver::kernel* backend::kernels::get(driver::module *mod, 
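/* Dispatch rule introduced by this patch (sketch): construction now goes
   through static create() factories that dynamic_cast the parent object to
   choose the backend, so call sites like the two cached getters here stay
   backend-neutral:

     driver::context* ctx = driver::context::create(dev);      // cu_ or ocl_
     driver::stream*  stm = driver::stream::create(ctx);
     driver::module*  mod = driver::module::create(ctx, llvm_module);
     driver::kernel*  krn = driver::kernel::create(mod, "matmul");
*/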
std::string const & name){ std::tuple key(mod, name); - if(cache_.find(key)==cache_.end()) - return &*cache_.insert(std::make_pair(key, new driver::cu_kernel((driver::cu_module*)mod, name.c_str()))).first->second; + if(cache_.find(key)==cache_.end()){ + return &*cache_.insert({key, driver::kernel::create(mod, name.c_str())}).first->second; + } return cache_.at(key); } @@ -134,7 +136,7 @@ std::map, driver::kernel*> backend::ker void backend::streams::init(std::list const & contexts){ for(driver::context* ctx : contexts) if(cache_.find(ctx)==cache_.end()) - cache_.insert(std::make_pair(ctx, std::vector{new driver::cu_stream(ctx)})); + cache_.insert(std::make_pair(ctx, std::vector{driver::stream::create(ctx)})); } void backend::streams::release(){ @@ -168,7 +170,7 @@ std::map> backend::streams::cache void backend::contexts::init(std::vector const & devices){ for(driver::device* dvc: devices) - cache_.push_back(new cu_context(dvc)); + cache_.push_back(driver::context::create(dvc)); } void backend::contexts::release(){ diff --git a/lib/driver/context.cpp b/lib/driver/context.cpp index 56654c19d..6e1618713 100755 --- a/lib/driver/context.cpp +++ b/lib/driver/context.cpp @@ -50,6 +50,15 @@ context::context(driver::device *dev, cl_context cl, bool take_ownership): } +context* context::create(driver::device *dev){ + if(dynamic_cast(dev)) + return new cu_context(dev); + if(dynamic_cast(dev)) + return new ocl_context(dev); + throw std::runtime_error("unknown context"); +} + + driver::device* context::device() const { return dev_; } diff --git a/lib/driver/device.cpp b/lib/driver/device.cpp index 0b9852e7b..0fe875075 100755 --- a/lib/driver/device.cpp +++ b/lib/driver/device.cpp @@ -27,6 +27,7 @@ #include #include "triton/driver/device.h" +#include "triton/driver/context.h" namespace triton { @@ -35,11 +36,16 @@ namespace driver { +/* ------------------------ */ +// OpenCL // +/* ------------------------ */ + + /* ------------------------ */ // CUDA // /* ------------------------ */ -// Architecture +// architecture cu_device::Architecture cu_device::nv_arch(std::pair sm) const { switch(sm.first) { case 7: diff --git a/lib/driver/handle.cpp b/lib/driver/handle.cpp index 3396b1b2b..c9534c4af 100755 --- a/lib/driver/handle.cpp +++ b/lib/driver/handle.cpp @@ -33,6 +33,12 @@ namespace driver //OpenCL inline void _delete(cl_platform_id) { } inline void _delete(cl_device_id x) { dispatch::clReleaseDevice(x); } +inline void _delete(cl_context x) { dispatch::clReleaseContext(x); } +inline void _delete(cl_program x) { dispatch::clReleaseProgram(x); } +inline void _delete(cl_kernel x) { dispatch::clReleaseKernel(x); } +inline void _delete(cl_command_queue x) { dispatch::clReleaseCommandQueue(x); } +inline void _delete(cl_mem x) { dispatch::clReleaseMemObject(x); } + //CUDA inline void _delete(CUcontext x) { dispatch::cuCtxDestroy(x); } inline void _delete(CUdeviceptr x) { dispatch::cuMemFree(x); } @@ -67,6 +73,11 @@ template class handle; template class handle; template class handle; +template class handle; +template class handle; +template class handle; +template class handle; +template class handle; } } diff --git a/lib/driver/kernel.cpp b/lib/driver/kernel.cpp index d0656a230..1490ad21d 100755 --- a/lib/driver/kernel.cpp +++ b/lib/driver/kernel.cpp @@ -45,6 +45,14 @@ kernel::kernel(driver::module *program, cl_kernel fn, bool has_ownership): polymorphic_resource(fn, has_ownership), program_(program){ } +kernel* kernel::create(driver::module* program, const char* name) { + if(dynamic_cast(program)) + return 
new cu_kernel(program, name); + if(dynamic_cast(program)) + return new ocl_kernel(program, name); + throw std::runtime_error("unknown program"); +} + driver::module* kernel::module() { return program_; } @@ -53,6 +61,19 @@ driver::module* kernel::module() { // OpenCL // /* ------------------------ */ +ocl_kernel::ocl_kernel(driver::module* program, const char* name): kernel(program, cl_kernel(), true) { + cl_int err; + *cl_ = dispatch::clCreateKernel(*program->cl(), name, &err); +} + +void ocl_kernel::setArg(unsigned int index, std::size_t size, void* ptr) { + dispatch::clSetKernelArg(*cl_, index, size, ptr); +} + +void ocl_kernel::setArg(unsigned int index, driver::buffer* buffer) { + dispatch::clSetKernelArg(*cl_, index, sizeof(cl_mem), (void*)&*buffer->cl()); +} + /* ------------------------ */ // CUDA // @@ -74,8 +95,8 @@ void cu_kernel::setArg(unsigned int index, std::size_t size, void* ptr){ cu_params_[index] = cu_params_store_[index].get(); } -void cu_kernel::setArg(unsigned int index, cu_buffer const & data) -{ return setArg(index, data.cu());} +void cu_kernel::setArg(unsigned int index, driver::buffer* data) +{ return kernel::setArg(index, *data->cu());} void* const* cu_kernel::cu_params() const { return cu_params_.data(); } diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 98779e918..03793945c 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -50,6 +50,18 @@ namespace driver // Base // /* ------------------------ */ +void module::init_llvm() { + static bool init = false; + if(!init){ + llvm::InitializeAllTargetInfos(); + llvm::InitializeAllTargets(); + llvm::InitializeAllTargetMCs(); + llvm::InitializeAllAsmParsers(); + llvm::InitializeAllAsmPrinters(); + init = true; + } +} + module::module(driver::context* ctx, CUmodule mod, bool has_ownership) : polymorphic_resource(mod, has_ownership), ctx_(ctx) { } @@ -62,26 +74,56 @@ driver::context* module::context() const { return ctx_; } +module* module::create(driver::context* ctx, llvm::Module *src) { + if(dynamic_cast(ctx)) + return new cu_module(ctx, src); + if(dynamic_cast(ctx)) + return new ocl_module(ctx, src); + throw std::runtime_error("unknown context"); +} + +void module::compile_llvm_module(llvm::Module* module, const std::string& triple, + const std::string &proc, std::string layout, + llvm::SmallVectorImpl &buffer) { + init_llvm(); + // create machine + module->setTargetTriple(triple); + std::string error; + auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error); + llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, "", + llvm::TargetOptions(), llvm::Reloc::Model(), + llvm::None, llvm::CodeGenOpt::Aggressive); + + + // set data layout + if(layout.empty()) + layout = module->getDataLayoutStr(); + module->setDataLayout(layout); + + // emit machine code + llvm::legacy::PassManager pass; + llvm::raw_svector_ostream stream(buffer); + machine->addPassesToEmitFile(pass, stream, nullptr, llvm::TargetMachine::CGFT_AssemblyFile); + pass.run(*module); +} /* ------------------------ */ // OpenCL // /* ------------------------ */ +ocl_module::ocl_module(driver::context * context, llvm::Module* src): module(context, cl_program(), true) { + init_llvm(); + llvm::SmallVector buffer; + module::compile_llvm_module(src, "amdgcn-amd-amdpal", "gfx902", "", buffer); + throw std::runtime_error("need to implement opencl module creation"); +} + /* ------------------------ */ // CUDA // /* ------------------------ */ std::string 
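/* The refactor in this diff hoists target selection into the shared
   module::compile_llvm_module (triple, processor, data layout, output
   buffer), so each backend only supplies its own parameters -- the two
   call sites are:

     // CUDA: nvptx64 triple, sm_52, explicit NVPTX data layout
     module::compile_llvm_module(m, "nvptx64-nvidia-cuda", "sm_52", layout, buf);
     // OpenCL: AMDGPU triple while the runtime path is brought up
     module::compile_llvm_module(m, "amdgcn-amd-amdpal", "gfx902", "", buf);
*/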
cu_module::compile_llvm_module(llvm::Module* module) { - init_llvm(); - // create machine - module->setTargetTriple("nvptx64-nvidia-cuda"); - std::string error; - auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error); - llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), "sm_52", "", - llvm::TargetOptions(), llvm::Reloc::Model(), - llvm::None, llvm::CodeGenOpt::Aggressive); - // set data layout std::string layout = "e"; bool is_64bit = true; @@ -91,28 +133,13 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { else if (use_short_pointers) layout += "-p3:32:32-p4:32:32-p5:32:32"; layout += "-i64:64-i128:128-v16:16-v32:32-n16:32:64"; - module->setDataLayout(layout); - // emit machine code - llvm::legacy::PassManager pass; + // create llvm::SmallVector buffer; - llvm::raw_svector_ostream stream(buffer); - machine->addPassesToEmitFile(pass, stream, nullptr, llvm::TargetMachine::CGFT_AssemblyFile); - pass.run(*module); - // done + module::compile_llvm_module(module, "nvptx64-nvidia-cuda", "sm_52", layout, buffer); return std::string(buffer.begin(), buffer.end()); } -void cu_module::init_llvm() { - static bool init = false; - if(!init){ - llvm::InitializeAllTargetInfos(); - llvm::InitializeAllTargets(); - llvm::InitializeAllTargetMCs(); - llvm::InitializeAllAsmParsers(); - llvm::InitializeAllAsmPrinters(); - init = true; - } -} + cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } diff --git a/lib/driver/stream.cpp b/lib/driver/stream.cpp index a8d8f5c43..fa7d25621 100755 --- a/lib/driver/stream.cpp +++ b/lib/driver/stream.cpp @@ -52,6 +52,15 @@ stream::stream(driver::context *ctx, cl_command_queue cl, bool has_ownership) } +driver::stream* stream::create(driver::context* ctx) { + if(dynamic_cast(ctx)) + return new cu_stream(ctx); + if(dynamic_cast(ctx)) + return new cl_stream(ctx); + throw std::runtime_error("unknown context"); +} + + driver::context* stream::context() const { return ctx_; } @@ -61,6 +70,10 @@ driver::context* stream::context() const { // OpenCL // /* ------------------------ */ +cl_stream::cl_stream(driver::context *ctx): stream(ctx, cl_command_queue(), true) { + cl_int err; + *cl_ = dispatch::clCreateCommandQueue(*ctx->cl(), *ctx->device()->cl(), 0, &err); +} void cl_stream::synchronize() { dispatch::clFinish(*cl_); @@ -91,11 +104,12 @@ void cu_stream::synchronize() { dispatch::cuStreamSynchronize(*cu_); } -void cu_stream::enqueue(driver::cu_kernel const & kernel, std::array grid, std::array block, std::vector const *, Event* event) { +void cu_stream::enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const *, Event* event) { + driver::cu_kernel* cu_kernel = (driver::cu_kernel*)kernel; cu_context::context_switcher ctx_switch(*ctx_); if(event) dispatch::cuEventRecord(event->cu()->first, *cu_); - dispatch::cuLaunchKernel(*kernel.cu(), grid[0], grid[1], grid[2], block[0], block[1], block[2], 0, *cu_,(void**)kernel.cu_params(), NULL); + dispatch::cuLaunchKernel(*kernel->cu(), grid[0], grid[1], grid[2], block[0], block[1], block[2], 0, *cu_,(void**)cu_kernel->cu_params(), NULL); if(event) dispatch::cuEventRecord(event->cu()->second, *cu_); } From 907bbb1ad2f867ddd93e284cfdd4bcc16f9f0732 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 20 Mar 2019 17:32:17 -0700 Subject: [PATCH 111/494] [driver] now debugging AMD runtime --- examples/matrix.cpp | 36 ++++++++++++++++------------------ 
include/triton/driver/buffer.h | 1 + include/triton/driver/stream.h | 34 +++++++++++++++++--------------- include/triton/jit.h | 14 ++++++------- lib/driver/buffer.cpp | 8 ++++++++ lib/driver/module.cpp | 5 +++-- lib/driver/stream.cpp | 23 ++++++++++++++++------ lib/jit.cpp | 26 ++++++++++++------------ 8 files changed, 84 insertions(+), 63 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 60ba87318..bbe9e25bf 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -87,7 +87,7 @@ T min(std::vector x) template -double bench(OP const & op, SYNC const & sync, triton::driver::cu_device const & device) +double bench(OP const & op, SYNC const & sync) { timer tmr; std::vector times; @@ -95,7 +95,7 @@ double bench(OP const & op, SYNC const & sync, triton::driver::cu_device const & op(); sync(); while(total_time*1e-9 < 1e-3){ - float norm = (float)device.current_sm_clock()/device.max_sm_clock(); + float norm = 1; tmr.start(); op(); sync(); @@ -108,7 +108,6 @@ double bench(OP const & op, SYNC const & sync, triton::driver::cu_device const & int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); - exit(EXIT_SUCCESS); triton::jit jit(context); // matrix multiplication parameters @@ -124,14 +123,14 @@ int main() { hb[i] = 1; for(size_t i = 0; i < hc.size(); i++) hc[i] = 0; - triton::driver::cu_buffer dc(context, hc.size()*4); - triton::driver::cu_buffer da(context, ha.size()*4); - triton::driver::cu_buffer db(context, hb.size()*4); - triton::driver::cu_stream stream(context); - stream.write(da, true, 0, ha); - stream.write(db, true, 0, hb); - stream.write(dc, true, 0, hc); - stream.synchronize(); + triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*4); + triton::driver::buffer* da = triton::driver::buffer::create(context, ha.size()*4); + triton::driver::buffer* db = triton::driver::buffer::create(context, hb.size()*4); + triton::driver::stream* stream = triton::driver::stream::create(context); + stream->write(da, true, 0, ha); + stream->write(db, true, 0, hb); + stream->write(dc, true, 0, hc); + stream->synchronize(); // benchmark a given matrix multiplication kernel @@ -161,12 +160,11 @@ int main() { kernel->setArg(5, K); kernel->setArg(6, bound); // dry run - stream.enqueue(kernel, grid, {nthreads, 1, 1}); - stream.synchronize(); + stream->enqueue(kernel, grid, {nthreads, 1, 1}); + stream->synchronize(); // benchmark - double ts = bench([&](){stream.enqueue(kernel, grid, {nthreads, 1, 1});}, - [&](){ stream.synchronize(); }, - (triton::driver::cu_device&)*context->device()); + double ts = bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});}, + [&](){ stream->synchronize(); }); ts = ts * 1e-9; double tflops = 2*M*N*K / ts * 1e-12; return tflops; @@ -184,10 +182,10 @@ int main() { // jit.autotune(src, benchmark); jit.add_module(src, params); - triton::driver::cu_kernel kernel = jit.get_function("matmul"); + triton::driver::kernel* kernel = jit.get_function("matmul"); triton::jit::launch_information info = jit.get_launch_info("matmul"); - std::cout << benchmark(&kernel, info) << std::endl; - stream.read(dc, true, 0, hc); + std::cout << benchmark(kernel, info) << std::endl; + stream->read(dc, true, 0, hc); simple_gemm(rc, ha, hb, M, N, K); for(size_t i = 0; i < M*N; i++) if(std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ diff --git a/include/triton/driver/buffer.h b/include/triton/driver/buffer.h index c4ca53650..08bfede1d 100755 --- a/include/triton/driver/buffer.h +++ 
b/include/triton/driver/buffer.h @@ -38,6 +38,7 @@ class buffer : public polymorphic_resource { public: buffer(driver::context* ctx, CUdeviceptr cl, bool take_ownership); buffer(driver::context* ctx, cl_mem cl, bool take_ownership); + static buffer* create(driver::context* ctx, size_t size); driver::context* context(); protected: diff --git a/include/triton/driver/stream.h b/include/triton/driver/stream.h index 723edbc13..18bedbce0 100755 --- a/include/triton/driver/stream.h +++ b/include/triton/driver/stream.h @@ -49,7 +49,16 @@ public: static driver::stream* create(driver::context* ctx); // accessors driver::context* context() const; + // methods virtual void synchronize() = 0; + virtual void enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const * = NULL, Event *event = NULL) = 0; + virtual void write(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void const* ptr) = 0; + virtual void read(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void* ptr) = 0; + // template helpers + template void write(driver::buffer* buf, bool blocking, std::size_t offset, std::vector const & x) + { write(buf, blocking, offset, x.size()*sizeof(T), x.data()); } + template void read(driver::buffer* buf, bool blocking, std::size_t offset, std::vector& x) + { read(buf, blocking, offset, x.size()*sizeof(T), x.data()); } protected: driver::context *ctx_; @@ -61,32 +70,25 @@ public: // Constructors cl_stream(driver::context *ctx); - // Synchronize + // Overridden void synchronize(); + void enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const *, Event *event); + void write(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void const* ptr); + void read(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void* ptr); }; // CUDA class cu_stream: public stream { public: - //Constructors + // Constructors cu_stream(CUstream str, bool take_ownership); cu_stream(driver::context* context); - //Synchronize + // Overridden void synchronize(); - - //Enqueue - void enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const * = NULL, Event *event = NULL); - - // Write - void write(driver::cu_buffer const & cu_buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr); - template void write(driver::cu_buffer const & buffer, bool blocking, std::size_t offset, std::vector const & x) - { write(buffer, blocking, offset, x.size()*sizeof(T), x.data()); } - - // Read - void read(driver::cu_buffer const & cu_buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr); - template void read(driver::cu_buffer const & buffer, bool blocking, std::size_t offset, std::vector& x) - { read(buffer, blocking, offset, x.size()*sizeof(T), x.data()); } + void enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const *, Event *event); + void write(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void const* ptr); + void read(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void* ptr); }; diff --git a/include/triton/jit.h b/include/triton/jit.h index a2c63bbf8..ecf22daf0 100644 --- a/include/triton/jit.h +++ b/include/triton/jit.h @@ -39,14 +39,14 @@ public: std::vector global_range_size; unsigned num_threads; }; - typedef std::function benchmark_t; + typedef std::function benchmark_t; struct passes_wrapper { passes_wrapper(): shared(&buffer_info), 
liveness(&buffer_info), - allocation(&liveness, &buffer_info), - barriers(&allocation, &buffer_info), - vectorize(&tune), - selection(&allocation, &tune, &buffer_info){ } + allocation(&liveness, &buffer_info), + barriers(&allocation, &buffer_info), + vectorize(&tune), + selection(&allocation, &tune, &buffer_info){ } void init(ir::module &module) { // generate ptx @@ -78,12 +78,12 @@ public: void autotune(const std::string &src, benchmark_t benchmark); void add_module(ir::module &module, const std::vector& params = {}); void add_module(const std::string &src, const std::vector& params = {}); - driver::cu_kernel get_function(const std::string &name); + driver::kernel* get_function(const std::string &name); launch_information get_launch_info(const std::string &name); unsigned get_int(const std::string &name); private: - std::vector modules_; + std::vector modules_; driver::context* driver_context_; llvm::LLVMContext llvm_context_; ir::context triton_context_; diff --git a/lib/driver/buffer.cpp b/lib/driver/buffer.cpp index 520347c7d..433d33b2e 100755 --- a/lib/driver/buffer.cpp +++ b/lib/driver/buffer.cpp @@ -46,6 +46,14 @@ driver::context* buffer::context() { return context_; } +buffer* buffer::create(driver::context* ctx, size_t size) { + if(dynamic_cast(ctx)) + return new cu_buffer(ctx, size); + if(dynamic_cast(ctx)) + return new ocl_buffer(ctx, size); + throw std::runtime_error("unknown context"); +} + // ocl_buffer::ocl_buffer(driver::context* context, size_t size) diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 03793945c..5796cc7e5 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -77,7 +77,7 @@ driver::context* module::context() const { module* module::create(driver::context* ctx, llvm::Module *src) { if(dynamic_cast(ctx)) return new cu_module(ctx, src); - if(dynamic_cast(ctx)) + if(dynamic_cast(ctx)) return new ocl_module(ctx, src); throw std::runtime_error("unknown context"); } @@ -100,11 +100,13 @@ void module::compile_llvm_module(llvm::Module* module, const std::string& triple layout = module->getDataLayoutStr(); module->setDataLayout(layout); + std::cout << "compiling" << std::endl; // emit machine code llvm::legacy::PassManager pass; llvm::raw_svector_ostream stream(buffer); machine->addPassesToEmitFile(pass, stream, nullptr, llvm::TargetMachine::CGFT_AssemblyFile); pass.run(*module); + std::cout << "compiled" << std::endl; } /* ------------------------ */ @@ -115,7 +117,6 @@ ocl_module::ocl_module(driver::context * context, llvm::Module* src): module(con init_llvm(); llvm::SmallVector buffer; module::compile_llvm_module(src, "amdgcn-amd-amdpal", "gfx902", "", buffer); - throw std::runtime_error("need to implement opencl module creation"); } diff --git a/lib/driver/stream.cpp b/lib/driver/stream.cpp index fa7d25621..35e369716 100755 --- a/lib/driver/stream.cpp +++ b/lib/driver/stream.cpp @@ -79,6 +79,17 @@ void cl_stream::synchronize() { dispatch::clFinish(*cl_); } +void cl_stream::enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const *, Event* event) { + cl_int err = dispatch::clEnqueueNDRangeKernel(*cl_, *kernel->cl(), grid.size(), NULL, (const size_t*)grid.data(), (const size_t*)block.data(), 0, NULL, NULL); +} + +void cl_stream::write(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr) { + cl_int err = dispatch::clEnqueueWriteBuffer(*cl_, *buffer->cl(), blocking?CL_TRUE:CL_FALSE, offset, size, ptr, 0, NULL, NULL); +} + +void cl_stream::read(driver::buffer* buffer, bool 
blocking, std::size_t offset, std::size_t size, void* ptr) { + cl_int err = dispatch::clEnqueueReadBuffer(*cl_, *buffer->cl(), blocking?CL_TRUE:CL_FALSE, offset, size, ptr, 0, NULL, NULL); +} /* ------------------------ */ // CUDA // @@ -114,20 +125,20 @@ void cu_stream::enqueue(driver::kernel* kernel, std::array grid, std: dispatch::cuEventRecord(event->cu()->second, *cu_); } -void cu_stream::write(driver::cu_buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr) { +void cu_stream::write(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr) { cu_context::context_switcher ctx_switch(*ctx_); if(blocking) - dispatch::cuMemcpyHtoD(*buffer.cu() + offset, ptr, size); + dispatch::cuMemcpyHtoD(*buffer->cu() + offset, ptr, size); else - dispatch::cuMemcpyHtoDAsync(*buffer.cu() + offset, ptr, size, *cu_); + dispatch::cuMemcpyHtoDAsync(*buffer->cu() + offset, ptr, size, *cu_); } -void cu_stream::read(driver::cu_buffer const & buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr) { +void cu_stream::read(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr) { cu_context::context_switcher ctx_switch(*ctx_); if(blocking) - dispatch::cuMemcpyDtoH(ptr, *buffer.cu() + offset, size); + dispatch::cuMemcpyDtoH(ptr, *buffer->cu() + offset, size); else - dispatch::cuMemcpyDtoHAsync(ptr, *buffer.cu() + offset, size, *cu_); + dispatch::cuMemcpyDtoHAsync(ptr, *buffer->cu() + offset, size, *cu_); } diff --git a/lib/jit.cpp b/lib/jit.cpp index b4f93049a..38da020a4 100644 --- a/lib/jit.cpp +++ b/lib/jit.cpp @@ -131,15 +131,15 @@ void jit::autotune(const std::string &src, benchmark_t benchmark) { } passes.tune.init(tt_module); passes.init(tt_module); - driver::cu_device* device = (driver::cu_device*)driver_context_->device(); - if(passes.allocation.get_allocated_size() > device->max_shared_memory()) - return; - if(passes.tune.get_num_threads() > device->max_threads_per_block()) - return; +// driver::device* device = driver_context_->device(); +// if(passes.allocation.get_allocated_size() > device->max_shared_memory()) +// return; +// if(passes.tune.get_num_threads() > device->max_threads_per_block()) +// return; // Compile auto ll_module = make_llvm_module(tt_module, passes); - driver::cu_module module(driver_context_, &*ll_module); - driver::cu_kernel kernel(&module, "matmul"); + driver::module* module = driver::module::create(driver_context_, &*ll_module); + driver::kernel* kernel = driver::kernel::create(module, "matmul"); launch_information info = launch_info_map_.at("matmul"); for(unsigned p: params) std::cout << p << " " << std::flush; @@ -166,13 +166,13 @@ void jit::add_module(ir::module &tt_module, const std::vector ¶ms) passes.tune.check_constraints(errors); if(errors.size()) throw std::runtime_error("invalid parameters"); - driver::cu_device* device = (driver::cu_device*)driver_context_->device(); - if(passes.allocation.get_allocated_size() > device->max_shared_memory()) - throw std::runtime_error("invalid parameters"); +// driver::device* device = driver_context_->device(); +// if(passes.allocation.get_allocated_size() > device->max_shared_memory()) +// throw std::runtime_error("invalid parameters"); // triton module -> llvm module auto ll_module = make_llvm_module(tt_module, passes); // llvm module -> machine code - modules_.push_back(driver::cu_module(driver_context_, &*ll_module)); + modules_.push_back(driver::module::create(driver_context_, &*ll_module)); // add globals 
for(auto x: tt_module.globals()) global_ints_[x.first] = ((ir::metaparameter*)x.second)->get_value(); @@ -183,8 +183,8 @@ void jit::add_module(const std::string &src, const std::vector ¶ms add_module(*ptt_module, params); } -driver::cu_kernel jit::get_function(const std::string &name) { - return driver::cu_kernel(&modules_.front(), name.c_str()); +driver::kernel *jit::get_function(const std::string &name) { + return driver::kernel::create(modules_.front(), name.c_str()); } jit::launch_information jit::get_launch_info(const std::string &name) { From 49fd6ece995d46fea83a4629c37d39f04c293f4a Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 21 Mar 2019 23:51:47 -0700 Subject: [PATCH 112/494] some cleaning --- include/triton/driver/buffer.h | 6 + include/triton/driver/context.h | 5 + include/triton/driver/device.h | 5 + include/triton/driver/dispatch.h | 3 + include/triton/driver/event.h | 4 +- include/triton/driver/handle.h | 12 +- include/triton/driver/kernel.h | 5 + include/triton/driver/module.h | 7 +- include/triton/driver/platform.h | 11 ++ include/triton/driver/stream.h | 13 +- lib/codegen/selection.cpp | 273 ++++++++++++++++++------------- lib/driver/buffer.cpp | 13 +- lib/driver/context.cpp | 13 +- lib/driver/dispatch.cpp | 2 + lib/driver/error.cpp | 61 +++++++ lib/driver/event.cpp | 4 +- lib/driver/kernel.cpp | 21 ++- lib/driver/module.cpp | 107 ++++++++++-- lib/driver/platform.cpp | 14 +- lib/driver/stream.cpp | 26 ++- 20 files changed, 423 insertions(+), 182 deletions(-) diff --git a/include/triton/driver/buffer.h b/include/triton/driver/buffer.h index 08bfede1d..667892525 100755 --- a/include/triton/driver/buffer.h +++ b/include/triton/driver/buffer.h @@ -45,6 +45,12 @@ protected: driver::context* context_; }; +// CPU +class cpu_buffer: public buffer +{ + +}; + // OpenCL class ocl_buffer: public buffer { diff --git a/include/triton/driver/context.h b/include/triton/driver/context.h index 379fe6962..f56e45451 100755 --- a/include/triton/driver/context.h +++ b/include/triton/driver/context.h @@ -48,6 +48,11 @@ protected: std::string cache_path_; }; +// CPU +class cpu_context: public context { + +}; + // CUDA class cu_context: public context { public: diff --git a/include/triton/driver/device.h b/include/triton/driver/device.h index 97071ec27..8a5f7790b 100755 --- a/include/triton/driver/device.h +++ b/include/triton/driver/device.h @@ -40,6 +40,11 @@ public: using polymorphic_resource::polymorphic_resource; }; +// CPU device +class cpu_device: public device { + +}; + // OpenCL device class ocl_device: public device { public: diff --git a/include/triton/driver/dispatch.h b/include/triton/driver/dispatch.h index f579602c4..1e0459931 100755 --- a/include/triton/driver/dispatch.h +++ b/include/triton/driver/dispatch.h @@ -49,6 +49,7 @@ template void check(T){} void check(CUresult err); void check(cublasStatus_t err); void check(cudnnStatus_t err); +void check(cl_int err); class dispatch { @@ -117,6 +118,7 @@ public: static cl_int clGetKernelInfo(cl_kernel, cl_kernel_info, size_t, void *, size_t *); static cl_int clGetKernelWorkGroupInfo(cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *); static cl_kernel clCreateKernel(cl_program, const char *, cl_int *); + static cl_int clCreateKernelsInProgram(cl_program, cl_uint, cl_kernel*, cl_uint*); static cl_mem clCreateBuffer(cl_context, cl_mem_flags, size_t, void *, cl_int *); static cl_program clCreateProgramWithSource(cl_context, cl_uint, const char **, const size_t *, cl_int *); static cl_int 
clReleaseKernel(cl_kernel); @@ -233,6 +235,7 @@ private: static void* clGetKernelInfo_; static void* clGetKernelWorkGroupInfo_; static void* clCreateKernel_; + static void* clCreateKernelsInProgram_; static void* clCreateBuffer_; static void* clCreateProgramWithSource_; static void* clReleaseKernel_; diff --git a/include/triton/driver/event.h b/include/triton/driver/event.h index 3343ba6e3..633f03d7d 100755 --- a/include/triton/driver/event.h +++ b/include/triton/driver/event.h @@ -31,8 +31,8 @@ namespace triton namespace driver { -// Event -class Event +// event +class event { public: float elapsed_time() const; diff --git a/include/triton/driver/handle.h b/include/triton/driver/handle.h index 6de493722..2bbe7ba42 100755 --- a/include/triton/driver/handle.h +++ b/include/triton/driver/handle.h @@ -35,6 +35,12 @@ namespace triton namespace driver { +enum backend_t { + CUDA, + OpenCL +}; + +// helpers for CUDA struct cu_event_t{ operator bool() const { return first && second; } CUevent first; @@ -79,18 +85,20 @@ protected: template class polymorphic_resource { public: - polymorphic_resource(CUType cu, bool take_ownership): cu_(cu, take_ownership){} - polymorphic_resource(CLType cl, bool take_ownership): cl_(cl, take_ownership){} + polymorphic_resource(CUType cu, bool take_ownership): cu_(cu, take_ownership), backend_(CUDA){} + polymorphic_resource(CLType cl, bool take_ownership): cl_(cl, take_ownership), backend_(OpenCL){} virtual ~polymorphic_resource() { } handle cu() { return cu_; } handle cl() { return cl_; } const handle& cu() const { return cu_; } const handle& cl() const { return cl_; } + backend_t backend() { return backend_; } protected: handle cl_; handle cu_; + backend_t backend_; }; } diff --git a/include/triton/driver/kernel.h b/include/triton/driver/kernel.h index 0657e775f..a68b81840 100755 --- a/include/triton/driver/kernel.h +++ b/include/triton/driver/kernel.h @@ -53,6 +53,11 @@ private: driver::module* program_; }; +// CPU +class cpu_kernel: public kernel { + +}; + // OpenCL class ocl_kernel: public kernel { public: diff --git a/include/triton/driver/module.h b/include/triton/driver/module.h index 92a237d3d..a149812cf 100755 --- a/include/triton/driver/module.h +++ b/include/triton/driver/module.h @@ -56,12 +56,17 @@ public: driver::context* context() const; void compile_llvm_module(llvm::Module* module, const std::string& triple, const std::string &proc, std::string layout, - llvm::SmallVectorImpl &buffer); + llvm::SmallVectorImpl &buffer, std::vector files = {}); protected: driver::context* ctx_; }; +// CPU +class cpu_module: public module{ + +}; + // OpenCL class ocl_module: public module{ diff --git a/include/triton/driver/platform.h b/include/triton/driver/platform.h index 9ea1e9f3b..ff06e4b01 100755 --- a/include/triton/driver/platform.h +++ b/include/triton/driver/platform.h @@ -50,6 +50,7 @@ private: std::string name_; }; +// CUDA class cu_platform: public platform { public: @@ -61,6 +62,7 @@ private: handle cu_; }; +// OpenCL class cl_platform: public platform { public: @@ -72,6 +74,15 @@ private: handle cl_; }; +// CPU +class cpu_platform: public platform +{ +public: + cpu_platform(): platform("CPU") { } + std::string version() const; + void devices(std::vector &devices) const; +}; + } } diff --git a/include/triton/driver/stream.h b/include/triton/driver/stream.h index 18bedbce0..6044a0376 100755 --- a/include/triton/driver/stream.h +++ b/include/triton/driver/stream.h @@ -36,7 +36,7 @@ namespace driver { class kernel; -class Event; +class event; class Range; 
class cu_buffer; @@ -51,7 +51,7 @@ public: driver::context* context() const; // methods virtual void synchronize() = 0; - virtual void enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const * = NULL, Event *event = NULL) = 0; + virtual void enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const * = NULL, event *event = NULL) = 0; virtual void write(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void const* ptr) = 0; virtual void read(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void* ptr) = 0; // template helpers @@ -64,6 +64,11 @@ protected: driver::context *ctx_; }; +// CPU +class cpu_stream: public stream { + +}; + // OpenCL class cl_stream: public stream { public: @@ -72,7 +77,7 @@ public: // Overridden void synchronize(); - void enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const *, Event *event); + void enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const *, event *event); void write(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void const* ptr); void read(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void* ptr); }; @@ -86,7 +91,7 @@ public: // Overridden void synchronize(); - void enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const *, Event *event); + void enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const *, event *event); void write(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void const* ptr); void read(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void* ptr); }; diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 7523665a8..f8667b5f7 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -19,6 +19,59 @@ namespace codegen{ using namespace llvm; +inline void set_kernel(llvm::IRBuilder<>& builder, llvm::LLVMContext &ctx, llvm::Module *module, llvm::Function* fn) { + fn->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); +// module->getOrInsertNamedMetadata("opencl.ocl.version")->addOperand(llvm::MDTuple::get(ctx, {llvm::ValueAsMetadata::get(builder.getInt32(2)), llvm::ValueAsMetadata::get(builder.getInt32(0))})); + + // // set metadata + // llvm::Metadata *md_args[] = { + // llvm::ValueAsMetadata::get(dst_fn), + // llvm::MDString::get(dst_ctx, "kernel"), + // llvm::ValueAsMetadata::get(dst_builder.getInt32(1)) + // }; + // module->getOrInsertNamedMetadata("nvvm.annotations")->addOperand(llvm::MDNode::get(dst_ctx, md_args)); +} + +inline Instruction* add_barrier(llvm::Module *module, llvm::IRBuilder<>& builder) { +// Function *barrier = Intrinsic::getDeclaration(module, Intrinsic::nvvm_barrier0); +// return builder.CreateCall(barrier, {}); + + Function *barrier = Intrinsic::getDeclaration(module, Intrinsic::amdgcn_s_barrier); + return builder.CreateCall(barrier, {}); +} + +inline Value* get_global_offset(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned stride, unsigned ax) { +// static std::array ctaid = { +// Intrinsic::nvvm_read_ptx_sreg_ctaid_x, +// Intrinsic::nvvm_read_ptx_sreg_ctaid_y, +// Intrinsic::nvvm_read_ptx_sreg_ctaid_z +// }; + static std::array ids = { + Intrinsic::amdgcn_workgroup_id_x, + Intrinsic::amdgcn_workgroup_id_y, + Intrinsic::amdgcn_workgroup_id_z + }; + Value* get_group_id = Intrinsic::getDeclaration(module, ids[ax]); + Value* group_id = 
builder.CreateCall(get_group_id, {}); + Value* result = builder.CreateMul(builder.getInt32(stride), group_id); + return result; +} + +inline Value* get_local_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax) { +// static std::array ids = { +// Intrinsic::nvvm_read_ptx_sreg_tid_x, +// Intrinsic::nvvm_read_ptx_sreg_tid_y, +// Intrinsic::nvvm_read_ptx_sreg_tid_z +// }; + static std::array ids = { + Intrinsic::amdgcn_workitem_id_x, + Intrinsic::amdgcn_workitem_id_y, + Intrinsic::amdgcn_workitem_id_z + }; + Function *get_local_id = Intrinsic::getDeclaration(module, ids[ax]); + return builder.CreateCall(get_local_id, {}); +} + /* Distributed Tile */ void distributed_tile::init_indices() { std::vector id(axes_.size(), 0); @@ -264,8 +317,7 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::function(inst)){ Module *module = builder.GetInsertBlock()->getModule(); - Function *barrier = Intrinsic::getDeclaration(module, Intrinsic::nvvm_barrier0); - return builder.CreateCall(barrier, {}); + return add_barrier(module, builder); } if(auto* ii = dynamic_cast(inst)){ Type *ty = type(ii->get_type()->get_scalar_ty()); @@ -561,9 +613,8 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, void selection::init_grids(ir::function *fn, IRBuilder<> &builder, Value *sh_mem_ptr){ // fetch linear ID Module *mod = builder.GetInsertBlock()->getParent()->getParent(); - Function *get_thread_id = Intrinsic::getDeclaration(mod, Intrinsic::nvvm_read_ptx_sreg_tid_x); Value *warp_size = builder.getInt32(32); - Value *u_thread_id = builder.CreateCall(get_thread_id, {}); + Value* u_thread_id = get_local_id(mod, builder, 0); Value *u_thread_warp_id = builder.CreateURem(u_thread_id, warp_size); Value *u_warp_id = builder.CreateUDiv(u_thread_id, warp_size); // create grid @@ -619,14 +670,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & const auto& shapes = ins->get_type()->get_tile_shapes(); // global_range if(auto *x = dynamic_cast(ins)) { - static std::array ctaid = { - Intrinsic::nvvm_read_ptx_sreg_ctaid_x, - Intrinsic::nvvm_read_ptx_sreg_ctaid_y, - Intrinsic::nvvm_read_ptx_sreg_ctaid_z - }; - Function *get_group_id = Intrinsic::getDeclaration(module, ctaid[x->get_axis()]); - Value *group_id = builder.CreateCall(get_group_id, {}); - Value *offset = builder.CreateMul(builder.getInt32(shapes[0]->get_value()), group_id); + Value *offset = get_global_offset(module, builder, shapes[0]->get_value(), x->get_axis()); result->for_each([&](indices_t idx){ BinaryOperator *bin = static_cast(idx[0]); result->set_value(idx, builder.CreateAdd(bin, offset)); @@ -739,27 +783,27 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & return; // matrix multiplication else if(dynamic_cast(ins)) { - ir::value *A = ins->get_operand(0); - ir::value *B = ins->get_operand(1); - ir::value *C = ins->get_operand(2); - shared_tile *TA = (shared_tile*)tmap_.at(A); - shared_tile *TB = (shared_tile*)tmap_.at(B); - distributed_tile *TC = (distributed_tile*)tmap_.at(C); - TA->set_vector_size(TC->axis(0).contiguous); - TB->set_vector_size(TC->axis(1).contiguous); - Function *f_mul_add = Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {llvm_type(C->get_type()->get_scalar_ty(), ctx)}); - result->for_each([&](indices_t idx){ - Value *res = TC->get_value(idx); - unsigned NK = A->get_type()->get_tile_shapes()[1]->get_value(); - for(unsigned K = 0; K < NK; ++K){ - indices_t a_idx = {idx[0], builder.getInt32(K)}; - indices_t b_idx = {idx[1], 
builder.getInt32(K)}; - Value *a = TA->get_value(a_idx); - Value *b = TB->get_value(b_idx); - res = builder.CreateCall(f_mul_add, {a, b, res}); - } - result->set_value(idx, res); - }); +// ir::value *A = ins->get_operand(0); +// ir::value *B = ins->get_operand(1); +// ir::value *C = ins->get_operand(2); +// shared_tile *TA = (shared_tile*)tmap_.at(A); +// shared_tile *TB = (shared_tile*)tmap_.at(B); +// distributed_tile *TC = (distributed_tile*)tmap_.at(C); +// TA->set_vector_size(TC->axis(0).contiguous); +// TB->set_vector_size(TC->axis(1).contiguous); +// Function *f_mul_add = Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {llvm_type(C->get_type()->get_scalar_ty(), ctx)}); +// result->for_each([&](indices_t idx){ +// Value *res = TC->get_value(idx); +// unsigned NK = A->get_type()->get_tile_shapes()[1]->get_value(); +// for(unsigned K = 0; K < NK; ++K){ +// indices_t a_idx = {idx[0], builder.getInt32(K)}; +// indices_t b_idx = {idx[1], builder.getInt32(K)}; +// Value *a = TA->get_value(a_idx); +// Value *b = TB->get_value(b_idx); +// res = builder.CreateCall(f_mul_add, {a, b, res}); +// } +// result->set_value(idx, res); +// }); } // element-wise else { @@ -805,7 +849,7 @@ ArrayType* selection::llvm_linearized_tile_type(ir::type *ty, LLVMContext &ctx) return ArrayType::get(llvm_type(ty->get_scalar_ty(), ctx), size); } -void selection::run(ir::module &src, Module &dst){ +void selection::run(ir::module &src, Module &dst) { vmap_.clear(); LLVMContext &dst_ctx = dst.getContext(); IRBuilder<> dst_builder(dst_ctx); @@ -825,13 +869,7 @@ void selection::run(ir::module &src, Module &dst){ for(ir::attribute_t attr: attr_pair.second) dst_fn->addAttribute(id, llvm_attr(attr)); } - // set metadata - llvm::Metadata *md_args[] = { - llvm::ValueAsMetadata::get(dst_fn), - llvm::MDString::get(dst_ctx, "kernel"), - llvm::ValueAsMetadata::get(dst_builder.getInt32(1)) - }; - dst.getOrInsertNamedMetadata("nvvm.annotations")->addOperand(llvm::MDNode::get(dst_ctx, md_args)); + set_kernel(dst_builder, dst_ctx, &dst, dst_fn); // map parameters for(unsigned i = 0; i < fn->args().size(); i++) @@ -842,82 +880,83 @@ void selection::run(ir::module &src, Module &dst){ vmap_[block] = dst_block; } dst_builder.SetInsertPoint((BasicBlock*)vmap_[fn->blocks()[0]]); - // allocate shared memory - Value *sh_mem_ptr = nullptr; - if(unsigned alloc_size = alloc_->get_allocated_size()){ - Type *int_8_ty = Type::getInt8Ty(dst_ctx); - ArrayType *array_ty = ArrayType::get(int_8_ty, alloc_size); - Type *ptr_ty = PointerType::get(int_8_ty, 3); - GlobalVariable *sh_mem_array = - new GlobalVariable(dst, array_ty, false, GlobalVariable::ExternalLinkage, - nullptr, "__shared_ptr", nullptr, GlobalVariable::NotThreadLocal, 3); - sh_mem_ptr = dst_builder.CreateBitCast(sh_mem_array, ptr_ty); - } - // create grids - init_grids(fn, dst_builder, sh_mem_ptr); - std::map last_block; - // iterate through block - for(ir::basic_block *block: fn->blocks()) { - BasicBlock *parent = (BasicBlock*)vmap_[block]; - dst_builder.SetInsertPoint(parent); - for(ir::instruction *i: block->get_inst_list()){ - BasicBlock *current = dst_builder.GetInsertBlock(); - bool phi_inserted = (dynamic_cast(i) || dynamic_cast(i)) && !current->empty(); - if(phi_inserted) - dst_builder.SetInsertPoint(&*current->getFirstInsertionPt()); - lower_instruction(i, dst_builder); - if(phi_inserted) - dst_builder.SetInsertPoint(current); - last_block[block] = dst_builder.GetInsertBlock(); - } - } - // add phi operands - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction 
*inst: block->get_inst_list()) - if(auto *phi = dynamic_cast(inst)){ - if(buffer_info_->is_double(phi)) { - PHINode *ptr = (PHINode*)((shared_tile*)tmap_.at(phi))->get_pointer(); - PHINode *offset = (PHINode*)((shared_tile*)tmap_.at(phi))->get_offset(); - for(unsigned n = 0; n < phi->get_num_incoming(); n++){ - ir::basic_block* inc_block = phi->get_incoming_block(n); - ir::value* inc_val = phi->get_incoming_value(n); - ir::value* terminator = inc_block->get_inst_list().back(); - BasicBlock *llvm_inc_block = last_block.at(inc_block); - shared_tile *inc_shared = (shared_tile*)tmap_.at(inc_val); - bool is_loop_latch = buffer_info_->is_loop_latch(phi, terminator); - if(is_loop_latch){ - dst_builder.SetInsertPoint(llvm_inc_block->getTerminator()); - Value *next_offset = dst_builder.CreateNeg(offset); - offset->addIncoming(next_offset, llvm_inc_block); - } - else { - offset->addIncoming(dst_builder.getInt32(alloc_->get_num_bytes(phi)/(2*4)), llvm_inc_block); - } - ptr->addIncoming(inc_shared->get_pointer(), llvm_inc_block); - } - } - else { - for(unsigned n = 0; n < phi->get_num_incoming(); n++){ - ir::value *inc_val = phi->get_incoming_value(n); - ir::basic_block *inc_block = phi->get_incoming_block(n); - BasicBlock *llvm_inc_block = last_block.at(inc_block); - if(phi->get_type()->is_tile_ty()) { - distributed_tile *phi_tile = (distributed_tile*)tmap_.at(phi); - distributed_tile *inc_tile = (distributed_tile*)tmap_.at(inc_val); - phi_tile->for_each([&](indices_t idx){ - PHINode *llvm_phi = (PHINode*)phi_tile->get_value(idx); - Value *llvm_inc_val = inc_tile->get_value(idx); - llvm_phi->addIncoming(llvm_inc_val, llvm_inc_block); - }); - } - else { - PHINode *llvm_phi = (PHINode*)vmap_.at(phi); - Value *llvm_inc_val = vmap_.at(inc_val); - llvm_phi->addIncoming(llvm_inc_val, llvm_inc_block); - } - } - } - } + dst_builder.CreateRetVoid(); +// // allocate shared memory +// Value *sh_mem_ptr = nullptr; +// if(unsigned alloc_size = alloc_->get_allocated_size()){ +// Type *int_8_ty = Type::getInt8Ty(dst_ctx); +// ArrayType *array_ty = ArrayType::get(int_8_ty, alloc_size); +// Type *ptr_ty = PointerType::get(int_8_ty, 3); +// GlobalVariable *sh_mem_array = +// new GlobalVariable(dst, array_ty, false, GlobalVariable::ExternalLinkage, +// nullptr, "__shared_ptr", nullptr, GlobalVariable::NotThreadLocal, 3); +// sh_mem_ptr = dst_builder.CreateBitCast(sh_mem_array, ptr_ty); +// } +// // create grids +// init_grids(fn, dst_builder, sh_mem_ptr); +// std::map last_block; +// // iterate through block +// for(ir::basic_block *block: fn->blocks()) { +// BasicBlock *parent = (BasicBlock*)vmap_[block]; +// dst_builder.SetInsertPoint(parent); +// for(ir::instruction *i: block->get_inst_list()){ +// BasicBlock *current = dst_builder.GetInsertBlock(); +// bool phi_inserted = (dynamic_cast(i) || dynamic_cast(i)) && !current->empty(); +// if(phi_inserted) +// dst_builder.SetInsertPoint(&*current->getFirstInsertionPt()); +// lower_instruction(i, dst_builder); +// if(phi_inserted) +// dst_builder.SetInsertPoint(current); +// last_block[block] = dst_builder.GetInsertBlock(); +// } +// } +// // add phi operands +// for(ir::basic_block *block: fn->blocks()) +// for(ir::instruction *inst: block->get_inst_list()) +// if(auto *phi = dynamic_cast(inst)){ +// if(buffer_info_->is_double(phi)) { +// PHINode *ptr = (PHINode*)((shared_tile*)tmap_.at(phi))->get_pointer(); +// PHINode *offset = (PHINode*)((shared_tile*)tmap_.at(phi))->get_offset(); +// for(unsigned n = 0; n < phi->get_num_incoming(); n++){ +// ir::basic_block* 
inc_block = phi->get_incoming_block(n); +// ir::value* inc_val = phi->get_incoming_value(n); +// ir::value* terminator = inc_block->get_inst_list().back(); +// BasicBlock *llvm_inc_block = last_block.at(inc_block); +// shared_tile *inc_shared = (shared_tile*)tmap_.at(inc_val); +// bool is_loop_latch = buffer_info_->is_loop_latch(phi, terminator); +// if(is_loop_latch){ +// dst_builder.SetInsertPoint(llvm_inc_block->getTerminator()); +// Value *next_offset = dst_builder.CreateNeg(offset); +// offset->addIncoming(next_offset, llvm_inc_block); +// } +// else { +// offset->addIncoming(dst_builder.getInt32(alloc_->get_num_bytes(phi)/(2*4)), llvm_inc_block); +// } +// ptr->addIncoming(inc_shared->get_pointer(), llvm_inc_block); +// } +// } +// else { +// for(unsigned n = 0; n < phi->get_num_incoming(); n++){ +// ir::value *inc_val = phi->get_incoming_value(n); +// ir::basic_block *inc_block = phi->get_incoming_block(n); +// BasicBlock *llvm_inc_block = last_block.at(inc_block); +// if(phi->get_type()->is_tile_ty()) { +// distributed_tile *phi_tile = (distributed_tile*)tmap_.at(phi); +// distributed_tile *inc_tile = (distributed_tile*)tmap_.at(inc_val); +// phi_tile->for_each([&](indices_t idx){ +// PHINode *llvm_phi = (PHINode*)phi_tile->get_value(idx); +// Value *llvm_inc_val = inc_tile->get_value(idx); +// llvm_phi->addIncoming(llvm_inc_val, llvm_inc_block); +// }); +// } +// else { +// PHINode *llvm_phi = (PHINode*)vmap_.at(phi); +// Value *llvm_inc_val = vmap_.at(inc_val); +// llvm_phi->addIncoming(llvm_inc_val, llvm_inc_block); +// } +// } +// } +// } } } diff --git a/lib/driver/buffer.cpp b/lib/driver/buffer.cpp index 433d33b2e..6bec0c66e 100755 --- a/lib/driver/buffer.cpp +++ b/lib/driver/buffer.cpp @@ -47,11 +47,11 @@ driver::context* buffer::context() { } buffer* buffer::create(driver::context* ctx, size_t size) { - if(dynamic_cast(ctx)) - return new cu_buffer(ctx, size); - if(dynamic_cast(ctx)) - return new ocl_buffer(ctx, size); - throw std::runtime_error("unknown context"); + switch(ctx->backend()){ + case CUDA: return new cu_buffer(ctx, size); + case OpenCL: return new ocl_buffer(ctx, size); + default: throw std::runtime_error("unknown backend"); + } } // @@ -59,7 +59,8 @@ buffer* buffer::create(driver::context* ctx, size_t size) { ocl_buffer::ocl_buffer(driver::context* context, size_t size) : buffer(context, cl_mem(), true){ cl_int err; - dispatch::clCreateBuffer(*context->cl(), CL_MEM_READ_WRITE, size, NULL, &err); + *cl_ = dispatch::clCreateBuffer(*context->cl(), CL_MEM_READ_WRITE, size, NULL, &err); + check(err); } diff --git a/lib/driver/context.cpp b/lib/driver/context.cpp index 6e1618713..5b82492b9 100755 --- a/lib/driver/context.cpp +++ b/lib/driver/context.cpp @@ -51,11 +51,11 @@ context::context(driver::device *dev, cl_context cl, bool take_ownership): } context* context::create(driver::device *dev){ - if(dynamic_cast(dev)) - return new cu_context(dev); - if(dynamic_cast(dev)) - return new ocl_context(dev); - throw std::runtime_error("unknown context"); + switch(dev->backend()){ + case CUDA: return new cu_context(dev); + case OpenCL: return new ocl_context(dev); + default: throw std::runtime_error("unknown backend"); + } } @@ -99,7 +99,7 @@ cu_context::context_switcher::context_switcher(const context &ctx): ctx_((const cu_context::context_switcher::~context_switcher() { CUcontext tmp; dispatch::cuCtxPopCurrent_v2(&tmp); - assert(tmp==(CUcontext)ctx_ && "Switching back to invalid context!"); + assert(tmp==*ctx_.cu() && "Switching back to invalid context!"); } // import 
CUdevice @@ -129,6 +129,7 @@ cu_context::cu_context(driver::device* device): context(device, CUcontext(), tru ocl_context::ocl_context(driver::device* dev): context(dev, cl_context(), true) { cl_int err; *cl_ = dispatch::clCreateContext(nullptr, 1, &*dev->cl(), nullptr, nullptr, &err); + check(err); } diff --git a/lib/driver/dispatch.cpp b/lib/driver/dispatch.cpp index b2e556d8e..4dfd6df6e 100755 --- a/lib/driver/dispatch.cpp +++ b/lib/driver/dispatch.cpp @@ -286,6 +286,7 @@ OCL_DEFINE5(cl_int, clGetProgramInfo, cl_program, cl_program_info, size_t, void OCL_DEFINE5(cl_int, clGetKernelInfo, cl_kernel, cl_kernel_info, size_t, void *, size_t *) OCL_DEFINE6(cl_int, clGetKernelWorkGroupInfo, cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *) OCL_DEFINE3(cl_kernel, clCreateKernel, cl_program, const char *, cl_int *) +OCL_DEFINE4(cl_int, clCreateKernelsInProgram, cl_program, cl_uint, cl_kernel*, cl_uint*) OCL_DEFINE5(cl_mem, clCreateBuffer, cl_context, cl_mem_flags, size_t, void *, cl_int *) OCL_DEFINE5(cl_program, clCreateProgramWithSource, cl_context, cl_uint, const char **, const size_t *, cl_int *) OCL_DEFINE1(cl_int, clReleaseKernel, cl_kernel) @@ -343,6 +344,7 @@ void* dispatch::clGetProgramInfo_; void* dispatch::clGetKernelInfo_; void* dispatch::clGetKernelWorkGroupInfo_; void* dispatch::clCreateKernel_; +void* dispatch::clCreateKernelsInProgram_; void* dispatch::clCreateBuffer_; void* dispatch::clCreateProgramWithSource_; void* dispatch::clReleaseKernel_; diff --git a/lib/driver/error.cpp b/lib/driver/error.cpp index f3cce16d5..99b2401dd 100755 --- a/lib/driver/error.cpp +++ b/lib/driver/error.cpp @@ -133,6 +133,67 @@ void check(cudnnStatus_t err){ } } +void check(cl_int err) +{ + using namespace exception::ocl; + switch(err) + { + case CL_SUCCESS: break; + case CL_DEVICE_NOT_FOUND: throw device_not_found(); + case CL_DEVICE_NOT_AVAILABLE: throw device_not_available(); + case CL_COMPILER_NOT_AVAILABLE: throw compiler_not_available(); + case CL_MEM_OBJECT_ALLOCATION_FAILURE: throw mem_object_allocation_failure(); + case CL_OUT_OF_RESOURCES: throw out_of_resources(); + case CL_OUT_OF_HOST_MEMORY: throw out_of_host_memory(); + case CL_PROFILING_INFO_NOT_AVAILABLE: throw profiling_info_not_available(); + case CL_MEM_COPY_OVERLAP: throw mem_copy_overlap(); + case CL_IMAGE_FORMAT_MISMATCH: throw image_format_mismatch(); + case CL_IMAGE_FORMAT_NOT_SUPPORTED: throw image_format_not_supported(); + case CL_BUILD_PROGRAM_FAILURE: throw build_program_failure(); + case CL_MAP_FAILURE: throw map_failure(); + + case CL_INVALID_VALUE: throw invalid_value(); + case CL_INVALID_DEVICE_TYPE: throw invalid_device_type(); + case CL_INVALID_PLATFORM: throw invalid_platform(); + case CL_INVALID_DEVICE: throw invalid_device(); + case CL_INVALID_CONTEXT: throw invalid_context(); + case CL_INVALID_QUEUE_PROPERTIES: throw invalid_queue_properties(); + case CL_INVALID_COMMAND_QUEUE: throw invalid_command_queue(); + case CL_INVALID_HOST_PTR: throw invalid_host_ptr(); + case CL_INVALID_MEM_OBJECT: throw invalid_mem_object(); + case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: throw invalid_image_format_descriptor(); + case CL_INVALID_IMAGE_SIZE: throw invalid_image_size(); + case CL_INVALID_SAMPLER: throw invalid_sampler(); + case CL_INVALID_BINARY: throw invalid_binary(); + case CL_INVALID_BUILD_OPTIONS: throw invalid_build_options(); + case CL_INVALID_PROGRAM: throw invalid_program(); + case CL_INVALID_PROGRAM_EXECUTABLE: throw invalid_program_executable(); + case CL_INVALID_KERNEL_NAME: throw 
invalid_kernel_name(); + case CL_INVALID_KERNEL_DEFINITION: throw invalid_kernel_definition(); + case CL_INVALID_KERNEL: throw invalid_kernel(); + case CL_INVALID_ARG_INDEX: throw invalid_arg_index(); + case CL_INVALID_ARG_VALUE: throw invalid_arg_value(); + case CL_INVALID_ARG_SIZE: throw invalid_arg_size(); + case CL_INVALID_KERNEL_ARGS: throw invalid_kernel_args(); + case CL_INVALID_WORK_DIMENSION: throw invalid_work_dimension(); + case CL_INVALID_WORK_GROUP_SIZE: throw invalid_work_group_size(); + case CL_INVALID_WORK_ITEM_SIZE: throw invalid_work_item_size(); + case CL_INVALID_GLOBAL_OFFSET: throw invalid_global_offset(); + case CL_INVALID_EVENT_WAIT_LIST: throw invalid_event_wait_list(); + case CL_INVALID_EVENT: throw invalid_event(); + case CL_INVALID_OPERATION: throw invalid_operation(); + case CL_INVALID_GL_OBJECT: throw invalid_gl_object(); + case CL_INVALID_BUFFER_SIZE: throw invalid_buffer_size(); + case CL_INVALID_MIP_LEVEL: throw invalid_mip_level(); + case CL_INVALID_GLOBAL_WORK_SIZE: throw invalid_global_work_size(); + #ifdef CL_INVALID_PROPERTY + case CL_INVALID_PROPERTY: throw invalid_property(); + #endif + default: throw; + } +} + + } } diff --git a/lib/driver/event.cpp b/lib/driver/event.cpp index ddd5f3874..ad341d701 100755 --- a/lib/driver/event.cpp +++ b/lib/driver/event.cpp @@ -27,13 +27,13 @@ namespace triton namespace driver { -float Event::elapsed_time() const{ +float event::elapsed_time() const{ float time; dispatch::cuEventElapsedTime(&time, cu_->first, cu_->second); return time; } -handle const & Event::cu() const +handle const & event::cu() const { return cu_; } } diff --git a/lib/driver/kernel.cpp b/lib/driver/kernel.cpp index 1490ad21d..4e40e6196 100755 --- a/lib/driver/kernel.cpp +++ b/lib/driver/kernel.cpp @@ -46,11 +46,11 @@ kernel::kernel(driver::module *program, cl_kernel fn, bool has_ownership): } kernel* kernel::create(driver::module* program, const char* name) { - if(dynamic_cast(program)) - return new cu_kernel(program, name); - if(dynamic_cast(program)) - return new ocl_kernel(program, name); - throw std::runtime_error("unknown program"); + switch(program->backend()){ + case CUDA: return new cu_kernel(program, name); + case OpenCL: return new ocl_kernel(program, name); + default: throw std::runtime_error("unknown backend"); + } } driver::module* kernel::module() { @@ -62,16 +62,21 @@ driver::module* kernel::module() { /* ------------------------ */ ocl_kernel::ocl_kernel(driver::module* program, const char* name): kernel(program, cl_kernel(), true) { +// cl_uint res; +// check(dispatch::clCreateKernelsInProgram(*program->cl(), 0, NULL, &res)); +// std::cout << res << std::endl; cl_int err; - *cl_ = dispatch::clCreateKernel(*program->cl(), name, &err); + std::cout << *program->cl() << std::endl; + *cl_ = dispatch::clCreateKernel(*program->cl(), "matmul", &err); + check(err); } void ocl_kernel::setArg(unsigned int index, std::size_t size, void* ptr) { - dispatch::clSetKernelArg(*cl_, index, size, ptr); + check(dispatch::clSetKernelArg(*cl_, index, size, ptr)); } void ocl_kernel::setArg(unsigned int index, driver::buffer* buffer) { - dispatch::clSetKernelArg(*cl_, index, sizeof(cl_mem), (void*)&*buffer->cl()); + check(dispatch::clSetKernelArg(*cl_, index, sizeof(cl_mem), (void*)&*buffer->cl())); } diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 5796cc7e5..989337b23 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -28,9 +28,15 @@ #include "triton/driver/error.h" #include "triton/tools/sys/getenv.hpp" #include 
"llvm/IR/IRPrintingPasses.h" +#include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/IR/Verifier.h" #include "llvm/IR/Module.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/PassManager.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Linker/Linker.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/AsmParser/Parser.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/TargetSelect.h" @@ -75,38 +81,60 @@ driver::context* module::context() const { } module* module::create(driver::context* ctx, llvm::Module *src) { - if(dynamic_cast(ctx)) - return new cu_module(ctx, src); - if(dynamic_cast(ctx)) - return new ocl_module(ctx, src); - throw std::runtime_error("unknown context"); + switch(ctx->backend()){ + case CUDA: return new cu_module(ctx, src); + case OpenCL: return new ocl_module(ctx, src); + default: throw std::runtime_error("unknown backend"); + } } void module::compile_llvm_module(llvm::Module* module, const std::string& triple, const std::string &proc, std::string layout, - llvm::SmallVectorImpl &buffer) { + llvm::SmallVectorImpl &buffer, + std::vector files) { init_llvm(); // create machine module->setTargetTriple(triple); std::string error; auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error); - llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, "", - llvm::TargetOptions(), llvm::Reloc::Model(), - llvm::None, llvm::CodeGenOpt::Aggressive); - + llvm::TargetOptions opt; + opt.AllowFPOpFusion = llvm::FPOpFusion::Fast; + opt.UnsafeFPMath = false; + opt.NoInfsFPMath = false; + opt.NoNaNsFPMath = true; + llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, "", opt, + llvm::Reloc::PIC_, llvm::None, llvm::CodeGenOpt::Aggressive); // set data layout if(layout.empty()) - layout = module->getDataLayoutStr(); - module->setDataLayout(layout); + module->setDataLayout(machine->createDataLayout()); + else + module->setDataLayout(layout); + + // link + for (std::string& file: files) { + std::string path = "/opt/rocm/lib/" + file; + llvm::SMDiagnostic err; + std::unique_ptr mlib = llvm::parseIRFile(path, err, module->getContext()); + if (mlib.get() == nullptr) { + std::string msg = err.getMessage(); + std::cerr << "Fail to load bitcode file " << path << "\n" + << "line " << err.getLineNo() << ":" << msg; + } + mlib->setTargetTriple(module->getTargetTriple()); + mlib->setDataLayout(module->getDataLayout()); + for (llvm::Function &f : mlib->functions()) { + f.addFnAttr(llvm::Attribute::AlwaysInline); + } + llvm::Linker::linkModules(*module, std::move(mlib)); + } - std::cout << "compiling" << std::endl; // emit machine code llvm::legacy::PassManager pass; llvm::raw_svector_ostream stream(buffer); - machine->addPassesToEmitFile(pass, stream, nullptr, llvm::TargetMachine::CGFT_AssemblyFile); + machine->addPassesToEmitFile(pass, stream, nullptr, llvm::TargetMachine::CGFT_ObjectFile); pass.run(*module); - std::cout << "compiled" << std::endl; +// std::cout << std::string(buffer.begin(), buffer.end()) << std::endl; } /* ------------------------ */ @@ -114,9 +142,56 @@ void module::compile_llvm_module(llvm::Module* module, const std::string& triple /* ------------------------ */ ocl_module::ocl_module(driver::context * context, llvm::Module* src): module(context, cl_program(), true) { +// const char* x = "__kernel void matmul(){ }"; +// cl_int err; +// *cl_ = dispatch::clCreateProgramWithSource(*context->cl(), 1, &x, NULL, &err); +// 
check(err); +// return; + init_llvm(); llvm::SmallVector buffer; - module::compile_llvm_module(src, "amdgcn-amd-amdpal", "gfx902", "", buffer); + std::vector files = { + "oclc_daz_opt_on.amdgcn.bc", + "ocml.amdgcn.bc", + "hc.amdgcn.bc", + "ockl.amdgcn.bc", + "oclc_correctly_rounded_sqrt_off.amdgcn.bc", + "oclc_correctly_rounded_sqrt_on.amdgcn.bc", + "oclc_daz_opt_off.amdgcn.bc", + "oclc_finite_only_off.amdgcn.bc", + "oclc_finite_only_on.amdgcn.bc", + "oclc_isa_version_803.amdgcn.bc", + "oclc_isa_version_900.amdgcn.bc", + "oclc_unsafe_math_off.amdgcn.bc", + "oclc_unsafe_math_on.amdgcn.bc", + "oclc_isa_version_700.amdgcn.bc", + "opencl.amdgcn.bc" + }; + module::compile_llvm_module(src, "amdgcn-amd-amdpal", "gfx902", "", buffer, files); + + + +// llvm::BitcodeWriter writer(buffer); +// writer.writeModule(*src); +// llvm::legacy::PassManager pass; +// llvm::raw_svector_ostream stream(buffer); +// pass.add(llvm::createPrintModulePass(stream)); +// pass.run(*src); + size_t sizes[] = {buffer.size()}; + const unsigned char* data[] = {(unsigned char*)buffer.data()}; + cl_int status; + cl_int err; + *cl_ = dispatch::clCreateProgramWithBinary(*context->cl(), 1, &*context->device()->cl(), sizes, data, &status, &err); + check(err); + check(status); + try{ + dispatch::clBuildProgram(*cl_, 1, &*context->device()->cl(), NULL, NULL, NULL); + } + catch(...){ + char log[2048]; + dispatch::clGetProgramBuildInfo(*cl_, *context->device()->cl(), CL_PROGRAM_BUILD_LOG, 1024, log, NULL); + std::cout << log << std::endl; + } } diff --git a/lib/driver/platform.cpp b/lib/driver/platform.cpp index 60d2bd128..ad85773dd 100755 --- a/lib/driver/platform.cpp +++ b/lib/driver/platform.cpp @@ -58,20 +58,26 @@ void cu_platform::devices(std::vector &devices) const{ std::string cl_platform::version() const { size_t size; - dispatch::clGetPlatformInfo(*cl_, CL_PLATFORM_VERSION, 0, nullptr, &size); + check(dispatch::clGetPlatformInfo(*cl_, CL_PLATFORM_VERSION, 0, nullptr, &size)); std::string result(size, 0); - dispatch::clGetPlatformInfo(*cl_, CL_PLATFORM_VERSION, size, (void*)&*result.begin(), nullptr); + check(dispatch::clGetPlatformInfo(*cl_, CL_PLATFORM_VERSION, size, (void*)&*result.begin(), nullptr)); return result; } void cl_platform::devices(std::vector &devices) const{ cl_uint num_devices; - dispatch::clGetDeviceIDs(*cl_, CL_DEVICE_TYPE_GPU, 0, nullptr, &num_devices); + check(dispatch::clGetDeviceIDs(*cl_, CL_DEVICE_TYPE_GPU, 0, nullptr, &num_devices)); std::vector ids(num_devices); - dispatch::clGetDeviceIDs(*cl_, CL_DEVICE_TYPE_GPU, num_devices, ids.data(), nullptr); + check(dispatch::clGetDeviceIDs(*cl_, CL_DEVICE_TYPE_GPU, num_devices, ids.data(), nullptr)); for(cl_device_id id: ids) devices.push_back(new driver::ocl_device(id)); } +/* ------------------------ */ +// Vulkan // +/* ------------------------ */ + + + } } diff --git a/lib/driver/stream.cpp b/lib/driver/stream.cpp index 35e369716..c3551c832 100755 --- a/lib/driver/stream.cpp +++ b/lib/driver/stream.cpp @@ -44,23 +44,20 @@ namespace driver stream::stream(driver::context *ctx, CUstream cu, bool has_ownership) : polymorphic_resource(cu, has_ownership), ctx_(ctx) { - } stream::stream(driver::context *ctx, cl_command_queue cl, bool has_ownership) : polymorphic_resource(cl, has_ownership), ctx_(ctx) { - } driver::stream* stream::create(driver::context* ctx) { - if(dynamic_cast(ctx)) - return new cu_stream(ctx); - if(dynamic_cast(ctx)) - return new cl_stream(ctx); - throw std::runtime_error("unknown context"); + switch(ctx->backend()){ + case CUDA: return new 
cu_stream(ctx); + case OpenCL: return new cl_stream(ctx); + default: throw std::runtime_error("unknown backend"); + } } - driver::context* stream::context() const { return ctx_; } @@ -73,22 +70,23 @@ driver::context* stream::context() const { cl_stream::cl_stream(driver::context *ctx): stream(ctx, cl_command_queue(), true) { cl_int err; *cl_ = dispatch::clCreateCommandQueue(*ctx->cl(), *ctx->device()->cl(), 0, &err); + check(err); } void cl_stream::synchronize() { - dispatch::clFinish(*cl_); + check(dispatch::clFinish(*cl_)); } -void cl_stream::enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const *, Event* event) { - cl_int err = dispatch::clEnqueueNDRangeKernel(*cl_, *kernel->cl(), grid.size(), NULL, (const size_t*)grid.data(), (const size_t*)block.data(), 0, NULL, NULL); +void cl_stream::enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const *, event* event) { + check(dispatch::clEnqueueNDRangeKernel(*cl_, *kernel->cl(), grid.size(), NULL, (const size_t*)grid.data(), (const size_t*)block.data(), 0, NULL, NULL)); } void cl_stream::write(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr) { - cl_int err = dispatch::clEnqueueWriteBuffer(*cl_, *buffer->cl(), blocking?CL_TRUE:CL_FALSE, offset, size, ptr, 0, NULL, NULL); + check(dispatch::clEnqueueWriteBuffer(*cl_, *buffer->cl(), blocking?CL_TRUE:CL_FALSE, offset, size, ptr, 0, NULL, NULL)); } void cl_stream::read(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr) { - cl_int err = dispatch::clEnqueueReadBuffer(*cl_, *buffer->cl(), blocking?CL_TRUE:CL_FALSE, offset, size, ptr, 0, NULL, NULL); + check(dispatch::clEnqueueReadBuffer(*cl_, *buffer->cl(), blocking?CL_TRUE:CL_FALSE, offset, size, ptr, 0, NULL, NULL)); } /* ------------------------ */ @@ -115,7 +113,7 @@ void cu_stream::synchronize() { dispatch::cuStreamSynchronize(*cu_); } -void cu_stream::enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const *, Event* event) { +void cu_stream::enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const *, event* event) { driver::cu_kernel* cu_kernel = (driver::cu_kernel*)kernel; cu_context::context_switcher ctx_switch(*ctx_); if(event) From 9de9feff4adc8d376e2789e877adceee3eb6bbf8 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 23 Mar 2019 13:40:42 -0700 Subject: [PATCH 113/494] [jit] added runtime for host but compilation still needs to be implemented --- include/triton/codegen/selection.h | 6 +- include/triton/codegen/tune.h | 2 - include/triton/driver/buffer.h | 8 +- include/triton/driver/context.h | 10 +- include/triton/driver/device.h | 9 +- include/triton/driver/handle.h | 50 +++++- include/triton/driver/kernel.h | 26 ++- include/triton/driver/module.h | 8 +- include/triton/driver/platform.h | 6 +- include/triton/driver/stream.h | 15 +- include/triton/jit.h | 7 +- lib/codegen/selection.cpp | 261 ++++++++++++----------------- lib/driver/backend.cpp | 5 + lib/driver/buffer.cpp | 12 ++ lib/driver/context.cpp | 12 ++ lib/driver/handle.cpp | 20 ++- lib/driver/kernel.cpp | 34 +++- lib/driver/module.cpp | 81 +++++---- lib/driver/platform.cpp | 9 +- lib/driver/stream.cpp | 33 ++++ lib/jit.cpp | 9 +- 21 files changed, 389 insertions(+), 234 deletions(-) diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index 291fbf827..11acf28e7 100644 --- a/include/triton/codegen/selection.h +++ 
b/include/triton/codegen/selection.h @@ -24,6 +24,7 @@ namespace codegen{ class allocation; class tune; class buffer_info_pass; +class target; typedef std::vector indices_t; @@ -128,7 +129,9 @@ private: void lower_tile_instruction(ir::instruction *src, llvm::IRBuilder<> &builder); public: - selection(allocation *alloc, tune *params, buffer_info_pass *buffer_info): alloc_(alloc), params_(params), buffer_info_(buffer_info){ } + selection(allocation *alloc, tune *params, buffer_info_pass *buffer_info, target *tgt) + : alloc_(alloc), params_(params), buffer_info_(buffer_info), tgt_(tgt){ } + void run(ir::module &src, llvm::Module &dst); private: @@ -138,6 +141,7 @@ private: pmap_t last_block_; allocation *alloc_; tune *params_; + target *tgt_; buffer_info_pass *buffer_info_; std::map axes_; }; diff --git a/include/triton/codegen/tune.h b/include/triton/codegen/tune.h index 9fd321572..9b81fcb53 100644 --- a/include/triton/codegen/tune.h +++ b/include/triton/codegen/tune.h @@ -17,8 +17,6 @@ namespace ir{ namespace codegen{ -class place_shared_copy; - class tune { typedef std::pair node_t; typedef std::map > graph_t; diff --git a/include/triton/driver/buffer.h b/include/triton/driver/buffer.h index 667892525..ed314216a 100755 --- a/include/triton/driver/buffer.h +++ b/include/triton/driver/buffer.h @@ -34,10 +34,11 @@ namespace driver class cu_stream; // Base -class buffer : public polymorphic_resource { +class buffer : public polymorphic_resource { public: buffer(driver::context* ctx, CUdeviceptr cl, bool take_ownership); buffer(driver::context* ctx, cl_mem cl, bool take_ownership); + buffer(driver::context* ctx, host_buffer_t hst, bool take_ownership); static buffer* create(driver::context* ctx, size_t size); driver::context* context(); @@ -46,9 +47,10 @@ protected: }; // CPU -class cpu_buffer: public buffer +class host_buffer: public buffer { - +public: + host_buffer(driver::context* context, size_t size); }; // OpenCL diff --git a/include/triton/driver/context.h b/include/triton/driver/context.h index f56e45451..7a31e85a1 100755 --- a/include/triton/driver/context.h +++ b/include/triton/driver/context.h @@ -31,13 +31,14 @@ namespace triton namespace driver { -class context: public polymorphic_resource{ +class context: public polymorphic_resource{ protected: static std::string get_cache_path(); public: context(driver::device *dev, CUcontext cu, bool take_ownership); context(driver::device *dev, cl_context cl, bool take_ownership); + context(driver::device *dev, host_context_t hst, bool take_ownership); driver::device* device() const; std::string const & cache_path() const; // factory methods @@ -48,9 +49,10 @@ protected: std::string cache_path_; }; -// CPU -class cpu_context: public context { - +// Host +class host_context: public context { +public: + host_context(driver::device* dev); }; // CUDA diff --git a/include/triton/driver/device.h b/include/triton/driver/device.h index 8a5f7790b..d99e47fe2 100755 --- a/include/triton/driver/device.h +++ b/include/triton/driver/device.h @@ -35,14 +35,15 @@ namespace driver class context; // Base device -class device: public polymorphic_resource{ +class device: public polymorphic_resource{ public: using polymorphic_resource::polymorphic_resource; }; -// CPU device -class cpu_device: public device { - +// Host device +class host_device: public device { +public: + host_device(): device(host_device_t(), true){ } }; // OpenCL device diff --git a/include/triton/driver/handle.h b/include/triton/driver/handle.h index 2bbe7ba42..f87a8ffa6 100755 --- 
a/include/triton/driver/handle.h +++ b/include/triton/driver/handle.h @@ -24,11 +24,18 @@ #define TDL_INCLUDE_DRIVER_HANDLE_H #include +#include #include #include #include #include "triton/driver/dispatch.h" +namespace llvm +{ +class ExecutionEngine; +class Function; +} + namespace triton { @@ -37,10 +44,43 @@ namespace driver enum backend_t { CUDA, - OpenCL + OpenCL, + Host }; -// helpers for CUDA +// Host handles +struct host_platform_t{ + +}; + +struct host_device_t{ + +}; + +struct host_context_t{ + +}; + +struct host_stream_t{ + +}; + +struct host_module_t{ + std::string error; + llvm::ExecutionEngine* engine; + std::map functions; +}; + +struct host_function_t{ + llvm::Function* fn; +}; + +struct host_buffer_t{ + char* data; +}; + + +// Extra CUDA handles struct cu_event_t{ operator bool() const { return first && second; } CUevent first; @@ -82,22 +122,26 @@ protected: bool has_ownership_; }; -template +template class polymorphic_resource { public: polymorphic_resource(CUType cu, bool take_ownership): cu_(cu, take_ownership), backend_(CUDA){} polymorphic_resource(CLType cl, bool take_ownership): cl_(cl, take_ownership), backend_(OpenCL){} + polymorphic_resource(HostType hst, bool take_ownership): hst_(hst, take_ownership), backend_(Host){} virtual ~polymorphic_resource() { } handle cu() { return cu_; } handle cl() { return cl_; } + handle hst() { return hst_; } const handle& cu() const { return cu_; } const handle& cl() const { return cl_; } + const handle& hst() const { return hst_; } backend_t backend() { return backend_; } protected: handle cl_; handle cu_; + handle hst_; backend_t backend_; }; diff --git a/include/triton/driver/kernel.h b/include/triton/driver/kernel.h index a68b81840..5a4669086 100755 --- a/include/triton/driver/kernel.h +++ b/include/triton/driver/kernel.h @@ -28,6 +28,11 @@ #include +namespace llvm +{ +class GenericValue; +} + namespace triton { @@ -37,10 +42,11 @@ namespace driver class cu_buffer; // Base -class kernel: public polymorphic_resource { +class kernel: public polymorphic_resource { public: kernel(driver::module* program, CUfunction fn, bool has_ownership); kernel(driver::module* program, cl_kernel fn, bool has_ownership); + kernel(driver::module* program, host_function_t fn, bool has_ownership); // Getters driver::module* module(); // Factory methods @@ -53,9 +59,19 @@ private: driver::module* program_; }; -// CPU -class cpu_kernel: public kernel { - +// Host +class host_kernel: public kernel { +public: + //Constructors + host_kernel(driver::module* program, const char* name); + // Arguments setters + void setArg(unsigned int index, std::size_t size, void* ptr); + void setArg(unsigned int index, driver::buffer* buffer); + // Params + const std::vector& params(); +private: + std::vector > params_store_; + std::vector params_; }; // OpenCL @@ -81,8 +97,6 @@ public: void* const* cu_params() const; private: - handle cu_; - driver::cu_module* program_; std::vector > cu_params_store_; std::vector cu_params_; }; diff --git a/include/triton/driver/module.h b/include/triton/driver/module.h index a149812cf..706b90c47 100755 --- a/include/triton/driver/module.h +++ b/include/triton/driver/module.h @@ -45,13 +45,14 @@ class cu_context; class cu_device; // Base -class module: public polymorphic_resource { +class module: public polymorphic_resource { protected: void init_llvm(); public: module(driver::context* ctx, CUmodule mod, bool has_ownership); module(driver::context* ctx, cl_program mod, bool has_ownership); + module(driver::context* ctx, host_module_t 
mod, bool has_ownership); static module* create(driver::context* ctx, llvm::Module *src); driver::context* context() const; void compile_llvm_module(llvm::Module* module, const std::string& triple, @@ -63,8 +64,9 @@ protected: }; // CPU -class cpu_module: public module{ - +class host_module: public module{ +public: + host_module(driver::context* context, llvm::Module *module); }; // OpenCL diff --git a/include/triton/driver/platform.h b/include/triton/driver/platform.h index ff06e4b01..45b5399c5 100755 --- a/include/triton/driver/platform.h +++ b/include/triton/driver/platform.h @@ -74,11 +74,11 @@ private: handle cl_; }; -// CPU -class cpu_platform: public platform +// Host +class host_platform: public platform { public: - cpu_platform(): platform("CPU") { } + host_platform(): platform("CPU") { } std::string version() const; void devices(std::vector &devices) const; }; diff --git a/include/triton/driver/stream.h b/include/triton/driver/stream.h index 6044a0376..76d72af39 100755 --- a/include/triton/driver/stream.h +++ b/include/triton/driver/stream.h @@ -41,10 +41,11 @@ class Range; class cu_buffer; // Base -class stream: public polymorphic_resource { +class stream: public polymorphic_resource { public: stream(driver::context *ctx, CUstream, bool has_ownership); stream(driver::context *ctx, cl_command_queue, bool has_ownership); + stream(driver::context *ctx, host_stream_t, bool has_ownership); // factory static driver::stream* create(driver::context* ctx); // accessors @@ -64,9 +65,17 @@ protected: driver::context *ctx_; }; -// CPU -class cpu_stream: public stream { +// Host +class host_stream: public stream { +public: + // Constructors + host_stream(driver::context *ctx); + // Overridden + void synchronize(); + void enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const *, event *event); + void write(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void const* ptr); + void read(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void* ptr); }; // OpenCL diff --git a/include/triton/jit.h b/include/triton/jit.h index ecf22daf0..93f08f280 100644 --- a/include/triton/jit.h +++ b/include/triton/jit.h @@ -15,6 +15,7 @@ #include "triton/codegen/vectorize.h" #include "triton/codegen/buffer_info.h" #include "triton/codegen/barriers.h" +#include "triton/codegen/target.h" #include namespace llvm { @@ -42,11 +43,12 @@ public: typedef std::function benchmark_t; struct passes_wrapper { - passes_wrapper(): shared(&buffer_info), liveness(&buffer_info), + passes_wrapper(codegen::target* target) + : shared(&buffer_info), liveness(&buffer_info), allocation(&liveness, &buffer_info), barriers(&allocation, &buffer_info), vectorize(&tune), - selection(&allocation, &tune, &buffer_info){ } + selection(&allocation, &tune, &buffer_info, target) { } void init(ir::module &module) { // generate ptx @@ -89,6 +91,7 @@ private: ir::context triton_context_; std::map launch_info_map_; std::map global_ints_; + std::unique_ptr target_; }; diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index f8667b5f7..d893dbeee 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -1,6 +1,7 @@ #include "triton/codegen/selection.h" #include "triton/codegen/tune.h" #include "triton/codegen/allocation.h" +#include "triton/codegen/target.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Module.h" #include "llvm/IR/IRBuilder.h" @@ -19,59 +20,6 @@ namespace codegen{ using namespace llvm; -inline void 
set_kernel(llvm::IRBuilder<>& builder, llvm::LLVMContext &ctx, llvm::Module *module, llvm::Function* fn) { - fn->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); -// module->getOrInsertNamedMetadata("opencl.ocl.version")->addOperand(llvm::MDTuple::get(ctx, {llvm::ValueAsMetadata::get(builder.getInt32(2)), llvm::ValueAsMetadata::get(builder.getInt32(0))})); - - // // set metadata - // llvm::Metadata *md_args[] = { - // llvm::ValueAsMetadata::get(dst_fn), - // llvm::MDString::get(dst_ctx, "kernel"), - // llvm::ValueAsMetadata::get(dst_builder.getInt32(1)) - // }; - // module->getOrInsertNamedMetadata("nvvm.annotations")->addOperand(llvm::MDNode::get(dst_ctx, md_args)); -} - -inline Instruction* add_barrier(llvm::Module *module, llvm::IRBuilder<>& builder) { -// Function *barrier = Intrinsic::getDeclaration(module, Intrinsic::nvvm_barrier0); -// return builder.CreateCall(barrier, {}); - - Function *barrier = Intrinsic::getDeclaration(module, Intrinsic::amdgcn_s_barrier); - return builder.CreateCall(barrier, {}); -} - -inline Value* get_global_offset(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned stride, unsigned ax) { -// static std::array ctaid = { -// Intrinsic::nvvm_read_ptx_sreg_ctaid_x, -// Intrinsic::nvvm_read_ptx_sreg_ctaid_y, -// Intrinsic::nvvm_read_ptx_sreg_ctaid_z -// }; - static std::array ids = { - Intrinsic::amdgcn_workgroup_id_x, - Intrinsic::amdgcn_workgroup_id_y, - Intrinsic::amdgcn_workgroup_id_z - }; - Value* get_group_id = Intrinsic::getDeclaration(module, ids[ax]); - Value* group_id = builder.CreateCall(get_group_id, {}); - Value* result = builder.CreateMul(builder.getInt32(stride), group_id); - return result; -} - -inline Value* get_local_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax) { -// static std::array ids = { -// Intrinsic::nvvm_read_ptx_sreg_tid_x, -// Intrinsic::nvvm_read_ptx_sreg_tid_y, -// Intrinsic::nvvm_read_ptx_sreg_tid_z -// }; - static std::array ids = { - Intrinsic::amdgcn_workitem_id_x, - Intrinsic::amdgcn_workitem_id_y, - Intrinsic::amdgcn_workitem_id_z - }; - Function *get_local_id = Intrinsic::getDeclaration(module, ids[ax]); - return builder.CreateCall(get_local_id, {}); -} - /* Distributed Tile */ void distributed_tile::init_indices() { std::vector id(axes_.size(), 0); @@ -317,7 +265,7 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::function(inst)){ Module *module = builder.GetInsertBlock()->getModule(); - return add_barrier(module, builder); + return tgt_->add_barrier(module, builder); } if(auto* ii = dynamic_cast(inst)){ Type *ty = type(ii->get_type()->get_scalar_ty()); @@ -614,7 +562,7 @@ void selection::init_grids(ir::function *fn, IRBuilder<> &builder, Value *sh_mem // fetch linear ID Module *mod = builder.GetInsertBlock()->getParent()->getParent(); Value *warp_size = builder.getInt32(32); - Value* u_thread_id = get_local_id(mod, builder, 0); + Value* u_thread_id = tgt_->get_local_id(mod, builder, 0); Value *u_thread_warp_id = builder.CreateURem(u_thread_id, warp_size); Value *u_warp_id = builder.CreateUDiv(u_thread_id, warp_size); // create grid @@ -670,7 +618,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & const auto& shapes = ins->get_type()->get_tile_shapes(); // global_range if(auto *x = dynamic_cast(ins)) { - Value *offset = get_global_offset(module, builder, shapes[0]->get_value(), x->get_axis()); + Value *offset = tgt_->get_global_offset(module, builder, shapes[0]->get_value(), x->get_axis()); result->for_each([&](indices_t idx){ BinaryOperator *bin 
= static_cast(idx[0]); result->set_value(idx, builder.CreateAdd(bin, offset)); @@ -783,27 +731,27 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & return; // matrix multiplication else if(dynamic_cast(ins)) { -// ir::value *A = ins->get_operand(0); -// ir::value *B = ins->get_operand(1); -// ir::value *C = ins->get_operand(2); -// shared_tile *TA = (shared_tile*)tmap_.at(A); -// shared_tile *TB = (shared_tile*)tmap_.at(B); -// distributed_tile *TC = (distributed_tile*)tmap_.at(C); -// TA->set_vector_size(TC->axis(0).contiguous); -// TB->set_vector_size(TC->axis(1).contiguous); -// Function *f_mul_add = Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {llvm_type(C->get_type()->get_scalar_ty(), ctx)}); -// result->for_each([&](indices_t idx){ -// Value *res = TC->get_value(idx); -// unsigned NK = A->get_type()->get_tile_shapes()[1]->get_value(); -// for(unsigned K = 0; K < NK; ++K){ -// indices_t a_idx = {idx[0], builder.getInt32(K)}; -// indices_t b_idx = {idx[1], builder.getInt32(K)}; -// Value *a = TA->get_value(a_idx); -// Value *b = TB->get_value(b_idx); -// res = builder.CreateCall(f_mul_add, {a, b, res}); -// } -// result->set_value(idx, res); -// }); + ir::value *A = ins->get_operand(0); + ir::value *B = ins->get_operand(1); + ir::value *C = ins->get_operand(2); + shared_tile *TA = (shared_tile*)tmap_.at(A); + shared_tile *TB = (shared_tile*)tmap_.at(B); + distributed_tile *TC = (distributed_tile*)tmap_.at(C); + TA->set_vector_size(TC->axis(0).contiguous); + TB->set_vector_size(TC->axis(1).contiguous); + Function *f_mul_add = Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {llvm_type(C->get_type()->get_scalar_ty(), ctx)}); + result->for_each([&](indices_t idx){ + Value *res = TC->get_value(idx); + unsigned NK = A->get_type()->get_tile_shapes()[1]->get_value(); + for(unsigned K = 0; K < NK; ++K){ + indices_t a_idx = {idx[0], builder.getInt32(K)}; + indices_t b_idx = {idx[1], builder.getInt32(K)}; + Value *a = TA->get_value(a_idx); + Value *b = TB->get_value(b_idx); + res = builder.CreateCall(f_mul_add, {a, b, res}); + } + result->set_value(idx, res); + }); } // element-wise else { @@ -869,7 +817,7 @@ void selection::run(ir::module &src, Module &dst) { for(ir::attribute_t attr: attr_pair.second) dst_fn->addAttribute(id, llvm_attr(attr)); } - set_kernel(dst_builder, dst_ctx, &dst, dst_fn); + tgt_->set_kernel(dst_builder, dst_ctx, &dst, dst_fn); // map parameters for(unsigned i = 0; i < fn->args().size(); i++) @@ -880,83 +828,86 @@ void selection::run(ir::module &src, Module &dst) { vmap_[block] = dst_block; } dst_builder.SetInsertPoint((BasicBlock*)vmap_[fn->blocks()[0]]); - dst_builder.CreateRetVoid(); -// // allocate shared memory -// Value *sh_mem_ptr = nullptr; -// if(unsigned alloc_size = alloc_->get_allocated_size()){ -// Type *int_8_ty = Type::getInt8Ty(dst_ctx); -// ArrayType *array_ty = ArrayType::get(int_8_ty, alloc_size); -// Type *ptr_ty = PointerType::get(int_8_ty, 3); -// GlobalVariable *sh_mem_array = -// new GlobalVariable(dst, array_ty, false, GlobalVariable::ExternalLinkage, -// nullptr, "__shared_ptr", nullptr, GlobalVariable::NotThreadLocal, 3); -// sh_mem_ptr = dst_builder.CreateBitCast(sh_mem_array, ptr_ty); -// } -// // create grids -// init_grids(fn, dst_builder, sh_mem_ptr); -// std::map last_block; -// // iterate through block -// for(ir::basic_block *block: fn->blocks()) { -// BasicBlock *parent = (BasicBlock*)vmap_[block]; -// dst_builder.SetInsertPoint(parent); -// for(ir::instruction *i: block->get_inst_list()){ 
-// BasicBlock *current = dst_builder.GetInsertBlock(); -// bool phi_inserted = (dynamic_cast(i) || dynamic_cast(i)) && !current->empty(); -// if(phi_inserted) -// dst_builder.SetInsertPoint(&*current->getFirstInsertionPt()); -// lower_instruction(i, dst_builder); -// if(phi_inserted) -// dst_builder.SetInsertPoint(current); -// last_block[block] = dst_builder.GetInsertBlock(); -// } -// } -// // add phi operands -// for(ir::basic_block *block: fn->blocks()) -// for(ir::instruction *inst: block->get_inst_list()) -// if(auto *phi = dynamic_cast(inst)){ -// if(buffer_info_->is_double(phi)) { -// PHINode *ptr = (PHINode*)((shared_tile*)tmap_.at(phi))->get_pointer(); -// PHINode *offset = (PHINode*)((shared_tile*)tmap_.at(phi))->get_offset(); -// for(unsigned n = 0; n < phi->get_num_incoming(); n++){ -// ir::basic_block* inc_block = phi->get_incoming_block(n); -// ir::value* inc_val = phi->get_incoming_value(n); -// ir::value* terminator = inc_block->get_inst_list().back(); -// BasicBlock *llvm_inc_block = last_block.at(inc_block); -// shared_tile *inc_shared = (shared_tile*)tmap_.at(inc_val); -// bool is_loop_latch = buffer_info_->is_loop_latch(phi, terminator); -// if(is_loop_latch){ -// dst_builder.SetInsertPoint(llvm_inc_block->getTerminator()); -// Value *next_offset = dst_builder.CreateNeg(offset); -// offset->addIncoming(next_offset, llvm_inc_block); -// } -// else { -// offset->addIncoming(dst_builder.getInt32(alloc_->get_num_bytes(phi)/(2*4)), llvm_inc_block); -// } -// ptr->addIncoming(inc_shared->get_pointer(), llvm_inc_block); -// } -// } -// else { -// for(unsigned n = 0; n < phi->get_num_incoming(); n++){ -// ir::value *inc_val = phi->get_incoming_value(n); -// ir::basic_block *inc_block = phi->get_incoming_block(n); -// BasicBlock *llvm_inc_block = last_block.at(inc_block); -// if(phi->get_type()->is_tile_ty()) { -// distributed_tile *phi_tile = (distributed_tile*)tmap_.at(phi); -// distributed_tile *inc_tile = (distributed_tile*)tmap_.at(inc_val); -// phi_tile->for_each([&](indices_t idx){ -// PHINode *llvm_phi = (PHINode*)phi_tile->get_value(idx); -// Value *llvm_inc_val = inc_tile->get_value(idx); -// llvm_phi->addIncoming(llvm_inc_val, llvm_inc_block); -// }); -// } -// else { -// PHINode *llvm_phi = (PHINode*)vmap_.at(phi); -// Value *llvm_inc_val = vmap_.at(inc_val); -// llvm_phi->addIncoming(llvm_inc_val, llvm_inc_block); -// } -// } -// } -// } + + // allocate shared memory + Value *sh_mem_ptr = nullptr; + if(unsigned alloc_size = alloc_->get_allocated_size()){ + Type *int_8_ty = Type::getInt8Ty(dst_ctx); + ArrayType *array_ty = ArrayType::get(int_8_ty, alloc_size); + Type *ptr_ty = PointerType::get(int_8_ty, 3); + GlobalVariable *sh_mem_array = + new GlobalVariable(dst, array_ty, false, GlobalVariable::ExternalLinkage, + nullptr, "__shared_ptr", nullptr, GlobalVariable::NotThreadLocal, 3); + sh_mem_ptr = dst_builder.CreateBitCast(sh_mem_array, ptr_ty); + } + + // create grids + init_grids(fn, dst_builder, sh_mem_ptr); + + // iterate through block + std::map last_block; + for(ir::basic_block *block: fn->blocks()) { + BasicBlock *parent = (BasicBlock*)vmap_[block]; + dst_builder.SetInsertPoint(parent); + for(ir::instruction *i: block->get_inst_list()){ + BasicBlock *current = dst_builder.GetInsertBlock(); + bool phi_inserted = (dynamic_cast(i) || dynamic_cast(i)) && !current->empty(); + if(phi_inserted) + dst_builder.SetInsertPoint(&*current->getFirstInsertionPt()); + lower_instruction(i, dst_builder); + if(phi_inserted) + dst_builder.SetInsertPoint(current); + 
last_block[block] = dst_builder.GetInsertBlock(); + } + } + + // add phi operands + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *inst: block->get_inst_list()) + if(auto *phi = dynamic_cast(inst)){ + if(buffer_info_->is_double(phi)) { + PHINode *ptr = (PHINode*)((shared_tile*)tmap_.at(phi))->get_pointer(); + PHINode *offset = (PHINode*)((shared_tile*)tmap_.at(phi))->get_offset(); + for(unsigned n = 0; n < phi->get_num_incoming(); n++){ + ir::basic_block* inc_block = phi->get_incoming_block(n); + ir::value* inc_val = phi->get_incoming_value(n); + ir::value* terminator = inc_block->get_inst_list().back(); + BasicBlock *llvm_inc_block = last_block.at(inc_block); + shared_tile *inc_shared = (shared_tile*)tmap_.at(inc_val); + bool is_loop_latch = buffer_info_->is_loop_latch(phi, terminator); + if(is_loop_latch){ + dst_builder.SetInsertPoint(llvm_inc_block->getTerminator()); + Value *next_offset = dst_builder.CreateNeg(offset); + offset->addIncoming(next_offset, llvm_inc_block); + } + else { + offset->addIncoming(dst_builder.getInt32(alloc_->get_num_bytes(phi)/(2*4)), llvm_inc_block); + } + ptr->addIncoming(inc_shared->get_pointer(), llvm_inc_block); + } + } + else { + for(unsigned n = 0; n < phi->get_num_incoming(); n++){ + ir::value *inc_val = phi->get_incoming_value(n); + ir::basic_block *inc_block = phi->get_incoming_block(n); + BasicBlock *llvm_inc_block = last_block.at(inc_block); + if(phi->get_type()->is_tile_ty()) { + distributed_tile *phi_tile = (distributed_tile*)tmap_.at(phi); + distributed_tile *inc_tile = (distributed_tile*)tmap_.at(inc_val); + phi_tile->for_each([&](indices_t idx){ + PHINode *llvm_phi = (PHINode*)phi_tile->get_value(idx); + Value *llvm_inc_val = inc_tile->get_value(idx); + llvm_phi->addIncoming(llvm_inc_val, llvm_inc_block); + }); + } + else { + PHINode *llvm_phi = (PHINode*)vmap_.at(phi); + Value *llvm_inc_val = vmap_.at(inc_val); + llvm_phi->addIncoming(llvm_inc_val, llvm_inc_block); + } + } + } + } } } diff --git a/lib/driver/backend.cpp b/lib/driver/backend.cpp index 628f0c225..9761e94e7 100755 --- a/lib/driver/backend.cpp +++ b/lib/driver/backend.cpp @@ -57,6 +57,11 @@ void backend::platforms::init() { for(cl_platform_id id: ids) cache_.push_back(new cl_platform(id)); } + //if host is here + bool host_visible = true; + if(host_visible){ + cache_.push_back(new host_platform()); + } if(cache_.empty()) throw std::runtime_error("ISAAC: No backend available. 
Make sure CUDA is available in your library path"); } diff --git a/lib/driver/buffer.cpp b/lib/driver/buffer.cpp index 6bec0c66e..a64e0aeca 100755 --- a/lib/driver/buffer.cpp +++ b/lib/driver/buffer.cpp @@ -42,6 +42,10 @@ buffer::buffer(driver::context* ctx, CUdeviceptr cu, bool take_ownership) buffer::buffer(driver::context* ctx, cl_mem cl, bool take_ownership) : polymorphic_resource(cl, take_ownership), context_(ctx) { } +buffer::buffer(driver::context* ctx, host_buffer_t hst, bool take_ownership) + : polymorphic_resource(hst, take_ownership), context_(ctx) { } + + driver::context* buffer::context() { return context_; } @@ -50,12 +54,20 @@ buffer* buffer::create(driver::context* ctx, size_t size) { switch(ctx->backend()){ case CUDA: return new cu_buffer(ctx, size); case OpenCL: return new ocl_buffer(ctx, size); + case Host: return new host_buffer(ctx, size); default: throw std::runtime_error("unknown backend"); } } // +host_buffer::host_buffer(driver::context *context, size_t size) + : buffer(context, host_buffer_t(), true){ + hst_->data = new char[size]; +} + +// + ocl_buffer::ocl_buffer(driver::context* context, size_t size) : buffer(context, cl_mem(), true){ cl_int err; diff --git a/lib/driver/context.cpp b/lib/driver/context.cpp index 5b82492b9..f9d7d0662 100755 --- a/lib/driver/context.cpp +++ b/lib/driver/context.cpp @@ -47,13 +47,18 @@ context::context(driver::device *dev, CUcontext cu, bool take_ownership): context::context(driver::device *dev, cl_context cl, bool take_ownership): polymorphic_resource(cl, take_ownership), dev_(dev), cache_path_(get_cache_path()){ +} +context::context(driver::device *dev, host_context_t hst, bool take_ownership): + polymorphic_resource(hst, take_ownership), + dev_(dev), cache_path_(get_cache_path()){ } context* context::create(driver::device *dev){ switch(dev->backend()){ case CUDA: return new cu_context(dev); case OpenCL: return new ocl_context(dev); + case Host: return new host_context(dev); default: throw std::runtime_error("unknown backend"); } } @@ -86,6 +91,13 @@ std::string const & context::cache_path() const{ return cache_path_; } +/* ------------------------ */ +// Host // +/* ------------------------ */ + +host_context::host_context(driver::device* dev): context(dev, host_context_t(), true){ + +} /* ------------------------ */ // CUDA // diff --git a/lib/driver/handle.cpp b/lib/driver/handle.cpp index c9534c4af..603cf2b0d 100755 --- a/lib/driver/handle.cpp +++ b/lib/driver/handle.cpp @@ -30,6 +30,15 @@ namespace triton namespace driver { +//Host +inline void _delete(host_platform_t) { } +inline void _delete(host_device_t) { } +inline void _delete(host_context_t) { } +inline void _delete(host_module_t) { } +inline void _delete(host_stream_t) { } +inline void _delete(host_buffer_t x) { if(x.data) delete[] x.data; } +inline void _delete(host_function_t) { } + //OpenCL inline void _delete(cl_platform_id) { } inline void _delete(cl_device_id x) { dispatch::clReleaseDevice(x); } @@ -58,7 +67,7 @@ handle::handle(CUType cu, bool take_ownership): h_(new CUType(cu)), has_ template handle::~handle(){ - if(has_ownership_ && h_ && h_.unique() && *h_) + if(has_ownership_ && h_ && h_.unique()) _delete(*h_); } @@ -79,5 +88,14 @@ template class handle; template class handle; template class handle; +template class handle; +template class handle; +template class handle; +template class handle; +template class handle; +template class handle; +template class handle; + + } } diff --git a/lib/driver/kernel.cpp b/lib/driver/kernel.cpp index 4e40e6196..efd366a5a 
100755 --- a/lib/driver/kernel.cpp +++ b/lib/driver/kernel.cpp @@ -22,7 +22,7 @@ #include #include - +#include "llvm/ExecutionEngine/GenericValue.h" #include "triton/driver/kernel.h" #include "triton/driver/buffer.h" @@ -45,10 +45,15 @@ kernel::kernel(driver::module *program, cl_kernel fn, bool has_ownership): polymorphic_resource(fn, has_ownership), program_(program){ } +kernel::kernel(driver::module *program, host_function_t fn, bool has_ownership): + polymorphic_resource(fn, has_ownership), program_(program){ +} + kernel* kernel::create(driver::module* program, const char* name) { switch(program->backend()){ case CUDA: return new cu_kernel(program, name); case OpenCL: return new ocl_kernel(program, name); + case Host: return new host_kernel(program, name); default: throw std::runtime_error("unknown backend"); } } @@ -57,6 +62,32 @@ driver::module* kernel::module() { return program_; } +/* ------------------------ */ +// Host // +/* ------------------------ */ + +host_kernel::host_kernel(driver::module* program, const char *name): kernel(program, host_function_t(), true) { + hst_->fn = program->hst()->functions.at(name); +} + +void host_kernel::setArg(unsigned int index, std::size_t size, void* ptr){ + if(index + 1> params_store_.size()){ + params_store_.resize(index+1); + params_.resize(index+1); + } + params_store_[index].reset(malloc(size), free); + memcpy(params_store_[index].get(), ptr, size); + params_[index] = llvm::GenericValue(params_store_[index].get()); +} + +void host_kernel::setArg(unsigned int index, driver::buffer* buffer){ + kernel::setArg(index, (void*)buffer->hst()->data); +} + +const std::vector& host_kernel::params(){ + return params_; +} + /* ------------------------ */ // OpenCL // /* ------------------------ */ @@ -66,7 +97,6 @@ ocl_kernel::ocl_kernel(driver::module* program, const char* name): kernel(progra // check(dispatch::clCreateKernelsInProgram(*program->cl(), 0, NULL, &res)); // std::cout << res << std::endl; cl_int err; - std::cout << *program->cl() << std::endl; *cl_ = dispatch::clCreateKernel(*program->cl(), "matmul", &err); check(err); } diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 989337b23..c2c6ebad5 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -22,7 +22,7 @@ #include #include - +#include #include "triton/driver/module.h" #include "triton/driver/context.h" #include "triton/driver/error.h" @@ -40,12 +40,17 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/TargetSelect.h" +#include "llvm/Support/Host.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/LegacyPassManager.h" #include "llvm/Transforms/Scalar/EarlyCSE.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/ExecutionEngine/ExecutionEngine.h" +#include "llvm/ExecutionEngine/OrcMCJITReplacement.h" +#include +#include "llvm/Transforms/Utils/Cloning.h" namespace triton { @@ -76,6 +81,10 @@ module::module(driver::context* ctx, cl_program mod, bool has_ownership) : polymorphic_resource(mod, has_ownership), ctx_(ctx) { } +module::module(driver::context* ctx, host_module_t mod, bool has_ownership) + : polymorphic_resource(mod, has_ownership), ctx_(ctx) { +} + driver::context* module::context() const { return ctx_; } @@ -84,6 +93,7 @@ module* module::create(driver::context* ctx, llvm::Module *src) { switch(ctx->backend()){ case CUDA: return new cu_module(ctx, src); case OpenCL: return new ocl_module(ctx, src); + case 
Host: return new host_module(ctx, src); default: throw std::runtime_error("unknown backend"); } } @@ -91,7 +101,7 @@ module* module::create(driver::context* ctx, llvm::Module *src) { void module::compile_llvm_module(llvm::Module* module, const std::string& triple, const std::string &proc, std::string layout, llvm::SmallVectorImpl &buffer, - std::vector files) { + std::vector paths) { init_llvm(); // create machine module->setTargetTriple(triple); @@ -112,8 +122,7 @@ void module::compile_llvm_module(llvm::Module* module, const std::string& triple module->setDataLayout(layout); // link - for (std::string& file: files) { - std::string path = "/opt/rocm/lib/" + file; + for (std::string& path: paths) { llvm::SMDiagnostic err; std::unique_ptr mlib = llvm::parseIRFile(path, err, module->getContext()); if (mlib.get() == nullptr) { @@ -137,46 +146,44 @@ void module::compile_llvm_module(llvm::Module* module, const std::string& triple // std::cout << std::string(buffer.begin(), buffer.end()) << std::endl; } + +/* ------------------------ */ +// Host // +/* ------------------------ */ + +host_module::host_module(driver::context * context, llvm::Module* src): module(context, host_module_t(), true) { + init_llvm(); + // host info +// std::string triple = llvm::sys::getDefaultTargetTriple(); +// std::string cpu = llvm::sys::getHostCPUName(); +// llvm::SmallVector buffer; +// module::compile_llvm_module(src, triple, cpu, "", buffer); + + // create execution engine +// llvm::legacy::PassManager pass; +// pass.add(llvm::createPrintModulePass(llvm::outs())); +// pass.add(llvm::createVerifierPass()); +// pass.run(*src); + auto cloned = llvm::CloneModule(*src); + for(llvm::Function& fn: cloned->functions()) + hst_->functions[fn.getName()] = &fn; + llvm::EngineBuilder builder(std::move(cloned)); + builder.setErrorStr(&hst_->error); + builder.setMCJITMemoryManager(llvm::make_unique()); + builder.setOptLevel(llvm::CodeGenOpt::Aggressive); + builder.setEngineKind(llvm::EngineKind::JIT); + builder.setUseOrcMCJITReplacement(true); + hst_->engine = builder.create(); +} + /* ------------------------ */ // OpenCL // /* ------------------------ */ ocl_module::ocl_module(driver::context * context, llvm::Module* src): module(context, cl_program(), true) { -// const char* x = "__kernel void matmul(){ }"; -// cl_int err; -// *cl_ = dispatch::clCreateProgramWithSource(*context->cl(), 1, &x, NULL, &err); -// check(err); -// return; - init_llvm(); llvm::SmallVector buffer; - std::vector files = { - "oclc_daz_opt_on.amdgcn.bc", - "ocml.amdgcn.bc", - "hc.amdgcn.bc", - "ockl.amdgcn.bc", - "oclc_correctly_rounded_sqrt_off.amdgcn.bc", - "oclc_correctly_rounded_sqrt_on.amdgcn.bc", - "oclc_daz_opt_off.amdgcn.bc", - "oclc_finite_only_off.amdgcn.bc", - "oclc_finite_only_on.amdgcn.bc", - "oclc_isa_version_803.amdgcn.bc", - "oclc_isa_version_900.amdgcn.bc", - "oclc_unsafe_math_off.amdgcn.bc", - "oclc_unsafe_math_on.amdgcn.bc", - "oclc_isa_version_700.amdgcn.bc", - "opencl.amdgcn.bc" - }; - module::compile_llvm_module(src, "amdgcn-amd-amdpal", "gfx902", "", buffer, files); - - - -// llvm::BitcodeWriter writer(buffer); -// writer.writeModule(*src); -// llvm::legacy::PassManager pass; -// llvm::raw_svector_ostream stream(buffer); -// pass.add(llvm::createPrintModulePass(stream)); -// pass.run(*src); + module::compile_llvm_module(src, "amdgcn-amd-amdpal", "gfx902", "", buffer); size_t sizes[] = {buffer.size()}; const unsigned char* data[] = {(unsigned char*)buffer.data()}; cl_int status; diff --git a/lib/driver/platform.cpp 
b/lib/driver/platform.cpp index ad85773dd..93484a4ee 100755 --- a/lib/driver/platform.cpp +++ b/lib/driver/platform.cpp @@ -74,9 +74,16 @@ void cl_platform::devices(std::vector &devices) const{ } /* ------------------------ */ -// Vulkan // +// Host // /* ------------------------ */ +std::string host_platform::version() const { + return "1.0"; +} + +void host_platform::devices(std::vector &devices) const { + devices.push_back(new driver::host_device()); +} } diff --git a/lib/driver/stream.cpp b/lib/driver/stream.cpp index c3551c832..e9818d7bd 100755 --- a/lib/driver/stream.cpp +++ b/lib/driver/stream.cpp @@ -31,6 +31,8 @@ #include "triton/driver/event.h" #include "triton/driver/kernel.h" #include "triton/driver/buffer.h" +#include "llvm/ExecutionEngine/ExecutionEngine.h" +#include "llvm/ExecutionEngine/GenericValue.h" namespace triton { @@ -50,10 +52,15 @@ stream::stream(driver::context *ctx, cl_command_queue cl, bool has_ownership) : polymorphic_resource(cl, has_ownership), ctx_(ctx) { } +stream::stream(driver::context *ctx, host_stream_t cl, bool has_ownership) + : polymorphic_resource(cl, has_ownership), ctx_(ctx) { +} + driver::stream* stream::create(driver::context* ctx) { switch(ctx->backend()){ case CUDA: return new cu_stream(ctx); case OpenCL: return new cl_stream(ctx); + case Host: return new host_stream(ctx); default: throw std::runtime_error("unknown backend"); } } @@ -62,6 +69,32 @@ driver::context* stream::context() const { return ctx_; } +/* ------------------------ */ +// Host // +/* ------------------------ */ + +host_stream::host_stream(driver::context *ctx): stream(ctx, host_stream_t(), true) { + +} + +void host_stream::synchronize() { + +} + +void host_stream::enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const *, event* event) { + driver::host_kernel* hst_kernel = (host_kernel*)kernel; + llvm::ExecutionEngine* engine = kernel->module()->hst()->engine; + engine->runFunction(kernel->hst()->fn, llvm::ArrayRef(hst_kernel->params())); +} + +void host_stream::write(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr) { + +} + +void host_stream::read(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr) { + +} + /* ------------------------ */ // OpenCL // diff --git a/lib/jit.cpp b/lib/jit.cpp index 38da020a4..058ca1ea2 100644 --- a/lib/jit.cpp +++ b/lib/jit.cpp @@ -1,6 +1,7 @@ #include "triton/jit.h" #include #include "triton/ast/ast.h" +#include "triton/codegen/target.h" #include "triton/ir/context.h" #include "triton/ir/context_impl.h" #include "triton/driver/device.h" @@ -89,7 +90,7 @@ std::unique_ptr jit::make_triton_module(const std::string &src) { } -jit::jit(driver::context *context): driver_context_(context) { +jit::jit(driver::context *context): driver_context_(context), target_(new triton::codegen::cpu_target()) { } @@ -98,7 +99,7 @@ void jit::autotune(const std::string &src, benchmark_t benchmark) { auto ptt_module = make_triton_module(src); ir::module &tt_module = *ptt_module; // set parameters - passes_wrapper passes; + passes_wrapper passes(target_.get()); passes.tune.run(tt_module); auto mps = passes.tune.get_params(tt_module); // create parameter ranges @@ -123,7 +124,7 @@ void jit::autotune(const std::string &src, benchmark_t benchmark) { // Deep copy of the module and tuner auto ptt_module = make_triton_module(src); ir::module &tt_module = *ptt_module; - passes_wrapper passes; + passes_wrapper passes(target_.get()); passes.tune.run(tt_module); i = 0; 
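// [Editorial aside -- not part of the patch itself. A minimal sketch of what
// the assignment loop below does, assuming the metaparameter API used in this
// series (tune::get_params / metaparameter::set_value); `config` is a
// hypothetical name for one candidate point of the search space.]
//
//   std::vector<unsigned> config = /* one candidate configuration */;
//   unsigned j = 0;
//   for(ir::metaparameter* mp: passes.tune.get_params(tt_module))
//     mp->set_value(config[j++]);
//
// Configurations that fail tune::check_constraints (e.g. the requirement that
// the number of threads per block be a suitable multiple) are rejected before
// the kernel is ever benchmarked.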
for(ir::metaparameter* mp: passes.tune.get_params(tt_module)){ @@ -154,7 +155,7 @@ void jit::autotune(const std::string &src, benchmark_t benchmark) { void jit::add_module(ir::module &tt_module, const std::vector ¶ms) { // set parameters - passes_wrapper passes; + passes_wrapper passes(target_.get()); passes.tune.run(tt_module); unsigned i = 0; for(ir::metaparameter* mp: passes.tune.get_params(tt_module)) From be55b3a081e388826e03c553bbae1252d05e225d Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 23 Mar 2019 16:52:53 -0700 Subject: [PATCH 114/494] saving progress --- lib/driver/backend.cpp | 10 ++++----- lib/driver/module.cpp | 48 +++++++++++++++++++++++++++++++++++------- lib/jit.cpp | 2 +- 3 files changed, 46 insertions(+), 14 deletions(-) diff --git a/lib/driver/backend.cpp b/lib/driver/backend.cpp index 9761e94e7..1699eb088 100755 --- a/lib/driver/backend.cpp +++ b/lib/driver/backend.cpp @@ -57,11 +57,11 @@ void backend::platforms::init() { for(cl_platform_id id: ids) cache_.push_back(new cl_platform(id)); } - //if host is here - bool host_visible = true; - if(host_visible){ - cache_.push_back(new host_platform()); - } +// //if host is here +// bool host_visible = true; +// if(host_visible){ +// cache_.push_back(new host_platform()); +// } if(cache_.empty()) throw std::runtime_error("ISAAC: No backend available. Make sure CUDA is available in your library path"); } diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index c2c6ebad5..1e7407234 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -23,6 +23,8 @@ #include #include #include +#include +#include "llvm/IR/IRBuilder.h" #include "triton/driver/module.h" #include "triton/driver/context.h" #include "triton/driver/error.h" @@ -108,11 +110,11 @@ void module::compile_llvm_module(llvm::Module* module, const std::string& triple std::string error; auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error); llvm::TargetOptions opt; - opt.AllowFPOpFusion = llvm::FPOpFusion::Fast; - opt.UnsafeFPMath = false; - opt.NoInfsFPMath = false; - opt.NoNaNsFPMath = true; - llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, "", opt, +// opt.AllowFPOpFusion = llvm::FPOpFusion::Fast; +// opt.UnsafeFPMath = false; +// opt.NoInfsFPMath = false; +// opt.NoNaNsFPMath = true; + llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, "code-object-v3", opt, llvm::Reloc::PIC_, llvm::None, llvm::CodeGenOpt::Aggressive); // set data layout @@ -139,11 +141,12 @@ void module::compile_llvm_module(llvm::Module* module, const std::string& triple } // emit machine code + for (llvm::Function &f : module->functions()) + f.addFnAttr(llvm::Attribute::AlwaysInline); llvm::legacy::PassManager pass; llvm::raw_svector_ostream stream(buffer); machine->addPassesToEmitFile(pass, stream, nullptr, llvm::TargetMachine::CGFT_ObjectFile); pass.run(*module); -// std::cout << std::string(buffer.begin(), buffer.end()) << std::endl; } @@ -182,15 +185,42 @@ host_module::host_module(driver::context * context, llvm::Module* src): module(c ocl_module::ocl_module(driver::context * context, llvm::Module* src): module(context, cl_program(), true) { init_llvm(); +// std::vector files = { +// "opencl.amdgcn.bc", +// "ocml.amdgcn.bc", +// "ockl.amdgcn.bc", +// "oclc_correctly_rounded_sqrt_off.amdgcn.bc", +// "oclc_daz_opt_on.amdgcn.bc", +// "oclc_finite_only_off.amdgcn.bc", +// "oclc_isa_version_902.amdgcn.bc", +// "oclc_unsafe_math_off.amdgcn.bc" +// }; +// 
for(auto&x : files) +// x = "/opt/rocm/lib/" + x; + + llvm::LLVMContext ctx; +// llvm::IRBuilder<> builder(ctx); +// auto dummy = new llvm::Module("matmul", ctx); +// llvm::Function *fn = llvm::Function::Create(llvm::FunctionType::get(builder.getVoidTy(), {}, false), llvm::Function::ExternalLinkage, "matmul", dummy); +// llvm::BasicBlock *entry = llvm::BasicBlock::Create(ctx, "entry", fn); +// builder.SetInsertPoint(entry); +// builder.CreateRetVoid(); llvm::SmallVector buffer; - module::compile_llvm_module(src, "amdgcn-amd-amdpal", "gfx902", "", buffer); + llvm::SMDiagnostic error; + auto dummy = llvm::parseIRFile("test.bc", error, ctx); + module::compile_llvm_module(dummy.get(), "amdgcn-amd-amdhsa-amdgizcl", "gfx902", "", buffer); + + +// std::ifstream fin("test.o", std::ios::in | std::ios::binary ); +// std::vector buffer(9296); +// fin.read(buffer.data(), buffer.size()); size_t sizes[] = {buffer.size()}; const unsigned char* data[] = {(unsigned char*)buffer.data()}; cl_int status; cl_int err; *cl_ = dispatch::clCreateProgramWithBinary(*context->cl(), 1, &*context->device()->cl(), sizes, data, &status, &err); - check(err); check(status); + check(err); try{ dispatch::clBuildProgram(*cl_, 1, &*context->device()->cl(), NULL, NULL, NULL); } @@ -198,6 +228,8 @@ ocl_module::ocl_module(driver::context * context, llvm::Module* src): module(con char log[2048]; dispatch::clGetProgramBuildInfo(*cl_, *context->device()->cl(), CL_PROGRAM_BUILD_LOG, 1024, log, NULL); std::cout << log << std::endl; + std::cout << "T_T" << std::endl; + throw; } } diff --git a/lib/jit.cpp b/lib/jit.cpp index 058ca1ea2..9162b73c6 100644 --- a/lib/jit.cpp +++ b/lib/jit.cpp @@ -90,7 +90,7 @@ std::unique_ptr jit::make_triton_module(const std::string &src) { } -jit::jit(driver::context *context): driver_context_(context), target_(new triton::codegen::cpu_target()) { +jit::jit(driver::context *context): driver_context_(context), target_(new triton::codegen::amd_cl_target()) { } From deb7a1cc5cea0f966f90a20bbafbb8b4c0f70e1c Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 23 Mar 2019 18:58:25 -0700 Subject: [PATCH 115/494] Hack to make OpenCL for AMD work --- examples/matrix.cpp | 7 +++-- lib/codegen/selection.cpp | 2 ++ lib/driver/module.cpp | 64 ++++++++++++++------------------------- 3 files changed, 28 insertions(+), 45 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index bbe9e25bf..e16d6b2dd 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -111,7 +111,7 @@ int main() { triton::jit jit(context); // matrix multiplication parameters - size_t M = 512, N = 512, K = 512; + int32_t M = 128, N = 128, K = 128; std::vector hc(M*N); std::vector rc(M*N); std::vector ha(M*K); @@ -163,8 +163,9 @@ int main() { stream->enqueue(kernel, grid, {nthreads, 1, 1}); stream->synchronize(); // benchmark - double ts = bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});}, - [&](){ stream->synchronize(); }); +// double ts = bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});}, +// [&](){ stream->synchronize(); }); + double ts = 1; ts = ts * 1e-9; double tflops = 2*M*N*K / ts * 1e-12; return tflops; diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index d893dbeee..aff4dfbff 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -748,6 +748,8 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & indices_t b_idx = {idx[1], builder.getInt32(K)}; Value *a = TA->get_value(a_idx); Value *b = TB->get_value(b_idx); +// a = 
ConstantFP::get(builder.getFloatTy(), 1); +// b = ConstantFP::get(builder.getFloatTy(), 1); res = builder.CreateCall(f_mul_add, {a, b, res}); } result->set_value(idx, res); diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 1e7407234..4c0018c3d 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -1,22 +1,22 @@ /* Copyright 2015-2017 Philippe Tillet -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files -* (the "Software"), to deal in the Software without restriction, -* including without limitation the rights to use, copy, modify, merge, -* publish, distribute, sublicense, and/or sell copies of the Software, -* and to permit persons to whom the Software is furnished to do so, +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, * subject to the following conditions: -* -* The above copyright notice and this permission notice shall be +* +* The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ @@ -185,37 +185,17 @@ host_module::host_module(driver::context * context, llvm::Module* src): module(c ocl_module::ocl_module(driver::context * context, llvm::Module* src): module(context, cl_program(), true) { init_llvm(); -// std::vector files = { -// "opencl.amdgcn.bc", -// "ocml.amdgcn.bc", -// "ockl.amdgcn.bc", -// "oclc_correctly_rounded_sqrt_off.amdgcn.bc", -// "oclc_daz_opt_on.amdgcn.bc", -// "oclc_finite_only_off.amdgcn.bc", -// "oclc_isa_version_902.amdgcn.bc", -// "oclc_unsafe_math_off.amdgcn.bc" -// }; -// for(auto&x : files) -// x = "/opt/rocm/lib/" + x; - - llvm::LLVMContext ctx; -// llvm::IRBuilder<> builder(ctx); -// auto dummy = new llvm::Module("matmul", ctx); -// llvm::Function *fn = llvm::Function::Create(llvm::FunctionType::get(builder.getVoidTy(), {}, false), llvm::Function::ExternalLinkage, "matmul", dummy); -// llvm::BasicBlock *entry = llvm::BasicBlock::Create(ctx, "entry", fn); -// builder.SetInsertPoint(entry); -// builder.CreateRetVoid(); llvm::SmallVector buffer; - llvm::SMDiagnostic error; - auto dummy = llvm::parseIRFile("test.bc", error, ctx); - module::compile_llvm_module(dummy.get(), "amdgcn-amd-amdhsa-amdgizcl", "gfx902", "", buffer); + module::compile_llvm_module(src, "amdgcn-amd-amdhsa-amdgizcl", "gfx902", "", buffer); + std::ofstream output("tmp.o", std::ios::binary); + std::copy(buffer.begin(), buffer.end(), std::ostreambuf_iterator(output)); + system("ld.lld tmp.o -shared -o test.o"); -// std::ifstream fin("test.o", std::ios::in | std::ios::binary ); -// std::vector buffer(9296); -// fin.read(buffer.data(), buffer.size()); - size_t sizes[] = {buffer.size()}; - const unsigned char* data[] = {(unsigned char*)buffer.data()}; + std::ifstream input("test.o", std::ios::in | std::ios::binary ); + std::vector in_buffer(std::istreambuf_iterator(input), {}); + size_t sizes[] = {in_buffer.size()}; + const unsigned char* data[] = {(unsigned char*)in_buffer.data()}; cl_int status; cl_int err; *cl_ = dispatch::clCreateProgramWithBinary(*context->cl(), 1, &*context->device()->cl(), sizes, data, &status, &err); From 8d35c98920a19dd03ff3897b2fd28adceee3275a Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 25 Mar 2019 14:10:24 -0700 Subject: [PATCH 116/494] [code generation] search space pruning --- examples/matrix.cpp | 59 ++++++++++++-------------------- include/triton/ast/parser.y | 11 ++++-- include/triton/driver/device.h | 11 ++++-- include/triton/driver/dispatch.h | 2 +- include/triton/driver/handle.h | 13 +++---- include/triton/ir/constant.h | 12 +++---- lib/ast/lowering.cpp | 12 ++++--- lib/codegen/tune.cpp | 16 +++++++-- lib/driver/device.cpp | 10 +++++- lib/driver/handle.cpp | 11 +++--- lib/driver/module.cpp | 40 ++++++---------------- lib/driver/stream.cpp | 3 +- lib/ir/constant.cpp | 16 +++++++-- lib/jit.cpp | 33 +++++++++--------- 14 files changed, 131 insertions(+), 118 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index e16d6b2dd..624872f9c 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -6,9 +6,9 @@ const char* src = R"( -const tunable int32 TM; -const tunable int32 TN; -const tunable int32 TK; +const tunable int32 TM = {16, 32, 64}; +const tunable int32 TN = {16, 32, 64}; +const tunable int32 TK = {8, 16}; void matmul(restrict read_only fp32 *a, restrict read_only fp32 *b, fp32 *c, int32 M, int32 N, int32 K, int32 bound){ @@ -26,20 +26,8 @@ void matmul(restrict read_only fp32 *a, restrict read_only fp32 *b, fp32 *c, pa = pa + TK*M; pb = pb + TK*K; k = k - TK; - int1 checka[TM, TK] = k > bound; - int1 
checkb[TN, TK] = k > bound; - @checka a = *pa; - @checkb b = *pb; - if(k > bound) - continue; - int1 checka0[TM] = rxa < M; - int1 checka1[TK] = rka < k; - int1 checkb0[TN] = ryb < N; - int1 checkb1[TK] = rkb < k; - checka = checka0[:, newaxis] && checka1[newaxis, :]; - checkb = checkb0[:, newaxis] && checkb1[newaxis, :]; - a = checka ? *pa : 0; - b = checkb ? *pb : 0; + a = *pa; + b = *pb; } int32 rxc[TM] = get_global_range[TM](0); int32 ryc[TN] = get_global_range[TN](1); @@ -87,22 +75,17 @@ T min(std::vector x) template -double bench(OP const & op, SYNC const & sync) +double bench(OP const & op, SYNC const & sync, unsigned repeat = 20) { timer tmr; - std::vector times; - double total_time = 0; op(); sync(); - while(total_time*1e-9 < 1e-3){ - float norm = 1; - tmr.start(); + tmr.start(); + for(unsigned i = 0; i < repeat; i++) op(); - sync(); - times.push_back(norm*tmr.get().count()); - total_time+=times.back(); - } - return min(times); + sync(); + double time = tmr.get().count(); + return time / repeat; } int main() { @@ -111,16 +94,16 @@ int main() { triton::jit jit(context); // matrix multiplication parameters - int32_t M = 128, N = 128, K = 128; + int32_t M = 512, N = 512, K = 512; std::vector hc(M*N); std::vector rc(M*N); std::vector ha(M*K); std::vector hb(K*N); srand(0); for(size_t i = 0; i < ha.size(); i++) - ha[i] = 1; + ha[i] = (float)rand()/RAND_MAX; for(size_t i = 0; i < hb.size(); i++) - hb[i] = 1; + hb[i] = (float)rand()/RAND_MAX; for(size_t i = 0; i < hc.size(); i++) hc[i] = 0; triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*4); @@ -163,11 +146,10 @@ int main() { stream->enqueue(kernel, grid, {nthreads, 1, 1}); stream->synchronize(); // benchmark -// double ts = bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});}, -// [&](){ stream->synchronize(); }); - double ts = 1; + double ts = bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});}, + [&](){ stream->synchronize(); }); ts = ts * 1e-9; - double tflops = 2*M*N*K / ts * 1e-12; + double tflops = 2.*M*N*K / ts * 1e-12; return tflops; }; @@ -177,11 +159,12 @@ int main() { 16, 2, 64, 32, 2, 64, 16, 8, 2, 2, - 8, 1, 8, - 4, 1 + 8, 8, + 4, }; +// params = {8, 2, 64, 16, 2, 64, 4, 16, 2, 2, 8, 8, 4}; -// jit.autotune(src, benchmark); + jit.autotune(src, benchmark); jit.add_module(src, params); triton::driver::kernel* kernel = jit.get_function("matmul"); triton::jit::launch_information info = jit.get_launch_info("matmul"); diff --git a/include/triton/ast/parser.y b/include/triton/ast/parser.y index 8ce55f372..ae4b7d4e3 100644 --- a/include/triton/ast/parser.y +++ b/include/triton/ast/parser.y @@ -94,10 +94,15 @@ abstract_declarator direct_abstract_declarator : '[' primary_expression_list ']' { $$ = new tile(nullptr, $1); } -constant : +constant: CONSTANT { $$ = new constant(atoi(yytext)); } ; - + +constant_list: + constant { $$ = new list((constant*)$1); } + | constant_list ',' constant { $$ = append_ptr_list($1, $3); } + ; + type_name : declaration_specifiers { $$ = new type_name($1, nullptr); } | declaration_specifiers abstract_declarator { $$ = new type_name($1, $2); } @@ -259,7 +264,7 @@ expression /* Initialization */ initialization_expression : assignment_expression { $$ = $1; } - | '{' constant '}' { $$ = $2; } + | '{' constant_list '}' { $$ = $2; } ; diff --git a/include/triton/driver/device.h b/include/triton/driver/device.h index d99e47fe2..a08bd3cc8 100755 --- a/include/triton/driver/device.h +++ b/include/triton/driver/device.h @@ -38,18 +38,24 @@ class context; class 
device: public polymorphic_resource{ public: using polymorphic_resource::polymorphic_resource; + virtual size_t max_threads_per_block() const = 0; + virtual size_t max_shared_memory() const = 0; }; // Host device class host_device: public device { public: host_device(): device(host_device_t(), true){ } + size_t max_threads_per_block() const { return 1; } + size_t max_shared_memory() const { return 0; } }; // OpenCL device class ocl_device: public device { public: ocl_device(cl_device_id cl, bool take_ownership = true): device(cl, take_ownership) { } + size_t max_threads_per_block() const; + size_t max_shared_memory() const; }; // CUDA device @@ -87,8 +93,6 @@ public: std::string infos() const; size_t address_bits() const; std::vector max_block_dim() const; - size_t max_threads_per_block() const; - size_t max_shared_memory() const; size_t warp_size() const; //Compute Capability void interpret_as(std::pair cc); @@ -99,7 +103,8 @@ public: //Clocks size_t current_sm_clock() const; size_t current_mem_clock() const; - + size_t max_threads_per_block() const; + size_t max_shared_memory() const; size_t max_sm_clock() const; size_t max_mem_clock() const; diff --git a/include/triton/driver/dispatch.h b/include/triton/driver/dispatch.h index 1e0459931..2d06bb397 100755 --- a/include/triton/driver/dispatch.h +++ b/include/triton/driver/dispatch.h @@ -87,7 +87,7 @@ public: static bool cudnninit(); static void release(); - //OpenCL + // OpenCL static cl_int clBuildProgram(cl_program, cl_uint, const cl_device_id *, const char *, void (*)(cl_program, void *), void *); static cl_int clEnqueueNDRangeKernel(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *); static cl_int clSetKernelArg(cl_kernel, cl_uint, size_t, const void *); diff --git a/include/triton/driver/handle.h b/include/triton/driver/handle.h index f87a8ffa6..d3b6f151c 100755 --- a/include/triton/driver/handle.h +++ b/include/triton/driver/handle.h @@ -105,20 +105,21 @@ public: bool operator<(handle_interface const & y) { return (CUType)(*this) < (CUType)(y); } }; -template +template class handle{ public: template friend class handle_interface; public: //Constructors - handle(CUType cu = CUType(), bool take_ownership = true); + handle(T h, bool take_ownership = true); + handle(); ~handle(); - CUType& operator*() { return *h_; } - CUType const & operator*() const { return *h_; } - CUType* operator->() const { return h_.get(); } + T& operator*() { return *h_; } + T const & operator*() const { return *h_; } + T* operator->() const { return h_.get(); } protected: - std::shared_ptr h_; + std::shared_ptr h_; bool has_ownership_; }; diff --git a/include/triton/ir/constant.h b/include/triton/ir/constant.h index 317cba2ff..0c18787ea 100644 --- a/include/triton/ir/constant.h +++ b/include/triton/ir/constant.h @@ -44,19 +44,19 @@ protected: }; /* Metaparameter int */ -class metaparameter: public constant_int{ - metaparameter(type *ty, unsigned lo, unsigned hi); +class metaparameter: public constant_int { +private: + metaparameter(type *ty, const std::vector& space); public: static metaparameter *create(context &ctx, type *ty, unsigned lo, unsigned hi); + static metaparameter *create(context &ctx, type *ty, const std::vector& space); void set_value(uint64_t value) { has_value_ = true; value_ = value; } bool has_value() { return has_value_; } - unsigned get_lo() { return lo_; } - unsigned get_hi() { return hi_; } + const std::vector& get_space() { return space_; } private: - unsigned 
lo_; - unsigned hi_; + std::vector space_; bool has_value_; }; diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 04d03aa99..77ba26464 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -410,12 +410,16 @@ ir::value* initializer::codegen(ir::module * mod) const{ std::string name = decl_->id()->name(); ir::value *value = ir::undef_value::get(ty); if(std::find(storage.begin(), storage.end(), TUNABLE_T) != storage.end()){ - assert(expr_ == nullptr); - //TODO: implement ranges - value = ir::metaparameter::create(mod->get_context(), ty, 8, (name=="TK")?8:64); + auto csts = dynamic_cast*>((node*)expr_); + if(csts == nullptr) + throw std::runtime_error("must specify constant list for metaparameters"); + std::vector values; + for(constant* cst: csts->values()) + values.push_back(cst->value()); + value = ir::metaparameter::create(mod->get_context(), ty, values); mod->register_global(name, value); } - if(expr_){ + else if(expr_){ value = expr_->codegen(mod); value = explicit_cast(mod->get_builder(), value, ty); implicit_broadcast(mod, value, ty); diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index f3a9cedfb..8a9a35aa0 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -144,11 +144,23 @@ void tune::run(ir::module &mod) { // Layout parameters while(!nodes_.empty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 1, 2); + ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 2, 4); ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 4, 32); connected_components(*nodes_.begin(), {nts, mts}, nodes_, dependencies_); } } + + // Simplify metaparameters + std::set fixed_io_nts; + for(ir::function *fn: mod.get_function_list()) + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *i : block->get_inst_list()) + if(dynamic_cast(i) || dynamic_cast(i)) + if(i->get_type()->is_tile_ty()) + for(unsigned d = 1; d < i->get_type()->get_tile_shapes().size(); d++) + fixed_io_nts.insert(params_.at(i).at("nts.d" + std::to_string(d))); + for(ir::metaparameter* mp: fixed_io_nts) + mp->set_value(1); } void tune::init(ir::module &mod) { @@ -234,7 +246,7 @@ bool tune::check_constraints(std::map> &er int num_threads = 1; for(size_t k = 0; k < shapes.size(); k++) num_threads *= params_[i]["mts.d" + to_string(k)]->get_value(); - if(num_threads % 32 != 0) + if(num_threads % 64 != 0) errors[i].push_back("number of threads per block (" + to_string(num_threads) + ") must be multiple of 32"); if(num_threads != num_threads_) errors[i].push_back("Number of threads must be the same for all tiles (" + to_string(num_threads_) + ")"); diff --git a/lib/driver/device.cpp b/lib/driver/device.cpp index 0fe875075..62c41fb98 100755 --- a/lib/driver/device.cpp +++ b/lib/driver/device.cpp @@ -25,7 +25,7 @@ #include #include #include - +#include "triton/driver/helpers/CL/infos.hpp" #include "triton/driver/device.h" #include "triton/driver/context.h" @@ -40,6 +40,14 @@ namespace driver // OpenCL // /* ------------------------ */ +// maximum amount of shared memory per block +size_t ocl_device::max_shared_memory() const { + return ocl::info(*cl_); +} + +size_t ocl_device::max_threads_per_block() const { + return ocl::info(*cl_).at(0); +} /* ------------------------ */ // CUDA // diff --git a/lib/driver/handle.cpp b/lib/driver/handle.cpp index 603cf2b0d..c698cb8b5 100755 --- a/lib/driver/handle.cpp +++ b/lib/driver/handle.cpp @@ -60,13 +60,16 @@ inline void _delete(cu_event_t x) { _delete(x.first); _delete(x.second); 
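The handle refactor above generalizes the CUDA-specific wrapper into a plain template: any backend object can be shared by copy, with the release call picked statically from an overload set and issued only by the last owning copy. A self-contained sketch of the same pattern, assuming a trivial int "handle" in place of a real CUcontext or cl_mem (the _delete overload here is a stand-in):

    #include <cstdio>
    #include <memory>

    // One overload per backend object, picked at compile time like the
    // _delete() set above; printf stands in for cuCtxDestroy & co.
    inline void _delete(int x) { std::printf("releasing handle %d\n", x); }

    template<class T>
    class handle {
    public:
      handle(T h, bool take_ownership = true): h_(new T(h)), has_ownership_(take_ownership) {}
      handle(): has_ownership_(false) {}     // empty handle: nothing to release
      ~handle() {
        if (has_ownership_ && h_ && h_.use_count() == 1)
          _delete(*h_);                      // last owning copy frees the object
      }
      T& operator*() { return *h_; }
    protected:
      std::shared_ptr<T> h_;                 // copies of the wrapper share one slot
      bool has_ownership_;
    };

    int main() {
      handle<int> a(42);
      handle<int> b = a;                     // shared: released once, by the survivor
    }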
} inline void _delete(CUPlatform){} //Constructor -template -handle::handle(CUType cu, bool take_ownership): h_(new CUType(cu)), has_ownership_(take_ownership) +template +handle::handle(T cu, bool take_ownership): h_(new T(cu)), has_ownership_(take_ownership) { } +template +handle::handle(): has_ownership_(false){ } -template -handle::~handle(){ + +template +handle::~handle(){ if(has_ownership_ && h_ && h_.unique()) _delete(*h_); } diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 4c0018c3d..6e3533983 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -53,6 +53,10 @@ #include "llvm/ExecutionEngine/OrcMCJITReplacement.h" #include #include "llvm/Transforms/Utils/Cloning.h" +#include "lld/Common/Driver.h" +#include "lld/Common/Args.h" +#include "lld/Common/ErrorHandler.h" +#include "lld/Common/LLVM.h" namespace triton { @@ -110,36 +114,17 @@ void module::compile_llvm_module(llvm::Module* module, const std::string& triple std::string error; auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error); llvm::TargetOptions opt; -// opt.AllowFPOpFusion = llvm::FPOpFusion::Fast; -// opt.UnsafeFPMath = false; -// opt.NoInfsFPMath = false; -// opt.NoNaNsFPMath = true; + opt.AllowFPOpFusion = llvm::FPOpFusion::Fast; + opt.UnsafeFPMath = false; + opt.NoInfsFPMath = false; + opt.NoNaNsFPMath = true; llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, "code-object-v3", opt, llvm::Reloc::PIC_, llvm::None, llvm::CodeGenOpt::Aggressive); - // set data layout if(layout.empty()) module->setDataLayout(machine->createDataLayout()); else module->setDataLayout(layout); - - // link - for (std::string& path: paths) { - llvm::SMDiagnostic err; - std::unique_ptr mlib = llvm::parseIRFile(path, err, module->getContext()); - if (mlib.get() == nullptr) { - std::string msg = err.getMessage(); - std::cerr << "Fail to load bitcode file " << path << "\n" - << "line " << err.getLineNo() << ":" << msg; - } - mlib->setTargetTriple(module->getTargetTriple()); - mlib->setDataLayout(module->getDataLayout()); - for (llvm::Function &f : mlib->functions()) { - f.addFnAttr(llvm::Attribute::AlwaysInline); - } - llvm::Linker::linkModules(*module, std::move(mlib)); - } - // emit machine code for (llvm::Function &f : module->functions()) f.addFnAttr(llvm::Attribute::AlwaysInline); @@ -187,12 +172,10 @@ ocl_module::ocl_module(driver::context * context, llvm::Module* src): module(con init_llvm(); llvm::SmallVector buffer; module::compile_llvm_module(src, "amdgcn-amd-amdhsa-amdgizcl", "gfx902", "", buffer); - - std::ofstream output("tmp.o", std::ios::binary); + std::ofstream output("/tmp/tmp.o", std::ios::binary); std::copy(buffer.begin(), buffer.end(), std::ostreambuf_iterator(output)); - system("ld.lld tmp.o -shared -o test.o"); - - std::ifstream input("test.o", std::ios::in | std::ios::binary ); + system("ld.lld-8 /tmp/tmp.o -shared -o /tmp/tmp.o"); + std::ifstream input("/tmp/tmp.o", std::ios::in | std::ios::binary ); std::vector in_buffer(std::istreambuf_iterator(input), {}); size_t sizes[] = {in_buffer.size()}; const unsigned char* data[] = {(unsigned char*)in_buffer.data()}; @@ -208,7 +191,6 @@ ocl_module::ocl_module(driver::context * context, llvm::Module* src): module(con char log[2048]; dispatch::clGetProgramBuildInfo(*cl_, *context->device()->cl(), CL_PROGRAM_BUILD_LOG, 1024, log, NULL); std::cout << log << std::endl; - std::cout << "T_T" << std::endl; throw; } } diff --git a/lib/driver/stream.cpp b/lib/driver/stream.cpp index 
e9818d7bd..937750c23 100755 --- a/lib/driver/stream.cpp +++ b/lib/driver/stream.cpp @@ -111,7 +111,8 @@ void cl_stream::synchronize() { } void cl_stream::enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const *, event* event) { - check(dispatch::clEnqueueNDRangeKernel(*cl_, *kernel->cl(), grid.size(), NULL, (const size_t*)grid.data(), (const size_t*)block.data(), 0, NULL, NULL)); + std::array global = {grid[0]*block[0], grid[1]*block[1], grid[2]*block[2]}; + check(dispatch::clEnqueueNDRangeKernel(*cl_, *kernel->cl(), grid.size(), NULL, (const size_t*)global.data(), (const size_t*)block.data(), 0, NULL, NULL)); } void cl_stream::write(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr) { diff --git a/lib/ir/constant.cpp b/lib/ir/constant.cpp index bfb6fdb9b..5df644842 100644 --- a/lib/ir/constant.cpp +++ b/lib/ir/constant.cpp @@ -98,12 +98,22 @@ constant *constant_fp::get(context &ctx, double v){ } // metaparameter -metaparameter::metaparameter(type *ty, unsigned lo, unsigned hi) - : constant_int(ty, 0), lo_(lo), hi_(hi), has_value_(false){ } +metaparameter::metaparameter(type *ty, const std::vector &space) + : constant_int(ty, 0), space_(space), has_value_(false){ } metaparameter* metaparameter::create(context &ctx, type *ty, unsigned lo, unsigned hi) { context_impl *impl = ctx.p_impl.get(); - metaparameter *result = new metaparameter(ty, lo, hi); + std::vector space; + for(unsigned i = lo; i <= hi; i *= 2) + space.push_back(i); + metaparameter *result = new metaparameter(ty, space); + impl->mp_constants_.push_back(result); + return result; +} + +metaparameter* metaparameter::create(context &ctx, type *ty, const std::vector &space) { + context_impl *impl = ctx.p_impl.get(); + metaparameter *result = new metaparameter(ty, space); impl->mp_constants_.push_back(result); return result; } diff --git a/lib/jit.cpp b/lib/jit.cpp index 9162b73c6..f76870a86 100644 --- a/lib/jit.cpp +++ b/lib/jit.cpp @@ -5,6 +5,7 @@ #include "triton/ir/context.h" #include "triton/ir/context_impl.h" #include "triton/driver/device.h" +#include "triton/driver/error.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/Module.h" #include "llvm/IR/LLVMContext.h" @@ -71,6 +72,7 @@ std::unique_ptr jit::make_llvm_module(ir::module &module, passes_w passes.selection.run(module, *result); // launch information auto &launch_info_map = launch_info_map_[result->getName()]; + launch_info_map.global_range_size.clear(); for(unsigned i = 0; i < passes.tune.get_num_global_range(); i++) launch_info_map.global_range_size.push_back(passes.tune.get_global_range_size(i)); launch_info_map.num_threads = passes.tune.get_num_threads(); @@ -104,12 +106,8 @@ void jit::autotune(const std::string &src, benchmark_t benchmark) { auto mps = passes.tune.get_params(tt_module); // create parameter ranges std::vector> ranges; - for(ir::metaparameter *mp: mps){ - std::vector current; - for(unsigned x = mp->get_lo(); x <= mp->get_hi(); x*=2) - current.push_back(x); - ranges.push_back(current); - } + for(ir::metaparameter *mp: mps) + ranges.push_back(mp->get_space()); // iterate over parameters unsigned i; double best = 0; @@ -132,22 +130,23 @@ void jit::autotune(const std::string &src, benchmark_t benchmark) { } passes.tune.init(tt_module); passes.init(tt_module); -// driver::device* device = driver_context_->device(); -// if(passes.allocation.get_allocated_size() > device->max_shared_memory()) -// return; -// if(passes.tune.get_num_threads() > device->max_threads_per_block()) 
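Re-enabled here, the device guards let the autotuner reject a candidate configuration before it is ever compiled: if the analysis passes report more shared memory or more threads than the device exposes, the config is skipped outright. A sketch of that pruning step, with hypothetical config, device_caps, and prune names standing in for the jit internals:

    #include <cstddef>
    #include <vector>

    struct device_caps { size_t max_shared_memory; size_t max_threads_per_block; };
    struct config      { size_t shared_bytes;      size_t num_threads; };

    // Keep only the candidates worth benchmarking: the resource check is cheap,
    // JIT-compiling and timing an over-subscribed kernel is not (it cannot launch).
    std::vector<config> prune(const std::vector<config>& space, const device_caps& dev) {
      std::vector<config> kept;
      for (const config& c : space) {
        if (c.shared_bytes > dev.max_shared_memory) continue;
        if (c.num_threads  > dev.max_threads_per_block) continue;
        kept.push_back(c);
      }
      return kept;
    }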
-// return; + driver::device* device = driver_context_->device(); + if(passes.allocation.get_allocated_size() > device->max_shared_memory()) + return; + if(passes.tune.get_num_threads() > device->max_threads_per_block()) + return; // Compile auto ll_module = make_llvm_module(tt_module, passes); - driver::module* module = driver::module::create(driver_context_, &*ll_module); - driver::kernel* kernel = driver::kernel::create(module, "matmul"); + std::unique_ptr module(driver::module::create(driver_context_, &*ll_module)); + std::unique_ptr kernel(driver::kernel::create(module.get(), "matmul")); launch_information info = launch_info_map_.at("matmul"); for(unsigned p: params) std::cout << p << " " << std::flush; // add globals for(auto x: tt_module.globals()) global_ints_[x.first] = ((ir::metaparameter*)x.second)->get_value(); - double perf = benchmark(kernel, info); + double perf; + perf = benchmark(kernel.get(), info); best = std::max(perf, best); std::cout << perf << " [ " << best << " ] " << std::endl; }); @@ -167,9 +166,9 @@ void jit::add_module(ir::module &tt_module, const std::vector ¶ms) passes.tune.check_constraints(errors); if(errors.size()) throw std::runtime_error("invalid parameters"); -// driver::device* device = driver_context_->device(); -// if(passes.allocation.get_allocated_size() > device->max_shared_memory()) -// throw std::runtime_error("invalid parameters"); + driver::device* device = driver_context_->device(); + if(passes.allocation.get_allocated_size() > device->max_shared_memory()) + throw std::runtime_error("invalid parameters"); // triton module -> llvm module auto ll_module = make_llvm_module(tt_module, passes); // llvm module -> machine code From 9d6fc1c051db42b67d869c70063024b0c2e8c50c Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 26 Mar 2019 15:55:48 -0700 Subject: [PATCH 117/494] [code generation] bugfix in single buffering --- examples/matrix.cpp | 5 +- include/triton/codegen/barriers.h | 3 +- include/triton/codegen/buffer_info.h | 2 + include/triton/ir/basic_block.h | 2 + include/triton/jit.h | 3 ++ lib/codegen/allocation.cpp | 5 +- lib/codegen/barriers.cpp | 68 ++++++++++++++++++++++++---- lib/codegen/buffer_info.cpp | 10 ++++ lib/codegen/liveness.cpp | 57 ++++++++++++++--------- lib/codegen/selection.cpp | 9 ++-- lib/codegen/shared_copy.cpp | 6 +++ lib/driver/module.cpp | 4 ++ lib/ir/basic_block.cpp | 2 + 13 files changed, 135 insertions(+), 41 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 624872f9c..16f5c4434 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -21,11 +21,10 @@ void matmul(restrict read_only fp32 *a, restrict read_only fp32 *b, fp32 *c, fp32* pb[TN, TK] = b + rkb[newaxis, :]*K + ryb[:, newaxis]; fp32 a[TM, TK] = *pa; fp32 b[TN, TK] = *pb; - for(int32 k = K; k > 0;){ + for(int32 k = K; k > 0; k = k - TK){ C = dot(a, b, C); pa = pa + TK*M; pb = pb + TK*K; - k = k - TK; a = *pa; b = *pb; } @@ -164,7 +163,7 @@ int main() { }; // params = {8, 2, 64, 16, 2, 64, 4, 16, 2, 2, 8, 8, 4}; - jit.autotune(src, benchmark); +// jit.autotune(src, benchmark); jit.add_module(src, params); triton::driver::kernel* kernel = jit.get_function("matmul"); triton::jit::launch_information info = jit.get_launch_info("matmul"); diff --git a/include/triton/codegen/barriers.h b/include/triton/codegen/barriers.h index 546b36893..336ec255a 100644 --- a/include/triton/codegen/barriers.h +++ b/include/triton/codegen/barriers.h @@ -26,13 +26,14 @@ private: typedef std::vector interval_vec_t; private: + interval_vec_t join(const 
std::vector& intervals); void insert_barrier(ir::instruction *instr, ir::builder &builder); bool intersect(const interval_vec_t &X, interval_t x); bool intersect(const interval_vec_t &X, const interval_vec_t &Y); void add_reference(ir::value *v, interval_vec_t &res); void get_read_intervals(ir::instruction *i, interval_vec_t &res); void get_written_intervals(ir::instruction *i, interval_vec_t &res); - void add(ir::basic_block *block, interval_vec_t ¬_synced, ir::builder &builder); + std::pair transfer(ir::basic_block *block, const interval_vec_t &written_to, const interval_vec_t &read_from, std::set &insert_loc); public: barriers(allocation *alloc, buffer_info_pass *buffer_info): alloc_(alloc), buffer_info_(buffer_info) {} diff --git a/include/triton/codegen/buffer_info.h b/include/triton/codegen/buffer_info.h index c9b954a58..58f140d61 100644 --- a/include/triton/codegen/buffer_info.h +++ b/include/triton/codegen/buffer_info.h @@ -19,9 +19,11 @@ public: void run(ir::module &mod); // queries bool is_double(ir::value *x); + void add_shared(ir::value *v); bool is_shared(ir::value *x); bool is_loop_latch(ir::phi_node *phi, ir::value *terminator); ir::value *get_reference(ir::value *x); + void replace(ir::value* before, ir::value *after); private: diff --git a/include/triton/ir/basic_block.h b/include/triton/ir/basic_block.h index 63de2a18b..09eb3ad64 100644 --- a/include/triton/ir/basic_block.h +++ b/include/triton/ir/basic_block.h @@ -58,6 +58,7 @@ public: // predecessors const std::vector& get_predecessors() const { return preds_; } + const std::vector& get_successors() const { return succs_; } void add_predecessor(basic_block* pred); // factory functions @@ -68,6 +69,7 @@ private: std::string name_; function *parent_; std::vector preds_; + std::vector succs_; inst_list_t inst_list_; }; diff --git a/include/triton/jit.h b/include/triton/jit.h index 93f08f280..b53884e36 100644 --- a/include/triton/jit.h +++ b/include/triton/jit.h @@ -5,6 +5,7 @@ #include #include "llvm/IR/LLVMContext.h" #include "triton/ir/context.h" +#include "triton/ir/print.h" #include "triton/driver/module.h" #include "triton/driver/kernel.h" #include "triton/codegen/selection.h" @@ -54,10 +55,12 @@ public: // generate ptx buffer_info.run(module); shared.run(module); + triton::ir::print(module, std::cout); liveness.run(module); allocation.run(); barriers.run(module); vectorize.run(module); + triton::ir::print(module, std::cout); } codegen::tune tune; diff --git a/lib/codegen/allocation.cpp b/lib/codegen/allocation.cpp index 9a3d5e39d..c8ce9f60c 100644 --- a/lib/codegen/allocation.cpp +++ b/lib/codegen/allocation.cpp @@ -29,7 +29,7 @@ void allocation::run(){ std::vector J = I; triples_map_type H; - H.insert({0, segment{0, 100}}); + H.insert({0, segment{0, 1024}}); std::vector V; std::map starts; @@ -116,6 +116,9 @@ void allocation::run(){ for(auto &x: offsets_){ allocated_size_ = std::max(allocated_size_, x.second + get_num_bytes(x.first)); } + std::cout << "Allocated: " << allocated_size_ << std::endl; + for(auto &x: offsets_) + std::cout << x.first->get_name() << " " << x.second << std::endl; } } diff --git a/lib/codegen/barriers.cpp b/lib/codegen/barriers.cpp index b84a945d8..d7b126ee0 100644 --- a/lib/codegen/barriers.cpp +++ b/lib/codegen/barriers.cpp @@ -6,6 +6,7 @@ #include "triton/ir/function.h" #include "triton/ir/basic_block.h" #include "triton/ir/instructions.h" +#include "triton/ir/cfg.h" namespace triton { @@ -62,27 +63,76 @@ void barriers::insert_barrier(ir::instruction *instr, ir::builder &builder) { } } 
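The rewrite that follows turns barrier placement into a forward dataflow problem: each block's transfer function tracks which shared-memory intervals have been written and read since the last barrier, predecessor states are joined at block entry, and the analysis sweeps the CFG in reverse post-order until the set of insertion points stops growing. A compact sketch of that fixed-point loop, with intervals reduced to integer ids and blocks to indices (a deliberate simplification of the pass; the real code records instruction pointers and interval ranges):

    #include <cstddef>
    #include <set>
    #include <vector>

    struct state { std::set<int> written, read; };
    struct block { std::vector<int> preds; std::vector<int> writes, reads; };

    static state join(const std::vector<block>& cfg, const std::vector<state>& out, size_t b) {
      state s;                                         // union of predecessor exit states
      for (int p : cfg[b].preds) {
        s.written.insert(out[p].written.begin(), out[p].written.end());
        s.read.insert(out[p].read.begin(), out[p].read.end());
      }
      return s;
    }

    // Hazard = read-after-write or write-after-read on a tracked interval.
    // On a hazard a barrier goes in and both sets reset, as in transfer() above.
    // (writes[i]/reads[i] are paired per "instruction" purely for brevity.)
    static state transfer(const block& b, state s, std::set<int>& barriers, int id) {
      for (size_t i = 0; i < b.writes.size(); ++i) {
        bool hazard = s.written.count(b.reads[i]) || s.read.count(b.writes[i]);
        if (hazard) { barriers.insert(id); s = state{}; }
        s.written.insert(b.writes[i]);
        s.read.insert(b.reads[i]);
      }
      return s;
    }

    void place_barriers(const std::vector<block>& rpo, std::set<int>& barriers) {
      std::vector<state> out(rpo.size());
      size_t before;
      do {                                             // iterate to a fixed point:
        before = barriers.size();                      // the barrier set only grows,
        for (size_t b = 0; b < rpo.size(); ++b)        // so one quiet sweep ends it
          out[b] = transfer(rpo[b], join(rpo, out, b), barriers, (int)b);
      } while (barriers.size() != before);
    }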
-void barriers::add(ir::basic_block *block, interval_vec_t ¬_synced, ir::builder &builder) { +barriers::interval_vec_t barriers::join(const std::vector& intervals) { + barriers::interval_vec_t result; + for(auto x: intervals) + for(interval_t i: x) + result.push_back(i); + return result; +} + +std::pair barriers::transfer(ir::basic_block *block, + const interval_vec_t &written_to, + const interval_vec_t &read_from, + std::set& insert_loc) { ir::basic_block::inst_list_t instructions = block->get_inst_list(); + interval_vec_t new_written_to = written_to; + interval_vec_t new_read_from = read_from; for(ir::instruction *i: instructions){ interval_vec_t read, written; get_read_intervals(i, read); get_written_intervals(i, written); - if(intersect(not_synced, read)) { - not_synced.clear(); - insert_barrier(i, builder); + bool read_while_written = intersect(new_written_to, read); + bool written_while_read = intersect(new_read_from, written); + // double buffering: write and phi-node read won't intersect + if(dynamic_cast(i) && + buffer_info_->is_double(buffer_info_->get_reference(i))) + written_while_read = false; + if(read_while_written || written_while_read) { + insert_loc.insert(i); + new_written_to.clear(); + new_read_from.clear(); } - std::copy(written.begin(), written.end(), std::back_inserter(not_synced)); + std::copy(written.begin(), written.end(), std::back_inserter(new_written_to)); + std::copy(read.begin(), read.end(), std::back_inserter(new_read_from)); } + return std::make_pair(new_written_to, new_read_from); } void barriers::run(ir::module &mod) { ir::builder &builder = mod.get_builder(); for(ir::function *fn: mod.get_function_list()){ - // find barrier location - interval_vec_t not_synced; - for(ir::basic_block *block: fn->blocks()) - add(block, not_synced, builder); + std::vector rpo = ir::cfg::reverse_post_order(fn); + std::map written_to; + std::map read_from; + std::set insert_locs; + size_t n_inserted_im1 = 0; + bool done = false; + do{ + // find barrier location + for(ir::basic_block *block: rpo){ + // written to + std::vector pred_written_to; + for(ir::basic_block* pred: block->get_predecessors()) + pred_written_to.push_back(written_to[pred]); + // read from + std::vector pred_read_from; + for(ir::basic_block* pred: block->get_predecessors()) + pred_read_from.push_back(read_from[pred]); + // apply transfer function + auto result = transfer(block, join(pred_written_to), join(pred_read_from), insert_locs); + written_to[block] = result.first; + read_from[block] = result.second; + } + size_t n_inserted_i = insert_locs.size(); + done = (n_inserted_im1 == n_inserted_i); + n_inserted_im1 = n_inserted_i; + }while(!done); + for(ir::instruction* i: insert_locs){ + std::cout << i->get_name() << std::endl; + insert_barrier(i, builder); + } } } diff --git a/lib/codegen/buffer_info.cpp b/lib/codegen/buffer_info.cpp index 4d2a3c676..dff371a64 100644 --- a/lib/codegen/buffer_info.cpp +++ b/lib/codegen/buffer_info.cpp @@ -21,6 +21,16 @@ bool buffer_info_pass::is_loop_latch(ir::phi_node *phi, ir::value *terminator){ throw std::runtime_error("unreachable"); } +void buffer_info_pass::replace(ir::value* before, ir::value *after) { + shared_.erase(before); + shared_.insert(after); + if(refs_.find(before) != refs_.end()){ + ir::value* v = refs_.at(before); + refs_.erase(before); + refs_.insert({after, v}); + } +} + void buffer_info_pass::run(ir::module &mod) { // Find which buffers are shared for(ir::function *fn: mod.get_function_list()) diff --git a/lib/codegen/liveness.cpp 
b/lib/codegen/liveness.cpp index 5e1987b9e..c7c067052 100644 --- a/lib/codegen/liveness.cpp +++ b/lib/codegen/liveness.cpp @@ -11,30 +11,43 @@ namespace codegen{ // Entry point -void liveness::run(ir::module &mod) { -for(ir::function *fn: mod.get_function_list()){ - // Assigns index to each instruction - slot_index index = 0; - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *instr: block->get_inst_list()){ - index += 1; - indices_.insert({instr, index}); - } - // Liveness analysis - // Creates live intervals - for(auto i: indices_){ - ir::value *v = i.first; - if(!info_->is_shared(v) || info_->get_reference(v)) - continue; - unsigned start = i.second; - unsigned end = start; - for(ir::value *u: v->get_users()){ - start = std::min(start, indices_.at(u)); - end = std::max(end, indices_.at(u)); - } - intervals_[v] = segment{start, end}; +inline bool is_shared(ir::value* v) { + if(auto x = dynamic_cast(v)) + return true; + if(auto x = dynamic_cast(v)){ + bool res = true; + for(unsigned inc = 0; inc < x->get_num_incoming(); inc++) + res = res && is_shared(x->get_incoming_value(inc)); + return res; } + return false; } + +void liveness::run(ir::module &mod) { + for(ir::function *fn: mod.get_function_list()){ + // Assigns index to each instruction + slot_index index = 0; + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *instr: block->get_inst_list()){ + index += 1; + indices_.insert({instr, index}); + } + // Liveness analysis + // Creates live intervals + for(auto i: indices_){ + ir::value *v = i.first; + if(!info_->is_shared(v) || info_->get_reference(v)) + continue; + unsigned start = i.second; + unsigned end = start; + for(ir::value *u: v->get_users()){ + start = std::min(start, indices_.at(u)); + end = std::max(end, indices_.at(u)); + } + intervals_[v] = segment{start, end}; + } + std::cout << "Number of intervals: " << intervals_.size() << std::endl; + } } } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index aff4dfbff..7810e6540 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -748,8 +748,6 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & indices_t b_idx = {idx[1], builder.getInt32(K)}; Value *a = TA->get_value(a_idx); Value *b = TB->get_value(b_idx); -// a = ConstantFP::get(builder.getFloatTy(), 1); -// b = ConstantFP::get(builder.getFloatTy(), 1); res = builder.CreateCall(f_mul_add, {a, b, res}); } result->set_value(idx, res); @@ -846,6 +844,7 @@ void selection::run(ir::module &src, Module &dst) { // create grids init_grids(fn, dst_builder, sh_mem_ptr); + // iterate through block std::map last_block; for(ir::basic_block *block: fn->blocks()) { @@ -854,10 +853,10 @@ void selection::run(ir::module &src, Module &dst) { for(ir::instruction *i: block->get_inst_list()){ BasicBlock *current = dst_builder.GetInsertBlock(); bool phi_inserted = (dynamic_cast(i) || dynamic_cast(i)) && !current->empty(); - if(phi_inserted) - dst_builder.SetInsertPoint(&*current->getFirstInsertionPt()); + if(phi_inserted && current->getFirstNonPHI()) + dst_builder.SetInsertPoint(&*current->getFirstNonPHI()); lower_instruction(i, dst_builder); - if(phi_inserted) + if(phi_inserted && current->getFirstNonPHI()) dst_builder.SetInsertPoint(current); last_block[block] = dst_builder.GetInsertBlock(); } diff --git a/lib/codegen/shared_copy.cpp b/lib/codegen/shared_copy.cpp index ce6f53fbe..6c05b7807 100644 --- a/lib/codegen/shared_copy.cpp +++ b/lib/codegen/shared_copy.cpp @@ -28,6 +28,12 @@ void 
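The liveness pass above numbers every instruction in program order and widens each shared value's interval to cover all of its uses; allocation then packs values whose intervals do not overlap into the same shared-memory offsets. A distilled version of the interval construction, with slot indices and a use list standing in for the IR:

    #include <algorithm>
    #include <vector>

    struct segment { unsigned start, end; };

    // def_slot: program-order index where the value is created;
    // use_slots: indices of every user. The live range must span them all.
    segment live_interval(unsigned def_slot, const std::vector<unsigned>& use_slots) {
      unsigned start = def_slot, end = def_slot;
      for (unsigned u : use_slots) {
        start = std::min(start, u);   // a phi use can precede the def slot
        end   = std::max(end, u);
      }
      return {start, end};
    }
    // Two buffers may share storage iff their segments are disjoint:
    // a.end < b.start || b.end < a.start.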
place_shared_copy::run(ir::module &mod) { for(ir::instruction *i: block->get_inst_list()) if(info_->is_shared(i) && !info_->is_double(i)) add_copy(i, builder); + + for(ir::function *fn: mod.get_function_list()) + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *i: block->get_inst_list()) + if(auto* cts = dynamic_cast(i)) + info_->replace(cts->get_operand(0), cts); } } diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 6e3533983..8792495a2 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -109,6 +109,10 @@ void module::compile_llvm_module(llvm::Module* module, const std::string& triple llvm::SmallVectorImpl &buffer, std::vector paths) { init_llvm(); +// llvm::legacy::PassManager passes; +// passes.add(llvm::createPrintModulePass(llvm::outs())); +// passes.add(llvm::createVerifierPass()); +// passes.run(*module); // create machine module->setTargetTriple(triple); std::string error; diff --git a/lib/ir/basic_block.cpp b/lib/ir/basic_block.cpp index 456f0f820..0654156a3 100644 --- a/lib/ir/basic_block.cpp +++ b/lib/ir/basic_block.cpp @@ -21,6 +21,8 @@ basic_block* basic_block::create(context &ctx, const std::string &name, function void basic_block::add_predecessor(basic_block *pred) { preds_.push_back(pred); + if(pred) + pred->succs_.push_back(this); } From e04253c0dd3358013915d672e059553abb82f7c3 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 27 Mar 2019 11:13:36 -0700 Subject: [PATCH 118/494] [code generation] basic CPU backend --- examples/matrix.cpp | 26 +++++++++++-------------- include/triton/driver/kernel.h | 4 ++-- include/triton/jit.h | 14 ++++++-------- lib/ast/lowering.cpp | 2 +- lib/codegen/allocation.cpp | 3 --- lib/codegen/barriers.cpp | 1 - lib/codegen/liveness.cpp | 1 - lib/codegen/selection.cpp | 34 +++++++++++++++++++++++---------- lib/codegen/tune.cpp | 2 +- lib/driver/backend.cpp | 26 ++++++++++++------------- lib/driver/buffer.cpp | 3 +++ lib/driver/kernel.cpp | 4 ++-- lib/driver/module.cpp | 35 ++++++++++++++++++++++++++++++---- lib/driver/stream.cpp | 10 +++++++--- lib/jit.cpp | 13 +++++++++---- 15 files changed, 110 insertions(+), 68 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 16f5c4434..b26034884 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -19,22 +19,18 @@ void matmul(restrict read_only fp32 *a, restrict read_only fp32 *b, fp32 *c, fp32 C[TM, TN] = 0; fp32* pa[TM, TK] = a + rka[newaxis, :]*M + rxa[:, newaxis]; fp32* pb[TN, TK] = b + rkb[newaxis, :]*K + ryb[:, newaxis]; - fp32 a[TM, TK] = *pa; - fp32 b[TN, TK] = *pb; - for(int32 k = K; k > 0; k = k - TK){ + for(int32 k = K; k > 0;){ + fp32 a[TM, TK] = *pa; + fp32 b[TN, TK] = *pb; C = dot(a, b, C); pa = pa + TK*M; pb = pb + TK*K; - a = *pa; - b = *pb; + k = k - TK; } int32 rxc[TM] = get_global_range[TM](0); int32 ryc[TN] = get_global_range[TN](1); fp32* pc[TM, TN] = c + ryc[newaxis, :]*M + rxc[:, newaxis]; - int1 checkc0[TM] = rxc < M; - int1 checkc1[TN] = ryc < N; - int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - @checkc *pc = C; + *pc = C; } )"; @@ -93,7 +89,7 @@ int main() { triton::jit jit(context); // matrix multiplication parameters - int32_t M = 512, N = 512, K = 512; + int32_t M = 256, N = 256, K = 256; std::vector hc(M*N); std::vector rc(M*N); std::vector ha(M*K); @@ -155,11 +151,11 @@ int main() { // just-in-time compile source-code std::vector params = { - 16, 2, 64, - 32, 2, 64, - 16, 8, 2, 2, - 8, 8, - 4, + 1, 4, 8, + 1, 4, 8, + 1, 1, 4, 4, + 1, 8, + 1, }; // params = {8, 2, 64, 16, 2, 
64, 4, 16, 2, 2, 8, 8, 4}; diff --git a/include/triton/driver/kernel.h b/include/triton/driver/kernel.h index 5a4669086..5d68ffd62 100755 --- a/include/triton/driver/kernel.h +++ b/include/triton/driver/kernel.h @@ -68,10 +68,10 @@ public: void setArg(unsigned int index, std::size_t size, void* ptr); void setArg(unsigned int index, driver::buffer* buffer); // Params - const std::vector& params(); + const std::vector& params(); private: std::vector > params_store_; - std::vector params_; + std::vector params_; }; // OpenCL diff --git a/include/triton/jit.h b/include/triton/jit.h index b53884e36..20b0e01f4 100644 --- a/include/triton/jit.h +++ b/include/triton/jit.h @@ -52,15 +52,13 @@ public: selection(&allocation, &tune, &buffer_info, target) { } void init(ir::module &module) { - // generate ptx - buffer_info.run(module); - shared.run(module); - triton::ir::print(module, std::cout); - liveness.run(module); - allocation.run(); - barriers.run(module); +// buffer_info.run(module); +// shared.run(module); +// liveness.run(module); +// allocation.run(); +// barriers.run(module); vectorize.run(module); - triton::ir::print(module, std::cout); +// triton::ir::print(module, std::cout); } codegen::tune tune; diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 77ba26464..84dcbcf3b 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -234,7 +234,7 @@ ir::type* tile::type_impl(ir::module *mod, ir::type *type, storage_spec_vec_cons // Pointer ir::type* pointer::type_impl(ir::module*, ir::type *type, storage_spec_vec_const_ref_t storage) const{ bool is_ptr_to_const = std::find(storage.begin(), storage.end(), CONSTANT_SPACE_T) != storage.end(); - return ir::pointer_type::get(type, is_ptr_to_const?4:1); + return ir::pointer_type::get(type, is_ptr_to_const?4:0); } // Function diff --git a/lib/codegen/allocation.cpp b/lib/codegen/allocation.cpp index c8ce9f60c..fd272a243 100644 --- a/lib/codegen/allocation.cpp +++ b/lib/codegen/allocation.cpp @@ -116,9 +116,6 @@ void allocation::run(){ for(auto &x: offsets_){ allocated_size_ = std::max(allocated_size_, x.second + get_num_bytes(x.first)); } - std::cout << "Allocated: " << allocated_size_ << std::endl; - for(auto &x: offsets_) - std::cout << x.first->get_name() << " " << x.second << std::endl; } } diff --git a/lib/codegen/barriers.cpp b/lib/codegen/barriers.cpp index d7b126ee0..bb3611f85 100644 --- a/lib/codegen/barriers.cpp +++ b/lib/codegen/barriers.cpp @@ -130,7 +130,6 @@ void barriers::run(ir::module &mod) { n_inserted_im1 = n_inserted_i; }while(!done); for(ir::instruction* i: insert_locs){ - std::cout << i->get_name() << std::endl; insert_barrier(i, builder); } } diff --git a/lib/codegen/liveness.cpp b/lib/codegen/liveness.cpp index c7c067052..ca33bd487 100644 --- a/lib/codegen/liveness.cpp +++ b/lib/codegen/liveness.cpp @@ -46,7 +46,6 @@ void liveness::run(ir::module &mod) { } intervals_[v] = segment{start, end}; } - std::cout << "Number of intervals: " << intervals_.size() << std::endl; } } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 7810e6540..8c49749f3 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -810,7 +810,21 @@ void selection::run(ir::module &src, Module &dst) { for(ir::function *fn: src.get_function_list()) { // create LLVM function FunctionType *fn_ty = (FunctionType*)llvm_type(fn->get_fn_type(), dst_ctx); - Function *dst_fn = Function::Create(fn_ty, Function::ExternalLinkage, fn->get_name(), &dst); + Type *dst_fn_ret_ty = fn_ty->getReturnType(); + std::vector 
dst_fn_args_ty; + for(unsigned i = 0; i < fn_ty->getNumParams(); i++) + dst_fn_args_ty.push_back(fn_ty->getParamType(i)); + dst_fn_args_ty.push_back(dst_builder.getInt32Ty()); + dst_fn_args_ty.push_back(dst_builder.getInt32Ty()); + dst_fn_args_ty.push_back(dst_builder.getInt32Ty()); + FunctionType *dst_fn_ty = FunctionType::get(dst_fn_ret_ty, dst_fn_args_ty, false); + // grid indices + fn->get_fn_type()->get_return_ty(); + Function *dst_fn = Function::Create(dst_fn_ty, Function::ExternalLinkage, fn->get_name(), &dst); + + + + // set attributes for(auto attr_pair: fn->attrs()){ unsigned id = attr_pair.first; @@ -831,15 +845,15 @@ void selection::run(ir::module &src, Module &dst) { // allocate shared memory Value *sh_mem_ptr = nullptr; - if(unsigned alloc_size = alloc_->get_allocated_size()){ - Type *int_8_ty = Type::getInt8Ty(dst_ctx); - ArrayType *array_ty = ArrayType::get(int_8_ty, alloc_size); - Type *ptr_ty = PointerType::get(int_8_ty, 3); - GlobalVariable *sh_mem_array = - new GlobalVariable(dst, array_ty, false, GlobalVariable::ExternalLinkage, - nullptr, "__shared_ptr", nullptr, GlobalVariable::NotThreadLocal, 3); - sh_mem_ptr = dst_builder.CreateBitCast(sh_mem_array, ptr_ty); - } +// if(unsigned alloc_size = alloc_->get_allocated_size()){ +// Type *int_8_ty = Type::getInt8Ty(dst_ctx); +// ArrayType *array_ty = ArrayType::get(int_8_ty, alloc_size); +// Type *ptr_ty = PointerType::get(int_8_ty, 3); +// GlobalVariable *sh_mem_array = +// new GlobalVariable(dst, array_ty, false, GlobalVariable::ExternalLinkage, +// nullptr, "__shared_ptr", nullptr, GlobalVariable::NotThreadLocal, 3); +// sh_mem_ptr = dst_builder.CreateBitCast(sh_mem_array, ptr_ty); +// } // create grids init_grids(fn, dst_builder, sh_mem_ptr); diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 8a9a35aa0..803d7661e 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -246,7 +246,7 @@ bool tune::check_constraints(std::map> &er int num_threads = 1; for(size_t k = 0; k < shapes.size(); k++) num_threads *= params_[i]["mts.d" + to_string(k)]->get_value(); - if(num_threads % 64 != 0) + if(num_threads % 1 != 0) errors[i].push_back("number of threads per block (" + to_string(num_threads) + ") must be multiple of 32"); if(num_threads != num_threads_) errors[i].push_back("Number of threads must be the same for all tiles (" + to_string(num_threads_) + ")"); diff --git a/lib/driver/backend.cpp b/lib/driver/backend.cpp index 1699eb088..dbcaba6d0 100755 --- a/lib/driver/backend.cpp +++ b/lib/driver/backend.cpp @@ -48,20 +48,20 @@ void backend::platforms::init() { if(dispatch::cuinit()){ cache_.push_back(new cu_platform()); } - //if OpenCL is here - if(dispatch::clinit()){ - cl_uint num_platforms; - dispatch::clGetPlatformIDs(0, nullptr, &num_platforms); - std::vector ids(num_platforms); - dispatch::clGetPlatformIDs(num_platforms, ids.data(), nullptr); - for(cl_platform_id id: ids) - cache_.push_back(new cl_platform(id)); - } -// //if host is here -// bool host_visible = true; -// if(host_visible){ -// cache_.push_back(new host_platform()); +// //if OpenCL is here +// if(dispatch::clinit()){ +// cl_uint num_platforms; +// dispatch::clGetPlatformIDs(0, nullptr, &num_platforms); +// std::vector ids(num_platforms); +// dispatch::clGetPlatformIDs(num_platforms, ids.data(), nullptr); +// for(cl_platform_id id: ids) +// cache_.push_back(new cl_platform(id)); // } + //if host is here + bool host_visible = true; + if(host_visible){ + cache_.push_back(new host_platform()); + } if(cache_.empty()) throw 
std::runtime_error("ISAAC: No backend available. Make sure CUDA is available in your library path"); } diff --git a/lib/driver/buffer.cpp b/lib/driver/buffer.cpp index a64e0aeca..b5030d710 100755 --- a/lib/driver/buffer.cpp +++ b/lib/driver/buffer.cpp @@ -64,6 +64,9 @@ buffer* buffer::create(driver::context* ctx, size_t size) { host_buffer::host_buffer(driver::context *context, size_t size) : buffer(context, host_buffer_t(), true){ hst_->data = new char[size]; + std::cout << size << std::endl; + std::cout << "allocating " << (float*)hst_->data << std::endl; + std::cout << *((float*)(hst_->data) + 512*500) << std::endl; } // diff --git a/lib/driver/kernel.cpp b/lib/driver/kernel.cpp index efd366a5a..81c797047 100755 --- a/lib/driver/kernel.cpp +++ b/lib/driver/kernel.cpp @@ -77,14 +77,14 @@ void host_kernel::setArg(unsigned int index, std::size_t size, void* ptr){ } params_store_[index].reset(malloc(size), free); memcpy(params_store_[index].get(), ptr, size); - params_[index] = llvm::GenericValue(params_store_[index].get()); + params_[index] = params_store_[index].get(); } void host_kernel::setArg(unsigned int index, driver::buffer* buffer){ kernel::setArg(index, (void*)buffer->hst()->data); } -const std::vector& host_kernel::params(){ +const std::vector &host_kernel::params(){ return params_; } diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 8792495a2..4960dc94f 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -151,11 +151,38 @@ host_module::host_module(driver::context * context, llvm::Module* src): module(c // llvm::SmallVector buffer; // module::compile_llvm_module(src, triple, cpu, "", buffer); + // create kernel wrapper + llvm::LLVMContext &ctx = src->getContext(); + llvm::Type *void_ty = llvm::Type::getVoidTy(ctx); + llvm::Type *args_ty = llvm::Type::getInt8PtrTy(ctx)->getPointerTo(); + llvm::Type *int32_ty = llvm::Type::getInt32Ty(ctx); + llvm::FunctionType *main_ty = llvm::FunctionType::get(void_ty, {args_ty, int32_ty, int32_ty, int32_ty}, false); + llvm::Function* main = llvm::Function::Create(main_ty, llvm::Function::ExternalLinkage, "main", src); + llvm::Function* fn = src->getFunction("matmul"); + llvm::FunctionType *fn_ty = fn->getFunctionType(); + std::vector fn_args(fn_ty->getNumParams()); + std::vector ptrs(fn_args.size() - 3); + llvm::BasicBlock* entry = llvm::BasicBlock::Create(ctx, "entry", main); + llvm::IRBuilder<> ir_builder(ctx); + ir_builder.SetInsertPoint(entry); + for(unsigned i = 0; i < ptrs.size(); i++) + ptrs[i] = ir_builder.CreateGEP(main->arg_begin(), ir_builder.getInt32(i)); + for(unsigned i = 0; i < ptrs.size(); i++){ + llvm::Value* addr = ir_builder.CreateBitCast(ir_builder.CreateLoad(ptrs[i]), fn_ty->getParamType(i)->getPointerTo()); + fn_args[i] = ir_builder.CreateLoad(addr); + } + fn_args[fn_args.size() - 3] = main->arg_begin() + 1; + fn_args[fn_args.size() - 2] = main->arg_begin() + 2; + fn_args[fn_args.size() - 1] = main->arg_begin() + 3; + ir_builder.CreateCall(fn, fn_args); + ir_builder.CreateRetVoid(); + + // create execution engine -// llvm::legacy::PassManager pass; -// pass.add(llvm::createPrintModulePass(llvm::outs())); -// pass.add(llvm::createVerifierPass()); -// pass.run(*src); + llvm::legacy::PassManager pass; + pass.add(llvm::createPrintModulePass(llvm::outs())); + pass.add(llvm::createVerifierPass()); + pass.run(*src); auto cloned = llvm::CloneModule(*src); for(llvm::Function& fn: cloned->functions()) hst_->functions[fn.getName()] = &fn; diff --git a/lib/driver/stream.cpp b/lib/driver/stream.cpp index 
937750c23..92fed604d 100755 --- a/lib/driver/stream.cpp +++ b/lib/driver/stream.cpp @@ -84,15 +84,19 @@ void host_stream::synchronize() { void host_stream::enqueue(driver::kernel* kernel, std::array grid, std::array block, std::vector const *, event* event) { driver::host_kernel* hst_kernel = (host_kernel*)kernel; llvm::ExecutionEngine* engine = kernel->module()->hst()->engine; - engine->runFunction(kernel->hst()->fn, llvm::ArrayRef(hst_kernel->params())); + void (*fn)(char**, int32_t, int32_t, int32_t) = (void(*)(char**, int32_t, int32_t, int32_t))engine->getFunctionAddress("main"); + for(size_t i = 0; i < grid[0]; i++) + for(size_t j = 0; j < grid[1]; j++) + for(size_t k = 0; k < grid[2]; k++) + fn((char**)hst_kernel->params().data(), int32_t(i), int32_t(j), int32_t(k)); } void host_stream::write(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr) { - + std::memcpy((void*)buffer->hst()->data, ptr, size); } void host_stream::read(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr) { - + std::memcpy(ptr, (const void*)buffer->hst()->data, size); } diff --git a/lib/jit.cpp b/lib/jit.cpp index f76870a86..52327d85c 100644 --- a/lib/jit.cpp +++ b/lib/jit.cpp @@ -92,7 +92,7 @@ std::unique_ptr jit::make_triton_module(const std::string &src) { } -jit::jit(driver::context *context): driver_context_(context), target_(new triton::codegen::amd_cl_target()) { +jit::jit(driver::context *context): driver_context_(context), target_(new triton::codegen::cpu_target()) { } @@ -164,11 +164,16 @@ void jit::add_module(ir::module &tt_module, const std::vector ¶ms) // check constraints std::map> errors; passes.tune.check_constraints(errors); + for(auto x: errors){ + std::cout << x.first << std::endl; + for(auto str: x.second) + std::cout << str << std::endl; + } if(errors.size()) throw std::runtime_error("invalid parameters"); - driver::device* device = driver_context_->device(); - if(passes.allocation.get_allocated_size() > device->max_shared_memory()) - throw std::runtime_error("invalid parameters"); +// driver::device* device = driver_context_->device(); +// if(passes.allocation.get_allocated_size() > device->max_shared_memory()) +// throw std::runtime_error("invalid parameters"); // triton module -> llvm module auto ll_module = make_llvm_module(tt_module, passes); // llvm module -> machine code From bc2a257d5c7bb1705d2f7703e8eb7fad7853957f Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 27 Mar 2019 11:29:42 -0700 Subject: [PATCH 119/494] [code generation] more flexibility in backend selection --- examples/matrix.cpp | 2 +- include/triton/jit.h | 16 ++++++++++------ lib/codegen/selection.cpp | 38 +++++++++++++++++++++----------------- lib/codegen/tune.cpp | 2 +- lib/driver/backend.cpp | 18 +++++++++--------- lib/driver/module.cpp | 8 ++++---- lib/jit.cpp | 2 +- 7 files changed, 47 insertions(+), 39 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index b26034884..592d7d631 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -157,7 +157,7 @@ int main() { 1, 8, 1, }; -// params = {8, 2, 64, 16, 2, 64, 4, 16, 2, 2, 8, 8, 4}; + params = {8, 2, 64, 16, 2, 64, 4, 16, 2, 2, 8, 8, 4}; // jit.autotune(src, benchmark); jit.add_module(src, params); diff --git a/include/triton/jit.h b/include/triton/jit.h index 20b0e01f4..b9b2f96c8 100644 --- a/include/triton/jit.h +++ b/include/triton/jit.h @@ -49,14 +49,17 @@ public: allocation(&liveness, &buffer_info), barriers(&allocation, &buffer_info), 
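On the host path above, a JIT-compiled wrapper replaces the GPU launch: the grid is walked with three nested loops and the block index is handed to the kernel as trailing scalar arguments, so the same kernel body behaves as if it had been launched at (i, j, k). A minimal sketch of that calling convention (how the function pointer is obtained is left to the JIT; enqueue_host and kernel_fn are illustrative names):

    #include <array>
    #include <cstdint>
    #include <vector>

    // Signature the wrapper is generated with: packed argument buffer + grid indices.
    using kernel_fn = void (*)(char** args, int32_t bx, int32_t by, int32_t bz);

    // Sequential stand-in for a GPU grid launch: every "block" runs in turn.
    void enqueue_host(kernel_fn fn, std::vector<void*>& params,
                      std::array<size_t, 3> grid) {
      for (size_t i = 0; i < grid[0]; ++i)
        for (size_t j = 0; j < grid[1]; ++j)
          for (size_t k = 0; k < grid[2]; ++k)
            fn(reinterpret_cast<char**>(params.data()),
               int32_t(i), int32_t(j), int32_t(k));
    }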
vectorize(&tune), - selection(&allocation, &tune, &buffer_info, target) { } + selection(&allocation, &tune, &buffer_info, target), + target_(target) { } void init(ir::module &module) { -// buffer_info.run(module); -// shared.run(module); -// liveness.run(module); -// allocation.run(); -// barriers.run(module); + if(target_->is_gpu()){ + buffer_info.run(module); + shared.run(module); + liveness.run(module); + allocation.run(); + barriers.run(module); + } vectorize.run(module); // triton::ir::print(module, std::cout); } @@ -69,6 +72,7 @@ public: codegen::barriers barriers; codegen::vectorize vectorize; codegen::selection selection; + codegen::target* target_; }; private: diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 8c49749f3..546b0e76f 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -810,14 +810,17 @@ void selection::run(ir::module &src, Module &dst) { for(ir::function *fn: src.get_function_list()) { // create LLVM function FunctionType *fn_ty = (FunctionType*)llvm_type(fn->get_fn_type(), dst_ctx); - Type *dst_fn_ret_ty = fn_ty->getReturnType(); - std::vector dst_fn_args_ty; - for(unsigned i = 0; i < fn_ty->getNumParams(); i++) - dst_fn_args_ty.push_back(fn_ty->getParamType(i)); - dst_fn_args_ty.push_back(dst_builder.getInt32Ty()); - dst_fn_args_ty.push_back(dst_builder.getInt32Ty()); - dst_fn_args_ty.push_back(dst_builder.getInt32Ty()); - FunctionType *dst_fn_ty = FunctionType::get(dst_fn_ret_ty, dst_fn_args_ty, false); + FunctionType *dst_fn_ty = fn_ty; + if(!tgt_->is_gpu()){ + Type *dst_fn_ret_ty = fn_ty->getReturnType(); + std::vector dst_fn_args_ty; + for(unsigned i = 0; i < fn_ty->getNumParams(); i++) + dst_fn_args_ty.push_back(fn_ty->getParamType(i)); + dst_fn_args_ty.push_back(dst_builder.getInt32Ty()); + dst_fn_args_ty.push_back(dst_builder.getInt32Ty()); + dst_fn_args_ty.push_back(dst_builder.getInt32Ty()); + dst_fn_ty = FunctionType::get(dst_fn_ret_ty, dst_fn_args_ty, false); + } // grid indices fn->get_fn_type()->get_return_ty(); Function *dst_fn = Function::Create(dst_fn_ty, Function::ExternalLinkage, fn->get_name(), &dst); @@ -845,15 +848,16 @@ void selection::run(ir::module &src, Module &dst) { // allocate shared memory Value *sh_mem_ptr = nullptr; -// if(unsigned alloc_size = alloc_->get_allocated_size()){ -// Type *int_8_ty = Type::getInt8Ty(dst_ctx); -// ArrayType *array_ty = ArrayType::get(int_8_ty, alloc_size); -// Type *ptr_ty = PointerType::get(int_8_ty, 3); -// GlobalVariable *sh_mem_array = -// new GlobalVariable(dst, array_ty, false, GlobalVariable::ExternalLinkage, -// nullptr, "__shared_ptr", nullptr, GlobalVariable::NotThreadLocal, 3); -// sh_mem_ptr = dst_builder.CreateBitCast(sh_mem_array, ptr_ty); -// } + if(tgt_->is_gpu()) + if(unsigned alloc_size = alloc_->get_allocated_size()){ + Type *int_8_ty = Type::getInt8Ty(dst_ctx); + ArrayType *array_ty = ArrayType::get(int_8_ty, alloc_size); + Type *ptr_ty = PointerType::get(int_8_ty, 3); + GlobalVariable *sh_mem_array = + new GlobalVariable(dst, array_ty, false, GlobalVariable::ExternalLinkage, + nullptr, "__shared_ptr", nullptr, GlobalVariable::NotThreadLocal, 3); + sh_mem_ptr = dst_builder.CreateBitCast(sh_mem_array, ptr_ty); + } // create grids init_grids(fn, dst_builder, sh_mem_ptr); diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 803d7661e..8a9a35aa0 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -246,7 +246,7 @@ bool tune::check_constraints(std::map> &er int num_threads = 1; for(size_t k = 0; k < shapes.size(); k++) 
num_threads *= params_[i]["mts.d" + to_string(k)]->get_value(); - if(num_threads % 1 != 0) + if(num_threads % 64 != 0) errors[i].push_back("number of threads per block (" + to_string(num_threads) + ") must be multiple of 32"); if(num_threads != num_threads_) errors[i].push_back("Number of threads must be the same for all tiles (" + to_string(num_threads_) + ")"); diff --git a/lib/driver/backend.cpp b/lib/driver/backend.cpp index dbcaba6d0..9761e94e7 100755 --- a/lib/driver/backend.cpp +++ b/lib/driver/backend.cpp @@ -48,15 +48,15 @@ void backend::platforms::init() { if(dispatch::cuinit()){ cache_.push_back(new cu_platform()); } -// //if OpenCL is here -// if(dispatch::clinit()){ -// cl_uint num_platforms; -// dispatch::clGetPlatformIDs(0, nullptr, &num_platforms); -// std::vector ids(num_platforms); -// dispatch::clGetPlatformIDs(num_platforms, ids.data(), nullptr); -// for(cl_platform_id id: ids) -// cache_.push_back(new cl_platform(id)); -// } + //if OpenCL is here + if(dispatch::clinit()){ + cl_uint num_platforms; + dispatch::clGetPlatformIDs(0, nullptr, &num_platforms); + std::vector ids(num_platforms); + dispatch::clGetPlatformIDs(num_platforms, ids.data(), nullptr); + for(cl_platform_id id: ids) + cache_.push_back(new cl_platform(id)); + } //if host is here bool host_visible = true; if(host_visible){ diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 4960dc94f..72639c515 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -179,10 +179,10 @@ host_module::host_module(driver::context * context, llvm::Module* src): module(c // create execution engine - llvm::legacy::PassManager pass; - pass.add(llvm::createPrintModulePass(llvm::outs())); - pass.add(llvm::createVerifierPass()); - pass.run(*src); +// llvm::legacy::PassManager pass; +// pass.add(llvm::createPrintModulePass(llvm::outs())); +// pass.add(llvm::createVerifierPass()); +// pass.run(*src); auto cloned = llvm::CloneModule(*src); for(llvm::Function& fn: cloned->functions()) hst_->functions[fn.getName()] = &fn; diff --git a/lib/jit.cpp b/lib/jit.cpp index 52327d85c..c037be2cc 100644 --- a/lib/jit.cpp +++ b/lib/jit.cpp @@ -92,7 +92,7 @@ std::unique_ptr jit::make_triton_module(const std::string &src) { } -jit::jit(driver::context *context): driver_context_(context), target_(new triton::codegen::cpu_target()) { +jit::jit(driver::context *context): driver_context_(context), target_(new triton::codegen::amd_cl_target()) { } From fdf85598060aed2fa8571fbed903838884750a63 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 27 Mar 2019 20:01:35 -0400 Subject: [PATCH 120/494] [general] added missing files --- include/triton/codegen/target.h | 63 ++++ include/triton/driver/helpers/CL/infos.hpp | 413 +++++++++++++++++++++ include/triton/ir/cfg.h | 20 + lib/codegen/target.cpp | 118 ++++++ lib/codegen/tune.cpp | 4 +- lib/ir/cfg.cpp | 32 ++ 6 files changed, 648 insertions(+), 2 deletions(-) create mode 100644 include/triton/codegen/target.h create mode 100644 include/triton/driver/helpers/CL/infos.hpp create mode 100644 include/triton/ir/cfg.h create mode 100644 lib/codegen/target.cpp create mode 100644 lib/ir/cfg.cpp diff --git a/include/triton/codegen/target.h b/include/triton/codegen/target.h new file mode 100644 index 000000000..e2dc4518a --- /dev/null +++ b/include/triton/codegen/target.h @@ -0,0 +1,63 @@ +#ifndef TDL_INCLUDE_IR_CODEGEN_TARGET_H +#define TDL_INCLUDE_IR_CODEGEN_TARGET_H + +#include +#include +#include +#include "llvm/IR/IRBuilder.h" + +namespace llvm{ +class Instruction; +class Value; +class 
Module; +class LLVMContext; +class Function; +} + +namespace triton{ +namespace codegen{ + +class target { +public: + target(bool is_gpu): is_gpu_(is_gpu){} + virtual void set_kernel(llvm::IRBuilder<>& builder, llvm::LLVMContext &ctx, llvm::Module *module, llvm::Function* fn) = 0; + virtual llvm::Instruction* add_barrier(llvm::Module *module, llvm::IRBuilder<>& builder) = 0; + virtual llvm::Value* get_global_offset(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned stride, unsigned ax) = 0; + virtual llvm::Value* get_local_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax) = 0; + bool is_gpu() const; + +private: + bool is_gpu_; +}; + +class amd_cl_target: public target { +public: + amd_cl_target(): target(true){} + void set_kernel(llvm::IRBuilder<>& builder, llvm::LLVMContext &ctx, llvm::Module *module, llvm::Function* fn); + llvm::Instruction* add_barrier(llvm::Module *module, llvm::IRBuilder<>& builder); + llvm::Value* get_global_offset(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned stride, unsigned ax); + llvm::Value* get_local_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax); +}; + +class nvidia_cu_target: public target { +public: + nvidia_cu_target(): target(true){} + void set_kernel(llvm::IRBuilder<>& builder, llvm::LLVMContext &ctx, llvm::Module *module, llvm::Function* fn); + llvm::Instruction* add_barrier(llvm::Module *module, llvm::IRBuilder<>& builder); + llvm::Value* get_global_offset(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned stride, unsigned ax); + llvm::Value* get_local_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax); +}; + +class cpu_target: public target { +public: + cpu_target(): target(false){} + void set_kernel(llvm::IRBuilder<>& builder, llvm::LLVMContext &ctx, llvm::Module *module, llvm::Function* fn); + llvm::Instruction* add_barrier(llvm::Module *module, llvm::IRBuilder<>& builder); + llvm::Value* get_global_offset(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned stride, unsigned ax); + llvm::Value* get_local_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax); +}; + +} +} + +#endif diff --git a/include/triton/driver/helpers/CL/infos.hpp b/include/triton/driver/helpers/CL/infos.hpp new file mode 100644 index 000000000..dcd80928c --- /dev/null +++ b/include/triton/driver/helpers/CL/infos.hpp @@ -0,0 +1,413 @@ +#ifndef ISAAC_DRIVER_HELPERS_OCL_INFOS_HPP_ +#define ISAAC_DRIVER_HELPERS_OCL_INFOS_HPP_ + +/* ========================================================================= + Copyright (c) 2010-2012, Institute for Microelectronics, + Institute for Analysis and Scientific Computing, + TU Wien. + + ----------------- + ViennaCL - The Vienna Computing Library + ----------------- + + Project Head: Karl Rupp rupp@iue.tuwien.ac.at + + (A list of authors and contributors can be found in the PDF manual) + + License: MIT (X11), see file LICENSE in the base directory +============================================================================= */ + + + +#include "triton/driver/error.h" +#include +#include + +namespace triton +{ +namespace driver +{ +namespace ocl +{ + + /** @brief Implementation details for the OpenCL managment layer in ViennaCL */ +namespace detail{ + +/** @brief Helper class for obtaining informations from the OpenCL backend. Deprecated! 
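target.h above hides the backend-specific details (how to mark a kernel, emit a barrier, read grid and thread indices) behind a handful of virtuals, so the selection pass can stay backend-neutral and the CPU path can skip GPU-only work via is_gpu(). A self-contained sketch of the pattern, with strings standing in for the LLVM IR the real interface emits:

    #include <iostream>
    #include <string>

    // Backend-neutral mirror of the interface above, shrunk to two hooks.
    struct target {
      virtual ~target() = default;
      virtual bool is_gpu() const = 0;
      virtual std::string add_barrier() const = 0;
    };
    struct nvidia_cu_target : target {
      bool is_gpu() const override { return true; }
      std::string add_barrier() const override { return "llvm.nvvm.barrier0"; }
    };
    struct cpu_target : target {
      bool is_gpu() const override { return false; }
      std::string add_barrier() const override { return ""; }  // nothing to synchronize
    };

    // A pass queries the target instead of switching on an enum; adding a
    // backend means adding one subclass, not touching every lowering site.
    void lower(const target& tgt) {
      if (tgt.is_gpu())
        std::cout << "barrier via " << tgt.add_barrier() << "\n";
    }

    int main() { nvidia_cu_target nv; cpu_target cpu; lower(nv); lower(cpu); }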
*/ +template +struct info; + +/** \cond */ +template<> +struct info +{ + typedef cl_mem_info type; + + static void get(cl_mem handle, cl_mem_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret) + { + cl_int err = dispatch::clGetMemObjectInfo(handle,param_name,param_value_size,param_value,param_value_size_ret); + check(err); + } +}; + +template<> +struct info +{ + typedef cl_device_info type; + + static void get(cl_device_id handle, cl_device_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret) + { + cl_int err = dispatch::clGetDeviceInfo(handle,param_name,param_value_size,param_value,param_value_size_ret); + check(err); + } +}; + +template<> +struct info +{ + typedef cl_kernel_info type; + + static void get(cl_kernel handle, cl_kernel_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){ + cl_int err = dispatch::clGetKernelInfo(handle,param_name,param_value_size,param_value,param_value_size_ret); + check(err); + } + + static void get(cl_kernel handle, cl_device_id dev_id, cl_kernel_work_group_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){ + cl_int err = dispatch::clGetKernelWorkGroupInfo(handle, dev_id, param_name,param_value_size,param_value,param_value_size_ret); + check(err); + } +}; + +template<> +struct info +{ + typedef cl_context_info type; + + static void get(cl_context handle, cl_context_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){ + cl_int err = dispatch::clGetContextInfo(handle,param_name,param_value_size,param_value,param_value_size_ret); + check(err); + } +}; + +template<> +struct info +{ + typedef cl_program_info type; + + static void get(cl_program handle, cl_program_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){ + cl_int err = dispatch::clGetProgramInfo(handle,param_name,param_value_size,param_value,param_value_size_ret); + check(err); + } + + static void get(cl_program handle, cl_device_id device, cl_program_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){ + cl_int err = dispatch::clGetProgramBuildInfo(handle,device,param_name,param_value_size,param_value,param_value_size_ret); + check(err); + } +}; + + +template<> +struct info +{ + typedef cl_profiling_info type; + static void get(cl_event handle, cl_profiling_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){ + cl_int err = dispatch::clGetEventProfilingInfo(handle,param_name,param_value_size,param_value,param_value_size_ret); + check(err); + } +}; + +template<> +struct info +{ + typedef cl_command_queue_info type; + static void get(cl_command_queue handle, cl_profiling_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){ + cl_int err = dispatch::clGetCommandQueueInfo(handle,param_name,param_value_size,param_value,param_value_size_ret); + check(err); + } +}; + +template<> +struct info +{ + typedef cl_command_queue_info type; + static void get(cl_platform_id handle, cl_profiling_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){ + cl_int err = dispatch::clGetPlatformInfo(handle,param_name,param_value_size,param_value,param_value_size_ret); + check(err); + } +}; + +//Info getter +//Some intelligence is needed for some types +template +struct get_info_impl{ + + template + RES_T operator()(MEM_T const & mem, INFO_T const & info){ + RES_T res; + 
detail::info::get(mem,info,sizeof(RES_T),&res,NULL); + return res; + } + + template + RES_T operator()(MEM_T const & mem, ARG_MEM_T const & arg_mem, INFO_T const & info){ + RES_T res; + detail::info::get(mem,arg_mem, info,sizeof(RES_T),&res,NULL); + return res; + } +}; + +template<> +struct get_info_impl{ + + template + std::string operator()(const MEM_T &mem, const INFO_T &info){ + char buff[1024]; + detail::info::get(mem,info,1024,buff,NULL); + return std::string(buff); + } + + template + std::string operator()(MEM_T const & mem, ARG_MEM_T const & arg_mem, INFO_T const & info){ + char buff[1024]; + detail::info::get(mem,arg_mem,info,1024,buff,NULL); + return std::string(buff); + } +}; + +template +struct get_info_impl > +{ + template + std::vector operator()(const MEM_T &mem, const INFO_T &info) + { + size_t vec_size; + detail::info::get(mem,info,0,NULL,&vec_size); + std::vector res(vec_size/sizeof(T)); + detail::info::get(mem,info,vec_size,res.data(),NULL); + return res; + } + + template + std::vector operator()(MEM_T const & mem, ARG_MEM_T const & arg_mem, INFO_T const & info) + { + size_t vec_size; + detail::info::get(mem,arg_mem,info,0,NULL,&vec_size); + std::vector res(vec_size/sizeof(T)); + detail::info::get(mem,arg_mem,info,vec_size,res.data(),NULL); + return res; + } +}; + +template::type param> +struct return_type; +/** \endcond */ + +/** \cond */ + #define SET_INFO_RETURN_TYPE(DATA_TYPE,NAME,RETURN_TYPE) template<> struct return_type { typedef RETURN_TYPE Result; } + +SET_INFO_RETURN_TYPE(cl_command_queue, CL_QUEUE_CONTEXT, cl_context); +SET_INFO_RETURN_TYPE(cl_command_queue, CL_QUEUE_DEVICE, cl_device_id); +SET_INFO_RETURN_TYPE(cl_command_queue, CL_QUEUE_REFERENCE_COUNT, cl_uint); +SET_INFO_RETURN_TYPE(cl_command_queue, CL_QUEUE_PROPERTIES, cl_command_queue_properties); + +SET_INFO_RETURN_TYPE(cl_context, CL_CONTEXT_DEVICES, std::vector); +SET_INFO_RETURN_TYPE(cl_context, CL_CONTEXT_NUM_DEVICES, cl_uint); +SET_INFO_RETURN_TYPE(cl_context, CL_CONTEXT_REFERENCE_COUNT, cl_uint); +SET_INFO_RETURN_TYPE(cl_context, CL_CONTEXT_PROPERTIES, cl_context_properties); + +SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_ADDRESS_BITS, cl_uint); +SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_AVAILABLE, cl_bool); +SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_COMPILER_AVAILABLE, cl_bool); +SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, cl_uint); +SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint); +SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_WAVEFRONT_WIDTH_AMD, cl_uint); + +SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config); +SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_ENDIAN_LITTLE, cl_bool); +SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_ERROR_CORRECTION_SUPPORT, cl_bool); +SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_EXECUTION_CAPABILITIES, cl_device_exec_capabilities); +SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_EXTENSIONS, std::string); +SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong); +SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint); +SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong); +//SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config); +SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_IMAGE_SUPPORT, cl_bool); +SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_IMAGE2D_MAX_HEIGHT , size_t); +SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_IMAGE2D_MAX_WIDTH , size_t); 
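The trait table being assembled here is what lets a caller write one templated info call and get back the right C++ type: return_type maps each enum value to its result type, and get_info_impl knows how to size and fetch scalars, strings, and vectors. Call sites then collapse to one line per query, which is essentially what the new ocl_device::max_shared_memory() and max_threads_per_block() definitions in lib/driver/device.cpp expand to (a sketch, assuming the project's include paths):

    #include <string>
    #include <vector>
    #include "triton/driver/helpers/CL/infos.hpp"

    using triton::driver::ocl::info;

    // The template parameter selects both the OpenCL query and its static type.
    size_t max_shared_memory(cl_device_id dev) {
      return info<CL_DEVICE_LOCAL_MEM_SIZE>(dev);                // cl_ulong per the table
    }
    size_t max_threads_per_block(cl_device_id dev) {
      return info<CL_DEVICE_MAX_WORK_ITEM_SIZES>(dev).at(0);     // std::vector<size_t>
    }
    std::string device_name(cl_device_id dev) {
      return info<CL_DEVICE_NAME>(dev);                          // fixed-size string fetch
    }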
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_IMAGE3D_MAX_DEPTH , size_t);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_IMAGE3D_MAX_HEIGHT , size_t);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_IMAGE3D_MAX_WIDTH , size_t);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_LOCAL_MEM_TYPE, cl_device_local_mem_type);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MAX_CLOCK_FREQUENCY , cl_uint);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MAX_COMPUTE_UNITS , cl_uint); //The minimum value is 1
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MAX_CONSTANT_ARGS , cl_uint); //The minimum value is 8
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE , cl_ulong); //The minimum value is 64 KB
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE , cl_ulong); //The minimum value is max (1/4th of CL_DEVICE_GLOBAL_MEM_SIZE, 128*1024*1024)
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MAX_PARAMETER_SIZE , size_t);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MAX_READ_IMAGE_ARGS , cl_uint);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MAX_SAMPLERS , cl_uint);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE , size_t);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS , cl_uint);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MAX_WORK_ITEM_SIZES , std::vector<size_t>);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MAX_WRITE_IMAGE_ARGS , cl_uint);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MEM_BASE_ADDR_ALIGN , cl_uint);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE , cl_uint);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_NAME , std::string);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_PLATFORM , cl_platform_id);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR , cl_uint);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT , cl_uint);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT , cl_uint);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT , cl_uint);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE , cl_uint);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_PROFILE , std::string);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_PROFILING_TIMER_RESOLUTION , size_t);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_QUEUE_PROPERTIES , cl_command_queue_properties);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_SINGLE_FP_CONFIG , cl_device_fp_config);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_TYPE , cl_device_type);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_VENDOR , std::string);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_VENDOR_ID , cl_uint);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_VERSION , std::string);
+SET_INFO_RETURN_TYPE(cl_device_id, CL_DRIVER_VERSION , std::string);
+
+SET_INFO_RETURN_TYPE(cl_event, CL_PROFILING_COMMAND_QUEUED, cl_ulong);
+SET_INFO_RETURN_TYPE(cl_event, CL_PROFILING_COMMAND_SUBMIT, cl_ulong);
+SET_INFO_RETURN_TYPE(cl_event, CL_PROFILING_COMMAND_START, cl_ulong);
+SET_INFO_RETURN_TYPE(cl_event, CL_PROFILING_COMMAND_END, cl_ulong);
+
+SET_INFO_RETURN_TYPE(cl_kernel,CL_KERNEL_FUNCTION_NAME, std::string);
+SET_INFO_RETURN_TYPE(cl_kernel,CL_KERNEL_NUM_ARGS, cl_uint);
+SET_INFO_RETURN_TYPE(cl_kernel,CL_KERNEL_REFERENCE_COUNT, cl_uint);
+SET_INFO_RETURN_TYPE(cl_kernel,CL_KERNEL_CONTEXT, cl_context);
+SET_INFO_RETURN_TYPE(cl_kernel,CL_KERNEL_PROGRAM, cl_program);
+
+
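+//Work-group info: device-specific, retrieved through clGetKernelWorkGroupInfo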
+SET_INFO_RETURN_TYPE(cl_kernel,CL_KERNEL_WORK_GROUP_SIZE, size_t);
+SET_INFO_RETURN_TYPE(cl_kernel,CL_KERNEL_COMPILE_WORK_GROUP_SIZE, std::vector<size_t>);
+SET_INFO_RETURN_TYPE(cl_kernel,CL_KERNEL_LOCAL_MEM_SIZE, cl_ulong);
+SET_INFO_RETURN_TYPE(cl_kernel,CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, size_t);
+
+SET_INFO_RETURN_TYPE(cl_mem,CL_MEM_TYPE, cl_mem_object_type);
+SET_INFO_RETURN_TYPE(cl_mem,CL_MEM_FLAGS, cl_mem_flags);
+SET_INFO_RETURN_TYPE(cl_mem,CL_MEM_SIZE, size_t);
+SET_INFO_RETURN_TYPE(cl_mem,CL_MEM_HOST_PTR, void*);
+SET_INFO_RETURN_TYPE(cl_mem,CL_MEM_MAP_COUNT, cl_uint);
+SET_INFO_RETURN_TYPE(cl_mem,CL_MEM_REFERENCE_COUNT, cl_uint);
+SET_INFO_RETURN_TYPE(cl_mem,CL_MEM_CONTEXT, cl_context);
+
+SET_INFO_RETURN_TYPE(cl_program,CL_PROGRAM_CONTEXT,cl_context);
+SET_INFO_RETURN_TYPE(cl_program,CL_PROGRAM_DEVICES,std::vector<cl_device_id>);
+SET_INFO_RETURN_TYPE(cl_program,CL_PROGRAM_NUM_DEVICES,cl_uint);
+SET_INFO_RETURN_TYPE(cl_program,CL_PROGRAM_SOURCE,std::string);
+SET_INFO_RETURN_TYPE(cl_program,CL_PROGRAM_BINARY_SIZES,std::vector<size_t>);
+SET_INFO_RETURN_TYPE(cl_program,CL_PROGRAM_BINARIES,std::vector<unsigned char*>);
+//Build
+SET_INFO_RETURN_TYPE(cl_program,CL_PROGRAM_BUILD_STATUS, cl_build_status);
+SET_INFO_RETURN_TYPE(cl_program,CL_PROGRAM_BUILD_OPTIONS, std::string);
+SET_INFO_RETURN_TYPE(cl_program,CL_PROGRAM_BUILD_LOG, std::string);
+
+SET_INFO_RETURN_TYPE(cl_platform_id,CL_PLATFORM_PROFILE, std::string);
+SET_INFO_RETURN_TYPE(cl_platform_id,CL_PLATFORM_VERSION, std::string);
+SET_INFO_RETURN_TYPE(cl_platform_id,CL_PLATFORM_NAME, std::string);
+SET_INFO_RETURN_TYPE(cl_platform_id,CL_PLATFORM_VENDOR, std::string);
+SET_INFO_RETURN_TYPE(cl_platform_id,CL_PLATFORM_EXTENSIONS, std::string);
+
+#undef SET_INFO_RETURN_TYPE
+
+ /** \endcond */
+}
+
+template<cl_device_info param>
+typename detail::return_type<cl_device_id, param>::Result info(cl_device_id const & handle){
+    typedef typename detail::return_type<cl_device_id, param>::Result res_t;
+    return detail::get_info_impl<res_t>()(handle,param);
+}
+
+template<cl_mem_info param>
+typename detail::return_type<cl_mem, param>::Result info(cl_mem const & handle){
+    typedef typename detail::return_type<cl_mem, param>::Result res_t;
+    return detail::get_info_impl<res_t>()(handle,param);
+}
+
+//Program
+
+template<cl_program_info param>
+typename detail::return_type<cl_program, param>::Result info(cl_program const & handle){
+    typedef typename detail::return_type<cl_program, param>::Result res_t;
+    return detail::get_info_impl<res_t>()(handle,param);
+}
+
+template<>
+inline typename detail::return_type<cl_program, CL_PROGRAM_BINARIES>::Result info<CL_PROGRAM_BINARIES>(cl_program const & handle)
+{
+    std::vector<unsigned char*> res;
+    std::vector<size_t> sizes = info<CL_PROGRAM_BINARY_SIZES>(handle);
+    for(size_t s: sizes)
+      res.push_back(new unsigned char[s]);
+    dispatch::clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(unsigned char**), (void*)res.data(), NULL);
+    return res;
+}
+
+template<cl_program_build_info param>
+typename detail::return_type<cl_program, param>::Result info(cl_program const & phandle, cl_device_id const & dhandle){
+    typedef typename detail::return_type<cl_program, param>::Result res_t;
+    return detail::get_info_impl<res_t>()(phandle,dhandle,param);
+}
+
+//Kernel
+template<cl_kernel_info param>
+typename detail::return_type<cl_kernel, param>::Result info(cl_kernel const & handle){
+    typedef typename detail::return_type<cl_kernel, param>::Result res_t;
+    return detail::get_info_impl<res_t>()(handle,param);
+}
+
+template<cl_kernel_work_group_info param>
+typename detail::return_type<cl_kernel, param>::Result info(cl_kernel const & khandle, cl_device_id const & dhandle){
+    typedef typename detail::return_type<cl_kernel, param>::Result res_t;
+    return detail::get_info_impl<res_t>()(khandle,dhandle,param);
+}
+
+//Context
+template<cl_context_info param>
+typename detail::return_type<cl_context, param>::Result info(cl_context const & handle){
+    typedef typename detail::return_type<cl_context, param>::Result res_t;
+    return detail::get_info_impl<res_t>()(handle,param);
+}
+
+//Event
+template<cl_profiling_info param>
+typename
detail::return_type::Result info(cl_event const & handle){ + typedef typename detail::return_type::Result res_t; + return detail::get_info_impl()(handle,param); +} + +//Command queue +template +typename detail::return_type::Result info(cl_command_queue const & handle){ + typedef typename detail::return_type::Result res_t; + return detail::get_info_impl()(handle,param); +} + +//Plaftform +template +typename detail::return_type::Result info(cl_platform_id const & handle){ + typedef typename detail::return_type::Result res_t; + return detail::get_info_impl()(handle,param); +} + +template::type param> +typename detail::return_type::Result info(OCL_TYPE const & handle){ + return info(handle.get()); +} + + + +template::type param> +typename detail::return_type::Result info(OCL_TYPE const & handle, OCL_TYPE_ARG const & arg_handle){ + return info(handle.get(), arg_handle.get()); +} + +} +} +} +#endif // INFOS_HPP diff --git a/include/triton/ir/cfg.h b/include/triton/ir/cfg.h new file mode 100644 index 000000000..8a00a32ef --- /dev/null +++ b/include/triton/ir/cfg.h @@ -0,0 +1,20 @@ +#ifndef TDL_INCLUDE_IR_CFG_H +#define TDL_INCLUDE_IR_CFG_H + +#include + +namespace triton{ +namespace ir{ + +class function; +class basic_block; + +class cfg { +public: + static std::vector reverse_post_order(function* fn); +}; + +} +} + +#endif diff --git a/lib/codegen/target.cpp b/lib/codegen/target.cpp new file mode 100644 index 000000000..27a982a6c --- /dev/null +++ b/lib/codegen/target.cpp @@ -0,0 +1,118 @@ +#include "triton/codegen/target.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/IRBuilder.h" + +using namespace llvm; + +namespace triton{ +namespace codegen{ + +// base +bool target::is_gpu() const { + return is_gpu_; +} + +// AMD +void amd_cl_target::set_kernel(IRBuilder<>& builder, LLVMContext &ctx, Module *module, Function* fn) { + fn->setCallingConv(CallingConv::AMDGPU_KERNEL); +} + +Instruction* amd_cl_target::add_barrier(Module *module, IRBuilder<>& builder) { + Function *barrier = Intrinsic::getDeclaration(module, Intrinsic::amdgcn_s_barrier); + return builder.CreateCall(barrier, {}); +} + +Value* amd_cl_target::get_global_offset(Module *module, IRBuilder<>& builder, unsigned stride, unsigned ax) { + static std::array ids = { + Intrinsic::amdgcn_workgroup_id_x, + Intrinsic::amdgcn_workgroup_id_y, + Intrinsic::amdgcn_workgroup_id_z + }; + Value* get_group_id = Intrinsic::getDeclaration(module, ids[ax]); + Value* group_id = builder.CreateCall(get_group_id, {}); + Value* result = builder.CreateMul(builder.getInt32(stride), group_id); + return result; +} + +Value* amd_cl_target::get_local_id(Module *module, IRBuilder<>& builder, unsigned ax) { + static std::array ids = { + Intrinsic::amdgcn_workitem_id_x, + Intrinsic::amdgcn_workitem_id_y, + Intrinsic::amdgcn_workitem_id_z + }; + Function *get_local_id = Intrinsic::getDeclaration(module, ids[ax]); + return builder.CreateCall(get_local_id, {}); +} + +// NVIDIA + +void nvidia_cu_target::set_kernel(IRBuilder<>& builder, LLVMContext &ctx, Module *module, Function* fn){ + // set metadata + Metadata *md_args[] = { + ValueAsMetadata::get(fn), + MDString::get(ctx, "kernel"), + ValueAsMetadata::get(builder.getInt32(1)) + }; + module->getOrInsertNamedMetadata("nvvm.annotations")->addOperand(MDNode::get(ctx, md_args)); +} + +Instruction* nvidia_cu_target::add_barrier(Module *module, IRBuilder<>& builder) { + Function *barrier = Intrinsic::getDeclaration(module, 
Intrinsic::nvvm_barrier0); + return builder.CreateCall(barrier, {}); +} + +Value* nvidia_cu_target::get_global_offset(Module *module, IRBuilder<>& builder, unsigned stride, unsigned ax) { + static std::array ids = { + Intrinsic::nvvm_read_ptx_sreg_ctaid_x, + Intrinsic::nvvm_read_ptx_sreg_ctaid_y, + Intrinsic::nvvm_read_ptx_sreg_ctaid_z + }; + Value* get_group_id = Intrinsic::getDeclaration(module, ids[ax]); + Value* group_id = builder.CreateCall(get_group_id, {}); + Value* result = builder.CreateMul(builder.getInt32(stride), group_id); + return result; +} + +Value* nvidia_cu_target::get_local_id(Module *module, IRBuilder<>& builder, unsigned ax) { + static std::array ids = { + Intrinsic::nvvm_read_ptx_sreg_tid_x, + Intrinsic::nvvm_read_ptx_sreg_tid_y, + Intrinsic::nvvm_read_ptx_sreg_tid_z + }; + Function *get_local_id = Intrinsic::getDeclaration(module, ids[ax]); + return builder.CreateCall(get_local_id, {}); +} + +// CPU + +void cpu_target::set_kernel(IRBuilder<>& builder, LLVMContext &ctx, Module *module, Function* fn) { + // normal cpu functions can be kernels +} + +Instruction* cpu_target::add_barrier(Module *module, IRBuilder<>& builder) { + // no barrier on CPU + return (Instruction*)builder.CreateAdd(builder.getInt32(0), builder.getInt32(0)); +} + +Value* cpu_target::get_global_offset(Module *module, IRBuilder<>& builder, unsigned stride, unsigned ax) { + const Function *fn = builder.GetInsertBlock()->getParent(); + size_t num_params = fn->getFunctionType()->getNumParams(); + static std::array ids = { + fn->arg_begin() + num_params - 3, + fn->arg_begin() + num_params - 2, + fn->arg_begin() + num_params - 1 + }; + Value* result = builder.CreateMul(builder.getInt32(stride), (Argument*)ids[ax]); + return result; +} + +Value* cpu_target::get_local_id(Module *module, IRBuilder<>& builder, unsigned ax) { + return builder.getInt32(0); +} + + +} +} diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 8a9a35aa0..4353b1332 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -6,6 +6,7 @@ #include "triton/ir/function.h" #include "triton/ir/context_impl.h" #include "triton/ir/constant.h" +#include "triton/driver/device.h" #include @@ -242,12 +243,11 @@ bool tune::check_constraints(std::map> &er errors[i].push_back("for dim " + strk + ": shape (" + to_string(shapes[k]->get_value()) + ")" " is not a multiple of layout (" + to_string(multiple) + ")"); } - // the number of thread per warp must be 32 int num_threads = 1; for(size_t k = 0; k < shapes.size(); k++) num_threads *= params_[i]["mts.d" + to_string(k)]->get_value(); if(num_threads % 64 != 0) - errors[i].push_back("number of threads per block (" + to_string(num_threads) + ") must be multiple of 32"); + errors[i].push_back("number of threads per block (" + to_string(num_threads) + ") must be multiple of warp size"); if(num_threads != num_threads_) errors[i].push_back("Number of threads must be the same for all tiles (" + to_string(num_threads_) + ")"); } diff --git a/lib/ir/cfg.cpp b/lib/ir/cfg.cpp new file mode 100644 index 000000000..e1c0b6776 --- /dev/null +++ b/lib/ir/cfg.cpp @@ -0,0 +1,32 @@ +#include "triton/ir/cfg.h" +#include "triton/ir/basic_block.h" +#include "triton/ir/function.h" +#include +#include + +namespace triton{ +namespace ir{ + +std::vector cfg::reverse_post_order(function* fn) { + std::stack stack; + std::set visited; + std::vector result; + // initialize stack + for(ir::basic_block* block: fn->blocks()) + if(block->get_predecessors().empty()) + stack.push(block); + // DFS + while(!stack.empty()) { + 
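+    // pop the next block, append it to the ordering, and only then push
+    // its not-yet-visited successors (depth-first walk from the entry blocks)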
basic_block* current = stack.top(); + stack.pop(); + result.push_back(current); + visited.insert(current); + for(basic_block* succ: current->get_successors()) + if(visited.find(succ) == visited.end()) + stack.push(succ); + } + return std::move(result); +} + +} +} From 2c3ae0675ef83335c32539b07148dc2c4d03e405 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 27 Mar 2019 21:12:01 -0400 Subject: [PATCH 121/494] [JIT] re-added nvidia compatibility --- examples/matrix.cpp | 38 +++++++++++++------ include/triton/driver/device.h | 20 +++++++--- include/triton/driver/module.h | 9 ++++- include/triton/external/CL/cl.h | 2 +- include/triton/external/CL/cl_d3d10.h | 4 +- include/triton/external/CL/cl_d3d11.h | 4 +- .../triton/external/CL/cl_dx9_media_sharing.h | 4 +- include/triton/external/CL/cl_egl.h | 2 +- include/triton/external/CL/cl_ext.h | 2 +- include/triton/external/CL/cl_ext_intel.h | 4 +- include/triton/external/CL/cl_gl.h | 2 +- include/triton/external/CL/cl_gl_ext.h | 2 +- .../CL/cl_va_api_media_sharing_intel.h | 4 +- include/triton/external/CL/opencl.h | 8 ++-- include/triton/jit.h | 2 +- lib/codegen/tune.cpp | 2 +- lib/driver/device.cpp | 19 ++++++++++ lib/driver/module.cpp | 28 +++++++------- lib/jit.cpp | 3 +- 19 files changed, 106 insertions(+), 53 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 592d7d631..171c1d3f5 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -8,7 +8,7 @@ const char* src = R"( const tunable int32 TM = {16, 32, 64}; const tunable int32 TN = {16, 32, 64}; -const tunable int32 TK = {8, 16}; +const tunable int32 TK = {8}; void matmul(restrict read_only fp32 *a, restrict read_only fp32 *b, fp32 *c, int32 M, int32 N, int32 K, int32 bound){ @@ -19,18 +19,35 @@ void matmul(restrict read_only fp32 *a, restrict read_only fp32 *b, fp32 *c, fp32 C[TM, TN] = 0; fp32* pa[TM, TK] = a + rka[newaxis, :]*M + rxa[:, newaxis]; fp32* pb[TN, TK] = b + rkb[newaxis, :]*K + ryb[:, newaxis]; + fp32 a[TM, TK] = *pa; + fp32 b[TN, TK] = *pb; for(int32 k = K; k > 0;){ - fp32 a[TM, TK] = *pa; - fp32 b[TN, TK] = *pb; C = dot(a, b, C); pa = pa + TK*M; pb = pb + TK*K; k = k - TK; + int1 checka[TM, TK] = k > bound; + int1 checkb[TN, TK] = k > bound; + @checka a = *pa; + @checkb b = *pb; + if(k > bound) + continue; + int1 checka0[TM] = rxa < M; + int1 checka1[TK] = rka < k; + int1 checkb0[TN] = ryb < N; + int1 checkb1[TK] = rkb < k; + checka = checka0[:, newaxis] && checka1[newaxis, :]; + checkb = checkb0[:, newaxis] && checkb1[newaxis, :]; + a = checka ? *pa : 0; + b = checkb ? 
*pb : 0; } int32 rxc[TM] = get_global_range[TM](0); int32 ryc[TN] = get_global_range[TN](1); fp32* pc[TM, TN] = c + ryc[newaxis, :]*M + rxc[:, newaxis]; - *pc = C; + int1 checkc0[TM] = rxc < M; + int1 checkc1[TN] = ryc < N; + int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; + @checkc *pc = C; } )"; @@ -89,7 +106,7 @@ int main() { triton::jit jit(context); // matrix multiplication parameters - int32_t M = 256, N = 256, K = 256; + int32_t M = 512, N = 512, K = 512; std::vector hc(M*N); std::vector rc(M*N); std::vector ha(M*K); @@ -151,13 +168,12 @@ int main() { // just-in-time compile source-code std::vector params = { - 1, 4, 8, - 1, 4, 8, - 1, 1, 4, 4, - 1, 8, - 1, + 16, 2, 64, + 32, 2, 64, + 16, 8, 2, 2, + 8, 8, + 4 }; - params = {8, 2, 64, 16, 2, 64, 4, 16, 2, 2, 8, 8, 4}; // jit.autotune(src, benchmark); jit.add_module(src, params); diff --git a/include/triton/driver/device.h b/include/triton/driver/device.h index a08bd3cc8..34d299e91 100755 --- a/include/triton/driver/device.h +++ b/include/triton/driver/device.h @@ -29,6 +29,11 @@ namespace triton { +namespace codegen +{ +class target; +} + namespace driver { @@ -40,6 +45,7 @@ public: using polymorphic_resource::polymorphic_resource; virtual size_t max_threads_per_block() const = 0; virtual size_t max_shared_memory() const = 0; + virtual std::unique_ptr make_target() const = 0; }; // Host device @@ -48,6 +54,7 @@ public: host_device(): device(host_device_t(), true){ } size_t max_threads_per_block() const { return 1; } size_t max_shared_memory() const { return 0; } + std::unique_ptr make_target() const; }; // OpenCL device @@ -56,6 +63,7 @@ public: ocl_device(cl_device_id cl, bool take_ownership = true): device(cl, take_ownership) { } size_t max_threads_per_block() const; size_t max_shared_memory() const; + std::unique_ptr make_target() const; }; // CUDA device @@ -87,26 +95,28 @@ private: public: cu_device(CUdevice cu = CUdevice(), bool take_ownership = true): device(cu, take_ownership){} - //Accessors + // Accessors Architecture architecture() const; - //Informations + // Informations std::string infos() const; size_t address_bits() const; std::vector max_block_dim() const; size_t warp_size() const; - //Compute Capability + // Compute Capability void interpret_as(std::pair cc); std::pair compute_capability() const; - //Identifier + // Identifier std::string name() const; std::string pci_bus_id() const; - //Clocks + // Clocks size_t current_sm_clock() const; size_t current_mem_clock() const; size_t max_threads_per_block() const; size_t max_shared_memory() const; size_t max_sm_clock() const; size_t max_mem_clock() const; + // Target + std::unique_ptr make_target() const; private: std::shared_ptr> interpreted_as_; diff --git a/include/triton/driver/module.h b/include/triton/driver/module.h index 706b90c47..89ff3c39d 100755 --- a/include/triton/driver/module.h +++ b/include/triton/driver/module.h @@ -49,6 +49,11 @@ class module: public polymorphic_resource { protected: void init_llvm(); + enum file_type_t{ + Object, + Assembly + }; + public: module(driver::context* ctx, CUmodule mod, bool has_ownership); module(driver::context* ctx, cl_program mod, bool has_ownership); @@ -57,7 +62,9 @@ public: driver::context* context() const; void compile_llvm_module(llvm::Module* module, const std::string& triple, const std::string &proc, std::string layout, - llvm::SmallVectorImpl &buffer, std::vector files = {}); + llvm::SmallVectorImpl &buffer, + const std::string &features, + file_type_t file_type); protected: driver::context* 
ctx_; diff --git a/include/triton/external/CL/cl.h b/include/triton/external/CL/cl.h index 1e164eb83..8d58f8f77 100644 --- a/include/triton/external/CL/cl.h +++ b/include/triton/external/CL/cl.h @@ -32,7 +32,7 @@ #ifdef __APPLE__ #include #else -#include +#include "cl_platform.h" #endif #ifdef __cplusplus diff --git a/include/triton/external/CL/cl_d3d10.h b/include/triton/external/CL/cl_d3d10.h index d5960a43f..aebf6e7f9 100644 --- a/include/triton/external/CL/cl_d3d10.h +++ b/include/triton/external/CL/cl_d3d10.h @@ -32,8 +32,8 @@ #define __OPENCL_CL_D3D10_H #include -#include -#include +#include "cl.h" +#include "cl_platform.h" #ifdef __cplusplus extern "C" { diff --git a/include/triton/external/CL/cl_d3d11.h b/include/triton/external/CL/cl_d3d11.h index 39f907239..93a25b2a9 100644 --- a/include/triton/external/CL/cl_d3d11.h +++ b/include/triton/external/CL/cl_d3d11.h @@ -32,8 +32,8 @@ #define __OPENCL_CL_D3D11_H #include -#include -#include +#include "cl.h" +#include "cl_platform.h" #ifdef __cplusplus extern "C" { diff --git a/include/triton/external/CL/cl_dx9_media_sharing.h b/include/triton/external/CL/cl_dx9_media_sharing.h index 2729e8b9e..784ea6ba8 100644 --- a/include/triton/external/CL/cl_dx9_media_sharing.h +++ b/include/triton/external/CL/cl_dx9_media_sharing.h @@ -31,8 +31,8 @@ #ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H #define __OPENCL_CL_DX9_MEDIA_SHARING_H -#include -#include +#include "cl.h" +#include "cl_platform.h" #ifdef __cplusplus extern "C" { diff --git a/include/triton/external/CL/cl_egl.h b/include/triton/external/CL/cl_egl.h index a765bd526..73811ffd5 100644 --- a/include/triton/external/CL/cl_egl.h +++ b/include/triton/external/CL/cl_egl.h @@ -32,7 +32,7 @@ #ifdef __APPLE__ #else -#include +#include "cl.h" #endif #ifdef __cplusplus diff --git a/include/triton/external/CL/cl_ext.h b/include/triton/external/CL/cl_ext.h index b57190d16..a0a493545 100644 --- a/include/triton/external/CL/cl_ext.h +++ b/include/triton/external/CL/cl_ext.h @@ -42,7 +42,7 @@ extern "C" { #include #include #else - #include + #include "cl.h" #endif /* cl_khr_fp64 extension - no extension #define since it has no functions */ diff --git a/include/triton/external/CL/cl_ext_intel.h b/include/triton/external/CL/cl_ext_intel.h index 1c358cfc1..ca1971df6 100644 --- a/include/triton/external/CL/cl_ext_intel.h +++ b/include/triton/external/CL/cl_ext_intel.h @@ -56,8 +56,8 @@ Notes: #include #include #else - #include - #include + #include "cl.h" + #include "cl_platform.h" #endif #ifdef __cplusplus diff --git a/include/triton/external/CL/cl_gl.h b/include/triton/external/CL/cl_gl.h index 945daa83d..e18b0cf2d 100644 --- a/include/triton/external/CL/cl_gl.h +++ b/include/triton/external/CL/cl_gl.h @@ -32,7 +32,7 @@ #ifdef __APPLE__ #include #else -#include +#include "cl.h" #endif #ifdef __cplusplus diff --git a/include/triton/external/CL/cl_gl_ext.h b/include/triton/external/CL/cl_gl_ext.h index e3c14c640..65f34891e 100644 --- a/include/triton/external/CL/cl_gl_ext.h +++ b/include/triton/external/CL/cl_gl_ext.h @@ -41,7 +41,7 @@ extern "C" { #ifdef __APPLE__ #include #else - #include + #include "cl_gl.h" #endif /* diff --git a/include/triton/external/CL/cl_va_api_media_sharing_intel.h b/include/triton/external/CL/cl_va_api_media_sharing_intel.h index 284442885..e1d6d79a9 100644 --- a/include/triton/external/CL/cl_va_api_media_sharing_intel.h +++ b/include/triton/external/CL/cl_va_api_media_sharing_intel.h @@ -53,8 +53,8 @@ Notes: #ifndef __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H #define 
__OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H -#include -#include +#include "cl.h" +#include "cl_platform.h" #include #ifdef __cplusplus diff --git a/include/triton/external/CL/opencl.h b/include/triton/external/CL/opencl.h index 9855cd75e..f207750d6 100644 --- a/include/triton/external/CL/opencl.h +++ b/include/triton/external/CL/opencl.h @@ -44,10 +44,10 @@ extern "C" { #else -#include -#include -#include -#include +#include "cl.h" +#include "cl_gl.h" +#include "cl_gl_ext.h" +#include "cl_ext.h" #endif diff --git a/include/triton/jit.h b/include/triton/jit.h index b9b2f96c8..c033cf204 100644 --- a/include/triton/jit.h +++ b/include/triton/jit.h @@ -61,7 +61,7 @@ public: barriers.run(module); } vectorize.run(module); -// triton::ir::print(module, std::cout); + triton::ir::print(module, std::cout); } codegen::tune tune; diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 4353b1332..cf36e36c9 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -145,7 +145,7 @@ void tune::run(ir::module &mod) { // Layout parameters while(!nodes_.empty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 2, 4); + ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 2, 2); ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 4, 32); connected_components(*nodes_.begin(), {nts, mts}, nodes_, dependencies_); } diff --git a/lib/driver/device.cpp b/lib/driver/device.cpp index 62c41fb98..950ddc3f3 100755 --- a/lib/driver/device.cpp +++ b/lib/driver/device.cpp @@ -28,6 +28,7 @@ #include "triton/driver/helpers/CL/infos.hpp" #include "triton/driver/device.h" #include "triton/driver/context.h" +#include "triton/codegen/target.h" namespace triton { @@ -35,6 +36,14 @@ namespace triton namespace driver { +/* ------------------------ */ +// Host // +/* ------------------------ */ + +std::unique_ptr host_device::make_target() const { + return std::unique_ptr(new codegen::cpu_target()); +} + /* ------------------------ */ // OpenCL // @@ -49,6 +58,10 @@ size_t ocl_device::max_threads_per_block() const { return ocl::info(*cl_).at(0); } +std::unique_ptr ocl_device::make_target() const { + return std::unique_ptr(new codegen::amd_cl_target()); +} + /* ------------------------ */ // CUDA // /* ------------------------ */ @@ -216,6 +229,12 @@ std::string cu_device::infos() const{ return oss.str(); } +// target +std::unique_ptr cu_device::make_target() const { + return std::unique_ptr(new codegen::nvidia_cu_target()); +} + + } } diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 72639c515..e822794a5 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -53,10 +53,6 @@ #include "llvm/ExecutionEngine/OrcMCJITReplacement.h" #include #include "llvm/Transforms/Utils/Cloning.h" -#include "lld/Common/Driver.h" -#include "lld/Common/Args.h" -#include "lld/Common/ErrorHandler.h" -#include "lld/Common/LLVM.h" namespace triton { @@ -107,12 +103,9 @@ module* module::create(driver::context* ctx, llvm::Module *src) { void module::compile_llvm_module(llvm::Module* module, const std::string& triple, const std::string &proc, std::string layout, llvm::SmallVectorImpl &buffer, - std::vector paths) { + const std::string& features, + file_type_t ft) { init_llvm(); -// llvm::legacy::PassManager passes; -// passes.add(llvm::createPrintModulePass(llvm::outs())); -// passes.add(llvm::createVerifierPass()); -// passes.run(*module); // create machine module->setTargetTriple(triple); std::string error; @@ -122,7 +115,7 @@ void 
module::compile_llvm_module(llvm::Module* module, const std::string& triple opt.UnsafeFPMath = false; opt.NoInfsFPMath = false; opt.NoNaNsFPMath = true; - llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, "code-object-v3", opt, + llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, features, opt, llvm::Reloc::PIC_, llvm::None, llvm::CodeGenOpt::Aggressive); // set data layout if(layout.empty()) @@ -134,7 +127,14 @@ void module::compile_llvm_module(llvm::Module* module, const std::string& triple f.addFnAttr(llvm::Attribute::AlwaysInline); llvm::legacy::PassManager pass; llvm::raw_svector_ostream stream(buffer); - machine->addPassesToEmitFile(pass, stream, nullptr, llvm::TargetMachine::CGFT_ObjectFile); + // convert triton file type to llvm file type + auto ll_file_type = [&](module::file_type_t type){ + if(type == Object) + return llvm::TargetMachine::CGFT_ObjectFile; + return llvm::TargetMachine::CGFT_AssemblyFile; + }; + // emit + machine->addPassesToEmitFile(pass, stream, nullptr, ll_file_type(ft)); pass.run(*module); } @@ -149,7 +149,7 @@ host_module::host_module(driver::context * context, llvm::Module* src): module(c // std::string triple = llvm::sys::getDefaultTargetTriple(); // std::string cpu = llvm::sys::getHostCPUName(); // llvm::SmallVector buffer; -// module::compile_llvm_module(src, triple, cpu, "", buffer); +// module::compile_llvm_module(src, triple, cpu, "", buffer, "", Assembly); // create kernel wrapper llvm::LLVMContext &ctx = src->getContext(); @@ -202,7 +202,7 @@ host_module::host_module(driver::context * context, llvm::Module* src): module(c ocl_module::ocl_module(driver::context * context, llvm::Module* src): module(context, cl_program(), true) { init_llvm(); llvm::SmallVector buffer; - module::compile_llvm_module(src, "amdgcn-amd-amdhsa-amdgizcl", "gfx902", "", buffer); + module::compile_llvm_module(src, "amdgcn-amd-amdhsa-amdgizcl", "gfx902", "", buffer, "code-object-v3", Object); std::ofstream output("/tmp/tmp.o", std::ios::binary); std::copy(buffer.begin(), buffer.end(), std::ostreambuf_iterator(output)); system("ld.lld-8 /tmp/tmp.o -shared -o /tmp/tmp.o"); @@ -243,7 +243,7 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { layout += "-i64:64-i128:128-v16:16-v32:32-n16:32:64"; // create llvm::SmallVector buffer; - module::compile_llvm_module(module, "nvptx64-nvidia-cuda", "sm_52", layout, buffer); + module::compile_llvm_module(module, "nvptx64-nvidia-cuda", "sm_52", layout, buffer, "", Assembly); return std::string(buffer.begin(), buffer.end()); } diff --git a/lib/jit.cpp b/lib/jit.cpp index c037be2cc..068a824f0 100644 --- a/lib/jit.cpp +++ b/lib/jit.cpp @@ -92,7 +92,8 @@ std::unique_ptr jit::make_triton_module(const std::string &src) { } -jit::jit(driver::context *context): driver_context_(context), target_(new triton::codegen::amd_cl_target()) { +jit::jit(driver::context *context): driver_context_(context), + target_(context->device()->make_target()) { } From 0c607c9392622b697e99d1c03f2d50504aeb3d51 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 28 Mar 2019 07:11:06 -0400 Subject: [PATCH 122/494] [examples] normalize benchmark by max_clock / current_clock --- examples/matrix.cpp | 25 ++++++++++++++++-------- include/triton/driver/device.h | 1 + include/triton/driver/dispatch.h | 2 ++ include/triton/jit.h | 1 - lib/ast/lowering.cpp | 2 +- lib/codegen/tune.cpp | 2 +- lib/driver/device.cpp | 33 ++++++++++++++++++-------------- lib/driver/dispatch.cpp | 2 
++ lib/driver/module.cpp | 5 +---- 9 files changed, 44 insertions(+), 29 deletions(-) diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 171c1d3f5..e630e5164 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -87,19 +87,28 @@ T min(std::vector x) template -double bench(OP const & op, SYNC const & sync, unsigned repeat = 20) +double bench(OP const & op, SYNC const & sync, triton::driver::device const & device) { timer tmr; + std::vector times; + double total_time = 0; op(); sync(); - tmr.start(); - for(unsigned i = 0; i < repeat; i++) + while(total_time*1e-9 < 1e-3){ + float norm = 1; + // normalize clock if possible to get roughly constant result + if(auto cu_device = dynamic_cast(&device)) + norm = (float)cu_device->current_sm_clock()/cu_device->max_sm_clock(); + tmr.start(); op(); - sync(); - double time = tmr.get().count(); - return time / repeat; + sync(); + times.push_back(norm*tmr.get().count()); + total_time+=times.back(); + } + return min(times); } + int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); @@ -159,7 +168,7 @@ int main() { stream->synchronize(); // benchmark double ts = bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});}, - [&](){ stream->synchronize(); }); + [&](){ stream->synchronize(); }, *context->device()); ts = ts * 1e-9; double tflops = 2.*M*N*K / ts * 1e-12; return tflops; @@ -175,7 +184,7 @@ int main() { 4 }; -// jit.autotune(src, benchmark); + jit.autotune(src, benchmark); jit.add_module(src, params); triton::driver::kernel* kernel = jit.get_function("matmul"); triton::jit::launch_information info = jit.get_launch_info("matmul"); diff --git a/include/triton/driver/device.h b/include/triton/driver/device.h index 34d299e91..f4a786a31 100755 --- a/include/triton/driver/device.h +++ b/include/triton/driver/device.h @@ -115,6 +115,7 @@ public: size_t max_shared_memory() const; size_t max_sm_clock() const; size_t max_mem_clock() const; + void set_max_clock(); // Target std::unique_ptr make_target() const; diff --git a/include/triton/driver/dispatch.h b/include/triton/driver/dispatch.h index 2d06bb397..71411b1ca 100755 --- a/include/triton/driver/dispatch.h +++ b/include/triton/driver/dispatch.h @@ -165,6 +165,7 @@ public: static nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2( const char* pciBusId, nvmlDevice_t* device); static nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); + static nvmlReturn_t nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int mem_clock, unsigned int sm_clock); static cublasHandle_t cublasHandle(driver::cu_context const & ctx); static cublasStatus_t cublasCreate_v2(cublasHandle_t* h); @@ -281,6 +282,7 @@ private: static void* nvmlDeviceGetHandleByPciBusId_v2_; static void* nvmlDeviceGetClockInfo_; static void* nvmlDeviceGetMaxClockInfo_; + static void* nvmlDeviceSetApplicationsClocks_; // cuBLAS static void* cublasCreate_v2_; static void* cublasGetStream_v2_; diff --git a/include/triton/jit.h b/include/triton/jit.h index c033cf204..c4809d254 100644 --- a/include/triton/jit.h +++ b/include/triton/jit.h @@ -61,7 +61,6 @@ public: barriers.run(module); } vectorize.run(module); - triton::ir::print(module, std::cout); } codegen::tune tune; diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 84dcbcf3b..77ba26464 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ 
-234,7 +234,7 @@ ir::type* tile::type_impl(ir::module *mod, ir::type *type, storage_spec_vec_cons // Pointer ir::type* pointer::type_impl(ir::module*, ir::type *type, storage_spec_vec_const_ref_t storage) const{ bool is_ptr_to_const = std::find(storage.begin(), storage.end(), CONSTANT_SPACE_T) != storage.end(); - return ir::pointer_type::get(type, is_ptr_to_const?4:0); + return ir::pointer_type::get(type, is_ptr_to_const?4:1); } // Function diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index cf36e36c9..4353b1332 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -145,7 +145,7 @@ void tune::run(ir::module &mod) { // Layout parameters while(!nodes_.empty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 2, 2); + ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 2, 4); ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 4, 32); connected_components(*nodes_.begin(), {nts, mts}, nodes_, dependencies_); } diff --git a/lib/driver/device.cpp b/lib/driver/device.cpp index 950ddc3f3..ae66c50c8 100755 --- a/lib/driver/device.cpp +++ b/lib/driver/device.cpp @@ -1,22 +1,22 @@ /* Copyright 2015-2017 Philippe Tillet -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files -* (the "Software"), to deal in the Software without restriction, -* including without limitation the rights to use, copy, modify, merge, -* publish, distribute, sublicense, and/or sell copies of the Software, -* and to permit persons to whom the Software is furnished to do so, +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, * subject to the following conditions: -* -* The above copyright notice and this permission notice shall be +* +* The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ @@ -217,6 +217,11 @@ size_t cu_device::max_mem_clock() const{ return result; } +// max memory clock +void cu_device::set_max_clock() { + dispatch::nvmlDeviceSetApplicationsClocks(nvml_device(), max_mem_clock(), max_sm_clock()); +} + // print infos std::string cu_device::infos() const{ std::ostringstream oss; diff --git a/lib/driver/dispatch.cpp b/lib/driver/dispatch.cpp index 4dfd6df6e..7bb0fd001 100755 --- a/lib/driver/dispatch.cpp +++ b/lib/driver/dispatch.cpp @@ -199,6 +199,7 @@ CUDA_DEFINE1(CUresult, cuCtxPopCurrent_v2, CUcontext*) NVML_DEFINE2(nvmlReturn_t, nvmlDeviceGetHandleByPciBusId_v2, const char *, nvmlDevice_t*) NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*) NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*) +NVML_DEFINE3(nvmlReturn_t, nvmlDeviceSetApplicationsClocks, nvmlDevice_t, unsigned int, unsigned int) cublasHandle_t dispatch::cublasHandle(const cu_context &ctx){ static std::map handles; @@ -391,6 +392,7 @@ void* dispatch::nvmlInit_v2_; void* dispatch::nvmlDeviceGetHandleByPciBusId_v2_; void* dispatch::nvmlDeviceGetClockInfo_; void* dispatch::nvmlDeviceGetMaxClockInfo_; +void* dispatch::nvmlDeviceSetApplicationsClocks_; void* dispatch::cublasCreate_v2_; void* dispatch::cublasGetStream_v2_; diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index e822794a5..8346961fe 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -106,6 +106,7 @@ void module::compile_llvm_module(llvm::Module* module, const std::string& triple const std::string& features, file_type_t ft) { init_llvm(); + // create machine module->setTargetTriple(triple); std::string error; @@ -179,10 +180,6 @@ host_module::host_module(driver::context * context, llvm::Module* src): module(c // create execution engine -// llvm::legacy::PassManager pass; -// pass.add(llvm::createPrintModulePass(llvm::outs())); -// pass.add(llvm::createVerifierPass()); -// pass.run(*src); auto cloned = llvm::CloneModule(*src); for(llvm::Function& fn: cloned->functions()) hst_->functions[fn.getName()] = &fn; From 3413aad582414cc843b81bd974893ee652e3f214 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 25 Apr 2019 16:17:36 -0400 Subject: [PATCH 123/494] [general] major overhaul of triton-c/triton-ir/triton-jit: - Added alloc const - Added atomics - Pruning tuning space - Added example for dot/conv/shift - Bugfixes --- examples/CMakeLists.txt | 7 +- examples/cpp/CMakeLists.txt | 6 + examples/{matrix.cpp => cpp/blocksparse.cpp} | 87 ++---- examples/cpp/common.hpp | 286 ++++++++++++++++++ examples/cpp/conv.cpp | 236 +++++++++++++++ examples/cpp/dot.cpp | 162 ++++++++++ examples/cpp/shift.cpp | 212 +++++++++++++ examples/cpp/shift.ptx | 93 ++++++ include/triton/ast/ast.h | 68 ++++- include/triton/ast/parser.y | 9 +- include/triton/ast/scanner.l | 8 +- include/triton/codegen/layout.h | 45 --- include/triton/codegen/optimize_cse.h | 27 ++ include/triton/codegen/optimize_dot.h | 31 ++ include/triton/codegen/optimize_trans.h | 33 ++ include/triton/codegen/selection.h | 13 +- include/triton/codegen/shared_copy.h | 41 --- .../{allocation.h => shmem_allocation.h} | 12 +- .../codegen/{barriers.h => shmem_barriers.h} | 12 +- .../codegen/{buffer_info.h => shmem_info.h} | 5 +- .../codegen/{liveness.h => shmem_liveness.h} | 8 +- include/triton/codegen/target.h | 4 + include/triton/driver/module.h | 2 +- include/triton/ir/builder.h | 7 +- include/triton/ir/constant.h | 1 + include/triton/ir/instructions.h | 82 ++++- 
include/triton/ir/module.h | 1 + include/triton/jit.h | 61 ++-- lib/ast/lowering.cpp | 148 ++++++--- lib/codegen/buffer_info.cpp | 90 ------ lib/codegen/layout.cpp | 56 ---- lib/codegen/loop_info.cpp | 0 lib/codegen/optimize_cse.cpp | 14 + lib/codegen/optimize_dot.cpp | 50 +++ lib/codegen/optimize_trans.cpp | 71 +++++ lib/codegen/selection.cpp | 154 +++++++--- lib/codegen/shared_copy.cpp | 40 --- .../{allocation.cpp => shmem_allocation.cpp} | 13 +- .../{barriers.cpp => shmem_barriers.cpp} | 40 +-- lib/codegen/shmem_info.cpp | 135 +++++++++ .../{liveness.cpp => shmem_liveness.cpp} | 18 +- lib/codegen/target.cpp | 28 +- lib/codegen/tune.cpp | 35 ++- lib/driver/buffer.cpp | 3 - lib/driver/module.cpp | 11 +- lib/ir/builder.cpp | 24 +- lib/ir/instructions.cpp | 89 +++++- lib/ir/module.cpp | 3 + lib/ir/type.cpp | 2 +- lib/jit.cpp | 38 ++- 50 files changed, 2051 insertions(+), 570 deletions(-) create mode 100644 examples/cpp/CMakeLists.txt rename examples/{matrix.cpp => cpp/blocksparse.cpp} (69%) create mode 100644 examples/cpp/common.hpp create mode 100644 examples/cpp/conv.cpp create mode 100644 examples/cpp/dot.cpp create mode 100644 examples/cpp/shift.cpp create mode 100644 examples/cpp/shift.ptx delete mode 100644 include/triton/codegen/layout.h create mode 100644 include/triton/codegen/optimize_cse.h create mode 100644 include/triton/codegen/optimize_dot.h create mode 100644 include/triton/codegen/optimize_trans.h delete mode 100644 include/triton/codegen/shared_copy.h rename include/triton/codegen/{allocation.h => shmem_allocation.h} (79%) rename include/triton/codegen/{barriers.h => shmem_barriers.h} (82%) rename include/triton/codegen/{buffer_info.h => shmem_info.h} (84%) rename include/triton/codegen/{liveness.h => shmem_liveness.h} (90%) delete mode 100644 lib/codegen/buffer_info.cpp delete mode 100644 lib/codegen/layout.cpp delete mode 100644 lib/codegen/loop_info.cpp create mode 100644 lib/codegen/optimize_cse.cpp create mode 100644 lib/codegen/optimize_dot.cpp create mode 100644 lib/codegen/optimize_trans.cpp delete mode 100644 lib/codegen/shared_copy.cpp rename lib/codegen/{allocation.cpp => shmem_allocation.cpp} (91%) rename lib/codegen/{barriers.cpp => shmem_barriers.cpp} (75%) create mode 100644 lib/codegen/shmem_info.cpp rename lib/codegen/{liveness.cpp => shmem_liveness.cpp} (67%) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index e577a1d81..2322a85f7 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,6 +1 @@ -foreach(PROG matrix) - add_executable(${PROG} ${PROG}.cpp) - set_target_properties(${PROG} PROPERTIES OUTPUT_NAME ${PROG}) - include_directories(/usr/local/cuda/include/) - target_link_libraries(${PROG} triton) -endforeach(PROG) +add_subdirectory(cpp) diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt new file mode 100644 index 000000000..db1e5421f --- /dev/null +++ b/examples/cpp/CMakeLists.txt @@ -0,0 +1,6 @@ +foreach(PROG dot conv shift) + add_executable(${PROG} ${PROG}.cpp) + set_target_properties(${PROG} PROPERTIES OUTPUT_NAME ${PROG}) + include_directories(/usr/local/cuda/include/) + target_link_libraries(${PROG} triton) +endforeach(PROG) diff --git a/examples/matrix.cpp b/examples/cpp/blocksparse.cpp similarity index 69% rename from examples/matrix.cpp rename to examples/cpp/blocksparse.cpp index e630e5164..5a816aff1 100644 --- a/examples/matrix.cpp +++ b/examples/cpp/blocksparse.cpp @@ -1,17 +1,18 @@ #include #include +#include "common.hpp" #include "triton/jit.h" #include "triton/driver/backend.h" 
#include "triton/driver/stream.h" const char* src = R"( -const tunable int32 TM = {16, 32, 64}; -const tunable int32 TN = {16, 32, 64}; +const tunable int32 TM = {16, 32, 64, 128}; +const tunable int32 TN = {8}; const tunable int32 TK = {8}; -void matmul(restrict read_only fp32 *a, restrict read_only fp32 *b, fp32 *c, - int32 M, int32 N, int32 K, int32 bound){ +void blocksparse(restrict read_only fp32 *a, restrict read_only fp32 *b, fp32 *c, + int32 M, int32 N, int32 K, int32 bound){ int32 rxa[TM] = get_global_range[TM](0); int32 ryb[TN] = get_global_range[TN](1); int32 rka[TK] = 0 ... TK; @@ -22,9 +23,9 @@ void matmul(restrict read_only fp32 *a, restrict read_only fp32 *b, fp32 *c, fp32 a[TM, TK] = *pa; fp32 b[TN, TK] = *pb; for(int32 k = K; k > 0;){ - C = dot(a, b, C); + C = dot(a, trans(b), C); pa = pa + TK*M; - pb = pb + TK*K; + pb = pb + TK*N; k = k - TK; int1 checka[TM, TK] = k > bound; int1 checkb[TN, TK] = k > bound; @@ -51,71 +52,24 @@ void matmul(restrict read_only fp32 *a, restrict read_only fp32 *b, fp32 *c, } )"; - -template -void simple_gemm(std::vector &c, const std::vector &a, const std::vector &b, size_t M, size_t N, size_t K){ - for(size_t m = 0; m < M; m++) - for(size_t n = 0; n < N; n++){ - T acc = 0; - for(size_t k = 0; k < K; k++) - acc += a[m + k*M] * b[n + k*N]; - c[m + n*M] = acc; +std::vector make_deltas(std::vector mask, int K, int N){ + std::vector>> pairs(N); + unsigned int current = 0; + for(int k = 0; k < K; k++) + for(int n = 0; n < N; n++){ + if(mask[k + n*K]) + pairs[n].push_back({current, k}); } } -class timer{ - typedef std::chrono::high_resolution_clock high_resolution_clock; - typedef std::chrono::nanoseconds nanoseconds; - -public: - explicit timer(bool run = false) - { if (run) start(); } - - void start() - { _start = high_resolution_clock::now(); } - - nanoseconds get() const - { return std::chrono::duration_cast(high_resolution_clock::now() - _start); } - -private: - high_resolution_clock::time_point _start; -}; - -template -T min(std::vector x) -{ return *std::min_element(x.begin(), x.end()); } - - -template -double bench(OP const & op, SYNC const & sync, triton::driver::device const & device) -{ - timer tmr; - std::vector times; - double total_time = 0; - op(); - sync(); - while(total_time*1e-9 < 1e-3){ - float norm = 1; - // normalize clock if possible to get roughly constant result - if(auto cu_device = dynamic_cast(&device)) - norm = (float)cu_device->current_sm_clock()/cu_device->max_sm_clock(); - tmr.start(); - op(); - sync(); - times.push_back(norm*tmr.get().count()); - total_time+=times.back(); - } - return min(times); -} - - int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); triton::jit jit(context); + // matrix multiplication parameters - int32_t M = 512, N = 512, K = 512; + int32_t M = 512, N = 32, K = 2048; std::vector hc(M*N); std::vector rc(M*N); std::vector ha(M*K); @@ -183,14 +137,13 @@ int main() { 8, 8, 4 }; - - jit.autotune(src, benchmark); - jit.add_module(src, params); + jit.autotune("matmul",src, benchmark); + jit.add_module("matmul", src, params); triton::driver::kernel* kernel = jit.get_function("matmul"); triton::jit::launch_information info = jit.get_launch_info("matmul"); - std::cout << benchmark(kernel, info) << std::endl; + std::cout << "Performance: " << benchmark(kernel, info) << " TFLOPS " << std::endl; stream->read(dc, true, 0, hc); - simple_gemm(rc, ha, hb, M, N, K); + simple_gemm(rc, ha, hb, M, N, K); for(size_t i = 0; i < M*N; i++) if(std::abs(hc[i] 
- rc[i])/std::max(hc[i], rc[i]) > 1e-4){ std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; diff --git a/examples/cpp/common.hpp b/examples/cpp/common.hpp new file mode 100644 index 000000000..8a16b9457 --- /dev/null +++ b/examples/cpp/common.hpp @@ -0,0 +1,286 @@ +#include +#include +#include "triton/driver/device.h" +#include + +template +void simple_gemm(std::vector &c, const std::vector &a, const std::vector &b, size_t M, size_t N, size_t K){ + for(size_t m = 0; m < M; m++) + for(size_t n = 0; n < N; n++){ + T acc = 0; + for(size_t k = 0; k < K; k++) + acc += (AT?a[k + m*K]:a[m + k*M]) * (BT?b[n + k*N]:b[k + n*K]); + c[m + n*M] = acc; + } +} + + +class timer{ + typedef std::chrono::high_resolution_clock high_resolution_clock; + typedef std::chrono::nanoseconds nanoseconds; + +public: + explicit timer(bool run = false) + { if (run) start(); } + + void start() + { _start = high_resolution_clock::now(); } + + nanoseconds get() const + { return std::chrono::duration_cast(high_resolution_clock::now() - _start); } + +private: + high_resolution_clock::time_point _start; +}; + +template +T min(std::vector x) +{ return *std::min_element(x.begin(), x.end()); } + + +template +double bench(OP const & op, SYNC const & sync, triton::driver::device const & device) +{ + timer tmr; + std::vector times; + double total_time = 0; + op(); + sync(); + while(total_time*1e-9 < 1e-3){ + float norm = 1; + // normalize clock if possible to get roughly constant result + if(auto cu_device = dynamic_cast(&device)) + norm = (float)cu_device->current_sm_clock()/cu_device->max_sm_clock(); + tmr.start(); + op(); + sync(); + times.push_back(norm*tmr.get().count()); + total_time+=times.back(); + } + return min(times); +} + +// + +void build_conv_lut(int TK, + int stride_d, int stride_h, int stride_w, int stride_c, + int pad_d, int pad_h, int pad_w, + int T, int R, int S, + std::vector& res, std::vector& masks) { + /* convolution parameters */ + int F = T * R * S; + int Nlut = (TK + F - 1) / F * F; + int upsample_w = 1; + int upsample_h = 1; + int upsample_d = 1; + /* unpack index wrt filters */ + auto unpack = [&](int32_t trs){ + int32_t tr = trs / S; + int32_t s = trs - tr*S; + int32_t t = tr / R; + int32_t r = tr - t*R; + return std::make_tuple(t, r, s); + }; + /* increments */ + for(size_t i = 0; i < Nlut; ++i) + res[i] = (((i + TK) % Nlut) - i); + /* deltas */ + size_t Ds0 = Nlut; + size_t Ds1 = upsample_w; + size_t Ds2 = upsample_h; + size_t Ds3 = upsample_d; + for(size_t pd = 0; pd < Ds3; ++pd) + for(size_t ph = 0; ph < Ds2; ++ph) + for(size_t pw = 0; pw < Ds1; ++pw){ + int32_t* deltas_ptr = &res[Nlut + pw*Ds0 + ph*Ds0*Ds1 + pd*Ds0*Ds1*Ds2]; + // cumulative increments + for(size_t i = 0; i < Ds0; ++i){ + int32_t ctrs = i; + int32_t c = ctrs / F; + int32_t t, r, s; + std::tie(t, r, s) = unpack(ctrs % F); + // next indices + int32_t nextctrs = ctrs + TK; + int32_t nextc = nextctrs / F; + int32_t nextt, nextr, nexts; + std::tie(nextt, nextr, nexts) = unpack(nextctrs % F); + // diffs + int32_t cdiff = nextc - c; + int32_t tdiff = (nextt + pd)/upsample_d - (t + pd)/upsample_d; + int32_t rdiff = (nextr + ph)/upsample_h - (r + ph)/upsample_h; + int32_t sdiff = (nexts + pw)/upsample_w - (s + pw)/upsample_w; + // delta pointers + deltas_ptr[i] = cdiff*stride_c + sdiff*stride_w + rdiff*stride_h + tdiff*stride_d; + } + } + + /* Masks */ + size_t Ms0 = Nlut; + size_t Ms1 = 2*pad_w + 1; + size_t Ms2 = 2*pad_h + 1; + size_t Ms3 = 2*pad_d + 1; + + for(size_t pd = 0; pd < Ms3; ++pd) + for(size_t ph = 0; ph < Ms2; ++ph) + 
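+      // one group of Nlut bit-masks per (pd, ph, pw) window position; bit j of
+      // a mask says whether the j-th of the TK filter taps starting at entry i
+      // falls inside the padded image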
for(size_t pw = 0; pw < Ms1; ++pw){ + int32_t* masks_ptr = &masks[Nlut + pw*Ms0 + ph*Ms0*Ms1 + pd*Ms0*Ms1*Ms2]; + for(size_t i = 0; i < Ms0; ++i){ + int32_t t, r, s; + int32_t mask = 0x0; + for(size_t j = 0; j < TK; ++j){ + std::tie(t, r, s) = unpack((i + j) % F); + bool in_bounds_d = (t + pd) >= pad_d && (t + pd) < (T + pad_d); + bool in_bounds_h = (r + ph) >= pad_h && (r + ph) < (R + pad_h); + bool in_bounds_w = (s + pw) >= pad_w && (s + pw) < (S + pad_w); + mask |= (in_bounds_d && in_bounds_h && in_bounds_w) << j; + } + masks_ptr[i] = mask; + } + } + for(size_t i = 0; i < Nlut; ++i) + masks[i] = 0x0; +} + + +// Index computation +inline int32_t idx(int32_t x, int32_t y, int32_t z, int32_t w, int32_t u, + int32_t /*s0*/, int32_t s1, int32_t s2, int32_t s3, int32_t s4) +{ return u + w*s4 + z*s4*s3 + y*s4*s3*s2 + x*s4*s3*s2*s1; } + + +// Pack + +template T clamp(T x, T lo, T hi){ + return std::max(lo, std::min(x, hi)); +} + + +template +T pack(U* tmp, U scale); + +template<> +double pack(double* tmp, double scale) +{ return tmp[0]*scale; } + +template<> +float pack(float* tmp, float scale) +{ return tmp[0]*scale; } + +template<> +int pack(float* tmp, float scale) +{ + int res = 0; + for(int i = 0; i < 4; i++){ + int8_t clamped = std::round(clamp(tmp[i]*scale, (float)-128, (float)127)); + res |= (clamped & 0xFF) << (8*i); + } + return res; +} + +template struct pack_increment +{ enum{ VALUE = 1}; }; + +template<> struct pack_increment +{ enum{ VALUE = 4}; }; + +// Dot +template +inline T dot(T x, T y, T z) +{ + return std::fma(x, y, z); +} + +inline int dot(int x, int y, int z){ + int res = 0; + for(int i = 0; i < 4; i++){ + int32_t a = ((x >> (8*i)) & 0x000000FF); + int32_t b = ((y >> (8*i)) & 0x000000FF); + res += (*(int8_t*)(&a)) * (*(int8_t*)(&b)); + } + return res + z; +} + + + +template +void cpp_conv_nchw(int32_t C, int32_t N, int32_t K, + int32_t D, int32_t H, int32_t W, + int32_t T, int32_t R, int32_t S, + int32_t pad_d, int32_t pad_h, int32_t pad_w, + int32_t stride_d, int32_t stride_h, int32_t stride_w, + int32_t M, int32_t P, int32_t Q, + std::vector& O, + const std::vector& I, + const std::vector& F) +{ + static const int PACK_IN = pack_increment::VALUE; + static const int PACK_OUT = pack_increment::VALUE; + if(C % PACK_IN != 0) throw std::runtime_error("Number of input channels must be a multiple of 4"); + if(K % PACK_OUT != 0) throw std::runtime_error("Number of output channels must be a multiple of 4"); + C /= PACK_IN; + K /= PACK_OUT; + int32_t Kout = K; + IN_DTYPE accs[PACK_OUT]; + float tmp[PACK_OUT]; + for(int32_t m = 0 ; m < M; ++m) + for(int32_t p = 0 ; p < P; ++p) + for(int32_t q = 0; q < Q; ++q) + for(int32_t n = 0; n < N; ++n) + for(int32_t k = 0; k < Kout ; ++k) + { + for(int32_t i = 0; i < PACK_OUT; ++i) + accs[i] = 0; + int32_t mm = m*stride_d - pad_d; + int32_t pp = p*stride_h - pad_h; + int32_t qq = q*stride_w - pad_w; + for(int32_t kk = 0; kk < PACK_OUT; ++kk) + for(int32_t c = 0; c < C; ++c) + for(int32_t t = 0; t < T; ++t) + for(int32_t r = 0; r < R; ++r) + for(int32_t s = 0; s < S; ++s){ + int32_t d = mm + t; + int32_t h = pp + r; + int32_t w = qq + s; + bool in_bounds = (d >= 0 && h >= 0 && w >= 0 && d < D && h < H && w < W); + IN_DTYPE i = in_bounds?I[idx(n, c, d, h, w, N, C, D, H, W)]:0; + IN_DTYPE f = F[idx(c, t, r, s, k*PACK_OUT + kk, C, T, R, S, K*PACK_OUT)]; + accs[kk] = dot(i, f, accs[kk]); + } + for(int32_t kk = 0; kk < PACK_OUT; ++kk){ + tmp[kk] = accs[kk]; + } + O[idx(n, k, m, p, q, N, K, M, P, Q)] = tmp[0]; + } +} + + +// input layout: C, H, W, 
BS +// filter layout: C, K +// output layout: K, H, W, BS +template +void shift_conv(int32_t C, int32_t H, int32_t W, int32_t BS, + int32_t K, + std::vector& O, + const std::vector& I, + const std::vector& F, + const std::vector shift_h, + const std::vector shift_w) +{ + OUT_DTYPE acc; + for(int32_t p = 0; p < H; ++p) + for(int32_t q = 0; q < W; ++q) + for(int32_t bs = 0; bs < BS; ++bs) + for(int32_t k = 0; k < K; ++k) + { + acc = 0; + for(int32_t c = 0; c < C; ++c){ + int32_t h = p + shift_h[c]; + int32_t w = q + shift_w[c]; + bool in_bounds = (h >= 0 && w >= 0 && h < H && w < W); + IN_DTYPE a = in_bounds?I[bs + w*BS + h*BS*W + c*BS*H*W]:0; + IN_DTYPE b = F[k + c*K]; + acc = dot(a, b, acc); + } + O[bs + q*BS + p*BS*W + k*BS*H*W] = acc; + } +} diff --git a/examples/cpp/conv.cpp b/examples/cpp/conv.cpp new file mode 100644 index 000000000..721489b9f --- /dev/null +++ b/examples/cpp/conv.cpp @@ -0,0 +1,236 @@ +#include +#include +#include "common.hpp" +#include "triton/jit.h" +#include "triton/driver/backend.h" +#include "triton/driver/stream.h" + +std::string src = +R"( +const tunable int32 TM = {16, 32, 64}; +const tunable int32 TN = {16, 32, 64}; +const tunable int32 TK = {8}; + +__constant__ int32* delta = alloc_const int32[18]; +__constant__ int32* masks = alloc_const int32[1024]; + +void conv(read_only restrict fp32 *a, + read_only restrict fp32 *b, + fp32 *c, + int32 M, int32 N, int32 K, + int32 AN, int32 AH, int32 AW, + int32 CN, int32 CK, int32 CP, int32 CQ, + int32 AC, int32 AR, int32 AS, + int32 lda_n, int32 lda_c, int32 lda_h, int32 lda_w, + int32 ldc_n, int32 ldc_k, int32 ldc_p, int32 ldc_q, + int32 pad_h, int32 pad_w, + int32 bound){ + int32 rxa[TM] = get_global_range[TM](0); + int32 rb0[TN] = get_global_range[TN](1); + int32 rka[TK] = 0 ... TK; + int32 rb1[TK] = 0 ... TK; + fp32 C[TM, TN] = 0; + int32 ranh[TM] = rxa / CQ; + int32 raw[TM] = rxa % CQ - pad_w; + int32 ran[TM] = ranh / CP; + int32 rah[TM] = ranh % CP - pad_h; + int32 ra0[TM] = ran*lda_n + rah*lda_h + raw*lda_w; + int32 racr[TK] = rka / AS; + int32 ras[TK] = rka % AS; + int32 rac[TK] = racr / AR; + int32 rar[TK] = racr % AR; + int32 ra1[TK] = rac*lda_c + rar*lda_h + ras*lda_w; + fp32* pa[TM, TK] = a + ra1[newaxis, :] + ra0[:, newaxis]; + fp32* pb[TN, TK] = b + rb1[newaxis, :]*CK + rb0[:, newaxis]; + __constant__ int32* pincd[TK] = delta + rka; + __constant__ int32* pd[TK] = delta + AR*AS + rka; + int32 d[TK] = *pd; + int32 incd[TK] = *pincd; + int32 maskh[TM] = pad_h + min(rah, 0) + max(rah + AR - AH, 0); + int32 maskw[TM] = pad_w + min(raw, 0) + max(raw + AS - AW, 0); + __constant__ int32* pm[TM] = masks + AR*AS + maskw*AR*AS + maskh*AR*AS*(2*pad_w + 1); + __constant__ int32* pincm[TM] = delta; + int32 incm[TM] = *pincm; + int32 checka0[TM] = *pm; + int32 checka1[TK] = 1 << rka; + int1 checka[TM, TK] = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; + fp32 a[TM, TK] = checka ? *pa : 0; + fp32 b[TN, TK] = *pb; + for(int32 k = K; k > 0; k = k - TK){ + C = dot(a, trans(b), C); + pb = pb + TK*CK; + pa = pa + d[newaxis, :]; + b = *pb; + pd = pd + incd; + pincd = pincd + incd; + d = *pd; + incd = *pincd; + pm = pm + incm; + pincm = pincm + incm; + incm = *pincm; + checka0 = *pm; + checka = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; + a = checka ? 
+  int32 rxc[TM] = get_global_range[TM](0);
+  int32 rc1[TN] = get_global_range[TN](1);
+  int32 rcn[TM] = rxc / (CP*CQ);
+  int32 rcpq[TM] = rxc % (CP*CQ);
+  int32 rc0[TM] = rcn * ldc_n + rcpq;
+  fp32* pc[TM, TN] = c + rc1[newaxis, :]*ldc_k + rc0[:, newaxis];
+  int1 checkc0[TM] = rxc < M;
+  int1 checkc1[TN] = rc1 < N;
+  int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];
+  @checkc *pc = C;
+})";
+
+
+
+int main() {
+  // initialize default compute device
+  auto context = triton::driver::backend::contexts::get_default();
+  // initialize just-in-time compiler
+  triton::jit jit(context);
+  // initialization
+  int32_t AN = 4, CK = 32;
+  int32_t AD = 1, AH = 24, AW = 240;
+  int32_t BC = 64, BT = 1, BR = 3, BS = 3;
+  int32_t pad_d = 0, pad_h = 1, pad_w = 1;
+  int32_t stride_d = 1, stride_h = 1, stride_w = 1;
+  int32_t upsample_d = 1, upsample_h = 1, upsample_w = 1;
+  int32_t CM = (AD*upsample_d - BT + 1 + 2*pad_d + stride_d - 1)/stride_d;
+  int32_t CP = (AH*upsample_h - BR + 1 + 2*pad_h + stride_h - 1)/stride_h;
+  int32_t CQ = (AW*upsample_w - BS + 1 + 2*pad_w + stride_w - 1)/stride_w;
+  // equivalent matmul dimensions
+  int32_t M = AN*CM*CP*CQ;
+  int32_t N = CK;
+  int32_t K = BC*BT*BR*BS;
+  std::vector<float> hc(AN*CP*CQ*CK);
+  std::vector<float> rc(AN*CP*CQ*CK);
+  std::vector<float> ha(AN*BC*AH*AW);
+  std::vector<float> hb(BC*BR*BS*CK);
+  srand(0);
+  for(size_t i = 0; i < ha.size(); i++)
+    ha[i] = 1;
+  for(size_t i = 0; i < hb.size(); i++)
+    hb[i] = 1;
+  for(size_t i = 0; i < hc.size(); i++)
+    hc[i] = 0;
+  triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*4);
+  triton::driver::buffer* da = triton::driver::buffer::create(context, ha.size()*4);
+  triton::driver::buffer* db = triton::driver::buffer::create(context, hb.size()*4);
+  triton::driver::stream* stream = triton::driver::stream::create(context);
+  stream->write(da, true, 0, ha);
+  stream->write(db, true, 0, hb);
+  stream->write(dc, true, 0, hc);
+  stream->synchronize();
+  // memory strides for data
+  int32_t stride_i_w = 1;
+  int32_t stride_i_h = AW*stride_i_w;
+  int32_t stride_i_d = AH*stride_i_h;
+  int32_t stride_i_c = AD*stride_i_d;
+  int32_t stride_i_n = BC*stride_i_c;
+  // memory strides for filters
+  int32_t stride_f_k = 1;
+  int32_t stride_f_s = CK*stride_f_k;
+  int32_t stride_f_r = BS*stride_f_s;
+  int32_t stride_f_t = BR*stride_f_r;
+  int32_t stride_f_c = BT*stride_f_t;
+  // memory strides for output
+  int32_t stride_o_q = 1;
+  int32_t stride_o_p = CQ*stride_o_q;
+  int32_t stride_o_m = CP*stride_o_p;
+  int32_t stride_o_k = CM*stride_o_m;
+  int32_t stride_o_n = CK*stride_o_k;
+  // look-up table
+  int TK = 8;
+  int F = BT * BR * BS;
+  int nlut = (TK + F - 1) / F * F;
+  std::vector<int32_t> h_delta(nlut + upsample_d*upsample_h*upsample_w*nlut);
+  std::vector<int32_t> h_masks(nlut + (2*pad_h+1)*(2*pad_w+1)*(2*pad_d+1)*nlut);
+  build_conv_lut(TK, stride_i_d, stride_i_h, stride_i_w, stride_i_c, pad_d, pad_h, pad_w, BT, BR, BS, h_delta, h_masks);
+  // benchmark a given convolution kernel
+  auto benchmark = [&](triton::driver::kernel* kernel,
+                       triton::jit::launch_information info) {
+    // launch info
+    unsigned TM = info.global_range_size[0];
+    unsigned TN = info.global_range_size[1];
+    unsigned TK = jit.get_int("TK");
+    // initialize constant memory
+    triton::driver::buffer* delta = jit.get_buffer("delta");
+    triton::driver::buffer* masks = jit.get_buffer("masks");
+    stream->write(delta, false, 0, h_delta.size()*4, h_delta.data());
+    stream->write(masks, false, 0, h_masks.size()*4, h_masks.data());
+    stream->synchronize();
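+    // --- Editorial note (illustration only, not part of the original patch) ---
+    // For the default sizes above, the implicit-GEMM view works out to:
+    //   CM = 1, CP = 24, CQ = 240             (output spatial dims)
+    //   M  = AN*CM*CP*CQ = 4*1*24*240 = 23040 (one GEMM row per output pixel)
+    //   N  = CK = 32                          (one GEMM column per filter)
+    //   K  = BC*BT*BR*BS = 64*1*3*3 = 576     (reduction over C*T*R*S)
+    // so with, e.g., TM = TN = 64 the launch grid below is 360 x 1 blocks.
+    // --------------------------------------------------------------------------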
+    // launch info
+    unsigned nthreads = info.num_threads;
+    std::array<size_t, 3> grid = {(M + TM - 1)/TM, (N + TN - 1)/TN, 1};
+    // fast bounds-checking
+    unsigned lasti = (grid[0] - 1)*TM + TM - 1;
+    unsigned lastj = (grid[1] - 1)*TN + TN - 1;
+    unsigned lastk = TK - 1;
+    bool AT = false;
+    bool BT = true;
+    unsigned last_safe_a = (AT==false)?(M*K - 1 - lasti)/M - lastk : M*K - 1 - lasti*K - lastk;
+    unsigned last_safe_b = (BT==true)?(N*K - 1 - lastj)/N - lastk : N*K - 1 - lastj*K - lastk;
+    int32_t bound = std::max<int32_t>(1, std::max<int32_t>(K - last_safe_a, K - last_safe_b));
+    // set arguments
+    kernel->setArg(0, da);
+    kernel->setArg(1, db);
+    kernel->setArg(2, dc);
+    kernel->setArg(3, M);
+    kernel->setArg(4, N);
+    kernel->setArg(5, K);
+    kernel->setArg(6, AN);
+    kernel->setArg(7, AH);
+    kernel->setArg(8, AW);
+    kernel->setArg(9, AN);
+    kernel->setArg(10, CK);
+    kernel->setArg(11, CP);
+    kernel->setArg(12, CQ);
+    kernel->setArg(13, BC);
+    kernel->setArg(14, BR);
+    kernel->setArg(15, BS);
+    kernel->setArg(16, stride_i_n);
+    kernel->setArg(17, stride_i_c);
+    kernel->setArg(18, stride_i_h);
+    kernel->setArg(19, stride_i_w);
+    kernel->setArg(20, stride_o_n);
+    kernel->setArg(21, stride_o_k);
+    kernel->setArg(22, stride_o_p);
+    kernel->setArg(23, stride_o_q);
+    kernel->setArg(24, pad_h);
+    kernel->setArg(25, pad_w);
+    kernel->setArg(26, bound);
+    // dry run
+    stream->enqueue(kernel, grid, {nthreads, 1, 1});
+    stream->synchronize();
+    // benchmark
+    double ts = bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});},
+                      [&](){ stream->synchronize(); }, *context->device());
+    ts = ts * 1e-9;
+    double tflops = 2.*M*N*K / ts * 1e-12;
+    return tflops;
+  };
+  // run
+  std::vector<unsigned> params = {
+    16, 2, 64,
+    32, 2, 64,
+    16, 8, 2, 2,
+    8, 8,
+    4
+  };
+//  jit.autotune("conv", src, benchmark);
+  jit.add_module("conv", src, params);
+  triton::driver::kernel* kernel = jit.get_function("conv");
+  triton::jit::launch_information info = jit.get_launch_info("conv");
+  std::cout << "Performance: " << benchmark(kernel, info) << " TFLOPS " << std::endl;
+  stream->read(dc, true, 0, hc);
+  cpp_conv_nchw(BC, AN, CK, AD, AH, AW, BT, BR, BS, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, CM, CP, CQ, rc, ha, hb);
+  for(size_t i = 0; i < M*N; i++)
+    if(std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){
+      std::cout << i << " " << hc[i] << " " << rc[i] << std::endl;
+      exit(EXIT_FAILURE);
+    }
+  std::cout << "Pass!" << std::endl;
+}
diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp
new file mode 100644
index 000000000..7bda6c775
--- /dev/null
+++ b/examples/cpp/dot.cpp
@@ -0,0 +1,162 @@
+#include <iostream>
+#include <cmath>
+#include "common.hpp"
+#include "triton/jit.h"
+#include "triton/driver/backend.h"
+#include "triton/driver/stream.h"
+
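+// --- Editorial note (illustration only, not part of the original patch) ---
+// The kernel below optionally splits the reduction dimension K across GZ
+// groups (split-K). Each group rz gets a contiguous chunk whose size and
+// offset follow the usual balanced partition; e.g. for K = 10, GZ = 4:
+//   div = 10/4 = 2, rem = 10%4 = 2
+//   sizes   = {3, 3, 2, 2}    // rz < rem gets div+1, the others get div
+//   offsets = {0, 3, 6, 8}    // rz*(div+1), or rz*div + rem
+// Partial sums are then combined through the spin-lock + counter scheme at
+// the end of the kernel.
+// --------------------------------------------------------------------------
+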
+const char* src =
+R"(
+const tunable int32 TM = {16, 32, 64, 128};
+const tunable int32 TN = {16, 32, 64, 128};
+const tunable int32 TK = {8};
+const tunable int32 GZ = {1};
+
+void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C,
+           int32 M, int32 N, int32 K,
+           int32 lda, int32 ldb, int32 ldc,
+           int32 *locks, int32 grid0, int32 grid1) {
+  int32 rxa[TM] = get_global_range[TM](0);
+  int32 ryb[TN] = get_global_range[TN](1);
+  int32 rz = get_global_range[1](2);
+  int32 rka[TK] = 0 ... TK;
+  int32 rkb[TK] = 0 ... TK;
+  fp32 c[TM, TN] = 0;
+  int32 div = K / GZ;
+  int32 rem = K % GZ;
+  K = select(rz < rem, div + 1, div);
+  int32 offk = select(rz < rem, rz*(div + 1), rz*div + rem);
+  fp32* pa[TM, TK] = A + (offk + rka[newaxis, :])*lda + rxa[:, newaxis];
+  fp32* pb[TN, TK] = B + (offk + rkb[newaxis, :])*ldb + ryb[:, newaxis];
+  fp32 a[TM, TK] = *pa;
+  fp32 b[TN, TK] = *pb;
+  int32 last_a = ((M*K - 1) - (TM*TK + 1)) / lda;
+  int32 last_b = ((K*N - 1) - (TN*TK + 1)) / ldb;
+  last_a = last_a / TK * TK;
+  last_b = last_b / TK * TK;
+  int32 bound = K - max(last_a, last_b);
+  for(int32 k = K; k > bound; k = k - TK){
+    c = dot(a, trans(b), c);
+    pa = pa + TK*lda;
+    pb = pb + TK*ldb;
+    a = *pa;
+    b = *pb;
+  }
+  int32 rxc[TM] = get_global_range[TM](0);
+  int32 ryc[TN] = get_global_range[TN](1);
+  for(int32 k = bound; k > 0; k = k - 1){
+    int1 checka[TM, 1] = rxc[:, newaxis] < M;
+    int1 checkb[TN, 1] = ryc[:, newaxis] < N;
+    fp32* pa[TM, 1] = A + (offk + K - k)*lda + rxc[:, newaxis];
+    fp32* pb[TN, 1] = B + (offk + K - k)*ldb + ryc[:, newaxis];
+    fp32 a[TM, 1] = checka ? *pa : 0;
+    fp32 b[TN, 1] = checkb ? *pb : 0;
+    c = dot(a, trans(b), c);
+  }
+  int32 ridx = get_range_id(0);
+  int32 ridy = get_range_id(1);
+  fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis];
+  int32 *plock = locks + ridx + ridy*grid0;
+  for(int32 L = __atomic_cas(plock, 0, 1); L == 1; L = __atomic_cas(plock, 0, 1)){}
+  int32 *pcount = plock + grid0*grid1;
+  int32 count = *pcount;
+  int32 countp1 = select(count == GZ - 1, 0, count + 1);
+  int1 checkc0[TM] = rxc < M;
+  int1 checkc1[TN] = ryc < N;
+  int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];
+  if(count == 0) {
+    @checkc *pc = c;
+    *pcount = countp1;
+  }
+  else {
+    @checkc *pc = c + (checkc ? *pc : 0);
+    *pcount = countp1;
+  }
+  __atomic_cas(plock, 1, 0);
+}
+)";
+
+int main() {
+  // initialize default compute device
+  auto context = triton::driver::backend::contexts::get_default();
+  triton::jit jit(context);
+
+  // matrix multiplication parameters
+  int32_t M = 512, N = 512, K = 512;
+  std::vector<float> hc(M*N);
+  std::vector<float> rc(M*N);
+  std::vector<float> ha(M*K);
+  std::vector<float> hb(K*N);
+  std::vector<int32_t> hlocks(2048);
+  srand(0);
+  for(size_t i = 0; i < ha.size(); i++)
+    ha[i] = (float)rand()/RAND_MAX;
+  for(size_t i = 0; i < hb.size(); i++)
+    hb[i] = (float)rand()/RAND_MAX;
+  for(size_t i = 0; i < hc.size(); i++)
+    hc[i] = 0;
+  triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*4);
+  triton::driver::buffer* da = triton::driver::buffer::create(context, ha.size()*4);
+  triton::driver::buffer* db = triton::driver::buffer::create(context, hb.size()*4);
+  triton::driver::buffer* dlocks = triton::driver::buffer::create(context, hlocks.size()*4);
+  triton::driver::stream* stream = triton::driver::stream::create(context);
+  stream->write(da, true, 0, ha);
+  stream->write(db, true, 0, hb);
+  stream->write(dc, true, 0, hc);
+  stream->synchronize();
+
+
+  // benchmark a given matrix multiplication kernel
+  auto benchmark = [&](triton::driver::kernel* kernel,
+                       triton::jit::launch_information info) {
+    // launch info
+    unsigned TM = info.global_range_size[0];
+    unsigned TN = info.global_range_size[1];
+    unsigned nthreads = info.num_threads;
+    unsigned GZ = jit.get_int("GZ");
+    std::array<size_t, 3> grid = {(M + TM - 1)/TM, (N + TN - 1)/TN, GZ};
+    // init locks
+    stream->write(dlocks, true, 0, hlocks);
+    // set argument
+    kernel->setArg(0, da);
+    kernel->setArg(1, db);
+    kernel->setArg(2, dc);
+    kernel->setArg(3, M);
+    kernel->setArg(4, N);
+    kernel->setArg(5, K);
+    kernel->setArg(6, M);
+    kernel->setArg(7, N);
+    kernel->setArg(8, M);
+    kernel->setArg(9, dlocks);
+    kernel->setArg(10, grid[0]);
+    kernel->setArg(11, grid[1]);
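+    // --- Editorial note (illustration only, not part of the original patch) ---
+    // hlocks backs two arrays: the first grid0*grid1 words are spin-locks, the
+    // next grid0*grid1 words count how many of the GZ split-K groups have
+    // already visited a given (ridx, ridy) tile. A rough host-side sketch of
+    // the kernel's critical section (names hypothetical):
+    //
+    //   while(atomic_cas(plock, 0, 1) == 1) { /* spin */ }
+    //   if(*pcount == 0) *pc = c; else *pc += c;        // first writer stores
+    //   *pcount = (*pcount == GZ - 1) ? 0 : *pcount + 1;
+    //   atomic_cas(plock, 1, 0);                        // release
+    // --------------------------------------------------------------------------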
+    // dry run
+    stream->enqueue(kernel, grid, {nthreads, 1, 1});
+    stream->synchronize();
+    // benchmark
+    double ts = bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});},
+                      [&](){ stream->synchronize(); }, *context->device());
+    ts = ts * 1e-9;
+    double tflops = 2.*M*N*K / ts * 1e-12;
+    return tflops;
+  };
+
+
+  // just-in-time compile source-code
+  std::vector<unsigned> params = {
+    16, 2, 64, 16, 2, 64, 16, 8, 2, 2, 8, 8, 8, 1
+  };
+//  jit.autotune("matmul",src, benchmark);
+  jit.add_module("matmul", src, params);
+  triton::driver::kernel* kernel = jit.get_function("matmul");
+  triton::jit::launch_information info = jit.get_launch_info("matmul");
+  std::cout << "Performance: " << benchmark(kernel, info) << " TFLOPS " << std::endl;
+  stream->read(dc, true, 0, hc);
+  simple_gemm(rc, ha, hb, M, N, K);
+  for(size_t i = 0; i < M*N; i++)
+    if(std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){
+      std::cout << i << " " << hc[i] << " " << rc[i] << std::endl;
+      exit(EXIT_FAILURE);
+    }
+  std::cout << "Pass!" << std::endl;
+}
diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp
new file mode 100644
index 000000000..f75046e2f
--- /dev/null
+++ b/examples/cpp/shift.cpp
@@ -0,0 +1,212 @@
+#include <iostream>
+#include <cmath>
+#include "common.hpp"
+#include "triton/jit.h"
+#include "triton/driver/backend.h"
+#include "triton/driver/stream.h"
+
+// K = channels
+// M = batch * height * width
+// N = number of feature maps
+
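+// --- Editorial note (illustration only, not part of the original patch) ---
+// Shift-convolution reduces to a single GEMM because each input channel c is
+// read at a fixed spatial offset (shift_h[c], shift_w[c]); the offset is
+// folded into one per-channel pointer increment, precomputed by
+// shift_deltas() below. For the strides used in main() (BS = 4, W = H = 32,
+// so stride_w = 4, stride_h = 128, stride_c = 4096), a hypothetical channel
+// c = 5 with shift_h[5] = 1 and shift_w[5] = -1 yields
+//   delta[5] = 5*4096 + 1*128 + (-1)*4 = 20604.
+// --------------------------------------------------------------------------
+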
+const char* src =
+R"(
+const tunable int32 TM = {16, 32, 64, 128};
+const tunable int32 TN = {16, 32, 64, 128};
+const tunable int32 TK = {8};
+
+__constant__ int32* delta = alloc_const int32[256];
+__constant__ int32* masks = alloc_const int32[8192];
+
+void shift(restrict read_only fp32 *a, restrict read_only fp32 *b, fp32 *c,
+           int32 M, int32 N, int32 K,
+           int32 ABS, int32 AH, int32 AW, int32 AR, int32 AS){
+  int32 rxa[TM] = get_global_range[TM](0);
+  int32 ryb[TN] = get_global_range[TN](1);
+  int32 rka[TK] = 0 ... TK;
+  int32 rkb[TK] = 0 ... TK;
+  fp32 C[TM, TN] = 0;
+  fp32* pxa[TM, TK] = a + rxa[:, newaxis];
+  fp32* pb[TN, TK] = b + rkb[newaxis, :]*N + ryb[:, newaxis];
+  __constant__ int32* pd[TK] = delta + rka;
+  int32 pad_h = AR/2;
+  int32 pad_w = AS/2;
+  int32 rawhc[TM] = rxa / ABS;
+  int32 raw[TM] = rawhc % AW - pad_w;
+  int32 rahc[TM] = rawhc / AW;
+  int32 rah[TM] = rahc % AH - pad_h;
+  int32 maskh[TM] = pad_h + min(rah, 0) + max(rah + AR - AH, 0);
+  int32 maskw[TM] = pad_w + min(raw, 0) + max(raw + AS - AW, 0);
+  __constant__ int32* pxm[TM] = masks + maskh*K + maskw*K*(2*pad_h + 1);
+  __constant__ int32* pm[TM, TK] = pxm[:, newaxis] + rka[newaxis, :];
+  for(int32 k = K; k > 0; k = k - TK){
+    int32 delta[TK] = *pd;
+    fp32 *pa[TM, TK] = pxa + delta[newaxis, :];
+    int1 m[TM, TK] = *pm > 0;
+    fp32 a[TM, TK] = m ? *pa : 0;
+    fp32 b[TN, TK] = *pb;
+    C = dot(a, trans(b), C);
+    pb = pb + TK*N;
+    pd = pd + TK;
+    pm = pm + TK;
+  }
+  int32 rxc[TM] = get_global_range[TM](0);
+  int32 ryc[TN] = get_global_range[TN](1);
+  fp32* pc[TM, TN] = c + ryc[newaxis, :]*M + rxc[:, newaxis];
+  int1 checkc0[TM] = rxc < M;
+  int1 checkc1[TN] = ryc < N;
+  int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];
+  @checkc *pc = C;
+}
+)";
+
+std::vector<int32_t> shift_deltas(// strides
+                                  int32_t stride_w, int32_t stride_h, int32_t stride_c,
+                                  // shift
+                                  int32_t C,
+                                  const std::vector<int32_t>& shift_h,
+                                  const std::vector<int32_t>& shift_w) {
+  std::vector<int32_t> res(C);
+  for(unsigned c = 0; c < C; c++){
+    res[c] = c*stride_c;
+    res[c] += shift_h[c]*stride_h;
+    res[c] += shift_w[c]*stride_w;
+  }
+  return res;
+}
+
+std::vector<int32_t> shift_masks(int32_t C,
+                                 const std::vector<int32_t>& shift_h,
+                                 const std::vector<int32_t>& shift_w,
+                                 int32_t R, int32_t S) {
+  size_t S0 = C;
+  size_t S1 = R;
+  size_t S2 = S;
+  std::vector<int32_t> res(S0*S1*S2);
+  for(size_t ph = 0; ph < S1; ++ph)
+  for(size_t pw = 0; pw < S2; ++pw){
+    int32_t* ptr = &res[ph*S0 + pw*S0*S1];
+    for(size_t i = 0; i < S0; ++i){
+      bool in_bounds_h = shift_h[i] + (int32_t)ph >= 0 && shift_h[i] + (int32_t)ph < R;
+      bool in_bounds_w = shift_w[i] + (int32_t)pw >= 0 && shift_w[i] + (int32_t)pw < S;
+      ptr[i] = in_bounds_h && in_bounds_w;
+    }
+  }
+  return res;
+}
+
+int main() {
+  // initialize default compute device
+  auto context = triton::driver::backend::contexts::get_default();
+  // initialize just-in-time compiler
+  triton::jit jit(context);
+  // initialization
+  int32_t R = 3, S = 3;
+  int32_t BS = 4, F = 128;
+  int32_t H = 32, W = 32;
+  int32_t C = 128;
+  // equivalent matmul dimensions
+  int32_t M = BS*H*W;
+  int32_t N = F;
+  int32_t K = C;
+  std::cout << M << " " << N << " " << K << std::endl;
+  std::vector<float> hc(BS*H*W*F);
+  std::vector<float> rc(BS*H*W*F);
+  std::vector<float> ha(BS*C*H*W);
+  std::vector<float> hb(F*C);
+  // strides
+  int32_t stride_i_bs = 1;
+  int32_t stride_i_w = BS*stride_i_bs;
+  int32_t stride_i_h = W*stride_i_w;
+  int32_t stride_i_c = H*stride_i_h;
+  // random shifts
+  std::vector<int32_t> shift_h(C);
+  std::vector<int32_t> shift_w(C);
+  for(int32_t c = 0; c < C; c++){
+    shift_h[c] = rand() % R - R/2;
+    shift_w[c] = rand() % S - S/2;
+  }
+  // initialize buffers
+  srand(0);
+  for(int c = 0 ; c < C; c++)
+  for(int h = 0 ; h < H; h++)
+  for(int w = 0 ; w < W; w++)
+  for(int bs = 0 ; bs < BS; bs++){
+    float value = (float)rand() / RAND_MAX;
+    size_t idx = bs + w*stride_i_w + h*stride_i_h + c*stride_i_c;
+    ha[idx] = value;
+  }
+  for(size_t i = 0; i < hb.size(); i++)
+    hb[i] = (float)rand() / RAND_MAX;
+  for(size_t i = 0; i < hc.size(); i++)
+    hc[i] = 0;
+  triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*4);
+  triton::driver::buffer* da = triton::driver::buffer::create(context, ha.size()*4);
+  triton::driver::buffer* db = triton::driver::buffer::create(context, hb.size()*4);
+  triton::driver::stream* stream = triton::driver::stream::create(context);
+  stream->write(da, true, 0, ha);
+  stream->write(db, true, 0, hb);
+  stream->write(dc, true, 0, hc);
+  stream->synchronize();
+  std::vector<int32_t> h_delta = shift_deltas(stride_i_w, stride_i_h, stride_i_c, C, shift_h, shift_w);
+  std::vector<int32_t> h_masks = shift_masks(C, shift_h, shift_w, R, S);
+  // benchmark a given matrix multiplication kernel
+  auto benchmark = [&](triton::driver::kernel* kernel,
+                       triton::jit::launch_information info) {
+    // launch info
+    unsigned TM = info.global_range_size[0];
+    unsigned TN = info.global_range_size[1];
+    unsigned nthreads = info.num_threads;
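+    // --- Editorial note (illustration only, not part of the original patch) ---
+    // The kernel picks one C-sized mask plane per output pixel: for interior
+    // pixels maskh = pad_h and maskw = pad_w (both 1 here since R = S = 3), so
+    // the plane at masks[1*K + 1*K*3] is read and every channel whose shifted
+    // access stays in bounds contributes; border pixels select planes in which
+    // out-of-range channels were zeroed by shift_masks().
+    // --------------------------------------------------------------------------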
+    // initialize constant memory
+    triton::driver::buffer* delta = jit.get_buffer("delta");
+    triton::driver::buffer* masks = jit.get_buffer("masks");
+    stream->write(delta, false, 0, h_delta.size()*4, h_delta.data());
+    stream->write(masks, false, 0, h_masks.size()*4, h_masks.data());
+    stream->synchronize();
+    // set argument
+    kernel->setArg(0, da);
+    kernel->setArg(1, db);
+    kernel->setArg(2, dc);
+    kernel->setArg(3, M);
+    kernel->setArg(4, N);
+    kernel->setArg(5, K);
+    kernel->setArg(6, BS);
+    kernel->setArg(7, H);
+    kernel->setArg(8, W);
+    kernel->setArg(9, R);
+    kernel->setArg(10, S);
+    // dry run
+    std::array<size_t, 3> grid = {(M + TM - 1)/TM, (N + TN - 1)/TN, 1};
+    stream->enqueue(kernel, grid, {nthreads, 1, 1});
+    stream->synchronize();
+    // benchmark
+    double ts = bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});},
+                      [&](){ stream->synchronize(); }, *context->device());
+    ts = ts * 1e-9;
+    double tflops = 2.*M*N*K / ts * 1e-12;
+    return tflops;
+  };
+
+  // shift
+  std::vector<unsigned> params = {
+    16, 2, 64,
+    32, 2, 64,
+    16, 8, 2, 2,
+    8, 8,
+    4
+  };
+//  jit.autotune("shift", src, benchmark);
+  jit.add_module("shift", src, params);
+  triton::driver::kernel* kernel = jit.get_function("shift");
+  triton::jit::launch_information info = jit.get_launch_info("shift");
+  std::cout << "Performance: " << benchmark(kernel, info) << " TFLOPS " << std::endl;
+  stream->read(dc, true, 0, hc);
+  shift_conv(C, H, W, BS, F, rc, ha, hb, shift_h, shift_w);
+  for(size_t i = 0; i < M*N; i++)
+    if(std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){
+      std::cout << i << " " << hc[i] << " " << rc[i] << std::endl;
+      exit(EXIT_FAILURE);
+    }
+  std::cout << "Pass!" << std::endl;
+
+}
diff --git a/examples/cpp/shift.ptx b/examples/cpp/shift.ptx
new file mode 100644
index 000000000..62a841909
--- /dev/null
+++ b/examples/cpp/shift.ptx
@@ -0,0 +1,93 @@
+//
+// Generated by NVIDIA NVVM Compiler
+//
+// Compiler Build ID: CL-24817639
+// Cuda compilation tools, release 10.0, V10.0.130
+// Based on LLVM 3.4svn
+//
+
+.version 6.3
+.target sm_60
+.address_size 64
+
+	// .globl	_Z25shift_cuda_forward_kernelPKfPKiPfiiii
+
+.visible .entry shift(
+	.param .u64 _Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_0,
+	.param .u64 _Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_1,
+	.param .u64 _Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_2,
+	.param .u32 _Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_3,
+	.param .u32 _Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_4,
+	.param .u32 _Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_5,
+	.param .u32 _Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_6
+)
+{
+	.reg .pred 	%p<10>;
+	.reg .f32 	%f<2>;
+	.reg .b32 	%r<31>;
+	.reg .b64 	%rd<13>;
+
+
+	ld.param.u64 	%rd1, [_Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_0];
+	ld.param.u64 	%rd3, [_Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_1];
+	ld.param.u64 	%rd2, [_Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_2];
+	ld.param.u32 	%r3, [_Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_3];
+	ld.param.u32 	%r4, [_Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_4];
+	ld.param.u32 	%r5, [_Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_5];
+	ld.param.u32 	%r6, [_Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_6];
+	cvta.to.global.u64 	%rd4, %rd3;
+	mov.u32 	%r7, %ntid.x;
+	mov.u32 	%r8, %ctaid.x;
+	mov.u32 	%r9, %tid.x;
+	mad.lo.s32 	%r1, %r7, %r8, %r9;
+	mul.lo.s32 	%r10, %r4, %r3;
+	mul.lo.s32 	%r11, %r10, %r5;
+	mul.lo.s32 	%r12, %r11, %r6;
+	mul.lo.s32 	%r13, %r5, %r4;
+	mul.lo.s32 	%r14, %r13, %r6;
+	rem.s32 	%r15, %r1, %r14;
+	sub.s32 	%r16, %r1, %r15;
+	mul.lo.s32 	%r17, %r6, 
%r5; + div.s32 %r18, %r15, %r17; + mul.lo.s32 %r19, %r18, %r17; + sub.s32 %r20, %r15, %r19; + div.s32 %r21, %r20, %r5; + mul.lo.s32 %r22, %r21, %r6; + sub.s32 %r23, %r20, %r22; + shl.b32 %r24, %r18, 1; + mul.wide.s32 %rd5, %r24, 4; + add.s64 %rd6, %rd4, %rd5; + ld.global.nc.u32 %r25, [%rd6]; + add.s32 %r26, %r25, %r21; + ld.global.nc.u32 %r27, [%rd6+4]; + add.s32 %r28, %r23, %r27; + add.s32 %r29, %r16, %r19; + mad.lo.s32 %r30, %r26, %r5, %r29; + add.s32 %r2, %r30, %r28; + setp.lt.s32 %p1, %r1, %r12; + setp.gt.s32 %p2, %r26, -1; + and.pred %p3, %p1, %p2; + setp.lt.s32 %p4, %r26, %r5; + and.pred %p5, %p3, %p4; + setp.gt.s32 %p6, %r28, -1; + and.pred %p7, %p5, %p6; + setp.lt.s32 %p8, %r28, %r6; + and.pred %p9, %p7, %p8; + @!%p9 bra BB0_2; + bra.uni BB0_1; + +BB0_1: + cvta.to.global.u64 %rd7, %rd1; + mul.wide.s32 %rd8, %r1, 4; + add.s64 %rd9, %rd7, %rd8; + ld.global.nc.f32 %f1, [%rd9]; + cvta.to.global.u64 %rd10, %rd2; + mul.wide.s32 %rd11, %r2, 4; + add.s64 %rd12, %rd10, %rd11; + st.global.f32 [%rd12], %f1; + +BB0_2: + ret; +} + + diff --git a/include/triton/ast/ast.h b/include/triton/ast/ast.h index b286c5a79..8eccd6f92 100644 --- a/include/triton/ast/ast.h +++ b/include/triton/ast/ast.h @@ -74,8 +74,8 @@ class constant; class node { protected: static ir::value* explicit_cast(ir::builder &builder, ir::value *src, ir::type *dst_ty); + static void implicit_broadcast(ir::module *mod, ir::type *dst_ty, ir::value *&src); static void implicit_broadcast(ir::module *mod, ir::value *&lhs, ir::value *&rhs); - static void implicit_broadcast(ir::module *mod, ir::value *&arg, ir::type *ty); static void implicit_cast(ir::builder &builder, ir::value *&lhs, ir::value *&rhs, bool &is_float, bool &is_ptr, bool &is_int, bool &is_signed); public: @@ -164,6 +164,27 @@ private: const constant* axis_; }; +class get_range_id: public builtin_expression{ +public: + get_range_id(node *axis): axis_((constant*)axis) { } + ir::value* codegen(ir::module *) const; + +private: + const constant* axis_; +}; + +class atomic_cas: public builtin_expression{ +public: + atomic_cas(node *ptr, node *cmp, node *val): ptr_(ptr), cmp_(cmp), val_(val) { } + ir::value* codegen(ir::module *) const; + +private: + const node *ptr_; + const node *cmp_; + const node *val_; +}; + + class matmul_expression: public builtin_expression{ public: matmul_expression(node* A, node *B, node *C): @@ -176,6 +197,49 @@ private: const expression *C_; }; +class max_expression: public builtin_expression{ +public: + max_expression(node* x, node* y) + : x_((expression*)x), y_((expression*)y){ } + ir::value* codegen(ir::module *) const; + +private: + const expression *x_; + const expression *y_; +}; + +class min_expression: public builtin_expression{ +public: + min_expression(node* x, node* y) + : x_((expression*)x), y_((expression*)y){ } + ir::value* codegen(ir::module *mod) const; + +private: + const expression *x_; + const expression *y_; +}; + +class select_expression: public builtin_expression{ +public: + select_expression(node* pred, node* if_value, node* else_value) + : pred_((expression*)pred), if_value_((expression*)if_value), else_value_((expression*)else_value) { } + ir::value* codegen(ir::module *mod) const; + +private: + const expression *pred_; + const expression *if_value_; + const expression *else_value_; +}; + +class trans_expression: public builtin_expression{ +public: + trans_expression(node *arg): arg_(arg) {} + ir::value* codegen(ir::module *mod) const; + +private: + node* arg_; +}; + class indexing_expression: public postfix_expression{ 
public: @@ -189,6 +253,8 @@ private: const list* slices_; }; + + class named_expression: public expression { public: named_expression(node *id): id_((const identifier*)id) { lvalue_ = this; } diff --git a/include/triton/ast/parser.y b/include/triton/ast/parser.y index ae4b7d4e3..5302c7d14 100644 --- a/include/triton/ast/parser.y +++ b/include/triton/ast/parser.y @@ -55,7 +55,7 @@ STORAGE_SPEC_T get_storage_spec(node *op) { return ((token*)op)->storage_spec;} %token VOID UINT1 UINT8 UINT16 UINT32 UINT64 INT1 INT8 INT16 INT32 INT64 FP32 FP64 %token IF ELSE FOR CONTINUE %token NEWAXIS ELLIPSIS AT -%token GET_GLOBAL_RANGE DOT ALLOC_CONST +%token GET_GLOBAL_RANGE GET_RANGE_ID DOT TRANS MAX MIN SELECT ATOMIC_CAS ALLOC_CONST %start translation_unit %% @@ -118,8 +118,15 @@ identifier builtin : GET_GLOBAL_RANGE '[' primary_expression ']' '(' constant ')' { $$ = new get_global_range($3, $6); } + | GET_RANGE_ID '(' constant ')' { $$ = new get_range_id($3); } | DOT '(' expression ',' expression ',' expression ')' { $$ = new matmul_expression($3, $5, $7); } | ALLOC_CONST type_specifier '[' constant ']' { $$ = new alloc_const(new typed_declaration_specifier(get_type_spec($2)), $4); } + | TRANS '(' expression ')' { $$ = new trans_expression($3); } + | MAX '(' expression ',' expression ')' { $$ = new max_expression($3, $5); } + | MIN '(' expression ',' expression ')' { $$ = new min_expression($3, $5); } + | SELECT '(' expression ',' expression ',' expression ')' { $$ = new select_expression($3, $5, $7); } + | ATOMIC_CAS '(' expression ',' expression ',' expression ')' { $$ = new atomic_cas($3, $5, $7); } + ; primary_expression : identifier { $$ = new named_expression($1); } diff --git a/include/triton/ast/scanner.l b/include/triton/ast/scanner.l index 91b700655..e4e018a14 100644 --- a/include/triton/ast/scanner.l +++ b/include/triton/ast/scanner.l @@ -41,7 +41,13 @@ using triton::ast::return_void; "fp64" { return return_impl(FP64, yytext); } "..." { return return_impl(ELLIPSIS, yytext); } "get_global_range" { return return_impl(GET_GLOBAL_RANGE, yytext); } +"get_range_id" { return return_impl(GET_RANGE_ID, yytext); } +"__atomic_cas" { return return_impl(ATOMIC_CAS, yytext); } "dot" { return return_impl(DOT, yytext); } +"max" { return return_impl(MAX, yytext); } +"min" { return return_impl(MIN, yytext); } +"select" { return return_impl(SELECT, yytext); } +"trans" { return return_impl(TRANS, yytext); } "continue" { return return_impl(CONTINUE, yytext); } "alloc_const" { return return_impl(ALLOC_CONST, yytext); } {L}({L}|{D})* { return return_impl(IDENTIFIER, yytext); } @@ -52,8 +58,6 @@ using triton::ast::return_void; L?'(\\.|[^\\'])+' { return return_impl(CONSTANT, yytext); } {D}+{E}{FS}? { return return_impl(CONSTANT, yytext); } -{D}*"."{D}+({E})?{FS}? { return return_impl(CONSTANT, yytext); } -{D}+"."{D}*({E})?{FS}? 
{ return return_impl(CONSTANT, yytext); } L?\"(\\.|[^\\"])*\" { return return_impl(STRING_LITERAL, yytext); } diff --git a/include/triton/codegen/layout.h b/include/triton/codegen/layout.h deleted file mode 100644 index a18f6439f..000000000 --- a/include/triton/codegen/layout.h +++ /dev/null @@ -1,45 +0,0 @@ -#ifndef TDL_INCLUDE_IR_CODEGEN_LAYOUT_H -#define TDL_INCLUDE_IR_CODEGEN_LAYOUT_H - -#include -#include - -namespace triton { - -namespace ir { - class module; - class instruction; - class value; -} - -namespace codegen{ - -struct shared_view_info{ - ir::value *usr; - bool has_dedicated_storage; -}; - -class layout { -private: - typedef std::vector shared_view_val_t; - - void add_phi_nodes(ir::value *v); - void add_shared_views(ir::value *v); - -public: - // accessors - unsigned get_num_shared_views(ir::value *v); - shared_view_info get_shared_view(ir::value *v, unsigned idx); - - // run - void run(ir::module &mod); - -private: - std::map shared_views_; -}; - - -} -} - -#endif diff --git a/include/triton/codegen/optimize_cse.h b/include/triton/codegen/optimize_cse.h new file mode 100644 index 000000000..d718f318e --- /dev/null +++ b/include/triton/codegen/optimize_cse.h @@ -0,0 +1,27 @@ +#ifndef TDL_INCLUDE_CODEGEN_OPTIMIZE_CSE_H +#define TDL_INCLUDE_CODEGEN_OPTIMIZE_CSE_H + +#include +#include +#include + +namespace triton { + +namespace ir { + class module; +} + +namespace codegen{ +class tune; + +class optimize_cse { +public: + optimize_cse() {} + void run(ir::module &mod); +}; + + +} +} + +#endif diff --git a/include/triton/codegen/optimize_dot.h b/include/triton/codegen/optimize_dot.h new file mode 100644 index 000000000..76d8368dc --- /dev/null +++ b/include/triton/codegen/optimize_dot.h @@ -0,0 +1,31 @@ +#ifndef TDL_INCLUDE_CODEGEN_OPTIMIZE_DOT_H +#define TDL_INCLUDE_CODEGEN_OPTIMIZE_DOT_H + +#include +#include +#include + +namespace triton { + +namespace ir { + class module; +} + +namespace codegen{ + +class tune; + +class optimize_dot { +public: + optimize_dot(tune* params): params_(params) {} + void run(ir::module &mod); + +private: + tune* params_; +}; + + +} +} + +#endif diff --git a/include/triton/codegen/optimize_trans.h b/include/triton/codegen/optimize_trans.h new file mode 100644 index 000000000..beaace2a5 --- /dev/null +++ b/include/triton/codegen/optimize_trans.h @@ -0,0 +1,33 @@ +#ifndef TDL_INCLUDE_CODEGEN_OPTIMIZE_TRANS_H +#define TDL_INCLUDE_CODEGEN_OPTIMIZE_TRANS_H + +#include +#include +#include + +namespace triton { + +namespace ir { + class module; + class value; + class instruction; + class trans_inst; + class builder; +} + +namespace codegen{ + +class optimize_trans { +private: + ir::value *replace_phi(ir::value* value, std::vector& to_delete, ir::builder &builder); + +public: + optimize_trans() {} + void run(ir::module &mod); +}; + + +} +} + +#endif diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index 11acf28e7..d9ce08c53 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -7,7 +7,7 @@ #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/type.h" -#include "triton/codegen/buffer_info.h" +#include "triton/codegen/shmem_info.h" namespace llvm{ @@ -21,9 +21,9 @@ namespace llvm{ namespace triton{ namespace codegen{ -class allocation; +class shmem_allocation; class tune; -class buffer_info_pass; +class shmem_info; class target; typedef std::vector indices_t; @@ -129,7 +129,7 @@ private: void lower_tile_instruction(ir::instruction *src, llvm::IRBuilder<> 
&builder); public: - selection(allocation *alloc, tune *params, buffer_info_pass *buffer_info, target *tgt) + selection(shmem_allocation *alloc, tune *params, shmem_info *buffer_info, target *tgt) : alloc_(alloc), params_(params), buffer_info_(buffer_info), tgt_(tgt){ } void run(ir::module &src, llvm::Module &dst); @@ -139,11 +139,12 @@ private: tmap_t tmap_; pmap_t pmap_; pmap_t last_block_; - allocation *alloc_; + shmem_allocation *alloc_; tune *params_; target *tgt_; - buffer_info_pass *buffer_info_; + shmem_info *buffer_info_; std::map axes_; + llvm::Value *sh_mem_ptr_; }; } diff --git a/include/triton/codegen/shared_copy.h b/include/triton/codegen/shared_copy.h deleted file mode 100644 index 3a3d7363b..000000000 --- a/include/triton/codegen/shared_copy.h +++ /dev/null @@ -1,41 +0,0 @@ -#ifndef TDL_INCLUDE_CODEGEN_SHARED_COPY_H -#define TDL_INCLUDE_CODEGEN_SHARED_COPY_H - -#include -#include - -namespace triton { - -namespace ir { - class module; - class value; - class builder; - class basic_block; -} - -namespace codegen{ - -class buffer_info_pass; - -class place_shared_copy { -private: - typedef std::pair interval_t; - typedef std::vector interval_vec_t; - -private: - bool intersect(const interval_vec_t &I, interval_t i); - void add_copy(ir::value *x, ir::builder &builder); - -public: - place_shared_copy(buffer_info_pass *info): info_(info) { } - void run(ir::module &mod); - -private: - buffer_info_pass *info_; -}; - - -} -} - -#endif diff --git a/include/triton/codegen/allocation.h b/include/triton/codegen/shmem_allocation.h similarity index 79% rename from include/triton/codegen/allocation.h rename to include/triton/codegen/shmem_allocation.h index 1f2a7656c..27a96f285 100644 --- a/include/triton/codegen/allocation.h +++ b/include/triton/codegen/shmem_allocation.h @@ -16,12 +16,12 @@ namespace codegen{ class layout; class target_tuner; -class liveness; -class buffer_info_pass; +class shmem_liveness; +class shmem_info; -class allocation { +class shmem_allocation { public: - allocation(liveness *live, buffer_info_pass *buffer_info) + shmem_allocation(shmem_liveness *live, shmem_info *buffer_info) : liveness_(live), buffer_info_(buffer_info){ } // utilities @@ -39,8 +39,8 @@ private: std::map num_bytes_; size_t allocated_size_; // dependences - liveness *liveness_; - buffer_info_pass *buffer_info_; + shmem_liveness *liveness_; + shmem_info *buffer_info_; }; } diff --git a/include/triton/codegen/barriers.h b/include/triton/codegen/shmem_barriers.h similarity index 82% rename from include/triton/codegen/barriers.h rename to include/triton/codegen/shmem_barriers.h index 336ec255a..271b745cc 100644 --- a/include/triton/codegen/barriers.h +++ b/include/triton/codegen/shmem_barriers.h @@ -17,10 +17,10 @@ namespace ir { namespace codegen{ -class allocation; -class buffer_info_pass; +class shmem_allocation; +class shmem_info; -class barriers { +class shmem_barriers { private: typedef std::pair interval_t; typedef std::vector interval_vec_t; @@ -36,12 +36,12 @@ private: std::pair transfer(ir::basic_block *block, const interval_vec_t &written_to, const interval_vec_t &read_from, std::set &insert_loc); public: - barriers(allocation *alloc, buffer_info_pass *buffer_info): alloc_(alloc), buffer_info_(buffer_info) {} + shmem_barriers(shmem_allocation *alloc, shmem_info *buffer_info): alloc_(alloc), buffer_info_(buffer_info) {} void run(ir::module &mod); private: - allocation *alloc_; - buffer_info_pass *buffer_info_; + shmem_allocation *alloc_; + shmem_info *buffer_info_; }; diff --git 
a/include/triton/codegen/buffer_info.h b/include/triton/codegen/shmem_info.h similarity index 84% rename from include/triton/codegen/buffer_info.h rename to include/triton/codegen/shmem_info.h index 58f140d61..f8325d00b 100644 --- a/include/triton/codegen/buffer_info.h +++ b/include/triton/codegen/shmem_info.h @@ -10,18 +10,19 @@ namespace ir { class module; class value; class phi_node; + class instruction; } namespace codegen{ -class buffer_info_pass { +class shmem_info { public: void run(ir::module &mod); // queries bool is_double(ir::value *x); void add_shared(ir::value *v); bool is_shared(ir::value *x); - bool is_loop_latch(ir::phi_node *phi, ir::value *terminator); + bool is_loop_latch(ir::phi_node *phi, ir::instruction *terminator); ir::value *get_reference(ir::value *x); void replace(ir::value* before, ir::value *after); diff --git a/include/triton/codegen/liveness.h b/include/triton/codegen/shmem_liveness.h similarity index 90% rename from include/triton/codegen/liveness.h rename to include/triton/codegen/shmem_liveness.h index 010bb4e2a..69210d03f 100644 --- a/include/triton/codegen/liveness.h +++ b/include/triton/codegen/shmem_liveness.h @@ -15,7 +15,7 @@ namespace codegen{ typedef unsigned slot_index; -class buffer_info_pass; +class shmem_info; struct segment { slot_index start; @@ -30,7 +30,7 @@ struct segment { } }; -class liveness { +class shmem_liveness { private: typedef std::map indices_map_t; typedef std::map intervals_map_t; @@ -43,7 +43,7 @@ public: public: // constructor - liveness(buffer_info_pass *info): info_(info){ } + shmem_liveness(shmem_info *info): info_(info){ } // accessors const intervals_map_t& intervals() const { return intervals_; } @@ -53,7 +53,7 @@ public: void run(ir::module &mod); private: - buffer_info_pass *info_; + shmem_info *info_; has_storage_map_t has_dedicated_storage_; indices_map_t indices_; intervals_map_t intervals_; diff --git a/include/triton/codegen/target.h b/include/triton/codegen/target.h index e2dc4518a..9079fc869 100644 --- a/include/triton/codegen/target.h +++ b/include/triton/codegen/target.h @@ -24,6 +24,7 @@ public: virtual llvm::Instruction* add_barrier(llvm::Module *module, llvm::IRBuilder<>& builder) = 0; virtual llvm::Value* get_global_offset(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned stride, unsigned ax) = 0; virtual llvm::Value* get_local_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax) = 0; + virtual llvm::Value* get_block_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax) = 0; bool is_gpu() const; private: @@ -37,6 +38,7 @@ public: llvm::Instruction* add_barrier(llvm::Module *module, llvm::IRBuilder<>& builder); llvm::Value* get_global_offset(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned stride, unsigned ax); llvm::Value* get_local_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax); + llvm::Value* get_block_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax); }; class nvidia_cu_target: public target { @@ -46,6 +48,7 @@ public: llvm::Instruction* add_barrier(llvm::Module *module, llvm::IRBuilder<>& builder); llvm::Value* get_global_offset(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned stride, unsigned ax); llvm::Value* get_local_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax); + llvm::Value* get_block_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax); }; class cpu_target: public target { @@ -55,6 +58,7 @@ public: llvm::Instruction* add_barrier(llvm::Module *module, llvm::IRBuilder<>& 
builder); llvm::Value* get_global_offset(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned stride, unsigned ax); llvm::Value* get_local_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax); + llvm::Value* get_block_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax); }; } diff --git a/include/triton/driver/module.h b/include/triton/driver/module.h index 89ff3c39d..7fe2d6f06 100755 --- a/include/triton/driver/module.h +++ b/include/triton/driver/module.h @@ -90,7 +90,7 @@ class cu_module: public module { public: cu_module(driver::context* context, llvm::Module *module); cu_module(driver::context* context, const std::string& source); - cu_buffer symbol(const char * name) const; + cu_buffer* symbol(const char * name) const; private: std::string source_; diff --git a/include/triton/ir/builder.h b/include/triton/ir/builder.h index 852f55aa9..51dd656d3 100644 --- a/include/triton/ir/builder.h +++ b/include/triton/ir/builder.h @@ -67,6 +67,7 @@ public: value* create_fp_ext(value *src, type *dst_ty, const std::string &name = ""); value* create_fp_trunc(value *src, type *dst_ty, const std::string &name = ""); value* create_int_cast(value *src, type *dst_ty, bool is_signed, const std::string &name = ""); + value *create_downcast(value *arg, const std::string &name = ""); // Phi instruction phi_node* create_phi(type *ty, unsigned num_reserved, const std::string &name = ""); // Binary instructions @@ -124,7 +125,11 @@ public: value *create_broadcast(value *arg, const type::tile_shapes_t &shapes, const std::string &name = ""); // Built-in instruction value *create_get_global_range(unsigned axis, type::tile_shapes_t::value_type size, const std::string &name = ""); - value *create_matmul(value *A, value *B, value *C, const std::string &name = ""); + value *create_get_range_id(unsigned axis, const std::string &name = ""); + value *create_atomic_cas(value *ptr, value *cmp, value *val, const std::string &name = ""); + value *create_dot(value *A, value *B, value *C, const std::string &name = ""); + value *create_trans(value *A, const std::string &name = ""); + value *create_select(value *pred, value *if_value, value *else_value, const std::string &name = ""); // Intrinsics value *create_copy_to_shared(value *arg, const std::string &name = ""); value *create_vectorize(value *arg, const std::string &name = ""); diff --git a/include/triton/ir/constant.h b/include/triton/ir/constant.h index 0c18787ea..43aa41c6d 100644 --- a/include/triton/ir/constant.h +++ b/include/triton/ir/constant.h @@ -54,6 +54,7 @@ public: void set_value(uint64_t value) { has_value_ = true; value_ = value; } bool has_value() { return has_value_; } const std::vector& get_space() { return space_; } + void set_space(const std::vector &space) { space_ = space; } private: std::vector space_; diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index 2d8e7d91d..961bb43ce 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -464,6 +464,17 @@ public: }; +// downcast + +class downcast_inst: public unary_inst { +private: + using unary_inst::unary_inst; + std::string repr_impl() const { return "downcast"; } + +public: + static instruction* create(value *arg, const std::string &name = "", instruction *next = nullptr); +}; + //===----------------------------------------------------------------------===// // builtin_inst classes //===----------------------------------------------------------------------===// @@ -488,17 +499,76 @@ private: unsigned axis_; 
}; -class matmul_inst: public builtin_inst { +class get_range_id_inst: public builtin_inst { private: - matmul_inst(value *A, value *B, value *C, const std::string &name, instruction *next); - std::string repr_impl() const { return "dot"; } + get_range_id_inst(type *ty, unsigned axis, const std::string &name, instruction *next); + std::string repr_impl() const { return "get_range_id(" + std::to_string(axis_) + ")"; } public: - static instruction* create(value *A, value *B, value *C, - const std::string &name = "", - instruction *next = nullptr); + static instruction* create(context &ctx, unsigned axis, const std::string &name = "", instruction *next = nullptr); + unsigned get_axis() const { return axis_; } + +private: + unsigned axis_; }; +class atomic_cas_inst: public builtin_inst { +private: + atomic_cas_inst(value *ptr, value *cmp, value *val, const std::string &name, instruction *next); + std::string repr_impl() const { return "atomic_cas"; } + +public: + static instruction* create(value *ptr, value *cmp, value *val, const std::string &name = "", instruction *next = nullptr); +}; + +class dot_inst: public builtin_inst { +public: + enum TransT { NoTrans, Trans }; + +private: + dot_inst(value *A, value *B, value *C, TransT AT, TransT BT, const std::string &name, instruction *next); + std::string repr_impl() const { return std::string("dot.") + ((AT_==NoTrans)?"n":"t") + ((BT_==NoTrans)?"n":"t"); } + +public: + static instruction* create_nn(value *A, value *B, value *C, const std::string &name = "", instruction *next = nullptr); + static instruction* create_nt(value *A, value *B, value *C, const std::string &name = "", instruction *next = nullptr); + static instruction* create_tn(value *A, value *B, value *C, const std::string &name = "", instruction *next = nullptr); + static instruction* create_tt(value *A, value *B, value *C, const std::string &name = "", instruction *next = nullptr); + bool is_a_trans() { return AT_ == Trans; } + bool is_b_trans() { return BT_ == Trans; } + +private: + TransT AT_; + TransT BT_; +}; + +//class outer_inst: public builtin_inst { +//private: +// outer_inst(value *A, value *B, value *C, const std::string &name, instruction *next); +//public: +// static instruction* create(value *A, value *B, value *C, const std::string &name = "", instruction *next = nullptr); +//}; + +class trans_inst: public builtin_inst { +public: + ir::type* get_res_ty(ir::type* in); + +private: + trans_inst(value *arg, const std::string& name, instruction* next); + std::string repr_impl() const { return "trans"; } + +public: + static instruction* create(value *arg, const std::string &name = "", instruction *next = nullptr); +}; + +class select_inst: public builtin_inst { +private: + select_inst(value *pred, value *if_value, value *else_value, const std::string& name, instruction* next); + std::string repr_impl() const { return "select"; } + +public: + static instruction* create(value *pred, value *if_value, value *else_value, const std::string &name = "", instruction *next = nullptr); +}; //===----------------------------------------------------------------------===// // intrinsics classes diff --git a/include/triton/ir/module.h b/include/triton/ir/module.h index 3d2d5afb9..13d99d436 100644 --- a/include/triton/ir/module.h +++ b/include/triton/ir/module.h @@ -66,6 +66,7 @@ public: // Getters value *get_value(const std::string& name, basic_block* block); value *get_value(const std::string& name); + const std::string& get_name(); std::function get_continue_fn(); // Seal block -- no more 
predecessors will be added void seal_block(basic_block *block); diff --git a/include/triton/jit.h b/include/triton/jit.h index c4809d254..b9c502aad 100644 --- a/include/triton/jit.h +++ b/include/triton/jit.h @@ -10,13 +10,15 @@ #include "triton/driver/kernel.h" #include "triton/codegen/selection.h" #include "triton/codegen/tune.h" -#include "triton/codegen/shared_copy.h" -#include "triton/codegen/allocation.h" -#include "triton/codegen/liveness.h" -#include "triton/codegen/vectorize.h" -#include "triton/codegen/buffer_info.h" -#include "triton/codegen/barriers.h" +#include "triton/codegen/optimize_dot.h" +#include "triton/codegen/optimize_cse.h" +#include "triton/codegen/optimize_trans.h" +#include "triton/codegen/shmem_allocation.h" +#include "triton/codegen/shmem_liveness.h" +#include "triton/codegen/shmem_info.h" +#include "triton/codegen/shmem_barriers.h" #include "triton/codegen/target.h" +#include "triton/codegen/vectorize.h" #include namespace llvm { @@ -45,48 +47,59 @@ public: struct passes_wrapper { passes_wrapper(codegen::target* target) - : shared(&buffer_info), liveness(&buffer_info), - allocation(&liveness, &buffer_info), - barriers(&allocation, &buffer_info), + : shmem_liveness(&shmem_info), + shmem_allocation(&shmem_liveness, &shmem_info), + shmem_barriers(&shmem_allocation, &shmem_info), vectorize(&tune), - selection(&allocation, &tune, &buffer_info, target), + selection(&shmem_allocation, &tune, &shmem_info, target), + optimize_dot(&tune), + optimize_cse(), + optimize_trans(), target_(target) { } - void init(ir::module &module) { + void target_independent(ir::module &module) { + optimize_dot.run(module); + optimize_trans.run(module); +// ir::print(module, std::cout); + } + + void target_dependent(ir::module &module) { if(target_->is_gpu()){ - buffer_info.run(module); - shared.run(module); - liveness.run(module); - allocation.run(); - barriers.run(module); + shmem_info.run(module); + shmem_liveness.run(module); + shmem_allocation.run(); + shmem_barriers.run(module); } vectorize.run(module); } codegen::tune tune; - codegen::buffer_info_pass buffer_info; - codegen::place_shared_copy shared; - codegen::liveness liveness; - codegen::allocation allocation; - codegen::barriers barriers; + codegen::shmem_info shmem_info; + codegen::shmem_liveness shmem_liveness; + codegen::shmem_allocation shmem_allocation; + codegen::shmem_barriers shmem_barriers; codegen::vectorize vectorize; codegen::selection selection; + codegen::optimize_dot optimize_dot; + codegen::optimize_cse optimize_cse; + codegen::optimize_trans optimize_trans; codegen::target* target_; }; private: std::string compute_data_layout(bool is_64bit = true, bool use_short_pointers = true); std::unique_ptr make_llvm_module(triton::ir::module &module, passes_wrapper &passes); - std::unique_ptr make_triton_module(const std::string &src); + std::unique_ptr make_triton_module(const std::string &name, const std::string &src); public: jit(driver::context* context); - void autotune(const std::string &src, benchmark_t benchmark); + void autotune(const std::string &name, const std::string &src, benchmark_t benchmark); void add_module(ir::module &module, const std::vector& params = {}); - void add_module(const std::string &src, const std::vector& params = {}); + void add_module(const std::string &name, const std::string &src, const std::vector& params = {}); driver::kernel* get_function(const std::string &name); launch_information get_launch_info(const std::string &name); unsigned get_int(const std::string &name); + driver::buffer 
*get_buffer(const std::string &name); private: std::vector modules_; diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 77ba26464..45906cef8 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -95,55 +95,75 @@ void node::implicit_cast(ir::builder &builder, ir::value *&lhs, ir::value *&rhs, throw std::runtime_error("unreachable"); } -void node::implicit_broadcast(ir::module *mod, ir::value *&arg, ir::type *ty) { - ir::value *tmp = ir::undef_value::get(ty); - implicit_broadcast(mod, arg, tmp); -} - -void node::implicit_broadcast(ir::module *mod, ir::value *&lhs, ir::value *&rhs){ - ir::builder &builder = mod->get_builder(); +void node::implicit_broadcast(ir::module *mod, ir::value *&lhs, ir::value *&rhs) { ir::type *lhs_ty = lhs->get_type(); ir::type *rhs_ty = rhs->get_type(); - ir::type::tile_shapes_t::value_type one = ir::tile_type::make_one(mod->get_context()); - // Both are scalar + ir::type *res_ty = nullptr; if(!lhs_ty->is_tile_ty() && !rhs_ty->is_tile_ty()) return; - // One argument is scalar - if(lhs_ty->is_tile_ty() ^ rhs_ty->is_tile_ty()){ - auto &shapes = lhs_ty->is_tile_ty()?lhs_ty->get_tile_shapes():rhs_ty->get_tile_shapes(); - auto &scalar = lhs_ty->is_tile_ty()?rhs:lhs; - scalar = builder.create_splat(scalar, shapes); + else if(lhs_ty->is_tile_ty() && !rhs_ty->is_tile_ty()) + res_ty = lhs_ty; + else if(!lhs_ty->is_tile_ty() && rhs_ty->is_tile_ty()) + res_ty = rhs_ty; + else{ + auto lhs_shapes = lhs_ty->get_tile_shapes(); + auto rhs_shapes = rhs_ty->get_tile_shapes(); + size_t lhs_size = lhs_shapes.size(); + size_t rhs_size = rhs_shapes.size(); + size_t res_size = std::max(lhs_size, rhs_size); + ir::type::tile_shapes_t res_shapes(res_size); + ir::type::tile_shapes_t::value_type one = ir::tile_type::make_one(mod->get_context()); + for(int i = 0; i < res_size; i++){ + if(i >= res_size - lhs_size && i >= res_size - rhs_size) + res_shapes[i] = lhs_shapes[i]==one?rhs_shapes[i]:lhs_shapes[i]; + else if(i >= res_size - lhs_size) + res_shapes[i] = lhs_shapes[i]; + else if(i >= res_size - rhs_size) + res_shapes[i] = rhs_shapes[i]; + } + res_ty = ir::tile_type::get(lhs_ty->get_scalar_ty(), res_shapes); + } + implicit_broadcast(mod, res_ty, rhs); + implicit_broadcast(mod, res_ty, lhs); +} + +void node::implicit_broadcast(ir::module *mod, ir::type *ty, ir::value *&src){ + ir::builder &builder = mod->get_builder(); + ir::type *src_ty = src->get_type(); + ir::type::tile_shapes_t::value_type one = ir::tile_type::make_one(mod->get_context()); + // Both are scalar + if(!ty->is_tile_ty() && !src_ty->is_tile_ty()) + return; + // Broadcast scalar + if(ty->is_tile_ty() && !src_ty->is_tile_ty()){ + src = builder.create_splat(src, ty->get_tile_shapes()); + return; + } + // Downcast tile + if(!ty->is_tile_ty() && src_ty->is_tile_ty()){ + for(ir::constant *shape: src_ty->get_tile_shapes()) + if(shape != one) + throw std::runtime_error("cannot downcast"); + src = builder.create_downcast(src); return; } // Both are arrays - auto lhs_shapes = lhs->get_type()->get_tile_shapes(); - auto rhs_shapes = rhs->get_type()->get_tile_shapes(); - if(lhs_shapes == rhs_shapes) - return; - int lhs_dim = lhs_shapes.size(); - int rhs_dim = rhs_shapes.size(); - auto &shortest = (lhs_dim < rhs_dim)?lhs_shapes:rhs_shapes; - auto &longest = (lhs_dim < rhs_dim)?rhs_shapes:lhs_shapes; - size_t ndim = longest.size(); - int off = longest.size() - shortest.size(); - for(int i = longest.size() - 1; i>= 0; i--){ - if(shortest[off + i] != longest[i] && shortest[off + i] != one && longest[i] != one) - throw 
std::runtime_error("cannot broadcast"); - } + auto dst_shapes = ty->get_tile_shapes(); + auto src_shapes = src_ty->get_tile_shapes(); + int dst_dim = dst_shapes.size(); + int src_dim = src_shapes.size(); // Pad + int off = dst_dim - src_dim; for(size_t i = 0; i < off; i++) - shortest.insert(shortest.begin(), one); - ir::value *&target = (lhs_dim < rhs_dim)?lhs:rhs; + src_shapes.insert(src_shapes.begin(), one); if(off > 0) - target = builder.create_reshape(target, shortest); + src = builder.create_reshape(src, src_shapes); // Broadcast - ir::type::tile_shapes_t shapes(ndim); - for(size_t i = 0; i < ndim; i++) - shapes[i] = shortest[i]==one?longest[i]:shortest[i]; - if(shapes != lhs_shapes) - lhs = builder.create_broadcast(lhs, shapes); - if(shapes != rhs_shapes) - rhs = builder.create_broadcast(rhs, shapes); + for(int i = dst_dim - 1; i>= 0; i--) + if(dst_shapes[i] != src_shapes[i] && dst_shapes[i] != one && src_shapes[i] != one) + throw std::runtime_error("cannot broadcast"); + if(dst_shapes != src_shapes) + src = builder.create_broadcast(src, dst_shapes); } /* Helper */ @@ -336,7 +356,9 @@ ir::value* iteration_statement::codegen(ir::module *mod) const{ return builder.create_cond_br(cond, loop_bb, next_bb); }); init_->codegen(mod); - builder.create_br(loop_bb); + ir::value *cond = stop_->codegen(mod); + builder.create_cond_br(cond, loop_bb, next_bb); +// builder.create_br(loop_bb); builder.set_insert_point(loop_bb); if(!is_terminator(statements_->codegen(mod))) mod->get_continue_fn()(); @@ -378,6 +400,7 @@ ir::value* selection_statement::codegen(ir::module* mod) const{ builder.create_br(endif_bb); } // Endif + mod->seal_block(endif_bb); builder.set_insert_point(endif_bb); return nullptr; } @@ -422,7 +445,7 @@ ir::value* initializer::codegen(ir::module * mod) const{ else if(expr_){ value = expr_->codegen(mod); value = explicit_cast(mod->get_builder(), value, ty); - implicit_broadcast(mod, value, ty); + implicit_broadcast(mod, ty, value); } value->set_name(name); mod->set_value(name, value); @@ -543,6 +566,19 @@ ir::value* get_global_range::codegen(ir::module *mod) const { return builder.create_get_global_range(axis_->value(), (ir::constant_int*)size_->codegen(mod)); } +// get_range_id +ir::value* get_range_id::codegen(ir::module *mod) const { + return mod->get_builder().create_get_range_id(axis_->value()); +} + +// atomic cas +ir::value* atomic_cas::codegen(ir::module *mod) const { + ir::value *ptr = ptr_->codegen(mod); + ir::value *cmp = cmp_->codegen(mod); + ir::value *val = val_->codegen(mod); + return mod->get_builder().create_atomic_cas(ptr, cmp, val); +} + // matmul ir::value* matmul_expression::codegen(ir::module *mod) const { ir::value *A = A_->codegen(mod); @@ -554,10 +590,37 @@ ir::value* matmul_expression::codegen(ir::module *mod) const { // ir::type *tile_ty = ir::tile_type::get(scalar_ty, {M, N}); // ir::value *tmp = ir::undef_value::get(tile_ty); // implicit_broadcast(mod, tmp, C); - return mod->get_builder().create_matmul(A, B, C); + return mod->get_builder().create_dot(A, B, C); } +// min +ir::value* min_expression::codegen(ir::module *mod) const { + ir::value* cmp = binary_operator(LT, (node*)x_, (node*)y_).codegen(mod); + ir::value* x = ((ir::cmp_inst*)cmp)->get_operand(0); + ir::value* y = ((ir::cmp_inst*)cmp)->get_operand(1); + return mod->get_builder().create_select(cmp, x, y); +} +// max +ir::value* max_expression::codegen(ir::module *mod) const { + ir::value* cmp = binary_operator(GT, (node*)x_, (node*)y_).codegen(mod); + ir::value* x = 
((ir::cmp_inst*)cmp)->get_operand(0); + ir::value* y = ((ir::cmp_inst*)cmp)->get_operand(1); + return mod->get_builder().create_select(cmp, x, y); +} + +// select +ir::value* select_expression::codegen(ir::module *mod) const { + ir::value* pred = pred_->codegen(mod); + ir::value* if_value = if_value_->codegen(mod); + ir::value* else_value = else_value_->codegen(mod); + return mod->get_builder().create_select(pred, if_value, else_value); +} + +// Trans +ir::value* trans_expression::codegen(ir::module *mod) const { + return mod->get_builder().create_trans(arg_->codegen(mod)); +} /* Postfix expression */ ir::value* indexing_expression::codegen(ir::module *mod) const{ @@ -573,6 +636,7 @@ ir::value* indexing_expression::codegen(ir::module *mod) const{ return mod->get_builder().create_reshape(in, out_shapes); } + /* Unary operator */ ir::value *unary_operator::llvm_op(ir::builder &builder, ir::value *arg, const std::string &name) const{ ir::type *atype = arg->get_type(); @@ -666,7 +730,7 @@ ir::value *assignment_expression::codegen(ir::module *mod) const{ if(auto *x = dynamic_cast(lvalue_)){ ir::type *ty = mod->get_scope().types.at(x->id()->name()); rvalue = explicit_cast(mod->get_builder(), rvalue, ty); - implicit_broadcast(mod, rvalue, ty); + implicit_broadcast(mod, ty, rvalue); mod->set_value(x->id()->name(), rvalue); } else if(auto* x = dynamic_cast(lvalue_)){ diff --git a/lib/codegen/buffer_info.cpp b/lib/codegen/buffer_info.cpp deleted file mode 100644 index dff371a64..000000000 --- a/lib/codegen/buffer_info.cpp +++ /dev/null @@ -1,90 +0,0 @@ -#include "triton/codegen/buffer_info.h" -#include "triton/ir/module.h" -#include "triton/ir/function.h" -#include "triton/ir/basic_block.h" -#include "triton/ir/instructions.h" -#include "triton/ir/type.h" - -namespace triton { - -namespace codegen{ - - -// run pass on module -bool buffer_info_pass::is_loop_latch(ir::phi_node *phi, ir::value *terminator){ - if(auto *br = dynamic_cast(terminator)) - return br->get_true_dest() == phi->get_parent() - || br->get_false_dest() == phi->get_parent(); - else if(auto *br = dynamic_cast(terminator)) - return false; - else - throw std::runtime_error("unreachable"); -} - -void buffer_info_pass::replace(ir::value* before, ir::value *after) { - shared_.erase(before); - shared_.insert(after); - if(refs_.find(before) != refs_.end()){ - ir::value* v = refs_.at(before); - refs_.erase(before); - refs_.insert({after, v}); - } -} - -void buffer_info_pass::run(ir::module &mod) { - // Find which buffers are shared - for(ir::function *fn: mod.get_function_list()) - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *i: block->get_inst_list()) - if(dynamic_cast(i)){ - shared_.insert(i->get_operand(0)); - shared_.insert(i->get_operand(1)); - } - - // Handles phi nodes - for(ir::function *fn: mod.get_function_list()) - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *i: block->get_inst_list()) { - if(!i->get_type()->is_tile_ty()) - continue; - // handle phi - if(auto *phi = dynamic_cast(i)) - if(is_shared(phi)){ - // determine if the value is in shared memory - bool is_double = false; - for(unsigned n = 0; n < phi->get_num_incoming(); n++){ - ir::basic_block *inc_block = phi->get_incoming_block(n); - ir::value *terminator = inc_block->get_inst_list().back(); - is_double = is_double || is_loop_latch(phi, terminator); - } - // add to double-buffered - if(is_double) - double_.insert(phi); - // set references of input - for(unsigned n = 0; n < phi->get_num_incoming(); n++){ - ir::value *inc_val = 
phi->get_incoming_value(n); - refs_[inc_val] = phi; - } - } - } - - for(auto &ref: refs_) - shared_.insert(ref.first); -} - -// query double-buffered status -bool buffer_info_pass::is_double(ir::value *x) -{ return double_.find(x) != double_.end(); } - -// query shared status -bool buffer_info_pass::is_shared(ir::value *x) -{ return shared_.find(x) != shared_.end(); } - -// get reference if any -ir::value *buffer_info_pass::get_reference(ir::value *x) -{ return refs_[x]; } - - - -} -} diff --git a/lib/codegen/layout.cpp b/lib/codegen/layout.cpp deleted file mode 100644 index 0722321b8..000000000 --- a/lib/codegen/layout.cpp +++ /dev/null @@ -1,56 +0,0 @@ -#include "triton/codegen/layout.h" -#include "triton/ir/function.h" -#include "triton/ir/module.h" -#include "triton/ir/basic_block.h" -#include "triton/ir/instructions.h" - -namespace triton{ -namespace codegen{ - - -shared_view_info layout::get_shared_view(ir::value *v, unsigned idx){ - return shared_views_.at(v)[idx]; -} - -unsigned layout::get_num_shared_views(ir::value *v){ - return shared_views_.at(v).size(); -} - -// Phi node -void layout::add_phi_nodes(ir::value *v){ - if(ir::phi_node *phi = dynamic_cast(v)) - if(shared_views_.find(phi) != shared_views_.end()) - for(ir::value *v: phi->ops()){ - shared_views_[v] = shared_views_[phi]; - for(shared_view_info &info: shared_views_[v]) - info.has_dedicated_storage = false; - } -} - -// Memory Layout -void layout::add_shared_views(ir::value *v){ - // GEMM has shared inputs - if(dynamic_cast(v)) - shared_views_[v].push_back({v, true}); - if(dynamic_cast(v)) - shared_views_[v].push_back({v, true}); -} - -// Entry point -void layout::run(ir::module &mod) { -for(ir::function *fn: mod.get_function_list()){ - // Non-phis - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *instr: block->get_inst_list()) { - add_shared_views(instr); - } - // Phi nodes - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *instr: block->get_inst_list()) { - add_phi_nodes(instr); - } -} -} - -} -} diff --git a/lib/codegen/loop_info.cpp b/lib/codegen/loop_info.cpp deleted file mode 100644 index e69de29bb..000000000 diff --git a/lib/codegen/optimize_cse.cpp b/lib/codegen/optimize_cse.cpp new file mode 100644 index 000000000..b0c07a99e --- /dev/null +++ b/lib/codegen/optimize_cse.cpp @@ -0,0 +1,14 @@ +#include "triton/ir/function.h" +#include "triton/ir/basic_block.h" +#include "triton/ir/module.h" +#include "triton/codegen/optimize_cse.h" + +namespace triton { +namespace codegen{ + + +void optimize_cse::run(ir::module &mod) { +} + +} +} diff --git a/lib/codegen/optimize_dot.cpp b/lib/codegen/optimize_dot.cpp new file mode 100644 index 000000000..67e3f8569 --- /dev/null +++ b/lib/codegen/optimize_dot.cpp @@ -0,0 +1,50 @@ +#include "triton/ir/function.h" +#include "triton/ir/basic_block.h" +#include "triton/ir/module.h" +#include "triton/codegen/optimize_dot.h" +#include "triton/codegen/tune.h" + +namespace triton { +namespace codegen{ + +inline bool is_trans(ir::value *v){ + return dynamic_cast(v) != nullptr; +} + +void optimize_dot::run(ir::module &mod) { + ir::builder &builder = mod.get_builder(); + std::vector to_delete; + // iterate + for(ir::function *fn: mod.get_function_list()) + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *i: block->get_inst_list()) + if(auto dot = dynamic_cast(i)) + if(dot->get_operand(1)->get_type()->get_tile_shapes()[1]->get_value() != 1) + if(!dot->is_a_trans() && !dot->is_b_trans()){ + builder.set_insert_point(i); + ir::value *A = 
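// Algebra behind the rewrite below (illustrative): create_nt(A, X) computes
// A * X^T, so every dot with a non-trivial reduction axis is canonicalized
// to the NT form:
//   dot(A, trans(B0)) == create_nt(A, B0)        // fold the existing trans
//   dot(A, B)         == create_nt(A, trans(B))  // insert a trans
// In both cases the original dot (and any folded trans) is queued in
// to_delete and erased once iteration over the block is finished.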
dot->get_operand(0); + ir::value *B = dot->get_operand(1); + ir::value *D = dot->get_operand(2); + // dot(op(a), trans(b)) + if(is_trans(B)){ + ir::value* BN = ((ir::trans_inst*)B)->get_operand(0); + ir::instruction *NT = builder.insert(ir::dot_inst::create_nt(A, BN, D)); + dot->replace_all_uses_with(NT); + to_delete.push_back((ir::instruction*)B); + to_delete.push_back(dot); + } + // dot(op(a), b) + if(!is_trans(B)){ + ir::value* BT = builder.create_trans(B); + ir::instruction *NT = builder.insert(ir::dot_inst::create_nt(A, BT, D)); + dot->replace_all_uses_with(NT); + to_delete.push_back(dot); + } + } + + for(ir::instruction* i: to_delete) + i->erase_from_parent(); +} + +} +} diff --git a/lib/codegen/optimize_trans.cpp b/lib/codegen/optimize_trans.cpp new file mode 100644 index 000000000..b6ad7cfd2 --- /dev/null +++ b/lib/codegen/optimize_trans.cpp @@ -0,0 +1,71 @@ +#include "triton/ir/module.h" +#include "triton/ir/function.h" +#include "triton/codegen/optimize_trans.h" + +namespace triton { +namespace codegen{ + + +ir::value* optimize_trans::replace_phi(ir::value* value, + std::vector& to_delete, + ir::builder& builder){ + if(auto phi = dynamic_cast(value)) { + // transpose operands + std::vector incs; + for(unsigned n = 0; n < phi->get_num_incoming(); n++) + incs.push_back(replace_phi(phi->get_incoming_value(n), to_delete, builder)); + // create phi for transposed values + builder.set_insert_point(phi); + ir::phi_node* result = builder.create_phi(incs[0]->get_type(), incs.size(), phi->get_name()); + for(unsigned n = 0; n < phi->get_num_incoming(); n++) + result->add_incoming(incs[n], phi->get_incoming_block(n)); + phi->replace_all_uses_with(result); + to_delete.push_back(phi); + return result; + } + else if(auto i = dynamic_cast(value)){ + ir::basic_block* block = i->get_parent(); + auto it = std::find(block->begin(), block->end(), i); + it++; + builder.set_insert_point(it); + ir::instruction *trans = (ir::instruction*)builder.create_trans(i); + i->replace_all_uses_with(trans); + trans->set_operand(0, i); + return trans; + } + throw std::runtime_error("cannot transpose phi"); +} + + +void optimize_trans::run(ir::module &mod) { + ir::builder &builder = mod.get_builder(); + std::vector to_delete; + // iterate + for(ir::function *fn: mod.get_function_list()) + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction* i: block->get_inst_list()){ + // filter transposition + if(auto trans = dynamic_cast(i)) { + auto users = trans->get_users(); + auto ops = trans->ops(); + if(users.size() > 1 || ops.size() > 1) + continue; + ir::value* op = *ops.begin(); + // chains of transpositions + // TODO + + // trans(phi) -> phi(trans(), trans()...) 
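// Illustrative before/after for this rewrite (value names are made up):
//   before:  %p = phi [%a, %pre], [%b, %latch]
//            %t = trans(%p)
//   after:   %ta = trans(%a)   // inserted right after %a is defined
//            %tb = trans(%b)   // inserted right after %b is defined
//            %q  = phi [%ta, %pre], [%tb, %latch]
// replace_phi recurses through nested phis, transposes each non-phi
// incoming value where it is defined, and returns the transposed phi so
// the original trans can be deleted.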
+ if(dynamic_cast(op)){ + ir::value* new_phi = replace_phi(op, to_delete, builder); + to_delete.push_back(trans); + trans->replace_all_uses_with(new_phi); + } + } + } + // erase dead code + for(ir::instruction* i: to_delete) + i->erase_from_parent(); +} + +} +} diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 546b0e76f..c04b4cdfb 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -1,6 +1,6 @@ #include "triton/codegen/selection.h" #include "triton/codegen/tune.h" -#include "triton/codegen/allocation.h" +#include "triton/codegen/shmem_allocation.h" #include "triton/codegen/target.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Module.h" @@ -309,7 +309,47 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::function(inst)){ Value *ptr = value(ii->get_pointer_operand()); - return builder.Insert(new LoadInst(ptr)); + LoadInst *result = new LoadInst(ptr); + return builder.Insert(result); + } + if(ir::store_inst* ii = dynamic_cast(inst)){ + Value *val = value(ii->get_value_operand()); + Value *ptr = value(ii->get_pointer_operand()); + builder.CreateStore(val, ptr); + return nullptr; + } + if(ir::select_inst* ii = dynamic_cast(inst)){ + Value *pred = value(ii->get_operand(0)); + Value *if_value = value(ii->get_operand(1)); + Value *else_value = value(ii->get_operand(2)); + return builder.Insert(SelectInst::Create(pred, if_value, else_value)); + } + if(ir::get_range_id_inst* ii = dynamic_cast(inst)){ + Value *offset = tgt_->get_block_id(builder.GetInsertBlock()->getModule(), builder, ii->get_axis()); + return (Instruction*)builder.CreateAdd(offset, builder.getInt32(0)); + } + if(ir::atomic_cas_inst* ii = dynamic_cast(inst)){ + BasicBlock *current = builder.GetInsertBlock(); + Module *module = current->getModule(); + Value *tid = tgt_->get_local_id(module, builder, 0); + Value *pred = builder.CreateICmpEQ(tid, builder.getInt32(0)); + BasicBlock *tid_0_bb = BasicBlock::Create(ctx, "tid_0", current->getParent()); + BasicBlock *tid_0_done_bb = BasicBlock::Create(ctx, "tid_0_done", current->getParent()); + Value *ptr = builder.CreateGEP(sh_mem_ptr_, builder.getInt32(alloc_->get_offset(ii))); + ptr = builder.CreateBitCast(ptr, PointerType::get(builder.getInt32Ty(), ptr->getType()->getPointerAddressSpace())); + builder.CreateCondBr(pred, tid_0_bb, tid_0_done_bb); + builder.SetInsertPoint(tid_0_bb); + Value *cas_ptr = value(ii->get_operand(0)); + Value *cas_cmp = value(ii->get_operand(1)); + Value *cas_val = value(ii->get_operand(2)); + Value *old = builder.CreateAtomicCmpXchg(cas_ptr, cas_cmp, cas_val, AtomicOrdering::Monotonic, AtomicOrdering::Monotonic); + old = builder.CreateExtractValue(old, {0}); + builder.CreateStore(old, ptr); + builder.CreateBr(tid_0_done_bb); + builder.SetInsertPoint(tid_0_done_bb); + tgt_->add_barrier(module, builder); + Value *res = builder.CreateLoad(ptr); + return (Instruction*)res; } // unknown instruction throw std::runtime_error("unknown conversion from ir::instruction to Instruction"); @@ -446,7 +486,7 @@ void selection::create_grids(std::vector &grids, bind_references(op); // bind const auto& shapes = v->get_type()->get_tile_shapes(); - if(dynamic_cast(v) || buffer_info_->is_double(v)) + if(buffer_info_->is_shared(v)) return; for(size_t d = 0; d < shapes.size(); d++){ if(shapes[d]->get_value() == 1) @@ -490,20 +530,11 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, shapes2.push_back(shape->get_value()); Type* ty = llvm_type(v->get_type()->get_scalar_ty(), ctx); // create shared tile - 
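// Double-buffering sketch for a loop-carried shared tile (illustrative;
// the allocator reserves twice the tile size for values marked is_double):
//   pre_ptr  = base + get_offset(phi)   // half read on iteration i
//   next_ptr = LLVM pointer phi that flips to the other half each trip
// Each incoming value of the tile phi then stores through pre_ptr or
// next_ptr depending on whether its block is the loop latch, so reads of
// the current iteration never alias the prefetch for the next one.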
if(dynamic_cast(v) || (buffer_info_->is_double(v))){ + if(buffer_info_->is_shared(v)){ // shared copy PointerType *ptr_ty = ty->getPointerTo(sh_mem_ptr->getType()->getPointerAddressSpace()); - // TODO - buffer info not up-to-date with references - if(dynamic_cast(v)) { - if(!has_phi_user(v)){ - size_t offset = alloc_->get_offset(v); - Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset)); - ptr = builder.CreateBitCast(ptr, ptr_ty); - tmap_.insert({v, new shared_tile(ty, shapes2, ptr, builder)}); - } - } // phi-node (double-buffering) - else if(auto *phi = dynamic_cast(v)) { + if(auto *phi = dynamic_cast(v)) { BasicBlock *parent = (BasicBlock*)vmap_[phi->get_parent()]; unsigned id_pre = 0, id_loop = 1; if(phi->get_incoming_block(0) == phi->get_parent()) @@ -522,13 +553,19 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, for(unsigned i = 0; i < phi->get_num_incoming(); i++) { ir::basic_block* inc_block = phi->get_incoming_block(i); ir::value* inc_value = phi->get_incoming_value(i); - ir::value* terminator = inc_block->get_inst_list().back(); + ir::instruction* terminator = inc_block->get_inst_list().back(); bool is_loop_latch = buffer_info_->is_loop_latch(phi, terminator); tmap_.insert({inc_value, new shared_tile(ty, shapes2, is_loop_latch?next_ptr:pre_ptr, builder)}); } } - else - throw std::runtime_error("unknown shared memory tile"); + else { + if(!has_phi_user(v)){ + size_t offset = alloc_->get_offset(v); + Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset)); + ptr = builder.CreateBitCast(ptr, ptr_ty); + tmap_.insert({v, new shared_tile(ty, shapes2, ptr, builder)}); + } + } } // create distributed tile else { @@ -607,10 +644,16 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & tile *value = tmap_.at(x->get_value_operand()); ptr->for_each([&](indices_t idx){ set_mask_insert_pt(idx); - builder.CreateStore(value->get_value(idx), ptr->get_value(idx)); + StoreInst *store = new StoreInst(value->get_value(idx), ptr->get_value(idx)); +// store->setAlignment(16); + builder.Insert(store); }); } else { + if(auto *x = dynamic_cast(ins)){ + vmap_[x] = tmap_[x->get_operand(0)]->get_value({builder.getInt32(0)}); + return; + } tile *ti = tmap_[ins]; distributed_tile* result = (distributed_tile*)ti; if(!ins->get_type()->is_tile_ty()) @@ -727,31 +770,67 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & ti->set_value(idx, in->get_value(idx)); }); } - else if(dynamic_cast(ins) || (buffer_info_->is_double(ins))) + // trans + else if(dynamic_cast(ins)) { + distributed_tile* in = (distributed_tile*)tmap_.at(ins->get_operand(0)); + in->for_each([&](indices_t idx){ + indices_t out_idx = idx; + std::rotate(out_idx.begin(), out_idx.begin() + 1, out_idx.end()); + ti->set_value(out_idx, in->get_value(idx)); + }); + } + else if(buffer_info_->is_shared(ins)) return; - // matrix multiplication - else if(dynamic_cast(ins)) { + // dot + else if(auto dot = dynamic_cast(ins)) { ir::value *A = ins->get_operand(0); ir::value *B = ins->get_operand(1); ir::value *C = ins->get_operand(2); - shared_tile *TA = (shared_tile*)tmap_.at(A); - shared_tile *TB = (shared_tile*)tmap_.at(B); + bool AT = dot->is_a_trans(); + bool BT = dot->is_b_trans(); distributed_tile *TC = (distributed_tile*)tmap_.at(C); - TA->set_vector_size(TC->axis(0).contiguous); - TB->set_vector_size(TC->axis(1).contiguous); Function *f_mul_add = Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {llvm_type(C->get_type()->get_scalar_ty(), ctx)}); 
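// Scalarized view of the tile-level dot emitted below (host-side sketch
// with illustrative names): every output element (i, j) accumulates over
// the reduction axis with fused multiply-add, and the AT/BT flags only
// change how the shared tiles are indexed:
//
//   float acc = C[i][j];
//   for (unsigned k = 0; k < NK; ++k) {
//     float a = AT ? A[k][i] : A[i][k];
//     float b = BT ? B[j][k] : B[k][j];
//     acc = std::fma(a, b, acc);   // mirrors the llvm.fmuladd call
//   }
//
// The outer-product case (NK == 1) below uses the same recurrence with k
// fixed to 0, fed from distributed rather than shared tiles.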
- result->for_each([&](indices_t idx){ - Value *res = TC->get_value(idx); - unsigned NK = A->get_type()->get_tile_shapes()[1]->get_value(); - for(unsigned K = 0; K < NK; ++K){ - indices_t a_idx = {idx[0], builder.getInt32(K)}; - indices_t b_idx = {idx[1], builder.getInt32(K)}; + if(dot->get_operand(0)->get_type()->get_tile_shapes()[1]->get_value() != 1) + { + shared_tile *TA = (shared_tile*)tmap_.at(A); + shared_tile *TB = (shared_tile*)tmap_.at(B); + TA->set_vector_size(TC->axis(0).contiguous); + TB->set_vector_size(TC->axis(1).contiguous); + result->for_each([&](indices_t idx){ + Value *res = TC->get_value(idx); + unsigned NK = A->get_type()->get_tile_shapes()[1]->get_value(); + for(unsigned K = 0; K < NK; ++K){ + indices_t a_idx = {idx[0], builder.getInt32(K)}; + indices_t b_idx = {builder.getInt32(K), idx[1]}; + if(AT) + std::swap(a_idx[0], a_idx[1]); + if(BT) + std::swap(b_idx[0], b_idx[1]); + Value *a = TA->get_value(a_idx); + Value *b = TB->get_value(b_idx); + res = builder.CreateCall(f_mul_add, {a, b, res}); + } + result->set_value(idx, res); + }); + } + else + { + distributed_tile *TA = (distributed_tile*)tmap_.at(A); + distributed_tile *TB = (distributed_tile*)tmap_.at(B); + result->for_each([&](indices_t idx){ + Value *res = TC->get_value(idx); + indices_t a_idx = {idx[0], builder.getInt32(0)}; + indices_t b_idx = {builder.getInt32(0), idx[1]}; + if(AT) + std::swap(a_idx[0], a_idx[1]); + if(BT) + std::swap(b_idx[0], b_idx[1]); Value *a = TA->get_value(a_idx); Value *b = TB->get_value(b_idx); res = builder.CreateCall(f_mul_add, {a, b, res}); - } - result->set_value(idx, res); - }); + result->set_value(idx, res); + }); + } } // element-wise else { @@ -858,6 +937,7 @@ void selection::run(ir::module &src, Module &dst) { nullptr, "__shared_ptr", nullptr, GlobalVariable::NotThreadLocal, 3); sh_mem_ptr = dst_builder.CreateBitCast(sh_mem_array, ptr_ty); } + sh_mem_ptr_ = sh_mem_ptr; // create grids init_grids(fn, dst_builder, sh_mem_ptr); @@ -890,7 +970,7 @@ void selection::run(ir::module &src, Module &dst) { for(unsigned n = 0; n < phi->get_num_incoming(); n++){ ir::basic_block* inc_block = phi->get_incoming_block(n); ir::value* inc_val = phi->get_incoming_value(n); - ir::value* terminator = inc_block->get_inst_list().back(); + ir::instruction* terminator = inc_block->get_inst_list().back(); BasicBlock *llvm_inc_block = last_block.at(inc_block); shared_tile *inc_shared = (shared_tile*)tmap_.at(inc_val); bool is_loop_latch = buffer_info_->is_loop_latch(phi, terminator); @@ -920,8 +1000,8 @@ void selection::run(ir::module &src, Module &dst) { }); } else { - PHINode *llvm_phi = (PHINode*)vmap_.at(phi); - Value *llvm_inc_val = vmap_.at(inc_val); + PHINode *llvm_phi = (PHINode*)llvm_value(phi, dst_builder); + Value *llvm_inc_val = llvm_value(inc_val, dst_builder); llvm_phi->addIncoming(llvm_inc_val, llvm_inc_block); } } diff --git a/lib/codegen/shared_copy.cpp b/lib/codegen/shared_copy.cpp deleted file mode 100644 index 6c05b7807..000000000 --- a/lib/codegen/shared_copy.cpp +++ /dev/null @@ -1,40 +0,0 @@ -#include -#include "triton/codegen/shared_copy.h" -#include "triton/codegen/buffer_info.h" -#include "triton/ir/module.h" -#include "triton/ir/function.h" -#include "triton/ir/basic_block.h" -#include "triton/ir/instructions.h" - -namespace triton { - -namespace codegen{ - -void place_shared_copy::add_copy(ir::value *x, ir::builder &builder) { - if(auto *i = dynamic_cast(x)){ - ir::basic_block* block = i->get_parent(); - auto it = std::find(block->begin(), block->end(), i); - 
builder.set_insert_point(++it); - } - ir::instruction *rx = (ir::instruction*)builder.create_copy_to_shared(x); - x->replace_all_uses_with(rx); - rx->set_operand(0, x); -} - -void place_shared_copy::run(ir::module &mod) { - ir::builder &builder = mod.get_builder(); - for(ir::function *fn: mod.get_function_list()) - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *i: block->get_inst_list()) - if(info_->is_shared(i) && !info_->is_double(i)) - add_copy(i, builder); - - for(ir::function *fn: mod.get_function_list()) - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *i: block->get_inst_list()) - if(auto* cts = dynamic_cast(i)) - info_->replace(cts->get_operand(0), cts); -} - -} -} diff --git a/lib/codegen/allocation.cpp b/lib/codegen/shmem_allocation.cpp similarity index 91% rename from lib/codegen/allocation.cpp rename to lib/codegen/shmem_allocation.cpp index fd272a243..43ab8bc39 100644 --- a/lib/codegen/allocation.cpp +++ b/lib/codegen/shmem_allocation.cpp @@ -1,7 +1,6 @@ -#include "triton/codegen/allocation.h" -#include "triton/codegen/liveness.h" -#include "triton/codegen/layout.h" -#include "triton/codegen/buffer_info.h" +#include "triton/codegen/shmem_allocation.h" +#include "triton/codegen/shmem_liveness.h" +#include "triton/codegen/shmem_info.h" #include "triton/ir/basic_block.h" #include "triton/ir/type.h" #include "triton/ir/value.h" @@ -11,14 +10,14 @@ namespace triton{ namespace codegen{ -unsigned allocation::get_num_bytes(ir::value *x) { - unsigned result = x->get_type()->get_tile_bitwidth() / 8; +unsigned shmem_allocation::get_num_bytes(ir::value *x) { + unsigned result = x->get_type()->get_primitive_size_in_bits() / 8; if(buffer_info_->is_double(x)) result *= 2; return result; } -void allocation::run(){ +void shmem_allocation::run(){ using std::max; using std::min; typedef std::multimap triples_map_type; diff --git a/lib/codegen/barriers.cpp b/lib/codegen/shmem_barriers.cpp similarity index 75% rename from lib/codegen/barriers.cpp rename to lib/codegen/shmem_barriers.cpp index bb3611f85..717b927fd 100644 --- a/lib/codegen/barriers.cpp +++ b/lib/codegen/shmem_barriers.cpp @@ -1,7 +1,7 @@ #include -#include "triton/codegen/barriers.h" -#include "triton/codegen/allocation.h" -#include "triton/codegen/buffer_info.h" +#include "triton/codegen/shmem_barriers.h" +#include "triton/codegen/shmem_allocation.h" +#include "triton/codegen/shmem_info.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" @@ -12,7 +12,7 @@ namespace triton { namespace codegen{ -bool barriers::intersect(const interval_vec_t &X, interval_t x) { +bool shmem_barriers::intersect(const interval_vec_t &X, interval_t x) { return std::any_of(X.begin(), X.end(), [&](const interval_t &y){ bool left_intersect = y.first <= x.first && x.first < y.second; bool right_intersect = y.first <= x.second && x.second < y.second; @@ -20,31 +20,31 @@ bool barriers::intersect(const interval_vec_t &X, interval_t x) { }); } -bool barriers::intersect(const interval_vec_t &X, const interval_vec_t &Y) { +bool shmem_barriers::intersect(const interval_vec_t &X, const interval_vec_t &Y) { return std::any_of(Y.begin(), Y.end(), [&](const interval_t &y){ return intersect(X, y); }); } -void barriers::add_reference(ir::value *v, interval_vec_t &res){ - if(dynamic_cast(v)){ +void shmem_barriers::add_reference(ir::value *v, interval_vec_t &res){ + if(buffer_info_->is_shared(v) && !dynamic_cast(v)){ unsigned offset = alloc_->get_offset(v); unsigned num_bytes = 
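// Hazard rule used by this pass (sketch): every shared-memory access is
// reduced to a byte interval [offset, offset + num_bytes), and a barrier
// is inserted when a read interval overlaps the pending-write set
// (read-after-write) or a write interval overlaps the pending-read set
// (write-after-read), i.e. for half-open intervals:
//
//   using interval_t = std::pair<unsigned, unsigned>;  // [first, second)
//   bool overlaps(interval_t a, interval_t b) {
//     return a.first < b.second && b.first < a.second;
//   }
//
// Writes into a double-buffered phi are exempt from the write-after-read
// check, since the two halves of the buffer never alias in one iteration.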
alloc_->get_num_bytes(v); res.push_back(interval_t(offset, offset + num_bytes)); } } -void barriers::get_read_intervals(ir::instruction *i, interval_vec_t &res){ +void shmem_barriers::get_read_intervals(ir::instruction *i, interval_vec_t &res){ for(ir::value *op: i->ops()) add_reference(op, res); } -void barriers::get_written_intervals(ir::instruction *i, interval_vec_t &res){ +void shmem_barriers::get_written_intervals(ir::instruction *i, interval_vec_t &res){ if(!dynamic_cast(i)) add_reference(i, res); } -void barriers::insert_barrier(ir::instruction *instr, ir::builder &builder) { +void shmem_barriers::insert_barrier(ir::instruction *instr, ir::builder &builder) { if(auto *phi = dynamic_cast(instr)) { std::set incoming; for(unsigned n = 0; n < phi->get_num_incoming(); n++){ @@ -63,16 +63,16 @@ void barriers::insert_barrier(ir::instruction *instr, ir::builder &builder) { } } -barriers::interval_vec_t barriers::join(const std::vector& intervals) { - barriers::interval_vec_t result; +shmem_barriers::interval_vec_t shmem_barriers::join(const std::vector& intervals) { + shmem_barriers::interval_vec_t result; for(auto x: intervals) for(interval_t i: x) result.push_back(i); return result; } -std::pair barriers::transfer(ir::basic_block *block, +std::pair shmem_barriers::transfer(ir::basic_block *block, const interval_vec_t &written_to, const interval_vec_t &read_from, std::set& insert_loc) { @@ -83,13 +83,13 @@ std::pair(i) && + if(buffer_info_->is_shared(i) && buffer_info_->is_double(buffer_info_->get_reference(i))) - written_while_read = false; - if(read_while_written || written_while_read) { + write_after_read = false; + if(read_after_write || write_after_read) { insert_loc.insert(i); new_written_to.clear(); new_read_from.clear(); @@ -100,7 +100,7 @@ std::pair rpo = ir::cfg::reverse_post_order(fn); diff --git a/lib/codegen/shmem_info.cpp b/lib/codegen/shmem_info.cpp new file mode 100644 index 000000000..6d3caafab --- /dev/null +++ b/lib/codegen/shmem_info.cpp @@ -0,0 +1,135 @@ +#include "triton/codegen/shmem_info.h" +#include "triton/ir/module.h" +#include "triton/ir/function.h" +#include "triton/ir/basic_block.h" +#include "triton/ir/instructions.h" +#include "triton/ir/type.h" + +namespace triton { + +namespace codegen{ + + +// run pass on module +bool shmem_info::is_loop_latch(ir::phi_node *phi, ir::instruction *terminator){ + if(phi->get_parent() != terminator->get_parent()) + return false; + if(auto *br = dynamic_cast(terminator)) + return br->get_true_dest() == phi->get_parent() + || br->get_false_dest() == phi->get_parent(); + else if(auto *br = dynamic_cast(terminator)) + return false; + else + throw std::runtime_error("unreachable"); +} + +void shmem_info::replace(ir::value* before, ir::value *after) { + shared_.erase(before); + shared_.insert(after); + if(refs_.find(before) != refs_.end()){ + ir::value* v = refs_.at(before); + refs_.erase(before); + refs_.insert({after, v}); + } +} + +inline bool get_is_shared(ir::value* v) { + if(auto x = dynamic_cast(v)) + return true; + if(auto x = dynamic_cast(v)) + return true; + if(auto x = dynamic_cast(v)) + return true; + if(auto x = dynamic_cast(v)){ + bool res = true; + for(unsigned inc = 0; inc < x->get_num_incoming(); inc++) + res = res && get_is_shared(x->get_incoming_value(inc)); + return res; + } + return false; +} + +void add_copy(ir::value *x, ir::builder &builder) { + if(auto phi = dynamic_cast(x)){ + for(unsigned i = 0; i < phi->get_num_incoming(); ++i) + add_copy(phi->get_incoming_value(i), builder); + } + else { + 
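// add_copy pins a value into shared memory by inserting a copy_to_shared
// immediately after its defining instruction, redirecting every use of x
// to the copy, and then restoring x itself as the copy's operand. Phi
// nodes are handled above by recursing into each incoming value instead,
// and values that are already shared (checked just below) need no copy.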
if(get_is_shared(x)) + return; + if(auto *i = dynamic_cast(x)){ + ir::basic_block* block = i->get_parent(); + auto it = std::find(block->begin(), block->end(), i); + builder.set_insert_point(++it); + } + ir::instruction *rx = (ir::instruction*)builder.create_copy_to_shared(x); + x->replace_all_uses_with(rx); + rx->set_operand(0, x); + } +} + +void shmem_info::run(ir::module &mod) { + // Add shared copies + for(ir::function *fn: mod.get_function_list()){ + ir::builder builder(mod.get_context()); + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *i: block->get_inst_list()){ + if(dynamic_cast(i)) + if(i->get_operand(1)->get_type()->get_tile_shapes()[1]->get_value() != 1){ + add_copy(i->get_operand(0), builder); + add_copy(i->get_operand(1), builder); + } + } + } + + // Find which buffers are shared + for(ir::function *fn: mod.get_function_list()) + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *i: block->get_inst_list()) + if(get_is_shared(i)) + shared_.insert(i); + + // double-buffering + for(ir::function *fn: mod.get_function_list()) + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *i: block->get_inst_list()) { + if(!i->get_type()->is_tile_ty()) + continue; + // handle phi + if(auto *phi = dynamic_cast(i)) + if(is_shared(phi)){ + // determine if the value is in shared memory + bool is_double = false; + for(unsigned n = 0; n < phi->get_num_incoming(); n++){ + ir::basic_block *inc_block = phi->get_incoming_block(n); + ir::instruction *terminator = inc_block->get_inst_list().back(); + is_double = is_double || is_loop_latch(phi, terminator); + } + // add to double-buffered + if(is_double) + double_.insert(phi); + // set references of input + for(unsigned n = 0; n < phi->get_num_incoming(); n++){ + ir::value *inc_val = phi->get_incoming_value(n); + refs_[inc_val] = phi; + } + } + } +} + +// query double-buffered status +bool shmem_info::is_double(ir::value *x) +{ return double_.find(x) != double_.end(); } + +// query shared status +bool shmem_info::is_shared(ir::value *x) +{ return shared_.find(x) != shared_.end(); } + +// get reference if any +ir::value *shmem_info::get_reference(ir::value *x) +{ return refs_[x]; } + + + +} +} diff --git a/lib/codegen/liveness.cpp b/lib/codegen/shmem_liveness.cpp similarity index 67% rename from lib/codegen/liveness.cpp rename to lib/codegen/shmem_liveness.cpp index ca33bd487..4d8e9c66b 100644 --- a/lib/codegen/liveness.cpp +++ b/lib/codegen/shmem_liveness.cpp @@ -1,5 +1,5 @@ -#include "triton/codegen/liveness.h" -#include "triton/codegen/buffer_info.h" +#include "triton/codegen/shmem_liveness.h" +#include "triton/codegen/shmem_info.h" #include "triton/ir/basic_block.h" #include "triton/ir/function.h" #include "triton/ir/module.h" @@ -11,19 +11,7 @@ namespace codegen{ // Entry point -inline bool is_shared(ir::value* v) { - if(auto x = dynamic_cast(v)) - return true; - if(auto x = dynamic_cast(v)){ - bool res = true; - for(unsigned inc = 0; inc < x->get_num_incoming(); inc++) - res = res && is_shared(x->get_incoming_value(inc)); - return res; - } - return false; -} - -void liveness::run(ir::module &mod) { +void shmem_liveness::run(ir::module &mod) { for(ir::function *fn: mod.get_function_list()){ // Assigns index to each instruction slot_index index = 0; diff --git a/lib/codegen/target.cpp b/lib/codegen/target.cpp index 27a982a6c..2554bf5c3 100644 --- a/lib/codegen/target.cpp +++ b/lib/codegen/target.cpp @@ -4,6 +4,7 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Value.h" #include "llvm/IR/IRBuilder.h" 
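// Refactor in this hunk: get_global_offset(stride, ax) is decomposed into
// get_block_id(ax) * stride, so the new get_range_id instruction can reuse
// the raw block id. Each backend supplies the id differently:
// @llvm.amdgcn.workgroup.id.{x,y,z} on AMD,
// @llvm.nvvm.read.ptx.sreg.ctaid.{x,y,z} on NVIDIA, and the last three
// kernel arguments on the CPU target.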
+#include using namespace llvm; @@ -26,6 +27,12 @@ Instruction* amd_cl_target::add_barrier(Module *module, IRBuilder<>& builder) { } Value* amd_cl_target::get_global_offset(Module *module, IRBuilder<>& builder, unsigned stride, unsigned ax) { + Value* group_id = get_block_id(module, builder, ax); + Value* result = builder.CreateMul(builder.getInt32(stride), group_id); + return result; +} + +Value* amd_cl_target::get_block_id(Module *module, IRBuilder<>& builder, unsigned ax) { static std::array ids = { Intrinsic::amdgcn_workgroup_id_x, Intrinsic::amdgcn_workgroup_id_y, @@ -33,8 +40,7 @@ Value* amd_cl_target::get_global_offset(Module *module, IRBuilder<>& builder, un }; Value* get_group_id = Intrinsic::getDeclaration(module, ids[ax]); Value* group_id = builder.CreateCall(get_group_id, {}); - Value* result = builder.CreateMul(builder.getInt32(stride), group_id); - return result; + return group_id; } Value* amd_cl_target::get_local_id(Module *module, IRBuilder<>& builder, unsigned ax) { @@ -65,6 +71,12 @@ Instruction* nvidia_cu_target::add_barrier(Module *module, IRBuilder<>& builder) } Value* nvidia_cu_target::get_global_offset(Module *module, IRBuilder<>& builder, unsigned stride, unsigned ax) { + Value* group_id = get_block_id(module, builder, ax); + Value* result = builder.CreateMul(builder.getInt32(stride), group_id); + return result; +} + +Value* nvidia_cu_target::get_block_id(Module *module, IRBuilder<>& builder, unsigned ax) { static std::array ids = { Intrinsic::nvvm_read_ptx_sreg_ctaid_x, Intrinsic::nvvm_read_ptx_sreg_ctaid_y, @@ -72,8 +84,7 @@ Value* nvidia_cu_target::get_global_offset(Module *module, IRBuilder<>& builder, }; Value* get_group_id = Intrinsic::getDeclaration(module, ids[ax]); Value* group_id = builder.CreateCall(get_group_id, {}); - Value* result = builder.CreateMul(builder.getInt32(stride), group_id); - return result; + return group_id; } Value* nvidia_cu_target::get_local_id(Module *module, IRBuilder<>& builder, unsigned ax) { @@ -97,7 +108,7 @@ Instruction* cpu_target::add_barrier(Module *module, IRBuilder<>& builder) { return (Instruction*)builder.CreateAdd(builder.getInt32(0), builder.getInt32(0)); } -Value* cpu_target::get_global_offset(Module *module, IRBuilder<>& builder, unsigned stride, unsigned ax) { +Value* cpu_target::get_block_id(Module *module, llvm::IRBuilder<> &builder, unsigned ax) { const Function *fn = builder.GetInsertBlock()->getParent(); size_t num_params = fn->getFunctionType()->getNumParams(); static std::array ids = { @@ -105,7 +116,11 @@ Value* cpu_target::get_global_offset(Module *module, IRBuilder<>& builder, unsig fn->arg_begin() + num_params - 2, fn->arg_begin() + num_params - 1 }; - Value* result = builder.CreateMul(builder.getInt32(stride), (Argument*)ids[ax]); + return (Argument*)ids[ax]; +} + +Value* cpu_target::get_global_offset(Module *module, IRBuilder<>& builder, unsigned stride, unsigned ax) { + Value* result = builder.CreateMul(builder.getInt32(stride), get_block_id(module, builder, ax)); return result; } @@ -113,6 +128,5 @@ Value* cpu_target::get_local_id(Module *module, IRBuilder<>& builder, unsigned a return builder.getInt32(0); } - } } diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 4353b1332..1a1562c8f 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -1,5 +1,4 @@ #include "triton/codegen/tune.h" -#include "triton/codegen/shared_copy.h" #include "triton/ir/instructions.h" #include "triton/ir/type.h" #include "triton/ir/module.h" @@ -40,6 +39,8 @@ void tune::init_c_graph(ir::instruction *v) { 
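// init_c_graph records which tile axes must share layout metaparameters:
// reshape and broadcast tie matching axes of result and operand, trans
// ties each result axis to a rotated operand axis, dot ties its result
// axes to those of the accumulator D, and downcast (new here) opts out
// since it produces a scalar. connected_components() in tune::run() then
// assigns one {nts, mts} parameter pair per component of this graph.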
ir::type::tile_shapes_t shapes; if(auto *store = dynamic_cast(v)) shapes = store->get_pointer_operand()->get_type()->get_tile_shapes(); + else if(auto *downcast = dynamic_cast(v)) + return; else shapes = v->get_type()->get_tile_shapes(); // Reshape @@ -56,6 +57,14 @@ void tune::init_c_graph(ir::instruction *v) { // Splat else if(dynamic_cast(v)){ + } + // Trans + else if(dynamic_cast(v)){ + ir::value *op = v->get_operand(0); + size_t n_shapes = shapes.size(); + for(unsigned i = 0; i < n_shapes; i++){ + add_constraint({v, (i + 1) % n_shapes}, {op, i}); + } } // Broadcast else if(dynamic_cast(v)){ @@ -68,7 +77,7 @@ void tune::init_c_graph(ir::instruction *v) { } } // Matrix multiplication - else if(dynamic_cast(v)){ + else if(dynamic_cast(v)){ ir::value *D = v->get_operand(2); add_constraint({v, 0}, {D, 0}); add_constraint({v, 1}, {D, 1}); @@ -119,6 +128,13 @@ std::vector tune::get_params(ir::module &mod) { if(seen.insert(x.second).second && !x.second->has_value()){ result.push_back(x.second); } + + for(auto x: mod.globals()){ + if(auto mp = dynamic_cast(x.second)) + if(seen.insert(mp).second && !mp->has_value()) + result.push_back(mp); + } + return result; } @@ -145,23 +161,22 @@ void tune::run(ir::module &mod) { // Layout parameters while(!nodes_.empty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 2, 4); + ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 1, 1); + nts->set_value(1); ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 4, 32); connected_components(*nodes_.begin(), {nts, mts}, nodes_, dependencies_); } } // Simplify metaparameters - std::set fixed_io_nts; for(ir::function *fn: mod.get_function_list()) for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i : block->get_inst_list()) - if(dynamic_cast(i) || dynamic_cast(i)) - if(i->get_type()->is_tile_ty()) - for(unsigned d = 1; d < i->get_type()->get_tile_shapes().size(); d++) - fixed_io_nts.insert(params_.at(i).at("nts.d" + std::to_string(d))); - for(ir::metaparameter* mp: fixed_io_nts) - mp->set_value(1); + if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ + ir::type *ty = mod.get_builder().get_int32_ty(); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 2, 2)); + *params_.at(i).at("nts.d0") = *tmp; + } } void tune::init(ir::module &mod) { diff --git a/lib/driver/buffer.cpp b/lib/driver/buffer.cpp index b5030d710..a64e0aeca 100755 --- a/lib/driver/buffer.cpp +++ b/lib/driver/buffer.cpp @@ -64,9 +64,6 @@ buffer* buffer::create(driver::context* ctx, size_t size) { host_buffer::host_buffer(driver::context *context, size_t size) : buffer(context, host_buffer_t(), true){ hst_->data = new char[size]; - std::cout << size << std::endl; - std::cout << "allocating " << (float*)hst_->data << std::endl; - std::cout << *((float*)(hst_->data) + 512*500) << std::endl; } // diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 8346961fe..641b900b4 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -106,7 +106,11 @@ void module::compile_llvm_module(llvm::Module* module, const std::string& triple const std::string& features, file_type_t ft) { init_llvm(); - + // debug +// llvm::legacy::PassManager pm; +// pm.add(llvm::createPrintModulePass(llvm::outs())); +// pm.add(llvm::createVerifierPass()); +// pm.run(*module); // create machine module->setTargetTriple(triple); std::string error; @@ -249,6 +253,7 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { cu_module::cu_module(driver::context * 
context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ +// std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; @@ -264,11 +269,11 @@ cu_module::cu_module(driver::context * context, std::string const & source) : mo } } -cu_buffer cu_module::symbol(const char *name) const{ +cu_buffer* cu_module::symbol(const char *name) const{ CUdeviceptr handle; size_t size; dispatch::cuModuleGetGlobal_v2(&handle, &size, *cu_, name); - return cu_buffer(ctx_, handle, false); + return new cu_buffer(ctx_, handle, false); } diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index db0ae9e94..c913c37e8 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -285,6 +285,10 @@ value *builder::create_broadcast(value *arg, const type::tile_shapes_t &shapes, return insert(broadcast_inst::create(arg, shapes, name)); } +value *builder::create_downcast(value *arg, const std::string &name) { + return insert(downcast_inst::create(arg, name)); +} + //===----------------------------------------------------------------------===// // built-in instructions //===----------------------------------------------------------------------===// @@ -293,8 +297,24 @@ value *builder::create_get_global_range(unsigned axis, type::tile_shapes_t::valu return insert(get_global_range_inst::create(ctx_, axis, size, name)); } -value *builder::create_matmul(value *A, value *B, value *C, const std::string &name) { - return insert(matmul_inst::create(A, B, C, name)); +value *builder::create_get_range_id(unsigned axis, const std::string &name) { + return insert(get_range_id_inst::create(ctx_, axis, name)); +} + +value *builder::create_atomic_cas(value *ptr, value *cmp, value *val, const std::string &name){ + return insert(atomic_cas_inst::create(ptr, cmp, val, name)); +} + +value *builder::create_dot(value *A, value *B, value *C, const std::string &name) { + return insert(dot_inst::create_nn(A, B, C, name)); +} + +value *builder::create_trans(value *A, const std::string &name) { + return insert(trans_inst::create(A, name)); +} + +value *builder::create_select(value *pred, value *if_value, value *else_value, const std::string &name){ + return insert(select_inst::create(pred, if_value, else_value, name)); } //===----------------------------------------------------------------------===// diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 2a44ec4fb..8a9205c4e 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -28,6 +28,8 @@ instruction::instruction(type *ty, unsigned num_ops, unsigned num_results, const void instruction::erase_from_parent() { parent_->erase(this); + for(ir::value* op: ops()) + op->erase_use(this); } bool instruction::has_tile_result_or_op() { @@ -482,27 +484,82 @@ instruction* broadcast_inst::create(value *arg, const type::tile_shapes_t &shape return new broadcast_inst(arg, shapes, name, next); } +// downcast + +instruction* downcast_inst::create(value *arg, const std::string &name, instruction *next) { + return new downcast_inst(arg->get_type()->get_scalar_ty(), arg, name, next); +} //===----------------------------------------------------------------------===// // matmul_inst classes //===----------------------------------------------------------------------===// 
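// matmul_inst is generalized below into dot_inst, which carries one
// transpose flag per input operand; the four factories encode
//   create_nn: A   * B        create_nt: A   * B^T
//   create_tn: A^T * B        create_tt: A^T * B^T
// all accumulating into the third operand, i.e. D = op(A) * op(B) + C.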
-matmul_inst::matmul_inst(value *A, value *B, value *C, +dot_inst::dot_inst(value *A, value *B, value *C, TransT AT, TransT BT, const std::string &name, instruction *next) - : builtin_inst(C->get_type(), 3, 0, name, next) { + : builtin_inst(C->get_type(), 3, 1, name, next), AT_(AT), BT_(BT) { set_operand(0, A); set_operand(1, B); set_operand(2, C); } -instruction *matmul_inst::create(value *A, value *B, value *C, +instruction *dot_inst::create_nn(value *A, value *B, value *C, const std::string &name, instruction *next) { - return new matmul_inst(A, B, C, name, next); + return new dot_inst(A, B, C, NoTrans, NoTrans, name, next); +} + +instruction *dot_inst::create_nt(value *A, value *B, value *C, + const std::string &name, instruction *next) { + return new dot_inst(A, B, C, NoTrans, Trans, name, next); +} + +instruction *dot_inst::create_tn(value *A, value *B, value *C, + const std::string &name, instruction *next) { + return new dot_inst(A, B, C, Trans, NoTrans, name, next); +} + +instruction *dot_inst::create_tt(value *A, value *B, value *C, + const std::string &name, instruction *next) { + return new dot_inst(A, B, C, Trans, Trans, name, next); } +//===----------------------------------------------------------------------===// +// trans instructions +//===----------------------------------------------------------------------===// + +ir::type* trans_inst::get_res_ty(ir::type* ty) { + auto shapes = ty->get_tile_shapes(); + std::rotate(shapes.begin(), shapes.begin() + 1, shapes.end()); + return tile_type::get(ty->get_scalar_ty(), shapes); +} + +trans_inst::trans_inst(value *arg, const std::string &name, instruction *next) + : builtin_inst(get_res_ty(arg->get_type()), 1, 1, name, next) { + set_operand(0, arg); +} + +instruction* trans_inst::create(value *arg, const std::string &name, instruction *next) { + return new trans_inst(arg, name, next); +} + +//===----------------------------------------------------------------------===// +// select instructions +//===----------------------------------------------------------------------===// + +select_inst::select_inst(value *pred, value *if_value, value *else_value, const std::string &name, instruction *next) + : builtin_inst(if_value->get_type(), 3, 1, name, next){ + set_operand(0, pred); + set_operand(1, if_value); + set_operand(2, else_value); +} + +instruction* select_inst::create(value *pred, value *if_value, value *else_value, const std::string &name, instruction *next) { + return new select_inst(pred, if_value, else_value, name, next); +} //===----------------------------------------------------------------------===// // builtin instructions //===----------------------------------------------------------------------===// + +// get_global_range get_global_range_inst::get_global_range_inst(type *ty, unsigned axis, const std::string &name, instruction *next) : builtin_inst(ty, 0, 1, name, next), axis_(axis) { @@ -516,6 +573,28 @@ instruction* get_global_range_inst::create(context &ctx, unsigned axis, type::ti return new get_global_range_inst(tile_ty, axis, name, next); } +// get_range_id +get_range_id_inst::get_range_id_inst(type *ty, unsigned axis, const std::string &name, instruction *next) + : builtin_inst(ty, 0, 1, name, next), axis_(axis){ + +} + +instruction* get_range_id_inst::create(context &ctx, unsigned axis, const std::string &name, instruction *next) { + return new get_range_id_inst(type::get_int32_ty(ctx), axis, name, next); +} + +// atomic cas + +atomic_cas_inst::atomic_cas_inst(value *ptr, value *cmp, value *val, const 
std::string &name, instruction *next) + : builtin_inst(ptr->get_type()->get_pointer_element_ty(), 3, 1, name, next) { + set_operand(0, ptr); + set_operand(1, cmp); + set_operand(2, val); +} + +instruction* atomic_cas_inst::create(value *ptr, value *cmp, value *val, const std::string &name, instruction *next) { + return new atomic_cas_inst(ptr, cmp, val, name, next); +} //===----------------------------------------------------------------------===// // intrinsic instructions //===----------------------------------------------------------------------===// @@ -530,7 +609,7 @@ vectorize_inst* vectorize_inst::create(value *arg, const std::string &name, inst barrier_inst::barrier_inst(context &ctx, const std::string &name, instruction *next) - : instruction(type::get_void_ty(ctx), 0, 1, name, next){ } + : instruction(type::get_void_ty(ctx), 0, 0, name, next){ } barrier_inst* barrier_inst::create(context &ctx, const std::string &name, instruction *next) { return new barrier_inst(ctx, name, next); diff --git a/lib/ir/module.cpp b/lib/ir/module.cpp index 14f1337e1..d8f07ecc4 100644 --- a/lib/ir/module.cpp +++ b/lib/ir/module.cpp @@ -128,6 +128,9 @@ ir::value *module::get_value(const std::string& name) { return get_value(name, builder_.get_insert_block()); } +const std::string& module::get_name() { + return name_; +} void module::seal_block(ir::basic_block *block){ for(auto &x: incomplete_phis_[block]){ diff --git a/lib/ir/type.cpp b/lib/ir/type.cpp index 862039220..215e8f746 100644 --- a/lib/ir/type.cpp +++ b/lib/ir/type.cpp @@ -172,7 +172,7 @@ unsigned tile_type::get_bitwidth() const { tile_type* tile_type::get(type *elt_ty, const tile_shapes_t &shapes) { assert(elt_ty && "Can't get a tile of type!"); assert(shapes.size() && "Can't create a tile with empty shapes!"); - assert(is_valid_elt_ty(elt_ty) && "Invalid type for pointer element!"); + assert(is_valid_elt_ty(elt_ty) && "Invalid type for tile element!"); // look-up context_impl *impl = elt_ty->get_context().p_impl.get(); tile_type *&entry = impl->tile_tys[std::make_pair(elt_ty, shapes)]; diff --git a/lib/jit.cpp b/lib/jit.cpp index 068a824f0..9a4181e2a 100644 --- a/lib/jit.cpp +++ b/lib/jit.cpp @@ -68,7 +68,7 @@ void loop_nest(std::vector> const & iterates, std::function jit::make_llvm_module(ir::module &module, passes_wrapper &passes) { - llvm::Module* result = new llvm::Module("matmul", llvm_context_); + llvm::Module* result = new llvm::Module(module.get_name(), llvm_context_); passes.selection.run(module, *result); // launch information auto &launch_info_map = launch_info_map_[result->getName()]; @@ -79,14 +79,14 @@ std::unique_ptr jit::make_llvm_module(ir::module &module, passes_w return std::unique_ptr(result); } -std::unique_ptr jit::make_triton_module(const std::string &src) { +std::unique_ptr jit::make_triton_module(const std::string &name, const std::string &src) { // create AST from Triton-C source YY_BUFFER_STATE buffer = yy_scan_string(src.c_str()); yyparse(); yy_delete_buffer(buffer); translation_unit *program = ast_root; // create Triton-IR from AST - ir::module* module = new ir::module("matrix", triton_context_); + ir::module* module = new ir::module(name, triton_context_); program->codegen(module); return std::unique_ptr(module); } @@ -97,18 +97,20 @@ jit::jit(driver::context *context): driver_context_(context), } -void jit::autotune(const std::string &src, benchmark_t benchmark) { +void jit::autotune(const std::string &name, const std::string &src, benchmark_t benchmark) { // find metaparameters - auto ptt_module = 
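// Autotuning strategy (summary of the loop below): for every point in the
// cartesian product of the metaparameter ranges, the source is re-parsed
// into a fresh ir::module (the passes mutate IR in place), the candidate
// values are applied, and the point is skipped early when the tuner
// constraints fail, the kernel would need more shared memory than
// device->max_shared_memory(), or more threads than
// device->max_threads_per_block(); surviving candidates are compiled and
// benchmarked, keeping the best throughput seen so far.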
make_triton_module(src); + auto ptt_module = make_triton_module(name, src); ir::module &tt_module = *ptt_module; // set parameters passes_wrapper passes(target_.get()); + passes.target_independent(tt_module); passes.tune.run(tt_module); auto mps = passes.tune.get_params(tt_module); // create parameter ranges std::vector> ranges; for(ir::metaparameter *mp: mps) ranges.push_back(mp->get_space()); +// std::cout << ranges.size() << std::endl; // iterate over parameters unsigned i; double best = 0; @@ -117,51 +119,56 @@ void jit::autotune(const std::string &src, benchmark_t benchmark) { i = 0; for(ir::metaparameter *mp: mps) mp->set_value(params[i++]); + passes.target_independent(tt_module); passes.tune.init(tt_module); if(!passes.tune.check_constraints(errors)) return; // Deep copy of the module and tuner - auto ptt_module = make_triton_module(src); + auto ptt_module = make_triton_module(name, src); ir::module &tt_module = *ptt_module; passes_wrapper passes(target_.get()); + passes.target_independent(tt_module); passes.tune.run(tt_module); i = 0; for(ir::metaparameter* mp: passes.tune.get_params(tt_module)){ mp->set_value(params[i++]); } passes.tune.init(tt_module); - passes.init(tt_module); + passes.target_dependent(tt_module); driver::device* device = driver_context_->device(); - if(passes.allocation.get_allocated_size() > device->max_shared_memory()) + if(passes.shmem_allocation.get_allocated_size() > device->max_shared_memory()) return; if(passes.tune.get_num_threads() > device->max_threads_per_block()) return; // Compile auto ll_module = make_llvm_module(tt_module, passes); std::unique_ptr module(driver::module::create(driver_context_, &*ll_module)); - std::unique_ptr kernel(driver::kernel::create(module.get(), "matmul")); - launch_information info = launch_info_map_.at("matmul"); + std::unique_ptr kernel(driver::kernel::create(module.get(), name.c_str())); + launch_information info = launch_info_map_.at(name.c_str()); for(unsigned p: params) std::cout << p << " " << std::flush; // add globals for(auto x: tt_module.globals()) global_ints_[x.first] = ((ir::metaparameter*)x.second)->get_value(); + modules_.push_back(module.get()); double perf; perf = benchmark(kernel.get(), info); best = std::max(perf, best); std::cout << perf << " [ " << best << " ] " << std::endl; + modules_.pop_back(); }); } void jit::add_module(ir::module &tt_module, const std::vector ¶ms) { // set parameters passes_wrapper passes(target_.get()); + passes.target_independent(tt_module); passes.tune.run(tt_module); unsigned i = 0; for(ir::metaparameter* mp: passes.tune.get_params(tt_module)) mp->set_value(params[i++]); passes.tune.init(tt_module); - passes.init(tt_module); + passes.target_dependent(tt_module); // check constraints std::map> errors; passes.tune.check_constraints(errors); @@ -184,8 +191,8 @@ void jit::add_module(ir::module &tt_module, const std::vector ¶ms) global_ints_[x.first] = ((ir::metaparameter*)x.second)->get_value(); } -void jit::add_module(const std::string &src, const std::vector ¶ms) { - auto ptt_module = make_triton_module(src); +void jit::add_module(const std::string &name, const std::string &src, const std::vector ¶ms) { + auto ptt_module = make_triton_module(name, src); add_module(*ptt_module, params); } @@ -201,4 +208,9 @@ unsigned jit::get_int(const std::string &name){ return global_ints_.at(name); } +driver::buffer *jit::get_buffer(const std::string &name){ + driver::cu_module *mod = (driver::cu_module*)modules_.front(); + return mod->symbol(name.c_str()); +} + } From 
b6af06910da8f1433ae8d28ac97b1587862a4cc2 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 25 Apr 2019 16:24:16 -0400 Subject: [PATCH 124/494] [examples] deleted placeholders for not implemented examples --- examples/cpp/blocksparse.cpp | 153 ----------------------------------- 1 file changed, 153 deletions(-) delete mode 100644 examples/cpp/blocksparse.cpp diff --git a/examples/cpp/blocksparse.cpp b/examples/cpp/blocksparse.cpp deleted file mode 100644 index 5a816aff1..000000000 --- a/examples/cpp/blocksparse.cpp +++ /dev/null @@ -1,153 +0,0 @@ -#include -#include -#include "common.hpp" -#include "triton/jit.h" -#include "triton/driver/backend.h" -#include "triton/driver/stream.h" - -const char* src = -R"( -const tunable int32 TM = {16, 32, 64, 128}; -const tunable int32 TN = {8}; -const tunable int32 TK = {8}; - -void blocksparse(restrict read_only fp32 *a, restrict read_only fp32 *b, fp32 *c, - int32 M, int32 N, int32 K, int32 bound){ - int32 rxa[TM] = get_global_range[TM](0); - int32 ryb[TN] = get_global_range[TN](1); - int32 rka[TK] = 0 ... TK; - int32 rkb[TK] = 0 ... TK; - fp32 C[TM, TN] = 0; - fp32* pa[TM, TK] = a + rka[newaxis, :]*M + rxa[:, newaxis]; - fp32* pb[TN, TK] = b + rkb[newaxis, :]*K + ryb[:, newaxis]; - fp32 a[TM, TK] = *pa; - fp32 b[TN, TK] = *pb; - for(int32 k = K; k > 0;){ - C = dot(a, trans(b), C); - pa = pa + TK*M; - pb = pb + TK*N; - k = k - TK; - int1 checka[TM, TK] = k > bound; - int1 checkb[TN, TK] = k > bound; - @checka a = *pa; - @checkb b = *pb; - if(k > bound) - continue; - int1 checka0[TM] = rxa < M; - int1 checka1[TK] = rka < k; - int1 checkb0[TN] = ryb < N; - int1 checkb1[TK] = rkb < k; - checka = checka0[:, newaxis] && checka1[newaxis, :]; - checkb = checkb0[:, newaxis] && checkb1[newaxis, :]; - a = checka ? *pa : 0; - b = checkb ? 
*pb : 0; - } - int32 rxc[TM] = get_global_range[TM](0); - int32 ryc[TN] = get_global_range[TN](1); - fp32* pc[TM, TN] = c + ryc[newaxis, :]*M + rxc[:, newaxis]; - int1 checkc0[TM] = rxc < M; - int1 checkc1[TN] = ryc < N; - int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - @checkc *pc = C; -} -)"; - -std::vector make_deltas(std::vector mask, int K, int N){ - std::vector>> pairs(N); - unsigned int current = 0; - for(int k = 0; k < K; k++) - for(int n = 0; n < N; n++){ - if(mask[k + n*K]) - pairs[n].push_back({current, k}); - } -} - -int main() { - // initialize default compute device - auto context = triton::driver::backend::contexts::get_default(); - triton::jit jit(context); - - - // matrix multiplication parameters - int32_t M = 512, N = 32, K = 2048; - std::vector hc(M*N); - std::vector rc(M*N); - std::vector ha(M*K); - std::vector hb(K*N); - srand(0); - for(size_t i = 0; i < ha.size(); i++) - ha[i] = (float)rand()/RAND_MAX; - for(size_t i = 0; i < hb.size(); i++) - hb[i] = (float)rand()/RAND_MAX; - for(size_t i = 0; i < hc.size(); i++) - hc[i] = 0; - triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*4); - triton::driver::buffer* da = triton::driver::buffer::create(context, ha.size()*4); - triton::driver::buffer* db = triton::driver::buffer::create(context, hb.size()*4); - triton::driver::stream* stream = triton::driver::stream::create(context); - stream->write(da, true, 0, ha); - stream->write(db, true, 0, hb); - stream->write(dc, true, 0, hc); - stream->synchronize(); - - - // benchmark a given matrix multiplication kernel - auto benchmark = [&](triton::driver::kernel* kernel, - triton::jit::launch_information info) { - // launch info - unsigned TM = info.global_range_size[0]; - unsigned TN = info.global_range_size[1]; - unsigned nthreads = info.num_threads; - std::array grid = {(M + TM - 1)/TM, (N + TN - 1)/TN, 1}; - // fast bounds-checking - unsigned TK = jit.get_int("TK"); - unsigned lasti = (grid[0]*TM - 1)*TM + TM - 1; - unsigned lastj = (grid[1]*TN - 1)*TN + TN - 1; - unsigned lastk = TK - 1; - bool AT = false; - bool BT = true; - unsigned last_safe_a = (AT==false)?(M*K - 1 - lasti)/M - lastk : M*K - 1 - lasti*K - lastk; - unsigned last_safe_b = (BT==true)?(N*K - 1 - lastj)/N - lastk : N*K - 1 - lastj*K - lastk; - int32_t bound = std::max(1, std::max(K - last_safe_a, K - last_safe_b)); - // set argument - kernel->setArg(0, da); - kernel->setArg(1, db); - kernel->setArg(2, dc); - kernel->setArg(3, M); - kernel->setArg(4, N); - kernel->setArg(5, K); - kernel->setArg(6, bound); - // dry run - stream->enqueue(kernel, grid, {nthreads, 1, 1}); - stream->synchronize(); - // benchmark - double ts = bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});}, - [&](){ stream->synchronize(); }, *context->device()); - ts = ts * 1e-9; - double tflops = 2.*M*N*K / ts * 1e-12; - return tflops; - }; - - - // just-in-time compile source-code - std::vector params = { - 16, 2, 64, - 32, 2, 64, - 16, 8, 2, 2, - 8, 8, - 4 - }; - jit.autotune("matmul",src, benchmark); - jit.add_module("matmul", src, params); - triton::driver::kernel* kernel = jit.get_function("matmul"); - triton::jit::launch_information info = jit.get_launch_info("matmul"); - std::cout << "Performance: " << benchmark(kernel, info) << " TFLOPS " << std::endl; - stream->read(dc, true, 0, hc); - simple_gemm(rc, ha, hb, M, N, K); - for(size_t i = 0; i < M*N; i++) - if(std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ - std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; - 
exit(EXIT_FAILURE); - } - std::cout << "Pass!" << std::endl; -} From 4b77b764bac45c85627bc2d97d015565dabd2597 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 26 Apr 2019 12:06:10 -0400 Subject: [PATCH 125/494] [triton-c] added support for while loops --- examples/cpp/dot.cpp | 10 +-- examples/cpp/shift.cpp | 2 +- include/triton/ast/ast.h | 13 ++++ include/triton/ast/parser.y | 6 +- include/triton/ast/scanner.l | 128 +++++++++++++++++------------------ include/triton/jit.h | 1 - lib/ast/lowering.cpp | 25 +++++++ lib/codegen/selection.cpp | 1 + lib/driver/module.cpp | 2 +- 9 files changed, 112 insertions(+), 76 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 7bda6c775..6af194bd6 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -10,7 +10,7 @@ R"( const tunable int32 TM = {16, 32, 64, 128}; const tunable int32 TN = {16, 32, 64, 128}; const tunable int32 TK = {8}; -const tunable int32 GZ = {1}; +const tunable int32 GZ = {2}; void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C, int32 M, int32 N, int32 K, @@ -57,7 +57,7 @@ void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C, int32 ridy = get_range_id(1); fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; int32 *plock = locks + ridx + ridy*grid0; - for(int32 L = __atomic_cas(plock, 0, 1); L == 1; L = __atomic_cas(plock, 0, 1)){} + while(__atomic_cas(plock, 0, 1) == 1); int32 *pcount = plock + grid0*grid1; int32 count = *pcount; int32 countp1 = select(count == GZ - 1, 0, count + 1); @@ -69,7 +69,7 @@ void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C, *pcount = countp1; } else { - @checkc *pc = c + (checkc ? *pc : 0); + *pc = c + (checkc ? *pc : 0); *pcount = countp1; } __atomic_cas(plock, 1, 0); @@ -82,7 +82,7 @@ int main() { triton::jit jit(context); // matrix multiplication parameters - int32_t M = 512, N = 512, K = 512; + int32_t M = 256, N = 256, K = 2048; std::vector hc(M*N); std::vector rc(M*N); std::vector ha(M*K); @@ -144,7 +144,7 @@ int main() { // just-in-time compile source-code std::vector params = { - 16, 2, 64, 16, 2, 64, 16, 8, 2, 2, 8, 8, 8, 1 + 16, 2, 64, 16, 2, 64, 16, 8, 2, 2, 8, 8, 8, 4 }; // jit.autotune("matmul",src, benchmark); jit.add_module("matmul", src, params); diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index f75046e2f..026cdfaea 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -195,7 +195,7 @@ int main() { 8, 8, 4 }; -// jit.autotune("shift", src, benchmark); + jit.autotune("shift", src, benchmark); jit.add_module("shift", src, params); triton::driver::kernel* kernel = jit.get_function("shift"); triton::jit::launch_information info = jit.get_launch_info("shift"); diff --git a/include/triton/ast/ast.h b/include/triton/ast/ast.h index 8eccd6f92..ab8a9362c 100644 --- a/include/triton/ast/ast.h +++ b/include/triton/ast/ast.h @@ -462,6 +462,19 @@ private: const node *statements_; }; +class while_statement: public statement{ +public: + while_statement(node *cond, node *statements) + : cond_(cond), statements_(statements) + { } + + ir::value* codegen(ir::module *) const; + +private: + const node *cond_; + const node *statements_; +}; + // Jump class jump_statement: public statement{ diff --git a/include/triton/ast/parser.y b/include/triton/ast/parser.y index 5302c7d14..9dab092de 100644 --- a/include/triton/ast/parser.y +++ b/include/triton/ast/parser.y @@ -53,9 +53,9 @@ STORAGE_SPEC_T get_storage_spec(node *op) { return ((token*)op)->storage_spec;} %token 
SUB_ASSIGN LEFT_ASSIGN RIGHT_ASSIGN AND_ASSIGN %token XOR_ASSIGN OR_ASSIGN TYPE_NAME %token VOID UINT1 UINT8 UINT16 UINT32 UINT64 INT1 INT8 INT16 INT32 INT64 FP32 FP64 -%token IF ELSE FOR CONTINUE +%token IF ELSE FOR CONTINUE WHILE %token NEWAXIS ELLIPSIS AT -%token GET_GLOBAL_RANGE GET_RANGE_ID DOT TRANS MAX MIN SELECT ATOMIC_CAS ALLOC_CONST +%token GET_GLOBAL_RANGE GET_RANGE_ID DOT TRANS MAX MIN SELECT ATOMIC_CAS ATOMIC_EXCHG ALLOC_CONST %start translation_unit %% @@ -314,7 +314,7 @@ iteration_statement : FOR '(' expression_statement expression_statement expression ')' statement { $$ = new iteration_statement($3, $4, $5, $7); } | FOR '(' declaration expression_statement ')' statement { $$ = new iteration_statement($3, $4, nullptr, $6); } | FOR '(' declaration expression_statement expression ')' statement { $$ = new iteration_statement($3, $4, $5, $7); } - ; + | WHILE '(' expression ')' statement { $$ = new while_statement($3, $5); }; jump_statement : CONTINUE ';' { $$ = new continue_statement(); } diff --git a/include/triton/ast/scanner.l b/include/triton/ast/scanner.l index e4e018a14..9a47f929f 100644 --- a/include/triton/ast/scanner.l +++ b/include/triton/ast/scanner.l @@ -22,92 +22,90 @@ using triton::ast::return_void; "read_only" { return return_impl(READONLY, yytext); } "write_only" { return return_impl(WRITEONLY, yytext); } "@" { return return_impl(AT, yytext); } -"newaxis" { return return_impl(NEWAXIS, yytext); } -"if" { return return_impl(IF, yytext); } -"else" { return return_impl(ELSE, yytext); } -"for" { return return_impl(FOR, yytext); } -"void" { return return_impl(VOID, yytext); } +"newaxis" { return return_impl(NEWAXIS, yytext); } +"if" { return return_impl(IF, yytext); } +"else" { return return_impl(ELSE, yytext); } +"for" { return return_impl(FOR, yytext); } +"while" { return return_impl(WHILE, yytext); } +"void" { return return_impl(VOID, yytext); } "uint1" { return return_impl(UINT1, yytext); } -"uint8" { return return_impl(UINT8, yytext); } -"uint16" { return return_impl(UINT16, yytext); } -"uint32" { return return_impl(UINT32, yytext); } -"uint64" { return return_impl(UINT64, yytext); } +"uint8" { return return_impl(UINT8, yytext); } +"uint16" { return return_impl(UINT16, yytext); } +"uint32" { return return_impl(UINT32, yytext); } +"uint64" { return return_impl(UINT64, yytext); } "int1" { return return_impl(INT1, yytext); } -"int8" { return return_impl(INT8, yytext); } -"int16" { return return_impl(INT16, yytext); } -"int32" { return return_impl(INT32, yytext); } -"int64" { return return_impl(INT64, yytext); } -"fp32" { return return_impl(FP32, yytext); } -"fp64" { return return_impl(FP64, yytext); } +"int8" { return return_impl(INT8, yytext); } +"int16" { return return_impl(INT16, yytext); } +"int32" { return return_impl(INT32, yytext); } +"int64" { return return_impl(INT64, yytext); } +"fp32" { return return_impl(FP32, yytext); } +"fp64" { return return_impl(FP64, yytext); } "..." 
{ return return_impl(ELLIPSIS, yytext); } "get_global_range" { return return_impl(GET_GLOBAL_RANGE, yytext); } "get_range_id" { return return_impl(GET_RANGE_ID, yytext); } "__atomic_cas" { return return_impl(ATOMIC_CAS, yytext); } +"__atomic_exchg" { return return_impl(ATOMIC_EXCHG, yytext); } "dot" { return return_impl(DOT, yytext); } "max" { return return_impl(MAX, yytext); } "min" { return return_impl(MIN, yytext); } -"select" { return return_impl(SELECT, yytext); } +"select" { return return_impl(SELECT, yytext); } "trans" { return return_impl(TRANS, yytext); } "continue" { return return_impl(CONTINUE, yytext); } "alloc_const" { return return_impl(ALLOC_CONST, yytext); } {L}({L}|{D})* { return return_impl(IDENTIFIER, yytext); } - 0[xX]{H}+{IS}? { return return_impl(CONSTANT, yytext); } 0{D}+{IS}? { return return_impl(CONSTANT, yytext); } -{D}+{IS}? { return return_impl(CONSTANT, yytext); } +{D}+{IS}? { return return_impl(CONSTANT, yytext); } L?'(\\.|[^\\'])+' { return return_impl(CONSTANT, yytext); } - -{D}+{E}{FS}? { return return_impl(CONSTANT, yytext); } - +{D}+{E}{FS}? { return return_impl(CONSTANT, yytext); } L?\"(\\.|[^\\"])*\" { return return_impl(STRING_LITERAL, yytext); } - ">>=" { return return_impl(RIGHT_ASSIGN, yytext); } -"<<=" { return return_impl(LEFT_ASSIGN, yytext); } -"+=" { return return_impl(ADD_ASSIGN, yytext); } -"-=" { return return_impl(SUB_ASSIGN, yytext); } -"*=" { return return_impl(MUL_ASSIGN, yytext); } -"/=" { return return_impl(DIV_ASSIGN, yytext); } -"%=" { return return_impl(MOD_ASSIGN, yytext); } -"&=" { return return_impl(AND_ASSIGN, yytext); } -"^=" { return return_impl(XOR_ASSIGN, yytext); } -"|=" { return return_impl(OR_ASSIGN, yytext); } -">>" { return return_impl(RIGHT_OP, yytext); } -"<<" { return return_impl(LEFT_OP, yytext); } -"++" { return return_impl(INC_OP, yytext); } -"--" { return return_impl(DEC_OP, yytext); } -"->" { return return_impl(PTR_OP, yytext); } -"&&" { return return_impl(AND_OP, yytext); } -"||" { return return_impl(OR_OP, yytext); } -"<=" { return return_impl(LE_OP, yytext); } -">=" { return return_impl(GE_OP, yytext); } -"==" { return return_impl(EQ_OP, yytext); } -"!=" { return return_impl(NE_OP, yytext); } -";" { return return_impl(';', yytext); } +"<<=" { return return_impl(LEFT_ASSIGN, yytext); } +"+=" { return return_impl(ADD_ASSIGN, yytext); } +"-=" { return return_impl(SUB_ASSIGN, yytext); } +"*=" { return return_impl(MUL_ASSIGN, yytext); } +"/=" { return return_impl(DIV_ASSIGN, yytext); } +"%=" { return return_impl(MOD_ASSIGN, yytext); } +"&=" { return return_impl(AND_ASSIGN, yytext); } +"^=" { return return_impl(XOR_ASSIGN, yytext); } +"|=" { return return_impl(OR_ASSIGN, yytext); } +">>" { return return_impl(RIGHT_OP, yytext); } +"<<" { return return_impl(LEFT_OP, yytext); } +"++" { return return_impl(INC_OP, yytext); } +"--" { return return_impl(DEC_OP, yytext); } +"->" { return return_impl(PTR_OP, yytext); } +"&&" { return return_impl(AND_OP, yytext); } +"||" { return return_impl(OR_OP, yytext); } +"<=" { return return_impl(LE_OP, yytext); } +">=" { return return_impl(GE_OP, yytext); } +"==" { return return_impl(EQ_OP, yytext); } +"!=" { return return_impl(NE_OP, yytext); } +";" { return return_impl(';', yytext); } ("{"|"<%") { return return_impl('{', yytext); } ("}"|"%>") { return return_impl('}', yytext); } -"," { return return_impl(',', yytext); } -":" { return return_impl(':', yytext); } -"=" { return return_impl('=', yytext); } -"(" { return return_impl('(', yytext); } -")" { return return_impl(')', yytext); 
} -("["|"<:") { return return_impl('[', yytext); } -("]"|":>") { return return_impl(']', yytext); } -"." { return return_impl('.', yytext); } -"&" { return return_impl('&', yytext); } -"!" { return return_impl('!', yytext); } -"~" { return return_impl('~', yytext); } -"-" { return return_impl('-', yytext); } -"+" { return return_impl('+', yytext); } -"*" { return return_impl('*', yytext); } -"/" { return return_impl('/', yytext); } -"%" { return return_impl('%', yytext); } -"<" { return return_impl('<', yytext); } -">" { return return_impl('>', yytext); } -"^" { return return_impl('^', yytext); } -"|" { return return_impl('|', yytext); } -"?" { return return_impl('?', yytext); } +"," { return return_impl(',', yytext); } +":" { return return_impl(':', yytext); } +"=" { return return_impl('=', yytext); } +"(" { return return_impl('(', yytext); } +")" { return return_impl(')', yytext); } +("["|"<:") { return return_impl('[', yytext); } +("]"|":>") { return return_impl(']', yytext); } +"." { return return_impl('.', yytext); } +"&" { return return_impl('&', yytext); } +"!" { return return_impl('!', yytext); } +"~" { return return_impl('~', yytext); } +"-" { return return_impl('-', yytext); } +"+" { return return_impl('+', yytext); } +"*" { return return_impl('*', yytext); } +"/" { return return_impl('/', yytext); } +"%" { return return_impl('%', yytext); } +"<" { return return_impl('<', yytext); } +">" { return return_impl('>', yytext); } +"^" { return return_impl('^', yytext); } +"|" { return return_impl('|', yytext); } +"?" { return return_impl('?', yytext); } [ \t\v\n\f] { return_void(yytext);} -. { /* ignore bad characters */ } +. { /* ignore bad characters */ } %% diff --git a/include/triton/jit.h b/include/triton/jit.h index b9c502aad..5bbb71d6d 100644 --- a/include/triton/jit.h +++ b/include/triton/jit.h @@ -60,7 +60,6 @@ public: void target_independent(ir::module &module) { optimize_dot.run(module); optimize_trans.run(module); -// ir::print(module, std::cout); } void target_dependent(ir::module &module) { diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index 45906cef8..b222c3712 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -371,6 +371,31 @@ ir::value* iteration_statement::codegen(ir::module *mod) const{ return nullptr; } +/* While statement */ +ir::value* while_statement::codegen(ir::module* mod) const{ + ir::builder &builder = mod->get_builder(); + ir::context &ctx = mod->get_context(); + ir::basic_block *current_bb = builder.get_insert_block(); + ir::function *fn = current_bb->get_parent(); + ir::basic_block *loop_bb = ir::basic_block::create(ctx, "loop", fn); + ir::basic_block *next_bb = ir::basic_block::create(ctx, "postloop", fn); + mod->set_continue_fn([&](){ + ir::value *cond = cond_->codegen(mod); + return builder.create_cond_br(cond, loop_bb, next_bb); + }); + ir::value *cond = cond_->codegen(mod); + builder.create_cond_br(cond, loop_bb, next_bb); + builder.set_insert_point(loop_bb); + if(!is_terminator(statements_->codegen(mod))) + mod->get_continue_fn()(); + ir::basic_block *stop_bb = builder.get_insert_block(); + mod->seal_block(stop_bb); + mod->seal_block(loop_bb); + mod->seal_block(builder.get_insert_block()); + mod->seal_block(next_bb); + builder.set_insert_point(next_bb); +} + /* Selection statement */ ir::value* selection_statement::codegen(ir::module* mod) const{ ir::builder &builder = mod->get_builder(); diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index c04b4cdfb..f71c1d412 100644 --- a/lib/codegen/selection.cpp +++ 
b/lib/codegen/selection.cpp @@ -337,6 +337,7 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::functiongetParent()); Value *ptr = builder.CreateGEP(sh_mem_ptr_, builder.getInt32(alloc_->get_offset(ii))); ptr = builder.CreateBitCast(ptr, PointerType::get(builder.getInt32Ty(), ptr->getType()->getPointerAddressSpace())); + tgt_->add_barrier(module, builder); builder.CreateCondBr(pred, tid_0_bb, tid_0_done_bb); builder.SetInsertPoint(tid_0_bb); Value *cas_ptr = value(ii->get_operand(0)); diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 641b900b4..1f158b2a0 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -253,7 +253,7 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ -// std::cout << source << std::endl; + std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; From af58b8bd81f6c01581a2bd26459941e5b7c998b6 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 27 Apr 2019 14:00:15 -0400 Subject: [PATCH 126/494] [triton-c] predicate in assignment statement now propagates to rhs computations --- examples/cpp/dot.cpp | 2 +- include/triton/ast/ast.h | 4 ++-- include/triton/ir/instructions.h | 6 +++--- include/triton/jit.h | 1 + lib/ast/lowering.cpp | 34 +++++++++++++++++++++----------- lib/codegen/selection.cpp | 4 ++-- lib/driver/module.cpp | 2 +- lib/ir/builder.cpp | 2 +- lib/ir/instructions.cpp | 6 +++--- 9 files changed, 36 insertions(+), 25 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 6af194bd6..3461b8a7c 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -69,7 +69,7 @@ void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C, *pcount = countp1; } else { - *pc = c + (checkc ? 
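The one-line selection.cpp change above emits a barrier before branching into the thread-0-only region that performs the CAS, so every thread's shared-memory writes are complete before a single thread touches the lock. A sketch of the emitted pattern, assuming an NVPTX target where the barrier lowers to llvm.nvvm.barrier0 (the helper name is illustrative):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Synchronize the workgroup, then let only thread 0 fall through.
void barrier_then_tid0(IRBuilder<> &b, Module *m, Value *is_tid0,
                       BasicBlock *tid0_bb, BasicBlock *done_bb) {
  Function *bar = Intrinsic::getDeclaration(m, Intrinsic::nvvm_barrier0);
  b.CreateCall(bar);                          // the barrier this patch adds
  b.CreateCondBr(is_tid0, tid0_bb, done_bb);  // thread-0-only critical path
}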
*pc : 0); + @checkc *pc = c + *pc; *pcount = countp1; } __atomic_cas(plock, 1, 0); diff --git a/include/triton/ast/ast.h b/include/triton/ast/ast.h index ab8a9362c..3d1da3064 100644 --- a/include/triton/ast/ast.h +++ b/include/triton/ast/ast.h @@ -410,13 +410,13 @@ class statement: public block_item{ class expression_statement: public statement{ public: expression_statement(node *expr, node *mask = nullptr) - : expr_((expression*)expr), mask_((expression*)mask){ } + : expr_((expression*)expr), pred_((expression*)mask){ } ir::value* codegen(ir::module * mod) const; private: expression *expr_; - expression *mask_; + expression *pred_; }; diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index 961bb43ce..8e1da57ae 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -335,15 +335,15 @@ public: }; // merge -class merge_inst: public instruction { +class psi_inst: public instruction { private: std::string repr_impl() const { return "merge"; } - merge_inst(ir::value *mask_true, ir::value *value_true, + psi_inst(ir::value *mask_true, ir::value *value_true, ir::value *mask_false, ir::value *value_false, const std::string &name, instruction *next); public: - static merge_inst* create(ir::value *mask_true, ir::value *value_true, + static psi_inst* create(ir::value *mask_true, ir::value *value_true, ir::value *mask_false, ir::value *value_false, const std::string &name = "", instruction *next = nullptr); ir::value *get_mask_true() { return get_operand(0); } diff --git a/include/triton/jit.h b/include/triton/jit.h index 5bbb71d6d..ffef8ed1f 100644 --- a/include/triton/jit.h +++ b/include/triton/jit.h @@ -70,6 +70,7 @@ public: shmem_barriers.run(module); } vectorize.run(module); + ir::print(module, std::cout); } codegen::tune tune; diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index b222c3712..afb80b68b 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -320,23 +320,33 @@ ir::value* compound_statement::codegen(ir::module* mod) const{ /* expression statement */ ir::value* expression_statement::codegen(ir::module *mod) const{ ir::builder &builder = mod->get_builder(); - if(mask_) { - ir::value *pred = mask_->codegen(mod); - ir::mask_inst *mask = (ir::mask_inst*)builder.create_mask(pred); - ir::value *true_value = expr_->codegen(mod); + ir::basic_block *block = builder.get_insert_block(); + if(pred_) { + // check that it is an assignment assignment_expression *assignment = dynamic_cast(expr_); assert(assignment); - - ir::type *ty = true_value->get_type(); - if(auto *itn = dynamic_cast(true_value)) - itn->set_mask_pred(mask->get_result(0)); + // generate mask + ir::value *pred = pred_->codegen(mod); + ir::mask_inst *mask = (ir::mask_inst*)builder.create_mask(pred); + // generate expression + unsigned szbegin = block->get_inst_list().size(); + ir::value *expr = expr_->codegen(mod); + ir::basic_block::iterator begin = block->begin(); + std::advance(begin, szbegin); + // set mask + ir::type *ty = expr->get_type(); + for(auto it = begin; it != builder.get_insert_point(); it++) + (*it)->set_mask_pred(mask->get_result(0)); +// if(auto *itn = dynamic_cast(expr)) +// itn->set_mask_pred(mask->get_result(0)); if(ty->is_void_ty()) - return true_value; - ir::merge_inst *merge = (ir::merge_inst*)builder.create_merge(mask->get_result(0), true_value, + return expr; + // merge with psi + ir::psi_inst *psi = (ir::psi_inst*)builder.create_merge(mask->get_result(0), expr, mask->get_result(1), ir::undef_value::get(ty)); std::string name = 
((named_expression*)assignment->lvalue())->id()->name(); - mod->set_value(name, merge); - return merge; + mod->set_value(name, psi); + return psi; } return expr_->codegen(mod); } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index f71c1d412..7750502bd 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -690,7 +690,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & }); } // merge - else if(auto *merge = dynamic_cast(ins)) { + else if(auto *merge = dynamic_cast(ins)) { distributed_tile* mask_tile_true = (distributed_tile*)tmap_.at(merge->get_mask_true()); distributed_tile *value_tile_true = (distributed_tile*)tmap_.at(merge->get_value_true()); distributed_tile* mask_tile_false = (distributed_tile*)tmap_.at(merge->get_mask_false()); @@ -951,7 +951,7 @@ void selection::run(ir::module &src, Module &dst) { dst_builder.SetInsertPoint(parent); for(ir::instruction *i: block->get_inst_list()){ BasicBlock *current = dst_builder.GetInsertBlock(); - bool phi_inserted = (dynamic_cast(i) || dynamic_cast(i)) && !current->empty(); + bool phi_inserted = (dynamic_cast(i) || dynamic_cast(i)) && !current->empty(); if(phi_inserted && current->getFirstNonPHI()) dst_builder.SetInsertPoint(&*current->getFirstNonPHI()); lower_instruction(i, dst_builder); diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 1f158b2a0..641b900b4 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -253,7 +253,7 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ - std::cout << source << std::endl; +// std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index c913c37e8..d82ee2c3b 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -92,7 +92,7 @@ value *builder::create_mask(value *pred, const std::string &name){ } value *builder::create_merge(value *mask_true, value *value_true, value *mask_false, value *value_false, const std::string &name) { - return insert(merge_inst::create(mask_true, value_true, mask_false, value_false, name)); + return insert(psi_inst::create(mask_true, value_true, mask_false, value_false, name)); } diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 8a9205c4e..79c951f6d 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -334,7 +334,7 @@ mask_inst* mask_inst::create(value *pred, const std::string &name, instruction * } // merge_inst -merge_inst::merge_inst(value *mask_true, value *value_true, +psi_inst::psi_inst(value *mask_true, value *value_true, value *mask_false, value *value_false, const std::string &name, instruction *next) : instruction(value_true->get_type(), 4, 1, name, next) { @@ -344,10 +344,10 @@ merge_inst::merge_inst(value *mask_true, value *value_true, set_operand(3, value_false); } -merge_inst* merge_inst::create(value *mask_true, value *value_true, +psi_inst* psi_inst::create(value *mask_true, value *value_true, value *mask_false, value *value_false, const std::string &name, instruction *next) { - return new merge_inst(mask_true, value_true, mask_false, value_false, name, next); + return 
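Two things happen in the lowering change above: every instruction generated while evaluating the right-hand side is now predicated on the mask (the iterator walk from szbegin to the insert point), and the result is merged through the renamed psi node, psi(mask, value, undef). Observably, a masked assignment @m x = e only updates x where m holds. A scalar model of @checkc *pc = c + *pc under that reading; the function names are invented for illustration.

// Scalar model of a masked update: lanes where the predicate is false
// keep their old value; lanes where it is true see the new expression.
float psi(bool m, float value_true, float value_false) {
  return m ? value_true : value_false;
}

float masked_store_add(bool checkc, float c, float pc_old) {
  float rhs = c + pc_old;            // computed under the mask
  return psi(checkc, rhs, pc_old);   // unmasked lanes keep *pc
}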
new psi_inst(mask_true, value_true, mask_false, value_false, name, next); } From 93f53501c66e70a83607f1a3fd062e81d3c0b622 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 28 Apr 2019 00:31:08 -0400 Subject: [PATCH 127/494] [triton-c] added implicit conversion to bool in while/for loops --- examples/cpp/dot.cpp | 8 ++++---- include/triton/jit.h | 2 +- lib/ast/lowering.cpp | 10 +++++----- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 3461b8a7c..3be7d8880 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -10,7 +10,7 @@ R"( const tunable int32 TM = {16, 32, 64, 128}; const tunable int32 TN = {16, 32, 64, 128}; const tunable int32 TK = {8}; -const tunable int32 GZ = {2}; +const tunable int32 GZ = {1}; void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C, int32 M, int32 N, int32 K, @@ -57,7 +57,7 @@ void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C, int32 ridy = get_range_id(1); fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; int32 *plock = locks + ridx + ridy*grid0; - while(__atomic_cas(plock, 0, 1) == 1); + while(__atomic_cas(plock, 0, 1)); int32 *pcount = plock + grid0*grid1; int32 count = *pcount; int32 countp1 = select(count == GZ - 1, 0, count + 1); @@ -82,7 +82,7 @@ int main() { triton::jit jit(context); // matrix multiplication parameters - int32_t M = 256, N = 256, K = 2048; + int32_t M = 512, N = 512, K = 512; std::vector hc(M*N); std::vector rc(M*N); std::vector ha(M*K); @@ -144,7 +144,7 @@ int main() { // just-in-time compile source-code std::vector params = { - 16, 2, 64, 16, 2, 64, 16, 8, 2, 2, 8, 8, 8, 4 + 16, 2, 64, 16, 2, 64, 16, 8, 2, 2, 8, 8, 8, 1 }; // jit.autotune("matmul",src, benchmark); jit.add_module("matmul", src, params); diff --git a/include/triton/jit.h b/include/triton/jit.h index ffef8ed1f..e28de674b 100644 --- a/include/triton/jit.h +++ b/include/triton/jit.h @@ -70,7 +70,7 @@ public: shmem_barriers.run(module); } vectorize.run(module); - ir::print(module, std::cout); +// ir::print(module, std::cout); } codegen::tune tune; diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index afb80b68b..c7fff7160 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -351,7 +351,7 @@ ir::value* expression_statement::codegen(ir::module *mod) const{ return expr_->codegen(mod); } -/* Iteration statement */ +/* For statement */ ir::value* iteration_statement::codegen(ir::module *mod) const{ ir::builder &builder = mod->get_builder(); ir::context &ctx = mod->get_context(); @@ -362,11 +362,11 @@ ir::value* iteration_statement::codegen(ir::module *mod) const{ mod->set_continue_fn([&](){ if(exec_) exec_->codegen(mod); - ir::value *cond = stop_->codegen(mod); + ir::value *cond = explicit_cast(builder, stop_->codegen(mod), ir::type::get_int1_ty(ctx)); return builder.create_cond_br(cond, loop_bb, next_bb); }); init_->codegen(mod); - ir::value *cond = stop_->codegen(mod); + ir::value *cond = explicit_cast(builder, stop_->codegen(mod), ir::type::get_int1_ty(ctx)); builder.create_cond_br(cond, loop_bb, next_bb); // builder.create_br(loop_bb); builder.set_insert_point(loop_bb); @@ -390,10 +390,10 @@ ir::value* while_statement::codegen(ir::module* mod) const{ ir::basic_block *loop_bb = ir::basic_block::create(ctx, "loop", fn); ir::basic_block *next_bb = ir::basic_block::create(ctx, "postloop", fn); mod->set_continue_fn([&](){ - ir::value *cond = cond_->codegen(mod); + ir::value *cond = explicit_cast(builder, cond_->codegen(mod), 
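Routing every for/while condition through explicit_cast to int1, as above, is what makes while(__atomic_cas(plock, 0, 1)) well-typed without the explicit == 1: the int32 result of the CAS is converted to a boolean. A sketch of the usual C-style truthiness lowering such a cast performs, assuming an icmp-ne implementation; the helper name to_bool is illustrative.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Constants.h"

using namespace llvm;

// C-style truthiness: any integer wider than i1 becomes (v != 0).
Value *to_bool(IRBuilder<> &b, Value *v) {
  Type *ty = v->getType();
  if (ty->isIntegerTy(1))
    return v;                                    // already a boolean
  return b.CreateICmpNE(v, ConstantInt::get(ty, 0), "tobool");
}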
ir::type::get_int1_ty(ctx)); return builder.create_cond_br(cond, loop_bb, next_bb); }); - ir::value *cond = cond_->codegen(mod); + ir::value *cond = explicit_cast(builder, cond_->codegen(mod), ir::type::get_int1_ty(ctx)); builder.create_cond_br(cond, loop_bb, next_bb); builder.set_insert_point(loop_bb); if(!is_terminator(statements_->codegen(mod))) From 8e809a9536f5f3cdceb665d67d65abed8b5ad479 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 30 Apr 2019 10:50:54 -0400 Subject: [PATCH 128/494] [examples] added skeleton for tensorflow op --- examples/CMakeLists.txt | 1 + examples/cpp/dot.cpp | 3 +- examples/python/CMakeLists.txt | 1 + examples/python/tensorflow/CMakeLists.txt | 12 ++++ examples/python/tensorflow/blocksparse.cpp | 38 +++++++++++ examples/python/tensorflow/setup.py | 74 ++++++++++++++++++++++ 6 files changed, 127 insertions(+), 2 deletions(-) create mode 100644 examples/python/CMakeLists.txt create mode 100644 examples/python/tensorflow/CMakeLists.txt create mode 100644 examples/python/tensorflow/blocksparse.cpp create mode 100644 examples/python/tensorflow/setup.py diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 2322a85f7..8277f0611 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1 +1,2 @@ add_subdirectory(cpp) +add_subdirectory(python) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 3be7d8880..ecbcab0ed 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -103,6 +103,7 @@ int main() { stream->write(da, true, 0, ha); stream->write(db, true, 0, hb); stream->write(dc, true, 0, hc); + stream->write(dlocks, true, 0, hlocks); stream->synchronize(); @@ -115,8 +116,6 @@ int main() { unsigned nthreads = info.num_threads; unsigned GZ = jit.get_int("GZ"); std::array grid = {(M + TM - 1)/TM, (N + TN - 1)/TN, GZ}; - // init locks - stream->write(dlocks, true, 0, hlocks); // set argument kernel->setArg(0, da); kernel->setArg(1, db); diff --git a/examples/python/CMakeLists.txt b/examples/python/CMakeLists.txt new file mode 100644 index 000000000..82844b5a0 --- /dev/null +++ b/examples/python/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(tensorflow) diff --git a/examples/python/tensorflow/CMakeLists.txt b/examples/python/tensorflow/CMakeLists.txt new file mode 100644 index 000000000..d9cb95051 --- /dev/null +++ b/examples/python/tensorflow/CMakeLists.txt @@ -0,0 +1,12 @@ +execute_process(COMMAND python -c "from os.path import dirname; import tensorflow as tf; print(dirname(dirname(tf.sysconfig.get_include())))" + OUTPUT_VARIABLE TF_INC OUTPUT_STRIP_TRAILING_WHITESPACE) +#execute_process(COMMAND python -c "import tensorflow as tf; print(tf.sysconfig.get_lib())" +# OUTPUT_VARIABLE TF_LIB) +#execute_process(COMMAND python -c "import tensorflow as tf; print(tf.__cxx11_abi_flag__ if \"__cxx11_abi_flag__\" in tf.__dict__ else 0)" +# OUTPUT_VARIABLE TF_ABI) + +set(CUDA_HOME "/usr/local/cuda") +include_directories("${TF_INC}/tensorflow/include") +include_directories("${CUDA_HOME}/include") +add_library(tf_blocksparse SHARED blocksparse.cpp) +#link_libraries(tf_blocksparse ${TF_LIB}) diff --git a/examples/python/tensorflow/blocksparse.cpp b/examples/python/tensorflow/blocksparse.cpp new file mode 100644 index 000000000..b2c4fd573 --- /dev/null +++ b/examples/python/tensorflow/blocksparse.cpp @@ -0,0 +1,38 @@ +#include + +#include "triton/driver/buffer.h" +#include "triton/driver/backend.h" +#include "triton/driver/stream.h" + +#define EIGEN_USE_GPU +#include "tensorflow/core/framework/op.h" +#include 
"tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/util/tensor_format.h" +#include "tensorflow/core/framework/common_shape_fns.h" + +using namespace tensorflow; +using GPUDevice = Eigen::GpuDevice; + +REGISTER_OP("BlockSparseGemm") + .Attr("T: {float}") + .Input("A: float") + .Input("B: float") + .Output("C: float"); + +class BlockSparseGemmOp : public OpKernel { + public: + explicit BlockSparseGemmOp(OpKernelConstruction* context) : OpKernel(context) { + } + + void Compute(OpKernelContext* context){ + GPUDevice device = context->eigen_device(); + triton::driver::cu_stream stream(device.stream(), false); + } + +private: +}; + +REGISTER_KERNEL_BUILDER(Name("BlockSparse").Device(DEVICE_GPU), BlockSparseGemmOp); diff --git a/examples/python/tensorflow/setup.py b/examples/python/tensorflow/setup.py new file mode 100644 index 000000000..957000a06 --- /dev/null +++ b/examples/python/tensorflow/setup.py @@ -0,0 +1,74 @@ +import os, sys +from os.path import dirname +from distutils.core import setup, Extension +from glob import glob +from build import build_clib_subclass, build_ext_subclass + + +def recursive_glob(rootdir='.', suffix=''): + return [os.path.join(looproot, filename) + for looproot, _, filenames in os.walk(rootdir) + for filename in filenames if filename.endswith(suffix)] + +def main(): + + path = os.path.join(os.pardir, 'include') + include = [path, os.path.join(path, 'isaac', 'external', 'CUDA')] + src = recursive_glob(os.path.join(os.pardir,'lib'), 'cpp') + flags = ['-std=c++11', '-fPIC', '-D_GLIBCXX_USE_CXX11_ABI=0'] + core = ('core', {'sources': src, 'include_dirs': include, 'cflags': flags}) + + # Extensions + extensions = [] + + # Isaac + extensions += [Extension('_isaac', + sources=recursive_glob(os.path.join('src','bind'), 'cpp'), + libraries=[], + library_dirs=[], + extra_compile_args=flags, + extra_link_args=[], + include_dirs=include + [os.path.join('src', 'bind')])] + + # Tensorflow + try: + import tensorflow as tf + tf_include = tf.sysconfig.get_include() + extensions += [Extension('_tensorflow', + sources=[os.path.join('src', 'extensions', 'tensorflow.cpp')], + libraries = ['tensorflow_framework'], + extra_compile_args= flags, + include_dirs = include + [tf_include, os.path.join(tf_include, 'external', 'nsync', 'public')], + library_dirs = [tf.sysconfig.get_lib()])] + except ImportError: + pass + + + # Setup + setup( + name='blocksparse', + version='1.0', + author='Philippe Tillet', + author_email='ptillet@g.harvard.edu', + packages=['isaac', 'isaac.pytorch', 'isaac.pytorch.models', 'isaac.pytorch.c_lib'], + libraries=[core], + ext_package='isaac', + ext_modules=extensions, + cmdclass={'build_clib': build_clib_subclass, 'build_ext': build_ext_subclass}, + classifiers=['Environment :: Console', + 'Development Status :: 4 - Beta', + 'Intended Audience :: Developers', + 'Intended Audience :: Other Audience', + 'Intended Audience :: Science/Research', + 'Natural Language :: English', + 'Programming Language :: C++', + 'Programming Language :: Python', + 'Programming Language :: Python :: 3', + 'Topic :: Scientific/Engineering', + 'Topic :: Scientific/Engineering :: Mathematics', + 'Topic :: Scientific/Engineering :: Physics', + 'Topic :: Scientific/Engineering :: Machine Learning'] + ) + +if __name__ == "__main__": + main() From d934d8fb40f57e20ec31f11ec626b1d895e8a9da Mon Sep 17 00:00:00 2001 From: Philippe 
Tillet Date: Tue, 30 Apr 2019 12:25:14 -0400 Subject: [PATCH 129/494] [examples/python/tensorflow] improved matmul wrapper --- examples/python/tensorflow/blocksparse.cpp | 124 +++++++++++++++++++++ 1 file changed, 124 insertions(+) diff --git a/examples/python/tensorflow/blocksparse.cpp b/examples/python/tensorflow/blocksparse.cpp index b2c4fd573..5cb15f651 100644 --- a/examples/python/tensorflow/blocksparse.cpp +++ b/examples/python/tensorflow/blocksparse.cpp @@ -3,6 +3,7 @@ #include "triton/driver/buffer.h" #include "triton/driver/backend.h" #include "triton/driver/stream.h" +#include "triton/jit.h" #define EIGEN_USE_GPU #include "tensorflow/core/framework/op.h" @@ -16,10 +17,83 @@ using namespace tensorflow; using GPUDevice = Eigen::GpuDevice; + +const char* src = +R"( +const tunable int32 TM = {16, 32, 64, 128}; +const tunable int32 TN = {16, 32, 64, 128}; +const tunable int32 TK = {8}; +const tunable int32 GZ = {1}; + +void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C, + int32 M, int32 N, int32 K, + int32 lda, int32 ldb, int32 ldc, + int32 *locks, int32 grid0, int32 grid1) { + int32 rxa[TM] = get_global_range[TM](0); + int32 ryb[TN] = get_global_range[TN](1); + int32 rz = get_global_range[1](2); + int32 rka[TK] = 0 ... TK; + int32 rkb[TK] = 0 ... TK; + fp32 c[TM, TN] = 0; + int32 div = K / GZ; + int32 rem = K % GZ; + K = select(rz < rem, div - 1, div); + int32 offk = select(rz < rem, rz*(div + 1), rz*div + rem); + fp32* pa[TM, TK] = A + (offk + rka[newaxis, :])*lda + rxa[:, newaxis]; + fp32* pb[TN, TK] = B + (offk + rkb[newaxis, :])*ldb + ryb[:, newaxis]; + fp32 a[TM, TK] = *pa; + fp32 b[TN, TK] = *pb; + int32 last_a = ((M*K - 1) - (TM*TK + 1)) / lda; + int32 last_b = ((K*N - 1) - (TN*TK + 1)) / ldb; + last_a = last_a / TK * TK; + last_b = last_b / TK * TK; + int32 bound = K - max(last_a, last_b); + for(int32 k = K; k > bound; k = k - TK){ + c = dot(a, trans(b), c); + pa = pa + TK*lda; + pb = pb + TK*ldb; + a = *pa; + b = *pb; + } + int32 rxc[TM] = get_global_range[TM](0); + int32 ryc[TN] = get_global_range[TN](1); + for(int32 k = bound; k > 0; k = k - 1){ + int1 checka[TM, 1] = rxc[:, newaxis] < M; + int1 checkb[TN, 1] = ryc[:, newaxis] < N; + fp32* pa[TM, 1] = A + (offk + K - k)*lda + rxc[:, newaxis]; + fp32* pb[TN, 1] = B + (offk + K - k)*ldb + ryc[:, newaxis]; + fp32 a[TM, 1] = checka ? *pa : 0; + fp32 b[TN, 1] = checkb ? 
*pb : 0; + c = dot(a, trans(b), c); + } + int32 ridx = get_range_id(0); + int32 ridy = get_range_id(1); + fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; + int32 *plock = locks + ridx + ridy*grid0; + while(__atomic_cas(plock, 0, 1)); + int32 *pcount = plock + grid0*grid1; + int32 count = *pcount; + int32 countp1 = select(count == GZ - 1, 0, count + 1); + int1 checkc0[TM] = rxc < M; + int1 checkc1[TN] = ryc < N; + int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; + if(count == 0) { + @checkc *pc = c; + *pcount = countp1; + } + else { + @checkc *pc = c + *pc; + *pcount = countp1; + } + __atomic_cas(plock, 1, 0); +} +)"; + REGISTER_OP("BlockSparseGemm") .Attr("T: {float}") .Input("A: float") .Input("B: float") + .Input("locks: int") .Output("C: float"); class BlockSparseGemmOp : public OpKernel { @@ -28,8 +102,58 @@ class BlockSparseGemmOp : public OpKernel { } void Compute(OpKernelContext* context){ + // get device/stream GPUDevice device = context->eigen_device(); triton::driver::cu_stream stream(device.stream(), false); + // get inputs + const Tensor& a = context->input(0); + const Tensor& b = context->input(1); + const Tensor& locks = context->input(2); + // get shapes + const int64 M = a.dim_size(0); + const int64 N = b.dim_size(0); + const int64 K = a.dim_size(1); + // allocate output + Tensor* c = nullptr; + TensorShape out_shape({M, N}); + OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &c)); + // return early if possible + if (out_shape.num_elements() == 0) + return; + // wraps into buffers + triton::driver::cu_buffer ta(stream.context(), (CUdeviceptr)a.flat().data(), false); + triton::driver::cu_buffer tb(stream.context(), (CUdeviceptr)b.flat().data(), false); + triton::driver::cu_buffer tlocks(stream.context(), (CUdeviceptr)locks.flat().data(), false); + triton::driver::cu_buffer tc(stream.context(), (CUdeviceptr)c->flat().data(), false); + // launch info + triton::jit jit(stream.context()); + jit.add_module("matmul", src, {16, 2, 64, 16, 2, 64, 16, 8, 2, 2, 8, 8, 8, 1}); + triton::driver::kernel* kernel = jit.get_function("matmul"); + triton::jit::launch_information info = jit.get_launch_info("matmul"); + int64 TM = info.global_range_size[0]; + int64 TN = info.global_range_size[1]; + unsigned nthreads = info.num_threads; + int64 GZ = jit.get_int("GZ"); + std::array grid; + grid[0] = (M + TM - 1)/TM; + grid[1] = (N + TN - 1)/TN; + grid[2] = GZ; + // set argument + kernel->setArg(0, &ta); + kernel->setArg(1, &tb); + kernel->setArg(2, &tc); + kernel->setArg(3, M); + kernel->setArg(4, N); + kernel->setArg(5, K); + kernel->setArg(6, M); + kernel->setArg(7, N); + kernel->setArg(8, M); + kernel->setArg(9, tlocks); + kernel->setArg(10, grid[0]); + kernel->setArg(11, grid[1]); + // dry run + stream.enqueue(kernel, grid, {nthreads, 1, 1}, nullptr, nullptr); + return; } private: From 7b6efc046395a60dd3eccc3858b290d0656ad4dd Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 30 Apr 2019 21:04:30 -0400 Subject: [PATCH 130/494] [examples/python/tensorflow] bugfix in tensorflow wrapper example --- CMakeLists.txt | 6 +- examples/cpp/conv.cpp | 2 +- examples/cpp/dot.cpp | 20 +---- examples/python/tensorflow/CMakeLists.txt | 12 +-- examples/python/tensorflow/blocksparse.cpp | 86 +++++++++------------- examples/python/tensorflow/blocksparse.py | 20 +++++ examples/python/tensorflow/setup.py | 74 ------------------- include/triton/codegen/target.h | 1 + include/triton/jit.h | 14 ++-- lib/ast/lowering.cpp | 1 + lib/driver/module.cpp | 1 - lib/jit.cpp 
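The GZ dimension in the kernel above implements split-K: reduction group rz handles a contiguous slice of K starting at offk, computed from div = K / GZ and rem = K % GZ so that the slices tile K exactly. A host-side sketch of the standard even split that the offk expression follows, with the first K % GZ groups taking one extra element; the helper name split_k is illustrative.

#include <utility>

// Even split of K elements over GZ groups: returns {offset, length}
// for group rz; the first K % GZ groups are one element longer.
std::pair<int, int> split_k(int K, int GZ, int rz) {
  int div = K / GZ, rem = K % GZ;
  int len = rz < rem ? div + 1 : div;
  int off = rz < rem ? rz * (div + 1) : rz * div + rem;
  return {off, len};
}

For example, split_k(10, 4, rz) yields the slices {0,3}, {3,3}, {6,2}, {8,2}, which cover indices 0 through 9 without overlap.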
| 24 +++--- 12 files changed, 90 insertions(+), 171 deletions(-) create mode 100644 examples/python/tensorflow/blocksparse.py delete mode 100644 examples/python/tensorflow/setup.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 3326b3ff6..d9e1ac845 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,6 +10,10 @@ FLEX_TARGET(Lexer ${CMAKE_CURRENT_SOURCE_DIR}/include/triton/ast/scanner.l ${CMA get_filename_component(BISON_Parser_INCLUDE_DIRECTORIES ${BISON_Parser_OUTPUT_HEADER} DIRECTORY) include_directories(${BISON_Parser_INCLUDE_DIRECTORIES}) +#execute_process(COMMAND python -c "import tensorflow as tf; print(tf.__cxx11_abi_flag__ if \"__cxx11_abi_flag__\" in tf.__dict__ else 0)" +# OUTPUT_VARIABLE TF_ABI OUTPUT_STRIP_TRAILING_WHITESPACE) +#add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) + # LLVM find_package(LLVM REQUIRED CONFIG) message(STATUS ${LLVM_INCLUDE_DIRS}) @@ -24,7 +28,7 @@ if(NOT CMAKE_BUILD_TYPE) endif() # Gather headers for cmake-based IDEs -file( GLOB_RECURSE ALL_SRC *.cpp *.hpp *.h *.py *.y *.l) +file( GLOB_RECURSE ALL_SRC *.cpp *.hpp *.h *.py *.y *.l CMakeLists*) add_custom_target( ALL SOURCES ${ALL_SRC} ) # Compiler flags diff --git a/examples/cpp/conv.cpp b/examples/cpp/conv.cpp index 721489b9f..150fafb91 100644 --- a/examples/cpp/conv.cpp +++ b/examples/cpp/conv.cpp @@ -5,7 +5,7 @@ #include "triton/driver/backend.h" #include "triton/driver/stream.h" -std::string src = +const char* src = R"( const tunable int32 TM = {16, 32, 64}; const tunable int32 TN = {16, 32, 64}; diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index ecbcab0ed..84fc89417 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -53,26 +53,8 @@ void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C, fp32 b[TN, 1] = checkb ? 
*pb : 0; c = dot(a, trans(b), c); } - int32 ridx = get_range_id(0); - int32 ridy = get_range_id(1); fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - int32 *plock = locks + ridx + ridy*grid0; - while(__atomic_cas(plock, 0, 1)); - int32 *pcount = plock + grid0*grid1; - int32 count = *pcount; - int32 countp1 = select(count == GZ - 1, 0, count + 1); - int1 checkc0[TM] = rxc < M; - int1 checkc1[TN] = ryc < N; - int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - if(count == 0) { - @checkc *pc = c; - *pcount = countp1; - } - else { - @checkc *pc = c + *pc; - *pcount = countp1; - } - __atomic_cas(plock, 1, 0); + *pc = c; } )"; diff --git a/examples/python/tensorflow/CMakeLists.txt b/examples/python/tensorflow/CMakeLists.txt index d9cb95051..5b3b04df6 100644 --- a/examples/python/tensorflow/CMakeLists.txt +++ b/examples/python/tensorflow/CMakeLists.txt @@ -1,12 +1,14 @@ execute_process(COMMAND python -c "from os.path import dirname; import tensorflow as tf; print(dirname(dirname(tf.sysconfig.get_include())))" OUTPUT_VARIABLE TF_INC OUTPUT_STRIP_TRAILING_WHITESPACE) -#execute_process(COMMAND python -c "import tensorflow as tf; print(tf.sysconfig.get_lib())" -# OUTPUT_VARIABLE TF_LIB) -#execute_process(COMMAND python -c "import tensorflow as tf; print(tf.__cxx11_abi_flag__ if \"__cxx11_abi_flag__\" in tf.__dict__ else 0)" -# OUTPUT_VARIABLE TF_ABI) +execute_process(COMMAND python -c "import tensorflow as tf; print(tf.sysconfig.get_lib())" + OUTPUT_VARIABLE TF_LIB OUTPUT_STRIP_TRAILING_WHITESPACE) +execute_process(COMMAND python -c "import tensorflow as tf; print(tf.__cxx11_abi_flag__ if \"__cxx11_abi_flag__\" in tf.__dict__ else 0)" + OUTPUT_VARIABLE TF_ABI OUTPUT_STRIP_TRAILING_WHITESPACE) set(CUDA_HOME "/usr/local/cuda") include_directories("${TF_INC}/tensorflow/include") include_directories("${CUDA_HOME}/include") +link_directories(${TF_LIB}) +add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) add_library(tf_blocksparse SHARED blocksparse.cpp) -#link_libraries(tf_blocksparse ${TF_LIB}) +target_link_libraries(tf_blocksparse tensorflow_framework triton) diff --git a/examples/python/tensorflow/blocksparse.cpp b/examples/python/tensorflow/blocksparse.cpp index 5cb15f651..b68247aec 100644 --- a/examples/python/tensorflow/blocksparse.cpp +++ b/examples/python/tensorflow/blocksparse.cpp @@ -66,35 +66,18 @@ void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C, fp32 b[TN, 1] = checkb ? 
*pb : 0; c = dot(a, trans(b), c); } - int32 ridx = get_range_id(0); - int32 ridy = get_range_id(1); fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - int32 *plock = locks + ridx + ridy*grid0; - while(__atomic_cas(plock, 0, 1)); - int32 *pcount = plock + grid0*grid1; - int32 count = *pcount; - int32 countp1 = select(count == GZ - 1, 0, count + 1); - int1 checkc0[TM] = rxc < M; - int1 checkc1[TN] = ryc < N; - int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - if(count == 0) { - @checkc *pc = c; - *pcount = countp1; - } - else { - @checkc *pc = c + *pc; - *pcount = countp1; - } - __atomic_cas(plock, 1, 0); + *pc = c; } )"; -REGISTER_OP("BlockSparseGemm") +REGISTER_OP("BlockSparseMatMul") + .Input("a: T") + .Input("b: T") + .Input("locks: int32") + .Output("c: T") .Attr("T: {float}") - .Input("A: float") - .Input("B: float") - .Input("locks: int") - .Output("C: float"); +; class BlockSparseGemmOp : public OpKernel { public: @@ -104,59 +87,60 @@ class BlockSparseGemmOp : public OpKernel { void Compute(OpKernelContext* context){ // get device/stream GPUDevice device = context->eigen_device(); - triton::driver::cu_stream stream(device.stream(), false); + triton::driver::cu_stream sstream(device.stream(), false); + triton::driver::context* ctx = sstream.context(); + triton::driver::stream* stream = &sstream; // get inputs const Tensor& a = context->input(0); const Tensor& b = context->input(1); const Tensor& locks = context->input(2); // get shapes - const int64 M = a.dim_size(0); - const int64 N = b.dim_size(0); - const int64 K = a.dim_size(1); + const int32_t M = a.dim_size(0); + const int32_t N = b.dim_size(0); + const int32_t K = a.dim_size(1); // allocate output Tensor* c = nullptr; - TensorShape out_shape({M, N}); + TensorShape out_shape({(int64)M, (int64)N}); OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &c)); // return early if possible if (out_shape.num_elements() == 0) return; - // wraps into buffers - triton::driver::cu_buffer ta(stream.context(), (CUdeviceptr)a.flat().data(), false); - triton::driver::cu_buffer tb(stream.context(), (CUdeviceptr)b.flat().data(), false); - triton::driver::cu_buffer tlocks(stream.context(), (CUdeviceptr)locks.flat().data(), false); - triton::driver::cu_buffer tc(stream.context(), (CUdeviceptr)c->flat().data(), false); - // launch info - triton::jit jit(stream.context()); + // initialize default compute device + triton::jit jit(ctx); + // matrix multiplication parameters + triton::driver::cu_buffer da(ctx, (CUdeviceptr)a.flat().data(), false); + triton::driver::cu_buffer db(ctx, (CUdeviceptr)b.flat().data(), false); + triton::driver::cu_buffer dc(ctx, (CUdeviceptr)c->flat().data(), false); + triton::driver::cu_buffer dlocks(ctx, (CUdeviceptr)locks.flat().data(), false); + stream->synchronize(); + // just-in-time compile source-code jit.add_module("matmul", src, {16, 2, 64, 16, 2, 64, 16, 8, 2, 2, 8, 8, 8, 1}); triton::driver::kernel* kernel = jit.get_function("matmul"); triton::jit::launch_information info = jit.get_launch_info("matmul"); - int64 TM = info.global_range_size[0]; - int64 TN = info.global_range_size[1]; + // launch info + unsigned TM = info.global_range_size[0]; + unsigned TN = info.global_range_size[1]; unsigned nthreads = info.num_threads; - int64 GZ = jit.get_int("GZ"); - std::array grid; - grid[0] = (M + TM - 1)/TM; - grid[1] = (N + TN - 1)/TN; - grid[2] = GZ; + unsigned GZ = jit.get_int("GZ"); + std::array grid = {(M + TM - 1)/TM, (N + TN - 1)/TN, GZ}; // set argument - kernel->setArg(0, 
&ta); - kernel->setArg(1, &tb); - kernel->setArg(2, &tc); + kernel->setArg(0, *da.cu()); + kernel->setArg(1, *db.cu()); + kernel->setArg(2, *dc.cu()); kernel->setArg(3, M); kernel->setArg(4, N); kernel->setArg(5, K); kernel->setArg(6, M); kernel->setArg(7, N); kernel->setArg(8, M); - kernel->setArg(9, tlocks); + kernel->setArg(9, *dlocks.cu()); kernel->setArg(10, grid[0]); kernel->setArg(11, grid[1]); - // dry run - stream.enqueue(kernel, grid, {nthreads, 1, 1}, nullptr, nullptr); - return; + stream->enqueue(kernel, grid, {nthreads, 1, 1}); + stream->synchronize(); } private: }; -REGISTER_KERNEL_BUILDER(Name("BlockSparse").Device(DEVICE_GPU), BlockSparseGemmOp); +REGISTER_KERNEL_BUILDER(Name("BlockSparseMatMul").Device(DEVICE_GPU).TypeConstraint("T"), BlockSparseGemmOp); diff --git a/examples/python/tensorflow/blocksparse.py b/examples/python/tensorflow/blocksparse.py new file mode 100644 index 000000000..8e83b589b --- /dev/null +++ b/examples/python/tensorflow/blocksparse.py @@ -0,0 +1,20 @@ +import os +import tensorflow as tf +import numpy as np + +data_files_path = tf.resource_loader.get_data_files_path() +library_dir = '/home/philippe/development/triton/build/examples/python/tensorflow' +module = tf.load_op_library(os.path.join(library_dir, 'libtf_blocksparse.so')) + +M, N, K = 512, 512, 512 +a = tf.placeholder(tf.float32, shape=[M, K]) +b = tf.placeholder(tf.float32, shape=[N, K]) +locks = tf.placeholder(tf.int32, shape=[4096]) +c = module.block_sparse_mat_mul(a, b, locks) +# Run +sess = tf.InteractiveSession() +sess.run(tf.global_variables_initializer()) +result = sess.run([c], feed_dict = {locks: np.zeros(4096), + a: np.random.rand(M, K), + b: np.random.rand(N, K)}) +print(result) diff --git a/examples/python/tensorflow/setup.py b/examples/python/tensorflow/setup.py deleted file mode 100644 index 957000a06..000000000 --- a/examples/python/tensorflow/setup.py +++ /dev/null @@ -1,74 +0,0 @@ -import os, sys -from os.path import dirname -from distutils.core import setup, Extension -from glob import glob -from build import build_clib_subclass, build_ext_subclass - - -def recursive_glob(rootdir='.', suffix=''): - return [os.path.join(looproot, filename) - for looproot, _, filenames in os.walk(rootdir) - for filename in filenames if filename.endswith(suffix)] - -def main(): - - path = os.path.join(os.pardir, 'include') - include = [path, os.path.join(path, 'isaac', 'external', 'CUDA')] - src = recursive_glob(os.path.join(os.pardir,'lib'), 'cpp') - flags = ['-std=c++11', '-fPIC', '-D_GLIBCXX_USE_CXX11_ABI=0'] - core = ('core', {'sources': src, 'include_dirs': include, 'cflags': flags}) - - # Extensions - extensions = [] - - # Isaac - extensions += [Extension('_isaac', - sources=recursive_glob(os.path.join('src','bind'), 'cpp'), - libraries=[], - library_dirs=[], - extra_compile_args=flags, - extra_link_args=[], - include_dirs=include + [os.path.join('src', 'bind')])] - - # Tensorflow - try: - import tensorflow as tf - tf_include = tf.sysconfig.get_include() - extensions += [Extension('_tensorflow', - sources=[os.path.join('src', 'extensions', 'tensorflow.cpp')], - libraries = ['tensorflow_framework'], - extra_compile_args= flags, - include_dirs = include + [tf_include, os.path.join(tf_include, 'external', 'nsync', 'public')], - library_dirs = [tf.sysconfig.get_lib()])] - except ImportError: - pass - - - # Setup - setup( - name='blocksparse', - version='1.0', - author='Philippe Tillet', - author_email='ptillet@g.harvard.edu', - packages=['isaac', 'isaac.pytorch', 'isaac.pytorch.models', 
'isaac.pytorch.c_lib'], - libraries=[core], - ext_package='isaac', - ext_modules=extensions, - cmdclass={'build_clib': build_clib_subclass, 'build_ext': build_ext_subclass}, - classifiers=['Environment :: Console', - 'Development Status :: 4 - Beta', - 'Intended Audience :: Developers', - 'Intended Audience :: Other Audience', - 'Intended Audience :: Science/Research', - 'Natural Language :: English', - 'Programming Language :: C++', - 'Programming Language :: Python', - 'Programming Language :: Python :: 3', - 'Topic :: Scientific/Engineering', - 'Topic :: Scientific/Engineering :: Mathematics', - 'Topic :: Scientific/Engineering :: Physics', - 'Topic :: Scientific/Engineering :: Machine Learning'] - ) - -if __name__ == "__main__": - main() diff --git a/include/triton/codegen/target.h b/include/triton/codegen/target.h index 9079fc869..118ee919f 100644 --- a/include/triton/codegen/target.h +++ b/include/triton/codegen/target.h @@ -20,6 +20,7 @@ namespace codegen{ class target { public: target(bool is_gpu): is_gpu_(is_gpu){} + virtual ~target() {} virtual void set_kernel(llvm::IRBuilder<>& builder, llvm::LLVMContext &ctx, llvm::Module *module, llvm::Function* fn) = 0; virtual llvm::Instruction* add_barrier(llvm::Module *module, llvm::IRBuilder<>& builder) = 0; virtual llvm::Value* get_global_offset(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned stride, unsigned ax) = 0; diff --git a/include/triton/jit.h b/include/triton/jit.h index e28de674b..20cb442e9 100644 --- a/include/triton/jit.h +++ b/include/triton/jit.h @@ -89,17 +89,17 @@ public: private: std::string compute_data_layout(bool is_64bit = true, bool use_short_pointers = true); std::unique_ptr make_llvm_module(triton::ir::module &module, passes_wrapper &passes); - std::unique_ptr make_triton_module(const std::string &name, const std::string &src); + std::unique_ptr make_triton_module(const char* name, const char* src); public: jit(driver::context* context); - void autotune(const std::string &name, const std::string &src, benchmark_t benchmark); + void autotune(const char* name, const char* src, benchmark_t benchmark); void add_module(ir::module &module, const std::vector& params = {}); - void add_module(const std::string &name, const std::string &src, const std::vector& params = {}); - driver::kernel* get_function(const std::string &name); - launch_information get_launch_info(const std::string &name); - unsigned get_int(const std::string &name); - driver::buffer *get_buffer(const std::string &name); + void add_module(const char* name, const char* src, const std::vector& params = {}); + driver::kernel* get_function(const char* name); + launch_information get_launch_info(const char* name); + unsigned get_int(const char* name); + driver::buffer* get_buffer(const char* name); private: std::vector modules_; diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp index c7fff7160..3f8623e1c 100644 --- a/lib/ast/lowering.cpp +++ b/lib/ast/lowering.cpp @@ -404,6 +404,7 @@ ir::value* while_statement::codegen(ir::module* mod) const{ mod->seal_block(builder.get_insert_block()); mod->seal_block(next_bb); builder.set_insert_point(next_bb); + return nullptr; } /* Selection statement */ diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 641b900b4..1df832aeb 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -19,7 +19,6 @@ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
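The jit.h changes above replace std::string with const char* across the public interface. TensorFlow wheels are built with _GLIBCXX_USE_CXX11_ABI=0 (the CMake additions in this patch set the same flag), under which std::string has a different layout than the new libstdc++ ABI; a plain const char* is layout-stable across that boundary. A sketch of the pattern; the function names jit_add_module and add_module are hypothetical.

#include <cstdio>
#include <cstring>
#include <string>

// ABI-stable boundary: only C types cross it, so caller and library
// may be built with different _GLIBCXX_USE_CXX11_ABI settings.
void jit_add_module(const char* name, const char* src) {
  std::printf("module %s: %zu bytes of source\n", name, std::strlen(src));
}

// Caller-side convenience; the std::string never crosses the boundary.
void add_module(const std::string& name, const std::string& src) {
  jit_add_module(name.c_str(), src.c_str());
}

int main() {
  add_module("matmul", "void matmul(...) { }");
  return 0;
}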
*/ - #include #include #include diff --git a/lib/jit.cpp b/lib/jit.cpp index 9a4181e2a..97404ab23 100644 --- a/lib/jit.cpp +++ b/lib/jit.cpp @@ -79,9 +79,9 @@ std::unique_ptr jit::make_llvm_module(ir::module &module, passes_w return std::unique_ptr(result); } -std::unique_ptr jit::make_triton_module(const std::string &name, const std::string &src) { +std::unique_ptr jit::make_triton_module(const char *name, const char *src) { // create AST from Triton-C source - YY_BUFFER_STATE buffer = yy_scan_string(src.c_str()); + YY_BUFFER_STATE buffer = yy_scan_string(src); yyparse(); yy_delete_buffer(buffer); translation_unit *program = ast_root; @@ -97,7 +97,7 @@ jit::jit(driver::context *context): driver_context_(context), } -void jit::autotune(const std::string &name, const std::string &src, benchmark_t benchmark) { +void jit::autotune(const char *name, const char *src, benchmark_t benchmark) { // find metaparameters auto ptt_module = make_triton_module(name, src); ir::module &tt_module = *ptt_module; @@ -143,8 +143,8 @@ void jit::autotune(const std::string &name, const std::string &src, benchmark_t // Compile auto ll_module = make_llvm_module(tt_module, passes); std::unique_ptr module(driver::module::create(driver_context_, &*ll_module)); - std::unique_ptr kernel(driver::kernel::create(module.get(), name.c_str())); - launch_information info = launch_info_map_.at(name.c_str()); + std::unique_ptr kernel(driver::kernel::create(module.get(), name)); + launch_information info = launch_info_map_.at(name); for(unsigned p: params) std::cout << p << " " << std::flush; // add globals @@ -191,26 +191,26 @@ void jit::add_module(ir::module &tt_module, const std::vector ¶ms) global_ints_[x.first] = ((ir::metaparameter*)x.second)->get_value(); } -void jit::add_module(const std::string &name, const std::string &src, const std::vector ¶ms) { +void jit::add_module(const char *name, const char *src, const std::vector ¶ms) { auto ptt_module = make_triton_module(name, src); add_module(*ptt_module, params); } -driver::kernel *jit::get_function(const std::string &name) { - return driver::kernel::create(modules_.front(), name.c_str()); +driver::kernel *jit::get_function(const char *name) { + return driver::kernel::create(modules_.front(), name); } -jit::launch_information jit::get_launch_info(const std::string &name) { +jit::launch_information jit::get_launch_info(const char *name) { return launch_info_map_.at(name); } -unsigned jit::get_int(const std::string &name){ +unsigned jit::get_int(const char *name){ return global_ints_.at(name); } -driver::buffer *jit::get_buffer(const std::string &name){ +driver::buffer *jit::get_buffer(const char *name){ driver::cu_module *mod = (driver::cu_module*)modules_.front(); - return mod->symbol(name.c_str()); + return mod->symbol(name); } } From 55866f1ef62ad54c421cc45e470379214c0a7461 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 1 May 2019 13:38:56 -0400 Subject: [PATCH 131/494] [examples/python/tensorflow] fixed ABI compatibility issue in JIT destructor --- examples/cpp/dot.cpp | 20 +++++++++++++++++++- examples/python/tensorflow/blocksparse.cpp | 21 +++++++++++++++++++-- examples/python/tensorflow/blocksparse.py | 2 +- include/triton/jit.h | 2 +- lib/jit.cpp | 13 +++++++------ 5 files changed, 47 insertions(+), 11 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 84fc89417..ecbcab0ed 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -53,8 +53,26 @@ void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C, fp32 
b[TN, 1] = checkb ? *pb : 0; c = dot(a, trans(b), c); } + int32 ridx = get_range_id(0); + int32 ridy = get_range_id(1); fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - *pc = c; + int32 *plock = locks + ridx + ridy*grid0; + while(__atomic_cas(plock, 0, 1)); + int32 *pcount = plock + grid0*grid1; + int32 count = *pcount; + int32 countp1 = select(count == GZ - 1, 0, count + 1); + int1 checkc0[TM] = rxc < M; + int1 checkc1[TN] = ryc < N; + int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; + if(count == 0) { + @checkc *pc = c; + *pcount = countp1; + } + else { + @checkc *pc = c + *pc; + *pcount = countp1; + } + __atomic_cas(plock, 1, 0); } )"; diff --git a/examples/python/tensorflow/blocksparse.cpp b/examples/python/tensorflow/blocksparse.cpp index b68247aec..a6b05df7f 100644 --- a/examples/python/tensorflow/blocksparse.cpp +++ b/examples/python/tensorflow/blocksparse.cpp @@ -66,8 +66,26 @@ void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C, fp32 b[TN, 1] = checkb ? *pb : 0; c = dot(a, trans(b), c); } + int32 ridx = get_range_id(0); + int32 ridy = get_range_id(1); fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - *pc = c; + int32 *plock = locks + ridx + ridy*grid0; + while(__atomic_cas(plock, 0, 1)); + int32 *pcount = plock + grid0*grid1; + int32 count = *pcount; + int32 countp1 = select(count == GZ - 1, 0, count + 1); + int1 checkc0[TM] = rxc < M; + int1 checkc1[TN] = ryc < N; + int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; + if(count == 0) { + @checkc *pc = c; + *pcount = countp1; + } + else { + @checkc *pc = c + *pc; + *pcount = countp1; + } + __atomic_cas(plock, 1, 0); } )"; @@ -137,7 +155,6 @@ class BlockSparseGemmOp : public OpKernel { kernel->setArg(10, grid[0]); kernel->setArg(11, grid[1]); stream->enqueue(kernel, grid, {nthreads, 1, 1}); - stream->synchronize(); } private: diff --git a/examples/python/tensorflow/blocksparse.py b/examples/python/tensorflow/blocksparse.py index 8e83b589b..5a721def9 100644 --- a/examples/python/tensorflow/blocksparse.py +++ b/examples/python/tensorflow/blocksparse.py @@ -3,7 +3,7 @@ import tensorflow as tf import numpy as np data_files_path = tf.resource_loader.get_data_files_path() -library_dir = '/home/philippe/development/triton/build/examples/python/tensorflow' +library_dir = '/home/philippe/Development/triton/build/examples/python/tensorflow' module = tf.load_op_library(os.path.join(library_dir, 'libtf_blocksparse.so')) M, N, K = 512, 512, 512 diff --git a/include/triton/jit.h b/include/triton/jit.h index 20cb442e9..a3e554c67 100644 --- a/include/triton/jit.h +++ b/include/triton/jit.h @@ -70,7 +70,6 @@ public: shmem_barriers.run(module); } vectorize.run(module); -// ir::print(module, std::cout); } codegen::tune tune; @@ -93,6 +92,7 @@ private: public: jit(driver::context* context); + ~jit(); void autotune(const char* name, const char* src, benchmark_t benchmark); void add_module(ir::module &module, const std::vector& params = {}); void add_module(const char* name, const char* src, const std::vector& params = {}); diff --git a/lib/jit.cpp b/lib/jit.cpp index 97404ab23..460df4275 100644 --- a/lib/jit.cpp +++ b/lib/jit.cpp @@ -71,11 +71,12 @@ std::unique_ptr jit::make_llvm_module(ir::module &module, passes_w llvm::Module* result = new llvm::Module(module.get_name(), llvm_context_); passes.selection.run(module, *result); // launch information - auto &launch_info_map = launch_info_map_[result->getName()]; - launch_info_map.global_range_size.clear(); + 
launch_information info; + info.global_range_size.clear(); for(unsigned i = 0; i < passes.tune.get_num_global_range(); i++) - launch_info_map.global_range_size.push_back(passes.tune.get_global_range_size(i)); - launch_info_map.num_threads = passes.tune.get_num_threads(); + info.global_range_size.push_back(passes.tune.get_global_range_size(i)); + info.num_threads = passes.tune.get_num_threads(); + launch_info_map_.insert({result->getName(), info}); return std::unique_ptr(result); } @@ -93,9 +94,9 @@ std::unique_ptr jit::make_triton_module(const char *name, const char jit::jit(driver::context *context): driver_context_(context), - target_(context->device()->make_target()) { -} + target_(context->device()->make_target()) { } +jit::~jit(){ } void jit::autotune(const char *name, const char *src, benchmark_t benchmark) { // find metaparameters From 70f49a56c1416c1ef3a5b5540eb959c788371db9 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 1 May 2019 17:09:01 -0400 Subject: [PATCH 132/494] [examples/python/tensorflow] better skeleton for blocksparse --- examples/python/tensorflow/blocksparse.cpp | 237 ++++++++++----------- examples/python/tensorflow/blocksparse.py | 20 -- examples/python/tensorflow/dot.cpp | 163 ++++++++++++++ 3 files changed, 280 insertions(+), 140 deletions(-) delete mode 100644 examples/python/tensorflow/blocksparse.py create mode 100644 examples/python/tensorflow/dot.cpp diff --git a/examples/python/tensorflow/blocksparse.cpp b/examples/python/tensorflow/blocksparse.cpp index a6b05df7f..85e73d033 100644 --- a/examples/python/tensorflow/blocksparse.cpp +++ b/examples/python/tensorflow/blocksparse.cpp @@ -15,6 +15,9 @@ #include "tensorflow/core/framework/common_shape_fns.h" using namespace tensorflow; +using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; using GPUDevice = Eigen::GpuDevice; @@ -25,139 +28,133 @@ const tunable int32 TN = {16, 32, 64, 128}; const tunable int32 TK = {8}; const tunable int32 GZ = {1}; -void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C, +void bsmm (restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C, int32 M, int32 N, int32 K, int32 lda, int32 ldb, int32 ldc, int32 *locks, int32 grid0, int32 grid1) { - int32 rxa[TM] = get_global_range[TM](0); - int32 ryb[TN] = get_global_range[TN](1); - int32 rz = get_global_range[1](2); - int32 rka[TK] = 0 ... TK; - int32 rkb[TK] = 0 ... TK; - fp32 c[TM, TN] = 0; - int32 div = K / GZ; - int32 rem = K % GZ; - K = select(rz < rem, div - 1, div); - int32 offk = select(rz < rem, rz*(div + 1), rz*div + rem); - fp32* pa[TM, TK] = A + (offk + rka[newaxis, :])*lda + rxa[:, newaxis]; - fp32* pb[TN, TK] = B + (offk + rkb[newaxis, :])*ldb + ryb[:, newaxis]; - fp32 a[TM, TK] = *pa; - fp32 b[TN, TK] = *pb; - int32 last_a = ((M*K - 1) - (TM*TK + 1)) / lda; - int32 last_b = ((K*N - 1) - (TN*TK + 1)) / ldb; - last_a = last_a / TK * TK; - last_b = last_b / TK * TK; - int32 bound = K - max(last_a, last_b); - for(int32 k = K; k > bound; k = k - TK){ - c = dot(a, trans(b), c); - pa = pa + TK*lda; - pb = pb + TK*ldb; - a = *pa; - b = *pb; - } - int32 rxc[TM] = get_global_range[TM](0); - int32 ryc[TN] = get_global_range[TN](1); - for(int32 k = bound; k > 0; k = k - 1){ - int1 checka[TM, 1] = rxc[:, newaxis] < M; - int1 checkb[TN, 1] = ryc[:, newaxis] < N; - fp32* pa[TM, 1] = A + (offk + K - k)*lda + rxc[:, newaxis]; - fp32* pb[TN, 1] = B + (offk + K - k)*ldb + ryc[:, newaxis]; - fp32 a[TM, 1] = checka ? 
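/* Patch 132 replaces the toy matmul op with a blocksparse skeleton whose
   output shape comes from a shape-inference callback (XpropShape, below):
   copy the input's shape and substitute attribute K at dimension `axis`.
   The core loop, restated with the shape_inference types imported above:

     std::vector<DimensionHandle> shape;
     for (int i = 0; i < rank; i++)
       shape.push_back(i == axis ? ctx->MakeDim(K) : ctx->Dim(x, i));
     ctx->set_output(0, ctx->MakeShape(shape));
*/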
*pa : 0; - fp32 b[TN, 1] = checkb ? *pb : 0; - c = dot(a, trans(b), c); - } - int32 ridx = get_range_id(0); - int32 ridy = get_range_id(1); - fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - int32 *plock = locks + ridx + ridy*grid0; - while(__atomic_cas(plock, 0, 1)); - int32 *pcount = plock + grid0*grid1; - int32 count = *pcount; - int32 countp1 = select(count == GZ - 1, 0, count + 1); - int1 checkc0[TM] = rxc < M; - int1 checkc1[TN] = ryc < N; - int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - if(count == 0) { - @checkc *pc = c; - *pcount = countp1; - } - else { - @checkc *pc = c + *pc; - *pcount = countp1; - } - __atomic_cas(plock, 1, 0); + } )"; -REGISTER_OP("BlockSparseMatMul") - .Input("a: T") - .Input("b: T") - .Input("locks: int32") - .Output("c: T") - .Attr("T: {float}") -; +Status XpropShape(InferenceContext* ctx) +{ + int K; TF_RETURN_IF_ERROR(ctx->GetAttr( "K", &K)); + int axis; TF_RETURN_IF_ERROR(ctx->GetAttr("axis", &axis)); -class BlockSparseGemmOp : public OpKernel { + // C ==> K + ShapeHandle x = ctx->input(0); + int rank = ctx->Rank(x); + //printf("XpropShape: %d\n", rank); + if (rank > 0) + { + std::vector shape; + shape.reserve(rank); + for (int i = 0; i < rank; i++) + shape.push_back(i == axis ? ctx->MakeDim(K) : ctx->Dim(x, i)); + + ctx->set_output(0, ctx->MakeShape(shape)); + } + else + ctx->set_output(0, ctx->UnknownShape()); + ctx->set_output(1, ctx->UnknownShape()); + return Status::OK(); +} + + +REGISTER_OP("BlocksparseMatmul") + .Input("x: T") + .Input("w: T") + .Input("lut: int64") + .Input("lut_dx: int64") + .Input("lut_dw: int64") + .Input("gate: ngate * float") + .Output("y: T") + .Output("temp: int32") + .Attr("T: {half, float, bfloat16}") + .Attr("blocks: int >=0") + .Attr("bsize: int") + .Attr("segments: int = 0") + .Attr("segments_dx: int = 0") + .Attr("locks: int = 0") + .Attr("locks_dx: int = 0") + .Attr("axis: int = 1") + .Attr("C: int >=0") + .Attr("K: int >=0") + .Attr("shared: int = 0") + .Attr("shared_dx: int = 0") + .Attr("alpha: float = 1.0") + .Attr("beta: float = 0.0") + .Attr("gated_dw: bool = false") + .Attr("gate_grad: bool = false") + .Attr("bench: int = 0") + .Attr("ngate: int >= 0") + .SetShapeFn(XpropShape) + .Doc(R"doc( +Multiply the matrix "a" by the blocksparse matrix "b". 
+)doc"); + + +typedef struct bsmm_params +{ + const int* Lut; + const float* Gate; + int* Lock; + //float4* Scratch; + int blocks; + int bsize; + int segments; + int locks; + int C; + int K; + int N; + int shared; + int pcount; + uint blk_a; + uint blk_A; + uint blk_b; + uint blk_B; + float alpha; + float beta; + CUstream stream; +} bsmm_params; + +class BlocksparseMatmulOp : public OpKernel { public: - explicit BlockSparseGemmOp(OpKernelConstruction* context) : OpKernel(context) { + explicit BlocksparseMatmulOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("segments", ¶ms_.segments)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("locks", ¶ms_.locks )); + OP_REQUIRES_OK(ctx, ctx->GetAttr("blocks", ¶ms_.blocks )); + OP_REQUIRES_OK(ctx, ctx->GetAttr("bsize", ¶ms_.bsize )); + OP_REQUIRES_OK(ctx, ctx->GetAttr("C", ¶ms_.C )); + OP_REQUIRES_OK(ctx, ctx->GetAttr("K", ¶ms_.K )); + OP_REQUIRES_OK(ctx, ctx->GetAttr("shared", ¶ms_.shared )); + OP_REQUIRES_OK(ctx, ctx->GetAttr("alpha", ¶ms_.alpha )); + OP_REQUIRES_OK(ctx, ctx->GetAttr("beta", ¶ms_.beta )); + OP_REQUIRES_OK(ctx, ctx->GetAttr("gated_dw", &gated_dw_ )); + OP_REQUIRES_OK(ctx, ctx->GetAttr("axis", &axis_ )); + OP_REQUIRES_OK(ctx, ctx->GetAttr("bench", &bench_)); + OP_REQUIRES(ctx, params_.K < params_.bsize*65536, errors::InvalidArgument("K < bsize*65536")); + OP_REQUIRES(ctx, params_.C < params_.bsize*65536, errors::InvalidArgument("C < bsize*65536")); + params_.pcount = 1; + params_.blk_A = 0; + is_gpu_ = ctx->device_type() == DEVICE_GPU; + if (bench_) { + repeat_ = bench_; + flops_ = (float)(params_.blocks * params_.bsize*params_.bsize); + const char* op = "FPROP"; + sprintf(bench_string_, "%s %02d-%d C:%05d K:%05d blks:%d", op, params_.bsize, axis_, params_.C, params_.K, params_.blocks); + } } void Compute(OpKernelContext* context){ - // get device/stream - GPUDevice device = context->eigen_device(); - triton::driver::cu_stream sstream(device.stream(), false); - triton::driver::context* ctx = sstream.context(); - triton::driver::stream* stream = &sstream; - // get inputs - const Tensor& a = context->input(0); - const Tensor& b = context->input(1); - const Tensor& locks = context->input(2); - // get shapes - const int32_t M = a.dim_size(0); - const int32_t N = b.dim_size(0); - const int32_t K = a.dim_size(1); - // allocate output - Tensor* c = nullptr; - TensorShape out_shape({(int64)M, (int64)N}); - OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &c)); - // return early if possible - if (out_shape.num_elements() == 0) - return; - // initialize default compute device - triton::jit jit(ctx); - // matrix multiplication parameters - triton::driver::cu_buffer da(ctx, (CUdeviceptr)a.flat().data(), false); - triton::driver::cu_buffer db(ctx, (CUdeviceptr)b.flat().data(), false); - triton::driver::cu_buffer dc(ctx, (CUdeviceptr)c->flat().data(), false); - triton::driver::cu_buffer dlocks(ctx, (CUdeviceptr)locks.flat().data(), false); - stream->synchronize(); - // just-in-time compile source-code - jit.add_module("matmul", src, {16, 2, 64, 16, 2, 64, 16, 8, 2, 2, 8, 8, 8, 1}); - triton::driver::kernel* kernel = jit.get_function("matmul"); - triton::jit::launch_information info = jit.get_launch_info("matmul"); - // launch info - unsigned TM = info.global_range_size[0]; - unsigned TN = info.global_range_size[1]; - unsigned nthreads = info.num_threads; - unsigned GZ = jit.get_int("GZ"); - std::array grid = {(M + TM - 1)/TM, (N + TN - 1)/TN, GZ}; - // set argument - kernel->setArg(0, *da.cu()); - 
kernel->setArg(1, *db.cu()); - kernel->setArg(2, *dc.cu()); - kernel->setArg(3, M); - kernel->setArg(4, N); - kernel->setArg(5, K); - kernel->setArg(6, M); - kernel->setArg(7, N); - kernel->setArg(8, M); - kernel->setArg(9, *dlocks.cu()); - kernel->setArg(10, grid[0]); - kernel->setArg(11, grid[1]); - stream->enqueue(kernel, grid, {nthreads, 1, 1}); } private: + bsmm_params params_; + int axis_, bench_, repeat_, SMs_, major_, grid_n_; + float flops_; + bool gated_dw_, is_gpu_; + char bench_string_[256]; }; -REGISTER_KERNEL_BUILDER(Name("BlockSparseMatMul").Device(DEVICE_GPU).TypeConstraint("T"), BlockSparseGemmOp); +REGISTER_KERNEL_BUILDER(Name("BlocksparseMatmul").Device(DEVICE_GPU).TypeConstraint("T"), BlocksparseMatmulOp); diff --git a/examples/python/tensorflow/blocksparse.py b/examples/python/tensorflow/blocksparse.py deleted file mode 100644 index 5a721def9..000000000 --- a/examples/python/tensorflow/blocksparse.py +++ /dev/null @@ -1,20 +0,0 @@ -import os -import tensorflow as tf -import numpy as np - -data_files_path = tf.resource_loader.get_data_files_path() -library_dir = '/home/philippe/Development/triton/build/examples/python/tensorflow' -module = tf.load_op_library(os.path.join(library_dir, 'libtf_blocksparse.so')) - -M, N, K = 512, 512, 512 -a = tf.placeholder(tf.float32, shape=[M, K]) -b = tf.placeholder(tf.float32, shape=[N, K]) -locks = tf.placeholder(tf.int32, shape=[4096]) -c = module.block_sparse_mat_mul(a, b, locks) -# Run -sess = tf.InteractiveSession() -sess.run(tf.global_variables_initializer()) -result = sess.run([c], feed_dict = {locks: np.zeros(4096), - a: np.random.rand(M, K), - b: np.random.rand(N, K)}) -print(result) diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp new file mode 100644 index 000000000..bc87c71e8 --- /dev/null +++ b/examples/python/tensorflow/dot.cpp @@ -0,0 +1,163 @@ +#include + +#include "triton/driver/buffer.h" +#include "triton/driver/backend.h" +#include "triton/driver/stream.h" +#include "triton/jit.h" + +#define EIGEN_USE_GPU +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/util/tensor_format.h" +#include "tensorflow/core/framework/common_shape_fns.h" + +using namespace tensorflow; +using GPUDevice = Eigen::GpuDevice; + + +const char* src = +R"( +const tunable int32 TM = {16, 32, 64, 128}; +const tunable int32 TN = {16, 32, 64, 128}; +const tunable int32 TK = {8}; +const tunable int32 GZ = {1}; + +void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C, + int32 M, int32 N, int32 K, + int32 lda, int32 ldb, int32 ldc, + int32 *locks, int32 grid0, int32 grid1) { + int32 rxa[TM] = get_global_range[TM](0); + int32 ryb[TN] = get_global_range[TN](1); + int32 rz = get_global_range[1](2); + int32 rka[TK] = 0 ... TK; + int32 rkb[TK] = 0 ... 
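/* In this Triton-C dialect, `0 ... TK` denotes the 1-D tile [0, TK) and
   get_global_range[TM](0) yields this work-group's TM contiguous global
   indices along axis 0. A rough scalar model of the values, not the
   compiler's actual lowering (pid0 is a hypothetical block index):

     for (int i = 0; i < TK; i++) rka[i] = i;              // 0 ... TK
     for (int i = 0; i < TM; i++) rxa[i] = pid0 * TM + i;  // get_global_range[TM](0)
*/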
TK; + fp32 c[TM, TN] = 0; + int32 div = K / GZ; + int32 rem = K % GZ; + K = select(rz < rem, div - 1, div); + int32 offk = select(rz < rem, rz*(div + 1), rz*div + rem); + fp32* pa[TM, TK] = A + (offk + rka[newaxis, :])*lda + rxa[:, newaxis]; + fp32* pb[TN, TK] = B + (offk + rkb[newaxis, :])*ldb + ryb[:, newaxis]; + fp32 a[TM, TK] = *pa; + fp32 b[TN, TK] = *pb; + int32 last_a = ((M*K - 1) - (TM*TK + 1)) / lda; + int32 last_b = ((K*N - 1) - (TN*TK + 1)) / ldb; + last_a = last_a / TK * TK; + last_b = last_b / TK * TK; + int32 bound = K - max(last_a, last_b); + for(int32 k = K; k > bound; k = k - TK){ + c = dot(a, trans(b), c); + pa = pa + TK*lda; + pb = pb + TK*ldb; + a = *pa; + b = *pb; + } + int32 rxc[TM] = get_global_range[TM](0); + int32 ryc[TN] = get_global_range[TN](1); + for(int32 k = bound; k > 0; k = k - 1){ + int1 checka[TM, 1] = rxc[:, newaxis] < M; + int1 checkb[TN, 1] = ryc[:, newaxis] < N; + fp32* pa[TM, 1] = A + (offk + K - k)*lda + rxc[:, newaxis]; + fp32* pb[TN, 1] = B + (offk + K - k)*ldb + ryc[:, newaxis]; + fp32 a[TM, 1] = checka ? *pa : 0; + fp32 b[TN, 1] = checkb ? *pb : 0; + c = dot(a, trans(b), c); + } + int32 ridx = get_range_id(0); + int32 ridy = get_range_id(1); + fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; + int32 *plock = locks + ridx + ridy*grid0; + while(__atomic_cas(plock, 0, 1)); + int32 *pcount = plock + grid0*grid1; + int32 count = *pcount; + int32 countp1 = select(count == GZ - 1, 0, count + 1); + int1 checkc0[TM] = rxc < M; + int1 checkc1[TN] = ryc < N; + int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; + if(count == 0) { + @checkc *pc = c; + *pcount = countp1; + } + else { + @checkc *pc = c + *pc; + *pcount = countp1; + } + __atomic_cas(plock, 1, 0); +} +)"; + +REGISTER_OP("Dot") + .Input("a: T") + .Input("b: T") + .Input("locks: int32") + .Output("c: T") + .Attr("T: {float}") +; + +class BlockSparseGemmOp : public OpKernel { + public: + explicit BlockSparseGemmOp(OpKernelConstruction* context) : OpKernel(context) { + } + + void Compute(OpKernelContext* context){ + // get device/stream + GPUDevice device = context->eigen_device(); + triton::driver::cu_stream sstream(device.stream(), false); + triton::driver::context* ctx = sstream.context(); + triton::driver::stream* stream = &sstream; + // get inputs + const Tensor& a = context->input(0); + const Tensor& b = context->input(1); + const Tensor& locks = context->input(2); + // get shapes + const int32_t M = a.dim_size(0); + const int32_t N = b.dim_size(0); + const int32_t K = a.dim_size(1); + // allocate output + Tensor* c = nullptr; + TensorShape out_shape({(int64)M, (int64)N}); + OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &c)); + // return early if possible + if (out_shape.num_elements() == 0) + return; + // initialize default compute device + triton::jit jit(ctx); + // matrix multiplication parameters + triton::driver::cu_buffer da(ctx, (CUdeviceptr)a.flat().data(), false); + triton::driver::cu_buffer db(ctx, (CUdeviceptr)b.flat().data(), false); + triton::driver::cu_buffer dc(ctx, (CUdeviceptr)c->flat().data(), false); + triton::driver::cu_buffer dlocks(ctx, (CUdeviceptr)locks.flat().data(), false); + stream->synchronize(); + // just-in-time compile source-code + jit.add_module("matmul", src, {16, 2, 64, 16, 2, 64, 16, 8, 2, 2, 8, 8, 8, 1}); + triton::driver::kernel* kernel = jit.get_function("matmul"); + triton::jit::launch_information info = jit.get_launch_info("matmul"); + // launch info + unsigned TM = info.global_range_size[0]; + unsigned TN = 
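/* The launch grid computed just below is a ceiling division: TM and TN come
   from the autotuned launch_information, and the grid covers M x N with GZ
   slices along the reduction axis. Equivalent sketch:

     auto ceil_div = [](size_t a, size_t b) { return (a + b - 1) / b; };
     std::array<size_t, 3> grid = {ceil_div(M, TM), ceil_div(N, TN), GZ};
*/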
info.global_range_size[1]; + unsigned nthreads = info.num_threads; + unsigned GZ = jit.get_int("GZ"); + std::array grid = {(M + TM - 1)/TM, (N + TN - 1)/TN, GZ}; + // set argument + kernel->setArg(0, *da.cu()); + kernel->setArg(1, *db.cu()); + kernel->setArg(2, *dc.cu()); + kernel->setArg(3, M); + kernel->setArg(4, N); + kernel->setArg(5, K); + kernel->setArg(6, M); + kernel->setArg(7, N); + kernel->setArg(8, M); + kernel->setArg(9, *dlocks.cu()); + kernel->setArg(10, grid[0]); + kernel->setArg(11, grid[1]); + stream->enqueue(kernel, grid, {nthreads, 1, 1}); + } + +private: +}; + +REGISTER_KERNEL_BUILDER(Name("Dot").Device(DEVICE_GPU).TypeConstraint("T"), BlockSparseGemmOp); From 208d1525de4214d5c5589474bcb96d3e9b15c26b Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 2 May 2019 10:43:33 -0400 Subject: [PATCH 133/494] [driver] added spirv-llvm dispatch functions --- examples/cpp/dot.cpp | 3 ++- include/triton/driver/dispatch.h | 28 ++++++++++++++++++++++------ lib/driver/dispatch.cpp | 20 ++++++++++++++++++++ lib/jit.cpp | 3 +-- 4 files changed, 45 insertions(+), 9 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index ecbcab0ed..3be7d8880 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -103,7 +103,6 @@ int main() { stream->write(da, true, 0, ha); stream->write(db, true, 0, hb); stream->write(dc, true, 0, hc); - stream->write(dlocks, true, 0, hlocks); stream->synchronize(); @@ -116,6 +115,8 @@ int main() { unsigned nthreads = info.num_threads; unsigned GZ = jit.get_int("GZ"); std::array grid = {(M + TM - 1)/TM, (N + TN - 1)/TN, GZ}; + // init locks + stream->write(dlocks, true, 0, hlocks); // set argument kernel->setArg(0, da); kernel->setArg(1, db); diff --git a/include/triton/driver/dispatch.h b/include/triton/driver/dispatch.h index 71411b1ca..bd25ba2b2 100755 --- a/include/triton/driver/dispatch.h +++ b/include/triton/driver/dispatch.h @@ -38,6 +38,11 @@ #include #include +namespace llvm { +class PassRegistry; +class Module; +} + namespace triton { namespace driver @@ -85,6 +90,7 @@ public: static bool cuinit(); static bool cublasinit(); static bool cudnninit(); + static bool spvllvminit(); static void release(); // OpenCL @@ -123,10 +129,9 @@ public: static cl_program clCreateProgramWithSource(cl_context, cl_uint, const char **, const size_t *, cl_int *); static cl_int clReleaseKernel(cl_kernel); - //CUDA + // CUDA static CUresult cuCtxGetCurrent(CUcontext *pctx); static CUresult cuCtxSetCurrent(CUcontext ctx); - static CUresult cuCtxDestroy_v2(CUcontext ctx); static CUresult cuEventCreate(CUevent *phEvent, unsigned int Flags); static CUresult cuDeviceGet(CUdevice *device, int ordinal); @@ -139,7 +144,6 @@ public: static CUresult cuDeviceGetName(char *name, int len, CUdevice dev); static CUresult cuDeviceGetPCIBusId(char *id, int len, CUdevice dev); static CUresult cuModuleGetGlobal_v2(CUdeviceptr *dptr, size_t* bytes, CUmodule hmod, const char *name); - static CUresult cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); static CUresult cuModuleLoad(CUmodule *module, const char *fname); static CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); @@ -161,12 +165,12 @@ public: static CUresult cuPointerGetAttribute(void * data, CUpointer_attribute attribute, CUdeviceptr ptr); static 
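/* dispatch uses lazy binding throughout: an *init() function dlopens each
   library once, and every static member pointer caches its dlsym result on
   first use; spvllvminit/writeSpirv below extend the scheme to
   libLLVMSPIRVLib. A minimal sketch of the pattern, assuming POSIX dlfcn:

     #include <dlfcn.h>
     static void *lib = nullptr, *sym = nullptr;
     bool init() { if (!lib) lib = dlopen("libLLVMSPIRVLib.so", RTLD_LAZY);
                   return lib != nullptr; }
     template <class FnT, class... Args>
     auto call(const char *name, Args... args) {
       if (!sym) sym = dlsym(lib, name);        // resolve once, reuse after
       return reinterpret_cast<FnT *>(sym)(args...);
     }
*/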
CUresult cuCtxGetDevice(CUdevice* result); static CUresult cuMemsetD8Async(CUdeviceptr dst, unsigned char x, size_t N, CUstream stream); - + // NVML static nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2( const char* pciBusId, nvmlDevice_t* device); static nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); static nvmlReturn_t nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int mem_clock, unsigned int sm_clock); - + // CUBLAS static cublasHandle_t cublasHandle(driver::cu_context const & ctx); static cublasStatus_t cublasCreate_v2(cublasHandle_t* h); static cublasStatus_t cublasGetStream_v2(cublasHandle_t h, cudaStream_t *streamId); @@ -175,7 +179,7 @@ public: static cublasStatus_t cublasDgemm_v2 (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc); static cublasStatus_t cublasHgemm (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, half* alpha, const half *A, int lda, const half *B, int ldb, half* beta, half *C, int ldc); static cublasStatus_t cublasGemmEx(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const void *alpha, const void *A, cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb, const void *beta, void *C, cudaDataType Ctype, int ldc, cudaDataType computeType, cublasGemmAlgo_t algo); - + // CUDNN static cudnnHandle_t cudnnHandle(driver::cu_context const & ctx); static cudnnStatus_t cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc); static cudnnStatus_t cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t* convDesc); @@ -196,6 +200,10 @@ public: static cudnnStatus_t cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId); static cudnnStatus_t cudnnTransformTensor(cudnnHandle_t handle, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y); + // SPIR-V libraries + static int initializeLLVMToSPIRVPass(llvm::PassRegistry &); + static bool writeSpirv(llvm::Module *M, std::ostream &OS, std::string &ErrMsg); + private: // Libraries @@ -204,6 +212,10 @@ private: static void* nvml_; static void* cublas_; static void* cudnn_; + static void* vulkan_; + static void* spvllvm_; + static void* spvcross_; + static void* opengl_; // OpenCL functions static void* clBuildProgram_; @@ -310,6 +322,10 @@ private: static void* cudnnPoolingForward_; static void* cudnnSetStream_; static void* cudnnTransformTensor_; + + // LLVM to SPIR-V + static void* initializeLLVMToSPIRVPass_; + static void* writeSpirv_; }; } diff --git a/lib/driver/dispatch.cpp b/lib/driver/dispatch.cpp index 7bb0fd001..f02d4ea2e 100755 --- a/lib/driver/dispatch.cpp +++ b/lib/driver/dispatch.cpp @@ -158,6 +158,12 @@ bool dispatch::cudnninit(){ return cudnn_ != nullptr; } +bool dispatch::spvllvminit(){ + if(spvllvm_==nullptr) + spvllvm_ = dlopen("libLLVMSPIRVLib.so", RTLD_LAZY); + return spvllvm_ != nullptr; +} + //CUDA CUDA_DEFINE1(CUresult, cuCtxDestroy_v2, CUcontext) CUDA_DEFINE2(CUresult, cuEventCreate, CUevent *, unsigned int) @@ -292,6 +298,15 @@ OCL_DEFINE5(cl_mem, clCreateBuffer, cl_context, cl_mem_flags, size_t, void *, cl OCL_DEFINE5(cl_program, clCreateProgramWithSource, cl_context, cl_uint, 
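/* The two SPIR-V entry points defined just below are consumed roughly like
   this (hedged usage sketch; the module and stream names are illustrative):

     std::string err;
     std::ostringstream spirv;
     dispatch::initializeLLVMToSPIRVPass(*llvm::PassRegistry::getPassRegistry());
     if (!dispatch::writeSpirv(llvm_module, spirv, err))  // llvm_module: llvm::Module*
       throw std::runtime_error(err);                     // translator fills err
*/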
const char **, const size_t *, cl_int *) OCL_DEFINE1(cl_int, clReleaseKernel, cl_kernel) +// LLVM to SPIR-V +int dispatch::initializeLLVMToSPIRVPass(llvm::PassRegistry ®istry){ + return f_impl(spvllvm_, initializeLLVMToSPIRVPass, initializeLLVMToSPIRVPass_, "initializeLLVMToSPIRVPass", std::ref(registry)); +} + +bool dispatch::writeSpirv(llvm::Module *M, std::ostream &OS, std::string &ErrMsg){ + return f_impl(spvllvm_, writeSpirv, writeSpirv_, "writeSpirv", M, std::ref(OS), std::ref(ErrMsg)); +} + // Release void dispatch::release(){ if(cuda_){ @@ -313,6 +328,7 @@ void* dispatch::cuda_; void* dispatch::nvml_; void* dispatch::cublas_; void* dispatch::cudnn_; +void* dispatch::spvllvm_; //OpenCL void* dispatch::clBuildProgram_; @@ -421,5 +437,9 @@ void* dispatch::cudnnPoolingForward_; void* dispatch::cudnnSetStream_; void* dispatch::cudnnTransformTensor_; +// SPIR-V +void* dispatch::initializeLLVMToSPIRVPass_; +void* dispatch::writeSpirv_; + } } diff --git a/lib/jit.cpp b/lib/jit.cpp index 460df4275..059f96a00 100644 --- a/lib/jit.cpp +++ b/lib/jit.cpp @@ -71,12 +71,11 @@ std::unique_ptr jit::make_llvm_module(ir::module &module, passes_w llvm::Module* result = new llvm::Module(module.get_name(), llvm_context_); passes.selection.run(module, *result); // launch information - launch_information info; + launch_information& info = launch_info_map_[result->getName()]; info.global_range_size.clear(); for(unsigned i = 0; i < passes.tune.get_num_global_range(); i++) info.global_range_size.push_back(passes.tune.get_global_range_size(i)); info.num_threads = passes.tune.get_num_threads(); - launch_info_map_.insert({result->getName(), info}); return std::unique_ptr(result); } From 0d694445e62c8dc03a373f26358cd6193833e88a Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 3 May 2019 14:30:06 -0400 Subject: [PATCH 134/494] [examples] added skeleton for pytorch wrapper --- CMakeLists.txt | 2 +- cmake/FindTensorFlow.cmake | 21 +++++ cmake/FindTorch.cmake | 101 ++++++++++++++++++++++ examples/python/CMakeLists.txt | 1 + examples/python/pytorch/CMakeLists.txt | 6 ++ examples/python/pytorch/conv.cpp | 30 +++++++ examples/python/pytorch/main.py | 11 +++ examples/python/tensorflow/CMakeLists.txt | 24 +++-- 8 files changed, 181 insertions(+), 15 deletions(-) create mode 100644 cmake/FindTensorFlow.cmake create mode 100644 cmake/FindTorch.cmake create mode 100644 examples/python/pytorch/CMakeLists.txt create mode 100644 examples/python/pytorch/conv.cpp create mode 100644 examples/python/pytorch/main.py diff --git a/CMakeLists.txt b/CMakeLists.txt index d9e1ac845..d2e486afb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,7 @@ cmake_minimum_required(VERSION 2.8) project(triton) include(CTest) +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") # FLEX/YACC find_package(BISON) @@ -16,7 +17,6 @@ include_directories(${BISON_Parser_INCLUDE_DIRECTORIES}) # LLVM find_package(LLVM REQUIRED CONFIG) -message(STATUS ${LLVM_INCLUDE_DIRS}) include_directories(${LLVM_INCLUDE_DIRS}) add_definitions(${LLVM_DEFINITIONS}) #llvm_map_components_to_libnames(llvm_libs all) diff --git a/cmake/FindTensorFlow.cmake b/cmake/FindTensorFlow.cmake new file mode 100644 index 000000000..dcbb43924 --- /dev/null +++ b/cmake/FindTensorFlow.cmake @@ -0,0 +1,21 @@ +include(FindPackageHandleStandardArgs) +unset(TENSORFLOW_FOUND) + +execute_process(COMMAND python -c "from os.path import dirname; import tensorflow as tf; print(dirname(dirname(tf.sysconfig.get_include())))" + OUTPUT_VARIABLE TF_INC 
OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET) +execute_process(COMMAND python -c "import tensorflow as tf; print(tf.sysconfig.get_lib())" + OUTPUT_VARIABLE TF_LIB OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET) +execute_process(COMMAND python -c "import tensorflow as tf; print(tf.__cxx11_abi_flag__ if \"__cxx11_abi_flag__\" in tf.__dict__ else 0)" + OUTPUT_VARIABLE TF_ABI OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET) + +find_package_handle_standard_args(TensorFlow DEFAULT_MSG TF_INC TF_LIB) + +# set external variables for usage in CMakeLists.txt +if(TensorFlow_FOUND) + set(TensorFlow_LIBRARIES ${TF_LIB}) + set(TensorFlow_INCLUDE_DIRS ${TF_INC}) + set(TensorFlow_ABI ${TF_ABI}) +endif() + +# hide locals from GUI +mark_as_advanced(TF_INC TF_LIB TF_ABI) diff --git a/cmake/FindTorch.cmake b/cmake/FindTorch.cmake new file mode 100644 index 000000000..906f021f3 --- /dev/null +++ b/cmake/FindTorch.cmake @@ -0,0 +1,101 @@ +# FindTorch +# ------- +# +# Finds the Torch library +# +# This will define the following variables: +# +# TORCH_FOUND -- True if the system has the Torch library +# TORCH_INCLUDE_DIRS -- The include directories for torch +# TORCH_LIBRARIES -- Libraries to link against +# TORCH_CXX_FLAGS -- Additional (required) compiler flags +# +# and the following imported targets: +# +# torch + +include(FindPackageHandleStandardArgs) + +if (DEFINED ENV{TORCH_INSTALL_PREFIX}) + set(TORCH_INSTALL_PREFIX $ENV{TORCH_INSTALL_PREFIX}) +else() + # Assume we are in /share/cmake/Torch/TorchConfig.cmake + get_filename_component(CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) + get_filename_component(TORCH_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/../../../" ABSOLUTE) +endif() + +# Include directories. +if (EXISTS "${TORCH_INSTALL_PREFIX}/include") + set(TORCH_INCLUDE_DIRS + ${TORCH_INSTALL_PREFIX}/include + ${TORCH_INSTALL_PREFIX}/include/torch/csrc/api/include) +else() + set(TORCH_INCLUDE_DIRS + ${TORCH_INSTALL_PREFIX}/include + ${TORCH_INSTALL_PREFIX}/include/torch/csrc/api/include) +endif() + +# Library dependencies. 
+if (@BUILD_SHARED_LIBS@) + find_package(Caffe2 REQUIRED PATHS ${CMAKE_CURRENT_LIST_DIR}/../Caffe2) +endif() + +if (NOT ANDROID) + find_library(TORCH_LIBRARY torch PATHS "${TORCH_INSTALL_PREFIX}/lib") +else() + find_library(TORCH_LIBRARY NO_CMAKE_FIND_ROOT_PATH torch PATHS "${TORCH_INSTALL_PREFIX}/lib") +endif() +add_library(torch UNKNOWN IMPORTED) +set(TORCH_LIBRARIES torch ${Caffe2_MAIN_LIBS}) + +if (NOT ANDROID) + find_library(C10_LIBRARY c10 PATHS "${TORCH_INSTALL_PREFIX}/lib") +else() + find_library(C10_LIBRARY c10 NO_CMAKE_FIND_ROOT_PATH PATHS "${TORCH_INSTALL_PREFIX}/lib") +endif() +list(APPEND TORCH_LIBRARIES ${C10_LIBRARY}) + +if (@USE_CUDA@) + if(MSVC) + set(NVTOOLEXT_HOME "C:/Program Files/NVIDIA Corporation/NvToolsExt") + if ($ENV{NVTOOLEXT_HOME}) + set(NVTOOLEXT_HOME $ENV{NVTOOLEXT_HOME}) + endif() + set(TORCH_CUDA_LIBRARIES + ${NVTOOLEXT_HOME}/lib/x64/nvToolsExt64_1.lib + ${CUDA_LIBRARIES}) + list(APPEND TORCH_INCLUDE_DIRS ${NVTOOLEXT_HOME}/include) + elseif(APPLE) + set(TORCH_CUDA_LIBRARIES + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcudart.dylib + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvrtc.dylib + ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvToolsExt.dylib + ${CUDA_LIBRARIES}) + else() + find_library(LIBNVTOOLSEXT libnvToolsExt.so PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64/) + set(TORCH_CUDA_LIBRARIES + ${CUDA_CUDA_LIB} + ${CUDA_NVRTC_LIB} + ${LIBNVTOOLSEXT} + ${CUDA_LIBRARIES}) + endif() + find_library(C10_CUDA_LIBRARY c10_cuda PATHS "${TORCH_INSTALL_PREFIX}/lib") + list(APPEND TORCH_CUDA_LIBRARIES ${C10_CUDA_LIBRARY}) + list(APPEND TORCH_LIBRARIES ${TORCH_CUDA_LIBRARIES}) +endif() + +# When we build libtorch with the old GCC ABI, dependent libraries must too. +if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + set(TORCH_CXX_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=@GLIBCXX_USE_CXX11_ABI@") +endif() + +set_target_properties(torch PROPERTIES + IMPORTED_LOCATION "${TORCH_LIBRARY}" + INTERFACE_INCLUDE_DIRECTORIES "${TORCH_INCLUDE_DIRS}" + CXX_STANDARD 11 +) +if (TORCH_CXX_FLAGS) + set_property(TARGET torch PROPERTY INTERFACE_COMPILE_OPTIONS "${TORCH_CXX_FLAGS}") +endif() + +find_package_handle_standard_args(torch DEFAULT_MSG TORCH_LIBRARY TORCH_INCLUDE_DIRS) diff --git a/examples/python/CMakeLists.txt b/examples/python/CMakeLists.txt index 82844b5a0..a73011f48 100644 --- a/examples/python/CMakeLists.txt +++ b/examples/python/CMakeLists.txt @@ -1 +1,2 @@ add_subdirectory(tensorflow) +add_subdirectory(pytorch) diff --git a/examples/python/pytorch/CMakeLists.txt b/examples/python/pytorch/CMakeLists.txt new file mode 100644 index 000000000..b400e1ef4 --- /dev/null +++ b/examples/python/pytorch/CMakeLists.txt @@ -0,0 +1,6 @@ +find_package(Torch) +if(${Torch_FOUND}) + add_library(torch_triton SHARED conv.cpp) + target_compile_features(torch_triton PRIVATE cxx_range_for) + target_link_libraries(torch_triton "${TORCH_LIBRARIES}") +endif() diff --git a/examples/python/pytorch/conv.cpp b/examples/python/pytorch/conv.cpp new file mode 100644 index 000000000..7230ed62e --- /dev/null +++ b/examples/python/pytorch/conv.cpp @@ -0,0 +1,30 @@ +#include +#include + +#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + +at::Tensor conv_forward( + const at::Tensor data, + const at::Tensor weight) { + // Check + CHECK_INPUT(data); + CHECK_INPUT(weight); + // Unpack data shapes + const auto B = data.size(0); + const auto Ci = data.size(1); + const auto H = data.size(2); + 
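/* The shapes unpacked here assume NCHW activations and (C, R, S, K) weights.
   The later revision of this file computes the output extent as
   P = (H*up_h - R + 1 + 2*pad_h + stride_h - 1)/stride_h; a sketch of that
   formula (out_dim is an illustrative helper, not in the patch):

     auto out_dim = [](int64_t in, int64_t filt, int64_t pad, int64_t stride) {
       return (in - filt + 1 + 2 * pad + stride - 1) / stride;  // ceiling form
     };
     // With pad = 0, stride = 1: out_dim(H, R, 0, 1) == H - R + 1, so the
     // skeleton's {B, K, H, W} output matches only when padding keeps H, W.
*/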
const auto W = data.size(3); + // Unpack weight shapes + const auto Cf = weight.size(0); + const auto R = weight.size(1); + const auto S = weight.size(2); + const auto K = weight.size(3); + // Create output + AT_CHECK(Ci == Cf, "Number of channels in data and weights must match"); + return at::empty({B, K, H, W}, at::kFloat); +} + +static auto registry = + torch::jit::RegisterOperators("triton::conv::forward", &conv_forward); diff --git a/examples/python/pytorch/main.py b/examples/python/pytorch/main.py new file mode 100644 index 000000000..b9984438b --- /dev/null +++ b/examples/python/pytorch/main.py @@ -0,0 +1,11 @@ +import math +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Variable +from torch.utils.cpp_extension import load +from torch.distributions import categorical +from itertools import product + +conv_triton = load( 'conv_triton', ['conv.cpp', 'conv.cu'], extra_cflags=['-O3']) diff --git a/examples/python/tensorflow/CMakeLists.txt b/examples/python/tensorflow/CMakeLists.txt index 5b3b04df6..008397c1b 100644 --- a/examples/python/tensorflow/CMakeLists.txt +++ b/examples/python/tensorflow/CMakeLists.txt @@ -1,14 +1,10 @@ -execute_process(COMMAND python -c "from os.path import dirname; import tensorflow as tf; print(dirname(dirname(tf.sysconfig.get_include())))" - OUTPUT_VARIABLE TF_INC OUTPUT_STRIP_TRAILING_WHITESPACE) -execute_process(COMMAND python -c "import tensorflow as tf; print(tf.sysconfig.get_lib())" - OUTPUT_VARIABLE TF_LIB OUTPUT_STRIP_TRAILING_WHITESPACE) -execute_process(COMMAND python -c "import tensorflow as tf; print(tf.__cxx11_abi_flag__ if \"__cxx11_abi_flag__\" in tf.__dict__ else 0)" - OUTPUT_VARIABLE TF_ABI OUTPUT_STRIP_TRAILING_WHITESPACE) - -set(CUDA_HOME "/usr/local/cuda") -include_directories("${TF_INC}/tensorflow/include") -include_directories("${CUDA_HOME}/include") -link_directories(${TF_LIB}) -add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) -add_library(tf_blocksparse SHARED blocksparse.cpp) -target_link_libraries(tf_blocksparse tensorflow_framework triton) +find_package(TensorFlow) +if(${TensorFlow_FOUND}) + set(CUDA_HOME "/usr/local/cuda") + include_directories("${TF_INC}/tensorflow/include") + include_directories("${CUDA_HOME}/include") + link_directories(${TF_LIB}) + add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) + add_library(tf_blocksparse SHARED blocksparse.cpp) + target_link_libraries(tf_blocksparse tensorflow_framework triton) +endif() From 30833c18f11d49e04d27e74e16a463e4a64878de Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 4 May 2019 01:32:34 -0400 Subject: [PATCH 135/494] [codegen/tune] bugfix in heuristics for nano-tile sizes --- cmake/FindTensorFlow.cmake | 1 - cmake/FindTorch.cmake | 104 ++-------------------- examples/cpp/conv.cpp | 2 +- examples/python/pytorch/CMakeLists.txt | 10 ++- examples/python/pytorch/conv.cpp | 115 +++++++++++++++++++++++-- examples/python/pytorch/main.py | 14 ++- lib/codegen/tune.cpp | 18 ++-- 7 files changed, 143 insertions(+), 121 deletions(-) diff --git a/cmake/FindTensorFlow.cmake b/cmake/FindTensorFlow.cmake index dcbb43924..405febbeb 100644 --- a/cmake/FindTensorFlow.cmake +++ b/cmake/FindTensorFlow.cmake @@ -17,5 +17,4 @@ if(TensorFlow_FOUND) set(TensorFlow_ABI ${TF_ABI}) endif() -# hide locals from GUI mark_as_advanced(TF_INC TF_LIB TF_ABI) diff --git a/cmake/FindTorch.cmake b/cmake/FindTorch.cmake index 906f021f3..56b1e7c16 100644 --- a/cmake/FindTorch.cmake +++ b/cmake/FindTorch.cmake @@ -1,101 +1,11 @@ -# FindTorch -# ------- 
-# -# Finds the Torch library -# -# This will define the following variables: -# -# TORCH_FOUND -- True if the system has the Torch library -# TORCH_INCLUDE_DIRS -- The include directories for torch -# TORCH_LIBRARIES -- Libraries to link against -# TORCH_CXX_FLAGS -- Additional (required) compiler flags -# -# and the following imported targets: -# -# torch - include(FindPackageHandleStandardArgs) +execute_process(COMMAND python -c "import torch; import os; print(os.path.dirname(torch.__file__))" + OUTPUT_VARIABLE TORCH_INSTALL_PREFIX OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET) -if (DEFINED ENV{TORCH_INSTALL_PREFIX}) - set(TORCH_INSTALL_PREFIX $ENV{TORCH_INSTALL_PREFIX}) -else() - # Assume we are in /share/cmake/Torch/TorchConfig.cmake - get_filename_component(CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) - get_filename_component(TORCH_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/../../../" ABSOLUTE) +find_package_handle_standard_args(TORCH DEFAULT_MSG TORCH_INSTALL_PREFIX) +if(TORCH_INSTALL_PREFIX) + set(TORCH_INCLUDE_DIRS ${TORCH_INSTALL_PREFIX}/lib/include/ ${TORCH_INSTALL_PREFIX}/lib/include/torch/csrc/api/include) + set(TORCH_LIBRARY_DIRS ${TORCH_INSTALL_PREFIX}/lib/) endif() -# Include directories. -if (EXISTS "${TORCH_INSTALL_PREFIX}/include") - set(TORCH_INCLUDE_DIRS - ${TORCH_INSTALL_PREFIX}/include - ${TORCH_INSTALL_PREFIX}/include/torch/csrc/api/include) -else() - set(TORCH_INCLUDE_DIRS - ${TORCH_INSTALL_PREFIX}/include - ${TORCH_INSTALL_PREFIX}/include/torch/csrc/api/include) -endif() - -# Library dependencies. -if (@BUILD_SHARED_LIBS@) - find_package(Caffe2 REQUIRED PATHS ${CMAKE_CURRENT_LIST_DIR}/../Caffe2) -endif() - -if (NOT ANDROID) - find_library(TORCH_LIBRARY torch PATHS "${TORCH_INSTALL_PREFIX}/lib") -else() - find_library(TORCH_LIBRARY NO_CMAKE_FIND_ROOT_PATH torch PATHS "${TORCH_INSTALL_PREFIX}/lib") -endif() -add_library(torch UNKNOWN IMPORTED) -set(TORCH_LIBRARIES torch ${Caffe2_MAIN_LIBS}) - -if (NOT ANDROID) - find_library(C10_LIBRARY c10 PATHS "${TORCH_INSTALL_PREFIX}/lib") -else() - find_library(C10_LIBRARY c10 NO_CMAKE_FIND_ROOT_PATH PATHS "${TORCH_INSTALL_PREFIX}/lib") -endif() -list(APPEND TORCH_LIBRARIES ${C10_LIBRARY}) - -if (@USE_CUDA@) - if(MSVC) - set(NVTOOLEXT_HOME "C:/Program Files/NVIDIA Corporation/NvToolsExt") - if ($ENV{NVTOOLEXT_HOME}) - set(NVTOOLEXT_HOME $ENV{NVTOOLEXT_HOME}) - endif() - set(TORCH_CUDA_LIBRARIES - ${NVTOOLEXT_HOME}/lib/x64/nvToolsExt64_1.lib - ${CUDA_LIBRARIES}) - list(APPEND TORCH_INCLUDE_DIRS ${NVTOOLEXT_HOME}/include) - elseif(APPLE) - set(TORCH_CUDA_LIBRARIES - ${CUDA_TOOLKIT_ROOT_DIR}/lib/libcudart.dylib - ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvrtc.dylib - ${CUDA_TOOLKIT_ROOT_DIR}/lib/libnvToolsExt.dylib - ${CUDA_LIBRARIES}) - else() - find_library(LIBNVTOOLSEXT libnvToolsExt.so PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64/) - set(TORCH_CUDA_LIBRARIES - ${CUDA_CUDA_LIB} - ${CUDA_NVRTC_LIB} - ${LIBNVTOOLSEXT} - ${CUDA_LIBRARIES}) - endif() - find_library(C10_CUDA_LIBRARY c10_cuda PATHS "${TORCH_INSTALL_PREFIX}/lib") - list(APPEND TORCH_CUDA_LIBRARIES ${C10_CUDA_LIBRARY}) - list(APPEND TORCH_LIBRARIES ${TORCH_CUDA_LIBRARIES}) -endif() - -# When we build libtorch with the old GCC ABI, dependent libraries must too. 
-if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") - set(TORCH_CXX_FLAGS "-D_GLIBCXX_USE_CXX11_ABI=@GLIBCXX_USE_CXX11_ABI@") -endif() - -set_target_properties(torch PROPERTIES - IMPORTED_LOCATION "${TORCH_LIBRARY}" - INTERFACE_INCLUDE_DIRECTORIES "${TORCH_INCLUDE_DIRS}" - CXX_STANDARD 11 -) -if (TORCH_CXX_FLAGS) - set_property(TARGET torch PROPERTY INTERFACE_COMPILE_OPTIONS "${TORCH_CXX_FLAGS}") -endif() - -find_package_handle_standard_args(torch DEFAULT_MSG TORCH_LIBRARY TORCH_INCLUDE_DIRS) +mark_as_advanced(TORCH_INCLUDE_DIRS TORCH_LIBRARY_DIRS) diff --git a/examples/cpp/conv.cpp b/examples/cpp/conv.cpp index 150fafb91..f8bec004e 100644 --- a/examples/cpp/conv.cpp +++ b/examples/cpp/conv.cpp @@ -217,7 +217,7 @@ int main() { 16, 2, 64, 32, 2, 64, 16, 8, 2, 2, - 8, 8, + 8, 1, 8, 4 }; // jit.autotune("conv", src, benchmark); diff --git a/examples/python/pytorch/CMakeLists.txt b/examples/python/pytorch/CMakeLists.txt index b400e1ef4..22e52c65d 100644 --- a/examples/python/pytorch/CMakeLists.txt +++ b/examples/python/pytorch/CMakeLists.txt @@ -1,6 +1,10 @@ find_package(Torch) -if(${Torch_FOUND}) +if(${TORCH_FOUND}) + set(CUDA_HOME "/usr/local/cuda") + include_directories(${TORCH_INCLUDE_DIRS}) + include_directories("${CUDA_HOME}/include") + link_directories(${TORCH_LIBRARY_DIRS}) + add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) add_library(torch_triton SHARED conv.cpp) - target_compile_features(torch_triton PRIVATE cxx_range_for) - target_link_libraries(torch_triton "${TORCH_LIBRARIES}") + target_link_libraries(torch_triton torch triton) endif() diff --git a/examples/python/pytorch/conv.cpp b/examples/python/pytorch/conv.cpp index 7230ed62e..d3d2bb212 100644 --- a/examples/python/pytorch/conv.cpp +++ b/examples/python/pytorch/conv.cpp @@ -1,13 +1,96 @@ #include +#include +#include "ATen/cuda/CUDAContext.h" #include +#include "triton/jit.h" +#include "triton/driver/stream.h" #define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) -at::Tensor conv_forward( - const at::Tensor data, - const at::Tensor weight) { +const char* src = +R"( +const tunable int32 TM = {16, 32, 64}; +const tunable int32 TN = {16, 32, 64}; +const tunable int32 TK = {8}; + +__constant__ int32* delta = alloc_const int32[18]; +__constant__ int32* masks = alloc_const int32[1024]; + +void conv(read_only restrict fp32 *a, + read_only restrict fp32 *b, + fp32 *c, + int32 M, int32 N, int32 K, + int32 AN, int32 AH, int32 AW, + int32 CN, int32 CK, int32 CP, int32 CQ, + int32 AC, int32 AR, int32 AS, + int32 lda_n, int32 lda_c, int32 lda_h, int32 lda_w, + int32 ldc_n, int32 ldc_k, int32 ldc_p, int32 ldc_q, + int32 pad_h, int32 pad_w, + int32 bound){ + int32 rxa[TM] = get_global_range[TM](0); + int32 rb0[TN] = get_global_range[TN](1); + int32 rka[TK] = 0 ... TK; + int32 rb1[TK] = 0 ... 
TK; + fp32 C[TM, TN] = 0; + int32 ranh[TM] = rxa / CQ; + int32 raw[TM] = rxa % CQ - pad_w; + int32 ran[TM] = ranh / CP; + int32 rah[TM] = ranh % CP - pad_h; + int32 ra0[TM] = ran*lda_n + rah*lda_h + raw*lda_w; + int32 racr[TK] = rka / AS; + int32 ras[TK] = rka % AS; + int32 rac[TK] = racr / AR; + int32 rar[TK] = racr % AR; + int32 ra1[TK] = rac*lda_c + rar*lda_h + ras*lda_w; + fp32* pa[TM, TK] = a + ra1[newaxis, :] + ra0[:, newaxis]; + fp32* pb[TN, TK] = b + rb1[newaxis, :]*CK + rb0[:, newaxis]; + __constant__ int32* pincd[TK] = delta + rka; + __constant__ int32* pd[TK] = delta + AR*AS + rka; + int32 d[TK] = *pd; + int32 incd[TK] = *pincd; + int32 maskh[TM] = pad_h + min(rah, 0) + max(rah + AR - AH, 0); + int32 maskw[TM] = pad_w + min(raw, 0) + max(raw + AS - AW, 0); + __constant__ int32* pm[TM] = masks + AR*AS + maskw*AR*AS + maskh*AR*AS*(2*pad_w + 1); + __constant__ int32* pincm[TM] = delta; + int32 incm[TM] = *pincm; + int32 checka0[TM] = *pm; + int32 checka1[TK] = 1 << rka; + int1 checka[TM, TK] = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; + fp32 a[TM, TK] = checka ? *pa : 0; + fp32 b[TN, TK] = *pb; + for(int32 k = K; k > 0; k = k - TK){ + C = dot(a, trans(b), C); + pb = pb + TK*CK; + pa = pa + d[newaxis, :]; + b = *pb; + pd = pd + incd; + pincd = pincd + incd; + d = *pd; + incd = *pincd; + pm = pm + incm; + pincm = pincm + incm; + incm = *pincm; + checka0 = *pm; + checka = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; + a = checka ? *pa : 0; + } + int32 rxc[TM] = get_global_range[TM](0); + int32 rc1[TN] = get_global_range[TN](1); + int32 rcn[TM] = rxc / (CP*CQ); + int32 rcpq[TM] = rxc % (CP*CQ); + int32 rc0[TM] = rcn * ldc_n + rcpq; + fp32* pc[TM, TN] = c + rc1[newaxis, :]*ldc_k + rc0[:, newaxis]; + int1 checkc0[TM] = rxc < M; + int1 checkc1[TN] = rc1 < N; + int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; + @checkc *pc = C; +})"; + +torch::Tensor conv_forward( + const torch::Tensor data, + const torch::Tensor weight) { // Check CHECK_INPUT(data); CHECK_INPUT(weight); @@ -21,10 +104,30 @@ at::Tensor conv_forward( const auto R = weight.size(1); const auto S = weight.size(2); const auto K = weight.size(3); - // Create output + // Allocate output AT_CHECK(Ci == Cf, "Number of channels in data and weights must match"); - return at::empty({B, K, H, W}, at::kFloat); + torch::Tensor output = torch::empty({B, K, H, W}, torch::kFloat); + // Wrap CUDA handles + triton::driver::cu_stream sstream(at::cuda::getCurrentCUDAStream(), false); + triton::driver::stream* stream = &sstream; + triton::driver::context* ctx = stream->context(); + triton::driver::cu_buffer d(ctx, (CUdeviceptr)data.storage().data(), false); + triton::driver::cu_buffer w(ctx, (CUdeviceptr)weight.storage().data(), false); + // Create JIT + triton::jit jit(ctx); + std::vector params = { + 16, 2, 64, + 32, 2, 64, + 16, 8, 2, 2, + 8, 8, + 4 + }; + jit.add_module("conv", src, params); + triton::driver::kernel* kernel = jit.get_function("conv"); + triton::jit::launch_information info = jit.get_launch_info("conv"); + + return output; } static auto registry = - torch::jit::RegisterOperators("triton::conv::forward", &conv_forward); + torch::jit::RegisterOperators("triton::conv_forward", &conv_forward); diff --git a/examples/python/pytorch/main.py b/examples/python/pytorch/main.py index b9984438b..d4b11e316 100644 --- a/examples/python/pytorch/main.py +++ b/examples/python/pytorch/main.py @@ -1,11 +1,9 @@ -import math -import numpy as np import torch -import torch.nn as nn -import torch.nn.functional as F from 
torch.autograd import Variable -from torch.utils.cpp_extension import load -from torch.distributions import categorical -from itertools import product -conv_triton = load( 'conv_triton', ['conv.cpp', 'conv.cu'], extra_cflags=['-O3']) +torch.ops.load_library("/home/philippe/Development/triton/build/examples/python/pytorch/libtorch_triton.so") + +d = torch.empty(64, 64, 64, 64).uniform_(0, 1).cuda() +w = torch.empty(64, 3, 3, 64).uniform_(0, 1).cuda() +a = torch.ops.triton.conv_forward(d, w) +print(a) diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 1a1562c8f..9b71aea4f 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -171,11 +171,19 @@ void tune::run(ir::module &mod) { // Simplify metaparameters for(ir::function *fn: mod.get_function_list()) for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *i : block->get_inst_list()) - if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ - ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 2, 2)); - *params_.at(i).at("nts.d0") = *tmp; + for(ir::instruction *i : block->get_inst_list()){ + if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ + ir::type *ty = mod.get_builder().get_int32_ty(); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 2, 2)); + *params_.at(i).at("nts.d0") = *tmp; + } + if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ + ir::type *ty = mod.get_builder().get_int32_ty(); + std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 2, 2)); + std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 2, 2)); + *params_.at(i).at("nts.d0") = *tmp1; + *params_.at(i).at("nts.d1") = *tmp2; + } } } From 4813bb007c1f9a550c1fa1a95c5a340ad25c9f6d Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 4 May 2019 12:09:27 -0400 Subject: [PATCH 136/494] [codegen] bugfix in builder insert point for predicated instructions --- examples/cpp/dot.cpp | 17 +---------------- include/triton/jit.h | 1 + lib/codegen/selection.cpp | 8 ++++++-- 3 files changed, 8 insertions(+), 18 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 3be7d8880..bf44b7cb5 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -53,26 +53,11 @@ void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C, fp32 b[TN, 1] = checkb ? 
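/* Patch 136 strips the lock/counter epilogue from this example and keeps
   only the masked store, while selection.cpp (below) restores the builder's
   insert point after lowering predicated instructions. Element-wise model
   of `@checkc *pc = c;` in plain C++, with tiles spelled out as arrays:

     for (int i = 0; i < TM; i++)
       for (int j = 0; j < TN; j++)
         if (checkc[i][j])        // rxc[i] < M && ryc[j] < N
           *pc[i][j] = c[i][j];   // out-of-range lanes write nothing at all
*/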
*pb : 0; c = dot(a, trans(b), c); } - int32 ridx = get_range_id(0); - int32 ridy = get_range_id(1); fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - int32 *plock = locks + ridx + ridy*grid0; - while(__atomic_cas(plock, 0, 1)); - int32 *pcount = plock + grid0*grid1; - int32 count = *pcount; - int32 countp1 = select(count == GZ - 1, 0, count + 1); int1 checkc0[TM] = rxc < M; int1 checkc1[TN] = ryc < N; int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - if(count == 0) { - @checkc *pc = c; - *pcount = countp1; - } - else { - @checkc *pc = c + *pc; - *pcount = countp1; - } - __atomic_cas(plock, 1, 0); + @checkc *pc = c; } )"; diff --git a/include/triton/jit.h b/include/triton/jit.h index a3e554c67..b001148e5 100644 --- a/include/triton/jit.h +++ b/include/triton/jit.h @@ -58,6 +58,7 @@ public: target_(target) { } void target_independent(ir::module &module) { +// ir::print(module, std::cout); optimize_dot.run(module); optimize_trans.run(module); } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 7750502bd..c59ca2f12 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -632,11 +632,13 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & LLVMContext &ctx = builder.getContext(); Function *fn = block->getParent(); ir::value *mask = ins->get_mask_pred(); + BasicBlock *last_block = nullptr; auto set_mask_insert_pt = [&](indices_t idx){ if(mask){ distributed_tile *mask_tile = (distributed_tile*)tmap_.at(ins->get_mask_pred()); BasicBlock *block = pmap_.at({mask_tile, idx}); builder.SetInsertPoint(block->getTerminator()); + last_block = last_block_.at({mask_tile, idx}); } }; // store @@ -646,7 +648,6 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & ptr->for_each([&](indices_t idx){ set_mask_insert_pt(idx); StoreInst *store = new StoreInst(value->get_value(idx), ptr->get_value(idx)); -// store->setAlignment(16); builder.Insert(store); }); } @@ -847,8 +848,11 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & }); } } - if(mask) + if(mask){ builder.SetInsertPoint(block); + if(last_block) + builder.SetInsertPoint(last_block); + } } void selection::lower_instruction(ir::instruction *src, IRBuilder<> &builder) { From f80441017c5d6c72fd8327ce2137c3f41a8891d3 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 4 May 2019 20:15:34 -0400 Subject: [PATCH 137/494] [codegen] added leading dimension padding for transposition in shared memory --- examples/cpp/common.hpp | 11 ++ examples/cpp/dot.cpp | 155 +++++++++++------- examples/python/pytorch/conv.cpp | 184 ++++++++++++++++++++-- include/triton/codegen/shmem_allocation.h | 1 + include/triton/jit.h | 1 - lib/codegen/selection.cpp | 26 +-- lib/codegen/shmem_allocation.cpp | 19 ++- 7 files changed, 314 insertions(+), 83 deletions(-) diff --git a/examples/cpp/common.hpp b/examples/cpp/common.hpp index 8a16b9457..87525eb68 100644 --- a/examples/cpp/common.hpp +++ b/examples/cpp/common.hpp @@ -14,6 +14,17 @@ void simple_gemm(std::vector &c, const std::vector &a, const std::vector +void simple_gemm(bool AT, bool BT, std::vector &c, const std::vector &a, const std::vector &b, size_t M, size_t N, size_t K) { + if(AT && BT) + simple_gemm(c, a, b, M, N, K); + else if(AT && !BT) + simple_gemm(c, a, b, M, N, K); + else if(!AT && BT) + simple_gemm(c, a, b, M, N, K); + else + simple_gemm(c, a, b, M, N, K); +} class timer{ typedef std::chrono::high_resolution_clock high_resolution_clock; diff --git 
a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index bf44b7cb5..980f83b31 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -5,63 +5,104 @@ #include "triton/driver/backend.h" #include "triton/driver/stream.h" -const char* src = -R"( -const tunable int32 TM = {16, 32, 64, 128}; -const tunable int32 TN = {16, 32, 64, 128}; -const tunable int32 TK = {8}; -const tunable int32 GZ = {1}; -void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C, - int32 M, int32 N, int32 K, - int32 lda, int32 ldb, int32 ldc, - int32 *locks, int32 grid0, int32 grid1) { - int32 rxa[TM] = get_global_range[TM](0); - int32 ryb[TN] = get_global_range[TN](1); - int32 rz = get_global_range[1](2); - int32 rka[TK] = 0 ... TK; - int32 rkb[TK] = 0 ... TK; - fp32 c[TM, TN] = 0; - int32 div = K / GZ; - int32 rem = K % GZ; - K = select(rz < rem, div - 1, div); - int32 offk = select(rz < rem, rz*(div + 1), rz*div + rem); - fp32* pa[TM, TK] = A + (offk + rka[newaxis, :])*lda + rxa[:, newaxis]; - fp32* pb[TN, TK] = B + (offk + rkb[newaxis, :])*ldb + ryb[:, newaxis]; - fp32 a[TM, TK] = *pa; - fp32 b[TN, TK] = *pb; - int32 last_a = ((M*K - 1) - (TM*TK + 1)) / lda; - int32 last_b = ((K*N - 1) - (TN*TK + 1)) / ldb; - last_a = last_a / TK * TK; - last_b = last_b / TK * TK; - int32 bound = K - max(last_a, last_b); - for(int32 k = K; k > bound; k = k - TK){ - c = dot(a, trans(b), c); - pa = pa + TK*lda; - pb = pb + TK*ldb; - a = *pa; - b = *pb; +std::string triton_source(bool AT, bool BT) { + std::string AS0 = "TM", AS1 = "TK"; + std::string BS0 = "TK", BS1 = "TN"; + std::string bca0 = "[newaxis, :]", bca1 = "[:, newaxis]"; + std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; + std::string lda0 = "*lda", lda1 = ""; + std::string ldb0 = "", ldb1 = "*ldb"; + std::string usea = AT ? "trans(a)" : "a"; + std::string useb = BT ? "trans(b)" : "b"; + if(AT){ + std::swap(AS0, AS1); + std::swap(bca0, bca1); + std::swap(lda0, lda1); } - int32 rxc[TM] = get_global_range[TM](0); - int32 ryc[TN] = get_global_range[TN](1); - for(int32 k = bound; k > 0; k = k - 1){ - int1 checka[TM, 1] = rxc[:, newaxis] < M; - int1 checkb[TN, 1] = ryc[:, newaxis] < N; - fp32* pa[TM, 1] = A + (offk + K - k)*lda + rxc[:, newaxis]; - fp32* pb[TN, 1] = B + (offk + K - k)*ldb + ryc[:, newaxis]; - fp32 a[TM, 1] = checka ? *pa : 0; - fp32 b[TN, 1] = checkb ? *pb : 0; - c = dot(a, trans(b), c); + if(BT){ + std::swap(BS0, BS1); + std::swap(bcb0, bcb1); + std::swap(ldb0, ldb1); } - fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - int1 checkc0[TM] = rxc < M; - int1 checkc1[TN] = ryc < N; - int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - @checkc *pc = c; + std::string res = + R"( + const tunable int32 TM = {16, 32, 64, 128}; + const tunable int32 TN = {16, 32, 64, 128}; + const tunable int32 TK = {8}; + const tunable int32 GZ = {1}; + + void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C, + int32 M, int32 N, int32 K, + int32 lda, int32 ldb, int32 ldc, + int32 *locks, int32 grid0, int32 grid1) { + int32 rxa[TM] = get_global_range[TM](0); + int32 ryb[TN] = get_global_range[TN](1); + int32 rz = get_global_range[1](2); + int32 rka[TK] = 0 ... TK; + int32 rkb[TK] = 0 ... 
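/* Per the commit subject, this patch also pads the leading dimension of
   shared-memory tiles that feed trans(...). The usual motivation, stated
   here as an assumption rather than read from the diff, is bank-conflict
   avoidance: one extra column makes a transposed walk hit distinct banks.
   CUDA-style sketch (TM, TK, tx, ty, v_in assumed in scope):

     __shared__ float tile[TM][TK + 1];  // +1 column of padding
     tile[ty][tx] = v_in;                // coalesced store from registers
     float v_out = tile[tx][ty];         // transposed read, now conflict-free
*/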
TK; + fp32 c[TM, TN] = 0; + int32 div = K / GZ; + int32 rem = K % GZ; + K = select(rz < rem, div - 1, div); + int32 offk = select(rz < rem, rz*(div + 1), rz*div + rem); + fp32* pa[)" + AS0 + ", " + AS1 + "] = A + (offk + rka" + bca0 + ")" + lda0 + " + rxa" + bca1 + lda1 + R"(; + fp32* pb[)" + BS0 + ", " + BS1 + "] = B + (offk + rkb" + bcb0 + ")" + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; + fp32 a[)" + AS0 + ", " + AS1 + R"(] = *pa; + fp32 b[)" + BS0 + ", " + BS1 + R"(] = *pb; + int32 last_a = ((M*K - 1) - (TM*TK + 1)) / lda; + int32 last_b = ((K*N - 1) - (TN*TK + 1)) / ldb; + last_a = last_a / TK * TK; + last_b = last_b / TK * TK; + int32 bound = K - max(last_a, last_b); + for(int32 k = K; k > bound; k = k - TK){ + c = dot()" + usea + ", " + useb + R"(, c); + pa = pa + TK)" + lda0 + R"(; + pb = pb + TK)" + ldb0 + R"(; + a = *pa; + b = *pb; + } + int32 rxc[TM] = get_global_range[TM](0); + int32 ryc[TN] = get_global_range[TN](1); + for(int32 k = bound; k > 0; k = k - 1){ + int1 checka[TM, 1] = rxc[:, newaxis] < M; + int1 checkb[TN, 1] = ryc[:, newaxis] < N; + fp32* pa[TM, 1] = A + (offk + K - k))" + lda0 + " + rxc[:, newaxis]" + lda1 + R"(; + fp32* pb[TN, 1] = B + (offk + K - k))" + ldb0 + " + ryc[:, newaxis]" + ldb1 + R"(; + fp32 a[TM, 1] = checka ? *pa : 0; + fp32 b[TN, 1] = checkb ? *pb : 0; + c = dot(a, trans(b), c); + } + int32 ridx = get_range_id(0); + int32 ridy = get_range_id(1); + fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; + int32 *plock = locks + ridx + ridy*grid0; + while(__atomic_cas(plock, 0, 1)); + int32 *pcount = plock + grid0*grid1; + int32 count = *pcount; + int32 countp1 = select(count == GZ - 1, 0, count + 1); + int1 checkc0[TM] = rxc < M; + int1 checkc1[TN] = ryc < N; + int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; + if(count == 0) { + @checkc *pc = c; + *pcount = countp1; + } + else { + @checkc *pc = c + *pc; + *pcount = countp1; + } + __atomic_cas(plock, 1, 0); + } + )"; + return res; } -)"; + int main() { + bool AT = false; + bool BT = true; + // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); triton::jit jit(context); @@ -128,16 +169,16 @@ int main() { // just-in-time compile source-code - std::vector params = { - 16, 2, 64, 16, 2, 64, 16, 8, 2, 2, 8, 8, 8, 1 - }; -// jit.autotune("matmul",src, benchmark); - jit.add_module("matmul", src, params); + std::string src = triton_source(AT, BT); +// jit.autotune("matmul",src.c_str(), benchmark); + jit.add_module("matmul", src.c_str(), {16, 2, 64, 16, 2, 64, 16, 8, 2, 2, 8, 8, 8, 1}); +// jit.add_module("matmul", src.c_str(), {16, 2, 128, 32, 32, 32, 4, 2, 2, 8, 8, 4, 2, 1}); +// jit.add_module("matmul", src.c_str(), {32, 64, 32, 64, 16, 8, 2, 2, 4, 2, 8, 4, 2, 1}); triton::driver::kernel* kernel = jit.get_function("matmul"); triton::jit::launch_information info = jit.get_launch_info("matmul"); std::cout << "Performance: " << benchmark(kernel, info) << " TFLOPS " << std::endl; stream->read(dc, true, 0, hc); - simple_gemm(rc, ha, hb, M, N, K); + simple_gemm(AT, BT, rc, ha, hb, M, N, K); for(size_t i = 0; i < M*N; i++) if(std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; diff --git a/examples/python/pytorch/conv.cpp b/examples/python/pytorch/conv.cpp index d3d2bb212..71ea8e2be 100644 --- a/examples/python/pytorch/conv.cpp +++ b/examples/python/pytorch/conv.cpp @@ -88,6 +88,85 @@ void conv(read_only restrict fp32 *a, @checkc *pc = C; })"; +void build_conv_lut(int TK, + int 
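/* build_conv_lut (below) lays `res` out as Nlut increment entries followed
   by per-(pw, ph, pd) delta tables, where Nlut rounds TK up to a multiple of
   the filter volume F = T*R*S. The two prefix computations, restated:

     int F = T * R * S;
     int Nlut = (TK + F - 1) / F * F;     // smallest multiple of F >= TK
     for (int i = 0; i < Nlut; ++i)
       res[i] = ((i + TK) % Nlut) - i;    // circular step of TK positions
*/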
stride_d, int stride_h, int stride_w, int stride_c, + int pad_d, int pad_h, int pad_w, + int T, int R, int S, + std::vector& res, std::vector& masks) { + /* convolution parameters */ + int F = T * R * S; + int Nlut = (TK + F - 1) / F * F; + int upsample_w = 1; + int upsample_h = 1; + int upsample_d = 1; + /* unpack index wrt filters */ + auto unpack = [&](int32_t trs){ + int32_t tr = trs / S; + int32_t s = trs - tr*S; + int32_t t = tr / R; + int32_t r = tr - t*R; + return std::make_tuple(t, r, s); + }; + /* increments */ + for(size_t i = 0; i < Nlut; ++i) + res[i] = (((i + TK) % Nlut) - i); + /* deltas */ + size_t Ds0 = Nlut; + size_t Ds1 = upsample_w; + size_t Ds2 = upsample_h; + size_t Ds3 = upsample_d; + for(size_t pd = 0; pd < Ds3; ++pd) + for(size_t ph = 0; ph < Ds2; ++ph) + for(size_t pw = 0; pw < Ds1; ++pw){ + int32_t* deltas_ptr = &res[Nlut + pw*Ds0 + ph*Ds0*Ds1 + pd*Ds0*Ds1*Ds2]; + // cumulative increments + for(size_t i = 0; i < Ds0; ++i){ + int32_t ctrs = i; + int32_t c = ctrs / F; + int32_t t, r, s; + std::tie(t, r, s) = unpack(ctrs % F); + // next indices + int32_t nextctrs = ctrs + TK; + int32_t nextc = nextctrs / F; + int32_t nextt, nextr, nexts; + std::tie(nextt, nextr, nexts) = unpack(nextctrs % F); + // diffs + int32_t cdiff = nextc - c; + int32_t tdiff = (nextt + pd)/upsample_d - (t + pd)/upsample_d; + int32_t rdiff = (nextr + ph)/upsample_h - (r + ph)/upsample_h; + int32_t sdiff = (nexts + pw)/upsample_w - (s + pw)/upsample_w; + // delta pointers + deltas_ptr[i] = cdiff*stride_c + sdiff*stride_w + rdiff*stride_h + tdiff*stride_d; + } + } + + /* Masks */ + size_t Ms0 = Nlut; + size_t Ms1 = 2*pad_w + 1; + size_t Ms2 = 2*pad_h + 1; + size_t Ms3 = 2*pad_d + 1; + + for(size_t pd = 0; pd < Ms3; ++pd) + for(size_t ph = 0; ph < Ms2; ++ph) + for(size_t pw = 0; pw < Ms1; ++pw){ + int32_t* masks_ptr = &masks[Nlut + pw*Ms0 + ph*Ms0*Ms1 + pd*Ms0*Ms1*Ms2]; + for(size_t i = 0; i < Ms0; ++i){ + int32_t t, r, s; + int32_t mask = 0x0; + for(size_t j = 0; j < TK; ++j){ + std::tie(t, r, s) = unpack((i + j) % F); + bool in_bounds_d = (t + pd) >= pad_d && (t + pd) < (T + pad_d); + bool in_bounds_h = (r + ph) >= pad_h && (r + ph) < (R + pad_h); + bool in_bounds_w = (s + pw) >= pad_w && (s + pw) < (S + pad_w); + mask |= (in_bounds_d && in_bounds_h && in_bounds_w) << j; + } + masks_ptr[i] = mask; + } + } + for(size_t i = 0; i < Nlut; ++i) + masks[i] = 0x0; +} + torch::Tensor conv_forward( const torch::Tensor data, const torch::Tensor weight) { @@ -95,37 +174,118 @@ torch::Tensor conv_forward( CHECK_INPUT(data); CHECK_INPUT(weight); // Unpack data shapes - const auto B = data.size(0); - const auto Ci = data.size(1); - const auto H = data.size(2); - const auto W = data.size(3); + const int32_t B = data.size(0); + const int32_t Ci = data.size(1); + const int32_t H = data.size(2); + const int32_t W = data.size(3); // Unpack weight shapes - const auto Cf = weight.size(0); - const auto R = weight.size(1); - const auto S = weight.size(2); - const auto K = weight.size(3); + const int32_t Cf = weight.size(0); + const int32_t T = 1; + const int32_t R = weight.size(1); + const int32_t S = weight.size(2); + const int32_t NF = weight.size(3); + // Conv parameters + int32_t upsample_d = 1, upsample_h = 1, upsample_w = 1; + int32_t pad_d = 0, pad_h = 0, pad_w = 0; + int32_t stride_h = 1, stride_w = 1; + // Output shapes + int32_t P = (H*upsample_h - R + 1 + 2*pad_h + stride_h - 1)/stride_h; + int32_t Q = (W*upsample_w - S + 1 + 2*pad_w + stride_w - 1)/stride_w; // Allocate output AT_CHECK(Ci == Cf, "Number of 
channels in data and weights must match"); - torch::Tensor output = torch::empty({B, K, H, W}, torch::kFloat); + torch::Tensor output = torch::empty({B, NF, P, Q}, torch::kFloat).cuda(); // Wrap CUDA handles - triton::driver::cu_stream sstream(at::cuda::getCurrentCUDAStream(), false); + c10::DeviceIndex device = output.storage().device().index(); + triton::driver::cu_stream sstream((CUstream)at::cuda::getCurrentCUDAStream(device).stream(), false); triton::driver::stream* stream = &sstream; triton::driver::context* ctx = stream->context(); triton::driver::cu_buffer d(ctx, (CUdeviceptr)data.storage().data(), false); triton::driver::cu_buffer w(ctx, (CUdeviceptr)weight.storage().data(), false); + triton::driver::cu_buffer a(ctx, (CUdeviceptr)output.storage().data(), false); // Create JIT triton::jit jit(ctx); std::vector params = { 16, 2, 64, 32, 2, 64, 16, 8, 2, 2, - 8, 8, + 8, 1, 8, 4 }; jit.add_module("conv", src, params); triton::driver::kernel* kernel = jit.get_function("conv"); triton::jit::launch_information info = jit.get_launch_info("conv"); - + // launch info + unsigned TM = info.global_range_size[0]; + unsigned TN = info.global_range_size[1]; + unsigned TK = jit.get_int("TK"); + // initialize constant memory + int FS = T*R*S; + int nlut = (TK + FS - 1) / FS * FS; + std::vector h_delta(nlut + upsample_d*upsample_h*upsample_w*nlut); + std::vector h_masks(nlut + (2*pad_h+1)*(2*pad_w+1)*(2*pad_d+1)*nlut); + // memory stride for images + int32_t stride_i_w = 1; + int32_t stride_i_h = W*stride_i_w; + int32_t stride_i_d = H*stride_i_h; + int32_t stride_i_c = 1*stride_i_d; + int32_t stride_i_n = Ci*stride_i_c; + // memory stride for activations + int32_t stride_o_q = 1; + int32_t stride_o_p = Q*stride_o_q; + int32_t stride_o_m = P*stride_o_p; + int32_t stride_o_k = 1*stride_o_m; + int32_t stride_o_n = NF*stride_o_k; + build_conv_lut(TK, stride_i_d, stride_i_h, stride_i_w, stride_i_c, pad_d, pad_h, pad_w, T, R, S, h_delta, h_masks); + // equivalent matmul dimensions + int32_t M = B*P*Q; + int32_t N = NF; + int32_t K = Ci*R*S; + triton::driver::buffer* delta = jit.get_buffer("delta"); + triton::driver::buffer* masks = jit.get_buffer("masks"); + stream->write(delta, false, 0, h_delta.size()*4, h_delta.data()); + stream->write(masks, false, 0, h_masks.size()*4, h_masks.data()); + // launch info + unsigned nthreads = info.num_threads; + std::array grid = {(M + TM - 1)/TM, (N + TN - 1)/TN, 1}; + // fast bounds-checking + unsigned lasti = (grid[0]*TM - 1)*TM + TM - 1; + unsigned lastj = (grid[1]*TN - 1)*TN + TN - 1; + unsigned lastk = TK - 1; + bool AT = false; + bool BT = true; + unsigned last_safe_a = (AT==false)?(M*K - 1 - lasti)/M - lastk : M*K - 1 - lasti*K - lastk; + unsigned last_safe_b = (BT==true)?(N*K - 1 - lastj)/N - lastk : N*K - 1 - lastj*K - lastk; + int32_t bound = std::max(1, std::max(K - last_safe_a, K - last_safe_b)); + // set arguments + kernel->setArg(0, *d.cu()); + kernel->setArg(1, *w.cu()); + kernel->setArg(2, *a.cu()); + kernel->setArg(3, M); + kernel->setArg(4, N); + kernel->setArg(5, K); + kernel->setArg(6, B); + kernel->setArg(7, H); + kernel->setArg(8, W); + kernel->setArg(9, B); + kernel->setArg(10, NF); + kernel->setArg(11, P); + kernel->setArg(12, Q); + kernel->setArg(13, Ci); + kernel->setArg(14, R); + kernel->setArg(15, S); + kernel->setArg(16, stride_i_n); + kernel->setArg(17, stride_i_c); + kernel->setArg(18, stride_i_h); + kernel->setArg(19, stride_i_w); + kernel->setArg(20, stride_o_n); + kernel->setArg(21, stride_o_k); + kernel->setArg(22, stride_o_p); + 
kernel->setArg(23, stride_o_q); + kernel->setArg(24, pad_h); + kernel->setArg(25, pad_w); + kernel->setArg(26, bound); +// // dry run + stream->enqueue(kernel, grid, {nthreads, 1, 1}); return output; } diff --git a/include/triton/codegen/shmem_allocation.h b/include/triton/codegen/shmem_allocation.h index 27a96f285..8a6f175a8 100644 --- a/include/triton/codegen/shmem_allocation.h +++ b/include/triton/codegen/shmem_allocation.h @@ -26,6 +26,7 @@ public: // utilities unsigned get_num_bytes(ir::value *x); + bool is_ld_padded(ir::value* x); // accessors unsigned get_offset(ir::value *x) const { return offsets_.at(x); } diff --git a/include/triton/jit.h b/include/triton/jit.h index b001148e5..a3e554c67 100644 --- a/include/triton/jit.h +++ b/include/triton/jit.h @@ -58,7 +58,6 @@ public: target_(target) { } void target_independent(ir::module &module) { -// ir::print(module, std::cout); optimize_dot.run(module); optimize_trans.run(module); } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index c59ca2f12..7927e5400 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -525,10 +525,12 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, for(ir::value *op: user->ops()) create_tile(op, builder, references, seen, sh_mem_ptr); LLVMContext &ctx = builder.getContext(); - const auto& shapes = v->get_type()->get_tile_shapes(); - std::vector shapes2; - for(ir::constant_int* shape: shapes) - shapes2.push_back(shape->get_value()); + const auto& cshapes = v->get_type()->get_tile_shapes(); + std::vector shapes; + for(ir::constant_int* shape: cshapes) + shapes.push_back(shape->get_value()); + if(alloc_->is_ld_padded(v)) + shapes[0] += 4; Type* ty = llvm_type(v->get_type()->get_scalar_ty(), ctx); // create shared tile if(buffer_info_->is_shared(v)){ @@ -550,13 +552,13 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, Value *pre_ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(alloc_->get_offset(phi))); pre_ptr = builder.CreateBitCast(pre_ptr, ptr->getType()); Value *next_ptr = builder.CreateGEP(ptr, offset, "next_ptr"); - tmap_.insert({phi, new shared_tile(ty, shapes2, ptr, builder, offset)}); + tmap_.insert({phi, new shared_tile(ty, shapes, ptr, builder, offset)}); for(unsigned i = 0; i < phi->get_num_incoming(); i++) { ir::basic_block* inc_block = phi->get_incoming_block(i); ir::value* inc_value = phi->get_incoming_value(i); ir::instruction* terminator = inc_block->get_inst_list().back(); bool is_loop_latch = buffer_info_->is_loop_latch(phi, terminator); - tmap_.insert({inc_value, new shared_tile(ty, shapes2, is_loop_latch?next_ptr:pre_ptr, builder)}); + tmap_.insert({inc_value, new shared_tile(ty, shapes, is_loop_latch?next_ptr:pre_ptr, builder)}); } } else { @@ -564,16 +566,16 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, size_t offset = alloc_->get_offset(v); Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset)); ptr = builder.CreateBitCast(ptr, ptr_ty); - tmap_.insert({v, new shared_tile(ty, shapes2, ptr, builder)}); + tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)}); } } } // create distributed tile else { - const auto &shapes = v->get_type()->get_tile_shapes(); - std::vector axes(shapes.size()); - for(size_t d = 0; d < shapes.size(); d++){ - if(shapes[d]->get_value() > 1){ + const auto &cshapes = v->get_type()->get_tile_shapes(); + std::vector axes(cshapes.size()); + for(size_t d = 0; d < cshapes.size(); d++){ + if(cshapes[d]->get_value() > 1){ ir::metaparameter *x = 
params_->get_param(v, "nts.d" + std::to_string(d)); axes[d] = axes_.at(x); } @@ -583,7 +585,7 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, } } bool vectorize = dynamic_cast(v); - distributed_tile *T = new distributed_tile(ty, shapes2, axes, builder, vectorize); + distributed_tile *T = new distributed_tile(ty, shapes, axes, builder, vectorize); tmap_.insert({v, T}); // constant range if(dynamic_cast(v) && !dynamic_cast(v)){ diff --git a/lib/codegen/shmem_allocation.cpp b/lib/codegen/shmem_allocation.cpp index 43ab8bc39..90cf7ef2b 100644 --- a/lib/codegen/shmem_allocation.cpp +++ b/lib/codegen/shmem_allocation.cpp @@ -10,8 +10,24 @@ namespace triton{ namespace codegen{ +bool shmem_allocation::is_ld_padded(ir::value *x) { + if(auto* phi = dynamic_cast(x)) { + bool result = false; + for(unsigned i = 0; i < phi->get_num_incoming(); i++) + result = result | is_ld_padded(phi->get_incoming_value(i)); + return result; + } + if(dynamic_cast(x)) + return true; + return false; +} + unsigned shmem_allocation::get_num_bytes(ir::value *x) { unsigned result = x->get_type()->get_primitive_size_in_bits() / 8; + if(is_ld_padded(x)){ + unsigned ld = x->get_type()->get_tile_shapes()[0]->get_value(); + result += 4 * result / ld; + } if(buffer_info_->is_double(x)) result *= 2; return result; @@ -23,8 +39,9 @@ void shmem_allocation::run(){ typedef std::multimap triples_map_type; std::vector I; - for(auto x: liveness_->intervals()) + for(auto x: liveness_->intervals()){ I.push_back(x.first); + } std::vector J = I; triples_map_type H; From fd91368f986170f9a7182f288fcc8dd56c7e1297 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 6 May 2019 17:47:06 -0400 Subject: [PATCH 138/494] [general] creation of dnn module for gemm/conv triton routines --- examples/cpp/conv.cpp | 211 +++++++----------------------- examples/cpp/dot.cpp | 124 +----------------- examples/python/pytorch/conv.cpp | 35 +++-- examples/python/tensorflow/run.py | 20 +++ include/triton/dnn/conv.h | 197 ++++++++++++++++++++++++++++ include/triton/dnn/gemm.h | 139 ++++++++++++++++++++ include/triton/driver/dispatch.h | 1 + lib/frontend/jit.cpp | 0 8 files changed, 430 insertions(+), 297 deletions(-) create mode 100644 examples/python/tensorflow/run.py create mode 100644 include/triton/dnn/conv.h create mode 100644 include/triton/dnn/gemm.h create mode 100644 lib/frontend/jit.cpp diff --git a/examples/cpp/conv.cpp b/examples/cpp/conv.cpp index f8bec004e..5d1f095b0 100644 --- a/examples/cpp/conv.cpp +++ b/examples/cpp/conv.cpp @@ -4,87 +4,7 @@ #include "triton/jit.h" #include "triton/driver/backend.h" #include "triton/driver/stream.h" - -const char* src = -R"( -const tunable int32 TM = {16, 32, 64}; -const tunable int32 TN = {16, 32, 64}; -const tunable int32 TK = {8}; - -__constant__ int32* delta = alloc_const int32[18]; -__constant__ int32* masks = alloc_const int32[1024]; - -void conv(read_only restrict fp32 *a, - read_only restrict fp32 *b, - fp32 *c, - int32 M, int32 N, int32 K, - int32 AN, int32 AH, int32 AW, - int32 CN, int32 CK, int32 CP, int32 CQ, - int32 AC, int32 AR, int32 AS, - int32 lda_n, int32 lda_c, int32 lda_h, int32 lda_w, - int32 ldc_n, int32 ldc_k, int32 ldc_p, int32 ldc_q, - int32 pad_h, int32 pad_w, - int32 bound){ - int32 rxa[TM] = get_global_range[TM](0); - int32 rb0[TN] = get_global_range[TN](1); - int32 rka[TK] = 0 ... TK; - int32 rb1[TK] = 0 ... 
TK; - fp32 C[TM, TN] = 0; - int32 ranh[TM] = rxa / CQ; - int32 raw[TM] = rxa % CQ - pad_w; - int32 ran[TM] = ranh / CP; - int32 rah[TM] = ranh % CP - pad_h; - int32 ra0[TM] = ran*lda_n + rah*lda_h + raw*lda_w; - int32 racr[TK] = rka / AS; - int32 ras[TK] = rka % AS; - int32 rac[TK] = racr / AR; - int32 rar[TK] = racr % AR; - int32 ra1[TK] = rac*lda_c + rar*lda_h + ras*lda_w; - fp32* pa[TM, TK] = a + ra1[newaxis, :] + ra0[:, newaxis]; - fp32* pb[TN, TK] = b + rb1[newaxis, :]*CK + rb0[:, newaxis]; - __constant__ int32* pincd[TK] = delta + rka; - __constant__ int32* pd[TK] = delta + AR*AS + rka; - int32 d[TK] = *pd; - int32 incd[TK] = *pincd; - int32 maskh[TM] = pad_h + min(rah, 0) + max(rah + AR - AH, 0); - int32 maskw[TM] = pad_w + min(raw, 0) + max(raw + AS - AW, 0); - __constant__ int32* pm[TM] = masks + AR*AS + maskw*AR*AS + maskh*AR*AS*(2*pad_w + 1); - __constant__ int32* pincm[TM] = delta; - int32 incm[TM] = *pincm; - int32 checka0[TM] = *pm; - int32 checka1[TK] = 1 << rka; - int1 checka[TM, TK] = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; - fp32 a[TM, TK] = checka ? *pa : 0; - fp32 b[TN, TK] = *pb; - for(int32 k = K; k > 0; k = k - TK){ - C = dot(a, trans(b), C); - pb = pb + TK*CK; - pa = pa + d[newaxis, :]; - b = *pb; - pd = pd + incd; - pincd = pincd + incd; - d = *pd; - incd = *pincd; - pm = pm + incm; - pincm = pincm + incm; - incm = *pincm; - checka0 = *pm; - checka = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; - a = checka ? *pa : 0; - } - int32 rxc[TM] = get_global_range[TM](0); - int32 rc1[TN] = get_global_range[TN](1); - int32 rcn[TM] = rxc / (CP*CQ); - int32 rcpq[TM] = rxc % (CP*CQ); - int32 rc0[TM] = rcn * ldc_n + rcpq; - fp32* pc[TM, TN] = c + rc1[newaxis, :]*ldc_k + rc0[:, newaxis]; - int1 checkc0[TM] = rxc < M; - int1 checkc1[TN] = rc1 < N; - int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - @checkc *pc = C; -})"; - - +#include "triton/dnn/conv.h" int main() { // initialize default compute device @@ -92,28 +12,28 @@ int main() { // initialize just-in-time compiler triton::jit jit(context); // initialization - int32_t AN = 4, CK = 32; - int32_t AD = 1, AH = 24, AW = 240; - int32_t BC = 64, BT = 1, BR = 3, BS = 3; + int32_t B = 4, NF = 32; + int32_t D = 1, H = 24, W = 240; + int32_t NC = 64, T = 1, R = 3, S = 3; int32_t pad_d = 0, pad_h = 1, pad_w = 1; int32_t stride_d = 1, stride_h = 1, stride_w = 1; int32_t upsample_d = 1, upsample_h = 1, upsample_w = 1; - int32_t CM = (AD*upsample_d - BT + 1 + 2*pad_d + stride_d - 1)/stride_d; - int32_t CP = (AH*upsample_h - BR + 1 + 2*pad_h + stride_h - 1)/stride_h; - int32_t CQ = (AW*upsample_w - BS + 1 + 2*pad_w + stride_w - 1)/stride_w; + int32_t RD = (D*upsample_d - T + 1 + 2*pad_d + stride_d - 1)/stride_d; + int32_t RH = (H*upsample_h - R + 1 + 2*pad_h + stride_h - 1)/stride_h; + int32_t RW = (W*upsample_w - S + 1 + 2*pad_w + stride_w - 1)/stride_w; // equivalent matmul dimensions - int32_t M = AN*CM*CP*CQ; - int32_t N = CK; - int32_t K = BC*BT*BR*BS; - std::vector hc(AN*CP*CQ*CK); - std::vector rc(AN*CP*CQ*CK); - std::vector ha(AN*BC*AH*AW); - std::vector hb(BC*BR*BS*CK); + int32_t M = B*RD*RH*RW; + int32_t N = NF; + int32_t K = NC*T*R*S; + std::vector hc(B*RH*RW*NF); + std::vector rc(B*RH*RW*NF); + std::vector ha(B*NC*H*W); + std::vector hb(NC*R*S*NF); srand(0); for(size_t i = 0; i < ha.size(); i++) - ha[i] = 1; + ha[i] = (float)rand()/RAND_MAX; for(size_t i = 0; i < hb.size(); i++) - hb[i] = 1; + hb[i] = (float)rand()/RAND_MAX; for(size_t i = 0; i < hc.size(); i++) hc[i] = 0; triton::driver::buffer* 
dc = triton::driver::buffer::create(context, hc.size()*4); @@ -126,36 +46,25 @@ int main() { stream->synchronize(); // memory strides for data int32_t stride_i_w = 1; - int32_t stride_i_h = AW*stride_i_w; - int32_t stride_i_d = AH*stride_i_h; - int32_t stride_i_c = AD*stride_i_d; - int32_t stride_i_n = BC*stride_i_c; - // memory strides for filters - int32_t stride_f_k = 1; - int32_t stride_f_s = CK*stride_f_k; - int32_t stride_f_r = BS*stride_f_s; - int32_t stride_f_t = BR*stride_f_r; - int32_t stride_f_c = BT*stride_f_t; + int32_t stride_i_h = W*stride_i_w; + int32_t stride_i_d = H*stride_i_h; + int32_t stride_i_c = D*stride_i_d; + int32_t stride_i_n = NC*stride_i_c; // memory stride for activations int32_t stride_o_q = 1; - int32_t stride_o_p = CQ*stride_o_q; - int32_t stride_o_m = CP*stride_o_p; - int32_t stride_o_k = CM*stride_o_m; - int32_t stride_o_n = CK*stride_o_k; + int32_t stride_o_p = RW*stride_o_q; + int32_t stride_o_m = RH*stride_o_p; + int32_t stride_o_k = RD*stride_o_m; + int32_t stride_o_n = NF*stride_o_k; // look-up table - int TK = 8; - int F = BT * BR * BS; - int nlut = (TK + F - 1) / F * F; - std::vector h_delta(nlut + upsample_d*upsample_h*upsample_w*nlut); - std::vector h_masks(nlut + (2*pad_h+1)*(2*pad_w+1)*(2*pad_d+1)*nlut); - build_conv_lut(TK, stride_i_d, stride_i_h, stride_i_w, stride_i_c, pad_d, pad_h, pad_w, BT, BR, BS, h_delta, h_masks); + std::vector h_delta, h_masks; + triton::dnn::conv::init_cst(stride_i_d, stride_i_h, stride_i_w, stride_i_c, pad_d, pad_h, pad_w, T, R, S, h_delta, h_masks); // benchmark a given convolution kernel auto benchmark = [&](triton::driver::kernel* kernel, triton::jit::launch_information info) { // launch info unsigned TM = info.global_range_size[0]; unsigned TN = info.global_range_size[1]; - unsigned TK = jit.get_int("TK"); // initialize constant memory triton::driver::buffer* delta = jit.get_buffer("delta"); triton::driver::buffer* masks = jit.get_buffer("masks"); @@ -165,15 +74,6 @@ int main() { // launch info unsigned nthreads = info.num_threads; std::array grid = {(M + TM - 1)/TM, (N + TN - 1)/TN, 1}; - // fast bounds-checking - unsigned lasti = (grid[0]*TM - 1)*TM + TM - 1; - unsigned lastj = (grid[1]*TN - 1)*TN + TN - 1; - unsigned lastk = TK - 1; - bool AT = false; - bool BT = true; - unsigned last_safe_a = (AT==false)?(M*K - 1 - lasti)/M - lastk : M*K - 1 - lasti*K - lastk; - unsigned last_safe_b = (BT==true)?(N*K - 1 - lastj)/N - lastk : N*K - 1 - lastj*K - lastk; - int32_t bound = std::max(1, std::max(K - last_safe_a, K - last_safe_b)); // set arguments kernel->setArg(0, da); kernel->setArg(1, db); @@ -181,52 +81,41 @@ int main() { kernel->setArg(3, M); kernel->setArg(4, N); kernel->setArg(5, K); - kernel->setArg(6, AN); - kernel->setArg(7, AH); - kernel->setArg(8, AW); - kernel->setArg(9, AN); - kernel->setArg(10, CK); - kernel->setArg(11, CP); - kernel->setArg(12, CQ); - kernel->setArg(13, BC); - kernel->setArg(14, BR); - kernel->setArg(15, BS); - kernel->setArg(16, stride_i_n); - kernel->setArg(17, stride_i_c); - kernel->setArg(18, stride_i_h); - kernel->setArg(19, stride_i_w); - kernel->setArg(20, stride_o_n); - kernel->setArg(21, stride_o_k); - kernel->setArg(22, stride_o_p); - kernel->setArg(23, stride_o_q); - kernel->setArg(24, pad_h); - kernel->setArg(25, pad_w); - kernel->setArg(26, bound); + kernel->setArg(6, B); + kernel->setArg(7, H); + kernel->setArg(8, W); + kernel->setArg(9, NF); + kernel->setArg(10, RH); + kernel->setArg(11, RW); + kernel->setArg(12, NC); + kernel->setArg(13, R); + kernel->setArg(14, S); 
+ kernel->setArg(15, stride_i_n); + kernel->setArg(16, stride_i_c); + kernel->setArg(17, stride_i_h); + kernel->setArg(18, stride_i_w); + kernel->setArg(19, stride_o_n); + kernel->setArg(20, stride_o_k); + kernel->setArg(21, stride_o_p); + kernel->setArg(22, stride_o_q); + kernel->setArg(23, pad_h); + kernel->setArg(24, pad_w); // dry run stream->enqueue(kernel, grid, {nthreads, 1, 1}); stream->synchronize(); // benchmark double ts = bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});}, [&](){ stream->synchronize(); }, *context->device()); - ts = ts * 1e-9; - double tflops = 2.*M*N*K / ts * 1e-12; - return tflops; + return 2.*M*N*K / ts * 1e-3; }; - // run - std::vector params = { - 16, 2, 64, - 32, 2, 64, - 16, 8, 2, 2, - 8, 1, 8, - 4 - }; -// jit.autotune("conv", src, benchmark); - jit.add_module("conv", src, params); + std::string src = triton::dnn::conv::src(); +// jit.autotune("conv", src.c_str(), benchmark); + jit.add_module("conv", src.c_str(), triton::dnn::conv::default_params()); triton::driver::kernel* kernel = jit.get_function("conv"); triton::jit::launch_information info = jit.get_launch_info("conv"); std::cout << "Performance: " << benchmark(kernel, info) << " TFLOPS " << std::endl; stream->read(dc, true, 0, hc); - cpp_conv_nchw(BC, AN, CK, AD, AH, AW, BT, BR, BS, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, CM, CP, CQ, rc, ha, hb); + cpp_conv_nchw(NC, B, NF, D, H, W, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, RD, RH, RW, rc, ha, hb); for(size_t i = 0; i < M*N; i++) if(std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 980f83b31..0c735d9f4 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -4,99 +4,7 @@ #include "triton/jit.h" #include "triton/driver/backend.h" #include "triton/driver/stream.h" - - -std::string triton_source(bool AT, bool BT) { - std::string AS0 = "TM", AS1 = "TK"; - std::string BS0 = "TK", BS1 = "TN"; - std::string bca0 = "[newaxis, :]", bca1 = "[:, newaxis]"; - std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; - std::string lda0 = "*lda", lda1 = ""; - std::string ldb0 = "", ldb1 = "*ldb"; - std::string usea = AT ? "trans(a)" : "a"; - std::string useb = BT ? "trans(b)" : "b"; - if(AT){ - std::swap(AS0, AS1); - std::swap(bca0, bca1); - std::swap(lda0, lda1); - } - if(BT){ - std::swap(BS0, BS1); - std::swap(bcb0, bcb1); - std::swap(ldb0, ldb1); - } - std::string res = - R"( - const tunable int32 TM = {16, 32, 64, 128}; - const tunable int32 TN = {16, 32, 64, 128}; - const tunable int32 TK = {8}; - const tunable int32 GZ = {1}; - - void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C, - int32 M, int32 N, int32 K, - int32 lda, int32 ldb, int32 ldc, - int32 *locks, int32 grid0, int32 grid1) { - int32 rxa[TM] = get_global_range[TM](0); - int32 ryb[TN] = get_global_range[TN](1); - int32 rz = get_global_range[1](2); - int32 rka[TK] = 0 ... TK; - int32 rkb[TK] = 0 ... 
TK; - fp32 c[TM, TN] = 0; - int32 div = K / GZ; - int32 rem = K % GZ; - K = select(rz < rem, div - 1, div); - int32 offk = select(rz < rem, rz*(div + 1), rz*div + rem); - fp32* pa[)" + AS0 + ", " + AS1 + "] = A + (offk + rka" + bca0 + ")" + lda0 + " + rxa" + bca1 + lda1 + R"(; - fp32* pb[)" + BS0 + ", " + BS1 + "] = B + (offk + rkb" + bcb0 + ")" + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; - fp32 a[)" + AS0 + ", " + AS1 + R"(] = *pa; - fp32 b[)" + BS0 + ", " + BS1 + R"(] = *pb; - int32 last_a = ((M*K - 1) - (TM*TK + 1)) / lda; - int32 last_b = ((K*N - 1) - (TN*TK + 1)) / ldb; - last_a = last_a / TK * TK; - last_b = last_b / TK * TK; - int32 bound = K - max(last_a, last_b); - for(int32 k = K; k > bound; k = k - TK){ - c = dot()" + usea + ", " + useb + R"(, c); - pa = pa + TK)" + lda0 + R"(; - pb = pb + TK)" + ldb0 + R"(; - a = *pa; - b = *pb; - } - int32 rxc[TM] = get_global_range[TM](0); - int32 ryc[TN] = get_global_range[TN](1); - for(int32 k = bound; k > 0; k = k - 1){ - int1 checka[TM, 1] = rxc[:, newaxis] < M; - int1 checkb[TN, 1] = ryc[:, newaxis] < N; - fp32* pa[TM, 1] = A + (offk + K - k))" + lda0 + " + rxc[:, newaxis]" + lda1 + R"(; - fp32* pb[TN, 1] = B + (offk + K - k))" + ldb0 + " + ryc[:, newaxis]" + ldb1 + R"(; - fp32 a[TM, 1] = checka ? *pa : 0; - fp32 b[TN, 1] = checkb ? *pb : 0; - c = dot(a, trans(b), c); - } - int32 ridx = get_range_id(0); - int32 ridy = get_range_id(1); - fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - int32 *plock = locks + ridx + ridy*grid0; - while(__atomic_cas(plock, 0, 1)); - int32 *pcount = plock + grid0*grid1; - int32 count = *pcount; - int32 countp1 = select(count == GZ - 1, 0, count + 1); - int1 checkc0[TM] = rxc < M; - int1 checkc1[TN] = ryc < N; - int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - if(count == 0) { - @checkc *pc = c; - *pcount = countp1; - } - else { - @checkc *pc = c + *pc; - *pcount = countp1; - } - __atomic_cas(plock, 1, 0); - } - )"; - return res; -} +#include "triton/dnn/gemm.h" int main() { @@ -129,51 +37,31 @@ int main() { stream->write(da, true, 0, ha); stream->write(db, true, 0, hb); stream->write(dc, true, 0, hc); + triton::dnn::gemm::init(stream, dlocks); stream->synchronize(); // benchmark a given matrix multiplication kernel auto benchmark = [&](triton::driver::kernel* kernel, triton::jit::launch_information info) { - // launch info unsigned TM = info.global_range_size[0]; unsigned TN = info.global_range_size[1]; unsigned nthreads = info.num_threads; unsigned GZ = jit.get_int("GZ"); std::array grid = {(M + TM - 1)/TM, (N + TN - 1)/TN, GZ}; - // init locks - stream->write(dlocks, true, 0, hlocks); - // set argument - kernel->setArg(0, da); - kernel->setArg(1, db); - kernel->setArg(2, dc); - kernel->setArg(3, M); - kernel->setArg(4, N); - kernel->setArg(5, K); - kernel->setArg(6, M); - kernel->setArg(7, N); - kernel->setArg(8, M); - kernel->setArg(9, dlocks); - kernel->setArg(10, grid[0]); - kernel->setArg(11, grid[1]); - // dry run + triton::dnn::gemm::set_arg(kernel, da, db, dc, M, N, K, dlocks, grid[0], grid[1]); stream->enqueue(kernel, grid, {nthreads, 1, 1}); stream->synchronize(); - // benchmark double ts = bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});}, [&](){ stream->synchronize(); }, *context->device()); - ts = ts * 1e-9; - double tflops = 2.*M*N*K / ts * 1e-12; - return tflops; + return 2.*M*N*K / ts * 1e-3; }; // just-in-time compile source-code - std::string src = triton_source(AT, BT); + std::string src = triton::dnn::gemm::src(AT, BT); // 
jit.autotune("matmul",src.c_str(), benchmark); - jit.add_module("matmul", src.c_str(), {16, 2, 64, 16, 2, 64, 16, 8, 2, 2, 8, 8, 8, 1}); -// jit.add_module("matmul", src.c_str(), {16, 2, 128, 32, 32, 32, 4, 2, 2, 8, 8, 4, 2, 1}); -// jit.add_module("matmul", src.c_str(), {32, 64, 32, 64, 16, 8, 2, 2, 4, 2, 8, 4, 2, 1}); + jit.add_module("matmul", src.c_str(), triton::dnn::gemm::default_params(AT, BT)); triton::driver::kernel* kernel = jit.get_function("matmul"); triton::jit::launch_information info = jit.get_launch_info("matmul"); std::cout << "Performance: " << benchmark(kernel, info) << " TFLOPS " << std::endl; diff --git a/examples/python/pytorch/conv.cpp b/examples/python/pytorch/conv.cpp index 71ea8e2be..2aa46175d 100644 --- a/examples/python/pytorch/conv.cpp +++ b/examples/python/pytorch/conv.cpp @@ -266,24 +266,23 @@ torch::Tensor conv_forward( kernel->setArg(6, B); kernel->setArg(7, H); kernel->setArg(8, W); - kernel->setArg(9, B); - kernel->setArg(10, NF); - kernel->setArg(11, P); - kernel->setArg(12, Q); - kernel->setArg(13, Ci); - kernel->setArg(14, R); - kernel->setArg(15, S); - kernel->setArg(16, stride_i_n); - kernel->setArg(17, stride_i_c); - kernel->setArg(18, stride_i_h); - kernel->setArg(19, stride_i_w); - kernel->setArg(20, stride_o_n); - kernel->setArg(21, stride_o_k); - kernel->setArg(22, stride_o_p); - kernel->setArg(23, stride_o_q); - kernel->setArg(24, pad_h); - kernel->setArg(25, pad_w); - kernel->setArg(26, bound); + kernel->setArg(9, NF); + kernel->setArg(10, P); + kernel->setArg(11, Q); + kernel->setArg(12, Ci); + kernel->setArg(13, R); + kernel->setArg(14, S); + kernel->setArg(15, stride_i_n); + kernel->setArg(16, stride_i_c); + kernel->setArg(17, stride_i_h); + kernel->setArg(18, stride_i_w); + kernel->setArg(19, stride_o_n); + kernel->setArg(20, stride_o_k); + kernel->setArg(21, stride_o_p); + kernel->setArg(22, stride_o_q); + kernel->setArg(23, pad_h); + kernel->setArg(24, pad_w); + kernel->setArg(25, bound); // // dry run stream->enqueue(kernel, grid, {nthreads, 1, 1}); return output; diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py new file mode 100644 index 000000000..5a721def9 --- /dev/null +++ b/examples/python/tensorflow/run.py @@ -0,0 +1,20 @@ +import os +import tensorflow as tf +import numpy as np + +data_files_path = tf.resource_loader.get_data_files_path() +library_dir = '/home/philippe/Development/triton/build/examples/python/tensorflow' +module = tf.load_op_library(os.path.join(library_dir, 'libtf_blocksparse.so')) + +M, N, K = 512, 512, 512 +a = tf.placeholder(tf.float32, shape=[M, K]) +b = tf.placeholder(tf.float32, shape=[N, K]) +locks = tf.placeholder(tf.int32, shape=[4096]) +c = module.block_sparse_mat_mul(a, b, locks) +# Run +sess = tf.InteractiveSession() +sess.run(tf.global_variables_initializer()) +result = sess.run([c], feed_dict = {locks: np.zeros(4096), + a: np.random.rand(M, K), + b: np.random.rand(N, K)}) +print(result) diff --git a/include/triton/dnn/conv.h b/include/triton/dnn/conv.h new file mode 100644 index 000000000..b76726575 --- /dev/null +++ b/include/triton/dnn/conv.h @@ -0,0 +1,197 @@ +#include +#include + +namespace triton{ +namespace dnn{ + +class conv { +public: + enum type { + FPROP, + BPROP, + WGRAD + }; + + static void build_lut(int TK, + int stride_d, int stride_h, int stride_w, int stride_c, + int pad_d, int pad_h, int pad_w, + int T, int R, int S, + std::vector& res, std::vector& masks) { + /* convolution parameters */ + int F = T * R * S; + int Nlut = (TK + F - 1) / F * F; + int 
upsample_w = 1; + int upsample_h = 1; + int upsample_d = 1; + /* unpack index wrt filters */ + auto unpack = [&](int32_t trs){ + int32_t tr = trs / S; + int32_t s = trs - tr*S; + int32_t t = tr / R; + int32_t r = tr - t*R; + return std::make_tuple(t, r, s); + }; + /* increments */ + for(size_t i = 0; i < Nlut; ++i) + res[i] = (((i + TK) % Nlut) - i); + /* deltas */ + size_t Ds0 = Nlut; + size_t Ds1 = upsample_w; + size_t Ds2 = upsample_h; + size_t Ds3 = upsample_d; + for(size_t pd = 0; pd < Ds3; ++pd) + for(size_t ph = 0; ph < Ds2; ++ph) + for(size_t pw = 0; pw < Ds1; ++pw){ + int32_t* deltas_ptr = &res[Nlut + pw*Ds0 + ph*Ds0*Ds1 + pd*Ds0*Ds1*Ds2]; + // cumulative increments + for(size_t i = 0; i < Ds0; ++i){ + int32_t ctrs = i; + int32_t c = ctrs / F; + int32_t t, r, s; + std::tie(t, r, s) = unpack(ctrs % F); + // next indices + int32_t nextctrs = ctrs + TK; + int32_t nextc = nextctrs / F; + int32_t nextt, nextr, nexts; + std::tie(nextt, nextr, nexts) = unpack(nextctrs % F); + // diffs + int32_t cdiff = nextc - c; + int32_t tdiff = (nextt + pd)/upsample_d - (t + pd)/upsample_d; + int32_t rdiff = (nextr + ph)/upsample_h - (r + ph)/upsample_h; + int32_t sdiff = (nexts + pw)/upsample_w - (s + pw)/upsample_w; + // delta pointers + deltas_ptr[i] = cdiff*stride_c + sdiff*stride_w + rdiff*stride_h + tdiff*stride_d; + } + } + + /* Masks */ + size_t Ms0 = Nlut; + size_t Ms1 = 2*pad_w + 1; + size_t Ms2 = 2*pad_h + 1; + size_t Ms3 = 2*pad_d + 1; + for(size_t pd = 0; pd < Ms3; ++pd) + for(size_t ph = 0; ph < Ms2; ++ph) + for(size_t pw = 0; pw < Ms1; ++pw){ + int32_t* masks_ptr = &masks[Nlut + pw*Ms0 + ph*Ms0*Ms1 + pd*Ms0*Ms1*Ms2]; + for(size_t i = 0; i < Ms0; ++i){ + int32_t t, r, s; + int32_t mask = 0x0; + for(size_t j = 0; j < TK; ++j){ + std::tie(t, r, s) = unpack((i + j) % F); + bool in_bounds_d = (t + pd) >= pad_d && (t + pd) < (T + pad_d); + bool in_bounds_h = (r + ph) >= pad_h && (r + ph) < (R + pad_h); + bool in_bounds_w = (s + pw) >= pad_w && (s + pw) < (S + pad_w); + mask |= (in_bounds_d && in_bounds_h && in_bounds_w) << j; + } + masks_ptr[i] = mask; + } + } + for(size_t i = 0; i < Nlut; ++i) + masks[i] = 0x0; + + } + + static std::vector default_params() { + return {16, 2, 64, 32, 2, 64, 16, 8, 2, 2, 8, 1, 8, 4 }; + } + + static void init_cst(int stride_d, int stride_h, int stride_w, int stride_c, + int pad_d, int pad_h, int pad_w, + int T, int R, int S, + std::vector &h_delta, std::vector &h_masks) { + int upsample_d = 1; + int upsample_h = 1; + int upsample_w = 1; + int TK = 8; + int F = T * R * S; + int nlut = (TK + F - 1) / F * F; + h_delta.resize(nlut + upsample_d*upsample_h*upsample_w*nlut); + h_masks.resize(nlut + (2*pad_h+1)*(2*pad_w+1)*(2*pad_d+1)*nlut); + build_lut(TK, stride_d, stride_h, stride_w, stride_c, pad_d, pad_h, pad_w, T, R, S, h_delta, h_masks); + } + + static std::string src(type ty = FPROP) { + + std::string res = + R"( + const tunable int32 TM = {16, 32, 64}; + const tunable int32 TN = {16, 32, 64}; + const tunable int32 TK = {8}; + + __constant__ int32* delta = alloc_const int32[18]; + __constant__ int32* masks = alloc_const int32[1024]; + + void conv(read_only restrict fp32 *a, + read_only restrict fp32 *b, + fp32 *c, + int32 M, int32 N, int32 K, + int32 B, int32 H, int32 W, + int32 NF, int32 RH, int32 RW, + int32 NC, int32 R, int32 S, + int32 lda_n, int32 lda_c, int32 lda_h, int32 lda_w, + int32 ldc_n, int32 ldc_k, int32 ldc_p, int32 ldc_q, + int32 pad_h, int32 pad_w){ + int32 rxa[TM] = get_global_range[TM](0); + int32 rb0[TN] = get_global_range[TN](1); + int32 
rka[TK] = 0 ... TK; + int32 rb1[TK] = 0 ... TK; + fp32 C[TM, TN] = 0; + int32 rabh[TM] = rxa / RW; + int32 raw[TM] = rxa % RW - pad_w; + int32 rab[TM] = rabh / RH; + int32 rah[TM] = rabh % RH - pad_h; + int32 ra0[TM] = rab*lda_n + rah*lda_h + raw*lda_w; + int32 racr[TK] = rka / S; + int32 ras[TK] = rka % S; + int32 rac[TK] = racr / R; + int32 rar[TK] = racr % R; + int32 ra1[TK] = rac*lda_c + rar*lda_h + ras*lda_w; + fp32* pa[TM, TK] = a + ra1[newaxis, :] + ra0[:, newaxis]; + fp32* pb[TN, TK] = b + rb1[newaxis, :]*NF + rb0[:, newaxis]; + __constant__ int32* pincd[TK] = delta + rka; + __constant__ int32* pd[TK] = delta + R*S + rka; + int32 d[TK] = *pd; + int32 incd[TK] = *pincd; + int32 maskh[TM] = pad_h + min(rah, 0) + max(rah + R - H, 0); + int32 maskw[TM] = pad_w + min(raw, 0) + max(raw + S - W, 0); + __constant__ int32* pm[TM] = masks + R*S + maskw*R*S + maskh*R*S*(2*pad_w + 1); + __constant__ int32* pincm[TM] = delta; + int32 incm[TM] = *pincm; + int32 checka0[TM] = *pm; + int32 checka1[TK] = 1 << rka; + int1 checka[TM, TK] = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; + fp32 a[TM, TK] = checka ? *pa : 0; + fp32 b[TN, TK] = *pb; + for(int32 k = K; k > 0; k = k - TK){ + C = dot(a, trans(b), C); + pb = pb + TK*NF; + pa = pa + d[newaxis, :]; + b = *pb; + pd = pd + incd; + pincd = pincd + incd; + d = *pd; + incd = *pincd; + pm = pm + incm; + pincm = pincm + incm; + incm = *pincm; + checka0 = *pm; + checka = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; + a = checka ? *pa : 0; + } + int32 rxc[TM] = get_global_range[TM](0); + int32 rc1[TN] = get_global_range[TN](1); + int32 rcn[TM] = rxc / (RH*RW); + int32 rcpq[TM] = rxc % (RH*RW); + int32 rc0[TM] = rcn * ldc_n + rcpq; + fp32* pc[TM, TN] = c + rc1[newaxis, :]*ldc_k + rc0[:, newaxis]; + int1 checkc0[TM] = rxc < M; + int1 checkc1[TN] = rc1 < N; + int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; + @checkc *pc = C; + })"; + return res; + } +}; + +} +} diff --git a/include/triton/dnn/gemm.h b/include/triton/dnn/gemm.h new file mode 100644 index 000000000..41345bdd8 --- /dev/null +++ b/include/triton/dnn/gemm.h @@ -0,0 +1,139 @@ +#include "triton/driver/stream.h" +#include "triton/driver/kernel.h" +#include + +namespace triton{ +namespace dnn{ + +class gemm { +public: + + static void init(driver::stream* stream, driver::buffer* locks) { + std::vector hlocks(2048, 0); + stream->write(locks, false, 0, hlocks); + } + + static void set_arg(driver::kernel *kernel, + driver::buffer *a, driver::buffer *b, driver::buffer *c, + int32_t M, int32_t N, int32_t K, + driver::buffer *locks, int32_t grid_0, int32_t grid_1) { + kernel->setArg(0, a); + kernel->setArg(1, b); + kernel->setArg(2, c); + kernel->setArg(3, M); + kernel->setArg(4, N); + kernel->setArg(5, K); + kernel->setArg(6, M); + kernel->setArg(7, N); + kernel->setArg(8, M); + kernel->setArg(9, locks); + kernel->setArg(10, grid_0); + kernel->setArg(11, grid_1); + } + + static std::vector default_params(bool AT, bool BT) { + if(AT && BT) + return {32, 64, 32, 64, 16, 8, 2, 2, 4, 2, 8, 4, 2, 1}; + else if(AT && !BT) + return {32, 64, 32, 64, 16, 8, 2, 2, 4, 2, 8, 4, 2, 1}; + else if(!AT && BT) + return {16, 2, 64, 16, 2, 64, 16, 8, 2, 2, 8, 8, 8, 1}; + else + return {16, 2, 128, 32, 32, 32, 4, 2, 2, 8, 8, 4, 2, 1}; + } + + static std::string src(bool AT, bool BT) { + std::string AS0 = "TM", AS1 = "TK"; + std::string BS0 = "TK", BS1 = "TN"; + std::string bca0 = "[newaxis, :]", bca1 = "[:, newaxis]"; + std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; + std::string lda0 = 
"*lda", lda1 = ""; + std::string ldb0 = "", ldb1 = "*ldb"; + std::string usea = AT ? "trans(a)" : "a"; + std::string useb = BT ? "trans(b)" : "b"; + if(AT){ + std::swap(AS0, AS1); + std::swap(bca0, bca1); + std::swap(lda0, lda1); + } + if(BT){ + std::swap(BS0, BS1); + std::swap(bcb0, bcb1); + std::swap(ldb0, ldb1); + } + std::string res = + R"( + const tunable int32 TM = {16, 32, 64, 128}; + const tunable int32 TN = {16, 32, 64, 128}; + const tunable int32 TK = {8}; + const tunable int32 GZ = {1}; + + void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C, + int32 M, int32 N, int32 K, + int32 lda, int32 ldb, int32 ldc, + int32 *locks, int32 grid0, int32 grid1) { + int32 rxa[TM] = get_global_range[TM](0); + int32 ryb[TN] = get_global_range[TN](1); + int32 rz = get_global_range[1](2); + int32 rka[TK] = 0 ... TK; + int32 rkb[TK] = 0 ... TK; + fp32 c[TM, TN] = 0; + int32 div = K / GZ; + int32 rem = K % GZ; + K = select(rz < rem, div - 1, div); + int32 offk = select(rz < rem, rz*(div + 1), rz*div + rem); + fp32* pa[)" + AS0 + ", " + AS1 + "] = A + (offk + rka" + bca0 + ")" + lda0 + " + rxa" + bca1 + lda1 + R"(; + fp32* pb[)" + BS0 + ", " + BS1 + "] = B + (offk + rkb" + bcb0 + ")" + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; + fp32 a[)" + AS0 + ", " + AS1 + R"(] = *pa; + fp32 b[)" + BS0 + ", " + BS1 + R"(] = *pb; + int32 last_a = ((M*K - 1) - (TM*TK + 1)) / lda; + int32 last_b = ((K*N - 1) - (TN*TK + 1)) / ldb; + last_a = last_a / TK * TK; + last_b = last_b / TK * TK; + int32 bound = K - max(last_a, last_b); + for(int32 k = K; k > bound; k = k - TK){ + c = dot()" + usea + ", " + useb + R"(, c); + pa = pa + TK)" + lda0 + R"(; + pb = pb + TK)" + ldb0 + R"(; + a = *pa; + b = *pb; + } + int32 rxc[TM] = get_global_range[TM](0); + int32 ryc[TN] = get_global_range[TN](1); + for(int32 k = bound; k > 0; k = k - 1){ + int1 checka[TM, 1] = rxc[:, newaxis] < M; + int1 checkb[TN, 1] = ryc[:, newaxis] < N; + fp32* pa[TM, 1] = A + (offk + K - k))" + lda0 + " + rxc[:, newaxis]" + lda1 + R"(; + fp32* pb[TN, 1] = B + (offk + K - k))" + ldb0 + " + ryc[:, newaxis]" + ldb1 + R"(; + fp32 a[TM, 1] = checka ? *pa : 0; + fp32 b[TN, 1] = checkb ? 
*pb : 0; + c = dot(a, trans(b), c); + } + int32 ridx = get_range_id(0); + int32 ridy = get_range_id(1); + fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; + int32 *plock = locks + ridx + ridy*grid0; + while(__atomic_cas(plock, 0, 1)); + int32 *pcount = plock + grid0*grid1; + int32 count = *pcount; + int32 countp1 = select(count == GZ - 1, 0, count + 1); + int1 checkc0[TM] = rxc < M; + int1 checkc1[TN] = ryc < N; + int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; + if(count == 0) { + @checkc *pc = c; + *pcount = countp1; + } + else { + @checkc *pc = c + *pc; + *pcount = countp1; + } + __atomic_cas(plock, 1, 0); + } + )"; + return res; + } +}; + +} +} diff --git a/include/triton/driver/dispatch.h b/include/triton/driver/dispatch.h index bd25ba2b2..a2a389cbd 100755 --- a/include/triton/driver/dispatch.h +++ b/include/triton/driver/dispatch.h @@ -204,6 +204,7 @@ public: static int initializeLLVMToSPIRVPass(llvm::PassRegistry &); static bool writeSpirv(llvm::Module *M, std::ostream &OS, std::string &ErrMsg); + private: // Libraries diff --git a/lib/frontend/jit.cpp b/lib/frontend/jit.cpp new file mode 100644 index 000000000..e69de29bb From 615569287e06d7aa39be9d580cc4bae1ad15f8ba Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 6 May 2019 19:30:22 -0400 Subject: [PATCH 139/494] more cleaning of conv --- examples/cpp/conv.cpp | 4 +- include/triton/dnn/conv.h | 173 ++++++++++++++++++++++++++------------ 2 files changed, 120 insertions(+), 57 deletions(-) diff --git a/examples/cpp/conv.cpp b/examples/cpp/conv.cpp index 5d1f095b0..0af076612 100644 --- a/examples/cpp/conv.cpp +++ b/examples/cpp/conv.cpp @@ -25,6 +25,7 @@ int main() { int32_t M = B*RD*RH*RW; int32_t N = NF; int32_t K = NC*T*R*S; + // convolution configuration std::vector hc(B*RH*RW*NF); std::vector rc(B*RH*RW*NF); std::vector ha(B*NC*H*W); @@ -57,8 +58,9 @@ int main() { int32_t stride_o_k = RD*stride_o_m; int32_t stride_o_n = NF*stride_o_k; // look-up table + triton::dnn::conv configuration(B, NC, H, W, R, S, NF, 1, 1, 0, 0); std::vector h_delta, h_masks; - triton::dnn::conv::init_cst(stride_i_d, stride_i_h, stride_i_w, stride_i_c, pad_d, pad_h, pad_w, T, R, S, h_delta, h_masks); + configuration.build_lut(h_delta, h_masks); // benchmark a given convolution kernel auto benchmark = [&](triton::driver::kernel* kernel, triton::jit::launch_information info) { diff --git a/include/triton/dnn/conv.h b/include/triton/dnn/conv.h index b76726575..e3fb91d43 100644 --- a/include/triton/dnn/conv.h +++ b/include/triton/dnn/conv.h @@ -12,81 +12,106 @@ public: WGRAD }; - static void build_lut(int TK, - int stride_d, int stride_h, int stride_w, int stride_c, - int pad_d, int pad_h, int pad_w, - int T, int R, int S, - std::vector& res, std::vector& masks) { - /* convolution parameters */ - int F = T * R * S; - int Nlut = (TK + F - 1) / F * F; - int upsample_w = 1; - int upsample_h = 1; - int upsample_d = 1; + + conv(int B, int NC, int H, int W, int R, int S, int NF, + int upsample_h, int upsample_w, + int pad_h, int pad_w) + : B_(B), NC_(NC), D_(1), H_(H), W_(W), T_(1), R_(R), S_(S), NF_(NF), + upsample_d_(1), upsample_h_(upsample_h), upsample_w_(upsample_w), + pad_d_(0), pad_h_(pad_h), pad_w_(pad_w) + { + RD_ = (D_*upsample_d_ - T_ + 1 + 2*pad_d_ + stride_d_ - 1)/stride_d_; + RH_ = (H_*upsample_h_ - R_ + 1 + 2*pad_h_ + stride_h_ - 1)/stride_h_; + RW_ = (W_*upsample_w_ - S_ + 1 + 2*pad_w_ + stride_w_ - 1)/stride_w_; + M_ = B*RD_*RH_*RW_; + N_ = NF; + K_ = NC*T_*R_*S_; + Fs_ = T_*R_*S_; + TK_ = 8; + Luts_ = (TK_ + 
Fs_ - 1) / Fs_ * Fs_; + // memory strides for data + stride_a_w_ = 1; + stride_a_h_ = W_*stride_a_w_; + stride_a_d_ = H_*stride_a_h_; + stride_a_c_ = D_*stride_a_d_; + stride_a_n_ = NC_*stride_a_c_; + // memory stride for activations + stride_c_q_ = 1; + stride_c_p_ = RW_*stride_c_q_; + stride_c_m_ = RH_*stride_c_p_; + stride_c_k_ = RD_*stride_c_m_; + stride_c_n_ = NF_*stride_c_k_; + } + + + void build_lut(std::vector& delta, std::vector& masks) { + delta.resize(Luts_ + upsample_d_*upsample_h_*upsample_w_*Luts_); + masks.resize(Luts_ + (2*pad_h_+1)*(2*pad_w_+1)*(2*pad_d_+1)*Luts_); + /* unpack index wrt filters */ auto unpack = [&](int32_t trs){ - int32_t tr = trs / S; - int32_t s = trs - tr*S; - int32_t t = tr / R; - int32_t r = tr - t*R; + int32_t tr = trs / S_; + int32_t s = trs - tr*S_; + int32_t t = tr / R_; + int32_t r = tr - t*R_; return std::make_tuple(t, r, s); }; /* increments */ - for(size_t i = 0; i < Nlut; ++i) - res[i] = (((i + TK) % Nlut) - i); + for(size_t i = 0; i < Luts_; ++i) + delta[i] = (((i + TK_) % Luts_) - i); /* deltas */ - size_t Ds0 = Nlut; - size_t Ds1 = upsample_w; - size_t Ds2 = upsample_h; - size_t Ds3 = upsample_d; + size_t Ds0 = Luts_; + size_t Ds1 = upsample_w_; + size_t Ds2 = upsample_h_; + size_t Ds3 = upsample_d_; for(size_t pd = 0; pd < Ds3; ++pd) for(size_t ph = 0; ph < Ds2; ++ph) for(size_t pw = 0; pw < Ds1; ++pw){ - int32_t* deltas_ptr = &res[Nlut + pw*Ds0 + ph*Ds0*Ds1 + pd*Ds0*Ds1*Ds2]; + int32_t* deltas_ptr = &delta[Luts_ + pw*Ds0 + ph*Ds0*Ds1 + pd*Ds0*Ds1*Ds2]; // cumulative increments for(size_t i = 0; i < Ds0; ++i){ int32_t ctrs = i; - int32_t c = ctrs / F; + int32_t c = ctrs / Fs_; int32_t t, r, s; - std::tie(t, r, s) = unpack(ctrs % F); + std::tie(t, r, s) = unpack(ctrs % Fs_); // next indices - int32_t nextctrs = ctrs + TK; - int32_t nextc = nextctrs / F; + int32_t nextctrs = ctrs + TK_; + int32_t nextc = nextctrs / Fs_; int32_t nextt, nextr, nexts; - std::tie(nextt, nextr, nexts) = unpack(nextctrs % F); + std::tie(nextt, nextr, nexts) = unpack(nextctrs % Fs_); // diffs int32_t cdiff = nextc - c; - int32_t tdiff = (nextt + pd)/upsample_d - (t + pd)/upsample_d; - int32_t rdiff = (nextr + ph)/upsample_h - (r + ph)/upsample_h; - int32_t sdiff = (nexts + pw)/upsample_w - (s + pw)/upsample_w; + int32_t tdiff = (nextt + pd)/upsample_d_ - (t + pd)/upsample_d_; + int32_t rdiff = (nextr + ph)/upsample_h_ - (r + ph)/upsample_h_; + int32_t sdiff = (nexts + pw)/upsample_w_ - (s + pw)/upsample_w_; // delta pointers - deltas_ptr[i] = cdiff*stride_c + sdiff*stride_w + rdiff*stride_h + tdiff*stride_d; + deltas_ptr[i] = cdiff*stride_a_c_ + sdiff*stride_a_w_ + rdiff*stride_a_h_ + tdiff*stride_a_d_; } } /* Masks */ - size_t Ms0 = Nlut; - size_t Ms1 = 2*pad_w + 1; - size_t Ms2 = 2*pad_h + 1; - size_t Ms3 = 2*pad_d + 1; + size_t Ms0 = Luts_; + size_t Ms1 = 2*pad_w_ + 1; + size_t Ms2 = 2*pad_h_ + 1; + size_t Ms3 = 2*pad_d_ + 1; for(size_t pd = 0; pd < Ms3; ++pd) for(size_t ph = 0; ph < Ms2; ++ph) for(size_t pw = 0; pw < Ms1; ++pw){ - int32_t* masks_ptr = &masks[Nlut + pw*Ms0 + ph*Ms0*Ms1 + pd*Ms0*Ms1*Ms2]; + int32_t* masks_ptr = &masks[Luts_ + pw*Ms0 + ph*Ms0*Ms1 + pd*Ms0*Ms1*Ms2]; for(size_t i = 0; i < Ms0; ++i){ int32_t t, r, s; int32_t mask = 0x0; - for(size_t j = 0; j < TK; ++j){ - std::tie(t, r, s) = unpack((i + j) % F); - bool in_bounds_d = (t + pd) >= pad_d && (t + pd) < (T + pad_d); - bool in_bounds_h = (r + ph) >= pad_h && (r + ph) < (R + pad_h); - bool in_bounds_w = (s + pw) >= pad_w && (s + pw) < (S + pad_w); + for(size_t j = 0; j < TK_; ++j){ + 
std::tie(t, r, s) = unpack((i + j) % Fs_); + bool in_bounds_d = (t + pd) >= pad_d_ && (t + pd) < (T_ + pad_d_); + bool in_bounds_h = (r + ph) >= pad_h_ && (r + ph) < (R_ + pad_h_); + bool in_bounds_w = (s + pw) >= pad_w_ && (s + pw) < (S_ + pad_w_); mask |= (in_bounds_d && in_bounds_h && in_bounds_w) << j; } masks_ptr[i] = mask; } } - for(size_t i = 0; i < Nlut; ++i) + for(size_t i = 0; i < Luts_; ++i) masks[i] = 0x0; } @@ -95,20 +120,6 @@ public: return {16, 2, 64, 32, 2, 64, 16, 8, 2, 2, 8, 1, 8, 4 }; } - static void init_cst(int stride_d, int stride_h, int stride_w, int stride_c, - int pad_d, int pad_h, int pad_w, - int T, int R, int S, - std::vector &h_delta, std::vector &h_masks) { - int upsample_d = 1; - int upsample_h = 1; - int upsample_w = 1; - int TK = 8; - int F = T * R * S; - int nlut = (TK + F - 1) / F * F; - h_delta.resize(nlut + upsample_d*upsample_h*upsample_w*nlut); - h_masks.resize(nlut + (2*pad_h+1)*(2*pad_w+1)*(2*pad_d+1)*nlut); - build_lut(TK, stride_d, stride_h, stride_w, stride_c, pad_d, pad_h, pad_w, T, R, S, h_delta, h_masks); - } static std::string src(type ty = FPROP) { @@ -191,6 +202,56 @@ public: })"; return res; } + +private: + // image size + int B_; + int NC_; + int D_; + int H_; + int W_; + // filter size + int T_; + int R_; + int S_; + int NF_; + // activation size + int RD_; + int RH_; + int RW_; + // upsampling + int upsample_d_; + int upsample_h_; + int upsample_w_; + // padding + int pad_d_; + int pad_h_; + int pad_w_; + // striding + int stride_d_; + int stride_h_; + int stride_w_; + // equivalent matmul + int M_; + int N_; + int K_; + // helpers + int Fs_; + int TK_; + int Luts_; + // memory strides for data + int32_t stride_a_w_; + int32_t stride_a_h_; + int32_t stride_a_d_; + int32_t stride_a_c_; + int32_t stride_a_n_; + // memory stride for activations + int32_t stride_c_q_; + int32_t stride_c_p_; + int32_t stride_c_m_; + int32_t stride_c_k_; + int32_t stride_c_n_; + }; } From 54f888a27030b5f0afe02b5e5e6c81e1b0a76f6e Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 8 May 2019 10:09:30 -0400 Subject: [PATCH 140/494] [dnn/conv] some minor fixes --- examples/cpp/conv.cpp | 74 ++------- include/triton/dnn/conv.h | 340 ++++++++++++++++++++++++++------------ 2 files changed, 247 insertions(+), 167 deletions(-) diff --git a/examples/cpp/conv.cpp b/examples/cpp/conv.cpp index 0af076612..4d55babe7 100644 --- a/examples/cpp/conv.cpp +++ b/examples/cpp/conv.cpp @@ -9,22 +9,19 @@ int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); - // initialize just-in-time compiler triton::jit jit(context); + triton::dnn::conv::type ty = triton::dnn::conv::BPROP; // initialization int32_t B = 4, NF = 32; int32_t D = 1, H = 24, W = 240; - int32_t NC = 64, T = 1, R = 3, S = 3; + int32_t NC = 32, T = 1, R = 3, S = 3; int32_t pad_d = 0, pad_h = 1, pad_w = 1; int32_t stride_d = 1, stride_h = 1, stride_w = 1; int32_t upsample_d = 1, upsample_h = 1, upsample_w = 1; int32_t RD = (D*upsample_d - T + 1 + 2*pad_d + stride_d - 1)/stride_d; int32_t RH = (H*upsample_h - R + 1 + 2*pad_h + stride_h - 1)/stride_h; int32_t RW = (W*upsample_w - S + 1 + 2*pad_w + stride_w - 1)/stride_w; - // equivalent matmul dimensions - int32_t M = B*RD*RH*RW; - int32_t N = NF; - int32_t K = NC*T*R*S; + triton::dnn::conv configuration(B, NC, H, W, R, S, NF, 1, 1, pad_h, pad_w, ty); // convolution configuration std::vector hc(B*RH*RW*NF); std::vector rc(B*RH*RW*NF); @@ -36,7 +33,8 @@ int main() { for(size_t i = 0; i < hb.size(); i++) 
hb[i] = (float)rand()/RAND_MAX; for(size_t i = 0; i < hc.size(); i++) - hc[i] = 0; + hc[i] = (float)rand()/RAND_MAX; + rc = hc; triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*4); triton::driver::buffer* da = triton::driver::buffer::create(context, ha.size()*4); triton::driver::buffer* db = triton::driver::buffer::create(context, hb.size()*4); @@ -45,80 +43,38 @@ int main() { stream->write(db, true, 0, hb); stream->write(dc, true, 0, hc); stream->synchronize(); - // memory strides for data - int32_t stride_i_w = 1; - int32_t stride_i_h = W*stride_i_w; - int32_t stride_i_d = H*stride_i_h; - int32_t stride_i_c = D*stride_i_d; - int32_t stride_i_n = NC*stride_i_c; - // memory stride for activations - int32_t stride_o_q = 1; - int32_t stride_o_p = RW*stride_o_q; - int32_t stride_o_m = RH*stride_o_p; - int32_t stride_o_k = RD*stride_o_m; - int32_t stride_o_n = NF*stride_o_k; // look-up table - triton::dnn::conv configuration(B, NC, H, W, R, S, NF, 1, 1, 0, 0); std::vector h_delta, h_masks; - configuration.build_lut(h_delta, h_masks); + configuration.build_deltas(h_delta); + configuration.build_masks(h_masks); // benchmark a given convolution kernel auto benchmark = [&](triton::driver::kernel* kernel, triton::jit::launch_information info) { - // launch info unsigned TM = info.global_range_size[0]; unsigned TN = info.global_range_size[1]; - // initialize constant memory + unsigned nthreads = info.num_threads; + std::array grid = configuration.get_grid(TM, TN); triton::driver::buffer* delta = jit.get_buffer("delta"); triton::driver::buffer* masks = jit.get_buffer("masks"); stream->write(delta, false, 0, h_delta.size()*4, h_delta.data()); stream->write(masks, false, 0, h_masks.size()*4, h_masks.data()); stream->synchronize(); - // launch info - unsigned nthreads = info.num_threads; - std::array grid = {(M + TM - 1)/TM, (N + TN - 1)/TN, 1}; - // set arguments - kernel->setArg(0, da); - kernel->setArg(1, db); - kernel->setArg(2, dc); - kernel->setArg(3, M); - kernel->setArg(4, N); - kernel->setArg(5, K); - kernel->setArg(6, B); - kernel->setArg(7, H); - kernel->setArg(8, W); - kernel->setArg(9, NF); - kernel->setArg(10, RH); - kernel->setArg(11, RW); - kernel->setArg(12, NC); - kernel->setArg(13, R); - kernel->setArg(14, S); - kernel->setArg(15, stride_i_n); - kernel->setArg(16, stride_i_c); - kernel->setArg(17, stride_i_h); - kernel->setArg(18, stride_i_w); - kernel->setArg(19, stride_o_n); - kernel->setArg(20, stride_o_k); - kernel->setArg(21, stride_o_p); - kernel->setArg(22, stride_o_q); - kernel->setArg(23, pad_h); - kernel->setArg(24, pad_w); - // dry run + configuration.set_arg(kernel, da, db, dc); stream->enqueue(kernel, grid, {nthreads, 1, 1}); stream->synchronize(); - // benchmark double ts = bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});}, [&](){ stream->synchronize(); }, *context->device()); - return 2.*M*N*K / ts * 1e-3; + return configuration.get_nflops() / ts * 1e-3; }; - std::string src = triton::dnn::conv::src(); + std::string src = configuration.src(); // jit.autotune("conv", src.c_str(), benchmark); - jit.add_module("conv", src.c_str(), triton::dnn::conv::default_params()); + jit.add_module("conv", src.c_str(), configuration.default_params()); triton::driver::kernel* kernel = jit.get_function("conv"); triton::jit::launch_information info = jit.get_launch_info("conv"); std::cout << "Performance: " << benchmark(kernel, info) << " TFLOPS " << std::endl; stream->read(dc, true, 0, hc); - cpp_conv_nchw(NC, B, NF, D, H, W, T, R, S, pad_d, 
pad_h, pad_w, stride_d, stride_h, stride_w, RD, RH, RW, rc, ha, hb); - for(size_t i = 0; i < M*N; i++) + configuration.cpu_ref(rc.data(), ha.data(), hb.data()); + for(size_t i = 0; i < hc.size(); i++) if(std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; exit(EXIT_FAILURE); diff --git a/include/triton/dnn/conv.h b/include/triton/dnn/conv.h index e3fb91d43..11e222f3f 100644 --- a/include/triton/dnn/conv.h +++ b/include/triton/dnn/conv.h @@ -1,5 +1,7 @@ #include #include +#include "triton/driver/stream.h" +#include "triton/driver/kernel.h" namespace triton{ namespace dnn{ @@ -15,10 +17,13 @@ public: conv(int B, int NC, int H, int W, int R, int S, int NF, int upsample_h, int upsample_w, - int pad_h, int pad_w) + int pad_h, int pad_w, + type ty = FPROP) : B_(B), NC_(NC), D_(1), H_(H), W_(W), T_(1), R_(R), S_(S), NF_(NF), upsample_d_(1), upsample_h_(upsample_h), upsample_w_(upsample_w), - pad_d_(0), pad_h_(pad_h), pad_w_(pad_w) + stride_d_(1), stride_h_(1), stride_w_(1), + pad_d_(0), pad_h_(pad_h), pad_w_(pad_w), + ty_(ty) { RD_ = (D_*upsample_d_ - T_ + 1 + 2*pad_d_ + stride_d_ - 1)/stride_d_; RH_ = (H_*upsample_h_ - R_ + 1 + 2*pad_h_ + stride_h_ - 1)/stride_h_; @@ -26,9 +31,6 @@ public: M_ = B*RD_*RH_*RW_; N_ = NF; K_ = NC*T_*R_*S_; - Fs_ = T_*R_*S_; - TK_ = 8; - Luts_ = (TK_ + Fs_ - 1) / Fs_ * Fs_; // memory strides for data stride_a_w_ = 1; stride_a_h_ = W_*stride_a_w_; @@ -41,88 +43,160 @@ public: stride_c_m_ = RH_*stride_c_p_; stride_c_k_ = RD_*stride_c_m_; stride_c_n_ = NF_*stride_c_k_; + // swap a and c for bprop + if(ty_ == BPROP){ + std::swap(stride_a_n_, stride_c_n_); + std::swap(stride_a_c_, stride_c_k_); + std::swap(stride_a_h_, stride_c_p_); + std::swap(stride_a_w_, stride_c_q_); + std::swap(D_, RD_); + std::swap(H_, RH_); + std::swap(W_, RW_); + pad_d_ = (RD_ - D_ + T_ - 1) / 2; + pad_h_ = (RH_ - H_ + R_ - 1) / 2; + pad_w_ = (RW_ - W_ + S_ - 1) / 2; + } + // look-up table info + Fs_ = T_*R_*S_; + TK_ = 8; + Luts_ = (TK_ + Fs_ - 1) / Fs_ * Fs_; } - - void build_lut(std::vector& delta, std::vector& masks) { - delta.resize(Luts_ + upsample_d_*upsample_h_*upsample_w_*Luts_); - masks.resize(Luts_ + (2*pad_h_+1)*(2*pad_w_+1)*(2*pad_d_+1)*Luts_); - - /* unpack index wrt filters */ - auto unpack = [&](int32_t trs){ - int32_t tr = trs / S_; - int32_t s = trs - tr*S_; - int32_t t = tr / R_; - int32_t r = tr - t*R_; - return std::make_tuple(t, r, s); - }; - /* increments */ - for(size_t i = 0; i < Luts_; ++i) - delta[i] = (((i + TK_) % Luts_) - i); - /* deltas */ - size_t Ds0 = Luts_; - size_t Ds1 = upsample_w_; - size_t Ds2 = upsample_h_; - size_t Ds3 = upsample_d_; - for(size_t pd = 0; pd < Ds3; ++pd) - for(size_t ph = 0; ph < Ds2; ++ph) - for(size_t pw = 0; pw < Ds1; ++pw){ - int32_t* deltas_ptr = &delta[Luts_ + pw*Ds0 + ph*Ds0*Ds1 + pd*Ds0*Ds1*Ds2]; - // cumulative increments - for(size_t i = 0; i < Ds0; ++i){ - int32_t ctrs = i; - int32_t c = ctrs / Fs_; - int32_t t, r, s; - std::tie(t, r, s) = unpack(ctrs % Fs_); - // next indices - int32_t nextctrs = ctrs + TK_; - int32_t nextc = nextctrs / Fs_; - int32_t nextt, nextr, nexts; - std::tie(nextt, nextr, nexts) = unpack(nextctrs % Fs_); - // diffs - int32_t cdiff = nextc - c; - int32_t tdiff = (nextt + pd)/upsample_d_ - (t + pd)/upsample_d_; - int32_t rdiff = (nextr + ph)/upsample_h_ - (r + ph)/upsample_h_; - int32_t sdiff = (nexts + pw)/upsample_w_ - (s + pw)/upsample_w_; - // delta pointers - deltas_ptr[i] = cdiff*stride_a_c_ + sdiff*stride_a_w_ + rdiff*stride_a_h_ + 
tdiff*stride_a_d_; - } + void build_deltas(std::vector& deltas){ + deltas.resize(Luts_ + upsample_d_*upsample_h_*upsample_w_*Luts_); + auto unpack = [&](int32_t trs){ + int32_t tr = trs / S_; + int32_t s = trs - tr*S_; + int32_t t = tr / R_; + int32_t r = tr - t*R_; + return std::make_tuple(t, r, s); + }; + for(size_t i = 0; i < Luts_; ++i) + deltas[i] = (((i + TK_) % Luts_) - i); + size_t Ds0 = Luts_; + size_t Ds1 = upsample_w_; + size_t Ds2 = upsample_h_; + size_t Ds3 = upsample_d_; + for(size_t pd = 0; pd < Ds3; ++pd) + for(size_t ph = 0; ph < Ds2; ++ph) + for(size_t pw = 0; pw < Ds1; ++pw){ + int32_t* deltas_ptr = &deltas[Luts_ + pw*Ds0 + ph*Ds0*Ds1 + pd*Ds0*Ds1*Ds2]; + // cumulative increments + for(size_t i = 0; i < Ds0; ++i){ + int32_t ctrs = i; + int32_t c = ctrs / Fs_; + int32_t t, r, s; + std::tie(t, r, s) = unpack(ctrs % Fs_); + // next indices + int32_t nextctrs = ctrs + TK_; + int32_t nextc = nextctrs / Fs_; + int32_t nextt, nextr, nexts; + std::tie(nextt, nextr, nexts) = unpack(nextctrs % Fs_); + // diffs + int32_t cdiff = nextc - c; + int32_t tdiff = (nextt + pd)/upsample_d_ - (t + pd)/upsample_d_; + int32_t rdiff = (nextr + ph)/upsample_h_ - (r + ph)/upsample_h_; + int32_t sdiff = (nexts + pw)/upsample_w_ - (s + pw)/upsample_w_; + // delta pointers + deltas_ptr[i] = cdiff*stride_a_c_ + sdiff*stride_a_w_ + rdiff*stride_a_h_ + tdiff*stride_a_d_; } + } + } - /* Masks */ - size_t Ms0 = Luts_; - size_t Ms1 = 2*pad_w_ + 1; - size_t Ms2 = 2*pad_h_ + 1; - size_t Ms3 = 2*pad_d_ + 1; - for(size_t pd = 0; pd < Ms3; ++pd) - for(size_t ph = 0; ph < Ms2; ++ph) - for(size_t pw = 0; pw < Ms1; ++pw){ - int32_t* masks_ptr = &masks[Luts_ + pw*Ms0 + ph*Ms0*Ms1 + pd*Ms0*Ms1*Ms2]; - for(size_t i = 0; i < Ms0; ++i){ - int32_t t, r, s; - int32_t mask = 0x0; - for(size_t j = 0; j < TK_; ++j){ - std::tie(t, r, s) = unpack((i + j) % Fs_); - bool in_bounds_d = (t + pd) >= pad_d_ && (t + pd) < (T_ + pad_d_); - bool in_bounds_h = (r + ph) >= pad_h_ && (r + ph) < (R_ + pad_h_); - bool in_bounds_w = (s + pw) >= pad_w_ && (s + pw) < (S_ + pad_w_); - mask |= (in_bounds_d && in_bounds_h && in_bounds_w) << j; - } - masks_ptr[i] = mask; - } + void build_masks(std::vector& masks){ + masks.resize(Luts_ + (2*pad_h_+1)*(2*pad_w_+1)*(2*pad_d_+1)*Luts_); + auto unpack = [&](int32_t trs){ + int32_t tr = trs / S_; + int32_t s = trs - tr*S_; + int32_t t = tr / R_; + int32_t r = tr - t*R_; + return std::make_tuple(t, r, s); + }; + size_t Ms0 = Luts_; + size_t Ms1 = 2*pad_w_ + 1; + size_t Ms2 = 2*pad_h_ + 1; + size_t Ms3 = 2*pad_d_ + 1; + for(size_t pd = 0; pd < Ms3; ++pd) + for(size_t ph = 0; ph < Ms2; ++ph) + for(size_t pw = 0; pw < Ms1; ++pw){ + int32_t* masks_ptr = &masks[Luts_ + pw*Ms0 + ph*Ms0*Ms1 + pd*Ms0*Ms1*Ms2]; + for(size_t i = 0; i < Ms0; ++i){ + int32_t t, r, s; + int32_t mask = 0x0; + for(size_t j = 0; j < TK_; ++j){ + std::tie(t, r, s) = unpack((i + j) % Fs_); + bool in_bounds_d = (t + pd) >= pad_d_ && (t + pd) < (T_ + pad_d_); + bool in_bounds_h = (r + ph) >= pad_h_ && (r + ph) < (R_ + pad_h_); + bool in_bounds_w = (s + pw) >= pad_w_ && (s + pw) < (S_ + pad_w_); + mask |= (in_bounds_d && in_bounds_h && in_bounds_w) << j; + } + masks_ptr[i] = mask; } - for(size_t i = 0; i < Luts_; ++i) - masks[i] = 0x0; - + } + for(size_t i = 0; i < Luts_; ++i) + masks[i] = 0x0; } - static std::vector default_params() { - return {16, 2, 64, 32, 2, 64, 16, 8, 2, 2, 8, 1, 8, 4 }; + std::array get_grid(size_t TM, size_t TN){ + return {(M_ + TM - 1)/TM, (N_ + TN - 1)/TN, 1}; + } + + size_t get_nflops(){ + return 2.*M_*N_*K_; + 
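get_grid and get_nflops now encapsulate the launch arithmetic that conv.cpp used to compute inline. A minimal sketch of the ceiling division get_grid relies on (make_grid is an illustrative name), with the example's M = 23040 and N = 32:

    #include <array>
    #include <cstddef>
    // (M + TM - 1)/TM launches enough TM x TN tiles to cover a partial remainder.
    std::array<size_t, 3> make_grid(size_t M, size_t N, size_t TM, size_t TN) {
      return {(M + TM - 1) / TM, (N + TN - 1) / TN, 1};
    }
    // make_grid(23040, 32, 64, 32) == {360, 1, 1}  (23040 divides evenly)
    // make_grid(23041, 32, 64, 32) == {361, 1, 1}  (one extra tile for the tail)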
} + + void set_arg(driver::kernel *kernel, + driver::buffer *a, driver::buffer *b, driver::buffer *c) + { + + if(ty_ == BPROP) + std::swap(a, c); + kernel->setArg(0, a); + kernel->setArg(1, b); + kernel->setArg(2, c); + kernel->setArg(3, M_); + kernel->setArg(4, N_); + kernel->setArg(5, K_); + kernel->setArg(6, B_); + kernel->setArg(7, H_); + kernel->setArg(8, W_); + kernel->setArg(9, NF_); + kernel->setArg(10, RH_); + kernel->setArg(11, RW_); + kernel->setArg(12, NC_); + kernel->setArg(13, R_); + kernel->setArg(14, S_); + kernel->setArg(15, stride_a_n_); + kernel->setArg(16, stride_a_c_); + kernel->setArg(17, stride_a_h_); + kernel->setArg(18, stride_a_w_); + kernel->setArg(19, stride_c_n_); + kernel->setArg(20, stride_c_k_); + kernel->setArg(21, stride_c_p_); + kernel->setArg(22, stride_c_q_); + kernel->setArg(23, pad_h_); + kernel->setArg(24, pad_w_); + } + + std::vector default_params() { + if(ty_ == FPROP) + return {16, 2, 64, 32, 2, 64, 16, 8, 2, 2, 8, 1, 8, 4}; + else + return {16, 2, 64, 16, 32, 16, 4, 2, 2, 4, 2, 8, 4, 2}; } - static std::string src(type ty = FPROP) { - + std::string src() { + std::string bs0 = "TN", bs1 = "TK"; + std::string ldb0 = "*NF", ldb1 = ""; + std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; + std::string b = "b"; + if(ty_ == BPROP){ + std::swap(bs0, bs1); + std::swap(ldb0, ldb1); + std::swap(bcb0, bcb1); + b = "trans(b)"; + } std::string res = R"( const tunable int32 TM = {16, 32, 64}; @@ -158,7 +232,7 @@ public: int32 rar[TK] = racr % R; int32 ra1[TK] = rac*lda_c + rar*lda_h + ras*lda_w; fp32* pa[TM, TK] = a + ra1[newaxis, :] + ra0[:, newaxis]; - fp32* pb[TN, TK] = b + rb1[newaxis, :]*NF + rb0[:, newaxis]; + fp32* pb[)" + bs0 + ", " + bs1 + R"(] = b + rb1)" + bcb1 + ldb0 + " + rb0" + bcb0 + ldb1 + R"(; __constant__ int32* pincd[TK] = delta + rka; __constant__ int32* pd[TK] = delta + R*S + rka; int32 d[TK] = *pd; @@ -172,10 +246,10 @@ public: int32 checka1[TK] = 1 << rka; int1 checka[TM, TK] = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; fp32 a[TM, TK] = checka ? 
*pa : 0; - fp32 b[TN, TK] = *pb; + fp32 b[)" + bs0 + ", " + bs1 + R"(] = *pb; for(int32 k = K; k > 0; k = k - TK){ C = dot(a, trans(b), C); - pb = pb + TK*NF; + pb = pb + TK)" + ldb0 + R"(; pa = pa + d[newaxis, :]; b = *pb; pd = pd + incd; @@ -203,42 +277,90 @@ public: return res; } + template + void cpu_ref(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B) + { + auto idx = [&](int32_t x, int32_t y, int32_t z, int32_t w, int32_t u, + int32_t /*s0*/, int32_t s1, int32_t s2, int32_t s3, int32_t s4) + { return u + w*s4 + z*s4*s3 + y*s4*s3*s2 + x*s4*s3*s2*s1; }; + + if(ty_==BPROP){ + std::swap(A, C); + } + std::cout << A[0] << std::endl; + IN_DTYPE accs[1]; + float tmp[1]; + for(int32_t m = 0 ; m < RD_; ++m) + for(int32_t p = 0 ; p < RH_; ++p) + for(int32_t q = 0; q < RW_; ++q) + for(int32_t n = 0; n < B_; ++n) + for(int32_t k = 0; k < NF_ ; ++k) + { + for(int32_t i = 0; i < 1; ++i) + accs[i] = 0; + int32_t mm = m*stride_d_ - pad_d_; + int32_t pp = p*stride_h_ - pad_h_; + int32_t qq = q*stride_w_ - pad_w_; + for(int32_t kk = 0; kk < 1; ++kk) + for(int32_t c = 0; c < NC_; ++c) + for(int32_t t = 0; t < T_; ++t) + for(int32_t r = 0; r < R_; ++r) + for(int32_t s = 0; s < S_; ++s){ + int32_t d = mm + t; + int32_t h = pp + r; + int32_t w = qq + s; + bool in_bounds = (d >= 0 && h >= 0 && w >= 0 && d < D_ && h < H_ && w < W_); + IN_DTYPE a = in_bounds?A[idx(n, c, d, h, w, B_, NC_, D_, H_, W_)]:0; + IN_DTYPE b; + if(ty_==FPROP) + b = B[idx(c, t, r, s, k*1 + kk, NC_, T_, R_, S_, NF_*1)]; + else + b = B[idx(c, t, s, r, k*1 + kk, NC_, T_, R_, S_, NF_*1)]; + accs[kk] = std::fma(a, b, accs[kk]); + } + for(int32_t kk = 0; kk < 1; ++kk){ + tmp[kk] = accs[kk]; + } + C[idx(n, k, m, p, q, B_, NF_, RD_, RH_, RW_)] = tmp[0]; + } + } + private: // image size - int B_; - int NC_; - int D_; - int H_; - int W_; + int32_t B_; + int32_t NC_; + int32_t D_; + int32_t H_; + int32_t W_; // filter size - int T_; - int R_; - int S_; - int NF_; + int32_t T_; + int32_t R_; + int32_t S_; + int32_t NF_; // activation size - int RD_; - int RH_; - int RW_; + int32_t RD_; + int32_t RH_; + int32_t RW_; // upsampling - int upsample_d_; - int upsample_h_; - int upsample_w_; + int32_t upsample_d_; + int32_t upsample_h_; + int32_t upsample_w_; // padding - int pad_d_; - int pad_h_; - int pad_w_; + int32_t pad_d_; + int32_t pad_h_; + int32_t pad_w_; // striding - int stride_d_; - int stride_h_; - int stride_w_; + int32_t stride_d_; + int32_t stride_h_; + int32_t stride_w_; // equivalent matmul - int M_; - int N_; - int K_; + int32_t M_; + int32_t N_; + int32_t K_; // helpers - int Fs_; - int TK_; - int Luts_; + int32_t Fs_; + int32_t TK_; + int32_t Luts_; // memory strides for data int32_t stride_a_w_; int32_t stride_a_h_; @@ -251,7 +373,9 @@ private: int32_t stride_c_m_; int32_t stride_c_k_; int32_t stride_c_n_; - + // type + type ty_; + bool is_bprop_; }; } From fc4daf11dda6f5b3eb52b3d727be11ad80ef0174 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 8 May 2019 13:58:25 -0400 Subject: [PATCH 141/494] [examples/conv] now deferring shape computations to conv configuration --- examples/cpp/conv.cpp | 16 ++++++-------- include/triton/dnn/conv.h | 45 +++++++++++++++++++++------------------ 2 files changed, 30 insertions(+), 31 deletions(-) diff --git a/examples/cpp/conv.cpp b/examples/cpp/conv.cpp index 4d55babe7..df5b22803 100644 --- a/examples/cpp/conv.cpp +++ b/examples/cpp/conv.cpp @@ -16,24 +16,19 @@ int main() { int32_t D = 1, H = 24, W = 240; int32_t NC = 32, T = 1, R = 3, S = 3; int32_t pad_d = 0, pad_h = 1, pad_w = 1; - int32_t 
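cpu_ref flattens a five-dimensional NCDHW coordinate with the idx lambda; note that s0, the outermost extent, never enters the offset, which is why that parameter is unnamed. A standalone version with one worked value:

    #include <cstdint>
    // Row-major flattening used by cpu_ref; each stride is the product of the
    // trailing extents.
    int32_t idx(int32_t x, int32_t y, int32_t z, int32_t w, int32_t u,
                int32_t s1, int32_t s2, int32_t s3, int32_t s4) {
      return u + w*s4 + z*s4*s3 + y*s4*s3*s2 + x*s4*s3*s2*s1;
    }
    // With (B, NC, D, H, W) = (4, 32, 1, 24, 240):
    // idx(1, 2, 0, 3, 5, 32, 1, 24, 240)
    //   = 5 + 3*240 + 0 + 2*5760 + 1*184320 = 196565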
stride_d = 1, stride_h = 1, stride_w = 1; - int32_t upsample_d = 1, upsample_h = 1, upsample_w = 1; - int32_t RD = (D*upsample_d - T + 1 + 2*pad_d + stride_d - 1)/stride_d; - int32_t RH = (H*upsample_h - R + 1 + 2*pad_h + stride_h - 1)/stride_h; - int32_t RW = (W*upsample_w - S + 1 + 2*pad_w + stride_w - 1)/stride_w; triton::dnn::conv configuration(B, NC, H, W, R, S, NF, 1, 1, pad_h, pad_w, ty); // convolution configuration - std::vector hc(B*RH*RW*NF); - std::vector rc(B*RH*RW*NF); - std::vector ha(B*NC*H*W); - std::vector hb(NC*R*S*NF); + std::vector hc(configuration.c_size()); + std::vector rc(configuration.c_size()); + std::vector ha(configuration.a_size()); + std::vector hb(configuration.b_size()); srand(0); for(size_t i = 0; i < ha.size(); i++) ha[i] = (float)rand()/RAND_MAX; for(size_t i = 0; i < hb.size(); i++) hb[i] = (float)rand()/RAND_MAX; for(size_t i = 0; i < hc.size(); i++) - hc[i] = (float)rand()/RAND_MAX; + hc[i] = 0; rc = hc; triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*4); triton::driver::buffer* da = triton::driver::buffer::create(context, ha.size()*4); @@ -74,6 +69,7 @@ int main() { std::cout << "Performance: " << benchmark(kernel, info) << " TFLOPS " << std::endl; stream->read(dc, true, 0, hc); configuration.cpu_ref(rc.data(), ha.data(), hb.data()); +// std::cout << c[0] << std::endl; for(size_t i = 0; i < hc.size(); i++) if(std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; diff --git a/include/triton/dnn/conv.h b/include/triton/dnn/conv.h index 11e222f3f..85bb1e038 100644 --- a/include/triton/dnn/conv.h +++ b/include/triton/dnn/conv.h @@ -28,9 +28,6 @@ public: RD_ = (D_*upsample_d_ - T_ + 1 + 2*pad_d_ + stride_d_ - 1)/stride_d_; RH_ = (H_*upsample_h_ - R_ + 1 + 2*pad_h_ + stride_h_ - 1)/stride_h_; RW_ = (W_*upsample_w_ - S_ + 1 + 2*pad_w_ + stride_w_ - 1)/stride_w_; - M_ = B*RD_*RH_*RW_; - N_ = NF; - K_ = NC*T_*R_*S_; // memory strides for data stride_a_w_ = 1; stride_a_h_ = W_*stride_a_w_; @@ -52,16 +49,33 @@ public: std::swap(D_, RD_); std::swap(H_, RH_); std::swap(W_, RW_); + std::swap(NF_, NC_); pad_d_ = (RD_ - D_ + T_ - 1) / 2; pad_h_ = (RH_ - H_ + R_ - 1) / 2; pad_w_ = (RW_ - W_ + S_ - 1) / 2; } + // equivalent matmul + M_ = B_*RD_*RH_*RW_; + N_ = NF_; + K_ = NC_*T_*R_*S_; // look-up table info Fs_ = T_*R_*S_; TK_ = 8; Luts_ = (TK_ + Fs_ - 1) / Fs_ * Fs_; } + size_t a_size() { + return B_*NC_*D_*H_*W_; + } + + size_t b_size() { + return NC_*NF_*T_*R_*S_; + } + + size_t c_size() { + return B_*NF_*RD_*RH_*RW_; + } + void build_deltas(std::vector& deltas){ deltas.resize(Luts_ + upsample_d_*upsample_h_*upsample_w_*Luts_); auto unpack = [&](int32_t trs){ @@ -148,9 +162,6 @@ public: void set_arg(driver::kernel *kernel, driver::buffer *a, driver::buffer *b, driver::buffer *c) { - - if(ty_ == BPROP) - std::swap(a, c); kernel->setArg(0, a); kernel->setArg(1, b); kernel->setArg(2, c); @@ -179,10 +190,10 @@ public: } std::vector default_params() { - if(ty_ == FPROP) +// if(ty_ == FPROP) return {16, 2, 64, 32, 2, 64, 16, 8, 2, 2, 8, 1, 8, 4}; - else - return {16, 2, 64, 16, 32, 16, 4, 2, 2, 4, 2, 8, 4, 2}; +// else +// return {16, 2, 64, 16, 32, 16, 4, 2, 2, 4, 2, 8, 4, 2}; } @@ -232,7 +243,7 @@ public: int32 rar[TK] = racr % R; int32 ra1[TK] = rac*lda_c + rar*lda_h + ras*lda_w; fp32* pa[TM, TK] = a + ra1[newaxis, :] + ra0[:, newaxis]; - fp32* pb[)" + bs0 + ", " + bs1 + R"(] = b + rb1)" + bcb1 + ldb0 + " + rb0" + bcb0 + ldb1 + R"(; + fp32* pb[TN, TK] = b + rb1[newaxis, :]*NF 
+ rb0[:, newaxis]; __constant__ int32* pincd[TK] = delta + rka; __constant__ int32* pd[TK] = delta + R*S + rka; int32 d[TK] = *pd; @@ -246,10 +257,10 @@ public: int32 checka1[TK] = 1 << rka; int1 checka[TM, TK] = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; fp32 a[TM, TK] = checka ? *pa : 0; - fp32 b[)" + bs0 + ", " + bs1 + R"(] = *pb; + fp32 b[TN, TK] = *pb; for(int32 k = K; k > 0; k = k - TK){ C = dot(a, trans(b), C); - pb = pb + TK)" + ldb0 + R"(; + pb = pb + TK*NF; pa = pa + d[newaxis, :]; b = *pb; pd = pd + incd; @@ -284,10 +295,6 @@ public: int32_t /*s0*/, int32_t s1, int32_t s2, int32_t s3, int32_t s4) { return u + w*s4 + z*s4*s3 + y*s4*s3*s2 + x*s4*s3*s2*s1; }; - if(ty_==BPROP){ - std::swap(A, C); - } - std::cout << A[0] << std::endl; IN_DTYPE accs[1]; float tmp[1]; for(int32_t m = 0 ; m < RD_; ++m) @@ -311,11 +318,7 @@ public: int32_t w = qq + s; bool in_bounds = (d >= 0 && h >= 0 && w >= 0 && d < D_ && h < H_ && w < W_); IN_DTYPE a = in_bounds?A[idx(n, c, d, h, w, B_, NC_, D_, H_, W_)]:0; - IN_DTYPE b; - if(ty_==FPROP) - b = B[idx(c, t, r, s, k*1 + kk, NC_, T_, R_, S_, NF_*1)]; - else - b = B[idx(c, t, s, r, k*1 + kk, NC_, T_, R_, S_, NF_*1)]; + IN_DTYPE b = B[idx(c, t, r, s, k*1 + kk, NC_, T_, R_, S_, NF_*1)]; accs[kk] = std::fma(a, b, accs[kk]); } for(int32_t kk = 0; kk < 1; ++kk){ From f6fe9492e4e516d76dc19b6353696ceae667c36e Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 11 May 2019 18:09:23 -0400 Subject: [PATCH 142/494] [dnn/conv] added triton-c code for wgrad --- examples/cpp/conv.cpp | 26 ++- include/triton/dnn/conv.h | 444 ++++++++++++++++++++++++-------------- lib/codegen/selection.cpp | 2 + 3 files changed, 299 insertions(+), 173 deletions(-) diff --git a/examples/cpp/conv.cpp b/examples/cpp/conv.cpp index df5b22803..8cddeb588 100644 --- a/examples/cpp/conv.cpp +++ b/examples/cpp/conv.cpp @@ -10,13 +10,13 @@ int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); triton::jit jit(context); - triton::dnn::conv::type ty = triton::dnn::conv::BPROP; + triton::dnn::conv::type ty = triton::dnn::conv::WGRAD; // initialization int32_t B = 4, NF = 32; int32_t D = 1, H = 24, W = 240; int32_t NC = 32, T = 1, R = 3, S = 3; int32_t pad_d = 0, pad_h = 1, pad_w = 1; - triton::dnn::conv configuration(B, NC, H, W, R, S, NF, 1, 1, pad_h, pad_w, ty); + triton::dnn::conv configuration(B, NC, D, H, W, T, R, S, NF, 1, 1, 1, pad_d, pad_h, pad_w, ty); // convolution configuration std::vector hc(configuration.c_size()); std::vector rc(configuration.c_size()); @@ -40,8 +40,10 @@ int main() { stream->synchronize(); // look-up table std::vector h_delta, h_masks; - configuration.build_deltas(h_delta); - configuration.build_masks(h_masks); + if(ty != triton::dnn::conv::WGRAD){ + configuration.build_deltas(h_delta); + configuration.build_masks(h_masks); + } // benchmark a given convolution kernel auto benchmark = [&](triton::driver::kernel* kernel, triton::jit::launch_information info) { @@ -49,10 +51,12 @@ int main() { unsigned TN = info.global_range_size[1]; unsigned nthreads = info.num_threads; std::array grid = configuration.get_grid(TM, TN); - triton::driver::buffer* delta = jit.get_buffer("delta"); - triton::driver::buffer* masks = jit.get_buffer("masks"); - stream->write(delta, false, 0, h_delta.size()*4, h_delta.data()); - stream->write(masks, false, 0, h_masks.size()*4, h_masks.data()); + if(ty != triton::dnn::conv::WGRAD){ + triton::driver::buffer* delta = jit.get_buffer("delta"); + triton::driver::buffer* masks = 
jit.get_buffer("masks"); + stream->write(delta, false, 0, h_delta.size()*4, h_delta.data()); + stream->write(masks, false, 0, h_masks.size()*4, h_masks.data()); + } stream->synchronize(); configuration.set_arg(kernel, da, db, dc); stream->enqueue(kernel, grid, {nthreads, 1, 1}); @@ -69,11 +73,11 @@ int main() { std::cout << "Performance: " << benchmark(kernel, info) << " TFLOPS " << std::endl; stream->read(dc, true, 0, hc); configuration.cpu_ref(rc.data(), ha.data(), hb.data()); -// std::cout << c[0] << std::endl; - for(size_t i = 0; i < hc.size(); i++) + for(size_t i = 0; i < hc.size(); i++){ if(std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; exit(EXIT_FAILURE); - } + } + } std::cout << "Pass!" << std::endl; } diff --git a/include/triton/dnn/conv.h b/include/triton/dnn/conv.h index 85bb1e038..c29bb925b 100644 --- a/include/triton/dnn/conv.h +++ b/include/triton/dnn/conv.h @@ -1,5 +1,7 @@ #include #include +#include +#include #include "triton/driver/stream.h" #include "triton/driver/kernel.h" @@ -15,74 +17,91 @@ public: }; - conv(int B, int NC, int H, int W, int R, int S, int NF, - int upsample_h, int upsample_w, - int pad_h, int pad_w, + conv(int B, int NC, + int D, int H, int W, + int T, int R, int S, int NF, + int upsample_d, int upsample_h, int upsample_w, + int pad_d, int pad_h, int pad_w, type ty = FPROP) - : B_(B), NC_(NC), D_(1), H_(H), W_(W), T_(1), R_(R), S_(S), NF_(NF), - upsample_d_(1), upsample_h_(upsample_h), upsample_w_(upsample_w), + : NB_(B), NC_(NC), AD_(D), AH_(H), AW_(W), BD_(T), BH_(R), BW_(S), NF_(NF), + upsample_d_(upsample_d), upsample_h_(upsample_h), upsample_w_(upsample_w), stride_d_(1), stride_h_(1), stride_w_(1), - pad_d_(0), pad_h_(pad_h), pad_w_(pad_w), + pad_d_(pad_d), pad_h_(pad_h), pad_w_(pad_w), ty_(ty) { - RD_ = (D_*upsample_d_ - T_ + 1 + 2*pad_d_ + stride_d_ - 1)/stride_d_; - RH_ = (H_*upsample_h_ - R_ + 1 + 2*pad_h_ + stride_h_ - 1)/stride_h_; - RW_ = (W_*upsample_w_ - S_ + 1 + 2*pad_w_ + stride_w_ - 1)/stride_w_; - // memory strides for data - stride_a_w_ = 1; - stride_a_h_ = W_*stride_a_w_; - stride_a_d_ = H_*stride_a_h_; - stride_a_c_ = D_*stride_a_d_; - stride_a_n_ = NC_*stride_a_c_; - // memory stride for activations - stride_c_q_ = 1; - stride_c_p_ = RW_*stride_c_q_; - stride_c_m_ = RH_*stride_c_p_; - stride_c_k_ = RD_*stride_c_m_; - stride_c_n_ = NF_*stride_c_k_; + CD_ = (AD_*upsample_d_ - BD_ + 1 + 2*pad_d_ + stride_d_ - 1)/stride_d_; + CH_ = (AH_*upsample_h_ - BH_ + 1 + 2*pad_h_ + stride_h_ - 1)/stride_h_; + CW_ = (AW_*upsample_w_ - BW_ + 1 + 2*pad_w_ + stride_w_ - 1)/stride_w_; + // shapes + shapes_a_ = {NB_, NC_, AD_, AH_, AW_}; + shapes_b_ = {NC_, BD_, BH_, BW_, NF_}; + shapes_c_ = {NB_, NF_, CD_, CH_, CW_}; // swap a and c for bprop if(ty_ == BPROP){ - std::swap(stride_a_n_, stride_c_n_); - std::swap(stride_a_c_, stride_c_k_); - std::swap(stride_a_h_, stride_c_p_); - std::swap(stride_a_w_, stride_c_q_); - std::swap(D_, RD_); - std::swap(H_, RH_); - std::swap(W_, RW_); - std::swap(NF_, NC_); - pad_d_ = (RD_ - D_ + T_ - 1) / 2; - pad_h_ = (RH_ - H_ + R_ - 1) / 2; - pad_w_ = (RW_ - W_ + S_ - 1) / 2; + pad_d_ = (CD_ - AD_ + BD_ - 1) / 2; + pad_h_ = (CH_ - AH_ + BH_ - 1) / 2; + pad_w_ = (CW_ - AW_ + BW_ - 1) / 2; + shapes_a_.swap(shapes_c_); } + // swap b and c for wgrad + if(ty_ == WGRAD){ + shapes_b_.swap(shapes_c_); + } + // leading dimensions + auto set_ld = [](const std::vector& shapes, + std::vector& ld) { + size_t size = shapes.size(); + ld.resize(size); + ld[4] = 1; + 
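set_ld unrolls the usual row-major stride recurrence for rank-5 tensors. The same computation generalized to any rank, as a sketch (make_ld is an illustrative name, not part of the patch):

    #include <cstdint>
    #include <vector>
    std::vector<int32_t> make_ld(const std::vector<int32_t>& shapes) {
      std::vector<int32_t> ld(shapes.size());
      int32_t stride = 1;
      for (int i = static_cast<int>(shapes.size()) - 1; i >= 0; --i) {
        ld[i] = stride;       // elements skipped per unit step in dimension i
        stride *= shapes[i];  // the next-outer dimension strides over all of this one
      }
      return ld;
    }
    // shapes {4, 32, 1, 24, 240} -> ld {184320, 5760, 5760, 240, 1},
    // matching ld[4] = 1, ld[3] = shapes[4]*ld[4], and so on above.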
ld[3] = shapes[4]*ld[4]; + ld[2] = shapes[3]*ld[3]; + ld[1] = shapes[2]*ld[2]; + ld[0] = shapes[1]*ld[1]; + }; + set_ld(shapes_a_, ld_a_); + set_ld(shapes_b_, ld_b_); + set_ld(shapes_c_, ld_c_); // equivalent matmul - M_ = B_*RD_*RH_*RW_; - N_ = NF_; - K_ = NC_*T_*R_*S_; + if(ty_ == WGRAD){ + M_ = shapes_c_[0]*shapes_c_[1]*shapes_c_[2]*shapes_c_[3]; + N_ = shapes_c_[4]; + K_ = shapes_b_[0]*shapes_b_[2]*shapes_b_[3]*shapes_b_[4]; + } + else{ + M_ = shapes_c_[0]*shapes_c_[2]*shapes_c_[3]*shapes_c_[4]; + N_ = shapes_c_[1]; + K_ = shapes_b_[0]*shapes_b_[1]*shapes_b_[2]*shapes_b_[3]; + } // look-up table info - Fs_ = T_*R_*S_; + Fs_ = shapes_b_[1]*shapes_b_[2]*shapes_b_[3]; TK_ = 8; Luts_ = (TK_ + Fs_ - 1) / Fs_ * Fs_; } size_t a_size() { - return B_*NC_*D_*H_*W_; + return std::accumulate(shapes_a_.begin(), shapes_a_.end(), + 1, std::multiplies()); } size_t b_size() { - return NC_*NF_*T_*R_*S_; + return std::accumulate(shapes_b_.begin(), shapes_b_.end(), + 1, std::multiplies()); } size_t c_size() { - return B_*NF_*RD_*RH_*RW_; + return std::accumulate(shapes_c_.begin(), shapes_c_.end(), + 1, std::multiplies()); } void build_deltas(std::vector& deltas){ + if(ty_ == WGRAD) + throw std::runtime_error("no look-up table necessary for wgrad"); deltas.resize(Luts_ + upsample_d_*upsample_h_*upsample_w_*Luts_); auto unpack = [&](int32_t trs){ - int32_t tr = trs / S_; - int32_t s = trs - tr*S_; - int32_t t = tr / R_; - int32_t r = tr - t*R_; + int32_t tr = trs / BW_; + int32_t s = trs - tr*BW_; + int32_t t = tr / BH_; + int32_t r = tr - t*BH_; return std::make_tuple(t, r, s); }; for(size_t i = 0; i < Luts_; ++i) @@ -112,18 +131,20 @@ public: int32_t rdiff = (nextr + ph)/upsample_h_ - (r + ph)/upsample_h_; int32_t sdiff = (nexts + pw)/upsample_w_ - (s + pw)/upsample_w_; // delta pointers - deltas_ptr[i] = cdiff*stride_a_c_ + sdiff*stride_a_w_ + rdiff*stride_a_h_ + tdiff*stride_a_d_; + deltas_ptr[i] = cdiff*ld_a_[1] + tdiff*ld_a_[2] + rdiff*ld_a_[3] + sdiff*ld_a_[4]; } } } void build_masks(std::vector& masks){ + if(ty_ == WGRAD) + throw std::runtime_error("no look-up table necessary for wgrad"); masks.resize(Luts_ + (2*pad_h_+1)*(2*pad_w_+1)*(2*pad_d_+1)*Luts_); auto unpack = [&](int32_t trs){ - int32_t tr = trs / S_; - int32_t s = trs - tr*S_; - int32_t t = tr / R_; - int32_t r = tr - t*R_; + int32_t tr = trs / BW_; + int32_t s = trs - tr*BW_; + int32_t t = tr / BH_; + int32_t r = tr - t*BH_; return std::make_tuple(t, r, s); }; size_t Ms0 = Luts_; @@ -139,9 +160,9 @@ public: int32_t mask = 0x0; for(size_t j = 0; j < TK_; ++j){ std::tie(t, r, s) = unpack((i + j) % Fs_); - bool in_bounds_d = (t + pd) >= pad_d_ && (t + pd) < (T_ + pad_d_); - bool in_bounds_h = (r + ph) >= pad_h_ && (r + ph) < (R_ + pad_h_); - bool in_bounds_w = (s + pw) >= pad_w_ && (s + pw) < (S_ + pad_w_); + bool in_bounds_d = (t + pd) >= pad_d_ && (t + pd) < (BD_ + pad_d_); + bool in_bounds_h = (r + ph) >= pad_h_ && (r + ph) < (BH_ + pad_h_); + bool in_bounds_w = (s + pw) >= pad_w_ && (s + pw) < (BW_ + pad_w_); mask |= (in_bounds_d && in_bounds_h && in_bounds_w) << j; } masks_ptr[i] = mask; @@ -168,46 +189,40 @@ public: kernel->setArg(3, M_); kernel->setArg(4, N_); kernel->setArg(5, K_); - kernel->setArg(6, B_); - kernel->setArg(7, H_); - kernel->setArg(8, W_); - kernel->setArg(9, NF_); - kernel->setArg(10, RH_); - kernel->setArg(11, RW_); - kernel->setArg(12, NC_); - kernel->setArg(13, R_); - kernel->setArg(14, S_); - kernel->setArg(15, stride_a_n_); - kernel->setArg(16, stride_a_c_); - kernel->setArg(17, stride_a_h_); - 
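The rewritten set_arg passes twenty-nine scalar arguments, with the fifteen leading dimensions sitting contiguously at indices 12 through 26. They could equally be set in a loop; a sketch, assuming exactly that argument layout (set_ld_args is an illustrative helper, not part of the patch):

    #include <cstdint>
    #include <vector>
    template <class Kernel>
    void set_ld_args(Kernel* kernel,
                     const std::vector<int32_t>& ld_a,
                     const std::vector<int32_t>& ld_b,
                     const std::vector<int32_t>& ld_c) {
      int i = 12;  // first stride argument: lda_n
      for (const std::vector<int32_t>* ld : {&ld_a, &ld_b, &ld_c})
        for (int32_t v : *ld)
          kernel->setArg(i++, v);  // 12..16 = ld_a, 17..21 = ld_b, 22..26 = ld_c
    }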
kernel->setArg(18, stride_a_w_); - kernel->setArg(19, stride_c_n_); - kernel->setArg(20, stride_c_k_); - kernel->setArg(21, stride_c_p_); - kernel->setArg(22, stride_c_q_); - kernel->setArg(23, pad_h_); - kernel->setArg(24, pad_w_); + kernel->setArg(6, AH_); + kernel->setArg(7, AW_); + kernel->setArg(8, BH_); + kernel->setArg(9, BW_); + kernel->setArg(10, CH_); + kernel->setArg(11, CW_); + kernel->setArg(12, ld_a_[0]); + kernel->setArg(13, ld_a_[1]); + kernel->setArg(14, ld_a_[2]); + kernel->setArg(15, ld_a_[3]); + kernel->setArg(16, ld_a_[4]); + kernel->setArg(17, ld_b_[0]); + kernel->setArg(18, ld_b_[1]); + kernel->setArg(19, ld_b_[2]); + kernel->setArg(20, ld_b_[3]); + kernel->setArg(21, ld_b_[4]); + kernel->setArg(22, ld_c_[0]); + kernel->setArg(23, ld_c_[1]); + kernel->setArg(24, ld_c_[2]); + kernel->setArg(25, ld_c_[3]); + kernel->setArg(26, ld_c_[4]); + kernel->setArg(27, pad_h_); + kernel->setArg(28, pad_w_); } std::vector default_params() { -// if(ty_ == FPROP) + if(ty_ == FPROP || ty_ == BPROP) return {16, 2, 64, 32, 2, 64, 16, 8, 2, 2, 8, 1, 8, 4}; -// else -// return {16, 2, 64, 16, 32, 16, 4, 2, 2, 4, 2, 8, 4, 2}; + else + return {8, 2, 16, 8, 2, 16, 8, 2, 8, 8}; } - std::string src() { - std::string bs0 = "TN", bs1 = "TK"; - std::string ldb0 = "*NF", ldb1 = ""; - std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; - std::string b = "b"; - if(ty_ == BPROP){ - std::swap(bs0, bs1); - std::swap(ldb0, ldb1); - std::swap(bcb0, bcb1); - b = "trans(b)"; - } + std::string xprop() { std::string res = R"( const tunable int32 TM = {16, 32, 64}; @@ -221,36 +236,37 @@ public: read_only restrict fp32 *b, fp32 *c, int32 M, int32 N, int32 K, - int32 B, int32 H, int32 W, - int32 NF, int32 RH, int32 RW, - int32 NC, int32 R, int32 S, - int32 lda_n, int32 lda_c, int32 lda_h, int32 lda_w, - int32 ldc_n, int32 ldc_k, int32 ldc_p, int32 ldc_q, + int32 AH, int32 AW, + int32 BH, int32 BW, + int32 CH, int32 CW, + int32 lda_n, int32 lda_c, int32 lda_d, int32 lda_h, int32 lda_w, + int32 ldb_c, int32 ldb_t, int32 ldb_r, int32 ldb_s, int32 ldb_k, + int32 ldc_n, int32 ldc_k, int32 ldc_m, int32 ldc_p, int32 ldc_q, int32 pad_h, int32 pad_w){ int32 rxa[TM] = get_global_range[TM](0); int32 rb0[TN] = get_global_range[TN](1); int32 rka[TK] = 0 ... TK; int32 rb1[TK] = 0 ... 
TK; fp32 C[TM, TN] = 0; - int32 rabh[TM] = rxa / RW; - int32 raw[TM] = rxa % RW - pad_w; - int32 rab[TM] = rabh / RH; - int32 rah[TM] = rabh % RH - pad_h; + int32 rabh[TM] = rxa / CW; + int32 raw[TM] = rxa % CW - pad_w; + int32 rab[TM] = rabh / CH; + int32 rah[TM] = rabh % CH - pad_h; int32 ra0[TM] = rab*lda_n + rah*lda_h + raw*lda_w; - int32 racr[TK] = rka / S; - int32 ras[TK] = rka % S; - int32 rac[TK] = racr / R; - int32 rar[TK] = racr % R; + int32 racr[TK] = rka / BW; + int32 ras[TK] = rka % BW; + int32 rac[TK] = racr / BH; + int32 rar[TK] = racr % BH; int32 ra1[TK] = rac*lda_c + rar*lda_h + ras*lda_w; fp32* pa[TM, TK] = a + ra1[newaxis, :] + ra0[:, newaxis]; - fp32* pb[TN, TK] = b + rb1[newaxis, :]*NF + rb0[:, newaxis]; + fp32* pb[TN, TK] = b + rb1[newaxis, :]*ldb_s + rb0[:, newaxis]; __constant__ int32* pincd[TK] = delta + rka; - __constant__ int32* pd[TK] = delta + R*S + rka; + __constant__ int32* pd[TK] = delta + BH*BW + rka; int32 d[TK] = *pd; int32 incd[TK] = *pincd; - int32 maskh[TM] = pad_h + min(rah, 0) + max(rah + R - H, 0); - int32 maskw[TM] = pad_w + min(raw, 0) + max(raw + S - W, 0); - __constant__ int32* pm[TM] = masks + R*S + maskw*R*S + maskh*R*S*(2*pad_w + 1); + int32 maskh[TM] = pad_h + min(rah, 0) + max(rah + BH - AH, 0); + int32 maskw[TM] = pad_w + min(raw, 0) + max(raw + BW - AW, 0); + __constant__ int32* pm[TM] = masks + BH*BW + maskw*BH*BW + maskh*BH*BW*(2*pad_w + 1); __constant__ int32* pincm[TM] = delta; int32 incm[TM] = *pincm; int32 checka0[TM] = *pm; @@ -260,7 +276,7 @@ public: fp32 b[TN, TK] = *pb; for(int32 k = K; k > 0; k = k - TK){ C = dot(a, trans(b), C); - pb = pb + TK*NF; + pb = pb + TK*ldb_s; pa = pa + d[newaxis, :]; b = *pb; pd = pd + incd; @@ -276,8 +292,8 @@ public: } int32 rxc[TM] = get_global_range[TM](0); int32 rc1[TN] = get_global_range[TN](1); - int32 rcn[TM] = rxc / (RH*RW); - int32 rcpq[TM] = rxc % (RH*RW); + int32 rcn[TM] = rxc / (CH*CW); + int32 rcpq[TM] = rxc % (CH*CW); int32 rc0[TM] = rcn * ldc_n + rcpq; fp32* pc[TM, TN] = c + rc1[newaxis, :]*ldc_k + rc0[:, newaxis]; int1 checkc0[TM] = rxc < M; @@ -288,62 +304,169 @@ public: return res; } + // C = A * B + // where A is N,C,AH,AW + // B is N,K,BH,BW + // C is C,CH,CW,K + std::string wgrad() { + std::string res = + R"( + const tunable int32 TM = {16, 32, 64}; + const tunable int32 TN = {16, 32, 64}; + const tunable int32 TK = {8}; + + void conv(read_only restrict fp32 *a, + read_only restrict fp32 *b, + fp32 *c, + int32 M, int32 N, int32 K, + int32 AH, int32 AW, + int32 CH, int32 CW, + int32 BH, int32 BW, + int32 lda_n, int32 lda_c, int32 lda_d, int32 lda_h, int32 lda_w, + int32 ldb_n, int32 ldb_k, int32 ldb_m, int32 ldb_p, int32 ldb_q, + int32 ldc_c, int32 ldc_t, int32 ldc_r, int32 ldc_s, int32 ldc_k, + int32 pad_h, int32 pad_w){ + int32 rxa[TM] = get_global_range[TM](0); + int32 ryb[TN] = get_global_range[TN](1); + int32 rk[TK] = 0 ... 
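Unlike xprop, the wgrad kernel reduces over images and output pixels rather than channels and filter taps: the reduction index satisfies rk = (n*BH + p)*BW + q, where BH and BW here are the spatial extents of the error tensor. A scalar sketch of the decomposition used above (unpack_rk is an illustrative name):

    #include <cstdint>
    #include <tuple>
    // Mirrors rknp = rk/BW; rkq = rk%BW; rkn = rknp/BH; rkp = rknp%BH.
    std::tuple<int32_t, int32_t, int32_t> unpack_rk(int32_t rk, int32_t BH, int32_t BW) {
      int32_t rknp = rk / BW;    // image and row, fused
      int32_t q    = rk % BW;    // error column
      int32_t n    = rknp / BH;  // image
      int32_t p    = rknp % BH;  // error row
      return std::make_tuple(n, p, q);
    }
    // e.g. BH = 24, BW = 240: unpack_rk(5771, 24, 240) == (1, 0, 11),
    // since (1*24 + 0)*240 + 11 = 5771.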
TK; + fp32 C[TM, TN] = 0; + int32 racr[TM] = rxa / CW; + int32 raw_base[TM] = rxa % CW - pad_w; + int32 rac[TM] = racr / CH; + int32 rah_base[TM] = racr % CH - pad_h; + fp32* pa_base[TM, TK] = a + rac[:, newaxis]*lda_c; + fp32* pb_base[TN, TK] = b + ryb[:, newaxis]*ldb_k; + for(int32 k = K; k > 0; k = k - TK){ + int32 rknp[TK] = rk / BW; + int32 rkq[TK] = rk % BW; + int32 rkn[TK] = rknp / BH; + int32 rkp[TK] = rknp % BH; + int32 rah[TM, TK] = rah_base[:, newaxis] + rkp[newaxis, :]; + int32 raw[TM, TK] = raw_base[:, newaxis] + rkq[newaxis, :]; + int1 checka[TM, TK] = (rah >= 0) && (rah < AH) && (raw >= 0) && (raw < AW); + fp32* pa[TM, TK] = pa_base + rah*lda_h + raw*lda_w + rkn*lda_n; + fp32* pb[TN, TK] = pb_base + rkp*ldb_p + rkq*ldb_q + rkn*ldb_n; + fp32 A[TM, TK] = checka ? *pa : 0; + fp32 B[TN, TK] = *pb; + C = dot(A, trans(B), C); + rk = rk + TK; + } + int32 rxc[TM] = get_global_range[TM](0); + int32 ryc[TN] = get_global_range[TN](1); + int32 rccr[TM] = rxc / CW; + int32 rcs[TM] = rxa % CW; + int32 rcc[TM] = racr / CH; + int32 rcr[TM] = racr % CH; + int32 rc0[TM] = rcc*ldc_c + rcr*ldc_r + rcs*ldc_s; + fp32* pc[TM, TN] = c + rc0[:, newaxis] + ryc[newaxis, :]*ldc_k; + int1 checkc0[TM] = rxc < M; + int1 checkc1[TN] = ryc < N; + int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; + @checkc *pc = C; + })"; + return res; + } + + std::string src() { + if(ty_ == FPROP || ty_ == BPROP) + return xprop(); + else + return wgrad(); + } + + template + void cpu_xprop(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B) + { + IN_DTYPE acc; + for(int32_t n = 0; n < shapes_c_[0]; ++n) + for(int32_t k = 0; k < shapes_c_[1] ; ++k) + for(int32_t cd = 0 ; cd < shapes_c_[2]; ++cd) + for(int32_t ch = 0 ; ch < shapes_c_[3]; ++ch) + for(int32_t cw = 0; cw < shapes_c_[4]; ++cw) + { + acc = 0; + int32_t d = cd*stride_d_ - pad_d_; + int32_t h = ch*stride_h_ - pad_h_; + int32_t w = cw*stride_w_ - pad_w_; + for(int32_t c = 0; c < shapes_b_[0]; ++c) + for(int32_t bd = 0; bd < shapes_b_[1]; ++bd) + for(int32_t bh = 0; bh < shapes_b_[2]; ++bh) + for(int32_t bw = 0; bw < shapes_b_[3]; ++bw){ + int32_t ad = d + bd; + int32_t ah = h + bh; + int32_t aw = w + bw; + bool in_bounds = (ad >= 0 && ad < shapes_a_[2] && + ah >= 0 && ah < shapes_a_[3] && + aw >= 0 && aw < shapes_a_[4]); + IN_DTYPE a = 0; + if(in_bounds) + a = A[n*ld_a_[0] + c*ld_a_[1] + ad*ld_a_[2] + ah*ld_a_[3] + aw*ld_a_[4]]; + IN_DTYPE b = B[c*ld_b_[0] + bd*ld_b_[1] + bh*ld_b_[2] + bw*ld_b_[3] + k*ld_b_[4]]; + acc = std::fma(a, b, acc); + } + C[n*ld_c_[0] + k*ld_c_[1] + cd*ld_c_[2] + ch*ld_c_[3] + cw*ld_c_[4]] = acc; + } + } + + template + void cpu_wgrad(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B) + { + IN_DTYPE acc; + for(int32_t c = 0 ; c < shapes_c_[0]; ++c) + for(int32_t cd = 0; cd < shapes_c_[1]; ++cd) + for(int32_t ch = 0; ch < shapes_c_[2]; ++ch) + for(int32_t cw = 0; cw < shapes_c_[3]; ++cw) + for(int32_t k = 0 ; k < shapes_c_[4]; ++k) + { + acc = 0; + int32_t d = cd*stride_d_ - pad_d_; + int32_t h = ch*stride_h_ - pad_h_; + int32_t w = cw*stride_w_ - pad_w_; + for(int32_t n = 0; n < shapes_b_[0]; ++n) + for(int32_t bd = 0; bd < shapes_b_[2]; ++bd) + for(int32_t bh = 0; bh < shapes_b_[3]; ++bh) + for(int32_t bw = 0; bw < shapes_b_[4]; ++bw){ + int32_t ad = d + bd; + int32_t ah = h + bh; + int32_t aw = w + bw; + bool in_bounds = (ad >= 0 && ad < shapes_a_[2] && + ah >= 0 && ah < shapes_a_[3] && + aw >= 0 && aw < shapes_a_[4]); + IN_DTYPE a = 0; + if(in_bounds) + a = A[n*ld_a_[0] + c*ld_a_[1] + ad*ld_a_[2] + ah*ld_a_[3] + aw*ld_a_[4]]; + IN_DTYPE b = 
B[n*ld_b_[0] + k*ld_b_[1] + bd*ld_b_[2] + bh*ld_b_[3] + bw*ld_b_[4]]; + acc = std::fma(a, b, acc); + } + C[c*ld_c_[0] + cd*ld_c_[1] + ch*ld_c_[2] + cw*ld_c_[3] + k*ld_c_[4]] = acc; + } + } + template void cpu_ref(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B) { - auto idx = [&](int32_t x, int32_t y, int32_t z, int32_t w, int32_t u, - int32_t /*s0*/, int32_t s1, int32_t s2, int32_t s3, int32_t s4) - { return u + w*s4 + z*s4*s3 + y*s4*s3*s2 + x*s4*s3*s2*s1; }; - - IN_DTYPE accs[1]; - float tmp[1]; - for(int32_t m = 0 ; m < RD_; ++m) - for(int32_t p = 0 ; p < RH_; ++p) - for(int32_t q = 0; q < RW_; ++q) - for(int32_t n = 0; n < B_; ++n) - for(int32_t k = 0; k < NF_ ; ++k) - { - for(int32_t i = 0; i < 1; ++i) - accs[i] = 0; - int32_t mm = m*stride_d_ - pad_d_; - int32_t pp = p*stride_h_ - pad_h_; - int32_t qq = q*stride_w_ - pad_w_; - for(int32_t kk = 0; kk < 1; ++kk) - for(int32_t c = 0; c < NC_; ++c) - for(int32_t t = 0; t < T_; ++t) - for(int32_t r = 0; r < R_; ++r) - for(int32_t s = 0; s < S_; ++s){ - int32_t d = mm + t; - int32_t h = pp + r; - int32_t w = qq + s; - bool in_bounds = (d >= 0 && h >= 0 && w >= 0 && d < D_ && h < H_ && w < W_); - IN_DTYPE a = in_bounds?A[idx(n, c, d, h, w, B_, NC_, D_, H_, W_)]:0; - IN_DTYPE b = B[idx(c, t, r, s, k*1 + kk, NC_, T_, R_, S_, NF_*1)]; - accs[kk] = std::fma(a, b, accs[kk]); - } - for(int32_t kk = 0; kk < 1; ++kk){ - tmp[kk] = accs[kk]; - } - C[idx(n, k, m, p, q, B_, NF_, RD_, RH_, RW_)] = tmp[0]; - } + if(ty_ == FPROP || ty_ == BPROP) + cpu_xprop(C, A, B); + else + cpu_wgrad(C, A, B); } private: // image size - int32_t B_; + int32_t NB_; int32_t NC_; - int32_t D_; - int32_t H_; - int32_t W_; + int32_t AD_; + int32_t AH_; + int32_t AW_; // filter size - int32_t T_; - int32_t R_; - int32_t S_; + int32_t BD_; + int32_t BH_; + int32_t BW_; int32_t NF_; // activation size - int32_t RD_; - int32_t RH_; - int32_t RW_; + int32_t CD_; + int32_t CH_; + int32_t CW_; // upsampling int32_t upsample_d_; int32_t upsample_h_; @@ -364,18 +487,15 @@ private: int32_t Fs_; int32_t TK_; int32_t Luts_; - // memory strides for data - int32_t stride_a_w_; - int32_t stride_a_h_; - int32_t stride_a_d_; - int32_t stride_a_c_; - int32_t stride_a_n_; - // memory stride for activations - int32_t stride_c_q_; - int32_t stride_c_p_; - int32_t stride_c_m_; - int32_t stride_c_k_; - int32_t stride_c_n_; + // memory strides for A + std::vector shapes_a_; + std::vector ld_a_; + // memory strides for B + std::vector shapes_b_; + std::vector ld_b_; + // memory stride for C + std::vector shapes_c_; + std::vector ld_c_; // type type ty_; bool is_bprop_; diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 7927e5400..6b638abaa 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -812,7 +812,9 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & std::swap(b_idx[0], b_idx[1]); Value *a = TA->get_value(a_idx); Value *b = TB->get_value(b_idx); +// res = builder.CreateCall(f_mul_add, {ConstantFP::get(a->getType(), 1), ConstantFP::get(b->getType(), 1), res}); res = builder.CreateCall(f_mul_add, {a, b, res}); + } result->set_value(idx, res); }); From 5941501f70a00346f7065f54217ee97c980c423d Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 13 May 2019 00:38:26 -0400 Subject: [PATCH 143/494] [dnn] added Triton-C derivative computations in conv --- examples/cpp/conv.cpp | 4 +- examples/python/pytorch/conv.cpp | 383 ++++++++++--------------------- examples/python/pytorch/main.py | 49 +++- include/triton/dnn/conv.h | 77 
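The examples accept results when |hc[i] - rc[i]| / max(hc[i], rc[i]) stays below 1e-4; that denominator misbehaves when both values are zero or both are negative. A slightly hardened variant, as a sketch (almost_equal and the epsilon floor are illustrative, not from the patch):

    #include <algorithm>
    #include <cmath>
    bool almost_equal(float x, float y, float tol = 1e-4f) {
      // the floor avoids 0/0; absolute values keep the ratio meaningful
      // when both operands are negative
      float denom = std::max(std::max(std::fabs(x), std::fabs(y)), 1e-30f);
      return std::fabs(x - y) / denom <= tol;
    }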
+++++-- include/triton/driver/dispatch.h | 14 +- 5 files changed, 246 insertions(+), 281 deletions(-) diff --git a/examples/cpp/conv.cpp b/examples/cpp/conv.cpp index 8cddeb588..5d4d20c7d 100644 --- a/examples/cpp/conv.cpp +++ b/examples/cpp/conv.cpp @@ -10,7 +10,7 @@ int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); triton::jit jit(context); - triton::dnn::conv::type ty = triton::dnn::conv::WGRAD; + triton::dnn::conv::type ty = triton::dnn::conv::FPROP; // initialization int32_t B = 4, NF = 32; int32_t D = 1, H = 24, W = 240; @@ -77,7 +77,7 @@ int main() { if(std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; exit(EXIT_FAILURE); - } + } } std::cout << "Pass!" << std::endl; } diff --git a/examples/python/pytorch/conv.cpp b/examples/python/pytorch/conv.cpp index 2aa46175d..09a3f6eaa 100644 --- a/examples/python/pytorch/conv.cpp +++ b/examples/python/pytorch/conv.cpp @@ -4,170 +4,69 @@ #include #include "triton/jit.h" #include "triton/driver/stream.h" +#include "triton/dnn/conv.h" #define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) -const char* src = -R"( -const tunable int32 TM = {16, 32, 64}; -const tunable int32 TN = {16, 32, 64}; -const tunable int32 TK = {8}; - -__constant__ int32* delta = alloc_const int32[18]; -__constant__ int32* masks = alloc_const int32[1024]; - -void conv(read_only restrict fp32 *a, - read_only restrict fp32 *b, - fp32 *c, - int32 M, int32 N, int32 K, - int32 AN, int32 AH, int32 AW, - int32 CN, int32 CK, int32 CP, int32 CQ, - int32 AC, int32 AR, int32 AS, - int32 lda_n, int32 lda_c, int32 lda_h, int32 lda_w, - int32 ldc_n, int32 ldc_k, int32 ldc_p, int32 ldc_q, - int32 pad_h, int32 pad_w, - int32 bound){ - int32 rxa[TM] = get_global_range[TM](0); - int32 rb0[TN] = get_global_range[TN](1); - int32 rka[TK] = 0 ... TK; - int32 rb1[TK] = 0 ... TK; - fp32 C[TM, TN] = 0; - int32 ranh[TM] = rxa / CQ; - int32 raw[TM] = rxa % CQ - pad_w; - int32 ran[TM] = ranh / CP; - int32 rah[TM] = ranh % CP - pad_h; - int32 ra0[TM] = ran*lda_n + rah*lda_h + raw*lda_w; - int32 racr[TK] = rka / AS; - int32 ras[TK] = rka % AS; - int32 rac[TK] = racr / AR; - int32 rar[TK] = racr % AR; - int32 ra1[TK] = rac*lda_c + rar*lda_h + ras*lda_w; - fp32* pa[TM, TK] = a + ra1[newaxis, :] + ra0[:, newaxis]; - fp32* pb[TN, TK] = b + rb1[newaxis, :]*CK + rb0[:, newaxis]; - __constant__ int32* pincd[TK] = delta + rka; - __constant__ int32* pd[TK] = delta + AR*AS + rka; - int32 d[TK] = *pd; - int32 incd[TK] = *pincd; - int32 maskh[TM] = pad_h + min(rah, 0) + max(rah + AR - AH, 0); - int32 maskw[TM] = pad_w + min(raw, 0) + max(raw + AS - AW, 0); - __constant__ int32* pm[TM] = masks + AR*AS + maskw*AR*AS + maskh*AR*AS*(2*pad_w + 1); - __constant__ int32* pincm[TM] = delta; - int32 incm[TM] = *pincm; - int32 checka0[TM] = *pm; - int32 checka1[TK] = 1 << rka; - int1 checka[TM, TK] = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; - fp32 a[TM, TK] = checka ? 
*pa : 0; - fp32 b[TN, TK] = *pb; - for(int32 k = K; k > 0; k = k - TK){ - C = dot(a, trans(b), C); - pb = pb + TK*CK; - pa = pa + d[newaxis, :]; - b = *pb; - pd = pd + incd; - pincd = pincd + incd; - d = *pd; - incd = *pincd; - pm = pm + incm; - pincm = pincm + incm; - incm = *pincm; - checka0 = *pm; - checka = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; - a = checka ? *pa : 0; - } - int32 rxc[TM] = get_global_range[TM](0); - int32 rc1[TN] = get_global_range[TN](1); - int32 rcn[TM] = rxc / (CP*CQ); - int32 rcpq[TM] = rxc % (CP*CQ); - int32 rc0[TM] = rcn * ldc_n + rcpq; - fp32* pc[TM, TN] = c + rc1[newaxis, :]*ldc_k + rc0[:, newaxis]; - int1 checkc0[TM] = rxc < M; - int1 checkc1[TN] = rc1 < N; - int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - @checkc *pc = C; -})"; - -void build_conv_lut(int TK, - int stride_d, int stride_h, int stride_w, int stride_c, - int pad_d, int pad_h, int pad_w, - int T, int R, int S, - std::vector& res, std::vector& masks) { - /* convolution parameters */ - int F = T * R * S; - int Nlut = (TK + F - 1) / F * F; - int upsample_w = 1; - int upsample_h = 1; - int upsample_d = 1; - /* unpack index wrt filters */ - auto unpack = [&](int32_t trs){ - int32_t tr = trs / S; - int32_t s = trs - tr*S; - int32_t t = tr / R; - int32_t r = tr - t*R; - return std::make_tuple(t, r, s); - }; - /* increments */ - for(size_t i = 0; i < Nlut; ++i) - res[i] = (((i + TK) % Nlut) - i); - /* deltas */ - size_t Ds0 = Nlut; - size_t Ds1 = upsample_w; - size_t Ds2 = upsample_h; - size_t Ds3 = upsample_d; - for(size_t pd = 0; pd < Ds3; ++pd) - for(size_t ph = 0; ph < Ds2; ++ph) - for(size_t pw = 0; pw < Ds1; ++pw){ - int32_t* deltas_ptr = &res[Nlut + pw*Ds0 + ph*Ds0*Ds1 + pd*Ds0*Ds1*Ds2]; - // cumulative increments - for(size_t i = 0; i < Ds0; ++i){ - int32_t ctrs = i; - int32_t c = ctrs / F; - int32_t t, r, s; - std::tie(t, r, s) = unpack(ctrs % F); - // next indices - int32_t nextctrs = ctrs + TK; - int32_t nextc = nextctrs / F; - int32_t nextt, nextr, nexts; - std::tie(nextt, nextr, nexts) = unpack(nextctrs % F); - // diffs - int32_t cdiff = nextc - c; - int32_t tdiff = (nextt + pd)/upsample_d - (t + pd)/upsample_d; - int32_t rdiff = (nextr + ph)/upsample_h - (r + ph)/upsample_h; - int32_t sdiff = (nexts + pw)/upsample_w - (s + pw)/upsample_w; - // delta pointers - deltas_ptr[i] = cdiff*stride_c + sdiff*stride_w + rdiff*stride_h + tdiff*stride_d; - } +torch::Tensor conv_common( + int32_t B, int32_t C, int32_t D, int32_t H, int32_t W, + int32_t T, int32_t R, int32_t S, int32_t NF, + int32_t stride_d, int32_t stride_h, int32_t stride_w, + int32_t pad_d, int32_t pad_h, int32_t pad_w, + triton::dnn::conv::type ty, + torch::Tensor torcha, torch::Tensor torchb + ) { + // Configuration + triton::dnn::conv configuration(B, C, D, H, W, T, R, S, NF, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, ty); + // Allocate output + std::vector c_shapes = configuration.c_shapes(); + torch::Tensor torchc; + if(ty == triton::dnn::conv::WGRAD) + torchc = torch::empty({c_shapes[0], c_shapes[2], c_shapes[3], c_shapes[4]}, torch::kFloat).cuda(); + else + torchc = torch::empty({c_shapes[0], c_shapes[1], c_shapes[3], c_shapes[4]}, torch::kFloat).cuda(); + // Wrap CUDA handles + c10::DeviceIndex device = torchc.storage().device().index(); + triton::driver::cu_stream sstream((CUstream)at::cuda::getCurrentCUDAStream(device).stream(), false); + triton::driver::stream* stream = &sstream; + triton::driver::context* ctx = stream->context(); + triton::driver::cu_buffer a(ctx, 
(CUdeviceptr)torcha.storage().data(), false); + triton::driver::cu_buffer b(ctx, (CUdeviceptr)torchb.storage().data(), false); + triton::driver::cu_buffer c(ctx, (CUdeviceptr)torchc.storage().data(), false); + stream->synchronize(); + // Create JIT + triton::jit jit(ctx); + std::string src = configuration.src(); + jit.add_module("conv", src.c_str(), configuration.default_params()); + triton::driver::kernel* kernel = jit.get_function("conv"); + triton::jit::launch_information info = jit.get_launch_info("conv"); + // launch info + unsigned TM = info.global_range_size[0]; + unsigned TN = info.global_range_size[1]; + // initialize constant memory + if(ty != triton::dnn::conv::WGRAD){ + std::vector h_delta; + std::vector h_masks; + configuration.build_deltas(h_delta); + configuration.build_masks(h_masks); + triton::driver::buffer* delta = jit.get_buffer("delta"); + triton::driver::buffer* masks = jit.get_buffer("masks"); + stream->write(delta, false, 0, h_delta.size()*4, h_delta.data()); + stream->write(masks, false, 0, h_masks.size()*4, h_masks.data()); } - - /* Masks */ - size_t Ms0 = Nlut; - size_t Ms1 = 2*pad_w + 1; - size_t Ms2 = 2*pad_h + 1; - size_t Ms3 = 2*pad_d + 1; - - for(size_t pd = 0; pd < Ms3; ++pd) - for(size_t ph = 0; ph < Ms2; ++ph) - for(size_t pw = 0; pw < Ms1; ++pw){ - int32_t* masks_ptr = &masks[Nlut + pw*Ms0 + ph*Ms0*Ms1 + pd*Ms0*Ms1*Ms2]; - for(size_t i = 0; i < Ms0; ++i){ - int32_t t, r, s; - int32_t mask = 0x0; - for(size_t j = 0; j < TK; ++j){ - std::tie(t, r, s) = unpack((i + j) % F); - bool in_bounds_d = (t + pd) >= pad_d && (t + pd) < (T + pad_d); - bool in_bounds_h = (r + ph) >= pad_h && (r + ph) < (R + pad_h); - bool in_bounds_w = (s + pw) >= pad_w && (s + pw) < (S + pad_w); - mask |= (in_bounds_d && in_bounds_h && in_bounds_w) << j; - } - masks_ptr[i] = mask; - } - } - for(size_t i = 0; i < Nlut; ++i) - masks[i] = 0x0; + // launch info + unsigned nthreads = info.num_threads; + std::array grid = configuration.get_grid(TM, TN); + configuration.set_arg(kernel, &a, &b, &c); + stream->synchronize(); + stream->enqueue(kernel, grid, {nthreads, 1, 1}); + stream->synchronize(); + return torchc; } -torch::Tensor conv_forward( +torch::Tensor conv_fprop( const torch::Tensor data, const torch::Tensor weight) { // Check @@ -176,6 +75,7 @@ torch::Tensor conv_forward( // Unpack data shapes const int32_t B = data.size(0); const int32_t Ci = data.size(1); + const int32_t D = 1; const int32_t H = data.size(2); const int32_t W = data.size(3); // Unpack weight shapes @@ -184,109 +84,76 @@ torch::Tensor conv_forward( const int32_t R = weight.size(1); const int32_t S = weight.size(2); const int32_t NF = weight.size(3); - // Conv parameters - int32_t upsample_d = 1, upsample_h = 1, upsample_w = 1; - int32_t pad_d = 0, pad_h = 0, pad_w = 0; - int32_t stride_h = 1, stride_w = 1; - // Output shapes - int32_t P = (H*upsample_h - R + 1 + 2*pad_h + stride_h - 1)/stride_h; - int32_t Q = (W*upsample_w - S + 1 + 2*pad_w + stride_w - 1)/stride_w; - // Allocate output + // Configuration + const int32_t stride_d = 1, stride_h = 1, stride_w = 1; + const int32_t pad_d = 0, pad_h = 1, pad_w = 1; + // Check AT_CHECK(Ci == Cf, "Number of channels in data and weights must match"); - torch::Tensor output = torch::empty({B, NF, P, Q}, torch::kFloat).cuda(); - // Wrap CUDA handles - c10::DeviceIndex device = output.storage().device().index(); - triton::driver::cu_stream sstream((CUstream)at::cuda::getCurrentCUDAStream(device).stream(), false); - triton::driver::stream* stream = &sstream; - 
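conv_common wraps PyTorch's current stream and the tensors' storage in Triton driver objects instead of allocating anything itself; the trailing false arguments are presumably non-ownership flags, so Triton never frees memory or streams that PyTorch still manages. A condensed sketch of the pattern under that assumption (headers as in this file, plus ATen's CUDAContext):

    // Hedged sketch: `false` is assumed to mean "do not take ownership".
    torch::Tensor x = torch::empty({16, 64, 8, 8}, torch::kFloat).cuda();
    c10::DeviceIndex dev = x.storage().device().index();
    triton::driver::cu_stream sstream(
        (CUstream)at::cuda::getCurrentCUDAStream(dev).stream(), false);
    triton::driver::stream* stream = &sstream;
    triton::driver::cu_buffer buf(
        stream->context(), (CUdeviceptr)x.storage().data(), false);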
triton::driver::context* ctx = stream->context(); - triton::driver::cu_buffer d(ctx, (CUdeviceptr)data.storage().data(), false); - triton::driver::cu_buffer w(ctx, (CUdeviceptr)weight.storage().data(), false); - triton::driver::cu_buffer a(ctx, (CUdeviceptr)output.storage().data(), false); - // Create JIT - triton::jit jit(ctx); - std::vector params = { - 16, 2, 64, - 32, 2, 64, - 16, 8, 2, 2, - 8, 1, 8, - 4 - }; - jit.add_module("conv", src, params); - triton::driver::kernel* kernel = jit.get_function("conv"); - triton::jit::launch_information info = jit.get_launch_info("conv"); - // launch info - unsigned TM = info.global_range_size[0]; - unsigned TN = info.global_range_size[1]; - unsigned TK = jit.get_int("TK"); - // initialize constant memory - int FS = T*R*S; - int nlut = (TK + FS - 1) / FS * FS; - std::vector h_delta(nlut + upsample_d*upsample_h*upsample_w*nlut); - std::vector h_masks(nlut + (2*pad_h+1)*(2*pad_w+1)*(2*pad_d+1)*nlut); - // memory stride for images - int32_t stride_i_w = 1; - int32_t stride_i_h = W*stride_i_w; - int32_t stride_i_d = H*stride_i_h; - int32_t stride_i_c = 1*stride_i_d; - int32_t stride_i_n = Ci*stride_i_c; - // memory stride for activations - int32_t stride_o_q = 1; - int32_t stride_o_p = Q*stride_o_q; - int32_t stride_o_m = P*stride_o_p; - int32_t stride_o_k = 1*stride_o_m; - int32_t stride_o_n = NF*stride_o_k; - build_conv_lut(TK, stride_i_d, stride_i_h, stride_i_w, stride_i_c, pad_d, pad_h, pad_w, T, R, S, h_delta, h_masks); - // equivalent matmul dimensions - int32_t M = B*P*Q; - int32_t N = NF; - int32_t K = Ci*R*S; - triton::driver::buffer* delta = jit.get_buffer("delta"); - triton::driver::buffer* masks = jit.get_buffer("masks"); - stream->write(delta, false, 0, h_delta.size()*4, h_delta.data()); - stream->write(masks, false, 0, h_masks.size()*4, h_masks.data()); - // launch info - unsigned nthreads = info.num_threads; - std::array grid = {(M + TM - 1)/TM, (N + TN - 1)/TN, 1}; - // fast bounds-checking - unsigned lasti = (grid[0]*TM - 1)*TM + TM - 1; - unsigned lastj = (grid[1]*TN - 1)*TN + TN - 1; - unsigned lastk = TK - 1; - bool AT = false; - bool BT = true; - unsigned last_safe_a = (AT==false)?(M*K - 1 - lasti)/M - lastk : M*K - 1 - lasti*K - lastk; - unsigned last_safe_b = (BT==true)?(N*K - 1 - lastj)/N - lastk : N*K - 1 - lastj*K - lastk; - int32_t bound = std::max(1, std::max(K - last_safe_a, K - last_safe_b)); - // set arguments - kernel->setArg(0, *d.cu()); - kernel->setArg(1, *w.cu()); - kernel->setArg(2, *a.cu()); - kernel->setArg(3, M); - kernel->setArg(4, N); - kernel->setArg(5, K); - kernel->setArg(6, B); - kernel->setArg(7, H); - kernel->setArg(8, W); - kernel->setArg(9, NF); - kernel->setArg(10, P); - kernel->setArg(11, Q); - kernel->setArg(12, Ci); - kernel->setArg(13, R); - kernel->setArg(14, S); - kernel->setArg(15, stride_i_n); - kernel->setArg(16, stride_i_c); - kernel->setArg(17, stride_i_h); - kernel->setArg(18, stride_i_w); - kernel->setArg(19, stride_o_n); - kernel->setArg(20, stride_o_k); - kernel->setArg(21, stride_o_p); - kernel->setArg(22, stride_o_q); - kernel->setArg(23, pad_h); - kernel->setArg(24, pad_w); - kernel->setArg(25, bound); -// // dry run - stream->enqueue(kernel, grid, {nthreads, 1, 1}); - return output; + return conv_common(B, Ci, D, H, W, T, R, S, NF, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, triton::dnn::conv::FPROP, data, weight); +} + +torch::Tensor conv_bprop( + const torch::Tensor derror, + const torch::Tensor weight){ + // Check + CHECK_INPUT(derror); + CHECK_INPUT(weight); + // Unpack 
data shapes + const int32_t B = derror.size(0); + const int32_t Ki = derror.size(1); + const int32_t M = 1; + const int32_t P = derror.size(2); + const int32_t Q = derror.size(3); + // Unpack weight shapes + const int32_t C = weight.size(0); + const int32_t T = 1; + const int32_t R = weight.size(1); + const int32_t S = weight.size(2); + const int32_t Kw = weight.size(3); + // Compute M, P, Q + const int32_t upsample_d = 1, upsample_h = 1, upsample_w = 1; + const int32_t stride_d = 1, stride_h = 1, stride_w = 1; + const int32_t pad_d = 0, pad_h = 1, pad_w = 1; + const int32_t D = M*stride_d + T - 1 - 2*pad_d + stride_d - 1 / upsample_d; + const int32_t H = P*stride_d + R - 1 - 2*pad_h + stride_h - 1 / upsample_h; + const int32_t W = Q*stride_d + S - 1 - 2*pad_w + stride_w - 1 / upsample_w; + // Check + AT_CHECK(Ki == Kw, "Number of channels in error and weights must match"); + return conv_common(B, C, D, H, W, T, R, S, Kw, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, triton::dnn::conv::BPROP, derror, weight); +} + +torch::Tensor conv_wgrad( + const torch::Tensor data, + const torch::Tensor derror + ){ + // Check + CHECK_INPUT(data); + CHECK_INPUT(derror); + // Unpack data shapes + const int32_t Ba = derror.size(0); + const int32_t C = derror.size(1); + const int32_t D = 1; + const int32_t H = derror.size(2); + const int32_t W = derror.size(3); + // Unpack error shapes + const int32_t Bb = derror.size(0); + const int32_t K = derror.size(1); + const int32_t M = 1; + const int32_t P = derror.size(2); + const int32_t Q = derror.size(3); + // Compute M, P, Q + const int32_t upsample_d = 1, upsample_h = 1, upsample_w = 1; + const int32_t stride_d = 1, stride_h = 1, stride_w = 1; + const int32_t pad_d = 0, pad_h = 1, pad_w = 1; + const int32_t T = (D - M*stride_d + 1 + 2*pad_d - stride_d + 1)*upsample_d; + const int32_t R = (H - P*stride_h + 1 + 2*pad_h - stride_h + 1)*upsample_h; + const int32_t S = (W - Q*stride_w + 1 + 2*pad_w - stride_w + 1)*upsample_w; + // Check + AT_CHECK(Ba == Bb, "Number of channels in error and weights must match"); + return conv_common(Ba, C, D, H, W, T, R, S, K, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, triton::dnn::conv::WGRAD, data, derror); } static auto registry = - torch::jit::RegisterOperators("triton::conv_forward", &conv_forward); + torch::jit::RegisterOperators("triton::conv_fprop", &conv_fprop) + .op("triton::conv_bprop", &conv_bprop) + .op("triton::conv_wgrad", &conv_wgrad); diff --git a/examples/python/pytorch/main.py b/examples/python/pytorch/main.py index d4b11e316..c0568f8b4 100644 --- a/examples/python/pytorch/main.py +++ b/examples/python/pytorch/main.py @@ -1,9 +1,48 @@ import torch -from torch.autograd import Variable +torch.manual_seed(0) +class TritonConv(torch.autograd.Function): + + @staticmethod + def forward(ctx, input, weight): + ctx.save_for_backward(input, weight) + output = torch.ops.triton.conv_fprop(input, weight) + return output + + @staticmethod + def backward(ctx, grad_output): + input, weight = ctx.saved_tensors + grad_input = grad_weight = None + if ctx.needs_input_grad[0]: + grad_input = torch.ops.triton.conv_bprop(grad_output.contiguous(), weight) + if ctx.needs_input_grad[1]: + grad_weight = torch.ops.triton.conv_wgrad(input, grad_output.contiguous()) + return grad_input, grad_weight + + torch.ops.load_library("/home/philippe/Development/triton/build/examples/python/pytorch/libtorch_triton.so") -d = torch.empty(64, 64, 64, 64).uniform_(0, 1).cuda() -w = torch.empty(64, 3, 3, 64).uniform_(0, 1).cuda() -a = 
torch.ops.triton.conv_forward(d, w) -print(a) +x = torch.autograd.Variable(torch.randn(16, 64, 8, 8).cuda(), requires_grad=True) +w = torch.autograd.Variable(torch.randn(64, 3, 3, 64).cuda(), requires_grad=True) +cuw = torch.autograd.Variable(w.permute(3,0,1,2).cuda(), requires_grad=True) +y_target = torch.autograd.Variable(torch.randn(16, 64, 8, 8).cuda(), requires_grad=True) + +def run(x, w, conv): + y = conv(x, w) + loss = (y - y_target).norm(2) + loss.backward() + return loss, y.clone(), x.grad.clone(), w.grad.clone() + +ttyloss, tty, ttdx, ttdw = run(x, w, TritonConv.apply) +x.grad.zero_() +w.grad.zero_() +culoss, cuy, cudx, cudw = run(x, cuw, lambda x, w: torch.nn.functional.conv2d(x, w, padding=1)) + +print((tty - cuy).norm(2)) +print((ttdx - cudx).norm(2)) +print((ttdw.permute(3,0,1,2) - cudw).norm(2)) +#print(ttdx) +#print(cudx) +#print(ttdw) +#print(cudw) +#print((ttdw.permute(3,0,1,2) - cudw).norm(2)) diff --git a/include/triton/dnn/conv.h b/include/triton/dnn/conv.h index c29bb925b..b2cbefe0f 100644 --- a/include/triton/dnn/conv.h +++ b/include/triton/dnn/conv.h @@ -20,12 +20,12 @@ public: conv(int B, int NC, int D, int H, int W, int T, int R, int S, int NF, - int upsample_d, int upsample_h, int upsample_w, + int stride_d, int stride_h, int stride_w, int pad_d, int pad_h, int pad_w, type ty = FPROP) : NB_(B), NC_(NC), AD_(D), AH_(H), AW_(W), BD_(T), BH_(R), BW_(S), NF_(NF), - upsample_d_(upsample_d), upsample_h_(upsample_h), upsample_w_(upsample_w), - stride_d_(1), stride_h_(1), stride_w_(1), + stride_d_(stride_d), stride_h_(stride_h), stride_w_(stride_w), + upsample_d_(1), upsample_h_(1), upsample_w_(1), pad_d_(pad_d), pad_h_(pad_h), pad_w_(pad_w), ty_(ty) { @@ -93,6 +93,10 @@ public: 1, std::multiplies()); } + std::vector c_shapes() { + return shapes_c_; + } + void build_deltas(std::vector& deltas){ if(ty_ == WGRAD) throw std::runtime_error("no look-up table necessary for wgrad"); @@ -120,6 +124,7 @@ public: int32_t c = ctrs / Fs_; int32_t t, r, s; std::tie(t, r, s) = unpack(ctrs % Fs_); + // next indices int32_t nextctrs = ctrs + TK_; int32_t nextc = nextctrs / Fs_; @@ -223,6 +228,43 @@ public: std::string xprop() { + + std::string declare_pb; + if(ty_ == FPROP){ + declare_pb = R"( + fp32* pb[TN, TK] = b + rkb[newaxis, :]*ldb_s + rb0[:, newaxis]; + )"; + } + else{ + declare_pb = R"( + fp32* pb_base[TN, TK] = b + rb0[:, newaxis]*ldb_c; + int32 rbk[TK] = rkb / (BH*BW); + int32 rbrs[TK] = rkb % (BH*BW); + int32 rbs[TK] = BW - 1 - rbrs % BW; + int32 rbr[TK] = BH - 1 - rbrs / BW; + int32 rb1[TK] = rbk*ldb_k + rbr*ldb_r + rbs*ldb_s; + fp32* pb[TN, TK] = pb_base + rb1[newaxis, :]; + )"; + } + std::string increment_pb; + if(ty_ == FPROP){ + increment_pb = R"( + pb = pb + TK*ldb_s; + )"; + } + else{ + increment_pb = R"( + rbrs = rbrs + TK; + rkb = rkb + TK; + rbk = rkb / (BH*BW); + rbrs = rkb % (BH*BW); + rbs = BW - 1 - rbrs % BW; + rbr = BH - 1 - rbrs / BW; + rb1 = rbk*ldb_k + rbr*ldb_r + rbs*ldb_s; + pb = pb_base + rb1[newaxis, :]; + )"; + } + std::string res = R"( const tunable int32 TM = {16, 32, 64}; @@ -246,7 +288,7 @@ public: int32 rxa[TM] = get_global_range[TM](0); int32 rb0[TN] = get_global_range[TN](1); int32 rka[TK] = 0 ... TK; - int32 rb1[TK] = 0 ... TK; + int32 rkb[TK] = 0 ... 
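For BPROP, declare_pb and increment_pb walk the filter rotated by 180 degrees (rbs = BW - 1 - ..., rbr = BH - 1 - ...), which is what lets the data gradient reuse the forward-style kernel. A scalar sketch of that index map (unpack_bprop is an illustrative name):

    #include <cstdint>
    // Split reduction index rkb into (filter k, tap r, tap s), taps flipped.
    void unpack_bprop(int32_t rkb, int32_t BH, int32_t BW,
                      int32_t* k, int32_t* r, int32_t* s) {
      int32_t rbrs = rkb % (BH * BW);
      *k = rkb / (BH * BW);
      *s = BW - 1 - rbrs % BW;  // column flipped
      *r = BH - 1 - rbrs / BW;  // row flipped
    }
    // For a 3x3 filter (BH = BW = 3): rkb = 0 reads tap (2, 2),
    // rkb = 4 reads tap (1, 1), the center.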
TK; fp32 C[TM, TN] = 0; int32 rabh[TM] = rxa / CW; int32 raw[TM] = rxa % CW - pad_w; @@ -258,8 +300,8 @@ public: int32 rac[TK] = racr / BH; int32 rar[TK] = racr % BH; int32 ra1[TK] = rac*lda_c + rar*lda_h + ras*lda_w; - fp32* pa[TM, TK] = a + ra1[newaxis, :] + ra0[:, newaxis]; - fp32* pb[TN, TK] = b + rb1[newaxis, :]*ldb_s + rb0[:, newaxis]; + fp32* pa[TM, TK] = a + ra1[newaxis, :] + ra0[:, newaxis];)" + + declare_pb + R"( __constant__ int32* pincd[TK] = delta + rka; __constant__ int32* pd[TK] = delta + BH*BW + rka; int32 d[TK] = *pd; @@ -276,8 +318,8 @@ public: fp32 b[TN, TK] = *pb; for(int32 k = K; k > 0; k = k - TK){ C = dot(a, trans(b), C); - pb = pb + TK*ldb_s; - pa = pa + d[newaxis, :]; + pa = pa + d[newaxis, :];)" + + increment_pb + R"( b = *pb; pd = pd + incd; pincd = pincd + incd; @@ -288,6 +330,7 @@ public: incm = *pincm; checka0 = *pm; checka = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; + checka = checka && (k > TK); a = checka ? *pa : 0; } int32 rxc[TM] = get_global_range[TM](0); @@ -379,7 +422,7 @@ public: { IN_DTYPE acc; for(int32_t n = 0; n < shapes_c_[0]; ++n) - for(int32_t k = 0; k < shapes_c_[1] ; ++k) + for(int32_t cf = 0; cf < shapes_c_[1] ; ++cf) for(int32_t cd = 0 ; cd < shapes_c_[2]; ++cd) for(int32_t ch = 0 ; ch < shapes_c_[3]; ++ch) for(int32_t cw = 0; cw < shapes_c_[4]; ++cw) @@ -388,7 +431,7 @@ public: int32_t d = cd*stride_d_ - pad_d_; int32_t h = ch*stride_h_ - pad_h_; int32_t w = cw*stride_w_ - pad_w_; - for(int32_t c = 0; c < shapes_b_[0]; ++c) + for(int32_t ac = 0; ac < shapes_a_[1]; ++ac) for(int32_t bd = 0; bd < shapes_b_[1]; ++bd) for(int32_t bh = 0; bh < shapes_b_[2]; ++bh) for(int32_t bw = 0; bw < shapes_b_[3]; ++bw){ @@ -400,11 +443,19 @@ public: aw >= 0 && aw < shapes_a_[4]); IN_DTYPE a = 0; if(in_bounds) - a = A[n*ld_a_[0] + c*ld_a_[1] + ad*ld_a_[2] + ah*ld_a_[3] + aw*ld_a_[4]]; - IN_DTYPE b = B[c*ld_b_[0] + bd*ld_b_[1] + bh*ld_b_[2] + bw*ld_b_[3] + k*ld_b_[4]]; + a = A[n*ld_a_[0] + ac*ld_a_[1] + ad*ld_a_[2] + ah*ld_a_[3] + aw*ld_a_[4]]; + IN_DTYPE b; + if(ty_==FPROP) + b = B[ac*ld_b_[0] + bd*ld_b_[1] + bh*ld_b_[2] + bw*ld_b_[3] + cf*ld_b_[4]]; + else{ + int32_t bdd = bd; + int32_t bhh = bh; + int32_t bww = bw; + b = B[cf*ld_b_[0] + bdd*ld_b_[1] + bhh*ld_b_[2] + bww*ld_b_[3] + ac*ld_b_[4]]; + } acc = std::fma(a, b, acc); } - C[n*ld_c_[0] + k*ld_c_[1] + cd*ld_c_[2] + ch*ld_c_[3] + cw*ld_c_[4]] = acc; + C[n*ld_c_[0] + cf*ld_c_[1] + cd*ld_c_[2] + ch*ld_c_[3] + cw*ld_c_[4]] = acc; } } diff --git a/include/triton/driver/dispatch.h b/include/triton/driver/dispatch.h index a2a389cbd..86b1f2dc1 100755 --- a/include/triton/driver/dispatch.h +++ b/include/triton/driver/dispatch.h @@ -193,12 +193,20 @@ public: static cudnnStatus_t cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t convDesc, int pad_h, int pad_w, int u, int v, int upscalex, int upscaley, cudnnConvolutionMode_t mode); static cudnnStatus_t cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc, int arrayLength, const int padA[], const int filterStrideA[], const int upscaleA[], cudnnConvolutionMode_t mode, cudnnDataType_t dataType); static cudnnStatus_t cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc, const cudnnPoolingMode_t mode, const cudnnNanPropagation_t maxpoolingNanOpt, int nbDims, const int windowDimA[], const int paddingA[], const int strideA[]); + static cudnnStatus_t cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId); + static cudnnStatus_t cudnnTransformTensor(cudnnHandle_t handle, const void *alpha, const cudnnTensorDescriptor_t 
xDesc, const void *x, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y); + // pooling + static cudnnStatus_t cudnnPoolingForward(cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y); + // forward static cudnnStatus_t cudnnGetConvolutionForwardAlgorithm(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdPreference_t preference, size_t memoryLimitInBytes, cudnnConvolutionFwdAlgo_t *algo); static cudnnStatus_t cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo, size_t *sizeInBytes); static cudnnStatus_t cudnnConvolutionForward(cudnnHandle_t handle, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const cudnnFilterDescriptor_t wDesc, const void *w, const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo, void *workSpace, size_t workSpaceSizeInBytes, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y); - static cudnnStatus_t cudnnPoolingForward(cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y); - static cudnnStatus_t cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId); - static cudnnStatus_t cudnnTransformTensor(cudnnHandle_t handle, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y); + // backward data + static cudnnStatus_t cudnnConvolutionBackwardData(cudnnHandle_t handle, const void *alpha, const cudnnFilterDescriptor_t wDesc, const void *w, const cudnnTensorDescriptor_t dyDesc, const void *dy, const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionBwdDataAlgo_t algo, void* workSpace, size_t workSpaceSizeInBytes, const void* beta, const cudnnTensorDescriptor_t dxDesc, void *dx); + static cudnnStatus_t cudnnGetConvolutionBackwardDataAlgorithm(cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc,const cudnnTensorDescriptor_t dyDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t dxDesc, cudnnConvolutionBwdDataPreference_t preference, size_t memoryLimitInBytes, cudnnConvolutionBwdDataAlgo_t* algo); + // backward filter + static cudnnStatus_t cudnnConvolutionBackwardFilter(cudnnHandle_t handle, const void *alpha, const cudnnTensorDescriptor_t xDesc,const void *x, const cudnnTensorDescriptor_t dyDesc,const void *dy, const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionBwdFilterAlgo_t algo,void* workSpace, size_t workSpaceSizeInBytes, const void* beta, const cudnnFilterDescriptor_t dwDesc, void *dw); + static cudnnStatus_t cudnnGetConvolutionBackwardFilterAlgorithm(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const cudnnTensorDescriptor_t dyDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnFilterDescriptor_t dwDesc, cudnnConvolutionBwdFilterPreference_t preference, size_t memoryLimitInBytes, cudnnConvolutionBwdFilterAlgo_t* algo); // SPIR-V libraries static int initializeLLVMToSPIRVPass(llvm::PassRegistry &); From 
cbfbe72e463542aca79d2ecaf299b017e211f93a Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 13 May 2019 22:29:53 -0400 Subject: [PATCH 144/494] [general] added LICENSE file --- LICENSE | 22 ++++++++++++++++++++++ TODO | 7 ------- 2 files changed, 22 insertions(+), 7 deletions(-) create mode 100755 LICENSE delete mode 100644 TODO diff --git a/LICENSE b/LICENSE new file mode 100755 index 000000000..4ea9858e9 --- /dev/null +++ b/LICENSE @@ -0,0 +1,22 @@ +/* Copyright 2018-2019 Philippe Tillet +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, +* subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + diff --git a/TODO b/TODO deleted file mode 100644 index 299ceba34..000000000 --- a/TODO +++ /dev/null @@ -1,7 +0,0 @@ -[Frontend] - - SCOPES - -[Intermediate Representation] - - proper naming scheme - - symbols table - - separate header for typedef (e.g., type::tile_shapes_t) to reduce compilation time From be2ba0338275fe6fa6e8999a098f185387224f7e Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 14 May 2019 18:12:38 -0400 Subject: [PATCH 145/494] [dnn/conv] optimizations of backpropagation with look-up tables --- examples/cpp/conv.cpp | 4 +- include/triton/dnn/conv.h | 141 +++++++++++++++++++------------------- 2 files changed, 71 insertions(+), 74 deletions(-) diff --git a/examples/cpp/conv.cpp b/examples/cpp/conv.cpp index 5d4d20c7d..8f34b42ef 100644 --- a/examples/cpp/conv.cpp +++ b/examples/cpp/conv.cpp @@ -10,7 +10,7 @@ int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); triton::jit jit(context); - triton::dnn::conv::type ty = triton::dnn::conv::FPROP; + triton::dnn::conv::type ty = triton::dnn::conv::BPROP; // initialization int32_t B = 4, NF = 32; int32_t D = 1, H = 24, W = 240; @@ -66,7 +66,7 @@ int main() { return configuration.get_nflops() / ts * 1e-3; }; std::string src = configuration.src(); -// jit.autotune("conv", src.c_str(), benchmark); + jit.autotune("conv", src.c_str(), benchmark); jit.add_module("conv", src.c_str(), configuration.default_params()); triton::driver::kernel* kernel = jit.get_function("conv"); triton::jit::launch_information info = jit.get_launch_info("conv"); diff --git a/include/triton/dnn/conv.h b/include/triton/dnn/conv.h index b2cbefe0f..26564afe0 100644 --- a/include/triton/dnn/conv.h +++ b/include/triton/dnn/conv.h @@ -74,6 +74,8 @@ public: } // look-up table info Fs_ = shapes_b_[1]*shapes_b_[2]*shapes_b_[3]; + if(ty_ == BPROP) + Fs_ *= shapes_b_[4]; TK_ = 8; Luts_ = (TK_ + Fs_ - 1) / Fs_ * Fs_; } @@ -101,15 +103,24 @@ 
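A quick aside on the sizing arithmetic above: Fs_ is the length of one reduction period (the T*R*S filter taps for FPROP, further multiplied by the NF output channels for BPROP), and Luts_ rounds the TK_ = 8 unroll factor up to a whole number of such periods. A minimal standalone sketch, with shape values assumed to mirror the T=1, R=3, S=3, NF=32 case in examples/cpp/conv.cpp (illustrative only, not code from this patch):

#include <cstdint>
#include <cstdio>

int main() {
  int32_t BD = 1, BH = 3, BW = 3, NF = 32;  // filter depth/height/width, #filters
  int32_t TK = 8;                           // reduction tile size
  int32_t Fs_fprop = BD * BH * BW;          // 9 taps per input channel
  int32_t Fs_bprop = Fs_fprop * NF;         // BPROP also reduces over NF
  // smallest multiple of Fs that covers one TK-wide reduction step
  auto luts = [&](int32_t Fs) { return (TK + Fs - 1) / Fs * Fs; };
  std::printf("FPROP: Fs=%d Luts=%d\n", (int)Fs_fprop, (int)luts(Fs_fprop)); // Fs=9   Luts=9
  std::printf("BPROP: Fs=%d Luts=%d\n", (int)Fs_bprop, (int)luts(Fs_bprop)); // Fs=288 Luts=288
  return 0;
}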
public: if(ty_ == WGRAD) throw std::runtime_error("no look-up table necessary for wgrad"); deltas.resize(Luts_ + upsample_d_*upsample_h_*upsample_w_*Luts_); - auto unpack = [&](int32_t trs){ + + auto unpack = [&](int32_t ltrs){ + int32_t l = (ty_ == BPROP) ? ltrs % NF_ : ltrs / Fs_; + int32_t trs = (ty_ == BPROP) ? ltrs / NF_ : ltrs % Fs_; int32_t tr = trs / BW_; - int32_t s = trs - tr*BW_; + int32_t s = trs % BW_; int32_t t = tr / BH_; - int32_t r = tr - t*BH_; - return std::make_tuple(t, r, s); + int32_t r = tr % BH_; + if(ty_ == BPROP){ + r = BH_ - 1 - r; + s = BW_ - 1 - s; + } + return std::make_tuple(l, t, r, s); }; + for(size_t i = 0; i < Luts_; ++i) deltas[i] = (((i + TK_) % Luts_) - i); + size_t Ds0 = Luts_; size_t Ds1 = upsample_w_; size_t Ds2 = upsample_h_; @@ -119,17 +130,15 @@ public: for(size_t pw = 0; pw < Ds1; ++pw){ int32_t* deltas_ptr = &deltas[Luts_ + pw*Ds0 + ph*Ds0*Ds1 + pd*Ds0*Ds1*Ds2]; // cumulative increments - for(size_t i = 0; i < Ds0; ++i){ + for(size_t i = 0; i < Ds0; ++i) { + // unpack int32_t ctrs = i; - int32_t c = ctrs / Fs_; - int32_t t, r, s; - std::tie(t, r, s) = unpack(ctrs % Fs_); - + int32_t c, t, r, s; + std::tie(c, t, r, s) = unpack(ctrs); // next indices int32_t nextctrs = ctrs + TK_; - int32_t nextc = nextctrs / Fs_; - int32_t nextt, nextr, nexts; - std::tie(nextt, nextr, nexts) = unpack(nextctrs % Fs_); + int32_t nextc, nextt, nextr, nexts; + std::tie(nextc, nextt, nextr, nexts) = unpack(nextctrs); // diffs int32_t cdiff = nextc - c; int32_t tdiff = (nextt + pd)/upsample_d_ - (t + pd)/upsample_d_; @@ -145,12 +154,18 @@ public: if(ty_ == WGRAD) throw std::runtime_error("no look-up table necessary for wgrad"); masks.resize(Luts_ + (2*pad_h_+1)*(2*pad_w_+1)*(2*pad_d_+1)*Luts_); - auto unpack = [&](int32_t trs){ + auto unpack = [&](int32_t ltrs){ + int32_t l = (ty_ == BPROP) ? ltrs % NF_ : ltrs / Fs_; + int32_t trs = (ty_ == BPROP) ? 
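/* Note on the unpacking here: for BPROP the packed reduction index ltrs
   interleaves the NF_ output channels fastest (l = ltrs % NF_, trs = ltrs / NF_),
   and the recovered spatial offsets are flipped (r -> BH_-1-r, s -> BW_-1-s),
   since backward-data is equivalent to a forward convolution with the
   180-degree-rotated filter. */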
ltrs / NF_ : ltrs % Fs_; int32_t tr = trs / BW_; - int32_t s = trs - tr*BW_; + int32_t s = trs % BW_; int32_t t = tr / BH_; - int32_t r = tr - t*BH_; - return std::make_tuple(t, r, s); + int32_t r = tr % BH_; + if(ty_ == BPROP){ + r = BH_ - 1 - r; + s = BW_ - 1 - s; + } + return std::make_tuple(l, t, r, s); }; size_t Ms0 = Luts_; size_t Ms1 = 2*pad_w_ + 1; @@ -161,10 +176,10 @@ public: for(size_t pw = 0; pw < Ms1; ++pw){ int32_t* masks_ptr = &masks[Luts_ + pw*Ms0 + ph*Ms0*Ms1 + pd*Ms0*Ms1*Ms2]; for(size_t i = 0; i < Ms0; ++i){ - int32_t t, r, s; + int32_t l, t, r, s; int32_t mask = 0x0; for(size_t j = 0; j < TK_; ++j){ - std::tie(t, r, s) = unpack((i + j) % Fs_); + std::tie(l, t, r, s) = unpack(i + j); bool in_bounds_d = (t + pd) >= pad_d_ && (t + pd) < (BD_ + pad_d_); bool in_bounds_h = (r + ph) >= pad_h_ && (r + ph) < (BH_ + pad_h_); bool in_bounds_w = (s + pw) >= pad_w_ && (s + pw) < (BW_ + pad_w_); @@ -220,50 +235,29 @@ public: } std::vector default_params() { - if(ty_ == FPROP || ty_ == BPROP) + if(ty_ == FPROP) return {16, 2, 64, 32, 2, 64, 16, 8, 2, 2, 8, 1, 8, 4}; + else if(ty_ == BPROP) + return {32, 2, 64, 32, 64, 32, 4, 2, 2, 4, 2, 8, 4, 2}; else return {8, 2, 16, 8, 2, 16, 8, 2, 8, 8}; } std::string xprop() { - - std::string declare_pb; - if(ty_ == FPROP){ - declare_pb = R"( - fp32* pb[TN, TK] = b + rkb[newaxis, :]*ldb_s + rb0[:, newaxis]; - )"; - } - else{ - declare_pb = R"( - fp32* pb_base[TN, TK] = b + rb0[:, newaxis]*ldb_c; - int32 rbk[TK] = rkb / (BH*BW); - int32 rbrs[TK] = rkb % (BH*BW); - int32 rbs[TK] = BW - 1 - rbrs % BW; - int32 rbr[TK] = BH - 1 - rbrs / BW; - int32 rb1[TK] = rbk*ldb_k + rbr*ldb_r + rbs*ldb_s; - fp32* pb[TN, TK] = pb_base + rb1[newaxis, :]; - )"; - } - std::string increment_pb; - if(ty_ == FPROP){ - increment_pb = R"( - pb = pb + TK*ldb_s; - )"; - } - else{ - increment_pb = R"( - rbrs = rbrs + TK; - rkb = rkb + TK; - rbk = rkb / (BH*BW); - rbrs = rkb % (BH*BW); - rbs = BW - 1 - rbrs % BW; - rbr = BH - 1 - rbrs / BW; - rb1 = rbk*ldb_k + rbr*ldb_r + rbs*ldb_s; - pb = pb_base + rb1[newaxis, :]; - )"; - } + bool trans_b = ty_ == FPROP; + std::string BS = trans_b ?"[TN,TK]" : "[TK, TN]"; + std::string bcb0 = trans_b ?"[:, newaxis]" : "[newaxis, :]"; + std::string bcb1 = trans_b ?"[newaxis, :]" : "[:, newaxis]"; + std::string ldb0 = trans_b ?"*ldb_s" : ""; + std::string ldb1 = trans_b ?"" : "*ldb_c"; + std::string useb = trans_b ?"trans(b)" : "b"; + std::string flipr = trans_b?"" : "BH - 1 -"; + std::string flips = trans_b?"" : "BW - 1 -"; + std::string ax = trans_b?"crs" : "rsc"; + std::vector redax = {"BH", "BW", "N"}; + if(trans_b) + redax = {"C", "BH", "BW"}; std::string res = R"( @@ -271,8 +265,8 @@ public: const tunable int32 TN = {16, 32, 64}; const tunable int32 TK = {8}; - __constant__ int32* delta = alloc_const int32[18]; - __constant__ int32* masks = alloc_const int32[1024]; + __constant__ int32* delta = alloc_const int32[1024]; + __constant__ int32* masks = alloc_const int32[4096]; void conv(read_only restrict fp32 *a, read_only restrict fp32 *b, @@ -290,36 +284,39 @@ public: int32 rka[TK] = 0 ... TK; int32 rkb[TK] = 0 ... 
TK; fp32 C[TM, TN] = 0; + int32 Fs = )" + std::to_string(Fs_) + R"(; int32 rabh[TM] = rxa / CW; int32 raw[TM] = rxa % CW - pad_w; int32 rab[TM] = rabh / CH; int32 rah[TM] = rabh % CH - pad_h; int32 ra0[TM] = rab*lda_n + rah*lda_h + raw*lda_w; - int32 racr[TK] = rka / BW; - int32 ras[TK] = rka % BW; - int32 rac[TK] = racr / BH; - int32 rar[TK] = racr % BH; + int32 ra)" + ax[0] + ax[1] + "[TK] = rka / " + redax[2] + R"(; + int32 ra)" + ax[2] + "[TK] = rka % " + redax[2] + R"(; + int32 ra)" + ax[0] + "[TK] = ra" + ax[0] + ax[1] + " / " + redax[1] + R"(; + int32 ra)" + ax[1] + "[TK] = ra" + ax[0] + ax[1] + " % " + redax[1] + R"(; + rar = )" + flipr + R"( rar; + ras = )" + flips + R"( ras; int32 ra1[TK] = rac*lda_c + rar*lda_h + ras*lda_w; - fp32* pa[TM, TK] = a + ra1[newaxis, :] + ra0[:, newaxis];)" - + declare_pb + R"( + fp32* pa[TM, TK] = a + ra1[newaxis, :] + ra0[:, newaxis]; + fp32* pb)" + BS + " = b + rkb" + bcb1 + ldb0 + " + rb0" + bcb0 + ldb1 + R"(; __constant__ int32* pincd[TK] = delta + rka; - __constant__ int32* pd[TK] = delta + BH*BW + rka; + __constant__ int32* pd[TK] = delta + Fs + rka; int32 d[TK] = *pd; int32 incd[TK] = *pincd; int32 maskh[TM] = pad_h + min(rah, 0) + max(rah + BH - AH, 0); int32 maskw[TM] = pad_w + min(raw, 0) + max(raw + BW - AW, 0); - __constant__ int32* pm[TM] = masks + BH*BW + maskw*BH*BW + maskh*BH*BW*(2*pad_w + 1); + __constant__ int32* pm[TM] = masks + Fs + maskw*Fs + maskh*Fs*(2*pad_w + 1); __constant__ int32* pincm[TM] = delta; int32 incm[TM] = *pincm; int32 checka0[TM] = *pm; int32 checka1[TK] = 1 << rka; int1 checka[TM, TK] = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; fp32 a[TM, TK] = checka ? *pa : 0; - fp32 b[TN, TK] = *pb; + fp32 b)" + BS + R"( = *pb; for(int32 k = K; k > 0; k = k - TK){ - C = dot(a, trans(b), C); - pa = pa + d[newaxis, :];)" - + increment_pb + R"( + C = dot(a, )" + useb + R"(, C); + pa = pa + d[newaxis, :]; + pb = pb + TK)" + ldb0 + R"(; b = *pb; pd = pd + incd; pincd = pincd + incd; @@ -448,9 +445,9 @@ public: if(ty_==FPROP) b = B[ac*ld_b_[0] + bd*ld_b_[1] + bh*ld_b_[2] + bw*ld_b_[3] + cf*ld_b_[4]]; else{ - int32_t bdd = bd; - int32_t bhh = bh; - int32_t bww = bw; + int32_t bdd = shapes_b_[1] - 1 - bd; + int32_t bhh = shapes_b_[2] - 1 - bh; + int32_t bww = shapes_b_[3] - 1 - bw; b = B[cf*ld_b_[0] + bdd*ld_b_[1] + bhh*ld_b_[2] + bww*ld_b_[3] + ac*ld_b_[4]]; } acc = std::fma(a, b, acc); From 15a967c81edaf3eb083a426feb1f89aa16be3a17 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 15 May 2019 11:32:47 -0400 Subject: [PATCH 146/494] [dnn/conv] minor cleaning --- examples/cpp/conv.cpp | 4 ++-- include/triton/dnn/conv.h | 18 +++++++++++++----- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/examples/cpp/conv.cpp b/examples/cpp/conv.cpp index 8f34b42ef..76c8cd096 100644 --- a/examples/cpp/conv.cpp +++ b/examples/cpp/conv.cpp @@ -10,7 +10,7 @@ int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); triton::jit jit(context); - triton::dnn::conv::type ty = triton::dnn::conv::BPROP; + triton::dnn::conv::type ty = triton::dnn::conv::WGRAD; // initialization int32_t B = 4, NF = 32; int32_t D = 1, H = 24, W = 240; @@ -66,7 +66,7 @@ int main() { return configuration.get_nflops() / ts * 1e-3; }; std::string src = configuration.src(); - jit.autotune("conv", src.c_str(), benchmark); +// jit.autotune("conv", src.c_str(), benchmark); jit.add_module("conv", src.c_str(), configuration.default_params()); triton::driver::kernel* kernel = jit.get_function("conv"); 
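One thing that makes these generator diffs easier to follow: FPROP and BPROP now share a single Triton-C template, and a handful of string knobs select the B-operand layout. A distilled sketch of the mechanism (emit_dot and its output strings are illustrative, not the actual conv::src() code):

#include <cstdio>
#include <string>

// FPROP keeps B as [TN, TK] and transposes it inside dot(); BPROP loads B as
// [TK, TN] and uses it directly, the filter flip being handled by index set-up.
std::string emit_dot(bool trans_b) {
  std::string BS   = trans_b ? "[TN, TK]" : "[TK, TN]";
  std::string useb = trans_b ? "trans(b)" : "b";
  return "fp32 b" + BS + " = *pb; C = dot(a, " + useb + ", C);";
}

int main() {
  std::printf("FPROP: %s\n", emit_dot(true).c_str());
  std::printf("BPROP: %s\n", emit_dot(false).c_str());
  return 0;
}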
triton::jit::launch_information info = jit.get_launch_info("conv"); diff --git a/include/triton/dnn/conv.h b/include/triton/dnn/conv.h index 26564afe0..b2e5cd3dc 100644 --- a/include/triton/dnn/conv.h +++ b/include/triton/dnn/conv.h @@ -211,10 +211,18 @@ public: kernel->setArg(5, K_); kernel->setArg(6, AH_); kernel->setArg(7, AW_); - kernel->setArg(8, BH_); - kernel->setArg(9, BW_); - kernel->setArg(10, CH_); - kernel->setArg(11, CW_); + if(ty_ == WGRAD){ + kernel->setArg(8, CH_); + kernel->setArg(9, CW_); + kernel->setArg(10, BH_); + kernel->setArg(11, BW_); + } + else{ + kernel->setArg(8, BH_); + kernel->setArg(9, BW_); + kernel->setArg(10, CH_); + kernel->setArg(11, CW_); + } kernel->setArg(12, ld_a_[0]); kernel->setArg(13, ld_a_[1]); kernel->setArg(14, ld_a_[2]); @@ -360,8 +368,8 @@ public: fp32 *c, int32 M, int32 N, int32 K, int32 AH, int32 AW, - int32 CH, int32 CW, int32 BH, int32 BW, + int32 CH, int32 CW, int32 lda_n, int32 lda_c, int32 lda_d, int32 lda_h, int32 lda_w, int32 ldb_n, int32 ldb_k, int32 ldb_m, int32 ldb_p, int32 ldb_q, int32 ldc_c, int32 ldc_t, int32 ldc_r, int32 ldc_s, int32 ldc_k, From ece7beea3cf52e7b06b9469b6e8430bb316d44d1 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 15 May 2019 14:57:31 -0400 Subject: [PATCH 147/494] [dnn/conv]: now using look-up table for wgrad computation as well --- examples/cpp/conv.cpp | 17 +- examples/cpp/dot.cpp | 2 +- include/triton/dnn/conv.h | 338 ++++++++++++++++++++++---------------- 3 files changed, 201 insertions(+), 156 deletions(-) diff --git a/examples/cpp/conv.cpp b/examples/cpp/conv.cpp index 76c8cd096..f836edcb4 100644 --- a/examples/cpp/conv.cpp +++ b/examples/cpp/conv.cpp @@ -38,12 +38,6 @@ int main() { stream->write(db, true, 0, hb); stream->write(dc, true, 0, hc); stream->synchronize(); - // look-up table - std::vector h_delta, h_masks; - if(ty != triton::dnn::conv::WGRAD){ - configuration.build_deltas(h_delta); - configuration.build_masks(h_masks); - } // benchmark a given convolution kernel auto benchmark = [&](triton::driver::kernel* kernel, triton::jit::launch_information info) { @@ -51,12 +45,7 @@ int main() { unsigned TN = info.global_range_size[1]; unsigned nthreads = info.num_threads; std::array grid = configuration.get_grid(TM, TN); - if(ty != triton::dnn::conv::WGRAD){ - triton::driver::buffer* delta = jit.get_buffer("delta"); - triton::driver::buffer* masks = jit.get_buffer("masks"); - stream->write(delta, false, 0, h_delta.size()*4, h_delta.data()); - stream->write(masks, false, 0, h_masks.size()*4, h_masks.data()); - } + configuration.init(stream, jit); stream->synchronize(); configuration.set_arg(kernel, da, db, dc); stream->enqueue(kernel, grid, {nthreads, 1, 1}); @@ -66,7 +55,7 @@ int main() { return configuration.get_nflops() / ts * 1e-3; }; std::string src = configuration.src(); -// jit.autotune("conv", src.c_str(), benchmark); + jit.autotune("conv", src.c_str(), benchmark); jit.add_module("conv", src.c_str(), configuration.default_params()); triton::driver::kernel* kernel = jit.get_function("conv"); triton::jit::launch_information info = jit.get_launch_info("conv"); @@ -74,7 +63,7 @@ int main() { stream->read(dc, true, 0, hc); configuration.cpu_ref(rc.data(), ha.data(), hb.data()); for(size_t i = 0; i < hc.size(); i++){ - if(std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ + if(std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; exit(EXIT_FAILURE); } diff --git a/examples/cpp/dot.cpp 
b/examples/cpp/dot.cpp index 0c735d9f4..3dde373ef 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -68,7 +68,7 @@ int main() { stream->read(dc, true, 0, hc); simple_gemm(AT, BT, rc, ha, hb, M, N, K); for(size_t i = 0; i < M*N; i++) - if(std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ + if(!std::isnan(hc[i]) && std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; exit(EXIT_FAILURE); } diff --git a/include/triton/dnn/conv.h b/include/triton/dnn/conv.h index b2e5cd3dc..0afa77088 100644 --- a/include/triton/dnn/conv.h +++ b/include/triton/dnn/conv.h @@ -4,6 +4,7 @@ #include #include "triton/driver/stream.h" #include "triton/driver/kernel.h" +#include "triton/jit.h" namespace triton{ namespace dnn{ @@ -46,6 +47,9 @@ public: // swap b and c for wgrad if(ty_ == WGRAD){ shapes_b_.swap(shapes_c_); + std::swap(BD_, CD_); + std::swap(BH_, CH_); + std::swap(BW_, CW_); } // leading dimensions auto set_ld = [](const std::vector& shapes, @@ -62,6 +66,8 @@ public: set_ld(shapes_b_, ld_b_); set_ld(shapes_c_, ld_c_); // equivalent matmul + b_trans_ = ty_ != BPROP; + b_lut_ = ty_ == WGRAD; if(ty_ == WGRAD){ M_ = shapes_c_[0]*shapes_c_[1]*shapes_c_[2]*shapes_c_[3]; N_ = shapes_c_[4]; @@ -73,11 +79,20 @@ public: K_ = shapes_b_[0]*shapes_b_[1]*shapes_b_[2]*shapes_b_[3]; } // look-up table info - Fs_ = shapes_b_[1]*shapes_b_[2]*shapes_b_[3]; - if(ty_ == BPROP) - Fs_ *= shapes_b_[4]; + if(ty_ == FPROP) + Fs_ = shapes_b_[1]*shapes_b_[2]*shapes_b_[3]; + else + Fs_ = K_; TK_ = 8; Luts_ = (TK_ + Fs_ - 1) / Fs_ * Fs_; + build_deltas(); + build_masks(); + size_t cst_size = h_b_deltas_.size()*4; + is_b_deltas_cst_ = cst_size < 65536; + cst_size += h_a_deltas_.size()*4; + is_a_deltas_cst = cst_size < 65536; + cst_size += h_masks_.size()*4; + is_mask_cst_ = cst_size < 65536; } size_t a_size() { @@ -99,14 +114,14 @@ public: return shapes_c_; } - void build_deltas(std::vector& deltas){ - if(ty_ == WGRAD) - throw std::runtime_error("no look-up table necessary for wgrad"); - deltas.resize(Luts_ + upsample_d_*upsample_h_*upsample_w_*Luts_); + void build_deltas(){ + h_a_deltas_.resize(Luts_ + upsample_d_*upsample_h_*upsample_w_*Luts_); + if(b_lut_) + h_b_deltas_.resize(Luts_); auto unpack = [&](int32_t ltrs){ - int32_t l = (ty_ == BPROP) ? ltrs % NF_ : ltrs / Fs_; - int32_t trs = (ty_ == BPROP) ? ltrs / NF_ : ltrs % Fs_; + int32_t l = (ty_ == BPROP) ? ltrs % NF_ : ltrs / (BD_*BH_*BW_); + int32_t trs = (ty_ == BPROP) ? 
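/* The deltas built here hold address increments, not absolute offsets: entry i
   stores the stride-scaled pointer difference between reduction step i and step
   i + TK_, so the generated kernel advances pa (and, for WGRAD, pb) with a
   single add per main-loop iteration and keeps all div/mod index arithmetic out
   of the hot loop. */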
ltrs / NF_ : ltrs % (BD_*BH_*BW_); int32_t tr = trs / BW_; int32_t s = trs % BW_; int32_t t = tr / BH_; @@ -119,7 +134,7 @@ public: }; for(size_t i = 0; i < Luts_; ++i) - deltas[i] = (((i + TK_) % Luts_) - i); + h_a_deltas_[i] = (((i + TK_) % Luts_) - i); size_t Ds0 = Luts_; size_t Ds1 = upsample_w_; @@ -128,7 +143,7 @@ public: for(size_t pd = 0; pd < Ds3; ++pd) for(size_t ph = 0; ph < Ds2; ++ph) for(size_t pw = 0; pw < Ds1; ++pw){ - int32_t* deltas_ptr = &deltas[Luts_ + pw*Ds0 + ph*Ds0*Ds1 + pd*Ds0*Ds1*Ds2]; + int32_t* deltas_ptr = &h_a_deltas_[Luts_ + pw*Ds0 + ph*Ds0*Ds1 + pd*Ds0*Ds1*Ds2]; // cumulative increments for(size_t i = 0; i < Ds0; ++i) { // unpack @@ -145,18 +160,31 @@ public: int32_t rdiff = (nextr + ph)/upsample_h_ - (r + ph)/upsample_h_; int32_t sdiff = (nexts + pw)/upsample_w_ - (s + pw)/upsample_w_; // delta pointers - deltas_ptr[i] = cdiff*ld_a_[1] + tdiff*ld_a_[2] + rdiff*ld_a_[3] + sdiff*ld_a_[4]; + if(ty_ == WGRAD) + deltas_ptr[i] = cdiff*ld_a_[0] + tdiff*ld_a_[2] + rdiff*ld_a_[3] + sdiff*ld_a_[4]; + else + deltas_ptr[i] = cdiff*ld_a_[1] + tdiff*ld_a_[2] + rdiff*ld_a_[3] + sdiff*ld_a_[4]; + } + } + + if(ty_ == WGRAD){ + for(size_t i = 0; i < Ds0; ++i) { + int32_t c, t, r, s; + int32_t nextc, nextt, nextr, nexts; + std::tie(c, t, r, s) = unpack(i); + std::tie(nextc, nextt, nextr, nexts) = unpack(i + TK_); + int32_t cdiff = nextc - c, tdiff = nextt - t, rdiff = nextr - r, sdiff = nexts - s; + h_b_deltas_[i] = cdiff*ld_b_[0] + tdiff*ld_b_[2] + rdiff*ld_b_[3] + sdiff*ld_b_[4]; } } } - void build_masks(std::vector& masks){ - if(ty_ == WGRAD) - throw std::runtime_error("no look-up table necessary for wgrad"); - masks.resize(Luts_ + (2*pad_h_+1)*(2*pad_w_+1)*(2*pad_d_+1)*Luts_); + void build_masks(){ + h_masks_.resize(Luts_ + (2*pad_h_+1)*(2*pad_w_+1)*(2*pad_d_+1)*Luts_); + auto unpack = [&](int32_t ltrs){ - int32_t l = (ty_ == BPROP) ? ltrs % NF_ : ltrs / Fs_; - int32_t trs = (ty_ == BPROP) ? ltrs / NF_ : ltrs % Fs_; + int32_t l = (ty_ == BPROP) ? ltrs % NF_ : ltrs / (BD_*BH_*BW_); + int32_t trs = (ty_ == BPROP) ? 
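/* Each h_masks_ word packs TK_ predicate bits: bit j records whether reduction
   step i + j lands inside the padded image (mask |= in_bounds << j). The kernel
   recovers its lane's bit with checka1 = 1 << rka and tests
   (checka0 & checka1) > 0 before loading the A tile. */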
ltrs / NF_ : ltrs % (BD_*BH_*BW_); int32_t tr = trs / BW_; int32_t s = trs % BW_; int32_t t = tr / BH_; @@ -174,7 +202,7 @@ public: for(size_t pd = 0; pd < Ms3; ++pd) for(size_t ph = 0; ph < Ms2; ++ph) for(size_t pw = 0; pw < Ms1; ++pw){ - int32_t* masks_ptr = &masks[Luts_ + pw*Ms0 + ph*Ms0*Ms1 + pd*Ms0*Ms1*Ms2]; + int32_t* masks_ptr = &h_masks_[Luts_ + pw*Ms0 + ph*Ms0*Ms1 + pd*Ms0*Ms1*Ms2]; for(size_t i = 0; i < Ms0; ++i){ int32_t l, t, r, s; int32_t mask = 0x0; @@ -189,7 +217,7 @@ public: } } for(size_t i = 0; i < Luts_; ++i) - masks[i] = 0x0; + h_masks_[i] = 0x0; } std::array get_grid(size_t TM, size_t TN){ @@ -200,6 +228,27 @@ public: return 2.*M_*N_*K_; } + void init(driver::stream *stream, triton::jit &jit) { + auto init_lut = [&](bool is_cst, const char *name, std::vector host) -> triton::driver::buffer*{ + if(host.empty()) + return nullptr; + size_t nbytes = host.size()*4; + // get buffer + triton::driver::buffer* buffer; + if(is_cst) + buffer = jit.get_buffer(name); + else + buffer = triton::driver::buffer::create(stream->context(), nbytes); + // copy + stream->write(buffer, false, 0, nbytes, host.data()); + return buffer; + }; + + d_a_deltas_ = init_lut(is_a_deltas_cst, "delta", h_a_deltas_); + d_b_deltas_ = init_lut(is_b_deltas_cst_, "b_delta", h_b_deltas_); + d_masks_ = init_lut(is_mask_cst_, "masks", h_masks_); + } + void set_arg(driver::kernel *kernel, driver::buffer *a, driver::buffer *b, driver::buffer *c) { @@ -211,70 +260,107 @@ public: kernel->setArg(5, K_); kernel->setArg(6, AH_); kernel->setArg(7, AW_); + kernel->setArg(8, BH_); + kernel->setArg(9, BW_); + kernel->setArg(10, CH_); + kernel->setArg(11, CW_); + // A arguments if(ty_ == WGRAD){ - kernel->setArg(8, CH_); - kernel->setArg(9, CW_); - kernel->setArg(10, BH_); - kernel->setArg(11, BW_); + kernel->setArg(12, ld_a_[1]); + kernel->setArg(13, ld_a_[0]); } else{ - kernel->setArg(8, BH_); - kernel->setArg(9, BW_); - kernel->setArg(10, CH_); - kernel->setArg(11, CW_); + kernel->setArg(12, ld_a_[0]); + kernel->setArg(13, ld_a_[1]); } - kernel->setArg(12, ld_a_[0]); - kernel->setArg(13, ld_a_[1]); kernel->setArg(14, ld_a_[2]); kernel->setArg(15, ld_a_[3]); kernel->setArg(16, ld_a_[4]); - kernel->setArg(17, ld_b_[0]); - kernel->setArg(18, ld_b_[1]); - kernel->setArg(19, ld_b_[2]); - kernel->setArg(20, ld_b_[3]); - kernel->setArg(21, ld_b_[4]); - kernel->setArg(22, ld_c_[0]); - kernel->setArg(23, ld_c_[1]); - kernel->setArg(24, ld_c_[2]); - kernel->setArg(25, ld_c_[3]); - kernel->setArg(26, ld_c_[4]); + // B arguments + if(ty_ == WGRAD){ + kernel->setArg(17, ld_b_[0]); + kernel->setArg(18, ld_b_[2]); + kernel->setArg(19, ld_b_[3]); + kernel->setArg(20, ld_b_[4]); + kernel->setArg(21, ld_b_[1]); + } + else{ + kernel->setArg(17, ld_b_[0]); + kernel->setArg(18, ld_b_[1]); + kernel->setArg(19, ld_b_[2]); + kernel->setArg(20, ld_b_[3]); + kernel->setArg(21, ld_b_[4]); + } + // C arguments + if(ty_ == WGRAD){ + kernel->setArg(22, ld_c_[0]); + kernel->setArg(23, ld_c_[4]); + kernel->setArg(24, ld_c_[1]); + kernel->setArg(25, ld_c_[2]); + kernel->setArg(26, ld_c_[3]); + } + else{ + kernel->setArg(22, ld_c_[0]); + kernel->setArg(23, ld_c_[1]); + kernel->setArg(24, ld_c_[2]); + kernel->setArg(25, ld_c_[3]); + kernel->setArg(26, ld_c_[4]); + } kernel->setArg(27, pad_h_); kernel->setArg(28, pad_w_); + size_t idx = 29; + if(!is_a_deltas_cst) + kernel->setArg(idx++, d_a_deltas_); + if(!is_b_deltas_cst_) + kernel->setArg(idx++, d_b_deltas_); + if(!is_mask_cst_) + kernel->setArg(idx++, d_masks_); } std::vector default_params() { - 
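/* Budgeting note: in the constructor, cst_size accumulates the byte sizes of
   h_b_deltas_, h_a_deltas_ and h_masks_ (in that order) against the 65536-byte
   (64 KB) __constant__ region; any table that overflows the running total is
   demoted to an ordinary int32* kernel argument, which is why the code just
   above appends d_a_deltas_ / d_b_deltas_ / d_masks_ only when the matching
   is_*_cst flag is false. */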
if(ty_ == FPROP) + if(ty_==FPROP) return {16, 2, 64, 32, 2, 64, 16, 8, 2, 2, 8, 1, 8, 4}; else if(ty_ == BPROP) return {32, 2, 64, 32, 64, 32, 4, 2, 2, 4, 2, 8, 4, 2}; - else - return {8, 2, 16, 8, 2, 16, 8, 2, 8, 8}; + else if(ty_ == WGRAD) + return {32, 2, 64, 32, 2, 64, 16, 8, 2, 2, 4, 2, 8}; } - std::string xprop() { - bool trans_b = ty_ == FPROP; - std::string BS = trans_b ?"[TN,TK]" : "[TK, TN]"; - std::string bcb0 = trans_b ?"[:, newaxis]" : "[newaxis, :]"; - std::string bcb1 = trans_b ?"[newaxis, :]" : "[:, newaxis]"; - std::string ldb0 = trans_b ?"*ldb_s" : ""; - std::string ldb1 = trans_b ?"" : "*ldb_c"; - std::string useb = trans_b ?"trans(b)" : "b"; - std::string flipr = trans_b?"" : "BH - 1 -"; - std::string flips = trans_b?"" : "BW - 1 -"; - std::string ax = trans_b?"crs" : "rsc"; - std::vector redax = {"BH", "BW", "N"}; - if(trans_b) + std::string src() { + bool is_wgrad = ty_ == WGRAD; + std::string BS = b_trans_ ? "[TN,TK]" : "[TK, TN]"; + std::string bcb0 = b_trans_ ? "[:, newaxis]" : "[newaxis, :]"; + std::string bcb1 = b_trans_ ? "[newaxis, :]" : "[:, newaxis]"; + std::string ldb0 = b_trans_ ? "*ldb_s" : ""; + std::string ldb1 = b_trans_ ? "*ldb_k" : "*ldb_c"; + std::string useb = b_trans_ ? "trans(b)" : "b"; + std::string flipr = b_trans_ ? "" : "BH - 1 -"; + std::string flips = b_trans_ ? "" : "BW - 1 -"; + std::string ax = b_trans_ ? "crs" : "rsc"; + std::vector redax; + if(b_trans_) redax = {"C", "BH", "BW"}; + else + redax = {"BH", "BW", "N"}; + std::string inc_pb = is_wgrad ? "db[newaxis, :]" : "TK" + ldb0; + std::string a_delta_mem = is_a_deltas_cst ? "__constant__" : ""; + std::string b_delta_mem = is_b_deltas_cst_? "__constant__" : ""; + std::string masks_mem = is_mask_cst_? "__constant__" : ""; std::string res = R"( const tunable int32 TM = {16, 32, 64}; const tunable int32 TN = {16, 32, 64}; const tunable int32 TK = {8}; - - __constant__ int32* delta = alloc_const int32[1024]; - __constant__ int32* masks = alloc_const int32[4096]; + )"; + if(is_a_deltas_cst) + res += "__constant__ int32* delta = alloc_const int32[" + std::to_string(h_a_deltas_.size()) + "];\n"; + if(is_wgrad && is_b_deltas_cst_) + res += "__constant__ int32* b_delta = alloc_const int32[" + std::to_string(h_b_deltas_.size()) + "];\n"; + if(is_mask_cst_) + res += "__constant__ int32* masks = alloc_const int32[" + std::to_string(h_masks_.size()) + "];\n"; + res += R"( void conv(read_only restrict fp32 *a, read_only restrict fp32 *b, @@ -286,13 +372,20 @@ public: int32 lda_n, int32 lda_c, int32 lda_d, int32 lda_h, int32 lda_w, int32 ldb_c, int32 ldb_t, int32 ldb_r, int32 ldb_s, int32 ldb_k, int32 ldc_n, int32 ldc_k, int32 ldc_m, int32 ldc_p, int32 ldc_q, - int32 pad_h, int32 pad_w){ + int32 pad_h, int32 pad_w)"; + if(!is_a_deltas_cst) + res += ", int32* delta\n"; + if(is_wgrad && !is_b_deltas_cst_) + res += ", int32* b_delta\n"; + if(!is_mask_cst_) + res += ", int32* masks\n"; + res += R"(){ int32 rxa[TM] = get_global_range[TM](0); int32 rb0[TN] = get_global_range[TN](1); int32 rka[TK] = 0 ... TK; int32 rkb[TK] = 0 ... 
TK; fp32 C[TM, TN] = 0; - int32 Fs = )" + std::to_string(Fs_) + R"(; + int32 ldlut = )" + std::to_string(Fs_) + R"(; int32 rabh[TM] = rxa / CW; int32 raw[TM] = rxa % CW - pad_w; int32 rab[TM] = rabh / CH; @@ -305,16 +398,31 @@ public: rar = )" + flipr + R"( rar; ras = )" + flips + R"( ras; int32 ra1[TK] = rac*lda_c + rar*lda_h + ras*lda_w; - fp32* pa[TM, TK] = a + ra1[newaxis, :] + ra0[:, newaxis]; - fp32* pb)" + BS + " = b + rkb" + bcb1 + ldb0 + " + rb0" + bcb0 + ldb1 + R"(; - __constant__ int32* pincd[TK] = delta + rka; - __constant__ int32* pd[TK] = delta + Fs + rka; + fp32* pa[TM, TK] = a + ra1[newaxis, :] + ra0[:, newaxis];)"; + if(ty_ == WGRAD){ + res += R"( + int32 rbcr[TK] = rkb / BW; + int32 rbs[TK] = rkb % BW; + int32 rbc[TK] = rbcr / BH; + int32 rbr[TK] = rbcr % BH; + int32 rb1[TK] = rbc*ldb_c + rbr*ldb_r + ras*ldb_s; + )" + b_delta_mem + R"( int32* pdb[TK] = b_delta + rkb; + int32 db[TK] = *pdb;)"; + } + else{ + res += R"( + int32 rb1[TK] = rkb;)"; + } + res += R"( + fp32* pb)" + BS + " = b + rb1" + bcb1 + ldb0 + " + rb0" + bcb0 + ldb1 + R"(; + )" + a_delta_mem + R"( int32* pincd[TK] = delta + rka; + )" + a_delta_mem + R"( int32* pd[TK] = delta + ldlut + rka; int32 d[TK] = *pd; int32 incd[TK] = *pincd; int32 maskh[TM] = pad_h + min(rah, 0) + max(rah + BH - AH, 0); int32 maskw[TM] = pad_w + min(raw, 0) + max(raw + BW - AW, 0); - __constant__ int32* pm[TM] = masks + Fs + maskw*Fs + maskh*Fs*(2*pad_w + 1); - __constant__ int32* pincm[TM] = delta; + )" + masks_mem + R"( int32* pm[TM] = masks + ldlut + maskw*ldlut + maskh*ldlut*(2*pad_w + 1); + )" + a_delta_mem + R"( int32* pincm[TM] = delta; int32 incm[TM] = *pincm; int32 checka0[TM] = *pm; int32 checka1[TK] = 1 << rka; @@ -324,9 +432,15 @@ public: for(int32 k = K; k > 0; k = k - TK){ C = dot(a, )" + useb + R"(, C); pa = pa + d[newaxis, :]; - pb = pb + TK)" + ldb0 + R"(; + pb = pb + )" + inc_pb + R"(; b = *pb; - pd = pd + incd; + pd = pd + incd;)"; + if(ty_ == WGRAD){ + res += R"( + pdb = pdb + incd; + db = *pdb;)"; + } + res += R"( pincd = pincd + incd; d = *pd; incd = *pincd; @@ -342,86 +456,17 @@ public: int32 rc1[TN] = get_global_range[TN](1); int32 rcn[TM] = rxc / (CH*CW); int32 rcpq[TM] = rxc % (CH*CW); - int32 rc0[TM] = rcn * ldc_n + rcpq; + int32 rc0[TM] = rcn * ldc_n + rcpq * ldc_q; fp32* pc[TM, TN] = c + rc1[newaxis, :]*ldc_k + rc0[:, newaxis]; int1 checkc0[TM] = rxc < M; int1 checkc1[TN] = rc1 < N; int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; @checkc *pc = C; })"; + return res; } - // C = A * B - // where A is N,C,AH,AW - // B is N,K,BH,BW - // C is C,CH,CW,K - std::string wgrad() { - std::string res = - R"( - const tunable int32 TM = {16, 32, 64}; - const tunable int32 TN = {16, 32, 64}; - const tunable int32 TK = {8}; - - void conv(read_only restrict fp32 *a, - read_only restrict fp32 *b, - fp32 *c, - int32 M, int32 N, int32 K, - int32 AH, int32 AW, - int32 BH, int32 BW, - int32 CH, int32 CW, - int32 lda_n, int32 lda_c, int32 lda_d, int32 lda_h, int32 lda_w, - int32 ldb_n, int32 ldb_k, int32 ldb_m, int32 ldb_p, int32 ldb_q, - int32 ldc_c, int32 ldc_t, int32 ldc_r, int32 ldc_s, int32 ldc_k, - int32 pad_h, int32 pad_w){ - int32 rxa[TM] = get_global_range[TM](0); - int32 ryb[TN] = get_global_range[TN](1); - int32 rk[TK] = 0 ... 
TK; - fp32 C[TM, TN] = 0; - int32 racr[TM] = rxa / CW; - int32 raw_base[TM] = rxa % CW - pad_w; - int32 rac[TM] = racr / CH; - int32 rah_base[TM] = racr % CH - pad_h; - fp32* pa_base[TM, TK] = a + rac[:, newaxis]*lda_c; - fp32* pb_base[TN, TK] = b + ryb[:, newaxis]*ldb_k; - for(int32 k = K; k > 0; k = k - TK){ - int32 rknp[TK] = rk / BW; - int32 rkq[TK] = rk % BW; - int32 rkn[TK] = rknp / BH; - int32 rkp[TK] = rknp % BH; - int32 rah[TM, TK] = rah_base[:, newaxis] + rkp[newaxis, :]; - int32 raw[TM, TK] = raw_base[:, newaxis] + rkq[newaxis, :]; - int1 checka[TM, TK] = (rah >= 0) && (rah < AH) && (raw >= 0) && (raw < AW); - fp32* pa[TM, TK] = pa_base + rah*lda_h + raw*lda_w + rkn*lda_n; - fp32* pb[TN, TK] = pb_base + rkp*ldb_p + rkq*ldb_q + rkn*ldb_n; - fp32 A[TM, TK] = checka ? *pa : 0; - fp32 B[TN, TK] = *pb; - C = dot(A, trans(B), C); - rk = rk + TK; - } - int32 rxc[TM] = get_global_range[TM](0); - int32 ryc[TN] = get_global_range[TN](1); - int32 rccr[TM] = rxc / CW; - int32 rcs[TM] = rxa % CW; - int32 rcc[TM] = racr / CH; - int32 rcr[TM] = racr % CH; - int32 rc0[TM] = rcc*ldc_c + rcr*ldc_r + rcs*ldc_s; - fp32* pc[TM, TN] = c + rc0[:, newaxis] + ryc[newaxis, :]*ldc_k; - int1 checkc0[TM] = rxc < M; - int1 checkc1[TN] = ryc < N; - int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - @checkc *pc = C; - })"; - return res; - } - - std::string src() { - if(ty_ == FPROP || ty_ == BPROP) - return xprop(); - else - return wgrad(); - } - template void cpu_xprop(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B) { @@ -552,9 +597,20 @@ private: // memory stride for C std::vector shapes_c_; std::vector ld_c_; + // constant memory + std::vector h_a_deltas_; + std::vector h_b_deltas_; + std::vector h_masks_; + driver::buffer* d_a_deltas_; + driver::buffer* d_b_deltas_; + driver::buffer* d_masks_; + bool is_a_deltas_cst; + bool is_b_deltas_cst_; + bool is_mask_cst_; // type type ty_; - bool is_bprop_; + bool b_trans_; + bool b_lut_; }; } From 34f86177095e414ca8829f370041fc50c85ef729 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 16 May 2019 15:48:02 -0400 Subject: [PATCH 148/494] [dnn/conv] fixed formatting of generated Triton-C code --- examples/cpp/conv.cpp | 6 +- include/triton/dnn/conv.h | 225 +++++++++++++++++++------------------- 2 files changed, 115 insertions(+), 116 deletions(-) diff --git a/examples/cpp/conv.cpp b/examples/cpp/conv.cpp index f836edcb4..025ca0d4b 100644 --- a/examples/cpp/conv.cpp +++ b/examples/cpp/conv.cpp @@ -12,9 +12,9 @@ int main() { triton::jit jit(context); triton::dnn::conv::type ty = triton::dnn::conv::WGRAD; // initialization - int32_t B = 4, NF = 32; - int32_t D = 1, H = 24, W = 240; - int32_t NC = 32, T = 1, R = 3, S = 3; + int32_t B = 32, NF = 128; + int32_t D = 1, H = 56, W = 56; + int32_t NC = 128, T = 1, R = 3, S = 3; int32_t pad_d = 0, pad_h = 1, pad_w = 1; triton::dnn::conv configuration(B, NC, D, H, W, T, R, S, NF, 1, 1, 1, pad_d, pad_h, pad_w, ty); // convolution configuration diff --git a/include/triton/dnn/conv.h b/include/triton/dnn/conv.h index 0afa77088..d3c413b41 100644 --- a/include/triton/dnn/conv.h +++ b/include/triton/dnn/conv.h @@ -350,120 +350,119 @@ public: std::string res = R"( - const tunable int32 TM = {16, 32, 64}; - const tunable int32 TN = {16, 32, 64}; - const tunable int32 TK = {8}; - )"; - if(is_a_deltas_cst) - res += "__constant__ int32* delta = alloc_const int32[" + std::to_string(h_a_deltas_.size()) + "];\n"; - if(is_wgrad && is_b_deltas_cst_) - res += "__constant__ int32* b_delta = alloc_const int32[" + 
std::to_string(h_b_deltas_.size()) + "];\n"; - if(is_mask_cst_) - res += "__constant__ int32* masks = alloc_const int32[" + std::to_string(h_masks_.size()) + "];\n"; - res += R"( - - void conv(read_only restrict fp32 *a, - read_only restrict fp32 *b, - fp32 *c, - int32 M, int32 N, int32 K, - int32 AH, int32 AW, - int32 BH, int32 BW, - int32 CH, int32 CW, - int32 lda_n, int32 lda_c, int32 lda_d, int32 lda_h, int32 lda_w, - int32 ldb_c, int32 ldb_t, int32 ldb_r, int32 ldb_s, int32 ldb_k, - int32 ldc_n, int32 ldc_k, int32 ldc_m, int32 ldc_p, int32 ldc_q, - int32 pad_h, int32 pad_w)"; - if(!is_a_deltas_cst) - res += ", int32* delta\n"; - if(is_wgrad && !is_b_deltas_cst_) - res += ", int32* b_delta\n"; - if(!is_mask_cst_) - res += ", int32* masks\n"; - res += R"(){ - int32 rxa[TM] = get_global_range[TM](0); - int32 rb0[TN] = get_global_range[TN](1); - int32 rka[TK] = 0 ... TK; - int32 rkb[TK] = 0 ... TK; - fp32 C[TM, TN] = 0; - int32 ldlut = )" + std::to_string(Fs_) + R"(; - int32 rabh[TM] = rxa / CW; - int32 raw[TM] = rxa % CW - pad_w; - int32 rab[TM] = rabh / CH; - int32 rah[TM] = rabh % CH - pad_h; - int32 ra0[TM] = rab*lda_n + rah*lda_h + raw*lda_w; - int32 ra)" + ax[0] + ax[1] + "[TK] = rka / " + redax[2] + R"(; - int32 ra)" + ax[2] + "[TK] = rka % " + redax[2] + R"(; - int32 ra)" + ax[0] + "[TK] = ra" + ax[0] + ax[1] + " / " + redax[1] + R"(; - int32 ra)" + ax[1] + "[TK] = ra" + ax[0] + ax[1] + " % " + redax[1] + R"(; - rar = )" + flipr + R"( rar; - ras = )" + flips + R"( ras; - int32 ra1[TK] = rac*lda_c + rar*lda_h + ras*lda_w; - fp32* pa[TM, TK] = a + ra1[newaxis, :] + ra0[:, newaxis];)"; - if(ty_ == WGRAD){ - res += R"( - int32 rbcr[TK] = rkb / BW; - int32 rbs[TK] = rkb % BW; - int32 rbc[TK] = rbcr / BH; - int32 rbr[TK] = rbcr % BH; - int32 rb1[TK] = rbc*ldb_c + rbr*ldb_r + ras*ldb_s; - )" + b_delta_mem + R"( int32* pdb[TK] = b_delta + rkb; - int32 db[TK] = *pdb;)"; - } - else{ - res += R"( - int32 rb1[TK] = rkb;)"; - } - res += R"( - fp32* pb)" + BS + " = b + rb1" + bcb1 + ldb0 + " + rb0" + bcb0 + ldb1 + R"(; - )" + a_delta_mem + R"( int32* pincd[TK] = delta + rka; - )" + a_delta_mem + R"( int32* pd[TK] = delta + ldlut + rka; - int32 d[TK] = *pd; - int32 incd[TK] = *pincd; - int32 maskh[TM] = pad_h + min(rah, 0) + max(rah + BH - AH, 0); - int32 maskw[TM] = pad_w + min(raw, 0) + max(raw + BW - AW, 0); - )" + masks_mem + R"( int32* pm[TM] = masks + ldlut + maskw*ldlut + maskh*ldlut*(2*pad_w + 1); - )" + a_delta_mem + R"( int32* pincm[TM] = delta; - int32 incm[TM] = *pincm; - int32 checka0[TM] = *pm; - int32 checka1[TK] = 1 << rka; - int1 checka[TM, TK] = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; - fp32 a[TM, TK] = checka ? *pa : 0; - fp32 b)" + BS + R"( = *pb; - for(int32 k = K; k > 0; k = k - TK){ - C = dot(a, )" + useb + R"(, C); - pa = pa + d[newaxis, :]; - pb = pb + )" + inc_pb + R"(; - b = *pb; - pd = pd + incd;)"; - if(ty_ == WGRAD){ - res += R"( - pdb = pdb + incd; - db = *pdb;)"; - } - res += R"( - pincd = pincd + incd; - d = *pd; - incd = *pincd; - pm = pm + incm; - pincm = pincm + incm; - incm = *pincm; - checka0 = *pm; - checka = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; - checka = checka && (k > TK); - a = checka ? 
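/* checka predicates the A-tile load: lanes whose mask bit is clear read 0
   instead of touching out-of-bounds memory, and the (k > TK) term added in this
   series also zeroes the prefetch issued on the final iteration, whose
   addresses may no longer be valid. */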
*pa : 0; - } - int32 rxc[TM] = get_global_range[TM](0); - int32 rc1[TN] = get_global_range[TN](1); - int32 rcn[TM] = rxc / (CH*CW); - int32 rcpq[TM] = rxc % (CH*CW); - int32 rc0[TM] = rcn * ldc_n + rcpq * ldc_q; - fp32* pc[TM, TN] = c + rc1[newaxis, :]*ldc_k + rc0[:, newaxis]; - int1 checkc0[TM] = rxc < M; - int1 checkc1[TN] = rc1 < N; - int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - @checkc *pc = C; - })"; +const tunable int32 TM = {16, 32, 64}; +const tunable int32 TN = {16, 32, 64}; +const tunable int32 TK = {8}; +)"; +if(is_a_deltas_cst) + res += "__constant__ int32* delta = alloc_const int32[" + std::to_string(h_a_deltas_.size()) + "];\n"; +if(is_wgrad && is_b_deltas_cst_) + res += "__constant__ int32* b_delta = alloc_const int32[" + std::to_string(h_b_deltas_.size()) + "];\n"; +if(is_mask_cst_) + res += "__constant__ int32* masks = alloc_const int32[" + std::to_string(h_masks_.size()) + "];\n"; +res += R"( + void conv(read_only restrict fp32 *a, + read_only restrict fp32 *b, + fp32 *c, + int32 M, int32 N, int32 K, + int32 AH, int32 AW, + int32 BH, int32 BW, + int32 CH, int32 CW, + int32 lda_n, int32 lda_c, int32 lda_d, int32 lda_h, int32 lda_w, + int32 ldb_c, int32 ldb_t, int32 ldb_r, int32 ldb_s, int32 ldb_k, + int32 ldc_n, int32 ldc_k, int32 ldc_m, int32 ldc_p, int32 ldc_q, + int32 pad_h, int32 pad_w)"; +if(!is_a_deltas_cst) + res += ", int32* delta"; +if(is_wgrad && !is_b_deltas_cst_) + res += ", int32* b_delta"; +if(!is_mask_cst_) + res += ", int32* masks"; + res += R"(){ + int32 rxa[TM] = get_global_range[TM](0); + int32 rb0[TN] = get_global_range[TN](1); + int32 rka[TK] = 0 ... TK; + int32 rkb[TK] = 0 ... TK; + fp32 C[TM, TN] = 0; + int32 ldlut = )" + std::to_string(Fs_) + R"(; + int32 rabh[TM] = rxa / CW; + int32 raw[TM] = rxa % CW - pad_w; + int32 rab[TM] = rabh / CH; + int32 rah[TM] = rabh % CH - pad_h; + int32 ra0[TM] = rab*lda_n + rah*lda_h + raw*lda_w; + int32 ra)" + ax[0] + ax[1] + "[TK] = rka / " + redax[2] + R"(; + int32 ra)" + ax[2] + "[TK] = rka % " + redax[2] + R"(; + int32 ra)" + ax[0] + "[TK] = ra" + ax[0] + ax[1] + " / " + redax[1] + R"(; + int32 ra)" + ax[1] + "[TK] = ra" + ax[0] + ax[1] + " % " + redax[1] + R"(; + rar = )" + flipr + R"( rar; + ras = )" + flips + R"( ras; + int32 ra1[TK] = rac*lda_c + rar*lda_h + ras*lda_w; + fp32* pa[TM, TK] = a + ra1[newaxis, :] + ra0[:, newaxis];)"; +if(ty_ == WGRAD){ + res += R"( + int32 rbcr[TK] = rkb / BW; + int32 rbs[TK] = rkb % BW; + int32 rbc[TK] = rbcr / BH; + int32 rbr[TK] = rbcr % BH; + int32 rb1[TK] = rbc*ldb_c + rbr*ldb_r + ras*ldb_s; + )" + b_delta_mem + R"( int32* pdb[TK] = b_delta + rkb; + int32 db[TK] = *pdb;)"; +} +else{ +res += R"( + int32 rb1[TK] = rkb;)"; +} +res += R"( + fp32* pb)" + BS + " = b + rb1" + bcb1 + ldb0 + " + rb0" + bcb0 + ldb1 + R"(; + )" + a_delta_mem + R"( int32* pincd[TK] = delta + rka; + )" + a_delta_mem + R"( int32* pd[TK] = delta + ldlut + rka; + int32 d[TK] = *pd; + int32 incd[TK] = *pincd; + int32 maskh[TM] = pad_h + min(rah, 0) + max(rah + BH - AH, 0); + int32 maskw[TM] = pad_w + min(raw, 0) + max(raw + BW - AW, 0); + )" + masks_mem + R"( int32* pm[TM] = masks + ldlut + maskw*ldlut + maskh*ldlut*(2*pad_w + 1); + )" + a_delta_mem + R"( int32* pincm[TM] = delta; + int32 incm[TM] = *pincm; + int32 checka0[TM] = *pm; + int32 checka1[TK] = 1 << rka; + int1 checka[TM, TK] = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; + fp32 a[TM, TK] = checka ? 
*pa : 0; + fp32 b)" + BS + R"( = *pb; + for(int32 k = K; k > 0; k = k - TK){ + C = dot(a, )" + useb + R"(, C); + pa = pa + d[newaxis, :]; + pb = pb + )" + inc_pb + R"(; + b = *pb; + pd = pd + incd;)"; +if(ty_ == WGRAD){ + res += R"( + pdb = pdb + TK; + db = *pdb;)"; +} + res += R"( + pincd = pincd + incd; + d = *pd; + incd = *pincd; + pm = pm + incm; + pincm = pincm + incm; + incm = *pincm; + checka0 = *pm; + checka = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; + checka = checka && (k > TK); + a = checka ? *pa : 0; + } + int32 rxc[TM] = get_global_range[TM](0); + int32 rc1[TN] = get_global_range[TN](1); + int32 rcn[TM] = rxc / (CH*CW); + int32 rcpq[TM] = rxc % (CH*CW); + int32 rc0[TM] = rcn * ldc_n + rcpq * ldc_q; + fp32* pc[TM, TN] = c + rc1[newaxis, :]*ldc_k + rc0[:, newaxis]; + int1 checkc0[TM] = rxc < M; + int1 checkc1[TN] = rc1 < N; + int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; + @checkc *pc = C; +})"; return res; } From 600aef72d52474c47935e12f71a1bf0f852d7335 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 17 May 2019 12:28:55 -0400 Subject: [PATCH 149/494] [conv/dnn] now created a separate .h and .cpp file --- examples/python/pytorch/conv.cpp | 12 +- include/triton/dnn/conv.h | 539 ++----------------------------- lib/dnn/conv.cpp | 538 ++++++++++++++++++++++++++++++ lib/frontend/jit.cpp | 0 4 files changed, 561 insertions(+), 528 deletions(-) create mode 100644 lib/dnn/conv.cpp delete mode 100644 lib/frontend/jit.cpp diff --git a/examples/python/pytorch/conv.cpp b/examples/python/pytorch/conv.cpp index 09a3f6eaa..f8636c482 100644 --- a/examples/python/pytorch/conv.cpp +++ b/examples/python/pytorch/conv.cpp @@ -45,18 +45,8 @@ torch::Tensor conv_common( // launch info unsigned TM = info.global_range_size[0]; unsigned TN = info.global_range_size[1]; - // initialize constant memory - if(ty != triton::dnn::conv::WGRAD){ - std::vector h_delta; - std::vector h_masks; - configuration.build_deltas(h_delta); - configuration.build_masks(h_masks); - triton::driver::buffer* delta = jit.get_buffer("delta"); - triton::driver::buffer* masks = jit.get_buffer("masks"); - stream->write(delta, false, 0, h_delta.size()*4, h_delta.data()); - stream->write(masks, false, 0, h_masks.size()*4, h_masks.data()); - } // launch info + configuration.init(stream, jit); unsigned nthreads = info.num_threads; std::array grid = configuration.get_grid(TM, TN); configuration.set_arg(kernel, &a, &b, &c); diff --git a/include/triton/dnn/conv.h b/include/triton/dnn/conv.h index d3c413b41..20b430187 100644 --- a/include/triton/dnn/conv.h +++ b/include/triton/dnn/conv.h @@ -23,533 +23,38 @@ public: int T, int R, int S, int NF, int stride_d, int stride_h, int stride_w, int pad_d, int pad_h, int pad_w, - type ty = FPROP) - : NB_(B), NC_(NC), AD_(D), AH_(H), AW_(W), BD_(T), BH_(R), BW_(S), NF_(NF), - stride_d_(stride_d), stride_h_(stride_h), stride_w_(stride_w), - upsample_d_(1), upsample_h_(1), upsample_w_(1), - pad_d_(pad_d), pad_h_(pad_h), pad_w_(pad_w), - ty_(ty) - { - CD_ = (AD_*upsample_d_ - BD_ + 1 + 2*pad_d_ + stride_d_ - 1)/stride_d_; - CH_ = (AH_*upsample_h_ - BH_ + 1 + 2*pad_h_ + stride_h_ - 1)/stride_h_; - CW_ = (AW_*upsample_w_ - BW_ + 1 + 2*pad_w_ + stride_w_ - 1)/stride_w_; - // shapes - shapes_a_ = {NB_, NC_, AD_, AH_, AW_}; - shapes_b_ = {NC_, BD_, BH_, BW_, NF_}; - shapes_c_ = {NB_, NF_, CD_, CH_, CW_}; - // swap a and c for bprop - if(ty_ == BPROP){ - pad_d_ = (CD_ - AD_ + BD_ - 1) / 2; - pad_h_ = (CH_ - AH_ + BH_ - 1) / 2; - pad_w_ = (CW_ - AW_ + BW_ - 1) / 2; - 
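/* The deletions in this hunk move the inline implementation out of
   include/triton/dnn/conv.h and into the new lib/dnn/conv.cpp, essentially
   unchanged; only the declarations that follow remain in the header. */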
shapes_a_.swap(shapes_c_); - } - // swap b and c for wgrad - if(ty_ == WGRAD){ - shapes_b_.swap(shapes_c_); - std::swap(BD_, CD_); - std::swap(BH_, CH_); - std::swap(BW_, CW_); - } - // leading dimensions - auto set_ld = [](const std::vector& shapes, - std::vector& ld) { - size_t size = shapes.size(); - ld.resize(size); - ld[4] = 1; - ld[3] = shapes[4]*ld[4]; - ld[2] = shapes[3]*ld[3]; - ld[1] = shapes[2]*ld[2]; - ld[0] = shapes[1]*ld[1]; - }; - set_ld(shapes_a_, ld_a_); - set_ld(shapes_b_, ld_b_); - set_ld(shapes_c_, ld_c_); - // equivalent matmul - b_trans_ = ty_ != BPROP; - b_lut_ = ty_ == WGRAD; - if(ty_ == WGRAD){ - M_ = shapes_c_[0]*shapes_c_[1]*shapes_c_[2]*shapes_c_[3]; - N_ = shapes_c_[4]; - K_ = shapes_b_[0]*shapes_b_[2]*shapes_b_[3]*shapes_b_[4]; - } - else{ - M_ = shapes_c_[0]*shapes_c_[2]*shapes_c_[3]*shapes_c_[4]; - N_ = shapes_c_[1]; - K_ = shapes_b_[0]*shapes_b_[1]*shapes_b_[2]*shapes_b_[3]; - } - // look-up table info - if(ty_ == FPROP) - Fs_ = shapes_b_[1]*shapes_b_[2]*shapes_b_[3]; - else - Fs_ = K_; - TK_ = 8; - Luts_ = (TK_ + Fs_ - 1) / Fs_ * Fs_; - build_deltas(); - build_masks(); - size_t cst_size = h_b_deltas_.size()*4; - is_b_deltas_cst_ = cst_size < 65536; - cst_size += h_a_deltas_.size()*4; - is_a_deltas_cst = cst_size < 65536; - cst_size += h_masks_.size()*4; - is_mask_cst_ = cst_size < 65536; - } + type ty = FPROP); - size_t a_size() { - return std::accumulate(shapes_a_.begin(), shapes_a_.end(), - 1, std::multiplies()); - } - - size_t b_size() { - return std::accumulate(shapes_b_.begin(), shapes_b_.end(), - 1, std::multiplies()); - } - - size_t c_size() { - return std::accumulate(shapes_c_.begin(), shapes_c_.end(), - 1, std::multiplies()); - } - - std::vector c_shapes() { - return shapes_c_; - } - - void build_deltas(){ - h_a_deltas_.resize(Luts_ + upsample_d_*upsample_h_*upsample_w_*Luts_); - if(b_lut_) - h_b_deltas_.resize(Luts_); - - auto unpack = [&](int32_t ltrs){ - int32_t l = (ty_ == BPROP) ? ltrs % NF_ : ltrs / (BD_*BH_*BW_); - int32_t trs = (ty_ == BPROP) ? 
ltrs / NF_ : ltrs % (BD_*BH_*BW_); - int32_t tr = trs / BW_; - int32_t s = trs % BW_; - int32_t t = tr / BH_; - int32_t r = tr % BH_; - if(ty_ == BPROP){ - r = BH_ - 1 - r; - s = BW_ - 1 - s; - } - return std::make_tuple(l, t, r, s); - }; - - for(size_t i = 0; i < Luts_; ++i) - h_a_deltas_[i] = (((i + TK_) % Luts_) - i); - - size_t Ds0 = Luts_; - size_t Ds1 = upsample_w_; - size_t Ds2 = upsample_h_; - size_t Ds3 = upsample_d_; - for(size_t pd = 0; pd < Ds3; ++pd) - for(size_t ph = 0; ph < Ds2; ++ph) - for(size_t pw = 0; pw < Ds1; ++pw){ - int32_t* deltas_ptr = &h_a_deltas_[Luts_ + pw*Ds0 + ph*Ds0*Ds1 + pd*Ds0*Ds1*Ds2]; - // cumulative increments - for(size_t i = 0; i < Ds0; ++i) { - // unpack - int32_t ctrs = i; - int32_t c, t, r, s; - std::tie(c, t, r, s) = unpack(ctrs); - // next indices - int32_t nextctrs = ctrs + TK_; - int32_t nextc, nextt, nextr, nexts; - std::tie(nextc, nextt, nextr, nexts) = unpack(nextctrs); - // diffs - int32_t cdiff = nextc - c; - int32_t tdiff = (nextt + pd)/upsample_d_ - (t + pd)/upsample_d_; - int32_t rdiff = (nextr + ph)/upsample_h_ - (r + ph)/upsample_h_; - int32_t sdiff = (nexts + pw)/upsample_w_ - (s + pw)/upsample_w_; - // delta pointers - if(ty_ == WGRAD) - deltas_ptr[i] = cdiff*ld_a_[0] + tdiff*ld_a_[2] + rdiff*ld_a_[3] + sdiff*ld_a_[4]; - else - deltas_ptr[i] = cdiff*ld_a_[1] + tdiff*ld_a_[2] + rdiff*ld_a_[3] + sdiff*ld_a_[4]; - } - } - - if(ty_ == WGRAD){ - for(size_t i = 0; i < Ds0; ++i) { - int32_t c, t, r, s; - int32_t nextc, nextt, nextr, nexts; - std::tie(c, t, r, s) = unpack(i); - std::tie(nextc, nextt, nextr, nexts) = unpack(i + TK_); - int32_t cdiff = nextc - c, tdiff = nextt - t, rdiff = nextr - r, sdiff = nexts - s; - h_b_deltas_[i] = cdiff*ld_b_[0] + tdiff*ld_b_[2] + rdiff*ld_b_[3] + sdiff*ld_b_[4]; - } - } - } - - void build_masks(){ - h_masks_.resize(Luts_ + (2*pad_h_+1)*(2*pad_w_+1)*(2*pad_d_+1)*Luts_); - - auto unpack = [&](int32_t ltrs){ - int32_t l = (ty_ == BPROP) ? ltrs % NF_ : ltrs / (BD_*BH_*BW_); - int32_t trs = (ty_ == BPROP) ? 
ltrs / NF_ : ltrs % (BD_*BH_*BW_); - int32_t tr = trs / BW_; - int32_t s = trs % BW_; - int32_t t = tr / BH_; - int32_t r = tr % BH_; - if(ty_ == BPROP){ - r = BH_ - 1 - r; - s = BW_ - 1 - s; - } - return std::make_tuple(l, t, r, s); - }; - size_t Ms0 = Luts_; - size_t Ms1 = 2*pad_w_ + 1; - size_t Ms2 = 2*pad_h_ + 1; - size_t Ms3 = 2*pad_d_ + 1; - for(size_t pd = 0; pd < Ms3; ++pd) - for(size_t ph = 0; ph < Ms2; ++ph) - for(size_t pw = 0; pw < Ms1; ++pw){ - int32_t* masks_ptr = &h_masks_[Luts_ + pw*Ms0 + ph*Ms0*Ms1 + pd*Ms0*Ms1*Ms2]; - for(size_t i = 0; i < Ms0; ++i){ - int32_t l, t, r, s; - int32_t mask = 0x0; - for(size_t j = 0; j < TK_; ++j){ - std::tie(l, t, r, s) = unpack(i + j); - bool in_bounds_d = (t + pd) >= pad_d_ && (t + pd) < (BD_ + pad_d_); - bool in_bounds_h = (r + ph) >= pad_h_ && (r + ph) < (BH_ + pad_h_); - bool in_bounds_w = (s + pw) >= pad_w_ && (s + pw) < (BW_ + pad_w_); - mask |= (in_bounds_d && in_bounds_h && in_bounds_w) << j; - } - masks_ptr[i] = mask; - } - } - for(size_t i = 0; i < Luts_; ++i) - h_masks_[i] = 0x0; - } - - std::array get_grid(size_t TM, size_t TN){ - return {(M_ + TM - 1)/TM, (N_ + TN - 1)/TN, 1}; - } - - size_t get_nflops(){ - return 2.*M_*N_*K_; - } - - void init(driver::stream *stream, triton::jit &jit) { - auto init_lut = [&](bool is_cst, const char *name, std::vector host) -> triton::driver::buffer*{ - if(host.empty()) - return nullptr; - size_t nbytes = host.size()*4; - // get buffer - triton::driver::buffer* buffer; - if(is_cst) - buffer = jit.get_buffer(name); - else - buffer = triton::driver::buffer::create(stream->context(), nbytes); - // copy - stream->write(buffer, false, 0, nbytes, host.data()); - return buffer; - }; - - d_a_deltas_ = init_lut(is_a_deltas_cst, "delta", h_a_deltas_); - d_b_deltas_ = init_lut(is_b_deltas_cst_, "b_delta", h_b_deltas_); - d_masks_ = init_lut(is_mask_cst_, "masks", h_masks_); - } + // accessors + size_t a_size(); + size_t b_size(); + size_t c_size(); + std::vector c_shapes(); + // initialize + void build_deltas(); + void build_masks(); + void init(driver::stream *stream, triton::jit &jit); + std::array get_grid(size_t TM, size_t TN); void set_arg(driver::kernel *kernel, - driver::buffer *a, driver::buffer *b, driver::buffer *c) - { - kernel->setArg(0, a); - kernel->setArg(1, b); - kernel->setArg(2, c); - kernel->setArg(3, M_); - kernel->setArg(4, N_); - kernel->setArg(5, K_); - kernel->setArg(6, AH_); - kernel->setArg(7, AW_); - kernel->setArg(8, BH_); - kernel->setArg(9, BW_); - kernel->setArg(10, CH_); - kernel->setArg(11, CW_); - // A arguments - if(ty_ == WGRAD){ - kernel->setArg(12, ld_a_[1]); - kernel->setArg(13, ld_a_[0]); - } - else{ - kernel->setArg(12, ld_a_[0]); - kernel->setArg(13, ld_a_[1]); - } - kernel->setArg(14, ld_a_[2]); - kernel->setArg(15, ld_a_[3]); - kernel->setArg(16, ld_a_[4]); - // B arguments - if(ty_ == WGRAD){ - kernel->setArg(17, ld_b_[0]); - kernel->setArg(18, ld_b_[2]); - kernel->setArg(19, ld_b_[3]); - kernel->setArg(20, ld_b_[4]); - kernel->setArg(21, ld_b_[1]); - } - else{ - kernel->setArg(17, ld_b_[0]); - kernel->setArg(18, ld_b_[1]); - kernel->setArg(19, ld_b_[2]); - kernel->setArg(20, ld_b_[3]); - kernel->setArg(21, ld_b_[4]); - } - // C arguments - if(ty_ == WGRAD){ - kernel->setArg(22, ld_c_[0]); - kernel->setArg(23, ld_c_[4]); - kernel->setArg(24, ld_c_[1]); - kernel->setArg(25, ld_c_[2]); - kernel->setArg(26, ld_c_[3]); - } - else{ - kernel->setArg(22, ld_c_[0]); - kernel->setArg(23, ld_c_[1]); - kernel->setArg(24, ld_c_[2]); - kernel->setArg(25, ld_c_[3]); - 
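/* The three passes share one kernel signature; set_arg only permutes which
   stride lands in which slot (e.g. for WGRAD: lda swaps n/c, ldb binds
   {0,2,3,4,1}, ldc binds {0,4,1,2,3}), mirroring the shape swaps performed in
   the constructor so the generated source can always index a, b and c the
   same way. */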
kernel->setArg(26, ld_c_[4]); - } - kernel->setArg(27, pad_h_); - kernel->setArg(28, pad_w_); - size_t idx = 29; - if(!is_a_deltas_cst) - kernel->setArg(idx++, d_a_deltas_); - if(!is_b_deltas_cst_) - kernel->setArg(idx++, d_b_deltas_); - if(!is_mask_cst_) - kernel->setArg(idx++, d_masks_); - } + driver::buffer *a, driver::buffer *b, driver::buffer *c); - std::vector default_params() { - if(ty_==FPROP) - return {16, 2, 64, 32, 2, 64, 16, 8, 2, 2, 8, 1, 8, 4}; - else if(ty_ == BPROP) - return {32, 2, 64, 32, 64, 32, 4, 2, 2, 4, 2, 8, 4, 2}; - else if(ty_ == WGRAD) - return {32, 2, 64, 32, 2, 64, 16, 8, 2, 2, 4, 2, 8}; - } + // utilities + size_t get_nflops(); + std::vector default_params(); + // source + std::string src(); - std::string src() { - bool is_wgrad = ty_ == WGRAD; - std::string BS = b_trans_ ? "[TN,TK]" : "[TK, TN]"; - std::string bcb0 = b_trans_ ? "[:, newaxis]" : "[newaxis, :]"; - std::string bcb1 = b_trans_ ? "[newaxis, :]" : "[:, newaxis]"; - std::string ldb0 = b_trans_ ? "*ldb_s" : ""; - std::string ldb1 = b_trans_ ? "*ldb_k" : "*ldb_c"; - std::string useb = b_trans_ ? "trans(b)" : "b"; - std::string flipr = b_trans_ ? "" : "BH - 1 -"; - std::string flips = b_trans_ ? "" : "BW - 1 -"; - std::string ax = b_trans_ ? "crs" : "rsc"; - std::vector redax; - if(b_trans_) - redax = {"C", "BH", "BW"}; - else - redax = {"BH", "BW", "N"}; - std::string inc_pb = is_wgrad ? "db[newaxis, :]" : "TK" + ldb0; - std::string a_delta_mem = is_a_deltas_cst ? "__constant__" : ""; - std::string b_delta_mem = is_b_deltas_cst_? "__constant__" : ""; - std::string masks_mem = is_mask_cst_? "__constant__" : ""; - - std::string res = - R"( -const tunable int32 TM = {16, 32, 64}; -const tunable int32 TN = {16, 32, 64}; -const tunable int32 TK = {8}; -)"; -if(is_a_deltas_cst) - res += "__constant__ int32* delta = alloc_const int32[" + std::to_string(h_a_deltas_.size()) + "];\n"; -if(is_wgrad && is_b_deltas_cst_) - res += "__constant__ int32* b_delta = alloc_const int32[" + std::to_string(h_b_deltas_.size()) + "];\n"; -if(is_mask_cst_) - res += "__constant__ int32* masks = alloc_const int32[" + std::to_string(h_masks_.size()) + "];\n"; -res += R"( - - void conv(read_only restrict fp32 *a, - read_only restrict fp32 *b, - fp32 *c, - int32 M, int32 N, int32 K, - int32 AH, int32 AW, - int32 BH, int32 BW, - int32 CH, int32 CW, - int32 lda_n, int32 lda_c, int32 lda_d, int32 lda_h, int32 lda_w, - int32 ldb_c, int32 ldb_t, int32 ldb_r, int32 ldb_s, int32 ldb_k, - int32 ldc_n, int32 ldc_k, int32 ldc_m, int32 ldc_p, int32 ldc_q, - int32 pad_h, int32 pad_w)"; -if(!is_a_deltas_cst) - res += ", int32* delta"; -if(is_wgrad && !is_b_deltas_cst_) - res += ", int32* b_delta"; -if(!is_mask_cst_) - res += ", int32* masks"; - res += R"(){ - int32 rxa[TM] = get_global_range[TM](0); - int32 rb0[TN] = get_global_range[TN](1); - int32 rka[TK] = 0 ... TK; - int32 rkb[TK] = 0 ... 
TK; - fp32 C[TM, TN] = 0; - int32 ldlut = )" + std::to_string(Fs_) + R"(; - int32 rabh[TM] = rxa / CW; - int32 raw[TM] = rxa % CW - pad_w; - int32 rab[TM] = rabh / CH; - int32 rah[TM] = rabh % CH - pad_h; - int32 ra0[TM] = rab*lda_n + rah*lda_h + raw*lda_w; - int32 ra)" + ax[0] + ax[1] + "[TK] = rka / " + redax[2] + R"(; - int32 ra)" + ax[2] + "[TK] = rka % " + redax[2] + R"(; - int32 ra)" + ax[0] + "[TK] = ra" + ax[0] + ax[1] + " / " + redax[1] + R"(; - int32 ra)" + ax[1] + "[TK] = ra" + ax[0] + ax[1] + " % " + redax[1] + R"(; - rar = )" + flipr + R"( rar; - ras = )" + flips + R"( ras; - int32 ra1[TK] = rac*lda_c + rar*lda_h + ras*lda_w; - fp32* pa[TM, TK] = a + ra1[newaxis, :] + ra0[:, newaxis];)"; -if(ty_ == WGRAD){ - res += R"( - int32 rbcr[TK] = rkb / BW; - int32 rbs[TK] = rkb % BW; - int32 rbc[TK] = rbcr / BH; - int32 rbr[TK] = rbcr % BH; - int32 rb1[TK] = rbc*ldb_c + rbr*ldb_r + ras*ldb_s; - )" + b_delta_mem + R"( int32* pdb[TK] = b_delta + rkb; - int32 db[TK] = *pdb;)"; -} -else{ -res += R"( - int32 rb1[TK] = rkb;)"; -} -res += R"( - fp32* pb)" + BS + " = b + rb1" + bcb1 + ldb0 + " + rb0" + bcb0 + ldb1 + R"(; - )" + a_delta_mem + R"( int32* pincd[TK] = delta + rka; - )" + a_delta_mem + R"( int32* pd[TK] = delta + ldlut + rka; - int32 d[TK] = *pd; - int32 incd[TK] = *pincd; - int32 maskh[TM] = pad_h + min(rah, 0) + max(rah + BH - AH, 0); - int32 maskw[TM] = pad_w + min(raw, 0) + max(raw + BW - AW, 0); - )" + masks_mem + R"( int32* pm[TM] = masks + ldlut + maskw*ldlut + maskh*ldlut*(2*pad_w + 1); - )" + a_delta_mem + R"( int32* pincm[TM] = delta; - int32 incm[TM] = *pincm; - int32 checka0[TM] = *pm; - int32 checka1[TK] = 1 << rka; - int1 checka[TM, TK] = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; - fp32 a[TM, TK] = checka ? *pa : 0; - fp32 b)" + BS + R"( = *pb; - for(int32 k = K; k > 0; k = k - TK){ - C = dot(a, )" + useb + R"(, C); - pa = pa + d[newaxis, :]; - pb = pb + )" + inc_pb + R"(; - b = *pb; - pd = pd + incd;)"; -if(ty_ == WGRAD){ - res += R"( - pdb = pdb + TK; - db = *pdb;)"; -} - res += R"( - pincd = pincd + incd; - d = *pd; - incd = *pincd; - pm = pm + incm; - pincm = pincm + incm; - incm = *pincm; - checka0 = *pm; - checka = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; - checka = checka && (k > TK); - a = checka ? 
*pa : 0; - } - int32 rxc[TM] = get_global_range[TM](0); - int32 rc1[TN] = get_global_range[TN](1); - int32 rcn[TM] = rxc / (CH*CW); - int32 rcpq[TM] = rxc % (CH*CW); - int32 rc0[TM] = rcn * ldc_n + rcpq * ldc_q; - fp32* pc[TM, TN] = c + rc1[newaxis, :]*ldc_k + rc0[:, newaxis]; - int1 checkc0[TM] = rxc < M; - int1 checkc1[TN] = rc1 < N; - int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - @checkc *pc = C; -})"; - return res; - } + // cpu check + template + void cpu_xprop(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B); template - void cpu_xprop(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B) - { - IN_DTYPE acc; - for(int32_t n = 0; n < shapes_c_[0]; ++n) - for(int32_t cf = 0; cf < shapes_c_[1] ; ++cf) - for(int32_t cd = 0 ; cd < shapes_c_[2]; ++cd) - for(int32_t ch = 0 ; ch < shapes_c_[3]; ++ch) - for(int32_t cw = 0; cw < shapes_c_[4]; ++cw) - { - acc = 0; - int32_t d = cd*stride_d_ - pad_d_; - int32_t h = ch*stride_h_ - pad_h_; - int32_t w = cw*stride_w_ - pad_w_; - for(int32_t ac = 0; ac < shapes_a_[1]; ++ac) - for(int32_t bd = 0; bd < shapes_b_[1]; ++bd) - for(int32_t bh = 0; bh < shapes_b_[2]; ++bh) - for(int32_t bw = 0; bw < shapes_b_[3]; ++bw){ - int32_t ad = d + bd; - int32_t ah = h + bh; - int32_t aw = w + bw; - bool in_bounds = (ad >= 0 && ad < shapes_a_[2] && - ah >= 0 && ah < shapes_a_[3] && - aw >= 0 && aw < shapes_a_[4]); - IN_DTYPE a = 0; - if(in_bounds) - a = A[n*ld_a_[0] + ac*ld_a_[1] + ad*ld_a_[2] + ah*ld_a_[3] + aw*ld_a_[4]]; - IN_DTYPE b; - if(ty_==FPROP) - b = B[ac*ld_b_[0] + bd*ld_b_[1] + bh*ld_b_[2] + bw*ld_b_[3] + cf*ld_b_[4]]; - else{ - int32_t bdd = shapes_b_[1] - 1 - bd; - int32_t bhh = shapes_b_[2] - 1 - bh; - int32_t bww = shapes_b_[3] - 1 - bw; - b = B[cf*ld_b_[0] + bdd*ld_b_[1] + bhh*ld_b_[2] + bww*ld_b_[3] + ac*ld_b_[4]]; - } - acc = std::fma(a, b, acc); - } - C[n*ld_c_[0] + cf*ld_c_[1] + cd*ld_c_[2] + ch*ld_c_[3] + cw*ld_c_[4]] = acc; - } - } + void cpu_wgrad(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B); template - void cpu_wgrad(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B) - { - IN_DTYPE acc; - for(int32_t c = 0 ; c < shapes_c_[0]; ++c) - for(int32_t cd = 0; cd < shapes_c_[1]; ++cd) - for(int32_t ch = 0; ch < shapes_c_[2]; ++ch) - for(int32_t cw = 0; cw < shapes_c_[3]; ++cw) - for(int32_t k = 0 ; k < shapes_c_[4]; ++k) - { - acc = 0; - int32_t d = cd*stride_d_ - pad_d_; - int32_t h = ch*stride_h_ - pad_h_; - int32_t w = cw*stride_w_ - pad_w_; - for(int32_t n = 0; n < shapes_b_[0]; ++n) - for(int32_t bd = 0; bd < shapes_b_[2]; ++bd) - for(int32_t bh = 0; bh < shapes_b_[3]; ++bh) - for(int32_t bw = 0; bw < shapes_b_[4]; ++bw){ - int32_t ad = d + bd; - int32_t ah = h + bh; - int32_t aw = w + bw; - bool in_bounds = (ad >= 0 && ad < shapes_a_[2] && - ah >= 0 && ah < shapes_a_[3] && - aw >= 0 && aw < shapes_a_[4]); - IN_DTYPE a = 0; - if(in_bounds) - a = A[n*ld_a_[0] + c*ld_a_[1] + ad*ld_a_[2] + ah*ld_a_[3] + aw*ld_a_[4]]; - IN_DTYPE b = B[n*ld_b_[0] + k*ld_b_[1] + bd*ld_b_[2] + bh*ld_b_[3] + bw*ld_b_[4]]; - acc = std::fma(a, b, acc); - } - C[c*ld_c_[0] + cd*ld_c_[1] + ch*ld_c_[2] + cw*ld_c_[3] + k*ld_c_[4]] = acc; - } - } - - template - void cpu_ref(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B) - { - if(ty_ == FPROP || ty_ == BPROP) - cpu_xprop(C, A, B); - else - cpu_wgrad(C, A, B); - } + void cpu_ref(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B); private: // image size diff --git a/lib/dnn/conv.cpp b/lib/dnn/conv.cpp new file mode 100644 index 000000000..be47b95c5 --- /dev/null +++ b/lib/dnn/conv.cpp @@ -0,0 +1,538 @@ +#include "triton/dnn/conv.h" + +namespace triton{ +namespace 
dnn{
+
+conv::conv(int B, int NC,
+           int D, int H, int W,
+           int T, int R, int S, int NF,
+           int stride_d, int stride_h, int stride_w,
+           int pad_d, int pad_h, int pad_w,
+           type ty)
+  : NB_(B), NC_(NC), AD_(D), AH_(H), AW_(W), BD_(T), BH_(R), BW_(S), NF_(NF),
+    stride_d_(stride_d), stride_h_(stride_h), stride_w_(stride_w),
+    upsample_d_(1), upsample_h_(1), upsample_w_(1),
+    pad_d_(pad_d), pad_h_(pad_h), pad_w_(pad_w),
+    ty_(ty)
+{
+  CD_ = (AD_*upsample_d_ - BD_ + 1 + 2*pad_d_ + stride_d_ - 1)/stride_d_;
+  CH_ = (AH_*upsample_h_ - BH_ + 1 + 2*pad_h_ + stride_h_ - 1)/stride_h_;
+  CW_ = (AW_*upsample_w_ - BW_ + 1 + 2*pad_w_ + stride_w_ - 1)/stride_w_;
+  // shapes
+  shapes_a_ = {NB_, NC_, AD_, AH_, AW_};
+  shapes_b_ = {NC_, BD_, BH_, BW_, NF_};
+  shapes_c_ = {NB_, NF_, CD_, CH_, CW_};
+  // swap a and c for bprop
+  if(ty_ == BPROP){
+    pad_d_ = (CD_ - AD_ + BD_ - 1) / 2;
+    pad_h_ = (CH_ - AH_ + BH_ - 1) / 2;
+    pad_w_ = (CW_ - AW_ + BW_ - 1) / 2;
+    shapes_a_.swap(shapes_c_);
+  }
+  // swap b and c for wgrad
+  if(ty_ == WGRAD){
+    shapes_b_.swap(shapes_c_);
+    std::swap(BD_, CD_);
+    std::swap(BH_, CH_);
+    std::swap(BW_, CW_);
+  }
+  // leading dimensions
+  auto set_ld = [](const std::vector<int32_t>& shapes,
+                   std::vector<int32_t>& ld) {
+    size_t size = shapes.size();
+    ld.resize(size);
+    ld[4] = 1;
+    ld[3] = shapes[4]*ld[4];
+    ld[2] = shapes[3]*ld[3];
+    ld[1] = shapes[2]*ld[2];
+    ld[0] = shapes[1]*ld[1];
+  };
+  set_ld(shapes_a_, ld_a_);
+  set_ld(shapes_b_, ld_b_);
+  set_ld(shapes_c_, ld_c_);
+  // equivalent matmul
+  b_trans_ = ty_ != BPROP;
+  b_lut_ = ty_ == WGRAD;
+  if(ty_ == WGRAD){
+    M_ = shapes_c_[0]*shapes_c_[1]*shapes_c_[2]*shapes_c_[3];
+    N_ = shapes_c_[4];
+    K_ = shapes_b_[0]*shapes_b_[2]*shapes_b_[3]*shapes_b_[4];
+  }
+  else{
+    M_ = shapes_c_[0]*shapes_c_[2]*shapes_c_[3]*shapes_c_[4];
+    N_ = shapes_c_[1];
+    K_ = shapes_b_[0]*shapes_b_[1]*shapes_b_[2]*shapes_b_[3];
+  }
+  // look-up table info
+  if(ty_ == FPROP)
+    Fs_ = shapes_b_[1]*shapes_b_[2]*shapes_b_[3];
+  else
+    Fs_ = K_;
+  TK_ = 8;
+  Luts_ = (TK_ + Fs_ - 1) / Fs_ * Fs_;
+  build_deltas();
+  build_masks();
+  size_t cst_size = h_b_deltas_.size()*4;
+  is_b_deltas_cst_ = cst_size < 65536;
+  cst_size += h_a_deltas_.size()*4;
+  is_a_deltas_cst = cst_size < 65536;
+  cst_size += h_masks_.size()*4;
+  is_mask_cst_ = cst_size < 65536;
+}
+
+size_t conv::a_size()
+{ return std::accumulate(shapes_a_.begin(), shapes_a_.end(),
+                         1, std::multiplies<int>()); }
+
+size_t conv::b_size()
+{ return std::accumulate(shapes_b_.begin(), shapes_b_.end(),
+                         1, std::multiplies<int>()); }
+
+size_t conv::c_size()
+{ return std::accumulate(shapes_c_.begin(), shapes_c_.end(),
+                         1, std::multiplies<int>()); }
+
+std::vector<int32_t> conv::c_shapes()
+{ return shapes_c_; }
+
+void conv::build_deltas(){
+  h_a_deltas_.resize(Luts_ + upsample_d_*upsample_h_*upsample_w_*Luts_);
+  if(b_lut_)
+    h_b_deltas_.resize(Luts_);
+
+  auto unpack = [&](int32_t ltrs){
+    int32_t l = (ty_ == BPROP) ? ltrs % NF_ : ltrs / (BD_*BH_*BW_);
+    int32_t trs = (ty_ == BPROP) ? ltrs / NF_ : ltrs % (BD_*BH_*BW_);
+    int32_t tr = trs / BW_;
+    int32_t s = trs % BW_;
+    int32_t t = tr / BH_;
+    int32_t r = tr % BH_;
+    if(ty_ == BPROP){
+      r = BH_ - 1 - r;
+      s = BW_ - 1 - s;
+    }
+    return std::make_tuple(l, t, r, s);
+  };
+
+  for(size_t i = 0; i < Luts_; ++i)
+    h_a_deltas_[i] = (((i + TK_) % Luts_) - i);
+
+  size_t Ds0 = Luts_;
+  size_t Ds1 = upsample_w_;
+  size_t Ds2 = upsample_h_;
+  size_t Ds3 = upsample_d_;
+  for(size_t pd = 0; pd < Ds3; ++pd)
+  for(size_t ph = 0; ph < Ds2; ++ph)
+  for(size_t pw = 0; pw < Ds1; ++pw){
+    int32_t* deltas_ptr = &h_a_deltas_[Luts_ + pw*Ds0 + ph*Ds0*Ds1 + pd*Ds0*Ds1*Ds2];
+    // cumulative increments
+    for(size_t i = 0; i < Ds0; ++i) {
+      // unpack
+      int32_t ctrs = i;
+      int32_t c, t, r, s;
+      std::tie(c, t, r, s) = unpack(ctrs);
+      // next indices
+      int32_t nextctrs = ctrs + TK_;
+      int32_t nextc, nextt, nextr, nexts;
+      std::tie(nextc, nextt, nextr, nexts) = unpack(nextctrs);
+      // diffs
+      int32_t cdiff = nextc - c;
+      int32_t tdiff = (nextt + pd)/upsample_d_ - (t + pd)/upsample_d_;
+      int32_t rdiff = (nextr + ph)/upsample_h_ - (r + ph)/upsample_h_;
+      int32_t sdiff = (nexts + pw)/upsample_w_ - (s + pw)/upsample_w_;
+      // delta pointers
+      if(ty_ == WGRAD)
+        deltas_ptr[i] = cdiff*ld_a_[0] + tdiff*ld_a_[2] + rdiff*ld_a_[3] + sdiff*ld_a_[4];
+      else
+        deltas_ptr[i] = cdiff*ld_a_[1] + tdiff*ld_a_[2] + rdiff*ld_a_[3] + sdiff*ld_a_[4];
+    }
+  }
+
+  if(ty_ == WGRAD){
+    for(size_t i = 0; i < Ds0; ++i) {
+      int32_t c, t, r, s;
+      int32_t nextc, nextt, nextr, nexts;
+      std::tie(c, t, r, s) = unpack(i);
+      std::tie(nextc, nextt, nextr, nexts) = unpack(i + TK_);
+      int32_t cdiff = nextc - c, tdiff = nextt - t, rdiff = nextr - r, sdiff = nexts - s;
+      h_b_deltas_[i] = cdiff*ld_b_[0] + tdiff*ld_b_[2] + rdiff*ld_b_[3] + sdiff*ld_b_[4];
+    }
+  }
+}
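build_deltas() stores pointer increments rather than absolute offsets: every TK-wide step of the kernel's reduction loop advances the A pointer by delta[i] = offset(unpack(i + TK)) - offset(unpack(i)), so no remultiplication happens inside the loop. A minimal standalone sketch of the same computation, assuming the 2-D FPROP layout with no upsampling (make_a_deltas is a hypothetical helper, not part of the patch; ld plays the role of ld_a_):

    #include <cstdint>
    #include <vector>

    // Pointer increment to apply to A after one TK-wide step along the
    // C*R*S reduction axis (depth omitted for brevity).
    std::vector<int32_t> make_a_deltas(int R, int S, int TK,
                                       const std::vector<int32_t>& ld) {
      int F = R * S;                     // filter taps per input channel
      int luts = (TK + F - 1) / F * F;   // TK rounded up to a multiple of F
      auto offset = [&](int i) {         // flat crs index -> element offset in A
        int c = i / F, rs = i % F;
        return c*ld[1] + (rs / S)*ld[3] + (rs % S)*ld[4];
      };
      std::vector<int32_t> deltas(luts);
      for (int i = 0; i < luts; ++i)
        deltas[i] = offset(i + TK) - offset(i);
      return deltas;
    }

For a 3x3 filter and TK = 8, luts rounds up to 9, which matches Luts_ above.

+void conv::build_masks(){
+  h_masks_.resize(Luts_ + (2*pad_h_+1)*(2*pad_w_+1)*(2*pad_d_+1)*Luts_);
+
+  auto unpack = [&](int32_t ltrs){
+    int32_t l = (ty_ == BPROP) ? ltrs % NF_ : ltrs / (BD_*BH_*BW_);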
+    int32_t trs = (ty_ == BPROP) ? ltrs / NF_ : ltrs % (BD_*BH_*BW_);
+    int32_t tr = trs / BW_;
+    int32_t s = trs % BW_;
+    int32_t t = tr / BH_;
+    int32_t r = tr % BH_;
+    if(ty_ == BPROP){
+      r = BH_ - 1 - r;
+      s = BW_ - 1 - s;
+    }
+    return std::make_tuple(l, t, r, s);
+  };
+  size_t Ms0 = Luts_;
+  size_t Ms1 = 2*pad_w_ + 1;
+  size_t Ms2 = 2*pad_h_ + 1;
+  size_t Ms3 = 2*pad_d_ + 1;
+  for(size_t pd = 0; pd < Ms3; ++pd)
+  for(size_t ph = 0; ph < Ms2; ++ph)
+  for(size_t pw = 0; pw < Ms1; ++pw){
+    int32_t* masks_ptr = &h_masks_[Luts_ + pw*Ms0 + ph*Ms0*Ms1 + pd*Ms0*Ms1*Ms2];
+    for(size_t i = 0; i < Ms0; ++i){
+      int32_t l, t, r, s;
+      int32_t mask = 0x0;
+      for(size_t j = 0; j < TK_; ++j){
+        std::tie(l, t, r, s) = unpack(i + j);
+        bool in_bounds_d = (t + pd) >= pad_d_ && (t + pd) < (BD_ + pad_d_);
+        bool in_bounds_h = (r + ph) >= pad_h_ && (r + ph) < (BH_ + pad_h_);
+        bool in_bounds_w = (s + pw) >= pad_w_ && (s + pw) < (BW_ + pad_w_);
+        mask |= (in_bounds_d && in_bounds_h && in_bounds_w) << j;
+      }
+      masks_ptr[i] = mask;
+    }
+  }
+  for(size_t i = 0; i < Luts_; ++i)
+    h_masks_[i] = 0x0;
+}
+
+std::array<size_t, 3> conv::get_grid(size_t TM, size_t TN)
+{ return {(M_ + TM - 1)/TM, (N_ + TN - 1)/TN, 1}; }
+
+size_t conv::get_nflops()
+{ return 2.*M_*N_*K_; }
+
+void conv::init(driver::stream *stream, triton::jit &jit) {
+  auto init_lut = [&](bool is_cst, const char *name, std::vector<int32_t> host) -> triton::driver::buffer*{
+    if(host.empty())
+      return nullptr;
+    size_t nbytes = host.size()*4;
+    // get buffer
+    triton::driver::buffer* buffer;
+    if(is_cst)
+      buffer = jit.get_buffer(name);
+    else
+      buffer = triton::driver::buffer::create(stream->context(), nbytes);
+    // copy
+    stream->write(buffer, false, 0, nbytes, host.data());
+    return buffer;
+  };
+
+  d_a_deltas_ = init_lut(is_a_deltas_cst, "delta", h_a_deltas_);
+  d_b_deltas_ = init_lut(is_b_deltas_cst_, "b_delta", h_b_deltas_);
+  d_masks_ = init_lut(is_mask_cst_, "masks", h_masks_);
+}
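The constructor's cumulative cst_size test and the two functions around this point decide how each LUT reaches the GPU: a table that still fits the 64 KB constant-memory budget is written through the generated module's __constant__ symbol, while anything past the budget becomes an ordinary buffer passed as a trailing kernel argument. A self-contained sketch of that dispatch, with hypothetical names (lut, upload) that are not part of the patch:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct lut { const char* name; std::vector<int32_t> host; };

    // Mirror of the is_*_cst_ logic: LUTs are admitted to constant memory
    // in order until their cumulative size crosses the 64 KB budget.
    void upload(const std::vector<lut>& luts, size_t first_arg_idx) {
      size_t cst_bytes = 0, idx = first_arg_idx;
      for (const lut& l : luts) {
        cst_bytes += l.host.size() * sizeof(int32_t);
        if (cst_bytes < 65536)
          std::printf("write %s into its __constant__ symbol\n", l.name);
        else
          std::printf("pass %s as kernel argument %zu\n", l.name, idx++);
      }
    }

set_arg() below then appends exactly the non-constant pointers after the 29 fixed arguments, in the same order init() created them.

+void conv::set_arg(driver::kernel *kernel,
+                   driver::buffer *a, driver::buffer *b, driver::buffer *c)
+{
+  kernel->setArg(0, a);
+  kernel->setArg(1, b);
+  kernel->setArg(2, c);
+  kernel->setArg(3, M_);
+  kernel->setArg(4, N_);
+  kernel->setArg(5, K_);
+  kernel->setArg(6, AH_);
+  kernel->setArg(7, AW_);
+  kernel->setArg(8, BH_);
+  kernel->setArg(9, BW_);
+  kernel->setArg(10, CH_);
+  kernel->setArg(11, CW_);
+  // A arguments
+  if(ty_ == WGRAD){
+    kernel->setArg(12, ld_a_[1]);
+    kernel->setArg(13, ld_a_[0]);
+  }
+  else{
+    kernel->setArg(12, ld_a_[0]);
+    kernel->setArg(13, ld_a_[1]);
+  }
+  kernel->setArg(14, ld_a_[2]);
+  kernel->setArg(15, ld_a_[3]);
+  kernel->setArg(16, ld_a_[4]);
+  // B arguments
+  if(ty_ == WGRAD){
+    kernel->setArg(17, ld_b_[0]);
+    kernel->setArg(18, ld_b_[2]);
+    kernel->setArg(19, ld_b_[3]);
+    kernel->setArg(20, ld_b_[4]);
+    kernel->setArg(21, ld_b_[1]);
+  }
+  else{
+    kernel->setArg(17, ld_b_[0]);
+    kernel->setArg(18, ld_b_[1]);
+    kernel->setArg(19, ld_b_[2]);
+    kernel->setArg(20, ld_b_[3]);
+    kernel->setArg(21, ld_b_[4]);
+  }
+  // C arguments
+  if(ty_ == WGRAD){
+    kernel->setArg(22, ld_c_[0]);
+    kernel->setArg(23, ld_c_[4]);
+    kernel->setArg(24, ld_c_[1]);
+    kernel->setArg(25, ld_c_[2]);
+    kernel->setArg(26, ld_c_[3]);
+  }
+  else{
+    kernel->setArg(22, ld_c_[0]);
+    kernel->setArg(23, ld_c_[1]);
+    kernel->setArg(24, ld_c_[2]);
+    kernel->setArg(25, ld_c_[3]);
+    kernel->setArg(26, ld_c_[4]);
+  }
+  kernel->setArg(27, pad_h_);
+  kernel->setArg(28, pad_w_);
+  size_t idx = 29;
+  if(!is_a_deltas_cst)
+    kernel->setArg(idx++, d_a_deltas_);
+  if(!is_b_deltas_cst_)
+    kernel->setArg(idx++, d_b_deltas_);
+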
if(!is_mask_cst_) + kernel->setArg(idx++, d_masks_); +} + +std::vector conv::default_params() { + if(ty_==FPROP) + return {16, 2, 64, 32, 2, 64, 16, 8, 2, 2, 8, 1, 8, 4}; + else if(ty_ == BPROP) + return {32, 2, 64, 32, 64, 32, 4, 2, 2, 4, 2, 8, 4, 2}; + else if(ty_ == WGRAD) + return {32, 2, 64, 32, 2, 64, 16, 8, 2, 2, 4, 2, 8}; +} + + +std::string conv::src() { + bool is_wgrad = ty_ == WGRAD; + std::string BS = b_trans_ ? "[TN,TK]" : "[TK, TN]"; + std::string bcb0 = b_trans_ ? "[:, newaxis]" : "[newaxis, :]"; + std::string bcb1 = b_trans_ ? "[newaxis, :]" : "[:, newaxis]"; + std::string ldb0 = b_trans_ ? "*ldb_s" : ""; + std::string ldb1 = b_trans_ ? "*ldb_k" : "*ldb_c"; + std::string useb = b_trans_ ? "trans(b)" : "b"; + std::string flipr = b_trans_ ? "" : "BH - 1 -"; + std::string flips = b_trans_ ? "" : "BW - 1 -"; + std::string ax = b_trans_ ? "crs" : "rsc"; + std::vector redax; + if(b_trans_) + redax = {"C", "BH", "BW"}; + else + redax = {"BH", "BW", "N"}; + std::string inc_pb = is_wgrad ? "db[newaxis, :]" : "TK" + ldb0; + std::string a_delta_mem = is_a_deltas_cst ? "__constant__" : ""; + std::string b_delta_mem = is_b_deltas_cst_? "__constant__" : ""; + std::string masks_mem = is_mask_cst_? "__constant__" : ""; + + std::string res = + R"( +const tunable int32 TM = {16, 32, 64}; +const tunable int32 TN = {16, 32, 64}; +const tunable int32 TK = {8}; +)"; +if(is_a_deltas_cst) + res += "__constant__ int32* delta = alloc_const int32[" + std::to_string(h_a_deltas_.size()) + "];\n"; +if(is_wgrad && is_b_deltas_cst_) + res += "__constant__ int32* b_delta = alloc_const int32[" + std::to_string(h_b_deltas_.size()) + "];\n"; +if(is_mask_cst_) + res += "__constant__ int32* masks = alloc_const int32[" + std::to_string(h_masks_.size()) + "];\n"; +res += R"( + + void conv(read_only restrict fp32 *a, + read_only restrict fp32 *b, + fp32 *c, + int32 M, int32 N, int32 K, + int32 AH, int32 AW, + int32 BH, int32 BW, + int32 CH, int32 CW, + int32 lda_n, int32 lda_c, int32 lda_d, int32 lda_h, int32 lda_w, + int32 ldb_c, int32 ldb_t, int32 ldb_r, int32 ldb_s, int32 ldb_k, + int32 ldc_n, int32 ldc_k, int32 ldc_m, int32 ldc_p, int32 ldc_q, + int32 pad_h, int32 pad_w)"; +if(!is_a_deltas_cst) + res += ", int32* delta"; +if(is_wgrad && !is_b_deltas_cst_) + res += ", int32* b_delta"; +if(!is_mask_cst_) + res += ", int32* masks"; + res += R"(){ + int32 rxa[TM] = get_global_range[TM](0); + int32 rb0[TN] = get_global_range[TN](1); + int32 rka[TK] = 0 ... TK; + int32 rkb[TK] = 0 ... 
TK; + fp32 C[TM, TN] = 0; + int32 ldlut = )" + std::to_string(Fs_) + R"(; + int32 rabh[TM] = rxa / CW; + int32 raw[TM] = rxa % CW - pad_w; + int32 rab[TM] = rabh / CH; + int32 rah[TM] = rabh % CH - pad_h; + int32 ra0[TM] = rab*lda_n + rah*lda_h + raw*lda_w; + int32 ra)" + ax[0] + ax[1] + "[TK] = rka / " + redax[2] + R"(; + int32 ra)" + ax[2] + "[TK] = rka % " + redax[2] + R"(; + int32 ra)" + ax[0] + "[TK] = ra" + ax[0] + ax[1] + " / " + redax[1] + R"(; + int32 ra)" + ax[1] + "[TK] = ra" + ax[0] + ax[1] + " % " + redax[1] + R"(; + rar = )" + flipr + R"( rar; + ras = )" + flips + R"( ras; + int32 ra1[TK] = rac*lda_c + rar*lda_h + ras*lda_w; + fp32* pa[TM, TK] = a + ra1[newaxis, :] + ra0[:, newaxis];)"; +if(ty_ == WGRAD){ + res += R"( + int32 rbcr[TK] = rkb / BW; + int32 rbs[TK] = rkb % BW; + int32 rbc[TK] = rbcr / BH; + int32 rbr[TK] = rbcr % BH; + int32 rb1[TK] = rbc*ldb_c + rbr*ldb_r + ras*ldb_s; + )" + b_delta_mem + R"( int32* pdb[TK] = b_delta + rkb; + int32 db[TK] = *pdb;)"; +} +else{ +res += R"( + int32 rb1[TK] = rkb;)"; +} +res += R"( + fp32* pb)" + BS + " = b + rb1" + bcb1 + ldb0 + " + rb0" + bcb0 + ldb1 + R"(; + )" + a_delta_mem + R"( int32* pincd[TK] = delta + rka; + )" + a_delta_mem + R"( int32* pd[TK] = delta + ldlut + rka; + int32 d[TK] = *pd; + int32 incd[TK] = *pincd; + int32 maskh[TM] = pad_h + min(rah, 0) + max(rah + BH - AH, 0); + int32 maskw[TM] = pad_w + min(raw, 0) + max(raw + BW - AW, 0); + )" + masks_mem + R"( int32* pm[TM] = masks + ldlut + maskw*ldlut + maskh*ldlut*(2*pad_w + 1); + )" + a_delta_mem + R"( int32* pincm[TM] = delta; + int32 incm[TM] = *pincm; + int32 checka0[TM] = *pm; + int32 checka1[TK] = 1 << rka; + int1 checka[TM, TK] = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; + fp32 a[TM, TK] = checka ? *pa : 0; + fp32 b)" + BS + R"( = *pb; + for(int32 k = K; k > 0; k = k - TK){ + C = dot(a, )" + useb + R"(, C); + pa = pa + d[newaxis, :]; + pb = pb + )" + inc_pb + R"(; + b = *pb; + pd = pd + incd;)"; +if(ty_ == WGRAD){ + res += R"( + pdb = pdb + TK; + db = *pdb;)"; +} + res += R"( + pincd = pincd + incd; + d = *pd; + incd = *pincd; + pm = pm + incm; + pincm = pincm + incm; + incm = *pincm; + checka0 = *pm; + checka = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; + checka = checka && (k > TK); + a = checka ? 
*pa : 0; + } + int32 rxc[TM] = get_global_range[TM](0); + int32 rc1[TN] = get_global_range[TN](1); + int32 rcn[TM] = rxc / (CH*CW); + int32 rcpq[TM] = rxc % (CH*CW); + int32 rc0[TM] = rcn * ldc_n + rcpq * ldc_q; + fp32* pc[TM, TN] = c + rc1[newaxis, :]*ldc_k + rc0[:, newaxis]; + int1 checkc0[TM] = rxc < M; + int1 checkc1[TN] = rc1 < N; + int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; + @checkc *pc = C; +})"; + return res; +} + +template +void conv::cpu_xprop(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B) +{ + IN_DTYPE acc; + for(int32_t n = 0; n < shapes_c_[0]; ++n) + for(int32_t cf = 0; cf < shapes_c_[1] ; ++cf) + for(int32_t cd = 0 ; cd < shapes_c_[2]; ++cd) + for(int32_t ch = 0 ; ch < shapes_c_[3]; ++ch) + for(int32_t cw = 0; cw < shapes_c_[4]; ++cw) + { + acc = 0; + int32_t d = cd*stride_d_ - pad_d_; + int32_t h = ch*stride_h_ - pad_h_; + int32_t w = cw*stride_w_ - pad_w_; + for(int32_t ac = 0; ac < shapes_a_[1]; ++ac) + for(int32_t bd = 0; bd < shapes_b_[1]; ++bd) + for(int32_t bh = 0; bh < shapes_b_[2]; ++bh) + for(int32_t bw = 0; bw < shapes_b_[3]; ++bw){ + int32_t ad = d + bd; + int32_t ah = h + bh; + int32_t aw = w + bw; + bool in_bounds = (ad >= 0 && ad < shapes_a_[2] && + ah >= 0 && ah < shapes_a_[3] && + aw >= 0 && aw < shapes_a_[4]); + IN_DTYPE a = 0; + if(in_bounds) + a = A[n*ld_a_[0] + ac*ld_a_[1] + ad*ld_a_[2] + ah*ld_a_[3] + aw*ld_a_[4]]; + IN_DTYPE b; + if(ty_==FPROP) + b = B[ac*ld_b_[0] + bd*ld_b_[1] + bh*ld_b_[2] + bw*ld_b_[3] + cf*ld_b_[4]]; + else{ + int32_t bdd = shapes_b_[1] - 1 - bd; + int32_t bhh = shapes_b_[2] - 1 - bh; + int32_t bww = shapes_b_[3] - 1 - bw; + b = B[cf*ld_b_[0] + bdd*ld_b_[1] + bhh*ld_b_[2] + bww*ld_b_[3] + ac*ld_b_[4]]; + } + acc = std::fma(a, b, acc); + } + C[n*ld_c_[0] + cf*ld_c_[1] + cd*ld_c_[2] + ch*ld_c_[3] + cw*ld_c_[4]] = acc; + } +} + +template +void conv::cpu_wgrad(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B) +{ + IN_DTYPE acc; + for(int32_t c = 0 ; c < shapes_c_[0]; ++c) + for(int32_t cd = 0; cd < shapes_c_[1]; ++cd) + for(int32_t ch = 0; ch < shapes_c_[2]; ++ch) + for(int32_t cw = 0; cw < shapes_c_[3]; ++cw) + for(int32_t k = 0 ; k < shapes_c_[4]; ++k) + { + acc = 0; + int32_t d = cd*stride_d_ - pad_d_; + int32_t h = ch*stride_h_ - pad_h_; + int32_t w = cw*stride_w_ - pad_w_; + for(int32_t n = 0; n < shapes_b_[0]; ++n) + for(int32_t bd = 0; bd < shapes_b_[2]; ++bd) + for(int32_t bh = 0; bh < shapes_b_[3]; ++bh) + for(int32_t bw = 0; bw < shapes_b_[4]; ++bw){ + int32_t ad = d + bd; + int32_t ah = h + bh; + int32_t aw = w + bw; + bool in_bounds = (ad >= 0 && ad < shapes_a_[2] && + ah >= 0 && ah < shapes_a_[3] && + aw >= 0 && aw < shapes_a_[4]); + IN_DTYPE a = 0; + if(in_bounds) + a = A[n*ld_a_[0] + c*ld_a_[1] + ad*ld_a_[2] + ah*ld_a_[3] + aw*ld_a_[4]]; + IN_DTYPE b = B[n*ld_b_[0] + k*ld_b_[1] + bd*ld_b_[2] + bh*ld_b_[3] + bw*ld_b_[4]]; + acc = std::fma(a, b, acc); + } + C[c*ld_c_[0] + cd*ld_c_[1] + ch*ld_c_[2] + cw*ld_c_[3] + k*ld_c_[4]] = acc; + } +} + +template +void conv::cpu_ref(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B) +{ + if(ty_ == FPROP || ty_ == BPROP) + cpu_xprop(C, A, B); + else + cpu_wgrad(C, A, B); +} + +template void conv::cpu_ref(float*, float*, float*); +template void conv::cpu_xprop(float*, float*, float*); +template void conv::cpu_wgrad(float*, float*, float*); + +} +} diff --git a/lib/frontend/jit.cpp b/lib/frontend/jit.cpp deleted file mode 100644 index e69de29bb..000000000 From b2b55c52c9926245b3264b379532cbc33a0c6341 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 18 May 2019 11:51:49 -0400 Subject: 
[PATCH 150/494] [triton/python/conv]: Added cache for compiled kernels --- examples/cpp/conv.cpp | 12 +- examples/cpp/dot.cpp | 2 +- examples/cpp/shift.cpp | 6 +- examples/python/pytorch/conv.cpp | 74 +++++++---- examples/python/pytorch/main.py | 6 +- include/triton/dnn/conv.h | 142 +++++++++++++++++++- include/triton/jit.h | 117 ----------------- lib/dnn/conv.cpp | 143 +------------------- lib/driver/module.cpp | 8 +- lib/jit.cpp | 216 ------------------------------- 10 files changed, 210 insertions(+), 516 deletions(-) delete mode 100644 include/triton/jit.h delete mode 100644 lib/jit.cpp diff --git a/examples/cpp/conv.cpp b/examples/cpp/conv.cpp index 025ca0d4b..93f42b94e 100644 --- a/examples/cpp/conv.cpp +++ b/examples/cpp/conv.cpp @@ -1,7 +1,7 @@ #include #include #include "common.hpp" -#include "triton/jit.h" +#include "triton/runtime/jit.h" #include "triton/driver/backend.h" #include "triton/driver/stream.h" #include "triton/dnn/conv.h" @@ -10,11 +10,11 @@ int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); triton::jit jit(context); - triton::dnn::conv::type ty = triton::dnn::conv::WGRAD; + triton::dnn::conv::type ty = triton::dnn::conv::FPROP; // initialization - int32_t B = 32, NF = 128; + int32_t B = 4, NF = 32; int32_t D = 1, H = 56, W = 56; - int32_t NC = 128, T = 1, R = 3, S = 3; + int32_t NC = 32, T = 1, R = 3, S = 3; int32_t pad_d = 0, pad_h = 1, pad_w = 1; triton::dnn::conv configuration(B, NC, D, H, W, T, R, S, NF, 1, 1, 1, pad_d, pad_h, pad_w, ty); // convolution configuration @@ -45,7 +45,7 @@ int main() { unsigned TN = info.global_range_size[1]; unsigned nthreads = info.num_threads; std::array grid = configuration.get_grid(TM, TN); - configuration.init(stream, jit); + configuration.init(stream, (triton::driver::cu_module*)kernel->module()); stream->synchronize(); configuration.set_arg(kernel, da, db, dc); stream->enqueue(kernel, grid, {nthreads, 1, 1}); @@ -55,7 +55,7 @@ int main() { return configuration.get_nflops() / ts * 1e-3; }; std::string src = configuration.src(); - jit.autotune("conv", src.c_str(), benchmark); +// jit.autotune("conv", src.c_str(), benchmark); jit.add_module("conv", src.c_str(), configuration.default_params()); triton::driver::kernel* kernel = jit.get_function("conv"); triton::jit::launch_information info = jit.get_launch_info("conv"); diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 3dde373ef..2beee1c8d 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -1,7 +1,7 @@ #include #include #include "common.hpp" -#include "triton/jit.h" +#include "triton/runtime/jit.h" #include "triton/driver/backend.h" #include "triton/driver/stream.h" #include "triton/dnn/gemm.h" diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index 026cdfaea..4391f775b 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -1,7 +1,7 @@ #include #include #include "common.hpp" -#include "triton/jit.h" +#include "triton/runtime/jit.h" #include "triton/driver/backend.h" #include "triton/driver/stream.h" @@ -158,8 +158,8 @@ int main() { unsigned TN = info.global_range_size[1]; unsigned nthreads = info.num_threads; // initialize constant memory - triton::driver::buffer* delta = jit.get_buffer("delta"); - triton::driver::buffer* masks = jit.get_buffer("masks"); + triton::driver::buffer* delta = ((triton::driver::cu_module*)kernel->module())->symbol("delta"); + triton::driver::buffer* masks = ((triton::driver::cu_module*)kernel->module())->symbol("masks"); 
 stream->write(delta, false, 0, h_delta.size()*4, h_delta.data());
 stream->write(masks, false, 0, h_masks.size()*4, h_masks.data());
 stream->synchronize();
diff --git a/examples/python/pytorch/conv.cpp b/examples/python/pytorch/conv.cpp
index f8636c482..577d23ee0 100644
--- a/examples/python/pytorch/conv.cpp
+++ b/examples/python/pytorch/conv.cpp
@@ -2,7 +2,7 @@
 #include
 #include "ATen/cuda/CUDAContext.h"
 #include
-#include "triton/jit.h"
+#include "triton/runtime/jit.h"
 #include "triton/driver/stream.h"
 #include "triton/dnn/conv.h"
@@ -10,6 +10,16 @@
 #define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x " must be contiguous")
 #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
+typedef std::tuple<int32_t, int32_t, int32_t, int32_t, int32_t,
+                   int32_t, int32_t, int32_t, int32_t, int32_t,
+                   int32_t, int32_t, int32_t, int32_t, int32_t,
+                   triton::dnn::conv::type> conv_key_t;
+
+static std::map<CUstream, std::unique_ptr<triton::driver::stream>> m_stream;
+static std::map<conv_key_t, std::unique_ptr<triton::jit>> m_jit;
+static std::map<conv_key_t, std::unique_ptr<triton::dnn::conv>> m_config;
+
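These static maps memoize, per process, driver streams by raw CUstream handle and one configuration and one JIT per full convolution signature, so each shape/stride/padding/type combination is compiled once and every later call reuses the kernel. All three lookups are the same find-or-insert; a generic sketch (get_or_create is illustrative, not part of the patch):

    #include <map>
    #include <memory>

    // make() runs only on a cache miss; for m_jit, that miss is where the
    // one-time add_module() compilation cost is paid.
    template <class Key, class T, class Make>
    T* get_or_create(std::map<Key, std::unique_ptr<T>>& cache,
                     const Key& key, Make make) {
      auto it = cache.find(key);
      if (it == cache.end())
        it = cache.emplace(key, make()).first;
      return it->second.get();
    }

Note that the maps are static and entries are never evicted, which trades memory for compilation latency.

 torch::Tensor conv_common(
     int32_t B, int32_t C, int32_t D, int32_t H, int32_t W,
     int32_t T, int32_t R, int32_t S, int32_t NF,
@@ -18,41 +28,59 @@ torch::Tensor conv_common(
     triton::dnn::conv::type ty,
     torch::Tensor torcha, torch::Tensor torchb
     ) {
-  // Configuration
-  triton::dnn::conv configuration(B, C, D, H, W, T, R, S, NF, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, ty);
+  // Wrap CUDA handles
+  c10::DeviceIndex device = torcha.storage().device().index();
+  // Get stream
+  CUstream custream = (CUstream)at::cuda::getCurrentCUDAStream(device).stream();
+  triton::driver::stream* stream;
+  if(m_stream.find(custream) == m_stream.end())
+    stream = m_stream.emplace(custream, new triton::driver::cu_stream(custream, false)).first->second.get();
+  else
+    stream = m_stream.at(custream).get();
+  // Get context
+  triton::driver::context* ctx = stream->context();
+  // Get configuration
+  conv_key_t key = {B, C, D, H, W, T, R, S, NF, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, ty};
+  triton::dnn::conv* configuration;
+  if(m_config.find(key) == m_config.end())
+    configuration = m_config.emplace(key, new triton::dnn::conv(
+                                       B, C, D, H, W, T, R, S, NF,
+                                       stride_d, stride_h, stride_w,
+                                       pad_d, pad_h, pad_w, ty)).first->second.get();
+  else
+    configuration = m_config.at(key).get();
+  // Get JIT
+  triton::jit* jit;
+  if(m_jit.find(key) == m_jit.end()){
+    jit = m_jit.emplace(key, new triton::jit(ctx)).first->second.get();
+    std::string src = configuration->src();
+    jit->add_module("conv", src.c_str(), configuration->default_params());
+  }
+  else
+    jit = m_jit.at(key).get();
+  // Get memory
+  triton::driver::cu_buffer a(ctx, (CUdeviceptr)torcha.storage().data(), false);
+  triton::driver::cu_buffer b(ctx, (CUdeviceptr)torchb.storage().data(), false);
   // Allocate output
-  std::vector<int32_t> c_shapes = configuration.c_shapes();
+  std::vector<int32_t> c_shapes = configuration->c_shapes();
   torch::Tensor torchc;
   if(ty == triton::dnn::conv::WGRAD)
     torchc = torch::empty({c_shapes[0], c_shapes[2], c_shapes[3], c_shapes[4]}, torch::kFloat).cuda();
   else
     torchc = torch::empty({c_shapes[0], c_shapes[1], c_shapes[3], c_shapes[4]}, torch::kFloat).cuda();
-  // Wrap CUDA handles
-  c10::DeviceIndex device = torchc.storage().device().index();
-  triton::driver::cu_stream sstream((CUstream)at::cuda::getCurrentCUDAStream(device).stream(), false);
-  triton::driver::stream* stream = &sstream;
-  triton::driver::context* ctx = stream->context();
-  triton::driver::cu_buffer a(ctx, (CUdeviceptr)torcha.storage().data(), false);
-  triton::driver::cu_buffer b(ctx, (CUdeviceptr)torchb.storage().data(), false);
   triton::driver::cu_buffer c(ctx, (CUdeviceptr)torchc.storage().data(), false);
-  stream->synchronize();
-  // Create JIT
-  triton::jit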
jit(ctx); - std::string src = configuration.src(); - jit.add_module("conv", src.c_str(), configuration.default_params()); - triton::driver::kernel* kernel = jit.get_function("conv"); - triton::jit::launch_information info = jit.get_launch_info("conv"); + // Add module to JIT + triton::driver::kernel* kernel = jit->get_function("conv"); + triton::jit::launch_information info = jit->get_launch_info("conv"); // launch info unsigned TM = info.global_range_size[0]; unsigned TN = info.global_range_size[1]; // launch info - configuration.init(stream, jit); + configuration->init(stream, (triton::driver::cu_module*)kernel->module()); unsigned nthreads = info.num_threads; - std::array grid = configuration.get_grid(TM, TN); - configuration.set_arg(kernel, &a, &b, &c); - stream->synchronize(); + std::array grid = configuration->get_grid(TM, TN); + configuration->set_arg(kernel, &a, &b, &c); stream->enqueue(kernel, grid, {nthreads, 1, 1}); - stream->synchronize(); return torchc; } diff --git a/examples/python/pytorch/main.py b/examples/python/pytorch/main.py index c0568f8b4..c4601fe0f 100644 --- a/examples/python/pytorch/main.py +++ b/examples/python/pytorch/main.py @@ -1,4 +1,5 @@ import torch +import time torch.manual_seed(0) class TritonConv(torch.autograd.Function): @@ -14,9 +15,9 @@ class TritonConv(torch.autograd.Function): input, weight = ctx.saved_tensors grad_input = grad_weight = None if ctx.needs_input_grad[0]: - grad_input = torch.ops.triton.conv_bprop(grad_output.contiguous(), weight) + grad_input = torch.ops.triton.conv_bprop(grad_output, weight) if ctx.needs_input_grad[1]: - grad_weight = torch.ops.triton.conv_wgrad(input, grad_output.contiguous()) + grad_weight = torch.ops.triton.conv_wgrad(input, grad_output) return grad_input, grad_weight @@ -38,6 +39,7 @@ x.grad.zero_() w.grad.zero_() culoss, cuy, cudx, cudw = run(x, cuw, lambda x, w: torch.nn.functional.conv2d(x, w, padding=1)) + print((tty - cuy).norm(2)) print((ttdx - cudx).norm(2)) print((ttdw.permute(3,0,1,2) - cudw).norm(2)) diff --git a/include/triton/dnn/conv.h b/include/triton/dnn/conv.h index 20b430187..d01007aa6 100644 --- a/include/triton/dnn/conv.h +++ b/include/triton/dnn/conv.h @@ -4,7 +4,6 @@ #include #include "triton/driver/stream.h" #include "triton/driver/kernel.h" -#include "triton/jit.h" namespace triton{ namespace dnn{ @@ -34,7 +33,7 @@ public: // initialize void build_deltas(); void build_masks(); - void init(driver::stream *stream, triton::jit &jit); + void init(driver::stream *stream, driver::cu_module *module); std::array get_grid(size_t TM, size_t TN); void set_arg(driver::kernel *kernel, driver::buffer *a, driver::buffer *b, driver::buffer *c); @@ -44,7 +43,144 @@ public: std::vector default_params(); // source - std::string src(); + std::string src(){ + bool is_wgrad = ty_ == WGRAD; + std::string BS = b_trans_ ? "[TN,TK]" : "[TK, TN]"; + std::string bcb0 = b_trans_ ? "[:, newaxis]" : "[newaxis, :]"; + std::string bcb1 = b_trans_ ? "[newaxis, :]" : "[:, newaxis]"; + std::string ldb0 = b_trans_ ? "*ldb_s" : ""; + std::string ldb1 = b_trans_ ? "*ldb_k" : "*ldb_c"; + std::string useb = b_trans_ ? "trans(b)" : "b"; + std::string flipr = b_trans_ ? "" : "BH - 1 -"; + std::string flips = b_trans_ ? "" : "BW - 1 -"; + std::string ax = b_trans_ ? "crs" : "rsc"; + std::vector redax; + if(b_trans_) + redax = {"C", "BH", "BW"}; + else + redax = {"BH", "BW", "N"}; + std::string inc_pb = is_wgrad ? "db[newaxis, :]" : "TK" + ldb0; + std::string a_delta_mem = is_a_deltas_cst ? 
"__constant__" : ""; + std::string b_delta_mem = is_b_deltas_cst_? "__constant__" : ""; + std::string masks_mem = is_mask_cst_? "__constant__" : ""; + + std::string res = + R"( + const tunable int32 TM = {16, 32, 64}; + const tunable int32 TN = {16, 32, 64}; + const tunable int32 TK = {8}; + )"; + if(is_a_deltas_cst) + res += "__constant__ int32* delta = alloc_const int32[" + std::to_string(h_a_deltas_.size()) + "];\n"; + if(is_wgrad && is_b_deltas_cst_) + res += "__constant__ int32* b_delta = alloc_const int32[" + std::to_string(h_b_deltas_.size()) + "];\n"; + if(is_mask_cst_) + res += "__constant__ int32* masks = alloc_const int32[" + std::to_string(h_masks_.size()) + "];\n"; + res += R"( + + void conv(read_only restrict fp32 *a, + read_only restrict fp32 *b, + fp32 *c, + int32 M, int32 N, int32 K, + int32 AH, int32 AW, + int32 BH, int32 BW, + int32 CH, int32 CW, + int32 lda_n, int32 lda_c, int32 lda_d, int32 lda_h, int32 lda_w, + int32 ldb_c, int32 ldb_t, int32 ldb_r, int32 ldb_s, int32 ldb_k, + int32 ldc_n, int32 ldc_k, int32 ldc_m, int32 ldc_p, int32 ldc_q, + int32 pad_h, int32 pad_w)"; + if(!is_a_deltas_cst) + res += ", int32* delta"; + if(is_wgrad && !is_b_deltas_cst_) + res += ", int32* b_delta"; + if(!is_mask_cst_) + res += ", int32* masks"; + res += R"(){ + int32 rxa[TM] = get_global_range[TM](0); + int32 rb0[TN] = get_global_range[TN](1); + int32 rka[TK] = 0 ... TK; + int32 rkb[TK] = 0 ... TK; + fp32 C[TM, TN] = 0; + int32 ldlut = )" + std::to_string(Fs_) + R"(; + int32 rabh[TM] = rxa / CW; + int32 raw[TM] = rxa % CW - pad_w; + int32 rab[TM] = rabh / CH; + int32 rah[TM] = rabh % CH - pad_h; + int32 ra0[TM] = rab*lda_n + rah*lda_h + raw*lda_w; + int32 ra)" + ax[0] + ax[1] + "[TK] = rka / " + redax[2] + R"(; + int32 ra)" + ax[2] + "[TK] = rka % " + redax[2] + R"(; + int32 ra)" + ax[0] + "[TK] = ra" + ax[0] + ax[1] + " / " + redax[1] + R"(; + int32 ra)" + ax[1] + "[TK] = ra" + ax[0] + ax[1] + " % " + redax[1] + R"(; + rar = )" + flipr + R"( rar; + ras = )" + flips + R"( ras; + int32 ra1[TK] = rac*lda_c + rar*lda_h + ras*lda_w; + fp32* pa[TM, TK] = a + ra1[newaxis, :] + ra0[:, newaxis];)"; + if(ty_ == WGRAD){ + res += R"( + int32 rbcr[TK] = rkb / BW; + int32 rbs[TK] = rkb % BW; + int32 rbc[TK] = rbcr / BH; + int32 rbr[TK] = rbcr % BH; + int32 rb1[TK] = rbc*ldb_c + rbr*ldb_r + ras*ldb_s; + )" + b_delta_mem + R"( int32* pdb[TK] = b_delta + rkb; + int32 db[TK] = *pdb;)"; + } + else{ + res += R"( + int32 rb1[TK] = rkb;)"; + } + res += R"( + fp32* pb)" + BS + " = b + rb1" + bcb1 + ldb0 + " + rb0" + bcb0 + ldb1 + R"(; + )" + a_delta_mem + R"( int32* pincd[TK] = delta + rka; + )" + a_delta_mem + R"( int32* pd[TK] = delta + ldlut + rka; + int32 d[TK] = *pd; + int32 incd[TK] = *pincd; + int32 maskh[TM] = pad_h + min(rah, 0) + max(rah + BH - AH, 0); + int32 maskw[TM] = pad_w + min(raw, 0) + max(raw + BW - AW, 0); + )" + masks_mem + R"( int32* pm[TM] = masks + ldlut + maskw*ldlut + maskh*ldlut*(2*pad_w + 1); + )" + a_delta_mem + R"( int32* pincm[TM] = delta; + int32 incm[TM] = *pincm; + int32 checka0[TM] = *pm; + int32 checka1[TK] = 1 << rka; + int1 checka[TM, TK] = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; + fp32 a[TM, TK] = checka ? 
*pa : 0; + fp32 b)" + BS + R"( = *pb; + for(int32 k = K; k > 0; k = k - TK){ + C = dot(a, )" + useb + R"(, C); + pa = pa + d[newaxis, :]; + pb = pb + )" + inc_pb + R"(; + b = *pb; + pd = pd + incd;)"; + if(ty_ == WGRAD){ + res += R"( + pdb = pdb + TK; + db = *pdb;)"; + } + res += R"( + pincd = pincd + incd; + d = *pd; + incd = *pincd; + pm = pm + incm; + pincm = pincm + incm; + incm = *pincm; + checka0 = *pm; + checka = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; + checka = checka && (k > TK); + a = checka ? *pa : 0; + } + int32 rxc[TM] = get_global_range[TM](0); + int32 rc1[TN] = get_global_range[TN](1); + int32 rcn[TM] = rxc / (CH*CW); + int32 rcpq[TM] = rxc % (CH*CW); + int32 rc0[TM] = rcn * ldc_n + rcpq * ldc_q; + fp32* pc[TM, TN] = c + rc1[newaxis, :]*ldc_k + rc0[:, newaxis]; + int1 checkc0[TM] = rxc < M; + int1 checkc1[TN] = rc1 < N; + int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; + @checkc *pc = C; + })"; + return res; + } // cpu check template diff --git a/include/triton/jit.h b/include/triton/jit.h deleted file mode 100644 index a3e554c67..000000000 --- a/include/triton/jit.h +++ /dev/null @@ -1,117 +0,0 @@ -#ifndef TDL_INCLUDE_JIT_H -#define TDL_INCLUDE_JIT_H - -#include -#include -#include "llvm/IR/LLVMContext.h" -#include "triton/ir/context.h" -#include "triton/ir/print.h" -#include "triton/driver/module.h" -#include "triton/driver/kernel.h" -#include "triton/codegen/selection.h" -#include "triton/codegen/tune.h" -#include "triton/codegen/optimize_dot.h" -#include "triton/codegen/optimize_cse.h" -#include "triton/codegen/optimize_trans.h" -#include "triton/codegen/shmem_allocation.h" -#include "triton/codegen/shmem_liveness.h" -#include "triton/codegen/shmem_info.h" -#include "triton/codegen/shmem_barriers.h" -#include "triton/codegen/target.h" -#include "triton/codegen/vectorize.h" -#include - -namespace llvm { - class Module; -} - -namespace triton { - -namespace codegen{ -class tune; -} - -namespace ir { -class module; -class context; -class metaparameter; -} - -class jit { -public: - struct launch_information{ - std::vector global_range_size; - unsigned num_threads; - }; - typedef std::function benchmark_t; - - struct passes_wrapper { - passes_wrapper(codegen::target* target) - : shmem_liveness(&shmem_info), - shmem_allocation(&shmem_liveness, &shmem_info), - shmem_barriers(&shmem_allocation, &shmem_info), - vectorize(&tune), - selection(&shmem_allocation, &tune, &shmem_info, target), - optimize_dot(&tune), - optimize_cse(), - optimize_trans(), - target_(target) { } - - void target_independent(ir::module &module) { - optimize_dot.run(module); - optimize_trans.run(module); - } - - void target_dependent(ir::module &module) { - if(target_->is_gpu()){ - shmem_info.run(module); - shmem_liveness.run(module); - shmem_allocation.run(); - shmem_barriers.run(module); - } - vectorize.run(module); - } - - codegen::tune tune; - codegen::shmem_info shmem_info; - codegen::shmem_liveness shmem_liveness; - codegen::shmem_allocation shmem_allocation; - codegen::shmem_barriers shmem_barriers; - codegen::vectorize vectorize; - codegen::selection selection; - codegen::optimize_dot optimize_dot; - codegen::optimize_cse optimize_cse; - codegen::optimize_trans optimize_trans; - codegen::target* target_; - }; - -private: - std::string compute_data_layout(bool is_64bit = true, bool use_short_pointers = true); - std::unique_ptr make_llvm_module(triton::ir::module &module, passes_wrapper &passes); - std::unique_ptr make_triton_module(const char* name, const char* src); - 
-public: - jit(driver::context* context); - ~jit(); - void autotune(const char* name, const char* src, benchmark_t benchmark); - void add_module(ir::module &module, const std::vector& params = {}); - void add_module(const char* name, const char* src, const std::vector& params = {}); - driver::kernel* get_function(const char* name); - launch_information get_launch_info(const char* name); - unsigned get_int(const char* name); - driver::buffer* get_buffer(const char* name); - -private: - std::vector modules_; - driver::context* driver_context_; - llvm::LLVMContext llvm_context_; - ir::context triton_context_; - std::map launch_info_map_; - std::map global_ints_; - std::unique_ptr target_; -}; - - -} - -#endif diff --git a/lib/dnn/conv.cpp b/lib/dnn/conv.cpp index be47b95c5..2c551241c 100644 --- a/lib/dnn/conv.cpp +++ b/lib/dnn/conv.cpp @@ -207,7 +207,7 @@ std::array conv::get_grid(size_t TM, size_t TN) size_t conv::get_nflops() { return 2.*M_*N_*K_; } -void conv::init(driver::stream *stream, triton::jit &jit) { +void conv::init(driver::stream *stream, triton::driver::cu_module* module) { auto init_lut = [&](bool is_cst, const char *name, std::vector host) -> triton::driver::buffer*{ if(host.empty()) return nullptr; @@ -215,7 +215,7 @@ void conv::init(driver::stream *stream, triton::jit &jit) { // get buffer triton::driver::buffer* buffer; if(is_cst) - buffer = jit.get_buffer(name); + buffer = module->symbol(name); else buffer = triton::driver::buffer::create(stream->context(), nbytes); // copy @@ -306,145 +306,6 @@ std::vector conv::default_params() { } -std::string conv::src() { - bool is_wgrad = ty_ == WGRAD; - std::string BS = b_trans_ ? "[TN,TK]" : "[TK, TN]"; - std::string bcb0 = b_trans_ ? "[:, newaxis]" : "[newaxis, :]"; - std::string bcb1 = b_trans_ ? "[newaxis, :]" : "[:, newaxis]"; - std::string ldb0 = b_trans_ ? "*ldb_s" : ""; - std::string ldb1 = b_trans_ ? "*ldb_k" : "*ldb_c"; - std::string useb = b_trans_ ? "trans(b)" : "b"; - std::string flipr = b_trans_ ? "" : "BH - 1 -"; - std::string flips = b_trans_ ? "" : "BW - 1 -"; - std::string ax = b_trans_ ? "crs" : "rsc"; - std::vector redax; - if(b_trans_) - redax = {"C", "BH", "BW"}; - else - redax = {"BH", "BW", "N"}; - std::string inc_pb = is_wgrad ? "db[newaxis, :]" : "TK" + ldb0; - std::string a_delta_mem = is_a_deltas_cst ? "__constant__" : ""; - std::string b_delta_mem = is_b_deltas_cst_? "__constant__" : ""; - std::string masks_mem = is_mask_cst_? 
"__constant__" : ""; - - std::string res = - R"( -const tunable int32 TM = {16, 32, 64}; -const tunable int32 TN = {16, 32, 64}; -const tunable int32 TK = {8}; -)"; -if(is_a_deltas_cst) - res += "__constant__ int32* delta = alloc_const int32[" + std::to_string(h_a_deltas_.size()) + "];\n"; -if(is_wgrad && is_b_deltas_cst_) - res += "__constant__ int32* b_delta = alloc_const int32[" + std::to_string(h_b_deltas_.size()) + "];\n"; -if(is_mask_cst_) - res += "__constant__ int32* masks = alloc_const int32[" + std::to_string(h_masks_.size()) + "];\n"; -res += R"( - - void conv(read_only restrict fp32 *a, - read_only restrict fp32 *b, - fp32 *c, - int32 M, int32 N, int32 K, - int32 AH, int32 AW, - int32 BH, int32 BW, - int32 CH, int32 CW, - int32 lda_n, int32 lda_c, int32 lda_d, int32 lda_h, int32 lda_w, - int32 ldb_c, int32 ldb_t, int32 ldb_r, int32 ldb_s, int32 ldb_k, - int32 ldc_n, int32 ldc_k, int32 ldc_m, int32 ldc_p, int32 ldc_q, - int32 pad_h, int32 pad_w)"; -if(!is_a_deltas_cst) - res += ", int32* delta"; -if(is_wgrad && !is_b_deltas_cst_) - res += ", int32* b_delta"; -if(!is_mask_cst_) - res += ", int32* masks"; - res += R"(){ - int32 rxa[TM] = get_global_range[TM](0); - int32 rb0[TN] = get_global_range[TN](1); - int32 rka[TK] = 0 ... TK; - int32 rkb[TK] = 0 ... TK; - fp32 C[TM, TN] = 0; - int32 ldlut = )" + std::to_string(Fs_) + R"(; - int32 rabh[TM] = rxa / CW; - int32 raw[TM] = rxa % CW - pad_w; - int32 rab[TM] = rabh / CH; - int32 rah[TM] = rabh % CH - pad_h; - int32 ra0[TM] = rab*lda_n + rah*lda_h + raw*lda_w; - int32 ra)" + ax[0] + ax[1] + "[TK] = rka / " + redax[2] + R"(; - int32 ra)" + ax[2] + "[TK] = rka % " + redax[2] + R"(; - int32 ra)" + ax[0] + "[TK] = ra" + ax[0] + ax[1] + " / " + redax[1] + R"(; - int32 ra)" + ax[1] + "[TK] = ra" + ax[0] + ax[1] + " % " + redax[1] + R"(; - rar = )" + flipr + R"( rar; - ras = )" + flips + R"( ras; - int32 ra1[TK] = rac*lda_c + rar*lda_h + ras*lda_w; - fp32* pa[TM, TK] = a + ra1[newaxis, :] + ra0[:, newaxis];)"; -if(ty_ == WGRAD){ - res += R"( - int32 rbcr[TK] = rkb / BW; - int32 rbs[TK] = rkb % BW; - int32 rbc[TK] = rbcr / BH; - int32 rbr[TK] = rbcr % BH; - int32 rb1[TK] = rbc*ldb_c + rbr*ldb_r + ras*ldb_s; - )" + b_delta_mem + R"( int32* pdb[TK] = b_delta + rkb; - int32 db[TK] = *pdb;)"; -} -else{ -res += R"( - int32 rb1[TK] = rkb;)"; -} -res += R"( - fp32* pb)" + BS + " = b + rb1" + bcb1 + ldb0 + " + rb0" + bcb0 + ldb1 + R"(; - )" + a_delta_mem + R"( int32* pincd[TK] = delta + rka; - )" + a_delta_mem + R"( int32* pd[TK] = delta + ldlut + rka; - int32 d[TK] = *pd; - int32 incd[TK] = *pincd; - int32 maskh[TM] = pad_h + min(rah, 0) + max(rah + BH - AH, 0); - int32 maskw[TM] = pad_w + min(raw, 0) + max(raw + BW - AW, 0); - )" + masks_mem + R"( int32* pm[TM] = masks + ldlut + maskw*ldlut + maskh*ldlut*(2*pad_w + 1); - )" + a_delta_mem + R"( int32* pincm[TM] = delta; - int32 incm[TM] = *pincm; - int32 checka0[TM] = *pm; - int32 checka1[TK] = 1 << rka; - int1 checka[TM, TK] = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; - fp32 a[TM, TK] = checka ? 
*pa : 0; - fp32 b)" + BS + R"( = *pb; - for(int32 k = K; k > 0; k = k - TK){ - C = dot(a, )" + useb + R"(, C); - pa = pa + d[newaxis, :]; - pb = pb + )" + inc_pb + R"(; - b = *pb; - pd = pd + incd;)"; -if(ty_ == WGRAD){ - res += R"( - pdb = pdb + TK; - db = *pdb;)"; -} - res += R"( - pincd = pincd + incd; - d = *pd; - incd = *pincd; - pm = pm + incm; - pincm = pincm + incm; - incm = *pincm; - checka0 = *pm; - checka = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; - checka = checka && (k > TK); - a = checka ? *pa : 0; - } - int32 rxc[TM] = get_global_range[TM](0); - int32 rc1[TN] = get_global_range[TN](1); - int32 rcn[TM] = rxc / (CH*CW); - int32 rcpq[TM] = rxc % (CH*CW); - int32 rc0[TM] = rcn * ldc_n + rcpq * ldc_q; - fp32* pc[TM, TN] = c + rc1[newaxis, :]*ldc_k + rc0[:, newaxis]; - int1 checkc0[TM] = rxc < M; - int1 checkc1[TN] = rc1 < N; - int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - @checkc *pc = C; -})"; - return res; -} - template void conv::cpu_xprop(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B) { diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 1df832aeb..3f595b318 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -100,10 +100,10 @@ module* module::create(driver::context* ctx, llvm::Module *src) { } void module::compile_llvm_module(llvm::Module* module, const std::string& triple, - const std::string &proc, std::string layout, - llvm::SmallVectorImpl &buffer, - const std::string& features, - file_type_t ft) { + const std::string &proc, std::string layout, + llvm::SmallVectorImpl &buffer, + const std::string& features, + file_type_t ft) { init_llvm(); // debug // llvm::legacy::PassManager pm; diff --git a/lib/jit.cpp b/lib/jit.cpp deleted file mode 100644 index 059f96a00..000000000 --- a/lib/jit.cpp +++ /dev/null @@ -1,216 +0,0 @@ -#include "triton/jit.h" -#include -#include "triton/ast/ast.h" -#include "triton/codegen/target.h" -#include "triton/ir/context.h" -#include "triton/ir/context_impl.h" -#include "triton/driver/device.h" -#include "triton/driver/error.h" -#include "llvm/IR/IRPrintingPasses.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/PassManager.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/TargetSelect.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/Transforms/Scalar/EarlyCSE.h" -#include "llvm/Analysis/LoopPass.h" - -typedef struct yy_buffer_state * YY_BUFFER_STATE; -extern int yyparse(); -extern YY_BUFFER_STATE yy_scan_string(const char * str); -extern void yy_delete_buffer(YY_BUFFER_STATE buffer); -using triton::ast::translation_unit; -extern translation_unit *ast_root; - -namespace triton { - -void loop_nest(std::vector const & ranges, std::function const &)> const & f){ - size_t D = ranges.size(); - std::vector values(D, 0); - // Start with innermost loop - size_t i = D - 1; - while(true){ - //Execute function - f(values); - //Increment counters - while(values[i]++ == ranges[i] - 1){ - if(i == 0) - return; - values[i--] = 0; - } - i = D - 1; - } -} - -template -void loop_nest(std::vector> const & iterates, std::function)> const & f){ - //Ranges to iterate over - std::vector ranges; - for(auto const & x: iterates) - ranges.push_back(x.size()); - //Proxy function - auto proxy = [&](std::vector const & idx){ - std::vector x(iterates.size()); - for(size_t i = 0; i < x.size(); ++i) - x[i] = 
iterates[i][idx[i]]; - f(x); - }; - //Iterate - loop_nest(ranges, proxy); -} - - - - -std::unique_ptr jit::make_llvm_module(ir::module &module, passes_wrapper &passes) { - llvm::Module* result = new llvm::Module(module.get_name(), llvm_context_); - passes.selection.run(module, *result); - // launch information - launch_information& info = launch_info_map_[result->getName()]; - info.global_range_size.clear(); - for(unsigned i = 0; i < passes.tune.get_num_global_range(); i++) - info.global_range_size.push_back(passes.tune.get_global_range_size(i)); - info.num_threads = passes.tune.get_num_threads(); - return std::unique_ptr(result); -} - -std::unique_ptr jit::make_triton_module(const char *name, const char *src) { - // create AST from Triton-C source - YY_BUFFER_STATE buffer = yy_scan_string(src); - yyparse(); - yy_delete_buffer(buffer); - translation_unit *program = ast_root; - // create Triton-IR from AST - ir::module* module = new ir::module(name, triton_context_); - program->codegen(module); - return std::unique_ptr(module); -} - - -jit::jit(driver::context *context): driver_context_(context), - target_(context->device()->make_target()) { } - -jit::~jit(){ } - -void jit::autotune(const char *name, const char *src, benchmark_t benchmark) { - // find metaparameters - auto ptt_module = make_triton_module(name, src); - ir::module &tt_module = *ptt_module; - // set parameters - passes_wrapper passes(target_.get()); - passes.target_independent(tt_module); - passes.tune.run(tt_module); - auto mps = passes.tune.get_params(tt_module); - // create parameter ranges - std::vector> ranges; - for(ir::metaparameter *mp: mps) - ranges.push_back(mp->get_space()); -// std::cout << ranges.size() << std::endl; - // iterate over parameters - unsigned i; - double best = 0; - loop_nest(ranges, [&](const std::vector params){ - std::map> errors; - i = 0; - for(ir::metaparameter *mp: mps) - mp->set_value(params[i++]); - passes.target_independent(tt_module); - passes.tune.init(tt_module); - if(!passes.tune.check_constraints(errors)) - return; - // Deep copy of the module and tuner - auto ptt_module = make_triton_module(name, src); - ir::module &tt_module = *ptt_module; - passes_wrapper passes(target_.get()); - passes.target_independent(tt_module); - passes.tune.run(tt_module); - i = 0; - for(ir::metaparameter* mp: passes.tune.get_params(tt_module)){ - mp->set_value(params[i++]); - } - passes.tune.init(tt_module); - passes.target_dependent(tt_module); - driver::device* device = driver_context_->device(); - if(passes.shmem_allocation.get_allocated_size() > device->max_shared_memory()) - return; - if(passes.tune.get_num_threads() > device->max_threads_per_block()) - return; - // Compile - auto ll_module = make_llvm_module(tt_module, passes); - std::unique_ptr module(driver::module::create(driver_context_, &*ll_module)); - std::unique_ptr kernel(driver::kernel::create(module.get(), name)); - launch_information info = launch_info_map_.at(name); - for(unsigned p: params) - std::cout << p << " " << std::flush; - // add globals - for(auto x: tt_module.globals()) - global_ints_[x.first] = ((ir::metaparameter*)x.second)->get_value(); - modules_.push_back(module.get()); - double perf; - perf = benchmark(kernel.get(), info); - best = std::max(perf, best); - std::cout << perf << " [ " << best << " ] " << std::endl; - modules_.pop_back(); - }); -} - -void jit::add_module(ir::module &tt_module, const std::vector ¶ms) { - // set parameters - passes_wrapper passes(target_.get()); - passes.target_independent(tt_module); - 
passes.tune.run(tt_module); - unsigned i = 0; - for(ir::metaparameter* mp: passes.tune.get_params(tt_module)) - mp->set_value(params[i++]); - passes.tune.init(tt_module); - passes.target_dependent(tt_module); - // check constraints - std::map> errors; - passes.tune.check_constraints(errors); - for(auto x: errors){ - std::cout << x.first << std::endl; - for(auto str: x.second) - std::cout << str << std::endl; - } - if(errors.size()) - throw std::runtime_error("invalid parameters"); -// driver::device* device = driver_context_->device(); -// if(passes.allocation.get_allocated_size() > device->max_shared_memory()) -// throw std::runtime_error("invalid parameters"); - // triton module -> llvm module - auto ll_module = make_llvm_module(tt_module, passes); - // llvm module -> machine code - modules_.push_back(driver::module::create(driver_context_, &*ll_module)); - // add globals - for(auto x: tt_module.globals()) - global_ints_[x.first] = ((ir::metaparameter*)x.second)->get_value(); -} - -void jit::add_module(const char *name, const char *src, const std::vector ¶ms) { - auto ptt_module = make_triton_module(name, src); - add_module(*ptt_module, params); -} - -driver::kernel *jit::get_function(const char *name) { - return driver::kernel::create(modules_.front(), name); -} - -jit::launch_information jit::get_launch_info(const char *name) { - return launch_info_map_.at(name); -} - -unsigned jit::get_int(const char *name){ - return global_ints_.at(name); -} - -driver::buffer *jit::get_buffer(const char *name){ - driver::cu_module *mod = (driver::cu_module*)modules_.front(); - return mod->symbol(name); -} - -} From f33a1f3fe3dcaeb9af626cb2ae904cb18470dc0e Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 19 May 2019 01:31:08 -0400 Subject: [PATCH 151/494] [examples/pytorch] Fixed issues in backward pass of conv --- examples/cpp/conv.cpp | 4 +- examples/python/pytorch/bench.py | 117 +++++++++++++++++ examples/python/pytorch/conv.cpp | 39 +++--- examples/python/pytorch/main.py | 50 ------- examples/python/pytorch/test.py | 22 ++++ examples/python/pytorch/triton.py | 46 +++++++ include/triton/runtime/jit.h | 116 +++++++++++++++++ lib/dnn/conv.cpp | 9 +- lib/runtime/jit.cpp | 209 ++++++++++++++++++++++++++++++ 9 files changed, 541 insertions(+), 71 deletions(-) create mode 100644 examples/python/pytorch/bench.py delete mode 100644 examples/python/pytorch/main.py create mode 100644 examples/python/pytorch/test.py create mode 100644 examples/python/pytorch/triton.py create mode 100644 include/triton/runtime/jit.h create mode 100644 lib/runtime/jit.cpp diff --git a/examples/cpp/conv.cpp b/examples/cpp/conv.cpp index 93f42b94e..70555fd0e 100644 --- a/examples/cpp/conv.cpp +++ b/examples/cpp/conv.cpp @@ -10,12 +10,12 @@ int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); triton::jit jit(context); - triton::dnn::conv::type ty = triton::dnn::conv::FPROP; + triton::dnn::conv::type ty = triton::dnn::conv::BPROP; // initialization int32_t B = 4, NF = 32; int32_t D = 1, H = 56, W = 56; int32_t NC = 32, T = 1, R = 3, S = 3; - int32_t pad_d = 0, pad_h = 1, pad_w = 1; + int32_t pad_d = 0, pad_h = 0, pad_w = 0; triton::dnn::conv configuration(B, NC, D, H, W, T, R, S, NF, 1, 1, 1, pad_d, pad_h, pad_w, ty); // convolution configuration std::vector hc(configuration.c_size()); diff --git a/examples/python/pytorch/bench.py b/examples/python/pytorch/bench.py new file mode 100644 index 000000000..2c8c304b5 --- /dev/null +++ 
b/examples/python/pytorch/bench.py @@ -0,0 +1,117 @@ +import argparse +import triton +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torchvision import datasets, transforms + +torch.manual_seed(0) + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = nn.Conv2d(1, 20, (5,5)) + self.conv2 = nn.Conv2d(20, 50, (5,5)) + self.fc1 = nn.Linear(4*4*50, 500) + self.fc2 = nn.Linear(500, 10) + + def forward(self, x): + x = F.relu(self.conv1(x)) + x = F.max_pool2d(x, 2, 2) + x = F.relu(self.conv2(x)) + x = F.max_pool2d(x, 2, 2) + x = x.view(-1, 4*4*50) + x = F.relu(self.fc1(x)) + x = self.fc2(x) + return F.log_softmax(x, dim=1) + +def train(args, model, device, train_loader, optimizer, epoch): + model.train() + for batch_idx, (data, target) in enumerate(train_loader): + data, target = data.to(device), target.to(device) + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + if batch_idx % args.log_interval == 0: + print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + epoch, batch_idx * len(data), len(train_loader.dataset), + 100. * batch_idx / len(train_loader), loss.item())) + +def test(args, model, device, test_loader): + model.eval() + test_loss = 0 + correct = 0 + with torch.no_grad(): + for data, target in test_loader: + data, target = data.to(device), target.to(device) + output = model(data) + test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss + pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability + correct += pred.eq(target.view_as(pred)).sum().item() + + test_loss /= len(test_loader.dataset) + + print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( + test_loss, correct, len(test_loader.dataset), + 100. 
* correct / len(test_loader.dataset))) + +def main(): + # Training settings + parser = argparse.ArgumentParser(description='PyTorch MNIST Example') + parser.add_argument('--batch-size', type=int, default=64, metavar='N', + help='input batch size for training (default: 64)') + parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', + help='input batch size for testing (default: 1000)') + parser.add_argument('--epochs', type=int, default=10, metavar='N', + help='number of epochs to train (default: 10)') + parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + help='learning rate (default: 0.01)') + parser.add_argument('--momentum', type=float, default=0.5, metavar='M', + help='SGD momentum (default: 0.5)') + parser.add_argument('--no-cuda', action='store_true', default=False, + help='disables CUDA training') + parser.add_argument('--seed', type=int, default=1, metavar='S', + help='random seed (default: 1)') + parser.add_argument('--log-interval', type=int, default=10, metavar='N', + help='how many batches to wait before logging training status') + + parser.add_argument('--save-model', action='store_true', default=False, + help='For Saving the current Model') + args = parser.parse_args() + use_cuda = not args.no_cuda and torch.cuda.is_available() + + torch.manual_seed(args.seed) + + device = torch.device("cuda" if use_cuda else "cpu") + + kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} + train_loader = torch.utils.data.DataLoader( + datasets.MNIST('../data', train=True, download=True, + transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])), + batch_size=args.batch_size, shuffle=True, **kwargs) + test_loader = torch.utils.data.DataLoader( + datasets.MNIST('../data', train=False, transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])), + batch_size=args.test_batch_size, shuffle=True, **kwargs) + + + model = Net().to(device) + optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) + + for epoch in range(1, args.epochs + 1): + train(args, model, device, train_loader, optimizer, epoch) + test(args, model, device, test_loader) + + if (args.save_model): + torch.save(model.state_dict(),"mnist_cnn.pt") + +if __name__ == '__main__': + main() diff --git a/examples/python/pytorch/conv.cpp b/examples/python/pytorch/conv.cpp index 577d23ee0..a3002e28a 100644 --- a/examples/python/pytorch/conv.cpp +++ b/examples/python/pytorch/conv.cpp @@ -86,7 +86,8 @@ torch::Tensor conv_common( torch::Tensor conv_fprop( const torch::Tensor data, - const torch::Tensor weight) { + const torch::Tensor weight, + int64_t pad_h, int64_t pad_w) { // Check CHECK_INPUT(data); CHECK_INPUT(weight); @@ -104,7 +105,7 @@ torch::Tensor conv_fprop( const int32_t NF = weight.size(3); // Configuration const int32_t stride_d = 1, stride_h = 1, stride_w = 1; - const int32_t pad_d = 0, pad_h = 1, pad_w = 1; + const int32_t pad_d = 0; // Check AT_CHECK(Ci == Cf, "Number of channels in data and weights must match"); return conv_common(B, Ci, D, H, W, T, R, S, NF, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, triton::dnn::conv::FPROP, data, weight); @@ -112,7 +113,8 @@ torch::Tensor conv_fprop( torch::Tensor conv_bprop( const torch::Tensor derror, - const torch::Tensor weight){ + const torch::Tensor weight, + int64_t pad_h, int64_t pad_w){ // Check CHECK_INPUT(derror); CHECK_INPUT(weight); @@ -131,10 +133,12 @@ torch::Tensor conv_bprop( // Compute M, P, Q 
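 // (Editorial note, not part of the original patch: the corrected lines below
 // invert the forward-pass shape relation used throughout this series,
 //   P = (H*upsample_h - R + 1 + 2*pad_h + stride_h - 1) / stride_h,
 // to recover the input extent from the error tensor. Sanity check with
 // H = 8, R = 3, pad_h = 0, stride_h = upsample_h = 1: the forward pass gives
 // P = (8 - 3 + 1 + 0 + 0)/1 = 6, and the fixed expression recovers
 // H = (6*1 + 3 - 1 - 2*0 - 1 + 1)/1 = 8. The old, unparenthesized versions
 // divided only the trailing "1" by upsample_*.)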
const int32_t upsample_d = 1, upsample_h = 1, upsample_w = 1; const int32_t stride_d = 1, stride_h = 1, stride_w = 1; - const int32_t pad_d = 0, pad_h = 1, pad_w = 1; - const int32_t D = M*stride_d + T - 1 - 2*pad_d + stride_d - 1 / upsample_d; - const int32_t H = P*stride_d + R - 1 - 2*pad_h + stride_h - 1 / upsample_h; - const int32_t W = Q*stride_d + S - 1 - 2*pad_w + stride_w - 1 / upsample_w; + int32_t pad_d = 0; + const int32_t D = (M*stride_d + T - 1 - 2*pad_d - stride_d + 1) / upsample_d; + const int32_t H = (P*stride_d + R - 1 - 2*pad_h - stride_h + 1) / upsample_h; + const int32_t W = (Q*stride_d + S - 1 - 2*pad_w - stride_w + 1) / upsample_w; + + // Check AT_CHECK(Ki == Kw, "Number of channels in error and weights must match"); return conv_common(B, C, D, H, W, T, R, S, Kw, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, triton::dnn::conv::BPROP, derror, weight); @@ -142,17 +146,18 @@ torch::Tensor conv_bprop( torch::Tensor conv_wgrad( const torch::Tensor data, - const torch::Tensor derror + const torch::Tensor derror, + int64_t pad_h, int64_t pad_w ){ // Check CHECK_INPUT(data); CHECK_INPUT(derror); // Unpack data shapes - const int32_t Ba = derror.size(0); - const int32_t C = derror.size(1); + const int32_t Ba = data.size(0); + const int32_t C = data.size(1); const int32_t D = 1; - const int32_t H = derror.size(2); - const int32_t W = derror.size(3); + const int32_t H = data.size(2); + const int32_t W = data.size(3); // Unpack error shapes const int32_t Bb = derror.size(0); const int32_t K = derror.size(1); @@ -162,10 +167,12 @@ torch::Tensor conv_wgrad( // Compute M, P, Q const int32_t upsample_d = 1, upsample_h = 1, upsample_w = 1; const int32_t stride_d = 1, stride_h = 1, stride_w = 1; - const int32_t pad_d = 0, pad_h = 1, pad_w = 1; - const int32_t T = (D - M*stride_d + 1 + 2*pad_d - stride_d + 1)*upsample_d; - const int32_t R = (H - P*stride_h + 1 + 2*pad_h - stride_h + 1)*upsample_h; - const int32_t S = (W - Q*stride_w + 1 + 2*pad_w - stride_w + 1)*upsample_w; + const int32_t pad_d = 0; + const int32_t T = (D - M*stride_d + 1 + 2*pad_d + stride_d - 1)*upsample_d; + const int32_t R = (H - P*stride_h + 1 + 2*pad_h + stride_h - 1)*upsample_h; + const int32_t S = (W - Q*stride_w + 1 + 2*pad_w + stride_w - 1)*upsample_w; + + // Check AT_CHECK(Ba == Bb, "Number of channels in error and weights must match"); return conv_common(Ba, C, D, H, W, T, R, S, K, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, triton::dnn::conv::WGRAD, data, derror); diff --git a/examples/python/pytorch/main.py b/examples/python/pytorch/main.py deleted file mode 100644 index c4601fe0f..000000000 --- a/examples/python/pytorch/main.py +++ /dev/null @@ -1,50 +0,0 @@ -import torch -import time -torch.manual_seed(0) - -class TritonConv(torch.autograd.Function): - - @staticmethod - def forward(ctx, input, weight): - ctx.save_for_backward(input, weight) - output = torch.ops.triton.conv_fprop(input, weight) - return output - - @staticmethod - def backward(ctx, grad_output): - input, weight = ctx.saved_tensors - grad_input = grad_weight = None - if ctx.needs_input_grad[0]: - grad_input = torch.ops.triton.conv_bprop(grad_output, weight) - if ctx.needs_input_grad[1]: - grad_weight = torch.ops.triton.conv_wgrad(input, grad_output) - return grad_input, grad_weight - - -torch.ops.load_library("/home/philippe/Development/triton/build/examples/python/pytorch/libtorch_triton.so") - -x = torch.autograd.Variable(torch.randn(16, 64, 8, 8).cuda(), requires_grad=True) -w = torch.autograd.Variable(torch.randn(64, 3, 3, 
64).cuda(), requires_grad=True) -cuw = torch.autograd.Variable(w.permute(3,0,1,2).cuda(), requires_grad=True) -y_target = torch.autograd.Variable(torch.randn(16, 64, 8, 8).cuda(), requires_grad=True) - -def run(x, w, conv): - y = conv(x, w) - loss = (y - y_target).norm(2) - loss.backward() - return loss, y.clone(), x.grad.clone(), w.grad.clone() - -ttyloss, tty, ttdx, ttdw = run(x, w, TritonConv.apply) -x.grad.zero_() -w.grad.zero_() -culoss, cuy, cudx, cudw = run(x, cuw, lambda x, w: torch.nn.functional.conv2d(x, w, padding=1)) - - -print((tty - cuy).norm(2)) -print((ttdx - cudx).norm(2)) -print((ttdw.permute(3,0,1,2) - cudw).norm(2)) -#print(ttdx) -#print(cudx) -#print(ttdw) -#print(cudw) -#print((ttdw.permute(3,0,1,2) - cudw).norm(2)) diff --git a/examples/python/pytorch/test.py b/examples/python/pytorch/test.py new file mode 100644 index 000000000..5086f8a5d --- /dev/null +++ b/examples/python/pytorch/test.py @@ -0,0 +1,22 @@ +import torch +import triton + +x = torch.autograd.Variable(torch.randn(16, 64, 8, 8).cuda(), requires_grad=True) +w = torch.autograd.Variable(torch.randn(64, 3, 3, 64).cuda(), requires_grad=True) +cuw = torch.autograd.Variable(w.permute(3,0,1,2).cuda(), requires_grad=True) +y_target = torch.autograd.Variable(torch.randn(16, 64, 6, 6).cuda(), requires_grad=True) + +def run(x, w, conv): + y = conv(x, w) + loss = (y - y_target).norm(2) + loss.backward() + return loss, y.clone(), x.grad.clone(), w.grad.clone() + +ttyloss, tty, ttdx, ttdw = run(x, w, lambda x, w: triton.ConvFunction.apply(x, w, 0)) +x.grad.zero_() +w.grad.zero_() +culoss, cuy, cudx, cudw = run(x, cuw, lambda x, w: torch.nn.functional.conv2d(x, w, padding=0)) + +print((tty - cuy).norm(2)) +print((ttdx - cudx).norm(2)) +print((ttdw.permute(3,0,1,2) - cudw).norm(2)) diff --git a/examples/python/pytorch/triton.py b/examples/python/pytorch/triton.py new file mode 100644 index 000000000..57e17d515 --- /dev/null +++ b/examples/python/pytorch/triton.py @@ -0,0 +1,46 @@ +import torch +import math + +torch.ops.load_library("/home/philippe/Development/triton/build/examples/python/pytorch/libtorch_triton.so") + +class ConvFunction(torch.autograd.Function): + + @staticmethod + def forward(ctx, input, weight, padding): + ctx.save_for_backward(input, weight) + ctx.padding = padding + output = torch.ops.triton.conv_fprop(input, weight, padding, padding) + return output + + @staticmethod + def backward(ctx, grad_output): + input, weight = ctx.saved_tensors + padding = ctx.padding + grad_input = grad_weight = None + if ctx.needs_input_grad[0]: + grad_input = torch.ops.triton.conv_bprop(grad_output, weight, padding, padding) + if ctx.needs_input_grad[1]: + grad_weight = torch.ops.triton.conv_wgrad(input, grad_output, padding, padding) + return grad_input, grad_weight, None + + +class Conv2d(torch.nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, padding = 0): + super(Conv2d, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.padding = padding + self.weight = torch.nn.Parameter(torch.Tensor( + in_channels, kernel_size[0], kernel_size[1], out_channels)) + self.reset_parameters() + + def forward(self, input): + return ConvFunction.apply(input, self.weight, self.padding) + + def reset_parameters(self): + n = self.in_channels + for k in self.kernel_size: + n *= k + stdv = 1. 
/ math.sqrt(n) + self.weight.data.uniform_(-stdv, stdv) diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h new file mode 100644 index 000000000..a114cca8c --- /dev/null +++ b/include/triton/runtime/jit.h @@ -0,0 +1,116 @@ +#ifndef TDL_INCLUDE_JIT_H +#define TDL_INCLUDE_JIT_H + +#include +#include +#include "llvm/IR/LLVMContext.h" +#include "triton/ir/context.h" +#include "triton/ir/print.h" +#include "triton/driver/module.h" +#include "triton/driver/kernel.h" +#include "triton/codegen/selection.h" +#include "triton/codegen/tune.h" +#include "triton/codegen/optimize_dot.h" +#include "triton/codegen/optimize_cse.h" +#include "triton/codegen/optimize_trans.h" +#include "triton/codegen/shmem_allocation.h" +#include "triton/codegen/shmem_liveness.h" +#include "triton/codegen/shmem_info.h" +#include "triton/codegen/shmem_barriers.h" +#include "triton/codegen/target.h" +#include "triton/codegen/vectorize.h" +#include + +namespace llvm { + class Module; +} + +namespace triton { + +namespace codegen{ +class tune; +} + +namespace ir { +class module; +class context; +class metaparameter; +} + +class jit { +public: + struct launch_information{ + std::vector global_range_size; + unsigned num_threads; + }; + typedef std::function benchmark_t; + + struct passes_wrapper { + passes_wrapper(codegen::target* target) + : shmem_liveness(&shmem_info), + shmem_allocation(&shmem_liveness, &shmem_info), + shmem_barriers(&shmem_allocation, &shmem_info), + vectorize(&tune), + selection(&shmem_allocation, &tune, &shmem_info, target), + optimize_dot(&tune), + optimize_cse(), + optimize_trans(), + target_(target) { } + + void target_independent(ir::module &module) { + optimize_dot.run(module); + optimize_trans.run(module); + } + + void target_dependent(ir::module &module) { + if(target_->is_gpu()){ + shmem_info.run(module); + shmem_liveness.run(module); + shmem_allocation.run(); + shmem_barriers.run(module); + } + vectorize.run(module); + } + + codegen::tune tune; + codegen::shmem_info shmem_info; + codegen::shmem_liveness shmem_liveness; + codegen::shmem_allocation shmem_allocation; + codegen::shmem_barriers shmem_barriers; + codegen::vectorize vectorize; + codegen::selection selection; + codegen::optimize_dot optimize_dot; + codegen::optimize_cse optimize_cse; + codegen::optimize_trans optimize_trans; + codegen::target* target_; + }; + +private: + std::string compute_data_layout(bool is_64bit = true, bool use_short_pointers = true); + std::unique_ptr make_llvm_module(triton::ir::module &module, passes_wrapper &passes); + std::unique_ptr make_triton_module(const char* name, const char* src); + +public: + jit(driver::context* context); + ~jit(); + void autotune(const char* name, const char* src, benchmark_t benchmark); + void add_module(ir::module &module, const std::vector& params = {}); + void add_module(const char* name, const char* src, const std::vector& params = {}); + driver::kernel* get_function(const char* name); + launch_information get_launch_info(const char* name); + unsigned get_int(const char* name); + +private: + std::map modules_; + driver::context* driver_context_; + llvm::LLVMContext llvm_context_; + ir::context triton_context_; + std::map launch_info_map_; + std::map global_ints_; + std::shared_ptr target_; +}; + + +} + +#endif diff --git a/lib/dnn/conv.cpp b/lib/dnn/conv.cpp index 2c551241c..621f4f14a 100644 --- a/lib/dnn/conv.cpp +++ b/lib/dnn/conv.cpp @@ -24,10 +24,13 @@ conv::conv(int B, int NC, shapes_c_ = {NB_, NF_, CD_, CH_, CW_}; // swap a and c for bprop if(ty_ == 
BPROP){ - pad_d_ = (CD_ - AD_ + BD_ - 1) / 2; - pad_h_ = (CH_ - AH_ + BH_ - 1) / 2; - pad_w_ = (CW_ - AW_ + BW_ - 1) / 2; + std::swap(AD_, CD_); + std::swap(AH_, CH_); + std::swap(AW_, CW_); shapes_a_.swap(shapes_c_); + pad_d_ = (CD_*stride_d_ - AD_*upsample_d_ + BD_ - 1 - stride_d_ + 1)/2; + pad_h_ = (CH_*stride_h_ - AH_*upsample_h_ + BH_ - 1 - stride_h_ + 1)/2; + pad_w_ = (CW_*stride_w_ - AW_*upsample_w_ + BW_ - 1 - stride_w_ + 1)/2; } // swap b and c for wgrad if(ty_ == WGRAD){ diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp new file mode 100644 index 000000000..58d3aef73 --- /dev/null +++ b/lib/runtime/jit.cpp @@ -0,0 +1,209 @@ +#include +#include "triton/ast/ast.h" +#include "triton/codegen/target.h" +#include "triton/ir/context.h" +#include "triton/ir/context_impl.h" +#include "triton/driver/device.h" +#include "triton/driver/error.h" +#include "triton/runtime/jit.h" +#include "llvm/IR/IRPrintingPasses.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/Transforms/Scalar/EarlyCSE.h" +#include "llvm/Analysis/LoopPass.h" + +typedef struct yy_buffer_state * YY_BUFFER_STATE; +extern int yyparse(); +extern YY_BUFFER_STATE yy_scan_string(const char * str); +extern void yy_delete_buffer(YY_BUFFER_STATE buffer); +using triton::ast::translation_unit; +extern translation_unit *ast_root; + +namespace triton { + +void loop_nest(std::vector const & ranges, std::function const &)> const & f){ + size_t D = ranges.size(); + std::vector values(D, 0); + // Start with innermost loop + size_t i = D - 1; + while(true){ + //Execute function + f(values); + //Increment counters + while(values[i]++ == ranges[i] - 1){ + if(i == 0) + return; + values[i--] = 0; + } + i = D - 1; + } +} + +template +void loop_nest(std::vector> const & iterates, std::function)> const & f){ + //Ranges to iterate over + std::vector ranges; + for(auto const & x: iterates) + ranges.push_back(x.size()); + //Proxy function + auto proxy = [&](std::vector const & idx){ + std::vector x(iterates.size()); + for(size_t i = 0; i < x.size(); ++i) + x[i] = iterates[i][idx[i]]; + f(x); + }; + //Iterate + loop_nest(ranges, proxy); +} + + + + +std::unique_ptr jit::make_llvm_module(ir::module &module, passes_wrapper &passes) { + llvm::Module* result = new llvm::Module(module.get_name(), llvm_context_); + passes.selection.run(module, *result); + // launch information + launch_information& info = launch_info_map_[result->getName()]; + info.global_range_size.clear(); + for(unsigned i = 0; i < passes.tune.get_num_global_range(); i++) + info.global_range_size.push_back(passes.tune.get_global_range_size(i)); + info.num_threads = passes.tune.get_num_threads(); + return std::unique_ptr(result); +} + +std::unique_ptr jit::make_triton_module(const char *name, const char *src) { + // create AST from Triton-C source + YY_BUFFER_STATE buffer = yy_scan_string(src); + yyparse(); + yy_delete_buffer(buffer); + translation_unit *program = ast_root; + // create Triton-IR from AST + ir::module* module = new ir::module(name, triton_context_); + program->codegen(module); + return std::unique_ptr(module); +} + + +jit::jit(driver::context *context): driver_context_(context), + target_(context->device()->make_target()) { } + 
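+// (Editorial sketch, not part of the original patch: the pipeline this file
+// implements is Triton-C source --yy_scan_string/yyparse--> AST
+// --codegen--> ir::module --passes_wrapper + selection--> llvm::Module
+// --driver::module::create--> GPU binary. autotune() below then uses
+// loop_nest() above to enumerate every metaparameter assignment; e.g. for
+// ranges {{2,4},{1,2}} it visits (2,1), (2,2), (4,1), (4,2).)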
+jit::~jit(){ } + +void jit::autotune(const char *name, const char *src, benchmark_t benchmark) { + // find metaparameters + auto ptt_module = make_triton_module(name, src); + ir::module &tt_module = *ptt_module; + // set parameters + passes_wrapper passes(target_.get()); + passes.target_independent(tt_module); + passes.tune.run(tt_module); + auto mps = passes.tune.get_params(tt_module); + // create parameter ranges + std::vector> ranges; + for(ir::metaparameter *mp: mps) + ranges.push_back(mp->get_space()); +// std::cout << ranges.size() << std::endl; + // iterate over parameters + unsigned i; + double best = 0; + loop_nest(ranges, [&](const std::vector params){ + std::map> errors; + i = 0; + for(ir::metaparameter *mp: mps) + mp->set_value(params[i++]); + passes.target_independent(tt_module); + passes.tune.init(tt_module); + if(!passes.tune.check_constraints(errors)) + return; + // Deep copy of the module and tuner + auto ptt_module = make_triton_module(name, src); + ir::module &tt_module = *ptt_module; + passes_wrapper passes(target_.get()); + passes.target_independent(tt_module); + passes.tune.run(tt_module); + i = 0; + for(ir::metaparameter* mp: passes.tune.get_params(tt_module)){ + mp->set_value(params[i++]); + } + passes.tune.init(tt_module); + passes.target_dependent(tt_module); + driver::device* device = driver_context_->device(); + if(passes.shmem_allocation.get_allocated_size() > device->max_shared_memory()) + return; + if(passes.tune.get_num_threads() > device->max_threads_per_block()) + return; + // Compile + auto ll_module = make_llvm_module(tt_module, passes); + std::unique_ptr module(driver::module::create(driver_context_, &*ll_module)); + std::unique_ptr kernel(driver::kernel::create(module.get(), name)); + launch_information info = launch_info_map_.at(name); + for(unsigned p: params) + std::cout << p << " " << std::flush; + // add globals + for(auto x: tt_module.globals()) + global_ints_[x.first] = ((ir::metaparameter*)x.second)->get_value(); + modules_.insert({name, module.get()}); + double perf; + perf = benchmark(kernel.get(), info); + best = std::max(perf, best); + std::cout << perf << " [ " << best << " ] " << std::endl; + modules_.erase(name); + }); +} + +void jit::add_module(ir::module &tt_module, const std::vector ¶ms) { + // set parameters + passes_wrapper passes(target_.get()); + passes.target_independent(tt_module); + passes.tune.run(tt_module); + unsigned i = 0; + for(ir::metaparameter* mp: passes.tune.get_params(tt_module)) + mp->set_value(params[i++]); + passes.tune.init(tt_module); + passes.target_dependent(tt_module); + // check constraints + std::map> errors; + passes.tune.check_constraints(errors); + for(auto x: errors){ + std::cout << x.first << std::endl; + for(auto str: x.second) + std::cout << str << std::endl; + } + if(errors.size()) + throw std::runtime_error("invalid parameters"); + // triton module -> llvm module + auto ll_module = make_llvm_module(tt_module, passes); + // llvm module -> machine code + std::string name = tt_module.get_name(); + modules_.insert({name, driver::module::create(driver_context_, &*ll_module)}); + // add globals + for(auto x: tt_module.globals()) + global_ints_[x.first] = ((ir::metaparameter*)x.second)->get_value(); +} + +void jit::add_module(const char *name, const char *src, const std::vector ¶ms) { + auto ptt_module = make_triton_module(name, src); + add_module(*ptt_module, params); +} + +driver::kernel *jit::get_function(const char *name) { + return driver::kernel::create(modules_.at(name), name); +} + 
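+// (Usage sketch, not part of the original patch; `src` and `params` are
+// assumed to hold a valid Triton-C kernel string and a metaparameter vector:
+//
+//   triton::jit jit(triton::driver::backend::contexts::get_default());
+//   jit.add_module("conv", src, params);          // compile with fixed params
+//   triton::driver::kernel* k = jit.get_function("conv");
+//   triton::jit::launch_information info = jit.get_launch_info("conv");
+//
+// grid and block sizes are then derived from info.global_range_size and
+// info.num_threads, as examples/cpp/conv.cpp does before stream->enqueue.)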
+jit::launch_information jit::get_launch_info(const char *name) { + return launch_info_map_.at(name); +} + +unsigned jit::get_int(const char *name){ + return global_ints_.at(name); +} + +} From e8f23bcade2ae049d51224094c30f54d07e80ed7 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 20 May 2019 12:20:29 -0400 Subject: [PATCH 152/494] [dnn/conv] Added bias and forward stride --- examples/cpp/conv.cpp | 8 +- examples/python/pytorch/bench.py | 223 +++++++++++++++++------------- examples/python/pytorch/conv.cpp | 51 ++++--- examples/python/pytorch/test.py | 10 +- examples/python/pytorch/triton.py | 60 ++++++-- include/triton/dnn/conv.h | 43 ++++-- lib/dnn/conv.cpp | 89 ++++++------ lib/driver/kernel.cpp | 18 ++- 8 files changed, 303 insertions(+), 199 deletions(-) diff --git a/examples/cpp/conv.cpp b/examples/cpp/conv.cpp index 70555fd0e..2bbec482a 100644 --- a/examples/cpp/conv.cpp +++ b/examples/cpp/conv.cpp @@ -10,13 +10,15 @@ int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); triton::jit jit(context); - triton::dnn::conv::type ty = triton::dnn::conv::BPROP; + triton::dnn::conv::type ty = triton::dnn::conv::FPROP; // initialization int32_t B = 4, NF = 32; int32_t D = 1, H = 56, W = 56; int32_t NC = 32, T = 1, R = 3, S = 3; int32_t pad_d = 0, pad_h = 0, pad_w = 0; - triton::dnn::conv configuration(B, NC, D, H, W, T, R, S, NF, 1, 1, 1, pad_d, pad_h, pad_w, ty); + int32_t stride_d = 1, stride_h = 1, stride_w = 1; + int32_t upsample_d = 1, upsample_h = 1, upsample_w = 1; + triton::dnn::conv configuration(B, NC, D, H, W, T, R, S, NF, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, upsample_d, upsample_h, upsample_w, ty); // convolution configuration std::vector hc(configuration.c_size()); std::vector rc(configuration.c_size()); @@ -47,7 +49,7 @@ int main() { std::array grid = configuration.get_grid(TM, TN); configuration.init(stream, (triton::driver::cu_module*)kernel->module()); stream->synchronize(); - configuration.set_arg(kernel, da, db, dc); + configuration.set_arg(kernel, da, db, dc, nullptr); stream->enqueue(kernel, grid, {nthreads, 1, 1}); stream->synchronize(); double ts = bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});}, diff --git a/examples/python/pytorch/bench.py b/examples/python/pytorch/bench.py index 2c8c304b5..98a782099 100644 --- a/examples/python/pytorch/bench.py +++ b/examples/python/pytorch/bench.py @@ -1,117 +1,142 @@ -import argparse -import triton +'''Train CIFAR10 with PyTorch.''' +from __future__ import print_function + import torch import torch.nn as nn -import torch.nn.functional as F import torch.optim as optim -from torchvision import datasets, transforms +import torch.nn.functional as F +import torch.backends.cudnn as cudnn -torch.manual_seed(0) - -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = nn.Conv2d(1, 20, (5,5)) - self.conv2 = nn.Conv2d(20, 50, (5,5)) - self.fc1 = nn.Linear(4*4*50, 500) - self.fc2 = nn.Linear(500, 10) +import torchvision +import torchvision.transforms as transforms - def forward(self, x): - x = F.relu(self.conv1(x)) - x = F.max_pool2d(x, 2, 2) - x = F.relu(self.conv2(x)) - x = F.max_pool2d(x, 2, 2) - x = x.view(-1, 4*4*50) - x = F.relu(self.fc1(x)) - x = self.fc2(x) - return F.log_softmax(x, dim=1) +import os +import argparse -def train(args, model, device, train_loader, optimizer, epoch): - model.train() - for batch_idx, (data, target) in enumerate(train_loader): - data, target = data.to(device), 
target.to(device) +from resnet import * +from utils import progress_bar + + +parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Training') +parser.add_argument('--lr', default=0.1, type=float, help='learning rate') +parser.add_argument('--resume', '-r', action='store_true', help='resume from checkpoint') +args = parser.parse_args() + +device = 'cuda' if torch.cuda.is_available() else 'cpu' +best_acc = 0 # best test accuracy +start_epoch = 0 # start from epoch 0 or last checkpoint epoch + +# Data +print('==> Preparing data..') +transform_train = transforms.Compose([ + transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), +]) + +transform_test = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), +]) + +trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train) +trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True, num_workers=2) + +testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test) +testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False, num_workers=2) + +classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck') + +# Model +print('==> Building model..') +# net = VGG('VGG19') +net = ResNet18() +# net = PreActResNet18() +# net = GoogLeNet() +# net = DenseNet121() +# net = ResNeXt29_2x64d() +# net = MobileNet() +# net = MobileNetV2() +# net = DPN92() +# net = ShuffleNetG2() +# net = SENet18() +#net = ShuffleNetV2(1) +net = net.to(device) +if device == 'cuda': + net = torch.nn.DataParallel(net) + cudnn.benchmark = True + +if args.resume: + # Load checkpoint. + print('==> Resuming from checkpoint..') + assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!' + checkpoint = torch.load('./checkpoint/ckpt.t7') + net.load_state_dict(checkpoint['net']) + best_acc = checkpoint['acc'] + start_epoch = checkpoint['epoch'] + +criterion = nn.CrossEntropyLoss() +optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4) + +# Training +def train(epoch): + print('\nEpoch: %d' % epoch) + net.train() + train_loss = 0 + correct = 0 + total = 0 + for batch_idx, (inputs, targets) in enumerate(trainloader): + inputs, targets = inputs.to(device), targets.to(device) optimizer.zero_grad() - output = model(data) - loss = F.nll_loss(output, target) + outputs = net(inputs) + loss = criterion(outputs, targets) loss.backward() optimizer.step() - if batch_idx % args.log_interval == 0: - print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( - epoch, batch_idx * len(data), len(train_loader.dataset), - 100. 
* batch_idx / len(train_loader), loss.item())) -def test(args, model, device, test_loader): - model.eval() + train_loss += loss.item() + _, predicted = outputs.max(1) + total += targets.size(0) + correct += predicted.eq(targets).sum().item() + + progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' + % (train_loss/(batch_idx+1), 100.*correct/total, correct, total)) + +def test(epoch): + global best_acc + net.eval() test_loss = 0 correct = 0 + total = 0 with torch.no_grad(): - for data, target in test_loader: - data, target = data.to(device), target.to(device) - output = model(data) - test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss - pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability - correct += pred.eq(target.view_as(pred)).sum().item() + for batch_idx, (inputs, targets) in enumerate(testloader): + inputs, targets = inputs.to(device), targets.to(device) + outputs = net(inputs) + loss = criterion(outputs, targets) - test_loss /= len(test_loader.dataset) + test_loss += loss.item() + _, predicted = outputs.max(1) + total += targets.size(0) + correct += predicted.eq(targets).sum().item() - print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( - test_loss, correct, len(test_loader.dataset), - 100. * correct / len(test_loader.dataset))) + progress_bar(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' + % (test_loss/(batch_idx+1), 100.*correct/total, correct, total)) -def main(): - # Training settings - parser = argparse.ArgumentParser(description='PyTorch MNIST Example') - parser.add_argument('--batch-size', type=int, default=64, metavar='N', - help='input batch size for training (default: 64)') - parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', - help='input batch size for testing (default: 1000)') - parser.add_argument('--epochs', type=int, default=10, metavar='N', - help='number of epochs to train (default: 10)') - parser.add_argument('--lr', type=float, default=0.01, metavar='LR', - help='learning rate (default: 0.01)') - parser.add_argument('--momentum', type=float, default=0.5, metavar='M', - help='SGD momentum (default: 0.5)') - parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables CUDA training') - parser.add_argument('--seed', type=int, default=1, metavar='S', - help='random seed (default: 1)') - parser.add_argument('--log-interval', type=int, default=10, metavar='N', - help='how many batches to wait before logging training status') - - parser.add_argument('--save-model', action='store_true', default=False, - help='For Saving the current Model') - args = parser.parse_args() - use_cuda = not args.no_cuda and torch.cuda.is_available() - - torch.manual_seed(args.seed) - - device = torch.device("cuda" if use_cuda else "cpu") - - kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} - train_loader = torch.utils.data.DataLoader( - datasets.MNIST('../data', train=True, download=True, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])), - batch_size=args.batch_size, shuffle=True, **kwargs) - test_loader = torch.utils.data.DataLoader( - datasets.MNIST('../data', train=False, transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])), - batch_size=args.test_batch_size, shuffle=True, **kwargs) + # Save checkpoint. 
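+    # (Editorial note, not part of the original patch: the checkpoint written
+    # below stores {'net': state_dict, 'acc': accuracy, 'epoch': epoch}, the
+    # same schema the --resume branch above loads from ./checkpoint/ckpt.t7,
+    # so an interrupted run restarts with best_acc and start_epoch intact.)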
+ acc = 100.*correct/total + if acc > best_acc: + print('Saving..') + state = { + 'net': net.state_dict(), + 'acc': acc, + 'epoch': epoch, + } + if not os.path.isdir('checkpoint'): + os.mkdir('checkpoint') + torch.save(state, './checkpoint/ckpt.t7') + best_acc = acc - model = Net().to(device) - optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) - - for epoch in range(1, args.epochs + 1): - train(args, model, device, train_loader, optimizer, epoch) - test(args, model, device, test_loader) - - if (args.save_model): - torch.save(model.state_dict(),"mnist_cnn.pt") - -if __name__ == '__main__': - main() +for epoch in range(start_epoch, start_epoch+200): + train(epoch) +test(epoch) diff --git a/examples/python/pytorch/conv.cpp b/examples/python/pytorch/conv.cpp index a3002e28a..4ed9785ed 100644 --- a/examples/python/pytorch/conv.cpp +++ b/examples/python/pytorch/conv.cpp @@ -14,7 +14,7 @@ typedef std::tuple conv_key_t; + triton::dnn::conv::type, bool> conv_key_t; static std::map> m_stream; static std::map> m_jit; @@ -26,7 +26,7 @@ torch::Tensor conv_common( int32_t stride_d, int32_t stride_h, int32_t stride_w, int32_t pad_d, int32_t pad_h, int32_t pad_w, triton::dnn::conv::type ty, - torch::Tensor torcha, torch::Tensor torchb + torch::Tensor torcha, torch::Tensor torchb, torch::Tensor torchbias ) { // Wrap CUDA handles c10::DeviceIndex device = torcha.storage().device().index(); @@ -40,13 +40,16 @@ torch::Tensor conv_common( // Get context triton::driver::context* ctx = stream->context(); // Get configuration - conv_key_t key = {B, C, D, H, W, T, R, S, NF, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, ty}; + bool has_bias = torchbias.storage().size() > 0; + conv_key_t key = {B, C, D, H, W, T, R, S, NF, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, ty, has_bias}; triton::dnn::conv* configuration; if(m_config.find(key) == m_config.end()) configuration = m_config.emplace(key, new triton::dnn::conv( B, C, D, H, W, T, R, S, NF, stride_d, stride_h, stride_w, - pad_d, pad_h, pad_w, ty)).first->second.get(); + pad_d, pad_h, pad_w, + 1, 1, 1, + ty, has_bias)).first->second.get(); else configuration = m_config.at(key).get(); // Get JIT @@ -55,12 +58,16 @@ torch::Tensor conv_common( jit = m_jit.emplace(key, new triton::jit(ctx)).first->second.get(); std::string src = configuration->src(); jit->add_module("conv", src.c_str(), configuration->default_params()); + triton::driver::kernel* kernel = jit->get_function("conv"); + configuration->init(stream, (triton::driver::cu_module*)kernel->module()); } else jit = m_jit.at(key).get(); // Get memory triton::driver::cu_buffer a(ctx, (CUdeviceptr)torcha.storage().data(), false); triton::driver::cu_buffer b(ctx, (CUdeviceptr)torchb.storage().data(), false); + triton::driver::cu_buffer cubias(ctx, (CUdeviceptr)torchbias.storage().data(), false); + triton::driver::buffer* bias = has_bias ? 
&cubias : nullptr; // Allocate output std::vector c_shapes = configuration->c_shapes(); torch::Tensor torchc; @@ -76,10 +83,9 @@ torch::Tensor conv_common( unsigned TM = info.global_range_size[0]; unsigned TN = info.global_range_size[1]; // launch info - configuration->init(stream, (triton::driver::cu_module*)kernel->module()); unsigned nthreads = info.num_threads; std::array grid = configuration->get_grid(TM, TN); - configuration->set_arg(kernel, &a, &b, &c); + configuration->set_arg(kernel, &a, &b, &c, bias); stream->enqueue(kernel, grid, {nthreads, 1, 1}); return torchc; } @@ -87,6 +93,8 @@ torch::Tensor conv_common( torch::Tensor conv_fprop( const torch::Tensor data, const torch::Tensor weight, + const torch::Tensor bias, + int64_t stride_h, int64_t stride_w, int64_t pad_h, int64_t pad_w) { // Check CHECK_INPUT(data); @@ -104,16 +112,19 @@ torch::Tensor conv_fprop( const int32_t S = weight.size(2); const int32_t NF = weight.size(3); // Configuration - const int32_t stride_d = 1, stride_h = 1, stride_w = 1; + const int32_t stride_d = 1; const int32_t pad_d = 0; // Check AT_CHECK(Ci == Cf, "Number of channels in data and weights must match"); - return conv_common(B, Ci, D, H, W, T, R, S, NF, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, triton::dnn::conv::FPROP, data, weight); + return conv_common(B, Ci, D, H, W, T, R, S, NF, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, triton::dnn::conv::FPROP, data, weight, bias); } torch::Tensor conv_bprop( const torch::Tensor derror, const torch::Tensor weight, + const torch::Tensor bias, + int64_t H, int64_t W, + int64_t stride_h, int64_t stride_w, int64_t pad_h, int64_t pad_w){ // Check CHECK_INPUT(derror); @@ -131,22 +142,20 @@ torch::Tensor conv_bprop( const int32_t S = weight.size(2); const int32_t Kw = weight.size(3); // Compute M, P, Q - const int32_t upsample_d = 1, upsample_h = 1, upsample_w = 1; - const int32_t stride_d = 1, stride_h = 1, stride_w = 1; + const int32_t stride_d = 1; int32_t pad_d = 0; - const int32_t D = (M*stride_d + T - 1 - 2*pad_d - stride_d + 1) / upsample_d; - const int32_t H = (P*stride_d + R - 1 - 2*pad_h - stride_h + 1) / upsample_h; - const int32_t W = (Q*stride_d + S - 1 - 2*pad_w - stride_w + 1) / upsample_w; - - + int32_t D = 1; // Check AT_CHECK(Ki == Kw, "Number of channels in error and weights must match"); - return conv_common(B, C, D, H, W, T, R, S, Kw, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, triton::dnn::conv::BPROP, derror, weight); + return conv_common(B, C, D, H, W, T, R, S, Kw, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, triton::dnn::conv::BPROP, derror, weight, bias); } torch::Tensor conv_wgrad( const torch::Tensor data, const torch::Tensor derror, + const torch::Tensor bias, + int64_t R, int64_t S, + int64_t stride_h, int64_t stride_w, int64_t pad_h, int64_t pad_w ){ // Check @@ -166,16 +175,12 @@ torch::Tensor conv_wgrad( const int32_t Q = derror.size(3); // Compute M, P, Q const int32_t upsample_d = 1, upsample_h = 1, upsample_w = 1; - const int32_t stride_d = 1, stride_h = 1, stride_w = 1; + const int32_t stride_d = 1; const int32_t pad_d = 0; - const int32_t T = (D - M*stride_d + 1 + 2*pad_d + stride_d - 1)*upsample_d; - const int32_t R = (H - P*stride_h + 1 + 2*pad_h + stride_h - 1)*upsample_h; - const int32_t S = (W - Q*stride_w + 1 + 2*pad_w + stride_w - 1)*upsample_w; - - + const int32_t T = 1; // Check AT_CHECK(Ba == Bb, "Number of channels in error and weights must match"); - return conv_common(Ba, C, D, H, W, T, R, S, K, stride_d, stride_h, stride_w, pad_d, 
pad_h, pad_w, triton::dnn::conv::WGRAD, data, derror); + return conv_common(Ba, C, D, H, W, T, R, S, K, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, triton::dnn::conv::WGRAD, data, derror, bias); } static auto registry = diff --git a/examples/python/pytorch/test.py b/examples/python/pytorch/test.py index 5086f8a5d..787d6634b 100644 --- a/examples/python/pytorch/test.py +++ b/examples/python/pytorch/test.py @@ -2,6 +2,7 @@ import torch import triton x = torch.autograd.Variable(torch.randn(16, 64, 8, 8).cuda(), requires_grad=True) +bias = torch.autograd.Variable(torch.randn(64).cuda(), requires_grad=True) w = torch.autograd.Variable(torch.randn(64, 3, 3, 64).cuda(), requires_grad=True) cuw = torch.autograd.Variable(w.permute(3,0,1,2).cuda(), requires_grad=True) y_target = torch.autograd.Variable(torch.randn(16, 64, 6, 6).cuda(), requires_grad=True) @@ -10,13 +11,16 @@ def run(x, w, conv): y = conv(x, w) loss = (y - y_target).norm(2) loss.backward() - return loss, y.clone(), x.grad.clone(), w.grad.clone() + return loss, y.clone(), x.grad.clone(), w.grad.clone(), bias.grad.clone() -ttyloss, tty, ttdx, ttdw = run(x, w, lambda x, w: triton.ConvFunction.apply(x, w, 0)) +ttyloss, tty, ttdx, ttdw, ttbias = run(x, w, lambda x, w: triton.ConvFunction.apply(x, w, bias, (1,1), (0,0))) x.grad.zero_() w.grad.zero_() -culoss, cuy, cudx, cudw = run(x, cuw, lambda x, w: torch.nn.functional.conv2d(x, w, padding=0)) +bias.grad.zero_() +culoss, cuy, cudx, cudw, cubias = run(x, cuw, lambda x, w: torch.nn.functional.conv2d(x, w, bias=bias, stride=1, padding=0)) +print(ttdx[0,0,:,:], cudx[0,0,:,:]) print((tty - cuy).norm(2)) print((ttdx - cudx).norm(2)) print((ttdw.permute(3,0,1,2) - cudw).norm(2)) +print((ttbias - cubias).norm(2)) diff --git a/examples/python/pytorch/triton.py b/examples/python/pytorch/triton.py index 57e17d515..ec7c86695 100644 --- a/examples/python/pytorch/triton.py +++ b/examples/python/pytorch/triton.py @@ -1,4 +1,5 @@ import torch +from torch.nn.modules.utils import _single, _pair, _triple import math torch.ops.load_library("/home/philippe/Development/triton/build/examples/python/pytorch/libtorch_triton.so") @@ -6,37 +7,56 @@ torch.ops.load_library("/home/philippe/Development/triton/build/examples/python/ class ConvFunction(torch.autograd.Function): @staticmethod - def forward(ctx, input, weight, padding): - ctx.save_for_backward(input, weight) + def forward(ctx, input, weight, bias, stride, padding): + if bias is None: + bias = torch.empty(0) + ctx.save_for_backward(input, weight, bias) + ctx.stride = stride ctx.padding = padding - output = torch.ops.triton.conv_fprop(input, weight, padding, padding) + output = torch.ops.triton.conv_fprop(input, weight, bias, stride[0], stride[1], padding[0], padding[1]) return output @staticmethod def backward(ctx, grad_output): - input, weight = ctx.saved_tensors + input, weight, bias = ctx.saved_tensors + stride = ctx.stride padding = ctx.padding - grad_input = grad_weight = None + grad_input = grad_weight = grad_bias = None if ctx.needs_input_grad[0]: - grad_input = torch.ops.triton.conv_bprop(grad_output, weight, padding, padding) + grad_input = torch.ops.triton.conv_bprop(grad_output, weight, bias, input.shape[2], input.shape[3], stride[0], stride[1], padding[0], padding[1]) if ctx.needs_input_grad[1]: - grad_weight = torch.ops.triton.conv_wgrad(input, grad_output, padding, padding) - return grad_input, grad_weight, None + grad_weight = torch.ops.triton.conv_wgrad(input, grad_output, bias, weight.shape[1], weight.shape[2], stride[0], stride[1], 
padding[0], padding[1]) + if ctx.needs_input_grad[2]: + grad_bias = torch.sum(grad_output, (0, 2, 3)) + return grad_input, grad_weight, grad_bias, None, None -class Conv2d(torch.nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, padding = 0): - super(Conv2d, self).__init__() +class _ConvNd(torch.nn.Module): + + def __init__(self, in_channels, out_channels, kernel_size, stride, + padding, dilation, transposed, output_padding, groups, bias): + super(_ConvNd, self).__init__() + # not everything is supported by Triton + assert all(x==1 for x in stride) + assert all(x==1 for x in dilation) + assert transposed == False + assert all(x==0 for x in output_padding) + assert groups == 1 + # initialize self.in_channels = in_channels self.out_channels = out_channels self.kernel_size = kernel_size self.padding = padding self.weight = torch.nn.Parameter(torch.Tensor( in_channels, kernel_size[0], kernel_size[1], out_channels)) + if bias: + self.bias = torch.nn.Parameter(torch.Tensor(out_channels)) + else: + self.register_parameter('bias', None) self.reset_parameters() def forward(self, input): - return ConvFunction.apply(input, self.weight, self.padding) + return ConvFunction.apply(input, self.weight, self.bias, self.padding) def reset_parameters(self): n = self.in_channels @@ -44,3 +64,19 @@ class Conv2d(torch.nn.Module): n *= k stdv = 1. / math.sqrt(n) self.weight.data.uniform_(-stdv, stdv) + if self.bias is not None: + self.bias.data.uniform_(-stdv, stdv) + + + +class Conv2d(_ConvNd): + + def __init__(self, in_channels, out_channels, kernel_size, stride=1, + padding=0, dilation=1, groups=1, bias=True): + kernel_size = _pair(kernel_size) + stride = _pair(stride) + padding = _pair(padding) + dilation = _pair(dilation) + super(Conv2d, self).__init__( + in_channels, out_channels, kernel_size, stride, padding, dilation, + False, _pair(0), groups, bias) diff --git a/include/triton/dnn/conv.h b/include/triton/dnn/conv.h index d01007aa6..313065fc6 100644 --- a/include/triton/dnn/conv.h +++ b/include/triton/dnn/conv.h @@ -22,7 +22,8 @@ public: int T, int R, int S, int NF, int stride_d, int stride_h, int stride_w, int pad_d, int pad_h, int pad_w, - type ty = FPROP); + int upsample_d, int upsample_h, int upsample_w, + type ty = FPROP, bool bias = false); // accessors size_t a_size(); @@ -36,7 +37,8 @@ public: void init(driver::stream *stream, driver::cu_module *module); std::array get_grid(size_t TM, size_t TN); void set_arg(driver::kernel *kernel, - driver::buffer *a, driver::buffer *b, driver::buffer *c); + driver::buffer *a, driver::buffer *b, driver::buffer *c, + driver::buffer *bias); // utilities size_t get_nflops(); @@ -81,6 +83,7 @@ public: void conv(read_only restrict fp32 *a, read_only restrict fp32 *b, fp32 *c, + fp32 *bias, int32 M, int32 N, int32 K, int32 AH, int32 AW, int32 BH, int32 BW, @@ -88,7 +91,9 @@ public: int32 lda_n, int32 lda_c, int32 lda_d, int32 lda_h, int32 lda_w, int32 ldb_c, int32 ldb_t, int32 ldb_r, int32 ldb_s, int32 ldb_k, int32 ldc_n, int32 ldc_k, int32 ldc_m, int32 ldc_p, int32 ldc_q, - int32 pad_h, int32 pad_w)"; + int32 pad_h, int32 pad_w, + int32 stride_h, int32 stride_w, + int32 upsample_h, int32 upsample_w)"; if(!is_a_deltas_cst) res += ", int32* delta"; if(is_wgrad && !is_b_deltas_cst_) @@ -103,9 +108,11 @@ public: fp32 C[TM, TN] = 0; int32 ldlut = )" + std::to_string(Fs_) + R"(; int32 rabh[TM] = rxa / CW; - int32 raw[TM] = rxa % CW - pad_w; + int32 raw[TM] = rxa % CW; int32 rab[TM] = rabh / CH; - int32 rah[TM] = rabh % CH - pad_h; + int32 rah[TM] = 
rabh % CH; + raw = raw*stride_w - pad_w; + rah = rah*stride_h - pad_h; int32 ra0[TM] = rab*lda_n + rah*lda_h + raw*lda_w; int32 ra)" + ax[0] + ax[1] + "[TK] = rka / " + redax[2] + R"(; int32 ra)" + ax[2] + "[TK] = rka % " + redax[2] + R"(; @@ -173,7 +180,14 @@ public: int32 rcn[TM] = rxc / (CH*CW); int32 rcpq[TM] = rxc % (CH*CW); int32 rc0[TM] = rcn * ldc_n + rcpq * ldc_q; - fp32* pc[TM, TN] = c + rc1[newaxis, :]*ldc_k + rc0[:, newaxis]; + fp32* pc[TM, TN] = c + rc1[newaxis, :]*ldc_k + rc0[:, newaxis];)"; + if(bias_ && ty_==FPROP){ + res += R"( + fp32* pbias[TN] = bias + rc1; + fp32 bias[TN] = *pbias; + C = C + bias[newaxis, :];)"; + } + res += R"( int1 checkc0[TM] = rxc < M; int1 checkc1[TN] = rc1 < N; int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; @@ -208,18 +222,18 @@ private: int32_t CD_; int32_t CH_; int32_t CW_; - // upsampling - int32_t upsample_d_; - int32_t upsample_h_; - int32_t upsample_w_; - // padding - int32_t pad_d_; - int32_t pad_h_; - int32_t pad_w_; // striding int32_t stride_d_; int32_t stride_h_; int32_t stride_w_; + // padding + int32_t pad_d_; + int32_t pad_h_; + int32_t pad_w_; + // upsampling + int32_t upsample_d_; + int32_t upsample_h_; + int32_t upsample_w_; // equivalent matmul int32_t M_; int32_t N_; @@ -249,6 +263,7 @@ private: bool is_mask_cst_; // type type ty_; + bool bias_; bool b_trans_; bool b_lut_; }; diff --git a/lib/dnn/conv.cpp b/lib/dnn/conv.cpp index 621f4f14a..fc97acbba 100644 --- a/lib/dnn/conv.cpp +++ b/lib/dnn/conv.cpp @@ -8,16 +8,18 @@ conv::conv(int B, int NC, int T, int R, int S, int NF, int stride_d, int stride_h, int stride_w, int pad_d, int pad_h, int pad_w, - type ty) + int upsample_d, int upsample_h, int upsample_w, + type ty, bool bias) : NB_(B), NC_(NC), AD_(D), AH_(H), AW_(W), BD_(T), BH_(R), BW_(S), NF_(NF), stride_d_(stride_d), stride_h_(stride_h), stride_w_(stride_w), - upsample_d_(1), upsample_h_(1), upsample_w_(1), pad_d_(pad_d), pad_h_(pad_h), pad_w_(pad_w), - ty_(ty) + upsample_d_(upsample_d), upsample_h_(upsample_h), upsample_w_(upsample_w), + ty_(ty), bias_(bias) { CD_ = (AD_*upsample_d_ - BD_ + 1 + 2*pad_d_ + stride_d_ - 1)/stride_d_; CH_ = (AH_*upsample_h_ - BH_ + 1 + 2*pad_h_ + stride_h_ - 1)/stride_h_; CW_ = (AW_*upsample_w_ - BW_ + 1 + 2*pad_w_ + stride_w_ - 1)/stride_w_; + // shapes shapes_a_ = {NB_, NC_, AD_, AH_, AW_}; shapes_b_ = {NC_, BD_, BH_, BW_, NF_}; @@ -232,65 +234,70 @@ void conv::init(driver::stream *stream, triton::driver::cu_module* module) { } void conv::set_arg(driver::kernel *kernel, - driver::buffer *a, driver::buffer *b, driver::buffer *c) + driver::buffer *a, driver::buffer *b, driver::buffer *c, driver::buffer *bias) { kernel->setArg(0, a); kernel->setArg(1, b); kernel->setArg(2, c); - kernel->setArg(3, M_); - kernel->setArg(4, N_); - kernel->setArg(5, K_); - kernel->setArg(6, AH_); - kernel->setArg(7, AW_); - kernel->setArg(8, BH_); - kernel->setArg(9, BW_); - kernel->setArg(10, CH_); - kernel->setArg(11, CW_); + kernel->setArg(3, bias); + kernel->setArg(4, M_); + kernel->setArg(5, N_); + kernel->setArg(6, K_); + kernel->setArg(7, AH_); + kernel->setArg(8, AW_); + kernel->setArg(9, BH_); + kernel->setArg(10, BW_); + kernel->setArg(11, CH_); + kernel->setArg(12, CW_); // A arguments if(ty_ == WGRAD){ - kernel->setArg(12, ld_a_[1]); - kernel->setArg(13, ld_a_[0]); + kernel->setArg(13, ld_a_[1]); + kernel->setArg(14, ld_a_[0]); } else{ - kernel->setArg(12, ld_a_[0]); - kernel->setArg(13, ld_a_[1]); + kernel->setArg(13, ld_a_[0]); + kernel->setArg(14, ld_a_[1]); } - 
kernel->setArg(14, ld_a_[2]); - kernel->setArg(15, ld_a_[3]); - kernel->setArg(16, ld_a_[4]); + kernel->setArg(15, ld_a_[2]); + kernel->setArg(16, ld_a_[3]); + kernel->setArg(17, ld_a_[4]); // B arguments if(ty_ == WGRAD){ - kernel->setArg(17, ld_b_[0]); - kernel->setArg(18, ld_b_[2]); - kernel->setArg(19, ld_b_[3]); - kernel->setArg(20, ld_b_[4]); - kernel->setArg(21, ld_b_[1]); - } - else{ - kernel->setArg(17, ld_b_[0]); - kernel->setArg(18, ld_b_[1]); + kernel->setArg(18, ld_b_[0]); kernel->setArg(19, ld_b_[2]); kernel->setArg(20, ld_b_[3]); kernel->setArg(21, ld_b_[4]); + kernel->setArg(22, ld_b_[1]); + } + else{ + kernel->setArg(18, ld_b_[0]); + kernel->setArg(19, ld_b_[1]); + kernel->setArg(20, ld_b_[2]); + kernel->setArg(21, ld_b_[3]); + kernel->setArg(22, ld_b_[4]); } // C arguments if(ty_ == WGRAD){ - kernel->setArg(22, ld_c_[0]); - kernel->setArg(23, ld_c_[4]); + kernel->setArg(23, ld_c_[0]); + kernel->setArg(24, ld_c_[4]); + kernel->setArg(25, ld_c_[1]); + kernel->setArg(26, ld_c_[2]); + kernel->setArg(27, ld_c_[3]); + } + else{ + kernel->setArg(23, ld_c_[0]); kernel->setArg(24, ld_c_[1]); kernel->setArg(25, ld_c_[2]); kernel->setArg(26, ld_c_[3]); + kernel->setArg(27, ld_c_[4]); } - else{ - kernel->setArg(22, ld_c_[0]); - kernel->setArg(23, ld_c_[1]); - kernel->setArg(24, ld_c_[2]); - kernel->setArg(25, ld_c_[3]); - kernel->setArg(26, ld_c_[4]); - } - kernel->setArg(27, pad_h_); - kernel->setArg(28, pad_w_); - size_t idx = 29; + kernel->setArg(28, pad_h_); + kernel->setArg(29, pad_w_); + kernel->setArg(30, stride_h_); + kernel->setArg(31, stride_w_); + kernel->setArg(32, upsample_h_); + kernel->setArg(33, upsample_w_); + size_t idx = 34; if(!is_a_deltas_cst) kernel->setArg(idx++, d_a_deltas_); if(!is_b_deltas_cst_) diff --git a/lib/driver/kernel.cpp b/lib/driver/kernel.cpp index 81c797047..a16e3e6f9 100755 --- a/lib/driver/kernel.cpp +++ b/lib/driver/kernel.cpp @@ -81,7 +81,10 @@ void host_kernel::setArg(unsigned int index, std::size_t size, void* ptr){ } void host_kernel::setArg(unsigned int index, driver::buffer* buffer){ - kernel::setArg(index, (void*)buffer->hst()->data); + if(buffer) + kernel::setArg(index, (void*)buffer->hst()->data); + else + kernel::setArg(index, (std::ptrdiff_t)0); } const std::vector &host_kernel::params(){ @@ -106,7 +109,10 @@ void ocl_kernel::setArg(unsigned int index, std::size_t size, void* ptr) { } void ocl_kernel::setArg(unsigned int index, driver::buffer* buffer) { - check(dispatch::clSetKernelArg(*cl_, index, sizeof(cl_mem), (void*)&*buffer->cl())); + if(buffer) + check(dispatch::clSetKernelArg(*cl_, index, sizeof(cl_mem), (void*)&*buffer->cl())); + else + kernel::setArg(index, (std::ptrdiff_t)0); } @@ -130,8 +136,12 @@ void cu_kernel::setArg(unsigned int index, std::size_t size, void* ptr){ cu_params_[index] = cu_params_store_[index].get(); } -void cu_kernel::setArg(unsigned int index, driver::buffer* data) -{ return kernel::setArg(index, *data->cu());} +void cu_kernel::setArg(unsigned int index, driver::buffer* data){ + if(data) + kernel::setArg(index, *data->cu()); + else + kernel::setArg(index, (std::ptrdiff_t)0); +} void* const* cu_kernel::cu_params() const { return cu_params_.data(); } From 2672812ad01c32013c201daf1289982e3a8e2291 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 22 May 2019 15:25:43 -0400 Subject: [PATCH 153/494] [dnn/conv] No more divergent path in conv::set_arg --- include/triton/dnn/conv.h | 10 ++++++ lib/dnn/conv.cpp | 68 ++++++++++++++++++--------------------- 2 files changed, 41 insertions(+), 37 
deletions(-) diff --git a/include/triton/dnn/conv.h b/include/triton/dnn/conv.h index 313065fc6..af8861bb8 100644 --- a/include/triton/dnn/conv.h +++ b/include/triton/dnn/conv.h @@ -266,6 +266,16 @@ private: bool bias_; bool b_trans_; bool b_lut_; + // axis index + int32_t a_inner_idx_; + int32_t a_outer_idx_; + int32_t a_pix_idx_; + int32_t b_inner_idx_; + int32_t b_outer_idx_; + int32_t b_pix_idx_; + int32_t c_outer_0_idx_; + int32_t c_outer_1_idx_; + int32_t c_pix_idx; }; } diff --git a/lib/dnn/conv.cpp b/lib/dnn/conv.cpp index fc97acbba..2cdfecb63 100644 --- a/lib/dnn/conv.cpp +++ b/lib/dnn/conv.cpp @@ -19,11 +19,22 @@ conv::conv(int B, int NC, CD_ = (AD_*upsample_d_ - BD_ + 1 + 2*pad_d_ + stride_d_ - 1)/stride_d_; CH_ = (AH_*upsample_h_ - BH_ + 1 + 2*pad_h_ + stride_h_ - 1)/stride_h_; CW_ = (AW_*upsample_w_ - BW_ + 1 + 2*pad_w_ + stride_w_ - 1)/stride_w_; - // shapes shapes_a_ = {NB_, NC_, AD_, AH_, AW_}; shapes_b_ = {NC_, BD_, BH_, BW_, NF_}; shapes_c_ = {NB_, NF_, CD_, CH_, CW_}; + // a layout - NCHW + a_outer_idx_ = 0; + a_inner_idx_ = 1; + a_pix_idx_ = 2; + // b layout - CRSK + b_inner_idx_ = 0; + b_pix_idx_ = 1; + b_outer_idx_ = 4; + // c layout - NKPQ + c_outer_0_idx_ = 0; + c_outer_1_idx_ = 1; + c_pix_idx = 2; // swap a and c for bprop if(ty_ == BPROP){ std::swap(AD_, CD_); @@ -40,6 +51,10 @@ conv::conv(int B, int NC, std::swap(BD_, CD_); std::swap(BH_, CH_); std::swap(BW_, CW_); + std::swap(a_outer_idx_, a_inner_idx_); + std::swap(b_inner_idx_, c_outer_0_idx_); + std::swap(b_outer_idx_, c_outer_1_idx_); + std::swap(b_pix_idx_, c_pix_idx); } // leading dimensions auto set_ld = [](const std::vector& shapes, @@ -250,51 +265,30 @@ void conv::set_arg(driver::kernel *kernel, kernel->setArg(11, CH_); kernel->setArg(12, CW_); // A arguments - if(ty_ == WGRAD){ - kernel->setArg(13, ld_a_[1]); - kernel->setArg(14, ld_a_[0]); - } - else{ - kernel->setArg(13, ld_a_[0]); - kernel->setArg(14, ld_a_[1]); - } + kernel->setArg(13, ld_a_[a_outer_idx_]); + kernel->setArg(14, ld_a_[a_inner_idx_]); kernel->setArg(15, ld_a_[2]); kernel->setArg(16, ld_a_[3]); kernel->setArg(17, ld_a_[4]); // B arguments - if(ty_ == WGRAD){ - kernel->setArg(18, ld_b_[0]); - kernel->setArg(19, ld_b_[2]); - kernel->setArg(20, ld_b_[3]); - kernel->setArg(21, ld_b_[4]); - kernel->setArg(22, ld_b_[1]); - } - else{ - kernel->setArg(18, ld_b_[0]); - kernel->setArg(19, ld_b_[1]); - kernel->setArg(20, ld_b_[2]); - kernel->setArg(21, ld_b_[3]); - kernel->setArg(22, ld_b_[4]); - } + kernel->setArg(18, ld_b_[b_inner_idx_]); + kernel->setArg(19, ld_b_[b_pix_idx_]); + kernel->setArg(20, ld_b_[b_pix_idx_+1]); + kernel->setArg(21, ld_b_[b_pix_idx_+2]); + kernel->setArg(22, ld_b_[b_outer_idx_]); // C arguments - if(ty_ == WGRAD){ - kernel->setArg(23, ld_c_[0]); - kernel->setArg(24, ld_c_[4]); - kernel->setArg(25, ld_c_[1]); - kernel->setArg(26, ld_c_[2]); - kernel->setArg(27, ld_c_[3]); - } - else{ - kernel->setArg(23, ld_c_[0]); - kernel->setArg(24, ld_c_[1]); - kernel->setArg(25, ld_c_[2]); - kernel->setArg(26, ld_c_[3]); - kernel->setArg(27, ld_c_[4]); - } + kernel->setArg(23, ld_c_[c_outer_0_idx_]); + kernel->setArg(24, ld_c_[c_outer_1_idx_]); + kernel->setArg(25, ld_c_[c_pix_idx]); + kernel->setArg(26, ld_c_[c_pix_idx+1]); + kernel->setArg(27, ld_c_[c_pix_idx+2]); + // pad kernel->setArg(28, pad_h_); kernel->setArg(29, pad_w_); + // stride kernel->setArg(30, stride_h_); kernel->setArg(31, stride_w_); + // dilate kernel->setArg(32, upsample_h_); kernel->setArg(33, upsample_w_); size_t idx = 34; From 
f8291af7efda8ba26366aec936187a7fe6fba5af Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 22 May 2019 17:49:40 -0400 Subject: [PATCH 154/494] [dnn/conv] removed divergent paths in LUT computations --- include/triton/dnn/conv.h | 11 +++++------ lib/dnn/conv.cpp | 36 ++++++++++++++++++------------------ 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/include/triton/dnn/conv.h b/include/triton/dnn/conv.h index af8861bb8..05a3bd1ee 100644 --- a/include/triton/dnn/conv.h +++ b/include/triton/dnn/conv.h @@ -46,7 +46,6 @@ public: // source std::string src(){ - bool is_wgrad = ty_ == WGRAD; std::string BS = b_trans_ ? "[TN,TK]" : "[TK, TN]"; std::string bcb0 = b_trans_ ? "[:, newaxis]" : "[newaxis, :]"; std::string bcb1 = b_trans_ ? "[newaxis, :]" : "[:, newaxis]"; @@ -61,7 +60,7 @@ public: redax = {"C", "BH", "BW"}; else redax = {"BH", "BW", "N"}; - std::string inc_pb = is_wgrad ? "db[newaxis, :]" : "TK" + ldb0; + std::string inc_pb = b_lut_ ? "db[newaxis, :]" : "TK" + ldb0; std::string a_delta_mem = is_a_deltas_cst ? "__constant__" : ""; std::string b_delta_mem = is_b_deltas_cst_? "__constant__" : ""; std::string masks_mem = is_mask_cst_? "__constant__" : ""; @@ -74,7 +73,7 @@ public: )"; if(is_a_deltas_cst) res += "__constant__ int32* delta = alloc_const int32[" + std::to_string(h_a_deltas_.size()) + "];\n"; - if(is_wgrad && is_b_deltas_cst_) + if(b_lut_ && is_b_deltas_cst_) res += "__constant__ int32* b_delta = alloc_const int32[" + std::to_string(h_b_deltas_.size()) + "];\n"; if(is_mask_cst_) res += "__constant__ int32* masks = alloc_const int32[" + std::to_string(h_masks_.size()) + "];\n"; @@ -96,7 +95,7 @@ public: int32 upsample_h, int32 upsample_w)"; if(!is_a_deltas_cst) res += ", int32* delta"; - if(is_wgrad && !is_b_deltas_cst_) + if(b_lut_ && !is_b_deltas_cst_) res += ", int32* b_delta"; if(!is_mask_cst_) res += ", int32* masks"; @@ -122,7 +121,7 @@ public: ras = )" + flips + R"( ras; int32 ra1[TK] = rac*lda_c + rar*lda_h + ras*lda_w; fp32* pa[TM, TK] = a + ra1[newaxis, :] + ra0[:, newaxis];)"; - if(ty_ == WGRAD){ + if(b_lut_){ res += R"( int32 rbcr[TK] = rkb / BW; int32 rbs[TK] = rkb % BW; @@ -158,7 +157,7 @@ public: pb = pb + )" + inc_pb + R"(; b = *pb; pd = pd + incd;)"; - if(ty_ == WGRAD){ + if(b_lut_){ res += R"( pdb = pdb + TK; db = *pdb;)"; diff --git a/lib/dnn/conv.cpp b/lib/dnn/conv.cpp index 2cdfecb63..05b9211a5 100644 --- a/lib/dnn/conv.cpp +++ b/lib/dnn/conv.cpp @@ -73,12 +73,12 @@ conv::conv(int B, int NC, // equivalent matmul b_trans_ = ty_ != BPROP; b_lut_ = ty_ == WGRAD; - if(ty_ == WGRAD){ + if(ty_ == WGRAD) { M_ = shapes_c_[0]*shapes_c_[1]*shapes_c_[2]*shapes_c_[3]; N_ = shapes_c_[4]; K_ = shapes_b_[0]*shapes_b_[2]*shapes_b_[3]*shapes_b_[4]; } - else{ + else { M_ = shapes_c_[0]*shapes_c_[2]*shapes_c_[3]*shapes_c_[4]; N_ = shapes_c_[1]; K_ = shapes_b_[0]*shapes_b_[1]*shapes_b_[2]*shapes_b_[3]; @@ -120,14 +120,14 @@ void conv::build_deltas(){ if(b_lut_) h_b_deltas_.resize(Luts_); - auto unpack = [&](int32_t ltrs){ - int32_t l = (ty_ == BPROP) ? ltrs % NF_ : ltrs / (BD_*BH_*BW_); - int32_t trs = (ty_ == BPROP) ? ltrs / NF_ : ltrs % (BD_*BH_*BW_); + auto unpack = [&](int32_t ltrs) { + int32_t l = (!b_trans_) ? ltrs % NF_ : ltrs / (BD_*BH_*BW_); + int32_t trs = (!b_trans_) ? 
ltrs / NF_ : ltrs % (BD_*BH_*BW_); int32_t tr = trs / BW_; int32_t s = trs % BW_; int32_t t = tr / BH_; int32_t r = tr % BH_; - if(ty_ == BPROP){ + if(!b_trans_){ r = BH_ - 1 - r; s = BW_ - 1 - s; } @@ -143,7 +143,7 @@ void conv::build_deltas(){ size_t Ds3 = upsample_d_; for(size_t pd = 0; pd < Ds3; ++pd) for(size_t ph = 0; ph < Ds2; ++ph) - for(size_t pw = 0; pw < Ds1; ++pw){ + for(size_t pw = 0; pw < Ds1; ++pw) { int32_t* deltas_ptr = &h_a_deltas_[Luts_ + pw*Ds0 + ph*Ds0*Ds1 + pd*Ds0*Ds1*Ds2]; // cumulative increments for(size_t i = 0; i < Ds0; ++i) { @@ -161,21 +161,21 @@ void conv::build_deltas(){ int32_t rdiff = (nextr + ph)/upsample_h_ - (r + ph)/upsample_h_; int32_t sdiff = (nexts + pw)/upsample_w_ - (s + pw)/upsample_w_; // delta pointers - if(ty_ == WGRAD) - deltas_ptr[i] = cdiff*ld_a_[0] + tdiff*ld_a_[2] + rdiff*ld_a_[3] + sdiff*ld_a_[4]; - else - deltas_ptr[i] = cdiff*ld_a_[1] + tdiff*ld_a_[2] + rdiff*ld_a_[3] + sdiff*ld_a_[4]; + deltas_ptr[i] = cdiff*ld_a_[a_inner_idx_] + tdiff*ld_a_[a_pix_idx_] + rdiff*ld_a_[a_pix_idx_ + 1] + sdiff*ld_a_[a_pix_idx_ + 2]; } } - if(ty_ == WGRAD){ + if(b_lut_) { for(size_t i = 0; i < Ds0; ++i) { int32_t c, t, r, s; int32_t nextc, nextt, nextr, nexts; std::tie(c, t, r, s) = unpack(i); std::tie(nextc, nextt, nextr, nexts) = unpack(i + TK_); - int32_t cdiff = nextc - c, tdiff = nextt - t, rdiff = nextr - r, sdiff = nexts - s; - h_b_deltas_[i] = cdiff*ld_b_[0] + tdiff*ld_b_[2] + rdiff*ld_b_[3] + sdiff*ld_b_[4]; + int32_t cdiff = nextc - c; + int32_t tdiff = nextt - t; + int32_t rdiff = nextr - r; + int32_t sdiff = nexts - s; + h_b_deltas_[i] = cdiff*ld_b_[b_inner_idx_] + tdiff*ld_b_[b_pix_idx_] + rdiff*ld_b_[b_pix_idx_ + 1] + sdiff*ld_b_[b_pix_idx_ + 2]; } } } @@ -184,13 +184,13 @@ void conv::build_masks(){ h_masks_.resize(Luts_ + (2*pad_h_+1)*(2*pad_w_+1)*(2*pad_d_+1)*Luts_); auto unpack = [&](int32_t ltrs){ - int32_t l = (ty_ == BPROP) ? ltrs % NF_ : ltrs / (BD_*BH_*BW_); - int32_t trs = (ty_ == BPROP) ? ltrs / NF_ : ltrs % (BD_*BH_*BW_); + int32_t l = (!b_trans_) ? ltrs % NF_ : ltrs / (BD_*BH_*BW_); + int32_t trs = (!b_trans_) ? 
ltrs / NF_ : ltrs % (BD_*BH_*BW_); int32_t tr = trs / BW_; int32_t s = trs % BW_; int32_t t = tr / BH_; int32_t r = tr % BH_; - if(ty_ == BPROP){ + if(!b_trans_){ r = BH_ - 1 - r; s = BW_ - 1 - s; } @@ -338,7 +338,7 @@ void conv::cpu_xprop(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B) if(in_bounds) a = A[n*ld_a_[0] + ac*ld_a_[1] + ad*ld_a_[2] + ah*ld_a_[3] + aw*ld_a_[4]]; IN_DTYPE b; - if(ty_==FPROP) + if(b_trans_) b = B[ac*ld_b_[0] + bd*ld_b_[1] + bh*ld_b_[2] + bw*ld_b_[3] + cf*ld_b_[4]]; else{ int32_t bdd = shapes_b_[1] - 1 - bd; From 3f3eb1c2a45f9a64f64f8ad10ce36860f7cbf83b Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 22 May 2019 19:03:33 -0400 Subject: [PATCH 155/494] [dnn/conv] Added the option to have look-up table for filters for all operations --- examples/cpp/conv.cpp | 2 +- include/triton/dnn/conv.h | 18 +++++++++--------- lib/dnn/conv.cpp | 6 +++--- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/cpp/conv.cpp b/examples/cpp/conv.cpp index 2bbec482a..179fca421 100644 --- a/examples/cpp/conv.cpp +++ b/examples/cpp/conv.cpp @@ -14,7 +14,7 @@ int main() { // initialization int32_t B = 4, NF = 32; int32_t D = 1, H = 56, W = 56; - int32_t NC = 32, T = 1, R = 3, S = 3; + int32_t NC = 16, T = 1, R = 3, S = 3; int32_t pad_d = 0, pad_h = 0, pad_w = 0; int32_t stride_d = 1, stride_h = 1, stride_w = 1; int32_t upsample_d = 1, upsample_h = 1, upsample_w = 1; diff --git a/include/triton/dnn/conv.h b/include/triton/dnn/conv.h index 05a3bd1ee..1c9127466 100644 --- a/include/triton/dnn/conv.h +++ b/include/triton/dnn/conv.h @@ -60,7 +60,7 @@ public: redax = {"C", "BH", "BW"}; else redax = {"BH", "BW", "N"}; - std::string inc_pb = b_lut_ ? "db[newaxis, :]" : "TK" + ldb0; + std::string inc_pb = b_lut_ ? "db" + bcb1 : "TK" + ldb0; std::string a_delta_mem = is_a_deltas_cst ? "__constant__" : ""; std::string b_delta_mem = is_b_deltas_cst_? "__constant__" : ""; std::string masks_mem = is_mask_cst_? 
"__constant__" : ""; @@ -133,13 +133,13 @@ public: } else{ res += R"( - int32 rb1[TK] = rkb;)"; + int32 rb1[TK] = rkb)" + ldb0 + ";"; } res += R"( - fp32* pb)" + BS + " = b + rb1" + bcb1 + ldb0 + " + rb0" + bcb0 + ldb1 + R"(; + fp32* pb)" + BS + " = b + rb1" + bcb1 + " + rb0" + bcb0 + ldb1 + R"(; )" + a_delta_mem + R"( int32* pincd[TK] = delta + rka; - )" + a_delta_mem + R"( int32* pd[TK] = delta + ldlut + rka; - int32 d[TK] = *pd; + )" + a_delta_mem + R"( int32* pda[TK] = delta + ldlut + rka; + int32 da[TK] = *pda; int32 incd[TK] = *pincd; int32 maskh[TM] = pad_h + min(rah, 0) + max(rah + BH - AH, 0); int32 maskw[TM] = pad_w + min(raw, 0) + max(raw + BW - AW, 0); @@ -153,18 +153,18 @@ public: fp32 b)" + BS + R"( = *pb; for(int32 k = K; k > 0; k = k - TK){ C = dot(a, )" + useb + R"(, C); - pa = pa + d[newaxis, :]; + pa = pa + da[newaxis, :]; pb = pb + )" + inc_pb + R"(; b = *pb; - pd = pd + incd;)"; + pda = pda + incd;)"; if(b_lut_){ res += R"( - pdb = pdb + TK; + pdb = pdb + incd; db = *pdb;)"; } res += R"( pincd = pincd + incd; - d = *pd; + da = *pda; incd = *pincd; pm = pm + incm; pincm = pincm + incm; diff --git a/lib/dnn/conv.cpp b/lib/dnn/conv.cpp index 05b9211a5..889a37f00 100644 --- a/lib/dnn/conv.cpp +++ b/lib/dnn/conv.cpp @@ -301,12 +301,12 @@ void conv::set_arg(driver::kernel *kernel, } std::vector conv::default_params() { - if(ty_==FPROP) + if(b_lut_) + return {32, 2, 64, 32, 2, 64, 16, 8, 2, 2, 4, 2, 8}; + else if(ty_ == FPROP) return {16, 2, 64, 32, 2, 64, 16, 8, 2, 2, 8, 1, 8, 4}; else if(ty_ == BPROP) return {32, 2, 64, 32, 64, 32, 4, 2, 2, 4, 2, 8, 4, 2}; - else if(ty_ == WGRAD) - return {32, 2, 64, 32, 2, 64, 16, 8, 2, 2, 4, 2, 8}; } From e526ffc62bfcbb256b8206866345e7693bf7b40b Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 28 May 2019 11:21:21 -0400 Subject: [PATCH 156/494] [examples/pytorch] added a bunch of models for more thorough testing --- examples/python/pytorch/common.hpp | 75 ++++++++ examples/python/pytorch/main.py | 145 ++++++++++++++++ examples/python/pytorch/models/__init__.py | 14 ++ examples/python/pytorch/models/densenet.py | 107 ++++++++++++ examples/python/pytorch/models/dpn.py | 98 +++++++++++ examples/python/pytorch/models/googlenet.py | 107 ++++++++++++ examples/python/pytorch/models/lenet.py | 24 +++ examples/python/pytorch/models/mobilenet.py | 61 +++++++ examples/python/pytorch/models/mobilenetv2.py | 86 ++++++++++ examples/python/pytorch/models/pnasnet.py | 125 ++++++++++++++ .../python/pytorch/models/preact_resnet.py | 118 +++++++++++++ examples/python/pytorch/models/resnet.py | 121 +++++++++++++ examples/python/pytorch/models/resnext.py | 95 ++++++++++ examples/python/pytorch/models/senet.py | 121 +++++++++++++ examples/python/pytorch/models/shufflenet.py | 109 ++++++++++++ .../python/pytorch/models/shufflenetv2.py | 162 ++++++++++++++++++ examples/python/pytorch/models/vgg.py | 47 +++++ examples/python/pytorch/utils.py | 124 ++++++++++++++ 18 files changed, 1739 insertions(+) create mode 100644 examples/python/pytorch/common.hpp create mode 100644 examples/python/pytorch/main.py create mode 100644 examples/python/pytorch/models/__init__.py create mode 100644 examples/python/pytorch/models/densenet.py create mode 100644 examples/python/pytorch/models/dpn.py create mode 100644 examples/python/pytorch/models/googlenet.py create mode 100644 examples/python/pytorch/models/lenet.py create mode 100644 examples/python/pytorch/models/mobilenet.py create mode 100644 examples/python/pytorch/models/mobilenetv2.py create mode 100644 
examples/python/pytorch/models/pnasnet.py create mode 100644 examples/python/pytorch/models/preact_resnet.py create mode 100644 examples/python/pytorch/models/resnet.py create mode 100644 examples/python/pytorch/models/resnext.py create mode 100644 examples/python/pytorch/models/senet.py create mode 100644 examples/python/pytorch/models/shufflenet.py create mode 100644 examples/python/pytorch/models/shufflenetv2.py create mode 100644 examples/python/pytorch/models/vgg.py create mode 100644 examples/python/pytorch/utils.py diff --git a/examples/python/pytorch/common.hpp b/examples/python/pytorch/common.hpp new file mode 100644 index 000000000..f2ba2f83b --- /dev/null +++ b/examples/python/pytorch/common.hpp @@ -0,0 +1,75 @@ +#include <chrono> +#include <vector> +#include <algorithm> +#include "triton/driver/device.h" +#include <iostream> + +class timer{ + typedef std::chrono::high_resolution_clock high_resolution_clock; + typedef std::chrono::nanoseconds nanoseconds; + +public: + explicit timer(bool run = false) + { if (run) start(); } + + void start() + { _start = high_resolution_clock::now(); } + + nanoseconds get() const + { return std::chrono::duration_cast<nanoseconds>(high_resolution_clock::now() - _start); } + +private: + high_resolution_clock::time_point _start; +}; + +template<class T> +T min(std::vector<T> x) +{ return *std::min_element(x.begin(), x.end()); } + + +template<class OP, class SYNC> +double bench(OP const & op, SYNC const & sync, triton::driver::device const & device) +{ + timer tmr; + std::vector<double> times; + double total_time = 0; + op(); + sync(); + while(total_time*1e-9 < 1e-3){ + float norm = 1; + tmr.start(); + op(); + sync(); + times.push_back(norm*tmr.get().count()); + total_time+=times.back(); + } + return min(times); +} + +// helper function to print a tuple of any size +template<class Tuple, std::size_t N> +struct TuplePrinter { + static void print(const Tuple& t) + { + TuplePrinter<Tuple, N-1>::print(t); + std::cout << ", " << std::get<N-1>(t); + } +}; + +template<class Tuple> +struct TuplePrinter<Tuple, 1> { + static void print(const Tuple& t) + { + std::cout << std::get<0>(t); + } +}; + +template<class... Args> +void print(const std::tuple<Args...>& t) +{ + std::cout << "("; + TuplePrinter<decltype(t), sizeof...(Args)>::print(t); + std::cout << ")\n"; +} + + diff --git a/examples/python/pytorch/main.py b/examples/python/pytorch/main.py new file mode 100644 index 000000000..5b3de3790 --- /dev/null +++ b/examples/python/pytorch/main.py @@ -0,0 +1,145 @@ +'''Train CIFAR10 with PyTorch.''' +from __future__ import print_function + +import torch +import torch.nn as nn +import torch.optim as optim +import torch.nn.functional as F +import torch.backends.cudnn as cudnn + +import torchvision +import torchvision.transforms as transforms + +import os +import argparse +import numpy as np +import random + +from models import * +from utils import progress_bar + + +parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Training') +parser.add_argument('--lr', default=0.1, type=float, help='learning rate') +parser.add_argument('--resume', '-r', action='store_true', help='resume from checkpoint') +args = parser.parse_args() + +device = 'cuda' if torch.cuda.is_available() else 'cpu' +best_acc = 0 # best test accuracy +start_epoch = 0 # start from epoch 0 or last checkpoint epoch + +# Data +print('==> Preparing data..') +transform_train = transforms.Compose([ + transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), +]) + +transform_test = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
+]) + +trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train) +trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True, num_workers=2) + +testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test) +testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False, num_workers=2) + +classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck') + +# Model +print('==> Building model..') +net = LeNet() +# net = VGG('VGG19') +# net = ResNet18() +# net = PreActResNet18() +# net = GoogLeNet() +# net = DenseNet121() +# net = ResNeXt29_2x64d() +# net = MobileNet() +# net = MobileNetV2() +# net = DPN92() +# net = ShuffleNetG2() +# net = SENet18() +# net = ShuffleNetV2(1) +net = net.to(device) +if device == 'cuda': + net = torch.nn.DataParallel(net) + cudnn.benchmark = False + +if args.resume: + # Load checkpoint. + print('==> Resuming from checkpoint..') + assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!' + checkpoint = torch.load('./checkpoint/ckpt.t7') + net.load_state_dict(checkpoint['net']) + best_acc = checkpoint['acc'] + start_epoch = checkpoint['epoch'] + +criterion = nn.CrossEntropyLoss() +optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4) + +# Training +def train(epoch): + print('\nEpoch: %d' % epoch) + net.train() + train_loss = 0 + correct = 0 + total = 0 + for batch_idx, (inputs, targets) in enumerate(trainloader): + inputs, targets = inputs.to(device), targets.to(device) + optimizer.zero_grad() + outputs = net(inputs) + loss = criterion(outputs, targets) + loss.backward() + optimizer.step() + + train_loss += loss.item() + _, predicted = outputs.max(1) + total += targets.size(0) + correct += predicted.eq(targets).sum().item() + + progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' + % (train_loss/(batch_idx+1), 100.*correct/total, correct, total)) + +def test(epoch): + global best_acc + net.eval() + test_loss = 0 + correct = 0 + total = 0 + with torch.no_grad(): + for batch_idx, (inputs, targets) in enumerate(testloader): + inputs, targets = inputs.to(device), targets.to(device) + outputs = net(inputs) + loss = criterion(outputs, targets) + + test_loss += loss.item() + _, predicted = outputs.max(1) + total += targets.size(0) + correct += predicted.eq(targets).sum().item() + + progress_bar(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' + % (test_loss/(batch_idx+1), 100.*correct/total, correct, total)) + + # Save checkpoint. 
+ acc = 100.*correct/total + if acc > best_acc: + print('Saving..') + state = { + 'net': net.state_dict(), + 'acc': acc, + 'epoch': epoch, + } + if not os.path.isdir('checkpoint'): + os.mkdir('checkpoint') + torch.save(state, './checkpoint/ckpt.t7') + best_acc = acc + + +for epoch in range(start_epoch, start_epoch+200): + train(epoch) + test(epoch) diff --git a/examples/python/pytorch/models/__init__.py b/examples/python/pytorch/models/__init__.py new file mode 100644 index 000000000..877893903 --- /dev/null +++ b/examples/python/pytorch/models/__init__.py @@ -0,0 +1,14 @@ +from .vgg import * +from .dpn import * +from .lenet import * +from .senet import * +from .pnasnet import * +from .densenet import * +from .googlenet import * +from .shufflenet import * +from .shufflenetv2 import * +from .resnet import * +from .resnext import * +from .preact_resnet import * +from .mobilenet import * +from .mobilenetv2 import * diff --git a/examples/python/pytorch/models/densenet.py b/examples/python/pytorch/models/densenet.py new file mode 100644 index 000000000..47ebbbe08 --- /dev/null +++ b/examples/python/pytorch/models/densenet.py @@ -0,0 +1,107 @@ +'''DenseNet in PyTorch.''' +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Bottleneck(nn.Module): + def __init__(self, in_planes, growth_rate): + super(Bottleneck, self).__init__() + self.bn1 = nn.BatchNorm2d(in_planes) + self.conv1 = nn.Conv2d(in_planes, 4*growth_rate, kernel_size=1, bias=False) + self.bn2 = nn.BatchNorm2d(4*growth_rate) + self.conv2 = nn.Conv2d(4*growth_rate, growth_rate, kernel_size=3, padding=1, bias=False) + + def forward(self, x): + out = self.conv1(F.relu(self.bn1(x))) + out = self.conv2(F.relu(self.bn2(out))) + out = torch.cat([out,x], 1) + return out + + +class Transition(nn.Module): + def __init__(self, in_planes, out_planes): + super(Transition, self).__init__() + self.bn = nn.BatchNorm2d(in_planes) + self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=False) + + def forward(self, x): + out = self.conv(F.relu(self.bn(x))) + out = F.avg_pool2d(out, 2) + return out + + +class DenseNet(nn.Module): + def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10): + super(DenseNet, self).__init__() + self.growth_rate = growth_rate + + num_planes = 2*growth_rate + self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, padding=1, bias=False) + + self.dense1 = self._make_dense_layers(block, num_planes, nblocks[0]) + num_planes += nblocks[0]*growth_rate + out_planes = int(math.floor(num_planes*reduction)) + self.trans1 = Transition(num_planes, out_planes) + num_planes = out_planes + + self.dense2 = self._make_dense_layers(block, num_planes, nblocks[1]) + num_planes += nblocks[1]*growth_rate + out_planes = int(math.floor(num_planes*reduction)) + self.trans2 = Transition(num_planes, out_planes) + num_planes = out_planes + + self.dense3 = self._make_dense_layers(block, num_planes, nblocks[2]) + num_planes += nblocks[2]*growth_rate + out_planes = int(math.floor(num_planes*reduction)) + self.trans3 = Transition(num_planes, out_planes) + num_planes = out_planes + + self.dense4 = self._make_dense_layers(block, num_planes, nblocks[3]) + num_planes += nblocks[3]*growth_rate + + self.bn = nn.BatchNorm2d(num_planes) + self.linear = nn.Linear(num_planes, num_classes) + + def _make_dense_layers(self, block, in_planes, nblock): + layers = [] + for i in range(nblock): + layers.append(block(in_planes, self.growth_rate)) + in_planes += self.growth_rate + return 
nn.Sequential(*layers) + + def forward(self, x): + out = self.conv1(x) + out = self.trans1(self.dense1(out)) + out = self.trans2(self.dense2(out)) + out = self.trans3(self.dense3(out)) + out = self.dense4(out) + out = F.avg_pool2d(F.relu(self.bn(out)), 4) + out = out.view(out.size(0), -1) + out = self.linear(out) + return out + +def DenseNet121(): + return DenseNet(Bottleneck, [6,12,24,16], growth_rate=32) + +def DenseNet169(): + return DenseNet(Bottleneck, [6,12,32,32], growth_rate=32) + +def DenseNet201(): + return DenseNet(Bottleneck, [6,12,48,32], growth_rate=32) + +def DenseNet161(): + return DenseNet(Bottleneck, [6,12,36,24], growth_rate=48) + +def densenet_cifar(): + return DenseNet(Bottleneck, [6,12,24,16], growth_rate=12) + +def test(): + net = densenet_cifar() + x = torch.randn(1,3,32,32) + y = net(x) + print(y) + +# test() diff --git a/examples/python/pytorch/models/dpn.py b/examples/python/pytorch/models/dpn.py new file mode 100644 index 000000000..d334367fc --- /dev/null +++ b/examples/python/pytorch/models/dpn.py @@ -0,0 +1,98 @@ +'''Dual Path Networks in PyTorch.''' +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Bottleneck(nn.Module): + def __init__(self, last_planes, in_planes, out_planes, dense_depth, stride, first_layer): + super(Bottleneck, self).__init__() + self.out_planes = out_planes + self.dense_depth = dense_depth + + self.conv1 = nn.Conv2d(last_planes, in_planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(in_planes) + self.conv2 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=32, bias=False) + self.bn2 = nn.BatchNorm2d(in_planes) + self.conv3 = nn.Conv2d(in_planes, out_planes+dense_depth, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(out_planes+dense_depth) + + self.shortcut = nn.Sequential() + if first_layer: + self.shortcut = nn.Sequential( + nn.Conv2d(last_planes, out_planes+dense_depth, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(out_planes+dense_depth) + ) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = F.relu(self.bn2(self.conv2(out))) + out = self.bn3(self.conv3(out)) + x = self.shortcut(x) + d = self.out_planes + out = torch.cat([x[:,:d,:,:]+out[:,:d,:,:], x[:,d:,:,:], out[:,d:,:,:]], 1) + out = F.relu(out) + return out + + +class DPN(nn.Module): + def __init__(self, cfg): + super(DPN, self).__init__() + in_planes, out_planes = cfg['in_planes'], cfg['out_planes'] + num_blocks, dense_depth = cfg['num_blocks'], cfg['dense_depth'] + + self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.last_planes = 64 + self.layer1 = self._make_layer(in_planes[0], out_planes[0], num_blocks[0], dense_depth[0], stride=1) + self.layer2 = self._make_layer(in_planes[1], out_planes[1], num_blocks[1], dense_depth[1], stride=2) + self.layer3 = self._make_layer(in_planes[2], out_planes[2], num_blocks[2], dense_depth[2], stride=2) + self.layer4 = self._make_layer(in_planes[3], out_planes[3], num_blocks[3], dense_depth[3], stride=2) + self.linear = nn.Linear(out_planes[3]+(num_blocks[3]+1)*dense_depth[3], 10) + + def _make_layer(self, in_planes, out_planes, num_blocks, dense_depth, stride): + strides = [stride] + [1]*(num_blocks-1) + layers = [] + for i,stride in enumerate(strides): + layers.append(Bottleneck(self.last_planes, in_planes, out_planes, dense_depth, stride, i==0)) + self.last_planes = out_planes + (i+2) * dense_depth + return nn.Sequential(*layers) + + def forward(self, x): + out 
= F.relu(self.bn1(self.conv1(x))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = F.avg_pool2d(out, 4) + out = out.view(out.size(0), -1) + out = self.linear(out) + return out + + +def DPN26(): + cfg = { + 'in_planes': (96,192,384,768), + 'out_planes': (256,512,1024,2048), + 'num_blocks': (2,2,2,2), + 'dense_depth': (16,32,24,128) + } + return DPN(cfg) + +def DPN92(): + cfg = { + 'in_planes': (96,192,384,768), + 'out_planes': (256,512,1024,2048), + 'num_blocks': (3,4,20,3), + 'dense_depth': (16,32,24,128) + } + return DPN(cfg) + + +def test(): + net = DPN92() + x = torch.randn(1,3,32,32) + y = net(x) + print(y) + +# test() diff --git a/examples/python/pytorch/models/googlenet.py b/examples/python/pytorch/models/googlenet.py new file mode 100644 index 000000000..de036d87d --- /dev/null +++ b/examples/python/pytorch/models/googlenet.py @@ -0,0 +1,107 @@ +'''GoogLeNet with PyTorch.''' +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Inception(nn.Module): + def __init__(self, in_planes, n1x1, n3x3red, n3x3, n5x5red, n5x5, pool_planes): + super(Inception, self).__init__() + # 1x1 conv branch + self.b1 = nn.Sequential( + nn.Conv2d(in_planes, n1x1, kernel_size=1), + nn.BatchNorm2d(n1x1), + nn.ReLU(True), + ) + + # 1x1 conv -> 3x3 conv branch + self.b2 = nn.Sequential( + nn.Conv2d(in_planes, n3x3red, kernel_size=1), + nn.BatchNorm2d(n3x3red), + nn.ReLU(True), + nn.Conv2d(n3x3red, n3x3, kernel_size=3, padding=1), + nn.BatchNorm2d(n3x3), + nn.ReLU(True), + ) + + # 1x1 conv -> 5x5 conv branch + self.b3 = nn.Sequential( + nn.Conv2d(in_planes, n5x5red, kernel_size=1), + nn.BatchNorm2d(n5x5red), + nn.ReLU(True), + nn.Conv2d(n5x5red, n5x5, kernel_size=3, padding=1), + nn.BatchNorm2d(n5x5), + nn.ReLU(True), + nn.Conv2d(n5x5, n5x5, kernel_size=3, padding=1), + nn.BatchNorm2d(n5x5), + nn.ReLU(True), + ) + + # 3x3 pool -> 1x1 conv branch + self.b4 = nn.Sequential( + nn.MaxPool2d(3, stride=1, padding=1), + nn.Conv2d(in_planes, pool_planes, kernel_size=1), + nn.BatchNorm2d(pool_planes), + nn.ReLU(True), + ) + + def forward(self, x): + y1 = self.b1(x) + y2 = self.b2(x) + y3 = self.b3(x) + y4 = self.b4(x) + return torch.cat([y1,y2,y3,y4], 1) + + +class GoogLeNet(nn.Module): + def __init__(self): + super(GoogLeNet, self).__init__() + self.pre_layers = nn.Sequential( + nn.Conv2d(3, 192, kernel_size=3, padding=1), + nn.BatchNorm2d(192), + nn.ReLU(True), + ) + + self.a3 = Inception(192, 64, 96, 128, 16, 32, 32) + self.b3 = Inception(256, 128, 128, 192, 32, 96, 64) + + self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) + + self.a4 = Inception(480, 192, 96, 208, 16, 48, 64) + self.b4 = Inception(512, 160, 112, 224, 24, 64, 64) + self.c4 = Inception(512, 128, 128, 256, 24, 64, 64) + self.d4 = Inception(512, 112, 144, 288, 32, 64, 64) + self.e4 = Inception(528, 256, 160, 320, 32, 128, 128) + + self.a5 = Inception(832, 256, 160, 320, 32, 128, 128) + self.b5 = Inception(832, 384, 192, 384, 48, 128, 128) + + self.avgpool = nn.AvgPool2d(8, stride=1) + self.linear = nn.Linear(1024, 10) + + def forward(self, x): + out = self.pre_layers(x) + out = self.a3(out) + out = self.b3(out) + out = self.maxpool(out) + out = self.a4(out) + out = self.b4(out) + out = self.c4(out) + out = self.d4(out) + out = self.e4(out) + out = self.maxpool(out) + out = self.a5(out) + out = self.b5(out) + out = self.avgpool(out) + out = out.view(out.size(0), -1) + out = self.linear(out) + return out + + +def test(): + net = GoogLeNet() + x = torch.randn(1,3,32,32) 
+ y = net(x) + print(y.size()) + +# test() diff --git a/examples/python/pytorch/models/lenet.py b/examples/python/pytorch/models/lenet.py new file mode 100644 index 000000000..49c4e9572 --- /dev/null +++ b/examples/python/pytorch/models/lenet.py @@ -0,0 +1,24 @@ +'''LeNet in PyTorch.''' +import torch.nn as nn +import torch.nn.functional as F +import triton + +class LeNet(nn.Module): + def __init__(self): + super(LeNet, self).__init__() + self.conv1 = nn.Conv2d(3, 512, 3) + self.conv2 = triton.Conv2d(512, 512, 1) + self.fc1 = nn.Linear(512*7*7, 120) + self.fc2 = nn.Linear(120, 84) + self.fc3 = nn.Linear(84, 10) + + def forward(self, x): + out = F.relu(self.conv1(x)) + out = F.max_pool2d(out, 2) + out = F.relu(self.conv2(out)) + out = F.max_pool2d(out, 2) + out = out.view(out.size(0), -1) + out = F.relu(self.fc1(out)) + out = F.relu(self.fc2(out)) + out = self.fc3(out) + return out diff --git a/examples/python/pytorch/models/mobilenet.py b/examples/python/pytorch/models/mobilenet.py new file mode 100644 index 000000000..497ef1e86 --- /dev/null +++ b/examples/python/pytorch/models/mobilenet.py @@ -0,0 +1,61 @@ +'''MobileNet in PyTorch. + +See the paper "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" +for more details. +''' +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Block(nn.Module): + '''Depthwise conv + Pointwise conv''' + def __init__(self, in_planes, out_planes, stride=1): + super(Block, self).__init__() + self.conv1 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=in_planes, bias=False) + self.bn1 = nn.BatchNorm2d(in_planes) + self.conv2 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False) + self.bn2 = nn.BatchNorm2d(out_planes) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = F.relu(self.bn2(self.conv2(out))) + return out + + +class MobileNet(nn.Module): + # (128,2) means conv planes=128, conv stride=2, by default conv stride=1 + cfg = [64, (128,2), 128, (256,2), 256, (512,2), 512, 512, 512, 512, 512, (1024,2), 1024] + + def __init__(self, num_classes=10): + super(MobileNet, self).__init__() + self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(32) + self.layers = self._make_layers(in_planes=32) + self.linear = nn.Linear(1024, num_classes) + + def _make_layers(self, in_planes): + layers = [] + for x in self.cfg: + out_planes = x if isinstance(x, int) else x[0] + stride = 1 if isinstance(x, int) else x[1] + layers.append(Block(in_planes, out_planes, stride)) + in_planes = out_planes + return nn.Sequential(*layers) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.layers(out) + out = F.avg_pool2d(out, 2) + out = out.view(out.size(0), -1) + out = self.linear(out) + return out + + +def test(): + net = MobileNet() + x = torch.randn(1,3,32,32) + y = net(x) + print(y.size()) + +# test() diff --git a/examples/python/pytorch/models/mobilenetv2.py b/examples/python/pytorch/models/mobilenetv2.py new file mode 100644 index 000000000..17e5823ef --- /dev/null +++ b/examples/python/pytorch/models/mobilenetv2.py @@ -0,0 +1,86 @@ +'''MobileNetV2 in PyTorch. + +See the paper "Inverted Residuals and Linear Bottlenecks: +Mobile Networks for Classification, Detection and Segmentation" for more details. 
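MobileNet's Block above factors a dense 3x3 convolution into a depthwise 3x3 followed by a pointwise 1x1. A quick parameter-count comparison, standard arithmetic rather than anything taken from the patch:

    # Bias-free parameter counts for one 3x3 conv layer, C_in -> C_out.
    def dense_params(cin, cout, k=3):
        return cin * cout * k * k            # every filter sees every channel

    def separable_params(cin, cout, k=3):
        return cin * k * k + cin * cout      # depthwise k*k + pointwise 1x1

    print(dense_params(256, 256))       # 589824
    print(separable_params(256, 256))   # 67840, roughly 8.7x fewer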
+''' +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Block(nn.Module): + '''expand + depthwise + pointwise''' + def __init__(self, in_planes, out_planes, expansion, stride): + super(Block, self).__init__() + self.stride = stride + + planes = expansion * in_planes + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, stride=1, padding=0, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, groups=planes, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False) + self.bn3 = nn.BatchNorm2d(out_planes) + + self.shortcut = nn.Sequential() + if stride == 1 and in_planes != out_planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(out_planes), + ) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = F.relu(self.bn2(self.conv2(out))) + out = self.bn3(self.conv3(out)) + out = out + self.shortcut(x) if self.stride==1 else out + return out + + +class MobileNetV2(nn.Module): + # (expansion, out_planes, num_blocks, stride) + cfg = [(1, 16, 1, 1), + (6, 24, 2, 1), # NOTE: change stride 2 -> 1 for CIFAR10 + (6, 32, 3, 2), + (6, 64, 4, 2), + (6, 96, 3, 1), + (6, 160, 3, 2), + (6, 320, 1, 1)] + + def __init__(self, num_classes=10): + super(MobileNetV2, self).__init__() + # NOTE: change conv1 stride 2 -> 1 for CIFAR10 + self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(32) + self.layers = self._make_layers(in_planes=32) + self.conv2 = nn.Conv2d(320, 1280, kernel_size=1, stride=1, padding=0, bias=False) + self.bn2 = nn.BatchNorm2d(1280) + self.linear = nn.Linear(1280, num_classes) + + def _make_layers(self, in_planes): + layers = [] + for expansion, out_planes, num_blocks, stride in self.cfg: + strides = [stride] + [1]*(num_blocks-1) + for stride in strides: + layers.append(Block(in_planes, out_planes, expansion, stride)) + in_planes = out_planes + return nn.Sequential(*layers) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.layers(out) + out = F.relu(self.bn2(self.conv2(out))) + # NOTE: change pooling kernel_size 7 -> 4 for CIFAR10 + out = F.avg_pool2d(out, 4) + out = out.view(out.size(0), -1) + out = self.linear(out) + return out + + +def test(): + net = MobileNetV2() + x = torch.randn(2,3,32,32) + y = net(x) + print(y.size()) + +# test() diff --git a/examples/python/pytorch/models/pnasnet.py b/examples/python/pytorch/models/pnasnet.py new file mode 100644 index 000000000..de8c4d51f --- /dev/null +++ b/examples/python/pytorch/models/pnasnet.py @@ -0,0 +1,125 @@ +'''PNASNet in PyTorch. 
+ +Paper: Progressive Neural Architecture Search +''' +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class SepConv(nn.Module): + '''Separable Convolution.''' + def __init__(self, in_planes, out_planes, kernel_size, stride): + super(SepConv, self).__init__() + self.conv1 = nn.Conv2d(in_planes, out_planes, + kernel_size, stride, + padding=(kernel_size-1)//2, + bias=False, groups=in_planes) + self.bn1 = nn.BatchNorm2d(out_planes) + + def forward(self, x): + return self.bn1(self.conv1(x)) + + +class CellA(nn.Module): + def __init__(self, in_planes, out_planes, stride=1): + super(CellA, self).__init__() + self.stride = stride + self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride) + if stride==2: + self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False) + self.bn1 = nn.BatchNorm2d(out_planes) + + def forward(self, x): + y1 = self.sep_conv1(x) + y2 = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1) + if self.stride==2: + y2 = self.bn1(self.conv1(y2)) + return F.relu(y1+y2) + +class CellB(nn.Module): + def __init__(self, in_planes, out_planes, stride=1): + super(CellB, self).__init__() + self.stride = stride + # Left branch + self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride) + self.sep_conv2 = SepConv(in_planes, out_planes, kernel_size=3, stride=stride) + # Right branch + self.sep_conv3 = SepConv(in_planes, out_planes, kernel_size=5, stride=stride) + if stride==2: + self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False) + self.bn1 = nn.BatchNorm2d(out_planes) + # Reduce channels + self.conv2 = nn.Conv2d(2*out_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False) + self.bn2 = nn.BatchNorm2d(out_planes) + + def forward(self, x): + # Left branch + y1 = self.sep_conv1(x) + y2 = self.sep_conv2(x) + # Right branch + y3 = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1) + if self.stride==2: + y3 = self.bn1(self.conv1(y3)) + y4 = self.sep_conv3(x) + # Concat & reduce channels + b1 = F.relu(y1+y2) + b2 = F.relu(y3+y4) + y = torch.cat([b1,b2], 1) + return F.relu(self.bn2(self.conv2(y))) + +class PNASNet(nn.Module): + def __init__(self, cell_type, num_cells, num_planes): + super(PNASNet, self).__init__() + self.in_planes = num_planes + self.cell_type = cell_type + + self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(num_planes) + + self.layer1 = self._make_layer(num_planes, num_cells=6) + self.layer2 = self._downsample(num_planes*2) + self.layer3 = self._make_layer(num_planes*2, num_cells=6) + self.layer4 = self._downsample(num_planes*4) + self.layer5 = self._make_layer(num_planes*4, num_cells=6) + + self.linear = nn.Linear(num_planes*4, 10) + + def _make_layer(self, planes, num_cells): + layers = [] + for _ in range(num_cells): + layers.append(self.cell_type(self.in_planes, planes, stride=1)) + self.in_planes = planes + return nn.Sequential(*layers) + + def _downsample(self, planes): + layer = self.cell_type(self.in_planes, planes, stride=2) + self.in_planes = planes + return layer + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = self.layer5(out) + out = F.avg_pool2d(out, 8) + out = self.linear(out.view(out.size(0), -1)) + return out + + +def PNASNetA(): + return PNASNet(CellA, num_cells=6, num_planes=44) + +def PNASNetB(): + 
return PNASNet(CellB, num_cells=6, num_planes=32) + + +def test(): + net = PNASNetB() + x = torch.randn(1,3,32,32) + y = net(x) + print(y) + +# test() diff --git a/examples/python/pytorch/models/preact_resnet.py b/examples/python/pytorch/models/preact_resnet.py new file mode 100644 index 000000000..abb1bc313 --- /dev/null +++ b/examples/python/pytorch/models/preact_resnet.py @@ -0,0 +1,118 @@ +'''Pre-activation ResNet in PyTorch. + +Reference: +[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Identity Mappings in Deep Residual Networks. arXiv:1603.05027 +''' +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class PreActBlock(nn.Module): + '''Pre-activation version of the BasicBlock.''' + expansion = 1 + + def __init__(self, in_planes, planes, stride=1): + super(PreActBlock, self).__init__() + self.bn1 = nn.BatchNorm2d(in_planes) + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) + + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False) + ) + + def forward(self, x): + out = F.relu(self.bn1(x)) + shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x + out = self.conv1(out) + out = self.conv2(F.relu(self.bn2(out))) + out += shortcut + return out + + +class PreActBottleneck(nn.Module): + '''Pre-activation version of the original Bottleneck module.''' + expansion = 4 + + def __init__(self, in_planes, planes, stride=1): + super(PreActBottleneck, self).__init__() + self.bn1 = nn.BatchNorm2d(in_planes) + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes) + self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False) + + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False) + ) + + def forward(self, x): + out = F.relu(self.bn1(x)) + shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x + out = self.conv1(out) + out = self.conv2(F.relu(self.bn2(out))) + out = self.conv3(F.relu(self.bn3(out))) + out += shortcut + return out + + +class PreActResNet(nn.Module): + def __init__(self, block, num_blocks, num_classes=10): + super(PreActResNet, self).__init__() + self.in_planes = 64 + + self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) + self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) + self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) + self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) + self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) + self.linear = nn.Linear(512*block.expansion, num_classes) + + def _make_layer(self, block, planes, num_blocks, stride): + strides = [stride] + [1]*(num_blocks-1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + self.in_planes = planes * block.expansion + return nn.Sequential(*layers) + + def forward(self, x): + out = self.conv1(x) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = F.avg_pool2d(out, 4) + out = 
out.view(out.size(0), -1) + out = self.linear(out) + return out + + +def PreActResNet18(): + return PreActResNet(PreActBlock, [2,2,2,2]) + +def PreActResNet34(): + return PreActResNet(PreActBlock, [3,4,6,3]) + +def PreActResNet50(): + return PreActResNet(PreActBottleneck, [3,4,6,3]) + +def PreActResNet101(): + return PreActResNet(PreActBottleneck, [3,4,23,3]) + +def PreActResNet152(): + return PreActResNet(PreActBottleneck, [3,8,36,3]) + + +def test(): + net = PreActResNet18() + y = net((torch.randn(1,3,32,32))) + print(y.size()) + +# test() diff --git a/examples/python/pytorch/models/resnet.py b/examples/python/pytorch/models/resnet.py new file mode 100644 index 000000000..8fe334fd9 --- /dev/null +++ b/examples/python/pytorch/models/resnet.py @@ -0,0 +1,121 @@ +'''ResNet in PyTorch. + +For Pre-activation ResNet, see 'preact_resnet.py'. + +Reference: +[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Deep Residual Learning for Image Recognition. arXiv:1512.03385 +''' +import torch +import torch.nn as nn +import torch.nn.functional as F +import triton + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, in_planes, planes, stride=1): + super(BasicBlock, self).__init__() + self.conv1 = triton.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = triton.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + triton.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(self.expansion*planes) + ) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.bn2(self.conv2(out)) + out += self.shortcut(x) + out = F.relu(out) + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, in_planes, planes, stride=1): + super(Bottleneck, self).__init__() + self.conv1 = triton.Conv2d(in_planes, planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = triton.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.conv3 = triton.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(self.expansion*planes) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion*planes: + self.shortcut = nn.Sequential( + triton.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(self.expansion*planes) + ) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = F.relu(self.bn2(self.conv2(out))) + out = self.bn3(self.conv3(out)) + out += self.shortcut(x) + out = F.relu(out) + return out + + +class ResNet(nn.Module): + def __init__(self, block, num_blocks, num_classes=10): + super(ResNet, self).__init__() + self.in_planes = 64 + + self.conv1 = triton.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) + self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) + self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) + self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) + self.linear = nn.Linear(512*block.expansion, num_classes) + + def _make_layer(self, block, planes, num_blocks, stride): + strides 
= [stride] + [1]*(num_blocks-1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + self.in_planes = planes * block.expansion + return nn.Sequential(*layers) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = F.avg_pool2d(out, 4) + out = out.view(out.size(0), -1) + out = self.linear(out) + return out + + +def ResNet18(): + return ResNet(BasicBlock, [2,2,2,2]) + +def ResNet34(): + return ResNet(BasicBlock, [3,4,6,3]) + +def ResNet50(): + return ResNet(Bottleneck, [3,4,6,3]) + +def ResNet101(): + return ResNet(Bottleneck, [3,4,23,3]) + +def ResNet152(): + return ResNet(Bottleneck, [3,8,36,3]) + + +def test(): + net = ResNet18() + y = net(torch.randn(1,3,32,32)) + print(y.size()) + +# test() diff --git a/examples/python/pytorch/models/resnext.py b/examples/python/pytorch/models/resnext.py new file mode 100644 index 000000000..7a08f3e7d --- /dev/null +++ b/examples/python/pytorch/models/resnext.py @@ -0,0 +1,95 @@ +'''ResNeXt in PyTorch. + +See the paper "Aggregated Residual Transformations for Deep Neural Networks" for more details. +''' +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Block(nn.Module): + '''Grouped convolution block.''' + expansion = 2 + + def __init__(self, in_planes, cardinality=32, bottleneck_width=4, stride=1): + super(Block, self).__init__() + group_width = cardinality * bottleneck_width + self.conv1 = nn.Conv2d(in_planes, group_width, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(group_width) + self.conv2 = nn.Conv2d(group_width, group_width, kernel_size=3, stride=stride, padding=1, groups=cardinality, bias=False) + self.bn2 = nn.BatchNorm2d(group_width) + self.conv3 = nn.Conv2d(group_width, self.expansion*group_width, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(self.expansion*group_width) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion*group_width: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, self.expansion*group_width, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(self.expansion*group_width) + ) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = F.relu(self.bn2(self.conv2(out))) + out = self.bn3(self.conv3(out)) + out += self.shortcut(x) + out = F.relu(out) + return out + + +class ResNeXt(nn.Module): + def __init__(self, num_blocks, cardinality, bottleneck_width, num_classes=10): + super(ResNeXt, self).__init__() + self.cardinality = cardinality + self.bottleneck_width = bottleneck_width + self.in_planes = 64 + + self.conv1 = nn.Conv2d(3, 64, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.layer1 = self._make_layer(num_blocks[0], 1) + self.layer2 = self._make_layer(num_blocks[1], 2) + self.layer3 = self._make_layer(num_blocks[2], 2) + # self.layer4 = self._make_layer(num_blocks[3], 2) + self.linear = nn.Linear(cardinality*bottleneck_width*8, num_classes) + + def _make_layer(self, num_blocks, stride): + strides = [stride] + [1]*(num_blocks-1) + layers = [] + for stride in strides: + layers.append(Block(self.in_planes, self.cardinality, self.bottleneck_width, stride)) + self.in_planes = Block.expansion * self.cardinality * self.bottleneck_width + # Increase bottleneck_width by 2 after each stage. 
+ self.bottleneck_width *= 2 + return nn.Sequential(*layers) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + # out = self.layer4(out) + out = F.avg_pool2d(out, 8) + out = out.view(out.size(0), -1) + out = self.linear(out) + return out + + +def ResNeXt29_2x64d(): + return ResNeXt(num_blocks=[3,3,3], cardinality=2, bottleneck_width=64) + +def ResNeXt29_4x64d(): + return ResNeXt(num_blocks=[3,3,3], cardinality=4, bottleneck_width=64) + +def ResNeXt29_8x64d(): + return ResNeXt(num_blocks=[3,3,3], cardinality=8, bottleneck_width=64) + +def ResNeXt29_32x4d(): + return ResNeXt(num_blocks=[3,3,3], cardinality=32, bottleneck_width=4) + +def test_resnext(): + net = ResNeXt29_2x64d() + x = torch.randn(1,3,32,32) + y = net(x) + print(y.size()) + +# test_resnext() diff --git a/examples/python/pytorch/models/senet.py b/examples/python/pytorch/models/senet.py new file mode 100644 index 000000000..98bfa0ca5 --- /dev/null +++ b/examples/python/pytorch/models/senet.py @@ -0,0 +1,121 @@ +'''SENet in PyTorch. + +SENet is the winner of ImageNet-2017. The paper is not released yet. +''' +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class BasicBlock(nn.Module): + def __init__(self, in_planes, planes, stride=1): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(planes) + ) + + # SE layers + self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1) # Use nn.Conv2d instead of nn.Linear + self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.bn2(self.conv2(out)) + + # Squeeze + w = F.avg_pool2d(out, out.size(2)) + w = F.relu(self.fc1(w)) + w = F.sigmoid(self.fc2(w)) + # Excitation + out = out * w # New broadcasting feature from v0.2! 
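The squeeze step above collapses each HxW feature map to one scalar, the excitation path turns those scalars into per-channel gates, and the multiply relies on [N,C,1,1] broadcasting over [N,C,H,W]. A self-contained check of just that mechanism (the two 1x1 excitation convolutions are omitted here):

    import torch
    import torch.nn.functional as F

    x = torch.randn(2, 64, 8, 8)         # [N, C, H, W]
    w = F.avg_pool2d(x, x.size(2))       # squeeze -> [N, C, 1, 1]
    w = torch.sigmoid(w)                 # gate each channel into (0, 1)
    y = x * w                            # [N,C,1,1] broadcasts over H and W
    print(y.shape)                       # torch.Size([2, 64, 8, 8])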
+ + out += self.shortcut(x) + out = F.relu(out) + return out + + +class PreActBlock(nn.Module): + def __init__(self, in_planes, planes, stride=1): + super(PreActBlock, self).__init__() + self.bn1 = nn.BatchNorm2d(in_planes) + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) + + if stride != 1 or in_planes != planes: + self.shortcut = nn.Sequential( + nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False) + ) + + # SE layers + self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1) + self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1) + + def forward(self, x): + out = F.relu(self.bn1(x)) + shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x + out = self.conv1(out) + out = self.conv2(F.relu(self.bn2(out))) + + # Squeeze + w = F.avg_pool2d(out, out.size(2)) + w = F.relu(self.fc1(w)) + w = F.sigmoid(self.fc2(w)) + # Excitation + out = out * w + + out += shortcut + return out + + +class SENet(nn.Module): + def __init__(self, block, num_blocks, num_classes=10): + super(SENet, self).__init__() + self.in_planes = 64 + + self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) + self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) + self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) + self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) + self.linear = nn.Linear(512, num_classes) + + def _make_layer(self, block, planes, num_blocks, stride): + strides = [stride] + [1]*(num_blocks-1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + self.in_planes = planes + return nn.Sequential(*layers) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + out = F.avg_pool2d(out, 4) + out = out.view(out.size(0), -1) + out = self.linear(out) + return out + + +def SENet18(): + return SENet(PreActBlock, [2,2,2,2]) + + +def test(): + net = SENet18() + y = net(torch.randn(1,3,32,32)) + print(y.size()) + +# test() diff --git a/examples/python/pytorch/models/shufflenet.py b/examples/python/pytorch/models/shufflenet.py new file mode 100644 index 000000000..3682fd3b1 --- /dev/null +++ b/examples/python/pytorch/models/shufflenet.py @@ -0,0 +1,109 @@ +'''ShuffleNet in PyTorch. + +See the paper "ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices" for more details. 
+''' +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class ShuffleBlock(nn.Module): + def __init__(self, groups): + super(ShuffleBlock, self).__init__() + self.groups = groups + + def forward(self, x): + '''Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,W] -> [N,C,H,W]''' + N,C,H,W = x.size() + g = self.groups + return x.view(N,g,C//g,H,W).permute(0,2,1,3,4).contiguous().view(N,C,H,W) + + +class Bottleneck(nn.Module): + def __init__(self, in_planes, out_planes, stride, groups): + super(Bottleneck, self).__init__() + self.stride = stride + + mid_planes = out_planes//4 + g = 1 if in_planes==24 else groups + self.conv1 = nn.Conv2d(in_planes, mid_planes, kernel_size=1, groups=g, bias=False) + self.bn1 = nn.BatchNorm2d(mid_planes) + self.shuffle1 = ShuffleBlock(groups=g) + self.conv2 = nn.Conv2d(mid_planes, mid_planes, kernel_size=3, stride=stride, padding=1, groups=mid_planes, bias=False) + self.bn2 = nn.BatchNorm2d(mid_planes) + self.conv3 = nn.Conv2d(mid_planes, out_planes, kernel_size=1, groups=groups, bias=False) + self.bn3 = nn.BatchNorm2d(out_planes) + + self.shortcut = nn.Sequential() + if stride == 2: + self.shortcut = nn.Sequential(nn.AvgPool2d(3, stride=2, padding=1)) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.shuffle1(out) + out = F.relu(self.bn2(self.conv2(out))) + out = self.bn3(self.conv3(out)) + res = self.shortcut(x) + out = F.relu(torch.cat([out,res], 1)) if self.stride==2 else F.relu(out+res) + return out + + +class ShuffleNet(nn.Module): + def __init__(self, cfg): + super(ShuffleNet, self).__init__() + out_planes = cfg['out_planes'] + num_blocks = cfg['num_blocks'] + groups = cfg['groups'] + + self.conv1 = nn.Conv2d(3, 24, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(24) + self.in_planes = 24 + self.layer1 = self._make_layer(out_planes[0], num_blocks[0], groups) + self.layer2 = self._make_layer(out_planes[1], num_blocks[1], groups) + self.layer3 = self._make_layer(out_planes[2], num_blocks[2], groups) + self.linear = nn.Linear(out_planes[2], 10) + + def _make_layer(self, out_planes, num_blocks, groups): + layers = [] + for i in range(num_blocks): + stride = 2 if i == 0 else 1 + cat_planes = self.in_planes if i == 0 else 0 + layers.append(Bottleneck(self.in_planes, out_planes-cat_planes, stride=stride, groups=groups)) + self.in_planes = out_planes + return nn.Sequential(*layers) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = F.avg_pool2d(out, 4) + out = out.view(out.size(0), -1) + out = self.linear(out) + return out + + +def ShuffleNetG2(): + cfg = { + 'out_planes': [200,400,800], + 'num_blocks': [4,8,4], + 'groups': 2 + } + return ShuffleNet(cfg) + +def ShuffleNetG3(): + cfg = { + 'out_planes': [240,480,960], + 'num_blocks': [4,8,4], + 'groups': 3 + } + return ShuffleNet(cfg) + + +def test(): + net = ShuffleNetG2() + x = torch.randn(1,3,32,32) + y = net(x) + print(y) + +# test() diff --git a/examples/python/pytorch/models/shufflenetv2.py b/examples/python/pytorch/models/shufflenetv2.py new file mode 100644 index 000000000..d24c5dcbb --- /dev/null +++ b/examples/python/pytorch/models/shufflenetv2.py @@ -0,0 +1,162 @@ +'''ShuffleNetV2 in PyTorch. + +See the paper "ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" for more details.
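Both shuffle networks rely on the same view/permute trick, and the group count must divide the channel count exactly, hence the integer divisions above. A tiny end-to-end sketch of the interleaving:

    import torch

    N, C, H, W, g = 1, 6, 2, 2, 2
    x = torch.arange(N * C * H * W).view(N, C, H, W)
    # [N,C,H,W] -> [N,g,C//g,H,W] -> [N,C//g,g,H,W] -> [N,C,H,W]
    y = x.view(N, g, C // g, H, W).permute(0, 2, 1, 3, 4).contiguous().view(N, C, H, W)
    print(y[0, :, 0, 0])   # tensor([ 0, 12,  4, 16,  8, 20]): groups interleaved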
+''' +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class ShuffleBlock(nn.Module): + def __init__(self, groups=2): + super(ShuffleBlock, self).__init__() + self.groups = groups + + def forward(self, x): + '''Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,W] -> [N,C,H,W]''' + N, C, H, W = x.size() + g = self.groups + return x.view(N, g, C // g, H, W).permute(0, 2, 1, 3, 4).reshape(N, C, H, W) + + +class SplitBlock(nn.Module): + def __init__(self, ratio): + super(SplitBlock, self).__init__() + self.ratio = ratio + + def forward(self, x): + c = int(x.size(1) * self.ratio) + return x[:, :c, :, :], x[:, c:, :, :] + + +class BasicBlock(nn.Module): + def __init__(self, in_channels, split_ratio=0.5): + super(BasicBlock, self).__init__() + self.split = SplitBlock(split_ratio) + in_channels = int(in_channels * split_ratio) + self.conv1 = nn.Conv2d(in_channels, in_channels, + kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(in_channels) + self.conv2 = nn.Conv2d(in_channels, in_channels, + kernel_size=3, stride=1, padding=1, groups=in_channels, bias=False) + self.bn2 = nn.BatchNorm2d(in_channels) + self.conv3 = nn.Conv2d(in_channels, in_channels, + kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(in_channels) + self.shuffle = ShuffleBlock() + + def forward(self, x): + x1, x2 = self.split(x) + out = F.relu(self.bn1(self.conv1(x2))) + out = self.bn2(self.conv2(out)) + out = F.relu(self.bn3(self.conv3(out))) + out = torch.cat([x1, out], 1) + out = self.shuffle(out) + return out + + +class DownBlock(nn.Module): + def __init__(self, in_channels, out_channels): + super(DownBlock, self).__init__() + mid_channels = out_channels // 2 + # left + self.conv1 = nn.Conv2d(in_channels, in_channels, + kernel_size=3, stride=2, padding=1, groups=in_channels, bias=False) + self.bn1 = nn.BatchNorm2d(in_channels) + self.conv2 = nn.Conv2d(in_channels, mid_channels, + kernel_size=1, bias=False) + self.bn2 = nn.BatchNorm2d(mid_channels) + # right + self.conv3 = nn.Conv2d(in_channels, mid_channels, + kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(mid_channels) + self.conv4 = nn.Conv2d(mid_channels, mid_channels, + kernel_size=3, stride=2, padding=1, groups=mid_channels, bias=False) + self.bn4 = nn.BatchNorm2d(mid_channels) + self.conv5 = nn.Conv2d(mid_channels, mid_channels, + kernel_size=1, bias=False) + self.bn5 = nn.BatchNorm2d(mid_channels) + + self.shuffle = ShuffleBlock() + + def forward(self, x): + # left + out1 = self.bn1(self.conv1(x)) + out1 = F.relu(self.bn2(self.conv2(out1))) + # right + out2 = F.relu(self.bn3(self.conv3(x))) + out2 = self.bn4(self.conv4(out2)) + out2 = F.relu(self.bn5(self.conv5(out2))) + # concat + out = torch.cat([out1, out2], 1) + out = self.shuffle(out) + return out + + +class ShuffleNetV2(nn.Module): + def __init__(self, net_size): + super(ShuffleNetV2, self).__init__() + out_channels = configs[net_size]['out_channels'] + num_blocks = configs[net_size]['num_blocks'] + + self.conv1 = nn.Conv2d(3, 24, kernel_size=3, + stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(24) + self.in_channels = 24 + self.layer1 = self._make_layer(out_channels[0], num_blocks[0]) + self.layer2 = self._make_layer(out_channels[1], num_blocks[1]) + self.layer3 = self._make_layer(out_channels[2], num_blocks[2]) + self.conv2 = nn.Conv2d(out_channels[2], out_channels[3], + kernel_size=1, stride=1, padding=0, bias=False) + self.bn2 = nn.BatchNorm2d(out_channels[3]) + self.linear = nn.Linear(out_channels[3], 10) + + def _make_layer(self, out_channels,
num_blocks): + layers = [DownBlock(self.in_channels, out_channels)] + for i in range(num_blocks): + layers.append(BasicBlock(out_channels)) + self.in_channels = out_channels + return nn.Sequential(*layers) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + # out = F.max_pool2d(out, 3, stride=2, padding=1) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = F.relu(self.bn2(self.conv2(out))) + out = F.avg_pool2d(out, 4) + out = out.view(out.size(0), -1) + out = self.linear(out) + return out + + +configs = { + 0.5: { + 'out_channels': (48, 96, 192, 1024), + 'num_blocks': (3, 7, 3) + }, + + 1: { + 'out_channels': (116, 232, 464, 1024), + 'num_blocks': (3, 7, 3) + }, + 1.5: { + 'out_channels': (176, 352, 704, 1024), + 'num_blocks': (3, 7, 3) + }, + 2: { + 'out_channels': (224, 488, 976, 2048), + 'num_blocks': (3, 7, 3) + } +} + + +def test(): + net = ShuffleNetV2(net_size=0.5) + x = torch.randn(3, 3, 32, 32) + y = net(x) + print(y.shape) + + +# test() diff --git a/examples/python/pytorch/models/vgg.py b/examples/python/pytorch/models/vgg.py new file mode 100644 index 000000000..cb2b3a3ae --- /dev/null +++ b/examples/python/pytorch/models/vgg.py @@ -0,0 +1,47 @@ +'''VGG11/13/16/19 in Pytorch.''' +import torch +import torch.nn as nn +import triton + +cfg = { + 'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], + 'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], + 'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'], + 'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'], +} + + +class VGG(nn.Module): + def __init__(self, vgg_name): + super(VGG, self).__init__() + self.features = self._make_layers(cfg[vgg_name]) + self.classifier = nn.Linear(512, 10) + + def forward(self, x): + out = self.features(x) + out = out.view(out.size(0), -1) + out = self.classifier(out) + return out + + def _make_layers(self, cfg): + layers = [] + in_channels = 3 + for x in cfg: + if x == 'M': + layers += [nn.MaxPool2d(kernel_size=2, stride=2)] + else: + layers += [triton.Conv2d(in_channels, x, kernel_size=3, padding=1), + nn.BatchNorm2d(x), + nn.ReLU(inplace=True)] + in_channels = x + layers += [nn.AvgPool2d(kernel_size=1, stride=1)] + return nn.Sequential(*layers) + + +def test(): + net = VGG('VGG11') + x = torch.randn(2,3,32,32) + y = net(x) + print(y.size()) + +# test() diff --git a/examples/python/pytorch/utils.py b/examples/python/pytorch/utils.py new file mode 100644 index 000000000..4c9b3f90c --- /dev/null +++ b/examples/python/pytorch/utils.py @@ -0,0 +1,124 @@ +'''Some helper functions for PyTorch, including: + - get_mean_and_std: calculate the mean and std value of dataset. + - msr_init: net parameter initialization. + - progress_bar: progress bar mimic xlua.progress. 
+'''
+import os
+import sys
+import time
+import math
+
+import torch
+import torch.nn as nn
+import torch.nn.init as init
+
+
+def get_mean_and_std(dataset):
+    '''Compute the mean and std value of dataset.'''
+    dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True, num_workers=2)
+    mean = torch.zeros(3)
+    std = torch.zeros(3)
+    print('==> Computing mean and std..')
+    for inputs, targets in dataloader:
+        for i in range(3):
+            mean[i] += inputs[:,i,:,:].mean()
+            std[i] += inputs[:,i,:,:].std()
+    mean.div_(len(dataset))
+    std.div_(len(dataset))
+    return mean, std
+
+def init_params(net):
+    '''Init layer parameters.'''
+    for m in net.modules():
+        if isinstance(m, nn.Conv2d):
+            init.kaiming_normal(m.weight, mode='fan_out')
+            if m.bias is not None:
+                init.constant(m.bias, 0)
+        elif isinstance(m, nn.BatchNorm2d):
+            init.constant(m.weight, 1)
+            init.constant(m.bias, 0)
+        elif isinstance(m, nn.Linear):
+            init.normal(m.weight, std=1e-3)
+            if m.bias is not None:
+                init.constant(m.bias, 0)
+
+
+_, term_width = os.popen('stty size', 'r').read().split()
+term_width = int(term_width)
+
+TOTAL_BAR_LENGTH = 65.
+last_time = time.time()
+begin_time = last_time
+def progress_bar(current, total, msg=None):
+    global last_time, begin_time
+    if current == 0:
+        begin_time = time.time()  # Reset for new bar.
+
+    cur_len = int(TOTAL_BAR_LENGTH*current/total)
+    rest_len = int(TOTAL_BAR_LENGTH - cur_len) - 1
+
+    sys.stdout.write(' [')
+    for i in range(cur_len):
+        sys.stdout.write('=')
+    sys.stdout.write('>')
+    for i in range(rest_len):
+        sys.stdout.write('.')
+    sys.stdout.write(']')
+
+    cur_time = time.time()
+    step_time = cur_time - last_time
+    last_time = cur_time
+    tot_time = cur_time - begin_time
+
+    L = []
+    L.append('  Step: %s' % format_time(step_time))
+    L.append(' | Tot: %s' % format_time(tot_time))
+    if msg:
+        L.append(' | ' + msg)
+
+    msg = ''.join(L)
+    sys.stdout.write(msg)
+    for i in range(term_width-int(TOTAL_BAR_LENGTH)-len(msg)-3):
+        sys.stdout.write(' ')
+
+    # Go back to the center of the bar.
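+    # The cursor now sits at the end of the padded line; stepping back
+    # term_width - TOTAL_BAR_LENGTH/2 + 2 columns (rough arithmetic) parks it
+    # near the middle of the bar, so the ' current/total ' counter written
+    # below overwrites the padding in place before the line is rewound.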
+    for i in range(term_width-int(TOTAL_BAR_LENGTH/2)+2):
+        sys.stdout.write('\b')
+    sys.stdout.write(' %d/%d ' % (current+1, total))
+
+    if current < total-1:
+        sys.stdout.write('\r')
+    else:
+        sys.stdout.write('\n')
+    sys.stdout.flush()
+
+def format_time(seconds):
+    days = int(seconds / 3600/24)
+    seconds = seconds - days*3600*24
+    hours = int(seconds / 3600)
+    seconds = seconds - hours*3600
+    minutes = int(seconds / 60)
+    seconds = seconds - minutes*60
+    secondsf = int(seconds)
+    seconds = seconds - secondsf
+    millis = int(seconds*1000)
+
+    f = ''
+    i = 1
+    if days > 0:
+        f += str(days) + 'D'
+        i += 1
+    if hours > 0 and i <= 2:
+        f += str(hours) + 'h'
+        i += 1
+    if minutes > 0 and i <= 2:
+        f += str(minutes) + 'm'
+        i += 1
+    if secondsf > 0 and i <= 2:
+        f += str(secondsf) + 's'
+        i += 1
+    if millis > 0 and i <= 2:
+        f += str(millis) + 'ms'
+        i += 1
+    if f == '':
+        f = '0ms'
+    return f

From a9d078c06f660f9d6aa353080206d69725528a00 Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Tue, 28 May 2019 14:02:27 -0400
Subject: [PATCH 157/494] [triton/dnn/conv] merged optimizations branch

- Added forward/backward support for strided convolution
- Added support for bias
- Added support for reduction splitting
---
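A note on reduction splitting: with the new GZ tunable set above 1, the K
dimension of the underlying GEMM is cut into GZ chunks; each chunk
accumulates a partial product into C, and the partials are merged under the
spin-lock/counter protocol visible in the matmul source removed from
include/triton/dnn/gemm.h below. A minimal Python sketch of the chunk
arithmetic, assuming the balanced split in which the first K % GZ chunks
take one extra element (mirroring the offk formula in that source):

    def split_k(K, GZ):
        # (offset, size) of each reduction chunk, matching
        # offk = rz < rem ? rz*(div+1) : rz*div + rem
        div, rem = divmod(K, GZ)
        return [(rz * (div + 1) if rz < rem else rz * div + rem,
                 div + 1 if rz < rem else div)
                for rz in range(GZ)]

    # split_k(10, 4) -> [(0, 3), (3, 3), (6, 2), (8, 2)]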
 cmake/FindTorch.cmake                         |    5 +-
 examples/cpp/common.hpp                       |  242 +-
 examples/cpp/conv.cpp                         |   33 +-
 examples/cpp/dot.cpp                          |    5 +-
 examples/cpp/shift.cpp                        |    5 +-
 examples/python/pytorch/conv.cpp              |   71 +-
 examples/python/pytorch/test.py               |   18 +-
 examples/python/pytorch/triton.py             |    7 +-
 include/triton/dnn/conv.h                     |  167 +-
 include/triton/dnn/gemm.h                     |  127 +-
 include/triton/driver/buffer.h                |    4 +-
 include/triton/driver/dispatch.h              |   73 -
 include/triton/external/CUDA/builtin_types.h  |   64 -
 .../triton/external/CUDA/channel_descriptor.h |  412 -
 .../triton/external/CUDA/crt/host_config.h    |  266 -
 .../triton/external/CUDA/crt/host_defines.h   |  216 -
 include/triton/external/CUDA/cuComplex.h      |  338 -
 include/triton/external/CUDA/cublas.h         |  565 --
 include/triton/external/CUDA/cublas_api.h     | 2977 -------
 include/triton/external/CUDA/cublas_v2.h      |  274 -
 .../external/CUDA/cuda_device_runtime_api.h   |  248 -
 include/triton/external/CUDA/cuda_fp16.h      | 1969 -----
 include/triton/external/CUDA/cuda_fp16.hpp    | 1797 ----
 include/triton/external/CUDA/cuda_runtime.h   | 2040 -----
 .../triton/external/CUDA/cuda_runtime_api.h   | 7422 -----------------
 include/triton/external/CUDA/cudnn.h          | 1805 ----
 include/triton/external/CUDA/cusparse.h       | 6257 --------------
 include/triton/external/CUDA/device_types.h   |   69 -
 .../triton/external/CUDA/driver_functions.h   |  145 -
 include/triton/external/CUDA/driver_types.h   | 1610 ----
 include/triton/external/CUDA/host_config.h    |   50 -
 include/triton/external/CUDA/host_defines.h   |   50 -
 include/triton/external/CUDA/library_types.h  |   80 -
 include/triton/external/CUDA/nvrtc.h          |  525 --
 include/triton/external/CUDA/surface_types.h  |  119 -
 include/triton/external/CUDA/texture_types.h  |  217 -
 .../triton/external/CUDA/vector_functions.h   |  177 -
 .../triton/external/CUDA/vector_functions.hpp |  318 -
 include/triton/external/CUDA/vector_types.h   |  425 -
 include/triton/runtime/jit.h                  |    7 +-
 include/triton/tools/bench.hpp                |   50 +
 lib/dnn/conv.cpp                              |  516 +-
 lib/dnn/gemm.cpp                              |  137 +
 lib/driver/buffer.cpp                         |    4 +-
 lib/driver/dispatch.cpp                       |  114 -
 lib/driver/error.cpp                          |   39 -
 lib/runtime/jit.cpp                           |   12 +-
 47 files changed, 732 insertions(+), 31339 deletions(-)
 delete mode 100755 include/triton/external/CUDA/builtin_types.h
 delete mode 100755 include/triton/external/CUDA/channel_descriptor.h
 delete mode 100644 include/triton/external/CUDA/crt/host_config.h
 delete mode 100644 include/triton/external/CUDA/crt/host_defines.h
 delete mode 100755 include/triton/external/CUDA/cuComplex.h
 delete mode 100755 include/triton/external/CUDA/cublas.h
 delete mode 100755 include/triton/external/CUDA/cublas_api.h
 delete mode 100644 include/triton/external/CUDA/cublas_v2.h
 delete mode 100755 include/triton/external/CUDA/cuda_device_runtime_api.h
 delete mode 100755 include/triton/external/CUDA/cuda_fp16.h
 delete mode 100755 include/triton/external/CUDA/cuda_fp16.hpp
 delete mode 100755 include/triton/external/CUDA/cuda_runtime.h
 delete mode 100755 include/triton/external/CUDA/cuda_runtime_api.h
 delete mode 100755 include/triton/external/CUDA/cudnn.h
 delete mode 100644 include/triton/external/CUDA/cusparse.h
 delete mode 100755 include/triton/external/CUDA/device_types.h
 delete mode 100755 include/triton/external/CUDA/driver_functions.h
 delete mode 100755 include/triton/external/CUDA/driver_types.h
 delete mode 100755 include/triton/external/CUDA/host_config.h
 delete mode 100755 include/triton/external/CUDA/host_defines.h
 delete mode 100755 include/triton/external/CUDA/library_types.h
 delete mode 100755 include/triton/external/CUDA/nvrtc.h
 delete mode 100755 include/triton/external/CUDA/surface_types.h
 delete mode 100755 include/triton/external/CUDA/texture_types.h
 delete mode 100755 include/triton/external/CUDA/vector_functions.h
 delete mode 100755 include/triton/external/CUDA/vector_functions.hpp
 delete mode 100755 include/triton/external/CUDA/vector_types.h
 create mode 100644 include/triton/tools/bench.hpp
 create mode 100644 lib/dnn/gemm.cpp

diff --git a/cmake/FindTorch.cmake b/cmake/FindTorch.cmake
index 56b1e7c16..79a814d03 100644
--- a/cmake/FindTorch.cmake
+++ b/cmake/FindTorch.cmake
@@ -4,7 +4,10 @@ execute_process(COMMAND python -c "import torch; import os; print(os.path.dirnam
 find_package_handle_standard_args(TORCH DEFAULT_MSG TORCH_INSTALL_PREFIX)
 
 if(TORCH_INSTALL_PREFIX)
-  set(TORCH_INCLUDE_DIRS ${TORCH_INSTALL_PREFIX}/lib/include/ ${TORCH_INSTALL_PREFIX}/lib/include/torch/csrc/api/include)
+  set(TORCH_INCLUDE_DIRS ${TORCH_INSTALL_PREFIX}/lib/include/
+                         ${TORCH_INSTALL_PREFIX}/lib/include/torch/csrc/api/include
+                         ${TORCH_INSTALL_PREFIX}/include/
+                         ${TORCH_INSTALL_PREFIX}/include/torch/csrc/api/include/)
   set(TORCH_LIBRARY_DIRS ${TORCH_INSTALL_PREFIX}/lib/)
 endif()
 
diff --git a/examples/cpp/common.hpp b/examples/cpp/common.hpp
index 87525eb68..f92bbeb69 100644
--- a/examples/cpp/common.hpp
+++ b/examples/cpp/common.hpp
@@ -1,5 +1,6 @@
 #include
 #include
+#include
 #include "triton/driver/device.h"
 #include
@@ -26,245 +27,6 @@ void simple_gemm(bool AT, bool BT, std::vector &c, const std::vector &a, c
   simple_gemm(c, a, b, M, N, K);
 }
 
-class timer{
-    typedef std::chrono::high_resolution_clock high_resolution_clock;
-    typedef std::chrono::nanoseconds nanoseconds;
-
-public:
-    explicit timer(bool run = false)
-    { if (run) start(); }
-
-    void start()
-    { _start = high_resolution_clock::now(); }
-
-    nanoseconds get() const
-    { return std::chrono::duration_cast(high_resolution_clock::now() - _start); }
-
-private:
-    high_resolution_clock::time_point _start;
-};
-
-template
-T min(std::vector x)
-{ return *std::min_element(x.begin(), x.end()); }
-
-
-template
-double bench(OP const & op, SYNC const & sync, triton::driver::device const & device)
-{
-  timer tmr;
-  std::vector times;
-  double total_time = 0;
-  op();
-  sync();
-  while(total_time*1e-9 < 1e-3){
-    float norm = 1;
-    // normalize clock if possible to get
roughly constant result - if(auto cu_device = dynamic_cast(&device)) - norm = (float)cu_device->current_sm_clock()/cu_device->max_sm_clock(); - tmr.start(); - op(); - sync(); - times.push_back(norm*tmr.get().count()); - total_time+=times.back(); - } - return min(times); -} - -// - -void build_conv_lut(int TK, - int stride_d, int stride_h, int stride_w, int stride_c, - int pad_d, int pad_h, int pad_w, - int T, int R, int S, - std::vector& res, std::vector& masks) { - /* convolution parameters */ - int F = T * R * S; - int Nlut = (TK + F - 1) / F * F; - int upsample_w = 1; - int upsample_h = 1; - int upsample_d = 1; - /* unpack index wrt filters */ - auto unpack = [&](int32_t trs){ - int32_t tr = trs / S; - int32_t s = trs - tr*S; - int32_t t = tr / R; - int32_t r = tr - t*R; - return std::make_tuple(t, r, s); - }; - /* increments */ - for(size_t i = 0; i < Nlut; ++i) - res[i] = (((i + TK) % Nlut) - i); - /* deltas */ - size_t Ds0 = Nlut; - size_t Ds1 = upsample_w; - size_t Ds2 = upsample_h; - size_t Ds3 = upsample_d; - for(size_t pd = 0; pd < Ds3; ++pd) - for(size_t ph = 0; ph < Ds2; ++ph) - for(size_t pw = 0; pw < Ds1; ++pw){ - int32_t* deltas_ptr = &res[Nlut + pw*Ds0 + ph*Ds0*Ds1 + pd*Ds0*Ds1*Ds2]; - // cumulative increments - for(size_t i = 0; i < Ds0; ++i){ - int32_t ctrs = i; - int32_t c = ctrs / F; - int32_t t, r, s; - std::tie(t, r, s) = unpack(ctrs % F); - // next indices - int32_t nextctrs = ctrs + TK; - int32_t nextc = nextctrs / F; - int32_t nextt, nextr, nexts; - std::tie(nextt, nextr, nexts) = unpack(nextctrs % F); - // diffs - int32_t cdiff = nextc - c; - int32_t tdiff = (nextt + pd)/upsample_d - (t + pd)/upsample_d; - int32_t rdiff = (nextr + ph)/upsample_h - (r + ph)/upsample_h; - int32_t sdiff = (nexts + pw)/upsample_w - (s + pw)/upsample_w; - // delta pointers - deltas_ptr[i] = cdiff*stride_c + sdiff*stride_w + rdiff*stride_h + tdiff*stride_d; - } - } - - /* Masks */ - size_t Ms0 = Nlut; - size_t Ms1 = 2*pad_w + 1; - size_t Ms2 = 2*pad_h + 1; - size_t Ms3 = 2*pad_d + 1; - - for(size_t pd = 0; pd < Ms3; ++pd) - for(size_t ph = 0; ph < Ms2; ++ph) - for(size_t pw = 0; pw < Ms1; ++pw){ - int32_t* masks_ptr = &masks[Nlut + pw*Ms0 + ph*Ms0*Ms1 + pd*Ms0*Ms1*Ms2]; - for(size_t i = 0; i < Ms0; ++i){ - int32_t t, r, s; - int32_t mask = 0x0; - for(size_t j = 0; j < TK; ++j){ - std::tie(t, r, s) = unpack((i + j) % F); - bool in_bounds_d = (t + pd) >= pad_d && (t + pd) < (T + pad_d); - bool in_bounds_h = (r + ph) >= pad_h && (r + ph) < (R + pad_h); - bool in_bounds_w = (s + pw) >= pad_w && (s + pw) < (S + pad_w); - mask |= (in_bounds_d && in_bounds_h && in_bounds_w) << j; - } - masks_ptr[i] = mask; - } - } - for(size_t i = 0; i < Nlut; ++i) - masks[i] = 0x0; -} - - -// Index computation -inline int32_t idx(int32_t x, int32_t y, int32_t z, int32_t w, int32_t u, - int32_t /*s0*/, int32_t s1, int32_t s2, int32_t s3, int32_t s4) -{ return u + w*s4 + z*s4*s3 + y*s4*s3*s2 + x*s4*s3*s2*s1; } - - -// Pack - -template T clamp(T x, T lo, T hi){ - return std::max(lo, std::min(x, hi)); -} - - -template -T pack(U* tmp, U scale); - -template<> -double pack(double* tmp, double scale) -{ return tmp[0]*scale; } - -template<> -float pack(float* tmp, float scale) -{ return tmp[0]*scale; } - -template<> -int pack(float* tmp, float scale) -{ - int res = 0; - for(int i = 0; i < 4; i++){ - int8_t clamped = std::round(clamp(tmp[i]*scale, (float)-128, (float)127)); - res |= (clamped & 0xFF) << (8*i); - } - return res; -} - -template struct pack_increment -{ enum{ VALUE = 1}; }; - -template<> struct 
pack_increment -{ enum{ VALUE = 4}; }; - -// Dot -template -inline T dot(T x, T y, T z) -{ - return std::fma(x, y, z); -} - -inline int dot(int x, int y, int z){ - int res = 0; - for(int i = 0; i < 4; i++){ - int32_t a = ((x >> (8*i)) & 0x000000FF); - int32_t b = ((y >> (8*i)) & 0x000000FF); - res += (*(int8_t*)(&a)) * (*(int8_t*)(&b)); - } - return res + z; -} - - - -template -void cpp_conv_nchw(int32_t C, int32_t N, int32_t K, - int32_t D, int32_t H, int32_t W, - int32_t T, int32_t R, int32_t S, - int32_t pad_d, int32_t pad_h, int32_t pad_w, - int32_t stride_d, int32_t stride_h, int32_t stride_w, - int32_t M, int32_t P, int32_t Q, - std::vector& O, - const std::vector& I, - const std::vector& F) -{ - static const int PACK_IN = pack_increment::VALUE; - static const int PACK_OUT = pack_increment::VALUE; - if(C % PACK_IN != 0) throw std::runtime_error("Number of input channels must be a multiple of 4"); - if(K % PACK_OUT != 0) throw std::runtime_error("Number of output channels must be a multiple of 4"); - C /= PACK_IN; - K /= PACK_OUT; - int32_t Kout = K; - IN_DTYPE accs[PACK_OUT]; - float tmp[PACK_OUT]; - for(int32_t m = 0 ; m < M; ++m) - for(int32_t p = 0 ; p < P; ++p) - for(int32_t q = 0; q < Q; ++q) - for(int32_t n = 0; n < N; ++n) - for(int32_t k = 0; k < Kout ; ++k) - { - for(int32_t i = 0; i < PACK_OUT; ++i) - accs[i] = 0; - int32_t mm = m*stride_d - pad_d; - int32_t pp = p*stride_h - pad_h; - int32_t qq = q*stride_w - pad_w; - for(int32_t kk = 0; kk < PACK_OUT; ++kk) - for(int32_t c = 0; c < C; ++c) - for(int32_t t = 0; t < T; ++t) - for(int32_t r = 0; r < R; ++r) - for(int32_t s = 0; s < S; ++s){ - int32_t d = mm + t; - int32_t h = pp + r; - int32_t w = qq + s; - bool in_bounds = (d >= 0 && h >= 0 && w >= 0 && d < D && h < H && w < W); - IN_DTYPE i = in_bounds?I[idx(n, c, d, h, w, N, C, D, H, W)]:0; - IN_DTYPE f = F[idx(c, t, r, s, k*PACK_OUT + kk, C, T, R, S, K*PACK_OUT)]; - accs[kk] = dot(i, f, accs[kk]); - } - for(int32_t kk = 0; kk < PACK_OUT; ++kk){ - tmp[kk] = accs[kk]; - } - O[idx(n, k, m, p, q, N, K, M, P, Q)] = tmp[0]; - } -} - - // input layout: C, H, W, BS // filter layout: C, K // output layout: K, H, W, BS @@ -290,7 +52,7 @@ void shift_conv(int32_t C, int32_t H, int32_t W, int32_t BS, bool in_bounds = (h >= 0 && w >= 0 && h < H && w < W); IN_DTYPE a = in_bounds?I[bs + w*BS + h*BS*W + c*BS*H*W]:0; IN_DTYPE b = F[k + c*K]; - acc = dot(a, b, acc); + acc = std::fma(a, b, acc); } O[bs + q*BS + p*BS*W + k*BS*H*W] = acc; } diff --git a/examples/cpp/conv.cpp b/examples/cpp/conv.cpp index 179fca421..e906493ac 100644 --- a/examples/cpp/conv.cpp +++ b/examples/cpp/conv.cpp @@ -1,10 +1,11 @@ #include #include -#include "common.hpp" +#include #include "triton/runtime/jit.h" #include "triton/driver/backend.h" #include "triton/driver/stream.h" #include "triton/dnn/conv.h" +#include "triton/tools/bench.hpp" int main() { // initialize default compute device @@ -12,13 +13,14 @@ int main() { triton::jit jit(context); triton::dnn::conv::type ty = triton::dnn::conv::FPROP; // initialization - int32_t B = 4, NF = 32; - int32_t D = 1, H = 56, W = 56; - int32_t NC = 16, T = 1, R = 3, S = 3; + int32_t B = 64, NF = 64; + int32_t D = 1, H = 8, W = 8; + int32_t NC = 3, T = 1, R = 3, S = 3; int32_t pad_d = 0, pad_h = 0, pad_w = 0; int32_t stride_d = 1, stride_h = 1, stride_w = 1; int32_t upsample_d = 1, upsample_h = 1, upsample_w = 1; - triton::dnn::conv configuration(B, NC, D, H, W, T, R, S, NF, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, upsample_d, upsample_h, upsample_w, ty); + 
triton::dnn::conv configuration(128, 256, 1, 14, 14, 1, 5, 5, 512, 1, 1, 1, 0, 0, 0, 1, 1, 1, triton::dnn::conv::FPROP, 0); +// triton::dnn::conv configuration(B, NC, D, H, W, T, R, S, NF, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, upsample_d, upsample_h, upsample_w, ty); // convolution configuration std::vector hc(configuration.c_size()); std::vector rc(configuration.c_size()); @@ -43,22 +45,23 @@ int main() { // benchmark a given convolution kernel auto benchmark = [&](triton::driver::kernel* kernel, triton::jit::launch_information info) { + configuration.init(stream, (triton::driver::cu_module*)kernel->module()); unsigned TM = info.global_range_size[0]; unsigned TN = info.global_range_size[1]; unsigned nthreads = info.num_threads; - std::array grid = configuration.get_grid(TM, TN); - configuration.init(stream, (triton::driver::cu_module*)kernel->module()); + unsigned GZ = jit.get_int("GZ"); + configuration.enqueue(stream, kernel, da, db, dc, nullptr, TM, TN, GZ, nthreads); stream->synchronize(); - configuration.set_arg(kernel, da, db, dc, nullptr); - stream->enqueue(kernel, grid, {nthreads, 1, 1}); - stream->synchronize(); - double ts = bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});}, - [&](){ stream->synchronize(); }, *context->device()); + double ts = triton::tools::bench([&](){ configuration.enqueue(stream, kernel, da, db, dc, nullptr, TM, TN, GZ, nthreads); }, + [&](){ stream->synchronize(); }, nullptr); return configuration.get_nflops() / ts * 1e-3; }; - std::string src = configuration.src(); -// jit.autotune("conv", src.c_str(), benchmark); - jit.add_module("conv", src.c_str(), configuration.default_params()); + std::ostringstream oss; + configuration.src(oss); + std::string src = oss.str(); + triton::jit::tune_res_t best = jit.autotune("conv", src.c_str(), benchmark); + jit.add_module("conv", src.c_str(), best.params); +// jit.add_module("conv", src.c_str(), configuration.default_params()); triton::driver::kernel* kernel = jit.get_function("conv"); triton::jit::launch_information info = jit.get_launch_info("conv"); std::cout << "Performance: " << benchmark(kernel, info) << " TFLOPS " << std::endl; diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 2beee1c8d..4e805ce9d 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -5,6 +5,7 @@ #include "triton/driver/backend.h" #include "triton/driver/stream.h" #include "triton/dnn/gemm.h" +#include "triton/tools/bench.hpp" int main() { @@ -52,8 +53,8 @@ int main() { triton::dnn::gemm::set_arg(kernel, da, db, dc, M, N, K, dlocks, grid[0], grid[1]); stream->enqueue(kernel, grid, {nthreads, 1, 1}); stream->synchronize(); - double ts = bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});}, - [&](){ stream->synchronize(); }, *context->device()); + double ts = triton::tools::bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});}, + [&](){ stream->synchronize(); }, context->device()); return 2.*M*N*K / ts * 1e-3; }; diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index 4391f775b..2cd17643e 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -4,6 +4,7 @@ #include "triton/runtime/jit.h" #include "triton/driver/backend.h" #include "triton/driver/stream.h" +#include "triton/tools/bench.hpp" // K = channels // M = batch * height * width @@ -180,8 +181,8 @@ int main() { stream->enqueue(kernel, grid, {nthreads, 1, 1}); stream->synchronize(); // benchmark - double ts = bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});}, - [&](){ stream->synchronize(); }, 
*context->device()); + double ts = triton::tools::bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});}, + [&](){ stream->synchronize(); }, context->device()); ts = ts * 1e-9; double tflops = 2.*M*N*K / ts * 1e-12; return tflops; diff --git a/examples/python/pytorch/conv.cpp b/examples/python/pytorch/conv.cpp index 4ed9785ed..6ccedc75f 100644 --- a/examples/python/pytorch/conv.cpp +++ b/examples/python/pytorch/conv.cpp @@ -1,10 +1,12 @@ +#include +#include #include #include #include "ATen/cuda/CUDAContext.h" -#include #include "triton/runtime/jit.h" #include "triton/driver/stream.h" #include "triton/dnn/conv.h" +#include "triton/tools/bench.hpp" #define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x " must be contiguous") @@ -26,10 +28,13 @@ torch::Tensor conv_common( int32_t stride_d, int32_t stride_h, int32_t stride_w, int32_t pad_d, int32_t pad_h, int32_t pad_w, triton::dnn::conv::type ty, - torch::Tensor torcha, torch::Tensor torchb, torch::Tensor torchbias + torch::Tensor torcha, torch::Tensor torchb, torch::Tensor torchbias, + bool autotune = false ) { + // Wrap CUDA handles c10::DeviceIndex device = torcha.storage().device().index(); + // Get stream CUstream custream = (CUstream)at::cuda::getCurrentCUDAStream(device).stream(); triton::driver::stream* stream; @@ -37,8 +42,10 @@ torch::Tensor conv_common( stream = m_stream.emplace(custream, new triton::driver::cu_stream(custream, false)).first->second.get(); else stream = m_stream.at(custream).get(); + // Get context triton::driver::context* ctx = stream->context(); + // Get configuration bool has_bias = torchbias.storage().size() > 0; conv_key_t key = {B, C, D, H, W, T, R, S, NF, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, ty, has_bias}; @@ -52,22 +59,13 @@ torch::Tensor conv_common( ty, has_bias)).first->second.get(); else configuration = m_config.at(key).get(); - // Get JIT - triton::jit* jit; - if(m_jit.find(key) == m_jit.end()){ - jit = m_jit.emplace(key, new triton::jit(ctx)).first->second.get(); - std::string src = configuration->src(); - jit->add_module("conv", src.c_str(), configuration->default_params()); - triton::driver::kernel* kernel = jit->get_function("conv"); - configuration->init(stream, (triton::driver::cu_module*)kernel->module()); - } - else - jit = m_jit.at(key).get(); - // Get memory + + // Bind memory triton::driver::cu_buffer a(ctx, (CUdeviceptr)torcha.storage().data(), false); triton::driver::cu_buffer b(ctx, (CUdeviceptr)torchb.storage().data(), false); triton::driver::cu_buffer cubias(ctx, (CUdeviceptr)torchbias.storage().data(), false); triton::driver::buffer* bias = has_bias ? 
&cubias : nullptr; + // Allocate output std::vector c_shapes = configuration->c_shapes(); torch::Tensor torchc; @@ -76,17 +74,52 @@ torch::Tensor conv_common( else torchc = torch::empty({c_shapes[0], c_shapes[1], c_shapes[3], c_shapes[4]}, torch::kFloat).cuda(); triton::driver::cu_buffer c(ctx, (CUdeviceptr)torchc.storage().data(), false); - // Add module to JIT + + // Get JIT + triton::jit* jit; + if(m_jit.find(key) == m_jit.end()){ + jit = m_jit.emplace(key, new triton::jit(ctx)).first->second.get(); + std::ostringstream oss; + configuration->src(oss); + std::string src = oss.str(); + // benchmark a given convolution kernel + auto benchmark = [&](triton::driver::kernel* kernel, + triton::jit::launch_information info) { + configuration->init(stream, (triton::driver::cu_module*)kernel->module()); + unsigned TM = info.global_range_size[0]; + unsigned TN = info.global_range_size[1]; + unsigned nthreads = info.num_threads; + unsigned GZ = jit->get_int("GZ"); + configuration->enqueue(stream, kernel, &a, &b, &c, bias, TM, TN, GZ, nthreads); + stream->synchronize(); + double ts = triton::tools::bench([&](){ configuration->enqueue(stream, kernel, &a, &b, &c, bias, TM, TN, GZ, nthreads); }, + [&](){ stream->synchronize(); }, stream->context()->device()); + return configuration->get_nflops() / ts * 1e-3; + }; + // auto-tune and save result + if(autotune) { + triton::jit::tune_res_t best = jit->autotune("conv", src.c_str(), benchmark); + jit->add_module("conv", src.c_str(), best.params); + } + else { + jit->add_module("conv", src.c_str(), configuration->default_params()); + } + triton::driver::kernel* kernel = jit->get_function("conv"); + configuration->init(stream, (triton::driver::cu_module*)kernel->module()); + } + else + jit = m_jit.at(key).get(); + + // Run triton::driver::kernel* kernel = jit->get_function("conv"); triton::jit::launch_information info = jit->get_launch_info("conv"); + unsigned GZ = jit->get_int("GZ"); // launch info unsigned TM = info.global_range_size[0]; unsigned TN = info.global_range_size[1]; - // launch info unsigned nthreads = info.num_threads; - std::array grid = configuration->get_grid(TM, TN); - configuration->set_arg(kernel, &a, &b, &c, bias); - stream->enqueue(kernel, grid, {nthreads, 1, 1}); + // enqueue + configuration->enqueue(stream, kernel, &a, &b, &c, bias, TM, TN, GZ, nthreads); return torchc; } diff --git a/examples/python/pytorch/test.py b/examples/python/pytorch/test.py index 787d6634b..4c80fd187 100644 --- a/examples/python/pytorch/test.py +++ b/examples/python/pytorch/test.py @@ -1,11 +1,14 @@ import torch import triton -x = torch.autograd.Variable(torch.randn(16, 64, 8, 8).cuda(), requires_grad=True) -bias = torch.autograd.Variable(torch.randn(64).cuda(), requires_grad=True) -w = torch.autograd.Variable(torch.randn(64, 3, 3, 64).cuda(), requires_grad=True) +torch.manual_seed(0) +torch.set_printoptions(precision=4) + +x = torch.autograd.Variable(torch.randn(64, 3, 8, 8).cuda(), requires_grad=True) +bias = torch.autograd.Variable(torch.randn(6).cuda(), requires_grad=True) +w = torch.autograd.Variable(torch.randn(3, 3, 3, 6).cuda(), requires_grad=True) cuw = torch.autograd.Variable(w.permute(3,0,1,2).cuda(), requires_grad=True) -y_target = torch.autograd.Variable(torch.randn(16, 64, 6, 6).cuda(), requires_grad=True) +y_target = torch.autograd.Variable(torch.randn(64, 6, 8, 8).cuda(), requires_grad=True) def run(x, w, conv): y = conv(x, w) @@ -13,13 +16,14 @@ def run(x, w, conv): loss.backward() return loss, y.clone(), x.grad.clone(), w.grad.clone(), 
bias.grad.clone() -ttyloss, tty, ttdx, ttdw, ttbias = run(x, w, lambda x, w: triton.ConvFunction.apply(x, w, bias, (1,1), (0,0))) +ttyloss, tty, ttdx, ttdw, ttbias = run(x, w, lambda x, w: triton.ConvFunction.apply(x, w, bias, (1,1), (1,1))) x.grad.zero_() w.grad.zero_() bias.grad.zero_() -culoss, cuy, cudx, cudw, cubias = run(x, cuw, lambda x, w: torch.nn.functional.conv2d(x, w, bias=bias, stride=1, padding=0)) +culoss, cuy, cudx, cudw, cubias = run(x, cuw, lambda x, w: torch.nn.functional.conv2d(x, w, bias=bias, stride=1, padding=1)) -print(ttdx[0,0,:,:], cudx[0,0,:,:]) +print(ttdx[0,0,:,:]) +print(cudx[0,0,:,:]) print((tty - cuy).norm(2)) print((ttdx - cudx).norm(2)) print((ttdw.permute(3,0,1,2) - cudw).norm(2)) diff --git a/examples/python/pytorch/triton.py b/examples/python/pytorch/triton.py index ec7c86695..18f08ba44 100644 --- a/examples/python/pytorch/triton.py +++ b/examples/python/pytorch/triton.py @@ -2,7 +2,7 @@ import torch from torch.nn.modules.utils import _single, _pair, _triple import math -torch.ops.load_library("/home/philippe/Development/triton/build/examples/python/pytorch/libtorch_triton.so") +torch.ops.load_library("/home/philippe/development/triton/build/examples/python/pytorch/libtorch_triton.so") class ConvFunction(torch.autograd.Function): @@ -37,7 +37,7 @@ class _ConvNd(torch.nn.Module): padding, dilation, transposed, output_padding, groups, bias): super(_ConvNd, self).__init__() # not everything is supported by Triton - assert all(x==1 for x in stride) + assert all(x==1 or x==2 for x in stride) assert all(x==1 for x in dilation) assert transposed == False assert all(x==0 for x in output_padding) @@ -46,6 +46,7 @@ class _ConvNd(torch.nn.Module): self.in_channels = in_channels self.out_channels = out_channels self.kernel_size = kernel_size + self.stride = stride self.padding = padding self.weight = torch.nn.Parameter(torch.Tensor( in_channels, kernel_size[0], kernel_size[1], out_channels)) @@ -56,7 +57,7 @@ class _ConvNd(torch.nn.Module): self.reset_parameters() def forward(self, input): - return ConvFunction.apply(input, self.weight, self.bias, self.padding) + return ConvFunction.apply(input, self.weight, self.bias, self.stride, self.padding) def reset_parameters(self): n = self.in_channels diff --git a/include/triton/dnn/conv.h b/include/triton/dnn/conv.h index 1c9127466..a950c3304 100644 --- a/include/triton/dnn/conv.h +++ b/include/triton/dnn/conv.h @@ -16,6 +16,14 @@ public: WGRAD }; +private: + void set_ld(const std::vector& shapes, + std::vector& ld); + + std::tuple + unpack(int32_t ltrs, bool flip, int32_t EBD, int32_t EBH, int32_t EBW); + +public: conv(int B, int NC, int D, int H, int W, @@ -32,6 +40,7 @@ public: std::vector c_shapes(); // initialize + void build_b_deltas(); void build_deltas(); void build_masks(); void init(driver::stream *stream, driver::cu_module *module); @@ -39,161 +48,17 @@ public: void set_arg(driver::kernel *kernel, driver::buffer *a, driver::buffer *b, driver::buffer *c, driver::buffer *bias); + void enqueue(driver::stream *stream, driver::kernel *kernel, + driver::buffer *a, driver::buffer *b, driver::buffer *c, + driver::buffer *bias, + size_t TM, size_t TN, size_t GZ, size_t nthreads); // utilities size_t get_nflops(); std::vector default_params(); // source - std::string src(){ - std::string BS = b_trans_ ? "[TN,TK]" : "[TK, TN]"; - std::string bcb0 = b_trans_ ? "[:, newaxis]" : "[newaxis, :]"; - std::string bcb1 = b_trans_ ? "[newaxis, :]" : "[:, newaxis]"; - std::string ldb0 = b_trans_ ? 
"*ldb_s" : ""; - std::string ldb1 = b_trans_ ? "*ldb_k" : "*ldb_c"; - std::string useb = b_trans_ ? "trans(b)" : "b"; - std::string flipr = b_trans_ ? "" : "BH - 1 -"; - std::string flips = b_trans_ ? "" : "BW - 1 -"; - std::string ax = b_trans_ ? "crs" : "rsc"; - std::vector redax; - if(b_trans_) - redax = {"C", "BH", "BW"}; - else - redax = {"BH", "BW", "N"}; - std::string inc_pb = b_lut_ ? "db" + bcb1 : "TK" + ldb0; - std::string a_delta_mem = is_a_deltas_cst ? "__constant__" : ""; - std::string b_delta_mem = is_b_deltas_cst_? "__constant__" : ""; - std::string masks_mem = is_mask_cst_? "__constant__" : ""; - - std::string res = - R"( - const tunable int32 TM = {16, 32, 64}; - const tunable int32 TN = {16, 32, 64}; - const tunable int32 TK = {8}; - )"; - if(is_a_deltas_cst) - res += "__constant__ int32* delta = alloc_const int32[" + std::to_string(h_a_deltas_.size()) + "];\n"; - if(b_lut_ && is_b_deltas_cst_) - res += "__constant__ int32* b_delta = alloc_const int32[" + std::to_string(h_b_deltas_.size()) + "];\n"; - if(is_mask_cst_) - res += "__constant__ int32* masks = alloc_const int32[" + std::to_string(h_masks_.size()) + "];\n"; - res += R"( - - void conv(read_only restrict fp32 *a, - read_only restrict fp32 *b, - fp32 *c, - fp32 *bias, - int32 M, int32 N, int32 K, - int32 AH, int32 AW, - int32 BH, int32 BW, - int32 CH, int32 CW, - int32 lda_n, int32 lda_c, int32 lda_d, int32 lda_h, int32 lda_w, - int32 ldb_c, int32 ldb_t, int32 ldb_r, int32 ldb_s, int32 ldb_k, - int32 ldc_n, int32 ldc_k, int32 ldc_m, int32 ldc_p, int32 ldc_q, - int32 pad_h, int32 pad_w, - int32 stride_h, int32 stride_w, - int32 upsample_h, int32 upsample_w)"; - if(!is_a_deltas_cst) - res += ", int32* delta"; - if(b_lut_ && !is_b_deltas_cst_) - res += ", int32* b_delta"; - if(!is_mask_cst_) - res += ", int32* masks"; - res += R"(){ - int32 rxa[TM] = get_global_range[TM](0); - int32 rb0[TN] = get_global_range[TN](1); - int32 rka[TK] = 0 ... TK; - int32 rkb[TK] = 0 ... 
TK; - fp32 C[TM, TN] = 0; - int32 ldlut = )" + std::to_string(Fs_) + R"(; - int32 rabh[TM] = rxa / CW; - int32 raw[TM] = rxa % CW; - int32 rab[TM] = rabh / CH; - int32 rah[TM] = rabh % CH; - raw = raw*stride_w - pad_w; - rah = rah*stride_h - pad_h; - int32 ra0[TM] = rab*lda_n + rah*lda_h + raw*lda_w; - int32 ra)" + ax[0] + ax[1] + "[TK] = rka / " + redax[2] + R"(; - int32 ra)" + ax[2] + "[TK] = rka % " + redax[2] + R"(; - int32 ra)" + ax[0] + "[TK] = ra" + ax[0] + ax[1] + " / " + redax[1] + R"(; - int32 ra)" + ax[1] + "[TK] = ra" + ax[0] + ax[1] + " % " + redax[1] + R"(; - rar = )" + flipr + R"( rar; - ras = )" + flips + R"( ras; - int32 ra1[TK] = rac*lda_c + rar*lda_h + ras*lda_w; - fp32* pa[TM, TK] = a + ra1[newaxis, :] + ra0[:, newaxis];)"; - if(b_lut_){ - res += R"( - int32 rbcr[TK] = rkb / BW; - int32 rbs[TK] = rkb % BW; - int32 rbc[TK] = rbcr / BH; - int32 rbr[TK] = rbcr % BH; - int32 rb1[TK] = rbc*ldb_c + rbr*ldb_r + ras*ldb_s; - )" + b_delta_mem + R"( int32* pdb[TK] = b_delta + rkb; - int32 db[TK] = *pdb;)"; - } - else{ - res += R"( - int32 rb1[TK] = rkb)" + ldb0 + ";"; - } - res += R"( - fp32* pb)" + BS + " = b + rb1" + bcb1 + " + rb0" + bcb0 + ldb1 + R"(; - )" + a_delta_mem + R"( int32* pincd[TK] = delta + rka; - )" + a_delta_mem + R"( int32* pda[TK] = delta + ldlut + rka; - int32 da[TK] = *pda; - int32 incd[TK] = *pincd; - int32 maskh[TM] = pad_h + min(rah, 0) + max(rah + BH - AH, 0); - int32 maskw[TM] = pad_w + min(raw, 0) + max(raw + BW - AW, 0); - )" + masks_mem + R"( int32* pm[TM] = masks + ldlut + maskw*ldlut + maskh*ldlut*(2*pad_w + 1); - )" + a_delta_mem + R"( int32* pincm[TM] = delta; - int32 incm[TM] = *pincm; - int32 checka0[TM] = *pm; - int32 checka1[TK] = 1 << rka; - int1 checka[TM, TK] = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; - fp32 a[TM, TK] = checka ? *pa : 0; - fp32 b)" + BS + R"( = *pb; - for(int32 k = K; k > 0; k = k - TK){ - C = dot(a, )" + useb + R"(, C); - pa = pa + da[newaxis, :]; - pb = pb + )" + inc_pb + R"(; - b = *pb; - pda = pda + incd;)"; - if(b_lut_){ - res += R"( - pdb = pdb + incd; - db = *pdb;)"; - } - res += R"( - pincd = pincd + incd; - da = *pda; - incd = *pincd; - pm = pm + incm; - pincm = pincm + incm; - incm = *pincm; - checka0 = *pm; - checka = (checka0[:, newaxis] & checka1[newaxis, :]) > 0; - checka = checka && (k > TK); - a = checka ? 
*pa : 0; - } - int32 rxc[TM] = get_global_range[TM](0); - int32 rc1[TN] = get_global_range[TN](1); - int32 rcn[TM] = rxc / (CH*CW); - int32 rcpq[TM] = rxc % (CH*CW); - int32 rc0[TM] = rcn * ldc_n + rcpq * ldc_q; - fp32* pc[TM, TN] = c + rc1[newaxis, :]*ldc_k + rc0[:, newaxis];)"; - if(bias_ && ty_==FPROP){ - res += R"( - fp32* pbias[TN] = bias + rc1; - fp32 bias[TN] = *pbias; - C = C + bias[newaxis, :];)"; - } - res += R"( - int1 checkc0[TM] = rxc < M; - int1 checkc1[TN] = rc1 < N; - int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - @checkc *pc = C; - })"; - return res; - } + void src(std::ostream &os); // cpu check template @@ -257,6 +122,7 @@ private: driver::buffer* d_a_deltas_; driver::buffer* d_b_deltas_; driver::buffer* d_masks_; + driver::buffer* d_locks_; bool is_a_deltas_cst; bool is_b_deltas_cst_; bool is_mask_cst_; @@ -275,6 +141,9 @@ private: int32_t c_outer_0_idx_; int32_t c_outer_1_idx_; int32_t c_pix_idx; + // maximum grid size for loc + int32_t max_grid_0_; + int32_t max_grid_1_; }; } diff --git a/include/triton/dnn/gemm.h b/include/triton/dnn/gemm.h index 41345bdd8..abc5b8d1b 100644 --- a/include/triton/dnn/gemm.h +++ b/include/triton/dnn/gemm.h @@ -7,132 +7,13 @@ namespace dnn{ class gemm { public: - - static void init(driver::stream* stream, driver::buffer* locks) { - std::vector hlocks(2048, 0); - stream->write(locks, false, 0, hlocks); - } - + static void init(driver::stream* stream, driver::buffer* locks); static void set_arg(driver::kernel *kernel, driver::buffer *a, driver::buffer *b, driver::buffer *c, int32_t M, int32_t N, int32_t K, - driver::buffer *locks, int32_t grid_0, int32_t grid_1) { - kernel->setArg(0, a); - kernel->setArg(1, b); - kernel->setArg(2, c); - kernel->setArg(3, M); - kernel->setArg(4, N); - kernel->setArg(5, K); - kernel->setArg(6, M); - kernel->setArg(7, N); - kernel->setArg(8, M); - kernel->setArg(9, locks); - kernel->setArg(10, grid_0); - kernel->setArg(11, grid_1); - } - - static std::vector default_params(bool AT, bool BT) { - if(AT && BT) - return {32, 64, 32, 64, 16, 8, 2, 2, 4, 2, 8, 4, 2, 1}; - else if(AT && !BT) - return {32, 64, 32, 64, 16, 8, 2, 2, 4, 2, 8, 4, 2, 1}; - else if(!AT && BT) - return {16, 2, 64, 16, 2, 64, 16, 8, 2, 2, 8, 8, 8, 1}; - else - return {16, 2, 128, 32, 32, 32, 4, 2, 2, 8, 8, 4, 2, 1}; - } - - static std::string src(bool AT, bool BT) { - std::string AS0 = "TM", AS1 = "TK"; - std::string BS0 = "TK", BS1 = "TN"; - std::string bca0 = "[newaxis, :]", bca1 = "[:, newaxis]"; - std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; - std::string lda0 = "*lda", lda1 = ""; - std::string ldb0 = "", ldb1 = "*ldb"; - std::string usea = AT ? "trans(a)" : "a"; - std::string useb = BT ? "trans(b)" : "b"; - if(AT){ - std::swap(AS0, AS1); - std::swap(bca0, bca1); - std::swap(lda0, lda1); - } - if(BT){ - std::swap(BS0, BS1); - std::swap(bcb0, bcb1); - std::swap(ldb0, ldb1); - } - std::string res = - R"( - const tunable int32 TM = {16, 32, 64, 128}; - const tunable int32 TN = {16, 32, 64, 128}; - const tunable int32 TK = {8}; - const tunable int32 GZ = {1}; - - void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C, - int32 M, int32 N, int32 K, - int32 lda, int32 ldb, int32 ldc, - int32 *locks, int32 grid0, int32 grid1) { - int32 rxa[TM] = get_global_range[TM](0); - int32 ryb[TN] = get_global_range[TN](1); - int32 rz = get_global_range[1](2); - int32 rka[TK] = 0 ... TK; - int32 rkb[TK] = 0 ... 
TK; - fp32 c[TM, TN] = 0; - int32 div = K / GZ; - int32 rem = K % GZ; - K = select(rz < rem, div - 1, div); - int32 offk = select(rz < rem, rz*(div + 1), rz*div + rem); - fp32* pa[)" + AS0 + ", " + AS1 + "] = A + (offk + rka" + bca0 + ")" + lda0 + " + rxa" + bca1 + lda1 + R"(; - fp32* pb[)" + BS0 + ", " + BS1 + "] = B + (offk + rkb" + bcb0 + ")" + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; - fp32 a[)" + AS0 + ", " + AS1 + R"(] = *pa; - fp32 b[)" + BS0 + ", " + BS1 + R"(] = *pb; - int32 last_a = ((M*K - 1) - (TM*TK + 1)) / lda; - int32 last_b = ((K*N - 1) - (TN*TK + 1)) / ldb; - last_a = last_a / TK * TK; - last_b = last_b / TK * TK; - int32 bound = K - max(last_a, last_b); - for(int32 k = K; k > bound; k = k - TK){ - c = dot()" + usea + ", " + useb + R"(, c); - pa = pa + TK)" + lda0 + R"(; - pb = pb + TK)" + ldb0 + R"(; - a = *pa; - b = *pb; - } - int32 rxc[TM] = get_global_range[TM](0); - int32 ryc[TN] = get_global_range[TN](1); - for(int32 k = bound; k > 0; k = k - 1){ - int1 checka[TM, 1] = rxc[:, newaxis] < M; - int1 checkb[TN, 1] = ryc[:, newaxis] < N; - fp32* pa[TM, 1] = A + (offk + K - k))" + lda0 + " + rxc[:, newaxis]" + lda1 + R"(; - fp32* pb[TN, 1] = B + (offk + K - k))" + ldb0 + " + ryc[:, newaxis]" + ldb1 + R"(; - fp32 a[TM, 1] = checka ? *pa : 0; - fp32 b[TN, 1] = checkb ? *pb : 0; - c = dot(a, trans(b), c); - } - int32 ridx = get_range_id(0); - int32 ridy = get_range_id(1); - fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - int32 *plock = locks + ridx + ridy*grid0; - while(__atomic_cas(plock, 0, 1)); - int32 *pcount = plock + grid0*grid1; - int32 count = *pcount; - int32 countp1 = select(count == GZ - 1, 0, count + 1); - int1 checkc0[TM] = rxc < M; - int1 checkc1[TN] = ryc < N; - int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - if(count == 0) { - @checkc *pc = c; - *pcount = countp1; - } - else { - @checkc *pc = c + *pc; - *pcount = countp1; - } - __atomic_cas(plock, 1, 0); - } - )"; - return res; - } + driver::buffer *locks, int32_t grid_0, int32_t grid_1); + static std::vector default_params(bool AT, bool BT); + static std::string src(bool AT, bool BT); }; } diff --git a/include/triton/driver/buffer.h b/include/triton/driver/buffer.h index ed314216a..0502f1ff4 100755 --- a/include/triton/driver/buffer.h +++ b/include/triton/driver/buffer.h @@ -31,7 +31,7 @@ namespace triton namespace driver { -class cu_stream; +class stream; // Base class buffer : public polymorphic_resource { @@ -66,7 +66,7 @@ class cu_buffer: public buffer public: cu_buffer(driver::context* context, size_t size); cu_buffer(driver::context* context, CUdeviceptr cu, bool take_ownership); - void set_zero(cu_stream const & queue, size_t size); + void set_zero(triton::driver::stream *queue, size_t size); }; } diff --git a/include/triton/driver/dispatch.h b/include/triton/driver/dispatch.h index 86b1f2dc1..0e1db604b 100755 --- a/include/triton/driver/dispatch.h +++ b/include/triton/driver/dispatch.h @@ -28,8 +28,6 @@ //CUDA Backend #include "triton/external/CUDA/cuda.h" -#include "triton/external/CUDA/cublas_v2.h" -#include "triton/external/CUDA/cudnn.h" #include "triton/external/CUDA/nvml.h" #include "triton/external/CL/cl.h" #include "triton/external/CL/cl_ext.h" @@ -52,8 +50,6 @@ class cu_context; template void check(T){} void check(CUresult err); -void check(cublasStatus_t err); -void check(cudnnStatus_t err); void check(cl_int err); class dispatch @@ -88,8 +84,6 @@ public: static bool clinit(); static bool nvmlinit(); static bool cuinit(); - static bool cublasinit(); - static 
bool cudnninit(); static bool spvllvminit(); static void release(); @@ -170,44 +164,6 @@ public: static nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); static nvmlReturn_t nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int mem_clock, unsigned int sm_clock); - // CUBLAS - static cublasHandle_t cublasHandle(driver::cu_context const & ctx); - static cublasStatus_t cublasCreate_v2(cublasHandle_t* h); - static cublasStatus_t cublasGetStream_v2(cublasHandle_t h, cudaStream_t *streamId); - static cublasStatus_t cublasSetStream_v2(cublasHandle_t h, cudaStream_t streamId); - static cublasStatus_t cublasSgemm_v2 (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc); - static cublasStatus_t cublasDgemm_v2 (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc); - static cublasStatus_t cublasHgemm (cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, half* alpha, const half *A, int lda, const half *B, int ldb, half* beta, half *C, int ldc); - static cublasStatus_t cublasGemmEx(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const void *alpha, const void *A, cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb, const void *beta, void *C, cudaDataType Ctype, int ldc, cudaDataType computeType, cublasGemmAlgo_t algo); - // CUDNN - static cudnnHandle_t cudnnHandle(driver::cu_context const & ctx); - static cudnnStatus_t cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc); - static cudnnStatus_t cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t* convDesc); - static cudnnStatus_t cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc); - static cudnnStatus_t cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc); - static cudnnStatus_t cudnnCreate(cudnnHandle_t *handle); - static cudnnStatus_t cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, cudnnDataType_t dataType, int n, int c, int h, int w); - static cudnnStatus_t cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, cudnnDataType_t dataType, cudnnTensorFormat_t format, int k, int c, int h, int w); - static cudnnStatus_t cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, cudnnDataType_t dataType, int nbDims, const int dimA[]); - static cudnnStatus_t cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc, cudnnDataType_t dataType, cudnnTensorFormat_t format, int nbDims, const int filterDimA[]); - static cudnnStatus_t cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t convDesc, int pad_h, int pad_w, int u, int v, int upscalex, int upscaley, cudnnConvolutionMode_t mode); - static cudnnStatus_t cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc, int arrayLength, const int padA[], const int filterStrideA[], const int upscaleA[], cudnnConvolutionMode_t mode, cudnnDataType_t dataType); - static cudnnStatus_t cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc, const cudnnPoolingMode_t mode, const cudnnNanPropagation_t 
maxpoolingNanOpt, int nbDims, const int windowDimA[], const int paddingA[], const int strideA[]); - static cudnnStatus_t cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId); - static cudnnStatus_t cudnnTransformTensor(cudnnHandle_t handle, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y); - // pooling - static cudnnStatus_t cudnnPoolingForward(cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y); - // forward - static cudnnStatus_t cudnnGetConvolutionForwardAlgorithm(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdPreference_t preference, size_t memoryLimitInBytes, cudnnConvolutionFwdAlgo_t *algo); - static cudnnStatus_t cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo, size_t *sizeInBytes); - static cudnnStatus_t cudnnConvolutionForward(cudnnHandle_t handle, const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, const cudnnFilterDescriptor_t wDesc, const void *w, const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo, void *workSpace, size_t workSpaceSizeInBytes, const void *beta, const cudnnTensorDescriptor_t yDesc, void *y); - // backward data - static cudnnStatus_t cudnnConvolutionBackwardData(cudnnHandle_t handle, const void *alpha, const cudnnFilterDescriptor_t wDesc, const void *w, const cudnnTensorDescriptor_t dyDesc, const void *dy, const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionBwdDataAlgo_t algo, void* workSpace, size_t workSpaceSizeInBytes, const void* beta, const cudnnTensorDescriptor_t dxDesc, void *dx); - static cudnnStatus_t cudnnGetConvolutionBackwardDataAlgorithm(cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc,const cudnnTensorDescriptor_t dyDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnTensorDescriptor_t dxDesc, cudnnConvolutionBwdDataPreference_t preference, size_t memoryLimitInBytes, cudnnConvolutionBwdDataAlgo_t* algo); - // backward filter - static cudnnStatus_t cudnnConvolutionBackwardFilter(cudnnHandle_t handle, const void *alpha, const cudnnTensorDescriptor_t xDesc,const void *x, const cudnnTensorDescriptor_t dyDesc,const void *dy, const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionBwdFilterAlgo_t algo,void* workSpace, size_t workSpaceSizeInBytes, const void* beta, const cudnnFilterDescriptor_t dwDesc, void *dw); - static cudnnStatus_t cudnnGetConvolutionBackwardFilterAlgorithm(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const cudnnTensorDescriptor_t dyDesc, const cudnnConvolutionDescriptor_t convDesc, const cudnnFilterDescriptor_t dwDesc, cudnnConvolutionBwdFilterPreference_t preference, size_t memoryLimitInBytes, cudnnConvolutionBwdFilterAlgo_t* algo); - // SPIR-V libraries static int initializeLLVMToSPIRVPass(llvm::PassRegistry &); static bool writeSpirv(llvm::Module *M, std::ostream &OS, std::string &ErrMsg); @@ -219,8 +175,6 @@ private: static void* opencl_; static void* cuda_; static void* nvml_; - static void* cublas_; - static void* cudnn_; static void* vulkan_; static void* 
spvllvm_; static void* spvcross_; @@ -304,33 +258,6 @@ private: static void* nvmlDeviceGetClockInfo_; static void* nvmlDeviceGetMaxClockInfo_; static void* nvmlDeviceSetApplicationsClocks_; - // cuBLAS - static void* cublasCreate_v2_; - static void* cublasGetStream_v2_; - static void* cublasSetStream_v2_; - static void* cublasHgemm_; - static void* cublasSgemm_v2_; - static void* cublasDgemm_v2_; - static void* cublasGemmEx_; - // cuDNN - static void* cudnnCreateConvolutionDescriptor_; - static void* cudnnCreatePoolingDescriptor_; - static void* cudnnCreateTensorDescriptor_; - static void* cudnnCreateFilterDescriptor_; - static void* cudnnCreate_; - static void* cudnnSetTensor4dDescriptor_; - static void* cudnnSetFilter4dDescriptor_; - static void* cudnnSetTensorNdDescriptorEx_; - static void* cudnnSetFilterNdDescriptor_; - static void* cudnnSetConvolution2dDescriptor_; - static void* cudnnSetConvolutionNdDescriptor_; - static void* cudnnSetPoolingNdDescriptor_; - static void* cudnnGetConvolutionForwardAlgorithm_; - static void* cudnnGetConvolutionForwardWorkspaceSize_; - static void* cudnnConvolutionForward_; - static void* cudnnPoolingForward_; - static void* cudnnSetStream_; - static void* cudnnTransformTensor_; // LLVM to SPIR-V static void* initializeLLVMToSPIRVPass_; diff --git a/include/triton/external/CUDA/builtin_types.h b/include/triton/external/CUDA/builtin_types.h deleted file mode 100755 index 5247c4080..000000000 --- a/include/triton/external/CUDA/builtin_types.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 
12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. - */ - -/******************************************************************************* -* * -* * -* * -*******************************************************************************/ - -#include "device_types.h" -#if !defined(__CUDACC_RTC__) -#define EXCLUDE_FROM_RTC -#include "driver_types.h" -#undef EXCLUDE_FROM_RTC -#endif /* !__CUDACC_RTC__ */ -#include "surface_types.h" -#include "texture_types.h" -#include "vector_types.h" diff --git a/include/triton/external/CUDA/channel_descriptor.h b/include/triton/external/CUDA/channel_descriptor.h deleted file mode 100755 index 150f93bde..000000000 --- a/include/triton/external/CUDA/channel_descriptor.h +++ /dev/null @@ -1,412 +0,0 @@ -/* - * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. 
- * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. - */ - -#if !defined(__CHANNEL_DESCRIPTOR_H__) -#define __CHANNEL_DESCRIPTOR_H__ - -#if defined(__cplusplus) - -/******************************************************************************* -* * -* * -* * -*******************************************************************************/ - -#include "driver_types.h" -#include "cuda_runtime_api.h" -#include "host_defines.h" -#include "vector_types.h" - -/******************************************************************************* -* * -* * -* * -*******************************************************************************/ - -/** - * \addtogroup CUDART_HIGHLEVEL - * - * @{ - */ - -/** - * \brief \hl Returns a channel descriptor using the specified format - * - * Returns a channel descriptor with format \p f and number of bits of each - * component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is - * defined as: - * \code - struct cudaChannelFormatDesc { - int x, y, z, w; - enum cudaChannelFormatKind f; - }; - * \endcode - * - * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned, - * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat. - * - * \return - * Channel descriptor with format \p f - * - * \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)", - * ::cudaGetChannelDesc, ::cudaGetTextureReference, - * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (High level)", - * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (High level, inherited channel descriptor)", - * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (High level)", - * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (High level)", - * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (High level, inherited channel descriptor)", - * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (High level)", - * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (High level)" - */ -template __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone); -} - -static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void) -{ - int e = (int)sizeof(unsigned short) * 8; - - return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat); -} - -static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void) -{ - int e = (int)sizeof(unsigned short) * 8; - - return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat); -} - -static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void) -{ - int e = (int)sizeof(unsigned short) * 8; - - return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat); -} - -static __inline__ __host__ cudaChannelFormatDesc 
cudaCreateChannelDescHalf4(void) -{ - int e = (int)sizeof(unsigned short) * 8; - - return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(char) * 8; - -#if defined(_CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__) - return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); -#else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */ - return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); -#endif /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */ -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(signed char) * 8; - - return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(unsigned char) * 8; - - return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(signed char) * 8; - - return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(unsigned char) * 8; - - return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(signed char) * 8; - - return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(unsigned char) * 8; - - return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(signed char) * 8; - - return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(unsigned char) * 8; - - return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(short) * 8; - - return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(unsigned short) * 8; - - return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(short) * 8; - - return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(unsigned short) * 8; - - return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(short) * 8; - - return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(unsigned short) * 8; - - return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) 
-{ - int e = (int)sizeof(short) * 8; - - return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(unsigned short) * 8; - - return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(int) * 8; - - return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(unsigned int) * 8; - - return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(int) * 8; - - return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(unsigned int) * 8; - - return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(int) * 8; - - return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(unsigned int) * 8; - - return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(int) * 8; - - return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(unsigned int) * 8; - - return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned); -} - -#if !defined(__LP64__) - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(long) * 8; - - return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(unsigned long) * 8; - - return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(long) * 8; - - return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(unsigned long) * 8; - - return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(long) * 8; - - return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(unsigned long) * 8; - - return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(long) * 8; - - return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(unsigned long) * 8; - - return cudaCreateChannelDesc(e, 
e, e, e, cudaChannelFormatKindUnsigned); -} - -#endif /* !__LP64__ */ - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(float) * 8; - - return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(float) * 8; - - return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(float) * 8; - - return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat); -} - -template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) -{ - int e = (int)sizeof(float) * 8; - - return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat); -} - -#endif /* __cplusplus */ - -/** @} */ -/** @} */ /* END CUDART_TEXTURE_HL */ - -#endif /* !__CHANNEL_DESCRIPTOR_H__ */ diff --git a/include/triton/external/CUDA/crt/host_config.h b/include/triton/external/CUDA/crt/host_config.h deleted file mode 100644 index 8b023b528..000000000 --- a/include/triton/external/CUDA/crt/host_config.h +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Copyright 1993-2016 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. 
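The cudaCreateChannelDesc<T>() specializations deleted above are consumed by the CUDA runtime's array and texture allocation paths. As a minimal usage sketch against the public CUDA runtime API (illustrative only; the helper name make_float_array is ours, and the code assumes compilation with nvcc):

#include <cuda_runtime.h>

/* Allocate a 2D CUDA array whose element layout comes from the
 * cudaCreateChannelDesc<float>() specialization shown above: a single
 * 32-bit float channel (x = 32, y = z = w = 0, kind = cudaChannelFormatKindFloat). */
cudaError_t make_float_array(cudaArray_t *arr, size_t width, size_t height)
{
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    return cudaMallocArray(arr, &desc, width, height);
}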
- *
- * Any use of the Licensed Deliverables in individual and commercial
- * software must include, in the user documentation and internal
- * comments to the code, the above Disclaimer and U.S. Government End
- * Users Notice.
- */
-
-#if !defined(__HOST_CONFIG_H__)
-#define __HOST_CONFIG_H__
-
-/*******************************************************************************
-* *
-* *
-* *
-*******************************************************************************/
-
-#if defined(__CUDACC__)
-
-#if defined(__CUDACC_RTC__)
-
-#define _CRTIMP
-#define __THROW
-
-#else /* __CUDACC_RTC__ */
-
-/* check for host compilers that are compatible with nvcc */
-#if !defined(__GNUC__) && !defined(_WIN32)
-
-#error --- !!! UNSUPPORTED COMPILER !!! ---
-
-#endif /* !__GNUC__ && !_WIN32 */
-
-#if defined(__ICC)
-
-#if (__ICC != 1500 && __ICC != 1600 && __ICC != 1700) || !defined(__GNUC__) || !defined(__LP64__)
-
-#error -- unsupported ICC configuration! Only ICC 15.0, ICC 16.0, and ICC 17.0 on Linux x86_64 are supported!
-
-#endif /* (__ICC != 1500 && __ICC != 1600 && __ICC != 1700) || !__GNUC__ || !__LP64__ */
-
-#endif /* __ICC */
-
-#if defined(__PGIC__)
-
-#if (!(__PGIC__ == 17) && \
- !(__PGIC__ == 99 && __PGIC_MINOR__ == 99)) || \
- !defined(__GNUC__) || !defined(__LP64__)
-
-#error -- unsupported pgc++ configuration! Only pgc++ 17 on Linux x86_64 is supported!
-
-#endif /* (!(__PGIC__ == 17) &&
- !(__PGIC__ == 99 && __PGIC_MINOR__ == 99 )) ||
- !__GNUC__ || !__LP64__ */
-
-#endif /* __PGIC__ */
-
-#if defined(__powerpc__)
-
-#if !defined(__powerpc64__) || !defined(__LITTLE_ENDIAN__)
-
-#error -- unsupported PPC platform! Only 64-bit little endian PPC is supported!
-
-#endif /* !__powerpc64__ || !__LITTLE_ENDIAN__ */
-
-#if defined(__ibmxl_vrm__) && (__ibmxl_vrm__ < 0x0d010000 || __ibmxl_vrm__ >= 0x0d020000)
-
-#error -- unsupported xlC version! only xlC 13.1 is supported
-
-#endif /* __ibmxl_vrm__ && (__ibmxl_vrm__ < 0x0d010000 || __ibmxl_vrm__ >= 0x0d020000) */
-
-#endif /* __powerpc__ */
-
-#if defined(__GNUC__)
-
-#if __GNUC__ > 6
-
-#error -- unsupported GNU version! gcc versions later than 6 are not supported!
-
-#endif /* __GNUC__ > 6 */
-
-#if defined(__APPLE__) && defined(__MACH__) && !defined(__clang__)
-#error -- clang and clang++ are the only supported host compilers on Mac OS X!
-#endif /* __APPLE__ && __MACH__ && !__clang__ */
-
-#endif /* __GNUC__ */
-
-#if defined(_WIN32)
-
-#if _MSC_VER < 1600 || _MSC_VER > 1911
-
-#error -- unsupported Microsoft Visual Studio version! Only the versions 2012, 2013, 2015 and 2017 are supported!
-
-#elif _MSC_VER == 1600 /* _MSC_VER == 1600 */
-
-#pragma message("support for Microsoft Visual Studio 2010 has been deprecated!")
-
-#endif /* _MSC_VER < 1600 || _MSC_VER > 1911 || _MSC_VER == 1600 */
-
-#endif /* _WIN32 */
-
-/* configure host compiler */
-#if defined(__APPLE__)
-
-#define _CRTIMP
-#define _ACRTIMP
-#define __THROW
-
-#if defined(__BLOCKS__) /* nvcc does not support closures */
-
-#undef __BLOCKS__
-
-#endif /* __BLOCKS__ */
-
-#elif defined(__ANDROID__)
-
-#define _CRTIMP
-#define _ACRTIMP
-#define __THROW
-
-#elif defined(__QNX__)
-
-#define _CRTIMP
-#define _ACRTIMP
-#define __THROW
-
-#elif defined(__HORIZON__)
-
-#define _CRTIMP
-#define _ACRTIMP
-#define __THROW
-
-#elif defined(__GNUC__)
-
-#define _CRTIMP
-#define _ACRTIMP
-
-#include <features.h> /* for __THROW */
-
-#elif defined(_WIN32)
-
-#if _MSC_VER >= 1500
-
-#undef _USE_DECLSPECS_FOR_SAL
-#define _USE_DECLSPECS_FOR_SAL \
- 1
-
-#endif /* _MSC_VER >= 1500 */
-
-#if !defined(_CRT_NONSTDC_NO_WARNINGS)
-
-#define _CRT_NONSTDC_NO_WARNINGS /* to suppress warnings */
-
-#endif /* !_CRT_NONSTDC_NO_WARNINGS */
-
-#if !defined(_CRT_SECURE_NO_WARNINGS)
-
-#define _CRT_SECURE_NO_WARNINGS /* to suppress warnings */
-
-#endif /* !_CRT_SECURE_NO_WARNINGS */
-
-#if !defined(NOMINMAX)
-
-#define NOMINMAX /* min and max are part of cuda runtime */
-
-#endif /* !NOMINMAX */
-
-#include <crtdefs.h> /* for _CRTIMP */
-#if _MSC_VER >= 1900
-#include <corecrt.h> /* for _ACRTIMP */
-#endif /* _MSC_VER >= 1900 */
-
-#define __THROW
-
-#endif /* __APPLE__ */
-
-#endif /* __CUDACC_RTC__ */
-
-
-#if defined(__cplusplus) && defined(__CUDA_ARCH__) && (defined(__PGIC__) || defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER)))
-
-#if __CUDACC_RTC__
-typedef char *va_list;
-#else /* !__CUDACC_RTC__ */
-#include <stdarg.h>
-#endif /* __CUDACC_RTC__ */
-
-
-#undef va_start
-#undef va_end
-#undef va_arg
-
-#ifdef __PGIC__
-
-#undef __builtin_va_end
-
-#define va_start(v,l) __builtin_alt_va_start(v,l)
-#define va_end(v) __builtin_va_end(v)
-#define va_arg(v,l) __builtin_alt_va_arg(v,l)
-
-#if (__cplusplus >= 201103L)
-#undef va_copy
-#define va_copy(d,s) __builtin_va_copy(d,s)
-#endif
-
-#else /* !__PGIC__ */
-
-
-#define va_start(ap, x) (__cu_va_start(&ap, x))
-#define va_end(ap) (__cu_va_end(&ap))
-#define va_arg(ap, t) (*((t *)__cu_va_arg(&ap, (t *)0)))
-
-#if (_MSC_VER >= 1800) || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L))
-#undef va_copy
-#define va_copy(apd, aps) (__cu_va_copy(&(apd), &(aps)))
-#endif /* (_MSC_VER >= 1800) || (defined(__CUDACC_RTC__) && (__cplusplus >= 201103L)) */
-#endif /* __PGIC__ */
-
-#endif /* defined(__cplusplus) && (defined(__CUDACC_RTC__) || (defined(_WIN32) && defined(_MSC_VER))) */
-
-
-
-#endif /* __CUDACC__ */
-
-#endif /* !__HOST_CONFIG_H__ */
diff --git a/include/triton/external/CUDA/crt/host_defines.h b/include/triton/external/CUDA/crt/host_defines.h
deleted file mode 100644
index 556d2e5e1..000000000
--- a/include/triton/external/CUDA/crt/host_defines.h
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- * Copyright 1993-2017 NVIDIA Corporation.  All rights reserved.
- *
- * NOTICE TO LICENSEE:
- *
- * This source code and/or documentation ("Licensed Deliverables") are
- * subject to NVIDIA intellectual property rights under U.S. and
- * international Copyright laws.
- * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. - */ - -#if !defined(__HOST_DEFINES_H__) -#define __HOST_DEFINES_H__ - -/* CUDA JIT mode (__CUDACC_RTC__) also uses GNU style attributes */ -#if defined(__GNUC__) || defined(__CUDA_LIBDEVICE__) || defined(__CUDACC_RTC__) - -#if defined(__CUDACC_RTC__) -#define __volatile__ volatile -#endif /* __CUDACC_RTC__ */ - -#define __no_return__ \ - __attribute__((noreturn)) - -#if defined(__CUDACC__) || defined(__CUDA_ARCH__) || defined(__CUDA_LIBDEVICE__) -/* gcc allows users to define attributes with underscores, - e.g., __attribute__((__noinline__)). - Consider a non-CUDA source file (e.g. .cpp) that has the - above attribute specification, and includes this header file. In that case, - defining __noinline__ as below would cause a gcc compilation error. - Hence, only define __noinline__ when the code is being processed - by a CUDA compiler component. 
-*/ -#define __noinline__ \ - __attribute__((noinline)) -#endif /* __CUDACC__ || __CUDA_ARCH__ || __CUDA_LIBDEVICE__ */ - -#define __forceinline__ \ - __inline__ __attribute__((always_inline)) -#define __align__(n) \ - __attribute__((aligned(n))) -#define __thread__ \ - __thread -#define __import__ -#define __export__ -#define __cdecl -#define __annotate__(a) \ - __attribute__((a)) -#define __location__(a) \ - __annotate__(a) -#define CUDARTAPI - -#elif defined(_MSC_VER) - -#if _MSC_VER >= 1400 - -#define __restrict__ \ - __restrict - -#else /* _MSC_VER >= 1400 */ - -#define __restrict__ - -#endif /* _MSC_VER >= 1400 */ - -#define __inline__ \ - __inline -#define __no_return__ \ - __declspec(noreturn) -#define __noinline__ \ - __declspec(noinline) -#define __forceinline__ \ - __forceinline -#define __align__(n) \ - __declspec(align(n)) -#define __thread__ \ - __declspec(thread) -#define __import__ \ - __declspec(dllimport) -#define __export__ \ - __declspec(dllexport) -#define __annotate__(a) \ - __declspec(a) -#define __location__(a) \ - __annotate__(__##a##__) -#define CUDARTAPI \ - __stdcall - -#else /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */ - -#define __inline__ - -#if !defined(__align__) - -#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for '__align__' !!! --- - -#endif /* !__align__ */ - -#if !defined(CUDARTAPI) - -#error --- !!! UNKNOWN COMPILER: please provide a CUDA compatible definition for 'CUDARTAPI' !!! --- - -#endif /* !CUDARTAPI */ - -#endif /* __GNUC__ || __CUDA_LIBDEVICE__ || __CUDACC_RTC__ */ - -#if (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !defined(__clang__)))) || \ - (defined(_MSC_VER) && _MSC_VER < 1900) || \ - (!defined(__GNUC__) && !defined(_MSC_VER)) - -#define __specialization_static \ - static - -#else /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) || - (_MSC_VER && _MSC_VER < 1900) || - (!__GNUC__ && !_MSC_VER) */ - -#define __specialization_static - -#endif /* (__GNUC__ && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 3 && !__clang__))) || - (_MSC_VER && _MSC_VER < 1900) || - (!__GNUC__ && !_MSC_VER) */ - -#if !defined(__CUDACC__) && !defined(__CUDA_LIBDEVICE__) - -#undef __annotate__ -#define __annotate__(a) - -#else /* !__CUDACC__ && !__CUDA_LIBDEVICE__ */ - -#define __launch_bounds__(...) 
\ - __annotate__(launch_bounds(__VA_ARGS__)) - -#endif /* !__CUDACC__ && !__CUDA_LIBDEVICE__ */ - -#if defined(__CUDACC__) || defined(__CUDA_LIBDEVICE__) || \ - defined(__GNUC__) || defined(_WIN64) - -#define __builtin_align__(a) \ - __align__(a) - -#else /* __CUDACC__ || __CUDA_LIBDEVICE__ || __GNUC__ || _WIN64 */ - -#define __builtin_align__(a) - -#endif /* __CUDACC__ || __CUDA_LIBDEVICE__ || __GNUC__ || _WIN64 */ - -#define __host__ \ - __location__(host) -#define __device__ \ - __location__(device) -#define __global__ \ - __location__(global) -#define __shared__ \ - __location__(shared) -#define __constant__ \ - __location__(constant) -#define __managed__ \ - __location__(managed) - -#if !defined(__CUDACC__) -#define __device_builtin__ -#define __device_builtin_texture_type__ -#define __device_builtin_surface_type__ -#define __cudart_builtin__ -#else /* defined(__CUDACC__) */ -#define __device_builtin__ \ - __location__(device_builtin) -#define __device_builtin_texture_type__ \ - __location__(device_builtin_texture_type) -#define __device_builtin_surface_type__ \ - __location__(device_builtin_surface_type) -#define __cudart_builtin__ \ - __location__(cudart_builtin) -#endif /* !defined(__CUDACC__) */ - - -#endif /* !__HOST_DEFINES_H__ */ diff --git a/include/triton/external/CUDA/cuComplex.h b/include/triton/external/CUDA/cuComplex.h deleted file mode 100755 index 78bc90353..000000000 --- a/include/triton/external/CUDA/cuComplex.h +++ /dev/null @@ -1,338 +0,0 @@ -/* - * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. 
Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. - */ - -#if !defined(CU_COMPLEX_H_) -#define CU_COMPLEX_H_ - -/* When trying to include C header file in C++ Code extern "C" is required - * But the Standard QNX headers already have ifdef extern in them when compiling C++ Code - * extern "C" cannot be nested - * Hence keep the header out of extern "C" block - */ - -#include /* import fabsf, sqrt */ - -#if defined(__cplusplus) -extern "C" { -#endif /* __cplusplus */ - -#include "vector_types.h" - -typedef float2 cuFloatComplex; - -__host__ __device__ static __inline__ float cuCrealf (cuFloatComplex x) -{ - return x.x; -} - -__host__ __device__ static __inline__ float cuCimagf (cuFloatComplex x) -{ - return x.y; -} - -__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex - (float r, float i) -{ - cuFloatComplex res; - res.x = r; - res.y = i; - return res; -} - -__host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x) -{ - return make_cuFloatComplex (cuCrealf(x), -cuCimagf(x)); -} -__host__ __device__ static __inline__ cuFloatComplex cuCaddf (cuFloatComplex x, - cuFloatComplex y) -{ - return make_cuFloatComplex (cuCrealf(x) + cuCrealf(y), - cuCimagf(x) + cuCimagf(y)); -} - -__host__ __device__ static __inline__ cuFloatComplex cuCsubf (cuFloatComplex x, - cuFloatComplex y) -{ - return make_cuFloatComplex (cuCrealf(x) - cuCrealf(y), - cuCimagf(x) - cuCimagf(y)); -} - -/* This implementation could suffer from intermediate overflow even though - * the final result would be in range. However, various implementations do - * not guard against this (presumably to avoid losing performance), so we - * don't do it either to stay competitive. - */ -__host__ __device__ static __inline__ cuFloatComplex cuCmulf (cuFloatComplex x, - cuFloatComplex y) -{ - cuFloatComplex prod; - prod = make_cuFloatComplex ((cuCrealf(x) * cuCrealf(y)) - - (cuCimagf(x) * cuCimagf(y)), - (cuCrealf(x) * cuCimagf(y)) + - (cuCimagf(x) * cuCrealf(y))); - return prod; -} - -/* This implementation guards against intermediate underflow and overflow - * by scaling. Such guarded implementations are usually the default for - * complex library implementations, with some also offering an unguarded, - * faster version. - */ -__host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComplex x, - cuFloatComplex y) -{ - cuFloatComplex quot; - float s = fabsf(cuCrealf(y)) + fabsf(cuCimagf(y)); - float oos = 1.0f / s; - float ars = cuCrealf(x) * oos; - float ais = cuCimagf(x) * oos; - float brs = cuCrealf(y) * oos; - float bis = cuCimagf(y) * oos; - s = (brs * brs) + (bis * bis); - oos = 1.0f / s; - quot = make_cuFloatComplex (((ars * brs) + (ais * bis)) * oos, - ((ais * brs) - (ars * bis)) * oos); - return quot; -} - -/* - * We would like to call hypotf(), but it's not available on all platforms. - * This discrete implementation guards against intermediate underflow and - * overflow by scaling. Otherwise we would lose half the exponent range. - * There are various ways of doing guarded computation. 
For now chose the - * simplest and fastest solution, however this may suffer from inaccuracies - * if sqrt and division are not IEEE compliant. - */ -__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x) -{ - float a = cuCrealf(x); - float b = cuCimagf(x); - float v, w, t; - a = fabsf(a); - b = fabsf(b); - if (a > b) { - v = a; - w = b; - } else { - v = b; - w = a; - } - t = w / v; - t = 1.0f + t * t; - t = v * sqrtf(t); - if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) { - t = v + w; - } - return t; -} - -/* Double precision */ -typedef double2 cuDoubleComplex; - -__host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x) -{ - return x.x; -} - -__host__ __device__ static __inline__ double cuCimag (cuDoubleComplex x) -{ - return x.y; -} - -__host__ __device__ static __inline__ cuDoubleComplex make_cuDoubleComplex - (double r, double i) -{ - cuDoubleComplex res; - res.x = r; - res.y = i; - return res; -} - -__host__ __device__ static __inline__ cuDoubleComplex cuConj(cuDoubleComplex x) -{ - return make_cuDoubleComplex (cuCreal(x), -cuCimag(x)); -} - -__host__ __device__ static __inline__ cuDoubleComplex cuCadd(cuDoubleComplex x, - cuDoubleComplex y) -{ - return make_cuDoubleComplex (cuCreal(x) + cuCreal(y), - cuCimag(x) + cuCimag(y)); -} - -__host__ __device__ static __inline__ cuDoubleComplex cuCsub(cuDoubleComplex x, - cuDoubleComplex y) -{ - return make_cuDoubleComplex (cuCreal(x) - cuCreal(y), - cuCimag(x) - cuCimag(y)); -} - -/* This implementation could suffer from intermediate overflow even though - * the final result would be in range. However, various implementations do - * not guard against this (presumably to avoid losing performance), so we - * don't do it either to stay competitive. - */ -__host__ __device__ static __inline__ cuDoubleComplex cuCmul(cuDoubleComplex x, - cuDoubleComplex y) -{ - cuDoubleComplex prod; - prod = make_cuDoubleComplex ((cuCreal(x) * cuCreal(y)) - - (cuCimag(x) * cuCimag(y)), - (cuCreal(x) * cuCimag(y)) + - (cuCimag(x) * cuCreal(y))); - return prod; -} - -/* This implementation guards against intermediate underflow and overflow - * by scaling. Such guarded implementations are usually the default for - * complex library implementations, with some also offering an unguarded, - * faster version. - */ -__host__ __device__ static __inline__ cuDoubleComplex cuCdiv(cuDoubleComplex x, - cuDoubleComplex y) -{ - cuDoubleComplex quot; - double s = (fabs(cuCreal(y))) + (fabs(cuCimag(y))); - double oos = 1.0 / s; - double ars = cuCreal(x) * oos; - double ais = cuCimag(x) * oos; - double brs = cuCreal(y) * oos; - double bis = cuCimag(y) * oos; - s = (brs * brs) + (bis * bis); - oos = 1.0 / s; - quot = make_cuDoubleComplex (((ars * brs) + (ais * bis)) * oos, - ((ais * brs) - (ars * bis)) * oos); - return quot; -} - -/* This implementation guards against intermediate underflow and overflow - * by scaling. Otherwise we would lose half the exponent range. There are - * various ways of doing guarded computation. For now chose the simplest - * and fastest solution, however this may suffer from inaccuracies if sqrt - * and division are not IEEE compliant. 
- */ -__host__ __device__ static __inline__ double cuCabs (cuDoubleComplex x) -{ - double a = cuCreal(x); - double b = cuCimag(x); - double v, w, t; - a = fabs(a); - b = fabs(b); - if (a > b) { - v = a; - w = b; - } else { - v = b; - w = a; - } - t = w / v; - t = 1.0 + t * t; - t = v * sqrt(t); - if ((v == 0.0) || - (v > 1.79769313486231570e+308) || (w > 1.79769313486231570e+308)) { - t = v + w; - } - return t; -} - -#if defined(__cplusplus) -} -#endif /* __cplusplus */ - -/* aliases */ -typedef cuFloatComplex cuComplex; -__host__ __device__ static __inline__ cuComplex make_cuComplex (float x, - float y) -{ - return make_cuFloatComplex (x, y); -} - -/* float-to-double promotion */ -__host__ __device__ static __inline__ cuDoubleComplex cuComplexFloatToDouble - (cuFloatComplex c) -{ - return make_cuDoubleComplex ((double)cuCrealf(c), (double)cuCimagf(c)); -} - -__host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat -(cuDoubleComplex c) -{ - return make_cuFloatComplex ((float)cuCreal(c), (float)cuCimag(c)); -} - - -__host__ __device__ static __inline__ cuComplex cuCfmaf( cuComplex x, cuComplex y, cuComplex d) -{ - float real_res; - float imag_res; - - real_res = (cuCrealf(x) * cuCrealf(y)) + cuCrealf(d); - imag_res = (cuCrealf(x) * cuCimagf(y)) + cuCimagf(d); - - real_res = -(cuCimagf(x) * cuCimagf(y)) + real_res; - imag_res = (cuCimagf(x) * cuCrealf(y)) + imag_res; - - return make_cuComplex(real_res, imag_res); -} - -__host__ __device__ static __inline__ cuDoubleComplex cuCfma( cuDoubleComplex x, cuDoubleComplex y, cuDoubleComplex d) -{ - double real_res; - double imag_res; - - real_res = (cuCreal(x) * cuCreal(y)) + cuCreal(d); - imag_res = (cuCreal(x) * cuCimag(y)) + cuCimag(d); - - real_res = -(cuCimag(x) * cuCimag(y)) + real_res; - imag_res = (cuCimag(x) * cuCreal(y)) + imag_res; - - return make_cuDoubleComplex(real_res, imag_res); -} - -#endif /* !defined(CU_COMPLEX_H_) */ diff --git a/include/triton/external/CUDA/cublas.h b/include/triton/external/CUDA/cublas.h deleted file mode 100755 index 34521c06c..000000000 --- a/include/triton/external/CUDA/cublas.h +++ /dev/null @@ -1,565 +0,0 @@ -/* - * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
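The comments above describe the guarded (scaled) formulations used by cuCdivf and cuCabsf. The following self-contained C illustration, ours rather than NVIDIA's, shows the naive magnitude overflowing where the scaled one survives:

#include <stdio.h>
#include <math.h>

/* Naive |z| = sqrt(a*a + b*b): a*a already overflows float for a ~ 1e30. */
static float naive_abs(float a, float b)
{
    return sqrtf(a * a + b * b);
}

/* Scaled |z| in the spirit of cuCabsf above: factor out the larger
 * magnitude so the intermediate t = (w/v)^2 stays near 1. */
static float scaled_abs(float a, float b)
{
    float v = fabsf(a), w = fabsf(b), t;
    if (v < w) { t = v; v = w; w = t; }   /* ensure v >= w */
    if (v == 0.0f) return 0.0f;
    t = w / v;
    return v * sqrtf(1.0f + t * t);
}

int main(void)
{
    printf("naive:  %g\n", naive_abs(3e30f, 4e30f));  /* inf  */
    printf("scaled: %g\n", scaled_abs(3e30f, 4e30f)); /* 5e30 */
    return 0;
}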
- * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. - */ - -/* - * This is the public header file for the CUBLAS library, defining the API - * - * CUBLAS is an implementation of BLAS (Basic Linear Algebra Subroutines) - * on top of the CUDA runtime. - */ - -#if !defined(CUBLAS_H_) -#define CUBLAS_H_ - -#include - -#ifndef CUBLASWINAPI -#ifdef _WIN32 -#define CUBLASWINAPI __stdcall -#else -#define CUBLASWINAPI -#endif -#endif - -#undef CUBLASAPI -#ifdef __CUDACC__ -#define CUBLASAPI __host__ -#else -#define CUBLASAPI -#endif - -#include "cublas_api.h" - -#if defined(__cplusplus) -extern "C" { -#endif - -/* CUBLAS data types */ -#define cublasStatus cublasStatus_t - -cublasStatus CUBLASWINAPI cublasInit (void); -cublasStatus CUBLASWINAPI cublasShutdown (void); -cublasStatus CUBLASWINAPI cublasGetError (void); - -cublasStatus CUBLASWINAPI cublasGetVersion(int *version); -cublasStatus CUBLASWINAPI cublasAlloc (int n, int elemSize, void **devicePtr); - -cublasStatus CUBLASWINAPI cublasFree (void *devicePtr); - - -cublasStatus CUBLASWINAPI cublasSetKernelStream (cudaStream_t stream); - - - -/* ---------------- CUBLAS BLAS1 functions ---------------- */ -/* NRM2 */ -float CUBLASWINAPI cublasSnrm2 (int n, const float *x, int incx); -double CUBLASWINAPI cublasDnrm2 (int n, const double *x, int incx); -float CUBLASWINAPI cublasScnrm2 (int n, const cuComplex *x, int incx); -double CUBLASWINAPI cublasDznrm2 (int n, const cuDoubleComplex *x, int incx); -/*------------------------------------------------------------------------*/ -/* DOT */ -float CUBLASWINAPI cublasSdot (int n, const float *x, int incx, const float *y, - int incy); -double CUBLASWINAPI cublasDdot (int n, const double *x, int incx, const double *y, - int incy); -cuComplex CUBLASWINAPI cublasCdotu (int n, const cuComplex *x, int incx, const cuComplex *y, - int incy); -cuComplex CUBLASWINAPI cublasCdotc (int n, const cuComplex *x, int incx, const cuComplex *y, - int incy); -cuDoubleComplex CUBLASWINAPI cublasZdotu (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, - int incy); -cuDoubleComplex CUBLASWINAPI cublasZdotc (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, - int incy); -/*------------------------------------------------------------------------*/ -/* SCAL */ -void CUBLASWINAPI cublasSscal (int n, float 
alpha, float *x, int incx); -void CUBLASWINAPI cublasDscal (int n, double alpha, double *x, int incx); -void CUBLASWINAPI cublasCscal (int n, cuComplex alpha, cuComplex *x, int incx); -void CUBLASWINAPI cublasZscal (int n, cuDoubleComplex alpha, cuDoubleComplex *x, int incx); - -void CUBLASWINAPI cublasCsscal (int n, float alpha, cuComplex *x, int incx); -void CUBLASWINAPI cublasZdscal (int n, double alpha, cuDoubleComplex *x, int incx); -/*------------------------------------------------------------------------*/ -/* AXPY */ -void CUBLASWINAPI cublasSaxpy (int n, float alpha, const float *x, int incx, - float *y, int incy); -void CUBLASWINAPI cublasDaxpy (int n, double alpha, const double *x, - int incx, double *y, int incy); -void CUBLASWINAPI cublasCaxpy (int n, cuComplex alpha, const cuComplex *x, - int incx, cuComplex *y, int incy); -void CUBLASWINAPI cublasZaxpy (int n, cuDoubleComplex alpha, const cuDoubleComplex *x, - int incx, cuDoubleComplex *y, int incy); -/*------------------------------------------------------------------------*/ -/* COPY */ -void CUBLASWINAPI cublasScopy (int n, const float *x, int incx, float *y, - int incy); -void CUBLASWINAPI cublasDcopy (int n, const double *x, int incx, double *y, - int incy); -void CUBLASWINAPI cublasCcopy (int n, const cuComplex *x, int incx, cuComplex *y, - int incy); -void CUBLASWINAPI cublasZcopy (int n, const cuDoubleComplex *x, int incx, cuDoubleComplex *y, - int incy); -/*------------------------------------------------------------------------*/ -/* SWAP */ -void CUBLASWINAPI cublasSswap (int n, float *x, int incx, float *y, int incy); -void CUBLASWINAPI cublasDswap (int n, double *x, int incx, double *y, int incy); -void CUBLASWINAPI cublasCswap (int n, cuComplex *x, int incx, cuComplex *y, int incy); -void CUBLASWINAPI cublasZswap (int n, cuDoubleComplex *x, int incx, cuDoubleComplex *y, int incy); -/*------------------------------------------------------------------------*/ -/* AMAX */ -int CUBLASWINAPI cublasIsamax (int n, const float *x, int incx); -int CUBLASWINAPI cublasIdamax (int n, const double *x, int incx); -int CUBLASWINAPI cublasIcamax (int n, const cuComplex *x, int incx); -int CUBLASWINAPI cublasIzamax (int n, const cuDoubleComplex *x, int incx); -/*------------------------------------------------------------------------*/ -/* AMIN */ -int CUBLASWINAPI cublasIsamin (int n, const float *x, int incx); -int CUBLASWINAPI cublasIdamin (int n, const double *x, int incx); - -int CUBLASWINAPI cublasIcamin (int n, const cuComplex *x, int incx); -int CUBLASWINAPI cublasIzamin (int n, const cuDoubleComplex *x, int incx); -/*------------------------------------------------------------------------*/ -/* ASUM */ -float CUBLASWINAPI cublasSasum (int n, const float *x, int incx); -double CUBLASWINAPI cublasDasum (int n, const double *x, int incx); -float CUBLASWINAPI cublasScasum (int n, const cuComplex *x, int incx); -double CUBLASWINAPI cublasDzasum (int n, const cuDoubleComplex *x, int incx); -/*------------------------------------------------------------------------*/ -/* ROT */ -void CUBLASWINAPI cublasSrot (int n, float *x, int incx, float *y, int incy, - float sc, float ss); -void CUBLASWINAPI cublasDrot (int n, double *x, int incx, double *y, int incy, - double sc, double ss); -void CUBLASWINAPI cublasCrot (int n, cuComplex *x, int incx, cuComplex *y, - int incy, float c, cuComplex s); -void CUBLASWINAPI cublasZrot (int n, cuDoubleComplex *x, int incx, - cuDoubleComplex *y, int incy, double sc, - cuDoubleComplex cs); 
-void CUBLASWINAPI cublasCsrot (int n, cuComplex *x, int incx, cuComplex *y, - int incy, float c, float s); -void CUBLASWINAPI cublasZdrot (int n, cuDoubleComplex *x, int incx, - cuDoubleComplex *y, int incy, double c, double s); -/*------------------------------------------------------------------------*/ -/* ROTG */ -void CUBLASWINAPI cublasSrotg (float *sa, float *sb, float *sc, float *ss); -void CUBLASWINAPI cublasDrotg (double *sa, double *sb, double *sc, double *ss); -void CUBLASWINAPI cublasCrotg (cuComplex *ca, cuComplex cb, float *sc, - cuComplex *cs); -void CUBLASWINAPI cublasZrotg (cuDoubleComplex *ca, cuDoubleComplex cb, double *sc, - cuDoubleComplex *cs); -/*------------------------------------------------------------------------*/ -/* ROTM */ -void CUBLASWINAPI cublasSrotm(int n, float *x, int incx, float *y, int incy, - const float* sparam); -void CUBLASWINAPI cublasDrotm(int n, double *x, int incx, double *y, int incy, - const double* sparam); -/*------------------------------------------------------------------------*/ -/* ROTMG */ -void CUBLASWINAPI cublasSrotmg (float *sd1, float *sd2, float *sx1, - const float *sy1, float* sparam); -void CUBLASWINAPI cublasDrotmg (double *sd1, double *sd2, double *sx1, - const double *sy1, double* sparam); - -/* --------------- CUBLAS BLAS2 functions ---------------- */ -/* GEMV */ -void CUBLASWINAPI cublasSgemv (char trans, int m, int n, float alpha, - const float *A, int lda, const float *x, int incx, - float beta, float *y, int incy); -void CUBLASWINAPI cublasDgemv (char trans, int m, int n, double alpha, - const double *A, int lda, const double *x, int incx, - double beta, double *y, int incy); -void CUBLASWINAPI cublasCgemv (char trans, int m, int n, cuComplex alpha, - const cuComplex *A, int lda, const cuComplex *x, int incx, - cuComplex beta, cuComplex *y, int incy); -void CUBLASWINAPI cublasZgemv (char trans, int m, int n, cuDoubleComplex alpha, - const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, - cuDoubleComplex beta, cuDoubleComplex *y, int incy); -/*------------------------------------------------------------------------*/ -/* GBMV */ -void CUBLASWINAPI cublasSgbmv (char trans, int m, int n, int kl, int ku, - float alpha, const float *A, int lda, - const float *x, int incx, float beta, float *y, - int incy); -void CUBLASWINAPI cublasDgbmv (char trans, int m, int n, int kl, int ku, - double alpha, const double *A, int lda, - const double *x, int incx, double beta, double *y, - int incy); -void CUBLASWINAPI cublasCgbmv (char trans, int m, int n, int kl, int ku, - cuComplex alpha, const cuComplex *A, int lda, - const cuComplex *x, int incx, cuComplex beta, cuComplex *y, - int incy); -void CUBLASWINAPI cublasZgbmv (char trans, int m, int n, int kl, int ku, - cuDoubleComplex alpha, const cuDoubleComplex *A, int lda, - const cuDoubleComplex *x, int incx, cuDoubleComplex beta, cuDoubleComplex *y, - int incy); -/*------------------------------------------------------------------------*/ -/* TRMV */ -void CUBLASWINAPI cublasStrmv (char uplo, char trans, char diag, int n, - const float *A, int lda, float *x, int incx); -void CUBLASWINAPI cublasDtrmv (char uplo, char trans, char diag, int n, - const double *A, int lda, double *x, int incx); -void CUBLASWINAPI cublasCtrmv (char uplo, char trans, char diag, int n, - const cuComplex *A, int lda, cuComplex *x, int incx); -void CUBLASWINAPI cublasZtrmv (char uplo, char trans, char diag, int n, - const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx); 
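For readers unfamiliar with this legacy, context-free cuBLAS API, a hedged usage sketch follows; error checking is elided, and cublasSetVector/cublasGetVector come from the companion cublas_api.h whose removal begins further below, not from the excerpt above:

#include <stdio.h>
#include <cublas.h>   /* the legacy header being removed above */

int main(void)
{
    float hx[4] = {1, 2, 3, 4}, hy[4] = {10, 20, 30, 40};
    float *dx, *dy;

    cublasInit();                                  /* legacy global context */
    cublasAlloc(4, sizeof(float), (void **)&dx);
    cublasAlloc(4, sizeof(float), (void **)&dy);
    cublasSetVector(4, sizeof(float), hx, 1, dx, 1);
    cublasSetVector(4, sizeof(float), hy, 1, dy, 1);

    cublasSaxpy(4, 2.0f, dx, 1, dy, 1);            /* y = 2*x + y */

    cublasGetVector(4, sizeof(float), dy, 1, hy, 1);
    printf("%g %g %g %g\n", hy[0], hy[1], hy[2], hy[3]); /* 12 24 36 48 */

    cublasFree(dx);
    cublasFree(dy);
    cublasShutdown();
    return 0;
}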
-/*------------------------------------------------------------------------*/ -/* TBMV */ -void CUBLASWINAPI cublasStbmv (char uplo, char trans, char diag, int n, int k, - const float *A, int lda, float *x, int incx); -void CUBLASWINAPI cublasDtbmv (char uplo, char trans, char diag, int n, int k, - const double *A, int lda, double *x, int incx); -void CUBLASWINAPI cublasCtbmv (char uplo, char trans, char diag, int n, int k, - const cuComplex *A, int lda, cuComplex *x, int incx); -void CUBLASWINAPI cublasZtbmv (char uplo, char trans, char diag, int n, int k, - const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx); -/*------------------------------------------------------------------------*/ -/* TPMV */ -void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, const float *AP, float *x, int incx); - -void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx); - -void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx); - -void CUBLASWINAPI cublasZtpmv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP, cuDoubleComplex *x, int incx); -/*------------------------------------------------------------------------*/ -/* TRSV */ -void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, const float *A, int lda, float *x, int incx); - -void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, const double *A, int lda, double *x, int incx); - -void CUBLASWINAPI cublasCtrsv(char uplo, char trans, char diag, int n, const cuComplex *A, int lda, cuComplex *x, int incx); - -void CUBLASWINAPI cublasZtrsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *A, int lda, - cuDoubleComplex *x, int incx); -/*------------------------------------------------------------------------*/ -/* TPSV */ -void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, const float *AP, - float *x, int incx); - -void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx); - -void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx); - -void CUBLASWINAPI cublasZtpsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP, - cuDoubleComplex *x, int incx); -/*------------------------------------------------------------------------*/ -/* TBSV */ -void CUBLASWINAPI cublasStbsv(char uplo, char trans, - char diag, int n, int k, const float *A, - int lda, float *x, int incx); - -void CUBLASWINAPI cublasDtbsv(char uplo, char trans, - char diag, int n, int k, const double *A, - int lda, double *x, int incx); -void CUBLASWINAPI cublasCtbsv(char uplo, char trans, - char diag, int n, int k, const cuComplex *A, - int lda, cuComplex *x, int incx); - -void CUBLASWINAPI cublasZtbsv(char uplo, char trans, - char diag, int n, int k, const cuDoubleComplex *A, - int lda, cuDoubleComplex *x, int incx); -/*------------------------------------------------------------------------*/ -/* SYMV/HEMV */ -void CUBLASWINAPI cublasSsymv (char uplo, int n, float alpha, const float *A, - int lda, const float *x, int incx, float beta, - float *y, int incy); -void CUBLASWINAPI cublasDsymv (char uplo, int n, double alpha, const double *A, - int lda, const double *x, int incx, double beta, - double *y, int incy); -void CUBLASWINAPI cublasChemv (char uplo, int n, cuComplex alpha, const cuComplex *A, - int lda, const cuComplex *x, int incx, cuComplex beta, 
- cuComplex *y, int incy); -void CUBLASWINAPI cublasZhemv (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *A, - int lda, const cuDoubleComplex *x, int incx, cuDoubleComplex beta, - cuDoubleComplex *y, int incy); -/*------------------------------------------------------------------------*/ -/* SBMV/HBMV */ -void CUBLASWINAPI cublasSsbmv (char uplo, int n, int k, float alpha, - const float *A, int lda, const float *x, int incx, - float beta, float *y, int incy); -void CUBLASWINAPI cublasDsbmv (char uplo, int n, int k, double alpha, - const double *A, int lda, const double *x, int incx, - double beta, double *y, int incy); -void CUBLASWINAPI cublasChbmv (char uplo, int n, int k, cuComplex alpha, - const cuComplex *A, int lda, const cuComplex *x, int incx, - cuComplex beta, cuComplex *y, int incy); -void CUBLASWINAPI cublasZhbmv (char uplo, int n, int k, cuDoubleComplex alpha, - const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, - cuDoubleComplex beta, cuDoubleComplex *y, int incy); -/*------------------------------------------------------------------------*/ -/* SPMV/HPMV */ -void CUBLASWINAPI cublasSspmv(char uplo, int n, float alpha, - const float *AP, const float *x, - int incx, float beta, float *y, int incy); -void CUBLASWINAPI cublasDspmv(char uplo, int n, double alpha, - const double *AP, const double *x, - int incx, double beta, double *y, int incy); -void CUBLASWINAPI cublasChpmv(char uplo, int n, cuComplex alpha, - const cuComplex *AP, const cuComplex *x, - int incx, cuComplex beta, cuComplex *y, int incy); -void CUBLASWINAPI cublasZhpmv(char uplo, int n, cuDoubleComplex alpha, - const cuDoubleComplex *AP, const cuDoubleComplex *x, - int incx, cuDoubleComplex beta, cuDoubleComplex *y, int incy); - -/*------------------------------------------------------------------------*/ -/* GER */ -void CUBLASWINAPI cublasSger (int m, int n, float alpha, const float *x, int incx, - const float *y, int incy, float *A, int lda); -void CUBLASWINAPI cublasDger (int m, int n, double alpha, const double *x, int incx, - const double *y, int incy, double *A, int lda); - -void CUBLASWINAPI cublasCgeru (int m, int n, cuComplex alpha, const cuComplex *x, - int incx, const cuComplex *y, int incy, - cuComplex *A, int lda); -void CUBLASWINAPI cublasCgerc (int m, int n, cuComplex alpha, const cuComplex *x, - int incx, const cuComplex *y, int incy, - cuComplex *A, int lda); -void CUBLASWINAPI cublasZgeru (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x, - int incx, const cuDoubleComplex *y, int incy, - cuDoubleComplex *A, int lda); -void CUBLASWINAPI cublasZgerc (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x, - int incx, const cuDoubleComplex *y, int incy, - cuDoubleComplex *A, int lda); -/*------------------------------------------------------------------------*/ -/* SYR/HER */ -void CUBLASWINAPI cublasSsyr (char uplo, int n, float alpha, const float *x, - int incx, float *A, int lda); -void CUBLASWINAPI cublasDsyr (char uplo, int n, double alpha, const double *x, - int incx, double *A, int lda); - -void CUBLASWINAPI cublasCher (char uplo, int n, float alpha, - const cuComplex *x, int incx, cuComplex *A, int lda); -void CUBLASWINAPI cublasZher (char uplo, int n, double alpha, - const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda); - -/*------------------------------------------------------------------------*/ -/* SPR/HPR */ -void CUBLASWINAPI cublasSspr (char uplo, int n, float alpha, const float *x, - int incx, float *AP); -void 
CUBLASWINAPI cublasDspr (char uplo, int n, double alpha, const double *x, - int incx, double *AP); -void CUBLASWINAPI cublasChpr (char uplo, int n, float alpha, const cuComplex *x, - int incx, cuComplex *AP); -void CUBLASWINAPI cublasZhpr (char uplo, int n, double alpha, const cuDoubleComplex *x, - int incx, cuDoubleComplex *AP); -/*------------------------------------------------------------------------*/ -/* SYR2/HER2 */ -void CUBLASWINAPI cublasSsyr2 (char uplo, int n, float alpha, const float *x, - int incx, const float *y, int incy, float *A, - int lda); -void CUBLASWINAPI cublasDsyr2 (char uplo, int n, double alpha, const double *x, - int incx, const double *y, int incy, double *A, - int lda); -void CUBLASWINAPI cublasCher2 (char uplo, int n, cuComplex alpha, const cuComplex *x, - int incx, const cuComplex *y, int incy, cuComplex *A, - int lda); -void CUBLASWINAPI cublasZher2 (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *x, - int incx, const cuDoubleComplex *y, int incy, cuDoubleComplex *A, - int lda); - -/*------------------------------------------------------------------------*/ -/* SPR2/HPR2 */ -void CUBLASWINAPI cublasSspr2 (char uplo, int n, float alpha, const float *x, - int incx, const float *y, int incy, float *AP); -void CUBLASWINAPI cublasDspr2 (char uplo, int n, double alpha, - const double *x, int incx, const double *y, - int incy, double *AP); -void CUBLASWINAPI cublasChpr2 (char uplo, int n, cuComplex alpha, - const cuComplex *x, int incx, const cuComplex *y, - int incy, cuComplex *AP); -void CUBLASWINAPI cublasZhpr2 (char uplo, int n, cuDoubleComplex alpha, - const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, - int incy, cuDoubleComplex *AP); -/* ------------------------BLAS3 Functions ------------------------------- */ -/* GEMM */ -void CUBLASWINAPI cublasSgemm (char transa, char transb, int m, int n, int k, - float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, - int ldc); -void CUBLASWINAPI cublasDgemm (char transa, char transb, int m, int n, int k, - double alpha, const double *A, int lda, - const double *B, int ldb, double beta, double *C, - int ldc); -void CUBLASWINAPI cublasCgemm (char transa, char transb, int m, int n, int k, - cuComplex alpha, const cuComplex *A, int lda, - const cuComplex *B, int ldb, cuComplex beta, - cuComplex *C, int ldc); -void CUBLASWINAPI cublasZgemm (char transa, char transb, int m, int n, - int k, cuDoubleComplex alpha, - const cuDoubleComplex *A, int lda, - const cuDoubleComplex *B, int ldb, - cuDoubleComplex beta, cuDoubleComplex *C, - int ldc); -/* -------------------------------------------------------*/ -/* SYRK */ -void CUBLASWINAPI cublasSsyrk (char uplo, char trans, int n, int k, float alpha, - const float *A, int lda, float beta, float *C, - int ldc); -void CUBLASWINAPI cublasDsyrk (char uplo, char trans, int n, int k, - double alpha, const double *A, int lda, - double beta, double *C, int ldc); - -void CUBLASWINAPI cublasCsyrk (char uplo, char trans, int n, int k, - cuComplex alpha, const cuComplex *A, int lda, - cuComplex beta, cuComplex *C, int ldc); -void CUBLASWINAPI cublasZsyrk (char uplo, char trans, int n, int k, - cuDoubleComplex alpha, - const cuDoubleComplex *A, int lda, - cuDoubleComplex beta, - cuDoubleComplex *C, int ldc); -/* ------------------------------------------------------- */ -/* HERK */ -void CUBLASWINAPI cublasCherk (char uplo, char trans, int n, int k, - float alpha, const cuComplex *A, int lda, - float beta, cuComplex *C, int ldc); 
-void CUBLASWINAPI cublasZherk (char uplo, char trans, int n, int k, - double alpha, - const cuDoubleComplex *A, int lda, - double beta, - cuDoubleComplex *C, int ldc); -/* ------------------------------------------------------- */ -/* SYR2K */ -void CUBLASWINAPI cublasSsyr2k (char uplo, char trans, int n, int k, float alpha, - const float *A, int lda, const float *B, int ldb, - float beta, float *C, int ldc); - -void CUBLASWINAPI cublasDsyr2k (char uplo, char trans, int n, int k, - double alpha, const double *A, int lda, - const double *B, int ldb, double beta, - double *C, int ldc); -void CUBLASWINAPI cublasCsyr2k (char uplo, char trans, int n, int k, - cuComplex alpha, const cuComplex *A, int lda, - const cuComplex *B, int ldb, cuComplex beta, - cuComplex *C, int ldc); - -void CUBLASWINAPI cublasZsyr2k (char uplo, char trans, int n, int k, - cuDoubleComplex alpha, const cuDoubleComplex *A, int lda, - const cuDoubleComplex *B, int ldb, cuDoubleComplex beta, - cuDoubleComplex *C, int ldc); -/* ------------------------------------------------------- */ -/* HER2K */ -void CUBLASWINAPI cublasCher2k (char uplo, char trans, int n, int k, - cuComplex alpha, const cuComplex *A, int lda, - const cuComplex *B, int ldb, float beta, - cuComplex *C, int ldc); - -void CUBLASWINAPI cublasZher2k (char uplo, char trans, int n, int k, - cuDoubleComplex alpha, const cuDoubleComplex *A, int lda, - const cuDoubleComplex *B, int ldb, double beta, - cuDoubleComplex *C, int ldc); - -/*------------------------------------------------------------------------*/ -/* SYMM*/ -void CUBLASWINAPI cublasSsymm (char side, char uplo, int m, int n, float alpha, - const float *A, int lda, const float *B, int ldb, - float beta, float *C, int ldc); -void CUBLASWINAPI cublasDsymm (char side, char uplo, int m, int n, double alpha, - const double *A, int lda, const double *B, int ldb, - double beta, double *C, int ldc); - -void CUBLASWINAPI cublasCsymm (char side, char uplo, int m, int n, cuComplex alpha, - const cuComplex *A, int lda, const cuComplex *B, int ldb, - cuComplex beta, cuComplex *C, int ldc); - -void CUBLASWINAPI cublasZsymm (char side, char uplo, int m, int n, cuDoubleComplex alpha, - const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, - cuDoubleComplex beta, cuDoubleComplex *C, int ldc); -/*------------------------------------------------------------------------*/ -/* HEMM*/ -void CUBLASWINAPI cublasChemm (char side, char uplo, int m, int n, - cuComplex alpha, const cuComplex *A, int lda, - const cuComplex *B, int ldb, cuComplex beta, - cuComplex *C, int ldc); -void CUBLASWINAPI cublasZhemm (char side, char uplo, int m, int n, - cuDoubleComplex alpha, const cuDoubleComplex *A, int lda, - const cuDoubleComplex *B, int ldb, cuDoubleComplex beta, - cuDoubleComplex *C, int ldc); - -/*------------------------------------------------------------------------*/ -/* TRSM*/ -void CUBLASWINAPI cublasStrsm (char side, char uplo, char transa, char diag, - int m, int n, float alpha, const float *A, int lda, - float *B, int ldb); - -void CUBLASWINAPI cublasDtrsm (char side, char uplo, char transa, - char diag, int m, int n, double alpha, - const double *A, int lda, double *B, - int ldb); - -void CUBLASWINAPI cublasCtrsm (char side, char uplo, char transa, char diag, - int m, int n, cuComplex alpha, const cuComplex *A, - int lda, cuComplex *B, int ldb); - -void CUBLASWINAPI cublasZtrsm (char side, char uplo, char transa, - char diag, int m, int n, cuDoubleComplex alpha, - const cuDoubleComplex *A, int lda, - 
cuDoubleComplex *B, int ldb); -/*------------------------------------------------------------------------*/ -/* TRMM*/ -void CUBLASWINAPI cublasStrmm (char side, char uplo, char transa, char diag, - int m, int n, float alpha, const float *A, int lda, - float *B, int ldb); -void CUBLASWINAPI cublasDtrmm (char side, char uplo, char transa, - char diag, int m, int n, double alpha, - const double *A, int lda, double *B, - int ldb); -void CUBLASWINAPI cublasCtrmm (char side, char uplo, char transa, char diag, - int m, int n, cuComplex alpha, const cuComplex *A, - int lda, cuComplex *B, int ldb); -void CUBLASWINAPI cublasZtrmm (char side, char uplo, char transa, - char diag, int m, int n, cuDoubleComplex alpha, - const cuDoubleComplex *A, int lda, cuDoubleComplex *B, - int ldb); - -#if defined(__cplusplus) -} -#endif /* __cplusplus */ - -#endif /* !defined(CUBLAS_H_) */ diff --git a/include/triton/external/CUDA/cublas_api.h b/include/triton/external/CUDA/cublas_api.h deleted file mode 100755 index ff89141d0..000000000 --- a/include/triton/external/CUDA/cublas_api.h +++ /dev/null @@ -1,2977 +0,0 @@ -/* - * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. 
- * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. - */ - -/* - * This is the public header file for the CUBLAS library, defining the API - * - * CUBLAS is an implementation of BLAS (Basic Linear Algebra Subroutines) - * on top of the CUDA runtime. - */ - -#if !defined(CUBLAS_API_H_) -#define CUBLAS_API_H_ - -#ifndef CUBLASWINAPI -#ifdef _WIN32 -#define CUBLASWINAPI __stdcall -#else -#define CUBLASWINAPI -#endif -#endif - -#ifndef CUBLASAPI -#error "This file should not be included without defining CUBLASAPI" -#endif - -#include "driver_types.h" -#include "cuComplex.h" /* import complex data type */ - -#include "cuda_fp16.h" - -#include "library_types.h" - - -#if defined(__cplusplus) -extern "C" { -#endif /* __cplusplus */ - -/* CUBLAS status type returns */ -typedef enum{ - CUBLAS_STATUS_SUCCESS =0, - CUBLAS_STATUS_NOT_INITIALIZED =1, - CUBLAS_STATUS_ALLOC_FAILED =3, - CUBLAS_STATUS_INVALID_VALUE =7, - CUBLAS_STATUS_ARCH_MISMATCH =8, - CUBLAS_STATUS_MAPPING_ERROR =11, - CUBLAS_STATUS_EXECUTION_FAILED=13, - CUBLAS_STATUS_INTERNAL_ERROR =14, - CUBLAS_STATUS_NOT_SUPPORTED =15, - CUBLAS_STATUS_LICENSE_ERROR =16 -} cublasStatus_t; - - -typedef enum { - CUBLAS_FILL_MODE_LOWER=0, - CUBLAS_FILL_MODE_UPPER=1 -} cublasFillMode_t; - -typedef enum { - CUBLAS_DIAG_NON_UNIT=0, - CUBLAS_DIAG_UNIT=1 -} cublasDiagType_t; - -typedef enum { - CUBLAS_SIDE_LEFT =0, - CUBLAS_SIDE_RIGHT=1 -} cublasSideMode_t; - - -typedef enum { - CUBLAS_OP_N=0, - CUBLAS_OP_T=1, - CUBLAS_OP_C=2 -} cublasOperation_t; - - -typedef enum { - CUBLAS_POINTER_MODE_HOST = 0, - CUBLAS_POINTER_MODE_DEVICE = 1 -} cublasPointerMode_t; - -typedef enum { - CUBLAS_ATOMICS_NOT_ALLOWED = 0, - CUBLAS_ATOMICS_ALLOWED = 1 -} cublasAtomicsMode_t; - -/*For different GEMM algorithm */ -typedef enum { - CUBLAS_GEMM_DFALT = -1, - CUBLAS_GEMM_DEFAULT = -1, - CUBLAS_GEMM_ALGO0 = 0, - CUBLAS_GEMM_ALGO1 = 1, - CUBLAS_GEMM_ALGO2 = 2, - CUBLAS_GEMM_ALGO3 = 3, - CUBLAS_GEMM_ALGO4 = 4, - CUBLAS_GEMM_ALGO5 = 5, - CUBLAS_GEMM_ALGO6 = 6, - CUBLAS_GEMM_ALGO7 = 7, - CUBLAS_GEMM_ALGO8 = 8, - CUBLAS_GEMM_ALGO9 = 9, - CUBLAS_GEMM_ALGO10 = 10, - CUBLAS_GEMM_ALGO11 = 11, - CUBLAS_GEMM_ALGO12 = 12, - CUBLAS_GEMM_ALGO13 = 13, - CUBLAS_GEMM_ALGO14 = 14, - CUBLAS_GEMM_ALGO15 = 15, - CUBLAS_GEMM_ALGO16 = 16, - CUBLAS_GEMM_ALGO17 = 17, - CUBLAS_GEMM_DEFAULT_TENSOR_OP = 99, - CUBLAS_GEMM_DFALT_TENSOR_OP = 99, - CUBLAS_GEMM_ALGO0_TENSOR_OP = 100, - CUBLAS_GEMM_ALGO1_TENSOR_OP = 101, - CUBLAS_GEMM_ALGO2_TENSOR_OP = 102, - CUBLAS_GEMM_ALGO3_TENSOR_OP = 103, - CUBLAS_GEMM_ALGO4_TENSOR_OP = 104 -} cublasGemmAlgo_t; - -/*Enum for default math mode/tensor operation*/ -typedef enum { - CUBLAS_DEFAULT_MATH = 0, - CUBLAS_TENSOR_OP_MATH = 1 -} cublasMath_t; - -/* For backward compatibility purposes */ -typedef cudaDataType cublasDataType_t; - -/* Opaque structure holding CUBLAS library context */ -struct cublasContext; -typedef struct cublasContext *cublasHandle_t; - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCreate_v2 (cublasHandle_t *handle); -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDestroy_v2 (cublasHandle_t handle); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetVersion_v2(cublasHandle_t handle, int *version); -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetProperty(libraryPropertyType type, int *value); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetStream_v2 (cublasHandle_t handle, cudaStream_t 
streamId); -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetStream_v2 (cublasHandle_t handle, cudaStream_t *streamId); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetPointerMode_v2 (cublasHandle_t handle, cublasPointerMode_t *mode); -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetPointerMode_v2 (cublasHandle_t handle, cublasPointerMode_t mode); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t *mode); -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t mode); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetMathMode(cublasHandle_t handle, cublasMath_t *mode); -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetMathMode(cublasHandle_t handle, cublasMath_t mode); - -/* - * cublasStatus_t - * cublasSetVector (int n, int elemSize, const void *x, int incx, - * void *y, int incy) - * - * copies n elements from a vector x in CPU memory space to a vector y - * in GPU memory space. Elements in both vectors are assumed to have a - * size of elemSize bytes. Storage spacing between consecutive elements - * is incx for the source vector x and incy for the destination vector - * y. In general, y points to an object, or part of an object, allocated - * via cublasAlloc(). Column major format for two-dimensional matrices - * is assumed throughout CUBLAS. Therefore, if the increment for a vector - * is equal to 1, this access a column vector while using an increment - * equal to the leading dimension of the respective matrix accesses a - * row vector. - * - * Return Values - * ------------- - * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library not been initialized - * CUBLAS_STATUS_INVALID_VALUE if incx, incy, or elemSize <= 0 - * CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory - * CUBLAS_STATUS_SUCCESS if the operation completed successfully - */ -cublasStatus_t CUBLASWINAPI cublasSetVector (int n, int elemSize, const void *x, - int incx, void *devicePtr, int incy); - -/* - * cublasStatus_t - * cublasGetVector (int n, int elemSize, const void *x, int incx, - * void *y, int incy) - * - * copies n elements from a vector x in GPU memory space to a vector y - * in CPU memory space. Elements in both vectors are assumed to have a - * size of elemSize bytes. Storage spacing between consecutive elements - * is incx for the source vector x and incy for the destination vector - * y. In general, x points to an object, or part of an object, allocated - * via cublasAlloc(). Column major format for two-dimensional matrices - * is assumed throughout CUBLAS. Therefore, if the increment for a vector - * is equal to 1, this access a column vector while using an increment - * equal to the leading dimension of the respective matrix accesses a - * row vector. - * - * Return Values - * ------------- - * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library not been initialized - * CUBLAS_STATUS_INVALID_VALUE if incx, incy, or elemSize <= 0 - * CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory - * CUBLAS_STATUS_SUCCESS if the operation completed successfully - */ -cublasStatus_t CUBLASWINAPI cublasGetVector (int n, int elemSize, const void *x, - int incx, void *y, int incy); - -/* - * cublasStatus_t - * cublasSetMatrix (int rows, int cols, int elemSize, const void *A, - * int lda, void *B, int ldb) - * - * copies a tile of rows x cols elements from a matrix A in CPU memory - * space to a matrix B in GPU memory space. Each element requires storage - * of elemSize bytes. 
Both matrices are assumed to be stored in column - * major format, with the leading dimension (i.e. number of rows) of - * source matrix A provided in lda, and the leading dimension of matrix B - * provided in ldb. In general, B points to an object, or part of an - * object, that was allocated via cublasAlloc(). - * - * Return Values - * ------------- - * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized - * CUBLAS_STATUS_INVALID_VALUE if rows or cols < 0, or elemSize, lda, or - * ldb <= 0 - * CUBLAS_STATUS_MAPPING_ERROR if error occurred accessing GPU memory - * CUBLAS_STATUS_SUCCESS if the operation completed successfully - */ -cublasStatus_t CUBLASWINAPI cublasSetMatrix (int rows, int cols, int elemSize, - const void *A, int lda, void *B, - int ldb); - -/* - * cublasStatus_t - * cublasGetMatrix (int rows, int cols, int elemSize, const void *A, - * int lda, void *B, int ldb) - * - * copies a tile of rows x cols elements from a matrix A in GPU memory - * space to a matrix B in CPU memory space. Each element requires storage - * of elemSize bytes. Both matrices are assumed to be stored in column - * major format, with the leading dimension (i.e. number of rows) of - * source matrix A provided in lda, and the leading dimension of matrix B - * provided in ldb. In general, A points to an object, or part of an - * object, that was allocated via cublasAlloc(). - * - * Return Values - * ------------- - * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized - * CUBLAS_STATUS_INVALID_VALUE if rows, cols, eleSize, lda, or ldb <= 0 - * CUBLAS_STATUS_MAPPING_ERROR if error occurred accessing GPU memory - * CUBLAS_STATUS_SUCCESS if the operation completed successfully - */ -cublasStatus_t CUBLASWINAPI cublasGetMatrix (int rows, int cols, int elemSize, - const void *A, int lda, void *B, - int ldb); - -/* - * cublasStatus - * cublasSetVectorAsync ( int n, int elemSize, const void *x, int incx, - * void *y, int incy, cudaStream_t stream ); - * - * cublasSetVectorAsync has the same functionnality as cublasSetVector - * but the transfer is done asynchronously within the CUDA stream passed - * in parameter. - * - * Return Values - * ------------- - * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library not been initialized - * CUBLAS_STATUS_INVALID_VALUE if incx, incy, or elemSize <= 0 - * CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory - * CUBLAS_STATUS_SUCCESS if the operation completed successfully - */ -cublasStatus_t CUBLASWINAPI cublasSetVectorAsync (int n, int elemSize, - const void *hostPtr, int incx, - void *devicePtr, int incy, - cudaStream_t stream); -/* - * cublasStatus - * cublasGetVectorAsync( int n, int elemSize, const void *x, int incx, - * void *y, int incy, cudaStream_t stream) - * - * cublasGetVectorAsync has the same functionnality as cublasGetVector - * but the transfer is done asynchronously within the CUDA stream passed - * in parameter. 
- * - * Return Values - * ------------- - * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library not been initialized - * CUBLAS_STATUS_INVALID_VALUE if incx, incy, or elemSize <= 0 - * CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory - * CUBLAS_STATUS_SUCCESS if the operation completed successfully - */ -cublasStatus_t CUBLASWINAPI cublasGetVectorAsync (int n, int elemSize, - const void *devicePtr, int incx, - void *hostPtr, int incy, - cudaStream_t stream); - -/* - * cublasStatus_t - * cublasSetMatrixAsync (int rows, int cols, int elemSize, const void *A, - * int lda, void *B, int ldb, cudaStream_t stream) - * - * cublasSetMatrixAsync has the same functionnality as cublasSetMatrix - * but the transfer is done asynchronously within the CUDA stream passed - * in parameter. - * - * Return Values - * ------------- - * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized - * CUBLAS_STATUS_INVALID_VALUE if rows or cols < 0, or elemSize, lda, or - * ldb <= 0 - * CUBLAS_STATUS_MAPPING_ERROR if error occurred accessing GPU memory - * CUBLAS_STATUS_SUCCESS if the operation completed successfully - */ -cublasStatus_t CUBLASWINAPI cublasSetMatrixAsync (int rows, int cols, int elemSize, - const void *A, int lda, void *B, - int ldb, cudaStream_t stream); - -/* - * cublasStatus_t - * cublasGetMatrixAsync (int rows, int cols, int elemSize, const void *A, - * int lda, void *B, int ldb, cudaStream_t stream) - * - * cublasGetMatrixAsync has the same functionnality as cublasGetMatrix - * but the transfer is done asynchronously within the CUDA stream passed - * in parameter. - * - * Return Values - * ------------- - * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized - * CUBLAS_STATUS_INVALID_VALUE if rows, cols, eleSize, lda, or ldb <= 0 - * CUBLAS_STATUS_MAPPING_ERROR if error occurred accessing GPU memory - * CUBLAS_STATUS_SUCCESS if the operation completed successfully - */ -cublasStatus_t CUBLASWINAPI cublasGetMatrixAsync (int rows, int cols, int elemSize, - const void *A, int lda, void *B, - int ldb, cudaStream_t stream); - - -CUBLASAPI void CUBLASWINAPI cublasXerbla (const char *srName, int info); -/* ---------------- CUBLAS BLAS1 functions ---------------- */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasNrm2Ex(cublasHandle_t handle, - int n, - const void *x, - cudaDataType xType, - int incx, - void *result, - cudaDataType resultType, - cudaDataType executionType); /* host or device pointer */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSnrm2_v2(cublasHandle_t handle, - int n, - const float *x, - int incx, - float *result); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDnrm2_v2(cublasHandle_t handle, - int n, - const double *x, - int incx, - double *result); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasScnrm2_v2(cublasHandle_t handle, - int n, - const cuComplex *x, - int incx, - float *result); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDznrm2_v2(cublasHandle_t handle, - int n, - const cuDoubleComplex *x, - int incx, - double *result); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDotEx (cublasHandle_t handle, - int n, - const void *x, - cudaDataType xType, - int incx, - const void *y, - cudaDataType yType, - int incy, - void *result, - cudaDataType resultType, - cudaDataType executionType); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDotcEx (cublasHandle_t handle, - int n, - const void *x, - cudaDataType xType, - int 
incx, - const void *y, - cudaDataType yType, - int incy, - void *result, - cudaDataType resultType, - cudaDataType executionType); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSdot_v2 (cublasHandle_t handle, - int n, - const float *x, - int incx, - const float *y, - int incy, - float *result); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDdot_v2 (cublasHandle_t handle, - int n, - const double *x, - int incx, - const double *y, - int incy, - double *result); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCdotu_v2 (cublasHandle_t handle, - int n, - const cuComplex *x, - int incx, - const cuComplex *y, - int incy, - cuComplex *result); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCdotc_v2 (cublasHandle_t handle, - int n, - const cuComplex *x, - int incx, - const cuComplex *y, - int incy, - cuComplex *result); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdotu_v2 (cublasHandle_t handle, - int n, - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *y, - int incy, - cuDoubleComplex *result); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdotc_v2 (cublasHandle_t handle, - int n, - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *y, - int incy, - cuDoubleComplex *result); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasScalEx(cublasHandle_t handle, - int n, - const void *alpha, /* host or device pointer */ - cudaDataType alphaType, - void *x, - cudaDataType xType, - int incx, - cudaDataType executionType); -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSscal_v2(cublasHandle_t handle, - int n, - const float *alpha, /* host or device pointer */ - float *x, - int incx); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDscal_v2(cublasHandle_t handle, - int n, - const double *alpha, /* host or device pointer */ - double *x, - int incx); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCscal_v2(cublasHandle_t handle, - int n, - const cuComplex *alpha, /* host or device pointer */ - cuComplex *x, - int incx); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsscal_v2(cublasHandle_t handle, - int n, - const float *alpha, /* host or device pointer */ - cuComplex *x, - int incx); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZscal_v2(cublasHandle_t handle, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - cuDoubleComplex *x, - int incx); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdscal_v2(cublasHandle_t handle, - int n, - const double *alpha, /* host or device pointer */ - cuDoubleComplex *x, - int incx); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasAxpyEx (cublasHandle_t handle, - int n, - const void *alpha, /* host or device pointer */ - cudaDataType alphaType, - const void *x, - cudaDataType xType, - int incx, - void *y, - cudaDataType yType, - int incy, - cudaDataType executiontype); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSaxpy_v2 (cublasHandle_t handle, - int n, - const float *alpha, /* host or device pointer */ - const float *x, - int incx, - float *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDaxpy_v2 (cublasHandle_t handle, - int n, - const double *alpha, /* host or device pointer */ - const double *x, - int incx, - double *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCaxpy_v2 (cublasHandle_t handle, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *x, - int incx, - cuComplex *y, - int incy); - 
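// Editor's illustrative sketch, not part of this patch: the handle-based
// *_v2 convention used by the cublas_api.h declarations deleted above, shown
// with cublasSaxpy_v2. The helper name saxpy_v2_example is invented for
// illustration; alpha is passed as a host pointer because the default mode
// is CUBLAS_POINTER_MODE_HOST, and dx/dy are assumed to be device pointers
// each holding n floats.
static cublasStatus_t saxpy_v2_example(int n, float alpha, const float* dx, float* dy) {
  cublasHandle_t handle;
  cublasStatus_t status = cublasCreate_v2(&handle);
  if (status != CUBLAS_STATUS_SUCCESS) return status;
  // y = alpha * x + y, unit stride on both vectors.
  status = cublasSaxpy_v2(handle, n, &alpha, dx, 1, dy, 1);
  cublasDestroy_v2(handle);
  return status;
}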
-CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZaxpy_v2 (cublasHandle_t handle, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *x, - int incx, - cuDoubleComplex *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasScopy_v2 (cublasHandle_t handle, - int n, - const float *x, - int incx, - float *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDcopy_v2 (cublasHandle_t handle, - int n, - const double *x, - int incx, - double *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCcopy_v2 (cublasHandle_t handle, - int n, - const cuComplex *x, - int incx, - cuComplex *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZcopy_v2 (cublasHandle_t handle, - int n, - const cuDoubleComplex *x, - int incx, - cuDoubleComplex *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSswap_v2 (cublasHandle_t handle, - int n, - float *x, - int incx, - float *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDswap_v2 (cublasHandle_t handle, - int n, - double *x, - int incx, - double *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCswap_v2 (cublasHandle_t handle, - int n, - cuComplex *x, - int incx, - cuComplex *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZswap_v2 (cublasHandle_t handle, - int n, - cuDoubleComplex *x, - int incx, - cuDoubleComplex *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIsamax_v2(cublasHandle_t handle, - int n, - const float *x, - int incx, - int *result); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIdamax_v2(cublasHandle_t handle, - int n, - const double *x, - int incx, - int *result); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIcamax_v2(cublasHandle_t handle, - int n, - const cuComplex *x, - int incx, - int *result); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIzamax_v2(cublasHandle_t handle, - int n, - const cuDoubleComplex *x, - int incx, - int *result); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIsamin_v2(cublasHandle_t handle, - int n, - const float *x, - int incx, - int *result); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIdamin_v2(cublasHandle_t handle, - int n, - const double *x, - int incx, - int *result); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIcamin_v2(cublasHandle_t handle, - int n, - const cuComplex *x, - int incx, - int *result); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIzamin_v2(cublasHandle_t handle, - int n, - const cuDoubleComplex *x, - int incx, - int *result); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSasum_v2(cublasHandle_t handle, - int n, - const float *x, - int incx, - float *result); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDasum_v2(cublasHandle_t handle, - int n, - const double *x, - int incx, - double *result); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasScasum_v2(cublasHandle_t handle, - int n, - const cuComplex *x, - int incx, - float *result); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDzasum_v2(cublasHandle_t handle, - int n, - const cuDoubleComplex *x, - int incx, - double *result); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSrot_v2 (cublasHandle_t handle, - int n, - float *x, - int incx, - float 
*y, - int incy, - const float *c, /* host or device pointer */ - const float *s); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDrot_v2 (cublasHandle_t handle, - int n, - double *x, - int incx, - double *y, - int incy, - const double *c, /* host or device pointer */ - const double *s); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCrot_v2 (cublasHandle_t handle, - int n, - cuComplex *x, - int incx, - cuComplex *y, - int incy, - const float *c, /* host or device pointer */ - const cuComplex *s); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsrot_v2(cublasHandle_t handle, - int n, - cuComplex *x, - int incx, - cuComplex *y, - int incy, - const float *c, /* host or device pointer */ - const float *s); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZrot_v2 (cublasHandle_t handle, - int n, - cuDoubleComplex *x, - int incx, - cuDoubleComplex *y, - int incy, - const double *c, /* host or device pointer */ - const cuDoubleComplex *s); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdrot_v2(cublasHandle_t handle, - int n, - cuDoubleComplex *x, - int incx, - cuDoubleComplex *y, - int incy, - const double *c, /* host or device pointer */ - const double *s); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSrotg_v2(cublasHandle_t handle, - float *a, /* host or device pointer */ - float *b, /* host or device pointer */ - float *c, /* host or device pointer */ - float *s); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDrotg_v2(cublasHandle_t handle, - double *a, /* host or device pointer */ - double *b, /* host or device pointer */ - double *c, /* host or device pointer */ - double *s); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCrotg_v2(cublasHandle_t handle, - cuComplex *a, /* host or device pointer */ - cuComplex *b, /* host or device pointer */ - float *c, /* host or device pointer */ - cuComplex *s); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZrotg_v2(cublasHandle_t handle, - cuDoubleComplex *a, /* host or device pointer */ - cuDoubleComplex *b, /* host or device pointer */ - double *c, /* host or device pointer */ - cuDoubleComplex *s); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSrotm_v2(cublasHandle_t handle, - int n, - float *x, - int incx, - float *y, - int incy, - const float* param); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDrotm_v2(cublasHandle_t handle, - int n, - double *x, - int incx, - double *y, - int incy, - const double* param); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSrotmg_v2(cublasHandle_t handle, - float *d1, /* host or device pointer */ - float *d2, /* host or device pointer */ - float *x1, /* host or device pointer */ - const float *y1, /* host or device pointer */ - float *param); /* host or device pointer */ - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDrotmg_v2(cublasHandle_t handle, - double *d1, /* host or device pointer */ - double *d2, /* host or device pointer */ - double *x1, /* host or device pointer */ - const double *y1, /* host or device pointer */ - double *param); /* host or device pointer */ - -/* --------------- CUBLAS BLAS2 functions ---------------- */ - -/* GEMV */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemv_v2 (cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - 
const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *x, - int incx, - const float *beta, /* host or device pointer */ - float *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemv_v2 (cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *x, - int incx, - const double *beta, /* host or device pointer */ - double *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemv_v2 (cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *x, - int incx, - const cuComplex *beta, /* host or device pointer */ - cuComplex *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemv_v2 (cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *y, - int incy); -/* GBMV */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgbmv_v2 (cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - int kl, - int ku, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *x, - int incx, - const float *beta, /* host or device pointer */ - float *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgbmv_v2 (cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - int kl, - int ku, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *x, - int incx, - const double *beta, /* host or device pointer */ - double *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgbmv_v2 (cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - int kl, - int ku, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *x, - int incx, - const cuComplex *beta, /* host or device pointer */ - cuComplex *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgbmv_v2 (cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - int kl, - int ku, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *y, - int incy); - -/* TRMV */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const float *A, - int lda, - float *x, - int incx); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const double *A, - int lda, - double *x, - int incx); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const cuComplex *A, - int lda, - cuComplex *x, - int incx); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const cuDoubleComplex *A, - int lda, - cuDoubleComplex *x, - int incx); - -/* TBMV */ -CUBLASAPI cublasStatus_t 
CUBLASWINAPI cublasStbmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - int k, - const float *A, - int lda, - float *x, - int incx); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtbmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - int k, - const double *A, - int lda, - double *x, - int incx); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtbmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - int k, - const cuComplex *A, - int lda, - cuComplex *x, - int incx); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtbmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - int k, - const cuDoubleComplex *A, - int lda, - cuDoubleComplex *x, - int incx); - -/* TPMV */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStpmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const float *AP, - float *x, - int incx); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtpmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const double *AP, - double *x, - int incx); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtpmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const cuComplex *AP, - cuComplex *x, - int incx); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtpmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const cuDoubleComplex *AP, - cuDoubleComplex *x, - int incx); - -/* TRSV */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const float *A, - int lda, - float *x, - int incx); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const double *A, - int lda, - double *x, - int incx); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const cuComplex *A, - int lda, - cuComplex *x, - int incx); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const cuDoubleComplex *A, - int lda, - cuDoubleComplex *x, - int incx); - -/* TPSV */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStpsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const float *AP, - float *x, - int incx); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtpsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const double *AP, - double *x, - int incx); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtpsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const cuComplex *AP, - cuComplex *x, - int incx); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtpsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const 
cuDoubleComplex *AP, - cuDoubleComplex *x, - int incx); -/* TBSV */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStbsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - int k, - const float *A, - int lda, - float *x, - int incx); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtbsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - int k, - const double *A, - int lda, - double *x, - int incx); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtbsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - int k, - const cuComplex *A, - int lda, - cuComplex *x, - int incx); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtbsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - int k, - const cuDoubleComplex *A, - int lda, - cuDoubleComplex *x, - int incx); - -/* SYMV/HEMV */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsymv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *x, - int incx, - const float *beta, /* host or device pointer */ - float *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsymv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *x, - int incx, - const double *beta, /* host or device pointer */ - double *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsymv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *x, - int incx, - const cuComplex *beta, /* host or device pointer */ - cuComplex *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsymv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChemv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *x, - int incx, - const cuComplex *beta, /* host or device pointer */ - cuComplex *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhemv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *y, - int incy); - -/* SBMV/HBMV */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsbmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - int k, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *x, - int incx, - const float *beta, /* host or device pointer */ - float *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsbmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - int k, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *x, - int incx, - const double *beta, /* 
host or device pointer */ - double *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChbmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *x, - int incx, - const cuComplex *beta, /* host or device pointer */ - cuComplex *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhbmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *y, - int incy); - -/* SPMV/HPMV */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSspmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *alpha, /* host or device pointer */ - const float *AP, - const float *x, - int incx, - const float *beta, /* host or device pointer */ - float *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDspmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *alpha, /* host or device pointer */ - const double *AP, - const double *x, - int incx, - const double *beta, /* host or device pointer */ - double *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChpmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *AP, - const cuComplex *x, - int incx, - const cuComplex *beta, /* host or device pointer */ - cuComplex *y, - int incy); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhpmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *AP, - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *y, - int incy); - -/* GER */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSger_v2 (cublasHandle_t handle, - int m, - int n, - const float *alpha, /* host or device pointer */ - const float *x, - int incx, - const float *y, - int incy, - float *A, - int lda); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDger_v2 (cublasHandle_t handle, - int m, - int n, - const double *alpha, /* host or device pointer */ - const double *x, - int incx, - const double *y, - int incy, - double *A, - int lda); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgeru_v2 (cublasHandle_t handle, - int m, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *x, - int incx, - const cuComplex *y, - int incy, - cuComplex *A, - int lda); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgerc_v2 (cublasHandle_t handle, - int m, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *x, - int incx, - const cuComplex *y, - int incy, - cuComplex *A, - int lda); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgeru_v2 (cublasHandle_t handle, - int m, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *y, - int incy, - cuDoubleComplex *A, - int lda); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgerc_v2 (cublasHandle_t handle, - int m, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *y, - int incy, - cuDoubleComplex *A, - int lda); - -/* SYR/HER */ -CUBLASAPI cublasStatus_t 
CUBLASWINAPI cublasSsyr_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *alpha, /* host or device pointer */ - const float *x, - int incx, - float *A, - int lda); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyr_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *alpha, /* host or device pointer */ - const double *x, - int incx, - double *A, - int lda); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyr_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *x, - int incx, - cuComplex *A, - int lda); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyr_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *x, - int incx, - cuDoubleComplex *A, - int lda); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCher_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *alpha, /* host or device pointer */ - const cuComplex *x, - int incx, - cuComplex *A, - int lda); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZher_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *alpha, /* host or device pointer */ - const cuDoubleComplex *x, - int incx, - cuDoubleComplex *A, - int lda); - -/* SPR/HPR */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSspr_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *alpha, /* host or device pointer */ - const float *x, - int incx, - float *AP); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDspr_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *alpha, /* host or device pointer */ - const double *x, - int incx, - double *AP); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChpr_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *alpha, /* host or device pointer */ - const cuComplex *x, - int incx, - cuComplex *AP); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhpr_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *alpha, /* host or device pointer */ - const cuDoubleComplex *x, - int incx, - cuDoubleComplex *AP); - -/* SYR2/HER2 */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyr2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *alpha, /* host or device pointer */ - const float *x, - int incx, - const float *y, - int incy, - float *A, - int lda); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyr2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *alpha, /* host or device pointer */ - const double *x, - int incx, - const double *y, - int incy, - double *A, - int lda); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyr2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *x, - int incx, - const cuComplex *y, - int incy, - cuComplex *A, - int lda); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyr2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *y, - int incy, - cuDoubleComplex *A, - int lda); - - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCher2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *x, - int incx, - const cuComplex *y, - int incy, - cuComplex 
*A, - int lda); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZher2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *y, - int incy, - cuDoubleComplex *A, - int lda); - -/* SPR2/HPR2 */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSspr2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *alpha, /* host or device pointer */ - const float *x, - int incx, - const float *y, - int incy, - float *AP); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDspr2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *alpha, /* host or device pointer */ - const double *x, - int incx, - const double *y, - int incy, - double *AP); - - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChpr2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *x, - int incx, - const cuComplex *y, - int incy, - cuComplex *AP); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhpr2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *y, - int incy, - cuDoubleComplex *AP); - -/* ---------------- CUBLAS BLAS3 functions ---------------- */ - -/* GEMM */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemm_v2 (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *B, - int ldb, - const float *beta, /* host or device pointer */ - float *C, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemm_v2 (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *B, - int ldb, - const double *beta, /* host or device pointer */ - double *C, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm_v2 (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *B, - int ldb, - const cuComplex *beta, /* host or device pointer */ - cuComplex *C, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3m (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *B, - int ldb, - const cuComplex *beta, /* host or device pointer */ - cuComplex *C, - int ldc); - CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3mEx (cublasHandle_t handle, - cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, - const cuComplex *alpha, - const void *A, - cudaDataType Atype, - int lda, - const void *B, - cudaDataType Btype, - int ldb, - const cuComplex *beta, - void *C, - cudaDataType Ctype, - int ldc); - - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemm_v2 (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *B, - int ldb, - const cuDoubleComplex *beta, /* host or device 
pointer */ - cuDoubleComplex *C, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemm3m (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *B, - int ldb, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc); - -#if defined(__cplusplus) -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHgemm (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const __half *alpha, /* host or device pointer */ - const __half *A, - int lda, - const __half *B, - int ldb, - const __half *beta, /* host or device pointer */ - __half *C, - int ldc); -#endif -/* IO in FP16/FP32, computation in float */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemmEx (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float *alpha, /* host or device pointer */ - const void *A, - cudaDataType Atype, - int lda, - const void *B, - cudaDataType Btype, - int ldb, - const float *beta, /* host or device pointer */ - void *C, - cudaDataType Ctype, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGemmEx (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const void *alpha, /* host or device pointer */ - const void *A, - cudaDataType Atype, - int lda, - const void *B, - cudaDataType Btype, - int ldb, - const void *beta, /* host or device pointer */ - void *C, - cudaDataType Ctype, - int ldc, - cudaDataType computeType, - cublasGemmAlgo_t algo); - -/* IO in Int8 complex/cuComplex, computation in cuComplex */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemmEx (cublasHandle_t handle, - cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, - const cuComplex *alpha, - const void *A, - cudaDataType Atype, - int lda, - const void *B, - cudaDataType Btype, - int ldb, - const cuComplex *beta, - void *C, - cudaDataType Ctype, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasUint8gemmBias (cublasHandle_t handle, - cublasOperation_t transa, cublasOperation_t transb, cublasOperation_t transc, - int m, int n, int k, - const unsigned char *A, int A_bias, int lda, - const unsigned char *B, int B_bias, int ldb, - unsigned char *C, int C_bias, int ldc, - int C_mult, int C_shift); - -/* SYRK */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyrk_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *beta, /* host or device pointer */ - float *C, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyrk_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *beta, /* host or device pointer */ - double *C, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrk_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *beta, /* host or device pointer */ - cuComplex *C, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyrk_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - 
cublasOperation_t trans, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc); -/* IO in Int8 complex/cuComplex, computation in cuComplex */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrkEx ( cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const void *A, - cudaDataType Atype, - int lda, - const cuComplex *beta, /* host or device pointer */ - void *C, - cudaDataType Ctype, - int ldc); - -/* IO in Int8 complex/cuComplex, computation in cuComplex, Gaussian math */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrk3mEx(cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuComplex *alpha, - const void *A, - cudaDataType Atype, - int lda, - const cuComplex *beta, - void *C, - cudaDataType Ctype, - int ldc); - -/* HERK */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherk_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const float *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const float *beta, /* host or device pointer */ - cuComplex *C, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZherk_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const double *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const double *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc); - -/* IO in Int8 complex/cuComplex, computation in cuComplex */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherkEx (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const float *alpha, /* host or device pointer */ - const void *A, - cudaDataType Atype, - int lda, - const float *beta, /* host or device pointer */ - void *C, - cudaDataType Ctype, - int ldc); - -/* IO in Int8 complex/cuComplex, computation in cuComplex, Gaussian math */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherk3mEx (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const float *alpha, - const void *A, cudaDataType Atype, - int lda, - const float *beta, - void *C, - cudaDataType Ctype, - int ldc); - - - -/* SYR2K */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyr2k_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *B, - int ldb, - const float *beta, /* host or device pointer */ - float *C, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyr2k_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *B, - int ldb, - const double *beta, /* host or device pointer */ - double *C, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyr2k_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *B, - int ldb, - const cuComplex *beta, /* host or device pointer */ - cuComplex *C, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyr2k_v2 
(cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *B, - int ldb, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc); -/* HER2K */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCher2k_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *B, - int ldb, - const float *beta, /* host or device pointer */ - cuComplex *C, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZher2k_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *B, - int ldb, - const double *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc); -/* SYRKX : eXtended SYRK*/ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyrkx (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *B, - int ldb, - const float *beta, /* host or device pointer */ - float *C, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyrkx (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *B, - int ldb, - const double *beta, /* host or device pointer */ - double *C, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrkx (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *B, - int ldb, - const cuComplex *beta, /* host or device pointer */ - cuComplex *C, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyrkx (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *B, - int ldb, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc); -/* HERKX : eXtended HERK */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherkx (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *B, - int ldb, - const float *beta, /* host or device pointer */ - cuComplex *C, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZherkx (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *B, - int ldb, - const double *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc); -/* SYMM */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsymm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - int m, - int n, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *B, - int ldb, - const float *beta, /* host or device pointer */ - 
float *C, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsymm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - int m, - int n, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *B, - int ldb, - const double *beta, /* host or device pointer */ - double *C, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsymm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - int m, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *B, - int ldb, - const cuComplex *beta, /* host or device pointer */ - cuComplex *C, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsymm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - int m, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *B, - int ldb, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc); - -/* HEMM */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChemm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - int m, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *B, - int ldb, - const cuComplex *beta, /* host or device pointer */ - cuComplex *C, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhemm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - int m, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *B, - int ldb, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc); - -/* TRSM */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrsm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - float *B, - int ldb); - - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrsm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - double *B, - int ldb); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrsm_v2(cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - cuComplex *B, - int ldb); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrsm_v2(cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - cuDoubleComplex *B, - int ldb); - - /* TRMM */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrmm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *B, - int ldb, - float *C, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrmm_v2 (cublasHandle_t handle, - cublasSideMode_t 
side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *B, - int ldb, - double *C, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrmm_v2(cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *B, - int ldb, - cuComplex *C, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrmm_v2(cublasHandle_t handle, cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *B, - int ldb, - cuDoubleComplex *C, - int ldc); -/* BATCH GEMM */ -#if defined(__cplusplus) -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHgemmBatched (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const __half *alpha, /* host or device pointer */ - const __half *Aarray[], - int lda, - const __half *Barray[], - int ldb, - const __half *beta, /* host or device pointer */ - __half *Carray[], - int ldc, - int batchCount); -#endif -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemmBatched (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float *alpha, /* host or device pointer */ - const float *Aarray[], - int lda, - const float *Barray[], - int ldb, - const float *beta, /* host or device pointer */ - float *Carray[], - int ldc, - int batchCount); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemmBatched (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const double *alpha, /* host or device pointer */ - const double *Aarray[], - int lda, - const double *Barray[], - int ldb, - const double *beta, /* host or device pointer */ - double *Carray[], - int ldc, - int batchCount); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemmBatched (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *Aarray[], - int lda, - const cuComplex *Barray[], - int ldb, - const cuComplex *beta, /* host or device pointer */ - cuComplex *Carray[], - int ldc, - int batchCount); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3mBatched (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *Aarray[], - int lda, - const cuComplex *Barray[], - int ldb, - const cuComplex *beta, /* host or device pointer */ - cuComplex *Carray[], - int ldc, - int batchCount); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemmBatched (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *Aarray[], - int lda, - const cuDoubleComplex *Barray[], - int ldb, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *Carray[], - int ldc, - int batchCount); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemmStridedBatched (cublasHandle_t handle, - cublasOperation_t 
transa,
- cublasOperation_t transb,
- int m,
- int n,
- int k,
- const float *alpha, /* host or device pointer */
- const float *A,
- int lda,
- long long int strideA, /* purposely signed */
- const float *B,
- int ldb,
- long long int strideB,
- const float *beta, /* host or device pointer */
- float *C,
- int ldc,
- long long int strideC,
- int batchCount);
-
-CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemmStridedBatched (cublasHandle_t handle,
- cublasOperation_t transa,
- cublasOperation_t transb,
- int m,
- int n,
- int k,
- const double *alpha, /* host or device pointer */
- const double *A,
- int lda,
- long long int strideA, /* purposely signed */
- const double *B,
- int ldb,
- long long int strideB,
- const double *beta, /* host or device pointer */
- double *C,
- int ldc,
- long long int strideC,
- int batchCount);
-
-CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemmStridedBatched (cublasHandle_t handle,
- cublasOperation_t transa,
- cublasOperation_t transb,
- int m,
- int n,
- int k,
- const cuComplex *alpha, /* host or device pointer */
- const cuComplex *A,
- int lda,
- long long int strideA, /* purposely signed */
- const cuComplex *B,
- int ldb,
- long long int strideB,
- const cuComplex *beta, /* host or device pointer */
- cuComplex *C,
- int ldc,
- long long int strideC,
- int batchCount);
-
-CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3mStridedBatched (cublasHandle_t handle,
- cublasOperation_t transa,
- cublasOperation_t transb,
- int m,
- int n,
- int k,
- const cuComplex *alpha, /* host or device pointer */
- const cuComplex *A,
- int lda,
- long long int strideA, /* purposely signed */
- const cuComplex *B,
- int ldb,
- long long int strideB,
- const cuComplex *beta, /* host or device pointer */
- cuComplex *C,
- int ldc,
- long long int strideC,
- int batchCount);
-
-
-CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemmStridedBatched (cublasHandle_t handle,
- cublasOperation_t transa,
- cublasOperation_t transb,
- int m,
- int n,
- int k,
- const cuDoubleComplex *alpha, /* host or device pointer */
- const cuDoubleComplex *A,
- int lda,
- long long int strideA, /* purposely signed */
- const cuDoubleComplex *B,
- int ldb,
- long long int strideB,
- const cuDoubleComplex *beta, /* host or device pointer */
- cuDoubleComplex *C,
- int ldc,
- long long int strideC,
- int batchCount);
-
-#if defined(__cplusplus)
-CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHgemmStridedBatched (cublasHandle_t handle,
- cublasOperation_t transa,
- cublasOperation_t transb,
- int m,
- int n,
- int k,
- const __half *alpha, /* host or device pointer */
- const __half *A,
- int lda,
- long long int strideA, /* purposely signed */
- const __half *B,
- int ldb,
- long long int strideB,
- const __half *beta, /* host or device pointer */
- __half *C,
- int ldc,
- long long int strideC,
- int batchCount);
-#endif
-/* ---------------- CUBLAS BLAS-like extension ---------------- */
-/* GEAM */
-CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgeam(cublasHandle_t handle,
- cublasOperation_t transa,
- cublasOperation_t transb,
- int m,
- int n,
- const float *alpha, /* host or device pointer */
- const float *A,
- int lda,
- const float *beta , /* host or device pointer */
- const float *B,
- int ldb,
- float *C,
- int ldc);
-
-CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgeam(cublasHandle_t handle,
- cublasOperation_t transa,
- cublasOperation_t transb,
- int m,
- int n,
- const double *alpha, /* host or device pointer */
- const double *A,
- int lda,
- const double *beta, /* host or device pointer */
- const double *B,
- int ldb,
- double *C,
- int ldc);
-
-CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgeam(cublasHandle_t handle,
- cublasOperation_t transa,
- cublasOperation_t transb,
- int m,
- int n,
- const cuComplex *alpha, /* host or device pointer */
- const cuComplex *A,
- int lda,
- const cuComplex *beta, /* host or device pointer */
- const cuComplex *B,
- int ldb,
- cuComplex *C,
- int ldc);
-
-CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgeam(cublasHandle_t handle,
- cublasOperation_t transa,
- cublasOperation_t transb,
- int m,
- int n,
- const cuDoubleComplex *alpha, /* host or device pointer */
- const cuDoubleComplex *A,
- int lda,
- const cuDoubleComplex *beta, /* host or device pointer */
- const cuDoubleComplex *B,
- int ldb,
- cuDoubleComplex *C,
- int ldc);
-
-/* Batched LU - GETRF*/
-CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgetrfBatched(cublasHandle_t handle,
- int n,
- float *A[], /*Device pointer*/
- int lda,
- int *P, /*Device Pointer*/
- int *info, /*Device Pointer*/
- int batchSize);
-
-CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgetrfBatched(cublasHandle_t handle,
- int n,
- double *A[], /*Device pointer*/
- int lda,
- int *P, /*Device Pointer*/
- int *info, /*Device Pointer*/
- int batchSize);
-
-CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgetrfBatched(cublasHandle_t handle,
- int n,
- cuComplex *A[], /*Device pointer*/
- int lda,
- int *P, /*Device Pointer*/
- int *info, /*Device Pointer*/
- int batchSize);
-
-CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgetrfBatched(cublasHandle_t handle,
- int n,
- cuDoubleComplex *A[], /*Device pointer*/
- int lda,
- int *P, /*Device Pointer*/
- int *info, /*Device Pointer*/
- int batchSize);
-
-/* Batched inversion based on LU factorization from getrf */
-CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgetriBatched(cublasHandle_t handle,
- int n,
- const float *A[], /*Device pointer*/
- int lda,
- const int *P, /*Device pointer*/
- float *C[], /*Device pointer*/
- int ldc,
- int *info,
- int batchSize);
-
-CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgetriBatched(cublasHandle_t handle,
- int n,
- const double *A[], /*Device pointer*/
- int lda,
- const int *P, /*Device pointer*/
- double *C[], /*Device pointer*/
- int ldc,
- int *info,
- int batchSize);
-
-CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgetriBatched(cublasHandle_t handle,
- int n,
- const cuComplex *A[], /*Device pointer*/
- int lda,
- const int *P, /*Device pointer*/
- cuComplex *C[], /*Device pointer*/
- int ldc,
- int *info,
- int batchSize);
-
-CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgetriBatched(cublasHandle_t handle,
- int n,
- const cuDoubleComplex *A[], /*Device pointer*/
- int lda,
- const int *P, /*Device pointer*/
- cuDoubleComplex *C[], /*Device pointer*/
- int ldc,
- int *info,
- int batchSize);
-
-/* Batched solver based on LU factorization from getrf */
-
-CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgetrsBatched( cublasHandle_t handle,
- cublasOperation_t trans,
- int n,
- int nrhs,
- const float *Aarray[],
- int lda,
- const int *devIpiv,
- float *Barray[],
- int ldb,
- int *info,
- int batchSize);
-
-CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgetrsBatched( cublasHandle_t handle,
- cublasOperation_t trans,
- int n,
- int nrhs,
- const double *Aarray[],
- int lda,
- const int *devIpiv,
- double *Barray[],
- int ldb,
- int *info,
- int batchSize);
-
-CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgetrsBatched( cublasHandle_t handle,
- cublasOperation_t trans,
- int n,
- int nrhs,
- const cuComplex *Aarray[],
- int lda,
- const int
*devIpiv, - cuComplex *Barray[], - int ldb, - int *info, - int batchSize); - - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgetrsBatched( cublasHandle_t handle, - cublasOperation_t trans, - int n, - int nrhs, - const cuDoubleComplex *Aarray[], - int lda, - const int *devIpiv, - cuDoubleComplex *Barray[], - int ldb, - int *info, - int batchSize); - - - -/* TRSM - Batched Triangular Solver */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrsmBatched( cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const float *alpha, /*Host or Device Pointer*/ - const float *A[], - int lda, - float *B[], - int ldb, - int batchCount); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrsmBatched( cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const double *alpha, /*Host or Device Pointer*/ - const double *A[], - int lda, - double *B[], - int ldb, - int batchCount); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrsmBatched( cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const cuComplex *alpha, /*Host or Device Pointer*/ - const cuComplex *A[], - int lda, - cuComplex *B[], - int ldb, - int batchCount); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrsmBatched( cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const cuDoubleComplex *alpha, /*Host or Device Pointer*/ - const cuDoubleComplex *A[], - int lda, - cuDoubleComplex *B[], - int ldb, - int batchCount); - -/* Batched - MATINV*/ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSmatinvBatched(cublasHandle_t handle, - int n, - const float *A[], /*Device pointer*/ - int lda, - float *Ainv[], /*Device pointer*/ - int lda_inv, - int *info, /*Device Pointer*/ - int batchSize); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDmatinvBatched(cublasHandle_t handle, - int n, - const double *A[], /*Device pointer*/ - int lda, - double *Ainv[], /*Device pointer*/ - int lda_inv, - int *info, /*Device Pointer*/ - int batchSize); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCmatinvBatched(cublasHandle_t handle, - int n, - const cuComplex *A[], /*Device pointer*/ - int lda, - cuComplex *Ainv[], /*Device pointer*/ - int lda_inv, - int *info, /*Device Pointer*/ - int batchSize); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZmatinvBatched(cublasHandle_t handle, - int n, - const cuDoubleComplex *A[], /*Device pointer*/ - int lda, - cuDoubleComplex *Ainv[], /*Device pointer*/ - int lda_inv, - int *info, /*Device Pointer*/ - int batchSize); - -/* Batch QR Factorization */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgeqrfBatched( cublasHandle_t handle, - int m, - int n, - float *Aarray[], /*Device pointer*/ - int lda, - float *TauArray[], /* Device pointer*/ - int *info, - int batchSize); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgeqrfBatched( cublasHandle_t handle, - int m, - int n, - double *Aarray[], /*Device pointer*/ - int lda, - double *TauArray[], /* Device pointer*/ - int *info, - int batchSize); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgeqrfBatched( cublasHandle_t handle, - int m, - int n, - cuComplex *Aarray[], /*Device pointer*/ - int lda, - cuComplex *TauArray[], /* Device pointer*/ - int *info, - int batchSize); - -CUBLASAPI cublasStatus_t CUBLASWINAPI 
cublasZgeqrfBatched( cublasHandle_t handle, - int m, - int n, - cuDoubleComplex *Aarray[], /*Device pointer*/ - int lda, - cuDoubleComplex *TauArray[], /* Device pointer*/ - int *info, - int batchSize); -/* Least Square Min only m >= n and Non-transpose supported */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgelsBatched( cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - int nrhs, - float *Aarray[], /*Device pointer*/ - int lda, - float *Carray[], /* Device pointer*/ - int ldc, - int *info, - int *devInfoArray, /* Device pointer*/ - int batchSize ); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgelsBatched( cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - int nrhs, - double *Aarray[], /*Device pointer*/ - int lda, - double *Carray[], /* Device pointer*/ - int ldc, - int *info, - int *devInfoArray, /* Device pointer*/ - int batchSize); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgelsBatched( cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - int nrhs, - cuComplex *Aarray[], /*Device pointer*/ - int lda, - cuComplex *Carray[], /* Device pointer*/ - int ldc, - int *info, - int *devInfoArray, - int batchSize); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgelsBatched( cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - int nrhs, - cuDoubleComplex *Aarray[], /*Device pointer*/ - int lda, - cuDoubleComplex *Carray[], /* Device pointer*/ - int ldc, - int *info, - int *devInfoArray, - int batchSize); -/* DGMM */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSdgmm(cublasHandle_t handle, - cublasSideMode_t mode, - int m, - int n, - const float *A, - int lda, - const float *x, - int incx, - float *C, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDdgmm(cublasHandle_t handle, - cublasSideMode_t mode, - int m, - int n, - const double *A, - int lda, - const double *x, - int incx, - double *C, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCdgmm(cublasHandle_t handle, - cublasSideMode_t mode, - int m, - int n, - const cuComplex *A, - int lda, - const cuComplex *x, - int incx, - cuComplex *C, - int ldc); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdgmm(cublasHandle_t handle, - cublasSideMode_t mode, - int m, - int n, - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *x, - int incx, - cuDoubleComplex *C, - int ldc); - -/* TPTTR : Triangular Pack format to Triangular format */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStpttr ( cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *AP, - float *A, - int lda ); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtpttr ( cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *AP, - double *A, - int lda ); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtpttr ( cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuComplex *AP, - cuComplex *A, - int lda ); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtpttr ( cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuDoubleComplex *AP, - cuDoubleComplex *A, - int lda ); - /* TRTTP : Triangular format to Triangular Pack format */ -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrttp ( cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *A, - int lda, - float *AP ); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrttp ( cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *A, - int lda, - double *AP ); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrttp ( cublasHandle_t handle, - 
cublasFillMode_t uplo, - int n, - const cuComplex *A, - int lda, - cuComplex *AP ); - -CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrttp ( cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuDoubleComplex *A, - int lda, - cuDoubleComplex *AP ); - -#if defined(__cplusplus) -} -#endif /* __cplusplus */ - -#endif /* !defined(CUBLAS_API_H_) */ diff --git a/include/triton/external/CUDA/cublas_v2.h b/include/triton/external/CUDA/cublas_v2.h deleted file mode 100644 index 5b9553a15..000000000 --- a/include/triton/external/CUDA/cublas_v2.h +++ /dev/null @@ -1,274 +0,0 @@ -/* - * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. - */ - -/* - * This is the public header file for the new CUBLAS library API, it mapped the generic - * Cublas name functions to the actual _v2 implementations. 
- */ - -#if !defined(CUBLAS_V2_H_) -#define CUBLAS_V2_H_ - -#undef CUBLASAPI -#ifdef __CUDACC__ -#define CUBLASAPI __host__ __device__ -#else -#define CUBLASAPI -#endif - -#include "cublas_api.h" - -#define cublasCreate cublasCreate_v2 -#define cublasDestroy cublasDestroy_v2 -#define cublasGetVersion cublasGetVersion_v2 -#define cublasSetStream cublasSetStream_v2 -#define cublasGetStream cublasGetStream_v2 -#define cublasGetPointerMode cublasGetPointerMode_v2 -#define cublasSetPointerMode cublasSetPointerMode_v2 - -/* Blas3 Routines */ - -#define cublasSnrm2 cublasSnrm2_v2 -#define cublasDnrm2 cublasDnrm2_v2 -#define cublasScnrm2 cublasScnrm2_v2 -#define cublasDznrm2 cublasDznrm2_v2 - -#define cublasSdot cublasSdot_v2 -#define cublasDdot cublasDdot_v2 -#define cublasCdotu cublasCdotu_v2 -#define cublasCdotc cublasCdotc_v2 -#define cublasZdotu cublasZdotu_v2 -#define cublasZdotc cublasZdotc_v2 - -#define cublasSscal cublasSscal_v2 -#define cublasDscal cublasDscal_v2 -#define cublasCscal cublasCscal_v2 -#define cublasCsscal cublasCsscal_v2 -#define cublasZscal cublasZscal_v2 -#define cublasZdscal cublasZdscal_v2 - -#define cublasSaxpy cublasSaxpy_v2 -#define cublasDaxpy cublasDaxpy_v2 -#define cublasCaxpy cublasCaxpy_v2 -#define cublasZaxpy cublasZaxpy_v2 - -#define cublasScopy cublasScopy_v2 -#define cublasDcopy cublasDcopy_v2 -#define cublasCcopy cublasCcopy_v2 -#define cublasZcopy cublasZcopy_v2 - -#define cublasSswap cublasSswap_v2 -#define cublasDswap cublasDswap_v2 -#define cublasCswap cublasCswap_v2 -#define cublasZswap cublasZswap_v2 - -#define cublasIsamax cublasIsamax_v2 -#define cublasIdamax cublasIdamax_v2 -#define cublasIcamax cublasIcamax_v2 -#define cublasIzamax cublasIzamax_v2 - -#define cublasIsamin cublasIsamin_v2 -#define cublasIdamin cublasIdamin_v2 -#define cublasIcamin cublasIcamin_v2 -#define cublasIzamin cublasIzamin_v2 - -#define cublasSasum cublasSasum_v2 -#define cublasDasum cublasDasum_v2 -#define cublasScasum cublasScasum_v2 -#define cublasDzasum cublasDzasum_v2 - -#define cublasSrot cublasSrot_v2 -#define cublasDrot cublasDrot_v2 -#define cublasCrot cublasCrot_v2 -#define cublasCsrot cublasCsrot_v2 -#define cublasZrot cublasZrot_v2 -#define cublasZdrot cublasZdrot_v2 - -#define cublasSrotg cublasSrotg_v2 -#define cublasDrotg cublasDrotg_v2 -#define cublasCrotg cublasCrotg_v2 -#define cublasZrotg cublasZrotg_v2 - -#define cublasSrotm cublasSrotm_v2 -#define cublasDrotm cublasDrotm_v2 - -#define cublasSrotmg cublasSrotmg_v2 -#define cublasDrotmg cublasDrotmg_v2 - - -/* Blas2 Routines */ - -#define cublasSgemv cublasSgemv_v2 -#define cublasDgemv cublasDgemv_v2 -#define cublasCgemv cublasCgemv_v2 -#define cublasZgemv cublasZgemv_v2 - -#define cublasSgbmv cublasSgbmv_v2 -#define cublasDgbmv cublasDgbmv_v2 -#define cublasCgbmv cublasCgbmv_v2 -#define cublasZgbmv cublasZgbmv_v2 - -#define cublasStrmv cublasStrmv_v2 -#define cublasDtrmv cublasDtrmv_v2 -#define cublasCtrmv cublasCtrmv_v2 -#define cublasZtrmv cublasZtrmv_v2 - -#define cublasStbmv cublasStbmv_v2 -#define cublasDtbmv cublasDtbmv_v2 -#define cublasCtbmv cublasCtbmv_v2 -#define cublasZtbmv cublasZtbmv_v2 - -#define cublasStpmv cublasStpmv_v2 -#define cublasDtpmv cublasDtpmv_v2 -#define cublasCtpmv cublasCtpmv_v2 -#define cublasZtpmv cublasZtpmv_v2 - -#define cublasStrsv cublasStrsv_v2 -#define cublasDtrsv cublasDtrsv_v2 -#define cublasCtrsv cublasCtrsv_v2 -#define cublasZtrsv cublasZtrsv_v2 - -#define cublasStpsv cublasStpsv_v2 -#define cublasDtpsv cublasDtpsv_v2 -#define cublasCtpsv cublasCtpsv_v2 -#define 
cublasZtpsv cublasZtpsv_v2 - -#define cublasStbsv cublasStbsv_v2 -#define cublasDtbsv cublasDtbsv_v2 -#define cublasCtbsv cublasCtbsv_v2 -#define cublasZtbsv cublasZtbsv_v2 - -#define cublasSsymv cublasSsymv_v2 -#define cublasDsymv cublasDsymv_v2 -#define cublasCsymv cublasCsymv_v2 -#define cublasZsymv cublasZsymv_v2 -#define cublasChemv cublasChemv_v2 -#define cublasZhemv cublasZhemv_v2 - -#define cublasSsbmv cublasSsbmv_v2 -#define cublasDsbmv cublasDsbmv_v2 -#define cublasChbmv cublasChbmv_v2 -#define cublasZhbmv cublasZhbmv_v2 - -#define cublasSspmv cublasSspmv_v2 -#define cublasDspmv cublasDspmv_v2 -#define cublasChpmv cublasChpmv_v2 -#define cublasZhpmv cublasZhpmv_v2 - - -#define cublasSger cublasSger_v2 -#define cublasDger cublasDger_v2 -#define cublasCgeru cublasCgeru_v2 -#define cublasCgerc cublasCgerc_v2 -#define cublasZgeru cublasZgeru_v2 -#define cublasZgerc cublasZgerc_v2 - -#define cublasSsyr cublasSsyr_v2 -#define cublasDsyr cublasDsyr_v2 -#define cublasCsyr cublasCsyr_v2 -#define cublasZsyr cublasZsyr_v2 -#define cublasCher cublasCher_v2 -#define cublasZher cublasZher_v2 - -#define cublasSspr cublasSspr_v2 -#define cublasDspr cublasDspr_v2 -#define cublasChpr cublasChpr_v2 -#define cublasZhpr cublasZhpr_v2 - -#define cublasSsyr2 cublasSsyr2_v2 -#define cublasDsyr2 cublasDsyr2_v2 -#define cublasCsyr2 cublasCsyr2_v2 -#define cublasZsyr2 cublasZsyr2_v2 -#define cublasCher2 cublasCher2_v2 -#define cublasZher2 cublasZher2_v2 - -#define cublasSspr2 cublasSspr2_v2 -#define cublasDspr2 cublasDspr2_v2 -#define cublasChpr2 cublasChpr2_v2 -#define cublasZhpr2 cublasZhpr2_v2 - -/* Blas3 Routines */ - -#define cublasSgemm cublasSgemm_v2 -#define cublasDgemm cublasDgemm_v2 -#define cublasCgemm cublasCgemm_v2 -#define cublasZgemm cublasZgemm_v2 - -#define cublasSsyrk cublasSsyrk_v2 -#define cublasDsyrk cublasDsyrk_v2 -#define cublasCsyrk cublasCsyrk_v2 -#define cublasZsyrk cublasZsyrk_v2 -#define cublasCherk cublasCherk_v2 -#define cublasZherk cublasZherk_v2 - -#define cublasSsyr2k cublasSsyr2k_v2 -#define cublasDsyr2k cublasDsyr2k_v2 -#define cublasCsyr2k cublasCsyr2k_v2 -#define cublasZsyr2k cublasZsyr2k_v2 -#define cublasCher2k cublasCher2k_v2 -#define cublasZher2k cublasZher2k_v2 - -#define cublasSsymm cublasSsymm_v2 -#define cublasDsymm cublasDsymm_v2 -#define cublasCsymm cublasCsymm_v2 -#define cublasZsymm cublasZsymm_v2 -#define cublasChemm cublasChemm_v2 -#define cublasZhemm cublasZhemm_v2 - -#define cublasStrsm cublasStrsm_v2 -#define cublasDtrsm cublasDtrsm_v2 -#define cublasCtrsm cublasCtrsm_v2 -#define cublasZtrsm cublasZtrsm_v2 - -#define cublasStrmm cublasStrmm_v2 -#define cublasDtrmm cublasDtrmm_v2 -#define cublasCtrmm cublasCtrmm_v2 -#define cublasZtrmm cublasZtrmm_v2 - -#endif /* !defined(CUBLAS_V2_H_) */ diff --git a/include/triton/external/CUDA/cuda_device_runtime_api.h b/include/triton/external/CUDA/cuda_device_runtime_api.h deleted file mode 100755 index 44de3afc3..000000000 --- a/include/triton/external/CUDA/cuda_device_runtime_api.h +++ /dev/null @@ -1,248 +0,0 @@ -/* - * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. 
- * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. 
- */ - -#if !defined(__CUDA_DEVICE_RUNTIME_API_H__) -#define __CUDA_DEVICE_RUNTIME_API_H__ - -/******************************************************************************* -* * -* * -* * -*******************************************************************************/ - -#if !defined(__CUDACC_RTC__) - -#if (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__) - -#if defined(__cplusplus) -extern "C" { -#endif - -struct cudaFuncAttributes; - -#if defined(_WIN32) -#define __NV_WEAK__ __declspec(nv_weak) -#else -#define __NV_WEAK__ __attribute__((nv_weak)) -#endif - -__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaMalloc(void **p, size_t s) -{ - return cudaErrorUnknown; -} - -__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c) -{ - return cudaErrorUnknown; -} - -__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device) -{ - return cudaErrorUnknown; -} - -__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaGetDevice(int *device) -{ - return cudaErrorUnknown; -} - -__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize) -{ - return cudaErrorUnknown; -} - -__device__ __NV_WEAK__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags) -{ - return cudaErrorUnknown; -} - -#undef __NV_WEAK__ - -#if defined(__cplusplus) -} -#endif - -#endif /* (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__) */ - -#endif /* !defined(__CUDACC_RTC__) */ - -#if defined(__cplusplus) && defined(__CUDACC__) /* Visible to nvcc front-end only */ -#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350) // Visible to SM>=3.5 and "__host__ __device__" only - -#include "driver_types.h" -#include "host_defines.h" - -extern "C" -{ -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceSynchronize(void); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void); -extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error); -extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI 
cudaStreamWaitEvent_ptsz(cudaStream_t stream, cudaEvent_t event, unsigned int flags); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord_ptsz(cudaEvent_t event, cudaStream_t stream); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync_ptsz(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync_ptsz(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync_ptsz(const struct cudaMemcpy3DParms *p, cudaStream_t stream); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync_ptsz(void *devPtr, int value, size_t count, cudaStream_t stream); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync_ptsz(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync_ptsz(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream); -extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion); - -/** - * \ingroup CUDART_EXECUTION - * \brief Obtains a parameter buffer - * - * Obtains a parameter buffer which can be filled with parameters for a kernel launch. - * Parameters passed to ::cudaLaunchDevice must be allocated via this function. - * - * This is a low level API and can only be accessed from Parallel Thread Execution (PTX). - * CUDA user code should use <<< >>> to launch kernels. 
- *
- * \param alignment - Specifies alignment requirement of the parameter buffer
- * \param size - Specifies size requirement in bytes
- *
- * \return
- * Returns pointer to the allocated parameterBuffer
- * \notefnerr
- *
- * \sa cudaLaunchDevice
- */
-extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBuffer(size_t alignment, size_t size);
-
-/**
- * \ingroup CUDART_EXECUTION
- * \brief Launches a specified kernel
- *
- * Launches a specified kernel with the specified parameter buffer. A parameter buffer can be obtained
- * by calling ::cudaGetParameterBuffer().
- *
- * This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
- * CUDA user code should use <<< >>> to launch the kernels.
- *
- * \param func - Pointer to the kernel to be launched
- * \param parameterBuffer - Holds the parameters to the launched kernel. parameterBuffer can be NULL. (Optional)
- * \param gridDimension - Specifies grid dimensions
- * \param blockDimension - Specifies block dimensions
- * \param sharedMemSize - Specifies size of shared memory
- * \param stream - Specifies the stream to be used
- *
- * \return
- * ::cudaSuccess, ::cudaErrorInvalidDevice, ::cudaErrorLaunchMaxDepthExceeded, ::cudaErrorInvalidConfiguration,
- * ::cudaErrorStartupFailure, ::cudaErrorLaunchPendingCountExceeded, ::cudaErrorLaunchOutOfResources
- * \notefnerr
- * \n Please refer to Execution Configuration and Parameter Buffer Layout from the CUDA Programming
- * Guide for the detailed descriptions of launch configuration and parameter layout respectively.
- *
- * \sa cudaGetParameterBuffer
- */
-extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBufferV2(void *func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize);
-extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice_ptsz(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
-extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2_ptsz(void *parameterBuffer, cudaStream_t stream);
-
-#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) && defined(__CUDA_ARCH__)
-  // When compiling for the device and per thread default stream is enabled, add
-  // a static inline redirect to the per thread stream entry points.
-
-  static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
-  cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream)
-  {
-    return cudaLaunchDevice_ptsz(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream);
-  }
-
-  static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
-  cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream)
-  {
-    return cudaLaunchDeviceV2_ptsz(parameterBuffer, stream);
-  }
-#else
-  extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
-  extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream);
-#endif
-
-extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize);
-extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
-
-extern __device__ __cudart_builtin__ unsigned long long CUDARTAPI cudaCGGetIntrinsicHandle(enum cudaCGScope scope);
-extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGSynchronize(unsigned long long handle, unsigned int flags);
-extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetSize(unsigned int *numThreads, unsigned int *numGrids, unsigned long long handle);
-extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetRank(unsigned int *threadRank, unsigned int *gridRank, unsigned long long handle);
-}
-
-template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaMalloc(T **devPtr, size_t size);
-template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *attr, T *entry);
-template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize);
-template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
-
-
-#endif // !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)
-#endif /* defined(__cplusplus) && defined(__CUDACC__) */
-
-#endif /* !__CUDA_DEVICE_RUNTIME_API_H__ */
diff --git a/include/triton/external/CUDA/cuda_fp16.h b/include/triton/external/CUDA/cuda_fp16.h
deleted file mode 100755
index b724f1e9d..000000000
--- a/include/triton/external/CUDA/cuda_fp16.h
+++ /dev/null
@@ -1,1969 +0,0 @@
-/*
-* Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
-*
-* NOTICE TO LICENSEE:
-*
-* This source code and/or documentation ("Licensed Deliverables") are
-* subject to NVIDIA intellectual property rights under U.S. and
-* international Copyright laws.
-*
-* These Licensed Deliverables contained herein is PROPRIETARY and
-* CONFIDENTIAL to NVIDIA and is being provided under the terms and
-* conditions of a form of NVIDIA software license agreement by and
-* between NVIDIA and Licensee ("License Agreement") or electronically
-* accepted by Licensee.
Notwithstanding any terms or conditions to -* the contrary in the License Agreement, reproduction or disclosure -* of the Licensed Deliverables to any third party without the express -* written consent of NVIDIA is prohibited. -* -* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE -* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE -* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS -* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. -* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED -* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, -* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. -* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE -* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY -* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY -* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, -* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS -* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE -* OF THESE LICENSED DELIVERABLES. -* -* U.S. Government End Users. These Licensed Deliverables are a -* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT -* 1995), consisting of "commercial computer software" and "commercial -* computer software documentation" as such terms are used in 48 -* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government -* only as a commercial end item. Consistent with 48 C.F.R.12.212 and -* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all -* U.S. Government End Users acquire the Licensed Deliverables with -* only those rights set forth herein. -* -* Any use of the Licensed Deliverables in individual and commercial -* software must include, in the user documentation and internal -* comments to the code, the above Disclaimer and U.S. Government End -* Users Notice. -*/ - -/** -* \defgroup CUDA_MATH_INTRINSIC_HALF Half Precision Intrinsics -* This section describes half precision intrinsic functions that are -* only supported in device code. 
-*/ - -/** -* \defgroup CUDA_MATH__HALF_ARITHMETIC Half Arithmetic Functions -* \ingroup CUDA_MATH_INTRINSIC_HALF -*/ - -/** -* \defgroup CUDA_MATH__HALF2_ARITHMETIC Half2 Arithmetic Functions -* \ingroup CUDA_MATH_INTRINSIC_HALF -*/ - -/** -* \defgroup CUDA_MATH__HALF_COMPARISON Half Comparison Functions -* \ingroup CUDA_MATH_INTRINSIC_HALF -*/ - -/** -* \defgroup CUDA_MATH__HALF2_COMPARISON Half2 Comparison Functions -* \ingroup CUDA_MATH_INTRINSIC_HALF -*/ - -/** -* \defgroup CUDA_MATH__HALF_MISC Half Precision Conversion And Data Movement -* \ingroup CUDA_MATH_INTRINSIC_HALF -*/ - -/** -* \defgroup CUDA_MATH__HALF_FUNCTIONS Half Math Functions -* \ingroup CUDA_MATH_INTRINSIC_HALF -*/ - -/** -* \defgroup CUDA_MATH__HALF2_FUNCTIONS Half2 Math Functions -* \ingroup CUDA_MATH_INTRINSIC_HALF -*/ - -#ifndef __CUDA_FP16_H__ -#define __CUDA_FP16_H__ - -#if defined(__cplusplus) && defined(__CUDACC__) - -#if defined(__CUDACC_RTC__) -#define __CUDA_FP16_DECL__ __host__ __device__ -#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__ -#else /* !__CUDACC_RTC__ */ -#define __CUDA_FP16_DECL__ static __device__ __inline__ -#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__ -#endif /* __CUDACC_RTC__ */ - -#define __CUDA_FP16_TYPES_EXIST__ -/* Forward-declaration of structures defined in "cuda_fp16.hpp" */ -struct __half; -struct __half2; - -/* Vector type creation functions, match vector_functions.h */ -__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y); - -#undef __VECTOR_FUNCTIONS_DECL__ - -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Converts float number to half precision in round-to-nearest-even mode -* and returns \p half with converted value. -* -* Converts float number \p a to half precision in round-to-nearest-even mode. -* -* \return Returns \p half result with converted value. -*/ -__CUDA_FP16_DECL__ __half __float2half(const float a); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Converts float number to half precision in round-to-nearest-even mode -* and returns \p half with converted value. -* -* Converts float number \p a to half precision in round-to-nearest-even mode. -* -* \return Returns \p half result with converted value. -*/ -__CUDA_FP16_DECL__ __half __float2half_rn(const float a); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Converts float number to half precision in round-towards-zero mode -* and returns \p half with converted value. -* -* Converts float number \p a to half precision in round-towards-zero mode. -* -* \return Returns \p half result with converted value. -*/ -__CUDA_FP16_DECL__ __half __float2half_rz(const float a); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Converts float number to half precision in round-down mode -* and returns \p half with converted value. -* -* Converts float number \p a to half precision in round-down mode. -* -* \return Returns \p half result with converted value. -*/ -__CUDA_FP16_DECL__ __half __float2half_rd(const float a); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Converts float number to half precision in round-up mode -* and returns \p half with converted value. -* -* Converts float number \p a to half precision in round-up mode. -* -* \return Returns \p half result with converted value. -*/ -__CUDA_FP16_DECL__ __half __float2half_ru(const float a); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Converts \p half number to float. -* -* Converts half number \p a to float. -* -* \return Returns float result with converted value. 
-*/ -__CUDA_FP16_DECL__ float __half2float(const __half a); - -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a half to a signed integer in round-to-nearest-even mode. -* -* Convert the half-precision floating point value \p h to a signed integer in -* round-to-nearest-even mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ int __half2int_rn(__half h); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a half to a signed integer in round-towards-zero mode. -* -* Convert the half-precision floating point value \p h to a signed integer in -* round-towards-zero mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ int __half2int_rz(__half h); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a half to a signed integer in round-down mode. -* -* Convert the half-precision floating point value \p h to a signed integer in -* round-down mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ int __half2int_rd(__half h); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a half to a signed integer in round-up mode. -* -* Convert the half-precision floating point value \p h to a signed integer in -* round-up mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ int __half2int_ru(__half h); - -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a signed integer to a half in round-to-nearest-even mode. -* -* Convert the signed integer value \p i to a half-precision floating point -* value in round-to-nearest-even mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ __half __int2half_rn(int i); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a signed integer to a half in round-towards-zero mode. -* -* Convert the signed integer value \p i to a half-precision floating point -* value in round-towards-zero mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ __half __int2half_rz(int i); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a signed integer to a half in round-down mode. -* -* Convert the signed integer value \p i to a half-precision floating point -* value in round-down mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ __half __int2half_rd(int i); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a signed integer to a half in round-up mode. -* -* Convert the signed integer value \p i to a half-precision floating point -* value in round-up mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ __half __int2half_ru(int i); - -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a half to a signed short integer in round-to-nearest-even -* mode. -* -* Convert the half-precision floating point value \p h to a signed short -* integer in round-to-nearest-even mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ short int __half2short_rn(__half h); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a half to a signed short integer in round-towards-zero mode. -* -* Convert the half-precision floating point value \p h to a signed short -* integer in round-towards-zero mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ short int __half2short_rz(__half h); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a half to a signed short integer in round-down mode. -* -* Convert the half-precision floating point value \p h to a signed short -* integer in round-down mode. -* -* \return Returns converted value. 
-*/ -__CUDA_FP16_DECL__ short int __half2short_rd(__half h); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a half to a signed short integer in round-up mode. -* -* Convert the half-precision floating point value \p h to a signed short -* integer in round-up mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ short int __half2short_ru(__half h); - -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a signed short integer to a half in round-to-nearest-even -* mode. -* -* Convert the signed short integer value \p i to a half-precision floating -* point value in round-to-nearest-even mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ __half __short2half_rn(short int i); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a signed short integer to a half in round-towards-zero mode. -* -* Convert the signed short integer value \p i to a half-precision floating -* point value in round-towards-zero mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ __half __short2half_rz(short int i); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a signed short integer to a half in round-down mode. -* -* Convert the signed short integer value \p i to a half-precision floating -* point value in round-down mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ __half __short2half_rd(short int i); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a signed short integer to a half in round-up mode. -* -* Convert the signed short integer value \p i to a half-precision floating -* point value in round-up mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ __half __short2half_ru(short int i); - -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a half to an unsigned integer in round-to-nearest-even mode. -* -* Convert the half-precision floating point value \p h to an unsigned integer -* in round-to-nearest-even mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ unsigned int __half2uint_rn(__half h); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a half to an unsigned integer in round-towards-zero mode. -* -* Convert the half-precision floating point value \p h to an unsigned integer -* in round-towards-zero mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ unsigned int __half2uint_rz(__half h); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a half to an unsigned integer in round-down mode. -* -* Convert the half-precision floating point value \p h to an unsigned integer -* in round-down mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ unsigned int __half2uint_rd(__half h); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a half to an unsigned integer in round-up mode. -* -* Convert the half-precision floating point value \p h to an unsigned integer -* in round-up mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ unsigned int __half2uint_ru(__half h); - -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert an unsigned integer to a half in round-to-nearest-even mode. -* -* Convert the unsigned integer value \p i to a half-precision floating point -* value in round-to-nearest-even mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ __half __uint2half_rn(unsigned int i); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert an unsigned integer to a half in round-towards-zero mode. 
-* -* Convert the unsigned integer value \p i to a half-precision floating point -* value in round-towards-zero mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ __half __uint2half_rz(unsigned int i); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert an unsigned integer to a half in round-down mode. -* -* Convert the unsigned integer value \p i to a half-precision floating point -* value in round-down mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ __half __uint2half_rd(unsigned int i); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert an unsigned integer to a half in round-up mode. -* -* Convert the unsigned integer value \p i to a half-precision floating point -* value in round-up mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ __half __uint2half_ru(unsigned int i); - -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a half to an unsigned short integer in round-to-nearest-even -* mode. -* -* Convert the half-precision floating point value \p h to an unsigned short -* integer in round-to-nearest-even mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(__half h); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a half to an unsigned short integer in round-towards-zero -* mode. -* -* Convert the half-precision floating point value \p h to an unsigned short -* integer in round-towards-zero mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ unsigned short int __half2ushort_rz(__half h); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a half to an unsigned short integer in round-down mode. -* -* Convert the half-precision floating point value \p h to an unsigned short -* integer in round-down mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(__half h); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a half to an unsigned short integer in round-up mode. -* -* Convert the half-precision floating point value \p h to an unsigned short -* integer in round-up mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(__half h); - -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert an unsigned short integer to a half in round-to-nearest-even -* mode. -* -* Convert the unsigned short integer value \p i to a half-precision floating -* point value in round-to-nearest-even mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ __half __ushort2half_rn(unsigned short int i); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert an unsigned short integer to a half in round-towards-zero -* mode. -* -* Convert the unsigned short integer value \p i to a half-precision floating -* point value in round-towards-zero mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ __half __ushort2half_rz(unsigned short int i); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert an unsigned short integer to a half in round-down mode. -* -* Convert the unsigned short integer value \p i to a half-precision floating -* point value in round-down mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ __half __ushort2half_rd(unsigned short int i); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert an unsigned short integer to a half in round-up mode. -* -* Convert the unsigned short integer value \p i to a half-precision floating -* point value in round-up mode. -* -* \return Returns converted value. 
-*/ -__CUDA_FP16_DECL__ __half __ushort2half_ru(unsigned short int i); - -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a half to an unsigned 64-bit integer in round-to-nearest-even -* mode. -* -* Convert the half-precision floating point value \p h to an unsigned 64-bit -* integer in round-to-nearest-even mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(__half h); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a half to an unsigned 64-bit integer in round-towards-zero -* mode. -* -* Convert the half-precision floating point value \p h to an unsigned 64-bit -* integer in round-towards-zero mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ unsigned long long int __half2ull_rz(__half h); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a half to an unsigned 64-bit integer in round-down mode. -* -* Convert the half-precision floating point value \p h to an unsigned 64-bit -* integer in round-down mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(__half h); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a half to an unsigned 64-bit integer in round-up mode. -* -* Convert the half-precision floating point value \p h to an unsigned 64-bit -* integer in round-up mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(__half h); - -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert an unsigned 64-bit integer to a half in round-to-nearest-even -* mode. -* -* Convert the unsigned 64-bit integer value \p i to a half-precision floating -* point value in round-to-nearest-even mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ __half __ull2half_rn(unsigned long long int i); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert an unsigned 64-bit integer to a half in round-towards-zero -* mode. -* -* Convert the unsigned 64-bit integer value \p i to a half-precision floating -* point value in round-towards-zero mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ __half __ull2half_rz(unsigned long long int i); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert an unsigned 64-bit integer to a half in round-down mode. -* -* Convert the unsigned 64-bit integer value \p i to a half-precision floating -* point value in round-down mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ __half __ull2half_rd(unsigned long long int i); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert an unsigned 64-bit integer to a half in round-up mode. -* -* Convert the unsigned 64-bit integer value \p i to a half-precision floating -* point value in round-up mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ __half __ull2half_ru(unsigned long long int i); - -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a half to a signed 64-bit integer in round-to-nearest-even -* mode. -* -* Convert the half-precision floating point value \p h to a signed 64-bit -* integer in round-to-nearest-even mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ long long int __half2ll_rn(__half h); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a half to a signed 64-bit integer in round-towards-zero mode. -* -* Convert the half-precision floating point value \p h to a signed 64-bit -* integer in round-towards-zero mode. -* -* \return Returns converted value. 
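One practical caveat with the 64-bit conversions above: the round trip is lossy, since half carries far fewer significand bits than a 64-bit integer. A hedged sketch:

    __device__ unsigned long long ull_roundtrip_demo(__half h) {
        unsigned long long v = __half2ull_rz(h); // truncate toward zero
        __half back = __ull2half_rn(v);          // rounds to nearest-even;
        (void)back;                              // exact only for small integers
        return v;
    }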
-*/ -__CUDA_FP16_DECL__ long long int __half2ll_rz(__half h); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a half to a signed 64-bit integer in round-down mode. -* -* Convert the half-precision floating point value \p h to a signed 64-bit -* integer in round-down mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ long long int __half2ll_rd(__half h); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a half to a signed 64-bit integer in round-up mode. -* -* Convert the half-precision floating point value \p h to a signed 64-bit -* integer in round-up mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ long long int __half2ll_ru(__half h); - -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a signed 64-bit integer to a half in round-to-nearest-even -* mode. -* -* Convert the signed 64-bit integer value \p i to a half-precision floating -* point value in round-to-nearest-even mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ __half __ll2half_rn(long long int i); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a signed 64-bit integer to a half in round-towards-zero mode. -* -* Convert the signed 64-bit integer value \p i to a half-precision floating -* point value in round-towards-zero mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ __half __ll2half_rz(long long int i); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a signed 64-bit integer to a half in round-down mode. -* -* Convert the signed 64-bit integer value \p i to a half-precision floating -* point value in round-down mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ __half __ll2half_rd(long long int i); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Convert a signed 64-bit integer to a half in round-up mode. -* -* Convert the signed 64-bit integer value \p i to a half-precision floating -* point value in round-up mode. -* -* \return Returns converted value. -*/ -__CUDA_FP16_DECL__ __half __ll2half_ru(long long int i); - -/** -* \ingroup CUDA_MATH__HALF_FUNCTIONS -* \brief Truncate input argument to the integral part. -* -* Round \p h to the nearest integer value that does not exceed \p h in -* magnitude. -* -* \return Returns truncated integer value. -*/ -__CUDA_FP16_DECL__ __half htrunc(const __half h); -/** -* \ingroup CUDA_MATH__HALF_FUNCTIONS -* \brief Calculate ceiling of the input argument. -* -* Compute the smallest integer value not less than \p h. -* -* \return Returns ceiling expressed as a half-precision floating point number. -*/ -__CUDA_FP16_DECL__ __half hceil(const __half h); -/** -* \ingroup CUDA_MATH__HALF_FUNCTIONS -* \brief Calculate the largest integer less than or equal to \p h. -* -* Calculate the largest integer value which is less than or equal to \p h. -* -* \return Returns floor expressed as half-precision floating point number. -*/ -__CUDA_FP16_DECL__ __half hfloor(const __half h); -/** -* \ingroup CUDA_MATH__HALF_FUNCTIONS -* \brief Round input to nearest integer value in half-precision floating point -* number. -* -* Round \p h to the nearest integer value in half-precision floating point -* format, with halfway cases rounded to the nearest even integer value. -* -* \return Returns rounded integer value expressed as half-precision floating -* point number. -*/ -__CUDA_FP16_DECL__ __half hrint(const __half h); - -/** -* \ingroup CUDA_MATH__HALF2_FUNCTIONS -* \brief Truncate \p half2 vector input argument to the integral part. 
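The four half-precision rounding functions above map onto the usual trunc/ceil/floor/rint family; for example (illustrative helper name):

    __device__ void rounding_demo(__half h) {
        __half t = htrunc(h); // toward zero
        __half c = hceil(h);  // smallest integral value >= h
        __half f = hfloor(h); // largest integral value <= h
        __half r = hrint(h);  // nearest integral value, ties to even
        (void)t; (void)c; (void)f; (void)r;
    }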
-* -* Round each component of vector \p h to the nearest integer value that does -* not exceed \p h in magnitude. -* -* \return Returns \p half2 vector truncated integer value. -*/ -__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h); -/** -* \ingroup CUDA_MATH__HALF2_FUNCTIONS -* \brief Calculate \p half2 vector ceiling of the input argument. -* -* For each component of vector \p h compute the smallest integer value not less -* than \p h. -* -* \return Returns \p half2 vector ceiling expressed as a pair of half-precision -* floating point numbers. -*/ -__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h); -/** -* \ingroup CUDA_MATH__HALF2_FUNCTIONS -* \brief Calculate the largest integer less than or equal to \p h. -* -* For each component of vector \p h calculate the largest integer value which -* is less than or equal to \p h. -* -* \return Returns \p half2 vector floor expressed as a pair of half-precision -* floating point number. -*/ -__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h); -/** -* \ingroup CUDA_MATH__HALF2_FUNCTIONS -* \brief Round input to nearest integer value in half-precision floating point -* number. -* -* Round each component of \p half2 vector \p h to the nearest integer value in -* half-precision floating point format, with halfway cases rounded to the -* nearest even integer value. -* -* \return Returns \p half2 vector of rounded integer values expressed as -* half-precision floating point numbers. -*/ -__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h); - -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Converts input to half precision in round-to-nearest-even mode and -* populates both halves of \p half2 with converted value. -* -* Converts input \p a to half precision in round-to-nearest-even mode and -* populates both halves of \p half2 with converted value. -* -* \return Returns \p half2 with both halves equal to the converted half -* precision number. -*/ -__CUDA_FP16_DECL__ __half2 __float2half2_rn(const float a); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Converts both input floats to half precision in round-to-nearest-even -* mode and returns \p half2 with converted values. -* -* Converts both input floats to half precision in round-to-nearest-even mode -* and combines the results into one \p half2 number. Low 16 bits of the return -* value correspond to the input \p a, high 16 bits correspond to the input \p -* b. -* -* \return Returns \p half2 which has corresponding halves equal to the -* converted input floats. -*/ -__CUDA_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Converts both components of float2 number to half precision in -* round-to-nearest-even mode and returns \p half2 with converted values. -* -* Converts both components of float2 to half precision in round-to-nearest -* mode and combines the results into one \p half2 number. Low 16 bits of the -* return value correspond to \p a.x and high 16 bits of the return value -* correspond to \p a.y. -* -* \return Returns \p half2 which has corresponding halves equal to the -* converted float2 components. -*/ -__CUDA_FP16_DECL__ __half2 __float22half2_rn(const float2 a); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Converts both halves of \p half2 to float2 and returns the result. -* -* Converts both halves of \p half2 input \p a to float2 and returns the -* result. -* -* \return Returns converted float2. 
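Packing and unpacking float pairs is the idiomatic way to feed the half2 vector operations; a sketch (kernel name hypothetical; __half22float2 is declared immediately below):

    __global__ void pack_demo(const float2 *in, float2 *out, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            __half2 h = __float22half2_rn(in[i]); // .x -> low half, .y -> high half
            h = h2floor(h);                       // elementwise floor on both lanes
            out[i] = __half22float2(h);
        }
    }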
-*/ -__CUDA_FP16_DECL__ float2 __half22float2(const __half2 a); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Converts low 16 bits of \p half2 to float and returns the result -* -* Converts low 16 bits of \p half2 input \p a to 32 bit floating point number -* and returns the result. -* -* \return Returns low 16 bits of \p a converted to float. -*/ -__CUDA_FP16_DECL__ float __low2float(const __half2 a); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Returns \p half2 with both halves equal to the input value. -* -* Returns \p half2 number with both halves equal to the input \p a \p half -* number. -* -* \return Returns \p half2 with both halves equal to the input \p a. -*/ -__CUDA_FP16_DECL__ __half2 __half2half2(const __half a); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Converts high 16 bits of \p half2 to float and returns the result -* -* Converts high 16 bits of \p half2 input \p a to 32 bit floating point number -* and returns the result. -* -* \return Returns high 16 bits of \p a converted to float. -*/ -__CUDA_FP16_DECL__ float __high2float(const __half2 a); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Swaps both halves of the \p half2 input. -* -* Swaps both halves of the \p half2 input and returns a new \p half2 number -* with swapped halves. -* -* \return Returns \p half2 with halves swapped. -*/ -__CUDA_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Extracts low 16 bits from each of the two \p half2 inputs and combines -* into one \p half2 number. -* -* Extracts low 16 bits from each of the two \p half2 inputs and combines into -* one \p half2 number. Low 16 bits from input \p a is stored in low 16 bits of -* the return value, low 16 bits from input \p b is stored in high 16 bits of -* the return value. -* -* \return Returns \p half2 which contains low 16 bits from \p a and \p b. -*/ -__CUDA_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Extracts high 16 bits from each of the two \p half2 inputs and -* combines into one \p half2 number. -* -* Extracts high 16 bits from each of the two \p half2 inputs and combines into -* one \p half2 number. High 16 bits from input \p a is stored in low 16 bits of -* the return value, high 16 bits from input \p b is stored in high 16 bits of -* the return value. -* -* \return Returns \p half2 which contains high 16 bits from \p a and \p b. -*/ -__CUDA_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Returns high 16 bits of \p half2 input. -* -* Returns high 16 bits of \p half2 input \p a. -* -* \return Returns \p half which contains high 16 bits of the input. -*/ -__CUDA_FP16_DECL__ __half __high2half(const __half2 a); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Returns low 16 bits of \p half2 input. -* -* Returns low 16 bits of \p half2 input \p a. -* -* \return Returns \p half which contains low 16 bits of the input. -*/ -__CUDA_FP16_DECL__ __half __low2half(const __half2 a); -/** -* \ingroup CUDA_MATH__HALF_COMPARISON -* \brief Checks if the input \p half number is infinite. -* -* Checks if the input \p half number \p a is infinite. -* -* \return Returns -1 iff \p a is equal to negative infinity, 1 iff \p a is -* equal to positive infinity and 0 otherwise. -*/ -__CUDA_FP16_DECL__ int __hisinf(const __half a); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Combines two \p half numbers into one \p half2 number. 
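Lane extraction and shuffling within a single __half2 can be composed from the helpers above; two small sketches (names made up):

    __device__ float sum_lanes(__half2 v) {
        // Widen each 16-bit lane to float and add; handy when a half2
        // holds a pair of partial results.
        return __low2float(v) + __high2float(v);
    }

    __device__ __half2 swap_lanes(__half2 v) {
        return __lowhigh2highlow(v); // {lo, hi} -> {hi, lo}
    }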
-* -* Combines two input \p half number \p a and \p b into one \p half2 number. -* Input \p a is stored in low 16 bits of the return value, input \p b is stored -* in high 16 bits of the return value. -* -* \return Returns \p half2 number which has one half equal to \p a and the -* other to \p b. -*/ -__CUDA_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Extracts low 16 bits from \p half2 input. -* -* Extracts low 16 bits from \p half2 input \p a and returns a new \p half2 -* number which has both halves equal to the extracted bits. -* -* \return Returns \p half2 with both halves equal to low 16 bits from the -* input. -*/ -__CUDA_FP16_DECL__ __half2 __low2half2(const __half2 a); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Extracts high 16 bits from \p half2 input. -* -* Extracts high 16 bits from \p half2 input \p a and returns a new \p half2 -* number which has both halves equal to the extracted bits. -* -* \return Returns \p half2 with both halves equal to high 16 bits from the -* input. -*/ -__CUDA_FP16_DECL__ __half2 __high2half2(const __half2 a); - -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Reinterprets bits in a \p half as a signed short integer. -* -* Reinterprets the bits in the half-precision floating point value \p h -* as a signed short integer. -* -* \return Returns reinterpreted value. -*/ -__CUDA_FP16_DECL__ short int __half_as_short(const __half h); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Reinterprets bits in a \p half as an unsigned short integer. -* -* Reinterprets the bits in the half-precision floating point value \p h -* as an unsigned short integer. -* -* \return Returns reinterpreted value. -*/ -__CUDA_FP16_DECL__ unsigned short int __half_as_ushort(const __half h); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Reinterprets bits in a signed short integer as a \p half. -* -* Reinterprets the bits in the signed short integer value \p i as a -* half-precision floating point value. -* -* \return Returns reinterpreted value. -*/ -__CUDA_FP16_DECL__ __half __short_as_half(const short int i); -/** -* \ingroup CUDA_MATH__HALF_MISC -* \brief Reinterprets bits in an unsigned short integer as a \p half. -* -* Reinterprets the bits in the unsigned short integer value \p i as a -* half-precision floating point value. -* -* \return Returns reinterpreted value. -*/ -__CUDA_FP16_DECL__ __half __ushort_as_half(const unsigned short int i); - -#if __CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__) -#if !defined warpSize && !defined __local_warpSize -#define warpSize 32 -#define __local_warpSize -#endif - -#if defined(_WIN32) -# define __DEPRECATED__(msg) __declspec(deprecated(msg)) -#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__)))) -# define __DEPRECATED__(msg) __attribute__((deprecated)) -#else -# define __DEPRECATED__(msg) __attribute__((deprecated(msg))) -#endif - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 -#define __WSB_DEPRECATION_MESSAGE(x) #x"() is not valid on compute_70 and above, and should be replaced with "#x"_sync()." \ - "To continue using "#x"(), specify virtual architecture compute_60 when targeting sm_70 and above, for example, using the pair of compiler options: -arch=compute_60 -code=sm_70." -#else -#define __WSB_DEPRECATION_MESSAGE(x) #x"() is deprecated in favor of "#x"_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)." 
-#endif - -__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) __half2 __shfl(__half2 var, int delta, int width = warpSize); -__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) __half2 __shfl_up(__half2 var, unsigned int delta, int width = warpSize); -__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down))__half2 __shfl_down(__half2 var, unsigned int delta, int width = warpSize); -__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half2 __shfl_xor(__half2 var, int delta, int width = warpSize); -__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) __half __shfl(__half var, int delta, int width = warpSize); -__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) __half __shfl_up(__half var, unsigned int delta, int width = warpSize); -__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) __half __shfl_down(__half var, unsigned int delta, int width = warpSize); -__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half __shfl_xor(__half var, int delta, int width = warpSize); - -__CUDA_FP16_DECL__ __half2 __shfl_sync(unsigned mask, __half2 var, int delta, int width = warpSize); -__CUDA_FP16_DECL__ __half2 __shfl_up_sync(unsigned mask, __half2 var, unsigned int delta, int width = warpSize); -__CUDA_FP16_DECL__ __half2 __shfl_down_sync(unsigned mask, __half2 var, unsigned int delta, int width = warpSize); -__CUDA_FP16_DECL__ __half2 __shfl_xor_sync(unsigned mask, __half2 var, int delta, int width = warpSize); -__CUDA_FP16_DECL__ __half __shfl_sync(unsigned mask, __half var, int delta, int width = warpSize); -__CUDA_FP16_DECL__ __half __shfl_up_sync(unsigned mask, __half var, unsigned int delta, int width = warpSize); -__CUDA_FP16_DECL__ __half __shfl_down_sync(unsigned mask, __half var, unsigned int delta, int width = warpSize); -__CUDA_FP16_DECL__ __half __shfl_xor_sync(unsigned mask, __half var, int delta, int width = warpSize); - -#if defined(__local_warpSize) -#undef warpSize -#undef __local_warpSize -#endif -#endif /*__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__) */ - -#if defined(__cplusplus) && ( __CUDA_ARCH__ >=320 || !defined(__CUDA_ARCH__) ) -__CUDA_FP16_DECL__ __half2 __ldg(const __half2 *ptr); -__CUDA_FP16_DECL__ __half __ldg(const __half *ptr); -__CUDA_FP16_DECL__ __half2 __ldcg(const __half2 *ptr); -__CUDA_FP16_DECL__ __half __ldcg(const __half *ptr); -__CUDA_FP16_DECL__ __half2 __ldca(const __half2 *ptr); -__CUDA_FP16_DECL__ __half __ldca(const __half *ptr); -__CUDA_FP16_DECL__ __half2 __ldcs(const __half2 *ptr); -__CUDA_FP16_DECL__ __half __ldcs(const __half *ptr); -#endif /*defined(__cplusplus) && ( __CUDA_ARCH__ >=320 || !defined(__CUDA_ARCH__) )*/ - -#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) -/** -* \ingroup CUDA_MATH__HALF2_COMPARISON -* \brief Performs half2 vector if-equal comparison. -* -* Performs \p half2 vector if-equal comparison of inputs \p a and \p b. -* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate false results. -* -* \return Returns the \p half2 vector result of if-equal comparison of vectors -* \p a and \p b. -*/ -__CUDA_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_COMPARISON -* \brief Performs \p half2 vector not-equal comparison. -* -* Performs \p half2 vector not-equal comparison of inputs \p a and \p b. -* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. 
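Two common patterns built on the declarations above: sign-bit manipulation via the bit-reinterpretation intrinsics, and a warp-level reduction via the *_sync shuffles. Helper names are made up; __hadd2 is declared further below and requires sm_53+.

    // fp16 keeps its sign in bit 15, so abs() is a single bit-clear.
    __device__ __half half_abs(__half h) {
        return __ushort_as_half(__half_as_ushort(h) & 0x7FFFu);
    }

    // Sum a __half2 accumulator across a full warp; lane 0 ends up
    // holding the per-lane totals of all 32 threads.
    __device__ __half2 warp_sum(__half2 v) {
        for (int offset = 16; offset > 0; offset /= 2)
            v = __hadd2(v, __shfl_down_sync(0xffffffffu, v, offset));
        return v;
    }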
-* NaN inputs generate false results. -* -* \return Returns the \p half2 vector result of not-equal comparison of vectors -* \p a and \p b. -*/ -__CUDA_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_COMPARISON -* \brief Performs \p half2 vector less-equal comparison. -* -* Performs \p half2 vector less-equal comparison of inputs \p a and \p b. -* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate false results. -* -* \return Returns the \p half2 vector result of less-equal comparison of -* vectors \p a and \p b. -*/ -__CUDA_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_COMPARISON -* \brief Performs \p half2 vector greater-equal comparison. -* -* Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. -* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate false results. -* -* \return Returns the \p half2 vector result of greater-equal comparison of -* vectors \p a and \p b. -*/ -__CUDA_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_COMPARISON -* \brief Performs \p half2 vector less-than comparison. -* -* Performs \p half2 vector less-than comparison of inputs \p a and \p b. -* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate false results. -* -* \return Returns the \p half2 vector result of less-than comparison of vectors -* \p a and \p b. -*/ -__CUDA_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_COMPARISON -* \brief Performs \p half2 vector greater-than comparison. -* -* Performs \p half2 vector greater-than comparison of inputs \p a and \p b. -* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate false results. -* -* \return Returns the half2 vector result of greater-than comparison of vectors -* \p a and \p b. -*/ -__CUDA_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_COMPARISON -* \brief Performs \p half2 vector unordered if-equal comparison. -* -* Performs \p half2 vector if-equal comparison of inputs \p a and \p b. -* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate true results. -* -* \return Returns the \p half2 vector result of unordered if-equal comparison -* of vectors \p a and \p b. -*/ -__CUDA_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_COMPARISON -* \brief Performs \p half2 vector unordered not-equal comparison. -* -* Performs \p half2 vector not-equal comparison of inputs \p a and \p b. -* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate true results. -* -* \return Returns the \p half2 vector result of unordered not-equal comparison -* of vectors \p a and \p b. -*/ -__CUDA_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_COMPARISON -* \brief Performs \p half2 vector unordered less-equal comparison. -* -* Performs \p half2 vector less-equal comparison of inputs \p a and \p b. -* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate true results. -* -* \return Returns the \p half2 vector result of unordered less-equal comparison -* of vectors \p a and \p b. 
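Because these vector comparisons return 1.0/0.0 per lane rather than booleans, they compose directly with arithmetic; e.g. a two-lane ReLU (sketch; __hmul2 is declared a little further below, sm_53+, and NaN lanes stay NaN since NaN*0 is NaN):

    __device__ __half2 relu2(__half2 x) {
        __half2 zero = __float2half2_rn(0.0f);
        return __hmul2(x, __hgt2(x, zero)); // keep lanes where x > 0, zero the rest
    }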
-*/ -__CUDA_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_COMPARISON -* \brief Performs \p half2 vector unordered greater-equal comparison. -* -* Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. -* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate true results. -* -* \return Returns the \p half2 vector result of unordered greater-equal -* comparison of vectors \p a and \p b. -*/ -__CUDA_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_COMPARISON -* \brief Performs \p half2 vector unordered less-than comparison. -* -* Performs \p half2 vector less-than comparison of inputs \p a and \p b. -* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate true results. -* -* \return Returns the \p half2 vector result of unordered less-than comparison -* of vectors \p a and \p b. -*/ -__CUDA_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_COMPARISON -* \brief Performs \p half2 vector unordered greater-than comparison. -* -* Performs \p half2 vector greater-than comparison of inputs \p a and \p b. -* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. -* NaN inputs generate true results. -* -* \return Returns the \p half2 vector result of unordered greater-than -* comparison of vectors \p a and \p b. -*/ -__CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_COMPARISON -* \brief Determine whether \p half2 argument is a NaN. -* -* Determine whether each half of input \p half2 number \p a is a NaN. -* -* \return Returns \p half2 which has the corresponding \p half results set to -* 1.0 for true, or 0.0 for false. -*/ -__CUDA_FP16_DECL__ __half2 __hisnan2(const __half2 a); -/** -* \ingroup CUDA_MATH__HALF2_ARITHMETIC -* \brief Performs \p half2 vector addition in round-to-nearest-even mode. -* -* Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest -* mode. -* -* \return Returns the \p half2 vector result of adding vectors \p a and \p b. -*/ -__CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_ARITHMETIC -* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode. -* -* Subtracts \p half2 input vector \p b from input vector \p a in -* round-to-nearest-even mode. -* -* \return Returns the \p half2 vector result of subtraction vector \p b from \p -* a. -*/ -__CUDA_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_ARITHMETIC -* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode. -* -* Performs \p half2 vector multiplication of inputs \p a and \p b, in -* round-to-nearest-even mode. -* -* \return Returns the \p half2 vector result of multiplying vectors \p a and \p -* b. -*/ -__CUDA_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF_ARITHMETIC -* \brief Performs \p half2 vector division in round-to-nearest-even mode. -* -* Divides \p half2 input vector \p a by input vector \p b in round-to-nearest -* mode. -* -* \return Returns the \p half2 vector result of division \p a by \p b. 
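A small sketch of the vector arithmetic just introduced, processing two fp16 values per operation (hypothetical helper; each op rounds to nearest-even per lane):

    __device__ __half2 diff_of_squares(__half2 a, __half2 b) {
        return __hmul2(__hsub2(a, b), __hadd2(a, b)); // (a-b)*(a+b), both lanes
    }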
-*/ -__CUDA_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_ARITHMETIC -* \brief Performs \p half2 vector addition in round-to-nearest-even mode, with -* saturation to [0.0, 1.0]. -* -* Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest -* mode, and clamps the results to range [0.0, 1.0]. NaN results are flushed to -* +0.0. -* -* \return Returns the \p half2 vector result of adding vectors \p a and \p b -* with saturation. -*/ -__CUDA_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_ARITHMETIC -* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode, -* with saturation to [0.0, 1.0]. -* -* Subtracts \p half2 input vector \p b from input vector \p a in -* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN -* results are flushed to +0.0. -* -* \return Returns the \p half2 vector result of subtraction vector \p b from \p -* a with saturation. -*/ -__CUDA_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_ARITHMETIC -* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode, -* with saturation to [0.0, 1.0]. -* -* Performs \p half2 vector multiplication of inputs \p a and \p b, in -* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN -* results are flushed to +0.0. -* -* \return Returns the \p half2 vector result of multiplying vectors \p a and \p -* b with saturation. -*/ -__CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_ARITHMETIC -* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even -* mode. -* -* Performs \p half2 vector multiply on inputs \p a and \p b, -* then performs a \p half2 vector add of the result with \p c, -* rounding the result once in round-to-nearest-even mode. -* -* \return Returns the \p half2 vector result of the fused multiply-add -* operation on vectors \p a, \p b, and \p c. -*/ -__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c); -/** -* \ingroup CUDA_MATH__HALF2_ARITHMETIC -* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even -* mode, with saturation to [0.0, 1.0]. -* -* Performs \p half2 vector multiply on inputs \p a and \p b, -* then performs a \p half2 vector add of the result with \p c, -* rounding the result once in round-to-nearest-even mode, and clamps the -* results to range [0.0, 1.0]. NaN results are flushed to +0.0. -* -* \return Returns the \p half2 vector result of the fused multiply-add -* operation on vectors \p a, \p b, and \p c with saturation. -*/ -__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c); -/** -* \ingroup CUDA_MATH__HALF2_ARITHMETIC -* \brief Negates both halves of the input \p half2 number and returns the -* result. -* -* Negates both halves of the input \p half2 number \p a and returns the result. -* -* \return Returns \p half2 number with both halves negated. -*/ -__CUDA_FP16_DECL__ __half2 __hneg2(const __half2 a); -/** -* \ingroup CUDA_MATH__HALF_ARITHMETIC -* \brief Performs \p half addition in round-to-nearest-even mode. -* -* Performs \p half addition of inputs \p a and \p b, in round-to-nearest-even -* mode. -* -* \return Returns the \p half result of adding \p a and \p b. 
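__hfma2 rounds once instead of twice, so accumulation loops usually prefer it over a separate __hmul2/__hadd2 pair; a hedged sketch:

    __device__ __half2 dot2_accum(const __half2 *x, const __half2 *y, int n2) {
        __half2 acc = __float2half2_rn(0.0f);
        for (int i = 0; i < n2; ++i)
            acc = __hfma2(x[i], y[i], acc); // fused multiply-add per lane
        return acc; // caller can combine lanes with __low2float + __high2float
    }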
-*/ -__CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b); -/** -* \ingroup CUDA_MATH__HALF_ARITHMETIC -* \brief Performs \p half subtraction in round-to-nearest-even mode. -* -* Subtracts \p half input \p b from input \p a in round-to-nearest -* mode. -* -* \return Returns the \p half result of subtraction \p b from \p a. -*/ -__CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b); -/** -* \ingroup CUDA_MATH__HALF_ARITHMETIC -* \brief Performs \p half multiplication in round-to-nearest-even mode. -* -* Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest -* mode. -* -* \return Returns the \p half result of multiplying \p a and \p b. -*/ -__CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b); -/** -* \ingroup CUDA_MATH__HALF_ARITHMETIC -* \brief Performs \p half division in round-to-nearest-even mode. -* -* Divides \p half input \p a by input \p b in round-to-nearest -* mode. -* -* \return Returns the \p half result of division \p a by \p b. -*/ -__CUDA_FP16_DECL__ __half __hdiv(const __half a, const __half b); -/** -* \ingroup CUDA_MATH__HALF_ARITHMETIC -* \brief Performs \p half addition in round-to-nearest-even mode, with -* saturation to [0.0, 1.0]. -* -* Performs \p half add of inputs \p a and \p b, in round-to-nearest-even mode, -* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. -* -* \return Returns the \p half result of adding \p a and \p b with saturation. -*/ -__CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b); -/** -* \ingroup CUDA_MATH__HALF_ARITHMETIC -* \brief Performs \p half subtraction in round-to-nearest-even mode, with -* saturation to [0.0, 1.0]. -* -* Subtracts \p half input \p b from input \p a in round-to-nearest -* mode, -* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. -* -* \return Returns the \p half result of subtraction \p b from \p a -* with saturation. -*/ -__CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b); -/** -* \ingroup CUDA_MATH__HALF_ARITHMETIC -* \brief Performs \p half multiplication in round-to-nearest-even mode, with -* saturation to [0.0, 1.0]. -* -* Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest -* mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to -* +0.0. -* -* \return Returns the \p half result of multiplying \p a and \p b with -* saturation. -*/ -__CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b); -/** -* \ingroup CUDA_MATH__HALF_ARITHMETIC -* \brief Performs \p half fused multiply-add in round-to-nearest-even mode. -* -* Performs \p half multiply on inputs \p a and \p b, -* then performs a \p half add of the result with \p c, -* rounding the result once in round-to-nearest-even mode. -* -* \return Returns the \p half result of the fused multiply-add operation on \p -* a, \p b, and \p c. -*/ -__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c); -/** -* \ingroup CUDA_MATH__HALF_ARITHMETIC -* \brief Performs \p half fused multiply-add in round-to-nearest-even mode, -* with saturation to [0.0, 1.0]. -* -* Performs \p half multiply on inputs \p a and \p b, -* then performs a \p half add of the result with \p c, -* rounding the result once in round-to-nearest-even mode, and clamps the result -* to range [0.0, 1.0]. NaN results are flushed to +0.0. -* -* \return Returns the \p half result of the fused multiply-add operation on \p -* a, \p b, and \p c with saturation. 
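The scalar forms mirror the vector ones; a classic use of the fused operation is a numerically tidy linear interpolation (helper name made up):

    __device__ __half hlerp(__half a, __half b, __half t) {
        return __hfma(t, __hsub(b, a), a); // a + t*(b-a), single rounding in the FMA
    }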
-*/ -__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c); -/** -* \ingroup CUDA_MATH__HALF_ARITHMETIC -* \brief Negates input \p half number and returns the result. -* -* Negates input \p half number and returns the result. -* -* \return Returns negated \p half input \p a. -*/ -__CUDA_FP16_DECL__ __half __hneg(const __half a); -/** -* \ingroup CUDA_MATH__HALF2_COMPARISON -* \brief Performs \p half2 vector if-equal comparison, and returns boolean true -* iff both \p half results are true, boolean false otherwise. -* -* Performs \p half2 vector if-equal comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p half if-equal comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate false results. -* -* \return Returns boolean true if both \p half results of if-equal comparison -* of vectors \p a and \p b are true, boolean false otherwise. -*/ -__CUDA_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_COMPARISON -* \brief Performs \p half2 vector not-equal comparison, and returns boolean -* true iff both \p half results are true, boolean false otherwise. -* -* Performs \p half2 vector not-equal comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p half not-equal comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate false results. -* -* \return Returns boolean true if both \p half results of not-equal comparison -* of vectors \p a and \p b are true, boolean false otherwise. -*/ -__CUDA_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_COMPARISON -* \brief Performs \p half2 vector less-equal comparison, and returns boolean -* true iff both \p half results are true, boolean false otherwise. -* -* Performs \p half2 vector less-equal comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p half less-equal comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate false results. -* -* \return Returns boolean true if both \p half results of less-equal comparison -* of vectors \p a and \p b are true, boolean false otherwise. -*/ -__CUDA_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_COMPARISON -* \brief Performs \p half2 vector greater-equal comparison, and returns boolean -* true iff both \p half results are true, boolean false otherwise. -* -* Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p half greater-equal comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate false results. -* -* \return Returns boolean true if both \p half results of greater-equal -* comparison of vectors \p a and \p b are true, boolean false otherwise. -*/ -__CUDA_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_COMPARISON -* \brief Performs \p half2 vector less-than comparison, and returns boolean -* true iff both \p half results are true, boolean false otherwise. -* -* Performs \p half2 vector less-than comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p half less-than comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate false results. -* -* \return Returns boolean true if both \p half results of less-than comparison -* of vectors \p a and \p b are true, boolean false otherwise. 
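The __hb*2 predicates fold both lane results into one bool, which makes them convenient for whole-vector checks; for instance (sketch):

    __device__ bool tiles_equal(const __half2 *a, const __half2 *b, int n2) {
        for (int i = 0; i < n2; ++i)
            if (!__hbeq2(a[i], b[i])) // true only if *both* lanes match; NaN -> false
                return false;
        return true;
    }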
-*/ -__CUDA_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_COMPARISON -* \brief Performs \p half2 vector greater-than comparison, and returns boolean -* true iff both \p half results are true, boolean false otherwise. -* -* Performs \p half2 vector greater-than comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p half greater-than comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate false results. -* -* \return Returns boolean true if both \p half results of greater-than -* comparison of vectors \p a and \p b are true, boolean false otherwise. -*/ -__CUDA_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_COMPARISON -* \brief Performs \p half2 vector unordered if-equal comparison, and returns -* boolean true iff both \p half results are true, boolean false otherwise. -* -* Performs \p half2 vector if-equal comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p half if-equal comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate true results. -* -* \return Returns boolean true if both \p half results of unordered if-equal -* comparison of vectors \p a and \p b are true, boolean false otherwise. -*/ -__CUDA_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_COMPARISON -* \brief Performs \p half2 vector unordered not-equal comparison, and returns -* boolean true iff both \p half results are true, boolean false otherwise. -* -* Performs \p half2 vector not-equal comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p half not-equal comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate true results. -* -* \return Returns boolean true if both \p half results of unordered not-equal -* comparison of vectors \p a and \p b are true, boolean false otherwise. -*/ -__CUDA_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_COMPARISON -* \brief Performs \p half2 vector unordered less-equal comparison, and returns -* boolean true iff both \p half results are true, boolean false otherwise. -* -* Performs \p half2 vector less-equal comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p half less-equal comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate true results. -* -* \return Returns boolean true if both \p half results of unordered less-equal -* comparison of vectors \p a and \p b are true, boolean false otherwise. -*/ -__CUDA_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_COMPARISON -* \brief Performs \p half2 vector unordered greater-equal comparison, and -* returns boolean true iff both \p half results are true, boolean false -* otherwise. -* -* Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p half greater-equal comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate true results. -* -* \return Returns boolean true if both \p half results of unordered -* greater-equal comparison of vectors \p a and \p b are true, boolean false -* otherwise. 
-*/ -__CUDA_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_COMPARISON -* \brief Performs \p half2 vector unordered less-than comparison, and returns -* boolean true iff both \p half results are true, boolean false otherwise. -* -* Performs \p half2 vector less-than comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p half less-than comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate true results. -* -* \return Returns boolean true if both \p half results of unordered less-than -* comparison of vectors \p a and \p b are true, boolean false otherwise. -*/ -__CUDA_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF2_COMPARISON -* \brief Performs \p half2 vector unordered greater-than comparison, and -* returns boolean true iff both \p half results are true, boolean false -* otherwise. -* -* Performs \p half2 vector greater-than comparison of inputs \p a and \p b. -* The bool result is set to true only if both \p half greater-than comparisons -* evaluate to true, or false otherwise. -* NaN inputs generate true results. -* -* \return Returns boolean true if both \p half results of unordered -* greater-than comparison of vectors \p a and \p b are true, boolean false -* otherwise. -*/ -__CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b); -/** -* \ingroup CUDA_MATH__HALF_COMPARISON -* \brief Performs \p half if-equal comparison. -* -* Performs \p half if-equal comparison of inputs \p a and \p b. -* NaN inputs generate false results. -* -* \return Returns boolean result of if-equal comparison of \p a and \p b. -*/ -__CUDA_FP16_DECL__ bool __heq(const __half a, const __half b); -/** -* \ingroup CUDA_MATH__HALF_COMPARISON -* \brief Performs \p half not-equal comparison. -* -* Performs \p half not-equal comparison of inputs \p a and \p b. -* NaN inputs generate false results. -* -* \return Returns boolean result of not-equal comparison of \p a and \p b. -*/ -__CUDA_FP16_DECL__ bool __hne(const __half a, const __half b); -/** -* \ingroup CUDA_MATH__HALF_COMPARISON -* \brief Performs \p half less-equal comparison. -* -* Performs \p half less-equal comparison of inputs \p a and \p b. -* NaN inputs generate false results. -* -* \return Returns boolean result of less-equal comparison of \p a and \p b. -*/ -__CUDA_FP16_DECL__ bool __hle(const __half a, const __half b); -/** -* \ingroup CUDA_MATH__HALF_COMPARISON -* \brief Performs \p half greater-equal comparison. -* -* Performs \p half greater-equal comparison of inputs \p a and \p b. -* NaN inputs generate false results. -* -* \return Returns boolean result of greater-equal comparison of \p a and \p b. -*/ -__CUDA_FP16_DECL__ bool __hge(const __half a, const __half b); -/** -* \ingroup CUDA_MATH__HALF_COMPARISON -* \brief Performs \p half less-than comparison. -* -* Performs \p half less-than comparison of inputs \p a and \p b. -* NaN inputs generate false results. -* -* \return Returns boolean result of less-than comparison of \p a and \p b. -*/ -__CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b); -/** -* \ingroup CUDA_MATH__HALF_COMPARISON -* \brief Performs \p half greater-than comparison. -* -* Performs \p half greater-than comparison of inputs \p a and \p b. -* NaN inputs generate false results. -* -* \return Returns boolean result of greater-than comparison of \p a and \p b. 
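Since the ordered comparisons return false on NaN, the branch direction decides which operand wins when a NaN is present; a max() sketch:

    __device__ __half hmax_demo(__half a, __half b) {
        return __hge(a, b) ? a : b; // if a is NaN, __hge is false and b is returned
    }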
-*/ -__CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b); -/** -* \ingroup CUDA_MATH__HALF_COMPARISON -* \brief Performs \p half unordered if-equal comparison. -* -* Performs \p half if-equal comparison of inputs \p a and \p b. -* NaN inputs generate true results. -* -* \return Returns boolean result of unordered if-equal comparison of \p a and -* \p b. -*/ -__CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b); -/** -* \ingroup CUDA_MATH__HALF_COMPARISON -* \brief Performs \p half unordered not-equal comparison. -* -* Performs \p half not-equal comparison of inputs \p a and \p b. -* NaN inputs generate true results. -* -* \return Returns boolean result of unordered not-equal comparison of \p a and -* \p b. -*/ -__CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b); -/** -* \ingroup CUDA_MATH__HALF_COMPARISON -* \brief Performs \p half unordered less-equal comparison. -* -* Performs \p half less-equal comparison of inputs \p a and \p b. -* NaN inputs generate true results. -* -* \return Returns boolean result of unordered less-equal comparison of \p a and -* \p b. -*/ -__CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b); -/** -* \ingroup CUDA_MATH__HALF_COMPARISON -* \brief Performs \p half unordered greater-equal comparison. -* -* Performs \p half greater-equal comparison of inputs \p a and \p b. -* NaN inputs generate true results. -* -* \return Returns boolean result of unordered greater-equal comparison of \p a -* and \p b. -*/ -__CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b); -/** -* \ingroup CUDA_MATH__HALF_COMPARISON -* \brief Performs \p half unordered less-than comparison. -* -* Performs \p half less-than comparison of inputs \p a and \p b. -* NaN inputs generate true results. -* -* \return Returns boolean result of unordered less-than comparison of \p a and -* \p b. -*/ -__CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b); -/** -* \ingroup CUDA_MATH__HALF_COMPARISON -* \brief Performs \p half unordered greater-than comparison. -* -* Performs \p half greater-than comparison of inputs \p a and \p b. -* NaN inputs generate true results. -* -* \return Returns boolean result of unordered greater-than comparison of \p a -* and \p b. -*/ -__CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b); -/** -* \ingroup CUDA_MATH__HALF_COMPARISON -* \brief Determine whether \p half argument is a NaN. -* -* Determine whether \p half value \p a is a NaN. -* -* \return Returns boolean true iff argument is a NaN, boolean false otherwise. -*/ -__CUDA_FP16_DECL__ bool __hisnan(const __half a); -/** -* \ingroup CUDA_MATH__HALF_FUNCTIONS -* \brief Calculates \p half square root in round-to-nearest-even mode. -* -* Calculates \p half square root of input \p a in round-to-nearest-even mode. -* -* \return Returns \p half square root of \p a. -*/ -__CUDA_FP16_DECL__ __half hsqrt(const __half a); -/** -* \ingroup CUDA_MATH__HALF_FUNCTIONS -* \brief Calculates \p half reciprocal square root in round-to-nearest-even -* mode. -* -* Calculates \p half reciprocal square root of input \p a in round-to-nearest -* mode. -* -* \return Returns \p half reciprocal square root of \p a. -*/ -__CUDA_FP16_DECL__ __half hrsqrt(const __half a); -/** -* \ingroup CUDA_MATH__HALF_FUNCTIONS -* \brief Calculates \p half reciprocal in round-to-nearest-even mode. -* -* Calculates \p half reciprocal of input \p a in round-to-nearest-even mode. -* -* \return Returns \p half reciprocal of \p a. 
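hrsqrt pairs naturally with the fused multiply-add for cheap vector normalization; a sketch (helper name hypothetical, sm_53+):

    __device__ __half2 normalize2(__half x, __half y) {
        __half inv = hrsqrt(__hfma(x, x, __hmul(y, y))); // 1/sqrt(x^2 + y^2)
        return __halves2half2(__hmul(x, inv), __hmul(y, inv));
    }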
-*/ -__CUDA_FP16_DECL__ __half hrcp(const __half a); -/** -* \ingroup CUDA_MATH__HALF_FUNCTIONS -* \brief Calculates \p half natural logarithm in round-to-nearest-even mode. -* -* Calculates \p half natural logarithm of input \p a in round-to-nearest-even -* mode. -* -* \return Returns \p half natural logarithm of \p a. -*/ -__CUDA_FP16_DECL__ __half hlog(const __half a); -/** -* \ingroup CUDA_MATH__HALF_FUNCTIONS -* \brief Calculates \p half binary logarithm in round-to-nearest-even mode. -* -* Calculates \p half binary logarithm of input \p a in round-to-nearest-even -* mode. -* -* \return Returns \p half binary logarithm of \p a. -*/ -__CUDA_FP16_DECL__ __half hlog2(const __half a); -/** -* \ingroup CUDA_MATH__HALF_FUNCTIONS -* \brief Calculates \p half decimal logarithm in round-to-nearest-even mode. -* -* Calculates \p half decimal logarithm of input \p a in round-to-nearest-even -* mode. -* -* \return Returns \p half decimal logarithm of \p a. -*/ -__CUDA_FP16_DECL__ __half hlog10(const __half a); -/** -* \ingroup CUDA_MATH__HALF_FUNCTIONS -* \brief Calculates \p half natural exponential function in round-to-nearest -* mode. -* -* Calculates \p half natural exponential function of input \p a in -* round-to-nearest-even mode. -* -* \return Returns \p half natural exponential function of \p a. -*/ -__CUDA_FP16_DECL__ __half hexp(const __half a); -/** -* \ingroup CUDA_MATH__HALF_FUNCTIONS -* \brief Calculates \p half binary exponential function in round-to-nearest -* mode. -* -* Calculates \p half binary exponential function of input \p a in -* round-to-nearest-even mode. -* -* \return Returns \p half binary exponential function of \p a. -*/ -__CUDA_FP16_DECL__ __half hexp2(const __half a); -/** -* \ingroup CUDA_MATH__HALF_FUNCTIONS -* \brief Calculates \p half decimal exponential function in round-to-nearest -* mode. -* -* Calculates \p half decimal exponential function of input \p a in -* round-to-nearest-even mode. -* -* \return Returns \p half decimal exponential function of \p a. -*/ -__CUDA_FP16_DECL__ __half hexp10(const __half a); -/** -* \ingroup CUDA_MATH__HALF_FUNCTIONS -* \brief Calculates \p half cosine in round-to-nearest-even mode. -* -* Calculates \p half cosine of input \p a in round-to-nearest-even mode. -* -* \return Returns \p half cosine of \p a. -*/ -__CUDA_FP16_DECL__ __half hcos(const __half a); -/** -* \ingroup CUDA_MATH__HALF_FUNCTIONS -* \brief Calculates \p half sine in round-to-nearest-even mode. -* -* Calculates \p half sine of input \p a in round-to-nearest-even mode. -* -* \return Returns \p half sine of \p a. -*/ -__CUDA_FP16_DECL__ __half hsin(const __half a); -/** -* \ingroup CUDA_MATH__HALF2_FUNCTIONS -* \brief Calculates \p half2 vector square root in round-to-nearest-even mode. -* -* Calculates \p half2 square root of input vector \p a in round-to-nearest -* mode. -* -* \return Returns \p half2 square root of vector \p a. -*/ -__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a); -/** -* \ingroup CUDA_MATH__HALF2_FUNCTIONS -* \brief Calculates \p half2 vector reciprocal square root in round-to-nearest -* mode. -* -* Calculates \p half2 reciprocal square root of input vector \p a in -* round-to-nearest-even mode. -* -* \return Returns \p half2 reciprocal square root of vector \p a. -*/ -__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a); -/** -* \ingroup CUDA_MATH__HALF2_FUNCTIONS -* \brief Calculates \p half2 vector reciprocal in round-to-nearest-even mode. -* -* Calculates \p half2 reciprocal of input vector \p a in round-to-nearest-even -* mode. 
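As a final sketch, the scalar math functions compose into common activation functions, though the narrow range of fp16 (max ~65504, so hexp overflows past x of roughly 11) argues for fp32 math when accuracy matters:

    __device__ __half hsigmoid(__half x) {
        __half one = __float2half_rn(1.0f);
        return hrcp(__hadd(one, hexp(__hneg(x)))); // 1 / (1 + exp(-x))
    }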
-* -* \return Returns \p half2 reciprocal of vector \p a. -*/ -__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a); -/** -* \ingroup CUDA_MATH__HALF2_FUNCTIONS -* \brief Calculates \p half2 vector natural logarithm in round-to-nearest-even -* mode. -* -* Calculates \p half2 natural logarithm of input vector \p a in -* round-to-nearest-even mode. -* -* \return Returns \p half2 natural logarithm of vector \p a. -*/ -__CUDA_FP16_DECL__ __half2 h2log(const __half2 a); -/** -* \ingroup CUDA_MATH__HALF2_FUNCTIONS -* \brief Calculates \p half2 vector binary logarithm in round-to-nearest-even -* mode. -* -* Calculates \p half2 binary logarithm of input vector \p a in round-to-nearest -* mode. -* -* \return Returns \p half2 binary logarithm of vector \p a. -*/ -__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a); -/** -* \ingroup CUDA_MATH__HALF2_FUNCTIONS -* \brief Calculates \p half2 vector decimal logarithm in round-to-nearest-even -* mode. -* -* Calculates \p half2 decimal logarithm of input vector \p a in -* round-to-nearest-even mode. -* -* \return Returns \p half2 decimal logarithm of vector \p a. -*/ -__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a); -/** -* \ingroup CUDA_MATH__HALF2_FUNCTIONS -* \brief Calculates \p half2 vector exponential function in round-to-nearest -* mode. -* -* Calculates \p half2 exponential function of input vector \p a in -* round-to-nearest-even mode. -* -* \return Returns \p half2 exponential function of vector \p a. -*/ -__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a); -/** -* \ingroup CUDA_MATH__HALF2_FUNCTIONS -* \brief Calculates \p half2 vector binary exponential function in -* round-to-nearest-even mode. -* -* Calculates \p half2 binary exponential function of input vector \p a in -* round-to-nearest-even mode. -* -* \return Returns \p half2 binary exponential function of vector \p a. -*/ -__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a); -/** -* \ingroup CUDA_MATH__HALF2_FUNCTIONS -* \brief Calculates \p half2 vector decimal exponential function in -* round-to-nearest-even mode. -* -* Calculates \p half2 decimal exponential function of input vector \p a in -* round-to-nearest-even mode. -* -* \return Returns \p half2 decimal exponential function of vector \p a. -*/ -__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a); -/** -* \ingroup CUDA_MATH__HALF2_FUNCTIONS -* \brief Calculates \p half2 vector cosine in round-to-nearest-even mode. -* -* Calculates \p half2 cosine of input vector \p a in round-to-nearest-even -* mode. -* -* \return Returns \p half2 cosine of vector \p a. -*/ -__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a); -/** -* \ingroup CUDA_MATH__HALF2_FUNCTIONS -* \brief Calculates \p half2 vector sine in round-to-nearest-even mode. -* -* Calculates \p half2 sine of input vector \p a in round-to-nearest-even mode. -* -* \return Returns \p half2 sine of vector \p a. -*/ -__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a); - -#endif /*if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ - -#undef __CUDA_FP16_DECL__ - -#endif /* defined(__cplusplus) && defined(__CUDACC__) */ - -/* Note the .hpp file is included even for host-side compilation, to capture the "half" & "half2" definitions */ -#include "cuda_fp16.hpp" - -#endif /* end of include guard: __CUDA_FP16_H__ */ diff --git a/include/triton/external/CUDA/cuda_fp16.hpp b/include/triton/external/CUDA/cuda_fp16.hpp deleted file mode 100755 index dcbab74ae..000000000 --- a/include/triton/external/CUDA/cuda_fp16.hpp +++ /dev/null @@ -1,1797 +0,0 @@ -/* -* Copyright 1993-2014 NVIDIA Corporation. 
All rights reserved. -* -* NOTICE TO LICENSEE: -* -* This source code and/or documentation ("Licensed Deliverables") are -* subject to NVIDIA intellectual property rights under U.S. and -* international Copyright laws. -* -* These Licensed Deliverables contained herein is PROPRIETARY and -* CONFIDENTIAL to NVIDIA and is being provided under the terms and -* conditions of a form of NVIDIA software license agreement by and -* between NVIDIA and Licensee ("License Agreement") or electronically -* accepted by Licensee. Notwithstanding any terms or conditions to -* the contrary in the License Agreement, reproduction or disclosure -* of the Licensed Deliverables to any third party without the express -* written consent of NVIDIA is prohibited. -* -* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE -* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE -* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS -* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. -* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED -* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, -* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. -* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE -* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY -* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY -* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, -* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS -* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE -* OF THESE LICENSED DELIVERABLES. -* -* U.S. Government End Users. These Licensed Deliverables are a -* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT -* 1995), consisting of "commercial computer software" and "commercial -* computer software documentation" as such terms are used in 48 -* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government -* only as a commercial end item. Consistent with 48 C.F.R.12.212 and -* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all -* U.S. Government End Users acquire the Licensed Deliverables with -* only those rights set forth herein. -* -* Any use of the Licensed Deliverables in individual and commercial -* software must include, in the user documentation and internal -* comments to the code, the above Disclaimer and U.S. Government End -* Users Notice. -*/ - -#if !defined(__CUDA_FP16_HPP__) -#define __CUDA_FP16_HPP__ - -/* C++11 header for std::move */ -#if __cplusplus >= 201103L -#include -#endif /* __cplusplus >= 201103L */ - -/* Set up function decorations */ -#if defined(__CUDACC_RTC__) -#define __CUDA_FP16_DECL__ __host__ __device__ -#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__ -#define __CUDA_HOSTDEVICE__ __host__ __device__ -#elif defined(__CUDACC__) /* !__CUDACC_RTC__ but yes __CUDACC__ */ -#define __CUDA_FP16_DECL__ static __device__ __inline__ -#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__ -#define __CUDA_HOSTDEVICE__ __host__ __device__ -#else /* !__CUDACC_RTC and !__CUDACC__ (i.e. 
host non-nvcc compiler */ -#define __CUDA_HOSTDEVICE__ -#endif /* __CUDACC_RTC__ and __CUDACC__ */ - -/* Set up structure-alignment attribute */ -#if defined(__CUDACC__) -#define __CUDA_ALIGN__(align) __align__(align) -#else -/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */ -#if __cplusplus >= 201103L -#define __CUDA_ALIGN__(n) alignas(n) /* C++11 kindly gives us a keyword for this */ -#else /* !(__cplusplus >= 201103L)*/ -#if defined(__GNUC__) /* || defined(__IBMC__) || defined(__clang__) || defined(__PGI) */ -#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n))) -#elif defined(_MSC_VER) /* || defined(__ICC) */ -#define __CUDA_ALIGN__(n) __declspec(align(n)) -#else -#define __CUDA_ALIGN__(n) -#endif /* defined(__GNUC__) */ -#endif /* __cplusplus >= 201103L */ -#endif /* defined(__CUDACC__) */ - - -/* Macros to allow half & half2 to be used by inline assembly */ -#define __HALF_TO_US(var) *(reinterpret_cast(&(var))) -#define __HALF_TO_CUS(var) *(reinterpret_cast(&(var))) -#define __HALF2_TO_UI(var) *(reinterpret_cast(&(var))) -#define __HALF2_TO_CUI(var) *(reinterpret_cast(&(var))) - - -/** -* Types which allow static initialization of "half" and "half2" until -* these become an actual builtin. Note this initialization is as a -* bitfield representation of "half", and not a conversion from short->half. -* Such a representation will be deprecated in a future version of CUDA. -* (Note these are visible to non-nvcc compilers, including C-only compilation) -*/ -typedef struct __CUDA_ALIGN__(2) { - unsigned short x; -} __half_raw; - -typedef struct __CUDA_ALIGN__(4) { - unsigned short x, y; -} __half2_raw; - -/* All other definitions in this file are only visible to C++ compilers */ -#if defined(__cplusplus) - -/* Hide GCC member initialization list warnings because of host/device in-function init requirement */ -#if defined(__GNUC__) -#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#pragma GCC diagnostic ignored "-Weffc++" -#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ -#endif /* defined(__GNUC__) */ - -struct __CUDA_ALIGN__(2) __half { -protected: - unsigned short __x; - -public: -#if __cplusplus >= 201103L - __half() = default; -#else - __CUDA_HOSTDEVICE__ __half() { } -#endif /* __cplusplus >= 201103L */ - - /* Convert to/from __half_raw */ - __CUDA_HOSTDEVICE__ __half(const __half_raw &hr) : __x(hr.x) { } - __CUDA_HOSTDEVICE__ __half &operator=(const __half_raw &hr) { __x = hr.x; return *this; } - __CUDA_HOSTDEVICE__ operator __half_raw() const { __half_raw ret; ret.x = __x; return ret; } - -/* Member functions are only available to nvcc compilation */ -#if defined(__CUDACC__) -#if !defined(__CUDA_NO_HALF_CONVERSIONS__) - /* Allow automatic construction from types supported natively in hardware */ - /* Note we do avoid constructor init-list because of special host/device compilation rules */ - __device__ __half(float f) { __x = __float2half(f).__x; } - __device__ __half(double f) { __x = __float2half((float)f).__x; } - __device__ __half(short val) { __x = __short2half_rn(val).__x; } - __device__ __half(unsigned short val) { __x = __ushort2half_rn(val).__x; } - __device__ __half(int val) { __x = __int2half_rn(val).__x; } - __device__ __half(unsigned int val) { __x = __uint2half_rn(val).__x; } - __device__ __half(long long val) { __x = __ll2half_rn(val).__x; } - __device__ __half(unsigned long long val) { __x = 
__ull2half_rn(val).__x; } - - /* Allow automatic casts to supported builtin types, matching all that are permitted with float */ - __device__ operator float() const { return __half2float(*this); } - __device__ __half &operator=(float f) { __x = __float2half(f).__x; return *this; } - - /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */ - __device__ __half &operator=(double f) { __x = __float2half((float)f).__x; return *this; } - - __device__ operator short() const { return __half2short_rn(*this); } - __device__ __half &operator=(short val) { __x = __short2half_rn(val).__x; return *this; } - - __device__ operator unsigned short() const { return __half2ushort_rn(*this); } - __device__ __half &operator=(unsigned short val) { __x = __ushort2half_rn(val).__x; return *this; } - - __device__ operator int() const { return __half2int_rn(*this); } - __device__ __half &operator=(int val) { __x = __int2half_rn(val).__x; return *this; } - - __device__ operator unsigned int() const { return __half2uint_rn(*this); } - __device__ __half &operator=(unsigned int val) { __x = __uint2half_rn(val).__x; return *this; } - - __device__ operator long long() const { return __half2ll_rn(*this); } - __device__ __half &operator=(long long val) { __x = __ll2half_rn(val).__x; return *this; } - - __device__ operator unsigned long long() const { return __half2ull_rn(*this); } - __device__ __half &operator=(unsigned long long val) { __x = __ull2half_rn(val).__x; return *this; } - - /* Boolean conversion - note both 0 and -0 must return false */ - __device__ operator bool() const { return (__x & 0x7FFF) != 0; } -#endif /* !defined(__CUDA_NO_HALF_CONVERSIONS__) */ -#endif /* defined(__CUDACC__) */ -}; - -/* Global-space operator functions are only available to nvcc compilation */ -#if defined(__CUDACC__) - -/* Arithmetic FP16 operations only supported on arch >= 5.3 */ -#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) -#if !defined(__CUDA_NO_HALF_OPERATORS__) -/* Some basic arithmetic operations expected of a builtin */ -__device__ __forceinline__ __half operator+(const __half &lh, const __half &rh) { return __hadd(lh, rh); } -__device__ __forceinline__ __half operator-(const __half &lh, const __half &rh) { return __hsub(lh, rh); } -__device__ __forceinline__ __half operator*(const __half &lh, const __half &rh) { return __hmul(lh, rh); } -__device__ __forceinline__ __half operator/(const __half &lh, const __half &rh) { return __hdiv(lh, rh); } - -__device__ __forceinline__ __half &operator+=(__half &lh, const __half &rh) { lh = __hadd(lh, rh); return lh; } -__device__ __forceinline__ __half &operator-=(__half &lh, const __half &rh) { lh = __hsub(lh, rh); return lh; } -__device__ __forceinline__ __half &operator*=(__half &lh, const __half &rh) { lh = __hmul(lh, rh); return lh; } -__device__ __forceinline__ __half &operator/=(__half &lh, const __half &rh) { lh = __hdiv(lh, rh); return lh; } - -/* Note for increment and decrement we use the raw value 0x3C00 equating to half(1.0f), to avoid the extra conversion */ -__device__ __forceinline__ __half &operator++(__half &h) { __half_raw one; one.x = 0x3C00; h += one; return h; } -__device__ __forceinline__ __half &operator--(__half &h) { __half_raw one; one.x = 0x3C00; h -= one; return h; } -__device__ __forceinline__ __half operator++(__half &h, int) { __half ret = h; __half_raw one; one.x = 0x3C00; h += one; return ret; } -__device__ __forceinline__ __half operator--(__half &h, int) { __half ret = h; __half_raw one; one.x = 0x3C00; h -= one; return 
ret; } - -/* Unary plus and inverse operators */ -__device__ __forceinline__ __half operator+(const __half &h) { return h; } -__device__ __forceinline__ __half operator-(const __half &h) { return __hneg(h); } - -/* Some basic comparison operations to make it look like a builtin */ -__device__ __forceinline__ bool operator==(const __half &lh, const __half &rh) { return __heq(lh, rh); } -__device__ __forceinline__ bool operator!=(const __half &lh, const __half &rh) { return __hne(lh, rh); } -__device__ __forceinline__ bool operator> (const __half &lh, const __half &rh) { return __hgt(lh, rh); } -__device__ __forceinline__ bool operator< (const __half &lh, const __half &rh) { return __hlt(lh, rh); } -__device__ __forceinline__ bool operator>=(const __half &lh, const __half &rh) { return __hge(lh, rh); } -__device__ __forceinline__ bool operator<=(const __half &lh, const __half &rh) { return __hle(lh, rh); } -#endif /* !defined(__CUDA_NO_HALF_OPERATORS__) */ -#endif /* __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) */ -#endif /* defined(__CUDACC__) */ - -/* __half2 is visible to non-nvcc host compilers */ -struct __CUDA_ALIGN__(4) __half2 { - __half x, y; - - // All construct/copy/assign/move -public: -#if __cplusplus >= 201103L - __half2() = default; - __CUDA_HOSTDEVICE__ __half2(__half2 &&src) { __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src)); } - __CUDA_HOSTDEVICE__ __half2 &operator=(__half2 &&src) { __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src)); return *this; } -#else - __CUDA_HOSTDEVICE__ __half2() { } -#endif /* __cplusplus >= 201103L */ - __CUDA_HOSTDEVICE__ __half2(const __half &a, const __half &b) : x(a), y(b) { } - __CUDA_HOSTDEVICE__ __half2(const __half2 &src) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); } - __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &src) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); return *this; } - - /* Convert to/from __half2_raw */ - __CUDA_HOSTDEVICE__ __half2(const __half2_raw &h2r ) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); } - __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2_raw &h2r) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); return *this; } - __CUDA_HOSTDEVICE__ operator __half2_raw() const { __half2_raw ret; __HALF2_TO_UI(ret) = __HALF2_TO_CUI(*this); return ret; } -}; - -/* Restore -Weffc++ warnings from here on */ -#if defined(__GNUC__) -#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) -#pragma GCC diagnostic pop -#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ -#endif /* defined(__GNUC__) */ - -#undef __CUDA_HOSTDEVICE__ -#undef __CUDA_ALIGN__ - -/* All intrinsic functions are only available to nvcc compilers */ -#if defined(__CUDACC__) - -/* CUDA vector-types compatible vector creation function (note returns __half2, not half2) */ -__VECTOR_FUNCTIONS_DECL__ __half2 make_half2(__half x, __half y) -{ - __half2 t; t.x = x; t.y = y; return t; -} -#undef __VECTOR_FUNCTIONS_DECL__ - - -/* Definitions of intrinsics */ -__CUDA_FP16_DECL__ int __half2int_rn(__half h) -{ - int i; - asm("cvt.rni.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); - return i; -} -__CUDA_FP16_DECL__ int __half2int_rz(__half h) -{ - int i; - asm("cvt.rzi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); - return i; -} -__CUDA_FP16_DECL__ int __half2int_rd(__half h) -{ - int i; - asm("cvt.rmi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); - return i; -} -__CUDA_FP16_DECL__ int __half2int_ru(__half h) -{ - int i; - asm("cvt.rpi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); - return i; -} 
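Each conversion intrinsic in this family wraps a single PTX cvt instruction, and the _rn/_rz/_rd/_ru suffixes select its rounding modifier (rni, rzi, rmi, rpi: to nearest-even, toward zero, toward negative infinity, toward positive infinity). A sketch of how the four modes diverge on inputs of +/-1.5, which half represents exactly; the expected values in the comments follow from those rounding rules:

#include <cstdio>
#include <cuda_fp16.h>

// The four rounding flavors of half -> int conversion on +/-1.5.
__global__ void rounding_demo() {
    const __half a = __float2half(1.5f);
    const __half b = __float2half(-1.5f);
    printf("rn: %d %d\n", __half2int_rn(a), __half2int_rn(b));  //  2 -2 (ties to even)
    printf("rz: %d %d\n", __half2int_rz(a), __half2int_rz(b));  //  1 -1 (toward zero)
    printf("rd: %d %d\n", __half2int_rd(a), __half2int_rd(b));  //  1 -2 (toward -inf)
    printf("ru: %d %d\n", __half2int_ru(a), __half2int_ru(b));  //  2 -1 (toward +inf)
}

int main() {
    rounding_demo<<<1, 1>>>();
    cudaDeviceSynchronize();
    return 0;
}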
-__CUDA_FP16_DECL__ __half __int2half_rn(int i) -{ - __half h; - asm("cvt.rn.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); - return h; -} -__CUDA_FP16_DECL__ __half __int2half_rz(int i) -{ - __half h; - asm("cvt.rz.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); - return h; -} -__CUDA_FP16_DECL__ __half __int2half_rd(int i) -{ - __half h; - asm("cvt.rm.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); - return h; -} -__CUDA_FP16_DECL__ __half __int2half_ru(int i) -{ - __half h; - asm("cvt.rp.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); - return h; -} - -__CUDA_FP16_DECL__ short int __half2short_rn(__half h) -{ - short int i; - asm("cvt.rni.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); - return i; -} -__CUDA_FP16_DECL__ short int __half2short_rz(__half h) -{ - short int i; - asm("cvt.rzi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); - return i; -} -__CUDA_FP16_DECL__ short int __half2short_rd(__half h) -{ - short int i; - asm("cvt.rmi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); - return i; -} -__CUDA_FP16_DECL__ short int __half2short_ru(__half h) -{ - short int i; - asm("cvt.rpi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); - return i; -} -__CUDA_FP16_DECL__ __half __short2half_rn(short int i) -{ - __half h; - asm("cvt.rn.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); - return h; -} -__CUDA_FP16_DECL__ __half __short2half_rz(short int i) -{ - __half h; - asm("cvt.rz.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); - return h; -} -__CUDA_FP16_DECL__ __half __short2half_rd(short int i) -{ - __half h; - asm("cvt.rm.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); - return h; -} -__CUDA_FP16_DECL__ __half __short2half_ru(short int i) -{ - __half h; - asm("cvt.rp.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); - return h; -} - -__CUDA_FP16_DECL__ unsigned int __half2uint_rn(__half h) -{ - unsigned int i; - asm("cvt.rni.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); - return i; -} -__CUDA_FP16_DECL__ unsigned int __half2uint_rz(__half h) -{ - unsigned int i; - asm("cvt.rzi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); - return i; -} -__CUDA_FP16_DECL__ unsigned int __half2uint_rd(__half h) -{ - unsigned int i; - asm("cvt.rmi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); - return i; -} -__CUDA_FP16_DECL__ unsigned int __half2uint_ru(__half h) -{ - unsigned int i; - asm("cvt.rpi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_US(h))); - return i; -} -__CUDA_FP16_DECL__ __half __uint2half_rn(unsigned int i) -{ - __half h; - asm("cvt.rn.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); - return h; -} -__CUDA_FP16_DECL__ __half __uint2half_rz(unsigned int i) -{ - __half h; - asm("cvt.rz.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); - return h; -} -__CUDA_FP16_DECL__ __half __uint2half_rd(unsigned int i) -{ - __half h; - asm("cvt.rm.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); - return h; -} -__CUDA_FP16_DECL__ __half __uint2half_ru(unsigned int i) -{ - __half h; - asm("cvt.rp.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); - return h; -} - -__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(__half h) -{ - unsigned short int i; - asm("cvt.rni.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); - return i; -} -__CUDA_FP16_DECL__ unsigned short int __half2ushort_rz(__half h) -{ - unsigned short int i; - asm("cvt.rzi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); - return i; -} -__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(__half h) -{ - unsigned short int i; - asm("cvt.rmi.u16.f16 %0, %1;" : 
"=h"(i) : "h"(__HALF_TO_US(h))); - return i; -} -__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(__half h) -{ - unsigned short int i; - asm("cvt.rpi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_US(h))); - return i; -} -__CUDA_FP16_DECL__ __half __ushort2half_rn(unsigned short int i) -{ - __half h; - asm("cvt.rn.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); - return h; -} -__CUDA_FP16_DECL__ __half __ushort2half_rz(unsigned short int i) -{ - __half h; - asm("cvt.rz.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); - return h; -} -__CUDA_FP16_DECL__ __half __ushort2half_rd(unsigned short int i) -{ - __half h; - asm("cvt.rm.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); - return h; -} -__CUDA_FP16_DECL__ __half __ushort2half_ru(unsigned short int i) -{ - __half h; - asm("cvt.rp.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); - return h; -} - -__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(__half h) -{ - unsigned long long int i; - asm("cvt.rni.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); - return i; -} -__CUDA_FP16_DECL__ unsigned long long int __half2ull_rz(__half h) -{ - unsigned long long int i; - asm("cvt.rzi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); - return i; -} -__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(__half h) -{ - unsigned long long int i; - asm("cvt.rmi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); - return i; -} -__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(__half h) -{ - unsigned long long int i; - asm("cvt.rpi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); - return i; -} -__CUDA_FP16_DECL__ __half __ull2half_rn(unsigned long long int i) -{ - __half h; - asm("cvt.rn.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); - return h; -} -__CUDA_FP16_DECL__ __half __ull2half_rz(unsigned long long int i) -{ - __half h; - asm("cvt.rz.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); - return h; -} -__CUDA_FP16_DECL__ __half __ull2half_rd(unsigned long long int i) -{ - __half h; - asm("cvt.rm.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); - return h; -} -__CUDA_FP16_DECL__ __half __ull2half_ru(unsigned long long int i) -{ - __half h; - asm("cvt.rp.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); - return h; -} - -__CUDA_FP16_DECL__ long long int __half2ll_rn(__half h) -{ - long long int i; - asm("cvt.rni.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); - return i; -} -__CUDA_FP16_DECL__ long long int __half2ll_rz(__half h) -{ - long long int i; - asm("cvt.rzi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); - return i; -} -__CUDA_FP16_DECL__ long long int __half2ll_rd(__half h) -{ - long long int i; - asm("cvt.rmi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); - return i; -} -__CUDA_FP16_DECL__ long long int __half2ll_ru(__half h) -{ - long long int i; - asm("cvt.rpi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_US(h))); - return i; -} -__CUDA_FP16_DECL__ __half __ll2half_rn(long long int i) -{ - __half h; - asm("cvt.rn.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); - return h; -} -__CUDA_FP16_DECL__ __half __ll2half_rz(long long int i) -{ - __half h; - asm("cvt.rz.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); - return h; -} -__CUDA_FP16_DECL__ __half __ll2half_rd(long long int i) -{ - __half h; - asm("cvt.rm.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); - return h; -} -__CUDA_FP16_DECL__ __half __ll2half_ru(long long int i) -{ - __half h; - asm("cvt.rp.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); - return h; -} - -__CUDA_FP16_DECL__ __half htrunc(const __half h) 
-{ - __half r; - asm("cvt.rzi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); - return r; -} -__CUDA_FP16_DECL__ __half hceil(const __half h) -{ - __half r; - asm("cvt.rpi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); - return r; -} -__CUDA_FP16_DECL__ __half hfloor(const __half h) -{ - __half r; - asm("cvt.rmi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); - return r; -} -__CUDA_FP16_DECL__ __half hrint(const __half h) -{ - __half r; - asm("cvt.rni.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); - return r; -} - -__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h) -{ - __half2 val; - asm("{.reg .f16 low,high;\n" - " mov.b32 {low,high}, %1;\n" - " cvt.rzi.f16.f16 low, low;\n" - " cvt.rzi.f16.f16 high, high;\n" - " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); - return val; -} -__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h) -{ - __half2 val; - asm("{.reg .f16 low,high;\n" - " mov.b32 {low,high}, %1;\n" - " cvt.rpi.f16.f16 low, low;\n" - " cvt.rpi.f16.f16 high, high;\n" - " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); - return val; -} -__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h) -{ - __half2 val; - asm("{.reg .f16 low,high;\n" - " mov.b32 {low,high}, %1;\n" - " cvt.rmi.f16.f16 low, low;\n" - " cvt.rmi.f16.f16 high, high;\n" - " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); - return val; -} -__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h) -{ - __half2 val; - asm("{.reg .f16 low,high;\n" - " mov.b32 {low,high}, %1;\n" - " cvt.rni.f16.f16 low, low;\n" - " cvt.rni.f16.f16 high, high;\n" - " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); - return val; -} - -__CUDA_FP16_DECL__ float2 __half22float2(const __half2 l) -{ - float hi_float; - float lo_float; - asm("{.reg .f16 low,high;\n" - " mov.b32 {low,high},%1;\n" - " cvt.f32.f16 %0, low;}\n" : "=f"(lo_float) : "r"(__HALF2_TO_CUI(l))); - - asm("{.reg .f16 low,high;\n" - " mov.b32 {low,high},%1;\n" - " cvt.f32.f16 %0, high;}\n" : "=f"(hi_float) : "r"(__HALF2_TO_CUI(l))); - - return make_float2(lo_float, hi_float); -} -__CUDA_FP16_DECL__ __half __float2half(const float f) -{ - __half val; - asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(f)); - return val; -} -__CUDA_FP16_DECL__ __half __float2half_rn(const float f) -{ - __half val; - asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(f)); - return val; -} -__CUDA_FP16_DECL__ __half __float2half_rz(const float f) -{ - __half val; - asm("{ cvt.rz.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(f)); - return val; -} -__CUDA_FP16_DECL__ __half __float2half_rd(const float f) -{ - __half val; - asm("{ cvt.rm.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(f)); - return val; -} -__CUDA_FP16_DECL__ __half __float2half_ru(const float f) -{ - __half val; - asm("{ cvt.rp.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(f)); - return val; -} -__CUDA_FP16_DECL__ float __half2float(const __half h) -{ - float val; - asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(h))); - return val; -} -__CUDA_FP16_DECL__ __half2 __float2half2_rn(const float f) -{ - __half2 val; - asm("{.reg .f16 low;\n" - " cvt.rn.f16.f32 low, %1;\n" - " mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(f)); - return val; -} -__CUDA_FP16_DECL__ __half2 __floats2half2_rn(const float f1, const float f2) -{ - __half2 val; - asm("{.reg .f16 
low,high;\n" - " cvt.rn.f16.f32 low, %1;\n" - " cvt.rn.f16.f32 high, %2;\n" - " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(f1), "f"(f2)); - return val; -} -__CUDA_FP16_DECL__ __half2 __float22half2_rn(const float2 f) -{ - __half2 val = __floats2half2_rn(f.x, f.y); - return val; -} -__CUDA_FP16_DECL__ float __low2float(const __half2 l) -{ - float val; - asm("{.reg .f16 low,high;\n" - " mov.b32 {low,high},%1;\n" - " cvt.f32.f16 %0, low;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(l))); - return val; -} -__CUDA_FP16_DECL__ float __high2float(const __half2 l) -{ - float val; - asm("{.reg .f16 low,high;\n" - " mov.b32 {low,high},%1;\n" - " cvt.f32.f16 %0, high;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(l))); - return val; -} -__CUDA_FP16_DECL__ __half2 __lows2half2(const __half2 l, const __half2 h) -{ - __half2 val; - asm("{.reg .f16 alow,ahigh,blow,bhigh;\n" - " mov.b32 {alow,ahigh}, %1;\n" - " mov.b32 {blow,bhigh}, %2;\n" - " mov.b32 %0, {alow,blow};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(l)), "r"(__HALF2_TO_CUI(h))); - return val; -} -__CUDA_FP16_DECL__ __half2 __highs2half2(const __half2 l, const __half2 h) -{ - __half2 val; - asm("{.reg .f16 alow,ahigh,blow,bhigh;\n" - " mov.b32 {alow,ahigh}, %1;\n" - " mov.b32 {blow,bhigh}, %2;\n" - " mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(l)), "r"(__HALF2_TO_CUI(h))); - return val; -} -__CUDA_FP16_DECL__ __half __low2half(const __half2 h) -{ - __half ret; - asm("{.reg .f16 low,high;\n" - " mov.b32 {low,high}, %1;\n" - " mov.b16 %0, low;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(h))); - return ret; -} -__CUDA_FP16_DECL__ int __hisinf(const __half a) -{ - if (__HALF_TO_CUS(a) == 0xFC00) - return -1; - if (__HALF_TO_CUS(a) == 0x7C00) - return 1; - return 0; -} -__CUDA_FP16_DECL__ __half2 __low2half2(const __half2 l) -{ - __half2 val; - asm("{.reg .f16 low,high;\n" - " mov.b32 {low,high}, %1;\n" - " mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(l))); - return val; -} -__CUDA_FP16_DECL__ __half2 __high2half2(const __half2 l) -{ - __half2 val; - asm("{.reg .f16 low,high;\n" - " mov.b32 {low,high}, %1;\n" - " mov.b32 %0, {high,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(l))); - return val; -} -__CUDA_FP16_DECL__ __half __high2half(const __half2 h) -{ - __half ret; - asm("{.reg .f16 low,high;\n" - " mov.b32 {low,high}, %1;\n" - " mov.b16 %0, high;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(h))); - return ret; -} -__CUDA_FP16_DECL__ __half2 __halves2half2(const __half l, const __half h) -{ - __half2 val; - asm("{ mov.b32 %0, {%1,%2};}\n" - : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(l)), "h"(__HALF_TO_CUS(h))); - return val; -} -__CUDA_FP16_DECL__ __half2 __half2half2(const __half lh) -{ - __half2 val; - asm("{ mov.b32 %0, {%1,%1};}\n" - : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(lh))); - return val; -} -__CUDA_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 lh) -{ - __half2 val; - asm("{.reg .f16 low,high;\n" - " mov.b32 {low,high}, %1;\n" - " mov.b32 %0, {high,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(lh))); - return val; -} -__CUDA_FP16_DECL__ short int __half_as_short(const __half h) -{ - return (short int)__HALF_TO_CUS(h); -} -__CUDA_FP16_DECL__ unsigned short int __half_as_ushort(const __half h) -{ - return __HALF_TO_CUS(h); -} -__CUDA_FP16_DECL__ __half __short_as_half(const short int i) -{ - __half h; - __HALF_TO_US(h) = (unsigned short int)i; - return h; -} -__CUDA_FP16_DECL__ __half __ushort_as_half(const 
unsigned short int i) -{ - __half h; - __HALF_TO_US(h) = i; - return h; -} - -#if __CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__) -/****************************************************************************** -* __half, __half2 warp shuffle * -******************************************************************************/ -#define __SHUFFLE_HALF2_MACRO(name) do {\ - __half2 r; \ - asm("{"#name" %0,%1,%2,%3;\n}" \ - :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c)); \ - return r; \ -} while(0); - -#define __SHUFFLE_SYNC_HALF2_MACRO(name) do {\ - __half2 r; \ - asm("{"#name" %0,%1,%2,%3,%4;\n}" \ - :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \ - return r; \ -} while(0); - -__CUDA_FP16_DECL__ __half2 __shfl(__half2 var, int delta, int width) -{ - int warpSize; - asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); - int c = ((warpSize - width) << 8) | 0x1f; - __SHUFFLE_HALF2_MACRO(shfl.idx.b32); -} -__CUDA_FP16_DECL__ __half2 __shfl_up(__half2 var, unsigned int delta, int width) -{ - int warpSize; - asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); - int c = (warpSize - width) << 8; - __SHUFFLE_HALF2_MACRO(shfl.up.b32); -} -__CUDA_FP16_DECL__ __half2 __shfl_down(__half2 var, unsigned int delta, int width) -{ - int warpSize; - asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); - int c = ((warpSize - width) << 8) | 0x1f; - __SHUFFLE_HALF2_MACRO(shfl.down.b32); -} -__CUDA_FP16_DECL__ __half2 __shfl_xor(__half2 var, int delta, int width) -{ - int warpSize; - asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); - int c = ((warpSize - width) << 8) | 0x1f; - __SHUFFLE_HALF2_MACRO(shfl.bfly.b32); -} - -__CUDA_FP16_DECL__ __half2 __shfl_sync(unsigned mask, __half2 var, int delta, int width) -{ - int warpSize; - asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); - int c = ((warpSize - width) << 8) | 0x1f; - __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.idx.b32); -} -__CUDA_FP16_DECL__ __half2 __shfl_up_sync(unsigned mask, __half2 var, unsigned int delta, int width) -{ - int warpSize; - asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); - int c = (warpSize - width) << 8; - __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.up.b32); -} -__CUDA_FP16_DECL__ __half2 __shfl_down_sync(unsigned mask, __half2 var, unsigned int delta, int width) -{ - int warpSize; - asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); - int c = ((warpSize - width) << 8) | 0x1f; - __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.down.b32); -} -__CUDA_FP16_DECL__ __half2 __shfl_xor_sync(unsigned mask, __half2 var, int delta, int width) -{ - int warpSize; - asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warpSize)); - int c = ((warpSize - width) << 8) | 0x1f; - __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.bfly.b32); -} - -#undef __SHUFFLE_HALF2_MACRO -#undef __SHUFFLE_SYNC_HALF2_MACRO - -__CUDA_FP16_DECL__ __half __shfl(__half var, int delta, int width) -{ - __half2 temp1 = __halves2half2(var, var); - __half2 temp2 = __shfl(temp1, delta, width); - return __low2half(temp2); -} -__CUDA_FP16_DECL__ __half __shfl_up(__half var, unsigned int delta, int width) -{ - __half2 temp1 = __halves2half2(var, var); - __half2 temp2 = __shfl_up(temp1, delta, width); - return __low2half(temp2); -} -__CUDA_FP16_DECL__ __half __shfl_down(__half var, unsigned int delta, int width) -{ - __half2 temp1 = __halves2half2(var, var); - __half2 temp2 = __shfl_down(temp1, delta, width); - return __low2half(temp2); -} -__CUDA_FP16_DECL__ __half __shfl_xor(__half var, int delta, int width) -{ - __half2 temp1 = __halves2half2(var, var); - __half2 temp2 = 
__shfl_xor(temp1, delta, width); - return __low2half(temp2); -} - -__CUDA_FP16_DECL__ __half __shfl_sync(unsigned mask, __half var, int delta, int width) -{ - __half2 temp1 = __halves2half2(var, var); - __half2 temp2 = __shfl_sync(mask, temp1, delta, width); - return __low2half(temp2); -} -__CUDA_FP16_DECL__ __half __shfl_up_sync(unsigned mask, __half var, unsigned int delta, int width) -{ - __half2 temp1 = __halves2half2(var, var); - __half2 temp2 = __shfl_up_sync(mask, temp1, delta, width); - return __low2half(temp2); -} -__CUDA_FP16_DECL__ __half __shfl_down_sync(unsigned mask, __half var, unsigned int delta, int width) -{ - __half2 temp1 = __halves2half2(var, var); - __half2 temp2 = __shfl_down_sync(mask, temp1, delta, width); - return __low2half(temp2); -} -__CUDA_FP16_DECL__ __half __shfl_xor_sync(unsigned mask, __half var, int delta, int width) -{ - __half2 temp1 = __halves2half2(var, var); - __half2 temp2 = __shfl_xor_sync(mask, temp1, delta, width); - return __low2half(temp2); -} - -#endif /*__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__)*/ -/****************************************************************************** -* __half and __half2 __ldg,__ldcg,__ldca,__ldcs * -******************************************************************************/ - -#if defined(__cplusplus) && (__CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__)) -#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) -#define __LDG_PTR "l" -#else -#define __LDG_PTR "r" -#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ -__CUDA_FP16_DECL__ __half2 __ldg(const __half2 *ptr) -{ - __half2 ret; - asm ("ld.global.nc.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); - return ret; -} -__CUDA_FP16_DECL__ __half __ldg(const __half *ptr) -{ - __half ret; - asm ("ld.global.nc.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); - return ret; -} -__CUDA_FP16_DECL__ __half2 __ldcg(const __half2 *ptr) -{ - __half2 ret; - asm ("ld.global.cg.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); - return ret; -} -__CUDA_FP16_DECL__ __half __ldcg(const __half *ptr) -{ - __half ret; - asm ("ld.global.cg.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); - return ret; -} -__CUDA_FP16_DECL__ __half2 __ldca(const __half2 *ptr) -{ - __half2 ret; - asm ("ld.global.ca.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); - return ret; -} -__CUDA_FP16_DECL__ __half __ldca(const __half *ptr) -{ - __half ret; - asm ("ld.global.ca.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); - return ret; -} -__CUDA_FP16_DECL__ __half2 __ldcs(const __half2 *ptr) -{ - __half2 ret; - asm ("ld.global.cs.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); - return ret; -} -__CUDA_FP16_DECL__ __half __ldcs(const __half *ptr) -{ - __half ret; - asm ("ld.global.cs.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); - return ret; -} -#undef __LDG_PTR -#endif /*defined(__cplusplus) && (__CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__))*/ -#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) -/****************************************************************************** -* __half2 comparison * -******************************************************************************/ -#define __COMPARISON_OP_HALF2_MACRO(name) do {\ - __half2 val; \ - asm( "{ "#name".f16x2.f16x2 %0,%1,%2;\n}" \ - :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ - return val; \ -} while(0); -__CUDA_FP16_DECL__ __half2 
__heq2(const __half2 a, const __half2 b) -{ - __COMPARISON_OP_HALF2_MACRO(set.eq); -} -__CUDA_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b) -{ - __COMPARISON_OP_HALF2_MACRO(set.ne); -} -__CUDA_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b) -{ - __COMPARISON_OP_HALF2_MACRO(set.le); -} -__CUDA_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b) -{ - __COMPARISON_OP_HALF2_MACRO(set.ge); -} -__CUDA_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b) -{ - __COMPARISON_OP_HALF2_MACRO(set.lt); -} -__CUDA_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b) -{ - __COMPARISON_OP_HALF2_MACRO(set.gt); -} -__CUDA_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b) -{ - __COMPARISON_OP_HALF2_MACRO(set.equ); -} -__CUDA_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b) -{ - __COMPARISON_OP_HALF2_MACRO(set.neu); -} -__CUDA_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b) -{ - __COMPARISON_OP_HALF2_MACRO(set.leu); -} -__CUDA_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b) -{ - __COMPARISON_OP_HALF2_MACRO(set.geu); -} -__CUDA_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b) -{ - __COMPARISON_OP_HALF2_MACRO(set.ltu); -} -__CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b) -{ - __COMPARISON_OP_HALF2_MACRO(set.gtu); -} -#undef __COMPARISON_OP_HALF2_MACRO -#define __BOOL_COMPARISON_OP_HALF2_MACRO(name) do {\ - __half2 val; \ - asm( "{ "#name".f16x2.f16x2 %0,%1,%2;\n}" \ - :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ - if (__HALF2_TO_CUI(val) == 0x3C003C00) \ - return true; \ - else \ - return false; \ -} while(0); -__CUDA_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b) -{ - __BOOL_COMPARISON_OP_HALF2_MACRO(set.eq); -} -__CUDA_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b) -{ - __BOOL_COMPARISON_OP_HALF2_MACRO(set.ne); -} -__CUDA_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b) -{ - __BOOL_COMPARISON_OP_HALF2_MACRO(set.le); -} -__CUDA_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b) -{ - __BOOL_COMPARISON_OP_HALF2_MACRO(set.ge); -} -__CUDA_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b) -{ - __BOOL_COMPARISON_OP_HALF2_MACRO(set.lt); -} -__CUDA_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b) -{ - __BOOL_COMPARISON_OP_HALF2_MACRO(set.gt); -} -__CUDA_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b) -{ - __BOOL_COMPARISON_OP_HALF2_MACRO(set.equ); -} -__CUDA_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b) -{ - __BOOL_COMPARISON_OP_HALF2_MACRO(set.neu); -} -__CUDA_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b) -{ - __BOOL_COMPARISON_OP_HALF2_MACRO(set.leu); -} -__CUDA_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b) -{ - __BOOL_COMPARISON_OP_HALF2_MACRO(set.geu); -} -__CUDA_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b) -{ - __BOOL_COMPARISON_OP_HALF2_MACRO(set.ltu); -} -__CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b) -{ - __BOOL_COMPARISON_OP_HALF2_MACRO(set.gtu); -} -#undef __BOOL_COMPARISON_OP_HALF2_MACRO -/****************************************************************************** -* __half comparison * -******************************************************************************/ -#define __COMPARISON_OP_HALF_MACRO(name) do {\ - unsigned short val; \ - asm( "{ .reg .pred __$temp3;\n" \ - " setp."#name".f16 __$temp3, %1, %2;\n" \ - " selp.u16 %0, 1, 0, __$temp3;}" \ - : "=h"(val) : "h"(__HALF_TO_CUS(a)), 
"h"(__HALF_TO_CUS(b))); \ - return val ? true : false; \ -} while(0); -__CUDA_FP16_DECL__ bool __heq(const __half a, const __half b) -{ - __COMPARISON_OP_HALF_MACRO(eq); -} -__CUDA_FP16_DECL__ bool __hne(const __half a, const __half b) -{ - __COMPARISON_OP_HALF_MACRO(ne); -} -__CUDA_FP16_DECL__ bool __hle(const __half a, const __half b) -{ - __COMPARISON_OP_HALF_MACRO(le); -} -__CUDA_FP16_DECL__ bool __hge(const __half a, const __half b) -{ - __COMPARISON_OP_HALF_MACRO(ge); -} -__CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b) -{ - __COMPARISON_OP_HALF_MACRO(lt); -} -__CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b) -{ - __COMPARISON_OP_HALF_MACRO(gt); -} -__CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b) -{ - __COMPARISON_OP_HALF_MACRO(equ); -} -__CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b) -{ - __COMPARISON_OP_HALF_MACRO(neu); -} -__CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b) -{ - __COMPARISON_OP_HALF_MACRO(leu); -} -__CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b) -{ - __COMPARISON_OP_HALF_MACRO(geu); -} -__CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b) -{ - __COMPARISON_OP_HALF_MACRO(ltu); -} -__CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b) -{ - __COMPARISON_OP_HALF_MACRO(gtu); -} -#undef __COMPARISON_OP_HALF_MACRO -/****************************************************************************** -* __half2 arithmetic * -******************************************************************************/ -#define __BINARY_OP_HALF2_MACRO(name) do {\ - __half2 val; \ - asm( "{"#name".f16x2 %0,%1,%2;\n}" \ - :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ - return val; \ -} while(0); - -__CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b) -{ - __BINARY_OP_HALF2_MACRO(add); -} -__CUDA_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b) -{ - __BINARY_OP_HALF2_MACRO(sub); -} -__CUDA_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b) -{ - __BINARY_OP_HALF2_MACRO(mul); -} -__CUDA_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b) -{ - __BINARY_OP_HALF2_MACRO(add.sat); -} -__CUDA_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b) -{ - __BINARY_OP_HALF2_MACRO(sub.sat); -} -__CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b) -{ - __BINARY_OP_HALF2_MACRO(mul.sat); -} -#undef __BINARY_OP_HALF2_MACRO -#define __TERNARY_OP_HALF2_MACRO(name) do {\ - __half2 val; \ - asm( "{"#name".f16x2 %0,%1,%2,%3;\n}" \ - :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b)),"r"(__HALF2_TO_CUI(c))); \ - return val; \ -} while(0); -__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c) -{ - __TERNARY_OP_HALF2_MACRO(fma.rn); -} -__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c) -{ - __TERNARY_OP_HALF2_MACRO(fma.rn.sat); -} -#undef __TERNARY_OP_HALF2_MACRO -__CUDA_FP16_DECL__ __half2 __h2div(__half2 a, __half2 b) { - __half ha, hb; - - ha = __low2half(a); - hb = __low2half(b); - - __half v1 = __hdiv(ha, hb); - - ha = __high2half(a); - hb = __high2half(b); - - __half v2 = __hdiv(ha, hb); - - return __halves2half2(v1, v2); -} -/****************************************************************************** -* __half arithmetic * -******************************************************************************/ -#define __BINARY_OP_HALF_MACRO(name) do {\ - __half val; \ - asm( "{"#name".f16 %0,%1,%2;\n}" \ - 
:"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b))); \ - return val; \ -} while(0); -__CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b) -{ - __BINARY_OP_HALF_MACRO(add); -} -__CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b) -{ - __BINARY_OP_HALF_MACRO(sub); -} -__CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b) -{ - __BINARY_OP_HALF_MACRO(mul); -} -__CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b) -{ - __BINARY_OP_HALF_MACRO(add.sat); -} -__CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b) -{ - __BINARY_OP_HALF_MACRO(sub.sat); -} -__CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b) -{ - __BINARY_OP_HALF_MACRO(mul.sat); -} -#undef __BINARY_OP_HALF_MACRO -#define __TERNARY_OP_HALF_MACRO(name) do {\ - __half val; \ - asm( "{"#name".f16 %0,%1,%2,%3;\n}" \ - :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b)),"h"(__HALF_TO_CUS(c))); \ - return val; \ -} while(0); -__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c) -{ - __TERNARY_OP_HALF_MACRO(fma.rn); -} -__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c) -{ - __TERNARY_OP_HALF_MACRO(fma.rn.sat); -} -#undef __TERNARY_OP_HALF2_MACRO -__CUDA_FP16_DECL__ __half __hdiv(__half a, __half b) { - __half v, abs, den; - __HALF_TO_US(den) = 0x008F; - float fa, fb, fv, rcp; - - fa = __half2float(a); - fb = __half2float(b); - - asm("{rcp.approx.f32 %0, %1;\n}" :"=f"(rcp) : "f"(fb)); - - fv = rcp * fa; - - v = __float2half(fv); - __HALF_TO_US(abs) = (unsigned short)(((unsigned int)__HALF_TO_CUS(v)) & 0x00007FFF); - if (__hlt(abs, den) && (!(__HALF_TO_CUS(abs) == 0x0000))) { - float err = __fmaf_rn(-fb, fv, fa); - fv = __fmaf_rn(rcp, err, fv); - v = __float2half(fv); - } - return v; -} - -/****************************************************************************** -* __half2 functions * -******************************************************************************/ -#define __SPEC_CASE2(i,r, spc, ulp) \ - "{.reg.b32 spc, ulp, p;\n"\ - " mov.b32 spc,"#spc";\n"\ - " mov.b32 ulp,"#ulp";\n"\ - " set.eq.f16x2.f16x2 p,"#i", spc;\n"\ - " fma.rn.f16x2 "#r",p,ulp,"#r";\n}\n" -#define __SPEC_CASE(i,r, spc, ulp) \ - "{.reg.b16 spc, ulp, p;\n"\ - " mov.b16 spc,"#spc";\n"\ - " mov.b16 ulp,"#ulp";\n"\ - " set.eq.f16.f16 p,"#i", spc;\n"\ - " fma.rn.f16 "#r",p,ulp,"#r";\n}\n" -#define __APPROX_FCAST(fun) do {\ - __half val;\ - asm("{.reg.b32 f; \n"\ - " .reg.b16 r; \n"\ - " mov.b16 r,%1; \n"\ - " cvt.f32.f16 f,r; \n"\ - " "#fun".approx.f32 f,f; \n"\ - " cvt.rn.f16.f32 r,f; \n"\ - " mov.b16 %0,r; \n"\ - "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));\ - return val;\ -} while(0); -#define __APPROX_FCAST2(fun) do {\ - __half2 val;\ - asm("{.reg.b16 hl, hu; \n"\ - " .reg.b32 fl, fu; \n"\ - " mov.b32 {hl, hu}, %1; \n"\ - " cvt.f32.f16 fl, hl; \n"\ - " cvt.f32.f16 fu, hu; \n"\ - " "#fun".approx.f32 fl, fl; \n"\ - " "#fun".approx.f32 fu, fu; \n"\ - " cvt.rn.f16.f32 hl, fl; \n"\ - " cvt.rn.f16.f32 hu, fu; \n"\ - " mov.b32 %0, {hl, hu}; \n"\ - "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); \ - return val;\ -} while(0); -static __device__ __forceinline__ float __float_simpl_sinf(float); -static __device__ __forceinline__ float __float_simpl_cosf(float); -__CUDA_FP16_DECL__ __half __hsin_internal(const __half a) { - float f = __half2float(a); - f = __float_simpl_sinf(f); - return __float2half_rn(f); -} -__CUDA_FP16_DECL__ __half hsin(const __half a) { - __half r = 
__hsin_internal(a); - asm("{\n\t" - " .reg.b16 i,r,t; \n\t" - " mov.b16 r, %0; \n\t" - " mov.b16 i, %1; \n\t" - " mov.b16 t, 0x8000; \n\t" - " and.b16 t,r,t; \n\t" - __SPEC_CASE(i, r, 0X32B3, 0x0800) - __SPEC_CASE(i, r, 0X5CB0, 0x1000) - __SPEC_CASE(i, r, 0XB2B3, 0x8800) - __SPEC_CASE(i, r, 0XDCB0, 0x9000) - " or.b16 r,r,t; \n\t" - " mov.b16 %0, r; \n" - "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); - return r; -} -__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a) { - __half l = __low2half(a); - __half h = __high2half(a); - __half2 r = __halves2half2(__hsin_internal(l), __hsin_internal(h)); - asm("{\n\t" - " .reg.b32 i,r,t; \n\t" - " mov.b32 r, %0; \n\t" - " mov.b32 i, %1; \n\t" - " and.b32 t, r, 0x80008000; \n\t" - __SPEC_CASE2(i, r, 0X32B332B3, 0x08000800) - __SPEC_CASE2(i, r, 0X5CB05CB0, 0x10001000) - __SPEC_CASE2(i, r, 0XB2B3B2B3, 0x88008800) - __SPEC_CASE2(i, r, 0XDCB0DCB0, 0x90009000) - " or.b32 r, r, t; \n\t" - " mov.b32 %0, r; \n" - "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); - return r; -} -__CUDA_FP16_DECL__ __half __hcos_internal(const __half a) { - float f = __half2float(a); - f = __float_simpl_cosf(f); - return __float2half_rn(f); -} -__CUDA_FP16_DECL__ __half hcos(const __half a) { - __half r = __hcos_internal(a); - asm("{\n\t" - " .reg.b16 i,r; \n\t" - " mov.b16 r, %0; \n\t" - " mov.b16 i, %1; \n\t" - __SPEC_CASE(i, r, 0X2B7C, 0x1000) - __SPEC_CASE(i, r, 0XAB7C, 0x1000) - " mov.b16 %0, r; \n" - "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); - return r; -} -__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a) { - __half l = __low2half(a); - __half h = __high2half(a); - __half2 r = __halves2half2(__hcos_internal(l), __hcos_internal(h)); - asm("{\n\t" - " .reg.b32 i,r; \n\t" - " mov.b32 r, %0; \n\t" - " mov.b32 i, %1; \n\t" - __SPEC_CASE2(i, r, 0X2B7C2B7C, 0x10001000) - __SPEC_CASE2(i, r, 0XAB7CAB7C, 0x10001000) - " mov.b32 %0, r; \n" - "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); - return r; -} -static __device__ __forceinline__ float __internal_trig_reduction_kernel(float a, int *quadrant) -{ - float j, t; - int q; - q = __float2int_rn(a * 0.636619772f); - j = (float)q; - t = __fmaf_rn(-j, 1.5707962512969971e+000f, a); - t = __fmaf_rn(-j, 7.5497894158615964e-008f, t); - *quadrant = q; - return t; -} -static __device__ __forceinline__ float __internal_sin_cos_kernel(float x, int i) -{ - float x2, z; - x2 = x*x; - - if (i & 1) { - z = 2.44331571e-5f; - z = __fmaf_rn(z, x2, -1.38873163e-3f); - } - else { - z = -1.95152959e-4f; - z = __fmaf_rn(z, x2, 8.33216087e-3f); - } - if (i & 1) { - z = __fmaf_rn(z, x2, 4.16666457e-2f); - z = __fmaf_rn(z, x2, -5.00000000e-1f); - } - else { - z = __fmaf_rn(z, x2, -1.66666546e-1f); - z = __fmaf_rn(z, x2, 0.0f); - } - x = __fmaf_rn(z, x, x); - if (i & 1) x = __fmaf_rn(z, x2, 1.0f); - if (i & 2) x = __fmaf_rn(x, -1.0f, 0.0f); - return x; -} -static __device__ __forceinline__ float __float_simpl_sinf(float a) -{ - float z; - int i; - if (isinf(a)) { - a = a * 0.0f; - } - a = __internal_trig_reduction_kernel(a, &i); - z = __internal_sin_cos_kernel(a, i); - return z; -} -static __device__ __forceinline__ float __float_simpl_cosf(float a) -{ - float z; - int i; - if (isinf(a)) { - a = a * 0.0f; - } - a = __internal_trig_reduction_kernel(a, &i); - i++; - z = __internal_sin_cos_kernel(a, i); - return z; -} -__CUDA_FP16_DECL__ __half hexp(const __half a) { - __half val; - asm("{.reg.b32 f, C; \n" - " .reg.b16 h,r; \n" - " mov.b16 h,%1; \n" - " cvt.f32.f16 f,h; \n" - " mov.b32 C, 0x3fb8aa3b; \n" - " mul.f32 
f,f,C; \n" - " ex2.approx.f32 f,f; \n" - " cvt.rn.f16.f32 r,f; \n" - __SPEC_CASE(h, r, 0X1F79, 0x9400) - __SPEC_CASE(h, r, 0X25CF, 0x9400) - __SPEC_CASE(h, r, 0XC13B, 0x0400) - __SPEC_CASE(h, r, 0XC1EF, 0x0200) - " mov.b16 %0,r; \n" - "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); - return val; -} -__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a) { - __half2 val; - asm("{.reg.b16 hl, hu; \n" - " .reg.b32 h,r,fl,fu, C; \n" - " mov.b32 {hl, hu}, %1; \n" - " mov.b32 h, %1; \n" - " cvt.f32.f16 fl, hl; \n" - " cvt.f32.f16 fu, hu; \n" - " mov.b32 C, 0x3fb8aa3b; \n" - " mul.f32 fl,fl,C; \n" - " mul.f32 fu,fu,C; \n" - " ex2.approx.f32 fl, fl; \n" - " ex2.approx.f32 fu, fu; \n" - " cvt.rn.f16.f32 hl, fl; \n" - " cvt.rn.f16.f32 hu, fu; \n" - " mov.b32 r, {hl, hu}; \n" - __SPEC_CASE2(h, r, 0X1F791F79, 0x94009400) - __SPEC_CASE2(h, r, 0X25CF25CF, 0x94009400) - __SPEC_CASE2(h, r, 0XC13BC13B, 0x04000400) - __SPEC_CASE2(h, r, 0XC1EFC1EF, 0x02000200) - " mov.b32 %0, r; \n" - "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); - return val; -} -__CUDA_FP16_DECL__ __half hexp2(const __half a) { - __half val; - asm("{.reg.b32 f, ULP; \n" - " .reg.b16 r; \n" - " mov.b16 r,%1; \n" - " cvt.f32.f16 f,r; \n" - " ex2.approx.f32 f,f; \n" - " mov.b32 ULP, 0x33800000;\n" - " fma.rn.f32 f,f,ULP,f; \n" - " cvt.rn.f16.f32 r,f; \n" - " mov.b16 %0,r; \n" - "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); - return val; -} -__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a) { - __half2 val; - asm("{.reg.b16 hl, hu; \n" - " .reg.b32 fl, fu, ULP; \n" - " mov.b32 {hl, hu}, %1; \n" - " cvt.f32.f16 fl, hl; \n" - " cvt.f32.f16 fu, hu; \n" - " ex2.approx.f32 fl, fl; \n" - " ex2.approx.f32 fu, fu; \n" - " mov.b32 ULP, 0x33800000;\n" - " fma.rn.f32 fl,fl,ULP,fl; \n" - " fma.rn.f32 fu,fu,ULP,fu; \n" - " cvt.rn.f16.f32 hl, fl; \n" - " cvt.rn.f16.f32 hu, fu; \n" - " mov.b32 %0, {hl, hu}; \n" - "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); - return val; -} -__CUDA_FP16_DECL__ __half hexp10(const __half a) { - __half val; - asm("{.reg.b16 h,r; \n" - " .reg.b32 f, C; \n" - " mov.b16 h, %1; \n" - " cvt.f32.f16 f, h; \n" - " mov.b32 C, 0x40549A78; \n" - " mul.f32 f,f,C; \n" - " ex2.approx.f32 f, f; \n" - " cvt.rn.f16.f32 r, f; \n" - __SPEC_CASE(h, r, 0x34DE, 0x9800) - __SPEC_CASE(h, r, 0x9766, 0x9000) - __SPEC_CASE(h, r, 0x9972, 0x1000) - __SPEC_CASE(h, r, 0xA5C4, 0x1000) - __SPEC_CASE(h, r, 0xBF0A, 0x8100) - " mov.b16 %0, r; \n" - "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); - return val; -} -__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a) { - __half2 val; - asm("{.reg.b16 hl, hu; \n" - " .reg.b32 h,r,fl,fu, C; \n" - " mov.b32 {hl, hu}, %1; \n" - " mov.b32 h, %1; \n" - " cvt.f32.f16 fl, hl; \n" - " cvt.f32.f16 fu, hu; \n" - " mov.b32 C, 0x40549A78; \n" - " mul.f32 fl,fl,C; \n" - " mul.f32 fu,fu,C; \n" - " ex2.approx.f32 fl, fl; \n" - " ex2.approx.f32 fu, fu; \n" - " cvt.rn.f16.f32 hl, fl; \n" - " cvt.rn.f16.f32 hu, fu; \n" - " mov.b32 r, {hl, hu}; \n" - __SPEC_CASE2(h, r, 0x34DE34DE, 0x98009800) - __SPEC_CASE2(h, r, 0x97669766, 0x90009000) - __SPEC_CASE2(h, r, 0x99729972, 0x10001000) - __SPEC_CASE2(h, r, 0xA5C4A5C4, 0x10001000) - __SPEC_CASE2(h, r, 0xBF0ABF0A, 0x81008100) - " mov.b32 %0, r; \n" - "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); - return val; -} -__CUDA_FP16_DECL__ __half hlog2(const __half a) { - __half val; - asm("{.reg.b16 h, r; \n" - " .reg.b32 f; \n" - " mov.b16 h, %1; \n" - " cvt.f32.f16 f, h; \n" - " lg2.approx.f32 f, f; \n" - " cvt.rn.f16.f32 r, f; \n" - 
__SPEC_CASE(r, r, 0xA2E2, 0x8080) - __SPEC_CASE(r, r, 0xBF46, 0x9400) - " mov.b16 %0, r; \n" - "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); - return val; -} -__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a) { - __half2 val; - asm("{.reg.b16 hl, hu; \n" - " .reg.b32 fl, fu, r, p; \n" - " mov.b32 {hl, hu}, %1; \n" - " cvt.f32.f16 fl, hl; \n" - " cvt.f32.f16 fu, hu; \n" - " lg2.approx.f32 fl, fl; \n" - " lg2.approx.f32 fu, fu; \n" - " cvt.rn.f16.f32 hl, fl; \n" - " cvt.rn.f16.f32 hu, fu; \n" - " mov.b32 r, {hl, hu}; \n" - __SPEC_CASE2(r, r, 0xA2E2A2E2, 0x80808080) - __SPEC_CASE2(r, r, 0xBF46BF46, 0x94009400) - " mov.b32 %0, r; \n" - "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); - return val; -} -__CUDA_FP16_DECL__ __half hlog(const __half a) { - __half val; - asm("{.reg.b32 f, C; \n" - " .reg.b16 r,h; \n" - " mov.b16 h,%1; \n" - " cvt.f32.f16 f,h; \n" - " lg2.approx.f32 f,f; \n" - " mov.b32 C, 0x3f317218; \n" - " mul.f32 f,f,C; \n" - " cvt.rn.f16.f32 r,f; \n" - __SPEC_CASE(h, r, 0X160D, 0x9C00) - __SPEC_CASE(h, r, 0X3BFE, 0x8010) - __SPEC_CASE(h, r, 0X3C0B, 0x8080) - __SPEC_CASE(h, r, 0X6051, 0x1C00) - " mov.b16 %0,r; \n" - "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); - return val; -} -__CUDA_FP16_DECL__ __half2 h2log(const __half2 a) { - __half2 val; - asm("{.reg.b16 hl, hu; \n" - " .reg.b32 r, fl, fu, C, h; \n" - " mov.b32 {hl, hu}, %1; \n" - " mov.b32 h, %1; \n" - " cvt.f32.f16 fl, hl; \n" - " cvt.f32.f16 fu, hu; \n" - " lg2.approx.f32 fl, fl; \n" - " lg2.approx.f32 fu, fu; \n" - " mov.b32 C, 0x3f317218; \n" - " mul.f32 fl,fl,C; \n" - " mul.f32 fu,fu,C; \n" - " cvt.rn.f16.f32 hl, fl; \n" - " cvt.rn.f16.f32 hu, fu; \n" - " mov.b32 r, {hl, hu}; \n" - __SPEC_CASE2(h, r, 0X160D160D, 0x9C009C00) - __SPEC_CASE2(h, r, 0X3BFE3BFE, 0x80108010) - __SPEC_CASE2(h, r, 0X3C0B3C0B, 0x80808080) - __SPEC_CASE2(h, r, 0X60516051, 0x1C001C00) - " mov.b32 %0, r; \n" - "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); - return val; -} -__CUDA_FP16_DECL__ __half hlog10(const __half a) { - __half val; - asm("{.reg.b16 h, r; \n" - " .reg.b32 f, C; \n" - " mov.b16 h, %1; \n" - " cvt.f32.f16 f, h; \n" - " lg2.approx.f32 f, f; \n" - " mov.b32 C, 0x3E9A209B; \n" - " mul.f32 f,f,C; \n" - " cvt.rn.f16.f32 r, f; \n" - __SPEC_CASE(h, r, 0x338F, 0x1000) - __SPEC_CASE(h, r, 0x33F8, 0x9000) - __SPEC_CASE(h, r, 0x57E1, 0x9800) - __SPEC_CASE(h, r, 0x719D, 0x9C00) - " mov.b16 %0, r; \n" - "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); - return val; -} -__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a) { - __half2 val; - asm("{.reg.b16 hl, hu; \n" - " .reg.b32 r, fl, fu, C, h; \n" - " mov.b32 {hl, hu}, %1; \n" - " mov.b32 h, %1; \n" - " cvt.f32.f16 fl, hl; \n" - " cvt.f32.f16 fu, hu; \n" - " lg2.approx.f32 fl, fl; \n" - " lg2.approx.f32 fu, fu; \n" - " mov.b32 C, 0x3E9A209B; \n" - " mul.f32 fl,fl,C; \n" - " mul.f32 fu,fu,C; \n" - " cvt.rn.f16.f32 hl, fl; \n" - " cvt.rn.f16.f32 hu, fu; \n" - " mov.b32 r, {hl, hu}; \n" - __SPEC_CASE2(h, r, 0x338F338F, 0x10001000) - __SPEC_CASE2(h, r, 0x33F833F8, 0x90009000) - __SPEC_CASE2(h, r, 0x57E157E1, 0x98009800) - __SPEC_CASE2(h, r, 0x719D719D, 0x9C009C00) - " mov.b32 %0, r; \n" - "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); - return val; -} -#undef __SPEC_CASE2 -#undef __SPEC_CASE -__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a) { - __APPROX_FCAST2(rcp); -} -__CUDA_FP16_DECL__ __half hrcp(const __half a) { - __APPROX_FCAST(rcp); -} -__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a) { - __APPROX_FCAST2(rsqrt); -} 
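The transcendental implementations above all share one shape: widen the half to f32, evaluate a fast approximate PTX op (lg2.approx, ex2.approx, rcp.approx, rsqrt.approx), narrow back with round-to-nearest, then use the __SPEC_CASE blocks to nudge a short list of specific input encodings by an ULP or two where the generic path lands on the wrong side of a rounding boundary. A sketch of the generic path only, written with public intrinsics instead of inline PTX; hlog_approx is a hypothetical name for illustration, not a function from this header, and it omits the per-input corrections:

#include <cstdio>
#include <cuda_fp16.h>

// Hypothetical helper: the widen / approximate / narrow shape of hlog,
// without the __SPEC_CASE ULP corrections.
__device__ __half hlog_approx(const __half a) {
    const float f = __half2float(a);  // widen to f32
    const float l = __logf(f);        // fast f32 log (built on lg2.approx)
    return __float2half_rn(l);        // narrow, round to nearest even
}

__global__ void k() {
    printf("%f\n", __half2float(hlog_approx(__float2half(2.0f))));  // ~0.693
}

int main() {
    k<<<1, 1>>>();
    cudaDeviceSynchronize();
    return 0;
}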
-__CUDA_FP16_DECL__ __half hrsqrt(const __half a) { - __APPROX_FCAST(rsqrt); -} -__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a) { - __APPROX_FCAST2(sqrt); -} -__CUDA_FP16_DECL__ __half hsqrt(const __half a) { - __APPROX_FCAST(sqrt); -} -#undef __APPROX_FCAST -#undef __APPROX_FCAST2 -__CUDA_FP16_DECL__ __half2 __hisnan2(const __half2 a) -{ - __half2 r; - asm("{set.nan.f16x2.f16x2 %0,%1,%2;\n}" - :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(a))); - return r; -} -__CUDA_FP16_DECL__ bool __hisnan(const __half a) -{ - __half r; - asm("{set.nan.f16.f16 %0,%1,%2;\n}" - :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(a))); - if (__HALF_TO_CUS(r) == 0) - return false; - else return true; -} -__CUDA_FP16_DECL__ __half2 __hneg2(const __half2 a) -{ - __half2 zero = __float2half2_rn(0.0); - return __hsub2(zero, a); -} -__CUDA_FP16_DECL__ __half __hneg(const __half a) -{ - __half zero; - zero = __float2half(0.0); - return __hsub(zero, a); -} -#endif /*__CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)*/ - -#undef __CUDA_FP16_DECL__ -#endif /* defined(__CUDACC__) */ -#endif /* defined(__cplusplus) */ - -#undef __HALF_TO_US -#undef __HALF_TO_CUS -#undef __HALF2_TO_UI -#undef __HALF2_TO_CUI - - -/* Define first-class types "half" and "half2", unless user specifies otherwise via "#define CUDA_NO_HALF" */ -/* C cannot ever have these types defined here, because __half and __half2 are C++ classes */ -#if defined(__cplusplus) && !defined(CUDA_NO_HALF) -typedef __half half; -typedef __half2 half2; -#endif /* defined(__cplusplus) && !defined(CUDA_NO_HALF) */ - -#endif /* end of include guard: __CUDA_FP16_HPP__ */ diff --git a/include/triton/external/CUDA/cuda_runtime.h b/include/triton/external/CUDA/cuda_runtime.h deleted file mode 100755 index ce880e055..000000000 --- a/include/triton/external/CUDA/cuda_runtime.h +++ /dev/null @@ -1,2040 +0,0 @@ -/* - * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
- * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. - */ - -#if !defined(__CUDA_RUNTIME_H__) -#define __CUDA_RUNTIME_H__ - -#if !defined(__CUDACC_RTC__) -#if defined(__GNUC__) -#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))) -#pragma GCC diagnostic push -#endif -#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))) -#pragma GCC diagnostic ignored "-Wunused-function" -#endif -#elif defined(_MSC_VER) -#pragma warning(push) -#pragma warning(disable: 4820) -#endif -#endif - -#ifdef __QNX__ -#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 7) -typedef unsigned size_t; -#endif -#endif -/******************************************************************************* -* * -* * -* * -*******************************************************************************/ - -#include "host_config.h" - -/******************************************************************************* -* * -* * -* * -*******************************************************************************/ - -#include "builtin_types.h" -#include "library_types.h" -#if !defined(__CUDACC_RTC__) -#define EXCLUDE_FROM_RTC -#include "channel_descriptor.h" -#include "cuda_runtime_api.h" -#include "driver_functions.h" -#undef EXCLUDE_FROM_RTC -#endif /* !__CUDACC_RTC__ */ -#include "host_defines.h" -#include "vector_functions.h" - -#if defined(__CUDACC__) - -#if defined(__CUDACC_RTC__) -#include "nvrtc_device_runtime.h" -#include "device_functions.h" - -extern __host__ __device__ unsigned cudaConfigureCall(dim3 gridDim, - dim3 blockDim, - size_t sharedMem = 0, - void *stream = 0); -#include "common_functions.h" -#include "cuda_surface_types.h" -#include "cuda_texture_types.h" -#include "device_launch_parameters.h" - -#else /* !__CUDACC_RTC__ */ -#define EXCLUDE_FROM_RTC -#include "common_functions.h" -#include "cuda_surface_types.h" -#include "cuda_texture_types.h" -#include "device_functions.h" -#include "device_launch_parameters.h" - -#if defined(__CUDACC_EXTENDED_LAMBDA__) -#include -#include -struct __device_builtin__ __nv_lambda_preheader_injection { }; -#endif /* defined(__CUDACC_EXTENDED_LAMBDA__) */ - -#undef EXCLUDE_FROM_RTC -#endif /* __CUDACC_RTC__ */ - -#endif /* __CUDACC__ */ - -#if defined(__cplusplus) && !defined(__CUDACC_RTC__) - 
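
Before any declarations, the header brackets itself with a diagnostic push so that the -Wunused-function warnings its many static __inline__ definitions would otherwise trigger stay contained until the matching pop. The pattern in isolation (a generic sketch, not taken from this header):

    // Compile with: g++ -Wall -Wextra -c example.cpp
    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Wunused-function"
    static int helper_nobody_calls() { return 42; }  // would warn under -Wunused-function
    #pragma GCC diagnostic pop
    // Warnings behave normally again from here on.
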
-/******************************************************************************* -* * -* * -* * -*******************************************************************************/ - -/** - * \addtogroup CUDART_HIGHLEVEL - * @{ - */ - -/** - *\brief Launches a device function - * - * The function invokes kernel \p func on \p gridDim (\p gridDim.x × \p gridDim.y - * × \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x × - * \p blockDim.y × \p blockDim.z) threads. - * - * If the kernel has N parameters the \p args should point to array of N pointers. - * Each pointer, from args[0] to args[N - 1], point to the region - * of memory from which the actual parameter will be copied. - * - * \p sharedMem sets the amount of dynamic shared memory that will be available to - * each thread block. - * - * \p stream specifies a stream the invocation is associated to. - * - * \param func - Device function symbol - * \param gridDim - Grid dimentions - * \param blockDim - Block dimentions - * \param args - Arguments - * \param sharedMem - Shared memory (defaults to 0) - * \param stream - Stream identifier (defaults to NULL) - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidDeviceFunction, - * ::cudaErrorInvalidConfiguration, - * ::cudaErrorLaunchFailure, - * ::cudaErrorLaunchTimeout, - * ::cudaErrorLaunchOutOfResources, - * ::cudaErrorSharedObjectInitFailed, - * ::cudaErrorInvalidPtx, - * ::cudaErrorNoKernelImageForDevice, - * ::cudaErrorJitCompilerNotFound - * \notefnerr - * \note_async - * \note_null_stream - * - * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)" - */ -template -static __inline__ __host__ cudaError_t cudaLaunchKernel( - const T *func, - dim3 gridDim, - dim3 blockDim, - void **args, - size_t sharedMem = 0, - cudaStream_t stream = 0 -) -{ - return ::cudaLaunchKernel((const void *)func, gridDim, blockDim, args, sharedMem, stream); -} - -/** - *\brief Launches a device function - * - * The function invokes kernel \p func on \p gridDim (\p gridDim.x × \p gridDim.y - * × \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x × - * \p blockDim.y × \p blockDim.z) threads. - * - * The device on which this kernel is invoked must have a non-zero value for - * the device attribute ::cudaDevAttrCooperativeLaunch. - * - * The total number of blocks launched cannot exceed the maximum number of blocks per - * multiprocessor as returned by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor (or - * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors - * as specified by the device attribute ::cudaDevAttrMultiProcessorCount. - * - * The kernel cannot make use of CUDA dynamic parallelism. - * - * If the kernel has N parameters the \p args should point to array of N pointers. - * Each pointer, from args[0] to args[N - 1], point to the region - * of memory from which the actual parameter will be copied. - * - * \p sharedMem sets the amount of dynamic shared memory that will be available to - * each thread block. - * - * \p stream specifies a stream the invocation is associated to. 
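
A minimal usage sketch for the wrapper above (the kernel axpy and its launch geometry are hypothetical): the args array holds one pointer per kernel parameter, each pointing at the storage the runtime copies that argument from.

    __global__ void axpy(float a, const float *x, float *y, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) y[i] += a * x[i];
    }

    cudaError_t launch_axpy(float a, const float *x, float *y, int n, cudaStream_t stream) {
        void *args[] = { &a, &x, &y, &n };          // N pointers for N parameters
        dim3 block(256);
        dim3 grid((n + block.x - 1) / block.x);
        return cudaLaunchKernel(axpy, grid, block, args, 0 /*sharedMem*/, stream);
    }
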
- * - * \param func - Device function symbol - * \param gridDim - Grid dimentions - * \param blockDim - Block dimentions - * \param args - Arguments - * \param sharedMem - Shared memory (defaults to 0) - * \param stream - Stream identifier (defaults to NULL) - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidDeviceFunction, - * ::cudaErrorInvalidConfiguration, - * ::cudaErrorLaunchFailure, - * ::cudaErrorLaunchTimeout, - * ::cudaErrorLaunchOutOfResources, - * ::cudaErrorSharedObjectInitFailed - * \notefnerr - * \note_async - * \note_null_stream - * - * \ref ::cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchCooperativeKernel (C API)" - */ -template -static __inline__ __host__ cudaError_t cudaLaunchCooperativeKernel( - const T *func, - dim3 gridDim, - dim3 blockDim, - void **args, - size_t sharedMem = 0, - cudaStream_t stream = 0 -) -{ - return ::cudaLaunchCooperativeKernel((const void *)func, gridDim, blockDim, args, sharedMem, stream); -} - -/** - * \brief \hl Configure a device launch - * - * \deprecated This function is deprecated as of CUDA 7.0 - * - * Pushes \p size bytes of the argument pointed to by \p arg at \p offset - * bytes from the start of the parameter passing area, which starts at - * offset 0. The arguments are stored in the top of the execution stack. - * \ref ::cudaSetupArgument(T, size_t) "cudaSetupArgument()" must be preceded - * by a call to ::cudaConfigureCall(). - * - * \param arg - Argument to push for a kernel launch - * \param offset - Offset in argument stack to push new arg - * - * \return - * ::cudaSuccess - * \notefnerr - * - * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)", - * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)", - * \ref ::cudaLaunch(T*) "cudaLaunch (C++ API)", - * ::cudaSetDoubleForDevice, - * ::cudaSetDoubleForHost, - * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)" - */ -template -static __inline__ __host__ cudaError_t cudaSetupArgument( - T arg, - size_t offset -) -{ - return ::cudaSetupArgument((const void*)&arg, sizeof(T), offset); -} - -/** - * \brief \hl Creates an event object with the specified flags - * - * Creates an event object with the specified flags. Valid flags include: - * - ::cudaEventDefault: Default event creation flag. - * - ::cudaEventBlockingSync: Specifies that event should use blocking - * synchronization. A host thread that uses ::cudaEventSynchronize() to wait - * on an event created with this flag will block until the event actually - * completes. - * - ::cudaEventDisableTiming: Specifies that the created event does not need - * to record timing data. Events created with this flag specified and - * the ::cudaEventBlockingSync flag not specified will provide the best - * performance when used with ::cudaStreamWaitEvent() and ::cudaEventQuery(). 
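
A sketch of what the cooperative variant buys over a plain launch: a grid-wide barrier between two phases of a single kernel (kernel and sizes hypothetical; the device must report a non-zero ::cudaDevAttrCooperativeLaunch and the grid must fit co-resident, per the occupancy bound above):

    #include <cooperative_groups.h>
    namespace cg = cooperative_groups;

    __global__ void two_phase(float *data, int n) {
        cg::grid_group grid = cg::this_grid();
        int i = (int)grid.thread_rank();
        if (i < n) data[i] *= 2.0f;                 // phase 1
        grid.sync();                                // legal only under a cooperative launch
        if (i > 0 && i < n) data[i] += data[0];     // phase 2 reads a phase-1 result
    }

    void launch_two_phase(float *d_data, int n, cudaStream_t stream) {
        void *args[] = { &d_data, &n };
        cudaLaunchCooperativeKernel(two_phase, dim3(40), dim3(256), args, 0, stream);
    }
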
- * - * \param event - Newly created event - * \param flags - Flags for new event - * - * \return - * ::cudaSuccess, - * ::cudaErrorInitializationError, - * ::cudaErrorInvalidValue, - * ::cudaErrorLaunchFailure, - * ::cudaErrorMemoryAllocation - * \notefnerr - * - * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)", - * ::cudaEventCreateWithFlags, ::cudaEventRecord, ::cudaEventQuery, - * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime, - * ::cudaStreamWaitEvent - */ -static __inline__ __host__ cudaError_t cudaEventCreate( - cudaEvent_t *event, - unsigned int flags -) -{ - return ::cudaEventCreateWithFlags(event, flags); -} - -/** - * \brief \hl Allocates page-locked memory on the host - * - * Allocates \p size bytes of host memory that is page-locked and accessible - * to the device. The driver tracks the virtual memory ranges allocated with - * this function and automatically accelerates calls to functions such as - * ::cudaMemcpy(). Since the memory can be accessed directly by the device, it - * can be read or written with much higher bandwidth than pageable memory - * obtained with functions such as ::malloc(). Allocating excessive amounts of - * pinned memory may degrade system performance, since it reduces the amount - * of memory available to the system for paging. As a result, this function is - * best used sparingly to allocate staging areas for data exchange between host - * and device. - * - * The \p flags parameter enables different options to be specified that affect - * the allocation, as follows. - * - ::cudaHostAllocDefault: This flag's value is defined to be 0. - * - ::cudaHostAllocPortable: The memory returned by this call will be - * considered as pinned memory by all CUDA contexts, not just the one that - * performed the allocation. - * - ::cudaHostAllocMapped: Maps the allocation into the CUDA address space. - * The device pointer to the memory may be obtained by calling - * ::cudaHostGetDevicePointer(). - * - ::cudaHostAllocWriteCombined: Allocates the memory as write-combined (WC). - * WC memory can be transferred across the PCI Express bus more quickly on some - * system configurations, but cannot be read efficiently by most CPUs. WC - * memory is a good option for buffers that will be written by the CPU and read - * by the device via mapped pinned memory or host->device transfers. - * - * All of these flags are orthogonal to one another: a developer may allocate - * memory that is portable, mapped and/or write-combined with no restrictions. - * - * ::cudaSetDeviceFlags() must have been called with the ::cudaDeviceMapHost - * flag in order for the ::cudaHostAllocMapped flag to have any effect. - * - * The ::cudaHostAllocMapped flag may be specified on CUDA contexts for devices - * that do not support mapped pinned memory. The failure is deferred to - * ::cudaHostGetDevicePointer() because the memory may be mapped into other - * CUDA contexts via the ::cudaHostAllocPortable flag. - * - * Memory allocated by this function must be freed with ::cudaFreeHost(). 
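
Illustrative use of the flags overload above: two default events timing a stream section (a ::cudaEventDisableTiming event would be the cheaper choice when only synchronization is needed).

    void time_section(cudaStream_t stream) {
        cudaEvent_t start, stop;
        cudaEventCreate(&start, cudaEventDefault);   // dispatches to cudaEventCreateWithFlags
        cudaEventCreate(&stop, cudaEventDefault);
        cudaEventRecord(start, stream);
        // ... launches on `stream` ...
        cudaEventRecord(stop, stream);
        cudaEventSynchronize(stop);
        float ms = 0.0f;
        cudaEventElapsedTime(&ms, start, stop);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }
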
- * - * \param ptr - Device pointer to allocated memory - * \param size - Requested allocation size in bytes - * \param flags - Requested properties of allocated memory - * - * \return - * ::cudaSuccess, - * ::cudaErrorMemoryAllocation - * \notefnerr - * - * \sa ::cudaSetDeviceFlags, - * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", - * ::cudaFreeHost, ::cudaHostAlloc - */ -static __inline__ __host__ cudaError_t cudaMallocHost( - void **ptr, - size_t size, - unsigned int flags -) -{ - return ::cudaHostAlloc(ptr, size, flags); -} - -template -static __inline__ __host__ cudaError_t cudaHostAlloc( - T **ptr, - size_t size, - unsigned int flags -) -{ - return ::cudaHostAlloc((void**)(void*)ptr, size, flags); -} - -template -static __inline__ __host__ cudaError_t cudaHostGetDevicePointer( - T **pDevice, - void *pHost, - unsigned int flags -) -{ - return ::cudaHostGetDevicePointer((void**)(void*)pDevice, pHost, flags); -} - -/** - * \brief Allocates memory that will be automatically managed by the Unified Memory system - * - * Allocates \p size bytes of managed memory on the device and returns in - * \p *devPtr a pointer to the allocated memory. If the device doesn't support - * allocating managed memory, ::cudaErrorNotSupported is returned. Support - * for managed memory can be queried using the device attribute - * ::cudaDevAttrManagedMemory. The allocated memory is suitably - * aligned for any kind of variable. The memory is not cleared. If \p size - * is 0, ::cudaMallocManaged returns ::cudaErrorInvalidValue. The pointer - * is valid on the CPU and on all GPUs in the system that support managed memory. - * All accesses to this pointer must obey the Unified Memory programming model. - * - * \p flags specifies the default stream association for this allocation. - * \p flags must be one of ::cudaMemAttachGlobal or ::cudaMemAttachHost. The - * default value for \p flags is ::cudaMemAttachGlobal. - * If ::cudaMemAttachGlobal is specified, then this memory is accessible from - * any stream on any device. If ::cudaMemAttachHost is specified, then the - * allocation should not be accessed from devices that have a zero value for the - * device attribute ::cudaDevAttrConcurrentManagedAccess; an explicit call to - * ::cudaStreamAttachMemAsync will be required to enable access on such devices. - * - * If the association is later changed via ::cudaStreamAttachMemAsync to - * a single stream, the default association, as specifed during ::cudaMallocManaged, - * is restored when that stream is destroyed. For __managed__ variables, the - * default association is always ::cudaMemAttachGlobal. Note that destroying a - * stream is an asynchronous operation, and as a result, the change to default - * association won't happen until all work in the stream has completed. - * - * Memory allocated with ::cudaMallocManaged should be released with ::cudaFree. - * - * Device memory oversubscription is possible for GPUs that have a non-zero value for the - * device attribute ::cudaDevAttrConcurrentManagedAccess. Managed memory on - * such GPUs may be evicted from device memory to host memory at any time by the Unified - * Memory driver in order to make room for other allocations. - * - * In a multi-GPU system where all GPUs have a non-zero value for the device attribute - * ::cudaDevAttrConcurrentManagedAccess, managed memory may not be populated when this - * API returns and instead may be populated on access. In such systems, managed memory can - * migrate to any processor's memory at any time. 
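
A common use of the pinned-memory allocators documented above: a page-locked staging buffer so the host-to-device copy can run asynchronously at full bandwidth (names hypothetical):

    #include <cstring>

    void stage_upload(float *d_buf, const float *src, size_t count, cudaStream_t stream) {
        float *h_staging = nullptr;
        cudaHostAlloc(&h_staging, count * sizeof(float), cudaHostAllocDefault); // typed overload
        std::memcpy(h_staging, src, count * sizeof(float));
        cudaMemcpyAsync(d_buf, h_staging, count * sizeof(float),
                        cudaMemcpyHostToDevice, stream);
        cudaStreamSynchronize(stream);   // staging buffer must outlive the async copy
        cudaFreeHost(h_staging);
    }
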
The Unified Memory driver will employ heuristics to - * maintain data locality and prevent excessive page faults to the extent possible. The application - * can also guide the driver about memory usage patterns via ::cudaMemAdvise. The application - * can also explicitly migrate memory to a desired processor's memory via - * ::cudaMemPrefetchAsync. - * - * In a multi-GPU system where all of the GPUs have a zero value for the device attribute - * ::cudaDevAttrConcurrentManagedAccess and all the GPUs have peer-to-peer support - * with each other, the physical storage for managed memory is created on the GPU which is active - * at the time ::cudaMallocManaged is called. All other GPUs will reference the data at reduced - * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate - * memory among such GPUs. - * - * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and - * where the value of the device attribute ::cudaDevAttrConcurrentManagedAccess - * is zero for at least one of those GPUs, the location chosen for physical storage of managed - * memory is system-dependent. - * - On Linux, the location chosen will be device memory as long as the current set of active - * contexts are on devices that either have peer-to-peer support with each other or have a - * non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess. - * If there is an active context on a GPU that does not have a non-zero value for that device - * attribute and it does not have peer-to-peer support with the other devices that have active - * contexts on them, then the location for physical storage will be 'zero-copy' or host memory. - * Note that this means that managed memory that is located in device memory is migrated to - * host memory if a new context is created on a GPU that doesn't have a non-zero value for - * the device attribute and does not support peer-to-peer with at least one of the other devices - * that has an active context. This in turn implies that context creation may fail if there is - * insufficient host memory to migrate all managed allocations. - * - On Windows, the physical storage is always created in 'zero-copy' or host memory. - * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these - * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to - * restrict CUDA to only use those GPUs that have peer-to-peer support. - * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a non-zero - * value to force the driver to always use device memory for physical storage. - * When this environment variable is set to a non-zero value, all devices used in - * that process that support managed memory have to be peer-to-peer compatible - * with each other. The error ::cudaErrorInvalidDevice will be returned if a device - * that supports managed memory is used and it is not peer-to-peer compatible with - * any of the other managed memory supporting devices that were previously used in - * that process, even if ::cudaDeviceReset has been called on those devices. These - * environment variables are described in the CUDA programming guide under the - * "CUDA environment variables" section. - * - On ARM, managed memory is not available on discrete gpu with Drive PX-2. 
- * - * \param devPtr - Pointer to allocated device memory - * \param size - Requested allocation size in bytes - * \param flags - Must be either ::cudaMemAttachGlobal or ::cudaMemAttachHost (defaults to ::cudaMemAttachGlobal) - * - * \return - * ::cudaSuccess, - * ::cudaErrorMemoryAllocation - * ::cudaErrorNotSupported - * ::cudaErrorInvalidValue - * - * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray, - * ::cudaMalloc3D, ::cudaMalloc3DArray, - * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", - * ::cudaFreeHost, ::cudaHostAlloc, ::cudaDeviceGetAttribute, ::cudaStreamAttachMemAsync - */ -template -static __inline__ __host__ cudaError_t cudaMallocManaged( - T **devPtr, - size_t size, - unsigned int flags = cudaMemAttachGlobal -) -{ - return ::cudaMallocManaged((void**)(void*)devPtr, size, flags); -} - -/** - * \brief Attach memory to a stream asynchronously - * - * Enqueues an operation in \p stream to specify stream association of - * \p length bytes of memory starting from \p devPtr. This function is a - * stream-ordered operation, meaning that it is dependent on, and will - * only take effect when, previous work in stream has completed. Any - * previous association is automatically replaced. - * - * \p devPtr must point to an address within managed memory space declared - * using the __managed__ keyword or allocated with ::cudaMallocManaged. - * - * \p length must be zero, to indicate that the entire allocation's - * stream association is being changed. Currently, it's not possible - * to change stream association for a portion of an allocation. The default - * value for \p length is zero. - * - * The stream association is specified using \p flags which must be - * one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle. - * The default value for \p flags is ::cudaMemAttachSingle - * If the ::cudaMemAttachGlobal flag is specified, the memory can be accessed - * by any stream on any device. - * If the ::cudaMemAttachHost flag is specified, the program makes a guarantee - * that it won't access the memory on the device from any stream on a device that - * has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess. - * If the ::cudaMemAttachSingle flag is specified and \p stream is associated with - * a device that has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess, - * the program makes a guarantee that it will only access the memory on the device - * from \p stream. It is illegal to attach singly to the NULL stream, because the - * NULL stream is a virtual global stream and not a specific stream. An error will - * be returned in this case. - * - * When memory is associated with a single stream, the Unified Memory system will - * allow CPU access to this memory region so long as all operations in \p stream - * have completed, regardless of whether other streams are active. In effect, - * this constrains exclusive ownership of the managed memory region by - * an active GPU to per-stream activity instead of whole-GPU activity. - * - * Accessing memory on the device from streams that are not associated with - * it will produce undefined results. No error checking is performed by the - * Unified Memory system to ensure that kernels launched into other streams - * do not access this region. - * - * It is a program's responsibility to order calls to ::cudaStreamAttachMemAsync - * via events, synchronization or other means to ensure legal access to memory - * at all times. 
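
Illustrative round trip through a managed allocation (kernel hypothetical): the same pointer is written on the CPU, optionally prefetched as the locality hint described above, then used on the GPU.

    __global__ void scale(float *p, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) p[i] *= 2.0f;
    }

    void managed_roundtrip(int n, int device, cudaStream_t stream) {
        float *p = nullptr;
        cudaMallocManaged(&p, n * sizeof(float));        // flags default to cudaMemAttachGlobal
        for (int i = 0; i < n; ++i) p[i] = 1.0f;         // valid CPU access
        cudaMemPrefetchAsync(p, n * sizeof(float), device, stream);  // optional hint
        scale<<<(n + 255) / 256, 256, 0, stream>>>(p, n);
        cudaStreamSynchronize(stream);                   // before touching p on the CPU again
        cudaFree(p);
    }
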
Data visibility and coherency will be changed appropriately - * for all kernels which follow a stream-association change. - * - * If \p stream is destroyed while data is associated with it, the association is - * removed and the association reverts to the default visibility of the allocation - * as specified at ::cudaMallocManaged. For __managed__ variables, the default - * association is always ::cudaMemAttachGlobal. Note that destroying a stream is an - * asynchronous operation, and as a result, the change to default association won't - * happen until all work in the stream has completed. - * - * \param stream - Stream in which to enqueue the attach operation - * \param devPtr - Pointer to memory (must be a pointer to managed memory) - * \param length - Length of memory (must be zero, defaults to zero) - * \param flags - Must be one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle (defaults to ::cudaMemAttachSingle) - * - * \return - * ::cudaSuccess, - * ::cudaErrorNotReady, - * ::cudaErrorInvalidValue - * ::cudaErrorInvalidResourceHandle - * \notefnerr - * - * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy, ::cudaMallocManaged - */ -template -static __inline__ __host__ cudaError_t cudaStreamAttachMemAsync( - cudaStream_t stream, - T *devPtr, - size_t length = 0, - unsigned int flags = cudaMemAttachSingle -) -{ - return ::cudaStreamAttachMemAsync(stream, (void*)devPtr, length, flags); -} - -template -static __inline__ __host__ cudaError_t cudaMalloc( - T **devPtr, - size_t size -) -{ - return ::cudaMalloc((void**)(void*)devPtr, size); -} - -template -static __inline__ __host__ cudaError_t cudaMallocHost( - T **ptr, - size_t size, - unsigned int flags = 0 -) -{ - return cudaMallocHost((void**)(void*)ptr, size, flags); -} - -template -static __inline__ __host__ cudaError_t cudaMallocPitch( - T **devPtr, - size_t *pitch, - size_t width, - size_t height -) -{ - return ::cudaMallocPitch((void**)(void*)devPtr, pitch, width, height); -} - -#if defined(__CUDACC__) - -/** - * \brief \hl Copies data to the given symbol on the device - * - * Copies \p count bytes from the memory area pointed to by \p src - * to the memory area \p offset bytes from the start of symbol - * \p symbol. The memory areas may not overlap. \p symbol is a variable that - * resides in global or constant memory space. \p kind can be either - * ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToDevice. 
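
A sketch of the single-stream attachment described above (names hypothetical): once the attach has taken effect, the CPU may touch the buffer whenever that one stream is idle, even while other streams keep the GPU busy.

    void attach_single(float *managed) {
        cudaStream_t s;
        cudaStreamCreate(&s);                       // must not be the NULL stream
        cudaStreamAttachMemAsync(s, managed);       // length = 0, flags = cudaMemAttachSingle
        cudaStreamSynchronize(s);                   // the attach is stream-ordered
        // Device code launched into `s` may use `managed`; once `s` drains,
        // CPU access is legal regardless of activity in other streams.
    }
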
- * - * \param symbol - Device symbol reference - * \param src - Source memory address - * \param count - Size in bytes to copy - * \param offset - Offset from start of symbol in bytes - * \param kind - Type of transfer - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidSymbol, - * ::cudaErrorInvalidMemcpyDirection, - * ::cudaErrorNoKernelImageForDevice - * \notefnerr - * \note_sync - * \note_string_api_deprecation - * - * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, - * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, - * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, - * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, - * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, - * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, - * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync - */ -template -static __inline__ __host__ cudaError_t cudaMemcpyToSymbol( - const T &symbol, - const void *src, - size_t count, - size_t offset = 0, - enum cudaMemcpyKind kind = cudaMemcpyHostToDevice -) -{ - return ::cudaMemcpyToSymbol((const void*)&symbol, src, count, offset, kind); -} - -/** - * \brief \hl Copies data to the given symbol on the device - * - * Copies \p count bytes from the memory area pointed to by \p src - * to the memory area \p offset bytes from the start of symbol - * \p symbol. The memory areas may not overlap. \p symbol is a variable that - * resides in global or constant memory space. \p kind can be either - * ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToDevice. - * - * ::cudaMemcpyToSymbolAsync() is asynchronous with respect to the host, so - * the call may return before the copy is complete. The copy can optionally - * be associated to a stream by passing a non-zero \p stream argument. If - * \p kind is ::cudaMemcpyHostToDevice and \p stream is non-zero, the copy - * may overlap with operations in other streams. - * - * \param symbol - Device symbol reference - * \param src - Source memory address - * \param count - Size in bytes to copy - * \param offset - Offset from start of symbol in bytes - * \param kind - Type of transfer - * \param stream - Stream identifier - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidSymbol, - * ::cudaErrorInvalidMemcpyDirection, - * ::cudaErrorNoKernelImageForDevice - * \notefnerr - * \note_async - * \note_string_api_deprecation - * - * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, - * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, - * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, - * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, - * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, - * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, - * ::cudaMemcpyFromSymbolAsync - */ -template -static __inline__ __host__ cudaError_t cudaMemcpyToSymbolAsync( - const T &symbol, - const void *src, - size_t count, - size_t offset = 0, - enum cudaMemcpyKind kind = cudaMemcpyHostToDevice, - cudaStream_t stream = 0 -) -{ - return ::cudaMemcpyToSymbolAsync((const void*)&symbol, src, count, offset, kind, stream); -} - -/** - * \brief \hl Copies data from the given symbol on the device - * - * Copies \p count bytes from the memory area \p offset bytes - * from the start of symbol \p symbol to the memory area pointed to by \p dst. - * The memory areas may not overlap. 
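
Illustrative use of the by-reference overloads above with a __constant__ table (names hypothetical); note the C++ form takes the symbol itself rather than a name string:

    __constant__ float coeffs[16];

    void upload_coeffs(const float (&host)[16]) {
        cudaMemcpyToSymbol(coeffs, host, sizeof(host));  // offset 0, host-to-device
        // Asynchronous variant, given some cudaStream_t stream:
        // cudaMemcpyToSymbolAsync(coeffs, host, sizeof(host), 0,
        //                         cudaMemcpyHostToDevice, stream);
    }
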
\p symbol is a variable that - * resides in global or constant memory space. \p kind can be either - * ::cudaMemcpyDeviceToHost or ::cudaMemcpyDeviceToDevice. - * - * \param dst - Destination memory address - * \param symbol - Device symbol reference - * \param count - Size in bytes to copy - * \param offset - Offset from start of symbol in bytes - * \param kind - Type of transfer - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidSymbol, - * ::cudaErrorInvalidMemcpyDirection, - * ::cudaErrorNoKernelImageForDevice - * \notefnerr - * \note_sync - * \note_string_api_deprecation - * - * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, - * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, - * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, - * ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, - * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, - * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, - * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync - */ -template -static __inline__ __host__ cudaError_t cudaMemcpyFromSymbol( - void *dst, - const T &symbol, - size_t count, - size_t offset = 0, - enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost -) -{ - return ::cudaMemcpyFromSymbol(dst, (const void*)&symbol, count, offset, kind); -} - -/** - * \brief \hl Copies data from the given symbol on the device - * - * Copies \p count bytes from the memory area \p offset bytes - * from the start of symbol \p symbol to the memory area pointed to by \p dst. - * The memory areas may not overlap. \p symbol is a variable that resides in - * global or constant memory space. \p kind can be either - * ::cudaMemcpyDeviceToHost or ::cudaMemcpyDeviceToDevice. - * - * ::cudaMemcpyFromSymbolAsync() is asynchronous with respect to the host, so - * the call may return before the copy is complete. The copy can optionally be - * associated to a stream by passing a non-zero \p stream argument. If \p kind - * is ::cudaMemcpyDeviceToHost and \p stream is non-zero, the copy may overlap - * with operations in other streams. - * - * \param dst - Destination memory address - * \param symbol - Device symbol reference - * \param count - Size in bytes to copy - * \param offset - Offset from start of symbol in bytes - * \param kind - Type of transfer - * \param stream - Stream identifier - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidSymbol, - * ::cudaErrorInvalidMemcpyDirection, - * ::cudaErrorNoKernelImageForDevice - * \notefnerr - * \note_async - * \note_string_api_deprecation - * - * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, - * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, - * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, - * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, - * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, - * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, - * ::cudaMemcpyToSymbolAsync - */ -template -static __inline__ __host__ cudaError_t cudaMemcpyFromSymbolAsync( - void *dst, - const T &symbol, - size_t count, - size_t offset = 0, - enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost, - cudaStream_t stream = 0 -) -{ - return ::cudaMemcpyFromSymbolAsync(dst, (const void*)&symbol, count, offset, kind, stream); -} - -/** - * \brief \hl Finds the address associated with a CUDA symbol - * - * Returns in \p *devPtr the address of symbol \p symbol on the device. 
- * \p symbol can either be a variable that resides in global or constant memory space. - * If \p symbol cannot be found, or if \p symbol is not declared - * in the global or constant memory space, \p *devPtr is unchanged and the error - * ::cudaErrorInvalidSymbol is returned. - * - * \param devPtr - Return device pointer associated with symbol - * \param symbol - Device symbol reference - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidSymbol, - * ::cudaErrorNoKernelImageForDevice - * \notefnerr - * - * \sa \ref ::cudaGetSymbolAddress(void**, const void*) "cudaGetSymbolAddress (C API)", - * \ref ::cudaGetSymbolSize(size_t*, const T&) "cudaGetSymbolSize (C++ API)" - */ -template -static __inline__ __host__ cudaError_t cudaGetSymbolAddress( - void **devPtr, - const T &symbol -) -{ - return ::cudaGetSymbolAddress(devPtr, (const void*)&symbol); -} - -/** - * \brief \hl Finds the size of the object associated with a CUDA symbol - * - * Returns in \p *size the size of symbol \p symbol. \p symbol must be a - * variable that resides in global or constant memory space. - * If \p symbol cannot be found, or if \p symbol is not declared - * in global or constant memory space, \p *size is unchanged and the error - * ::cudaErrorInvalidSymbol is returned. - * - * \param size - Size of object associated with symbol - * \param symbol - Device symbol reference - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidSymbol, - * ::cudaErrorNoKernelImageForDevice - * \notefnerr - * - * \sa \ref ::cudaGetSymbolAddress(void**, const T&) "cudaGetSymbolAddress (C++ API)", - * \ref ::cudaGetSymbolSize(size_t*, const void*) "cudaGetSymbolSize (C API)" - */ -template -static __inline__ __host__ cudaError_t cudaGetSymbolSize( - size_t *size, - const T &symbol -) -{ - return ::cudaGetSymbolSize(size, (const void*)&symbol); -} - -/** - * \brief \hl Binds a memory area to a texture - * - * Binds \p size bytes of the memory area pointed to by \p devPtr to texture - * reference \p tex. \p desc describes how the memory is interpreted when - * fetching values from the texture. The \p offset parameter is an optional - * byte offset as with the low-level - * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture()" - * function. Any memory previously bound to \p tex is unbound. 
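
A sketch pairing the two symbol queries above: fetch the raw device address and size of a __device__ array so it can be handed to pointer-based APIs (names hypothetical):

    __device__ float lut[256];

    void clear_lut(cudaStream_t stream) {
        void *dptr = nullptr;
        size_t sz = 0;
        cudaGetSymbolAddress(&dptr, lut);
        cudaGetSymbolSize(&sz, lut);
        cudaMemsetAsync(dptr, 0, sz, stream);   // raw-pointer API applied to the symbol
    }
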
- * - * \param offset - Offset in bytes - * \param tex - Texture to bind - * \param devPtr - Memory area on device - * \param desc - Channel format - * \param size - Size of the memory area pointed to by devPtr - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidTexture - * \notefnerr - * - * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", - * ::cudaGetChannelDesc, ::cudaGetTextureReference, - * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", - * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", - * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", - * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", - * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", - * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", - * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", - * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" - */ -template -static __inline__ __host__ cudaError_t cudaBindTexture( - size_t *offset, - const struct texture &tex, - const void *devPtr, - const struct cudaChannelFormatDesc &desc, - size_t size = UINT_MAX -) -{ - return ::cudaBindTexture(offset, &tex, devPtr, &desc, size); -} - -/** - * \brief \hl Binds a memory area to a texture - * - * Binds \p size bytes of the memory area pointed to by \p devPtr to texture - * reference \p tex. The channel descriptor is inherited from the texture - * reference type. The \p offset parameter is an optional byte offset as with - * the low-level - * ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) - * function. Any memory previously bound to \p tex is unbound. 
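
Illustrative use of the texture-reference binding above (this reference-based path was later superseded by texture objects); names hypothetical:

    texture<float, cudaTextureType1D, cudaReadModeElementType> texRef;

    __global__ void gather(float *out, int n) {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) out[i] = tex1Dfetch(texRef, i);
    }

    void bind_linear(const float *d_in, size_t n) {
        size_t offset = 0;
        cudaBindTexture(&offset, texRef, d_in,
                        cudaCreateChannelDesc<float>(), n * sizeof(float));
        // offset is guaranteed 0 for cudaMalloc'ed pointers (see the 2D variant below)
    }
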
- * - * \param offset - Offset in bytes - * \param tex - Texture to bind - * \param devPtr - Memory area on device - * \param size - Size of the memory area pointed to by devPtr - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidTexture - * \notefnerr - * - * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", - * ::cudaGetChannelDesc, ::cudaGetTextureReference, - * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", - * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", - * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", - * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", - * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", - * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", - * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", - * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" - */ -template -static __inline__ __host__ cudaError_t cudaBindTexture( - size_t *offset, - const struct texture &tex, - const void *devPtr, - size_t size = UINT_MAX -) -{ - return cudaBindTexture(offset, tex, devPtr, tex.channelDesc, size); -} - -/** - * \brief \hl Binds a 2D memory area to a texture - * - * Binds the 2D memory area pointed to by \p devPtr to the - * texture reference \p tex. The size of the area is constrained by - * \p width in texel units, \p height in texel units, and \p pitch in byte - * units. \p desc describes how the memory is interpreted when fetching values - * from the texture. Any memory previously bound to \p tex is unbound. - * - * Since the hardware enforces an alignment requirement on texture base - * addresses, - * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D()" - * returns in \p *offset a byte offset that - * must be applied to texture fetches in order to read from the desired memory. - * This offset must be divided by the texel size and passed to kernels that - * read from the texture so they can be applied to the ::tex2D() function. - * If the device memory pointer was returned from ::cudaMalloc(), the offset is - * guaranteed to be 0 and NULL may be passed as the \p offset parameter. 
- * - * \param offset - Offset in bytes - * \param tex - Texture reference to bind - * \param devPtr - 2D memory area on device - * \param desc - Channel format - * \param width - Width in texel units - * \param height - Height in texel units - * \param pitch - Pitch in bytes - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidTexture - * \notefnerr - * - * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", - * ::cudaGetChannelDesc, ::cudaGetTextureReference, - * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", - * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", - * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", - * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", - * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", - * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", - * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", - * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" - */ -template -static __inline__ __host__ cudaError_t cudaBindTexture2D( - size_t *offset, - const struct texture &tex, - const void *devPtr, - const struct cudaChannelFormatDesc &desc, - size_t width, - size_t height, - size_t pitch -) -{ - return ::cudaBindTexture2D(offset, &tex, devPtr, &desc, width, height, pitch); -} - -/** - * \brief \hl Binds a 2D memory area to a texture - * - * Binds the 2D memory area pointed to by \p devPtr to the - * texture reference \p tex. The size of the area is constrained by - * \p width in texel units, \p height in texel units, and \p pitch in byte - * units. The channel descriptor is inherited from the texture reference - * type. Any memory previously bound to \p tex is unbound. - * - * Since the hardware enforces an alignment requirement on texture base - * addresses, - * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D()" - * returns in \p *offset a byte offset that - * must be applied to texture fetches in order to read from the desired memory. - * This offset must be divided by the texel size and passed to kernels that - * read from the texture so they can be applied to the ::tex2D() function. - * If the device memory pointer was returned from ::cudaMalloc(), the offset is - * guaranteed to be 0 and NULL may be passed as the \p offset parameter. 
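
The same pattern for the 2D overload above, backed by a pitched allocation (sizes hypothetical); here the returned byte offset is the value that would be folded into x-coordinates as described:

    texture<float, cudaTextureType2D, cudaReadModeElementType> imgTex;

    void bind_pitched(size_t width, size_t height) {
        float *d_img = nullptr;
        size_t pitch = 0;
        cudaMallocPitch(&d_img, &pitch, width * sizeof(float), height);
        size_t offset = 0;
        cudaBindTexture2D(&offset, imgTex, d_img,
                          cudaCreateChannelDesc<float>(), width, height, pitch);
        // offset will typically be 0 for a suitably aligned allocation like this one
    }
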
- * - * \param offset - Offset in bytes - * \param tex - Texture reference to bind - * \param devPtr - 2D memory area on device - * \param width - Width in texel units - * \param height - Height in texel units - * \param pitch - Pitch in bytes - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidTexture - * \notefnerr - * - * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", - * ::cudaGetChannelDesc, ::cudaGetTextureReference, - * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", - * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", - * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", - * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", - * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", - * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", - * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", - * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" - */ -template -static __inline__ __host__ cudaError_t cudaBindTexture2D( - size_t *offset, - const struct texture &tex, - const void *devPtr, - size_t width, - size_t height, - size_t pitch -) -{ - return ::cudaBindTexture2D(offset, &tex, devPtr, &tex.channelDesc, width, height, pitch); -} - -/** - * \brief \hl Binds an array to a texture - * - * Binds the CUDA array \p array to the texture reference \p tex. - * \p desc describes how the memory is interpreted when fetching values from - * the texture. Any CUDA array previously bound to \p tex is unbound. 
- * - * \param tex - Texture to bind - * \param array - Memory array on device - * \param desc - Channel format - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidTexture - * \notefnerr - * - * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", - * ::cudaGetChannelDesc, ::cudaGetTextureReference, - * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", - * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", - * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", - * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", - * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", - * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", - * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", - * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" - */ -template -static __inline__ __host__ cudaError_t cudaBindTextureToArray( - const struct texture &tex, - cudaArray_const_t array, - const struct cudaChannelFormatDesc &desc -) -{ - return ::cudaBindTextureToArray(&tex, array, &desc); -} - -/** - * \brief \hl Binds an array to a texture - * - * Binds the CUDA array \p array to the texture reference \p tex. - * The channel descriptor is inherited from the CUDA array. Any CUDA array - * previously bound to \p tex is unbound. 
- * - * \param tex - Texture to bind - * \param array - Memory array on device - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidTexture - * \notefnerr - * - * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", - * ::cudaGetChannelDesc, ::cudaGetTextureReference, - * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", - * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", - * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", - * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", - * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", - * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", - * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", - * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" - */ -template -static __inline__ __host__ cudaError_t cudaBindTextureToArray( - const struct texture &tex, - cudaArray_const_t array -) -{ - struct cudaChannelFormatDesc desc; - cudaError_t err = ::cudaGetChannelDesc(&desc, array); - - return err == cudaSuccess ? cudaBindTextureToArray(tex, array, desc) : err; -} - -/** - * \brief \hl Binds a mipmapped array to a texture - * - * Binds the CUDA mipmapped array \p mipmappedArray to the texture reference \p tex. - * \p desc describes how the memory is interpreted when fetching values from - * the texture. Any CUDA mipmapped array previously bound to \p tex is unbound. 
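
Illustrative use of the array overloads above: upload into a CUDA array, then bind with the channel descriptor inherited from the array itself (names hypothetical):

    texture<float4, cudaTextureType2D, cudaReadModeElementType> arrTex;

    void bind_array(const float4 *h_src, size_t w, size_t h) {
        cudaChannelFormatDesc desc = cudaCreateChannelDesc<float4>();
        cudaArray_t arr = nullptr;
        cudaMallocArray(&arr, &desc, w, h);
        cudaMemcpy2DToArray(arr, 0, 0, h_src, w * sizeof(float4),
                            w * sizeof(float4), h, cudaMemcpyHostToDevice);
        cudaBindTextureToArray(arrTex, arr);   // desc re-queried via cudaGetChannelDesc
    }
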
- * - * \param tex - Texture to bind - * \param mipmappedArray - Memory mipmapped array on device - * \param desc - Channel format - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidTexture - * \notefnerr - * - * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", - * ::cudaGetChannelDesc, ::cudaGetTextureReference, - * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", - * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", - * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", - * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", - * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", - * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", - * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", - * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" - */ -template -static __inline__ __host__ cudaError_t cudaBindTextureToMipmappedArray( - const struct texture &tex, - cudaMipmappedArray_const_t mipmappedArray, - const struct cudaChannelFormatDesc &desc -) -{ - return ::cudaBindTextureToMipmappedArray(&tex, mipmappedArray, &desc); -} - -/** - * \brief \hl Binds a mipmapped array to a texture - * - * Binds the CUDA mipmapped array \p mipmappedArray to the texture reference \p tex. - * The channel descriptor is inherited from the CUDA array. Any CUDA mipmapped array - * previously bound to \p tex is unbound. 
- * - * \param tex - Texture to bind - * \param mipmappedArray - Memory mipmapped array on device - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidTexture - * \notefnerr - * - * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", - * ::cudaGetChannelDesc, ::cudaGetTextureReference, - * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", - * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", - * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", - * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", - * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", - * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", - * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", - * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" - */ -template -static __inline__ __host__ cudaError_t cudaBindTextureToMipmappedArray( - const struct texture &tex, - cudaMipmappedArray_const_t mipmappedArray -) -{ - struct cudaChannelFormatDesc desc; - cudaArray_t levelArray; - cudaError_t err = ::cudaGetMipmappedArrayLevel(&levelArray, mipmappedArray, 0); - - if (err != cudaSuccess) { - return err; - } - err = ::cudaGetChannelDesc(&desc, levelArray); - - return err == cudaSuccess ? cudaBindTextureToMipmappedArray(tex, mipmappedArray, desc) : err; -} - -/** - * \brief \hl Unbinds a texture - * - * Unbinds the texture bound to \p tex. 
- * - * \param tex - Texture to unbind - * - * \return ::cudaSuccess - * \notefnerr - * - * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", - * ::cudaGetChannelDesc, ::cudaGetTextureReference, - * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", - * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", - * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", - * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", - * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", - * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", - * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)", - * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture&) "cudaGetTextureAlignmentOffset (C++ API)" - */ -template -static __inline__ __host__ cudaError_t cudaUnbindTexture( - const struct texture &tex -) -{ - return ::cudaUnbindTexture(&tex); -} - -/** - * \brief \hl Get the alignment offset of a texture - * - * Returns in \p *offset the offset that was returned when texture reference - * \p tex was bound. - * - * \param offset - Offset of texture reference in bytes - * \param tex - Texture to get offset of - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidTexture, - * ::cudaErrorInvalidTextureBinding - * \notefnerr - * - * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", - * ::cudaGetChannelDesc, ::cudaGetTextureReference, - * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", - * \ref ::cudaBindTexture(size_t*, const struct texture&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", - * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", - * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", - * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", - * \ref ::cudaBindTextureToArray(const struct texture&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", - * \ref ::cudaUnbindTexture(const struct texture&) "cudaUnbindTexture (C++ API)", - * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)" - */ -template -static __inline__ __host__ cudaError_t cudaGetTextureAlignmentOffset( - size_t *offset, - const struct texture &tex -) -{ - return ::cudaGetTextureAlignmentOffset(offset, &tex); -} - -/** - * \brief \hl Sets the preferred cache configuration for a device function - * - * On devices where the L1 cache and shared memory use the same hardware - * resources, this sets through \p cacheConfig the preferred cache configuration - * for the 
function specified via \p func. This is only a preference. The - * runtime will use the requested configuration if possible, but it is free to - * choose a different configuration if required to execute \p func. - * - * \p func must be a pointer to a function that executes on the device. - * The parameter specified by \p func must be declared as a \p __global__ - * function. If the specified function does not exist, - * then ::cudaErrorInvalidDeviceFunction is returned. - * - * This setting does nothing on devices where the size of the L1 cache and - * shared memory are fixed. - * - * Launching a kernel with a different preference than the most recent - * preference setting may insert a device-side synchronization point. - * - * The supported cache configurations are: - * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default) - * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache - * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory - * - * \param func - device function pointer - * \param cacheConfig - Requested cache configuration - * - * \return - * ::cudaSuccess, - * ::cudaErrorInitializationError, - * ::cudaErrorInvalidDeviceFunction - * \notefnerr - * - * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)", - * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)", - * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)", - * ::cudaSetDoubleForDevice, - * ::cudaSetDoubleForHost, - * \ref ::cudaSetupArgument(T, size_t) "cudaSetupArgument (C++ API)", - * ::cudaThreadGetCacheConfig, - * ::cudaThreadSetCacheConfig - */ -template -static __inline__ __host__ cudaError_t cudaFuncSetCacheConfig( - T *func, - enum cudaFuncCache cacheConfig -) -{ - return ::cudaFuncSetCacheConfig((const void*)func, cacheConfig); -} - -template -static __inline__ __host__ cudaError_t cudaFuncSetSharedMemConfig( - T *func, - enum cudaSharedMemConfig config -) -{ - return ::cudaFuncSetSharedMemConfig((const void*)func, config); -} - -/** - * \brief Returns occupancy for a device function - * - * Returns in \p *numBlocks the maximum number of active blocks per - * streaming multiprocessor for the device function. 
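A usage sketch for the two per-function configuration wrappers defined above; `myKernel` is a placeholder kernel, and both calls are hints that the runtime may override:

```cuda
#include <cuda_runtime.h>

__global__ void myKernel(float *data) { data[threadIdx.x] *= 2.0f; }

void configureKernel()
{
    // Cache preference is only a hint; the runtime may choose differently.
    cudaFuncSetCacheConfig(myKernel, cudaFuncCachePreferShared);
    // Per-function bank-size preference, overriding the device-wide setting.
    cudaFuncSetSharedMemConfig(myKernel, cudaSharedMemBankSizeEightByte);
}
```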
- *
- * \param numBlocks - Returned occupancy
- * \param func - Kernel function for which occupancy is calculated
- * \param blockSize - Block size the kernel is intended to be launched with
- * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorCudartUnloading,
- * ::cudaErrorInitializationError,
- * ::cudaErrorInvalidDevice,
- * ::cudaErrorInvalidDeviceFunction,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorUnknown,
- * \notefnerr
- *
- * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
- * \sa ::cudaOccupancyMaxPotentialBlockSize
- * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
- * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
- * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
- */
-template<class T>
-static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-    int *numBlocks,
-    T func,
-    int blockSize,
-    size_t dynamicSMemSize)
-{
-    return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, (const void*)func, blockSize, dynamicSMemSize, cudaOccupancyDefault);
-}
-
-/**
- * \brief Returns occupancy for a device function with the specified flags
- *
- * Returns in \p *numBlocks the maximum number of active blocks per
- * streaming multiprocessor for the device function.
- *
- * The \p flags parameter controls how special cases are handled. Valid flags include:
- *
- * - ::cudaOccupancyDefault: keeps the default behavior as
- *   ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
- *
- * - ::cudaOccupancyDisableCachingOverride: suppresses the default behavior
- *   on platforms where global caching affects occupancy. On such platforms, if caching
- *   is enabled, but per-block SM resource usage would result in zero occupancy, the
- *   occupancy calculator will calculate the occupancy as if caching is disabled.
- *   Setting this flag makes the occupancy calculator return 0 in such cases.
- *   More information can be found about this feature in the "Unified L1/Texture Cache"
- *   section of the Maxwell tuning guide.
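For illustration, a minimal host-side sketch (assuming a hypothetical `myKernel`) that queries occupancy with the default-flags wrapper above and converts the result into a theoretical occupancy fraction:

```cuda
#include <cuda_runtime.h>

__global__ void myKernel(const float *in, float *out) { out[threadIdx.x] = in[threadIdx.x]; }

void reportOccupancy()
{
    int numBlocks = 0;
    const int blockSize = 256;   // assumed launch configuration, no dynamic shared memory

    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, myKernel, blockSize, 0);

    int device = 0, maxThreadsPerSM = 0;
    cudaGetDevice(&device);
    cudaDeviceGetAttribute(&maxThreadsPerSM, cudaDevAttrMaxThreadsPerMultiProcessor, device);

    // Theoretical occupancy: fraction of the SM's thread capacity that is active.
    double occupancy = double(numBlocks * blockSize) / maxThreadsPerSM;
    (void)occupancy;
}
```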
- *
- * \param numBlocks - Returned occupancy
- * \param func - Kernel function for which occupancy is calculated
- * \param blockSize - Block size the kernel is intended to be launched with
- * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
- * \param flags - Requested behavior for the occupancy calculator
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorCudartUnloading,
- * ::cudaErrorInitializationError,
- * ::cudaErrorInvalidDevice,
- * ::cudaErrorInvalidDeviceFunction,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorUnknown,
- * \notefnerr
- *
- * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
- * \sa ::cudaOccupancyMaxPotentialBlockSize
- * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
- * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
- * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
- */
-template<class T>
-static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
-    int *numBlocks,
-    T func,
-    int blockSize,
-    size_t dynamicSMemSize,
-    unsigned int flags)
-{
-    return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, (const void*)func, blockSize, dynamicSMemSize, flags);
-}
-
-/**
- * Helper functor for cudaOccupancyMaxPotentialBlockSize
- */
-class __cudaOccupancyB2DHelper {
-  size_t n;
-public:
-  inline __host__ CUDART_DEVICE __cudaOccupancyB2DHelper(size_t n_) : n(n_) {}
-  inline __host__ CUDART_DEVICE size_t operator()(int)
-  {
-      return n;
-  }
-};
-
-/**
- * \brief Returns grid and block size that achieves maximum potential occupancy for a device function
- *
- * Returns in \p *minGridSize and \p *blockSize a suggested grid /
- * block size pair that achieves the best potential occupancy
- * (i.e. the maximum number of active warps with the smallest number
- * of blocks).
- *
- * The \p flags parameter controls how special cases are handled. Valid flags include:
- *
- * - ::cudaOccupancyDefault: keeps the default behavior as
- *   ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
- *
- * - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior
- *   on platforms where global caching affects occupancy. On such platforms, if caching
- *   is enabled, but per-block SM resource usage would result in zero occupancy, the
- *   occupancy calculator will calculate the occupancy as if caching is disabled.
- *   Setting this flag makes the occupancy calculator return 0 in such cases.
- *   More information can be found about this feature in the "Unified L1/Texture Cache"
- *   section of the Maxwell tuning guide.
- *
- * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy
- * \param blockSize - Returned block size
- * \param func - Device function symbol
- * \param blockSizeToDynamicSMemSize - A unary function / functor that takes block size, and returns the size, in bytes, of dynamic shared memory needed for a block
- * \param blockSizeLimit - The maximum block size \p func is designed to work with. 0 means no limit.
- * \param flags - Requested behavior for the occupancy calculator - * - * \return - * ::cudaSuccess, - * ::cudaErrorCudartUnloading, - * ::cudaErrorInitializationError, - * ::cudaErrorInvalidDevice, - * ::cudaErrorInvalidDeviceFunction, - * ::cudaErrorInvalidValue, - * ::cudaErrorUnknown, - * \notefnerr - * - * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem - * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor - * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags - * \sa ::cudaOccupancyMaxPotentialBlockSize - * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags - */ - -template -static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags( - int *minGridSize, - int *blockSize, - T func, - UnaryFunction blockSizeToDynamicSMemSize, - int blockSizeLimit = 0, - unsigned int flags = 0) -{ - cudaError_t status; - - // Device and function properties - int device; - struct cudaFuncAttributes attr; - - // Limits - int maxThreadsPerMultiProcessor; - int warpSize; - int devMaxThreadsPerBlock; - int multiProcessorCount; - int funcMaxThreadsPerBlock; - int occupancyLimit; - int granularity; - - // Recorded maximum - int maxBlockSize = 0; - int numBlocks = 0; - int maxOccupancy = 0; - - // Temporary - int blockSizeToTryAligned; - int blockSizeToTry; - int blockSizeLimitAligned; - int occupancyInBlocks; - int occupancyInThreads; - size_t dynamicSMemSize; - - /////////////////////////// - // Check user input - /////////////////////////// - - if (!minGridSize || !blockSize || !func) { - return cudaErrorInvalidValue; - } - - ////////////////////////////////////////////// - // Obtain device and function properties - ////////////////////////////////////////////// - - status = ::cudaGetDevice(&device); - if (status != cudaSuccess) { - return status; - } - - status = cudaDeviceGetAttribute( - &maxThreadsPerMultiProcessor, - cudaDevAttrMaxThreadsPerMultiProcessor, - device); - if (status != cudaSuccess) { - return status; - } - - status = cudaDeviceGetAttribute( - &warpSize, - cudaDevAttrWarpSize, - device); - if (status != cudaSuccess) { - return status; - } - - status = cudaDeviceGetAttribute( - &devMaxThreadsPerBlock, - cudaDevAttrMaxThreadsPerBlock, - device); - if (status != cudaSuccess) { - return status; - } - - status = cudaDeviceGetAttribute( - &multiProcessorCount, - cudaDevAttrMultiProcessorCount, - device); - if (status != cudaSuccess) { - return status; - } - - status = cudaFuncGetAttributes(&attr, func); - if (status != cudaSuccess) { - return status; - } - - funcMaxThreadsPerBlock = attr.maxThreadsPerBlock; - - ///////////////////////////////////////////////////////////////////////////////// - // Try each block size, and pick the block size with maximum occupancy - ///////////////////////////////////////////////////////////////////////////////// - - occupancyLimit = maxThreadsPerMultiProcessor; - granularity = warpSize; - - if (blockSizeLimit == 0) { - blockSizeLimit = devMaxThreadsPerBlock; - } - - if (devMaxThreadsPerBlock < blockSizeLimit) { - blockSizeLimit = devMaxThreadsPerBlock; - } - - if (funcMaxThreadsPerBlock < blockSizeLimit) { - blockSizeLimit = funcMaxThreadsPerBlock; - } - - blockSizeLimitAligned = ((blockSizeLimit + (granularity - 1)) / granularity) * granularity; - - for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) { - // This is needed for the first iteration, because - // blockSizeLimitAligned could be greater than blockSizeLimit - // - if 
(blockSizeLimit < blockSizeToTryAligned) {
-            blockSizeToTry = blockSizeLimit;
-        } else {
-            blockSizeToTry = blockSizeToTryAligned;
-        }
-
-        dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);
-
-        status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
-            &occupancyInBlocks,
-            func,
-            blockSizeToTry,
-            dynamicSMemSize,
-            flags);
-
-        if (status != cudaSuccess) {
-            return status;
-        }
-
-        occupancyInThreads = blockSizeToTry * occupancyInBlocks;
-
-        if (occupancyInThreads > maxOccupancy) {
-            maxBlockSize = blockSizeToTry;
-            numBlocks = occupancyInBlocks;
-            maxOccupancy = occupancyInThreads;
-        }
-
-        // Early out if we have reached the maximum
-        //
-        if (occupancyLimit == maxOccupancy) {
-            break;
-        }
-    }
-
-    ///////////////////////////
-    // Return best available
-    ///////////////////////////
-
-    // Suggested min grid size to achieve a full machine launch
-    //
-    *minGridSize = numBlocks * multiProcessorCount;
-    *blockSize = maxBlockSize;
-
-    return status;
-}
-
-/**
- * \brief Returns grid and block size that achieves maximum potential occupancy for a device function
- *
- * Returns in \p *minGridSize and \p *blockSize a suggested grid /
- * block size pair that achieves the best potential occupancy
- * (i.e. the maximum number of active warps with the smallest number
- * of blocks).
- *
- * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy
- * \param blockSize - Returned block size
- * \param func - Device function symbol
- * \param blockSizeToDynamicSMemSize - A unary function / functor that takes block size, and returns the size, in bytes, of dynamic shared memory needed for a block
- * \param blockSizeLimit - The maximum block size \p func is designed to work with. 0 means no limit.
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorCudartUnloading,
- * ::cudaErrorInitializationError,
- * ::cudaErrorInvalidDevice,
- * ::cudaErrorInvalidDeviceFunction,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorUnknown,
- * \notefnerr
- *
- * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
- * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
- * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
- * \sa ::cudaOccupancyMaxPotentialBlockSize
- * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
- */
-
-template<typename UnaryFunction, class T>
-static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMem(
-    int *minGridSize,
-    int *blockSize,
-    T func,
-    UnaryFunction blockSizeToDynamicSMemSize,
-    int blockSizeLimit = 0)
-{
-    return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, blockSizeToDynamicSMemSize, blockSizeLimit, cudaOccupancyDefault);
-}
-
-/**
- * \brief Returns grid and block size that achieves maximum potential occupancy for a device function
- *
- * Returns in \p *minGridSize and \p *blockSize a suggested grid /
- * block size pair that achieves the best potential occupancy
- * (i.e. the maximum number of active warps with the smallest number
- * of blocks).
- *
- * Use ::cudaOccupancyMaxPotentialBlockSizeVariableSMem if the
- * amount of per-block dynamic shared memory changes with different
- * block sizes.
- *
- * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy
- * \param blockSize - Returned block size
- * \param func - Device function symbol
- * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
- * \param blockSizeLimit - The maximum block size \p func is designed to work with. 0 means no limit.
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorCudartUnloading,
- * ::cudaErrorInitializationError,
- * ::cudaErrorInvalidDevice,
- * ::cudaErrorInvalidDeviceFunction,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorUnknown,
- * \notefnerr
- *
- * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
- * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
- * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
- * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
- * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
- */
-template<class T>
-static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSize(
-    int *minGridSize,
-    int *blockSize,
-    T func,
-    size_t dynamicSMemSize = 0,
-    int blockSizeLimit = 0)
-{
-    return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit, cudaOccupancyDefault);
-}
-
-/**
- * \brief Returns grid and block size that achieves maximum potential occupancy for a device function with the specified flags
- *
- * Returns in \p *minGridSize and \p *blockSize a suggested grid /
- * block size pair that achieves the best potential occupancy
- * (i.e. the maximum number of active warps with the smallest number
- * of blocks).
- *
- * The \p flags parameter controls how special cases are handled. Valid flags include:
- *
- * - ::cudaOccupancyDefault: keeps the default behavior as
- *   ::cudaOccupancyMaxPotentialBlockSize
- *
- * - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior
- *   on platforms where global caching affects occupancy. On such platforms, if caching
- *   is enabled, but per-block SM resource usage would result in zero occupancy, the
- *   occupancy calculator will calculate the occupancy as if caching is disabled.
- *   Setting this flag makes the occupancy calculator return 0 in such cases.
- *   More information can be found about this feature in the "Unified L1/Texture Cache"
- *   section of the Maxwell tuning guide.
- *
- * Use ::cudaOccupancyMaxPotentialBlockSizeVariableSMem if the
- * amount of per-block dynamic shared memory changes with different
- * block sizes.
- *
- * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy
- * \param blockSize - Returned block size
- * \param func - Device function symbol
- * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
- * \param blockSizeLimit - The maximum block size \p func is designed to work with. 0 means no limit.
- * \param flags - Requested behavior for the occupancy calculator - * - * \return - * ::cudaSuccess, - * ::cudaErrorCudartUnloading, - * ::cudaErrorInitializationError, - * ::cudaErrorInvalidDevice, - * ::cudaErrorInvalidDeviceFunction, - * ::cudaErrorInvalidValue, - * ::cudaErrorUnknown, - * \notefnerr - * - * \sa ::cudaOccupancyMaxPotentialBlockSize - * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor - * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags - * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem - * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags - */ -template -static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeWithFlags( - int *minGridSize, - int *blockSize, - T func, - size_t dynamicSMemSize = 0, - int blockSizeLimit = 0, - unsigned int flags = 0) -{ - return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit, flags); -} - -/** - * \brief \hl Launches a device function - * - * \deprecated This function is deprecated as of CUDA 7.0 - * - * Launches the function \p func on the device. The parameter \p func must - * be a function that executes on the device. The parameter specified by \p func - * must be declared as a \p __global__ function. - * \ref ::cudaLaunch(T*) "cudaLaunch()" must be preceded by a call to - * ::cudaConfigureCall() since it pops the data that was pushed by - * ::cudaConfigureCall() from the execution stack. - * - * \param func - Device function pointer - * to execute - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidDeviceFunction, - * ::cudaErrorInvalidConfiguration, - * ::cudaErrorLaunchFailure, - * ::cudaErrorLaunchTimeout, - * ::cudaErrorLaunchOutOfResources, - * ::cudaErrorSharedObjectSymbolNotFound, - * ::cudaErrorSharedObjectInitFailed, - * ::cudaErrorInvalidPtx, - * ::cudaErrorNoKernelImageForDevice, - * ::cudaErrorJitCompilerNotFound - * \notefnerr - * - * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)", - * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)", - * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)", - * \ref ::cudaLaunch(const void*) "cudaLaunch (C API)", - * ::cudaSetDoubleForDevice, - * ::cudaSetDoubleForHost, - * \ref ::cudaSetupArgument(T, size_t) "cudaSetupArgument (C++ API)", - * ::cudaThreadGetCacheConfig, - * ::cudaThreadSetCacheConfig - */ -template -static __inline__ __host__ cudaError_t cudaLaunch( - T *func -) -{ - return ::cudaLaunch((const void*)func); -} - -/** - * \brief \hl Find out attributes for a given function - * - * This function obtains the attributes of a function specified via \p entry. - * The parameter \p entry must be a pointer to a function that executes - * on the device. The parameter specified by \p entry must be declared as a \p __global__ - * function. The fetched attributes are placed in \p attr. If the specified - * function does not exist, then ::cudaErrorInvalidDeviceFunction is returned. - * - * Note that some function attributes such as - * \ref ::cudaFuncAttributes::maxThreadsPerBlock "maxThreadsPerBlock" - * may vary based on the device that is currently being used. 
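A small sketch of the intended usage pattern for the block-size suggestion helpers above; `scaleKernel` and its launch are illustrative placeholders:

```cuda
#include <cuda_runtime.h>

__global__ void scaleKernel(int n, float *data)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] *= 2.0f;
}

void launchWithSuggestedConfig(int n, float *dData)
{
    int minGridSize = 0, blockSize = 0;
    // Defaults: no dynamic shared memory, no block-size limit.
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, scaleKernel);

    int gridSize = (n + blockSize - 1) / blockSize;  // round up to cover every element
    scaleKernel<<<gridSize, blockSize>>>(n, dData);
}
```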
- *
- * \param attr - Return pointer to function's attributes
- * \param entry - Function to get attributes of
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInitializationError,
- * ::cudaErrorInvalidDeviceFunction
- * \notefnerr
- *
- * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)",
- * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
- * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
- * ::cudaSetDoubleForDevice,
- * ::cudaSetDoubleForHost,
- * \ref ::cudaSetupArgument(T, size_t) "cudaSetupArgument (C++ API)"
- */
-template<class T>
-static __inline__ __host__ cudaError_t cudaFuncGetAttributes(
-    struct cudaFuncAttributes *attr,
-    T *entry
-)
-{
-    return ::cudaFuncGetAttributes(attr, (const void*)entry);
-}
-
-/**
- * \brief \hl Set attributes for a given function
- *
- * This function sets the attributes of a function specified via \p entry.
- * The parameter \p entry must be a pointer to a function that executes
- * on the device. The parameter specified by \p entry must be declared as a \p __global__
- * function. The enumeration defined by \p attr is set to the value defined by \p value.
- * If the specified function does not exist, then ::cudaErrorInvalidDeviceFunction is returned.
- * If the specified attribute cannot be written, or if the value is incorrect,
- * then ::cudaErrorInvalidValue is returned.
- *
- * Valid values for \p attr are:
- * - ::cudaFuncAttributeMaxDynamicSharedMemorySize - Maximum size of dynamic shared memory per block
- * - ::cudaFuncAttributePreferredSharedMemoryCarveout - Preferred shared memory-L1 cache split ratio in percent of maximum shared memory.
- *
- * \param entry - Function to set attributes of
- * \param attr - Attribute to set
- * \param value - Value to set
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInitializationError,
- * ::cudaErrorInvalidDeviceFunction,
- * ::cudaErrorInvalidValue
- * \notefnerr
- *
- * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)",
- * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
- * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
- * ::cudaSetDoubleForDevice,
- * ::cudaSetDoubleForHost,
- * \ref ::cudaSetupArgument(T, size_t) "cudaSetupArgument (C++ API)"
- */
-template<class T>
-static __inline__ __host__ cudaError_t cudaFuncSetAttribute(
-    T *entry,
-    enum cudaFuncAttribute attr,
-    int value
-)
-{
-    return ::cudaFuncSetAttribute((const void*)entry, attr, value);
-}
-
-/**
- * \brief \hl Binds an array to a surface
- *
- * Binds the CUDA array \p array to the surface reference \p surf.
- * \p desc describes how the memory is interpreted when dealing with
- * the surface. Any CUDA array previously bound to \p surf is unbound.
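To show how the attribute getter and setter above fit together, a sketch with a hypothetical kernel; the 64 KB budget is an arbitrary example value and is honored only on devices that support opting in beyond the default limit:

```cuda
#include <cuda_runtime.h>

__global__ void reduceKernel(float *data)
{
    extern __shared__ float smem[];  // dynamic shared memory
    smem[threadIdx.x] = data[threadIdx.x];
}

void raiseSharedMemLimit()
{
    cudaFuncAttributes attr;
    cudaFuncGetAttributes(&attr, reduceKernel);  // e.g. inspect attr.maxThreadsPerBlock

    // Hypothetical budget: allow 64 KB of dynamic shared memory per block.
    cudaFuncSetAttribute(reduceKernel, cudaFuncAttributeMaxDynamicSharedMemorySize, 64 * 1024);
}
```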
- * - * \param surf - Surface to bind - * \param array - Memory array on device - * \param desc - Channel format - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidSurface - * \notefnerr - * - * \sa \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)", - * \ref ::cudaBindSurfaceToArray(const struct surface&, cudaArray_const_t) "cudaBindSurfaceToArray (C++ API, inherited channel descriptor)" - */ -template -static __inline__ __host__ cudaError_t cudaBindSurfaceToArray( - const struct surface &surf, - cudaArray_const_t array, - const struct cudaChannelFormatDesc &desc -) -{ - return ::cudaBindSurfaceToArray(&surf, array, &desc); -} - -/** - * \brief \hl Binds an array to a surface - * - * Binds the CUDA array \p array to the surface reference \p surf. - * The channel descriptor is inherited from the CUDA array. Any CUDA array - * previously bound to \p surf is unbound. - * - * \param surf - Surface to bind - * \param array - Memory array on device - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidSurface - * \notefnerr - * - * \sa \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)", - * \ref ::cudaBindSurfaceToArray(const struct surface&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindSurfaceToArray (C++ API)" - */ -template -static __inline__ __host__ cudaError_t cudaBindSurfaceToArray( - const struct surface &surf, - cudaArray_const_t array -) -{ - struct cudaChannelFormatDesc desc; - cudaError_t err = ::cudaGetChannelDesc(&desc, array); - - return err == cudaSuccess ? cudaBindSurfaceToArray(surf, array, desc) : err; -} - -#endif /* __CUDACC__ */ - -/** @} */ /* END CUDART_HIGHLEVEL */ - -#endif /* __cplusplus && !__CUDACC_RTC__ */ - -#if !defined(__CUDACC_RTC__) -#if defined(__GNUC__) -#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))) -#pragma GCC diagnostic pop -#endif -#elif defined(_MSC_VER) -#pragma warning(pop) -#endif -#endif - -#endif /* !__CUDA_RUNTIME_H__ */ diff --git a/include/triton/external/CUDA/cuda_runtime_api.h b/include/triton/external/CUDA/cuda_runtime_api.h deleted file mode 100755 index 4f2997cdd..000000000 --- a/include/triton/external/CUDA/cuda_runtime_api.h +++ /dev/null @@ -1,7422 +0,0 @@ -/* - * Copyright 1993-2017 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. 
IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. - */ - -#if !defined(__CUDA_RUNTIME_API_H__) -#define __CUDA_RUNTIME_API_H__ - -/** - * \latexonly - * \page sync_async API synchronization behavior - * - * \section memcpy_sync_async_behavior Memcpy - * The API provides memcpy/memset functions in both synchronous and asynchronous forms, - * the latter having an \e "Async" suffix. This is a misnomer as each function - * may exhibit synchronous or asynchronous behavior depending on the arguments - * passed to the function. In the reference documentation, each memcpy function is - * categorized as \e synchronous or \e asynchronous, corresponding to the definitions - * below. - * - * \subsection MemcpySynchronousBehavior Synchronous - * - *
- * <ol>
- * <li> For transfers from pageable host memory to device memory, a stream sync is performed
- * before the copy is initiated. The function will return once the pageable
- * buffer has been copied to the staging memory for DMA transfer to device memory,
- * but the DMA to final destination may not have completed.
- *
- * <li> For transfers from pinned host memory to device memory, the function is synchronous
- * with respect to the host.
- *
- * <li> For transfers from device to either pageable or pinned host memory, the function returns
- * only once the copy has completed.
- *
- * <li> For transfers from device memory to device memory, no host-side synchronization is
- * performed.
- *
- * <li> For transfers from any host memory to any host memory, the function is fully
- * synchronous with respect to the host.
- * </ol>
- *
- * \subsection MemcpyAsynchronousBehavior Asynchronous
- *
- * <ol>
- * <li> For transfers from device memory to pageable host memory, the function
- * will return only once the copy has completed.
- *
- * <li> For transfers from any host memory to any host memory, the function is fully
- * synchronous with respect to the host.
- *
- * <li> For all other transfers, the function is fully asynchronous. If pageable
- * memory must first be staged to pinned memory, this will be handled
- * asynchronously with a worker thread.
- * </ol>
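To make the two lists above concrete, a host-side sketch (not part of the header) contrasting a pageable-source copy with a pinned-source asynchronous copy; the pointers and stream are placeholders and error checking is elided:

```cuda
#include <cuda_runtime.h>

void copyBehaviors(float *dDst, const float *hPageable, size_t bytes, cudaStream_t stream)
{
    // Pageable source: cudaMemcpy returns once the buffer has been staged for
    // DMA (item 1 of the synchronous list above).
    cudaMemcpy(dDst, hPageable, bytes, cudaMemcpyHostToDevice);

    // Pinned source: the async copy can be fully asynchronous on a stream.
    float *hPinned = 0;
    cudaMallocHost((void**)&hPinned, bytes);
    cudaMemcpyAsync(dDst, hPinned, bytes, cudaMemcpyHostToDevice, stream);
    cudaStreamSynchronize(stream);
    cudaFreeHost(hPinned);
}
```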
- * - * \section memset_sync_async_behavior Memset - * The cudaMemset functions are asynchronous with respect to the host - * except when the target memory is pinned host memory. The \e Async - * versions are always asynchronous with respect to the host. - * - * \section kernel_launch_details Kernel Launches - * Kernel launches are asynchronous with respect to the host. Details of - * concurrent kernel execution and data transfers can be found in the CUDA - * Programmers Guide. - * - * \endlatexonly - */ - -/** - * There are two levels for the runtime API. - * - * The C API (cuda_runtime_api.h) is - * a C-style interface that does not require compiling with \p nvcc. - * - * The \ref CUDART_HIGHLEVEL "C++ API" (cuda_runtime.h) is a - * C++-style interface built on top of the C API. It wraps some of the - * C API routines, using overloading, references and default arguments. - * These wrappers can be used from C++ code and can be compiled with any C++ - * compiler. The C++ API also has some CUDA-specific wrappers that wrap - * C API routines that deal with symbols, textures, and device functions. - * These wrappers require the use of \p nvcc because they depend on code being - * generated by the compiler. For example, the execution configuration syntax - * to invoke kernels is only available in source code compiled with \p nvcc. - */ - -/** CUDA Runtime API Version */ -#define CUDART_VERSION 9000 - -#include "host_defines.h" -#include "builtin_types.h" - -#include "cuda_device_runtime_api.h" - -#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) || defined(__CUDA_API_VERSION_INTERNAL) - #define __CUDART_API_PER_THREAD_DEFAULT_STREAM - #define __CUDART_API_PTDS(api) api ## _ptds - #define __CUDART_API_PTSZ(api) api ## _ptsz -#else - #define __CUDART_API_PTDS(api) api - #define __CUDART_API_PTSZ(api) api -#endif - -#if defined(__CUDART_API_PER_THREAD_DEFAULT_STREAM) - #define cudaMemcpy __CUDART_API_PTDS(cudaMemcpy) - #define cudaMemcpyToSymbol __CUDART_API_PTDS(cudaMemcpyToSymbol) - #define cudaMemcpyFromSymbol __CUDART_API_PTDS(cudaMemcpyFromSymbol) - #define cudaMemcpy2D __CUDART_API_PTDS(cudaMemcpy2D) - #define cudaMemcpyToArray __CUDART_API_PTDS(cudaMemcpyToArray) - #define cudaMemcpy2DToArray __CUDART_API_PTDS(cudaMemcpy2DToArray) - #define cudaMemcpyFromArray __CUDART_API_PTDS(cudaMemcpyFromArray) - #define cudaMemcpy2DFromArray __CUDART_API_PTDS(cudaMemcpy2DFromArray) - #define cudaMemcpyArrayToArray __CUDART_API_PTDS(cudaMemcpyArrayToArray) - #define cudaMemcpy2DArrayToArray __CUDART_API_PTDS(cudaMemcpy2DArrayToArray) - #define cudaMemcpy3D __CUDART_API_PTDS(cudaMemcpy3D) - #define cudaMemcpy3DPeer __CUDART_API_PTDS(cudaMemcpy3DPeer) - #define cudaMemset __CUDART_API_PTDS(cudaMemset) - #define cudaMemset2D __CUDART_API_PTDS(cudaMemset2D) - #define cudaMemset3D __CUDART_API_PTDS(cudaMemset3D) - #define cudaMemcpyAsync __CUDART_API_PTSZ(cudaMemcpyAsync) - #define cudaMemcpyToSymbolAsync __CUDART_API_PTSZ(cudaMemcpyToSymbolAsync) - #define cudaMemcpyFromSymbolAsync __CUDART_API_PTSZ(cudaMemcpyFromSymbolAsync) - #define cudaMemcpy2DAsync __CUDART_API_PTSZ(cudaMemcpy2DAsync) - #define cudaMemcpyToArrayAsync __CUDART_API_PTSZ(cudaMemcpyToArrayAsync) - #define cudaMemcpy2DToArrayAsync __CUDART_API_PTSZ(cudaMemcpy2DToArrayAsync) - #define cudaMemcpyFromArrayAsync __CUDART_API_PTSZ(cudaMemcpyFromArrayAsync) - #define cudaMemcpy2DFromArrayAsync __CUDART_API_PTSZ(cudaMemcpy2DFromArrayAsync) - #define cudaMemcpy3DAsync __CUDART_API_PTSZ(cudaMemcpy3DAsync) - #define cudaMemcpy3DPeerAsync 
__CUDART_API_PTSZ(cudaMemcpy3DPeerAsync) - #define cudaMemsetAsync __CUDART_API_PTSZ(cudaMemsetAsync) - #define cudaMemset2DAsync __CUDART_API_PTSZ(cudaMemset2DAsync) - #define cudaMemset3DAsync __CUDART_API_PTSZ(cudaMemset3DAsync) - #define cudaStreamQuery __CUDART_API_PTSZ(cudaStreamQuery) - #define cudaStreamGetFlags __CUDART_API_PTSZ(cudaStreamGetFlags) - #define cudaStreamGetPriority __CUDART_API_PTSZ(cudaStreamGetPriority) - #define cudaEventRecord __CUDART_API_PTSZ(cudaEventRecord) - #define cudaStreamWaitEvent __CUDART_API_PTSZ(cudaStreamWaitEvent) - #define cudaStreamAddCallback __CUDART_API_PTSZ(cudaStreamAddCallback) - #define cudaStreamAttachMemAsync __CUDART_API_PTSZ(cudaStreamAttachMemAsync) - #define cudaStreamSynchronize __CUDART_API_PTSZ(cudaStreamSynchronize) - #define cudaLaunch __CUDART_API_PTSZ(cudaLaunch) - #define cudaLaunchKernel __CUDART_API_PTSZ(cudaLaunchKernel) - #define cudaMemPrefetchAsync __CUDART_API_PTSZ(cudaMemPrefetchAsync) - #define cudaLaunchCooperativeKernel __CUDART_API_PTSZ(cudaLaunchCooperativeKernel) -#endif - -/** \cond impl_private */ -#if !defined(__dv) - -#if defined(__cplusplus) - -#define __dv(v) \ - = v - -#else /* __cplusplus */ - -#define __dv(v) - -#endif /* __cplusplus */ - -#endif /* !__dv */ -/** \endcond impl_private */ - -#if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)) /** Visible to SM>=3.5 and "__host__ __device__" only **/ - -#define CUDART_DEVICE __device__ - -#else - -#define CUDART_DEVICE - -#endif /** CUDART_DEVICE */ - -#if defined(__cplusplus) -extern "C" { -#endif /* __cplusplus */ - -/** - * \defgroup CUDART_DEVICE Device Management - * - * ___MANBRIEF___ device management functions of the CUDA runtime API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the device management functions of the CUDA runtime - * application programming interface. - * - * @{ - */ - -/** - * \brief Destroy all allocations and reset all state on the current device - * in the current process. - * - * Explicitly destroys and cleans up all resources associated with the current - * device in the current process. Any subsequent API call to this device will - * reinitialize the device. - * - * Note that this function will reset the device immediately. It is the caller's - * responsibility to ensure that the device is not being accessed by any - * other host threads from the process when this function is called. - * - * \return - * ::cudaSuccess - * \notefnerr - * - * \sa ::cudaDeviceSynchronize - */ -extern __host__ cudaError_t CUDARTAPI cudaDeviceReset(void); - -/** - * \brief Wait for compute device to finish - * - * Blocks until the device has completed all preceding requested tasks. - * ::cudaDeviceSynchronize() returns an error if one of the preceding tasks - * has failed. If the ::cudaDeviceScheduleBlockingSync flag was set for - * this device, the host thread will block until the device has finished - * its work. - * - * \return - * ::cudaSuccess - * \notefnerr - * - * \sa - * ::cudaDeviceReset, - * ::cuCtxSynchronize - */ -extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceSynchronize(void); - -/** - * \brief Set resource limits - * - * Setting \p limit to \p value is a request by the application to update - * the current limit maintained by the device. The driver is free to - * modify the requested value to meet h/w requirements (this could be - * clamping to minimum or maximum values, rounding up to nearest element - * size, etc). 
The application can use ::cudaDeviceGetLimit() to find out
- * exactly what the limit has been set to.
- *
- * Setting each ::cudaLimit has its own specific restrictions, so each is
- * discussed here.
- *
- * - ::cudaLimitStackSize controls the stack size in bytes of each GPU thread.
- *
- * - ::cudaLimitPrintfFifoSize controls the size in bytes of the shared FIFO
- *   used by the ::printf() and ::fprintf() device system calls. Setting
- *   ::cudaLimitPrintfFifoSize must not be performed after launching any kernel
- *   that uses the ::printf() or ::fprintf() device system calls - in such a case
- *   ::cudaErrorInvalidValue will be returned.
- *
- * - ::cudaLimitMallocHeapSize controls the size in bytes of the heap used by
- *   the ::malloc() and ::free() device system calls. Setting
- *   ::cudaLimitMallocHeapSize must not be performed after launching any kernel
- *   that uses the ::malloc() or ::free() device system calls - in such a case
- *   ::cudaErrorInvalidValue will be returned.
- *
- * - ::cudaLimitDevRuntimeSyncDepth controls the maximum nesting depth of a
- *   grid at which a thread can safely call ::cudaDeviceSynchronize(). Setting
- *   this limit must be performed before any launch of a kernel that uses the
- *   device runtime and calls ::cudaDeviceSynchronize() above the default sync
- *   depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail
- *   with error code ::cudaErrorSyncDepthExceeded if the limitation is
- *   violated. This limit can be set smaller than the default or up to the maximum
- *   launch depth of 24. When setting this limit, keep in mind that additional
- *   levels of sync depth require the runtime to reserve large amounts of
- *   device memory which can no longer be used for user allocations. If these
- *   reservations of device memory fail, ::cudaDeviceSetLimit will return
- *   ::cudaErrorMemoryAllocation, and the limit can be reset to a lower value.
- *   This limit is only applicable to devices of compute capability 3.5 and
- *   higher. Attempting to set this limit on devices of compute capability less
- *   than 3.5 will result in the error ::cudaErrorUnsupportedLimit being
- *   returned.
- *
- * - ::cudaLimitDevRuntimePendingLaunchCount controls the maximum number of
- *   outstanding device runtime launches that can be made from the current
- *   device. A grid is outstanding from the point of launch up until the grid
- *   is known to have been completed. Device runtime launches which violate
- *   this limitation fail and return ::cudaErrorLaunchPendingCountExceeded when
- *   ::cudaGetLastError() is called after launch. If more pending launches than
- *   the default (2048 launches) are needed for a module using the device
- *   runtime, this limit can be increased. Keep in mind that being able to
- *   sustain additional pending launches will require the runtime to reserve
- *   larger amounts of device memory upfront which can no longer be used for
- *   allocations. If these reservations fail, ::cudaDeviceSetLimit will return
- *   ::cudaErrorMemoryAllocation, and the limit can be reset to a lower value.
- *   This limit is only applicable to devices of compute capability 3.5 and
- *   higher. Attempting to set this limit on devices of compute capability less
- *   than 3.5 will result in the error ::cudaErrorUnsupportedLimit being
- *   returned.
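A brief sketch of the set-then-verify pattern suggested above; the 8 MB figure is an arbitrary example value:

```cuda
#include <cuda_runtime.h>

void growPrintfFifo()
{
    // Must run before the first kernel that calls printf() is launched.
    cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 8 * 1024 * 1024);

    // The driver may round or clamp the request; read back the value actually set.
    size_t actual = 0;
    cudaDeviceGetLimit(&actual, cudaLimitPrintfFifoSize);
}
```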
- *
- * \param limit - Limit to set
- * \param value - Size of limit
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorUnsupportedLimit,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorMemoryAllocation
- * \notefnerr
- *
- * \sa
- * ::cudaDeviceGetLimit,
- * ::cuCtxSetLimit
- */
-extern __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit, size_t value);
-
-/**
- * \brief Returns resource limits
- *
- * Returns in \p *pValue the current size of \p limit. The supported
- * ::cudaLimit values are:
- * - ::cudaLimitStackSize: stack size in bytes of each GPU thread;
- * - ::cudaLimitPrintfFifoSize: size in bytes of the shared FIFO used by the
- *   ::printf() and ::fprintf() device system calls;
- * - ::cudaLimitMallocHeapSize: size in bytes of the heap used by the
- *   ::malloc() and ::free() device system calls;
- * - ::cudaLimitDevRuntimeSyncDepth: maximum grid depth at which a
- *   thread can issue the device runtime call ::cudaDeviceSynchronize()
- *   to wait on child grid launches to complete;
- * - ::cudaLimitDevRuntimePendingLaunchCount: maximum number of outstanding
- *   device runtime launches.
- *
- * \param limit - Limit to query
- * \param pValue - Returned size of the limit
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorUnsupportedLimit,
- * ::cudaErrorInvalidValue
- * \notefnerr
- *
- * \sa
- * ::cudaDeviceSetLimit,
- * ::cuCtxGetLimit
- */
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit);
-
-/**
- * \brief Returns the preferred cache configuration for the current device.
- *
- * On devices where the L1 cache and shared memory use the same hardware
- * resources, this returns through \p pCacheConfig the preferred cache
- * configuration for the current device. This is only a preference. The
- * runtime will use the requested configuration if possible, but it is free to
- * choose a different configuration if required to execute functions.
- *
- * This will return a \p pCacheConfig of ::cudaFuncCachePreferNone on devices
- * where the size of the L1 cache and shared memory are fixed.
- *
- * The supported cache configurations are:
- * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
- * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
- * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
- * - ::cudaFuncCachePreferEqual: prefer equal size L1 cache and shared memory
- *
- * \param pCacheConfig - Returned cache configuration
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInitializationError
- * \notefnerr
- *
- * \sa cudaDeviceSetCacheConfig,
- * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
- * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
- * ::cuCtxGetCacheConfig
- */
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig);
-
-/**
- * \brief Returns numerical values that correspond to the least and
- * greatest stream priorities.
- *
- * Returns in \p *leastPriority and \p *greatestPriority the numerical values that correspond
- * to the least and greatest stream priorities respectively. Stream priorities
- * follow a convention where lower numbers imply greater priorities. The range of
- * meaningful stream priorities is given by [\p *greatestPriority, \p *leastPriority].
- * If the user attempts to create a stream with a priority value that is
- * outside the meaningful range as specified by this API, the priority is
- * automatically clamped down or up to either \p *leastPriority or \p *greatestPriority
- * respectively. See ::cudaStreamCreateWithPriority for details on creating a
- * priority stream.
- * A NULL may be passed in for \p *leastPriority or \p *greatestPriority if the value
- * is not desired.
- *
- * This function will return '0' in both \p *leastPriority and \p *greatestPriority if
- * the current context's device does not support stream priorities
- * (see ::cudaDeviceGetAttribute).
- *
- * \param leastPriority - Pointer to an int in which the numerical value for least
- * stream priority is returned
- * \param greatestPriority - Pointer to an int in which the numerical value for greatest
- * stream priority is returned
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInitializationError
- * \notefnerr
- *
- * \sa ::cudaStreamCreateWithPriority,
- * ::cudaStreamGetPriority,
- * ::cuCtxGetStreamPriorityRange
- */
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetStreamPriorityRange(int *leastPriority, int *greatestPriority);
-
-/**
- * \brief Sets the preferred cache configuration for the current device.
- *
- * On devices where the L1 cache and shared memory use the same hardware
- * resources, this sets through \p cacheConfig the preferred cache
- * configuration for the current device. This is only a preference. The
- * runtime will use the requested configuration if possible, but it is free to
- * choose a different configuration if required to execute the function. Any
- * function preference set via
- * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)"
- * or
- * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)"
- * will be preferred over this device-wide setting. Setting the device-wide
- * cache configuration to ::cudaFuncCachePreferNone will cause subsequent
- * kernel launches to prefer to not change the cache configuration unless
- * required to launch the kernel.
- *
- * This setting does nothing on devices where the size of the L1 cache and
- * shared memory are fixed.
- *
- * Launching a kernel with a different preference than the most recent
- * preference setting may insert a device-side synchronization point.
- *
- * The supported cache configurations are:
- * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
- * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
- * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
- * - ::cudaFuncCachePreferEqual: prefer equal size L1 cache and shared memory
- *
- * \param cacheConfig - Requested cache configuration
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInitializationError
- * \notefnerr
- *
- * \sa ::cudaDeviceGetCacheConfig,
- * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
- * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
- * ::cuCtxSetCacheConfig
- */
-extern __host__ cudaError_t CUDARTAPI cudaDeviceSetCacheConfig(enum cudaFuncCache cacheConfig);
-
-/**
- * \brief Returns the shared memory configuration for the current device.
- *
- * This function will return in \p pConfig the current size of shared memory banks
- * on the current device.
On devices with configurable shared memory banks,
- * ::cudaDeviceSetSharedMemConfig can be used to change this setting, so that all
- * subsequent kernel launches will by default use the new bank size. When
- * ::cudaDeviceGetSharedMemConfig is called on devices without configurable shared
- * memory, it will return the fixed bank size of the hardware.
- *
- * The returned bank configurations can be either:
- * - ::cudaSharedMemBankSizeFourByte - shared memory bank width is four bytes.
- * - ::cudaSharedMemBankSizeEightByte - shared memory bank width is eight bytes.
- *
- * \param pConfig - Returned cache configuration
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorInitializationError
- * \notefnerr
- *
- * \sa ::cudaDeviceSetCacheConfig,
- * ::cudaDeviceGetCacheConfig,
- * ::cudaDeviceSetSharedMemConfig,
- * ::cudaFuncSetCacheConfig,
- * ::cuCtxGetSharedMemConfig
- */
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig);
-
-/**
- * \brief Sets the shared memory configuration for the current device.
- *
- * On devices with configurable shared memory banks, this function will set
- * the shared memory bank size which is used for all subsequent kernel launches.
- * Any per-function setting of shared memory set via ::cudaFuncSetSharedMemConfig
- * will override the device wide setting.
- *
- * Changing the shared memory configuration between launches may introduce
- * a device side synchronization point.
- *
- * Changing the shared memory bank size will not increase shared memory usage
- * or affect occupancy of kernels, but may have major effects on performance.
- * Larger bank sizes will allow for greater potential bandwidth to shared memory,
- * but will change what kinds of accesses to shared memory will result in bank
- * conflicts.
- *
- * This function will do nothing on devices with fixed shared memory bank size.
- *
- * The supported bank configurations are:
- * - ::cudaSharedMemBankSizeDefault: set bank width to the device default (currently,
- *   four bytes)
- * - ::cudaSharedMemBankSizeFourByte: set shared memory bank width to be four bytes
- *   natively.
- * - ::cudaSharedMemBankSizeEightByte: set shared memory bank width to be eight
- *   bytes natively.
- *
- * \param config - Requested cache configuration
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorInitializationError
- * \notefnerr
- *
- * \sa ::cudaDeviceSetCacheConfig,
- * ::cudaDeviceGetCacheConfig,
- * ::cudaDeviceGetSharedMemConfig,
- * ::cudaFuncSetCacheConfig,
- * ::cuCtxSetSharedMemConfig
- */
-extern __host__ cudaError_t CUDARTAPI cudaDeviceSetSharedMemConfig(enum cudaSharedMemConfig config);
-
-/**
- * \brief Returns a handle to a compute device
- *
- * Returns in \p *device a device ordinal given a PCI bus ID string.
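As a usage sketch for the device-wide bank-size pair above (illustrative only; fixed-bank devices ignore the request):

```cuda
#include <cuda_runtime.h>

void preferWideBanks()
{
    // Eight-byte banks can reduce conflicts for double-typed shared-memory
    // accesses on devices with configurable banks.
    cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);

    // Read back the configuration actually in effect.
    cudaSharedMemConfig cfg;
    cudaDeviceGetSharedMemConfig(&cfg);
}
```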
- *
- * \param device - Returned device ordinal
- *
- * \param pciBusId - String in one of the following forms:
- * [domain]:[bus]:[device].[function]
- * [domain]:[bus]:[device]
- * [bus]:[device].[function]
- * where \p domain, \p bus, \p device, and \p function are all hexadecimal values
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorInvalidDevice
- * \notefnerr
- *
- * \sa
- * ::cudaDeviceGetPCIBusId,
- * ::cuDeviceGetByPCIBusId
- */
-extern __host__ cudaError_t CUDARTAPI cudaDeviceGetByPCIBusId(int *device, const char *pciBusId);
-
-/**
- * \brief Returns a PCI Bus Id string for the device
- *
- * Returns an ASCII string identifying the device \p dev in the NULL-terminated
- * string pointed to by \p pciBusId. \p len specifies the maximum length of the
- * string that may be returned.
- *
- * \param pciBusId - Returned identifier string for the device in the following format
- * [domain]:[bus]:[device].[function]
- * where \p domain, \p bus, \p device, and \p function are all hexadecimal values.
- * pciBusId should be large enough to store 13 characters including the NULL-terminator.
- *
- * \param len - Maximum length of string to store in \p pciBusId
- *
- * \param device - Device to get identifier string for
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorInvalidDevice
- * \notefnerr
- *
- * \sa
- * ::cudaDeviceGetByPCIBusId,
- * ::cuDeviceGetPCIBusId
- */
-extern __host__ cudaError_t CUDARTAPI cudaDeviceGetPCIBusId(char *pciBusId, int len, int device);
-
-/**
- * \brief Gets an interprocess handle for a previously allocated event
- *
- * Takes as input a previously allocated event. This event must have been
- * created with the ::cudaEventInterprocess and ::cudaEventDisableTiming
- * flags set. This opaque handle may be copied into other processes and
- * opened with ::cudaIpcOpenEventHandle to allow efficient hardware
- * synchronization between GPU work in different processes.
- *
- * After the event has been opened in the importing process,
- * ::cudaEventRecord, ::cudaEventSynchronize, ::cudaStreamWaitEvent and
- * ::cudaEventQuery may be used in either process. Performing operations
- * on the imported event after the exported event has been freed
- * with ::cudaEventDestroy will result in undefined behavior.
- *
- * IPC functionality is restricted to devices with support for unified
- * addressing on Linux operating systems. IPC functionality is not supported
- * on Tegra platforms.
- *
- * \param handle - Pointer to a user allocated cudaIpcEventHandle
- * in which to return the opaque event handle
- * \param event - Event allocated with ::cudaEventInterprocess and
- * ::cudaEventDisableTiming flags.
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidResourceHandle,
- * ::cudaErrorMemoryAllocation,
- * ::cudaErrorMapBufferObjectFailed,
- * ::cudaErrorNotSupported
- *
- * \sa
- * ::cudaEventCreate,
- * ::cudaEventDestroy,
- * ::cudaEventSynchronize,
- * ::cudaEventQuery,
- * ::cudaStreamWaitEvent,
- * ::cudaIpcOpenEventHandle,
- * ::cudaIpcGetMemHandle,
- * ::cudaIpcOpenMemHandle,
- * ::cudaIpcCloseMemHandle,
- * ::cuIpcGetEventHandle
- */
-extern __host__ cudaError_t CUDARTAPI cudaIpcGetEventHandle(cudaIpcEventHandle_t *handle, cudaEvent_t event);
-
-/**
- * \brief Opens an interprocess event handle for use in the current process
- *
- * Opens an interprocess event handle exported from another process with
- * ::cudaIpcGetEventHandle.
This function returns a ::cudaEvent_t that behaves like - * a locally created event with the ::cudaEventDisableTiming flag specified. - * This event must be freed with ::cudaEventDestroy. - * - * Performing operations on the imported event after the exported event has - * been freed with ::cudaEventDestroy will result in undefined behavior. - * - * IPC functionality is restricted to devices with support for unified - * addressing on Linux operating systems. IPC functionality is not supported - * on Tegra platforms. - * - * \param event - Returns the imported event - * \param handle - Interprocess handle to open - * - * \returns - * ::cudaSuccess, - * ::cudaErrorMapBufferObjectFailed, - * ::cudaErrorInvalidResourceHandle, - * ::cudaErrorNotSupported - * - * \sa - * ::cudaEventCreate, - * ::cudaEventDestroy, - * ::cudaEventSynchronize, - * ::cudaEventQuery, - * ::cudaStreamWaitEvent, - * ::cudaIpcGetEventHandle, - * ::cudaIpcGetMemHandle, - * ::cudaIpcOpenMemHandle, - * ::cudaIpcCloseMemHandle, - * ::cuIpcOpenEventHandle - */ -extern __host__ cudaError_t CUDARTAPI cudaIpcOpenEventHandle(cudaEvent_t *event, cudaIpcEventHandle_t handle); - - -/** - * \brief Gets an interprocess memory handle for an existing device memory - * allocation - * - * Takes a pointer to the base of an existing device memory allocation created - * with ::cudaMalloc and exports it for use in another process. This is a - * lightweight operation and may be called multiple times on an allocation - * without adverse effects. - * - * If a region of memory is freed with ::cudaFree and a subsequent call - * to ::cudaMalloc returns memory with the same device address, - * ::cudaIpcGetMemHandle will return a unique handle for the - * new memory. - * - * IPC functionality is restricted to devices with support for unified - * addressing on Linux operating systems. IPC functionality is not supported - * on Tegra platforms. - * - * \param handle - Pointer to user allocated ::cudaIpcMemHandle to return - * the handle in. - * \param devPtr - Base pointer to previously allocated device memory - * - * \returns - * ::cudaSuccess, - * ::cudaErrorInvalidResourceHandle, - * ::cudaErrorMemoryAllocation, - * ::cudaErrorMapBufferObjectFailed, - * ::cudaErrorNotSupported - * - * \sa - * ::cudaMalloc, - * ::cudaFree, - * ::cudaIpcGetEventHandle, - * ::cudaIpcOpenEventHandle, - * ::cudaIpcOpenMemHandle, - * ::cudaIpcCloseMemHandle, - * ::cuIpcGetMemHandle - */ -extern __host__ cudaError_t CUDARTAPI cudaIpcGetMemHandle(cudaIpcMemHandle_t *handle, void *devPtr); - -/** - * \brief Opens an interprocess memory handle exported from another process - * and returns a device pointer usable in the local process. - * - * Maps memory exported from another process with ::cudaIpcGetMemHandle into - * the current device address space. For contexts on different devices - * ::cudaIpcOpenMemHandle can attempt to enable peer access between the - * devices as if the user called ::cudaDeviceEnablePeerAccess. This behavior is - * controlled by the ::cudaIpcMemLazyEnablePeerAccess flag. - * ::cudaDeviceCanAccessPeer can determine if a mapping is possible. - * - * Contexts that may open ::cudaIpcMemHandles are restricted in the following way. - * ::cudaIpcMemHandles from each device in a given process may only be opened - * by one context per device per other process. - * - * Memory returned from ::cudaIpcOpenMemHandle must be freed with - * ::cudaIpcCloseMemHandle. 
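- *
- * A minimal sketch of the importing side (how \p handle travels between
- * processes, e.g. over a pipe or socket, is left to the application):
- * \code
- * cudaIpcMemHandle_t handle;  // received from the exporting process
- * void *devPtr = NULL;
- * cudaIpcOpenMemHandle(&devPtr, handle, cudaIpcMemLazyEnablePeerAccess);
- * // ... use devPtr as ordinary device memory ...
- * cudaIpcCloseMemHandle(devPtr);
- * \endcode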
- *
- * Calling ::cudaFree on an exported memory region before calling
- * ::cudaIpcCloseMemHandle in the importing context will result in undefined
- * behavior.
- *
- * IPC functionality is restricted to devices with support for unified
- * addressing on Linux operating systems. IPC functionality is not supported
- * on Tegra platforms.
- *
- * \param devPtr - Returned device pointer
- * \param handle - ::cudaIpcMemHandle to open
- * \param flags - Flags for this operation. Must be specified as ::cudaIpcMemLazyEnablePeerAccess
- *
- * \returns
- * ::cudaSuccess,
- * ::cudaErrorMapBufferObjectFailed,
- * ::cudaErrorInvalidResourceHandle,
- * ::cudaErrorTooManyPeers,
- * ::cudaErrorNotSupported
- *
- * \note No guarantees are made about the address returned in \p *devPtr.
- * In particular, multiple processes may not receive the same address for the same \p handle.
- *
- * \sa
- * ::cudaMalloc,
- * ::cudaFree,
- * ::cudaIpcGetEventHandle,
- * ::cudaIpcOpenEventHandle,
- * ::cudaIpcGetMemHandle,
- * ::cudaIpcCloseMemHandle,
- * ::cudaDeviceEnablePeerAccess,
- * ::cudaDeviceCanAccessPeer,
- * ::cuIpcOpenMemHandle
- */
-extern __host__ cudaError_t CUDARTAPI cudaIpcOpenMemHandle(void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags);
-
-/**
- * \brief Close memory mapped with cudaIpcOpenMemHandle
- *
- * Unmaps memory returned by ::cudaIpcOpenMemHandle. The original allocation
- * in the exporting process as well as imported mappings in other processes
- * will be unaffected.
- *
- * Any resources used to enable peer access will be freed if this is the
- * last mapping using them.
- *
- * IPC functionality is restricted to devices with support for unified
- * addressing on Linux operating systems. IPC functionality is not supported
- * on Tegra platforms.
- *
- * \param devPtr - Device pointer returned by ::cudaIpcOpenMemHandle
- *
- * \returns
- * ::cudaSuccess,
- * ::cudaErrorMapBufferObjectFailed,
- * ::cudaErrorInvalidResourceHandle,
- * ::cudaErrorNotSupported
- *
- * \sa
- * ::cudaMalloc,
- * ::cudaFree,
- * ::cudaIpcGetEventHandle,
- * ::cudaIpcOpenEventHandle,
- * ::cudaIpcGetMemHandle,
- * ::cudaIpcOpenMemHandle,
- * ::cuIpcCloseMemHandle
- */
-extern __host__ cudaError_t CUDARTAPI cudaIpcCloseMemHandle(void *devPtr);
-
-/** @} */ /* END CUDART_DEVICE */
-
-/**
- * \defgroup CUDART_THREAD_DEPRECATED Thread Management [DEPRECATED]
- *
- * ___MANBRIEF___ deprecated thread management functions of the CUDA runtime
- * API (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes deprecated thread management functions of the CUDA runtime
- * application programming interface.
- *
- * @{
- */
-
-/**
- * \brief Exit and clean up from CUDA launches
- *
- * \deprecated
- *
- * Note that this function is deprecated because its name does not
- * reflect its behavior. Its functionality is identical to the
- * non-deprecated function ::cudaDeviceReset(), which should be used
- * instead.
- *
- * Explicitly destroys and cleans up all resources associated with the current
- * device in the current process. Any subsequent API call to this device will
- * reinitialize the device.
- *
- * Note that this function will reset the device immediately. It is the caller's
- * responsibility to ensure that the device is not being accessed by any
- * other host threads from the process when this function is called.
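- *
- * A minimal sketch of the recommended replacement at process teardown:
- * \code
- * cudaDeviceReset();  // preferred over the deprecated cudaThreadExit()
- * \endcode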
- * - * \return - * ::cudaSuccess - * \notefnerr - * - * \sa ::cudaDeviceReset - */ -extern __host__ cudaError_t CUDARTAPI cudaThreadExit(void); - -/** - * \brief Wait for compute device to finish - * - * \deprecated - * - * Note that this function is deprecated because its name does not - * reflect its behavior. Its functionality is similar to the - * non-deprecated function ::cudaDeviceSynchronize(), which should be used - * instead. - * - * Blocks until the device has completed all preceding requested tasks. - * ::cudaThreadSynchronize() returns an error if one of the preceding tasks - * has failed. If the ::cudaDeviceScheduleBlockingSync flag was set for - * this device, the host thread will block until the device has finished - * its work. - * - * \return - * ::cudaSuccess - * \notefnerr - * - * \sa ::cudaDeviceSynchronize - */ -extern __host__ cudaError_t CUDARTAPI cudaThreadSynchronize(void); - -/** - * \brief Set resource limits - * - * \deprecated - * - * Note that this function is deprecated because its name does not - * reflect its behavior. Its functionality is identical to the - * non-deprecated function ::cudaDeviceSetLimit(), which should be used - * instead. - * - * Setting \p limit to \p value is a request by the application to update - * the current limit maintained by the device. The driver is free to - * modify the requested value to meet h/w requirements (this could be - * clamping to minimum or maximum values, rounding up to nearest element - * size, etc). The application can use ::cudaThreadGetLimit() to find out - * exactly what the limit has been set to. - * - * Setting each ::cudaLimit has its own specific restrictions, so each is - * discussed here. - * - * - ::cudaLimitStackSize controls the stack size of each GPU thread. - * - * - ::cudaLimitPrintfFifoSize controls the size of the shared FIFO - * used by the ::printf() and ::fprintf() device system calls. - * Setting ::cudaLimitPrintfFifoSize must be performed before - * launching any kernel that uses the ::printf() or ::fprintf() device - * system calls, otherwise ::cudaErrorInvalidValue will be returned. - * - * - ::cudaLimitMallocHeapSize controls the size of the heap used - * by the ::malloc() and ::free() device system calls. Setting - * ::cudaLimitMallocHeapSize must be performed before launching - * any kernel that uses the ::malloc() or ::free() device system calls, - * otherwise ::cudaErrorInvalidValue will be returned. - * - * \param limit - Limit to set - * \param value - Size in bytes of limit - * - * \return - * ::cudaSuccess, - * ::cudaErrorUnsupportedLimit, - * ::cudaErrorInvalidValue - * \notefnerr - * - * \sa ::cudaDeviceSetLimit - */ -extern __host__ cudaError_t CUDARTAPI cudaThreadSetLimit(enum cudaLimit limit, size_t value); - -/** - * \brief Returns resource limits - * - * \deprecated - * - * Note that this function is deprecated because its name does not - * reflect its behavior. Its functionality is identical to the - * non-deprecated function ::cudaDeviceGetLimit(), which should be used - * instead. - * - * Returns in \p *pValue the current size of \p limit. The supported - * ::cudaLimit values are: - * - ::cudaLimitStackSize: stack size of each GPU thread; - * - ::cudaLimitPrintfFifoSize: size of the shared FIFO used by the - * ::printf() and ::fprintf() device system calls. 
- * - ::cudaLimitMallocHeapSize: size of the heap used by the - * ::malloc() and ::free() device system calls; - * - * \param limit - Limit to query - * \param pValue - Returned size in bytes of limit - * - * \return - * ::cudaSuccess, - * ::cudaErrorUnsupportedLimit, - * ::cudaErrorInvalidValue - * \notefnerr - * - * \sa ::cudaDeviceGetLimit - */ -extern __host__ cudaError_t CUDARTAPI cudaThreadGetLimit(size_t *pValue, enum cudaLimit limit); - -/** - * \brief Returns the preferred cache configuration for the current device. - * - * \deprecated - * - * Note that this function is deprecated because its name does not - * reflect its behavior. Its functionality is identical to the - * non-deprecated function ::cudaDeviceGetCacheConfig(), which should be - * used instead. - * - * On devices where the L1 cache and shared memory use the same hardware - * resources, this returns through \p pCacheConfig the preferred cache - * configuration for the current device. This is only a preference. The - * runtime will use the requested configuration if possible, but it is free to - * choose a different configuration if required to execute functions. - * - * This will return a \p pCacheConfig of ::cudaFuncCachePreferNone on devices - * where the size of the L1 cache and shared memory are fixed. - * - * The supported cache configurations are: - * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default) - * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache - * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory - * - * \param pCacheConfig - Returned cache configuration - * - * \return - * ::cudaSuccess, - * ::cudaErrorInitializationError - * \notefnerr - * - * \sa ::cudaDeviceGetCacheConfig - */ -extern __host__ cudaError_t CUDARTAPI cudaThreadGetCacheConfig(enum cudaFuncCache *pCacheConfig); - -/** - * \brief Sets the preferred cache configuration for the current device. - * - * \deprecated - * - * Note that this function is deprecated because its name does not - * reflect its behavior. Its functionality is identical to the - * non-deprecated function ::cudaDeviceSetCacheConfig(), which should be - * used instead. - * - * On devices where the L1 cache and shared memory use the same hardware - * resources, this sets through \p cacheConfig the preferred cache - * configuration for the current device. This is only a preference. The - * runtime will use the requested configuration if possible, but it is free to - * choose a different configuration if required to execute the function. Any - * function preference set via - * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)" - * or - * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)" - * will be preferred over this device-wide setting. Setting the device-wide - * cache configuration to ::cudaFuncCachePreferNone will cause subsequent - * kernel launches to prefer to not change the cache configuration unless - * required to launch the kernel. - * - * This setting does nothing on devices where the size of the L1 cache and - * shared memory are fixed. - * - * Launching a kernel with a different preference than the most recent - * preference setting may insert a device-side synchronization point. 
- * - * The supported cache configurations are: - * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default) - * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache - * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory - * - * \param cacheConfig - Requested cache configuration - * - * \return - * ::cudaSuccess, - * ::cudaErrorInitializationError - * \notefnerr - * - * \sa ::cudaDeviceSetCacheConfig - */ -extern __host__ cudaError_t CUDARTAPI cudaThreadSetCacheConfig(enum cudaFuncCache cacheConfig); - -/** @} */ /* END CUDART_THREAD_DEPRECATED */ - -/** - * \defgroup CUDART_ERROR Error Handling - * - * ___MANBRIEF___ error handling functions of the CUDA runtime API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the error handling functions of the CUDA runtime - * application programming interface. - * - * @{ - */ - -/** - * \brief Returns the last error from a runtime call - * - * Returns the last error that has been produced by any of the runtime calls - * in the same host thread and resets it to ::cudaSuccess. - * - * \return - * ::cudaSuccess, - * ::cudaErrorMissingConfiguration, - * ::cudaErrorMemoryAllocation, - * ::cudaErrorInitializationError, - * ::cudaErrorLaunchFailure, - * ::cudaErrorLaunchTimeout, - * ::cudaErrorLaunchOutOfResources, - * ::cudaErrorInvalidDeviceFunction, - * ::cudaErrorInvalidConfiguration, - * ::cudaErrorInvalidDevice, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidPitchValue, - * ::cudaErrorInvalidSymbol, - * ::cudaErrorUnmapBufferObjectFailed, - * ::cudaErrorInvalidDevicePointer, - * ::cudaErrorInvalidTexture, - * ::cudaErrorInvalidTextureBinding, - * ::cudaErrorInvalidChannelDescriptor, - * ::cudaErrorInvalidMemcpyDirection, - * ::cudaErrorInvalidFilterSetting, - * ::cudaErrorInvalidNormSetting, - * ::cudaErrorUnknown, - * ::cudaErrorInvalidResourceHandle, - * ::cudaErrorInsufficientDriver, - * ::cudaErrorSetOnActiveProcess, - * ::cudaErrorStartupFailure, - * ::cudaErrorInvalidPtx, - * ::cudaErrorNoKernelImageForDevice, - * ::cudaErrorJitCompilerNotFound - * \notefnerr - * - * \sa ::cudaPeekAtLastError, ::cudaGetErrorName, ::cudaGetErrorString, ::cudaError - */ -extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void); - -/** - * \brief Returns the last error from a runtime call - * - * Returns the last error that has been produced by any of the runtime calls - * in the same host thread. Note that this call does not reset the error to - * ::cudaSuccess like ::cudaGetLastError(). 
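- *
- * A short sketch of the difference (here \p myKernel stands for any
- * __global__ function; it is not part of this API):
- * \code
- * myKernel<<<1, 1>>>();
- * cudaError_t peek = cudaPeekAtLastError();  // observe the error, state unchanged
- * cudaError_t last = cudaGetLastError();     // same value, but resets to cudaSuccess
- * if (last != cudaSuccess)
- *     printf("launch failed: %s\n", cudaGetErrorString(last));
- * \endcode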
- * - * \return - * ::cudaSuccess, - * ::cudaErrorMissingConfiguration, - * ::cudaErrorMemoryAllocation, - * ::cudaErrorInitializationError, - * ::cudaErrorLaunchFailure, - * ::cudaErrorLaunchTimeout, - * ::cudaErrorLaunchOutOfResources, - * ::cudaErrorInvalidDeviceFunction, - * ::cudaErrorInvalidConfiguration, - * ::cudaErrorInvalidDevice, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidPitchValue, - * ::cudaErrorInvalidSymbol, - * ::cudaErrorUnmapBufferObjectFailed, - * ::cudaErrorInvalidDevicePointer, - * ::cudaErrorInvalidTexture, - * ::cudaErrorInvalidTextureBinding, - * ::cudaErrorInvalidChannelDescriptor, - * ::cudaErrorInvalidMemcpyDirection, - * ::cudaErrorInvalidFilterSetting, - * ::cudaErrorInvalidNormSetting, - * ::cudaErrorUnknown, - * ::cudaErrorInvalidResourceHandle, - * ::cudaErrorInsufficientDriver, - * ::cudaErrorSetOnActiveProcess, - * ::cudaErrorStartupFailure, - * ::cudaErrorInvalidPtx, - * ::cudaErrorNoKernelImageForDevice, - * ::cudaErrorJitCompilerNotFound - * \notefnerr - * - * \sa ::cudaGetLastError, ::cudaGetErrorName, ::cudaGetErrorString, ::cudaError - */ -extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void); - -/** - * \brief Returns the string representation of an error code enum name - * - * Returns a string containing the name of an error code in the enum. If the error - * code is not recognized, "unrecognized error code" is returned. - * - * \param error - Error code to convert to string - * - * \return - * \p char* pointer to a NULL-terminated string - * - * \sa ::cudaGetErrorString, ::cudaGetLastError, ::cudaPeekAtLastError, ::cudaError, - * ::cuGetErrorName - */ -extern __host__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error); - -/** - * \brief Returns the description string for an error code - * - * Returns the description string for an error code. If the error - * code is not recognized, "unrecognized error code" is returned. - * - * \param error - Error code to convert to string - * - * \return - * \p char* pointer to a NULL-terminated string - * - * \sa ::cudaGetErrorName, ::cudaGetLastError, ::cudaPeekAtLastError, ::cudaError, - * ::cuGetErrorString - */ -extern __host__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error); -/** @} */ /* END CUDART_ERROR */ - -/** - * \addtogroup CUDART_DEVICE - * - * @{ - */ - -/** - * \brief Returns the number of compute-capable devices - * - * Returns in \p *count the number of devices with compute capability greater - * or equal to 2.0 that are available for execution. If there is no such - * device then ::cudaGetDeviceCount() will return ::cudaErrorNoDevice. - * If no driver can be loaded to determine if any such devices exist then - * ::cudaGetDeviceCount() will return ::cudaErrorInsufficientDriver. - * - * \param count - Returns the number of devices with compute capability - * greater or equal to 2.0 - * - * \return - * ::cudaSuccess, - * ::cudaErrorNoDevice, - * ::cudaErrorInsufficientDriver - * \notefnerr - * - * \sa ::cudaGetDevice, ::cudaSetDevice, ::cudaGetDeviceProperties, - * ::cudaChooseDevice, - * ::cuDeviceGetCount - */ -extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count); - -/** - * \brief Returns information about the compute-device - * - * Returns in \p *prop the properties of device \p dev. 
The ::cudaDeviceProp
- * structure is defined as:
- * \code
- struct cudaDeviceProp {
- char name[256];
- size_t totalGlobalMem;
- size_t sharedMemPerBlock;
- int regsPerBlock;
- int warpSize;
- size_t memPitch;
- int maxThreadsPerBlock;
- int maxThreadsDim[3];
- int maxGridSize[3];
- int clockRate;
- size_t totalConstMem;
- int major;
- int minor;
- size_t textureAlignment;
- size_t texturePitchAlignment;
- int deviceOverlap;
- int multiProcessorCount;
- int kernelExecTimeoutEnabled;
- int integrated;
- int canMapHostMemory;
- int computeMode;
- int maxTexture1D;
- int maxTexture1DMipmap;
- int maxTexture1DLinear;
- int maxTexture2D[2];
- int maxTexture2DMipmap[2];
- int maxTexture2DLinear[3];
- int maxTexture2DGather[2];
- int maxTexture3D[3];
- int maxTexture3DAlt[3];
- int maxTextureCubemap;
- int maxTexture1DLayered[2];
- int maxTexture2DLayered[3];
- int maxTextureCubemapLayered[2];
- int maxSurface1D;
- int maxSurface2D[2];
- int maxSurface3D[3];
- int maxSurface1DLayered[2];
- int maxSurface2DLayered[3];
- int maxSurfaceCubemap;
- int maxSurfaceCubemapLayered[2];
- size_t surfaceAlignment;
- int concurrentKernels;
- int ECCEnabled;
- int pciBusID;
- int pciDeviceID;
- int pciDomainID;
- int tccDriver;
- int asyncEngineCount;
- int unifiedAddressing;
- int memoryClockRate;
- int memoryBusWidth;
- int l2CacheSize;
- int maxThreadsPerMultiProcessor;
- int streamPrioritiesSupported;
- int globalL1CacheSupported;
- int localL1CacheSupported;
- size_t sharedMemPerMultiprocessor;
- int regsPerMultiprocessor;
- int managedMemory;
- int isMultiGpuBoard;
- int multiGpuBoardGroupID;
- int singleToDoublePrecisionPerfRatio;
- int pageableMemoryAccess;
- int concurrentManagedAccess;
- int computePreemptionSupported;
- int canUseHostPointerForRegisteredMem;
- int cooperativeLaunch;
- int cooperativeMultiDeviceLaunch;
- }
- \endcode
- * where:
- * - \ref ::cudaDeviceProp::name "name[256]" is an ASCII string identifying
- * the device;
- * - \ref ::cudaDeviceProp::totalGlobalMem "totalGlobalMem" is the total
- * amount of global memory available on the device in bytes;
- * - \ref ::cudaDeviceProp::sharedMemPerBlock "sharedMemPerBlock" is the
- * maximum amount of shared memory available to a thread block in bytes;
- * - \ref ::cudaDeviceProp::regsPerBlock "regsPerBlock" is the maximum number
- * of 32-bit registers available to a thread block;
- * - \ref ::cudaDeviceProp::warpSize "warpSize" is the warp size in threads;
- * - \ref ::cudaDeviceProp::memPitch "memPitch" is the maximum pitch in
- * bytes allowed by the memory copy functions that involve memory regions
- * allocated through ::cudaMallocPitch();
- * - \ref ::cudaDeviceProp::maxThreadsPerBlock "maxThreadsPerBlock" is the
- * maximum number of threads per block;
- * - \ref ::cudaDeviceProp::maxThreadsDim "maxThreadsDim[3]" contains the
- * maximum size of each dimension of a block;
- * - \ref ::cudaDeviceProp::maxGridSize "maxGridSize[3]" contains the
- * maximum size of each dimension of a grid;
- * - \ref ::cudaDeviceProp::clockRate "clockRate" is the clock frequency in
- * kilohertz;
- * - \ref ::cudaDeviceProp::totalConstMem "totalConstMem" is the total amount
- * of constant memory available on the device in bytes;
- * - \ref ::cudaDeviceProp::major "major",
- * \ref ::cudaDeviceProp::minor "minor" are the major and minor revision
- * numbers defining the device's compute capability;
- * - \ref ::cudaDeviceProp::textureAlignment "textureAlignment" is the
- * alignment requirement; texture base addresses that are aligned to
- * \ref ::cudaDeviceProp::textureAlignment "textureAlignment" bytes do not
- * need an offset applied to texture fetches;
- * - \ref ::cudaDeviceProp::texturePitchAlignment "texturePitchAlignment" is the
- * pitch alignment requirement for 2D texture references that are bound to
- * pitched memory;
- * - \ref ::cudaDeviceProp::deviceOverlap "deviceOverlap" is 1 if the device
- * can concurrently copy memory between host and device while executing a
- * kernel, or 0 if not. Deprecated, use asyncEngineCount instead.
- * - \ref ::cudaDeviceProp::multiProcessorCount "multiProcessorCount" is the
- * number of multiprocessors on the device;
- * - \ref ::cudaDeviceProp::kernelExecTimeoutEnabled "kernelExecTimeoutEnabled"
- * is 1 if there is a run time limit for kernels executed on the device, or
- * 0 if not.
- * - \ref ::cudaDeviceProp::integrated "integrated" is 1 if the device is an
- * integrated (motherboard) GPU and 0 if it is a discrete (card) component.
- * - \ref ::cudaDeviceProp::canMapHostMemory "canMapHostMemory" is 1 if the
- * device can map host memory into the CUDA address space for use with
- * ::cudaHostAlloc()/::cudaHostGetDevicePointer(), or 0 if not;
- * - \ref ::cudaDeviceProp::computeMode "computeMode" is the compute mode
- * that the device is currently in. Available modes are as follows:
- * - cudaComputeModeDefault: Default mode - Device is not restricted and
- * multiple threads can use ::cudaSetDevice() with this device.
- * - cudaComputeModeExclusive: Compute-exclusive mode - Only one thread will
- * be able to use ::cudaSetDevice() with this device.
- * - cudaComputeModeProhibited: Compute-prohibited mode - No threads can use
- * ::cudaSetDevice() with this device.
- * - cudaComputeModeExclusiveProcess: Compute-exclusive-process mode - Many
- * threads in one process will be able to use ::cudaSetDevice() with this device.
- * If ::cudaSetDevice() is called on an already occupied \p device with
- * computeMode ::cudaComputeModeExclusive, ::cudaErrorDeviceAlreadyInUse
- * will be immediately returned indicating the device cannot be used.
- * When an occupied exclusive mode device is chosen with ::cudaSetDevice,
- * all subsequent non-device management runtime functions will return
- * ::cudaErrorDevicesUnavailable.
- * - \ref ::cudaDeviceProp::maxTexture1D "maxTexture1D" is the maximum 1D
- * texture size.
- * - \ref ::cudaDeviceProp::maxTexture1DMipmap "maxTexture1DMipmap" is the maximum
- * 1D mipmapped texture size.
- * - \ref ::cudaDeviceProp::maxTexture1DLinear "maxTexture1DLinear" is the maximum
- * 1D texture size for textures bound to linear memory.
- * - \ref ::cudaDeviceProp::maxTexture2D "maxTexture2D[2]" contains the maximum
- * 2D texture dimensions.
- * - \ref ::cudaDeviceProp::maxTexture2DMipmap "maxTexture2DMipmap[2]" contains the
- * maximum 2D mipmapped texture dimensions.
- * - \ref ::cudaDeviceProp::maxTexture2DLinear "maxTexture2DLinear[3]" contains the
- * maximum 2D texture dimensions for 2D textures bound to pitch linear memory.
- * - \ref ::cudaDeviceProp::maxTexture2DGather "maxTexture2DGather[2]" contains the
- * maximum 2D texture dimensions if texture gather operations have to be performed.
- * - \ref ::cudaDeviceProp::maxTexture3D "maxTexture3D[3]" contains the maximum
- * 3D texture dimensions.
- * - \ref ::cudaDeviceProp::maxTexture3DAlt "maxTexture3DAlt[3]"
- * contains the maximum alternate 3D texture dimensions.
- * - \ref ::cudaDeviceProp::maxTextureCubemap "maxTextureCubemap" is the
- * maximum cubemap texture width or height.
- * - \ref ::cudaDeviceProp::maxTexture1DLayered "maxTexture1DLayered[2]" contains
- * the maximum 1D layered texture dimensions.
- * - \ref ::cudaDeviceProp::maxTexture2DLayered "maxTexture2DLayered[3]" contains
- * the maximum 2D layered texture dimensions.
- * - \ref ::cudaDeviceProp::maxTextureCubemapLayered "maxTextureCubemapLayered[2]"
- * contains the maximum cubemap layered texture dimensions.
- * - \ref ::cudaDeviceProp::maxSurface1D "maxSurface1D" is the maximum 1D
- * surface size.
- * - \ref ::cudaDeviceProp::maxSurface2D "maxSurface2D[2]" contains the maximum
- * 2D surface dimensions.
- * - \ref ::cudaDeviceProp::maxSurface3D "maxSurface3D[3]" contains the maximum
- * 3D surface dimensions.
- * - \ref ::cudaDeviceProp::maxSurface1DLayered "maxSurface1DLayered[2]" contains
- * the maximum 1D layered surface dimensions.
- * - \ref ::cudaDeviceProp::maxSurface2DLayered "maxSurface2DLayered[3]" contains
- * the maximum 2D layered surface dimensions.
- * - \ref ::cudaDeviceProp::maxSurfaceCubemap "maxSurfaceCubemap" is the maximum
- * cubemap surface width or height.
- * - \ref ::cudaDeviceProp::maxSurfaceCubemapLayered "maxSurfaceCubemapLayered[2]"
- * contains the maximum cubemap layered surface dimensions.
- * - \ref ::cudaDeviceProp::surfaceAlignment "surfaceAlignment" specifies the
- * alignment requirements for surfaces.
- * - \ref ::cudaDeviceProp::concurrentKernels "concurrentKernels" is 1 if the
- * device supports executing multiple kernels within the same context
- * simultaneously, or 0 if not. It is not guaranteed that multiple kernels
- * will be resident on the device concurrently so this feature should not be
- * relied upon for correctness;
- * - \ref ::cudaDeviceProp::ECCEnabled "ECCEnabled" is 1 if the device has ECC
- * support turned on, or 0 if not.
- * - \ref ::cudaDeviceProp::pciBusID "pciBusID" is the PCI bus identifier of
- * the device.
- * - \ref ::cudaDeviceProp::pciDeviceID "pciDeviceID" is the PCI device
- * (sometimes called slot) identifier of the device.
- * - \ref ::cudaDeviceProp::pciDomainID "pciDomainID" is the PCI domain identifier
- * of the device.
- * - \ref ::cudaDeviceProp::tccDriver "tccDriver" is 1 if the device is using a
- * TCC driver or 0 if not.
- * - \ref ::cudaDeviceProp::asyncEngineCount "asyncEngineCount" is 1 when the
- * device can concurrently copy memory between host and device while executing
- * a kernel. It is 2 when the device can concurrently copy memory between host
- * and device in both directions and execute a kernel at the same time. It is
- * 0 if neither of these is supported.
- * - \ref ::cudaDeviceProp::unifiedAddressing "unifiedAddressing" is 1 if the device
- * shares a unified address space with the host and 0 otherwise.
- * - \ref ::cudaDeviceProp::memoryClockRate "memoryClockRate" is the peak memory
- * clock frequency in kilohertz.
- * - \ref ::cudaDeviceProp::memoryBusWidth "memoryBusWidth" is the memory bus width
- * in bits.
- * - \ref ::cudaDeviceProp::l2CacheSize "l2CacheSize" is L2 cache size in bytes.
- * - \ref ::cudaDeviceProp::maxThreadsPerMultiProcessor "maxThreadsPerMultiProcessor"
- * is the maximum number of resident threads per multiprocessor.
- * - \ref ::cudaDeviceProp::streamPrioritiesSupported "streamPrioritiesSupported"
- * is 1 if the device supports stream priorities, or 0 if it is not supported.
- * - \ref ::cudaDeviceProp::globalL1CacheSupported "globalL1CacheSupported"
- * is 1 if the device supports caching of globals in L1 cache, or 0 if it is not supported.
- * - \ref ::cudaDeviceProp::localL1CacheSupported "localL1CacheSupported"
- * is 1 if the device supports caching of locals in L1 cache, or 0 if it is not supported.
- * - \ref ::cudaDeviceProp::sharedMemPerMultiprocessor "sharedMemPerMultiprocessor" is the
- * maximum amount of shared memory available to a multiprocessor in bytes; this amount is
- * shared by all thread blocks simultaneously resident on a multiprocessor;
- * - \ref ::cudaDeviceProp::regsPerMultiprocessor "regsPerMultiprocessor" is the maximum number
- * of 32-bit registers available to a multiprocessor; this number is shared
- * by all thread blocks simultaneously resident on a multiprocessor;
- * - \ref ::cudaDeviceProp::managedMemory "managedMemory"
- * is 1 if the device supports allocating managed memory on this system, or 0 if it is not supported.
- * - \ref ::cudaDeviceProp::isMultiGpuBoard "isMultiGpuBoard"
- * is 1 if the device is on a multi-GPU board (e.g. Gemini cards), and 0 if not;
- * - \ref ::cudaDeviceProp::multiGpuBoardGroupID "multiGpuBoardGroupID" is a unique identifier
- * for a group of devices associated with the same board.
- * Devices on the same multi-GPU board will share the same identifier;
- * - \ref ::cudaDeviceProp::singleToDoublePrecisionPerfRatio "singleToDoublePrecisionPerfRatio"
- * is the ratio of single precision performance (in floating-point operations per second)
- * to double precision performance.
- * - \ref ::cudaDeviceProp::pageableMemoryAccess "pageableMemoryAccess" is 1 if the device supports
- * coherently accessing pageable memory without calling cudaHostRegister on it, and 0 otherwise.
- * - \ref ::cudaDeviceProp::concurrentManagedAccess "concurrentManagedAccess" is 1 if the device can
- * coherently access managed memory concurrently with the CPU, and 0 otherwise.
- * - \ref ::cudaDeviceProp::computePreemptionSupported "computePreemptionSupported" is 1 if the device - * supports Compute Preemption, and 0 otherwise. - * - \ref ::cudaDeviceProp::canUseHostPointerForRegisteredMem "canUseHostPointerForRegisteredMem" is 1 if - * the device can access host registered memory at the same virtual address as the CPU, and 0 otherwise. - * - \ref ::cudaDeviceProp::cooperativeLaunch "cooperativeLaunch" is 1 if the device supports launching - * cooperative kernels via ::cudaLaunchCooperativeKernel, and 0 otherwise. - * - \ref ::cudaDeviceProp::cooperativeMultiDeviceLaunch "cooperativeMultiDeviceLaunch" is 1 if the device - * supports launching cooperative kernels via ::cudaLaunchCooperativeKernelMultiDevice, and 0 otherwise. - * - * \param prop - Properties for the specified device - * \param device - Device number to get properties for - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidDevice - * - * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice, ::cudaChooseDevice, - * ::cudaDeviceGetAttribute, - * ::cuDeviceGetAttribute, - * ::cuDeviceGetName - */ -extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device); - -/** - * \brief Returns information about the device - * - * Returns in \p *value the integer value of the attribute \p attr on device - * \p device. The supported attributes are: - * - ::cudaDevAttrMaxThreadsPerBlock: Maximum number of threads per block; - * - ::cudaDevAttrMaxBlockDimX: Maximum x-dimension of a block; - * - ::cudaDevAttrMaxBlockDimY: Maximum y-dimension of a block; - * - ::cudaDevAttrMaxBlockDimZ: Maximum z-dimension of a block; - * - ::cudaDevAttrMaxGridDimX: Maximum x-dimension of a grid; - * - ::cudaDevAttrMaxGridDimY: Maximum y-dimension of a grid; - * - ::cudaDevAttrMaxGridDimZ: Maximum z-dimension of a grid; - * - ::cudaDevAttrMaxSharedMemoryPerBlock: Maximum amount of shared memory - * available to a thread block in bytes; - * - ::cudaDevAttrTotalConstantMemory: Memory available on device for - * __constant__ variables in a CUDA C kernel in bytes; - * - ::cudaDevAttrWarpSize: Warp size in threads; - * - ::cudaDevAttrMaxPitch: Maximum pitch in bytes allowed by the memory copy - * functions that involve memory regions allocated through ::cudaMallocPitch(); - * - ::cudaDevAttrMaxTexture1DWidth: Maximum 1D texture width; - * - ::cudaDevAttrMaxTexture1DLinearWidth: Maximum width for a 1D texture bound - * to linear memory; - * - ::cudaDevAttrMaxTexture1DMipmappedWidth: Maximum mipmapped 1D texture width; - * - ::cudaDevAttrMaxTexture2DWidth: Maximum 2D texture width; - * - ::cudaDevAttrMaxTexture2DHeight: Maximum 2D texture height; - * - ::cudaDevAttrMaxTexture2DLinearWidth: Maximum width for a 2D texture - * bound to linear memory; - * - ::cudaDevAttrMaxTexture2DLinearHeight: Maximum height for a 2D texture - * bound to linear memory; - * - ::cudaDevAttrMaxTexture2DLinearPitch: Maximum pitch in bytes for a 2D - * texture bound to linear memory; - * - ::cudaDevAttrMaxTexture2DMipmappedWidth: Maximum mipmapped 2D texture - * width; - * - ::cudaDevAttrMaxTexture2DMipmappedHeight: Maximum mipmapped 2D texture - * height; - * - ::cudaDevAttrMaxTexture3DWidth: Maximum 3D texture width; - * - ::cudaDevAttrMaxTexture3DHeight: Maximum 3D texture height; - * - ::cudaDevAttrMaxTexture3DDepth: Maximum 3D texture depth; - * - ::cudaDevAttrMaxTexture3DWidthAlt: Alternate maximum 3D texture width, - * 0 if no alternate maximum 3D texture size is supported; - * - 
::cudaDevAttrMaxTexture3DHeightAlt: Alternate maximum 3D texture height,
- * 0 if no alternate maximum 3D texture size is supported;
- * - ::cudaDevAttrMaxTexture3DDepthAlt: Alternate maximum 3D texture depth,
- * 0 if no alternate maximum 3D texture size is supported;
- * - ::cudaDevAttrMaxTextureCubemapWidth: Maximum cubemap texture width or
- * height;
- * - ::cudaDevAttrMaxTexture1DLayeredWidth: Maximum 1D layered texture width;
- * - ::cudaDevAttrMaxTexture1DLayeredLayers: Maximum layers in a 1D layered
- * texture;
- * - ::cudaDevAttrMaxTexture2DLayeredWidth: Maximum 2D layered texture width;
- * - ::cudaDevAttrMaxTexture2DLayeredHeight: Maximum 2D layered texture height;
- * - ::cudaDevAttrMaxTexture2DLayeredLayers: Maximum layers in a 2D layered
- * texture;
- * - ::cudaDevAttrMaxTextureCubemapLayeredWidth: Maximum cubemap layered
- * texture width or height;
- * - ::cudaDevAttrMaxTextureCubemapLayeredLayers: Maximum layers in a cubemap
- * layered texture;
- * - ::cudaDevAttrMaxSurface1DWidth: Maximum 1D surface width;
- * - ::cudaDevAttrMaxSurface2DWidth: Maximum 2D surface width;
- * - ::cudaDevAttrMaxSurface2DHeight: Maximum 2D surface height;
- * - ::cudaDevAttrMaxSurface3DWidth: Maximum 3D surface width;
- * - ::cudaDevAttrMaxSurface3DHeight: Maximum 3D surface height;
- * - ::cudaDevAttrMaxSurface3DDepth: Maximum 3D surface depth;
- * - ::cudaDevAttrMaxSurface1DLayeredWidth: Maximum 1D layered surface width;
- * - ::cudaDevAttrMaxSurface1DLayeredLayers: Maximum layers in a 1D layered
- * surface;
- * - ::cudaDevAttrMaxSurface2DLayeredWidth: Maximum 2D layered surface width;
- * - ::cudaDevAttrMaxSurface2DLayeredHeight: Maximum 2D layered surface height;
- * - ::cudaDevAttrMaxSurface2DLayeredLayers: Maximum layers in a 2D layered
- * surface;
- * - ::cudaDevAttrMaxSurfaceCubemapWidth: Maximum cubemap surface width;
- * - ::cudaDevAttrMaxSurfaceCubemapLayeredWidth: Maximum cubemap layered
- * surface width;
- * - ::cudaDevAttrMaxSurfaceCubemapLayeredLayers: Maximum layers in a cubemap
- * layered surface;
- * - ::cudaDevAttrMaxRegistersPerBlock: Maximum number of 32-bit registers
- * available to a thread block;
- * - ::cudaDevAttrClockRate: Peak clock frequency in kilohertz;
- * - ::cudaDevAttrTextureAlignment: Alignment requirement; texture base
- * addresses aligned to this number of bytes do not need an offset applied
- * to texture fetches;
- * - ::cudaDevAttrTexturePitchAlignment: Pitch alignment requirement for 2D
- * texture references bound to pitched memory;
- * - ::cudaDevAttrGpuOverlap: 1 if the device can concurrently copy memory
- * between host and device while executing a kernel, or 0 if not;
- * - ::cudaDevAttrMultiProcessorCount: Number of multiprocessors on the device;
- * - ::cudaDevAttrKernelExecTimeout: 1 if there is a run time limit for kernels
- * executed on the device, or 0 if not;
- * - ::cudaDevAttrIntegrated: 1 if the device is integrated with the memory
- * subsystem, or 0 if not;
- * - ::cudaDevAttrCanMapHostMemory: 1 if the device can map host memory into
- * the CUDA address space, or 0 if not;
- * - ::cudaDevAttrComputeMode: The compute mode that the device
- * is currently in. Available modes are as follows:
- * - ::cudaComputeModeDefault: Default mode - Device is not restricted and
- * multiple threads can use ::cudaSetDevice() with this device.
- * - ::cudaComputeModeExclusive: Compute-exclusive mode - Only one thread will
- * be able to use ::cudaSetDevice() with this device.
- * - ::cudaComputeModeProhibited: Compute-prohibited mode - No threads can use - * ::cudaSetDevice() with this device. - * - ::cudaComputeModeExclusiveProcess: Compute-exclusive-process mode - Many - * threads in one process will be able to use ::cudaSetDevice() with this - * device. - * - ::cudaDevAttrConcurrentKernels: 1 if the device supports executing - * multiple kernels within the same context simultaneously, or 0 if - * not. It is not guaranteed that multiple kernels will be resident on the - * device concurrently so this feature should not be relied upon for - * correctness; - * - ::cudaDevAttrEccEnabled: 1 if error correction is enabled on the device, - * 0 if error correction is disabled or not supported by the device; - * - ::cudaDevAttrPciBusId: PCI bus identifier of the device; - * - ::cudaDevAttrPciDeviceId: PCI device (also known as slot) identifier of - * the device; - * - ::cudaDevAttrTccDriver: 1 if the device is using a TCC driver. TCC is only - * available on Tesla hardware running Windows Vista or later; - * - ::cudaDevAttrMemoryClockRate: Peak memory clock frequency in kilohertz; - * - ::cudaDevAttrGlobalMemoryBusWidth: Global memory bus width in bits; - * - ::cudaDevAttrL2CacheSize: Size of L2 cache in bytes. 0 if the device - * doesn't have L2 cache; - * - ::cudaDevAttrMaxThreadsPerMultiProcessor: Maximum resident threads per - * multiprocessor; - * - ::cudaDevAttrUnifiedAddressing: 1 if the device shares a unified address - * space with the host, or 0 if not; - * - ::cudaDevAttrComputeCapabilityMajor: Major compute capability version - * number; - * - ::cudaDevAttrComputeCapabilityMinor: Minor compute capability version - * number; - * - ::cudaDevAttrStreamPrioritiesSupported: 1 if the device supports stream - * priorities, or 0 if not; - * - ::cudaDevAttrGlobalL1CacheSupported: 1 if device supports caching globals - * in L1 cache, 0 if not; - * - ::cudaDevAttrLocalL1CacheSupported: 1 if device supports caching locals - * in L1 cache, 0 if not; - * - ::cudaDevAttrMaxSharedMemoryPerMultiprocessor: Maximum amount of shared memory - * available to a multiprocessor in bytes; this amount is shared by all - * thread blocks simultaneously resident on a multiprocessor; - * - ::cudaDevAttrMaxRegistersPerMultiprocessor: Maximum number of 32-bit registers - * available to a multiprocessor; this number is shared by all thread blocks - * simultaneously resident on a multiprocessor; - * - ::cudaDevAttrManagedMemSupported: 1 if device supports allocating - * managed memory, 0 if not; - * - ::cudaDevAttrIsMultiGpuBoard: 1 if device is on a multi-GPU board, 0 if not; - * - ::cudaDevAttrMultiGpuBoardGroupID: Unique identifier for a group of devices on the - * same multi-GPU board; - * - ::cudaDevAttrHostNativeAtomicSupported: 1 if the link between the device and the - * host supports native atomic operations; - * - ::cudaDevAttrSingleToDoublePrecisionPerfRatio: Ratio of single precision performance - * (in floating-point operations per second) to double precision performance; - * - ::cudaDevAttrPageableMemoryAccess: 1 if the device supports coherently accessing - * pageable memory without calling cudaHostRegister on it, and 0 otherwise. - * - ::cudaDevAttrConcurrentManagedAccess: 1 if the device can coherently access managed - * memory concurrently with the CPU, and 0 otherwise. - * - ::cudaDevAttrComputePreemptionSupported: 1 if the device supports - * Compute Preemption, 0 if not. 
- * - ::cudaDevAttrCanUseHostPointerForRegisteredMem: 1 if the device can access host
- * registered memory at the same virtual address as the CPU, and 0 otherwise.
- * - ::cudaDevAttrCooperativeLaunch: 1 if the device supports launching cooperative kernels
- * via ::cudaLaunchCooperativeKernel, and 0 otherwise.
- * - ::cudaDevAttrCooperativeMultiDeviceLaunch: 1 if the device supports launching cooperative
- * kernels via ::cudaLaunchCooperativeKernelMultiDevice, and 0 otherwise.
- *
- * \param value - Returned device attribute value
- * \param attr - Device attribute to query
- * \param device - Device number to query
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidDevice,
- * ::cudaErrorInvalidValue
- * \notefnerr
- *
- * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice, ::cudaChooseDevice,
- * ::cudaGetDeviceProperties,
- * ::cuDeviceGetAttribute
- */
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device);
-
-/**
- * \brief Queries attributes of the link between two devices.
- *
- * Returns in \p *value the value of the requested attribute \p attr of the
- * link between \p srcDevice and \p dstDevice. The supported attributes are:
- * - ::cudaDevP2PAttrPerformanceRank: A relative value indicating the
- * performance of the link between two devices. Lower value means better
- * performance (0 being the value used for the most performant link).
- * - ::cudaDevP2PAttrAccessSupported: 1 if peer access is enabled.
- * - ::cudaDevP2PAttrNativeAtomicSupported: 1 if native atomic operations over
- * the link are supported.
- *
- * Returns ::cudaErrorInvalidDevice if \p srcDevice or \p dstDevice are not valid
- * or if they represent the same device.
- *
- * Returns ::cudaErrorInvalidValue if \p attr is not valid or if \p value is
- * a null pointer.
- *
- * \param value - Returned value of the requested attribute
- * \param attr - The requested attribute of the link between \p srcDevice and \p dstDevice.
- * \param srcDevice - The source device of the target link.
- * \param dstDevice - The destination device of the target link.
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidDevice,
- * ::cudaErrorInvalidValue
- * \notefnerr
- *
- * \sa ::cudaDeviceEnablePeerAccess,
- * ::cudaDeviceDisablePeerAccess,
- * ::cudaDeviceCanAccessPeer,
- * ::cuDeviceGetP2PAttribute
- */
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr, int srcDevice, int dstDevice);
-
-/**
- * \brief Select compute-device which best matches criteria
- *
- * Returns in \p *device the device which has properties that best match
- * \p *prop.
- *
- * \param device - Device with best match
- * \param prop - Desired device properties
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue
- * \notefnerr
- *
- * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice,
- * ::cudaGetDeviceProperties
- */
-extern __host__ cudaError_t CUDARTAPI cudaChooseDevice(int *device, const struct cudaDeviceProp *prop);
-
-/**
- * \brief Set device to be used for GPU executions
- *
- * Sets \p device as the current device for the calling host thread.
- * Valid device IDs are 0 to (::cudaGetDeviceCount() - 1).
- *
- * Any device memory subsequently allocated from this host thread
- * using ::cudaMalloc(), ::cudaMallocPitch() or ::cudaMallocArray()
- * will be physically resident on \p device.
Any host memory allocated - * from this host thread using ::cudaMallocHost() or ::cudaHostAlloc() - * or ::cudaHostRegister() will have its lifetime associated with - * \p device. Any streams or events created from this host thread will - * be associated with \p device. Any kernels launched from this host - * thread using the <<<>>> operator or ::cudaLaunchKernel() will be executed - * on \p device. - * - * This call may be made from any host thread, to any device, and at - * any time. This function will do no synchronization with the previous - * or new device, and should be considered a very low overhead call. - * - * \param device - Device on which the active host thread should execute the - * device code. - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidDevice, - * ::cudaErrorDeviceAlreadyInUse - * \notefnerr - * - * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaGetDeviceProperties, - * ::cudaChooseDevice, - * ::cuCtxSetCurrent - */ -extern __host__ cudaError_t CUDARTAPI cudaSetDevice(int device); - -/** - * \brief Returns which device is currently being used - * - * Returns in \p *device the current device for the calling host thread. - * - * \param device - Returns the device on which the active host thread - * executes the device code. - * - * \return - * ::cudaSuccess - * \notefnerr - * - * \sa ::cudaGetDeviceCount, ::cudaSetDevice, ::cudaGetDeviceProperties, - * ::cudaChooseDevice, - * ::cuCtxGetCurrent - */ -extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device); - -/** - * \brief Set a list of devices that can be used for CUDA - * - * Sets a list of devices for CUDA execution in priority order using - * \p device_arr. The parameter \p len specifies the number of elements in the - * list. CUDA will try devices from the list sequentially until it finds one - * that works. If this function is not called, or if it is called with a \p len - * of 0, then CUDA will go back to its default behavior of trying devices - * sequentially from a default list containing all of the available CUDA - * devices in the system. If a specified device ID in the list does not exist, - * this function will return ::cudaErrorInvalidDevice. If \p len is not 0 and - * \p device_arr is NULL or if \p len exceeds the number of devices in - * the system, then ::cudaErrorInvalidValue is returned. - * - * \param device_arr - List of devices to try - * \param len - Number of devices in specified list - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidDevice - * \notefnerr - * - * \sa ::cudaGetDeviceCount, ::cudaSetDevice, ::cudaGetDeviceProperties, - * ::cudaSetDeviceFlags, - * ::cudaChooseDevice - */ -extern __host__ cudaError_t CUDARTAPI cudaSetValidDevices(int *device_arr, int len); - -/** - * \brief Sets flags to be used for device executions - * - * Records \p flags as the flags to use when initializing the current - * device. If no device has been made current to the calling thread, - * then \p flags will be applied to the initialization of any device - * initialized by the calling host thread, unless that device has had - * its initialization flags set explicitly by this or any host thread. - * - * If the current device has been set and that device has already been - * initialized then this call will fail with the error - * ::cudaErrorSetOnActiveProcess. In this case it is necessary - * to reset \p device using ::cudaDeviceReset() before the device's - * initialization flags may be set. 
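- *
- * A minimal sketch of the expected ordering, using two of the flags described
- * below; the flags take effect on the first call that initializes the device:
- * \code
- * cudaSetDevice(0);
- * cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync | cudaDeviceMapHost);
- * float *p;
- * cudaMalloc(&p, 1024);  // device is initialized here, with the flags above
- * \endcode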
- * - * The two LSBs of the \p flags parameter can be used to control how the CPU - * thread interacts with the OS scheduler when waiting for results from the - * device. - * - * - ::cudaDeviceScheduleAuto: The default value if the \p flags parameter is - * zero, uses a heuristic based on the number of active CUDA contexts in the - * process \p C and the number of logical processors in the system \p P. If - * \p C \> \p P, then CUDA will yield to other OS threads when waiting for the - * device, otherwise CUDA will not yield while waiting for results and - * actively spin on the processor. - * - ::cudaDeviceScheduleSpin: Instruct CUDA to actively spin when waiting for - * results from the device. This can decrease latency when waiting for the - * device, but may lower the performance of CPU threads if they are performing - * work in parallel with the CUDA thread. - * - ::cudaDeviceScheduleYield: Instruct CUDA to yield its thread when waiting - * for results from the device. This can increase latency when waiting for the - * device, but can increase the performance of CPU threads performing work in - * parallel with the device. - * - ::cudaDeviceScheduleBlockingSync: Instruct CUDA to block the CPU thread - * on a synchronization primitive when waiting for the device to finish work. - * - ::cudaDeviceBlockingSync: Instruct CUDA to block the CPU thread on a - * synchronization primitive when waiting for the device to finish work.
- * \ref deprecated "Deprecated:" This flag was deprecated as of CUDA 4.0 and
- * replaced with ::cudaDeviceScheduleBlockingSync.
- * - ::cudaDeviceMapHost: This flag enables allocating pinned
- * host memory that is accessible to the device. It is implicit for the
- * runtime but may be absent if a context is created using the driver API.
- * If this flag is not set, ::cudaHostGetDevicePointer() will always return
- * a failure code.
- * - ::cudaDeviceLmemResizeToMax: Instruct CUDA to not reduce local memory
- * after resizing local memory for a kernel. This can prevent thrashing by
- * local memory allocations when launching many kernels with high local
- * memory usage at the cost of potentially increased memory usage.
- *
- * \param flags - Parameters for device operation
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidDevice,
- * ::cudaErrorSetOnActiveProcess
- *
- * \sa ::cudaGetDeviceFlags, ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaGetDeviceProperties,
- * ::cudaSetDevice, ::cudaSetValidDevices,
- * ::cudaChooseDevice,
- * ::cuDevicePrimaryCtxSetFlags
- */
-extern __host__ cudaError_t CUDARTAPI cudaSetDeviceFlags( unsigned int flags );
-
-/**
- * \brief Gets the flags for the current device
- *
- * Returns in \p flags the flags for the current device. If there is a
- * current device for the calling thread, and the device has been initialized
- * or flags have been set on that device specifically, the flags for the
- * device are returned. If there is no current device, but flags have been
- * set for the thread with ::cudaSetDeviceFlags, the thread flags are returned.
- * Finally, if there is no current device and no thread flags, the flags for
- * the first device are returned, which may be the default flags. Compare
- * to the behavior of ::cudaSetDeviceFlags.
- *
- * Typically, the flags returned should match the behavior that will be seen
- * if the calling thread uses a device after this call, without any change to
- * the flags or current device in between by this or another thread. Note that
- * if the device is not initialized, it is possible for another thread to
- * change the flags for the current device before it is initialized.
- * Additionally, when using exclusive mode, if this thread has not requested a
- * specific device, it may use a device other than the first device, contrary
- * to the assumption made by this function.
- *
- * If a context has been created via the driver API and is current to the
- * calling thread, the flags for that context are always returned.
- *
- * Flags returned by this function may specifically include ::cudaDeviceMapHost
- * even though it is not accepted by ::cudaSetDeviceFlags because it is
- * implicit in runtime API flags. The reason for this is that the current
- * context may have been created via the driver API in which case the flag is
- * not implicit and may be unset.
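- *
- * A minimal sketch of querying and testing one flag:
- * \code
- * unsigned int flags = 0;
- * cudaGetDeviceFlags(&flags);
- * if (flags & cudaDeviceMapHost) {
- *     // mapped pinned allocations are usable on the current device
- * }
- * \endcode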
- * - * \param flags - Pointer to store the device flags - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidDevice - * - * \sa ::cudaGetDevice, ::cudaGetDeviceProperties, - * ::cudaSetDevice, ::cudaSetDeviceFlags, - * ::cuCtxGetFlags, - * ::cuDevicePrimaryCtxGetState - */ -extern __host__ cudaError_t CUDARTAPI cudaGetDeviceFlags( unsigned int *flags ); -/** @} */ /* END CUDART_DEVICE */ - -/** - * \defgroup CUDART_STREAM Stream Management - * - * ___MANBRIEF___ stream management functions of the CUDA runtime API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the stream management functions of the CUDA runtime - * application programming interface. - * - * @{ - */ - -/** - * \brief Create an asynchronous stream - * - * Creates a new asynchronous stream. - * - * \param pStream - Pointer to new stream identifier - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue - * \notefnerr - * - * \sa ::cudaStreamCreateWithPriority, - * ::cudaStreamCreateWithFlags, - * ::cudaStreamGetPriority, - * ::cudaStreamGetFlags, - * ::cudaStreamQuery, - * ::cudaStreamSynchronize, - * ::cudaStreamWaitEvent, - * ::cudaStreamAddCallback, - * ::cudaStreamDestroy, - * ::cuStreamCreate - */ -extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream); - -/** - * \brief Create an asynchronous stream - * - * Creates a new asynchronous stream. The \p flags argument determines the - * behaviors of the stream. Valid values for \p flags are - * - ::cudaStreamDefault: Default stream creation flag. - * - ::cudaStreamNonBlocking: Specifies that work running in the created - * stream may run concurrently with work in stream 0 (the NULL stream), and that - * the created stream should perform no implicit synchronization with stream 0. - * - * \param pStream - Pointer to new stream identifier - * \param flags - Parameters for stream creation - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue - * \notefnerr - * - * \sa ::cudaStreamCreate, - * ::cudaStreamCreateWithPriority, - * ::cudaStreamGetFlags, - * ::cudaStreamQuery, - * ::cudaStreamSynchronize, - * ::cudaStreamWaitEvent, - * ::cudaStreamAddCallback, - * ::cudaStreamDestroy, - * ::cuStreamCreate - */ -extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags); - -/** - * \brief Create an asynchronous stream with the specified priority - * - * Creates a stream with the specified priority and returns a handle in \p pStream. - * This API alters the scheduler priority of work in the stream. Work in a higher - * priority stream may preempt work already executing in a low priority stream. - * - * \p priority follows a convention where lower numbers represent higher priorities. - * '0' represents default priority. The range of meaningful numerical priorities can - * be queried using ::cudaDeviceGetStreamPriorityRange. If the specified priority is - * outside the numerical range returned by ::cudaDeviceGetStreamPriorityRange, - * it will automatically be clamped to the lowest or the highest number in the range. - * - * \param pStream - Pointer to new stream identifier - * \param flags - Flags for stream creation. See ::cudaStreamCreateWithFlags for a list of valid flags that can be passed - * \param priority - Priority of the stream. Lower numbers represent higher priorities. - * See ::cudaDeviceGetStreamPriorityRange for more information about - * the meaningful stream priorities that can be passed. 
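- *
- * A minimal sketch of creating a stream at the highest meaningful priority
- * (the numerically lowest value returned by ::cudaDeviceGetStreamPriorityRange):
- * \code
- * int leastPriority, greatestPriority;
- * cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
- * cudaStream_t stream;
- * cudaStreamCreateWithPriority(&stream, cudaStreamNonBlocking, greatestPriority);
- * \endcode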
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue
- * \notefnerr
- *
- * \note Stream priorities are supported only on GPUs
- * with compute capability 3.5 or higher.
- *
- * \note In the current implementation, only compute kernels launched in
- * priority streams are affected by the stream's priority. Stream priorities have
- * no effect on host-to-device and device-to-host memory operations.
- *
- * \sa ::cudaStreamCreate,
- * ::cudaStreamCreateWithFlags,
- * ::cudaDeviceGetStreamPriorityRange,
- * ::cudaStreamGetPriority,
- * ::cudaStreamQuery,
- * ::cudaStreamWaitEvent,
- * ::cudaStreamAddCallback,
- * ::cudaStreamSynchronize,
- * ::cudaStreamDestroy,
- * ::cuStreamCreateWithPriority
- */
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags, int priority);
-
-/**
- * \brief Query the priority of a stream
- *
- * Query the priority of a stream. The priority is returned in \p priority.
- * Note that if the stream was created with a priority outside the meaningful
- * numerical range returned by ::cudaDeviceGetStreamPriorityRange,
- * this function returns the clamped priority.
- * See ::cudaStreamCreateWithPriority for details about priority clamping.
- *
- * \param hStream - Handle to the stream to be queried
- * \param priority - Pointer to a signed integer in which the stream's priority is returned
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorInvalidResourceHandle
- * \notefnerr
- *
- * \sa ::cudaStreamCreateWithPriority,
- * ::cudaDeviceGetStreamPriorityRange,
- * ::cudaStreamGetFlags,
- * ::cuStreamGetPriority
- */
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetPriority(cudaStream_t hStream, int *priority);
-
-/**
- * \brief Query the flags of a stream
- *
- * Query the flags of a stream. The flags are returned in \p flags.
- * See ::cudaStreamCreateWithFlags for a list of valid flags.
- *
- * \param hStream - Handle to the stream to be queried
- * \param flags - Pointer to an unsigned integer in which the stream's flags are returned
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorInvalidResourceHandle
- * \note_null_stream
- * \notefnerr
- *
- * \sa ::cudaStreamCreateWithPriority,
- * ::cudaStreamCreateWithFlags,
- * ::cudaStreamGetPriority,
- * ::cuStreamGetFlags
- */
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags);
-
-/**
- * \brief Destroys and cleans up an asynchronous stream
- *
- * Destroys and cleans up the asynchronous stream specified by \p stream.
- *
- * In case the device is still doing work in the stream \p stream
- * when ::cudaStreamDestroy() is called, the function will return immediately
- * and the resources associated with \p stream will be released automatically
- * once the device has completed all work in \p stream.
- * - * \param stream - Stream identifier - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidResourceHandle - * \note_null_stream - * \notefnerr - * - * \sa ::cudaStreamCreate, - * ::cudaStreamCreateWithFlags, - * ::cudaStreamQuery, - * ::cudaStreamWaitEvent, - * ::cudaStreamSynchronize, - * ::cudaStreamAddCallback, - * ::cuStreamDestroy - */ -extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream); - -/** - * \brief Make a compute stream wait on an event - * - * Makes all future work submitted to \p stream wait until \p event reports - * completion before beginning execution. This synchronization will be - * performed efficiently on the device. The event \p event may - * be from a different context than \p stream, in which case this function - * will perform cross-device synchronization. - * - * The stream \p stream will wait only for the completion of the most recent - * host call to ::cudaEventRecord() on \p event. Once this call has returned, - * any functions (including ::cudaEventRecord() and ::cudaEventDestroy()) may be - * called on \p event again, and the subsequent calls will not have any effect - * on \p stream. - * - * If ::cudaEventRecord() has not been called on \p event, this call acts as if - * the record has already completed, and so is a functional no-op. - * - * \param stream - Stream to wait - * \param event - Event to wait on - * \param flags - Parameters for the operation (must be 0) - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidResourceHandle - * \note_null_stream - * \notefnerr - * - * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamQuery, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy, - * ::cuStreamWaitEvent - */ -extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags); - -#ifdef _WIN32 -#define CUDART_CB __stdcall -#else -#define CUDART_CB -#endif - -/** - * Type of stream callback functions. - * \param stream The stream as passed to ::cudaStreamAddCallback, may be NULL. - * \param status ::cudaSuccess or any persistent error on the stream. - * \param userData User parameter provided at registration. - */ -typedef void (CUDART_CB *cudaStreamCallback_t)(cudaStream_t stream, cudaError_t status, void *userData); - -/** - * \brief Add a callback to a compute stream - * - * Adds a callback to be called on the host after all currently enqueued - * items in the stream have completed. For each - * cudaStreamAddCallback call, a callback will be executed exactly once. - * The callback will block later work in the stream until it is finished. - * - * The callback may be passed ::cudaSuccess or an error code. In the event - * of a device error, all subsequently executed callbacks will receive an - * appropriate ::cudaError_t. - * - * Callbacks must not make any CUDA API calls. Attempting to use CUDA APIs - * will result in ::cudaErrorNotPermitted. Callbacks must not perform any - * synchronization that may depend on outstanding device work or other callbacks - * that are not mandated to run earlier. Callbacks without a mandated order - * (in independent streams) execute in undefined order and may be serialized. - * - * For the purposes of Unified Memory, callback execution makes a number of - * guarantees: - *
- *   - The callback stream is considered idle for the duration of the
- *     callback. Thus, for example, a callback may always use memory attached
- *     to the callback stream.
- *   - The start of execution of a callback has the same effect as
- *     synchronizing an event recorded in the same stream immediately prior to
- *     the callback. It thus synchronizes streams which have been "joined"
- *     prior to the callback.
- *   - Adding device work to any stream does not have the effect of making
- *     the stream active until all preceding callbacks have executed. Thus, for
- *     example, a callback might use global attached memory even if work has
- *     been added to another stream, if it has been properly ordered with an
- *     event.
- *   - Completion of a callback does not cause a stream to become
- *     active except as described above. The callback stream will remain idle
- *     if no device work follows the callback, and will remain idle across
- *     consecutive callbacks without device work in between. Thus, for example,
- *     stream synchronization can be done by signaling from a callback at the
- *     end of the stream.
- *
- * \param stream - Stream to add callback to
- * \param callback - The function to call once preceding stream operations are complete
- * \param userData - User specified data to be passed to the callback function
- * \param flags - Reserved for future use, must be 0
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidResourceHandle,
- * ::cudaErrorNotSupported
- * \note_null_stream
- * \notefnerr
- *
- * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamQuery, ::cudaStreamSynchronize, ::cudaStreamWaitEvent, ::cudaStreamDestroy, ::cudaMallocManaged, ::cudaStreamAttachMemAsync,
- * ::cuStreamAddCallback
- */
-extern __host__ cudaError_t CUDARTAPI cudaStreamAddCallback(cudaStream_t stream,
-        cudaStreamCallback_t callback, void *userData, unsigned int flags);
-
-/**
- * \brief Waits for stream tasks to complete
- *
- * Blocks until \p stream has completed all operations. If the
- * ::cudaDeviceScheduleBlockingSync flag was set for this device,
- * the host thread will block until the stream is finished with
- * all of its tasks.
- *
- * \param stream - Stream identifier
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidResourceHandle
- * \note_null_stream
- * \notefnerr
- *
- * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamQuery, ::cudaStreamWaitEvent, ::cudaStreamAddCallback, ::cudaStreamDestroy,
- * ::cuStreamSynchronize
- */
-extern __host__ cudaError_t CUDARTAPI cudaStreamSynchronize(cudaStream_t stream);
-
-/**
- * \brief Queries an asynchronous stream for completion status
- *
- * Returns ::cudaSuccess if all operations in \p stream have
- * completed, or ::cudaErrorNotReady if not.
- *
- * For the purposes of Unified Memory, a return value of ::cudaSuccess
- * is equivalent to having called ::cudaStreamSynchronize().
- *
- * \param stream - Stream identifier
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorNotReady,
- * ::cudaErrorInvalidResourceHandle
- * \note_null_stream
- * \notefnerr
- *
- * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy,
- * ::cuStreamQuery
- */
-extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream);
-
-/**
- * \brief Attach memory to a stream asynchronously
- *
- * Enqueues an operation in \p stream to specify stream association of
- * \p length bytes of memory starting from \p devPtr. This function is a
- * stream-ordered operation, meaning that it is dependent on, and will
- * only take effect when, previous work in stream has completed. Any
- * previous association is automatically replaced.
- *
- * \p devPtr must point to an address within managed memory space declared
- * using the __managed__ keyword or allocated with ::cudaMallocManaged.
- *
- * \p length must be zero, to indicate that the entire allocation's
- * stream association is being changed. Currently, it's not possible
- * to change stream association for a portion of an allocation. The default
- * value for \p length is zero.
- *
- * The stream association is specified using \p flags which must be
- * one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle.
- * The default value for \p flags is ::cudaMemAttachSingle.
- * If the ::cudaMemAttachGlobal flag is specified, the memory can be accessed
- * by any stream on any device.
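A compact sketch of the callback mechanism documented above; the callback body and tag string are illustrative only, and remember that a callback must not call back into the CUDA API:

    #include <stdio.h>
    #include <cuda_runtime.h>

    /* Host function run once all work enqueued in `stream` before
       cudaStreamAddCallback() has completed (or errored). */
    static void CUDART_CB on_batch_done(cudaStream_t stream, cudaError_t status, void *userData)
    {
        printf("batch '%s' finished, status = %d\n", (const char *)userData, (int)status);
    }

    void enqueue_notification(cudaStream_t stream)
    {
        cudaStreamAddCallback(stream, on_batch_done, (void *)"batch-0", 0 /* must be 0 */);
    }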
- * If the ::cudaMemAttachHost flag is specified, the program makes a guarantee - * that it won't access the memory on the device from any stream on a device that - * has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess. - * If the ::cudaMemAttachSingle flag is specified and \p stream is associated with - * a device that has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess, - * the program makes a guarantee that it will only access the memory on the device - * from \p stream. It is illegal to attach singly to the NULL stream, because the - * NULL stream is a virtual global stream and not a specific stream. An error will - * be returned in this case. - * - * When memory is associated with a single stream, the Unified Memory system will - * allow CPU access to this memory region so long as all operations in \p stream - * have completed, regardless of whether other streams are active. In effect, - * this constrains exclusive ownership of the managed memory region by - * an active GPU to per-stream activity instead of whole-GPU activity. - * - * Accessing memory on the device from streams that are not associated with - * it will produce undefined results. No error checking is performed by the - * Unified Memory system to ensure that kernels launched into other streams - * do not access this region. - * - * It is a program's responsibility to order calls to ::cudaStreamAttachMemAsync - * via events, synchronization or other means to ensure legal access to memory - * at all times. Data visibility and coherency will be changed appropriately - * for all kernels which follow a stream-association change. - * - * If \p stream is destroyed while data is associated with it, the association is - * removed and the association reverts to the default visibility of the allocation - * as specified at ::cudaMallocManaged. For __managed__ variables, the default - * association is always ::cudaMemAttachGlobal. Note that destroying a stream is an - * asynchronous operation, and as a result, the change to default association won't - * happen until all work in the stream has completed. - * - * \param stream - Stream in which to enqueue the attach operation - * \param devPtr - Pointer to memory (must be a pointer to managed memory) - * \param length - Length of memory (must be zero, defaults to zero) - * \param flags - Must be one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle (defaults to ::cudaMemAttachSingle) - * - * \return - * ::cudaSuccess, - * ::cudaErrorNotReady, - * ::cudaErrorInvalidValue - * ::cudaErrorInvalidResourceHandle - * \notefnerr - * - * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy, ::cudaMallocManaged, - * ::cuStreamAttachMemAsync - */ -extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length __dv(0), unsigned int flags __dv(cudaMemAttachSingle)); - -/** @} */ /* END CUDART_STREAM */ - -/** - * \defgroup CUDART_EVENT Event Management - * - * ___MANBRIEF___ event management functions of the CUDA runtime API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the event management functions of the CUDA runtime - * application programming interface. - * - * @{ - */ - -/** - * \brief Creates an event object - * - * Creates an event object using ::cudaEventDefault. 
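Looking back at ::cudaStreamAttachMemAsync, the single-stream association pattern it enables reduces to a short sketch; the sizes and names here are arbitrary and error checking is elided:

    #include <cuda_runtime.h>

    /* Attach a managed buffer to one stream so the host can touch it as soon
       as that stream drains, regardless of activity in other streams. */
    float *make_stream_local_buffer(cudaStream_t stream, size_t bytes)
    {
        float *buf = NULL;
        cudaMallocManaged((void **)&buf, bytes, cudaMemAttachHost);
        /* length must be 0: the whole allocation changes association. */
        cudaStreamAttachMemAsync(stream, buf, 0, cudaMemAttachSingle);
        cudaStreamSynchronize(stream);  /* the attach is stream-ordered */
        return buf;
    }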
- *
- * \param event - Newly created event
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInitializationError,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorLaunchFailure,
- * ::cudaErrorMemoryAllocation
- * \notefnerr
- *
- * \sa \ref ::cudaEventCreate(cudaEvent_t*, unsigned int) "cudaEventCreate (C++ API)",
- * ::cudaEventCreateWithFlags, ::cudaEventRecord, ::cudaEventQuery,
- * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
- * ::cudaStreamWaitEvent,
- * ::cuEventCreate
- */
-extern __host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event);
-
-/**
- * \brief Creates an event object with the specified flags
- *
- * Creates an event object with the specified flags. Valid flags include:
- * - ::cudaEventDefault: Default event creation flag.
- * - ::cudaEventBlockingSync: Specifies that event should use blocking
- *   synchronization. A host thread that uses ::cudaEventSynchronize() to wait
- *   on an event created with this flag will block until the event actually
- *   completes.
- * - ::cudaEventDisableTiming: Specifies that the created event does not need
- *   to record timing data. Events created with this flag specified and
- *   the ::cudaEventBlockingSync flag not specified will provide the best
- *   performance when used with ::cudaStreamWaitEvent() and ::cudaEventQuery().
- * - ::cudaEventInterprocess: Specifies that the created event may be used as an
- *   interprocess event by ::cudaIpcGetEventHandle(). ::cudaEventInterprocess must
- *   be specified along with ::cudaEventDisableTiming.
- *
- * \param event - Newly created event
- * \param flags - Flags for new event
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInitializationError,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorLaunchFailure,
- * ::cudaErrorMemoryAllocation
- * \notefnerr
- *
- * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
- * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
- * ::cudaStreamWaitEvent,
- * ::cuEventCreate
- */
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);
-
-/**
- * \brief Records an event
- *
- * Records an event. See note about NULL stream behavior. Since this operation
- * is asynchronous, ::cudaEventQuery() or ::cudaEventSynchronize() must
- * be used to determine when the event has actually been recorded.
- *
- * If ::cudaEventRecord() has previously been called on \p event, then this
- * call will overwrite any existing state in \p event. Any subsequent calls
- * which examine the status of \p event will only examine the completion of
- * this most recent call to ::cudaEventRecord().
- * - * \param event - Event to record - * \param stream - Stream in which to record event - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInitializationError, - * ::cudaErrorInvalidResourceHandle, - * ::cudaErrorLaunchFailure - * \note_null_stream - * \notefnerr - * - * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)", - * ::cudaEventCreateWithFlags, ::cudaEventQuery, - * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime, - * ::cudaStreamWaitEvent, - * ::cuEventRecord - */ -extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0)); - -/** - * \brief Queries an event's status - * - * Query the status of all device work preceding the most recent call to - * ::cudaEventRecord() (in the appropriate compute streams, as specified by the - * arguments to ::cudaEventRecord()). - * - * If this work has successfully been completed by the device, or if - * ::cudaEventRecord() has not been called on \p event, then ::cudaSuccess is - * returned. If this work has not yet been completed by the device then - * ::cudaErrorNotReady is returned. - * - * For the purposes of Unified Memory, a return value of ::cudaSuccess - * is equivalent to having called ::cudaEventSynchronize(). - * - * \param event - Event to query - * - * \return - * ::cudaSuccess, - * ::cudaErrorNotReady, - * ::cudaErrorInitializationError, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidResourceHandle, - * ::cudaErrorLaunchFailure - * \notefnerr - * - * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)", - * ::cudaEventCreateWithFlags, ::cudaEventRecord, - * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime, - * ::cuEventQuery - */ -extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event); - -/** - * \brief Waits for an event to complete - * - * Wait until the completion of all device work preceding the most recent - * call to ::cudaEventRecord() (in the appropriate compute streams, as specified - * by the arguments to ::cudaEventRecord()). - * - * If ::cudaEventRecord() has not been called on \p event, ::cudaSuccess is - * returned immediately. - * - * Waiting for an event that was created with the ::cudaEventBlockingSync - * flag will cause the calling CPU thread to block until the event has - * been completed by the device. If the ::cudaEventBlockingSync flag has - * not been set, then the CPU thread will busy-wait until the event has - * been completed by the device. - * - * \param event - Event to wait for - * - * \return - * ::cudaSuccess, - * ::cudaErrorInitializationError, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidResourceHandle, - * ::cudaErrorLaunchFailure - * \notefnerr - * - * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)", - * ::cudaEventCreateWithFlags, ::cudaEventRecord, - * ::cudaEventQuery, ::cudaEventDestroy, ::cudaEventElapsedTime, - * ::cuEventSynchronize - */ -extern __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event); - -/** - * \brief Destroys an event object - * - * Destroys the event specified by \p event. - * - * In case \p event has been recorded but has not yet been completed - * when ::cudaEventDestroy() is called, the function will return immediately and - * the resources associated with \p event will be released automatically once - * the device has completed \p event. 
- *
- * \param event - Event to destroy
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInitializationError,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorLaunchFailure
- * \notefnerr
- *
- * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
- * ::cudaEventCreateWithFlags, ::cudaEventQuery,
- * ::cudaEventSynchronize, ::cudaEventRecord, ::cudaEventElapsedTime,
- * ::cuEventDestroy
- */
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event);
-
-/**
- * \brief Computes the elapsed time between events
- *
- * Computes the elapsed time between two events (in milliseconds with a
- * resolution of around 0.5 microseconds).
- *
- * If either event was last recorded in a non-NULL stream, the resulting time
- * may be greater than expected (even if both used the same stream handle). This
- * happens because the ::cudaEventRecord() operation takes place asynchronously
- * and there is no guarantee that the measured latency is actually just between
- * the two events. Any number of other different stream operations could execute
- * in between the two measured events, thus altering the timing in a significant
- * way.
- *
- * If ::cudaEventRecord() has not been called on either event, then
- * ::cudaErrorInvalidResourceHandle is returned. If ::cudaEventRecord() has been
- * called on both events but one or both of them has not yet been completed
- * (that is, ::cudaEventQuery() would return ::cudaErrorNotReady on at least one
- * of the events), ::cudaErrorNotReady is returned. If either event was created
- * with the ::cudaEventDisableTiming flag, then this function will return
- * ::cudaErrorInvalidResourceHandle.
- *
- * \param ms - Time between \p start and \p end in ms
- * \param start - Starting event
- * \param end - Ending event
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorNotReady,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorInitializationError,
- * ::cudaErrorInvalidResourceHandle,
- * ::cudaErrorLaunchFailure
- * \notefnerr
- *
- * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
- * ::cudaEventCreateWithFlags, ::cudaEventQuery,
- * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventRecord,
- * ::cuEventElapsedTime
- */
-extern __host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms, cudaEvent_t start, cudaEvent_t end);
-
-/** @} */ /* END CUDART_EVENT */
-
-/**
- * \defgroup CUDART_EXECUTION Execution Control
- *
- * ___MANBRIEF___ execution control functions of the CUDA runtime API
- * (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the execution control functions of the CUDA runtime
- * application programming interface.
- *
- * Some functions have overloaded C++ API template versions documented separately in the
- * \ref CUDART_HIGHLEVEL "C++ API Routines" module.
- *
- * @{
- */
-
-/**
- * \brief Launches a device function
- *
- * The function invokes kernel \p func on \p gridDim (\p gridDim.x × \p gridDim.y
- * × \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x ×
- * \p blockDim.y × \p blockDim.z) threads.
- *
- * If the kernel has N parameters, \p args should point to an array of N pointers.
- * Each pointer, from args[0] to args[N - 1], points to the region
- * of memory from which the actual parameter will be copied.
- *
- * For templated functions, pass the function symbol as follows:
- * func_name<template_arg_0,...,template_arg_N>
- *
- * \p sharedMem sets the amount of dynamic shared memory that will be available to
- * each thread block.
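The event APIs above compose into the usual GPU timing idiom. A minimal sketch (the events are created with default flags, since ::cudaEventDisableTiming would make ::cudaEventElapsedTime fail):

    #include <cuda_runtime.h>

    /* Time whatever is enqueued between the two records, in milliseconds. */
    float time_stream_section(cudaStream_t stream)
    {
        cudaEvent_t start, stop;
        float ms = 0.0f;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start, stream);
        /* ... enqueue kernels or copies into `stream` here ... */
        cudaEventRecord(stop, stream);
        cudaEventSynchronize(stop);             /* wait for `stop` to complete */
        cudaEventElapsedTime(&ms, start, stop); /* ~0.5 us resolution */
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
        return ms;
    }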
- *
- * \p stream specifies a stream the invocation is associated to.
- *
- * \param func - Device function symbol
- * \param gridDim - Grid dimensions
- * \param blockDim - Block dimensions
- * \param args - Arguments
- * \param sharedMem - Shared memory
- * \param stream - Stream identifier
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidDeviceFunction,
- * ::cudaErrorInvalidConfiguration,
- * ::cudaErrorLaunchFailure,
- * ::cudaErrorLaunchTimeout,
- * ::cudaErrorLaunchOutOfResources,
- * ::cudaErrorSharedObjectInitFailed,
- * ::cudaErrorInvalidPtx,
- * ::cudaErrorNoKernelImageForDevice,
- * ::cudaErrorJitCompilerNotFound
- * \note_null_stream
- * \notefnerr
- *
- * \sa
- * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)",
- * ::cuLaunchKernel
- */
-extern __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream);
-
-/**
- * \brief Launches a device function where thread blocks can cooperate and synchronize as they execute
- *
- * The function invokes kernel \p func on \p gridDim (\p gridDim.x × \p gridDim.y
- * × \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x ×
- * \p blockDim.y × \p blockDim.z) threads.
- *
- * The device on which this kernel is invoked must have a non-zero value for
- * the device attribute ::cudaDevAttrCooperativeLaunch.
- *
- * The total number of blocks launched cannot exceed the maximum number of blocks per
- * multiprocessor as returned by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor (or
- * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors
- * as specified by the device attribute ::cudaDevAttrMultiProcessorCount.
- *
- * The kernel cannot make use of CUDA dynamic parallelism.
- *
- * If the kernel has N parameters, \p args should point to an array of N pointers.
- * Each pointer, from args[0] to args[N - 1], points to the region
- * of memory from which the actual parameter will be copied.
- *
- * For templated functions, pass the function symbol as follows:
- * func_name<template_arg_0,...,template_arg_N>
- *
- * \p sharedMem sets the amount of dynamic shared memory that will be available to
- * each thread block.
- *
- * \p stream specifies a stream the invocation is associated to.
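The args-array convention of ::cudaLaunchKernel is worth a concrete sketch. The kernel here is hypothetical and exists only to show that each args[i] holds the address of the i-th parameter, not the parameter itself:

    #include <cuda_runtime.h>

    __global__ void axpy(float a, const float *x, float *y, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            y[i] += a * x[i];
    }

    cudaError_t launch_axpy(float a, const float *x, float *y, int n, cudaStream_t stream)
    {
        void *args[] = { &a, &x, &y, &n };  /* one pointer per kernel parameter */
        dim3 block(256), grid((n + 255) / 256);
        return cudaLaunchKernel((const void *)axpy, grid, block, args, 0, stream);
    }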
- *
- * \param func - Device function symbol
- * \param gridDim - Grid dimensions
- * \param blockDim - Block dimensions
- * \param args - Arguments
- * \param sharedMem - Shared memory
- * \param stream - Stream identifier
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidDeviceFunction,
- * ::cudaErrorInvalidConfiguration,
- * ::cudaErrorLaunchFailure,
- * ::cudaErrorLaunchTimeout,
- * ::cudaErrorLaunchOutOfResources,
- * ::cudaErrorCooperativeLaunchTooLarge,
- * ::cudaErrorSharedObjectInitFailed
- * \note_null_stream
- * \notefnerr
- *
- * \sa
- * \ref ::cudaLaunchCooperativeKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchCooperativeKernel (C++ API)",
- * ::cudaLaunchCooperativeKernelMultiDevice,
- * ::cuLaunchCooperativeKernel
- */
-extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream);
-
-/**
- * \brief Launches device functions on multiple devices where thread blocks can cooperate and synchronize as they execute
- *
- * Invokes kernels as specified in the \p launchParamsList array where each element
- * of the array specifies all the parameters required to perform a single kernel launch.
- * These kernels can cooperate and synchronize as they execute. The size of the array is
- * specified by \p numDevices.
- *
- * No two kernels can be launched on the same device. All the devices targeted by this
- * multi-device launch must be identical. All devices must have a non-zero value for the
- * device attribute ::cudaDevAttrCooperativeLaunch.
- *
- * The same kernel must be launched on all devices. Note that any __device__ or __constant__
- * variables are independently instantiated on every device. It is the application's
- * responsibility to ensure these variables are initialized and used appropriately.
- *
- * The size of the grids as specified in blocks, the size of the blocks themselves and the
- * amount of shared memory used by each thread block must also match across all launched kernels.
- *
- * The streams used to launch these kernels must have been created via either ::cudaStreamCreate
- * or ::cudaStreamCreateWithFlags or ::cudaStreamCreateWithPriority. The NULL stream or
- * ::cudaStreamLegacy or ::cudaStreamPerThread cannot be used.
- *
- * The total number of blocks launched per kernel cannot exceed the maximum number of blocks
- * per multiprocessor as returned by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor (or
- * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors
- * as specified by the device attribute ::cudaDevAttrMultiProcessorCount. Since the
- * total number of blocks launched per device has to match across all devices, the maximum
- * number of blocks that can be launched per device will be limited by the device with the
- * least number of multiprocessors.
- *
- * The kernel cannot make use of CUDA dynamic parallelism.
- *
- * The ::cudaLaunchParams structure is defined as:
- * \code
-        struct cudaLaunchParams
-        {
-            void *func;
-            dim3 gridDim;
-            dim3 blockDim;
-            void **args;
-            size_t sharedMem;
-            cudaStream_t stream;
-        };
- * \endcode
- * where:
- * - ::cudaLaunchParams::func specifies the kernel to be launched. This same function must
- *   be launched on all devices. For templated functions, pass the function symbol as follows:
- *   func_name<template_arg_0,...,template_arg_N>
- * - ::cudaLaunchParams::gridDim specifies the width, height and depth of the grid in blocks.
- *   This must match across all kernels launched.
- * - ::cudaLaunchParams::blockDim is the width, height and depth of each thread block. This
- *   must match across all kernels launched.
- * - ::cudaLaunchParams::args specifies the arguments to the kernel. If the kernel has
- *   N parameters then ::cudaLaunchParams::args should point to an array of N pointers. Each
- *   pointer, from ::cudaLaunchParams::args[0] to ::cudaLaunchParams::args[N - 1],
- *   points to the region of memory from which the actual parameter will be copied.
- * - ::cudaLaunchParams::sharedMem is the dynamic shared-memory size per thread block in bytes.
- *   This must match across all kernels launched.
- * - ::cudaLaunchParams::stream is the handle to the stream to perform the launch in. This cannot
- *   be the NULL stream or ::cudaStreamLegacy or ::cudaStreamPerThread.
- *
- * By default, the kernel won't begin execution on any GPU until all prior work in all the specified
- * streams has completed. This behavior can be overridden by specifying the flag
- * ::cudaCooperativeLaunchMultiDeviceNoPreSync. When this flag is specified, each kernel
- * will only wait for prior work in the stream corresponding to that GPU to complete before it begins
- * execution.
- *
- * Similarly, by default, any subsequent work pushed in any of the specified streams will not begin
- * execution until the kernels on all GPUs have completed. This behavior can be overridden by specifying
- * the flag ::cudaCooperativeLaunchMultiDeviceNoPostSync. When this flag is specified,
- * any subsequent work pushed in any of the specified streams will only wait for the kernel launched
- * on the GPU corresponding to that stream to complete before it begins execution.
- *
- * \param launchParamsList - List of launch parameters, one per device
- * \param numDevices - Size of the \p launchParamsList array
- * \param flags - Flags to control launch behavior
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidDeviceFunction,
- * ::cudaErrorInvalidConfiguration,
- * ::cudaErrorLaunchFailure,
- * ::cudaErrorLaunchTimeout,
- * ::cudaErrorLaunchOutOfResources,
- * ::cudaErrorCooperativeLaunchTooLarge,
- * ::cudaErrorSharedObjectInitFailed
- * \note_null_stream
- * \notefnerr
- *
- * \sa
- * \ref ::cudaLaunchCooperativeKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchCooperativeKernel (C++ API)",
- * ::cudaLaunchCooperativeKernel,
- * ::cuLaunchCooperativeKernelMultiDevice
- */
-extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *launchParamsList, unsigned int numDevices, unsigned int flags __dv(0));
-
-/**
- * \brief Sets the preferred cache configuration for a device function
- *
- * On devices where the L1 cache and shared memory use the same hardware
- * resources, this sets through \p cacheConfig the preferred cache configuration
- * for the function specified via \p func. This is only a preference. The
- * runtime will use the requested configuration if possible, but it is free to
- * choose a different configuration if required to execute \p func.
- *
- * \p func is a device function symbol and must be declared as a
- * \c __global__ function. If the specified function does not exist,
- * then ::cudaErrorInvalidDeviceFunction is returned. For templated functions,
- * pass the function symbol as follows: func_name<template_arg_0,...,template_arg_N>
- *
- * This setting does nothing on devices where the size of the L1 cache and
- * shared memory are fixed.
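Populating ::cudaLaunchParams is mostly bookkeeping; the sketch below fills one entry per device under the matching rules listed above. The kernel pointer, stream array, and per-device argument arrays are assumptions, set up elsewhere by the caller:

    #include <cuda_runtime.h>

    #define MAX_DEVICES 8

    cudaError_t launch_everywhere(const void *coopKernel, cudaStream_t *streams,
                                  void ***argsPerDevice, int numDevices)
    {
        struct cudaLaunchParams params[MAX_DEVICES];
        for (int i = 0; i < numDevices && i < MAX_DEVICES; ++i) {
            params[i].func      = (void *)coopKernel;  /* same kernel everywhere */
            params[i].gridDim   = dim3(64);            /* must match on all devices */
            params[i].blockDim  = dim3(256);           /* must match on all devices */
            params[i].args      = argsPerDevice[i];
            params[i].sharedMem = 0;                   /* must match on all devices */
            params[i].stream    = streams[i];          /* never the NULL stream */
        }
        return cudaLaunchCooperativeKernelMultiDevice(params, (unsigned int)numDevices, 0);
    }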
- *
- * Launching a kernel with a different preference than the most recent
- * preference setting may insert a device-side synchronization point.
- *
- * The supported cache configurations are:
- * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
- * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
- * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
- * - ::cudaFuncCachePreferEqual: prefer equal size L1 cache and shared memory
- *
- * \param func - Device function symbol
- * \param cacheConfig - Requested cache configuration
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInitializationError,
- * ::cudaErrorInvalidDeviceFunction
- * \notefnerr
- * \note_string_api_deprecation2
- *
- * \sa ::cudaConfigureCall,
- * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
- * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
- * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)",
- * ::cudaSetDoubleForDevice,
- * ::cudaSetDoubleForHost,
- * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)",
- * ::cudaThreadGetCacheConfig,
- * ::cudaThreadSetCacheConfig,
- * ::cuFuncSetCacheConfig
- */
-extern __host__ cudaError_t CUDARTAPI cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig);
-
-/**
- * \brief Sets the shared memory configuration for a device function
- *
- * On devices with configurable shared memory banks, this function will
- * force all subsequent launches of the specified device function to have
- * the given shared memory bank size configuration. On any given launch of the
- * function, the shared memory configuration of the device will be temporarily
- * changed if needed to suit the function's preferred configuration. Changes in
- * shared memory configuration between subsequent launches of functions
- * may introduce a device-side synchronization point.
- *
- * Any per-function setting of shared memory bank size set via
- * ::cudaFuncSetSharedMemConfig will override the device-wide setting set by
- * ::cudaDeviceSetSharedMemConfig.
- *
- * Changing the shared memory bank size will not increase shared memory usage
- * or affect occupancy of kernels, but may have major effects on performance.
- * Larger bank sizes will allow for greater potential bandwidth to shared memory,
- * but will change what kinds of accesses to shared memory will result in bank
- * conflicts.
- *
- * This function will do nothing on devices with fixed shared memory bank size.
- *
- * For templated functions, pass the function symbol as follows:
- * func_name<template_arg_0,...,template_arg_N>
- *
- * The supported bank configurations are:
- * - ::cudaSharedMemBankSizeDefault: use the device's shared memory configuration
- *   when launching this function.
- * - ::cudaSharedMemBankSizeFourByte: set shared memory bank width to be
- *   four bytes natively when launching this function.
- * - ::cudaSharedMemBankSizeEightByte: set shared memory bank width to be eight
- *   bytes natively when launching this function.
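Both per-function preferences are one-liners in practice. A sketch with a placeholder kernel; the specific choices below are illustrative, and both calls are hints the runtime may ignore on fixed-configuration hardware:

    #include <cuda_runtime.h>

    __global__ void stencil_kernel(double *out) { out[threadIdx.x] = 0.0; }

    void tune_stencil(void)
    {
        /* Favor shared memory over L1 for this function... */
        cudaFuncSetCacheConfig((const void *)stencil_kernel, cudaFuncCachePreferShared);
        /* ...and use 8-byte banks, which suits double-typed shared tiles. */
        cudaFuncSetSharedMemConfig((const void *)stencil_kernel, cudaSharedMemBankSizeEightByte);
    }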
- *
- * \param func - Device function symbol
- * \param config - Requested shared memory configuration
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInitializationError,
- * ::cudaErrorInvalidDeviceFunction,
- * ::cudaErrorInvalidValue,
- * \notefnerr
- * \note_string_api_deprecation2
- *
- * \sa ::cudaConfigureCall,
- * ::cudaDeviceSetSharedMemConfig,
- * ::cudaDeviceGetSharedMemConfig,
- * ::cudaDeviceSetCacheConfig,
- * ::cudaDeviceGetCacheConfig,
- * ::cudaFuncSetCacheConfig,
- * ::cuFuncSetSharedMemConfig
- */
-extern __host__ cudaError_t CUDARTAPI cudaFuncSetSharedMemConfig(const void *func, enum cudaSharedMemConfig config);
-
-/**
- * \brief Find out attributes for a given function
- *
- * This function obtains the attributes of a function specified via \p func.
- * \p func is a device function symbol and must be declared as a
- * \c __global__ function. The fetched attributes are placed in \p attr.
- * If the specified function does not exist, then
- * ::cudaErrorInvalidDeviceFunction is returned. For templated functions, pass
- * the function symbol as follows: func_name<template_arg_0,...,template_arg_N>
- *
- * Note that some function attributes such as
- * \ref ::cudaFuncAttributes::maxThreadsPerBlock "maxThreadsPerBlock"
- * may vary based on the device that is currently being used.
- *
- * \param attr - Return pointer to function's attributes
- * \param func - Device function symbol
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInitializationError,
- * ::cudaErrorInvalidDeviceFunction
- * \notefnerr
- * \note_string_api_deprecation2
- *
- * \sa ::cudaConfigureCall,
- * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
- * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)",
- * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)",
- * ::cudaSetDoubleForDevice,
- * ::cudaSetDoubleForHost,
- * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)",
- * ::cuFuncGetAttribute
- */
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func);
-
-
-/**
- * \brief Set attributes for a given function
- *
- * This function sets the attributes of a function specified via \p func.
- * The parameter \p func must be a pointer to a function that executes
- * on the device. The parameter specified by \p func must be declared as a \p __global__
- * function. The enumeration defined by \p attr is set to the value defined by \p value.
- * If the specified function does not exist, then ::cudaErrorInvalidDeviceFunction is returned.
- * If the specified attribute cannot be written, or if the value is incorrect,
- * then ::cudaErrorInvalidValue is returned.
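A sketch of querying attributes with ::cudaFuncGetAttributes, paired with a ::cudaFuncSetAttribute call (its valid attribute values are listed just below); the kernel and the 64 KiB figure are illustrative only:

    #include <stdio.h>
    #include <cuda_runtime.h>

    __global__ void big_smem_kernel(float *out)
    {
        extern __shared__ float tile[];
        out[threadIdx.x] = tile[threadIdx.x];
    }

    void inspect_and_extend(void)
    {
        struct cudaFuncAttributes attr;
        cudaFuncGetAttributes(&attr, (const void *)big_smem_kernel);
        printf("maxThreadsPerBlock = %d, numRegs = %d\n", attr.maxThreadsPerBlock, attr.numRegs);
        /* Opt this kernel into a larger dynamic shared-memory limit. */
        cudaFuncSetAttribute((const void *)big_smem_kernel,
                             cudaFuncAttributeMaxDynamicSharedMemorySize, 64 * 1024);
    }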
- *
- * Valid values for \p attr are:
- * - ::cudaFuncAttributeMaxDynamicSharedMemorySize - Maximum size of dynamic shared memory per block
- * - ::cudaFuncAttributePreferredSharedMemoryCarveout - Preferred shared memory-L1 cache split ratio in percent of maximum shared memory
- *
- * \param func - Function to set attributes for
- * \param attr - Attribute to set
- * \param value - Value to set
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInitializationError,
- * ::cudaErrorInvalidDeviceFunction,
- * ::cudaErrorInvalidValue
- * \notefnerr
- *
- * \sa
- * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)",
- * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
- * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
- * ::cudaSetDoubleForDevice,
- * ::cudaSetDoubleForHost,
- * \ref ::cudaSetupArgument(T, size_t) "cudaSetupArgument (C++ API)"
- */
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncSetAttribute(const void *func, enum cudaFuncAttribute attr, int value);
-
-/**
- * \brief Converts a double argument to be executed on a device
- *
- * \param d - Double to convert
- *
- * \deprecated This function is deprecated as of CUDA 7.5
- *
- * Converts the double value of \p d to an internal float representation if
- * the device does not support double arithmetic. If the device does natively
- * support doubles, then this function does nothing.
- *
- * \return
- * ::cudaSuccess
- * \notefnerr
- *
- * \sa
- * \ref ::cudaLaunch(const void*) "cudaLaunch (C API)",
- * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
- * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
- * ::cudaSetDoubleForHost,
- * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)"
- */
-extern __host__ cudaError_t CUDARTAPI cudaSetDoubleForDevice(double *d);
-
-/**
- * \brief Converts a double argument after execution on a device
- *
- * \deprecated This function is deprecated as of CUDA 7.5
- *
- * Converts the double value of \p d from a potentially internal float
- * representation if the device does not support double arithmetic. If the
- * device does natively support doubles, then this function does nothing.
- *
- * \param d - Double to convert
- *
- * \return
- * ::cudaSuccess
- * \notefnerr
- *
- * \sa
- * \ref ::cudaLaunch(const void*) "cudaLaunch (C API)",
- * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
- * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
- * ::cudaSetDoubleForDevice,
- * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)"
- */
-extern __host__ cudaError_t CUDARTAPI cudaSetDoubleForHost(double *d);
-
-/** @} */ /* END CUDART_EXECUTION */
-
-/**
- * \defgroup CUDART_OCCUPANCY Occupancy
- *
- * ___MANBRIEF___ occupancy calculation functions of the CUDA runtime API
- * (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the occupancy calculation functions of the CUDA runtime
- * application programming interface.
- *
- * Besides the occupancy calculator functions
- * (\ref ::cudaOccupancyMaxActiveBlocksPerMultiprocessor and \ref ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags),
- * there are also C++ only occupancy-based launch configuration functions documented in
- * \ref CUDART_HIGHLEVEL "C++ API Routines" module.
- *
- * See
- * \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)",
- * \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSize (C++ API)",
- * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)",
- * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)"
- *
- * @{
- */
-
-/**
- * \brief Returns occupancy for a device function
- *
- * Returns in \p *numBlocks the maximum number of active blocks per
- * streaming multiprocessor for the device function.
- *
- * \param numBlocks - Returned occupancy
- * \param func - Kernel function for which occupancy is calculated
- * \param blockSize - Block size the kernel is intended to be launched with
- * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorCudartUnloading,
- * ::cudaErrorInitializationError,
- * ::cudaErrorInvalidDevice,
- * ::cudaErrorInvalidDeviceFunction,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorUnknown,
- * \notefnerr
- *
- * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags,
- * \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)",
- * \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API)",
- * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)",
- * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API)",
- * ::cuOccupancyMaxActiveBlocksPerMultiprocessor
- */
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize);
-
-/**
- * \brief Returns occupancy for a device function with the specified flags
- *
- * Returns in \p *numBlocks the maximum number of active blocks per
- * streaming multiprocessor for the device function.
- *
- * The \p flags parameter controls how special cases are handled. Valid flags include:
- *
- * - ::cudaOccupancyDefault: keeps the default behavior as
- *   ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
- *
- * - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior
- *   on platforms where global caching affects occupancy. On such platforms, if caching
- *   is enabled, but per-block SM resource usage would result in zero occupancy, the
- *   occupancy calculator will calculate the occupancy as if caching is disabled.
- *   Setting this flag makes the occupancy calculator return 0 in such cases.
- *   More information can be found about this feature in the "Unified L1/Texture Cache"
- *   section of the Maxwell tuning guide.
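The occupancy queries reduce to a single call plus some arithmetic. A sketch that converts the per-SM block count into a percentage (the kernel and block size are arbitrary choices):

    #include <stdio.h>
    #include <cuda_runtime.h>

    __global__ void worker(float *p) { p[threadIdx.x] += 1.0f; }

    void report_occupancy(void)
    {
        int device = 0, numBlocks = 0, blockSize = 256;
        struct cudaDeviceProp prop;
        cudaGetDevice(&device);
        cudaGetDeviceProperties(&prop, device);
        cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, (const void *)worker,
                                                      blockSize, 0 /* dynamic smem */);
        double occupancy = (double)(numBlocks * blockSize) / prop.maxThreadsPerMultiProcessor;
        printf("%d blocks/SM, occupancy %.0f%%\n", numBlocks, occupancy * 100.0);
    }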
- *
- * \param numBlocks - Returned occupancy
- * \param func - Kernel function for which occupancy is calculated
- * \param blockSize - Block size the kernel is intended to be launched with
- * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
- * \param flags - Requested behavior for the occupancy calculator
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorCudartUnloading,
- * ::cudaErrorInitializationError,
- * ::cudaErrorInvalidDevice,
- * ::cudaErrorInvalidDeviceFunction,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorUnknown,
- * \notefnerr
- *
- * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor,
- * \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)",
- * \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API)",
- * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)",
- * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API)",
- * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
- */
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize, unsigned int flags);
-
-/** @} */ /* END CUDART_OCCUPANCY */
-
-/**
- * \defgroup CUDART_EXECUTION_DEPRECATED Execution Control [DEPRECATED]
- *
- * ___MANBRIEF___ deprecated execution control functions of the CUDA runtime API
- * (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the deprecated execution control functions of the CUDA runtime
- * application programming interface.
- *
- * Some functions have overloaded C++ API template versions documented separately in the
- * \ref CUDART_HIGHLEVEL "C++ API Routines" module.
- *
- * @{
- */
-
-/**
- * \brief Configure a device-launch
- *
- * \deprecated This function is deprecated as of CUDA 7.0
- *
- * Specifies the grid and block dimensions for the device call to be executed
- * similar to the execution configuration syntax. ::cudaConfigureCall() is
- * stack based. Each call pushes data on top of an execution stack. This data
- * contains the dimension for the grid and thread blocks, together with any
- * arguments for the call.
- *
- * \param gridDim - Grid dimensions
- * \param blockDim - Block dimensions
- * \param sharedMem - Shared memory
- * \param stream - Stream identifier
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidConfiguration
- * \note_null_stream
- * \notefnerr
- *
- * \sa
- * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)",
- * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
- * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
- * \ref ::cudaLaunch(const void*) "cudaLaunch (C API)",
- * ::cudaSetDoubleForDevice,
- * ::cudaSetDoubleForHost,
- * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)",
- */
-extern __host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim, dim3 blockDim, size_t sharedMem __dv(0), cudaStream_t stream __dv(0));
-
-/**
- * \brief Configure a device launch
- *
- * \deprecated This function is deprecated as of CUDA 7.0
- *
- * Pushes \p size bytes of the argument pointed to by \p arg at \p offset
- * bytes from the start of the parameter passing area, which starts at
- * offset 0. The arguments are stored in the top of the execution stack.
- * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument()"
- * must be preceded by a call to ::cudaConfigureCall().
- *
- * \param arg - Argument to push for a kernel launch
- * \param size - Size of argument
- * \param offset - Offset in argument stack to push new arg
- *
- * \return
- * ::cudaSuccess
- * \notefnerr
- *
- * \sa
- * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)",
- * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
- * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
- * \ref ::cudaLaunch(const void*) "cudaLaunch (C API)",
- * ::cudaSetDoubleForDevice,
- * ::cudaSetDoubleForHost,
- * \ref ::cudaSetupArgument(T, size_t) "cudaSetupArgument (C++ API)",
- */
-extern __host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg, size_t size, size_t offset);
-
-/**
- * \brief Launches a device function
- *
- * \deprecated This function is deprecated as of CUDA 7.0
- *
- * Launches the function \p func on the device. The parameter \p func must
- * be a device function symbol. The parameter specified by \p func must be
- * declared as a \p __global__ function. For templated functions, pass the
- * function symbol as follows: func_name<template_arg_0,...,template_arg_N>
- * \ref ::cudaLaunch(const void*) "cudaLaunch()" must be preceded by a call to
- * ::cudaConfigureCall() since it pops the data that was pushed by
- * ::cudaConfigureCall() from the execution stack.
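For reference, the three deprecated calls compose into the push/pop pattern below. This is a sketch of the legacy idiom only, and new code should call ::cudaLaunchKernel instead; note that each argument's offset must respect its alignment:

    #include <cuda_runtime.h>

    __global__ void scale(float *p, float s) { p[threadIdx.x] *= s; }

    void legacy_launch(float *devPtr, float s)
    {
        cudaConfigureCall(dim3(1), dim3(256), 0 /* sharedMem */, 0 /* stream */);
        cudaSetupArgument(&devPtr, sizeof(devPtr), 0);     /* pointer at offset 0 */
        cudaSetupArgument(&s, sizeof(s), sizeof(devPtr));  /* next aligned slot */
        cudaLaunch((const void *)scale);  /* pops the configuration just pushed */
    }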
- *
- * \param func - Device function symbol
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidDeviceFunction,
- * ::cudaErrorInvalidConfiguration,
- * ::cudaErrorLaunchFailure,
- * ::cudaErrorLaunchTimeout,
- * ::cudaErrorLaunchOutOfResources,
- * ::cudaErrorSharedObjectInitFailed,
- * ::cudaErrorInvalidPtx,
- * ::cudaErrorNoKernelImageForDevice,
- * ::cudaErrorJitCompilerNotFound
- * \notefnerr
- * \note_string_api_deprecation_50
- *
- * \sa
- * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)",
- * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
- * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
- * \ref ::cudaLaunch(T*) "cudaLaunch (C++ API)",
- * ::cudaSetDoubleForDevice,
- * ::cudaSetDoubleForHost,
- * \ref ::cudaSetupArgument(const void*, size_t, size_t) "cudaSetupArgument (C API)",
- * ::cudaThreadGetCacheConfig,
- * ::cudaThreadSetCacheConfig
- */
-extern __host__ cudaError_t CUDARTAPI cudaLaunch(const void *func);
-
-
-/** @} */ /* END CUDART_EXECUTION_DEPRECATED */
-
-
-/**
- * \defgroup CUDART_MEMORY Memory Management
- *
- * ___MANBRIEF___ memory management functions of the CUDA runtime API
- * (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the memory management functions of the CUDA runtime
- * application programming interface.
- *
- * Some functions have overloaded C++ API template versions documented separately in the
- * \ref CUDART_HIGHLEVEL "C++ API Routines" module.
- *
- * @{
- */
-
-/**
- * \brief Allocates memory that will be automatically managed by the Unified Memory system
- *
- * Allocates \p size bytes of managed memory on the device and returns in
- * \p *devPtr a pointer to the allocated memory. If the device doesn't support
- * allocating managed memory, ::cudaErrorNotSupported is returned. Support
- * for managed memory can be queried using the device attribute
- * ::cudaDevAttrManagedMemory. The allocated memory is suitably
- * aligned for any kind of variable. The memory is not cleared. If \p size
- * is 0, ::cudaMallocManaged returns ::cudaErrorInvalidValue. The pointer
- * is valid on the CPU and on all GPUs in the system that support managed memory.
- * All accesses to this pointer must obey the Unified Memory programming model.
- *
- * \p flags specifies the default stream association for this allocation.
- * \p flags must be one of ::cudaMemAttachGlobal or ::cudaMemAttachHost. The
- * default value for \p flags is ::cudaMemAttachGlobal.
- * If ::cudaMemAttachGlobal is specified, then this memory is accessible from
- * any stream on any device. If ::cudaMemAttachHost is specified, then the
- * allocation should not be accessed from devices that have a zero value for the
- * device attribute ::cudaDevAttrConcurrentManagedAccess; an explicit call to
- * ::cudaStreamAttachMemAsync will be required to enable access on such devices.
- *
- * If the association is later changed via ::cudaStreamAttachMemAsync to
- * a single stream, the default association, as specified during ::cudaMallocManaged,
- * is restored when that stream is destroyed. For __managed__ variables, the
- * default association is always ::cudaMemAttachGlobal. Note that destroying a
- * stream is an asynchronous operation, and as a result, the change to default
- * association won't happen until all work in the stream has completed.
- * - * Memory allocated with ::cudaMallocManaged should be released with ::cudaFree. - * - * Device memory oversubscription is possible for GPUs that have a non-zero value for the - * device attribute ::cudaDevAttrConcurrentManagedAccess. Managed memory on - * such GPUs may be evicted from device memory to host memory at any time by the Unified - * Memory driver in order to make room for other allocations. - * - * In a multi-GPU system where all GPUs have a non-zero value for the device attribute - * ::cudaDevAttrConcurrentManagedAccess, managed memory may not be populated when this - * API returns and instead may be populated on access. In such systems, managed memory can - * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to - * maintain data locality and prevent excessive page faults to the extent possible. The application - * can also guide the driver about memory usage patterns via ::cudaMemAdvise. The application - * can also explicitly migrate memory to a desired processor's memory via - * ::cudaMemPrefetchAsync. - * - * In a multi-GPU system where all of the GPUs have a zero value for the device attribute - * ::cudaDevAttrConcurrentManagedAccess and all the GPUs have peer-to-peer support - * with each other, the physical storage for managed memory is created on the GPU which is active - * at the time ::cudaMallocManaged is called. All other GPUs will reference the data at reduced - * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate - * memory among such GPUs. - * - * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and - * where the value of the device attribute ::cudaDevAttrConcurrentManagedAccess - * is zero for at least one of those GPUs, the location chosen for physical storage of managed - * memory is system-dependent. - * - On Linux, the location chosen will be device memory as long as the current set of active - * contexts are on devices that either have peer-to-peer support with each other or have a - * non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess. - * If there is an active context on a GPU that does not have a non-zero value for that device - * attribute and it does not have peer-to-peer support with the other devices that have active - * contexts on them, then the location for physical storage will be 'zero-copy' or host memory. - * Note that this means that managed memory that is located in device memory is migrated to - * host memory if a new context is created on a GPU that doesn't have a non-zero value for - * the device attribute and does not support peer-to-peer with at least one of the other devices - * that has an active context. This in turn implies that context creation may fail if there is - * insufficient host memory to migrate all managed allocations. - * - On Windows, the physical storage is always created in 'zero-copy' or host memory. - * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these - * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to - * restrict CUDA to only use those GPUs that have peer-to-peer support. - * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a non-zero - * value to force the driver to always use device memory for physical storage. 
- * When this environment variable is set to a non-zero value, all devices used in - * that process that support managed memory have to be peer-to-peer compatible - * with each other. The error ::cudaErrorInvalidDevice will be returned if a device - * that supports managed memory is used and it is not peer-to-peer compatible with - * any of the other managed memory supporting devices that were previously used in - * that process, even if ::cudaDeviceReset has been called on those devices. These - * environment variables are described in the CUDA programming guide under the - * "CUDA environment variables" section. - * - * \param devPtr - Pointer to allocated device memory - * \param size - Requested allocation size in bytes - * \param flags - Must be either ::cudaMemAttachGlobal or ::cudaMemAttachHost (defaults to ::cudaMemAttachGlobal) - * - * \return - * ::cudaSuccess, - * ::cudaErrorMemoryAllocation, - * ::cudaErrorNotSupported, - * ::cudaErrorInvalidValue - * - * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray, - * ::cudaMalloc3D, ::cudaMalloc3DArray, - * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", - * ::cudaFreeHost, ::cudaHostAlloc, ::cudaDeviceGetAttribute, ::cudaStreamAttachMemAsync, - * ::cuMemAllocManaged - */ -extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMallocManaged(void **devPtr, size_t size, unsigned int flags __dv(cudaMemAttachGlobal)); - - -/** - * \brief Allocate memory on the device - * - * Allocates \p size bytes of linear memory on the device and returns in - * \p *devPtr a pointer to the allocated memory. The allocated memory is - * suitably aligned for any kind of variable. The memory is not cleared. - * ::cudaMalloc() returns ::cudaErrorMemoryAllocation in case of failure. - * - * The device version of ::cudaFree cannot be used with a \p *devPtr - * allocated using the host API, and vice versa. - * - * \param devPtr - Pointer to allocated device memory - * \param size - Requested allocation size in bytes - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorMemoryAllocation - * - * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray, - * ::cudaMalloc3D, ::cudaMalloc3DArray, - * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", - * ::cudaFreeHost, ::cudaHostAlloc, - * ::cuMemAlloc - */ -extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size); - -/** - * \brief Allocates page-locked memory on the host - * - * Allocates \p size bytes of host memory that is page-locked and accessible - * to the device. The driver tracks the virtual memory ranges allocated with - * this function and automatically accelerates calls to functions such as - * ::cudaMemcpy*(). Since the memory can be accessed directly by the device, - * it can be read or written with much higher bandwidth than pageable memory - * obtained with functions such as ::malloc(). Allocating excessive amounts of - * memory with ::cudaMallocHost() may degrade system performance, since it - * reduces the amount of memory available to the system for paging. As a - * result, this function is best used sparingly to allocate staging areas for - * data exchange between host and device. 
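/* [Editor's sketch, not part of the original header] The staging pattern the
 * paragraph above recommends: a pinned cudaMallocHost buffer feeding a
 * cudaMalloc'd device buffer. The payload and size are example values. */
#include <cuda_runtime.h>
#include <string.h>

int stage_to_device(const void *payload, size_t n) {
    void *pinned, *dev;
    if (cudaMallocHost(&pinned, n) != cudaSuccess) return -1;
    if (cudaMalloc(&dev, n) != cudaSuccess) { cudaFreeHost(pinned); return -1; }
    memcpy(pinned, payload, n);                         /* fill staging area */
    cudaMemcpy(dev, pinned, n, cudaMemcpyHostToDevice); /* accelerated copy  */
    /* ... launch kernels reading dev ... */
    cudaFree(dev);
    cudaFreeHost(pinned);
    return 0;
}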
- * - * \param ptr - Pointer to allocated host memory - * \param size - Requested allocation size in bytes - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorMemoryAllocation - * \notefnerr - * - * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaMallocArray, ::cudaMalloc3D, - * ::cudaMalloc3DArray, ::cudaHostAlloc, ::cudaFree, ::cudaFreeArray, - * \ref ::cudaMallocHost(void**, size_t, unsigned int) "cudaMallocHost (C++ API)", - * ::cudaFreeHost, ::cudaHostAlloc, - * ::cuMemAllocHost - */ -extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t size); - -/** - * \brief Allocates pitched memory on the device - * - * Allocates at least \p width (in bytes) * \p height bytes of linear memory - * on the device and returns in \p *devPtr a pointer to the allocated memory. - * The function may pad the allocation to ensure that corresponding pointers - * in any given row will continue to meet the alignment requirements for - * coalescing as the address is updated from row to row. The pitch returned in - * \p *pitch by ::cudaMallocPitch() is the width in bytes of the allocation. - * The intended usage of \p pitch is as a separate parameter of the allocation, - * used to compute addresses within the 2D array. Given the row and column of - * an array element of type \p T, the address is computed as: - * \code - T* pElement = (T*)((char*)BaseAddress + Row * pitch) + Column; - \endcode - * - * For allocations of 2D arrays, it is recommended that programmers consider - * performing pitch allocations using ::cudaMallocPitch(). Due to pitch - * alignment restrictions in the hardware, this is especially true if the - * application will be performing 2D memory copies between different regions - * of device memory (whether linear memory or CUDA arrays). - * - * \param devPtr - Pointer to allocated pitched device memory - * \param pitch - Pitch for allocation - * \param width - Requested pitched allocation width (in bytes) - * \param height - Requested pitched allocation height - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorMemoryAllocation - * \notefnerr - * - * \sa ::cudaMalloc, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray, - * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", - * ::cudaFreeHost, ::cudaMalloc3D, ::cudaMalloc3DArray, - * ::cudaHostAlloc, - * ::cuMemAllocPitch - */ -extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height); - -/** - * \brief Allocate an array on the device - * - * Allocates a CUDA array according to the ::cudaChannelFormatDesc structure - * \p desc and returns a handle to the new CUDA array in \p *array. - * - * The ::cudaChannelFormatDesc is defined as: - * \code - struct cudaChannelFormatDesc { - int x, y, z, w; - enum cudaChannelFormatKind f; - }; - \endcode - * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned, - * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat. - * - * The \p flags parameter enables different options to be specified that affect - * the allocation, as follows. - * - ::cudaArrayDefault: This flag's value is defined to be 0 and provides default array allocation - * - ::cudaArraySurfaceLoadStore: Allocates an array that can be read from or written to using a surface reference - * - ::cudaArrayTextureGather: This flag indicates that texture gather operations will be performed on the array. - * - * \p width and \p height must meet certain size requirements. 
See ::cudaMalloc3DArray() for more details.
- *
- * \param array - Pointer to allocated array in device memory
- * \param desc - Requested channel format
- * \param width - Requested array allocation width
- * \param height - Requested array allocation height
- * \param flags - Requested properties of allocated array
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorMemoryAllocation
- * \notefnerr
- *
- * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaFreeArray,
- * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
- * ::cudaFreeHost, ::cudaMalloc3D, ::cudaMalloc3DArray,
- * ::cudaHostAlloc,
- * ::cuArrayCreate
- */
-extern __host__ cudaError_t CUDARTAPI cudaMallocArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height __dv(0), unsigned int flags __dv(0));
-
-/**
- * \brief Frees memory on the device
- *
- * Frees the memory space pointed to by \p devPtr, which must have been
- * returned by a previous call to ::cudaMalloc() or ::cudaMallocPitch().
- * Otherwise, or if ::cudaFree(\p devPtr) has already been called before,
- * an error is returned. If \p devPtr is 0, no operation is performed.
- * ::cudaFree() returns ::cudaErrorInvalidDevicePointer in case of failure.
- *
- * The device version of ::cudaFree cannot be used with a \p *devPtr
- * allocated using the host API, and vice versa.
- *
- * \param devPtr - Device pointer to memory to free
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidDevicePointer,
- * ::cudaErrorInitializationError
- * \notefnerr
- *
- * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaMallocArray, ::cudaFreeArray,
- * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
- * ::cudaFreeHost, ::cudaMalloc3D, ::cudaMalloc3DArray,
- * ::cudaHostAlloc,
- * ::cuMemFree
- */
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr);
-
-/**
- * \brief Frees page-locked memory
- *
- * Frees the memory space pointed to by \p ptr, which must have been
- * returned by a previous call to ::cudaMallocHost() or ::cudaHostAlloc().
- *
- * \param ptr - Pointer to memory to free
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorInitializationError
- * \notefnerr
- *
- * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray,
- * ::cudaFreeArray,
- * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
- * ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaHostAlloc,
- * ::cuMemFreeHost
- */
-extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr);
-
-/**
- * \brief Frees an array on the device
- *
- * Frees the CUDA array \p array, which must have been returned by a
- * previous call to ::cudaMallocArray(). If ::cudaFreeArray(\p array) has
- * already been called before, ::cudaErrorInvalidValue is returned. If
- * \p array is 0, no operation is performed.
- *
- * \param array - Pointer to array to free
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorInitializationError
- * \notefnerr
- *
- * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray,
- * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
- * ::cudaFreeHost, ::cudaHostAlloc,
- * ::cuArrayDestroy
- */
-extern __host__ cudaError_t CUDARTAPI cudaFreeArray(cudaArray_t array);
-
-/**
- * \brief Frees a mipmapped array on the device
- *
- * Frees the CUDA mipmapped array \p mipmappedArray, which must have been
- * returned by a previous call to ::cudaMallocMipmappedArray().
- * If ::cudaFreeMipmappedArray(\p mipmappedArray) has already been called before, - * ::cudaErrorInvalidValue is returned. - * - * \param mipmappedArray - Pointer to mipmapped array to free - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInitializationError - * \notefnerr - * - * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, - * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", - * ::cudaFreeHost, ::cudaHostAlloc, - * ::cuMipmappedArrayDestroy - */ -extern __host__ cudaError_t CUDARTAPI cudaFreeMipmappedArray(cudaMipmappedArray_t mipmappedArray); - - -/** - * \brief Allocates page-locked memory on the host - * - * Allocates \p size bytes of host memory that is page-locked and accessible - * to the device. The driver tracks the virtual memory ranges allocated with - * this function and automatically accelerates calls to functions such as - * ::cudaMemcpy(). Since the memory can be accessed directly by the device, it - * can be read or written with much higher bandwidth than pageable memory - * obtained with functions such as ::malloc(). Allocating excessive amounts of - * pinned memory may degrade system performance, since it reduces the amount - * of memory available to the system for paging. As a result, this function is - * best used sparingly to allocate staging areas for data exchange between host - * and device. - * - * The \p flags parameter enables different options to be specified that affect - * the allocation, as follows. - * - ::cudaHostAllocDefault: This flag's value is defined to be 0 and causes - * ::cudaHostAlloc() to emulate ::cudaMallocHost(). - * - ::cudaHostAllocPortable: The memory returned by this call will be - * considered as pinned memory by all CUDA contexts, not just the one that - * performed the allocation. - * - ::cudaHostAllocMapped: Maps the allocation into the CUDA address space. - * The device pointer to the memory may be obtained by calling - * ::cudaHostGetDevicePointer(). - * - ::cudaHostAllocWriteCombined: Allocates the memory as write-combined (WC). - * WC memory can be transferred across the PCI Express bus more quickly on some - * system configurations, but cannot be read efficiently by most CPUs. WC - * memory is a good option for buffers that will be written by the CPU and read - * by the device via mapped pinned memory or host->device transfers. - * - * All of these flags are orthogonal to one another: a developer may allocate - * memory that is portable, mapped and/or write-combined with no restrictions. - * - * ::cudaSetDeviceFlags() must have been called with the ::cudaDeviceMapHost - * flag in order for the ::cudaHostAllocMapped flag to have any effect. - * - * The ::cudaHostAllocMapped flag may be specified on CUDA contexts for devices - * that do not support mapped pinned memory. The failure is deferred to - * ::cudaHostGetDevicePointer() because the memory may be mapped into other - * CUDA contexts via the ::cudaHostAllocPortable flag. - * - * Memory allocated by this function must be freed with ::cudaFreeHost(). 
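/* [Editor's sketch, not part of the original header] The cudaHostAllocMapped
 * path described above: map pinned host memory into the device address space
 * and retrieve the device alias. 4096 bytes is an arbitrary example size. */
#include <cuda_runtime.h>

int mapped_alloc(void) {
    void *h, *d;
    /* Must precede context creation for cudaHostAllocMapped to take effect. */
    cudaSetDeviceFlags(cudaDeviceMapHost);
    if (cudaHostAlloc(&h, 4096, cudaHostAllocMapped | cudaHostAllocPortable) != cudaSuccess)
        return -1;
    if (cudaHostGetDevicePointer(&d, h, 0) != cudaSuccess) {
        cudaFreeHost(h);
        return -1;
    }
    /* Kernels may now dereference d while the host dereferences h. */
    cudaFreeHost(h);
    return 0;
}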
- *
- * \param pHost - Pointer to allocated host memory
- * \param size - Requested allocation size in bytes
- * \param flags - Requested properties of allocated memory
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorMemoryAllocation
- * \notefnerr
- *
- * \sa ::cudaSetDeviceFlags,
- * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
- * ::cudaFreeHost,
- * ::cuMemHostAlloc
- */
-extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t size, unsigned int flags);
-
-/**
- * \brief Registers an existing host memory range for use by CUDA
- *
- * Page-locks the memory range specified by \p ptr and \p size and maps it
- * for the device(s) as specified by \p flags. This memory range is also added
- * to the same tracking mechanism as ::cudaHostAlloc() to automatically accelerate
- * calls to functions such as ::cudaMemcpy(). Since the memory can be accessed
- * directly by the device, it can be read or written with much higher bandwidth
- * than pageable memory that has not been registered. Page-locking excessive
- * amounts of memory may degrade system performance, since it reduces the amount
- * of memory available to the system for paging. As a result, this function is
- * best used sparingly to register staging areas for data exchange between
- * host and device.
- *
- * ::cudaHostRegister is not supported on non I/O coherent devices.
- *
- * The \p flags parameter enables different options to be specified that
- * affect the allocation, as follows.
- *
- * - ::cudaHostRegisterDefault: On a system with unified virtual addressing,
- * the memory will be both mapped and portable. On a system with no unified
- * virtual addressing, the memory will be neither mapped nor portable.
- *
- * - ::cudaHostRegisterPortable: The memory returned by this call will be
- * considered as pinned memory by all CUDA contexts, not just the one that
- * performed the allocation.
- *
- * - ::cudaHostRegisterMapped: Maps the allocation into the CUDA address
- * space. The device pointer to the memory may be obtained by calling
- * ::cudaHostGetDevicePointer().
- *
- * - ::cudaHostRegisterIoMemory: The passed memory pointer is treated as
- * pointing to some memory-mapped I/O space, e.g. belonging to a
- * third-party PCIe device, and it will be marked as non-cache-coherent and
- * contiguous.
- *
- * All of these flags are orthogonal to one another: a developer may page-lock
- * memory that is portable or mapped with no restrictions.
- *
- * The CUDA context must have been created with the ::cudaMapHost flag in
- * order for the ::cudaHostRegisterMapped flag to have any effect.
- *
- * The ::cudaHostRegisterMapped flag may be specified on CUDA contexts for
- * devices that do not support mapped pinned memory. The failure is deferred
- * to ::cudaHostGetDevicePointer() because the memory may be mapped into
- * other CUDA contexts via the ::cudaHostRegisterPortable flag.
- *
- * For devices that have a non-zero value for the device attribute
- * ::cudaDevAttrCanUseHostPointerForRegisteredMem, the memory
- * can also be accessed from the device using the host pointer \p ptr.
- * The device pointer returned by ::cudaHostGetDevicePointer() may or may not
- * match the original host pointer \p ptr and depends on the devices visible to the
- * application. If all devices visible to the application have a non-zero value for the
- * device attribute, the device pointer returned by ::cudaHostGetDevicePointer()
- * will match the original pointer \p ptr.
If any device visible to the application
- * has a zero value for the device attribute, the device pointer returned by
- * ::cudaHostGetDevicePointer() will not match the original host pointer \p ptr,
- * but it will be suitable for use on all devices provided Unified Virtual Addressing
- * is enabled. In such systems, it is valid to access the memory using either pointer
- * on devices that have a non-zero value for the device attribute. Note however that
- * such devices should access the memory using only one of the two pointers and not both.
- *
- * The memory page-locked by this function must be unregistered with ::cudaHostUnregister().
- *
- * \param ptr - Host pointer to memory to page-lock
- * \param size - Size in bytes of the address range to page-lock
- * \param flags - Flags for allocation request
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorMemoryAllocation,
- * ::cudaErrorHostMemoryAlreadyRegistered,
- * ::cudaErrorNotSupported
- * \notefnerr
- *
- * \sa ::cudaHostUnregister, ::cudaHostGetFlags, ::cudaHostGetDevicePointer,
- * ::cuMemHostRegister
- */
-extern __host__ cudaError_t CUDARTAPI cudaHostRegister(void *ptr, size_t size, unsigned int flags);
-
-/**
- * \brief Unregisters a memory range that was registered with cudaHostRegister
- *
- * Unmaps the memory range whose base address is specified by \p ptr, and makes
- * it pageable again.
- *
- * The base address must be the same one specified to ::cudaHostRegister().
- *
- * \param ptr - Host pointer to memory to unregister
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorHostMemoryNotRegistered
- * \notefnerr
- *
- * \sa ::cudaHostRegister,
- * ::cuMemHostUnregister
- */
-extern __host__ cudaError_t CUDARTAPI cudaHostUnregister(void *ptr);
-
-/**
- * \brief Passes back device pointer of mapped host memory allocated by
- * cudaHostAlloc or registered by cudaHostRegister
- *
- * Passes back the device pointer corresponding to the mapped, pinned host
- * buffer allocated by ::cudaHostAlloc() or registered by ::cudaHostRegister().
- *
- * ::cudaHostGetDevicePointer() will fail if the ::cudaDeviceMapHost flag was
- * not specified before deferred context creation occurred, or if called on a
- * device that does not support mapped, pinned memory.
- *
- * For devices that have a non-zero value for the device attribute
- * ::cudaDevAttrCanUseHostPointerForRegisteredMem, the memory
- * can also be accessed from the device using the host pointer \p pHost.
- * The device pointer returned by ::cudaHostGetDevicePointer() may or may not
- * match the original host pointer \p pHost and depends on the devices visible to the
- * application. If all devices visible to the application have a non-zero value for the
- * device attribute, the device pointer returned by ::cudaHostGetDevicePointer()
- * will match the original pointer \p pHost. If any device visible to the application
- * has a zero value for the device attribute, the device pointer returned by
- * ::cudaHostGetDevicePointer() will not match the original host pointer \p pHost,
- * but it will be suitable for use on all devices provided Unified Virtual Addressing
- * is enabled. In such systems, it is valid to access the memory using either pointer
- * on devices that have a non-zero value for the device attribute. Note however that
- * such devices should access the memory using only one of the two pointers and not both.
- *
- * \p flags is provided for future releases. For now, it must be set to 0.
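/* [Editor's sketch, not part of the original header] The register/unregister
 * lifecycle documented above, applied to an ordinary malloc'd range. The
 * 1 MiB size is an example value. */
#include <cuda_runtime.h>
#include <stdlib.h>

int register_range(void) {
    size_t n = 1 << 20;
    void *buf = malloc(n);
    if (!buf) return -1;
    if (cudaHostRegister(buf, n, cudaHostRegisterMapped) != cudaSuccess) {
        free(buf);
        return -1;
    }
    void *dev;
    cudaHostGetDevicePointer(&dev, buf, 0); /* flags must be 0 */
    /* ... cudaMemcpy()/kernels may use buf (or dev where required) ... */
    cudaHostUnregister(buf);
    free(buf);
    return 0;
}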
- * - * \param pDevice - Returned device pointer for mapped memory - * \param pHost - Requested host pointer mapping - * \param flags - Flags for extensions (must be 0 for now) - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorMemoryAllocation - * \notefnerr - * - * \sa ::cudaSetDeviceFlags, ::cudaHostAlloc, - * ::cuMemHostGetDevicePointer - */ -extern __host__ cudaError_t CUDARTAPI cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags); - -/** - * \brief Passes back flags used to allocate pinned host memory allocated by - * cudaHostAlloc - * - * ::cudaHostGetFlags() will fail if the input pointer does not - * reside in an address range allocated by ::cudaHostAlloc(). - * - * \param pFlags - Returned flags word - * \param pHost - Host pointer - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue - * \notefnerr - * - * \sa ::cudaHostAlloc, - * ::cuMemHostGetFlags - */ -extern __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags, void *pHost); - -/** - * \brief Allocates logical 1D, 2D, or 3D memory objects on the device - * - * Allocates at least \p width * \p height * \p depth bytes of linear memory - * on the device and returns a ::cudaPitchedPtr in which \p ptr is a pointer - * to the allocated memory. The function may pad the allocation to ensure - * hardware alignment requirements are met. The pitch returned in the \p pitch - * field of \p pitchedDevPtr is the width in bytes of the allocation. - * - * The returned ::cudaPitchedPtr contains additional fields \p xsize and - * \p ysize, the logical width and height of the allocation, which are - * equivalent to the \p width and \p height \p extent parameters provided by - * the programmer during allocation. - * - * For allocations of 2D and 3D objects, it is highly recommended that - * programmers perform allocations using ::cudaMalloc3D() or - * ::cudaMallocPitch(). Due to alignment restrictions in the hardware, this is - * especially true if the application will be performing memory copies - * involving 2D or 3D objects (whether linear memory or CUDA arrays). - * - * \param pitchedDevPtr - Pointer to allocated pitched device memory - * \param extent - Requested allocation size (\p width field in bytes) - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorMemoryAllocation - * \notefnerr - * - * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMemcpy3D, ::cudaMemset3D, - * ::cudaMalloc3DArray, ::cudaMallocArray, ::cudaFreeArray, - * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", - * ::cudaFreeHost, ::cudaHostAlloc, ::make_cudaPitchedPtr, ::make_cudaExtent, - * ::cuMemAllocPitch - */ -extern __host__ cudaError_t CUDARTAPI cudaMalloc3D(struct cudaPitchedPtr* pitchedDevPtr, struct cudaExtent extent); - -/** - * \brief Allocate an array on the device - * - * Allocates a CUDA array according to the ::cudaChannelFormatDesc structure - * \p desc and returns a handle to the new CUDA array in \p *array. - * - * The ::cudaChannelFormatDesc is defined as: - * \code - struct cudaChannelFormatDesc { - int x, y, z, w; - enum cudaChannelFormatKind f; - }; - \endcode - * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned, - * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat. - * - * ::cudaMalloc3DArray() can allocate the following: - * - * - A 1D array is allocated if the height and depth extents are both zero. - * - A 2D array is allocated if only the depth extent is zero. 
- * - A 3D array is allocated if all three extents are non-zero. - * - A 1D layered CUDA array is allocated if only the height extent is zero and - * the cudaArrayLayered flag is set. Each layer is a 1D array. The number of layers is - * determined by the depth extent. - * - A 2D layered CUDA array is allocated if all three extents are non-zero and - * the cudaArrayLayered flag is set. Each layer is a 2D array. The number of layers is - * determined by the depth extent. - * - A cubemap CUDA array is allocated if all three extents are non-zero and the - * cudaArrayCubemap flag is set. Width must be equal to height, and depth must be six. A cubemap is - * a special type of 2D layered CUDA array, where the six layers represent the six faces of a cube. - * The order of the six layers in memory is the same as that listed in ::cudaGraphicsCubeFace. - * - A cubemap layered CUDA array is allocated if all three extents are non-zero, and both, - * cudaArrayCubemap and cudaArrayLayered flags are set. Width must be equal to height, and depth must be - * a multiple of six. A cubemap layered CUDA array is a special type of 2D layered CUDA array that consists - * of a collection of cubemaps. The first six layers represent the first cubemap, the next six layers form - * the second cubemap, and so on. - * - * - * The \p flags parameter enables different options to be specified that affect - * the allocation, as follows. - * - ::cudaArrayDefault: This flag's value is defined to be 0 and provides default array allocation - * - ::cudaArrayLayered: Allocates a layered CUDA array, with the depth extent indicating the number of layers - * - ::cudaArrayCubemap: Allocates a cubemap CUDA array. Width must be equal to height, and depth must be six. - * If the cudaArrayLayered flag is also set, depth must be a multiple of six. - * - ::cudaArraySurfaceLoadStore: Allocates a CUDA array that could be read from or written to using a surface - * reference. - * - ::cudaArrayTextureGather: This flag indicates that texture gather operations will be performed on the CUDA - * array. Texture gather can only be performed on 2D CUDA arrays. - * - * The width, height and depth extents must meet certain size requirements as listed in the following table. - * All values are specified in elements. - * - * Note that 2D CUDA arrays have different size requirements if the ::cudaArrayTextureGather flag is set. In that - * case, the valid range for (width, height, depth) is ((1,maxTexture2DGather[0]), (1,maxTexture2DGather[1]), 0). 
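/* [Editor's sketch, not part of the original header] One of the allocation
 * rules listed above: a 2D layered array (all three extents non-zero plus
 * cudaArrayLayered), with depth giving the layer count. The 256x256x8 float
 * format is an example; the valid extent ranges follow below. */
#include <cuda_runtime.h>

int make_layered_array(void) {
    cudaChannelFormatDesc desc =
        cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
    cudaExtent extent = make_cudaExtent(256, 256, 8); /* 8 layers */
    cudaArray_t arr;
    if (cudaMalloc3DArray(&arr, &desc, extent, cudaArrayLayered) != cudaSuccess)
        return -1;
    cudaFreeArray(arr);
    return 0;
}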
- *
- * Extents are given as {(width range in elements), (height range), (depth range)}:
- *
- * - 1D:
- *     always: { (1,maxTexture1D), 0, 0 }
- *     with cudaArraySurfaceLoadStore: { (1,maxSurface1D), 0, 0 }
- * - 2D:
- *     always: { (1,maxTexture2D[0]), (1,maxTexture2D[1]), 0 }
- *     with cudaArraySurfaceLoadStore: { (1,maxSurface2D[0]), (1,maxSurface2D[1]), 0 }
- * - 3D:
- *     always: { (1,maxTexture3D[0]), (1,maxTexture3D[1]), (1,maxTexture3D[2]) }
- *             OR { (1,maxTexture3DAlt[0]), (1,maxTexture3DAlt[1]), (1,maxTexture3DAlt[2]) }
- *     with cudaArraySurfaceLoadStore: { (1,maxSurface3D[0]), (1,maxSurface3D[1]), (1,maxSurface3D[2]) }
- * - 1D Layered:
- *     always: { (1,maxTexture1DLayered[0]), 0, (1,maxTexture1DLayered[1]) }
- *     with cudaArraySurfaceLoadStore: { (1,maxSurface1DLayered[0]), 0, (1,maxSurface1DLayered[1]) }
- * - 2D Layered:
- *     always: { (1,maxTexture2DLayered[0]), (1,maxTexture2DLayered[1]), (1,maxTexture2DLayered[2]) }
- *     with cudaArraySurfaceLoadStore: { (1,maxSurface2DLayered[0]), (1,maxSurface2DLayered[1]), (1,maxSurface2DLayered[2]) }
- * - Cubemap:
- *     always: { (1,maxTextureCubemap), (1,maxTextureCubemap), 6 }
- *     with cudaArraySurfaceLoadStore: { (1,maxSurfaceCubemap), (1,maxSurfaceCubemap), 6 }
- * - Cubemap Layered:
- *     always: { (1,maxTextureCubemapLayered[0]), (1,maxTextureCubemapLayered[0]), (1,maxTextureCubemapLayered[1]) }
- *     with cudaArraySurfaceLoadStore: { (1,maxSurfaceCubemapLayered[0]), (1,maxSurfaceCubemapLayered[0]), (1,maxSurfaceCubemapLayered[1]) }
- *
- * \param array - Pointer to allocated array in device memory
- * \param desc - Requested channel format
- * \param extent - Requested allocation size (\p width field in elements)
- * \param flags - Flags for extensions
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorMemoryAllocation
- * \notefnerr
- *
- * \sa ::cudaMalloc3D, ::cudaMalloc, ::cudaMallocPitch, ::cudaFree,
- * ::cudaFreeArray,
- * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
- * ::cudaFreeHost, ::cudaHostAlloc,
- * ::make_cudaExtent,
- * ::cuArray3DCreate
- */
-extern __host__ cudaError_t CUDARTAPI cudaMalloc3DArray(cudaArray_t *array, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int flags __dv(0));
-
-/**
- * \brief Allocate a mipmapped array on the device
- *
- * Allocates a CUDA mipmapped array according to the ::cudaChannelFormatDesc structure
- * \p desc and returns a handle to the new CUDA mipmapped array in \p *mipmappedArray.
- * \p numLevels specifies the number of mipmap levels to be allocated. This value is
- * clamped to the range [1, 1 + floor(log2(max(width, height, depth)))].
- *
- * The ::cudaChannelFormatDesc is defined as:
- * \code
-    struct cudaChannelFormatDesc {
-        int x, y, z, w;
-        enum cudaChannelFormatKind f;
-    };
-    \endcode
- * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
- * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat.
- *
- * ::cudaMallocMipmappedArray() can allocate the following:
- *
- * - A 1D mipmapped array is allocated if the height and depth extents are both zero.
- * - A 2D mipmapped array is allocated if only the depth extent is zero.
- * - A 3D mipmapped array is allocated if all three extents are non-zero.
- * - A 1D layered CUDA mipmapped array is allocated if only the height extent is zero and
- * the cudaArrayLayered flag is set. Each layer is a 1D mipmapped array. The number of layers is
- * determined by the depth extent.
- * - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and
- * the cudaArrayLayered flag is set. Each layer is a 2D mipmapped array. The number of layers is
- * determined by the depth extent.
- * - A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the
- * cudaArrayCubemap flag is set. Width must be equal to height, and depth must be six.
- * The order of the six layers in memory is the same as that listed in ::cudaGraphicsCubeFace.
- * - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero,
- * and both the cudaArrayCubemap and cudaArrayLayered flags are set. Width must be equal to height, and depth must be
- * a multiple of six. A cubemap layered CUDA mipmapped array is a special type of 2D layered CUDA mipmapped
- * array that consists of a collection of cubemap mipmapped arrays. The first six layers represent the
- * first cubemap mipmapped array, the next six layers form the second cubemap mipmapped array, and so on.
- *
- * The \p flags parameter enables different options to be specified that affect
- * the allocation, as follows.
- * - ::cudaArrayDefault: This flag's value is defined to be 0 and provides default mipmapped array allocation
- * - ::cudaArrayLayered: Allocates a layered CUDA mipmapped array, with the depth extent indicating the number of layers
- * - ::cudaArrayCubemap: Allocates a cubemap CUDA mipmapped array. Width must be equal to height, and depth must be six.
- * If the cudaArrayLayered flag is also set, depth must be a multiple of six.
- * - ::cudaArraySurfaceLoadStore: This flag indicates that individual mipmap levels of the CUDA mipmapped array
- * will be read from or written to using a surface reference.
- * - ::cudaArrayTextureGather: This flag indicates that texture gather operations will be performed on the CUDA
- * array. Texture gather can only be performed on 2D CUDA mipmapped arrays, and the gather operations are
- * performed only on the most detailed mipmap level.
- *
- * The width, height and depth extents must meet certain size requirements as listed in the following table.
- * All values are specified in elements.
- *
- * Extents are given as {(width range in elements), (height range), (depth range)}:
- *
- * - 1D:
- *     always: { (1,maxTexture1DMipmap), 0, 0 }
- *     with cudaArraySurfaceLoadStore: { (1,maxSurface1D), 0, 0 }
- * - 2D:
- *     always: { (1,maxTexture2DMipmap[0]), (1,maxTexture2DMipmap[1]), 0 }
- *     with cudaArraySurfaceLoadStore: { (1,maxSurface2D[0]), (1,maxSurface2D[1]), 0 }
- * - 3D:
- *     always: { (1,maxTexture3D[0]), (1,maxTexture3D[1]), (1,maxTexture3D[2]) }
- *             OR { (1,maxTexture3DAlt[0]), (1,maxTexture3DAlt[1]), (1,maxTexture3DAlt[2]) }
- *     with cudaArraySurfaceLoadStore: { (1,maxSurface3D[0]), (1,maxSurface3D[1]), (1,maxSurface3D[2]) }
- * - 1D Layered:
- *     always: { (1,maxTexture1DLayered[0]), 0, (1,maxTexture1DLayered[1]) }
- *     with cudaArraySurfaceLoadStore: { (1,maxSurface1DLayered[0]), 0, (1,maxSurface1DLayered[1]) }
- * - 2D Layered:
- *     always: { (1,maxTexture2DLayered[0]), (1,maxTexture2DLayered[1]), (1,maxTexture2DLayered[2]) }
- *     with cudaArraySurfaceLoadStore: { (1,maxSurface2DLayered[0]), (1,maxSurface2DLayered[1]), (1,maxSurface2DLayered[2]) }
- * - Cubemap:
- *     always: { (1,maxTextureCubemap), (1,maxTextureCubemap), 6 }
- *     with cudaArraySurfaceLoadStore: { (1,maxSurfaceCubemap), (1,maxSurfaceCubemap), 6 }
- * - Cubemap Layered:
- *     always: { (1,maxTextureCubemapLayered[0]), (1,maxTextureCubemapLayered[0]), (1,maxTextureCubemapLayered[1]) }
- *     with cudaArraySurfaceLoadStore: { (1,maxSurfaceCubemapLayered[0]), (1,maxSurfaceCubemapLayered[0]), (1,maxSurfaceCubemapLayered[1]) }
- *
- * \param mipmappedArray - Pointer to allocated mipmapped array in device memory
- * \param desc - Requested channel format
- * \param extent - Requested allocation size (\p width field in elements)
- * \param numLevels - Number of mipmap levels to allocate
- * \param flags - Flags for extensions
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorMemoryAllocation
- * \notefnerr
- *
- * \sa ::cudaMalloc3D, ::cudaMalloc, ::cudaMallocPitch, ::cudaFree,
- * ::cudaFreeArray,
- * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
- * ::cudaFreeHost, ::cudaHostAlloc,
- * ::make_cudaExtent,
- * ::cuMipmappedArrayCreate
- */
-extern __host__ cudaError_t CUDARTAPI cudaMallocMipmappedArray(cudaMipmappedArray_t *mipmappedArray, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int numLevels, unsigned int flags __dv(0));
-
-/**
- * \brief Gets a mipmap level of a CUDA mipmapped array
- *
- * Returns in \p *levelArray a CUDA array that represents a single mipmap level
- * of the CUDA mipmapped array \p mipmappedArray.
- *
- * If \p level is greater than the maximum number of levels in this mipmapped array,
- * ::cudaErrorInvalidValue is returned.
- *
- * \param levelArray - Returned mipmap level CUDA array
- * \param mipmappedArray - CUDA mipmapped array
- * \param level - Mipmap level
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue
- * \notefnerr
- *
- * \sa ::cudaMalloc3D, ::cudaMalloc, ::cudaMallocPitch, ::cudaFree,
- * ::cudaFreeArray,
- * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
- * ::cudaFreeHost, ::cudaHostAlloc,
- * ::make_cudaExtent,
- * ::cuMipmappedArrayGetLevel
- */
-extern __host__ cudaError_t CUDARTAPI cudaGetMipmappedArrayLevel(cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray, unsigned int level);
-
-/**
- * \brief Copies data between 3D objects
- *
-\code
-struct cudaExtent {
-    size_t width;
-    size_t height;
-    size_t depth;
-};
-struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d);
-
-struct cudaPos {
-    size_t x;
-    size_t y;
-    size_t z;
-};
-struct cudaPos make_cudaPos(size_t x, size_t y, size_t z);
-
-struct cudaMemcpy3DParms {
-    cudaArray_t srcArray;
-    struct cudaPos srcPos;
-    struct cudaPitchedPtr srcPtr;
-    cudaArray_t dstArray;
-    struct cudaPos dstPos;
-    struct cudaPitchedPtr dstPtr;
-    struct cudaExtent extent;
-    enum cudaMemcpyKind kind;
-};
-\endcode
- *
- * ::cudaMemcpy3D() copies data between two 3D objects. The source and
- * destination objects may be in either host memory, device memory, or a CUDA
- * array. The source, destination, extent, and kind of copy performed are
- * specified by the ::cudaMemcpy3DParms struct which should be initialized to
- * zero before use:
-\code
-cudaMemcpy3DParms myParms = {0};
-\endcode
- *
- * The struct passed to ::cudaMemcpy3D() must specify one of \p srcArray or
- * \p srcPtr and one of \p dstArray or \p dstPtr. Passing more than one
- * non-zero source or destination will cause ::cudaMemcpy3D() to return an
- * error.
- *
- * The \p srcPos and \p dstPos fields are optional offsets into the source and
- * destination objects and are defined in units of each object's elements. The
- * element for a host or device pointer is assumed to be unsigned char.
- *
- * The \p extent field defines the dimensions of the transferred area in
- * elements. If a CUDA array is participating in the copy, the extent is
- * defined in terms of that array's elements.
If no CUDA array is
- * participating in the copy then the extents are defined in elements of
- * unsigned char.
- *
- * The \p kind field defines the direction of the copy. It must be one of
- * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
- * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
- * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
- * inferred from the pointer values. However, ::cudaMemcpyDefault is only
- * allowed on systems that support unified virtual addressing.
- * If ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, or ::cudaMemcpyDeviceToHost
- * is passed as the kind and a cudaArray is passed as the source or destination,
- * then, should the kind imply that the cudaArray resides on the host,
- * ::cudaMemcpy3D() will disregard that implication and silently correct the
- * kind, since a cudaArray can only reside on the device.
- *
- * If the source and destination are both arrays, ::cudaMemcpy3D() will return
- * an error if they do not have the same element size.
- *
- * The source and destination object may not overlap. If overlapping source
- * and destination objects are specified, undefined behavior will result.
- *
- * The source object must lie entirely within the region defined by \p srcPos
- * and \p extent. The destination object must lie entirely within the region
- * defined by \p dstPos and \p extent.
- *
- * ::cudaMemcpy3D() returns an error if the pitch of \p srcPtr or \p dstPtr
- * exceeds the maximum allowed. The pitch of a ::cudaPitchedPtr allocated
- * with ::cudaMalloc3D() will always be valid.
- *
- * \param p - 3D memory copy parameters
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorInvalidPitchValue,
- * ::cudaErrorInvalidMemcpyDirection
- * \notefnerr
- * \note_sync
- *
- * \sa ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaMemset3D, ::cudaMemcpy3DAsync,
- * ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
- * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
- * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
- * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
- * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
- * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
- * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
- * ::make_cudaExtent, ::make_cudaPos,
- * ::cuMemcpy3D
- */
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3DParms *p);
-
-/**
- * \brief Copies memory between devices
- *
- * Performs a 3D memory copy according to the parameters specified in
- * \p p. See the definition of the ::cudaMemcpy3DPeerParms structure
- * for documentation of its parameters.
- *
- * Note that this function is synchronous with respect to the host only if
- * the source or destination of the transfer is host memory. Note also
- * that this copy is serialized with respect to all pending and future
- * asynchronous work in the current device, the copy's source device,
- * and the copy's destination device (use ::cudaMemcpy3DPeerAsync to avoid
- * this synchronization).
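/* [Editor's sketch, not part of the original header] A host-to-device 3D copy
 * with the cudaMemcpy3DParms struct documented above: zero-initialize, set
 * exactly one source and one destination, and give the extent width in bytes
 * for linear memory. Element type float is an example assumption. */
#include <cuda_runtime.h>

int copy_3d(float *host, struct cudaPitchedPtr devPitched,
            size_t w, size_t h, size_t d) { /* w, h, d in elements */
    struct cudaMemcpy3DParms p = {0};
    p.srcPtr = make_cudaPitchedPtr(host, w * sizeof(float), w, h);
    p.dstPtr = devPitched;                  /* e.g. from cudaMalloc3D */
    p.extent = make_cudaExtent(w * sizeof(float), h, d);
    p.kind   = cudaMemcpyHostToDevice;
    return cudaMemcpy3D(&p) == cudaSuccess ? 0 : -1;
}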
- *
- * \param p - Parameters for the memory copy
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorInvalidDevice
- * \notefnerr
- * \note_sync
- *
- * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync, ::cudaMemcpyPeerAsync,
- * ::cudaMemcpy3DPeerAsync,
- * ::cuMemcpy3DPeer
- */
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p);
-
-/**
- * \brief Copies data between 3D objects
- *
-\code
-struct cudaExtent {
-    size_t width;
-    size_t height;
-    size_t depth;
-};
-struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d);
-
-struct cudaPos {
-    size_t x;
-    size_t y;
-    size_t z;
-};
-struct cudaPos make_cudaPos(size_t x, size_t y, size_t z);
-
-struct cudaMemcpy3DParms {
-    cudaArray_t srcArray;
-    struct cudaPos srcPos;
-    struct cudaPitchedPtr srcPtr;
-    cudaArray_t dstArray;
-    struct cudaPos dstPos;
-    struct cudaPitchedPtr dstPtr;
-    struct cudaExtent extent;
-    enum cudaMemcpyKind kind;
-};
-\endcode
- *
- * ::cudaMemcpy3DAsync() copies data between two 3D objects. The source and
- * destination objects may be in either host memory, device memory, or a CUDA
- * array. The source, destination, extent, and kind of copy performed are
- * specified by the ::cudaMemcpy3DParms struct which should be initialized to
- * zero before use:
-\code
-cudaMemcpy3DParms myParms = {0};
-\endcode
- *
- * The struct passed to ::cudaMemcpy3DAsync() must specify one of \p srcArray
- * or \p srcPtr and one of \p dstArray or \p dstPtr. Passing more than one
- * non-zero source or destination will cause ::cudaMemcpy3DAsync() to return an
- * error.
- *
- * The \p srcPos and \p dstPos fields are optional offsets into the source and
- * destination objects and are defined in units of each object's elements. The
- * element for a host or device pointer is assumed to be unsigned char.
- * For CUDA arrays, positions must be in the range [0, 2048) for any
- * dimension.
- *
- * The \p extent field defines the dimensions of the transferred area in
- * elements. If a CUDA array is participating in the copy, the extent is
- * defined in terms of that array's elements. If no CUDA array is
- * participating in the copy then the extents are defined in elements of
- * unsigned char.
- *
- * The \p kind field defines the direction of the copy. It must be one of
- * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
- * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
- * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
- * inferred from the pointer values. However, ::cudaMemcpyDefault is only
- * allowed on systems that support unified virtual addressing.
- * If ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, or ::cudaMemcpyDeviceToHost
- * is passed as the kind and a cudaArray is passed as the source or destination,
- * then, should the kind imply that the cudaArray resides on the host,
- * ::cudaMemcpy3DAsync() will disregard that implication and silently correct
- * the kind, since a cudaArray can only reside on the device.
- *
- * If the source and destination are both arrays, ::cudaMemcpy3DAsync() will
- * return an error if they do not have the same element size.
- *
- * The source and destination object may not overlap. If overlapping source
- * and destination objects are specified, undefined behavior will result.
- *
- * The source object must lie entirely within the region defined by \p srcPos
- * and \p extent.
The destination object must lie entirely within the region
- * defined by \p dstPos and \p extent.
- *
- * ::cudaMemcpy3DAsync() returns an error if the pitch of \p srcPtr or
- * \p dstPtr exceeds the maximum allowed. The pitch of a
- * ::cudaPitchedPtr allocated with ::cudaMalloc3D() will always be valid.
- *
- * ::cudaMemcpy3DAsync() is asynchronous with respect to the host, so
- * the call may return before the copy is complete. The copy can optionally
- * be associated with a stream by passing a non-zero \p stream argument. If
- * \p kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream
- * is non-zero, the copy may overlap with operations in other streams.
- *
- * The device version of this function only handles device to device copies and
- * cannot be given local or shared pointers.
- *
- * \param p - 3D memory copy parameters
- * \param stream - Stream identifier
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorInvalidPitchValue,
- * ::cudaErrorInvalidMemcpyDirection
- * \notefnerr
- * \note_async
- * \note_null_stream
- *
- * \sa ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaMemset3D, ::cudaMemcpy3D,
- * ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
- * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
- * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
- * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
- * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
- * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
- * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
- * ::make_cudaExtent, ::make_cudaPos,
- * ::cuMemcpy3DAsync
- */
-extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0));
-
-/**
- * \brief Copies memory between devices asynchronously
- *
- * Performs a 3D memory copy according to the parameters specified in
- * \p p. See the definition of the ::cudaMemcpy3DPeerParms structure
- * for documentation of its parameters.
- *
- * \param p - Parameters for the memory copy
- * \param stream - Stream identifier
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorInvalidDevice
- * \notefnerr
- * \note_async
- * \note_null_stream
- *
- * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync, ::cudaMemcpyPeerAsync,
- * ::cudaMemcpy3DPeerAsync,
- * ::cuMemcpy3DPeerAsync
- */
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream __dv(0));
-
-/**
- * \brief Gets free and total device memory
- *
- * Returns in \p *free and \p *total respectively, the free and total amount of
- * memory available for allocation by the device in bytes.
- *
- * \param free - Returned free memory in bytes
- * \param total - Returned total memory in bytes
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInitializationError,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorLaunchFailure
- * \notefnerr
- *
- * \sa
- * ::cuMemGetInfo
- */
-extern __host__ cudaError_t CUDARTAPI cudaMemGetInfo(size_t *free, size_t *total);
-
-/**
- * \brief Gets info about the specified cudaArray
- *
- * Returns in \p *desc, \p *extent and \p *flags respectively, the type, shape
- * and flags of \p array.
- *
- * Any of \p *desc, \p *extent and \p *flags may be specified as NULL.
- *
- * \param desc - Returned array type
- * \param extent - Returned array shape.
2D arrays will have depth of zero
- * \param flags - Returned array flags
- * \param array - The ::cudaArray to get info for
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue
- * \notefnerr
- *
- * \sa
- * ::cuArrayGetDescriptor,
- * ::cuArray3DGetDescriptor
- */
-extern __host__ cudaError_t CUDARTAPI cudaArrayGetInfo(struct cudaChannelFormatDesc *desc, struct cudaExtent *extent, unsigned int *flags, cudaArray_t array);
-
-/**
- * \brief Copies data between host and device
- *
- * Copies \p count bytes from the memory area pointed to by \p src to the
- * memory area pointed to by \p dst, where \p kind specifies the direction
- * of the copy, and must be one of ::cudaMemcpyHostToHost,
- * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
- * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
- * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
- * inferred from the pointer values. However, ::cudaMemcpyDefault is only
- * allowed on systems that support unified virtual addressing. Calling
- * ::cudaMemcpy() with dst and src pointers that do not match the direction of
- * the copy results in an undefined behavior.
- *
- * \param dst - Destination memory address
- * \param src - Source memory address
- * \param count - Size in bytes to copy
- * \param kind - Type of transfer
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorInvalidMemcpyDirection
- * \notefnerr
- *
- * \note_sync
- *
- * \sa ::cudaMemcpy2D, ::cudaMemcpyToArray,
- * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
- * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
- * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
- * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
- * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
- * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
- * ::cuMemcpyDtoH,
- * ::cuMemcpyHtoD,
- * ::cuMemcpyDtoD,
- * ::cuMemcpy
- */
-extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind);
-
-/**
- * \brief Copies memory between two devices
- *
- * Copies memory from one device to memory on another device. \p dst is the
- * base device pointer of the destination memory and \p dstDevice is the
- * destination device. \p src is the base device pointer of the source memory
- * and \p srcDevice is the source device. \p count specifies the number of bytes
- * to copy.
- *
- * Note that this function is asynchronous with respect to the host, but
- * serialized with respect to all pending and future asynchronous work in the
- * current device, \p srcDevice, and \p dstDevice (use ::cudaMemcpyPeerAsync
- * to avoid this synchronization).
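/* [Editor's sketch, not part of the original header] A round trip through
 * cudaMemcpy using cudaMemcpyDefault, which the documentation above
 * recommends on systems with unified virtual addressing. */
#include <cuda_runtime.h>

int roundtrip(void) {
    int h_in[256], h_out[256], *d;
    for (int i = 0; i < 256; i++) h_in[i] = i;
    if (cudaMalloc((void **)&d, sizeof(h_in)) != cudaSuccess) return -1;
    cudaMemcpy(d, h_in, sizeof(h_in), cudaMemcpyDefault);   /* host -> device */
    cudaMemcpy(h_out, d, sizeof(h_out), cudaMemcpyDefault); /* device -> host */
    cudaFree(d);
    return (h_out[255] == 255) ? 0 : -1;
}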
- *
- * \param dst - Destination device pointer
- * \param dstDevice - Destination device
- * \param src - Source device pointer
- * \param srcDevice - Source device
- * \param count - Size of memory copy in bytes
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorInvalidDevice
- * \notefnerr
- * \note_sync
- *
- * \sa ::cudaMemcpy, ::cudaMemcpyAsync, ::cudaMemcpyPeerAsync,
- * ::cudaMemcpy3DPeerAsync,
- * ::cuMemcpyPeer
- */
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyPeer(void *dst, int dstDevice, const void *src, int srcDevice, size_t count);
-
-/**
- * \brief Copies data between host and device
- *
- * Copies \p count bytes from the memory area pointed to by \p src to the
- * CUDA array \p dst starting at the upper left corner
- * (\p wOffset, \p hOffset), where \p kind specifies the direction
- * of the copy, and must be one of ::cudaMemcpyHostToHost,
- * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
- * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
- * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
- * inferred from the pointer values. However, ::cudaMemcpyDefault is only
- * allowed on systems that support unified virtual addressing.
- *
- * \param dst - Destination memory address
- * \param wOffset - Destination starting X offset
- * \param hOffset - Destination starting Y offset
- * \param src - Source memory address
- * \param count - Size in bytes to copy
- * \param kind - Type of transfer
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorInvalidMemcpyDirection
- * \notefnerr
- * \note_sync
- *
- * \sa ::cudaMemcpy, ::cudaMemcpy2D,
- * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
- * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
- * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
- * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
- * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
- * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
- * ::cuMemcpyHtoA,
- * ::cuMemcpyDtoA
- */
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind);
-
-/**
- * \brief Copies data between host and device
- *
- * Copies \p count bytes from the CUDA array \p src starting at the upper
- * left corner (\p wOffset, \p hOffset) to the memory area pointed to by \p dst,
- * where \p kind specifies the direction of the copy, and must be one of
- * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
- * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
- * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
- * inferred from the pointer values. However, ::cudaMemcpyDefault is only
- * allowed on systems that support unified virtual addressing.
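/* [Editor's sketch, not part of the original header] A device-to-device copy
 * per the cudaMemcpyPeer documentation above. Assumes at least two devices;
 * enabling peer access is optional for cudaMemcpyPeer but allows direct
 * transfers where supported. */
#include <cuda_runtime.h>

int peer_copy(void *dstOnDev1, const void *srcOnDev0, size_t n) {
    int canAccess = 0;
    cudaDeviceCanAccessPeer(&canAccess, 0, 1);
    if (canAccess) {
        cudaSetDevice(0);
        cudaDeviceEnablePeerAccess(1, 0); /* flags must be 0 */
    }
    return cudaMemcpyPeer(dstOnDev1, 1, srcOnDev0, 0, n) == cudaSuccess ? 0 : -1;
}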
- * - * \param dst - Destination memory address - * \param src - Source memory address - * \param wOffset - Source starting X offset - * \param hOffset - Source starting Y offset - * \param count - Size in bytes to copy - * \param kind - Type of transfer - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidMemcpyDirection - * \notefnerr - * \note_sync - * - * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, - * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, - * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, - * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, - * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, - * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, - * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, - * ::cuMemcpyAtoH, - * ::cuMemcpyAtoD - */ -extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind); - -/** - * \brief Copies data between host and device - * - * Copies \p count bytes from the CUDA array \p src starting at the upper - * left corner (\p wOffsetSrc, \p hOffsetSrc) to the CUDA array \p dst - * starting at the upper left corner (\p wOffsetDst, \p hOffsetDst) where - * \p kind specifies the direction of the copy, and must be one of - * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, - * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing - * ::cudaMemcpyDefault is recommended, in which case the type of transfer is - * inferred from the pointer values. However, ::cudaMemcpyDefault is only - * allowed on systems that support unified virtual addressing. - * - * \param dst - Destination memory address - * \param wOffsetDst - Destination starting X offset - * \param hOffsetDst - Destination starting Y offset - * \param src - Source memory address - * \param wOffsetSrc - Source starting X offset - * \param hOffsetSrc - Source starting Y offset - * \param count - Size in bytes to copy - * \param kind - Type of transfer - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidMemcpyDirection - * \notefnerr - * - * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, - * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, - * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, - * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, - * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, - * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, - * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, - * ::cuMemcpyAtoA - */ -extern __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)); - -/** - * \brief Copies data between host and device - * - * Copies a matrix (\p height rows of \p width bytes each) from the memory - * area pointed to by \p src to the memory area pointed to by \p dst, where - * \p kind specifies the direction of the copy, and must be one of - * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, - * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing - * ::cudaMemcpyDefault is recommended, in which case the type of transfer is - * inferred from the pointer values. 
However, ::cudaMemcpyDefault is only - * allowed on systems that support unified virtual addressing. \p dpitch and - * \p spitch are the widths in memory in bytes of the 2D arrays pointed to by - * \p dst and \p src, including any padding added to the end of each row. The - * memory areas may not overlap. \p width must not exceed either \p dpitch or - * \p spitch. Calling ::cudaMemcpy2D() with \p dst and \p src pointers that do - * not match the direction of the copy results in an undefined behavior. - * ::cudaMemcpy2D() returns an error if \p dpitch or \p spitch exceeds - * the maximum allowed. - * - * \param dst - Destination memory address - * \param dpitch - Pitch of destination memory - * \param src - Source memory address - * \param spitch - Pitch of source memory - * \param width - Width of matrix transfer (columns in bytes) - * \param height - Height of matrix transfer (rows) - * \param kind - Type of transfer - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidPitchValue, - * ::cudaErrorInvalidMemcpyDirection - * \notefnerr - * - * \sa ::cudaMemcpy, ::cudaMemcpyToArray, - * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, - * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, - * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, - * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, - * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, - * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, - * ::cuMemcpy2D, - * ::cuMemcpy2DUnaligned - */ -extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind); - -/** - * \brief Copies data between host and device - * - * Copies a matrix (\p height rows of \p width bytes each) from the memory - * area pointed to by \p src to the CUDA array \p dst starting at the - * upper left corner (\p wOffset, \p hOffset) where \p kind specifies the - * direction of the copy, and must be one of ::cudaMemcpyHostToHost, - * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, - * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing - * ::cudaMemcpyDefault is recommended, in which case the type of transfer is - * inferred from the pointer values. However, ::cudaMemcpyDefault is only - * allowed on systems that support unified virtual addressing. - * \p spitch is the width in memory in bytes of the 2D array pointed to by - * \p src, including any padding added to the end of each row. \p wOffset + - * \p width must not exceed the width of the CUDA array \p dst. \p width must - * not exceed \p spitch. ::cudaMemcpy2DToArray() returns an error if \p spitch - * exceeds the maximum allowed. 
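/* [Editor's sketch, not part of the original header] cudaMemcpy2D between a
 * padded host matrix and a pitched device allocation; note that the width and
 * both pitches are given in bytes, as documented above. */
#include <cuda_runtime.h>

int copy_2d(const float *host, size_t hostPitchBytes, size_t width, size_t height) {
    float *dev;
    size_t devPitch;
    if (cudaMallocPitch((void **)&dev, &devPitch, width * sizeof(float), height) != cudaSuccess)
        return -1;
    cudaMemcpy2D(dev, devPitch, host, hostPitchBytes,
                 width * sizeof(float), height, cudaMemcpyHostToDevice);
    cudaFree(dev);
    return 0;
}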
- * - * \param dst - Destination memory address - * \param wOffset - Destination starting X offset - * \param hOffset - Destination starting Y offset - * \param src - Source memory address - * \param spitch - Pitch of source memory - * \param width - Width of matrix transfer (columns in bytes) - * \param height - Height of matrix transfer (rows) - * \param kind - Type of transfer - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidPitchValue, - * ::cudaErrorInvalidMemcpyDirection - * \notefnerr - * \note_sync - * - * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, - * ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, - * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, - * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, - * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, - * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, - * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, - * ::cuMemcpy2D, - * ::cuMemcpy2DUnaligned - */ -extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind); - -/** - * \brief Copies data between host and device - * - * Copies a matrix (\p height rows of \p width bytes each) from the CUDA - * array \p srcArray starting at the upper left corner - * (\p wOffset, \p hOffset) to the memory area pointed to by \p dst, where - * \p kind specifies the direction of the copy, and must be one of - * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, - * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing - * ::cudaMemcpyDefault is recommended, in which case the type of transfer is - * inferred from the pointer values. However, ::cudaMemcpyDefault is only - * allowed on systems that support unified virtual addressing. \p dpitch is the - * width in memory in bytes of the 2D array pointed to by \p dst, including any - * padding added to the end of each row. \p wOffset + \p width must not exceed - * the width of the CUDA array \p src. \p width must not exceed \p dpitch. - * ::cudaMemcpy2DFromArray() returns an error if \p dpitch exceeds the maximum - * allowed. 
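- *
- * A minimal sketch of the reverse transfer (the source array and its
- * contents are assumed; names are illustrative):
- * \code
- * float h_dst[64][256];
- * cudaMemcpy2DFromArray(h_dst, 256 * sizeof(float), srcArray, 0, 0,
- *                       256 * sizeof(float), 64, cudaMemcpyDeviceToHost);
- * \endcode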
- * - * \param dst - Destination memory address - * \param dpitch - Pitch of destination memory - * \param src - Source memory address - * \param wOffset - Source starting X offset - * \param hOffset - Source starting Y offset - * \param width - Width of matrix transfer (columns in bytes) - * \param height - Height of matrix transfer (rows) - * \param kind - Type of transfer - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidPitchValue, - * ::cudaErrorInvalidMemcpyDirection - * \notefnerr - * \note_sync - * - * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, - * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, - * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, - * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, - * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, - * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, - * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, - * ::cuMemcpy2D, - * ::cuMemcpy2DUnaligned - */ -extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind); - -/** - * \brief Copies data between host and device - * - * Copies a matrix (\p height rows of \p width bytes each) from the CUDA - * array \p srcArray starting at the upper left corner - * (\p wOffsetSrc, \p hOffsetSrc) to the CUDA array \p dst starting at - * the upper left corner (\p wOffsetDst, \p hOffsetDst), where \p kind - * specifies the direction of the copy, and must be one of - * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, - * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing - * ::cudaMemcpyDefault is recommended, in which case the type of transfer is - * inferred from the pointer values. However, ::cudaMemcpyDefault is only - * allowed on systems that support unified virtual addressing. - * \p wOffsetDst + \p width must not exceed the width of the CUDA array \p dst. - * \p wOffsetSrc + \p width must not exceed the width of the CUDA array \p src. 
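- *
- * For example, copying a 32x32 region of float data between two previously
- * created arrays (a sketch; both arrays are assumptions for illustration):
- * \code
- * cudaMemcpy2DArrayToArray(dstArray, 0, 0, srcArray, 0, 0,
- *                          32 * sizeof(float), 32,
- *                          cudaMemcpyDeviceToDevice);
- * \endcode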
- * - * \param dst - Destination memory address - * \param wOffsetDst - Destination starting X offset - * \param hOffsetDst - Destination starting Y offset - * \param src - Source memory address - * \param wOffsetSrc - Source starting X offset - * \param hOffsetSrc - Source starting Y offset - * \param width - Width of matrix transfer (columns in bytes) - * \param height - Height of matrix transfer (rows) - * \param kind - Type of transfer - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidMemcpyDirection - * \notefnerr - * \note_sync - * - * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, - * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, - * ::cudaMemcpyArrayToArray, ::cudaMemcpyToSymbol, - * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, - * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, - * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, - * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, - * ::cuMemcpy2D, - * ::cuMemcpy2DUnaligned - */ -extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)); - -/** - * \brief Copies data to the given symbol on the device - * - * Copies \p count bytes from the memory area pointed to by \p src - * to the memory area pointed to by \p offset bytes from the start of symbol - * \p symbol. The memory areas may not overlap. \p symbol is a variable that - * resides in global or constant memory space. \p kind can be either - * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. - * Passing ::cudaMemcpyDefault is recommended, in which case the type of - * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault - * is only allowed on systems that support unified virtual addressing. - * - * \param symbol - Device symbol address - * \param src - Source memory address - * \param count - Size in bytes to copy - * \param offset - Offset from start of symbol in bytes - * \param kind - Type of transfer - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidSymbol, - * ::cudaErrorInvalidMemcpyDirection, - * ::cudaErrorNoKernelImageForDevice - * \notefnerr - * \note_sync - * \note_string_api_deprecation - * - * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, - * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, - * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, - * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, - * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, - * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, - * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, - * ::cuMemcpy, - * ::cuMemcpyHtoD, - * ::cuMemcpyDtoD - */ -extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(const void *symbol, const void *src, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice)); - -/** - * \brief Copies data from the given symbol on the device - * - * Copies \p count bytes from the memory area pointed to by \p offset bytes - * from the start of symbol \p symbol to the memory area pointed to by \p dst. - * The memory areas may not overlap. \p symbol is a variable that - * resides in global or constant memory space. 
\p kind can be either - * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. - * Passing ::cudaMemcpyDefault is recommended, in which case the type of - * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault - * is only allowed on systems that support unified virtual addressing. - * - * \param dst - Destination memory address - * \param symbol - Device symbol address - * \param count - Size in bytes to copy - * \param offset - Offset from start of symbol in bytes - * \param kind - Type of transfer - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidSymbol, - * ::cudaErrorInvalidMemcpyDirection, - * ::cudaErrorNoKernelImageForDevice - * \notefnerr - * \note_sync - * \note_string_api_deprecation - * - * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, - * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, - * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, - * ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, - * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, - * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, - * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, - * ::cuMemcpy, - * ::cuMemcpyDtoH, - * ::cuMemcpyDtoD - */ -extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(void *dst, const void *symbol, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost)); - - -/** - * \brief Copies data between host and device - * - * Copies \p count bytes from the memory area pointed to by \p src to the - * memory area pointed to by \p dst, where \p kind specifies the - * direction of the copy, and must be one of ::cudaMemcpyHostToHost, - * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, - * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing - * ::cudaMemcpyDefault is recommended, in which case the type of transfer is - * inferred from the pointer values. However, ::cudaMemcpyDefault is only - * allowed on systems that support unified virtual addressing. - * - * The memory areas may not overlap. Calling ::cudaMemcpyAsync() with \p dst and - * \p src pointers that do not match the direction of the copy results in an - * undefined behavior. - * - * ::cudaMemcpyAsync() is asynchronous with respect to the host, so the call - * may return before the copy is complete. The copy can optionally be - * associated to a stream by passing a non-zero \p stream argument. If \p kind - * is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and the \p stream is - * non-zero, the copy may overlap with operations in other streams. - * - * The device version of this function only handles device to device copies and - * cannot be given local or shared pointers. 
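- *
- * A minimal overlap sketch (all names are illustrative; pinned host memory
- * is used so the copy can actually proceed asynchronously):
- * \code
- * float *h_buf, *d_buf;
- * size_t bytes = 1024 * sizeof(float);
- * cudaStream_t stream;
- * cudaStreamCreate(&stream);
- * cudaMallocHost((void **)&h_buf, bytes);   // pinned host allocation
- * cudaMalloc((void **)&d_buf, bytes);
- * cudaMemcpyAsync(d_buf, h_buf, bytes, cudaMemcpyHostToDevice, stream);
- * cudaStreamSynchronize(stream);            // wait before reusing h_buf
- * \endcode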
- * - * \param dst - Destination memory address - * \param src - Source memory address - * \param count - Size in bytes to copy - * \param kind - Type of transfer - * \param stream - Stream identifier - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidMemcpyDirection - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, - * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, - * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, - * ::cudaMemcpyFromSymbol, ::cudaMemcpy2DAsync, - * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, - * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, - * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync - * ::cuMemcpyAsync, - * ::cuMemcpyDtoHAsync, - * ::cuMemcpyHtoDAsync, - * ::cuMemcpyDtoDAsync - */ -extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); - -/** - * \brief Copies memory between two devices asynchronously. - * - * Copies memory from one device to memory on another device. \p dst is the - * base device pointer of the destination memory and \p dstDevice is the - * destination device. \p src is the base device pointer of the source memory - * and \p srcDevice is the source device. \p count specifies the number of bytes - * to copy. - * - * Note that this function is asynchronous with respect to the host and all work - * on other devices. - * - * \param dst - Destination device pointer - * \param dstDevice - Destination device - * \param src - Source device pointer - * \param srcDevice - Source device - * \param count - Size of memory copy in bytes - * \param stream - Stream identifier - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidDevice - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync, - * ::cudaMemcpy3DPeerAsync, - * ::cuMemcpyPeerAsync - */ -extern __host__ cudaError_t CUDARTAPI cudaMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice, size_t count, cudaStream_t stream __dv(0)); - -/** - * \brief Copies data between host and device - * - * Copies \p count bytes from the memory area pointed to by \p src to the - * CUDA array \p dst starting at the upper left corner - * (\p wOffset, \p hOffset), where \p kind specifies the - * direction of the copy, and must be one of ::cudaMemcpyHostToHost, - * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, - * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing - * ::cudaMemcpyDefault is recommended, in which case the type of transfer is - * inferred from the pointer values. However, ::cudaMemcpyDefault is only - * allowed on systems that support unified virtual addressing. - * - * ::cudaMemcpyToArrayAsync() is asynchronous with respect to the host, so - * the call may return before the copy is complete. The copy can optionally - * be associated to a stream by passing a non-zero \p stream argument. If \p - * kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream - * is non-zero, the copy may overlap with operations in other streams. 
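- *
- * Illustrative use (the array, pinned host buffer and stream are assumed to
- * exist):
- * \code
- * cudaMemcpyToArrayAsync(dstArray, 0, 0, h_src, 1024 * sizeof(float),
- *                        cudaMemcpyHostToDevice, stream);
- * \endcode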
- *
- * \param dst - Destination memory address
- * \param wOffset - Destination starting X offset
- * \param hOffset - Destination starting Y offset
- * \param src - Source memory address
- * \param count - Size in bytes to copy
- * \param kind - Type of transfer
- * \param stream - Stream identifier
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorInvalidMemcpyDirection
- * \notefnerr
- * \note_async
- * \note_null_stream
- *
- * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
- * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
- * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
- * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
- * ::cudaMemcpy2DToArrayAsync,
- * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
- * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
- * ::cuMemcpyHtoAAsync,
- * ::cuMemcpy2DAsync
- */
-extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
-
-/**
- * \brief Copies data between host and device
- *
- * Copies \p count bytes from the CUDA array \p src starting at the upper
- * left corner (\p wOffset, \p hOffset) to the memory area pointed to by \p dst,
- * where \p kind specifies the direction of the copy, and must be one of
- * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
- * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
- * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
- * inferred from the pointer values. However, ::cudaMemcpyDefault is only
- * allowed on systems that support unified virtual addressing.
- *
- * ::cudaMemcpyFromArrayAsync() is asynchronous with respect to the host, so
- * the call may return before the copy is complete. The copy can optionally
- * be associated to a stream by passing a non-zero \p stream argument. If \p
- * kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream
- * is non-zero, the copy may overlap with operations in other streams.
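- *
- * Illustrative use, mirroring the transfer above (names assumed):
- * \code
- * cudaMemcpyFromArrayAsync(h_dst, srcArray, 0, 0, 1024 * sizeof(float),
- *                          cudaMemcpyDeviceToHost, stream);
- * \endcode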
- * - * \param dst - Destination memory address - * \param src - Source memory address - * \param wOffset - Source starting X offset - * \param hOffset - Source starting Y offset - * \param count - Size in bytes to copy - * \param kind - Type of transfer - * \param stream - Stream identifier - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidMemcpyDirection - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, - * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, - * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, - * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, - * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, - * ::cudaMemcpy2DFromArrayAsync, - * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, - * ::cuMemcpyAtoHAsync, - * ::cuMemcpy2DAsync - */ -extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); - -/** - * \brief Copies data between host and device - * - * Copies a matrix (\p height rows of \p width bytes each) from the memory - * area pointed to by \p src to the memory area pointed to by \p dst, where - * \p kind specifies the direction of the copy, and must be one of - * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, - * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing - * ::cudaMemcpyDefault is recommended, in which case the type of transfer is - * inferred from the pointer values. However, ::cudaMemcpyDefault is only - * allowed on systems that support unified virtual addressing. - * \p dpitch and \p spitch are the widths in memory in bytes of the 2D arrays - * pointed to by \p dst and \p src, including any padding added to the end of - * each row. The memory areas may not overlap. \p width must not exceed either - * \p dpitch or \p spitch. - * - * Calling ::cudaMemcpy2DAsync() with \p dst and \p src pointers that do not - * match the direction of the copy results in an undefined behavior. - * ::cudaMemcpy2DAsync() returns an error if \p dpitch or \p spitch is greater - * than the maximum allowed. - * - * ::cudaMemcpy2DAsync() is asynchronous with respect to the host, so - * the call may return before the copy is complete. The copy can optionally - * be associated to a stream by passing a non-zero \p stream argument. If - * \p kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and - * \p stream is non-zero, the copy may overlap with operations in other - * streams. - * - * The device version of this function only handles device to device copies and - * cannot be given local or shared pointers. 
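- *
- * A sketch combining ::cudaMallocPitch with an asynchronous 2D copy (the
- * pinned host image \c h_img and the stream are assumptions for
- * illustration):
- * \code
- * float *d_img;
- * size_t dpitch;
- * cudaMallocPitch((void **)&d_img, &dpitch, 640 * sizeof(float), 480);
- * cudaMemcpy2DAsync(d_img, dpitch, h_img, 640 * sizeof(float),
- *                   640 * sizeof(float), 480,
- *                   cudaMemcpyHostToDevice, stream);
- * \endcode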
- * - * \param dst - Destination memory address - * \param dpitch - Pitch of destination memory - * \param src - Source memory address - * \param spitch - Pitch of source memory - * \param width - Width of matrix transfer (columns in bytes) - * \param height - Height of matrix transfer (rows) - * \param kind - Type of transfer - * \param stream - Stream identifier - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidPitchValue, - * ::cudaErrorInvalidMemcpyDirection - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, - * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, - * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, - * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, - * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, - * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, - * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, - * ::cuMemcpy2DAsync - */ -extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); - -/** - * \brief Copies data between host and device - * - * Copies a matrix (\p height rows of \p width bytes each) from the memory - * area pointed to by \p src to the CUDA array \p dst starting at the - * upper left corner (\p wOffset, \p hOffset) where \p kind specifies the - * direction of the copy, and must be one of ::cudaMemcpyHostToHost, - * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, - * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing - * ::cudaMemcpyDefault is recommended, in which case the type of transfer is - * inferred from the pointer values. However, ::cudaMemcpyDefault is only - * allowed on systems that support unified virtual addressing. - * \p spitch is the width in memory in bytes of the 2D array pointed to by - * \p src, including any padding added to the end of each row. \p wOffset + - * \p width must not exceed the width of the CUDA array \p dst. \p width must - * not exceed \p spitch. ::cudaMemcpy2DToArrayAsync() returns an error if - * \p spitch exceeds the maximum allowed. - * - * ::cudaMemcpy2DToArrayAsync() is asynchronous with respect to the host, so - * the call may return before the copy is complete. The copy can optionally - * be associated to a stream by passing a non-zero \p stream argument. If - * \p kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and - * \p stream is non-zero, the copy may overlap with operations in other - * streams. 
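- *
- * Illustrative asynchronous variant of the host-to-array copy shown
- * earlier (array, host buffer and stream are assumed):
- * \code
- * cudaMemcpy2DToArrayAsync(dstArray, 0, 0, h_src, 256 * sizeof(float),
- *                          256 * sizeof(float), 64,
- *                          cudaMemcpyHostToDevice, stream);
- * \endcode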
- * - * \param dst - Destination memory address - * \param wOffset - Destination starting X offset - * \param hOffset - Destination starting Y offset - * \param src - Source memory address - * \param spitch - Pitch of source memory - * \param width - Width of matrix transfer (columns in bytes) - * \param height - Height of matrix transfer (rows) - * \param kind - Type of transfer - * \param stream - Stream identifier - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidPitchValue, - * ::cudaErrorInvalidMemcpyDirection - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, - * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, - * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, - * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, - * ::cudaMemcpyToArrayAsync, - * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, - * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, - * ::cuMemcpy2DAsync - */ -extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); - -/** - * \brief Copies data between host and device - * - * Copies a matrix (\p height rows of \p width bytes each) from the CUDA - * array \p srcArray starting at the upper left corner - * (\p wOffset, \p hOffset) to the memory area pointed to by \p dst, where - * \p kind specifies the direction of the copy, and must be one of - * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, - * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing - * ::cudaMemcpyDefault is recommended, in which case the type of transfer is - * inferred from the pointer values. However, ::cudaMemcpyDefault is only - * allowed on systems that support unified virtual addressing. - * \p dpitch is the width in memory in bytes of the 2D - * array pointed to by \p dst, including any padding added to the end of each - * row. \p wOffset + \p width must not exceed the width of the CUDA array - * \p src. \p width must not exceed \p dpitch. ::cudaMemcpy2DFromArrayAsync() - * returns an error if \p dpitch exceeds the maximum allowed. - * - * ::cudaMemcpy2DFromArrayAsync() is asynchronous with respect to the host, so - * the call may return before the copy is complete. The copy can optionally be - * associated to a stream by passing a non-zero \p stream argument. If \p kind - * is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream is - * non-zero, the copy may overlap with operations in other streams. 
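- *
- * Illustrative use (names assumed):
- * \code
- * cudaMemcpy2DFromArrayAsync(h_dst, 256 * sizeof(float), srcArray, 0, 0,
- *                            256 * sizeof(float), 64,
- *                            cudaMemcpyDeviceToHost, stream);
- * \endcode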
- * - * \param dst - Destination memory address - * \param dpitch - Pitch of destination memory - * \param src - Source memory address - * \param wOffset - Source starting X offset - * \param hOffset - Source starting Y offset - * \param width - Width of matrix transfer (columns in bytes) - * \param height - Height of matrix transfer (rows) - * \param kind - Type of transfer - * \param stream - Stream identifier - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidPitchValue, - * ::cudaErrorInvalidMemcpyDirection - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, - * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, - * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, - * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, - * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, - * ::cudaMemcpyFromArrayAsync, - * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, - * ::cuMemcpy2DAsync - */ -extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); - -/** - * \brief Copies data to the given symbol on the device - * - * Copies \p count bytes from the memory area pointed to by \p src - * to the memory area pointed to by \p offset bytes from the start of symbol - * \p symbol. The memory areas may not overlap. \p symbol is a variable that - * resides in global or constant memory space. \p kind can be either - * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. - * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer - * is inferred from the pointer values. However, ::cudaMemcpyDefault is only - * allowed on systems that support unified virtual addressing. - * - * ::cudaMemcpyToSymbolAsync() is asynchronous with respect to the host, so - * the call may return before the copy is complete. The copy can optionally - * be associated to a stream by passing a non-zero \p stream argument. If - * \p kind is ::cudaMemcpyHostToDevice and \p stream is non-zero, the copy - * may overlap with operations in other streams. 
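- *
- * A sketch with a hypothetical __constant__ symbol (the symbol, host data
- * and stream are assumptions for illustration):
- * \code
- * __constant__ float coeffs[16];
- *
- * float h_coeffs[16] = {1.0f};
- * cudaMemcpyToSymbolAsync(coeffs, h_coeffs, sizeof(h_coeffs), 0,
- *                         cudaMemcpyHostToDevice, stream);
- * \endcode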
- * - * \param symbol - Device symbol address - * \param src - Source memory address - * \param count - Size in bytes to copy - * \param offset - Offset from start of symbol in bytes - * \param kind - Type of transfer - * \param stream - Stream identifier - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidSymbol, - * ::cudaErrorInvalidMemcpyDirection, - * ::cudaErrorNoKernelImageForDevice - * \notefnerr - * \note_async - * \note_null_stream - * \note_string_api_deprecation - * - * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, - * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, - * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, - * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, - * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, - * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, - * ::cudaMemcpyFromSymbolAsync, - * ::cuMemcpyAsync, - * ::cuMemcpyHtoDAsync, - * ::cuMemcpyDtoDAsync - */ -extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(const void *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); - -/** - * \brief Copies data from the given symbol on the device - * - * Copies \p count bytes from the memory area pointed to by \p offset bytes - * from the start of symbol \p symbol to the memory area pointed to by \p dst. - * The memory areas may not overlap. \p symbol is a variable that resides in - * global or constant memory space. \p kind can be either - * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. - * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer - * is inferred from the pointer values. However, ::cudaMemcpyDefault is only - * allowed on systems that support unified virtual addressing. - * - * ::cudaMemcpyFromSymbolAsync() is asynchronous with respect to the host, so - * the call may return before the copy is complete. The copy can optionally be - * associated to a stream by passing a non-zero \p stream argument. If \p kind - * is ::cudaMemcpyDeviceToHost and \p stream is non-zero, the copy may overlap - * with operations in other streams. 
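- *
- * Reading the same hypothetical symbol back into host memory:
- * \code
- * float h_out[16];
- * cudaMemcpyFromSymbolAsync(h_out, coeffs, sizeof(h_out), 0,
- *                           cudaMemcpyDeviceToHost, stream);
- * \endcode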
- * - * \param dst - Destination memory address - * \param symbol - Device symbol address - * \param count - Size in bytes to copy - * \param offset - Offset from start of symbol in bytes - * \param kind - Type of transfer - * \param stream - Stream identifier - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidSymbol, - * ::cudaErrorInvalidMemcpyDirection, - * ::cudaErrorNoKernelImageForDevice - * \notefnerr - * \note_async - * \note_null_stream - * \note_string_api_deprecation - * - * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, - * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, - * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, - * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, - * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, - * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, - * ::cudaMemcpyToSymbolAsync, - * ::cuMemcpyAsync, - * ::cuMemcpyDtoHAsync, - * ::cuMemcpyDtoDAsync - */ -extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(void *dst, const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); - - -/** - * \brief Initializes or sets device memory to a value - * - * Fills the first \p count bytes of the memory area pointed to by \p devPtr - * with the constant byte value \p value. - * - * Note that this function is asynchronous with respect to the host unless - * \p devPtr refers to pinned host memory. - * - * \param devPtr - Pointer to device memory - * \param value - Value to set for each byte of specified memory - * \param count - Size in bytes to set - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * \notefnerr - * \note_memset - * - * \sa - * ::cuMemsetD8, - * ::cuMemsetD16, - * ::cuMemsetD32 - */ -extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value, size_t count); - -/** - * \brief Initializes or sets device memory to a value - * - * Sets to the specified value \p value a matrix (\p height rows of \p width - * bytes each) pointed to by \p dstPtr. \p pitch is the width in bytes of the - * 2D array pointed to by \p dstPtr, including any padding added to the end - * of each row. This function performs fastest when the pitch is one that has - * been passed back by ::cudaMallocPitch(). - * - * Note that this function is asynchronous with respect to the host unless - * \p devPtr refers to pinned host memory. - * - * \param devPtr - Pointer to 2D device memory - * \param pitch - Pitch in bytes of 2D device memory - * \param value - Value to set for each byte of specified memory - * \param width - Width of matrix set (columns in bytes) - * \param height - Height of matrix set (rows) - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * \notefnerr - * \note_memset - * - * \sa ::cudaMemset, ::cudaMemset3D, ::cudaMemsetAsync, - * ::cudaMemset2DAsync, ::cudaMemset3DAsync, - * ::cuMemsetD2D8, - * ::cuMemsetD2D16, - * ::cuMemsetD2D32 - */ -extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch, int value, size_t width, size_t height); - -/** - * \brief Initializes or sets device memory to a value - * - * Initializes each element of a 3D array to the specified value \p value. - * The object to initialize is defined by \p pitchedDevPtr. The \p pitch field - * of \p pitchedDevPtr is the width in memory in bytes of the 3D array pointed - * to by \p pitchedDevPtr, including any padding added to the end of each row. 
- * The \p xsize field specifies the logical width of each row in bytes, while - * the \p ysize field specifies the height of each 2D slice in rows. - * - * The extents of the initialized region are specified as a \p width in bytes, - * a \p height in rows, and a \p depth in slices. - * - * Extents with \p width greater than or equal to the \p xsize of - * \p pitchedDevPtr may perform significantly faster than extents narrower - * than the \p xsize. Secondarily, extents with \p height equal to the - * \p ysize of \p pitchedDevPtr will perform faster than when the \p height is - * shorter than the \p ysize. - * - * This function performs fastest when the \p pitchedDevPtr has been allocated - * by ::cudaMalloc3D(). - * - * Note that this function is asynchronous with respect to the host unless - * \p pitchedDevPtr refers to pinned host memory. - * - * \param pitchedDevPtr - Pointer to pitched device memory - * \param value - Value to set for each byte of specified memory - * \param extent - Size parameters for where to set device memory (\p width field in bytes) - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * \notefnerr - * \note_memset - * - * \sa ::cudaMemset, ::cudaMemset2D, - * ::cudaMemsetAsync, ::cudaMemset2DAsync, ::cudaMemset3DAsync, - * ::cudaMalloc3D, ::make_cudaPitchedPtr, - * ::make_cudaExtent - */ -extern __host__ cudaError_t CUDARTAPI cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent); - -/** - * \brief Initializes or sets device memory to a value - * - * Fills the first \p count bytes of the memory area pointed to by \p devPtr - * with the constant byte value \p value. - * - * ::cudaMemsetAsync() is asynchronous with respect to the host, so - * the call may return before the memset is complete. The operation can optionally - * be associated to a stream by passing a non-zero \p stream argument. - * If \p stream is non-zero, the operation may overlap with operations in other streams. - * - * The device version of this function only handles device to device copies and - * cannot be given local or shared pointers. - * - * \param devPtr - Pointer to device memory - * \param value - Value to set for each byte of specified memory - * \param count - Size in bytes to set - * \param stream - Stream identifier - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * \notefnerr - * \note_memset - * \note_null_stream - * - * \sa ::cudaMemset, ::cudaMemset2D, ::cudaMemset3D, - * ::cudaMemset2DAsync, ::cudaMemset3DAsync, - * ::cuMemsetD8Async, - * ::cuMemsetD16Async, - * ::cuMemsetD32Async - */ -extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream __dv(0)); - -/** - * \brief Initializes or sets device memory to a value - * - * Sets to the specified value \p value a matrix (\p height rows of \p width - * bytes each) pointed to by \p dstPtr. \p pitch is the width in bytes of the - * 2D array pointed to by \p dstPtr, including any padding added to the end - * of each row. This function performs fastest when the pitch is one that has - * been passed back by ::cudaMallocPitch(). - * - * ::cudaMemset2DAsync() is asynchronous with respect to the host, so - * the call may return before the memset is complete. The operation can optionally - * be associated to a stream by passing a non-zero \p stream argument. - * If \p stream is non-zero, the operation may overlap with operations in other streams. 
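- *
- * For instance, clearing a pitched image allocation (the pointer, pitch and
- * stream are assumed to exist):
- * \code
- * cudaMemset2DAsync(d_img, dpitch, 0, 640 * sizeof(float), 480, stream);
- * \endcode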
- * - * The device version of this function only handles device to device copies and - * cannot be given local or shared pointers. - * - * \param devPtr - Pointer to 2D device memory - * \param pitch - Pitch in bytes of 2D device memory - * \param value - Value to set for each byte of specified memory - * \param width - Width of matrix set (columns in bytes) - * \param height - Height of matrix set (rows) - * \param stream - Stream identifier - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * \notefnerr - * \note_memset - * \note_null_stream - * - * \sa ::cudaMemset, ::cudaMemset2D, ::cudaMemset3D, - * ::cudaMemsetAsync, ::cudaMemset3DAsync, - * ::cuMemsetD2D8Async, - * ::cuMemsetD2D16Async, - * ::cuMemsetD2D32Async - */ -extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream __dv(0)); - -/** - * \brief Initializes or sets device memory to a value - * - * Initializes each element of a 3D array to the specified value \p value. - * The object to initialize is defined by \p pitchedDevPtr. The \p pitch field - * of \p pitchedDevPtr is the width in memory in bytes of the 3D array pointed - * to by \p pitchedDevPtr, including any padding added to the end of each row. - * The \p xsize field specifies the logical width of each row in bytes, while - * the \p ysize field specifies the height of each 2D slice in rows. - * - * The extents of the initialized region are specified as a \p width in bytes, - * a \p height in rows, and a \p depth in slices. - * - * Extents with \p width greater than or equal to the \p xsize of - * \p pitchedDevPtr may perform significantly faster than extents narrower - * than the \p xsize. Secondarily, extents with \p height equal to the - * \p ysize of \p pitchedDevPtr will perform faster than when the \p height is - * shorter than the \p ysize. - * - * This function performs fastest when the \p pitchedDevPtr has been allocated - * by ::cudaMalloc3D(). - * - * ::cudaMemset3DAsync() is asynchronous with respect to the host, so - * the call may return before the memset is complete. The operation can optionally - * be associated to a stream by passing a non-zero \p stream argument. - * If \p stream is non-zero, the operation may overlap with operations in other streams. - * - * The device version of this function only handles device to device copies and - * cannot be given local or shared pointers. - * - * \param pitchedDevPtr - Pointer to pitched device memory - * \param value - Value to set for each byte of specified memory - * \param extent - Size parameters for where to set device memory (\p width field in bytes) - * \param stream - Stream identifier - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * \notefnerr - * \note_memset - * \note_null_stream - * - * \sa ::cudaMemset, ::cudaMemset2D, ::cudaMemset3D, - * ::cudaMemsetAsync, ::cudaMemset2DAsync, - * ::cudaMalloc3D, ::make_cudaPitchedPtr, - * ::make_cudaExtent - */ -extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream __dv(0)); - -/** - * \brief Finds the address associated with a CUDA symbol - * - * Returns in \p *devPtr the address of symbol \p symbol on the device. - * \p symbol is a variable that resides in global or constant memory space. 
- * If \p symbol cannot be found, or if \p symbol is not declared in the - * global or constant memory space, \p *devPtr is unchanged and the error - * ::cudaErrorInvalidSymbol is returned. - * - * \param devPtr - Return device pointer associated with symbol - * \param symbol - Device symbol address - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidSymbol, - * ::cudaErrorNoKernelImageForDevice - * \notefnerr - * \note_string_api_deprecation - * - * \sa - * \ref ::cudaGetSymbolAddress(void**, const T&) "cudaGetSymbolAddress (C++ API)", - * \ref ::cudaGetSymbolSize(size_t*, const void*) "cudaGetSymbolSize (C API)", - * ::cuModuleGetGlobal - */ -extern __host__ cudaError_t CUDARTAPI cudaGetSymbolAddress(void **devPtr, const void *symbol); - -/** - * \brief Finds the size of the object associated with a CUDA symbol - * - * Returns in \p *size the size of symbol \p symbol. \p symbol is a variable that - * resides in global or constant memory space. If \p symbol cannot be found, or - * if \p symbol is not declared in global or constant memory space, \p *size is - * unchanged and the error ::cudaErrorInvalidSymbol is returned. - * - * \param size - Size of object associated with symbol - * \param symbol - Device symbol address - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidSymbol, - * ::cudaErrorNoKernelImageForDevice - * \notefnerr - * \note_string_api_deprecation - * - * \sa - * \ref ::cudaGetSymbolAddress(void**, const void*) "cudaGetSymbolAddress (C API)", - * \ref ::cudaGetSymbolSize(size_t*, const T&) "cudaGetSymbolSize (C++ API)", - * ::cuModuleGetGlobal - */ -extern __host__ cudaError_t CUDARTAPI cudaGetSymbolSize(size_t *size, const void *symbol); - -/** - * \brief Prefetches memory to the specified destination device - * - * Prefetches memory to the specified destination device. \p devPtr is the - * base device pointer of the memory to be prefetched and \p dstDevice is the - * destination device. \p count specifies the number of bytes to copy. \p stream - * is the stream in which the operation is enqueued. The memory range must refer - * to managed memory allocated via ::cudaMallocManaged or declared via __managed__ variables. - * - * Passing in cudaCpuDeviceId for \p dstDevice will prefetch the data to host memory. If - * \p dstDevice is a GPU, then the device attribute ::cudaDevAttrConcurrentManagedAccess - * must be non-zero. Additionally, \p stream must be associated with a device that has a - * non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess. - * - * The start address and end address of the memory range will be rounded down and rounded up - * respectively to be aligned to CPU page size before the prefetch operation is enqueued - * in the stream. - * - * If no physical memory has been allocated for this region, then this memory region - * will be populated and mapped on the destination device. If there's insufficient - * memory to prefetch the desired region, the Unified Memory driver may evict pages from other - * ::cudaMallocManaged allocations to host memory in order to make room. Device memory - * allocated using ::cudaMalloc or ::cudaMallocArray will not be evicted. - * - * By default, any mappings to the previous location of the migrated pages are removed and - * mappings for the new location are only setup on \p dstDevice. 
The exact behavior however - * also depends on the settings applied to this memory range via ::cudaMemAdvise as described - * below: - * - * If ::cudaMemAdviseSetReadMostly was set on any subset of this memory range, - * then that subset will create a read-only copy of the pages on \p dstDevice. - * - * If ::cudaMemAdviseSetPreferredLocation was called on any subset of this memory - * range, then the pages will be migrated to \p dstDevice even if \p dstDevice is not the - * preferred location of any pages in the memory range. - * - * If ::cudaMemAdviseSetAccessedBy was called on any subset of this memory range, - * then mappings to those pages from all the appropriate processors are updated to - * refer to the new location if establishing such a mapping is possible. Otherwise, - * those mappings are cleared. - * - * Note that this API is not required for functionality and only serves to improve performance - * by allowing the application to migrate data to a suitable location before it is accessed. - * Memory accesses to this range are always coherent and are allowed even when the data is - * actively being migrated. - * - * Note that this function is asynchronous with respect to the host and all work - * on other devices. - * - * \param devPtr - Pointer to be prefetched - * \param count - Size in bytes - * \param dstDevice - Destination device to prefetch to - * \param stream - Stream to enqueue prefetch operation - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidDevice - * \notefnerr - * \note_async - * \note_null_stream - * - * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync, - * ::cudaMemcpy3DPeerAsync, ::cudaMemAdvise, - * ::cuMemPrefetchAsync - */ -extern __host__ cudaError_t CUDARTAPI cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice, cudaStream_t stream __dv(0)); - -/** - * \brief Advise about the usage of a given memory range - * - * Advise the Unified Memory subsystem about the usage pattern for the memory range - * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory - * range will be rounded down and rounded up respectively to be aligned to CPU page size before the - * advice is applied. The memory range must refer to managed memory allocated via ::cudaMallocManaged - * or declared via __managed__ variables. - * - * The \p advice parameter can take the following values: - * - ::cudaMemAdviseSetReadMostly: This implies that the data is mostly going to be read - * from and only occasionally written to. Any read accesses from any processor to this region will create a - * read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cudaMemPrefetchAsync - * is called on this region, it will create a read-only copy of the data on the destination processor. - * If any processor writes to this region, all copies of the corresponding page will be invalidated - * except for the one where the write occurred. The \p device argument is ignored for this advice. - * Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU - * that has a non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess. - * Also, if a context is created on a device that does not have the device attribute - * ::cudaDevAttrConcurrentManagedAccess set, then read-duplication will not occur until - * all such contexts are destroyed. 
- * - ::cudaMemAdviseUnsetReadMostly: Undoes the effect of ::cudaMemAdviseSetReadMostly and also prevents the
- * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
- * copies of the data will be collapsed into a single copy. The location for the collapsed
- * copy will be the preferred location if the page has a preferred location and one of the read-duplicated
- * copies was resident at that location. Otherwise, the location chosen is arbitrary.
- * - ::cudaMemAdviseSetPreferredLocation: This advice sets the preferred location for the
- * data to be the memory belonging to \p device. Passing in ::cudaCpuDeviceId for \p device sets the
- * preferred location as host memory. If \p device is a GPU, then it must have a non-zero value for the
- * device attribute ::cudaDevAttrConcurrentManagedAccess. Setting the preferred location
- * does not cause data to migrate to that location immediately. Instead, it guides the migration policy
- * when a fault occurs on that memory region. If the data is already in its preferred location and the
- * faulting processor can establish a mapping without requiring the data to be migrated, then
- * data migration will be avoided. On the other hand, if the data is not in its preferred location
- * or if a direct mapping cannot be established, then it will be migrated to the processor accessing
- * it. It is important to note that setting the preferred location does not prevent data prefetching
- * done using ::cudaMemPrefetchAsync.
- * Having a preferred location can override the page thrash detection and resolution logic in the Unified
- * Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device
- * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
- * if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
- * If ::cudaMemAdviseSetReadMostly is also set on this memory region or any subset of it, then the
- * policies associated with that advice will override the policies of this advice.
- * - ::cudaMemAdviseUnsetPreferredLocation: Undoes the effect of ::cudaMemAdviseSetPreferredLocation
- * and changes the preferred location to none.
- * - ::cudaMemAdviseSetAccessedBy: This advice implies that the data will be accessed by \p device.
- * Passing in ::cudaCpuDeviceId for \p device will set the advice for the CPU. If \p device is a GPU, then
- * the device attribute ::cudaDevAttrConcurrentManagedAccess must be non-zero.
- * This advice does not cause data migration and has no impact on the location of the data per se. Instead,
- * it causes the data to always be mapped in the specified processor's page tables, as long as the
- * location of the data permits a mapping to be established. If the data gets migrated for any reason,
- * the mappings are updated accordingly.
- * This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
- * Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
- * data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
- * over to the other GPUs is not as important because the accesses are infrequent and the overhead of
- * migration may be too high. But preventing faults can still help improve performance, and so having
- * a mapping set up in advance is useful.
- * Note that on CPU access of this data, the data may be migrated
- * to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
- * ::cudaMemAdviseSetAccessedBy flag set for this data will now have its mapping updated to point to the
- * page in host memory.
- * If ::cudaMemAdviseSetReadMostly is also set on this memory region or any subset of it, then the
- * policies associated with that advice will override the policies of this advice. Additionally, if the
- * preferred location of this memory region or any subset of it is also \p device, then the policies
- * associated with ::cudaMemAdviseSetPreferredLocation will override the policies of this advice.
- * - ::cudaMemAdviseUnsetAccessedBy: Undoes the effect of ::cudaMemAdviseSetAccessedBy. Any mappings to
- * the data from \p device may be removed at any time, causing accesses to result in non-fatal page faults.
- *
- * \param devPtr - Pointer to memory to set the advice for
- * \param count - Size in bytes of the memory range
- * \param advice - Advice to be applied for the specified memory range
- * \param device - Device to apply the advice for
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorInvalidDevice
- * \notefnerr
- * \note_async
- * \note_null_stream
- *
- * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync,
- * ::cudaMemcpy3DPeerAsync, ::cudaMemPrefetchAsync,
- * ::cuMemAdvise
- */
-extern __host__ cudaError_t CUDARTAPI cudaMemAdvise(const void *devPtr, size_t count, enum cudaMemoryAdvise advice, int device);
-
-/**
- * \brief Query an attribute of a given memory range
- *
- * Query an attribute about the memory range starting at \p devPtr with a size of \p count bytes. The
- * memory range must refer to managed memory allocated via ::cudaMallocManaged or declared via
- * __managed__ variables.
- *
- * The \p attribute parameter can take the following values:
- * - ::cudaMemRangeAttributeReadMostly: If this attribute is specified, \p data will be interpreted
- * as a 32-bit integer, and \p dataSize must be 4. The result returned will be 1 if all pages in the given
- * memory range have read-duplication enabled, or 0 otherwise.
- * - ::cudaMemRangeAttributePreferredLocation: If this attribute is specified, \p data will be
- * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be a GPU device
- * id if all pages in the memory range have that GPU as their preferred location, or it will be cudaCpuDeviceId
- * if all pages in the memory range have the CPU as their preferred location, or it will be cudaInvalidDeviceId
- * if either all the pages don't have the same preferred location or some of the pages don't have a
- * preferred location at all. Note that the actual location of the pages in the memory range at the time of
- * the query may be different from the preferred location.
- * - ::cudaMemRangeAttributeAccessedBy: If this attribute is specified, \p data will be interpreted
- * as an array of 32-bit integers, and \p dataSize must be a non-zero multiple of 4. The result returned
- * will be a list of device ids that had ::cudaMemAdviseSetAccessedBy set for that entire memory range.
- * If any device does not have that advice set for the entire memory range, that device will not be included.
- * If \p data is larger than the number of devices that have that advice set for that memory range,
- * cudaInvalidDeviceId will be returned in all the extra space provided. For example, if \p dataSize is 12
- * (i.e. \p data has 3 elements) and only device 0 has the advice set, then the result returned will be
- * { 0, cudaInvalidDeviceId, cudaInvalidDeviceId }. If \p data is smaller than the number of devices that have
- * that advice set, then only as many devices will be returned as can fit in the array. There is no
- * guarantee on which specific devices will be returned, however.
- * - ::cudaMemRangeAttributeLastPrefetchLocation: If this attribute is specified, \p data will be
- * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be the last location
- * to which all pages in the memory range were prefetched explicitly via ::cudaMemPrefetchAsync. This will either be
- * a GPU id or cudaCpuDeviceId depending on whether the last location for prefetch was a GPU or the CPU
- * respectively. If any page in the memory range was never explicitly prefetched or if all pages were not
- * prefetched to the same location, cudaInvalidDeviceId will be returned. Note that this simply returns the
- * last location that the application requested to prefetch the memory range to. It gives no indication as to
- * whether the prefetch operation to that location has completed or even begun.
- *
- * \param data - A pointer to a memory location where the result
- * of the attribute query will be written to.
- * \param dataSize - The size of \p data
- * \param attribute - The attribute to query
- * \param devPtr - Start of the range to query
- * \param count - Size of the range to query
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue
- * \notefnerr
- * \note_async
- * \note_null_stream
- *
- * \sa ::cudaMemRangeGetAttributes, ::cudaMemPrefetchAsync,
- * ::cudaMemAdvise,
- * ::cuMemRangeGetAttribute
- */
-extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttribute(void *data, size_t dataSize, enum cudaMemRangeAttribute attribute, const void *devPtr, size_t count);
-
-/**
- * \brief Query attributes of a given memory range.
- *
- * Query attributes of the memory range starting at \p devPtr with a size of \p count bytes. The
- * memory range must refer to managed memory allocated via ::cudaMallocManaged or declared via
- * __managed__ variables. The \p attributes array will be interpreted to have \p numAttributes
- * entries. The \p dataSizes array will also be interpreted to have \p numAttributes entries.
- * The results of the query will be stored in \p data.
- *
- * The list of supported attributes is given below. Please refer to ::cudaMemRangeGetAttribute for
- * attribute descriptions and restrictions.
- *
- * - ::cudaMemRangeAttributeReadMostly
- * - ::cudaMemRangeAttributePreferredLocation
- * - ::cudaMemRangeAttributeAccessedBy
- * - ::cudaMemRangeAttributeLastPrefetchLocation
- *
- * \param data - A two-dimensional array containing pointers to memory
- * locations where the result of each attribute query will be written to.
- * \param dataSizes - Array containing the sizes of each result
- * \param attributes - An array of attributes to query
- * (numAttributes and the number of attributes in this array should match)
- * \param numAttributes - Number of attributes to query
- * \param devPtr - Start of the range to query
- * \param count - Size of the range to query
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue
- * \notefnerr
- *
- * \sa ::cudaMemRangeGetAttribute, ::cudaMemAdvise,
- * ::cudaMemPrefetchAsync,
- * ::cuMemRangeGetAttributes
- */
-extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttributes(void **data, size_t *dataSizes, enum cudaMemRangeAttribute *attributes, size_t numAttributes, const void *devPtr, size_t count);
-
-/** @} */ /* END CUDART_MEMORY */
-
-/**
- * \defgroup CUDART_UNIFIED Unified Addressing
- *
- * ___MANBRIEF___ unified addressing functions of the CUDA runtime API
- * (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the unified addressing functions of the CUDA
- * runtime application programming interface.
- *
- * @{
- *
- * \section CUDART_UNIFIED_overview Overview
- *
- * CUDA devices can share a unified address space with the host.
- * For these devices there is no distinction between a device
- * pointer and a host pointer -- the same pointer value may be
- * used to access memory from the host program and from a kernel
- * running on the device (with exceptions enumerated below).
- *
- * \section CUDART_UNIFIED_support Supported Platforms
- *
- * Whether or not a device supports unified addressing may be
- * queried by calling ::cudaGetDeviceProperties() with the device
- * property ::cudaDeviceProp::unifiedAddressing.
- *
- * Unified addressing is automatically enabled in 64-bit processes.
- *
- * Unified addressing is not yet supported on Windows Vista or
- * Windows 7 for devices that do not use the TCC driver model.
- *
- * \section CUDART_UNIFIED_lookup Looking Up Information from Pointer Values
- *
- * It is possible to look up information about the memory which backs a
- * pointer value. For instance, one may want to know if a pointer points
- * to host or device memory. As another example, in the case of device
- * memory, one may want to know on which CUDA device the memory
- * resides. These properties may be queried using the function
- * ::cudaPointerGetAttributes().
- *
- * Since pointers are unique, it is not necessary to specify information
- * about the pointers specified to ::cudaMemcpy() and other copy functions.
- * The copy direction ::cudaMemcpyDefault may be used to specify that the
- * CUDA runtime should infer the location of the pointer from its value.
- *
- * \section CUDART_UNIFIED_automaphost Automatic Mapping of Host Allocated Host Memory
- *
- * All host memory allocated through all devices using ::cudaMallocHost() and
- * ::cudaHostAlloc() is always directly accessible from all devices that
- * support unified addressing. This is the case regardless of whether or
- * not the flags ::cudaHostAllocPortable and ::cudaHostAllocMapped are
- * specified.
- *
- * The pointer value through which allocated host memory may be accessed
- * in kernels on all devices that support unified addressing is the same
- * as the pointer value through which that memory is accessed on the host.
- * It is not necessary to call ::cudaHostGetDevicePointer() to get the device
- * pointer for these allocations.
- *
- * Note that this is not the case for memory allocated using the flag
- * ::cudaHostAllocWriteCombined, as discussed below.
- *
- * \section CUDART_UNIFIED_autopeerregister Direct Access of Peer Memory
- *
- * Upon enabling direct access from a device that supports unified addressing
- * to another peer device that supports unified addressing using
- * ::cudaDeviceEnablePeerAccess() all memory allocated in the peer device using
- * ::cudaMalloc() and ::cudaMallocPitch() will immediately be accessible
- * by the current device. The device pointer value through
- * which any peer's memory may be accessed in the current device
- * is the same pointer value through which that memory may be
- * accessed from the peer device.
- *
- * \section CUDART_UNIFIED_exceptions Exceptions, Disjoint Addressing
- *
- * Not all memory may be accessed on devices through the same pointer
- * value through which it is accessed on the host. These exceptions
- * are host memory registered using ::cudaHostRegister() and host memory
- * allocated using the flag ::cudaHostAllocWriteCombined. For these
- * exceptions, there exists a distinct host and device address for the
- * memory. The device address is guaranteed to not overlap any valid host
- * pointer range and is guaranteed to have the same value across all devices
- * that support unified addressing.
- *
- * This device address may be queried using ::cudaHostGetDevicePointer()
- * when a device using unified addressing is current. Either the host
- * or the unified device pointer value may be used to refer to this memory
- * in ::cudaMemcpy() and similar functions using the ::cudaMemcpyDefault
- * memory direction.
- *
- */
-
-/**
- * \brief Returns attributes about a specified pointer
- *
- * Returns in \p *attributes the attributes of the pointer \p ptr.
- * If the pointer was not allocated in, mapped by, or registered with a context
- * supporting unified addressing, ::cudaErrorInvalidValue is returned.
- *
- * The ::cudaPointerAttributes structure is defined as:
- * \code
- struct cudaPointerAttributes {
-     enum cudaMemoryType memoryType;
-     int device;
-     void *devicePointer;
-     void *hostPointer;
-     int isManaged;
- }
- \endcode
- * In this structure, the individual fields mean
- *
- * - \ref ::cudaPointerAttributes::memoryType "memoryType" identifies the physical
- * location of the memory associated with pointer \p ptr. It can be
- * ::cudaMemoryTypeHost for host memory or ::cudaMemoryTypeDevice for device
- * memory.
- *
- * - \ref ::cudaPointerAttributes::device "device" is the device against which
- * \p ptr was allocated. If \p ptr has memory type ::cudaMemoryTypeDevice
- * then this identifies the device on which the memory referred to by \p ptr
- * physically resides. If \p ptr has memory type ::cudaMemoryTypeHost then this
- * identifies the device which was current when the allocation was made
- * (and if that device is deinitialized then this allocation will vanish
- * with that device's state).
- *
- * - \ref ::cudaPointerAttributes::devicePointer "devicePointer" is
- * the device pointer alias through which the memory referred to by \p ptr
- * may be accessed on the current device.
- * If the memory referred to by \p ptr cannot be accessed directly by the
- * current device then this is NULL.
- *
- * - \ref ::cudaPointerAttributes::hostPointer "hostPointer" is
- * the host pointer alias through which the memory referred to by \p ptr
- * may be accessed on the host.
- * If the memory referred to by \p ptr cannot be accessed directly by the - * host then this is NULL. - * - * - \ref ::cudaPointerAttributes::isManaged "isManaged" indicates if - * the pointer \p ptr points to managed memory or not. - * - * \param attributes - Attributes for the specified pointer - * \param ptr - Pointer to get attributes for - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidDevice, - * ::cudaErrorInvalidValue - * - * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice, - * ::cudaChooseDevice, - * ::cuPointerGetAttributes - */ -extern __host__ cudaError_t CUDARTAPI cudaPointerGetAttributes(struct cudaPointerAttributes *attributes, const void *ptr); - -/** @} */ /* END CUDART_UNIFIED */ - -/** - * \defgroup CUDART_PEER Peer Device Memory Access - * - * ___MANBRIEF___ peer device memory access functions of the CUDA runtime API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the peer device memory access functions of the CUDA runtime - * application programming interface. - * - * @{ - */ - -/** - * \brief Queries if a device may directly access a peer device's memory. - * - * Returns in \p *canAccessPeer a value of 1 if device \p device is capable of - * directly accessing memory from \p peerDevice and 0 otherwise. If direct - * access of \p peerDevice from \p device is possible, then access may be - * enabled by calling ::cudaDeviceEnablePeerAccess(). - * - * \param canAccessPeer - Returned access capability - * \param device - Device from which allocations on \p peerDevice are to - * be directly accessed. - * \param peerDevice - Device on which the allocations to be directly accessed - * by \p device reside. - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidDevice - * \notefnerr - * - * \sa ::cudaDeviceEnablePeerAccess, - * ::cudaDeviceDisablePeerAccess, - * ::cuDeviceCanAccessPeer - */ -extern __host__ cudaError_t CUDARTAPI cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice); - -/** - * \brief Enables direct access to memory allocations on a peer device. - * - * On success, all allocations from \p peerDevice will immediately be accessible by - * the current device. They will remain accessible until access is explicitly - * disabled using ::cudaDeviceDisablePeerAccess() or either device is reset using - * ::cudaDeviceReset(). - * - * Note that access granted by this call is unidirectional and that in order to access - * memory on the current device from \p peerDevice, a separate symmetric call - * to ::cudaDeviceEnablePeerAccess() is required. - * - * Each device can support a system-wide maximum of eight peer connections. - * - * Peer access is not supported in 32 bit applications. - * - * Returns ::cudaErrorInvalidDevice if ::cudaDeviceCanAccessPeer() indicates - * that the current device cannot directly access memory from \p peerDevice. - * - * Returns ::cudaErrorPeerAccessAlreadyEnabled if direct access of - * \p peerDevice from the current device has already been enabled. - * - * Returns ::cudaErrorInvalidValue if \p flags is not 0. 
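- *
- * For illustration, a minimal sketch that enables access from device 0 to
- * memory allocated on device 1 (hypothetical device numbering; error
- * checking is omitted):
- * \code
- void enablePeer01(void)
- {
-     int canAccessPeer = 0;
-     cudaDeviceCanAccessPeer(&canAccessPeer, 0, 1);
-     if (canAccessPeer) {
-         cudaSetDevice(0);
-         cudaDeviceEnablePeerAccess(1, 0);   /* flags must be 0 */
-     }
- }
- * \endcode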
- * - * \param peerDevice - Peer device to enable direct access to from the current device - * \param flags - Reserved for future use and must be set to 0 - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidDevice, - * ::cudaErrorPeerAccessAlreadyEnabled, - * ::cudaErrorInvalidValue - * \notefnerr - * - * \sa ::cudaDeviceCanAccessPeer, - * ::cudaDeviceDisablePeerAccess, - * ::cuCtxEnablePeerAccess - */ -extern __host__ cudaError_t CUDARTAPI cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags); - -/** - * \brief Disables direct access to memory allocations on a peer device. - * - * Returns ::cudaErrorPeerAccessNotEnabled if direct access to memory on - * \p peerDevice has not yet been enabled from the current device. - * - * \param peerDevice - Peer device to disable direct access to - * - * \return - * ::cudaSuccess, - * ::cudaErrorPeerAccessNotEnabled, - * ::cudaErrorInvalidDevice - * \notefnerr - * - * \sa ::cudaDeviceCanAccessPeer, - * ::cudaDeviceEnablePeerAccess, - * ::cuCtxDisablePeerAccess - */ -extern __host__ cudaError_t CUDARTAPI cudaDeviceDisablePeerAccess(int peerDevice); - -/** @} */ /* END CUDART_PEER */ - -/** \defgroup CUDART_OPENGL OpenGL Interoperability */ - -/** \defgroup CUDART_OPENGL_DEPRECATED OpenGL Interoperability [DEPRECATED] */ - -/** \defgroup CUDART_D3D9 Direct3D 9 Interoperability */ - -/** \defgroup CUDART_D3D9_DEPRECATED Direct3D 9 Interoperability [DEPRECATED] */ - -/** \defgroup CUDART_D3D10 Direct3D 10 Interoperability */ - -/** \defgroup CUDART_D3D10_DEPRECATED Direct3D 10 Interoperability [DEPRECATED] */ - -/** \defgroup CUDART_D3D11 Direct3D 11 Interoperability */ - -/** \defgroup CUDART_D3D11_DEPRECATED Direct3D 11 Interoperability [DEPRECATED] */ - -/** \defgroup CUDART_VDPAU VDPAU Interoperability */ - -/** \defgroup CUDART_EGL EGL Interoperability */ - -/** - * \defgroup CUDART_INTEROP Graphics Interoperability - * - * ___MANBRIEF___ graphics interoperability functions of the CUDA runtime API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the graphics interoperability functions of the CUDA - * runtime application programming interface. - * - * @{ - */ - -/** - * \brief Unregisters a graphics resource for access by CUDA - * - * Unregisters the graphics resource \p resource so it is not accessible by - * CUDA unless registered again. - * - * If \p resource is invalid then ::cudaErrorInvalidResourceHandle is - * returned. - * - * \param resource - Resource to unregister - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidResourceHandle, - * ::cudaErrorUnknown - * \notefnerr - * - * \sa - * ::cudaGraphicsD3D9RegisterResource, - * ::cudaGraphicsD3D10RegisterResource, - * ::cudaGraphicsD3D11RegisterResource, - * ::cudaGraphicsGLRegisterBuffer, - * ::cudaGraphicsGLRegisterImage, - * ::cuGraphicsUnregisterResource - */ -extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnregisterResource(cudaGraphicsResource_t resource); - -/** - * \brief Set usage flags for mapping a graphics resource - * - * Set \p flags for mapping the graphics resource \p resource. - * - * Changes to \p flags will take effect the next time \p resource is mapped. - * The \p flags argument may be any of the following: - * - ::cudaGraphicsMapFlagsNone: Specifies no hints about how \p resource will - * be used. It is therefore assumed that CUDA may read from or write to \p resource. - * - ::cudaGraphicsMapFlagsReadOnly: Specifies that CUDA will not write to \p resource. 
- * - ::cudaGraphicsMapFlagsWriteDiscard: Specifies that CUDA will not read from \p resource and will
- * write over the entire contents of \p resource, so none of the data
- * previously stored in \p resource will be preserved.
- *
- * If \p resource is presently mapped for access by CUDA then ::cudaErrorUnknown is returned.
- * If \p flags is not one of the above values then ::cudaErrorInvalidValue is returned.
- *
- * \param resource - Registered resource to set flags for
- * \param flags - Parameters for resource mapping
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorInvalidResourceHandle,
- * ::cudaErrorUnknown,
- * \notefnerr
- *
- * \sa
- * ::cudaGraphicsMapResources,
- * ::cuGraphicsResourceSetMapFlags
- */
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceSetMapFlags(cudaGraphicsResource_t resource, unsigned int flags);
-
-/**
- * \brief Map graphics resources for access by CUDA
- *
- * Maps the \p count graphics resources in \p resources for access by CUDA.
- *
- * The resources in \p resources may be accessed by CUDA until they
- * are unmapped. The graphics API from which \p resources were registered
- * should not access any resources while they are mapped by CUDA. If an
- * application does so, the results are undefined.
- *
- * This function provides the synchronization guarantee that any graphics calls
- * issued before ::cudaGraphicsMapResources() will complete before any subsequent CUDA
- * work issued in \p stream begins.
- *
- * If \p resources contains any duplicate entries then ::cudaErrorInvalidResourceHandle
- * is returned. If any of \p resources are presently mapped for access by
- * CUDA then ::cudaErrorUnknown is returned.
- *
- * \param count - Number of resources to map
- * \param resources - Resources to map for CUDA
- * \param stream - Stream for synchronization
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidResourceHandle,
- * ::cudaErrorUnknown
- * \note_null_stream
- * \notefnerr
- *
- * \sa
- * ::cudaGraphicsResourceGetMappedPointer,
- * ::cudaGraphicsSubResourceGetMappedArray,
- * ::cudaGraphicsUnmapResources,
- * ::cuGraphicsMapResources
- */
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsMapResources(int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0));
-
-/**
- * \brief Unmap graphics resources.
- *
- * Unmaps the \p count graphics resources in \p resources.
- *
- * Once unmapped, the resources in \p resources may not be accessed by CUDA
- * until they are mapped again.
- *
- * This function provides the synchronization guarantee that any CUDA work issued
- * in \p stream before ::cudaGraphicsUnmapResources() will complete before any
- * subsequently issued graphics work begins.
- *
- * If \p resources contains any duplicate entries then ::cudaErrorInvalidResourceHandle
- * is returned. If any of \p resources are not presently mapped for access by
- * CUDA then ::cudaErrorUnknown is returned.
- *
- * \param count - Number of resources to unmap
- * \param resources - Resources to unmap
- * \param stream - Stream for synchronization
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidResourceHandle,
- * ::cudaErrorUnknown
- * \note_null_stream
- * \notefnerr
- *
- * \sa
- * ::cudaGraphicsMapResources,
- * ::cuGraphicsUnmapResources
- */
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnmapResources(int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0));
-
-/**
- * \brief Get a device pointer through which to access a mapped graphics resource.
- *
- * Returns in \p *devPtr a pointer through which the mapped graphics resource
- * \p resource may be accessed.
- * Returns in \p *size the size of the memory in bytes which may be accessed from that pointer.
- * The value set in \p devPtr may change every time that \p resource is mapped.
- *
- * If \p resource is not a buffer then it cannot be accessed via a pointer and
- * ::cudaErrorUnknown is returned.
- * If \p resource is not mapped then ::cudaErrorUnknown is returned.
- *
- * \param devPtr - Returned pointer through which \p resource may be accessed
- * \param size - Returned size of the buffer accessible starting at \p *devPtr
- * \param resource - Mapped resource to access
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorInvalidResourceHandle,
- * ::cudaErrorUnknown
- * \notefnerr
- *
- * \sa
- * ::cudaGraphicsMapResources,
- * ::cudaGraphicsSubResourceGetMappedArray,
- * ::cuGraphicsResourceGetMappedPointer
- */
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedPointer(void **devPtr, size_t *size, cudaGraphicsResource_t resource);
-
-/**
- * \brief Get an array through which to access a subresource of a mapped graphics resource.
- *
- * Returns in \p *array an array through which the subresource of the mapped
- * graphics resource \p resource which corresponds to array index \p arrayIndex
- * and mipmap level \p mipLevel may be accessed. The value set in \p array may
- * change every time that \p resource is mapped.
- *
- * If \p resource is not a texture then it cannot be accessed via an array and
- * ::cudaErrorUnknown is returned.
- * If \p arrayIndex is not a valid array index for \p resource then
- * ::cudaErrorInvalidValue is returned.
- * If \p mipLevel is not a valid mipmap level for \p resource then
- * ::cudaErrorInvalidValue is returned.
- * If \p resource is not mapped then ::cudaErrorUnknown is returned.
- *
- * \param array - Returned array through which a subresource of \p resource may be accessed
- * \param resource - Mapped resource to access
- * \param arrayIndex - Array index for array textures or cubemap face
- * index as defined by ::cudaGraphicsCubeFace for
- * cubemap textures for the subresource to access
- * \param mipLevel - Mipmap level for the subresource to access
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorInvalidResourceHandle,
- * ::cudaErrorUnknown
- * \notefnerr
- *
- * \sa
- * ::cudaGraphicsResourceGetMappedPointer,
- * ::cuGraphicsSubResourceGetMappedArray
- */
-extern __host__ cudaError_t CUDARTAPI cudaGraphicsSubResourceGetMappedArray(cudaArray_t *array, cudaGraphicsResource_t resource, unsigned int arrayIndex, unsigned int mipLevel);
-
-/**
- * \brief Get a mipmapped array through which to access a mapped graphics resource.
- *
- * Returns in \p *mipmappedArray a mipmapped array through which the mapped
- * graphics resource \p resource may be accessed. The value set in \p mipmappedArray may
- * change every time that \p resource is mapped.
- *
- * If \p resource is not a texture then it cannot be accessed via an array and
- * ::cudaErrorUnknown is returned.
- * If \p resource is not mapped then ::cudaErrorUnknown is returned.
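- *
- * For illustration, a minimal sketch (\p resource is a hypothetical handle
- * that was previously registered with one of the graphics-API registration
- * functions; error checking is omitted):
- * \code
- cudaMipmappedArray_t mipmappedArray = NULL;
- cudaGraphicsMapResources(1, &resource, 0);   /* resource: hypothetical, already registered */
- cudaGraphicsResourceGetMappedMipmappedArray(&mipmappedArray, resource);
- /* ... access the mipmapped array from CUDA ... */
- cudaGraphicsUnmapResources(1, &resource, 0);
- * \endcode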
- * - * \param mipmappedArray - Returned mipmapped array through which \p resource may be accessed - * \param resource - Mapped resource to access - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidResourceHandle, - * ::cudaErrorUnknown - * \notefnerr - * - * \sa - * ::cudaGraphicsResourceGetMappedPointer, - * ::cuGraphicsResourceGetMappedMipmappedArray - */ -extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedMipmappedArray(cudaMipmappedArray_t *mipmappedArray, cudaGraphicsResource_t resource); - -/** @} */ /* END CUDART_INTEROP */ - -/** - * \defgroup CUDART_TEXTURE Texture Reference Management - * - * ___MANBRIEF___ texture reference management functions of the CUDA runtime - * API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the low level texture reference management functions - * of the CUDA runtime application programming interface. - * - * Some functions have overloaded C++ API template versions documented separately in the - * \ref CUDART_HIGHLEVEL "C++ API Routines" module. - * - * @{ - */ - -/** - * \brief Get the channel descriptor of an array - * - * Returns in \p *desc the channel descriptor of the CUDA array \p array. - * - * \param desc - Channel format - * \param array - Memory array on device - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue - * \notefnerr - * - * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)", - * ::cudaGetTextureReference, - * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", - * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", - * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", - * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)", - * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)" - */ -extern __host__ cudaError_t CUDARTAPI cudaGetChannelDesc(struct cudaChannelFormatDesc *desc, cudaArray_const_t array); - -/** - * \brief Returns a channel descriptor using the specified format - * - * Returns a channel descriptor with format \p f and number of bits of each - * component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is - * defined as: - * \code - struct cudaChannelFormatDesc { - int x, y, z, w; - enum cudaChannelFormatKind f; - }; - * \endcode - * - * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned, - * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat. 
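- *
- * For example, a descriptor for a four-component 32-bit floating-point
- * format (the texel layout of a \p float4) may be created as follows:
- * \code
- struct cudaChannelFormatDesc desc =
-     cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat);
- * \endcode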
- *
- * \param x - X component
- * \param y - Y component
- * \param z - Z component
- * \param w - W component
- * \param f - Channel format
- *
- * \return
- * Channel descriptor with format \p f
- *
- * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
- * ::cudaGetChannelDesc, ::cudaGetTextureReference,
- * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
- * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
- * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
- * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
- * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)",
- * ::cuTexRefSetFormat
- */
-extern __host__ struct cudaChannelFormatDesc CUDARTAPI cudaCreateChannelDesc(int x, int y, int z, int w, enum cudaChannelFormatKind f);
-
-
-/**
- * \brief Binds a memory area to a texture
- *
- * Binds \p size bytes of the memory area pointed to by \p devPtr to the
- * texture reference \p texref. \p desc describes how the memory is interpreted
- * when fetching values from the texture. Any memory previously bound to
- * \p texref is unbound.
- *
- * Since the hardware enforces an alignment requirement on texture base
- * addresses,
- * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture()"
- * returns in \p *offset a byte offset that
- * must be applied to texture fetches in order to read from the desired memory.
- * This offset must be divided by the texel size and passed to kernels that
- * read from the texture so it can be applied to the ::tex1Dfetch() function.
- * If the device memory pointer was returned from ::cudaMalloc(), the offset is
- * guaranteed to be 0 and NULL may be passed as the \p offset parameter.
- *
- * The total number of elements (or texels) in the linear address range
- * cannot exceed ::cudaDeviceProp::maxTexture1DLinear[0].
- * The number of elements is computed as (\p size / elementSize),
- * where elementSize is determined from \p desc.
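- *
- * For illustration, a minimal sketch of binding a linear device allocation
- * (the texture reference \p texRef, the device buffer \p d_buf, and its size
- * \p bytes are hypothetical names; error checking is omitted):
- * \code
- texture<float, 1, cudaReadModeElementType> texRef;   /* hypothetical file-scope reference */
-
- void bindLinearBuffer(const float *d_buf, size_t bytes)   /* d_buf: device pointer */
- {
-     size_t offset = 0;
-     struct cudaChannelFormatDesc desc =
-         cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
-     cudaBindTexture(&offset, texRef, d_buf, desc, bytes);   /* C++ API overload */
- }
- * \endcode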
- *
- * \param offset - Offset in bytes
- * \param texref - Texture to bind
- * \param devPtr - Memory area on device
- * \param desc - Channel format
- * \param size - Size of the memory area pointed to by devPtr
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorInvalidTexture
- * \notefnerr
- *
- * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
- * ::cudaGetChannelDesc, ::cudaGetTextureReference,
- * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
- * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
- * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
- * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
- * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)",
- * ::cuTexRefSetAddress,
- * ::cuTexRefSetAddressMode,
- * ::cuTexRefSetFormat,
- * ::cuTexRefSetFlags,
- * ::cuTexRefSetBorderColor
- */
-extern __host__ cudaError_t CUDARTAPI cudaBindTexture(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t size __dv(UINT_MAX));
-
-/**
- * \brief Binds a 2D memory area to a texture
- *
- * Binds the 2D memory area pointed to by \p devPtr to the
- * texture reference \p texref. The size of the area is constrained by
- * \p width in texel units, \p height in texel units, and \p pitch in byte
- * units. \p desc describes how the memory is interpreted when fetching values
- * from the texture. Any memory previously bound to \p texref is unbound.
- *
- * Since the hardware enforces an alignment requirement on texture base
- * addresses, ::cudaBindTexture2D() returns in \p *offset a byte offset that
- * must be applied to texture fetches in order to read from the desired memory.
- * This offset must be divided by the texel size and passed to kernels that
- * read from the texture so it can be applied to the ::tex2D() function.
- * If the device memory pointer was returned from ::cudaMalloc(), the offset is
- * guaranteed to be 0 and NULL may be passed as the \p offset parameter.
- *
- * \p width and \p height, which are specified in elements (or texels), cannot
- * exceed ::cudaDeviceProp::maxTexture2DLinear[0] and ::cudaDeviceProp::maxTexture2DLinear[1]
- * respectively. \p pitch, which is specified in bytes, cannot exceed
- * ::cudaDeviceProp::maxTexture2DLinear[2].
- *
- * The driver returns ::cudaErrorInvalidValue if \p pitch is not a multiple of
- * ::cudaDeviceProp::texturePitchAlignment.
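- *
- * For illustration, a minimal sketch of binding a pitched 2D allocation
- * (the reference \p texRef2D is a hypothetical name; error checking is
- * omitted). ::cudaMallocPitch() returns a pitch that already satisfies the
- * alignment requirement above:
- * \code
- texture<float, 2, cudaReadModeElementType> texRef2D;   /* hypothetical file-scope reference */
-
- void bindPitched(size_t width, size_t height)
- {
-     float *d_img = NULL;
-     size_t pitch = 0;
-     cudaMallocPitch((void**)&d_img, &pitch, width * sizeof(float), height);
-     size_t offset = 0;
-     struct cudaChannelFormatDesc desc =
-         cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
-     cudaBindTexture2D(&offset, texRef2D, d_img, desc, width, height, pitch);
- }
- * \endcode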
- *
- * \param offset - Offset in bytes
- * \param texref - Texture reference to bind
- * \param devPtr - 2D memory area on device
- * \param desc - Channel format
- * \param width - Width in texel units
- * \param height - Height in texel units
- * \param pitch - Pitch in bytes
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue,
- * ::cudaErrorInvalidTexture
- * \notefnerr
- *
- * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
- * ::cudaGetChannelDesc, ::cudaGetTextureReference,
- * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
- * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
- * \ref ::cudaBindTexture2D(size_t*, const struct texture&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
- * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
- * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
- * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)",
- * ::cuTexRefSetAddress2D,
- * ::cuTexRefSetFormat,
- * ::cuTexRefSetFlags,
- * ::cuTexRefSetAddressMode,
- * ::cuTexRefSetBorderColor
- */
-extern __host__ cudaError_t CUDARTAPI cudaBindTexture2D(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t width, size_t height, size_t pitch);
-
-/**
- * \brief Binds an array to a texture
- *
- * Binds the CUDA array \p array to the texture reference \p texref.
- * \p desc describes how the memory is interpreted when fetching values from
- * the texture. Any CUDA array previously bound to \p texref is unbound.
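- *
- * For illustration, a minimal sketch that allocates a CUDA array, fills it
- * from a host buffer, and binds it (\p texRef2D, \p h_src, \p width, and
- * \p height are hypothetical names; error checking is omitted):
- * \code
- struct cudaChannelFormatDesc desc =
-     cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
- cudaArray_t array = NULL;
- cudaMallocArray(&array, &desc, width, height);   /* width, height: hypothetical extents */
- cudaMemcpyToArray(array, 0, 0, h_src, width * height * sizeof(float), cudaMemcpyHostToDevice);
- cudaBindTextureToArray(texRef2D, array, desc);   /* texRef2D: a 2D texture reference */
- * \endcode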
- * - * \param texref - Texture to bind - * \param array - Memory array on device - * \param desc - Channel format - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidTexture - * \notefnerr - * - * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)", - * ::cudaGetChannelDesc, ::cudaGetTextureReference, - * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", - * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", - * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", - * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)", - * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)", - * ::cuTexRefSetArray, - * ::cuTexRefSetFormat, - * ::cuTexRefSetFlags, - * ::cuTexRefSetAddressMode, - * ::cuTexRefSetFilterMode, - * ::cuTexRefSetBorderColor, - * ::cuTexRefSetMaxAnisotropy - */ -extern __host__ cudaError_t CUDARTAPI cudaBindTextureToArray(const struct textureReference *texref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc); - -/** - * \brief Binds a mipmapped array to a texture - * - * Binds the CUDA mipmapped array \p mipmappedArray to the texture reference \p texref. - * \p desc describes how the memory is interpreted when fetching values from - * the texture. Any CUDA mipmapped array previously bound to \p texref is unbound. - * - * \param texref - Texture to bind - * \param mipmappedArray - Memory mipmapped array on device - * \param desc - Channel format - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidTexture - * \notefnerr - * - * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)", - * ::cudaGetChannelDesc, ::cudaGetTextureReference, - * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", - * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", - * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", - * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)", - * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)", - * ::cuTexRefSetMipmappedArray, - * ::cuTexRefSetMipmapFilterMode - * ::cuTexRefSetMipmapLevelClamp, - * ::cuTexRefSetMipmapLevelBias, - * ::cuTexRefSetFormat, - * ::cuTexRefSetFlags, - * ::cuTexRefSetAddressMode, - * ::cuTexRefSetBorderColor, - * ::cuTexRefSetMaxAnisotropy - */ -extern __host__ cudaError_t CUDARTAPI cudaBindTextureToMipmappedArray(const struct textureReference *texref, cudaMipmappedArray_const_t mipmappedArray, const struct cudaChannelFormatDesc *desc); - -/** - * \brief Unbinds a texture - * - * Unbinds the texture bound to \p texref. 
- * - * \param texref - Texture to unbind - * - * \return - * ::cudaSuccess - * \notefnerr - * - * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)", - * ::cudaGetChannelDesc, ::cudaGetTextureReference, - * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", - * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", - * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", - * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (C++ API)", - * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)" - */ -extern __host__ cudaError_t CUDARTAPI cudaUnbindTexture(const struct textureReference *texref); - -/** - * \brief Get the alignment offset of a texture - * - * Returns in \p *offset the offset that was returned when texture reference - * \p texref was bound. - * - * \param offset - Offset of texture reference in bytes - * \param texref - Texture to get offset of - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidTexture, - * ::cudaErrorInvalidTextureBinding - * \notefnerr - * - * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)", - * ::cudaGetChannelDesc, ::cudaGetTextureReference, - * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", - * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", - * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", - * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)", - * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)" - */ -extern __host__ cudaError_t CUDARTAPI cudaGetTextureAlignmentOffset(size_t *offset, const struct textureReference *texref); - -/** - * \brief Get the texture reference associated with a symbol - * - * Returns in \p *texref the structure associated to the texture reference - * defined by symbol \p symbol. 
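- *
- * For illustration, a minimal sketch (the file-scope texture reference
- * \p texRef is a hypothetical name; per \note_string_api_deprecation_50,
- * \p symbol must be the address of the texture variable rather than a string):
- * \code
- texture<float, 2, cudaReadModeElementType> texRef;   /* hypothetical file-scope reference */
-
- const struct textureReference *refPtr = NULL;
- cudaGetTextureReference(&refPtr, &texRef);   /* pass the address, not a string */
- * \endcode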
- * - * \param texref - Texture reference associated with symbol - * \param symbol - Texture to get reference for - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidTexture - * \notefnerr - * \note_string_api_deprecation_50 - * - * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)", - * ::cudaGetChannelDesc, - * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)", - * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", - * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", - * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", - * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)", - * ::cuModuleGetTexRef - */ -extern __host__ cudaError_t CUDARTAPI cudaGetTextureReference(const struct textureReference **texref, const void *symbol); - -/** @} */ /* END CUDART_TEXTURE */ - -/** - * \defgroup CUDART_SURFACE Surface Reference Management - * - * ___MANBRIEF___ surface reference management functions of the CUDA runtime - * API (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the low level surface reference management functions - * of the CUDA runtime application programming interface. - * - * Some functions have overloaded C++ API template versions documented separately in the - * \ref CUDART_HIGHLEVEL "C++ API Routines" module. - * - * @{ - */ - -/** - * \brief Binds an array to a surface - * - * Binds the CUDA array \p array to the surface reference \p surfref. - * \p desc describes how the memory is interpreted when fetching values from - * the surface. Any CUDA array previously bound to \p surfref is unbound. - * - * \param surfref - Surface to bind - * \param array - Memory array on device - * \param desc - Channel format - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue, - * ::cudaErrorInvalidSurface - * \notefnerr - * - * \sa \ref ::cudaBindSurfaceToArray(const struct surface< T, dim>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindSurfaceToArray (C++ API)", - * \ref ::cudaBindSurfaceToArray(const struct surface< T, dim>&, cudaArray_const_t) "cudaBindSurfaceToArray (C++ API, inherited channel descriptor)", - * ::cudaGetSurfaceReference, - * ::cuSurfRefSetArray - */ -extern __host__ cudaError_t CUDARTAPI cudaBindSurfaceToArray(const struct surfaceReference *surfref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc); - -/** - * \brief Get the surface reference associated with a symbol - * - * Returns in \p *surfref the structure associated to the surface reference - * defined by symbol \p symbol. 
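- *
- * For illustration, a minimal sketch (the file-scope surface reference
- * \p surfRef is a hypothetical name):
- * \code
- surface<void, 2> surfRef;   /* hypothetical file-scope surface reference */
-
- const struct surfaceReference *refPtr = NULL;
- cudaGetSurfaceReference(&refPtr, &surfRef);
- * \endcode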
- * - * \param surfref - Surface reference associated with symbol - * \param symbol - Surface to get reference for - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidSurface - * \notefnerr - * \note_string_api_deprecation_50 - * - * \sa - * \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)", - * ::cuModuleGetSurfRef - */ -extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceReference(const struct surfaceReference **surfref, const void *symbol); - -/** @} */ /* END CUDART_SURFACE */ - -/** - * \defgroup CUDART_TEXTURE_OBJECT Texture Object Management - * - * ___MANBRIEF___ texture object management functions of the CUDA runtime API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the low level texture object management functions - * of the CUDA runtime application programming interface. The texture - * object API is only supported on devices of compute capability 3.0 or higher. - * - * @{ - */ - -/** - * \brief Creates a texture object - * - * Creates a texture object and returns it in \p pTexObject. \p pResDesc describes - * the data to texture from. \p pTexDesc describes how the data should be sampled. - * \p pResViewDesc is an optional argument that specifies an alternate format for - * the data described by \p pResDesc, and also describes the subresource region - * to restrict access to when texturing. \p pResViewDesc can only be specified if - * the type of resource is a CUDA array or a CUDA mipmapped array. - * - * Texture objects are only supported on devices of compute capability 3.0 or higher. - * Additionally, a texture object is an opaque value, and, as such, should only be - * accessed through CUDA API calls. - * - * The ::cudaResourceDesc structure is defined as: - * \code - struct cudaResourceDesc { - enum cudaResourceType resType; - - union { - struct { - cudaArray_t array; - } array; - struct { - cudaMipmappedArray_t mipmap; - } mipmap; - struct { - void *devPtr; - struct cudaChannelFormatDesc desc; - size_t sizeInBytes; - } linear; - struct { - void *devPtr; - struct cudaChannelFormatDesc desc; - size_t width; - size_t height; - size_t pitchInBytes; - } pitch2D; - } res; - }; - * \endcode - * where: - * - ::cudaResourceDesc::resType specifies the type of resource to texture from. - * CUresourceType is defined as: - * \code - enum cudaResourceType { - cudaResourceTypeArray = 0x00, - cudaResourceTypeMipmappedArray = 0x01, - cudaResourceTypeLinear = 0x02, - cudaResourceTypePitch2D = 0x03 - }; - * \endcode - * - * \par - * If ::cudaResourceDesc::resType is set to ::cudaResourceTypeArray, ::cudaResourceDesc::res::array::array - * must be set to a valid CUDA array handle. - * - * \par - * If ::cudaResourceDesc::resType is set to ::cudaResourceTypeMipmappedArray, ::cudaResourceDesc::res::mipmap::mipmap - * must be set to a valid CUDA mipmapped array handle and ::cudaTextureDesc::normalizedCoords must be set to true. - * - * \par - * If ::cudaResourceDesc::resType is set to ::cudaResourceTypeLinear, ::cudaResourceDesc::res::linear::devPtr - * must be set to a valid device pointer, that is aligned to ::cudaDeviceProp::textureAlignment. - * ::cudaResourceDesc::res::linear::desc describes the format and the number of components per array element. ::cudaResourceDesc::res::linear::sizeInBytes - * specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed - * ::cudaDeviceProp::maxTexture1DLinear. 
The number of elements is computed as (sizeInBytes / sizeof(desc)).
- *
- * \par
- * If ::cudaResourceDesc::resType is set to ::cudaResourceTypePitch2D, ::cudaResourceDesc::res::pitch2D::devPtr
- * must be set to a valid device pointer, that is aligned to ::cudaDeviceProp::textureAlignment.
- * ::cudaResourceDesc::res::pitch2D::desc describes the format and the number of components per array element. ::cudaResourceDesc::res::pitch2D::width
- * and ::cudaResourceDesc::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed
- * ::cudaDeviceProp::maxTexture2DLinear[0] and ::cudaDeviceProp::maxTexture2DLinear[1] respectively.
- * ::cudaResourceDesc::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to
- * ::cudaDeviceProp::texturePitchAlignment. Pitch cannot exceed ::cudaDeviceProp::maxTexture2DLinear[2].
- *
- *
- * The ::cudaTextureDesc struct is defined as
- * \code
- struct cudaTextureDesc {
-     enum cudaTextureAddressMode addressMode[3];
-     enum cudaTextureFilterMode filterMode;
-     enum cudaTextureReadMode readMode;
-     int sRGB;
-     float borderColor[4];
-     int normalizedCoords;
-     unsigned int maxAnisotropy;
-     enum cudaTextureFilterMode mipmapFilterMode;
-     float mipmapLevelBias;
-     float minMipmapLevelClamp;
-     float maxMipmapLevelClamp;
- };
- * \endcode
- * where
- * - ::cudaTextureDesc::addressMode specifies the addressing mode for each dimension of the texture data. ::cudaTextureAddressMode is defined as:
- * \code
- enum cudaTextureAddressMode {
-     cudaAddressModeWrap = 0,
-     cudaAddressModeClamp = 1,
-     cudaAddressModeMirror = 2,
-     cudaAddressModeBorder = 3
- };
- * \endcode
- * This is ignored if ::cudaResourceDesc::resType is ::cudaResourceTypeLinear. Also, if ::cudaTextureDesc::normalizedCoords
- * is set to zero, ::cudaAddressModeWrap and ::cudaAddressModeMirror won't be supported and will be switched to ::cudaAddressModeClamp.
- *
- * - ::cudaTextureDesc::filterMode specifies the filtering mode to be used when fetching from the texture. ::cudaTextureFilterMode is defined as:
- * \code
- enum cudaTextureFilterMode {
-     cudaFilterModePoint = 0,
-     cudaFilterModeLinear = 1
- };
- * \endcode
- * This is ignored if ::cudaResourceDesc::resType is ::cudaResourceTypeLinear.
- *
- * - ::cudaTextureDesc::readMode specifies whether integer data should be converted to floating point or not. ::cudaTextureReadMode is defined as:
- * \code
- enum cudaTextureReadMode {
-     cudaReadModeElementType = 0,
-     cudaReadModeNormalizedFloat = 1
- };
- * \endcode
- * Note that this applies only to 8-bit and 16-bit integer formats. 32-bit integer formats are not promoted,
- * regardless of whether or not ::cudaReadModeNormalizedFloat is specified.
- *
- * - ::cudaTextureDesc::sRGB specifies whether sRGB to linear conversion should be performed during texture fetch.
- *
- * - ::cudaTextureDesc::borderColor specifies the float values of color, where:
- * ::cudaTextureDesc::borderColor[0] contains value of 'R',
- * ::cudaTextureDesc::borderColor[1] contains value of 'G',
- * ::cudaTextureDesc::borderColor[2] contains value of 'B',
- * ::cudaTextureDesc::borderColor[3] contains value of 'A'
- * Note that applications using integer border color values will need to reinterpret these values as float.
- * The values are set only when the addressing mode specified by ::cudaTextureDesc::addressMode is cudaAddressModeBorder.
- *
- * - ::cudaTextureDesc::normalizedCoords specifies whether the texture coordinates will be normalized or not.
- *
- * - ::cudaTextureDesc::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be
- * clamped to the range [1,16].
- *
- * - ::cudaTextureDesc::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels.
- *
- * - ::cudaTextureDesc::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level.
- *
- * - ::cudaTextureDesc::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to.
- *
- * - ::cudaTextureDesc::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to.
- *
- *
- * The ::cudaResourceViewDesc struct is defined as
- * \code
- struct cudaResourceViewDesc {
-     enum cudaResourceViewFormat format;
-     size_t width;
-     size_t height;
-     size_t depth;
-     unsigned int firstMipmapLevel;
-     unsigned int lastMipmapLevel;
-     unsigned int firstLayer;
-     unsigned int lastLayer;
- };
- * \endcode
- * where:
- * - ::cudaResourceViewDesc::format specifies how the data contained in the CUDA array or CUDA mipmapped array should
- * be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block
- * compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a 32-bit unsigned integer format
- * with 2 or 4 channels, depending on the block compressed format. For example, BC1 and BC4 require the underlying CUDA array to have
- * a 32-bit unsigned int with 2 channels. The other BC formats require the underlying resource to have the same 32-bit unsigned int
- * format but with 4 channels.
- *
- * - ::cudaResourceViewDesc::width specifies the new width of the texture data. If the resource view format is a block
- * compressed format, this value has to be 4 times the original width of the resource. For non-block-compressed formats,
- * this value has to be equal to that of the original resource.
- *
- * - ::cudaResourceViewDesc::height specifies the new height of the texture data. If the resource view format is a block
- * compressed format, this value has to be 4 times the original height of the resource. For non-block-compressed formats,
- * this value has to be equal to that of the original resource.
- *
- * - ::cudaResourceViewDesc::depth specifies the new depth of the texture data. This value has to be equal to that of the
- * original resource.
- *
- * - ::cudaResourceViewDesc::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero.
- * For non-mipmapped resources, this value has to be zero. ::cudaTextureDesc::minMipmapLevelClamp and ::cudaTextureDesc::maxMipmapLevelClamp
- * will be relative to this value. For example, if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified,
- * then the actual minimum mipmap level clamp will be 3.2.
- *
- * - ::cudaResourceViewDesc::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value
- * has to be zero.
- *
- * - ::cudaResourceViewDesc::firstLayer specifies the first layer index for layered textures. This will be the new layer zero.
- * For non-layered resources, this value has to be zero.
- *
- * - ::cudaResourceViewDesc::lastLayer specifies the last layer index for layered textures. For non-layered resources,
- * this value has to be zero.
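- *
- * For illustration, a minimal sketch that creates a texture object over a
- * linear device buffer (\p d_buf and \p bytes are hypothetical names; error
- * checking is omitted):
- * \code
- cudaTextureObject_t makeLinearTexture(float *d_buf, size_t bytes)   /* d_buf: device pointer */
- {
-     struct cudaResourceDesc resDesc;
-     memset(&resDesc, 0, sizeof(resDesc));
-     resDesc.resType = cudaResourceTypeLinear;
-     resDesc.res.linear.devPtr = d_buf;
-     resDesc.res.linear.desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
-     resDesc.res.linear.sizeInBytes = bytes;
-
-     struct cudaTextureDesc texDesc;
-     memset(&texDesc, 0, sizeof(texDesc));
-     texDesc.readMode = cudaReadModeElementType;
-
-     cudaTextureObject_t tex = 0;
-     cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
-     return tex;   /* later released with cudaDestroyTextureObject(tex) */
- }
- * \endcode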
- *
- *
- * \param pTexObject - Texture object to create
- * \param pResDesc - Resource descriptor
- * \param pTexDesc - Texture descriptor
- * \param pResViewDesc - Resource view descriptor
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue
- *
- * \sa
- * ::cudaDestroyTextureObject,
- * ::cuTexObjectCreate
- */
-
-extern __host__ cudaError_t CUDARTAPI cudaCreateTextureObject(cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc, const struct cudaTextureDesc *pTexDesc, const struct cudaResourceViewDesc *pResViewDesc);
-
-/**
- * \brief Destroys a texture object
- *
- * Destroys the texture object specified by \p texObject.
- *
- * \param texObject - Texture object to destroy
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue
- *
- * \sa
- * ::cudaCreateTextureObject,
- * ::cuTexObjectDestroy
- */
-extern __host__ cudaError_t CUDARTAPI cudaDestroyTextureObject(cudaTextureObject_t texObject);
-
-/**
- * \brief Returns a texture object's resource descriptor
- *
- * Returns the resource descriptor for the texture object specified by \p texObject.
- *
- * \param pResDesc - Resource descriptor
- * \param texObject - Texture object
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue
- *
- * \sa
- * ::cudaCreateTextureObject,
- * ::cuTexObjectGetResourceDesc
- */
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceDesc(struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject);
-
-/**
- * \brief Returns a texture object's texture descriptor
- *
- * Returns the texture descriptor for the texture object specified by \p texObject.
- *
- * \param pTexDesc - Texture descriptor
- * \param texObject - Texture object
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue
- *
- * \sa
- * ::cudaCreateTextureObject,
- * ::cuTexObjectGetTextureDesc
- */
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectTextureDesc(struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject);
-
-/**
- * \brief Returns a texture object's resource view descriptor
- *
- * Returns the resource view descriptor for the texture object specified by \p texObject.
- * If no resource view was specified, ::cudaErrorInvalidValue is returned.
- *
- * \param pResViewDesc - Resource view descriptor
- * \param texObject - Texture object
- *
- * \return
- * ::cudaSuccess,
- * ::cudaErrorInvalidValue
- *
- * \sa
- * ::cudaCreateTextureObject,
- * ::cuTexObjectGetResourceViewDesc
- */
-extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceViewDesc(struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject);
-
-/** @} */ /* END CUDART_TEXTURE_OBJECT */
-
-/**
- * \defgroup CUDART_SURFACE_OBJECT Surface Object Management
- *
- * ___MANBRIEF___ surface object management functions of the CUDA runtime API
- * (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the low level surface object management functions
- * of the CUDA runtime application programming interface. The surface object
- * API is only supported on devices of compute capability 3.0 or higher.
- *
- * @{
- */
-
-/**
- * \brief Creates a surface object
- *
- * Creates a surface object and returns it in \p pSurfObject. \p pResDesc describes
- * the data to perform surface load/stores on. ::cudaResourceDesc::resType must be
- * ::cudaResourceTypeArray and ::cudaResourceDesc::res::array::array
- * must be set to a valid CUDA array handle.
- *
- * Surface objects are only supported on devices of compute capability 3.0 or higher.
- * Additionally, a surface object is an opaque value, and, as such, should only be - * accessed through CUDA API calls. - * - * \param pSurfObject - Surface object to create - * \param pResDesc - Resource descriptor - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue - * - * \sa - * ::cudaDestroySurfaceObject, - * ::cuSurfObjectCreate - */ - -extern __host__ cudaError_t CUDARTAPI cudaCreateSurfaceObject(cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc); - -/** - * \brief Destroys a surface object - * - * Destroys the surface object specified by \p surfObject. - * - * \param surfObject - Surface object to destroy - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue - * - * \sa - * ::cudaCreateSurfaceObject, - * ::cuSurfObjectDestroy - */ -extern __host__ cudaError_t CUDARTAPI cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject); - -/** - * \brief Returns a surface object's resource descriptor - * Returns the resource descriptor for the surface object specified by \p surfObject. - * - * \param pResDesc - Resource descriptor - * \param surfObject - Surface object - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue - * - * \sa - * ::cudaCreateSurfaceObject, - * ::cuSurfObjectGetResourceDesc - */ -extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceObjectResourceDesc(struct cudaResourceDesc *pResDesc, cudaSurfaceObject_t surfObject); - -/** @} */ /* END CUDART_SURFACE_OBJECT */ - -/** - * \defgroup CUDART__VERSION Version Management - * - * @{ - */ - -/** - * \brief Returns the CUDA driver version - * - * Returns in \p *driverVersion the version number of the installed CUDA - * driver. If no driver is installed, then 0 is returned as the driver - * version (via \p driverVersion). This function automatically returns - * ::cudaErrorInvalidValue if the \p driverVersion argument is NULL. - * - * \param driverVersion - Returns the CUDA driver version. - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue - * \notefnerr - * - * \sa - * ::cudaRuntimeGetVersion, - * ::cuDriverGetVersion - */ -extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion); - -/** - * \brief Returns the CUDA Runtime version - * - * Returns in \p *runtimeVersion the version number of the installed CUDA - * Runtime. This function automatically returns ::cudaErrorInvalidValue if - * the \p runtimeVersion argument is NULL. - * - * \param runtimeVersion - Returns the CUDA Runtime version. - * - * \return - * ::cudaSuccess, - * ::cudaErrorInvalidValue - * - * \sa - * ::cudaDriverGetVersion, - * ::cuDriverGetVersion - */ -extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion); - -/** @} */ /* END CUDART__VERSION */ - -/** \cond impl_private */ -extern __host__ cudaError_t CUDARTAPI cudaGetExportTable(const void **ppExportTable, const cudaUUID_t *pExportTableId); -/** \endcond impl_private */ - -/** - * \defgroup CUDART_HIGHLEVEL C++ API Routines - * - * ___MANBRIEF___ C++ high level API functions of the CUDA runtime API - * (___CURRENT_FILE___) ___ENDMANBRIEF___ - * - * This section describes the C++ high level API functions of the CUDA runtime - * application programming interface. To use these functions, your - * application needs to be compiled with the \p nvcc compiler. 
- *
- * \brief C++-style interface built on top of CUDA runtime API
- */
-
-/**
- * \defgroup CUDART_DRIVER Interactions with the CUDA Driver API
- *
- * ___MANBRIEF___ interactions between CUDA Driver API and CUDA Runtime API
- * (___CURRENT_FILE___) ___ENDMANBRIEF___
- *
- * This section describes the interactions between the CUDA Driver API and the CUDA Runtime API
- *
- * @{
- *
- * \section CUDART_CUDA_primary Primary Contexts
- *
- * There exists a one-to-one relationship between CUDA devices in the CUDA Runtime
- * API and ::CUcontext s in the CUDA Driver API within a process. The specific
- * context which the CUDA Runtime API uses for a device is called the device's
- * primary context. From the perspective of the CUDA Runtime API, a device and
- * its primary context are synonymous.
- *
- * \section CUDART_CUDA_init Initialization and Tear-Down
- *
- * CUDA Runtime API calls operate on the CUDA Driver API ::CUcontext which is
- * current to the calling host thread.
- *
- * The function ::cudaSetDevice() makes the primary context for the
- * specified device current to the calling thread by calling ::cuCtxSetCurrent().
- *
- * The CUDA Runtime API will automatically initialize the primary context for
- * a device at the first CUDA Runtime API call which requires an active context.
- * If no ::CUcontext is current to the calling thread when a CUDA Runtime API call
- * which requires an active context is made, then the primary context for a device
- * will be selected, made current to the calling thread, and initialized.
- *
- * The context which the CUDA Runtime API initializes will be initialized using
- * the parameters specified by the CUDA Runtime API functions
- * ::cudaSetDeviceFlags(),
- * ::cudaD3D9SetDirect3DDevice(),
- * ::cudaD3D10SetDirect3DDevice(),
- * ::cudaD3D11SetDirect3DDevice(),
- * ::cudaGLSetGLDevice(), and
- * ::cudaVDPAUSetVDPAUDevice().
- * Note that these functions will fail with ::cudaErrorSetOnActiveProcess if they are
- * called when the primary context for the specified device has already been initialized
- * (or if the current device has already been initialized, in the case of
- * ::cudaSetDeviceFlags()).
- *
- * Primary contexts will remain active until they are explicitly deinitialized
- * using ::cudaDeviceReset(). The function ::cudaDeviceReset() will deinitialize the
- * primary context for the calling thread's current device immediately. The context
- * will remain current to all of the threads that it was current to. The next CUDA
- * Runtime API call on any thread which requires an active context will trigger the
- * reinitialization of that device's primary context.
- *
- * Note that there is no reference counting of the primary context's lifetime. It is
- * recommended that the primary context not be deinitialized except just before exit
- * or to recover from an unspecified launch failure.
- *
- * \section CUDART_CUDA_context Context Interoperability
- *
- * Note that the use of multiple ::CUcontext s per device within a single process
- * will substantially degrade performance and is strongly discouraged. Instead,
- * it is highly recommended that the implicit one-to-one device-to-context mapping
- * for the process provided by the CUDA Runtime API be used.
- *
- * If a non-primary ::CUcontext created by the CUDA Driver API is current to a
- * thread then the CUDA Runtime API calls to that thread will operate on that
- * ::CUcontext, with some exceptions listed below.
Interoperability between data - * types is discussed in the following sections. - * - * The function ::cudaPointerGetAttributes() will return the error - * ::cudaErrorIncompatibleDriverContext if the pointer being queried was allocated by a - * non-primary context. The function ::cudaDeviceEnablePeerAccess() and the rest of - * the peer access API may not be called when a non-primary ::CUcontext is current. - * To use the pointer query and peer access APIs with a context created using the - * CUDA Driver API, it is necessary that the CUDA Driver API be used to access - * these features. - * - * All CUDA Runtime API state (e.g, global variables' addresses and values) travels - * with its underlying ::CUcontext. In particular, if a ::CUcontext is moved from one - * thread to another then all CUDA Runtime API state will move to that thread as well. - * - * Please note that attaching to legacy contexts (those with a version of 3010 as returned - * by ::cuCtxGetApiVersion()) is not possible. The CUDA Runtime will return - * ::cudaErrorIncompatibleDriverContext in such cases. - * - * \section CUDART_CUDA_stream Interactions between CUstream and cudaStream_t - * - * The types ::CUstream and ::cudaStream_t are identical and may be used interchangeably. - * - * \section CUDART_CUDA_event Interactions between CUevent and cudaEvent_t - * - * The types ::CUevent and ::cudaEvent_t are identical and may be used interchangeably. - * - * \section CUDART_CUDA_array Interactions between CUarray and cudaArray_t - * - * The types ::CUarray and struct ::cudaArray * represent the same data type and may be used - * interchangeably by casting the two types between each other. - * - * In order to use a ::CUarray in a CUDA Runtime API function which takes a struct ::cudaArray *, - * it is necessary to explicitly cast the ::CUarray to a struct ::cudaArray *. - * - * In order to use a struct ::cudaArray * in a CUDA Driver API function which takes a ::CUarray, - * it is necessary to explicitly cast the struct ::cudaArray * to a ::CUarray . - * - * \section CUDART_CUDA_graphicsResource Interactions between CUgraphicsResource and cudaGraphicsResource_t - * - * The types ::CUgraphicsResource and ::cudaGraphicsResource_t represent the same data type and may be used - * interchangeably by casting the two types between each other. - * - * In order to use a ::CUgraphicsResource in a CUDA Runtime API function which takes a - * ::cudaGraphicsResource_t, it is necessary to explicitly cast the ::CUgraphicsResource - * to a ::cudaGraphicsResource_t. - * - * In order to use a ::cudaGraphicsResource_t in a CUDA Driver API function which takes a - * ::CUgraphicsResource, it is necessary to explicitly cast the ::cudaGraphicsResource_t - * to a ::CUgraphicsResource. 
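The casting rules above can be condensed into a short sketch; drvStream and drvArray are hypothetical handles assumed to have been created through the CUDA Driver API (e.g. via ::cuStreamCreate and ::cuArray3DCreate):

    #include <cuda.h>
    #include <cuda_runtime_api.h>

    void interop(CUstream drvStream, CUarray drvArray) {
        /* ::CUstream and ::cudaStream_t are identical: no cast needed */
        cudaStream_t stream = drvStream;
        cudaStreamSynchronize(stream);

        /* ::CUarray and struct ::cudaArray * require an explicit cast */
        cudaArray_t rtArray = reinterpret_cast<cudaArray_t>(drvArray);
        struct cudaChannelFormatDesc desc;
        struct cudaExtent extent;
        unsigned int flags;
        cudaArrayGetInfo(&desc, &extent, &flags, rtArray);
    }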
- * - * @} - */ - -#if defined(__CUDA_API_VERSION_INTERNAL) - #undef cudaMemcpy - #undef cudaMemcpyToSymbol - #undef cudaMemcpyFromSymbol - #undef cudaMemcpy2D - #undef cudaMemcpyToArray - #undef cudaMemcpy2DToArray - #undef cudaMemcpyFromArray - #undef cudaMemcpy2DFromArray - #undef cudaMemcpyArrayToArray - #undef cudaMemcpy2DArrayToArray - #undef cudaMemcpy3D - #undef cudaMemcpy3DPeer - #undef cudaMemset - #undef cudaMemset2D - #undef cudaMemset3D - #undef cudaMemcpyAsync - #undef cudaMemcpyToSymbolAsync - #undef cudaMemcpyFromSymbolAsync - #undef cudaMemcpy2DAsync - #undef cudaMemcpyToArrayAsync - #undef cudaMemcpy2DToArrayAsync - #undef cudaMemcpyFromArrayAsync - #undef cudaMemcpy2DFromArrayAsync - #undef cudaMemcpy3DAsync - #undef cudaMemcpy3DPeerAsync - #undef cudaMemsetAsync - #undef cudaMemset2DAsync - #undef cudaMemset3DAsync - #undef cudaStreamQuery - #undef cudaStreamGetFlags - #undef cudaStreamGetPriority - #undef cudaEventRecord - #undef cudaStreamWaitEvent - #undef cudaStreamAddCallback - #undef cudaStreamAttachMemAsync - #undef cudaStreamSynchronize - #undef cudaLaunch - #undef cudaLaunchKernel - #undef cudaMemPrefetchAsync - #undef cudaLaunchCooperativeKernel - extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind); - extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(const void *symbol, const void *src, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice)); - extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(void *dst, const void *symbol, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost)); - extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind); - extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind); - extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind); - extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind); - extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind); - extern __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)); - extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)); - extern __host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3DParms *p); - extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p); - extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value, size_t count); - extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch, int value, size_t width, size_t height); - extern __host__ cudaError_t CUDARTAPI 
cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent); - extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); - extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(const void *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); - extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(void *dst, const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); - extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); - extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); - extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); - extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); - extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); - extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0)); - extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream __dv(0)); - extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream __dv(0)); - extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream __dv(0)); - extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream __dv(0)); - extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream); - extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags); - extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetPriority(cudaStream_t hStream, int *priority); - extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0)); - extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags); - extern __host__ cudaError_t CUDARTAPI cudaStreamAddCallback(cudaStream_t stream, cudaStreamCallback_t callback, void *userData, unsigned int flags); - extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length, unsigned int flags); - extern __host__ cudaError_t CUDARTAPI cudaStreamSynchronize(cudaStream_t stream); - extern __host__ cudaError_t CUDARTAPI cudaLaunch(const void *func); - extern __host__ 
cudaError_t CUDARTAPI cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream); - extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream); - extern __host__ cudaError_t CUDARTAPI cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice, cudaStream_t stream); -#elif defined(__CUDART_API_PER_THREAD_DEFAULT_STREAM) - // nvcc stubs reference the 'cudaLaunch' identifier even if it was defined - // to 'cudaLaunch_ptsz'. Redirect through a static inline function. - #undef cudaLaunch - static __inline__ __host__ cudaError_t cudaLaunch(const void *func) - { - return cudaLaunch_ptsz(func); - } - #define cudaLaunch __CUDART_API_PTSZ(cudaLaunch) -#endif - -#if defined(__cplusplus) -} - -#endif /* __cplusplus */ - -#undef __dv - -#endif /* !__CUDA_RUNTIME_API_H__ */ diff --git a/include/triton/external/CUDA/cudnn.h b/include/triton/external/CUDA/cudnn.h deleted file mode 100755 index b375596c1..000000000 --- a/include/triton/external/CUDA/cudnn.h +++ /dev/null @@ -1,1805 +0,0 @@ -/* - * Copyright 1993-2015 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. 
- * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. - */ - - /* cudnn : Neural Networks Library - - */ - -#if !defined(CUDNN_H_) -#define CUDNN_H_ - -#define CUDNN_MAJOR 7 -#define CUDNN_MINOR 0 -#define CUDNN_PATCHLEVEL 2 - -#define CUDNN_VERSION (CUDNN_MAJOR * 1000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL) - -#include "driver_types.h" -#include "cuda_runtime.h" - -#ifndef CUDNNWINAPI -#ifdef _WIN32 -#define CUDNNWINAPI __stdcall -#else -#define CUDNNWINAPI -#endif -#endif - -#if defined (__cplusplus) -extern "C" { -#endif - -struct cudnnContext; -typedef struct cudnnContext *cudnnHandle_t; - -size_t CUDNNWINAPI cudnnGetVersion(void); - -/* Returns CUDA Runtime version statically linked against cudnn */ -size_t CUDNNWINAPI cudnnGetCudartVersion(void); - -/* - * CUDNN return codes - */ -typedef enum -{ - CUDNN_STATUS_SUCCESS = 0, - CUDNN_STATUS_NOT_INITIALIZED = 1, - CUDNN_STATUS_ALLOC_FAILED = 2, - CUDNN_STATUS_BAD_PARAM = 3, - CUDNN_STATUS_INTERNAL_ERROR = 4, - CUDNN_STATUS_INVALID_VALUE = 5, - CUDNN_STATUS_ARCH_MISMATCH = 6, - CUDNN_STATUS_MAPPING_ERROR = 7, - CUDNN_STATUS_EXECUTION_FAILED = 8, - CUDNN_STATUS_NOT_SUPPORTED = 9, - CUDNN_STATUS_LICENSE_ERROR = 10, - CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING = 11, - CUDNN_STATUS_RUNTIME_IN_PROGRESS = 12, - CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 13, -} cudnnStatus_t; - -/* human-readable error messages */ -const char * CUDNNWINAPI cudnnGetErrorString(cudnnStatus_t status); - -/* Forward definition in this version only */ -typedef struct cudnnRuntimeTag_t cudnnRuntimeTag_t; - -typedef enum -{ - CUDNN_ERRQUERY_RAWCODE = 0, - CUDNN_ERRQUERY_NONBLOCKING = 1, - CUDNN_ERRQUERY_BLOCKING = 2, -} cudnnErrQueryMode_t; - -cudnnStatus_t CUDNNWINAPI cudnnQueryRuntimeError( - cudnnHandle_t handle, - cudnnStatus_t *rstatus, - cudnnErrQueryMode_t mode, - cudnnRuntimeTag_t *tag ); - -#ifndef __LIBRARY_TYPES_H__ - -typedef enum libraryPropertyType_t -{ - MAJOR_VERSION, - MINOR_VERSION, - PATCH_LEVEL -} libraryPropertyType; - -#endif - -cudnnStatus_t CUDNNWINAPI cudnnGetProperty(libraryPropertyType type, int *value); - -cudnnStatus_t CUDNNWINAPI cudnnCreate (cudnnHandle_t *handle); -cudnnStatus_t CUDNNWINAPI cudnnDestroy (cudnnHandle_t handle); -cudnnStatus_t CUDNNWINAPI cudnnSetStream (cudnnHandle_t handle, cudaStream_t streamId); -cudnnStatus_t CUDNNWINAPI cudnnGetStream (cudnnHandle_t handle, cudaStream_t *streamId); - -/* Data structures to represent Image/Filter and the Neural Network Layer */ -typedef struct cudnnTensorStruct* cudnnTensorDescriptor_t; -typedef struct cudnnConvolutionStruct* cudnnConvolutionDescriptor_t; -typedef struct cudnnPoolingStruct* cudnnPoolingDescriptor_t; -typedef struct cudnnFilterStruct* cudnnFilterDescriptor_t; -typedef struct cudnnLRNStruct* cudnnLRNDescriptor_t; -typedef struct cudnnActivationStruct* cudnnActivationDescriptor_t; -typedef struct cudnnSpatialTransformerStruct* cudnnSpatialTransformerDescriptor_t; -typedef struct cudnnOpTensorStruct* cudnnOpTensorDescriptor_t; -typedef struct cudnnReduceTensorStruct* cudnnReduceTensorDescriptor_t; -typedef struct cudnnCTCLossStruct* cudnnCTCLossDescriptor_t; -/* -* CUDNN data type -*/ -typedef enum -{ - CUDNN_DATA_FLOAT = 0, - CUDNN_DATA_DOUBLE = 1, - CUDNN_DATA_HALF = 2, - CUDNN_DATA_INT8 = 3, - CUDNN_DATA_INT32 = 4, - CUDNN_DATA_INT8x4 = 5 -} cudnnDataType_t; - -/* -* CUDNN math type -*/ -typedef enum { - 
CUDNN_DEFAULT_MATH = 0, - CUDNN_TENSOR_OP_MATH = 1, -} cudnnMathType_t; - -/* - * CUDNN propagate Nan - */ -typedef enum{ - CUDNN_NOT_PROPAGATE_NAN = 0, - CUDNN_PROPAGATE_NAN = 1, -} cudnnNanPropagation_t; - -/* - * CUDNN Determinism - */ -typedef enum -{ - CUDNN_NON_DETERMINISTIC = 0, - CUDNN_DETERMINISTIC = 1, -} cudnnDeterminism_t; - -/* Maximum supported number of tensor dimensions */ -#define CUDNN_DIM_MAX 8 - -/* Create an instance of a generic Tensor descriptor */ -cudnnStatus_t CUDNNWINAPI cudnnCreateTensorDescriptor( - cudnnTensorDescriptor_t *tensorDesc ); - -typedef enum -{ - CUDNN_TENSOR_NCHW = 0, /* row major (wStride = 1, hStride = w) */ - CUDNN_TENSOR_NHWC = 1, /* feature maps interleaved ( cStride = 1 )*/ - CUDNN_TENSOR_NCHW_VECT_C = 2 /* each image point is vector of element of C : the length of the vector is carried by the data type*/ -} cudnnTensorFormat_t; - -cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptor( - cudnnTensorDescriptor_t tensorDesc, - cudnnTensorFormat_t format, - cudnnDataType_t dataType, /* image data type */ - int n, /* number of inputs (batch size) */ - int c, /* number of input feature maps */ - int h, /* height of input section */ - int w ); /* width of input section */ - -cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptorEx( - cudnnTensorDescriptor_t tensorDesc, - cudnnDataType_t dataType, /* image data type */ - int n, /* number of inputs (batch size) */ - int c, /* number of input feature maps */ - int h, /* height of input section */ - int w, /* width of input section */ - int nStride, - int cStride, - int hStride, - int wStride ); - -cudnnStatus_t CUDNNWINAPI cudnnGetTensor4dDescriptor( - const cudnnTensorDescriptor_t tensorDesc, - cudnnDataType_t *dataType, /* image data type */ - int *n, /* number of inputs (batch size) */ - int *c, /* number of input feature maps */ - int *h, /* height of input section */ - int *w, /* width of input section */ - int *nStride, - int *cStride, - int *hStride, - int *wStride ); - -cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptor( - cudnnTensorDescriptor_t tensorDesc, - cudnnDataType_t dataType, - int nbDims, - const int dimA[], - const int strideA[] ); - -cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptorEx( - cudnnTensorDescriptor_t tensorDesc, - cudnnTensorFormat_t format, - cudnnDataType_t dataType, - int nbDims, - const int dimA[] ); - -cudnnStatus_t CUDNNWINAPI cudnnGetTensorNdDescriptor( - const cudnnTensorDescriptor_t tensorDesc, - int nbDimsRequested, - cudnnDataType_t *dataType, - int *nbDims, - int dimA[], - int strideA[] ); - -cudnnStatus_t CUDNNWINAPI cudnnGetTensorSizeInBytes( - const cudnnTensorDescriptor_t tensorDesc, - size_t *size); - -/* PixelOffset( n, c, h, w ) = n *input_stride + c * feature_stride + h * h_stride + w * w_stride - - 1)Example of all images in row major order one batch of features after the other (with an optional padding on row) - input_stride : c x h x h_stride - feature_stride : h x h_stride - h_stride : >= w ( h_stride = w if no padding) - w_stride : 1 - - - 2)Example of all images in row major with features maps interleaved - input_stride : c x h x h_stride - feature_stride : 1 - h_stride : w x c - w_stride : c - - 3)Example of all images in column major order one batch of features after the other (with optional padding on column) - input_stride : c x w x w_stride - feature_stride : w x w_stride - h_stride : 1 - w_stride : >= h - -*/ - -/* Destroy an instance of Tensor4d descriptor */ -cudnnStatus_t CUDNNWINAPI cudnnDestroyTensorDescriptor( - 
cudnnTensorDescriptor_t tensorDesc ); - - -/* Tensor layout conversion helper (y = alpha * x + beta * y) */ -cudnnStatus_t CUDNNWINAPI cudnnTransformTensor( - cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ); - - -/* Tensor Bias addition : C = alpha * A + beta * C */ -cudnnStatus_t CUDNNWINAPI cudnnAddTensor( - cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t aDesc, - const void *A, - const void *beta, - const cudnnTensorDescriptor_t cDesc, - void *C ); - -/* -* CUDNN OpTensor op type -*/ -typedef enum -{ - CUDNN_OP_TENSOR_ADD = 0, - CUDNN_OP_TENSOR_MUL = 1, - CUDNN_OP_TENSOR_MIN = 2, - CUDNN_OP_TENSOR_MAX = 3, - CUDNN_OP_TENSOR_SQRT = 4, - CUDNN_OP_TENSOR_NOT = 5, -} cudnnOpTensorOp_t; - -cudnnStatus_t CUDNNWINAPI cudnnCreateOpTensorDescriptor( - cudnnOpTensorDescriptor_t *opTensorDesc ); - -cudnnStatus_t CUDNNWINAPI cudnnSetOpTensorDescriptor( - cudnnOpTensorDescriptor_t opTensorDesc, - cudnnOpTensorOp_t opTensorOp, - cudnnDataType_t opTensorCompType, - cudnnNanPropagation_t opTensorNanOpt ); - -cudnnStatus_t CUDNNWINAPI cudnnGetOpTensorDescriptor( - const cudnnOpTensorDescriptor_t opTensorDesc, - cudnnOpTensorOp_t *opTensorOp, - cudnnDataType_t *opTensorCompType, - cudnnNanPropagation_t *opTensorNanOpt ); - -cudnnStatus_t CUDNNWINAPI cudnnDestroyOpTensorDescriptor( - cudnnOpTensorDescriptor_t opTensorDesc ); - -/* Tensor operation : C = op( alpha1 * A, alpha2 * B ) + beta * C */ -/* B tensor is ignored for CUDNN_OP_TENSOR_SQRT, CUDNN_OP_TENSOR_NOT. */ -cudnnStatus_t CUDNNWINAPI cudnnOpTensor( - cudnnHandle_t handle, - const cudnnOpTensorDescriptor_t opTensorDesc, - const void *alpha1, - const cudnnTensorDescriptor_t aDesc, - const void *A, - const void *alpha2, - const cudnnTensorDescriptor_t bDesc, - const void *B, - const void *beta, - const cudnnTensorDescriptor_t cDesc, - void *C ); - -/* -* CUDNN ReduceTensor op type -*/ -typedef enum -{ - CUDNN_REDUCE_TENSOR_ADD = 0, - CUDNN_REDUCE_TENSOR_MUL = 1, - CUDNN_REDUCE_TENSOR_MIN = 2, - CUDNN_REDUCE_TENSOR_MAX = 3, - CUDNN_REDUCE_TENSOR_AMAX = 4, - CUDNN_REDUCE_TENSOR_AVG = 5, - CUDNN_REDUCE_TENSOR_NORM1 = 6, - CUDNN_REDUCE_TENSOR_NORM2 = 7, - CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8, -} cudnnReduceTensorOp_t; - -/* -* CUDNN ReduceTensor indices type -*/ -typedef enum -{ - CUDNN_REDUCE_TENSOR_NO_INDICES = 0, - CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1, -} cudnnReduceTensorIndices_t; - -/* -* CUDNN tensor indices type size (all unsigned) -* Currently not supported, default is 32 bit unsigned. 
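As a sketch of the OpTensor entry points above: an elementwise add C = A + B (alpha1 = alpha2 = 1, beta = 0, FLOAT compute type); the tensor descriptors and device buffers are assumed to be set up elsewhere:

    #include <cudnn.h>

    cudnnStatus_t add_tensors(cudnnHandle_t h,
                              cudnnTensorDescriptor_t aDesc, const void *A,
                              cudnnTensorDescriptor_t bDesc, const void *B,
                              cudnnTensorDescriptor_t cDesc, void *C) {
        cudnnOpTensorDescriptor_t op;
        cudnnCreateOpTensorDescriptor(&op);
        cudnnSetOpTensorDescriptor(op, CUDNN_OP_TENSOR_ADD,
                                   CUDNN_DATA_FLOAT, CUDNN_NOT_PROPAGATE_NAN);
        const float alpha1 = 1.f, alpha2 = 1.f, beta = 0.f;
        cudnnStatus_t st = cudnnOpTensor(h, op, &alpha1, aDesc, A,
                                         &alpha2, bDesc, B, &beta, cDesc, C);
        cudnnDestroyOpTensorDescriptor(op);
        return st;
    }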
-*/ -typedef enum -{ - CUDNN_32BIT_INDICES = 0, - CUDNN_64BIT_INDICES = 1, - CUDNN_16BIT_INDICES = 2, - CUDNN_8BIT_INDICES = 3, -} cudnnIndicesType_t; - -cudnnStatus_t CUDNNWINAPI cudnnCreateReduceTensorDescriptor( - cudnnReduceTensorDescriptor_t *reduceTensorDesc ); - -cudnnStatus_t CUDNNWINAPI cudnnSetReduceTensorDescriptor( - cudnnReduceTensorDescriptor_t reduceTensorDesc, - cudnnReduceTensorOp_t reduceTensorOp, - cudnnDataType_t reduceTensorCompType, - cudnnNanPropagation_t reduceTensorNanOpt, - cudnnReduceTensorIndices_t reduceTensorIndices, - cudnnIndicesType_t reduceTensorIndicesType ); - -cudnnStatus_t CUDNNWINAPI cudnnGetReduceTensorDescriptor( - const cudnnReduceTensorDescriptor_t reduceTensorDesc, - cudnnReduceTensorOp_t *reduceTensorOp, - cudnnDataType_t *reduceTensorCompType, - cudnnNanPropagation_t *reduceTensorNanOpt, - cudnnReduceTensorIndices_t *reduceTensorIndices, - cudnnIndicesType_t *reduceTensorIndicesType ); - -cudnnStatus_t CUDNNWINAPI cudnnDestroyReduceTensorDescriptor( - cudnnReduceTensorDescriptor_t reduceTensorDesc ); - - /* Helper function to return the minimum size of the index space to be passed to the reduction given the input and output tensors */ -cudnnStatus_t CUDNNWINAPI cudnnGetReductionIndicesSize( - cudnnHandle_t handle, - const cudnnReduceTensorDescriptor_t reduceTensorDesc, - const cudnnTensorDescriptor_t aDesc, - const cudnnTensorDescriptor_t cDesc, - size_t *sizeInBytes ); - - /* Helper function to return the minimum size of the workspace to be passed to the reduction given the input and output tensors */ -cudnnStatus_t CUDNNWINAPI cudnnGetReductionWorkspaceSize( - cudnnHandle_t handle, - const cudnnReduceTensorDescriptor_t reduceTensorDesc, - const cudnnTensorDescriptor_t aDesc, - const cudnnTensorDescriptor_t cDesc, - size_t *sizeInBytes ); - -/* Tensor operation : C = reduce op( alpha * A ) + beta * C */ -/* The NaN propagation enum applies to only the min and max reduce ops; the other reduce ops propagate NaN as usual. */ -/* The indices space is ignored for reduce ops other than min or max. 
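A sketch of the reduction workflow these declarations imply: configure the descriptor, query the workspace size, then reduce. Here C = max(A) with no indices requested; everything except the workspace is assumed allocated elsewhere:

    #include <cudnn.h>

    cudnnStatus_t reduce_max(cudnnHandle_t h,
                             cudnnTensorDescriptor_t aDesc, const void *A,
                             cudnnTensorDescriptor_t cDesc, void *C) {
        cudnnReduceTensorDescriptor_t rd;
        cudnnCreateReduceTensorDescriptor(&rd);
        cudnnSetReduceTensorDescriptor(rd, CUDNN_REDUCE_TENSOR_MAX,
                                       CUDNN_DATA_FLOAT, CUDNN_PROPAGATE_NAN,
                                       CUDNN_REDUCE_TENSOR_NO_INDICES,
                                       CUDNN_32BIT_INDICES);
        size_t wsBytes = 0;
        cudnnGetReductionWorkspaceSize(h, rd, aDesc, cDesc, &wsBytes);
        void *ws = NULL;
        cudaMalloc(&ws, wsBytes);
        const float alpha = 1.f, beta = 0.f;
        cudnnStatus_t st = cudnnReduceTensor(h, rd, NULL, 0, ws, wsBytes,
                                             &alpha, aDesc, A, &beta, cDesc, C);
        cudaFree(ws);
        cudnnDestroyReduceTensorDescriptor(rd);
        return st;
    }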
*/ -cudnnStatus_t CUDNNWINAPI cudnnReduceTensor( - cudnnHandle_t handle, - const cudnnReduceTensorDescriptor_t reduceTensorDesc, - void *indices, - size_t indicesSizeInBytes, - void *workspace, - size_t workspaceSizeInBytes, - const void *alpha, - const cudnnTensorDescriptor_t aDesc, - const void *A, - const void *beta, - const cudnnTensorDescriptor_t cDesc, - void *C ); - -/* Set all values of a tensor to a given value : y[i] = value[0] */ -cudnnStatus_t CUDNNWINAPI cudnnSetTensor( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t yDesc, - void *y, - const void *valuePtr ); - -/* Scale all values of a tensor by a given factor : y[i] = alpha * y[i] */ -cudnnStatus_t CUDNNWINAPI cudnnScaleTensor( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t yDesc, - void *y, - const void *alpha ); - -/* - * convolution mode - */ -typedef enum -{ - CUDNN_CONVOLUTION = 0, - CUDNN_CROSS_CORRELATION = 1 -} cudnnConvolutionMode_t; - - -/* Create an instance of FilterStruct */ -cudnnStatus_t CUDNNWINAPI cudnnCreateFilterDescriptor( - cudnnFilterDescriptor_t *filterDesc ); - - -cudnnStatus_t CUDNNWINAPI cudnnSetFilter4dDescriptor( - cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t dataType, /* image data type */ - cudnnTensorFormat_t format, - int k, /* number of output feature maps */ - int c, /* number of input feature maps */ - int h, /* height of each input filter */ - int w ); /* width of each input filter */ - - -cudnnStatus_t CUDNNWINAPI cudnnGetFilter4dDescriptor( - const cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t *dataType, /* image data type */ - cudnnTensorFormat_t *format, - int *k, /* number of output feature maps */ - int *c, /* number of input feature maps */ - int *h, /* height of each input filter */ - int *w ); /* width of each input filter */ - - -cudnnStatus_t CUDNNWINAPI cudnnSetFilterNdDescriptor( - cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t dataType, /* image data type */ - cudnnTensorFormat_t format, - int nbDims, - const int filterDimA[] ); - -cudnnStatus_t CUDNNWINAPI cudnnGetFilterNdDescriptor( - const cudnnFilterDescriptor_t filterDesc, - int nbDimsRequested, - cudnnDataType_t *dataType, /* image data type */ - cudnnTensorFormat_t *format, - int *nbDims, - int filterDimA[] ); - - -cudnnStatus_t CUDNNWINAPI cudnnDestroyFilterDescriptor( - cudnnFilterDescriptor_t filterDesc ); - -/* Create an instance of convolution descriptor */ -cudnnStatus_t CUDNNWINAPI cudnnCreateConvolutionDescriptor( - cudnnConvolutionDescriptor_t *convDesc ); - -cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionMathType( cudnnConvolutionDescriptor_t convDesc, - cudnnMathType_t mathType ); - -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionMathType( cudnnConvolutionDescriptor_t convDesc, - cudnnMathType_t *mathType ); - -cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionGroupCount( cudnnConvolutionDescriptor_t convDesc, - int groupCount ); - -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionGroupCount( cudnnConvolutionDescriptor_t convDesc, - int *groupCount ); - -cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor( cudnnConvolutionDescriptor_t convDesc, - int pad_h, /* zero-padding height */ - int pad_w, /* zero-padding width */ - int u, /* vertical filter stride */ - int v, /* horizontal filter stride */ - int dilation_h, /* filter dilation in the vertical dimension */ - int dilation_w, /* filter dilation in the horizontal dimension */ - cudnnConvolutionMode_t mode, - cudnnDataType_t computeType - ); - -cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor( const 
cudnnConvolutionDescriptor_t convDesc, - int* pad_h, /* zero-padding height */ - int* pad_w, /* zero-padding width */ - int* u, /* vertical filter stride */ - int* v, /* horizontal filter stride */ - int* dilation_h, /* filter dilation in the vertical dimension */ - int* dilation_w, /* filter dilation in the horizontal dimension */ - cudnnConvolutionMode_t* mode, - cudnnDataType_t *computeType - ); - -/* Helper function to return the dimensions of the output tensor given a convolution descriptor */ -cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dForwardOutputDim( - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - const cudnnFilterDescriptor_t filterDesc, - int *n, - int *c, - int *h, - int *w ); - - -cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionNdDescriptor( - cudnnConvolutionDescriptor_t convDesc, - int arrayLength, /* nbDims-2 size */ - const int padA[], - const int filterStrideA[], - const int dilationA[], - cudnnConvolutionMode_t mode, - cudnnDataType_t computeType ); /* convolution data type */ - -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdDescriptor( - const cudnnConvolutionDescriptor_t convDesc, - int arrayLengthRequested, - int *arrayLength, - int padA[], - int strideA[], - int dilationA[], - cudnnConvolutionMode_t *mode, - cudnnDataType_t *computeType ); /* convolution data type */ - - -/* Helper function to return the dimensions of the output tensor given a convolution descriptor */ -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdForwardOutputDim( - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - const cudnnFilterDescriptor_t filterDesc, - int nbDims, - int tensorOuputDimA[] ); - -/* Destroy an instance of convolution descriptor */ -cudnnStatus_t CUDNNWINAPI cudnnDestroyConvolutionDescriptor( - cudnnConvolutionDescriptor_t convDesc ); - - -/* helper function to provide the convolution algo that fit best the requirement */ -typedef enum -{ - CUDNN_CONVOLUTION_FWD_NO_WORKSPACE = 0, - CUDNN_CONVOLUTION_FWD_PREFER_FASTEST = 1, - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT = 2, -} cudnnConvolutionFwdPreference_t; - - -typedef enum -{ - CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 0, - CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1, - CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2, - CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 3, - CUDNN_CONVOLUTION_FWD_ALGO_FFT = 4, - CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5, - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 6, - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 7, - CUDNN_CONVOLUTION_FWD_ALGO_COUNT = 8 -} cudnnConvolutionFwdAlgo_t; - -typedef struct { - cudnnConvolutionFwdAlgo_t algo; - cudnnStatus_t status; - float time; - size_t memory; - cudnnDeterminism_t determinism; - cudnnMathType_t mathType; - int reserved[3]; -} cudnnConvolutionFwdAlgoPerf_t; - -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithmMaxCount( cudnnHandle_t handle, - int *count); - -cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithm( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionFwdAlgoPerf_t *perfResults ); - -cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithmEx( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnConvolutionDescriptor_t convDesc, 
- const cudnnTensorDescriptor_t yDesc, - void *y, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionFwdAlgoPerf_t *perfResults, - void *workSpace, - size_t workSpaceSizeInBytes ); - - -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - cudnnConvolutionFwdPreference_t preference, - size_t memoryLimitInBytes, - cudnnConvolutionFwdAlgo_t *algo ); - - -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm_v7( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t srcDesc, - const cudnnFilterDescriptor_t filterDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t destDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionFwdAlgoPerf_t *perfResults); - -/* - * convolution algorithm (which requires potentially some workspace) - */ - - /* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/ -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardWorkspaceSize( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - cudnnConvolutionFwdAlgo_t algo, - size_t *sizeInBytes ); - - -/* Convolution functions: All of the form "output = alpha * Op(inputs) + beta * output" */ - -/* Function to perform the forward pass for batch convolution */ -cudnnStatus_t CUDNNWINAPI cudnnConvolutionForward( - cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionFwdAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ); - -/* Fused conv/bias/activation operation : y = Act( alpha1 * conv(x) + alpha2 * z + bias ) */ -cudnnStatus_t CUDNNWINAPI cudnnConvolutionBiasActivationForward( - cudnnHandle_t handle, - const void *alpha1, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionFwdAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *alpha2, - const cudnnTensorDescriptor_t zDesc, - const void *z, - const cudnnTensorDescriptor_t biasDesc, - const void *bias, - const cudnnActivationDescriptor_t activationDesc, - const cudnnTensorDescriptor_t yDesc, - void *y ); - -/* Function to compute the bias gradient for batch convolution */ -cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardBias( - cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const void *beta, - const cudnnTensorDescriptor_t dbDesc, - void *db ); - - -/* helper function to provide the convolution algo that fit best the requirement */ -typedef enum -{ - CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE = 0, - CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST = 1, - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT = 2, -} cudnnConvolutionBwdFilterPreference_t; - -typedef enum -{ - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0, /* non-deterministic */ - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 1, - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 2, - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 
3, /* non-deterministic */ - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 4, /* not implemented */ - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5, - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING = 6, - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT = 7 -} cudnnConvolutionBwdFilterAlgo_t; - - -typedef struct { - cudnnConvolutionBwdFilterAlgo_t algo; - cudnnStatus_t status; - float time; - size_t memory; - cudnnDeterminism_t determinism; - cudnnMathType_t mathType; - int reserved[3]; -} cudnnConvolutionBwdFilterAlgoPerf_t; - -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( cudnnHandle_t handle, - int *count); - -cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithm( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t dwDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdFilterAlgoPerf_t *perfResults ); - -cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithmEx( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t dyDesc, - const void *y, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t dwDesc, - void *dw, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, - void *workSpace, - size_t workSpaceSizeInBytes ); - -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t dwDesc, - cudnnConvolutionBwdFilterPreference_t preference, - size_t memoryLimitInBytes, - cudnnConvolutionBwdFilterAlgo_t *algo ); - -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm_v7( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t srcDesc, - const cudnnTensorDescriptor_t diffDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t gradDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdFilterAlgoPerf_t *perfResults); - -/* - * convolution algorithm (which requires potentially some workspace) - */ - - /* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/ -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterWorkspaceSize( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t gradDesc, - cudnnConvolutionBwdFilterAlgo_t algo, - size_t *sizeInBytes ); - -cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardFilter( - cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionBwdFilterAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *beta, - const cudnnFilterDescriptor_t dwDesc, - void *dw ); - -/*********************************************************/ -/* helper function to provide the convolution algo that fit best the requirement */ -typedef enum -{ - CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE = 0, - CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST = 1, - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT = 2, -} 
cudnnConvolutionBwdDataPreference_t; - -typedef enum -{ - CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0, /* non-deterministic */ - CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 1, - CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 2, - CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3, - CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4, - CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5, - CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT = 6 -} cudnnConvolutionBwdDataAlgo_t; - -typedef struct { - cudnnConvolutionBwdDataAlgo_t algo; - cudnnStatus_t status; - float time; - size_t memory; - cudnnDeterminism_t determinism; - cudnnMathType_t mathType; - int reserved[3]; -} cudnnConvolutionBwdDataAlgoPerf_t; - -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithmMaxCount( cudnnHandle_t handle, - int *count); - -cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithm( - cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdDataAlgoPerf_t *perfResults ); - -cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithmEx( - cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - void *dx, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdDataAlgoPerf_t *perfResults, - void *workSpace, - size_t workSpaceSizeInBytes ); - -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm( - cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - cudnnConvolutionBwdDataPreference_t preference, - size_t memoryLimitInBytes, - cudnnConvolutionBwdDataAlgo_t *algo ); - -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm_v7( - cudnnHandle_t handle, - const cudnnFilterDescriptor_t filterDesc, - const cudnnTensorDescriptor_t diffDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t gradDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdDataAlgoPerf_t *perfResults); - - /* Helper function to return the minimum size of the workspace to be passed to the convolution given an algo*/ -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataWorkspaceSize( - cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - cudnnConvolutionBwdDataAlgo_t algo, - size_t *sizeInBytes ); - - -cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardData( - cudnnHandle_t handle, - const void *alpha, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionBwdDataAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx ); - - -cudnnStatus_t CUDNNWINAPI cudnnIm2Col( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - void *colBuffer ); - - -/* - * softmax algorithm - */ -typedef enum -{ - 
CUDNN_SOFTMAX_FAST = 0, /* straightforward implementation */ - CUDNN_SOFTMAX_ACCURATE = 1, /* subtract max from every point to avoid overflow */ - CUDNN_SOFTMAX_LOG = 2 -} cudnnSoftmaxAlgorithm_t; - -typedef enum -{ - CUDNN_SOFTMAX_MODE_INSTANCE = 0, /* compute the softmax over all C, H, W for each N */ - CUDNN_SOFTMAX_MODE_CHANNEL = 1 /* compute the softmax over all C for each H, W, N */ -} cudnnSoftmaxMode_t; - -/* Softmax functions: All of the form "output = alpha * Op(inputs) + beta * output" */ - -/* Function to perform forward softmax */ -cudnnStatus_t CUDNNWINAPI cudnnSoftmaxForward( - cudnnHandle_t handle, - cudnnSoftmaxAlgorithm_t algo, - cudnnSoftmaxMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ); - -/* Function to perform backward softmax */ -cudnnStatus_t CUDNNWINAPI cudnnSoftmaxBackward( - cudnnHandle_t handle, - cudnnSoftmaxAlgorithm_t algo, - cudnnSoftmaxMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx ); - -/* - * pooling mode - */ -typedef enum -{ - CUDNN_POOLING_MAX = 0, - CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1, /* count for average includes padded values */ - CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2, /* count for average does not include padded values */ - CUDNN_POOLING_MAX_DETERMINISTIC = 3 -} cudnnPoolingMode_t; - -/* Create an instance of pooling descriptor */ -cudnnStatus_t CUDNNWINAPI cudnnCreatePoolingDescriptor( - cudnnPoolingDescriptor_t *poolingDesc ); - -cudnnStatus_t CUDNNWINAPI cudnnSetPooling2dDescriptor( - cudnnPoolingDescriptor_t poolingDesc, - cudnnPoolingMode_t mode, - cudnnNanPropagation_t maxpoolingNanOpt, - int windowHeight, - int windowWidth, - int verticalPadding, - int horizontalPadding, - int verticalStride, - int horizontalStride ); - -cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dDescriptor( - const cudnnPoolingDescriptor_t poolingDesc, - cudnnPoolingMode_t *mode, - cudnnNanPropagation_t *maxpoolingNanOpt, - int *windowHeight, - int *windowWidth, - int *verticalPadding, - int *horizontalPadding, - int *verticalStride, - int *horizontalStride ); - -cudnnStatus_t CUDNNWINAPI cudnnSetPoolingNdDescriptor( - cudnnPoolingDescriptor_t poolingDesc, - const cudnnPoolingMode_t mode, - const cudnnNanPropagation_t maxpoolingNanOpt, - int nbDims, - const int windowDimA[], - const int paddingA[], - const int strideA[] ); - -cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdDescriptor( - const cudnnPoolingDescriptor_t poolingDesc, - int nbDimsRequested, - cudnnPoolingMode_t *mode, - cudnnNanPropagation_t *maxpoolingNanOpt, - int *nbDims, - int windowDimA[], - int paddingA[], - int strideA[] ); - -cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdForwardOutputDim( - const cudnnPoolingDescriptor_t poolingDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - int nbDims, - int outputTensorDimA[] ); - -cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dForwardOutputDim( - const cudnnPoolingDescriptor_t poolingDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - int *n, - int *c, - int *h, - int *w ); - - -/* Destroy an instance of pooling descriptor */ -cudnnStatus_t CUDNNWINAPI cudnnDestroyPoolingDescriptor( - cudnnPoolingDescriptor_t poolingDesc ); - -/* Pooling functions: All of the form "output = alpha * Op(inputs) + beta * output" */ - -/* Function to perform forward pooling */ 
-cudnnStatus_t CUDNNWINAPI cudnnPoolingForward( - cudnnHandle_t handle, - const cudnnPoolingDescriptor_t poolingDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ); - -/* Function to perform backward pooling */ -cudnnStatus_t CUDNNWINAPI cudnnPoolingBackward( - cudnnHandle_t handle, - const cudnnPoolingDescriptor_t poolingDesc, - const void *alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx ); - -/* - * activation mode - */ -typedef enum -{ - CUDNN_ACTIVATION_SIGMOID = 0, - CUDNN_ACTIVATION_RELU = 1, - CUDNN_ACTIVATION_TANH = 2, - CUDNN_ACTIVATION_CLIPPED_RELU = 3, - CUDNN_ACTIVATION_ELU = 4 -} cudnnActivationMode_t; - -/* Activation functions: All of the form "output = alpha * Op(inputs) + beta * output" */ -cudnnStatus_t CUDNNWINAPI cudnnCreateActivationDescriptor( - cudnnActivationDescriptor_t *activationDesc); - -cudnnStatus_t CUDNNWINAPI cudnnSetActivationDescriptor( - cudnnActivationDescriptor_t activationDesc, - cudnnActivationMode_t mode, - cudnnNanPropagation_t reluNanOpt, - double coef ); /* ceiling for clipped RELU, alpha for ELU */ - -cudnnStatus_t CUDNNWINAPI cudnnGetActivationDescriptor( - const cudnnActivationDescriptor_t activationDesc, - cudnnActivationMode_t *mode, - cudnnNanPropagation_t *reluNanOpt, - double* coef ); /* ceiling for clipped RELU, alpha for ELU */ - -cudnnStatus_t CUDNNWINAPI cudnnDestroyActivationDescriptor( - cudnnActivationDescriptor_t activationDesc); - -/* Function to perform forward activation */ -cudnnStatus_t CUDNNWINAPI cudnnActivationForward( - cudnnHandle_t handle, - cudnnActivationDescriptor_t activationDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ); - -/* Function to perform backward activation */ -cudnnStatus_t CUDNNWINAPI cudnnActivationBackward( - cudnnHandle_t handle, - cudnnActivationDescriptor_t activationDesc, - const void *alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx ); - -/* -* Create an instance of LRN (Local Response Normalization) descriptor -* Uses lrnN=5, lrnAlpha=1e-4, lrnBeta=0.75, lrnK=2.0 as defaults from Krizhevsky'12 ImageNet paper -*/ -cudnnStatus_t CUDNNWINAPI cudnnCreateLRNDescriptor( - cudnnLRNDescriptor_t *normDesc ); - -#define CUDNN_LRN_MIN_N 1 /* minimum allowed lrnN */ -#define CUDNN_LRN_MAX_N 16 /* maximum allowed lrnN */ -#define CUDNN_LRN_MIN_K 1e-5 /* minimum allowed lrnK */ -#define CUDNN_LRN_MIN_BETA 0.01 /* minimum allowed lrnBeta */ - -/* LRN layer mode */ -typedef enum -{ - CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0,/* Normalize across tensor's dimA[1] dimension */ -} cudnnLRNMode_t; - -/* -* Uses a window [center-lookBehind, center+lookAhead], where -* lookBehind = floor( (lrnN-1)/2 ), lookAhead = lrnN-lookBehind-1. -* Values of double parameters cast to tensor data type. 
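For illustration, a forward ReLU through the activation entry points above (alpha = 1, beta = 0, so y = Op(x)); tensor descriptors and device buffers are assumed set up elsewhere:

    #include <cudnn.h>

    cudnnStatus_t relu_forward(cudnnHandle_t h,
                               cudnnTensorDescriptor_t xDesc, const void *x,
                               cudnnTensorDescriptor_t yDesc, void *y) {
        cudnnActivationDescriptor_t act;
        cudnnCreateActivationDescriptor(&act);
        /* coef is unused for plain RELU (ceiling for CLIPPED_RELU, alpha for ELU) */
        cudnnSetActivationDescriptor(act, CUDNN_ACTIVATION_RELU,
                                     CUDNN_NOT_PROPAGATE_NAN, 0.0);
        const float alpha = 1.f, beta = 0.f;
        cudnnStatus_t st = cudnnActivationForward(h, act, &alpha, xDesc, x,
                                                  &beta, yDesc, y);
        cudnnDestroyActivationDescriptor(act);
        return st;
    }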
-*/ -cudnnStatus_t CUDNNWINAPI cudnnSetLRNDescriptor( - cudnnLRNDescriptor_t normDesc, - unsigned lrnN, - double lrnAlpha, - double lrnBeta, - double lrnK ); -/* -* Retrieve the settings currently stored in an LRN layer descriptor -* Any of the provided pointers can be NULL (no corresponding value will be returned) -*/ -cudnnStatus_t CUDNNWINAPI cudnnGetLRNDescriptor( - cudnnLRNDescriptor_t normDesc, - unsigned* lrnN, - double* lrnAlpha, - double* lrnBeta, - double* lrnK ); - -/* Destroy an instance of LRN descriptor */ -cudnnStatus_t CUDNNWINAPI cudnnDestroyLRNDescriptor( cudnnLRNDescriptor_t lrnDesc ); - -/* LRN functions: output = alpha * normalize(x) + beta * old_y */ - -/* LRN cross-channel forward computation. Double parameters cast to tensor data type */ -cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelForward( - cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnLRNMode_t lrnMode, - const void* alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ); - -/* LRN cross-channel backward computation. Double parameters cast to tensor data type */ -cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelBackward( - cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnLRNMode_t lrnMode, - const void* alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx); - -typedef enum -{ - CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0, -} cudnnDivNormMode_t; - -/* LCN/divisive normalization functions: y = alpha * normalize(x) + beta * y */ -cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationForward( - cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnDivNormMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */ - const void *x, - const void *means, /* if NULL, means are assumed to be zero */ - void *temp, - void *temp2, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ); - -cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationBackward( - cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnDivNormMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, /* same desc for x, means, dy, temp, temp2 */ - const void *x, - const void *means, /* if NULL, means are assumed to be zero */ - const void *dy, - void *temp, - void *temp2, - const void *beta, - const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */ - void *dx, /* output x differential */ - void *dMeans ); /* output means differential, can be NULL */ - -typedef enum -{ - /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */ - CUDNN_BATCHNORM_PER_ACTIVATION = 0, - - /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */ - CUDNN_BATCHNORM_SPATIAL = 1, - - /* - * bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors). - * May be faster than CUDNN_BATCHNORM_SPATIAL but imposes some limits on the range of values - */ - CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2, -} cudnnBatchNormMode_t; - -#define CUDNN_BN_MIN_EPSILON 1e-5 /* Minimum epsilon allowed to be used in the Batch Normalization formula */ - -/* -* Derives a tensor descriptor from layer data descriptor for BatchNormalization -* scale, invVariance, bnBias, bnScale tensors. 
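A small sketch tying the window formula above to the Krizhevsky'12 defaults: with lrnN = 5, lookBehind = floor((5-1)/2) = 2 and lookAhead = 5-2-1 = 2, i.e. a [center-2, center+2] window:

    #include <cudnn.h>

    cudnnStatus_t make_default_lrn(cudnnLRNDescriptor_t *out) {
        cudnnCreateLRNDescriptor(out);
        /* lrnN = 5  ->  lookBehind = 2, lookAhead = 2 */
        return cudnnSetLRNDescriptor(*out, 5 /*lrnN*/, 1e-4 /*lrnAlpha*/,
                                     0.75 /*lrnBeta*/, 2.0 /*lrnK*/);
    }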
Use this tensor desc for -* bnScaleBiasMeanVarDesc and bnScaleBiasDiffDesc in Batch Normalization forward and backward functions. -*/ -cudnnStatus_t CUDNNWINAPI cudnnDeriveBNTensorDescriptor( - cudnnTensorDescriptor_t derivedBnDesc, - const cudnnTensorDescriptor_t xDesc, - cudnnBatchNormMode_t mode ); - -/* Computes y = BN(x). Also accumulates moving averages of mean and inverse variances */ -cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardTraining( - cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - - const void *alpha, /* alpha[0] = result blend factor */ - const void *beta, /* beta[0] = dest layer blend factor */ - - const cudnnTensorDescriptor_t xDesc, - const void *x, /* NxCxHxW */ - const cudnnTensorDescriptor_t yDesc, - void *y, /* NxCxHxW */ - - /* Shared desc for the next 6 tensors in the argument list. - Data type to be set as follows: - type = (typeOf(x) == double) ? double : float - Dimensions for this descriptor depend on normalization mode - - Spatial Normalization : tensors are expected to have dims 1xCx1x1 - (normalization is performed across NxHxW) - - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW - (normalization is performed across N) */ - const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, - - /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation */ - const void *bnScale, - const void *bnBias, - - /* MUST use factor=1 in the very first call of a complete training cycle. - Use a factor=1/(1+n) at N-th call to the function to get - Cumulative Moving Average (CMA) behavior - CMA[n] = (x[1]+...+x[n])/n - Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) = - ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) = - CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */ - double exponentialAverageFactor, - - /* Used in Training phase only. - runningMean = newMean*factor + runningMean*(1-factor) */ - void *resultRunningMean, - /* Output in training mode, input in inference. Is the moving average - of variance[x] (factor is applied in the same way as for runningMean) */ - void *resultRunningVariance, - - /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */ - double epsilon, - - /* Optionally save intermediate results from the forward pass here - - can be reused to speed up backward pass. NULL if unused */ - void *resultSaveMean, - void *resultSaveInvVariance ); - -/* -* Performs Batch Normalization during Inference: -* y[i] = bnScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + bnBias[k] -* with bnScale, bnBias, runningMean, runningInvVariance tensors indexed -* according to spatial or per-activation mode. Refer to cudnnBatchNormalizationForwardTraining -* above for notes on function arguments. -*/ -cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardInference( - cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - const void *alpha, /* alpha[0] = result blend factor */ - const void *beta, /* beta[0] = dest layer blend factor */ - const cudnnTensorDescriptor_t xDesc, - const void *x, /* NxCxHxW */ - const cudnnTensorDescriptor_t yDesc, - void *y, /* NxCxHxW */ - const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, - const void *bnScale, - const void *bnBias, - const void *estimatedMean, - const void *estimatedVariance, - double epsilon ); - -/* Performs backward pass of Batch Normalization layer. 
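The exponentialAverageFactor recipe above boils down to a simple schedule; a sketch, where the 0-based call index n is the caller's own bookkeeping:

    /* factor = 1 on the first call (n = 0), 1/(1+n) thereafter, realizing
       the cumulative moving average:
         CMA[n+1] = CMA[n] * (1 - 1/(n+1)) + x[n+1] / (n+1)              */
    double exponentialAverageFactorFor(unsigned long long n) {
        return 1.0 / (1.0 + static_cast<double>(n));
    }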
Returns x gradient, -* bnScale gradient and bnBias gradient */ -cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationBackward( - cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - const void *alphaDataDiff, - const void *betaDataDiff, - const void *alphaParamDiff, - const void *betaParamDiff, - const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */ - const void *x, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t dxDesc, - void *dx, - /* Shared tensor desc for the 4 tensors below */ - const cudnnTensorDescriptor_t dBnScaleBiasDesc, - const void *bnScale, /* bnBias doesn't affect backpropagation */ - /* scale and bias diff are not backpropagated below this layer */ - void *dBnScaleResult, - void *dBnBiasResult, - /* Same epsilon as forward pass */ - double epsilon, - - /* Optionally cached intermediate results from - forward pass */ - const void *savedMean, - const void *savedInvVariance ); - - -/* APIs for spatial transformer network*/ -typedef enum { - CUDNN_SAMPLER_BILINEAR=0, -} cudnnSamplerType_t; - -cudnnStatus_t CUDNNWINAPI cudnnCreateSpatialTransformerDescriptor( - cudnnSpatialTransformerDescriptor_t *stDesc); - -cudnnStatus_t CUDNNWINAPI cudnnSetSpatialTransformerNdDescriptor( - cudnnSpatialTransformerDescriptor_t stDesc, - cudnnSamplerType_t samplerType, - cudnnDataType_t dataType, - const int nbDims, - const int dimA[]); - -cudnnStatus_t CUDNNWINAPI cudnnDestroySpatialTransformerDescriptor( - cudnnSpatialTransformerDescriptor_t stDesc); - -cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorForward( - cudnnHandle_t handle, - const cudnnSpatialTransformerDescriptor_t stDesc, - const void *theta, - void *grid); - -cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorBackward( - cudnnHandle_t handle, - const cudnnSpatialTransformerDescriptor_t stDesc, - const void *dgrid, - void *dtheta); - -cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerForward( - cudnnHandle_t handle, - cudnnSpatialTransformerDescriptor_t stDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *grid, - const void *beta, - cudnnTensorDescriptor_t yDesc, - void *y); - -cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerBackward( - cudnnHandle_t handle, - cudnnSpatialTransformerDescriptor_t stDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx, - const void *alphaDgrid, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const void *grid, - const void *betaDgrid, - void *dgrid); - -typedef struct cudnnDropoutStruct * cudnnDropoutDescriptor_t; - -cudnnStatus_t CUDNNWINAPI cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t * dropoutDesc); - -cudnnStatus_t CUDNNWINAPI cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc); - -/*helper function to determine size of the states to be passed to cudnnSetDropoutDescriptor */ -cudnnStatus_t CUDNNWINAPI cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t * sizeInBytes); - -/*helper function to determine size of the reserve space to be passed to dropout forward/backward calls */ -cudnnStatus_t CUDNNWINAPI cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t * sizeInBytes); - -cudnnStatus_t CUDNNWINAPI cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, - cudnnHandle_t handle, - float dropout, - void * states, - size_t stateSizeInBytes, - unsigned long long seed); - -// Restores the dropout descriptor to a previously saved-off state 
-cudnnStatus_t CUDNNWINAPI cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, - cudnnHandle_t handle, - float dropout, - void * states, - size_t stateSizeInBytes, - unsigned long long seed); - -cudnnStatus_t CUDNNWINAPI cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, - cudnnHandle_t handle, - float * dropout, - void ** states, - unsigned long long * seed); - -cudnnStatus_t CUDNNWINAPI cudnnDropoutForward(cudnnHandle_t handle, - const cudnnDropoutDescriptor_t dropoutDesc, - const cudnnTensorDescriptor_t xdesc, - const void * x, - const cudnnTensorDescriptor_t ydesc, - void * y, - void * reserveSpace, - size_t reserveSpaceSizeInBytes); - -cudnnStatus_t CUDNNWINAPI cudnnDropoutBackward(cudnnHandle_t handle, - const cudnnDropoutDescriptor_t dropoutDesc, - const cudnnTensorDescriptor_t dydesc, - const void * dy, - const cudnnTensorDescriptor_t dxdesc, - void * dx, - void * reserveSpace, - size_t reserveSpaceSizeInBytes); - -/* RNN API */ -typedef enum - { - CUDNN_RNN_RELU = 0, /* Stock RNN with ReLU activation */ - CUDNN_RNN_TANH = 1, /* Stock RNN with tanh activation */ - CUDNN_LSTM = 2, /* LSTM with no peephole connections */ - CUDNN_GRU = 3 /* Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1); */ - } cudnnRNNMode_t; - -typedef enum - { - CUDNN_UNIDIRECTIONAL = 0, - CUDNN_BIDIRECTIONAL = 1 /* Using output concatenation at each step. Do we also want to support output sum? */ - } cudnnDirectionMode_t; - -typedef enum - { - CUDNN_LINEAR_INPUT = 0, - CUDNN_SKIP_INPUT = 1 - } cudnnRNNInputMode_t; - - -typedef enum - { - CUDNN_RNN_ALGO_STANDARD = 0, - CUDNN_RNN_ALGO_PERSIST_STATIC = 1, - CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2 - } cudnnRNNAlgo_t; - -struct cudnnRNNStruct; -typedef struct cudnnRNNStruct* cudnnRNNDescriptor_t; - -cudnnStatus_t CUDNNWINAPI cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t * rnnDesc); -cudnnStatus_t CUDNNWINAPI cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc); - -struct cudnnPersistentRNNPlan; -typedef struct cudnnPersistentRNNPlan *cudnnPersistentRNNPlan_t; - - -/* Expensive. Creates the plan for the specific settings. */ -cudnnStatus_t CUDNNWINAPI cudnnCreatePersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, - const int minibatch, - const cudnnDataType_t dataType, - cudnnPersistentRNNPlan_t * plan); - -/* Attaches the plan to the descriptor. */ -cudnnStatus_t CUDNNWINAPI cudnnSetPersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, - cudnnPersistentRNNPlan_t plan); - -cudnnStatus_t CUDNNWINAPI cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan); - -cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - const int hiddenSize, - const int numLayers, - cudnnDropoutDescriptor_t dropoutDesc, /* Between layers, not between recurrent steps.
*/ - cudnnRNNInputMode_t inputMode, - cudnnDirectionMode_t direction, - cudnnRNNMode_t mode, - cudnnRNNAlgo_t algo, - cudnnDataType_t dataType); - -cudnnStatus_t CUDNNWINAPI cudnnGetRNNDescriptor(cudnnHandle_t cudnnHandle, - cudnnRNNDescriptor_t rnnDesc, - int * hiddenSize, - int * numLayers, - cudnnDropoutDescriptor_t * dropoutDesc, - cudnnRNNInputMode_t * inputMode, - cudnnDirectionMode_t * direction, - cudnnRNNMode_t * mode, - cudnnRNNAlgo_t * algo, - cudnnDataType_t * dataType); - -cudnnStatus_t CUDNNWINAPI cudnnSetRNNMatrixMathType (cudnnRNNDescriptor_t desc, cudnnMathType_t math); - -/* dataType in the RNN descriptor is used to determine math precision */ -/* dataType in weight descriptors and input descriptors is used to describe storage */ -cudnnStatus_t CUDNNWINAPI cudnnGetRNNWorkspaceSize( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - size_t *sizeInBytes); - -cudnnStatus_t CUDNNWINAPI cudnnGetRNNTrainingReserveSize( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - size_t *sizeInBytes); - - -cudnnStatus_t CUDNNWINAPI cudnnGetRNNParamsSize( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const cudnnTensorDescriptor_t xDesc, - size_t *sizeInBytes, - cudnnDataType_t dataType); - -cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerMatrixParams( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int layer, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const void * w, - const int linLayerID, - cudnnFilterDescriptor_t linLayerMatDesc, - void ** linLayerMat); - -cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerBiasParams( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int layer, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const void * w, - const int linLayerID, - cudnnFilterDescriptor_t linLayerBiasDesc, - void ** linLayerBias); - -cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInference( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t * xDesc, - const void * x, - const cudnnTensorDescriptor_t hxDesc, - const void * hx, - const cudnnTensorDescriptor_t cxDesc, - const void * cx, - const cudnnFilterDescriptor_t wDesc, - const void * w, - const cudnnTensorDescriptor_t *yDesc, - void * y, - const cudnnTensorDescriptor_t hyDesc, - void * hy, - const cudnnTensorDescriptor_t cyDesc, - void * cy, - void * workspace, - size_t workSpaceSizeInBytes); - -cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTraining( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - const void * x, - const cudnnTensorDescriptor_t hxDesc, - const void * hx, - const cudnnTensorDescriptor_t cxDesc, - const void * cx, - const cudnnFilterDescriptor_t wDesc, - const void * w, - const cudnnTensorDescriptor_t *yDesc, - void * y, - const cudnnTensorDescriptor_t hyDesc, - void * hy, - const cudnnTensorDescriptor_t cyDesc, - void * cy, - void * workspace, - size_t workSpaceSizeInBytes, - void * reserveSpace, - size_t reserveSpaceSizeInBytes); - -cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardData( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t * yDesc, - const void * y, - const cudnnTensorDescriptor_t * dyDesc, - const void * dy, - const cudnnTensorDescriptor_t dhyDesc, - const void 
* dhy, - const cudnnTensorDescriptor_t dcyDesc, - const void * dcy, - const cudnnFilterDescriptor_t wDesc, - const void * w, - const cudnnTensorDescriptor_t hxDesc, - const void * hx, - const cudnnTensorDescriptor_t cxDesc, - const void * cx, - const cudnnTensorDescriptor_t * dxDesc, - void * dx, - const cudnnTensorDescriptor_t dhxDesc, - void * dhx, - const cudnnTensorDescriptor_t dcxDesc, - void * dcx, - void * workspace, - size_t workSpaceSizeInBytes, - void * reserveSpace, - size_t reserveSpaceSizeInBytes ); - - -cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeights( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t * xDesc, - const void * x, - const cudnnTensorDescriptor_t hxDesc, - const void * hx, - const cudnnTensorDescriptor_t * yDesc, - const void * y, - const void * workspace, - size_t workSpaceSizeInBytes, - const cudnnFilterDescriptor_t dwDesc, - void * dw, - const void * reserveSpace, - size_t reserveSpaceSizeInBytes ); - -typedef enum -{ - CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0, - CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 -}cudnnCTCLossAlgo_t; - -/* -* Create an instance of a CTC (Connectionist Temporal Classification) loss descriptor -*/ -cudnnStatus_t CUDNNWINAPI cudnnCreateCTCLossDescriptor( cudnnCTCLossDescriptor_t* ctcLossDesc ); - -cudnnStatus_t CUDNNWINAPI cudnnSetCTCLossDescriptor( - cudnnCTCLossDescriptor_t ctcLossDesc, - cudnnDataType_t compType ); - -cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossDescriptor( - cudnnCTCLossDescriptor_t ctcLossDesc, - cudnnDataType_t* compType ); - -cudnnStatus_t CUDNNWINAPI cudnnDestroyCTCLossDescriptor( cudnnCTCLossDescriptor_t ctcLossDesc ); - -/* return the ctc costs and gradients, given the probabilities and labels */ -cudnnStatus_t CUDNNWINAPI cudnnCTCLoss( cudnnHandle_t handle, - const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the mini batch size, A is the alphabet size) */ - const void * probs, /* probabilities after softmax, in GPU memory */ - const int * labels, /* labels, in CPU memory */ - const int * labelLengths, /* the length of each label, in CPU memory */ - const int * inputLengths, /* the lengths of timing steps in each batch, in CPU memory */ - void * costs, /* the returned costs of CTC, in GPU memory */ - const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */ - const void * gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */ - cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ - cudnnCTCLossDescriptor_t ctcLossDesc, - void * workspace, /* pointer to the workspace, in GPU memory */ - size_t workSpaceSizeInBytes); /* the workspace size needed */ - -/* return the workspace size needed for ctc */ -cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossWorkspaceSize( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the mini batch size, A is the alphabet size) */ - const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A. 
To compute costs only, set it to NULL */ - const int * labels, /* labels, in CPU memory */ - const int * labelLengths, /* the length of each label, in CPU memory */ - const int * inputLengths, /* the lengths of timing steps in each batch, in CPU memory */ - cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ - cudnnCTCLossDescriptor_t ctcLossDesc, - size_t *sizeInBytes ); /* pointer to the returned workspace size */ - - -/* DEPRECATED routines to be removed next release: - Users should use the non-suffixed version (which has the API and functionality of the _v6 version) - Routines with the _v5 suffix have the functionality of the non-suffixed routines in CUDNN V6 - */ - -cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v6(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - const int hiddenSize, - const int numLayers, - cudnnDropoutDescriptor_t dropoutDesc, /* Between layers, not between recurrent steps. */ - cudnnRNNInputMode_t inputMode, - cudnnDirectionMode_t direction, - cudnnRNNMode_t mode, - cudnnRNNAlgo_t algo, - cudnnDataType_t dataType); - -cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v5(cudnnRNNDescriptor_t rnnDesc, - int hiddenSize, - int numLayers, - cudnnDropoutDescriptor_t dropoutDesc, /* Between layers, not between recurrent steps. */ - cudnnRNNInputMode_t inputMode, - cudnnDirectionMode_t direction, - cudnnRNNMode_t mode, - cudnnDataType_t dataType); -#if defined (__cplusplus) -} -#endif - -#endif /* CUDNN_H_ */ - diff --git a/include/triton/external/CUDA/cusparse.h b/include/triton/external/CUDA/cusparse.h deleted file mode 100644 index 0381c2b0b..000000000 --- a/include/triton/external/CUDA/cusparse.h +++ /dev/null @@ -1,6257 +0,0 @@ -/* - * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R.
2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. - */ - -#if !defined(CUSPARSE_H_) -#define CUSPARSE_H_ - - -#ifndef CUSPARSEAPI -#ifdef _WIN32 -#define CUSPARSEAPI __stdcall -#else -#define CUSPARSEAPI -#endif -#endif - -#include "driver_types.h" -#include "cuComplex.h" /* import complex data type */ - -#include "cuda_fp16.h" - -#include "library_types.h" - -#if defined(__cplusplus) -extern "C" { -#endif /* __cplusplus */ - -/* CUSPARSE status type returns */ -typedef enum{ - CUSPARSE_STATUS_SUCCESS=0, - CUSPARSE_STATUS_NOT_INITIALIZED=1, - CUSPARSE_STATUS_ALLOC_FAILED=2, - CUSPARSE_STATUS_INVALID_VALUE=3, - CUSPARSE_STATUS_ARCH_MISMATCH=4, - CUSPARSE_STATUS_MAPPING_ERROR=5, - CUSPARSE_STATUS_EXECUTION_FAILED=6, - CUSPARSE_STATUS_INTERNAL_ERROR=7, - CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED=8, - CUSPARSE_STATUS_ZERO_PIVOT=9 -} cusparseStatus_t; - -/* Opaque structure holding CUSPARSE library context */ -struct cusparseContext; -typedef struct cusparseContext *cusparseHandle_t; - -/* Opaque structure holding the matrix descriptor */ -struct cusparseMatDescr; -typedef struct cusparseMatDescr *cusparseMatDescr_t; - -/* Opaque structure holding the sparse triangular solve information */ -struct cusparseSolveAnalysisInfo; -typedef struct cusparseSolveAnalysisInfo *cusparseSolveAnalysisInfo_t; - -/* Opaque structures holding the sparse triangular solve information */ -struct csrsv2Info; -typedef struct csrsv2Info *csrsv2Info_t; - -struct bsrsv2Info; -typedef struct bsrsv2Info *bsrsv2Info_t; - -struct bsrsm2Info; -typedef struct bsrsm2Info *bsrsm2Info_t; - -/* Opaque structures holding incomplete Cholesky information */ -struct csric02Info; -typedef struct csric02Info *csric02Info_t; - -struct bsric02Info; -typedef struct bsric02Info *bsric02Info_t; - -/* Opaque structures holding incomplete LU information */ -struct csrilu02Info; -typedef struct csrilu02Info *csrilu02Info_t; - -struct bsrilu02Info; -typedef struct bsrilu02Info *bsrilu02Info_t; - -/* Opaque structures holding the hybrid (HYB) storage information */ -struct cusparseHybMat; -typedef struct cusparseHybMat *cusparseHybMat_t; - -/* Opaque structures holding sparse gemm information */ -struct csrgemm2Info; -typedef struct csrgemm2Info *csrgemm2Info_t; - -/* Opaque structure holding the sorting information */ -struct csru2csrInfo; -typedef struct csru2csrInfo *csru2csrInfo_t; - -/* Opaque structure holding the coloring information */ -struct cusparseColorInfo; -typedef struct cusparseColorInfo *cusparseColorInfo_t; - -/* Opaque structure holding the prune information */ -struct pruneInfo; -typedef struct pruneInfo *pruneInfo_t; - -/* Types definitions */ -typedef enum { - CUSPARSE_POINTER_MODE_HOST = 0, - CUSPARSE_POINTER_MODE_DEVICE = 1 -} cusparsePointerMode_t; - -typedef enum { - CUSPARSE_ACTION_SYMBOLIC = 0, - CUSPARSE_ACTION_NUMERIC = 1 -} cusparseAction_t; - -typedef enum { - 
CUSPARSE_MATRIX_TYPE_GENERAL = 0, - CUSPARSE_MATRIX_TYPE_SYMMETRIC = 1, - CUSPARSE_MATRIX_TYPE_HERMITIAN = 2, - CUSPARSE_MATRIX_TYPE_TRIANGULAR = 3 -} cusparseMatrixType_t; - -typedef enum { - CUSPARSE_FILL_MODE_LOWER = 0, - CUSPARSE_FILL_MODE_UPPER = 1 -} cusparseFillMode_t; - -typedef enum { - CUSPARSE_DIAG_TYPE_NON_UNIT = 0, - CUSPARSE_DIAG_TYPE_UNIT = 1 -} cusparseDiagType_t; - -typedef enum { - CUSPARSE_INDEX_BASE_ZERO = 0, - CUSPARSE_INDEX_BASE_ONE = 1 -} cusparseIndexBase_t; - -typedef enum { - CUSPARSE_OPERATION_NON_TRANSPOSE = 0, - CUSPARSE_OPERATION_TRANSPOSE = 1, - CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2 -} cusparseOperation_t; - -typedef enum { - CUSPARSE_DIRECTION_ROW = 0, - CUSPARSE_DIRECTION_COLUMN = 1 -} cusparseDirection_t; - -typedef enum { - CUSPARSE_HYB_PARTITION_AUTO = 0, // automatically decide how to split the data into regular/irregular part - CUSPARSE_HYB_PARTITION_USER = 1, // store data into regular part up to a user-specified threshold - CUSPARSE_HYB_PARTITION_MAX = 2 // store all data in the regular part -} cusparseHybPartition_t; - -// used in csrsv2, csric02, and csrilu02 -typedef enum { - CUSPARSE_SOLVE_POLICY_NO_LEVEL = 0, // no level information is generated, only reports structural zero. - CUSPARSE_SOLVE_POLICY_USE_LEVEL = 1 -} cusparseSolvePolicy_t; - -typedef enum { - CUSPARSE_SIDE_LEFT =0, - CUSPARSE_SIDE_RIGHT=1 -} cusparseSideMode_t; - -typedef enum { - CUSPARSE_COLOR_ALG0 = 0, // default - CUSPARSE_COLOR_ALG1 = 1 -} cusparseColorAlg_t; - -typedef enum { - CUSPARSE_ALG0 = 0, //default, naive - CUSPARSE_ALG1 = 1 //merge path -} cusparseAlgMode_t; - -/* CUSPARSE initialization and management routines */ -cusparseStatus_t CUSPARSEAPI cusparseCreate(cusparseHandle_t *handle); -cusparseStatus_t CUSPARSEAPI cusparseDestroy(cusparseHandle_t handle); -cusparseStatus_t CUSPARSEAPI cusparseGetVersion(cusparseHandle_t handle, int *version); -cusparseStatus_t CUSPARSEAPI cusparseGetProperty(libraryPropertyType type, int *value); -cusparseStatus_t CUSPARSEAPI cusparseSetStream(cusparseHandle_t handle, cudaStream_t streamId); -cusparseStatus_t CUSPARSEAPI cusparseGetStream(cusparseHandle_t handle, cudaStream_t *streamId); - - -/* CUSPARSE type creation, destruction, set and get routines */ -cusparseStatus_t CUSPARSEAPI cusparseGetPointerMode(cusparseHandle_t handle, cusparsePointerMode_t *mode); -cusparseStatus_t CUSPARSEAPI cusparseSetPointerMode(cusparseHandle_t handle, cusparsePointerMode_t mode); - -/* sparse matrix descriptor */ -/* When the matrix descriptor is created, its fields are initialized to: - CUSPARSE_MATRIX_TYPE_GENERAL - CUSPARSE_INDEX_BASE_ZERO - All other fields are uninitialized -*/ -cusparseStatus_t CUSPARSEAPI cusparseCreateMatDescr(cusparseMatDescr_t *descrA); -cusparseStatus_t CUSPARSEAPI cusparseDestroyMatDescr (cusparseMatDescr_t descrA); - -cusparseStatus_t CUSPARSEAPI cusparseCopyMatDescr(cusparseMatDescr_t dest, const cusparseMatDescr_t src); - -cusparseStatus_t CUSPARSEAPI cusparseSetMatType(cusparseMatDescr_t descrA, cusparseMatrixType_t type); -cusparseMatrixType_t CUSPARSEAPI cusparseGetMatType(const cusparseMatDescr_t descrA); - -cusparseStatus_t CUSPARSEAPI cusparseSetMatFillMode(cusparseMatDescr_t descrA, cusparseFillMode_t fillMode); -cusparseFillMode_t CUSPARSEAPI cusparseGetMatFillMode(const cusparseMatDescr_t descrA); - -cusparseStatus_t CUSPARSEAPI cusparseSetMatDiagType(cusparseMatDescr_t descrA, cusparseDiagType_t diagType); -cusparseDiagType_t CUSPARSEAPI cusparseGetMatDiagType(const cusparseMatDescr_t descrA); -
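-
-/* Illustrative sketch (not part of the original header): typical creation and
-   configuration of a matrix descriptor, using only the routines declared in
-   this section; error handling is omitted for brevity:
-
-       cusparseMatDescr_t descrA;
-       cusparseCreateMatDescr(&descrA);
-       cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL);
-       cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO);
-       ... use descrA in the level-1/2/3 routines below ...
-       cusparseDestroyMatDescr(descrA);
-*/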
-cusparseStatus_t CUSPARSEAPI cusparseSetMatIndexBase(cusparseMatDescr_t descrA, cusparseIndexBase_t base); -cusparseIndexBase_t CUSPARSEAPI cusparseGetMatIndexBase(const cusparseMatDescr_t descrA); - -/* sparse triangular solve and incomplete-LU and Cholesky (algorithm 1) */ -cusparseStatus_t CUSPARSEAPI cusparseCreateSolveAnalysisInfo(cusparseSolveAnalysisInfo_t *info); -cusparseStatus_t CUSPARSEAPI cusparseDestroySolveAnalysisInfo(cusparseSolveAnalysisInfo_t info); -cusparseStatus_t CUSPARSEAPI cusparseGetLevelInfo(cusparseHandle_t handle, - cusparseSolveAnalysisInfo_t info, - int *nlevels, - int **levelPtr, - int **levelInd); - -/* sparse triangular solve (algorithm 2) */ -cusparseStatus_t CUSPARSEAPI cusparseCreateCsrsv2Info(csrsv2Info_t *info); -cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrsv2Info(csrsv2Info_t info); - -/* incomplete Cholesky (algorithm 2)*/ -cusparseStatus_t CUSPARSEAPI cusparseCreateCsric02Info(csric02Info_t *info); -cusparseStatus_t CUSPARSEAPI cusparseDestroyCsric02Info(csric02Info_t info); - -cusparseStatus_t CUSPARSEAPI cusparseCreateBsric02Info(bsric02Info_t *info); -cusparseStatus_t CUSPARSEAPI cusparseDestroyBsric02Info(bsric02Info_t info); - -/* incomplete LU (algorithm 2) */ -cusparseStatus_t CUSPARSEAPI cusparseCreateCsrilu02Info(csrilu02Info_t *info); -cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrilu02Info(csrilu02Info_t info); - -cusparseStatus_t CUSPARSEAPI cusparseCreateBsrilu02Info(bsrilu02Info_t *info); -cusparseStatus_t CUSPARSEAPI cusparseDestroyBsrilu02Info(bsrilu02Info_t info); - -/* block-CSR triangular solve (algorithm 2) */ -cusparseStatus_t CUSPARSEAPI cusparseCreateBsrsv2Info(bsrsv2Info_t *info); -cusparseStatus_t CUSPARSEAPI cusparseDestroyBsrsv2Info(bsrsv2Info_t info); - -cusparseStatus_t CUSPARSEAPI cusparseCreateBsrsm2Info(bsrsm2Info_t *info); -cusparseStatus_t CUSPARSEAPI cusparseDestroyBsrsm2Info(bsrsm2Info_t info); - -/* hybrid (HYB) format */ -cusparseStatus_t CUSPARSEAPI cusparseCreateHybMat(cusparseHybMat_t *hybA); -cusparseStatus_t CUSPARSEAPI cusparseDestroyHybMat(cusparseHybMat_t hybA); - -/* sorting information */ -cusparseStatus_t CUSPARSEAPI cusparseCreateCsru2csrInfo(csru2csrInfo_t *info); -cusparseStatus_t CUSPARSEAPI cusparseDestroyCsru2csrInfo(csru2csrInfo_t info); - -/* coloring info */ -cusparseStatus_t CUSPARSEAPI cusparseCreateColorInfo(cusparseColorInfo_t *info); -cusparseStatus_t CUSPARSEAPI cusparseDestroyColorInfo(cusparseColorInfo_t info); - -cusparseStatus_t CUSPARSEAPI cusparseSetColorAlgs(cusparseColorInfo_t info, cusparseColorAlg_t alg); -cusparseStatus_t CUSPARSEAPI cusparseGetColorAlgs(cusparseColorInfo_t info, cusparseColorAlg_t *alg); - -/* prune information */ -cusparseStatus_t CUSPARSEAPI cusparseCreatePruneInfo(pruneInfo_t *info); - -cusparseStatus_t CUSPARSEAPI cusparseDestroyPruneInfo(pruneInfo_t info); - - -/* --- Sparse Level 1 routines --- */ - -/* Description: Addition of a scalar multiple of a sparse vector x - and a dense vector y. 
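-   As an illustrative sketch (not part of the original header), the operation is
-   y[xInd[k]] = alpha * xVal[k] + y[xInd[k]] for k = 0,...,nnz-1, with xInd
-   interpreted according to idxBase; e.g., for single precision:
-   cusparseSaxpyi(handle, nnz, &alpha, xVal, xInd, y, CUSPARSE_INDEX_BASE_ZERO);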
*/ -cusparseStatus_t CUSPARSEAPI cusparseSaxpyi(cusparseHandle_t handle, - int nnz, - const float *alpha, - const float *xVal, - const int *xInd, - float *y, - cusparseIndexBase_t idxBase); - -cusparseStatus_t CUSPARSEAPI cusparseDaxpyi(cusparseHandle_t handle, - int nnz, - const double *alpha, - const double *xVal, - const int *xInd, - double *y, - cusparseIndexBase_t idxBase); - -cusparseStatus_t CUSPARSEAPI cusparseCaxpyi(cusparseHandle_t handle, - int nnz, - const cuComplex *alpha, - const cuComplex *xVal, - const int *xInd, - cuComplex *y, - cusparseIndexBase_t idxBase); - -cusparseStatus_t CUSPARSEAPI cusparseZaxpyi(cusparseHandle_t handle, - int nnz, - const cuDoubleComplex *alpha, - const cuDoubleComplex *xVal, - const int *xInd, - cuDoubleComplex *y, - cusparseIndexBase_t idxBase); - -/* Description: dot product of a sparse vector x and a dense vector y. */ -cusparseStatus_t CUSPARSEAPI cusparseSdoti(cusparseHandle_t handle, - int nnz, - const float *xVal, - const int *xInd, - const float *y, - float *resultDevHostPtr, - cusparseIndexBase_t idxBase); - -cusparseStatus_t CUSPARSEAPI cusparseDdoti(cusparseHandle_t handle, - int nnz, - const double *xVal, - const int *xInd, - const double *y, - double *resultDevHostPtr, - cusparseIndexBase_t idxBase); - -cusparseStatus_t CUSPARSEAPI cusparseCdoti(cusparseHandle_t handle, - int nnz, - const cuComplex *xVal, - const int *xInd, - const cuComplex *y, - cuComplex *resultDevHostPtr, - cusparseIndexBase_t idxBase); - -cusparseStatus_t CUSPARSEAPI cusparseZdoti(cusparseHandle_t handle, - int nnz, - const cuDoubleComplex *xVal, - const int *xInd, - const cuDoubleComplex *y, - cuDoubleComplex *resultDevHostPtr, - cusparseIndexBase_t idxBase); - -/* Description: dot product of complex conjugate of a sparse vector x - and a dense vector y. */ -cusparseStatus_t CUSPARSEAPI cusparseCdotci(cusparseHandle_t handle, - int nnz, - const cuComplex *xVal, - const int *xInd, - const cuComplex *y, - cuComplex *resultDevHostPtr, - cusparseIndexBase_t idxBase); - -cusparseStatus_t CUSPARSEAPI cusparseZdotci(cusparseHandle_t handle, - int nnz, - const cuDoubleComplex *xVal, - const int *xInd, - const cuDoubleComplex *y, - cuDoubleComplex *resultDevHostPtr, - cusparseIndexBase_t idxBase); - - -/* Description: Gather of non-zero elements from dense vector y into - sparse vector x. */ -cusparseStatus_t CUSPARSEAPI cusparseSgthr(cusparseHandle_t handle, - int nnz, - const float *y, - float *xVal, - const int *xInd, - cusparseIndexBase_t idxBase); - -cusparseStatus_t CUSPARSEAPI cusparseDgthr(cusparseHandle_t handle, - int nnz, - const double *y, - double *xVal, - const int *xInd, - cusparseIndexBase_t idxBase); - -cusparseStatus_t CUSPARSEAPI cusparseCgthr(cusparseHandle_t handle, - int nnz, - const cuComplex *y, - cuComplex *xVal, - const int *xInd, - cusparseIndexBase_t idxBase); - -cusparseStatus_t CUSPARSEAPI cusparseZgthr(cusparseHandle_t handle, - int nnz, - const cuDoubleComplex *y, - cuDoubleComplex *xVal, - const int *xInd, - cusparseIndexBase_t idxBase); - -/* Description: Gather of non-zero elements from dense vector y into - sparse vector x (also replacing these elements in y by zeros).
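-   Equivalently (illustrative sketch, not in the original header), for each
-   k = 0,...,nnz-1: xVal[k] = y[xInd[k]]; y[xInd[k]] = 0;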
*/ -cusparseStatus_t CUSPARSEAPI cusparseSgthrz(cusparseHandle_t handle, - int nnz, - float *y, - float *xVal, - const int *xInd, - cusparseIndexBase_t idxBase); - -cusparseStatus_t CUSPARSEAPI cusparseDgthrz(cusparseHandle_t handle, - int nnz, - double *y, - double *xVal, - const int *xInd, - cusparseIndexBase_t idxBase); - -cusparseStatus_t CUSPARSEAPI cusparseCgthrz(cusparseHandle_t handle, - int nnz, - cuComplex *y, - cuComplex *xVal, - const int *xInd, - cusparseIndexBase_t idxBase); - -cusparseStatus_t CUSPARSEAPI cusparseZgthrz(cusparseHandle_t handle, - int nnz, - cuDoubleComplex *y, - cuDoubleComplex *xVal, - const int *xInd, - cusparseIndexBase_t idxBase); - -/* Description: Scatter of elements of the sparse vector x into - dense vector y. */ -cusparseStatus_t CUSPARSEAPI cusparseSsctr(cusparseHandle_t handle, - int nnz, - const float *xVal, - const int *xInd, - float *y, - cusparseIndexBase_t idxBase); - -cusparseStatus_t CUSPARSEAPI cusparseDsctr(cusparseHandle_t handle, - int nnz, - const double *xVal, - const int *xInd, - double *y, - cusparseIndexBase_t idxBase); - -cusparseStatus_t CUSPARSEAPI cusparseCsctr(cusparseHandle_t handle, - int nnz, - const cuComplex *xVal, - const int *xInd, - cuComplex *y, - cusparseIndexBase_t idxBase); - -cusparseStatus_t CUSPARSEAPI cusparseZsctr(cusparseHandle_t handle, - int nnz, - const cuDoubleComplex *xVal, - const int *xInd, - cuDoubleComplex *y, - cusparseIndexBase_t idxBase); - -/* Description: Givens rotation, where c and s are cosine and sine, - x and y are sparse and dense vectors, respectively. */ -cusparseStatus_t CUSPARSEAPI cusparseSroti(cusparseHandle_t handle, - int nnz, - float *xVal, - const int *xInd, - float *y, - const float *c, - const float *s, - cusparseIndexBase_t idxBase); - -cusparseStatus_t CUSPARSEAPI cusparseDroti(cusparseHandle_t handle, - int nnz, - double *xVal, - const int *xInd, - double *y, - const double *c, - const double *s, - cusparseIndexBase_t idxBase); - - -/* --- Sparse Level 2 routines --- */ - -cusparseStatus_t CUSPARSEAPI cusparseSgemvi(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int n, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - int nnz, - const float *xVal, - const int *xInd, - const float *beta, /* host or device pointer */ - float *y, - cusparseIndexBase_t idxBase, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseSgemvi_bufferSize( cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int n, - int nnz, - int *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseDgemvi(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int n, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - int nnz, - const double *xVal, - const int *xInd, - const double *beta, /* host or device pointer */ - double *y, - cusparseIndexBase_t idxBase, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDgemvi_bufferSize( cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int n, - int nnz, - int *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseCgemvi(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - int nnz, - const cuComplex *xVal, - const int *xInd, - const cuComplex *beta, /* host or device pointer */ - cuComplex *y, - cusparseIndexBase_t idxBase, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseCgemvi_bufferSize( cusparseHandle_t handle, - 
cusparseOperation_t transA, - int m, - int n, - int nnz, - int *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseZgemvi(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - int nnz, - const cuDoubleComplex *xVal, - const int *xInd, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *y, - cusparseIndexBase_t idxBase, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseZgemvi_bufferSize( cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int n, - int nnz, - int *pBufferSize); - - -/* Description: Matrix-vector multiplication y = alpha * op(A) * x + beta * y, - where A is a sparse matrix in CSR storage format, x and y are dense vectors. */ -cusparseStatus_t CUSPARSEAPI cusparseScsrmv(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int n, - int nnz, - const float *alpha, - const cusparseMatDescr_t descrA, - const float *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const float *x, - const float *beta, - float *y); - -cusparseStatus_t CUSPARSEAPI cusparseDcsrmv(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int n, - int nnz, - const double *alpha, - const cusparseMatDescr_t descrA, - const double *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const double *x, - const double *beta, - double *y); - -cusparseStatus_t CUSPARSEAPI cusparseCcsrmv(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int n, - int nnz, - const cuComplex *alpha, - const cusparseMatDescr_t descrA, - const cuComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cuComplex *x, - const cuComplex *beta, - cuComplex *y); - -cusparseStatus_t CUSPARSEAPI cusparseZcsrmv(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int n, - int nnz, - const cuDoubleComplex *alpha, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cuDoubleComplex *x, - const cuDoubleComplex *beta, - cuDoubleComplex *y); - -//Returns number of bytes -cusparseStatus_t CUSPARSEAPI cusparseCsrmvEx_bufferSize(cusparseHandle_t handle, - cusparseAlgMode_t alg, - cusparseOperation_t transA, - int m, - int n, - int nnz, - const void *alpha, - cudaDataType alphatype, - const cusparseMatDescr_t descrA, - const void *csrValA, - cudaDataType csrValAtype, - const int *csrRowPtrA, - const int *csrColIndA, - const void *x, - cudaDataType xtype, - const void *beta, - cudaDataType betatype, - void *y, - cudaDataType ytype, - cudaDataType executiontype, - size_t *bufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseCsrmvEx(cusparseHandle_t handle, - cusparseAlgMode_t alg, - cusparseOperation_t transA, - int m, - int n, - int nnz, - const void *alpha, - cudaDataType alphatype, - const cusparseMatDescr_t descrA, - const void *csrValA, - cudaDataType csrValAtype, - const int *csrRowPtrA, - const int *csrColIndA, - const void *x, - cudaDataType xtype, - const void *beta, - cudaDataType betatype, - void *y, - cudaDataType ytype, - cudaDataType executiontype, - void* buffer); - -/* Description: Matrix-vector multiplication y = alpha * op(A) * x + beta * y, - where A is a sparse matrix in CSR storage format, x and y are dense vectors - using a Merge Path load-balancing implementation. 
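-   The _mp variants take the same arguments as the cusparse<t>csrmv routines
-   above and differ only in the load-balancing strategy (compare CUSPARSE_ALG1
-   in cusparseAlgMode_t); an illustrative single-precision call:
-   cusparseScsrmv_mp(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, m, n, nnz,
-                     &alpha, descrA, csrValA, csrRowPtrA, csrColIndA, x, &beta, y);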
*/ - cusparseStatus_t CUSPARSEAPI cusparseScsrmv_mp(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int n, - int nnz, - const float *alpha, - const cusparseMatDescr_t descrA, - const float *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const float *x, - const float *beta, - float *y); - -cusparseStatus_t CUSPARSEAPI cusparseDcsrmv_mp(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int n, - int nnz, - const double *alpha, - const cusparseMatDescr_t descrA, - const double *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const double *x, - const double *beta, - double *y); - -cusparseStatus_t CUSPARSEAPI cusparseCcsrmv_mp(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int n, - int nnz, - const cuComplex *alpha, - const cusparseMatDescr_t descrA, - const cuComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cuComplex *x, - const cuComplex *beta, - cuComplex *y); - -cusparseStatus_t CUSPARSEAPI cusparseZcsrmv_mp(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int n, - int nnz, - const cuDoubleComplex *alpha, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cuDoubleComplex *x, - const cuDoubleComplex *beta, - cuDoubleComplex *y); - - -/* Description: Matrix-vector multiplication y = alpha * op(A) * x + beta * y, - where A is a sparse matrix in HYB storage format, x and y are dense vectors. */ -cusparseStatus_t CUSPARSEAPI cusparseShybmv(cusparseHandle_t handle, - cusparseOperation_t transA, - const float *alpha, - const cusparseMatDescr_t descrA, - const cusparseHybMat_t hybA, - const float *x, - const float *beta, - float *y); - -cusparseStatus_t CUSPARSEAPI cusparseDhybmv(cusparseHandle_t handle, - cusparseOperation_t transA, - const double *alpha, - const cusparseMatDescr_t descrA, - const cusparseHybMat_t hybA, - const double *x, - const double *beta, - double *y); - -cusparseStatus_t CUSPARSEAPI cusparseChybmv(cusparseHandle_t handle, - cusparseOperation_t transA, - const cuComplex *alpha, - const cusparseMatDescr_t descrA, - const cusparseHybMat_t hybA, - const cuComplex *x, - const cuComplex *beta, - cuComplex *y); - -cusparseStatus_t CUSPARSEAPI cusparseZhybmv(cusparseHandle_t handle, - cusparseOperation_t transA, - const cuDoubleComplex *alpha, - const cusparseMatDescr_t descrA, - const cusparseHybMat_t hybA, - const cuDoubleComplex *x, - const cuDoubleComplex *beta, - cuDoubleComplex *y); - -/* Description: Matrix-vector multiplication y = alpha * op(A) * x + beta * y, - where A is a sparse matrix in BSR storage format, x and y are dense vectors. 
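-   Note (illustrative, assuming CUSPARSE_OPERATION_NON_TRANSPOSE): mb and nb are
-   the numbers of block rows and block columns of A, so x holds nb*blockDim
-   entries and y holds mb*blockDim entries.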
*/ -cusparseStatus_t CUSPARSEAPI cusparseSbsrmv(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - int mb, - int nb, - int nnzb, - const float *alpha, - const cusparseMatDescr_t descrA, - const float *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int blockDim, - const float *x, - const float *beta, - float *y); - -cusparseStatus_t CUSPARSEAPI cusparseDbsrmv(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - int mb, - int nb, - int nnzb, - const double *alpha, - const cusparseMatDescr_t descrA, - const double *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int blockDim, - const double *x, - const double *beta, - double *y); - -cusparseStatus_t CUSPARSEAPI cusparseCbsrmv(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - int mb, - int nb, - int nnzb, - const cuComplex *alpha, - const cusparseMatDescr_t descrA, - const cuComplex *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int blockDim, - const cuComplex *x, - const cuComplex *beta, - cuComplex *y); - -cusparseStatus_t CUSPARSEAPI cusparseZbsrmv(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - int mb, - int nb, - int nnzb, - const cuDoubleComplex *alpha, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int blockDim, - const cuDoubleComplex *x, - const cuDoubleComplex *beta, - cuDoubleComplex *y); - -/* Description: Matrix-vector multiplication y = alpha * op(A) * x + beta * y, - where A is a sparse matrix in extended BSR storage format, x and y are dense - vectors. */ -cusparseStatus_t CUSPARSEAPI cusparseSbsrxmv(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - int sizeOfMask, - int mb, - int nb, - int nnzb, - const float *alpha, - const cusparseMatDescr_t descrA, - const float *bsrSortedValA, - const int *bsrSortedMaskPtrA, - const int *bsrSortedRowPtrA, - const int *bsrSortedEndPtrA, - const int *bsrSortedColIndA, - int blockDim, - const float *x, - const float *beta, - float *y); - - -cusparseStatus_t CUSPARSEAPI cusparseDbsrxmv(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - int sizeOfMask, - int mb, - int nb, - int nnzb, - const double *alpha, - const cusparseMatDescr_t descrA, - const double *bsrSortedValA, - const int *bsrSortedMaskPtrA, - const int *bsrSortedRowPtrA, - const int *bsrSortedEndPtrA, - const int *bsrSortedColIndA, - int blockDim, - const double *x, - const double *beta, - double *y); - -cusparseStatus_t CUSPARSEAPI cusparseCbsrxmv(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - int sizeOfMask, - int mb, - int nb, - int nnzb, - const cuComplex *alpha, - const cusparseMatDescr_t descrA, - const cuComplex *bsrSortedValA, - const int *bsrSortedMaskPtrA, - const int *bsrSortedRowPtrA, - const int *bsrSortedEndPtrA, - const int *bsrSortedColIndA, - int blockDim, - const cuComplex *x, - const cuComplex *beta, - cuComplex *y); - - -cusparseStatus_t CUSPARSEAPI cusparseZbsrxmv(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - int sizeOfMask, - int mb, - int nb, - int nnzb, - const cuDoubleComplex *alpha, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *bsrSortedValA, - const int *bsrSortedMaskPtrA, - const int *bsrSortedRowPtrA, - const int 
*bsrSortedEndPtrA, - const int *bsrSortedColIndA, - int blockDim, - const cuDoubleComplex *x, - const cuDoubleComplex *beta, - cuDoubleComplex *y); - -/* Description: Solution of triangular linear system op(A) * x = alpha * f, - where A is a sparse matrix in CSR storage format, rhs f and solution x - are dense vectors. This routine implements algorithm 1 for the solve. */ -cusparseStatus_t CUSPARSEAPI cusparseCsrsv_analysisEx(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int nnz, - const cusparseMatDescr_t descrA, - const void *csrSortedValA, - cudaDataType csrSortedValAtype, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseSolveAnalysisInfo_t info, - cudaDataType executiontype); - -cusparseStatus_t CUSPARSEAPI cusparseScsrsv_analysis(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int nnz, - const cusparseMatDescr_t descrA, - const float *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseSolveAnalysisInfo_t info); - -cusparseStatus_t CUSPARSEAPI cusparseDcsrsv_analysis(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int nnz, - const cusparseMatDescr_t descrA, - const double *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseSolveAnalysisInfo_t info); - -cusparseStatus_t CUSPARSEAPI cusparseCcsrsv_analysis(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int nnz, - const cusparseMatDescr_t descrA, - const cuComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseSolveAnalysisInfo_t info); - -cusparseStatus_t CUSPARSEAPI cusparseZcsrsv_analysis(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int nnz, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseSolveAnalysisInfo_t info); - -cusparseStatus_t CUSPARSEAPI cusparseCsrsv_solveEx(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - const void *alpha, - cudaDataType alphatype, - const cusparseMatDescr_t descrA, - const void *csrSortedValA, - cudaDataType csrSortedValAtype, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseSolveAnalysisInfo_t info, - const void *f, - cudaDataType ftype, - void *x, - cudaDataType xtype, - cudaDataType executiontype); - -cusparseStatus_t CUSPARSEAPI cusparseScsrsv_solve(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - const float *alpha, - const cusparseMatDescr_t descrA, - const float *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseSolveAnalysisInfo_t info, - const float *f, - float *x); - -cusparseStatus_t CUSPARSEAPI cusparseDcsrsv_solve(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - const double *alpha, - const cusparseMatDescr_t descrA, - const double *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseSolveAnalysisInfo_t info, - const double *f, - double *x); - -cusparseStatus_t CUSPARSEAPI cusparseCcsrsv_solve(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - const cuComplex *alpha, - const cusparseMatDescr_t descrA, - const cuComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseSolveAnalysisInfo_t info, - const cuComplex *f, - cuComplex *x); - -cusparseStatus_t CUSPARSEAPI cusparseZcsrsv_solve(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - const cuDoubleComplex 
*alpha, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseSolveAnalysisInfo_t info, - const cuDoubleComplex *f, - cuDoubleComplex *x); - -/* Description: Solution of triangular linear system op(A) * x = alpha * f, - where A is a sparse matrix in CSR storage format, rhs f and solution x - are dense vectors. This routine implements algorithm 2 for this problem. - Also, it provides a utility function to query size of buffer used. */ -cusparseStatus_t CUSPARSEAPI cusparseXcsrsv2_zeroPivot(cusparseHandle_t handle, - csrsv2Info_t info, - int *position); - -cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_bufferSize(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int nnz, - const cusparseMatDescr_t descrA, - float *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csrsv2Info_t info, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_bufferSize(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int nnz, - const cusparseMatDescr_t descrA, - double *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csrsv2Info_t info, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_bufferSize(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int nnz, - const cusparseMatDescr_t descrA, - cuComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csrsv2Info_t info, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_bufferSize(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int nnz, - const cusparseMatDescr_t descrA, - cuDoubleComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csrsv2Info_t info, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_bufferSizeExt(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int nnz, - const cusparseMatDescr_t descrA, - float *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csrsv2Info_t info, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_bufferSizeExt(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int nnz, - const cusparseMatDescr_t descrA, - double *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csrsv2Info_t info, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_bufferSizeExt(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int nnz, - const cusparseMatDescr_t descrA, - cuComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csrsv2Info_t info, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_bufferSizeExt(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int nnz, - const cusparseMatDescr_t descrA, - cuDoubleComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csrsv2Info_t info, - size_t *pBufferSize); - - -cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_analysis(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int nnz, - const cusparseMatDescr_t descrA, - const float *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csrsv2Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_analysis(cusparseHandle_t handle, - cusparseOperation_t transA, -
int m, - int nnz, - const cusparseMatDescr_t descrA, - const double *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csrsv2Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_analysis(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int nnz, - const cusparseMatDescr_t descrA, - const cuComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csrsv2Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_analysis(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int nnz, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csrsv2Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_solve(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int nnz, - const float *alpha, - const cusparseMatDescr_t descrA, - const float *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csrsv2Info_t info, - const float *f, - float *x, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_solve(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int nnz, - const double *alpha, - const cusparseMatDescr_t descrA, - const double *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csrsv2Info_t info, - const double *f, - double *x, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_solve(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int nnz, - const cuComplex *alpha, - const cusparseMatDescr_t descrA, - const cuComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csrsv2Info_t info, - const cuComplex *f, - cuComplex *x, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_solve(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int nnz, - const cuDoubleComplex *alpha, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csrsv2Info_t info, - const cuDoubleComplex *f, - cuDoubleComplex *x, - cusparseSolvePolicy_t policy, - void *pBuffer); - -/* Description: Solution of triangular linear system op(A) * x = alpha * f, - where A is a sparse matrix in block-CSR storage format, rhs f and solution x - are dense vectors. This routine implements algorithm 2 for this problem. - Also, it provides a utility function to query size of buffer used.
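-   A typical call sequence (illustrative sketch only, not in the original
-   header; error checks omitted), shown for single precision:
-     cusparseSbsrsv2_bufferSize(handle, dirA, transA, mb, nnzb, descrA, bsrVal,
-                                bsrRowPtr, bsrColInd, blockDim, info, &bufferSize);
-     cudaMalloc(&pBuffer, bufferSize);
-     cusparseSbsrsv2_analysis(handle, dirA, transA, mb, nnzb, descrA, bsrVal,
-                              bsrRowPtr, bsrColInd, blockDim, info,
-                              CUSPARSE_SOLVE_POLICY_USE_LEVEL, pBuffer);
-     cusparseSbsrsv2_solve(handle, dirA, transA, mb, nnzb, &alpha, descrA, bsrVal,
-                           bsrRowPtr, bsrColInd, blockDim, info, f, x,
-                           CUSPARSE_SOLVE_POLICY_USE_LEVEL, pBuffer);
-     cusparseXbsrsv2_zeroPivot(handle, info, &position);
-   where a returned position >= 0 reports a zero pivot.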
*/ -cusparseStatus_t CUSPARSEAPI cusparseXbsrsv2_zeroPivot(cusparseHandle_t handle, - bsrsv2Info_t info, - int *position); - - -cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_bufferSize(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - float *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int blockDim, - bsrsv2Info_t info, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_bufferSize(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - double *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int blockDim, - bsrsv2Info_t info, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_bufferSize(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - cuComplex *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int blockDim, - bsrsv2Info_t info, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_bufferSize(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - cuDoubleComplex *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int blockDim, - bsrsv2Info_t info, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_bufferSizeExt(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - float *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int blockSize, - bsrsv2Info_t info, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_bufferSizeExt(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - double *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int blockSize, - bsrsv2Info_t info, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_bufferSizeExt(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - cuComplex *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int blockSize, - bsrsv2Info_t info, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_bufferSizeExt(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - cuDoubleComplex *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int blockSize, - bsrsv2Info_t info, - size_t *pBufferSize); - - -cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_analysis(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - const float *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int blockDim, - bsrsv2Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_analysis(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - const double 
*bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int blockDim, - bsrsv2Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_analysis(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - const cuComplex *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int blockDim, - bsrsv2Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_analysis(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int blockDim, - bsrsv2Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - - -cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_solve(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - int mb, - int nnzb, - const float *alpha, - const cusparseMatDescr_t descrA, - const float *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int blockDim, - bsrsv2Info_t info, - const float *f, - float *x, - cusparseSolvePolicy_t policy, - void *pBuffer); - - -cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_solve(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - int mb, - int nnzb, - const double *alpha, - const cusparseMatDescr_t descrA, - const double *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int blockDim, - bsrsv2Info_t info, - const double *f, - double *x, - cusparseSolvePolicy_t policy, - void *pBuffer); - - -cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_solve(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - int mb, - int nnzb, - const cuComplex *alpha, - const cusparseMatDescr_t descrA, - const cuComplex *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int blockDim, - bsrsv2Info_t info, - const cuComplex *f, - cuComplex *x, - cusparseSolvePolicy_t policy, - void *pBuffer); - - -cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_solve(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - int mb, - int nnzb, - const cuDoubleComplex *alpha, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int blockDim, - bsrsv2Info_t info, - const cuDoubleComplex *f, - cuDoubleComplex *x, - cusparseSolvePolicy_t policy, - void *pBuffer); - -/* Description: Solution of triangular linear system op(A) * x = alpha * f, - where A is a sparse matrix in HYB storage format, rhs f and solution x - are dense vectors. 
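-   The corresponding _analysis routine must be called first, and the resulting
-   info object passed unchanged to the _solve routine (illustrative note, not
-   in the original header).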
*/ -cusparseStatus_t CUSPARSEAPI cusparseShybsv_analysis(cusparseHandle_t handle, - cusparseOperation_t transA, - const cusparseMatDescr_t descrA, - cusparseHybMat_t hybA, - cusparseSolveAnalysisInfo_t info); - -cusparseStatus_t CUSPARSEAPI cusparseDhybsv_analysis(cusparseHandle_t handle, - cusparseOperation_t transA, - const cusparseMatDescr_t descrA, - cusparseHybMat_t hybA, - cusparseSolveAnalysisInfo_t info); - -cusparseStatus_t CUSPARSEAPI cusparseChybsv_analysis(cusparseHandle_t handle, - cusparseOperation_t transA, - const cusparseMatDescr_t descrA, - cusparseHybMat_t hybA, - cusparseSolveAnalysisInfo_t info); - -cusparseStatus_t CUSPARSEAPI cusparseZhybsv_analysis(cusparseHandle_t handle, - cusparseOperation_t transA, - const cusparseMatDescr_t descrA, - cusparseHybMat_t hybA, - cusparseSolveAnalysisInfo_t info); - -cusparseStatus_t CUSPARSEAPI cusparseShybsv_solve(cusparseHandle_t handle, - cusparseOperation_t trans, - const float *alpha, - const cusparseMatDescr_t descra, - const cusparseHybMat_t hybA, - cusparseSolveAnalysisInfo_t info, - const float *f, - float *x); - -cusparseStatus_t CUSPARSEAPI cusparseChybsv_solve(cusparseHandle_t handle, - cusparseOperation_t trans, - const cuComplex *alpha, - const cusparseMatDescr_t descra, - const cusparseHybMat_t hybA, - cusparseSolveAnalysisInfo_t info, - const cuComplex *f, - cuComplex *x); - -cusparseStatus_t CUSPARSEAPI cusparseDhybsv_solve(cusparseHandle_t handle, - cusparseOperation_t trans, - const double *alpha, - const cusparseMatDescr_t descra, - const cusparseHybMat_t hybA, - cusparseSolveAnalysisInfo_t info, - const double *f, - double *x); - -cusparseStatus_t CUSPARSEAPI cusparseZhybsv_solve(cusparseHandle_t handle, - cusparseOperation_t trans, - const cuDoubleComplex *alpha, - const cusparseMatDescr_t descra, - const cusparseHybMat_t hybA, - cusparseSolveAnalysisInfo_t info, - const cuDoubleComplex *f, - cuDoubleComplex *x); - - -/* --- Sparse Level 3 routines --- */ - -/* Description: sparse - dense matrix multiplication C = alpha * op(A) * B + beta * C, - where A is a sparse matrix in CSR format, B and C are dense tall matrices. 
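-   With op(A) of size m x k, B of size k x n (ldb >= k) and C of size m x n
-   (ldc >= m), an illustrative single-precision call (not in the original
-   header) would be:
-   cusparseScsrmm(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, m, n, k, nnz,
-                  &alpha, descrA, csrValA, csrRowPtrA, csrColIndA, B, ldb,
-                  &beta, C, ldc);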
*/ -cusparseStatus_t CUSPARSEAPI cusparseScsrmm(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int n, - int k, - int nnz, - const float *alpha, - const cusparseMatDescr_t descrA, - const float *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const float *B, - int ldb, - const float *beta, - float *C, - int ldc); - -cusparseStatus_t CUSPARSEAPI cusparseDcsrmm(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int n, - int k, - int nnz, - const double *alpha, - const cusparseMatDescr_t descrA, - const double *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const double *B, - int ldb, - const double *beta, - double *C, - int ldc); - -cusparseStatus_t CUSPARSEAPI cusparseCcsrmm(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int n, - int k, - int nnz, - const cuComplex *alpha, - const cusparseMatDescr_t descrA, - const cuComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cuComplex *B, - int ldb, - const cuComplex *beta, - cuComplex *C, - int ldc); - -cusparseStatus_t CUSPARSEAPI cusparseZcsrmm(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int n, - int k, - int nnz, - const cuDoubleComplex *alpha, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cuDoubleComplex *B, - int ldb, - const cuDoubleComplex *beta, - cuDoubleComplex *C, - int ldc); - -/* Description: sparse - dense matrix multiplication C = alpha * op(A) * B + beta * C, - where A is a sparse matrix in CSR format, B and C are dense tall matrices. - This routine allows transposition of matrix B, which may improve performance. */ -cusparseStatus_t CUSPARSEAPI cusparseScsrmm2(cusparseHandle_t handle, - cusparseOperation_t transA, - cusparseOperation_t transB, - int m, - int n, - int k, - int nnz, - const float *alpha, - const cusparseMatDescr_t descrA, - const float *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const float *B, - int ldb, - const float *beta, - float *C, - int ldc); - -cusparseStatus_t CUSPARSEAPI cusparseDcsrmm2(cusparseHandle_t handle, - cusparseOperation_t transA, - cusparseOperation_t transB, - int m, - int n, - int k, - int nnz, - const double *alpha, - const cusparseMatDescr_t descrA, - const double *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const double *B, - int ldb, - const double *beta, - double *C, - int ldc); - -cusparseStatus_t CUSPARSEAPI cusparseCcsrmm2(cusparseHandle_t handle, - cusparseOperation_t transA, - cusparseOperation_t transB, - int m, - int n, - int k, - int nnz, - const cuComplex *alpha, - const cusparseMatDescr_t descrA, - const cuComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cuComplex *B, - int ldb, - const cuComplex *beta, - cuComplex *C, - int ldc); - -cusparseStatus_t CUSPARSEAPI cusparseZcsrmm2(cusparseHandle_t handle, - cusparseOperation_t transA, - cusparseOperation_t transB, - int m, - int n, - int k, - int nnz, - const cuDoubleComplex *alpha, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cuDoubleComplex *B, - int ldb, - const cuDoubleComplex *beta, - cuDoubleComplex *C, - int ldc); - -/* Description: sparse - dense matrix multiplication C = alpha * op(A) * B + beta * C, - where A is a sparse matrix in 
block-CSR format, B and C are dense tall matrices. - This routine allows transposition of matrix B, which may improve performance. */ -cusparseStatus_t CUSPARSEAPI cusparseSbsrmm(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - cusparseOperation_t transB, - int mb, - int n, - int kb, - int nnzb, - const float *alpha, - const cusparseMatDescr_t descrA, - const float *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - const int blockSize, - const float *B, - const int ldb, - const float *beta, - float *C, - int ldc); - -cusparseStatus_t CUSPARSEAPI cusparseDbsrmm(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - cusparseOperation_t transB, - int mb, - int n, - int kb, - int nnzb, - const double *alpha, - const cusparseMatDescr_t descrA, - const double *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - const int blockSize, - const double *B, - const int ldb, - const double *beta, - double *C, - int ldc); - -cusparseStatus_t CUSPARSEAPI cusparseCbsrmm(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - cusparseOperation_t transB, - int mb, - int n, - int kb, - int nnzb, - const cuComplex *alpha, - const cusparseMatDescr_t descrA, - const cuComplex *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - const int blockSize, - const cuComplex *B, - const int ldb, - const cuComplex *beta, - cuComplex *C, - int ldc); - -cusparseStatus_t CUSPARSEAPI cusparseZbsrmm(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - cusparseOperation_t transB, - int mb, - int n, - int kb, - int nnzb, - const cuDoubleComplex *alpha, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - const int blockSize, - const cuDoubleComplex *B, - const int ldb, - const cuDoubleComplex *beta, - cuDoubleComplex *C, - int ldc); - - -/* Description: dense - sparse matrix multiplication C = alpha * A * B + beta * C, - where A is column-major dense matrix, B is a sparse matrix in CSC format, - and C is column-major dense matrix. 
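A sketch of the gemmi product just described: C = alpha * A * B + beta * C with A (m-by-k) and C (m-by-n) column-major dense and B a k-by-n sparse matrix in CSC form. Device allocation and error handling are assumed to happen elsewhere; the wrapper name is hypothetical:

#include <cusparse.h>

void gemmi_example(cusparseHandle_t handle, int m, int n, int k, int nnz,
                   const float *A, const float *cscValB,
                   const int *cscColPtrB, const int *cscRowIndB, float *C)
{
    /* alpha and beta may be host or device pointers, per the comments below. */
    const float alpha = 1.0f, beta = 0.0f;
    cusparseSgemmi(handle, m, n, k, nnz, &alpha, A, /*lda=*/m,
                   cscValB, cscColPtrB, cscRowIndB, &beta, C, /*ldc=*/m);
}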
*/ -cusparseStatus_t CUSPARSEAPI cusparseSgemmi(cusparseHandle_t handle, - int m, - int n, - int k, - int nnz, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *cscValB, - const int *cscColPtrB, - const int *cscRowIndB, - const float *beta, /* host or device pointer */ - float *C, - int ldc); - -cusparseStatus_t CUSPARSEAPI cusparseDgemmi(cusparseHandle_t handle, - int m, - int n, - int k, - int nnz, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *cscValB, - const int *cscColPtrB, - const int *cscRowIndB, - const double *beta, /* host or device pointer */ - double *C, - int ldc); - -cusparseStatus_t CUSPARSEAPI cusparseCgemmi(cusparseHandle_t handle, - int m, - int n, - int k, - int nnz, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *cscValB, - const int *cscColPtrB, - const int *cscRowIndB, - const cuComplex *beta, /* host or device pointer */ - cuComplex *C, - int ldc); - -cusparseStatus_t CUSPARSEAPI cusparseZgemmi(cusparseHandle_t handle, - int m, - int n, - int k, - int nnz, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *cscValB, - const int *cscColPtrB, - const int *cscRowIndB, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc); - - -/* Description: Solution of triangular linear system op(A) * X = alpha * F, - with multiple right-hand-sides, where A is a sparse matrix in CSR storage - format, rhs F and solution X are dense tall matrices. - This routine implements algorithm 1 for this problem. */ -cusparseStatus_t CUSPARSEAPI cusparseScsrsm_analysis(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int nnz, - const cusparseMatDescr_t descrA, - const float *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseSolveAnalysisInfo_t info); - -cusparseStatus_t CUSPARSEAPI cusparseDcsrsm_analysis(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int nnz, - const cusparseMatDescr_t descrA, - const double *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseSolveAnalysisInfo_t info); - -cusparseStatus_t CUSPARSEAPI cusparseCcsrsm_analysis(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int nnz, - const cusparseMatDescr_t descrA, - const cuComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseSolveAnalysisInfo_t info); - -cusparseStatus_t CUSPARSEAPI cusparseZcsrsm_analysis(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int nnz, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseSolveAnalysisInfo_t info); - - -cusparseStatus_t CUSPARSEAPI cusparseScsrsm_solve(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int n, - const float *alpha, - const cusparseMatDescr_t descrA, - const float *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseSolveAnalysisInfo_t info, - const float *F, - int ldf, - float *X, - int ldx); - -cusparseStatus_t CUSPARSEAPI cusparseDcsrsm_solve(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int n, - const double *alpha, - const cusparseMatDescr_t descrA, - const double *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - 
cusparseSolveAnalysisInfo_t info, - const double *F, - int ldf, - double *X, - int ldx); - -cusparseStatus_t CUSPARSEAPI cusparseCcsrsm_solve(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int n, - const cuComplex *alpha, - const cusparseMatDescr_t descrA, - const cuComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseSolveAnalysisInfo_t info, - const cuComplex *F, - int ldf, - cuComplex *X, - int ldx); - -cusparseStatus_t CUSPARSEAPI cusparseZcsrsm_solve(cusparseHandle_t handle, - cusparseOperation_t transA, - int m, - int n, - const cuDoubleComplex *alpha, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseSolveAnalysisInfo_t info, - const cuDoubleComplex *F, - int ldf, - cuDoubleComplex *X, - int ldx); - -/* Description: Solution of triangular linear system op(A) * X = alpha * F, - with multiple right-hand-sides, where A is a sparse matrix in CSR storage - format, rhs F and solution X are dense tall matrices. - This routine implements algorithm 2 for this problem. */ -cusparseStatus_t CUSPARSEAPI cusparseXbsrsm2_zeroPivot(cusparseHandle_t handle, - bsrsm2Info_t info, - int *position); - -cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_bufferSize(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - cusparseOperation_t transXY, - int mb, - int n, - int nnzb, - const cusparseMatDescr_t descrA, - float *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockSize, - bsrsm2Info_t info, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_bufferSize(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - cusparseOperation_t transXY, - int mb, - int n, - int nnzb, - const cusparseMatDescr_t descrA, - double *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockSize, - bsrsm2Info_t info, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_bufferSize(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - cusparseOperation_t transXY, - int mb, - int n, - int nnzb, - const cusparseMatDescr_t descrA, - cuComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockSize, - bsrsm2Info_t info, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_bufferSize(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - cusparseOperation_t transXY, - int mb, - int n, - int nnzb, - const cusparseMatDescr_t descrA, - cuDoubleComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockSize, - bsrsm2Info_t info, - int *pBufferSizeInBytes); - - -cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_bufferSizeExt(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - cusparseOperation_t transB, - int mb, - int n, - int nnzb, - const cusparseMatDescr_t descrA, - float *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockSize, - bsrsm2Info_t info, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_bufferSizeExt(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - cusparseOperation_t transB, - int mb, - int n, - int nnzb, - const cusparseMatDescr_t descrA, - double *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int 
blockSize, - bsrsm2Info_t info, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_bufferSizeExt(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - cusparseOperation_t transB, - int mb, - int n, - int nnzb, - const cusparseMatDescr_t descrA, - cuComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockSize, - bsrsm2Info_t info, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_bufferSizeExt(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - cusparseOperation_t transB, - int mb, - int n, - int nnzb, - const cusparseMatDescr_t descrA, - cuDoubleComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockSize, - bsrsm2Info_t info, - size_t *pBufferSize); - - -cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_analysis(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - cusparseOperation_t transXY, - int mb, - int n, - int nnzb, - const cusparseMatDescr_t descrA, - const float *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockSize, - bsrsm2Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_analysis(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - cusparseOperation_t transXY, - int mb, - int n, - int nnzb, - const cusparseMatDescr_t descrA, - const double *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockSize, - bsrsm2Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_analysis(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - cusparseOperation_t transXY, - int mb, - int n, - int nnzb, - const cusparseMatDescr_t descrA, - const cuComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockSize, - bsrsm2Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_analysis(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - cusparseOperation_t transXY, - int mb, - int n, - int nnzb, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockSize, - bsrsm2Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - - -cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_solve(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - cusparseOperation_t transXY, - int mb, - int n, - int nnzb, - const float *alpha, - const cusparseMatDescr_t descrA, - const float *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockSize, - bsrsm2Info_t info, - const float *F, - int ldf, - float *X, - int ldx, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_solve(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - cusparseOperation_t transXY, - int mb, - int n, - int nnzb, - const double *alpha, - const cusparseMatDescr_t descrA, - const double *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockSize, - bsrsm2Info_t info, - const double *F, - int ldf, - double *X, - int ldx, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI 
cusparseCbsrsm2_solve(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - cusparseOperation_t transXY, - int mb, - int n, - int nnzb, - const cuComplex *alpha, - const cusparseMatDescr_t descrA, - const cuComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockSize, - bsrsm2Info_t info, - const cuComplex *F, - int ldf, - cuComplex *X, - int ldx, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_solve(cusparseHandle_t handle, - cusparseDirection_t dirA, - cusparseOperation_t transA, - cusparseOperation_t transXY, - int mb, - int n, - int nnzb, - const cuDoubleComplex *alpha, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockSize, - bsrsm2Info_t info, - const cuDoubleComplex *F, - int ldf, - cuDoubleComplex *X, - int ldx, - cusparseSolvePolicy_t policy, - void *pBuffer); - - -/* --- Preconditioners --- */ - -/* Description: Compute the incomplete-LU factorization with 0 fill-in (ILU0) - of the matrix A stored in CSR format based on the information in the opaque - structure info that was obtained from the analysis phase (csrsv_analysis). - This routine implements algorithm 1 for this problem. */ -cusparseStatus_t CUSPARSEAPI cusparseCsrilu0Ex(cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - const cusparseMatDescr_t descrA, - void *csrSortedValA_ValM, - cudaDataType csrSortedValA_ValMtype, - /* matrix A values are updated inplace - to be the preconditioner M values */ - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseSolveAnalysisInfo_t info, - cudaDataType executiontype); - -cusparseStatus_t CUSPARSEAPI cusparseScsrilu0(cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - const cusparseMatDescr_t descrA, - float *csrSortedValA_ValM, - /* matrix A values are updated inplace - to be the preconditioner M values */ - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseSolveAnalysisInfo_t info); - -cusparseStatus_t CUSPARSEAPI cusparseDcsrilu0(cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - const cusparseMatDescr_t descrA, - double *csrSortedValA_ValM, - /* matrix A values are updated inplace - to be the preconditioner M values */ - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseSolveAnalysisInfo_t info); - -cusparseStatus_t CUSPARSEAPI cusparseCcsrilu0(cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - const cusparseMatDescr_t descrA, - cuComplex *csrSortedValA_ValM, - /* matrix A values are updated inplace - to be the preconditioner M values */ - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseSolveAnalysisInfo_t info); - -cusparseStatus_t CUSPARSEAPI cusparseZcsrilu0(cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - const cusparseMatDescr_t descrA, - cuDoubleComplex *csrSortedValA_ValM, - /* matrix A values are updated inplace - to be the preconditioner M values */ - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseSolveAnalysisInfo_t info); - -/* Description: Compute the incomplete-LU factorization with 0 fill-in (ILU0) - of the matrix A stored in CSR format based on the information in the opaque - structure info that was obtained from the analysis phase (csrsv2_analysis). - This routine implements algorithm 2 for this problem. 
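A sketch of the csrilu02 sequence declared below: size the work buffer, run the analysis once, check for a structural zero pivot, then overwrite the CSR values of A with its incomplete-LU factors. cusparseCreateCsrilu02Info is declared earlier in this header; error handling is omitted and the wrapper name is hypothetical:

#include <cusparse.h>
#include <cuda_runtime.h>

void ilu0_csr(cusparseHandle_t handle, cusparseMatDescr_t descrA,
              int m, int nnz, float *csrValA, /* overwritten with L and U */
              const int *csrRowPtrA, const int *csrColIndA)
{
    csrilu02Info_t info;
    int bufSize, pivot;
    void *pBuffer;

    cusparseCreateCsrilu02Info(&info);
    cusparseScsrilu02_bufferSize(handle, m, nnz, descrA, csrValA,
                                 csrRowPtrA, csrColIndA, info, &bufSize);
    cudaMalloc(&pBuffer, bufSize);

    cusparseScsrilu02_analysis(handle, m, nnz, descrA, csrValA, csrRowPtrA,
                               csrColIndA, info,
                               CUSPARSE_SOLVE_POLICY_NO_LEVEL, pBuffer);
    /* Factor only if no diagonal entry is structurally missing. */
    if (cusparseXcsrilu02_zeroPivot(handle, info, &pivot)
            != CUSPARSE_STATUS_ZERO_PIVOT)
        cusparseScsrilu02(handle, m, nnz, descrA, csrValA, csrRowPtrA,
                          csrColIndA, info, CUSPARSE_SOLVE_POLICY_NO_LEVEL,
                          pBuffer);

    cudaFree(pBuffer);
    cusparseDestroyCsrilu02Info(info);
}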
*/ -cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_numericBoost(cusparseHandle_t handle, - csrilu02Info_t info, - int enable_boost, - double *tol, - float *boost_val); - -cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_numericBoost(cusparseHandle_t handle, - csrilu02Info_t info, - int enable_boost, - double *tol, - double *boost_val); - -cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_numericBoost(cusparseHandle_t handle, - csrilu02Info_t info, - int enable_boost, - double *tol, - cuComplex *boost_val); - -cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_numericBoost(cusparseHandle_t handle, - csrilu02Info_t info, - int enable_boost, - double *tol, - cuDoubleComplex *boost_val); - -cusparseStatus_t CUSPARSEAPI cusparseXcsrilu02_zeroPivot(cusparseHandle_t handle, - csrilu02Info_t info, - int *position); - -cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_bufferSize(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - float *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csrilu02Info_t info, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_bufferSize(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - double *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csrilu02Info_t info, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_bufferSize(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - cuComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csrilu02Info_t info, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_bufferSize(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - cuDoubleComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csrilu02Info_t info, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_bufferSizeExt(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - float *csrSortedVal, - const int *csrSortedRowPtr, - const int *csrSortedColInd, - csrilu02Info_t info, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_bufferSizeExt(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - double *csrSortedVal, - const int *csrSortedRowPtr, - const int *csrSortedColInd, - csrilu02Info_t info, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_bufferSizeExt(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - cuComplex *csrSortedVal, - const int *csrSortedRowPtr, - const int *csrSortedColInd, - csrilu02Info_t info, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_bufferSizeExt(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - cuDoubleComplex *csrSortedVal, - const int *csrSortedRowPtr, - const int *csrSortedColInd, - csrilu02Info_t info, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_analysis(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - const float *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csrilu02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_analysis(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - const double *csrSortedValA, - const int *csrSortedRowPtrA, - 
const int *csrSortedColIndA, - csrilu02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_analysis(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - const cuComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csrilu02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_analysis(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csrilu02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseScsrilu02(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - float *csrSortedValA_valM, - /* matrix A values are updated inplace - to be the preconditioner M values */ - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csrilu02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - double *csrSortedValA_valM, - /* matrix A values are updated inplace - to be the preconditioner M values */ - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csrilu02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - cuComplex *csrSortedValA_valM, - /* matrix A values are updated inplace - to be the preconditioner M values */ - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csrilu02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - cuDoubleComplex *csrSortedValA_valM, - /* matrix A values are updated inplace - to be the preconditioner M values */ - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csrilu02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -/* Description: Compute the incomplete-LU factorization with 0 fill-in (ILU0) - of the matrix A stored in block-CSR format based on the information in the opaque - structure info that was obtained from the analysis phase (bsrsv2_analysis). - This routine implements algorithm 2 for this problem. 
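The bsrilu02 routines declared below mirror the csrilu02 flow above on block-CSR storage. One extra control, sketched here under the same assumptions, is numeric boost: during factorization, pivots whose magnitude falls below tol are replaced by boost_val (tol and boost_val are host pointers):

#include <cusparse.h>

void enable_boost(cusparseHandle_t handle, bsrilu02Info_t info)
{
    double tol = 1e-8;        /* pivots with |A(j,j)| < tol get boosted */
    float  boost_val = 1e-4f; /* value substituted for such pivots      */
    cusparseSbsrilu02_numericBoost(handle, info, /*enable_boost=*/1,
                                   &tol, &boost_val);
}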
*/ -cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_numericBoost(cusparseHandle_t handle, - bsrilu02Info_t info, - int enable_boost, - double *tol, - float *boost_val); - -cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_numericBoost(cusparseHandle_t handle, - bsrilu02Info_t info, - int enable_boost, - double *tol, - double *boost_val); - -cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_numericBoost(cusparseHandle_t handle, - bsrilu02Info_t info, - int enable_boost, - double *tol, - cuComplex *boost_val); - -cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_numericBoost(cusparseHandle_t handle, - bsrilu02Info_t info, - int enable_boost, - double *tol, - cuDoubleComplex *boost_val); - -cusparseStatus_t CUSPARSEAPI cusparseXbsrilu02_zeroPivot(cusparseHandle_t handle, - bsrilu02Info_t info, - int *position); - -cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_bufferSize(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - float *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockDim, - bsrilu02Info_t info, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_bufferSize(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - double *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockDim, - bsrilu02Info_t info, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_bufferSize(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - cuComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockDim, - bsrilu02Info_t info, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_bufferSize(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - cuDoubleComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockDim, - bsrilu02Info_t info, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_bufferSizeExt(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - float *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockSize, - bsrilu02Info_t info, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_bufferSizeExt(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - double *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockSize, - bsrilu02Info_t info, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_bufferSizeExt(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - cuComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockSize, - bsrilu02Info_t info, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_bufferSizeExt(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - cuDoubleComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockSize, - bsrilu02Info_t info, - size_t *pBufferSize); - - -cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_analysis(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const 
cusparseMatDescr_t descrA, - float *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockDim, - bsrilu02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_analysis(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - double *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockDim, - bsrilu02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_analysis(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - cuComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockDim, - bsrilu02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_analysis(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - cuDoubleComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockDim, - bsrilu02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - - -cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descra, - float *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockDim, - bsrilu02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descra, - double *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockDim, - bsrilu02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descra, - cuComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockDim, - bsrilu02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descra, - cuDoubleComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockDim, - bsrilu02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -/* Description: Compute the incomplete-Cholesky factorization with 0 fill-in (IC0) - of the matrix A stored in CSR format based on the information in the opaque - structure info that was obtained from the analysis phase (csrsv_analysis). - This routine implements algorithm 1 for this problem. 
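A sketch of the legacy IC0 path just described: the opaque info object comes from the csrsv analysis phase (cusparseScsrsv_analysis, declared earlier in this header), after which csric0 overwrites the CSR values of A with its incomplete-Cholesky factor. Device arrays and the descriptor are assumed to exist; statuses are ignored and the wrapper name is hypothetical:

#include <cusparse.h>

void ic0_csr_legacy(cusparseHandle_t handle, cusparseMatDescr_t descrA,
                    int m, int nnz, float *csrValA_ValM,
                    const int *csrRowPtrA, const int *csrColIndA)
{
    cusparseSolveAnalysisInfo_t info;

    cusparseCreateSolveAnalysisInfo(&info);
    cusparseScsrsv_analysis(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, m, nnz,
                            descrA, csrValA_ValM, csrRowPtrA, csrColIndA,
                            info);
    /* In-place: csrValA_ValM now holds the IC0 factor of A. */
    cusparseScsric0(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, m, descrA,
                    csrValA_ValM, csrRowPtrA, csrColIndA, info);
    cusparseDestroySolveAnalysisInfo(info);
}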
*/ -cusparseStatus_t CUSPARSEAPI cusparseScsric0(cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - const cusparseMatDescr_t descrA, - float *csrSortedValA_ValM, - /* matrix A values are updated inplace - to be the preconditioner M values */ - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseSolveAnalysisInfo_t info); - -cusparseStatus_t CUSPARSEAPI cusparseDcsric0(cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - const cusparseMatDescr_t descrA, - double *csrSortedValA_ValM, - /* matrix A values are updated inplace - to be the preconditioner M values */ - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseSolveAnalysisInfo_t info); - -cusparseStatus_t CUSPARSEAPI cusparseCcsric0(cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - const cusparseMatDescr_t descrA, - cuComplex *csrSortedValA_ValM, - /* matrix A values are updated inplace - to be the preconditioner M values */ - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseSolveAnalysisInfo_t info); - -cusparseStatus_t CUSPARSEAPI cusparseZcsric0(cusparseHandle_t handle, - cusparseOperation_t trans, - int m, - const cusparseMatDescr_t descrA, - cuDoubleComplex *csrSortedValA_ValM, - /* matrix A values are updated inplace - to be the preconditioner M values */ - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseSolveAnalysisInfo_t info); - -/* Description: Compute the incomplete-Cholesky factorization with 0 fill-in (IC0) - of the matrix A stored in CSR format based on the information in the opaque - structure info that was obtained from the analysis phase (csrsv2_analysis). - This routine implements algorithm 2 for this problem. */ -cusparseStatus_t CUSPARSEAPI cusparseXcsric02_zeroPivot(cusparseHandle_t handle, - csric02Info_t info, - int *position); - -cusparseStatus_t CUSPARSEAPI cusparseScsric02_bufferSize(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - float *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csric02Info_t info, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseDcsric02_bufferSize(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - double *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csric02Info_t info, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseCcsric02_bufferSize(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - cuComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csric02Info_t info, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseZcsric02_bufferSize(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - cuDoubleComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csric02Info_t info, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseScsric02_bufferSizeExt(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - float *csrSortedVal, - const int *csrSortedRowPtr, - const int *csrSortedColInd, - csric02Info_t info, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseDcsric02_bufferSizeExt(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - double *csrSortedVal, - const int *csrSortedRowPtr, - const int *csrSortedColInd, - csric02Info_t info, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI 
cusparseCcsric02_bufferSizeExt(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - cuComplex *csrSortedVal, - const int *csrSortedRowPtr, - const int *csrSortedColInd, - csric02Info_t info, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseZcsric02_bufferSizeExt(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - cuDoubleComplex *csrSortedVal, - const int *csrSortedRowPtr, - const int *csrSortedColInd, - csric02Info_t info, - size_t *pBufferSize); - - -cusparseStatus_t CUSPARSEAPI cusparseScsric02_analysis(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - const float *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csric02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - - -cusparseStatus_t CUSPARSEAPI cusparseDcsric02_analysis(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - const double *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csric02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseCcsric02_analysis(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - const cuComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csric02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseZcsric02_analysis(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csric02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseScsric02(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - float *csrSortedValA_valM, - /* matrix A values are updated inplace - to be the preconditioner M values */ - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csric02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDcsric02(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - double *csrSortedValA_valM, - /* matrix A values are updated inplace - to be the preconditioner M values */ - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csric02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseCcsric02(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - cuComplex *csrSortedValA_valM, - /* matrix A values are updated inplace - to be the preconditioner M values */ - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csric02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseZcsric02(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - cuDoubleComplex *csrSortedValA_valM, - /* matrix A values are updated inplace - to be the preconditioner M values */ - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - csric02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -/* Description: Compute the incomplete-Cholesky factorization with 0 fill-in (IC0) - of the matrix A stored in block-CSR format based on the information in the opaque - structure info that was obtained from the analysis phase (bsrsv2_analysis). 
- This routine implements algorithm 1 for this problem. */ -cusparseStatus_t CUSPARSEAPI cusparseXbsric02_zeroPivot(cusparseHandle_t handle, - bsric02Info_t info, - int *position); - -cusparseStatus_t CUSPARSEAPI cusparseSbsric02_bufferSize(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - float *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockDim, - bsric02Info_t info, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseDbsric02_bufferSize(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - double *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockDim, - bsric02Info_t info, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseCbsric02_bufferSize(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - cuComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockDim, - bsric02Info_t info, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseZbsric02_bufferSize(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - cuDoubleComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockDim, - bsric02Info_t info, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseSbsric02_bufferSizeExt(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - float *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockSize, - bsric02Info_t info, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseDbsric02_bufferSizeExt(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - double *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockSize, - bsric02Info_t info, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseCbsric02_bufferSizeExt(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - cuComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockSize, - bsric02Info_t info, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseZbsric02_bufferSizeExt(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - cuDoubleComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockSize, - bsric02Info_t info, - size_t *pBufferSize); - - - -cusparseStatus_t CUSPARSEAPI cusparseSbsric02_analysis(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - const float *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockDim, - bsric02Info_t info, - cusparseSolvePolicy_t policy, - void *pInputBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDbsric02_analysis(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - const double *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockDim, - bsric02Info_t info, - cusparseSolvePolicy_t policy, - void *pInputBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseCbsric02_analysis(cusparseHandle_t handle, - 
cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - const cuComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockDim, - bsric02Info_t info, - cusparseSolvePolicy_t policy, - void *pInputBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseZbsric02_analysis(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockDim, - bsric02Info_t info, - cusparseSolvePolicy_t policy, - void *pInputBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseSbsric02(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - float *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockDim, - bsric02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDbsric02(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - double *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockDim, - bsric02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseCbsric02(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - cuComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockDim, - bsric02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseZbsric02(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nnzb, - const cusparseMatDescr_t descrA, - cuDoubleComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int blockDim, - bsric02Info_t info, - cusparseSolvePolicy_t policy, - void *pBuffer); - - -/* Description: Solution of tridiagonal linear system A * X = F, - with multiple right-hand-sides. The coefficient matrix A is - composed of lower (dl), main (d) and upper (du) diagonals, and - the right-hand-sides F are overwritten with the solution X. - These routines use pivoting.
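A sketch of the gtsv2 variant declared below, which takes an explicit workspace: query the buffer size, then solve in place (the right-hand sides in B are overwritten with X). dl, d and du each hold the m diagonal entries (with dl[0] = du[m-1] = 0) and all arrays are device memory; the wrapper name is hypothetical:

#include <cusparse.h>
#include <cuda_runtime.h>

void tridiag_solve(cusparseHandle_t handle, int m, int n,
                   const float *dl, const float *d, const float *du,
                   float *B, int ldb)
{
    size_t bufSize;
    void *pBuffer;

    cusparseSgtsv2_bufferSizeExt(handle, m, n, dl, d, du, B, ldb, &bufSize);
    cudaMalloc(&pBuffer, bufSize);
    cusparseSgtsv2(handle, m, n, dl, d, du, B, ldb, pBuffer);
    cudaFree(pBuffer);
}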
*/ -cusparseStatus_t CUSPARSEAPI cusparseSgtsv( - cusparseHandle_t handle, - int m, - int n, - const float *dl, - const float *d, - const float *du, - float *B, - int ldb); - -cusparseStatus_t CUSPARSEAPI cusparseDgtsv( - cusparseHandle_t handle, - int m, - int n, - const double *dl, - const double *d, - const double *du, - double *B, - int ldb); - -cusparseStatus_t CUSPARSEAPI cusparseCgtsv( - cusparseHandle_t handle, - int m, - int n, - const cuComplex *dl, - const cuComplex *d, - const cuComplex *du, - cuComplex *B, - int ldb); - -cusparseStatus_t CUSPARSEAPI cusparseZgtsv( - cusparseHandle_t handle, - int m, - int n, - const cuDoubleComplex *dl, - const cuDoubleComplex *d, - const cuDoubleComplex *du, - cuDoubleComplex *B, - int ldb); - - -cusparseStatus_t CUSPARSEAPI cusparseSgtsv2_bufferSizeExt( - cusparseHandle_t handle, - int m, - int n, - const float *dl, - const float *d, - const float *du, - const float *B, - int ldb, - size_t *bufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseDgtsv2_bufferSizeExt( - cusparseHandle_t handle, - int m, - int n, - const double *dl, - const double *d, - const double *du, - const double *B, - int ldb, - size_t *bufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseCgtsv2_bufferSizeExt( - cusparseHandle_t handle, - int m, - int n, - const cuComplex *dl, - const cuComplex *d, - const cuComplex *du, - const cuComplex *B, - int ldb, - size_t *bufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseZgtsv2_bufferSizeExt( - cusparseHandle_t handle, - int m, - int n, - const cuDoubleComplex *dl, - const cuDoubleComplex *d, - const cuDoubleComplex *du, - const cuDoubleComplex *B, - int ldb, - size_t *bufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseSgtsv2( - cusparseHandle_t handle, - int m, - int n, - const float *dl, - const float *d, - const float *du, - float *B, - int ldb, - void* pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDgtsv2( - cusparseHandle_t handle, - int m, - int n, - const double *dl, - const double *d, - const double *du, - double *B, - int ldb, - void* pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseCgtsv2( - cusparseHandle_t handle, - int m, - int n, - const cuComplex *dl, - const cuComplex *d, - const cuComplex *du, - cuComplex *B, - int ldb, - void* pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseZgtsv2( - cusparseHandle_t handle, - int m, - int n, - const cuDoubleComplex *dl, - const cuDoubleComplex *d, - const cuDoubleComplex *du, - cuDoubleComplex *B, - int ldb, - void* pBuffer); - - -/* Description: Solution of tridiagonal linear system A * X = F, - with multiple right-hand-sides. The coefficient matrix A is - composed of lower (dl), main (d) and upper (du) diagonals, and - the right-hand-sides F are overwritten with the solution X. - These routines do not use pivoting.
*/ -cusparseStatus_t CUSPARSEAPI cusparseSgtsv_nopivot( - cusparseHandle_t handle, - int m, - int n, - const float *dl, - const float *d, - const float *du, - float *B, - int ldb); - -cusparseStatus_t CUSPARSEAPI cusparseDgtsv_nopivot( - cusparseHandle_t handle, - int m, - int n, - const double *dl, - const double *d, - const double *du, - double *B, - int ldb); - -cusparseStatus_t CUSPARSEAPI cusparseCgtsv_nopivot( - cusparseHandle_t handle, - int m, - int n, - const cuComplex *dl, - const cuComplex *d, - const cuComplex *du, - cuComplex *B, - int ldb); - -cusparseStatus_t CUSPARSEAPI cusparseZgtsv_nopivot( - cusparseHandle_t handle, - int m, - int n, - const cuDoubleComplex *dl, - const cuDoubleComplex *d, - const cuDoubleComplex *du, - cuDoubleComplex *B, - int ldb); - - -cusparseStatus_t CUSPARSEAPI cusparseSgtsv2_nopivot_bufferSizeExt( - cusparseHandle_t handle, - int m, - int n, - const float *dl, - const float *d, - const float *du, - const float *B, - int ldb, - size_t *bufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseDgtsv2_nopivot_bufferSizeExt( - cusparseHandle_t handle, - int m, - int n, - const double *dl, - const double *d, - const double *du, - const double *B, - int ldb, - size_t *bufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseCgtsv2_nopivot_bufferSizeExt( - cusparseHandle_t handle, - int m, - int n, - const cuComplex *dl, - const cuComplex *d, - const cuComplex *du, - const cuComplex *B, - int ldb, - size_t *bufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseZgtsv2_nopivot_bufferSizeExt( - cusparseHandle_t handle, - int m, - int n, - const cuDoubleComplex *dl, - const cuDoubleComplex *d, - const cuDoubleComplex *du, - const cuDoubleComplex *B, - int ldb, - size_t *bufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseSgtsv2_nopivot( - cusparseHandle_t handle, - int m, - int n, - const float *dl, - const float *d, - const float *du, - float *B, - int ldb, - void* pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDgtsv2_nopivot( - cusparseHandle_t handle, - int m, - int n, - const double *dl, - const double *d, - const double *du, - double *B, - int ldb, - void* pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseCgtsv2_nopivot( - cusparseHandle_t handle, - int m, - int n, - const cuComplex *dl, - const cuComplex *d, - const cuComplex *du, - cuComplex *B, - int ldb, - void* pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseZgtsv2_nopivot( - cusparseHandle_t handle, - int m, - int n, - const cuDoubleComplex *dl, - const cuDoubleComplex *d, - const cuDoubleComplex *du, - cuDoubleComplex *B, - int ldb, - void* pBuffer); - -/* Description: Solution of a set of tridiagonal linear systems - A_{i} * x_{i} = f_{i} for i=1,...,batchCount. The coefficient - matrices A_{i} are composed of lower (dl), main (d) and upper (du) - diagonals and stored separated by a batchStride. Also, the - right-hand-sides/solutions f_{i}/x_{i} are separated by a batchStride. 
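A layout sketch for the batched solver just described: system i reads its diagonals at offset i * batchStride in dl, d and du (and likewise its right-hand side in x), so each array holds batchCount * batchStride device entries with batchStride >= m; the wrapper name is hypothetical:

#include <cusparse.h>
#include <cuda_runtime.h>

void tridiag_batch(cusparseHandle_t handle, int m,
                   const float *dl, const float *d, const float *du,
                   float *x, int batchCount, int batchStride)
{
    size_t bufSize;
    void *pBuffer;

    cusparseSgtsv2StridedBatch_bufferSizeExt(handle, m, dl, d, du, x,
                                             batchCount, batchStride,
                                             &bufSize);
    cudaMalloc(&pBuffer, bufSize);
    /* Solves all batchCount systems; each x_i is overwritten with its solution. */
    cusparseSgtsv2StridedBatch(handle, m, dl, d, du, x,
                               batchCount, batchStride, pBuffer);
    cudaFree(pBuffer);
}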
*/ -cusparseStatus_t CUSPARSEAPI cusparseSgtsvStridedBatch( - cusparseHandle_t handle, - int m, - const float *dl, - const float *d, - const float *du, - float *x, - int batchCount, - int batchStride); - -cusparseStatus_t CUSPARSEAPI cusparseDgtsvStridedBatch( - cusparseHandle_t handle, - int m, - const double *dl, - const double *d, - const double *du, - double *x, - int batchCount, - int batchStride); - -cusparseStatus_t CUSPARSEAPI cusparseCgtsvStridedBatch( - cusparseHandle_t handle, - int m, - const cuComplex *dl, - const cuComplex *d, - const cuComplex *du, - cuComplex *x, - int batchCount, - int batchStride); - -cusparseStatus_t CUSPARSEAPI cusparseZgtsvStridedBatch( - cusparseHandle_t handle, - int m, - const cuDoubleComplex *dl, - const cuDoubleComplex *d, - const cuDoubleComplex *du, - cuDoubleComplex *x, - int batchCount, - int batchStride); - - -cusparseStatus_t CUSPARSEAPI cusparseSgtsv2StridedBatch_bufferSizeExt( - cusparseHandle_t handle, - int m, - const float *dl, - const float *d, - const float *du, - const float *x, - int batchCount, - int batchStride, - size_t *bufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseDgtsv2StridedBatch_bufferSizeExt( - cusparseHandle_t handle, - int m, - const double *dl, - const double *d, - const double *du, - const double *x, - int batchCount, - int batchStride, - size_t *bufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseCgtsv2StridedBatch_bufferSizeExt( - cusparseHandle_t handle, - int m, - const cuComplex *dl, - const cuComplex *d, - const cuComplex *du, - const cuComplex *x, - int batchCount, - int batchStride, - size_t *bufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseZgtsv2StridedBatch_bufferSizeExt( - cusparseHandle_t handle, - int m, - const cuDoubleComplex *dl, - const cuDoubleComplex *d, - const cuDoubleComplex *du, - const cuDoubleComplex *x, - int batchCount, - int batchStride, - size_t *bufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseSgtsv2StridedBatch( - cusparseHandle_t handle, - int m, - const float *dl, - const float *d, - const float *du, - float *x, - int batchCount, - int batchStride, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDgtsv2StridedBatch( - cusparseHandle_t handle, - int m, - const double *dl, - const double *d, - const double *du, - double *x, - int batchCount, - int batchStride, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseCgtsv2StridedBatch( - cusparseHandle_t handle, - int m, - const cuComplex *dl, - const cuComplex *d, - const cuComplex *du, - cuComplex *x, - int batchCount, - int batchStride, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseZgtsv2StridedBatch( - cusparseHandle_t handle, - int m, - const cuDoubleComplex *dl, - const cuDoubleComplex *d, - const cuDoubleComplex *du, - cuDoubleComplex *x, - int batchCount, - int batchStride, - void *pBuffer); - -/* --- Sparse Level 4 routines --- */ - -/* Description: Compute sparse - sparse matrix multiplication for matrices - stored in CSR format. 
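A sketch of the two-phase SpGEMM pattern declared below: XcsrgemmNnz first fills C's row pointer and returns nnz(C), after which C's value and column arrays can be sized and filled. It assumes the handle is in host pointer mode (so nnzC lands in host memory) and that the descriptors and A/B arrays already live on the device; in real code the three C arrays would be handed back to the caller. The wrapper name is hypothetical:

#include <cusparse.h>
#include <cuda_runtime.h>

void spgemm_csr(cusparseHandle_t handle, int m, int n, int k,
                const cusparseMatDescr_t descrA, int nnzA, const float *valA,
                const int *rowPtrA, const int *colIndA,
                const cusparseMatDescr_t descrB, int nnzB, const float *valB,
                const int *rowPtrB, const int *colIndB,
                const cusparseMatDescr_t descrC)
{
    const cusparseOperation_t op = CUSPARSE_OPERATION_NON_TRANSPOSE;
    int *rowPtrC, *colIndC, nnzC;
    float *valC;

    /* Phase 1: compute the structure of C only. */
    cudaMalloc((void **)&rowPtrC, sizeof(int) * (m + 1));
    cusparseXcsrgemmNnz(handle, op, op, m, n, k,
                        descrA, nnzA, rowPtrA, colIndA,
                        descrB, nnzB, rowPtrB, colIndB,
                        descrC, rowPtrC, &nnzC);

    /* Phase 2: allocate and compute the values of C = A * B. */
    cudaMalloc((void **)&colIndC, sizeof(int) * nnzC);
    cudaMalloc((void **)&valC, sizeof(float) * nnzC);
    cusparseScsrgemm(handle, op, op, m, n, k,
                     descrA, nnzA, valA, rowPtrA, colIndA,
                     descrB, nnzB, valB, rowPtrB, colIndB,
                     descrC, valC, rowPtrC, colIndC);
}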
*/ -cusparseStatus_t CUSPARSEAPI cusparseXcsrgemmNnz(cusparseHandle_t handle, - cusparseOperation_t transA, - cusparseOperation_t transB, - int m, - int n, - int k, - const cusparseMatDescr_t descrA, - const int nnzA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cusparseMatDescr_t descrB, - const int nnzB, - const int *csrSortedRowPtrB, - const int *csrSortedColIndB, - const cusparseMatDescr_t descrC, - int *csrSortedRowPtrC, - int *nnzTotalDevHostPtr); - -cusparseStatus_t CUSPARSEAPI cusparseScsrgemm(cusparseHandle_t handle, - cusparseOperation_t transA, - cusparseOperation_t transB, - int m, - int n, - int k, - const cusparseMatDescr_t descrA, - const int nnzA, - const float *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cusparseMatDescr_t descrB, - const int nnzB, - const float *csrSortedValB, - const int *csrSortedRowPtrB, - const int *csrSortedColIndB, - const cusparseMatDescr_t descrC, - float *csrSortedValC, - const int *csrSortedRowPtrC, - int *csrSortedColIndC); - -cusparseStatus_t CUSPARSEAPI cusparseDcsrgemm(cusparseHandle_t handle, - cusparseOperation_t transA, - cusparseOperation_t transB, - int m, - int n, - int k, - const cusparseMatDescr_t descrA, - int nnzA, - const double *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cusparseMatDescr_t descrB, - int nnzB, - const double *csrSortedValB, - const int *csrSortedRowPtrB, - const int *csrSortedColIndB, - const cusparseMatDescr_t descrC, - double *csrSortedValC, - const int *csrSortedRowPtrC, - int *csrSortedColIndC); - -cusparseStatus_t CUSPARSEAPI cusparseCcsrgemm(cusparseHandle_t handle, - cusparseOperation_t transA, - cusparseOperation_t transB, - int m, - int n, - int k, - const cusparseMatDescr_t descrA, - int nnzA, - const cuComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cusparseMatDescr_t descrB, - int nnzB, - const cuComplex *csrSortedValB, - const int *csrSortedRowPtrB, - const int *csrSortedColIndB, - const cusparseMatDescr_t descrC, - cuComplex *csrSortedValC, - const int *csrSortedRowPtrC, - int *csrSortedColIndC); - -cusparseStatus_t CUSPARSEAPI cusparseZcsrgemm(cusparseHandle_t handle, - cusparseOperation_t transA, - cusparseOperation_t transB, - int m, - int n, - int k, - const cusparseMatDescr_t descrA, - int nnzA, - const cuDoubleComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cusparseMatDescr_t descrB, - int nnzB, - const cuDoubleComplex *csrSortedValB, - const int *csrSortedRowPtrB, - const int *csrSortedColIndB, - const cusparseMatDescr_t descrC, - cuDoubleComplex *csrSortedValC, - const int *csrSortedRowPtrC, - int *csrSortedColIndC); - -/* Description: Compute sparse - sparse matrix multiplication for matrices - stored in CSR format. 
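csrgemm2, declared below, generalizes this to C = alpha * A * B + beta * D and threads an explicit info object and work buffer through the sizing, structure, and compute steps. A condensed sketch of the first two steps, assuming host pointer mode and caller-owned device arrays (allocating C's value/column arrays and making the final cusparseScsrgemm2 call with the same info and pBuffer then proceeds as in the csrgemm example above):

#include <cusparse.h>
#include <cuda_runtime.h>

void spgemm2_setup(cusparseHandle_t handle, int m, int n, int k,
                   const float *alpha, const float *beta,
                   const cusparseMatDescr_t descrA, int nnzA,
                   const int *rowPtrA, const int *colIndA,
                   const cusparseMatDescr_t descrB, int nnzB,
                   const int *rowPtrB, const int *colIndB,
                   const cusparseMatDescr_t descrD, int nnzD,
                   const int *rowPtrD, const int *colIndD,
                   const cusparseMatDescr_t descrC, int *rowPtrC /* m+1 */)
{
    csrgemm2Info_t info;
    size_t bufSize;
    void *pBuffer;
    int nnzC;

    /* Step 1: create the info object and size the work buffer. */
    cusparseCreateCsrgemm2Info(&info);
    cusparseScsrgemm2_bufferSizeExt(handle, m, n, k, alpha,
                                    descrA, nnzA, rowPtrA, colIndA,
                                    descrB, nnzB, rowPtrB, colIndB,
                                    beta, descrD, nnzD, rowPtrD, colIndD,
                                    info, &bufSize);
    cudaMalloc(&pBuffer, bufSize);

    /* Step 2: compute C's row pointer and nnz. */
    cusparseXcsrgemm2Nnz(handle, m, n, k,
                         descrA, nnzA, rowPtrA, colIndA,
                         descrB, nnzB, rowPtrB, colIndB,
                         descrD, nnzD, rowPtrD, colIndD,
                         descrC, rowPtrC, &nnzC, info, pBuffer);

    /* Next: allocate valC/colIndC of size nnzC, call cusparseScsrgemm2,
       then free pBuffer and destroy the info object. */
}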
*/ - -cusparseStatus_t CUSPARSEAPI cusparseCreateCsrgemm2Info(csrgemm2Info_t *info); - -cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrgemm2Info(csrgemm2Info_t info); - -cusparseStatus_t CUSPARSEAPI cusparseScsrgemm2_bufferSizeExt(cusparseHandle_t handle, - int m, - int n, - int k, - const float *alpha, - const cusparseMatDescr_t descrA, - int nnzA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cusparseMatDescr_t descrB, - int nnzB, - const int *csrSortedRowPtrB, - const int *csrSortedColIndB, - const float *beta, - const cusparseMatDescr_t descrD, - int nnzD, - const int *csrSortedRowPtrD, - const int *csrSortedColIndD, - csrgemm2Info_t info, - size_t *pBufferSizeInBytes ); - -cusparseStatus_t CUSPARSEAPI cusparseDcsrgemm2_bufferSizeExt(cusparseHandle_t handle, - int m, - int n, - int k, - const double *alpha, - const cusparseMatDescr_t descrA, - int nnzA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cusparseMatDescr_t descrB, - int nnzB, - const int *csrSortedRowPtrB, - const int *csrSortedColIndB, - const double *beta, - const cusparseMatDescr_t descrD, - int nnzD, - const int *csrSortedRowPtrD, - const int *csrSortedColIndD, - csrgemm2Info_t info, - size_t *pBufferSizeInBytes ); - -cusparseStatus_t CUSPARSEAPI cusparseCcsrgemm2_bufferSizeExt(cusparseHandle_t handle, - int m, - int n, - int k, - const cuComplex *alpha, - const cusparseMatDescr_t descrA, - int nnzA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cusparseMatDescr_t descrB, - int nnzB, - const int *csrSortedRowPtrB, - const int *csrSortedColIndB, - const cuComplex *beta, - const cusparseMatDescr_t descrD, - int nnzD, - const int *csrSortedRowPtrD, - const int *csrSortedColIndD, - csrgemm2Info_t info, - size_t *pBufferSizeInBytes ); - -cusparseStatus_t CUSPARSEAPI cusparseZcsrgemm2_bufferSizeExt(cusparseHandle_t handle, - int m, - int n, - int k, - const cuDoubleComplex *alpha, - const cusparseMatDescr_t descrA, - int nnzA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cusparseMatDescr_t descrB, - int nnzB, - const int *csrSortedRowPtrB, - const int *csrSortedColIndB, - const cuDoubleComplex *beta, - const cusparseMatDescr_t descrD, - int nnzD, - const int *csrSortedRowPtrD, - const int *csrSortedColIndD, - csrgemm2Info_t info, - size_t *pBufferSizeInBytes ); - - -cusparseStatus_t CUSPARSEAPI cusparseXcsrgemm2Nnz(cusparseHandle_t handle, - int m, - int n, - int k, - const cusparseMatDescr_t descrA, - int nnzA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cusparseMatDescr_t descrB, - int nnzB, - const int *csrSortedRowPtrB, - const int *csrSortedColIndB, - const cusparseMatDescr_t descrD, - int nnzD, - const int *csrSortedRowPtrD, - const int *csrSortedColIndD, - const cusparseMatDescr_t descrC, - int *csrSortedRowPtrC, - int *nnzTotalDevHostPtr, - const csrgemm2Info_t info, - void *pBuffer ); - - -cusparseStatus_t CUSPARSEAPI cusparseScsrgemm2(cusparseHandle_t handle, - int m, - int n, - int k, - const float *alpha, - const cusparseMatDescr_t descrA, - int nnzA, - const float *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cusparseMatDescr_t descrB, - int nnzB, - const float *csrSortedValB, - const int *csrSortedRowPtrB, - const int *csrSortedColIndB, - const float *beta, - const cusparseMatDescr_t descrD, - int nnzD, - const float *csrSortedValD, - const int *csrSortedRowPtrD, - const int *csrSortedColIndD, - const cusparseMatDescr_t descrC, - float *csrSortedValC, - 
const int *csrSortedRowPtrC, - int *csrSortedColIndC, - const csrgemm2Info_t info, - void *pBuffer ); - -cusparseStatus_t CUSPARSEAPI cusparseDcsrgemm2(cusparseHandle_t handle, - int m, - int n, - int k, - const double *alpha, - const cusparseMatDescr_t descrA, - int nnzA, - const double *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cusparseMatDescr_t descrB, - int nnzB, - const double *csrSortedValB, - const int *csrSortedRowPtrB, - const int *csrSortedColIndB, - const double *beta, - const cusparseMatDescr_t descrD, - int nnzD, - const double *csrSortedValD, - const int *csrSortedRowPtrD, - const int *csrSortedColIndD, - const cusparseMatDescr_t descrC, - double *csrSortedValC, - const int *csrSortedRowPtrC, - int *csrSortedColIndC, - const csrgemm2Info_t info, - void *pBuffer ); - - -cusparseStatus_t CUSPARSEAPI cusparseCcsrgemm2(cusparseHandle_t handle, - int m, - int n, - int k, - const cuComplex *alpha, - const cusparseMatDescr_t descrA, - int nnzA, - const cuComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cusparseMatDescr_t descrB, - int nnzB, - const cuComplex *csrSortedValB, - const int *csrSortedRowPtrB, - const int *csrSortedColIndB, - const cuComplex *beta, - const cusparseMatDescr_t descrD, - int nnzD, - const cuComplex *csrSortedValD, - const int *csrSortedRowPtrD, - const int *csrSortedColIndD, - const cusparseMatDescr_t descrC, - cuComplex *csrSortedValC, - const int *csrSortedRowPtrC, - int *csrSortedColIndC, - const csrgemm2Info_t info, - void *pBuffer ); - - -cusparseStatus_t CUSPARSEAPI cusparseZcsrgemm2(cusparseHandle_t handle, - int m, - int n, - int k, - const cuDoubleComplex *alpha, - const cusparseMatDescr_t descrA, - int nnzA, - const cuDoubleComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cusparseMatDescr_t descrB, - int nnzB, - const cuDoubleComplex *csrSortedValB, - const int *csrSortedRowPtrB, - const int *csrSortedColIndB, - const cuDoubleComplex *beta, - const cusparseMatDescr_t descrD, - int nnzD, - const cuDoubleComplex *csrSortedValD, - const int *csrSortedRowPtrD, - const int *csrSortedColIndD, - const cusparseMatDescr_t descrC, - cuDoubleComplex *csrSortedValC, - const int *csrSortedRowPtrC, - int *csrSortedColIndC, - const csrgemm2Info_t info, - void *pBuffer ); - - -/* Description: Compute sparse - sparse matrix addition of matrices - stored in CSR format */ -cusparseStatus_t CUSPARSEAPI cusparseXcsrgeamNnz(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - int nnzA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cusparseMatDescr_t descrB, - int nnzB, - const int *csrSortedRowPtrB, - const int *csrSortedColIndB, - const cusparseMatDescr_t descrC, - int *csrSortedRowPtrC, - int *nnzTotalDevHostPtr); - -cusparseStatus_t CUSPARSEAPI cusparseScsrgeam(cusparseHandle_t handle, - int m, - int n, - const float *alpha, - const cusparseMatDescr_t descrA, - int nnzA, - const float *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const float *beta, - const cusparseMatDescr_t descrB, - int nnzB, - const float *csrSortedValB, - const int *csrSortedRowPtrB, - const int *csrSortedColIndB, - const cusparseMatDescr_t descrC, - float *csrSortedValC, - int *csrSortedRowPtrC, - int *csrSortedColIndC); - -cusparseStatus_t CUSPARSEAPI cusparseDcsrgeam(cusparseHandle_t handle, - int m, - int n, - const double *alpha, - const cusparseMatDescr_t descrA, - int nnzA, 
- const double *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const double *beta, - const cusparseMatDescr_t descrB, - int nnzB, - const double *csrSortedValB, - const int *csrSortedRowPtrB, - const int *csrSortedColIndB, - const cusparseMatDescr_t descrC, - double *csrSortedValC, - int *csrSortedRowPtrC, - int *csrSortedColIndC); - -cusparseStatus_t CUSPARSEAPI cusparseCcsrgeam(cusparseHandle_t handle, - int m, - int n, - const cuComplex *alpha, - const cusparseMatDescr_t descrA, - int nnzA, - const cuComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cuComplex *beta, - const cusparseMatDescr_t descrB, - int nnzB, - const cuComplex *csrSortedValB, - const int *csrSortedRowPtrB, - const int *csrSortedColIndB, - const cusparseMatDescr_t descrC, - cuComplex *csrSortedValC, - int *csrSortedRowPtrC, - int *csrSortedColIndC); - -cusparseStatus_t CUSPARSEAPI cusparseZcsrgeam(cusparseHandle_t handle, - int m, - int n, - const cuDoubleComplex *alpha, - const cusparseMatDescr_t descrA, - int nnzA, - const cuDoubleComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cuDoubleComplex *beta, - const cusparseMatDescr_t descrB, - int nnzB, - const cuDoubleComplex *csrSortedValB, - const int *csrSortedRowPtrB, - const int *csrSortedColIndB, - const cusparseMatDescr_t descrC, - cuDoubleComplex *csrSortedValC, - int *csrSortedRowPtrC, - int *csrSortedColIndC); - - -/* --- Sparse Matrix Reorderings --- */ - -/* Description: Find an approximate coloring of a matrix stored in CSR format. */ -cusparseStatus_t CUSPARSEAPI cusparseScsrcolor(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - const float *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const float *fractionToColor, - int *ncolors, - int *coloring, - int *reordering, - const cusparseColorInfo_t info); - -cusparseStatus_t CUSPARSEAPI cusparseDcsrcolor(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - const double *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const double *fractionToColor, - int *ncolors, - int *coloring, - int *reordering, - const cusparseColorInfo_t info); - -cusparseStatus_t CUSPARSEAPI cusparseCcsrcolor(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - const cuComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const float *fractionToColor, - int *ncolors, - int *coloring, - int *reordering, - const cusparseColorInfo_t info); - -cusparseStatus_t CUSPARSEAPI cusparseZcsrcolor(cusparseHandle_t handle, - int m, - int nnz, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const double *fractionToColor, - int *ncolors, - int *coloring, - int *reordering, - const cusparseColorInfo_t info); - -/* --- Sparse Format Conversion --- */ - -/* Description: This routine finds the total number of non-zero elements and - the number of non-zero elements per row or column in the dense matrix A. 
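A sketch of the counting step this comment describes, using the row-wise direction; the handle, descrA, and the dense column-major device matrix d_A (leading dimension lda >= m) are assumptions for illustration:

    int nnzTotal = 0;
    int *d_nnzPerRow;
    cudaMalloc((void **)&d_nnzPerRow, m * sizeof(int));
    /* count nonzeros per row of the dense m-by-n matrix A, plus the total */
    cusparseSnnz(handle, CUSPARSE_DIRECTION_ROW, m, n, descrA,
                 d_A, lda, d_nnzPerRow, &nnzTotal);

The per-row counts and the total feed directly into the dense2csr conversion declared further below.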
*/ -cusparseStatus_t CUSPARSEAPI cusparseSnnz(cusparseHandle_t handle, - cusparseDirection_t dirA, - int m, - int n, - const cusparseMatDescr_t descrA, - const float *A, - int lda, - int *nnzPerRowCol, - int *nnzTotalDevHostPtr); - -cusparseStatus_t CUSPARSEAPI cusparseDnnz(cusparseHandle_t handle, - cusparseDirection_t dirA, - int m, - int n, - const cusparseMatDescr_t descrA, - const double *A, - int lda, - int *nnzPerRowCol, - int *nnzTotalDevHostPtr); - -cusparseStatus_t CUSPARSEAPI cusparseCnnz(cusparseHandle_t handle, - cusparseDirection_t dirA, - int m, - int n, - const cusparseMatDescr_t descrA, - const cuComplex *A, - int lda, - int *nnzPerRowCol, - int *nnzTotalDevHostPtr); - -cusparseStatus_t CUSPARSEAPI cusparseZnnz(cusparseHandle_t handle, - cusparseDirection_t dirA, - int m, - int n, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *A, - int lda, - int *nnzPerRowCol, - int *nnzTotalDevHostPtr); - -/* --- Sparse Format Conversion --- */ - -/* Description: This routine finds the total number of non-zero elements and - the number of non-zero elements per row in a noncompressed csr matrix A. */ -cusparseStatus_t CUSPARSEAPI cusparseSnnz_compress(cusparseHandle_t handle, - int m, - const cusparseMatDescr_t descr, - const float *values, - const int *rowPtr, - int *nnzPerRow, - int *nnzTotal, - float tol); - -cusparseStatus_t CUSPARSEAPI cusparseDnnz_compress(cusparseHandle_t handle, - int m, - const cusparseMatDescr_t descr, - const double *values, - const int *rowPtr, - int *nnzPerRow, - int *nnzTotal, - double tol); - -cusparseStatus_t CUSPARSEAPI cusparseCnnz_compress(cusparseHandle_t handle, - int m, - const cusparseMatDescr_t descr, - const cuComplex *values, - const int *rowPtr, - int *nnzPerRow, - int *nnzTotal, - cuComplex tol); - -cusparseStatus_t CUSPARSEAPI cusparseZnnz_compress(cusparseHandle_t handle, - int m, - const cusparseMatDescr_t descr, - const cuDoubleComplex *values, - const int *rowPtr, - int *nnzPerRow, - int *nnzTotal, - cuDoubleComplex tol); -/* Description: This routine takes as input a csr form where the values may have 0 elements - and compresses it to return a csr form with no zeros. 
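A sketch of the compression pipeline these two routines form: count the entries that survive the tolerance, allocate the compressed arrays, then drop the near-zeros. The input CSR triple (d_csrValA, d_csrRowPtrA, d_csrColIndA) with nnzA entries is an assumed starting point:

    const float tol = 1e-6f;
    int nnzC = 0;
    int *d_nnzPerRow;
    cudaMalloc((void **)&d_nnzPerRow, m * sizeof(int));
    cusparseSnnz_compress(handle, m, descrA, d_csrValA, d_csrRowPtrA,
                          d_nnzPerRow, &nnzC, tol);
    float *d_csrValC;
    int *d_csrColIndC, *d_csrRowPtrC;
    cudaMalloc((void **)&d_csrValC, nnzC * sizeof(float));
    cudaMalloc((void **)&d_csrColIndC, nnzC * sizeof(int));
    cudaMalloc((void **)&d_csrRowPtrC, (m + 1) * sizeof(int));
    /* note the argument order: values, column indices, then row pointer */
    cusparseScsr2csr_compress(handle, m, n, descrA,
                              d_csrValA, d_csrColIndA, d_csrRowPtrA, nnzA,
                              d_nnzPerRow,
                              d_csrValC, d_csrColIndC, d_csrRowPtrC, tol);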
*/ - -cusparseStatus_t CUSPARSEAPI cusparseScsr2csr_compress(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descra, - const float *inVal, - const int *inColInd, - const int * inRowPtr, - int inNnz, - int *nnzPerRow, - float *outVal, - int *outColInd, - int *outRowPtr, - float tol); - -cusparseStatus_t CUSPARSEAPI cusparseDcsr2csr_compress(cusparseHandle_t handle, - int m, //number of rows - int n, - const cusparseMatDescr_t descra, - const double *inVal, //csr values array-the elements which are below a certain tolerance will be remvoed - const int *inColInd, - const int * inRowPtr, //corresponding input noncompressed row pointer - int inNnz, - int *nnzPerRow, //output: returns number of nonzeros per row - double *outVal, - int *outColInd, - int *outRowPtr, - double tol); - -cusparseStatus_t CUSPARSEAPI cusparseCcsr2csr_compress(cusparseHandle_t handle, - int m, //number of rows - int n, - const cusparseMatDescr_t descra, - const cuComplex *inVal, //csr values array-the elements which are below a certain tolerance will be remvoed - const int *inColInd, - const int * inRowPtr, //corresponding input noncompressed row pointer - int inNnz, - int *nnzPerRow, //output: returns number of nonzeros per row - cuComplex *outVal, - int *outColInd, - int *outRowPtr, - cuComplex tol); - -cusparseStatus_t CUSPARSEAPI cusparseZcsr2csr_compress(cusparseHandle_t handle, - int m, //number of rows - int n, - const cusparseMatDescr_t descra, - const cuDoubleComplex *inVal, //csr values array-the elements which are below a certain tolerance will be remvoed - const int *inColInd, - const int * inRowPtr, //corresponding input noncompressed row pointer - int inNnz, - int *nnzPerRow, //output: returns number of nonzeros per row - cuDoubleComplex *outVal, - int *outColInd, - int *outRowPtr, - cuDoubleComplex tol); - -/* Description: This routine converts a dense matrix to a sparse matrix - in the CSR storage format, using the information computed by the - nnz routine. */ -cusparseStatus_t CUSPARSEAPI cusparseSdense2csr(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const float *A, - int lda, - const int *nnzPerRow, - float *csrSortedValA, - int *csrSortedRowPtrA, - int *csrSortedColIndA); - -cusparseStatus_t CUSPARSEAPI cusparseDdense2csr(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const double *A, - int lda, - const int *nnzPerRow, - double *csrSortedValA, - int *csrSortedRowPtrA, - int *csrSortedColIndA); - -cusparseStatus_t CUSPARSEAPI cusparseCdense2csr(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const cuComplex *A, - int lda, - const int *nnzPerRow, - cuComplex *csrSortedValA, - int *csrSortedRowPtrA, - int *csrSortedColIndA); - -cusparseStatus_t CUSPARSEAPI cusparseZdense2csr(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *A, - int lda, - const int *nnzPerRow, - cuDoubleComplex *csrSortedValA, - int *csrSortedRowPtrA, - int *csrSortedColIndA); - -/* Description: This routine converts a sparse matrix in CSR storage format - to a dense matrix. 
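A one-call sketch of this conversion; d_A is a caller-allocated column-major m-by-n device array with leading dimension lda >= m, and the CSR triple is assumed to exist:

    float *d_A;
    cudaMalloc((void **)&d_A, (size_t)lda * n * sizeof(float));
    cusparseScsr2dense(handle, m, n, descrA,
                       d_csrValA, d_csrRowPtrA, d_csrColIndA,
                       d_A, lda);

The dense2csr declarations above go the other way, consuming the per-row counts produced by the nnz routine.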
*/ -cusparseStatus_t CUSPARSEAPI cusparseScsr2dense(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const float *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - float *A, - int lda); - -cusparseStatus_t CUSPARSEAPI cusparseDcsr2dense(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const double *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - double *A, - int lda); - -cusparseStatus_t CUSPARSEAPI cusparseCcsr2dense(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const cuComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cuComplex *A, - int lda); - -cusparseStatus_t CUSPARSEAPI cusparseZcsr2dense(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cuDoubleComplex *A, - int lda); - -/* Description: This routine converts a dense matrix to a sparse matrix - in the CSC storage format, using the information computed by the - nnz routine. */ -cusparseStatus_t CUSPARSEAPI cusparseSdense2csc(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const float *A, - int lda, - const int *nnzPerCol, - float *cscSortedValA, - int *cscSortedRowIndA, - int *cscSortedColPtrA); - -cusparseStatus_t CUSPARSEAPI cusparseDdense2csc(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const double *A, - int lda, - const int *nnzPerCol, - double *cscSortedValA, - int *cscSortedRowIndA, - int *cscSortedColPtrA); - -cusparseStatus_t CUSPARSEAPI cusparseCdense2csc(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const cuComplex *A, - int lda, - const int *nnzPerCol, - cuComplex *cscSortedValA, - int *cscSortedRowIndA, - int *cscSortedColPtrA); - -cusparseStatus_t CUSPARSEAPI cusparseZdense2csc(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *A, - int lda, - const int *nnzPerCol, - cuDoubleComplex *cscSortedValA, - int *cscSortedRowIndA, - int *cscSortedColPtrA); - -/* Description: This routine converts a sparse matrix in CSC storage format - to a dense matrix. */ -cusparseStatus_t CUSPARSEAPI cusparseScsc2dense(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const float *cscSortedValA, - const int *cscSortedRowIndA, - const int *cscSortedColPtrA, - float *A, - int lda); - -cusparseStatus_t CUSPARSEAPI cusparseDcsc2dense(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const double *cscSortedValA, - const int *cscSortedRowIndA, - const int *cscSortedColPtrA, - double *A, - int lda); - -cusparseStatus_t CUSPARSEAPI cusparseCcsc2dense(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const cuComplex *cscSortedValA, - const int *cscSortedRowIndA, - const int *cscSortedColPtrA, - cuComplex *A, - int lda); - -cusparseStatus_t CUSPARSEAPI cusparseZcsc2dense(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *cscSortedValA, - const int *cscSortedRowIndA, - const int *cscSortedColPtrA, - cuDoubleComplex *A, - int lda); - -/* Description: This routine compresses the indecis of rows or columns. - It can be interpreted as a conversion from COO to CSR sparse storage - format. 
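Both directions here are index-only transformations on the row indices; the column indices and values are shared between the two formats. A sketch assuming a row-sorted COO matrix with nnz entries on the device:

    int *d_csrRowPtr;
    cudaMalloc((void **)&d_csrRowPtr, (m + 1) * sizeof(int));
    /* compress sorted COO row indices into a CSR row-pointer array */
    cusparseXcoo2csr(handle, d_cooRowInd, nnz, m, d_csrRowPtr,
                     CUSPARSE_INDEX_BASE_ZERO);
    /* and back: expand the row pointer into one row index per entry */
    cusparseXcsr2coo(handle, d_csrRowPtr, nnz, m, d_cooRowInd,
                     CUSPARSE_INDEX_BASE_ZERO);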
*/ -cusparseStatus_t CUSPARSEAPI cusparseXcoo2csr(cusparseHandle_t handle, - const int *cooRowInd, - int nnz, - int m, - int *csrSortedRowPtr, - cusparseIndexBase_t idxBase); - -/* Description: This routine uncompresses the indecis of rows or columns. - It can be interpreted as a conversion from CSR to COO sparse storage - format. */ -cusparseStatus_t CUSPARSEAPI cusparseXcsr2coo(cusparseHandle_t handle, - const int *csrSortedRowPtr, - int nnz, - int m, - int *cooRowInd, - cusparseIndexBase_t idxBase); - -/* Description: This routine converts a matrix from CSR to CSC sparse - storage format. The resulting matrix can be re-interpreted as a - transpose of the original matrix in CSR storage format. */ -cusparseStatus_t CUSPARSEAPI cusparseCsr2cscEx(cusparseHandle_t handle, - int m, - int n, - int nnz, - const void *csrSortedVal, - cudaDataType csrSortedValtype, - const int *csrSortedRowPtr, - const int *csrSortedColInd, - void *cscSortedVal, - cudaDataType cscSortedValtype, - int *cscSortedRowInd, - int *cscSortedColPtr, - cusparseAction_t copyValues, - cusparseIndexBase_t idxBase, - cudaDataType executiontype); - -cusparseStatus_t CUSPARSEAPI cusparseScsr2csc(cusparseHandle_t handle, - int m, - int n, - int nnz, - const float *csrSortedVal, - const int *csrSortedRowPtr, - const int *csrSortedColInd, - float *cscSortedVal, - int *cscSortedRowInd, - int *cscSortedColPtr, - cusparseAction_t copyValues, - cusparseIndexBase_t idxBase); - -cusparseStatus_t CUSPARSEAPI cusparseDcsr2csc(cusparseHandle_t handle, - int m, - int n, - int nnz, - const double *csrSortedVal, - const int *csrSortedRowPtr, - const int *csrSortedColInd, - double *cscSortedVal, - int *cscSortedRowInd, - int *cscSortedColPtr, - cusparseAction_t copyValues, - cusparseIndexBase_t idxBase); - -cusparseStatus_t CUSPARSEAPI cusparseCcsr2csc(cusparseHandle_t handle, - int m, - int n, - int nnz, - const cuComplex *csrSortedVal, - const int *csrSortedRowPtr, - const int *csrSortedColInd, - cuComplex *cscSortedVal, - int *cscSortedRowInd, - int *cscSortedColPtr, - cusparseAction_t copyValues, - cusparseIndexBase_t idxBase); - -cusparseStatus_t CUSPARSEAPI cusparseZcsr2csc(cusparseHandle_t handle, - int m, - int n, - int nnz, - const cuDoubleComplex *csrSortedVal, - const int *csrSortedRowPtr, - const int *csrSortedColInd, - cuDoubleComplex *cscSortedVal, - int *cscSortedRowInd, - int *cscSortedColPtr, - cusparseAction_t copyValues, - cusparseIndexBase_t idxBase); - -/* Description: This routine converts a dense matrix to a sparse matrix - in HYB storage format. 
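HYB is an opaque ELL+COO hybrid, so this conversion writes into a pre-created cusparseHybMat_t rather than raw output arrays. A sketch, assuming the hyb-matrix create/destroy helpers from the opaque-object section of this header and per-row counts already obtained from cusparseSnnz:

    cusparseHybMat_t hybA;
    cusparseCreateHybMat(&hybA);
    /* with the AUTO partitioning heuristic, userEllWidth is ignored */
    cusparseSdense2hyb(handle, m, n, descrA, d_A, lda, d_nnzPerRow,
                       hybA, 0, CUSPARSE_HYB_PARTITION_AUTO);
    /* ... use hybA with the HYB-based routines, then release it ... */
    cusparseDestroyHybMat(hybA);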
*/ -cusparseStatus_t CUSPARSEAPI cusparseSdense2hyb(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const float *A, - int lda, - const int *nnzPerRow, - cusparseHybMat_t hybA, - int userEllWidth, - cusparseHybPartition_t partitionType); - -cusparseStatus_t CUSPARSEAPI cusparseDdense2hyb(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const double *A, - int lda, - const int *nnzPerRow, - cusparseHybMat_t hybA, - int userEllWidth, - cusparseHybPartition_t partitionType); - -cusparseStatus_t CUSPARSEAPI cusparseCdense2hyb(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const cuComplex *A, - int lda, - const int *nnzPerRow, - cusparseHybMat_t hybA, - int userEllWidth, - cusparseHybPartition_t partitionType); - -cusparseStatus_t CUSPARSEAPI cusparseZdense2hyb(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *A, - int lda, - const int *nnzPerRow, - cusparseHybMat_t hybA, - int userEllWidth, - cusparseHybPartition_t partitionType); - -/* Description: This routine converts a sparse matrix in HYB storage format - to a dense matrix. */ -cusparseStatus_t CUSPARSEAPI cusparseShyb2dense(cusparseHandle_t handle, - const cusparseMatDescr_t descrA, - const cusparseHybMat_t hybA, - float *A, - int lda); - -cusparseStatus_t CUSPARSEAPI cusparseDhyb2dense(cusparseHandle_t handle, - const cusparseMatDescr_t descrA, - const cusparseHybMat_t hybA, - double *A, - int lda); - -cusparseStatus_t CUSPARSEAPI cusparseChyb2dense(cusparseHandle_t handle, - const cusparseMatDescr_t descrA, - const cusparseHybMat_t hybA, - cuComplex *A, - int lda); - -cusparseStatus_t CUSPARSEAPI cusparseZhyb2dense(cusparseHandle_t handle, - const cusparseMatDescr_t descrA, - const cusparseHybMat_t hybA, - cuDoubleComplex *A, - int lda); - -/* Description: This routine converts a sparse matrix in CSR storage format - to a sparse matrix in HYB storage format. */ -cusparseStatus_t CUSPARSEAPI cusparseScsr2hyb(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const float *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseHybMat_t hybA, - int userEllWidth, - cusparseHybPartition_t partitionType); - -cusparseStatus_t CUSPARSEAPI cusparseDcsr2hyb(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const double *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseHybMat_t hybA, - int userEllWidth, - cusparseHybPartition_t partitionType); - -cusparseStatus_t CUSPARSEAPI cusparseCcsr2hyb(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const cuComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseHybMat_t hybA, - int userEllWidth, - cusparseHybPartition_t partitionType); - -cusparseStatus_t CUSPARSEAPI cusparseZcsr2hyb(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - cusparseHybMat_t hybA, - int userEllWidth, - cusparseHybPartition_t partitionType); - -/* Description: This routine converts a sparse matrix in HYB storage format - to a sparse matrix in CSR storage format. 
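Conversion back out of the opaque format writes into caller-allocated CSR arrays, so m and nnz must be remembered from when the HYB matrix was built. A sketch:

    cusparseShyb2csr(handle, descrA, hybA,
                     d_csrValA, d_csrRowPtrA, d_csrColIndA);

(d_csrRowPtrA holds m + 1 entries; the value and column-index arrays hold nnz entries each.)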
*/ -cusparseStatus_t CUSPARSEAPI cusparseShyb2csr(cusparseHandle_t handle, - const cusparseMatDescr_t descrA, - const cusparseHybMat_t hybA, - float *csrSortedValA, - int *csrSortedRowPtrA, - int *csrSortedColIndA); - -cusparseStatus_t CUSPARSEAPI cusparseDhyb2csr(cusparseHandle_t handle, - const cusparseMatDescr_t descrA, - const cusparseHybMat_t hybA, - double *csrSortedValA, - int *csrSortedRowPtrA, - int *csrSortedColIndA); - -cusparseStatus_t CUSPARSEAPI cusparseChyb2csr(cusparseHandle_t handle, - const cusparseMatDescr_t descrA, - const cusparseHybMat_t hybA, - cuComplex *csrSortedValA, - int *csrSortedRowPtrA, - int *csrSortedColIndA); - -cusparseStatus_t CUSPARSEAPI cusparseZhyb2csr(cusparseHandle_t handle, - const cusparseMatDescr_t descrA, - const cusparseHybMat_t hybA, - cuDoubleComplex *csrSortedValA, - int *csrSortedRowPtrA, - int *csrSortedColIndA); - -/* Description: This routine converts a sparse matrix in CSC storage format - to a sparse matrix in HYB storage format. */ -cusparseStatus_t CUSPARSEAPI cusparseScsc2hyb(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const float *cscSortedValA, - const int *cscSortedRowIndA, - const int *cscSortedColPtrA, - cusparseHybMat_t hybA, - int userEllWidth, - cusparseHybPartition_t partitionType); - -cusparseStatus_t CUSPARSEAPI cusparseDcsc2hyb(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const double *cscSortedValA, - const int *cscSortedRowIndA, - const int *cscSortedColPtrA, - cusparseHybMat_t hybA, - int userEllWidth, - cusparseHybPartition_t partitionType); - -cusparseStatus_t CUSPARSEAPI cusparseCcsc2hyb(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const cuComplex *cscSortedValA, - const int *cscSortedRowIndA, - const int *cscSortedColPtrA, - cusparseHybMat_t hybA, - int userEllWidth, - cusparseHybPartition_t partitionType); - -cusparseStatus_t CUSPARSEAPI cusparseZcsc2hyb(cusparseHandle_t handle, - int m, - int n, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *cscSortedValA, - const int *cscSortedRowIndA, - const int *cscSortedColPtrA, - cusparseHybMat_t hybA, - int userEllWidth, - cusparseHybPartition_t partitionType); - -/* Description: This routine converts a sparse matrix in HYB storage format - to a sparse matrix in CSC storage format. */ -cusparseStatus_t CUSPARSEAPI cusparseShyb2csc(cusparseHandle_t handle, - const cusparseMatDescr_t descrA, - const cusparseHybMat_t hybA, - float *cscSortedVal, - int *cscSortedRowInd, - int *cscSortedColPtr); - -cusparseStatus_t CUSPARSEAPI cusparseDhyb2csc(cusparseHandle_t handle, - const cusparseMatDescr_t descrA, - const cusparseHybMat_t hybA, - double *cscSortedVal, - int *cscSortedRowInd, - int *cscSortedColPtr); - -cusparseStatus_t CUSPARSEAPI cusparseChyb2csc(cusparseHandle_t handle, - const cusparseMatDescr_t descrA, - const cusparseHybMat_t hybA, - cuComplex *cscSortedVal, - int *cscSortedRowInd, - int *cscSortedColPtr); - -cusparseStatus_t CUSPARSEAPI cusparseZhyb2csc(cusparseHandle_t handle, - const cusparseMatDescr_t descrA, - const cusparseHybMat_t hybA, - cuDoubleComplex *cscSortedVal, - int *cscSortedRowInd, - int *cscSortedColPtr); - -/* Description: This routine converts a sparse matrix in CSR storage format - to a sparse matrix in block-CSR storage format. 
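The same count-then-convert pattern as the earlier conversions, except the unit is a blockDim-by-blockDim block: nnzb counts nonzero blocks, and the value array stores nnzb dense blocks. A sketch with assumed inputs:

    int blockDim = 2;
    int mb = (m + blockDim - 1) / blockDim;   /* number of block rows */
    int nnzb = 0;
    int *d_bsrRowPtrC, *d_bsrColIndC;
    float *d_bsrValC;
    cudaMalloc((void **)&d_bsrRowPtrC, (mb + 1) * sizeof(int));
    cusparseXcsr2bsrNnz(handle, CUSPARSE_DIRECTION_ROW, m, n, descrA,
                        d_csrRowPtrA, d_csrColIndA, blockDim,
                        descrC, d_bsrRowPtrC, &nnzb);
    cudaMalloc((void **)&d_bsrColIndC, nnzb * sizeof(int));
    cudaMalloc((void **)&d_bsrValC,
               (size_t)nnzb * blockDim * blockDim * sizeof(float));
    cusparseScsr2bsr(handle, CUSPARSE_DIRECTION_ROW, m, n, descrA,
                     d_csrValA, d_csrRowPtrA, d_csrColIndA, blockDim,
                     descrC, d_bsrValC, d_bsrRowPtrC, d_bsrColIndC);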
*/ -cusparseStatus_t CUSPARSEAPI cusparseXcsr2bsrNnz(cusparseHandle_t handle, - cusparseDirection_t dirA, - int m, - int n, - const cusparseMatDescr_t descrA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - int blockDim, - const cusparseMatDescr_t descrC, - int *bsrSortedRowPtrC, - int *nnzTotalDevHostPtr); - -cusparseStatus_t CUSPARSEAPI cusparseScsr2bsr(cusparseHandle_t handle, - cusparseDirection_t dirA, - int m, - int n, - const cusparseMatDescr_t descrA, - const float *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - int blockDim, - const cusparseMatDescr_t descrC, - float *bsrSortedValC, - int *bsrSortedRowPtrC, - int *bsrSortedColIndC); - -cusparseStatus_t CUSPARSEAPI cusparseDcsr2bsr(cusparseHandle_t handle, - cusparseDirection_t dirA, - int m, - int n, - const cusparseMatDescr_t descrA, - const double *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - int blockDim, - const cusparseMatDescr_t descrC, - double *bsrSortedValC, - int *bsrSortedRowPtrC, - int *bsrSortedColIndC); - -cusparseStatus_t CUSPARSEAPI cusparseCcsr2bsr(cusparseHandle_t handle, - cusparseDirection_t dirA, - int m, - int n, - const cusparseMatDescr_t descrA, - const cuComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - int blockDim, - const cusparseMatDescr_t descrC, - cuComplex *bsrSortedValC, - int *bsrSortedRowPtrC, - int *bsrSortedColIndC); - -cusparseStatus_t CUSPARSEAPI cusparseZcsr2bsr(cusparseHandle_t handle, - cusparseDirection_t dirA, - int m, - int n, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - int blockDim, - const cusparseMatDescr_t descrC, - cuDoubleComplex *bsrSortedValC, - int *bsrSortedRowPtrC, - int *bsrSortedColIndC); - -/* Description: This routine converts a sparse matrix in block-CSR storage format - to a sparse matrix in CSR storage format. 
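The inverse direction expands every stored block back to scalar CSR entries, so the output sizes are known up front: mb * blockDim rows and nnzb * blockDim * blockDim values. A one-call sketch with caller-allocated outputs:

    cusparseSbsr2csr(handle, CUSPARSE_DIRECTION_ROW, mb, nb, descrA,
                     d_bsrValA, d_bsrRowPtrA, d_bsrColIndA, blockDim,
                     descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC);

(d_csrRowPtrC must hold mb * blockDim + 1 entries; the value and index arrays hold nnzb * blockDim * blockDim entries each.)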
*/ -cusparseStatus_t CUSPARSEAPI cusparseSbsr2csr(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nb, - const cusparseMatDescr_t descrA, - const float *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int blockDim, - const cusparseMatDescr_t descrC, - float *csrSortedValC, - int *csrSortedRowPtrC, - int *csrSortedColIndC); - -cusparseStatus_t CUSPARSEAPI cusparseDbsr2csr(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nb, - const cusparseMatDescr_t descrA, - const double *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int blockDim, - const cusparseMatDescr_t descrC, - double *csrSortedValC, - int *csrSortedRowPtrC, - int *csrSortedColIndC); - -cusparseStatus_t CUSPARSEAPI cusparseCbsr2csr(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nb, - const cusparseMatDescr_t descrA, - const cuComplex *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int blockDim, - const cusparseMatDescr_t descrC, - cuComplex *csrSortedValC, - int *csrSortedRowPtrC, - int *csrSortedColIndC); - -cusparseStatus_t CUSPARSEAPI cusparseZbsr2csr(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nb, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int blockDim, - const cusparseMatDescr_t descrC, - cuDoubleComplex *csrSortedValC, - int *csrSortedRowPtrC, - int *csrSortedColIndC); - -/* Description: This routine converts a sparse matrix in general block-CSR storage format - to a sparse matrix in general block-CSC storage format. */ -cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsc_bufferSize(cusparseHandle_t handle, - int mb, - int nb, - int nnzb, - const float *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int rowBlockDim, - int colBlockDim, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsc_bufferSize(cusparseHandle_t handle, - int mb, - int nb, - int nnzb, - const double *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int rowBlockDim, - int colBlockDim, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsc_bufferSize(cusparseHandle_t handle, - int mb, - int nb, - int nnzb, - const cuComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int rowBlockDim, - int colBlockDim, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsc_bufferSize(cusparseHandle_t handle, - int mb, - int nb, - int nnzb, - const cuDoubleComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int rowBlockDim, - int colBlockDim, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsc_bufferSizeExt(cusparseHandle_t handle, - int mb, - int nb, - int nnzb, - const float *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int rowBlockDim, - int colBlockDim, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsc_bufferSizeExt(cusparseHandle_t handle, - int mb, - int nb, - int nnzb, - const double *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int rowBlockDim, - int colBlockDim, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsc_bufferSizeExt(cusparseHandle_t handle, - int mb, - int nb, - int nnzb, - const cuComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int 
*bsrSortedColInd, - int rowBlockDim, - int colBlockDim, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsc_bufferSizeExt(cusparseHandle_t handle, - int mb, - int nb, - int nnzb, - const cuDoubleComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int rowBlockDim, - int colBlockDim, - size_t *pBufferSize); - - -cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsc(cusparseHandle_t handle, - int mb, - int nb, - int nnzb, - const float *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int rowBlockDim, - int colBlockDim, - float *bscVal, - int *bscRowInd, - int *bscColPtr, - cusparseAction_t copyValues, - cusparseIndexBase_t baseIdx, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsc(cusparseHandle_t handle, - int mb, - int nb, - int nnzb, - const double *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int rowBlockDim, - int colBlockDim, - double *bscVal, - int *bscRowInd, - int *bscColPtr, - cusparseAction_t copyValues, - cusparseIndexBase_t baseIdx, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsc(cusparseHandle_t handle, - int mb, - int nb, - int nnzb, - const cuComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int rowBlockDim, - int colBlockDim, - cuComplex *bscVal, - int *bscRowInd, - int *bscColPtr, - cusparseAction_t copyValues, - cusparseIndexBase_t baseIdx, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsc(cusparseHandle_t handle, - int mb, - int nb, - int nnzb, - const cuDoubleComplex *bsrSortedVal, - const int *bsrSortedRowPtr, - const int *bsrSortedColInd, - int rowBlockDim, - int colBlockDim, - cuDoubleComplex *bscVal, - int *bscRowInd, - int *bscColPtr, - cusparseAction_t copyValues, - cusparseIndexBase_t baseIdx, - void *pBuffer); - -/* Description: This routine converts a sparse matrix in general block-CSR storage format - to a sparse matrix in CSR storage format. 
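Identical in spirit to bsr2csr above, but with independent row and column block dimensions; the expansion yields mb * rowBlockDim rows and nnzb * rowBlockDim * colBlockDim scalar entries. A one-call sketch:

    cusparseSgebsr2csr(handle, CUSPARSE_DIRECTION_ROW, mb, nb, descrA,
                       d_bsrValA, d_bsrRowPtrA, d_bsrColIndA,
                       rowBlockDim, colBlockDim,
                       descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC);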
*/ -cusparseStatus_t CUSPARSEAPI cusparseXgebsr2csr(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nb, - const cusparseMatDescr_t descrA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int rowBlockDim, - int colBlockDim, - const cusparseMatDescr_t descrC, - int *csrSortedRowPtrC, - int *csrSortedColIndC ); - -cusparseStatus_t CUSPARSEAPI cusparseSgebsr2csr(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nb, - const cusparseMatDescr_t descrA, - const float *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int rowBlockDim, - int colBlockDim, - const cusparseMatDescr_t descrC, - float *csrSortedValC, - int *csrSortedRowPtrC, - int *csrSortedColIndC ); - - -cusparseStatus_t CUSPARSEAPI cusparseDgebsr2csr(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nb, - const cusparseMatDescr_t descrA, - const double *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int rowBlockDim, - int colBlockDim, - const cusparseMatDescr_t descrC, - double *csrSortedValC, - int *csrSortedRowPtrC, - int *csrSortedColIndC ); - - -cusparseStatus_t CUSPARSEAPI cusparseCgebsr2csr(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nb, - const cusparseMatDescr_t descrA, - const cuComplex *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int rowBlockDim, - int colBlockDim, - const cusparseMatDescr_t descrC, - cuComplex *csrSortedValC, - int *csrSortedRowPtrC, - int *csrSortedColIndC ); - - -cusparseStatus_t CUSPARSEAPI cusparseZgebsr2csr(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nb, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int rowBlockDim, - int colBlockDim, - const cusparseMatDescr_t descrC, - cuDoubleComplex *csrSortedValC, - int *csrSortedRowPtrC, - int *csrSortedColIndC ); - -/* Description: This routine converts a sparse matrix in CSR storage format - to a sparse matrix in general block-CSR storage format. 
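Unlike the fixed-blockDim csr2bsr path, this conversion needs a scratch buffer, giving a three-phase workflow: size the buffer, count nonzero blocks, convert. A sketch with assumed rowBlockDim-by-colBlockDim blocks:

    int bufferSize = 0;
    void *pBuffer = NULL;
    int mb = (m + rowBlockDim - 1) / rowBlockDim;
    int nnzb = 0;
    int *d_bsrRowPtrC, *d_bsrColIndC;
    float *d_bsrValC;
    cusparseScsr2gebsr_bufferSize(handle, CUSPARSE_DIRECTION_ROW, m, n, descrA,
                                  d_csrValA, d_csrRowPtrA, d_csrColIndA,
                                  rowBlockDim, colBlockDim, &bufferSize);
    cudaMalloc(&pBuffer, (size_t)bufferSize);
    cudaMalloc((void **)&d_bsrRowPtrC, (mb + 1) * sizeof(int));
    cusparseXcsr2gebsrNnz(handle, CUSPARSE_DIRECTION_ROW, m, n, descrA,
                          d_csrRowPtrA, d_csrColIndA,
                          descrC, d_bsrRowPtrC, rowBlockDim, colBlockDim,
                          &nnzb, pBuffer);
    cudaMalloc((void **)&d_bsrColIndC, nnzb * sizeof(int));
    cudaMalloc((void **)&d_bsrValC,
               (size_t)nnzb * rowBlockDim * colBlockDim * sizeof(float));
    cusparseScsr2gebsr(handle, CUSPARSE_DIRECTION_ROW, m, n, descrA,
                       d_csrValA, d_csrRowPtrA, d_csrColIndA,
                       descrC, d_bsrValC, d_bsrRowPtrC, d_bsrColIndC,
                       rowBlockDim, colBlockDim, pBuffer);
    cudaFree(pBuffer);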
*/ -cusparseStatus_t CUSPARSEAPI cusparseScsr2gebsr_bufferSize(cusparseHandle_t handle, - cusparseDirection_t dirA, - int m, - int n, - const cusparseMatDescr_t descrA, - const float *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - int rowBlockDim, - int colBlockDim, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseDcsr2gebsr_bufferSize(cusparseHandle_t handle, - cusparseDirection_t dirA, - int m, - int n, - const cusparseMatDescr_t descrA, - const double *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - int rowBlockDim, - int colBlockDim, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseCcsr2gebsr_bufferSize(cusparseHandle_t handle, - cusparseDirection_t dirA, - int m, - int n, - const cusparseMatDescr_t descrA, - const cuComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - int rowBlockDim, - int colBlockDim, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseZcsr2gebsr_bufferSize(cusparseHandle_t handle, - cusparseDirection_t dirA, - int m, - int n, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - int rowBlockDim, - int colBlockDim, - int *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseScsr2gebsr_bufferSizeExt(cusparseHandle_t handle, - cusparseDirection_t dirA, - int m, - int n, - const cusparseMatDescr_t descrA, - const float *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - int rowBlockDim, - int colBlockDim, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseDcsr2gebsr_bufferSizeExt(cusparseHandle_t handle, - cusparseDirection_t dirA, - int m, - int n, - const cusparseMatDescr_t descrA, - const double *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - int rowBlockDim, - int colBlockDim, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseCcsr2gebsr_bufferSizeExt(cusparseHandle_t handle, - cusparseDirection_t dirA, - int m, - int n, - const cusparseMatDescr_t descrA, - const cuComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - int rowBlockDim, - int colBlockDim, - size_t *pBufferSize); - -cusparseStatus_t CUSPARSEAPI cusparseZcsr2gebsr_bufferSizeExt(cusparseHandle_t handle, - cusparseDirection_t dirA, - int m, - int n, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - int rowBlockDim, - int colBlockDim, - size_t *pBufferSize); - - - -cusparseStatus_t CUSPARSEAPI cusparseXcsr2gebsrNnz(cusparseHandle_t handle, - cusparseDirection_t dirA, - int m, - int n, - const cusparseMatDescr_t descrA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cusparseMatDescr_t descrC, - int *bsrSortedRowPtrC, - int rowBlockDim, - int colBlockDim, - int *nnzTotalDevHostPtr, - void *pBuffer ); - -cusparseStatus_t CUSPARSEAPI cusparseScsr2gebsr(cusparseHandle_t handle, - cusparseDirection_t dirA, - int m, - int n, - const cusparseMatDescr_t descrA, - const float *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cusparseMatDescr_t descrC, - float *bsrSortedValC, - int *bsrSortedRowPtrC, - int *bsrSortedColIndC, - int rowBlockDim, - int colBlockDim, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDcsr2gebsr(cusparseHandle_t handle, - cusparseDirection_t dirA, - int m, - int n, - const 
cusparseMatDescr_t descrA, - const double *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cusparseMatDescr_t descrC, - double *bsrSortedValC, - int *bsrSortedRowPtrC, - int *bsrSortedColIndC, - int rowBlockDim, - int colBlockDim, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseCcsr2gebsr(cusparseHandle_t handle, - cusparseDirection_t dirA, - int m, - int n, - const cusparseMatDescr_t descrA, - const cuComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cusparseMatDescr_t descrC, - cuComplex *bsrSortedValC, - int *bsrSortedRowPtrC, - int *bsrSortedColIndC, - int rowBlockDim, - int colBlockDim, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseZcsr2gebsr(cusparseHandle_t handle, - cusparseDirection_t dirA, - int m, - int n, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *csrSortedValA, - const int *csrSortedRowPtrA, - const int *csrSortedColIndA, - const cusparseMatDescr_t descrC, - cuDoubleComplex *bsrSortedValC, - int *bsrSortedRowPtrC, - int *bsrSortedColIndC, - int rowBlockDim, - int colBlockDim, - void *pBuffer); - -/* Description: This routine converts a sparse matrix in general block-CSR storage format - to a sparse matrix in general block-CSR storage format with different block size. */ -cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsr_bufferSize(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nb, - int nnzb, - const cusparseMatDescr_t descrA, - const float *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int rowBlockDimA, - int colBlockDimA, - int rowBlockDimC, - int colBlockDimC, - int *pBufferSizeInBytes ); - -cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsr_bufferSize(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nb, - int nnzb, - const cusparseMatDescr_t descrA, - const double *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int rowBlockDimA, - int colBlockDimA, - int rowBlockDimC, - int colBlockDimC, - int *pBufferSizeInBytes ); - -cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsr_bufferSize(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nb, - int nnzb, - const cusparseMatDescr_t descrA, - const cuComplex *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int rowBlockDimA, - int colBlockDimA, - int rowBlockDimC, - int colBlockDimC, - int *pBufferSizeInBytes ); - -cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsr_bufferSize(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nb, - int nnzb, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int rowBlockDimA, - int colBlockDimA, - int rowBlockDimC, - int colBlockDimC, - int *pBufferSizeInBytes ); - - -cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsr_bufferSizeExt(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nb, - int nnzb, - const cusparseMatDescr_t descrA, - const float *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int rowBlockDimA, - int colBlockDimA, - int rowBlockDimC, - int colBlockDimC, - size_t *pBufferSize ); - -cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsr_bufferSizeExt(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nb, - int nnzb, - const cusparseMatDescr_t descrA, - const double *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int 
rowBlockDimA, - int colBlockDimA, - int rowBlockDimC, - int colBlockDimC, - size_t *pBufferSize ); - -cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsr_bufferSizeExt(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nb, - int nnzb, - const cusparseMatDescr_t descrA, - const cuComplex *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int rowBlockDimA, - int colBlockDimA, - int rowBlockDimC, - int colBlockDimC, - size_t *pBufferSize ); - -cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsr_bufferSizeExt(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nb, - int nnzb, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int rowBlockDimA, - int colBlockDimA, - int rowBlockDimC, - int colBlockDimC, - size_t *pBufferSize ); - - - -cusparseStatus_t CUSPARSEAPI cusparseXgebsr2gebsrNnz(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nb, - int nnzb, - const cusparseMatDescr_t descrA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int rowBlockDimA, - int colBlockDimA, - const cusparseMatDescr_t descrC, - int *bsrSortedRowPtrC, - int rowBlockDimC, - int colBlockDimC, - int *nnzTotalDevHostPtr, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsr(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nb, - int nnzb, - const cusparseMatDescr_t descrA, - const float *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int rowBlockDimA, - int colBlockDimA, - const cusparseMatDescr_t descrC, - float *bsrSortedValC, - int *bsrSortedRowPtrC, - int *bsrSortedColIndC, - int rowBlockDimC, - int colBlockDimC, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsr(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nb, - int nnzb, - const cusparseMatDescr_t descrA, - const double *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int rowBlockDimA, - int colBlockDimA, - const cusparseMatDescr_t descrC, - double *bsrSortedValC, - int *bsrSortedRowPtrC, - int *bsrSortedColIndC, - int rowBlockDimC, - int colBlockDimC, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsr(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nb, - int nnzb, - const cusparseMatDescr_t descrA, - const cuComplex *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int rowBlockDimA, - int colBlockDimA, - const cusparseMatDescr_t descrC, - cuComplex *bsrSortedValC, - int *bsrSortedRowPtrC, - int *bsrSortedColIndC, - int rowBlockDimC, - int colBlockDimC, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsr(cusparseHandle_t handle, - cusparseDirection_t dirA, - int mb, - int nb, - int nnzb, - const cusparseMatDescr_t descrA, - const cuDoubleComplex *bsrSortedValA, - const int *bsrSortedRowPtrA, - const int *bsrSortedColIndA, - int rowBlockDimA, - int colBlockDimA, - const cusparseMatDescr_t descrC, - cuDoubleComplex *bsrSortedValC, - int *bsrSortedRowPtrC, - int *bsrSortedColIndC, - int rowBlockDimC, - int colBlockDimC, - void *pBuffer); - -/* --- Sparse Matrix Sorting --- */ - -/* Description: Create a identity sequence p=[0,1,...,n-1]. 
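The identity permutation is the standard companion to the sort routines that follow: the sort permutes p alongside the keys, and p is then used to reorder the value array, here via the Level-1 gather routine declared earlier in this header. A sketch for sorting a COO matrix by row, with assumed device inputs d_cooRows, d_cooCols, d_cooVals:

    size_t bufferSize = 0;
    void *pBuffer = NULL;
    int *d_P;
    float *d_cooValsSorted;
    cusparseXcoosort_bufferSizeExt(handle, m, n, nnz, d_cooRows, d_cooCols,
                                   &bufferSize);
    cudaMalloc(&pBuffer, bufferSize);
    cudaMalloc((void **)&d_P, nnz * sizeof(int));
    cudaMalloc((void **)&d_cooValsSorted, nnz * sizeof(float));
    cusparseCreateIdentityPermutation(handle, nnz, d_P);
    cusparseXcoosortByRow(handle, m, n, nnz, d_cooRows, d_cooCols, d_P,
                          pBuffer);
    /* apply the permutation to the values: valsSorted[i] = vals[P[i]] */
    cusparseSgthr(handle, nnz, d_cooVals, d_cooValsSorted, d_P,
                  CUSPARSE_INDEX_BASE_ZERO);
    cudaFree(pBuffer);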
*/ -cusparseStatus_t CUSPARSEAPI cusparseCreateIdentityPermutation(cusparseHandle_t handle, - int n, - int *p); - -/* Description: Sort sparse matrix stored in COO format */ -cusparseStatus_t CUSPARSEAPI cusparseXcoosort_bufferSizeExt(cusparseHandle_t handle, - int m, - int n, - int nnz, - const int *cooRowsA, - const int *cooColsA, - size_t *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseXcoosortByRow(cusparseHandle_t handle, - int m, - int n, - int nnz, - int *cooRowsA, - int *cooColsA, - int *P, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseXcoosortByColumn(cusparseHandle_t handle, - int m, - int n, - int nnz, - int *cooRowsA, - int *cooColsA, - int *P, - void *pBuffer); - -/* Description: Sort sparse matrix stored in CSR format */ -cusparseStatus_t CUSPARSEAPI cusparseXcsrsort_bufferSizeExt(cusparseHandle_t handle, - int m, - int n, - int nnz, - const int *csrRowPtrA, - const int *csrColIndA, - size_t *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseXcsrsort(cusparseHandle_t handle, - int m, - int n, - int nnz, - const cusparseMatDescr_t descrA, - const int *csrRowPtrA, - int *csrColIndA, - int *P, - void *pBuffer); - -/* Description: Sort sparse matrix stored in CSC format */ -cusparseStatus_t CUSPARSEAPI cusparseXcscsort_bufferSizeExt(cusparseHandle_t handle, - int m, - int n, - int nnz, - const int *cscColPtrA, - const int *cscRowIndA, - size_t *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseXcscsort(cusparseHandle_t handle, - int m, - int n, - int nnz, - const cusparseMatDescr_t descrA, - const int *cscColPtrA, - int *cscRowIndA, - int *P, - void *pBuffer); - -/* Description: Wrapper that sorts sparse matrix stored in CSR format - (without exposing the permutation). */ -cusparseStatus_t CUSPARSEAPI cusparseScsru2csr_bufferSizeExt(cusparseHandle_t handle, - int m, - int n, - int nnz, - float *csrVal, - const int *csrRowPtr, - int *csrColInd, - csru2csrInfo_t info, - size_t *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseDcsru2csr_bufferSizeExt(cusparseHandle_t handle, - int m, - int n, - int nnz, - double *csrVal, - const int *csrRowPtr, - int *csrColInd, - csru2csrInfo_t info, - size_t *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseCcsru2csr_bufferSizeExt(cusparseHandle_t handle, - int m, - int n, - int nnz, - cuComplex *csrVal, - const int *csrRowPtr, - int *csrColInd, - csru2csrInfo_t info, - size_t *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseZcsru2csr_bufferSizeExt(cusparseHandle_t handle, - int m, - int n, - int nnz, - cuDoubleComplex *csrVal, - const int *csrRowPtr, - int *csrColInd, - csru2csrInfo_t info, - size_t *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseScsru2csr(cusparseHandle_t handle, - int m, - int n, - int nnz, - const cusparseMatDescr_t descrA, - float *csrVal, - const int *csrRowPtr, - int *csrColInd, - csru2csrInfo_t info, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDcsru2csr(cusparseHandle_t handle, - int m, - int n, - int nnz, - const cusparseMatDescr_t descrA, - double *csrVal, - const int *csrRowPtr, - int *csrColInd, - csru2csrInfo_t info, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseCcsru2csr(cusparseHandle_t handle, - int m, - int n, - int nnz, - const cusparseMatDescr_t descrA, - cuComplex *csrVal, - const int *csrRowPtr, - int *csrColInd, - csru2csrInfo_t info, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseZcsru2csr(cusparseHandle_t handle, - int m, - int n, - int nnz, - const cusparseMatDescr_t 
descrA, - cuDoubleComplex *csrVal, - const int *csrRowPtr, - int *csrColInd, - csru2csrInfo_t info, - void *pBuffer); - -/* Description: Wrapper that un-sorts sparse matrix stored in CSR format - (without exposing the permutation). */ -cusparseStatus_t CUSPARSEAPI cusparseScsr2csru(cusparseHandle_t handle, - int m, - int n, - int nnz, - const cusparseMatDescr_t descrA, - float *csrVal, - const int *csrRowPtr, - int *csrColInd, - csru2csrInfo_t info, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDcsr2csru(cusparseHandle_t handle, - int m, - int n, - int nnz, - const cusparseMatDescr_t descrA, - double *csrVal, - const int *csrRowPtr, - int *csrColInd, - csru2csrInfo_t info, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseCcsr2csru(cusparseHandle_t handle, - int m, - int n, - int nnz, - const cusparseMatDescr_t descrA, - cuComplex *csrVal, - const int *csrRowPtr, - int *csrColInd, - csru2csrInfo_t info, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseZcsr2csru(cusparseHandle_t handle, - int m, - int n, - int nnz, - const cusparseMatDescr_t descrA, - cuDoubleComplex *csrVal, - const int *csrRowPtr, - int *csrColInd, - csru2csrInfo_t info, - void *pBuffer); - -/* Description: prune dense matrix to a sparse matrix with CSR format */ -#if defined(__cplusplus) -cusparseStatus_t CUSPARSEAPI cusparseHpruneDense2csr_bufferSizeExt( - cusparseHandle_t handle, - int m, - int n, - const __half *A, - int lda, - const __half *threshold, - const cusparseMatDescr_t descrC, - const __half *csrValC, - const int *csrRowPtrC, - const int *csrColIndC, - size_t *pBufferSizeInBytes); -#endif - -cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csr_bufferSizeExt( - cusparseHandle_t handle, - int m, - int n, - const float *A, - int lda, - const float *threshold, - const cusparseMatDescr_t descrC, - const float *csrValC, - const int *csrRowPtrC, - const int *csrColIndC, - size_t *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csr_bufferSizeExt( - cusparseHandle_t handle, - int m, - int n, - const double *A, - int lda, - const double *threshold, - const cusparseMatDescr_t descrC, - const double *csrValC, - const int *csrRowPtrC, - const int *csrColIndC, - size_t *pBufferSizeInBytes); - -#if defined(__cplusplus) -cusparseStatus_t CUSPARSEAPI cusparseHpruneDense2csrNnz( - cusparseHandle_t handle, - int m, - int n, - const __half *A, - int lda, - const __half *threshold, - const cusparseMatDescr_t descrC, - int *csrRowPtrC, - int *nnzTotalDevHostPtr, - void *pBuffer); -#endif - -cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrNnz( - cusparseHandle_t handle, - int m, - int n, - const float *A, - int lda, - const float *threshold, - const cusparseMatDescr_t descrC, - int *csrRowPtrC, - int *nnzTotalDevHostPtr, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrNnz( - cusparseHandle_t handle, - int m, - int n, - const double *A, - int lda, - const double *threshold, - const cusparseMatDescr_t descrC, - int *csrRowPtrC, - int *nnzTotalDevHostPtr, - void *pBuffer); - -#if defined(__cplusplus) -cusparseStatus_t CUSPARSEAPI cusparseHpruneDense2csr( - cusparseHandle_t handle, - int m, - int n, - const __half *A, - int lda, - const __half *threshold, - const cusparseMatDescr_t descrC, - __half *csrValC, - const int *csrRowPtrC, - int *csrColIndC, - void *pBuffer); -#endif - -cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csr( - cusparseHandle_t handle, - int m, - int n, - const float *A, - int lda, - const float *threshold, - const 
cusparseMatDescr_t descrC, - float *csrValC, - const int *csrRowPtrC, - int *csrColIndC, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csr( - cusparseHandle_t handle, - int m, - int n, - const double *A, - int lda, - const double *threshold, - const cusparseMatDescr_t descrC, - double *csrValC, - const int *csrRowPtrC, - int *csrColIndC, - void *pBuffer); - -/* Description: prune sparse matrix with CSR format to another sparse matrix with CSR format */ -#if defined(__cplusplus) -cusparseStatus_t CUSPARSEAPI cusparseHpruneCsr2csr_bufferSizeExt( - cusparseHandle_t handle, - int m, - int n, - int nnzA, - const cusparseMatDescr_t descrA, - const __half *csrValA, - const int *csrRowPtrA, - const int *csrColIndA, - const __half *threshold, - const cusparseMatDescr_t descrC, - const __half *csrValC, - const int *csrRowPtrC, - const int *csrColIndC, - size_t *pBufferSizeInBytes); -#endif - -cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csr_bufferSizeExt( - cusparseHandle_t handle, - int m, - int n, - int nnzA, - const cusparseMatDescr_t descrA, - const float *csrValA, - const int *csrRowPtrA, - const int *csrColIndA, - const float *threshold, - const cusparseMatDescr_t descrC, - const float *csrValC, - const int *csrRowPtrC, - const int *csrColIndC, - size_t *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csr_bufferSizeExt( - cusparseHandle_t handle, - int m, - int n, - int nnzA, - const cusparseMatDescr_t descrA, - const double *csrValA, - const int *csrRowPtrA, - const int *csrColIndA, - const double *threshold, - const cusparseMatDescr_t descrC, - const double *csrValC, - const int *csrRowPtrC, - const int *csrColIndC, - size_t *pBufferSizeInBytes); - -#if defined(__cplusplus) -cusparseStatus_t CUSPARSEAPI cusparseHpruneCsr2csrNnz( - cusparseHandle_t handle, - int m, - int n, - int nnzA, - const cusparseMatDescr_t descrA, - const __half *csrValA, - const int *csrRowPtrA, - const int *csrColIndA, - const __half *threshold, - const cusparseMatDescr_t descrC, - int *csrRowPtrC, - int *nnzTotalDevHostPtr, /* can be on host or device */ - void *pBuffer); -#endif - -cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrNnz( - cusparseHandle_t handle, - int m, - int n, - int nnzA, - const cusparseMatDescr_t descrA, - const float *csrValA, - const int *csrRowPtrA, - const int *csrColIndA, - const float *threshold, - const cusparseMatDescr_t descrC, - int *csrRowPtrC, - int *nnzTotalDevHostPtr, /* can be on host or device */ - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrNnz( - cusparseHandle_t handle, - int m, - int n, - int nnzA, - const cusparseMatDescr_t descrA, - const double *csrValA, - const int *csrRowPtrA, - const int *csrColIndA, - const double *threshold, - const cusparseMatDescr_t descrC, - int *csrRowPtrC, - int *nnzTotalDevHostPtr, /* can be on host or device */ - void *pBuffer); - -#if defined(__cplusplus) -cusparseStatus_t CUSPARSEAPI cusparseHpruneCsr2csr( - cusparseHandle_t handle, - int m, - int n, - int nnzA, - const cusparseMatDescr_t descrA, - const __half *csrValA, - const int *csrRowPtrA, - const int *csrColIndA, - const __half *threshold, - const cusparseMatDescr_t descrC, - __half *csrValC, - const int *csrRowPtrC, - int *csrColIndC, - void *pBuffer); -#endif - -cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csr( - cusparseHandle_t handle, - int m, - int n, - int nnzA, - const cusparseMatDescr_t descrA, - const float *csrValA, - const int *csrRowPtrA, - const int *csrColIndA, - const float *threshold, - const 
cusparseMatDescr_t descrC, - float *csrValC, - const int *csrRowPtrC, - int *csrColIndC, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csr( - cusparseHandle_t handle, - int m, - int n, - int nnzA, - const cusparseMatDescr_t descrA, - const double *csrValA, - const int *csrRowPtrA, - const int *csrColIndA, - const double *threshold, - const cusparseMatDescr_t descrC, - double *csrValC, - const int *csrRowPtrC, - int *csrColIndC, - void *pBuffer); - -/* Description: prune dense matrix to a sparse matrix with CSR format by percentage */ -#if defined(__cplusplus) -cusparseStatus_t CUSPARSEAPI cusparseHpruneDense2csrByPercentage_bufferSizeExt( - cusparseHandle_t handle, - int m, - int n, - const __half *A, - int lda, - float percentage, /* between 0 to 100 */ - const cusparseMatDescr_t descrC, - const __half *csrValC, - const int *csrRowPtrC, - const int *csrColIndC, - pruneInfo_t info, - size_t *pBufferSizeInBytes); -#endif - -cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrByPercentage_bufferSizeExt( - cusparseHandle_t handle, - int m, - int n, - const float *A, - int lda, - float percentage, /* between 0 to 100 */ - const cusparseMatDescr_t descrC, - const float *csrValC, - const int *csrRowPtrC, - const int *csrColIndC, - pruneInfo_t info, - size_t *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrByPercentage_bufferSizeExt( - cusparseHandle_t handle, - int m, - int n, - const double *A, - int lda, - float percentage, /* between 0 to 100 */ - const cusparseMatDescr_t descrC, - const double *csrValC, - const int *csrRowPtrC, - const int *csrColIndC, - pruneInfo_t info, - size_t *pBufferSizeInBytes); - -#if defined(__cplusplus) -cusparseStatus_t CUSPARSEAPI cusparseHpruneDense2csrNnzByPercentage( - cusparseHandle_t handle, - int m, - int n, - const __half *A, - int lda, - float percentage, /* between 0 to 100 */ - const cusparseMatDescr_t descrC, - int *csrRowPtrC, - int *nnzTotalDevHostPtr, /* can be on host or device */ - pruneInfo_t info, - void *pBuffer); -#endif - -cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrNnzByPercentage( - cusparseHandle_t handle, - int m, - int n, - const float *A, - int lda, - float percentage, /* between 0 to 100 */ - const cusparseMatDescr_t descrC, - int *csrRowPtrC, - int *nnzTotalDevHostPtr, /* can be on host or device */ - pruneInfo_t info, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrNnzByPercentage( - cusparseHandle_t handle, - int m, - int n, - const double *A, - int lda, - float percentage, /* between 0 to 100 */ - const cusparseMatDescr_t descrC, - int *csrRowPtrC, - int *nnzTotalDevHostPtr, /* can be on host or device */ - pruneInfo_t info, - void *pBuffer); - -#if defined(__cplusplus) -cusparseStatus_t CUSPARSEAPI cusparseHpruneDense2csrByPercentage( - cusparseHandle_t handle, - int m, - int n, - const __half *A, - int lda, - float percentage, /* between 0 to 100 */ - const cusparseMatDescr_t descrC, - __half *csrValC, - const int *csrRowPtrC, - int *csrColIndC, - pruneInfo_t info, - void *pBuffer); -#endif - -cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrByPercentage( - cusparseHandle_t handle, - int m, - int n, - const float *A, - int lda, - float percentage, /* between 0 to 100 */ - const cusparseMatDescr_t descrC, - float *csrValC, - const int *csrRowPtrC, - int *csrColIndC, - pruneInfo_t info, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrByPercentage( - cusparseHandle_t handle, - int m, - int n, - const double *A, - int lda, - 
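/*
 * The ByPercentage variants in this block replace the magnitude threshold
 * with a percentage between 0 and 100 and thread an opaque pruneInfo_t
 * through every phase; otherwise the bufferSizeExt / Nnz / convert flow is
 * identical for the dense-to-CSR and CSR-to-CSR forms. A sketch under the
 * same assumed setup as the previous example:
 *
 *   pruneInfo_t info;
 *   cusparseCreatePruneInfo(&info);
 *   float percentage = 50.0f;                // keep roughly the largest 50%
 *   cusparseSpruneDense2csrByPercentage_bufferSizeExt(handle, m, n, d_A, lda,
 *       percentage, descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC, info,
 *       &bufferSize);
 *   cudaMalloc(&pBuffer, bufferSize);
 *   cusparseSpruneDense2csrNnzByPercentage(handle, m, n, d_A, lda, percentage,
 *       descrC, d_csrRowPtrC, &nnzC, info, pBuffer);
 *   // allocate d_csrValC / d_csrColIndC for nnzC entries, as before
 *   cusparseSpruneDense2csrByPercentage(handle, m, n, d_A, lda, percentage,
 *       descrC, d_csrValC, d_csrRowPtrC, d_csrColIndC, info, pBuffer);
 *   cusparseDestroyPruneInfo(info);
 */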
float percentage, /* between 0 to 100 */ - const cusparseMatDescr_t descrC, - double *csrValC, - const int *csrRowPtrC, - int *csrColIndC, - pruneInfo_t info, - void *pBuffer); - - -/* Description: prune sparse matrix to a sparse matrix with CSR format by percentage*/ -#if defined(__cplusplus) -cusparseStatus_t CUSPARSEAPI cusparseHpruneCsr2csrByPercentage_bufferSizeExt( - cusparseHandle_t handle, - int m, - int n, - int nnzA, - const cusparseMatDescr_t descrA, - const __half *csrValA, - const int *csrRowPtrA, - const int *csrColIndA, - float percentage, /* between 0 to 100 */ - const cusparseMatDescr_t descrC, - const __half *csrValC, - const int *csrRowPtrC, - const int *csrColIndC, - pruneInfo_t info, - size_t *pBufferSizeInBytes); -#endif - -cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrByPercentage_bufferSizeExt( - cusparseHandle_t handle, - int m, - int n, - int nnzA, - const cusparseMatDescr_t descrA, - const float *csrValA, - const int *csrRowPtrA, - const int *csrColIndA, - float percentage, /* between 0 to 100 */ - const cusparseMatDescr_t descrC, - const float *csrValC, - const int *csrRowPtrC, - const int *csrColIndC, - pruneInfo_t info, - size_t *pBufferSizeInBytes); - -cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrByPercentage_bufferSizeExt( - cusparseHandle_t handle, - int m, - int n, - int nnzA, - const cusparseMatDescr_t descrA, - const double *csrValA, - const int *csrRowPtrA, - const int *csrColIndA, - float percentage, /* between 0 to 100 */ - const cusparseMatDescr_t descrC, - const double *csrValC, - const int *csrRowPtrC, - const int *csrColIndC, - pruneInfo_t info, - size_t *pBufferSizeInBytes); - -#if defined(__cplusplus) -cusparseStatus_t CUSPARSEAPI cusparseHpruneCsr2csrNnzByPercentage( - cusparseHandle_t handle, - int m, - int n, - int nnzA, - const cusparseMatDescr_t descrA, - const __half *csrValA, - const int *csrRowPtrA, - const int *csrColIndA, - float percentage, /* between 0 to 100 */ - const cusparseMatDescr_t descrC, - int *csrRowPtrC, - int *nnzTotalDevHostPtr, /* can be on host or device */ - pruneInfo_t info, - void *pBuffer); -#endif - -cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrNnzByPercentage( - cusparseHandle_t handle, - int m, - int n, - int nnzA, - const cusparseMatDescr_t descrA, - const float *csrValA, - const int *csrRowPtrA, - const int *csrColIndA, - float percentage, /* between 0 to 100 */ - const cusparseMatDescr_t descrC, - int *csrRowPtrC, - int *nnzTotalDevHostPtr, /* can be on host or device */ - pruneInfo_t info, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrNnzByPercentage( - cusparseHandle_t handle, - int m, - int n, - int nnzA, - const cusparseMatDescr_t descrA, - const double *csrValA, - const int *csrRowPtrA, - const int *csrColIndA, - float percentage, /* between 0 to 100 */ - const cusparseMatDescr_t descrC, - int *csrRowPtrC, - int *nnzTotalDevHostPtr, /* can be on host or device */ - pruneInfo_t info, - void *pBuffer); - -#if defined(__cplusplus) -cusparseStatus_t CUSPARSEAPI cusparseHpruneCsr2csrByPercentage( - cusparseHandle_t handle, - int m, - int n, - int nnzA, - const cusparseMatDescr_t descrA, - const __half *csrValA, - const int *csrRowPtrA, - const int *csrColIndA, - float percentage, /* between 0 to 100 */ - const cusparseMatDescr_t descrC, - __half *csrValC, - const int *csrRowPtrC, - int *csrColIndC, - pruneInfo_t info, - void *pBuffer); -#endif - -cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrByPercentage( - cusparseHandle_t handle, - int m, - int n, - int nnzA, - 
const cusparseMatDescr_t descrA, - const float *csrValA, - const int *csrRowPtrA, - const int *csrColIndA, - float percentage, /* between 0 to 100 */ - const cusparseMatDescr_t descrC, - float *csrValC, - const int *csrRowPtrC, - int *csrColIndC, - pruneInfo_t info, - void *pBuffer); - -cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrByPercentage( - cusparseHandle_t handle, - int m, - int n, - int nnzA, - const cusparseMatDescr_t descrA, - const double *csrValA, - const int *csrRowPtrA, - const int *csrColIndA, - float percentage, /* between 0 to 100 */ - const cusparseMatDescr_t descrC, - double *csrValC, - const int *csrRowPtrC, - int *csrColIndC, - pruneInfo_t info, - void *pBuffer); - - - - -#if defined(__cplusplus) -} -#endif /* __cplusplus */ - -#endif /* !defined(CUSPARSE_H_) */ - diff --git a/include/triton/external/CUDA/device_types.h b/include/triton/external/CUDA/device_types.h deleted file mode 100755 index 1eab7bd3b..000000000 --- a/include/triton/external/CUDA/device_types.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. 
Government End - * Users Notice. - */ - -#if !defined(__DEVICE_TYPES_H__) -#define __DEVICE_TYPES_H__ - -#include "host_defines.h" - -/******************************************************************************* -* * -* * -* * -*******************************************************************************/ - -enum __device_builtin__ cudaRoundMode -{ - cudaRoundNearest, - cudaRoundZero, - cudaRoundPosInf, - cudaRoundMinInf -}; - -#endif /* !__DEVICE_TYPES_H__ */ diff --git a/include/triton/external/CUDA/driver_functions.h b/include/triton/external/CUDA/driver_functions.h deleted file mode 100755 index 7ea235c1e..000000000 --- a/include/triton/external/CUDA/driver_functions.h +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. 
- */ - -#if !defined(__DRIVER_FUNCTIONS_H__) -#define __DRIVER_FUNCTIONS_H__ - -#include "builtin_types.h" -#include "host_defines.h" -#include "driver_types.h" - -/** - * \addtogroup CUDART_MEMORY - * - * @{ - */ - -/** - * \brief Returns a cudaPitchedPtr based on input parameters - * - * Returns a ::cudaPitchedPtr based on the specified input parameters \p d, - * \p p, \p xsz, and \p ysz. - * - * \param d - Pointer to allocated memory - * \param p - Pitch of allocated memory in bytes - * \param xsz - Logical width of allocation in elements - * \param ysz - Logical height of allocation in elements - * - * \return - * ::cudaPitchedPtr specified by \p d, \p p, \p xsz, and \p ysz - * - * \sa make_cudaExtent, make_cudaPos - */ -static __inline__ __host__ struct cudaPitchedPtr make_cudaPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz) -{ - struct cudaPitchedPtr s; - - s.ptr = d; - s.pitch = p; - s.xsize = xsz; - s.ysize = ysz; - - return s; -} - -/** - * \brief Returns a cudaPos based on input parameters - * - * Returns a ::cudaPos based on the specified input parameters \p x, - * \p y, and \p z. - * - * \param x - X position - * \param y - Y position - * \param z - Z position - * - * \return - * ::cudaPos specified by \p x, \p y, and \p z - * - * \sa make_cudaExtent, make_cudaPitchedPtr - */ -static __inline__ __host__ struct cudaPos make_cudaPos(size_t x, size_t y, size_t z) -{ - struct cudaPos p; - - p.x = x; - p.y = y; - p.z = z; - - return p; -} - -/** - * \brief Returns a cudaExtent based on input parameters - * - * Returns a ::cudaExtent based on the specified input parameters \p w, - * \p h, and \p d. - * - * \param w - Width in elements when referring to array memory, in bytes when referring to linear memory - * \param h - Height in elements - * \param d - Depth in elements - * - * \return - * ::cudaExtent specified by \p w, \p h, and \p d - * - * \sa make_cudaPitchedPtr, make_cudaPos - */ -static __inline__ __host__ struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d) -{ - struct cudaExtent e; - - e.width = w; - e.height = h; - e.depth = d; - - return e; -} - -/** @} */ /* END CUDART_MEMORY */ - -#endif /* !__DRIVER_FUNCTIONS_H__ */ diff --git a/include/triton/external/CUDA/driver_types.h b/include/triton/external/CUDA/driver_types.h deleted file mode 100755 index fd11843b5..000000000 --- a/include/triton/external/CUDA/driver_types.h +++ /dev/null @@ -1,1610 +0,0 @@ -/* - * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
- * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. - */ - -#if !defined(__DRIVER_TYPES_H__) -#define __DRIVER_TYPES_H__ - -#include "host_defines.h" -#include "vector_types.h" - -/** - * \defgroup CUDART_TYPES Data types used by CUDA Runtime - * \ingroup CUDART - * - * @{ - */ - -/******************************************************************************* -* * -* TYPE DEFINITIONS USED BY RUNTIME API * -* * -*******************************************************************************/ - -#if !defined(__CUDA_INTERNAL_COMPILATION__) - -#if !defined(__CUDACC_RTC__) -#include -#include -#endif /* !defined(__CUDACC_RTC__) */ - -#define cudaHostAllocDefault 0x00 /**< Default page-locked allocation flag */ -#define cudaHostAllocPortable 0x01 /**< Pinned memory accessible by all CUDA contexts */ -#define cudaHostAllocMapped 0x02 /**< Map allocation into device space */ -#define cudaHostAllocWriteCombined 0x04 /**< Write-combined memory */ - -#define cudaHostRegisterDefault 0x00 /**< Default host memory registration flag */ -#define cudaHostRegisterPortable 0x01 /**< Pinned memory accessible by all CUDA contexts */ -#define cudaHostRegisterMapped 0x02 /**< Map registered memory into device space */ -#define cudaHostRegisterIoMemory 0x04 /**< Memory-mapped I/O space */ - -#define cudaPeerAccessDefault 0x00 /**< Default peer addressing enable flag */ - -#define cudaStreamDefault 0x00 /**< Default stream flag */ -#define cudaStreamNonBlocking 0x01 /**< Stream does not synchronize with stream 0 (the NULL stream) */ - - /** - * Legacy stream handle - * - * Stream handle that can be passed as a cudaStream_t to use an implicit stream - * with legacy synchronization behavior. - * - * See details of the \link_sync_behavior - */ -#define cudaStreamLegacy ((cudaStream_t)0x1) - -/** - * Per-thread stream handle - * - * Stream handle that can be passed as a cudaStream_t to use an implicit stream - * with per-thread synchronization behavior. 
- * - * See details of the \link_sync_behavior - */ -#define cudaStreamPerThread ((cudaStream_t)0x2) - -#define cudaEventDefault 0x00 /**< Default event flag */ -#define cudaEventBlockingSync 0x01 /**< Event uses blocking synchronization */ -#define cudaEventDisableTiming 0x02 /**< Event will not record timing data */ -#define cudaEventInterprocess 0x04 /**< Event is suitable for interprocess use. cudaEventDisableTiming must be set */ - -#define cudaDeviceScheduleAuto 0x00 /**< Device flag - Automatic scheduling */ -#define cudaDeviceScheduleSpin 0x01 /**< Device flag - Spin default scheduling */ -#define cudaDeviceScheduleYield 0x02 /**< Device flag - Yield default scheduling */ -#define cudaDeviceScheduleBlockingSync 0x04 /**< Device flag - Use blocking synchronization */ -#define cudaDeviceBlockingSync 0x04 /**< Device flag - Use blocking synchronization - * \deprecated This flag was deprecated as of CUDA 4.0 and - * replaced with ::cudaDeviceScheduleBlockingSync. */ -#define cudaDeviceScheduleMask 0x07 /**< Device schedule flags mask */ -#define cudaDeviceMapHost 0x08 /**< Device flag - Support mapped pinned allocations */ -#define cudaDeviceLmemResizeToMax 0x10 /**< Device flag - Keep local memory allocation after launch */ -#define cudaDeviceMask 0x1f /**< Device flags mask */ - -#define cudaArrayDefault 0x00 /**< Default CUDA array allocation flag */ -#define cudaArrayLayered 0x01 /**< Must be set in cudaMalloc3DArray to create a layered CUDA array */ -#define cudaArraySurfaceLoadStore 0x02 /**< Must be set in cudaMallocArray or cudaMalloc3DArray in order to bind surfaces to the CUDA array */ -#define cudaArrayCubemap 0x04 /**< Must be set in cudaMalloc3DArray to create a cubemap CUDA array */ -#define cudaArrayTextureGather 0x08 /**< Must be set in cudaMallocArray or cudaMalloc3DArray in order to perform texture gather operations on the CUDA array */ - -#define cudaIpcMemLazyEnablePeerAccess 0x01 /**< Automatically enable peer access between remote devices as needed */ - -#define cudaMemAttachGlobal 0x01 /**< Memory can be accessed by any stream on any device*/ -#define cudaMemAttachHost 0x02 /**< Memory cannot be accessed by any stream on any device */ -#define cudaMemAttachSingle 0x04 /**< Memory can only be accessed by a single stream on the associated device */ - -#define cudaOccupancyDefault 0x00 /**< Default behavior */ -#define cudaOccupancyDisableCachingOverride 0x01 /**< Assume global caching is enabled and cannot be automatically turned off */ - -#define cudaCpuDeviceId ((int)-1) /**< Device id that represents the CPU */ -#define cudaInvalidDeviceId ((int)-2) /**< Device id that represents an invalid device */ - -/** - * If set, each kernel launched as part of ::cudaLaunchCooperativeKernelMultiDevice only - * waits for prior work in the stream corresponding to that GPU to complete before the - * kernel begins execution. - */ -#define cudaCooperativeLaunchMultiDeviceNoPreSync 0x01 - -/** - * If set, any subsequent work pushed in a stream that participated in a call to - * ::cudaLaunchCooperativeKernelMultiDevice will only wait for the kernel launched on - * the GPU corresponding to that stream to complete before it begins execution. 
- */ -#define cudaCooperativeLaunchMultiDeviceNoPostSync 0x02 - -#endif /* !__CUDA_INTERNAL_COMPILATION__ */ - -/******************************************************************************* -* * -* * -* * -*******************************************************************************/ - -/** - * CUDA error types - */ -enum __device_builtin__ cudaError -{ - /** - * The API call returned with no errors. In the case of query calls, this - * can also mean that the operation being queried is complete (see - * ::cudaEventQuery() and ::cudaStreamQuery()). - */ - cudaSuccess = 0, - - /** - * The device function being invoked (usually via ::cudaLaunchKernel()) was not - * previously configured via the ::cudaConfigureCall() function. - */ - cudaErrorMissingConfiguration = 1, - - /** - * The API call failed because it was unable to allocate enough memory to - * perform the requested operation. - */ - cudaErrorMemoryAllocation = 2, - - /** - * The API call failed because the CUDA driver and runtime could not be - * initialized. - */ - cudaErrorInitializationError = 3, - - /** - * An exception occurred on the device while executing a kernel. Common - * causes include dereferencing an invalid device pointer and accessing - * out of bounds shared memory. The device cannot be used until - * ::cudaThreadExit() is called. All existing device memory allocations - * are invalid and must be reconstructed if the program is to continue - * using CUDA. - */ - cudaErrorLaunchFailure = 4, - - /** - * This indicated that a previous kernel launch failed. This was previously - * used for device emulation of kernel launches. - * \deprecated - * This error return is deprecated as of CUDA 3.1. Device emulation mode was - * removed with the CUDA 3.1 release. - */ - cudaErrorPriorLaunchFailure = 5, - - /** - * This indicates that the device kernel took too long to execute. This can - * only occur if timeouts are enabled - see the device property - * \ref ::cudaDeviceProp::kernelExecTimeoutEnabled "kernelExecTimeoutEnabled" - * for more information. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - cudaErrorLaunchTimeout = 6, - - /** - * This indicates that a launch did not occur because it did not have - * appropriate resources. Although this error is similar to - * ::cudaErrorInvalidConfiguration, this error usually indicates that the - * user has attempted to pass too many arguments to the device kernel, or the - * kernel launch specifies too many threads for the kernel's register count. - */ - cudaErrorLaunchOutOfResources = 7, - - /** - * The requested device function does not exist or is not compiled for the - * proper device architecture. - */ - cudaErrorInvalidDeviceFunction = 8, - - /** - * This indicates that a kernel launch is requesting resources that can - * never be satisfied by the current device. Requesting more shared memory - * per block than the device supports will trigger this error, as will - * requesting too many threads or blocks. See ::cudaDeviceProp for more - * device limitations. - */ - cudaErrorInvalidConfiguration = 9, - - /** - * This indicates that the device ordinal supplied by the user does not - * correspond to a valid CUDA device. - */ - cudaErrorInvalidDevice = 10, - - /** - * This indicates that one or more of the parameters passed to the API call - * is not within an acceptable range of values. 
- */ - cudaErrorInvalidValue = 11, - - /** - * This indicates that one or more of the pitch-related parameters passed - * to the API call is not within the acceptable range for pitch. - */ - cudaErrorInvalidPitchValue = 12, - - /** - * This indicates that the symbol name/identifier passed to the API call - * is not a valid name or identifier. - */ - cudaErrorInvalidSymbol = 13, - - /** - * This indicates that the buffer object could not be mapped. - */ - cudaErrorMapBufferObjectFailed = 14, - - /** - * This indicates that the buffer object could not be unmapped. - */ - cudaErrorUnmapBufferObjectFailed = 15, - - /** - * This indicates that at least one host pointer passed to the API call is - * not a valid host pointer. - */ - cudaErrorInvalidHostPointer = 16, - - /** - * This indicates that at least one device pointer passed to the API call is - * not a valid device pointer. - */ - cudaErrorInvalidDevicePointer = 17, - - /** - * This indicates that the texture passed to the API call is not a valid - * texture. - */ - cudaErrorInvalidTexture = 18, - - /** - * This indicates that the texture binding is not valid. This occurs if you - * call ::cudaGetTextureAlignmentOffset() with an unbound texture. - */ - cudaErrorInvalidTextureBinding = 19, - - /** - * This indicates that the channel descriptor passed to the API call is not - * valid. This occurs if the format is not one of the formats specified by - * ::cudaChannelFormatKind, or if one of the dimensions is invalid. - */ - cudaErrorInvalidChannelDescriptor = 20, - - /** - * This indicates that the direction of the memcpy passed to the API call is - * not one of the types specified by ::cudaMemcpyKind. - */ - cudaErrorInvalidMemcpyDirection = 21, - - /** - * This indicated that the user has taken the address of a constant variable, - * which was forbidden up until the CUDA 3.1 release. - * \deprecated - * This error return is deprecated as of CUDA 3.1. Variables in constant - * memory may now have their address taken by the runtime via - * ::cudaGetSymbolAddress(). - */ - cudaErrorAddressOfConstant = 22, - - /** - * This indicated that a texture fetch was not able to be performed. - * This was previously used for device emulation of texture operations. - * \deprecated - * This error return is deprecated as of CUDA 3.1. Device emulation mode was - * removed with the CUDA 3.1 release. - */ - cudaErrorTextureFetchFailed = 23, - - /** - * This indicated that a texture was not bound for access. - * This was previously used for device emulation of texture operations. - * \deprecated - * This error return is deprecated as of CUDA 3.1. Device emulation mode was - * removed with the CUDA 3.1 release. - */ - cudaErrorTextureNotBound = 24, - - /** - * This indicated that a synchronization operation had failed. - * This was previously used for some device emulation functions. - * \deprecated - * This error return is deprecated as of CUDA 3.1. Device emulation mode was - * removed with the CUDA 3.1 release. - */ - cudaErrorSynchronizationError = 25, - - /** - * This indicates that a non-float texture was being accessed with linear - * filtering. This is not supported by CUDA. - */ - cudaErrorInvalidFilterSetting = 26, - - /** - * This indicates that an attempt was made to read a non-float texture as a - * normalized float. This is not supported by CUDA. - */ - cudaErrorInvalidNormSetting = 27, - - /** - * Mixing of device and device emulation code was not allowed. - * \deprecated - * This error return is deprecated as of CUDA 3.1. 
Device emulation mode was - * removed with the CUDA 3.1 release. - */ - cudaErrorMixedDeviceExecution = 28, - - /** - * This indicates that a CUDA Runtime API call cannot be executed because - * it is being called during process shut down, at a point in time after - * CUDA driver has been unloaded. - */ - cudaErrorCudartUnloading = 29, - - /** - * This indicates that an unknown internal error has occurred. - */ - cudaErrorUnknown = 30, - - /** - * This indicates that the API call is not yet implemented. Production - * releases of CUDA will never return this error. - * \deprecated - * This error return is deprecated as of CUDA 4.1. - */ - cudaErrorNotYetImplemented = 31, - - /** - * This indicated that an emulated device pointer exceeded the 32-bit address - * range. - * \deprecated - * This error return is deprecated as of CUDA 3.1. Device emulation mode was - * removed with the CUDA 3.1 release. - */ - cudaErrorMemoryValueTooLarge = 32, - - /** - * This indicates that a resource handle passed to the API call was not - * valid. Resource handles are opaque types like ::cudaStream_t and - * ::cudaEvent_t. - */ - cudaErrorInvalidResourceHandle = 33, - - /** - * This indicates that asynchronous operations issued previously have not - * completed yet. This result is not actually an error, but must be indicated - * differently than ::cudaSuccess (which indicates completion). Calls that - * may return this value include ::cudaEventQuery() and ::cudaStreamQuery(). - */ - cudaErrorNotReady = 34, - - /** - * This indicates that the installed NVIDIA CUDA driver is older than the - * CUDA runtime library. This is not a supported configuration. Users should - * install an updated NVIDIA display driver to allow the application to run. - */ - cudaErrorInsufficientDriver = 35, - - /** - * This indicates that the user has called ::cudaSetValidDevices(), - * ::cudaSetDeviceFlags(), ::cudaD3D9SetDirect3DDevice(), - * ::cudaD3D10SetDirect3DDevice, ::cudaD3D11SetDirect3DDevice(), or - * ::cudaVDPAUSetVDPAUDevice() after initializing the CUDA runtime by - * calling non-device management operations (allocating memory and - * launching kernels are examples of non-device management operations). - * This error can also be returned if using runtime/driver - * interoperability and there is an existing ::CUcontext active on the - * host thread. - */ - cudaErrorSetOnActiveProcess = 36, - - /** - * This indicates that the surface passed to the API call is not a valid - * surface. - */ - cudaErrorInvalidSurface = 37, - - /** - * This indicates that no CUDA-capable devices were detected by the installed - * CUDA driver. - */ - cudaErrorNoDevice = 38, - - /** - * This indicates that an uncorrectable ECC error was detected during - * execution. - */ - cudaErrorECCUncorrectable = 39, - - /** - * This indicates that a link to a shared object failed to resolve. - */ - cudaErrorSharedObjectSymbolNotFound = 40, - - /** - * This indicates that initialization of a shared object failed. - */ - cudaErrorSharedObjectInitFailed = 41, - - /** - * This indicates that the ::cudaLimit passed to the API call is not - * supported by the active device. - */ - cudaErrorUnsupportedLimit = 42, - - /** - * This indicates that multiple global or constant variables (across separate - * CUDA source files in the application) share the same string name. - */ - cudaErrorDuplicateVariableName = 43, - - /** - * This indicates that multiple textures (across separate CUDA source - * files in the application) share the same string name. 
- */
- cudaErrorDuplicateTextureName = 44,
-
- /**
- * This indicates that multiple surfaces (across separate CUDA source
- * files in the application) share the same string name.
- */
- cudaErrorDuplicateSurfaceName = 45,
-
- /**
- * This indicates that all CUDA devices are busy or unavailable at the current
- * time. Devices are often busy/unavailable due to use of
- * ::cudaComputeModeExclusive, ::cudaComputeModeProhibited or when long
- * running CUDA kernels have filled up the GPU and are blocking new work
- * from starting. They can also be unavailable due to memory constraints
- * on a device that already has active CUDA work being performed.
- */
- cudaErrorDevicesUnavailable = 46,
-
- /**
- * This indicates that the device kernel image is invalid.
- */
- cudaErrorInvalidKernelImage = 47,
-
- /**
- * This indicates that there is no kernel image available that is suitable
- * for the device. This can occur when a user specifies code generation
- * options for a particular CUDA source file that do not include the
- * corresponding device configuration.
- */
- cudaErrorNoKernelImageForDevice = 48,
-
- /**
- * This indicates that the current context is not compatible with
- * the CUDA Runtime. This can only occur if you are using CUDA
- * Runtime/Driver interoperability and have created an existing Driver
- * context using the driver API. The Driver context may be incompatible
- * either because the Driver context was created using an older version
- * of the API, because the Runtime API call expects a primary driver
- * context and the Driver context is not primary, or because the Driver
- * context has been destroyed. Please see \ref CUDART_DRIVER "Interactions
- * with the CUDA Driver API" for more information.
- */
- cudaErrorIncompatibleDriverContext = 49,
-
- /**
- * This error indicates that a call to ::cudaDeviceEnablePeerAccess() is
- * trying to re-enable peer addressing from a context which has already
- * had peer addressing enabled.
- */
- cudaErrorPeerAccessAlreadyEnabled = 50,
-
- /**
- * This error indicates that ::cudaDeviceDisablePeerAccess() is trying to
- * disable peer addressing which has not been enabled yet via
- * ::cudaDeviceEnablePeerAccess().
- */
- cudaErrorPeerAccessNotEnabled = 51,
-
- /**
- * This indicates that a call tried to access an exclusive-thread device that
- * is already in use by a different thread.
- */
- cudaErrorDeviceAlreadyInUse = 54,
-
- /**
- * This indicates profiler is not initialized for this run. This can
- * happen when the application is running with external profiling tools
- * like visual profiler.
- */
- cudaErrorProfilerDisabled = 55,
-
- /**
- * \deprecated
- * This error return is deprecated as of CUDA 5.0. It is no longer an error
- * to attempt to enable/disable the profiling via ::cudaProfilerStart or
- * ::cudaProfilerStop without initialization.
- */
- cudaErrorProfilerNotInitialized = 56,
-
- /**
- * \deprecated
- * This error return is deprecated as of CUDA 5.0. It is no longer an error
- * to call cudaProfilerStart() when profiling is already enabled.
- */
- cudaErrorProfilerAlreadyStarted = 57,
-
- /**
- * \deprecated
- * This error return is deprecated as of CUDA 5.0. It is no longer an error
- * to call cudaProfilerStop() when profiling is already disabled.
- */
- cudaErrorProfilerAlreadyStopped = 58,
-
- /**
- * An assert triggered in device code during kernel execution. The device
- * cannot be used again until ::cudaThreadExit() is called. All existing
- * allocations are invalid and must be reconstructed if the program is to
- * continue using CUDA.
- */
- cudaErrorAssert = 59,
-
- /**
- * This error indicates that the hardware resources required to enable
- * peer access have been exhausted for one or more of the devices
- * passed to ::cudaEnablePeerAccess().
- */
- cudaErrorTooManyPeers = 60,
-
- /**
- * This error indicates that the memory range passed to ::cudaHostRegister()
- * has already been registered.
- */
- cudaErrorHostMemoryAlreadyRegistered = 61,
-
- /**
- * This error indicates that the pointer passed to ::cudaHostUnregister()
- * does not correspond to any currently registered memory region.
- */
- cudaErrorHostMemoryNotRegistered = 62,
-
- /**
- * This error indicates that an OS call failed.
- */
- cudaErrorOperatingSystem = 63,
-
- /**
- * This error indicates that P2P access is not supported across the given
- * devices.
- */
- cudaErrorPeerAccessUnsupported = 64,
-
- /**
- * This error indicates that a device runtime grid launch did not occur
- * because the depth of the child grid would exceed the maximum supported
- * number of nested grid launches.
- */
- cudaErrorLaunchMaxDepthExceeded = 65,
-
- /**
- * This error indicates that a grid launch did not occur because the kernel
- * uses file-scoped textures which are unsupported by the device runtime.
- * Kernels launched via the device runtime only support textures created with
- * the Texture Object APIs.
- */
- cudaErrorLaunchFileScopedTex = 66,
-
- /**
- * This error indicates that a grid launch did not occur because the kernel
- * uses file-scoped surfaces which are unsupported by the device runtime.
- * Kernels launched via the device runtime only support surfaces created with
- * the Surface Object APIs.
- */
- cudaErrorLaunchFileScopedSurf = 67,
-
- /**
- * This error indicates that a call to ::cudaDeviceSynchronize made from
- * the device runtime failed because the call was made at grid depth greater
- * than either the default (2 levels of grids) or user specified device
- * limit ::cudaLimitDevRuntimeSyncDepth. To be able to synchronize on
- * launched grids at a greater depth successfully, the maximum nested
- * depth at which ::cudaDeviceSynchronize will be called must be specified
- * with the ::cudaLimitDevRuntimeSyncDepth limit to the ::cudaDeviceSetLimit
- * API before the host-side launch of a kernel using the device runtime.
- * Keep in mind that additional levels of sync depth require the runtime
- * to reserve large amounts of device memory that cannot be used for
- * user allocations.
- */
- cudaErrorSyncDepthExceeded = 68,
-
- /**
- * This error indicates that a device runtime grid launch failed because
- * the launch would exceed the limit ::cudaLimitDevRuntimePendingLaunchCount.
- * For this launch to proceed successfully, ::cudaDeviceSetLimit must be
- * called to set the ::cudaLimitDevRuntimePendingLaunchCount to be higher
- * than the upper bound of outstanding launches that can be issued to the
- * device runtime. Keep in mind that raising the limit of pending device
- * runtime launches will require the runtime to reserve device memory that
- * cannot be used for user allocations.
- */
- cudaErrorLaunchPendingCountExceeded = 69,
-
- /**
- * This error indicates the attempted operation is not permitted.
- */
- cudaErrorNotPermitted = 70,
-
- /**
- * This error indicates the attempted operation is not supported
- * on the current system or device.
- */ - cudaErrorNotSupported = 71, - - /** - * Device encountered an error in the call stack during kernel execution, - * possibly due to stack corruption or exceeding the stack size limit. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - cudaErrorHardwareStackError = 72, - - /** - * The device encountered an illegal instruction during kernel execution - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - cudaErrorIllegalInstruction = 73, - - /** - * The device encountered a load or store instruction - * on a memory address which is not aligned. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - cudaErrorMisalignedAddress = 74, - - /** - * While executing a kernel, the device encountered an instruction - * which can only operate on memory locations in certain address spaces - * (global, shared, or local), but was supplied a memory address not - * belonging to an allowed address space. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - cudaErrorInvalidAddressSpace = 75, - - /** - * The device encountered an invalid program counter. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - cudaErrorInvalidPc = 76, - - /** - * The device encountered a load or store instruction on an invalid memory address. - * This leaves the process in an inconsistent state and any further CUDA work - * will return the same error. To continue using CUDA, the process must be terminated - * and relaunched. - */ - cudaErrorIllegalAddress = 77, - - /** - * A PTX compilation failed. The runtime may fall back to compiling PTX if - * an application does not contain a suitable binary for the current device. - */ - cudaErrorInvalidPtx = 78, - - /** - * This indicates an error with the OpenGL or DirectX context. - */ - cudaErrorInvalidGraphicsContext = 79, - - /** - * This indicates that an uncorrectable NVLink error was detected during the - * execution. - */ - cudaErrorNvlinkUncorrectable = 80, - - /** - * This indicates that the PTX JIT compiler library was not found. The JIT Compiler - * library is used for PTX compilation. The runtime may fall back to compiling PTX - * if an application does not contain a suitable binary for the current device. - */ - cudaErrorJitCompilerNotFound = 81, - - /** - * This error indicates that the number of blocks launched per grid for a kernel that was - * launched via either ::cudaLaunchCooperativeKernel or ::cudaLaunchCooperativeKernelMultiDevice - * exceeds the maximum number of blocks as allowed by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor - * or ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors - * as specified by the device attribute ::cudaDevAttrMultiProcessorCount. - */ - cudaErrorCooperativeLaunchTooLarge = 82, - - /** - * This indicates an internal startup failure in the CUDA runtime. 
- */ - cudaErrorStartupFailure = 0x7f, - - /** - * Any unhandled CUDA driver error is added to this value and returned via - * the runtime. Production releases of CUDA should not return such errors. - * \deprecated - * This error return is deprecated as of CUDA 4.1. - */ - cudaErrorApiFailureBase = 10000 -}; - -/** - * Channel format kind - */ -enum __device_builtin__ cudaChannelFormatKind -{ - cudaChannelFormatKindSigned = 0, /**< Signed channel format */ - cudaChannelFormatKindUnsigned = 1, /**< Unsigned channel format */ - cudaChannelFormatKindFloat = 2, /**< Float channel format */ - cudaChannelFormatKindNone = 3 /**< No channel format */ -}; - -/** - * CUDA Channel format descriptor - */ -struct __device_builtin__ cudaChannelFormatDesc -{ - int x; /**< x */ - int y; /**< y */ - int z; /**< z */ - int w; /**< w */ - enum cudaChannelFormatKind f; /**< Channel format kind */ -}; - -/** - * CUDA array - */ -typedef struct cudaArray *cudaArray_t; - -/** - * CUDA array (as source copy argument) - */ -typedef const struct cudaArray *cudaArray_const_t; - -struct cudaArray; - -/** - * CUDA mipmapped array - */ -typedef struct cudaMipmappedArray *cudaMipmappedArray_t; - -/** - * CUDA mipmapped array (as source argument) - */ -typedef const struct cudaMipmappedArray *cudaMipmappedArray_const_t; - -struct cudaMipmappedArray; - -/** - * CUDA memory types - */ -enum __device_builtin__ cudaMemoryType -{ - cudaMemoryTypeHost = 1, /**< Host memory */ - cudaMemoryTypeDevice = 2 /**< Device memory */ -}; - -/** - * CUDA memory copy types - */ -enum __device_builtin__ cudaMemcpyKind -{ - cudaMemcpyHostToHost = 0, /**< Host -> Host */ - cudaMemcpyHostToDevice = 1, /**< Host -> Device */ - cudaMemcpyDeviceToHost = 2, /**< Device -> Host */ - cudaMemcpyDeviceToDevice = 3, /**< Device -> Device */ - cudaMemcpyDefault = 4 /**< Direction of the transfer is inferred from the pointer values. 
Requires unified virtual addressing */ -}; - -/** - * CUDA Pitched memory pointer - * - * \sa ::make_cudaPitchedPtr - */ -struct __device_builtin__ cudaPitchedPtr -{ - void *ptr; /**< Pointer to allocated memory */ - size_t pitch; /**< Pitch of allocated memory in bytes */ - size_t xsize; /**< Logical width of allocation in elements */ - size_t ysize; /**< Logical height of allocation in elements */ -}; - -/** - * CUDA extent - * - * \sa ::make_cudaExtent - */ -struct __device_builtin__ cudaExtent -{ - size_t width; /**< Width in elements when referring to array memory, in bytes when referring to linear memory */ - size_t height; /**< Height in elements */ - size_t depth; /**< Depth in elements */ -}; - -/** - * CUDA 3D position - * - * \sa ::make_cudaPos - */ -struct __device_builtin__ cudaPos -{ - size_t x; /**< x */ - size_t y; /**< y */ - size_t z; /**< z */ -}; - -/** - * CUDA 3D memory copying parameters - */ -struct __device_builtin__ cudaMemcpy3DParms -{ - cudaArray_t srcArray; /**< Source memory address */ - struct cudaPos srcPos; /**< Source position offset */ - struct cudaPitchedPtr srcPtr; /**< Pitched source memory address */ - - cudaArray_t dstArray; /**< Destination memory address */ - struct cudaPos dstPos; /**< Destination position offset */ - struct cudaPitchedPtr dstPtr; /**< Pitched destination memory address */ - - struct cudaExtent extent; /**< Requested memory copy size */ - enum cudaMemcpyKind kind; /**< Type of transfer */ -}; - -/** - * CUDA 3D cross-device memory copying parameters - */ -struct __device_builtin__ cudaMemcpy3DPeerParms -{ - cudaArray_t srcArray; /**< Source memory address */ - struct cudaPos srcPos; /**< Source position offset */ - struct cudaPitchedPtr srcPtr; /**< Pitched source memory address */ - int srcDevice; /**< Source device */ - - cudaArray_t dstArray; /**< Destination memory address */ - struct cudaPos dstPos; /**< Destination position offset */ - struct cudaPitchedPtr dstPtr; /**< Pitched destination memory address */ - int dstDevice; /**< Destination device */ - - struct cudaExtent extent; /**< Requested memory copy size */ -}; - -/** - * CUDA graphics interop resource - */ -struct cudaGraphicsResource; - -/** - * CUDA graphics interop register flags - */ -enum __device_builtin__ cudaGraphicsRegisterFlags -{ - cudaGraphicsRegisterFlagsNone = 0, /**< Default */ - cudaGraphicsRegisterFlagsReadOnly = 1, /**< CUDA will not write to this resource */ - cudaGraphicsRegisterFlagsWriteDiscard = 2, /**< CUDA will only write to and will not read from this resource */ - cudaGraphicsRegisterFlagsSurfaceLoadStore = 4, /**< CUDA will bind this resource to a surface reference */ - cudaGraphicsRegisterFlagsTextureGather = 8 /**< CUDA will perform texture gather operations on this resource */ -}; - -/** - * CUDA graphics interop map flags - */ -enum __device_builtin__ cudaGraphicsMapFlags -{ - cudaGraphicsMapFlagsNone = 0, /**< Default; Assume resource can be read/written */ - cudaGraphicsMapFlagsReadOnly = 1, /**< CUDA will not write to this resource */ - cudaGraphicsMapFlagsWriteDiscard = 2 /**< CUDA will only write to and will not read from this resource */ -}; - -/** - * CUDA graphics interop array indices for cube maps - */ -enum __device_builtin__ cudaGraphicsCubeFace -{ - cudaGraphicsCubeFacePositiveX = 0x00, /**< Positive X face of cubemap */ - cudaGraphicsCubeFaceNegativeX = 0x01, /**< Negative X face of cubemap */ - cudaGraphicsCubeFacePositiveY = 0x02, /**< Positive Y face of cubemap */ - cudaGraphicsCubeFaceNegativeY = 0x03, /**< Negative Y 
face of cubemap */ - cudaGraphicsCubeFacePositiveZ = 0x04, /**< Positive Z face of cubemap */ - cudaGraphicsCubeFaceNegativeZ = 0x05 /**< Negative Z face of cubemap */ -}; - -/** - * CUDA resource types - */ -enum __device_builtin__ cudaResourceType -{ - cudaResourceTypeArray = 0x00, /**< Array resource */ - cudaResourceTypeMipmappedArray = 0x01, /**< Mipmapped array resource */ - cudaResourceTypeLinear = 0x02, /**< Linear resource */ - cudaResourceTypePitch2D = 0x03 /**< Pitch 2D resource */ -}; - -/** - * CUDA texture resource view formats - */ -enum __device_builtin__ cudaResourceViewFormat -{ - cudaResViewFormatNone = 0x00, /**< No resource view format (use underlying resource format) */ - cudaResViewFormatUnsignedChar1 = 0x01, /**< 1 channel unsigned 8-bit integers */ - cudaResViewFormatUnsignedChar2 = 0x02, /**< 2 channel unsigned 8-bit integers */ - cudaResViewFormatUnsignedChar4 = 0x03, /**< 4 channel unsigned 8-bit integers */ - cudaResViewFormatSignedChar1 = 0x04, /**< 1 channel signed 8-bit integers */ - cudaResViewFormatSignedChar2 = 0x05, /**< 2 channel signed 8-bit integers */ - cudaResViewFormatSignedChar4 = 0x06, /**< 4 channel signed 8-bit integers */ - cudaResViewFormatUnsignedShort1 = 0x07, /**< 1 channel unsigned 16-bit integers */ - cudaResViewFormatUnsignedShort2 = 0x08, /**< 2 channel unsigned 16-bit integers */ - cudaResViewFormatUnsignedShort4 = 0x09, /**< 4 channel unsigned 16-bit integers */ - cudaResViewFormatSignedShort1 = 0x0a, /**< 1 channel signed 16-bit integers */ - cudaResViewFormatSignedShort2 = 0x0b, /**< 2 channel signed 16-bit integers */ - cudaResViewFormatSignedShort4 = 0x0c, /**< 4 channel signed 16-bit integers */ - cudaResViewFormatUnsignedInt1 = 0x0d, /**< 1 channel unsigned 32-bit integers */ - cudaResViewFormatUnsignedInt2 = 0x0e, /**< 2 channel unsigned 32-bit integers */ - cudaResViewFormatUnsignedInt4 = 0x0f, /**< 4 channel unsigned 32-bit integers */ - cudaResViewFormatSignedInt1 = 0x10, /**< 1 channel signed 32-bit integers */ - cudaResViewFormatSignedInt2 = 0x11, /**< 2 channel signed 32-bit integers */ - cudaResViewFormatSignedInt4 = 0x12, /**< 4 channel signed 32-bit integers */ - cudaResViewFormatHalf1 = 0x13, /**< 1 channel 16-bit floating point */ - cudaResViewFormatHalf2 = 0x14, /**< 2 channel 16-bit floating point */ - cudaResViewFormatHalf4 = 0x15, /**< 4 channel 16-bit floating point */ - cudaResViewFormatFloat1 = 0x16, /**< 1 channel 32-bit floating point */ - cudaResViewFormatFloat2 = 0x17, /**< 2 channel 32-bit floating point */ - cudaResViewFormatFloat4 = 0x18, /**< 4 channel 32-bit floating point */ - cudaResViewFormatUnsignedBlockCompressed1 = 0x19, /**< Block compressed 1 */ - cudaResViewFormatUnsignedBlockCompressed2 = 0x1a, /**< Block compressed 2 */ - cudaResViewFormatUnsignedBlockCompressed3 = 0x1b, /**< Block compressed 3 */ - cudaResViewFormatUnsignedBlockCompressed4 = 0x1c, /**< Block compressed 4 unsigned */ - cudaResViewFormatSignedBlockCompressed4 = 0x1d, /**< Block compressed 4 signed */ - cudaResViewFormatUnsignedBlockCompressed5 = 0x1e, /**< Block compressed 5 unsigned */ - cudaResViewFormatSignedBlockCompressed5 = 0x1f, /**< Block compressed 5 signed */ - cudaResViewFormatUnsignedBlockCompressed6H = 0x20, /**< Block compressed 6 unsigned half-float */ - cudaResViewFormatSignedBlockCompressed6H = 0x21, /**< Block compressed 6 signed half-float */ - cudaResViewFormatUnsignedBlockCompressed7 = 0x22 /**< Block compressed 7 */ -}; - -/** - * CUDA resource descriptor - */ -struct __device_builtin__ 
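/*
 * The cudaPitchedPtr, cudaExtent and cudaPos structures above are exactly
 * what cudaMemcpy3D() consumes, and the make_* helpers from
 * driver_functions.h build them inline. A sketch of staging a
 * width x height x depth float volume to the device; h_vol, width, height
 * and depth are assumed:
 *
 *   struct cudaExtent ext = make_cudaExtent(width * sizeof(float), height, depth);
 *   struct cudaPitchedPtr d_vol;
 *   cudaMalloc3D(&d_vol, ext);               // pitched device allocation
 *   struct cudaMemcpy3DParms p = {0};
 *   p.srcPtr = make_cudaPitchedPtr(h_vol, width * sizeof(float), width, height);
 *   p.dstPtr = d_vol;
 *   p.extent = ext;                          // width in bytes: linear memory
 *   p.kind = cudaMemcpyHostToDevice;
 *   cudaMemcpy3D(&p);
 */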
cudaResourceDesc { - enum cudaResourceType resType; /**< Resource type */ - - union { - struct { - cudaArray_t array; /**< CUDA array */ - } array; - struct { - cudaMipmappedArray_t mipmap; /**< CUDA mipmapped array */ - } mipmap; - struct { - void *devPtr; /**< Device pointer */ - struct cudaChannelFormatDesc desc; /**< Channel descriptor */ - size_t sizeInBytes; /**< Size in bytes */ - } linear; - struct { - void *devPtr; /**< Device pointer */ - struct cudaChannelFormatDesc desc; /**< Channel descriptor */ - size_t width; /**< Width of the array in elements */ - size_t height; /**< Height of the array in elements */ - size_t pitchInBytes; /**< Pitch between two rows in bytes */ - } pitch2D; - } res; -}; - -/** - * CUDA resource view descriptor - */ -struct __device_builtin__ cudaResourceViewDesc -{ - enum cudaResourceViewFormat format; /**< Resource view format */ - size_t width; /**< Width of the resource view */ - size_t height; /**< Height of the resource view */ - size_t depth; /**< Depth of the resource view */ - unsigned int firstMipmapLevel; /**< First defined mipmap level */ - unsigned int lastMipmapLevel; /**< Last defined mipmap level */ - unsigned int firstLayer; /**< First layer index */ - unsigned int lastLayer; /**< Last layer index */ -}; - -/** - * CUDA pointer attributes - */ -struct __device_builtin__ cudaPointerAttributes -{ - /** - * The physical location of the memory, ::cudaMemoryTypeHost or - * ::cudaMemoryTypeDevice. - */ - enum cudaMemoryType memoryType; - - /** - * The device against which the memory was allocated or registered. - * If the memory type is ::cudaMemoryTypeDevice then this identifies - * the device on which the memory referred physically resides. If - * the memory type is ::cudaMemoryTypeHost then this identifies the - * device which was current when the memory was allocated or registered - * (and if that device is deinitialized then this allocation will vanish - * with that device's state). - */ - int device; - - /** - * The address which may be dereferenced on the current device to access - * the memory or NULL if no such address exists. - */ - void *devicePointer; - - /** - * The address which may be dereferenced on the host to access the - * memory or NULL if no such address exists. - */ - void *hostPointer; - - /** - * Indicates if this pointer points to managed memory - */ - int isManaged; -}; - -/** - * CUDA function attributes - */ -struct __device_builtin__ cudaFuncAttributes -{ - /** - * The size in bytes of statically-allocated shared memory per block - * required by this function. This does not include dynamically-allocated - * shared memory requested by the user at runtime. - */ - size_t sharedSizeBytes; - - /** - * The size in bytes of user-allocated constant memory required by this - * function. - */ - size_t constSizeBytes; - - /** - * The size in bytes of local memory used by each thread of this function. - */ - size_t localSizeBytes; - - /** - * The maximum number of threads per block, beyond which a launch of the - * function would fail. This number depends on both the function and the - * device on which the function is currently loaded. - */ - int maxThreadsPerBlock; - - /** - * The number of registers used by each thread of this function. - */ - int numRegs; - - /** - * The PTX virtual architecture version for which the function was - * compiled. This value is the major PTX version * 10 + the minor PTX - * version, so a PTX version 1.3 function would return the value 13. 
- */ - int ptxVersion; - - /** - * The binary architecture version for which the function was compiled. - * This value is the major binary version * 10 + the minor binary version, - * so a binary version 1.3 function would return the value 13. - */ - int binaryVersion; - - /** - * The attribute to indicate whether the function has been compiled with - * user specified option "-Xptxas --dlcm=ca" set. - */ - int cacheModeCA; - - /** - * The maximum size in bytes of dynamic shared memory per block for - * this function. Any launch must have a dynamic shared memory size - * smaller than this value. - */ - int maxDynamicSharedSizeBytes; - - /** - * On devices where the L1 cache and shared memory use the same hardware resources, - * this sets the shared memory carveout preference, in percent of the maximum shared memory. - * This is only a hint, and the driver can choose a different ratio if required to execute the function. - */ - int preferredShmemCarveout; -}; - -/** - * CUDA function attributes that can be set using cudaFuncSetAttribute - */ -enum __device_builtin__ cudaFuncAttribute -{ - cudaFuncAttributeMaxDynamicSharedMemorySize = 8, /**< Maximum dynamic shared memory size */ - cudaFuncAttributePreferredSharedMemoryCarveout = 9, /**< Preferred shared memory-L1 cache split ratio */ - cudaFuncAttributeMax -}; - -/** - * CUDA function cache configurations - */ -enum __device_builtin__ cudaFuncCache -{ - cudaFuncCachePreferNone = 0, /**< Default function cache configuration, no preference */ - cudaFuncCachePreferShared = 1, /**< Prefer larger shared memory and smaller L1 cache */ - cudaFuncCachePreferL1 = 2, /**< Prefer larger L1 cache and smaller shared memory */ - cudaFuncCachePreferEqual = 3 /**< Prefer equal size L1 cache and shared memory */ -}; - -/** - * CUDA shared memory configuration - */ - -enum __device_builtin__ cudaSharedMemConfig -{ - cudaSharedMemBankSizeDefault = 0, - cudaSharedMemBankSizeFourByte = 1, - cudaSharedMemBankSizeEightByte = 2 -}; - -/** - * Shared memory carveout configurations - */ -enum __device_builtin__ cudaSharedCarveout { - cudaSharedmemCarveoutDefault = -1, /**< no preference for shared memory or L1 (default) */ - cudaSharedmemCarveoutMaxShared = 100, /**< prefer maximum available shared memory, minimum L1 cache */ - cudaSharedmemCarveoutMaxL1 = 0 /**< prefer maximum available L1 cache, minimum shared memory */ -}; - -/** - * CUDA device compute modes - */ -enum __device_builtin__ cudaComputeMode -{ - cudaComputeModeDefault = 0, /**< Default compute mode (Multiple threads can use ::cudaSetDevice() with this device) */ - cudaComputeModeExclusive = 1, /**< Compute-exclusive-thread mode (Only one thread in one process will be able to use ::cudaSetDevice() with this device) */ - cudaComputeModeProhibited = 2, /**< Compute-prohibited mode (No threads can use ::cudaSetDevice() with this device) */ - cudaComputeModeExclusiveProcess = 3 /**< Compute-exclusive-process mode (Many threads in one process will be able to use ::cudaSetDevice() with this device) */ -}; - -/** - * CUDA Limits - */ -enum __device_builtin__ cudaLimit -{ - cudaLimitStackSize = 0x00, /**< GPU thread stack size */ - cudaLimitPrintfFifoSize = 0x01, /**< GPU printf/fprintf FIFO size */ - cudaLimitMallocHeapSize = 0x02, /**< GPU malloc heap size */ - cudaLimitDevRuntimeSyncDepth = 0x03, /**< GPU device runtime synchronize depth */ - cudaLimitDevRuntimePendingLaunchCount = 0x04 /**< GPU device runtime pending launch count */ -}; - -/** - * CUDA Memory Advise values - */ -enum
__device_builtin__ cudaMemoryAdvise -{ - cudaMemAdviseSetReadMostly = 1, /**< Data will mostly be read and only occasionally be written to */ - cudaMemAdviseUnsetReadMostly = 2, /**< Undo the effect of ::cudaMemAdviseSetReadMostly */ - cudaMemAdviseSetPreferredLocation = 3, /**< Set the preferred location for the data as the specified device */ - cudaMemAdviseUnsetPreferredLocation = 4, /**< Clear the preferred location for the data */ - cudaMemAdviseSetAccessedBy = 5, /**< Data will be accessed by the specified device, so prevent page faults as much as possible */ - cudaMemAdviseUnsetAccessedBy = 6 /**< Let the Unified Memory subsystem decide on the page faulting policy for the specified device */ -}; - -/** - * CUDA range attributes - */ -enum __device_builtin__ cudaMemRangeAttribute -{ - cudaMemRangeAttributeReadMostly = 1, /**< Whether the range will mostly be read and only occasionally be written to */ - cudaMemRangeAttributePreferredLocation = 2, /**< The preferred location of the range */ - cudaMemRangeAttributeAccessedBy = 3, /**< Memory range has ::cudaMemAdviseSetAccessedBy set for specified device */ - cudaMemRangeAttributeLastPrefetchLocation = 4 /**< The last location to which the range was prefetched */ -}; - -/** - * CUDA Profiler Output modes - */ -enum __device_builtin__ cudaOutputMode -{ - cudaKeyValuePair = 0x00, /**< Output mode Key-Value pair format. */ - cudaCSV = 0x01 /**< Output mode Comma separated values format. */ -}; - -/** - * CUDA device attributes - */ -enum __device_builtin__ cudaDeviceAttr -{ - cudaDevAttrMaxThreadsPerBlock = 1, /**< Maximum number of threads per block */ - cudaDevAttrMaxBlockDimX = 2, /**< Maximum block dimension X */ - cudaDevAttrMaxBlockDimY = 3, /**< Maximum block dimension Y */ - cudaDevAttrMaxBlockDimZ = 4, /**< Maximum block dimension Z */ - cudaDevAttrMaxGridDimX = 5, /**< Maximum grid dimension X */ - cudaDevAttrMaxGridDimY = 6, /**< Maximum grid dimension Y */ - cudaDevAttrMaxGridDimZ = 7, /**< Maximum grid dimension Z */ - cudaDevAttrMaxSharedMemoryPerBlock = 8, /**< Maximum shared memory available per block in bytes */ - cudaDevAttrTotalConstantMemory = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */ - cudaDevAttrWarpSize = 10, /**< Warp size in threads */ - cudaDevAttrMaxPitch = 11, /**< Maximum pitch in bytes allowed by memory copies */ - cudaDevAttrMaxRegistersPerBlock = 12, /**< Maximum number of 32-bit registers available per block */ - cudaDevAttrClockRate = 13, /**< Peak clock frequency in kilohertz */ - cudaDevAttrTextureAlignment = 14, /**< Alignment requirement for textures */ - cudaDevAttrGpuOverlap = 15, /**< Device can possibly copy memory and execute a kernel concurrently */ - cudaDevAttrMultiProcessorCount = 16, /**< Number of multiprocessors on device */ - cudaDevAttrKernelExecTimeout = 17, /**< Specifies whether there is a run time limit on kernels */ - cudaDevAttrIntegrated = 18, /**< Device is integrated with host memory */ - cudaDevAttrCanMapHostMemory = 19, /**< Device can map host memory into CUDA address space */ - cudaDevAttrComputeMode = 20, /**< Compute mode (See ::cudaComputeMode for details) */ - cudaDevAttrMaxTexture1DWidth = 21, /**< Maximum 1D texture width */ - cudaDevAttrMaxTexture2DWidth = 22, /**< Maximum 2D texture width */ - cudaDevAttrMaxTexture2DHeight = 23, /**< Maximum 2D texture height */ - cudaDevAttrMaxTexture3DWidth = 24, /**< Maximum 3D texture width */ - cudaDevAttrMaxTexture3DHeight = 25, /**< Maximum 3D texture height */ -
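For reference, a minimal sketch of how the cudaFuncAttribute and cudaMemoryAdvise values above are consumed (the kernel k, the buffer size, and device 0 are illustrative, not from the original headers):

#include <cuda_runtime.h>

__global__ void k(float *x) { x[threadIdx.x] += 1.f; }

int main() {
    /* opt the kernel into a larger dynamic shared-memory limit (cudaFuncAttribute) */
    cudaFuncSetAttribute((const void *)k,
                         cudaFuncAttributeMaxDynamicSharedMemorySize, 64 * 1024);

    /* advise the unified-memory system that a managed buffer is read-mostly */
    float *buf = NULL;
    size_t bytes = 1024 * sizeof(float);
    cudaMallocManaged(&buf, bytes);
    cudaMemAdvise(buf, bytes, cudaMemAdviseSetReadMostly, 0);
    cudaMemPrefetchAsync(buf, bytes, 0, 0);
    cudaFree(buf);
    return 0;
}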
cudaDevAttrMaxTexture3DDepth = 26, /**< Maximum 3D texture depth */ - cudaDevAttrMaxTexture2DLayeredWidth = 27, /**< Maximum 2D layered texture width */ - cudaDevAttrMaxTexture2DLayeredHeight = 28, /**< Maximum 2D layered texture height */ - cudaDevAttrMaxTexture2DLayeredLayers = 29, /**< Maximum layers in a 2D layered texture */ - cudaDevAttrSurfaceAlignment = 30, /**< Alignment requirement for surfaces */ - cudaDevAttrConcurrentKernels = 31, /**< Device can possibly execute multiple kernels concurrently */ - cudaDevAttrEccEnabled = 32, /**< Device has ECC support enabled */ - cudaDevAttrPciBusId = 33, /**< PCI bus ID of the device */ - cudaDevAttrPciDeviceId = 34, /**< PCI device ID of the device */ - cudaDevAttrTccDriver = 35, /**< Device is using TCC driver model */ - cudaDevAttrMemoryClockRate = 36, /**< Peak memory clock frequency in kilohertz */ - cudaDevAttrGlobalMemoryBusWidth = 37, /**< Global memory bus width in bits */ - cudaDevAttrL2CacheSize = 38, /**< Size of L2 cache in bytes */ - cudaDevAttrMaxThreadsPerMultiProcessor = 39, /**< Maximum resident threads per multiprocessor */ - cudaDevAttrAsyncEngineCount = 40, /**< Number of asynchronous engines */ - cudaDevAttrUnifiedAddressing = 41, /**< Device shares a unified address space with the host */ - cudaDevAttrMaxTexture1DLayeredWidth = 42, /**< Maximum 1D layered texture width */ - cudaDevAttrMaxTexture1DLayeredLayers = 43, /**< Maximum layers in a 1D layered texture */ - cudaDevAttrMaxTexture2DGatherWidth = 45, /**< Maximum 2D texture width if cudaArrayTextureGather is set */ - cudaDevAttrMaxTexture2DGatherHeight = 46, /**< Maximum 2D texture height if cudaArrayTextureGather is set */ - cudaDevAttrMaxTexture3DWidthAlt = 47, /**< Alternate maximum 3D texture width */ - cudaDevAttrMaxTexture3DHeightAlt = 48, /**< Alternate maximum 3D texture height */ - cudaDevAttrMaxTexture3DDepthAlt = 49, /**< Alternate maximum 3D texture depth */ - cudaDevAttrPciDomainId = 50, /**< PCI domain ID of the device */ - cudaDevAttrTexturePitchAlignment = 51, /**< Pitch alignment requirement for textures */ - cudaDevAttrMaxTextureCubemapWidth = 52, /**< Maximum cubemap texture width/height */ - cudaDevAttrMaxTextureCubemapLayeredWidth = 53, /**< Maximum cubemap layered texture width/height */ - cudaDevAttrMaxTextureCubemapLayeredLayers = 54, /**< Maximum layers in a cubemap layered texture */ - cudaDevAttrMaxSurface1DWidth = 55, /**< Maximum 1D surface width */ - cudaDevAttrMaxSurface2DWidth = 56, /**< Maximum 2D surface width */ - cudaDevAttrMaxSurface2DHeight = 57, /**< Maximum 2D surface height */ - cudaDevAttrMaxSurface3DWidth = 58, /**< Maximum 3D surface width */ - cudaDevAttrMaxSurface3DHeight = 59, /**< Maximum 3D surface height */ - cudaDevAttrMaxSurface3DDepth = 60, /**< Maximum 3D surface depth */ - cudaDevAttrMaxSurface1DLayeredWidth = 61, /**< Maximum 1D layered surface width */ - cudaDevAttrMaxSurface1DLayeredLayers = 62, /**< Maximum layers in a 1D layered surface */ - cudaDevAttrMaxSurface2DLayeredWidth = 63, /**< Maximum 2D layered surface width */ - cudaDevAttrMaxSurface2DLayeredHeight = 64, /**< Maximum 2D layered surface height */ - cudaDevAttrMaxSurface2DLayeredLayers = 65, /**< Maximum layers in a 2D layered surface */ - cudaDevAttrMaxSurfaceCubemapWidth = 66, /**< Maximum cubemap surface width */ - cudaDevAttrMaxSurfaceCubemapLayeredWidth = 67, /**< Maximum cubemap layered surface width */ - cudaDevAttrMaxSurfaceCubemapLayeredLayers = 68, /**< Maximum layers in a cubemap layered surface */ - 
cudaDevAttrMaxTexture1DLinearWidth = 69, /**< Maximum 1D linear texture width */ - cudaDevAttrMaxTexture2DLinearWidth = 70, /**< Maximum 2D linear texture width */ - cudaDevAttrMaxTexture2DLinearHeight = 71, /**< Maximum 2D linear texture height */ - cudaDevAttrMaxTexture2DLinearPitch = 72, /**< Maximum 2D linear texture pitch in bytes */ - cudaDevAttrMaxTexture2DMipmappedWidth = 73, /**< Maximum mipmapped 2D texture width */ - cudaDevAttrMaxTexture2DMipmappedHeight = 74, /**< Maximum mipmapped 2D texture height */ - cudaDevAttrComputeCapabilityMajor = 75, /**< Major compute capability version number */ - cudaDevAttrComputeCapabilityMinor = 76, /**< Minor compute capability version number */ - cudaDevAttrMaxTexture1DMipmappedWidth = 77, /**< Maximum mipmapped 1D texture width */ - cudaDevAttrStreamPrioritiesSupported = 78, /**< Device supports stream priorities */ - cudaDevAttrGlobalL1CacheSupported = 79, /**< Device supports caching globals in L1 */ - cudaDevAttrLocalL1CacheSupported = 80, /**< Device supports caching locals in L1 */ - cudaDevAttrMaxSharedMemoryPerMultiprocessor = 81, /**< Maximum shared memory available per multiprocessor in bytes */ - cudaDevAttrMaxRegistersPerMultiprocessor = 82, /**< Maximum number of 32-bit registers available per multiprocessor */ - cudaDevAttrManagedMemory = 83, /**< Device can allocate managed memory on this system */ - cudaDevAttrIsMultiGpuBoard = 84, /**< Device is on a multi-GPU board */ - cudaDevAttrMultiGpuBoardGroupID = 85, /**< Unique identifier for a group of devices on the same multi-GPU board */ - cudaDevAttrHostNativeAtomicSupported = 86, /**< Link between the device and the host supports native atomic operations */ - cudaDevAttrSingleToDoublePrecisionPerfRatio = 87, /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */ - cudaDevAttrPageableMemoryAccess = 88, /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */ - cudaDevAttrConcurrentManagedAccess = 89, /**< Device can coherently access managed memory concurrently with the CPU */ - cudaDevAttrComputePreemptionSupported = 90, /**< Device supports Compute Preemption */ - cudaDevAttrCanUseHostPointerForRegisteredMem = 91, /**< Device can access host registered memory at the same virtual address as the CPU */ - cudaDevAttrReserved92 = 92, - cudaDevAttrReserved93 = 93, - cudaDevAttrReserved94 = 94, - cudaDevAttrCooperativeLaunch = 95, /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel*/ - cudaDevAttrCooperativeMultiDeviceLaunch = 96, /**< Device can participate in cooperative kernels launched via ::cudaLaunchCooperativeKernelMultiDevice */ - cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 /**< The maximum optin shared memory per block. This value may vary by chip. 
See ::cudaFuncSetAttribute */ -}; - -/** - * CUDA device P2P attributes - */ - -enum __device_builtin__ cudaDeviceP2PAttr { - cudaDevP2PAttrPerformanceRank = 1, /**< A relative value indicating the performance of the link between two devices */ - cudaDevP2PAttrAccessSupported = 2, /**< Peer access is enabled */ - cudaDevP2PAttrNativeAtomicSupported = 3 /**< Native atomic operation over the link supported */ -}; -/** - * CUDA device properties - */ -struct __device_builtin__ cudaDeviceProp -{ - char name[256]; /**< ASCII string identifying device */ - size_t totalGlobalMem; /**< Global memory available on device in bytes */ - size_t sharedMemPerBlock; /**< Shared memory available per block in bytes */ - int regsPerBlock; /**< 32-bit registers available per block */ - int warpSize; /**< Warp size in threads */ - size_t memPitch; /**< Maximum pitch in bytes allowed by memory copies */ - int maxThreadsPerBlock; /**< Maximum number of threads per block */ - int maxThreadsDim[3]; /**< Maximum size of each dimension of a block */ - int maxGridSize[3]; /**< Maximum size of each dimension of a grid */ - int clockRate; /**< Clock frequency in kilohertz */ - size_t totalConstMem; /**< Constant memory available on device in bytes */ - int major; /**< Major compute capability */ - int minor; /**< Minor compute capability */ - size_t textureAlignment; /**< Alignment requirement for textures */ - size_t texturePitchAlignment; /**< Pitch alignment requirement for texture references bound to pitched memory */ - int deviceOverlap; /**< Device can concurrently copy memory and execute a kernel. Deprecated. Use asyncEngineCount instead. */ - int multiProcessorCount; /**< Number of multiprocessors on device */ - int kernelExecTimeoutEnabled; /**< Specifies whether there is a run time limit on kernels */ - int integrated; /**< Device is integrated as opposed to discrete */ - int canMapHostMemory; /**< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer */ - int computeMode; /**< Compute mode (See ::cudaComputeMode) */ - int maxTexture1D; /**< Maximum 1D texture size */ - int maxTexture1DMipmap; /**< Maximum 1D mipmapped texture size */ - int maxTexture1DLinear; /**< Maximum size for 1D textures bound to linear memory */ - int maxTexture2D[2]; /**< Maximum 2D texture dimensions */ - int maxTexture2DMipmap[2]; /**< Maximum 2D mipmapped texture dimensions */ - int maxTexture2DLinear[3]; /**< Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory */ - int maxTexture2DGather[2]; /**< Maximum 2D texture dimensions if texture gather operations have to be performed */ - int maxTexture3D[3]; /**< Maximum 3D texture dimensions */ - int maxTexture3DAlt[3]; /**< Maximum alternate 3D texture dimensions */ - int maxTextureCubemap; /**< Maximum Cubemap texture dimensions */ - int maxTexture1DLayered[2]; /**< Maximum 1D layered texture dimensions */ - int maxTexture2DLayered[3]; /**< Maximum 2D layered texture dimensions */ - int maxTextureCubemapLayered[2];/**< Maximum Cubemap layered texture dimensions */ - int maxSurface1D; /**< Maximum 1D surface size */ - int maxSurface2D[2]; /**< Maximum 2D surface dimensions */ - int maxSurface3D[3]; /**< Maximum 3D surface dimensions */ - int maxSurface1DLayered[2]; /**< Maximum 1D layered surface dimensions */ - int maxSurface2DLayered[3]; /**< Maximum 2D layered surface dimensions */ - int maxSurfaceCubemap; /**< Maximum Cubemap surface dimensions */ - int maxSurfaceCubemapLayered[2];/**< Maximum Cubemap layered surface dimensions */ -
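A matching query sketch for the cudaDeviceAttr and cudaDeviceP2PAttr enums above (device indices are illustrative; error checking elided):

#include <cuda_runtime.h>
#include <cstdio>

int main() {
    int smem = 0, rank = 0, ndev = 0;
    /* per-device scalar attribute query */
    cudaDeviceGetAttribute(&smem, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0);
    cudaGetDeviceCount(&ndev);
    /* P2P attributes are queried for a (source, destination) device pair */
    if (ndev > 1)
        cudaDeviceGetP2PAttribute(&rank, cudaDevP2PAttrPerformanceRank, 0, 1);
    printf("opt-in smem/block: %d bytes, p2p rank 0->1: %d\n", smem, rank);
    return 0;
}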
size_t surfaceAlignment; /**< Alignment requirements for surfaces */ - int concurrentKernels; /**< Device can possibly execute multiple kernels concurrently */ - int ECCEnabled; /**< Device has ECC support enabled */ - int pciBusID; /**< PCI bus ID of the device */ - int pciDeviceID; /**< PCI device ID of the device */ - int pciDomainID; /**< PCI domain ID of the device */ - int tccDriver; /**< 1 if device is a Tesla device using TCC driver, 0 otherwise */ - int asyncEngineCount; /**< Number of asynchronous engines */ - int unifiedAddressing; /**< Device shares a unified address space with the host */ - int memoryClockRate; /**< Peak memory clock frequency in kilohertz */ - int memoryBusWidth; /**< Global memory bus width in bits */ - int l2CacheSize; /**< Size of L2 cache in bytes */ - int maxThreadsPerMultiProcessor;/**< Maximum resident threads per multiprocessor */ - int streamPrioritiesSupported; /**< Device supports stream priorities */ - int globalL1CacheSupported; /**< Device supports caching globals in L1 */ - int localL1CacheSupported; /**< Device supports caching locals in L1 */ - size_t sharedMemPerMultiprocessor; /**< Shared memory available per multiprocessor in bytes */ - int regsPerMultiprocessor; /**< 32-bit registers available per multiprocessor */ - int managedMemory; /**< Device supports allocating managed memory on this system */ - int isMultiGpuBoard; /**< Device is on a multi-GPU board */ - int multiGpuBoardGroupID; /**< Unique identifier for a group of devices on the same multi-GPU board */ - int hostNativeAtomicSupported; /**< Link between the device and the host supports native atomic operations */ - int singleToDoublePrecisionPerfRatio; /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */ - int pageableMemoryAccess; /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */ - int concurrentManagedAccess; /**< Device can coherently access managed memory concurrently with the CPU */ - int computePreemptionSupported; /**< Device supports Compute Preemption */ - int canUseHostPointerForRegisteredMem; /**< Device can access host registered memory at the same virtual address as the CPU */ - int cooperativeLaunch; /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel */ - int cooperativeMultiDeviceLaunch; /**< Device can participate in cooperative kernels launched via ::cudaLaunchCooperativeKernelMultiDevice */ - size_t sharedMemPerBlockOptin; /**< Per device maximum shared memory per block usable by special opt in */ -}; - -#define cudaDevicePropDontCare \ - { \ - {'\0'}, /* char name[256]; */ \ - 0, /* size_t totalGlobalMem; */ \ - 0, /* size_t sharedMemPerBlock; */ \ - 0, /* int regsPerBlock; */ \ - 0, /* int warpSize; */ \ - 0, /* size_t memPitch; */ \ - 0, /* int maxThreadsPerBlock; */ \ - {0, 0, 0}, /* int maxThreadsDim[3]; */ \ - {0, 0, 0}, /* int maxGridSize[3]; */ \ - 0, /* int clockRate; */ \ - 0, /* size_t totalConstMem; */ \ - -1, /* int major; */ \ - -1, /* int minor; */ \ - 0, /* size_t textureAlignment; */ \ - 0, /* size_t texturePitchAlignment */ \ - -1, /* int deviceOverlap; */ \ - 0, /* int multiProcessorCount; */ \ - 0, /* int kernelExecTimeoutEnabled */ \ - 0, /* int integrated */ \ - 0, /* int canMapHostMemory */ \ - 0, /* int computeMode */ \ - 0, /* int maxTexture1D */ \ - 0, /* int maxTexture1DMipmap */ \ - 0, /* int maxTexture1DLinear */ \ - {0, 0}, /* int maxTexture2D[2] */ \ - {0, 0}, /* int 
maxTexture2DMipmap[2] */ \ - {0, 0, 0}, /* int maxTexture2DLinear[3] */ \ - {0, 0}, /* int maxTexture2DGather[2] */ \ - {0, 0, 0}, /* int maxTexture3D[3] */ \ - {0, 0, 0}, /* int maxTexture3DAlt[3] */ \ - 0, /* int maxTextureCubemap */ \ - {0, 0}, /* int maxTexture1DLayered[2] */ \ - {0, 0, 0}, /* int maxTexture2DLayered[3] */ \ - {0, 0}, /* int maxTextureCubemapLayered[2] */ \ - 0, /* int maxSurface1D */ \ - {0, 0}, /* int maxSurface2D[2] */ \ - {0, 0, 0}, /* int maxSurface3D[3] */ \ - {0, 0}, /* int maxSurface1DLayered[2] */ \ - {0, 0, 0}, /* int maxSurface2DLayered[3] */ \ - 0, /* int maxSurfaceCubemap */ \ - {0, 0}, /* int maxSurfaceCubemapLayered[2] */ \ - 0, /* size_t surfaceAlignment */ \ - 0, /* int concurrentKernels */ \ - 0, /* int ECCEnabled */ \ - 0, /* int pciBusID */ \ - 0, /* int pciDeviceID */ \ - 0, /* int pciDomainID */ \ - 0, /* int tccDriver */ \ - 0, /* int asyncEngineCount */ \ - 0, /* int unifiedAddressing */ \ - 0, /* int memoryClockRate */ \ - 0, /* int memoryBusWidth */ \ - 0, /* int l2CacheSize */ \ - 0, /* int maxThreadsPerMultiProcessor */ \ - 0, /* int streamPrioritiesSupported */ \ - 0, /* int globalL1CacheSupported */ \ - 0, /* int localL1CacheSupported */ \ - 0, /* size_t sharedMemPerMultiprocessor; */ \ - 0, /* int regsPerMultiprocessor; */ \ - 0, /* int managedMemory */ \ - 0, /* int isMultiGpuBoard */ \ - 0, /* int multiGpuBoardGroupID */ \ - 0, /* int hostNativeAtomicSupported */ \ - 0, /* int singleToDoublePrecisionPerfRatio */ \ - 0, /* int pageableMemoryAccess */ \ - 0, /* int concurrentManagedAccess */ \ - 0, /* int computePreemptionSupported */ \ - 0, /* int canUseHostPointerForRegisteredMem */ \ - 0, /* int cooperativeLaunch */ \ - 0, /* int cooperativeMultiDeviceLaunch */ \ - 0, /* size_t sharedMemPerBlockOptin */ \ - } /**< Empty device properties */ - -/** - * CUDA IPC Handle Size - */ -#define CUDA_IPC_HANDLE_SIZE 64 - -/** - * CUDA IPC event handle - */ -typedef __device_builtin__ struct __device_builtin__ cudaIpcEventHandle_st -{ - char reserved[CUDA_IPC_HANDLE_SIZE]; -}cudaIpcEventHandle_t; - -/** - * CUDA IPC memory handle - */ -typedef __device_builtin__ struct __device_builtin__ cudaIpcMemHandle_st -{ - char reserved[CUDA_IPC_HANDLE_SIZE]; -}cudaIpcMemHandle_t; - -/******************************************************************************* -* * -* SHORTHAND TYPE DEFINITION USED BY RUNTIME API * -* * -*******************************************************************************/ - -/** - * CUDA Error types - */ -typedef __device_builtin__ enum cudaError cudaError_t; - -/** - * CUDA stream - */ -typedef __device_builtin__ struct CUstream_st *cudaStream_t; - -/** - * CUDA event types - */ -typedef __device_builtin__ struct CUevent_st *cudaEvent_t; - -/** - * CUDA graphics resource types - */ -typedef __device_builtin__ struct cudaGraphicsResource *cudaGraphicsResource_t; - -/** - * CUDA UUID types - */ -typedef __device_builtin__ struct CUuuid_st cudaUUID_t; - -/** - * CUDA output file modes - */ -typedef __device_builtin__ enum cudaOutputMode cudaOutputMode_t; - -/** - * CUDA cooperative group scope - */ -enum __device_builtin__ cudaCGScope { - cudaCGScopeInvalid = 0, /**< Invalid cooperative group scope */ - cudaCGScopeGrid = 1, /**< Scope represented by a grid_group */ - cudaCGScopeMultiGrid = 2 /**< Scope represented by a multi_grid_group */ -}; - -/** - * CUDA launch parameters - */ -struct __device_builtin__ cudaLaunchParams -{ - void *func; /**< Device function symbol */ - dim3 gridDim; /**< Grid dimensions */ - dim3 blockDim;
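For reference, a minimal enumeration sketch for the cudaDeviceProp structure above (illustrative only; error checking elided):

#include <cuda_runtime.h>
#include <cstdio>

int main() {
    int n = 0;
    cudaGetDeviceCount(&n);
    for (int i = 0; i < n; ++i) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        /* name, compute capability, and global memory as declared above */
        printf("%d: %s (sm_%d%d), %zu MiB\n", i, prop.name, prop.major,
               prop.minor, prop.totalGlobalMem >> 20);
    }
    return 0;
}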
/**< Block dimensions */ - void **args; /**< Arguments */ - size_t sharedMem; /**< Shared memory */ - cudaStream_t stream; /**< Stream identifier */ -}; - -/** @} */ -/** @} */ /* END CUDART_TYPES */ - -#endif /* !__DRIVER_TYPES_H__ */ diff --git a/include/triton/external/CUDA/host_config.h b/include/triton/external/CUDA/host_config.h deleted file mode 100755 index 1bd79e551..000000000 --- a/include/triton/external/CUDA/host_config.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright 1993-2017 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. - */ - -#include "crt/host_config.h" diff --git a/include/triton/external/CUDA/host_defines.h b/include/triton/external/CUDA/host_defines.h deleted file mode 100755 index 33507ae51..000000000 --- a/include/triton/external/CUDA/host_defines.h +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright 1993-2017 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S.
and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. - */ - -#include "crt/host_defines.h" diff --git a/include/triton/external/CUDA/library_types.h b/include/triton/external/CUDA/library_types.h deleted file mode 100755 index c36f0d18c..000000000 --- a/include/triton/external/CUDA/library_types.h +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright 1993-2015 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. 
- * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. - */ - -#if !defined(__LIBRARY_TYPES_H__) -#define __LIBRARY_TYPES_H__ - - -typedef enum cudaDataType_t -{ - CUDA_R_16F= 2, /* real as a half */ - CUDA_C_16F= 6, /* complex as a pair of half numbers */ - CUDA_R_32F= 0, /* real as a float */ - CUDA_C_32F= 4, /* complex as a pair of float numbers */ - CUDA_R_64F= 1, /* real as a double */ - CUDA_C_64F= 5, /* complex as a pair of double numbers */ - CUDA_R_8I = 3, /* real as a signed char */ - CUDA_C_8I = 7, /* complex as a pair of signed char numbers */ - CUDA_R_8U = 8, /* real as a unsigned char */ - CUDA_C_8U = 9, /* complex as a pair of unsigned char numbers */ - CUDA_R_32I= 10, /* real as a signed int */ - CUDA_C_32I= 11, /* complex as a pair of signed int numbers */ - CUDA_R_32U= 12, /* real as a unsigned int */ - CUDA_C_32U= 13 /* complex as a pair of unsigned int numbers */ -} cudaDataType; - - -typedef enum libraryPropertyType_t -{ - MAJOR_VERSION, - MINOR_VERSION, - PATCH_LEVEL -} libraryPropertyType; - -#endif /* !__LIBRARY_TYPES_H__ */ diff --git a/include/triton/external/CUDA/nvrtc.h b/include/triton/external/CUDA/nvrtc.h deleted file mode 100755 index 1d2acd272..000000000 --- a/include/triton/external/CUDA/nvrtc.h +++ /dev/null @@ -1,525 +0,0 @@ -// -// NVIDIA_COPYRIGHT_BEGIN -// -// Copyright (c) 2014-2017, NVIDIA CORPORATION. All rights reserved. -// -// NVIDIA CORPORATION and its licensors retain all intellectual property -// and proprietary rights in and to this software, related documentation -// and any modifications thereto. Any use, reproduction, disclosure or -// distribution of this software and related documentation without an express -// license agreement from NVIDIA CORPORATION is strictly prohibited. 
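As a usage note for the cudaDataType tags above, a small hypothetical helper (the function and its name are the editor's illustration, not part of any header) mapping each tag to its element size in bytes:

#include <cstddef>
/* assumes library_types.h (the enum above) is in scope */

static size_t elementSize(cudaDataType t) {
    switch (t) {
    case CUDA_R_8I:  case CUDA_R_8U:                    return 1;
    case CUDA_R_16F: case CUDA_C_8I:  case CUDA_C_8U:   return 2;  /* complex 8-bit = 2 x 1 byte */
    case CUDA_R_32F: case CUDA_R_32I: case CUDA_R_32U:
    case CUDA_C_16F:                                    return 4;
    case CUDA_R_64F: case CUDA_C_32F: case CUDA_C_32I:
    case CUDA_C_32U:                                    return 8;
    case CUDA_C_64F:                                    return 16;
    }
    return 0; /* unreachable for valid tags */
}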
-// -// NVIDIA_COPYRIGHT_END -// - -#ifndef __NVRTC_H__ -#define __NVRTC_H__ - -#ifdef __cplusplus -extern "C" { -#endif /* __cplusplus */ - -#include - - -/*************************************************************************//** - * - * \defgroup error Error Handling - * - * NVRTC defines the following enumeration type and function for API call - * error handling. - * - ****************************************************************************/ - - -/** - * \ingroup error - * \brief The enumerated type nvrtcResult defines API call result codes. - * NVRTC API functions return nvrtcResult to indicate the call - * result. - */ -typedef enum { - NVRTC_SUCCESS = 0, - NVRTC_ERROR_OUT_OF_MEMORY = 1, - NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2, - NVRTC_ERROR_INVALID_INPUT = 3, - NVRTC_ERROR_INVALID_PROGRAM = 4, - NVRTC_ERROR_INVALID_OPTION = 5, - NVRTC_ERROR_COMPILATION = 6, - NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7, - NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8, - NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9, - NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10, - NVRTC_ERROR_INTERNAL_ERROR = 11 -} nvrtcResult; - - -/** - * \ingroup error - * \brief nvrtcGetErrorString is a helper function that returns a string - * describing the given nvrtcResult code, e.g., NVRTC_SUCCESS to - * \c "NVRTC_SUCCESS". - * For unrecognized enumeration values, it returns - * \c "NVRTC_ERROR unknown". - * - * \param [in] result CUDA Runtime Compilation API result code. - * \return Message string for the given #nvrtcResult code. - */ -const char *nvrtcGetErrorString(nvrtcResult result); - - -/*************************************************************************//** - * - * \defgroup query General Information Query - * - * NVRTC defines the following function for general information query. - * - ****************************************************************************/ - - -/** - * \ingroup query - * \brief nvrtcVersion sets the output parameters \p major and \p minor - * with the CUDA Runtime Compilation version number. - * - * \param [out] major CUDA Runtime Compilation major version number. - * \param [out] minor CUDA Runtime Compilation minor version number. - * \return - * - \link #nvrtcResult NVRTC_SUCCESS \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink - * - */ -nvrtcResult nvrtcVersion(int *major, int *minor); - - -/*************************************************************************//** - * - * \defgroup compilation Compilation - * - * NVRTC defines the following type and functions for actual compilation. - * - ****************************************************************************/ - - -/** - * \ingroup compilation - * \brief nvrtcProgram is the unit of compilation, and an opaque handle for - * a program. - * - * To compile a CUDA program string, an instance of nvrtcProgram must be - * created first with ::nvrtcCreateProgram, then compiled with - * ::nvrtcCompileProgram. - */ -typedef struct _nvrtcProgram *nvrtcProgram; - - -/** - * \ingroup compilation - * \brief nvrtcCreateProgram creates an instance of nvrtcProgram with the - * given input parameters, and sets the output parameter \p prog with - * it. - * - * \param [out] prog CUDA Runtime Compilation program. - * \param [in] src CUDA program source. - * \param [in] name CUDA program name.\n - * \p name can be \c NULL; \c "default_program" is - * used when \p name is \c NULL. - * \param [in] numHeaders Number of headers used.\n - * \p numHeaders must be greater than or equal to 0. 
- * \param [in] headers Sources of the headers.\n - * \p headers can be \c NULL when \p numHeaders is - * 0. - * \param [in] includeNames Name of each header by which they can be - * included in the CUDA program source.\n - * \p includeNames can be \c NULL when \p numHeaders - * is 0. - * \return - * - \link #nvrtcResult NVRTC_SUCCESS \endlink - * - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink - * - \link #nvrtcResult NVRTC_ERROR_PROGRAM_CREATION_FAILURE \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink - * - * \see ::nvrtcDestroyProgram - */ -nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, - const char *src, - const char *name, - int numHeaders, - const char * const *headers, - const char * const *includeNames); - - -/** - * \ingroup compilation - * \brief nvrtcDestroyProgram destroys the given program. - * - * \param [in] prog CUDA Runtime Compilation program. - * \return - * - \link #nvrtcResult NVRTC_SUCCESS \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink - * - * \see ::nvrtcCreateProgram - */ -nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog); - - -/** - * \ingroup compilation - * \brief nvrtcCompileProgram compiles the given program. - * - * It supports compile options listed in \ref options. - */ -nvrtcResult nvrtcCompileProgram(nvrtcProgram prog, - int numOptions, const char * const *options); - - -/** - * \ingroup compilation - * \brief nvrtcGetPTXSize sets \p ptxSizeRet with the size of the PTX - * generated by the previous compilation of \p prog (including the - * trailing \c NULL). - * - * \param [in] prog CUDA Runtime Compilation program. - * \param [out] ptxSizeRet Size of the generated PTX (including the trailing - * \c NULL). - * \return - * - \link #nvrtcResult NVRTC_SUCCESS \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink - * - * \see ::nvrtcGetPTX - */ -nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet); - - -/** - * \ingroup compilation - * \brief nvrtcGetPTX stores the PTX generated by the previous compilation - * of \p prog in the memory pointed by \p ptx. - * - * \param [in] prog CUDA Runtime Compilation program. - * \param [out] ptx Compiled result. - * \return - * - \link #nvrtcResult NVRTC_SUCCESS \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink - * - * \see ::nvrtcGetPTXSize - */ -nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx); - - -/** - * \ingroup compilation - * \brief nvrtcGetProgramLogSize sets \p logSizeRet with the size of the - * log generated by the previous compilation of \p prog (including the - * trailing \c NULL). - * - * Note that compilation log may be generated with warnings and informative - * messages, even when the compilation of \p prog succeeds. - * - * \param [in] prog CUDA Runtime Compilation program. - * \param [out] logSizeRet Size of the compilation log - * (including the trailing \c NULL). 
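Putting the creation, compilation, and PTX-query entry points above together, a minimal host-side sketch (the kernel source string, program name, and architecture option are illustrative; error checking is elided):

#include <nvrtc.h>
#include <string>
#include <cstdio>

int main() {
    const char *src = "__global__ void k(float *x) { x[threadIdx.x] *= 2.f; }";
    nvrtcProgram prog;
    nvrtcCreateProgram(&prog, src, "example.cu", 0, NULL, NULL);

    const char *opts[] = { "--gpu-architecture=compute_30" };
    nvrtcResult rc = nvrtcCompileProgram(prog, 1, opts);

    /* the log may be non-empty even on success */
    size_t logSize = 0;
    nvrtcGetProgramLogSize(prog, &logSize);
    std::string log(logSize, '\0');
    nvrtcGetProgramLog(prog, &log[0]);
    if (rc != NVRTC_SUCCESS) { printf("%s\n", log.c_str()); return 1; }

    size_t ptxSize = 0;
    nvrtcGetPTXSize(prog, &ptxSize);
    std::string ptx(ptxSize, '\0');
    nvrtcGetPTX(prog, &ptx[0]);
    nvrtcDestroyProgram(&prog);
    return 0;
}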
- \return - - \link #nvrtcResult NVRTC_SUCCESS \endlink - - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink - - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink - * - * \see ::nvrtcGetProgramLog - */ -nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet); - - -/** - * \ingroup compilation - * \brief nvrtcGetProgramLog stores the log generated by the previous - * compilation of \p prog in the memory pointed by \p log. - * - * \param [in] prog CUDA Runtime Compilation program. - * \param [out] log Compilation log. - * \return - * - \link #nvrtcResult NVRTC_SUCCESS \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink - * - * \see ::nvrtcGetProgramLogSize - */ -nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log); - - -/** - * \ingroup compilation - * \brief nvrtcAddNameExpression notes the given name expression - * denoting a __global__ function or function template - * instantiation. - * - * The identical name expression string must be provided on a subsequent - * call to nvrtcGetLoweredName to extract the lowered name. - * \param [in] prog CUDA Runtime Compilation program. - * \param [in] name_expression constant expression denoting a __global__ - * function or function template instantiation. - * \return - * - \link #nvrtcResult NVRTC_SUCCESS \endlink - * - \link #nvrtcResult NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION \endlink - * - * \see ::nvrtcGetLoweredName - */ -nvrtcResult nvrtcAddNameExpression(nvrtcProgram prog, - const char * const name_expression); - -/** - * \ingroup compilation - * \brief nvrtcGetLoweredName extracts the lowered (mangled) name - * for a __global__ function or function template instantiation, - * and updates *lowered_name to point to it. The memory containing - * the name is released when the NVRTC program is destroyed by - * nvrtcDestroyProgram. - * The identical name expression must have been previously - * provided to nvrtcAddNameExpression. - * - * \param [in] prog CUDA Runtime Compilation program. - * \param [in] name_expression constant expression denoting a __global__ - * function or function template instantiation. - * \param [out] lowered_name initialized by the function to point to a - * C string containing the lowered (mangled) - * name corresponding to the provided name expression. - * \return - * - \link #nvrtcResult NVRTC_SUCCESS \endlink - * - \link #nvrtcResult NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION \endlink - * - \link #nvrtcResult NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID \endlink - * - * \see ::nvrtcAddNameExpression - */ -nvrtcResult nvrtcGetLoweredName(nvrtcProgram prog, - const char *const name_expression, - const char** lowered_name); - - -/** - * \defgroup options Supported Compile Options - * - * NVRTC supports the compile options below. - * Option names with two preceding dashes (\c --) are long option names and - * option names with one preceding dash (\c -) are short option names. - * Short option names can be used instead of long option names. - * When a compile option takes an argument, an assignment operator (\c =) - * is used to separate the compile option argument from the compile option - * name, e.g., \c "--gpu-architecture=compute_30". - * Alternatively, the compile option name and the argument can be specified in - * separate strings without an assignment operator, e.g., - * \c "--gpu-architecture" \c "compute_30".
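The two name-expression entry points above bracket the compile call; a minimal sketch (an excerpt assuming the same prog as in the previous sketch, and a kernel template k instantiable as k<float> from the program source):

/* before nvrtcCompileProgram: register the name expression */
nvrtcAddNameExpression(prog, "k<float>");
/* ... nvrtcCompileProgram(prog, numOptions, options) ... */
/* after compilation: fetch the mangled name; the string stays valid
   until nvrtcDestroyProgram releases it */
const char *lowered = NULL;
nvrtcGetLoweredName(prog, "k<float>", &lowered);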
- * Single-character short option names, such as \c -D, \c -U, and \c -I, do - * not require an assignment operator, and the compile option name and the - * argument can be present in the same string with or without spaces between - * them. - * For instance, \c "-D=", \c "-D", and \c "-D " are all - * supported. - * - * The valid compiler options are: - * - * - Compilation targets - * - \c --gpu-architecture=\ (\c -arch)\n - * Specify the name of the class of GPU architectures for which the - * input must be compiled.\n - * - Valid \s: - * - \c compute_30 - * - \c compute_32 - * - \c compute_35 - * - \c compute_37 - * - \c compute_50 - * - \c compute_52 - * - \c compute_53 - * - \c compute_60 - * - \c compute_61 - * - \c compute_62 - * - \c compute_70 - * - \c compute_72 - * - Default: \c compute_30 - * - Separate compilation / whole-program compilation - * - \c --device-c (\c -dc)\n - * Generate relocatable code that can be linked with other relocatable - * device code. It is equivalent to --relocatable-device-code=true. - * - \c --device-w (\c -dw)\n - * Generate non-relocatable code. It is equivalent to - * \c --relocatable-device-code=false. - * - \c --relocatable-device-code={true|false} (\c -rdc)\n - * Enable (disable) the generation of relocatable device code. - * - Default: \c false - * - Debugging support - * - \c --device-debug (\c -G)\n - * Generate debug information. - * - \c --generate-line-info (\c -lineinfo)\n - * Generate line-number information. - * - Code generation - * - \c --maxrregcount=\ (\c -maxrregcount)\n - * Specify the maximum amount of registers that GPU functions can use. - * Until a function-specific limit, a higher value will generally - * increase the performance of individual GPU threads that execute this - * function. However, because thread registers are allocated from a - * global register pool on each GPU, a higher value of this option will - * also reduce the maximum thread block size, thereby reducing the amount - * of thread parallelism. Hence, a good maxrregcount value is the result - * of a trade-off. If this option is not specified, then no maximum is - * assumed. Value less than the minimum registers required by ABI will - * be bumped up by the compiler to ABI minimum limit. - * - \c --ftz={true|false} (\c -ftz)\n - * When performing single-precision floating-point operations, flush - * denormal values to zero or preserve denormal values. - * \c --use_fast_math implies \c --ftz=true. - * - Default: \c false - * - \c --prec-sqrt={true|false} (\c -prec-sqrt)\n - * For single-precision floating-point square root, use IEEE - * round-to-nearest mode or use a faster approximation. - * \c --use_fast_math implies \c --prec-sqrt=false. - * - Default: \c true - * - \c --prec-div={true|false} (\c -prec-div)\n - * For single-precision floating-point division and reciprocals, use IEEE - * round-to-nearest mode or use a faster approximation. - * \c --use_fast_math implies \c --prec-div=false. - * - Default: \c true - * - \c --fmad={true|false} (\c -fmad)\n - * Enables (disables) the contraction of floating-point multiplies and - * adds/subtracts into floating-point multiply-add operations (FMAD, - * FFMA, or DFMA). \c --use_fast_math implies \c --fmad=true. - * - Default: \c true - * - \c --use_fast_math (\c -use_fast_math)\n - * Make use of fast math operations. - * \c --use_fast_math implies \c --ftz=true \c --prec-div=false - * \c --prec-sqrt=false \c --fmad=true. - * - Preprocessing - * - \c --define-macro=\ (\c -D)\n - * \c \ can be either \c \ or \c \. 
- * - \c \ \n - * Predefine \c \ as a macro with definition \c 1. - * - \c \=\ \n - * The contents of \c \ are tokenized and preprocessed - * as if they appeared during translation phase three in a \c \#define - * directive. In particular, the definition will be truncated by - * embedded new line characters. - * - \c --undefine-macro=\ (\c -U)\n - * Cancel any previous definition of \c \. - * - \c --include-path=\ (\c -I)\n - * Add the directory \c \ to the list of directories to be - * searched for headers. These paths are searched after the list of - * headers given to ::nvrtcCreateProgram. - * - \c --pre-include=\ (\c -include)\n - * Preinclude \c \ during preprocessing. - * - Language Dialect - * - \c --std={c++11|c++14} (\c -std={c++11|c++14})\n - * Set language dialect to C++11 or C++14. - * - \c --builtin-move-forward={true|false} (\c -builtin-move-forward)\n - * Provide builtin definitions of \c std::move and \c std::forward, - * when C++11 language dialect is selected. - * - Default: \c true - * - \c --builtin-initializer-list={true|false} - * (\c -builtin-initializer-list)\n - * Provide builtin definitions of \c std::initializer_list class and - * member functions when C++11 language dialect is selected. - * - Default: \c true - * - Misc. - * - \c --disable-warnings (\c -w)\n - * Inhibit all warning messages. - * - \c --restrict (\c -restrict)\n - * Programmer assertion that all kernel pointer parameters are restrict - * pointers. - * - \c --device-as-default-execution-space - * (\c -default-device)\n - * Treat entities with no execution space annotation as \c __device__ - * entities. - * - * \param [in] prog CUDA Runtime Compilation program. - * \param [in] numOptions Number of compiler options passed. - * \param [in] options Compiler options in the form of C string array.\n - * \p options can be \c NULL when \p numOptions is 0. - * - * \return - * - \link #nvrtcResult NVRTC_SUCCESS \endlink - * - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink - * - \link #nvrtcResult NVRTC_ERROR_INVALID_OPTION \endlink - * - \link #nvrtcResult NVRTC_ERROR_COMPILATION \endlink - * - \link #nvrtcResult NVRTC_ERROR_BUILTIN_OPERATION_FAILURE \endlink - */ - - -#ifdef __cplusplus -} -#endif /* __cplusplus */ - - -/* The utility function 'nvrtcGetTypeName' is not available by default. Define - the macro 'NVRTC_GET_TYPE_NAME' to a non-zero value to make it available. -*/ - -#if NVRTC_GET_TYPE_NAME || __DOXYGEN_ONLY__ - -#if NVRTC_USE_CXXABI || __clang__ || __GNUC__ || __DOXYGEN_ONLY__ -#include -#include - -#elif defined(_WIN32) -#include -#include -#endif /* NVRTC_USE_CXXABI || __clang__ || __GNUC__ */ - - -#include -#include - - -/*************************************************************************//** - * - * \defgroup hosthelper Host Helper - * - * NVRTC defines the following functions for easier interaction with host code. - * - ****************************************************************************/ - -/** - * \ingroup hosthelper - * \brief nvrtcGetTypeName stores the source level name of the template type argument - * T in the given std::string location. - * - * This function is only provided when the macro NVRTC_GET_TYPE_NAME is - * defined with a non-zero value. It uses abi::__cxa_demangle or UnDecorateSymbolName - * function calls to extract the type name, when using gcc/clang or cl.exe compilers, - * respectively. 
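A usage sketch for nvrtcGetTypeName as documented above (assumes a host compiler covered by one of the two demangling branches; the queried type is arbitrary):

#define NVRTC_GET_TYPE_NAME 1
#include <nvrtc.h>
#include <string>
#include <cstdio>

int main() {
    std::string name;
    /* demangles typeid(T).name() into a source-level type name */
    if (nvrtcGetTypeName<unsigned long>(&name) == NVRTC_SUCCESS)
        printf("%s\n", name.c_str());
    return 0;
}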
If the name extraction fails, it will return NVRTC_ERROR_INTERNAL_ERROR, - * otherwise *result is initialized with the extracted name. - * - * \param [in] result: pointer to std::string in which to store the type name. - * \return - * - \link #nvrtcResult NVRTC_SUCCESS \endlink - * - \link #nvrtcResult NVRTC_ERROR_INTERNAL_ERROR \endlink - * - */ - -template <typename T> -nvrtcResult nvrtcGetTypeName(std::string *result) -{ - const char *name = typeid(T).name(); - -#if USE_CXXABI || __clang__ || __GNUC__ - int status; - char *undecorated_name = abi::__cxa_demangle(name, 0, 0, &status); - if (status == 0) { - *result = undecorated_name; - free(undecorated_name); - return NVRTC_SUCCESS; - } -#elif defined(_WIN32) - char undecorated_name[4096]; - if(UnDecorateSymbolName(name, undecorated_name, - sizeof(undecorated_name) / sizeof(*undecorated_name), - UNDNAME_COMPLETE) ) { - *result = undecorated_name; - return NVRTC_SUCCESS; - } -#endif /* USE_CXXABI || __clang__ || __GNUC__ */ - return NVRTC_ERROR_INTERNAL_ERROR; -} -#endif /* NVRTC_GET_TYPE_NAME */ - -#endif /* __NVRTC_H__ */ diff --git a/include/triton/external/CUDA/surface_types.h b/include/triton/external/CUDA/surface_types.h deleted file mode 100755 index 95ff57ca1..000000000 --- a/include/triton/external/CUDA/surface_types.h +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S.
Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. - */ - -#if !defined(__SURFACE_TYPES_H__) -#define __SURFACE_TYPES_H__ - -/******************************************************************************* -* * -* * -* * -*******************************************************************************/ - -#include "driver_types.h" - -/** - * \addtogroup CUDART_TYPES - * - * @{ - */ - -/******************************************************************************* -* * -* * -* * -*******************************************************************************/ - -#define cudaSurfaceType1D 0x01 -#define cudaSurfaceType2D 0x02 -#define cudaSurfaceType3D 0x03 -#define cudaSurfaceTypeCubemap 0x0C -#define cudaSurfaceType1DLayered 0xF1 -#define cudaSurfaceType2DLayered 0xF2 -#define cudaSurfaceTypeCubemapLayered 0xFC - -/** - * CUDA Surface boundary modes - */ -enum __device_builtin__ cudaSurfaceBoundaryMode -{ - cudaBoundaryModeZero = 0, /**< Zero boundary mode */ - cudaBoundaryModeClamp = 1, /**< Clamp boundary mode */ - cudaBoundaryModeTrap = 2 /**< Trap boundary mode */ -}; - -/** - * CUDA Surface format modes - */ -enum __device_builtin__ cudaSurfaceFormatMode -{ - cudaFormatModeForced = 0, /**< Forced format mode */ - cudaFormatModeAuto = 1 /**< Auto format mode */ -}; - -/** - * CUDA Surface reference - */ -struct __device_builtin__ surfaceReference -{ - /** - * Channel descriptor for surface reference - */ - struct cudaChannelFormatDesc channelDesc; -}; - -/** - * An opaque value that represents a CUDA Surface object - */ -typedef __device_builtin__ unsigned long long cudaSurfaceObject_t; - -/** @} */ -/** @} */ /* END CUDART_TYPES */ - -#endif /* !__SURFACE_TYPES_H__ */ diff --git a/include/triton/external/CUDA/texture_types.h b/include/triton/external/CUDA/texture_types.h deleted file mode 100755 index dda31dd72..000000000 --- a/include/triton/external/CUDA/texture_types.h +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
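For reference, a host-side sketch of how the surface types above are typically used with the object API (the array size and channel format are illustrative; error checking elided):

#include <cuda_runtime.h>

int main() {
    cudaChannelFormatDesc ch = cudaCreateChannelDesc<float>();
    cudaArray_t arr = NULL;
    /* the array must be created with the surface load/store flag */
    cudaMallocArray(&arr, &ch, 256, 256, cudaArraySurfaceLoadStore);

    cudaResourceDesc rd = {};
    rd.resType = cudaResourceTypeArray;
    rd.res.array.array = arr;
    cudaSurfaceObject_t surf = 0;
    cudaCreateSurfaceObject(&surf, &rd);
    /* device code would then write via e.g.
       surf2Dwrite(v, surf, x * sizeof(float), y, cudaBoundaryModeClamp) */
    cudaDestroySurfaceObject(surf);
    cudaFreeArray(arr);
    return 0;
}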
- * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. - */ - -#if !defined(__TEXTURE_TYPES_H__) -#define __TEXTURE_TYPES_H__ - -/******************************************************************************* -* * -* * -* * -*******************************************************************************/ - -#include "driver_types.h" - -/** - * \addtogroup CUDART_TYPES - * - * @{ - */ - -/******************************************************************************* -* * -* * -* * -*******************************************************************************/ - -#define cudaTextureType1D 0x01 -#define cudaTextureType2D 0x02 -#define cudaTextureType3D 0x03 -#define cudaTextureTypeCubemap 0x0C -#define cudaTextureType1DLayered 0xF1 -#define cudaTextureType2DLayered 0xF2 -#define cudaTextureTypeCubemapLayered 0xFC - -/** - * CUDA texture address modes - */ -enum __device_builtin__ cudaTextureAddressMode -{ - cudaAddressModeWrap = 0, /**< Wrapping address mode */ - cudaAddressModeClamp = 1, /**< Clamp to edge address mode */ - cudaAddressModeMirror = 2, /**< Mirror address mode */ - cudaAddressModeBorder = 3 /**< Border address mode */ -}; - -/** - * CUDA texture filter modes - */ -enum __device_builtin__ cudaTextureFilterMode -{ - cudaFilterModePoint = 0, /**< Point filter mode */ - cudaFilterModeLinear = 1 /**< Linear filter mode */ -}; - -/** - * CUDA texture read modes - */ -enum __device_builtin__ cudaTextureReadMode -{ - cudaReadModeElementType = 0, /**< Read texture as specified element type */ - cudaReadModeNormalizedFloat = 1 /**< Read texture as normalized float */ -}; - -/** - * CUDA texture reference - */ -struct __device_builtin__ textureReference -{ - /** - * Indicates whether texture reads are normalized or not - */ - int normalized; - /** - * Texture filter mode - */ - enum cudaTextureFilterMode filterMode; - /** - * Texture address mode for up to 3 dimensions - */ - enum cudaTextureAddressMode addressMode[3]; - /** - * Channel descriptor for the texture reference - */ - struct cudaChannelFormatDesc channelDesc; - /** - * Perform sRGB->linear conversion during texture read - */ - int sRGB; - /** - * Limit to the anisotropy ratio - */ - unsigned int maxAnisotropy; - /** - * Mipmap filter mode - */ - enum cudaTextureFilterMode mipmapFilterMode; - /** - * Offset applied to the supplied mipmap 
level - */ - float mipmapLevelBias; - /** - * Lower end of the mipmap level range to clamp access to - */ - float minMipmapLevelClamp; - /** - * Upper end of the mipmap level range to clamp access to - */ - float maxMipmapLevelClamp; - int __cudaReserved[15]; -}; - -/** - * CUDA texture descriptor - */ -struct __device_builtin__ cudaTextureDesc -{ - /** - * Texture address mode for up to 3 dimensions - */ - enum cudaTextureAddressMode addressMode[3]; - /** - * Texture filter mode - */ - enum cudaTextureFilterMode filterMode; - /** - * Texture read mode - */ - enum cudaTextureReadMode readMode; - /** - * Perform sRGB->linear conversion during texture read - */ - int sRGB; - /** - * Texture Border Color - */ - float borderColor[4]; - /** - * Indicates whether texture reads are normalized or not - */ - int normalizedCoords; - /** - * Limit to the anisotropy ratio - */ - unsigned int maxAnisotropy; - /** - * Mipmap filter mode - */ - enum cudaTextureFilterMode mipmapFilterMode; - /** - * Offset applied to the supplied mipmap level - */ - float mipmapLevelBias; - /** - * Lower end of the mipmap level range to clamp access to - */ - float minMipmapLevelClamp; - /** - * Upper end of the mipmap level range to clamp access to - */ - float maxMipmapLevelClamp; -}; - -/** - * An opaque value that represents a CUDA texture object - */ -typedef __device_builtin__ unsigned long long cudaTextureObject_t; - -/** @} */ -/** @} */ /* END CUDART_TYPES */ - -#endif /* !__TEXTURE_TYPES_H__ */ diff --git a/include/triton/external/CUDA/vector_functions.h b/include/triton/external/CUDA/vector_functions.h deleted file mode 100755 index 8ffb37122..000000000 --- a/include/triton/external/CUDA/vector_functions.h +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. 
These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. - */ - -#if !defined(__VECTOR_FUNCTIONS_H__) -#define __VECTOR_FUNCTIONS_H__ - -/******************************************************************************* -* * -* * -* * -*******************************************************************************/ - -#include "builtin_types.h" -#include "host_defines.h" -#include "vector_types.h" - -#if defined(__CUDACC_RTC__) -#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__ -#else /* !__CUDACC_RTC__ */ -#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__ -#endif /* __CUDACC_RTC__ */ - -/******************************************************************************* -* * -* * -* * -*******************************************************************************/ - -__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x); - -__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x); - -__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y); - -__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y); - -__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z); - -__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z); - -__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w); - -__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w); - -__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x); - -__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x); - -__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y); - -__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y); - -__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z); - -__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z); - -__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w); - -__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w); - -__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x); - -__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x); - -__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y); - -__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y); - -__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z); - -__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z); - -__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w); - -__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w); - -__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x); - 
-__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x); - -__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y); - -__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y); - -__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z); - -__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z); - -__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w); - -__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w); - -__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x); - -__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y); - -__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z); - -__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w); - -__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x); - -__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x); - -__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y); - -__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y); - -__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z); - -__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z); - -__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w); - -__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w); - -__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x); - -__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y); - -__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z); - -__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w); - -#undef __VECTOR_FUNCTIONS_DECL__ - -#if !defined(__CUDACC_RTC__) -#include "vector_functions.hpp" -#endif /* !__CUDACC_RTC__ */ - -#endif /* !__VECTOR_FUNCTIONS_H__ */ diff --git a/include/triton/external/CUDA/vector_functions.hpp b/include/triton/external/CUDA/vector_functions.hpp deleted file mode 100755 index 2ee5d5890..000000000 --- a/include/triton/external/CUDA/vector_functions.hpp +++ /dev/null @@ -1,318 +0,0 @@ -/* - * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. 
- * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. - */ - -#if !defined(__VECTOR_FUNCTIONS_HPP__) -#define __VECTOR_FUNCTIONS_HPP__ - -/******************************************************************************* -* * -* * -* * -*******************************************************************************/ - -#include "builtin_types.h" -#include "host_defines.h" -#include "vector_types.h" - -#if defined(__CUDACC_RTC__) -#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__ -#else /* !__CUDACC_RTC__ */ -#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__ -#endif /* __CUDACC_RTC__ */ - -/******************************************************************************* -* * -* * -* * -*******************************************************************************/ - -__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x) -{ - char1 t; t.x = x; return t; -} - -__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x) -{ - uchar1 t; t.x = x; return t; -} - -__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y) -{ - char2 t; t.x = x; t.y = y; return t; -} - -__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y) -{ - uchar2 t; t.x = x; t.y = y; return t; -} - -__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z) -{ - char3 t; t.x = x; t.y = y; t.z = z; return t; -} - -__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z) -{ - uchar3 t; t.x = x; t.y = y; t.z = z; return t; -} - -__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w) -{ - char4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; -} - -__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w) -{ - uchar4 t; t.x = x; t.y = y; t.z = z; t.w = 
w; return t; -} - -__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x) -{ - short1 t; t.x = x; return t; -} - -__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x) -{ - ushort1 t; t.x = x; return t; -} - -__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y) -{ - short2 t; t.x = x; t.y = y; return t; -} - -__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y) -{ - ushort2 t; t.x = x; t.y = y; return t; -} - -__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z) -{ - short3 t; t.x = x; t.y = y; t.z = z; return t; -} - -__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z) -{ - ushort3 t; t.x = x; t.y = y; t.z = z; return t; -} - -__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w) -{ - short4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; -} - -__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w) -{ - ushort4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; -} - -__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x) -{ - int1 t; t.x = x; return t; -} - -__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x) -{ - uint1 t; t.x = x; return t; -} - -__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y) -{ - int2 t; t.x = x; t.y = y; return t; -} - -__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y) -{ - uint2 t; t.x = x; t.y = y; return t; -} - -__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z) -{ - int3 t; t.x = x; t.y = y; t.z = z; return t; -} - -__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z) -{ - uint3 t; t.x = x; t.y = y; t.z = z; return t; -} - -__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w) -{ - int4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; -} - -__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w) -{ - uint4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; -} - -__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x) -{ - long1 t; t.x = x; return t; -} - -__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x) -{ - ulong1 t; t.x = x; return t; -} - -__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y) -{ - long2 t; t.x = x; t.y = y; return t; -} - -__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y) -{ - ulong2 t; t.x = x; t.y = y; return t; -} - -__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z) -{ - long3 t; t.x = x; t.y = y; t.z = z; return t; -} - -__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z) -{ - ulong3 t; t.x = x; t.y = y; t.z = z; return t; -} - -__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w) -{ - long4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; -} - -__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w) -{ - ulong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; -} - -__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x) -{ - float1 t; t.x = x; return t; -} - -__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y) -{ - float2 t; t.x = x; t.y = y; return t; -} - -__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z) -{ - float3 t; t.x = x; t.y = y; t.z = z; return t; -} - -__VECTOR_FUNCTIONS_DECL__ float4 
make_float4(float x, float y, float z, float w) -{ - float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; -} - -__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x) -{ - longlong1 t; t.x = x; return t; -} - -__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x) -{ - ulonglong1 t; t.x = x; return t; -} - -__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y) -{ - longlong2 t; t.x = x; t.y = y; return t; -} - -__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y) -{ - ulonglong2 t; t.x = x; t.y = y; return t; -} - -__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z) -{ - longlong3 t; t.x = x; t.y = y; t.z = z; return t; -} - -__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z) -{ - ulonglong3 t; t.x = x; t.y = y; t.z = z; return t; -} - -__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w) -{ - longlong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; -} - -__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w) -{ - ulonglong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; -} - -__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x) -{ - double1 t; t.x = x; return t; -} - -__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y) -{ - double2 t; t.x = x; t.y = y; return t; -} - -__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z) -{ - double3 t; t.x = x; t.y = y; t.z = z; return t; -} - -__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w) -{ - double4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; -} - -#undef __VECTOR_FUNCTIONS_DECL__ - -#endif /* !__VECTOR_FUNCTIONS_HPP__ */ - diff --git a/include/triton/external/CUDA/vector_types.h b/include/triton/external/CUDA/vector_types.h deleted file mode 100755 index 63d9e680b..000000000 --- a/include/triton/external/CUDA/vector_types.h +++ /dev/null @@ -1,425 +0,0 @@ -/* - * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. - * - * NOTICE TO LICENSEE: - * - * This source code and/or documentation ("Licensed Deliverables") are - * subject to NVIDIA intellectual property rights under U.S. and - * international Copyright laws. - * - * These Licensed Deliverables contained herein is PROPRIETARY and - * CONFIDENTIAL to NVIDIA and is being provided under the terms and - * conditions of a form of NVIDIA software license agreement by and - * between NVIDIA and Licensee ("License Agreement") or electronically - * accepted by Licensee. Notwithstanding any terms or conditions to - * the contrary in the License Agreement, reproduction or disclosure - * of the Licensed Deliverables to any third party without the express - * written consent of NVIDIA is prohibited. - * - * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE - * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS - * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. - * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED - * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, - * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
- * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE - * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY - * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY - * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, - * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS - * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE - * OF THESE LICENSED DELIVERABLES. - * - * U.S. Government End Users. These Licensed Deliverables are a - * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT - * 1995), consisting of "commercial computer software" and "commercial - * computer software documentation" as such terms are used in 48 - * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government - * only as a commercial end item. Consistent with 48 C.F.R.12.212 and - * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all - * U.S. Government End Users acquire the Licensed Deliverables with - * only those rights set forth herein. - * - * Any use of the Licensed Deliverables in individual and commercial - * software must include, in the user documentation and internal - * comments to the code, the above Disclaimer and U.S. Government End - * Users Notice. - */ - -#if !defined(__VECTOR_TYPES_H__) -#define __VECTOR_TYPES_H__ - -/******************************************************************************* -* * -* * -* * -*******************************************************************************/ - -#include "host_defines.h" - -/******************************************************************************* -* * -* * -* * -*******************************************************************************/ - -#if !defined(__CUDACC__) && !defined(__CUDACC_RTC__) && \ - defined(_WIN32) && !defined(_WIN64) - -#pragma warning(push) -#pragma warning(disable: 4201 4408) - -#define __cuda_builtin_vector_align8(tag, members) \ -struct __device_builtin__ tag \ -{ \ - union \ - { \ - struct { members }; \ - struct { long long int :1,:0; }; \ - }; \ -} - -#else /* !__CUDACC__ && !__CUDACC_RTC__ && _WIN32 && !_WIN64 */ - -#define __cuda_builtin_vector_align8(tag, members) \ -struct __device_builtin__ __align__(8) tag \ -{ \ - members \ -} - -#endif /* !__CUDACC__ && !__CUDACC_RTC__ && _WIN32 && !_WIN64 */ - -struct __device_builtin__ char1 -{ - signed char x; -}; - -struct __device_builtin__ uchar1 -{ - unsigned char x; -}; - - -struct __device_builtin__ __align__(2) char2 -{ - signed char x, y; -}; - -struct __device_builtin__ __align__(2) uchar2 -{ - unsigned char x, y; -}; - -struct __device_builtin__ char3 -{ - signed char x, y, z; -}; - -struct __device_builtin__ uchar3 -{ - unsigned char x, y, z; -}; - -struct __device_builtin__ __align__(4) char4 -{ - signed char x, y, z, w; -}; - -struct __device_builtin__ __align__(4) uchar4 -{ - unsigned char x, y, z, w; -}; - -struct __device_builtin__ short1 -{ - short x; -}; - -struct __device_builtin__ ushort1 -{ - unsigned short x; -}; - -struct __device_builtin__ __align__(4) short2 -{ - short x, y; -}; - -struct __device_builtin__ __align__(4) ushort2 -{ - unsigned short x, y; -}; - -struct __device_builtin__ short3 -{ - short x, y, z; -}; - -struct __device_builtin__ ushort3 -{ - unsigned short x, y, z; -}; - -__cuda_builtin_vector_align8(short4, short x; short y; short z; short w;); -__cuda_builtin_vector_align8(ushort4, unsigned short x; unsigned short y; unsigned short z; unsigned short w;); - -struct __device_builtin__ int1 -{ - int x; -}; - -struct 
__device_builtin__ uint1 -{ - unsigned int x; -}; - -__cuda_builtin_vector_align8(int2, int x; int y;); -__cuda_builtin_vector_align8(uint2, unsigned int x; unsigned int y;); - -struct __device_builtin__ int3 -{ - int x, y, z; -}; - -struct __device_builtin__ uint3 -{ - unsigned int x, y, z; -}; - -struct __device_builtin__ __builtin_align__(16) int4 -{ - int x, y, z, w; -}; - -struct __device_builtin__ __builtin_align__(16) uint4 -{ - unsigned int x, y, z, w; -}; - -struct __device_builtin__ long1 -{ - long int x; -}; - -struct __device_builtin__ ulong1 -{ - unsigned long x; -}; - -#if defined(_WIN32) -__cuda_builtin_vector_align8(long2, long int x; long int y;); -__cuda_builtin_vector_align8(ulong2, unsigned long int x; unsigned long int y;); -#else /* !_WIN32 */ - -struct __device_builtin__ __align__(2*sizeof(long int)) long2 -{ - long int x, y; -}; - -struct __device_builtin__ __align__(2*sizeof(unsigned long int)) ulong2 -{ - unsigned long int x, y; -}; - -#endif /* _WIN32 */ - -struct __device_builtin__ long3 -{ - long int x, y, z; -}; - -struct __device_builtin__ ulong3 -{ - unsigned long int x, y, z; -}; - -struct __device_builtin__ __builtin_align__(16) long4 -{ - long int x, y, z, w; -}; - -struct __device_builtin__ __builtin_align__(16) ulong4 -{ - unsigned long int x, y, z, w; -}; - -struct __device_builtin__ float1 -{ - float x; -}; - -#if !defined(__CUDACC__) && defined(__arm__) && \ - defined(__ARM_PCS_VFP) && __GNUC__ == 4 && __GNUC_MINOR__ == 6 - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-pedantic" - -struct __device_builtin__ __attribute__((aligned(8))) float2 -{ - float x; float y; float __cuda_gnu_arm_ice_workaround[0]; -}; - -#pragma GCC poison __cuda_gnu_arm_ice_workaround -#pragma GCC diagnostic pop - -#else /* !__CUDACC__ && __arm__ && __ARM_PCS_VFP && - __GNUC__ == 4&& __GNUC_MINOR__ == 6 */ - -__cuda_builtin_vector_align8(float2, float x; float y;); - -#endif /* !__CUDACC__ && __arm__ && __ARM_PCS_VFP && - __GNUC__ == 4&& __GNUC_MINOR__ == 6 */ - -struct __device_builtin__ float3 -{ - float x, y, z; -}; - -struct __device_builtin__ __builtin_align__(16) float4 -{ - float x, y, z, w; -}; - -struct __device_builtin__ longlong1 -{ - long long int x; -}; - -struct __device_builtin__ ulonglong1 -{ - unsigned long long int x; -}; - -struct __device_builtin__ __builtin_align__(16) longlong2 -{ - long long int x, y; -}; - -struct __device_builtin__ __builtin_align__(16) ulonglong2 -{ - unsigned long long int x, y; -}; - -struct __device_builtin__ longlong3 -{ - long long int x, y, z; -}; - -struct __device_builtin__ ulonglong3 -{ - unsigned long long int x, y, z; -}; - -struct __device_builtin__ __builtin_align__(16) longlong4 -{ - long long int x, y, z ,w; -}; - -struct __device_builtin__ __builtin_align__(16) ulonglong4 -{ - unsigned long long int x, y, z, w; -}; - -struct __device_builtin__ double1 -{ - double x; -}; - -struct __device_builtin__ __builtin_align__(16) double2 -{ - double x, y; -}; - -struct __device_builtin__ double3 -{ - double x, y, z; -}; - -struct __device_builtin__ __builtin_align__(16) double4 -{ - double x, y, z, w; -}; - -#if !defined(__CUDACC__) && defined(_WIN32) && !defined(_WIN64) - -#pragma warning(pop) - -#endif /* !__CUDACC__ && _WIN32 && !_WIN64 */ - -/******************************************************************************* -* * -* * -* * -*******************************************************************************/ - -typedef __device_builtin__ struct char1 char1; -typedef __device_builtin__ struct uchar1 
uchar1;
-typedef __device_builtin__ struct char2 char2;
-typedef __device_builtin__ struct uchar2 uchar2;
-typedef __device_builtin__ struct char3 char3;
-typedef __device_builtin__ struct uchar3 uchar3;
-typedef __device_builtin__ struct char4 char4;
-typedef __device_builtin__ struct uchar4 uchar4;
-typedef __device_builtin__ struct short1 short1;
-typedef __device_builtin__ struct ushort1 ushort1;
-typedef __device_builtin__ struct short2 short2;
-typedef __device_builtin__ struct ushort2 ushort2;
-typedef __device_builtin__ struct short3 short3;
-typedef __device_builtin__ struct ushort3 ushort3;
-typedef __device_builtin__ struct short4 short4;
-typedef __device_builtin__ struct ushort4 ushort4;
-typedef __device_builtin__ struct int1 int1;
-typedef __device_builtin__ struct uint1 uint1;
-typedef __device_builtin__ struct int2 int2;
-typedef __device_builtin__ struct uint2 uint2;
-typedef __device_builtin__ struct int3 int3;
-typedef __device_builtin__ struct uint3 uint3;
-typedef __device_builtin__ struct int4 int4;
-typedef __device_builtin__ struct uint4 uint4;
-typedef __device_builtin__ struct long1 long1;
-typedef __device_builtin__ struct ulong1 ulong1;
-typedef __device_builtin__ struct long2 long2;
-typedef __device_builtin__ struct ulong2 ulong2;
-typedef __device_builtin__ struct long3 long3;
-typedef __device_builtin__ struct ulong3 ulong3;
-typedef __device_builtin__ struct long4 long4;
-typedef __device_builtin__ struct ulong4 ulong4;
-typedef __device_builtin__ struct float1 float1;
-typedef __device_builtin__ struct float2 float2;
-typedef __device_builtin__ struct float3 float3;
-typedef __device_builtin__ struct float4 float4;
-typedef __device_builtin__ struct longlong1 longlong1;
-typedef __device_builtin__ struct ulonglong1 ulonglong1;
-typedef __device_builtin__ struct longlong2 longlong2;
-typedef __device_builtin__ struct ulonglong2 ulonglong2;
-typedef __device_builtin__ struct longlong3 longlong3;
-typedef __device_builtin__ struct ulonglong3 ulonglong3;
-typedef __device_builtin__ struct longlong4 longlong4;
-typedef __device_builtin__ struct ulonglong4 ulonglong4;
-typedef __device_builtin__ struct double1 double1;
-typedef __device_builtin__ struct double2 double2;
-typedef __device_builtin__ struct double3 double3;
-typedef __device_builtin__ struct double4 double4;
-
-/*******************************************************************************
-*                                                                              *
-*                                                                              *
-*                                                                              *
-*******************************************************************************/
-
-struct __device_builtin__ dim3
-{
-    unsigned int x, y, z;
-#if defined(__cplusplus)
-    __host__ __device__ dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {}
-    __host__ __device__ dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {}
-    __host__ __device__ operator uint3(void) { uint3 t; t.x = x; t.y = y; t.z = z; return t; }
-#endif /* __cplusplus */
-};
-
-typedef __device_builtin__ struct dim3 dim3;
-
-#undef __cuda_builtin_vector_align8
-
-#endif /* !__VECTOR_TYPES_H__ */
diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h
index a114cca8c..476d25f5a 100644
--- a/include/triton/runtime/jit.h
+++ b/include/triton/runtime/jit.h
@@ -45,6 +45,11 @@ public:
   };
   typedef std::function benchmark_t;
 
+  struct tune_res_t{
+    double perf;
+    std::vector<unsigned> params;
+  };
+
   struct passes_wrapper {
     passes_wrapper(codegen::target* target)
                       : shmem_liveness(&shmem_info),
@@ -93,7 +98,7 @@ private:
 public:
   jit(driver::context* context);
   ~jit();
-  void autotune(const char* name, const char* src, benchmark_t benchmark);
+  tune_res_t autotune(const char* name, const char* src, benchmark_t benchmark);
   void add_module(ir::module &module, const std::vector<unsigned>& params = {});
   void add_module(const char* name, const char* src, const std::vector<unsigned>& params = {});
   driver::kernel* get_function(const char* name);
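The jit.h hunk above changes autotune from a fire-and-forget routine into one that reports the winning configuration. A minimal sketch of the intended call pattern, assuming a constructed jit object, a kernel name, its source string and a benchmark callback (the wiring below is illustrative, not part of the patch):

    // Run autotuning once, then compile the module with the best parameters found.
    triton::jit::tune_res_t best = jit.autotune("conv", src, benchmark);
    std::cout << "best performance: " << best.perf << std::endl;
    jit.add_module("conv", src, best.params);   // reuse the winning parameter vector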
diff --git a/include/triton/tools/bench.hpp b/include/triton/tools/bench.hpp
new file mode 100644
index 000000000..15c12a3fd
--- /dev/null
+++ b/include/triton/tools/bench.hpp
@@ -0,0 +1,55 @@
+#ifndef TRITON_TOOLS_BENCH_HPP
+#define TRITON_TOOLS_BENCH_HPP
+
+#include <chrono>
+#include <vector>
+#include <algorithm>
+#include "triton/driver/device.h"
+
+namespace triton{
+namespace tools{
+
+class timer{
+    typedef std::chrono::high_resolution_clock high_resolution_clock;
+    typedef std::chrono::nanoseconds nanoseconds;
+
+public:
+    explicit timer(bool run = false)
+    { if (run) start(); }
+
+    void start()
+    { _start = high_resolution_clock::now(); }
+
+    nanoseconds get() const
+    { return std::chrono::duration_cast<nanoseconds>(high_resolution_clock::now() - _start); }
+
+private:
+    high_resolution_clock::time_point _start;
+};
+
+template<class OP, class SYNC>
+double bench(OP const & op, SYNC const & sync, const triton::driver::device * device)
+{
+  timer tmr;
+  std::vector<double> times;
+  double total_time = 0;
+  op();
+  sync();
+  while(total_time*1e-9 < 1e-3){
+    float norm = 1;
+    // normalize clock if possible to get roughly constant result
+    if(auto cu_device = dynamic_cast<const triton::driver::cu_device*>(device))
+      norm = (float)cu_device->current_sm_clock()/cu_device->max_sm_clock();
+    tmr.start();
+    op();
+    sync();
+    times.push_back(norm*tmr.get().count());
+    total_time+=times.back();
+  }
+  return *std::min_element(times.begin(), times.end());
+}
+
+}
+}
+
+#endif
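A usage sketch of the new helper: op enqueues asynchronous work and sync drains the device, so each sample measures one whole kernel execution. The kernel/stream wiring and the synchronize call below are assumptions for illustration, not part of the patch:

    // Time an asynchronous enqueue with triton::tools::bench.
    double ns = triton::tools::bench(
        [&]() { stream->enqueue(kernel, grid, {nthreads, 1, 1}); },  // op: submit work
        [&]() { stream->synchronize(); },                            // sync: wait for completion
        context->device());
    double tflops = 2.0 * M * N * K / ns * 1e-3;  // e.g. for GEMM: flops per nanosecond -> TFLOPS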
diff --git a/lib/dnn/conv.cpp b/lib/dnn/conv.cpp
index 889a37f00..2acbfed94 100644
--- a/lib/dnn/conv.cpp
+++ b/lib/dnn/conv.cpp
@@ -1,8 +1,20 @@
+#include
 #include "triton/dnn/conv.h"
 
 namespace triton{
 namespace dnn{
 
+void conv::set_ld(const std::vector<int32_t>& shapes,
+                  std::vector<int32_t>& ld) {
+  size_t size = shapes.size();
+  ld.resize(size);
+  ld[4] = 1;
+  ld[3] = shapes[4]*ld[4];
+  ld[2] = shapes[3]*ld[3];
+  ld[1] = shapes[2]*ld[2];
+  ld[0] = shapes[1]*ld[1];
+}
+
 conv::conv(int B, int NC,
            int D, int H, int W,
            int T, int R, int S, int NF,
@@ -41,9 +53,14 @@ conv::conv(int B, int NC,
     std::swap(AH_, CH_);
     std::swap(AW_, CW_);
     shapes_a_.swap(shapes_c_);
+    std::swap(stride_d_, upsample_d_);
+    std::swap(stride_h_, upsample_h_);
+    std::swap(stride_w_, upsample_w_);
     pad_d_ = (CD_*stride_d_ - AD_*upsample_d_ + BD_ - 1 - stride_d_ + 1)/2;
     pad_h_ = (CH_*stride_h_ - AH_*upsample_h_ + BH_ - 1 - stride_h_ + 1)/2;
     pad_w_ = (CW_*stride_w_ - AW_*upsample_w_ + BW_ - 1 - stride_w_ + 1)/2;
+    std::swap(b_inner_idx_, b_outer_idx_);
+    std::swap(NC_, NF_);
   }
   // swap b and c for wgrad
   if(ty_ == WGRAD){
@@ -57,32 +74,16 @@
     std::swap(b_pix_idx_, c_pix_idx);
   }
   // leading dimensions
-  auto set_ld = [](const std::vector<int32_t>& shapes,
-                   std::vector<int32_t>& ld) {
-    size_t size = shapes.size();
-    ld.resize(size);
-    ld[4] = 1;
-    ld[3] = shapes[4]*ld[4];
-    ld[2] = shapes[3]*ld[3];
-    ld[1] = shapes[2]*ld[2];
-    ld[0] = shapes[1]*ld[1];
-  };
   set_ld(shapes_a_, ld_a_);
   set_ld(shapes_b_, ld_b_);
   set_ld(shapes_c_, ld_c_);
   // equivalent matmul
+  bool upsampled_b = (ty_ == BPROP) && (upsample_d_ > 1 || upsample_h_ > 1 || upsample_w_ > 1);
   b_trans_ = ty_ != BPROP;
-  b_lut_ = ty_ == WGRAD;
-  if(ty_ == WGRAD) {
-    M_ = shapes_c_[0]*shapes_c_[1]*shapes_c_[2]*shapes_c_[3];
-    N_ = shapes_c_[4];
-    K_ = shapes_b_[0]*shapes_b_[2]*shapes_b_[3]*shapes_b_[4];
-  }
-  else {
-    M_ = shapes_c_[0]*shapes_c_[2]*shapes_c_[3]*shapes_c_[4];
-    N_ = shapes_c_[1];
-    K_ = shapes_b_[0]*shapes_b_[1]*shapes_b_[2]*shapes_b_[3];
-  }
+  b_lut_ = ty_ == WGRAD || upsampled_b;
+  M_ = shapes_c_[c_outer_0_idx_]*shapes_c_[c_pix_idx]*shapes_c_[c_pix_idx+1]*shapes_c_[c_pix_idx+2];
+  N_ = shapes_c_[c_outer_1_idx_];
+  K_ = shapes_b_[b_inner_idx_]*BD_*BH_*BW_;
   // look-up table info
   if(ty_ == FPROP)
     Fs_ = shapes_b_[1]*shapes_b_[2]*shapes_b_[3];
@@ -91,6 +92,8 @@
   TK_ = 8;
   Luts_ = (TK_ + Fs_ - 1) / Fs_ * Fs_;
   build_deltas();
+  if(b_lut_)
+    build_b_deltas();
   build_masks();
   size_t cst_size = h_b_deltas_.size()*4;
   is_b_deltas_cst_ = cst_size < 65536;
@@ -98,6 +101,8 @@
   is_a_deltas_cst = cst_size < 65536;
   cst_size += h_masks_.size()*4;
   is_mask_cst_ = cst_size < 65536;
+  max_grid_0_ = 256;
+  max_grid_1_ = 256;
 }
 
 size_t conv::a_size()
@@ -115,103 +120,133 @@ size_t conv::c_size()
 std::vector<int32_t> conv::c_shapes()
 { return shapes_c_; }
 
-void conv::build_deltas(){
-  h_a_deltas_.resize(Luts_ + upsample_d_*upsample_h_*upsample_w_*Luts_);
-  if(b_lut_)
-    h_b_deltas_.resize(Luts_);
-  auto unpack = [&](int32_t ltrs) {
-    int32_t l = (!b_trans_) ? ltrs % NF_ : ltrs / (BD_*BH_*BW_);
-    int32_t trs = (!b_trans_) ? ltrs / NF_ : ltrs % (BD_*BH_*BW_);
-    int32_t tr = trs / BW_;
-    int32_t s = trs % BW_;
-    int32_t t = tr / BH_;
-    int32_t r = tr % BH_;
-    if(!b_trans_){
-      r = BH_ - 1 - r;
-      s = BW_ - 1 - s;
-    }
-    return std::make_tuple(l, t, r, s);
-  };
+std::tuple<int32_t, int32_t, int32_t, int32_t> conv::unpack(int32_t ltrs, bool flip, int32_t EBD, int32_t EBH, int32_t EBW) {
+  int32_t l, t, r, s;
+  if(b_trans_){
+    l = ltrs / (EBD*EBH*EBW);
+    int32_t trs = ltrs % (EBD*EBH*EBW);
+    int32_t tr = trs / EBW;
+    s = trs % EBW;
+    t = tr / EBH;
+    r = tr % EBH;
+  }
+  else{
+    int32_t rs = ltrs / NC_;
+    l = ltrs % NC_;
+    r = rs / EBW;
+    s = rs % EBW;
+  }
+  if(flip){
+    r = EBH - 1 - r;
+    s = EBW - 1 - s;
+  }
+  return std::make_tuple(l, t, r, s);
+}
 
-  for(size_t i = 0; i < Luts_; ++i)
-    h_a_deltas_[i] = (((i + TK_) % Luts_) - i);
+void conv::build_b_deltas(){
+  h_b_deltas_.resize(Luts_*upsample_d_*upsample_h_*upsample_w_);
 
   size_t Ds0 = Luts_;
   size_t Ds1 = upsample_w_;
   size_t Ds2 = upsample_h_;
   size_t Ds3 = upsample_d_;
-  for(size_t pd = 0; pd < Ds3; ++pd)
-  for(size_t ph = 0; ph < Ds2; ++ph)
-  for(size_t pw = 0; pw < Ds1; ++pw) {
-    int32_t* deltas_ptr = &h_a_deltas_[Luts_ + pw*Ds0 + ph*Ds0*Ds1 + pd*Ds0*Ds1*Ds2];
+  for(size_t ud = 0; ud < Ds3; ++ud)
+  for(size_t uh = 0; uh < Ds2; ++uh)
+  for(size_t uw = 0; uw < Ds1; ++uw) {
+    int32_t* deltas_ptr = &h_b_deltas_[uw*Ds0 + uh*Ds0*Ds1 + ud*Ds0*Ds1*Ds2];
+    for(size_t i = 0; i < Luts_; ++i) {
+      int32_t EBD = 1;
+      int32_t EBH = ((upsample_h_ - uh - 1) + BH_) / upsample_h_;
+      int32_t EBW = ((upsample_w_ - uw - 1) + BW_) / upsample_w_;
+      if(EBD == 0 || EBH == 0 || EBW == 0)
+        continue;
+      int32_t c, t, r, s;
+      int32_t nextc, nextt, nextr, nexts;
+      std::tie(c, t, r, s) = unpack(i, false, EBD, EBH, EBW);
+      std::tie(nextc, nextt, nextr, nexts) = unpack(i + TK_, false, EBD, EBH, EBW);
+      int32_t cdiff = nextc - c;
+      int32_t tdiff = (nextt - t)*upsample_d_;
+      int32_t rdiff = (nextr - r)*upsample_h_;
+      int32_t sdiff = (nexts - s)*upsample_w_;
+      deltas_ptr[i] = cdiff*ld_b_[b_inner_idx_] + tdiff*ld_b_[b_pix_idx_] + rdiff*ld_b_[b_pix_idx_ + 1] + sdiff*ld_b_[b_pix_idx_ + 2];
+    }
+  }
+}
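For intuition: the first Luts_ entries of h_a_deltas_, written just below, form a ring. Slot i stores how far index (i + TK_) mod Luts_ sits from i, so the kernel can advance its look-up pointer by one cheap addition instead of a modulo. A toy recomputation under assumed sizes (TK_ = 8 and a 3x3 filter, so Fs_ = 9 and Luts_ = 9):

    #include <cstdio>
    int main() {
      int TK = 8, Fs = 9;
      int Luts = (TK + Fs - 1) / Fs * Fs;   // round TK up to a multiple of Fs -> 9
      for (int i = 0; i < Luts; ++i)
        std::printf("delta[%d] = %d\n", i, ((i + TK) % Luts) - i);
      // prints 8, then -1 eight times: one long hop followed by wrap-around steps
      return 0;
    }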
+
+void conv::build_deltas(){
+  h_a_deltas_.resize(Luts_ + upsample_d_*upsample_h_*upsample_w_*Luts_);
+  for(size_t i = 0; i < Luts_; ++i)
+    h_a_deltas_[i] = (((i + TK_) % Luts_) - i);
+  size_t Ds0 = Luts_;
+  size_t Ds1 = upsample_w_;
+  size_t Ds2 = upsample_h_;
+  size_t Ds3 = upsample_d_;
+  for(size_t ud = 0; ud < Ds3; ++ud)
+  for(size_t uh = 0; uh < Ds2; ++uh)
+  for(size_t uw = 0; uw < Ds1; ++uw) {
+    int32_t* deltas_ptr = &h_a_deltas_[Luts_ + uw*Ds0 + uh*Ds0*Ds1 + ud*Ds0*Ds1*Ds2];
     // cumulative increments
     for(size_t i = 0; i < Ds0; ++i) {
+      int32_t EBD = 1;
+      int32_t EBH = ((upsample_h_ - uh - 1) + BH_) / upsample_h_;
+      int32_t EBW = ((upsample_w_ - uw - 1) + BW_) / upsample_w_;
+      if(EBD == 0 || EBH == 0 || EBW == 0)
+        continue;
       // unpack
       int32_t ctrs = i;
       int32_t c, t, r, s;
-      std::tie(c, t, r, s) = unpack(ctrs);
+      std::tie(c, t, r, s) = unpack(ctrs, !b_trans_, EBD, EBH, EBW);
       // next indices
       int32_t nextctrs = ctrs + TK_;
       int32_t nextc, nextt, nextr, nexts;
-      std::tie(nextc, nextt, nextr, nexts) = unpack(nextctrs);
+      std::tie(nextc, nextt, nextr, nexts) = unpack(nextctrs, !b_trans_, EBD, EBH, EBW);
       // diffs
       int32_t cdiff = nextc - c;
-      int32_t tdiff = (nextt + pd)/upsample_d_ - (t + pd)/upsample_d_;
-      int32_t rdiff = (nextr + ph)/upsample_h_ - (r + ph)/upsample_h_;
-      int32_t sdiff = (nexts + pw)/upsample_w_ - (s + pw)/upsample_w_;
-      // delta pointers
-      deltas_ptr[i] = cdiff*ld_a_[a_inner_idx_] + tdiff*ld_a_[a_pix_idx_] + rdiff*ld_a_[a_pix_idx_ + 1] + sdiff*ld_a_[a_pix_idx_ + 2];
-    }
-  }
-
-  if(b_lut_) {
-    for(size_t i = 0; i < Ds0; ++i) {
-      int32_t c, t, r, s;
-      int32_t nextc, nextt, nextr, nexts;
-      std::tie(c, t, r, s) = unpack(i);
-      std::tie(nextc, nextt, nextr, nexts) = unpack(i + TK_);
-      int32_t cdiff = nextc - c;
       int32_t tdiff = nextt - t;
       int32_t rdiff = nextr - r;
       int32_t sdiff = nexts - s;
-      h_b_deltas_[i] = cdiff*ld_b_[b_inner_idx_] + tdiff*ld_b_[b_pix_idx_] + rdiff*ld_b_[b_pix_idx_ + 1] + sdiff*ld_b_[b_pix_idx_ + 2];
+      if(ty_ == WGRAD){
+        tdiff = tdiff * stride_d_;
+        rdiff = rdiff * stride_h_;
+        sdiff = sdiff * stride_w_;
+      }
+      // delta pointers
+      deltas_ptr[i] = cdiff*ld_a_[a_inner_idx_] + tdiff*ld_a_[a_pix_idx_] + rdiff*ld_a_[a_pix_idx_ + 1] + sdiff*ld_a_[a_pix_idx_ + 2];
    }
  }
}
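Throughout these LUT builders, EBD/EBH/EBW are the effective filter extents seen by one upsampling offset: with upsampled outputs, only every upsample_h-th filter row can land on a given sub-grid. A worked check of the formula used above, with assumed values BH_ = 3 and upsample_h_ = 2:

    #include <cstdio>
    int main() {
      int BH = 3, upsample_h = 2;
      for (int uh = 0; uh < upsample_h; ++uh)
        std::printf("uh = %d -> EBH = %d\n", uh, ((upsample_h - uh - 1) + BH) / upsample_h);
      // uh = 0 -> EBH = 2, uh = 1 -> EBH = 1: the three filter rows split 2 + 1
      // across the two sub-grids, so the per-offset reduction length shrinks accordingly.
      return 0;
    }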
 
 void conv::build_masks(){
-  h_masks_.resize(Luts_ + (2*pad_h_+1)*(2*pad_w_+1)*(2*pad_d_+1)*Luts_);
+  h_masks_.resize(Luts_ + upsample_d_*upsample_h_*upsample_w_*(2*pad_h_+1)*(2*pad_w_+1)*(2*pad_d_+1)*Luts_);
 
-  auto unpack = [&](int32_t ltrs){
-    int32_t l = (!b_trans_) ? ltrs % NF_ : ltrs / (BD_*BH_*BW_);
-    int32_t trs = (!b_trans_) ? ltrs / NF_ : ltrs % (BD_*BH_*BW_);
-    int32_t tr = trs / BW_;
-    int32_t s = trs % BW_;
-    int32_t t = tr / BH_;
-    int32_t r = tr % BH_;
-    if(!b_trans_){
-      r = BH_ - 1 - r;
-      s = BW_ - 1 - s;
-    }
-    return std::make_tuple(l, t, r, s);
-  };
   size_t Ms0 = Luts_;
   size_t Ms1 = 2*pad_w_ + 1;
   size_t Ms2 = 2*pad_h_ + 1;
   size_t Ms3 = 2*pad_d_ + 1;
+  size_t Ms4 = upsample_w_;
+  size_t Ms5 = upsample_h_;
+  size_t Ms6 = upsample_d_;
+  for(size_t ud = 0; ud < Ms6; ++ud)
+  for(size_t uh = 0; uh < Ms5; ++uh)
+  for(size_t uw = 0; uw < Ms4; ++uw)
   for(size_t pd = 0; pd < Ms3; ++pd)
   for(size_t ph = 0; ph < Ms2; ++ph)
   for(size_t pw = 0; pw < Ms1; ++pw){
-    int32_t* masks_ptr = &h_masks_[Luts_ + pw*Ms0 + ph*Ms0*Ms1 + pd*Ms0*Ms1*Ms2];
+    int32_t* masks_ptr = &h_masks_[Luts_ + pw*Ms0 + ph*Ms0*Ms1 + pd*Ms0*Ms1*Ms2 + uw*Ms0*Ms1*Ms2*Ms3 + uh*Ms0*Ms1*Ms2*Ms3*Ms4 + ud*Ms0*Ms1*Ms2*Ms3*Ms4*Ms5];
     for(size_t i = 0; i < Ms0; ++i){
       int32_t l, t, r, s;
       int32_t mask = 0x0;
       for(size_t j = 0; j < TK_; ++j){
-        std::tie(l, t, r, s) = unpack(i + j);
-        bool in_bounds_d = (t + pd) >= pad_d_ && (t + pd) < (BD_ + pad_d_);
-        bool in_bounds_h = (r + ph) >= pad_h_ && (r + ph) < (BH_ + pad_h_);
-        bool in_bounds_w = (s + pw) >= pad_w_ && (s + pw) < (BW_ + pad_w_);
+        int32_t EBD = 1;
+        int32_t EBH = ((upsample_h_ - uh - 1) + BH_) / upsample_h_;
+        int32_t EBW = ((upsample_w_ - uw - 1) + BW_) / upsample_w_;
+        if(EBD == 0 || EBH == 0 || EBW == 0)
+          continue;
+        std::tie(l, t, r, s) = unpack(i + j, !b_trans_, EBD, EBH, EBW);
+        bool in_bounds_d = (t + pd) >= pad_d_ && (t + pd) < (EBD + pad_d_);
+        bool in_bounds_h = (r + ph) >= pad_h_ && (r + ph) < (EBH + pad_h_);
+        bool in_bounds_w = (s + pw) >= pad_w_ && (s + pw) < (EBW + pad_w_);
         mask |= (in_bounds_d && in_bounds_h && in_bounds_w) << j;
       }
       masks_ptr[i] = mask;
@@ -246,6 +281,8 @@ void conv::init(driver::stream *stream, triton::driver::cu_module* module) {
   d_a_deltas_ = init_lut(is_a_deltas_cst, "delta", h_a_deltas_);
   d_b_deltas_ = init_lut(is_b_deltas_cst_, "b_delta", h_b_deltas_);
   d_masks_ = init_lut(is_mask_cst_, "masks", h_masks_);
+  d_locks_ = triton::driver::buffer::create(stream->context(), max_grid_0_*max_grid_1_*4);
+  ((triton::driver::cu_buffer*)d_locks_)->set_zero(stream, max_grid_0_*max_grid_1_*4);
 }
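Each mask word built above packs TK_ bounds flags, one bit per reduction step, so the generated kernel can validate a whole K-tile with a single 32-bit load. An illustrative decoding of one such word (it mirrors the maska0/maska1 predicate in the Triton-C source further down; the sample value is made up):

    #include <cstdint>
    #include <cstdio>
    int main() {
      int TK = 8;
      int32_t mask = 0b00011110;   // e.g. steps 1..4 in bounds, steps 0 and 5..7 padded out
      for (int j = 0; j < TK; ++j)
        std::printf("step %d: %s\n", j, ((mask >> j) & 1) ? "load" : "zero-fill");
      return 0;
    }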
 
 void conv::set_arg(driver::kernel *kernel,
   kernel->setArg(10, BW_);
   kernel->setArg(11, CH_);
   kernel->setArg(12, CW_);
+  kernel->setArg(13, NC_);
   // A arguments
-  kernel->setArg(13, ld_a_[a_outer_idx_]);
-  kernel->setArg(14, ld_a_[a_inner_idx_]);
-  kernel->setArg(15, ld_a_[2]);
-  kernel->setArg(16, ld_a_[3]);
-  kernel->setArg(17, ld_a_[4]);
+  kernel->setArg(14, ld_a_[a_outer_idx_]);
+  kernel->setArg(15, ld_a_[a_inner_idx_]);
+  kernel->setArg(16, ld_a_[2]);
+  kernel->setArg(17, ld_a_[3]);
+  kernel->setArg(18, ld_a_[4]);
   // B arguments
-  kernel->setArg(18, ld_b_[b_inner_idx_]);
-  kernel->setArg(19, ld_b_[b_pix_idx_]);
-  kernel->setArg(20, ld_b_[b_pix_idx_+1]);
-  kernel->setArg(21, ld_b_[b_pix_idx_+2]);
-  kernel->setArg(22, ld_b_[b_outer_idx_]);
+  kernel->setArg(19, ld_b_[b_inner_idx_]);
+  kernel->setArg(20, ld_b_[b_pix_idx_]);
+  kernel->setArg(21, ld_b_[b_pix_idx_+1]);
+  kernel->setArg(22, ld_b_[b_pix_idx_+2]);
+  kernel->setArg(23, ld_b_[b_outer_idx_]);
   // C arguments
-  kernel->setArg(23, ld_c_[c_outer_0_idx_]);
-  kernel->setArg(24, ld_c_[c_outer_1_idx_]);
-  kernel->setArg(25, ld_c_[c_pix_idx]);
-  kernel->setArg(26, ld_c_[c_pix_idx+1]);
-  kernel->setArg(27, ld_c_[c_pix_idx+2]);
+  kernel->setArg(24, ld_c_[c_outer_0_idx_]);
+  kernel->setArg(25, ld_c_[c_outer_1_idx_]);
+  kernel->setArg(26, ld_c_[c_pix_idx]);
+  kernel->setArg(27, ld_c_[c_pix_idx+1]);
+  kernel->setArg(28, ld_c_[c_pix_idx+2]);
   // pad
-  kernel->setArg(28, pad_h_);
-  kernel->setArg(29, pad_w_);
+  kernel->setArg(29, pad_h_);
+  kernel->setArg(30, pad_w_);
   // stride
-  kernel->setArg(30, stride_h_);
-  kernel->setArg(31, stride_w_);
+  kernel->setArg(31, stride_h_);
+  kernel->setArg(32, stride_w_);
   // dilate
-  kernel->setArg(32, upsample_h_);
-  kernel->setArg(33, upsample_w_);
-  size_t idx = 34;
+  kernel->setArg(33, upsample_h_);
+  kernel->setArg(34, upsample_w_);
+  kernel->setArg(35, (int32_t)0);
+  kernel->setArg(36, (int32_t)0);
+  kernel->setArg(37, pad_h_);
+  kernel->setArg(38, pad_w_);
+  kernel->setArg(39, (int32_t)0);
+  kernel->setArg(40, (int32_t)0);
+  kernel->setArg(41, d_locks_);
+  kernel->setArg(42, 0);
+  kernel->setArg(43, 0);
+  size_t idx = 44;
   if(!is_a_deltas_cst)
     kernel->setArg(idx++, d_a_deltas_);
   if(!is_b_deltas_cst_)
@@ -300,13 +347,67 @@
     kernel->setArg(idx++, d_masks_);
 }
 
+void conv::enqueue(driver::stream *stream, driver::kernel *kernel,
+                   driver::buffer *a, driver::buffer *b, driver::buffer *c, driver::buffer *bias,
+                   size_t TM, size_t TN, size_t GZ, size_t nthreads) {
+  set_arg(kernel, a, b, c, bias);
+  std::array<size_t, 3> grid = {1};
+  grid[0] = (M_ + TM - 1)/TM;
+  grid[1] = (N_ + TN - 1)/TN;
+  grid[2] = GZ;
+  grid[0] /= upsample_h_*upsample_w_;
+  kernel->setArg(11, CH_/upsample_h_);
+  kernel->setArg(12, CW_/upsample_w_);
+  kernel->setArg(42, (int32_t)grid[0]);
+  kernel->setArg(43, (int32_t)grid[1]);
+
+  // initialize to zero if necessary
+  bool init_zero = false;
+  for(int32_t off_uh = 0; off_uh < upsample_h_; off_uh++)
+  for(int32_t off_uw = 0; off_uw < upsample_w_; off_uw++) {
+    int32_t EBD = 1;
+    int32_t EBH = ((upsample_h_ - off_uh - 1) + BH_) / upsample_h_;
+    int32_t EBW = ((upsample_w_ - off_uw - 1) + BW_) / upsample_w_;
+    if(EBD == 0 || EBH == 0 || EBW == 0)
+      init_zero = true;
+  }
+  if(init_zero)
+    ((driver::cu_buffer*)c)->set_zero(stream, c_size()*4);
+
+  for(int32_t off_uh = 0; off_uh < upsample_h_; off_uh++)
+  for(int32_t off_uw = 0; off_uw < upsample_w_; off_uw++) {
+    int32_t EBD = 1;
+    int32_t EBH = ((upsample_h_ - off_uh - 1) + BH_) / upsample_h_;
+    int32_t EBW = ((upsample_w_ - off_uw - 1) + BW_) / upsample_w_;
+    if(EBD == 0 || EBH == 0 || EBW == 0)
+      continue;
+    int32_t K = shapes_b_[b_inner_idx_]*EBD*EBH*EBW;
+    kernel->setArg(6, K);
+    kernel->setArg(9, EBH);
+    kernel->setArg(10, EBW);
+    kernel->setArg(29, pad_h_);
+    kernel->setArg(30, pad_w_);
+    kernel->setArg(35, off_uh);
+    kernel->setArg(36, off_uw);
+    kernel->setArg(37, (pad_h_ + (1 - upsample_h_)*off_uh)/upsample_h_);
+    kernel->setArg(38, (pad_w_ + (1 - upsample_w_)*off_uw)/upsample_w_);
+    kernel->setArg(39, (off_uh + pad_h_) % upsample_h_);
+    kernel->setArg(40, (off_uw + pad_w_) % upsample_w_);
+    stream->enqueue(kernel, grid, {nthreads, 1, 1});
+  }
+}
+
 std::vector<unsigned> conv::default_params() {
-  if(b_lut_)
-    return {32, 2, 64, 32, 2, 64, 16, 8, 2, 2, 4, 2, 8};
+  if(b_lut_){
+    if(!b_trans_)
+      return {16, 2, 32, 16, 16, 8, 8, 2, 2, 4, 2, 8, 4, 2, 1};
+    else
+      return {32, 2, 64, 32, 2, 64, 16, 8, 2, 2, 4, 2, 8, 1};
+  }
   else if(ty_ == FPROP)
-    return {16, 2, 64, 32, 2, 64, 16, 8, 2, 2, 8, 1, 8, 4};
-  else if(ty_ == BPROP)
-    return {32, 2, 64, 32, 64, 32, 4, 2, 2, 4, 2, 8, 4, 2};
+    return {16, 2, 64, 32, 2, 64, 16, 8, 2, 2, 8, 1, 8, 4, 1};
+  else
    return {16, 2, 64, 16, 16, 16, 4, 2, 2, 4, 2, 8, 4, 2, 1};
 }
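conv::enqueue now launches one grid per (off_uh, off_uw) sub-problem, each with the reduction shrunk to the taps that sub-grid actually sees. A simplified recomputation of just the grid arithmetic, with assumed sizes (the real code additionally divides grid[0] by upsample_h_*upsample_w_ and rewrites K per offset):

    #include <array>
    #include <cstdio>
    int main() {
      size_t M = 4096, N = 128, TM = 64, TN = 32, GZ = 1;
      std::array<size_t, 3> grid = {(M + TM - 1) / TM,   // ceil-div over output pixels
                                    (N + TN - 1) / TN,   // ceil-div over output channels
                                    GZ};                 // split-K depth
      std::printf("grid = %zu x %zu x %zu\n", grid[0], grid[1], grid[2]);
      return 0;
    }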
@@ -395,6 +496,205 @@ void conv::cpu_ref(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B)
     cpu_wgrad(C, A, B);
 }
 
+void conv::src(std::ostream &os){
+  std::string BS = b_trans_ ? "[TN,TK]" : "[TK, TN]";
+  std::string bcb0 = b_trans_ ? "[:, newaxis]" : "[newaxis, :]";
+  std::string bcb1 = b_trans_ ? "[newaxis, :]" : "[:, newaxis]";
+  std::string ldb0 = b_trans_ ? "*ldb_s" : "";
+  std::string useb = b_trans_ ? "trans(b)" : "b";
+  std::string flipr = b_trans_ ? "" : "BH - 1 -";
+  std::string flips = b_trans_ ? "" : "BW - 1 -";
+  std::string upar = ty_ == WGRAD ? "stride_h * ": "";
+  std::string upas = ty_ == WGRAD ? "stride_w * ": "";
+  std::string upah = ty_ == WGRAD ? "": "*stride_h";
+  std::string upaw = ty_ == WGRAD ? "": "*stride_w";
+  std::vector<std::string> crs = {"c", "r", "s"};
+  std::vector<std::string> rsc = {"r", "s", "c"};
+  std::vector<std::string> ax = b_trans_ ? crs : rsc;
+  std::vector<std::string> redax;
+  if(b_trans_)
+    redax = {"NC", "BH", "BW"};
+  else
+    redax = {"BH", "BW", "NC"};
+  std::string inc_pb = b_lut_ ? "db" + bcb1 : "TK" + ldb0;
+  std::string inc_pdb = b_trans_ ? "incd" : "TK";
+  std::string a_delta_mem = is_a_deltas_cst ? "__constant__" : "";
+  std::string b_delta_mem = is_b_deltas_cst_? "__constant__" : "";
+  std::string masks_mem = is_mask_cst_? "__constant__" : "";
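The kernel source emitted below is stitched from these knobs, so the same statement comes out transposed or not depending on b_trans_. A runnable reduction of just that stitching (the strings are copied from this function):

    #include <iostream>
    #include <string>
    int main() {
      for (bool b_trans : {true, false}) {
        std::string BS   = b_trans ? "[TN,TK]" : "[TK, TN]";
        std::string bcb0 = b_trans ? "[:, newaxis]" : "[newaxis, :]";
        std::string bcb1 = b_trans ? "[newaxis, :]" : "[:, newaxis]";
        std::string useb = b_trans ? "trans(b)" : "b";
        std::cout << "fp32* pb" << BS << " = b + rb1" << bcb1
                  << " + rb0" << bcb0 << "*ldb_k;   // consumed as dot(a, " << useb << ", C)\n";
      }
      return 0;
    }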
+
+  os <<
+R"(
+const tunable int32 TM = {16, 32, 64};
+const tunable int32 TN = {16, 32, 64};
+const tunable int32 TK = {8};
+const tunable int32 GZ = {1};
+)";
+if(is_a_deltas_cst)
+  os << "__constant__ int32* delta = alloc_const int32[" + std::to_string(h_a_deltas_.size()) + "];\n";
+if(b_lut_ && is_b_deltas_cst_)
+  os << "__constant__ int32* b_delta = alloc_const int32[" + std::to_string(h_b_deltas_.size()) + "];\n";
+if(is_mask_cst_)
+  os << "__constant__ int32* masks = alloc_const int32[" + std::to_string(h_masks_.size()) + "];\n";
+os << R"(
+
+ void conv(read_only restrict fp32 *a,
+           read_only restrict fp32 *b,
+           fp32 *c,
+           fp32 *bias,
+           int32 M, int32 N, int32 K,
+           int32 AH, int32 AW,
+           int32 BH, int32 BW,
+           int32 CH, int32 CW,
+           int32 NC,
+           int32 lda_n, int32 lda_c, int32 lda_d, int32 lda_h, int32 lda_w,
+           int32 ldb_c, int32 ldb_t, int32 ldb_r, int32 ldb_s, int32 ldb_k,
+           int32 ldc_n, int32 ldc_k, int32 ldc_m, int32 ldc_p, int32 ldc_q,
+           int32 pad_h, int32 pad_w,
+           int32 stride_h, int32 stride_w,
+           int32 upsample_h, int32 upsample_w,
+           int32 off_uh, int32 off_uw,
+           int32 off_uah, int32 off_uaw,
+           int32 off_uch, int32 off_ucw,
+           int32 *locks, int32 grid0, int32 grid1)";
+if(!is_a_deltas_cst)
+  os << ", int32* delta";
+if(b_lut_ && !is_b_deltas_cst_)
+  os << ", int32* b_delta";
+if(!is_mask_cst_)
+  os << ", int32* masks";
+  os << R"(){
+    int32 rxa[TM] = get_global_range[TM](0);
+    int32 rb0[TN] = get_global_range[TN](1);
+    int32 rz = get_global_range[1](2);
+    int32 rka[TK] = 0 ... TK;
+    int32 rkb[TK] = 0 ... TK;
+    fp32 C[TM, TN] = 0;
+    int32 ldlut = )" + std::to_string(Luts_) + R"(;
+    int32 div = K / GZ;
+    int32 rem = K % GZ;
+    K = select(rz < rem, div, div + rem);
+    int32 offk = rz*div;
+    rka = rka + offk;
+    rkb = rkb + offk;
+    int32 rabh[TM] = rxa / CW;
+    int32 raw[TM] = rxa % CW;
+    int32 rab[TM] = rabh / CH;
+    int32 rah[TM] = rabh % CH;
+    rah = rah)" + upaw + R"( - off_uah;
+    raw = raw)" + upah + R"( - off_uaw;
+    int32 ra0[TM] = rab*lda_n + rah*lda_h + raw*lda_w;
+    int32 ra)" + ax[0] + ax[1] + "[TK] = rka / " + redax[2] + R"(;
+    int32 ra)" + ax[2] + "[TK] = rka % " + redax[2] + R"(;
+    int32 ra)" + ax[0] + "[TK] = ra" + ax[0] + ax[1] + " / " + redax[1] + R"(;
+    int32 ra)" + ax[1] + "[TK] = ra" + ax[0] + ax[1] + " % " + redax[1] + R"(;
+    rar = )" + flipr + R"( rar;
+    ras = )" + flips + R"( ras;
+    rar = )" + upar + R"( rar;
+    ras = )" + upas + R"( ras;
+    int32 ra1[TK] = rac*lda_c + rar*lda_h + ras*lda_w;
+    fp32* pa[TM, TK] = a + ra1[newaxis, :] + ra0[:, newaxis];)";
+if(b_lut_){
+  os << R"(
+    int32 rb)" + ax[0] + ax[1] + "[TK] = rkb / " + redax[2] + R"(;
+    int32 rb)" + ax[2] + "[TK] = rkb % " + redax[2] + R"(;
+    int32 rb)" + ax[0] + "[TK] = rb" + ax[0] + ax[1] + " / " + redax[1] + R"(;
+    int32 rb)" + ax[1] + "[TK] = rb" + ax[0] + ax[1] + " % " + redax[1] + R"(;
+    rbr = rbr*upsample_h + off_uh;
+    rbs = rbs*upsample_w + off_uw;
+    int32 offdb[TK] = rkb % ldlut;
+    int32 rb1[TK] = rbc*ldb_c + rbr*ldb_r + rbs*ldb_s;
+    )" + b_delta_mem + R"( int32* pdb[TK] = b_delta + offdb + off_uw*ldlut + off_uh*ldlut*upsample_w;
+    int32 db[TK] = *pdb;)";
+}
+else{
+os << R"(
+    int32 rb1[TK] = rkb)" + ldb0 + ";";
+}
+os << R"(
+    fp32* pb)" + BS + " = b + rb1" + bcb1 + " + rb0" + bcb0 + R"(*ldb_k;
+    int32 offda[TK] = rka % ldlut;
+    )" + a_delta_mem + R"( int32* pincd[TK] = delta + offda;
+    )" + a_delta_mem + R"( int32* pda[TK] = delta + ldlut + offda + off_uw*ldlut + off_uh*ldlut*upsample_w;
+    int32 da[TK] = *pda;
+    int32 incd[TK] = *pincd;
+    int32 maskh[TM] = pad_h + min(rah, 0) + max(rah + BH - AH, 0);
+    int32 maskw[TM] = pad_w + min(raw, 0) + max(raw + BW - AW, 0);
+    int32 offma = offk % ldlut;
+    )" + masks_mem + R"( int32* pm[TM] = masks + ldlut + offma + maskw*ldlut + maskh*ldlut*(2*pad_w + 1) + off_uw*ldlut*(2*pad_w+1)*(2*pad_h+1) + off_uh*ldlut*(2*pad_w+1)*(2*pad_h+1)*upsample_w;
+    )" + a_delta_mem + R"( int32* pincm[TM] = delta + offma;
+    int32 incm[TM] = *pincm;
+    int32 maska0[TM] = *pm;
+    int32 maska1[TK] = 1 << (0 ... TK);
+    int1 checka[TM, TK] = (maska0[:, newaxis] & maska1[newaxis, :]) > 0;
+    int1 checkb0[TN] = rb0 < N;
+    int1 checkb)" + BS + " = checkb0" + bcb0 + R"(;
+    fp32 a[TM, TK] = checka ? *pa : 0;
+    fp32 b)" + BS + R"( = checkb ? *pb : 0;
+    int32 rkamin[TK] = rka - offk + TK;
+    for(int32 k = K; k > 0; k = k - TK){
+      C = dot(a, )" + useb + R"(, C);
+      pa = pa + da[newaxis, :];
+      pb = pb + )" + inc_pb + R"(;
+      pda = pda + incd;)";
+if(b_lut_){
+  os << R"(
+      pdb = pdb + )" + inc_pdb + R"(;
+      db = *pdb;)";
+}
+  os << R"(
+      pincd = pincd + incd;
+      da = *pda;
+      incd = *pincd;
+      pm = pm + incm;
+      pincm = pincm + incm;
+      incm = *pincm;
+      int1 checka1[TK] = (rkamin < k);
+      maska0 = *pm;
+      checka = (maska0[:, newaxis] & maska1[newaxis, :]) > 0;
+      checka = checka && checka1[newaxis,:];
*pa : 0; + checkb = checkb && (k > TK); + @checkb b = *pb; + } + int32 rxc[TM] = get_global_range[TM](0); + int32 rc1[TN] = get_global_range[TN](1); + int32 rcn[TM] = rxc / (CH*CW); + int32 rcpq[TM] = rxc % (CH*CW); + int32 rcp[TM] = rcpq / CW; + int32 rcq[TM] = rcpq % CW; + rcp = rcp * upsample_h + off_uch; + rcq = rcq * upsample_w + off_ucw; + int1 checkc1[TN] = rc1 < N; + int32 rc0[TM] = rcn * ldc_n + rcp * ldc_p + rcq * ldc_q; + fp32* pc[TM, TN] = c + rc1[newaxis, :]*ldc_k + rc0[:, newaxis]; + int1 checkc0[TM] = rxc < M; + int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; + int32 ridx = get_range_id(0); + int32 ridy = get_range_id(1); + int32 *plock = locks + ridx + ridy*grid0; + int32 *pcount = plock + grid0*grid1; + while(__atomic_cas(plock, 0, 1)); + int32 count = *pcount; + int32 countp1 = select(count == GZ - 1, 0, count + 1); + if(count == 0) {)"; + if(bias_ && ty_==FPROP){ + os << R"( + fp32* pbias[TN] = bias + rc1; + fp32 bias[TN] = checkc1 ? *pbias : 0; + C = C + bias[newaxis, :];)"; + } + os << R"( + @checkc *pc = C; + *pcount = countp1; + } + else { + @checkc *pc = C + *pc; + *pcount = countp1; + } + __atomic_cas(plock, 1, 0); +})"; +} + template void conv::cpu_ref(float*, float*, float*); template void conv::cpu_xprop(float*, float*, float*); template void conv::cpu_wgrad(float*, float*, float*); diff --git a/lib/dnn/gemm.cpp b/lib/dnn/gemm.cpp new file mode 100644 index 000000000..59f413d81 --- /dev/null +++ b/lib/dnn/gemm.cpp @@ -0,0 +1,137 @@ +#include "triton/driver/stream.h" +#include "triton/driver/kernel.h" +#include "triton/dnn/gemm.h" +#include + +namespace triton{ +namespace dnn{ + + +void gemm::init(driver::stream* stream, driver::buffer* locks) { + std::vector hlocks(2048, 0); + stream->write(locks, false, 0, hlocks); +} + +void gemm::set_arg(driver::kernel *kernel, + driver::buffer *a, driver::buffer *b, driver::buffer *c, + int32_t M, int32_t N, int32_t K, + driver::buffer *locks, int32_t grid_0, int32_t grid_1) { + kernel->setArg(0, a); + kernel->setArg(1, b); + kernel->setArg(2, c); + kernel->setArg(3, M); + kernel->setArg(4, N); + kernel->setArg(5, K); + kernel->setArg(6, M); + kernel->setArg(7, N); + kernel->setArg(8, M); + kernel->setArg(9, locks); + kernel->setArg(10, grid_0); + kernel->setArg(11, grid_1); +} + +std::vector gemm::default_params(bool AT, bool BT) { + if(AT && BT) + return {32, 64, 32, 64, 16, 8, 2, 2, 4, 2, 8, 4, 2, 1}; + else if(AT && !BT) + return {32, 64, 32, 64, 16, 8, 2, 2, 4, 2, 8, 4, 2, 1}; + else if(!AT && BT) + return {16, 2, 64, 16, 2, 64, 16, 8, 2, 2, 8, 8, 8, 1}; + else + return {16, 2, 128, 32, 32, 32, 4, 2, 2, 8, 8, 4, 2, 1}; +} + +std::string gemm::src(bool AT, bool BT) { + std::string AS0 = "TM", AS1 = "TK"; + std::string BS0 = "TK", BS1 = "TN"; + std::string bca0 = "[newaxis, :]", bca1 = "[:, newaxis]"; + std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; + std::string lda0 = "*lda", lda1 = ""; + std::string ldb0 = "", ldb1 = "*ldb"; + std::string usea = AT ? "trans(a)" : "a"; + std::string useb = BT ? 
"trans(b)" : "b"; + if(AT){ + std::swap(AS0, AS1); + std::swap(bca0, bca1); + std::swap(lda0, lda1); + } + if(BT){ + std::swap(BS0, BS1); + std::swap(bcb0, bcb1); + std::swap(ldb0, ldb1); + } + std::string res = +R"( +const tunable int32 TM = {16, 32, 64, 128}; +const tunable int32 TN = {16, 32, 64, 128}; +const tunable int32 TK = {8}; +const tunable int32 GZ = {1}; + +void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C, + int32 M, int32 N, int32 K, + int32 lda, int32 ldb, int32 ldc, + int32 *locks, int32 grid0, int32 grid1) { + int32 rxa[TM] = get_global_range[TM](0); + int32 ryb[TN] = get_global_range[TN](1); + int32 rz = get_global_range[1](2); + int32 rka[TK] = 0 ... TK; + int32 rkb[TK] = 0 ... TK; + fp32 c[TM, TN] = 0; + int32 div = K / GZ; + int32 rem = K % GZ; + K = select(rz < rem, div - 1, div); + int32 offk = select(rz < rem, rz*(div + 1), rz*div + rem); + fp32* pa[)" + AS0 + ", " + AS1 + "] = A + (offk + rka" + bca0 + ")" + lda0 + " + rxa" + bca1 + lda1 + R"(; + fp32* pb[)" + BS0 + ", " + BS1 + "] = B + (offk + rkb" + bcb0 + ")" + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; + fp32 a[)" + AS0 + ", " + AS1 + R"(] = *pa; + fp32 b[)" + BS0 + ", " + BS1 + R"(] = *pb; + int32 last_a = ((M*K - 1) - (TM*TK + 1)) / lda; + int32 last_b = ((K*N - 1) - (TN*TK + 1)) / ldb; + last_a = last_a / TK * TK; + last_b = last_b / TK * TK; + int32 bound = K - max(last_a, last_b); + for(int32 k = K; k > bound; k = k - TK){ + c = dot()" + usea + ", " + useb + R"(, c); + pa = pa + TK)" + lda0 + R"(; + pb = pb + TK)" + ldb0 + R"(; + a = *pa; + b = *pb; + } + int32 rxc[TM] = get_global_range[TM](0); + int32 ryc[TN] = get_global_range[TN](1); + for(int32 k = bound; k > 0; k = k - 1){ + int1 checka[TM, 1] = rxc[:, newaxis] < M; + int1 checkb[TN, 1] = ryc[:, newaxis] < N; + fp32* pa[TM, 1] = A + (offk + K - k))" + lda0 + " + rxc[:, newaxis]" + lda1 + R"(; + fp32* pb[TN, 1] = B + (offk + K - k))" + ldb0 + " + ryc[:, newaxis]" + ldb1 + R"(; + fp32 a[TM, 1] = checka ? *pa : 0; + fp32 b[TN, 1] = checkb ? 
*pb : 0; + c = dot(a, trans(b), c); + } + int32 ridx = get_range_id(0); + int32 ridy = get_range_id(1); + int1 checkc0[TM] = rxc < M; + int1 checkc1[TN] = ryc < N; + int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; + fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; + int32 *plock = locks + ridx + ridy*grid0; + while(__atomic_cas(plock, 0, 1)); + int32 *pcount = plock + grid0*grid1; + int32 count = *pcount; + int32 countp1 = select(count == GZ - 1, 0, count + 1); + if(count == 0) { + @checkc *pc = c; + *pcount = countp1; + } + else { + @checkc *pc = c + *pc; + *pcount = countp1; + } + __atomic_cas(plock, 1, 0); +} +)"; + return res; +} + +} +} diff --git a/lib/driver/buffer.cpp b/lib/driver/buffer.cpp index a64e0aeca..cf96aa115 100755 --- a/lib/driver/buffer.cpp +++ b/lib/driver/buffer.cpp @@ -88,10 +88,10 @@ cu_buffer::cu_buffer(driver::context* context, CUdeviceptr cu, bool take_ownersh : buffer(context, cu, take_ownership){ } -void cu_buffer::set_zero(cu_stream const & queue, size_t size) +void cu_buffer::set_zero(driver::stream* queue, size_t size) { cu_context::context_switcher ctx_switch(*context_); - dispatch::cuMemsetD8Async(*cu_, 0, size, *queue.cu()); + dispatch::cuMemsetD8Async(*cu_, 0, size, *queue->cu()); } } diff --git a/lib/driver/dispatch.cpp b/lib/driver/dispatch.cpp index f02d4ea2e..e0f75a586 100755 --- a/lib/driver/dispatch.cpp +++ b/lib/driver/dispatch.cpp @@ -101,20 +101,6 @@ namespace driver #define NVML_DEFINE2(ret, fname, t1, t2) DEFINE2(nvmlinit, nvml_, ret, fname, t1, t2) #define NVML_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(nvmlinit, nvml_, ret, fname, t1, t2, t3) -#define CUBLAS_DEFINE1(ret, fname, t1) DEFINE1(cublasinit, cublas_, ret, fname, t1) -#define CUBLAS_DEFINE13(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) DEFINE13(cublasinit, cublas_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) -#define CUBLAS_DEFINE19(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19) DEFINE19(cublasinit, cublas_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19) - -#define CUDNN_DEFINE1(ret, fname, t1) DEFINE1(cudnninit, cudnn_, ret, fname, t1) -#define CUDNN_DEFINE2(ret, fname, t1, t2) DEFINE2(cudnninit, cudnn_, ret, fname, t1, t2) -#define CUDNN_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(cudnninit, cudnn_, ret, fname, t1, t2, t3) -#define CUDNN_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(cudnninit, cudnn_, ret, fname, t1, t2, t3, t4, t5) -#define CUDNN_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(cudnninit, cudnn_, ret, fname, t1, t2, t3, t4, t5, t6) -#define CUDNN_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(cudnninit, cudnn_, ret, fname, t1, t2, t3, t4, t5, t6, t7) -#define CUDNN_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(cudnninit, cudnn_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) -#define CUDNN_DEFINE13(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) DEFINE13(cudnninit, cudnn_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) - - bool dispatch::clinit() { if(opencl_==nullptr) @@ -146,18 +132,6 @@ bool dispatch::nvmlinit(){ return res; } -bool dispatch::cublasinit(){ - if(cublas_==nullptr) - cublas_ = dlopen("libcublas.so", RTLD_LAZY); - return cublas_ != nullptr; -} - -bool dispatch::cudnninit(){ - if(cudnn_==nullptr) - cudnn_ = dlopen("libcudnn.so", RTLD_LAZY); - return cudnn_ != nullptr; -} - bool dispatch::spvllvminit(){ 
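
[annotation] Both the conv and matmul kernels above end with the same split-K epilogue: each of the GZ workers along the z axis computes a partial tile, then serializes on a per-tile spinlock stored in `locks`; the first arrival stores its tile, later arrivals accumulate into it, and the counter wraps back to zero so the lock array stays initialized for the next launch. A minimal host-side C++ analogue of that protocol, for illustration only (this is a sketch of the idea, not the device code path):

    #include <atomic>

    // Sketch of the kernels' reduction protocol across GZ partial producers.
    void reduce_tile(std::atomic<int> &lock, int &count, float &c_out,
                     float c_partial, int GZ) {
      // while(__atomic_cas(plock, 0, 1)); -- spin until we own the tile
      int expected = 0;
      while (!lock.compare_exchange_weak(expected, 1))
        expected = 0;
      int countp1 = (count == GZ - 1) ? 0 : count + 1; // wrap so locks stay reusable
      if (count == 0)
        c_out = c_partial;   // first writer initializes the tile (and adds bias in FPROP)
      else
        c_out += c_partial;  // later writers accumulate
      count = countp1;
      lock.store(0);         // __atomic_cas(plock, 1, 0) -- release
    }
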
if(spvllvm_==nullptr) spvllvm_ = dlopen("libLLVMSPIRVLib.so", RTLD_LAZY); @@ -207,57 +181,6 @@ NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t, nvmlClockType_t NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*) NVML_DEFINE3(nvmlReturn_t, nvmlDeviceSetApplicationsClocks, nvmlDevice_t, unsigned int, unsigned int) -cublasHandle_t dispatch::cublasHandle(const cu_context &ctx){ - static std::map handles; - auto pr = handles.insert({*ctx.cu(), cublasHandle_t()}); - if(pr.second) - cublasCreate_v2(&pr.first->second); - return pr.first->second; -} - -cudnnHandle_t dispatch::cudnnHandle(driver::cu_context const & ctx){ - static std::map handles; - auto pr = handles.insert({*ctx.cu(), cudnnHandle_t()}); - if(pr.second) - cudnnCreate(&pr.first->second); - return pr.first->second; -} - -CUBLAS_DEFINE1(cublasStatus_t, cublasCreate_v2, cublasHandle_t*) -cublasStatus_t dispatch::cublasGetStream_v2(cublasHandle_t h, cudaStream_t *a) -{ return f_impl(cublas_, cublasGetStream_v2, cublasGetStream_v2_, "cublasGetStream_v2", h, a); } -cublasStatus_t dispatch::cublasSetStream_v2(cublasHandle_t h, cudaStream_t a) -{ return f_impl(cublas_, cublasSetStream_v2, cublasSetStream_v2_, "cublasSetStream_v2", h, a); } -cublasStatus_t dispatch::cublasSgemm_v2(cublasHandle_t h, cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, float* alpha, const float *A, int lda, const float *B, int ldb, float* beta, float *C, int ldc) -{ return f_impl(cublas_, cublasSgemm_v2, cublasSgemm_v2_, "cublasSgemm_v2", h, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);} -cublasStatus_t dispatch::cublasDgemm_v2(cublasHandle_t h, cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, double* alpha, const double *A, int lda, const double *B, int ldb, double* beta, double *C, int ldc) -{ return f_impl(cublas_, cublasDgemm_v2, cublasDgemm_v2_, "cublasDgemm_v2", h, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);} -cublasStatus_t dispatch::cublasHgemm(cublasHandle_t h, cublasOperation_t at, cublasOperation_t bt, int m, int n, int k, half* alpha, const half *A, int lda, const half *B, int ldb, half* beta, half *C, int ldc) -{ return f_impl(cublas_, cublasHgemm, cublasHgemm_, "cublasHgemm", h, at, bt, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);} -CUBLAS_DEFINE19(cublasStatus_t, cublasGemmEx, cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const void*, const void*, cudaDataType, int, const void*, cudaDataType, int, const void*, void*, cudaDataType, int, cudaDataType, cublasGemmAlgo_t) - -//cuDNN -CUDNN_DEFINE1(cudnnStatus_t, cudnnCreateConvolutionDescriptor, cudnnConvolutionDescriptor_t*) -CUDNN_DEFINE1(cudnnStatus_t, cudnnCreateTensorDescriptor, cudnnTensorDescriptor_t*) -CUDNN_DEFINE1(cudnnStatus_t, cudnnCreateFilterDescriptor, cudnnFilterDescriptor_t*) -CUDNN_DEFINE1(cudnnStatus_t, cudnnCreate, cudnnHandle_t*) -CUDNN_DEFINE7(cudnnStatus_t, cudnnSetTensor4dDescriptor, cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, int, int, int) -CUDNN_DEFINE7(cudnnStatus_t, cudnnSetFilter4dDescriptor, cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, int, int, int) -CUDNN_DEFINE5(cudnnStatus_t, cudnnSetTensorNdDescriptorEx, cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, const int*) -CUDNN_DEFINE5(cudnnStatus_t, cudnnSetFilterNdDescriptor, cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, const int*) -CUDNN_DEFINE1(cudnnStatus_t, cudnnCreatePoolingDescriptor, 
cudnnPoolingDescriptor_t*) -CUDNN_DEFINE7(cudnnStatus_t, cudnnSetPoolingNdDescriptor, cudnnPoolingDescriptor_t, const cudnnPoolingMode_t, const cudnnNanPropagation_t, int, const int*, const int*, const int*) -CUDNN_DEFINE8(cudnnStatus_t, cudnnPoolingForward, cudnnHandle_t, const cudnnPoolingDescriptor_t, const void*, const cudnnTensorDescriptor_t, const void*, const void*, const cudnnTensorDescriptor_t, void*) - - -CUDNN_DEFINE8(cudnnStatus_t, cudnnSetConvolution2dDescriptor, cudnnConvolutionDescriptor_t, int, int, int, int, int, int, cudnnConvolutionMode_t) -CUDNN_DEFINE7(cudnnStatus_t, cudnnSetConvolutionNdDescriptor, cudnnConvolutionDescriptor_t, int, const int*, const int*, const int*, cudnnConvolutionMode_t, cudnnDataType_t) -CUDNN_DEFINE8(cudnnStatus_t, cudnnGetConvolutionForwardAlgorithm, cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t, cudnnConvolutionFwdAlgo_t *) -CUDNN_DEFINE7(cudnnStatus_t, cudnnGetConvolutionForwardWorkspaceSize, cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t*) -CUDNN_DEFINE13(cudnnStatus_t, cudnnConvolutionForward, cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *) -CUDNN_DEFINE2(cudnnStatus_t, cudnnSetStream, cudnnHandle_t, cudaStream_t) -CUDNN_DEFINE7(cudnnStatus_t, cudnnTransformTensor, cudnnHandle_t, const void*, const cudnnTensorDescriptor_t, const void*, const void*, const cudnnTensorDescriptor_t, void*) - // OpenCL cl_int dispatch::clBuildProgram(cl_program a, cl_uint b, const cl_device_id * c, const char * d, void (*e)(cl_program, void *), void * f) { return f_impl(opencl_, clBuildProgram, clBuildProgram_, "clBuildProgram", a, b, c, d, e, f); } @@ -313,21 +236,11 @@ void dispatch::release(){ dlclose(cuda_); cuda_ = nullptr; } - if(cublas_){ - dlclose(cublas_); - cublas_ = nullptr; - } - if(cudnn_){ - dlclose(cudnn_); - cudnn_ = nullptr; - } } void * dispatch::opencl_; void* dispatch::cuda_; void* dispatch::nvml_; -void* dispatch::cublas_; -void* dispatch::cudnn_; void* dispatch::spvllvm_; //OpenCL @@ -410,33 +323,6 @@ void* dispatch::nvmlDeviceGetClockInfo_; void* dispatch::nvmlDeviceGetMaxClockInfo_; void* dispatch::nvmlDeviceSetApplicationsClocks_; -void* dispatch::cublasCreate_v2_; -void* dispatch::cublasGetStream_v2_; -void* dispatch::cublasSetStream_v2_; -void* dispatch::cublasHgemm_; -void* dispatch::cublasSgemm_v2_; -void* dispatch::cublasDgemm_v2_; -void* dispatch::cublasGemmEx_; - -void* dispatch::cudnnCreateConvolutionDescriptor_; -void* dispatch::cudnnCreatePoolingDescriptor_; -void* dispatch::cudnnCreateTensorDescriptor_; -void* dispatch::cudnnCreateFilterDescriptor_; -void* dispatch::cudnnCreate_; -void* dispatch::cudnnSetTensor4dDescriptor_; -void* dispatch::cudnnSetFilter4dDescriptor_; -void* dispatch::cudnnSetTensorNdDescriptorEx_; -void* dispatch::cudnnSetFilterNdDescriptor_; -void* dispatch::cudnnSetPoolingNdDescriptor_; -void* dispatch::cudnnSetConvolution2dDescriptor_; -void* dispatch::cudnnSetConvolutionNdDescriptor_; -void* dispatch::cudnnGetConvolutionForwardAlgorithm_; -void* dispatch::cudnnGetConvolutionForwardWorkspaceSize_; -void* dispatch::cudnnConvolutionForward_; 
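
[annotation] For context on what is being deleted here: every backend in `driver::dispatch` follows the same lazy-binding pattern (dlopen the library once, resolve each symbol on first use via `f_impl`, cache it in a `void*` static), so dropping cuBLAS/cuDNN amounts to removing the macro-generated forwarders and their cached symbol slots. A sketch approximating what the DEFINE*/f_impl machinery expands to, using a hypothetical library — `libfoo.so` and `fooCreate` are illustrative names, not part of the tree:

    #include <dlfcn.h>
    #include <stdexcept>

    struct dispatch {
      // dlopen once, keep the handle for the lifetime of the process
      static bool fooinit() {
        if (foo_ == nullptr)
          foo_ = dlopen("libfoo.so", RTLD_LAZY);
        return foo_ != nullptr;
      }
      // resolve the symbol on first call, then call through the cached pointer
      static int fooCreate(void **handle) {
        typedef int (*fn_t)(void **);
        if (fooCreate_ == nullptr)
          fooCreate_ = dlsym(foo_, "fooCreate");
        if (fooCreate_ == nullptr)
          throw std::runtime_error("fooCreate: symbol not found");
        return reinterpret_cast<fn_t>(fooCreate_)(handle);
      }
      static void *foo_;
      static void *fooCreate_;
    };
    void *dispatch::foo_ = nullptr;
    void *dispatch::fooCreate_ = nullptr;
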
-void* dispatch::cudnnPoolingForward_; -void* dispatch::cudnnSetStream_; -void* dispatch::cudnnTransformTensor_; - // SPIR-V void* dispatch::initializeLLVMToSPIRVPass_; void* dispatch::writeSpirv_; diff --git a/lib/driver/error.cpp b/lib/driver/error.cpp index 99b2401dd..ea7d1721a 100755 --- a/lib/driver/error.cpp +++ b/lib/driver/error.cpp @@ -94,45 +94,6 @@ void check(CUresult err) } } -void check(cublasStatus_t err){ - using namespace exception::cublas; - switch(err) - { - case CUBLAS_STATUS_SUCCESS : break; - case CUBLAS_STATUS_NOT_INITIALIZED : throw not_initialized(); - case CUBLAS_STATUS_ALLOC_FAILED : throw alloc_failed(); - case CUBLAS_STATUS_INVALID_VALUE : throw invalid_value(); - case CUBLAS_STATUS_ARCH_MISMATCH : throw arch_mismatch(); - case CUBLAS_STATUS_MAPPING_ERROR : throw mapping_error(); - case CUBLAS_STATUS_EXECUTION_FAILED: throw execution_failed(); - case CUBLAS_STATUS_INTERNAL_ERROR : throw internal_error(); - case CUBLAS_STATUS_NOT_SUPPORTED : throw not_supported(); - case CUBLAS_STATUS_LICENSE_ERROR : throw license_error(); - default : throw unknown(); - } -} - -void check(cudnnStatus_t err){ - using namespace exception::cudnn; - switch(err) - { - case CUDNN_STATUS_SUCCESS: break; - case CUDNN_STATUS_NOT_INITIALIZED: throw not_initialized(); - case CUDNN_STATUS_ALLOC_FAILED: throw alloc_failed(); - case CUDNN_STATUS_BAD_PARAM: throw bad_param(); - case CUDNN_STATUS_INTERNAL_ERROR: throw internal_error(); - case CUDNN_STATUS_INVALID_VALUE: throw invalid_value(); - case CUDNN_STATUS_ARCH_MISMATCH: throw arch_mismatch(); - case CUDNN_STATUS_MAPPING_ERROR: throw mapping_error(); - case CUDNN_STATUS_EXECUTION_FAILED: throw execution_failed(); - case CUDNN_STATUS_NOT_SUPPORTED: throw not_supported(); - case CUDNN_STATUS_LICENSE_ERROR: throw license_error(); - case CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING: throw runtime_prerequisite_missing(); - case CUDNN_STATUS_RUNTIME_IN_PROGRESS: throw runtime_in_progress(); - case CUDNN_STATUS_RUNTIME_FP_OVERFLOW: throw runtime_fp_overflow(); - } -} - void check(cl_int err) { using namespace exception::ocl; diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index 58d3aef73..bb8637648 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -97,7 +97,7 @@ jit::jit(driver::context *context): driver_context_(context), jit::~jit(){ } -void jit::autotune(const char *name, const char *src, benchmark_t benchmark) { +jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t benchmark) { // find metaparameters auto ptt_module = make_triton_module(name, src); ir::module &tt_module = *ptt_module; @@ -113,7 +113,7 @@ void jit::autotune(const char *name, const char *src, benchmark_t benchmark) { // std::cout << ranges.size() << std::endl; // iterate over parameters unsigned i; - double best = 0; + tune_res_t best; loop_nest(ranges, [&](const std::vector params){ std::map> errors; i = 0; @@ -153,10 +153,14 @@ void jit::autotune(const char *name, const char *src, benchmark_t benchmark) { modules_.insert({name, module.get()}); double perf; perf = benchmark(kernel.get(), info); - best = std::max(perf, best); - std::cout << perf << " [ " << best << " ] " << std::endl; + if(perf > best.perf){ + best.perf = perf; + best.params = params; + } + std::cout << perf << " [ " << best.perf << " ] " << std::endl; modules_.erase(name); }); + return best; } void jit::add_module(ir::module &tt_module, const std::vector ¶ms) { From 8102efc0643380ba3ac176165b4e49375b56702b Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 28 May 
2019 14:14:33 -0400 Subject: [PATCH 158/494] [triton/examples/cpp] removed common.hpp helper --- examples/cpp/common.hpp | 59 --------------------------------------- examples/cpp/dot.cpp | 3 +- examples/cpp/shift.cpp | 32 ++++++++++++++++++++- include/triton/dnn/gemm.h | 23 +++++++++++++++ 4 files changed, 55 insertions(+), 62 deletions(-) delete mode 100644 examples/cpp/common.hpp diff --git a/examples/cpp/common.hpp b/examples/cpp/common.hpp deleted file mode 100644 index f92bbeb69..000000000 --- a/examples/cpp/common.hpp +++ /dev/null @@ -1,59 +0,0 @@ -#include -#include -#include -#include "triton/driver/device.h" -#include - -template -void simple_gemm(std::vector &c, const std::vector &a, const std::vector &b, size_t M, size_t N, size_t K){ - for(size_t m = 0; m < M; m++) - for(size_t n = 0; n < N; n++){ - T acc = 0; - for(size_t k = 0; k < K; k++) - acc += (AT?a[k + m*K]:a[m + k*M]) * (BT?b[n + k*N]:b[k + n*K]); - c[m + n*M] = acc; - } -} - -template -void simple_gemm(bool AT, bool BT, std::vector &c, const std::vector &a, const std::vector &b, size_t M, size_t N, size_t K) { - if(AT && BT) - simple_gemm(c, a, b, M, N, K); - else if(AT && !BT) - simple_gemm(c, a, b, M, N, K); - else if(!AT && BT) - simple_gemm(c, a, b, M, N, K); - else - simple_gemm(c, a, b, M, N, K); -} - -// input layout: C, H, W, BS -// filter layout: C, K -// output layout: K, H, W, BS -template -void shift_conv(int32_t C, int32_t H, int32_t W, int32_t BS, - int32_t K, - std::vector& O, - const std::vector& I, - const std::vector& F, - const std::vector shift_h, - const std::vector shift_w) -{ - OUT_DTYPE acc; - for(int32_t p = 0; p < H; ++p) - for(int32_t q = 0; q < W; ++q) - for(int32_t bs = 0; bs < BS; ++bs) - for(int32_t k = 0; k < K; ++k) - { - acc = 0; - for(int32_t c = 0; c < C; ++c){ - int32_t h = p + shift_h[c]; - int32_t w = q + shift_w[c]; - bool in_bounds = (h >= 0 && w >= 0 && h < H && w < W); - IN_DTYPE a = in_bounds?I[bs + w*BS + h*BS*W + c*BS*H*W]:0; - IN_DTYPE b = F[k + c*K]; - acc = std::fma(a, b, acc); - } - O[bs + q*BS + p*BS*W + k*BS*H*W] = acc; - } -} diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 4e805ce9d..abaed5ff3 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -1,6 +1,5 @@ #include #include -#include "common.hpp" #include "triton/runtime/jit.h" #include "triton/driver/backend.h" #include "triton/driver/stream.h" @@ -67,7 +66,7 @@ int main() { triton::jit::launch_information info = jit.get_launch_info("matmul"); std::cout << "Performance: " << benchmark(kernel, info) << " TFLOPS " << std::endl; stream->read(dc, true, 0, hc); - simple_gemm(AT, BT, rc, ha, hb, M, N, K); + triton::dnn::gemm::cpu_ref(AT, BT, rc, ha, hb, M, N, K); for(size_t i = 0; i < M*N; i++) if(!std::isnan(hc[i]) && std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index 2cd17643e..b244e8ec2 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -1,11 +1,41 @@ #include #include -#include "common.hpp" #include "triton/runtime/jit.h" #include "triton/driver/backend.h" #include "triton/driver/stream.h" #include "triton/tools/bench.hpp" +// input layout: C, H, W, BS +// filter layout: C, K +// output layout: K, H, W, BS +template +void shift_conv(int32_t C, int32_t H, int32_t W, int32_t BS, + int32_t K, + std::vector& O, + const std::vector& I, + const std::vector& F, + const std::vector shift_h, + const std::vector shift_w) +{ + OUT_DTYPE acc; + 
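+  // Reference semantics: O[k,p,q,bs] = sum_c I[c, p+shift_h[c], q+shift_w[c], bs] * F[c,k];
+  // i.e. a per-channel spatial shift of the input followed by a 1x1 convolution,
+  // which is what lets shift.cpp map the operation onto a GEMM (see the
+  // K/M/N layout comment below).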
for(int32_t p = 0; p < H; ++p) + for(int32_t q = 0; q < W; ++q) + for(int32_t bs = 0; bs < BS; ++bs) + for(int32_t k = 0; k < K; ++k) + { + acc = 0; + for(int32_t c = 0; c < C; ++c){ + int32_t h = p + shift_h[c]; + int32_t w = q + shift_w[c]; + bool in_bounds = (h >= 0 && w >= 0 && h < H && w < W); + IN_DTYPE a = in_bounds?I[bs + w*BS + h*BS*W + c*BS*H*W]:0; + IN_DTYPE b = F[k + c*K]; + acc = std::fma(a, b, acc); + } + O[bs + q*BS + p*BS*W + k*BS*H*W] = acc; + } +} + // K = channels // M = batch * height * width // N = number of feature maps diff --git a/include/triton/dnn/gemm.h b/include/triton/dnn/gemm.h index abc5b8d1b..0697ea981 100644 --- a/include/triton/dnn/gemm.h +++ b/include/triton/dnn/gemm.h @@ -14,6 +14,29 @@ public: driver::buffer *locks, int32_t grid_0, int32_t grid_1); static std::vector default_params(bool AT, bool BT); static std::string src(bool AT, bool BT); + + template + static void cpu_ref(std::vector &c, const std::vector &a, const std::vector &b, size_t M, size_t N, size_t K){ + for(size_t m = 0; m < M; m++) + for(size_t n = 0; n < N; n++){ + T acc = 0; + for(size_t k = 0; k < K; k++) + acc += (AT?a[k + m*K]:a[m + k*M]) * (BT?b[n + k*N]:b[k + n*K]); + c[m + n*M] = acc; + } + } + + template + static void cpu_ref(bool AT, bool BT, std::vector &c, const std::vector &a, const std::vector &b, size_t M, size_t N, size_t K) { + if(AT && BT) + gemm::cpu_ref(c, a, b, M, N, K); + else if(AT && !BT) + gemm::cpu_ref(c, a, b, M, N, K); + else if(!AT && BT) + gemm::cpu_ref(c, a, b, M, N, K); + else + gemm::cpu_ref(c, a, b, M, N, K); + } }; } From d2a46afe006380470738e87219cfcc52d7c9e189 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 28 May 2019 17:07:54 -0400 Subject: [PATCH 159/494] [triton/ast]: cleaned the ast module --- include/triton/ast/ast.h | 695 +------------------------ include/triton/ast/declaration.h | 222 ++++++++ include/triton/ast/error.h | 62 +++ include/triton/ast/expression.h | 311 +++++++++++ include/triton/ast/module.h | 37 ++ include/triton/ast/node.h | 77 +++ include/triton/ast/ops.h | 60 +++ include/triton/ast/parser.y | 241 ++++----- include/triton/ast/statement.h | 121 +++++ lib/ast/declaration.cpp | 199 +++++++ lib/ast/error.cpp | 49 ++ lib/ast/expression.cpp | 329 ++++++++++++ lib/ast/lowering.cpp | 855 ------------------------------- lib/ast/module.cpp | 18 + lib/ast/node.cpp | 160 ++++++ lib/ast/statement.cpp | 160 ++++++ 16 files changed, 1937 insertions(+), 1659 deletions(-) create mode 100644 include/triton/ast/declaration.h create mode 100644 include/triton/ast/error.h create mode 100644 include/triton/ast/expression.h create mode 100644 include/triton/ast/module.h create mode 100644 include/triton/ast/node.h create mode 100644 include/triton/ast/ops.h create mode 100644 include/triton/ast/statement.h create mode 100644 lib/ast/declaration.cpp create mode 100644 lib/ast/error.cpp create mode 100644 lib/ast/expression.cpp delete mode 100644 lib/ast/lowering.cpp create mode 100644 lib/ast/module.cpp create mode 100644 lib/ast/node.cpp create mode 100644 lib/ast/statement.cpp diff --git a/include/triton/ast/ast.h b/include/triton/ast/ast.h index 3d1da3064..26282894e 100644 --- a/include/triton/ast/ast.h +++ b/include/triton/ast/ast.h @@ -1,691 +1,12 @@ -#ifndef TDL_INCLUDE_AST_H -#define TDL_INCLUDE_AST_H +#ifndef TRITON_INCLUDE_AST_AST_H +#define TRITON_INCLUDE_AST_AST_H +#include "ops.h" #include "parser.hpp" -#include -#include -#include -#include - - -namespace triton{ - - -namespace ir{ - class function; - class value; - class type; 
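
[annotation] A note on the `gemm::cpu_ref` helpers added to include/triton/dnn/gemm.h above: they let every example validate a device result with one call instead of carrying its own `simple_gemm`. A hedged usage sketch mirroring the check in examples/cpp/dot.cpp (the 1e-4 relative tolerance is the one used there; `check_gemm` is an illustrative name):

    #include <algorithm>
    #include <cmath>
    #include <vector>
    #include "triton/dnn/gemm.h"

    // Compare a device result hc against the CPU reference, as dot.cpp does
    // after reading back the output buffer.
    bool check_gemm(bool AT, bool BT,
                    const std::vector<float> &ha, const std::vector<float> &hb,
                    const std::vector<float> &hc,
                    size_t M, size_t N, size_t K) {
      std::vector<float> rc(M * N);
      triton::dnn::gemm::cpu_ref(AT, BT, rc, ha, hb, M, N, K);
      for (size_t i = 0; i < M * N; i++)
        if (!std::isnan(hc[i]) &&
            std::abs(hc[i] - rc[i]) / std::max(hc[i], rc[i]) > 1e-4)
          return false;
      return true;
    }
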
- class builder; - class module; -} - -namespace ast{ - -// Enumerations -enum ASSIGN_OP_T{ - ASSIGN, - INPLACE_MUL, INPLACE_DIV, INPLACE_MOD, - INPLACE_ADD, INPLACE_SUB, - INPLACE_LSHIFT, INPLACE_RSHIFT, - INPLACE_AND, INPLACE_XOR, - INPLACE_OR -}; - -enum BIN_OP_T{ - MUL, DIV, MOD, - ADD, SUB, - LEFT_SHIFT, RIGHT_SHIFT, - LT, GT, - LE, GE, - EQ, NE, - AND, XOR, OR, - LAND, LOR -}; - -enum UNARY_OP_T{ - INC, DEC, - PLUS, MINUS, - ADDR, DEREF, - COMPL, NOT -}; - -enum TYPE_T{ - VOID_T, - UINT1_T, UINT8_T, UINT16_T, UINT32_T, UINT64_T, - INT1_T, INT8_T, INT16_T, INT32_T, INT64_T, - FLOAT32_T, FLOAT64_T -}; - -enum STORAGE_SPEC_T{ - CONST_T, - TUNABLE_T, - KERNEL_T, - RESTRICT_T, - READONLY_T, - CONSTANT_SPACE_T, - WRITEONLY_T -}; - -class pointer; -class identifier; -class constant; - -// AST -class node { -protected: - static ir::value* explicit_cast(ir::builder &builder, ir::value *src, ir::type *dst_ty); - static void implicit_broadcast(ir::module *mod, ir::type *dst_ty, ir::value *&src); - static void implicit_broadcast(ir::module *mod, ir::value *&lhs, ir::value *&rhs); - static void implicit_cast(ir::builder &builder, ir::value *&lhs, ir::value *&rhs, - bool &is_float, bool &is_ptr, bool &is_int, bool &is_signed); -public: - virtual ir::value* codegen(ir::module *) const { return nullptr; } -}; - -template -class list: public node { -public: - list(const T& x): values_(1, x) {} - - node* append(const T& x){ - values_.push_back(x); - return this; - } - - ir::value* codegen(ir::module * mod) const{ - for(T x: values_){ - x->codegen(mod); - } - return nullptr; - } - - const std::vector &values() const - { return values_; } - -private: - std::vector values_; -}; - -enum slice_enum_t{ - ALL, - NEWAXIS -}; - -class slice: public node{ -public: - slice(slice_enum_t type) - : type_(type){} - - slice_enum_t type() const{ - return type_; - } - -public: - const slice_enum_t type_; -}; - -class named_expression; - -class expression: public node{ -public: - virtual ir::value* codegen(ir::module *) const = 0; - named_expression *lvalue() const { return lvalue_; } - -protected: - named_expression *lvalue_; -}; - -class postfix_expression: public expression{ - -}; - -class builtin_expression: public node{ - -}; - -class typed_declaration_specifier; -class alloc_const: public builtin_expression{ -public: - alloc_const(node *spec, node *size): spec_((typed_declaration_specifier*)spec), size_((constant*)size) { } - ir::value* codegen(ir::module *mod) const; - -private: - const typed_declaration_specifier* spec_; - const constant* size_; -}; - -class get_global_range: public builtin_expression{ -public: - get_global_range(node *size, node *axis): size_((constant*)size), axis_((constant*)axis) { } - ir::value* codegen(ir::module *) const; - -private: - const constant* size_; - const constant* axis_; -}; - -class get_range_id: public builtin_expression{ -public: - get_range_id(node *axis): axis_((constant*)axis) { } - ir::value* codegen(ir::module *) const; - -private: - const constant* axis_; -}; - -class atomic_cas: public builtin_expression{ -public: - atomic_cas(node *ptr, node *cmp, node *val): ptr_(ptr), cmp_(cmp), val_(val) { } - ir::value* codegen(ir::module *) const; - -private: - const node *ptr_; - const node *cmp_; - const node *val_; -}; - - -class matmul_expression: public builtin_expression{ -public: - matmul_expression(node* A, node *B, node *C): - A_((expression*)A), B_((expression*)B), C_((expression*)C) { } - ir::value* codegen(ir::module *) const; - -private: - const expression *A_; - 
const expression *B_; - const expression *C_; -}; - -class max_expression: public builtin_expression{ -public: - max_expression(node* x, node* y) - : x_((expression*)x), y_((expression*)y){ } - ir::value* codegen(ir::module *) const; - -private: - const expression *x_; - const expression *y_; -}; - -class min_expression: public builtin_expression{ -public: - min_expression(node* x, node* y) - : x_((expression*)x), y_((expression*)y){ } - ir::value* codegen(ir::module *mod) const; - -private: - const expression *x_; - const expression *y_; -}; - -class select_expression: public builtin_expression{ -public: - select_expression(node* pred, node* if_value, node* else_value) - : pred_((expression*)pred), if_value_((expression*)if_value), else_value_((expression*)else_value) { } - ir::value* codegen(ir::module *mod) const; - -private: - const expression *pred_; - const expression *if_value_; - const expression *else_value_; -}; - -class trans_expression: public builtin_expression{ -public: - trans_expression(node *arg): arg_(arg) {} - ir::value* codegen(ir::module *mod) const; - -private: - node* arg_; -}; - - -class indexing_expression: public postfix_expression{ -public: - indexing_expression(node *id, node *slices) - : id_((const identifier*)id), slices_((const list*)slices) {} - - ir::value* codegen(ir::module *) const; - -private: - const identifier* id_; - const list* slices_; -}; - - - -class named_expression: public expression { -public: - named_expression(node *id): id_((const identifier*)id) { lvalue_ = this; } - const identifier *id() const { return id_; } - ir::value* codegen(ir::module * mod) const; - -private: - const identifier *id_; -}; - -class binary_operator: public expression{ -private: - ir::value* llvm_op(ir::module *mod, ir::builder &bld, ir::value *lhs, ir::value *rhs, const std::string &name) const; - -public: - binary_operator(BIN_OP_T op, node *lhs, node *rhs) - : op_(op), lhs_((expression*)lhs), rhs_((expression*)rhs) { - } - ir::value* codegen(ir::module *) const; - -private: - const BIN_OP_T op_; - const expression *lhs_; - const expression *rhs_; -}; - - -class constant: public expression{ -public: - constant(int value): value_(value) { } - ir::value* codegen(ir::module *mod) const; - int value() const; - -private: - const int value_; -}; - -class constant_range: public expression { -public: - constant_range(node *first, node *last) - : first_((constant*)first), last_((constant*)last) { } - - ir::value* codegen(ir::module *mod) const; - -private: - constant *first_; - constant *last_; -}; - -class string_literal: public expression{ -public: - string_literal(char *&value): value_(value) { } - ir::value* codegen(ir::module *mod) const; - -public: - std::string value_; -}; - -class unary_operator: public expression{ -private: - ir::value *llvm_op(ir::builder &builder, ir::value *arg, const std::string &name) const; - -public: - unary_operator(UNARY_OP_T op, node *arg) - : op_(op), - arg_((expression*)arg) { - if(op == DEREF) - this->lvalue_ = arg_->lvalue(); - } - - UNARY_OP_T get_op() const { return op_; } - ir::value* codegen(ir::module *mod) const; - -private: - const UNARY_OP_T op_; - const expression *arg_; -}; - -class type_name; -class cast_operator: public expression{ -private: - ir::value *llvm_op(ir::builder &builder, ir::type *T, ir::value *arg, const std::string &name) const; - -public: - cast_operator(node *T, node *arg): - T_((type_name*)T), - arg_((expression*)arg) { } - - ir::value* codegen(ir::module *mod) const; - -public: - const type_name *T_; - 
const expression *arg_; -}; - -class conditional_expression: public expression{ -private: - ir::value *llvm_op(ir::builder &builder, - ir::value *cond, ir::value *true_value, ir::value *false_value, - const std::string &name) const; - -public: - conditional_expression(node *cond, node *true_value, node *false_value) - : cond_((expression*)cond), - true_value_((expression*)true_value), - false_value_((expression*)false_value) { } - - ir::value* codegen(ir::module *mod) const; - -public: - const expression *cond_; - const expression *true_value_; - const expression *false_value_; -}; - -class assignment_expression: public expression{ -public: - assignment_expression(node *lvalue, ASSIGN_OP_T op, node *rvalue) - : lvalue_((named_expression*)lvalue), op_(op), rvalue_((expression*)rvalue) { } - - ir::value* codegen(ir::module *mod) const; - const expression *lvalue() const { return lvalue_; } - const expression *rvalue() const { return rvalue_; } - -public: - ASSIGN_OP_T op_; - const expression *lvalue_; - const expression *rvalue_; -}; - - -class initializer; -class declaration_specifier; - -class block_item: public node{ -}; - -class declaration: public block_item{ -public: - declaration(node *spec, node *init) - : spec_((declaration_specifier*)spec), init_((list*)init) { } - - ir::value* codegen(ir::module * mod) const; - -public: - const declaration_specifier *spec_; - const list *init_; -}; - -class statement: public block_item{ -}; - -class expression_statement: public statement{ -public: - expression_statement(node *expr, node *mask = nullptr) - : expr_((expression*)expr), pred_((expression*)mask){ } - - ir::value* codegen(ir::module * mod) const; - -private: - expression *expr_; - expression *pred_; -}; - - -class compound_statement: public statement{ - typedef list* declarations_t; - typedef list* statements_t; - -public: - compound_statement(node* items) - : items_((list*)items){} - - ir::value* codegen(ir::module * mod) const; - -private: - list* items_; -}; - -class selection_statement: public statement{ -public: - selection_statement(node *cond, node *if_value, node *else_value = nullptr) - : cond_(cond), then_value_(if_value), else_value_(else_value) { } - - ir::value* codegen(ir::module *mod) const; - -public: - const node *cond_; - const node *then_value_; - const node *else_value_; -}; - -class iteration_statement: public statement{ -public: - iteration_statement(node *init, node *stop, node *exec, node *statements) - : init_(init), stop_(stop), exec_(exec), statements_(statements) - { } - - ir::value* codegen(ir::module *mod) const; - -private: - const node *init_; - const node *stop_; - const node *exec_; - const node *statements_; -}; - -class while_statement: public statement{ -public: - while_statement(node *cond, node *statements) - : cond_(cond), statements_(statements) - { } - - ir::value* codegen(ir::module *) const; - -private: - const node *cond_; - const node *statements_; -}; - -// Jump - -class jump_statement: public statement{ -public: - using statement::statement; -}; - -class continue_statement: public jump_statement{ -public: - ir::value* codegen(ir::module *mod) const; -}; - -class no_op: public statement { }; - -// Types -class declaration_specifier: public node{ -public: - virtual ir::type* type(ir::module *mod) const = 0; - virtual std::vector storage() const = 0; -}; - -class typed_declaration_specifier: public declaration_specifier { -public: - typed_declaration_specifier(TYPE_T ty): ty_(ty){ } - ir::type* type(ir::module *mod) const; - std::vector 
storage() const; - -private: - const TYPE_T ty_; -}; - -class storage_declaration_specifier: public declaration_specifier { -public: - storage_declaration_specifier(STORAGE_SPEC_T storage_spec, node *decl_spec) - : storage_spec_(storage_spec), decl_spec_((declaration_specifier*)decl_spec) {} - ir::type* type(ir::module *mod) const; - std::vector storage() const; - -private: - const STORAGE_SPEC_T storage_spec_; - const declaration_specifier* decl_spec_; -}; - -class declarator; -class parameter: public node { -public: - parameter(node *spec, node *decl) - : spec_((declaration_specifier*)spec), - decl_((declarator*)decl) { } - - ir::type* type(ir::module *mod) const; - std::vector storage() const; - const identifier* id() const; - -public: - const declaration_specifier *spec_; - const declarator *decl_; -}; - -/* Declarators */ -class declarator: public node{ -protected: - typedef std::vector storage_spec_vec_t; - typedef const storage_spec_vec_t& storage_spec_vec_const_ref_t; - -public: - virtual ir::type* type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const = 0; - -public: - declarator(node *lhs) - : lhs_((declarator*)lhs), ptr_(nullptr){ } - - ir::type* type(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const; - - const identifier* id() const { - return (const identifier*)lhs_; - } - - declarator *set_ptr(node *ptr){ - ptr_ = (pointer*)ptr; - return this; - } - - void set_addr_space(unsigned addr_space){ - addr_space_ = addr_space; - } - -protected: - declarator *lhs_; - pointer *ptr_; - unsigned addr_space_; -}; - -class identifier: public declarator { - ir::type* type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const; - -public: - identifier(char *&name): declarator(this), name_(name) { } - const std::string &name() const; - -private: - std::string name_; -}; - -class pointer: public declarator{ -private: - ir::type* type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const; - -public: - pointer(node *id): declarator(id) { } -}; - -class tile: public declarator{ -private: - ir::type* type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const; - -public: - tile(node *id, node *shapes) - : declarator(id), shapes_((list*)(shapes)) { } - -public: - const list* shapes_; -}; - -class function: public declarator{ -private: - ir::type* type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const; - -public: - function(node *id, node *args) - : declarator(id), args_((list*)args) { } - - void bind_parameters(ir::module *mod, ir::function *fn) const; - unsigned get_num_args() const { return args_->values().size(); } - parameter* get_arg(unsigned i) const { return args_->values().at(i); } - -public: - const list* args_; -}; - - -class initializer : public declarator{ -private: - ir::type* type_impl(ir::module * mod, ir::type *type, storage_spec_vec_const_ref_t storage) const; - -public: - initializer(node *decl, node *init) - : declarator((node*)((declarator*)decl)->id()), - decl_((declarator*)decl), expr_((expression*)init){ } - - void set_specifier(const declaration_specifier *spec); - ir::value* codegen(ir::module *) const; - -public: - const declaration_specifier *spec_; - declarator *decl_; - const expression *expr_; -}; - - -class type_name: public node{ -public: - type_name(node *spec, node * decl) - : spec_((declaration_specifier*)spec), decl_((declarator*)decl) { } - - ir::type *type(ir::module *mod) const; - -public: 
- const declaration_specifier *spec_; - const declarator *decl_; -}; - -/* Function definition */ -class function_definition: public node{ -public: - function_definition(node *spec, node *header, node *body) - : spec_((declaration_specifier*)spec), header_((function *)header), body_((compound_statement*)body) { } - - ir::value* codegen(ir::module * mod) const; - -public: - const declaration_specifier *spec_; - const function *header_; - const compound_statement *body_; -}; - -/* Translation Unit */ -class translation_unit: public node{ -public: - translation_unit(node *item) - : decls_(item) { } - - translation_unit *add(node *item) { - decls_.append(item); - return this; - } - - ir::value* codegen(ir::module * mod) const; - -private: - list decls_; -}; - -void update_location(const char *t); -void print_error(const char *error); -char return_impl(char t, const char * yytext); -yytokentype return_impl(yytokentype t, const char * yytext); -void return_void(const char * yytext); - -} - -} +#include "declaration.h" +#include "error.h" +#include "expression.h" +#include "node.h" +#include "ops.h" #endif diff --git a/include/triton/ast/declaration.h b/include/triton/ast/declaration.h new file mode 100644 index 000000000..5a51c3f9a --- /dev/null +++ b/include/triton/ast/declaration.h @@ -0,0 +1,222 @@ +#ifndef TRITON_INCLUDE_AST_DECLARATION_H +#define TRITON_INCLUDE_AST_DECLARATION_H + +#include "node.h" +#include "parser.hpp" +#include +#include +#include +#include + + +namespace triton{ + + +namespace ir{ + class function; + class value; + class type; + class builder; + class module; +} + +namespace ast{ + +class expression; +class pointer; +class identifier; +class constant; +class compound_statement; +class initializer; +class declaration_specifier; + + +class declaration: public block_item{ +public: + declaration(node *spec, node *init) + : spec_((declaration_specifier*)spec), init_((list*)init) { } + + ir::value* codegen(ir::module * mod) const; + +public: + const declaration_specifier *spec_; + const list *init_; +}; + +// Types +class declaration_specifier: public node{ +public: + virtual ir::type* type(ir::module *mod) const = 0; + virtual std::vector storage() const = 0; +}; + +class typed_declaration_specifier: public declaration_specifier { +public: + typed_declaration_specifier(TYPE_T ty): ty_(ty){ } + ir::type* type(ir::module *mod) const; + std::vector storage() const; + +private: + const TYPE_T ty_; +}; + +class storage_declaration_specifier: public declaration_specifier { +public: + storage_declaration_specifier(STORAGE_SPEC_T storage_spec, node *decl_spec) + : storage_spec_(storage_spec), decl_spec_((declaration_specifier*)decl_spec) {} + ir::type* type(ir::module *mod) const; + std::vector storage() const; + +private: + const STORAGE_SPEC_T storage_spec_; + const declaration_specifier* decl_spec_; +}; + +class declarator; +class parameter: public node { +public: + parameter(node *spec, node *decl) + : spec_((declaration_specifier*)spec), + decl_((declarator*)decl) { } + + ir::type* type(ir::module *mod) const; + std::vector storage() const; + const identifier* id() const; + +public: + const declaration_specifier *spec_; + const declarator *decl_; +}; + +/* Declarators */ +class declarator: public node{ +protected: + typedef std::vector storage_spec_vec_t; + typedef const storage_spec_vec_t& storage_spec_vec_const_ref_t; + +public: + virtual ir::type* type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const = 0; + +public: + declarator(node *lhs) + 
: lhs_((declarator*)lhs), ptr_(nullptr){ } + + ir::type* type(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const; + + const identifier* id() const { + return (const identifier*)lhs_; + } + + declarator *set_ptr(node *ptr){ + ptr_ = (pointer*)ptr; + return this; + } + + void set_addr_space(unsigned addr_space){ + addr_space_ = addr_space; + } + +protected: + declarator *lhs_; + pointer *ptr_; + unsigned addr_space_; +}; + +class identifier: public declarator { + ir::type* type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const; + +public: + identifier(char *&name): declarator(this), name_(name) { } + const std::string &name() const; + +private: + std::string name_; +}; + +class pointer: public declarator{ +private: + ir::type* type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const; + +public: + pointer(node *id): declarator(id) { } +}; + +class tile: public declarator{ +private: + ir::type* type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const; + +public: + tile(node *id, node *shapes) + : declarator(id), shapes_((list*)(shapes)) { } + +public: + const list* shapes_; +}; + +class function: public declarator{ +private: + ir::type* type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const; + +public: + function(node *id, node *args) + : declarator(id), args_((list*)args) { } + + void bind_parameters(ir::module *mod, ir::function *fn) const; + unsigned get_num_args() const { return args_->values().size(); } + parameter* get_arg(unsigned i) const { return args_->values().at(i); } + +public: + const list* args_; +}; + + +class initializer : public declarator{ +private: + ir::type* type_impl(ir::module * mod, ir::type *type, storage_spec_vec_const_ref_t storage) const; + +public: + initializer(node *decl, node *init) + : declarator((node*)((declarator*)decl)->id()), + decl_((declarator*)decl), expr_((expression*)init){ } + + void set_specifier(const declaration_specifier *spec); + ir::value* codegen(ir::module *) const; + +public: + const declaration_specifier *spec_; + declarator *decl_; + const expression *expr_; +}; + + +class type_name: public node{ +public: + type_name(node *spec, node * decl) + : spec_((declaration_specifier*)spec), decl_((declarator*)decl) { } + + ir::type *type(ir::module *mod) const; + +public: + const declaration_specifier *spec_; + const declarator *decl_; +}; + +/* Function definition */ +class function_definition: public node{ +public: + function_definition(node *spec, node *header, node *body) + : spec_((declaration_specifier*)spec), header_((function *)header), body_((compound_statement*)body) { } + + ir::value* codegen(ir::module * mod) const; + +public: + const declaration_specifier *spec_; + const function *header_; + const compound_statement *body_; +}; + +} + +} + +#endif diff --git a/include/triton/ast/error.h b/include/triton/ast/error.h new file mode 100644 index 000000000..5834d55f6 --- /dev/null +++ b/include/triton/ast/error.h @@ -0,0 +1,62 @@ +#ifndef TRITON_INCLUDE_AST_ERROR_H +#define TRITON_INCLUDE_AST_ERROR_H + +#include "ops.h" +#include "parser.hpp" +#include "node.h" +#include +#include +#include +#include + + +namespace triton{ + + +namespace ir{ + class function; + class value; + class type; + class builder; + class module; +} + +namespace ast{ + +class expression; +class pointer; +class identifier; +class constant; +class compound_statement; +class initializer; +class declaration_specifier; 
+class function; + +/* Translation Unit */ +class translation_unit: public node{ +public: + translation_unit(node *item) + : decls_(item) { } + + translation_unit *add(node *item) { + decls_.append(item); + return this; + } + + ir::value* codegen(ir::module * mod) const; + +private: + list decls_; +}; + +void update_location(const char *t); +void print_error(const char *error); +char return_impl(char t, const char * yytext); +yytokentype return_impl(yytokentype t, const char * yytext); +void return_void(const char * yytext); + +} + +} + +#endif diff --git a/include/triton/ast/expression.h b/include/triton/ast/expression.h new file mode 100644 index 000000000..27d72dec8 --- /dev/null +++ b/include/triton/ast/expression.h @@ -0,0 +1,311 @@ +#ifndef TDL_INCLUDE_AST_EXPRESSION_H +#define TDL_INCLUDE_AST_EXPRESSION_H + +#include "parser.hpp" +#include "ast.h" +#include +#include +#include +#include + + +namespace triton{ + + +namespace ir{ + class function; + class value; + class type; + class builder; + class module; +} + +namespace ast{ + + +enum slice_enum_t{ + ALL, + NEWAXIS +}; + +class slice: public node{ +public: + slice(slice_enum_t type) + : type_(type){} + + slice_enum_t type() const{ + return type_; + } + +public: + const slice_enum_t type_; +}; + + +class named_expression; + +class expression: public node{ +public: + virtual ir::value* codegen(ir::module *) const = 0; + named_expression *lvalue() const { return lvalue_; } + +protected: + named_expression *lvalue_; +}; + +class postfix_expression: public expression{ + +}; + +class builtin_expression: public node{ + +}; + +class typed_declaration_specifier; +class alloc_const_expression: public builtin_expression{ +public: + alloc_const_expression(node *spec, node *size): spec_((typed_declaration_specifier*)spec), size_((constant*)size) { } + ir::value* codegen(ir::module *mod) const; + +private: + const typed_declaration_specifier* spec_; + const constant* size_; +}; + +class get_global_range_expression: public builtin_expression{ +public: + get_global_range_expression(node *size, node *axis): size_((constant*)size), axis_((constant*)axis) { } + ir::value* codegen(ir::module *) const; + +private: + const constant* size_; + const constant* axis_; +}; + +class get_range_id_expression: public builtin_expression{ +public: + get_range_id_expression(node *axis): axis_((constant*)axis) { } + ir::value* codegen(ir::module *) const; + +private: + const constant* axis_; +}; + +class atomic_cas_expression: public builtin_expression{ +public: + atomic_cas_expression(node *ptr, node *cmp, node *val): ptr_(ptr), cmp_(cmp), val_(val) { } + ir::value* codegen(ir::module *) const; + +private: + const node *ptr_; + const node *cmp_; + const node *val_; +}; + + +class matmul_expression: public builtin_expression{ +public: + matmul_expression(node* A, node *B, node *C): + A_((expression*)A), B_((expression*)B), C_((expression*)C) { } + ir::value* codegen(ir::module *) const; + +private: + const expression *A_; + const expression *B_; + const expression *C_; +}; + +class max_expression: public builtin_expression{ +public: + max_expression(node* x, node* y) + : x_((expression*)x), y_((expression*)y){ } + ir::value* codegen(ir::module *) const; + +private: + const expression *x_; + const expression *y_; +}; + +class min_expression: public builtin_expression{ +public: + min_expression(node* x, node* y) + : x_((expression*)x), y_((expression*)y){ } + ir::value* codegen(ir::module *mod) const; + +private: + const expression *x_; + const expression *y_; +}; + 
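
[annotation] These expression classes are what the parser's semantic actions instantiate; each node holds raw pointers to its children and lowers itself later through `codegen(ir::module*)`. A small hand-built example of the tree the grammar would produce for `min(x, y[:, newaxis])` — a sketch only: in practice parser.y constructs these from yytext, `strdup` stands in for the lexer buffer, and these headers assume a generated parser.hpp is on the include path:

    #include <cstring>
    #include "triton/ast/expression.h"
    using namespace triton::ast;

    node *build_min_example() {
      char *x_name = strdup("x"), *y_name = strdup("y");
      // y[:, newaxis] -- an indexing_expression over a slice list
      list<slice*> *slices = new list<slice*>(new slice(ALL));
      slices->append(new slice(NEWAXIS));
      node *y_idx = new indexing_expression(new identifier(y_name), slices);
      // min(x, y[:, newaxis])
      node *x_expr = new named_expression(new identifier(x_name));
      return new min_expression(x_expr, y_idx);
      // lowering happens later: expr->codegen(&mod) emits Triton IR
    }
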
+class select_expression: public builtin_expression{ +public: + select_expression(node* pred, node* if_value, node* else_value) + : pred_((expression*)pred), if_value_((expression*)if_value), else_value_((expression*)else_value) { } + ir::value* codegen(ir::module *mod) const; + +private: + const expression *pred_; + const expression *if_value_; + const expression *else_value_; +}; + +class trans_expression: public builtin_expression{ +public: + trans_expression(node *arg): arg_(arg) {} + ir::value* codegen(ir::module *mod) const; + +private: + node* arg_; +}; + + +class indexing_expression: public postfix_expression{ +public: + indexing_expression(node *id, node *slices) + : id_((const identifier*)id), slices_((const list*)slices) {} + + ir::value* codegen(ir::module *) const; + +private: + const identifier* id_; + const list* slices_; +}; + + + +class named_expression: public expression { +public: + named_expression(node *id): id_((const identifier*)id) { lvalue_ = this; } + const identifier *id() const { return id_; } + ir::value* codegen(ir::module * mod) const; + +private: + const identifier *id_; +}; + +class binary_expression: public expression{ +private: + ir::value* llvm_op(ir::module *mod, ir::builder &bld, ir::value *lhs, ir::value *rhs, const std::string &name) const; + +public: + binary_expression(BIN_OP_T op, node *lhs, node *rhs) + : op_(op), lhs_((expression*)lhs), rhs_((expression*)rhs) { + } + ir::value* codegen(ir::module *) const; + +private: + const BIN_OP_T op_; + const expression *lhs_; + const expression *rhs_; +}; + + +class constant: public expression{ +public: + constant(int value): value_(value) { } + ir::value* codegen(ir::module *mod) const; + int value() const; + +private: + const int value_; +}; + +class constant_range: public expression { +public: + constant_range(node *first, node *last) + : first_((constant*)first), last_((constant*)last) { } + + ir::value* codegen(ir::module *mod) const; + +private: + constant *first_; + constant *last_; +}; + +class string_literal: public expression{ +public: + string_literal(char *&value): value_(value) { } + ir::value* codegen(ir::module *mod) const; + +public: + std::string value_; +}; + +class unary_expression: public expression{ +private: + ir::value *llvm_op(ir::builder &builder, ir::value *arg, const std::string &name) const; + +public: + unary_expression(UNARY_OP_T op, node *arg) + : op_(op), + arg_((expression*)arg) { + if(op == DEREF) + this->lvalue_ = arg_->lvalue(); + } + + UNARY_OP_T get_op() const { return op_; } + ir::value* codegen(ir::module *mod) const; + +private: + const UNARY_OP_T op_; + const expression *arg_; +}; + +class type_name; +class cast_expression: public expression{ +private: + ir::value *llvm_op(ir::builder &builder, ir::type *T, ir::value *arg, const std::string &name) const; + +public: + cast_expression(node *T, node *arg): + T_((type_name*)T), + arg_((expression*)arg) { } + + ir::value* codegen(ir::module *mod) const; + +public: + const type_name *T_; + const expression *arg_; +}; + +class conditional_expression: public expression{ +private: + ir::value *llvm_op(ir::builder &builder, + ir::value *cond, ir::value *true_value, ir::value *false_value, + const std::string &name) const; + +public: + conditional_expression(node *cond, node *true_value, node *false_value) + : cond_((expression*)cond), + true_value_((expression*)true_value), + false_value_((expression*)false_value) { } + + ir::value* codegen(ir::module *mod) const; + +public: + const expression *cond_; + const expression 
*true_value_; + const expression *false_value_; +}; + +class assignment_expression: public expression{ +public: + assignment_expression(node *lvalue, ASSIGN_OP_T op, node *rvalue) + : lvalue_((named_expression*)lvalue), op_(op), rvalue_((expression*)rvalue) { } + + ir::value* codegen(ir::module *mod) const; + const expression *lvalue() const { return lvalue_; } + const expression *rvalue() const { return rvalue_; } + +public: + ASSIGN_OP_T op_; + const expression *lvalue_; + const expression *rvalue_; +}; + + +} + +} + +#endif diff --git a/include/triton/ast/module.h b/include/triton/ast/module.h new file mode 100644 index 000000000..6d72753ce --- /dev/null +++ b/include/triton/ast/module.h @@ -0,0 +1,37 @@ +#ifndef TRITON_INCLUDE_AST_MODULE_H +#define TRITON_INCLUDE_AST_MODULE_H + +#include "ops.h" +#include "parser.hpp" +#include "node.h" +#include +#include +#include +#include + + +namespace triton{ +namespace ast{ + +/* Translation Unit */ +class translation_unit: public node{ +public: + translation_unit(node *item) + : decls_(item) { } + + translation_unit *add(node *item) { + decls_.append(item); + return this; + } + + ir::value* codegen(ir::module * mod) const; + +private: + list decls_; +}; + +} + +} + +#endif diff --git a/include/triton/ast/node.h b/include/triton/ast/node.h new file mode 100644 index 000000000..265443397 --- /dev/null +++ b/include/triton/ast/node.h @@ -0,0 +1,77 @@ +#ifndef TRITON_INCLUDE_AST_NODE_H +#define TRITON_INCLUDE_AST_NODE_H + +#include "ops.h" +#include "parser.hpp" +#include +#include +#include +#include + + +namespace triton{ + + +namespace ir{ + class function; + class value; + class type; + class builder; + class module; +} + +namespace ast{ + +class expression; +class pointer; +class identifier; +class constant; +class compound_statement; +class initializer; +class declaration_specifier; +class function; + +// Node +class node { +protected: + static ir::value* explicit_cast(ir::builder &builder, ir::value *src, ir::type *dst_ty); + static void implicit_broadcast(ir::module *mod, ir::type *dst_ty, ir::value *&src); + static void implicit_broadcast(ir::module *mod, ir::value *&lhs, ir::value *&rhs); + static void implicit_cast(ir::builder &builder, ir::value *&lhs, ir::value *&rhs, + bool &is_float, bool &is_ptr, bool &is_int, bool &is_signed); +public: + virtual ir::value* codegen(ir::module *) const { return nullptr; } +}; + +class block_item: public node{ +}; + +template +class list: public node { +public: + list(const T& x): values_(1, x) {} + + node* append(const T& x){ + values_.push_back(x); + return this; + } + + ir::value* codegen(ir::module * mod) const{ + for(T x: values_){ + x->codegen(mod); + } + return nullptr; + } + + const std::vector &values() const + { return values_; } + +private: + std::vector values_; +}; + +} + +} + +#endif diff --git a/include/triton/ast/ops.h b/include/triton/ast/ops.h new file mode 100644 index 000000000..316fdccb3 --- /dev/null +++ b/include/triton/ast/ops.h @@ -0,0 +1,60 @@ +#ifndef TRITON_INCLUDE_AST_OPS_H +#define TRITON_INCLUDE_AST_OPS_H + +#include "parser.hpp" +#include +#include +#include +#include + +namespace triton{ +namespace ast{ + +enum ASSIGN_OP_T{ + ASSIGN, + INPLACE_MUL, INPLACE_DIV, INPLACE_MOD, + INPLACE_ADD, INPLACE_SUB, + INPLACE_LSHIFT, INPLACE_RSHIFT, + INPLACE_AND, INPLACE_XOR, + INPLACE_OR +}; + +enum BIN_OP_T{ + MUL, DIV, MOD, + ADD, SUB, + LEFT_SHIFT, RIGHT_SHIFT, + LT, GT, + LE, GE, + EQ, NE, + AND, XOR, OR, + LAND, LOR +}; + +enum UNARY_OP_T{ + INC, DEC, + PLUS, MINUS, + ADDR, 
DEREF, + COMPL, NOT +}; + +enum TYPE_T{ + VOID_T, + UINT1_T, UINT8_T, UINT16_T, UINT32_T, UINT64_T, + INT1_T, INT8_T, INT16_T, INT32_T, INT64_T, + FLOAT32_T, FLOAT64_T +}; + +enum STORAGE_SPEC_T{ + CONST_T, + TUNABLE_T, + KERNEL_T, + RESTRICT_T, + READONLY_T, + CONSTANT_SPACE_T, + WRITEONLY_T +}; + +} +} + +#endif diff --git a/include/triton/ast/parser.y b/include/triton/ast/parser.y index 9dab092de..c71f8a20e 100644 --- a/include/triton/ast/parser.y +++ b/include/triton/ast/parser.y @@ -9,6 +9,9 @@ class node; using namespace triton::ast; #define YYSTYPE node* #include "../include/triton/ast/ast.h" +#include "../include/triton/ast/expression.h" +#include "../include/triton/ast/statement.h" +#include "../include/triton/ast/declaration.h" extern char* yytext; void yyerror(const char *s); @@ -86,82 +89,80 @@ pointer | '*' pointer { $$ = new pointer($1); } abstract_declarator - : pointer { $$ = $1; } + : pointer { $$ = $1; } | pointer direct_abstract_declarator { $$ = ((declarator*)$2)->set_ptr($1); } - | direct_abstract_declarator { $$ = $1; } - ; + | direct_abstract_declarator { $$ = $1; } + ; direct_abstract_declarator - : '[' primary_expression_list ']' { $$ = new tile(nullptr, $1); } - -constant: - CONSTANT { $$ = new constant(atoi(yytext)); } - ; - -constant_list: - constant { $$ = new list((constant*)$1); } - | constant_list ',' constant { $$ = append_ptr_list($1, $3); } - ; + : '[' primary_expression_list ']' { $$ = new tile(nullptr, $1); } type_name : declaration_specifiers { $$ = new type_name($1, nullptr); } | declaration_specifiers abstract_declarator { $$ = new type_name($1, $2); } - ; + ; /* -------------------------- */ /* Expressions */ /* -------------------------- */ -identifier - : IDENTIFIER { $$ = new identifier(yytext); } - ; - -builtin - : GET_GLOBAL_RANGE '[' primary_expression ']' '(' constant ')' { $$ = new get_global_range($3, $6); } - | GET_RANGE_ID '(' constant ')' { $$ = new get_range_id($3); } - | DOT '(' expression ',' expression ',' expression ')' { $$ = new matmul_expression($3, $5, $7); } - | ALLOC_CONST type_specifier '[' constant ']' { $$ = new alloc_const(new typed_declaration_specifier(get_type_spec($2)), $4); } - | TRANS '(' expression ')' { $$ = new trans_expression($3); } - | MAX '(' expression ',' expression ')' { $$ = new max_expression($3, $5); } - | MIN '(' expression ',' expression ')' { $$ = new min_expression($3, $5); } - | SELECT '(' expression ',' expression ',' expression ')' { $$ = new select_expression($3, $5, $7); } - | ATOMIC_CAS '(' expression ',' expression ',' expression ')' { $$ = new atomic_cas($3, $5, $7); } +/* Constants */ +constant + : CONSTANT { $$ = new constant(atoi(yytext)); } ; +constant_list + : constant { $$ = new list((constant*)$1); } + | constant_list ',' constant { $$ = append_ptr_list($1, $3); } + ; + +identifier + : IDENTIFIER { $$ = new identifier(yytext); } + ; + +/* Built-in */ +builtin_expression + : GET_GLOBAL_RANGE '[' primary_expression ']' '(' constant ')' { $$ = new get_global_range_expression($3, $6); } + | GET_RANGE_ID '(' constant ')' { $$ = new get_range_id_expression($3); } + | DOT '(' expression ',' expression ',' expression ')' { $$ = new matmul_expression($3, $5, $7); } + | ALLOC_CONST type_specifier '[' constant ']' { $$ = new alloc_const_expression(new typed_declaration_specifier(get_type_spec($2)), $4); } + | TRANS '(' expression ')' { $$ = new trans_expression($3); } + | MAX '(' expression ',' expression ')' { $$ = new max_expression($3, $5); } + | MIN '(' expression ',' expression ')' { $$ = 
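/* Editor's note: each built-in keyword maps one-to-one onto a dedicated
   AST node. An illustrative Triton-C fragment (hypothetical, not taken
   from this patch):

     c = dot(a, b, c);       // -> matmul_expression($3, $5, $7)
     m = select(p, x, y);    // -> select_expression($3, $5, $7)

   so each builtin's codegen stays a thin wrapper over a single
   ir::builder call (create_dot, create_select, ...), as implemented in
   lib/ast/expression.cpp later in this patch. */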
new min_expression($3, $5); } + | SELECT '(' expression ',' expression ',' expression ')' { $$ = new select_expression($3, $5, $7); } + | ATOMIC_CAS '(' expression ',' expression ',' expression ')' { $$ = new atomic_cas_expression($3, $5, $7); } + ; + +/* Primary */ primary_expression - : identifier { $$ = new named_expression($1); } - | constant { $$ = $1; } + : identifier { $$ = new named_expression($1); } + | constant { $$ = $1; } | primary_expression ELLIPSIS primary_expression { $$ = new constant_range($1, $3); } - | builtin { $$ = $1; } - | STRING_LITERAL { $$ = new string_literal(yytext); } - | '(' expression ')' { $$ = $2; } - ; + | builtin_expression { $$ = $1; } + | STRING_LITERAL { $$ = new string_literal(yytext); } + | '(' expression ')' { $$ = $2; } + ; primary_expression_list - : primary_expression { $$ = new list((expression*)$1); } + : primary_expression { $$ = new list((expression*)$1); } | primary_expression_list ',' primary_expression { $$ = append_ptr_list($1, $3); } ; +/* Postfix */ slice - : ':' { $$ = new slice(triton::ast::ALL); } - | NEWAXIS { $$ = new slice(triton::ast::NEWAXIS); } + : ':' { $$ = new slice(triton::ast::ALL); } + | NEWAXIS { $$ = new slice(triton::ast::NEWAXIS); } slice_list - : slice { $$ = new list((slice*)$1); } - | slice_list ',' slice { $$ = append_ptr_list($1, $3); } + : slice { $$ = new list((slice*)$1); } + | slice_list ',' slice { $$ = append_ptr_list($1, $3); } postfix_expression - : primary_expression { $$ = $1;} - | identifier '[' slice_list ']' { $$ = new indexing_expression($1, $3);} + : primary_expression { $$ = $1;} + | identifier '[' slice_list ']' { $$ = new indexing_expression($1, $3);} ; -unary_expression - : postfix_expression { $$ = $1; } - | INC_OP unary_expression { $$ = new unary_operator(INC, $2); } - | DEC_OP unary_expression { $$ = new unary_operator(DEC, $2); } - | unary_operator cast_expression { $$ = new unary_operator(get_unary_op($1), $2); } - ; - +/* Unary */ unary_operator : '&' { $$ = new token(ADDR); } | '*' { $$ = new token(DEREF); } @@ -169,79 +170,86 @@ unary_operator | '-' { $$ = new token(MINUS); } | '~' { $$ = new token(COMPL); } | '!' 
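/* Editor's note on indexing: `identifier '[' slice_list ']'` only accepts
   ':' and `newaxis` slices, so indexing is purely a reshape. A hedged
   example with a hypothetical source line:

     fp32 v[16] = ...;
     w = v[newaxis, :];      // result shape {1, 16}

   indexing_expression::codegen walks the slices, keeps the input extent
   for each ':' and inserts a constant 1 for each `newaxis`, then emits a
   single create_reshape. */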
{ $$ = new token(NOT); } - ; + ; + +unary_expression + : postfix_expression { $$ = $1; } + | INC_OP unary_expression { $$ = new unary_expression(INC, $2); } + | DEC_OP unary_expression { $$ = new unary_expression(DEC, $2); } + | unary_operator cast_expression { $$ = new unary_expression(get_unary_op($1), $2); } + ; cast_expression - : unary_expression { $$ = $1; } - | '(' type_name ')' cast_expression { $$ = new cast_operator($2, $4); } - ; + : unary_expression { $$ = $1; } + | '(' type_name ')' cast_expression { $$ = new cast_expression($2, $4); } + ; multiplicative_expression - : cast_expression { $$ = $1; } - | multiplicative_expression '*' cast_expression { $$ = new binary_operator(MUL, $1, $3); } - | multiplicative_expression '/' cast_expression { $$ = new binary_operator(DIV, $1, $3); } - | multiplicative_expression '%' cast_expression { $$ = new binary_operator(MOD, $1, $3); } - ; + : cast_expression { $$ = $1; } + | multiplicative_expression '*' cast_expression { $$ = new binary_expression(MUL, $1, $3); } + | multiplicative_expression '/' cast_expression { $$ = new binary_expression(DIV, $1, $3); } + | multiplicative_expression '%' cast_expression { $$ = new binary_expression(MOD, $1, $3); } + ; additive_expression - : multiplicative_expression { $$ = $1; } - | additive_expression '+' multiplicative_expression { $$ = new binary_operator(ADD, $1, $3); } - | additive_expression '-' multiplicative_expression { $$ = new binary_operator(SUB, $1, $3); } - ; + : multiplicative_expression { $$ = $1; } + | additive_expression '+' multiplicative_expression { $$ = new binary_expression(ADD, $1, $3); } + | additive_expression '-' multiplicative_expression { $$ = new binary_expression(SUB, $1, $3); } + ; shift_expression - : additive_expression { $$ = $1; } - | shift_expression LEFT_OP additive_expression { $$ = new binary_operator(LEFT_SHIFT, $1, $3); } - | shift_expression RIGHT_OP additive_expression { $$ = new binary_operator(RIGHT_SHIFT, $1, $3); } - ; + : additive_expression { $$ = $1; } + | shift_expression LEFT_OP additive_expression { $$ = new binary_expression(LEFT_SHIFT, $1, $3); } + | shift_expression RIGHT_OP additive_expression { $$ = new binary_expression(RIGHT_SHIFT, $1, $3); } + ; /* Comparison */ relational_expression - : shift_expression { $$ = $1; } - | relational_expression '<' shift_expression { $$ = new binary_operator(LT, $1, $3); } - | relational_expression '>' shift_expression { $$ = new binary_operator(GT, $1, $3); } - | relational_expression LE_OP shift_expression { $$ = new binary_operator(LE, $1, $3); } - | relational_expression GE_OP shift_expression { $$ = new binary_operator(GE, $1, $3); } - ; + : shift_expression { $$ = $1; } + | relational_expression '<' shift_expression { $$ = new binary_expression(LT, $1, $3); } + | relational_expression '>' shift_expression { $$ = new binary_expression(GT, $1, $3); } + | relational_expression LE_OP shift_expression { $$ = new binary_expression(LE, $1, $3); } + | relational_expression GE_OP shift_expression { $$ = new binary_expression(GE, $1, $3); } + ; equality_expression - : relational_expression { $$ = $1; } - | equality_expression EQ_OP relational_expression { $$ = new binary_operator(EQ, $1, $3); } - | equality_expression NE_OP relational_expression { $$ = new binary_operator(NE, $1, $3); } - ; + : relational_expression { $$ = $1; } + | equality_expression EQ_OP relational_expression { $$ = new binary_expression(EQ, $1, $3); } + | equality_expression NE_OP relational_expression { $$ = new binary_expression(NE, $1, $3); 
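/* Editor's note: operator precedence is encoded structurally, one rule
   per level (cast < multiplicative < additive < shift < relational <
   equality < bitwise < logical), so no %prec declarations are needed.
   For example `a + b * c` can only parse as

     new binary_expression(ADD, a, new binary_expression(MUL, b, c))

   which is the classic grammar-stratification idiom inherited from the
   C standard's expression grammar. */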
} + ; /* Binary */ and_expression - : equality_expression { $$ = $1; } - | and_expression '&' equality_expression { $$ = new binary_operator(AND, $1, $3); } - ; + : equality_expression { $$ = $1; } + | and_expression '&' equality_expression { $$ = new binary_expression(AND, $1, $3); } + ; exclusive_or_expression - : and_expression { $$ = $1; } - | exclusive_or_expression '^' and_expression { $$ = new binary_operator(XOR, $1, $3); } - ; + : and_expression { $$ = $1; } + | exclusive_or_expression '^' and_expression { $$ = new binary_expression(XOR, $1, $3); } + ; inclusive_or_expression - : exclusive_or_expression { $$ = $1; } - | inclusive_or_expression '|' exclusive_or_expression { $$ = new binary_operator(OR, $1, $3); } - ; + : exclusive_or_expression { $$ = $1; } + | inclusive_or_expression '|' exclusive_or_expression { $$ = new binary_expression(OR, $1, $3); } + ; /* Logical */ logical_and_expression - : inclusive_or_expression { $$ = $1; } - | logical_and_expression AND_OP inclusive_or_expression { $$ = new binary_operator(LAND, $1, $3); } - ; + : inclusive_or_expression { $$ = $1; } + | logical_and_expression AND_OP inclusive_or_expression { $$ = new binary_expression(LAND, $1, $3); } + ; logical_or_expression - : logical_and_expression { $$ = $1; } - | logical_or_expression OR_OP logical_and_expression { $$ = new binary_operator(LOR, $1, $3); } - ; + : logical_and_expression { $$ = $1; } + | logical_or_expression OR_OP logical_and_expression { $$ = new binary_expression(LOR, $1, $3); } + ; /* Conditional */ conditional_expression - : logical_or_expression { $$ = $1; } + : logical_or_expression { $$ = $1; } | logical_or_expression '?' conditional_expression ':' conditional_expression { $$ = new conditional_expression($1, $3, $5); } - ; + ; /* Assignment */ assignment_operator @@ -259,14 +267,14 @@ assignment_operator ; assignment_expression - : conditional_expression { $$ = $1; } + : conditional_expression { $$ = $1; } | unary_expression assignment_operator assignment_expression { $$ = new assignment_expression($1, get_assign_op($2), $3); } - ; + ; /* Expression */ expression - : assignment_expression { $$ = $1; } - ; + : assignment_expression { $$ = $1; } + ; /* Initialization */ initialization_expression @@ -280,16 +288,16 @@ initialization_expression /* -------------------------- */ statement - : compound_statement { $$ = $1; } - | expression_statement { $$ = $1; } - | selection_statement { $$ = $1; } - | iteration_statement { $$ = $1; } - | jump_statement { $$ = $1; } - ; + : compound_statement { $$ = $1; } + | expression_statement { $$ = $1; } + | selection_statement { $$ = $1; } + | iteration_statement { $$ = $1; } + | jump_statement { $$ = $1; } + ; compound_statement - : '{' '}' { $$ = new compound_statement(nullptr); } - | '{' block_item_list '}' { $$ = new compound_statement($2); } + : '{' '}' { $$ = new compound_statement(nullptr); } + | '{' block_item_list '}' { $$ = new compound_statement($2); } block_item_list : block_item { $$ = new list((block_item*)$1); } @@ -300,7 +308,7 @@ block_item | statement { $$ = $1; } expression_statement - : ';' { $$ = new no_op(); } + : ';' { $$ = new no_op(); } | expression ';' { $$ = new expression_statement($1); } | AT primary_expression expression ';' { $$ = new expression_statement($3, $2); } ; @@ -334,7 +342,7 @@ direct_declarator parameter_list - : parameter_declaration { $$ = new list((parameter*)$1); } + : parameter_declaration { $$ = new list((parameter*)$1); } | parameter_list ',' parameter_declaration { $$ = append_ptr_list($1, 
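/* Editor's note on the `@` statement form accepted above
   (`AT primary_expression expression ';'`). A hedged example with a
   hypothetical source line:

     @mask c = a + b;

   builds expression_statement(assignment, mask). Its codegen (shown in
   the masked path of the old lowering.cpp removed below) wraps the
   predicate in a mask instruction, tags every instruction emitted for
   the assignment with the true-branch predicate, pairs the result with
   undef on the false side via a psi merge, and rebinds the assigned name
   to that psi so unmasked lanes stay well-defined. */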
$3); } ; @@ -355,20 +363,19 @@ init_declarator_list ; declaration - : declaration_specifiers ';' { $$ = new declaration($1, nullptr); } - | declaration_specifiers init_declarator_list ';' { $$ = new declaration($1, $2); } - ; + : declaration_specifiers ';' { $$ = new declaration($1, nullptr); } + | declaration_specifiers init_declarator_list ';' { $$ = new declaration($1, $2); } + ; declarator : pointer direct_declarator { $$ = ((declarator*)$2)->set_ptr($1); } - | direct_declarator { $$ = $1; } - ; - + | direct_declarator { $$ = $1; } + ; init_declarator : declarator { $$ = new initializer($1, nullptr); } | declarator '=' initialization_expression { $$ = new initializer($1, $3); } - ; + ; storage_class_specifier : CONST { $$ = new token(CONST_T); } @@ -381,13 +388,13 @@ storage_class_specifier ; /* -------------------------- */ -/* Translation Unit */ +/* Translation Unit */ /* -------------------------- */ translation_unit : external_declaration { ast_root = new translation_unit($1); $$ = ast_root; } - | translation_unit external_declaration { $$ = ((translation_unit*)($1))->add($2); } - ; + | translation_unit external_declaration { $$ = ((translation_unit*)($1))->add($2); } + ; external_declaration : function_definition { $$ = $1; } @@ -396,7 +403,7 @@ external_declaration function_definition : declaration_specifiers declarator compound_statement { $$ = new function_definition($1, $2, $3); } - ; + ; %% void yyerror (const char *s){ diff --git a/include/triton/ast/statement.h b/include/triton/ast/statement.h new file mode 100644 index 000000000..575d70690 --- /dev/null +++ b/include/triton/ast/statement.h @@ -0,0 +1,121 @@ +#ifndef TRITON_INCLUDE_AST_STATEMENT_H +#define TRITON_INCLUDE_AST_STATEMENT_H + +#include "parser.hpp" +#include "triton/ast/ast.h" +#include +#include +#include +#include + + +namespace triton{ + + +namespace ir{ + class function; + class value; + class type; + class builder; + class module; +} + +namespace ast{ + +class declaration; + +class statement: public block_item{ +}; + +// Expression +class expression_statement: public statement{ +public: + expression_statement(node *expr, node *mask = nullptr) + : expr_((expression*)expr), pred_((expression*)mask){ } + + ir::value* codegen(ir::module * mod) const; + +private: + expression *expr_; + expression *pred_; +}; + +// Compound +class compound_statement: public statement{ + typedef list* declarations_t; + typedef list* statements_t; + +public: + compound_statement(node* items) + : items_((list*)items){} + + ir::value* codegen(ir::module * mod) const; + +private: + list* items_; +}; + +// Selection +class selection_statement: public statement{ +public: + selection_statement(node *cond, node *if_value, node *else_value = nullptr) + : cond_(cond), then_value_(if_value), else_value_(else_value) { } + + ir::value* codegen(ir::module *mod) const; + +public: + const node *cond_; + const node *then_value_; + const node *else_value_; +}; + +// Iteration +class iteration_statement: public statement{ +public: + iteration_statement(node *init, node *stop, node *exec, node *statements) + : init_(init), stop_(stop), exec_(exec), statements_(statements) + { } + + ir::value* codegen(ir::module *mod) const; + +private: + const node *init_; + const node *stop_; + const node *exec_; + const node *statements_; +}; + +// While +class while_statement: public statement{ +public: + while_statement(node *cond, node *statements) + : cond_(cond), statements_(statements) + { } + + ir::value* codegen(ir::module *) const; + +private: + const 
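/* Editor's note: every structured statement lowers to explicit basic
   blocks. The general shape, sketched for `while` with the block names
   used in this patch:

     entry:    cond_br(cond, loop, postloop)
     loop:     body...; continue_fn() re-evaluates cond, cond_br again
     postloop: ...

   module::seal_block is called once all predecessors of a block are
   known, which drives the incremental SSA construction; the `continue`
   statement simply re-invokes the registered continue_fn. */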
node *cond_; + const node *statements_; +}; + +// Jump +class jump_statement: public statement{ +public: + using statement::statement; +}; + +// Continue +class continue_statement: public jump_statement{ +public: + ir::value* codegen(ir::module *mod) const; +}; + +// No op +class no_op: public statement { }; + +} + +} + +#endif diff --git a/lib/ast/declaration.cpp b/lib/ast/declaration.cpp new file mode 100644 index 000000000..888cdf7ff --- /dev/null +++ b/lib/ast/declaration.cpp @@ -0,0 +1,199 @@ +#include "triton/ast/statement.h" +#include "triton/ast/declaration.h" +#include "triton/ir/function.h" +#include "triton/ir/module.h" +#include "triton/ir/basic_block.h" +#include "triton/ir/builder.h" +#include "triton/ir/type.h" + + +namespace triton{ + +namespace ast{ + +/* Declaration specifier */ +ir::type* typed_declaration_specifier::type(ir::module *mod) const { + ir::context &ctx = mod->get_context(); + switch (ty_) { + case VOID_T: return ir::type::get_void_ty(ctx); + case INT1_T: return ir::type::get_int1_ty(ctx); + case INT8_T: return ir::type::get_int8_ty(ctx); + case INT16_T: return ir::type::get_int16_ty(ctx); + case INT32_T: return ir::type::get_int32_ty(ctx); + case INT64_T: return ir::type::get_int64_ty(ctx); + case FLOAT32_T: return ir::type::get_float_ty(ctx); + case FLOAT64_T: return ir::type::get_double_ty(ctx); + default: throw std::runtime_error("unreachable"); + } +} + +std::vector typed_declaration_specifier::storage() const { + return {}; +} + + +ir::type* storage_declaration_specifier::type(ir::module *mod) const { + return decl_spec_->type(mod); +} + +std::vector storage_declaration_specifier::storage() const { + auto result = decl_spec_->storage(); + result.push_back(storage_spec_); + return result; +} + + +/* Parameter */ +ir::type* parameter::type(ir::module *mod) const { + return decl_->type(mod, spec_->type(mod), {}); +} + +std::vector parameter::storage() const { + return spec_->storage(); +} + +const identifier *parameter::id() const { + return decl_->id(); +} + +/* Declarators */ +ir::type* declarator::type(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const{ + if(ptr_) + return type_impl(mod, ptr_->type(mod, type, storage), storage); + return type_impl(mod, type, storage); +} + +// Identifier +ir::type* identifier::type_impl(ir::module *, ir::type *type, storage_spec_vec_const_ref_t) const{ + return type; +} + +const std::string &identifier::name() const{ + return name_; +} + +// Tile +ir::type* tile::type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t) const{ + ir::type::tile_shapes_t shapes; + for(expression *expr: shapes_->values()){ + ir::constant_int *shape = dynamic_cast(expr->codegen(mod)); + assert(shape); + shapes.push_back(shape); + } + return ir::tile_type::get(type, shapes); +} + + +// Pointer +ir::type* pointer::type_impl(ir::module*, ir::type *type, storage_spec_vec_const_ref_t storage) const{ + bool is_ptr_to_const = std::find(storage.begin(), storage.end(), CONSTANT_SPACE_T) != storage.end(); + return ir::pointer_type::get(type, is_ptr_to_const?4:1); +} + +// Function +void function::bind_parameters(ir::module *mod, ir::function *fn) const{ + std::vector args = fn->args(); + assert(args.size() == args_->values().size()); + for(size_t i = 0; i < args.size(); i++){ + parameter *param_i = args_->values().at(i); + const identifier *id_i = param_i->id(); + if(id_i){ + args[i]->set_name(id_i->name()); + mod->set_value(id_i->name(), nullptr, args[i]); + mod->get_scope().types[id_i->name()] = 
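/* Editor's note: declarators compose outside-in, so a declaration such
   as (hypothetical, with TM and TN previously declared tunable)

     fp32 A[TM, TN];

   reaches tile::type_impl with the element type already resolved. Each
   shape expression must codegen to an ir::constant_int (hence the assert
   above), which is precisely what lets tunable metaparameters appear
   directly in tile shapes. */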
args[i]->get_type(); + } + } +} + +ir::type* function::type_impl(ir::module* mod, ir::type *type, storage_spec_vec_const_ref_t) const{ + std::vector types; + for(parameter* param: args_->values()) + types.push_back(param->type(mod)); + return ir::function_type::get(type, types); +} + + +/* Declaration */ +ir::value* declaration::codegen(ir::module* mod) const{ + for(initializer *init: init_->values()) + init->set_specifier(spec_); + init_->codegen(mod); + return nullptr; +} + +/* Initializer */ +ir::type* initializer::type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const{ + return decl_->type(mod, type, storage); +} + +void initializer::set_specifier(const declaration_specifier *spec) { + spec_ = spec; +} + +ir::value* initializer::codegen(ir::module * mod) const{ + std::vector storage = spec_->storage(); + ir::type *ty = decl_->type(mod, spec_->type(mod), storage); + std::string name = decl_->id()->name(); + ir::value *value = ir::undef_value::get(ty); + if(std::find(storage.begin(), storage.end(), TUNABLE_T) != storage.end()){ + auto csts = dynamic_cast*>((node*)expr_); + if(csts == nullptr) + throw std::runtime_error("must specify constant list for metaparameters"); + std::vector values; + for(constant* cst: csts->values()) + values.push_back(cst->value()); + value = ir::metaparameter::create(mod->get_context(), ty, values); + mod->register_global(name, value); + } + else if(expr_){ + value = expr_->codegen(mod); + value = explicit_cast(mod->get_builder(), value, ty); + implicit_broadcast(mod, ty, value); + } + value->set_name(name); + mod->set_value(name, value); + mod->get_scope().types[name] = ty; + if(auto *x = dynamic_cast(value)) + mod->add_alloc(x); + if(std::find(storage.begin(), storage.end(), CONST_T) != storage.end()) + mod->set_const(name); + return value; +} + +/* Type name */ +ir::type *type_name::type(ir::module *mod) const{ + return decl_->type(mod, spec_->type(mod), {}); +} + +/* Function definition */ +ir::attribute_t get_ir_attr(STORAGE_SPEC_T spec){ + switch(spec){ + case RESTRICT_T: return ir::noalias; + case READONLY_T: return ir::readonly; + case WRITEONLY_T: return ir::writeonly; + default: throw std::runtime_error("cannot convert storage specifier to IR function attribute"); + } +} + +ir::value* function_definition::codegen(ir::module *mod) const{ + ir::function_type *prototype = (ir::function_type*)header_->type(mod, spec_->type(mod), spec_->storage()); + const std::string &name = header_->id()->name(); + ir::function *fn = mod->get_or_insert_function(name, prototype); + for(unsigned i = 0; i < header_->get_num_args(); i++){ + parameter *param = header_->get_arg(i); + std::vector storage = param->storage(); + for(STORAGE_SPEC_T spec: storage) + fn->add_attr(1 + i, get_ir_attr(spec)); + } + header_->bind_parameters(mod, fn); + ir::basic_block *entry = ir::basic_block::create(mod->get_context(), "entry", fn); + mod->seal_block(entry); + mod->get_builder().set_insert_point(entry); + body_->codegen(mod); + mod->get_builder().create_ret_void(); + return nullptr; +} + +} + +} diff --git a/lib/ast/error.cpp b/lib/ast/error.cpp new file mode 100644 index 000000000..72c18277d --- /dev/null +++ b/lib/ast/error.cpp @@ -0,0 +1,49 @@ +#include "triton/ast/error.h" + + +namespace triton{ + +namespace ast{ + +static int current_line = 0; +static int current_column = 0; + +// begin token +void update_location(const char *text) { + for (int i = 0; text[i] != '\0'; i++){ + if (text[i] == '\n'){ + current_column = 0; + current_line++; + } + else 
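/* Editor's note: a `tunable` initializer must carry a constant list,
   e.g. (hypothetical source line)

     tunable int32 TM = {16, 32, 64};

   initializer::codegen above turns that list into an ir::metaparameter
   and registers it as a module global instead of emitting code;
   presumably a tuner later fixes one of the listed values per
   compilation, and a missing list is rejected with the runtime_error
   shown above. */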
if (text[i] == '\t') + current_column += 8 - (current_column % 8); + else + current_column++; + } +} + +void print_error(const char *cerror) { + std::string error(cerror); + auto it = error.find("syntax error,"); + error.replace(it, 13, ""); + std::cerr << "error at line " << current_line << " (column " << current_column << "): " << error << std::endl; + throw std::runtime_error("compilation failed"); +} + +char return_impl(char t, const char * yytext) { + update_location(yytext); + return t; +} + +yytokentype return_impl(yytokentype t, const char * yytext){ + update_location(yytext); + return t; +} + +void return_void(const char * yytext){ + update_location(yytext); +} + +} + +} diff --git a/lib/ast/expression.cpp b/lib/ast/expression.cpp new file mode 100644 index 000000000..7b6f43429 --- /dev/null +++ b/lib/ast/expression.cpp @@ -0,0 +1,329 @@ +#include "triton/ast/expression.h" +#include "triton/ast/declaration.h" +#include "triton/ir/constant.h" +#include "triton/ir/module.h" +#include "triton/ir/builder.h" +#include "triton/ir/type.h" + + +namespace triton{ + +namespace ast{ + + +/* Binary operator */ +ir::value *binary_expression::llvm_op(ir::module *mod, ir::builder &builder, ir::value *lhs, ir::value *rhs, const std::string &name) const +{ + bool is_float = false, is_ptr = false, is_int = false, is_signed = false; + implicit_cast(builder, lhs, rhs, is_float, is_ptr, is_int, is_signed); + implicit_broadcast(mod, lhs, rhs); + if(op_==MUL && is_float) + return builder.create_fmul(lhs, rhs, name); + if(op_==MUL && is_int) + return builder.create_mul(lhs, rhs, name); + if(op_==DIV && is_float) + return builder.create_fdiv(lhs, rhs, name); + if(op_==DIV && is_int && is_signed) + return builder.create_sdiv(lhs, rhs, name); + if(op_==DIV && is_int && !is_signed) + return builder.create_udiv(lhs, rhs, name); + if(op_==MOD && is_float) + return builder.create_frem(lhs, rhs, name); + if(op_==MOD && is_int && is_signed) + return builder.create_srem(lhs, rhs, name); + if(op_==MOD && is_int && !is_signed) + return builder.create_urem(lhs, rhs, name); + if(op_==ADD && is_float) + return builder.create_fadd(lhs, rhs, name); + if(op_==ADD && is_int) + return builder.create_add(lhs, rhs); + if(op_==ADD && is_ptr) + return builder.create_gep(lhs, {rhs}); + if(op_==SUB && is_float) + return builder.create_fsub(lhs, rhs, name); + if(op_==SUB && is_int) + return builder.create_sub(lhs, rhs, name); + if(op_==SUB && is_ptr) + return builder.create_gep(lhs, {builder.create_neg(rhs)}); + if(op_==LEFT_SHIFT) + return builder.create_shl(lhs, rhs, name); + if(op_==RIGHT_SHIFT) + return builder.create_ashr(lhs, rhs, name); + if(op_ == LT && is_float) + return builder.create_fcmpOLT(lhs, rhs, name); + if(op_ == LT && is_int && is_signed) + return builder.create_icmpSLT(lhs, rhs, name); + if(op_ == LT && is_int && !is_signed) + return builder.create_icmpULT(lhs, rhs, name); + if(op_ == GT && is_float) + return builder.create_fcmpOGT(lhs, rhs, name); + if(op_ == GT && is_int && is_signed) + return builder.create_icmpSGT(lhs, rhs, name); + if(op_ == GT && is_int && !is_signed) + return builder.create_icmpUGT(lhs, rhs, name); + if(op_ == LE && is_float) + return builder.create_fcmpOLE(lhs, rhs, name); + if(op_ == LE && is_int && is_signed) + return builder.create_icmpSLE(lhs, rhs, name); + if(op_ == LE && is_int && !is_signed) + return builder.create_icmpULE(lhs, rhs, name); + if(op_ == GE && is_float) + return builder.create_fcmpOGE(lhs, rhs, name); + if(op_ == GE && is_int && is_signed) + return 
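/* Editor's note: operand preparation happens in a fixed order before the
   big dispatch above. implicit_cast first unifies scalar types (pointer
   beats float beats int, narrower integers widen, always treated as
   signed for now), then implicit_broadcast reconciles tile shapes. A
   sketch for an i32 scalar added to an fp32[16,16] tile:

     rhs = si_to_fp(rhs)        // implicit_cast to the float operand
     rhs = splat(rhs, {16,16})  // implicit_broadcast scalar -> tile
     create_fadd(lhs, rhs)      // dispatch on (ADD, is_float)          */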
builder.create_icmpSGE(lhs, rhs, name); + if(op_ == GE && is_int && !is_signed) + return builder.create_icmpUGE(lhs, rhs, name); + if(op_ == EQ && is_float) + return builder.create_fcmpOEQ(lhs, rhs, name); + if(op_ == EQ && is_int) + return builder.create_icmpEQ(lhs, rhs, name); + if(op_ == NE && is_float) + return builder.create_fcmpONE(lhs, rhs, name); + if(op_ == NE && is_int) + return builder.create_icmpNE(lhs, rhs, name); + if(op_ == AND) + return builder.create_and(lhs, rhs, name); + if(op_ == XOR) + return builder.create_xor(lhs, rhs, name); + if(op_ == OR) + return builder.create_or(lhs, rhs, name); + if(op_ == LAND) + return builder.create_and(lhs, rhs, name); + if(op_ == LOR) + return builder.create_or(lhs, rhs, name); + throw std::runtime_error("unreachable"); +} + +ir::value* binary_expression::codegen(ir::module *mod) const{ + ir::value *lhs = lhs_->codegen(mod); + ir::value *rhs = rhs_->codegen(mod); + ir::value *result = llvm_op(mod, mod->get_builder(), lhs, rhs, ""); + return result; +} + +/* Builtin expression */ + +// alloc constant +ir::value* alloc_const_expression::codegen(ir::module *mod) const { + ir::type *ty = spec_->type(mod); + ir::constant_int *size = (ir::constant_int*)size_->codegen(mod); + ir::alloc_const *res = new ir::alloc_const(ty, size); + return res; +} + +// get_global_range +ir::value* get_global_range_expression::codegen(ir::module *mod) const { + ir::builder &builder = mod->get_builder(); + return builder.create_get_global_range(axis_->value(), (ir::constant_int*)size_->codegen(mod)); +} + +// get_range_id +ir::value* get_range_id_expression::codegen(ir::module *mod) const { + return mod->get_builder().create_get_range_id(axis_->value()); +} + +// atomic cas +ir::value* atomic_cas_expression::codegen(ir::module *mod) const { + ir::value *ptr = ptr_->codegen(mod); + ir::value *cmp = cmp_->codegen(mod); + ir::value *val = val_->codegen(mod); + return mod->get_builder().create_atomic_cas(ptr, cmp, val); +} + +// matmul +ir::value* matmul_expression::codegen(ir::module *mod) const { + ir::value *A = A_->codegen(mod); + ir::value *B = B_->codegen(mod); + ir::value *C = C_->codegen(mod); +// unsigned M = A->get_type()->get_tile_shapes()[0]; +// unsigned N = B->get_type()->get_tile_shapes()[1]; +// ir::type *scalar_ty = A->get_type()->get_scalar_ty(); +// ir::type *tile_ty = ir::tile_type::get(scalar_ty, {M, N}); +// ir::value *tmp = ir::undef_value::get(tile_ty); +// implicit_broadcast(mod, tmp, C); + return mod->get_builder().create_dot(A, B, C); +} + +// min +ir::value* min_expression::codegen(ir::module *mod) const { + ir::value* cmp = binary_expression(LT, (node*)x_, (node*)y_).codegen(mod); + ir::value* x = ((ir::cmp_inst*)cmp)->get_operand(0); + ir::value* y = ((ir::cmp_inst*)cmp)->get_operand(1); + return mod->get_builder().create_select(cmp, x, y); +} + +// max +ir::value* max_expression::codegen(ir::module *mod) const { + ir::value* cmp = binary_expression(GT, (node*)x_, (node*)y_).codegen(mod); + ir::value* x = ((ir::cmp_inst*)cmp)->get_operand(0); + ir::value* y = ((ir::cmp_inst*)cmp)->get_operand(1); + return mod->get_builder().create_select(cmp, x, y); +} + +// select +ir::value* select_expression::codegen(ir::module *mod) const { + ir::value* pred = pred_->codegen(mod); + ir::value* if_value = if_value_->codegen(mod); + ir::value* else_value = else_value_->codegen(mod); + return mod->get_builder().create_select(pred, if_value, else_value); +} + +// Trans +ir::value* trans_expression::codegen(ir::module *mod) const { + return 
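/* Editor's note: min/max reuse binary_expression for the comparison and
   then read the operands back out of the emitted cmp instruction:

     cmp = binary_expression(LT, x, y).codegen(mod); // casts + broadcasts
     create_select(cmp, cmp->op(0), cmp->op(1))

   taking the operands from the cmp, rather than re-codegen'ing x and y,
   guarantees the selected values are the already-casted and
   already-broadcast versions and avoids emitting the subexpressions
   twice. */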
mod->get_builder().create_trans(arg_->codegen(mod)); +} + +/* Postfix expression */ +ir::value* indexing_expression::codegen(ir::module *mod) const{ + ir::value *in = mod->get_value(id_->name()); + const std::vector &slices = slices_->values(); + auto in_shapes = in->get_type()->get_tile_shapes(); + ir::type::tile_shapes_t::value_type one = ir::tile_type::make_one(mod->get_context()); + ir::type::tile_shapes_t out_shapes(slices.size()); + // create shapes + size_t current = 0; + for(size_t i = 0; i < out_shapes.size(); i++) + out_shapes[i] = (slices[i]->type()==NEWAXIS)?one:in_shapes[current++]; + return mod->get_builder().create_reshape(in, out_shapes); +} + + +/* Unary operator */ +ir::value *unary_expression::llvm_op(ir::builder &builder, ir::value *arg, const std::string &name) const{ + ir::type *atype = arg->get_type(); + bool is_float = atype->is_floating_point_ty(); + bool is_int = atype->is_integer_ty(); + if(op_ == INC) + return builder.create_add(arg, builder.get_int32(1), name); + if(op_ == DEC) + return builder.create_sub(arg, builder.get_int32(1), name); + if(op_ == PLUS) + return arg; + if(op_ == MINUS && is_float) + return builder.create_fneg(arg, name); + if(op_ == MINUS && is_int) + return builder.create_neg(arg, name); + if(op_ == ADDR) + throw std::runtime_error("not supported"); + if(op_ == DEREF) + return builder.create_load(arg, name); + if(op_ == COMPL) + throw std::runtime_error("not supported"); + if(op_ == NOT) + return builder.create_not(arg, name); + throw std::runtime_error("unreachable"); +} + +ir::value* unary_expression::codegen(ir::module *mod) const{ + ir::value *arg = arg_->codegen(mod); + ir::value *result = llvm_op(mod->get_builder(), arg, ""); + return result; +} + +/* Cast operator */ +ir::value *cast_expression::llvm_op(ir::builder &builder, ir::type *T, ir::value *arg, const std::string &name) const{ + return nullptr; +} + +ir::value* cast_expression::codegen(ir::module *mod) const{ + ir::value *arg = arg_->codegen(mod); + ir::type *T = T_->type(mod); + return llvm_op(mod->get_builder(), T, arg, ""); +} + +/* Conditional expression */ +ir::value *conditional_expression::codegen(ir::module *mod) const{ + ir::builder &builder = mod->get_builder(); + ir::value *pred = cond_->codegen(mod); + ir::instruction *mask = (ir::instruction*)builder.create_mask(pred); + ir::value *true_mask = mask->get_result(0); + ir::value *false_mask = mask->get_result(1); + ir::value *true_value = true_value_->codegen(mod); + ir::value *false_value = false_value_->codegen(mod); + if(auto *itn = dynamic_cast(true_value)) + itn->set_mask_pred(true_mask); + if(auto *itn = dynamic_cast(false_value)) + itn->set_mask_pred(false_mask); + bool is_float, is_ptr, is_int, is_signed; + ir::value *uncasted_true_value = true_value; + ir::value *uncasted_false_value = false_value; + implicit_cast(builder, true_value, false_value, is_float, is_ptr, is_int, is_signed); + implicit_broadcast(mod, true_value, false_value); + { + ir::value *current = true_value; + while(current != uncasted_true_value) { + if(auto *itn = dynamic_cast(current)){ + itn->set_mask_pred(true_mask); + current = itn->get_operand(0); + } + else + break; + } + } + { + ir::value *current = false_value; + while(current != uncasted_false_value) { + if(auto *itn = dynamic_cast(current)){ + itn->set_mask_pred(false_mask); + current = itn->get_operand(0); + } + else + break; + } + } + ir::value *result = builder.create_merge(true_mask, true_value, false_mask, false_value); + return result; +} + +/* Assignment expression */ 
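/* Editor's note: assignment dispatches on the statically tracked lvalue
   kind. A hedged sketch of the two accepted forms:

     v = expr;    // named lvalue: explicit_cast + implicit_broadcast to
                  // v's declared type, then module::set_value (SSA rename)
     *p = expr;   // DEREF lvalue: codegen the pointer, emit create_store

   any other lvalue falls through both dynamic_casts and the assignment
   is silently dropped, which looks like a latent gap rather than a
   supported case. */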
+ir::value *assignment_expression::codegen(ir::module *mod) const{ + ir::value *rvalue = rvalue_->codegen(mod); + if(auto *x = dynamic_cast(lvalue_)){ + ir::type *ty = mod->get_scope().types.at(x->id()->name()); + rvalue = explicit_cast(mod->get_builder(), rvalue, ty); + implicit_broadcast(mod, ty, rvalue); + mod->set_value(x->id()->name(), rvalue); + } + else if(auto* x = dynamic_cast(lvalue_)){ + assert(x->get_op()==DEREF); + assert(x->lvalue()); + ir::value *ptr = x->lvalue()->codegen(mod); + rvalue = mod->get_builder().create_store(ptr, rvalue); + } + return rvalue; +} + + +/* String literal */ +ir::value* string_literal::codegen(ir::module *) const{ + throw std::runtime_error("not supported"); +// return ir::constant_data_array::get_string(mod->get_context(), value_); +} + +/* Constant */ +ir::value* constant::codegen(ir::module *mod) const{ + return mod->get_builder().get_int32(value_); +} + +int constant::value() const{ + return value_; +} + +/* Constant range */ +ir::value* constant_range::codegen(ir::module *mod) const{ + return ir::constant_range::get((ir::constant_int*)first_->codegen(mod), + (ir::constant_int*)last_->codegen(mod)); +} + +/* Named */ +ir::value* named_expression::codegen(ir::module *mod) const{ + const std::string &name = id()->name(); + const auto& declarations = mod->get_scope().types; + if(declarations.find(name) == declarations.end()) + throw std::runtime_error("variable " + name + " not declared"); + return mod->get_value(name); +} + +} + +} diff --git a/lib/ast/lowering.cpp b/lib/ast/lowering.cpp deleted file mode 100644 index 3f8623e1c..000000000 --- a/lib/ast/lowering.cpp +++ /dev/null @@ -1,855 +0,0 @@ -#include -#include -#include "triton/ast/ast.h" -#include "triton/ir/constant.h" -#include "triton/ir/function.h" -#include "triton/ir/module.h" -#include "triton/ir/basic_block.h" -#include "triton/ir/builder.h" -#include "triton/ir/type.h" -#include -#include - - -namespace triton{ - -namespace ast{ - -static int current_line = 0; -static int current_column = 0; - -/* node */ -ir::value *node::explicit_cast(ir::builder &builder, ir::value *src, ir::type *dst_ty){ - ir::type *src_scalar_ty = src->get_type()->get_scalar_ty(); - ir::type *dst_scalar_ty = dst_ty->get_scalar_ty(); - bool src_signed = false; - bool dst_signed = false; - if(src_scalar_ty == dst_scalar_ty) - return src; - else if(src_scalar_ty->is_integer_ty() && src_signed && dst_scalar_ty->is_floating_point_ty()) - return builder.create_si_to_fp(src, dst_ty); - - else if(src_scalar_ty->is_integer_ty() && !src_signed && dst_scalar_ty->is_floating_point_ty()) - return builder.create_ui_to_fp(src, dst_ty); - - else if(src_scalar_ty->is_floating_point_ty() && dst_scalar_ty->is_integer_ty() && dst_signed) - return builder.create_fp_to_si(src, dst_ty); - - else if(src_scalar_ty->is_floating_point_ty() && dst_scalar_ty->is_integer_ty() && !dst_signed) - return builder.create_fp_to_ui(src, dst_ty); - - else if(src_scalar_ty->is_floating_point_ty() && dst_scalar_ty->is_floating_point_ty() && - src_scalar_ty->get_fp_mantissa_width() < dst_scalar_ty->get_fp_mantissa_width()) - return builder.create_fp_ext(src, dst_ty); - - else if(src_scalar_ty->is_floating_point_ty() && dst_scalar_ty->is_floating_point_ty() && - src_scalar_ty->get_fp_mantissa_width() > dst_scalar_ty->get_fp_mantissa_width()) - return builder.create_fp_trunc(src, dst_ty); - - else if(src_scalar_ty->is_integer_ty() && dst_scalar_ty->is_integer_ty() && - src_scalar_ty->get_integer_bitwidth()) - return builder.create_int_cast(src, dst_ty, 
dst_signed); - - else - throw std::runtime_error("unreachable"); -} - - -void node::implicit_cast(ir::builder &builder, ir::value *&lhs, ir::value *&rhs, - bool &is_float, bool &is_ptr, bool &is_int, bool &is_signed){ - // Input types - ir::type *left_ty = lhs->get_type()->get_scalar_ty(); - ir::type *right_ty = rhs->get_type()->get_scalar_ty(); - // One operand is pointer - if(left_ty->is_pointer_ty() || right_ty->is_pointer_ty()){ - if(left_ty->is_pointer_ty() && right_ty->is_pointer_ty()) - throw std::runtime_error("invalid operands"); - if(right_ty->is_pointer_ty()) - std::swap(lhs, rhs); - is_ptr = true; - } - // One operand is double - else if(left_ty->is_double_ty() || right_ty->is_double_ty()){ - ir::value *&to_convert = left_ty->is_double_ty()?rhs:lhs; - to_convert = explicit_cast(builder, to_convert, builder.get_double_ty()); - is_float = true; - } - // One operand is float - else if(left_ty->is_float_ty() || right_ty->is_float_ty()){ - ir::value *&to_convert = left_ty->is_float_ty()?rhs:lhs; - to_convert = explicit_cast(builder, to_convert, builder.get_float_ty()); - is_float = true; - } - // Both operands are integers - else if(left_ty->is_integer_ty() && right_ty->is_integer_ty()){ - is_int = true; - is_signed = true; // always signed for now - if(left_ty->get_integer_bitwidth() != right_ty->get_integer_bitwidth()){ - ir::value *&to_convert = (left_ty->get_integer_bitwidth() > right_ty->get_integer_bitwidth())?rhs:lhs; - ir::type *dst_ty = (to_convert==lhs)?right_ty:left_ty; - to_convert = explicit_cast(builder, to_convert, dst_ty); - } - } - // Not reachable - else - throw std::runtime_error("unreachable"); -} - -void node::implicit_broadcast(ir::module *mod, ir::value *&lhs, ir::value *&rhs) { - ir::type *lhs_ty = lhs->get_type(); - ir::type *rhs_ty = rhs->get_type(); - ir::type *res_ty = nullptr; - if(!lhs_ty->is_tile_ty() && !rhs_ty->is_tile_ty()) - return; - else if(lhs_ty->is_tile_ty() && !rhs_ty->is_tile_ty()) - res_ty = lhs_ty; - else if(!lhs_ty->is_tile_ty() && rhs_ty->is_tile_ty()) - res_ty = rhs_ty; - else{ - auto lhs_shapes = lhs_ty->get_tile_shapes(); - auto rhs_shapes = rhs_ty->get_tile_shapes(); - size_t lhs_size = lhs_shapes.size(); - size_t rhs_size = rhs_shapes.size(); - size_t res_size = std::max(lhs_size, rhs_size); - ir::type::tile_shapes_t res_shapes(res_size); - ir::type::tile_shapes_t::value_type one = ir::tile_type::make_one(mod->get_context()); - for(int i = 0; i < res_size; i++){ - if(i >= res_size - lhs_size && i >= res_size - rhs_size) - res_shapes[i] = lhs_shapes[i]==one?rhs_shapes[i]:lhs_shapes[i]; - else if(i >= res_size - lhs_size) - res_shapes[i] = lhs_shapes[i]; - else if(i >= res_size - rhs_size) - res_shapes[i] = rhs_shapes[i]; - } - res_ty = ir::tile_type::get(lhs_ty->get_scalar_ty(), res_shapes); - } - implicit_broadcast(mod, res_ty, rhs); - implicit_broadcast(mod, res_ty, lhs); -} - -void node::implicit_broadcast(ir::module *mod, ir::type *ty, ir::value *&src){ - ir::builder &builder = mod->get_builder(); - ir::type *src_ty = src->get_type(); - ir::type::tile_shapes_t::value_type one = ir::tile_type::make_one(mod->get_context()); - // Both are scalar - if(!ty->is_tile_ty() && !src_ty->is_tile_ty()) - return; - // Broadcast scalar - if(ty->is_tile_ty() && !src_ty->is_tile_ty()){ - src = builder.create_splat(src, ty->get_tile_shapes()); - return; - } - // Downcast tile - if(!ty->is_tile_ty() && src_ty->is_tile_ty()){ - for(ir::constant *shape: src_ty->get_tile_shapes()) - if(shape != one) - throw std::runtime_error("cannot downcast"); - 
src = builder.create_downcast(src); - return; - } - // Both are arrays - auto dst_shapes = ty->get_tile_shapes(); - auto src_shapes = src_ty->get_tile_shapes(); - int dst_dim = dst_shapes.size(); - int src_dim = src_shapes.size(); - // Pad - int off = dst_dim - src_dim; - for(size_t i = 0; i < off; i++) - src_shapes.insert(src_shapes.begin(), one); - if(off > 0) - src = builder.create_reshape(src, src_shapes); - // Broadcast - for(int i = dst_dim - 1; i>= 0; i--) - if(dst_shapes[i] != src_shapes[i] && dst_shapes[i] != one && src_shapes[i] != one) - throw std::runtime_error("cannot broadcast"); - if(dst_shapes != src_shapes) - src = builder.create_broadcast(src, dst_shapes); -} - -/* Helper */ -inline bool is_terminator(ir::value* x) { - return x && dynamic_cast(x); -} - -/* Translation unit */ -ir::value* translation_unit::codegen(ir::module *mod) const{ - mod->add_new_scope(); - decls_.codegen(mod); - return nullptr; -} - -/* Declaration specifier */ -ir::type* typed_declaration_specifier::type(ir::module *mod) const { - ir::context &ctx = mod->get_context(); - switch (ty_) { - case VOID_T: return ir::type::get_void_ty(ctx); - case INT1_T: return ir::type::get_int1_ty(ctx); - case INT8_T: return ir::type::get_int8_ty(ctx); - case INT16_T: return ir::type::get_int16_ty(ctx); - case INT32_T: return ir::type::get_int32_ty(ctx); - case INT64_T: return ir::type::get_int64_ty(ctx); - case FLOAT32_T: return ir::type::get_float_ty(ctx); - case FLOAT64_T: return ir::type::get_double_ty(ctx); - default: throw std::runtime_error("unreachable"); - } -} - -std::vector typed_declaration_specifier::storage() const { - return {}; -} - - -ir::type* storage_declaration_specifier::type(ir::module *mod) const { - return decl_spec_->type(mod); -} - -std::vector storage_declaration_specifier::storage() const { - auto result = decl_spec_->storage(); - result.push_back(storage_spec_); - return result; -} - - -/* Parameter */ -ir::type* parameter::type(ir::module *mod) const { - return decl_->type(mod, spec_->type(mod), {}); -} - -std::vector parameter::storage() const { - return spec_->storage(); -} - -const identifier *parameter::id() const { - return decl_->id(); -} - -/* Declarators */ -ir::type* declarator::type(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const{ - if(ptr_) - return type_impl(mod, ptr_->type(mod, type, storage), storage); - return type_impl(mod, type, storage); -} - -// Identifier -ir::type* identifier::type_impl(ir::module *, ir::type *type, storage_spec_vec_const_ref_t) const{ - return type; -} - -const std::string &identifier::name() const{ - return name_; -} - -// Tile -ir::type* tile::type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t) const{ - ir::type::tile_shapes_t shapes; - for(expression *expr: shapes_->values()){ - ir::constant_int *shape = dynamic_cast(expr->codegen(mod)); - assert(shape); - shapes.push_back(shape); - } - return ir::tile_type::get(type, shapes); -} - - -// Pointer -ir::type* pointer::type_impl(ir::module*, ir::type *type, storage_spec_vec_const_ref_t storage) const{ - bool is_ptr_to_const = std::find(storage.begin(), storage.end(), CONSTANT_SPACE_T) != storage.end(); - return ir::pointer_type::get(type, is_ptr_to_const?4:1); -} - -// Function -void function::bind_parameters(ir::module *mod, ir::function *fn) const{ - std::vector args = fn->args(); - assert(args.size() == args_->values().size()); - for(size_t i = 0; i < args.size(); i++){ - parameter *param_i = args_->values().at(i); - const identifier *id_i = 
param_i->id(); - if(id_i){ - args[i]->set_name(id_i->name()); - mod->set_value(id_i->name(), nullptr, args[i]); - mod->get_scope().types[id_i->name()] = args[i]->get_type(); - } - } -} - -ir::type* function::type_impl(ir::module* mod, ir::type *type, storage_spec_vec_const_ref_t) const{ - std::vector types; - for(parameter* param: args_->values()) - types.push_back(param->type(mod)); - return ir::function_type::get(type, types); -} - -/* Function definition */ -ir::attribute_t get_ir_attr(STORAGE_SPEC_T spec){ - switch(spec){ - case RESTRICT_T: return ir::noalias; - case READONLY_T: return ir::readonly; - case WRITEONLY_T: return ir::writeonly; - default: throw std::runtime_error("cannot convert storage specifier to IR function attribute"); - } -} - -ir::value* function_definition::codegen(ir::module *mod) const{ - ir::function_type *prototype = (ir::function_type*)header_->type(mod, spec_->type(mod), spec_->storage()); - const std::string &name = header_->id()->name(); - ir::function *fn = mod->get_or_insert_function(name, prototype); - for(unsigned i = 0; i < header_->get_num_args(); i++){ - parameter *param = header_->get_arg(i); - std::vector storage = param->storage(); - for(STORAGE_SPEC_T spec: storage) - fn->add_attr(1 + i, get_ir_attr(spec)); - } - header_->bind_parameters(mod, fn); - ir::basic_block *entry = ir::basic_block::create(mod->get_context(), "entry", fn); - mod->seal_block(entry); - mod->get_builder().set_insert_point(entry); - body_->codegen(mod); - mod->get_builder().create_ret_void(); - return nullptr; -} - -/* Statements */ -ir::value* compound_statement::codegen(ir::module* mod) const{ - mod->add_new_scope(); - if(items_) - items_->codegen(mod); - mod->pop_scope(); - return nullptr; -} - -/* expression statement */ -ir::value* expression_statement::codegen(ir::module *mod) const{ - ir::builder &builder = mod->get_builder(); - ir::basic_block *block = builder.get_insert_block(); - if(pred_) { - // check that it is an assignment - assignment_expression *assignment = dynamic_cast(expr_); - assert(assignment); - // generate mask - ir::value *pred = pred_->codegen(mod); - ir::mask_inst *mask = (ir::mask_inst*)builder.create_mask(pred); - // generate expression - unsigned szbegin = block->get_inst_list().size(); - ir::value *expr = expr_->codegen(mod); - ir::basic_block::iterator begin = block->begin(); - std::advance(begin, szbegin); - // set mask - ir::type *ty = expr->get_type(); - for(auto it = begin; it != builder.get_insert_point(); it++) - (*it)->set_mask_pred(mask->get_result(0)); -// if(auto *itn = dynamic_cast(expr)) -// itn->set_mask_pred(mask->get_result(0)); - if(ty->is_void_ty()) - return expr; - // merge with psi - ir::psi_inst *psi = (ir::psi_inst*)builder.create_merge(mask->get_result(0), expr, - mask->get_result(1), ir::undef_value::get(ty)); - std::string name = ((named_expression*)assignment->lvalue())->id()->name(); - mod->set_value(name, psi); - return psi; - } - return expr_->codegen(mod); -} - -/* For statement */ -ir::value* iteration_statement::codegen(ir::module *mod) const{ - ir::builder &builder = mod->get_builder(); - ir::context &ctx = mod->get_context(); - ir::basic_block *current_bb = builder.get_insert_block(); - ir::function *fn = current_bb->get_parent(); - ir::basic_block *loop_bb = ir::basic_block::create(ctx, "loop", fn); - ir::basic_block *next_bb = ir::basic_block::create(ctx, "postloop", fn); - mod->set_continue_fn([&](){ - if(exec_) - exec_->codegen(mod); - ir::value *cond = explicit_cast(builder, stop_->codegen(mod), 
ir::type::get_int1_ty(ctx)); - return builder.create_cond_br(cond, loop_bb, next_bb); - }); - init_->codegen(mod); - ir::value *cond = explicit_cast(builder, stop_->codegen(mod), ir::type::get_int1_ty(ctx)); - builder.create_cond_br(cond, loop_bb, next_bb); -// builder.create_br(loop_bb); - builder.set_insert_point(loop_bb); - if(!is_terminator(statements_->codegen(mod))) - mod->get_continue_fn()(); - ir::basic_block *stop_bb = builder.get_insert_block(); - mod->seal_block(stop_bb); - mod->seal_block(loop_bb); - mod->seal_block(builder.get_insert_block()); - mod->seal_block(next_bb); - builder.set_insert_point(next_bb); - return nullptr; -} - -/* While statement */ -ir::value* while_statement::codegen(ir::module* mod) const{ - ir::builder &builder = mod->get_builder(); - ir::context &ctx = mod->get_context(); - ir::basic_block *current_bb = builder.get_insert_block(); - ir::function *fn = current_bb->get_parent(); - ir::basic_block *loop_bb = ir::basic_block::create(ctx, "loop", fn); - ir::basic_block *next_bb = ir::basic_block::create(ctx, "postloop", fn); - mod->set_continue_fn([&](){ - ir::value *cond = explicit_cast(builder, cond_->codegen(mod), ir::type::get_int1_ty(ctx)); - return builder.create_cond_br(cond, loop_bb, next_bb); - }); - ir::value *cond = explicit_cast(builder, cond_->codegen(mod), ir::type::get_int1_ty(ctx)); - builder.create_cond_br(cond, loop_bb, next_bb); - builder.set_insert_point(loop_bb); - if(!is_terminator(statements_->codegen(mod))) - mod->get_continue_fn()(); - ir::basic_block *stop_bb = builder.get_insert_block(); - mod->seal_block(stop_bb); - mod->seal_block(loop_bb); - mod->seal_block(builder.get_insert_block()); - mod->seal_block(next_bb); - builder.set_insert_point(next_bb); - return nullptr; -} - -/* Selection statement */ -ir::value* selection_statement::codegen(ir::module* mod) const{ - ir::builder &builder = mod->get_builder(); - ir::context &ctx = mod->get_context(); - ir::function *fn = builder.get_insert_block()->get_parent(); - ir::value *cond = cond_->codegen(mod); - ir::basic_block *then_bb = ir::basic_block::create(ctx, "then", fn); - ir::basic_block *else_bb = else_value_?ir::basic_block::create(ctx, "else", fn):nullptr; - ir::basic_block *endif_bb = ir::basic_block::create(ctx, "endif", fn); - mod->seal_block(then_bb); - if(else_value_) - mod->seal_block(else_bb); - - // Branch - if(else_value_) - builder.create_cond_br(cond, then_bb, else_bb); - else - builder.create_cond_br(cond, then_bb, endif_bb); - // Then - builder.set_insert_point(then_bb); - if(!is_terminator(then_value_->codegen(mod))) - builder.create_br(endif_bb); - // Else - if(else_value_){ - builder.set_insert_point(else_bb); - if(!is_terminator(else_value_->codegen(mod))) - builder.create_br(endif_bb); - } - // Endif - mod->seal_block(endif_bb); - builder.set_insert_point(endif_bb); - return nullptr; -} - -/* Continue statement */ -ir::value* continue_statement::codegen(ir::module *mod) const{ - return mod->get_continue_fn()(); -} - -/* Declaration */ -ir::value* declaration::codegen(ir::module* mod) const{ - for(initializer *init: init_->values()) - init->set_specifier(spec_); - init_->codegen(mod); - return nullptr; -} - -/* Initializer */ -ir::type* initializer::type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const{ - return decl_->type(mod, type, storage); -} - -void initializer::set_specifier(const declaration_specifier *spec) { - spec_ = spec; -} - -ir::value* initializer::codegen(ir::module * mod) const{ - std::vector storage = 
spec_->storage(); - ir::type *ty = decl_->type(mod, spec_->type(mod), storage); - std::string name = decl_->id()->name(); - ir::value *value = ir::undef_value::get(ty); - if(std::find(storage.begin(), storage.end(), TUNABLE_T) != storage.end()){ - auto csts = dynamic_cast*>((node*)expr_); - if(csts == nullptr) - throw std::runtime_error("must specify constant list for metaparameters"); - std::vector values; - for(constant* cst: csts->values()) - values.push_back(cst->value()); - value = ir::metaparameter::create(mod->get_context(), ty, values); - mod->register_global(name, value); - } - else if(expr_){ - value = expr_->codegen(mod); - value = explicit_cast(mod->get_builder(), value, ty); - implicit_broadcast(mod, ty, value); - } - value->set_name(name); - mod->set_value(name, value); - mod->get_scope().types[name] = ty; - if(auto *x = dynamic_cast(value)) - mod->add_alloc(x); - if(std::find(storage.begin(), storage.end(), CONST_T) != storage.end()) - mod->set_const(name); - return value; -} - -/*------------------*/ -/* Expression */ -/*------------------*/ -/* Binary operator */ -ir::value *binary_operator::llvm_op(ir::module *mod, ir::builder &builder, ir::value *lhs, ir::value *rhs, const std::string &name) const -{ - bool is_float = false, is_ptr = false, is_int = false, is_signed = false; - implicit_cast(builder, lhs, rhs, is_float, is_ptr, is_int, is_signed); - implicit_broadcast(mod, lhs, rhs); - if(op_==MUL && is_float) - return builder.create_fmul(lhs, rhs, name); - if(op_==MUL && is_int) - return builder.create_mul(lhs, rhs, name); - if(op_==DIV && is_float) - return builder.create_fdiv(lhs, rhs, name); - if(op_==DIV && is_int && is_signed) - return builder.create_sdiv(lhs, rhs, name); - if(op_==DIV && is_int && !is_signed) - return builder.create_udiv(lhs, rhs, name); - if(op_==MOD && is_float) - return builder.create_frem(lhs, rhs, name); - if(op_==MOD && is_int && is_signed) - return builder.create_srem(lhs, rhs, name); - if(op_==MOD && is_int && !is_signed) - return builder.create_urem(lhs, rhs, name); - if(op_==ADD && is_float) - return builder.create_fadd(lhs, rhs, name); - if(op_==ADD && is_int) - return builder.create_add(lhs, rhs); - if(op_==ADD && is_ptr) - return builder.create_gep(lhs, {rhs}); - if(op_==SUB && is_float) - return builder.create_fsub(lhs, rhs, name); - if(op_==SUB && is_int) - return builder.create_sub(lhs, rhs, name); - if(op_==SUB && is_ptr) - return builder.create_gep(lhs, {builder.create_neg(rhs)}); - if(op_==LEFT_SHIFT) - return builder.create_shl(lhs, rhs, name); - if(op_==RIGHT_SHIFT) - return builder.create_ashr(lhs, rhs, name); - if(op_ == LT && is_float) - return builder.create_fcmpOLT(lhs, rhs, name); - if(op_ == LT && is_int && is_signed) - return builder.create_icmpSLT(lhs, rhs, name); - if(op_ == LT && is_int && !is_signed) - return builder.create_icmpULT(lhs, rhs, name); - if(op_ == GT && is_float) - return builder.create_fcmpOGT(lhs, rhs, name); - if(op_ == GT && is_int && is_signed) - return builder.create_icmpSGT(lhs, rhs, name); - if(op_ == GT && is_int && !is_signed) - return builder.create_icmpUGT(lhs, rhs, name); - if(op_ == LE && is_float) - return builder.create_fcmpOLE(lhs, rhs, name); - if(op_ == LE && is_int && is_signed) - return builder.create_icmpSLE(lhs, rhs, name); - if(op_ == LE && is_int && !is_signed) - return builder.create_icmpULE(lhs, rhs, name); - if(op_ == GE && is_float) - return builder.create_fcmpOGE(lhs, rhs, name); - if(op_ == GE && is_int && is_signed) - return builder.create_icmpSGE(lhs, rhs, name); - 
if(op_ == GE && is_int && !is_signed) - return builder.create_icmpUGE(lhs, rhs, name); - if(op_ == EQ && is_float) - return builder.create_fcmpOEQ(lhs, rhs, name); - if(op_ == EQ && is_int) - return builder.create_icmpEQ(lhs, rhs, name); - if(op_ == NE && is_float) - return builder.create_fcmpONE(lhs, rhs, name); - if(op_ == NE && is_int) - return builder.create_icmpNE(lhs, rhs, name); - if(op_ == AND) - return builder.create_and(lhs, rhs, name); - if(op_ == XOR) - return builder.create_xor(lhs, rhs, name); - if(op_ == OR) - return builder.create_or(lhs, rhs, name); - if(op_ == LAND) - return builder.create_and(lhs, rhs, name); - if(op_ == LOR) - return builder.create_or(lhs, rhs, name); - throw std::runtime_error("unreachable"); -} - -ir::value* binary_operator::codegen(ir::module *mod) const{ - ir::value *lhs = lhs_->codegen(mod); - ir::value *rhs = rhs_->codegen(mod); - ir::value *result = llvm_op(mod, mod->get_builder(), lhs, rhs, ""); - return result; -} - -/* Builtin expression */ - -// alloc constant -ir::value* alloc_const::codegen(ir::module *mod) const { - ir::type *ty = spec_->type(mod); - ir::constant_int *size = (ir::constant_int*)size_->codegen(mod); - ir::alloc_const *res = new ir::alloc_const(ty, size); - return res; -} - -// get_global_range -ir::value* get_global_range::codegen(ir::module *mod) const { - ir::builder &builder = mod->get_builder(); - return builder.create_get_global_range(axis_->value(), (ir::constant_int*)size_->codegen(mod)); -} - -// get_range_id -ir::value* get_range_id::codegen(ir::module *mod) const { - return mod->get_builder().create_get_range_id(axis_->value()); -} - -// atomic cas -ir::value* atomic_cas::codegen(ir::module *mod) const { - ir::value *ptr = ptr_->codegen(mod); - ir::value *cmp = cmp_->codegen(mod); - ir::value *val = val_->codegen(mod); - return mod->get_builder().create_atomic_cas(ptr, cmp, val); -} - -// matmul -ir::value* matmul_expression::codegen(ir::module *mod) const { - ir::value *A = A_->codegen(mod); - ir::value *B = B_->codegen(mod); - ir::value *C = C_->codegen(mod); -// unsigned M = A->get_type()->get_tile_shapes()[0]; -// unsigned N = B->get_type()->get_tile_shapes()[1]; -// ir::type *scalar_ty = A->get_type()->get_scalar_ty(); -// ir::type *tile_ty = ir::tile_type::get(scalar_ty, {M, N}); -// ir::value *tmp = ir::undef_value::get(tile_ty); -// implicit_broadcast(mod, tmp, C); - return mod->get_builder().create_dot(A, B, C); -} - -// min -ir::value* min_expression::codegen(ir::module *mod) const { - ir::value* cmp = binary_operator(LT, (node*)x_, (node*)y_).codegen(mod); - ir::value* x = ((ir::cmp_inst*)cmp)->get_operand(0); - ir::value* y = ((ir::cmp_inst*)cmp)->get_operand(1); - return mod->get_builder().create_select(cmp, x, y); -} - -// max -ir::value* max_expression::codegen(ir::module *mod) const { - ir::value* cmp = binary_operator(GT, (node*)x_, (node*)y_).codegen(mod); - ir::value* x = ((ir::cmp_inst*)cmp)->get_operand(0); - ir::value* y = ((ir::cmp_inst*)cmp)->get_operand(1); - return mod->get_builder().create_select(cmp, x, y); -} - -// select -ir::value* select_expression::codegen(ir::module *mod) const { - ir::value* pred = pred_->codegen(mod); - ir::value* if_value = if_value_->codegen(mod); - ir::value* else_value = else_value_->codegen(mod); - return mod->get_builder().create_select(pred, if_value, else_value); -} - -// Trans -ir::value* trans_expression::codegen(ir::module *mod) const { - return mod->get_builder().create_trans(arg_->codegen(mod)); -} - -/* Postfix expression */ -ir::value* 
indexing_expression::codegen(ir::module *mod) const{ - ir::value *in = mod->get_value(id_->name()); - const std::vector &slices = slices_->values(); - auto in_shapes = in->get_type()->get_tile_shapes(); - ir::type::tile_shapes_t::value_type one = ir::tile_type::make_one(mod->get_context()); - ir::type::tile_shapes_t out_shapes(slices.size()); - // create shapes - size_t current = 0; - for(size_t i = 0; i < out_shapes.size(); i++) - out_shapes[i] = (slices[i]->type()==NEWAXIS)?one:in_shapes[current++]; - return mod->get_builder().create_reshape(in, out_shapes); -} - - -/* Unary operator */ -ir::value *unary_operator::llvm_op(ir::builder &builder, ir::value *arg, const std::string &name) const{ - ir::type *atype = arg->get_type(); - bool is_float = atype->is_floating_point_ty(); - bool is_int = atype->is_integer_ty(); - if(op_ == INC) - return builder.create_add(arg, builder.get_int32(1), name); - if(op_ == DEC) - return builder.create_sub(arg, builder.get_int32(1), name); - if(op_ == PLUS) - return arg; - if(op_ == MINUS && is_float) - return builder.create_fneg(arg, name); - if(op_ == MINUS && is_int) - return builder.create_neg(arg, name); - if(op_ == ADDR) - throw std::runtime_error("not supported"); - if(op_ == DEREF) - return builder.create_load(arg, name); - if(op_ == COMPL) - throw std::runtime_error("not supported"); - if(op_ == NOT) - return builder.create_not(arg, name); - throw std::runtime_error("unreachable"); -} - -ir::value* unary_operator::codegen(ir::module *mod) const{ - ir::value *arg = arg_->codegen(mod); - ir::value *result = llvm_op(mod->get_builder(), arg, ""); - return result; -} - -/* Cast operator */ -ir::value *cast_operator::llvm_op(ir::builder &builder, ir::type *T, ir::value *arg, const std::string &name) const{ - return nullptr; -} - -ir::value* cast_operator::codegen(ir::module *mod) const{ - ir::value *arg = arg_->codegen(mod); - ir::type *T = T_->type(mod); - return llvm_op(mod->get_builder(), T, arg, ""); -} - -/* Conditional expression */ -ir::value *conditional_expression::codegen(ir::module *mod) const{ - ir::builder &builder = mod->get_builder(); - ir::value *pred = cond_->codegen(mod); - ir::instruction *mask = (ir::instruction*)builder.create_mask(pred); - ir::value *true_mask = mask->get_result(0); - ir::value *false_mask = mask->get_result(1); - ir::value *true_value = true_value_->codegen(mod); - ir::value *false_value = false_value_->codegen(mod); - if(auto *itn = dynamic_cast(true_value)) - itn->set_mask_pred(true_mask); - if(auto *itn = dynamic_cast(false_value)) - itn->set_mask_pred(false_mask); - bool is_float, is_ptr, is_int, is_signed; - ir::value *uncasted_true_value = true_value; - ir::value *uncasted_false_value = false_value; - implicit_cast(builder, true_value, false_value, is_float, is_ptr, is_int, is_signed); - implicit_broadcast(mod, true_value, false_value); - { - ir::value *current = true_value; - while(current != uncasted_true_value) { - if(auto *itn = dynamic_cast(current)){ - itn->set_mask_pred(true_mask); - current = itn->get_operand(0); - } - else - break; - } - } - { - ir::value *current = false_value; - while(current != uncasted_false_value) { - if(auto *itn = dynamic_cast(current)){ - itn->set_mask_pred(false_mask); - current = itn->get_operand(0); - } - else - break; - } - } - ir::value *result = builder.create_merge(true_mask, true_value, false_mask, false_value); - return result; -} - -/* Assignment expression */ -ir::value *assignment_expression::codegen(ir::module *mod) const{ - ir::value *rvalue = 
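The conditional-expression lowering above is the predication machinery in miniature: create_mask splits the predicate into a pair of complementary masks, each arm of the ?: (plus any cast or broadcast instructions stacked on top of it) is tagged with its mask, and create_merge joins the two masked values into a single result. Stripped of the cast-walking loops, the core sequence is roughly the sketch below; emit_ternary is an illustrative name and both arms are assumed to already be instructions:

    // Sketch: r = p ? a : bv with mask-based predication.
    ir::value *emit_ternary(ir::builder &b, ir::value *p,
                            ir::instruction *a, ir::instruction *bv) {
      ir::instruction *mask = (ir::instruction*)b.create_mask(p);
      ir::value *t = mask->get_result(0);  // lanes where p holds
      ir::value *f = mask->get_result(1);  // complementary lanes
      a->set_mask_pred(t);                 // a only executes under p
      bv->set_mask_pred(f);                // bv only executes under !p
      return b.create_merge(t, a, f, bv);  // merge into one value
    }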
rvalue_->codegen(mod); - if(auto *x = dynamic_cast(lvalue_)){ - ir::type *ty = mod->get_scope().types.at(x->id()->name()); - rvalue = explicit_cast(mod->get_builder(), rvalue, ty); - implicit_broadcast(mod, ty, rvalue); - mod->set_value(x->id()->name(), rvalue); - } - else if(auto* x = dynamic_cast(lvalue_)){ - assert(x->get_op()==DEREF); - assert(x->lvalue()); - ir::value *ptr = x->lvalue()->codegen(mod); - rvalue = mod->get_builder().create_store(ptr, rvalue); - } - return rvalue; -} - -/* Type name */ -ir::type *type_name::type(ir::module *mod) const{ - return decl_->type(mod, spec_->type(mod), {}); -} - -/* String literal */ -ir::value* string_literal::codegen(ir::module *) const{ - throw std::runtime_error("not supported"); -// return ir::constant_data_array::get_string(mod->get_context(), value_); -} - -/* Constant */ -ir::value* constant::codegen(ir::module *mod) const{ - return mod->get_builder().get_int32(value_); -} - -int constant::value() const{ - return value_; -} - -/* Constant range */ -ir::value* constant_range::codegen(ir::module *mod) const{ - return ir::constant_range::get((ir::constant_int*)first_->codegen(mod), - (ir::constant_int*)last_->codegen(mod)); -} - -/* Named */ -ir::value* named_expression::codegen(ir::module *mod) const{ - const std::string &name = id()->name(); - const auto& declarations = mod->get_scope().types; - if(declarations.find(name) == declarations.end()) - throw std::runtime_error("variable " + name + " not declared"); - return mod->get_value(name); -} - - -// begin token -void update_location(const char *text) { - for (int i = 0; text[i] != '\0'; i++){ - if (text[i] == '\n'){ - current_column = 0; - current_line++; - } - else if (text[i] == '\t') - current_column += 8 - (current_column % 8); - else - current_column++; - } -} - -void print_error(const char *cerror) { - std::string error(cerror); - auto it = error.find("syntax error,"); - error.replace(it, 13, ""); - std::cerr << "error at line " << current_line << " (column " << current_column << "): " << error << std::endl; - throw std::runtime_error("compilation failed"); -} - -char return_impl(char t, const char * yytext) { - update_location(yytext); - return t; -} - -yytokentype return_impl(yytokentype t, const char * yytext){ - update_location(yytext); - return t; -} - -void return_void(const char * yytext){ - update_location(yytext); -} - -} - -} diff --git a/lib/ast/module.cpp b/lib/ast/module.cpp new file mode 100644 index 000000000..32ae8b4c0 --- /dev/null +++ b/lib/ast/module.cpp @@ -0,0 +1,18 @@ +#include "triton/ast/module.h" +#include "triton/ir/module.h" + + +namespace triton{ + +namespace ast{ + +/* Translation unit */ +ir::value* translation_unit::codegen(ir::module *mod) const{ + mod->add_new_scope(); + decls_.codegen(mod); + return nullptr; +} + +} + +} diff --git a/lib/ast/node.cpp b/lib/ast/node.cpp new file mode 100644 index 000000000..c13bf3db7 --- /dev/null +++ b/lib/ast/node.cpp @@ -0,0 +1,160 @@ +#include "triton/ast/node.h" +#include "triton/ir/builder.h" +#include "triton/ir/module.h" +#include "triton/ir/constant.h" + +namespace triton{ + +namespace ast{ + +/* node */ +ir::value *node::explicit_cast(ir::builder &builder, ir::value *src, ir::type *dst_ty){ + ir::type *src_scalar_ty = src->get_type()->get_scalar_ty(); + ir::type *dst_scalar_ty = dst_ty->get_scalar_ty(); + bool src_signed = false; + bool dst_signed = false; + if(src_scalar_ty == dst_scalar_ty) + return src; + else if(src_scalar_ty->is_integer_ty() && src_signed && dst_scalar_ty->is_floating_point_ty()) + 
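The tab rule in update_location advances the column to the next multiple of 8 rather than by a fixed width, so reported error columns line up with what an editor using 8-column tab stops displays: a tab at column 3 jumps to column 8, and a tab already sitting on a stop jumps a full 8. A self-contained check of that arithmetic:

    #include <cassert>

    // Same tab-stop arithmetic as update_location (8-column stops assumed).
    int next_tab_stop(int col) { return col + 8 - (col % 8); }

    int main() {
      assert(next_tab_stop(0) == 8);
      assert(next_tab_stop(3) == 8);
      assert(next_tab_stop(8) == 16);
      return 0;
    }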
return builder.create_si_to_fp(src, dst_ty); + + else if(src_scalar_ty->is_integer_ty() && !src_signed && dst_scalar_ty->is_floating_point_ty()) + return builder.create_ui_to_fp(src, dst_ty); + + else if(src_scalar_ty->is_floating_point_ty() && dst_scalar_ty->is_integer_ty() && dst_signed) + return builder.create_fp_to_si(src, dst_ty); + + else if(src_scalar_ty->is_floating_point_ty() && dst_scalar_ty->is_integer_ty() && !dst_signed) + return builder.create_fp_to_ui(src, dst_ty); + + else if(src_scalar_ty->is_floating_point_ty() && dst_scalar_ty->is_floating_point_ty() && + src_scalar_ty->get_fp_mantissa_width() < dst_scalar_ty->get_fp_mantissa_width()) + return builder.create_fp_ext(src, dst_ty); + + else if(src_scalar_ty->is_floating_point_ty() && dst_scalar_ty->is_floating_point_ty() && + src_scalar_ty->get_fp_mantissa_width() > dst_scalar_ty->get_fp_mantissa_width()) + return builder.create_fp_trunc(src, dst_ty); + + else if(src_scalar_ty->is_integer_ty() && dst_scalar_ty->is_integer_ty() && + src_scalar_ty->get_integer_bitwidth()) + return builder.create_int_cast(src, dst_ty, dst_signed); + + else + throw std::runtime_error("unreachable"); +} + + +void node::implicit_cast(ir::builder &builder, ir::value *&lhs, ir::value *&rhs, + bool &is_float, bool &is_ptr, bool &is_int, bool &is_signed){ + // Input types + ir::type *left_ty = lhs->get_type()->get_scalar_ty(); + ir::type *right_ty = rhs->get_type()->get_scalar_ty(); + // One operand is pointer + if(left_ty->is_pointer_ty() || right_ty->is_pointer_ty()){ + if(left_ty->is_pointer_ty() && right_ty->is_pointer_ty()) + throw std::runtime_error("invalid operands"); + if(right_ty->is_pointer_ty()) + std::swap(lhs, rhs); + is_ptr = true; + } + // One operand is double + else if(left_ty->is_double_ty() || right_ty->is_double_ty()){ + ir::value *&to_convert = left_ty->is_double_ty()?rhs:lhs; + to_convert = explicit_cast(builder, to_convert, builder.get_double_ty()); + is_float = true; + } + // One operand is float + else if(left_ty->is_float_ty() || right_ty->is_float_ty()){ + ir::value *&to_convert = left_ty->is_float_ty()?rhs:lhs; + to_convert = explicit_cast(builder, to_convert, builder.get_float_ty()); + is_float = true; + } + // Both operands are integers + else if(left_ty->is_integer_ty() && right_ty->is_integer_ty()){ + is_int = true; + is_signed = true; // always signed for now + if(left_ty->get_integer_bitwidth() != right_ty->get_integer_bitwidth()){ + ir::value *&to_convert = (left_ty->get_integer_bitwidth() > right_ty->get_integer_bitwidth())?rhs:lhs; + ir::type *dst_ty = (to_convert==lhs)?right_ty:left_ty; + to_convert = explicit_cast(builder, to_convert, dst_ty); + } + } + // Not reachable + else + throw std::runtime_error("unreachable"); +} + +void node::implicit_broadcast(ir::module *mod, ir::value *&lhs, ir::value *&rhs) { + ir::type *lhs_ty = lhs->get_type(); + ir::type *rhs_ty = rhs->get_type(); + ir::type *res_ty = nullptr; + if(!lhs_ty->is_tile_ty() && !rhs_ty->is_tile_ty()) + return; + else if(lhs_ty->is_tile_ty() && !rhs_ty->is_tile_ty()) + res_ty = lhs_ty; + else if(!lhs_ty->is_tile_ty() && rhs_ty->is_tile_ty()) + res_ty = rhs_ty; + else{ + auto lhs_shapes = lhs_ty->get_tile_shapes(); + auto rhs_shapes = rhs_ty->get_tile_shapes(); + size_t lhs_size = lhs_shapes.size(); + size_t rhs_size = rhs_shapes.size(); + size_t res_size = std::max(lhs_size, rhs_size); + ir::type::tile_shapes_t res_shapes(res_size); + ir::type::tile_shapes_t::value_type one = ir::tile_type::make_one(mod->get_context()); + for(int i = 0; i < res_size; 
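implicit_cast above is a cut-down version of C's usual arithmetic conversions, applied in a fixed order: a pointer operand wins (two pointers are rejected, and the pointer is swapped to the left), then double, then float, then the wider of two integer types; integers are, as the comment says, always treated as signed for now. So i16 + f32 converts the integer side to float, and i16 + i32 widens the i16 side. The rule as a standalone sketch over simple type tags (the Ty enum is illustrative):

    enum class Ty { Ptr, F64, F32, I32, I16 };

    // Sketch of implicit_cast's promotion order:
    // pointer > double > float > wider integer.
    Ty common_type(Ty a, Ty b) {
      if (a == Ty::Ptr || b == Ty::Ptr) return Ty::Ptr;
      if (a == Ty::F64 || b == Ty::F64) return Ty::F64;
      if (a == Ty::F32 || b == Ty::F32) return Ty::F32;
      return (a == Ty::I32 || b == Ty::I32) ? Ty::I32 : Ty::I16;
    }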
i++){ + if(i >= res_size - lhs_size && i >= res_size - rhs_size) + res_shapes[i] = lhs_shapes[i]==one?rhs_shapes[i]:lhs_shapes[i]; + else if(i >= res_size - lhs_size) + res_shapes[i] = lhs_shapes[i]; + else if(i >= res_size - rhs_size) + res_shapes[i] = rhs_shapes[i]; + } + res_ty = ir::tile_type::get(lhs_ty->get_scalar_ty(), res_shapes); + } + implicit_broadcast(mod, res_ty, rhs); + implicit_broadcast(mod, res_ty, lhs); +} + +void node::implicit_broadcast(ir::module *mod, ir::type *ty, ir::value *&src){ + ir::builder &builder = mod->get_builder(); + ir::type *src_ty = src->get_type(); + ir::type::tile_shapes_t::value_type one = ir::tile_type::make_one(mod->get_context()); + // Both are scalar + if(!ty->is_tile_ty() && !src_ty->is_tile_ty()) + return; + // Broadcast scalar + if(ty->is_tile_ty() && !src_ty->is_tile_ty()){ + src = builder.create_splat(src, ty->get_tile_shapes()); + return; + } + // Downcast tile + if(!ty->is_tile_ty() && src_ty->is_tile_ty()){ + for(ir::constant *shape: src_ty->get_tile_shapes()) + if(shape != one) + throw std::runtime_error("cannot downcast"); + src = builder.create_downcast(src); + return; + } + // Both are arrays + auto dst_shapes = ty->get_tile_shapes(); + auto src_shapes = src_ty->get_tile_shapes(); + int dst_dim = dst_shapes.size(); + int src_dim = src_shapes.size(); + // Pad + int off = dst_dim - src_dim; + for(size_t i = 0; i < off; i++) + src_shapes.insert(src_shapes.begin(), one); + if(off > 0) + src = builder.create_reshape(src, src_shapes); + // Broadcast + for(int i = dst_dim - 1; i>= 0; i--) + if(dst_shapes[i] != src_shapes[i] && dst_shapes[i] != one && src_shapes[i] != one) + throw std::runtime_error("cannot broadcast"); + if(dst_shapes != src_shapes) + src = builder.create_broadcast(src, dst_shapes); +} + +} + +} diff --git a/lib/ast/statement.cpp b/lib/ast/statement.cpp new file mode 100644 index 000000000..265dcbb19 --- /dev/null +++ b/lib/ast/statement.cpp @@ -0,0 +1,160 @@ +#include "triton/ast/expression.h" +#include "triton/ast/statement.h" +#include "triton/ast/declaration.h" +#include "triton/ir/constant.h" +#include "triton/ir/module.h" +#include "triton/ir/basic_block.h" +#include "triton/ir/builder.h" +#include "triton/ir/type.h" + +namespace triton{ + +namespace ast{ + +/* Helpers */ +inline bool is_terminator(ir::value* x) { + return x && dynamic_cast(x); +} + + +/* Statements */ +ir::value* compound_statement::codegen(ir::module* mod) const{ + mod->add_new_scope(); + if(items_) + items_->codegen(mod); + mod->pop_scope(); + return nullptr; +} + +/* Expression statement */ +ir::value* expression_statement::codegen(ir::module *mod) const{ + ir::builder &builder = mod->get_builder(); + ir::basic_block *block = builder.get_insert_block(); + if(pred_) { + // check that it is an assignment + assignment_expression *assignment = dynamic_cast(expr_); + assert(assignment); + // generate mask + ir::value *pred = pred_->codegen(mod); + ir::mask_inst *mask = (ir::mask_inst*)builder.create_mask(pred); + // generate expression + unsigned szbegin = block->get_inst_list().size(); + ir::value *expr = expr_->codegen(mod); + ir::basic_block::iterator begin = block->begin(); + std::advance(begin, szbegin); + // set mask + ir::type *ty = expr->get_type(); + for(auto it = begin; it != builder.get_insert_point(); it++) + (*it)->set_mask_pred(mask->get_result(0)); +// if(auto *itn = dynamic_cast(expr)) +// itn->set_mask_pred(mask->get_result(0)); + if(ty->is_void_ty()) + return expr; + // merge with psi + ir::psi_inst *psi = 
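Taken together, the two implicit_broadcast overloads implement numpy-style broadcasting over tile shapes: ranks are right-aligned, missing leading dimensions are padded with 1, and a dimension may only be stretched where one side is 1 (anything else throws "cannot broadcast"). For example, {1, 8} against {16, 1} yields {16, 8}, and a scalar against any tile is splatted to the tile's shape. A shape-only re-implementation of the rule:

    #include <algorithm>
    #include <stdexcept>
    #include <vector>

    // Sketch of node::implicit_broadcast on shapes alone: right-align
    // ranks, pad with 1, stretch 1s, reject real mismatches.
    std::vector<int> broadcast(std::vector<int> a, std::vector<int> b) {
      while (a.size() < b.size()) a.insert(a.begin(), 1);
      while (b.size() < a.size()) b.insert(b.begin(), 1);
      std::vector<int> out(a.size());
      for (size_t i = 0; i < a.size(); i++) {
        if (a[i] != b[i] && a[i] != 1 && b[i] != 1)
          throw std::runtime_error("cannot broadcast");
        out[i] = std::max(a[i], b[i]);
      }
      return out;
    }
    // broadcast({1, 8}, {16, 1}) == {16, 8}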
(ir::psi_inst*)builder.create_merge(mask->get_result(0), expr, + mask->get_result(1), ir::undef_value::get(ty)); + std::string name = ((named_expression*)assignment->lvalue())->id()->name(); + mod->set_value(name, psi); + return psi; + } + return expr_->codegen(mod); +} + +/* For statement */ +ir::value* iteration_statement::codegen(ir::module *mod) const{ + ir::builder &builder = mod->get_builder(); + ir::context &ctx = mod->get_context(); + ir::basic_block *current_bb = builder.get_insert_block(); + ir::function *fn = current_bb->get_parent(); + ir::basic_block *loop_bb = ir::basic_block::create(ctx, "loop", fn); + ir::basic_block *next_bb = ir::basic_block::create(ctx, "postloop", fn); + mod->set_continue_fn([&](){ + if(exec_) + exec_->codegen(mod); + ir::value *cond = explicit_cast(builder, stop_->codegen(mod), ir::type::get_int1_ty(ctx)); + return builder.create_cond_br(cond, loop_bb, next_bb); + }); + init_->codegen(mod); + ir::value *cond = explicit_cast(builder, stop_->codegen(mod), ir::type::get_int1_ty(ctx)); + builder.create_cond_br(cond, loop_bb, next_bb); +// builder.create_br(loop_bb); + builder.set_insert_point(loop_bb); + if(!is_terminator(statements_->codegen(mod))) + mod->get_continue_fn()(); + ir::basic_block *stop_bb = builder.get_insert_block(); + mod->seal_block(stop_bb); + mod->seal_block(loop_bb); + mod->seal_block(builder.get_insert_block()); + mod->seal_block(next_bb); + builder.set_insert_point(next_bb); + return nullptr; +} + +/* While statement */ +ir::value* while_statement::codegen(ir::module* mod) const{ + ir::builder &builder = mod->get_builder(); + ir::context &ctx = mod->get_context(); + ir::basic_block *current_bb = builder.get_insert_block(); + ir::function *fn = current_bb->get_parent(); + ir::basic_block *loop_bb = ir::basic_block::create(ctx, "loop", fn); + ir::basic_block *next_bb = ir::basic_block::create(ctx, "postloop", fn); + mod->set_continue_fn([&](){ + ir::value *cond = explicit_cast(builder, cond_->codegen(mod), ir::type::get_int1_ty(ctx)); + return builder.create_cond_br(cond, loop_bb, next_bb); + }); + ir::value *cond = explicit_cast(builder, cond_->codegen(mod), ir::type::get_int1_ty(ctx)); + builder.create_cond_br(cond, loop_bb, next_bb); + builder.set_insert_point(loop_bb); + if(!is_terminator(statements_->codegen(mod))) + mod->get_continue_fn()(); + ir::basic_block *stop_bb = builder.get_insert_block(); + mod->seal_block(stop_bb); + mod->seal_block(loop_bb); + mod->seal_block(builder.get_insert_block()); + mod->seal_block(next_bb); + builder.set_insert_point(next_bb); + return nullptr; +} + +/* Selection statement */ +ir::value* selection_statement::codegen(ir::module* mod) const{ + ir::builder &builder = mod->get_builder(); + ir::context &ctx = mod->get_context(); + ir::function *fn = builder.get_insert_block()->get_parent(); + ir::value *cond = cond_->codegen(mod); + ir::basic_block *then_bb = ir::basic_block::create(ctx, "then", fn); + ir::basic_block *else_bb = else_value_?ir::basic_block::create(ctx, "else", fn):nullptr; + ir::basic_block *endif_bb = ir::basic_block::create(ctx, "endif", fn); + mod->seal_block(then_bb); + if(else_value_) + mod->seal_block(else_bb); + + // Branch + if(else_value_) + builder.create_cond_br(cond, then_bb, else_bb); + else + builder.create_cond_br(cond, then_bb, endif_bb); + // Then + builder.set_insert_point(then_bb); + if(!is_terminator(then_value_->codegen(mod))) + builder.create_br(endif_bb); + // Else + if(else_value_){ + builder.set_insert_point(else_bb); + 
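Both loop forms above emit the same rotated loop: the condition is checked once up front, and the continue function re-evaluates it at the bottom of the body, so there is no separate header block to branch back through. In source-level terms the emitted CFG for `for(init; stop; exec) body` is equivalent to the sketch below (init/stop/exec/body are stand-ins for the generated code, not real functions):

    bool stop(); void init(); void exec(); void body();

    // Sketch: loop rotation as emitted by iteration_statement::codegen.
    void rotated_loop() {
      init();
      if (stop()) {        // preheader: cond_br -> "loop" / "postloop"
        do {
          body();          // the "loop" basic block
          exec();          // continue_fn: run the increment...
        } while (stop());  // ...re-test, branch back or fall through
      }
      // execution resumes in the "postloop" block
    }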
if(!is_terminator(else_value_->codegen(mod))) + builder.create_br(endif_bb); + } + // Endif + mod->seal_block(endif_bb); + builder.set_insert_point(endif_bb); + return nullptr; +} + +/* Continue statement */ +ir::value* continue_statement::codegen(ir::module *mod) const{ + return mod->get_continue_fn()(); +} + +} + +} From 383b5b2a2afb11f0100d6fd96031c0817c95d06f Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 28 May 2019 17:22:48 -0400 Subject: [PATCH 160/494] [triton/ast] renamed ast -> lang in namespace and file structure --- CMakeLists.txt | 4 +- include/triton/ast/error.h | 62 ---------------------- include/triton/ir/module.h | 4 +- include/triton/{ast => lang}/declaration.h | 10 ++-- include/triton/lang/error.h | 20 +++++++ include/triton/{ast => lang}/expression.h | 9 ++-- include/triton/{ast/ast.h => lang/lang.h} | 7 +-- include/triton/{ast => lang}/module.h | 13 ++--- include/triton/{ast => lang}/node.h | 13 ++--- include/triton/{ast => lang}/ops.h | 12 ++--- include/triton/{ast => lang}/parser.y | 32 ++++++----- include/triton/{ast => lang}/scanner.l | 6 +-- include/triton/{ast => lang}/statement.h | 14 ++--- lib/{ast => lang}/declaration.cpp | 6 +-- lib/{ast => lang}/error.cpp | 5 +- lib/{ast => lang}/expression.cpp | 6 +-- lib/{ast => lang}/module.cpp | 4 +- lib/{ast => lang}/node.cpp | 4 +- lib/{ast => lang}/statement.cpp | 8 +-- lib/runtime/jit.cpp | 7 ++- 20 files changed, 87 insertions(+), 159 deletions(-) delete mode 100644 include/triton/ast/error.h rename include/triton/{ast => lang}/declaration.h (96%) create mode 100644 include/triton/lang/error.h rename include/triton/{ast => lang}/expression.h (98%) rename include/triton/{ast/ast.h => lang/lang.h} (54%) rename include/triton/{ast => lang}/module.h (62%) rename include/triton/{ast => lang}/node.h (88%) rename include/triton/{ast => lang}/ops.h (79%) rename include/triton/{ast => lang}/parser.y (98%) rename include/triton/{ast => lang}/scanner.l (98%) rename include/triton/{ast => lang}/statement.h (90%) rename lib/{ast => lang}/declaration.cpp (98%) rename lib/{ast => lang}/error.cpp (93%) rename lib/{ast => lang}/expression.cpp (99%) rename lib/{ast => lang}/module.cpp (81%) rename lib/{ast => lang}/node.cpp (99%) rename lib/{ast => lang}/statement.cpp (97%) diff --git a/CMakeLists.txt b/CMakeLists.txt index d2e486afb..21805aa68 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,8 +6,8 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") # FLEX/YACC find_package(BISON) find_package(FLEX) -BISON_TARGET(Parser ${CMAKE_CURRENT_SOURCE_DIR}/include/triton/ast/parser.y ${CMAKE_CURRENT_BINARY_DIR}/parser.cpp) -FLEX_TARGET(Lexer ${CMAKE_CURRENT_SOURCE_DIR}/include/triton/ast/scanner.l ${CMAKE_CURRENT_BINARY_DIR}/scanner.cpp) +BISON_TARGET(Parser ${CMAKE_CURRENT_SOURCE_DIR}/include/triton/lang/parser.y ${CMAKE_CURRENT_BINARY_DIR}/parser.cpp) +FLEX_TARGET(Lexer ${CMAKE_CURRENT_SOURCE_DIR}/include/triton/lang/scanner.l ${CMAKE_CURRENT_BINARY_DIR}/scanner.cpp) get_filename_component(BISON_Parser_INCLUDE_DIRECTORIES ${BISON_Parser_OUTPUT_HEADER} DIRECTORY) include_directories(${BISON_Parser_INCLUDE_DIRECTORIES}) diff --git a/include/triton/ast/error.h b/include/triton/ast/error.h deleted file mode 100644 index 5834d55f6..000000000 --- a/include/triton/ast/error.h +++ /dev/null @@ -1,62 +0,0 @@ -#ifndef TRITON_INCLUDE_AST_ERROR_H -#define TRITON_INCLUDE_AST_ERROR_H - -#include "ops.h" -#include "parser.hpp" -#include "node.h" -#include -#include -#include -#include - - -namespace triton{ - - 
-namespace ir{ - class function; - class value; - class type; - class builder; - class module; -} - -namespace ast{ - -class expression; -class pointer; -class identifier; -class constant; -class compound_statement; -class initializer; -class declaration_specifier; -class function; - -/* Translation Unit */ -class translation_unit: public node{ -public: - translation_unit(node *item) - : decls_(item) { } - - translation_unit *add(node *item) { - decls_.append(item); - return this; - } - - ir::value* codegen(ir::module * mod) const; - -private: - list decls_; -}; - -void update_location(const char *t); -void print_error(const char *error); -char return_impl(char t, const char * yytext); -yytokentype return_impl(yytokentype t, const char * yytext); -void return_void(const char * yytext); - -} - -} - -#endif diff --git a/include/triton/ir/module.h b/include/triton/ir/module.h index 13d99d436..eddc49454 100644 --- a/include/triton/ir/module.h +++ b/include/triton/ir/module.h @@ -10,7 +10,7 @@ namespace triton{ -namespace ast{ +namespace lang{ class iteration_statement; class compound_statement; @@ -43,7 +43,7 @@ public: typedef std::map symbols_map_t; typedef std::vector functions_list_t; struct current_iteration_info_t{ - ast::iteration_statement *statement; + lang::iteration_statement *statement; basic_block *block; }; diff --git a/include/triton/ast/declaration.h b/include/triton/lang/declaration.h similarity index 96% rename from include/triton/ast/declaration.h rename to include/triton/lang/declaration.h index 5a51c3f9a..a7dbdb97e 100644 --- a/include/triton/ast/declaration.h +++ b/include/triton/lang/declaration.h @@ -1,12 +1,8 @@ -#ifndef TRITON_INCLUDE_AST_DECLARATION_H -#define TRITON_INCLUDE_AST_DECLARATION_H +#ifndef TRITON_INCLUDE_LANG_DECLARATION_H +#define TRITON_INCLUDE_LANG_DECLARATION_H #include "node.h" -#include "parser.hpp" -#include -#include #include -#include namespace triton{ @@ -20,7 +16,7 @@ namespace ir{ class module; } -namespace ast{ +namespace lang{ class expression; class pointer; diff --git a/include/triton/lang/error.h b/include/triton/lang/error.h new file mode 100644 index 000000000..70e70a387 --- /dev/null +++ b/include/triton/lang/error.h @@ -0,0 +1,20 @@ +#ifndef TRITON_INCLUDE_LANG_ERROR_H +#define TRITON_INCLUDE_LANG_ERROR_H + +#include "parser.hpp" + + +namespace triton{ +namespace lang{ + + +void update_location(const char *t); +void print_error(const char *error); +char return_impl(char t, const char * yytext); +yytokentype return_impl(yytokentype t, const char * yytext); +void return_void(const char * yytext); + +} +} + +#endif diff --git a/include/triton/ast/expression.h b/include/triton/lang/expression.h similarity index 98% rename from include/triton/ast/expression.h rename to include/triton/lang/expression.h index 27d72dec8..6ce0819cb 100644 --- a/include/triton/ast/expression.h +++ b/include/triton/lang/expression.h @@ -1,8 +1,7 @@ -#ifndef TDL_INCLUDE_AST_EXPRESSION_H -#define TDL_INCLUDE_AST_EXPRESSION_H +#ifndef TDL_INCLUDE_LANG_EXPRESSION_H +#define TDL_INCLUDE_LANG_EXPRESSION_H -#include "parser.hpp" -#include "ast.h" +#include "lang.h" #include #include #include @@ -20,7 +19,7 @@ namespace ir{ class module; } -namespace ast{ +namespace lang{ enum slice_enum_t{ diff --git a/include/triton/ast/ast.h b/include/triton/lang/lang.h similarity index 54% rename from include/triton/ast/ast.h rename to include/triton/lang/lang.h index 26282894e..ba1d1a2d8 100644 --- a/include/triton/ast/ast.h +++ b/include/triton/lang/lang.h @@ -1,12 +1,13 @@ 
-#ifndef TRITON_INCLUDE_AST_AST_H -#define TRITON_INCLUDE_AST_AST_H +#ifndef TRITON_INCLUDE_LANG_LANG_H +#define TRITON_INCLUDE_LANG_LANG_H -#include "ops.h" #include "parser.hpp" #include "declaration.h" #include "error.h" #include "expression.h" #include "node.h" #include "ops.h" +#include "module.h" +#include "statement.h" #endif diff --git a/include/triton/ast/module.h b/include/triton/lang/module.h similarity index 62% rename from include/triton/ast/module.h rename to include/triton/lang/module.h index 6d72753ce..7ac6c2960 100644 --- a/include/triton/ast/module.h +++ b/include/triton/lang/module.h @@ -1,17 +1,10 @@ -#ifndef TRITON_INCLUDE_AST_MODULE_H -#define TRITON_INCLUDE_AST_MODULE_H +#ifndef TRITON_INCLUDE_LANG_MODULE_H +#define TRITON_INCLUDE_LANG_MODULE_H -#include "ops.h" -#include "parser.hpp" #include "node.h" -#include -#include -#include -#include - namespace triton{ -namespace ast{ +namespace lang{ /* Translation Unit */ class translation_unit: public node{ diff --git a/include/triton/ast/node.h b/include/triton/lang/node.h similarity index 88% rename from include/triton/ast/node.h rename to include/triton/lang/node.h index 265443397..e689f6f16 100644 --- a/include/triton/ast/node.h +++ b/include/triton/lang/node.h @@ -1,13 +1,8 @@ -#ifndef TRITON_INCLUDE_AST_NODE_H -#define TRITON_INCLUDE_AST_NODE_H +#ifndef TRITON_INCLUDE_LANG_NODE_H +#define TRITON_INCLUDE_LANG_NODE_H -#include "ops.h" -#include "parser.hpp" -#include #include -#include -#include - +#include "ops.h" namespace triton{ @@ -20,7 +15,7 @@ namespace ir{ class module; } -namespace ast{ +namespace lang{ class expression; class pointer; diff --git a/include/triton/ast/ops.h b/include/triton/lang/ops.h similarity index 79% rename from include/triton/ast/ops.h rename to include/triton/lang/ops.h index 316fdccb3..9328be921 100644 --- a/include/triton/ast/ops.h +++ b/include/triton/lang/ops.h @@ -1,14 +1,8 @@ -#ifndef TRITON_INCLUDE_AST_OPS_H -#define TRITON_INCLUDE_AST_OPS_H - -#include "parser.hpp" -#include -#include -#include -#include +#ifndef TRITON_INCLUDE_LANG_OPS_H +#define TRITON_INCLUDE_LANG_OPS_H namespace triton{ -namespace ast{ +namespace lang{ enum ASSIGN_OP_T{ ASSIGN, diff --git a/include/triton/ast/parser.y b/include/triton/lang/parser.y similarity index 98% rename from include/triton/ast/parser.y rename to include/triton/lang/parser.y index c71f8a20e..66d7c1770 100644 --- a/include/triton/ast/parser.y +++ b/include/triton/lang/parser.y @@ -2,16 +2,13 @@ %{ namespace triton{ -namespace ast{ +namespace lang{ class node; } } -using namespace triton::ast; +using namespace triton::lang; #define YYSTYPE node* -#include "../include/triton/ast/ast.h" -#include "../include/triton/ast/expression.h" -#include "../include/triton/ast/statement.h" -#include "../include/triton/ast/declaration.h" +#include "../include/triton/lang/lang.h" extern char* yytext; void yyerror(const char *s); @@ -150,8 +147,8 @@ primary_expression_list /* Postfix */ slice - : ':' { $$ = new slice(triton::ast::ALL); } - | NEWAXIS { $$ = new slice(triton::ast::NEWAXIS); } + : ':' { $$ = new slice(triton::lang::ALL); } + | NEWAXIS { $$ = new slice(triton::lang::NEWAXIS); } slice_list : slice { $$ = new list((slice*)$1); } @@ -387,6 +384,15 @@ storage_class_specifier | CONSTANT_SPACE { $$ = new token(CONSTANT_SPACE_T); } ; +external_declaration + : function_definition { $$ = $1; } + | declaration { $$ = $1; } + ; + +function_definition + : declaration_specifiers declarator compound_statement { $$ = new function_definition($1, $2, $3); } + 
; + /* -------------------------- */ /* Translation Unit */ /* -------------------------- */ @@ -395,15 +401,7 @@ translation_unit : external_declaration { ast_root = new translation_unit($1); $$ = ast_root; } | translation_unit external_declaration { $$ = ((translation_unit*)($1))->add($2); } ; - -external_declaration - : function_definition { $$ = $1; } - | declaration { $$ = $1; } - ; - -function_definition - : declaration_specifiers declarator compound_statement { $$ = new function_definition($1, $2, $3); } - ; + %% void yyerror (const char *s){ diff --git a/include/triton/ast/scanner.l b/include/triton/lang/scanner.l similarity index 98% rename from include/triton/ast/scanner.l rename to include/triton/lang/scanner.l index 9a47f929f..b1160fb1c 100644 --- a/include/triton/ast/scanner.l +++ b/include/triton/lang/scanner.l @@ -8,9 +8,9 @@ IS (u|U|l|L)* %{ #include #include "parser.hpp" -#include "../include/triton/ast/ast.h" -using triton::ast::return_impl; -using triton::ast::return_void; +#include "../include/triton/lang/lang.h" +using triton::lang::return_impl; +using triton::lang::return_void; %} %% diff --git a/include/triton/ast/statement.h b/include/triton/lang/statement.h similarity index 90% rename from include/triton/ast/statement.h rename to include/triton/lang/statement.h index 575d70690..42b4140dc 100644 --- a/include/triton/ast/statement.h +++ b/include/triton/lang/statement.h @@ -1,13 +1,7 @@ -#ifndef TRITON_INCLUDE_AST_STATEMENT_H -#define TRITON_INCLUDE_AST_STATEMENT_H - -#include "parser.hpp" -#include "triton/ast/ast.h" -#include -#include -#include -#include +#ifndef TRITON_INCLUDE_LANG_STATEMENT_H +#define TRITON_INCLUDE_LANG_STATEMENT_H +#include "expression.h" namespace triton{ @@ -20,7 +14,7 @@ namespace ir{ class module; } -namespace ast{ +namespace lang{ class declaration; diff --git a/lib/ast/declaration.cpp b/lib/lang/declaration.cpp similarity index 98% rename from lib/ast/declaration.cpp rename to lib/lang/declaration.cpp index 888cdf7ff..d4a73ef00 100644 --- a/lib/ast/declaration.cpp +++ b/lib/lang/declaration.cpp @@ -1,5 +1,5 @@ -#include "triton/ast/statement.h" -#include "triton/ast/declaration.h" +#include "triton/lang/statement.h" +#include "triton/lang/declaration.h" #include "triton/ir/function.h" #include "triton/ir/module.h" #include "triton/ir/basic_block.h" @@ -9,7 +9,7 @@ namespace triton{ -namespace ast{ +namespace lang{ /* Declaration specifier */ ir::type* typed_declaration_specifier::type(ir::module *mod) const { diff --git a/lib/ast/error.cpp b/lib/lang/error.cpp similarity index 93% rename from lib/ast/error.cpp rename to lib/lang/error.cpp index 72c18277d..77076fba0 100644 --- a/lib/ast/error.cpp +++ b/lib/lang/error.cpp @@ -1,9 +1,10 @@ -#include "triton/ast/error.h" +#include +#include "triton/lang/error.h" namespace triton{ -namespace ast{ +namespace lang{ static int current_line = 0; static int current_column = 0; diff --git a/lib/ast/expression.cpp b/lib/lang/expression.cpp similarity index 99% rename from lib/ast/expression.cpp rename to lib/lang/expression.cpp index 7b6f43429..87f6a8194 100644 --- a/lib/ast/expression.cpp +++ b/lib/lang/expression.cpp @@ -1,5 +1,5 @@ -#include "triton/ast/expression.h" -#include "triton/ast/declaration.h" +#include "triton/lang/expression.h" +#include "triton/lang/declaration.h" #include "triton/ir/constant.h" #include "triton/ir/module.h" #include "triton/ir/builder.h" @@ -8,7 +8,7 @@ namespace triton{ -namespace ast{ +namespace lang{ /* Binary operator */ diff --git a/lib/ast/module.cpp 
b/lib/lang/module.cpp similarity index 81% rename from lib/ast/module.cpp rename to lib/lang/module.cpp index 32ae8b4c0..3455ca98f 100644 --- a/lib/ast/module.cpp +++ b/lib/lang/module.cpp @@ -1,10 +1,10 @@ -#include "triton/ast/module.h" +#include "triton/lang/module.h" #include "triton/ir/module.h" namespace triton{ -namespace ast{ +namespace lang{ /* Translation unit */ ir::value* translation_unit::codegen(ir::module *mod) const{ diff --git a/lib/ast/node.cpp b/lib/lang/node.cpp similarity index 99% rename from lib/ast/node.cpp rename to lib/lang/node.cpp index c13bf3db7..f25a5fdf5 100644 --- a/lib/ast/node.cpp +++ b/lib/lang/node.cpp @@ -1,11 +1,11 @@ -#include "triton/ast/node.h" +#include "triton/lang/node.h" #include "triton/ir/builder.h" #include "triton/ir/module.h" #include "triton/ir/constant.h" namespace triton{ -namespace ast{ +namespace lang{ /* node */ ir::value *node::explicit_cast(ir::builder &builder, ir::value *src, ir::type *dst_ty){ diff --git a/lib/ast/statement.cpp b/lib/lang/statement.cpp similarity index 97% rename from lib/ast/statement.cpp rename to lib/lang/statement.cpp index 265dcbb19..d67c62c8f 100644 --- a/lib/ast/statement.cpp +++ b/lib/lang/statement.cpp @@ -1,6 +1,6 @@ -#include "triton/ast/expression.h" -#include "triton/ast/statement.h" -#include "triton/ast/declaration.h" +#include "triton/lang/expression.h" +#include "triton/lang/statement.h" +#include "triton/lang/declaration.h" #include "triton/ir/constant.h" #include "triton/ir/module.h" #include "triton/ir/basic_block.h" @@ -9,7 +9,7 @@ namespace triton{ -namespace ast{ +namespace lang{ /* Helpers */ inline bool is_terminator(ir::value* x) { diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index bb8637648..e03d51c63 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -1,5 +1,5 @@ #include -#include "triton/ast/ast.h" +#include "triton/lang/lang.h" #include "triton/codegen/target.h" #include "triton/ir/context.h" #include "triton/ir/context_impl.h" @@ -24,8 +24,7 @@ typedef struct yy_buffer_state * YY_BUFFER_STATE; extern int yyparse(); extern YY_BUFFER_STATE yy_scan_string(const char * str); extern void yy_delete_buffer(YY_BUFFER_STATE buffer); -using triton::ast::translation_unit; -extern translation_unit *ast_root; +extern triton::lang::translation_unit *ast_root; namespace triton { @@ -84,7 +83,7 @@ std::unique_ptr jit::make_triton_module(const char *name, const char YY_BUFFER_STATE buffer = yy_scan_string(src); yyparse(); yy_delete_buffer(buffer); - translation_unit *program = ast_root; + triton::lang::translation_unit *program = ast_root; // create Triton-IR from AST ir::module* module = new ir::module(name, triton_context_); program->codegen(module); From 49fcfd6fc7c20fb385b1314ac457589c2334d06e Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 5 Jun 2019 11:09:41 -0700 Subject: [PATCH 161/494] [examples/tensorflow] fixed #include issue --- examples/python/tensorflow/CMakeLists.txt | 2 +- examples/python/tensorflow/dot.cpp | 2 +- include/triton/external/CUDA/cuda.h | 3314 +++++++++++++++++---- include/triton/external/CUDA/nvml.h | 801 ++++- 4 files changed, 3545 insertions(+), 574 deletions(-) diff --git a/examples/python/tensorflow/CMakeLists.txt b/examples/python/tensorflow/CMakeLists.txt index 008397c1b..1ce055203 100644 --- a/examples/python/tensorflow/CMakeLists.txt +++ b/examples/python/tensorflow/CMakeLists.txt @@ -5,6 +5,6 @@ if(${TensorFlow_FOUND}) include_directories("${CUDA_HOME}/include") link_directories(${TF_LIB}) 
add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) - add_library(tf_blocksparse SHARED blocksparse.cpp) + add_library(tf_blocksparse SHARED dot.cpp) target_link_libraries(tf_blocksparse tensorflow_framework triton) endif() diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index bc87c71e8..70ab8c386 100644 --- a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -3,7 +3,7 @@ #include "triton/driver/buffer.h" #include "triton/driver/backend.h" #include "triton/driver/stream.h" -#include "triton/jit.h" +#include "triton/runtime/jit.h" #define EIGEN_USE_GPU #include "tensorflow/core/framework/op.h" diff --git a/include/triton/external/CUDA/cuda.h b/include/triton/external/CUDA/cuda.h index 175b31703..24d96bd6c 100755 --- a/include/triton/external/CUDA/cuda.h +++ b/include/triton/external/CUDA/cuda.h @@ -1,5 +1,5 @@ /* - * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. * * NOTICE TO LICENSEE: * @@ -63,6 +63,16 @@ typedef uint64_t cuuint64_t; /** * CUDA API versioning support */ +#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) +#define __CUDA_DEPRECATED +#elif defined(_MSC_VER) +#define __CUDA_DEPRECATED __declspec(deprecated) +#elif defined(__GNUC__) +#define __CUDA_DEPRECATED __attribute__((deprecated)) +#else +#define __CUDA_DEPRECATED +#endif + #if defined(CUDA_FORCE_API_VERSION) #if (CUDA_FORCE_API_VERSION == 3010) #define __CUDA_API_VERSION 3010 @@ -70,7 +80,7 @@ typedef uint64_t cuuint64_t; #error "Unsupported value of CUDA_FORCE_API_VERSION" #endif #else - #define __CUDA_API_VERSION 9000 + #define __CUDA_API_VERSION 10000 #endif /* CUDA_FORCE_API_VERSION */ #if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) @@ -169,13 +179,18 @@ typedef uint64_t cuuint64_t; #define cuStreamGetPriority __CUDA_API_PTSZ(cuStreamGetPriority) #define cuStreamGetFlags __CUDA_API_PTSZ(cuStreamGetFlags) + #define cuStreamGetCtx __CUDA_API_PTSZ(cuStreamGetCtx) #define cuStreamWaitEvent __CUDA_API_PTSZ(cuStreamWaitEvent) + #define cuStreamBeginCapture __CUDA_API_PTSZ(cuStreamBeginCapture) + #define cuStreamEndCapture __CUDA_API_PTSZ(cuStreamEndCapture) + #define cuStreamIsCapturing __CUDA_API_PTSZ(cuStreamIsCapturing) #define cuStreamAddCallback __CUDA_API_PTSZ(cuStreamAddCallback) #define cuStreamAttachMemAsync __CUDA_API_PTSZ(cuStreamAttachMemAsync) #define cuStreamQuery __CUDA_API_PTSZ(cuStreamQuery) #define cuStreamSynchronize __CUDA_API_PTSZ(cuStreamSynchronize) #define cuEventRecord __CUDA_API_PTSZ(cuEventRecord) #define cuLaunchKernel __CUDA_API_PTSZ(cuLaunchKernel) + #define cuLaunchHostFunc __CUDA_API_PTSZ(cuLaunchHostFunc) #define cuGraphicsMapResources __CUDA_API_PTSZ(cuGraphicsMapResources) #define cuGraphicsUnmapResources __CUDA_API_PTSZ(cuGraphicsUnmapResources) @@ -187,6 +202,10 @@ typedef uint64_t cuuint64_t; #define cuLaunchCooperativeKernel __CUDA_API_PTSZ(cuLaunchCooperativeKernel) + #define cuSignalExternalSemaphoresAsync __CUDA_API_PTSZ(cuSignalExternalSemaphoresAsync) + #define cuWaitExternalSemaphoresAsync __CUDA_API_PTSZ(cuWaitExternalSemaphoresAsync) + + #define cuGraphLaunch __CUDA_API_PTSZ(cuGraphLaunch) #endif /** @@ -210,7 +229,7 @@ typedef uint64_t cuuint64_t; /** * CUDA API version number */ -#define CUDA_VERSION 9000 +#define CUDA_VERSION 10000 #ifdef __cplusplus extern "C" { @@ -219,7 +238,7 @@ extern "C" { /** * CUDA device pointer * CUdeviceptr is defined 
as an unsigned integer type whose size matches the size of a pointer on the target platform. - */ + */ #if __CUDA_API_VERSION >= 3020 #if defined(_WIN64) || defined(__LP64__) @@ -243,16 +262,23 @@ typedef struct CUstream_st *CUstream; /**< CUDA stream */ typedef struct CUgraphicsResource_st *CUgraphicsResource; /**< CUDA graphics interop resource */ typedef unsigned long long CUtexObject; /**< An opaque value that represents a CUDA texture object */ typedef unsigned long long CUsurfObject; /**< An opaque value that represents a CUDA surface object */ +typedef struct CUextMemory_st *CUexternalMemory; /**< CUDA external memory */ +typedef struct CUextSemaphore_st *CUexternalSemaphore; /**< CUDA external semaphore */ +typedef struct CUgraph_st *CUgraph; /**< CUDA graph */ +typedef struct CUgraphNode_st *CUgraphNode; /**< CUDA graph node */ +typedef struct CUgraphExec_st *CUgraphExec; /**< CUDA executable graph */ +#ifndef CU_UUID_HAS_BEEN_DEFINED +#define CU_UUID_HAS_BEEN_DEFINED typedef struct CUuuid_st { /**< CUDA definition of UUID */ char bytes[16]; } CUuuid; - +#endif #if __CUDA_API_VERSION >= 4010 /** - * CUDA IPC handle size + * CUDA IPC handle size */ #define CU_IPC_HANDLE_SIZE 64 @@ -299,7 +325,7 @@ typedef enum CUctx_flags_enum { CU_CTX_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling * \deprecated This flag was deprecated as of CUDA 4.0 * and was replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. */ - CU_CTX_SCHED_MASK = 0x07, + CU_CTX_SCHED_MASK = 0x07, CU_CTX_MAP_HOST = 0x08, /**< Support mapped pinned allocations */ CU_CTX_LMEM_RESIZE_TO_MAX = 0x10, /**< Keep local memory allocation after launch */ CU_CTX_FLAGS_MASK = 0x1f @@ -355,15 +381,16 @@ typedef enum CUstreamWaitValue_flags_enum { CU_STREAM_WAIT_VALUE_AND = 0x2, /**< Wait until (*addr & value) != 0. */ CU_STREAM_WAIT_VALUE_NOR = 0x3, /**< Wait until ~(*addr | value) != 0. Support for this operation can be queried with ::cuDeviceGetAttribute() and - ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR. Generally, this - requires compute capability 7.0 or greater. */ + ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR.*/ CU_STREAM_WAIT_VALUE_FLUSH = 1<<30 /**< Follow the wait operation with a flush of outstanding remote writes. This means that, if a remote write operation is guaranteed to have reached the device before the wait can be satisfied, that write is guaranteed to be visible to downstream device work. The device is permitted to reorder remote writes internally. For example, this flag would be required if two remote writes arrive in a defined order, the wait is satisfied by the - second write, and downstream work needs to observe the first write. */ + second write, and downstream work needs to observe the first write. 
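These wait flags drive the in-stream memory operations (cuStreamWaitValue32 and friends): the stream stalls until the stated comparison against a 32-bit word in memory holds, with no host involvement. A hedged sketch of the usual pairing of a wait with a write; it assumes the device reports support for stream mem-ops and that `flag` points at memory such operations may legally target:

    #include <cuda.h>

    // Enqueue: wait until *flag >= 1, then publish 2 after prior work.
    void wait_then_publish(CUstream stream, CUdeviceptr flag) {
      cuStreamWaitValue32(stream, flag, 1, CU_STREAM_WAIT_VALUE_GEQ);
      cuStreamWriteValue32(stream, flag, 2, CU_STREAM_WRITE_VALUE_DEFAULT);
    }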
+ Support for this operation is restricted to selected platforms and can be + queried with ::CU_DEVICE_ATTRIBUTE_CAN_USE_WAIT_VALUE_FLUSH.*/ } CUstreamWaitValue_flags; /** @@ -513,7 +540,7 @@ typedef enum CUdevice_attribute_enum { CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, /**< Size of L2 cache in bytes */ CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, /**< Maximum resident threads per multiprocessor */ CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, /**< Number of asynchronous engines */ - CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, /**< Device shares a unified address space with the host */ + CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, /**< Device shares a unified address space with the host */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, /**< Maximum 1D layered texture width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, /**< Maximum layers in a 1D layered texture */ CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44, /**< Deprecated, do not use. */ @@ -547,7 +574,7 @@ typedef enum CUdevice_attribute_enum { CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, /**< Maximum 2D linear texture pitch in bytes */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, /**< Maximum mipmapped 2D texture width */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74,/**< Maximum mipmapped 2D texture height */ - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, /**< Major compute capability version number */ + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, /**< Major compute capability version number */ CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, /**< Minor compute capability version number */ CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, /**< Maximum mipmapped 1D texture width */ CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, /**< Device supports stream priorities */ @@ -556,7 +583,7 @@ typedef enum CUdevice_attribute_enum { CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, /**< Maximum shared memory available per multiprocessor in bytes */ CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, /**< Maximum number of 32-bit registers available per multiprocessor */ CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, /**< Device can allocate managed memory on this system */ - CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, /**< Device is on a multi-GPU board */ + CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, /**< Device is on a multi-GPU board */ CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, /**< Unique id for a group of devices on the same multi-GPU board */ CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86, /**< Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/ CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87, /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */ @@ -570,6 +597,10 @@ typedef enum CUdevice_attribute_enum { CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95, /**< Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel */ CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96, /**< Device can participate in cooperative kernels launched via ::cuLaunchCooperativeKernelMultiDevice */ CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, /**< Maximum optin shared memory per block */ + CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98, /**< Both the ::CU_STREAM_WAIT_VALUE_FLUSH flag and the 
::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See \ref CUDA_MEMOP for additional details. */ + CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99, /**< Device supports host memory registration via ::cudaHostRegister. */ + CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, /**< Device accesses pageable memory via the host's page tables. */ + CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, /**< The host can directly access managed memory on the device without migration. */ CU_DEVICE_ATTRIBUTE_MAX } CUdevice_attribute; @@ -600,7 +631,8 @@ typedef enum CUpointer_attribute_enum { CU_POINTER_ATTRIBUTE_P2P_TOKENS = 5, /**< A pair of tokens for use with the nv-p2p.h Linux kernel interface */ CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6, /**< Synchronize every synchronous memory operation initiated on this region */ CU_POINTER_ATTRIBUTE_BUFFER_ID = 7, /**< A process-wide unique ID for an allocated memory region*/ - CU_POINTER_ATTRIBUTE_IS_MANAGED = 8 /**< Indicates if the pointer points to managed memory */ + CU_POINTER_ATTRIBUTE_IS_MANAGED = 8, /**< Indicates if the pointer points to managed memory */ + CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = 9 /**< A device ordinal of a device on which a pointer was allocated or registered */ } CUpointer_attribute; /** @@ -656,7 +688,7 @@ typedef enum CUfunction_attribute_enum { CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6, /** - * The attribute to indicate whether the function has been compiled with + * The attribute to indicate whether the function has been compiled with * user specified option "-Xptxas --dlcm=ca" set . */ CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7, @@ -669,8 +701,8 @@ typedef enum CUfunction_attribute_enum { CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8, /** - * On devices where the L1 cache and shared memory use the same hardware resources, - * this sets the shared memory carveout preference, in percent of the total resources. + * On devices where the L1 cache and shared memory use the same hardware resources, + * this sets the shared memory carveout preference, in percent of the total resources. * This is only a hint, and the driver can choose a different ratio if required to execute the function. 
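Every attribute in this enum is read back through the same entry point, cuDeviceGetAttribute. For instance, fetching the compute capability and the cooperative-launch bit for device 0 (error handling elided for brevity):

    #include <cuda.h>
    #include <cstdio>

    int main() {
      cuInit(0);
      CUdevice dev;
      cuDeviceGet(&dev, 0);
      int major = 0, minor = 0, coop = 0;
      cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
      cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
      cuDeviceGetAttribute(&coop, CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, dev);
      printf("sm_%d%d, cooperative launch: %s\n", major, minor, coop ? "yes" : "no");
      return 0;
    }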
*/ CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9, @@ -884,6 +916,37 @@ typedef enum CUjit_option_enum CU_JIT_NEW_SM3X_OPT, CU_JIT_FAST_COMPILE, + /** + * Array of device symbol names that will be relocated to the corresponing + * host addresses stored in ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES.\n + * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n + * When loding a device module, driver will relocate all encountered + * unresolved symbols to the host addresses.\n + * It is only allowed to register symbols that correspond to unresolved + * global variables.\n + * It is illegal to register the same device symbol at multiple addresses.\n + * Option type: const char **\n + * Applies to: dynamic linker only + */ + CU_JIT_GLOBAL_SYMBOL_NAMES, + + /** + * Array of host addresses that will be used to relocate corresponding + * device symbols stored in ::CU_JIT_GLOBAL_SYMBOL_NAMES.\n + * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n + * Option type: void **\n + * Applies to: dynamic linker only + */ + CU_JIT_GLOBAL_SYMBOL_ADDRESSES, + + /** + * Number of entries in ::CU_JIT_GLOBAL_SYMBOL_NAMES and + * ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES arrays.\n + * Option type: unsigned int\n + * Applies to: dynamic linker only + */ + CU_JIT_GLOBAL_SYMBOL_COUNT, + CU_JIT_NUM_OPTIONS } CUjit_option; @@ -905,7 +968,9 @@ typedef enum CUjit_target_enum CU_TARGET_COMPUTE_60 = 60, /**< Compute device class 6.0.*/ CU_TARGET_COMPUTE_61 = 61, /**< Compute device class 6.1.*/ CU_TARGET_COMPUTE_62 = 62, /**< Compute device class 6.2.*/ - CU_TARGET_COMPUTE_70 = 70 /**< Compute device class 7.0.*/ + CU_TARGET_COMPUTE_70 = 70, /**< Compute device class 7.0.*/ + + CU_TARGET_COMPUTE_75 = 75 /**< Compute device class 7.5.*/ } CUjit_target; /** @@ -920,7 +985,7 @@ typedef enum CUjit_fallback_enum } CUjit_fallback; /** - * Caching modes for dlcm + * Caching modes for dlcm */ typedef enum CUjit_cacheMode_enum { @@ -1012,6 +1077,7 @@ typedef enum CUlimit_enum { CU_LIMIT_MALLOC_HEAP_SIZE = 0x02, /**< GPU malloc heap size */ CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH = 0x03, /**< GPU device runtime launch synchronize depth */ CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04, /**< GPU device runtime pending launch count */ + CU_LIMIT_MAX_L2_FETCH_GRANULARITY = 0x05, /**< A value between 0 and 128 that indicates the maximum fetch granularity of L2 (in Bytes). 
This is a hint */ CU_LIMIT_MAX } CUlimit; @@ -1025,13 +1091,88 @@ typedef enum CUresourcetype_enum { CU_RESOURCE_TYPE_PITCH2D = 0x03 /**< Pitch 2D resource */ } CUresourcetype; +#ifdef _WIN32 +#define CUDA_CB __stdcall +#else +#define CUDA_CB +#endif + +#if __CUDA_API_VERSION >= 10000 + +/** + * CUDA host function + * \param userData Argument value passed to the function + */ +typedef void (CUDA_CB *CUhostFn)(void *userData); + +/** + * GPU kernel node parameters + */ +typedef struct CUDA_KERNEL_NODE_PARAMS_st { + CUfunction func; /**< Kernel to launch */ + unsigned int gridDimX; /**< Width of grid in blocks */ + unsigned int gridDimY; /**< Height of grid in blocks */ + unsigned int gridDimZ; /**< Depth of grid in blocks */ + unsigned int blockDimX; /**< X dimension of each thread block */ + unsigned int blockDimY; /**< Y dimension of each thread block */ + unsigned int blockDimZ; /**< Z dimension of each thread block */ + unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ + void **kernelParams; /**< Array of pointers to kernel parameters */ + void **extra; /**< Extra options */ +} CUDA_KERNEL_NODE_PARAMS; + +/** + * Memset node parameters + */ +typedef struct CUDA_MEMSET_NODE_PARAMS_st { + CUdeviceptr dst; /**< Destination device pointer */ + size_t pitch; /**< Pitch of destination device pointer. Unused if height is 1 */ + unsigned int value; /**< Value to be set */ + unsigned int elementSize; /**< Size of each element in bytes. Must be 1, 2, or 4. */ + size_t width; /**< Width in bytes, of the row */ + size_t height; /**< Number of rows */ +} CUDA_MEMSET_NODE_PARAMS; + +/** + * Host node parameters + */ +typedef struct CUDA_HOST_NODE_PARAMS_st { + CUhostFn fn; /**< The function to call when the node executes */ + void* userData; /**< Argument to pass to the function */ +} CUDA_HOST_NODE_PARAMS; + +/** + * Graph node types + */ +typedef enum CUgraphNodeType_enum { + CU_GRAPH_NODE_TYPE_KERNEL = 0, /**< GPU kernel node */ + CU_GRAPH_NODE_TYPE_MEMCPY = 1, /**< Memcpy node */ + CU_GRAPH_NODE_TYPE_MEMSET = 2, /**< Memset node */ + CU_GRAPH_NODE_TYPE_HOST = 3, /**< Host (executable) node */ + CU_GRAPH_NODE_TYPE_GRAPH = 4, /**< Node which executes an embedded graph */ + CU_GRAPH_NODE_TYPE_EMPTY = 5, /**< Empty (no-op) node */ + CU_GRAPH_NODE_TYPE_COUNT +} CUgraphNodeType; + +/** + * Possible stream capture statuses returned by ::cuStreamIsCapturing + */ +typedef enum CUstreamCaptureStatus_enum { + CU_STREAM_CAPTURE_STATUS_NONE = 0, /**< Stream is not capturing */ + CU_STREAM_CAPTURE_STATUS_ACTIVE = 1, /**< Stream is actively capturing */ + CU_STREAM_CAPTURE_STATUS_INVALIDATED = 2 /**< Stream is part of a capture sequence that + has been invalidated, but not terminated */ +} CUstreamCaptureStatus; + +#endif /* __CUDA_API_VERSION >= 10000 */ + /** * Error codes */ typedef enum cudaError_enum { /** * The API call returned with no errors. In the case of query calls, this - * can also mean that the operation being queried is complete (see + * also means that the operation being queried is complete (see * ::cuEventQuery() and ::cuStreamQuery()). */ CUDA_SUCCESS = 0, @@ -1191,7 +1332,7 @@ typedef enum cudaError_enum { /** * This indicates that the ::CUcontext passed to the API call can - * only be bound to a single CPU thread at a time but is already + * only be bound to a single CPU thread at a time but is already * bound to a CPU thread. 
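The graph types and the capture-status enum above work together: under the CUDA 10.0 API a stream is put into capture, launches issued to it are recorded rather than executed, and ending the capture yields a CUgraph that can be instantiated once and replayed cheaply. A sketch with 10.0-era signatures (later releases add a capture-mode argument to cuStreamBeginCapture; launch_work is a hypothetical callback, and error checking is elided):

    #include <cuda.h>
    #include <cstddef>

    CUresult capture_and_replay(CUstream stream,
                                void (*launch_work)(CUstream)) {
      CUgraph graph;
      CUgraphExec exec;
      cuStreamBeginCapture(stream);        // begin recording
      launch_work(stream);                 // recorded, not executed
      cuStreamEndCapture(stream, &graph);
      cuGraphInstantiate(&exec, graph, NULL, NULL, 0);
      return cuGraphLaunch(exec, stream);  // now the work actually runs
    }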
*/ CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216, @@ -1254,6 +1395,12 @@ typedef enum cudaError_enum { */ CUDA_ERROR_INVALID_HANDLE = 400, + /** + * This indicates that a resource required by the API call is not in a + * valid state to perform the requested operation. + */ + CUDA_ERROR_ILLEGAL_STATE = 401, + /** * This indicates that a named symbol was not found. Examples of symbols * are global/constant variable names, texture names, and surface names. @@ -1303,7 +1450,7 @@ typedef enum cudaError_enum { * mode. */ CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, - + /** * This error indicates that a call to ::cuCtxEnablePeerAccess() is * trying to re-enable peer access to a context which has already @@ -1312,9 +1459,9 @@ typedef enum cudaError_enum { CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704, /** - * This error indicates that ::cuCtxDisablePeerAccess() is - * trying to disable peer access which has not been enabled yet - * via ::cuCtxEnablePeerAccess(). + * This error indicates that ::cuCtxDisablePeerAccess() is + * trying to disable peer access which has not been enabled yet + * via ::cuCtxEnablePeerAccess(). */ CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705, @@ -1333,15 +1480,15 @@ typedef enum cudaError_enum { /** * A device-side assert triggered during kernel execution. The context - * cannot be used anymore, and must be destroyed. All existing device - * memory allocations from this context are invalid and must be + * cannot be used anymore, and must be destroyed. All existing device + * memory allocations from this context are invalid and must be * reconstructed if the program is to continue using CUDA. */ CUDA_ERROR_ASSERT = 710, /** * This error indicates that the hardware resources required to enable - * peer access have been exhausted for one or more of the devices + * peer access have been exhausted for one or more of the devices * passed to ::cuCtxEnablePeerAccess(). */ CUDA_ERROR_TOO_MANY_PEERS = 711, @@ -1433,6 +1580,61 @@ typedef enum cudaError_enum { */ CUDA_ERROR_NOT_SUPPORTED = 801, + /** + * This error indicates that the system is not yet ready to start any CUDA + * work. To continue using CUDA, verify the system configuration is in a + * valid state and all required driver daemons are actively running. + */ + CUDA_ERROR_SYSTEM_NOT_READY = 802, + + /** + * This error indicates that the operation is not permitted when + * the stream is capturing. + */ + CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900, + + /** + * This error indicates that the current capture sequence on the stream + * has been invalidated due to a previous error. + */ + CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901, + + /** + * This error indicates that the operation would have resulted in a merge + * of two independent capture sequences. + */ + CUDA_ERROR_STREAM_CAPTURE_MERGE = 902, + + /** + * This error indicates that the capture was not initiated in this stream. + */ + CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903, + + /** + * This error indicates that the capture sequence contains a fork that was + * not joined to the primary stream. + */ + CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904, + + /** + * This error indicates that a dependency would have been created which + * crosses the capture sequence boundary. Only implicit in-stream ordering + * dependencies are allowed to cross the boundary. + */ + CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905, + + /** + * This error indicates a disallowed implicit dependency on a current capture + * sequence from cudaStreamLegacy. 
+ */ + CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906, + + /** + * This error indicates that the operation is not permitted on an event which + * was last recorded in a capturing stream. + */ + CUDA_ERROR_CAPTURED_EVENT = 907, + /** * This indicates that an unknown internal error has occurred. */ @@ -1443,17 +1645,13 @@ typedef enum cudaError_enum { * P2P Attributes */ typedef enum CUdevice_P2PAttribute_enum { - CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01, /**< A relative value indicating the performance of the link between two devices */ - CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02, /**< P2P Access is enable */ - CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03 /**< Atomic operation over the link supported */ + CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01, /**< A relative value indicating the performance of the link between two devices */ + CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02, /**< P2P Access is enable */ + CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03, /**< Atomic operation over the link supported */ + CU_DEVICE_P2P_ATTRIBUTE_ARRAY_ACCESS_ACCESS_SUPPORTED = 0x04, /**< \deprecated use CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED instead */ + CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED = 0x04 /**< Accessing CUDA arrays over the link supported */ } CUdevice_P2PAttribute; -#ifdef _WIN32 -#define CUDA_CB __stdcall -#else -#define CUDA_CB -#endif - /** * CUDA stream callback * \param hStream The stream the callback was added to, as passed to ::cuStreamAddCallback. May be NULL. @@ -1689,7 +1887,7 @@ typedef struct CUDA_TEXTURE_DESC_st { CUfilter_mode mipmapFilterMode; /**< Mipmap filter mode */ float mipmapLevelBias; /**< Mipmap level bias */ float minMipmapLevelClamp; /**< Mipmap minimum level clamp */ - float maxMipmapLevelClamp; /**< Mipmap maximum level clamp */ + float maxMipmapLevelClamp; /**< Mipmap maximum level clamp */ float borderColor[4]; /**< Border Color */ int reserved[12]; } CUDA_TEXTURE_DESC; @@ -1782,6 +1980,244 @@ typedef struct CUDA_LAUNCH_PARAMS_st { #endif /* __CUDA_API_VERSION >= 9000 */ +#if __CUDA_API_VERSION >= 10000 + +/** + * External memory handle types + */ +typedef enum CUexternalMemoryHandleType_enum { + /** + * Handle is an opaque file descriptor + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1, + /** + * Handle is an opaque shared NT handle + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 = 2, + /** + * Handle is an opaque, globally shared handle + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, + /** + * Handle is a D3D12 heap object + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP = 4, + /** + * Handle is a D3D12 committed resource + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5 +} CUexternalMemoryHandleType; + +/** + * Indicates that the external memory object is a dedicated resource + */ +#define CUDA_EXTERNAL_MEMORY_DEDICATED 0x1 + +/** + * External memory handle descriptor + */ +typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { + /** + * Type of the handle + */ + CUexternalMemoryHandleType type; + union { + /** + * File descriptor referencing the memory object. Valid + * when type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD + */ + int fd; + /** + * Win32 handle referencing the semaphore object. 
Valid when + * type is one of the following: + * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 + * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT + * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP + * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE + * Exactly one of 'handle' and 'name' must be non-NULL. If + * type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT + * then 'name' must be NULL. + */ + struct { + /** + * Valid NT handle. Must be NULL if 'name' is non-NULL + */ + void *handle; + /** + * Name of a valid memory object. + * Must be NULL if 'handle' is non-NULL. + */ + const void *name; + } win32; + } handle; + /** + * Size of the memory allocation + */ + unsigned long long size; + /** + * Flags must either be zero or ::CUDA_EXTERNAL_MEMORY_DEDICATED + */ + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_MEMORY_HANDLE_DESC; + +/** + * External memory buffer descriptor + */ +typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st { + /** + * Offset into the memory object where the buffer's base is + */ + unsigned long long offset; + /** + * Size of the buffer + */ + unsigned long long size; + /** + * Flags reserved for future use. Must be zero. + */ + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_MEMORY_BUFFER_DESC; + +/** + * External memory mipmap descriptor + */ +typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st { + /** + * Offset into the memory object where the base level of the + * mipmap chain is. + */ + unsigned long long offset; + /** + * Format, dimension and type of base level of the mipmap chain + */ + CUDA_ARRAY3D_DESCRIPTOR arrayDesc; + /** + * Total number of levels in the mipmap chain + */ + unsigned int numLevels; + unsigned int reserved[16]; +} CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC; + +/** + * External semaphore handle types + */ +typedef enum CUexternalSemaphoreHandleType_enum { + /** + * Handle is an opaque file descriptor + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD = 1, + /** + * Handle is an opaque shared NT handle + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 = 2, + /** + * Handle is an opaque, globally shared handle + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, + /** + * Handle is a shared NT handle referencing a D3D12 fence object + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE = 4 +} CUexternalSemaphoreHandleType; + +/** + * External semaphore handle descriptor + */ +typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { + /** + * Type of the handle + */ + CUexternalSemaphoreHandleType type; + union { + /** + * File descriptor referencing the semaphore object. Valid + * when type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD + */ + int fd; + /** + * Win32 handle referencing the semaphore object. Valid when + * type is one of the following: + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE + * Exactly one of 'handle' and 'name' must be non-NULL. If + * type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT + * then 'name' must be NULL. + */ + struct { + /** + * Valid NT handle. Must be NULL if 'name' is non-NULL + */ + void *handle; + /** + * Name of a valid synchronization primitive. + * Must be NULL if 'handle' is non-NULL. + */ + const void *name; + } win32; + } handle; + /** + * Flags reserved for the future. Must be zero. 
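
A minimal sketch of how the memory descriptors above fit together, assuming an opaque POSIX fd and its size come from some exporting API, and that the CUDA 10 import entry points ::cuImportExternalMemory and ::cuExternalMemoryGetMappedBuffer are available (error checks omitted):

CUDA_EXTERNAL_MEMORY_HANDLE_DESC mdesc;
memset(&mdesc, 0, sizeof(mdesc));
mdesc.type      = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD;
mdesc.handle.fd = fd;                  /* fd: exported elsewhere (assumption) */
mdesc.size      = size;                /* total size of the foreign allocation */

CUexternalMemory extMem;
cuImportExternalMemory(&extMem, &mdesc);

CUDA_EXTERNAL_MEMORY_BUFFER_DESC bdesc;
memset(&bdesc, 0, sizeof(bdesc));
bdesc.offset = 0;
bdesc.size   = size;                   /* flags stays zero, as required above */

CUdeviceptr dptr;
cuExternalMemoryGetMappedBuffer(&dptr, extMem, &bdesc);
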
+ */ + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC; + +/** + * External semaphore signal parameters + */ +typedef struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st { + struct { + /** + * Parameters for fence objects + */ + struct { + /** + * Value of fence to be signaled + */ + unsigned long long value; + } fence; + unsigned int reserved[16]; + } params; + /** + * Flags reserved for the future. Must be zero. + */ + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS; + +/** + * External semaphore wait parameters + */ +typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st { + struct { + /** + * Parameters for fence objects + */ + struct { + /** + * Value of fence to be waited on + */ + unsigned long long value; + } fence; + unsigned int reserved[16]; + } params; + /** + * Flags reserved for the future. Must be zero. + */ + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS; + + +#endif /* __CUDA_API_VERSION >= 10000 */ + /** * If set, each kernel launched as part of ::cuLaunchCooperativeKernelMultiDevice only * waits for prior work in the stream corresponding to that GPU to complete before the @@ -1798,7 +2234,7 @@ typedef struct CUDA_LAUNCH_PARAMS_st { /** * If set, the CUDA array is a collection of layers, where each layer is either a 1D - * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number + * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number * of layers, not the depth of a 3D array. */ #define CUDA_ARRAY3D_LAYERED 0x01 @@ -1831,9 +2267,15 @@ typedef struct CUDA_LAUNCH_PARAMS_st { /** * This flag if set indicates that the CUDA * array is a DEPTH_TEXTURE. -*/ + */ #define CUDA_ARRAY3D_DEPTH_TEXTURE 0x10 +/** + * This flag indicates that the CUDA array may be bound as a color target + * in an external graphics API + */ +#define CUDA_ARRAY3D_COLOR_ATTACHMENT 0x20 + /** * Override the texref format with a format inferred from the array. * Flag for ::cuTexRefSetArray() @@ -2011,11 +2453,15 @@ CUresult CUDAAPI cuInit(unsigned int Flags); */ /** - * \brief Returns the CUDA driver version + * \brief Returns the latest CUDA version supported by driver * - * Returns in \p *driverVersion the version number of the installed CUDA - * driver. This function automatically returns ::CUDA_ERROR_INVALID_VALUE if - * the \p driverVersion argument is NULL. + * Returns in \p *driverVersion the version of CUDA supported by + * the driver. The version is returned as + * (1000 × major + 10 × minor). For example, CUDA 9.2 + * would be represented by 9020. + * + * This function automatically returns ::CUDA_ERROR_INVALID_VALUE if + * \p driverVersion is NULL. 
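
Decoding follows directly from the formula above; a quick sketch (error checking omitted):

int v = 0;
cuDriverGetVersion(&v);        /* e.g. 9020 for CUDA 9.2 */
int major = v / 1000;          /* 9020 / 1000 == 9 */
int minor = (v % 1000) / 10;   /* (9020 % 1000) / 10 == 2 */
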
 *
 * \param driverVersion - Returns the CUDA driver version
 *
@@ -2066,6 +2512,8 @@ CUresult CUDAAPI cuDriverGetVersion(int *driverVersion);
 * ::cuDeviceGetAttribute,
 * ::cuDeviceGetCount,
 * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGetLuid,
 * ::cuDeviceTotalMem
 */
CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal);
@@ -2090,6 +2538,8 @@ CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal);
 * \sa
 * ::cuDeviceGetAttribute,
 * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGetLuid,
 * ::cuDeviceGet,
 * ::cuDeviceTotalMem,
 * ::cudaGetDeviceCount
@@ -2118,6 +2568,8 @@ CUresult CUDAAPI cuDeviceGetCount(int *count);
 *
 * \sa
 * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGetLuid,
 * ::cuDeviceGetCount,
 * ::cuDeviceGet,
 * ::cuDeviceTotalMem,
@@ -2125,6 +2577,66 @@
 */
CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);

+#if __CUDA_API_VERSION >= 9020
+/**
+ * \brief Return a UUID for the device
+ *
+ * Returns 16 octets identifying the device \p dev in the structure
+ * pointed to by \p uuid.
+ *
+ * \param uuid - Returned UUID
+ * \param dev - Device to get identifier string for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetLuid,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem,
+ * ::cudaGetDeviceProperties
+ */
+CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev);
+#endif
+
+#if defined(_WIN32) && __CUDA_API_VERSION >= 10000
+/**
+ * \brief Return an LUID and device node mask for the device
+ *
+ * Return identifying information (\p luid and \p deviceNodeMask) to allow
+ * matching the device with graphics APIs.
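
For the UUID query just added, a sketch that prints the 16 octets (assumes ::cuInit has run and dev is a valid ::CUdevice; ::CUuuid exposes the octets through its bytes member):

CUuuid uuid;
cuDeviceGetUuid(&uuid, dev);
printf("GPU UUID: ");
for (int i = 0; i < 16; ++i)
    printf("%02x", (unsigned char)uuid.bytes[i]);
printf("\n");
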
+ * + * \param luid - Returned LUID + * \param deviceNodeMask - Returned device node mask + * \param dev - Device to get identifier string for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGet, + * ::cuDeviceTotalMem, + * ::cudaGetDeviceProperties + */ +CUresult CUDAAPI cuDeviceGetLuid(char *luid, unsigned int *deviceNodeMask, CUdevice dev); +#endif + #if __CUDA_API_VERSION >= 3020 /** * \brief Returns the total amount of memory on the device @@ -2148,6 +2660,7 @@ CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev); * ::cuDeviceGetAttribute, * ::cuDeviceGetCount, * ::cuDeviceGetName, + * ::cuDeviceGetUuid, * ::cuDeviceGet, * ::cudaMemGetInfo */ @@ -2175,15 +2688,15 @@ CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev); * - ::CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the * memory copy functions that involve memory regions allocated through * ::cuMemAllocPitch(); - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: Maximum 1D + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: Maximum 1D * texture width; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: Maximum width * for a 1D texture bound to linear memory; - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: Maximum + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: Maximum * mipmapped 1D texture width; - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: Maximum 2D + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: Maximum 2D * texture width; - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: Maximum 2D + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: Maximum 2D * texture height; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: Maximum width * for a 2D texture bound to linear memory; @@ -2191,40 +2704,40 @@ CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev); * for a 2D texture bound to linear memory; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: Maximum pitch * in bytes for a 2D texture bound to linear memory; - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: Maximum + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: Maximum * mipmapped 2D texture width; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT: Maximum * mipmapped 2D texture height; - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: Maximum 3D + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: Maximum 3D * texture width; - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: Maximum 3D + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: Maximum 3D * texture height; - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: Maximum 3D + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: Maximum 3D * texture depth; - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE: + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE: * Alternate maximum 3D texture width, 0 if no alternate * maximum 3D texture size is supported; - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE: + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE: * Alternate maximum 3D texture height, 0 if no alternate * maximum 3D texture size is supported; - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE: + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE: * Alternate 
maximum 3D texture depth, 0 if no alternate * maximum 3D texture size is supported; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH: * Maximum cubemap texture width or height; - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH: + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH: * Maximum 1D layered texture width; - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS: + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS: * Maximum layers in a 1D layered texture; - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH: + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH: * Maximum 2D layered texture width; - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT: + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT: * Maximum 2D layered texture height; - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS: + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS: * Maximum layers in a 2D layered texture; - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH: + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH: * Maximum cubemap layered texture width or height; - * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS: + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS: * Maximum layers in a cubemap layered texture; * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH: * Maximum 1D surface width; @@ -2297,13 +2810,13 @@ CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev); * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in bits; * - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the device doesn't have L2 cache; * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident threads per multiprocessor; - * - ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: 1 if the device shares a unified address space with + * - ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: 1 if the device shares a unified address space with * the host, or 0 if not; * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: Major compute capability version number; * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: Minor compute capability version number; - * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: 1 if device supports caching globals + * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: 1 if device supports caching globals * in L1 cache, 0 if caching globals in L1 cache is not supported by the device; - * - ::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: 1 if device supports caching locals + * - ::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: 1 if device supports caching locals * in L1 cache, 0 if caching locals in L1 cache is not supported by the device; * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: Maximum amount of * shared memory available to a multiprocessor in bytes; this amount is shared @@ -2330,6 +2843,9 @@ CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev); * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN: The maximum per block shared memory size * suported on this device. This is the maximum value that can be opted into when using the cuFuncSetAttribute() call. * For more details see ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES + * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES: Device accesses pageable memory via the host's + * page tables. 
+ * - ::CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST: The host can directly access managed memory on the device without migration. * * \param pi - Returned device attribute value * \param attrib - Device attribute to query @@ -2347,6 +2863,7 @@ CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev); * \sa * ::cuDeviceGetCount, * ::cuDeviceGetName, + * ::cuDeviceGetUuid, * ::cuDeviceGet, * ::cuDeviceTotalMem, * ::cudaDeviceGetAttribute, @@ -2426,10 +2943,11 @@ CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevi * ::cuDeviceGetAttribute, * ::cuDeviceGetCount, * ::cuDeviceGetName, + * ::cuDeviceGetUuid, * ::cuDeviceGet, * ::cuDeviceTotalMem */ -CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, CUdevice dev); +__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, CUdevice dev); /** * \brief Returns the compute capability of the device @@ -2437,7 +2955,7 @@ CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, CUdevice dev); * \deprecated * * This function was deprecated as of CUDA 5.0 and its functionality superceded - * by ::cuDeviceGetAttribute(). + * by ::cuDeviceGetAttribute(). * * Returns in \p *major and \p *minor the major and minor revision numbers that * define the compute capability of the device \p dev. @@ -2459,10 +2977,11 @@ CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, CUdevice dev); * ::cuDeviceGetAttribute, * ::cuDeviceGetCount, * ::cuDeviceGetName, + * ::cuDeviceGetUuid, * ::cuDeviceGet, * ::cuDeviceTotalMem */ -CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev); +__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev); /** @} */ /* END CUDA_DEVICE_DEPRECATED */ @@ -2492,9 +3011,9 @@ CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev) * Unlike ::cuCtxCreate() the newly created context is not pushed onto the stack. * * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of - * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute() - * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the compute mode - * of the device. + * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute() + * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the compute mode + * of the device. * The nvidia-smi tool can be used to set the compute mode for * devices. Documentation for nvidia-smi can be obtained by passing a * -h option to it. @@ -2718,8 +3237,8 @@ CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev); * Creates a new CUDA context and associates it with the calling thread. The * \p flags parameter is described below. The context is created with a usage * count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy() or - * when done using the context. If a context is already current to the thread, - * it is supplanted by the newly created context and may be restored by a subsequent + * when done using the context. If a context is already current to the thread, + * it is supplanted by the newly created context and may be restored by a subsequent * call to ::cuCtxPopCurrent(). * * The three LSBs of the \p flags parameter can be used to control how the OS @@ -2736,22 +3255,22 @@ CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev); * results from the GPU. This can increase latency when waiting for the GPU, * but can increase the performance of CPU threads performing work in parallel * with the GPU. 
- * + * * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a * synchronization primitive when waiting for the GPU to finish work. * * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a * synchronization primitive when waiting for the GPU to finish work.
* Deprecated: This flag was deprecated as of CUDA 4.0 and was - * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. + * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. * * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, * uses a heuristic based on the number of active CUDA contexts in the * process \e C and the number of logical processors in the system \e P. If - * \e C > \e P, then CUDA will yield to other OS threads when waiting for - * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while - * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). - * However, on low power devices like Tegra, it always defaults to + * \e C > \e P, then CUDA will yield to other OS threads when waiting for + * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while + * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). + * However, on low power devices like Tegra, it always defaults to * ::CU_CTX_SCHED_BLOCKING_SYNC. * * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations. @@ -2764,10 +3283,10 @@ CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev); * memory usage at the cost of potentially increased memory usage. * * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of - * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute() - * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the - * compute mode of the device. The nvidia-smi tool can be used to set - * the compute mode for * devices. + * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute() + * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the + * compute mode of the device. The nvidia-smi tool can be used to set + * the compute mode for * devices. * Documentation for nvidia-smi can be obtained by passing a * -h option to it. * @@ -2810,7 +3329,7 @@ CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); * It is the responsibility of the calling function to ensure that no API * call issues using \p ctx while ::cuCtxDestroy() is executing. * - * If \p ctx is current to the calling thread then \p ctx will also be + * If \p ctx is current to the calling thread then \p ctx will also be * popped from the current thread's context stack (as though ::cuCtxPopCurrent() * were called). If \p ctx is current to other threads, then \p ctx will * remain current to those threads, and attempting to access \p ctx from @@ -2879,8 +3398,8 @@ CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); /** * \brief Pops the current CUDA context from the current CPU thread. * - * Pops the current CUDA context from the CPU thread and passes back the - * old context handle in \p *pctx. That context may then be made current + * Pops the current CUDA context from the CPU thread and passes back the + * old context handle in \p *pctx. That context may then be made current * to a different CPU thread by calling ::cuCtxPushCurrent(). * * If a context was current to the CPU thread before ::cuCtxCreate() or @@ -2918,7 +3437,7 @@ CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx); * calling CPU thread is unbound and ::CUDA_SUCCESS is returned. * * If there exists a CUDA context stack on the calling CPU thread, this - * will replace the top of that stack with \p ctx. + * will replace the top of that stack with \p ctx. 
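
The stack discipline described by these context functions, as a minimal sketch (dev is a valid ::CUdevice; error checks omitted):

CUcontext ctx;
cuCtxCreate(&ctx, CU_CTX_SCHED_BLOCKING_SYNC, dev); /* created and made current */

CUcontext popped;
cuCtxPopCurrent(&popped);   /* popped == ctx; this thread now has no current context */

/* ... potentially on another CPU thread ... */
cuCtxPushCurrent(popped);   /* ctx becomes current to the pushing thread */
cuCtxSetCurrent(NULL);      /* equivalent to popping the top of the stack */
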
* If \p ctx is NULL then this will be equivalent to popping the top * of the calling CPU thread's CUDA context stack (or a no-op if the * calling CPU thread's CUDA context stack is empty). @@ -2953,6 +3472,7 @@ CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx); * ::CUDA_SUCCESS, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE * \notefnerr * * \sa @@ -3029,7 +3549,7 @@ CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags); * * Blocks until the device has completed all preceding requested tasks. * ::cuCtxSynchronize() returns an error if one of the preceding tasks failed. - * If the context was created with the ::CU_CTX_SCHED_BLOCKING_SYNC flag, the + * If the context was created with the ::CU_CTX_SCHED_BLOCKING_SYNC flag, the * CPU thread will block until the GPU context has finished its work. * * \return @@ -3082,25 +3602,25 @@ CUresult CUDAAPI cuCtxSynchronize(void); * * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH controls the maximum nesting depth of * a grid at which a thread can safely call ::cudaDeviceSynchronize(). Setting - * this limit must be performed before any launch of a kernel that uses the + * this limit must be performed before any launch of a kernel that uses the * device runtime and calls ::cudaDeviceSynchronize() above the default sync - * depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail - * with error code ::cudaErrorSyncDepthExceeded if the limitation is + * depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail + * with error code ::cudaErrorSyncDepthExceeded if the limitation is * violated. This limit can be set smaller than the default or up the maximum * launch depth of 24. When setting this limit, keep in mind that additional * levels of sync depth require the driver to reserve large amounts of device - * memory which can no longer be used for user allocations. If these - * reservations of device memory fail, ::cuCtxSetLimit will return + * memory which can no longer be used for user allocations. If these + * reservations of device memory fail, ::cuCtxSetLimit will return * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value. * This limit is only applicable to devices of compute capability 3.5 and * higher. Attempting to set this limit on devices of compute capability less - * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being + * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being * returned. * * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT controls the maximum number of * outstanding device runtime launches that can be made from the current * context. A grid is outstanding from the point of launch up until the grid - * is known to have been completed. Device runtime launches which violate + * is known to have been completed. Device runtime launches which violate * this limitation fail and return ::cudaErrorLaunchPendingCountExceeded when * ::cudaGetLastError() is called after launch. If more pending launches than * the default (2048 launches) are needed for a module using the device @@ -3121,7 +3641,8 @@ CUresult CUDAAPI cuCtxSynchronize(void); * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_VALUE, * ::CUDA_ERROR_UNSUPPORTED_LIMIT, - * ::CUDA_ERROR_OUT_OF_MEMORY + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_INVALID_CONTEXT * \notefnerr * * \sa ::cuCtxCreate, @@ -3279,20 +3800,20 @@ CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config); * \brief Returns the current shared memory configuration for the current context. 
 *
 * This function will return in \p pConfig the current size of shared memory banks
- * in the current context. On devices with configurable shared memory banks, 
- * ::cuCtxSetSharedMemConfig can be used to change this setting, so that all 
- * subsequent kernel launches will by default use the new bank size. When 
- * ::cuCtxGetSharedMemConfig is called on devices without configurable shared 
+ * in the current context. On devices with configurable shared memory banks,
+ * ::cuCtxSetSharedMemConfig can be used to change this setting, so that all
+ * subsequent kernel launches will by default use the new bank size. When
+ * ::cuCtxGetSharedMemConfig is called on devices without configurable shared
 * memory, it will return the fixed bank size of the hardware.
 *
 * The returned bank configurations can be either:
- * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: shared memory bank width is 
+ * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: shared memory bank width is
 * four bytes.
 * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: shared memory bank width is
 * eight bytes.
 *
 * \param pConfig - returned shared memory configuration
- * \return 
+ * \return
 * ::CUDA_SUCCESS,
 * ::CUDA_ERROR_DEINITIALIZED,
 * ::CUDA_ERROR_NOT_INITIALIZED,
@@ -3321,16 +3842,16 @@ CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig);
 * \brief Sets the shared memory configuration for the current context.
 *
 * On devices with configurable shared memory banks, this function will set
- * the context's shared memory bank size which is used for subsequent kernel 
- * launches. 
+ * the context's shared memory bank size which is used for subsequent kernel
+ * launches.
 *
 * Changing the shared memory configuration between launches may insert a device
 * side synchronization point between those launches.
 *
 * Changing the shared memory bank size will not increase shared memory usage
- * or affect occupancy of kernels, but may have major effects on performance. 
+ * or affect occupancy of kernels, but may have major effects on performance.
 * Larger bank sizes will allow for greater potential bandwidth to shared memory,
- * but will change what kinds of accesses to shared memory will result in bank 
+ * but will change what kinds of accesses to shared memory will result in bank
 * conflicts.
 *
 * This function will do nothing on devices with fixed shared memory bank size.
@@ -3392,6 +3913,7 @@ CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config);
 * ::CUDA_ERROR_DEINITIALIZED,
 * ::CUDA_ERROR_NOT_INITIALIZED,
 * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
 * ::CUDA_ERROR_UNKNOWN
 * \notefnerr
 *
@@ -3501,7 +4023,7 @@ CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority, int *greatestPr
 * ::cuCtxSetLimit,
 * ::cuCtxSynchronize
 */
-CUresult CUDAAPI cuCtxAttach(CUcontext *pctx, unsigned int flags);
+__CUDA_DEPRECATED CUresult CUDAAPI cuCtxAttach(CUcontext *pctx, unsigned int flags);

/**
 * \brief Decrement a context's usage-count
 *
@@ -3537,7 +4059,7 @@ CUresult CUDAAPI cuCtxAttach(CUcontext *pctx, unsigned int flags);
 * ::cuCtxSetLimit,
 * ::cuCtxSynchronize
 */
-CUresult CUDAAPI cuCtxDetach(CUcontext ctx);
+__CUDA_DEPRECATED CUresult CUDAAPI cuCtxDetach(CUcontext ctx);

/** @} */ /* END CUDA_CTX_DEPRECATED */

@@ -3642,7 +4164,7 @@ CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image);
 * as Windows \c FindResource() to obtain the pointer. Options are passed as
 * an array via \p options and any corresponding parameters are passed in
 * \p optionValues. 
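
A sketch of the option-passing convention for ::cuModuleLoadDataEx (the PTX image and kernel name are placeholders; error checks omitted):

const char *ptx = "...";                        /* NUL-terminated PTX (placeholder) */
static char jit_log[8192];
CUjit_option opt[] = { CU_JIT_ERROR_LOG_BUFFER,
                       CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES };
void *val[]        = { jit_log, (void *)(size_t)sizeof(jit_log) };

CUmodule mod;
cuModuleLoadDataEx(&mod, ptx, 2, opt, val);     /* outputs come back through val */

CUfunction fn;
cuModuleGetFunction(&fn, mod, "kernel");        /* "kernel" is an assumed name */
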
The number of total options is supplied via \p numOptions. - * Any outputs will be returned via \p optionValues. + * Any outputs will be returned via \p optionValues. * * \param module - Returned module * \param image - Module data to load @@ -4260,7 +4782,7 @@ CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdevic * Note all host memory allocated using ::cuMemHostAlloc() will automatically * be immediately accessible to all contexts on all devices which support unified * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). - * The device pointer that may be used to access this host memory from those + * The device pointer that may be used to access this host memory from those * contexts is always equal to the returned host pointer \p *pp. * See \ref CUDA_UNIFIED for additional details. * @@ -4369,8 +4891,8 @@ CUresult CUDAAPI cuMemFreeHost(void *p); * Note all host memory allocated using ::cuMemHostAlloc() will automatically * be immediately accessible to all contexts on all devices which support unified * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). - * Unless the flag ::CU_MEMHOSTALLOC_WRITECOMBINED is specified, the device pointer - * that may be used to access this host memory from those contexts is always equal + * Unless the flag ::CU_MEMHOSTALLOC_WRITECOMBINED is specified, the device pointer + * that may be used to access this host memory from those contexts is always equal * to the returned host pointer \p *pp. If the flag ::CU_MEMHOSTALLOC_WRITECOMBINED * is specified, then the function ::cuMemHostGetDevicePointer() must be used * to query the device pointer, even if the context supports unified addressing. @@ -4608,7 +5130,7 @@ CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize, unsigned * * \param dev - Returned device handle * - * \param pciBusId - String in one of the following forms: + * \param pciBusId - String in one of the following forms: * [domain]:[bus]:[device].[function] * [domain]:[bus]:[device] * [bus]:[device].[function] @@ -4665,35 +5187,38 @@ CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev); /** * \brief Gets an interprocess handle for a previously allocated event * - * Takes as input a previously allocated event. This event must have been - * created with the ::CU_EVENT_INTERPROCESS and ::CU_EVENT_DISABLE_TIMING + * Takes as input a previously allocated event. This event must have been + * created with the ::CU_EVENT_INTERPROCESS and ::CU_EVENT_DISABLE_TIMING * flags set. This opaque handle may be copied into other processes and * opened with ::cuIpcOpenEventHandle to allow efficient hardware * synchronization between GPU work in different processes. * - * After the event has been opened in the importing process, - * ::cuEventRecord, ::cuEventSynchronize, ::cuStreamWaitEvent and - * ::cuEventQuery may be used in either process. Performing operations - * on the imported event after the exported event has been freed + * After the event has been opened in the importing process, + * ::cuEventRecord, ::cuEventSynchronize, ::cuStreamWaitEvent and + * ::cuEventQuery may be used in either process. Performing operations + * on the imported event after the exported event has been freed * with ::cuEventDestroy will result in undefined behavior. * - * IPC functionality is restricted to devices with support for unified - * addressing on Linux operating systems. 
+ * IPC functionality is restricted to devices with support for unified + * addressing on Linux and Windows operating systems. + * IPC functionality on Windows is restricted to GPUs in TCC mode. + * IPC functionality is not supported on Tegra platforms. * * \param pHandle - Pointer to a user allocated CUipcEventHandle * in which to return the opaque event handle - * \param event - Event allocated with ::CU_EVENT_INTERPROCESS and + * \param event - Event allocated with ::CU_EVENT_INTERPROCESS and * ::CU_EVENT_DISABLE_TIMING flags. * * \return * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_OUT_OF_MEMORY, - * ::CUDA_ERROR_MAP_FAILED + * ::CUDA_ERROR_MAP_FAILED, + * ::CUDA_ERROR_INVALID_VALUE * - * \sa - * ::cuEventCreate, - * ::cuEventDestroy, + * \sa + * ::cuEventCreate, + * ::cuEventDestroy, * ::cuEventSynchronize, * ::cuEventQuery, * ::cuStreamWaitEvent, @@ -4708,16 +5233,18 @@ CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event); /** * \brief Opens an interprocess event handle for use in the current process * - * Opens an interprocess event handle exported from another process with - * ::cuIpcGetEventHandle. This function returns a ::CUevent that behaves like - * a locally created event with the ::CU_EVENT_DISABLE_TIMING flag specified. + * Opens an interprocess event handle exported from another process with + * ::cuIpcGetEventHandle. This function returns a ::CUevent that behaves like + * a locally created event with the ::CU_EVENT_DISABLE_TIMING flag specified. * This event must be freed with ::cuEventDestroy. * - * Performing operations on the imported event after the exported event has + * Performing operations on the imported event after the exported event has * been freed with ::cuEventDestroy will result in undefined behavior. * - * IPC functionality is restricted to devices with support for unified - * addressing on Linux operating systems. + * IPC functionality is restricted to devices with support for unified + * addressing on Linux and Windows operating systems. + * IPC functionality on Windows is restricted to GPUs in TCC mode. + * IPC functionality is not supported on Tegra platforms. * * \param phEvent - Returns the imported event * \param handle - Interprocess handle to open @@ -4727,11 +5254,12 @@ CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event); * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_MAP_FAILED, * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, - * ::CUDA_ERROR_INVALID_HANDLE + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE * * \sa - * ::cuEventCreate, - * ::cuEventDestroy, + * ::cuEventCreate, + * ::cuEventDestroy, * ::cuEventSynchronize, * ::cuEventQuery, * ::cuStreamWaitEvent, @@ -4747,29 +5275,32 @@ CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle) * \brief Gets an interprocess memory handle for an existing device memory * allocation * - * Takes a pointer to the base of an existing device memory allocation created - * with ::cuMemAlloc and exports it for use in another process. This is a + * Takes a pointer to the base of an existing device memory allocation created + * with ::cuMemAlloc and exports it for use in another process. This is a * lightweight operation and may be called multiple times on an allocation - * without adverse effects. + * without adverse effects. 
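
Taken together, the IPC memory calls look like this sketch (transport of the 64-byte handle between processes, e.g. over a pipe, is left out; nbytes is a placeholder; error checks omitted):

/* exporting process */
CUdeviceptr src;
cuMemAlloc(&src, nbytes);
CUipcMemHandle h;
cuIpcGetMemHandle(&h, src);
/* ... send the 64-byte handle h to the importing process ... */

/* importing process */
CUdeviceptr mapped;
cuIpcOpenMemHandle(&mapped, h, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
/* ... use mapped ... */
cuIpcCloseMemHandle(mapped);
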
* * If a region of memory is freed with ::cuMemFree and a subsequent call * to ::cuMemAlloc returns memory with the same device address, * ::cuIpcGetMemHandle will return a unique handle for the - * new memory. + * new memory. * - * IPC functionality is restricted to devices with support for unified - * addressing on Linux operating systems. + * IPC functionality is restricted to devices with support for unified + * addressing on Linux and Windows operating systems. + * IPC functionality on Windows is restricted to GPUs in TCC mode. + * IPC functionality is not supported on Tegra platforms. * * \param pHandle - Pointer to user allocated ::CUipcMemHandle to return * the handle in. - * \param dptr - Base pointer to previously allocated device memory + * \param dptr - Base pointer to previously allocated device memory * * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_HANDLE, * ::CUDA_ERROR_OUT_OF_MEMORY, * ::CUDA_ERROR_MAP_FAILED, - * + * ::CUDA_ERROR_INVALID_VALUE + * * \sa * ::cuMemAlloc, * ::cuMemFree, @@ -4786,14 +5317,14 @@ CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr); * and returns a device pointer usable in the local process. * * Maps memory exported from another process with ::cuIpcGetMemHandle into - * the current device address space. For contexts on different devices + * the current device address space. For contexts on different devices * ::cuIpcOpenMemHandle can attempt to enable peer access between the - * devices as if the user called ::cuCtxEnablePeerAccess. This behavior is - * controlled by the ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS flag. + * devices as if the user called ::cuCtxEnablePeerAccess. This behavior is + * controlled by the ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS flag. * ::cuDeviceCanAccessPeer can determine if a mapping is possible. * * Contexts that may open ::CUipcMemHandles are restricted in the following way. - * ::CUipcMemHandles from each ::CUdevice in a given process may only be opened + * ::CUipcMemHandles from each ::CUdevice in a given process may only be opened * by one ::CUcontext per ::CUdevice per other process. * * Memory returned from ::cuIpcOpenMemHandle must be freed with @@ -4803,9 +5334,11 @@ CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr); * ::cuIpcCloseMemHandle in the importing context will result in undefined * behavior. * - * IPC functionality is restricted to devices with support for unified - * addressing on Linux operating systems. - * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux and Windows operating systems. + * IPC functionality on Windows is restricted to GPUs in TCC mode. + * IPC functionality is not supported on Tegra platforms. + * * \param pdptr - Returned device pointer * \param handle - ::CUipcMemHandle to open * \param Flags - Flags for this operation. Must be specified as ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS @@ -4815,9 +5348,10 @@ CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr); * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_MAP_FAILED, * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_TOO_MANY_PEERS + * ::CUDA_ERROR_TOO_MANY_PEERS, + * ::CUDA_ERROR_INVALID_VALUE * - * \note No guarantees are made about the address returned in \p *pdptr. + * \note No guarantees are made about the address returned in \p *pdptr. * In particular, multiple processes may not receive the same address for the same \p handle. 
* * \sa @@ -4835,7 +5369,7 @@ CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, u /** * \brief Close memory mapped with ::cuIpcOpenMemHandle - * + * * Unmaps memory returnd by ::cuIpcOpenMemHandle. The original allocation * in the exporting process as well as imported mappings in other processes * will be unaffected. @@ -4843,17 +5377,19 @@ CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, u * Any resources used to enable peer access will be freed if this is the * last mapping using them. * - * IPC functionality is restricted to devices with support for unified - * addressing on Linux operating systems. + * IPC functionality is restricted to devices with support for unified + * addressing on Linux and Windows operating systems. + * IPC functionality on Windows is restricted to GPUs in TCC mode. + * IPC functionality is not supported on Tegra platforms. * * \param dptr - Device pointer returned by ::cuIpcOpenMemHandle - * + * * \returns * ::CUDA_SUCCESS, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_MAP_FAILED, * ::CUDA_ERROR_INVALID_HANDLE, - * + * ::CUDA_ERROR_INVALID_VALUE * \sa * ::cuMemAlloc, * ::cuMemFree, @@ -4874,8 +5410,8 @@ CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr); * Page-locks the memory range specified by \p p and \p bytesize and maps it * for the device(s) as specified by \p Flags. This memory range also is added * to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate - * calls to functions such as ::cuMemcpyHtoD(). Since the memory can be accessed - * directly by the device, it can be read or written with much higher bandwidth + * calls to functions such as ::cuMemcpyHtoD(). Since the memory can be accessed + * directly by the device, it can be read or written with much higher bandwidth * than pageable memory that has not been registered. Page-locking excessive * amounts of memory may degrade system performance, since it reduces the amount * of memory available to the system for paging. As a result, this function is @@ -4884,6 +5420,9 @@ CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr); * * This function has limited support on Mac OS X. OS 10.7 or higher is required. * + * This function is supported only on I/O coherent devices that have a non-zero value + * for the device attribute ::CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED. + * * The \p Flags parameter enables different options to be specified that * affect the allocation, as follows. * @@ -4924,7 +5463,7 @@ CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr); * on devices that have a non-zero value for the device attribute. Note however that * such devices should access the memory using only of the two pointers and not both. * - * The memory page-locked by this function must be unregistered with + * The memory page-locked by this function must be unregistered with * ::cuMemHostUnregister(). * * \param p - Host pointer to memory to page-lock @@ -4980,10 +5519,10 @@ CUresult CUDAAPI cuMemHostUnregister(void *p); /** * \brief Copies memory * - * Copies data between two pointers. - * \p dst and \p src are base pointers of the destination and source, respectively. + * Copies data between two pointers. + * \p dst and \p src are base pointers of the destination and source, respectively. * \p ByteCount specifies the number of bytes to copy. 
- * Note that this function infers the type of the transfer (host to host, host to + * Note that this function infers the type of the transfer (host to host, host to * device, device to device, or device to host) from the pointer values. This * function is only allowed in contexts which support unified addressing. * @@ -5020,9 +5559,9 @@ CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); * \brief Copies device memory between two contexts * * Copies from device memory in one context to device memory in another - * context. \p dstDevice is the base device pointer of the destination memory - * and \p dstContext is the destination context. \p srcDevice is the base - * device pointer of the source memory and \p srcContext is the source pointer. + * context. \p dstDevice is the base device pointer of the destination memory + * and \p dstContext is the destination context. \p srcDevice is the base + * device pointer of the source memory and \p srcContext is the source pointer. * \p ByteCount specifies the number of bytes to copy. * * \param dstDevice - Destination device pointer @@ -5382,9 +5921,9 @@ CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArr * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::srcArray is ignored. - * This value may be used only if unified addressing is supported in the calling + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling * context. * * \par @@ -5409,9 +5948,9 @@ CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArr * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::dstArray is ignored. - * This value may be used only if unified addressing is supported in the calling + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::dstArray is ignored. + * This value may be used only if unified addressing is supported in the calling * context. * * \par @@ -5544,9 +6083,9 @@ CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy); * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::srcArray is ignored. - * This value may be used only if unified addressing is supported in the calling + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling * context. * * \par @@ -5566,9 +6105,9 @@ CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy); * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::dstArray is ignored. - * This value may be used only if unified addressing is supported in the calling + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. 
::dstArray is ignored. + * This value may be used only if unified addressing is supported in the calling * context. * * \par @@ -5714,9 +6253,9 @@ CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy); * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::srcArray is ignored. - * This value may be used only if unified addressing is supported in the calling + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling * context. * * \par @@ -5738,9 +6277,9 @@ CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy); * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::dstArray is ignored. - * This value may be used only if unified addressing is supported in the calling + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::dstArray is ignored. + * This value may be used only if unified addressing is supported in the calling * context. * * \par @@ -5865,10 +6404,10 @@ CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy); /** * \brief Copies memory asynchronously * - * Copies data between two pointers. - * \p dst and \p src are base pointers of the destination and source, respectively. + * Copies data between two pointers. + * \p dst and \p src are base pointers of the destination and source, respectively. * \p ByteCount specifies the number of bytes to copy. - * Note that this function infers the type of the transfer (host to host, host to + * Note that this function infers the type of the transfer (host to host, host to * device, device to device, or device to host) from the pointer values. This * function is only allowed in contexts which support unified addressing. * @@ -5882,7 +6421,8 @@ CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy); * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * \note_async * \note_null_stream @@ -5909,9 +6449,9 @@ CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCoun * \brief Copies device memory between two contexts asynchronously. * * Copies from device memory in one context to device memory in another - * context. \p dstDevice is the base device pointer of the destination memory - * and \p dstContext is the destination context. \p srcDevice is the base - * device pointer of the source memory and \p srcContext is the source pointer. + * context. \p dstDevice is the base device pointer of the destination memory + * and \p dstContext is the destination context. \p srcDevice is the base + * device pointer of the source memory and \p srcContext is the source pointer. * \p ByteCount specifies the number of bytes to copy. 
* * \param dstDevice - Destination device pointer @@ -5926,12 +6466,13 @@ CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCoun * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * \note_async * \note_null_stream * - * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, + * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, * ::cuMemcpy3DPeerAsync, * ::cudaMemcpyPeerAsync */ @@ -5956,7 +6497,8 @@ CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * \note_async * \note_null_stream @@ -5995,7 +6537,8 @@ CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, s * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * \note_async * \note_null_stream @@ -6034,7 +6577,8 @@ CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * \note_async * \note_null_stream @@ -6076,7 +6620,8 @@ CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * \note_async * \note_null_stream @@ -6116,7 +6661,8 @@ CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const voi * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * \note_async * \note_null_stream @@ -6181,9 +6727,9 @@ CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOf * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::srcArray is ignored. - * This value may be used only if unified addressing is supported in the calling + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling * context. * * \par @@ -6198,9 +6744,9 @@ CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOf * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::dstArray is ignored. - * This value may be used only if unified addressing is supported in the calling + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::dstArray is ignored. 
+ * This value may be used only if unified addressing is supported in the calling * context. * * \par @@ -6282,7 +6828,8 @@ CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOf * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * \note_async * \note_null_stream @@ -6352,9 +6899,9 @@ CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream); * * \par * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::srcArray is ignored. - * This value may be used only if unified addressing is supported in the calling + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling * context. * * \par @@ -6376,9 +6923,9 @@ CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream); * * \par * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch - * specify the (unified virtual address space) base address of the source data - * and the bytes per row to apply. ::dstArray is ignored. - * This value may be used only if unified addressing is supported in the calling + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::dstArray is ignored. + * This value may be used only if unified addressing is supported in the calling * context. * * \par @@ -6457,7 +7004,8 @@ CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream); * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * \note_async * \note_null_stream @@ -6892,7 +7440,7 @@ CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsig * * Sets the 2D memory range of \p Width 16-bit values to the specified value * \p us. \p Height specifies the number of rows to set, and \p dstPitch - * specifies the number of bytes between each row. The \p dstDevice pointer + * specifies the number of bytes between each row. The \p dstDevice pointer * and \p dstPitch offset must be two byte aligned. This function performs * fastest when the pitch is one that has been passed back by * ::cuMemAllocPitch(). @@ -7126,7 +7674,8 @@ CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, C * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_ARRAY_IS_MAPPED + * ::CUDA_ERROR_ARRAY_IS_MAPPED, + * ::CUDA_ERROR_CONTEXT_IS_DESTROYED * \notefnerr * * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, @@ -7168,22 +7717,22 @@ CUresult CUDAAPI cuArrayDestroy(CUarray hArray); * - A 1D array is allocated if \p Height and \p Depth extents are both zero. * - A 2D array is allocated if only \p Depth extent is zero. * - A 3D array is allocated if all three extents are non-zero. - * - A 1D layered CUDA array is allocated if only \p Height is zero and the - * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. 
The number + * - A 1D layered CUDA array is allocated if only \p Height is zero and the + * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number * of layers is determined by the depth extent. - * - A 2D layered CUDA array is allocated if all three extents are non-zero and - * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number + * - A 2D layered CUDA array is allocated if all three extents are non-zero and + * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number * of layers is determined by the depth extent. * - A cubemap CUDA array is allocated if all three extents are non-zero and the - * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and - * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, - * where the six layers represent the six faces of a cube. The order of the six + * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and + * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, + * where the six layers represent the six faces of a cube. The order of the six * layers in memory is the same as that listed in ::CUarray_cubemap_face. - * - A cubemap layered CUDA array is allocated if all three extents are non-zero, - * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. - * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. - * A cubemap layered CUDA array is a special type of 2D layered CUDA array that - * consists of a collection of cubemaps. The first six layers represent the first + * - A cubemap layered CUDA array is allocated if all three extents are non-zero, + * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. + * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. + * A cubemap layered CUDA array is a special type of 2D layered CUDA array that + * consists of a collection of cubemaps. The first six layers represent the first * cubemap, the next six layers form the second cubemap, and so on. * * - ::Format specifies the format of the elements; ::CUarray_format is @@ -7204,11 +7753,11 @@ CUresult CUDAAPI cuArrayDestroy(CUarray hArray); * - \p NumChannels specifies the number of packed components per CUDA array * element; it may be 1, 2, or 4; * - * - ::Flags may be set to - * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA arrays. If this flag is set, + * - ::Flags may be set to + * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA arrays. If this flag is set, * \p Depth specifies the number of layers, not the depth of a 3D array. - * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to the CUDA array. - * If this flag is not set, ::cuSurfRefSetArray will fail when attempting to bind the CUDA array + * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to the CUDA array. + * If this flag is not set, ::cuSurfRefSetArray will fail when attempting to bind the CUDA array * to a surface reference. * - ::CUDA_ARRAY3D_CUBEMAP to enable creation of cubemaps. If this flag is set, \p Width must be * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set, @@ -7216,20 +7765,20 @@ CUresult CUDAAPI cuArrayDestroy(CUarray hArray); * - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA array will be used for texture gather. * Texture gather can only be performed on 2D CUDA arrays. 
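
Putting the descriptor rules above together, a sketch that allocates a 2D layered array of single-channel float (error checks omitted):

CUDA_ARRAY3D_DESCRIPTOR d;
memset(&d, 0, sizeof(d));
d.Width       = 1024;
d.Height      = 1024;
d.Depth       = 8;                    /* layer count, because of the flag below */
d.Format      = CU_AD_FORMAT_FLOAT;
d.NumChannels = 1;
d.Flags       = CUDA_ARRAY3D_LAYERED;

CUarray arr;
cuArray3DCreate(&arr, &d);
/* ... */
cuArrayDestroy(arr);
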
* - * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table. - * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute - * is not specified. For ex., TEXTURE1D_WIDTH refers to the device attribute + * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table. + * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute + * is not specified. For ex., TEXTURE1D_WIDTH refers to the device attribute * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH. * - * Note that 2D CUDA arrays have different size requirements if the ::CUDA_ARRAY3D_TEXTURE_GATHER flag - * is set. \p Width and \p Height must not be greater than ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH + * Note that 2D CUDA arrays have different size requirements if the ::CUDA_ARRAY3D_TEXTURE_GATHER flag + * is set. \p Width and \p Height must not be greater than ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH * and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT respectively, in that case. * * * - * - * * * @@ -7239,28 +7788,28 @@ CUresult CUDAAPI cuArrayDestroy(CUarray hArray); * * * - * * - * - * * - * - * * * - * * - * - * *
 * <table>
 * <tr><td><b>CUDA array type</b></td>
 * <td><b>Valid extents that must always be met<br>
 * {(width range in elements), (height range), (depth range)}</b></td>
 * <td><b>Valid extents with CUDA_ARRAY3D_SURFACE_LDST set<br>
 * {(width range in elements), (height range), (depth range)}</b></td></tr>
 * <tr><td>1D</td>
 * <td><small>{ (1,TEXTURE1D_WIDTH), 0, 0 }</small></td>
 * <td><small>{ (1,SURFACE1D_WIDTH), 0, 0 }</small></td></tr>
 * <tr><td>2D</td>
 * <td><small>{ (1,TEXTURE2D_WIDTH), (1,TEXTURE2D_HEIGHT), 0 }</small></td>
 * <td><small>{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }</small></td></tr>
 * <tr><td>3D</td>
 * <td><small>{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
 * <br>OR<br>
 * { (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE),
 * (1,TEXTURE3D_DEPTH_ALTERNATE) }</small></td>
 * <td><small>{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT),
 * (1,SURFACE3D_DEPTH) }</small></td></tr>
 * <tr><td>1D Layered</td>
 * <td><small>{ (1,TEXTURE1D_LAYERED_WIDTH), 0,
 * (1,TEXTURE1D_LAYERED_LAYERS) }</small></td>
 * <td><small>{ (1,SURFACE1D_LAYERED_WIDTH), 0,
 * (1,SURFACE1D_LAYERED_LAYERS) }</small></td></tr>
 * <tr><td>2D Layered</td>
 * <td><small>{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT),
 * (1,TEXTURE2D_LAYERED_LAYERS) }</small></td>
 * <td><small>{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT),
 * (1,SURFACE2D_LAYERED_LAYERS) }</small></td></tr>
 * <tr><td>Cubemap</td>
 * <td><small>{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }</small></td>
 * <td><small>{ (1,SURFACECUBEMAP_WIDTH),
 * (1,SURFACECUBEMAP_WIDTH), 6 }</small></td></tr>
 * <tr><td>Cubemap Layered</td>
 * <td><small>{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH),
 * (1,TEXTURECUBEMAP_LAYERED_LAYERS) }</small></td>
 * <td><small>{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH),
 * (1,SURFACECUBEMAP_LAYERED_LAYERS) }</small></td></tr>
 * </table>
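To make the extent and flag rules above concrete, here is a minimal sketch of the allocation flow; the sizes and the helper name alloc_layered_2d are illustrative assumptions, and only cuArray3DCreate and the CUDA_ARRAY3D_DESCRIPTOR fields come from the header itself:

#include <string.h>
#include <cuda.h>

/* Sketch: a 2D layered array of 64 layers, each 512x512 single-channel
 * float. Per the table above this must satisfy the TEXTURE2D_LAYERED_*
 * limits; Depth carries the layer count because CUDA_ARRAY3D_LAYERED
 * is set. */
static CUarray alloc_layered_2d(void)
{
    CUDA_ARRAY3D_DESCRIPTOR desc;
    CUarray arr = NULL;
    memset(&desc, 0, sizeof(desc));
    desc.Width       = 512;
    desc.Height      = 512;
    desc.Depth       = 64;                 /* number of layers, not a 3D depth */
    desc.Format      = CU_AD_FORMAT_FLOAT;
    desc.NumChannels = 1;
    desc.Flags       = CUDA_ARRAY3D_LAYERED;
    if (cuArray3DCreate(&arr, &desc) != CUDA_SUCCESS)
        return NULL;                       /* e.g. an extent exceeded a device limit */
    return arr;
}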
* @@ -7344,7 +7893,8 @@ CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_VALUE, - * ::CUDA_ERROR_INVALID_HANDLE + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_CONTEXT_IS_DESTROYED * \notefnerr * * \sa ::cuArray3DCreate, ::cuArrayCreate, @@ -7391,22 +7941,22 @@ CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescripto * - A 1D mipmapped array is allocated if \p Height and \p Depth extents are both zero. * - A 2D mipmapped array is allocated if only \p Depth extent is zero. * - A 3D mipmapped array is allocated if all three extents are non-zero. - * - A 1D layered CUDA mipmapped array is allocated if only \p Height is zero and the - * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number + * - A 1D layered CUDA mipmapped array is allocated if only \p Height is zero and the + * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number * of layers is determined by the depth extent. - * - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and - * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number + * - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and + * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number * of layers is determined by the depth extent. * - A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the - * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and - * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, - * where the six layers represent the six faces of a cube. The order of the six + * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and + * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, + * where the six layers represent the six faces of a cube. The order of the six * layers in memory is the same as that listed in ::CUarray_cubemap_face. - * - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero, - * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. - * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. - * A cubemap layered CUDA array is a special type of 2D layered CUDA array that - * consists of a collection of cubemaps. The first six layers represent the first + * - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero, + * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. + * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. + * A cubemap layered CUDA array is a special type of 2D layered CUDA array that + * consists of a collection of cubemaps. The first six layers represent the first * cubemap, the next six layers form the second cubemap, and so on. * * - ::Format specifies the format of the elements; ::CUarray_format is @@ -7427,11 +7977,11 @@ CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescripto * - \p NumChannels specifies the number of packed components per CUDA array * element; it may be 1, 2, or 4; * - * - ::Flags may be set to - * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA mipmapped arrays. If this flag is set, + * - ::Flags may be set to + * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA mipmapped arrays. 
If this flag is set, * \p Depth specifies the number of layers, not the depth of a 3D array. * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to individual mipmap levels of - * the CUDA mipmapped array. If this flag is not set, ::cuSurfRefSetArray will fail when attempting to + * the CUDA mipmapped array. If this flag is not set, ::cuSurfRefSetArray will fail when attempting to * bind a mipmap level of the CUDA mipmapped array to a surface reference. * - ::CUDA_ARRAY3D_CUBEMAP to enable creation of mipmapped cubemaps. If this flag is set, \p Width must be * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set, @@ -7439,16 +7989,16 @@ CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescripto * - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA mipmapped array will be used for texture gather. * Texture gather can only be performed on 2D CUDA mipmapped arrays. * - * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table. - * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute - * is not specified. For ex., TEXTURE1D_MIPMAPPED_WIDTH refers to the device attribute + * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table. + * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute + * is not specified. For ex., TEXTURE1D_MIPMAPPED_WIDTH refers to the device attribute * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH. * * * - * - * * * @@ -7458,28 +8008,28 @@ CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescripto * * * - * * - * - * * - * - * * * - * * - * - * *
 * <table>
 * <tr><td><b>CUDA array type</b></td>
 * <td><b>Valid extents that must always be met<br>
 * {(width range in elements), (height range), (depth range)}</b></td>
 * <td><b>Valid extents with CUDA_ARRAY3D_SURFACE_LDST set<br>
 * {(width range in elements), (height range), (depth range)}</b></td></tr>
 * <tr><td>1D</td>
 * <td><small>{ (1,TEXTURE1D_MIPMAPPED_WIDTH), 0, 0 }</small></td>
 * <td><small>{ (1,SURFACE1D_WIDTH), 0, 0 }</small></td></tr>
 * <tr><td>2D</td>
 * <td><small>{ (1,TEXTURE2D_MIPMAPPED_WIDTH), (1,TEXTURE2D_MIPMAPPED_HEIGHT), 0 }</small></td>
 * <td><small>{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }</small></td></tr>
 * <tr><td>3D</td>
 * <td><small>{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
 * <br>OR<br>
 * { (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE),
 * (1,TEXTURE3D_DEPTH_ALTERNATE) }</small></td>
 * <td><small>{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT),
 * (1,SURFACE3D_DEPTH) }</small></td></tr>
 * <tr><td>1D Layered</td>
 * <td><small>{ (1,TEXTURE1D_LAYERED_WIDTH), 0,
 * (1,TEXTURE1D_LAYERED_LAYERS) }</small></td>
 * <td><small>{ (1,SURFACE1D_LAYERED_WIDTH), 0,
 * (1,SURFACE1D_LAYERED_LAYERS) }</small></td></tr>
 * <tr><td>2D Layered</td>
 * <td><small>{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT),
 * (1,TEXTURE2D_LAYERED_LAYERS) }</small></td>
 * <td><small>{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT),
 * (1,SURFACE2D_LAYERED_LAYERS) }</small></td></tr>
 * <tr><td>Cubemap</td>
 * <td><small>{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }</small></td>
 * <td><small>{ (1,SURFACECUBEMAP_WIDTH),
 * (1,SURFACECUBEMAP_WIDTH), 6 }</small></td></tr>
 * <tr><td>Cubemap Layered</td>
 * <td><small>{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH),
 * (1,TEXTURECUBEMAP_LAYERED_LAYERS) }</small></td>
 * <td><small>{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH),
 * (1,SURFACECUBEMAP_LAYERED_LAYERS) }</small></td></tr>
 * </table>
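A similar hedged sketch for the mipmapped case, again with illustrative sizes and a hypothetical helper name; the level count follows the base-extent rule described above (a full chain for a 256x256 base is log2(256) + 1 = 9 levels):

#include <string.h>
#include <cuda.h>

/* Sketch: a 256x256 RGBA8 mipmap chain, then fetch level 0 for use
 * as an ordinary CUarray. */
static CUresult make_mipmap_chain(CUmipmappedArray *mip, CUarray *level0)
{
    CUDA_ARRAY3D_DESCRIPTOR desc;
    CUresult err;
    memset(&desc, 0, sizeof(desc));
    desc.Width       = 256;
    desc.Height      = 256;
    desc.Depth       = 0;   /* zero Depth selects a 2D mipmapped array */
    desc.Format      = CU_AD_FORMAT_UNSIGNED_INT8;
    desc.NumChannels = 4;
    err = cuMipmappedArrayCreate(mip, &desc, 9);
    if (err != CUDA_SUCCESS)
        return err;
    return cuMipmappedArrayGetLevel(level0, *mip, 0);
}

The chain is released with a single cuMipmappedArrayDestroy on the parent handle; the per-level CUarray handles are not destroyed individually.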
* @@ -7549,7 +8099,8 @@ CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE, - * ::CUDA_ERROR_ARRAY_IS_MAPPED + * ::CUDA_ERROR_ARRAY_IS_MAPPED, + * ::CUDA_ERROR_CONTEXT_IS_DESTROYED * \notefnerr * * \sa @@ -7570,42 +8121,42 @@ CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray); * ___MANBRIEF___ unified addressing functions of the low-level CUDA driver * API (___CURRENT_FILE___) ___ENDMANBRIEF___ * - * This section describes the unified addressing functions of the + * This section describes the unified addressing functions of the * low-level CUDA driver application programming interface. * * @{ * * \section CUDA_UNIFIED_overview Overview * - * CUDA devices can share a unified address space with the host. + * CUDA devices can share a unified address space with the host. * For these devices there is no distinction between a device - * pointer and a host pointer -- the same pointer value may be - * used to access memory from the host program and from a kernel + * pointer and a host pointer -- the same pointer value may be + * used to access memory from the host program and from a kernel * running on the device (with exceptions enumerated below). * * \section CUDA_UNIFIED_support Supported Platforms - * - * Whether or not a device supports unified addressing may be - * queried by calling ::cuDeviceGetAttribute() with the device + * + * Whether or not a device supports unified addressing may be + * queried by calling ::cuDeviceGetAttribute() with the device * attribute ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. * - * Unified addressing is automatically enabled in 64-bit processes + * Unified addressing is automatically enabled in 64-bit processes * * \section CUDA_UNIFIED_lookup Looking Up Information from Pointer Values * - * It is possible to look up information about the memory which backs a + * It is possible to look up information about the memory which backs a * pointer value. For instance, one may want to know if a pointer points - * to host or device memory. As another example, in the case of device - * memory, one may want to know on which CUDA device the memory - * resides. These properties may be queried using the function + * to host or device memory. As another example, in the case of device + * memory, one may want to know on which CUDA device the memory + * resides. These properties may be queried using the function * ::cuPointerGetAttribute() * * Since pointers are unique, it is not necessary to specify information - * about the pointers specified to the various copy functions in the + * about the pointers specified to the various copy functions in the * CUDA API. The function ::cuMemcpy() may be used to perform a copy * between two pointers, ignoring whether they point to host or device * memory (making ::cuMemcpyHtoD(), ::cuMemcpyDtoD(), and ::cuMemcpyDtoH() - * unnecessary for devices supporting unified addressing). For + * unnecessary for devices supporting unified addressing). For * multidimensional copies, the memory type ::CU_MEMORYTYPE_UNIFIED may be * used to specify that the CUDA driver should infer the location of the * pointer from its value. 
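As a hedged illustration of the pointer lookup just described (the helper name and the dst, src, nbytes and ptr parameters are assumptions, not from the header):

#include <stddef.h>
#include <cuda.h>

/* Sketch: classify an arbitrary pointer under unified addressing,
 * then copy through it with the direction-agnostic cuMemcpy. */
static void copy_with_uva(CUdeviceptr dst, CUdeviceptr src, size_t nbytes,
                          CUdeviceptr ptr)
{
    unsigned int memtype = 0;  /* per the docs, MEMORY_TYPE data is unsigned int */
    CUcontext owner = NULL;
    if (cuPointerGetAttribute(&memtype, CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
                              ptr) == CUDA_SUCCESS &&
        memtype == CU_MEMORYTYPE_DEVICE) {
        /* owner becomes the context in which ptr was allocated */
        cuPointerGetAttribute(&owner, CU_POINTER_ATTRIBUTE_CONTEXT, ptr);
    }
    /* One call regardless of host/device direction: */
    cuMemcpy(dst, src, nbytes);
}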
@@ -7614,45 +8165,45 @@ CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray); * * All host memory allocated in all contexts using ::cuMemAllocHost() and * ::cuMemHostAlloc() is always directly accessible from all contexts on - * all devices that support unified addressing. This is the case regardless + * all devices that support unified addressing. This is the case regardless * of whether or not the flags ::CU_MEMHOSTALLOC_PORTABLE and * ::CU_MEMHOSTALLOC_DEVICEMAP are specified. * - * The pointer value through which allocated host memory may be accessed - * in kernels on all devices that support unified addressing is the same + * The pointer value through which allocated host memory may be accessed + * in kernels on all devices that support unified addressing is the same * as the pointer value through which that memory is accessed on the host, - * so it is not necessary to call ::cuMemHostGetDevicePointer() to get the device + * so it is not necessary to call ::cuMemHostGetDevicePointer() to get the device * pointer for these allocations. - * + * * Note that this is not the case for memory allocated using the flag * ::CU_MEMHOSTALLOC_WRITECOMBINED, as discussed below. * * \section CUDA_UNIFIED_autopeerregister Automatic Registration of Peer Memory * - * Upon enabling direct access from a context that supports unified addressing - * to another peer context that supports unified addressing using - * ::cuCtxEnablePeerAccess() all memory allocated in the peer context using - * ::cuMemAlloc() and ::cuMemAllocPitch() will immediately be accessible + * Upon enabling direct access from a context that supports unified addressing + * to another peer context that supports unified addressing using + * ::cuCtxEnablePeerAccess() all memory allocated in the peer context using + * ::cuMemAlloc() and ::cuMemAllocPitch() will immediately be accessible * by the current context. The device pointer value through * which any peer memory may be accessed in the current context * is the same pointer value through which that memory may be * accessed in the peer context. * * \section CUDA_UNIFIED_exceptions Exceptions, Disjoint Addressing - * + * * Not all memory may be accessed on devices through the same pointer * value through which they are accessed on the host. These exceptions * are host memory registered using ::cuMemHostRegister() and host memory - * allocated using the flag ::CU_MEMHOSTALLOC_WRITECOMBINED. For these + * allocated using the flag ::CU_MEMHOSTALLOC_WRITECOMBINED. For these * exceptions, there exists a distinct host and device address for the * memory. The device address is guaranteed to not overlap any valid host - * pointer range and is guaranteed to have the same value across all - * contexts that support unified addressing. - * - * This device address may be queried using ::cuMemHostGetDevicePointer() - * when a context using unified addressing is current. Either the host - * or the unified device pointer value may be used to refer to this memory - * through ::cuMemcpy() and similar functions using the + * pointer range and is guaranteed to have the same value across all + * contexts that support unified addressing. + * + * This device address may be queried using ::cuMemHostGetDevicePointer() + * when a context using unified addressing is current. Either the host + * or the unified device pointer value may be used to refer to this memory + * through ::cuMemcpy() and similar functions using the * ::CU_MEMORYTYPE_UNIFIED memory type. 
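A short sketch of the write-combined exception described above, assuming a caller-supplied nbytes and a hypothetical helper name; only the two driver calls and the flags are from the header:

#include <stddef.h>
#include <cuda.h>

/* Sketch: write-combined host memory is one of the documented
 * disjoint-addressing exceptions, so its device address must be
 * queried explicitly rather than reused from the host pointer. */
static CUresult alloc_wc(void **host, CUdeviceptr *dev, size_t nbytes)
{
    CUresult err = cuMemHostAlloc(host, nbytes,
                                  CU_MEMHOSTALLOC_DEVICEMAP |
                                  CU_MEMHOSTALLOC_WRITECOMBINED);
    if (err != CUDA_SUCCESS)
        return err;
    /* Unlike ordinary pinned allocations, *dev != (CUdeviceptr)*host here. */
    return cuMemHostGetDevicePointer(dev, *host, 0);
}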
* */ @@ -7660,69 +8211,69 @@ CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray); #if __CUDA_API_VERSION >= 4000 /** * \brief Returns information about a pointer - * + * * The supported attributes are: - * - * - ::CU_POINTER_ATTRIBUTE_CONTEXT: - * - * Returns in \p *data the ::CUcontext in which \p ptr was allocated or - * registered. - * The type of \p data must be ::CUcontext *. - * + * + * - ::CU_POINTER_ATTRIBUTE_CONTEXT: + * + * Returns in \p *data the ::CUcontext in which \p ptr was allocated or + * registered. + * The type of \p data must be ::CUcontext *. + * * If \p ptr was not allocated by, mapped by, or registered with - * a ::CUcontext which uses unified virtual addressing then - * ::CUDA_ERROR_INVALID_VALUE is returned. - * - * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE: - * - * Returns in \p *data the physical memory type of the memory that - * \p ptr addresses as a ::CUmemorytype enumerated value. - * The type of \p data must be unsigned int. - * - * If \p ptr addresses device memory then \p *data is set to - * ::CU_MEMORYTYPE_DEVICE. The particular ::CUdevice on which the - * memory resides is the ::CUdevice of the ::CUcontext returned by the - * ::CU_POINTER_ATTRIBUTE_CONTEXT attribute of \p ptr. - * - * If \p ptr addresses host memory then \p *data is set to - * ::CU_MEMORYTYPE_HOST. - * - * If \p ptr was not allocated by, mapped by, or registered with - * a ::CUcontext which uses unified virtual addressing then + * a ::CUcontext which uses unified virtual addressing then * ::CUDA_ERROR_INVALID_VALUE is returned. * - * If the current ::CUcontext does not support unified virtual + * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE: + * + * Returns in \p *data the physical memory type of the memory that + * \p ptr addresses as a ::CUmemorytype enumerated value. + * The type of \p data must be unsigned int. + * + * If \p ptr addresses device memory then \p *data is set to + * ::CU_MEMORYTYPE_DEVICE. The particular ::CUdevice on which the + * memory resides is the ::CUdevice of the ::CUcontext returned by the + * ::CU_POINTER_ATTRIBUTE_CONTEXT attribute of \p ptr. + * + * If \p ptr addresses host memory then \p *data is set to + * ::CU_MEMORYTYPE_HOST. + * + * If \p ptr was not allocated by, mapped by, or registered with + * a ::CUcontext which uses unified virtual addressing then + * ::CUDA_ERROR_INVALID_VALUE is returned. + * + * If the current ::CUcontext does not support unified virtual * addressing then ::CUDA_ERROR_INVALID_CONTEXT is returned. - * + * * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER: - * + * * Returns in \p *data the device pointer value through which - * \p ptr may be accessed by kernels running in the current + * \p ptr may be accessed by kernels running in the current * ::CUcontext. * The type of \p data must be CUdeviceptr *. - * + * * If there exists no device pointer value through which * kernels running in the current ::CUcontext may access * \p ptr then ::CUDA_ERROR_INVALID_VALUE is returned. - * - * If there is no current ::CUcontext then + * + * If there is no current ::CUcontext then * ::CUDA_ERROR_INVALID_CONTEXT is returned. - * - * Except in the exceptional disjoint addressing cases discussed - * below, the value returned in \p *data will equal the input + * + * Except in the exceptional disjoint addressing cases discussed + * below, the value returned in \p *data will equal the input * value \p ptr. 
- * + * * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER: - * - * Returns in \p *data the host pointer value through which + * + * Returns in \p *data the host pointer value through which * \p ptr may be accessed by by the host program. * The type of \p data must be void **. * If there exists no host pointer value through which - * the host program may directly access \p ptr then + * the host program may directly access \p ptr then * ::CUDA_ERROR_INVALID_VALUE is returned. - * - * Except in the exceptional disjoint addressing cases discussed - * below, the value returned in \p *data will equal the input + * + * Except in the exceptional disjoint addressing cases discussed + * below, the value returned in \p *data will equal the input * value \p ptr. * * - ::CU_POINTER_ATTRIBUTE_P2P_TOKENS: @@ -7738,7 +8289,7 @@ CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray); * Querying this attribute has a side effect of setting the attribute * ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS for the region of memory that * \p ptr points to. - * + * * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS: * * A boolean attribute which when set, ensures that synchronous memory operations @@ -7763,22 +8314,27 @@ CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray); * Returns in \p *data a boolean that indicates whether the pointer points to * managed memory or not. * + * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL: + * + * Returns in \p *data an integer representing a device ordinal of a device against + * which the memory was allocated or registered. + * * \par * * Note that for most allocations in the unified virtual address space - * the host and device pointer for accessing the allocation will be the + * the host and device pointer for accessing the allocation will be the * same. The exceptions to this are - * - user memory registered using ::cuMemHostRegister - * - host memory allocated using ::cuMemHostAlloc with the + * - user memory registered using ::cuMemHostRegister + * - host memory allocated using ::cuMemHostAlloc with the * ::CU_MEMHOSTALLOC_WRITECOMBINED flag - * For these types of allocation there will exist separate, disjoint host - * and device addresses for accessing the allocation. In particular - * - The host address will correspond to an invalid unmapped device address - * (which will result in an exception if accessed from the device) - * - The device address will correspond to an invalid unmapped host address + * For these types of allocation there will exist separate, disjoint host + * and device addresses for accessing the allocation. In particular + * - The host address will correspond to an invalid unmapped device address + * (which will result in an exception if accessed from the device) + * - The device address will correspond to an invalid unmapped host address * (which will result in an exception if accessed from the host). - * For these types of allocations, querying ::CU_POINTER_ATTRIBUTE_HOST_POINTER - * and ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER may be used to retrieve the host + * For these types of allocations, querying ::CU_POINTER_ATTRIBUTE_HOST_POINTER + * and ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER may be used to retrieve the host * and device addresses from either address. * * \param data - Returned pointer attribute value @@ -7812,8 +8368,8 @@ CUresult CUDAAPI cuPointerGetAttribute(void *data, CUpointer_attribute attribute /** * \brief Prefetches memory to the specified destination device * - * Prefetches memory to the specified destination device. 
\p devPtr is the - * base device pointer of the memory to be prefetched and \p dstDevice is the + * Prefetches memory to the specified destination device. \p devPtr is the + * base device pointer of the memory to be prefetched and \p dstDevice is the * destination device. \p count specifies the number of bytes to copy. \p hStream * is the stream in which the operation is enqueued. The memory range must refer * to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. @@ -7884,7 +8440,10 @@ CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice d * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory * range will be rounded down and rounded up respectively to be aligned to CPU page size before the * advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged - * or declared via __managed__ variables. + * or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + * memory provided it represents a valid, host-accessible region of memory and all additional constraints + * imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + * memory range results in an error being returned. * * The \p advice parameter can take the following values: * - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read @@ -7898,11 +8457,18 @@ CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice d * Also, if a context is created on a device that does not have the device attribute * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until * all such contexts are destroyed. + * If the memory region refers to valid system-allocated pageable memory, then the accessing device must + * have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + * copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + * device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + * will not create a read-only copy when that device accesses this memory region. + * * - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated * copies of the data will be collapsed into a single copy. The location for the collapsed * copy will be the preferred location if the page has a preferred location and one of the read-duplicated * copies was resident at that location. Otherwise, the location chosen is arbitrary. + * * - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the * data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the * preferred location as host memory. If \p device is a GPU, then it must have a non-zero value for the @@ -7919,9 +8485,17 @@ CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice d * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But * if the preferred location is set as device memory, then the page will continue to thrash indefinitely. 
* If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the - * policies associated with that advice will override the policies of this advice. + * policies associated with that advice will override the policies of this advice, unless read accesses from + * \p device will not result in a read-only copy being created on that device as outlined in description for + * the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero + * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has + * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + * then this call has no effect. Note however that this behavior may change in the future. + * * - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION * and changes the preferred location to none. + * * - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. * Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. If \p device is a GPU, then * the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. @@ -7942,8 +8516,17 @@ CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice d * policies associated with that advice will override the policies of this advice. Additionally, if the * preferred location of this memory region or any subset of it is also \p device, then the policies * associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero + * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has + * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + * then this call has no effect. + * * - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to * the data from \p device may be removed at any time causing accesses to result in non-fatal page faults. + * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero + * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has + * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + * then this call has no effect. * * \param devPtr - Pointer to memory to set the advice for * \param count - Size in bytes of the memory range @@ -7966,7 +8549,7 @@ CUresult CUDAAPI cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advi /** * \brief Query an attribute of a given memory range - * + * * Query an attribute about the memory range starting at \p devPtr with a size of \p count bytes. The * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via * __managed__ variables. @@ -7981,7 +8564,7 @@ CUresult CUDAAPI cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advi * if all pages in the memory range have the CPU as their preferred location, or it will be CU_DEVICE_INVALID * if either all the pages don't have the same preferred location or some of the pages don't have a * preferred location at all. 
Note that the actual location of the pages in the memory range at the time of - * the query may be different from the preferred location. + * the query may be different from the preferred location. * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY: If this attribute is specified, \p data will be interpreted * as an array of 32-bit integers, and \p dataSize must be a non-zero multiple of 4. The result returned * will be a list of device ids that had ::CU_MEM_ADVISE_SET_ACCESSED_BY set for that entire memory range. @@ -8120,6 +8703,7 @@ CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute at * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED + * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL * * \param numAttributes - Number of attributes to query * \param attributes - An array of attributes to query @@ -8171,7 +8755,7 @@ CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes, CUpointer_at * Creates a stream and returns a handle in \p phStream. The \p Flags argument * determines behaviors of the stream. Valid values for \p Flags are: * - ::CU_STREAM_DEFAULT: Default stream creation flag. - * - ::CU_STREAM_NON_BLOCKING: Specifies that work running in the created + * - ::CU_STREAM_NON_BLOCKING: Specifies that work running in the created * stream may run concurrently with work in stream 0 (the NULL stream), and that * the created stream should perform no implicit synchronization with stream 0. * @@ -8308,24 +8892,61 @@ CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); */ CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); +#if __CUDA_API_VERSION >= 9020 + +/** + * \brief Query the context associated with a stream + * + * Returns the CUDA context that the stream is associated with. + * + * The stream handle \p hStream can refer to any of the following: + *
    + *
+ * - a stream created via any of the CUDA driver APIs such as ::cuStreamCreate
+ *   and ::cuStreamCreateWithPriority, or their runtime API equivalents such as
+ *   ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority.
+ *   The returned context is the context that was active in the calling thread when the
+ *   stream was created. Passing an invalid handle will result in undefined behavior.
+ *
+ * - any of the special streams such as the NULL stream, ::CU_STREAM_LEGACY and
+ *   ::CU_STREAM_PER_THREAD. The runtime API equivalents of these are also accepted,
+ *   which are NULL, ::cudaStreamLegacy and ::cudaStreamPerThread respectively.
+ *   Specifying any of the special handles will return the context current to the
+ *   calling thread. If no context is current to the calling thread,
+ *   ::CUDA_ERROR_INVALID_CONTEXT is returned.
+ *
+ * + * \param hStream - Handle to the stream to be queried + * \param pctx - Returned context associated with the stream + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * \notefnerr + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreateWithPriority, + * ::cuStreamGetPriority, + * ::cuStreamGetFlags, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cudaStreamCreate, + * ::cudaStreamCreateWithFlags + */ +CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx); + +#endif /* __CUDA_API_VERSION >= 9020 */ /** * \brief Make a compute stream wait on an event * - * Makes all future work submitted to \p hStream wait until \p hEvent - * reports completion before beginning execution. This synchronization - * will be performed efficiently on the device. The event \p hEvent may - * be from a different context than \p hStream, in which case this function - * will perform cross-device synchronization. - * - * The stream \p hStream will wait only for the completion of the most recent - * host call to ::cuEventRecord() on \p hEvent. Once this call has returned, - * any functions (including ::cuEventRecord() and ::cuEventDestroy()) may be - * called on \p hEvent again, and subsequent calls will not have any - * effect on \p hStream. - * - * If ::cuEventRecord() has not been called on \p hEvent, this call acts as if - * the record has already completed, and so is a functional no-op. + * Makes all future work submitted to \p hStream wait for all work captured in + * \p hEvent. See ::cuEventRecord() for details on what is captured by an event. + * The synchronization will be performed efficiently on the device when applicable. + * \p hEvent may be from a different context or device than \p hStream. * * \param hStream - Stream to wait * \param hEvent - Event to wait on (may not be NULL) @@ -8353,8 +8974,14 @@ CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned in /** * \brief Add a callback to a compute stream * + * \note This function is slated for eventual deprecation and removal. If + * you do not require the callback to execute in case of a device error, + * consider using ::cuLaunchHostFunc. Additionally, this function is not + * supported with ::cuStreamBeginCapture and ::cuStreamEndCapture, unlike + * ::cuLaunchHostFunc. + * * Adds a callback to be called on the host after all currently enqueued - * items in the stream have completed. For each + * items in the stream have completed. For each * cuStreamAddCallback call, the callback will be executed exactly once. * The callback will block later work in the stream until it is finished. * @@ -8379,10 +9006,11 @@ CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned in * the callback. It thus synchronizes streams which have been "joined" * prior to the callback. *
 * - Adding device work to any stream does not have the effect of making
- *   the stream active until all preceding callbacks have executed. Thus, for
+ *   the stream active until all preceding host functions and stream callbacks
+ *   have executed. Thus, for
 *   example, a callback might use global attached memory even if work has
- *   been added to another stream, if it has been properly ordered with an
- *   event.
+ *   been added to another stream, if the work has been ordered behind the
+ *   callback with an event.
 *
  • Completion of a callback does not cause a stream to become * active except as described above. The callback stream will remain idle * if no device work follows the callback, and will remain idle across @@ -8413,10 +9041,110 @@ CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned in * ::cuStreamDestroy, * ::cuMemAllocManaged, * ::cuStreamAttachMemAsync, + * ::cuStreamLaunchHostFunc, * ::cudaStreamAddCallback */ CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); +#if __CUDA_API_VERSION >= 10000 + +/** + * \brief Begins graph capture on a stream + * + * Begin graph capture on \p hStream. When a stream is in capture mode, all operations + * pushed into the stream will not be executed, but will instead be captured into + * a graph, which will be returned via ::cuStreamEndCapture. Capture may not be initiated + * if \p stream is CU_STREAM_LEGACY. Capture must be ended on the same stream in which + * it was initiated, and it may only be initiated if the stream is not already in capture + * mode. The capture mode may be queried via ::cuStreamIsCapturing. + * + * \param hStream - Stream in which to initiate capture + * + * \note Kernels captured using this API must not use texture and surface references. + * Reading or writing through any texture or surface reference is undefined + * behavior. This restriction does not apply to texture and surface objects. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuStreamCreate, + * ::cuStreamIsCapturing, + * ::cuStreamEndCapture + */ +CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream); + +/** + * \brief Ends capture on a stream, returning the captured graph + * + * End capture on \p hStream, returning the captured graph via \p phGraph. + * Capture must have been initiated on \p hStream via a call to ::cuStreamBeginCapture. + * If capture was invalidated, due to a violation of the rules of stream capture, then + * a NULL graph will be returned. + * + * \param hStream - Stream to query + * \param phGraph - The captured graph + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuStreamCreate, + * ::cuStreamBeginCapture, + * ::cuStreamIsCapturing + */ +CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph); + +/** + * \brief Returns a stream's capture status + * + * Return the capture status of \p hStream via \p captureStatus. After a successful + * call, \p *captureStatus will contain one of the following: + * - ::CU_STREAM_CAPTURE_STATUS_NONE: The stream is not capturing. + * - ::CU_STREAM_CAPTURE_STATUS_ACTIVE: The stream is capturing. + * - ::CU_STREAM_CAPTURE_STATUS_INVALIDATED: The stream was capturing but an error + * has invalidated the capture sequence. The capture sequence must be terminated + * with ::cuStreamEndCapture on the stream where it was initiated in order to + * continue using \p hStream. + * + * Note that, if this is called on ::CU_STREAM_LEGACY (the "null stream") while + * a blocking stream in the same context is capturing, it will return + * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT and \p *captureStatus is unspecified + * after the call. The blocking stream capture is not invalidated. 
+ * + * When a blocking stream is capturing, the legacy stream is in an + * unusable state until the blocking stream capture is terminated. The legacy + * stream is not supported for stream capture, but attempted use would have an + * implicit dependency on the capturing stream(s). + * + * \param hStream - Stream to query + * \param captureStatus - Returns the stream's capture status + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT + * \notefnerr + * + * \sa + * ::cuStreamCreate, + * ::cuStreamBeginCapture, + * ::cuStreamEndCapture + */ +CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus); + +#endif /* __CUDA_API_VERSION >= 10000 */ + #if __CUDA_API_VERSION >= 6000 /** @@ -8428,12 +9156,20 @@ CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback * only take effect when, previous work in stream has completed. Any * previous association is automatically replaced. * - * \p dptr must point to an address within managed memory space declared - * using the __managed__ keyword or allocated with ::cuMemAllocManaged. + * \p dptr must point to one of the following types of memories: + * - managed memory declared using the __managed__ keyword or allocated with + * ::cuMemAllocManaged. + * - a valid host-accessible region of system-allocated pageable memory. This + * type of memory may only be specified if the device associated with the + * stream reports a non-zero value for the device attribute + * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. * - * \p length must be zero, to indicate that the entire allocation's - * stream association is being changed. Currently, it's not possible - * to change stream association for a portion of an allocation. + * For managed allocations, \p length must be either zero or the entire + * allocation's size. Both indicate that the entire allocation's stream + * association is being changed. Currently, it is not possible to change stream + * association for a portion of a managed allocation. + * + * For pageable host allocations, \p length must be non-zero. * * The stream association is specified using \p flags which must be * one of ::CUmemAttach_flags. @@ -8458,7 +9194,7 @@ CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback * Accessing memory on the device from streams that are not associated with * it will produce undefined results. No error checking is performed by the * Unified Memory system to ensure that kernels launched into other streams - * do not access this region. + * do not access this region. * * It is a program's responsibility to order calls to ::cuStreamAttachMemAsync * via events, synchronization or other means to ensure legal access to memory @@ -8473,8 +9209,10 @@ CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback * happen until all work in the stream has completed. 
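A minimal sketch of the attach flow described above, assuming an existing hStream and an illustrative nbytes; the zero length follows the whole-allocation rule for managed memory:

#include <stddef.h>
#include <cuda.h>

/* Sketch: pin a managed allocation to one stream so that kernels in
 * that stream (and only that stream) may touch it. */
static CUresult attach_to_stream(CUstream hStream, size_t nbytes)
{
    CUdeviceptr p = 0;
    CUresult err = cuMemAllocManaged(&p, nbytes, CU_MEM_ATTACH_GLOBAL);
    if (err != CUDA_SUCCESS)
        return err;
    /* length 0 re-associates the entire allocation with hStream */
    return cuStreamAttachMemAsync(hStream, p, 0, CU_MEM_ATTACH_SINGLE);
}

As the documentation notes, the association only takes effect once prior work in hStream has completed, so the host should order its own accesses behind the stream (for example with cuStreamSynchronize) before touching p.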
* * \param hStream - Stream in which to enqueue the attach operation - * \param dptr - Pointer to memory (must be a pointer to managed memory) - * \param length - Length of memory (must be zero) + * \param dptr - Pointer to memory (must be a pointer to managed memory or + * to a valid host-accessible region of system-allocated + * pageable memory) + * \param length - Length of memory * \param flags - Must be one of ::CUmemAttach_flags * * \return @@ -8533,7 +9271,7 @@ CUresult CUDAAPI cuStreamQuery(CUstream hStream); * \brief Wait until a stream's tasks are completed * * Waits until the device has completed all operations in the stream specified - * by \p hStream. If the context was created with the + * by \p hStream. If the context was created with the * ::CU_CTX_SCHED_BLOCKING_SYNC flag, the CPU thread will block until the * stream is finished with all of its tasks. * @@ -8545,6 +9283,7 @@ CUresult CUDAAPI cuStreamQuery(CUstream hStream); * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, * ::CUDA_ERROR_INVALID_HANDLE + * \note_null_stream * \notefnerr * @@ -8561,11 +9300,11 @@ CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); /** * \brief Destroys a stream * - * Destroys the stream specified by \p hStream. + * Destroys the stream specified by \p hStream. * * In case the device is still doing work in the stream \p hStream - * when ::cuStreamDestroy() is called, the function will return immediately - * and the resources associated with \p hStream will be released automatically + * when ::cuStreamDestroy() is called, the function will return immediately + * and the resources associated with \p hStream will be released automatically * once the device has completed all work in \p hStream. * * \param hStream - Stream to destroy @@ -8575,7 +9314,8 @@ CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); * ::CUDA_ERROR_DEINITIALIZED, * ::CUDA_ERROR_NOT_INITIALIZED, * ::CUDA_ERROR_INVALID_CONTEXT, - * ::CUDA_ERROR_INVALID_VALUE + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE * \notefnerr * * \sa ::cuStreamCreate, @@ -8606,8 +9346,8 @@ CUresult CUDAAPI cuStreamDestroy(CUstream hStream); /** * \brief Creates an event * - * Creates an event *phEvent with the flags specified via \p Flags. Valid flags - * include: + * Creates an event *phEvent for the current context with the flags specified via + * \p Flags. Valid flags include: * - ::CU_EVENT_DEFAULT: Default event creation flag. * - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking * synchronization. A CPU thread that uses ::cuEventSynchronize() to wait on @@ -8647,16 +9387,20 @@ CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags); /** * \brief Records an event * - * Records an event. See note on NULL stream behavior. Since operation is - * asynchronous, ::cuEventQuery or ::cuEventSynchronize() must be used - * to determine when the event has actually been recorded. + * Captures in \p hEvent the contents of \p hStream at the time of this call. + * \p hEvent and \p hStream must be from the same context. + * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then + * examine or wait for completion of the work that was captured. Uses of + * \p hStream after this call do not modify \p hEvent. See note on default + * stream behavior for what is captured in the default case. * - * If ::cuEventRecord() has previously been called on \p hEvent, then this - * call will overwrite any existing state in \p hEvent. 
Any subsequent calls - * which examine the status of \p hEvent will only examine the completion of - * this most recent call to ::cuEventRecord(). - * - * It is necessary that \p hEvent and \p hStream be created on the same context. + * ::cuEventRecord() can be called multiple times on the same event and + * will overwrite the previously captured state. Other APIs such as + * ::cuStreamWaitEvent() use the most recently captured state at the time + * of the API call, and are not affected by later calls to + * ::cuEventRecord(). Before the first call to ::cuEventRecord(), an + * event represents an empty set of work, so for example ::cuEventQuery() + * would return ::CUDA_SUCCESS. * * \param hEvent - Event to record * \param hStream - Stream to record event for @@ -8684,14 +9428,11 @@ CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); /** * \brief Queries an event's status * - * Query the status of all device work preceding the most recent - * call to ::cuEventRecord() (in the appropriate compute streams, - * as specified by the arguments to ::cuEventRecord()). + * Queries the status of all work currently captured by \p hEvent. See + * ::cuEventRecord() for details on what is captured by an event. * - * If this work has successfully been completed by the device, or if - * ::cuEventRecord() has not been called on \p hEvent, then ::CUDA_SUCCESS is - * returned. If this work has not yet been completed by the device then - * ::CUDA_ERROR_NOT_READY is returned. + * Returns ::CUDA_SUCCESS if all captured work has been completed, or + * ::CUDA_ERROR_NOT_READY if any captured work is incomplete. * * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS * is equivalent to having called ::cuEventSynchronize(). @@ -8719,12 +9460,8 @@ CUresult CUDAAPI cuEventQuery(CUevent hEvent); /** * \brief Waits for an event to complete * - * Wait until the completion of all device work preceding the most recent - * call to ::cuEventRecord() (in the appropriate compute streams, as specified - * by the arguments to ::cuEventRecord()). - * - * If ::cuEventRecord() has not been called on \p hEvent, ::CUDA_SUCCESS is - * returned immediately. + * Waits until the completion of all work currently captured in \p hEvent. + * See ::cuEventRecord() for details on what is captured by an event. * * Waiting for an event that was created with the ::CU_EVENT_BLOCKING_SYNC * flag will cause the calling CPU thread to block until the event has @@ -8757,10 +9494,10 @@ CUresult CUDAAPI cuEventSynchronize(CUevent hEvent); * * Destroys the event specified by \p hEvent. * - * In case \p hEvent has been recorded but has not yet been completed - * when ::cuEventDestroy() is called, the function will return immediately and - * the resources associated with \p hEvent will be released automatically once - * the device has completed \p hEvent. + * An event may be destroyed before it is complete (i.e., while + * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY). In this case, the + * call does not block on completion of the event, and any associated + * resources will automatically be released asynchronously at completion. 
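A hedged sketch of the record/synchronize/elapsed-time cycle these paragraphs describe; hStream is assumed to exist and the helper name is hypothetical:

#include <cuda.h>

/* Sketch: bracket stream work with two events and read back the
 * elapsed time. Work enqueued between the two records is what gets
 * measured. */
static CUresult time_span_ms(CUstream hStream, float *ms)
{
    CUevent start, stop;
    CUresult err;
    cuEventCreate(&start, CU_EVENT_DEFAULT);
    cuEventCreate(&stop, CU_EVENT_DEFAULT);
    cuEventRecord(start, hStream);
    /* ... enqueue kernels / async copies on hStream here ... */
    cuEventRecord(stop, hStream);
    cuEventSynchronize(stop);     /* wait for the captured work */
    err = cuEventElapsedTime(ms, start, stop);
    cuEventDestroy(start);        /* safe even before completion, per above */
    cuEventDestroy(stop);
    return err;
}

Note that events created with CU_EVENT_DISABLE_TIMING cannot be used this way; the default flag keeps timing enabled.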
* * \param hEvent - Event to destroy * @@ -8826,6 +9563,507 @@ CUresult CUDAAPI cuEventDestroy(CUevent hEvent); */ CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd); +/** @} */ /* END CUDA_EVENT */ + +/** + * \defgroup CUDA_EXTRES_INTEROP External Resource Interoperability + * + * ___MANBRIEF___ External resource interoperability functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the external resource interoperability functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +#if __CUDA_API_VERSION >= 10000 + + /** + * \brief Imports an external memory object + * + * Imports an externally allocated memory object and returns + * a handle to that in \p extMem_out. + * + * The properties of the handle being imported must be described in + * \p memHandleDesc. The ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC structure + * is defined as follows: + * + * \code + typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { + CUexternalMemoryHandleType type; + union { + int fd; + struct { + void *handle; + const void *name; + } win32; + } handle; + unsigned int flags; + } CUDA_EXTERNAL_MEMORY_HANDLE_DESC; + * \endcode + * + * where ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type specifies the type + * of handle being imported. ::CUexternalMemoryHandleType is + * defined as: + * + * \code + typedef enum CUexternalMemoryHandleType_enum { + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 = 2, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP = 4, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5 + } CUexternalMemoryHandleType; + * \endcode + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, then + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::fd must be a valid + * file descriptor referencing a memory object. Ownership of + * the file descriptor is transferred to the CUDA driver when the + * handle is imported successfully. Performing any operations on the + * file descriptor after it is imported results in undefined behavior. + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32, then exactly one + * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be + * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * references a memory object. Ownership of this handle is + * not transferred to CUDA after the import operation, so the + * application must release the handle using the appropriate system + * call. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name + * is not NULL, then it must point to a NULL-terminated array of + * UTF-16 characters that refers to a memory object. + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT, then + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must + * be non-NULL and + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name + * must be NULL. The handle specified must be a globally shared KMT + * handle. This handle does not hold a reference to the underlying + * object, and thus will be invalid when all references to the + * memory object are destroyed. 
+ * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP, then exactly one + * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be + * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * is returned by ID3DDevice::CreateSharedHandle when referring to a + * ID3D12Heap object. This handle holds a reference to the underlying + * object. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name + * is not NULL, then it must point to a NULL-terminated array of + * UTF-16 characters that refers to a ID3D12Heap object. + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE, then exactly one + * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be + * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * is returned by ID3DDevice::CreateSharedHandle when referring to a + * ID3D12Resource object. This handle holds a reference to the + * underlying object. If + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name + * is not NULL, then it must point to a NULL-terminated array of + * UTF-16 characters that refers to a ID3D12Resource object. + * + * Specifying the flag ::CUDA_EXTERNAL_MEMORY_DEDICATED in + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::flags indicates that the + * resource is a dedicated resource. The definition of what a + * dedicated resource is outside the scope of this extension. + * + * \param extMem_out - Returned handle to an external memory object + * \param memHandleDesc - Memory import handle descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \note If the Vulkan memory imported into CUDA is mapped on the CPU then the + * application must use vkInvalidateMappedMemoryRanges/vkFlushMappedMemoryRanges + * as well as appropriate Vulkan pipeline barriers to maintain coherence between + * CPU and GPU. For more information on these APIs, please refer to "Synchronization + * and Cache Control" chapter from Vulkan specification. + * + * \sa ::cuDestroyExternalMemory, + * ::cuExternalMemoryGetMappedBuffer, + * ::cuExternalMemoryGetMappedMipmappedArray + */ +CUresult CUDAAPI cuImportExternalMemory(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc); + +/** + * \brief Maps a buffer onto an imported memory object + * + * Maps a buffer onto an imported memory object and returns a device + * pointer in \p devPtr. + * + * The properties of the buffer being mapped must be described in + * \p bufferDesc. The ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC structure is + * defined as follows: + * + * \code + typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st { + unsigned long long offset; + unsigned long long size; + unsigned int flags; + } CUDA_EXTERNAL_MEMORY_BUFFER_DESC; + * \endcode + * + * where ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::offset is the offset in + * the memory object where the buffer's base address is. + * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::size is the size of the buffer. + * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::flags must be zero. + * + * The offset and size have to be suitably aligned to match the + * requirements of the external API. 
Mapping two buffers whose ranges + * overlap may or may not result in the same virtual address being + * returned for the overlapped portion. In such cases, the application + * must ensure that all accesses to that region from the GPU are + * volatile. Otherwise writes made via one address are not guaranteed + * to be visible via the other address, even if they're issued by the + * same thread. It is recommended that applications map the combined + * range instead of mapping separate buffers and then apply the + * appropriate offsets to the returned pointer to derive the + * individual buffers. + * + * \param devPtr - Returned device pointer to buffer + * \param extMem - Handle to external memory object + * \param bufferDesc - Buffer descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuImportExternalMemory + * ::cuDestroyExternalMemory, + * ::cuExternalMemoryGetMappedMipmappedArray + */ +CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(CUdeviceptr *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc); + +/** + * \brief Maps a CUDA mipmapped array onto an external memory object + * + * Maps a CUDA mipmapped array onto an external object and returns a + * handle to it in \p mipmap. + * + * The properties of the CUDA mipmapped array being mapped must be + * described in \p mipmapDesc. The structure + * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC is defined as follows: + * + * \code + typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st { + unsigned long long offset; + CUDA_ARRAY3D_DESCRIPTOR arrayDesc; + unsigned int numLevels; + } CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC; + * \endcode + * + * where ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::offset is the + * offset in the memory object where the base level of the mipmap + * chain is. + * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc describes + * the format, dimensions and type of the base level of the mipmap + * chain. For further details on these parameters, please refer to the + * documentation for ::cuMipmappedArrayCreate. Note that if the mipmapped + * array is bound as a color target in the graphics API, then the flag + * ::CUDA_ARRAY3D_COLOR_ATTACHMENT must be specified in + * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc::Flags. + * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels specifies + * the total number of levels in the mipmap chain. + * + * \param mipmap - Returned CUDA mipmapped array + * \param extMem - Handle to external memory object + * \param mipmapDesc - CUDA array descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuImportExternalMemory + * ::cuDestroyExternalMemory, + * ::cuExternalMemoryGetMappedBuffer + */ +CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc); + +/** + * \brief Releases all resources associated with an external memory + * object. + * + * Frees all buffers and CUDA mipmapped arrays that were + * mapped onto this external memory object and releases any reference + * on the underlying memory itself. 
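Taken together, these three entry points form the usual import/map/release lifecycle. Below is a minimal sketch, assuming `fd` and `size` were produced by the exporting API (for example Vulkan with VK_KHR_external_memory_fd); the helper name is illustrative, the descriptor's size member is set to the full allocation size, and error checks are elided:

    #include <cuda.h>

    /* Import a POSIX fd as CUDA-visible memory, map a buffer over the
     * whole range, use it, then release everything. */
    static void use_imported_fd(int fd, unsigned long long size)
    {
        CUexternalMemory extMem;
        CUDA_EXTERNAL_MEMORY_HANDLE_DESC mdesc = {0};
        mdesc.type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD;
        mdesc.handle.fd = fd;  /* the driver owns the fd once import succeeds */
        mdesc.size = size;     /* allocation size, from the exporting API */
        cuImportExternalMemory(&extMem, &mdesc);

        CUDA_EXTERNAL_MEMORY_BUFFER_DESC bdesc = {0};
        bdesc.offset = 0;      /* must satisfy the exporting API's alignment */
        bdesc.size = size;
        bdesc.flags = 0;       /* must be zero */
        CUdeviceptr dptr;
        cuExternalMemoryGetMappedBuffer(&dptr, extMem, &bdesc);

        /* ... enqueue work that reads or writes dptr ... */

        cuDestroyExternalMemory(extMem); /* unmaps dptr, drops the reference */
    }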
+ * + * \param extMem - External memory object to be destroyed + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuImportExternalMemory, + * ::cuExternalMemoryGetMappedBuffer, + * ::cuExternalMemoryGetMappedMipmappedArray + */ +CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem); + +/** + * \brief Imports an external semaphore + * + * Imports an externally allocated synchronization object and returns + * a handle to that in \p extSem_out. + * + * The properties of the handle being imported must be described in + * \p semHandleDesc. The ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC structure is + * defined as follows: + * + * \code typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { CUexternalSemaphoreHandleType type; union { int fd; struct { void *handle; const void *name; } win32; } handle; unsigned int flags; } CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC; + * \endcode + * + * where ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type specifies the type of + * handle being imported. ::CUexternalSemaphoreHandleType is defined + * as: + * + * \code typedef enum CUexternalSemaphoreHandleType_enum { CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD = 1, CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 = 2, CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE = 4 } CUexternalSemaphoreHandleType; + * \endcode + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, then + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid + * file descriptor referencing a synchronization object. Ownership of + * the file descriptor is transferred to the CUDA driver when the + * handle is imported successfully. Performing any operations on the + * file descriptor after it is imported results in undefined behavior. + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, then exactly one + * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be + * NULL. If + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * references a synchronization object. Ownership of this handle is + * not transferred to CUDA after the import operation, so the + * application must release the handle using the appropriate system + * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name + * is not NULL, then it must name a valid synchronization object. + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT, then + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle must + * be non-NULL and + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name + * must be NULL. The handle specified must be a globally shared KMT + * handle. This handle does not hold a reference to the underlying + * object, and thus will be invalid when all references to the + * synchronization object are destroyed. + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then exactly one + * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be + * NULL.
If + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * is returned by ID3D12Device::CreateSharedHandle when referring to an + * ID3D12Fence object. This handle holds a reference to the underlying + * object. If + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name + * is not NULL, then it must name a valid synchronization object that + * refers to a valid ID3D12Fence object. + * + * \param extSem_out - Returned handle to an external semaphore + * \param semHandleDesc - Semaphore import handle descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuDestroyExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuImportExternalSemaphore(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc); + +/** + * \brief Signals a set of external semaphore objects + * + * Enqueues a signal operation on a set of externally allocated + * semaphore objects in the specified stream. The operations will be + * executed when all prior operations in the stream complete. + * + * The exact semantics of signaling a semaphore depend on the type of + * the object. + * + * If the semaphore object is any one of the following types: + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT + * then signaling the semaphore will set it to the signaled state. + * + * If the semaphore object is of the type + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then the + * semaphore will be set to the value specified in + * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::fence::value. + * + * \param extSemArray - Set of external semaphores to be signaled + * \param paramsArray - Array of semaphore parameters + * \param numExtSems - Number of semaphores to signal + * \param stream - Stream to enqueue the signal operations in + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuImportExternalSemaphore, + * ::cuDestroyExternalSemaphore, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); + +/** + * \brief Waits on a set of external semaphore objects + * + * Enqueues a wait operation on a set of externally allocated + * semaphore objects in the specified stream. The operations will be + * executed when all prior operations in the stream complete. + * + * The exact semantics of waiting on a semaphore depend on the type + * of the object. + * + * If the semaphore object is any one of the following types: + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT + * then waiting on the semaphore will wait until the semaphore reaches + * the signaled state. The semaphore will then be reset to the + * unsignaled state. Therefore, for every signal operation, there can + * only be one wait operation.
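For the opaque handle types this reduces to a strict signal/wait pairing. A minimal sketch of a cross-stream handoff, assuming an fd-backed semaphore exported by another API and eliding error checks (the helper name is illustrative):

    #include <cuda.h>

    /* Signal in 'producer', wait in 'consumer': one wait per signal. */
    static void handoff(int fd, CUstream producer, CUstream consumer)
    {
        CUexternalSemaphore sem;
        CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC sdesc = {0};
        sdesc.type = CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD;
        sdesc.handle.fd = fd;             /* driver owns the fd on success */
        cuImportExternalSemaphore(&sem, &sdesc);

        CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS sig = {0};
        CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS  wai = {0};
        cuSignalExternalSemaphoresAsync(&sem, &sig, 1, producer);
        cuWaitExternalSemaphoresAsync(&sem, &wai, 1, consumer);

        cuStreamSynchronize(consumer);    /* outstanding ops must finish */
        cuDestroyExternalSemaphore(sem);  /* before the semaphore is destroyed */
    }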
+ * + * If the semaphore object is of the type + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then waiting on + * the semaphore will wait until the value of the semaphore is + * greater than or equal to + * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::fence::value. + * + * \param extSemArray - External semaphores to be waited on + * \param paramsArray - Array of semaphore parameters + * \param numExtSems - Number of semaphores to wait on + * \param stream - Stream to enqueue the wait operations in + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuImportExternalSemaphore, + * ::cuDestroyExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync + */ +CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); + +/** + * \brief Destroys an external semaphore + * + * Destroys an external semaphore object and releases any references + * to the underlying resource. Any outstanding signals or waits must + * have completed before the semaphore is destroyed. + * + * \param extSem - External semaphore to be destroyed + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuImportExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem); + +#endif /* __CUDA_API_VERSION >= 10000 */ + +/** @} */ /* END CUDA_EXTRES_INTEROP */ + +/** + * \defgroup CUDA_MEMOP Stream memory operations + * + * ___MANBRIEF___ Stream memory operations of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the stream memory operations of the low-level CUDA + * driver application programming interface. + * + * The whole set of operations is disabled by default. Users are required + * to explicitly enable them, e.g. on Linux by passing the kernel module + * parameter shown below: + * modprobe nvidia NVreg_EnableStreamMemOPs=1 + * There is currently no way to enable these operations on other operating + * systems. + * + * Users can programmatically query whether the device supports these + * operations with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. + * + * Support for the ::CU_STREAM_WAIT_VALUE_NOR flag can be queried with + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR. + * + * Support for the ::cuStreamWriteValue64() and ::cuStreamWaitValue64() + * functions, as well as for the ::CU_STREAM_MEM_OP_WAIT_VALUE_64 and + * ::CU_STREAM_MEM_OP_WRITE_VALUE_64 flags, can be queried with + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. + * + * Support for both ::CU_STREAM_WAIT_VALUE_FLUSH and + * ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES requires dedicated platform + * hardware features and can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES. + * + * Note that all memory pointers passed as parameters to these operations + * are device pointers. Where necessary a device pointer should be + * obtained, for example with ::cuMemHostGetDevicePointer(). + * + * None of the operations accepts pointers to managed memory buffers + * (::cuMemAllocManaged). 
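A minimal sketch of the query-then-use pattern this section describes, driving a 32-bit device-visible flag; `flag` is assumed to already be a device pointer (for example obtained via ::cuMemHostGetDevicePointer()), and the helper name is illustrative:

    #include <cuda.h>

    /* Block 'stream' until *flag >= 1, then publish 2 through the stream,
     * after first checking that the device supports the operations. */
    static void memop_handshake(CUdevice dev, CUstream stream, CUdeviceptr flag)
    {
        int supported = 0;
        cuDeviceGetAttribute(&supported,
                             CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS, dev);
        if (!supported)
            return;

        cuStreamWaitValue32(stream, flag, 1, CU_STREAM_WAIT_VALUE_GEQ);
        cuStreamWriteValue32(stream, flag, 2, CU_STREAM_WRITE_VALUE_DEFAULT);
    }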
+ * + * @{ + */ + #if __CUDA_API_VERSION >= 8000 /** * \brief Wait on a memory location @@ -8841,8 +10079,10 @@ CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUeven * be used with managed memory (::cuMemAllocManaged). * * Support for this can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. The only requirement for basic - * support is that on Windows, a device must be in TCC mode. + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. + * + * Support for CU_STREAM_WAIT_VALUE_NOR can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR. * * \param stream The stream to synchronize on the memory location. * \param addr The memory location to wait on. @@ -8877,9 +10117,7 @@ CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32 * should be obtained with ::cuMemHostGetDevicePointer(). * * Support for this can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. The requirements are - * compute capability 7.0 or greater, and on Windows, that the device be in - * TCC mode. + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. * * \param stream The stream to synchronize on the memory location. * \param addr The memory location to wait on. @@ -8914,8 +10152,7 @@ CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64 * be used with managed memory (::cuMemAllocManaged). * * Support for this can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. The only requirement for basic - * support is that on Windows, a device must be in TCC mode. + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. * * \param stream The stream to do the write in. * \param addr The device address to write to. @@ -8949,9 +10186,7 @@ CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint3 * should be obtained with ::cuMemHostGetDevicePointer(). * * Support for this can be queried with ::cuDeviceGetAttribute() and - * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. The requirements are - * compute capability 7.0 or greater, and on Windows, that the device be in - * TCC mode. + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. * * \param stream The stream to do the write in. * \param addr The device address to write to. @@ -9009,7 +10244,7 @@ CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint6 CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); #endif /* __CUDA_API_VERSION >= 8000 */ -/** @} */ /* END CUDA_EVENT */ +/** @} */ /* END CUDA_MEMOP */ /** * \defgroup CUDA_EXEC Execution Control @@ -9053,11 +10288,11 @@ CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstrea * would return the value 13. Note that this will return a value of 10 for * legacy cubins that do not have a properly-encoded binary architecture * version. - * - ::CU_FUNC_CACHE_MODE_CA: The attribute to indicate whether the function has + * - ::CU_FUNC_CACHE_MODE_CA: The attribute to indicate whether the function has * been compiled with user specified option "-Xptxas --dlcm=ca" set . * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of - * dynamically-allocated shared memory. - * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: Preferred shared memory-L1 + * dynamically-allocated shared memory. 
+ * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: Preferred shared memory-L1 * cache split ratio in percent of shared memory. * * \param pi - Returned attribute value @@ -9102,7 +10337,7 @@ CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunc * device attribute ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN. * The maximal size of requestable dynamic shared memory may differ by GPU * architecture. - * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: On devices where the L1 + * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: On devices where the L1 * cache and shared memory use the same hardware resources, this sets the shared memory * carveout preference, in percent of the total resources. This is only a hint, and the * driver can choose a different ratio if required to execute the function. @@ -9178,28 +10413,28 @@ CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config); /** * \brief Sets the shared memory configuration for a device function. * - * On devices with configurable shared memory banks, this function will + * On devices with configurable shared memory banks, this function will * force all subsequent launches of the specified device function to have * the given shared memory bank size configuration. On any given launch of the * function, the shared memory configuration of the device will be temporarily * changed if needed to suit the function's preferred configuration. Changes in - * shared memory configuration between subsequent launches of functions, + * shared memory configuration between subsequent launches of functions, * may introduce a device side synchronization point. * - * Any per-function setting of shared memory bank size set via + * Any per-function setting of shared memory bank size set via * ::cuFuncSetSharedMemConfig will override the context wide setting set with * ::cuCtxSetSharedMemConfig. * * Changing the shared memory bank size will not increase shared memory usage - * or affect occupancy of kernels, but may have major effects on performance. + * or affect occupancy of kernels, but may have major effects on performance. * Larger bank sizes will allow for greater potential bandwidth to shared memory, - * but will change what kinds of accesses to shared memory will result in bank + * but will change what kinds of accesses to shared memory will result in bank * conflicts. * * This function will do nothing on devices with fixed shared memory bank size. * * The supported bank configurations are: - * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: use the context's shared memory + * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: use the context's shared memory * configuration when launching this function. * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to * be natively four bytes when launching this function. @@ -9451,7 +10686,7 @@ CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, * No two kernels can be launched on the same device. All the devices targeted by this * multi-device launch must be identical. All devices must have a non-zero value for the * device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH. - * + * * All kernels launched must be identical with respect to the compiled code. Note that * any __device__, __constant__ or __managed__ variables present in the module that owns * the kernel launched on each device, are independently instantiated on every device. 
@@ -9577,6 +10812,75 @@ CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS *launch #endif /* __CUDA_API_VERSION >= 9000 */ +#if __CUDA_API_VERSION >= 10000 + +/** + * \brief Enqueues a host function call in a stream + * + * Enqueues a host function to run in a stream. The function will be called + * after currently enqueued work and will block work added after it. + * + * The host function must not make any CUDA API calls. Attempting to use a + * CUDA API may result in ::CUDA_ERROR_NOT_PERMITTED, but this is not required. + * The host function must not perform any synchronization that may depend on + * outstanding CUDA work not mandated to run earlier. Host functions without a + * mandated order (such as in independent streams) execute in undefined order + * and may be serialized. + * + * For the purposes of Unified Memory, execution makes a number of guarantees: + *
      + *
+ * - The stream is considered idle for the duration of the function's
+ *   execution. Thus, for example, the function may always use memory attached
+ *   to the stream it was enqueued in.
+ * - The start of execution of the function has the same effect as
+ *   synchronizing an event recorded in the same stream immediately prior to
+ *   the function. It thus synchronizes streams which have been "joined"
+ *   prior to the function.
+ * - Adding device work to any stream does not have the effect of making
+ *   the stream active until all preceding host functions and stream callbacks
+ *   have executed. Thus, for example, a function might use global attached
+ *   memory even if work has been added to another stream, if the work has
+ *   been ordered behind the function call with an event.
+ * - Completion of the function does not cause a stream to become
+ *   active except as described above. The stream will remain idle
+ *   if no device work follows the function, and will remain idle across
+ *   consecutive host functions or stream callbacks without device work in
+ *   between. Thus, for example, stream synchronization can be done by
+ *   signaling from a host function at the end of the stream.
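A minimal sketch of the pattern these guarantees enable: a host callback that publishes completion to the host without calling back into CUDA (names are illustrative):

    #include <cuda.h>

    /* The callback must not call any CUDA API, so it only flips a flag. */
    static void CUDA_CB notify_ready(void *userData)
    {
        *(volatile int *)userData = 1;
    }

    /* Runs after all work currently in 'stream', before work added later. */
    static void enqueue_notification(CUstream stream, int *ready)
    {
        cuLaunchHostFunc(stream, notify_ready, ready);
    }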
    + * + * Note that, in contrast to ::cuStreamAddCallback, the function will not be + * called in the event of an error in the CUDA context. + * + * \param hStream - Stream to enqueue function call in + * \param fn - The function to call once preceding stream operations are complete + * \param userData - User-specified data to be passed to the function + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamWaitEvent, + * ::cuStreamDestroy, + * ::cuMemAllocManaged, + * ::cuStreamAttachMemAsync, + * ::cuStreamAddCallback + */ +CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData); + +#endif /* __CUDA_API_VERSION >= 10000 */ + /** @} */ /* END CUDA_EXEC */ /** @@ -9625,7 +10929,7 @@ CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS *launch * ::cuLaunchGridAsync, * ::cuLaunchKernel */ -CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z); +__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z); /** * \brief Sets the dynamic shared-memory size for the function @@ -9659,7 +10963,7 @@ CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z); * ::cuLaunchGridAsync, * ::cuLaunchKernel */ -CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes); +__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes); /** * \brief Sets the parameter size for the function @@ -9691,7 +10995,7 @@ CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes); * ::cuLaunchGridAsync, * ::cuLaunchKernel */ -CUresult CUDAAPI cuParamSetSize(CUfunction hfunc, unsigned int numbytes); +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetSize(CUfunction hfunc, unsigned int numbytes); /** * \brief Adds an integer parameter to the function's argument list @@ -9724,7 +11028,7 @@ CUresult CUDAAPI cuParamSetSize(CUfunction hfunc, unsigned int numbytes); * ::cuLaunchGridAsync, * ::cuLaunchKernel */ -CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset, unsigned int value); +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset, unsigned int value); /** * \brief Adds a floating-point parameter to the function's argument list @@ -9757,7 +11061,7 @@ CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset, unsigned int value); * ::cuLaunchGridAsync, * ::cuLaunchKernel */ -CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset, float value); +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset, float value); /** * \brief Adds arbitrary data to the function's argument list @@ -9792,7 +11096,7 @@ CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset, float value); * ::cuLaunchGridAsync, * ::cuLaunchKernel */ -CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes); +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes); /** * \brief Launches a CUDA function @@ -9829,7 +11133,7 @@ CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned i * ::cuLaunchGridAsync, * ::cuLaunchKernel */ -CUresult CUDAAPI cuLaunch(CUfunction f); +__CUDA_DEPRECATED CUresult CUDAAPI cuLaunch(CUfunction f); /** * \brief Launches a 
CUDA function @@ -9868,7 +11172,7 @@ CUresult CUDAAPI cuLaunch(CUfunction f); * ::cuLaunchGridAsync, * ::cuLaunchKernel */ -CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, int grid_height); +__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, int grid_height); /** * \brief Launches a CUDA function @@ -9897,10 +11201,10 @@ CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, int grid_height); * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED * - * \note In certain cases where cubins are created with no ABI (i.e., using \p ptxas \p --abi-compile \p no), - * this function may serialize kernel launches. In order to force the CUDA driver to retain - * asynchronous behavior, set the ::CU_CTX_LMEM_RESIZE_TO_MAX flag during context creation (see ::cuCtxCreate). - * + * \note In certain cases where cubins are created with no ABI (i.e., using \p ptxas \p --abi-compile \p no), + * this function may serialize kernel launches. In order to force the CUDA driver to retain + * asynchronous behavior, set the ::CU_CTX_LMEM_RESIZE_TO_MAX flag during context creation (see ::cuCtxCreate). + * * \note_null_stream * \notefnerr * @@ -9915,7 +11219,7 @@ CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, int grid_height); * ::cuLaunchGrid, * ::cuLaunchKernel */ -CUresult CUDAAPI cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream); +__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream); /** @@ -9940,9 +11244,1005 @@ CUresult CUDAAPI cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height * ::CUDA_ERROR_INVALID_VALUE * \notefnerr */ -CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef); +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef); /** @} */ /* END CUDA_EXEC_DEPRECATED */ +#if __CUDA_API_VERSION >= 10000 +/** + * \defgroup CUDA_GRAPH Graph Management + * + * ___MANBRIEF___ graph management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the graph management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Creates a graph + * + * Creates an empty graph, which is returned via \p phGraph. + * + * \param phGraph - Returns newly created graph + * \param flags - Graph creation flags, must be 0 + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode, + * ::cuGraphInstantiate, + * ::cuGraphDestroy, + * ::cuGraphGetNodes, + * ::cuGraphGetRootNodes, + * ::cuGraphGetEdges, + * ::cuGraphClone + */ +CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags); + +/** + * \brief Creates a kernel execution node and adds it to a graph + * + * Creates a new kernel execution node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies and arguments specified in \p nodeParams. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. 
+ * A handle to the new node will be returned in \p phGraphNode. + * + * The CUDA_KERNEL_NODE_PARAMS structure is defined as: + * + * \code + * typedef struct CUDA_KERNEL_NODE_PARAMS_st { + * CUfunction func; + * unsigned int gridDimX; + * unsigned int gridDimY; + * unsigned int gridDimZ; + * unsigned int blockDimX; + * unsigned int blockDimY; + * unsigned int blockDimZ; + * unsigned int sharedMemBytes; + * void **kernelParams; + * void **extra; + * } CUDA_KERNEL_NODE_PARAMS; + * \endcode + * + * When the graph is launched, the node will invoke kernel \p func on a (\p gridDimX x + * \p gridDimY x \p gridDimZ) grid of blocks. Each block contains + * (\p blockDimX x \p blockDimY x \p blockDimZ) threads. + * + * \p sharedMemBytes sets the amount of dynamic shared memory that will be + * available to each thread block. + * + * Kernel parameters to \p func can be specified in one of two ways: + * + * 1) Kernel parameters can be specified via \p kernelParams. If the kernel has N + * parameters, then \p kernelParams needs to be an array of N pointers. Each pointer, + * from \p kernelParams[0] to \p kernelParams[N-1], points to the region of memory from which the actual + * parameter will be copied. The number of kernel parameters and their offsets and sizes do not need + * to be specified as that information is retrieved directly from the kernel's image. + * + * 2) Kernel parameters can also be packaged by the application into a single buffer that is passed in + * via \p extra. This places the burden on the application of knowing each kernel + * parameter's size and alignment/padding within the buffer. The \p extra parameter exists + * to allow this function to take additional less commonly used arguments. \p extra specifies + * a list of names of extra settings and their corresponding values. Each extra setting name is + * immediately followed by the corresponding value. The list must be terminated with either NULL or + * CU_LAUNCH_PARAM_END. + * + * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra + * array; + * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next + * value in \p extra will be a pointer to a buffer + * containing all the kernel parameters for launching kernel + * \p func; + * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next + * value in \p extra will be a pointer to a size_t + * containing the size of the buffer specified with + * ::CU_LAUNCH_PARAM_BUFFER_POINTER; + * + * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel parameters are specified with both + * \p kernelParams and \p extra (i.e. both \p kernelParams and + * \p extra are non-NULL). + * + * The \p kernelParams or \p extra array, as well as the argument values it points to, + * are copied during this call. + * + * \note Kernels launched using graphs must not use texture and surface references. Reading or + * writing through any texture or surface reference is undefined behavior. + * This restriction does not apply to texture and surface objects. 
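A minimal sketch of the \p kernelParams path described above, for a hypothetical kernel taking (float *out, int n); error checks are elided and the helper name is illustrative:

    #include <cuda.h>

    /* Add a root kernel node; 'args' holds one pointer per kernel
     * parameter and is copied by the call, so stack storage is fine. */
    static CUgraphNode add_kernel_node(CUgraph graph, CUfunction kernel,
                                       CUdeviceptr out, int n)
    {
        void *args[] = { &out, &n };
        CUDA_KERNEL_NODE_PARAMS p = {0};
        p.func = kernel;
        p.gridDimX = (unsigned)((n + 255) / 256);
        p.gridDimY = p.gridDimZ = 1;
        p.blockDimX = 256;
        p.blockDimY = p.blockDimZ = 1;
        p.sharedMemBytes = 0;     /* no dynamic shared memory */
        p.kernelParams = args;
        p.extra = NULL;           /* never set both kernelParams and extra */

        CUgraphNode node;
        cuGraphAddKernelNode(&node, graph, NULL, 0, &p); /* 0 deps: root */
        return node;
    }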
+ * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param nodeParams - Parameters for the GPU execution node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchKernel, + * ::cuGraphKernelNodeGetParams, + * ::cuGraphKernelNodeSetParams, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddKernelNode(CUgraphNode *phGraphNode, CUgraph hGraph, CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams); + +/** + * \brief Returns a kernel node's parameters + * + * Returns the parameters of kernel node \p hNode in \p nodeParams. + * The \p kernelParams or \p extra array returned in \p nodeParams, + * as well as the argument values it points to, are owned by the node. + * This memory remains valid until the node is destroyed or its + * parameters are modified, and should not be modified + * directly. Use ::cuGraphKernelNodeSetParams to update the + * parameters of this node. + * + * The params will contain either \p kernelParams or \p extra, + * according to which of these was most recently set on the node. + * + * \param hNode - Node to get the parameters for + * \param nodeParams - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchKernel, + * ::cuGraphAddKernelNode, + * ::cuGraphKernelNodeSetParams + */ +CUresult CUDAAPI cuGraphKernelNodeGetParams(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS *nodeParams); + +/** + * \brief Sets a kernel node's parameters + * + * Sets the parameters of kernel node \p hNode to \p nodeParams. + * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchKernel, + * ::cuGraphAddKernelNode, + * ::cuGraphKernelNodeGetParams + */ +CUresult CUDAAPI cuGraphKernelNodeSetParams(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams); + +/** + * \brief Creates a memcpy node and adds it to a graph + * + * Creates a new memcpy node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * When the graph is launched, the node will perform the memcpy described by \p copyParams. + * See ::cuMemcpy3D() for a description of the structure and its restrictions. + * + * Memcpy nodes have some additional restrictions with regards to managed memory, if the + * system contains at least one device which has a zero value for the device attribute + * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. 
If one or more of the operands refer + * to managed memory, then using the memory type ::CU_MEMORYTYPE_UNIFIED is disallowed + * for those operand(s). The managed memory will be treated as residing on either the + * host or the device, depending on which memory type is specified. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param copyParams - Parameters for the memory copy + * \param ctx - Context on which to run the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemcpy3D, + * ::cuGraphMemcpyNodeGetParams, + * ::cuGraphMemcpyNodeSetParams, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph, CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D *copyParams, CUcontext ctx); + +/** + * \brief Returns a memcpy node's parameters + * + * Returns the parameters of memcpy node \p hNode in \p nodeParams. + * + * \param hNode - Node to get the parameters for + * \param nodeParams - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemcpy3D, + * ::cuGraphAddMemcpyNode, + * ::cuGraphMemcpyNodeSetParams + */ +CUresult CUDAAPI cuGraphMemcpyNodeGetParams(CUgraphNode hNode, CUDA_MEMCPY3D *nodeParams); + +/** + * \brief Sets a memcpy node's parameters + * + * Sets the parameters of memcpy node \p hNode to \p nodeParams. + * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemcpy3D, + * ::cuGraphAddMemcpyNode, + * ::cuGraphMemcpyNodeGetParams + */ +CUresult CUDAAPI cuGraphMemcpyNodeSetParams(CUgraphNode hNode, const CUDA_MEMCPY3D *nodeParams); + +/** + * \brief Creates a memset node and adds it to a graph + * + * Creates a new memset node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * The element size must be 1, 2, or 4 bytes. + * When the graph is launched, the node will perform the memset described by \p memsetParams. 
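A minimal sketch of a memset node that zeroes an array of 32-bit elements; \p ctx pins the context the node runs in, and the helper name is illustrative:

    #include <cuda.h>

    static CUgraphNode add_zero_fill_node(CUgraph graph, CUcontext ctx,
                                          CUdeviceptr dst, size_t n)
    {
        CUDA_MEMSET_NODE_PARAMS p = {0};
        p.dst = dst;
        p.value = 0;
        p.elementSize = 4;   /* must be 1, 2, or 4 bytes */
        p.width = n;         /* elements per row */
        p.height = 1;        /* single row, so pitch is unused */
        p.pitch = 0;

        CUgraphNode node;
        cuGraphAddMemsetNode(&node, graph, NULL, 0, &p, ctx);
        return node;
    }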
+ * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param memsetParams - Parameters for the memory set + * \param ctx - Context on which to run the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_CONTEXT + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemsetD2D32, + * ::cuGraphMemsetNodeGetParams, + * ::cuGraphMemsetNodeSetParams, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode + */ +CUresult CUDAAPI cuGraphAddMemsetNode(CUgraphNode *phGraphNode, CUgraph hGraph, CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx); + +/** + * \brief Returns a memset node's parameters + * + * Returns the parameters of memset node \p hNode in \p nodeParams. + * + * \param hNode - Node to get the parameters for + * \param nodeParams - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemsetD2D32, + * ::cuGraphAddMemsetNode, + * ::cuGraphMemsetNodeSetParams + */ +CUresult CUDAAPI cuGraphMemsetNodeGetParams(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS *nodeParams); + +/** + * \brief Sets a memset node's parameters + * + * Sets the parameters of memset node \p hNode to \p nodeParams. + * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemsetD2D32, + * ::cuGraphAddMemsetNode, + * ::cuGraphMemsetNodeGetParams + */ +CUresult CUDAAPI cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *nodeParams); + +/** + * \brief Creates a host execution node and adds it to a graph + * + * Creates a new CPU execution node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies and arguments specified in \p nodeParams. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * When the graph is launched, the node will invoke the specified CPU function. 
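A minimal sketch of a host node; as with ::cuLaunchHostFunc, the callback must not call into CUDA (names are illustrative):

    #include <cuda.h>
    #include <stdio.h>

    static void CUDA_CB log_step(void *userData)
    {
        printf("graph reached: %s\n", (const char *)userData);
    }

    static CUgraphNode add_log_node(CUgraph graph, CUgraphNode *deps,
                                    size_t numDeps)
    {
        CUDA_HOST_NODE_PARAMS p;
        p.fn = log_step;
        p.userData = (void *)"checkpoint";

        CUgraphNode node;
        cuGraphAddHostNode(&node, graph, deps, numDeps, &p);
        return node;
    }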
+ * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param nodeParams - Parameters for the host node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchHostFunc, + * ::cuGraphHostNodeGetParams, + * ::cuGraphHostNodeSetParams, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph, CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS *nodeParams); + +/** + * \brief Returns a host node's parameters + * + * Returns the parameters of host node \p hNode in \p nodeParams. + * + * \param hNode - Node to get the parameters for + * \param nodeParams - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchHostFunc, + * ::cuGraphAddHostNode, + * ::cuGraphHostNodeSetParams + */ +CUresult CUDAAPI cuGraphHostNodeGetParams(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS *nodeParams); + +/** + * \brief Sets a host node's parameters + * + * Sets the parameters of host node \p hNode to \p nodeParams. + * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchHostFunc, + * ::cuGraphAddHostNode, + * ::cuGraphHostNodeGetParams + */ +CUresult CUDAAPI cuGraphHostNodeSetParams(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams); + +/** + * \brief Creates a child graph node and adds it to a graph + * + * Creates a new node which executes an embedded graph, and adds it to \p hGraph with + * \p numDependencies dependencies specified via \p dependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * The node executes an embedded child graph. The child graph is cloned in this call. 
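A minimal sketch; because the child is cloned at insertion time, later edits to \p child leave the parent graph untouched, and the embedded copy stays reachable through ::cuGraphChildGraphNodeGetGraph (the helper name is illustrative):

    #include <cuda.h>

    static CUgraphNode embed_subgraph(CUgraph parent, CUgraph child,
                                      CUgraphNode dep)
    {
        CUgraphNode node;
        cuGraphAddChildGraphNode(&node, parent, &dep, 1, child);

        CUgraph embedded; /* owned by the node; do not destroy separately */
        cuGraphChildGraphNodeGetGraph(node, &embedded);
        return node;
    }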
+ * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param childGraph - The graph to clone into this node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphChildGraphNodeGetGraph, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode, + * ::cuGraphClone + */ +CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode, CUgraph hGraph, CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph); + +/** + * \brief Gets a handle to the embedded graph of a child graph node + * + * Gets a handle to the embedded graph in a child graph node. This call + * does not clone the graph. Changes to the graph will be reflected in + * the node, and the node retains ownership of the graph. + * + * \param hNode - Node to get the embedded graph for + * \param phGraph - Location to store a handle to the graph + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddChildGraphNode, + * ::cuGraphNodeFindInClone + */ +CUresult CUDAAPI cuGraphChildGraphNodeGetGraph(CUgraphNode hNode, CUgraph *phGraph); + +/** + * \brief Creates an empty node and adds it to a graph + * + * Creates a new node which performs no operation, and adds it to \p hGraph with + * \p numDependencies dependencies specified via \p dependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * An empty node performs no operation during execution, but can be used for + * transitive ordering. For example, a phased execution graph with 2 groups of n + * nodes with a barrier between them can be represented using an empty node and + * 2*n dependency edges, rather than no empty node and n^2 dependency edges. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph, CUgraphNode *dependencies, size_t numDependencies); + +/** + * \brief Clones a graph + * + * This function creates a copy of \p originalGraph and returns it in \p phGraphClone. + * All parameters are copied into the cloned graph. The original graph may be modified + * after this call without affecting the clone. + * + * Child graph nodes in the original graph are recursively copied into the clone.
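A minimal sketch pairing ::cuGraphClone with ::cuGraphNodeFindInClone, which is described next, to locate the clone's copy of a node before re-parameterizing it (names are illustrative):

    #include <cuda.h>

    static CUgraphNode clone_and_find(CUgraph original, CUgraphNode node,
                                      CUgraph *clone)
    {
        CUgraphNode counterpart = NULL;
        cuGraphClone(clone, original);
        cuGraphNodeFindInClone(&counterpart, node, *clone);
        return counterpart; /* e.g. pass to cuGraphKernelNodeSetParams */
    }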
+ * + * \param phGraphClone - Returns newly created cloned graph + * \param originalGraph - Graph to clone + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate, + * ::cuGraphNodeFindInClone + */ +CUresult CUDAAPI cuGraphClone(CUgraph *phGraphClone, CUgraph originalGraph); + +/** + * \brief Finds a cloned version of a node + * + * This function returns the node in \p hClonedGraph corresponding to \p hOriginalNode + * in the original graph. + * + * \p hClonedGraph must have been cloned from \p hOriginalGraph via ::cuGraphClone. + * \p hOriginalNode must have been in \p hOriginalGraph at the time of the call to + * ::cuGraphClone, and the corresponding cloned node in \p hClonedGraph must not have + * been removed. The cloned node is then returned via \p phNode. + * + * \param phNode - Returns handle to the cloned node + * \param hOriginalNode - Handle to the original node + * \param hClonedGraph - Cloned graph to query + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphClone + */ +CUresult CUDAAPI cuGraphNodeFindInClone(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph); + +/** + * \brief Returns a node's type + * + * Returns the node type of \p hNode in \p type. + * + * \param hNode - Node to query + * \param type - Pointer to return the node type + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphGetNodes, + * ::cuGraphGetRootNodes, + * ::cuGraphChildGraphNodeGetGraph, + * ::cuGraphKernelNodeGetParams, + * ::cuGraphKernelNodeSetParams, + * ::cuGraphHostNodeGetParams, + * ::cuGraphHostNodeSetParams, + * ::cuGraphMemcpyNodeGetParams, + * ::cuGraphMemcpyNodeSetParams, + * ::cuGraphMemsetNodeGetParams, + * ::cuGraphMemsetNodeSetParams + */ +CUresult CUDAAPI cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType *type); + +/** + * \brief Returns a graph's nodes + * + * Returns a list of \p hGraph's nodes. \p nodes may be NULL, in which case this + * function will return the number of nodes in \p numNodes. Otherwise, + * \p numNodes entries will be filled in. If \p numNodes is higher than the actual + * number of nodes, the remaining entries in \p nodes will be set to NULL, and the + * number of nodes actually obtained will be returned in \p numNodes. + * + * \param hGraph - Graph to query + * \param nodes - Pointer to return the nodes + * \param numNodes - See description + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate, + * ::cuGraphGetRootNodes, + * ::cuGraphGetEdges, + * ::cuGraphNodeGetType, + * ::cuGraphNodeGetDependencies, + * ::cuGraphNodeGetDependentNodes + */ +CUresult CUDAAPI cuGraphGetNodes(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes); + +/** + * \brief Returns a graph's root nodes + * + * Returns a list of \p hGraph's root nodes. \p rootNodes may be NULL, in which case this + * function will return the number of root nodes in \p numRootNodes. Otherwise, + * \p numRootNodes entries will be filled in.
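A minimal sketch of the NULL-then-allocate idiom used by ::cuGraphGetNodes and by the sibling queries that follow; the caller frees the returned array, and the helper name is illustrative:

    #include <cuda.h>
    #include <stdlib.h>

    /* First call queries the count; second call fills the array. */
    static CUgraphNode *get_all_nodes(CUgraph graph, size_t *count)
    {
        cuGraphGetNodes(graph, NULL, count);
        CUgraphNode *nodes = malloc(*count * sizeof *nodes);
        if (nodes)
            cuGraphGetNodes(graph, nodes, count);
        return nodes;
    }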
If \p numRootNodes is higher than the actual + * number of root nodes, the remaining entries in \p rootNodes will be set to NULL, and the + * number of nodes actually obtained will be returned in \p numRootNodes. + * + * \param hGraph - Graph to query + * \param rootNodes - Pointer to return the root nodes + * \param numRootNodes - See description + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate, + * ::cuGraphGetNodes, + * ::cuGraphGetEdges, + * ::cuGraphNodeGetType, + * ::cuGraphNodeGetDependencies, + * ::cuGraphNodeGetDependentNodes + */ +CUresult CUDAAPI cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes); + +/** + * \brief Returns a graph's dependency edges + * + * Returns a list of \p hGraph's dependency edges. Edges are returned via corresponding + * indices in \p from and \p to; that is, the node in \p to[i] has a dependency on the + * node in \p from[i]. \p from and \p to may both be NULL, in which + * case this function only returns the number of edges in \p numEdges. Otherwise, + * \p numEdges entries will be filled in. If \p numEdges is higher than the actual + * number of edges, the remaining entries in \p from and \p to will be set to NULL, and + * the number of edges actually returned will be written to \p numEdges. + * + * \param hGraph - Graph to get the edges from + * \param from - Location to return edge endpoints + * \param to - Location to return edge endpoints + * \param numEdges - See description + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphGetNodes, + * ::cuGraphGetRootNodes, + * ::cuGraphAddDependencies, + * ::cuGraphRemoveDependencies, + * ::cuGraphNodeGetDependencies, + * ::cuGraphNodeGetDependentNodes + */ +CUresult CUDAAPI cuGraphGetEdges(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t *numEdges); + +/** + * \brief Returns a node's dependencies + * + * Returns a list of \p node's dependencies. \p dependencies may be NULL, in which case this + * function will return the number of dependencies in \p numDependencies. Otherwise, + * \p numDependencies entries will be filled in. If \p numDependencies is higher than the actual + * number of dependencies, the remaining entries in \p dependencies will be set to NULL, and the + * number of nodes actually obtained will be returned in \p numDependencies. + * + * \param hNode - Node to query + * \param dependencies - Pointer to return the dependencies + * \param numDependencies - See description + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphNodeGetDependentNodes, + * ::cuGraphGetNodes, + * ::cuGraphGetRootNodes, + * ::cuGraphGetEdges, + * ::cuGraphAddDependencies, + * ::cuGraphRemoveDependencies + */ +CUresult CUDAAPI cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies); + +/** + * \brief Returns a node's dependent nodes + * + * Returns a list of \p node's dependent nodes. \p dependentNodes may be NULL, in which + * case this function will return the number of dependent nodes in \p numDependentNodes. + * Otherwise, \p numDependentNodes entries will be filled in. 
If \p numDependentNodes is + * higher than the actual number of dependent nodes, the remaining entries in + * \p dependentNodes will be set to NULL, and the number of nodes actually obtained will + * be returned in \p numDependentNodes. + * + * \param hNode - Node to query + * \param dependentNodes - Pointer to return the dependent nodes + * \param numDependentNodes - See description + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphNodeGetDependencies, + * ::cuGraphGetNodes, + * ::cuGraphGetRootNodes, + * ::cuGraphGetEdges, + * ::cuGraphAddDependencies, + * ::cuGraphRemoveDependencies + */ +CUresult CUDAAPI cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes); + +/** + * \brief Adds dependency edges to a graph + * + * The number of dependencies to be added is defined by \p numDependencies. + * Elements in \p from and \p to at corresponding indices define a dependency. + * Each node in \p from and \p to must belong to \p hGraph. + * + * If \p numDependencies is 0, elements in \p from and \p to will be ignored. + * Specifying an existing dependency will return an error. + * + * \param hGraph - Graph to which dependencies are added + * \param from - Array of nodes that provide the dependencies + * \param to - Array of dependent nodes + * \param numDependencies - Number of dependencies to be added + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphRemoveDependencies, + * ::cuGraphGetEdges, + * ::cuGraphNodeGetDependencies, + * ::cuGraphNodeGetDependentNodes + */ +CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t numDependencies); + +/** + * \brief Removes dependency edges from a graph + * + * The number of dependencies to be removed is defined by \p numDependencies. + * Elements in \p from and \p to at corresponding indices define a dependency. + * Each node in \p from and \p to must belong to \p hGraph. + * + * If \p numDependencies is 0, elements in \p from and \p to will be ignored. + * Specifying a non-existing dependency will return an error. + * + * \param hGraph - Graph from which to remove dependencies + * \param from - Array of nodes that provide the dependencies + * \param to - Array of dependent nodes + * \param numDependencies - Number of dependencies to be removed + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddDependencies, + * ::cuGraphGetEdges, + * ::cuGraphNodeGetDependencies, + * ::cuGraphNodeGetDependentNodes + */ +CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t numDependencies); + +/** + * \brief Removes a node from the graph + * + * Removes \p hNode from its graph. This operation also severs any dependencies of other nodes + * on \p hNode and vice versa.
+ * + * \param hNode - Node to remove + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode); + +/** + * \brief Creates an executable graph from a graph + * + * Instantiates \p hGraph as an executable graph. The graph is validated for any + * structural constraints or intra-node constraints which were not previously + * validated. If instantiation is successful, a handle to the instantiated graph + * is returned in \p graphExec. + * + * If there are any errors, diagnostic information may be returned in \p errorNode and + * \p logBuffer. This is the primary way to inspect instantiation errors. The output + * will be null terminated unless the diagnostics overflow + * the buffer. In this case, they will be truncated, and the last byte can be + * inspected to determine if truncation occurred. + * + * \param phGraphExec - Returns instantiated graph + * \param hGraph - Graph to instantiate + * \param phErrorNode - In case of an instantiation error, this may be modified to + * indicate a node contributing to the error + * \param logBuffer - A character buffer to store diagnostic messages + * \param bufferSize - Size of the log buffer in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate, + * ::cuGraphLaunch, + * ::cuGraphExecDestroy + */ +CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize); + +/** + * \brief Launches an executable graph in a stream + * + * Executes \p hGraphExec in \p hStream. Only one instance of \p hGraphExec may be executing + * at a time. Each launch is ordered behind both any previous work in \p hStream + * and any previous launches of \p hGraphExec. To execute a graph concurrently, it must be + * instantiated multiple times into multiple executable graphs. + * + * \param hGraphExec - Executable graph to launch + * \param hStream - Stream in which to launch the graph + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphInstantiate, + * ::cuGraphExecDestroy + */ +CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream); + +/** + * \brief Destroys an executable graph + * + * Destroys the executable graph specified by \p hGraphExec, as well + * as all of its executable nodes. If the executable graph is + * in-flight, it will not be terminated, but rather freed + * asynchronously on completion. + * + * \param hGraphExec - Executable graph to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphInstantiate, + * ::cuGraphLaunch + */ +CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec); + +/** + * \brief Destroys a graph + * + * Destroys the graph specified by \p hGraph, as well as all of its nodes. 
+ * + * \param hGraph - Graph to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate + */ +CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph); +/** @} */ /* END CUDA_GRAPH */ +#endif /* __CUDA_API_VERSION >= 10000 */ #if __CUDA_API_VERSION >= 6050 /** @@ -10023,7 +12323,7 @@ CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, CUf * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags */ CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags); - + /** * \brief Suggest a launch configuration with reasonable occupancy * @@ -10171,7 +12471,7 @@ CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int * * Binds the CUDA mipmapped array \p hMipmappedArray to the texture reference \p hTexRef. * Any previous address or CUDA array state associated with the texture reference - * is superseded by this function. \p Flags must be set to ::CU_TRSA_OVERRIDE_FORMAT. + * is superseded by this function. \p Flags must be set to ::CU_TRSA_OVERRIDE_FORMAT. * Any CUDA array previously bound to \p hTexRef is unbound. * * \param hTexRef - Texture reference to bind @@ -10216,7 +12516,7 @@ CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hM * The total number of elements (or texels) in the linear address range * cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. * The number of elements is computed as (\p bytes / bytesPerElement), - * where bytesPerElement is determined from the data format and number of + * where bytesPerElement is determined from the data format and number of * components set using ::cuTexRefSetFormat(). * * \param ByteOffset - Returned byte offset @@ -10261,14 +12561,14 @@ CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdevi * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. * * \p Pitch has to be aligned to the hardware-specific texture pitch alignment. - * This value can be queried using the device attribute - * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. If an unaligned \p Pitch is + * This value can be queried using the device attribute + * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. If an unaligned \p Pitch is * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. * * Width and Height, which are specified in elements (or texels), cannot exceed * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. - * \p Pitch, which is specified in bytes, cannot exceed + * \p Pitch, which is specified in bytes, cannot exceed * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. * * \param hTexRef - Texture reference to bind @@ -10344,7 +12644,7 @@ CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int Num * \endcode * * Note that this call has no effect if \p hTexRef is bound to linear memory. - * Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES, is not set, the only + * Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES, is not set, the only * supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. 
* * \param hTexRef - Texture reference @@ -10441,7 +12741,7 @@ CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm) /** * \brief Sets the mipmap level bias for a texture reference * - * Specifies the mipmap level bias \p bias to be added to the specified mipmap level when + * Specifies the mipmap level bias \p bias to be added to the specified mipmap level when * reading memory through the texture reference \p hTexRef. * * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. @@ -10469,7 +12769,7 @@ CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias); * \brief Sets the mipmap min/max mipmap level clamps for a texture reference * * Specifies the min/max mipmap level clamps, \p minMipmapLevelClamp and \p maxMipmapLevelClamp - * respectively, to be used when reading memory through the texture reference + * respectively, to be used when reading memory through the texture reference * \p hTexRef. * * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. @@ -10498,7 +12798,7 @@ CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLe * \brief Sets the maximum anisotropy for a texture reference * * Specifies the maximum anisotropy \p maxAniso to be used when reading memory through - * the texture reference \p hTexRef. + * the texture reference \p hTexRef. * * Note that this call has no effect if \p hTexRef is bound to linear memory. * @@ -10567,7 +12867,7 @@ CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor); * range [0, 1]. Note that texture with 32-bit integer format * would not be promoted, regardless of whether or not this * flag is specified; - * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the + * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the * default behavior of having the texture coordinates range * from [0, Dim) where Dim is the width or height of the CUDA * array. Instead, the texture coordinates [0, 1.0) reference @@ -10650,7 +12950,7 @@ CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef); /** * \brief Gets the mipmapped array bound to a texture reference * - * Returns in \p *phMipmappedArray the CUDA mipmapped array bound to the texture + * Returns in \p *phMipmappedArray the CUDA mipmapped array bound to the texture * reference \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference * is not bound to any CUDA mipmapped array. * @@ -10800,7 +13100,7 @@ CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef); * \brief Gets the min/max mipmap level clamps for a texture reference * * Returns the min/max mipmap level clamps in \p pminMipmapLevelClamp and \p pmaxMipmapLevelClamp - * that's used when reading memory through the texture reference \p hTexRef. + * that's used when reading memory through the texture reference \p hTexRef. * * \param pminMipmapLevelClamp - Returned mipmap min level clamp * \param pmaxMipmapLevelClamp - Returned mipmap max level clamp @@ -10825,7 +13125,7 @@ CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, float * \brief Gets the maximum anisotropy for a texture reference * * Returns the maximum anisotropy in \p pmaxAniso that's used when reading memory through - * the texture reference \p hTexRef. + * the texture reference \p hTexRef. 
* * \param pmaxAniso - Returned maximum anisotropy * \param hTexRef - Texture reference @@ -10870,7 +13170,7 @@ CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef); * \sa ::cuTexRefSetAddressMode, * ::cuTexRefSetAddressMode, ::cuTexRefSetBorderColor */ -CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef); +CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef); /** * \brief Gets the flags used by a texture reference @@ -10932,7 +13232,7 @@ CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef); * * \sa ::cuTexRefDestroy */ -CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef); +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef); /** * \brief Destroys a texture reference @@ -10952,7 +13252,7 @@ CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef); * * \sa ::cuTexRefCreate */ -CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef); +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef); /** @} */ /* END CUDA_TEXREF_DEPRECATED */ @@ -11105,7 +13405,7 @@ CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef); * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. * ::CUDA_RESOURCE_DESC::res::linear::format and ::CUDA_RESOURCE_DESC::res::linear::numChannels * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::linear::sizeInBytes - * specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed + * specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. The number of elements is computed as (sizeInBytes / (sizeof(format) * numChannels)). * * \par @@ -11115,7 +13415,7 @@ CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef); * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::pitch2D::width * and ::CUDA_RESOURCE_DESC::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. - * ::CUDA_RESOURCE_DESC::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to + * ::CUDA_RESOURCE_DESC::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. Pitch cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. * * - ::flags must be set to zero. @@ -11144,7 +13444,7 @@ CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef); CU_TR_ADDRESS_MODE_BORDER = 3 } CUaddress_mode; * \endcode - * This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES + * This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES * is not set, the only supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. * * - ::CUDA_TEXTURE_DESC::filterMode specifies the filtering mode to be used when fetching from the texture. 
CUfilter_mode is defined as: @@ -11219,14 +13519,14 @@ CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef); * - ::CUDA_RESOURCE_VIEW_DESC::firstLayer specifies the first layer index for layered textures. This will be the new layer zero. * For non-layered resources, this value has to be zero. * - * - ::CUDA_RESOURCE_VIEW_DESC::lastLayer specifies the last layer index for layered textures. For non-layered resources, + * - ::CUDA_RESOURCE_VIEW_DESC::lastLayer specifies the last layer index for layered textures. For non-layered resources, * this value has to be zero. * * * \param pTexObject - Texture object to create * \param pResDesc - Resource descriptor * \param pTexDesc - Texture descriptor - * \param pResViewDesc - Resource view descriptor + * \param pResViewDesc - Resource view descriptor * * \return * ::CUDA_SUCCESS, @@ -11344,7 +13644,7 @@ CUresult CUDAAPI cuTexObjectGetResourceViewDesc(CUDA_RESOURCE_VIEW_DESC *pResVie * \brief Creates a surface object * * Creates a surface object and returns it in \p pSurfObject. \p pResDesc describes - * the data to perform surface load/stores on. ::CUDA_RESOURCE_DESC::resType must be + * the data to perform surface load/stores on. ::CUDA_RESOURCE_DESC::resType must be * ::CU_RESOURCE_TYPE_ARRAY and ::CUDA_RESOURCE_DESC::res::array::hArray * must be set to a valid CUDA array handle. ::CUDA_RESOURCE_DESC::flags must be set to zero. * @@ -11418,7 +13718,7 @@ CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsur * ___MANBRIEF___ direct peer context memory access functions of the low-level * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ * - * This section describes the direct peer context memory access functions + * This section describes the direct peer context memory access functions * of the low-level CUDA driver application programming interface. * * @{ @@ -11437,7 +13737,7 @@ CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsur * \param canAccessPeer - Returned access capability * \param dev - Device from which allocations on \p peerDev are to * be directly accessed. - * \param peerDev - Device on which the allocations to be directly accessed + * \param peerDev - Device on which the allocations to be directly accessed * by \p dev reside. * * \return @@ -11464,7 +13764,7 @@ CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevic * details. * * Note that access granted by this call is unidirectional and that in order to access - * memory from the current context in \p peerContext, a separate symmetric call + * memory from the current context in \p peerContext, a separate symmetric call * to ::cuCtxEnablePeerAccess() is required. * * There is a system-wide maximum of eight peer connections per device. @@ -11476,7 +13776,7 @@ CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevic * Returns ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED if direct access of * \p peerContext from the current context has already been enabled. * - * Returns ::CUDA_ERROR_TOO_MANY_PEERS if direct peer access is not possible + * Returns ::CUDA_ERROR_TOO_MANY_PEERS if direct peer access is not possible * because hardware resources required for peer access have been exhausted. 
 *
 * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, \p peerContext
@@ -11506,10 +13806,10 @@ CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevic
 CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags);
 
 /**
- * \brief Disables direct access to memory allocations in a peer context and 
+ * \brief Disables direct access to memory allocations in a peer context and
  * unregisters any registered allocations.
- Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has 
+ * Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has
 * not yet been enabled from \p peerContext to the current context.
 *
 * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, or if
@@ -11546,6 +13846,8 @@ CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext);
 * - ::CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED: 1 if P2P Access is enabled.
 * - ::CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED: 1 if Atomic operations over
 *   the link are supported.
+ * - ::CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED: 1 if cudaArray can
+ *   be accessed over the link.
 *
 * Returns ::CUDA_ERROR_INVALID_DEVICE if \p srcDevice or \p dstDevice are not valid
 * or if they represent the same device.
@@ -11665,8 +13967,8 @@ CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphics
 /**
 * \brief Get a mipmapped array through which to access a mapped graphics resource.
 *
- * Returns in \p *pMipmappedArray a mipmapped array through which the mapped graphics 
- * resource \p resource. The value set in \p *pMipmappedArray may change every time 
+ * Returns in \p *pMipmappedArray a mipmapped array through which the mapped graphics
+ * resource \p resource may be accessed. The value set in \p *pMipmappedArray may change every time
 * that \p resource is mapped.
* * If \p resource is not a texture then it cannot be accessed via a mipmapped array and @@ -11924,6 +14226,7 @@ CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExp #undef cuMemsetD2D32Async #undef cuStreamGetPriority #undef cuStreamGetFlags + #undef cuStreamGetCtx #undef cuStreamWaitEvent #undef cuStreamAddCallback #undef cuStreamAttachMemAsync @@ -11931,6 +14234,7 @@ CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExp #undef cuStreamSynchronize #undef cuEventRecord #undef cuLaunchKernel + #undef cuLaunchHostFunc #undef cuGraphicsMapResources #undef cuGraphicsUnmapResources #undef cuStreamWriteValue32 @@ -11940,6 +14244,12 @@ CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExp #undef cuStreamBatchMemOp #undef cuMemPrefetchAsync #undef cuLaunchCooperativeKernel + #undef cuSignalExternalSemaphoresAsync + #undef cuWaitExternalSemaphoresAsync + #undef cuStreamBeginCapture + #undef cuStreamEndCapture + #undef cuStreamIsCapturing + #undef cuGraphLaunch #endif /* __CUDA_API_VERSION_INTERNAL */ #if defined(__CUDA_API_VERSION_INTERNAL) || (__CUDA_API_VERSION >= 4000 && __CUDA_API_VERSION < 6050) @@ -12157,6 +14467,7 @@ CUresult CUDAAPI cuEventDestroy(CUevent hEvent); CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); + CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx); CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags); @@ -12164,6 +14475,7 @@ CUresult CUDAAPI cuEventDestroy(CUevent hEvent); CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); + CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData); CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); @@ -12173,6 +14485,12 @@ CUresult CUDAAPI cuEventDestroy(CUevent hEvent); CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream); CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams); + CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); + CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const 
CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); + CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream); + CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph); + CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus); + CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraph, CUstream hStream); #endif #ifdef __cplusplus @@ -12180,6 +14498,6 @@ CUresult CUDAAPI cuEventDestroy(CUevent hEvent); #endif #undef __CUDA_API_VERSION +#undef __CUDA_DEPRECATED #endif /* __cuda_cuda_h__ */ - diff --git a/include/triton/external/CUDA/nvml.h b/include/triton/external/CUDA/nvml.h index 0790b3aad..0b38f5f8a 100755 --- a/include/triton/external/CUDA/nvml.h +++ b/include/triton/external/CUDA/nvml.h @@ -1,5 +1,5 @@ /* - * Copyright 1993-2017 NVIDIA Corporation. All rights reserved. + * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. * * NOTICE TO USER: * @@ -95,14 +95,15 @@ extern "C" { /** * NVML API versioning support */ -#define NVML_API_VERSION 9 -#define NVML_API_VERSION_STR "9" +#define NVML_API_VERSION 10 +#define NVML_API_VERSION_STR "10" #define nvmlInit nvmlInit_v2 #define nvmlDeviceGetPciInfo nvmlDeviceGetPciInfo_v3 #define nvmlDeviceGetCount nvmlDeviceGetCount_v2 #define nvmlDeviceGetHandleByIndex nvmlDeviceGetHandleByIndex_v2 #define nvmlDeviceGetHandleByPciBusId nvmlDeviceGetHandleByPciBusId_v2 #define nvmlDeviceGetNvLinkRemotePciInfo nvmlDeviceGetNvLinkRemotePciInfo_v2 +#define nvmlDeviceRemoveGpu nvmlDeviceRemoveGpu_v2 /***************************************************************************************************/ /** @defgroup nvmlDeviceStructs Device Structs @@ -147,6 +148,23 @@ typedef struct nvmlPciInfo_st char busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE]; //!< The tuple domain:bus:device.function PCI identifier (& NULL terminator) } nvmlPciInfo_t; +/** + * PCI format string for ::busIdLegacy + */ +#define NVML_DEVICE_PCI_BUS_ID_LEGACY_FMT "%04X:%02X:%02X.0" + +/** + * PCI format string for ::busId + */ +#define NVML_DEVICE_PCI_BUS_ID_FMT "%08X:%02X:%02X.0" + +/** + * Utility macro for filling the pci bus id format from a nvmlPciInfo_t + */ +#define NVML_DEVICE_PCI_BUS_ID_FMT_ARGS(pciInfo) (pciInfo)->domain, \ + (pciInfo)->bus, \ + (pciInfo)->device + /** * Detailed ECC error counts for a device. * @@ -297,12 +315,15 @@ typedef enum nvmlGpuLevel_enum NVML_TOPOLOGY_SINGLE = 10, // all devices that only need traverse a single PCIe switch NVML_TOPOLOGY_MULTIPLE = 20, // all devices that need not traverse a host bridge NVML_TOPOLOGY_HOSTBRIDGE = 30, // all devices that are connected to the same host bridge - NVML_TOPOLOGY_CPU = 40, // all devices that are connected to the same CPU but possibly multiple host bridges + NVML_TOPOLOGY_NODE = 40, // all devices that are connected to the same NUMA node but possibly multiple host bridges NVML_TOPOLOGY_SYSTEM = 50, // all devices in the system // there is purposefully no COUNT here because of the need for spacing above } nvmlGpuTopologyLevel_t; +/* Compatibility for CPU->NODE renaming */ +#define NVML_TOPOLOGY_CPU NVML_TOPOLOGY_NODE + /* P2P Capability Index Status*/ typedef enum nvmlGpuP2PStatus_enum { @@ -478,6 +499,7 @@ typedef enum nvmlBrandType_enum NVML_BRAND_NVS = 3, NVML_BRAND_GRID = 4, NVML_BRAND_GEFORCE = 5, + NVML_BRAND_TITAN = 6, // Keep this last NVML_BRAND_COUNT @@ -603,7 +625,7 @@ typedef enum nvmlClockType_enum NVML_CLOCK_VIDEO = 3, //!< Video encoder/decoder clock domain // Keep this last - NVML_CLOCK_COUNT //busId. 
pci->busIdLegacy will be populated for both nvmlDeviceGetPciInfo and
- * nvmlDeviceGetPciInfo_v2.
- *
 * @param device The identifier of the target device
 * @param pci Reference in which to return the PCI info
 *
@@ -2420,7 +2596,6 @@ nvmlReturn_t DECLDIR nvmlDeviceGetPersistenceMode(nvmlDevice_t device, nvmlEnabl
 * - \ref NVML_ERROR_UNKNOWN on any unexpected error
 */
 nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo(nvmlDevice_t device, nvmlPciInfo_t *pci);
-nvmlReturn_t DECLDIR nvmlDeviceGetPciInfo_v2(nvmlDevice_t device, nvmlPciInfo_t *pci);
 
 /**
 * Retrieves the maximum PCIe link generation possible with this device and system
 *
@@ -2532,9 +2707,9 @@ nvmlReturn_t DECLDIR nvmlDeviceGetPcieThroughput(nvmlDevice_t device, nvmlPcieUt
 * @param value Reference in which to return the counter's value
 *
 * @return
- * - \ref NVML_SUCCESS if \a value and \a rollover have been set
+ * - \ref NVML_SUCCESS if \a value has been set
 * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a value or \a rollover are NULL
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, or \a value is NULL
 * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature
 * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
 * - \ref NVML_ERROR_UNKNOWN on any unexpected error
@@ -3459,7 +3634,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetUtilizationRates(nvmlDevice_t device, nvmlUtil
 nvmlReturn_t DECLDIR nvmlDeviceGetEncoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs);
 
 /**
- * Retrieves the current capacity of the device's encoder, in macroblocks per second.
+ * Retrieves the current capacity of the device's encoder, as a percentage of maximum encoder capacity with valid values in the range 0-100.
 *
 * For Maxwell &tm; or newer fully supported devices.
 *
@@ -3546,6 +3721,54 @@ nvmlReturn_t DECLDIR nvmlDeviceGetEncoderSessions(nvmlDevice_t device, unsigned
 */
 nvmlReturn_t DECLDIR nvmlDeviceGetDecoderUtilization(nvmlDevice_t device, unsigned int *utilization, unsigned int *samplingPeriodUs);
 
+/**
+* Retrieves the active frame buffer capture sessions statistics for a given device.
+*
+* For Maxwell &tm; or newer fully supported devices.
+*
+* @param device The identifier of the target device
+* @param fbcStats Reference to nvmlFBCStats_t structure containing NvFBC stats
+*
+* @return
+* - \ref NVML_SUCCESS if \a fbcStats is fetched
+* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+* - \ref NVML_ERROR_INVALID_ARGUMENT if \a fbcStats is NULL
+* - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+* - \ref NVML_ERROR_UNKNOWN on any unexpected error
+*/
+nvmlReturn_t DECLDIR nvmlDeviceGetFBCStats(nvmlDevice_t device, nvmlFBCStats_t *fbcStats);
+
+/**
+* Retrieves information about active frame buffer capture sessions on a target device.
+*
+* An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The
+* array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions
+* written to the buffer.
+*
+* If the supplied buffer is not large enough to accommodate the active session array, the function returns
+* NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlFBCSessionInfo_t array required in \a sessionCount.
+* To query the number of active FBC sessions, call this function with *sessionCount = 0. The code will return
+* NVML_SUCCESS with number of active FBC sessions updated in *sessionCount.
+*
+* For Maxwell &tm; or newer fully supported devices.
+*
+* @note hResolution, vResolution, averageFPS and averageLatency data for a FBC session returned in \a sessionInfo may
+* be zero if there are no new frames captured since the session started.
+*
+* @param device The identifier of the target device
+* @param sessionCount Reference to caller supplied array size, and returns the number of sessions.
+* @param sessionInfo Reference in which to return the session information
+*
+* @return
+* - \ref NVML_SUCCESS if \a sessionInfo is fetched
+* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+* - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is returned in \a sessionCount
+* - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL.
+* - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+* - \ref NVML_ERROR_UNKNOWN on any unexpected error
+*/
+nvmlReturn_t DECLDIR nvmlDeviceGetFBCSessions(nvmlDevice_t device, unsigned int *sessionCount, nvmlFBCSessionInfo_t *sessionInfo);
+
 /**
 * Retrieves the current and pending driver model for the device.
 *
@@ -3980,6 +4203,38 @@ nvmlReturn_t DECLDIR nvmlDeviceGetAccountingBufferSize(nvmlDevice_t device, unsi
 nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages(nvmlDevice_t device, nvmlPageRetirementCause_t cause, unsigned int *pageCount, unsigned long long *addresses);
 
+/**
+ * Returns the list of retired pages by source, including pages that are pending retirement.
+ * The address information provided from this API is the hardware address of the page that was retired. Note
+ * that this does not match the virtual address used in CUDA, but will match the address information in XID 63.
+ *
+ * \note nvmlDeviceGetRetiredPages_v2 adds an additional timestamps parameter to return the time of each page's
+ * retirement.
+ *
+ * For Kepler &tm; or newer fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param cause Filter page addresses by cause of retirement
+ * @param pageCount Reference in which to provide the \a addresses buffer size, and
+ * to return the number of retired pages that match \a cause
+ * Set to 0 to query the size without allocating an \a addresses buffer
+ * @param addresses Buffer to write the page addresses into
+ * @param timestamps Buffer to write the timestamps of page retirement, additional for _v2
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a pageCount was populated and \a addresses was filled
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a pageCount indicates the buffer is not large enough to store all the
+ * matching page addresses. \a pageCount is set to the needed size.
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid, \a pageCount is NULL, \a cause is invalid, or
+ * \a addresses is NULL
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages_v2(nvmlDevice_t device, nvmlPageRetirementCause_t cause,
+ unsigned int *pageCount, unsigned long long *addresses, unsigned long long *timestamps);
+
 /**
 * Check if any pages are pending retirement and need a reboot to fully retire.
 *
@@ -4217,11 +4472,65 @@ nvmlReturn_t DECLDIR nvmlDeviceClearEccErrorCounts(nvmlDevice_t device, nvmlEccC
 */
 nvmlReturn_t DECLDIR nvmlDeviceSetDriverModel(nvmlDevice_t device, nvmlDriverModel_t driverModel, unsigned int flags);
 
+/**
+ * Set clocks that the device will lock to.
+ *
+ * Sets the clocks that the device will be running at to the value in the range of minGpuClockMHz to maxGpuClockMHz.
+ * Setting this will supersede application clock values and take effect regardless of whether a CUDA application is running.
+ * See \ref nvmlDeviceSetApplicationsClocks.
+ *
+ * Can be used as a setting to request constant performance.
+ *
+ * Requires root/admin permissions.
+ *
+ * After system reboot or driver reload applications clocks go back to their default value.
+ * See \ref nvmlDeviceResetGpuLockedClocks.
+ *
+ * For newer than Pascal &tm; fully supported devices.
+ *
+ * @param device The identifier of the target device
+ * @param minGpuClockMHz Requested minimum gpu clock in MHz
+ * @param maxGpuClockMHz Requested maximum gpu clock in MHz
+ *
+ * @return
+ * - \ref NVML_SUCCESS if new settings were successfully set
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a minGpuClockMHz and \a maxGpuClockMHz
+ * do not form a valid clock combination
+ * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
+ * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceSetGpuLockedClocks(nvmlDevice_t device, unsigned int minGpuClockMHz, unsigned int maxGpuClockMHz);
+
+/**
+ * Resets the GPU clock to the default value.
+ *
+ * This is the GPU clock that will be used after system reboot or driver reload.
+ * Default values are idle clocks, but the current values can be changed using \ref nvmlDeviceSetApplicationsClocks.
+ *
+ * @see nvmlDeviceSetGpuLockedClocks
+ *
+ * For newer than Pascal &tm; fully supported devices.
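+ *
+ * As an illustrative sketch only (device index and clock values are arbitrary
+ * examples; error checking omitted):
+ * \code
+ * nvmlDevice_t device;
+ * nvmlDeviceGetHandleByIndex(0, &device);
+ * nvmlDeviceSetGpuLockedClocks(device, 1000, 1500); // lock to the 1000-1500 MHz range
+ * // ... run the workload ...
+ * nvmlDeviceResetGpuLockedClocks(device);           // restore default clock behavior
+ * \endcode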
+ * + * @param device The identifier of the target device + * + * @return + * - \ref NVML_SUCCESS if new settings were successfully set + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid + * - \ref NVML_ERROR_NOT_SUPPORTED if the device does not support this feature + * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlDeviceResetGpuLockedClocks(nvmlDevice_t device); + /** * Set clocks that applications will lock to. * * Sets the clocks that compute and graphics applications will be running at. - * e.g. CUDA driver requests these clocks during context creation which means this property + * e.g. CUDA driver requests these clocks during context creation which means this property * defines clocks at which CUDA applications will be running unless some overspec event * occurs (e.g. over power, over thermal or external HW brake). * @@ -4234,9 +4543,9 @@ nvmlReturn_t DECLDIR nvmlDeviceSetDriverModel(nvmlDevice_t device, nvmlDriverMod * above the clock value being set. * * For Kepler &tm; or newer non-GeForce fully supported devices and Maxwell or newer GeForce devices. - * Requires root/admin permissions. + * Requires root/admin permissions. * - * See \ref nvmlDeviceGetSupportedMemoryClocks and \ref nvmlDeviceGetSupportedGraphicsClocks + * See \ref nvmlDeviceGetSupportedMemoryClocks and \ref nvmlDeviceGetSupportedGraphicsClocks * for details on how to list available clocks combinations. * * After system reboot or driver reload applications clocks go back to their default value. @@ -4245,13 +4554,13 @@ nvmlReturn_t DECLDIR nvmlDeviceSetDriverModel(nvmlDevice_t device, nvmlDriverMod * @param device The identifier of the target device * @param memClockMHz Requested memory clock in MHz * @param graphicsClockMHz Requested graphics clock in MHz - * - * @return + * + * @return * - \ref NVML_SUCCESS if new settings were successfully set * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized - * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memClockMHz and \a graphicsClockMHz + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a device is invalid or \a memClockMHz and \a graphicsClockMHz * is not a valid clock combination - * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation + * - \ref NVML_ERROR_NO_PERMISSION if the user doesn't have permission to perform this operation * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature * - \ref NVML_ERROR_GPU_IS_LOST if the target GPU has fallen off the bus or is otherwise inaccessible * - \ref NVML_ERROR_UNKNOWN on any unexpected error @@ -4848,6 +5157,9 @@ nvmlReturn_t DECLDIR nvmlDeviceQueryDrainState (nvmlPciInfo_t *pciInfo, nvmlEnab * Some Kepler devices supported. 
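+ *
+ * An illustrative sketch only (assumes the values NVML_DETACH_GPU_REMOVE and
+ * NVML_PCIE_LINK_SHUT_DOWN from \ref nvmlDetachGpuState_t and \ref nvmlPcieLinkState_t;
+ * error checking omitted):
+ * \code
+ * nvmlPciInfo_t pci; // previously filled in, e.g. via nvmlDeviceGetPciInfo()
+ * nvmlDeviceRemoveGpu(&pci, NVML_DETACH_GPU_REMOVE, NVML_PCIE_LINK_SHUT_DOWN);
+ * \endcode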
 *
 * @param pciInfo The PCI address of the GPU to be removed
+ * @param gpuState Whether the GPU is to be removed from the OS,
+ * see \ref nvmlDetachGpuState_t
+ * @param linkState Requested upstream PCIe link state, see \ref nvmlPcieLinkState_t
 *
 * @return
 * - \ref NVML_SUCCESS if counters were successfully reset
@@ -4856,7 +5168,7 @@ nvmlReturn_t DECLDIR nvmlDeviceQueryDrainState (nvmlPciInfo_t *pciInfo, nvmlEnab
 * - \ref NVML_ERROR_NOT_SUPPORTED if the device doesn't support this feature
 * - \ref NVML_ERROR_IN_USE if the device is still in use and cannot be removed
 */
-nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu (nvmlPciInfo_t *pciInfo);
+nvmlReturn_t DECLDIR nvmlDeviceRemoveGpu (nvmlPciInfo_t *pciInfo, nvmlDetachGpuState_t gpuState, nvmlPcieLinkState_t linkState);
 
 /**
 * Request the OS and the NVIDIA kernel driver to rediscover a portion of the PCI subsystem looking for GPUs that
@@ -5247,7 +5559,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetActiveVgpus(nvmlDevice_t device, unsigned int
 * @return
 * - \ref NVML_SUCCESS successful completion
 * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a vmId or \a vmIdType are NULL
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vmId or \a vmIdType is NULL, or \a vgpuInstance is 0
+ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system
 * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small
 * - \ref NVML_ERROR_UNKNOWN on any unexpected error
 */
@@ -5269,7 +5582,8 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmID(nvmlVgpuInstance_t vgpuInstance, ch
 * @return
 * - \ref NVML_SUCCESS successful completion
 * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a uuid is NULL
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a uuid is NULL
+ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system
 * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a size is too small
 * - \ref NVML_ERROR_UNKNOWN on any unexpected error
 */
@@ -5295,7 +5609,8 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetUUID(nvmlVgpuInstance_t vgpuInstance, ch
 * @return
 * - \ref NVML_SUCCESS if \a version has been set
 * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0
+ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system
 * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a length is too small
 * - \ref NVML_ERROR_UNKNOWN on any unexpected error
 */
@@ -5314,7 +5629,8 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetVmDriverVersion(nvmlVgpuI
 * @return
 * - \ref NVML_SUCCESS successful completion
 * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a fbUsage is NULL
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a fbUsage is NULL
+ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system
 * - \ref NVML_ERROR_UNKNOWN on any unexpected error
 */
 nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFbUsage(nvmlVgpuInstance_t vgpuInstance, unsigned long long *fbUsage);
@@ -5332,7 +5648,8 @@ nvmlReturn_t
DECLDIR nvmlVgpuInstanceGetFbUsage(nvmlVgpuInstance_t vgpuInstance,
 * @return
 * - \ref NVML_SUCCESS if \a licensed has been set
 * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a licensed is NULL
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a licensed is NULL
+ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system
 * - \ref NVML_ERROR_UNKNOWN on any unexpected error
 */
 nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseStatus(nvmlVgpuInstance_t vgpuInstance, unsigned int *licensed);
@@ -5350,7 +5667,8 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetLicenseStatus(nvmlVgpuIns
 * @return
 * - \ref NVML_SUCCESS if \a vgpuTypeId has been set
 * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a vgpuTypeId is NULL
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a vgpuTypeId is NULL
+ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system
 * - \ref NVML_ERROR_UNKNOWN on any unexpected error
 */
 nvmlReturn_t DECLDIR nvmlVgpuInstanceGetType(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuTypeId_t *vgpuTypeId);
@@ -5369,13 +5687,14 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetType(nvmlVgpuInstance_t vgpuInstance, nv
 * - \ref NVML_SUCCESS if \a frameRateLimit has been set
 * - \ref NVML_ERROR_NOT_SUPPORTED if frame rate limiter is turned off for the vGPU type
 * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a frameRateLimit is NULL
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a frameRateLimit is NULL
+ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system
 * - \ref NVML_ERROR_UNKNOWN on any unexpected error
 */
 nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFrameRateLimit(nvmlVgpuInstance_t vgpuInstance, unsigned int *frameRateLimit);
 
 /**
- * Retrieve the encoder Capacity of a vGPU instance, in macroblocks per second.
+ * Retrieve the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100.
 *
 * For Maxwell &tm; or newer fully supported devices.
 *
@@ -5385,13 +5704,14 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFrameRateLimit(nvmlVgpuIn
 * @return
 * - \ref NVML_SUCCESS if \a encoderCapacity has been retrieved
 * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid, or \a encoderQueryType is invalid
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a encoderQueryType is invalid
+ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system
 * - \ref NVML_ERROR_UNKNOWN on any unexpected error
 */
 nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int *encoderCapacity);
 
 /**
- * Set the encoder Capacity of a vGPU instance, in macroblocks per second.
+ * Set the encoder capacity of a vGPU instance, as a percentage of maximum encoder capacity with valid values in the range 0-100.
 *
 * For Maxwell &tm; or newer fully supported devices.
 *
@@ -5401,7 +5721,8 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderCapacity(nvmlVgpuI
 * @return
 * - \ref NVML_SUCCESS if \a encoderCapacity has been set
 * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is invalid
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0
+ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system
 * - \ref NVML_ERROR_UNKNOWN on any unexpected error
 */
 nvmlReturn_t DECLDIR nvmlVgpuInstanceSetEncoderCapacity(nvmlVgpuInstance_t vgpuInstance, unsigned int encoderCapacity);
@@ -5531,7 +5852,8 @@ nvmlReturn_t DECLDIR nvmlDeviceGetGridLicensableFeatures(nvmlDevice_t device, nv
 * - \ref NVML_SUCCESS if \a sessionCount, \a averageFps and \a averageLatency is fetched
 * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
 * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount, or \a averageFps or \a averageLatency is NULL
- *   or \a vgpuInstance is invalid.
+ *   or \a vgpuInstance is 0.
+ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system
 * - \ref NVML_ERROR_UNKNOWN on any unexpected error
 */
 nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderStats(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount,
@@ -5562,11 +5884,60 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderStats(nvmlVgpuInst
 * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
 * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is returned in \a sessionCount
- * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL or \a vgpuInstance is invalid..
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a sessionCount is NULL, or \a vgpuInstance is 0.
+ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system
 * - \ref NVML_ERROR_UNKNOWN on any unexpected error
 */
 nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, nvmlEncoderSessionInfo_t *sessionInfo);
 
+/**
+* Retrieves the active frame buffer capture sessions statistics of a vGPU Instance.
+*
+* For Maxwell &tm; or newer fully supported devices.
+*
+* @param vgpuInstance Identifier of the target vGPU instance
+* @param fbcStats Reference to nvmlFBCStats_t structure containing NvFBC stats
+*
+* @return
+* - \ref NVML_SUCCESS if \a fbcStats is fetched
+* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a fbcStats is NULL
+* - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system
+* - \ref NVML_ERROR_UNKNOWN on any unexpected error
+*/
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFBCStats(nvmlVgpuInstance_t vgpuInstance, nvmlFBCStats_t *fbcStats);
+
+/**
+* Retrieves information about active frame buffer capture sessions on a vGPU Instance.
+*
+* An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The
+* array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions
+* written to the buffer.
+*
+* If the supplied buffer is not large enough to accommodate the active session array, the function returns
+* NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlFBCSessionInfo_t array required in \a sessionCount.
+* To query the number of active FBC sessions, call this function with *sessionCount = 0. The code will return
+* NVML_SUCCESS with number of active FBC sessions updated in *sessionCount.
+*
+* For Maxwell &tm; or newer fully supported devices.
+*
+* @note hResolution, vResolution, averageFPS and averageLatency data for a FBC session returned in \a sessionInfo may
+* be zero if there are no new frames captured since the session started.
+*
+* @param vgpuInstance Identifier of the target vGPU instance
+* @param sessionCount Reference to caller supplied array size, and returns the number of sessions.
+* @param sessionInfo Reference in which to return the session information
+*
+* @return
+* - \ref NVML_SUCCESS if \a sessionInfo is fetched
+* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a sessionCount is NULL.
+* - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system
+* - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a sessionCount is too small, array element count is returned in \a sessionCount
+* - \ref NVML_ERROR_UNKNOWN on any unexpected error
+*/
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFBCSessions(nvmlVgpuInstance_t vgpuInstance, unsigned int *sessionCount, nvmlFBCSessionInfo_t *sessionInfo);
+
 /**
 * Retrieves the current utilization and process ID
 *
@@ -5607,12 +5978,294 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderSessions(nvmlVgpuI
 nvmlReturn_t DECLDIR nvmlDeviceGetProcessUtilization(nvmlDevice_t device, nvmlProcessUtilizationSample_t *utilization, unsigned int *processSamplesCount, unsigned long long lastSeenTimeStamp);
 
+/**
+ * Queries the state of per process accounting mode on vGPU.
+ *
+ * For Maxwell &tm; or newer fully supported devices.
+ *
+ * @param vgpuInstance The identifier of the target vGPU VM
+ * @param mode Reference in which to return the current accounting mode
+ *
+ * @return
+ * - \ref NVML_SUCCESS if the mode has been successfully retrieved
+ * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a mode is NULL
+ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system
+ * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingMode(nvmlVgpuInstance_t vgpuInstance, nvmlEnableState_t *mode);
+
+/**
+ * Queries list of processes running on vGPU that can be queried for accounting stats. The list of processes
+ * returned can be in running or terminated state.
+ *
+ * For Maxwell &tm; or newer fully supported devices.
+ *
+ * To just query the maximum number of processes that can be queried, call this function with *count = 0 and
+ * pids=NULL. The return code will be NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if list is empty.
+ *
+ * For more details see \ref nvmlVgpuInstanceGetAccountingStats.
+ *
+ * @note In case of PID collision some processes might not be accessible before the circular buffer is full.
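+ *
+ * A sketch of the two-call size-query pattern described above (illustrative
+ * only; error checking omitted):
+ * \code
+ * unsigned int count = 0;
+ * nvmlVgpuInstanceGetAccountingPids(vgpuInstance, &count, NULL); // query the required size
+ * unsigned int *pids = (unsigned int *)malloc(count * sizeof(unsigned int));
+ * nvmlVgpuInstanceGetAccountingPids(vgpuInstance, &count, pids); // fetch the PIDs
+ * free(pids);
+ * \endcode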
+ * + * @param vgpuInstance The identifier of the target vGPU VM + * @param count Reference in which to provide the \a pids array size, and + * to return the number of elements ready to be queried + * @param pids Reference in which to return list of process ids + * + * @return + * - \ref NVML_SUCCESS if pids were successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a count is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature or accounting mode is disabled + * - \ref NVML_ERROR_INSUFFICIENT_SIZE if \a count is too small (\a count is set to expected value) + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + * + * @see nvmlVgpuInstanceGetAccountingPids + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingPids(nvmlVgpuInstance_t vgpuInstance, unsigned int *count, unsigned int *pids); + +/** + * Queries process's accounting stats. + * + * For Maxwell &tm; or newer fully supported devices. + * + * Accounting stats capture GPU utilization and other statistics across the lifetime of a process, and + * can be queried during life time of the process or after its termination. + * The time field in \ref nvmlAccountingStats_t is reported as 0 during the lifetime of the process and + * updated to actual running time after its termination. + * Accounting stats are kept in a circular buffer, newly created processes overwrite information about old + * processes. + * + * See \ref nvmlAccountingStats_t for description of each returned metric. + * List of processes that can be queried can be retrieved from \ref nvmlVgpuInstanceGetAccountingPids. + * + * @note Accounting Mode needs to be on. See \ref nvmlVgpuInstanceGetAccountingMode. + * @note Only compute and graphics applications stats can be queried. Monitoring applications stats can't be + * queried since they don't contribute to GPU utilization. + * @note In case of pid collision stats of only the latest process (that terminated last) will be reported + * + * @param vgpuInstance The identifier of the target vGPU VM + * @param pid Process Id of the target process to query stats for + * @param stats Reference in which to return the process's accounting stats + * + * @return + * - \ref NVML_SUCCESS if stats have been successfully retrieved + * - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a stats is NULL + * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system + * or \a stats is not found + * - \ref NVML_ERROR_NOT_SUPPORTED if the vGPU doesn't support this feature or accounting mode is disabled + * - \ref NVML_ERROR_UNKNOWN on any unexpected error + */ +nvmlReturn_t DECLDIR nvmlVgpuInstanceGetAccountingStats(nvmlVgpuInstance_t vgpuInstance, unsigned int pid, nvmlAccountingStats_t *stats); + +/** @} */ + +/***************************************************************************************************/ +/** @defgroup nvml vGPU Migration + * This chapter describes NVML operations that are associated with vGPU Migration. + * @{ + */ +/***************************************************************************************************/ + +/** + * vGPU metadata structure. 
+ */
+typedef struct nvmlVgpuMetadata_st
+{
+    unsigned int version; //!< Current version of the structure
+    unsigned int revision; //!< Current revision of the structure
+    nvmlVgpuGuestInfoState_t guestInfoState; //!< Current state of Guest-dependent fields
+    char guestDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in guest
+    char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Version of driver installed in host
+    unsigned int reserved[8]; //!< Reserved for internal use
+    unsigned int opaqueDataSize; //!< Size of opaque data field in bytes
+    char opaqueData[4]; //!< Opaque data
+} nvmlVgpuMetadata_t;
+
+/**
+ * Physical GPU metadata structure
+ */
+typedef struct nvmlVgpuPgpuMetadata_st
+{
+    unsigned int version; //!< Current version of the structure
+    unsigned int revision; //!< Current revision of the structure
+    char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Host driver version
+    unsigned int pgpuVirtualizationCaps; //!< Pgpu virtualization capabilities bitfield
+    unsigned int reserved[7]; //!< Reserved for internal use
+    unsigned int opaqueDataSize; //!< Size of opaque data field in bytes
+    char opaqueData[4]; //!< Opaque data
+} nvmlVgpuPgpuMetadata_t;
+
+/**
+ * vGPU VM compatibility codes
+ */
+typedef enum nvmlVgpuVmCompatibility_enum
+{
+    NVML_VGPU_VM_COMPATIBILITY_NONE = 0x0, //!< vGPU is not runnable
+    NVML_VGPU_VM_COMPATIBILITY_COLD = 0x1, //!< vGPU is runnable from a cold / powered-off state (ACPI S5)
+    NVML_VGPU_VM_COMPATIBILITY_HIBERNATE = 0x2, //!< vGPU is runnable from a hibernated state (ACPI S4)
+    NVML_VGPU_VM_COMPATIBILITY_SLEEP = 0x4, //!< vGPU is runnable from a sleep state (ACPI S3)
+    NVML_VGPU_VM_COMPATIBILITY_LIVE = 0x8, //!< vGPU is runnable from a live/paused state (ACPI S0)
+} nvmlVgpuVmCompatibility_t;
+
+/**
+ * vGPU-pGPU compatibility limit codes
+ */
+typedef enum nvmlVgpuPgpuCompatibilityLimitCode_enum
+{
+    NVML_VGPU_COMPATIBILITY_LIMIT_NONE = 0x0, //!< Compatibility is not limited.
+    NVML_VGPU_COMPATIBILITY_LIMIT_HOST_DRIVER = 0x1, //!< Compatibility is limited by host driver version.
+    NVML_VGPU_COMPATIBILITY_LIMIT_GUEST_DRIVER = 0x2, //!< Compatibility is limited by guest driver version.
+    NVML_VGPU_COMPATIBILITY_LIMIT_GPU = 0x4, //!< Compatibility is limited by GPU hardware.
+    NVML_VGPU_COMPATIBILITY_LIMIT_OTHER = 0x80000000, //!< Compatibility is limited by an undefined factor.
+} nvmlVgpuPgpuCompatibilityLimitCode_t;
+
+/**
+ * vGPU-pGPU compatibility structure
+ */
+typedef struct nvmlVgpuPgpuCompatibility_st
+{
+    nvmlVgpuVmCompatibility_t vgpuVmCompatibility; //!< Compatibility of vGPU VM. See \ref nvmlVgpuVmCompatibility_t
+    nvmlVgpuPgpuCompatibilityLimitCode_t compatibilityLimitCode; //!< Limiting factor for vGPU-pGPU compatibility. See \ref nvmlVgpuPgpuCompatibilityLimitCode_t
+} nvmlVgpuPgpuCompatibility_t;
+
+/**
+ * Returns vGPU metadata structure for a running vGPU. The structure contains information about the vGPU and its associated VM
+ * such as the currently installed NVIDIA guest driver version, together with host driver version and an opaque data section
+ * containing internal state.
+ *
+ * nvmlVgpuInstanceGetMetadata() may be called at any time for a vGPU instance. Some fields in the returned structure are
+ * dependent on information obtained from the guest VM, which may not yet have reached a state where that information
+ * is available. The current state of these dependent fields is reflected in the info structure's \ref guestInfoState field.
+ *
+ * The VMM may choose to read and save the vGPU's VM info as persistent metadata associated with the VM, and provide
+ * it to GRID Virtual GPU Manager when creating a vGPU for subsequent instances of the VM.
+ *
+ * The caller passes in a buffer via \a vgpuMetadata, with the size of the buffer in \a bufferSize. If the vGPU Metadata structure
+ * is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed
+ * in \a bufferSize.
+ *
+ * @param vgpuInstance vGPU instance handle
+ * @param vgpuMetadata Pointer to caller-supplied buffer into which vGPU metadata is written
+ * @param bufferSize Size of vgpuMetadata buffer
+ *
+ * @return
+ * - \ref NVML_SUCCESS vGPU metadata structure was successfully returned
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE vgpuMetadata buffer is too small, required size is returned in \a bufferSize
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a vgpuInstance is 0; if \a vgpuMetadata is NULL and the value of \a bufferSize is not 0.
+ * - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlVgpuInstanceGetMetadata(nvmlVgpuInstance_t vgpuInstance, nvmlVgpuMetadata_t *vgpuMetadata, unsigned int *bufferSize);
+
+/**
+ * Returns a vGPU metadata structure for the physical GPU indicated by \a device. The structure contains information about
+ * the GPU and the currently installed NVIDIA host driver version that's controlling it, together with an opaque data section
+ * containing internal state.
+ *
+ * The caller passes in a buffer via \a pgpuMetadata, with the size of the buffer in \a bufferSize. If the \a pgpuMetadata
+ * structure is too large to fit in the supplied buffer, the function returns NVML_ERROR_INSUFFICIENT_SIZE with the size needed
+ * in \a bufferSize.
+ *
+ * @param device The identifier of the target device
+ * @param pgpuMetadata Pointer to caller-supplied buffer into which \a pgpuMetadata is written
+ * @param bufferSize Pointer to size of \a pgpuMetadata buffer
+ *
+ * @return
+ * - \ref NVML_SUCCESS GPU metadata structure was successfully returned
+ * - \ref NVML_ERROR_INSUFFICIENT_SIZE pgpuMetadata buffer is too small, required size is returned in \a bufferSize
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a bufferSize is NULL or \a device is invalid; if \a pgpuMetadata is NULL and the value of \a bufferSize is not 0.
+ * - \ref NVML_ERROR_NOT_SUPPORTED vGPU is not supported by the system
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlDeviceGetVgpuMetadata(nvmlDevice_t device, nvmlVgpuPgpuMetadata_t *pgpuMetadata, unsigned int *bufferSize);
+
+/**
+ * Takes a vGPU instance metadata structure read from \ref nvmlVgpuInstanceGetMetadata(), and a vGPU metadata structure for a
+ * physical GPU read from \ref nvmlDeviceGetVgpuMetadata(), and returns compatibility information of the vGPU instance and the
+ * physical GPU.
+ *
+ * The caller passes in a buffer via \a compatibilityInfo, into which a compatibility information structure is written. The
+ * structure defines the states in which the vGPU / VM may be booted on the physical GPU. If the vGPU / VM compatibility
+ * with the physical GPU is limited, a limit code indicates the factor limiting compatibility
+ * (see \ref nvmlVgpuPgpuCompatibilityLimitCode_t for details).
+ *
+ * Note: vGPU compatibility does not take into account dynamic capacity conditions that may limit a system's ability to
+ * boot a given vGPU or associated VM.
+ *
+ * @param vgpuMetadata Pointer to caller-supplied vGPU metadata structure
+ * @param pgpuMetadata Pointer to caller-supplied GPU metadata structure
+ * @param compatibilityInfo Pointer to caller-supplied buffer to hold compatibility info
+ *
+ * @return
+ * - \ref NVML_SUCCESS vGPU compatibility information was successfully returned
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuMetadata or \a pgpuMetadata or \a compatibilityInfo are NULL
+ * - \ref NVML_ERROR_UNKNOWN on any unexpected error
+ */
+nvmlReturn_t DECLDIR nvmlGetVgpuCompatibility(nvmlVgpuMetadata_t *vgpuMetadata, nvmlVgpuPgpuMetadata_t *pgpuMetadata, nvmlVgpuPgpuCompatibility_t *compatibilityInfo);
+
+/** @} */
+
+/***************************************************************************************************/
+/** @defgroup nvmlGpuBlacklistQueries GPU Blacklist Queries
+ * This chapter describes NVML operations that are associated with blacklisted GPUs.
+ * @{
+ */
+/***************************************************************************************************/
+
+/**
+ * Blacklist GPU device information
+ **/
+typedef struct nvmlBlacklistDeviceInfo_st
+{
+ nvmlPciInfo_t pciInfo; //!< The PCI information for the blacklisted GPU
+ char uuid[NVML_DEVICE_UUID_BUFFER_SIZE]; //!< The ASCII string UUID for the blacklisted GPU
+} nvmlBlacklistDeviceInfo_t;
+
+ /**
+ * Retrieves the number of blacklisted GPU devices in the system.
+ *
+ * For all products.
+ *
+ * @param deviceCount Reference in which to return the number of blacklisted devices
+ *
+ * @return
+ * - \ref NVML_SUCCESS if \a deviceCount has been set
+ * - \ref NVML_ERROR_INVALID_ARGUMENT if \a deviceCount is NULL
+ */
+nvmlReturn_t DECLDIR nvmlGetBlacklistDeviceCount(unsigned int *deviceCount);
+
+/**
+ * Acquire the device information for a blacklisted device, based on its index.
+ *
+ * For all products.
+ *
+ * Valid indices are derived from the \a deviceCount returned by
+ * \ref nvmlGetBlacklistDeviceCount(). For example, if \a deviceCount is 2 the valid indices
+ * are 0 and 1, corresponding to GPU 0 and GPU 1.
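+ *
+ * As an editorial illustration (not part of the original header, and assuming
+ * <stdio.h> is available), enumerating all blacklisted devices could look like:
+ *
+ *     unsigned int n = 0;
+ *     if (nvmlGetBlacklistDeviceCount(&n) == NVML_SUCCESS) {
+ *         for (unsigned int i = 0; i < n; ++i) {
+ *             nvmlBlacklistDeviceInfo_t info;
+ *             // Fetch the PCI info and UUID of the i-th blacklisted GPU.
+ *             if (nvmlGetBlacklistDeviceInfoByIndex(i, &info) == NVML_SUCCESS)
+ *                 printf("%s %s\n", info.pciInfo.busId, info.uuid);
+ *         }
+ *     }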
+ * + * @param index The index of the target GPU, >= 0 and < \a deviceCount + * @param info Reference in which to return the device information + * + * @return + * - \ref NVML_SUCCESS if \a device has been set + * - \ref NVML_ERROR_INVALID_ARGUMENT if \a index is invalid or \a info is NULL + * + * @see nvmlGetBlacklistDeviceCount + */ +nvmlReturn_t DECLDIR nvmlGetBlacklistDeviceInfoByIndex(unsigned int index, nvmlBlacklistDeviceInfo_t *info); + /** @} */ /** * NVML API versioning support */ #if defined(__NVML_API_VERSION_INTERNAL) +#undef nvmlDeviceRemoveGpu #undef nvmlDeviceGetNvLinkRemotePciInfo #undef nvmlDeviceGetPciInfo #undef nvmlDeviceGetCount From f58c9a4d2baf635b53f278ef0115687b22bfacf4 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 5 Jun 2019 14:43:38 -0700 Subject: [PATCH 162/494] [general] hmma baseline setup --- examples/cpp/dot.cpp | 4 +-- examples/python/tensorflow/CMakeLists.txt | 2 +- examples/python/tensorflow/dot.cpp | 38 +++++++++++------------ examples/python/tensorflow/run.py | 19 +++++++----- include/triton/ir/builder.h | 1 + include/triton/lang/ops.h | 2 +- include/triton/lang/parser.y | 3 +- include/triton/lang/scanner.l | 1 + include/triton/runtime/jit.h | 1 + lib/driver/module.cpp | 2 -- lib/ir/builder.cpp | 3 ++ lib/ir/ir.cpp | 0 lib/lang/declaration.cpp | 1 + lib/lang/node.cpp | 6 ++++ 14 files changed, 50 insertions(+), 33 deletions(-) delete mode 100644 lib/ir/ir.cpp diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index abaed5ff3..8b7559f55 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -16,7 +16,7 @@ int main() { triton::jit jit(context); // matrix multiplication parameters - int32_t M = 512, N = 512, K = 512; + int32_t M = 2048, N = 2048, K = 2048; std::vector hc(M*N); std::vector rc(M*N); std::vector ha(M*K); @@ -60,7 +60,7 @@ int main() { // just-in-time compile source-code std::string src = triton::dnn::gemm::src(AT, BT); -// jit.autotune("matmul",src.c_str(), benchmark); + jit.autotune("matmul",src.c_str(), benchmark); jit.add_module("matmul", src.c_str(), triton::dnn::gemm::default_params(AT, BT)); triton::driver::kernel* kernel = jit.get_function("matmul"); triton::jit::launch_information info = jit.get_launch_info("matmul"); diff --git a/examples/python/tensorflow/CMakeLists.txt b/examples/python/tensorflow/CMakeLists.txt index 1ce055203..6c8a6f008 100644 --- a/examples/python/tensorflow/CMakeLists.txt +++ b/examples/python/tensorflow/CMakeLists.txt @@ -4,7 +4,7 @@ if(${TensorFlow_FOUND}) include_directories("${TF_INC}/tensorflow/include") include_directories("${CUDA_HOME}/include") link_directories(${TF_LIB}) - add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) + add_definitions(-D_GLIBCXX_USE_CXX11_ABI=${TF_ABI}) add_library(tf_blocksparse SHARED dot.cpp) target_link_libraries(tf_blocksparse tensorflow_framework triton) endif() diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index 70ab8c386..c87b054fa 100644 --- a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -25,7 +25,8 @@ const tunable int32 TN = {16, 32, 64, 128}; const tunable int32 TK = {8}; const tunable int32 GZ = {1}; -void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C, +void matmul(restrict read_only fp16 *A, restrict read_only fp16 *B, + fp32 *C, int32 M, int32 N, int32 K, int32 lda, int32 ldb, int32 ldc, int32 *locks, int32 grid0, int32 grid1) { @@ -39,10 +40,10 @@ void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C, int32 rem = K % GZ; K = 
select(rz < rem, div - 1, div); int32 offk = select(rz < rem, rz*(div + 1), rz*div + rem); - fp32* pa[TM, TK] = A + (offk + rka[newaxis, :])*lda + rxa[:, newaxis]; - fp32* pb[TN, TK] = B + (offk + rkb[newaxis, :])*ldb + ryb[:, newaxis]; - fp32 a[TM, TK] = *pa; - fp32 b[TN, TK] = *pb; + fp16* pa[TM, TK] = A + (offk + rka[newaxis, :])*lda + rxa[:, newaxis]; + fp16* pb[TN, TK] = B + (offk + rkb[newaxis, :])*ldb + ryb[:, newaxis]; + fp16 a[TM, TK] = *pa; + fp16 b[TN, TK] = *pb; int32 last_a = ((M*K - 1) - (TM*TK + 1)) / lda; int32 last_b = ((K*N - 1) - (TN*TK + 1)) / ldb; last_a = last_a / TK * TK; @@ -60,10 +61,10 @@ void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C, for(int32 k = bound; k > 0; k = k - 1){ int1 checka[TM, 1] = rxc[:, newaxis] < M; int1 checkb[TN, 1] = ryc[:, newaxis] < N; - fp32* pa[TM, 1] = A + (offk + K - k)*lda + rxc[:, newaxis]; - fp32* pb[TN, 1] = B + (offk + K - k)*ldb + ryc[:, newaxis]; - fp32 a[TM, 1] = checka ? *pa : 0; - fp32 b[TN, 1] = checkb ? *pb : 0; + fp16* pa[TM, 1] = A + (offk + K - k)*lda + rxc[:, newaxis]; + fp16* pb[TN, 1] = B + (offk + K - k)*ldb + ryc[:, newaxis]; + fp16 a[TM, 1] = checka ? *pa : 0; + fp16 b[TN, 1] = checkb ? *pb : 0; c = dot(a, trans(b), c); } int32 ridx = get_range_id(0); @@ -89,13 +90,6 @@ void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C, } )"; -REGISTER_OP("Dot") - .Input("a: T") - .Input("b: T") - .Input("locks: int32") - .Output("c: T") - .Attr("T: {float}") -; class BlockSparseGemmOp : public OpKernel { public: @@ -126,8 +120,8 @@ class BlockSparseGemmOp : public OpKernel { // initialize default compute device triton::jit jit(ctx); // matrix multiplication parameters - triton::driver::cu_buffer da(ctx, (CUdeviceptr)a.flat().data(), false); - triton::driver::cu_buffer db(ctx, (CUdeviceptr)b.flat().data(), false); + triton::driver::cu_buffer da(ctx, (CUdeviceptr)a.flat().data(), false); + triton::driver::cu_buffer db(ctx, (CUdeviceptr)b.flat().data(), false); triton::driver::cu_buffer dc(ctx, (CUdeviceptr)c->flat().data(), false); triton::driver::cu_buffer dlocks(ctx, (CUdeviceptr)locks.flat().data(), false); stream->synchronize(); @@ -160,4 +154,10 @@ class BlockSparseGemmOp : public OpKernel { private: }; -REGISTER_KERNEL_BUILDER(Name("Dot").Device(DEVICE_GPU).TypeConstraint("T"), BlockSparseGemmOp); +REGISTER_KERNEL_BUILDER(Name("Dot").Device(DEVICE_GPU), BlockSparseGemmOp); +REGISTER_OP("Dot") + .Input("a: float16") + .Input("b: float16") + .Input("locks: int32") + .Output("c: float32") +; diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 5a721def9..194e6e9ed 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -3,18 +3,23 @@ import tensorflow as tf import numpy as np data_files_path = tf.resource_loader.get_data_files_path() -library_dir = '/home/philippe/Development/triton/build/examples/python/tensorflow' +library_dir = '/home/philippe/development/triton/build/examples/python/tensorflow' module = tf.load_op_library(os.path.join(library_dir, 'libtf_blocksparse.so')) M, N, K = 512, 512, 512 -a = tf.placeholder(tf.float32, shape=[M, K]) -b = tf.placeholder(tf.float32, shape=[N, K]) +a = tf.placeholder(tf.float16, shape=[M, K]) +b = tf.placeholder(tf.float16, shape=[N, K]) locks = tf.placeholder(tf.int32, shape=[4096]) -c = module.block_sparse_mat_mul(a, b, locks) +c = module.dot(a, b, locks) +# Reference +ha = np.random.rand(M, K).astype(np.float16) +hb = np.random.rand(N, K).astype(np.float16) 
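+# Editorial note (not in the original patch): the kernel reads the row-major
+# TF buffers as column-major with leading dimensions lda = M, ldb = N, ldc = M,
+# which for this square M = N = K case appears to make np.dot(hb.T, ha) the
+# matching NumPy reference.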
+hresult = np.dot(hb.T, ha) + # Run sess = tf.InteractiveSession() sess.run(tf.global_variables_initializer()) result = sess.run([c], feed_dict = {locks: np.zeros(4096), - a: np.random.rand(M, K), - b: np.random.rand(N, K)}) -print(result) + a: ha, + b: hb}) +print(result - hresult) diff --git a/include/triton/ir/builder.h b/include/triton/ir/builder.h index 51dd656d3..48b1d172d 100644 --- a/include/triton/ir/builder.h +++ b/include/triton/ir/builder.h @@ -40,6 +40,7 @@ public: type *get_int16_ty(); type *get_int32_ty(); type *get_int64_ty(); + type *get_half_ty(); type *get_float_ty(); type *get_double_ty(); // Insert diff --git a/include/triton/lang/ops.h b/include/triton/lang/ops.h index 9328be921..38fc200bf 100644 --- a/include/triton/lang/ops.h +++ b/include/triton/lang/ops.h @@ -35,7 +35,7 @@ enum TYPE_T{ VOID_T, UINT1_T, UINT8_T, UINT16_T, UINT32_T, UINT64_T, INT1_T, INT8_T, INT16_T, INT32_T, INT64_T, - FLOAT32_T, FLOAT64_T + FLOAT16_T, FLOAT32_T, FLOAT64_T }; enum STORAGE_SPEC_T{ diff --git a/include/triton/lang/parser.y b/include/triton/lang/parser.y index 66d7c1770..18fc3bbed 100644 --- a/include/triton/lang/parser.y +++ b/include/triton/lang/parser.y @@ -52,7 +52,7 @@ STORAGE_SPEC_T get_storage_spec(node *op) { return ((token*)op)->storage_spec;} %token AND_OP OR_OP MUL_ASSIGN DIV_ASSIGN MOD_ASSIGN ADD_ASSIGN %token SUB_ASSIGN LEFT_ASSIGN RIGHT_ASSIGN AND_ASSIGN %token XOR_ASSIGN OR_ASSIGN TYPE_NAME -%token VOID UINT1 UINT8 UINT16 UINT32 UINT64 INT1 INT8 INT16 INT32 INT64 FP32 FP64 +%token VOID UINT1 UINT8 UINT16 UINT32 UINT64 INT1 INT8 INT16 INT32 INT64 FP16 FP32 FP64 %token IF ELSE FOR CONTINUE WHILE %token NEWAXIS ELLIPSIS AT %token GET_GLOBAL_RANGE GET_RANGE_ID DOT TRANS MAX MIN SELECT ATOMIC_CAS ATOMIC_EXCHG ALLOC_CONST @@ -77,6 +77,7 @@ type_specifier | INT16 { $$ = new token(INT16_T); } | INT32 { $$ = new token(INT32_T); } | INT64 { $$ = new token(INT64_T); } + | FP16 { $$ = new token(FLOAT16_T); } | FP32 { $$ = new token(FLOAT32_T); } | FP64 { $$ = new token(FLOAT64_T); } ; diff --git a/include/triton/lang/scanner.l b/include/triton/lang/scanner.l index b1160fb1c..a2cd50922 100644 --- a/include/triton/lang/scanner.l +++ b/include/triton/lang/scanner.l @@ -38,6 +38,7 @@ using triton::lang::return_void; "int16" { return return_impl(INT16, yytext); } "int32" { return return_impl(INT32, yytext); } "int64" { return return_impl(INT64, yytext); } +"fp16" { return return_impl(FP16, yytext); } "fp32" { return return_impl(FP32, yytext); } "fp64" { return return_impl(FP64, yytext); } "..." 
{ return return_impl(ELLIPSIS, yytext); } diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index 476d25f5a..424a00e6d 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -65,6 +65,7 @@ public: void target_independent(ir::module &module) { optimize_dot.run(module); optimize_trans.run(module); +// ir::print(module, std::cout); } void target_dependent(ir::module &module) { diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 3f595b318..19c9baccb 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -247,8 +247,6 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { return std::string(buffer.begin(), buffer.end()); } - - cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index d82ee2c3b..5de366045 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -56,6 +56,9 @@ type *builder::get_int32_ty() type *builder::get_int64_ty() { return type::get_int64_ty(ctx_); } +type *builder::get_half_ty() +{ return type::get_half_ty(ctx_); } + type *builder::get_float_ty() { return type::get_float_ty(ctx_); } diff --git a/lib/ir/ir.cpp b/lib/ir/ir.cpp deleted file mode 100644 index e69de29bb..000000000 diff --git a/lib/lang/declaration.cpp b/lib/lang/declaration.cpp index d4a73ef00..46fa6b597 100644 --- a/lib/lang/declaration.cpp +++ b/lib/lang/declaration.cpp @@ -21,6 +21,7 @@ ir::type* typed_declaration_specifier::type(ir::module *mod) const { case INT16_T: return ir::type::get_int16_ty(ctx); case INT32_T: return ir::type::get_int32_ty(ctx); case INT64_T: return ir::type::get_int64_ty(ctx); + case FLOAT16_T: return ir::type::get_half_ty(ctx); case FLOAT32_T: return ir::type::get_float_ty(ctx); case FLOAT64_T: return ir::type::get_double_ty(ctx); default: throw std::runtime_error("unreachable"); diff --git a/lib/lang/node.cpp b/lib/lang/node.cpp index f25a5fdf5..418a86fca 100644 --- a/lib/lang/node.cpp +++ b/lib/lang/node.cpp @@ -69,6 +69,12 @@ void node::implicit_cast(ir::builder &builder, ir::value *&lhs, ir::value *&rhs, to_convert = explicit_cast(builder, to_convert, builder.get_float_ty()); is_float = true; } + // One operand is half + else if(left_ty->is_half_ty() || right_ty->is_half_ty()){ + ir::value *&to_convert = left_ty->is_half_ty()?rhs:lhs; + to_convert = explicit_cast(builder, to_convert, builder.get_half_ty()); + is_float = true; + } // Both operands are integers else if(left_ty->is_integer_ty() && right_ty->is_integer_ty()){ is_int = true; From cdf5a0d011961e0f51936f6c09e2884486ba124c Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 6 Jun 2019 16:48:32 -0700 Subject: [PATCH 163/494] [codegen/tune]: added fragmentation types --- examples/python/tensorflow/dot.cpp | 6 +-- examples/python/tensorflow/run.py | 4 +- include/triton/codegen/selection.h | 16 ++++-- include/triton/codegen/tune.h | 13 ++++- lib/codegen/selection.cpp | 15 +++--- lib/codegen/tune.cpp | 78 ++++++++++++++++++++++++------ lib/dnn/gemm.cpp | 4 +- 7 files changed, 101 insertions(+), 35 deletions(-) diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index c87b054fa..d02c8a56e 100644 --- a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -20,8 +20,8 @@ using GPUDevice = Eigen::GpuDevice; const char* src 
= R"(
-const tunable int32 TM = {16, 32, 64, 128};
-const tunable int32 TN = {16, 32, 64, 128};
+const tunable int32 TM = {16};
+const tunable int32 TN = {16};
 const tunable int32 TK = {8};
 const tunable int32 GZ = {1};
@@ -126,7 +126,7 @@ class BlockSparseGemmOp : public OpKernel {
 triton::driver::cu_buffer dlocks(ctx, (CUdeviceptr)locks.flat().data(), false);
 stream->synchronize();
 // just-in-time compile source-code
- jit.add_module("matmul", src, {16, 2, 64, 16, 2, 64, 16, 8, 2, 2, 8, 8, 8, 1});
+ jit.add_module("matmul", src, {8, 2, 16, 8, 2, 16, 8, 8, 2, 2, 8, 8, 8, 1});
 triton::driver::kernel* kernel = jit.get_function("matmul");
 triton::jit::launch_information info = jit.get_launch_info("matmul");
 // launch info
diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py
index 194e6e9ed..1fd609a8f 100644
--- a/examples/python/tensorflow/run.py
+++ b/examples/python/tensorflow/run.py
@@ -20,6 +20,6 @@ hresult = np.dot(hb.T, ha)
 sess = tf.InteractiveSession()
 sess.run(tf.global_variables_initializer())
 result = sess.run([c], feed_dict = {locks: np.zeros(4096),
- a: ha,
- b: hb})
+ a: ha,
+ b: hb})
 print(result - hresult)
diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h
index d9ce08c53..a8b46e716 100644
--- a/include/triton/codegen/selection.h
+++ b/include/triton/codegen/selection.h
@@ -70,6 +70,7 @@ private:
 unsigned vector_size_;
};

+// Distributed tile
class distributed_tile: public tile{
 typedef std::vector axes_t;
 typedef std::vector ordered_indices_vec_t;
@@ -98,6 +99,15 @@ private:

};

+// Fragmented tile
+class fragmented_tile: public tile{
+public:
+
+private:
+
+};
+
+// Selection pass
class selection{
 typedef std::map vmap_t;
 typedef std::map tmap_t;
@@ -118,9 +128,9 @@ private:

 // grid construction
 void create_grids(std::vector &grids,
- std::map &references,
+ std::map &references,
 ir::function *fn);
- void create_tile(ir::value *v, llvm::IRBuilder<> &builder, const std::map &references, std::set &seen, llvm::Value *sh_mem_ptr);
+ void create_tile(ir::value *v, llvm::IRBuilder<> &builder, const std::map &references, std::set &seen, llvm::Value *sh_mem_ptr);
 void init_axes(ir::value *i, llvm::IRBuilder<> &builder, llvm::Value *u_thread_id, llvm::Value *u_warp_id);
 void init_grids(ir::function *fn, llvm::IRBuilder<> &builder, llvm::Value *sh_mem_ptr);
@@ -143,7 +153,7 @@ private:
 tune *params_;
 target *tgt_;
 shmem_info *buffer_info_;
- std::map axes_;
+ std::map axes_;
 llvm::Value *sh_mem_ptr_;
};

diff --git a/include/triton/codegen/tune.h b/include/triton/codegen/tune.h
index 9b81fcb53..43a731a32 100644
--- a/include/triton/codegen/tune.h
+++ b/include/triton/codegen/tune.h
@@ -21,11 +21,17 @@ class tune {
 typedef std::pair node_t;
 typedef std::map > graph_t;

+ enum fragment_t{
+ STRIDED_SCAN,
+ HMMA_FRAGMENT_C
+ };
+
private:
 void add_constraint(node_t x, node_t y);
 void init_c_phi(ir::instruction *i);
 void init_c_graph(ir::instruction *v);
- void connected_components(node_t x, const std::vector mps, std::set &nodes, graph_t &graph);
+ fragment_t get_fragmentation_type(node_t x, graph_t &graph);
+ void connected_components(node_t x, const std::vector mps, const std::vector prefixes, std::set &nodes, graph_t &graph, unsigned group_id);
 void create_grids(std::vector &grids,
 std::map &references,
 ir::function *fn);
@@ -34,7 +40,8 @@ public:
 std::vector get_params(ir::module& mod);
 std::map get_params(ir::instruction* i);
 ir::metaparameter* get_param(ir::value *value, const std::string &key) { return
params_[value][key]; } - void copy(ir::value *dst, ir::value *src) { params_[dst] = params_[src]; } + unsigned get_param_group(ir::value *value, unsigned ax); + void copy(ir::value *dst, ir::value *src) { params_[dst] = params_[src]; groups_[dst] = groups_[src]; } bool check_constraints(std::map> &errors); void run(ir::module &mod); void init(ir::module &mod); @@ -46,12 +53,14 @@ private: std::vector pool_; graph_t dependencies_; std::set nodes_; + std::map fragments_; std::map static_params_; std::map> params_; std::map global_range_sizes_; unsigned num_global_ranges_; unsigned num_threads_; std::vector grids_; + std::map> groups_; }; diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 6b638abaa..ab26b9e5d 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -459,12 +459,12 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id unsigned offset = n / contiguous[k] * per_block + n % contiguous[k]; idx_list[n] = builder.CreateAdd(thread_id, builder.getInt32(offset), "idx_" + str_k + "_" + std::to_string(n)); } - axes_[params_->get_param(v, "nts.d" + str_k)] = distributed_axis{contiguous[k], idx_list}; + axes_[params_->get_param_group(v, k)] = distributed_axis{contiguous[k], idx_list}; } } void selection::create_grids(std::vector &grids, - std::map &references, + std::map &references, ir::function *fn) { // get number of dimensions greater than 1 auto get_tile_gt1_dim = [&](ir::value *v){ @@ -479,7 +479,7 @@ void selection::create_grids(std::vector &grids, std::function bind_references = [&](ir::value *v) { // skip - if(!v->get_type()->is_tile_ty() || !seen.insert(v).second) + if(!v->get_type()->is_tile_ty() || !seen.insert(v).second || dynamic_cast(v)) return; // recurse if(auto *user = dynamic_cast(v)) @@ -492,7 +492,7 @@ void selection::create_grids(std::vector &grids, for(size_t d = 0; d < shapes.size(); d++){ if(shapes[d]->get_value() == 1) continue; - ir::metaparameter *x = params_->get_param(v, "nts.d" + std::to_string(d)); + unsigned x = params_->get_param_group(v, d); ir::value *&r = references[x]; if(!r || get_tile_gt1_dim(v) > get_tile_gt1_dim(r)) r = v; @@ -517,7 +517,7 @@ bool static inline has_phi_user(ir::value *v) { return false; } void selection::create_tile(ir::value *v, IRBuilder<> &builder, - const std::map& references, + const std::map& references, std::set &seen, Value *sh_mem_ptr) { if(!v->get_type()->is_tile_ty() || !seen.insert(v).second) return; @@ -576,7 +576,7 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, std::vector axes(cshapes.size()); for(size_t d = 0; d < cshapes.size(); d++){ if(cshapes[d]->get_value() > 1){ - ir::metaparameter *x = params_->get_param(v, "nts.d" + std::to_string(d)); + unsigned x = params_->get_param_group(v, d); axes[d] = axes_.at(x); } else{ @@ -607,7 +607,7 @@ void selection::init_grids(ir::function *fn, IRBuilder<> &builder, Value *sh_mem Value *u_warp_id = builder.CreateUDiv(u_thread_id, warp_size); // create grid std::vector grids; - std::map references; + std::map references; create_grids(grids, references, fn); for(ir::value* i: grids){ if(auto *instr = dynamic_cast(i)) @@ -812,7 +812,6 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & std::swap(b_idx[0], b_idx[1]); Value *a = TA->get_value(a_idx); Value *b = TB->get_value(b_idx); -// res = builder.CreateCall(f_mul_add, {ConstantFP::get(a->getType(), 1), ConstantFP::get(b->getType(), 1), res}); res = builder.CreateCall(f_mul_add, {a, b, res}); } diff --git 
a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 9b71aea4f..f567128f0 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -15,6 +15,19 @@ namespace codegen{ tune::tune(): num_global_ranges_(0){ } +bool is_hmma(ir::value *v){ + bool result = false; + if(auto *x = dynamic_cast(v)){ + ir::value *a = x->get_operand(0); + ir::type *a_ty = a->get_type(); + ir::value *b = x->get_operand(1); + ir::type *b_ty = b->get_type(); + result = !x->is_a_trans() && x->is_b_trans(); + result = result && a_ty->get_scalar_ty()->is_half_ty() && b_ty->get_scalar_ty()->is_half_ty(); + } + return result; +} + void tune::add_constraint(node_t x, node_t y) { dependencies_[x].insert(y); dependencies_[y].insert(x); @@ -34,6 +47,7 @@ void tune::init_c_phi(ir::instruction *v) { } void tune::init_c_graph(ir::instruction *v) { + // Reference shape ir::type::tile_shapes_t::value_type one = ir::tile_type::make_one(v->get_parent()->get_context()); ir::type::tile_shapes_t shapes; @@ -83,20 +97,41 @@ void tune::init_c_graph(ir::instruction *v) { add_constraint({v, 1}, {D, 1}); } // Element-wise - else if(dynamic_cast(v)){ + else if(dynamic_cast(v)) { for(unsigned k = 0; k < v->get_num_results(); k++) - for(unsigned i = 0; i < shapes.size(); i ++) - for(ir::value* op: v->ops()) + for(unsigned i = 0; i < shapes.size(); i ++){ + for(ir::value* op: v->ops()){ add_constraint({v->get_result(k), i}, {op, i}); + } + } } } -void tune::connected_components(node_t x, const std::vector mps, std::set &nodes, graph_t &graph) { +tune::fragment_t tune::get_fragmentation_type(node_t x, graph_t &graph){ + std::list work; + std::set seen; + work.push_back(x); + while(!work.empty()){ + node_t current = work.back(); + if(is_hmma(current.first)) + return HMMA_FRAGMENT_C; + work.pop_back(); + seen.insert(current); + for(node_t y: graph[current]){ + if(seen.find(y) == seen.end()) + work.push_back(y); + } + } + return STRIDED_SCAN; +} + +void tune::connected_components(node_t x, const std::vector mps, const std::vector prefixes, std::set &nodes, graph_t &graph, unsigned group_id) { + groups_[x.first][x.second] = group_id; if(nodes.find(x) != nodes.end()){ nodes.erase(x); std::string suffix = ".d" + std::to_string(x.second); - params_[x.first].insert({"nts" + suffix, mps[0]}); - params_[x.first].insert({"mts" + suffix, mps[1]}); + for(int i = 0; i < mps.size(); i++) + params_[x.first].insert({prefixes[i] + suffix, mps[i]}); ir::type *ty = x.first->get_type(); if(ty->is_tile_ty()){ ir::type::tile_shapes_t::value_type shape = ty->get_tile_shapes().at(x.second); @@ -109,11 +144,11 @@ void tune::connected_components(node_t x, const std::vector num_global_ranges_ = std::max(num_global_ranges_, ax + 1); } if(static_params_.find(x) != static_params_.end()){ - mps[0]->set_value(static_params_.at(x)); - mps[1]->set_value(static_params_.at(x)); + for(ir::metaparameter *mp: mps) + mp->set_value(static_params_.at(x)); } for(const node_t &y: graph[x]) - connected_components(y, mps, nodes, graph); + connected_components(y, mps, prefixes, nodes, graph, group_id); } } @@ -142,6 +177,10 @@ std::map tune::get_params(ir::instruction* i) return params_.at(i); } +unsigned tune::get_param_group(ir::value *value, unsigned ax) { + unsigned result = groups_.at(value).at(ax); + return result; +} void tune::run(ir::module &mod) { ir::context &ctx = mod.get_context(); @@ -159,12 +198,21 @@ void tune::run(ir::module &mod) { if(i->has_tile_result_or_op()) init_c_phi(i); // Layout parameters - while(!nodes_.empty()){ + unsigned group_id = 0; + while(!nodes_.empty()) { 
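+ // Editorial note (not part of the original patch): each connected component
+ // of the layout graph receives one fresh set of metaparameters, and group_id
+ // tags the component so that selection can later map every tile axis to the
+ // distributed axis of its group (see get_param_group).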
ir::type *ty = mod.get_builder().get_int32_ty(); - ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 1, 1); - nts->set_value(1); - ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 4, 32); - connected_components(*nodes_.begin(), {nts, mts}, nodes_, dependencies_); + node_t node = *nodes_.begin(); + fragment_t fragment = get_fragmentation_type(node, dependencies_); + if(fragment == STRIDED_SCAN) { + ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 1, 1); + ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 4, 32); + connected_components(node, {nts, mts}, {"nts", "mts"}, nodes_, dependencies_, group_id++); + nts->set_value(1); + } + else { + ir::metaparameter *fpw = ir::metaparameter::create(ctx, ty, 1, 4); + connected_components(node, {fpw}, {"fpw"}, nodes_, dependencies_, group_id++); + } } } @@ -269,7 +317,7 @@ bool tune::check_constraints(std::map> &er int num_threads = 1; for(size_t k = 0; k < shapes.size(); k++) num_threads *= params_[i]["mts.d" + to_string(k)]->get_value(); - if(num_threads % 64 != 0) + if(num_threads % 32 != 0) errors[i].push_back("number of threads per block (" + to_string(num_threads) + ") must be multiple of warp size"); if(num_threads != num_threads_) errors[i].push_back("Number of threads must be the same for all tiles (" + to_string(num_threads_) + ")"); diff --git a/lib/dnn/gemm.cpp b/lib/dnn/gemm.cpp index 59f413d81..940b256b2 100644 --- a/lib/dnn/gemm.cpp +++ b/lib/dnn/gemm.cpp @@ -62,8 +62,8 @@ std::string gemm::src(bool AT, bool BT) { } std::string res = R"( -const tunable int32 TM = {16, 32, 64, 128}; -const tunable int32 TN = {16, 32, 64, 128}; +const tunable int32 TM = {16}; +const tunable int32 TN = {16}; const tunable int32 TK = {8}; const tunable int32 GZ = {1}; From 81eba3e1ec5d1532b88143b548b8708b1a942fe8 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 6 Jun 2019 19:36:41 -0700 Subject: [PATCH 164/494] ugh --- include/triton/codegen/tune.h | 1 + lib/codegen/tune.cpp | 65 +++++++++++++++++++++++++++-------- 2 files changed, 52 insertions(+), 14 deletions(-) diff --git a/include/triton/codegen/tune.h b/include/triton/codegen/tune.h index 43a731a32..6c08f2ea0 100644 --- a/include/triton/codegen/tune.h +++ b/include/triton/codegen/tune.h @@ -33,6 +33,7 @@ private: fragment_t get_fragmentation_type(node_t x, graph_t &graph); void connected_components(node_t x, const std::vector mps, const std::vector prefixes, std::set &nodes, graph_t &graph, unsigned group_id); void create_grids(std::vector &grids, std::map &references, ir::function *fn); + unsigned get_req_num_threads(ir::instruction *i); public: diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index f567128f0..4b8e405bc 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -100,8 +100,9 @@ void tune::init_c_graph(ir::instruction *v) { else if(dynamic_cast(v)) { for(unsigned k = 0; k < v->get_num_results(); k++) for(unsigned i = 0; i < shapes.size(); i ++){ + ir::value *result = v->get_result(k); for(ir::value* op: v->ops()){ - add_constraint({v->get_result(k), i}, {op, i}); + add_constraint({result, i}, {op, i}); } } } @@ -199,20 +200,23 @@ void tune::run(ir::module &mod) { init_c_phi(i); // Layout parameters unsigned group_id = 0; +// for(auto x: nodes_){ +// fragments_[x] = STRIDED_SCAN; +// } while(!nodes_.empty()) { ir::type *ty = mod.get_builder().get_int32_ty(); node_t node = *nodes_.begin(); - fragment_t fragment = get_fragmentation_type(node, dependencies_); - if(fragment == STRIDED_SCAN) { +// if(fragments_[node] == 
STRIDED_SCAN) {
 ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 1, 1);
 ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 4, 32);
 connected_components(node, {nts, mts}, {"nts", "mts"}, nodes_, dependencies_, group_id++);
 nts->set_value(1);
- }
- else {
- ir::metaparameter *fpw = ir::metaparameter::create(ctx, ty, 1, 4);
- connected_components(node, {fpw}, {"fpw"}, nodes_, dependencies_, group_id++);
- }
+// }
+// else {
+// ir::metaparameter *fpw = ir::metaparameter::create(ctx, ty, 1, 4);
+// ir::metaparameter *wpb = ir::metaparameter::create(ctx, ty, 1, 4);
+// connected_components(node, {fpw, wpb}, {"fpw", "wpb"}, nodes_, dependencies_, group_id++);
+// }
 }
}
@@ -220,6 +224,8 @@
 for(ir::function *fn: mod.get_function_list())
 for(ir::basic_block *block: fn->blocks())
 for(ir::instruction *i : block->get_inst_list()){
+// if(fragments_.find({i, 0}) != fragments_.end() && fragments_.at({i, 0}) != STRIDED_SCAN)
+// continue;
 if(dynamic_cast(i) && i->get_type()->is_tile_ty()){
 ir::type *ty = mod.get_builder().get_int32_ty();
 std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 2, 2));
@@ -250,6 +256,23 @@ void tune::init(ir::module &mod) {
 }
}

+unsigned tune::get_req_num_threads(ir::instruction *i){
+// if(fragments_.at({i, 0}) == STRIDED_SCAN) {
+// unsigned result = 1;
+// for(unsigned k = 0; k < i->get_type()->get_tile_shapes().size(); k++){
+// std::string suffix = ".d" + std::to_string(k);
+// result *= params_.at(i).at("mts" + suffix)->get_value();
+// }
+// }
+// else {
+ unsigned result = 32;
+ for(unsigned k = 0; k < i->get_type()->get_tile_shapes().size(); k++){
+ std::string suffix = ".d" + std::to_string(k);
+ result *= params_.at(i).at("wpt" + suffix)->get_value();
+ }
+// }
+}
+
 void tune::create_grids(std::vector &grids,
 std::map &references,
 ir::function *fn) {
@@ -307,16 +330,30 @@ bool tune::check_constraints(std::map> &er
 // must divide the shape
 for(size_t k = 0; k < shapes.size(); k++) {
 std::string strk = to_string(k);
- ir::metaparameter *mts = params_[i]["mts.d" + strk];
- ir::metaparameter *nts = params_[i]["nts.d" + strk];
- unsigned multiple = mts->get_value()*nts->get_value();
+ unsigned multiple;
+// if(fragments_.at({i, 0}) == STRIDED_SCAN) {
+ ir::metaparameter *mts = params_[i]["mts.d" + strk];
+ ir::metaparameter *nts = params_[i]["nts.d" + strk];
+ multiple = mts->get_value()*nts->get_value();
+// }
+// else {
+// ir::metaparameter *fpw = params_[i]["fpw.d" + strk];
+// ir::metaparameter *wpt = params_[i]["wpt.d" + strk];
+// multiple = fpw->get_value()*wpt->get_value();
+// }
 if(shapes[k]->get_value() % multiple != 0)
 errors[i].push_back("for dim " + strk + ": shape (" + to_string(shapes[k]->get_value()) + ")"
 " is not a multiple of layout (" + to_string(multiple) + ")");
 }
- int num_threads = 1;
- for(size_t k = 0; k < shapes.size(); k++)
- num_threads *= params_[i]["mts.d" + to_string(k)]->get_value();
+ // the product of mma fragments per warp must be 4
+// if(fragments_.at({i, 0}) == HMMA_FRAGMENT_C){
+// unsigned prod = 1;
+// for(size_t k = 0; k < shapes.size(); k++)
+// prod *= params_[i]["fpw.d" + std::to_string(k)]->get_value();
+// if(prod != 4)
+// errors[i].push_back("HMMA must have only 4 fragments per warp");
+// }
+ int num_threads = get_req_num_threads(i);
 if(num_threads % 32 != 0)
 errors[i].push_back("number of threads per block (" + to_string(num_threads) + ") must be multiple of warp size");
 if(num_threads != num_threads_)
From 0a0b48e9a291369b35744076dcfbdfee5c891373 Mon Sep 17
00:00:00 2001 From: Philippe Tillet Date: Thu, 6 Jun 2019 19:51:02 -0700 Subject: [PATCH 165/494] adding hmma tuning parameters --- lib/codegen/tune.cpp | 91 ++++++++++++++++++++++---------------------- 1 file changed, 46 insertions(+), 45 deletions(-) diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 4b8e405bc..4a9940400 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -200,23 +200,23 @@ void tune::run(ir::module &mod) { init_c_phi(i); // Layout parameters unsigned group_id = 0; -// for(auto x: nodes_){ -// fragments_[x] = STRIDED_SCAN; -// } + for(auto x: nodes_){ + fragments_[x] = get_fragmentation_type(x, dependencies_); + } while(!nodes_.empty()) { ir::type *ty = mod.get_builder().get_int32_ty(); node_t node = *nodes_.begin(); -// if(fragments_[node] == STRIDED_SCAN) { + if(fragments_[node] == STRIDED_SCAN) { ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 1, 1); ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 4, 32); connected_components(node, {nts, mts}, {"nts", "mts"}, nodes_, dependencies_, group_id++); nts->set_value(1); -// } -// else { -// ir::metaparameter *fpw = ir::metaparameter::create(ctx, ty, 1, 4); -// ir::metaparameter *wpb = ir::metaparameter::create(ctx, ty, 1, 4); -// connected_components(node, {fpw, wpb}, {"fpw", "wpb"}, nodes_, dependencies_, group_id++); -// } + } + else { + ir::metaparameter *fpw = ir::metaparameter::create(ctx, ty, 1, 4); + ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 1, 4); + connected_components(node, {fpw, wpt}, {"fpw", "wpt"}, nodes_, dependencies_, group_id++); + } } } @@ -224,8 +224,8 @@ void tune::run(ir::module &mod) { for(ir::function *fn: mod.get_function_list()) for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i : block->get_inst_list()){ -// if(fragments_.find({i, 0}) != fragments_.end() && fragments_.at({i, 0}) != STRIDED_SCAN) -// continue; + if(fragments_.find({i, 0}) != fragments_.end() && fragments_.at({i, 0}) != STRIDED_SCAN) + continue; if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 2, 2)); @@ -248,29 +248,25 @@ void tune::init(ir::module &mod) { create_grids(grids_, references, fn); } // number of threads - num_threads_ = 1; - ir::instruction *first = grids_.front(); - for(unsigned k = 0; k < first->get_type()->get_tile_shapes().size(); k++){ - std::string suffix = ".d" + std::to_string(k); - num_threads_ *= params_.at(first).at("mts" + suffix)->get_value(); - } + num_threads_ = get_req_num_threads(grids_.front()); } unsigned tune::get_req_num_threads(ir::instruction *i){ -// if(fragments_.at({i, 0}) == STRIDED_SCAN) { -// unsigned result = 1; -// for(unsigned k = 0; k < i->get_type()->get_tile_shapes().size(); k++){ -// std::string suffix = ".d" + std::to_string(k); -// result *= params_.at(i).at("mts" + suffix)->get_value(); -// } -// } -// else { + if(fragments_.at({i, 0}) == STRIDED_SCAN) { + unsigned result = 1; + for(unsigned k = 0; k < i->get_type()->get_tile_shapes().size(); k++){ + std::string suffix = ".d" + std::to_string(k); + result *= params_.at(i).at("mts" + suffix)->get_value(); + } + return result; + } + else { unsigned result = 32; for(unsigned k = 0; k < i->get_type()->get_tile_shapes().size(); k++){ std::string suffix = ".d" + std::to_string(k); result *= params_.at(i).at("wpt" + suffix)->get_value(); } -// } + } } void tune::create_grids(std::vector &grids, @@ -310,10 +306,15 @@ bool tune::check_constraints(std::map> 
&er auto get_num_warps = [&](ir::instruction *i, unsigned axis) { std::string strk = to_string(axis); - unsigned mts = params_[i]["mts.d" + strk]->get_value(); - unsigned nts = params_[i]["nts.d" + strk]->get_value(); - unsigned shape = i->get_type()->get_tile_shapes()[axis]->get_value(); - return shape / (mts * nts); + if(fragments_.at({i, axis}) == STRIDED_SCAN){ + unsigned mts = params_[i]["mts.d" + strk]->get_value(); + unsigned nts = params_[i]["nts.d" + strk]->get_value(); + unsigned shape = i->get_type()->get_tile_shapes()[axis]->get_value(); + return shape / (mts * nts); + } + else{ + return (unsigned)params_[i]["wpt.d" + strk]->get_value(); + } }; // number of warps @@ -331,28 +332,28 @@ bool tune::check_constraints(std::map> &er for(size_t k = 0; k < shapes.size(); k++) { std::string strk = to_string(k); unsigned multiple; -// if(fragments_.at({i, 0}) == STRIDED_SCAN) { + if(fragments_.at({i, 0}) == STRIDED_SCAN) { ir::metaparameter *mts = params_[i]["mts.d" + strk]; ir::metaparameter *nts = params_[i]["nts.d" + strk]; multiple = mts->get_value()*nts->get_value(); -// } -// else { -// ir::metaparameter *fpw = params_[i]["fpw.d" + strk]; -// ir::metaparameter *wpt = params_[i]["wpt.d" + strk]; -// multiple = fpw->get_value()*wpt->get_value(); -// } + } + else { + ir::metaparameter *fpw = params_[i]["fpw.d" + strk]; + ir::metaparameter *wpt = params_[i]["wpt.d" + strk]; + multiple = fpw->get_value()*wpt->get_value(); + } if(shapes[k]->get_value() % multiple != 0) errors[i].push_back("for dim " + strk + ": shape (" + to_string(shapes[k]->get_value()) + ")" " is not a multiple of layout (" + to_string(multiple) + ")"); } // the product of mma fragments per warp must be 4 -// if(fragments_.at({i, 0}) == HMMA_FRAGMENT_C){ -// unsigned prod = 1; -// for(size_t k = 0; k < shapes.size(); k++) -// prod *= params_[i]["fpw.d" + std::to_string(k)]->get_value(); -// if(prod != 4) -// errors[i].push_back("HMMA must have only 4 fragments per warp"); -// } + if(fragments_.at({i, 0}) == HMMA_FRAGMENT_C){ + unsigned prod = 1; + for(size_t k = 0; k < shapes.size(); k++) + prod *= params_[i]["fpw.d" + std::to_string(k)]->get_value(); + if(prod != 4) + errors[i].push_back("HMMA must have only 4 fragments per warp"); + } int num_threads = get_req_num_threads(i); if(num_threads % 32 != 0) errors[i].push_back("number of threads per block (" + to_string(num_threads) + ") must be multiple of warp size"); From 6045209d5bfa0226a8f31ee9383c0be1dc665521 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 6 Jun 2019 20:13:26 -0700 Subject: [PATCH 166/494] Now find correct tuning configuration --- examples/python/tensorflow/dot.cpp | 53 +++++++++++++++++------------- lib/codegen/tune.cpp | 8 +++-- lib/runtime/jit.cpp | 6 +++- 3 files changed, 40 insertions(+), 27 deletions(-) diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index d02c8a56e..01d18c435 100644 --- a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -4,6 +4,7 @@ #include "triton/driver/backend.h" #include "triton/driver/stream.h" #include "triton/runtime/jit.h" +#include "triton/tools/bench.hpp" #define EIGEN_USE_GPU #include "tensorflow/core/framework/op.h" @@ -125,30 +126,36 @@ class BlockSparseGemmOp : public OpKernel { triton::driver::cu_buffer dc(ctx, (CUdeviceptr)c->flat().data(), false); triton::driver::cu_buffer dlocks(ctx, (CUdeviceptr)locks.flat().data(), false); stream->synchronize(); + // benchmark a given matrix multiplication kernel + auto benchmark = 
[&](triton::driver::kernel* kernel, + triton::jit::launch_information info) { + // launch info + unsigned TM = info.global_range_size[0]; + unsigned TN = info.global_range_size[1]; + unsigned nthreads = info.num_threads; + unsigned GZ = jit.get_int("GZ"); + std::array grid = {(M + TM - 1)/TM, (N + TN - 1)/TN, GZ}; + // set argument + kernel->setArg(0, *da.cu()); + kernel->setArg(1, *db.cu()); + kernel->setArg(2, *dc.cu()); + kernel->setArg(3, M); + kernel->setArg(4, N); + kernel->setArg(5, K); + kernel->setArg(6, M); + kernel->setArg(7, N); + kernel->setArg(8, M); + kernel->setArg(9, *dlocks.cu()); + kernel->setArg(10, grid[0]); + kernel->setArg(11, grid[1]); + stream->enqueue(kernel, grid, {nthreads, 1, 1}); + stream->synchronize(); + double ts = triton::tools::bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});}, + [&](){ stream->synchronize(); }, nullptr); + return 2.*M*N*K / ts * 1e-3; + }; // just-in-time compile source-code - jit.add_module("matmul", src, {8, 2, 16, 8, 2, 16, 8, 8, 2, 2, 8, 8, 8, 1}); - triton::driver::kernel* kernel = jit.get_function("matmul"); - triton::jit::launch_information info = jit.get_launch_info("matmul"); - // launch info - unsigned TM = info.global_range_size[0]; - unsigned TN = info.global_range_size[1]; - unsigned nthreads = info.num_threads; - unsigned GZ = jit.get_int("GZ"); - std::array grid = {(M + TM - 1)/TM, (N + TN - 1)/TN, GZ}; - // set argument - kernel->setArg(0, *da.cu()); - kernel->setArg(1, *db.cu()); - kernel->setArg(2, *dc.cu()); - kernel->setArg(3, M); - kernel->setArg(4, N); - kernel->setArg(5, K); - kernel->setArg(6, M); - kernel->setArg(7, N); - kernel->setArg(8, M); - kernel->setArg(9, *dlocks.cu()); - kernel->setArg(10, grid[0]); - kernel->setArg(11, grid[1]); - stream->enqueue(kernel, grid, {nthreads, 1, 1}); + jit.autotune("matmul", src, benchmark); } private: diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 4a9940400..a995a8a7c 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -213,8 +213,8 @@ void tune::run(ir::module &mod) { nts->set_value(1); } else { - ir::metaparameter *fpw = ir::metaparameter::create(ctx, ty, 1, 4); - ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 1, 4); + ir::metaparameter *fpw = ir::metaparameter::create(ctx, ty, 2, 2); + ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 1, 1); connected_components(node, {fpw, wpt}, {"fpw", "wpt"}, nodes_, dependencies_, group_id++); } } @@ -266,6 +266,7 @@ unsigned tune::get_req_num_threads(ir::instruction *i){ std::string suffix = ".d" + std::to_string(k); result *= params_.at(i).at("wpt" + suffix)->get_value(); } + return result; } } @@ -349,8 +350,9 @@ bool tune::check_constraints(std::map> &er // the product of mma fragments per warp must be 4 if(fragments_.at({i, 0}) == HMMA_FRAGMENT_C){ unsigned prod = 1; - for(size_t k = 0; k < shapes.size(); k++) + for(size_t k = 0; k < shapes.size(); k++){ prod *= params_[i]["fpw.d" + std::to_string(k)]->get_value(); + } if(prod != 4) errors[i].push_back("HMMA must have only 4 fragments per warp"); } diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index e03d51c63..5f5b161ea 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -120,7 +120,11 @@ jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t ben mp->set_value(params[i++]); passes.target_independent(tt_module); passes.tune.init(tt_module); - if(!passes.tune.check_constraints(errors)) + passes.tune.check_constraints(errors); + for(auto x: errors) + for(auto err: x.second) + 
std::cout << err << std::endl;
 if(!errors.empty())
 return;
 // Deep copy of the module and tuner
 auto ptt_module = make_triton_module(name, src);
 ir::module &tt_module = *ptt_module;
+ for(unsigned p: params)
+ std::cout << p << " " << std::flush;
 passes_wrapper passes(target_.get());
 passes.target_independent(tt_module);
 passes.tune.run(tt_module);
From 781b6d377d3ade6d5c15c9ec994203c4c0d9c9c0 Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Thu, 6 Jun 2019 20:34:56 -0700
Subject: [PATCH 167/494] selection now segfaults (expected)

---
 examples/python/tensorflow/dot.cpp | 5 ++++-
 lib/runtime/jit.cpp | 8 +++++---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp
index 01d18c435..25f92f865 100644
--- a/examples/python/tensorflow/dot.cpp
+++ b/examples/python/tensorflow/dot.cpp
@@ -155,7 +155,10 @@ class BlockSparseGemmOp : public OpKernel {
 return 2.*M*N*K / ts * 1e-3;
 };
 // just-in-time compile source-code
- jit.autotune("matmul", src, benchmark);
+ jit.add_module("matmul", src, {4, 2, 16, 4, 2, 16, 2, 2, 1, 1, 8, 8, 8, 1});
+ triton::driver::kernel* kernel = jit.get_function("matmul");
+ triton::jit::launch_information info = jit.get_launch_info("matmul");
+ benchmark(kernel, info);
 }

private:
diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp
index 5f5b161ea..f5bdc2e8f 100644
--- a/lib/runtime/jit.cpp
+++ b/lib/runtime/jit.cpp
@@ -121,14 +121,16 @@ jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t ben
 passes.target_independent(tt_module);
 passes.tune.init(tt_module);
 passes.tune.check_constraints(errors);
- for(auto x: errors)
- for(auto err: x.second)
- std::cout << err << std::endl;
+// for(auto x: errors)
+// for(auto err: x.second)
+// std::cout << err << std::endl;
 if(!errors.empty())
 return;
 // Deep copy of the module and tuner
 auto ptt_module = make_triton_module(name, src);
 ir::module &tt_module = *ptt_module;
+ for(unsigned p: params)
+ std::cout << p << " " << std::flush;
 passes_wrapper passes(target_.get());
 passes.target_independent(tt_module);
 passes.tune.run(tt_module);
From 6fce9f28ae325842b21729d4cdc9d93d409d088e Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Fri, 7 Jun 2019 10:32:56 -0700
Subject: [PATCH 168/494] added fragmented axis

---
 examples/python/tensorflow/run.py | 6 +--
 include/triton/codegen/tune.h | 2 +
 lib/codegen/selection.cpp | 80 +++++++++++++++++++++----------
 3 files changed, 59 insertions(+), 29 deletions(-)

diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py
index 1fd609a8f..3b74aa240 100644
--- a/examples/python/tensorflow/run.py
+++ b/examples/python/tensorflow/run.py
@@ -6,14 +6,14 @@ data_files_path = tf.resource_loader.get_data_files_path()
 library_dir = '/home/philippe/development/triton/build/examples/python/tensorflow'
 module = tf.load_op_library(os.path.join(library_dir, 'libtf_blocksparse.so'))

-M, N, K = 512, 512, 512
+M, N, K = 16, 16, 16
 a = tf.placeholder(tf.float16, shape=[M, K])
 b = tf.placeholder(tf.float16, shape=[N, K])
 locks = tf.placeholder(tf.int32, shape=[4096])
 c = module.dot(a, b, locks)
 # Reference
-ha = np.random.rand(M, K).astype(np.float16)
-hb = np.random.rand(N, K).astype(np.float16)
+ha = np.ones((M, K)).astype(np.float16)
+hb = np.ones((N, K)).astype(np.float16)
 hresult = np.dot(hb.T, ha)

 # Run
diff --git a/include/triton/codegen/tune.h b/include/triton/codegen/tune.h
index 6c08f2ea0..098106149 100644
--- a/include/triton/codegen/tune.h
+++ b/include/triton/codegen/tune.h
@@ -21,6 +21,7 @@ class tune {
 typedef std::pair node_t;
 typedef std::map > graph_t;

+public:
 enum fragment_t{
 STRIDED_SCAN,
 HMMA_FRAGMENT_C
@@ -41,6 +42,7 @@ public:
 std::vector get_params(ir::module&
mod); std::map get_params(ir::instruction* i); ir::metaparameter* get_param(ir::value *value, const std::string &key) { return params_[value][key]; } + fragment_t get_fragment(ir::value *value, unsigned ax) { return fragments_.at({value, ax}); } unsigned get_param_group(ir::value *value, unsigned ax); void copy(ir::value *dst, ir::value *src) { params_[dst] = params_[src]; groups_[dst] = groups_[src]; } bool check_constraints(std::map> &errors); diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index ab26b9e5d..f1db61f1e 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -433,33 +433,61 @@ inline void to_warps(const std::vector &bs, std::vector &nw, void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { const auto& shapes = v->get_type()->get_tile_shapes(); size_t dim = shapes.size(); - std::vector contiguous(dim); - std::vector block_size(dim); - std::vector warp_size(dim); - std::vector n_warps(dim); - for(unsigned i = 0; i < shapes.size(); i++){ - std::string str_i = std::to_string(i); - contiguous[i] = params_->get_param(v, "nts.d" + str_i)->get_value(); - block_size[i] = params_->get_param(v, "mts.d" + str_i)->get_value(); - } - to_warps(block_size, n_warps, warp_size); - std::vector thread_id_in_warp = delinearize(u_thread_id, warp_size, builder); - std::vector warp_id = delinearize(u_warp_id, n_warps, builder); - // Create axes - for(unsigned k = 0; k < dim; k++) { - std::string str_k = std::to_string(k); - Value *warp_size_k = builder.getInt32(warp_size[k]); - Value *contiguous_k = builder.getInt32(contiguous[k]); - Value *thread_id = builder.CreateAdd(thread_id_in_warp[k], builder.CreateMul(warp_id[k], warp_size_k)); - thread_id = builder.CreateMul(thread_id, contiguous_k); - unsigned per_block = contiguous[k] * warp_size[k] * n_warps[k]; - unsigned per_thread = contiguous[k] * shapes[k]->get_value() / per_block; - std::vector idx_list(per_thread); - for(unsigned n = 0 ; n < per_thread; n++){ - unsigned offset = n / contiguous[k] * per_block + n % contiguous[k]; - idx_list[n] = builder.CreateAdd(thread_id, builder.getInt32(offset), "idx_" + str_k + "_" + std::to_string(n)); + if(params_->get_fragment(v, 0) == tune::STRIDED_SCAN){ + std::vector contiguous(dim); + std::vector block_size(dim); + std::vector warp_size(dim); + std::vector n_warps(dim); + for(unsigned i = 0; i < shapes.size(); i++){ + std::string str_i = std::to_string(i); + contiguous[i] = params_->get_param(v, "nts.d" + str_i)->get_value(); + block_size[i] = params_->get_param(v, "mts.d" + str_i)->get_value(); } - axes_[params_->get_param_group(v, k)] = distributed_axis{contiguous[k], idx_list}; + to_warps(block_size, n_warps, warp_size); + std::vector thread_id_in_warp = delinearize(u_thread_id, warp_size, builder); + std::vector warp_id = delinearize(u_warp_id, n_warps, builder); + // Create axes + for(unsigned k = 0; k < dim; k++) { + std::string str_k = std::to_string(k); + Value *warp_size_k = builder.getInt32(warp_size[k]); + Value *contiguous_k = builder.getInt32(contiguous[k]); + Value *thread_id = builder.CreateAdd(thread_id_in_warp[k], builder.CreateMul(warp_id[k], warp_size_k)); + thread_id = builder.CreateMul(thread_id, contiguous_k); + unsigned per_block = contiguous[k] * warp_size[k] * n_warps[k]; + unsigned per_thread = contiguous[k] * shapes[k]->get_value() / per_block; + std::vector idx_list(per_thread); + for(unsigned n = 0 ; n < per_thread; n++){ + unsigned offset = n / contiguous[k] * per_block + n % 
contiguous[k]; + idx_list[n] = builder.CreateAdd(thread_id, builder.getInt32(offset), "idx_" + str_k + "_" + std::to_string(n)); + } + axes_[params_->get_param_group(v, k)] = distributed_axis{contiguous[k], idx_list}; + } + } + else { + Value *_1 = builder.getInt32(1); + Value *_2 = builder.getInt32(2); + Value *_4 = builder.getInt32(4); + Value *_8 = builder.getInt32(8); + Value *_16 = builder.getInt32(16); + // offset_i = tid & 2 + tid & 8 + Value *offset_i = builder.CreateAdd(builder.CreateAnd(u_thread_id, _2), + builder.CreateAnd(u_thread_id, _8)); + // offset_j = (tid & 1) + (tid & 4)*2 + (tid & 16)/4 + Value *offset_j = builder.CreateAdd(builder.CreateAnd(u_thread_id, _1), + builder.CreateAdd(builder.CreateMul(builder.CreateAnd(u_thread_id, _4), _2), + builder.CreateUDiv(builder.CreateAnd(u_thread_id, _16), _4))); + // idx_i + std::vector idx_i; + for(unsigned i = 0; i < 2; i++) + idx_i.push_back(builder.CreateAdd(offset_i, builder.getInt32(i*4))); + + // idx_j + std::vector idx_j; + for(unsigned j = 0; j < 2; j++) + idx_j.push_back(builder.CreateAdd(offset_j, builder.getInt32(j*2))); + + axes_[params_->get_param_group(v, 0)] = distributed_axis{1, idx_i}; + axes_[params_->get_param_group(v, 1)] = distributed_axis{1, idx_j}; } } From ec4c6aaaaadea02a1b85bb3ea366d4e6022913f4 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 7 Jun 2019 19:39:33 -0700 Subject: [PATCH 169/494] Added inline PTX for mma.sync --- examples/python/tensorflow/dot.cpp | 16 +--- lib/codegen/selection.cpp | 113 +++++++++++++++++++++-------- lib/driver/module.cpp | 11 ++- 3 files changed, 92 insertions(+), 48 deletions(-) diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index 25f92f865..ca611d296 100644 --- a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -45,12 +45,7 @@ void matmul(restrict read_only fp16 *A, restrict read_only fp16 *B, fp16* pb[TN, TK] = B + (offk + rkb[newaxis, :])*ldb + ryb[:, newaxis]; fp16 a[TM, TK] = *pa; fp16 b[TN, TK] = *pb; - int32 last_a = ((M*K - 1) - (TM*TK + 1)) / lda; - int32 last_b = ((K*N - 1) - (TN*TK + 1)) / ldb; - last_a = last_a / TK * TK; - last_b = last_b / TK * TK; - int32 bound = K - max(last_a, last_b); - for(int32 k = K; k > bound; k = k - TK){ + for(int32 k = K; k > 0; k = k - TK){ c = dot(a, trans(b), c); pa = pa + TK*lda; pb = pb + TK*ldb; @@ -59,15 +54,6 @@ void matmul(restrict read_only fp16 *A, restrict read_only fp16 *B, } int32 rxc[TM] = get_global_range[TM](0); int32 ryc[TN] = get_global_range[TN](1); - for(int32 k = bound; k > 0; k = k - 1){ - int1 checka[TM, 1] = rxc[:, newaxis] < M; - int1 checkb[TN, 1] = ryc[:, newaxis] < N; - fp16* pa[TM, 1] = A + (offk + K - k)*lda + rxc[:, newaxis]; - fp16* pb[TN, 1] = B + (offk + K - k)*ldb + ryc[:, newaxis]; - fp16 a[TM, 1] = checka ? *pa : 0; - fp16 b[TN, 1] = checkb ? 
*pb : 0; - c = dot(a, trans(b), c); - } int32 ridx = get_range_id(0); int32 ridy = get_range_id(1); fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index f1db61f1e..8d5ba3d4a 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -14,6 +14,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Attributes.h" +#include "llvm/IR/InlineAsm.h" namespace triton{ namespace codegen{ @@ -470,24 +471,27 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id Value *_8 = builder.getInt32(8); Value *_16 = builder.getInt32(16); // offset_i = tid & 2 + tid & 8 - Value *offset_i = builder.CreateAdd(builder.CreateAnd(u_thread_id, _2), + Value *offset_j = builder.CreateAdd(builder.CreateAnd(u_thread_id, _2), builder.CreateAnd(u_thread_id, _8)); // offset_j = (tid & 1) + (tid & 4)*2 + (tid & 16)/4 - Value *offset_j = builder.CreateAdd(builder.CreateAnd(u_thread_id, _1), + Value *offset_i = builder.CreateAdd(builder.CreateAnd(u_thread_id, _1), builder.CreateAdd(builder.CreateMul(builder.CreateAnd(u_thread_id, _4), _2), builder.CreateUDiv(builder.CreateAnd(u_thread_id, _16), _4))); // idx_i - std::vector idx_i; - for(unsigned i = 0; i < 2; i++) - idx_i.push_back(builder.CreateAdd(offset_i, builder.getInt32(i*4))); + std::vector idx_j; + for(unsigned i = 0; i < 2; i++){ + idx_j.push_back(builder.CreateAdd(offset_j, builder.getInt32(i*4))); + idx_j.push_back(builder.CreateAdd(offset_j, builder.getInt32(i*4 + 1))); + } // idx_j - std::vector idx_j; - for(unsigned j = 0; j < 2; j++) - idx_j.push_back(builder.CreateAdd(offset_j, builder.getInt32(j*2))); + std::vector idx_i; + for(unsigned j = 0; j < 2; j++){ + idx_i.push_back(builder.CreateAdd(offset_i, builder.getInt32(j*2))); + } - axes_[params_->get_param_group(v, 0)] = distributed_axis{1, idx_i}; - axes_[params_->get_param_group(v, 1)] = distributed_axis{1, idx_j}; + axes_[params_->get_param_group(v, 0)] = distributed_axis{1, idx_j}; + axes_[params_->get_param_group(v, 1)] = distributed_axis{1, idx_i}; } } @@ -822,29 +826,80 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & bool BT = dot->is_b_trans(); distributed_tile *TC = (distributed_tile*)tmap_.at(C); Function *f_mul_add = Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {llvm_type(C->get_type()->get_scalar_ty(), ctx)}); - if(dot->get_operand(0)->get_type()->get_tile_shapes()[1]->get_value() != 1) + unsigned NK = A->get_type()->get_tile_shapes()[1]->get_value(); + std::cout << NK << std::endl; + if(NK != 1) { shared_tile *TA = (shared_tile*)tmap_.at(A); shared_tile *TB = (shared_tile*)tmap_.at(B); - TA->set_vector_size(TC->axis(0).contiguous); - TB->set_vector_size(TC->axis(1).contiguous); - result->for_each([&](indices_t idx){ - Value *res = TC->get_value(idx); - unsigned NK = A->get_type()->get_tile_shapes()[1]->get_value(); - for(unsigned K = 0; K < NK; ++K){ - indices_t a_idx = {idx[0], builder.getInt32(K)}; - indices_t b_idx = {builder.getInt32(K), idx[1]}; - if(AT) - std::swap(a_idx[0], a_idx[1]); - if(BT) - std::swap(b_idx[0], b_idx[1]); - Value *a = TA->get_value(a_idx); - Value *b = TB->get_value(b_idx); - res = builder.CreateCall(f_mul_add, {a, b, res}); + if(params_->get_fragment(ins, 0) == tune::STRIDED_SCAN) + { + TA->set_vector_size(TC->axis(0).contiguous); + TB->set_vector_size(TC->axis(1).contiguous); + result->for_each([&](indices_t idx){ + Value *res = TC->get_value(idx); + 
for(unsigned K = 0; K < NK; ++K){ + indices_t a_idx = {idx[0], builder.getInt32(K)}; + indices_t b_idx = {builder.getInt32(K), idx[1]}; + if(AT) + std::swap(a_idx[0], a_idx[1]); + if(BT) + std::swap(b_idx[0], b_idx[1]); + Value *a = TA->get_value(a_idx); + Value *b = TB->get_value(b_idx); + res = builder.CreateCall(f_mul_add, {a, b, res}); - } - result->set_value(idx, res); - }); + } + result->set_value(idx, res); + }); + } + else + { + Value *_1 = builder.getInt32(1); + Value *_2 = builder.getInt32(2); + Value *_3 = builder.getInt32(3); + Value *_4 = builder.getInt32(4); + Value *_8 = builder.getInt32(8); + Value *_16 = builder.getInt32(16); + BasicBlock *current = builder.GetInsertBlock(); + Module *module = current->getModule(); + Value *tid = tgt_->get_local_id(module, builder, 0); + // offset_a_i = (tid & 3) + // offset_a_j = (tid & 4)*2 + (tid & 16)/4; + Value *offset_a_i = builder.CreateAnd(tid, _3); + Value *offset_a_k = builder.CreateAdd(builder.CreateMul(builder.CreateAnd(tid, _4), + _2), + builder.CreateUDiv(builder.CreateAnd(tid, _16), + _4)); + // offset_b_i = (tid & 3) + // offset_b_j = (tid & 8)*1 + (tid & 16)/4 + Value *offset_b_i = builder.CreateAnd(tid, _3); + Value *offset_b_k = builder.CreateAdd(builder.CreateAnd(tid, _8), + builder.CreateUDiv(builder.CreateAnd(tid, _16), + _4)); + Value *ha0 = TA->get_value({offset_a_i, offset_a_k}); + Value *ha1 = TA->get_value({builder.CreateAdd(offset_a_i, _1), offset_a_k}); + Value *hb0 = TB->get_value({offset_b_i, offset_b_k}); + Value *hb1 = TB->get_value({builder.CreateAdd(offset_b_i, _1), offset_b_k}); + std::vector fc; + result->for_each([&](indices_t idx){ + fc.push_back(result->get_value(idx)); + }); + + Type *void_ty = builder.getVoidTy(); + Type *fp32_ty = builder.getFloatTy(); + Type *fp16x2_ty = VectorType::get(builder.getHalfTy(), 2); +// Type *fp32_vec8_ty = VectorType::get(fp32_ty, 8); +// Type *fp16x2_vec2 = VectorType::get(fp16x2_ty, 2); + FunctionType *mma_ty = FunctionType::get(void_ty, {fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp16x2_ty, fp16x2_ty, fp16x2_ty, fp16x2_ty}, false); + + InlineAsm *mma_fn = InlineAsm::get(mma_ty, " mma.sync.aligned.m8n8k4.col.row.f32.f16.f16.f32 \n\ + {$0, $1, $2, $3, $4, $5, $6, $7}, \n\ + {$8, $9}, \n\ + {$10, $11}, \n\ + {$0, $1, $2, $3, $4, $5, $6, $7};", "+f, +f, +f, +f, +f, +f, +f, +f, r, r, r, r", false); + builder.CreateCall(mma_fn, {fc[0], fc[1], fc[2], fc[3], fc[4], fc[5], fc[6], fc[7], ha0, ha1, hb0, hb1}); + } } else { diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 19c9baccb..ebc876559 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -119,7 +119,7 @@ void module::compile_llvm_module(llvm::Module* module, const std::string& triple opt.UnsafeFPMath = false; opt.NoInfsFPMath = false; opt.NoNaNsFPMath = true; - llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, features, opt, + llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, "-ptx60", opt, llvm::Reloc::PIC_, llvm::None, llvm::CodeGenOpt::Aggressive); // set data layout if(layout.empty()) @@ -243,14 +243,17 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { layout += "-i64:64-i128:128-v16:16-v32:32-n16:32:64"; // create llvm::SmallVector buffer; - module::compile_llvm_module(module, "nvptx64-nvidia-cuda", "sm_52", layout, buffer, "", Assembly); - return std::string(buffer.begin(), buffer.end()); + module::compile_llvm_module(module, "nvptx64-nvidia-cuda", "sm_75", 
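
About the constraint string in the mma.sync InlineAsm above: "+f" marks a float register the asm both reads and writes, and "r" a 32-bit register input (each fp16x2 pair travels as one register). A minimal sketch of the same LLVM API against a trivial PTX instruction (hypothetical emit_ptx_add helper, LLVM-8-era signatures assumed):

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/InlineAsm.h"

    llvm::Value *emit_ptx_add(llvm::IRBuilder<> &builder,
                              llvm::Value *a, llvm::Value *b) {
      llvm::Type *f32 = builder.getFloatTy();
      llvm::FunctionType *fty = llvm::FunctionType::get(f32, {f32, f32}, false);
      // "=f": $0 is a written float register; "f,f": $1 and $2 are read
      llvm::InlineAsm *fn = llvm::InlineAsm::get(
          fty, "add.f32 $0, $1, $2;", "=f,f,f", /*hasSideEffects=*/false);
      return builder.CreateCall(fn, {a, b});
    }
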
layout, buffer, "", Assembly); + std::string result(buffer.begin(), buffer.end()); + std::string to_replace = ".version 6.3"; + result.replace(result.find(to_replace), to_replace.size(), ".version 6.4"); + return result; } cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ -// std::cout << source << std::endl; + std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; From 5f3d48c1d0b09adc00a822fc4482fbf3e81cfb4b Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 7 Jun 2019 21:19:47 -0700 Subject: [PATCH 170/494] [tensor cores] added basic codegen template for using wmma --- lib/codegen/selection.cpp | 24 +++++++++++++++--------- lib/driver/module.cpp | 8 ++++---- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 8d5ba3d4a..e6a04a3b3 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -887,18 +887,24 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & }); Type *void_ty = builder.getVoidTy(); + Type *int32_ty = builder.getInt32Ty(); Type *fp32_ty = builder.getFloatTy(); Type *fp16x2_ty = VectorType::get(builder.getHalfTy(), 2); -// Type *fp32_vec8_ty = VectorType::get(fp32_ty, 8); -// Type *fp16x2_vec2 = VectorType::get(fp16x2_ty, 2); - FunctionType *mma_ty = FunctionType::get(void_ty, {fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp16x2_ty, fp16x2_ty, fp16x2_ty, fp16x2_ty}, false); + Type *fp32_pack8_ty = StructType::get(ctx, {fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}); + FunctionType *mma_ty = FunctionType::get(fp32_pack8_ty, {int32_ty, int32_ty, int32_ty, int32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}, false); - InlineAsm *mma_fn = InlineAsm::get(mma_ty, " mma.sync.aligned.m8n8k4.col.row.f32.f16.f16.f32 \n\ - {$0, $1, $2, $3, $4, $5, $6, $7}, \n\ - {$8, $9}, \n\ - {$10, $11}, \n\ - {$0, $1, $2, $3, $4, $5, $6, $7};", "+f, +f, +f, +f, +f, +f, +f, +f, r, r, r, r", false); - builder.CreateCall(mma_fn, {fc[0], fc[1], fc[2], fc[3], fc[4], fc[5], fc[6], fc[7], ha0, ha1, hb0, hb1}); + InlineAsm *mma_fn = InlineAsm::get(mma_ty, " mma.sync.aligned.m8n8k4.col.row.f32.f16.f16.f32 " + "{$0, $1, $2, $3, $4, $5, $6, $7}, " + "{$8, $9}, " + "{$10, $11}, " + "{$0, $1, $2, $3, $4, $5, $6, $7};", "=f,=f,=f,=f,=f,=f,=f,=f,r,r,r,r,0,1,2,3,4,5,6,7", false); + Value *nc = builder.CreateCall(mma_fn, {builder.getInt32(0), builder.getInt32(0), builder.getInt32(0), builder.getInt32(0), fc[0], fc[1], fc[2], fc[3], fc[4], fc[5], fc[6], fc[7]}); + std::cout << mma_fn->getFunctionType()->getFunctionNumParams() << std::endl; + unsigned i = 0; + result->for_each([&](indices_t idx){ + result->set_value(idx, builder.CreateExtractValue(nc, {i++})); + }); + std::cout << "haha" << std::endl; } } else diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index ebc876559..a9d8ab549 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -106,10 +106,10 @@ void module::compile_llvm_module(llvm::Module* module, const std::string& triple file_type_t ft) { init_llvm(); // debug -// llvm::legacy::PassManager pm; -// 
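
The follow-up version above drops "+f" in favor of a struct return: every "=f" output becomes one field of fp32_pack8_ty, and the trailing digit constraints "0,...,7" tie extra inputs to those outputs so the asm still sees the old accumulator values. A reduced sketch of the pattern with two outputs (hypothetical emit_tied_pair helper, same API-era assumption):

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/InlineAsm.h"

    void emit_tied_pair(llvm::IRBuilder<> &builder, llvm::Value *x,
                        llvm::Value *acc0, llvm::Value *acc1,
                        llvm::Value *&out0, llvm::Value *&out1) {
      llvm::Type *f32 = builder.getFloatTy();
      llvm::StructType *pair =
          llvm::StructType::get(builder.getContext(), {f32, f32});
      llvm::FunctionType *fty =
          llvm::FunctionType::get(pair, {f32, f32, f32}, false);
      // "=f,=f": two outputs, returned as one struct; "f": plain input $2;
      // "0,1": inputs tied to $0/$1, so the old accumulators stay readable
      llvm::InlineAsm *fn = llvm::InlineAsm::get(
          fty, "add.f32 $0, $2, $0;\n\tadd.f32 $1, $2, $1;", "=f,=f,f,0,1",
          /*hasSideEffects=*/false);
      llvm::Value *ret = builder.CreateCall(fn, {x, acc0, acc1});
      out0 = builder.CreateExtractValue(ret, {0});
      out1 = builder.CreateExtractValue(ret, {1});
    }
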
pm.add(llvm::createPrintModulePass(llvm::outs())); -// pm.add(llvm::createVerifierPass()); -// pm.run(*module); + llvm::legacy::PassManager pm; + pm.add(llvm::createPrintModulePass(llvm::outs())); + pm.add(llvm::createVerifierPass()); + pm.run(*module); // create machine module->setTargetTriple(triple); std::string error; From d074a166e207bf1c875301645515491b1a89d72e Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 8 Jun 2019 12:14:37 -0700 Subject: [PATCH 171/494] [feature] basic tensor core utilization works --- examples/python/tensorflow/dot.cpp | 19 +------ examples/python/tensorflow/run.py | 11 ++-- include/triton/codegen/selection.h | 2 + lib/codegen/selection.cpp | 83 ++++++++++++++++++++---------- 4 files changed, 66 insertions(+), 49 deletions(-) diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index ca611d296..fef3bb73e 100644 --- a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -54,26 +54,11 @@ void matmul(restrict read_only fp16 *A, restrict read_only fp16 *B, } int32 rxc[TM] = get_global_range[TM](0); int32 ryc[TN] = get_global_range[TN](1); - int32 ridx = get_range_id(0); - int32 ridy = get_range_id(1); - fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - int32 *plock = locks + ridx + ridy*grid0; - while(__atomic_cas(plock, 0, 1)); - int32 *pcount = plock + grid0*grid1; - int32 count = *pcount; - int32 countp1 = select(count == GZ - 1, 0, count + 1); + fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; int1 checkc0[TM] = rxc < M; int1 checkc1[TN] = ryc < N; int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - if(count == 0) { - @checkc *pc = c; - *pcount = countp1; - } - else { - @checkc *pc = c + *pc; - *pcount = countp1; - } - __atomic_cas(plock, 1, 0); + @checkc *pc = c; } )"; diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 3b74aa240..2a34bab45 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -6,14 +6,14 @@ data_files_path = tf.resource_loader.get_data_files_path() library_dir = '/home/philippe/development/triton/build/examples/python/tensorflow' module = tf.load_op_library(os.path.join(library_dir, 'libtf_blocksparse.so')) -M, N, K = 16, 16, 16 +M, N, K = 256, 256, 256 a = tf.placeholder(tf.float16, shape=[M, K]) b = tf.placeholder(tf.float16, shape=[N, K]) locks = tf.placeholder(tf.int32, shape=[4096]) c = module.dot(a, b, locks) # Reference -ha = np.ones((M, K)).astype(np.float16) -hb = np.ones((N, K)).astype(np.float16) +ha = np.random.rand(M, K).astype(np.float16) +hb = np.random.rand(N, K).astype(np.float16) hresult = np.dot(hb.T, ha) # Run @@ -22,4 +22,7 @@ sess.run(tf.global_variables_initializer()) result = sess.run([c], feed_dict = {locks: np.zeros(4096), a: ha, b: hb}) -print(result - hresult) +print(result) +print(hresult) +#print(result - hresult) +print(np.max(np.abs(result - hresult))) diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index a8b46e716..c659ab6d3 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -57,6 +57,7 @@ private: public: shared_tile(llvm::Type* ty, const shapes_t &shapes, llvm::Value* ptr, llvm::IRBuilder<> &builder, llvm::Value* offset = nullptr); void set_vector_size(unsigned vector_size); + void set_return_mode(bool return_vector); void set_value(indices_t, llvm::Value *); llvm::Value* get_value(indices_t idx); llvm::Value* get_pointer() { return ptr_; } @@ -64,6 
+65,7 @@ public: private: llvm::Value *ptr_; + bool return_vector_; llvm::Value *offset_; llvm::IRBuilder<> &builder_; std::map ptr_cache_; diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index e6a04a3b3..9b6501b3e 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -129,6 +129,7 @@ Value* shared_tile::shared_offset(indices_t idx) { shared_tile::shared_tile(Type *ty, const shapes_t &shapes, Value *ptr, llvm::IRBuilder<> &builder, Value *offset): tile(ty, shapes), ptr_(ptr), builder_(builder), offset_(offset), vector_size_(1){ + return_vector_ = false; } void shared_tile::set_value(indices_t idx, Value *value) { @@ -142,12 +143,18 @@ void shared_tile::set_vector_size(unsigned vector_size) { vector_size_ = vector_size; } +void shared_tile::set_return_mode(bool return_vector){ + return_vector_ = return_vector; +} + + Value* shared_tile::get_value(indices_t idx) { indices_t non_cst_idx, cst_idx; extract_constant(idx, non_cst_idx, cst_idx); Value *&base_ptr = ptr_cache_[non_cst_idx]; if(base_ptr == nullptr){ base_ptr = builder_.CreateGEP(ptr_, shared_offset(non_cst_idx)); +// base_ptr = builder_.CreateBitCast(base_ptr, load_ptr_->getType()); if(vector_size_ > 1){ Type *vec_ty = VectorType::get(base_ptr->getType()->getPointerElementType(), vector_size_); Type *vec_ptr_ty = PointerType::get(vec_ty, base_ptr->getType()->getPointerAddressSpace()); @@ -160,7 +167,7 @@ Value* shared_tile::get_value(indices_t idx) { div = builder_.CreateUDiv(offset, builder_.getInt32(vector_size_)); Value *ptr = builder_.CreateGEP(base_ptr, div); Value *result = builder_.CreateLoad(ptr); - if(vector_size_ > 1) { + if(return_vector_ == false && vector_size_ > 1) { Value *rem = builder_.CreateURem(offset, builder_.getInt32(vector_size_)); result = builder_.CreateExtractElement(result, rem); } @@ -479,19 +486,19 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id builder.CreateUDiv(builder.CreateAnd(u_thread_id, _16), _4))); // idx_i std::vector idx_j; - for(unsigned i = 0; i < 2; i++){ - idx_j.push_back(builder.CreateAdd(offset_j, builder.getInt32(i*4))); - idx_j.push_back(builder.CreateAdd(offset_j, builder.getInt32(i*4 + 1))); + for(unsigned j = 0; j < 2; j++){ + idx_j.push_back(builder.CreateAdd(offset_j, builder.getInt32(j*4))); + idx_j.push_back(builder.CreateAdd(offset_j, builder.getInt32(j*4 + 1))); } // idx_j std::vector idx_i; - for(unsigned j = 0; j < 2; j++){ - idx_i.push_back(builder.CreateAdd(offset_i, builder.getInt32(j*2))); + for(unsigned i = 0; i < 2; i++){ + idx_i.push_back(builder.CreateAdd(offset_i, builder.getInt32(i*2))); } - axes_[params_->get_param_group(v, 0)] = distributed_axis{1, idx_j}; - axes_[params_->get_param_group(v, 1)] = distributed_axis{1, idx_i}; + axes_[params_->get_param_group(v, 0)] = distributed_axis{1, idx_i}; + axes_[params_->get_param_group(v, 1)] = distributed_axis{1, idx_j}; } } @@ -855,6 +862,11 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & } else { + TA->set_vector_size(2); + TB->set_vector_size(2); + TA->set_return_mode(true); + TB->set_return_mode(true); + Value *_0 = builder.getInt32(0); Value *_1 = builder.getInt32(1); Value *_2 = builder.getInt32(2); Value *_3 = builder.getInt32(3); @@ -864,47 +876,62 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & BasicBlock *current = builder.GetInsertBlock(); Module *module = current->getModule(); Value *tid = tgt_->get_local_id(module, builder, 0); - // offset_a_i = (tid & 3) - // 
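
In plain terms, set_return_mode(true) above makes shared_tile::get_value hand back the whole loaded vector instead of extracting a single lane. A plain-C++ model of that read path (hypothetical shared_read helper; the real code emits one vector load through the bitcast pointer):

    #include <cstdio>

    void shared_read(const float *tile, unsigned offset, unsigned vec,
                     bool return_vector, float *out) {
      unsigned div = offset / vec, rem = offset % vec;
      const float *wide = tile + div * vec;   // the vector load
      if (return_vector)
        for (unsigned i = 0; i < vec; i++)    // caller wants the full vector
          out[i] = wide[i];
      else
        out[0] = wide[rem];                   // extract one lane
    }

    int main() {
      float tile[8] = {0, 1, 2, 3, 4, 5, 6, 7}, out[4];
      shared_read(tile, 6, 4, false, out);
      std::printf("%g\n", out[0]);  // 6
    }
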
offset_a_j = (tid & 4)*2 + (tid & 16)/4; - Value *offset_a_i = builder.CreateAnd(tid, _3); - Value *offset_a_k = builder.CreateAdd(builder.CreateMul(builder.CreateAnd(tid, _4), - _2), + // offset_a_i = (tid & 4)*2 + (tid & 16)/4; + // offset_a_k = (tid & 3) + Value *offset_a_i = builder.CreateAdd(builder.CreateMul(builder.CreateAnd(tid, _4), _2), builder.CreateUDiv(builder.CreateAnd(tid, _16), _4)); - // offset_b_i = (tid & 3) - // offset_b_j = (tid & 8)*1 + (tid & 16)/4 - Value *offset_b_i = builder.CreateAnd(tid, _3); - Value *offset_b_k = builder.CreateAdd(builder.CreateAnd(tid, _8), + Value *offset_a_k = builder.CreateAnd(tid, _3); + + // offset_b_i = (tid & 4)*1 + (tid & 16)/4 + // offset_b_k = (tid & 3) + Value *offset_b_i = builder.CreateAdd(builder.CreateAnd(tid, _8), builder.CreateUDiv(builder.CreateAnd(tid, _16), _4)); - Value *ha0 = TA->get_value({offset_a_i, offset_a_k}); - Value *ha1 = TA->get_value({builder.CreateAdd(offset_a_i, _1), offset_a_k}); - Value *hb0 = TB->get_value({offset_b_i, offset_b_k}); - Value *hb1 = TB->get_value({builder.CreateAdd(offset_b_i, _1), offset_b_k}); + Value *offset_b_k = builder.CreateAnd(tid, _3); + + std::vector fc; result->for_each([&](indices_t idx){ fc.push_back(result->get_value(idx)); }); - Type *void_ty = builder.getVoidTy(); - Type *int32_ty = builder.getInt32Ty(); Type *fp32_ty = builder.getFloatTy(); Type *fp16x2_ty = VectorType::get(builder.getHalfTy(), 2); Type *fp32_pack8_ty = StructType::get(ctx, {fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}); - FunctionType *mma_ty = FunctionType::get(fp32_pack8_ty, {int32_ty, int32_ty, int32_ty, int32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}, false); + FunctionType *mma_ty = FunctionType::get(fp32_pack8_ty, {fp16x2_ty, fp16x2_ty, fp16x2_ty, fp16x2_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}, false); InlineAsm *mma_fn = InlineAsm::get(mma_ty, " mma.sync.aligned.m8n8k4.col.row.f32.f16.f16.f32 " "{$0, $1, $2, $3, $4, $5, $6, $7}, " "{$8, $9}, " "{$10, $11}, " "{$0, $1, $2, $3, $4, $5, $6, $7};", "=f,=f,=f,=f,=f,=f,=f,=f,r,r,r,r,0,1,2,3,4,5,6,7", false); - Value *nc = builder.CreateCall(mma_fn, {builder.getInt32(0), builder.getInt32(0), builder.getInt32(0), builder.getInt32(0), fc[0], fc[1], fc[2], fc[3], fc[4], fc[5], fc[6], fc[7]}); - std::cout << mma_fn->getFunctionType()->getFunctionNumParams() << std::endl; + + for(unsigned K = 0; K < NK; K += 4){ + Value *_K = builder.getInt32(K); + Value *ha0 = TA->get_value({offset_a_i, builder.CreateAdd(offset_a_k, _K)}); + Value *ha1 = TA->get_value({builder.CreateAdd(offset_a_i, _2), builder.CreateAdd(offset_a_k, _K)}); + Value *hb0 = TB->get_value({offset_b_i, builder.CreateAdd(offset_b_k, _K)}); + Value *hb1 = TB->get_value({builder.CreateAdd(offset_b_i, _2), builder.CreateAdd(offset_b_k, _K)}); + Value *nc = builder.CreateCall(mma_fn, {ha0, ha1, hb0, hb1, fc[0], fc[2], fc[1], fc[3], fc[4], fc[6], fc[5], fc[7]}); + fc[0] = builder.CreateExtractValue(nc, {0}); + fc[2] = builder.CreateExtractValue(nc, {1}); + fc[1] = builder.CreateExtractValue(nc, {2}); + fc[3] = builder.CreateExtractValue(nc, {3}); + fc[4] = builder.CreateExtractValue(nc, {4}); + fc[6] = builder.CreateExtractValue(nc, {5}); + fc[5] = builder.CreateExtractValue(nc, {6}); + fc[7] = builder.CreateExtractValue(nc, {7}); + } + + // write back unsigned i = 0; result->for_each([&](indices_t idx){ - result->set_value(idx, builder.CreateExtractValue(nc, {i++})); + result->set_value(idx, fc[i++]); }); - 
std::cout << "haha" << std::endl; + + TA->set_return_mode(false); + TB->set_return_mode(false); } } else From 06b59925097f586ce3001096adeeda5f06d00021 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 9 Jun 2019 14:41:36 -0700 Subject: [PATCH 172/494] [feature] added basic tensor core support --- examples/cpp/dot.cpp | 2 +- examples/python/tensorflow/dot.cpp | 20 ++-- examples/python/tensorflow/run.py | 27 +++-- include/triton/codegen/selection.h | 2 + lib/codegen/selection.cpp | 180 +++++++++++++++++++++-------- lib/codegen/tune.cpp | 10 +- lib/dnn/gemm.cpp | 4 +- lib/driver/module.cpp | 8 +- lib/runtime/jit.cpp | 4 +- 9 files changed, 180 insertions(+), 77 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 8b7559f55..fdb04a935 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -16,7 +16,7 @@ int main() { triton::jit jit(context); // matrix multiplication parameters - int32_t M = 2048, N = 2048, K = 2048; + int32_t M = 1024, N = 1024, K = 1024; std::vector hc(M*N); std::vector rc(M*N); std::vector ha(M*K); diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index fef3bb73e..52328d386 100644 --- a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -21,8 +21,8 @@ using GPUDevice = Eigen::GpuDevice; const char* src = R"( -const tunable int32 TM = {16}; -const tunable int32 TN = {16}; +const tunable int32 TM = {8, 16, 32, 64, 128}; +const tunable int32 TN = {8, 16, 32, 64, 128}; const tunable int32 TK = {8}; const tunable int32 GZ = {1}; @@ -54,11 +54,8 @@ void matmul(restrict read_only fp16 *A, restrict read_only fp16 *B, } int32 rxc[TM] = get_global_range[TM](0); int32 ryc[TN] = get_global_range[TN](1); - fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - int1 checkc0[TM] = rxc < M; - int1 checkc1[TN] = ryc < N; - int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - @checkc *pc = c; + fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; + *pc = c; } )"; @@ -122,14 +119,17 @@ class BlockSparseGemmOp : public OpKernel { stream->enqueue(kernel, grid, {nthreads, 1, 1}); stream->synchronize(); double ts = triton::tools::bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});}, - [&](){ stream->synchronize(); }, nullptr); + [&](){ stream->synchronize(); }, ctx->device()); return 2.*M*N*K / ts * 1e-3; }; // just-in-time compile source-code - jit.add_module("matmul", src, {4, 2, 16, 4, 2, 16, 2, 2, 1, 1, 8, 8, 8, 1}); +// jit.autotune("matmul", src, benchmark); +// jit.add_module("matmul", src, {4, 2, 8, 4, 2, 32, 1, 4, 1, 1, 8, 8, 8, 1}); +// jit.add_module("matmul", src, {32, 2, 128, 32, 2, 128, 2, 2, 2, 2, 4, 8, 4, 1}); + jit.add_module("matmul", src, {16, 4, 128, 32, 4, 128, 2, 2, 2, 2, 8, 8, 4, 1}); triton::driver::kernel* kernel = jit.get_function("matmul"); triton::jit::launch_information info = jit.get_launch_info("matmul"); - benchmark(kernel, info); + std::cout << benchmark(kernel, info) << std::endl;; } private: diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 2a34bab45..3e8f78f31 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -1,28 +1,39 @@ import os import tensorflow as tf import numpy as np - +from time import time data_files_path = tf.resource_loader.get_data_files_path() library_dir = '/home/philippe/development/triton/build/examples/python/tensorflow' module = tf.load_op_library(os.path.join(library_dir, 'libtf_blocksparse.so')) -M, N, K = 256, 256, 256 
+M, N, K = 256,256,256 a = tf.placeholder(tf.float16, shape=[M, K]) b = tf.placeholder(tf.float16, shape=[N, K]) locks = tf.placeholder(tf.int32, shape=[4096]) +# c = tf.matmul(a, b, transpose_a=True) c = module.dot(a, b, locks) + # Reference ha = np.random.rand(M, K).astype(np.float16) hb = np.random.rand(N, K).astype(np.float16) -hresult = np.dot(hb.T, ha) # Run sess = tf.InteractiveSession() sess.run(tf.global_variables_initializer()) result = sess.run([c], feed_dict = {locks: np.zeros(4096), a: ha, - b: hb}) -print(result) -print(hresult) -#print(result - hresult) -print(np.max(np.abs(result - hresult))) + b: hb})[0] + +#bench = tf.test.Benchmark().run_op_benchmark(sess=sess, +# op_or_tensor=c, +# feed_dict={a: ha, b: hb}, +# min_iters=100) +#print(end - start) +#print(2*M*N*K / (end - start) * 1e-12) +hresult = np.dot(ha.T, hb).T +dif = np.abs(result - hresult) +print("dif: %f" % np.max(dif)) + +#np.savetxt("dif.txt", dif, fmt="%5.2f") +#np.savetxt("gpu.txt", result, fmt="%5.2f") +#np.savetxt("cpu.txt", hresult, fmt="%5.2f") diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index c659ab6d3..9f19ec884 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -41,6 +41,7 @@ public: tile(llvm::Type *ty, const shapes_t &shapes): ty_(ty), shapes_(shapes){ } virtual void set_value(indices_t idx, llvm::Value *v) = 0; virtual llvm::Value* get_value(indices_t idx) = 0; + llvm::Type *get_ty() const { return ty_; } protected: llvm::Type *ty_; @@ -59,6 +60,7 @@ public: void set_vector_size(unsigned vector_size); void set_return_mode(bool return_vector); void set_value(indices_t, llvm::Value *); + llvm::Value* get_ptr_to(indices_t idx); llvm::Value* get_value(indices_t idx); llvm::Value* get_pointer() { return ptr_; } llvm::Value* get_offset() { return offset_; } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 9b6501b3e..fa000d6a7 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -152,11 +152,16 @@ Value* shared_tile::get_value(indices_t idx) { indices_t non_cst_idx, cst_idx; extract_constant(idx, non_cst_idx, cst_idx); Value *&base_ptr = ptr_cache_[non_cst_idx]; + unsigned vector_size = vector_size_; + Type *ty = ty_; + if(ty->isHalfTy() && (vector_size % 2 == 0)){ + ty = IntegerType::get(ty->getContext(), 32); + vector_size = vector_size / 2; + } if(base_ptr == nullptr){ base_ptr = builder_.CreateGEP(ptr_, shared_offset(non_cst_idx)); -// base_ptr = builder_.CreateBitCast(base_ptr, load_ptr_->getType()); if(vector_size_ > 1){ - Type *vec_ty = VectorType::get(base_ptr->getType()->getPointerElementType(), vector_size_); + Type *vec_ty = VectorType::get(ty, vector_size); Type *vec_ptr_ty = PointerType::get(vec_ty, base_ptr->getType()->getPointerAddressSpace()); base_ptr = builder_.CreateBitCast(base_ptr, vec_ptr_ty); } @@ -477,26 +482,64 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id Value *_4 = builder.getInt32(4); Value *_8 = builder.getInt32(8); Value *_16 = builder.getInt32(16); - // offset_i = tid & 2 + tid & 8 - Value *offset_j = builder.CreateAdd(builder.CreateAnd(u_thread_id, _2), - builder.CreateAnd(u_thread_id, _8)); - // offset_j = (tid & 1) + (tid & 4)*2 + (tid & 16)/4 - Value *offset_i = builder.CreateAdd(builder.CreateAnd(u_thread_id, _1), - builder.CreateAdd(builder.CreateMul(builder.CreateAnd(u_thread_id, _4), _2), - builder.CreateUDiv(builder.CreateAnd(u_thread_id, _16), _4))); + + // warp tile size + unsigned fpw_0 = 
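
The half-to-i32 rewrite in get_value above is a register-width trick: two fp16 lanes travel in one 32-bit register, which is the shape mma.sync wants its operands in, so a <4 x half> load becomes a <2 x i32> load. A little-endian model of the packing (illustrative values):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      uint16_t halves[4] = {0x3C00, 0x4000, 0x4200, 0x4400}; // fp16 1, 2, 3, 4
      uint32_t regs[2];                                      // <2 x i32> view
      std::memcpy(regs, halves, sizeof(halves));
      std::printf("%08x %08x\n", regs[0], regs[1]);          // 40003c00 44004200
    }
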
params_->get_param(v, "fpw.d0")->get_value(); + unsigned fpw_1 = params_->get_param(v, "fpw.d1")->get_value(); + unsigned wts_0 = fpw_0 * 8; + unsigned wts_1 = fpw_1 * 8; + Value *warp_tile_size_0 = builder.getInt32(wts_0); + Value *warp_tile_size_1 = builder.getInt32(wts_1); + + /* intra warp offset */ + Value *qpa_id = builder.CreateUDiv(builder.CreateURem(u_thread_id, _16), _4); // quad pair id + Value *qpb_id = builder.CreateUDiv(builder.CreateURem(u_thread_id, _16), builder.CreateUDiv(_16, builder.getInt32(fpw_1))); // quad pair id + // B ofsets + Value *qpb_off = builder.CreateURem(builder.CreateMul(qpb_id, _8), warp_tile_size_1); // offset of quad pair in warp + // A offsets + Value *qa_off = builder.CreateUDiv(builder.CreateAnd(u_thread_id, _16), _4);// offset of quad in pair + Value *qpa_off = builder.CreateURem(builder.CreateMul(qpa_id, _8), warp_tile_size_0); // offset of LHS quad pair in warp + + /* inter warp offset */ + unsigned wpt_0 = params_->get_param(v, "wpt.d0")->get_value(); + unsigned wpt_1 = params_->get_param(v, "wpt.d1")->get_value(); + Value *warp_id_0 = builder.CreateURem(u_warp_id, builder.getInt32(wpt_0)); + Value *warp_id_1 = builder.CreateUDiv(u_warp_id, builder.getInt32(wpt_0)); + Value *warp_offset_i = builder.CreateMul(warp_id_0, warp_tile_size_0); + Value *warp_offset_j = builder.CreateMul(warp_id_1, warp_tile_size_1); + + // offset_i = (tid & 1) + (tid & 4)*2 + (tid & 16)/4 + Value *offset_i = builder.CreateAdd(warp_offset_i, + builder.CreateAdd(builder.CreateAnd(u_thread_id, _1), + builder.CreateAdd(qpa_off, qa_off))); + + // repetitions + unsigned stride_rep_i = wpt_0 * wts_0; + unsigned stride_rep_j = wpt_1 * wts_1; + // idx_i - std::vector idx_j; - for(unsigned j = 0; j < 2; j++){ - idx_j.push_back(builder.CreateAdd(offset_j, builder.getInt32(j*4))); - idx_j.push_back(builder.CreateAdd(offset_j, builder.getInt32(j*4 + 1))); + std::vector idx_i; + for(unsigned base_i = 0; base_i < shapes[0]->get_value(); base_i += stride_rep_i) + for(unsigned i = 0; i < 2; i++){ + idx_i.push_back(builder.CreateAdd(offset_i, builder.getInt32(base_i + i*2))); } + // offset_j = tid & 2 + tid & 8 + Value *offset_j = builder.CreateAdd(warp_offset_j, + builder.CreateAdd(builder.CreateAnd(u_thread_id, _2), + qpb_off)); + + // idx_j - std::vector idx_i; - for(unsigned i = 0; i < 2; i++){ - idx_i.push_back(builder.CreateAdd(offset_i, builder.getInt32(i*2))); + std::vector idx_j; + for(unsigned base_j = 0; base_j < shapes[1]->get_value(); base_j += stride_rep_j) + for(unsigned j = 0; j < 2; j++){ + idx_j.push_back(builder.CreateAdd(offset_j, builder.getInt32(base_j + j*4))); + idx_j.push_back(builder.CreateAdd(offset_j, builder.getInt32(base_j + j*4 + 1))); } + + axes_[params_->get_param_group(v, 0)] = distributed_axis{1, idx_i}; axes_[params_->get_param_group(v, 1)] = distributed_axis{1, idx_j}; } @@ -797,6 +840,10 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & unsigned id = linear / vector_size; if(linear % vector_size == 0) packets[id] = result->get_value(idx); + }); + in->for_each([&](indices_t idx){ + unsigned linear = in->get_linear_index(idx); + unsigned id = linear / vector_size; packets[id] = builder.CreateInsertElement(packets.at(id), in->get_value(idx), linear % vector_size); }); result->for_each([&](indices_t idx){ @@ -834,7 +881,6 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & distributed_tile *TC = (distributed_tile*)tmap_.at(C); Function *f_mul_add = Intrinsic::getDeclaration(module, 
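
Unpacking the constants above: an HMMA fragment spans 8 elements, so a warp tile is fpw * 8 wide, a block tile covers wpt warp tiles, and a dimension of size shape needs shape / (fpw * 8 * wpt) repetitions. In numbers (illustrative values):

    #include <cstdio>

    int main() {
      unsigned fpw = 2, wpt = 2, shape = 64;
      unsigned wts        = fpw * 8;            // warp tile size: 16
      unsigned stride_rep = wpt * wts;          // stride between repetitions: 32
      unsigned num_rep    = shape / stride_rep; // 2 repetitions
      for (unsigned base = 0; base < shape; base += stride_rep)
        std::printf("repetition starts at element %u\n", base); // 0, 32
      std::printf("num_rep = %u\n", num_rep);
    }
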
Intrinsic::fmuladd, {llvm_type(C->get_type()->get_scalar_ty(), ctx)}); unsigned NK = A->get_type()->get_tile_shapes()[1]->get_value(); - std::cout << NK << std::endl; if(NK != 1) { shared_tile *TA = (shared_tile*)tmap_.at(A); @@ -862,8 +908,8 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & } else { - TA->set_vector_size(2); - TB->set_vector_size(2); + TA->set_vector_size(4); + TB->set_vector_size(4); TA->set_return_mode(true); TB->set_return_mode(true); Value *_0 = builder.getInt32(0); @@ -873,22 +919,47 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & Value *_4 = builder.getInt32(4); Value *_8 = builder.getInt32(8); Value *_16 = builder.getInt32(16); + unsigned fpw_0 = params_->get_param(dot, "fpw.d0")->get_value(); + unsigned fpw_1 = params_->get_param(dot, "fpw.d1")->get_value(); + unsigned wts_0 = fpw_0 * 8; + unsigned wts_1 = fpw_1 * 8; + Value *warp_tile_size_0 = builder.getInt32(wts_0); + Value *warp_tile_size_1 = builder.getInt32(wts_1); + BasicBlock *current = builder.GetInsertBlock(); Module *module = current->getModule(); Value *tid = tgt_->get_local_id(module, builder, 0); + Value *u_thread_id = builder.CreateURem(tid, builder.getInt32(32)); + Value *u_warp_id = builder.CreateUDiv(tid, builder.getInt32(32)); + + /* intra-warp offset */ + Value *qpa_id = builder.CreateUDiv(builder.CreateURem(u_thread_id, _16), _4); // quad pair id + Value *qpb_id = builder.CreateUDiv(builder.CreateURem(u_thread_id, _16), builder.CreateUDiv(_16, builder.getInt32(fpw_1))); // quad pair id + Value *qpa_off = builder.CreateURem(builder.CreateMul(qpa_id, _8), warp_tile_size_0); // offset of LHS quad pair in warp + Value *qpb_off = builder.CreateURem(builder.CreateMul(qpb_id, _8), warp_tile_size_1); // offset of quad pair in warp + Value *q_off = builder.CreateUDiv(builder.CreateAnd(u_thread_id, _16), _4);// offset of quad in pair + + /* inter-warp offset */ + unsigned wpt_0 = params_->get_param(dot, "wpt.d0")->get_value(); + unsigned wpt_1 = params_->get_param(dot, "wpt.d1")->get_value(); + Value *warp_id_0 = builder.CreateURem(u_warp_id, builder.getInt32(wpt_0)); + Value *warp_id_1 = builder.CreateUDiv(u_warp_id, builder.getInt32(wpt_0)); + Value *warp_offset_i = builder.CreateMul(warp_id_0, warp_tile_size_0); + Value *warp_offset_j = builder.CreateMul(warp_id_1, warp_tile_size_1); + + /* repetitions */ + unsigned stride_rep_i = wpt_0 * wts_0; + unsigned stride_rep_j = wpt_1 * wts_1; + // offset_a_i = (tid & 4)*2 + (tid & 16)/4; // offset_a_k = (tid & 3) - Value *offset_a_i = builder.CreateAdd(builder.CreateMul(builder.CreateAnd(tid, _4), _2), - builder.CreateUDiv(builder.CreateAnd(tid, _16), - _4)); - Value *offset_a_k = builder.CreateAnd(tid, _3); + Value *offset_a_i = builder.CreateAdd(warp_offset_i, builder.CreateAdd(qpa_off, q_off)); + Value *offset_a_k = builder.CreateAnd(u_thread_id, _3); - // offset_b_i = (tid & 4)*1 + (tid & 16)/4 + // offset_b_i = (tid & 8)*1 + (tid & 16)/4 // offset_b_k = (tid & 3) - Value *offset_b_i = builder.CreateAdd(builder.CreateAnd(tid, _8), - builder.CreateUDiv(builder.CreateAnd(tid, _16), - _4)); - Value *offset_b_k = builder.CreateAnd(tid, _3); + Value *offset_b_i = builder.CreateAdd(warp_offset_j, builder.CreateAdd(qpb_off, q_off)); + Value *offset_b_k = builder.CreateAnd(u_thread_id, _3); std::vector fc; @@ -902,26 +973,45 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & FunctionType *mma_ty = FunctionType::get(fp32_pack8_ty, {fp16x2_ty, fp16x2_ty, 
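
The quad-pair arithmetic above is easier to check with concrete lanes. A scalar model of the qpa/qpb offsets under illustrative parameters (one reading of the m8n8k4 layout, not an authoritative statement of the hardware mapping):

    #include <cstdio>

    int main() {
      unsigned fpw_1 = 2, wts_0 = 16, wts_1 = 16; // illustrative fpw / warp tiles
      for (unsigned tid = 0; tid < 32; tid += 4) {
        unsigned qpa_id  = (tid % 16) / 4;
        unsigned qpb_id  = (tid % 16) / (16 / fpw_1);
        unsigned qpa_off = (qpa_id * 8) % wts_0;
        unsigned qpb_off = (qpb_id * 8) % wts_1;
        std::printf("tid %2u: qpa_off %2u, qpb_off %2u\n", tid, qpa_off, qpb_off);
      }
    }
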
fp16x2_ty, fp16x2_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}, false); InlineAsm *mma_fn = InlineAsm::get(mma_ty, " mma.sync.aligned.m8n8k4.col.row.f32.f16.f16.f32 " - "{$0, $1, $2, $3, $4, $5, $6, $7}, " - "{$8, $9}, " - "{$10, $11}, " - "{$0, $1, $2, $3, $4, $5, $6, $7};", "=f,=f,=f,=f,=f,=f,=f,=f,r,r,r,r,0,1,2,3,4,5,6,7", false); + "{$0, $1, $2, $3, $4, $5, $6, $7}, " + "{$8, $9}, " + "{$10, $11}, " + "{$0, $1, $2, $3, $4, $5, $6, $7};", "=f,=f,=f,=f,=f,=f,=f,=f,r,r,r,r,0,1,2,3,4,5,6,7", false); + unsigned num_rep_i = shapes[0]->get_value() / stride_rep_i; + unsigned num_rep_j = shapes[1]->get_value() / stride_rep_j; + unsigned ld_fc = num_rep_i * 2; + for(unsigned ii = 0; ii < num_rep_i; ii++) + for(unsigned jj = 0; jj < num_rep_j; jj++) for(unsigned K = 0; K < NK; K += 4){ Value *_K = builder.getInt32(K); - Value *ha0 = TA->get_value({offset_a_i, builder.CreateAdd(offset_a_k, _K)}); - Value *ha1 = TA->get_value({builder.CreateAdd(offset_a_i, _2), builder.CreateAdd(offset_a_k, _K)}); - Value *hb0 = TB->get_value({offset_b_i, builder.CreateAdd(offset_b_k, _K)}); - Value *hb1 = TB->get_value({builder.CreateAdd(offset_b_i, _2), builder.CreateAdd(offset_b_k, _K)}); - Value *nc = builder.CreateCall(mma_fn, {ha0, ha1, hb0, hb1, fc[0], fc[2], fc[1], fc[3], fc[4], fc[6], fc[5], fc[7]}); - fc[0] = builder.CreateExtractValue(nc, {0}); - fc[2] = builder.CreateExtractValue(nc, {1}); - fc[1] = builder.CreateExtractValue(nc, {2}); - fc[3] = builder.CreateExtractValue(nc, {3}); - fc[4] = builder.CreateExtractValue(nc, {4}); - fc[6] = builder.CreateExtractValue(nc, {5}); - fc[5] = builder.CreateExtractValue(nc, {6}); - fc[7] = builder.CreateExtractValue(nc, {7}); + Value *current_offset_a_i = builder.CreateAdd(offset_a_i, builder.getInt32(ii * stride_rep_i)); + Value *current_offset_b_i = builder.CreateAdd(offset_b_i, builder.getInt32(jj * stride_rep_j)); + Value *ha = TA->get_value({current_offset_a_i, builder.CreateAdd(offset_a_k, _K)}); + Value *hb = TB->get_value({current_offset_b_i, builder.CreateAdd(offset_b_k, _K)}); + Value *ha0 = builder.CreateExtractElement(ha, builder.getInt32(0)); + Value *ha1 = builder.CreateExtractElement(ha, builder.getInt32(1)); + Value *hb0 = builder.CreateExtractElement(hb, builder.getInt32(0)); + Value *hb1 = builder.CreateExtractElement(hb, builder.getInt32(1)); + std::vector idx = { + (ii*2 + 0) + (jj*4 + 0)*ld_fc, + (ii*2 + 0) + (jj*4 + 1)*ld_fc, + (ii*2 + 1) + (jj*4 + 0)*ld_fc, + (ii*2 + 1) + (jj*4 + 1)*ld_fc, + (ii*2 + 0) + (jj*4 + 2)*ld_fc, + (ii*2 + 0) + (jj*4 + 3)*ld_fc, + (ii*2 + 1) + (jj*4 + 2)*ld_fc, + (ii*2 + 1) + (jj*4 + 3)*ld_fc + }; + Value *nc = builder.CreateCall(mma_fn, {ha0, ha1, hb0, hb1, fc[idx[0]], fc[idx[1]], fc[idx[2]], fc[idx[3]], fc[idx[4]], fc[idx[5]], fc[idx[6]], fc[idx[7]]}); + fc[idx[0]] = builder.CreateExtractValue(nc, {0}); + fc[idx[1]] = builder.CreateExtractValue(nc, {1}); + fc[idx[2]] = builder.CreateExtractValue(nc, {2}); + fc[idx[3]] = builder.CreateExtractValue(nc, {3}); + fc[idx[4]] = builder.CreateExtractValue(nc, {4}); + fc[idx[5]] = builder.CreateExtractValue(nc, {5}); + fc[idx[6]] = builder.CreateExtractValue(nc, {6}); + fc[idx[7]] = builder.CreateExtractValue(nc, {7}); } // write back diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index a995a8a7c..dfd079817 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -214,7 +214,7 @@ void tune::run(ir::module &mod) { } else { ir::metaparameter *fpw = ir::metaparameter::create(ctx, ty, 2, 2); - ir::metaparameter *wpt = 
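
On the fc bookkeeping above: the eight accumulators of one mma.sync call land in two adjacent rows and two pairs of columns of the flattened accumulator array, whose leading dimension is ld_fc = num_rep_i * 2. A sketch reproducing the idx table for one (ii, jj) repetition (illustrative sizes):

    #include <cstdio>

    int main() {
      unsigned num_rep_i = 2, ld_fc = num_rep_i * 2;  // flattened leading dim: 4
      unsigned ii = 1, jj = 0;                        // one repetition
      unsigned rows[8] = {0, 0, 1, 1, 0, 0, 1, 1};    // same order as idx[] above
      unsigned cols[8] = {0, 1, 0, 1, 2, 3, 2, 3};
      for (unsigned n = 0; n < 8; n++)
        std::printf("fc[%u]\n", (ii * 2 + rows[n]) + (jj * 4 + cols[n]) * ld_fc);
    }
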
ir::metaparameter::create(ctx, ty, 1, 1); + ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 1, 4); connected_components(node, {fpw, wpt}, {"fpw", "wpt"}, nodes_, dependencies_, group_id++); } } @@ -228,13 +228,13 @@ void tune::run(ir::module &mod) { continue; if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 2, 2)); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 4, 4)); *params_.at(i).at("nts.d0") = *tmp; } if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 2, 2)); - std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 2, 2)); + std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 4, 4)); + std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 4, 4)); *params_.at(i).at("nts.d0") = *tmp1; *params_.at(i).at("nts.d1") = *tmp2; } @@ -341,7 +341,7 @@ bool tune::check_constraints(std::map> &er else { ir::metaparameter *fpw = params_[i]["fpw.d" + strk]; ir::metaparameter *wpt = params_[i]["wpt.d" + strk]; - multiple = fpw->get_value()*wpt->get_value(); + multiple = fpw->get_value()*wpt->get_value()*8; } if(shapes[k]->get_value() % multiple != 0) errors[i].push_back("for dim " + strk + ": shape (" + to_string(shapes[k]->get_value()) + ")" diff --git a/lib/dnn/gemm.cpp b/lib/dnn/gemm.cpp index 940b256b2..59f413d81 100644 --- a/lib/dnn/gemm.cpp +++ b/lib/dnn/gemm.cpp @@ -62,8 +62,8 @@ std::string gemm::src(bool AT, bool BT) { } std::string res = R"( -const tunable int32 TM = {16}; -const tunable int32 TN = {16}; +const tunable int32 TM = {16, 32, 64, 128}; +const tunable int32 TN = {16, 32, 64, 128}; const tunable int32 TK = {8}; const tunable int32 GZ = {1}; diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index a9d8ab549..ebc876559 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -106,10 +106,10 @@ void module::compile_llvm_module(llvm::Module* module, const std::string& triple file_type_t ft) { init_llvm(); // debug - llvm::legacy::PassManager pm; - pm.add(llvm::createPrintModulePass(llvm::outs())); - pm.add(llvm::createVerifierPass()); - pm.run(*module); +// llvm::legacy::PassManager pm; +// pm.add(llvm::createPrintModulePass(llvm::outs())); +// pm.add(llvm::createVerifierPass()); +// pm.run(*module); // create machine module->setTargetTriple(triple); std::string error; diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index f5bdc2e8f..d1d90278d 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -129,8 +129,8 @@ jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t ben // Deep copy of the module and tuner auto ptt_module = make_triton_module(name, src); ir::module &tt_module = *ptt_module; - for(unsigned p: params) - std::cout << p << " " << std::flush; +// for(unsigned p: params) +// std::cout << p << " " << std::flush; passes_wrapper passes(target_.get()); passes.target_independent(tt_module); passes.tune.run(tt_module); From 7d50b87681f76923375ac2e29311a9f25ffacd6c Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 11 Jun 2019 10:45:19 -0700 Subject: [PATCH 173/494] [selection/codegen] bugfix in distributed tile indices initialization --- lib/codegen/selection.cpp | 7 ++----- lib/driver/module.cpp | 8 +++++--- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index fa000d6a7..5463e7090 100644 --- 
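
The wider wpt range above comes with the tightened shape constraint in check_constraints: each HMMA tile dimension must now be a multiple of fpw * wpt * 8, not just fpw * wpt. In numbers (illustrative values):

    #include <cstdio>

    int main() {
      unsigned fpw = 2, wpt = 4;
      unsigned multiple = fpw * wpt * 8;  // 64
      for (unsigned shape : {32u, 64u, 96u, 128u})
        std::printf("shape %3u: %s\n", shape,
                    shape % multiple == 0 ? "ok" : "rejected");
    }
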
a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -29,7 +29,8 @@ void distributed_tile::init_indices() { indices_t current; for(size_t d = 0; d < id.size(); d++) current.push_back(axes_[d].values[id[d]]); - indices_[current] = indices_.size(); + size_t sz = indices_.size(); + indices_[current] = sz; values_[current] = UndefValue::get(ty_); ordered_indices_.push_back(current); id[0]++; @@ -840,10 +841,6 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & unsigned id = linear / vector_size; if(linear % vector_size == 0) packets[id] = result->get_value(idx); - }); - in->for_each([&](indices_t idx){ - unsigned linear = in->get_linear_index(idx); - unsigned id = linear / vector_size; packets[id] = builder.CreateInsertElement(packets.at(id), in->get_value(idx), linear % vector_size); }); result->for_each([&](indices_t idx){ diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index ebc876559..c3139ece6 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -243,10 +243,12 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { layout += "-i64:64-i128:128-v16:16-v32:32-n16:32:64"; // create llvm::SmallVector buffer; - module::compile_llvm_module(module, "nvptx64-nvidia-cuda", "sm_75", layout, buffer, "", Assembly); + module::compile_llvm_module(module, "nvptx64-nvidia-cuda", "sm_70", layout, buffer, "", Assembly); std::string result(buffer.begin(), buffer.end()); - std::string to_replace = ".version 6.3"; - result.replace(result.find(to_replace), to_replace.size(), ".version 6.4"); + size_t start_replace = result.find(".version"); + size_t end_replace = result.find('\n', start_replace); + assert(start_replace != std::string::npos); + result.replace(start_replace, end_replace - start_replace, ".version 6.4"); return result; } From cbd916994d05051c2e453f7c6c7a03f5d89c4cd8 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 11 Jun 2019 11:06:02 -0700 Subject: [PATCH 174/494] [example/tensorflow] no longer hardcoding library dir --- examples/python/tensorflow/CMakeLists.txt | 5 +++++ examples/python/tensorflow/run.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/examples/python/tensorflow/CMakeLists.txt b/examples/python/tensorflow/CMakeLists.txt index 6c8a6f008..652c16b90 100644 --- a/examples/python/tensorflow/CMakeLists.txt +++ b/examples/python/tensorflow/CMakeLists.txt @@ -7,4 +7,9 @@ if(${TensorFlow_FOUND}) add_definitions(-D_GLIBCXX_USE_CXX11_ABI=${TF_ABI}) add_library(tf_blocksparse SHARED dot.cpp) target_link_libraries(tf_blocksparse tensorflow_framework triton) + add_custom_command( + TARGET tf_blocksparse POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy + ${CMAKE_CURRENT_SOURCE_DIR}/run.py + ${CMAKE_CURRENT_BINARY_DIR}/run.py) endif() diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 3e8f78f31..0e0a51b43 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -3,7 +3,7 @@ import tensorflow as tf import numpy as np from time import time data_files_path = tf.resource_loader.get_data_files_path() -library_dir = '/home/philippe/development/triton/build/examples/python/tensorflow' +library_dir = os.path.dirname(os.path.realpath(__file__)) module = tf.load_op_library(os.path.join(library_dir, 'libtf_blocksparse.so')) M, N, K = 256,256,256 From 1b5a742a88ada38da4f7dfa2f494dd79882bb861 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 11 Jun 2019 13:27:54 -0700 Subject: [PATCH 175/494] [triton/codegen] added shared 
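
The init_indices change above fixes a genuine evaluation-order bug: in `indices_[current] = indices_.size()`, operator[] may default-insert the key before size() is evaluated (the order is unspecified before C++17), so every id can come out shifted by one. A minimal repro sketch with int keys standing in for indices_t:

    #include <cstdio>
    #include <map>

    int main() {
      std::map<int, std::size_t> indices;
      for (int key : {7, 11, 13}) {
        std::size_t sz = indices.size(); // snapshot first: ids 0, 1, 2
        indices[key] = sz;               // `indices[key] = indices.size();` may
                                         // insert before size() runs: 1, 2, 3
      }
      for (auto &kv : indices)
        std::printf("%d -> %zu\n", kv.first, kv.second);
    }
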
memory padding for HMMA arguments and vectorized loads --- CMakeLists.txt | 4 ++-- examples/python/tensorflow/CMakeLists.txt | 8 +++---- examples/python/tensorflow/dot.cpp | 26 +++++++++-------------- examples/python/tensorflow/run.py | 8 +++---- include/triton/codegen/shmem_allocation.h | 8 ++++--- include/triton/runtime/jit.h | 2 +- lib/codegen/selection.cpp | 22 +++++++++++++++++-- lib/codegen/shmem_allocation.cpp | 20 ++++++++++++----- lib/driver/module.cpp | 2 +- 9 files changed, 61 insertions(+), 39 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 21805aa68..2bece7b6f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,8 +6,8 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") # FLEX/YACC find_package(BISON) find_package(FLEX) -BISON_TARGET(Parser ${CMAKE_CURRENT_SOURCE_DIR}/include/triton/lang/parser.y ${CMAKE_CURRENT_BINARY_DIR}/parser.cpp) -FLEX_TARGET(Lexer ${CMAKE_CURRENT_SOURCE_DIR}/include/triton/lang/scanner.l ${CMAKE_CURRENT_BINARY_DIR}/scanner.cpp) +BISON_TARGET(Parser ${CMAKE_CURRENT_SOURCE_DIR}/include/triton/lang/parser.y ${CMAKE_CURRENT_SOURCE_DIR}/lib/lang/parser.cpp) +FLEX_TARGET(Lexer ${CMAKE_CURRENT_SOURCE_DIR}/include/triton/lang/scanner.l ${CMAKE_CURRENT_SOURCE_DIR}/lib/lang/scanner.cpp) get_filename_component(BISON_Parser_INCLUDE_DIRECTORIES ${BISON_Parser_OUTPUT_HEADER} DIRECTORY) include_directories(${BISON_Parser_INCLUDE_DIRECTORIES}) diff --git a/examples/python/tensorflow/CMakeLists.txt b/examples/python/tensorflow/CMakeLists.txt index 652c16b90..f9b650d1d 100644 --- a/examples/python/tensorflow/CMakeLists.txt +++ b/examples/python/tensorflow/CMakeLists.txt @@ -7,9 +7,7 @@ if(${TensorFlow_FOUND}) add_definitions(-D_GLIBCXX_USE_CXX11_ABI=${TF_ABI}) add_library(tf_blocksparse SHARED dot.cpp) target_link_libraries(tf_blocksparse tensorflow_framework triton) - add_custom_command( - TARGET tf_blocksparse POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy - ${CMAKE_CURRENT_SOURCE_DIR}/run.py - ${CMAKE_CURRENT_BINARY_DIR}/run.py) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/run.py + ${CMAKE_CURRENT_BINARY_DIR}/run.py + COPYONLY) endif() diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index 52328d386..cd712594a 100644 --- a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -21,9 +21,9 @@ using GPUDevice = Eigen::GpuDevice; const char* src = R"( -const tunable int32 TM = {8, 16, 32, 64, 128}; -const tunable int32 TN = {8, 16, 32, 64, 128}; -const tunable int32 TK = {8}; +const tunable int32 TM = {64, 128}; +const tunable int32 TN = {64, 128}; +const tunable int32 TK = {32}; const tunable int32 GZ = {1}; void matmul(restrict read_only fp16 *A, restrict read_only fp16 *B, @@ -37,20 +37,14 @@ void matmul(restrict read_only fp16 *A, restrict read_only fp16 *B, int32 rka[TK] = 0 ... TK; int32 rkb[TK] = 0 ... 
TK; fp32 c[TM, TN] = 0; - int32 div = K / GZ; - int32 rem = K % GZ; - K = select(rz < rem, div - 1, div); - int32 offk = select(rz < rem, rz*(div + 1), rz*div + rem); - fp16* pa[TM, TK] = A + (offk + rka[newaxis, :])*lda + rxa[:, newaxis]; - fp16* pb[TN, TK] = B + (offk + rkb[newaxis, :])*ldb + ryb[:, newaxis]; - fp16 a[TM, TK] = *pa; - fp16 b[TN, TK] = *pb; - for(int32 k = K; k > 0; k = k - TK){ + fp16* pa[TM, TK] = A + rka[newaxis, :]*lda + rxa[:, newaxis]; + fp16* pb[TN, TK] = B + rkb[newaxis, :]*ldb + ryb[:, newaxis]; + for(int32 k = K; k > TK; k = k - TK){ + fp16 a[TM, TK] = *pa; + fp16 b[TN, TK] = *pb; c = dot(a, trans(b), c); pa = pa + TK*lda; pb = pb + TK*ldb; - a = *pa; - b = *pb; } int32 rxc[TM] = get_global_range[TM](0); int32 ryc[TN] = get_global_range[TN](1); @@ -123,10 +117,10 @@ class BlockSparseGemmOp : public OpKernel { return 2.*M*N*K / ts * 1e-3; }; // just-in-time compile source-code -// jit.autotune("matmul", src, benchmark); + jit.autotune("matmul", src, benchmark); // jit.add_module("matmul", src, {4, 2, 8, 4, 2, 32, 1, 4, 1, 1, 8, 8, 8, 1}); // jit.add_module("matmul", src, {32, 2, 128, 32, 2, 128, 2, 2, 2, 2, 4, 8, 4, 1}); - jit.add_module("matmul", src, {16, 4, 128, 32, 4, 128, 2, 2, 2, 2, 8, 8, 4, 1}); + jit.add_module("matmul", src, {16, 4, 128, 16, 4, 128, 2, 2, 2, 2, 8, 32, 8, 1}); triton::driver::kernel* kernel = jit.get_function("matmul"); triton::jit::launch_information info = jit.get_launch_info("matmul"); std::cout << benchmark(kernel, info) << std::endl;; diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 0e0a51b43..589f46303 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -6,7 +6,7 @@ data_files_path = tf.resource_loader.get_data_files_path() library_dir = os.path.dirname(os.path.realpath(__file__)) module = tf.load_op_library(os.path.join(library_dir, 'libtf_blocksparse.so')) -M, N, K = 256,256,256 +M, N, K = 8192,8192,8192 a = tf.placeholder(tf.float16, shape=[M, K]) b = tf.placeholder(tf.float16, shape=[N, K]) locks = tf.placeholder(tf.int32, shape=[4096]) @@ -30,9 +30,9 @@ result = sess.run([c], feed_dict = {locks: np.zeros(4096), # min_iters=100) #print(end - start) #print(2*M*N*K / (end - start) * 1e-12) -hresult = np.dot(ha.T, hb).T -dif = np.abs(result - hresult) -print("dif: %f" % np.max(dif)) +#hresult = np.dot(ha.T, hb).T +#dif = np.abs(result - hresult) +#print("dif: %f" % np.max(dif)) #np.savetxt("dif.txt", dif, fmt="%5.2f") #np.savetxt("gpu.txt", result, fmt="%5.2f") diff --git a/include/triton/codegen/shmem_allocation.h b/include/triton/codegen/shmem_allocation.h index 8a6f175a8..0f36ec154 100644 --- a/include/triton/codegen/shmem_allocation.h +++ b/include/triton/codegen/shmem_allocation.h @@ -18,15 +18,16 @@ class layout; class target_tuner; class shmem_liveness; class shmem_info; +class tune; class shmem_allocation { public: - shmem_allocation(shmem_liveness *live, shmem_info *buffer_info) - : liveness_(live), buffer_info_(buffer_info){ } + shmem_allocation(shmem_liveness *live, shmem_info *buffer_info, tune *params) + : liveness_(live), buffer_info_(buffer_info), params_(params){ } // utilities unsigned get_num_bytes(ir::value *x); - bool is_ld_padded(ir::value* x); + unsigned is_ld_padded(ir::value* x); // accessors unsigned get_offset(ir::value *x) const { return offsets_.at(x); } @@ -42,6 +43,7 @@ private: // dependences shmem_liveness *liveness_; shmem_info *buffer_info_; + tune *params_; }; } diff --git a/include/triton/runtime/jit.h 
b/include/triton/runtime/jit.h index 424a00e6d..a9bea664b 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -53,7 +53,7 @@ public: struct passes_wrapper { passes_wrapper(codegen::target* target) : shmem_liveness(&shmem_info), - shmem_allocation(&shmem_liveness, &shmem_info), + shmem_allocation(&shmem_liveness, &shmem_info, &tune), shmem_barriers(&shmem_allocation, &shmem_info), vectorize(&tune), selection(&shmem_allocation, &tune, &shmem_info, target), diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 5463e7090..200e6878e 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -612,8 +612,9 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, std::vector shapes; for(ir::constant_int* shape: cshapes) shapes.push_back(shape->get_value()); - if(alloc_->is_ld_padded(v)) - shapes[0] += 4; + unsigned pad = alloc_->is_ld_padded(v); + if(pad > 0) + shapes[0] += pad; Type* ty = llvm_type(v->get_type()->get_scalar_ty(), ctx); // create shared tile if(buffer_info_->is_shared(v)){ @@ -1040,6 +1041,23 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & }); } } + else if(auto *ld = dynamic_cast(ins)){ + unsigned vector_size = result->axis(0).contiguous; + std::map packets; + distributed_tile *TP = (distributed_tile*)tmap_.at(ld->get_pointer_operand()); + result->for_each([&](indices_t idx){ + set_mask_insert_pt(idx); + unsigned linear = result->get_linear_index(idx); + unsigned id = linear / vector_size; + if(linear % vector_size == 0){ + Value *ptr = TP->get_value(idx); + ptr= builder.CreateBitCast(ptr, PointerType::get(VectorType::get(result->get_ty(), vector_size), + ptr->getType()->getPointerAddressSpace())); + packets[id] = builder.CreateLoad(ptr); + } + result->set_value(idx, builder.CreateExtractElement(packets.at(id), linear % vector_size)); + }); + } // element-wise else { result->for_each([&](indices_t idx){ diff --git a/lib/codegen/shmem_allocation.cpp b/lib/codegen/shmem_allocation.cpp index 90cf7ef2b..60df4a9f3 100644 --- a/lib/codegen/shmem_allocation.cpp +++ b/lib/codegen/shmem_allocation.cpp @@ -1,6 +1,7 @@ #include "triton/codegen/shmem_allocation.h" #include "triton/codegen/shmem_liveness.h" #include "triton/codegen/shmem_info.h" +#include "triton/codegen/tune.h" #include "triton/ir/basic_block.h" #include "triton/ir/type.h" #include "triton/ir/value.h" @@ -10,7 +11,7 @@ namespace triton{ namespace codegen{ -bool shmem_allocation::is_ld_padded(ir::value *x) { +unsigned shmem_allocation::is_ld_padded(ir::value *x) { if(auto* phi = dynamic_cast(x)) { bool result = false; for(unsigned i = 0; i < phi->get_num_incoming(); i++) @@ -18,15 +19,24 @@ bool shmem_allocation::is_ld_padded(ir::value *x) { return result; } if(dynamic_cast(x)) - return true; - return false; + return 4; + for(ir::user* user: x->get_users()) + if(dynamic_cast(user)) + if(params_->get_fragment(user, 0) == tune::HMMA_FRAGMENT_C){ + if(x == user->get_operand(0)) + return 16; + else + return 16; + } + return 0; } unsigned shmem_allocation::get_num_bytes(ir::value *x) { unsigned result = x->get_type()->get_primitive_size_in_bits() / 8; - if(is_ld_padded(x)){ + unsigned pad = is_ld_padded(x); + if(pad > 0){ unsigned ld = x->get_type()->get_tile_shapes()[0]->get_value(); - result += 4 * result / ld; + result += pad * result / ld; } if(buffer_info_->is_double(x)) result *= 2; diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index c3139ece6..4ff863666 100755 --- a/lib/driver/module.cpp +++ 
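
Working the padding numbers above through: is_ld_padded now reports a pad in elements (16 for HMMA dot operands, 4 for transposed tiles), and get_num_bytes grows the buffer by that many extra slices along dim 0. With an illustrative 64x32 fp16 operand:

    #include <cstdio>

    int main() {
      unsigned ld = 64, cols = 32, elt = 2, pad = 16; // fp16 tile, dim0 = 64
      unsigned bytes = ld * cols * elt;               // 4096
      bytes += pad * bytes / ld;                      // + 16 * 4096 / 64 = 1024
      std::printf("%u bytes of shared memory\n", bytes); // 5120
    }
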
b/lib/driver/module.cpp @@ -255,7 +255,7 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ - std::cout << source << std::endl; +// std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; From a6b580ec055e9832ebd8c2c69a30d8c750516d8e Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 12 Jun 2019 19:46:43 -0700 Subject: [PATCH 176/494] interleaving fails with B --- examples/python/tensorflow/dot.cpp | 4 +- examples/python/tensorflow/run.py | 14 +-- include/triton/codegen/selection.h | 4 + lib/codegen/selection.cpp | 186 +++++++++++++---------------- lib/codegen/shmem_allocation.cpp | 2 +- lib/driver/module.cpp | 2 +- 6 files changed, 96 insertions(+), 116 deletions(-) diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index cd712594a..fae790eb1 100644 --- a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -39,7 +39,7 @@ void matmul(restrict read_only fp16 *A, restrict read_only fp16 *B, fp32 c[TM, TN] = 0; fp16* pa[TM, TK] = A + rka[newaxis, :]*lda + rxa[:, newaxis]; fp16* pb[TN, TK] = B + rkb[newaxis, :]*ldb + ryb[:, newaxis]; - for(int32 k = K; k > TK; k = k - TK){ + for(int32 k = K; k > 0; k = k - TK){ fp16 a[TM, TK] = *pa; fp16 b[TN, TK] = *pb; c = dot(a, trans(b), c); @@ -117,7 +117,7 @@ class BlockSparseGemmOp : public OpKernel { return 2.*M*N*K / ts * 1e-3; }; // just-in-time compile source-code - jit.autotune("matmul", src, benchmark); +// jit.autotune("matmul", src, benchmark); // jit.add_module("matmul", src, {4, 2, 8, 4, 2, 32, 1, 4, 1, 1, 8, 8, 8, 1}); // jit.add_module("matmul", src, {32, 2, 128, 32, 2, 128, 2, 2, 2, 2, 4, 8, 4, 1}); jit.add_module("matmul", src, {16, 4, 128, 16, 4, 128, 2, 2, 2, 2, 8, 32, 8, 1}); diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 589f46303..ebb01ddad 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -6,7 +6,7 @@ data_files_path = tf.resource_loader.get_data_files_path() library_dir = os.path.dirname(os.path.realpath(__file__)) module = tf.load_op_library(os.path.join(library_dir, 'libtf_blocksparse.so')) -M, N, K = 8192,8192,8192 +M, N, K = 128, 128, 128 a = tf.placeholder(tf.float16, shape=[M, K]) b = tf.placeholder(tf.float16, shape=[N, K]) locks = tf.placeholder(tf.int32, shape=[4096]) @@ -30,10 +30,10 @@ result = sess.run([c], feed_dict = {locks: np.zeros(4096), # min_iters=100) #print(end - start) #print(2*M*N*K / (end - start) * 1e-12) -#hresult = np.dot(ha.T, hb).T -#dif = np.abs(result - hresult) -#print("dif: %f" % np.max(dif)) +hresult = np.dot(ha.T, hb).T +dif = np.abs(result - hresult) +print("dif: %f" % np.max(dif)) -#np.savetxt("dif.txt", dif, fmt="%5.2f") -#np.savetxt("gpu.txt", result, fmt="%5.2f") -#np.savetxt("cpu.txt", hresult, fmt="%5.2f") +np.savetxt("dif.txt", dif, fmt="%5.2f") +np.savetxt("gpu.txt", result, fmt="%5.2f") +np.savetxt("cpu.txt", hresult, fmt="%5.2f") diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index 9f19ec884..9a8149a01 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ 
-159,6 +159,10 @@ private: shmem_info *buffer_info_; std::map axes_; llvm::Value *sh_mem_ptr_; + llvm::Value *offset_a_i_, *offset_a_k_; + llvm::Value *offset_b_j_, *offset_b_k_; + unsigned num_packs_0_, num_packs_1_; + unsigned pack_size_0_, pack_size_1_; }; } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 200e6878e..5f3a28d71 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -480,67 +480,82 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id else { Value *_1 = builder.getInt32(1); Value *_2 = builder.getInt32(2); + Value *_3 = builder.getInt32(3); Value *_4 = builder.getInt32(4); Value *_8 = builder.getInt32(8); Value *_16 = builder.getInt32(16); - // warp tile size unsigned fpw_0 = params_->get_param(v, "fpw.d0")->get_value(); unsigned fpw_1 = params_->get_param(v, "fpw.d1")->get_value(); - unsigned wts_0 = fpw_0 * 8; - unsigned wts_1 = fpw_1 * 8; - Value *warp_tile_size_0 = builder.getInt32(wts_0); - Value *warp_tile_size_1 = builder.getInt32(wts_1); - - /* intra warp offset */ - Value *qpa_id = builder.CreateUDiv(builder.CreateURem(u_thread_id, _16), _4); // quad pair id - Value *qpb_id = builder.CreateUDiv(builder.CreateURem(u_thread_id, _16), builder.CreateUDiv(_16, builder.getInt32(fpw_1))); // quad pair id - // B offsets - Value *qpb_off = builder.CreateURem(builder.CreateMul(qpb_id, _8), warp_tile_size_1); // offset of quad pair in warp - // A offsets - Value *qa_off = builder.CreateUDiv(builder.CreateAnd(u_thread_id, _16), _4);// offset of quad in pair - Value *qpa_off = builder.CreateURem(builder.CreateMul(qpa_id, _8), warp_tile_size_0); // offset of LHS quad pair in warp - - /* inter warp offset */ + // warps per tile unsigned wpt_0 = params_->get_param(v, "wpt.d0")->get_value(); unsigned wpt_1 = params_->get_param(v, "wpt.d1")->get_value(); + // hmma warp tile size + unsigned hmma_wts_0 = fpw_0 * 8; + unsigned hmma_wts_1 = fpw_1 * 8; + // hmma block tile size + unsigned hmma_bts_0 = hmma_wts_0 * wpt_0; + unsigned hmma_bts_1 = hmma_wts_1 * wpt_1; + // number of repetitions + unsigned num_rep_0 = shapes[0]->get_value() / hmma_bts_0; + unsigned num_rep_1 = shapes[1]->get_value() / hmma_bts_1; + // size of each pack (interleaving) + pack_size_0_ = 1; + pack_size_1_ = 1; + // number of packs (interleaving) + num_packs_0_ = num_rep_0 / pack_size_0_; + num_packs_1_ = num_rep_1 / pack_size_1_; + + + /* intra warp offset */ + // offset of quad in pair + Value *in_pair_off_a = builder.CreateMul(builder.CreateUDiv(builder.CreateAnd(u_thread_id, _16), builder.getInt32(4)), builder.getInt32(pack_size_0_)); + Value *in_pair_off_b = builder.CreateMul(builder.CreateUDiv(builder.CreateAnd(u_thread_id, _16), builder.getInt32(4)), builder.getInt32(pack_size_1_)); + // Quad pair id + Value *pair_a_id = builder.CreateUDiv(builder.CreateURem(u_thread_id, _16), _4); + Value *pair_b_id = builder.CreateUDiv(builder.CreateURem(u_thread_id, _16), builder.CreateUDiv(_16, builder.getInt32(fpw_1))); + // Quad pair offset + Value *pair_a_off = builder.CreateURem(builder.CreateMul(pair_a_id, builder.getInt32(8 * pack_size_0_)), builder.getInt32(hmma_wts_0 * pack_size_0_)); + Value *pair_b_off = builder.CreateURem(builder.CreateMul(pair_b_id, builder.getInt32(8 * pack_size_1_)), builder.getInt32(hmma_wts_1 * pack_size_1_)); + + /* inter warp offset */ Value *warp_id_0 = builder.CreateURem(u_warp_id, builder.getInt32(wpt_0)); Value *warp_id_1 = builder.CreateUDiv(u_warp_id, builder.getInt32(wpt_0)); - Value 
*warp_offset_i = builder.CreateMul(warp_id_0, warp_tile_size_0); - Value *warp_offset_j = builder.CreateMul(warp_id_1, warp_tile_size_1); + Value *warp_offset_i = builder.CreateMul(warp_id_0, builder.getInt32(hmma_wts_0 * pack_size_0_)); + Value *warp_offset_j = builder.CreateMul(warp_id_1, builder.getInt32(hmma_wts_1 * pack_size_1_)); - // offset_i = (tid & 1) + (tid & 4)*2 + (tid & 16)/4 - Value *offset_i = builder.CreateAdd(warp_offset_i, - builder.CreateAdd(builder.CreateAnd(u_thread_id, _1), - builder.CreateAdd(qpa_off, qa_off))); + /* offsets */ + // a offset + offset_a_i_ = builder.CreateAdd(warp_offset_i, builder.CreateAdd(pair_a_off, in_pair_off_a)); + offset_a_k_ = builder.CreateAnd(u_thread_id, _3); + // b offsets + offset_b_j_ = builder.CreateAdd(warp_offset_j, builder.CreateAdd(pair_b_off, in_pair_off_b)); + offset_b_k_ = builder.CreateAnd(u_thread_id, _3); + // c offsets + Value *offset_c_i = builder.CreateAdd(builder.CreateAnd(u_thread_id, _1), offset_a_i_); + Value *offset_c_j = builder.CreateAdd(warp_offset_j, + builder.CreateAdd(builder.CreateAnd(u_thread_id, _2), + pair_b_off)); - // repetitions - unsigned stride_rep_i = wpt_0 * wts_0; - unsigned stride_rep_j = wpt_1 * wts_1; - - // idx_i + /* indices */ + // i indices std::vector idx_i; - for(unsigned base_i = 0; base_i < shapes[0]->get_value(); base_i += stride_rep_i) + for(unsigned pack = 0; pack < num_packs_0_; pack++) + for(unsigned ii = 0; ii < pack_size_0_; ii++) for(unsigned i = 0; i < 2; i++){ - idx_i.push_back(builder.CreateAdd(offset_i, builder.getInt32(base_i + i*2))); + idx_i.push_back(builder.CreateAdd(offset_c_i, builder.getInt32(pack*hmma_bts_0*pack_size_0_ + ii*4 + i*2))); } - - // offset_j = tid & 2 + tid & 8 - Value *offset_j = builder.CreateAdd(warp_offset_j, - builder.CreateAdd(builder.CreateAnd(u_thread_id, _2), - qpb_off)); - - - // idx_j + // j indices std::vector idx_j; - for(unsigned base_j = 0; base_j < shapes[1]->get_value(); base_j += stride_rep_j) + for(unsigned pack = 0; pack < num_packs_1_; pack++) + for(unsigned jj = 0; jj < pack_size_1_; jj++) for(unsigned j = 0; j < 2; j++){ - idx_j.push_back(builder.CreateAdd(offset_j, builder.getInt32(base_j + j*4))); - idx_j.push_back(builder.CreateAdd(offset_j, builder.getInt32(base_j + j*4 + 1))); + idx_j.push_back(builder.CreateAdd(offset_c_j, builder.getInt32(pack*hmma_bts_1*pack_size_1_ + jj*8 + j*4))); + idx_j.push_back(builder.CreateAdd(offset_c_j, builder.getInt32(pack*hmma_bts_1*pack_size_1_ + jj*8 + j*4 + 1))); } - - + /* axes */ axes_[params_->get_param_group(v, 0)] = distributed_axis{1, idx_i}; axes_[params_->get_param_group(v, 1)] = distributed_axis{1, idx_j}; } @@ -910,55 +925,6 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & TB->set_vector_size(4); TA->set_return_mode(true); TB->set_return_mode(true); - Value *_0 = builder.getInt32(0); - Value *_1 = builder.getInt32(1); - Value *_2 = builder.getInt32(2); - Value *_3 = builder.getInt32(3); - Value *_4 = builder.getInt32(4); - Value *_8 = builder.getInt32(8); - Value *_16 = builder.getInt32(16); - unsigned fpw_0 = params_->get_param(dot, "fpw.d0")->get_value(); - unsigned fpw_1 = params_->get_param(dot, "fpw.d1")->get_value(); - unsigned wts_0 = fpw_0 * 8; - unsigned wts_1 = fpw_1 * 8; - Value *warp_tile_size_0 = builder.getInt32(wts_0); - Value *warp_tile_size_1 = builder.getInt32(wts_1); - - BasicBlock *current = builder.GetInsertBlock(); - Module *module = current->getModule(); - Value *tid = tgt_->get_local_id(module, builder, 0); - Value 
*u_thread_id = builder.CreateURem(tid, builder.getInt32(32)); - Value *u_warp_id = builder.CreateUDiv(tid, builder.getInt32(32)); - - /* intra-warp offset */ - Value *qpa_id = builder.CreateUDiv(builder.CreateURem(u_thread_id, _16), _4); // quad pair id - Value *qpb_id = builder.CreateUDiv(builder.CreateURem(u_thread_id, _16), builder.CreateUDiv(_16, builder.getInt32(fpw_1))); // quad pair id - Value *qpa_off = builder.CreateURem(builder.CreateMul(qpa_id, _8), warp_tile_size_0); // offset of LHS quad pair in warp - Value *qpb_off = builder.CreateURem(builder.CreateMul(qpb_id, _8), warp_tile_size_1); // offset of quad pair in warp - Value *q_off = builder.CreateUDiv(builder.CreateAnd(u_thread_id, _16), _4);// offset of quad in pair - - /* inter-warp offset */ - unsigned wpt_0 = params_->get_param(dot, "wpt.d0")->get_value(); - unsigned wpt_1 = params_->get_param(dot, "wpt.d1")->get_value(); - Value *warp_id_0 = builder.CreateURem(u_warp_id, builder.getInt32(wpt_0)); - Value *warp_id_1 = builder.CreateUDiv(u_warp_id, builder.getInt32(wpt_0)); - Value *warp_offset_i = builder.CreateMul(warp_id_0, warp_tile_size_0); - Value *warp_offset_j = builder.CreateMul(warp_id_1, warp_tile_size_1); - - /* repetitions */ - unsigned stride_rep_i = wpt_0 * wts_0; - unsigned stride_rep_j = wpt_1 * wts_1; - - // offset_a_i = (tid & 4)*2 + (tid & 16)/4; - // offset_a_k = (tid & 3) - Value *offset_a_i = builder.CreateAdd(warp_offset_i, builder.CreateAdd(qpa_off, q_off)); - Value *offset_a_k = builder.CreateAnd(u_thread_id, _3); - - // offset_b_i = (tid & 8)*1 + (tid & 16)/4 - // offset_b_k = (tid & 3) - Value *offset_b_i = builder.CreateAdd(warp_offset_j, builder.CreateAdd(qpb_off, q_off)); - Value *offset_b_k = builder.CreateAnd(u_thread_id, _3); - std::vector fc; result->for_each([&](indices_t idx){ @@ -976,30 +942,40 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & "{$10, $11}, " "{$0, $1, $2, $3, $4, $5, $6, $7};", "=f,=f,=f,=f,=f,=f,=f,=f,r,r,r,r,0,1,2,3,4,5,6,7", false); + unsigned fpw_0 = params_->get_param(dot, "fpw.d0")->get_value(); + unsigned fpw_1 = params_->get_param(dot, "fpw.d1")->get_value(); + unsigned wts_0 = fpw_0 * 8; + unsigned wts_1 = fpw_1 * 8; + unsigned wpt_0 = params_->get_param(dot, "wpt.d0")->get_value(); + unsigned wpt_1 = params_->get_param(dot, "wpt.d1")->get_value(); + unsigned stride_rep_i = wpt_0 * wts_0; + unsigned stride_rep_j = wpt_1 * wts_1; unsigned num_rep_i = shapes[0]->get_value() / stride_rep_i; unsigned num_rep_j = shapes[1]->get_value() / stride_rep_j; unsigned ld_fc = num_rep_i * 2; - for(unsigned ii = 0; ii < num_rep_i; ii++) - for(unsigned jj = 0; jj < num_rep_j; jj++) + for(unsigned pack_i = 0; pack_i < num_packs_0_; pack_i++) + for(unsigned ii = 0; ii < pack_size_0_; ii++) + for(unsigned pack_j = 0; pack_j < num_packs_1_; pack_j++) + for(unsigned jj = 0; jj < pack_size_1_; jj++) for(unsigned K = 0; K < NK; K += 4){ Value *_K = builder.getInt32(K); - Value *current_offset_a_i = builder.CreateAdd(offset_a_i, builder.getInt32(ii * stride_rep_i)); - Value *current_offset_b_i = builder.CreateAdd(offset_b_i, builder.getInt32(jj * stride_rep_j)); - Value *ha = TA->get_value({current_offset_a_i, builder.CreateAdd(offset_a_k, _K)}); - Value *hb = TB->get_value({current_offset_b_i, builder.CreateAdd(offset_b_k, _K)}); + Value *current_offset_a_i = builder.CreateAdd(offset_a_i_, builder.getInt32(pack_i*stride_rep_i*pack_size_0_ + ii*4)); + Value *current_offset_b_i = builder.CreateAdd(offset_b_j_, 
builder.getInt32(pack_j*stride_rep_j*pack_size_1_ + jj*4)); + Value *ha = TA->get_value({current_offset_a_i, builder.CreateAdd(offset_a_k_, _K)}); + Value *hb = TB->get_value({current_offset_b_i, builder.CreateAdd(offset_b_k_, _K)}); Value *ha0 = builder.CreateExtractElement(ha, builder.getInt32(0)); Value *ha1 = builder.CreateExtractElement(ha, builder.getInt32(1)); Value *hb0 = builder.CreateExtractElement(hb, builder.getInt32(0)); Value *hb1 = builder.CreateExtractElement(hb, builder.getInt32(1)); std::vector idx = { - (ii*2 + 0) + (jj*4 + 0)*ld_fc, - (ii*2 + 0) + (jj*4 + 1)*ld_fc, - (ii*2 + 1) + (jj*4 + 0)*ld_fc, - (ii*2 + 1) + (jj*4 + 1)*ld_fc, - (ii*2 + 0) + (jj*4 + 2)*ld_fc, - (ii*2 + 0) + (jj*4 + 3)*ld_fc, - (ii*2 + 1) + (jj*4 + 2)*ld_fc, - (ii*2 + 1) + (jj*4 + 3)*ld_fc + (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 0)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 1)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 0)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 1)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 2)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 3)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 2)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 3)*ld_fc }; Value *nc = builder.CreateCall(mma_fn, {ha0, ha1, hb0, hb1, fc[idx[0]], fc[idx[1]], fc[idx[2]], fc[idx[3]], fc[idx[4]], fc[idx[5]], fc[idx[6]], fc[idx[7]]}); fc[idx[0]] = builder.CreateExtractValue(nc, {0}); diff --git a/lib/codegen/shmem_allocation.cpp b/lib/codegen/shmem_allocation.cpp index 60df4a9f3..379ba0216 100644 --- a/lib/codegen/shmem_allocation.cpp +++ b/lib/codegen/shmem_allocation.cpp @@ -24,7 +24,7 @@ unsigned shmem_allocation::is_ld_padded(ir::value *x) { if(dynamic_cast(user)) if(params_->get_fragment(user, 0) == tune::HMMA_FRAGMENT_C){ if(x == user->get_operand(0)) - return 16; + return 8; else return 16; } diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 4ff863666..c3139ece6 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -255,7 +255,7 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ -// std::cout << source << std::endl; + std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; From 1c6372711b4ea1bc2fd6515fe46d8f0579d253b8 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 12 Jun 2019 20:30:28 -0700 Subject: [PATCH 177/494] added interleaving --- lib/codegen/selection.cpp | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 5f3a28d71..bf6600dec 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -1,4 +1,4 @@ -#include "triton/codegen/selection.h" +#include "triton/codegen/selection.h" #include "triton/codegen/tune.h" #include "triton/codegen/shmem_allocation.h" #include "triton/codegen/target.h" @@ -501,8 +501,8 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value 
*u_thread_id unsigned num_rep_0 = shapes[0]->get_value() / hmma_bts_0; unsigned num_rep_1 = shapes[1]->get_value() / hmma_bts_1; // size of each pack (interleaving) - pack_size_0_ = 1; - pack_size_1_ = 1; + pack_size_0_ = 2; + pack_size_1_ = 2; // number of packs (interleaving) num_packs_0_ = num_rep_0 / pack_size_0_; num_packs_1_ = num_rep_1 / pack_size_1_; @@ -514,10 +514,12 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id Value *in_pair_off_b = builder.CreateMul(builder.CreateUDiv(builder.CreateAnd(u_thread_id, _16), builder.getInt32(4)), builder.getInt32(pack_size_1_)); // Quad pair id Value *pair_a_id = builder.CreateUDiv(builder.CreateURem(u_thread_id, _16), _4); - Value *pair_b_id = builder.CreateUDiv(builder.CreateURem(u_thread_id, _16), builder.CreateUDiv(_16, builder.getInt32(fpw_1))); + Value *pair_b_id = builder.CreateUDiv(builder.CreateURem(u_thread_id, _16), _4); + pair_a_id = builder.CreateURem(pair_a_id, builder.getInt32(fpw_0)); + pair_b_id = builder.CreateUDiv(pair_b_id, builder.getInt32(fpw_0)); // Quad pair offset - Value *pair_a_off = builder.CreateURem(builder.CreateMul(pair_a_id, builder.getInt32(8 * pack_size_0_)), builder.getInt32(hmma_wts_0 * pack_size_0_)); - Value *pair_b_off = builder.CreateURem(builder.CreateMul(pair_b_id, builder.getInt32(8 * pack_size_1_)), builder.getInt32(hmma_wts_1 * pack_size_1_)); + Value *pair_a_off = builder.CreateMul(pair_a_id, builder.getInt32(8 * pack_size_0_)); + Value *pair_b_off = builder.CreateMul(pair_b_id, builder.getInt32(8 * pack_size_1_)); /* inter warp offset */ Value *warp_id_0 = builder.CreateURem(u_warp_id, builder.getInt32(wpt_0)); @@ -534,9 +536,8 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id offset_b_k_ = builder.CreateAnd(u_thread_id, _3); // c offsets Value *offset_c_i = builder.CreateAdd(builder.CreateAnd(u_thread_id, _1), offset_a_i_); - Value *offset_c_j = builder.CreateAdd(warp_offset_j, - builder.CreateAdd(builder.CreateAnd(u_thread_id, _2), - pair_b_off)); + Value *offset_c_j = builder.CreateAdd(builder.CreateAnd(u_thread_id, _2), + builder.CreateAdd(warp_offset_j, pair_b_off)); /* indices */ // i indices @@ -551,8 +552,8 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id for(unsigned pack = 0; pack < num_packs_1_; pack++) for(unsigned jj = 0; jj < pack_size_1_; jj++) for(unsigned j = 0; j < 2; j++){ - idx_j.push_back(builder.CreateAdd(offset_c_j, builder.getInt32(pack*hmma_bts_1*pack_size_1_ + jj*8 + j*4))); - idx_j.push_back(builder.CreateAdd(offset_c_j, builder.getInt32(pack*hmma_bts_1*pack_size_1_ + jj*8 + j*4 + 1))); + idx_j.push_back(builder.CreateAdd(offset_c_j, builder.getInt32(pack*hmma_bts_1*pack_size_1_ + jj*4 + j*4*pack_size_1_))); + idx_j.push_back(builder.CreateAdd(offset_c_j, builder.getInt32(pack*hmma_bts_1*pack_size_1_ + jj*4 + j*4*pack_size_1_ + 1))); } /* axes */ From d487cf31cecc7c013f2fecf45f45158a2243110b Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 12 Jun 2019 21:07:01 -0700 Subject: [PATCH 178/494] trying 128-bit loads --- examples/python/tensorflow/dot.cpp | 4 +- examples/python/tensorflow/run.py | 14 +++---- lib/codegen/selection.cpp | 66 +++++++++++++++--------------- lib/codegen/shmem_allocation.cpp | 4 +- lib/driver/module.cpp | 2 +- 5 files changed, 46 insertions(+), 44 deletions(-) diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index fae790eb1..c31d8745c 100644 --- a/examples/python/tensorflow/dot.cpp +++ 
b/examples/python/tensorflow/dot.cpp @@ -23,7 +23,7 @@ const char* src = R"( const tunable int32 TM = {64, 128}; const tunable int32 TN = {64, 128}; -const tunable int32 TK = {32}; +const tunable int32 TK = {16}; const tunable int32 GZ = {1}; void matmul(restrict read_only fp16 *A, restrict read_only fp16 *B, @@ -117,7 +117,7 @@ class BlockSparseGemmOp : public OpKernel { return 2.*M*N*K / ts * 1e-3; }; // just-in-time compile source-code -// jit.autotune("matmul", src, benchmark); + jit.autotune("matmul", src, benchmark); // jit.add_module("matmul", src, {4, 2, 8, 4, 2, 32, 1, 4, 1, 1, 8, 8, 8, 1}); // jit.add_module("matmul", src, {32, 2, 128, 32, 2, 128, 2, 2, 2, 2, 4, 8, 4, 1}); jit.add_module("matmul", src, {16, 4, 128, 16, 4, 128, 2, 2, 2, 2, 8, 32, 8, 1}); diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index ebb01ddad..86c0bc999 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -6,7 +6,7 @@ data_files_path = tf.resource_loader.get_data_files_path() library_dir = os.path.dirname(os.path.realpath(__file__)) module = tf.load_op_library(os.path.join(library_dir, 'libtf_blocksparse.so')) -M, N, K = 128, 128, 128 +M, N, K = 8192, 8192, 8192 a = tf.placeholder(tf.float16, shape=[M, K]) b = tf.placeholder(tf.float16, shape=[N, K]) locks = tf.placeholder(tf.int32, shape=[4096]) @@ -30,10 +30,10 @@ result = sess.run([c], feed_dict = {locks: np.zeros(4096), # min_iters=100) #print(end - start) #print(2*M*N*K / (end - start) * 1e-12) -hresult = np.dot(ha.T, hb).T -dif = np.abs(result - hresult) -print("dif: %f" % np.max(dif)) +#hresult = np.dot(ha.T, hb).T +#dif = np.abs(result - hresult) +#print("dif: %f" % np.max(dif)) -np.savetxt("dif.txt", dif, fmt="%5.2f") -np.savetxt("gpu.txt", result, fmt="%5.2f") -np.savetxt("cpu.txt", hresult, fmt="%5.2f") +#np.savetxt("dif.txt", dif, fmt="%5.2f") +#np.savetxt("gpu.txt", result, fmt="%5.2f") +#np.savetxt("cpu.txt", hresult, fmt="%5.2f") diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index bf6600dec..e265a0fc7 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -501,8 +501,8 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id unsigned num_rep_0 = shapes[0]->get_value() / hmma_bts_0; unsigned num_rep_1 = shapes[1]->get_value() / hmma_bts_1; // size of each pack (interleaving) - pack_size_0_ = 2; - pack_size_1_ = 2; + pack_size_0_ = std::min(num_rep_0, 2); + pack_size_1_ = std::min(num_rep_1, 2); // number of packs (interleaving) num_packs_0_ = num_rep_0 / pack_size_0_; num_packs_1_ = num_rep_1 / pack_size_1_; @@ -922,8 +922,8 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & } else { - TA->set_vector_size(4); - TB->set_vector_size(4); + TA->set_vector_size(4*pack_size_0_); + TB->set_vector_size(4*pack_size_1_); TA->set_return_mode(true); TB->set_return_mode(true); @@ -955,38 +955,40 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & unsigned num_rep_j = shapes[1]->get_value() / stride_rep_j; unsigned ld_fc = num_rep_i * 2; for(unsigned pack_i = 0; pack_i < num_packs_0_; pack_i++) - for(unsigned ii = 0; ii < pack_size_0_; ii++) - for(unsigned pack_j = 0; pack_j < num_packs_1_; pack_j++) - for(unsigned jj = 0; jj < pack_size_1_; jj++) + for(unsigned pack_j = 0; pack_j < num_packs_1_; pack_j++){ for(unsigned K = 0; K < NK; K += 4){ Value *_K = builder.getInt32(K); - Value *current_offset_a_i = builder.CreateAdd(offset_a_i_, 
builder.getInt32(pack_i*stride_rep_i*pack_size_0_ + ii*4)); - Value *current_offset_b_i = builder.CreateAdd(offset_b_j_, builder.getInt32(pack_j*stride_rep_j*pack_size_1_ + jj*4)); + Value *current_offset_a_i = builder.CreateAdd(offset_a_i_, builder.getInt32(pack_i*stride_rep_i*pack_size_0_)); + Value *current_offset_b_i = builder.CreateAdd(offset_b_j_, builder.getInt32(pack_j*stride_rep_j*pack_size_1_)); Value *ha = TA->get_value({current_offset_a_i, builder.CreateAdd(offset_a_k_, _K)}); Value *hb = TB->get_value({current_offset_b_i, builder.CreateAdd(offset_b_k_, _K)}); - Value *ha0 = builder.CreateExtractElement(ha, builder.getInt32(0)); - Value *ha1 = builder.CreateExtractElement(ha, builder.getInt32(1)); - Value *hb0 = builder.CreateExtractElement(hb, builder.getInt32(0)); - Value *hb1 = builder.CreateExtractElement(hb, builder.getInt32(1)); - std::vector idx = { - (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 0)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 1)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 0)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 1)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 2)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 3)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 2)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 3)*ld_fc - }; - Value *nc = builder.CreateCall(mma_fn, {ha0, ha1, hb0, hb1, fc[idx[0]], fc[idx[1]], fc[idx[2]], fc[idx[3]], fc[idx[4]], fc[idx[5]], fc[idx[6]], fc[idx[7]]}); - fc[idx[0]] = builder.CreateExtractValue(nc, {0}); - fc[idx[1]] = builder.CreateExtractValue(nc, {1}); - fc[idx[2]] = builder.CreateExtractValue(nc, {2}); - fc[idx[3]] = builder.CreateExtractValue(nc, {3}); - fc[idx[4]] = builder.CreateExtractValue(nc, {4}); - fc[idx[5]] = builder.CreateExtractValue(nc, {5}); - fc[idx[6]] = builder.CreateExtractValue(nc, {6}); - fc[idx[7]] = builder.CreateExtractValue(nc, {7}); + for(unsigned ii = 0; ii < pack_size_0_; ii++) + for(unsigned jj = 0; jj < pack_size_1_; jj++){ + Value *ha0 = builder.CreateExtractElement(ha, builder.getInt32(ii*pack_size_0_ + 0)); + Value *ha1 = builder.CreateExtractElement(ha, builder.getInt32(ii*pack_size_0_ + 1)); + Value *hb0 = builder.CreateExtractElement(hb, builder.getInt32(jj*pack_size_0_ + 0)); + Value *hb1 = builder.CreateExtractElement(hb, builder.getInt32(jj*pack_size_0_ + 1)); + std::vector idx = { + (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 0)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 1)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 0)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 1)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 2)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 3)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 2)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 3)*ld_fc + }; + Value *nc = builder.CreateCall(mma_fn, {ha0, ha1, hb0, hb1, fc[idx[0]], fc[idx[1]], fc[idx[2]], fc[idx[3]], fc[idx[4]], fc[idx[5]], fc[idx[6]], fc[idx[7]]}); + fc[idx[0]] = builder.CreateExtractValue(nc, {0}); + fc[idx[1]] = builder.CreateExtractValue(nc, {1}); + fc[idx[2]] = builder.CreateExtractValue(nc, {2}); + 
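// A standalone sketch (not part of this patch) of the accumulator indexing
// used above. Each inline-asm mma.sync call updates eight fp32 accumulators
// forming a 2x4 sub-tile of the flat fc array, whose leading dimension is
// ld_fc = num_rep_i * 2. With hypothetical pack counts, the loop below
// reproduces the idx arithmetic so the mapping from (pack_i, ii, pack_j, jj)
// to fc positions can be checked in isolation:
//
//   #include <cstdio>
//   int main() {
//       const unsigned num_packs_0 = 2, pack_size_0 = 2;     // assumed values
//       const unsigned num_packs_1 = 2, pack_size_1 = 2;
//       const unsigned ld_fc = num_packs_0 * pack_size_0 * 2; // 2 rows per rep
//       for (unsigned pack_i = 0; pack_i < num_packs_0; pack_i++)
//       for (unsigned ii = 0; ii < pack_size_0; ii++)
//       for (unsigned pack_j = 0; pack_j < num_packs_1; pack_j++)
//       for (unsigned jj = 0; jj < pack_size_1; jj++)
//       for (unsigned i = 0; i < 2; i++)      // 2 accumulator rows per mma
//       for (unsigned j = 0; j < 4; j++) {    // 4 accumulator cols per mma
//           unsigned idx = (pack_i*2*pack_size_0 + ii*2 + i)
//                        + (pack_j*4*pack_size_1 + jj*4 + j)*ld_fc;
//           std::printf("mma(%u,%u,%u,%u) -> fc[%u]\n",
//                       pack_i, ii, pack_j, jj, idx);
//       }
//       return 0;
//   }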
fc[idx[3]] = builder.CreateExtractValue(nc, {3}); + fc[idx[4]] = builder.CreateExtractValue(nc, {4}); + fc[idx[5]] = builder.CreateExtractValue(nc, {5}); + fc[idx[6]] = builder.CreateExtractValue(nc, {6}); + fc[idx[7]] = builder.CreateExtractValue(nc, {7}); + } + } } // write back diff --git a/lib/codegen/shmem_allocation.cpp b/lib/codegen/shmem_allocation.cpp index 379ba0216..6ef2101c2 100644 --- a/lib/codegen/shmem_allocation.cpp +++ b/lib/codegen/shmem_allocation.cpp @@ -24,9 +24,9 @@ unsigned shmem_allocation::is_ld_padded(ir::value *x) { if(dynamic_cast(user)) if(params_->get_fragment(user, 0) == tune::HMMA_FRAGMENT_C){ if(x == user->get_operand(0)) - return 8; + return 4; else - return 16; + return 4; } return 0; } diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index c3139ece6..4ff863666 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -255,7 +255,7 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ - std::cout << source << std::endl; +// std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; From 21a9b92c87352f38072ed25df8d9312df702c74b Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 13 Jun 2019 17:16:00 -0700 Subject: [PATCH 179/494] disabling interleaving --- examples/python/tensorflow/dot.cpp | 6 +++--- lib/codegen/selection.cpp | 11 ++++++++--- lib/codegen/shmem_allocation.cpp | 7 ++++--- lib/driver/module.cpp | 2 +- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index c31d8745c..183b3f492 100644 --- a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -23,7 +23,7 @@ const char* src = R"( const tunable int32 TM = {64, 128}; const tunable int32 TN = {64, 128}; -const tunable int32 TK = {16}; +const tunable int32 TK = {32}; const tunable int32 GZ = {1}; void matmul(restrict read_only fp16 *A, restrict read_only fp16 *B, @@ -117,10 +117,10 @@ class BlockSparseGemmOp : public OpKernel { return 2.*M*N*K / ts * 1e-3; }; // just-in-time compile source-code - jit.autotune("matmul", src, benchmark); +// jit.autotune("matmul", src, benchmark); // jit.add_module("matmul", src, {4, 2, 8, 4, 2, 32, 1, 4, 1, 1, 8, 8, 8, 1}); -// jit.add_module("matmul", src, {32, 2, 128, 32, 2, 128, 2, 2, 2, 2, 4, 8, 4, 1}); jit.add_module("matmul", src, {16, 4, 128, 16, 4, 128, 2, 2, 2, 2, 8, 32, 8, 1}); +// jit.add_module("matmul", src, {8, 8, 128, 16, 8, 128, 2, 2, 2, 2, 16, 32, 8, 1 }); triton::driver::kernel* kernel = jit.get_function("matmul"); triton::jit::launch_information info = jit.get_launch_info("matmul"); std::cout << benchmark(kernel, info) << std::endl;; diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index e265a0fc7..5c1d67bf3 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -501,8 +501,8 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id unsigned num_rep_0 = shapes[0]->get_value() / hmma_bts_0; unsigned num_rep_1 = shapes[1]->get_value() / hmma_bts_1; // size of each pack (interleaving) - pack_size_0_ = std::min(num_rep_0, 2); - pack_size_1_ = 
std::min(num_rep_1, 2); + pack_size_0_ = std::min(num_rep_0, 1); + pack_size_1_ = std::min(num_rep_1, 1); // number of packs (interleaving) num_packs_0_ = num_rep_0 / pack_size_0_; num_packs_1_ = num_rep_1 / pack_size_1_; @@ -531,9 +531,14 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id // a offset offset_a_i_ = builder.CreateAdd(warp_offset_i, builder.CreateAdd(pair_a_off, in_pair_off_a)); offset_a_k_ = builder.CreateAnd(u_thread_id, _3); - // b offsets +// // b offsets offset_b_j_ = builder.CreateAdd(warp_offset_j, builder.CreateAdd(pair_b_off, in_pair_off_b)); offset_b_k_ = builder.CreateAnd(u_thread_id, _3); +// offset_a_i_ = builder.getInt32(0); +// offset_a_k_ = builder.getInt32(0); +// offset_b_j_ = builder.getInt32(0); +// offset_b_k_ = builder.getInt32(0); + // c offsets Value *offset_c_i = builder.CreateAdd(builder.CreateAnd(u_thread_id, _1), offset_a_i_); Value *offset_c_j = builder.CreateAdd(builder.CreateAnd(u_thread_id, _2), diff --git a/lib/codegen/shmem_allocation.cpp b/lib/codegen/shmem_allocation.cpp index 6ef2101c2..aa7aada71 100644 --- a/lib/codegen/shmem_allocation.cpp +++ b/lib/codegen/shmem_allocation.cpp @@ -23,10 +23,11 @@ unsigned shmem_allocation::is_ld_padded(ir::value *x) { for(ir::user* user: x->get_users()) if(dynamic_cast(user)) if(params_->get_fragment(user, 0) == tune::HMMA_FRAGMENT_C){ - if(x == user->get_operand(0)) - return 4; + if(x == user->get_operand(0)){ + return 16; + } else - return 4; + return 16; } return 0; } diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 4ff863666..c3139ece6 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -255,7 +255,7 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ -// std::cout << source << std::endl; + std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; From 36e3667a9a43697501159c48a7f50ffcd0d5680c Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 13 Jun 2019 17:51:54 -0700 Subject: [PATCH 180/494] removed shared conflicts for 8x32x4 and 32x8x4 configurations --- examples/python/tensorflow/dot.cpp | 2 +- examples/python/tensorflow/run.py | 8 ++++---- lib/codegen/selection.cpp | 12 ++++++------ 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index 183b3f492..cf88693e4 100644 --- a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -119,7 +119,7 @@ class BlockSparseGemmOp : public OpKernel { // just-in-time compile source-code // jit.autotune("matmul", src, benchmark); // jit.add_module("matmul", src, {4, 2, 8, 4, 2, 32, 1, 4, 1, 1, 8, 8, 8, 1}); - jit.add_module("matmul", src, {16, 4, 128, 16, 4, 128, 2, 2, 2, 2, 8, 32, 8, 1}); + jit.add_module("matmul", src, {16, 4, 128, 16, 4, 128, 1, 4, 2, 2, 8, 32, 8, 1}); // jit.add_module("matmul", src, {8, 8, 128, 16, 8, 128, 2, 2, 2, 2, 16, 32, 8, 1 }); triton::driver::kernel* kernel = jit.get_function("matmul"); triton::jit::launch_information info = jit.get_launch_info("matmul"); diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 
86c0bc999..94764e515 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -6,7 +6,7 @@ data_files_path = tf.resource_loader.get_data_files_path() library_dir = os.path.dirname(os.path.realpath(__file__)) module = tf.load_op_library(os.path.join(library_dir, 'libtf_blocksparse.so')) -M, N, K = 8192, 8192, 8192 +M, N, K = 256, 256, 256 a = tf.placeholder(tf.float16, shape=[M, K]) b = tf.placeholder(tf.float16, shape=[N, K]) locks = tf.placeholder(tf.int32, shape=[4096]) @@ -30,9 +30,9 @@ result = sess.run([c], feed_dict = {locks: np.zeros(4096), # min_iters=100) #print(end - start) #print(2*M*N*K / (end - start) * 1e-12) -#hresult = np.dot(ha.T, hb).T -#dif = np.abs(result - hresult) -#print("dif: %f" % np.max(dif)) +hresult = np.dot(ha.T, hb).T +dif = np.abs(result - hresult) +print("dif: %f" % np.max(dif)) #np.savetxt("dif.txt", dif, fmt="%5.2f") #np.savetxt("gpu.txt", result, fmt="%5.2f") diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 5c1d67bf3..9ba1fa870 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -510,16 +510,16 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id /* intra warp offset */ // offset of quad in pair - Value *in_pair_off_a = builder.CreateMul(builder.CreateUDiv(builder.CreateAnd(u_thread_id, _16), builder.getInt32(4)), builder.getInt32(pack_size_0_)); - Value *in_pair_off_b = builder.CreateMul(builder.CreateUDiv(builder.CreateAnd(u_thread_id, _16), builder.getInt32(4)), builder.getInt32(pack_size_1_)); + Value *in_pair_off_a = builder.CreateMul(builder.CreateUDiv(builder.CreateAnd(u_thread_id, _16), builder.getInt32(4)), builder.getInt32(fpw_0 * pack_size_0_)); + Value *in_pair_off_b = builder.CreateMul(builder.CreateUDiv(builder.CreateAnd(u_thread_id, _16), builder.getInt32(4)), builder.getInt32(fpw_1 * pack_size_1_)); // Quad pair id Value *pair_a_id = builder.CreateUDiv(builder.CreateURem(u_thread_id, _16), _4); Value *pair_b_id = builder.CreateUDiv(builder.CreateURem(u_thread_id, _16), _4); pair_a_id = builder.CreateURem(pair_a_id, builder.getInt32(fpw_0)); pair_b_id = builder.CreateUDiv(pair_b_id, builder.getInt32(fpw_0)); // Quad pair offset - Value *pair_a_off = builder.CreateMul(pair_a_id, builder.getInt32(8 * pack_size_0_)); - Value *pair_b_off = builder.CreateMul(pair_b_id, builder.getInt32(8 * pack_size_1_)); + Value *pair_a_off = builder.CreateMul(pair_a_id, builder.getInt32(4 * pack_size_0_)); + Value *pair_b_off = builder.CreateMul(pair_b_id, builder.getInt32(4 * pack_size_1_)); /* inter warp offset */ Value *warp_id_0 = builder.CreateURem(u_warp_id, builder.getInt32(wpt_0)); @@ -557,8 +557,8 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id for(unsigned pack = 0; pack < num_packs_1_; pack++) for(unsigned jj = 0; jj < pack_size_1_; jj++) for(unsigned j = 0; j < 2; j++){ - idx_j.push_back(builder.CreateAdd(offset_c_j, builder.getInt32(pack*hmma_bts_1*pack_size_1_ + jj*4 + j*4*pack_size_1_))); - idx_j.push_back(builder.CreateAdd(offset_c_j, builder.getInt32(pack*hmma_bts_1*pack_size_1_ + jj*4 + j*4*pack_size_1_ + 1))); + idx_j.push_back(builder.CreateAdd(offset_c_j, builder.getInt32(pack*hmma_bts_1*pack_size_1_ + jj*4 + j*4*fpw_1*pack_size_1_))); + idx_j.push_back(builder.CreateAdd(offset_c_j, builder.getInt32(pack*hmma_bts_1*pack_size_1_ + jj*4 + j*4*fpw_1*pack_size_1_ + 1))); } /* axes */ From f7dcea11875ef565f772d393fcca428293df6fb3 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 13 Jun 2019 
19:48:02 -0700 Subject: [PATCH 181/494] Now doing double-buffering --- examples/python/tensorflow/dot.cpp | 19 +++++++++++-------- examples/python/tensorflow/run.py | 8 ++++---- lib/codegen/selection.cpp | 11 ++++++----- lib/codegen/shmem_allocation.cpp | 10 +++------- lib/driver/module.cpp | 2 +- 5 files changed, 25 insertions(+), 25 deletions(-) diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index cf88693e4..937309df9 100644 --- a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -23,7 +23,7 @@ const char* src = R"( const tunable int32 TM = {64, 128}; const tunable int32 TN = {64, 128}; -const tunable int32 TK = {32}; +const tunable int32 TK = {16}; const tunable int32 GZ = {1}; void matmul(restrict read_only fp16 *A, restrict read_only fp16 *B, @@ -39,12 +39,14 @@ void matmul(restrict read_only fp16 *A, restrict read_only fp16 *B, fp32 c[TM, TN] = 0; fp16* pa[TM, TK] = A + rka[newaxis, :]*lda + rxa[:, newaxis]; fp16* pb[TN, TK] = B + rkb[newaxis, :]*ldb + ryb[:, newaxis]; - for(int32 k = K; k > 0; k = k - TK){ - fp16 a[TM, TK] = *pa; - fp16 b[TN, TK] = *pb; - c = dot(a, trans(b), c); + fp16 a[TM, TK] = *pa; + fp16 b[TN, TK] = *pb; + for(int32 k = K; k > TK; k = k - TK){ pa = pa + TK*lda; pb = pb + TK*ldb; + c = dot(a, trans(b), c); + a = *pa; + b = *pb; } int32 rxc[TM] = get_global_range[TM](0); int32 ryc[TN] = get_global_range[TN](1); @@ -116,11 +118,12 @@ class BlockSparseGemmOp : public OpKernel { [&](){ stream->synchronize(); }, ctx->device()); return 2.*M*N*K / ts * 1e-3; }; - // just-in-time compile source-code -// jit.autotune("matmul", src, benchmark); +// just-in-time compile source-code + jit.autotune("matmul", src, benchmark); // jit.add_module("matmul", src, {4, 2, 8, 4, 2, 32, 1, 4, 1, 1, 8, 8, 8, 1}); - jit.add_module("matmul", src, {16, 4, 128, 16, 4, 128, 1, 4, 2, 2, 8, 32, 8, 1}); +// jit.add_module("matmul", src, {16, 4, 128, 16, 4, 128, 2, 2, 2, 2, 8, 32, 8, 1}); // jit.add_module("matmul", src, {8, 8, 128, 16, 8, 128, 2, 2, 2, 2, 16, 32, 8, 1 }); + jit.add_module("matmul", src, {16, 4, 128, 16, 4, 128, 2, 2, 2, 2, 8, 16, 8, 1}); triton::driver::kernel* kernel = jit.get_function("matmul"); triton::jit::launch_information info = jit.get_launch_info("matmul"); std::cout << benchmark(kernel, info) << std::endl;; diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 94764e515..86c0bc999 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -6,7 +6,7 @@ data_files_path = tf.resource_loader.get_data_files_path() library_dir = os.path.dirname(os.path.realpath(__file__)) module = tf.load_op_library(os.path.join(library_dir, 'libtf_blocksparse.so')) -M, N, K = 256, 256, 256 +M, N, K = 8192, 8192, 8192 a = tf.placeholder(tf.float16, shape=[M, K]) b = tf.placeholder(tf.float16, shape=[N, K]) locks = tf.placeholder(tf.int32, shape=[4096]) @@ -30,9 +30,9 @@ result = sess.run([c], feed_dict = {locks: np.zeros(4096), # min_iters=100) #print(end - start) #print(2*M*N*K / (end - start) * 1e-12) -hresult = np.dot(ha.T, hb).T -dif = np.abs(result - hresult) -print("dif: %f" % np.max(dif)) +#hresult = np.dot(ha.T, hb).T +#dif = np.abs(result - hresult) +#print("dif: %f" % np.max(dif)) #np.savetxt("dif.txt", dif, fmt="%5.2f") #np.savetxt("gpu.txt", result, fmt="%5.2f") diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 9ba1fa870..31cee7e6b 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -160,12 +160,18 @@ 
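// A standalone scalar analogue (not part of this patch) of the
// double-buffering pattern that the rewritten matmul loop in dot.cpp above
// follows: the operands of iteration k are consumed before fresh values for
// iteration k+1 are loaded into new registers, so memory traffic overlaps
// with the dot-product math. The scalar reduction and names below are
// stand-ins for the TM x TK / TN x TK tile loads.
//
//   float dot_double_buffered(const float* a, const float* b, int K) {
//       float c = 0.0f;
//       float ra = a[0], rb = b[0];        // prologue: first tile load
//       for (int k = 0; k + 1 < K; ++k) {
//           c += ra * rb;                  // consume current buffers
//           ra = a[k + 1];                 // prefetch next iteration
//           rb = b[k + 1];
//       }
//       c += ra * rb;                      // epilogue: last tile
//       return c;
//   }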
Value* shared_tile::get_value(indices_t idx) { vector_size = vector_size / 2; } if(base_ptr == nullptr){ +// BasicBlock* store = builder_.GetInsertBlock(); +// if(!non_cst_idx.empty()) +// if(isa(non_cst_idx.front())){ +// builder_.SetInsertPoint((Instruction*)non_cst_idx.front()); +// } base_ptr = builder_.CreateGEP(ptr_, shared_offset(non_cst_idx)); if(vector_size_ > 1){ Type *vec_ty = VectorType::get(ty, vector_size); Type *vec_ptr_ty = PointerType::get(vec_ty, base_ptr->getType()->getPointerAddressSpace()); base_ptr = builder_.CreateBitCast(base_ptr, vec_ptr_ty); } +// builder_.SetInsertPoint(store); } Value *offset = shared_offset(cst_idx); Value *div = offset; @@ -534,10 +540,6 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id // // b offsets offset_b_j_ = builder.CreateAdd(warp_offset_j, builder.CreateAdd(pair_b_off, in_pair_off_b)); offset_b_k_ = builder.CreateAnd(u_thread_id, _3); -// offset_a_i_ = builder.getInt32(0); -// offset_a_k_ = builder.getInt32(0); -// offset_b_j_ = builder.getInt32(0); -// offset_b_k_ = builder.getInt32(0); // c offsets Value *offset_c_i = builder.CreateAdd(builder.CreateAnd(u_thread_id, _1), offset_a_i_); @@ -957,7 +959,6 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & unsigned stride_rep_i = wpt_0 * wts_0; unsigned stride_rep_j = wpt_1 * wts_1; unsigned num_rep_i = shapes[0]->get_value() / stride_rep_i; - unsigned num_rep_j = shapes[1]->get_value() / stride_rep_j; unsigned ld_fc = num_rep_i * 2; for(unsigned pack_i = 0; pack_i < num_packs_0_; pack_i++) for(unsigned pack_j = 0; pack_j < num_packs_1_; pack_j++){ diff --git a/lib/codegen/shmem_allocation.cpp b/lib/codegen/shmem_allocation.cpp index aa7aada71..4941a75f5 100644 --- a/lib/codegen/shmem_allocation.cpp +++ b/lib/codegen/shmem_allocation.cpp @@ -13,9 +13,9 @@ namespace codegen{ unsigned shmem_allocation::is_ld_padded(ir::value *x) { if(auto* phi = dynamic_cast(x)) { - bool result = false; + unsigned result = 0; for(unsigned i = 0; i < phi->get_num_incoming(); i++) - result = result | is_ld_padded(phi->get_incoming_value(i)); + result = std::max(result, is_ld_padded(phi->get_incoming_value(i))); return result; } if(dynamic_cast(x)) @@ -23,11 +23,7 @@ unsigned shmem_allocation::is_ld_padded(ir::value *x) { for(ir::user* user: x->get_users()) if(dynamic_cast(user)) if(params_->get_fragment(user, 0) == tune::HMMA_FRAGMENT_C){ - if(x == user->get_operand(0)){ - return 16; - } - else - return 16; + return 16; } return 0; } diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index c3139ece6..4ff863666 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -255,7 +255,7 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ - std::cout << source << std::endl; +// std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; From 67989e7d18c7970454bc5ac4306401d5d862a054 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 13 Jun 2019 20:03:28 -0700 Subject: [PATCH 182/494] fixup --- lib/codegen/shmem_allocation.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/codegen/shmem_allocation.cpp 
b/lib/codegen/shmem_allocation.cpp index 4941a75f5..7940808bb 100644 --- a/lib/codegen/shmem_allocation.cpp +++ b/lib/codegen/shmem_allocation.cpp @@ -25,7 +25,7 @@ unsigned shmem_allocation::is_ld_padded(ir::value *x) { if(params_->get_fragment(user, 0) == tune::HMMA_FRAGMENT_C){ return 16; } - return 0; + return 16; } unsigned shmem_allocation::get_num_bytes(ir::value *x) { From f257884eb74712d7f864a9a8987daef8082cb54c Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 24 Jun 2019 09:31:34 -0700 Subject: [PATCH 183/494] some cleaning --- include/triton/dnn/gemm.h | 3 +++ include/triton/lang/declaration.h | 33 +++++++++++++++++++++-------- include/triton/lang/node.h | 2 +- include/triton/lang/parser.y | 22 +++++++++---------- include/triton/lang/scanner.l | 10 +++++---- lib/codegen/shmem_allocation.cpp | 14 ++++++------- lib/lang/declaration.cpp | 35 +++++++++++++++++-------------- 7 files changed, 71 insertions(+), 48 deletions(-) diff --git a/include/triton/dnn/gemm.h b/include/triton/dnn/gemm.h index 0697ea981..e44c9631d 100644 --- a/include/triton/dnn/gemm.h +++ b/include/triton/dnn/gemm.h @@ -8,11 +8,14 @@ namespace dnn{ class gemm { public: static void init(driver::stream* stream, driver::buffer* locks); + static void set_arg(driver::kernel *kernel, driver::buffer *a, driver::buffer *b, driver::buffer *c, int32_t M, int32_t N, int32_t K, driver::buffer *locks, int32_t grid_0, int32_t grid_1); + static std::vector default_params(bool AT, bool BT); + static std::string src(bool AT, bool BT); template diff --git a/include/triton/lang/declaration.h b/include/triton/lang/declaration.h index a7dbdb97e..22275630c 100644 --- a/include/triton/lang/declaration.h +++ b/include/triton/lang/declaration.h @@ -40,34 +40,49 @@ public: }; // Types +class modifier: public node { + +}; + +class storage_specifier: public node { +public: + storage_specifier(STORAGE_SPEC_T value): value_(value) {} + STORAGE_SPEC_T value() const { return value_; } + +private: + const STORAGE_SPEC_T value_; +}; + + class declaration_specifier: public node{ public: virtual ir::type* type(ir::module *mod) const = 0; - virtual std::vector storage() const = 0; + virtual std::vector modifiers() const = 0; }; class typed_declaration_specifier: public declaration_specifier { public: typed_declaration_specifier(TYPE_T ty): ty_(ty){ } ir::type* type(ir::module *mod) const; - std::vector storage() const; + std::vector modifiers() const; private: const TYPE_T ty_; }; -class storage_declaration_specifier: public declaration_specifier { +class declaration_modifier: public declaration_specifier { public: - storage_declaration_specifier(STORAGE_SPEC_T storage_spec, node *decl_spec) - : storage_spec_(storage_spec), decl_spec_((declaration_specifier*)decl_spec) {} + declaration_modifier(node* mod, node *decl_spec) + : mod_((modifier*)mod), decl_spec_((declaration_specifier*)decl_spec) {} ir::type* type(ir::module *mod) const; - std::vector storage() const; + std::vector modifiers() const; private: - const STORAGE_SPEC_T storage_spec_; + modifier* mod_; const declaration_specifier* decl_spec_; }; + class declarator; class parameter: public node { public: @@ -76,7 +91,7 @@ public: decl_((declarator*)decl) { } ir::type* type(ir::module *mod) const; - std::vector storage() const; + std::vector storage() const; const identifier* id() const; public: @@ -87,7 +102,7 @@ public: /* Declarators */ class declarator: public node{ protected: - typedef std::vector storage_spec_vec_t; + typedef std::vector storage_spec_vec_t; typedef const 
storage_spec_vec_t& storage_spec_vec_const_ref_t; public: diff --git a/include/triton/lang/node.h b/include/triton/lang/node.h index e689f6f16..c9bd0b011 100644 --- a/include/triton/lang/node.h +++ b/include/triton/lang/node.h @@ -23,7 +23,7 @@ class identifier; class constant; class compound_statement; class initializer; -class declaration_specifier; +class modifier; class function; // Node diff --git a/include/triton/lang/parser.y b/include/triton/lang/parser.y index 18fc3bbed..21065d94f 100644 --- a/include/triton/lang/parser.y +++ b/include/triton/lang/parser.y @@ -47,7 +47,7 @@ STORAGE_SPEC_T get_storage_spec(node *op) { return ((token*)op)->storage_spec;} %} %token IDENTIFIER CONSTANT STRING_LITERAL -%token TUNABLE KERNEL RESTRICT READONLY WRITEONLY CONST CONSTANT_SPACE +%token TUNABLE KERNEL RESTRICT READONLY WRITEONLY CONST CONSTANT_SPACE ALIGN MULTIPLE_OF %token PTR_OP INC_OP DEC_OP LEFT_OP RIGHT_OP LE_OP GE_OP EQ_OP NE_OP %token AND_OP OR_OP MUL_ASSIGN DIV_ASSIGN MOD_ASSIGN ADD_ASSIGN %token SUB_ASSIGN LEFT_ASSIGN RIGHT_ASSIGN AND_ASSIGN @@ -351,8 +351,8 @@ parameter_declaration declaration_specifiers - : type_specifier { $$ = new typed_declaration_specifier(get_type_spec($1)); } - | storage_class_specifier declaration_specifiers { $$ = new storage_declaration_specifier(get_storage_spec($1), $2); } + : type_specifier { $$ = new typed_declaration_specifier(get_type_spec($1)); } + | storage_class_specifier declaration_specifiers { $$ = new declaration_modifier($1, $2); } ; init_declarator_list @@ -376,13 +376,13 @@ init_declarator ; storage_class_specifier - : CONST { $$ = new token(CONST_T); } - | TUNABLE { $$ = new token(TUNABLE_T); } - | KERNEL { $$ = new token(KERNEL_T); } - | RESTRICT { $$ = new token(RESTRICT_T); } - | READONLY { $$ = new token(READONLY_T); } - | WRITEONLY { $$ = new token(WRITEONLY_T); } - | CONSTANT_SPACE { $$ = new token(CONSTANT_SPACE_T); } + : CONST { $$ = new storage_specifier(CONST_T); } + | TUNABLE { $$ = new storage_specifier(TUNABLE_T); } + | KERNEL { $$ = new storage_specifier(KERNEL_T); } + | RESTRICT { $$ = new storage_specifier(RESTRICT_T); } + | READONLY { $$ = new storage_specifier(READONLY_T); } + | WRITEONLY { $$ = new storage_specifier(WRITEONLY_T); } + | CONSTANT_SPACE { $$ = new storage_specifier(CONSTANT_SPACE_T); } ; external_declaration @@ -399,7 +399,7 @@ function_definition /* -------------------------- */ translation_unit - : external_declaration { ast_root = new translation_unit($1); $$ = ast_root; } + : external_declaration { ast_root = new translation_unit($1); $$ = ast_root; } | translation_unit external_declaration { $$ = ((translation_unit*)($1))->add($2); } ; diff --git a/include/triton/lang/scanner.l b/include/triton/lang/scanner.l index a2cd50922..e91b25961 100644 --- a/include/triton/lang/scanner.l +++ b/include/triton/lang/scanner.l @@ -21,11 +21,13 @@ using triton::lang::return_void; "restrict" { return return_impl(RESTRICT, yytext); } "read_only" { return return_impl(READONLY, yytext); } "write_only" { return return_impl(WRITEONLY, yytext); } +"align" { return return_impl(ALIGN, yytext); } +"multiple_of" { return return_impl(MULTIPLE_OF, yytext); } "@" { return return_impl(AT, yytext); } -"newaxis" { return return_impl(NEWAXIS, yytext); } -"if" { return return_impl(IF, yytext); } -"else" { return return_impl(ELSE, yytext); } -"for" { return return_impl(FOR, yytext); } +"newaxis" { return return_impl(NEWAXIS, yytext); } +"if" { return return_impl(IF, yytext); } +"else" { return return_impl(ELSE, yytext); } +"for" { 
return return_impl(FOR, yytext); } "while" { return return_impl(WHILE, yytext); } "void" { return return_impl(VOID, yytext); } "uint1" { return return_impl(UINT1, yytext); } diff --git a/lib/codegen/shmem_allocation.cpp b/lib/codegen/shmem_allocation.cpp index 7940808bb..1558c663d 100644 --- a/lib/codegen/shmem_allocation.cpp +++ b/lib/codegen/shmem_allocation.cpp @@ -12,12 +12,6 @@ namespace triton{ namespace codegen{ unsigned shmem_allocation::is_ld_padded(ir::value *x) { - if(auto* phi = dynamic_cast(x)) { - unsigned result = 0; - for(unsigned i = 0; i < phi->get_num_incoming(); i++) - result = std::max(result, is_ld_padded(phi->get_incoming_value(i))); - return result; - } if(dynamic_cast(x)) return 4; for(ir::user* user: x->get_users()) @@ -25,7 +19,13 @@ unsigned shmem_allocation::is_ld_padded(ir::value *x) { if(params_->get_fragment(user, 0) == tune::HMMA_FRAGMENT_C){ return 16; } - return 16; + if(auto* phi = dynamic_cast(x)) { + unsigned result = 0; + for(unsigned i = 0; i < phi->get_num_incoming(); i++) + result = std::max(result, is_ld_padded(phi->get_incoming_value(i))); + return result; + } + return 0; } unsigned shmem_allocation::get_num_bytes(ir::value *x) { diff --git a/lib/lang/declaration.cpp b/lib/lang/declaration.cpp index 46fa6b597..c5a23def5 100644 --- a/lib/lang/declaration.cpp +++ b/lib/lang/declaration.cpp @@ -28,18 +28,18 @@ ir::type* typed_declaration_specifier::type(ir::module *mod) const { } } -std::vector typed_declaration_specifier::storage() const { +std::vector typed_declaration_specifier::modifiers() const { return {}; } -ir::type* storage_declaration_specifier::type(ir::module *mod) const { +ir::type* declaration_modifier::type(ir::module *mod) const { return decl_spec_->type(mod); } -std::vector storage_declaration_specifier::storage() const { - auto result = decl_spec_->storage(); - result.push_back(storage_spec_); +std::vector declaration_modifier::modifiers() const { + auto result = decl_spec_->modifiers(); + result.push_back(mod_); return result; } @@ -49,8 +49,8 @@ ir::type* parameter::type(ir::module *mod) const { return decl_->type(mod, spec_->type(mod), {}); } -std::vector parameter::storage() const { - return spec_->storage(); +std::vector parameter::storage() const { + return spec_->modifiers(); } const identifier *parameter::id() const { @@ -87,7 +87,8 @@ ir::type* tile::type_impl(ir::module *mod, ir::type *type, storage_spec_vec_cons // Pointer ir::type* pointer::type_impl(ir::module*, ir::type *type, storage_spec_vec_const_ref_t storage) const{ - bool is_ptr_to_const = std::find(storage.begin(), storage.end(), CONSTANT_SPACE_T) != storage.end(); + auto is_cst = [](modifier* x){ return x->value() == CONSTANT_SPACE_T; }; + bool is_ptr_to_const = std::find_if(storage.begin(), storage.end(), is_cst) != storage.end(); return ir::pointer_type::get(type, is_ptr_to_const?4:1); } @@ -132,11 +133,12 @@ void initializer::set_specifier(const declaration_specifier *spec) { } ir::value* initializer::codegen(ir::module * mod) const{ - std::vector storage = spec_->storage(); + std::vector storage = spec_->modifiers(); ir::type *ty = decl_->type(mod, spec_->type(mod), storage); std::string name = decl_->id()->name(); ir::value *value = ir::undef_value::get(ty); - if(std::find(storage.begin(), storage.end(), TUNABLE_T) != storage.end()){ + auto is_tunable = [](modifier* x){ return x->value() == TUNABLE_T; }; + if(std::find_if(storage.begin(), storage.end(), is_tunable) != storage.end()){ auto csts = dynamic_cast*>((node*)expr_); if(csts == nullptr) throw 
std::runtime_error("must specify constant list for metaparameters"); @@ -156,7 +158,8 @@ ir::value* initializer::codegen(ir::module * mod) const{ mod->get_scope().types[name] = ty; if(auto *x = dynamic_cast(value)) mod->add_alloc(x); - if(std::find(storage.begin(), storage.end(), CONST_T) != storage.end()) + auto is_cst = [](modifier* mod){ return mod->value() == CONST_T; }; + if(std::find_if(storage.begin(), storage.end(), is_cst) != storage.end()) mod->set_const(name); return value; } @@ -167,8 +170,8 @@ ir::type *type_name::type(ir::module *mod) const{ } /* Function definition */ -ir::attribute_t get_ir_attr(STORAGE_SPEC_T spec){ - switch(spec){ +ir::attribute_t get_ir_attr(modifier* mod){ + switch(mod->value()){ case RESTRICT_T: return ir::noalias; case READONLY_T: return ir::readonly; case WRITEONLY_T: return ir::writeonly; @@ -177,13 +180,13 @@ ir::attribute_t get_ir_attr(STORAGE_SPEC_T spec){ } ir::value* function_definition::codegen(ir::module *mod) const{ - ir::function_type *prototype = (ir::function_type*)header_->type(mod, spec_->type(mod), spec_->storage()); + ir::function_type *prototype = (ir::function_type*)header_->type(mod, spec_->type(mod), spec_->modifiers()); const std::string &name = header_->id()->name(); ir::function *fn = mod->get_or_insert_function(name, prototype); for(unsigned i = 0; i < header_->get_num_args(); i++){ parameter *param = header_->get_arg(i); - std::vector storage = param->storage(); - for(STORAGE_SPEC_T spec: storage) + std::vector storage = param->storage(); + for(modifier* spec: storage) fn->add_attr(1 + i, get_ir_attr(spec)); } header_->bind_parameters(mod, fn); From 72867d17d492957c9121fb096dc6f62387e880d5 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 24 Jun 2019 12:37:13 -0700 Subject: [PATCH 184/494] more cleaning --- examples/python/tensorflow/dot.cpp | 13 ++++++----- include/triton/ir/function.h | 32 ++++++++++++++++++++++---- include/triton/lang/declaration.h | 35 +++++++++++++++++++++++++---- include/triton/lang/parser.y | 9 ++++++++ lib/codegen/selection.cpp | 20 ++++++++--------- lib/driver/backend.cpp | 4 ++-- lib/lang/declaration.cpp | 36 +++++++++++++++++++++--------- 7 files changed, 113 insertions(+), 36 deletions(-) diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index 937309df9..9b52020f6 100644 --- a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -26,11 +26,12 @@ const tunable int32 TN = {64, 128}; const tunable int32 TK = {16}; const tunable int32 GZ = {1}; -void matmul(restrict read_only fp16 *A, restrict read_only fp16 *B, - fp32 *C, - int32 M, int32 N, int32 K, - int32 lda, int32 ldb, int32 ldc, - int32 *locks, int32 grid0, int32 grid1) { +void matmul(restrict read_only align(4) fp16 *A, + restrict read_only align(4) fp16 *B, + align(4) fp32 *C, + int32 M, int32 N, int32 K, + int32 lda, int32 ldb, int32 ldc, + int32 *locks, int32 grid0, int32 grid1) { int32 rxa[TM] = get_global_range[TM](0); int32 ryb[TN] = get_global_range[TN](1); int32 rz = get_global_range[1](2); @@ -119,7 +120,7 @@ class BlockSparseGemmOp : public OpKernel { return 2.*M*N*K / ts * 1e-3; }; // just-in-time compile source-code - jit.autotune("matmul", src, benchmark); +// jit.autotune("matmul", src, benchmark); // jit.add_module("matmul", src, {4, 2, 8, 4, 2, 32, 1, 4, 1, 1, 8, 8, 8, 1}); // jit.add_module("matmul", src, {16, 4, 128, 16, 4, 128, 2, 2, 2, 2, 8, 32, 8, 1}); // jit.add_module("matmul", src, {8, 8, 128, 16, 8, 128, 2, 2, 2, 2, 16, 32, 8, 1 }); diff --git 
a/include/triton/ir/function.h b/include/triton/ir/function.h index cc00b4a92..cb1ab1f6d 100644 --- a/include/triton/ir/function.h +++ b/include/triton/ir/function.h @@ -28,10 +28,34 @@ private: }; /* Attribute */ -enum attribute_t { +enum attribute_kind_t { readonly, writeonly, - noalias + noalias, + aligned, + multiple_of +}; + +class attribute { +public: + attribute(attribute_kind_t kind, unsigned value = 0): + kind_(kind), value_(value){} + + bool operator<(const attribute& other) const { + return std::make_pair(kind_, value_) < std::make_pair(other.kind_, other.value_); + } + + const attribute_kind_t get_kind() const { + return kind_; + } + + const unsigned get_value() const { + return value_; + } + +private: + attribute_kind_t kind_; + unsigned value_; }; /* Function */ @@ -44,7 +68,7 @@ class function: public global_object{ typedef blocks_t::iterator block_iterator; typedef blocks_t::const_iterator const_block_iterator; - typedef std::map> attr_map_t; + typedef std::map> attr_map_t; private: function(function_type *ty, linkage_types_t linkage, @@ -63,7 +87,7 @@ public: void insert_block(basic_block* block, basic_block *next = nullptr); // attributes - void add_attr(unsigned arg_id, attribute_t attr) { attrs_[arg_id].insert(attr); } + void add_attr(unsigned arg_id, attribute attr) { attrs_[arg_id].insert(attr); } const attr_map_t &attrs() { return attrs_; } private: diff --git a/include/triton/lang/declaration.h b/include/triton/lang/declaration.h index 22275630c..b5f4de412 100644 --- a/include/triton/lang/declaration.h +++ b/include/triton/lang/declaration.h @@ -3,7 +3,7 @@ #include "node.h" #include - +#include namespace triton{ @@ -41,19 +41,45 @@ public: // Types class modifier: public node { - +public: + virtual bool is_cst_space() const { return false; } + virtual bool is_tunable() const { return false; } + virtual bool is_cst() const { return false; } + virtual void add_attr(ir::function* fn, size_t pos) = 0; }; -class storage_specifier: public node { +class storage_specifier: public modifier { public: storage_specifier(STORAGE_SPEC_T value): value_(value) {} STORAGE_SPEC_T value() const { return value_; } + bool is_cst_space() const { return value_ == CONSTANT_SPACE_T; } + bool is_tunable() const { return value_ == TUNABLE_T; } + bool is_cst() const { return value_ == CONST_T; } + void add_attr(ir::function* fn, size_t pos); private: const STORAGE_SPEC_T value_; }; +class alignment_specifier: public modifier { +public: + alignment_specifier(node* value): cst_((constant*)value) { } + void add_attr(ir::function* fn, size_t pos); +private: + constant* cst_; +}; + +class multiple_of_specifier: public modifier { +public: + multiple_of_specifier(node* value): cst_((constant*)value) {} + void add_attr(ir::function* fn, size_t pos); + +private: + constant* cst_; +}; + +// declaration specifier class declaration_specifier: public node{ public: virtual ir::type* type(ir::module *mod) const = 0; @@ -70,6 +96,7 @@ private: const TYPE_T ty_; }; +// declaration modifier class declaration_modifier: public declaration_specifier { public: declaration_modifier(node* mod, node *decl_spec) @@ -91,7 +118,7 @@ public: decl_((declarator*)decl) { } ir::type* type(ir::module *mod) const; - std::vector storage() const; + std::vector modifiers() const; const identifier* id() const; public: diff --git a/include/triton/lang/parser.y b/include/triton/lang/parser.y index 21065d94f..2c942b86c 100644 --- a/include/triton/lang/parser.y +++ b/include/triton/lang/parser.y @@ -353,6 +353,8 @@ 
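// A standalone sketch (illustrative stand-ins, not the real Triton classes)
// of how the chained specifiers accepted by the grammar rules below, e.g.
// "restrict read_only align(4) fp16*", accumulate. Each declaration_modifier
// node wraps the remainder of the specifier chain, and modifiers() unwinds
// the recursion into a flat vector, innermost specifier first, mirroring
// declaration.cpp above.
//
//   #include <vector>
//   struct modifier {};
//   struct decl_spec {
//       virtual ~decl_spec() = default;
//       virtual std::vector<modifier*> modifiers() const = 0;
//   };
//   struct typed_spec : decl_spec {           // base case: only a type
//       std::vector<modifier*> modifiers() const override { return {}; }
//   };
//   struct decl_modifier : decl_spec {        // one modifier + the rest
//       modifier* mod; const decl_spec* rest;
//       decl_modifier(modifier* m, const decl_spec* r) : mod(m), rest(r) {}
//       std::vector<modifier*> modifiers() const override {
//           std::vector<modifier*> result = rest->modifiers(); // recurse
//           result.push_back(mod);                             // then append
//           return result;
//       }
//   };
//   int main() {
//       modifier m1, m2;
//       typed_spec base;
//       decl_modifier inner(&m2, &base), outer(&m1, &inner);
//       return outer.modifiers().size() == 2 ? 0 : 1;  // yields {m2, m1}
//   }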
parameter_declaration declaration_specifiers : type_specifier { $$ = new typed_declaration_specifier(get_type_spec($1)); } | storage_class_specifier declaration_specifiers { $$ = new declaration_modifier($1, $2); } + | alignment_class_specifier declaration_specifiers { $$ = new declaration_modifier($1, $2); } + | multiple_of_class_specifier declaration_specifiers { $$ = new declaration_modifier($1, $2); } ; init_declarator_list @@ -385,6 +387,13 @@ storage_class_specifier | CONSTANT_SPACE { $$ = new storage_specifier(CONSTANT_SPACE_T); } ; +alignment_class_specifier + : ALIGN '(' constant ')' { $$ = new alignment_specifier($3); } + +multiple_of_class_specifier + : MULTIPLE_OF '(' constant ')' { $$ = new multiple_of_specifier($3); } + + external_declaration : function_definition { $$ = $1; } | declaration { $$ = $1; } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 31cee7e6b..fed49407f 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -1074,11 +1074,12 @@ void selection::lower_instruction(ir::instruction *src, IRBuilder<> &builder) { } } -inline llvm::Attribute::AttrKind llvm_attr(ir::attribute_t attr) { - switch(attr){ - case ir::noalias: return llvm::Attribute::NoAlias; - case ir::readonly: return llvm::Attribute::ReadOnly; - case ir::writeonly: return llvm::Attribute::WriteOnly; +inline llvm::Attribute llvm_attr(llvm::LLVMContext& ctx, ir::attribute attr) { + switch(attr.get_kind()){ + case ir::noalias: return llvm::Attribute::get(ctx, llvm::Attribute::NoAlias); + case ir::readonly: return llvm::Attribute::get(ctx, llvm::Attribute::ReadOnly); + case ir::writeonly: return llvm::Attribute::get(ctx, llvm::Attribute::WriteOnly); + case ir::aligned: return llvm::Attribute::get(ctx, llvm::Attribute::Alignment, attr.get_value()); default: throw std::runtime_error("cannot convert ir::attribute_t to llvm::Attribute"); } } @@ -1101,6 +1102,7 @@ void selection::run(ir::module &src, Module &dst) { // iterate over functions for(ir::function *fn: src.get_function_list()) { + // create LLVM function FunctionType *fn_ty = (FunctionType*)llvm_type(fn->get_fn_type(), dst_ctx); FunctionType *dst_fn_ty = fn_ty; @@ -1114,18 +1116,16 @@ void selection::run(ir::module &src, Module &dst) { dst_fn_args_ty.push_back(dst_builder.getInt32Ty()); dst_fn_ty = FunctionType::get(dst_fn_ret_ty, dst_fn_args_ty, false); } + // grid indices fn->get_fn_type()->get_return_ty(); Function *dst_fn = Function::Create(dst_fn_ty, Function::ExternalLinkage, fn->get_name(), &dst); - - - // set attributes for(auto attr_pair: fn->attrs()){ unsigned id = attr_pair.first; - for(ir::attribute_t attr: attr_pair.second) - dst_fn->addAttribute(id, llvm_attr(attr)); + for(ir::attribute attr: attr_pair.second) + dst_fn->addAttribute(id, llvm_attr(dst_ctx, attr)); } tgt_->set_kernel(dst_builder, dst_ctx, &dst, dst_fn); diff --git a/lib/driver/backend.cpp b/lib/driver/backend.cpp index 9761e94e7..6f98be75c 100755 --- a/lib/driver/backend.cpp +++ b/lib/driver/backend.cpp @@ -63,7 +63,7 @@ void backend::platforms::init() { cache_.push_back(new host_platform()); } if(cache_.empty()) - throw std::runtime_error("ISAAC: No backend available. Make sure CUDA is available in your library path"); + throw std::runtime_error("Triton: No backend available. 
Make sure CUDA is available in your library path"); } void backend::platforms::get(std::vector &results) { @@ -83,7 +83,7 @@ void backend::devices::init(std::vector const & platforms) { for(driver::platform* pf: platforms) pf->devices(cache_); if(cache_.empty()) - throw std::runtime_error("ISAAC: No device available. Make sure that your platform is configured properly"); + throw std::runtime_error("Triton: No device available. Make sure that your platform is configured properly"); } void backend::devices::get(std::vector &devs) { diff --git a/lib/lang/declaration.cpp b/lib/lang/declaration.cpp index c5a23def5..b1a455099 100644 --- a/lib/lang/declaration.cpp +++ b/lib/lang/declaration.cpp @@ -49,7 +49,7 @@ ir::type* parameter::type(ir::module *mod) const { return decl_->type(mod, spec_->type(mod), {}); } -std::vector parameter::storage() const { +std::vector parameter::modifiers() const { return spec_->modifiers(); } @@ -87,7 +87,7 @@ ir::type* tile::type_impl(ir::module *mod, ir::type *type, storage_spec_vec_cons // Pointer ir::type* pointer::type_impl(ir::module*, ir::type *type, storage_spec_vec_const_ref_t storage) const{ - auto is_cst = [](modifier* x){ return x->value() == CONSTANT_SPACE_T; }; + auto is_cst = [](modifier* x){ return x->is_cst_space(); }; bool is_ptr_to_const = std::find_if(storage.begin(), storage.end(), is_cst) != storage.end(); return ir::pointer_type::get(type, is_ptr_to_const?4:1); } @@ -137,7 +137,7 @@ ir::value* initializer::codegen(ir::module * mod) const{ ir::type *ty = decl_->type(mod, spec_->type(mod), storage); std::string name = decl_->id()->name(); ir::value *value = ir::undef_value::get(ty); - auto is_tunable = [](modifier* x){ return x->value() == TUNABLE_T; }; + auto is_tunable = [](modifier* x){ return x->is_tunable(); }; if(std::find_if(storage.begin(), storage.end(), is_tunable) != storage.end()){ auto csts = dynamic_cast*>((node*)expr_); if(csts == nullptr) @@ -158,7 +158,7 @@ ir::value* initializer::codegen(ir::module * mod) const{ mod->get_scope().types[name] = ty; if(auto *x = dynamic_cast(value)) mod->add_alloc(x); - auto is_cst = [](modifier* mod){ return mod->value() == CONST_T; }; + auto is_cst = [](modifier* x){ return x->is_cst(); }; if(std::find_if(storage.begin(), storage.end(), is_cst) != storage.end()) mod->set_const(name); return value; @@ -169,9 +169,9 @@ ir::type *type_name::type(ir::module *mod) const{ return decl_->type(mod, spec_->type(mod), {}); } -/* Function definition */ -ir::attribute_t get_ir_attr(modifier* mod){ - switch(mod->value()){ +/* Storage specifier */ +inline ir::attribute_kind_t get_ir_attr(STORAGE_SPEC_T spec){ + switch(spec){ case RESTRICT_T: return ir::noalias; case READONLY_T: return ir::readonly; case WRITEONLY_T: return ir::writeonly; @@ -179,15 +179,31 @@ ir::attribute_t get_ir_attr(modifier* mod){ } } +void storage_specifier::add_attr(ir::function* fn, size_t pos) { + fn->add_attr(pos, ir::attribute(get_ir_attr(value_))); +} + +/* Alignment specifier */ +void alignment_specifier::add_attr(ir::function* fn, size_t pos) { + fn->add_attr(pos, ir::attribute(ir::aligned, cst_->value())); +} + +/* Multiple-Of specifier */ +void multiple_of_specifier::add_attr(ir::function* fn, size_t pos) { + fn->add_attr(pos, ir::attribute(ir::multiple_of, cst_->value())); +} + + +/* Function definition */ ir::value* function_definition::codegen(ir::module *mod) const{ ir::function_type *prototype = (ir::function_type*)header_->type(mod, spec_->type(mod), spec_->modifiers()); const std::string &name = header_->id()->name(); 
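// Illustrative sketch (not part of the patch): the modifier hierarchy above
// replaces the old STORAGE_SPEC_T switch in codegen with virtual dispatch,
// so each qualifier attaches its own ir::attribute to a function argument.
// The helper name add_param_attrs is hypothetical:
//
//   void add_param_attrs(ir::function* fn,
//                        const std::vector<modifier*>& mods,
//                        size_t arg_pos) {
//     for(modifier* m: mods)
//       m->add_attr(fn, arg_pos);  // storage_specifier adds noalias/readonly/
//   }                              // writeonly; alignment_specifier adds
//                                  // ir::aligned; multiple_of_specifier adds
//                                  // ir::multiple_of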
ir::function *fn = mod->get_or_insert_function(name, prototype); for(unsigned i = 0; i < header_->get_num_args(); i++){ parameter *param = header_->get_arg(i); - std::vector storage = param->storage(); - for(modifier* spec: storage) - fn->add_attr(1 + i, get_ir_attr(spec)); + std::vector modifiers = param->modifiers(); + for(modifier* m: modifiers) + m->add_attr(fn, 1 + i); } header_->bind_parameters(mod, fn); ir::basic_block *entry = ir::basic_block::create(mod->get_context(), "entry", fn); From edc31cabb0949b92c32d1f568ee5cc5a28052db4 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 24 Jun 2019 18:57:32 -0700 Subject: [PATCH 185/494] [codegen] rough template for axis_info pass --- examples/python/tensorflow/dot.cpp | 2 +- include/triton/codegen/axis_info.h | 39 +++++++++ include/triton/ir/function.h | 8 ++ include/triton/ir/instructions.h | 6 ++ include/triton/runtime/jit.h | 4 + lib/codegen/axis_info.cpp | 129 +++++++++++++++++++++++++++++ lib/codegen/selection.cpp | 1 + lib/ir/function.cpp | 9 ++ lib/ir/instructions.cpp | 25 ++++++ 9 files changed, 222 insertions(+), 1 deletion(-) create mode 100644 include/triton/codegen/axis_info.h create mode 100644 lib/codegen/axis_info.cpp diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index 9b52020f6..0a8e1e948 100644 --- a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -30,7 +30,7 @@ void matmul(restrict read_only align(4) fp16 *A, restrict read_only align(4) fp16 *B, align(4) fp32 *C, int32 M, int32 N, int32 K, - int32 lda, int32 ldb, int32 ldc, + multiple_of(4) int32 lda, multiple_of(4) int32 ldb, multiple_of(4) int32 ldc, int32 *locks, int32 grid0, int32 grid1) { int32 rxa[TM] = get_global_range[TM](0); int32 ryb[TN] = get_global_range[TN](1); diff --git a/include/triton/codegen/axis_info.h b/include/triton/codegen/axis_info.h new file mode 100644 index 000000000..bfc4ef322 --- /dev/null +++ b/include/triton/codegen/axis_info.h @@ -0,0 +1,39 @@ +#ifndef TDL_INCLUDE_CODEGEN_AXIS_INFO_PASS_H +#define TDL_INCLUDE_CODEGEN_AXIS_INFO_PASS_H + +#include +#include + +namespace triton { + +namespace ir { + class value; + class module; +} + +namespace codegen{ + +class axis_info { +private: + // helpers + bool is_first_axis_unit(ir::value *x); + + // populate maps + bool populate_is_constant(ir::value *i); + unsigned populate_max_contiguous(ir::value *i); + unsigned populate_multiple_of(ir::value *i); + +public: + void run(ir::module &mod); + +private: + std::map is_constant_; + std::map max_contiguous_; + std::map multiple_of_; +}; + + +} +} + +#endif diff --git a/include/triton/ir/function.h b/include/triton/ir/function.h index cb1ab1f6d..c5f5f0605 100644 --- a/include/triton/ir/function.h +++ b/include/triton/ir/function.h @@ -5,6 +5,7 @@ #include #include "value.h" #include "constant.h" +#include namespace triton{ namespace ir{ @@ -21,6 +22,8 @@ class argument: public value{ public: static argument* create(type *ty, const std::string &name, function *parent = nullptr, unsigned arg_no = 0); + function* get_parent() const; + unsigned get_arg_no() const; private: function *parent_; @@ -53,6 +56,10 @@ public: return value_; } + bool is_llvm_attr() const { + return kind_ != multiple_of; + } + private: attribute_kind_t kind_; unsigned value_; @@ -89,6 +96,7 @@ public: // attributes void add_attr(unsigned arg_id, attribute attr) { attrs_[arg_id].insert(attr); } const attr_map_t &attrs() { return attrs_; } + std::set get_attributes(argument* arg) { return 
attrs_[arg->get_arg_no() + 1]; } private: module *parent_; diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index 8e1da57ae..397be9d9d 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -122,6 +122,12 @@ public: bool is_int_div_rem() const; bool is_shift() const; bool is_cast() const; + bool is_int_mult() const; + bool is_int_add_sub() const; + bool is_int_div() const; + bool is_int_rem() const; + bool is_shl() const; + bool is_shr() const; // Wraps void set_has_no_unsigned_wrap(bool b = true) { has_no_unsigned_wrap_ = b; } diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index a9bea664b..c12bb6b23 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -17,6 +17,7 @@ #include "triton/codegen/shmem_liveness.h" #include "triton/codegen/shmem_info.h" #include "triton/codegen/shmem_barriers.h" +#include "triton/codegen/axis_info.h" #include "triton/codegen/target.h" #include "triton/codegen/vectorize.h" #include @@ -60,11 +61,13 @@ public: optimize_dot(&tune), optimize_cse(), optimize_trans(), + axis_info(), target_(target) { } void target_independent(ir::module &module) { optimize_dot.run(module); optimize_trans.run(module); + axis_info.run(module); // ir::print(module, std::cout); } @@ -88,6 +91,7 @@ public: codegen::optimize_dot optimize_dot; codegen::optimize_cse optimize_cse; codegen::optimize_trans optimize_trans; + codegen::axis_info axis_info; codegen::target* target_; }; diff --git a/lib/codegen/axis_info.cpp b/lib/codegen/axis_info.cpp new file mode 100644 index 000000000..38e2fcd9b --- /dev/null +++ b/lib/codegen/axis_info.cpp @@ -0,0 +1,129 @@ +#include "triton/codegen/axis_info.h" +#include "triton/ir/module.h" +#include "triton/ir/function.h" +#include "triton/ir/basic_block.h" +#include "triton/ir/instructions.h" +#include "triton/ir/type.h" + +namespace triton { +namespace codegen{ + + +template +inline T add_to_cache(ir::value *i, T value, std::map &map) { + return map.insert(std::make_pair(i, value)).first->second; +} + + +bool axis_info::is_first_axis_unit(ir::value *x){ + if(x->get_type()->is_tile_ty()) + return x->get_type()->get_tile_shapes()[0]->get_value() == 1; + else + return true; +} + +bool axis_info::populate_is_constant(ir::value *v) { + // helper for the cache + auto cache = [this,v](bool value){ return add_to_cache(v, value, is_constant_); }; + // populate + if(v->get_type()->is_tile_ty()){ + if(auto *x = dynamic_cast(v)){ + bool value = populate_is_constant(x->get_operand(0)); + // check if broadcast (i.e., constant) along contiguous dimension + if(is_first_axis_unit(x->get_operand(0)) + && !is_first_axis_unit(x)) + return cache(value); + } + // otherwise the tile is not constant in the contiguous dimension + return cache(false); + } + // scalars are always constant in the contiguous dimension + return cache(true); +} + +unsigned axis_info::populate_max_contiguous(ir::value *v){ + // helper for the cache + auto cache = [this,v](unsigned value){ return add_to_cache(v, value, max_contiguous_); }; + // populate + if(v->get_type()->is_tile_ty()){ + auto shapes = v->get_type()->get_tile_shapes(); + if(dynamic_cast(v)) + return cache(shapes[0]->get_value()); + if(auto *x = dynamic_cast(v)){ + ir::value* lhs = x->get_operand(0); + ir::value* rhs = x->get_operand(1); + unsigned lhs_max_contiguous = populate_max_contiguous(lhs); + bool lhs_has_cst = populate_is_constant(lhs); + unsigned rhs_max_contiguous = populate_max_contiguous(rhs); + bool 
rhs_has_cst = populate_is_constant(rhs); + if(x->is_int_add_sub()){ + if(lhs_has_cst) + return cache(rhs_max_contiguous); + if(rhs_has_cst) + return cache(lhs_max_contiguous); + } + } + } + return cache(1); +} + +unsigned axis_info::populate_multiple_of(ir::value *v){ + auto cache = [this,v](unsigned value){ return add_to_cache(v, value, max_contiguous_); }; + + if(auto *x = dynamic_cast(v)){ + std::set attributes = x->get_parent()->get_attributes(x); + for(auto attr: attributes){ + if(attr.get_kind() == ir::multiple_of) + return cache(attr.get_value()); + } + } + if(auto *x = dynamic_cast(v)){ + int lhs = populate_multiple_of(x->get_operand(0)); + int rhs = populate_multiple_of(x->get_operand(1)); + if(x->is_int_mult()) + return cache(lhs * rhs); + if(x->is_int_add_sub()) + return cache(std::min(lhs, rhs)); + if(x->is_int_div()) + return cache(std::max(lhs / rhs, 1)); + if(x->is_int_rem()) + return cache(std::max(lhs % rhs, 1)); + if(x->is_shl()) + return cache(lhs << rhs); + if(x->is_shr()) + return cache(std::max(lhs >> rhs, 1)); + } + if(auto *x = dynamic_cast(v)){ + return cache(populate_multiple_of(x->get_operand(0))); + } + return cache(1); +} + + + +void axis_info::run(ir::module &mod) { + // populate constant + for(ir::function *fn: mod.get_function_list()) + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *i: block->get_inst_list()){ + populate_is_constant(i); + } + + // populate multiple_of + for(ir::function *fn: mod.get_function_list()) + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *i: block->get_inst_list()){ + populate_multiple_of(i); + } + + // populate maximum contiguous + for(ir::function *fn: mod.get_function_list()) + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *i: block->get_inst_list()){ + populate_max_contiguous(i); + } +} + + +} +} diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index fed49407f..ae2a3f1c1 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -1125,6 +1125,7 @@ void selection::run(ir::module &src, Module &dst) { for(auto attr_pair: fn->attrs()){ unsigned id = attr_pair.first; for(ir::attribute attr: attr_pair.second) + if(attr.is_llvm_attr()) dst_fn->addAttribute(id, llvm_attr(dst_ctx, attr)); } tgt_->set_kernel(dst_builder, dst_ctx, &dst, dst_fn); diff --git a/lib/ir/function.cpp b/lib/ir/function.cpp index 758fd8bc3..5c7ca1e2a 100644 --- a/lib/ir/function.cpp +++ b/lib/ir/function.cpp @@ -16,6 +16,15 @@ argument *argument::create(type *ty, const std::string &name, return new argument(ty, name, parent, arg_no); } +function* argument::get_parent() const { + return parent_; +} + +unsigned argument::get_arg_no() const { + return arg_no_; +} + + /* function */ function::function(function_type *ty, linkage_types_t linkage, const std::string &name, module *parent) diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 79c951f6d..9b5d37094 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -109,6 +109,31 @@ std::string binary_operator::repr_impl() const { } } +bool binary_operator::is_int_div() const { + return op_ == llop::UDiv || op_ == llop::SDiv; +} + +bool binary_operator::is_int_rem() const { + return op_ == llop::URem || op_ == llop::SRem; +} + +bool binary_operator::is_shl() const { + return op_ == llop::Shl; +} + +bool binary_operator::is_shr() const { + return op_ == llop::LShr || op_ == llop::AShr; +} + +bool binary_operator::is_int_mult() const { + return op_ == llop::Mul; +} + +bool binary_operator::is_int_add_sub() 
const {
+  return op_ == llop::Add || op_ == llop::Sub;
+}
+
+
 binary_operator::binary_operator(op_t op, value *lhs, value *rhs, type *ty, const std::string &name, instruction *next)
   : instruction(ty, 2, 1, name, next), op_(op){
   set_operand(0, lhs);

From d52abc93799be0873018e32dbd6fe9c018a2734a Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Tue, 25 Jun 2019 15:06:15 -0700
Subject: [PATCH 186/494] [codegen] bugfix in alignment inference

---
 examples/python/tensorflow/dot.cpp | 6 +-
 include/triton/codegen/axis_info.h | 12 +-
 include/triton/codegen/selection.h | 6 +-
 include/triton/runtime/jit.h | 6 +-
 lib/codegen/axis_info.cpp | 183 +++++++++++++++++++++++------
 lib/codegen/selection.cpp | 7 +-
 lib/driver/module.cpp | 2 +-
 7 files changed, 170 insertions(+), 52 deletions(-)

diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp
index 0a8e1e948..8dea2337a 100644
--- a/examples/python/tensorflow/dot.cpp
+++ b/examples/python/tensorflow/dot.cpp
@@ -26,9 +26,9 @@ const tunable int32 TN = {64, 128};
 const tunable int32 TK = {16};
 const tunable int32 GZ = {1};
 
-void matmul(restrict read_only align(4) fp16 *A,
-            restrict read_only align(4) fp16 *B,
-            align(4) fp32 *C,
+void matmul(restrict read_only align(16) fp16 *A,
+            restrict read_only align(16) fp16 *B,
+            align(16) fp32 *C,
             int32 M, int32 N, int32 K,
             multiple_of(4) int32 lda, multiple_of(4) int32 ldb, multiple_of(4) int32 ldc,
             int32 *locks, int32 grid0, int32 grid1) {
diff --git a/include/triton/codegen/axis_info.h b/include/triton/codegen/axis_info.h
index bfc4ef322..9b44b01c7 100644
--- a/include/triton/codegen/axis_info.h
+++ b/include/triton/codegen/axis_info.h
@@ -16,20 +16,22 @@ namespace codegen{
 class axis_info {
 private:
   // helpers
-  bool is_first_axis_unit(ir::value *x);
+  bool is_first_axis_unit(ir::value *v);
   // populate maps
-  bool populate_is_constant(ir::value *i);
-  unsigned populate_max_contiguous(ir::value *i);
-  unsigned populate_multiple_of(ir::value *i);
+  bool populate_is_constant(ir::value *v);
+  unsigned populate_max_contiguous(ir::value *v);
+  unsigned populate_starting_multiple(ir::value *v);
 
 public:
   void run(ir::module &mod);
+  unsigned get_starting_multiple(ir::value* v) const;
+  unsigned get_max_contiguous(ir::value* v) const;
 
 private:
   std::map<ir::value*, bool> is_constant_;
   std::map<ir::value*, unsigned> max_contiguous_;
-  std::map<ir::value*, unsigned> multiple_of_;
+  std::map<ir::value*, unsigned> starting_multiple_;
 };
diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h
index 9a8149a01..7ad586058 100644
--- a/include/triton/codegen/selection.h
+++ b/include/triton/codegen/selection.h
@@ -25,6 +25,7 @@ class shmem_allocation;
 class tune;
 class shmem_info;
 class target;
+class axis_info;
 
 typedef std::vector indices_t;
 
@@ -143,8 +144,8 @@ private:
   void lower_tile_instruction(ir::instruction *src, llvm::IRBuilder<> &builder);
 
 public:
-  selection(shmem_allocation *alloc, tune *params, shmem_info *buffer_info, target *tgt)
-    : alloc_(alloc), params_(params), buffer_info_(buffer_info), tgt_(tgt){ }
+  selection(shmem_allocation *alloc, tune *params, shmem_info *buffer_info, axis_info *ax_info, target *tgt)
+    : alloc_(alloc), params_(params), buffer_info_(buffer_info), axis_info_(ax_info), tgt_(tgt){ }
 
   void run(ir::module &src, llvm::Module &dst);
 
@@ -157,6 +158,7 @@ private:
   tune *params_;
   target *tgt_;
   shmem_info *buffer_info_;
+  axis_info *axis_info_;
   std::map axes_;
   llvm::Value *sh_mem_ptr_;
   llvm::Value *offset_a_i_, *offset_a_k_;
diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h
index c12bb6b23..3b8aa606c 100644
--- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -57,7 +57,7 @@ public: shmem_allocation(&shmem_liveness, &shmem_info, &tune), shmem_barriers(&shmem_allocation, &shmem_info), vectorize(&tune), - selection(&shmem_allocation, &tune, &shmem_info, target), + selection(&shmem_allocation, &tune, &shmem_info, &axis_info, target), optimize_dot(&tune), optimize_cse(), optimize_trans(), @@ -67,11 +67,11 @@ public: void target_independent(ir::module &module) { optimize_dot.run(module); optimize_trans.run(module); - axis_info.run(module); -// ir::print(module, std::cout); + ir::print(module, std::cout); } void target_dependent(ir::module &module) { + axis_info.run(module); if(target_->is_gpu()){ shmem_info.run(module); shmem_liveness.run(module); diff --git a/lib/codegen/axis_info.cpp b/lib/codegen/axis_info.cpp index 38e2fcd9b..be2a16c91 100644 --- a/lib/codegen/axis_info.cpp +++ b/lib/codegen/axis_info.cpp @@ -11,7 +11,7 @@ namespace codegen{ template inline T add_to_cache(ir::value *i, T value, std::map &map) { - return map.insert(std::make_pair(i, value)).first->second; + return map[i] = value; } @@ -23,63 +23,132 @@ bool axis_info::is_first_axis_unit(ir::value *x){ } bool axis_info::populate_is_constant(ir::value *v) { + if(is_constant_.find(v) != is_constant_.end()) + return is_constant_.at(v); // helper for the cache auto cache = [this,v](bool value){ return add_to_cache(v, value, is_constant_); }; // populate - if(v->get_type()->is_tile_ty()){ - if(auto *x = dynamic_cast(v)){ - bool value = populate_is_constant(x->get_operand(0)); - // check if broadcast (i.e., constant) along contiguous dimension - if(is_first_axis_unit(x->get_operand(0)) - && !is_first_axis_unit(x)) - return cache(value); - } - // otherwise the tile is not constant in the contiguous dimension + if(auto *x = dynamic_cast(v)){ + ir::value *op = x->get_operand(0); + populate_is_constant(op); + if(is_first_axis_unit(op)) + return cache(true); + } + if(auto *x = dynamic_cast(v)){ + bool lhs = populate_is_constant(x->get_operand(0)); + bool rhs = populate_is_constant(x->get_operand(1)); + return cache(lhs && rhs); + } + if(v->get_type()->is_tile_ty()) return cache(false); + if(auto *x = dynamic_cast(v)){ + // put a conservative initial value in phi node to avoid infinite recursion + bool result = true; + for(unsigned n = 0; n < x->get_num_incoming(); n++){ + ir::value* inc = x->get_incoming_value(n); + if(is_constant_.find(inc) != is_constant_.end()) + result = is_constant_.at(inc); + } + cache(result); + // recurse + for(unsigned n = 0; n < x->get_num_incoming(); n++){ + ir::value* inc = x->get_incoming_value(n); + result = result && populate_is_constant(inc); + } + return cache(result); } // scalars are always constant in the contiguous dimension return cache(true); } unsigned axis_info::populate_max_contiguous(ir::value *v){ + if(max_contiguous_.find(v) != max_contiguous_.end()) + return max_contiguous_.at(v); // helper for the cache auto cache = [this,v](unsigned value){ return add_to_cache(v, value, max_contiguous_); }; // populate - if(v->get_type()->is_tile_ty()){ - auto shapes = v->get_type()->get_tile_shapes(); - if(dynamic_cast(v)) - return cache(shapes[0]->get_value()); - if(auto *x = dynamic_cast(v)){ - ir::value* lhs = x->get_operand(0); - ir::value* rhs = x->get_operand(1); - unsigned lhs_max_contiguous = populate_max_contiguous(lhs); - bool lhs_has_cst = populate_is_constant(lhs); - unsigned rhs_max_contiguous = populate_max_contiguous(rhs); - bool rhs_has_cst = populate_is_constant(rhs); - 
if(x->is_int_add_sub()){ - if(lhs_has_cst) - return cache(rhs_max_contiguous); - if(rhs_has_cst) - return cache(lhs_max_contiguous); - } + if(!v->get_type()->is_tile_ty()) + return cache(1); + auto shapes = v->get_type()->get_tile_shapes(); + if(dynamic_cast(v)) + return cache(shapes[0]->get_value()); + if(dynamic_cast(v)) + return cache(shapes[0]->get_value()); + if(auto *x = dynamic_cast(v)){ + ir::value *op = x->get_operand(0); + if(op->get_type()->is_tile_ty()){ + auto op_shapes = op->get_type()->get_tile_shapes(); + if(op_shapes[0] == shapes[0]) + return cache(populate_max_contiguous(op)); } + return cache(1); + } + if(auto *x = dynamic_cast(v)){ + ir::value* lhs = x->get_operand(0); + ir::value* rhs = x->get_operand(1); + unsigned lhs_max_contiguous = populate_max_contiguous(lhs); + unsigned rhs_max_contiguous = populate_max_contiguous(rhs); + bool lhs_has_cst = populate_is_constant(lhs); + bool rhs_has_cst = populate_is_constant(rhs); + if(x->is_int_add_sub()){ + if(lhs_has_cst) + return cache(rhs_max_contiguous); + if(rhs_has_cst) + return cache(lhs_max_contiguous); + } + } + if(auto *x = dynamic_cast(v)){ + ir::value* lhs = x->get_operand(0); + ir::value* rhs = x->get_operand(1); + unsigned lhs_max_contiguous = populate_max_contiguous(lhs); + unsigned rhs_max_contiguous = populate_max_contiguous(rhs); + bool lhs_has_cst = populate_is_constant(lhs); + bool rhs_has_cst = populate_is_constant(rhs); + if(lhs_has_cst) + return cache(rhs_max_contiguous); + if(rhs_has_cst) + return cache(lhs_max_contiguous); + } + if(auto *x = dynamic_cast(v)){ + // put a conservative initial value in phi node to avoid infinite recursion + unsigned result = 1; + for(unsigned n = 0; n < x->get_num_incoming(); n++){ + ir::value* inc = x->get_incoming_value(n); + if(max_contiguous_.find(inc) != max_contiguous_.end()) + result = max_contiguous_.at(inc); + } + cache(result); + // recurse + for(unsigned n = 0; n < x->get_num_incoming(); n++){ + ir::value* inc = x->get_incoming_value(n); + result = std::min(result, populate_max_contiguous(inc)); + } + return cache(result); } return cache(1); } -unsigned axis_info::populate_multiple_of(ir::value *v){ - auto cache = [this,v](unsigned value){ return add_to_cache(v, value, max_contiguous_); }; - +unsigned axis_info::populate_starting_multiple(ir::value *v){ + if(starting_multiple_.find(v) != starting_multiple_.end()) + return starting_multiple_.at(v); + auto cache = [this,v](unsigned value){ return add_to_cache(v, value, starting_multiple_); }; + // arguments if(auto *x = dynamic_cast(v)){ std::set attributes = x->get_parent()->get_attributes(x); for(auto attr: attributes){ if(attr.get_kind() == ir::multiple_of) return cache(attr.get_value()); + if(attr.get_kind() == ir::aligned){ + ir::type* ty = x->get_type()->get_pointer_element_ty(); + int nbits = ty->get_primitive_size_in_bits(); + int nbytes = nbits / 8; + return cache(attr.get_value() / nbytes); + } } } if(auto *x = dynamic_cast(v)){ - int lhs = populate_multiple_of(x->get_operand(0)); - int rhs = populate_multiple_of(x->get_operand(1)); + int lhs = populate_starting_multiple(x->get_operand(0)); + int rhs = populate_starting_multiple(x->get_operand(1)); if(x->is_int_mult()) return cache(lhs * rhs); if(x->is_int_add_sub()) @@ -93,12 +162,52 @@ unsigned axis_info::populate_multiple_of(ir::value *v){ if(x->is_shr()) return cache(std::max(lhs >> rhs, 1)); } - if(auto *x = dynamic_cast(v)){ - return cache(populate_multiple_of(x->get_operand(0))); + if(auto *x = dynamic_cast(v)){ + int lhs = 
populate_starting_multiple(x->get_operand(0)); + int rhs = populate_starting_multiple(x->get_operand(1)); + return cache(std::min(lhs, rhs)); } - return cache(1); + if(auto *x = dynamic_cast(v)){ + int op = populate_starting_multiple(x->get_operand(0)); + return cache(op); + } + if(auto *x = dynamic_cast(v)){ + return cache(v->get_type()->get_tile_shapes()[0]->get_value()); + } + if(auto *x = dynamic_cast(v)){ + // put a conservative initial value in phi node to avoid infinite recursion + unsigned result = 1; + for(unsigned n = 0; n < x->get_num_incoming(); n++){ + ir::value* inc = x->get_incoming_value(n); + if(starting_multiple_.find(inc) != starting_multiple_.end()) + result = starting_multiple_.at(inc); + } + cache(result); + // recurse + for(unsigned n = 0; n < x->get_num_incoming(); n++){ + ir::value* inc = x->get_incoming_value(n); + result = std::min(result, populate_starting_multiple(inc)); + } + return cache(result); + } + // scalars + if(!v->get_type()->is_tile_ty()) + return cache(1); + // tiles + auto shapes = v->get_type()->get_tile_shapes(); + unsigned result = 1; + for(unsigned i = 0; i < shapes.size() - 1; i++) + result *= shapes[i]->get_value(); + return cache(result); } +unsigned axis_info::get_starting_multiple(ir::value* v) const { + return starting_multiple_.at(v); +} + +unsigned axis_info::get_max_contiguous(ir::value* v) const { + return max_contiguous_.at(v); +} void axis_info::run(ir::module &mod) { @@ -109,11 +218,11 @@ void axis_info::run(ir::module &mod) { populate_is_constant(i); } - // populate multiple_of + // populate starting multiple for(ir::function *fn: mod.get_function_list()) for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i: block->get_inst_list()){ - populate_multiple_of(i); + populate_starting_multiple(i); } // populate maximum contiguous diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index ae2a3f1c1..e98f0bcb0 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -2,6 +2,7 @@ #include "triton/codegen/tune.h" #include "triton/codegen/shmem_allocation.h" #include "triton/codegen/target.h" +#include "triton/codegen/axis_info.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Module.h" #include "llvm/IR/IRBuilder.h" @@ -1027,7 +1028,11 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & } } else if(auto *ld = dynamic_cast(ins)){ - unsigned vector_size = result->axis(0).contiguous; + ir::value *ptr = ld->get_pointer_operand(); + unsigned starting_multiple = axis_info_->get_starting_multiple(ptr); + unsigned max_contiguous = axis_info_->get_max_contiguous(ptr); + unsigned alignment = std::min(starting_multiple, max_contiguous); + unsigned vector_size = std::min(result->axis(0).contiguous, alignment); std::map packets; distributed_tile *TP = (distributed_tile*)tmap_.at(ld->get_pointer_operand()); result->for_each([&](indices_t idx){ diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 4ff863666..c3139ece6 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -255,7 +255,7 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ -// std::cout << source << std::endl; + std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code 
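// Illustrative sketch (not part of the patch) of the rule the alignment pass
// feeds into selection.cpp in the hunk above: a tile load may be vectorized
// by width w only if w divides both the pointer's starting multiple and its
// maximal contiguous run. The helper name safe_vector_width is hypothetical:
//
//   unsigned safe_vector_width(unsigned starting_multiple,
//                              unsigned max_contiguous,
//                              unsigned axis_contiguous) {
//     unsigned alignment = std::min(starting_multiple, max_contiguous);
//     return std::min(axis_contiguous, alignment);
//   }
//
// For example, align(16) on an fp16* yields a starting multiple of 16/2 = 8
// elements, so a tile with 4 contiguous values per thread keeps vector width
// min(4, 8) = 4, while align(2) would force it down to 1.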
CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; From 62000738f0c66903d9daea5bd10d6b0b441fb2b8 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 25 Jun 2019 15:10:47 -0700 Subject: [PATCH 187/494] [codegen] renamed axis_info -> alignment_info --- .../codegen/{axis_info.h => alignment_info.h} | 6 +++--- include/triton/codegen/selection.h | 6 +++--- include/triton/runtime/jit.h | 4 ++-- .../{axis_info.cpp => alignment_info.cpp} | 16 ++++++++-------- lib/codegen/selection.cpp | 2 +- 5 files changed, 17 insertions(+), 17 deletions(-) rename include/triton/codegen/{axis_info.h => alignment_info.h} (84%) rename lib/codegen/{axis_info.cpp => alignment_info.cpp} (94%) diff --git a/include/triton/codegen/axis_info.h b/include/triton/codegen/alignment_info.h similarity index 84% rename from include/triton/codegen/axis_info.h rename to include/triton/codegen/alignment_info.h index 9b44b01c7..b90263dbe 100644 --- a/include/triton/codegen/axis_info.h +++ b/include/triton/codegen/alignment_info.h @@ -1,5 +1,5 @@ -#ifndef TDL_INCLUDE_CODEGEN_AXIS_INFO_PASS_H -#define TDL_INCLUDE_CODEGEN_AXIS_INFO_PASS_H +#ifndef TDL_INCLUDE_CODEGEN_ALIGNMENT_INFO_PASS_H +#define TDL_INCLUDE_CODEGEN_ALIGNMENT_INFO_PASS_H #include #include @@ -13,7 +13,7 @@ namespace ir { namespace codegen{ -class axis_info { +class alignment_info { private: // helpers bool is_first_axis_unit(ir::value *v); diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index 7ad586058..4355bfce6 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -25,7 +25,7 @@ class shmem_allocation; class tune; class shmem_info; class target; -class axis_info; +class alignment_info; typedef std::vector indices_t; @@ -144,7 +144,7 @@ private: void lower_tile_instruction(ir::instruction *src, llvm::IRBuilder<> &builder); public: - selection(shmem_allocation *alloc, tune *params, shmem_info *buffer_info, axis_info *ax_info, target *tgt) + selection(shmem_allocation *alloc, tune *params, shmem_info *buffer_info, alignment_info *ax_info, target *tgt) : alloc_(alloc), params_(params), buffer_info_(buffer_info), axis_info_(ax_info), tgt_(tgt){ } void run(ir::module &src, llvm::Module &dst); @@ -158,7 +158,7 @@ private: tune *params_; target *tgt_; shmem_info *buffer_info_; - axis_info *axis_info_; + alignment_info *axis_info_; std::map axes_; llvm::Value *sh_mem_ptr_; llvm::Value *offset_a_i_, *offset_a_k_; diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index 3b8aa606c..232ea0e5a 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -17,7 +17,7 @@ #include "triton/codegen/shmem_liveness.h" #include "triton/codegen/shmem_info.h" #include "triton/codegen/shmem_barriers.h" -#include "triton/codegen/axis_info.h" +#include "triton/codegen/alignment_info.h" #include "triton/codegen/target.h" #include "triton/codegen/vectorize.h" #include @@ -91,7 +91,7 @@ public: codegen::optimize_dot optimize_dot; codegen::optimize_cse optimize_cse; codegen::optimize_trans optimize_trans; - codegen::axis_info axis_info; + codegen::alignment_info axis_info; codegen::target* target_; }; diff --git a/lib/codegen/axis_info.cpp b/lib/codegen/alignment_info.cpp similarity index 94% rename from lib/codegen/axis_info.cpp rename to lib/codegen/alignment_info.cpp index be2a16c91..ec3204587 100644 --- a/lib/codegen/axis_info.cpp +++ b/lib/codegen/alignment_info.cpp @@ -1,4 +1,4 @@ -#include "triton/codegen/axis_info.h" +#include 
"triton/codegen/alignment_info.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" @@ -15,14 +15,14 @@ inline T add_to_cache(ir::value *i, T value, std::map &map) { } -bool axis_info::is_first_axis_unit(ir::value *x){ +bool alignment_info::is_first_axis_unit(ir::value *x){ if(x->get_type()->is_tile_ty()) return x->get_type()->get_tile_shapes()[0]->get_value() == 1; else return true; } -bool axis_info::populate_is_constant(ir::value *v) { +bool alignment_info::populate_is_constant(ir::value *v) { if(is_constant_.find(v) != is_constant_.end()) return is_constant_.at(v); // helper for the cache @@ -61,7 +61,7 @@ bool axis_info::populate_is_constant(ir::value *v) { return cache(true); } -unsigned axis_info::populate_max_contiguous(ir::value *v){ +unsigned alignment_info::populate_max_contiguous(ir::value *v){ if(max_contiguous_.find(v) != max_contiguous_.end()) return max_contiguous_.at(v); // helper for the cache @@ -128,7 +128,7 @@ unsigned axis_info::populate_max_contiguous(ir::value *v){ return cache(1); } -unsigned axis_info::populate_starting_multiple(ir::value *v){ +unsigned alignment_info::populate_starting_multiple(ir::value *v){ if(starting_multiple_.find(v) != starting_multiple_.end()) return starting_multiple_.at(v); auto cache = [this,v](unsigned value){ return add_to_cache(v, value, starting_multiple_); }; @@ -201,16 +201,16 @@ unsigned axis_info::populate_starting_multiple(ir::value *v){ return cache(result); } -unsigned axis_info::get_starting_multiple(ir::value* v) const { +unsigned alignment_info::get_starting_multiple(ir::value* v) const { return starting_multiple_.at(v); } -unsigned axis_info::get_max_contiguous(ir::value* v) const { +unsigned alignment_info::get_max_contiguous(ir::value* v) const { return max_contiguous_.at(v); } -void axis_info::run(ir::module &mod) { +void alignment_info::run(ir::module &mod) { // populate constant for(ir::function *fn: mod.get_function_list()) for(ir::basic_block *block: fn->blocks()) diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index e98f0bcb0..751f88647 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -2,7 +2,7 @@ #include "triton/codegen/tune.h" #include "triton/codegen/shmem_allocation.h" #include "triton/codegen/target.h" -#include "triton/codegen/axis_info.h" +#include "triton/codegen/alignment_info.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Module.h" #include "llvm/IR/IRBuilder.h" From 64513fb407fd2f3fcf034a266fdc30726b181eb8 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 25 Jun 2019 15:49:58 -0700 Subject: [PATCH 188/494] [codegen] added fallback when tensor cores cannot be used --- examples/python/tensorflow/dot.cpp | 16 +++++++++++++++- examples/python/tensorflow/run.py | 18 ++++-------------- include/triton/runtime/jit.h | 10 +++++----- lib/codegen/selection.cpp | 11 ++++++++++- lib/codegen/tune.cpp | 4 ++++ lib/driver/module.cpp | 2 +- 6 files changed, 39 insertions(+), 22 deletions(-) diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index 8dea2337a..6c376822b 100644 --- a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -42,7 +42,12 @@ void matmul(restrict read_only align(16) fp16 *A, fp16* pb[TN, TK] = B + rkb[newaxis, :]*ldb + ryb[:, newaxis]; fp16 a[TM, TK] = *pa; fp16 b[TN, TK] = *pb; - for(int32 k = K; k > TK; k = k - TK){ + int32 last_a = ((M*K - 1) - (TM*TK + 1)) / lda; + int32 last_b = ((K*N - 1) - (TN*TK + 1)) / ldb; + last_a = last_a / 
TK * TK; + last_b = last_b / TK * TK; + int32 bound = K - max(last_a, last_b); + for(int32 k = K; k > bound; k = k - TK){ pa = pa + TK*lda; pb = pb + TK*ldb; c = dot(a, trans(b), c); @@ -51,6 +56,15 @@ void matmul(restrict read_only align(16) fp16 *A, } int32 rxc[TM] = get_global_range[TM](0); int32 ryc[TN] = get_global_range[TN](1); + for(int32 k = bound; k > 0; k = k - 1){ + int1 checka[TM, 1] = rxc[:, newaxis] < M; + int1 checkb[TN, 1] = ryc[:, newaxis] < N; + fp16* pa[TM, 1] = A + (K - k)*lda + rxc[:, newaxis]; + fp16* pb[TN, 1] = B + (K - k)*ldb + ryc[:, newaxis]; + fp16 a[TM, 1] = checka ? *pa : 0; + fp16 b[TN, 1] = checkb ? *pb : 0; + c = dot(a, trans(b), c); + } fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; *pc = c; } diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 86c0bc999..0788231e0 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -6,7 +6,7 @@ data_files_path = tf.resource_loader.get_data_files_path() library_dir = os.path.dirname(os.path.realpath(__file__)) module = tf.load_op_library(os.path.join(library_dir, 'libtf_blocksparse.so')) -M, N, K = 8192, 8192, 8192 +M, N, K = 128,128,128 a = tf.placeholder(tf.float16, shape=[M, K]) b = tf.placeholder(tf.float16, shape=[N, K]) locks = tf.placeholder(tf.int32, shape=[4096]) @@ -24,16 +24,6 @@ result = sess.run([c], feed_dict = {locks: np.zeros(4096), a: ha, b: hb})[0] -#bench = tf.test.Benchmark().run_op_benchmark(sess=sess, -# op_or_tensor=c, -# feed_dict={a: ha, b: hb}, -# min_iters=100) -#print(end - start) -#print(2*M*N*K / (end - start) * 1e-12) -#hresult = np.dot(ha.T, hb).T -#dif = np.abs(result - hresult) -#print("dif: %f" % np.max(dif)) - -#np.savetxt("dif.txt", dif, fmt="%5.2f") -#np.savetxt("gpu.txt", result, fmt="%5.2f") -#np.savetxt("cpu.txt", hresult, fmt="%5.2f") +hresult = np.dot(ha.T, hb).T +dif = np.abs(result - hresult) +print("dif: %f" % np.max(dif)) diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index 232ea0e5a..684bc6875 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -57,21 +57,21 @@ public: shmem_allocation(&shmem_liveness, &shmem_info, &tune), shmem_barriers(&shmem_allocation, &shmem_info), vectorize(&tune), - selection(&shmem_allocation, &tune, &shmem_info, &axis_info, target), + selection(&shmem_allocation, &tune, &shmem_info, &alignment_info, target), optimize_dot(&tune), optimize_cse(), optimize_trans(), - axis_info(), + alignment_info(), target_(target) { } void target_independent(ir::module &module) { optimize_dot.run(module); optimize_trans.run(module); - ir::print(module, std::cout); +// ir::print(module, std::cout); } void target_dependent(ir::module &module) { - axis_info.run(module); + alignment_info.run(module); if(target_->is_gpu()){ shmem_info.run(module); shmem_liveness.run(module); @@ -91,7 +91,7 @@ public: codegen::optimize_dot optimize_dot; codegen::optimize_cse optimize_cse; codegen::optimize_trans optimize_trans; - codegen::alignment_info axis_info; + codegen::alignment_info alignment_info; codegen::target* target_; }; diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 751f88647..d394c1ec0 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -901,7 +901,8 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & bool AT = dot->is_a_trans(); bool BT = dot->is_b_trans(); distributed_tile *TC = (distributed_tile*)tmap_.at(C); - Function *f_mul_add = 
Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {llvm_type(C->get_type()->get_scalar_ty(), ctx)}); + Type *c_ty = llvm_type(C->get_type()->get_scalar_ty(), ctx); + Function *f_mul_add = Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {c_ty}); unsigned NK = A->get_type()->get_tile_shapes()[1]->get_value(); if(NK != 1) { @@ -922,6 +923,10 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & std::swap(b_idx[0], b_idx[1]); Value *a = TA->get_value(a_idx); Value *b = TB->get_value(b_idx); + if(a->getType() != c_ty) + a = builder.CreateFPCast(a, c_ty); + if(b->getType() != c_ty) + b = builder.CreateFPCast(b, c_ty); res = builder.CreateCall(f_mul_add, {a, b, res}); } @@ -1022,6 +1027,10 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & std::swap(b_idx[0], b_idx[1]); Value *a = TA->get_value(a_idx); Value *b = TB->get_value(b_idx); + if(a->getType() != c_ty) + a = builder.CreateFPCast(a, c_ty); + if(b->getType() != c_ty) + b = builder.CreateFPCast(b, c_ty); res = builder.CreateCall(f_mul_add, {a, b, res}); result->set_value(idx, res); }); diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index dfd079817..ad2006436 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -22,8 +22,12 @@ bool is_hmma(ir::value *v){ ir::type *a_ty = a->get_type(); ir::value *b = x->get_operand(1); ir::type *b_ty = b->get_type(); + // only NT supported result = !x->is_a_trans() && x->is_b_trans(); + // inputs have to be FP16 result = result && a_ty->get_scalar_ty()->is_half_ty() && b_ty->get_scalar_ty()->is_half_ty(); + // reduction has to be multiple of 4 + result = result && ((a_ty->get_tile_shapes()[1]->get_value() % 4) == 0); } return result; } diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index c3139ece6..4ff863666 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -255,7 +255,7 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ - std::cout << source << std::endl; +// std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; From 616f22c6109af94372dc3845b725227377c26fb0 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 25 Jun 2019 16:35:43 -0700 Subject: [PATCH 189/494] confirmed this is the fastest bounds checking --- examples/python/tensorflow/dot.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index 6c376822b..737e9603b 100644 --- a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -71,6 +71,7 @@ void matmul(restrict read_only align(16) fp16 *A, )"; + class BlockSparseGemmOp : public OpKernel { public: explicit BlockSparseGemmOp(OpKernelConstruction* context) : OpKernel(context) { From d945ce5e1b7b3987addc1c018ece96c29c1dc097 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 25 Jun 2019 19:18:43 -0700 Subject: [PATCH 190/494] Now showing valid parameter for NN --- examples/cpp/dot.cpp | 2 +- examples/python/tensorflow/dot.cpp | 81 ++++-------------------------- include/triton/dnn/gemm.h | 4 +- lib/codegen/tune.cpp | 1 + lib/dnn/gemm.cpp | 32 +++++++----- 
lib/runtime/jit.cpp | 3 +- 6 files changed, 34 insertions(+), 89 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index fdb04a935..9c5349570 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -59,7 +59,7 @@ int main() { // just-in-time compile source-code - std::string src = triton::dnn::gemm::src(AT, BT); + std::string src = triton::dnn::gemm::src(AT, BT, "fp32", "fp32", 1, 1); jit.autotune("matmul",src.c_str(), benchmark); jit.add_module("matmul", src.c_str(), triton::dnn::gemm::default_params(AT, BT)); triton::driver::kernel* kernel = jit.get_function("matmul"); diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index 737e9603b..36a9bacfb 100644 --- a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -5,6 +5,7 @@ #include "triton/driver/stream.h" #include "triton/runtime/jit.h" #include "triton/tools/bench.hpp" +#include "triton/dnn/gemm.h" #define EIGEN_USE_GPU #include "tensorflow/core/framework/op.h" @@ -18,60 +19,6 @@ using namespace tensorflow; using GPUDevice = Eigen::GpuDevice; - -const char* src = -R"( -const tunable int32 TM = {64, 128}; -const tunable int32 TN = {64, 128}; -const tunable int32 TK = {16}; -const tunable int32 GZ = {1}; - -void matmul(restrict read_only align(16) fp16 *A, - restrict read_only align(16) fp16 *B, - align(16) fp32 *C, - int32 M, int32 N, int32 K, - multiple_of(4) int32 lda, multiple_of(4) int32 ldb, multiple_of(4) int32 ldc, - int32 *locks, int32 grid0, int32 grid1) { - int32 rxa[TM] = get_global_range[TM](0); - int32 ryb[TN] = get_global_range[TN](1); - int32 rz = get_global_range[1](2); - int32 rka[TK] = 0 ... TK; - int32 rkb[TK] = 0 ... TK; - fp32 c[TM, TN] = 0; - fp16* pa[TM, TK] = A + rka[newaxis, :]*lda + rxa[:, newaxis]; - fp16* pb[TN, TK] = B + rkb[newaxis, :]*ldb + ryb[:, newaxis]; - fp16 a[TM, TK] = *pa; - fp16 b[TN, TK] = *pb; - int32 last_a = ((M*K - 1) - (TM*TK + 1)) / lda; - int32 last_b = ((K*N - 1) - (TN*TK + 1)) / ldb; - last_a = last_a / TK * TK; - last_b = last_b / TK * TK; - int32 bound = K - max(last_a, last_b); - for(int32 k = K; k > bound; k = k - TK){ - pa = pa + TK*lda; - pb = pb + TK*ldb; - c = dot(a, trans(b), c); - a = *pa; - b = *pb; - } - int32 rxc[TM] = get_global_range[TM](0); - int32 ryc[TN] = get_global_range[TN](1); - for(int32 k = bound; k > 0; k = k - 1){ - int1 checka[TM, 1] = rxc[:, newaxis] < M; - int1 checkb[TN, 1] = ryc[:, newaxis] < N; - fp16* pa[TM, 1] = A + (K - k)*lda + rxc[:, newaxis]; - fp16* pb[TN, 1] = B + (K - k)*ldb + ryc[:, newaxis]; - fp16 a[TM, 1] = checka ? *pa : 0; - fp16 b[TN, 1] = checkb ? 
*pb : 0; - c = dot(a, trans(b), c); - } - fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - *pc = c; -} -)"; - - - class BlockSparseGemmOp : public OpKernel { public: explicit BlockSparseGemmOp(OpKernelConstruction* context) : OpKernel(context) { @@ -115,31 +62,21 @@ class BlockSparseGemmOp : public OpKernel { unsigned nthreads = info.num_threads; unsigned GZ = jit.get_int("GZ"); std::array grid = {(M + TM - 1)/TM, (N + TN - 1)/TN, GZ}; - // set argument - kernel->setArg(0, *da.cu()); - kernel->setArg(1, *db.cu()); - kernel->setArg(2, *dc.cu()); - kernel->setArg(3, M); - kernel->setArg(4, N); - kernel->setArg(5, K); - kernel->setArg(6, M); - kernel->setArg(7, N); - kernel->setArg(8, M); - kernel->setArg(9, *dlocks.cu()); - kernel->setArg(10, grid[0]); - kernel->setArg(11, grid[1]); + triton::dnn::gemm::set_arg(kernel, &da, &db, &dc, M, N, K, &dlocks, grid[0], grid[1]); stream->enqueue(kernel, grid, {nthreads, 1, 1}); stream->synchronize(); double ts = triton::tools::bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});}, [&](){ stream->synchronize(); }, ctx->device()); return 2.*M*N*K / ts * 1e-3; }; + std::string src = triton::dnn::gemm::src(false, false, "fp16", "fp16", 1, 1); // just-in-time compile source-code -// jit.autotune("matmul", src, benchmark); -// jit.add_module("matmul", src, {4, 2, 8, 4, 2, 32, 1, 4, 1, 1, 8, 8, 8, 1}); -// jit.add_module("matmul", src, {16, 4, 128, 16, 4, 128, 2, 2, 2, 2, 8, 32, 8, 1}); -// jit.add_module("matmul", src, {8, 8, 128, 16, 8, 128, 2, 2, 2, 2, 16, 32, 8, 1 }); - jit.add_module("matmul", src, {16, 4, 128, 16, 4, 128, 2, 2, 2, 2, 8, 16, 8, 1}); +// jit.autotune("matmul", src.c_str(), benchmark); +// jit.add_module("matmul", src.c_str(), {4, 2, 8, 4, 2, 32, 1, 4, 1, 1, 8, 8, 8, 1}); +// jit.add_module("matmul", src.c_str(), {16, 4, 128, 16, 4, 128, 2, 2, 2, 2, 8, 32, 8, 1}); +// jit.add_module("matmul", src.c_str(), {8, 8, 128, 16, 8, 128, 2, 2, 2, 2, 16, 32, 8, 1 }); +// jit.add_module("matmul", src.c_str(), {16, 4, 128, 16, 4, 128, 2, 2, 2, 2, 8, 16, 8, 1}); +// jit.add_module("matmul", src.c_str(), {16, 2, 128, 32, 32, 2, 2, 2, 2, 8, 8, 4, 2, 1}); //NN triton::driver::kernel* kernel = jit.get_function("matmul"); triton::jit::launch_information info = jit.get_launch_info("matmul"); std::cout << benchmark(kernel, info) << std::endl;; diff --git a/include/triton/dnn/gemm.h b/include/triton/dnn/gemm.h index e44c9631d..bd55d030a 100644 --- a/include/triton/dnn/gemm.h +++ b/include/triton/dnn/gemm.h @@ -16,7 +16,9 @@ public: static std::vector default_params(bool AT, bool BT); - static std::string src(bool AT, bool BT); + static std::string src(bool AT, bool BT, + std::string a_ty, std::string b_ty, + unsigned alignment_lda, unsigned alignment_ldb); template static void cpu_ref(std::vector &c, const std::vector &a, const std::vector &b, size_t M, size_t N, size_t K){ diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index ad2006436..db6d67702 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -166,6 +166,7 @@ std::vector tune::get_params(ir::module &mod) { for(ir::instruction *i : block->get_inst_list()) for(auto &x: params_[i]) if(seen.insert(x.second).second && !x.second->has_value()){ + std::cout << i->get_name() << " " << x.first << std::endl; result.push_back(x.second); } diff --git a/lib/dnn/gemm.cpp b/lib/dnn/gemm.cpp index 59f413d81..6aebf318e 100644 --- a/lib/dnn/gemm.cpp +++ b/lib/dnn/gemm.cpp @@ -41,7 +41,9 @@ std::vector gemm::default_params(bool AT, bool BT) { return {16, 2, 128, 32, 32, 32, 4, 2, 
2, 8, 8, 4, 2, 1};
 }
 
-std::string gemm::src(bool AT, bool BT) {
+std::string gemm::src(bool AT, bool BT,
+                      std::string a_ty, std::string b_ty,
+                      unsigned align_lda, unsigned align_ldb) {
   std::string AS0 = "TM", AS1 = "TK";
   std::string BS0 = "TK", BS1 = "TN";
   std::string bca0 = "[newaxis, :]", bca1 = "[:, newaxis]";
@@ -60,6 +62,8 @@ std::string gemm::src(bool AT, bool BT) {
     std::swap(bcb0, bcb1);
     std::swap(ldb0, ldb1);
   }
+  std::string align_lda_str = "multiple_of(" + std::to_string(align_lda) + ")";
+  std::string align_ldb_str = "multiple_of(" + std::to_string(align_ldb) + ")";
   std::string res =
 R"(
 const tunable int32 TM = {16, 32, 64, 128};
@@ -67,10 +71,12 @@ const tunable int32 TN = {16, 32, 64, 128};
 const tunable int32 TK = {8};
 const tunable int32 GZ = {1};
 
-void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C,
-            int32 M, int32 N, int32 K,
-            int32 lda, int32 ldb, int32 ldc,
-            int32 *locks, int32 grid0, int32 grid1) {
+void matmul(restrict read_only )" + a_ty + R"( *A,
+            restrict read_only )" + b_ty + R"( *B,
+            fp32 *C,
+            int32 M, int32 N, int32 K,
+            )" + align_lda_str + R"( int32 lda, )" + align_ldb_str + R"( int32 ldb, int32 ldc,
+            int32 *locks, int32 grid0, int32 grid1) {
   int32 rxa[TM] = get_global_range[TM](0);
   int32 ryb[TN] = get_global_range[TN](1);
   int32 rz = get_global_range[1](2);
@@ -81,10 +87,10 @@ void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C,
   int32 rem = K % GZ;
   K = select(rz < rem, div - 1, div);
   int32 offk = select(rz < rem, rz*(div + 1), rz*div + rem);
-  fp32* pa[)" + AS0 + ", " + AS1 + "] = A + (offk + rka" + bca0 + ")" + lda0 + " + rxa" + bca1 + lda1 + R"(;
-  fp32* pb[)" + BS0 + ", " + BS1 + "] = B + (offk + rkb" + bcb0 + ")" + ldb0 + " + ryb" + bcb1 + ldb1 + R"(;
-  fp32 a[)" + AS0 + ", " + AS1 + R"(] = *pa;
-  fp32 b[)" + BS0 + ", " + BS1 + R"(] = *pb;
+  )" + a_ty + R"(* pa[)" + AS0 + ", " + AS1 + "] = A + (offk + rka" + bca0 + ")" + lda0 + " + rxa" + bca1 + lda1 + R"(;
+  )" + b_ty + R"(* pb[)" + BS0 + ", " + BS1 + "] = B + (offk + rkb" + bcb0 + ")" + ldb0 + " + ryb" + bcb1 + ldb1 + R"(;
+  )" + a_ty + R"( a[)" + AS0 + ", " + AS1 + R"(] = *pa;
+  )" + b_ty + R"( b[)" + BS0 + ", " + BS1 + R"(] = *pb;
   int32 last_a = ((M*K - 1) - (TM*TK + 1)) / lda;
   int32 last_b = ((K*N - 1) - (TN*TK + 1)) / ldb;
   last_a = last_a / TK * TK;
@@ -102,10 +108,10 @@ void matmul(restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C,
   for(int32 k = bound; k > 0; k = k - 1){
    int1 checka[TM, 1] = rxc[:, newaxis] < M;
    int1 checkb[TN, 1] = ryc[:, newaxis] < N;
-   fp32* pa[TM, 1] = A + (offk + K - k))" + lda0 + " + rxc[:, newaxis]" + lda1 + R"(;
-   fp32* pb[TN, 1] = B + (offk + K - k))" + ldb0 + " + ryc[:, newaxis]" + ldb1 + R"(;
-   fp32 a[TM, 1] = checka ? *pa : 0;
-   fp32 b[TN, 1] = checkb ? *pb : 0;
+   )" + a_ty + R"(* pa[TM, 1] = A + (offk + K - k))" + lda0 + " + rxc[:, newaxis]" + lda1 + R"(;
+   )" + b_ty + R"(* pb[TN, 1] = B + (offk + K - k))" + ldb0 + " + ryc[:, newaxis]" + ldb1 + R"(;
+   )" + a_ty + R"( a[TM, 1] = checka ? *pa : 0;
+   )" + b_ty + R"( b[TN, 1] = checkb ?
*pb : 0; c = dot(a, trans(b), c); } int32 ridx = get_range_id(0); diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index d1d90278d..85e51b22f 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -182,9 +182,8 @@ void jit::add_module(ir::module &tt_module, const std::vector ¶ms) std::map> errors; passes.tune.check_constraints(errors); for(auto x: errors){ - std::cout << x.first << std::endl; for(auto str: x.second) - std::cout << str << std::endl; + std::cout << x.first->get_name() << ": " << str << std::endl; } if(errors.size()) throw std::runtime_error("invalid parameters"); From 25e9a109177fb3f37014c31de87b527e224b46b5 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 25 Jun 2019 19:27:49 -0700 Subject: [PATCH 191/494] changed auto-tuner parameter ranges --- examples/python/tensorflow/dot.cpp | 6 +++--- lib/codegen/tune.cpp | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index 36a9bacfb..8ff9dc854 100644 --- a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -69,14 +69,14 @@ class BlockSparseGemmOp : public OpKernel { [&](){ stream->synchronize(); }, ctx->device()); return 2.*M*N*K / ts * 1e-3; }; - std::string src = triton::dnn::gemm::src(false, false, "fp16", "fp16", 1, 1); + std::string src = triton::dnn::gemm::src(false, true, "fp16", "fp16", 1, 1); // just-in-time compile source-code -// jit.autotune("matmul", src.c_str(), benchmark); + jit.autotune("matmul", src.c_str(), benchmark); // jit.add_module("matmul", src.c_str(), {4, 2, 8, 4, 2, 32, 1, 4, 1, 1, 8, 8, 8, 1}); // jit.add_module("matmul", src.c_str(), {16, 4, 128, 16, 4, 128, 2, 2, 2, 2, 8, 32, 8, 1}); // jit.add_module("matmul", src.c_str(), {8, 8, 128, 16, 8, 128, 2, 2, 2, 2, 16, 32, 8, 1 }); // jit.add_module("matmul", src.c_str(), {16, 4, 128, 16, 4, 128, 2, 2, 2, 2, 8, 16, 8, 1}); -// jit.add_module("matmul", src.c_str(), {16, 2, 128, 32, 32, 2, 2, 2, 2, 8, 8, 4, 2, 1}); //NN + jit.add_module("matmul", src.c_str(), {16, 2, 128, 32, 32, 2, 2, 2, 2, 8, 8, 4, 2, 1}); //NN triton::driver::kernel* kernel = jit.get_function("matmul"); triton::jit::launch_information info = jit.get_launch_info("matmul"); std::cout << benchmark(kernel, info) << std::endl;; diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index db6d67702..e1d62f4cd 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -166,7 +166,7 @@ std::vector tune::get_params(ir::module &mod) { for(ir::instruction *i : block->get_inst_list()) for(auto &x: params_[i]) if(seen.insert(x.second).second && !x.second->has_value()){ - std::cout << i->get_name() << " " << x.first << std::endl; +// std::cout << i->get_name() << " " << x.first << std::endl; result.push_back(x.second); } @@ -233,13 +233,13 @@ void tune::run(ir::module &mod) { continue; if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 4, 4)); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 2, 4)); *params_.at(i).at("nts.d0") = *tmp; } if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 4, 4)); - std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 4, 4)); + std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 2, 4)); + std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 2, 4)); *params_.at(i).at("nts.d0") = *tmp1; 
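// Illustrative sketch (not part of the patch): widening each "nts" range from
// the single value {4} to {2, 4} doubles the choices per parameter, assuming
// (as the bounds above suggest) that a metaparameter enumerates the powers of
// two between its limits. A hypothetical counter for the search-space size:
//
//   size_t num_candidates(const std::vector<std::pair<unsigned, unsigned>>& ranges) {
//     size_t n = 1;
//     for(const auto& r: ranges) {
//       size_t k = 0;
//       for(unsigned v = r.first; v <= r.second; v *= 2)
//         k = k + 1;               // powers of two in [lo, hi]
//       n *= k;
//     }
//     return n;                    // e.g. {{2,4}, {2,4}} gives 4 candidates
//   }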
*params_.at(i).at("nts.d1") = *tmp2; } From f1a89722673d63f2346771d22523ad7f024324c4 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 26 Jun 2019 11:39:22 -0700 Subject: [PATCH 192/494] [examples] added tensorflow dense convolution templates --- examples/python/tensorflow/CMakeLists.txt | 2 +- examples/python/tensorflow/dense_conv.cpp | 117 ++++++++++++++++++++++ examples/python/tensorflow/dot.cpp | 7 +- examples/python/tensorflow/run.py | 54 ++++++---- 4 files changed, 156 insertions(+), 24 deletions(-) create mode 100644 examples/python/tensorflow/dense_conv.cpp diff --git a/examples/python/tensorflow/CMakeLists.txt b/examples/python/tensorflow/CMakeLists.txt index f9b650d1d..c531c23b1 100644 --- a/examples/python/tensorflow/CMakeLists.txt +++ b/examples/python/tensorflow/CMakeLists.txt @@ -5,7 +5,7 @@ if(${TensorFlow_FOUND}) include_directories("${CUDA_HOME}/include") link_directories(${TF_LIB}) add_definitions(-D_GLIBCXX_USE_CXX11_ABI=${TF_ABI}) - add_library(tf_blocksparse SHARED dot.cpp) + add_library(tf_blocksparse SHARED dot.cpp dense_conv) target_link_libraries(tf_blocksparse tensorflow_framework triton) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/run.py ${CMAKE_CURRENT_BINARY_DIR}/run.py diff --git a/examples/python/tensorflow/dense_conv.cpp b/examples/python/tensorflow/dense_conv.cpp new file mode 100644 index 000000000..66e7bfdab --- /dev/null +++ b/examples/python/tensorflow/dense_conv.cpp @@ -0,0 +1,117 @@ +#include + +#include "triton/driver/buffer.h" +#include "triton/driver/backend.h" +#include "triton/driver/stream.h" +#include "triton/runtime/jit.h" +#include "triton/tools/bench.hpp" +#include "triton/dnn/gemm.h" +#include "triton/dnn/conv.h" + +#define EIGEN_USE_GPU +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/util/tensor_format.h" +#include "tensorflow/core/framework/common_shape_fns.h" + +using namespace tensorflow; +using GPUDevice = Eigen::GpuDevice; + +//torch::Tensor conv_common( +// int32_t B, int32_t C, int32_t D, int32_t H, int32_t W, +// int32_t T, int32_t R, int32_t S, int32_t NF, +// int32_t stride_d, int32_t stride_h, int32_t stride_w, +// int32_t pad_d, int32_t pad_h, int32_t pad_w, +// triton::dnn::conv::type ty, +// torch::Tensor torcha, torch::Tensor torchb, torch::Tensor torchbias, +// bool autotune = false +// ) { + +//} + +class DenseConvOp : public OpKernel { + public: + explicit DenseConvOp(OpKernelConstruction* context) : OpKernel(context) { + } + + void Compute(OpKernelContext* context){ + // get device/stream + GPUDevice device = context->eigen_device(); + triton::driver::cu_stream sstream(device.stream(), false); + triton::driver::context* ctx = sstream.context(); + triton::driver::stream* stream = &sstream; + // get inputs + const Tensor& tfa = context->input(0); + const Tensor& tfb = context->input(1); + // get shapes + int32_t B = tfa.dim_size(0); + int32_t Ca = tfa.dim_size(1); + int32_t D = 1; + int32_t H = tfa.dim_size(2); + int32_t W = tfa.dim_size(3); + int32_t Cb = tfb.dim_size(0); + int32_t T = 1; + int32_t R = tfb.dim_size(1); + int32_t S = tfb.dim_size(2); + int32_t NF = tfb.dim_size(3); + assert(Ca == Cb); + int32_t C = Ca; + int32_t stride_d = 1, stride_h = 1, stride_w = 1; + int32_t pad_d = 0, pad_h = 0, pad_w = 0; + bool has_bias = false; + + // get conv configuration + triton::dnn::conv 
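+    // ctor arguments, in order: batch, channels, image (D, H, W), filter (T, R, S),
+    // number of filters, strides, pads, upsampling factors, conv direction, bias flag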
configuration(B, C, D, H, W, T, R, S, NF, + stride_d, stride_h, stride_w, + pad_d, pad_h, pad_w, + 1, 1, 1, + triton::dnn::conv::FPROP, has_bias); + + // Bind memory + triton::driver::cu_buffer a(ctx, (CUdeviceptr)tfa.flat().data(), false); + triton::driver::cu_buffer b(ctx, (CUdeviceptr)tfb.flat().data(), false); +// triton::driver::cu_buffer cubias(ctx, (CUdeviceptr)torchbias.storage().data(), false); +// triton::driver::buffer* bias = has_bias ? &cubias : nullptr; + triton::driver::buffer* bias = nullptr; + + // allocate output + auto c_shapes = configuration.c_shapes(); + Tensor* tfc = nullptr; + TensorShape out_shape({c_shapes[0], c_shapes[1], c_shapes[2], c_shapes[3]}); + OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &tfc)); + triton::driver::cu_buffer c(ctx, (CUdeviceptr)tfc->flat().data(), false); + + // benchmark a given convolution kernel + triton::jit jit(ctx); + auto benchmark = [&](triton::driver::kernel* kernel, + triton::jit::launch_information info) { + configuration.init(stream, (triton::driver::cu_module*)kernel->module()); + unsigned TM = info.global_range_size[0]; + unsigned TN = info.global_range_size[1]; + unsigned nthreads = info.num_threads; + unsigned GZ = jit.get_int("GZ"); + configuration.enqueue(stream, kernel, &a, &b, &c, bias, TM, TN, GZ, nthreads); + stream->synchronize(); + double ts = triton::tools::bench([&](){ configuration.enqueue(stream, kernel, &a, &b, &c, bias, TM, TN, GZ, nthreads); }, + [&](){ stream->synchronize(); }, stream->context()->device()); + return configuration.get_nflops() / ts * 1e-3; + }; + + std::ostringstream oss; + configuration.src(oss); + std::string src = oss.str(); + + triton::jit::tune_res_t best = jit.autotune("conv", src.c_str(), benchmark); + jit.add_module("conv", src.c_str(), best.params); + } +}; + +REGISTER_KERNEL_BUILDER(Name("DenseConv").Device(DEVICE_GPU), DenseConvOp); +REGISTER_OP("DenseConv") + .Input("a: float32") + .Input("b: float32") + .Output("c: float32") +; diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index 8ff9dc854..bdaab5921 100644 --- a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -19,9 +19,9 @@ using namespace tensorflow; using GPUDevice = Eigen::GpuDevice; -class BlockSparseGemmOp : public OpKernel { +class DotOp : public OpKernel { public: - explicit BlockSparseGemmOp(OpKernelConstruction* context) : OpKernel(context) { + explicit DotOp(OpKernelConstruction* context) : OpKernel(context) { } void Compute(OpKernelContext* context){ @@ -52,7 +52,6 @@ class BlockSparseGemmOp : public OpKernel { triton::driver::cu_buffer db(ctx, (CUdeviceptr)b.flat().data(), false); triton::driver::cu_buffer dc(ctx, (CUdeviceptr)c->flat().data(), false); triton::driver::cu_buffer dlocks(ctx, (CUdeviceptr)locks.flat().data(), false); - stream->synchronize(); // benchmark a given matrix multiplication kernel auto benchmark = [&](triton::driver::kernel* kernel, triton::jit::launch_information info) { @@ -85,7 +84,7 @@ class BlockSparseGemmOp : public OpKernel { private: }; -REGISTER_KERNEL_BUILDER(Name("Dot").Device(DEVICE_GPU), BlockSparseGemmOp); +REGISTER_KERNEL_BUILDER(Name("Dot").Device(DEVICE_GPU), DotOp); REGISTER_OP("Dot") .Input("a: float16") .Input("b: float16") diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 0788231e0..9756ee340 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -6,24 +6,40 @@ data_files_path = 
tf.resource_loader.get_data_files_path() library_dir = os.path.dirname(os.path.realpath(__file__)) module = tf.load_op_library(os.path.join(library_dir, 'libtf_blocksparse.so')) -M, N, K = 128,128,128 -a = tf.placeholder(tf.float16, shape=[M, K]) -b = tf.placeholder(tf.float16, shape=[N, K]) -locks = tf.placeholder(tf.int32, shape=[4096]) -# c = tf.matmul(a, b, transpose_a=True) -c = module.dot(a, b, locks) +def run_dot(): + M, N, K = 128,128,128 + a = tf.placeholder(tf.float16, shape=[M, K]) + b = tf.placeholder(tf.float16, shape=[N, K]) + locks = tf.placeholder(tf.int32, shape=[4096]) + # c = tf.matmul(a, b, transpose_a=True) + c = module.dot(a, b, locks) + # Reference + ha = np.random.rand(M, K).astype(np.float16) + hb = np.random.rand(N, K).astype(np.float16) + # Run + sess = tf.InteractiveSession() + sess.run(tf.global_variables_initializer()) + result = sess.run([c], feed_dict = {locks: np.zeros(4096), + a: ha, + b: hb})[0] + # Test + hresult = np.dot(ha.T, hb).T + dif = np.abs(result - hresult) + print("dif: %f" % np.max(dif)) -# Reference -ha = np.random.rand(M, K).astype(np.float16) -hb = np.random.rand(N, K).astype(np.float16) +def run_conv(): + BS, C, H, W = 16, 32, 32, 32 + R, S, NF = 3, 3, 32 + a = tf.placeholder(tf.float32, shape=[BS, C, H, W]) + b = tf.placeholder(tf.float32, shape=[C, R, S, NF]) + c = module.dense_conv(a, b) + # Reference + ha = np.random.rand(BS, C, H, W) + hb = np.random.rand(C, R, S, NF) + # Run + sess = tf.InteractiveSession() + sess.run(tf.global_variables_initializer()) + result = sess.run([c], feed_dict = {a: ha, + b: hb})[0] -# Run -sess = tf.InteractiveSession() -sess.run(tf.global_variables_initializer()) -result = sess.run([c], feed_dict = {locks: np.zeros(4096), - a: ha, - b: hb})[0] - -hresult = np.dot(ha.T, hb).T -dif = np.abs(result - hresult) -print("dif: %f" % np.max(dif)) +run_conv() From 6300ec50800e4b586f2e9cd8a32f12c64ea841ad Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 26 Jun 2019 18:50:53 -0700 Subject: [PATCH 193/494] [examples] added conv2d op in tensorflow --- examples/cpp/conv.cpp | 8 ++-- examples/python/tensorflow/CMakeLists.txt | 2 +- .../tensorflow/{dense_conv.cpp => conv2d.cpp} | 40 +++++++++---------- examples/python/tensorflow/dot.cpp | 2 +- examples/python/tensorflow/run.py | 2 +- include/triton/dnn/conv.h | 6 ++- lib/codegen/selection.cpp | 2 +- lib/codegen/tune.cpp | 6 +-- lib/dnn/conv.cpp | 30 +++++++------- 9 files changed, 49 insertions(+), 49 deletions(-) rename examples/python/tensorflow/{dense_conv.cpp => conv2d.cpp} (78%) diff --git a/examples/cpp/conv.cpp b/examples/cpp/conv.cpp index e906493ac..d5f2bba3d 100644 --- a/examples/cpp/conv.cpp +++ b/examples/cpp/conv.cpp @@ -13,13 +13,13 @@ int main() { triton::jit jit(context); triton::dnn::conv::type ty = triton::dnn::conv::FPROP; // initialization - int32_t B = 64, NF = 64; - int32_t D = 1, H = 8, W = 8; - int32_t NC = 3, T = 1, R = 3, S = 3; + int32_t B = 16, NF = 128; + int32_t D = 1, H = 16, W = 16; + int32_t NC = 64, T = 1, R = 3, S = 3; int32_t pad_d = 0, pad_h = 0, pad_w = 0; int32_t stride_d = 1, stride_h = 1, stride_w = 1; int32_t upsample_d = 1, upsample_h = 1, upsample_w = 1; - triton::dnn::conv configuration(128, 256, 1, 14, 14, 1, 5, 5, 512, 1, 1, 1, 0, 0, 0, 1, 1, 1, triton::dnn::conv::FPROP, 0); + triton::dnn::conv configuration(128, 256, 1, 14, 14, 1, 5, 5, 512, 1, 1, 1, 0, 0, 0, 1, 1, 1, "fp32", "fp32", triton::dnn::conv::FPROP, 0); // triton::dnn::conv configuration(B, NC, D, H, W, T, R, S, NF, stride_d, stride_h, stride_w, pad_d, 
pad_h, pad_w, upsample_d, upsample_h, upsample_w, ty); // convolution configuration std::vector hc(configuration.c_size()); diff --git a/examples/python/tensorflow/CMakeLists.txt b/examples/python/tensorflow/CMakeLists.txt index c531c23b1..bfd54f6a6 100644 --- a/examples/python/tensorflow/CMakeLists.txt +++ b/examples/python/tensorflow/CMakeLists.txt @@ -5,7 +5,7 @@ if(${TensorFlow_FOUND}) include_directories("${CUDA_HOME}/include") link_directories(${TF_LIB}) add_definitions(-D_GLIBCXX_USE_CXX11_ABI=${TF_ABI}) - add_library(tf_blocksparse SHARED dot.cpp dense_conv) + add_library(tf_blocksparse SHARED dot.cpp conv2d.cpp) target_link_libraries(tf_blocksparse tensorflow_framework triton) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/run.py ${CMAKE_CURRENT_BINARY_DIR}/run.py diff --git a/examples/python/tensorflow/dense_conv.cpp b/examples/python/tensorflow/conv2d.cpp similarity index 78% rename from examples/python/tensorflow/dense_conv.cpp rename to examples/python/tensorflow/conv2d.cpp index 66e7bfdab..12b033f21 100644 --- a/examples/python/tensorflow/dense_conv.cpp +++ b/examples/python/tensorflow/conv2d.cpp @@ -20,21 +20,9 @@ using namespace tensorflow; using GPUDevice = Eigen::GpuDevice; -//torch::Tensor conv_common( -// int32_t B, int32_t C, int32_t D, int32_t H, int32_t W, -// int32_t T, int32_t R, int32_t S, int32_t NF, -// int32_t stride_d, int32_t stride_h, int32_t stride_w, -// int32_t pad_d, int32_t pad_h, int32_t pad_w, -// triton::dnn::conv::type ty, -// torch::Tensor torcha, torch::Tensor torchb, torch::Tensor torchbias, -// bool autotune = false -// ) { - -//} - -class DenseConvOp : public OpKernel { - public: - explicit DenseConvOp(OpKernelConstruction* context) : OpKernel(context) { +class Conv2dOp : public OpKernel { +public: + explicit Conv2dOp(OpKernelConstruction* context) : OpKernel(context) { } void Compute(OpKernelContext* context){ @@ -64,15 +52,19 @@ class DenseConvOp : public OpKernel { bool has_bias = false; // get conv configuration - triton::dnn::conv configuration(B, C, D, H, W, T, R, S, NF, + triton::dnn::conv configuration(B, C, + D, H, W, + T, R, S, + NF, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, 1, 1, 1, + "fp16", "fp16", triton::dnn::conv::FPROP, has_bias); // Bind memory - triton::driver::cu_buffer a(ctx, (CUdeviceptr)tfa.flat().data(), false); - triton::driver::cu_buffer b(ctx, (CUdeviceptr)tfb.flat().data(), false); + triton::driver::cu_buffer a(ctx, (CUdeviceptr)tfa.flat().data(), false); + triton::driver::cu_buffer b(ctx, (CUdeviceptr)tfb.flat().data(), false); // triton::driver::cu_buffer cubias(ctx, (CUdeviceptr)torchbias.storage().data(), false); // triton::driver::buffer* bias = has_bias ? 
&cubias : nullptr; triton::driver::buffer* bias = nullptr; @@ -106,12 +98,16 @@ class DenseConvOp : public OpKernel { triton::jit::tune_res_t best = jit.autotune("conv", src.c_str(), benchmark); jit.add_module("conv", src.c_str(), best.params); +// jit.add_module("conv", src.c_str(), {16, 2, 32, 32, 2, 64, 2, 2, 2, 2, 8, 2, 16, 4, 1}); + triton::driver::kernel* kernel = jit.get_function("conv"); + triton::jit::launch_information info = jit.get_launch_info("conv"); + std::cout << benchmark(kernel, info) << std::endl; } }; -REGISTER_KERNEL_BUILDER(Name("DenseConv").Device(DEVICE_GPU), DenseConvOp); -REGISTER_OP("DenseConv") - .Input("a: float32") - .Input("b: float32") +REGISTER_KERNEL_BUILDER(Name("Conv2d").Device(DEVICE_GPU), Conv2dOp); +REGISTER_OP("Conv2d") + .Input("a: float16") + .Input("b: float16") .Output("c: float32") ; diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index bdaab5921..09b9f47b4 100644 --- a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -78,7 +78,7 @@ class DotOp : public OpKernel { jit.add_module("matmul", src.c_str(), {16, 2, 128, 32, 32, 2, 2, 2, 2, 8, 8, 4, 2, 1}); //NN triton::driver::kernel* kernel = jit.get_function("matmul"); triton::jit::launch_information info = jit.get_launch_info("matmul"); - std::cout << benchmark(kernel, info) << std::endl;; + std::cout << benchmark(kernel, info) << std::endl; } private: diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 9756ee340..dca626b1a 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -32,7 +32,7 @@ def run_conv(): R, S, NF = 3, 3, 32 a = tf.placeholder(tf.float32, shape=[BS, C, H, W]) b = tf.placeholder(tf.float32, shape=[C, R, S, NF]) - c = module.dense_conv(a, b) + c = module.conv2d(a, b) # Reference ha = np.random.rand(BS, C, H, W) hb = np.random.rand(C, R, S, NF) diff --git a/include/triton/dnn/conv.h b/include/triton/dnn/conv.h index a950c3304..6a590f201 100644 --- a/include/triton/dnn/conv.h +++ b/include/triton/dnn/conv.h @@ -31,6 +31,7 @@ public: int stride_d, int stride_h, int stride_w, int pad_d, int pad_h, int pad_w, int upsample_d, int upsample_h, int upsample_w, + std::string a_ty = "fp32", std::string b_ty = "fp32", type ty = FPROP, bool bias = false); // accessors @@ -126,7 +127,10 @@ private: bool is_a_deltas_cst; bool is_b_deltas_cst_; bool is_mask_cst_; - // type + // data type + std::string a_ty_; + std::string b_ty_; + // conv type type ty_; bool bias_; bool b_trans_; diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index d394c1ec0..b066a963a 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -347,7 +347,7 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::function(inst)){ Value *offset = tgt_->get_block_id(builder.GetInsertBlock()->getModule(), builder, ii->get_axis()); - return (Instruction*)builder.CreateAdd(offset, builder.getInt32(0)); + return (Instruction*)offset; } if(ir::atomic_cas_inst* ii = dynamic_cast(inst)){ BasicBlock *current = builder.GetInsertBlock(); diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index e1d62f4cd..72267b23f 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -233,13 +233,13 @@ void tune::run(ir::module &mod) { continue; if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 2, 4)); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 
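+    // note: the trailing pair appears to bound the value range of this tunable
+    // metaparameter; narrowed here from (2, 4) to (2, 2) to shrink the search space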
2, 2)); *params_.at(i).at("nts.d0") = *tmp; } if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 2, 4)); - std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 2, 4)); + std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 2, 2)); + std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 2, 2)); *params_.at(i).at("nts.d0") = *tmp1; *params_.at(i).at("nts.d1") = *tmp2; } diff --git a/lib/dnn/conv.cpp b/lib/dnn/conv.cpp index 2acbfed94..c67d132c8 100644 --- a/lib/dnn/conv.cpp +++ b/lib/dnn/conv.cpp @@ -21,11 +21,13 @@ conv::conv(int B, int NC, int stride_d, int stride_h, int stride_w, int pad_d, int pad_h, int pad_w, int upsample_d, int upsample_h, int upsample_w, + std::string a_ty, std::string b_ty, type ty, bool bias) : NB_(B), NC_(NC), AD_(D), AH_(H), AW_(W), BD_(T), BH_(R), BW_(S), NF_(NF), stride_d_(stride_d), stride_h_(stride_h), stride_w_(stride_w), pad_d_(pad_d), pad_h_(pad_h), pad_w_(pad_w), upsample_d_(upsample_d), upsample_h_(upsample_h), upsample_w_(upsample_w), + a_ty_(a_ty), b_ty_(b_ty), ty_(ty), bias_(bias) { CD_ = (AD_*upsample_d_ - BD_ + 1 + 2*pad_d_ + stride_d_ - 1)/stride_d_; @@ -281,8 +283,8 @@ void conv::init(driver::stream *stream, triton::driver::cu_module* module) { d_a_deltas_ = init_lut(is_a_deltas_cst, "delta", h_a_deltas_); d_b_deltas_ = init_lut(is_b_deltas_cst_, "b_delta", h_b_deltas_); d_masks_ = init_lut(is_mask_cst_, "masks", h_masks_); - d_locks_ = triton::driver::buffer::create(stream->context(), max_grid_0_*max_grid_1_*4); - ((triton::driver::cu_buffer*)d_locks_)->set_zero(stream, max_grid_0_*max_grid_1_*4); + d_locks_ = triton::driver::buffer::create(stream->context(), max_grid_0_*max_grid_1_*4*2); + ((triton::driver::cu_buffer*)d_locks_)->set_zero(stream, max_grid_0_*max_grid_1_*4*2); } void conv::set_arg(driver::kernel *kernel, @@ -336,8 +338,8 @@ void conv::set_arg(driver::kernel *kernel, kernel->setArg(39, (int32_t)0); kernel->setArg(40, (int32_t)0); kernel->setArg(41, d_locks_); - kernel->setArg(42, 0); - kernel->setArg(43, 0); + kernel->setArg(42, max_grid_0_); + kernel->setArg(43, max_grid_1_); size_t idx = 44; if(!is_a_deltas_cst) kernel->setArg(idx++, d_a_deltas_); @@ -358,8 +360,6 @@ void conv::enqueue(driver::stream *stream, driver::kernel *kernel, grid[0] /= upsample_h_*upsample_w_; kernel->setArg(11, CH_/upsample_h_); kernel->setArg(12, CW_/upsample_w_); - kernel->setArg(42, (int32_t)grid[0]); - kernel->setArg(43, (int32_t)grid[1]); // initialize to zero if necessary bool init_zero = false; @@ -526,7 +526,7 @@ void conv::src(std::ostream &os){ R"( const tunable int32 TM = {16, 32, 64}; const tunable int32 TN = {16, 32, 64}; -const tunable int32 TK = {8}; +const tunable int32 TK = {16}; const tunable int32 GZ = {1}; )"; if(is_a_deltas_cst) @@ -537,8 +537,8 @@ if(is_mask_cst_) os << "__constant__ int32* masks = alloc_const int32[" + std::to_string(h_masks_.size()) + "];\n"; os << R"( - void conv(read_only restrict fp32 *a, - read_only restrict fp32 *b, + void conv(read_only restrict )" << a_ty_ << R"( *a, + read_only restrict )" << b_ty_ << R"( *b, fp32 *c, fp32 *bias, int32 M, int32 N, int32 K, @@ -592,7 +592,7 @@ if(!is_mask_cst_) rar = )" + upar + R"( rar; ras = )" + upas + R"( ras; int32 ra1[TK] = rac*lda_c + rar*lda_h + ras*lda_w; - fp32* pa[TM, TK] = a + ra1[newaxis, :] + ra0[:, newaxis];)"; + )" << a_ty_ << R"(* pa[TM, TK] = a + ra1[newaxis, :] + ra0[:, newaxis];)"; if(b_lut_){ os << R"( int32 rb)" + ax[0] + ax[1] + "[TK] = 
rkb / " + redax[2] + R"(; @@ -611,7 +611,7 @@ os << R"( int32 rb1[TK] = rkb)" + ldb0 + ";"; } os << R"( - fp32* pb)" + BS + " = b + rb1" + bcb1 + " + rb0" + bcb0 + R"(*ldb_k; + )" << b_ty_ << R"(* pb)" + BS + " = b + rb1" + bcb1 + " + rb0" + bcb0 + R"(*ldb_k; int32 offda[TK] = rka % ldlut; )" + a_delta_mem + R"( int32* pincd[TK] = delta + offda; )" + a_delta_mem + R"( int32* pda[TK] = delta + ldlut + offda + off_uw*ldlut + off_uh*ldlut*upsample_w; @@ -628,8 +628,8 @@ os << R"( int1 checka[TM, TK] = (maska0[:, newaxis] & maska1[newaxis, :]) > 0; int1 checkb0[TN] = rb0 < N; int1 checkb)" + BS + " = checkb0" + bcb0 + R"(; - fp32 a[TM, TK] = checka ? *pa : 0; - fp32 b)" + BS + R"( = checkb ? *pb : 0; + )" << a_ty_ << R"( a[TM, TK] = checka ? *pa : 0; + )" << b_ty_ << R"( b)" + BS + R"( = checkb ? *pb : 0; int32 rkamin[TK] = rka - offk + TK; for(int32 k = K; k > 0; k = k - TK){ C = dot(a, )" + useb + R"(, C); @@ -672,8 +672,8 @@ if(b_lut_){ int32 ridx = get_range_id(0); int32 ridy = get_range_id(1); int32 *plock = locks + ridx + ridy*grid0; + while(__atomic_cas(plock, 0, 1) == 1); int32 *pcount = plock + grid0*grid1; - while(__atomic_cas(plock, 0, 1)); int32 count = *pcount; int32 countp1 = select(count == GZ - 1, 0, count + 1); if(count == 0) {)"; @@ -691,7 +691,7 @@ if(b_lut_){ @checkc *pc = C + *pc; *pcount = countp1; } - __atomic_cas(plock, 1, 0); + *plock = 0; })"; } From 9028e40f1d04645f738bb01f13177002ad1a018f Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 27 Jun 2019 11:37:19 -0700 Subject: [PATCH 194/494] [dnn] added shift in the DNN libs --- examples/cpp/shift.cpp | 186 +++++++------------------------------ include/triton/dnn/shift.h | 151 ++++++++++++++++++++++++++++++ lib/dnn/shift.cpp | 176 +++++++++++++++++++++++++++++++++++ 3 files changed, 359 insertions(+), 154 deletions(-) create mode 100644 include/triton/dnn/shift.h create mode 100644 lib/dnn/shift.cpp diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index b244e8ec2..ed949c74c 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -1,9 +1,11 @@ #include #include +#include #include "triton/runtime/jit.h" #include "triton/driver/backend.h" #include "triton/driver/stream.h" #include "triton/tools/bench.hpp" +#include "triton/dnn/shift.h" // input layout: C, H, W, BS // filter layout: C, K @@ -36,96 +38,6 @@ void shift_conv(int32_t C, int32_t H, int32_t W, int32_t BS, } } -// K = channels -// M = batch * height * width -// N = number of feature maps - -const char* src = -R"( -const tunable int32 TM = {16, 32, 64, 128}; -const tunable int32 TN = {16, 32, 64, 128}; -const tunable int32 TK = {8}; - -__constant__ int32* delta = alloc_const int32[256]; -__constant__ int32* masks = alloc_const int32[8192]; - -void shift(restrict read_only fp32 *a, restrict read_only fp32 *b, fp32 *c, - int32 M, int32 N, int32 K, - int32 ABS, int32 AH, int32 AW, int32 AR, int32 AS){ - int32 rxa[TM] = get_global_range[TM](0); - int32 ryb[TN] = get_global_range[TN](1); - int32 rka[TK] = 0 ... TK; - int32 rkb[TK] = 0 ... 
TK; - fp32 C[TM, TN] = 0; - fp32* pxa[TM, TK] = a + rxa[:, newaxis]; - fp32* pb[TN, TK] = b + rkb[newaxis, :]*N + ryb[:, newaxis]; - __constant__ int32* pd[TK] = delta + rka; - int32 pad_h = AR/2; - int32 pad_w = AS/2; - int32 rawhc[TM] = rxa / ABS; - int32 raw[TM] = rawhc % AW - pad_w; - int32 rahc[TM] = rawhc / AW; - int32 rah[TM] = rahc % AH - pad_h; - int32 maskh[TM] = pad_h + min(rah, 0) + max(rah + AR - AH, 0); - int32 maskw[TM] = pad_w + min(raw, 0) + max(raw + AS - AW, 0); - __constant__ int32* pxm[TM] = masks + maskh*K + maskw*K*(2*pad_h + 1); - __constant__ int32* pm[TM, TK] = pxm[:, newaxis] + rka[newaxis, :]; - for(int32 k = K; k > 0; k = k - TK){ - int32 delta[TK] = *pd; - fp32 *pa[TM, TK] = pxa + delta[newaxis, :]; - int1 m[TM, TK] = *pm > 0; - fp32 a[TM, TK] = m ? *pa : 0; - fp32 b[TN, TK] = *pb; - C = dot(a, trans(b), C); - pb = pb + TK*N; - pd = pd + TK; - pm = pm + TK; - } - int32 rxc[TM] = get_global_range[TM](0); - int32 ryc[TN] = get_global_range[TN](1); - fp32* pc[TM, TN] = c + ryc[newaxis, :]*M + rxc[:, newaxis]; - int1 checkc0[TM] = rxc < M; - int1 checkc1[TN] = ryc < N; - int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - @checkc *pc = C; -} -)"; - -std::vector shift_deltas(// strides - int32_t stride_w, int32_t stride_h, int32_t stride_c, - // shift - int32_t C, - const std::vector& shift_h, - const std::vector& shift_w) { - std::vector res(C); - for(unsigned c = 0; c < C; c++){ - res[c] = c*stride_c; - res[c] += shift_h[c]*stride_h; - res[c] += shift_w[c]*stride_w; - } - return res; -} - -std::vector shift_masks(int32_t C, - const std::vector& shift_h, - const std::vector& shift_w, - int32_t R, int32_t S) { - size_t S0 = C; - size_t S1 = R; - size_t S2 = S; - std::vector res(S0*S1*S2); - for(size_t ph = 0; ph < S1; ++ph) - for(size_t pw = 0; pw < S2; ++pw){ - int32_t* ptr = &res[ph*S0 + pw*S0*S1]; - for(size_t i = 0; i < S0; ++i){ - bool in_bounds_h = shift_h[i] + ph >= 0 && shift_h[i] + ph < R; - bool in_bounds_w = shift_w[i] + pw >= 0 && shift_w[i] + pw < S; - ptr[i] = in_bounds_h && in_bounds_w; - } - } - return res; -} - int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); @@ -136,20 +48,6 @@ int main() { int32_t BS = 4, F = 128; int32_t H = 32, W = 32; int32_t C = 128; - // equivalent matmul dimensions - int32_t M = BS*H*W; - int32_t N = F; - int32_t K = C; - std::cout << M << " " << N << " " << K << std::endl; - std::vector hc(BS*H*W*F); - std::vector rc(BS*H*W*F); - std::vector ha(BS*C*H*W); - std::vector hb(F*C); - // strides - int32_t stride_i_bs = 1; - int32_t stride_i_w = BS*stride_i_bs; - int32_t stride_i_h = W*stride_i_w; - int32_t stride_i_c = H*stride_i_h; // random shifts std::vector shift_h(C); std::vector shift_w(C); @@ -157,83 +55,63 @@ int main() { shift_h[c] = rand() % R - R/2; shift_w[c] = rand() % S - S/2; } - // initialize buffers - srand(0); - for(int c = 0 ; c < C; c++) - for(int h = 0 ; h < H; h++) - for(int w = 0 ; w < W; w++) - for(int bs = 0 ; bs < BS; bs++){ - float value = (float)rand() / RAND_MAX; - size_t idx = bs + w*stride_i_w + h*stride_i_h + c*stride_i_c; - ha[idx] = value; - } - for(size_t i = 0; i < hb.size(); i++) - hb[i] = (float)rand() / RAND_MAX; - for(size_t i = 0; i < hc.size(); i++) - hc[i] = 0; + // configuration + triton::dnn::shift shift(BS, C, 1, H, W, 1, R, S, F, shift_h, shift_w); + // host buffers + std::vector hc(shift.c_size()); + std::vector rc(shift.c_size()); + std::vector ha(shift.a_size()); + std::vector hb(shift.b_size()); + 
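+  // a_size()/b_size()/c_size() return element counts; the device allocations below
+  // scale them by the element size (4 bytes for fp32)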
// device buffers triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*4); triton::driver::buffer* da = triton::driver::buffer::create(context, ha.size()*4); triton::driver::buffer* db = triton::driver::buffer::create(context, hb.size()*4); triton::driver::stream* stream = triton::driver::stream::create(context); + // initialize host + srand(0); + for(size_t i = 0; i < ha.size(); i++) + ha[i] = (float)rand() / RAND_MAX; + for(size_t i = 0; i < hb.size(); i++) + hb[i] = (float)rand() / RAND_MAX; + for(size_t i = 0; i < hc.size(); i++) + hc[i] = 0; + // initialize device stream->write(da, true, 0, ha); stream->write(db, true, 0, hb); stream->write(dc, true, 0, hc); stream->synchronize(); - std::vector h_delta = shift_deltas(stride_i_w, stride_i_h, stride_i_c, C, shift_h, shift_w); - std::vector h_masks = shift_masks(C, shift_h, shift_w, R, S); - // benchmark a given matrix multiplication kernel + // benchmark auto benchmark = [&](triton::driver::kernel* kernel, triton::jit::launch_information info) { + shift.init(stream, (triton::driver::cu_module*)kernel->module()); // launch info unsigned TM = info.global_range_size[0]; unsigned TN = info.global_range_size[1]; unsigned nthreads = info.num_threads; - // initialize constant memory - triton::driver::buffer* delta = ((triton::driver::cu_module*)kernel->module())->symbol("delta"); - triton::driver::buffer* masks = ((triton::driver::cu_module*)kernel->module())->symbol("masks"); - stream->write(delta, false, 0, h_delta.size()*4, h_delta.data()); - stream->write(masks, false, 0, h_masks.size()*4, h_masks.data()); - stream->synchronize(); // set argument - kernel->setArg(0, da); - kernel->setArg(1, db); - kernel->setArg(2, dc); - kernel->setArg(3, M); - kernel->setArg(4, N); - kernel->setArg(5, K); - kernel->setArg(6, BS); - kernel->setArg(7, H); - kernel->setArg(8, W); - kernel->setArg(9, R); - kernel->setArg(10, S); - // dry run - std::array grid = {(M + TM - 1)/TM, (N + TN - 1)/TN, 1}; - stream->enqueue(kernel, grid, {nthreads, 1, 1}); + shift.enqueue(stream, kernel, da, db, dc, TM, TN, nthreads); stream->synchronize(); // benchmark - double ts = triton::tools::bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});}, + double ts = triton::tools::bench([&](){shift.enqueue(stream, kernel, da, db, dc, TM, TN, nthreads);}, [&](){ stream->synchronize(); }, context->device()); - ts = ts * 1e-9; - double tflops = 2.*M*N*K / ts * 1e-12; - return tflops; + return shift.get_nflops() / ts * 1e-3; }; // shift std::vector params = { - 16, 2, 64, - 32, 2, 64, - 16, 8, 2, 2, - 8, 8, - 4 + 8, 2, 16, 8, 2, 32, 8, 4, 2, 2, 4, 2, 8, 4 }; - jit.autotune("shift", src, benchmark); - jit.add_module("shift", src, params); + std::ostringstream oss; + shift.src(oss); + std::string src = oss.str(); +// jit.autotune("shift", src.c_str(), benchmark); + jit.add_module("shift", src.c_str(), params); triton::driver::kernel* kernel = jit.get_function("shift"); triton::jit::launch_information info = jit.get_launch_info("shift"); std::cout << "Performance: " << benchmark(kernel, info) << " TFLOPS " << std::endl; stream->read(dc, true, 0, hc); - shift_conv(C, H, W, BS, F, rc, ha, hb, shift_h, shift_w); - for(size_t i = 0; i < M*N; i++) + shift.cpu_ref(rc.data(), ha.data(), hb.data()); + for(size_t i = 0; i < hc.size(); i++) if(std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; exit(EXIT_FAILURE); diff --git a/include/triton/dnn/shift.h b/include/triton/dnn/shift.h new file mode 
100644 index 000000000..6d6bda9de --- /dev/null +++ b/include/triton/dnn/shift.h @@ -0,0 +1,151 @@ +/* Copyright 2015-2017 Philippe Tillet +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, +* subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +#ifndef TDL_INCLUDE_DNN_SHIFT_H +#define TDL_INCLUDE_DNN_SHIFT_H + +#include +#include +#include +#include +#include +#include "triton/driver/stream.h" +#include "triton/driver/kernel.h" + +namespace triton{ +namespace dnn{ + +class shift { + +public: + enum type { + FPROP + }; + +private: + void set_ld(const std::vector& shapes, + std::vector& ld); + +public: + + shift(int B, int NC, + int D, int H, int W, + int T, int R, int S, int NF, + const std::vector &shift_h, const std::vector &shift_w, + std::string a_ty = "fp32", std::string b_ty = "fp32", + type ty = FPROP, bool bias = false); + + // look-up table + void build_deltas(); + void build_masks(); + + // accessors + size_t a_size(); + size_t b_size(); + size_t c_size(); + std::vector c_shapes(); + + // device function + void init(driver::stream *stream, driver::cu_module *module); + void enqueue(driver::stream *stream, driver::kernel *kernel, + driver::buffer *a, driver::buffer *b, driver::buffer *c, + size_t TM, size_t TN, size_t nthreads); + + // utils + size_t get_nflops(); + + // source + void src(std::ostream &os); + + // cpu_ref + template + void cpu_ref(OUT_DTYPE* O, + const IN_DTYPE* I, + const IN_DTYPE* F) + { + OUT_DTYPE acc; + for(int32_t p = 0; p < AH_; ++p) + for(int32_t q = 0; q < AW_; ++q) + for(int32_t bs = 0; bs < NB_; ++bs) + for(int32_t k = 0; k < NF_; ++k) + { + acc = 0; + for(int32_t c = 0; c < NC_; ++c){ + int32_t h = p + shift_h_[c]; + int32_t w = q + shift_w_[c]; + bool in_bounds = (h >= 0 && w >= 0 && h < AH_ && w < AW_); + IN_DTYPE a = in_bounds?I[bs + w*NB_ + h*NB_*AW_ + c*NB_*AH_*AW_]:0; + IN_DTYPE b = F[k + c*NF_]; + acc = std::fma(a, b, acc); + } + O[bs + q*NB_ + p*NB_*AW_ + k*NB_*AH_*AW_] = acc; + } + } + +private: + // image size + int32_t NB_; + int32_t NC_; + int32_t AD_; + int32_t AH_; + int32_t AW_; + // filter size + int32_t BD_; + int32_t BH_; + int32_t BW_; + int32_t NF_; + // activation size + int32_t CD_; + int32_t CH_; + int32_t CW_; + // equivalent matmul + int32_t M_; + int32_t N_; + int32_t K_; + // shapes + std::vector shapes_a_; + std::vector shapes_b_; + std::vector shapes_c_; + // memory strides + std::vector ld_a_; + std::vector ld_b_; + std::vector ld_c_; + // shift values + std::vector shift_h_; + std::vector shift_w_; + // look-up tables + std::vector h_deltas_; + std::vector 
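+  // host-side copies of the look-up tables; init() uploads them into the module's
+  // __constant__ "delta" and "masks" symbols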
h_masks_; + driver::buffer* d_deltas_; + driver::buffer* d_masks_; + // data types + std::string a_ty_; + std::string b_ty_; + // convolution type + type ty_; + bool bias_; +}; + +} +} + +#endif diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp new file mode 100644 index 000000000..d07809f1f --- /dev/null +++ b/lib/dnn/shift.cpp @@ -0,0 +1,176 @@ +#include "triton/dnn/shift.h" + + +namespace triton{ +namespace dnn{ + +void shift::set_ld(const std::vector& shapes, + std::vector& ld) { + size_t size = shapes.size(); + ld.resize(size); + ld[4] = 1; + ld[3] = shapes[4]*ld[4]; + ld[2] = shapes[3]*ld[3]; + ld[1] = shapes[2]*ld[2]; + ld[0] = shapes[1]*ld[1]; +} + +shift::shift(int B, int NC, + int D, int H, int W, + int T, int R, int S, + int NF, + const std::vector& shift_h, const std::vector& shift_w, + std::string a_ty, std::string b_ty, + type ty, bool bias) + : NB_(B), NC_(NC), + AD_(D), AH_(H), AW_(W), + BD_(T), BH_(R), BW_(S), + NF_(NF), + shift_h_(shift_h), shift_w_(shift_w), + a_ty_(a_ty), b_ty_(b_ty), + ty_(ty), bias_(bias) { + // equivalent matmul + M_ = NB_*AH_*AW_; + N_ = NF_; + K_ = NC_; + // shapes + // input layout: C, H, W, BS + // filter layout: C, K + // output layout: K, H, W, BS + shapes_a_ = {NC, H, W, B}; + shapes_b_ = {NC, NF}; + shapes_c_ = {NF, H, W, B}; + // memory strides + set_ld(shapes_a_, ld_a_); +} + +void shift::build_deltas() { + h_deltas_.resize(NC_); + for(unsigned c = 0; c < NC_; c++){ + h_deltas_[c] = c*ld_a_[0]; + h_deltas_[c] += shift_h_[c]*ld_a_[1]; + h_deltas_[c] += shift_w_[c]*ld_a_[2]; + } +} + +void shift::build_masks() { + size_t S0 = NC_; + size_t S1 = BH_; + size_t S2 = BW_; + h_masks_.resize(S0*S1*S2); + for(size_t ph = 0; ph < S1; ++ph) + for(size_t pw = 0; pw < S2; ++pw){ + int32_t* ptr = &h_masks_[ph*S0 + pw*S0*S1]; + for(size_t i = 0; i < S0; ++i){ + bool in_bounds_h = shift_h_[i] + ph >= 0 && shift_h_[i] + ph < BH_; + bool in_bounds_w = shift_w_[i] + pw >= 0 && shift_w_[i] + pw < BW_; + ptr[i] = in_bounds_h && in_bounds_w; + } + } +} + +size_t shift::a_size(){ + return std::accumulate(shapes_a_.begin(), shapes_a_.end(), + 1, std::multiplies()); +} + +size_t shift::b_size(){ + return std::accumulate(shapes_b_.begin(), shapes_b_.end(), + 1, std::multiplies()); +} + +size_t shift::c_size(){ + return std::accumulate(shapes_c_.begin(), shapes_c_.end(), + 1, std::multiplies()); +} + +std::vector shift::c_shapes(){ + return shapes_c_; +} + +size_t shift::get_nflops() { + return 2 * M_ * N_ * K_; +} + + +void shift::init(driver::stream *stream, driver::cu_module *module) { + triton::driver::buffer* delta = ((triton::driver::cu_module*)module)->symbol("delta"); + triton::driver::buffer* masks = ((triton::driver::cu_module*)module)->symbol("masks"); + stream->write(delta, false, 0, h_deltas_.size()*4, h_deltas_.data()); + stream->write(masks, false, 0, h_masks_.size()*4, h_masks_.data()); +} + +void shift::enqueue(driver::stream *stream, driver::kernel *kernel, + driver::buffer *a, driver::buffer *b, driver::buffer *c, + size_t TM, size_t TN, size_t nthreads) { + kernel->setArg(0, a); + kernel->setArg(1, b); + kernel->setArg(2, c); + kernel->setArg(3, M_); + kernel->setArg(4, N_); + kernel->setArg(5, K_); + kernel->setArg(6, NB_); + kernel->setArg(7, AH_); + kernel->setArg(8, AW_); + kernel->setArg(9, BH_); + kernel->setArg(10, BW_); + // dry run + std::array grid = {(M_ + TM - 1)/TM, (N_ + TN - 1)/TN, 1}; + stream->enqueue(kernel, grid, {nthreads, 1, 1}); +} + +void shift::src(std::ostream &os) { + os << +R"( +const tunable int32 TM = {16, 32, 64, 
128}; +const tunable int32 TN = {16, 32, 64, 128}; +const tunable int32 TK = {8}; + +__constant__ int32* delta = alloc_const int32[256]; +__constant__ int32* masks = alloc_const int32[8192]; + +void shift(restrict read_only fp32 *a, restrict read_only fp32 *b, fp32 *c, + int32 M, int32 N, int32 K, + int32 ABS, int32 AH, int32 AW, int32 AR, int32 AS){ + int32 rxa[TM] = get_global_range[TM](0); + int32 ryb[TN] = get_global_range[TN](1); + int32 rka[TK] = 0 ... TK; + int32 rkb[TK] = 0 ... TK; + fp32 C[TM, TN] = 0; + fp32* pxa[TM, TK] = a + rxa[:, newaxis]; + fp32* pb[TN, TK] = b + rkb[newaxis, :]*N + ryb[:, newaxis]; + __constant__ int32* pd[TK] = delta + rka; + int32 pad_h = AR/2; + int32 pad_w = AS/2; + int32 rawhc[TM] = rxa / ABS; + int32 raw[TM] = rawhc % AW - pad_w; + int32 rahc[TM] = rawhc / AW; + int32 rah[TM] = rahc % AH - pad_h; + int32 maskh[TM] = pad_h + min(rah, 0) + max(rah + AR - AH, 0); + int32 maskw[TM] = pad_w + min(raw, 0) + max(raw + AS - AW, 0); + __constant__ int32* pxm[TM] = masks + maskh*K + maskw*K*(2*pad_h + 1); + __constant__ int32* pm[TM, TK] = pxm[:, newaxis] + rka[newaxis, :]; + for(int32 k = K; k > 0; k = k - TK){ + int32 delta[TK] = *pd; + fp32 *pa[TM, TK] = pxa + delta[newaxis, :]; + int1 m[TM, TK] = *pm > 0; + fp32 a[TM, TK] = m ? *pa : 0; + fp32 b[TN, TK] = *pb; + C = dot(a, trans(b), C); + pb = pb + TK*N; + pd = pd + TK; + pm = pm + TK; + } + int32 rxc[TM] = get_global_range[TM](0); + int32 ryc[TN] = get_global_range[TN](1); + fp32* pc[TM, TN] = c + ryc[newaxis, :]*M + rxc[:, newaxis]; + int1 checkc0[TM] = rxc < M; + int1 checkc1[TN] = ryc < N; + int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; + @checkc *pc = C; +} +)"; +} + +} +} From d8526669f5a656e7869625c110521d637b5be850 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 27 Jun 2019 12:39:17 -0700 Subject: [PATCH 195/494] fixup --- examples/cpp/shift.cpp | 2 +- lib/dnn/shift.cpp | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index ed949c74c..0b523f826 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -99,7 +99,7 @@ int main() { // shift std::vector params = { - 8, 2, 16, 8, 2, 32, 8, 4, 2, 2, 4, 2, 8, 4 + 8, 2, 32, 8, 2, 64, 8, 4, 2, 2, 4, 2, 8, 4 }; std::ostringstream oss; shift.src(oss); diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index d07809f1f..1fc3645ec 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -8,8 +8,7 @@ void shift::set_ld(const std::vector& shapes, std::vector& ld) { size_t size = shapes.size(); ld.resize(size); - ld[4] = 1; - ld[3] = shapes[4]*ld[4]; + ld[3] = 1; ld[2] = shapes[3]*ld[3]; ld[1] = shapes[2]*ld[2]; ld[0] = shapes[1]*ld[1]; @@ -42,6 +41,9 @@ shift::shift(int B, int NC, shapes_c_ = {NF, H, W, B}; // memory strides set_ld(shapes_a_, ld_a_); + // build LUTs + build_deltas(); + build_masks(); } void shift::build_deltas() { @@ -89,7 +91,7 @@ std::vector shift::c_shapes(){ } size_t shift::get_nflops() { - return 2 * M_ * N_ * K_; + return 2. 
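+  // note: the literal 2. promotes the product to double; the previous 2 * M_ * N_ * K_
+  // can overflow 32-bit arithmetic for large problem sizes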
* M_ * N_ * K_; } From 12e6036e5f4282ecd19b4c4c1a2a37d2d505a181 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 27 Jun 2019 14:13:48 -0700 Subject: [PATCH 196/494] trying interior shift --- examples/cpp/shift.cpp | 33 +-------------------------------- include/triton/dnn/shift.h | 11 +++++++---- lib/dnn/shift.cpp | 27 ++++++++++++--------------- 3 files changed, 20 insertions(+), 51 deletions(-) diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index 0b523f826..e23ba5c9c 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -7,37 +7,6 @@ #include "triton/tools/bench.hpp" #include "triton/dnn/shift.h" -// input layout: C, H, W, BS -// filter layout: C, K -// output layout: K, H, W, BS -template -void shift_conv(int32_t C, int32_t H, int32_t W, int32_t BS, - int32_t K, - std::vector& O, - const std::vector& I, - const std::vector& F, - const std::vector shift_h, - const std::vector shift_w) -{ - OUT_DTYPE acc; - for(int32_t p = 0; p < H; ++p) - for(int32_t q = 0; q < W; ++q) - for(int32_t bs = 0; bs < BS; ++bs) - for(int32_t k = 0; k < K; ++k) - { - acc = 0; - for(int32_t c = 0; c < C; ++c){ - int32_t h = p + shift_h[c]; - int32_t w = q + shift_w[c]; - bool in_bounds = (h >= 0 && w >= 0 && h < H && w < W); - IN_DTYPE a = in_bounds?I[bs + w*BS + h*BS*W + c*BS*H*W]:0; - IN_DTYPE b = F[k + c*K]; - acc = std::fma(a, b, acc); - } - O[bs + q*BS + p*BS*W + k*BS*H*W] = acc; - } -} - int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); @@ -104,7 +73,7 @@ int main() { std::ostringstream oss; shift.src(oss); std::string src = oss.str(); -// jit.autotune("shift", src.c_str(), benchmark); + jit.autotune("shift", src.c_str(), benchmark); jit.add_module("shift", src.c_str(), params); triton::driver::kernel* kernel = jit.get_function("shift"); triton::jit::launch_information info = jit.get_launch_info("shift"); diff --git a/include/triton/dnn/shift.h b/include/triton/dnn/shift.h index 6d6bda9de..9b81bf6ad 100644 --- a/include/triton/dnn/shift.h +++ b/include/triton/dnn/shift.h @@ -90,10 +90,13 @@ public: { acc = 0; for(int32_t c = 0; c < NC_; ++c){ - int32_t h = p + shift_h_[c]; - int32_t w = q + shift_w_[c]; - bool in_bounds = (h >= 0 && w >= 0 && h < AH_ && w < AW_); - IN_DTYPE a = in_bounds?I[bs + w*NB_ + h*NB_*AW_ + c*NB_*AH_*AW_]:0; + int32_t h = p; + int32_t w = q; + if(h >= BH_/2 && h < AH_ - BH_/2) + h += shift_h_[c]; + if(w > BW_/2 && w < AW_ - BW_/2) + w += shift_w_[c]; + IN_DTYPE a = I[bs + w*NB_ + h*NB_*AW_ + c*NB_*AH_*AW_]; IN_DTYPE b = F[k + c*NF_]; acc = std::fma(a, b, acc); } diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index 1fc3645ec..dbaa3f496 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -47,7 +47,7 @@ shift::shift(int B, int NC, } void shift::build_deltas() { - h_deltas_.resize(NC_); + h_deltas_ = std::vector(512, 0); for(unsigned c = 0; c < NC_; c++){ h_deltas_[c] = c*ld_a_[0]; h_deltas_[c] += shift_h_[c]*ld_a_[1]; @@ -128,12 +128,12 @@ const tunable int32 TM = {16, 32, 64, 128}; const tunable int32 TN = {16, 32, 64, 128}; const tunable int32 TK = {8}; -__constant__ int32* delta = alloc_const int32[256]; +__constant__ int32* delta = alloc_const int32[512]; __constant__ int32* masks = alloc_const int32[8192]; void shift(restrict read_only fp32 *a, restrict read_only fp32 *b, fp32 *c, int32 M, int32 N, int32 K, - int32 ABS, int32 AH, int32 AW, int32 AR, int32 AS){ + int32 ABS, int32 AH, int32 AW, int32 AR, int32 AS) { int32 rxa[TM] = get_global_range[TM](0); int32 ryb[TN] = 
get_global_range[TN](1); int32 rka[TK] = 0 ... TK; @@ -141,27 +141,24 @@ void shift(restrict read_only fp32 *a, restrict read_only fp32 *b, fp32 *c, fp32 C[TM, TN] = 0; fp32* pxa[TM, TK] = a + rxa[:, newaxis]; fp32* pb[TN, TK] = b + rkb[newaxis, :]*N + ryb[:, newaxis]; - __constant__ int32* pd[TK] = delta + rka; int32 pad_h = AR/2; int32 pad_w = AS/2; int32 rawhc[TM] = rxa / ABS; - int32 raw[TM] = rawhc % AW - pad_w; + int32 raw[TM] = rawhc % AW; int32 rahc[TM] = rawhc / AW; - int32 rah[TM] = rahc % AH - pad_h; - int32 maskh[TM] = pad_h + min(rah, 0) + max(rah + AR - AH, 0); - int32 maskw[TM] = pad_w + min(raw, 0) + max(raw + AS - AW, 0); - __constant__ int32* pxm[TM] = masks + maskh*K + maskw*K*(2*pad_h + 1); - __constant__ int32* pm[TM, TK] = pxm[:, newaxis] + rka[newaxis, :]; + int32 rah[TM] = rahc % AH; + int1 maskh[TM] = (rah >= pad_h) && (rah < (AH - pad_h)); + int1 maskw[TM] = (raw >= pad_w) && (raw < (AW - pad_w)); + int32 offd[TM] = (maskh && maskw) ? 0 : 256; + __constant__ int32* pd[TM, TK] = delta + rka[newaxis, :] + offd[:, newaxis]; for(int32 k = K; k > 0; k = k - TK){ - int32 delta[TK] = *pd; - fp32 *pa[TM, TK] = pxa + delta[newaxis, :]; - int1 m[TM, TK] = *pm > 0; - fp32 a[TM, TK] = m ? *pa : 0; + int32 delta[TM, TK] = *pd; + fp32 *pa[TM, TK] = pxa + delta; + fp32 a[TM, TK] = *pa; fp32 b[TN, TK] = *pb; C = dot(a, trans(b), C); pb = pb + TK*N; pd = pd + TK; - pm = pm + TK; } int32 rxc[TM] = get_global_range[TM](0); int32 ryc[TN] = get_global_range[TN](1); From f4dedb522c51d10ad5a82cf163c54e47ec1ae110 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 27 Jun 2019 17:05:48 -0700 Subject: [PATCH 197/494] fixup --- lib/codegen/tune.cpp | 10 ++++++---- lib/lang/expression.cpp | 3 ++- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 72267b23f..0b4367052 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -51,7 +51,6 @@ void tune::init_c_phi(ir::instruction *v) { } void tune::init_c_graph(ir::instruction *v) { - // Reference shape ir::type::tile_shapes_t::value_type one = ir::tile_type::make_one(v->get_parent()->get_context()); ir::type::tile_shapes_t shapes; @@ -59,8 +58,10 @@ void tune::init_c_graph(ir::instruction *v) { shapes = store->get_pointer_operand()->get_type()->get_tile_shapes(); else if(auto *downcast = dynamic_cast(v)) return; - else + else{ +// std::cout << v->get_name() << std::endl; shapes = v->get_type()->get_tile_shapes(); + } // Reshape if(dynamic_cast(v)){ ir::value *op = v->get_operand(0); @@ -102,13 +103,14 @@ void tune::init_c_graph(ir::instruction *v) { } // Element-wise else if(dynamic_cast(v)) { - for(unsigned k = 0; k < v->get_num_results(); k++) + for(unsigned k = 0; k < v->get_num_results(); k++){ + ir::value *result = v->get_result(k); for(unsigned i = 0; i < shapes.size(); i ++){ - ir::value *result = v->get_result(k); for(ir::value* op: v->ops()){ add_constraint({result, i}, {op, i}); } } + } } } diff --git a/lib/lang/expression.cpp b/lib/lang/expression.cpp index 87f6a8194..6054a2694 100644 --- a/lib/lang/expression.cpp +++ b/lib/lang/expression.cpp @@ -248,7 +248,8 @@ ir::value *conditional_expression::codegen(ir::module *mod) const{ ir::value *uncasted_true_value = true_value; ir::value *uncasted_false_value = false_value; implicit_cast(builder, true_value, false_value, is_float, is_ptr, is_int, is_signed); - implicit_broadcast(mod, true_value, false_value); + implicit_broadcast(mod, pred, true_value); + implicit_broadcast(mod, pred, false_value); { ir::value 
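  // note: each branch is now broadcast against the predicate rather than against the
  // other branch, so pred, true_value and false_value end up with one common tile shape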
*current = true_value; while(current != uncasted_true_value) { From 21fd0fd65e12aa2f9dcb6fa749099e205865b0af Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 28 Jun 2019 11:13:36 -0700 Subject: [PATCH 198/494] fixup --- examples/cpp/shift.cpp | 6 +++--- include/triton/dnn/shift.h | 5 +++-- lib/dnn/shift.cpp | 3 +++ 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index e23ba5c9c..fa7714782 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -68,12 +68,12 @@ int main() { // shift std::vector params = { - 8, 2, 32, 8, 2, 64, 8, 4, 2, 2, 4, 2, 8, 4 + 4, 2, 32, 8, 2, 32, 8, 4, 2, 2, 8, 8, 4 }; std::ostringstream oss; shift.src(oss); std::string src = oss.str(); - jit.autotune("shift", src.c_str(), benchmark); +// jit.autotune("shift", src.c_str(), benchmark); jit.add_module("shift", src.c_str(), params); triton::driver::kernel* kernel = jit.get_function("shift"); triton::jit::launch_information info = jit.get_launch_info("shift"); @@ -81,7 +81,7 @@ int main() { stream->read(dc, true, 0, hc); shift.cpu_ref(rc.data(), ha.data(), hb.data()); for(size_t i = 0; i < hc.size(); i++) - if(std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ + if(std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; exit(EXIT_FAILURE); } diff --git a/include/triton/dnn/shift.h b/include/triton/dnn/shift.h index 9b81bf6ad..cec282d34 100644 --- a/include/triton/dnn/shift.h +++ b/include/triton/dnn/shift.h @@ -92,10 +92,11 @@ public: for(int32_t c = 0; c < NC_; ++c){ int32_t h = p; int32_t w = q; - if(h >= BH_/2 && h < AH_ - BH_/2) + if(h >= BH_/2 && h < AH_ - BH_/2 + && w >= BW_/2 && w < AW_ - BW_/2){ h += shift_h_[c]; - if(w > BW_/2 && w < AW_ - BW_/2) w += shift_w_[c]; + } IN_DTYPE a = I[bs + w*NB_ + h*NB_*AW_ + c*NB_*AH_*AW_]; IN_DTYPE b = F[k + c*NF_]; acc = std::fma(a, b, acc); diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index dbaa3f496..87b158648 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -53,6 +53,9 @@ void shift::build_deltas() { h_deltas_[c] += shift_h_[c]*ld_a_[1]; h_deltas_[c] += shift_w_[c]*ld_a_[2]; } + for(unsigned c = 0; c < NC_; c++){ + h_deltas_[c + 256] = c*ld_a_[0]; + } } void shift::build_masks() { From a567f3f8a8e024437f7f06487a5d2a921b038275 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 28 Jun 2019 15:10:39 -0700 Subject: [PATCH 199/494] more cleaning --- examples/cpp/shift.cpp | 2 +- lib/dnn/shift.cpp | 49 ++++++++++++++++-------------------------- 2 files changed, 20 insertions(+), 31 deletions(-) diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index fa7714782..90aeaa595 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -68,7 +68,7 @@ int main() { // shift std::vector params = { - 4, 2, 32, 8, 2, 32, 8, 4, 2, 2, 8, 8, 4 + 4, 2, 16, 8, 2, 64, 4, 8, 2, 2, 4, 8, 8 }; std::ostringstream oss; shift.src(oss); diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index 87b158648..0eae63ddc 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -43,34 +43,24 @@ shift::shift(int B, int NC, set_ld(shapes_a_, ld_a_); // build LUTs build_deltas(); - build_masks(); } void shift::build_deltas() { + // compute offset + auto offset = [&](unsigned c) { + return c*ld_a_[0] + shift_h_[c]*ld_a_[1] + shift_w_[c]*ld_a_[2]; + }; + // allocate look-up table + size_t TK = 8; h_deltas_ = std::vector(512, 0); - for(unsigned c = 0; c < NC_; c++){ - h_deltas_[c] = c*ld_a_[0]; - h_deltas_[c] 
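   // each delta folds the channel stride and the per-channel (shift_h, shift_w)
   // displacement into a single pointer increment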
+= shift_h_[c]*ld_a_[1]; - h_deltas_[c] += shift_w_[c]*ld_a_[2]; + // populate look-up table + for(unsigned c = 0; c < TK; c++){ + h_deltas_[c] = offset(c); // init (shift) + h_deltas_[c + 256] = c*ld_a_[0]; // init (no shift) } for(unsigned c = 0; c < NC_; c++){ - h_deltas_[c + 256] = c*ld_a_[0]; - } -} - -void shift::build_masks() { - size_t S0 = NC_; - size_t S1 = BH_; - size_t S2 = BW_; - h_masks_.resize(S0*S1*S2); - for(size_t ph = 0; ph < S1; ++ph) - for(size_t pw = 0; pw < S2; ++pw){ - int32_t* ptr = &h_masks_[ph*S0 + pw*S0*S1]; - for(size_t i = 0; i < S0; ++i){ - bool in_bounds_h = shift_h_[i] + ph >= 0 && shift_h_[i] + ph < BH_; - bool in_bounds_w = shift_w_[i] + pw >= 0 && shift_w_[i] + pw < BW_; - ptr[i] = in_bounds_h && in_bounds_w; - } + h_deltas_[TK + c] = offset(c + TK) - offset(c); // deltas (shift) + h_deltas_[TK + c + 256] = TK*ld_a_[0]; // deltas (shift) } } @@ -100,9 +90,7 @@ size_t shift::get_nflops() { void shift::init(driver::stream *stream, driver::cu_module *module) { triton::driver::buffer* delta = ((triton::driver::cu_module*)module)->symbol("delta"); - triton::driver::buffer* masks = ((triton::driver::cu_module*)module)->symbol("masks"); stream->write(delta, false, 0, h_deltas_.size()*4, h_deltas_.data()); - stream->write(masks, false, 0, h_masks_.size()*4, h_masks_.data()); } void shift::enqueue(driver::stream *stream, driver::kernel *kernel, @@ -132,9 +120,10 @@ const tunable int32 TN = {16, 32, 64, 128}; const tunable int32 TK = {8}; __constant__ int32* delta = alloc_const int32[512]; -__constant__ int32* masks = alloc_const int32[8192]; -void shift(restrict read_only fp32 *a, restrict read_only fp32 *b, fp32 *c, +void shift(restrict read_only align(16) fp32 *a, + restrict read_only align(16) fp32 *b, + fp32 *c, int32 M, int32 N, int32 K, int32 ABS, int32 AH, int32 AW, int32 AR, int32 AS) { int32 rxa[TM] = get_global_range[TM](0); @@ -142,7 +131,6 @@ void shift(restrict read_only fp32 *a, restrict read_only fp32 *b, fp32 *c, int32 rka[TK] = 0 ... TK; int32 rkb[TK] = 0 ... TK; fp32 C[TM, TN] = 0; - fp32* pxa[TM, TK] = a + rxa[:, newaxis]; fp32* pb[TN, TK] = b + rkb[newaxis, :]*N + ryb[:, newaxis]; int32 pad_h = AR/2; int32 pad_w = AS/2; @@ -152,16 +140,17 @@ void shift(restrict read_only fp32 *a, restrict read_only fp32 *b, fp32 *c, int32 rah[TM] = rahc % AH; int1 maskh[TM] = (rah >= pad_h) && (rah < (AH - pad_h)); int1 maskw[TM] = (raw >= pad_w) && (raw < (AW - pad_w)); - int32 offd[TM] = (maskh && maskw) ? 0 : 256; + int1 mask[TM] = maskh && maskw; + int32 offd[TM] = mask ? 
0 : 256; __constant__ int32* pd[TM, TK] = delta + rka[newaxis, :] + offd[:, newaxis]; + fp32* pa[TM, TK] = a + rxa[:, newaxis] + (*pd); for(int32 k = K; k > 0; k = k - TK){ - int32 delta[TM, TK] = *pd; - fp32 *pa[TM, TK] = pxa + delta; fp32 a[TM, TK] = *pa; fp32 b[TN, TK] = *pb; C = dot(a, trans(b), C); pb = pb + TK*N; pd = pd + TK; + pa = pa + (*pd); } int32 rxc[TM] = get_global_range[TM](0); int32 ryc[TN] = get_global_range[TN](1); From ab1afbf0825c94640a870f458513aeaf218c732e Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 28 Jun 2019 17:04:07 -0700 Subject: [PATCH 200/494] more performance optimizations --- examples/cpp/shift.cpp | 8 +++--- include/triton/dnn/shift.h | 1 + lib/dnn/shift.cpp | 58 +++++++++++++++++++++----------------- 3 files changed, 37 insertions(+), 30 deletions(-) diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index 90aeaa595..f8d0b3ed9 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -14,9 +14,9 @@ int main() { triton::jit jit(context); // initialization int32_t R = 3, S = 3; - int32_t BS = 4, F = 128; + int32_t BS = 4, F = 512; int32_t H = 32, W = 32; - int32_t C = 128; + int32_t C = 512; // random shifts std::vector shift_h(C); std::vector shift_w(C); @@ -68,12 +68,12 @@ int main() { // shift std::vector params = { - 4, 2, 16, 8, 2, 64, 4, 8, 2, 2, 4, 8, 8 + 32, 2, 128, 16, 2, 128, 16, 8, 2, 2, 4, 2, 8, 8 }; std::ostringstream oss; shift.src(oss); std::string src = oss.str(); -// jit.autotune("shift", src.c_str(), benchmark); + jit.autotune("shift", src.c_str(), benchmark); jit.add_module("shift", src.c_str(), params); triton::driver::kernel* kernel = jit.get_function("shift"); triton::jit::launch_information info = jit.get_launch_info("shift"); diff --git a/include/triton/dnn/shift.h b/include/triton/dnn/shift.h index cec282d34..99a173112 100644 --- a/include/triton/dnn/shift.h +++ b/include/triton/dnn/shift.h @@ -106,6 +106,7 @@ public: } private: + int32_t MAX_C_; // image size int32_t NB_; int32_t NC_; diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index 0eae63ddc..330fd9ec8 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -28,6 +28,8 @@ shift::shift(int B, int NC, shift_h_(shift_h), shift_w_(shift_w), a_ty_(a_ty), b_ty_(b_ty), ty_(ty), bias_(bias) { + // max number of channels + MAX_C_ = 1024; // equivalent matmul M_ = NB_*AH_*AW_; N_ = NF_; @@ -52,16 +54,12 @@ void shift::build_deltas() { }; // allocate look-up table size_t TK = 8; - h_deltas_ = std::vector(512, 0); + h_deltas_.resize(MAX_C_); // populate look-up table - for(unsigned c = 0; c < TK; c++){ - h_deltas_[c] = offset(c); // init (shift) - h_deltas_[c + 256] = c*ld_a_[0]; // init (no shift) - } - for(unsigned c = 0; c < NC_; c++){ - h_deltas_[TK + c] = offset(c + TK) - offset(c); // deltas (shift) - h_deltas_[TK + c + 256] = TK*ld_a_[0]; // deltas (shift) - } + for(unsigned c = 0; c < TK; c++) + h_deltas_[c] = offset(c); + for(unsigned c = 0; c < NC_; c++) + h_deltas_[TK + c] = offset(c + TK) - offset(c); } size_t shift::a_size(){ @@ -102,11 +100,12 @@ void shift::enqueue(driver::stream *stream, driver::kernel *kernel, kernel->setArg(3, M_); kernel->setArg(4, N_); kernel->setArg(5, K_); - kernel->setArg(6, NB_); - kernel->setArg(7, AH_); - kernel->setArg(8, AW_); - kernel->setArg(9, BH_); - kernel->setArg(10, BW_); + kernel->setArg(6, NB_*AH_*AW_); + kernel->setArg(7, NB_); + kernel->setArg(8, AH_); + kernel->setArg(9, AW_); + kernel->setArg(10, BH_); + kernel->setArg(11, BW_); // dry run std::array grid = {(M_ + TM - 1)/TM, (N_ + TN - 1)/TN, 
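   // one program instance per (TM x TN) tile of the M x N output, hence the ceil-divisions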
1}; stream->enqueue(kernel, grid, {nthreads, 1, 1}); @@ -119,19 +118,19 @@ const tunable int32 TM = {16, 32, 64, 128}; const tunable int32 TN = {16, 32, 64, 128}; const tunable int32 TK = {8}; -__constant__ int32* delta = alloc_const int32[512]; +__constant__ int32* delta = alloc_const int32[)" << MAX_C_ << R"(]; -void shift(restrict read_only align(16) fp32 *a, - restrict read_only align(16) fp32 *b, +void shift(restrict read_only align(16) )" << a_ty_ << R"( *a, + restrict read_only align(16) )" << b_ty_ << R"( *b, fp32 *c, int32 M, int32 N, int32 K, + int32 lda, int32 ABS, int32 AH, int32 AW, int32 AR, int32 AS) { int32 rxa[TM] = get_global_range[TM](0); int32 ryb[TN] = get_global_range[TN](1); int32 rka[TK] = 0 ... TK; int32 rkb[TK] = 0 ... TK; fp32 C[TM, TN] = 0; - fp32* pb[TN, TK] = b + rkb[newaxis, :]*N + ryb[:, newaxis]; int32 pad_h = AR/2; int32 pad_w = AS/2; int32 rawhc[TM] = rxa / ABS; @@ -140,17 +139,24 @@ void shift(restrict read_only align(16) fp32 *a, int32 rah[TM] = rahc % AH; int1 maskh[TM] = (rah >= pad_h) && (rah < (AH - pad_h)); int1 maskw[TM] = (raw >= pad_w) && (raw < (AW - pad_w)); - int1 mask[TM] = maskh && maskw; - int32 offd[TM] = mask ? 0 : 256; - __constant__ int32* pd[TM, TK] = delta + rka[newaxis, :] + offd[:, newaxis]; - fp32* pa[TM, TK] = a + rxa[:, newaxis] + (*pd); - for(int32 k = K; k > 0; k = k - TK){ - fp32 a[TM, TK] = *pa; - fp32 b[TN, TK] = *pb; + int1 mask[TM, TK] = maskh[:, newaxis] && maskw[:, newaxis]; + __constant__ int32* pd[TK] = delta + rka; + int32 d[TK] = *pd; + int32 offa1[TK] = rka*lda; + int32 inc[TM, TK] = mask ? d[newaxis, :] : offa1[newaxis, :]; + )" << a_ty_ << R"(* pa[TM, TK] = a + rxa[:, newaxis] + inc; + )" << b_ty_ << R"(* pb[TN, TK] = b + rkb[newaxis, :]*N + ryb[:, newaxis]; + )" << a_ty_ << R"( a[TM, TK] = *pa; + )" << b_ty_ << R"( b[TN, TK] = *pb; + for(int32 k = K; k > TK; k = k - TK){ C = dot(a, trans(b), C); pb = pb + TK*N; pd = pd + TK; - pa = pa + (*pd); + d = *pd; + inc = mask ? d[newaxis, :] : TK*lda; + pa = pa + inc; + a = *pa; + b = *pb; } int32 rxc[TM] = get_global_range[TM](0); int32 ryc[TN] = get_global_range[TN](1); From 83b753512cd313000eb4ee4ce54a2b824aa85cfa Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 28 Jun 2019 17:17:50 -0700 Subject: [PATCH 201/494] prefetching with shift --- lib/dnn/shift.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index 330fd9ec8..a69049317 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -148,15 +148,16 @@ void shift(restrict read_only align(16) )" << a_ty_ << R"( *a, )" << b_ty_ << R"(* pb[TN, TK] = b + rkb[newaxis, :]*N + ryb[:, newaxis]; )" << a_ty_ << R"( a[TM, TK] = *pa; )" << b_ty_ << R"( b[TN, TK] = *pb; - for(int32 k = K; k > TK; k = k - TK){ + for(int32 k = K; k > 0; k = k - TK){ C = dot(a, trans(b), C); pb = pb + TK*N; pd = pd + TK; d = *pd; - inc = mask ? d[newaxis, :] : TK*lda; - pa = pa + inc; - a = *pa; - b = *pb; + pa = pa + (mask ? 
d[newaxis, :] : TK*lda); + int1 checka[TM, TK] = k > TK; + int1 checkb[TN, TK] = k > TK; + @checka a = *pa; + @checkb b = *pb; } int32 rxc[TM] = get_global_range[TM](0); int32 ryc[TN] = get_global_range[TN](1); From d8c3d58593e1e991007de04e0a860994bd58ed58 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 28 Jun 2019 20:22:52 -0700 Subject: [PATCH 202/494] more optimization --- examples/cpp/shift.cpp | 42 +++++++++++++++++++++----------------- include/triton/dnn/shift.h | 1 + lib/codegen/selection.cpp | 1 + lib/codegen/tune.cpp | 6 +++--- lib/dnn/shift.cpp | 17 ++++++++------- 5 files changed, 36 insertions(+), 31 deletions(-) diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index f8d0b3ed9..8251be322 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -6,17 +6,21 @@ #include "triton/driver/stream.h" #include "triton/tools/bench.hpp" #include "triton/dnn/shift.h" +#include "triton/external/half.hpp" int main() { + typedef half_float::half NumericT; + std::string numeric_t_str = "fp16"; + // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); // initialize just-in-time compiler triton::jit jit(context); // initialization int32_t R = 3, S = 3; - int32_t BS = 4, F = 512; + int32_t BS = 32, F = 1024; int32_t H = 32, W = 32; - int32_t C = 512; + int32_t C = 1024; // random shifts std::vector shift_h(C); std::vector shift_w(C); @@ -25,23 +29,23 @@ int main() { shift_w[c] = rand() % S - S/2; } // configuration - triton::dnn::shift shift(BS, C, 1, H, W, 1, R, S, F, shift_h, shift_w); + triton::dnn::shift shift(BS, C, 1, H, W, 1, R, S, F, shift_h, shift_w, numeric_t_str, numeric_t_str); // host buffers std::vector hc(shift.c_size()); std::vector rc(shift.c_size()); - std::vector ha(shift.a_size()); - std::vector hb(shift.b_size()); + std::vector ha(shift.a_size()); + std::vector hb(shift.b_size()); // device buffers triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*4); - triton::driver::buffer* da = triton::driver::buffer::create(context, ha.size()*4); - triton::driver::buffer* db = triton::driver::buffer::create(context, hb.size()*4); + triton::driver::buffer* da = triton::driver::buffer::create(context, ha.size()*sizeof(NumericT)); + triton::driver::buffer* db = triton::driver::buffer::create(context, hb.size()*sizeof(NumericT)); triton::driver::stream* stream = triton::driver::stream::create(context); // initialize host srand(0); for(size_t i = 0; i < ha.size(); i++) - ha[i] = (float)rand() / RAND_MAX; + ha[i] = (NumericT)rand() / RAND_MAX; for(size_t i = 0; i < hb.size(); i++) - hb[i] = (float)rand() / RAND_MAX; + hb[i] = (NumericT)rand() / RAND_MAX; for(size_t i = 0; i < hc.size(); i++) hc[i] = 0; // initialize device @@ -68,23 +72,23 @@ int main() { // shift std::vector params = { - 32, 2, 128, 16, 2, 128, 16, 8, 2, 2, 4, 2, 8, 8 + 16, 4, 64, 16, 4, 128, 2, 2, 1, 2, 4, 4, 16, 4 }; std::ostringstream oss; shift.src(oss); std::string src = oss.str(); - jit.autotune("shift", src.c_str(), benchmark); +// jit.autotune("shift", src.c_str(), benchmark); jit.add_module("shift", src.c_str(), params); triton::driver::kernel* kernel = jit.get_function("shift"); triton::jit::launch_information info = jit.get_launch_info("shift"); std::cout << "Performance: " << benchmark(kernel, info) << " TFLOPS " << std::endl; - stream->read(dc, true, 0, hc); - shift.cpu_ref(rc.data(), ha.data(), hb.data()); - for(size_t i = 0; i < hc.size(); i++) - if(std::isnan(hc[i]) || std::abs(hc[i] - 
rc[i])/std::max(hc[i], rc[i]) > 1e-4){ - std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; - exit(EXIT_FAILURE); - } - std::cout << "Pass!" << std::endl; +// stream->read(dc, true, 0, hc); +// shift.cpu_ref(rc.data(), ha.data(), hb.data()); +// for(size_t i = 0; i < hc.size(); i++) +// if(std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ +// std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; +// exit(EXIT_FAILURE); +// } +// std::cout << "Pass!" << std::endl; } diff --git a/include/triton/dnn/shift.h b/include/triton/dnn/shift.h index 99a173112..1b407aa43 100644 --- a/include/triton/dnn/shift.h +++ b/include/triton/dnn/shift.h @@ -107,6 +107,7 @@ public: private: int32_t MAX_C_; + int32_t TK_; // image size int32_t NB_; int32_t NC_; diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index b066a963a..d0f6f825e 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -1042,6 +1042,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & unsigned max_contiguous = axis_info_->get_max_contiguous(ptr); unsigned alignment = std::min(starting_multiple, max_contiguous); unsigned vector_size = std::min(result->axis(0).contiguous, alignment); +// vector_size = result->axis(0).contiguous; std::map packets; distributed_tile *TP = (distributed_tile*)tmap_.at(ld->get_pointer_operand()); result->for_each([&](indices_t idx){ diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 0b4367052..ac56bd5ed 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -235,13 +235,13 @@ void tune::run(ir::module &mod) { continue; if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 2, 2)); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 4, 4)); *params_.at(i).at("nts.d0") = *tmp; } if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 2, 2)); - std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 2, 2)); + std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 4, 4)); + std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 4, 4)); *params_.at(i).at("nts.d0") = *tmp1; *params_.at(i).at("nts.d1") = *tmp2; } diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index a69049317..6a71b6c33 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -29,7 +29,8 @@ shift::shift(int B, int NC, a_ty_(a_ty), b_ty_(b_ty), ty_(ty), bias_(bias) { // max number of channels - MAX_C_ = 1024; + TK_ = 16; + MAX_C_ = 8192 + TK_; // equivalent matmul M_ = NB_*AH_*AW_; N_ = NF_; @@ -52,14 +53,12 @@ void shift::build_deltas() { auto offset = [&](unsigned c) { return c*ld_a_[0] + shift_h_[c]*ld_a_[1] + shift_w_[c]*ld_a_[2]; }; - // allocate look-up table - size_t TK = 8; h_deltas_.resize(MAX_C_); // populate look-up table - for(unsigned c = 0; c < TK; c++) + for(unsigned c = 0; c < TK_; c++) h_deltas_[c] = offset(c); for(unsigned c = 0; c < NC_; c++) - h_deltas_[TK + c] = offset(c + TK) - offset(c); + h_deltas_[TK_ + c] = offset(c + TK_) - offset(c); } size_t shift::a_size(){ @@ -116,14 +115,14 @@ void shift::src(std::ostream &os) { R"( const tunable int32 TM = {16, 32, 64, 128}; const tunable int32 TN = {16, 32, 64, 128}; -const tunable int32 TK = {8}; +const tunable int32 TK = {)" << TK_ << R"(}; __constant__ int32* delta = alloc_const int32[)" << MAX_C_ << R"(]; void shift(restrict read_only 
align(16) )" << a_ty_ << R"( *a, restrict read_only align(16) )" << b_ty_ << R"( *b, fp32 *c, - int32 M, int32 N, int32 K, + multiple_of(4) int32 M, multiple_of(4) int32 N, multiple_of(4) int32 K, int32 lda, int32 ABS, int32 AH, int32 AW, int32 AR, int32 AS) { int32 rxa[TM] = get_global_range[TM](0); @@ -131,8 +130,8 @@ void shift(restrict read_only align(16) )" << a_ty_ << R"( *a, int32 rka[TK] = 0 ... TK; int32 rkb[TK] = 0 ... TK; fp32 C[TM, TN] = 0; - int32 pad_h = AR/2; - int32 pad_w = AS/2; + int32 pad_h = AR / 2; + int32 pad_w = AS / 2; int32 rawhc[TM] = rxa / ABS; int32 raw[TM] = rawhc % AW; int32 rahc[TM] = rawhc / AW; From 9a86bc51e19c55880a1add0f938b55560dbe8b54 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 29 Jun 2019 13:58:46 -0700 Subject: [PATCH 203/494] [language] added alignment metadata for variables --- examples/cpp/dot.cpp | 2 +- examples/cpp/shift.cpp | 1 + include/triton/external/half.hpp | 3067 +++++++++++++++++++++++++++++ include/triton/ir/instructions.h | 9 +- include/triton/ir/metadata.h | 29 + include/triton/ir/module.h | 5 + include/triton/lang/declaration.h | 6 + lib/codegen/alignment_info.cpp | 21 + lib/dnn/shift.cpp | 5 +- lib/ir/metadata.cpp | 14 + lib/ir/module.cpp | 5 + lib/lang/declaration.cpp | 28 +- 12 files changed, 3183 insertions(+), 9 deletions(-) create mode 100644 include/triton/external/half.hpp create mode 100644 include/triton/ir/metadata.h create mode 100644 lib/ir/metadata.cpp diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 9c5349570..5dbff07bb 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -16,7 +16,7 @@ int main() { triton::jit jit(context); // matrix multiplication parameters - int32_t M = 1024, N = 1024, K = 1024; + int32_t M = 32768, N = 1024, K = 1024; std::vector hc(M*N); std::vector rc(M*N); std::vector ha(M*K); diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index 8251be322..83082ec4d 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -21,6 +21,7 @@ int main() { int32_t BS = 32, F = 1024; int32_t H = 32, W = 32; int32_t C = 1024; + // random shifts std::vector shift_h(C); std::vector shift_w(C); diff --git a/include/triton/external/half.hpp b/include/triton/external/half.hpp new file mode 100644 index 000000000..625cce7cb --- /dev/null +++ b/include/triton/external/half.hpp @@ -0,0 +1,3067 @@ +// half - IEEE 754-based half-precision floating point library. +// +// Copyright (c) 2012-2017 Christian Rau +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
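+
+// A minimal usage sketch (hypothetical example, not part of the upstream library; it assumes
+// the vendored include path "triton/external/half.hpp" used by examples/cpp/shift.cpp above):
+// half_float::half stores 16 bits but computes through float, so float-based host code
+// ports with a typedef:
+//
+//   #include "triton/external/half.hpp"
+//   #include <cstdlib>
+//   #include <vector>
+//
+//   int main() {
+//     typedef half_float::half NumericT;               // 16-bit storage, float-like arithmetic
+//     std::vector<NumericT> ha(1024);
+//     for(size_t i = 0; i < ha.size(); i++)
+//       ha[i] = (NumericT)((float)rand() / RAND_MAX);  // dividing in float first keeps the
+//     return 0;                                        // value in [0,1] before narrowing
+//   }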
+
+// Version 1.12.0
+
+/// \file
+/// Main header file for half precision functionality.
+
+#ifndef HALF_HALF_HPP
+#define HALF_HALF_HPP
+
+/// Combined gcc version number.
+#define HALF_GNUC_VERSION (__GNUC__*100+__GNUC_MINOR__)
+
+//check C++11 language features
+#if defined(__clang__) //clang
+ #if __has_feature(cxx_static_assert) && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT)
+ #define HALF_ENABLE_CPP11_STATIC_ASSERT 1
+ #endif
+ #if __has_feature(cxx_constexpr) && !defined(HALF_ENABLE_CPP11_CONSTEXPR)
+ #define HALF_ENABLE_CPP11_CONSTEXPR 1
+ #endif
+ #if __has_feature(cxx_noexcept) && !defined(HALF_ENABLE_CPP11_NOEXCEPT)
+ #define HALF_ENABLE_CPP11_NOEXCEPT 1
+ #endif
+ #if __has_feature(cxx_user_literals) && !defined(HALF_ENABLE_CPP11_USER_LITERALS)
+ #define HALF_ENABLE_CPP11_USER_LITERALS 1
+ #endif
+ #if (defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L) && !defined(HALF_ENABLE_CPP11_LONG_LONG)
+ #define HALF_ENABLE_CPP11_LONG_LONG 1
+ #endif
+/*#elif defined(__INTEL_COMPILER) //Intel C++
+ #if __INTEL_COMPILER >= 1100 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT) ????????
+ #define HALF_ENABLE_CPP11_STATIC_ASSERT 1
+ #endif
+ #if __INTEL_COMPILER >= 1300 && !defined(HALF_ENABLE_CPP11_CONSTEXPR) ????????
+ #define HALF_ENABLE_CPP11_CONSTEXPR 1
+ #endif
+ #if __INTEL_COMPILER >= 1300 && !defined(HALF_ENABLE_CPP11_NOEXCEPT) ????????
+ #define HALF_ENABLE_CPP11_NOEXCEPT 1
+ #endif
+ #if __INTEL_COMPILER >= 1100 && !defined(HALF_ENABLE_CPP11_LONG_LONG) ????????
+ #define HALF_ENABLE_CPP11_LONG_LONG 1
+ #endif*/
+#elif defined(__GNUC__) //gcc
+ #if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L
+ #if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT)
+ #define HALF_ENABLE_CPP11_STATIC_ASSERT 1
+ #endif
+ #if HALF_GNUC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_CONSTEXPR)
+ #define HALF_ENABLE_CPP11_CONSTEXPR 1
+ #endif
+ #if HALF_GNUC_VERSION >= 406 && !defined(HALF_ENABLE_CPP11_NOEXCEPT)
+ #define HALF_ENABLE_CPP11_NOEXCEPT 1
+ #endif
+ #if HALF_GNUC_VERSION >= 407 && !defined(HALF_ENABLE_CPP11_USER_LITERALS)
+ #define HALF_ENABLE_CPP11_USER_LITERALS 1
+ #endif
+ #if !defined(HALF_ENABLE_CPP11_LONG_LONG)
+ #define HALF_ENABLE_CPP11_LONG_LONG 1
+ #endif
+ #endif
+#elif defined(_MSC_VER) //Visual C++
+ #if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_CONSTEXPR)
+ #define HALF_ENABLE_CPP11_CONSTEXPR 1
+ #endif
+ #if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_NOEXCEPT)
+ #define HALF_ENABLE_CPP11_NOEXCEPT 1
+ #endif
+ #if _MSC_VER >= 1900 && !defined(HALF_ENABLE_CPP11_USER_LITERALS)
+ #define HALF_ENABLE_CPP11_USER_LITERALS 1
+ #endif
+ #if _MSC_VER >= 1600 && !defined(HALF_ENABLE_CPP11_STATIC_ASSERT)
+ #define HALF_ENABLE_CPP11_STATIC_ASSERT 1
+ #endif
+ #if _MSC_VER >= 1310 && !defined(HALF_ENABLE_CPP11_LONG_LONG)
+ #define HALF_ENABLE_CPP11_LONG_LONG 1
+ #endif
+ #define HALF_POP_WARNINGS 1
+ #pragma warning(push)
+ #pragma warning(disable : 4099 4127 4146) //struct vs class, constant in if, negative unsigned
+#endif
+
+//check C++11 library features
+#include <utility>
+#if defined(_LIBCPP_VERSION) //libc++
+ #if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103
+ #ifndef HALF_ENABLE_CPP11_TYPE_TRAITS
+ #define HALF_ENABLE_CPP11_TYPE_TRAITS 1
+ #endif
+ #ifndef HALF_ENABLE_CPP11_CSTDINT
+ #define HALF_ENABLE_CPP11_CSTDINT 1
+ #endif
+ #ifndef HALF_ENABLE_CPP11_CMATH
+ #define HALF_ENABLE_CPP11_CMATH 1
+ #endif
+ #ifndef HALF_ENABLE_CPP11_HASH
+ #define HALF_ENABLE_CPP11_HASH 1
+ #endif
+ #endif
+#elif defined(__GLIBCXX__) //libstdc++
+ #if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103
+ #ifdef __clang__
+ #if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_TYPE_TRAITS)
+ #define HALF_ENABLE_CPP11_TYPE_TRAITS 1
+ #endif
+ #if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CSTDINT)
+ #define HALF_ENABLE_CPP11_CSTDINT 1
+ #endif
+ #if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_CMATH)
+ #define HALF_ENABLE_CPP11_CMATH 1
+ #endif
+ #if __GLIBCXX__ >= 20080606 && !defined(HALF_ENABLE_CPP11_HASH)
+ #define HALF_ENABLE_CPP11_HASH 1
+ #endif
+ #else
+ #if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CSTDINT)
+ #define HALF_ENABLE_CPP11_CSTDINT 1
+ #endif
+ #if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_CMATH)
+ #define HALF_ENABLE_CPP11_CMATH 1
+ #endif
+ #if HALF_GNUC_VERSION >= 403 && !defined(HALF_ENABLE_CPP11_HASH)
+ #define HALF_ENABLE_CPP11_HASH 1
+ #endif
+ #endif
+ #endif
+#elif defined(_CPPLIB_VER) //Dinkumware/Visual C++
+ #if _CPPLIB_VER >= 520
+ #ifndef HALF_ENABLE_CPP11_TYPE_TRAITS
+ #define HALF_ENABLE_CPP11_TYPE_TRAITS 1
+ #endif
+ #ifndef HALF_ENABLE_CPP11_CSTDINT
+ #define HALF_ENABLE_CPP11_CSTDINT 1
+ #endif
+ #ifndef HALF_ENABLE_CPP11_HASH
+ #define HALF_ENABLE_CPP11_HASH 1
+ #endif
+ #endif
+ #if _CPPLIB_VER >= 610
+ #ifndef HALF_ENABLE_CPP11_CMATH
+ #define HALF_ENABLE_CPP11_CMATH 1
+ #endif
+ #endif
+#endif
+#undef HALF_GNUC_VERSION
+
+//support constexpr
+#if HALF_ENABLE_CPP11_CONSTEXPR
+ #define HALF_CONSTEXPR constexpr
+ #define HALF_CONSTEXPR_CONST constexpr
+#else
+ #define HALF_CONSTEXPR
+ #define HALF_CONSTEXPR_CONST const
+#endif
+
+//support noexcept
+#if HALF_ENABLE_CPP11_NOEXCEPT
+ #define HALF_NOEXCEPT noexcept
+ #define HALF_NOTHROW noexcept
+#else
+ #define HALF_NOEXCEPT
+ #define HALF_NOTHROW throw()
+#endif
+
+#include <algorithm>
+#include <iostream>
+#include <limits>
+#include <climits>
+#include <cmath>
+#include <cstring>
+#if HALF_ENABLE_CPP11_TYPE_TRAITS
+ #include <type_traits>
+#endif
+#if HALF_ENABLE_CPP11_CSTDINT
+ #include <cstdint>
+#endif
+#if HALF_ENABLE_CPP11_HASH
+ #include <functional>
+#endif
+
+
+/// Default rounding mode.
+/// This specifies the rounding mode used for all conversions between [half](\ref half_float::half)s and `float`s as well as
+/// for the half_cast() if not specifying a rounding mode explicitly. It can be redefined (before including half.hpp) to one
+/// of the standard rounding modes using their respective constants or the equivalent values of `std::float_round_style`:
+///
+/// `std::float_round_style` | value | rounding
+/// ---------------------------------|-------|-------------------------
+/// `std::round_indeterminate` | -1 | fastest (default)
+/// `std::round_toward_zero` | 0 | toward zero
+/// `std::round_to_nearest` | 1 | to nearest
+/// `std::round_toward_infinity` | 2 | toward positive infinity
+/// `std::round_toward_neg_infinity` | 3 | toward negative infinity
+///
+/// By default this is set to `-1` (`std::round_indeterminate`), which uses truncation (round toward zero, but with overflows
+/// set to infinity) and is the fastest rounding mode possible. It can even be set to `std::numeric_limits<float>::round_style`
+/// to synchronize the rounding mode with that of the underlying single-precision implementation.
+#ifndef HALF_ROUND_STYLE
+ #define HALF_ROUND_STYLE -1 // = std::round_indeterminate
+#endif
+
+/// Tie-breaking behaviour for round to nearest.
+/// This specifies if ties in round to nearest should be resolved by rounding to the nearest even value.
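+/// For example, a value exactly halfway between 2 and 3 rounds to 2 (the even neighbour) under
+/// ties-to-even, whereas rounding ties away from zero yields 3.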
By default this is +/// defined to `0` resulting in the faster but slightly more biased behaviour of rounding away from zero in half-way cases (and +/// thus equal to the round() function), but can be redefined to `1` (before including half.hpp) if more IEEE-conformant +/// behaviour is needed. +#ifndef HALF_ROUND_TIES_TO_EVEN + #define HALF_ROUND_TIES_TO_EVEN 0 // ties away from zero +#endif + +/// Value signaling overflow. +/// In correspondence with `HUGE_VAL[F|L]` from `` this symbol expands to a positive value signaling the overflow of an +/// operation, in particular it just evaluates to positive infinity. +#define HUGE_VALH std::numeric_limits::infinity() + +/// Fast half-precision fma function. +/// This symbol is only defined if the fma() function generally executes as fast as, or faster than, a separate +/// half-precision multiplication followed by an addition. Due to the internal single-precision implementation of all +/// arithmetic operations, this is in fact always the case. +#define FP_FAST_FMAH 1 + +#ifndef FP_ILOGB0 + #define FP_ILOGB0 INT_MIN +#endif +#ifndef FP_ILOGBNAN + #define FP_ILOGBNAN INT_MAX +#endif +#ifndef FP_SUBNORMAL + #define FP_SUBNORMAL 0 +#endif +#ifndef FP_ZERO + #define FP_ZERO 1 +#endif +#ifndef FP_NAN + #define FP_NAN 2 +#endif +#ifndef FP_INFINITE + #define FP_INFINITE 3 +#endif +#ifndef FP_NORMAL + #define FP_NORMAL 4 +#endif + + +/// Main namespace for half precision functionality. +/// This namespace contains all the functionality provided by the library. +namespace half_float +{ + class half; + +#if HALF_ENABLE_CPP11_USER_LITERALS + /// Library-defined half-precision literals. + /// Import this namespace to enable half-precision floating point literals: + /// ~~~~{.cpp} + /// using namespace half_float::literal; + /// half_float::half = 4.2_h; + /// ~~~~ + namespace literal + { + half operator""_h(long double); + } +#endif + + /// \internal + /// \brief Implementation details. + namespace detail + { + #if HALF_ENABLE_CPP11_TYPE_TRAITS + /// Conditional type. + template struct conditional : std::conditional {}; + + /// Helper for tag dispatching. + template struct bool_type : std::integral_constant {}; + using std::true_type; + using std::false_type; + + /// Type traits for floating point types. + template struct is_float : std::is_floating_point {}; + #else + /// Conditional type. + template struct conditional { typedef T type; }; + template struct conditional { typedef F type; }; + + /// Helper for tag dispatching. + template struct bool_type {}; + typedef bool_type true_type; + typedef bool_type false_type; + + /// Type traits for floating point types. + template struct is_float : false_type {}; + template struct is_float : is_float {}; + template struct is_float : is_float {}; + template struct is_float : is_float {}; + template<> struct is_float : true_type {}; + template<> struct is_float : true_type {}; + template<> struct is_float : true_type {}; + #endif + + /// Type traits for floating point bits. + template struct bits { typedef unsigned char type; }; + template struct bits : bits {}; + template struct bits : bits {}; + template struct bits : bits {}; + + #if HALF_ENABLE_CPP11_CSTDINT + /// Unsigned integer of (at least) 16 bits width. + typedef std::uint_least16_t uint16; + + /// Unsigned integer of (at least) 32 bits width. + template<> struct bits { typedef std::uint_least32_t type; }; + + /// Unsigned integer of (at least) 64 bits width. 
+ template<> struct bits { typedef std::uint_least64_t type; }; + #else + /// Unsigned integer of (at least) 16 bits width. + typedef unsigned short uint16; + + /// Unsigned integer of (at least) 32 bits width. + template<> struct bits : conditional::digits>=32,unsigned int,unsigned long> {}; + + #if HALF_ENABLE_CPP11_LONG_LONG + /// Unsigned integer of (at least) 64 bits width. + template<> struct bits : conditional::digits>=64,unsigned long,unsigned long long> {}; + #else + /// Unsigned integer of (at least) 64 bits width. + template<> struct bits { typedef unsigned long type; }; + #endif + #endif + + /// Tag type for binary construction. + struct binary_t {}; + + /// Tag for binary construction. + HALF_CONSTEXPR_CONST binary_t binary = binary_t(); + + /// Temporary half-precision expression. + /// This class represents a half-precision expression which just stores a single-precision value internally. + struct expr + { + /// Conversion constructor. + /// \param f single-precision value to convert + explicit HALF_CONSTEXPR expr(float f) HALF_NOEXCEPT : value_(f) {} + + /// Conversion to single-precision. + /// \return single precision value representing expression value + HALF_CONSTEXPR operator float() const HALF_NOEXCEPT { return value_; } + + private: + /// Internal expression value stored in single-precision. + float value_; + }; + + /// SFINAE helper for generic half-precision functions. + /// This class template has to be specialized for each valid combination of argument types to provide a corresponding + /// `type` member equivalent to \a T. + /// \tparam T type to return + template struct enable {}; + template struct enable { typedef T type; }; + template struct enable { typedef T type; }; + template struct enable { typedef T type; }; + template struct enable { typedef T type; }; + template struct enable { typedef T type; }; + template struct enable { typedef T type; }; + template struct enable { typedef T type; }; + template struct enable { typedef T type; }; + template struct enable { typedef T type; }; + template struct enable { typedef T type; }; + template struct enable { typedef T type; }; + template struct enable { typedef T type; }; + template struct enable { typedef T type; }; + template struct enable { typedef T type; }; + + /// Return type for specialized generic 2-argument half-precision functions. + /// This class template has to be specialized for each valid combination of argument types to provide a corresponding + /// `type` member denoting the appropriate return type. + /// \tparam T first argument type + /// \tparam U first argument type + template struct result : enable {}; + template<> struct result { typedef half type; }; + + /// \name Classification helpers + /// \{ + + /// Check for infinity. + /// \tparam T argument type (builtin floating point type) + /// \param arg value to query + /// \retval true if infinity + /// \retval false else + template bool builtin_isinf(T arg) + { + #if HALF_ENABLE_CPP11_CMATH + return std::isinf(arg); + #elif defined(_MSC_VER) + return !::_finite(static_cast(arg)) && !::_isnan(static_cast(arg)); + #else + return arg == std::numeric_limits::infinity() || arg == -std::numeric_limits::infinity(); + #endif + } + + /// Check for NaN. 
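+ /// Uses std::isnan() when C++11 cmath support is detected, and otherwise falls back to the
+ /// platform-specific check or the IEEE self-comparison arg != arg.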
+ /// \tparam T argument type (builtin floating point type) + /// \param arg value to query + /// \retval true if not a number + /// \retval false else + template bool builtin_isnan(T arg) + { + #if HALF_ENABLE_CPP11_CMATH + return std::isnan(arg); + #elif defined(_MSC_VER) + return ::_isnan(static_cast(arg)) != 0; + #else + return arg != arg; + #endif + } + + /// Check sign. + /// \tparam T argument type (builtin floating point type) + /// \param arg value to query + /// \retval true if signbit set + /// \retval false else + template bool builtin_signbit(T arg) + { + #if HALF_ENABLE_CPP11_CMATH + return std::signbit(arg); + #else + return arg < T() || (arg == T() && T(1)/arg < T()); + #endif + } + + /// \} + /// \name Conversion + /// \{ + + /// Convert IEEE single-precision to half-precision. + /// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf). + /// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding + /// \param value single-precision value + /// \return binary representation of half-precision value + template uint16 float2half_impl(float value, true_type) + { + typedef bits::type uint32; + uint32 bits;// = *reinterpret_cast(&value); //violating strict aliasing! + std::memcpy(&bits, &value, sizeof(float)); +/* uint16 hbits = (bits>>16) & 0x8000; + bits &= 0x7FFFFFFF; + int exp = bits >> 23; + if(exp == 255) + return hbits | 0x7C00 | (0x3FF&-static_cast((bits&0x7FFFFF)!=0)); + if(exp > 142) + { + if(R == std::round_toward_infinity) + return hbits | 0x7C00 - (hbits>>15); + if(R == std::round_toward_neg_infinity) + return hbits | 0x7BFF + (hbits>>15); + return hbits | 0x7BFF + (R!=std::round_toward_zero); + } + int g, s; + if(exp > 112) + { + g = (bits>>12) & 1; + s = (bits&0xFFF) != 0; + hbits |= ((exp-112)<<10) | ((bits>>13)&0x3FF); + } + else if(exp > 101) + { + int i = 125 - exp; + bits = (bits&0x7FFFFF) | 0x800000; + g = (bits>>i) & 1; + s = (bits&((1L<> (i+1); + } + else + { + g = 0; + s = bits != 0; + } + if(R == std::round_to_nearest) + #if HALF_ROUND_TIES_TO_EVEN + hbits += g & (s|hbits); + #else + hbits += g; + #endif + else if(R == std::round_toward_infinity) + hbits += ~(hbits>>15) & (s|g); + else if(R == std::round_toward_neg_infinity) + hbits += (hbits>>15) & (g|s); +*/ static const uint16 base_table[512] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, + 0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00, + 0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 
0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, 0x7C00, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, + 0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00, + 0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, + 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00, 0xFC00 }; + static const unsigned char shift_table[512] = { + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 
13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13 }; + uint16 hbits = base_table[bits>>23] + static_cast((bits&0x7FFFFF)>>shift_table[bits>>23]); + if(R == std::round_to_nearest) + hbits += (((bits&0x7FFFFF)>>(shift_table[bits>>23]-1))|(((bits>>23)&0xFF)==102)) & ((hbits&0x7C00)!=0x7C00) + #if HALF_ROUND_TIES_TO_EVEN + & (((((static_cast(1)<<(shift_table[bits>>23]-1))-1)&bits)!=0)|hbits) + #endif + ; + else if(R == std::round_toward_zero) + hbits -= ((hbits&0x7FFF)==0x7C00) & ~shift_table[bits>>23]; + else if(R == std::round_toward_infinity) + hbits += ((((bits&0x7FFFFF&((static_cast(1)<<(shift_table[bits>>23]))-1))!=0)|(((bits>>23)<=102)& + ((bits>>23)!=0)))&(hbits<0x7C00)) - ((hbits==0xFC00)&((bits>>23)!=511)); + else if(R == std::round_toward_neg_infinity) + hbits += ((((bits&0x7FFFFF&((static_cast(1)<<(shift_table[bits>>23]))-1))!=0)|(((bits>>23)<=358)& + ((bits>>23)!=256)))&(hbits<0xFC00)&(hbits>>15)) - ((hbits==0x7C00)&((bits>>23)!=255)); + return hbits; + } + + /// Convert IEEE double-precision to half-precision. + /// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding + /// \param value double-precision value + /// \return binary representation of half-precision value + template uint16 float2half_impl(double value, true_type) + { + typedef bits::type uint32; + typedef bits::type uint64; + uint64 bits;// = *reinterpret_cast(&value); //violating strict aliasing! 
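+ // memcpy copies the raw IEEE-754 bit pattern portably, avoiding the strict-aliasing
+ // violation of the commented-out pointer cast above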
+ std::memcpy(&bits, &value, sizeof(double)); + uint32 hi = bits >> 32, lo = bits & 0xFFFFFFFF; + uint16 hbits = (hi>>16) & 0x8000; + hi &= 0x7FFFFFFF; + int exp = hi >> 20; + if(exp == 2047) + return hbits | 0x7C00 | (0x3FF&-static_cast((bits&0xFFFFFFFFFFFFF)!=0)); + if(exp > 1038) + { + if(R == std::round_toward_infinity) + return hbits | 0x7C00 - (hbits>>15); + if(R == std::round_toward_neg_infinity) + return hbits | 0x7BFF + (hbits>>15); + return hbits | 0x7BFF + (R!=std::round_toward_zero); + } + int g, s = lo != 0; + if(exp > 1008) + { + g = (hi>>9) & 1; + s |= (hi&0x1FF) != 0; + hbits |= ((exp-1008)<<10) | ((hi>>10)&0x3FF); + } + else if(exp > 997) + { + int i = 1018 - exp; + hi = (hi&0xFFFFF) | 0x100000; + g = (hi>>i) & 1; + s |= (hi&((1L<> (i+1); + } + else + { + g = 0; + s |= hi != 0; + } + if(R == std::round_to_nearest) + #if HALF_ROUND_TIES_TO_EVEN + hbits += g & (s|hbits); + #else + hbits += g; + #endif + else if(R == std::round_toward_infinity) + hbits += ~(hbits>>15) & (s|g); + else if(R == std::round_toward_neg_infinity) + hbits += (hbits>>15) & (g|s); + return hbits; + } + + /// Convert non-IEEE floating point to half-precision. + /// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding + /// \tparam T source type (builtin floating point type) + /// \param value floating point value + /// \return binary representation of half-precision value + template uint16 float2half_impl(T value, ...) + { + uint16 hbits = static_cast(builtin_signbit(value)) << 15; + if(value == T()) + return hbits; + if(builtin_isnan(value)) + return hbits | 0x7FFF; + if(builtin_isinf(value)) + return hbits | 0x7C00; + int exp; + std::frexp(value, &exp); + if(exp > 16) + { + if(R == std::round_toward_infinity) + return hbits | 0x7C00 - (hbits>>15); + else if(R == std::round_toward_neg_infinity) + return hbits | 0x7BFF + (hbits>>15); + return hbits | 0x7BFF + (R!=std::round_toward_zero); + } + if(exp < -13) + value = std::ldexp(value, 24); + else + { + value = std::ldexp(value, 11-exp); + hbits |= ((exp+13)<<10); + } + T ival, frac = std::modf(value, &ival); + hbits += static_cast(std::abs(static_cast(ival))); + if(R == std::round_to_nearest) + { + frac = std::abs(frac); + #if HALF_ROUND_TIES_TO_EVEN + hbits += (frac>T(0.5)) | ((frac==T(0.5))&hbits); + #else + hbits += frac >= T(0.5); + #endif + } + else if(R == std::round_toward_infinity) + hbits += frac > T(); + else if(R == std::round_toward_neg_infinity) + hbits += frac < T(); + return hbits; + } + + /// Convert floating point to half-precision. + /// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding + /// \tparam T source type (builtin floating point type) + /// \param value floating point value + /// \return binary representation of half-precision value + template uint16 float2half(T value) + { + return float2half_impl(value, bool_type::is_iec559&&sizeof(typename bits::type)==sizeof(T)>()); + } + + /// Convert integer to half-precision floating point. 
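+ /// The magnitude is shifted into the 11-bit range [0x400, 0x7FF] while the exponent is tracked,
+ /// and the requested rounding mode is then applied to any low bits shifted out.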
+ /// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding + /// \tparam S `true` if value negative, `false` else + /// \tparam T type to convert (builtin integer type) + /// \param value non-negative integral value + /// \return binary representation of half-precision value + template uint16 int2half_impl(T value) + { + #if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS + static_assert(std::is_integral::value, "int to half conversion only supports builtin integer types"); + #endif + if(S) + value = -value; + uint16 bits = S << 15; + if(value > 0xFFFF) + { + if(R == std::round_toward_infinity) + bits |= 0x7C00 - S; + else if(R == std::round_toward_neg_infinity) + bits |= 0x7BFF + S; + else + bits |= 0x7BFF + (R!=std::round_toward_zero); + } + else if(value) + { + unsigned int m = value, exp = 24; + for(; m<0x400; m<<=1,--exp) ; + for(; m>0x7FF; m>>=1,++exp) ; + bits |= (exp<<10) + m; + if(exp > 24) + { + if(R == std::round_to_nearest) + bits += (value>>(exp-25)) & 1 + #if HALF_ROUND_TIES_TO_EVEN + & (((((1<<(exp-25))-1)&value)!=0)|bits) + #endif + ; + else if(R == std::round_toward_infinity) + bits += ((value&((1<<(exp-24))-1))!=0) & !S; + else if(R == std::round_toward_neg_infinity) + bits += ((value&((1<<(exp-24))-1))!=0) & S; + } + } + return bits; + } + + /// Convert integer to half-precision floating point. + /// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding + /// \tparam T type to convert (builtin integer type) + /// \param value integral value + /// \return binary representation of half-precision value + template uint16 int2half(T value) + { + return (value<0) ? int2half_impl(value) : int2half_impl(value); + } + + /// Convert half-precision to IEEE single-precision. + /// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf). 
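+ /// The conversion is table-driven: precomputed tables indexed by the exponent and mantissa bits
+ /// reduce each half-to-float conversion to a few lookups and an add.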
+ /// \param value binary representation of half-precision value + /// \return single-precision value + inline float half2float_impl(uint16 value, float, true_type) + { + typedef bits::type uint32; +/* uint32 bits = static_cast(value&0x8000) << 16; + int abs = value & 0x7FFF; + if(abs) + { + bits |= 0x38000000 << static_cast(abs>=0x7C00); + for(; abs<0x400; abs<<=1,bits-=0x800000) ; + bits += static_cast(abs) << 13; + } +*/ static const uint32 mantissa_table[2048] = { + 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000, + 0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000, + 0x36000000, 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000, + 0x36400000, 0x36440000, 0x36480000, 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, 0x36700000, 0x36740000, 0x36780000, 0x367C0000, + 0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, 0x369C0000, 0x369E0000, + 0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000, + 0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000, + 0x36E00000, 0x36E20000, 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000, + 0x37000000, 0x37010000, 0x37020000, 0x37030000, 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000, + 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000, + 0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000, + 0x37300000, 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000, + 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000, + 0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, 0x375E0000, 0x375F0000, + 0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000, + 0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377A0000, 0x377B0000, 0x377C0000, 0x377D0000, 
0x377E0000, 0x377F0000, + 0x37800000, 0x37808000, 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000, + 0x37880000, 0x37888000, 0x37890000, 0x37898000, 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, 0x378F8000, + 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, 0x37978000, + 0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000, + 0x37A00000, 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000, + 0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000, + 0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, 0x37B70000, 0x37B78000, + 0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000, + 0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000, + 0x37C80000, 0x37C88000, 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000, + 0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, 0x37D68000, 0x37D70000, 0x37D78000, + 0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000, + 0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000, + 0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000, + 0x37F00000, 0x37F08000, 0x37F10000, 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000, + 0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, 0x37FF0000, 0x37FF8000, + 0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000, + 0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, 0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 0x38078000, 0x3807C000, + 0x38080000, 0x38084000, 0x38088000, 
0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000, + 0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, 0x380F8000, 0x380FC000, + 0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, 0x3813C000, + 0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000, + 0x38180000, 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000, + 0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000, + 0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, 0x38238000, 0x3823C000, + 0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000, + 0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000, + 0x382C0000, 0x382C4000, 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000, + 0x38300000, 0x38304000, 0x38308000, 0x3830C000, 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000, + 0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, 0x3837C000, + 0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000, + 0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000, + 0x38400000, 0x38404000, 0x38408000, 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, 0x38430000, 0x38434000, 0x38438000, 0x3843C000, + 0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, 0x38478000, 0x3847C000, + 0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000, + 0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 0x384F8000, 0x384FC000, + 0x38500000, 0x38504000, 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 
0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000, + 0x38540000, 0x38544000, 0x38548000, 0x3854C000, 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, 0x38574000, 0x38578000, 0x3857C000, + 0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000, + 0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000, + 0x38600000, 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000, + 0x38640000, 0x38644000, 0x38648000, 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, 0x38670000, 0x38674000, 0x38678000, 0x3867C000, + 0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, 0x386B8000, 0x386BC000, + 0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000, + 0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000, + 0x38740000, 0x38744000, 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000, + 0x38780000, 0x38784000, 0x38788000, 0x3878C000, 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000, + 0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, 0x387FC000, + 0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000, + 0x38020000, 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000, + 0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000, + 0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, 0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, 0x3807C000, 0x3807E000, + 0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000, + 0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000, + 0x380C0000, 0x380C2000, 0x380C4000, 0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, 0x380D6000, 0x380D8000, 
0x380DA000, 0x380DC000, 0x380DE000, + 0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, 0x380FA000, 0x380FC000, 0x380FE000, + 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000, + 0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000, + 0x38140000, 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000, + 0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000, + 0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, 0x3819C000, 0x3819E000, + 0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000, + 0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000, + 0x381E0000, 0x381E2000, 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000, + 0x38200000, 0x38202000, 0x38204000, 0x38206000, 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, 0x3821E000, + 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, 0x3823E000, + 0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000, + 0x38260000, 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000, + 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000, + 0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, 0x382BC000, 0x382BE000, + 0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000, + 0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000, + 0x38300000, 0x38302000, 0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000, + 0x38320000, 0x38322000, 
0x38324000, 0x38326000, 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000, + 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000, + 0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000, + 0x38380000, 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000, + 0x383A0000, 0x383A2000, 0x383A4000, 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000, + 0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, 0x383DC000, 0x383DE000, + 0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000, + 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000, + 0x38420000, 0x38422000, 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000, + 0x38440000, 0x38442000, 0x38444000, 0x38446000, 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, 0x3845A000, 0x3845C000, 0x3845E000, + 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000, + 0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000, + 0x384A0000, 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000, + 0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000, + 0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, 0x384FC000, 0x384FE000, + 0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000, + 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000, + 0x38540000, 0x38542000, 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855A000, 0x3855C000, 0x3855E000, + 0x38560000, 0x38562000, 0x38564000, 0x38566000, 0x38568000, 0x3856A000, 0x3856C000, 
0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000, + 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, 0x3859E000, + 0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000, + 0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000, + 0x385E0000, 0x385E2000, 0x385E4000, 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000, + 0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, 0x3861C000, 0x3861E000, + 0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000, + 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000, + 0x38660000, 0x38662000, 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000, + 0x38680000, 0x38682000, 0x38684000, 0x38686000, 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, 0x3869A000, 0x3869C000, 0x3869E000, + 0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000, + 0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 0x386DC000, 0x386DE000, + 0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000, + 0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000, + 0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, 0x3873C000, 0x3873E000, + 0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000, + 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000, + 0x38780000, 0x38782000, 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000, + 0x387A0000, 0x387A2000, 0x387A4000, 0x387A6000, 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 
0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000, + 0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, 0x387DE000, + 0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000, 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000 }; + static const uint32 exponent_table[64] = { + 0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, 0x07000000, 0x07800000, + 0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000, + 0x80000000, 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, + 0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000 }; + static const unsigned short offset_table[64] = { + 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, + 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024 }; + uint32 bits = mantissa_table[offset_table[value>>10]+(value&0x3FF)] + exponent_table[value>>10]; +// return *reinterpret_cast(&bits); //violating strict aliasing! + float out; + std::memcpy(&out, &bits, sizeof(float)); + return out; + } + + /// Convert half-precision to IEEE double-precision. + /// \param value binary representation of half-precision value + /// \return double-precision value + inline double half2float_impl(uint16 value, double, true_type) + { + typedef bits::type uint32; + typedef bits::type uint64; + uint32 hi = static_cast(value&0x8000) << 16; + int abs = value & 0x7FFF; + if(abs) + { + hi |= 0x3F000000 << static_cast(abs>=0x7C00); + for(; abs<0x400; abs<<=1,hi-=0x100000) ; + hi += static_cast(abs) << 10; + } + uint64 bits = static_cast(hi) << 32; +// return *reinterpret_cast(&bits); //violating strict aliasing! + double out; + std::memcpy(&out, &bits, sizeof(double)); + return out; + } + + /// Convert half-precision to non-IEEE floating point. + /// \tparam T type to convert to (builtin integer type) + /// \param value binary representation of half-precision value + /// \return floating point value + template T half2float_impl(uint16 value, T, ...) + { + T out; + int abs = value & 0x7FFF; + if(abs > 0x7C00) + out = std::numeric_limits::has_quiet_NaN ? std::numeric_limits::quiet_NaN() : T(); + else if(abs == 0x7C00) + out = std::numeric_limits::has_infinity ? std::numeric_limits::infinity() : std::numeric_limits::max(); + else if(abs > 0x3FF) + out = std::ldexp(static_cast((abs&0x3FF)|0x400), (abs>>10)-25); + else + out = std::ldexp(static_cast(abs), -24); + return (value&0x8000) ? -out : out; + } + + /// Convert half-precision to floating point. 
+ /// \tparam T type to convert to (builtin floating point type) + /// \param value binary representation of half-precision value + /// \return floating point value + template<typename T> T half2float(uint16 value) + { + return half2float_impl(value, T(), bool_type<std::numeric_limits<T>::is_iec559&&sizeof(typename bits<T>::type)==sizeof(T)>()); + } + + /// Convert half-precision floating point to integer. + /// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding + /// \tparam E `true` for round to even, `false` for round away from zero + /// \tparam T type to convert to (builtin integer type with at least 16 bits precision, excluding any implicit sign bits) + /// \param value binary representation of half-precision value + /// \return integral value + template<std::float_round_style R,bool E,typename T> T half2int_impl(uint16 value) + { + #if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS + static_assert(std::is_integral<T>::value, "half to int conversion only supports builtin integer types"); + #endif + unsigned int e = value & 0x7FFF; + if(e >= 0x7C00) + return (value&0x8000) ? std::numeric_limits<T>::min() : std::numeric_limits<T>::max(); + if(e < 0x3800) + { + if(R == std::round_toward_infinity) + return T(~(value>>15)&(e!=0)); + else if(R == std::round_toward_neg_infinity) + return -T(value>0x8000); + return T(); + } + unsigned int m = (value&0x3FF) | 0x400; + e >>= 10; + if(e < 25) + { + if(R == std::round_to_nearest) + m += (1<<(24-e)) - (~(m>>(25-e))&E); + else if(R == std::round_toward_infinity) + m += ((value>>15)-1) & ((1<<(25-e))-1U); + else if(R == std::round_toward_neg_infinity) + m += -(value>>15) & ((1<<(25-e))-1U); + m >>= 25 - e; + } + else + m <<= e - 25; + return (value&0x8000) ? -static_cast<T>(m) : static_cast<T>(m); + } + + /// Convert half-precision floating point to integer. + /// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding + /// \tparam T type to convert to (builtin integer type with at least 16 bits precision, excluding any implicit sign bits) + /// \param value binary representation of half-precision value + /// \return integral value + template<std::float_round_style R,typename T> T half2int(uint16 value) { return half2int_impl<R,HALF_ROUND_TIES_TO_EVEN,T>(value); } + + /// Convert half-precision floating point to integer using round-to-nearest-away-from-zero. + /// \tparam T type to convert to (builtin integer type with at least 16 bits precision, excluding any implicit sign bits) + /// \param value binary representation of half-precision value + /// \return integral value + template<typename T> T half2int_up(uint16 value) { return half2int_impl<std::round_to_nearest,0,T>(value); } +
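To make the two bit-level paths above concrete, here is a small standalone sketch (editorial illustration with hypothetical names, not part of the patch) that mirrors the generic ldexp-based half2float path and the round-to-nearest-even branch of half2int_impl:

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Mirrors the generic half2float_impl: rebuild the value from the
    // sign/exponent/mantissa fields with std::ldexp.
    float half_bits_to_float(std::uint16_t value)
    {
        int abs = value & 0x7FFF;
        float out;
        if(abs >= 0x7C00)                    // Inf/NaN
            out = (abs == 0x7C00) ? INFINITY : NAN;
        else if(abs > 0x3FF)                 // normal: restore the implicit leading 1
            out = std::ldexp(static_cast<float>((abs&0x3FF)|0x400), (abs>>10)-25);
        else                                 // subnormal or zero: no implicit bit
            out = std::ldexp(static_cast<float>(abs), -24);
        return (value&0x8000) ? -out : out;
    }

    // Mirrors only the round-to-nearest-even branch of half2int_impl
    // (finite, in-range inputs; the full code also handles Inf/NaN/saturation).
    int half_bits_to_int(std::uint16_t value)
    {
        unsigned int e = value & 0x7FFF;
        if(e < 0x3800) return 0;             // |x| < 0.5 rounds to 0
        unsigned int m = (value&0x3FF) | 0x400;
        e >>= 10;
        if(e < 25) { m += (1<<(24-e)) - (~(m>>(25-e))&1); m >>= 25 - e; }
        else m <<= e - 25;
        return (value&0x8000) ? -static_cast<int>(m) : static_cast<int>(m);
    }

    int main()
    {
        std::printf("%g %g\n", half_bits_to_float(0x3C00), half_bits_to_float(0xC000)); // 1 -2
        std::printf("%d %d\n", half_bits_to_int(0x4100), half_bits_to_int(0x4300));     // 2 4 (2.5 and 3.5 tie to even)
    }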
+ /// Round half-precision number to nearest integer value. + /// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding + /// \tparam E `true` for round to even, `false` for round away from zero + /// \param value binary representation of half-precision value + /// \return half-precision bits for nearest integral value + template<std::float_round_style R,bool E> uint16 round_half_impl(uint16 value) + { + unsigned int e = value & 0x7FFF; + uint16 result = value; + if(e < 0x3C00) + { + result &= 0x8000; + if(R == std::round_to_nearest) + result |= 0x3C00U & -(e>=(0x3800+E)); + else if(R == std::round_toward_infinity) + result |= 0x3C00U & -(~(value>>15)&(e!=0)); + else if(R == std::round_toward_neg_infinity) + result |= 0x3C00U & -(value>0x8000); + } + else if(e < 0x6400) + { + e = 25 - (e>>10); + unsigned int mask = (1<<e) - 1; + if(R == std::round_to_nearest) + result += (1<<(e-1)) - (~(result>>e)&E); + else if(R == std::round_toward_infinity) + result += mask & ((value>>15)-1); + else if(R == std::round_toward_neg_infinity) + result += mask & -(value>>15); + result &= ~mask; + } + return result; + } + + /// Round half-precision number to nearest integer value. + /// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding + /// \param value binary representation of half-precision value + /// \return half-precision bits for nearest integral value + template<std::float_round_style R> uint16 round_half(uint16 value) { return round_half_impl<R,HALF_ROUND_TIES_TO_EVEN>(value); } + + /// Round half-precision number to nearest integer value using round-to-nearest-away-from-zero. + /// \param value binary representation of half-precision value + /// \return half-precision bits for nearest integral value + inline uint16 round_half_up(uint16 value) { return round_half_impl<std::round_to_nearest,0>(value); } + /// \} + + struct functions; + template<typename T> struct unary_specialized; + template<typename T,typename U> struct binary_specialized; + template<typename T,typename U,std::float_round_style R=(std::float_round_style)(HALF_ROUND_STYLE)> struct half_caster; + } +
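One practical consequence of the layout discussion in the class documentation just below: if the half type really is a trivially copyable 16-bit wrapper, its bits can be moved around with raw copies. A hedged sketch, assuming the header is vendored as half.hpp, lives in the usual half_float namespace, and has no padding (all assumptions, not guaranteed by the patch):

    #include <cstdint>
    #include <cstring>
    #include "half.hpp"   // hypothetical include name for this vendored header

    // memcpy is the standard-conformant way to extract the bits
    // (unlike a reinterpret_cast, which would violate strict aliasing).
    std::uint16_t to_bits(half_float::half h)
    {
        std::uint16_t b;
        std::memcpy(&b, &h, sizeof b);   // assumes sizeof(half) == 2
        return b;
    }

    int main()
    {
        // IEEE 754 binary16 encodes 1.0 as 0x3C00.
        return to_bits(half_float::half(1.0f)) == 0x3C00 ? 0 : 1;
    }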
+ /// Half-precision floating point type. + /// This class implements an IEEE-conformant half-precision floating point type with the usual arithmetic operators and + /// conversions. It is implicitly convertible to single-precision floating point, which causes arithmetic expressions and + /// functions with mixed-type operands to be of the most precise operand type. Additionally all arithmetic operations + /// (and many mathematical functions) are carried out in single-precision internally. All conversions from single- to + /// half-precision are done using the library's default rounding mode, but temporary results inside chained arithmetic + /// expressions are kept in single-precision as long as possible (while of course still maintaining a strong half-precision type). + /// + /// According to the C++98/03 definition, the half type is not a POD type. But according to C++11's less strict and + /// extended definitions it is both a standard layout type and a trivially copyable type (even if not a POD type), which + /// means it can be standard-conformantly copied using raw binary copies. But in this context some more words about the + /// actual size of the type. Although the half is representing an IEEE 16-bit type, it does not necessarily have to be of + /// exactly 16-bits size. But on any reasonable implementation the actual binary representation of this type will most + /// probably not involve any additional "magic" or padding beyond the simple binary representation of the underlying 16-bit + /// IEEE number, even if not strictly guaranteed by the standard. But even then it only has an actual size of 16 bits if + /// your C++ implementation supports an unsigned integer type of exactly 16 bits width. But this should be the case on + /// nearly any reasonable platform. + /// + /// So if your C++ implementation is not totally exotic and does not impose special alignment requirements, it is a reasonable + /// assumption that the data of a half is just comprised of the 2 bytes of the underlying IEEE representation. + class half + { + friend struct detail::functions; + friend struct detail::unary_specialized<half>; + friend struct detail::binary_specialized<half,half>; + template<typename,typename,std::float_round_style> friend struct detail::half_caster; + friend class std::numeric_limits<half>; + #if HALF_ENABLE_CPP11_HASH + friend struct std::hash<half>; + #endif + #if HALF_ENABLE_CPP11_USER_LITERALS + friend half literal::operator""_h(long double); + #endif + + public: + /// Default constructor. + /// This initializes the half to 0. Although this does not match the builtin types' default-initialization semantics + /// and may be less efficient than no initialization, it is needed to provide proper value-initialization semantics. + HALF_CONSTEXPR half() HALF_NOEXCEPT : data_() {} + + /// Copy constructor. + /// \tparam T type of concrete half expression + /// \param rhs half expression to copy from + half(detail::expr rhs) : data_(detail::float2half<round_style>(static_cast<float>(rhs))) {} + + /// Conversion constructor. + /// \param rhs float to convert + explicit half(float rhs) : data_(detail::float2half<round_style>(rhs)) {} + + /// Conversion to single-precision. + /// \return single precision value representing expression value + operator float() const { return detail::half2float<float>(data_); } + + /// Assignment operator. + /// \tparam T type of concrete half expression + /// \param rhs half expression to copy from + /// \return reference to this half + half& operator=(detail::expr rhs) { return *this = static_cast<float>(rhs); } + + /// Arithmetic assignment. + /// \tparam T type of concrete half expression + /// \param rhs half expression to add + /// \return reference to this half + template<typename T> typename detail::enable<half&,T>::type operator+=(T rhs) { return *this += static_cast<float>(rhs); } + + /// Arithmetic assignment. + /// \tparam T type of concrete half expression + /// \param rhs half expression to subtract + /// \return reference to this half + template<typename T> typename detail::enable<half&,T>::type operator-=(T rhs) { return *this -= static_cast<float>(rhs); } + + /// Arithmetic assignment. + /// \tparam T type of concrete half expression + /// \param rhs half expression to multiply with + /// \return reference to this half + template<typename T> typename detail::enable<half&,T>::type operator*=(T rhs) { return *this *= static_cast<float>(rhs); } + + /// Arithmetic assignment. + /// \tparam T type of concrete half expression + /// \param rhs half expression to divide by + /// \return reference to this half + template<typename T> typename detail::enable<half&,T>::type operator/=(T rhs) { return *this /= static_cast<float>(rhs); } + + /// Assignment operator. + /// \param rhs single-precision value to copy from + /// \return reference to this half + half& operator=(float rhs) { data_ = detail::float2half<round_style>(rhs); return *this; } + + /// Arithmetic assignment. + /// \param rhs single-precision value to add + /// \return reference to this half + half& operator+=(float rhs) { data_ = detail::float2half<round_style>(detail::half2float<float>(data_)+rhs); return *this; } + + /// Arithmetic assignment. + /// \param rhs single-precision value to subtract + /// \return reference to this half + half& operator-=(float rhs) { data_ = detail::float2half<round_style>(detail::half2float<float>(data_)-rhs); return *this; } + + /// Arithmetic assignment.
+ /// \param rhs single-precision value to multiply with + /// \return reference to this half + half& operator*=(float rhs) { data_ = detail::float2half(detail::half2float(data_)*rhs); return *this; } + + /// Arithmetic assignment. + /// \param rhs single-precision value to divide by + /// \return reference to this half + half& operator/=(float rhs) { data_ = detail::float2half(detail::half2float(data_)/rhs); return *this; } + + /// Prefix increment. + /// \return incremented half value + half& operator++() { return *this += 1.0f; } + + /// Prefix decrement. + /// \return decremented half value + half& operator--() { return *this -= 1.0f; } + + /// Postfix increment. + /// \return non-incremented half value + half operator++(int) { half out(*this); ++*this; return out; } + + /// Postfix decrement. + /// \return non-decremented half value + half operator--(int) { half out(*this); --*this; return out; } + + private: + /// Rounding mode to use + static const std::float_round_style round_style = (std::float_round_style)(HALF_ROUND_STYLE); + + /// Constructor. + /// \param bits binary representation to set half to + HALF_CONSTEXPR half(detail::binary_t, detail::uint16 bits) HALF_NOEXCEPT : data_(bits) {} + + /// Internal binary representation + detail::uint16 data_; + }; + +#if HALF_ENABLE_CPP11_USER_LITERALS + namespace literal + { + /// Half literal. + /// While this returns an actual half-precision value, half literals can unfortunately not be constant expressions due + /// to rather involved conversions. + /// \param value literal value + /// \return half with given value (if representable) + inline half operator""_h(long double value) { return half(detail::binary, detail::float2half(value)); } + } +#endif + + namespace detail + { + /// Wrapper implementing unspecialized half-precision functions. + struct functions + { + /// Addition implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision sum stored in single-precision + static expr plus(float x, float y) { return expr(x+y); } + + /// Subtraction implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision difference stored in single-precision + static expr minus(float x, float y) { return expr(x-y); } + + /// Multiplication implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision product stored in single-precision + static expr multiplies(float x, float y) { return expr(x*y); } + + /// Division implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision quotient stored in single-precision + static expr divides(float x, float y) { return expr(x/y); } + + /// Output implementation. + /// \param out stream to write to + /// \param arg value to write + /// \return reference to stream + template static std::basic_ostream& write(std::basic_ostream &out, float arg) { return out << arg; } + + /// Input implementation. + /// \param in stream to read from + /// \param arg half to read into + /// \return reference to stream + template static std::basic_istream& read(std::basic_istream &in, half &arg) + { + float f; + if(in >> f) + arg = f; + return in; + } + + /// Modulo implementation. + /// \param x first operand + /// \param y second operand + /// \return Half-precision division remainder stored in single-precision + static expr fmod(float x, float y) { return expr(std::fmod(x, y)); } + + /// Remainder implementation. 
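The functions wrapper above realizes the design described in the class documentation: every arithmetic result comes back as an expr, i.e. an unrounded single-precision proxy, so a chain like a+b+c rounds to half only once, at the final assignment. A toy re-creation of that scheme (entirely hypothetical types; a crude 11-bit quantizer stands in for the real float2half/half2float round trip):

    #include <cmath>
    #include <cstdio>

    struct toy_expr { float v; };   // unrounded single-precision intermediate

    // Crude stand-in for float -> half -> float: keep 11 significant bits.
    static float quantize(float f)
    {
        if(f == 0.0f) return f;
        int e; float m = std::frexp(f, &e);                       // f = m * 2^e
        return std::ldexp(std::nearbyint(std::ldexp(m, 11)), e - 11);
    }

    struct toy_half
    {
        float stored;
        explicit toy_half(float f) : stored(quantize(f)) {}
        toy_half& operator=(toy_expr rhs) { stored = quantize(rhs.v); return *this; }
    };

    // Operations return the proxy, never a rounded toy_half.
    toy_expr operator+(toy_half x, toy_half y) { return {x.stored + y.stored}; }
    toy_expr operator+(toy_expr x, toy_half y) { return {x.v + y.stored}; }

    int main()
    {
        toy_half a(1.0f), b(0.0004f), c(0.0006f), r(0.0f);
        r = a + b + c;                 // intermediates stay in float
        std::printf("%g\n", r.stored); // ~1.00098: the small addends survive
    }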
+ /// \param x first operand + /// \param y second operand + /// \return Half-precision division remainder stored in single-precision + static expr remainder(float x, float y) + { + #if HALF_ENABLE_CPP11_CMATH + return expr(std::remainder(x, y)); + #else + if(builtin_isnan(x) || builtin_isnan(y)) + return expr(std::numeric_limits<float>::quiet_NaN()); + float ax = std::fabs(x), ay = std::fabs(y); + if(ax >= 65536.0f || ay < std::ldexp(1.0f, -24)) + return expr(std::numeric_limits<float>::quiet_NaN()); + if(ay >= 65536.0f) + return expr(x); + if(ax == ay) + return expr(builtin_signbit(x) ? -0.0f : 0.0f); + ax = std::fmod(ax, ay+ay); + float y2 = 0.5f * ay; + if(ax > y2) + { + ax -= ay; + if(ax >= y2) + ax -= ay; + } + return expr(builtin_signbit(x) ? -ax : ax); + #endif + } + + /// Remainder implementation. + /// \param x first operand + /// \param y second operand + /// \param quo address to store quotient bits at + /// \return Half-precision division remainder stored in single-precision + static expr remquo(float x, float y, int *quo) + { + #if HALF_ENABLE_CPP11_CMATH + return expr(std::remquo(x, y, quo)); + #else + if(builtin_isnan(x) || builtin_isnan(y)) + return expr(std::numeric_limits<float>::quiet_NaN()); + bool sign = builtin_signbit(x), qsign = static_cast<bool>(sign^builtin_signbit(y)); + float ax = std::fabs(x), ay = std::fabs(y); + if(ax >= 65536.0f || ay < std::ldexp(1.0f, -24)) + return expr(std::numeric_limits<float>::quiet_NaN()); + if(ay >= 65536.0f) + return expr(x); + if(ax == ay) + return *quo = qsign ? -1 : 1, expr(sign ? -0.0f : 0.0f); + ax = std::fmod(ax, 8.0f*ay); + int cquo = 0; + if(ax >= 4.0f * ay) + { + ax -= 4.0f * ay; + cquo += 4; + } + if(ax >= 2.0f * ay) + { + ax -= 2.0f * ay; + cquo += 2; + } + float y2 = 0.5f * ay; + if(ax > y2) + { + ax -= ay; + ++cquo; + if(ax >= y2) + { + ax -= ay; + ++cquo; + } + } + return *quo = qsign ? -cquo : cquo, expr(sign ? -ax : ax); + #endif + } + + /// Positive difference implementation. + /// \param x first operand + /// \param y second operand + /// \return Positive difference stored in single-precision + static expr fdim(float x, float y) + { + #if HALF_ENABLE_CPP11_CMATH + return expr(std::fdim(x, y)); + #else + return expr((x<=y) ? 0.0f : (x-y)); + #endif + } + + /// Fused multiply-add implementation. + /// \param x first operand + /// \param y second operand + /// \param z third operand + /// \return \a x * \a y + \a z stored in single-precision + static expr fma(float x, float y, float z) + { + #if HALF_ENABLE_CPP11_CMATH && defined(FP_FAST_FMAF) + return expr(std::fma(x, y, z)); + #else + return expr(x*y+z); + #endif + } + + /// Get NaN. + /// \return Half-precision quiet NaN + static half nanh() { return half(binary, 0x7FFF); } + + /// Exponential implementation. + /// \param arg function argument + /// \return function value stored in single-precision + static expr exp(float arg) { return expr(std::exp(arg)); } + + /// Exponential minus one implementation. + /// \param arg function argument + /// \return function value stored in single-precision + static expr expm1(float arg) + { + #if HALF_ENABLE_CPP11_CMATH + return expr(std::expm1(arg)); + #else + return expr(static_cast<float>(std::exp(static_cast<double>(arg))-1.0)); + #endif + } +
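The #else branch of remainder() above emulates C++11 std::remainder() with std::fmod() plus a correction toward the nearest multiple. The two primitives genuinely differ, which a quick standalone demo shows:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        // fmod reduces toward zero and keeps the sign of x; remainder picks
        // the *nearest* multiple of y, so it can be negative for positive x.
        std::printf("%g\n", std::fmod(7.0, 2.0));       // 1
        std::printf("%g\n", std::remainder(7.0, 2.0));  // -1, since 7 is closer to 4*2=8
    }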
+ /// Binary exponential implementation. + /// \param arg function argument + /// \return function value stored in single-precision + static expr exp2(float arg) + { + #if HALF_ENABLE_CPP11_CMATH + return expr(std::exp2(arg)); + #else + return expr(static_cast<float>(std::exp(arg*0.69314718055994530941723212145818))); + #endif + } + + /// Logarithm implementation. + /// \param arg function argument + /// \return function value stored in single-precision + static expr log(float arg) { return expr(std::log(arg)); } + + /// Common logarithm implementation. + /// \param arg function argument + /// \return function value stored in single-precision + static expr log10(float arg) { return expr(std::log10(arg)); } + + /// Logarithm implementation. + /// \param arg function argument + /// \return function value stored in single-precision + static expr log1p(float arg) + { + #if HALF_ENABLE_CPP11_CMATH + return expr(std::log1p(arg)); + #else + return expr(static_cast<float>(std::log(1.0+arg))); + #endif + } + + /// Binary logarithm implementation. + /// \param arg function argument + /// \return function value stored in single-precision + static expr log2(float arg) + { + #if HALF_ENABLE_CPP11_CMATH + return expr(std::log2(arg)); + #else + return expr(static_cast<float>(std::log(static_cast<double>(arg))*1.4426950408889634073599246810019)); + #endif + } + + /// Square root implementation. + /// \param arg function argument + /// \return function value stored in single-precision + static expr sqrt(float arg) { return expr(std::sqrt(arg)); } + + /// Cubic root implementation. + /// \param arg function argument + /// \return function value stored in single-precision + static expr cbrt(float arg) + { + #if HALF_ENABLE_CPP11_CMATH + return expr(std::cbrt(arg)); + #else + if(builtin_isnan(arg) || builtin_isinf(arg)) + return expr(arg); + return expr(builtin_signbit(arg) ? -static_cast<float>(std::pow(-static_cast<double>(arg), 1.0/3.0)) : + static_cast<float>(std::pow(static_cast<double>(arg), 1.0/3.0))); + #endif + } + + /// Hypotenuse implementation. + /// \param x first argument + /// \param y second argument + /// \return function value stored in single-precision + static expr hypot(float x, float y) + { + #if HALF_ENABLE_CPP11_CMATH + return expr(std::hypot(x, y)); + #else + return expr((builtin_isinf(x) || builtin_isinf(y)) ? std::numeric_limits<float>::infinity() : + static_cast<float>(std::sqrt(static_cast<double>(x)*x+static_cast<double>(y)*y))); + #endif + } + + /// Power implementation. + /// \param base value to exponentiate + /// \param exp power to exponentiate to + /// \return function value stored in single-precision + static expr pow(float base, float exp) { return expr(std::pow(base, exp)); } + + /// Sine implementation. + /// \param arg function argument + /// \return function value stored in single-precision + static expr sin(float arg) { return expr(std::sin(arg)); } + + /// Cosine implementation. + /// \param arg function argument + /// \return function value stored in single-precision + static expr cos(float arg) { return expr(std::cos(arg)); } + + /// Tangent implementation. + /// \param arg function argument + /// \return function value stored in single-precision + static expr tan(float arg) { return expr(std::tan(arg)); } + + /// Arc sine implementation. + /// \param arg function argument + /// \return function value stored in single-precision + static expr asin(float arg) { return expr(std::asin(arg)); } +
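The exp2/log2 fallbacks above route base-2 operations through the natural exponential and logarithm; the hard-coded constants are ln 2 and 1/ln 2. A quick check of the identities they rely on (2^x = e^(x·ln 2), log2 x = ln x · 1/ln 2):

    #include <cmath>
    #include <cstdio>

    int main()
    {
        double x = 10.0;
        // Same constants as in the fallbacks above.
        std::printf("%.6g\n", std::exp(x * 0.69314718055994530941723212145818)); // ~1024
        std::printf("%.6g\n", std::log(x) * 1.4426950408889634073599246810019);  // ~3.32193
    }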
+ /// Arc cosine implementation. + /// \param arg function argument + /// \return function value stored in single-precision + static expr acos(float arg) { return expr(std::acos(arg)); } + + /// Arc tangent implementation. + /// \param arg function argument + /// \return function value stored in single-precision + static expr atan(float arg) { return expr(std::atan(arg)); } + + /// Arc tangent implementation. + /// \param x first argument + /// \param y second argument + /// \return function value stored in single-precision + static expr atan2(float x, float y) { return expr(std::atan2(x, y)); } + + /// Hyperbolic sine implementation. + /// \param arg function argument + /// \return function value stored in single-precision + static expr sinh(float arg) { return expr(std::sinh(arg)); } + + /// Hyperbolic cosine implementation. + /// \param arg function argument + /// \return function value stored in single-precision + static expr cosh(float arg) { return expr(std::cosh(arg)); } + + /// Hyperbolic tangent implementation. + /// \param arg function argument + /// \return function value stored in single-precision + static expr tanh(float arg) { return expr(std::tanh(arg)); } + + /// Hyperbolic area sine implementation. + /// \param arg function argument + /// \return function value stored in single-precision + static expr asinh(float arg) + { + #if HALF_ENABLE_CPP11_CMATH + return expr(std::asinh(arg)); + #else + return expr((arg==-std::numeric_limits<float>::infinity()) ? arg : static_cast<float>(std::log(arg+std::sqrt(arg*arg+1.0)))); + #endif + } + + /// Hyperbolic area cosine implementation. + /// \param arg function argument + /// \return function value stored in single-precision + static expr acosh(float arg) + { + #if HALF_ENABLE_CPP11_CMATH + return expr(std::acosh(arg)); + #else + return expr((arg<-1.0f) ? std::numeric_limits<float>::quiet_NaN() : static_cast<float>(std::log(arg+std::sqrt(arg*arg-1.0)))); + #endif + } + + /// Hyperbolic area tangent implementation. + /// \param arg function argument + /// \return function value stored in single-precision + static expr atanh(float arg) + { + #if HALF_ENABLE_CPP11_CMATH + return expr(std::atanh(arg)); + #else + return expr(static_cast<float>(0.5*std::log((1.0+arg)/(1.0-arg)))); + #endif + } + + /// Error function implementation. + /// \param arg function argument + /// \return function value stored in single-precision + static expr erf(float arg) + { + #if HALF_ENABLE_CPP11_CMATH + return expr(std::erf(arg)); + #else + return expr(static_cast<float>(erf(static_cast<double>(arg)))); + #endif + } + + /// Complementary error function implementation. + /// \param arg function argument + /// \return function value stored in single-precision + static expr erfc(float arg) + { + #if HALF_ENABLE_CPP11_CMATH + return expr(std::erfc(arg)); + #else + return expr(static_cast<float>(1.0-erf(static_cast<double>(arg)))); + #endif + } + + /// Gamma logarithm implementation. + /// \param arg function argument + /// \return function value stored in single-precision + static expr lgamma(float arg) + { + #if HALF_ENABLE_CPP11_CMATH + return expr(std::lgamma(arg)); + #else + if(builtin_isinf(arg)) + return expr(std::numeric_limits<float>::infinity()); + if(arg < 0.0f) + { + float i, f = std::modf(-arg, &i); + if(f == 0.0f) + return expr(std::numeric_limits<float>::infinity()); + return expr(static_cast<float>(1.1447298858494001741434273513531- + std::log(std::abs(std::sin(3.1415926535897932384626433832795*f)))-lgamma(1.0-arg))); + } + return expr(static_cast<float>(lgamma(static_cast<double>(arg)))); + #endif + } +
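The negative-argument branch of the lgamma fallback above is the logarithmic reflection formula, lgamma(x) = log(pi) - log|sin(pi*x)| - lgamma(1-x), with log(pi) = 1.14472988... hard-coded. A numerical spot check against the C++11 std::lgamma:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        double x = -0.5;
        // Same constants as above: log(pi) and pi.
        double ref = 1.1447298858494001741434273513531
                   - std::log(std::fabs(std::sin(3.1415926535897932384626433832795*x)))
                   - std::lgamma(1.0 - x);
        std::printf("%.12g vs %.12g\n", ref, std::lgamma(x));   // both ~1.26551212349
    }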
+ /// Gamma implementation. + /// \param arg function argument + /// \return function value stored in single-precision + static expr tgamma(float arg) + { + #if HALF_ENABLE_CPP11_CMATH + return expr(std::tgamma(arg)); + #else + if(arg == 0.0f) + return builtin_signbit(arg) ? expr(-std::numeric_limits<float>::infinity()) : expr(std::numeric_limits<float>::infinity()); + if(arg < 0.0f) + { + float i, f = std::modf(-arg, &i); + if(f == 0.0f) + return expr(std::numeric_limits<float>::quiet_NaN()); + double value = 3.1415926535897932384626433832795 / (std::sin(3.1415926535897932384626433832795*f)*std::exp(lgamma(1.0-arg))); + return expr(static_cast<float>((std::fmod(i, 2.0f)==0.0f) ? -value : value)); + } + if(builtin_isinf(arg)) + return expr(arg); + return expr(static_cast<float>(std::exp(lgamma(static_cast<double>(arg))))); + #endif + } + + /// Floor implementation. + /// \param arg value to round + /// \return rounded value + static half floor(half arg) { return half(binary, round_half<std::round_toward_neg_infinity>(arg.data_)); } + + /// Ceiling implementation. + /// \param arg value to round + /// \return rounded value + static half ceil(half arg) { return half(binary, round_half<std::round_toward_infinity>(arg.data_)); } + + /// Truncation implementation. + /// \param arg value to round + /// \return rounded value + static half trunc(half arg) { return half(binary, round_half<std::round_toward_zero>(arg.data_)); } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static half round(half arg) { return half(binary, round_half_up(arg.data_)); } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static long lround(half arg) { return detail::half2int_up<long>(arg.data_); } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static half rint(half arg) { return half(binary, round_half<half::round_style>(arg.data_)); } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static long lrint(half arg) { return detail::half2int<half::round_style,long>(arg.data_); } + + #if HALF_ENABLE_CPP11_LONG_LONG + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static long long llround(half arg) { return detail::half2int_up<long long>(arg.data_); } + + /// Nearest integer implementation. + /// \param arg value to round + /// \return rounded value + static long long llrint(half arg) { return detail::half2int<half::round_style,long long>(arg.data_); } + #endif + + /// Decompression implementation. + /// \param arg number to decompress + /// \param exp address to store exponent at + /// \return normalized significand + static half frexp(half arg, int *exp) + { + int m = arg.data_ & 0x7FFF, e = -14; + if(m >= 0x7C00 || !m) + return *exp = 0, arg; + for(; m<0x400; m<<=1,--e) ; + return *exp = e+(m>>10), half(binary, (arg.data_&0x8000)|0x3800|(m&0x3FF)); + } + + /// Decompression implementation. + /// \param arg number to decompress + /// \param iptr address to store integer part at + /// \return fractional part + static half modf(half arg, half *iptr) + { + unsigned int e = arg.data_ & 0x7FFF; + if(e >= 0x6400) + return *iptr = arg, half(binary, arg.data_&(0x8000U|-(e>0x7C00))); + if(e < 0x3C00) + return iptr->data_ = arg.data_ & 0x8000, arg; + e >>= 10; + unsigned int mask = (1<<(25-e)) - 1, m = arg.data_ & mask; + iptr->data_ = arg.data_ & ~mask; + if(!m) + return half(binary, arg.data_&0x8000); + for(; m<0x400; m<<=1,--e) ; + return half(binary, static_cast<uint16>((arg.data_&0x8000)|(e<<10)|(m&0x3FF))); + } +
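frexp above never leaves the 16-bit encoding: it re-biases the stored exponent and returns a significand in [0.5, 1). Assuming the free-function wrappers this header declares further down, and the usual half_float namespace and half.hpp include name (assumptions, not shown in this hunk), usage looks like:

    #include <cstdio>
    #include "half.hpp"   // hypothetical include name

    int main()
    {
        int e;
        // frexp is found via argument-dependent lookup on half_float::half.
        half_float::half m = frexp(half_float::half(6.0f), &e);
        std::printf("%g * 2^%d\n", static_cast<float>(m), e);   // 0.75 * 2^3
    }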
+ /// Scaling implementation. + /// \param arg number to scale + /// \param exp power of two to scale by + /// \return scaled number + static half scalbln(half arg, long exp) + { + unsigned int m = arg.data_ & 0x7FFF; + if(m >= 0x7C00 || !m) + return arg; + for(; m<0x400; m<<=1,--exp) ; + exp += m >> 10; + uint16 value = arg.data_ & 0x8000; + if(exp > 30) + { + if(half::round_style == std::round_toward_zero) + value |= 0x7BFF; + else if(half::round_style == std::round_toward_infinity) + value |= 0x7C00 - (value>>15); + else if(half::round_style == std::round_toward_neg_infinity) + value |= 0x7BFF + (value>>15); + else + value |= 0x7C00; + } + else if(exp > 0) + value |= (exp<<10) | (m&0x3FF); + else if(exp > -11) + { + m = (m&0x3FF) | 0x400; + if(half::round_style == std::round_to_nearest) + { + m += 1 << -exp; + #if HALF_ROUND_TIES_TO_EVEN + m -= (m>>(1-exp)) & 1; + #endif + } + else if(half::round_style == std::round_toward_infinity) + m += ((value>>15)-1) & ((1<<(1-exp))-1U); + else if(half::round_style == std::round_toward_neg_infinity) + m += -(value>>15) & ((1<<(1-exp))-1U); + value |= m >> (1-exp); + } + else if(half::round_style == std::round_toward_infinity) + value -= (value>>15) - 1; + else if(half::round_style == std::round_toward_neg_infinity) + value += value >> 15; + return half(binary, value); + } + + /// Exponent implementation. + /// \param arg number to query + /// \return floating point exponent + static int ilogb(half arg) + { + int abs = arg.data_ & 0x7FFF; + if(!abs) + return FP_ILOGB0; + if(abs < 0x7C00) + { + int exp = (abs>>10) - 15; + if(abs < 0x400) + for(; abs<0x200; abs<<=1,--exp) ; + return exp; + } + if(abs > 0x7C00) + return FP_ILOGBNAN; + return INT_MAX; + } + + /// Exponent implementation. + /// \param arg number to query + /// \return floating point exponent + static half logb(half arg) + { + int abs = arg.data_ & 0x7FFF; + if(!abs) + return half(binary, 0xFC00); + if(abs < 0x7C00) + { + int exp = (abs>>10) - 15; + if(abs < 0x400) + for(; abs<0x200; abs<<=1,--exp) ; + uint16 bits = (exp<0) << 15; + if(exp) + { + unsigned int m = std::abs(exp) << 6, e = 18; + for(; m<0x400; m<<=1,--e) ; + bits |= (e<<10) + m; + } + return half(binary, bits); + } + if(abs > 0x7C00) + return arg; + return half(binary, 0x7C00); + } + + /// Enumeration implementation. + /// \param from number to increase/decrease + /// \param to direction to enumerate into + /// \return next representable number + static half nextafter(half from, half to) + { + uint16 fabs = from.data_ & 0x7FFF, tabs = to.data_ & 0x7FFF; + if(fabs > 0x7C00) + return from; + if(tabs > 0x7C00 || from.data_ == to.data_ || !(fabs|tabs)) + return to; + if(!fabs) + return half(binary, (to.data_&0x8000)+1); + bool lt = ((fabs==from.data_) ? static_cast<int>(fabs) : -static_cast<int>(fabs)) < + ((tabs==to.data_) ? static_cast<int>(tabs) : -static_cast<int>(tabs)); + return half(binary, from.data_+(((from.data_>>15)^static_cast<uint16>(lt))<<1)-1); + } + + /// Enumeration implementation. + /// \param from number to increase/decrease + /// \param to direction to enumerate into + /// \return next representable number + static half nexttoward(half from, long double to) + { + if(isnan(from)) + return from; + long double lfrom = static_cast<long double>(from); + if(builtin_isnan(to) || lfrom == to) + return half(static_cast<float>(to)); + if(!(from.data_&0x7FFF)) + return half(binary, (static_cast<uint16>(builtin_signbit(to))<<15)+1); + return half(binary, from.data_+(((from.data_>>15)^static_cast<uint16>(lfrom<to))<<1)-1); + } + + /// Classification implementation. + /// \param arg value to classify + /// \return floating point classification + static int fpclassify(half arg) + { + unsigned int abs = arg.data_ & 0x7FFF; + return abs ? ((abs>0x3FF) ? ((abs>=0x7C00) ? ((abs>0x7C00) ?
FP_NAN : FP_INFINITE) : FP_NORMAL) :FP_SUBNORMAL) : FP_ZERO; + } + + /// Classification implementation. + /// \param arg value to classify + /// \retval true if finite number + /// \retval false else + static bool isfinite(half arg) { return (arg.data_&0x7C00) != 0x7C00; } + + /// Classification implementation. + /// \param arg value to classify + /// \retval true if infinite number + /// \retval false else + static bool isinf(half arg) { return (arg.data_&0x7FFF) == 0x7C00; } + + /// Classification implementation. + /// \param arg value to classify + /// \retval true if not a number + /// \retval false else + static bool isnan(half arg) { return (arg.data_&0x7FFF) > 0x7C00; } + + /// Classification implementation. + /// \param arg value to classify + /// \retval true if normal number + /// \retval false else + static bool isnormal(half arg) { return ((arg.data_&0x7C00)!=0) & ((arg.data_&0x7C00)!=0x7C00); } + + /// Sign bit implementation. + /// \param arg value to check + /// \retval true if signed + /// \retval false if unsigned + static bool signbit(half arg) { return (arg.data_&0x8000) != 0; } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if operands equal + /// \retval false else + static bool isequal(half x, half y) { return (x.data_==y.data_ || !((x.data_|y.data_)&0x7FFF)) && !isnan(x); } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if operands not equal + /// \retval false else + static bool isnotequal(half x, half y) { return (x.data_!=y.data_ && ((x.data_|y.data_)&0x7FFF)) || isnan(x); } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x > \a y + /// \retval false else + static bool isgreater(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + return xabs<=0x7C00 && yabs<=0x7C00 && (((xabs==x.data_) ? xabs : -xabs) > ((yabs==y.data_) ? yabs : -yabs)); + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x >= \a y + /// \retval false else + static bool isgreaterequal(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + return xabs<=0x7C00 && yabs<=0x7C00 && (((xabs==x.data_) ? xabs : -xabs) >= ((yabs==y.data_) ? yabs : -yabs)); + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x < \a y + /// \retval false else + static bool isless(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + return xabs<=0x7C00 && yabs<=0x7C00 && (((xabs==x.data_) ? xabs : -xabs) < ((yabs==y.data_) ? yabs : -yabs)); + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x <= \a y + /// \retval false else + static bool islessequal(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + return xabs<=0x7C00 && yabs<=0x7C00 && (((xabs==x.data_) ? xabs : -xabs) <= ((yabs==y.data_) ? yabs : -yabs)); + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if either \a x > \a y nor \a x < \a y + /// \retval false else + static bool islessgreater(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + if(xabs > 0x7C00 || yabs > 0x7C00) + return false; + int a = (xabs==x.data_) ? xabs : -xabs, b = (yabs==y.data_) ? 
yabs : -yabs; + return a < b || a > b; + } + + /// Comparison implementation. + /// \param x first operand + /// \param y second operand + /// \retval true if operand unordered + /// \retval false else + static bool isunordered(half x, half y) { return isnan(x) || isnan(y); } + + private: + static double erf(double arg) + { + if(builtin_isinf(arg)) + return (arg<0.0) ? -1.0 : 1.0; + double x2 = arg * arg, ax2 = 0.147 * x2, value = std::sqrt(1.0-std::exp(-x2*(1.2732395447351626861510701069801+ax2)/(1.0+ax2))); + return builtin_signbit(arg) ? -value : value; + } + + static double lgamma(double arg) + { + double v = 1.0; + for(; arg<8.0; ++arg) v *= arg; + double w = 1.0 / (arg*arg); + return (((((((-0.02955065359477124183006535947712*w+0.00641025641025641025641025641026)*w+ + -0.00191752691752691752691752691753)*w+8.4175084175084175084175084175084e-4)*w+ + -5.952380952380952380952380952381e-4)*w+7.9365079365079365079365079365079e-4)*w+ + -0.00277777777777777777777777777778)*w+0.08333333333333333333333333333333)/arg + + 0.91893853320467274178032973640562 - std::log(v) - arg + (arg-0.5) * std::log(arg); + } + }; + + /// Wrapper for unary half-precision functions needing specialization for individual argument types. + /// \tparam T argument type + template struct unary_specialized + { + /// Negation implementation. + /// \param arg value to negate + /// \return negated value + static HALF_CONSTEXPR half negate(half arg) { return half(binary, arg.data_^0x8000); } + + /// Absolute value implementation. + /// \param arg function argument + /// \return absolute value + static half fabs(half arg) { return half(binary, arg.data_&0x7FFF); } + }; + template<> struct unary_specialized + { + static HALF_CONSTEXPR expr negate(float arg) { return expr(-arg); } + static expr fabs(float arg) { return expr(std::fabs(arg)); } + }; + + /// Wrapper for binary half-precision functions needing specialization for individual argument types. + /// \tparam T first argument type + /// \tparam U first argument type + template struct binary_specialized + { + /// Minimum implementation. + /// \param x first operand + /// \param y second operand + /// \return minimum value + static expr fmin(float x, float y) + { + #if HALF_ENABLE_CPP11_CMATH + return expr(std::fmin(x, y)); + #else + if(builtin_isnan(x)) + return expr(y); + if(builtin_isnan(y)) + return expr(x); + return expr(std::min(x, y)); + #endif + } + + /// Maximum implementation. + /// \param x first operand + /// \param y second operand + /// \return maximum value + static expr fmax(float x, float y) + { + #if HALF_ENABLE_CPP11_CMATH + return expr(std::fmax(x, y)); + #else + if(builtin_isnan(x)) + return expr(y); + if(builtin_isnan(y)) + return expr(x); + return expr(std::max(x, y)); + #endif + } + }; + template<> struct binary_specialized + { + static half fmin(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + if(xabs > 0x7C00) + return y; + if(yabs > 0x7C00) + return x; + return (((xabs==x.data_) ? xabs : -xabs) > ((yabs==y.data_) ? yabs : -yabs)) ? y : x; + } + static half fmax(half x, half y) + { + int xabs = x.data_ & 0x7FFF, yabs = y.data_ & 0x7FFF; + if(xabs > 0x7C00) + return y; + if(yabs > 0x7C00) + return x; + return (((xabs==x.data_) ? xabs : -xabs) < ((yabs==y.data_) ? yabs : -yabs)) ? y : x; + } + }; + + /// Helper class for half casts. 
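The private erf() helper above is a closed-form approximation (a Winitzki-style formula with a = 0.147 and 4/pi hard-coded); copied out verbatim it is accurate to a few decimal places, more than enough for half precision's roughly three significant digits:

    #include <cmath>
    #include <cstdio>

    // Same formula as the private helper above, lifted into a free function.
    double erf_approx(double arg)
    {
        double x2 = arg * arg, ax2 = 0.147 * x2;
        double value = std::sqrt(1.0 - std::exp(-x2*(1.2732395447351626861510701069801+ax2)/(1.0+ax2)));
        return std::signbit(arg) ? -value : value;
    }

    int main()
    {
        std::printf("%.6f vs %.6f\n", erf_approx(1.0), std::erf(1.0));  // ~0.8427 for both
    }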
+ /// This class template has to be specialized for all valid cast argument to define an appropriate static `cast` member + /// function and a corresponding `type` member denoting its return type. + /// \tparam T destination type + /// \tparam U source type + /// \tparam R rounding mode to use + template struct half_caster {}; + template struct half_caster + { + #if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS + static_assert(std::is_arithmetic::value, "half_cast from non-arithmetic type unsupported"); + #endif + + static half cast(U arg) { return cast_impl(arg, is_float()); }; + + private: + static half cast_impl(U arg, true_type) { return half(binary, float2half(arg)); } + static half cast_impl(U arg, false_type) { return half(binary, int2half(arg)); } + }; + template struct half_caster + { + #if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS + static_assert(std::is_arithmetic::value, "half_cast to non-arithmetic type unsupported"); + #endif + + static T cast(half arg) { return cast_impl(arg, is_float()); } + + private: + static T cast_impl(half arg, true_type) { return half2float(arg.data_); } + static T cast_impl(half arg, false_type) { return half2int(arg.data_); } + }; + template struct half_caster + { + #if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS + static_assert(std::is_arithmetic::value, "half_cast to non-arithmetic type unsupported"); + #endif + + static T cast(expr arg) { return cast_impl(arg, is_float()); } + + private: + static T cast_impl(float arg, true_type) { return static_cast(arg); } + static T cast_impl(half arg, false_type) { return half2int(arg.data_); } + }; + template struct half_caster + { + static half cast(half arg) { return arg; } + }; + template struct half_caster : half_caster {}; + + /// \name Comparison operators + /// \{ + + /// Comparison for equality. + /// \param x first operand + /// \param y second operand + /// \retval true if operands equal + /// \retval false else + template typename enable::type operator==(T x, U y) { return functions::isequal(x, y); } + + /// Comparison for inequality. + /// \param x first operand + /// \param y second operand + /// \retval true if operands not equal + /// \retval false else + template typename enable::type operator!=(T x, U y) { return functions::isnotequal(x, y); } + + /// Comparison for less than. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x less than \a y + /// \retval false else + template typename enable::type operator<(T x, U y) { return functions::isless(x, y); } + + /// Comparison for greater than. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x greater than \a y + /// \retval false else + template typename enable::type operator>(T x, U y) { return functions::isgreater(x, y); } + + /// Comparison for less equal. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x less equal \a y + /// \retval false else + template typename enable::type operator<=(T x, U y) { return functions::islessequal(x, y); } + + /// Comparison for greater equal. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x greater equal \a y + /// \retval false else + template typename enable::type operator>=(T x, U y) { return functions::isgreaterequal(x, y); } + + /// \} + /// \name Arithmetic operators + /// \{ + + /// Add halfs. 
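half_caster above is the machinery behind the library's explicit conversion entry point: unlike the implicit float route, it converts directly and lets the caller pick a rounding mode per cast. A usage sketch, assuming the public half_cast wrappers that this header declares further down and the usual half_float namespace (assumptions not visible in this hunk):

    #include <limits>
    #include "half.hpp"   // hypothetical include name

    int main()
    {
        // double -> half through the float2half path of half_caster.
        half_float::half h = half_float::half_cast<half_float::half>(3.14159);
        // half -> int with an explicit per-cast rounding mode (truncation).
        int i = half_float::half_cast<int, std::round_toward_zero>(h);
        return i == 3 ? 0 : 1;
    }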
+ /// \param x left operand + /// \param y right operand + /// \return sum of half expressions + template typename enable::type operator+(T x, U y) { return functions::plus(x, y); } + + /// Subtract halfs. + /// \param x left operand + /// \param y right operand + /// \return difference of half expressions + template typename enable::type operator-(T x, U y) { return functions::minus(x, y); } + + /// Multiply halfs. + /// \param x left operand + /// \param y right operand + /// \return product of half expressions + template typename enable::type operator*(T x, U y) { return functions::multiplies(x, y); } + + /// Divide halfs. + /// \param x left operand + /// \param y right operand + /// \return quotient of half expressions + template typename enable::type operator/(T x, U y) { return functions::divides(x, y); } + + /// Identity. + /// \param arg operand + /// \return uncahnged operand + template HALF_CONSTEXPR typename enable::type operator+(T arg) { return arg; } + + /// Negation. + /// \param arg operand + /// \return negated operand + template HALF_CONSTEXPR typename enable::type operator-(T arg) { return unary_specialized::negate(arg); } + + /// \} + /// \name Input and output + /// \{ + + /// Output operator. + /// \param out output stream to write into + /// \param arg half expression to write + /// \return reference to output stream + template typename enable&,T>::type + operator<<(std::basic_ostream &out, T arg) { return functions::write(out, arg); } + + /// Input operator. + /// \param in input stream to read from + /// \param arg half to read into + /// \return reference to input stream + template std::basic_istream& + operator>>(std::basic_istream &in, half &arg) { return functions::read(in, arg); } + + /// \} + /// \name Basic mathematical operations + /// \{ + + /// Absolute value. + /// \param arg operand + /// \return absolute value of \a arg +// template typename enable::type abs(T arg) { return unary_specialized::fabs(arg); } + inline half abs(half arg) { return unary_specialized::fabs(arg); } + inline expr abs(expr arg) { return unary_specialized::fabs(arg); } + + /// Absolute value. + /// \param arg operand + /// \return absolute value of \a arg +// template typename enable::type fabs(T arg) { return unary_specialized::fabs(arg); } + inline half fabs(half arg) { return unary_specialized::fabs(arg); } + inline expr fabs(expr arg) { return unary_specialized::fabs(arg); } + + /// Remainder of division. + /// \param x first operand + /// \param y second operand + /// \return remainder of floating point division. +// template typename enable::type fmod(T x, U y) { return functions::fmod(x, y); } + inline expr fmod(half x, half y) { return functions::fmod(x, y); } + inline expr fmod(half x, expr y) { return functions::fmod(x, y); } + inline expr fmod(expr x, half y) { return functions::fmod(x, y); } + inline expr fmod(expr x, expr y) { return functions::fmod(x, y); } + + /// Remainder of division. + /// \param x first operand + /// \param y second operand + /// \return remainder of floating point division. +// template typename enable::type remainder(T x, U y) { return functions::remainder(x, y); } + inline expr remainder(half x, half y) { return functions::remainder(x, y); } + inline expr remainder(half x, expr y) { return functions::remainder(x, y); } + inline expr remainder(expr x, half y) { return functions::remainder(x, y); } + inline expr remainder(expr x, expr y) { return functions::remainder(x, y); } + + /// Remainder of division. 
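Because the stream operators above funnel everything through float, a half slots into iostreams like any builtin numeric type, as sketched below (again assuming the usual half.hpp / half_float names):

    #include <iostream>
    #include <sstream>
    #include "half.hpp"   // hypothetical include name

    int main()
    {
        half_float::half h;
        std::istringstream("0.25") >> h;   // parsed as float, rounded to half
        std::cout << h * h << '\n';        // 0.0625, computed in single precision via expr
    }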
+ /// \param x first operand + /// \param y second operand + /// \param quo address to store some bits of quotient at + /// \return remainder of floating point division. +// template typename enable::type remquo(T x, U y, int *quo) { return functions::remquo(x, y, quo); } + inline expr remquo(half x, half y, int *quo) { return functions::remquo(x, y, quo); } + inline expr remquo(half x, expr y, int *quo) { return functions::remquo(x, y, quo); } + inline expr remquo(expr x, half y, int *quo) { return functions::remquo(x, y, quo); } + inline expr remquo(expr x, expr y, int *quo) { return functions::remquo(x, y, quo); } + + /// Fused multiply add. + /// \param x first operand + /// \param y second operand + /// \param z third operand + /// \return ( \a x * \a y ) + \a z rounded as one operation. +// template typename enable::type fma(T x, U y, V z) { return functions::fma(x, y, z); } + inline expr fma(half x, half y, half z) { return functions::fma(x, y, z); } + inline expr fma(half x, half y, expr z) { return functions::fma(x, y, z); } + inline expr fma(half x, expr y, half z) { return functions::fma(x, y, z); } + inline expr fma(half x, expr y, expr z) { return functions::fma(x, y, z); } + inline expr fma(expr x, half y, half z) { return functions::fma(x, y, z); } + inline expr fma(expr x, half y, expr z) { return functions::fma(x, y, z); } + inline expr fma(expr x, expr y, half z) { return functions::fma(x, y, z); } + inline expr fma(expr x, expr y, expr z) { return functions::fma(x, y, z); } + + /// Maximum of half expressions. + /// \param x first operand + /// \param y second operand + /// \return maximum of operands +// template typename result::type fmax(T x, U y) { return binary_specialized::fmax(x, y); } + inline half fmax(half x, half y) { return binary_specialized::fmax(x, y); } + inline expr fmax(half x, expr y) { return binary_specialized::fmax(x, y); } + inline expr fmax(expr x, half y) { return binary_specialized::fmax(x, y); } + inline expr fmax(expr x, expr y) { return binary_specialized::fmax(x, y); } + + /// Minimum of half expressions. + /// \param x first operand + /// \param y second operand + /// \return minimum of operands +// template typename result::type fmin(T x, U y) { return binary_specialized::fmin(x, y); } + inline half fmin(half x, half y) { return binary_specialized::fmin(x, y); } + inline expr fmin(half x, expr y) { return binary_specialized::fmin(x, y); } + inline expr fmin(expr x, half y) { return binary_specialized::fmin(x, y); } + inline expr fmin(expr x, expr y) { return binary_specialized::fmin(x, y); } + + /// Positive difference. + /// \param x first operand + /// \param y second operand + /// \return \a x - \a y or 0 if difference negative +// template typename enable::type fdim(T x, U y) { return functions::fdim(x, y); } + inline expr fdim(half x, half y) { return functions::fdim(x, y); } + inline expr fdim(half x, expr y) { return functions::fdim(x, y); } + inline expr fdim(expr x, half y) { return functions::fdim(x, y); } + inline expr fdim(expr x, expr y) { return functions::fdim(x, y); } + + /// Get NaN value. + /// \return quiet NaN + inline half nanh(const char*) { return functions::nanh(); } + + /// \} + /// \name Exponential functions + /// \{ + + /// Exponential function. 
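Note the FP_FAST_FMAF guard on the fma wrappers above: without it, the fallback computes x*y+z with two roundings, which is not a fused operation. The classic way to see the difference, in double with the C++11 std::fma:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        double a = 1.0 + std::ldexp(1.0, -27);     // a*a is not exactly representable
        double hi = a * a;                         // rounded product
        std::printf("%g\n", a * a - hi);           // 0: the separate multiply already rounded
        std::printf("%g\n", std::fma(a, a, -hi));  // ~5.55e-17: fma recovers the exact residual
    }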
+ /// \param arg function argument
+ /// \return e raised to \a arg
+// template typename enable::type exp(T arg) { return functions::exp(arg); }
+ inline expr exp(half arg) { return functions::exp(arg); }
+ inline expr exp(expr arg) { return functions::exp(arg); }
+
+ /// Exponential minus one.
+ /// \param arg function argument
+ /// \return e raised to \a arg subtracted by 1
+// template typename enable::type expm1(T arg) { return functions::expm1(arg); }
+ inline expr expm1(half arg) { return functions::expm1(arg); }
+ inline expr expm1(expr arg) { return functions::expm1(arg); }
+
+ /// Binary exponential.
+ /// \param arg function argument
+ /// \return 2 raised to \a arg
+// template typename enable::type exp2(T arg) { return functions::exp2(arg); }
+ inline expr exp2(half arg) { return functions::exp2(arg); }
+ inline expr exp2(expr arg) { return functions::exp2(arg); }
+
+ /// Natural logarithm.
+ /// \param arg function argument
+ /// \return logarithm of \a arg to base e
+// template typename enable::type log(T arg) { return functions::log(arg); }
+ inline expr log(half arg) { return functions::log(arg); }
+ inline expr log(expr arg) { return functions::log(arg); }
+
+ /// Common logarithm.
+ /// \param arg function argument
+ /// \return logarithm of \a arg to base 10
+// template typename enable::type log10(T arg) { return functions::log10(arg); }
+ inline expr log10(half arg) { return functions::log10(arg); }
+ inline expr log10(expr arg) { return functions::log10(arg); }
+
+ /// Natural logarithm.
+ /// \param arg function argument
+ /// \return logarithm of \a arg plus 1 to base e
+// template typename enable::type log1p(T arg) { return functions::log1p(arg); }
+ inline expr log1p(half arg) { return functions::log1p(arg); }
+ inline expr log1p(expr arg) { return functions::log1p(arg); }
+
+ /// Binary logarithm.
+ /// \param arg function argument
+ /// \return logarithm of \a arg to base 2
+// template typename enable::type log2(T arg) { return functions::log2(arg); }
+ inline expr log2(half arg) { return functions::log2(arg); }
+ inline expr log2(expr arg) { return functions::log2(arg); }
+
+ /// \}
+ /// \name Power functions
+ /// \{
+
+ /// Square root.
+ /// \param arg function argument
+ /// \return square root of \a arg
+// template typename enable::type sqrt(T arg) { return functions::sqrt(arg); }
+ inline expr sqrt(half arg) { return functions::sqrt(arg); }
+ inline expr sqrt(expr arg) { return functions::sqrt(arg); }
+
+ /// Cubic root.
+ /// \param arg function argument
+ /// \return cubic root of \a arg
+// template typename enable::type cbrt(T arg) { return functions::cbrt(arg); }
+ inline expr cbrt(half arg) { return functions::cbrt(arg); }
+ inline expr cbrt(expr arg) { return functions::cbrt(arg); }
+
+ /// Hypotenuse function.
+ /// \param x first argument
+ /// \param y second argument
+ /// \return square root of sum of squares without internal over- or underflows
+// template typename enable::type hypot(T x, U y) { return functions::hypot(x, y); }
+ inline expr hypot(half x, half y) { return functions::hypot(x, y); }
+ inline expr hypot(half x, expr y) { return functions::hypot(x, y); }
+ inline expr hypot(expr x, half y) { return functions::hypot(x, y); }
+ inline expr hypot(expr x, expr y) { return functions::hypot(x, y); }
+
+ /// Power function.
+ /// \param base first argument + /// \param exp second argument + /// \return \a base raised to \a exp +// template typename enable::type pow(T base, U exp) { return functions::pow(base, exp); } + inline expr pow(half base, half exp) { return functions::pow(base, exp); } + inline expr pow(half base, expr exp) { return functions::pow(base, exp); } + inline expr pow(expr base, half exp) { return functions::pow(base, exp); } + inline expr pow(expr base, expr exp) { return functions::pow(base, exp); } + + /// \} + /// \name Trigonometric functions + /// \{ + + /// Sine function. + /// \param arg function argument + /// \return sine value of \a arg +// template typename enable::type sin(T arg) { return functions::sin(arg); } + inline expr sin(half arg) { return functions::sin(arg); } + inline expr sin(expr arg) { return functions::sin(arg); } + + /// Cosine function. + /// \param arg function argument + /// \return cosine value of \a arg +// template typename enable::type cos(T arg) { return functions::cos(arg); } + inline expr cos(half arg) { return functions::cos(arg); } + inline expr cos(expr arg) { return functions::cos(arg); } + + /// Tangent function. + /// \param arg function argument + /// \return tangent value of \a arg +// template typename enable::type tan(T arg) { return functions::tan(arg); } + inline expr tan(half arg) { return functions::tan(arg); } + inline expr tan(expr arg) { return functions::tan(arg); } + + /// Arc sine. + /// \param arg function argument + /// \return arc sine value of \a arg +// template typename enable::type asin(T arg) { return functions::asin(arg); } + inline expr asin(half arg) { return functions::asin(arg); } + inline expr asin(expr arg) { return functions::asin(arg); } + + /// Arc cosine function. + /// \param arg function argument + /// \return arc cosine value of \a arg +// template typename enable::type acos(T arg) { return functions::acos(arg); } + inline expr acos(half arg) { return functions::acos(arg); } + inline expr acos(expr arg) { return functions::acos(arg); } + + /// Arc tangent function. + /// \param arg function argument + /// \return arc tangent value of \a arg +// template typename enable::type atan(T arg) { return functions::atan(arg); } + inline expr atan(half arg) { return functions::atan(arg); } + inline expr atan(expr arg) { return functions::atan(arg); } + + /// Arc tangent function. + /// \param x first argument + /// \param y second argument + /// \return arc tangent value +// template typename enable::type atan2(T x, U y) { return functions::atan2(x, y); } + inline expr atan2(half x, half y) { return functions::atan2(x, y); } + inline expr atan2(half x, expr y) { return functions::atan2(x, y); } + inline expr atan2(expr x, half y) { return functions::atan2(x, y); } + inline expr atan2(expr x, expr y) { return functions::atan2(x, y); } + + /// \} + /// \name Hyperbolic functions + /// \{ + + /// Hyperbolic sine. + /// \param arg function argument + /// \return hyperbolic sine value of \a arg +// template typename enable::type sinh(T arg) { return functions::sinh(arg); } + inline expr sinh(half arg) { return functions::sinh(arg); } + inline expr sinh(expr arg) { return functions::sinh(arg); } + + /// Hyperbolic cosine. 
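
The exponential, power and trigonometric overloads above all return the lazy `expr` type, so a chained computation stays in single precision and is only rounded to half once, on assignment. A small sketch (illustrative, not from the header):

    #include <iostream>
    #include "half.hpp"

    using half_float::half;

    int main() {
        half a(0.5f);
        // sin/cos/pow return expr; the sum below is rounded to half only once
        half one = pow(sin(a), half(2.0f)) + pow(cos(a), half(2.0f));
        std::cout << one << " " << hypot(sin(a), cos(a)) << "\n";
        return 0;
    }
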
+ /// \param arg function argument
+ /// \return hyperbolic cosine value of \a arg
+// template typename enable::type cosh(T arg) { return functions::cosh(arg); }
+ inline expr cosh(half arg) { return functions::cosh(arg); }
+ inline expr cosh(expr arg) { return functions::cosh(arg); }
+
+ /// Hyperbolic tangent.
+ /// \param arg function argument
+ /// \return hyperbolic tangent value of \a arg
+// template typename enable::type tanh(T arg) { return functions::tanh(arg); }
+ inline expr tanh(half arg) { return functions::tanh(arg); }
+ inline expr tanh(expr arg) { return functions::tanh(arg); }
+
+ /// Hyperbolic area sine.
+ /// \param arg function argument
+ /// \return area sine value of \a arg
+// template typename enable::type asinh(T arg) { return functions::asinh(arg); }
+ inline expr asinh(half arg) { return functions::asinh(arg); }
+ inline expr asinh(expr arg) { return functions::asinh(arg); }
+
+ /// Hyperbolic area cosine.
+ /// \param arg function argument
+ /// \return area cosine value of \a arg
+// template typename enable::type acosh(T arg) { return functions::acosh(arg); }
+ inline expr acosh(half arg) { return functions::acosh(arg); }
+ inline expr acosh(expr arg) { return functions::acosh(arg); }
+
+ /// Hyperbolic area tangent.
+ /// \param arg function argument
+ /// \return area tangent value of \a arg
+// template typename enable::type atanh(T arg) { return functions::atanh(arg); }
+ inline expr atanh(half arg) { return functions::atanh(arg); }
+ inline expr atanh(expr arg) { return functions::atanh(arg); }
+
+ /// \}
+ /// \name Error and gamma functions
+ /// \{
+
+ /// Error function.
+ /// \param arg function argument
+ /// \return error function value of \a arg
+// template typename enable::type erf(T arg) { return functions::erf(arg); }
+ inline expr erf(half arg) { return functions::erf(arg); }
+ inline expr erf(expr arg) { return functions::erf(arg); }
+
+ /// Complementary error function.
+ /// \param arg function argument
+ /// \return 1 minus error function value of \a arg
+// template typename enable::type erfc(T arg) { return functions::erfc(arg); }
+ inline expr erfc(half arg) { return functions::erfc(arg); }
+ inline expr erfc(expr arg) { return functions::erfc(arg); }
+
+ /// Natural logarithm of gamma function.
+ /// \param arg function argument
+ /// \return natural logarithm of gamma function for \a arg
+// template typename enable::type lgamma(T arg) { return functions::lgamma(arg); }
+ inline expr lgamma(half arg) { return functions::lgamma(arg); }
+ inline expr lgamma(expr arg) { return functions::lgamma(arg); }
+
+ /// Gamma function.
+ /// \param arg function argument
+ /// \return gamma function value of \a arg
+// template typename enable::type tgamma(T arg) { return functions::tgamma(arg); }
+ inline expr tgamma(half arg) { return functions::tgamma(arg); }
+ inline expr tgamma(expr arg) { return functions::tgamma(arg); }
+
+ /// \}
+ /// \name Rounding
+ /// \{
+
+ /// Nearest integer not less than half value.
+ /// \param arg half to round
+ /// \return nearest integer not less than \a arg
+// template typename enable::type ceil(T arg) { return functions::ceil(arg); }
+ inline half ceil(half arg) { return functions::ceil(arg); }
+ inline half ceil(expr arg) { return functions::ceil(arg); }
+
+ /// Nearest integer not greater than half value.
+ /// \param arg half to round + /// \return nearest integer not greater than \a arg +// template typename enable::type floor(T arg) { return functions::floor(arg); } + inline half floor(half arg) { return functions::floor(arg); } + inline half floor(expr arg) { return functions::floor(arg); } + + /// Nearest integer not greater in magnitude than half value. + /// \param arg half to round + /// \return nearest integer not greater in magnitude than \a arg +// template typename enable::type trunc(T arg) { return functions::trunc(arg); } + inline half trunc(half arg) { return functions::trunc(arg); } + inline half trunc(expr arg) { return functions::trunc(arg); } + + /// Nearest integer. + /// \param arg half to round + /// \return nearest integer, rounded away from zero in half-way cases +// template typename enable::type round(T arg) { return functions::round(arg); } + inline half round(half arg) { return functions::round(arg); } + inline half round(expr arg) { return functions::round(arg); } + + /// Nearest integer. + /// \param arg half to round + /// \return nearest integer, rounded away from zero in half-way cases +// template typename enable::type lround(T arg) { return functions::lround(arg); } + inline long lround(half arg) { return functions::lround(arg); } + inline long lround(expr arg) { return functions::lround(arg); } + + /// Nearest integer using half's internal rounding mode. + /// \param arg half expression to round + /// \return nearest integer using default rounding mode +// template typename enable::type nearbyint(T arg) { return functions::nearbyint(arg); } + inline half nearbyint(half arg) { return functions::rint(arg); } + inline half nearbyint(expr arg) { return functions::rint(arg); } + + /// Nearest integer using half's internal rounding mode. + /// \param arg half expression to round + /// \return nearest integer using default rounding mode +// template typename enable::type rint(T arg) { return functions::rint(arg); } + inline half rint(half arg) { return functions::rint(arg); } + inline half rint(expr arg) { return functions::rint(arg); } + + /// Nearest integer using half's internal rounding mode. + /// \param arg half expression to round + /// \return nearest integer using default rounding mode +// template typename enable::type lrint(T arg) { return functions::lrint(arg); } + inline long lrint(half arg) { return functions::lrint(arg); } + inline long lrint(expr arg) { return functions::lrint(arg); } + #if HALF_ENABLE_CPP11_LONG_LONG + /// Nearest integer. + /// \param arg half to round + /// \return nearest integer, rounded away from zero in half-way cases +// template typename enable::type llround(T arg) { return functions::llround(arg); } + inline long long llround(half arg) { return functions::llround(arg); } + inline long long llround(expr arg) { return functions::llround(arg); } + + /// Nearest integer using half's internal rounding mode. + /// \param arg half expression to round + /// \return nearest integer using default rounding mode +// template typename enable::type llrint(T arg) { return functions::llrint(arg); } + inline long long llrint(half arg) { return functions::llrint(arg); } + inline long long llrint(expr arg) { return functions::llrint(arg); } + #endif + + /// \} + /// \name Floating point manipulation + /// \{ + + /// Decompress floating point number. 
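
Unlike the transcendental functions, the rounding overloads above return a plain `half` (their results are always exactly representable), and the `l`/`ll` variants return integers directly. A short sketch (illustrative; the `rint` result depends on how the library's rounding mode is configured):

    #include <iostream>
    #include "half.hpp"

    using half_float::half;

    int main() {
        half v(2.5f);
        std::cout << ceil(v) << " " << floor(v) << " " << trunc(v) << "\n"; // 3 2 2
        std::cout << round(v) << " " << lround(v) << "\n";                  // 3 3, ties away from zero
        std::cout << rint(v) << "\n";  // uses half's internal rounding mode
        return 0;
    }
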
+ /// \param arg number to decompress
+ /// \param exp address to store exponent at
+ /// \return significand in range [0.5, 1)
+// template typename enable::type frexp(T arg, int *exp) { return functions::frexp(arg, exp); }
+ inline half frexp(half arg, int *exp) { return functions::frexp(arg, exp); }
+ inline half frexp(expr arg, int *exp) { return functions::frexp(arg, exp); }
+
+ /// Multiply by power of two.
+ /// \param arg number to modify
+ /// \param exp power of two to multiply with
+ /// \return \a arg multiplied by 2 raised to \a exp
+// template typename enable::type ldexp(T arg, int exp) { return functions::scalbln(arg, exp); }
+ inline half ldexp(half arg, int exp) { return functions::scalbln(arg, exp); }
+ inline half ldexp(expr arg, int exp) { return functions::scalbln(arg, exp); }
+
+ /// Extract integer and fractional parts.
+ /// \param arg number to decompress
+ /// \param iptr address to store integer part at
+ /// \return fractional part
+// template typename enable::type modf(T arg, half *iptr) { return functions::modf(arg, iptr); }
+ inline half modf(half arg, half *iptr) { return functions::modf(arg, iptr); }
+ inline half modf(expr arg, half *iptr) { return functions::modf(arg, iptr); }
+
+ /// Multiply by power of two.
+ /// \param arg number to modify
+ /// \param exp power of two to multiply with
+ /// \return \a arg multiplied by 2 raised to \a exp
+// template typename enable::type scalbn(T arg, int exp) { return functions::scalbln(arg, exp); }
+ inline half scalbn(half arg, int exp) { return functions::scalbln(arg, exp); }
+ inline half scalbn(expr arg, int exp) { return functions::scalbln(arg, exp); }
+
+ /// Multiply by power of two.
+ /// \param arg number to modify
+ /// \param exp power of two to multiply with
+ /// \return \a arg multiplied by 2 raised to \a exp
+// template typename enable::type scalbln(T arg, long exp) { return functions::scalbln(arg, exp); }
+ inline half scalbln(half arg, long exp) { return functions::scalbln(arg, exp); }
+ inline half scalbln(expr arg, long exp) { return functions::scalbln(arg, exp); }
+
+ /// Extract exponent.
+ /// \param arg number to query
+ /// \return floating point exponent
+ /// \retval FP_ILOGB0 for zero
+ /// \retval FP_ILOGBNAN for NaN
+ /// \retval INT_MAX for infinity
+// template typename enable::type ilogb(T arg) { return functions::ilogb(arg); }
+ inline int ilogb(half arg) { return functions::ilogb(arg); }
+ inline int ilogb(expr arg) { return functions::ilogb(arg); }
+
+ /// Extract exponent.
+ /// \param arg number to query
+ /// \return floating point exponent
+// template typename enable::type logb(T arg) { return functions::logb(arg); }
+ inline half logb(half arg) { return functions::logb(arg); }
+ inline half logb(expr arg) { return functions::logb(arg); }
+
+ /// Next representable value.
+ /// \param from value to compute next representable value for
+ /// \param to direction towards which to compute next value
+ /// \return next representable value after \a from in direction towards \a to
+// template typename enable::type nextafter(T from, U to) { return functions::nextafter(from, to); }
+ inline half nextafter(half from, half to) { return functions::nextafter(from, to); }
+ inline half nextafter(half from, expr to) { return functions::nextafter(from, to); }
+ inline half nextafter(expr from, half to) { return functions::nextafter(from, to); }
+ inline half nextafter(expr from, expr to) { return functions::nextafter(from, to); }
+
+ /// Next representable value.
+ /// \param from value to compute next representable value for
+ /// \param to direction towards which to compute next value
+ /// \return next representable value after \a from in direction towards \a to
+// template typename enable::type nexttoward(T from, long double to) { return functions::nexttoward(from, to); }
+ inline half nexttoward(half from, long double to) { return functions::nexttoward(from, to); }
+ inline half nexttoward(expr from, long double to) { return functions::nexttoward(from, to); }
+
+ /// Take sign.
+ /// \param x value to change sign for
+ /// \param y value to take sign from
+ /// \return value equal to \a x in magnitude and to \a y in sign
+// template typename enable::type copysign(T x, U y) { return functions::copysign(x, y); }
+ inline half copysign(half x, half y) { return functions::copysign(x, y); }
+ inline half copysign(half x, expr y) { return functions::copysign(x, y); }
+ inline half copysign(expr x, half y) { return functions::copysign(x, y); }
+ inline half copysign(expr x, expr y) { return functions::copysign(x, y); }
+
+ /// \}
+ /// \name Floating point classification
+ /// \{
+
+
+ /// Classify floating point value.
+ /// \param arg number to classify
+ /// \retval FP_ZERO for positive and negative zero
+ /// \retval FP_SUBNORMAL for subnormal numbers
+ /// \retval FP_INFINITE for positive and negative infinity
+ /// \retval FP_NAN for NaNs
+ /// \retval FP_NORMAL for all other (normal) values
+// template typename enable::type fpclassify(T arg) { return functions::fpclassify(arg); }
+ inline int fpclassify(half arg) { return functions::fpclassify(arg); }
+ inline int fpclassify(expr arg) { return functions::fpclassify(arg); }
+
+ /// Check if finite number.
+ /// \param arg number to check
+ /// \retval true if neither infinity nor NaN
+ /// \retval false else
+// template typename enable::type isfinite(T arg) { return functions::isfinite(arg); }
+ inline bool isfinite(half arg) { return functions::isfinite(arg); }
+ inline bool isfinite(expr arg) { return functions::isfinite(arg); }
+
+ /// Check for infinity.
+ /// \param arg number to check
+ /// \retval true for positive or negative infinity
+ /// \retval false else
+// template typename enable::type isinf(T arg) { return functions::isinf(arg); }
+ inline bool isinf(half arg) { return functions::isinf(arg); }
+ inline bool isinf(expr arg) { return functions::isinf(arg); }
+
+ /// Check for NaN.
+ /// \param arg number to check
+ /// \retval true for NaNs
+ /// \retval false else
+// template typename enable::type isnan(T arg) { return functions::isnan(arg); }
+ inline bool isnan(half arg) { return functions::isnan(arg); }
+ inline bool isnan(expr arg) { return functions::isnan(arg); }
+
+ /// Check if normal number.
+ /// \param arg number to check
+ /// \retval true if normal number
+ /// \retval false if either subnormal, zero, infinity or NaN
+// template typename enable::type isnormal(T arg) { return functions::isnormal(arg); }
+ inline bool isnormal(half arg) { return functions::isnormal(arg); }
+ inline bool isnormal(expr arg) { return functions::isnormal(arg); }
+
+ /// Check sign.
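
A sketch of the manipulation and classification functions above (illustrative; `frexp`/`ldexp` round-trip a value through its significand/exponent decomposition):

    #include <cmath>
    #include <iostream>
    #include "half.hpp"

    using half_float::half;

    int main() {
        half v(12.0f);
        int e = 0;
        half sig = frexp(v, &e);                  // decomposes 12 as 0.75 * 2^4
        half w = ldexp(sig, e);                   // reassembles the original value
        std::cout << sig << " 2^" << e << "\n";
        std::cout << std::boolalpha << isfinite(w) << " "
                  << (fpclassify(w) == FP_NORMAL) << "\n";  // true true
        return 0;
    }
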
+ /// \param arg number to check
+ /// \retval true for negative number
+ /// \retval false for positive number
+// template typename enable::type signbit(T arg) { return functions::signbit(arg); }
+ inline bool signbit(half arg) { return functions::signbit(arg); }
+ inline bool signbit(expr arg) { return functions::signbit(arg); }
+
+ /// \}
+ /// \name Comparison
+ /// \{
+
+ /// Comparison for greater than.
+ /// \param x first operand
+ /// \param y second operand
+ /// \retval true if \a x greater than \a y
+ /// \retval false else
+// template typename enable::type isgreater(T x, U y) { return functions::isgreater(x, y); }
+ inline bool isgreater(half x, half y) { return functions::isgreater(x, y); }
+ inline bool isgreater(half x, expr y) { return functions::isgreater(x, y); }
+ inline bool isgreater(expr x, half y) { return functions::isgreater(x, y); }
+ inline bool isgreater(expr x, expr y) { return functions::isgreater(x, y); }
+
+ /// Comparison for greater equal.
+ /// \param x first operand
+ /// \param y second operand
+ /// \retval true if \a x greater equal \a y
+ /// \retval false else
+// template typename enable::type isgreaterequal(T x, U y) { return functions::isgreaterequal(x, y); }
+ inline bool isgreaterequal(half x, half y) { return functions::isgreaterequal(x, y); }
+ inline bool isgreaterequal(half x, expr y) { return functions::isgreaterequal(x, y); }
+ inline bool isgreaterequal(expr x, half y) { return functions::isgreaterequal(x, y); }
+ inline bool isgreaterequal(expr x, expr y) { return functions::isgreaterequal(x, y); }
+
+ /// Comparison for less than.
+ /// \param x first operand
+ /// \param y second operand
+ /// \retval true if \a x less than \a y
+ /// \retval false else
+// template typename enable::type isless(T x, U y) { return functions::isless(x, y); }
+ inline bool isless(half x, half y) { return functions::isless(x, y); }
+ inline bool isless(half x, expr y) { return functions::isless(x, y); }
+ inline bool isless(expr x, half y) { return functions::isless(x, y); }
+ inline bool isless(expr x, expr y) { return functions::isless(x, y); }
+
+ /// Comparison for less equal.
+ /// \param x first operand
+ /// \param y second operand
+ /// \retval true if \a x less equal \a y
+ /// \retval false else
+// template typename enable::type islessequal(T x, U y) { return functions::islessequal(x, y); }
+ inline bool islessequal(half x, half y) { return functions::islessequal(x, y); }
+ inline bool islessequal(half x, expr y) { return functions::islessequal(x, y); }
+ inline bool islessequal(expr x, half y) { return functions::islessequal(x, y); }
+ inline bool islessequal(expr x, expr y) { return functions::islessequal(x, y); }
+
+ /// Comparison for less or greater.
+ /// \param x first operand
+ /// \param y second operand
+ /// \retval true if either less or greater
+ /// \retval false else
+// template typename enable::type islessgreater(T x, U y) { return functions::islessgreater(x, y); }
+ inline bool islessgreater(half x, half y) { return functions::islessgreater(x, y); }
+ inline bool islessgreater(half x, expr y) { return functions::islessgreater(x, y); }
+ inline bool islessgreater(expr x, half y) { return functions::islessgreater(x, y); }
+ inline bool islessgreater(expr x, expr y) { return functions::islessgreater(x, y); }
+
+ /// Check if unordered.
+ /// \param x first operand + /// \param y second operand + /// \retval true if unordered (one or two NaN operands) + /// \retval false else +// template typename enable::type isunordered(T x, U y) { return functions::isunordered(x, y); } + inline bool isunordered(half x, half y) { return functions::isunordered(x, y); } + inline bool isunordered(half x, expr y) { return functions::isunordered(x, y); } + inline bool isunordered(expr x, half y) { return functions::isunordered(x, y); } + inline bool isunordered(expr x, expr y) { return functions::isunordered(x, y); } + + /// \name Casting + /// \{ + + /// Cast to or from half-precision floating point number. + /// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values are converted + /// directly using the given rounding mode, without any roundtrip over `float` that a `static_cast` would otherwise do. + /// It uses the default rounding mode. + /// + /// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types + /// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler + /// error and casting between [half](\ref half_float::half)s is just a no-op. + /// \tparam T destination type (half or built-in arithmetic type) + /// \tparam U source type (half or built-in arithmetic type) + /// \param arg value to cast + /// \return \a arg converted to destination type + template T half_cast(U arg) { return half_caster::cast(arg); } + + /// Cast to or from half-precision floating point number. + /// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values are converted + /// directly using the given rounding mode, without any roundtrip over `float` that a `static_cast` would otherwise do. + /// + /// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types + /// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler + /// error and casting between [half](\ref half_float::half)s is just a no-op. + /// \tparam T destination type (half or built-in arithmetic type) + /// \tparam R rounding mode to use. 
+ /// \tparam U source type (half or built-in arithmetic type) + /// \param arg value to cast + /// \return \a arg converted to destination type + template T half_cast(U arg) { return half_caster::cast(arg); } + /// \} + } + + using detail::operator==; + using detail::operator!=; + using detail::operator<; + using detail::operator>; + using detail::operator<=; + using detail::operator>=; + using detail::operator+; + using detail::operator-; + using detail::operator*; + using detail::operator/; + using detail::operator<<; + using detail::operator>>; + + using detail::abs; + using detail::fabs; + using detail::fmod; + using detail::remainder; + using detail::remquo; + using detail::fma; + using detail::fmax; + using detail::fmin; + using detail::fdim; + using detail::nanh; + using detail::exp; + using detail::expm1; + using detail::exp2; + using detail::log; + using detail::log10; + using detail::log1p; + using detail::log2; + using detail::sqrt; + using detail::cbrt; + using detail::hypot; + using detail::pow; + using detail::sin; + using detail::cos; + using detail::tan; + using detail::asin; + using detail::acos; + using detail::atan; + using detail::atan2; + using detail::sinh; + using detail::cosh; + using detail::tanh; + using detail::asinh; + using detail::acosh; + using detail::atanh; + using detail::erf; + using detail::erfc; + using detail::lgamma; + using detail::tgamma; + using detail::ceil; + using detail::floor; + using detail::trunc; + using detail::round; + using detail::lround; + using detail::nearbyint; + using detail::rint; + using detail::lrint; +#if HALF_ENABLE_CPP11_LONG_LONG + using detail::llround; + using detail::llrint; +#endif + using detail::frexp; + using detail::ldexp; + using detail::modf; + using detail::scalbn; + using detail::scalbln; + using detail::ilogb; + using detail::logb; + using detail::nextafter; + using detail::nexttoward; + using detail::copysign; + using detail::fpclassify; + using detail::isfinite; + using detail::isinf; + using detail::isnan; + using detail::isnormal; + using detail::signbit; + using detail::isgreater; + using detail::isgreaterequal; + using detail::isless; + using detail::islessequal; + using detail::islessgreater; + using detail::isunordered; + + using detail::half_cast; +} + + +/// Extensions to the C++ standard library. +namespace std +{ + /// Numeric limits for half-precision floats. + /// Because of the underlying single-precision implementation of many operations, it inherits some properties from + /// `std::numeric_limits`. + template<> class numeric_limits : public numeric_limits + { + public: + /// Supports signed values. + static HALF_CONSTEXPR_CONST bool is_signed = true; + + /// Is not exact. + static HALF_CONSTEXPR_CONST bool is_exact = false; + + /// Doesn't provide modulo arithmetic. + static HALF_CONSTEXPR_CONST bool is_modulo = false; + + /// IEEE conformant. + static HALF_CONSTEXPR_CONST bool is_iec559 = true; + + /// Supports infinity. + static HALF_CONSTEXPR_CONST bool has_infinity = true; + + /// Supports quiet NaNs. + static HALF_CONSTEXPR_CONST bool has_quiet_NaN = true; + + /// Supports subnormal values. + static HALF_CONSTEXPR_CONST float_denorm_style has_denorm = denorm_present; + + /// Rounding mode. 
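
A usage sketch for `half_cast` above (illustrative): the destination type, and optionally the rounding mode, are given as template arguments, and no intermediate `float` round-trip is performed:

    #include <iostream>
    #include <limits>
    #include "half.hpp"

    using half_float::half;
    using half_float::half_cast;

    int main() {
        half a = half_cast<half>(3.14159);                          // default rounding
        half b = half_cast<half, std::round_toward_zero>(3.14159);  // explicit rounding mode R
        int  i = half_cast<int>(a);                                 // half to built-in type
        std::cout << a << " " << b << " " << i << "\n";
        return 0;
    }
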
+ /// Due to the mix of internal single-precision computations (using the rounding mode of the underlying + /// single-precision implementation) with the rounding mode of the single-to-half conversions, the actual rounding + /// mode might be `std::round_indeterminate` if the default half-precision rounding mode doesn't match the + /// single-precision rounding mode. + static HALF_CONSTEXPR_CONST float_round_style round_style = (std::numeric_limits::round_style== + half_float::half::round_style) ? half_float::half::round_style : round_indeterminate; + + /// Significant digits. + static HALF_CONSTEXPR_CONST int digits = 11; + + /// Significant decimal digits. + static HALF_CONSTEXPR_CONST int digits10 = 3; + + /// Required decimal digits to represent all possible values. + static HALF_CONSTEXPR_CONST int max_digits10 = 5; + + /// Number base. + static HALF_CONSTEXPR_CONST int radix = 2; + + /// One more than smallest exponent. + static HALF_CONSTEXPR_CONST int min_exponent = -13; + + /// Smallest normalized representable power of 10. + static HALF_CONSTEXPR_CONST int min_exponent10 = -4; + + /// One more than largest exponent + static HALF_CONSTEXPR_CONST int max_exponent = 16; + + /// Largest finitely representable power of 10. + static HALF_CONSTEXPR_CONST int max_exponent10 = 4; + + /// Smallest positive normal value. + static HALF_CONSTEXPR half_float::half min() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x0400); } + + /// Smallest finite value. + static HALF_CONSTEXPR half_float::half lowest() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0xFBFF); } + + /// Largest finite value. + static HALF_CONSTEXPR half_float::half max() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7BFF); } + + /// Difference between one and next representable value. + static HALF_CONSTEXPR half_float::half epsilon() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x1400); } + + /// Maximum rounding error. + static HALF_CONSTEXPR half_float::half round_error() HALF_NOTHROW + { return half_float::half(half_float::detail::binary, (round_style==std::round_to_nearest) ? 0x3800 : 0x3C00); } + + /// Positive infinity. + static HALF_CONSTEXPR half_float::half infinity() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7C00); } + + /// Quiet NaN. + static HALF_CONSTEXPR half_float::half quiet_NaN() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7FFF); } + + /// Signalling NaN. + static HALF_CONSTEXPR half_float::half signaling_NaN() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x7DFF); } + + /// Smallest positive subnormal value. + static HALF_CONSTEXPR half_float::half denorm_min() HALF_NOTHROW { return half_float::half(half_float::detail::binary, 0x0001); } + }; + +#if HALF_ENABLE_CPP11_HASH + /// Hash function for half-precision floats. + /// This is only defined if C++11 `std::hash` is supported and enabled. + template<> struct hash //: unary_function + { + /// Type of function argument. + typedef half_float::half argument_type; + + /// Function return type. + typedef size_t result_type; + + /// Compute hash function. 
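
The bit patterns in the specialization above decode to the usual IEEE 754 binary16 constants; a quick check (illustrative):

    #include <iostream>
    #include <limits>
    #include "half.hpp"

    using half_float::half;

    int main() {
        typedef std::numeric_limits<half> lim;
        std::cout << lim::min() << "\n";         // 2^-14, about 6.1035e-05 (0x0400)
        std::cout << lim::max() << "\n";         // 65504                   (0x7BFF)
        std::cout << lim::epsilon() << "\n";     // 2^-10, about 0.000977   (0x1400)
        std::cout << lim::denorm_min() << "\n";  // 2^-24, about 5.96e-08   (0x0001)
        return 0;
    }
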
+ /// \param arg half to hash + /// \return hash value + result_type operator()(argument_type arg) const + { return hash()(static_cast(arg.data_)&-(arg.data_!=0x8000)); } + }; +#endif +} + + +#undef HALF_CONSTEXPR +#undef HALF_CONSTEXPR_CONST +#undef HALF_NOEXCEPT +#undef HALF_NOTHROW +#ifdef HALF_POP_WARNINGS + #pragma warning(pop) + #undef HALF_POP_WARNINGS +#endif + +#endif diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index 397be9d9d..a08e89a85 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -2,8 +2,9 @@ #define TDL_INCLUDE_IR_INSTRUCTIONS_H #include -#include "value.h" +#include "triton/ir/value.h" #include "triton/ir/type.h" +#include "triton/ir/metadata.h" #include "llvm/IR/Instructions.h" namespace triton{ @@ -48,12 +49,16 @@ public: // results unsigned get_num_results() const { return results_.size(); } value* get_result(unsigned i) { return results_.at(i); } - + // metadata + void set_metadata(ir::metadata::kind_t kind, + unsigned value) { metadatas_[kind] = value;} + unsigned get_metadata(ir::metadata::kind_t kind) { return metadatas_[kind];} private: basic_block *parent_; value *pred_; value *mask_pred_; std::vector results_; + std::map metadatas_; }; // result reference diff --git a/include/triton/ir/metadata.h b/include/triton/ir/metadata.h new file mode 100644 index 000000000..618e84cb2 --- /dev/null +++ b/include/triton/ir/metadata.h @@ -0,0 +1,29 @@ +#ifndef TDL_INCLUDE_IR_METADATA_H +#define TDL_INCLUDE_IR_METADATA_H + +namespace triton{ +namespace ir{ + + +/* Metadata */ +class metadata{ +public: + enum kind_t{ + multiple_of + }; + +private: + metadata(kind_t kind, unsigned value); + +public: + static metadata* get(kind_t kind, unsigned value); + +private: + kind_t kind_; + unsigned value_; +}; + +} +} + +#endif diff --git a/include/triton/ir/module.h b/include/triton/ir/module.h index eddc49454..238968e7b 100644 --- a/include/triton/ir/module.h +++ b/include/triton/ir/module.h @@ -7,6 +7,7 @@ #include #include #include "builder.h" +#include "metadata.h" namespace triton{ @@ -38,6 +39,7 @@ struct scope { class module { typedef std::pair val_key_t; friend class function; + typedef std::pair md_pair_t; public: typedef std::map symbols_map_t; @@ -84,6 +86,8 @@ public: // Register global void register_global(const std::string& name, ir::value *x) { globals_[name] = x; } const std::map& globals() const { return globals_; } + // Metadata + void add_metadata(const std::string &name, md_pair_t x) { metadatas_[name] = x; } private: std::string name_; @@ -101,6 +105,7 @@ private: std::stack scopes_; std::vector allocs_; std::map globals_; + std::map metadatas_; }; } diff --git a/include/triton/lang/declaration.h b/include/triton/lang/declaration.h index b5f4de412..7441e8449 100644 --- a/include/triton/lang/declaration.h +++ b/include/triton/lang/declaration.h @@ -45,7 +45,9 @@ public: virtual bool is_cst_space() const { return false; } virtual bool is_tunable() const { return false; } virtual bool is_cst() const { return false; } + virtual bool is_multiple_of() const { return false; } virtual void add_attr(ir::function* fn, size_t pos) = 0; + virtual void add_metadata(ir::module* mod, std::string name) = 0; }; class storage_specifier: public modifier { @@ -56,6 +58,7 @@ public: bool is_tunable() const { return value_ == TUNABLE_T; } bool is_cst() const { return value_ == CONST_T; } void add_attr(ir::function* fn, size_t pos); + void add_metadata(ir::module* mod, std::string name); private: const 
STORAGE_SPEC_T value_; @@ -65,6 +68,7 @@ class alignment_specifier: public modifier { public: alignment_specifier(node* value): cst_((constant*)value) { } void add_attr(ir::function* fn, size_t pos); + void add_metadata(ir::module* mod, std::string name); private: constant* cst_; @@ -74,6 +78,8 @@ class multiple_of_specifier: public modifier { public: multiple_of_specifier(node* value): cst_((constant*)value) {} void add_attr(ir::function* fn, size_t pos); + void add_metadata(ir::module* mod, std::string name); + bool is_multiple_of() const { return true; } private: constant* cst_; diff --git a/lib/codegen/alignment_info.cpp b/lib/codegen/alignment_info.cpp index ec3204587..5a7dc5fcd 100644 --- a/lib/codegen/alignment_info.cpp +++ b/lib/codegen/alignment_info.cpp @@ -39,6 +39,11 @@ bool alignment_info::populate_is_constant(ir::value *v) { bool rhs = populate_is_constant(x->get_operand(1)); return cache(lhs && rhs); } + if(auto *x = dynamic_cast(v)){ + bool value_true = populate_is_constant(x->get_value_true()); + bool value_false = populate_is_constant(x->get_value_false()); + return cache(value_true && value_false); + } if(v->get_type()->is_tile_ty()) return cache(false); if(auto *x = dynamic_cast(v)){ @@ -97,6 +102,11 @@ unsigned alignment_info::populate_max_contiguous(ir::value *v){ return cache(lhs_max_contiguous); } } + if(auto *x = dynamic_cast(v)){ + int value_true = populate_max_contiguous(x->get_value_true()); + int value_false = populate_max_contiguous(x->get_value_false()); + return cache(std::min(value_true, value_false)); + } if(auto *x = dynamic_cast(v)){ ir::value* lhs = x->get_operand(0); ir::value* rhs = x->get_operand(1); @@ -132,6 +142,12 @@ unsigned alignment_info::populate_starting_multiple(ir::value *v){ if(starting_multiple_.find(v) != starting_multiple_.end()) return starting_multiple_.at(v); auto cache = [this,v](unsigned value){ return add_to_cache(v, value, starting_multiple_); }; + // has metadata + if(auto *x = dynamic_cast(v)){ + unsigned multiple_of = x->get_metadata(ir::metadata::multiple_of); + if(multiple_of > 0) + return cache(multiple_of); + } // arguments if(auto *x = dynamic_cast(v)){ std::set attributes = x->get_parent()->get_attributes(x); @@ -174,6 +190,11 @@ unsigned alignment_info::populate_starting_multiple(ir::value *v){ if(auto *x = dynamic_cast(v)){ return cache(v->get_type()->get_tile_shapes()[0]->get_value()); } + if(auto *x = dynamic_cast(v)){ + int value_true = populate_starting_multiple(x->get_value_true()); + int value_false = populate_starting_multiple(x->get_value_false()); + return cache(std::min(value_true, value_false)); + } if(auto *x = dynamic_cast(v)){ // put a conservative initial value in phi node to avoid infinite recursion unsigned result = 1; diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index 6a71b6c33..102a970df 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -123,7 +123,7 @@ void shift(restrict read_only align(16) )" << a_ty_ << R"( *a, restrict read_only align(16) )" << b_ty_ << R"( *b, fp32 *c, multiple_of(4) int32 M, multiple_of(4) int32 N, multiple_of(4) int32 K, - int32 lda, + multiple_of(4) int32 lda, int32 ABS, int32 AH, int32 AW, int32 AR, int32 AS) { int32 rxa[TM] = get_global_range[TM](0); int32 ryb[TN] = get_global_range[TN](1); @@ -140,7 +140,8 @@ void shift(restrict read_only align(16) )" << a_ty_ << R"( *a, int1 maskw[TM] = (raw >= pad_w) && (raw < (AW - pad_w)); int1 mask[TM, TK] = maskh[:, newaxis] && maskw[:, newaxis]; __constant__ int32* pd[TK] = delta + rka; - int32 d[TK] = *pd; + 
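
The three select handlers added to alignment_info above all combine the two branches with min: the starting multiple (or contiguity) of `cond ? a : b` can only be what both branches guarantee, and for the power-of-two multiples these passes typically see, min agrees with gcd, the exact combine. A self-contained sketch of the rule (the names are stand-ins for the real pass):

    #include <algorithm>
    #include <cassert>

    // starting multiple of a select: only what is guaranteed for both branches
    unsigned select_starting_multiple(unsigned mult_true, unsigned mult_false) {
        return std::min(mult_true, mult_false);
    }

    int main() {
        // an increment that is a multiple of 8 on one branch and of 4 on the
        // other can only be assumed to be a multiple of 4 overall
        assert(select_starting_multiple(8, 4) == 4);
        return 0;
    }
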
multiple_of(4) int32 d[TK]; + d = *pd; int32 offa1[TK] = rka*lda; int32 inc[TM, TK] = mask ? d[newaxis, :] : offa1[newaxis, :]; )" << a_ty_ << R"(* pa[TM, TK] = a + rxa[:, newaxis] + inc; diff --git a/lib/ir/metadata.cpp b/lib/ir/metadata.cpp new file mode 100644 index 000000000..16bc059c5 --- /dev/null +++ b/lib/ir/metadata.cpp @@ -0,0 +1,14 @@ +#include "triton/ir/metadata.h" + +namespace triton{ +namespace ir{ + +metadata::metadata(kind_t kind, unsigned value) + : kind_(kind), value_(value) { } + +metadata* metadata::get(kind_t kind, unsigned value) { + return new metadata(kind, value); +} + +} +} diff --git a/lib/ir/module.cpp b/lib/ir/module.cpp index d8f07ecc4..678c6119d 100644 --- a/lib/ir/module.cpp +++ b/lib/ir/module.cpp @@ -23,6 +23,11 @@ ir::context& module::get_context() { void module::set_value(const std::string& name, ir::basic_block *block, ir::value *value){ values_[val_key_t{name, block}] = value; + auto it = metadatas_.find(name); + if(auto *x = dynamic_cast(value)) + if(it != metadatas_.end()){ + x->set_metadata(it->second.first, it->second.second); + } } void module::set_value(const std::string& name, ir::value *value){ diff --git a/lib/lang/declaration.cpp b/lib/lang/declaration.cpp index b1a455099..64f238171 100644 --- a/lib/lang/declaration.cpp +++ b/lib/lang/declaration.cpp @@ -5,6 +5,7 @@ #include "triton/ir/basic_block.h" #include "triton/ir/builder.h" #include "triton/ir/type.h" +#include "triton/ir/metadata.h" namespace triton{ @@ -133,12 +134,12 @@ void initializer::set_specifier(const declaration_specifier *spec) { } ir::value* initializer::codegen(ir::module * mod) const{ - std::vector storage = spec_->modifiers(); - ir::type *ty = decl_->type(mod, spec_->type(mod), storage); + std::vector modifiers = spec_->modifiers(); + ir::type *ty = decl_->type(mod, spec_->type(mod), modifiers); std::string name = decl_->id()->name(); ir::value *value = ir::undef_value::get(ty); auto is_tunable = [](modifier* x){ return x->is_tunable(); }; - if(std::find_if(storage.begin(), storage.end(), is_tunable) != storage.end()){ + if(std::find_if(modifiers.begin(), modifiers.end(), is_tunable) != modifiers.end()){ auto csts = dynamic_cast*>((node*)expr_); if(csts == nullptr) throw std::runtime_error("must specify constant list for metaparameters"); @@ -154,12 +155,19 @@ ir::value* initializer::codegen(ir::module * mod) const{ implicit_broadcast(mod, ty, value); } value->set_name(name); + // metadata + auto is_multiple_of = [](modifier* x){ return x->is_multiple_of(); }; + auto it = std::find_if(modifiers.begin(), modifiers.end(), is_multiple_of); + if(it != modifiers.end()) + (*it)->add_metadata(mod, name); + // register mod->set_value(name, value); mod->get_scope().types[name] = ty; if(auto *x = dynamic_cast(value)) mod->add_alloc(x); + // constants auto is_cst = [](modifier* x){ return x->is_cst(); }; - if(std::find_if(storage.begin(), storage.end(), is_cst) != storage.end()) + if(std::find_if(modifiers.begin(), modifiers.end(), is_cst) != modifiers.end()) mod->set_const(name); return value; } @@ -183,16 +191,28 @@ void storage_specifier::add_attr(ir::function* fn, size_t pos) { fn->add_attr(pos, ir::attribute(get_ir_attr(value_))); } +void storage_specifier::add_metadata(ir::module*, std::string) { + throw std::runtime_error("storage specifier is not a metadata"); +} + /* Alignment specifier */ void alignment_specifier::add_attr(ir::function* fn, size_t pos) { fn->add_attr(pos, ir::attribute(ir::aligned, cst_->value())); } +void alignment_specifier::add_metadata(ir::module 
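
Taken together, the pieces above wire a `multiple_of(4)` qualifier in the Triton source all the way to the alignment pass: the initializer registers the pair under the variable's name, module::set_value later copies it onto whichever instruction defines that name, and populate_starting_multiple reads it back. A self-contained model of that plumbing (simplified stand-ins for the triton::ir types):

    #include <cassert>
    #include <map>
    #include <string>

    enum kind_t { multiple_of };

    struct instruction {
        std::map<kind_t, unsigned> metadatas_;
        void set_metadata(kind_t k, unsigned v) { metadatas_[k] = v; }
        unsigned get_metadata(kind_t k) { return metadatas_[k]; }
    };

    struct module {
        typedef std::pair<kind_t, unsigned> md_pair_t;
        std::map<std::string, md_pair_t> metadatas_;
        void add_metadata(const std::string &name, md_pair_t x) { metadatas_[name] = x; }
        void set_value(const std::string &name, instruction *i) {
            auto it = metadatas_.find(name);
            if(it != metadatas_.end())
                i->set_metadata(it->second.first, it->second.second);
        }
    };

    int main() {
        module mod;
        mod.add_metadata("lda", {multiple_of, 4});   // from `multiple_of(4) int32 lda`
        instruction i;
        mod.set_value("lda", &i);                    // a definition of "lda" appears
        assert(i.get_metadata(multiple_of) == 4);    // now visible to the alignment pass
        return 0;
    }
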
*mod, std::string name) { + throw std::runtime_error("alignment specifier is not a metadata"); +} + /* Multiple-Of specifier */ void multiple_of_specifier::add_attr(ir::function* fn, size_t pos) { fn->add_attr(pos, ir::attribute(ir::multiple_of, cst_->value())); } +void multiple_of_specifier::add_metadata(ir::module *mod, std::string name) { + mod->add_metadata(name, {ir::metadata::multiple_of, cst_->value()}); +} + /* Function definition */ ir::value* function_definition::codegen(ir::module *mod) const{ From c172bd518b571250e880401bd23c27d3c738ac56 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 30 Jun 2019 16:55:02 -0700 Subject: [PATCH 204/494] more stuff --- examples/cpp/shift.cpp | 7 +- include/triton/dnn/shift.h | 25 ++++--- include/triton/lang/expression.h | 6 +- include/triton/lang/parser.y | 2 +- include/triton/runtime/jit.h | 6 +- lib/codegen/tune.cpp | 2 +- lib/dnn/shift.cpp | 110 +++++++++++++++++++++---------- lib/driver/module.cpp | 2 +- lib/lang/expression.cpp | 54 ++++++--------- 9 files changed, 124 insertions(+), 90 deletions(-) diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index 83082ec4d..ba4f7fa43 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -16,6 +16,7 @@ int main() { auto context = triton::driver::backend::contexts::get_default(); // initialize just-in-time compiler triton::jit jit(context); + // initialization int32_t R = 3, S = 3; int32_t BS = 32, F = 1024; @@ -30,7 +31,7 @@ int main() { shift_w[c] = rand() % S - S/2; } // configuration - triton::dnn::shift shift(BS, C, 1, H, W, 1, R, S, F, shift_h, shift_w, numeric_t_str, numeric_t_str); + triton::dnn::shift shift(BS, C, 1, H, W, 1, R, S, F, shift_h, shift_w, numeric_t_str, numeric_t_str, triton::dnn::shift::FPROP); // host buffers std::vector hc(shift.c_size()); std::vector rc(shift.c_size()); @@ -58,7 +59,7 @@ int main() { auto benchmark = [&](triton::driver::kernel* kernel, triton::jit::launch_information info) { shift.init(stream, (triton::driver::cu_module*)kernel->module()); - // launch info + // launch infoRR unsigned TM = info.global_range_size[0]; unsigned TN = info.global_range_size[1]; unsigned nthreads = info.num_threads; @@ -78,7 +79,7 @@ int main() { std::ostringstream oss; shift.src(oss); std::string src = oss.str(); -// jit.autotune("shift", src.c_str(), benchmark); + jit.autotune("shift", src.c_str(), benchmark); jit.add_module("shift", src.c_str(), params); triton::driver::kernel* kernel = jit.get_function("shift"); triton::jit::launch_information info = jit.get_launch_info("shift"); diff --git a/include/triton/dnn/shift.h b/include/triton/dnn/shift.h index 1b407aa43..3c4b53037 100644 --- a/include/triton/dnn/shift.h +++ b/include/triton/dnn/shift.h @@ -38,7 +38,9 @@ class shift { public: enum type { - FPROP + FPROP, + BPROP, + WGRAD }; private: @@ -85,11 +87,11 @@ public: OUT_DTYPE acc; for(int32_t p = 0; p < AH_; ++p) for(int32_t q = 0; q < AW_; ++q) - for(int32_t bs = 0; bs < NB_; ++bs) - for(int32_t k = 0; k < NF_; ++k) + for(int32_t bs = 0; bs < B_; ++bs) + for(int32_t k = 0; k < F_; ++k) { acc = 0; - for(int32_t c = 0; c < NC_; ++c){ + for(int32_t c = 0; c < C_; ++c){ int32_t h = p; int32_t w = q; if(h >= BH_/2 && h < AH_ - BH_/2 @@ -97,11 +99,11 @@ public: h += shift_h_[c]; w += shift_w_[c]; } - IN_DTYPE a = I[bs + w*NB_ + h*NB_*AW_ + c*NB_*AH_*AW_]; - IN_DTYPE b = F[k + c*NF_]; + IN_DTYPE a = I[bs + w*B_ + h*B_*AW_ + c*B_*AH_*AW_]; + IN_DTYPE b = F[k + c*F_]; acc = std::fma(a, b, acc); } - O[bs + q*NB_ + p*NB_*AW_ + k*NB_*AH_*AW_] = acc; + 
O[bs + q*B_ + p*B_*AW_ + k*B_*AH_*AW_] = acc; } } @@ -109,8 +111,8 @@ private: int32_t MAX_C_; int32_t TK_; // image size - int32_t NB_; - int32_t NC_; + int32_t B_; + int32_t C_; int32_t AD_; int32_t AH_; int32_t AW_; @@ -118,7 +120,7 @@ private: int32_t BD_; int32_t BH_; int32_t BW_; - int32_t NF_; + int32_t F_; // activation size int32_t CD_; int32_t CH_; @@ -149,6 +151,9 @@ private: // convolution type type ty_; bool bias_; + // transpose + bool AT_; + bool BT_; }; } diff --git a/include/triton/lang/expression.h b/include/triton/lang/expression.h index 6ce0819cb..dc9a6a449 100644 --- a/include/triton/lang/expression.h +++ b/include/triton/lang/expression.h @@ -160,13 +160,13 @@ private: class indexing_expression: public postfix_expression{ public: - indexing_expression(node *id, node *slices) - : id_((const identifier*)id), slices_((const list*)slices) {} + indexing_expression(node *lhs, node *slices) + : lhs_((const expression*)lhs), slices_((const list*)slices) {} ir::value* codegen(ir::module *) const; private: - const identifier* id_; + const expression* lhs_; const list* slices_; }; diff --git a/include/triton/lang/parser.y b/include/triton/lang/parser.y index 2c942b86c..579099e80 100644 --- a/include/triton/lang/parser.y +++ b/include/triton/lang/parser.y @@ -157,7 +157,7 @@ slice_list postfix_expression : primary_expression { $$ = $1;} - | identifier '[' slice_list ']' { $$ = new indexing_expression($1, $3);} + | primary_expression '[' slice_list ']' { $$ = new indexing_expression($1, $3);} ; /* Unary */ diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index 684bc6875..b74ae7c83 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -65,9 +65,9 @@ public: target_(target) { } void target_independent(ir::module &module) { - optimize_dot.run(module); - optimize_trans.run(module); -// ir::print(module, std::cout); + ir::print(module, std::cout); + optimize_dot.run(module); + optimize_trans.run(module); } void target_dependent(ir::module &module) { diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index ac56bd5ed..3821ecdb2 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -59,7 +59,7 @@ void tune::init_c_graph(ir::instruction *v) { else if(auto *downcast = dynamic_cast(v)) return; else{ -// std::cout << v->get_name() << std::endl; + std::cout << v->get_name() << std::endl; shapes = v->get_type()->get_tile_shapes(); } // Reshape diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index 102a970df..099192080 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -8,42 +8,63 @@ void shift::set_ld(const std::vector& shapes, std::vector& ld) { size_t size = shapes.size(); ld.resize(size); - ld[3] = 1; - ld[2] = shapes[3]*ld[3]; - ld[1] = shapes[2]*ld[2]; - ld[0] = shapes[1]*ld[1]; + ld[size - 1] = 1; + for(int i = size - 1; i >= 1; i--) + ld[i - 1] = shapes[i] * ld[i]; } -shift::shift(int B, int NC, +shift::shift(int B, int C, int D, int H, int W, int T, int R, int S, - int NF, + int F, const std::vector& shift_h, const std::vector& shift_w, std::string a_ty, std::string b_ty, type ty, bool bias) - : NB_(B), NC_(NC), + : B_(B), C_(C), AD_(D), AH_(H), AW_(W), BD_(T), BH_(R), BW_(S), - NF_(NF), + F_(F), shift_h_(shift_h), shift_w_(shift_w), a_ty_(a_ty), b_ty_(b_ty), ty_(ty), bias_(bias) { // max number of channels TK_ = 16; MAX_C_ = 8192 + TK_; + // transpose + AT_ = false; + BT_ = true; // equivalent matmul - M_ = NB_*AH_*AW_; - N_ = NF_; - K_ = NC_; + M_ = B_*AH_*AW_; + N_ = F_; + K_ = C_; // shapes - // input layout: 
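
The generalized set_ld in the hunk above replaces the hard-coded 4-D version with a row-major stride computation over any rank. A worked check of the same code, standalone:

    #include <cassert>
    #include <vector>

    void set_ld(const std::vector<int> &shapes, std::vector<int> &ld) {
        size_t size = shapes.size();
        ld.resize(size);
        ld[size - 1] = 1;
        for(int i = size - 1; i >= 1; i--)
            ld[i - 1] = shapes[i] * ld[i];
    }

    int main() {
        std::vector<int> ld;
        std::vector<int> shapes = {1024, 16, 16, 4};  // {C, H, W, B}: B contiguous
        set_ld(shapes, ld);
        assert(ld[3] == 1 && ld[2] == 4 && ld[1] == 64 && ld[0] == 1024);
        return 0;
    }
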
C, H, W, BS - // filter layout: C, K - // output layout: K, H, W, BS - shapes_a_ = {NC, H, W, B}; - shapes_b_ = {NC, NF}; - shapes_c_ = {NF, H, W, B}; + // input layout: C, H, W, B + // filter layout: C, F + // output layout: F, H, W, B + shapes_a_ = {C, H, W, B}; + shapes_b_ = {C, F}; + shapes_c_ = {F, H, W, B}; + if(ty_ == WGRAD){ + shapes_b_.swap(shapes_c_); + shapes_a_.swap(shapes_b_); + AT_ = true; + BT_ = false; + M_ = K_; + N_ = C_; + K_ = B_*AH_*AW_; + } + if(ty_ == BPROP){ + shapes_a_.swap(shapes_c_); + AT_ = false; + BT_ = false; + K_ = F_; + M_ = B_*AH_*AW_; + N_ = C_; + } // memory strides set_ld(shapes_a_, ld_a_); + set_ld(shapes_b_, ld_b_); + set_ld(shapes_c_, ld_c_); // build LUTs build_deltas(); } @@ -57,7 +78,7 @@ void shift::build_deltas() { // populate look-up table for(unsigned c = 0; c < TK_; c++) h_deltas_[c] = offset(c); - for(unsigned c = 0; c < NC_; c++) + for(unsigned c = 0; c < C_; c++) h_deltas_[TK_ + c] = offset(c + TK_) - offset(c); } @@ -99,18 +120,36 @@ void shift::enqueue(driver::stream *stream, driver::kernel *kernel, kernel->setArg(3, M_); kernel->setArg(4, N_); kernel->setArg(5, K_); - kernel->setArg(6, NB_*AH_*AW_); - kernel->setArg(7, NB_); + kernel->setArg(6, B_*AH_*AW_); + kernel->setArg(7, B_); kernel->setArg(8, AH_); kernel->setArg(9, AW_); kernel->setArg(10, BH_); kernel->setArg(11, BW_); - // dry run std::array grid = {(M_ + TM - 1)/TM, (N_ + TN - 1)/TN, 1}; stream->enqueue(kernel, grid, {nthreads, 1, 1}); } void shift::src(std::ostream &os) { + std::string AS0 = "TM", AS1 = "TK"; + std::string BS0 = "TK", BS1 = "TN"; + std::string bca0 = "[newaxis, :]", bca1 = "[:, newaxis]"; + std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; + std::string lda0 = "*lda", lda1 = ""; + std::string ldb0 = "", ldb1 = "*ldb"; + std::string usea = AT_ ? "trans(a)" : "a"; + std::string useb = BT_ ? "trans(b)" : "b"; + if(AT_){ + std::swap(AS0, AS1); + std::swap(bca0, bca1); + std::swap(lda0, lda1); + } + if(BT_){ + std::swap(BS0, BS1); + std::swap(bcb0, bcb1); + std::swap(ldb0, ldb1); + } + os << R"( const tunable int32 TM = {16, 32, 64, 128}; @@ -136,26 +175,27 @@ void shift(restrict read_only align(16) )" << a_ty_ << R"( *a, int32 raw[TM] = rawhc % AW; int32 rahc[TM] = rawhc / AW; int32 rah[TM] = rahc % AH; + __constant__ int32* pd[TK] = delta + rka; + multiple_of(4) int32 d[TK] = *pd; int1 maskh[TM] = (rah >= pad_h) && (rah < (AH - pad_h)); int1 maskw[TM] = (raw >= pad_w) && (raw < (AW - pad_w)); - int1 mask[TM, TK] = maskh[:, newaxis] && maskw[:, newaxis]; - __constant__ int32* pd[TK] = delta + rka; - multiple_of(4) int32 d[TK]; - d = *pd; - int32 offa1[TK] = rka*lda; - int32 inc[TM, TK] = mask ? d[newaxis, :] : offa1[newaxis, :]; - )" << a_ty_ << R"(* pa[TM, TK] = a + rxa[:, newaxis] + inc; - )" << b_ty_ << R"(* pb[TN, TK] = b + rkb[newaxis, :]*N + ryb[:, newaxis]; - )" << a_ty_ << R"( a[TM, TK] = *pa; - )" << b_ty_ << R"( b[TN, TK] = *pb; + int1 mask[)" << AS0 << ", " << AS1 << "] = maskh" << bca1 << " && maskw" << bca1 << R"(; + int32 inc_true[)" << AS0 << ", " << AS1 << "] = d" << bca0 << R"(; + int32 inc_false[)" << AS0 << ", " << AS1 << "] = rka" << bca0 << R"( * lda; + )" << a_ty_ << "* pa[" << AS0 << ", " << AS1 << R"(] = a + rxa)" << bca1 << R"( + (mask ? 
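
The kernel printer above emits the A and B operands in one canonical layout and, when a pass needs a transposed operand, simply swaps the paired shape, broadcast and stride strings before substitution. A standalone miniature of that scheme (string values copied from src(); the assertion shows the transposed-A case):

    #include <cassert>
    #include <string>
    #include <utility>

    int main() {
        bool AT = true;  // A transposed, as in the WGRAD pass
        std::string AS0 = "TM", AS1 = "TK";
        std::string bca0 = "[newaxis, :]", bca1 = "[:, newaxis]";
        std::string lda0 = "*lda", lda1 = "";
        if(AT) {
            std::swap(AS0, AS1);
            std::swap(bca0, bca1);
            std::swap(lda0, lda1);
        }
        // the A tile is then declared as  a_ty* pa[AS0, AS1] = ...
        assert(AS0 == "TK" && AS1 == "TM");
        return 0;
    }
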
inc_true : inc_false); + )" << b_ty_ << "* pb[" << BS0 << ", " << BS1 << "] = b + ryb" << bcb1 << " + rkb" << bcb0 << R"(*N; + )" << a_ty_ << " a[" << AS0 << ", " << AS1 << R"(] = *pa; + )" << b_ty_ << " b[" << BS0 << ", " << BS1 << R"(] = *pb; for(int32 k = K; k > 0; k = k - TK){ - C = dot(a, trans(b), C); + C = dot()" << usea << "," << useb << R"(, C); pb = pb + TK*N; pd = pd + TK; d = *pd; - pa = pa + (mask ? d[newaxis, :] : TK*lda); - int1 checka[TM, TK] = k > TK; - int1 checkb[TN, TK] = k > TK; + inc_true = d)" << bca0 << R"(; + inc_false = TK * lda; + pa = pa + (mask ? inc_true : inc_false); + int1 checka[)" << AS0 << ", " << AS1 << R"(] = k > TK; + int1 checkb[)" << BS0 << ", " << BS1 << R"(] = k > TK; @checka a = *pa; @checkb b = *pb; } diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 4ff863666..f11118401 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -255,7 +255,7 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ -// std::cout << source << std::endl; +// std::cout << source << sd::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/lib/lang/expression.cpp b/lib/lang/expression.cpp index 6054a2694..388815164 100644 --- a/lib/lang/expression.cpp +++ b/lib/lang/expression.cpp @@ -175,7 +175,7 @@ ir::value* trans_expression::codegen(ir::module *mod) const { /* Postfix expression */ ir::value* indexing_expression::codegen(ir::module *mod) const{ - ir::value *in = mod->get_value(id_->name()); + ir::value *in = lhs_->codegen(mod); const std::vector &slices = slices_->values(); auto in_shapes = in->get_type()->get_tile_shapes(); ir::type::tile_shapes_t::value_type one = ir::tile_type::make_one(mod->get_context()); @@ -234,44 +234,32 @@ ir::value* cast_expression::codegen(ir::module *mod) const{ /* Conditional expression */ ir::value *conditional_expression::codegen(ir::module *mod) const{ ir::builder &builder = mod->get_builder(); + ir::basic_block::inst_list_t &instructions = builder.get_insert_block()->get_inst_list(); ir::value *pred = cond_->codegen(mod); ir::instruction *mask = (ir::instruction*)builder.create_mask(pred); + /* true value */ ir::value *true_mask = mask->get_result(0); - ir::value *false_mask = mask->get_result(1); + auto it_true_begin = instructions.end(); + it_true_begin--; ir::value *true_value = true_value_->codegen(mod); - ir::value *false_value = false_value_->codegen(mod); - if(auto *itn = dynamic_cast(true_value)) - itn->set_mask_pred(true_mask); - if(auto *itn = dynamic_cast(false_value)) - itn->set_mask_pred(false_mask); - bool is_float, is_ptr, is_int, is_signed; - ir::value *uncasted_true_value = true_value; - ir::value *uncasted_false_value = false_value; - implicit_cast(builder, true_value, false_value, is_float, is_ptr, is_int, is_signed); implicit_broadcast(mod, pred, true_value); + it_true_begin++; + auto it_true_end = instructions.end(); + for(auto it = it_true_begin; it != it_true_end; it++) + (*it)->set_mask_pred(true_mask); + /* false value */ + ir::value *false_mask = mask->get_result(1); + auto it_false_begin = instructions.end(); + it_false_begin--; + ir::value *false_value = false_value_->codegen(mod); + 
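
At this point in conditional_expression::codegen, the list iterator captured before generating the false arm is advanced past the remembered tail, and every instruction emitted since is tagged with false_mask, mirroring the true arm above. A self-contained model of that capture-then-tag idiom (simplified stand-ins for ir::instruction and the builder's instruction list):

    #include <cassert>
    #include <list>
    #include <string>

    struct instr { std::string mask_pred; };
    typedef std::list<instr*> inst_list_t;

    int main() {
        inst_list_t insts;
        instr pre;
        insts.push_back(&pre);                        // already-emitted instruction
        auto it = insts.end(); --it;                  // remember the current tail
        instr t1, t2;
        insts.push_back(&t1); insts.push_back(&t2);   // codegen of one arm
        ++it;                                         // step past the remembered tail
        for(; it != insts.end(); ++it)
            (*it)->mask_pred = "arm_mask";            // predicate only the new instructions
        assert(pre.mask_pred.empty());
        assert(t1.mask_pred == "arm_mask" && t2.mask_pred == "arm_mask");
        return 0;
    }
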
it_false_begin++; implicit_broadcast(mod, pred, false_value); - { - ir::value *current = true_value; - while(current != uncasted_true_value) { - if(auto *itn = dynamic_cast(current)){ - itn->set_mask_pred(true_mask); - current = itn->get_operand(0); - } - else - break; - } - } - { - ir::value *current = false_value; - while(current != uncasted_false_value) { - if(auto *itn = dynamic_cast(current)){ - itn->set_mask_pred(false_mask); - current = itn->get_operand(0); - } - else - break; - } - } + auto it_false_end = instructions.end(); + for(auto it = it_false_begin; it != it_false_end; it++) + (*it)->set_mask_pred(false_mask); + /* cast */ + bool is_float, is_ptr, is_int, is_signed; + implicit_cast(builder, true_value, false_value, is_float, is_ptr, is_int, is_signed); ir::value *result = builder.create_merge(true_mask, true_value, false_mask, false_value); return result; } From 6cfb575d298389156d846348a81279693b909d7b Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 30 Jun 2019 17:43:18 -0700 Subject: [PATCH 205/494] [lang] fixup in cast type --- include/triton/ir/instructions.h | 28 ++++++++++++++-------------- include/triton/runtime/jit.h | 2 +- lib/codegen/tune.cpp | 4 +--- lib/lang/expression.cpp | 12 +++++++----- lib/lang/node.cpp | 2 ++ 5 files changed, 25 insertions(+), 23 deletions(-) diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index a08e89a85..ee22a5b25 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -237,26 +237,26 @@ private: op_t op_; }; -#define TDL_IR_DECLARE_CAST_INST_SIMPLE(name, op) \ +#define TRITON_IR_DECLARE_CAST_INST_SIMPL(name, op) \ class name : public cast_inst{ \ friend class cast_inst; \ name(type *ty, value *v, const std::string &name, instruction *next) \ : cast_inst(ty, v, name, next, op){ } \ }; -TDL_IR_DECLARE_CAST_INST_SIMPLE(trunc_inst, llvm::Instruction::CastOps::Trunc) -TDL_IR_DECLARE_CAST_INST_SIMPLE(z_ext_inst, llvm::Instruction::CastOps::ZExt) -TDL_IR_DECLARE_CAST_INST_SIMPLE(s_ext_inst, llvm::Instruction::CastOps::SExt) -TDL_IR_DECLARE_CAST_INST_SIMPLE(fp_trunc_inst, llvm::Instruction::CastOps::FPTrunc) -TDL_IR_DECLARE_CAST_INST_SIMPLE(fp_ext_inst, llvm::Instruction::CastOps::FPExt) -TDL_IR_DECLARE_CAST_INST_SIMPLE(ui_to_fp_inst, llvm::Instruction::CastOps::UIToFP) -TDL_IR_DECLARE_CAST_INST_SIMPLE(si_to_fp_inst, llvm::Instruction::CastOps::SIToFP) -TDL_IR_DECLARE_CAST_INST_SIMPLE(fp_to_ui_inst, llvm::Instruction::CastOps::FPToUI) -TDL_IR_DECLARE_CAST_INST_SIMPLE(fp_to_si_inst, llvm::Instruction::CastOps::FPToSI) -TDL_IR_DECLARE_CAST_INST_SIMPLE(ptr_to_int_inst, llvm::Instruction::CastOps::PtrToInt) -TDL_IR_DECLARE_CAST_INST_SIMPLE(int_to_ptr_inst, llvm::Instruction::CastOps::IntToPtr) -TDL_IR_DECLARE_CAST_INST_SIMPLE(bit_cast_inst, llvm::Instruction::CastOps::BitCast) -TDL_IR_DECLARE_CAST_INST_SIMPLE(addr_space_cast_inst, llvm::Instruction::CastOps::AddrSpaceCast) +TRITON_IR_DECLARE_CAST_INST_SIMPL(trunc_inst, llvm::Instruction::CastOps::Trunc) +TRITON_IR_DECLARE_CAST_INST_SIMPL(z_ext_inst, llvm::Instruction::CastOps::ZExt) +TRITON_IR_DECLARE_CAST_INST_SIMPL(s_ext_inst, llvm::Instruction::CastOps::SExt) +TRITON_IR_DECLARE_CAST_INST_SIMPL(fp_trunc_inst, llvm::Instruction::CastOps::FPTrunc) +TRITON_IR_DECLARE_CAST_INST_SIMPL(fp_ext_inst, llvm::Instruction::CastOps::FPExt) +TRITON_IR_DECLARE_CAST_INST_SIMPL(ui_to_fp_inst, llvm::Instruction::CastOps::UIToFP) +TRITON_IR_DECLARE_CAST_INST_SIMPL(si_to_fp_inst, llvm::Instruction::CastOps::SIToFP) 
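
Each instantiation of the renamed macro above expands to a thin subclass whose only job is to bake an LLVM cast opcode into the cast_inst constructor call. A self-contained miniature of the pattern (the real classes also carry type, value, name and next-instruction parameters):

    #include <cassert>

    enum cast_op { Trunc, ZExt };

    struct cast_inst {
        cast_op op_;
        explicit cast_inst(cast_op op) : op_(op) {}
    };

    #define DECLARE_CAST_INST_SIMPL(name, op) \
        struct name : public cast_inst { name() : cast_inst(op) {} };

    DECLARE_CAST_INST_SIMPL(trunc_inst, Trunc)
    DECLARE_CAST_INST_SIMPL(z_ext_inst, ZExt)

    int main() {
        trunc_inst t;
        z_ext_inst z;
        assert(t.op_ == Trunc && z.op_ == ZExt);
        return 0;
    }
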
+TRITON_IR_DECLARE_CAST_INST_SIMPL(fp_to_ui_inst, llvm::Instruction::CastOps::FPToUI) +TRITON_IR_DECLARE_CAST_INST_SIMPL(fp_to_si_inst, llvm::Instruction::CastOps::FPToSI) +TRITON_IR_DECLARE_CAST_INST_SIMPL(ptr_to_int_inst, llvm::Instruction::CastOps::PtrToInt) +TRITON_IR_DECLARE_CAST_INST_SIMPL(int_to_ptr_inst, llvm::Instruction::CastOps::IntToPtr) +TRITON_IR_DECLARE_CAST_INST_SIMPL(bit_cast_inst, llvm::Instruction::CastOps::BitCast) +TRITON_IR_DECLARE_CAST_INST_SIMPL(addr_space_cast_inst, llvm::Instruction::CastOps::AddrSpaceCast) //===----------------------------------------------------------------------===// // terminator_inst classes diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index b74ae7c83..6bc377c95 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -65,7 +65,7 @@ public: target_(target) { } void target_independent(ir::module &module) { - ir::print(module, std::cout); +// ir::print(module, std::cout); optimize_dot.run(module); optimize_trans.run(module); } diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 3821ecdb2..9f8d88492 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -58,10 +58,8 @@ void tune::init_c_graph(ir::instruction *v) { shapes = store->get_pointer_operand()->get_type()->get_tile_shapes(); else if(auto *downcast = dynamic_cast(v)) return; - else{ - std::cout << v->get_name() << std::endl; + else shapes = v->get_type()->get_tile_shapes(); - } // Reshape if(dynamic_cast(v)){ ir::value *op = v->get_operand(0); diff --git a/lib/lang/expression.cpp b/lib/lang/expression.cpp index 388815164..731d50c24 100644 --- a/lib/lang/expression.cpp +++ b/lib/lang/expression.cpp @@ -246,20 +246,22 @@ ir::value *conditional_expression::codegen(ir::module *mod) const{ it_true_begin++; auto it_true_end = instructions.end(); for(auto it = it_true_begin; it != it_true_end; it++) +// if(!dynamic_cast(*it)) (*it)->set_mask_pred(true_mask); /* false value */ ir::value *false_mask = mask->get_result(1); auto it_false_begin = instructions.end(); it_false_begin--; ir::value *false_value = false_value_->codegen(mod); - it_false_begin++; implicit_broadcast(mod, pred, false_value); - auto it_false_end = instructions.end(); - for(auto it = it_false_begin; it != it_false_end; it++) - (*it)->set_mask_pred(false_mask); - /* cast */ bool is_float, is_ptr, is_int, is_signed; implicit_cast(builder, true_value, false_value, is_float, is_ptr, is_int, is_signed); + it_false_begin++; + auto it_false_end = instructions.end(); + for(auto it = it_false_begin; it != it_false_end; it++) +// if(!dynamic_cast(*it)) + (*it)->set_mask_pred(false_mask); + /* psi */ ir::value *result = builder.create_merge(true_mask, true_value, false_mask, false_value); return result; } diff --git a/lib/lang/node.cpp b/lib/lang/node.cpp index 418a86fca..5c48657bf 100644 --- a/lib/lang/node.cpp +++ b/lib/lang/node.cpp @@ -11,6 +11,8 @@ namespace lang{ ir::value *node::explicit_cast(ir::builder &builder, ir::value *src, ir::type *dst_ty){ ir::type *src_scalar_ty = src->get_type()->get_scalar_ty(); ir::type *dst_scalar_ty = dst_ty->get_scalar_ty(); + if(src->get_type()->is_tile_ty()) + dst_ty = ir::tile_type::get_same_shapes(dst_scalar_ty, src->get_type()); bool src_signed = false; bool dst_signed = false; if(src_scalar_ty == dst_scalar_ty) From 8fc253946c740d7ffb4dc6eeb2b37bbe3f3f1f2c Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 2 Jul 2019 16:39:07 -0700 Subject: [PATCH 206/494] [codegen] shift: added sketch for shift-convolution 
backpropagation --- examples/cpp/dot.cpp | 10 +-- examples/cpp/shift.cpp | 6 +- include/triton/ir/builder.h | 1 + include/triton/ir/instructions.h | 9 +++ include/triton/lang/expression.h | 10 +++ include/triton/lang/parser.y | 3 +- include/triton/lang/scanner.l | 119 +++++++++++++++--------------- lib/codegen/selection.cpp | 7 ++ lib/codegen/tune.cpp | 8 +- lib/dnn/shift.cpp | 122 ++++++++++++++++++++++++------- lib/ir/builder.cpp | 4 + lib/ir/instructions.cpp | 13 ++++ lib/ir/type.cpp | 8 +- lib/lang/expression.cpp | 7 ++ lib/lang/node.cpp | 2 - lib/lang/statement.cpp | 9 +-- 16 files changed, 231 insertions(+), 107 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 5dbff07bb..7ff939318 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -8,7 +8,7 @@ int main() { - bool AT = false; + bool AT = true; bool BT = true; // initialize default compute device @@ -16,7 +16,7 @@ int main() { triton::jit jit(context); // matrix multiplication parameters - int32_t M = 32768, N = 1024, K = 1024; + int32_t M = 1024, N = 1024, K = 1024; std::vector hc(M*N); std::vector rc(M*N); std::vector ha(M*K); @@ -59,9 +59,9 @@ int main() { // just-in-time compile source-code - std::string src = triton::dnn::gemm::src(AT, BT, "fp32", "fp32", 1, 1); - jit.autotune("matmul",src.c_str(), benchmark); - jit.add_module("matmul", src.c_str(), triton::dnn::gemm::default_params(AT, BT)); + std::string src = triton::dnn::gemm::src(AT, BT, "fp32", "fp32", 4, 4); +// jit.autotune("matmul",src.c_str(), benchmark); + jit.add_module("matmul", src.c_str(), {8, 16, 4, 2, 16, 8, 4, 2, 2, 4, 2, 8, 8, 1}); triton::driver::kernel* kernel = jit.get_function("matmul"); triton::jit::launch_information info = jit.get_launch_info("matmul"); std::cout << "Performance: " << benchmark(kernel, info) << " TFLOPS " << std::endl; diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index ba4f7fa43..b330a3a9c 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -19,8 +19,8 @@ int main() { // initialization int32_t R = 3, S = 3; - int32_t BS = 32, F = 1024; - int32_t H = 32, W = 32; + int32_t BS = 4, F = 1024; + int32_t H = 16, W = 16; int32_t C = 1024; // random shifts @@ -31,7 +31,7 @@ int main() { shift_w[c] = rand() % S - S/2; } // configuration - triton::dnn::shift shift(BS, C, 1, H, W, 1, R, S, F, shift_h, shift_w, numeric_t_str, numeric_t_str, triton::dnn::shift::FPROP); + triton::dnn::shift shift(BS, C, 1, H, W, 1, R, S, F, shift_h, shift_w, numeric_t_str, numeric_t_str, triton::dnn::shift::BPROP); // host buffers std::vector hc(shift.c_size()); std::vector rc(shift.c_size()); diff --git a/include/triton/ir/builder.h b/include/triton/ir/builder.h index 48b1d172d..fbea4ec27 100644 --- a/include/triton/ir/builder.h +++ b/include/triton/ir/builder.h @@ -128,6 +128,7 @@ public: value *create_get_global_range(unsigned axis, type::tile_shapes_t::value_type size, const std::string &name = ""); value *create_get_range_id(unsigned axis, const std::string &name = ""); value *create_atomic_cas(value *ptr, value *cmp, value *val, const std::string &name = ""); + value *create_atomic_add(value *ptr, value *val, const std::string &name = ""); value *create_dot(value *A, value *B, value *C, const std::string &name = ""); value *create_trans(value *A, const std::string &name = ""); value *create_select(value *pred, value *if_value, value *else_value, const std::string &name = ""); diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index ee22a5b25..95ecd9c70 100644 --- 
a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -532,6 +532,15 @@ public: static instruction* create(value *ptr, value *cmp, value *val, const std::string &name = "", instruction *next = nullptr); }; +class atomic_add_inst: public builtin_inst { +private: + atomic_add_inst(value *ptr, value *val, const std::string &name = "", instruction *next = nullptr); + std::string repr_impl() const { return "atomic_add"; } + +public: + static instruction* create(value *ptr, value *val, const std::string &name = "", instruction *next = nullptr); +}; + class dot_inst: public builtin_inst { public: enum TransT { NoTrans, Trans }; diff --git a/include/triton/lang/expression.h b/include/triton/lang/expression.h index dc9a6a449..420e902a8 100644 --- a/include/triton/lang/expression.h +++ b/include/triton/lang/expression.h @@ -101,6 +101,16 @@ private: const node *val_; }; +class atomic_add_expression: public builtin_expression{ +public: + atomic_add_expression(node *ptr, node *val): ptr_(ptr), val_(val) { } + ir::value* codegen(ir::module *) const; + +private: + const node *ptr_; + const node *val_; +}; + class matmul_expression: public builtin_expression{ public: diff --git a/include/triton/lang/parser.y b/include/triton/lang/parser.y index 579099e80..6acc128e5 100644 --- a/include/triton/lang/parser.y +++ b/include/triton/lang/parser.y @@ -55,7 +55,7 @@ STORAGE_SPEC_T get_storage_spec(node *op) { return ((token*)op)->storage_spec;} %token VOID UINT1 UINT8 UINT16 UINT32 UINT64 INT1 INT8 INT16 INT32 INT64 FP16 FP32 FP64 %token IF ELSE FOR CONTINUE WHILE %token NEWAXIS ELLIPSIS AT -%token GET_GLOBAL_RANGE GET_RANGE_ID DOT TRANS MAX MIN SELECT ATOMIC_CAS ATOMIC_EXCHG ALLOC_CONST +%token GET_GLOBAL_RANGE GET_RANGE_ID DOT TRANS MAX MIN SELECT ATOMIC_CAS ATOMIC_EXCHG ATOMIC_ADD ALLOC_CONST %start translation_unit %% @@ -129,6 +129,7 @@ builtin_expression | MIN '(' expression ',' expression ')' { $$ = new min_expression($3, $5); } | SELECT '(' expression ',' expression ',' expression ')' { $$ = new select_expression($3, $5, $7); } | ATOMIC_CAS '(' expression ',' expression ',' expression ')' { $$ = new atomic_cas_expression($3, $5, $7); } + | ATOMIC_ADD '(' expression ',' expression ')' { $$ = new atomic_add_expression($3, $5); } ; /* Primary */ diff --git a/include/triton/lang/scanner.l b/include/triton/lang/scanner.l index e91b25961..24385659d 100644 --- a/include/triton/lang/scanner.l +++ b/include/triton/lang/scanner.l @@ -28,26 +28,27 @@ using triton::lang::return_void; "if" { return return_impl(IF, yytext); } "else" { return return_impl(ELSE, yytext); } "for" { return return_impl(FOR, yytext); } -"while" { return return_impl(WHILE, yytext); } -"void" { return return_impl(VOID, yytext); } +"while" { return return_impl(WHILE, yytext); } +"void" { return return_impl(VOID, yytext); } "uint1" { return return_impl(UINT1, yytext); } -"uint8" { return return_impl(UINT8, yytext); } -"uint16" { return return_impl(UINT16, yytext); } -"uint32" { return return_impl(UINT32, yytext); } -"uint64" { return return_impl(UINT64, yytext); } +"uint8" { return return_impl(UINT8, yytext); } +"uint16" { return return_impl(UINT16, yytext); } +"uint32" { return return_impl(UINT32, yytext); } +"uint64" { return return_impl(UINT64, yytext); } "int1" { return return_impl(INT1, yytext); } -"int8" { return return_impl(INT8, yytext); } -"int16" { return return_impl(INT16, yytext); } -"int32" { return return_impl(INT32, yytext); } -"int64" { return return_impl(INT64, yytext); } -"fp16" { return return_impl(FP16, 
yytext); } -"fp32" { return return_impl(FP32, yytext); } -"fp64" { return return_impl(FP64, yytext); } +"int8" { return return_impl(INT8, yytext); } +"int16" { return return_impl(INT16, yytext); } +"int32" { return return_impl(INT32, yytext); } +"int64" { return return_impl(INT64, yytext); } +"fp16" { return return_impl(FP16, yytext); } +"fp32" { return return_impl(FP32, yytext); } +"fp64" { return return_impl(FP64, yytext); } "..." { return return_impl(ELLIPSIS, yytext); } "get_global_range" { return return_impl(GET_GLOBAL_RANGE, yytext); } "get_range_id" { return return_impl(GET_RANGE_ID, yytext); } "__atomic_cas" { return return_impl(ATOMIC_CAS, yytext); } -"__atomic_exchg" { return return_impl(ATOMIC_EXCHG, yytext); } +"__atomic_exchg" { return return_impl(ATOMIC_EXCHG, yytext); } +"__atomic_add" { return return_impl(ATOMIC_ADD, yytext); } "dot" { return return_impl(DOT, yytext); } "max" { return return_impl(MAX, yytext); } "min" { return return_impl(MIN, yytext); } @@ -58,57 +59,57 @@ using triton::lang::return_void; {L}({L}|{D})* { return return_impl(IDENTIFIER, yytext); } 0[xX]{H}+{IS}? { return return_impl(CONSTANT, yytext); } 0{D}+{IS}? { return return_impl(CONSTANT, yytext); } -{D}+{IS}? { return return_impl(CONSTANT, yytext); } +{D}+{IS}? { return return_impl(CONSTANT, yytext); } L?'(\\.|[^\\'])+' { return return_impl(CONSTANT, yytext); } -{D}+{E}{FS}? { return return_impl(CONSTANT, yytext); } +{D}+{E}{FS}? { return return_impl(CONSTANT, yytext); } L?\"(\\.|[^\\"])*\" { return return_impl(STRING_LITERAL, yytext); } ">>=" { return return_impl(RIGHT_ASSIGN, yytext); } -"<<=" { return return_impl(LEFT_ASSIGN, yytext); } -"+=" { return return_impl(ADD_ASSIGN, yytext); } -"-=" { return return_impl(SUB_ASSIGN, yytext); } -"*=" { return return_impl(MUL_ASSIGN, yytext); } -"/=" { return return_impl(DIV_ASSIGN, yytext); } -"%=" { return return_impl(MOD_ASSIGN, yytext); } -"&=" { return return_impl(AND_ASSIGN, yytext); } -"^=" { return return_impl(XOR_ASSIGN, yytext); } -"|=" { return return_impl(OR_ASSIGN, yytext); } -">>" { return return_impl(RIGHT_OP, yytext); } -"<<" { return return_impl(LEFT_OP, yytext); } -"++" { return return_impl(INC_OP, yytext); } -"--" { return return_impl(DEC_OP, yytext); } -"->" { return return_impl(PTR_OP, yytext); } -"&&" { return return_impl(AND_OP, yytext); } -"||" { return return_impl(OR_OP, yytext); } -"<=" { return return_impl(LE_OP, yytext); } -">=" { return return_impl(GE_OP, yytext); } -"==" { return return_impl(EQ_OP, yytext); } -"!=" { return return_impl(NE_OP, yytext); } -";" { return return_impl(';', yytext); } +"<<=" { return return_impl(LEFT_ASSIGN, yytext); } +"+=" { return return_impl(ADD_ASSIGN, yytext); } +"-=" { return return_impl(SUB_ASSIGN, yytext); } +"*=" { return return_impl(MUL_ASSIGN, yytext); } +"/=" { return return_impl(DIV_ASSIGN, yytext); } +"%=" { return return_impl(MOD_ASSIGN, yytext); } +"&=" { return return_impl(AND_ASSIGN, yytext); } +"^=" { return return_impl(XOR_ASSIGN, yytext); } +"|=" { return return_impl(OR_ASSIGN, yytext); } +">>" { return return_impl(RIGHT_OP, yytext); } +"<<" { return return_impl(LEFT_OP, yytext); } +"++" { return return_impl(INC_OP, yytext); } +"--" { return return_impl(DEC_OP, yytext); } +"->" { return return_impl(PTR_OP, yytext); } +"&&" { return return_impl(AND_OP, yytext); } +"||" { return return_impl(OR_OP, yytext); } +"<=" { return return_impl(LE_OP, yytext); } +">=" { return return_impl(GE_OP, yytext); } +"==" { return return_impl(EQ_OP, yytext); } +"!=" { return return_impl(NE_OP, yytext); 
} +";" { return return_impl(';', yytext); } ("{"|"<%") { return return_impl('{', yytext); } ("}"|"%>") { return return_impl('}', yytext); } -"," { return return_impl(',', yytext); } -":" { return return_impl(':', yytext); } -"=" { return return_impl('=', yytext); } -"(" { return return_impl('(', yytext); } -")" { return return_impl(')', yytext); } -("["|"<:") { return return_impl('[', yytext); } -("]"|":>") { return return_impl(']', yytext); } -"." { return return_impl('.', yytext); } -"&" { return return_impl('&', yytext); } -"!" { return return_impl('!', yytext); } -"~" { return return_impl('~', yytext); } -"-" { return return_impl('-', yytext); } -"+" { return return_impl('+', yytext); } -"*" { return return_impl('*', yytext); } -"/" { return return_impl('/', yytext); } -"%" { return return_impl('%', yytext); } -"<" { return return_impl('<', yytext); } -">" { return return_impl('>', yytext); } -"^" { return return_impl('^', yytext); } -"|" { return return_impl('|', yytext); } -"?" { return return_impl('?', yytext); } +"," { return return_impl(',', yytext); } +":" { return return_impl(':', yytext); } +"=" { return return_impl('=', yytext); } +"(" { return return_impl('(', yytext); } +")" { return return_impl(')', yytext); } +("["|"<:") { return return_impl('[', yytext); } +("]"|":>") { return return_impl(']', yytext); } +"." { return return_impl('.', yytext); } +"&" { return return_impl('&', yytext); } +"!" { return return_impl('!', yytext); } +"~" { return return_impl('~', yytext); } +"-" { return return_impl('-', yytext); } +"+" { return return_impl('+', yytext); } +"*" { return return_impl('*', yytext); } +"/" { return return_impl('/', yytext); } +"%" { return return_impl('%', yytext); } +"<" { return return_impl('<', yytext); } +">" { return return_impl('>', yytext); } +"^" { return return_impl('^', yytext); } +"|" { return return_impl('|', yytext); } +"?" { return return_impl('?', yytext); } [ \t\v\n\f] { return_void(yytext);} -. { /* ignore bad characters */ } +. 
{ /* ignore bad characters */ } %% diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index d0f6f825e..3f0c02d2a 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -373,6 +373,13 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::function(inst)){ + Value *ptr = value(ii->get_operand(0)); + Value *val = value(ii->get_operand(1)); + Value *atom_f_add = Intrinsic::getDeclaration(builder.GetInsertBlock()->getModule(), Intrinsic::nvvm_atomic_load_add_f32, {ptr->getType()}); + Value *res = builder.CreateCall(atom_f_add, {ptr, val}); + return (Instruction*)res; + } // unknown instruction throw std::runtime_error("unknown conversion from ir::instruction to Instruction"); } diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 9f8d88492..6bdc97759 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -56,6 +56,8 @@ void tune::init_c_graph(ir::instruction *v) { ir::type::tile_shapes_t shapes; if(auto *store = dynamic_cast(v)) shapes = store->get_pointer_operand()->get_type()->get_tile_shapes(); + else if(auto *atom = dynamic_cast(v)) + shapes = atom->get_operand(0)->get_type()->get_tile_shapes(); else if(auto *downcast = dynamic_cast(v)) return; else @@ -233,13 +235,13 @@ void tune::run(ir::module &mod) { continue; if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 4, 4)); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 2, 2)); *params_.at(i).at("nts.d0") = *tmp; } if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 4, 4)); - std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 4, 4)); + std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 2, 2)); + std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 2, 2)); *params_.at(i).at("nts.d0") = *tmp1; *params_.at(i).at("nts.d1") = *tmp2; } diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index 099192080..e54ac4bdb 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -114,6 +114,8 @@ void shift::init(driver::stream *stream, driver::cu_module *module) { void shift::enqueue(driver::stream *stream, driver::kernel *kernel, driver::buffer *a, driver::buffer *b, driver::buffer *c, size_t TM, size_t TN, size_t nthreads) { + if(ty_ == WGRAD) + std::swap(a, b); kernel->setArg(0, a); kernel->setArg(1, b); kernel->setArg(2, c); @@ -121,24 +123,35 @@ void shift::enqueue(driver::stream *stream, driver::kernel *kernel, kernel->setArg(4, N_); kernel->setArg(5, K_); kernel->setArg(6, B_*AH_*AW_); - kernel->setArg(7, B_); - kernel->setArg(8, AH_); - kernel->setArg(9, AW_); - kernel->setArg(10, BH_); - kernel->setArg(11, BW_); + kernel->setArg(7, N_); + kernel->setArg(8, B_); + kernel->setArg(9, AH_); + kernel->setArg(10, AW_); + kernel->setArg(11, BH_); + kernel->setArg(12, BW_); std::array grid = {(M_ + TM - 1)/TM, (N_ + TN - 1)/TN, 1}; + if(ty_ == BPROP) + ((driver::cu_buffer*)c)->set_zero(stream, M_*N_*4); stream->enqueue(kernel, grid, {nthreads, 1, 1}); } void shift::src(std::ostream &os) { std::string AS0 = "TM", AS1 = "TK"; std::string BS0 = "TK", BS1 = "TN"; - std::string bca0 = "[newaxis, :]", bca1 = "[:, newaxis]"; std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; - std::string lda0 = "*lda", lda1 = ""; std::string ldb0 = "", ldb1 = "*ldb"; std::string usea = AT_ ? "trans(a)" : "a"; std::string useb = BT_ ? 
"trans(b)" : "b"; + std::string rkb = "rkb"; + std::string rka = "rka"; + std::string bca0 = "[newaxis, :]", bca1 = "[:, newaxis]"; + std::string lda0 = "*lda", lda1 = ""; + if(ty_ == FPROP){ + rka = "inc"; + bca0 = ""; + lda0 = ""; + } + if(AT_){ std::swap(AS0, AS1); std::swap(bca0, bca1); @@ -149,6 +162,8 @@ void shift::src(std::ostream &os) { std::swap(bcb0, bcb1); std::swap(ldb0, ldb1); } + std::string AS = AS0 + ", " + AS1; + std::string BS = BS0 + ", " + BS1; os << R"( @@ -161,8 +176,8 @@ __constant__ int32* delta = alloc_const int32[)" << MAX_C_ << R"(]; void shift(restrict read_only align(16) )" << a_ty_ << R"( *a, restrict read_only align(16) )" << b_ty_ << R"( *b, fp32 *c, - multiple_of(4) int32 M, multiple_of(4) int32 N, multiple_of(4) int32 K, - multiple_of(4) int32 lda, + int32 M, int32 N, int32 K, + multiple_of(4) int32 lda, multiple_of(4) int32 ldb, int32 ABS, int32 AH, int32 AW, int32 AR, int32 AS) { int32 rxa[TM] = get_global_range[TM](0); int32 ryb[TN] = get_global_range[TN](1); @@ -170,7 +185,9 @@ void shift(restrict read_only align(16) )" << a_ty_ << R"( *a, int32 rkb[TK] = 0 ... TK; fp32 C[TM, TN] = 0; int32 pad_h = AR / 2; - int32 pad_w = AS / 2; + int32 pad_w = AS / 2;)"; +if(ty_ == FPROP){ + os << R"( int32 rawhc[TM] = rxa / ABS; int32 raw[TM] = rawhc % AW; int32 rahc[TM] = rawhc / AW; @@ -179,35 +196,86 @@ void shift(restrict read_only align(16) )" << a_ty_ << R"( *a, multiple_of(4) int32 d[TK] = *pd; int1 maskh[TM] = (rah >= pad_h) && (rah < (AH - pad_h)); int1 maskw[TM] = (raw >= pad_w) && (raw < (AW - pad_w)); - int1 mask[)" << AS0 << ", " << AS1 << "] = maskh" << bca1 << " && maskw" << bca1 << R"(; - int32 inc_true[)" << AS0 << ", " << AS1 << "] = d" << bca0 << R"(; - int32 inc_false[)" << AS0 << ", " << AS1 << "] = rka" << bca0 << R"( * lda; - )" << a_ty_ << "* pa[" << AS0 << ", " << AS1 << R"(] = a + rxa)" << bca1 << R"( + (mask ? inc_true : inc_false); - )" << b_ty_ << "* pb[" << BS0 << ", " << BS1 << "] = b + ryb" << bcb1 << " + rkb" << bcb0 << R"(*N; - )" << a_ty_ << " a[" << AS0 << ", " << AS1 << R"(] = *pa; - )" << b_ty_ << " b[" << BS0 << ", " << BS1 << R"(] = *pb; + int1 mask[TM, TK] = maskh[:, newaxis] && maskw[:, newaxis]; + int32 inc_true[TM, TK] = d[newaxis, :]; + int32 inc_false[TM, TK] = rka[newaxis, :] * lda; + int32 inc[TM, TK] = mask ? inc_true : inc_false;)"; +} +if(ty_ == WGRAD){ + os << R"( + int32 shift[TK, TN] = 0;)"; +} + os << R"( + )" << a_ty_ << "* pa[" << AS << "] = a + rxa" << bca1 << " + " << rka << bca0 << lda0 << R"(; + )" << b_ty_ << "* pb[" << BS << "] = b + ryb" << bcb1 << " + " << rkb << bcb0 << ldb0 << R"(; + )" << a_ty_ << " a[" << AS << R"(] = *pa; + )" << b_ty_ << " b[" << BS << R"(] = *pb; for(int32 k = K; k > 0; k = k - TK){ C = dot()" << usea << "," << useb << R"(, C); - pb = pb + TK*N; + int1 checka[)" << AS << R"(] = k > TK; + int1 checkb[)" << BS << R"(] = k > TK;)"; +if(ty_ == FPROP){ + os << R"( pd = pd + TK; d = *pd; - inc_true = d)" << bca0 << R"(; + inc_true = d[newaxis, :]; inc_false = TK * lda; - pa = pa + (mask ? inc_true : inc_false); - int1 checka[)" << AS0 << ", " << AS1 << R"(] = k > TK; - int1 checkb[)" << BS0 << ", " << BS1 << R"(] = k > TK; - @checka a = *pa; - @checkb b = *pb; + inc = mask ? 
inc_true : inc_false; + pa = pa + inc; + @checka a = *pa;)"; +} +else{ + os << R"( + pa = pa + TK)" << lda0 << R"(; + @checka a = *pa;)"; +} +if(ty_ == WGRAD){ + os << R"( + int32 rbwhc[TK] = rkb / ABS; + int32 rbw[TK] = rbwhc % AW; + int32 rbhc[TK] = rbwhc / AW; + int32 rbh[TK] = rbhc % AH; + int1 maskh[TK] = (rbh >= pad_h) && (rbh < (AH - pad_h)); + int1 maskw[TK] = (rbw >= pad_w) && (rbw < (AW - pad_w)); + int1 mask[TK, TN] = maskh[:, newaxis] && maskw[:, newaxis]; + int32 inc[TK, TN] = mask ? 0 : shift; + pb = pb + TK; + )" << b_ty_ << R"(* pbb[TK, TN] = pb + inc; + @checkb b = *pbb;)"; +} +else{ + os << R"( + pb = pb + TK)" << ldb0 << R"(; + @checkb b = *pb;)"; +} + os << R"( } int32 rxc[TM] = get_global_range[TM](0); int32 ryc[TN] = get_global_range[TN](1); fp32* pc[TM, TN] = c + ryc[newaxis, :]*M + rxc[:, newaxis]; int1 checkc0[TM] = rxc < M; int1 checkc1[TN] = ryc < N; - int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - @checkc *pc = C; + int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];)"; +if(ty_ == BPROP){ + os << R"( + int32 rcwhc[TM] = rxc / ABS; + int32 rcw[TM] = rcwhc % AW; + int32 rchc[TM] = rcwhc / AW; + int32 rch[TM] = rchc % AH; + int1 maskh[TM] = (rch >= pad_h) && (rch < (AH - pad_h)); + int1 maskw[TM] = (rcw >= pad_w) && (rcw < (AW - pad_w)); + int1 interior[TM, TN] = maskh[:, newaxis] && maskw[:, newaxis]; + fp32* shiftpc[TM, TN] = pc + 0; + pc = interior ? shiftpc : pc; + @checkc __atomic_add(pc, C); + )"; } -)"; +else{ + os << R"( + @checkc *pc = C;)"; +} + os << R"( +})"; } } diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index 5de366045..7110edccf 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -308,6 +308,10 @@ value *builder::create_atomic_cas(value *ptr, value *cmp, value *val, const std: return insert(atomic_cas_inst::create(ptr, cmp, val, name)); } +value *builder::create_atomic_add(value *ptr, value *val, const std::string &name){ + return insert(atomic_add_inst::create(ptr, val, name)); +} + value *builder::create_dot(value *A, value *B, value *C, const std::string &name) { return insert(dot_inst::create_nn(A, B, C, name)); } diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 9b5d37094..d1ae2af25 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -620,6 +620,19 @@ atomic_cas_inst::atomic_cas_inst(value *ptr, value *cmp, value *val, const std:: instruction* atomic_cas_inst::create(value *ptr, value *cmp, value *val, const std::string &name, instruction *next) { return new atomic_cas_inst(ptr, cmp, val, name, next); } + +// atomic add + +atomic_add_inst::atomic_add_inst(value *ptr, value *val, const std::string &name, instruction *next) + : builtin_inst(ptr->get_type()->get_pointer_element_ty(), 2, 1, name, next) { + set_operand(0, ptr); + set_operand(1, val); +} + +instruction* atomic_add_inst::create(value *ptr, value *val, const std::string &name, instruction *next) { + return new atomic_add_inst(ptr, val, name, next); +} + //===----------------------------------------------------------------------===// // intrinsic instructions //===----------------------------------------------------------------------===// diff --git a/lib/ir/type.cpp b/lib/ir/type.cpp index 215e8f746..e192b7431 100644 --- a/lib/ir/type.cpp +++ b/lib/ir/type.cpp @@ -59,8 +59,12 @@ unsigned type::get_pointer_address_space() const { } type * type::get_pointer_element_ty() const { - assert(is_pointer_ty()); - return ((pointer_type*)this)->get_element_ty(); + type *ptr_ty = get_scalar_ty(); + 
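+  // Editorial note, not part of the original patch: routing through
+  // get_scalar_ty() lets a tile of pointers report its element type as a
+  // tile of the pointee type with matching shapes (rebuilt below with
+  // tile_type::get_same_shapes) instead of tripping the old assertion.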
assert(ptr_ty->is_pointer_ty()); + type *scalar_ty = ((pointer_type*)ptr_ty)->get_element_ty(); + if(is_tile_ty()) + return tile_type::get_same_shapes(scalar_ty, (type*)this); + return scalar_ty; } diff --git a/lib/lang/expression.cpp b/lib/lang/expression.cpp index 731d50c24..7b1b4888c 100644 --- a/lib/lang/expression.cpp +++ b/lib/lang/expression.cpp @@ -130,6 +130,13 @@ ir::value* atomic_cas_expression::codegen(ir::module *mod) const { return mod->get_builder().create_atomic_cas(ptr, cmp, val); } +// atomic add +ir::value* atomic_add_expression::codegen(ir::module *mod) const { + ir::value *ptr = ptr_->codegen(mod); + ir::value *val = val_->codegen(mod); + return mod->get_builder().create_atomic_add(ptr, val); +} + // matmul ir::value* matmul_expression::codegen(ir::module *mod) const { ir::value *A = A_->codegen(mod); diff --git a/lib/lang/node.cpp b/lib/lang/node.cpp index 5c48657bf..940b4f2b9 100644 --- a/lib/lang/node.cpp +++ b/lib/lang/node.cpp @@ -53,8 +53,6 @@ void node::implicit_cast(ir::builder &builder, ir::value *&lhs, ir::value *&rhs, ir::type *right_ty = rhs->get_type()->get_scalar_ty(); // One operand is pointer if(left_ty->is_pointer_ty() || right_ty->is_pointer_ty()){ - if(left_ty->is_pointer_ty() && right_ty->is_pointer_ty()) - throw std::runtime_error("invalid operands"); if(right_ty->is_pointer_ty()) std::swap(lhs, rhs); is_ptr = true; diff --git a/lib/lang/statement.cpp b/lib/lang/statement.cpp index d67c62c8f..2e6460620 100644 --- a/lib/lang/statement.cpp +++ b/lib/lang/statement.cpp @@ -31,9 +31,6 @@ ir::value* expression_statement::codegen(ir::module *mod) const{ ir::builder &builder = mod->get_builder(); ir::basic_block *block = builder.get_insert_block(); if(pred_) { - // check that it is an assignment - assignment_expression *assignment = dynamic_cast(expr_); - assert(assignment); // generate mask ir::value *pred = pred_->codegen(mod); ir::mask_inst *mask = (ir::mask_inst*)builder.create_mask(pred); @@ -53,8 +50,10 @@ ir::value* expression_statement::codegen(ir::module *mod) const{ // merge with psi ir::psi_inst *psi = (ir::psi_inst*)builder.create_merge(mask->get_result(0), expr, mask->get_result(1), ir::undef_value::get(ty)); - std::string name = ((named_expression*)assignment->lvalue())->id()->name(); - mod->set_value(name, psi); + if(assignment_expression *assignment = dynamic_cast(expr_)){ + std::string name = ((named_expression*)assignment->lvalue())->id()->name(); + mod->set_value(name, psi); + } return psi; } return expr_->codegen(mod); From 5144dc3a6c56c802ece4f595a73d5f65956765c4 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 2 Jul 2019 20:45:10 -0700 Subject: [PATCH 207/494] [examples/python] added framework code for shift-conv --- examples/python/tensorflow/CMakeLists.txt | 2 +- examples/python/tensorflow/run.py | 27 +++++- examples/python/tensorflow/shift.cpp | 111 ++++++++++++++++++++++ 3 files changed, 135 insertions(+), 5 deletions(-) create mode 100644 examples/python/tensorflow/shift.cpp diff --git a/examples/python/tensorflow/CMakeLists.txt b/examples/python/tensorflow/CMakeLists.txt index bfd54f6a6..08ba828e3 100644 --- a/examples/python/tensorflow/CMakeLists.txt +++ b/examples/python/tensorflow/CMakeLists.txt @@ -5,7 +5,7 @@ if(${TensorFlow_FOUND}) include_directories("${CUDA_HOME}/include") link_directories(${TF_LIB}) add_definitions(-D_GLIBCXX_USE_CXX11_ABI=${TF_ABI}) - add_library(tf_blocksparse SHARED dot.cpp conv2d.cpp) + add_library(tf_blocksparse SHARED dot.cpp conv2d.cpp shift.cpp) target_link_libraries(tf_blocksparse 
tensorflow_framework triton) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/run.py ${CMAKE_CURRENT_BINARY_DIR}/run.py diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index dca626b1a..63acfdf2a 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -28,13 +28,13 @@ def run_dot(): print("dif: %f" % np.max(dif)) def run_conv(): - BS, C, H, W = 16, 32, 32, 32 + B, C, H, W = 16, 32, 32, 32 R, S, NF = 3, 3, 32 - a = tf.placeholder(tf.float32, shape=[BS, C, H, W]) + a = tf.placeholder(tf.float32, shape=[B, C, H, W]) b = tf.placeholder(tf.float32, shape=[C, R, S, NF]) c = module.conv2d(a, b) # Reference - ha = np.random.rand(BS, C, H, W) + ha = np.random.rand(B, C, H, W) hb = np.random.rand(C, R, S, NF) # Run sess = tf.InteractiveSession() @@ -42,4 +42,23 @@ def run_conv(): result = sess.run([c], feed_dict = {a: ha, b: hb})[0] -run_conv() +def run_shift(): + B, C, H, W = 16, 32, 32, 32 + R, S, F = 3, 3, 32 + a = tf.placeholder(tf.float32, shape=[C, H, W, B]) + b = tf.placeholder(tf.float32, shape=[C, F]) + shift_h = tf.zeros(C, tf.int32) + shift_w = tf.zeros(C, tf.int32) + hshift_h = np.zeros(C, np.int32) + hshift_w = np.zeros(C, np.int32) + c = module.shift_conv(a, b, shift_h=tf.make_tensor_proto(hshift_h), shift_w=tf.make_tensor_proto(hshift_w)) + # Reference + ha = np.random.rand(C, H, W, B) + hb = np.random.rand(C, F) + # Run + sess = tf.InteractiveSession() + sess.run(tf.global_variables_initializer()) + result = sess.run([c], feed_dict = {a: ha, + b: hb})[0] + +run_shift() diff --git a/examples/python/tensorflow/shift.cpp b/examples/python/tensorflow/shift.cpp new file mode 100644 index 000000000..2fe366de6 --- /dev/null +++ b/examples/python/tensorflow/shift.cpp @@ -0,0 +1,111 @@ +#include + +#include "triton/driver/buffer.h" +#include "triton/driver/backend.h" +#include "triton/driver/stream.h" +#include "triton/runtime/jit.h" +#include "triton/tools/bench.hpp" +#include "triton/dnn/shift.h" + +#define EIGEN_USE_GPU +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/util/tensor_format.h" +#include "tensorflow/core/framework/common_shape_fns.h" + +using namespace tensorflow; +using GPUDevice = Eigen::GpuDevice; + +class ShiftConvOp : public OpKernel { +public: + explicit ShiftConvOp(OpKernelConstruction* context) : OpKernel(context) { + context->GetAttr("shift_h", &h_shift_h_); + context->GetAttr("shift_w", &h_shift_w_); + R_ = 3; + S_ = 3; + } + + void ComputeCommon(OpKernelContext* context){ + + } + + void Compute(OpKernelContext* context){ + // get device/stream + GPUDevice device = context->eigen_device(); + triton::driver::cu_stream sstream(device.stream(), false); + triton::driver::context* ctx = sstream.context(); + triton::driver::stream* stream = &sstream; + // get inputs + const Tensor& tf_a = context->input(0); + const Tensor& tf_b = context->input(1); + // shapes for a + int64_t Ca = tf_a.dim_size(0); + int64_t H = tf_a.dim_size(1); + int64_t W = tf_a.dim_size(2); + int64_t B = tf_a.dim_size(3); + // shapes for b + int64_t Cb = tf_b.dim_size(0); + int64_t F = tf_b.dim_size(1); + // checks + OP_REQUIRES(context, Ca == Cb, tensorflow::errors::InvalidArgument("operands must have the same number of channels")); + int64_t C = Ca; + // shapes for c + Tensor* tf_c = nullptr; + TensorShape out_shape({Ca, 
H, W, B}); + OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &tf_c)); + // return early if possible + if (out_shape.num_elements() == 0) + return; + // initialize default compute device + triton::jit jit(ctx); + // matrix multiplication parameters + triton::driver::cu_buffer da(ctx, (CUdeviceptr)tf_a.flat().data(), false); + triton::driver::cu_buffer db(ctx, (CUdeviceptr)tf_b.flat().data(), false); + triton::driver::cu_buffer dc(ctx, (CUdeviceptr)tf_c->flat().data(), false); + // shift configuration + int32_t* shift_h_data = h_shift_h_.flat().data(); + int32_t* shift_w_data = h_shift_w_.flat().data(); + std::vector shift_h(shift_h_data, shift_h_data + C); + std::vector shift_w(shift_w_data, shift_w_data + C); + triton::dnn::shift shift(B, C, 1, H, W, 1, R_, S_, F, shift_h, shift_w, "fp32", "fp32", triton::dnn::shift::FPROP, false); + // benchmark a given matrix multiplication kernel + auto benchmark = [&](triton::driver::kernel* kernel, + triton::jit::launch_information info) { + // launch info + unsigned TM = info.global_range_size[0]; + unsigned TN = info.global_range_size[1]; + unsigned nthreads = info.num_threads; + shift.init(stream, (triton::driver::cu_module*)kernel->module()); + shift.enqueue(stream, kernel, &da, &db, &dc, TM, TN, nthreads); + stream->synchronize(); + double ts = triton::tools::bench([&](){ shift.enqueue(stream, kernel, &da, &db, &dc, TM, TN, nthreads); }, + [&](){ stream->synchronize(); }, ctx->device()); + return shift.get_nflops() / ts * 1e-3; + }; + + std::ostringstream oss; + shift.src(oss); + std::string src = oss.str(); + triton::jit::tune_res_t best = jit.autotune("shift", src.c_str(), benchmark); + } + +private: + Tensor h_shift_h_; + Tensor h_shift_w_; +// triton::driver::buffer* d_shift_h_; +// triton::driver::buffer* d_shift_w_; + int R_; + int S_; +}; + +REGISTER_KERNEL_BUILDER(Name("ShiftConv").Device(DEVICE_GPU), ShiftConvOp); +REGISTER_OP("ShiftConv") + .Input("a: float32") + .Input("b: float32") + .Attr("shift_h: tensor") + .Attr("shift_w: tensor") + .Output("c: float32") +; From 0d8faa5b1e093b7059d60a3d8bd4d9da9fb429db Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 2 Jul 2019 21:38:10 -0700 Subject: [PATCH 208/494] fixup --- examples/python/tensorflow/shift.cpp | 73 +++++++++++++++++++--------- 1 file changed, 51 insertions(+), 22 deletions(-) diff --git a/examples/python/tensorflow/shift.cpp b/examples/python/tensorflow/shift.cpp index 2fe366de6..812912704 100644 --- a/examples/python/tensorflow/shift.cpp +++ b/examples/python/tensorflow/shift.cpp @@ -19,6 +19,7 @@ using namespace tensorflow; using GPUDevice = Eigen::GpuDevice; +template class ShiftConvOp : public OpKernel { public: explicit ShiftConvOp(OpKernelConstruction* context) : OpKernel(context) { @@ -28,7 +29,40 @@ public: S_ = 3; } - void ComputeCommon(OpKernelContext* context){ + void FillShapes(OpKernelContext* context, + int64_t &C, int64_t &H, int64_t &W, int64_t &B, int64_t &F, + const Tensor& tf_a, const Tensor& tf_b) { + if(OP == triton::dnn::shift::WGRAD) { + // shapes for a + F = tf_a.dim_size(0); + int64_t Ha = tf_a.dim_size(1); + int64_t Wa = tf_a.dim_size(2); + int64_t Ba = tf_a.dim_size(3); + // shapes for b + C = tf_b.dim_size(0); + int64_t Hb = tf_b.dim_size(1); + int64_t Wb = tf_b.dim_size(2); + int64_t Bb = tf_b.dim_size(3); + OP_REQUIRES(context, Ha == Hb, tensorflow::errors::InvalidArgument("operands must have the same image height")); + OP_REQUIRES(context, Wa == Wb, tensorflow::errors::InvalidArgument("operands must have the same image 
width")); + OP_REQUIRES(context, Ba == Bb, tensorflow::errors::InvalidArgument("operands must have the same batch size")); + H = Ha; + W = Wa; + B = Ba; + } + else { + // shapes for a + int64_t Ca = tf_a.dim_size(0); + H = tf_a.dim_size(1); + W = tf_a.dim_size(2); + B = tf_a.dim_size(3); + // shapes for b + int64_t Cb = tf_b.dim_size(0); + F = tf_b.dim_size(1); + // checks + OP_REQUIRES(context, Ca == Cb, tensorflow::errors::InvalidArgument("operands must have the same number of channels")); + C = Ca; + } } @@ -41,23 +75,24 @@ public: // get inputs const Tensor& tf_a = context->input(0); const Tensor& tf_b = context->input(1); - // shapes for a - int64_t Ca = tf_a.dim_size(0); - int64_t H = tf_a.dim_size(1); - int64_t W = tf_a.dim_size(2); - int64_t B = tf_a.dim_size(3); - // shapes for b - int64_t Cb = tf_b.dim_size(0); - int64_t F = tf_b.dim_size(1); - // checks - OP_REQUIRES(context, Ca == Cb, tensorflow::errors::InvalidArgument("operands must have the same number of channels")); - int64_t C = Ca; + // shapes + int64_t C, H, W, B, F; + FillShapes(context, C, H, W, B, F, tf_a, tf_b); + // shift configuration + int32_t* shift_h_data = h_shift_h_.flat().data(); + int32_t* shift_w_data = h_shift_w_.flat().data(); + std::vector shift_h(shift_h_data, shift_h_data + C); + std::vector shift_w(shift_w_data, shift_w_data + C); + triton::dnn::shift shift(B, C, 1, H, W, 1, R_, S_, F, shift_h, shift_w, "fp32", "fp32", OP, false); // shapes for c + std::vector c_shapes; + for(int32_t x: shift.c_shapes()) + c_shapes.push_back(x); + TensorShape out_shapes(c_shapes); Tensor* tf_c = nullptr; - TensorShape out_shape({Ca, H, W, B}); - OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &tf_c)); + OP_REQUIRES_OK(context, context->allocate_output(0, out_shapes, &tf_c)); // return early if possible - if (out_shape.num_elements() == 0) + if (out_shapes.num_elements() == 0) return; // initialize default compute device triton::jit jit(ctx); @@ -65,12 +100,6 @@ public: triton::driver::cu_buffer da(ctx, (CUdeviceptr)tf_a.flat().data(), false); triton::driver::cu_buffer db(ctx, (CUdeviceptr)tf_b.flat().data(), false); triton::driver::cu_buffer dc(ctx, (CUdeviceptr)tf_c->flat().data(), false); - // shift configuration - int32_t* shift_h_data = h_shift_h_.flat().data(); - int32_t* shift_w_data = h_shift_w_.flat().data(); - std::vector shift_h(shift_h_data, shift_h_data + C); - std::vector shift_w(shift_w_data, shift_w_data + C); - triton::dnn::shift shift(B, C, 1, H, W, 1, R_, S_, F, shift_h, shift_w, "fp32", "fp32", triton::dnn::shift::FPROP, false); // benchmark a given matrix multiplication kernel auto benchmark = [&](triton::driver::kernel* kernel, triton::jit::launch_information info) { @@ -101,7 +130,7 @@ private: int S_; }; -REGISTER_KERNEL_BUILDER(Name("ShiftConv").Device(DEVICE_GPU), ShiftConvOp); +REGISTER_KERNEL_BUILDER(Name("ShiftConv").Device(DEVICE_GPU), ShiftConvOp); REGISTER_OP("ShiftConv") .Input("a: float32") .Input("b: float32") From 1d88f0a36b3555c68bfbc0f14953876d0cc747c5 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 3 Jul 2019 19:25:16 -0700 Subject: [PATCH 209/494] stuff --- examples/cpp/shift.cpp | 4 +- examples/python/tensorflow/run.py | 42 +++++++--- examples/python/tensorflow/shift.cpp | 111 ++++++++++++++++++++------- include/triton/runtime/jit.h | 1 + lib/dnn/shift.cpp | 55 +++++++------ lib/driver/module.cpp | 2 +- lib/runtime/jit.cpp | 40 ++++++++++ 7 files changed, 194 insertions(+), 61 deletions(-) diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp 
index b330a3a9c..3d7646d9e 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -74,12 +74,12 @@ int main() { // shift std::vector params = { - 16, 4, 64, 16, 4, 128, 2, 2, 1, 2, 4, 4, 16, 4 + 4, 2, 16, 4, 128, 2, 2, 1, 1, 8, 16, 8, 2 }; std::ostringstream oss; shift.src(oss); std::string src = oss.str(); - jit.autotune("shift", src.c_str(), benchmark); +// jit.autotune("shift", src.c_str(), benchmark); jit.add_module("shift", src.c_str(), params); triton::driver::kernel* kernel = jit.get_function("shift"); triton::jit::launch_information info = jit.get_launch_info("shift"); diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 63acfdf2a..cd6365f52 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -1,7 +1,9 @@ import os import tensorflow as tf +from tensorflow.python.framework import ops import numpy as np from time import time + data_files_path = tf.resource_loader.get_data_files_path() library_dir = os.path.dirname(os.path.realpath(__file__)) module = tf.load_op_library(os.path.join(library_dir, 'libtf_blocksparse.so')) @@ -42,23 +44,45 @@ def run_conv(): result = sess.run([c], feed_dict = {a: ha, b: hb})[0] + +@ops.RegisterGradient('ShiftConv') +def blocksparse_matmul_grad(op, dy): + shift_h = op.get_attr('shift_h') + shift_w = op.get_attr('shift_w') + x = op.inputs[0] + w = op.inputs[1] + dx = module.shift_conv_dx(dy, w, shift_h=shift_h, shift_w=shift_w) + dw = module.shift_conv_dw(dy, x, shift_h=shift_h, shift_w=shift_w) + return (dx, dw) + def run_shift(): - B, C, H, W = 16, 32, 32, 32 - R, S, F = 3, 3, 32 + B, C, H, W = 1, 16, 8, 8 + R, S, F = 3, 3, 16 a = tf.placeholder(tf.float32, shape=[C, H, W, B]) b = tf.placeholder(tf.float32, shape=[C, F]) - shift_h = tf.zeros(C, tf.int32) - shift_w = tf.zeros(C, tf.int32) - hshift_h = np.zeros(C, np.int32) - hshift_w = np.zeros(C, np.int32) + #hshift_h = np.random.randint(-R//2, R//2 + 1, size=C, dtype=np.int32) + #hshift_w = np.random.randint(-S//2, R//2 + 1, size=C, dtype=np.int32) + hshift_h = 0*np.ones(C, dtype=np.int32) + hshift_w = 0*np.ones(C, dtype=np.int32) c = module.shift_conv(a, b, shift_h=tf.make_tensor_proto(hshift_h), shift_w=tf.make_tensor_proto(hshift_w)) # Reference - ha = np.random.rand(C, H, W, B) - hb = np.random.rand(C, F) - # Run + ha = np.ones((C, H, W, B), dtype=np.int32) + hb = np.ones((C, F), dtype=np.int32) sess = tf.InteractiveSession() + grads = tf.test.compute_gradient([a, b], [(C, H, W, B), (C, F)], c, (C, H, W, B), + extra_feed_dict={a: ha, b: hb}) + dx_t, dx_n = grads[0] + dw_t, dw_n = grads[1] + print(dw_t) + print(dw_n) + #print(np.max(dw_t - dw_n)) + #print(np.max(dx_t - dx_n)) + np.savetxt('theoretical.dat', dw_t, fmt='%4.2f') + np.savetxt('numerical.dat', dw_n, fmt='%4.2f') + # Run sess.run(tf.global_variables_initializer()) result = sess.run([c], feed_dict = {a: ha, b: hb})[0] + #print(result) run_shift() diff --git a/examples/python/tensorflow/shift.cpp b/examples/python/tensorflow/shift.cpp index 812912704..a049f869d 100644 --- a/examples/python/tensorflow/shift.cpp +++ b/examples/python/tensorflow/shift.cpp @@ -19,6 +19,15 @@ using namespace tensorflow; using GPUDevice = Eigen::GpuDevice; +typedef std::tuple shift_key_t; + +static std::map> m_stream; +static std::map> m_jit; +static std::map> m_config; + template class ShiftConvOp : public OpKernel { public: @@ -78,15 +87,27 @@ public: // shapes int64_t C, H, W, B, F; FillShapes(context, C, H, W, B, F, tf_a, tf_b); + int64_t D = 1, T = 1; + bool has_bias = 
false; // shift configuration int32_t* shift_h_data = h_shift_h_.flat().data(); int32_t* shift_w_data = h_shift_w_.flat().data(); std::vector shift_h(shift_h_data, shift_h_data + C); std::vector shift_w(shift_w_data, shift_w_data + C); - triton::dnn::shift shift(B, C, 1, H, W, 1, R_, S_, F, shift_h, shift_w, "fp32", "fp32", OP, false); + shift_key_t key = {B, C, 1, H, W, 1, R_, S_, F, shift_h_data, shift_w_data, OP, has_bias}; + // create configuration + triton::dnn::shift* shift; + if(m_config.find(key) == m_config.end()) + shift = m_config.emplace(key, new triton::dnn::shift( + B, C, D, H, W, T, R_, S_, F, + shift_h, shift_w, "fp32", "fp32", OP, has_bias)) + .first->second.get(); + else + shift = m_config.at(key).get(); + // shapes for c std::vector c_shapes; - for(int32_t x: shift.c_shapes()) + for(int32_t x: shift->c_shapes()) c_shapes.push_back(x); TensorShape out_shapes(c_shapes); Tensor* tf_c = nullptr; @@ -94,38 +115,58 @@ public: // return early if possible if (out_shapes.num_elements() == 0) return; - // initialize default compute device - triton::jit jit(ctx); // matrix multiplication parameters triton::driver::cu_buffer da(ctx, (CUdeviceptr)tf_a.flat().data(), false); triton::driver::cu_buffer db(ctx, (CUdeviceptr)tf_b.flat().data(), false); triton::driver::cu_buffer dc(ctx, (CUdeviceptr)tf_c->flat().data(), false); - // benchmark a given matrix multiplication kernel - auto benchmark = [&](triton::driver::kernel* kernel, - triton::jit::launch_information info) { - // launch info - unsigned TM = info.global_range_size[0]; - unsigned TN = info.global_range_size[1]; - unsigned nthreads = info.num_threads; - shift.init(stream, (triton::driver::cu_module*)kernel->module()); - shift.enqueue(stream, kernel, &da, &db, &dc, TM, TN, nthreads); - stream->synchronize(); - double ts = triton::tools::bench([&](){ shift.enqueue(stream, kernel, &da, &db, &dc, TM, TN, nthreads); }, - [&](){ stream->synchronize(); }, ctx->device()); - return shift.get_nflops() / ts * 1e-3; - }; - - std::ostringstream oss; - shift.src(oss); - std::string src = oss.str(); - triton::jit::tune_res_t best = jit.autotune("shift", src.c_str(), benchmark); + // get JIT + triton::jit* jit; + bool autotune = false; + if(m_jit.find(key) == m_jit.end()) { + jit = m_jit.emplace(key, new triton::jit(ctx)).first->second.get(); + std::ostringstream oss; + shift->src(oss); + std::string src = oss.str(); + auto benchmark = [&](triton::driver::kernel* kernel, + triton::jit::launch_information info) { + // launch info + unsigned TM = info.global_range_size[0]; + unsigned TN = info.global_range_size[1]; + unsigned nthreads = info.num_threads; + shift->init(stream, (triton::driver::cu_module*)kernel->module()); + shift->enqueue(stream, kernel, &da, &db, &dc, TM, TN, nthreads); + stream->synchronize(); + double ts = triton::tools::bench([&](){ shift->enqueue(stream, kernel, &da, &db, &dc, TM, TN, nthreads); }, + [&](){ stream->synchronize(); }, ctx->device()); + return shift->get_nflops() / ts * 1e-3; + }; + // auto-tune and save result + if(autotune) { + triton::jit::tune_res_t best = jit->autotune("shift", src.c_str(), benchmark); + jit->add_module("shift", src.c_str(), best.params); + } + else { + jit->add_module("shift", src.c_str(), jit->get_valid("shift", src.c_str())); + } + triton::driver::kernel* kernel = jit->get_function("shift"); + shift->init(stream, (triton::driver::cu_module*)kernel->module()); + } + else + jit = m_jit.at(key).get(); + // Run + triton::driver::kernel* kernel = jit->get_function("shift"); + 
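+    // Editorial note, not part of the original patch: thanks to the static
+    // m_jit / m_config caches keyed on shift_key_t, the expensive path above
+    // (source generation, optional autotuning, JIT compilation) runs once per
+    // problem shape; later calls fall through to here and only re-read the
+    // launch information and enqueue the cached kernel.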
triton::jit::launch_information info = jit->get_launch_info("shift"); + // launch info + unsigned TM = info.global_range_size[0]; + unsigned TN = info.global_range_size[1]; + unsigned nthreads = info.num_threads; + // enqueue + shift->enqueue(stream, kernel, &da, &db, &dc, TM, TN, nthreads); } private: Tensor h_shift_h_; Tensor h_shift_w_; -// triton::driver::buffer* d_shift_h_; -// triton::driver::buffer* d_shift_w_; int R_; int S_; }; @@ -136,5 +177,21 @@ REGISTER_OP("ShiftConv") .Input("b: float32") .Attr("shift_h: tensor") .Attr("shift_w: tensor") - .Output("c: float32") -; + .Output("c: float32"); + +REGISTER_KERNEL_BUILDER(Name("ShiftConvDx").Device(DEVICE_GPU), ShiftConvOp); +REGISTER_OP("ShiftConvDx") + .Input("a: float32") + .Input("b: float32") + .Attr("shift_h: tensor") + .Attr("shift_w: tensor") + .Output("c: float32"); + +REGISTER_KERNEL_BUILDER(Name("ShiftConvDw").Device(DEVICE_GPU), ShiftConvOp); +REGISTER_OP("ShiftConvDw") + .Input("a: float32") + .Input("b: float32") + .Attr("shift_h: tensor") + .Attr("shift_w: tensor") + .Output("c: float32"); + diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index 6bc377c95..ca5395893 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -103,6 +103,7 @@ private: public: jit(driver::context* context); ~jit(); + std::vector get_valid(const char *name, const char *src); tune_res_t autotune(const char* name, const char* src, benchmark_t benchmark); void add_module(ir::module &module, const std::vector& params = {}); void add_module(const char* name, const char* src, const std::vector& params = {}); diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index e54ac4bdb..1a640e91e 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -70,16 +70,26 @@ shift::shift(int B, int C, } void shift::build_deltas() { - // compute offset - auto offset = [&](unsigned c) { - return c*ld_a_[0] + shift_h_[c]*ld_a_[1] + shift_w_[c]*ld_a_[2]; - }; h_deltas_.resize(MAX_C_); - // populate look-up table - for(unsigned c = 0; c < TK_; c++) - h_deltas_[c] = offset(c); - for(unsigned c = 0; c < C_; c++) - h_deltas_[TK_ + c] = offset(c + TK_) - offset(c); + if(ty_ == FPROP){ + // compute offset + auto offset = [&](unsigned c) { + return c*ld_a_[0] + shift_h_[c]*ld_a_[1] + shift_w_[c]*ld_a_[2]; + }; + // populate look-up table + for(unsigned c = 0; c < TK_; c++) + h_deltas_[c] = offset(c); + for(unsigned c = 0; c < C_; c++) + h_deltas_[TK_ + c] = offset(c + TK_) - offset(c); + } + if(ty_ == BPROP){ + for(unsigned c = 0; c < C_; c++) + h_deltas_[c] = shift_h_[c]*ld_c_[1] + shift_w_[c]*ld_c_[2]; + } + if(ty_ == WGRAD){ + for(unsigned c = 0; c < C_; c++) + h_deltas_[c] = shift_h_[c]*ld_b_[1] + shift_w_[c]*ld_b_[2]; + } } size_t shift::a_size(){ @@ -102,7 +112,7 @@ std::vector shift::c_shapes(){ } size_t shift::get_nflops() { - return 2. 
* M_ * N_ * K_; + return 2.*M_*N_*K_; } @@ -114,15 +124,13 @@ void shift::init(driver::stream *stream, driver::cu_module *module) { void shift::enqueue(driver::stream *stream, driver::kernel *kernel, driver::buffer *a, driver::buffer *b, driver::buffer *c, size_t TM, size_t TN, size_t nthreads) { - if(ty_ == WGRAD) - std::swap(a, b); kernel->setArg(0, a); kernel->setArg(1, b); kernel->setArg(2, c); kernel->setArg(3, M_); kernel->setArg(4, N_); kernel->setArg(5, K_); - kernel->setArg(6, B_*AH_*AW_); + kernel->setArg(6, M_); kernel->setArg(7, N_); kernel->setArg(8, B_); kernel->setArg(9, AH_); @@ -177,7 +185,7 @@ void shift(restrict read_only align(16) )" << a_ty_ << R"( *a, restrict read_only align(16) )" << b_ty_ << R"( *b, fp32 *c, int32 M, int32 N, int32 K, - multiple_of(4) int32 lda, multiple_of(4) int32 ldb, + int32 lda, int32 ldb, int32 ABS, int32 AH, int32 AW, int32 AR, int32 AS) { int32 rxa[TM] = get_global_range[TM](0); int32 ryb[TN] = get_global_range[TN](1); @@ -203,11 +211,13 @@ if(ty_ == FPROP){ } if(ty_ == WGRAD){ os << R"( - int32 shift[TK, TN] = 0;)"; + __constant__ int32* pd[TN] = delta + ryb; + int32 d[TN] = *pd; + int32 shift[TK, TN] = d[newaxis, :];)"; } os << R"( - )" << a_ty_ << "* pa[" << AS << "] = a + rxa" << bca1 << " + " << rka << bca0 << lda0 << R"(; - )" << b_ty_ << "* pb[" << BS << "] = b + ryb" << bcb1 << " + " << rkb << bcb0 << ldb0 << R"(; + )" << a_ty_ << "* pa[" << AS << "] = a + rxa" << bca1 << lda1 << " + " << rka << bca0 << lda0 << R"(; + )" << b_ty_ << "* pb[" << BS << "] = b + ryb" << bcb1 << ldb1 << " + " << rkb << bcb0 << ldb0 << R"(; )" << a_ty_ << " a[" << AS << R"(] = *pa; )" << b_ty_ << " b[" << BS << R"(] = *pb; for(int32 k = K; k > 0; k = k - TK){ @@ -239,7 +249,7 @@ if(ty_ == WGRAD){ int1 maskw[TK] = (rbw >= pad_w) && (rbw < (AW - pad_w)); int1 mask[TK, TN] = maskh[:, newaxis] && maskw[:, newaxis]; int32 inc[TK, TN] = mask ? 0 : shift; - pb = pb + TK; + pb = pb + TK)" << ldb0 << R"(; )" << b_ty_ << R"(* pbb[TK, TN] = pb + inc; @checkb b = *pbb;)"; } @@ -259,14 +269,15 @@ else{ if(ty_ == BPROP){ os << R"( int32 rcwhc[TM] = rxc / ABS; - int32 rcw[TM] = rcwhc % AW; + int32 rcw[TM] = (rcwhc % AW); int32 rchc[TM] = rcwhc / AW; - int32 rch[TM] = rchc % AH; + int32 rch[TM] = (rchc % AH); int1 maskh[TM] = (rch >= pad_h) && (rch < (AH - pad_h)); int1 maskw[TM] = (rcw >= pad_w) && (rcw < (AW - pad_w)); int1 interior[TM, TN] = maskh[:, newaxis] && maskw[:, newaxis]; - fp32* shiftpc[TM, TN] = pc + 0; - pc = interior ? shiftpc : pc; + __constant__ int32* pd[TN] = delta + ryc; + fp32* shift_pc[TM, TN] = pc + (*pd)[newaxis, :]; + pc = interior ? 
shift_pc : pc; @checkc __atomic_add(pc, C); )"; } diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index f11118401..4ff863666 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -255,7 +255,7 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ -// std::cout << source << sd::endl; +// std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index 85e51b22f..30547a19e 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -96,6 +96,46 @@ jit::jit(driver::context *context): driver_context_(context), jit::~jit(){ } +std::vector jit::get_valid(const char *name, const char *src) { + // find metaparameters + auto ptt_module = make_triton_module(name, src); + ir::module &tt_module = *ptt_module; + // set parameters + passes_wrapper passes(target_.get()); + passes.target_independent(tt_module); + passes.tune.run(tt_module); + auto mps = passes.tune.get_params(tt_module); + // create parameter ranges + std::vector> ranges; + for(ir::metaparameter *mp: mps) + ranges.push_back(mp->get_space()); + // iterate over parameters + std::vector result; + loop_nest(ranges, [&](const std::vector params){ + if(!result.empty()) + return; + std::map> errors; + unsigned i = 0; + for(ir::metaparameter *mp: mps) + mp->set_value(params[i++]); + passes.target_independent(tt_module); + passes.tune.init(tt_module); + passes.tune.check_constraints(errors); +// for(auto e: errors) +// for(auto x: e.second) +// std::cout << x << std::endl; +// std::cout << "-----" << std::endl; + if(!errors.empty()) + return; + result = params; + }); + if(result.empty()) + throw std::runtime_error("couldn't find valid parameters"); + return result; +} + + + jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t benchmark) { // find metaparameters auto ptt_module = make_triton_module(name, src); From 39aa22babb7d27dd824ee1dcd7569f81c1a76816 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 3 Jul 2019 19:52:31 -0700 Subject: [PATCH 210/494] more tinkering --- examples/python/tensorflow/run.py | 19 +++++++------ lib/dnn/shift.cpp | 44 ++++++++++++++++++++++--------- 2 files changed, 42 insertions(+), 21 deletions(-) diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index cd6365f52..55f60bb27 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -56,18 +56,20 @@ def blocksparse_matmul_grad(op, dy): return (dx, dw) def run_shift(): - B, C, H, W = 1, 16, 8, 8 + B, C, H, W = 1, 16, 4, 4 R, S, F = 3, 3, 16 a = tf.placeholder(tf.float32, shape=[C, H, W, B]) b = tf.placeholder(tf.float32, shape=[C, F]) #hshift_h = np.random.randint(-R//2, R//2 + 1, size=C, dtype=np.int32) #hshift_w = np.random.randint(-S//2, R//2 + 1, size=C, dtype=np.int32) - hshift_h = 0*np.ones(C, dtype=np.int32) - hshift_w = 0*np.ones(C, dtype=np.int32) + hshift_h = np.ones(C, dtype=np.int32) + hshift_w = np.ones(C, dtype=np.int32) c = module.shift_conv(a, b, shift_h=tf.make_tensor_proto(hshift_h), shift_w=tf.make_tensor_proto(hshift_w)) # Reference - ha = np.ones((C, H, W, B), 
dtype=np.int32) - hb = np.ones((C, F), dtype=np.int32) + ha = np.random.rand(C, H, W, B) + hb = np.random.rand(C, F) + #ha = np.ones((C, H, W, B), dtype=np.int32) + #hb = np.ones((C, F), dtype=np.int32) sess = tf.InteractiveSession() grads = tf.test.compute_gradient([a, b], [(C, H, W, B), (C, F)], c, (C, H, W, B), extra_feed_dict={a: ha, b: hb}) @@ -75,10 +77,11 @@ def run_shift(): dw_t, dw_n = grads[1] print(dw_t) print(dw_n) - #print(np.max(dw_t - dw_n)) + print(np.max(dw_t - dw_n)) #print(np.max(dx_t - dx_n)) - np.savetxt('theoretical.dat', dw_t, fmt='%4.2f') - np.savetxt('numerical.dat', dw_n, fmt='%4.2f') + np.savetxt('diff.dat', dw_t - dw_n, fmt='%2.4f') + np.savetxt('theoretical.dat', dw_t, fmt='%2.4f') + np.savetxt('numerical.dat', dw_n, fmt='%2.4f') # Run sess.run(tf.global_variables_initializer()) result = sess.run([c], feed_dict = {a: ha, diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index 1a640e91e..f9674578f 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -217,9 +217,26 @@ if(ty_ == WGRAD){ } os << R"( )" << a_ty_ << "* pa[" << AS << "] = a + rxa" << bca1 << lda1 << " + " << rka << bca0 << lda0 << R"(; - )" << b_ty_ << "* pb[" << BS << "] = b + ryb" << bcb1 << ldb1 << " + " << rkb << bcb0 << ldb0 << R"(; )" << a_ty_ << " a[" << AS << R"(] = *pa; - )" << b_ty_ << " b[" << BS << R"(] = *pb; + )" << b_ty_ << "* pb[" << BS << "] = b + ryb" << bcb1 << ldb1 << " + " << rkb << bcb0 << ldb0 << ";"; +if(ty_ == WGRAD){ + os << R"( + int32 rbwhc[TK] = rkb / ABS; + int32 rbw[TK] = rbwhc % AW; + int32 rbhc[TK] = rbwhc / AW; + int32 rbh[TK] = rbhc % AH; + int1 maskh[TK] = (rbh >= pad_h) && (rbh < (AH - pad_h)); + int1 maskw[TK] = (rbw >= pad_w) && (rbw < (AW - pad_w)); + int1 mask[TK, TN] = maskh[:, newaxis] && maskw[:, newaxis]; + int32 inc[TK, TN] = mask ? 0 : shift; + )" << b_ty_ << R"(* shifted_pb[TK, TN] = pb + inc; + )" << b_ty_ << R"( b[TK, TN] = *shifted_pb;)"; +} +else{ + os << R"( + )" << b_ty_ << " b[" << BS << R"(] = *pb;)"; +} + os << R"( for(int32 k = K; k > 0; k = k - TK){ C = dot()" << usea << "," << useb << R"(, C); int1 checka[)" << AS << R"(] = k > TK; @@ -241,17 +258,18 @@ else{ } if(ty_ == WGRAD){ os << R"( - int32 rbwhc[TK] = rkb / ABS; - int32 rbw[TK] = rbwhc % AW; - int32 rbhc[TK] = rbwhc / AW; - int32 rbh[TK] = rbhc % AH; - int1 maskh[TK] = (rbh >= pad_h) && (rbh < (AH - pad_h)); - int1 maskw[TK] = (rbw >= pad_w) && (rbw < (AW - pad_w)); - int1 mask[TK, TN] = maskh[:, newaxis] && maskw[:, newaxis]; - int32 inc[TK, TN] = mask ? 0 : shift; - pb = pb + TK)" << ldb0 << R"(; - )" << b_ty_ << R"(* pbb[TK, TN] = pb + inc; - @checkb b = *pbb;)"; + pb = pb + TK)" << ldb0 << R"(; + rkb = rkb + TK; + rbwhc = rkb / ABS; + rbw = rbwhc % AW; + rbhc = rbwhc / AW; + rbh = rbhc % AH; + maskh = (rbh >= pad_h) && (rbh < (AH - pad_h)); + maskw = (rbw >= pad_w) && (rbw < (AW - pad_w)); + mask = maskh[:, newaxis] && maskw[:, newaxis]; + inc = mask ? 
0 : shift; + shifted_pb = pb + inc; + @checkb b = *shifted_pb;)"; } else{ os << R"( From 1b2ceadf0d7b79f38a0b66e974a141c2cbcecd1a Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 3 Jul 2019 20:04:38 -0700 Subject: [PATCH 211/494] weight gradient seems to work --- examples/python/tensorflow/run.py | 14 +++++++------- lib/dnn/shift.cpp | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 55f60bb27..96f1c61ff 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -75,13 +75,13 @@ def run_shift(): extra_feed_dict={a: ha, b: hb}) dx_t, dx_n = grads[0] dw_t, dw_n = grads[1] - print(dw_t) - print(dw_n) - print(np.max(dw_t - dw_n)) + print(dx_t) + print(dx_n) + #print(np.max(dw_t - dw_n)) + print(np.max(dx_t - dx_n)) - np.savetxt('diff.dat', dw_t - dw_n, fmt='%2.4f') - np.savetxt('theoretical.dat', dw_t, fmt='%2.4f') - np.savetxt('numerical.dat', dw_n, fmt='%2.4f') + np.savetxt('diff.dat', dx_t - dx_n, fmt='%2.4f') + np.savetxt('theoretical.dat', dx_t, fmt='%2.4f') + np.savetxt('numerical.dat', dx_n, fmt='%2.4f') # Run sess.run(tf.global_variables_initializer()) result = sess.run([c], feed_dict = {a: ha, diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index f9674578f..4b0662755 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -228,7 +228,7 @@ if(ty_ == WGRAD){ int1 maskh[TK] = (rbh >= pad_h) && (rbh < (AH - pad_h)); int1 maskw[TK] = (rbw >= pad_w) && (rbw < (AW - pad_w)); int1 mask[TK, TN] = maskh[:, newaxis] && maskw[:, newaxis]; - int32 inc[TK, TN] = mask ? 0 : shift; + int32 inc[TK, TN] = mask ? shift : 0; )" << b_ty_ << R"(* shifted_pb[TK, TN] = pb + inc; )" << b_ty_ << R"( b[TK, TN] = *shifted_pb;)"; } From bd1040510ff300e0d3613733bb3a4abf618af9cf Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 3 Jul 2019 20:21:32 -0700 Subject: [PATCH 212/494] dx works but that makes no sense?
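Note on patches 210-216: they iterate on the sign and masking conventions of the shift-conv kernels by comparing the analytical dx/dw against TensorFlow's numerical gradients. The operation under test is a per-channel spatial shift followed by a pointwise (1x1) convolution; the interior (pad_h/pad_w) masks in the generated kernels zero out taps that would read across the image boundary. Below is a minimal NumPy sketch of that reference semantics -- illustrative only: shift_conv_ref is not part of the codebase, and it assumes the gather convention a[c, h + shift_h[c], w + shift_w[c]], which is precisely the sign convention these commits are pinning down.

import numpy as np

def shift_conv_ref(a, b, shift_h, shift_w):
    # a: (C, H, W, B) activations, b: (C, F) pointwise filter,
    # shift_h / shift_w: (C,) integer per-channel offsets
    C, H, W, B = a.shape
    shifted = np.zeros_like(a)
    for c in range(C):
        for h in range(H):
            for w in range(W):
                hs, ws = h + shift_h[c], w + shift_w[c]
                # taps that fall outside the image read as zero,
                # mirroring the interior masks in the kernels
                if 0 <= hs < H and 0 <= ws < W:
                    shifted[c, h, w, :] = a[c, hs, ws, :]
    # pointwise convolution: contract over channels -> (F, H, W, B)
    return np.einsum('chwb,cf->fhwb', shifted, b)

Under this reference, dw correlates the shifted input with dy, while dx gathers dy with the opposite shifts, which is consistent with the sign flips being tried in these commits.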
--- examples/python/tensorflow/run.py | 26 ++++++++++++++++---------- lib/dnn/shift.cpp | 6 +++--- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 96f1c61ff..96e3cb309 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -58,12 +58,15 @@ def blocksparse_matmul_grad(op, dy): def run_shift(): B, C, H, W = 1, 16, 4, 4 R, S, F = 3, 3, 16 + np.random.seed(2) a = tf.placeholder(tf.float32, shape=[C, H, W, B]) b = tf.placeholder(tf.float32, shape=[C, F]) - #hshift_h = np.random.randint(-R//2, R//2 + 1, size=C, dtype=np.int32) - #hshift_w = np.random.randint(-S//2, R//2 + 1, size=C, dtype=np.int32) - hshift_h = np.ones(C, dtype=np.int32) - hshift_w = np.ones(C, dtype=np.int32) + hshift_h = np.random.randint(- (R//2), R//2 + 1, size=C, dtype=np.int32) + hshift_w = np.random.randint(- (S//2), R//2 + 1, size=C, dtype=np.int32) + print(hshift_h) + print(hshift_w) + #hshift_h = np.ones(C, dtype=np.int32) + #hshift_w = np.ones(C, dtype=np.int32) c = module.shift_conv(a, b, shift_h=tf.make_tensor_proto(hshift_h), shift_w=tf.make_tensor_proto(hshift_w)) # Reference ha = np.random.rand(C, H, W, B) @@ -75,13 +78,16 @@ def run_shift(): extra_feed_dict={a: ha, b: hb}) dx_t, dx_n = grads[0] dw_t, dw_n = grads[1] - print(dx_t) - print(dx_n) - #print(np.max(dw_t - dw_n)) + print(dw_t) + print(dw_n) + print(np.max(dw_t - dw_n)) + #np.savetxt('diff.dat', dw_t - dw_n, fmt='%2.4f') + #np.savetxt('theoretical.dat', dw_t, fmt='%2.4f') + #np.savetxt('numerical.dat', dw_n, fmt='%2.4f') print(np.max(dx_t - dx_n)) - np.savetxt('diff.dat', dx_t - dx_n, fmt='%2.4f') - np.savetxt('theoretical.dat', dx_t, fmt='%2.4f') - np.savetxt('numerical.dat', dx_n, fmt='%2.4f') + #np.savetxt('diff.dat', dx_t - dx_n, fmt='%2.4f') + #np.savetxt('theoretical.dat', dx_t, fmt='%2.4f') + #np.savetxt('numerical.dat', dx_n, fmt='%2.4f') # Run sess.run(tf.global_variables_initializer()) result = sess.run([c], feed_dict = {a: ha, diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index 4b0662755..c7013e281 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -287,15 +287,15 @@ else{ if(ty_ == BPROP){ os << R"( int32 rcwhc[TM] = rxc / ABS; - int32 rcw[TM] = (rcwhc % AW); + int32 rcw[TM] = rcwhc % AW; int32 rchc[TM] = rcwhc / AW; - int32 rch[TM] = (rchc % AH); + int32 rch[TM] = rchc % AH; int1 maskh[TM] = (rch >= pad_h) && (rch < (AH - pad_h)); int1 maskw[TM] = (rcw >= pad_w) && (rcw < (AW - pad_w)); int1 interior[TM, TN] = maskh[:, newaxis] && maskw[:, newaxis]; __constant__ int32* pd[TN] = delta + ryc; fp32* shift_pc[TM, TN] = pc + (*pd)[newaxis, :]; - pc = interior ? shift_pc : pc; + pc = interior ? 
pc : shift_pc; @checkc __atomic_add(pc, C); )"; } From 88ebdddf3dab1bfb5435b6eb982e6b71878c1111 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 3 Jul 2019 20:45:03 -0700 Subject: [PATCH 213/494] makes more sense now --- examples/python/tensorflow/run.py | 12 ++++++------ lib/dnn/shift.cpp | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 96e3cb309..e30be57e1 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -78,16 +78,16 @@ def run_shift(): extra_feed_dict={a: ha, b: hb}) dx_t, dx_n = grads[0] dw_t, dw_n = grads[1] - print(dw_t) - print(dw_n) - print(np.max(dw_t - dw_n)) + print(dx_t) + print(dx_n) + #print(np.max(dw_t - dw_n)) #np.savetxt('diff.dat', dw_t - dw_n, fmt='%2.4f') #np.savetxt('theoretical.dat', dw_t, fmt='%2.4f') #np.savetxt('numerical.dat', dw_n, fmt='%2.4f') print(np.max(dx_t - dx_n)) - #np.savetxt('diff.dat', dx_t - dx_n, fmt='%2.4f') - #np.savetxt('theoretical.dat', dx_t, fmt='%2.4f') - #np.savetxt('numerical.dat', dx_n, fmt='%2.4f') + np.savetxt('diff.dat', dx_t - dx_n, fmt='%2.4f') + np.savetxt('theoretical.dat', dx_t, fmt='%2.4f') + np.savetxt('numerical.dat', dx_n, fmt='%2.4f') # Run sess.run(tf.global_variables_initializer()) result = sess.run([c], feed_dict = {a: ha, diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index c7013e281..f70991635 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -88,7 +88,7 @@ void shift::build_deltas() { } if(ty_ == WGRAD){ for(unsigned c = 0; c < C_; c++) - h_deltas_[c] = shift_h_[c]*ld_b_[1] + shift_w_[c]*ld_b_[2]; + h_deltas_[c] = -shift_h_[c]*ld_b_[1] + -shift_w_[c]*ld_b_[2]; } } From c666f71fd6107380671405e4ae6e251819a4a0f6 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 5 Jul 2019 15:07:20 -0700 Subject: [PATCH 214/494] fixed bug --- examples/python/tensorflow/run.py | 22 ++++++++--------- lib/dnn/shift.cpp | 39 ++++++++++++++++--------------- lib/lang/node.cpp | 2 -- 3 files changed, 31 insertions(+), 32 deletions(-) diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index e30be57e1..0d8019f27 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -61,30 +61,30 @@ def run_shift(): np.random.seed(2) a = tf.placeholder(tf.float32, shape=[C, H, W, B]) b = tf.placeholder(tf.float32, shape=[C, F]) - hshift_h = np.random.randint(- (R//2), R//2 + 1, size=C, dtype=np.int32) - hshift_w = np.random.randint(- (S//2), R//2 + 1, size=C, dtype=np.int32) + #hshift_h = np.random.randint(- (R//2), R//2 + 1, size=C, dtype=np.int32) + #hshift_w = np.random.randint(- (S//2), R//2 + 1, size=C, dtype=np.int32) + hshift_h = -1*np.ones(C, dtype=np.int32) + hshift_w = -1*np.ones(C, dtype=np.int32) print(hshift_h) print(hshift_w) - #hshift_h = np.ones(C, dtype=np.int32) - #hshift_w = np.ones(C, dtype=np.int32) c = module.shift_conv(a, b, shift_h=tf.make_tensor_proto(hshift_h), shift_w=tf.make_tensor_proto(hshift_w)) + c = tf.math.reduce_sum(c) # Reference - ha = np.random.rand(C, H, W, B) - hb = np.random.rand(C, F) + ha = np.ones((C, H, W, B), dtype=np.float32) + hb = np.ones((C, F), dtype=np.float32) #ha = np.ones((C, H, W, B), dtype=np.int32) #hb = np.ones((C, F), dtype=np.int32) sess = tf.InteractiveSession() - grads = tf.test.compute_gradient([a, b], [(C, H, W, B), (C, F)], c, (C, H, W, B), + grads = tf.test.compute_gradient([a, b], [(C, H, W, B), (C, F)], c, (1,), extra_feed_dict={a: ha, b: hb}) dx_t, dx_n = grads[0] dw_t, 
dw_n = grads[1] - print(dx_t) - print(dx_n) - #print(np.max(dw_t - dw_n)) + #print(dw_t - dw_n) #np.savetxt('diff.dat', dw_t - dw_n, fmt='%2.4f') #np.savetxt('theoretical.dat', dw_t, fmt='%2.4f') #np.savetxt('numerical.dat', dw_n, fmt='%2.4f') - print(np.max(dx_t - dx_n)) + print(np.max(np.abs(dw_t - dw_n))) + print(np.max(np.abs(dx_t - dx_n))) np.savetxt('diff.dat', dx_t - dx_n, fmt='%2.4f') np.savetxt('theoretical.dat', dx_t, fmt='%2.4f') np.savetxt('numerical.dat', dx_n, fmt='%2.4f') diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index f70991635..92245ba16 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -83,12 +83,13 @@ void shift::build_deltas() { h_deltas_[TK_ + c] = offset(c + TK_) - offset(c); } if(ty_ == BPROP){ - for(unsigned c = 0; c < C_; c++) + for(unsigned c = 0; c < C_; c++){ h_deltas_[c] = shift_h_[c]*ld_c_[1] + shift_w_[c]*ld_c_[2]; + } } if(ty_ == WGRAD){ for(unsigned c = 0; c < C_; c++) - h_deltas_[c] = -shift_h_[c]*ld_b_[1] + -shift_w_[c]*ld_b_[2]; + h_deltas_[c] = shift_h_[c]*ld_b_[1] + shift_w_[c]*ld_b_[2]; } } @@ -202,12 +203,12 @@ if(ty_ == FPROP){ int32 rah[TM] = rahc % AH; __constant__ int32* pd[TK] = delta + rka; multiple_of(4) int32 d[TK] = *pd; - int1 maskh[TM] = (rah >= pad_h) && (rah < (AH - pad_h)); - int1 maskw[TM] = (raw >= pad_w) && (raw < (AW - pad_w)); - int1 mask[TM, TK] = maskh[:, newaxis] && maskw[:, newaxis]; + int1 interiorh[TM] = (rah >= pad_h) && (rah < (AH - pad_h)); + int1 interiorw[TM] = (raw >= pad_w) && (raw < (AW - pad_w)); + int1 interior[TM, TK] = interiorh[:, newaxis] && interiorw[:, newaxis]; int32 inc_true[TM, TK] = d[newaxis, :]; int32 inc_false[TM, TK] = rka[newaxis, :] * lda; - int32 inc[TM, TK] = mask ? inc_true : inc_false;)"; + int32 inc[TM, TK] = interior ? inc_true : inc_false;)"; } if(ty_ == WGRAD){ os << R"( @@ -225,10 +226,10 @@ if(ty_ == WGRAD){ int32 rbw[TK] = rbwhc % AW; int32 rbhc[TK] = rbwhc / AW; int32 rbh[TK] = rbhc % AH; - int1 maskh[TK] = (rbh >= pad_h) && (rbh < (AH - pad_h)); - int1 maskw[TK] = (rbw >= pad_w) && (rbw < (AW - pad_w)); - int1 mask[TK, TN] = maskh[:, newaxis] && maskw[:, newaxis]; - int32 inc[TK, TN] = mask ? shift : 0; + int1 interiorh[TK] = (rbh >= pad_h) && (rbh < (AH - pad_h)); + int1 interiorw[TK] = (rbw >= pad_w) && (rbw < (AW - pad_w)); + int1 interior[TK, TN] = interiorh[:, newaxis] && interiorw[:, newaxis]; + int32 inc[TK, TN] = interior ? shift : 0; )" << b_ty_ << R"(* shifted_pb[TK, TN] = pb + inc; )" << b_ty_ << R"( b[TK, TN] = *shifted_pb;)"; } @@ -247,7 +248,7 @@ if(ty_ == FPROP){ d = *pd; inc_true = d[newaxis, :]; inc_false = TK * lda; - inc = mask ? inc_true : inc_false; + inc = interior ? inc_true : inc_false; pa = pa + inc; @checka a = *pa;)"; } @@ -264,10 +265,10 @@ if(ty_ == WGRAD){ rbw = rbwhc % AW; rbhc = rbwhc / AW; rbh = rbhc % AH; - maskh = (rbh >= pad_h) && (rbh < (AH - pad_h)); - maskw = (rbw >= pad_w) && (rbw < (AW - pad_w)); - mask = maskh[:, newaxis] && maskw[:, newaxis]; - inc = mask ? 0 : shift; + interiorh = (rbh >= pad_h) && (rbh < (AH - pad_h)); + interiorw = (rbw >= pad_w) && (rbw < (AW - pad_w)); + interior = interiorh[:, newaxis] && interiorw[:, newaxis]; + inc = interior ? 
shift : 0; + shifted_pb = pb + inc; + @checkb b = *shifted_pb;)"; } @@ -290,12 +291,12 @@ if(ty_ == BPROP){ int32 rcwhc[TM] = rxc / ABS; int32 rcw[TM] = rcwhc % AW; int32 rchc[TM] = rcwhc / AW; int32 rch[TM] = rchc % AH; - int1 maskh[TM] = (rch >= pad_h) && (rch < (AH - pad_h)); - int1 maskw[TM] = (rcw >= pad_w) && (rcw < (AW - pad_w)); - int1 interior[TM, TN] = maskh[:, newaxis] && maskw[:, newaxis]; + int1 interiorh[TM] = (rch >= pad_h) && (rch < (AH - pad_h)); + int1 interiorw[TM] = (rcw >= pad_w) && (rcw < (AW - pad_w)); + int1 interior[TM, TN] = interiorh[:, newaxis] && interiorw[:, newaxis]; __constant__ int32* pd[TN] = delta + ryc; fp32* shift_pc[TM, TN] = pc + (*pd)[newaxis, :]; - pc = interior ? pc : shift_pc; + pc = interior ? shift_pc : pc; @checkc __atomic_add(pc, C); )"; } diff --git a/lib/lang/node.cpp b/lib/lang/node.cpp index 940b4f2b9..29d61cdb8 100644 --- a/lib/lang/node.cpp +++ b/lib/lang/node.cpp @@ -53,8 +53,6 @@ void node::implicit_cast(ir::builder &builder, ir::value *&lhs, ir::value *&rhs, ir::type *right_ty = rhs->get_type()->get_scalar_ty(); // One operand is pointer if(left_ty->is_pointer_ty() || right_ty->is_pointer_ty()){ - if(right_ty->is_pointer_ty()) - std::swap(lhs, rhs); is_ptr = true; } // One operand is double From 3e49dbe6abe0b01772bdf698a4694ac1b8ea53fc Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 5 Jul 2019 17:17:22 -0700 Subject: [PATCH 215/494] [dnn/shift] fixed bug in leading dimensions for shift-conv operation --- examples/python/tensorflow/run.py | 24 ++++++++---------------- examples/python/tensorflow/shift.cpp | 2 ++ lib/dnn/shift.cpp | 6 ++++-- 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 0d8019f27..c85ef2f7e 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -56,38 +56,30 @@ def blocksparse_matmul_grad(op, dy): return (dx, dw) def run_shift(): - B, C, H, W = 1, 16, 4, 4 + B, C, H, W = 1, 16, 8, 8 R, S, F = 3, 3, 16 np.random.seed(2) a = tf.placeholder(tf.float32, shape=[C, H, W, B]) b = tf.placeholder(tf.float32, shape=[C, F]) - #hshift_h = np.random.randint(- (R//2), R//2 + 1, size=C, dtype=np.int32) - #hshift_w = np.random.randint(- (S//2), R//2 + 1, size=C, dtype=np.int32) - hshift_h = -1*np.ones(C, dtype=np.int32) - hshift_w = -1*np.ones(C, dtype=np.int32) + hshift_h = np.random.randint(- (R//2), R//2 + 1, size=C, dtype=np.int32) + hshift_w = np.random.randint(- (S//2), R//2 + 1, size=C, dtype=np.int32) + #hshift_h = np.ones(C, dtype=np.int32) + #hshift_w = np.ones(C, dtype=np.int32) print(hshift_h) print(hshift_w) c = module.shift_conv(a, b, shift_h=tf.make_tensor_proto(hshift_h), shift_w=tf.make_tensor_proto(hshift_w)) - c = tf.math.reduce_sum(c) # Reference - ha = np.ones((C, H, W, B), dtype=np.float32) - hb = np.ones((C, F), dtype=np.float32) + ha = np.random.rand(C, H, W, B) + hb = np.random.rand(C, F) #ha = np.ones((C, H, W, B), dtype=np.int32) #hb = np.ones((C, F), dtype=np.int32) sess = tf.InteractiveSession() - grads = tf.test.compute_gradient([a, b], [(C, H, W, B), (C, F)], c, (1,), + grads = tf.test.compute_gradient([a, b], [(C, H, W, B), (C, F)], c, (F, H, W, B), extra_feed_dict={a: ha, b: hb}) dx_t, dx_n = grads[0] dw_t, dw_n = grads[1] - #print(dw_t - dw_n) - #np.savetxt('diff.dat', dw_t - dw_n, fmt='%2.4f') - #np.savetxt('theoretical.dat', dw_t, fmt='%2.4f') - #np.savetxt('numerical.dat', dw_n, fmt='%2.4f') + print(np.max(np.abs(dw_t - dw_n))) print(np.max(np.abs(dx_t - dx_n))) - np.savetxt('diff.dat', dx_t - dx_n,
fmt='%2.4f') - np.savetxt('theoretical.dat', dx_t, fmt='%2.4f') - np.savetxt('numerical.dat', dx_n, fmt='%2.4f') # Run sess.run(tf.global_variables_initializer()) result = sess.run([c], feed_dict = {a: ha, diff --git a/examples/python/tensorflow/shift.cpp b/examples/python/tensorflow/shift.cpp index a049f869d..c000c9db2 100644 --- a/examples/python/tensorflow/shift.cpp +++ b/examples/python/tensorflow/shift.cpp @@ -71,6 +71,8 @@ public: // checks OP_REQUIRES(context, Ca == Cb, tensorflow::errors::InvalidArgument("operands must have the same number of channels")); C = Ca; + if(OP == triton::dnn::shift::BPROP) + std::swap(C, F); } } diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index 92245ba16..122a51578 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -125,14 +125,16 @@ void shift::init(driver::stream *stream, driver::cu_module *module) { void shift::enqueue(driver::stream *stream, driver::kernel *kernel, driver::buffer *a, driver::buffer *b, driver::buffer *c, size_t TM, size_t TN, size_t nthreads) { + int32_t lda = AT_ ? K_ : M_; + int32_t ldb = BT_ ? N_ : K_; kernel->setArg(0, a); kernel->setArg(1, b); kernel->setArg(2, c); kernel->setArg(3, M_); kernel->setArg(4, N_); kernel->setArg(5, K_); - kernel->setArg(6, M_); - kernel->setArg(7, N_); + kernel->setArg(6, lda); + kernel->setArg(7, ldb); kernel->setArg(8, B_); kernel->setArg(9, AH_); kernel->setArg(10, AW_); From b0cf3143c5c062cafb187aeebf25a4cc17def701 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 6 Jul 2019 11:27:49 -0700 Subject: [PATCH 216/494] [dnn/shift] bugfix in wgrad --- examples/python/tensorflow/run.py | 4 ++-- examples/python/tensorflow/shift.cpp | 2 ++ lib/dnn/shift.cpp | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index c85ef2f7e..385a904c0 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -56,7 +56,7 @@ def blocksparse_matmul_grad(op, dy): return (dx, dw) def run_shift(): - B, C, H, W = 1, 16, 8, 8 + B, C, H, W = 1, 32, 8, 6 R, S, F = 3, 3, 16 np.random.seed(2) a = tf.placeholder(tf.float32, shape=[C, H, W, B]) @@ -76,8 +76,8 @@ def run_shift(): sess = tf.InteractiveSession() grads = tf.test.compute_gradient([a, b], [(C, H, W, B), (C, F)], c, (F, H, W, B), extra_feed_dict={a: ha, b: hb}) - dx_t, dx_n = grads[0] dw_t, dw_n = grads[1] + dx_t, dx_n = grads[0] print(np.max(np.abs(dw_t - dw_n))) print(np.max(np.abs(dx_t - dx_n))) # Run diff --git a/examples/python/tensorflow/shift.cpp b/examples/python/tensorflow/shift.cpp index c000c9db2..f9c102a6d 100644 --- a/examples/python/tensorflow/shift.cpp +++ b/examples/python/tensorflow/shift.cpp @@ -68,6 +68,8 @@ public: // shapes for b int64_t Cb = tf_b.dim_size(0); F = tf_b.dim_size(1); + if(OP == triton::dnn::shift::BPROP) + std::swap(Cb, F); // checks OP_REQUIRES(context, Ca == Cb, tensorflow::errors::InvalidArgument("operands must have the same number of channels")); C = Ca; diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index 122a51578..b686f9cde 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -49,7 +49,7 @@ shift::shift(int B, int C, shapes_a_.swap(shapes_b_); AT_ = true; BT_ = false; - M_ = K_; + M_ = F_; N_ = C_; K_ = B_*AH_*AW_; } From f9db0449b7f4d69e3822e57e4b5832bc0ee0ba9d Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 8 Jul 2019 18:44:37 -0700 Subject: [PATCH 217/494] [dnn] Adding batchnorm --- examples/python/pytorch/CMakeLists.txt | 2 +- examples/python/pytorch/common.hpp | 75 
-------- examples/python/pytorch/conv.cpp | 2 +- examples/python/pytorch/main.py | 145 --------------- examples/python/pytorch/models/__init__.py | 14 -- examples/python/pytorch/models/densenet.py | 107 ----------- examples/python/pytorch/models/dpn.py | 98 ---------- examples/python/pytorch/models/googlenet.py | 107 ----------- examples/python/pytorch/models/lenet.py | 24 --- examples/python/pytorch/models/mobilenet.py | 61 ------ examples/python/pytorch/models/mobilenetv2.py | 86 --------- examples/python/pytorch/models/pnasnet.py | 125 ------------- .../python/pytorch/models/preact_resnet.py | 118 ------------ examples/python/pytorch/models/resnet.py | 121 ------------ examples/python/pytorch/models/resnext.py | 95 ---------- examples/python/pytorch/models/senet.py | 121 ------------ examples/python/pytorch/models/shufflenet.py | 109 ----------- .../python/pytorch/models/shufflenetv2.py | 162 ---------------- examples/python/pytorch/models/vgg.py | 47 ----- examples/python/pytorch/shift.cpp | 114 ++++++++++++ examples/python/pytorch/utils.py | 124 ------------- examples/python/tensorflow/CMakeLists.txt | 2 +- examples/python/tensorflow/batchnorm.cpp | 174 ++++++++++++++++++ .../tensorflow/{conv2d.cpp => conv.cpp} | 2 - examples/python/tensorflow/run.py | 40 ++-- examples/python/tensorflow/shift.cpp | 2 +- include/triton/dnn/batchnorm.h | 83 +++++++++ include/triton/ir/builder.h | 1 + include/triton/ir/instructions.h | 9 + include/triton/lang/expression.h | 8 + include/triton/lang/parser.y | 3 +- include/triton/lang/scanner.l | 1 + lib/codegen/selection.cpp | 56 ++++++ lib/codegen/shmem_allocation.cpp | 2 + lib/codegen/shmem_info.cpp | 2 + lib/codegen/tune.cpp | 2 + lib/dnn/batchnorm.cpp | 165 +++++++++++++++++ lib/dnn/shift.cpp | 10 +- lib/driver/module.cpp | 2 +- lib/ir/builder.cpp | 4 + lib/ir/instructions.cpp | 13 ++ lib/lang/expression.cpp | 7 +- 42 files changed, 682 insertions(+), 1763 deletions(-) delete mode 100644 examples/python/pytorch/common.hpp delete mode 100644 examples/python/pytorch/main.py delete mode 100644 examples/python/pytorch/models/__init__.py delete mode 100644 examples/python/pytorch/models/densenet.py delete mode 100644 examples/python/pytorch/models/dpn.py delete mode 100644 examples/python/pytorch/models/googlenet.py delete mode 100644 examples/python/pytorch/models/lenet.py delete mode 100644 examples/python/pytorch/models/mobilenet.py delete mode 100644 examples/python/pytorch/models/mobilenetv2.py delete mode 100644 examples/python/pytorch/models/pnasnet.py delete mode 100644 examples/python/pytorch/models/preact_resnet.py delete mode 100644 examples/python/pytorch/models/resnet.py delete mode 100644 examples/python/pytorch/models/resnext.py delete mode 100644 examples/python/pytorch/models/senet.py delete mode 100644 examples/python/pytorch/models/shufflenet.py delete mode 100644 examples/python/pytorch/models/shufflenetv2.py delete mode 100644 examples/python/pytorch/models/vgg.py create mode 100644 examples/python/pytorch/shift.cpp delete mode 100644 examples/python/pytorch/utils.py create mode 100644 examples/python/tensorflow/batchnorm.cpp rename examples/python/tensorflow/{conv2d.cpp => conv.cpp} (96%) create mode 100644 include/triton/dnn/batchnorm.h create mode 100644 lib/dnn/batchnorm.cpp diff --git a/examples/python/pytorch/CMakeLists.txt b/examples/python/pytorch/CMakeLists.txt index 22e52c65d..759a9709a 100644 --- a/examples/python/pytorch/CMakeLists.txt +++ b/examples/python/pytorch/CMakeLists.txt @@ -5,6 +5,6 @@ if(${TORCH_FOUND}) 
include_directories("${CUDA_HOME}/include") link_directories(${TORCH_LIBRARY_DIRS}) add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) - add_library(torch_triton SHARED conv.cpp) + add_library(torch_triton SHARED conv.cpp shift.cpp) target_link_libraries(torch_triton torch triton) endif() diff --git a/examples/python/pytorch/common.hpp b/examples/python/pytorch/common.hpp deleted file mode 100644 index f2ba2f83b..000000000 --- a/examples/python/pytorch/common.hpp +++ /dev/null @@ -1,75 +0,0 @@ -#include -#include -#include -#include "triton/driver/device.h" -#include - -class timer{ - typedef std::chrono::high_resolution_clock high_resolution_clock; - typedef std::chrono::nanoseconds nanoseconds; - -public: - explicit timer(bool run = false) - { if (run) start(); } - - void start() - { _start = high_resolution_clock::now(); } - - nanoseconds get() const - { return std::chrono::duration_cast(high_resolution_clock::now() - _start); } - -private: - high_resolution_clock::time_point _start; -}; - -template -T min(std::vector x) -{ return *std::min_element(x.begin(), x.end()); } - - -template -double bench(OP const & op, SYNC const & sync, triton::driver::device const & device) -{ - timer tmr; - std::vector times; - double total_time = 0; - op(); - sync(); - while(total_time*1e-9 < 1e-3){ - float norm = 1; - tmr.start(); - op(); - sync(); - times.push_back(norm*tmr.get().count()); - total_time+=times.back(); - } - return min(times); -} - -// helper function to print a tuple of any size -template -struct TuplePrinter { - static void print(const Tuple& t) - { - TuplePrinter::print(t); - std::cout << ", " << std::get(t); - } -}; - -template -struct TuplePrinter { - static void print(const Tuple& t) - { - std::cout << std::get<0>(t); - } -}; - -template -void print(const std::tuple& t) -{ - std::cout << "("; - TuplePrinter::print(t); - std::cout << ")\n"; -} - - diff --git a/examples/python/pytorch/conv.cpp b/examples/python/pytorch/conv.cpp index 6ccedc75f..a21549c31 100644 --- a/examples/python/pytorch/conv.cpp +++ b/examples/python/pytorch/conv.cpp @@ -56,7 +56,7 @@ torch::Tensor conv_common( stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, 1, 1, 1, - ty, has_bias)).first->second.get(); + "fp32", "fp32", ty, has_bias)).first->second.get(); else configuration = m_config.at(key).get(); diff --git a/examples/python/pytorch/main.py b/examples/python/pytorch/main.py deleted file mode 100644 index 5b3de3790..000000000 --- a/examples/python/pytorch/main.py +++ /dev/null @@ -1,145 +0,0 @@ -'''Train CIFAR10 with PyTorch.''' -from __future__ import print_function - -import torch -import torch.nn as nn -import torch.optim as optim -import torch.nn.functional as F -import torch.backends.cudnn as cudnn - -import torchvision -import torchvision.transforms as transforms - -import os -import argparse -import numpy as np -import random - -from models import * -from utils import progress_bar - - -parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Training') -parser.add_argument('--lr', default=0.1, type=float, help='learning rate') -parser.add_argument('--resume', '-r', action='store_true', help='resume from checkpoint') -args = parser.parse_args() - -device = 'cuda' if torch.cuda.is_available() else 'cpu' -best_acc = 0 # best test accuracy -start_epoch = 0 # start from epoch 0 or last checkpoint epoch - -# Data -print('==> Preparing data..') -transform_train = transforms.Compose([ - transforms.RandomCrop(32, padding=4), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - 
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), -]) - -transform_test = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), -]) - -trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train) -trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True, num_workers=2) - -testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test) -testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False, num_workers=2) - -classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck') - -# Model -print('==> Building model..') -net = LeNet() -# net = VGG('VGG19') -# net = ResNet18() -# net = PreActResNet18() -# net = GoogLeNet() -# net = DenseNet121() -# net = ResNeXt29_2x64d() -# net = MobileNet() -# net = MobileNetV2() -# net = DPN92() -# net = ShuffleNetG2() -# net = SENet18() -# net = ShuffleNetV2(1) -net = net.to(device) -if device == 'cuda': - net = torch.nn.DataParallel(net) - cudnn.benchmark = False - -if args.resume: - # Load checkpoint. - print('==> Resuming from checkpoint..') - assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!' - checkpoint = torch.load('./checkpoint/ckpt.t7') - net.load_state_dict(checkpoint['net']) - best_acc = checkpoint['acc'] - start_epoch = checkpoint['epoch'] - -criterion = nn.CrossEntropyLoss() -optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4) - -# Training -def train(epoch): - print('\nEpoch: %d' % epoch) - net.train() - train_loss = 0 - correct = 0 - total = 0 - for batch_idx, (inputs, targets) in enumerate(trainloader): - inputs, targets = inputs.to(device), targets.to(device) - optimizer.zero_grad() - outputs = net(inputs) - loss = criterion(outputs, targets) - loss.backward() - optimizer.step() - - train_loss += loss.item() - _, predicted = outputs.max(1) - total += targets.size(0) - correct += predicted.eq(targets).sum().item() - - progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' - % (train_loss/(batch_idx+1), 100.*correct/total, correct, total)) - -def test(epoch): - global best_acc - net.eval() - test_loss = 0 - correct = 0 - total = 0 - with torch.no_grad(): - for batch_idx, (inputs, targets) in enumerate(testloader): - inputs, targets = inputs.to(device), targets.to(device) - outputs = net(inputs) - loss = criterion(outputs, targets) - - test_loss += loss.item() - _, predicted = outputs.max(1) - total += targets.size(0) - correct += predicted.eq(targets).sum().item() - - progress_bar(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' - % (test_loss/(batch_idx+1), 100.*correct/total, correct, total)) - - # Save checkpoint. 
- acc = 100.*correct/total - if acc > best_acc: - print('Saving..') - state = { - 'net': net.state_dict(), - 'acc': acc, - 'epoch': epoch, - } - if not os.path.isdir('checkpoint'): - os.mkdir('checkpoint') - torch.save(state, './checkpoint/ckpt.t7') - best_acc = acc - - -for epoch in range(start_epoch, start_epoch+200): - train(epoch) - test(epoch) diff --git a/examples/python/pytorch/models/__init__.py b/examples/python/pytorch/models/__init__.py deleted file mode 100644 index 877893903..000000000 --- a/examples/python/pytorch/models/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -from .vgg import * -from .dpn import * -from .lenet import * -from .senet import * -from .pnasnet import * -from .densenet import * -from .googlenet import * -from .shufflenet import * -from .shufflenetv2 import * -from .resnet import * -from .resnext import * -from .preact_resnet import * -from .mobilenet import * -from .mobilenetv2 import * diff --git a/examples/python/pytorch/models/densenet.py b/examples/python/pytorch/models/densenet.py deleted file mode 100644 index 47ebbbe08..000000000 --- a/examples/python/pytorch/models/densenet.py +++ /dev/null @@ -1,107 +0,0 @@ -'''DenseNet in PyTorch.''' -import math - -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class Bottleneck(nn.Module): - def __init__(self, in_planes, growth_rate): - super(Bottleneck, self).__init__() - self.bn1 = nn.BatchNorm2d(in_planes) - self.conv1 = nn.Conv2d(in_planes, 4*growth_rate, kernel_size=1, bias=False) - self.bn2 = nn.BatchNorm2d(4*growth_rate) - self.conv2 = nn.Conv2d(4*growth_rate, growth_rate, kernel_size=3, padding=1, bias=False) - - def forward(self, x): - out = self.conv1(F.relu(self.bn1(x))) - out = self.conv2(F.relu(self.bn2(out))) - out = torch.cat([out,x], 1) - return out - - -class Transition(nn.Module): - def __init__(self, in_planes, out_planes): - super(Transition, self).__init__() - self.bn = nn.BatchNorm2d(in_planes) - self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, bias=False) - - def forward(self, x): - out = self.conv(F.relu(self.bn(x))) - out = F.avg_pool2d(out, 2) - return out - - -class DenseNet(nn.Module): - def __init__(self, block, nblocks, growth_rate=12, reduction=0.5, num_classes=10): - super(DenseNet, self).__init__() - self.growth_rate = growth_rate - - num_planes = 2*growth_rate - self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, padding=1, bias=False) - - self.dense1 = self._make_dense_layers(block, num_planes, nblocks[0]) - num_planes += nblocks[0]*growth_rate - out_planes = int(math.floor(num_planes*reduction)) - self.trans1 = Transition(num_planes, out_planes) - num_planes = out_planes - - self.dense2 = self._make_dense_layers(block, num_planes, nblocks[1]) - num_planes += nblocks[1]*growth_rate - out_planes = int(math.floor(num_planes*reduction)) - self.trans2 = Transition(num_planes, out_planes) - num_planes = out_planes - - self.dense3 = self._make_dense_layers(block, num_planes, nblocks[2]) - num_planes += nblocks[2]*growth_rate - out_planes = int(math.floor(num_planes*reduction)) - self.trans3 = Transition(num_planes, out_planes) - num_planes = out_planes - - self.dense4 = self._make_dense_layers(block, num_planes, nblocks[3]) - num_planes += nblocks[3]*growth_rate - - self.bn = nn.BatchNorm2d(num_planes) - self.linear = nn.Linear(num_planes, num_classes) - - def _make_dense_layers(self, block, in_planes, nblock): - layers = [] - for i in range(nblock): - layers.append(block(in_planes, self.growth_rate)) - in_planes += self.growth_rate - return 
nn.Sequential(*layers) - - def forward(self, x): - out = self.conv1(x) - out = self.trans1(self.dense1(out)) - out = self.trans2(self.dense2(out)) - out = self.trans3(self.dense3(out)) - out = self.dense4(out) - out = F.avg_pool2d(F.relu(self.bn(out)), 4) - out = out.view(out.size(0), -1) - out = self.linear(out) - return out - -def DenseNet121(): - return DenseNet(Bottleneck, [6,12,24,16], growth_rate=32) - -def DenseNet169(): - return DenseNet(Bottleneck, [6,12,32,32], growth_rate=32) - -def DenseNet201(): - return DenseNet(Bottleneck, [6,12,48,32], growth_rate=32) - -def DenseNet161(): - return DenseNet(Bottleneck, [6,12,36,24], growth_rate=48) - -def densenet_cifar(): - return DenseNet(Bottleneck, [6,12,24,16], growth_rate=12) - -def test(): - net = densenet_cifar() - x = torch.randn(1,3,32,32) - y = net(x) - print(y) - -# test() diff --git a/examples/python/pytorch/models/dpn.py b/examples/python/pytorch/models/dpn.py deleted file mode 100644 index d334367fc..000000000 --- a/examples/python/pytorch/models/dpn.py +++ /dev/null @@ -1,98 +0,0 @@ -'''Dual Path Networks in PyTorch.''' -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class Bottleneck(nn.Module): - def __init__(self, last_planes, in_planes, out_planes, dense_depth, stride, first_layer): - super(Bottleneck, self).__init__() - self.out_planes = out_planes - self.dense_depth = dense_depth - - self.conv1 = nn.Conv2d(last_planes, in_planes, kernel_size=1, bias=False) - self.bn1 = nn.BatchNorm2d(in_planes) - self.conv2 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=32, bias=False) - self.bn2 = nn.BatchNorm2d(in_planes) - self.conv3 = nn.Conv2d(in_planes, out_planes+dense_depth, kernel_size=1, bias=False) - self.bn3 = nn.BatchNorm2d(out_planes+dense_depth) - - self.shortcut = nn.Sequential() - if first_layer: - self.shortcut = nn.Sequential( - nn.Conv2d(last_planes, out_planes+dense_depth, kernel_size=1, stride=stride, bias=False), - nn.BatchNorm2d(out_planes+dense_depth) - ) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = F.relu(self.bn2(self.conv2(out))) - out = self.bn3(self.conv3(out)) - x = self.shortcut(x) - d = self.out_planes - out = torch.cat([x[:,:d,:,:]+out[:,:d,:,:], x[:,d:,:,:], out[:,d:,:,:]], 1) - out = F.relu(out) - return out - - -class DPN(nn.Module): - def __init__(self, cfg): - super(DPN, self).__init__() - in_planes, out_planes = cfg['in_planes'], cfg['out_planes'] - num_blocks, dense_depth = cfg['num_blocks'], cfg['dense_depth'] - - self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(64) - self.last_planes = 64 - self.layer1 = self._make_layer(in_planes[0], out_planes[0], num_blocks[0], dense_depth[0], stride=1) - self.layer2 = self._make_layer(in_planes[1], out_planes[1], num_blocks[1], dense_depth[1], stride=2) - self.layer3 = self._make_layer(in_planes[2], out_planes[2], num_blocks[2], dense_depth[2], stride=2) - self.layer4 = self._make_layer(in_planes[3], out_planes[3], num_blocks[3], dense_depth[3], stride=2) - self.linear = nn.Linear(out_planes[3]+(num_blocks[3]+1)*dense_depth[3], 10) - - def _make_layer(self, in_planes, out_planes, num_blocks, dense_depth, stride): - strides = [stride] + [1]*(num_blocks-1) - layers = [] - for i,stride in enumerate(strides): - layers.append(Bottleneck(self.last_planes, in_planes, out_planes, dense_depth, stride, i==0)) - self.last_planes = out_planes + (i+2) * dense_depth - return nn.Sequential(*layers) - - def forward(self, x): - 
out = F.relu(self.bn1(self.conv1(x))) - out = self.layer1(out) - out = self.layer2(out) - out = self.layer3(out) - out = self.layer4(out) - out = F.avg_pool2d(out, 4) - out = out.view(out.size(0), -1) - out = self.linear(out) - return out - - -def DPN26(): - cfg = { - 'in_planes': (96,192,384,768), - 'out_planes': (256,512,1024,2048), - 'num_blocks': (2,2,2,2), - 'dense_depth': (16,32,24,128) - } - return DPN(cfg) - -def DPN92(): - cfg = { - 'in_planes': (96,192,384,768), - 'out_planes': (256,512,1024,2048), - 'num_blocks': (3,4,20,3), - 'dense_depth': (16,32,24,128) - } - return DPN(cfg) - - -def test(): - net = DPN92() - x = torch.randn(1,3,32,32) - y = net(x) - print(y) - -# test() diff --git a/examples/python/pytorch/models/googlenet.py b/examples/python/pytorch/models/googlenet.py deleted file mode 100644 index de036d87d..000000000 --- a/examples/python/pytorch/models/googlenet.py +++ /dev/null @@ -1,107 +0,0 @@ -'''GoogLeNet with PyTorch.''' -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class Inception(nn.Module): - def __init__(self, in_planes, n1x1, n3x3red, n3x3, n5x5red, n5x5, pool_planes): - super(Inception, self).__init__() - # 1x1 conv branch - self.b1 = nn.Sequential( - nn.Conv2d(in_planes, n1x1, kernel_size=1), - nn.BatchNorm2d(n1x1), - nn.ReLU(True), - ) - - # 1x1 conv -> 3x3 conv branch - self.b2 = nn.Sequential( - nn.Conv2d(in_planes, n3x3red, kernel_size=1), - nn.BatchNorm2d(n3x3red), - nn.ReLU(True), - nn.Conv2d(n3x3red, n3x3, kernel_size=3, padding=1), - nn.BatchNorm2d(n3x3), - nn.ReLU(True), - ) - - # 1x1 conv -> 5x5 conv branch - self.b3 = nn.Sequential( - nn.Conv2d(in_planes, n5x5red, kernel_size=1), - nn.BatchNorm2d(n5x5red), - nn.ReLU(True), - nn.Conv2d(n5x5red, n5x5, kernel_size=3, padding=1), - nn.BatchNorm2d(n5x5), - nn.ReLU(True), - nn.Conv2d(n5x5, n5x5, kernel_size=3, padding=1), - nn.BatchNorm2d(n5x5), - nn.ReLU(True), - ) - - # 3x3 pool -> 1x1 conv branch - self.b4 = nn.Sequential( - nn.MaxPool2d(3, stride=1, padding=1), - nn.Conv2d(in_planes, pool_planes, kernel_size=1), - nn.BatchNorm2d(pool_planes), - nn.ReLU(True), - ) - - def forward(self, x): - y1 = self.b1(x) - y2 = self.b2(x) - y3 = self.b3(x) - y4 = self.b4(x) - return torch.cat([y1,y2,y3,y4], 1) - - -class GoogLeNet(nn.Module): - def __init__(self): - super(GoogLeNet, self).__init__() - self.pre_layers = nn.Sequential( - nn.Conv2d(3, 192, kernel_size=3, padding=1), - nn.BatchNorm2d(192), - nn.ReLU(True), - ) - - self.a3 = Inception(192, 64, 96, 128, 16, 32, 32) - self.b3 = Inception(256, 128, 128, 192, 32, 96, 64) - - self.maxpool = nn.MaxPool2d(3, stride=2, padding=1) - - self.a4 = Inception(480, 192, 96, 208, 16, 48, 64) - self.b4 = Inception(512, 160, 112, 224, 24, 64, 64) - self.c4 = Inception(512, 128, 128, 256, 24, 64, 64) - self.d4 = Inception(512, 112, 144, 288, 32, 64, 64) - self.e4 = Inception(528, 256, 160, 320, 32, 128, 128) - - self.a5 = Inception(832, 256, 160, 320, 32, 128, 128) - self.b5 = Inception(832, 384, 192, 384, 48, 128, 128) - - self.avgpool = nn.AvgPool2d(8, stride=1) - self.linear = nn.Linear(1024, 10) - - def forward(self, x): - out = self.pre_layers(x) - out = self.a3(out) - out = self.b3(out) - out = self.maxpool(out) - out = self.a4(out) - out = self.b4(out) - out = self.c4(out) - out = self.d4(out) - out = self.e4(out) - out = self.maxpool(out) - out = self.a5(out) - out = self.b5(out) - out = self.avgpool(out) - out = out.view(out.size(0), -1) - out = self.linear(out) - return out - - -def test(): - net = GoogLeNet() - x = 
torch.randn(1,3,32,32) - y = net(x) - print(y.size()) - -# test() diff --git a/examples/python/pytorch/models/lenet.py b/examples/python/pytorch/models/lenet.py deleted file mode 100644 index 49c4e9572..000000000 --- a/examples/python/pytorch/models/lenet.py +++ /dev/null @@ -1,24 +0,0 @@ -'''LeNet in PyTorch.''' -import torch.nn as nn -import torch.nn.functional as F -import triton - -class LeNet(nn.Module): - def __init__(self): - super(LeNet, self).__init__() - self.conv1 = nn.Conv2d(3, 512, 3) - self.conv2 = triton.Conv2d(512, 512, 1) - self.fc1 = nn.Linear(512*7*7, 120) - self.fc2 = nn.Linear(120, 84) - self.fc3 = nn.Linear(84, 10) - - def forward(self, x): - out = F.relu(self.conv1(x)) - out = F.max_pool2d(out, 2) - out = F.relu(self.conv2(out)) - out = F.max_pool2d(out, 2) - out = out.view(out.size(0), -1) - out = F.relu(self.fc1(out)) - out = F.relu(self.fc2(out)) - out = self.fc3(out) - return out diff --git a/examples/python/pytorch/models/mobilenet.py b/examples/python/pytorch/models/mobilenet.py deleted file mode 100644 index 497ef1e86..000000000 --- a/examples/python/pytorch/models/mobilenet.py +++ /dev/null @@ -1,61 +0,0 @@ -'''MobileNet in PyTorch. - -See the paper "MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications" -for more details. -''' -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class Block(nn.Module): - '''Depthwise conv + Pointwise conv''' - def __init__(self, in_planes, out_planes, stride=1): - super(Block, self).__init__() - self.conv1 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=in_planes, bias=False) - self.bn1 = nn.BatchNorm2d(in_planes) - self.conv2 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False) - self.bn2 = nn.BatchNorm2d(out_planes) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = F.relu(self.bn2(self.conv2(out))) - return out - - -class MobileNet(nn.Module): - # (128,2) means conv planes=128, conv stride=2, by default conv stride=1 - cfg = [64, (128,2), 128, (256,2), 256, (512,2), 512, 512, 512, 512, 512, (1024,2), 1024] - - def __init__(self, num_classes=10): - super(MobileNet, self).__init__() - self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(32) - self.layers = self._make_layers(in_planes=32) - self.linear = nn.Linear(1024, num_classes) - - def _make_layers(self, in_planes): - layers = [] - for x in self.cfg: - out_planes = x if isinstance(x, int) else x[0] - stride = 1 if isinstance(x, int) else x[1] - layers.append(Block(in_planes, out_planes, stride)) - in_planes = out_planes - return nn.Sequential(*layers) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = self.layers(out) - out = F.avg_pool2d(out, 2) - out = out.view(out.size(0), -1) - out = self.linear(out) - return out - - -def test(): - net = MobileNet() - x = torch.randn(1,3,32,32) - y = net(x) - print(y.size()) - -# test() diff --git a/examples/python/pytorch/models/mobilenetv2.py b/examples/python/pytorch/models/mobilenetv2.py deleted file mode 100644 index 17e5823ef..000000000 --- a/examples/python/pytorch/models/mobilenetv2.py +++ /dev/null @@ -1,86 +0,0 @@ -'''MobileNetV2 in PyTorch. - -See the paper "Inverted Residuals and Linear Bottlenecks: -Mobile Networks for Classification, Detection and Segmentation" for more details. 
-''' -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class Block(nn.Module): - '''expand + depthwise + pointwise''' - def __init__(self, in_planes, out_planes, expansion, stride): - super(Block, self).__init__() - self.stride = stride - - planes = expansion * in_planes - self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, stride=1, padding=0, bias=False) - self.bn1 = nn.BatchNorm2d(planes) - self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, groups=planes, bias=False) - self.bn2 = nn.BatchNorm2d(planes) - self.conv3 = nn.Conv2d(planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False) - self.bn3 = nn.BatchNorm2d(out_planes) - - self.shortcut = nn.Sequential() - if stride == 1 and in_planes != out_planes: - self.shortcut = nn.Sequential( - nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False), - nn.BatchNorm2d(out_planes), - ) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = F.relu(self.bn2(self.conv2(out))) - out = self.bn3(self.conv3(out)) - out = out + self.shortcut(x) if self.stride==1 else out - return out - - -class MobileNetV2(nn.Module): - # (expansion, out_planes, num_blocks, stride) - cfg = [(1, 16, 1, 1), - (6, 24, 2, 1), # NOTE: change stride 2 -> 1 for CIFAR10 - (6, 32, 3, 2), - (6, 64, 4, 2), - (6, 96, 3, 1), - (6, 160, 3, 2), - (6, 320, 1, 1)] - - def __init__(self, num_classes=10): - super(MobileNetV2, self).__init__() - # NOTE: change conv1 stride 2 -> 1 for CIFAR10 - self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(32) - self.layers = self._make_layers(in_planes=32) - self.conv2 = nn.Conv2d(320, 1280, kernel_size=1, stride=1, padding=0, bias=False) - self.bn2 = nn.BatchNorm2d(1280) - self.linear = nn.Linear(1280, num_classes) - - def _make_layers(self, in_planes): - layers = [] - for expansion, out_planes, num_blocks, stride in self.cfg: - strides = [stride] + [1]*(num_blocks-1) - for stride in strides: - layers.append(Block(in_planes, out_planes, expansion, stride)) - in_planes = out_planes - return nn.Sequential(*layers) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = self.layers(out) - out = F.relu(self.bn2(self.conv2(out))) - # NOTE: change pooling kernel_size 7 -> 4 for CIFAR10 - out = F.avg_pool2d(out, 4) - out = out.view(out.size(0), -1) - out = self.linear(out) - return out - - -def test(): - net = MobileNetV2() - x = torch.randn(2,3,32,32) - y = net(x) - print(y.size()) - -# test() diff --git a/examples/python/pytorch/models/pnasnet.py b/examples/python/pytorch/models/pnasnet.py deleted file mode 100644 index de8c4d51f..000000000 --- a/examples/python/pytorch/models/pnasnet.py +++ /dev/null @@ -1,125 +0,0 @@ -'''PNASNet in PyTorch. 
- -Paper: Progressive Neural Architecture Search -''' -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class SepConv(nn.Module): - '''Separable Convolution.''' - def __init__(self, in_planes, out_planes, kernel_size, stride): - super(SepConv, self).__init__() - self.conv1 = nn.Conv2d(in_planes, out_planes, - kernel_size, stride, - padding=(kernel_size-1)//2, - bias=False, groups=in_planes) - self.bn1 = nn.BatchNorm2d(out_planes) - - def forward(self, x): - return self.bn1(self.conv1(x)) - - -class CellA(nn.Module): - def __init__(self, in_planes, out_planes, stride=1): - super(CellA, self).__init__() - self.stride = stride - self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride) - if stride==2: - self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False) - self.bn1 = nn.BatchNorm2d(out_planes) - - def forward(self, x): - y1 = self.sep_conv1(x) - y2 = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1) - if self.stride==2: - y2 = self.bn1(self.conv1(y2)) - return F.relu(y1+y2) - -class CellB(nn.Module): - def __init__(self, in_planes, out_planes, stride=1): - super(CellB, self).__init__() - self.stride = stride - # Left branch - self.sep_conv1 = SepConv(in_planes, out_planes, kernel_size=7, stride=stride) - self.sep_conv2 = SepConv(in_planes, out_planes, kernel_size=3, stride=stride) - # Right branch - self.sep_conv3 = SepConv(in_planes, out_planes, kernel_size=5, stride=stride) - if stride==2: - self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False) - self.bn1 = nn.BatchNorm2d(out_planes) - # Reduce channels - self.conv2 = nn.Conv2d(2*out_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False) - self.bn2 = nn.BatchNorm2d(out_planes) - - def forward(self, x): - # Left branch - y1 = self.sep_conv1(x) - y2 = self.sep_conv2(x) - # Right branch - y3 = F.max_pool2d(x, kernel_size=3, stride=self.stride, padding=1) - if self.stride==2: - y3 = self.bn1(self.conv1(y3)) - y4 = self.sep_conv3(x) - # Concat & reduce channels - b1 = F.relu(y1+y2) - b2 = F.relu(y3+y4) - y = torch.cat([b1,b2], 1) - return F.relu(self.bn2(self.conv2(y))) - -class PNASNet(nn.Module): - def __init__(self, cell_type, num_cells, num_planes): - super(PNASNet, self).__init__() - self.in_planes = num_planes - self.cell_type = cell_type - - self.conv1 = nn.Conv2d(3, num_planes, kernel_size=3, stride=1, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(num_planes) - - self.layer1 = self._make_layer(num_planes, num_cells=6) - self.layer2 = self._downsample(num_planes*2) - self.layer3 = self._make_layer(num_planes*2, num_cells=6) - self.layer4 = self._downsample(num_planes*4) - self.layer5 = self._make_layer(num_planes*4, num_cells=6) - - self.linear = nn.Linear(num_planes*4, 10) - - def _make_layer(self, planes, num_cells): - layers = [] - for _ in range(num_cells): - layers.append(self.cell_type(self.in_planes, planes, stride=1)) - self.in_planes = planes - return nn.Sequential(*layers) - - def _downsample(self, planes): - layer = self.cell_type(self.in_planes, planes, stride=2) - self.in_planes = planes - return layer - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = self.layer1(out) - out = self.layer2(out) - out = self.layer3(out) - out = self.layer4(out) - out = self.layer5(out) - out = F.avg_pool2d(out, 8) - out = self.linear(out.view(out.size(0), -1)) - return out - - -def PNASNetA(): - return PNASNet(CellA, num_cells=6, num_planes=44) - -def PNASNetB(): - 
return PNASNet(CellB, num_cells=6, num_planes=32) - - -def test(): - net = PNASNetB() - x = torch.randn(1,3,32,32) - y = net(x) - print(y) - -# test() diff --git a/examples/python/pytorch/models/preact_resnet.py b/examples/python/pytorch/models/preact_resnet.py deleted file mode 100644 index abb1bc313..000000000 --- a/examples/python/pytorch/models/preact_resnet.py +++ /dev/null @@ -1,118 +0,0 @@ -'''Pre-activation ResNet in PyTorch. - -Reference: -[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun - Identity Mappings in Deep Residual Networks. arXiv:1603.05027 -''' -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class PreActBlock(nn.Module): - '''Pre-activation version of the BasicBlock.''' - expansion = 1 - - def __init__(self, in_planes, planes, stride=1): - super(PreActBlock, self).__init__() - self.bn1 = nn.BatchNorm2d(in_planes) - self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) - self.bn2 = nn.BatchNorm2d(planes) - self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) - - if stride != 1 or in_planes != self.expansion*planes: - self.shortcut = nn.Sequential( - nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False) - ) - - def forward(self, x): - out = F.relu(self.bn1(x)) - shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x - out = self.conv1(out) - out = self.conv2(F.relu(self.bn2(out))) - out += shortcut - return out - - -class PreActBottleneck(nn.Module): - '''Pre-activation version of the original Bottleneck module.''' - expansion = 4 - - def __init__(self, in_planes, planes, stride=1): - super(PreActBottleneck, self).__init__() - self.bn1 = nn.BatchNorm2d(in_planes) - self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) - self.bn2 = nn.BatchNorm2d(planes) - self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) - self.bn3 = nn.BatchNorm2d(planes) - self.conv3 = nn.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False) - - if stride != 1 or in_planes != self.expansion*planes: - self.shortcut = nn.Sequential( - nn.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False) - ) - - def forward(self, x): - out = F.relu(self.bn1(x)) - shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x - out = self.conv1(out) - out = self.conv2(F.relu(self.bn2(out))) - out = self.conv3(F.relu(self.bn3(out))) - out += shortcut - return out - - -class PreActResNet(nn.Module): - def __init__(self, block, num_blocks, num_classes=10): - super(PreActResNet, self).__init__() - self.in_planes = 64 - - self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) - self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) - self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) - self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) - self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) - self.linear = nn.Linear(512*block.expansion, num_classes) - - def _make_layer(self, block, planes, num_blocks, stride): - strides = [stride] + [1]*(num_blocks-1) - layers = [] - for stride in strides: - layers.append(block(self.in_planes, planes, stride)) - self.in_planes = planes * block.expansion - return nn.Sequential(*layers) - - def forward(self, x): - out = self.conv1(x) - out = self.layer1(out) - out = self.layer2(out) - out = self.layer3(out) - out = self.layer4(out) - out = F.avg_pool2d(out, 4) - out 
= out.view(out.size(0), -1) - out = self.linear(out) - return out - - -def PreActResNet18(): - return PreActResNet(PreActBlock, [2,2,2,2]) - -def PreActResNet34(): - return PreActResNet(PreActBlock, [3,4,6,3]) - -def PreActResNet50(): - return PreActResNet(PreActBottleneck, [3,4,6,3]) - -def PreActResNet101(): - return PreActResNet(PreActBottleneck, [3,4,23,3]) - -def PreActResNet152(): - return PreActResNet(PreActBottleneck, [3,8,36,3]) - - -def test(): - net = PreActResNet18() - y = net((torch.randn(1,3,32,32))) - print(y.size()) - -# test() diff --git a/examples/python/pytorch/models/resnet.py b/examples/python/pytorch/models/resnet.py deleted file mode 100644 index 8fe334fd9..000000000 --- a/examples/python/pytorch/models/resnet.py +++ /dev/null @@ -1,121 +0,0 @@ -'''ResNet in PyTorch. - -For Pre-activation ResNet, see 'preact_resnet.py'. - -Reference: -[1] Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun - Deep Residual Learning for Image Recognition. arXiv:1512.03385 -''' -import torch -import torch.nn as nn -import torch.nn.functional as F -import triton - -class BasicBlock(nn.Module): - expansion = 1 - - def __init__(self, in_planes, planes, stride=1): - super(BasicBlock, self).__init__() - self.conv1 = triton.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(planes) - self.conv2 = triton.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) - self.bn2 = nn.BatchNorm2d(planes) - - self.shortcut = nn.Sequential() - if stride != 1 or in_planes != self.expansion*planes: - self.shortcut = nn.Sequential( - triton.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False), - nn.BatchNorm2d(self.expansion*planes) - ) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = self.bn2(self.conv2(out)) - out += self.shortcut(x) - out = F.relu(out) - return out - - -class Bottleneck(nn.Module): - expansion = 4 - - def __init__(self, in_planes, planes, stride=1): - super(Bottleneck, self).__init__() - self.conv1 = triton.Conv2d(in_planes, planes, kernel_size=1, bias=False) - self.bn1 = nn.BatchNorm2d(planes) - self.conv2 = triton.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) - self.bn2 = nn.BatchNorm2d(planes) - self.conv3 = triton.Conv2d(planes, self.expansion*planes, kernel_size=1, bias=False) - self.bn3 = nn.BatchNorm2d(self.expansion*planes) - - self.shortcut = nn.Sequential() - if stride != 1 or in_planes != self.expansion*planes: - self.shortcut = nn.Sequential( - triton.Conv2d(in_planes, self.expansion*planes, kernel_size=1, stride=stride, bias=False), - nn.BatchNorm2d(self.expansion*planes) - ) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = F.relu(self.bn2(self.conv2(out))) - out = self.bn3(self.conv3(out)) - out += self.shortcut(x) - out = F.relu(out) - return out - - -class ResNet(nn.Module): - def __init__(self, block, num_blocks, num_classes=10): - super(ResNet, self).__init__() - self.in_planes = 64 - - self.conv1 = triton.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(64) - self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) - self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) - self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) - self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) - self.linear = nn.Linear(512*block.expansion, num_classes) - - def _make_layer(self, block, planes, num_blocks, stride): - 
strides = [stride] + [1]*(num_blocks-1) - layers = [] - for stride in strides: - layers.append(block(self.in_planes, planes, stride)) - self.in_planes = planes * block.expansion - return nn.Sequential(*layers) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = self.layer1(out) - out = self.layer2(out) - out = self.layer3(out) - out = self.layer4(out) - out = F.avg_pool2d(out, 4) - out = out.view(out.size(0), -1) - out = self.linear(out) - return out - - -def ResNet18(): - return ResNet(BasicBlock, [2,2,2,2]) - -def ResNet34(): - return ResNet(BasicBlock, [3,4,6,3]) - -def ResNet50(): - return ResNet(Bottleneck, [3,4,6,3]) - -def ResNet101(): - return ResNet(Bottleneck, [3,4,23,3]) - -def ResNet152(): - return ResNet(Bottleneck, [3,8,36,3]) - - -def test(): - net = ResNet18() - y = net(torch.randn(1,3,32,32)) - print(y.size()) - -# test() diff --git a/examples/python/pytorch/models/resnext.py b/examples/python/pytorch/models/resnext.py deleted file mode 100644 index 7a08f3e7d..000000000 --- a/examples/python/pytorch/models/resnext.py +++ /dev/null @@ -1,95 +0,0 @@ -'''ResNeXt in PyTorch. - -See the paper "Aggregated Residual Transformations for Deep Neural Networks" for more details. -''' -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class Block(nn.Module): - '''Grouped convolution block.''' - expansion = 2 - - def __init__(self, in_planes, cardinality=32, bottleneck_width=4, stride=1): - super(Block, self).__init__() - group_width = cardinality * bottleneck_width - self.conv1 = nn.Conv2d(in_planes, group_width, kernel_size=1, bias=False) - self.bn1 = nn.BatchNorm2d(group_width) - self.conv2 = nn.Conv2d(group_width, group_width, kernel_size=3, stride=stride, padding=1, groups=cardinality, bias=False) - self.bn2 = nn.BatchNorm2d(group_width) - self.conv3 = nn.Conv2d(group_width, self.expansion*group_width, kernel_size=1, bias=False) - self.bn3 = nn.BatchNorm2d(self.expansion*group_width) - - self.shortcut = nn.Sequential() - if stride != 1 or in_planes != self.expansion*group_width: - self.shortcut = nn.Sequential( - nn.Conv2d(in_planes, self.expansion*group_width, kernel_size=1, stride=stride, bias=False), - nn.BatchNorm2d(self.expansion*group_width) - ) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = F.relu(self.bn2(self.conv2(out))) - out = self.bn3(self.conv3(out)) - out += self.shortcut(x) - out = F.relu(out) - return out - - -class ResNeXt(nn.Module): - def __init__(self, num_blocks, cardinality, bottleneck_width, num_classes=10): - super(ResNeXt, self).__init__() - self.cardinality = cardinality - self.bottleneck_width = bottleneck_width - self.in_planes = 64 - - self.conv1 = nn.Conv2d(3, 64, kernel_size=1, bias=False) - self.bn1 = nn.BatchNorm2d(64) - self.layer1 = self._make_layer(num_blocks[0], 1) - self.layer2 = self._make_layer(num_blocks[1], 2) - self.layer3 = self._make_layer(num_blocks[2], 2) - # self.layer4 = self._make_layer(num_blocks[3], 2) - self.linear = nn.Linear(cardinality*bottleneck_width*8, num_classes) - - def _make_layer(self, num_blocks, stride): - strides = [stride] + [1]*(num_blocks-1) - layers = [] - for stride in strides: - layers.append(Block(self.in_planes, self.cardinality, self.bottleneck_width, stride)) - self.in_planes = Block.expansion * self.cardinality * self.bottleneck_width - # Increase bottleneck_width by 2 after each stage. 
- self.bottleneck_width *= 2 - return nn.Sequential(*layers) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = self.layer1(out) - out = self.layer2(out) - out = self.layer3(out) - # out = self.layer4(out) - out = F.avg_pool2d(out, 8) - out = out.view(out.size(0), -1) - out = self.linear(out) - return out - - -def ResNeXt29_2x64d(): - return ResNeXt(num_blocks=[3,3,3], cardinality=2, bottleneck_width=64) - -def ResNeXt29_4x64d(): - return ResNeXt(num_blocks=[3,3,3], cardinality=4, bottleneck_width=64) - -def ResNeXt29_8x64d(): - return ResNeXt(num_blocks=[3,3,3], cardinality=8, bottleneck_width=64) - -def ResNeXt29_32x4d(): - return ResNeXt(num_blocks=[3,3,3], cardinality=32, bottleneck_width=4) - -def test_resnext(): - net = ResNeXt29_2x64d() - x = torch.randn(1,3,32,32) - y = net(x) - print(y.size()) - -# test_resnext() diff --git a/examples/python/pytorch/models/senet.py b/examples/python/pytorch/models/senet.py deleted file mode 100644 index 98bfa0ca5..000000000 --- a/examples/python/pytorch/models/senet.py +++ /dev/null @@ -1,121 +0,0 @@ -'''SENet in PyTorch. - -SENet is the winner of ImageNet-2017. The paper is not released yet. -''' -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class BasicBlock(nn.Module): - def __init__(self, in_planes, planes, stride=1): - super(BasicBlock, self).__init__() - self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(planes) - self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) - self.bn2 = nn.BatchNorm2d(planes) - - self.shortcut = nn.Sequential() - if stride != 1 or in_planes != planes: - self.shortcut = nn.Sequential( - nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False), - nn.BatchNorm2d(planes) - ) - - # SE layers - self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1) # Use nn.Conv2d instead of nn.Linear - self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = self.bn2(self.conv2(out)) - - # Squeeze - w = F.avg_pool2d(out, out.size(2)) - w = F.relu(self.fc1(w)) - w = F.sigmoid(self.fc2(w)) - # Excitation - out = out * w # New broadcasting feature from v0.2! 
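        # `w` has shape [N, C, 1, 1] after the global average pool and the two
        # 1x1-conv FC layers (C -> C//16 -> C), so the multiply above
        # broadcasts one learned gate per channel over the [N, C, H, W] map.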
- - out += self.shortcut(x) - out = F.relu(out) - return out - - -class PreActBlock(nn.Module): - def __init__(self, in_planes, planes, stride=1): - super(PreActBlock, self).__init__() - self.bn1 = nn.BatchNorm2d(in_planes) - self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) - self.bn2 = nn.BatchNorm2d(planes) - self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False) - - if stride != 1 or in_planes != planes: - self.shortcut = nn.Sequential( - nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride, bias=False) - ) - - # SE layers - self.fc1 = nn.Conv2d(planes, planes//16, kernel_size=1) - self.fc2 = nn.Conv2d(planes//16, planes, kernel_size=1) - - def forward(self, x): - out = F.relu(self.bn1(x)) - shortcut = self.shortcut(out) if hasattr(self, 'shortcut') else x - out = self.conv1(out) - out = self.conv2(F.relu(self.bn2(out))) - - # Squeeze - w = F.avg_pool2d(out, out.size(2)) - w = F.relu(self.fc1(w)) - w = F.sigmoid(self.fc2(w)) - # Excitation - out = out * w - - out += shortcut - return out - - -class SENet(nn.Module): - def __init__(self, block, num_blocks, num_classes=10): - super(SENet, self).__init__() - self.in_planes = 64 - - self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(64) - self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) - self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) - self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) - self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) - self.linear = nn.Linear(512, num_classes) - - def _make_layer(self, block, planes, num_blocks, stride): - strides = [stride] + [1]*(num_blocks-1) - layers = [] - for stride in strides: - layers.append(block(self.in_planes, planes, stride)) - self.in_planes = planes - return nn.Sequential(*layers) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = self.layer1(out) - out = self.layer2(out) - out = self.layer3(out) - out = self.layer4(out) - out = F.avg_pool2d(out, 4) - out = out.view(out.size(0), -1) - out = self.linear(out) - return out - - -def SENet18(): - return SENet(PreActBlock, [2,2,2,2]) - - -def test(): - net = SENet18() - y = net(torch.randn(1,3,32,32)) - print(y.size()) - -# test() diff --git a/examples/python/pytorch/models/shufflenet.py b/examples/python/pytorch/models/shufflenet.py deleted file mode 100644 index 3682fd3b1..000000000 --- a/examples/python/pytorch/models/shufflenet.py +++ /dev/null @@ -1,109 +0,0 @@ -'''ShuffleNet in PyTorch. - -See the paper "ShuffleNet: An Extremely Efficient Convolutional Neural Network for Mobile Devices" for more details. 
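The channel shuffle below interleaves the g groups: with C=6 and g=2, channels
[0,1,2,3,4,5] come out as [0,3,1,4,2,5]. A quick sanity check of that order
(illustrative snippet, not part of the original file):

    >>> import torch
    >>> x = torch.arange(6).view(1, 2, 3, 1, 1)   # [N, g, C//g, H, W]
    >>> x.permute(0, 2, 1, 3, 4).reshape(-1).tolist()
    [0, 3, 1, 4, 2, 5]

Note the Python-2 style divisions (`C/g`, `out_planes/4`): under Python 3 they
yield floats and would need `//`.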
-''' -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class ShuffleBlock(nn.Module): - def __init__(self, groups): - super(ShuffleBlock, self).__init__() - self.groups = groups - - def forward(self, x): - '''Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]''' - N,C,H,W = x.size() - g = self.groups - return x.view(N,g,C/g,H,W).permute(0,2,1,3,4).contiguous().view(N,C,H,W) - - -class Bottleneck(nn.Module): - def __init__(self, in_planes, out_planes, stride, groups): - super(Bottleneck, self).__init__() - self.stride = stride - - mid_planes = out_planes/4 - g = 1 if in_planes==24 else groups - self.conv1 = nn.Conv2d(in_planes, mid_planes, kernel_size=1, groups=g, bias=False) - self.bn1 = nn.BatchNorm2d(mid_planes) - self.shuffle1 = ShuffleBlock(groups=g) - self.conv2 = nn.Conv2d(mid_planes, mid_planes, kernel_size=3, stride=stride, padding=1, groups=mid_planes, bias=False) - self.bn2 = nn.BatchNorm2d(mid_planes) - self.conv3 = nn.Conv2d(mid_planes, out_planes, kernel_size=1, groups=groups, bias=False) - self.bn3 = nn.BatchNorm2d(out_planes) - - self.shortcut = nn.Sequential() - if stride == 2: - self.shortcut = nn.Sequential(nn.AvgPool2d(3, stride=2, padding=1)) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = self.shuffle1(out) - out = F.relu(self.bn2(self.conv2(out))) - out = self.bn3(self.conv3(out)) - res = self.shortcut(x) - out = F.relu(torch.cat([out,res], 1)) if self.stride==2 else F.relu(out+res) - return out - - -class ShuffleNet(nn.Module): - def __init__(self, cfg): - super(ShuffleNet, self).__init__() - out_planes = cfg['out_planes'] - num_blocks = cfg['num_blocks'] - groups = cfg['groups'] - - self.conv1 = nn.Conv2d(3, 24, kernel_size=1, bias=False) - self.bn1 = nn.BatchNorm2d(24) - self.in_planes = 24 - self.layer1 = self._make_layer(out_planes[0], num_blocks[0], groups) - self.layer2 = self._make_layer(out_planes[1], num_blocks[1], groups) - self.layer3 = self._make_layer(out_planes[2], num_blocks[2], groups) - self.linear = nn.Linear(out_planes[2], 10) - - def _make_layer(self, out_planes, num_blocks, groups): - layers = [] - for i in range(num_blocks): - stride = 2 if i == 0 else 1 - cat_planes = self.in_planes if i == 0 else 0 - layers.append(Bottleneck(self.in_planes, out_planes-cat_planes, stride=stride, groups=groups)) - self.in_planes = out_planes - return nn.Sequential(*layers) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - out = self.layer1(out) - out = self.layer2(out) - out = self.layer3(out) - out = F.avg_pool2d(out, 4) - out = out.view(out.size(0), -1) - out = self.linear(out) - return out - - -def ShuffleNetG2(): - cfg = { - 'out_planes': [200,400,800], - 'num_blocks': [4,8,4], - 'groups': 2 - } - return ShuffleNet(cfg) - -def ShuffleNetG3(): - cfg = { - 'out_planes': [240,480,960], - 'num_blocks': [4,8,4], - 'groups': 3 - } - return ShuffleNet(cfg) - - -def test(): - net = ShuffleNetG2() - x = torch.randn(1,3,32,32) - y = net(x) - print(y) - -# test() diff --git a/examples/python/pytorch/models/shufflenetv2.py b/examples/python/pytorch/models/shufflenetv2.py deleted file mode 100644 index d24c5dcbb..000000000 --- a/examples/python/pytorch/models/shufflenetv2.py +++ /dev/null @@ -1,162 +0,0 @@ -'''ShuffleNetV2 in PyTorch. - -See the paper "ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" for more details. 
-''' -import torch -import torch.nn as nn -import torch.nn.functional as F - - -class ShuffleBlock(nn.Module): - def __init__(self, groups=2): - super(ShuffleBlock, self).__init__() - self.groups = groups - - def forward(self, x): - '''Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]''' - N, C, H, W = x.size() - g = self.groups - return x.view(N, g, C/g, H, W).permute(0, 2, 1, 3, 4).reshape(N, C, H, W) - - -class SplitBlock(nn.Module): - def __init__(self, ratio): - super(SplitBlock, self).__init__() - self.ratio = ratio - - def forward(self, x): - c = int(x.size(1) * self.ratio) - return x[:, :c, :, :], x[:, c:, :, :] - - -class BasicBlock(nn.Module): - def __init__(self, in_channels, split_ratio=0.5): - super(BasicBlock, self).__init__() - self.split = SplitBlock(split_ratio) - in_channels = int(in_channels * split_ratio) - self.conv1 = nn.Conv2d(in_channels, in_channels, - kernel_size=1, bias=False) - self.bn1 = nn.BatchNorm2d(in_channels) - self.conv2 = nn.Conv2d(in_channels, in_channels, - kernel_size=3, stride=1, padding=1, groups=in_channels, bias=False) - self.bn2 = nn.BatchNorm2d(in_channels) - self.conv3 = nn.Conv2d(in_channels, in_channels, - kernel_size=1, bias=False) - self.bn3 = nn.BatchNorm2d(in_channels) - self.shuffle = ShuffleBlock() - - def forward(self, x): - x1, x2 = self.split(x) - out = F.relu(self.bn1(self.conv1(x2))) - out = self.bn2(self.conv2(out)) - out = F.relu(self.bn3(self.conv3(out))) - out = torch.cat([x1, out], 1) - out = self.shuffle(out) - return out - - -class DownBlock(nn.Module): - def __init__(self, in_channels, out_channels): - super(DownBlock, self).__init__() - mid_channels = out_channels // 2 - # left - self.conv1 = nn.Conv2d(in_channels, in_channels, - kernel_size=3, stride=2, padding=1, groups=in_channels, bias=False) - self.bn1 = nn.BatchNorm2d(in_channels) - self.conv2 = nn.Conv2d(in_channels, mid_channels, - kernel_size=1, bias=False) - self.bn2 = nn.BatchNorm2d(mid_channels) - # right - self.conv3 = nn.Conv2d(in_channels, mid_channels, - kernel_size=1, bias=False) - self.bn3 = nn.BatchNorm2d(mid_channels) - self.conv4 = nn.Conv2d(mid_channels, mid_channels, - kernel_size=3, stride=2, padding=1, groups=mid_channels, bias=False) - self.bn4 = nn.BatchNorm2d(mid_channels) - self.conv5 = nn.Conv2d(mid_channels, mid_channels, - kernel_size=1, bias=False) - self.bn5 = nn.BatchNorm2d(mid_channels) - - self.shuffle = ShuffleBlock() - - def forward(self, x): - # left - out1 = self.bn1(self.conv1(x)) - out1 = F.relu(self.bn2(self.conv2(out1))) - # right - out2 = F.relu(self.bn3(self.conv3(x))) - out2 = self.bn4(self.conv4(out2)) - out2 = F.relu(self.bn5(self.conv5(out2))) - # concat - out = torch.cat([out1, out2], 1) - out = self.shuffle(out) - return out - - -class ShuffleNetV2(nn.Module): - def __init__(self, net_size): - super(ShuffleNetV2, self).__init__() - out_channels = configs[net_size]['out_channels'] - num_blocks = configs[net_size]['num_blocks'] - - self.conv1 = nn.Conv2d(3, 24, kernel_size=3, - stride=1, padding=1, bias=False) - self.bn1 = nn.BatchNorm2d(24) - self.in_channels = 24 - self.layer1 = self._make_layer(out_channels[0], num_blocks[0]) - self.layer2 = self._make_layer(out_channels[1], num_blocks[1]) - self.layer3 = self._make_layer(out_channels[2], num_blocks[2]) - self.conv2 = nn.Conv2d(out_channels[2], out_channels[3], - kernel_size=1, stride=1, padding=0, bias=False) - self.bn2 = nn.BatchNorm2d(out_channels[3]) - self.linear = nn.Linear(out_channels[3], 10) - - def _make_layer(self, out_channels, 
num_blocks): - layers = [DownBlock(self.in_channels, out_channels)] - for i in range(num_blocks): - layers.append(BasicBlock(out_channels)) - self.in_channels = out_channels - return nn.Sequential(*layers) - - def forward(self, x): - out = F.relu(self.bn1(self.conv1(x))) - # out = F.max_pool2d(out, 3, stride=2, padding=1) - out = self.layer1(out) - out = self.layer2(out) - out = self.layer3(out) - out = F.relu(self.bn2(self.conv2(out))) - out = F.avg_pool2d(out, 4) - out = out.view(out.size(0), -1) - out = self.linear(out) - return out - - -configs = { - 0.5: { - 'out_channels': (48, 96, 192, 1024), - 'num_blocks': (3, 7, 3) - }, - - 1: { - 'out_channels': (116, 232, 464, 1024), - 'num_blocks': (3, 7, 3) - }, - 1.5: { - 'out_channels': (176, 352, 704, 1024), - 'num_blocks': (3, 7, 3) - }, - 2: { - 'out_channels': (224, 488, 976, 2048), - 'num_blocks': (3, 7, 3) - } -} - - -def test(): - net = ShuffleNetV2(net_size=0.5) - x = torch.randn(3, 3, 32, 32) - y = net(x) - print(y.shape) - - -# test() diff --git a/examples/python/pytorch/models/vgg.py b/examples/python/pytorch/models/vgg.py deleted file mode 100644 index cb2b3a3ae..000000000 --- a/examples/python/pytorch/models/vgg.py +++ /dev/null @@ -1,47 +0,0 @@ -'''VGG11/13/16/19 in Pytorch.''' -import torch -import torch.nn as nn -import triton - -cfg = { - 'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], - 'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], - 'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'], - 'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'], -} - - -class VGG(nn.Module): - def __init__(self, vgg_name): - super(VGG, self).__init__() - self.features = self._make_layers(cfg[vgg_name]) - self.classifier = nn.Linear(512, 10) - - def forward(self, x): - out = self.features(x) - out = out.view(out.size(0), -1) - out = self.classifier(out) - return out - - def _make_layers(self, cfg): - layers = [] - in_channels = 3 - for x in cfg: - if x == 'M': - layers += [nn.MaxPool2d(kernel_size=2, stride=2)] - else: - layers += [triton.Conv2d(in_channels, x, kernel_size=3, padding=1), - nn.BatchNorm2d(x), - nn.ReLU(inplace=True)] - in_channels = x - layers += [nn.AvgPool2d(kernel_size=1, stride=1)] - return nn.Sequential(*layers) - - -def test(): - net = VGG('VGG11') - x = torch.randn(2,3,32,32) - y = net(x) - print(y.size()) - -# test() diff --git a/examples/python/pytorch/shift.cpp b/examples/python/pytorch/shift.cpp new file mode 100644 index 000000000..0bf5340c7 --- /dev/null +++ b/examples/python/pytorch/shift.cpp @@ -0,0 +1,114 @@ +#include +#include +#include +#include +#include "ATen/cuda/CUDAContext.h" +#include "triton/runtime/jit.h" +#include "triton/driver/stream.h" +#include "triton/dnn/shift.h" +#include "triton/tools/bench.hpp" + +#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + +typedef std::tuple shift_key_t; + +static std::map> m_shift_stream; +static std::map> m_shift_jit; +static std::map> m_shift_config; + +torch::Tensor shift_common( + int32_t B, int32_t C, int32_t D, int32_t H, int32_t W, + int32_t T, int32_t R, int32_t S, int32_t F, + std::vector shift_h, std::vector shift_w, + triton::dnn::shift::type ty, + torch::Tensor torcha, torch::Tensor torchb, torch::Tensor torchbias, + 
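  /* Caching note: shift_common keeps one triton::driver::stream per CUstream
     and, per problem key, one triton::dnn::shift configuration plus one
     triton::jit, so the compile/autotune path below only runs the first time
     a given shape/type combination is seen. */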
bool autotune = false + ) { + + // Wrap CUDA handles + c10::DeviceIndex device = torcha.storage().device().index(); + + // Get stream + CUstream custream = (CUstream)at::cuda::getCurrentCUDAStream(device).stream(); + triton::driver::stream* stream; + if(m_shift_stream.find(custream) == m_shift_stream.end()) + stream = m_shift_stream.emplace(custream, new triton::driver::cu_stream(custream, false)).first->second.get(); + else + stream = m_shift_stream.at(custream).get(); + + // Get context + triton::driver::context* ctx = stream->context(); + + // Get configuration + bool has_bias = torchbias.storage().size() > 0; + shift_key_t key = {B, C, D, H, W, T, R, S, F, shift_h.data(), shift_w.data(), ty, has_bias}; + triton::dnn::shift* configuration; + if(m_shift_config.find(key) == m_shift_config.end()) + configuration = m_shift_config.emplace(key, new triton::dnn::shift( + B, C, D, H, W, T, R, S, F, + shift_h, shift_w, "fp32", "fp32", + ty, has_bias)).first->second.get(); + else + configuration = m_shift_config.at(key).get(); + + // Bind memory + triton::driver::cu_buffer a(ctx, (CUdeviceptr)torcha.storage().data(), false); + triton::driver::cu_buffer b(ctx, (CUdeviceptr)torchb.storage().data(), false); + triton::driver::cu_buffer cubias(ctx, (CUdeviceptr)torchbias.storage().data(), false); + triton::driver::buffer* bias = has_bias ? &cubias : nullptr; + + // Allocate output + std::vector c_shapes = configuration->c_shapes(); + torch::Tensor torchc = torch::empty({c_shapes[0], c_shapes[1], c_shapes[2], c_shapes[3]}).cuda(); + triton::driver::cu_buffer c(ctx, (CUdeviceptr)torchc.storage().data(), false); + + // Get JIT + triton::jit* jit; + if(m_shift_jit.find(key) == m_shift_jit.end()){ + jit = m_shift_jit.emplace(key, new triton::jit(ctx)).first->second.get(); + std::ostringstream oss; + configuration->src(oss); + std::string src = oss.str(); + // benchmark a given shiftolution kernel + auto benchmark = [&](triton::driver::kernel* kernel, + triton::jit::launch_information info) { + configuration->init(stream, (triton::driver::cu_module*)kernel->module()); + unsigned TM = info.global_range_size[0]; + unsigned TN = info.global_range_size[1]; + unsigned nthreads = info.num_threads; + configuration->enqueue(stream, kernel, &a, &b, &c, TM, TN, nthreads); + stream->synchronize(); + double ts = triton::tools::bench([&](){ configuration->enqueue(stream, kernel, &a, &b, &c, TM, TN, nthreads); }, + [&](){ stream->synchronize(); }, stream->context()->device()); + return configuration->get_nflops() / ts * 1e-3; + }; + // auto-tune and save result + if(autotune) { + triton::jit::tune_res_t best = jit->autotune("shift", src.c_str(), benchmark); + jit->add_module("shift", src.c_str(), best.params); + } + else { + jit->add_module("shift", src.c_str(), jit->get_valid("shift", src.c_str())); + } + triton::driver::kernel* kernel = jit->get_function("shift"); + configuration->init(stream, (triton::driver::cu_module*)kernel->module()); + } + else + jit = m_shift_jit.at(key).get(); + + // Run + triton::driver::kernel* kernel = jit->get_function("shift"); + triton::jit::launch_information info = jit->get_launch_info("shift"); + // launch info + unsigned TM = info.global_range_size[0]; + unsigned TN = info.global_range_size[1]; + unsigned nthreads = info.num_threads; + // enqueue + configuration->enqueue(stream, kernel, &a, &b, &c, TM, TN, nthreads); + return torchc; +} diff --git a/examples/python/pytorch/utils.py b/examples/python/pytorch/utils.py deleted file mode 100644 index 4c9b3f90c..000000000 --- 
a/examples/python/pytorch/utils.py +++ /dev/null @@ -1,124 +0,0 @@ -'''Some helper functions for PyTorch, including: - - get_mean_and_std: calculate the mean and std value of dataset. - - msr_init: net parameter initialization. - - progress_bar: progress bar mimic xlua.progress. -''' -import os -import sys -import time -import math - -import torch.nn as nn -import torch.nn.init as init - - -def get_mean_and_std(dataset): - '''Compute the mean and std value of dataset.''' - dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True, num_workers=2) - mean = torch.zeros(3) - std = torch.zeros(3) - print('==> Computing mean and std..') - for inputs, targets in dataloader: - for i in range(3): - mean[i] += inputs[:,i,:,:].mean() - std[i] += inputs[:,i,:,:].std() - mean.div_(len(dataset)) - std.div_(len(dataset)) - return mean, std - -def init_params(net): - '''Init layer parameters.''' - for m in net.modules(): - if isinstance(m, nn.Conv2d): - init.kaiming_normal(m.weight, mode='fan_out') - if m.bias: - init.constant(m.bias, 0) - elif isinstance(m, nn.BatchNorm2d): - init.constant(m.weight, 1) - init.constant(m.bias, 0) - elif isinstance(m, nn.Linear): - init.normal(m.weight, std=1e-3) - if m.bias: - init.constant(m.bias, 0) - - -_, term_width = os.popen('stty size', 'r').read().split() -term_width = int(term_width) - -TOTAL_BAR_LENGTH = 65. -last_time = time.time() -begin_time = last_time -def progress_bar(current, total, msg=None): - global last_time, begin_time - if current == 0: - begin_time = time.time() # Reset for new bar. - - cur_len = int(TOTAL_BAR_LENGTH*current/total) - rest_len = int(TOTAL_BAR_LENGTH - cur_len) - 1 - - sys.stdout.write(' [') - for i in range(cur_len): - sys.stdout.write('=') - sys.stdout.write('>') - for i in range(rest_len): - sys.stdout.write('.') - sys.stdout.write(']') - - cur_time = time.time() - step_time = cur_time - last_time - last_time = cur_time - tot_time = cur_time - begin_time - - L = [] - L.append(' Step: %s' % format_time(step_time)) - L.append(' | Tot: %s' % format_time(tot_time)) - if msg: - L.append(' | ' + msg) - - msg = ''.join(L) - sys.stdout.write(msg) - for i in range(term_width-int(TOTAL_BAR_LENGTH)-len(msg)-3): - sys.stdout.write(' ') - - # Go back to the center of the bar. 
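    # (Done by emitting backspaces, then overwriting with ' current/total '.)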
- for i in range(term_width-int(TOTAL_BAR_LENGTH/2)+2): - sys.stdout.write('\b') - sys.stdout.write(' %d/%d ' % (current+1, total)) - - if current < total-1: - sys.stdout.write('\r') - else: - sys.stdout.write('\n') - sys.stdout.flush() - -def format_time(seconds): - days = int(seconds / 3600/24) - seconds = seconds - days*3600*24 - hours = int(seconds / 3600) - seconds = seconds - hours*3600 - minutes = int(seconds / 60) - seconds = seconds - minutes*60 - secondsf = int(seconds) - seconds = seconds - secondsf - millis = int(seconds*1000) - - f = '' - i = 1 - if days > 0: - f += str(days) + 'D' - i += 1 - if hours > 0 and i <= 2: - f += str(hours) + 'h' - i += 1 - if minutes > 0 and i <= 2: - f += str(minutes) + 'm' - i += 1 - if secondsf > 0 and i <= 2: - f += str(secondsf) + 's' - i += 1 - if millis > 0 and i <= 2: - f += str(millis) + 'ms' - i += 1 - if f == '': - f = '0ms' - return f diff --git a/examples/python/tensorflow/CMakeLists.txt b/examples/python/tensorflow/CMakeLists.txt index 08ba828e3..5c151f19b 100644 --- a/examples/python/tensorflow/CMakeLists.txt +++ b/examples/python/tensorflow/CMakeLists.txt @@ -5,7 +5,7 @@ if(${TensorFlow_FOUND}) include_directories("${CUDA_HOME}/include") link_directories(${TF_LIB}) add_definitions(-D_GLIBCXX_USE_CXX11_ABI=${TF_ABI}) - add_library(tf_blocksparse SHARED dot.cpp conv2d.cpp shift.cpp) + add_library(tf_blocksparse SHARED dot.cpp conv.cpp shift.cpp batchnorm.cpp) target_link_libraries(tf_blocksparse tensorflow_framework triton) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/run.py ${CMAKE_CURRENT_BINARY_DIR}/run.py diff --git a/examples/python/tensorflow/batchnorm.cpp b/examples/python/tensorflow/batchnorm.cpp new file mode 100644 index 000000000..677168d08 --- /dev/null +++ b/examples/python/tensorflow/batchnorm.cpp @@ -0,0 +1,174 @@ +#include + +#include "triton/driver/buffer.h" +#include "triton/driver/backend.h" +#include "triton/driver/stream.h" +#include "triton/runtime/jit.h" +#include "triton/tools/bench.hpp" +#include "triton/dnn/batchnorm.h" + +#define EIGEN_USE_GPU +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/util/tensor_format.h" +#include "tensorflow/core/framework/common_shape_fns.h" + +using namespace tensorflow; +using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; +using GPUDevice = Eigen::GpuDevice; + +class BatchnormForwardOp : public OpKernel { +public: + explicit BatchnormForwardOp(OpKernelConstruction* context): OpKernel(context) { + context->GetAttr("eps", &eps_); + } + + void Compute(OpKernelContext* context){ + // get device/stream + GPUDevice device = context->eigen_device(); + triton::driver::cu_stream sstream(device.stream(), false); + triton::driver::context* ctx = sstream.context(); + triton::driver::stream* stream = &sstream; + // get inputs + const Tensor& x = context->input(0); + const Tensor& g = context->input(1); + const Tensor& b = context->input(2); + // get sizes + int C = x.dim_size(0); + int H = x.dim_size(1); + int W = x.dim_size(2); + int B = x.dim_size(3); + // allocate outputs + Tensor* y = nullptr; + Tensor* m = nullptr; + Tensor* v = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, x.shape(), &y)); + OP_REQUIRES_OK(context, context->allocate_output(1, g.shape(), &m)); + OP_REQUIRES_OK(context, 
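                    /* y matches x's [C, H, W, B] layout; the mean and variance
                       outputs take g's per-channel shape [C], one statistic
                       per channel reduced over H*W*B. */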
context->allocate_output(2, g.shape(), &v)); + // triton handles + triton::driver::cu_buffer tx(ctx, (CUdeviceptr)x.flat().data(), false); + triton::driver::cu_buffer tg(ctx, (CUdeviceptr)g.flat().data(), false); + triton::driver::cu_buffer tb(ctx, (CUdeviceptr)b.flat().data(), false); + triton::driver::cu_buffer ty(ctx, (CUdeviceptr)y->flat().data(), false); + triton::driver::cu_buffer tm(ctx, (CUdeviceptr)m->flat().data(), false); + triton::driver::cu_buffer tv(ctx, (CUdeviceptr)v->flat().data(), false); + // create config + triton::dnn::batchnorm_forward batchnorm(C, 1, H, W, B, "fp32"); + std::ostringstream oss; + batchnorm.src(oss); + std::string src = oss.str(); + triton::jit jit(ctx); + jit.add_module("batchnorm", src.c_str(), jit.get_valid("batchnorm", src.c_str())); + triton::driver::kernel* kernel = jit.get_function("batchnorm"); + size_t TM = jit.get_int("TM"); + triton::jit::launch_information info = jit.get_launch_info("batchnorm"); + batchnorm.enqueue(stream, kernel, &ty, &tm, &tv, &tx, &tg, &tb, TM, info.num_threads); + } + +private: + float eps_; +}; + + +REGISTER_KERNEL_BUILDER(Name("BatchnormForward").Device(DEVICE_GPU), BatchnormForwardOp); +REGISTER_OP("BatchnormForward") + .Input("x: T") + .Input("g: float") + .Input("b: float") + .Output("y: T") + .Output("m: float") + .Output("v: float") + .Attr("T: {float}") + .Attr("eps: float") + .SetShapeFn([](InferenceContext* ctx) { + ctx->set_output(0, ctx->input(0)); + ctx->set_output(1, ctx->input(1)); + ctx->set_output(2, ctx->input(1)); + return Status::OK(); + }) +; + + +class BatchnormBackwardOp : public OpKernel { +public: + explicit BatchnormBackwardOp(OpKernelConstruction* context): OpKernel(context) { + context->GetAttr("eps", &eps_); + } + + void Compute(OpKernelContext* context){ + // get device/stream + GPUDevice device = context->eigen_device(); + triton::driver::cu_stream sstream(device.stream(), false); + triton::driver::context* ctx = sstream.context(); + triton::driver::stream* stream = &sstream; + // get inputs + const Tensor& dy = context->input(0); + const Tensor& x = context->input(1); + const Tensor& g = context->input(2); + const Tensor& m = context->input(3); + const Tensor& v = context->input(4); + // get sizes + int C = x.dim_size(0); + int H = x.dim_size(1); + int W = x.dim_size(2); + int B = x.dim_size(3); + // allocate outputs + Tensor* dx = nullptr; + Tensor* dg = nullptr; + Tensor* db = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, x.shape(), &dx)); + OP_REQUIRES_OK(context, context->allocate_output(1, g.shape(), &dg)); + OP_REQUIRES_OK(context, context->allocate_output(2, g.shape(), &db)); + // triton handles + triton::driver::cu_buffer tdy(ctx, (CUdeviceptr)dy.flat().data(), false); + triton::driver::cu_buffer tx(ctx, (CUdeviceptr)x.flat().data(), false); + triton::driver::cu_buffer tg(ctx, (CUdeviceptr)g.flat().data(), false); + triton::driver::cu_buffer tm(ctx, (CUdeviceptr)m.flat().data(), false); + triton::driver::cu_buffer tv(ctx, (CUdeviceptr)v.flat().data(), false); + triton::driver::cu_buffer tdx(ctx, (CUdeviceptr)dx->flat().data(), false); + triton::driver::cu_buffer tdg(ctx, (CUdeviceptr)dg->flat().data(), false); + triton::driver::cu_buffer tdb(ctx, (CUdeviceptr)db->flat().data(), false); + + // create config + triton::dnn::batchnorm_backward batchnorm(C, 1, H, W, B, "fp32"); + std::ostringstream oss; + batchnorm.src(oss); + std::string src = oss.str(); + triton::jit jit(ctx); + jit.add_module("batchnorm", src.c_str(), jit.get_valid("batchnorm", src.c_str())); + 
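    // JIT flow: render the Triton-C source for this problem size, compile it
    // into the module as "batchnorm" (get_valid appears to pick workable tile
    // parameters instead of autotuning), then fetch the kernel and launch it
    // with the TM / num_threads reported by the launch info.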
triton::driver::kernel* kernel = jit.get_function("batchnorm"); + size_t TM = jit.get_int("TM"); + triton::jit::launch_information info = jit.get_launch_info("batchnorm"); + batchnorm.enqueue(stream, kernel, &tdx, &tdg, &tdb, &tdy, &tx, &tg, &tm, &tv, TM, info.num_threads); + } + +private: + float eps_; +}; + + +REGISTER_KERNEL_BUILDER(Name("BatchnormBackward").Device(DEVICE_GPU), BatchnormBackwardOp); +REGISTER_OP("BatchnormBackward") + .Input("dy: TY") + .Input("x: TX") + .Input("g: float") + .Input("m: float") + .Input("v: float") + .Output("dx: TY") + .Output("dg: float") + .Output("db: float") + .Attr("TX: {float}") + .Attr("TY: {float}") + .Attr("eps: float") + .SetShapeFn([](InferenceContext* ctx) { + ctx->set_output(0, ctx->input(1)); + ctx->set_output(1, ctx->input(2)); + ctx->set_output(2, ctx->input(2)); + return Status::OK(); + }) +; diff --git a/examples/python/tensorflow/conv2d.cpp b/examples/python/tensorflow/conv.cpp similarity index 96% rename from examples/python/tensorflow/conv2d.cpp rename to examples/python/tensorflow/conv.cpp index 12b033f21..ff81e3d31 100644 --- a/examples/python/tensorflow/conv2d.cpp +++ b/examples/python/tensorflow/conv.cpp @@ -65,8 +65,6 @@ public: // Bind memory triton::driver::cu_buffer a(ctx, (CUdeviceptr)tfa.flat().data(), false); triton::driver::cu_buffer b(ctx, (CUdeviceptr)tfb.flat().data(), false); -// triton::driver::cu_buffer cubias(ctx, (CUdeviceptr)torchbias.storage().data(), false); -// triton::driver::buffer* bias = has_bias ? &cubias : nullptr; triton::driver::buffer* bias = nullptr; // allocate output diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 385a904c0..2de35d7d6 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -56,8 +56,8 @@ def blocksparse_matmul_grad(op, dy): return (dx, dw) def run_shift(): - B, C, H, W = 1, 32, 8, 6 - R, S, F = 3, 3, 16 + B, C, H, W = 16, 1024, 8, 8 + R, S, F = 3, 3, 1024 np.random.seed(2) a = tf.placeholder(tf.float32, shape=[C, H, W, B]) b = tf.placeholder(tf.float32, shape=[C, F]) @@ -65,8 +65,6 @@ def run_shift(): hshift_w = np.random.randint(- (S//2), R//2 + 1, size=C, dtype=np.int32) #hshift_h = np.ones(C, dtype=np.int32) #hshift_w = np.ones(C, dtype=np.int32) - print(hshift_h) - print(hshift_w) c = module.shift_conv(a, b, shift_h=tf.make_tensor_proto(hshift_h), shift_w=tf.make_tensor_proto(hshift_w)) # Reference ha = np.random.rand(C, H, W, B) @@ -74,16 +72,36 @@ def run_shift(): #ha = np.ones((C, H, W, B), dtype=np.int32) #hb = np.ones((C, F), dtype=np.int32) sess = tf.InteractiveSession() - grads = tf.test.compute_gradient([a, b], [(C, H, W, B), (C, F)], c, (F, H, W, B), - extra_feed_dict={a: ha, b: hb}) - dw_t, dw_n = grads[1] - dx_t, dx_n = grads[0] - print(np.max(np.abs(dw_t - dw_n))) - print(np.max(np.abs(dx_t - dx_n))) + #grads = tf.test.compute_gradient([a, b], [(C, H, W, B), (C, F)], c, (F, H, W, B), + # extra_feed_dict = {a: ha, b: hb}) + #dw_t, dw_n = grads[1] + #dx_t, dx_n = grads[0] + #print(np.max(np.abs(dw_t - dw_n))) + #print(np.max(np.abs(dx_t - dx_n))) # Run sess.run(tf.global_variables_initializer()) result = sess.run([c], feed_dict = {a: ha, b: hb})[0] #print(result) -run_shift() +def run_batchnorm(): + C, H, W, B = 32, 16, 16, 16 + np.random.seed(0) + # Placeholders + x = tf.placeholder(tf.float32, shape=[C, H, W, B]) + g = tf.placeholder(tf.float32, shape=[C]) + b = tf.placeholder(tf.float32, shape=[C]) + # Feed values + hx = np.random.rand(C, H, W, B) + hg = np.random.rand(C) + hb = 
np.random.rand(C) + # batchnorm + y, m, v = module.batchnorm_forward(x, g, b, eps=1e-5) + # Run + sess = tf.InteractiveSession() + sess.run(tf.global_variables_initializer()) + result = sess.run([y, m, v], feed_dict = {x: hx, g: hg, b: hb}) + print(hx.sum(axis=(1,2,3))) + print(result[1]) + +run_batchnorm() diff --git a/examples/python/tensorflow/shift.cpp b/examples/python/tensorflow/shift.cpp index f9c102a6d..6e9abec55 100644 --- a/examples/python/tensorflow/shift.cpp +++ b/examples/python/tensorflow/shift.cpp @@ -125,7 +125,7 @@ public: triton::driver::cu_buffer dc(ctx, (CUdeviceptr)tf_c->flat().data(), false); // get JIT triton::jit* jit; - bool autotune = false; + bool autotune = true; if(m_jit.find(key) == m_jit.end()) { jit = m_jit.emplace(key, new triton::jit(ctx)).first->second.get(); std::ostringstream oss; diff --git a/include/triton/dnn/batchnorm.h b/include/triton/dnn/batchnorm.h new file mode 100644 index 000000000..a61178500 --- /dev/null +++ b/include/triton/dnn/batchnorm.h @@ -0,0 +1,83 @@ +/* Copyright 2015-2019 Philippe Tillet +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, +* subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +#ifndef TDL_INCLUDE_DNN_BATCHNORM_H +#define TDL_INCLUDE_DNN_BATCHNORM_H + +#include +#include +#include +#include +#include +#include "triton/driver/stream.h" +#include "triton/driver/kernel.h" + +namespace triton{ +namespace dnn{ + +class batchnorm_forward { +public: + // constructor + batchnorm_forward(int C, int D, int H, int W, int B, std::string ty = "fp32"); + // enqueue + void enqueue(driver::stream *stream, driver::kernel *kernel, + driver::buffer *y, driver::buffer *m, driver::buffer *v, + driver::buffer *x, driver::buffer *g, driver::buffer *b, + size_t TM, size_t nthreads); + // triton-c source code + void src(std::ostream &os); + +private: + int32_t C_; + int32_t D_; + int32_t H_; + int32_t W_; + int32_t B_; + std::string ty_; +}; + +class batchnorm_backward { +public: + // constructor + batchnorm_backward(int C, int D, int H, int W, int B, std::string ty = "fp32"); + // enqueue + void enqueue(driver::stream *stream, driver::kernel *kernel, + driver::buffer *dx, driver::buffer *dg, driver::buffer *db, driver::buffer *dy, + driver::buffer *x, driver::buffer *g, driver::buffer *m, driver::buffer *v, + size_t TM, size_t nthreads); + // triton-c source code + void src(std::ostream &os); + + +private: + int32_t C_; + int32_t D_; + int32_t H_; + int32_t W_; + int32_t B_; + std::string ty_; +}; + +} +} + +#endif diff --git a/include/triton/ir/builder.h b/include/triton/ir/builder.h index fbea4ec27..ea56a8d9c 100644 --- a/include/triton/ir/builder.h +++ b/include/triton/ir/builder.h @@ -131,6 +131,7 @@ public: value *create_atomic_add(value *ptr, value *val, const std::string &name = ""); value *create_dot(value *A, value *B, value *C, const std::string &name = ""); value *create_trans(value *A, const std::string &name = ""); + value *create_reduce(value *A, const std::string &name = ""); value *create_select(value *pred, value *if_value, value *else_value, const std::string &name = ""); // Intrinsics value *create_copy_to_shared(value *arg, const std::string &name = ""); diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index 95ecd9c70..9a8aa2f0b 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -581,6 +581,15 @@ public: static instruction* create(value *arg, const std::string &name = "", instruction *next = nullptr); }; +class reduce_inst: public builtin_inst { +private: + reduce_inst(value* arg, const std::string& name, instruction* next); + std::string repr_impl() const { return "reduce"; } + +public: + static instruction* create(value *arg, const std::string &name = "", instruction *next = nullptr); +}; + class select_inst: public builtin_inst { private: select_inst(value *pred, value *if_value, value *else_value, const std::string& name, instruction* next); diff --git a/include/triton/lang/expression.h b/include/triton/lang/expression.h index 420e902a8..40e03f84d 100644 --- a/include/triton/lang/expression.h +++ b/include/triton/lang/expression.h @@ -167,6 +167,14 @@ private: node* arg_; }; +class reduce_expression: public builtin_expression{ +public: + reduce_expression(node *arg): arg_(arg) {} + ir::value* codegen(ir::module *mod) const; + +private: + node* arg_; +}; class indexing_expression: public postfix_expression{ public: diff --git a/include/triton/lang/parser.y b/include/triton/lang/parser.y index 6acc128e5..9296cd52f 100644 --- a/include/triton/lang/parser.y +++ b/include/triton/lang/parser.y @@ -55,7 +55,7 @@ STORAGE_SPEC_T get_storage_spec(node *op) { return ((token*)op)->storage_spec;} 
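/* A new builtin threads through five layers, all touched here: scanner.l maps
   "__sum" to REDUCE_SUM, parser.y adds a builtin_expression rule for it,
   lang/expression.cpp codegens it via builder::create_reduce,
   ir/instructions.cpp defines reduce_inst, and codegen/selection.cpp lowers
   it to warp shuffles plus shared memory. */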
%token VOID UINT1 UINT8 UINT16 UINT32 UINT64 INT1 INT8 INT16 INT32 INT64 FP16 FP32 FP64 %token IF ELSE FOR CONTINUE WHILE %token NEWAXIS ELLIPSIS AT -%token GET_GLOBAL_RANGE GET_RANGE_ID DOT TRANS MAX MIN SELECT ATOMIC_CAS ATOMIC_EXCHG ATOMIC_ADD ALLOC_CONST +%token GET_GLOBAL_RANGE GET_RANGE_ID DOT REDUCE_SUM TRANS MAX MIN SELECT ATOMIC_CAS ATOMIC_EXCHG ATOMIC_ADD ALLOC_CONST %start translation_unit %% @@ -125,6 +125,7 @@ builtin_expression | DOT '(' expression ',' expression ',' expression ')' { $$ = new matmul_expression($3, $5, $7); } | ALLOC_CONST type_specifier '[' constant ']' { $$ = new alloc_const_expression(new typed_declaration_specifier(get_type_spec($2)), $4); } | TRANS '(' expression ')' { $$ = new trans_expression($3); } + | REDUCE_SUM '(' expression ')' { $$ = new reduce_expression($3);} | MAX '(' expression ',' expression ')' { $$ = new max_expression($3, $5); } | MIN '(' expression ',' expression ')' { $$ = new min_expression($3, $5); } | SELECT '(' expression ',' expression ',' expression ')' { $$ = new select_expression($3, $5, $7); } diff --git a/include/triton/lang/scanner.l b/include/triton/lang/scanner.l index 24385659d..68c38d2f0 100644 --- a/include/triton/lang/scanner.l +++ b/include/triton/lang/scanner.l @@ -49,6 +49,7 @@ using triton::lang::return_void; "__atomic_cas" { return return_impl(ATOMIC_CAS, yytext); } "__atomic_exchg" { return return_impl(ATOMIC_EXCHG, yytext); } "__atomic_add" { return return_impl(ATOMIC_ADD, yytext); } +"__sum" { return return_impl(REDUCE_SUM, yytext); } "dot" { return return_impl(DOT, yytext); } "max" { return return_impl(MAX, yytext); } "min" { return return_impl(MIN, yytext); } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 3f0c02d2a..cd40f7912 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -773,6 +773,62 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & vmap_[x] = tmap_[x->get_operand(0)]->get_value({builder.getInt32(0)}); return; } + if(auto *x = dynamic_cast(ins)){ + Value *partial = nullptr; + distributed_tile* op = (distributed_tile*)tmap_.at(ins->get_operand(0)); + // reduce within thread + op->for_each([&](indices_t idx){ + Value *current = op->get_value(idx); + if(partial == nullptr) + partial = current; + else + partial = builder.CreateFAdd(partial, current); + }); + // reduce within warp + Value *shfl = Intrinsic::getDeclaration(builder.GetInsertBlock()->getModule(), Intrinsic::nvvm_shfl_sync_bfly_f32); + for (int i = 16; i > 0; i >>= 1){ + Value *rhs = builder.CreateCall(shfl, {builder.getInt32(0x1f), partial, + builder.getInt32(i), builder.getInt32(0xffffffff)}); + partial = builder.CreateFAdd(partial, rhs); + } + // reduce within block + Value *tid = tgt_->get_local_id(module, builder, 0); + BasicBlock *partial_reduce_do = BasicBlock::Create(ctx, "partial_reduce_do", fn); + BasicBlock *partial_reduce_done = BasicBlock::Create(ctx, "partial_reduce_done", fn); + Value *id_in_warp = builder.CreateURem(tid, builder.getInt32(32)); + Value *warp_id = builder.CreateUDiv(tid, builder.getInt32(32)); + + builder.CreateCondBr(builder.CreateICmpEQ(id_in_warp, builder.getInt32(0)), + partial_reduce_do, partial_reduce_done); + builder.SetInsertPoint(partial_reduce_do); + unsigned addr_space = sh_mem_ptr_->getType()->getPointerAddressSpace(); + Type *ptr_ty = PointerType::get(builder.getFloatTy(), addr_space); + Value *sh_mem_ptr = builder.CreateBitCast(sh_mem_ptr_, ptr_ty); + Value *write_ptr = builder.CreateGEP(sh_mem_ptr, warp_id); + 
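      /* Reduction strategy: the butterfly-shuffle loop above folds the 32
         lane partials into every lane of a warp; lane 0 of each warp then
         stores its sum to shared memory, and after a barrier the first warp
         reloads the per-warp sums and combines them. Two caveats in this
         first cut, as written: the shfl operands above are ordered
         (clamp, value, offset, membermask) rather than
         (membermask, value, offset, clamp), which is corrected later in this
         series, and the final CreateFAdd below never assigns its result back
         to `result`. */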
builder.CreateStore(partial, write_ptr); + builder.CreateBr(partial_reduce_done); + builder.SetInsertPoint(partial_reduce_done); + // Final reduction with the first warp + tgt_->add_barrier(module, builder); + BasicBlock *final_reduce_do = BasicBlock::Create(ctx, "final_reduce_do", fn); + BasicBlock *final_reduce_done = BasicBlock::Create(ctx, "final_reduce_done", fn); + builder.CreateCondBr(builder.CreateICmpEQ(warp_id, builder.getInt32(0)), + final_reduce_do, final_reduce_done); + builder.SetInsertPoint(final_reduce_do); + Value *read_ptr = builder.CreateGEP(sh_mem_ptr, tid); + Value *result = builder.CreateLoad(read_ptr); + for (int i = params_->get_num_threads() / 64; i > 0; i >>= 1){ + Value *rhs = builder.CreateCall(shfl, {result, builder.getInt32(i), + builder.getInt32(0x1f), builder.getInt32(0xffffffff)}); + builder.CreateFAdd(result, rhs); + } + builder.CreateStore(result, read_ptr); + builder.CreateBr(final_reduce_done); + builder.SetInsertPoint(final_reduce_done); + tgt_->add_barrier(module, builder); + vmap_[ins] = builder.CreateLoad(sh_mem_ptr); + return; + } tile *ti = tmap_[ins]; distributed_tile* result = (distributed_tile*)ti; if(!ins->get_type()->is_tile_ty()) diff --git a/lib/codegen/shmem_allocation.cpp b/lib/codegen/shmem_allocation.cpp index 1558c663d..469524b07 100644 --- a/lib/codegen/shmem_allocation.cpp +++ b/lib/codegen/shmem_allocation.cpp @@ -29,6 +29,8 @@ unsigned shmem_allocation::is_ld_padded(ir::value *x) { } unsigned shmem_allocation::get_num_bytes(ir::value *x) { + if(dynamic_cast(x)) + return 32; unsigned result = x->get_type()->get_primitive_size_in_bits() / 8; unsigned pad = is_ld_padded(x); if(pad > 0){ diff --git a/lib/codegen/shmem_info.cpp b/lib/codegen/shmem_info.cpp index 6d3caafab..659afaa4a 100644 --- a/lib/codegen/shmem_info.cpp +++ b/lib/codegen/shmem_info.cpp @@ -40,6 +40,8 @@ inline bool get_is_shared(ir::value* v) { return true; if(auto x = dynamic_cast(v)) return true; + if(auto x = dynamic_cast(v)) + return true; if(auto x = dynamic_cast(v)){ bool res = true; for(unsigned inc = 0; inc < x->get_num_incoming(); inc++) diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 6bdc97759..3b4fc0492 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -60,6 +60,8 @@ void tune::init_c_graph(ir::instruction *v) { shapes = atom->get_operand(0)->get_type()->get_tile_shapes(); else if(auto *downcast = dynamic_cast(v)) return; + else if(auto *reduce = dynamic_cast(v)) + return; else shapes = v->get_type()->get_tile_shapes(); // Reshape diff --git a/lib/dnn/batchnorm.cpp b/lib/dnn/batchnorm.cpp new file mode 100644 index 000000000..db84bf5b9 --- /dev/null +++ b/lib/dnn/batchnorm.cpp @@ -0,0 +1,165 @@ +/* Copyright 2015-2019 Philippe Tillet +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, +* subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +#include "triton/dnn/batchnorm.h" + +namespace triton{ +namespace dnn{ + +/* --------------- + * Forward + * --------------- */ + +batchnorm_forward::batchnorm_forward(int C, int D, int H, int W, int B, std::string ty) + : C_(C), D_(D), H_(H), W_(W), B_(B), ty_(ty) { } + +void batchnorm_forward::enqueue(driver::stream *stream, driver::kernel *kernel, + driver::buffer *y, driver::buffer *m, driver::buffer *v, + driver::buffer *x, driver::buffer *g, driver::buffer *b, + size_t, size_t nthreads) { + + std::array grid = {(size_t)C_, 1, 1}; + kernel->setArg(0, y); + kernel->setArg(1, m); + kernel->setArg(2, v); + kernel->setArg(3, x); + kernel->setArg(4, g); + kernel->setArg(5, b); + kernel->setArg(6, (int32_t)(D_*H_*W_*B_)); + stream->enqueue(kernel, grid, {nthreads, 1, 1}); +} + +void batchnorm_forward::src(std::ostream &os) { + os << +R"( +const tunable int32 TM = {32, 64, 128}; + +void batchnorm(fp32 *Y, fp32 *M, fp32 *V, + restrict read_only fp32 *X, + restrict read_only fp32 *G, + restrict read_only fp32 *B, + int32 DHWN) { + int32 rx[TM] = get_global_range[TM](0); + fp32 *px[TM]; + fp32 x[TM]; + int32 c = get_range_id(0); + fp32 g = *(G + c); + fp32 b = *(B + c); + + fp32 mean[TM] = 0; + px = X + rx + c*DHWN; + for(int32 i = 0; i < DHWN; i = i + TM){ + x = *px; + mean = mean + x; + px = px + TM; + } + fp32 m = __sum(mean); + fp32 *pm = M + c; + *pm = m; + + fp32 var[TM] = 0; + px = X + rx + c*DHWN; + for(int32 i = 0; i < DHWN; i = i + TM){ + x = *px; + x = x - mean; + var = var + x*x; + } + fp32 v = __sum(var); + fp32 *pv = V + c; + *pv = v; +})"; +} + +/* --------------- + * Backward + * --------------- */ + +batchnorm_backward::batchnorm_backward(int C, int D, int H, int W, int B, std::string ty) + : C_(C), D_(D), H_(H), W_(W), B_(B), ty_(ty) +{ } + +void batchnorm_backward::enqueue(driver::stream *stream, driver::kernel *kernel, + driver::buffer *dx, driver::buffer *dg, driver::buffer *db, driver::buffer *dy, + driver::buffer *x, driver::buffer *g, driver::buffer *m, driver::buffer *v, + size_t, size_t nthreads) { + + std::array grid = {(size_t)C_, 1, 1}; + kernel->setArg(0, dx); + kernel->setArg(1, dg); + kernel->setArg(2, db); + kernel->setArg(3, dy); + kernel->setArg(4, x); + kernel->setArg(5, g); + kernel->setArg(6, m); + kernel->setArg(7, v); + kernel->setArg(8, (int32_t)(D_*H_*W_*B_)); + stream->enqueue(kernel, grid, {nthreads, 1, 1}); +} + +void batchnorm_backward::src(std::ostream &os) { + os << +R"( +const tunable int32 TM = {32, 64, 128}; + +void batchnorm(fp32 *DX, fp32 *DG, fp32 *DB, + restrict read_only fp32 *DY, + restrict read_only fp32 *X, + restrict read_only fp32 *G, + restrict read_only fp32 *M, + restrict read_only fp32 *V, + int32 DHWN) { + int32 rx[TM] = get_global_range[TM](0); + int32 c = get_range_id(0); + int32 offset = c*DHWN; + fp32 g = *(G + c); + fp32 mean = *(M + c); + fp32 var = *(V + c); + fp32 rstd = var; + fp32* px[TM]; + fp32* pdx[TM]; + fp32* pdy[TM]; + + px = X + rx + offset; + pdy = DY + rx + offset; + fp32 dg[TM] = 0; + fp32 db[TM] = 0; + for(int32 i = 0; i < DHWN; i += TM){ + fp32 x[TM] = *px; + fp32 dy[TM] = *pdy; + dg = dg + dy*(x - mean)*rstd; + db = db + dy; + } + + px = X + rx + offset; + pdy = DY + rx + offset; + pdx = DX + rx + offset; + for(int32 i = 0; i < DHWN; 
i += TM){ + fp32 xhat[TM] = (x - mean) * rstd; + fp32 xtmp[TM] = (xhat * dg + db) * NDHW; + fp32 dx[TM] = (dy - xtmp) * rstd * g; + *pdx = dx; + } +})"; +} + +} +} diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index b686f9cde..66f3e0c35 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -220,8 +220,10 @@ if(ty_ == WGRAD){ } os << R"( )" << a_ty_ << "* pa[" << AS << "] = a + rxa" << bca1 << lda1 << " + " << rka << bca0 << lda0 << R"(; - )" << a_ty_ << " a[" << AS << R"(] = *pa; - )" << b_ty_ << "* pb[" << BS << "] = b + ryb" << bcb1 << ldb1 << " + " << rkb << bcb0 << ldb0 << ";"; + )" << b_ty_ << "* pb[" << BS << "] = b + ryb" << bcb1 << ldb1 << " + " << rkb << bcb0 << ldb0 << R"(; + int1 checka[)" << AS << "] = (rka < K)" << bca0 << R"(; + int1 checkb[)" << BS << "] = (rkb < K)" << bcb0 << R"(; + )" << a_ty_ << " a[" << AS << R"(] = checka ? *pa : 0;)"; if(ty_ == WGRAD){ os << R"( int32 rbwhc[TK] = rkb / ABS; @@ -233,11 +235,11 @@ if(ty_ == WGRAD){ int1 interior[TK, TN] = interiorh[:, newaxis] && interiorw[:, newaxis]; int32 inc[TK, TN] = interior ? shift : 0; )" << b_ty_ << R"(* shifted_pb[TK, TN] = pb + inc; - )" << b_ty_ << R"( b[TK, TN] = *shifted_pb;)"; + )" << b_ty_ << R"( b[TK, TN] = checkb ? *shifted_pb : 0;)"; } else{ os << R"( - )" << b_ty_ << " b[" << BS << R"(] = *pb;)"; + )" << b_ty_ << " b[" << BS << R"(] = checkb ? *pb : 0;)"; } os << R"( for(int32 k = K; k > 0; k = k - TK){ diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 4ff863666..c3139ece6 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -255,7 +255,7 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ -// std::cout << source << std::endl; + std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index 7110edccf..cf6832958 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -320,6 +320,10 @@ value *builder::create_trans(value *A, const std::string &name) { return insert(trans_inst::create(A, name)); } +value *builder::create_reduce(value *A, const std::string &name) { + return insert(reduce_inst::create(A, name)); +} + value *builder::create_select(value *pred, value *if_value, value *else_value, const std::string &name){ return insert(select_inst::create(pred, if_value, else_value, name)); } diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index d1ae2af25..6607990c4 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -566,6 +566,19 @@ instruction* trans_inst::create(value *arg, const std::string &name, instruction return new trans_inst(arg, name, next); } +//===----------------------------------------------------------------------===// +// reduce instructions +//===----------------------------------------------------------------------===// +reduce_inst::reduce_inst(value *arg, const std::string &name, instruction *next) + : builtin_inst(arg->get_type()->get_scalar_ty(), 1, 1, name, next) { + set_operand(0, arg); +} + +instruction* reduce_inst::create(value *arg, const std::string &name, instruction *next) { + return new reduce_inst(arg, name, next); +} + + 
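// reduce_inst collapses a tile operand to a single scalar: the constructor
// passes arg->get_type()->get_scalar_ty() to builtin_inst, so the result type
// is the element type of the reduced tile.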
//===----------------------------------------------------------------------===// // select instructions //===----------------------------------------------------------------------===// diff --git a/lib/lang/expression.cpp b/lib/lang/expression.cpp index 7b1b4888c..8f51ec47a 100644 --- a/lib/lang/expression.cpp +++ b/lib/lang/expression.cpp @@ -175,11 +175,16 @@ ir::value* select_expression::codegen(ir::module *mod) const { return mod->get_builder().create_select(pred, if_value, else_value); } -// Trans +// trans ir::value* trans_expression::codegen(ir::module *mod) const { return mod->get_builder().create_trans(arg_->codegen(mod)); } +// reduce +ir::value* reduce_expression::codegen(ir::module *mod) const { + return mod->get_builder().create_reduce(arg_->codegen(mod)); +} + /* Postfix expression */ ir::value* indexing_expression::codegen(ir::module *mod) const{ ir::value *in = lhs_->codegen(mod); From fa3270dcf29e8b3ad2de089ab1d7f0f50d283ff8 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 8 Jul 2019 18:53:37 -0700 Subject: [PATCH 218/494] [codegen/selection] bugfix in code generation for reduction instructions --- lib/codegen/selection.cpp | 4 ++-- lib/dnn/batchnorm.cpp | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index cd40f7912..da31ea9d8 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -787,8 +787,8 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & // reduce within warp Value *shfl = Intrinsic::getDeclaration(builder.GetInsertBlock()->getModule(), Intrinsic::nvvm_shfl_sync_bfly_f32); for (int i = 16; i > 0; i >>= 1){ - Value *rhs = builder.CreateCall(shfl, {builder.getInt32(0x1f), partial, - builder.getInt32(i), builder.getInt32(0xffffffff)}); + Value *rhs = builder.CreateCall(shfl, {builder.getInt32(0xffffffff), partial, + builder.getInt32(i), builder.getInt32(0x1f)}); partial = builder.CreateFAdd(partial, rhs); } // reduce within block diff --git a/lib/dnn/batchnorm.cpp b/lib/dnn/batchnorm.cpp index db84bf5b9..4bb29db5d 100644 --- a/lib/dnn/batchnorm.cpp +++ b/lib/dnn/batchnorm.cpp @@ -58,7 +58,7 @@ void batchnorm(fp32 *Y, fp32 *M, fp32 *V, restrict read_only fp32 *G, restrict read_only fp32 *B, int32 DHWN) { - int32 rx[TM] = get_global_range[TM](0); + int32 rx[TM] = 0 ... 
TM; fp32 *px[TM]; fp32 x[TM]; int32 c = get_range_id(0); @@ -82,6 +82,7 @@ void batchnorm(fp32 *Y, fp32 *M, fp32 *V, x = *px; x = x - mean; var = var + x*x; + px = px + TM; } fp32 v = __sum(var); fp32 *pv = V + c; From f74dcb7e30e0e71e49e41ec42c0ddf04e1badc15 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 8 Jul 2019 20:18:20 -0700 Subject: [PATCH 219/494] [dnn/batchnorm]: added some more code in Triton-C batchnorm implementations --- examples/python/tensorflow/run.py | 19 ++++++++++++++-- include/triton/dnn/batchnorm.h | 3 +++ include/triton/ir/builder.h | 1 + include/triton/ir/instructions.h | 8 +++++++ include/triton/lang/expression.h | 9 ++++++++ include/triton/lang/parser.y | 3 ++- include/triton/lang/scanner.l | 1 + lib/codegen/selection.cpp | 7 +++++- lib/dnn/batchnorm.cpp | 38 ++++++++++++++++++++++++++----- lib/driver/module.cpp | 2 +- lib/ir/builder.cpp | 4 ++++ lib/ir/instructions.cpp | 13 +++++++++++ lib/lang/expression.cpp | 6 +++++ 13 files changed, 103 insertions(+), 11 deletions(-) diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 2de35d7d6..df37c830c 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -84,6 +84,21 @@ def run_shift(): b: hb})[0] #print(result) + +def batch_norm(x, g, b, epsilon=1e-6): + shape = x.shape + C = int(shape[1]) + assert g.get_shape().num_elements() == C + assert b.get_shape().num_elements() == C + return module.batchnorm_forward(x, g, b, eps=epsilon) + +@ops.RegisterGradient("BatchnormForward") +def batch_norm_grad(op, dy, mean, var): + eps = op.get_attr("eps") + return module.batchnorm_backward(dy, op.inputs[0], op.inputs[1], + op.outputs[1], op.outputs[2], eps=eps) + + def run_batchnorm(): C, H, W, B = 32, 16, 16, 16 np.random.seed(0) @@ -101,7 +116,7 @@ def run_batchnorm(): sess = tf.InteractiveSession() sess.run(tf.global_variables_initializer()) result = sess.run([y, m, v], feed_dict = {x: hx, g: hg, b: hb}) - print(hx.sum(axis=(1,2,3))) - print(result[1]) + + run_batchnorm() diff --git a/include/triton/dnn/batchnorm.h b/include/triton/dnn/batchnorm.h index a61178500..7a97e83af 100644 --- a/include/triton/dnn/batchnorm.h +++ b/include/triton/dnn/batchnorm.h @@ -53,6 +53,9 @@ private: int32_t W_; int32_t B_; std::string ty_; + float eps_; + int32_t DHWB_; + float rcpDHWB_; }; class batchnorm_backward { diff --git a/include/triton/ir/builder.h b/include/triton/ir/builder.h index ea56a8d9c..8dd60fdff 100644 --- a/include/triton/ir/builder.h +++ b/include/triton/ir/builder.h @@ -131,6 +131,7 @@ public: value *create_atomic_add(value *ptr, value *val, const std::string &name = ""); value *create_dot(value *A, value *B, value *C, const std::string &name = ""); value *create_trans(value *A, const std::string &name = ""); + value *create_sqrt(value *A, const std::string &name = ""); value *create_reduce(value *A, const std::string &name = ""); value *create_select(value *pred, value *if_value, value *else_value, const std::string &name = ""); // Intrinsics diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index 9a8aa2f0b..9828406af 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -581,6 +581,14 @@ public: static instruction* create(value *arg, const std::string &name = "", instruction *next = nullptr); }; +class sqrt_inst: public builtin_inst { +private: + sqrt_inst(value *arg, const std::string& name, instruction* next); + std::string repr_impl() const { return "sqrt"; } +public: + static 
instruction* create(value *arg, const std::string &name = "", instruction *next = nullptr); +}; + class reduce_inst: public builtin_inst { private: reduce_inst(value* arg, const std::string& name, instruction* next); diff --git a/include/triton/lang/expression.h b/include/triton/lang/expression.h index 40e03f84d..3d894c802 100644 --- a/include/triton/lang/expression.h +++ b/include/triton/lang/expression.h @@ -167,6 +167,15 @@ private: node* arg_; }; +class sqrt_expression: public builtin_expression{ +public: + sqrt_expression(node *arg): arg_(arg) {} + ir::value* codegen(ir::module *) const; + +private: + node* arg_; +}; + class reduce_expression: public builtin_expression{ public: reduce_expression(node *arg): arg_(arg) {} diff --git a/include/triton/lang/parser.y b/include/triton/lang/parser.y index 9296cd52f..32b3c5ed4 100644 --- a/include/triton/lang/parser.y +++ b/include/triton/lang/parser.y @@ -55,7 +55,7 @@ STORAGE_SPEC_T get_storage_spec(node *op) { return ((token*)op)->storage_spec;} %token VOID UINT1 UINT8 UINT16 UINT32 UINT64 INT1 INT8 INT16 INT32 INT64 FP16 FP32 FP64 %token IF ELSE FOR CONTINUE WHILE %token NEWAXIS ELLIPSIS AT -%token GET_GLOBAL_RANGE GET_RANGE_ID DOT REDUCE_SUM TRANS MAX MIN SELECT ATOMIC_CAS ATOMIC_EXCHG ATOMIC_ADD ALLOC_CONST +%token GET_GLOBAL_RANGE GET_RANGE_ID DOT SQRT REDUCE_SUM TRANS MAX MIN SELECT ATOMIC_CAS ATOMIC_EXCHG ATOMIC_ADD ALLOC_CONST %start translation_unit %% @@ -123,6 +123,7 @@ builtin_expression : GET_GLOBAL_RANGE '[' primary_expression ']' '(' constant ')' { $$ = new get_global_range_expression($3, $6); } | GET_RANGE_ID '(' constant ')' { $$ = new get_range_id_expression($3); } | DOT '(' expression ',' expression ',' expression ')' { $$ = new matmul_expression($3, $5, $7); } + | SQRT '(' expression ')' { $$ = new sqrt_expression($3); } | ALLOC_CONST type_specifier '[' constant ']' { $$ = new alloc_const_expression(new typed_declaration_specifier(get_type_spec($2)), $4); } | TRANS '(' expression ')' { $$ = new trans_expression($3); } | REDUCE_SUM '(' expression ')' { $$ = new reduce_expression($3);} diff --git a/include/triton/lang/scanner.l b/include/triton/lang/scanner.l index 68c38d2f0..0fbaa52d2 100644 --- a/include/triton/lang/scanner.l +++ b/include/triton/lang/scanner.l @@ -50,6 +50,7 @@ using triton::lang::return_void; "__atomic_exchg" { return return_impl(ATOMIC_EXCHG, yytext); } "__atomic_add" { return return_impl(ATOMIC_ADD, yytext); } "__sum" { return return_impl(REDUCE_SUM, yytext); } +"sqrt" { return return_impl(SQRT, yytext); } "dot" { return return_impl(DOT, yytext); } "max" { return return_impl(MAX, yytext); } "min" { return return_impl(MIN, yytext); } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index da31ea9d8..3bd010ebc 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -380,6 +380,12 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::function(inst)){ + Value *val = value(ii->get_operand(0)); + Value *sqrt = Intrinsic::getDeclaration(builder.GetInsertBlock()->getModule(), Intrinsic::sqrt, {val->getType()}); + Value *res = builder.CreateCall(sqrt, {val}); + return (Instruction*)res; + } // unknown instruction throw std::runtime_error("unknown conversion from ir::instruction to Instruction"); } @@ -797,7 +803,6 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & BasicBlock *partial_reduce_done = BasicBlock::Create(ctx, "partial_reduce_done", fn); Value *id_in_warp = builder.CreateURem(tid, builder.getInt32(32)); Value *warp_id = 
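
The new selection case above lowers ir::sqrt_inst through LLVM's type-overloaded llvm.sqrt intrinsic, so the NVPTX backend is free to pick a native square-root instruction. A self-contained sketch of the same emission pattern, assuming an LLVM of roughly this vintage (the function and symbol names here are illustrative, not from the Triton sources):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext ctx;
  Module mod("sqrt_demo", ctx);
  IRBuilder<> builder(ctx);

  // float demo(float x) { return sqrt(x); }
  FunctionType *fty = FunctionType::get(builder.getFloatTy(), {builder.getFloatTy()}, false);
  Function *fn = Function::Create(fty, Function::ExternalLinkage, "demo", &mod);
  builder.SetInsertPoint(BasicBlock::Create(ctx, "entry", fn));

  // llvm.sqrt is overloaded on its operand type; passing the type selects llvm.sqrt.f32.
  Function *sqrt_decl = Intrinsic::getDeclaration(&mod, Intrinsic::sqrt, {builder.getFloatTy()});
  Value *x = &*fn->arg_begin();
  builder.CreateRet(builder.CreateCall(sqrt_decl, {x}));

  verifyModule(mod, &errs());
  mod.print(outs(), nullptr);  // body contains: call float @llvm.sqrt.f32(float %0)
}
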
builder.CreateUDiv(tid, builder.getInt32(32)); - builder.CreateCondBr(builder.CreateICmpEQ(id_in_warp, builder.getInt32(0)), partial_reduce_do, partial_reduce_done); builder.SetInsertPoint(partial_reduce_do); diff --git a/lib/dnn/batchnorm.cpp b/lib/dnn/batchnorm.cpp index 4bb29db5d..e3b1a630c 100644 --- a/lib/dnn/batchnorm.cpp +++ b/lib/dnn/batchnorm.cpp @@ -30,7 +30,10 @@ namespace dnn{ * --------------- */ batchnorm_forward::batchnorm_forward(int C, int D, int H, int W, int B, std::string ty) - : C_(C), D_(D), H_(H), W_(W), B_(B), ty_(ty) { } + : C_(C), D_(D), H_(H), W_(W), B_(B), ty_(ty), eps_(1e-5) { + DHWB_ = D_*H_*W_*B_; + rcpDHWB_ = (float)1 / DHWB_; +} void batchnorm_forward::enqueue(driver::stream *stream, driver::kernel *kernel, driver::buffer *y, driver::buffer *m, driver::buffer *v, @@ -44,7 +47,9 @@ void batchnorm_forward::enqueue(driver::stream *stream, driver::kernel *kernel, kernel->setArg(3, x); kernel->setArg(4, g); kernel->setArg(5, b); - kernel->setArg(6, (int32_t)(D_*H_*W_*B_)); + kernel->setArg(6, DHWB_); + kernel->setArg(7, rcpDHWB_); + kernel->setArg(8, eps_); stream->enqueue(kernel, grid, {nthreads, 1, 1}); } @@ -57,7 +62,8 @@ void batchnorm(fp32 *Y, fp32 *M, fp32 *V, restrict read_only fp32 *X, restrict read_only fp32 *G, restrict read_only fp32 *B, - int32 DHWN) { + int32 DHWN, + fp32 rcpDHWN, fp32 eps) { int32 rx[TM] = 0 ... TM; fp32 *px[TM]; fp32 x[TM]; @@ -72,9 +78,8 @@ void batchnorm(fp32 *Y, fp32 *M, fp32 *V, mean = mean + x; px = px + TM; } - fp32 m = __sum(mean); fp32 *pm = M + c; - *pm = m; + *pm = __sum(mean) * rcpDHWN; fp32 var[TM] = 0; px = X + rx + c*DHWN; @@ -84,9 +89,21 @@ void batchnorm(fp32 *Y, fp32 *M, fp32 *V, var = var + x*x; px = px + TM; } - fp32 v = __sum(var); + fp32 v = __sum(var) * rcpDHWN; fp32 *pv = V + c; *pv = v; + + fp32 rstdg = 1 / sqrt(v + eps) * g; + + px = X + rx + c*DHWN; + fp32* py[TM] = Y + rx + c*DHWN; + for(int32 i = 0; i < DHWN; i = i + TM){ + x = *px; + fp32 y[TM] = (x - mean)*rstdg + b; + *py = y; + px = px + TM; + py = py + TM; + } })"; } @@ -148,16 +165,25 @@ void batchnorm(fp32 *DX, fp32 *DG, fp32 *DB, fp32 dy[TM] = *pdy; dg = dg + dy*(x - mean)*rstd; db = db + dy; + px = px + TM; + pdy = pdy + TM; } + fp32 sdg = __sum(dg); + fp32 sdb = __sum(db); px = X + rx + offset; pdy = DY + rx + offset; pdx = DX + rx + offset; for(int32 i = 0; i < DHWN; i += TM){ + fp32 x[TM] = *px; + fp32 dy[TM] = *pdy; fp32 xhat[TM] = (x - mean) * rstd; fp32 xtmp[TM] = (xhat * dg + db) * NDHW; fp32 dx[TM] = (dy - xtmp) * rstd * g; *pdx = dx; + px = px + TM; + pdy = pdy + TM; + pdx = pdx + TM; } })"; } diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index c3139ece6..4ff863666 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -255,7 +255,7 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ - std::cout << source << std::endl; +// std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index cf6832958..54321bd81 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -320,6 +320,10 @@ value *builder::create_trans(value *A, const std::string &name) { 
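
For reference, the extended forward kernel computes, per channel c over N = D*H*W*B elements: m = (1/N) sum(x), v = (1/N) sum((x - m)^2), and y = (x - m) * g / sqrt(v + eps) + b, with the scale folded into a single rstdg factor exactly as the Triton-C source does. A plain C++ restatement of that math, assuming the same channel-major layout (the function name and layout convention are assumptions for illustration):

#include <cmath>
#include <vector>

// x is laid out channel-major: x[c*N + i], with i in [0, N) spanning D*H*W*B.
void batchnorm_forward_ref(std::vector<float>& y, std::vector<float>& m, std::vector<float>& v,
                           const std::vector<float>& x, const std::vector<float>& g,
                           const std::vector<float>& b, int C, int N, float eps = 1e-5f) {
  for (int c = 0; c < C; ++c) {
    const float* px = &x[c * N];
    float mean = 0.f;
    for (int i = 0; i < N; ++i) mean += px[i];
    mean /= N;                                   // the kernel's __sum(mean) * rcpDHWN

    float var = 0.f;
    for (int i = 0; i < N; ++i) { float d = px[i] - mean; var += d * d; }
    var /= N;

    // rstdg folds the learned scale into the reciprocal standard deviation.
    float rstdg = g[c] / std::sqrt(var + eps);
    for (int i = 0; i < N; ++i) y[c * N + i] = (px[i] - mean) * rstdg + b[c];
    m[c] = mean;
    v[c] = var;
  }
}
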
return insert(trans_inst::create(A, name)); } +value *builder::create_sqrt(value *A, const std::string &name) { + return insert(sqrt_inst::create(A, name)); +} + value *builder::create_reduce(value *A, const std::string &name) { return insert(reduce_inst::create(A, name)); } diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 6607990c4..27efc0838 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -566,6 +566,19 @@ instruction* trans_inst::create(value *arg, const std::string &name, instruction return new trans_inst(arg, name, next); } +//===----------------------------------------------------------------------===// +// sqrt instructions +//===----------------------------------------------------------------------===// + +sqrt_inst::sqrt_inst(value *arg, const std::string &name, instruction *next) + : builtin_inst(arg->get_type(), 1, 1, name, next){ + set_operand(0, arg); +} + +instruction* sqrt_inst::create(value *arg, const std::string &name, instruction *next) { + return new sqrt_inst(arg, name, next); +} + //===----------------------------------------------------------------------===// // reduce instructions //===----------------------------------------------------------------------===// diff --git a/lib/lang/expression.cpp b/lib/lang/expression.cpp index 8f51ec47a..85e98a771 100644 --- a/lib/lang/expression.cpp +++ b/lib/lang/expression.cpp @@ -180,6 +180,12 @@ ir::value* trans_expression::codegen(ir::module *mod) const { return mod->get_builder().create_trans(arg_->codegen(mod)); } +// sqrt +ir::value* sqrt_expression::codegen(ir::module *mod) const { + return mod->get_builder().create_sqrt(arg_->codegen(mod)); +} + + // reduce ir::value* reduce_expression::codegen(ir::module *mod) const { return mod->get_builder().create_reduce(arg_->codegen(mod)); From cc4160478443e2a40045fae27546d610fa584384 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 9 Jul 2019 13:03:16 -0700 Subject: [PATCH 220/494] [codegen/batchnorm] forward and backward now seemingly working --- examples/python/tensorflow/run.py | 14 +++++++++++--- include/triton/dnn/batchnorm.h | 3 ++- lib/dnn/batchnorm.cpp | 31 +++++++++++++++++++------------ 3 files changed, 32 insertions(+), 16 deletions(-) diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index df37c830c..a0f107ea4 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -100,7 +100,7 @@ def batch_norm_grad(op, dy, mean, var): def run_batchnorm(): - C, H, W, B = 32, 16, 16, 16 + C, H, W, B = 1, 4, 4, 4 np.random.seed(0) # Placeholders x = tf.placeholder(tf.float32, shape=[C, H, W, B]) @@ -112,11 +112,19 @@ def run_batchnorm(): hb = np.random.rand(C) # batchnorm y, m, v = module.batchnorm_forward(x, g, b, eps=1e-5) + loss = np.sum(y) # Run sess = tf.InteractiveSession() sess.run(tf.global_variables_initializer()) result = sess.run([y, m, v], feed_dict = {x: hx, g: hg, b: hb}) - - + #print(result[0], result[1], result[2]) + grads = tf.test.compute_gradient([x, g, b], [(C, H, W, B), (C, ), (C, )], y, (C, H, W, B), + extra_feed_dict = {x: hx, g: hg, b: hb}) + dx_t, dx_n = grads[0] + dg_t, dg_n = grads[1] + db_t, db_n = grads[2] + print(np.max(np.abs(dx_t - dx_n))) + print(np.max(np.abs(dg_t - dg_n))) + print(np.max(np.abs(db_t - db_n))) run_batchnorm() diff --git a/include/triton/dnn/batchnorm.h b/include/triton/dnn/batchnorm.h index 7a97e83af..65f71ce58 100644 --- a/include/triton/dnn/batchnorm.h +++ b/include/triton/dnn/batchnorm.h @@ -61,7 +61,7 @@ private: 
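
The updated script validates the kernels by comparing tf.test.compute_gradient's theoretical and numeric Jacobians and printing the largest absolute difference for dx, dg, and db. The same idea in miniature, as a central-difference check of an analytic gradient (a standalone sketch; nothing here is TensorFlow or Triton API):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <functional>
#include <vector>

// Central-difference check of an analytic gradient of f: R^n -> R.
double max_grad_error(const std::function<double(const std::vector<double>&)>& f,
                      const std::vector<double>& x, const std::vector<double>& grad,
                      double h = 1e-5) {
  double worst = 0;
  for (size_t i = 0; i < x.size(); ++i) {
    std::vector<double> xp = x, xm = x;
    xp[i] += h; xm[i] -= h;
    double numeric = (f(xp) - f(xm)) / (2 * h);   // numeric partial derivative
    worst = std::max(worst, std::fabs(numeric - grad[i]));
  }
  return worst;
}

int main() {
  auto f = [](const std::vector<double>& v) { return v[0] * v[0] + 3 * v[1]; };
  std::vector<double> x = {2.0, 1.0}, g = {2 * x[0], 3.0};   // analytic gradient
  std::printf("max |numeric - analytic| = %g\n", max_grad_error(f, x, g));
}
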
class batchnorm_backward { public: // constructor - batchnorm_backward(int C, int D, int H, int W, int B, std::string ty = "fp32"); + batchnorm_backward(int C, int D, int H, int W, int B, std::string ty = "fp32", float eps = 1e-5); // enqueue void enqueue(driver::stream *stream, driver::kernel *kernel, driver::buffer *dx, driver::buffer *dg, driver::buffer *db, driver::buffer *dy, @@ -78,6 +78,7 @@ private: int32_t W_; int32_t B_; std::string ty_; + float eps_; }; } diff --git a/lib/dnn/batchnorm.cpp b/lib/dnn/batchnorm.cpp index e3b1a630c..a8e91bf8e 100644 --- a/lib/dnn/batchnorm.cpp +++ b/lib/dnn/batchnorm.cpp @@ -79,13 +79,14 @@ void batchnorm(fp32 *Y, fp32 *M, fp32 *V, px = px + TM; } fp32 *pm = M + c; - *pm = __sum(mean) * rcpDHWN; + fp32 m = __sum(mean) * rcpDHWN; + *pm = m; fp32 var[TM] = 0; px = X + rx + c*DHWN; for(int32 i = 0; i < DHWN; i = i + TM){ x = *px; - x = x - mean; + x = x - m; var = var + x*x; px = px + TM; } @@ -99,7 +100,7 @@ void batchnorm(fp32 *Y, fp32 *M, fp32 *V, fp32* py[TM] = Y + rx + c*DHWN; for(int32 i = 0; i < DHWN; i = i + TM){ x = *px; - fp32 y[TM] = (x - mean)*rstdg + b; + fp32 y[TM] = (x - m)*rstdg + b; *py = y; px = px + TM; py = py + TM; @@ -111,8 +112,8 @@ void batchnorm(fp32 *Y, fp32 *M, fp32 *V, * Backward * --------------- */ -batchnorm_backward::batchnorm_backward(int C, int D, int H, int W, int B, std::string ty) - : C_(C), D_(D), H_(H), W_(W), B_(B), ty_(ty) +batchnorm_backward::batchnorm_backward(int C, int D, int H, int W, int B, std::string ty, float eps) + : C_(C), D_(D), H_(H), W_(W), B_(B), ty_(ty), eps_(eps) { } void batchnorm_backward::enqueue(driver::stream *stream, driver::kernel *kernel, @@ -120,7 +121,7 @@ void batchnorm_backward::enqueue(driver::stream *stream, driver::kernel *kernel, driver::buffer *x, driver::buffer *g, driver::buffer *m, driver::buffer *v, size_t, size_t nthreads) { - std::array grid = {(size_t)C_, 1, 1}; + std::array grid = {1, (size_t)C_, 1}; kernel->setArg(0, dx); kernel->setArg(1, dg); kernel->setArg(2, db); @@ -130,6 +131,8 @@ void batchnorm_backward::enqueue(driver::stream *stream, driver::kernel *kernel, kernel->setArg(6, m); kernel->setArg(7, v); kernel->setArg(8, (int32_t)(D_*H_*W_*B_)); + kernel->setArg(9, (float)1/(D_*H_*W_*B_)); + kernel->setArg(10, eps_); stream->enqueue(kernel, grid, {nthreads, 1, 1}); } @@ -144,14 +147,14 @@ void batchnorm(fp32 *DX, fp32 *DG, fp32 *DB, restrict read_only fp32 *G, restrict read_only fp32 *M, restrict read_only fp32 *V, - int32 DHWN) { - int32 rx[TM] = get_global_range[TM](0); + int32 DHWN, fp32 rcpDHWN, fp32 epsilon) { + int32 rx[TM] = 0 ... 
TM; int32 c = get_range_id(0); int32 offset = c*DHWN; fp32 g = *(G + c); fp32 mean = *(M + c); fp32 var = *(V + c); - fp32 rstd = var; + fp32 rstd = 1 / sqrt(var + epsilon); fp32* px[TM]; fp32* pdx[TM]; fp32* pdy[TM]; @@ -160,7 +163,7 @@ void batchnorm(fp32 *DX, fp32 *DG, fp32 *DB, pdy = DY + rx + offset; fp32 dg[TM] = 0; fp32 db[TM] = 0; - for(int32 i = 0; i < DHWN; i += TM){ + for(int32 i = 0; i < DHWN; i = i + TM){ fp32 x[TM] = *px; fp32 dy[TM] = *pdy; dg = dg + dy*(x - mean)*rstd; @@ -170,15 +173,19 @@ void batchnorm(fp32 *DX, fp32 *DG, fp32 *DB, } fp32 sdg = __sum(dg); fp32 sdb = __sum(db); + fp32 *pdg = DG + c; + fp32 *pdb = DB + c; + *pdg = sdg; + *pdb = sdb; px = X + rx + offset; pdy = DY + rx + offset; pdx = DX + rx + offset; - for(int32 i = 0; i < DHWN; i += TM){ + for(int32 i = 0; i < DHWN; i = i + TM){ fp32 x[TM] = *px; fp32 dy[TM] = *pdy; fp32 xhat[TM] = (x - mean) * rstd; - fp32 xtmp[TM] = (xhat * dg + db) * NDHW; + fp32 xtmp[TM] = (xhat * dg + db) * rcpDHWN; fp32 dx[TM] = (dy - xtmp) * rstd * g; *pdx = dx; px = px + TM; From 066ae338f1370ff72e6e394709054fa69a82dece Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 9 Jul 2019 14:08:51 -0700 Subject: [PATCH 221/494] [dnn/shift]: added stride to shift --- examples/python/tensorflow/run.py | 34 +++++++++---------- examples/python/tensorflow/shift.cpp | 30 +++++++++++++---- include/triton/dnn/shift.h | 5 +++ lib/dnn/shift.cpp | 49 ++++++++++++++++------------ 4 files changed, 74 insertions(+), 44 deletions(-) diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index a0f107ea4..1fcf68587 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -49,35 +49,35 @@ def run_conv(): def blocksparse_matmul_grad(op, dy): shift_h = op.get_attr('shift_h') shift_w = op.get_attr('shift_w') + stride_h = op.get_attr('stride_h') + stride_w = op.get_attr('stride_w') x = op.inputs[0] w = op.inputs[1] - dx = module.shift_conv_dx(dy, w, shift_h=shift_h, shift_w=shift_w) - dw = module.shift_conv_dw(dy, x, shift_h=shift_h, shift_w=shift_w) + dx = module.shift_conv_dx(dy, w, stride_h=stride_h, stride_w=stride_w, shift_h=shift_h, shift_w=shift_w) + dw = module.shift_conv_dw(dy, x, stride_h=stride_h, stride_w=stride_w, shift_h=shift_h, shift_w=shift_w) return (dx, dw) def run_shift(): - B, C, H, W = 16, 1024, 8, 8 - R, S, F = 3, 3, 1024 + B, C, H, W = 16, 16, 4, 4 + R, S, F = 3, 3, 4 + stride_h, stride_w = 2, 2 np.random.seed(2) a = tf.placeholder(tf.float32, shape=[C, H, W, B]) b = tf.placeholder(tf.float32, shape=[C, F]) hshift_h = np.random.randint(- (R//2), R//2 + 1, size=C, dtype=np.int32) hshift_w = np.random.randint(- (S//2), R//2 + 1, size=C, dtype=np.int32) - #hshift_h = np.ones(C, dtype=np.int32) - #hshift_w = np.ones(C, dtype=np.int32) - c = module.shift_conv(a, b, shift_h=tf.make_tensor_proto(hshift_h), shift_w=tf.make_tensor_proto(hshift_w)) - # Reference + c = module.shift_conv(a, b, stride_h=stride_h, stride_w=stride_w, shift_h=tf.make_tensor_proto(hshift_h), shift_w=tf.make_tensor_proto(hshift_w)) + # feed values ha = np.random.rand(C, H, W, B) hb = np.random.rand(C, F) - #ha = np.ones((C, H, W, B), dtype=np.int32) - #hb = np.ones((C, F), dtype=np.int32) sess = tf.InteractiveSession() - #grads = tf.test.compute_gradient([a, b], [(C, H, W, B), (C, F)], c, (F, H, W, B), - # extra_feed_dict = {a: ha, b: hb}) - #dw_t, dw_n = grads[1] - #dx_t, dx_n = grads[0] - #print(np.max(np.abs(dw_t - dw_n))) - #print(np.max(np.abs(dx_t - dx_n))) + # test + grads = 
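
The corrected backward kernel realizes the standard batchnorm gradients: with rstd = 1/sqrt(v + eps) and xhat = (x - m) * rstd, it accumulates dg = sum(dy * xhat) and db = sum(dy), then forms dx = (dy - (xhat * dg + db)/N) * rstd * g, where the 1/N factor is the rcpDHWN argument (the previous revision erroneously scaled by NDHW and left rstd = var). A CPU reference of the same per-channel computation, under the same assumed channel-major layout and invented names as the forward sketch:

#include <cmath>
#include <vector>

void batchnorm_backward_ref(std::vector<float>& dx, std::vector<float>& dg, std::vector<float>& db,
                            const std::vector<float>& dy, const std::vector<float>& x,
                            const std::vector<float>& g, const std::vector<float>& m,
                            const std::vector<float>& v, int C, int N, float eps = 1e-5f) {
  for (int c = 0; c < C; ++c) {
    const float rstd = 1.f / std::sqrt(v[c] + eps);
    float sdg = 0.f, sdb = 0.f;
    for (int i = 0; i < N; ++i) {
      float xhat = (x[c * N + i] - m[c]) * rstd;
      sdg += dy[c * N + i] * xhat;   // gradient w.r.t. the scale g
      sdb += dy[c * N + i];          // gradient w.r.t. the bias b
    }
    dg[c] = sdg;
    db[c] = sdb;
    for (int i = 0; i < N; ++i) {
      float xhat = (x[c * N + i] - m[c]) * rstd;
      float xtmp = (xhat * sdg + sdb) / N;                  // rcpDHWN in the kernel
      dx[c * N + i] = (dy[c * N + i] - xtmp) * rstd * g[c];
    }
  }
}
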
tf.test.compute_gradient([a, b], [(C, H, W, B), (C, F)], c, (F, H//stride_h, W//stride_w, B), + extra_feed_dict = {a: ha, b: hb}) + dw_t, dw_n = grads[1] + dx_t, dx_n = grads[0] + print(np.max(np.abs(dw_t - dw_n))) + print(np.max(np.abs(dx_t - dx_n))) # Run sess.run(tf.global_variables_initializer()) result = sess.run([c], feed_dict = {a: ha, @@ -127,4 +127,4 @@ def run_batchnorm(): print(np.max(np.abs(dg_t - dg_n))) print(np.max(np.abs(db_t - db_n))) -run_batchnorm() +run_shift() diff --git a/examples/python/tensorflow/shift.cpp b/examples/python/tensorflow/shift.cpp index 6e9abec55..0ccd06d1f 100644 --- a/examples/python/tensorflow/shift.cpp +++ b/examples/python/tensorflow/shift.cpp @@ -34,6 +34,8 @@ public: explicit ShiftConvOp(OpKernelConstruction* context) : OpKernel(context) { context->GetAttr("shift_h", &h_shift_h_); context->GetAttr("shift_w", &h_shift_w_); + context->GetAttr("stride_h", &stride_h_); + context->GetAttr("stride_w", &stride_w_); R_ = 3; S_ = 3; } @@ -52,12 +54,12 @@ public: int64_t Hb = tf_b.dim_size(1); int64_t Wb = tf_b.dim_size(2); int64_t Bb = tf_b.dim_size(3); - OP_REQUIRES(context, Ha == Hb, tensorflow::errors::InvalidArgument("operands must have the same image height")); - OP_REQUIRES(context, Wa == Wb, tensorflow::errors::InvalidArgument("operands must have the same image width")); + OP_REQUIRES(context, Ha*stride_h_ == Hb, tensorflow::errors::InvalidArgument("operands must have the same image height")); + OP_REQUIRES(context, Wa*stride_w_ == Wb, tensorflow::errors::InvalidArgument("operands must have the same image width")); OP_REQUIRES(context, Ba == Bb, tensorflow::errors::InvalidArgument("operands must have the same batch size")); - H = Ha; - W = Wa; - B = Ba; + H = Hb; + W = Wb; + B = Bb; } else { // shapes for a @@ -65,6 +67,10 @@ public: H = tf_a.dim_size(1); W = tf_a.dim_size(2); B = tf_a.dim_size(3); + if(OP == triton::dnn::shift::BPROP){ + H *= stride_h_; + W *= stride_w_; + } // shapes for b int64_t Cb = tf_b.dim_size(0); F = tf_b.dim_size(1); @@ -104,7 +110,9 @@ public: if(m_config.find(key) == m_config.end()) shift = m_config.emplace(key, new triton::dnn::shift( B, C, D, H, W, T, R_, S_, F, - shift_h, shift_w, "fp32", "fp32", OP, has_bias)) + stride_h_, stride_w_, + shift_h, shift_w, + "fp32", "fp32", OP, has_bias)) .first->second.get(); else shift = m_config.at(key).get(); @@ -125,7 +133,7 @@ public: triton::driver::cu_buffer dc(ctx, (CUdeviceptr)tf_c->flat().data(), false); // get JIT triton::jit* jit; - bool autotune = true; + bool autotune = false; if(m_jit.find(key) == m_jit.end()) { jit = m_jit.emplace(key, new triton::jit(ctx)).first->second.get(); std::ostringstream oss; @@ -171,6 +179,8 @@ public: private: Tensor h_shift_h_; Tensor h_shift_w_; + int stride_h_; + int stride_w_; int R_; int S_; }; @@ -181,6 +191,8 @@ REGISTER_OP("ShiftConv") .Input("b: float32") .Attr("shift_h: tensor") .Attr("shift_w: tensor") + .Attr("stride_h: int") + .Attr("stride_w: int") .Output("c: float32"); REGISTER_KERNEL_BUILDER(Name("ShiftConvDx").Device(DEVICE_GPU), ShiftConvOp); @@ -189,6 +201,8 @@ REGISTER_OP("ShiftConvDx") .Input("b: float32") .Attr("shift_h: tensor") .Attr("shift_w: tensor") + .Attr("stride_h: int") + .Attr("stride_w: int") .Output("c: float32"); REGISTER_KERNEL_BUILDER(Name("ShiftConvDw").Device(DEVICE_GPU), ShiftConvOp); @@ -197,5 +211,7 @@ REGISTER_OP("ShiftConvDw") .Input("b: float32") .Attr("shift_h: tensor") .Attr("shift_w: tensor") + .Attr("stride_h: int") + .Attr("stride_w: int") .Output("c: float32"); diff --git 
a/include/triton/dnn/shift.h b/include/triton/dnn/shift.h index 3c4b53037..e9bd921df 100644 --- a/include/triton/dnn/shift.h +++ b/include/triton/dnn/shift.h @@ -52,6 +52,7 @@ public: shift(int B, int NC, int D, int H, int W, int T, int R, int S, int NF, + int stride_h, int stride_w, const std::vector &shift_h, const std::vector &shift_w, std::string a_ty = "fp32", std::string b_ty = "fp32", type ty = FPROP, bool bias = false); @@ -133,6 +134,10 @@ private: std::vector shapes_a_; std::vector shapes_b_; std::vector shapes_c_; + // strides + int32_t stride_d_; + int32_t stride_h_; + int32_t stride_w_; // memory strides std::vector ld_a_; std::vector ld_b_; diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index 66f3e0c35..99078e0cd 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -17,6 +17,7 @@ shift::shift(int B, int C, int D, int H, int W, int T, int R, int S, int F, + int stride_h, int stride_w, const std::vector& shift_h, const std::vector& shift_w, std::string a_ty, std::string b_ty, type ty, bool bias) @@ -24,6 +25,7 @@ shift::shift(int B, int C, AD_(D), AH_(H), AW_(W), BD_(T), BH_(R), BW_(S), F_(F), + stride_d_(1), stride_h_(stride_h), stride_w_(stride_w), shift_h_(shift_h), shift_w_(shift_w), a_ty_(a_ty), b_ty_(b_ty), ty_(ty), bias_(bias) { @@ -33,17 +35,21 @@ shift::shift(int B, int C, // transpose AT_ = false; BT_ = true; + // activation sizes + CD_ = AD_ / stride_d_; + CH_ = AH_ / stride_h_; + CW_ = AW_ / stride_w_; // equivalent matmul - M_ = B_*AH_*AW_; + M_ = B_*CH_*CW_; N_ = F_; K_ = C_; // shapes // input layout: C, H, W, B // filter layout: C, F // output layout: F, H, W, B - shapes_a_ = {C, H, W, B}; + shapes_a_ = {C, AH_, AW_, B}; shapes_b_ = {C, F}; - shapes_c_ = {F, H, W, B}; + shapes_c_ = {F, CH_, CW_, B}; if(ty_ == WGRAD){ shapes_b_.swap(shapes_c_); shapes_a_.swap(shapes_b_); @@ -51,14 +57,14 @@ shift::shift(int B, int C, BT_ = false; M_ = F_; N_ = C_; - K_ = B_*AH_*AW_; + K_ = B_*CH_*CW_; } if(ty_ == BPROP){ shapes_a_.swap(shapes_c_); AT_ = false; BT_ = false; K_ = F_; - M_ = B_*AH_*AW_; + M_ = B_*CH_*CW_; N_ = C_; } // memory strides @@ -133,13 +139,15 @@ void shift::enqueue(driver::stream *stream, driver::kernel *kernel, kernel->setArg(3, M_); kernel->setArg(4, N_); kernel->setArg(5, K_); - kernel->setArg(6, lda); - kernel->setArg(7, ldb); - kernel->setArg(8, B_); - kernel->setArg(9, AH_); - kernel->setArg(10, AW_); - kernel->setArg(11, BH_); - kernel->setArg(12, BW_); + kernel->setArg(6, stride_h_); + kernel->setArg(7, stride_w_); + kernel->setArg(8, lda); + kernel->setArg(9, ldb); + kernel->setArg(10, B_); + kernel->setArg(11, AH_); + kernel->setArg(12, AW_); + kernel->setArg(13, BH_); + kernel->setArg(14, BW_); std::array grid = {(M_ + TM - 1)/TM, (N_ + TN - 1)/TN, 1}; if(ty_ == BPROP) ((driver::cu_buffer*)c)->set_zero(stream, M_*N_*4); @@ -188,6 +196,7 @@ void shift(restrict read_only align(16) )" << a_ty_ << R"( *a, restrict read_only align(16) )" << b_ty_ << R"( *b, fp32 *c, int32 M, int32 N, int32 K, + int32 stride_h, int32 stride_w, int32 lda, int32 ldb, int32 ABS, int32 AH, int32 AW, int32 AR, int32 AS) { int32 rxa[TM] = get_global_range[TM](0); @@ -200,9 +209,9 @@ void shift(restrict read_only align(16) )" << a_ty_ << R"( *a, if(ty_ == FPROP){ os << R"( int32 rawhc[TM] = rxa / ABS; - int32 raw[TM] = rawhc % AW; + int32 raw[TM] = (rawhc % AW)*stride_w; int32 rahc[TM] = rawhc / AW; - int32 rah[TM] = rahc % AH; + int32 rah[TM] = (rahc % AH)*stride_h; __constant__ int32* pd[TK] = delta + rka; multiple_of(4) int32 d[TK] = *pd; int1 interiorh[TM] = 
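
The recurring pattern in these hunks, e.g. rawhc = rxa / ABS; raw = (rawhc % AW)*stride_w, splits a flattened output index into batch and spatial components and then scales the spatial coordinates by the stride to land on the matching input pixel, which is how the kernel keeps the equivalent GEMM's M axis equal to B*CH*CW after downsampling. A scalar sketch of that decomposition, assuming a batch-fastest (h, w, b) layout; the extents and names below are illustrative only and not taken from the kernel:

#include <cstdio>

int main() {
  const int B = 4, H_out = 2, W_out = 2;       // output extents (e.g. 4x4 input, stride 2)
  const int stride_h = 2, stride_w = 2;
  for (int idx = 0; idx < B * H_out * W_out; ++idx) {
    int b   = idx % B;                         // strip the fastest-varying batch axis
    int whc = idx / B;
    int w_in = (whc % W_out) * stride_w;       // output column -> input column
    int h_in = ((whc / W_out) % H_out) * stride_h;
    std::printf("idx=%2d -> b=%d, input (h,w)=(%d,%d)\n", idx, b, h_in, w_in);
  }
}
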
(rah >= pad_h) && (rah < (AH - pad_h)); @@ -227,9 +236,9 @@ if(ty_ == WGRAD){ if(ty_ == WGRAD){ os << R"( int32 rbwhc[TK] = rkb / ABS; - int32 rbw[TK] = rbwhc % AW; + int32 rbw[TK] = (rbwhc % AW)*stride_w; int32 rbhc[TK] = rbwhc / AW; - int32 rbh[TK] = rbhc % AH; + int32 rbh[TK] = (rbhc % AH)*stride_h; int1 interiorh[TK] = (rbh >= pad_h) && (rbh < (AH - pad_h)); int1 interiorw[TK] = (rbw >= pad_w) && (rbw < (AW - pad_w)); int1 interior[TK, TN] = interiorh[:, newaxis] && interiorw[:, newaxis]; @@ -266,9 +275,9 @@ if(ty_ == WGRAD){ pb = pb + TK)" << ldb0 << R"(; rkb = rkb + TK; rbwhc = rkb / ABS; - rbw = rbwhc % AW; + rbw = (rbwhc % AW)*stride_w; rbhc = rbwhc / AW; - rbh = rbhc % AH; + rbh = (rbhc % AH)*stride_h; interiorh = (rbh >= pad_h) && (rbh < (AH - pad_h)); interiorw = (rbw >= pad_w) && (rbw < (AW - pad_w)); interior = interiorh[:, newaxis] && interiorw[:, newaxis]; @@ -292,9 +301,9 @@ else{ if(ty_ == BPROP){ os << R"( int32 rcwhc[TM] = rxc / ABS; - int32 rcw[TM] = rcwhc % AW; + int32 rcw[TM] = (rcwhc % AW)*stride_w; int32 rchc[TM] = rcwhc / AW; - int32 rch[TM] = rchc % AH; + int32 rch[TM] = (rchc % AH)*stride_h; int1 interiorh[TM] = (rch >= pad_h) && (rch < (AH - pad_h)); int1 interiorw[TM] = (rcw >= pad_w) && (rcw < (AW - pad_w)); int1 interior[TM, TN] = interiorh[:, newaxis] && interiorw[:, newaxis]; From 88675fa01afc3cfcee72572e1250fa2a9a7fb5fc Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 9 Jul 2019 16:09:34 -0700 Subject: [PATCH 222/494] [dnn] added base template class for mutualized auto-tuning --- examples/cpp/shift.cpp | 34 +----------- examples/python/pytorch/shift.cpp | 12 ++--- examples/python/tensorflow/batchnorm.cpp | 1 - examples/python/tensorflow/run.py | 1 + examples/python/tensorflow/shift.cpp | 47 +--------------- include/triton/dnn/base.h | 65 ++++++++++++++++++++++ include/triton/dnn/shift.h | 30 +++++------ lib/dnn/base.cpp | 69 ++++++++++++++++++++++++ lib/dnn/shift.cpp | 28 +++++++--- 9 files changed, 181 insertions(+), 106 deletions(-) create mode 100644 include/triton/dnn/base.h create mode 100644 lib/dnn/base.cpp diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index 3d7646d9e..fda28520c 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -14,8 +14,6 @@ int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); - // initialize just-in-time compiler - triton::jit jit(context); // initialization int32_t R = 3, S = 3; @@ -31,7 +29,7 @@ int main() { shift_w[c] = rand() % S - S/2; } // configuration - triton::dnn::shift shift(BS, C, 1, H, W, 1, R, S, F, shift_h, shift_w, numeric_t_str, numeric_t_str, triton::dnn::shift::BPROP); + triton::dnn::shift shift(BS, C, 1, H, W, 1, R, S, F, 1, 1, shift_h, shift_w, numeric_t_str, numeric_t_str, triton::dnn::shift::BPROP); // host buffers std::vector hc(shift.c_size()); std::vector rc(shift.c_size()); @@ -55,35 +53,7 @@ int main() { stream->write(db, true, 0, hb); stream->write(dc, true, 0, hc); stream->synchronize(); - // benchmark - auto benchmark = [&](triton::driver::kernel* kernel, - triton::jit::launch_information info) { - shift.init(stream, (triton::driver::cu_module*)kernel->module()); - // launch infoRR - unsigned TM = info.global_range_size[0]; - unsigned TN = info.global_range_size[1]; - unsigned nthreads = info.num_threads; - // set argument - shift.enqueue(stream, kernel, da, db, dc, TM, TN, nthreads); - stream->synchronize(); - // benchmark - double ts = triton::tools::bench([&](){shift.enqueue(stream, kernel, da, 
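
Every benchmark lambda deleted in this refactor follows the same recipe, now centralized behind the base class: warm up, time repeated enqueues bracketed by stream synchronization, and convert num_flops()/time into a throughput figure. The timing idiom in isolation, with std::chrono and a dummy CPU workload standing in for the kernel enqueue and synchronize (no Triton API involved; names are invented):

#include <chrono>
#include <cstdio>
#include <functional>
#include <vector>

// Time `op` (kernel enqueue) followed by `sync` (stream synchronize); seconds per call.
double bench(const std::function<void()>& op, const std::function<void()>& sync, int reps = 10) {
  op(); sync();                                              // warm-up
  auto t0 = std::chrono::high_resolution_clock::now();
  for (int i = 0; i < reps; ++i) op();
  sync();
  std::chrono::duration<double> dt = std::chrono::high_resolution_clock::now() - t0;
  return dt.count() / reps;
}

int main() {
  std::vector<float> a(1 << 20, 1.f);                        // dummy workload, not a GPU kernel
  double ts = bench([&] { for (float& v : a) v = v * 1.0001f + 0.5f; },
                    [] { /* a real driver stream would synchronize() here */ });
  std::printf("%.3f GFLOP/s\n", 2.0 * a.size() / ts * 1e-9); // 2 flops per element per call
}
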
db, dc, TM, TN, nthreads);}, - [&](){ stream->synchronize(); }, context->device()); - return shift.get_nflops() / ts * 1e-3; - }; - - // shift - std::vector params = { - 4, 2, 16, 4, 128, 2, 2, 1, 1, 8, 16, 8, 2 - }; - std::ostringstream oss; - shift.src(oss); - std::string src = oss.str(); -// jit.autotune("shift", src.c_str(), benchmark); - jit.add_module("shift", src.c_str(), params); - triton::driver::kernel* kernel = jit.get_function("shift"); - triton::jit::launch_information info = jit.get_launch_info("shift"); - std::cout << "Performance: " << benchmark(kernel, info) << " TFLOPS " << std::endl; + shift.enqueue(stream, da, db, dc); // stream->read(dc, true, 0, hc); // shift.cpu_ref(rc.data(), ha.data(), hb.data()); // for(size_t i = 0; i < hc.size(); i++) diff --git a/examples/python/pytorch/shift.cpp b/examples/python/pytorch/shift.cpp index 0bf5340c7..6332e3027 100644 --- a/examples/python/pytorch/shift.cpp +++ b/examples/python/pytorch/shift.cpp @@ -72,18 +72,18 @@ torch::Tensor shift_common( if(m_shift_jit.find(key) == m_shift_jit.end()){ jit = m_shift_jit.emplace(key, new triton::jit(ctx)).first->second.get(); std::ostringstream oss; - configuration->src(oss); + configuration->get_src(oss); std::string src = oss.str(); // benchmark a given shiftolution kernel auto benchmark = [&](triton::driver::kernel* kernel, triton::jit::launch_information info) { - configuration->init(stream, (triton::driver::cu_module*)kernel->module()); + configuration->init_impl(stream, (triton::driver::cu_module*)kernel->module()); unsigned TM = info.global_range_size[0]; unsigned TN = info.global_range_size[1]; unsigned nthreads = info.num_threads; - configuration->enqueue(stream, kernel, &a, &b, &c, TM, TN, nthreads); + configuration->enqueue_impl(stream, kernel, &a, &b, &c, TM, TN, nthreads); stream->synchronize(); - double ts = triton::tools::bench([&](){ configuration->enqueue(stream, kernel, &a, &b, &c, TM, TN, nthreads); }, + double ts = triton::tools::bench([&](){ configuration->enqueue_impl(stream, kernel, &a, &b, &c, TM, TN, nthreads); }, [&](){ stream->synchronize(); }, stream->context()->device()); return configuration->get_nflops() / ts * 1e-3; }; @@ -96,7 +96,7 @@ torch::Tensor shift_common( jit->add_module("shift", src.c_str(), jit->get_valid("shift", src.c_str())); } triton::driver::kernel* kernel = jit->get_function("shift"); - configuration->init(stream, (triton::driver::cu_module*)kernel->module()); + configuration->init_impl(stream, (triton::driver::cu_module*)kernel->module()); } else jit = m_shift_jit.at(key).get(); @@ -109,6 +109,6 @@ torch::Tensor shift_common( unsigned TN = info.global_range_size[1]; unsigned nthreads = info.num_threads; // enqueue - configuration->enqueue(stream, kernel, &a, &b, &c, TM, TN, nthreads); + configuration->enqueue_impl(stream, kernel, &a, &b, &c, TM, TN, nthreads); return torchc; } diff --git a/examples/python/tensorflow/batchnorm.cpp b/examples/python/tensorflow/batchnorm.cpp index 677168d08..4942e25d6 100644 --- a/examples/python/tensorflow/batchnorm.cpp +++ b/examples/python/tensorflow/batchnorm.cpp @@ -133,7 +133,6 @@ public: triton::driver::cu_buffer tdx(ctx, (CUdeviceptr)dx->flat().data(), false); triton::driver::cu_buffer tdg(ctx, (CUdeviceptr)dg->flat().data(), false); triton::driver::cu_buffer tdb(ctx, (CUdeviceptr)db->flat().data(), false); - // create config triton::dnn::batchnorm_backward batchnorm(C, 1, H, W, B, "fp32"); std::ostringstream oss; diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 
1fcf68587..27cb7e5c8 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -128,3 +128,4 @@ def run_batchnorm(): print(np.max(np.abs(db_t - db_n))) run_shift() +#run_batchnorm() diff --git a/examples/python/tensorflow/shift.cpp b/examples/python/tensorflow/shift.cpp index 0ccd06d1f..b2cebbaeb 100644 --- a/examples/python/tensorflow/shift.cpp +++ b/examples/python/tensorflow/shift.cpp @@ -99,7 +99,7 @@ public: FillShapes(context, C, H, W, B, F, tf_a, tf_b); int64_t D = 1, T = 1; bool has_bias = false; - // shift configuration + // shift offsets int32_t* shift_h_data = h_shift_h_.flat().data(); int32_t* shift_w_data = h_shift_w_.flat().data(); std::vector shift_h(shift_h_data, shift_h_data + C); @@ -116,7 +116,6 @@ public: .first->second.get(); else shift = m_config.at(key).get(); - // shapes for c std::vector c_shapes; for(int32_t x: shift->c_shapes()) @@ -131,49 +130,7 @@ public: triton::driver::cu_buffer da(ctx, (CUdeviceptr)tf_a.flat().data(), false); triton::driver::cu_buffer db(ctx, (CUdeviceptr)tf_b.flat().data(), false); triton::driver::cu_buffer dc(ctx, (CUdeviceptr)tf_c->flat().data(), false); - // get JIT - triton::jit* jit; - bool autotune = false; - if(m_jit.find(key) == m_jit.end()) { - jit = m_jit.emplace(key, new triton::jit(ctx)).first->second.get(); - std::ostringstream oss; - shift->src(oss); - std::string src = oss.str(); - auto benchmark = [&](triton::driver::kernel* kernel, - triton::jit::launch_information info) { - // launch info - unsigned TM = info.global_range_size[0]; - unsigned TN = info.global_range_size[1]; - unsigned nthreads = info.num_threads; - shift->init(stream, (triton::driver::cu_module*)kernel->module()); - shift->enqueue(stream, kernel, &da, &db, &dc, TM, TN, nthreads); - stream->synchronize(); - double ts = triton::tools::bench([&](){ shift->enqueue(stream, kernel, &da, &db, &dc, TM, TN, nthreads); }, - [&](){ stream->synchronize(); }, ctx->device()); - return shift->get_nflops() / ts * 1e-3; - }; - // auto-tune and save result - if(autotune) { - triton::jit::tune_res_t best = jit->autotune("shift", src.c_str(), benchmark); - jit->add_module("shift", src.c_str(), best.params); - } - else { - jit->add_module("shift", src.c_str(), jit->get_valid("shift", src.c_str())); - } - triton::driver::kernel* kernel = jit->get_function("shift"); - shift->init(stream, (triton::driver::cu_module*)kernel->module()); - } - else - jit = m_jit.at(key).get(); - // Run - triton::driver::kernel* kernel = jit->get_function("shift"); - triton::jit::launch_information info = jit->get_launch_info("shift"); - // launch info - unsigned TM = info.global_range_size[0]; - unsigned TN = info.global_range_size[1]; - unsigned nthreads = info.num_threads; - // enqueue - shift->enqueue(stream, kernel, &da, &db, &dc, TM, TN, nthreads); + shift->enqueue(stream, {&da, &db, &dc}); } private: diff --git a/include/triton/dnn/base.h b/include/triton/dnn/base.h new file mode 100644 index 000000000..236cd0c1b --- /dev/null +++ b/include/triton/dnn/base.h @@ -0,0 +1,65 @@ +/* Copyright 2015-2017 Philippe Tillet +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, +* subject to the following conditions: 
+* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +#ifndef TDL_INCLUDE_DNN_BASE_H +#define TDL_INCLUDE_DNN_BASE_H + +#include "triton/driver/stream.h" +#include "triton/driver/kernel.h" + +namespace triton{ +namespace dnn{ + +class base { +protected: + // leading dimensions + static void set_ld(const std::vector& shapes, + std::vector& ld); + +private: + // initialize + virtual void init_impl(driver::stream *stream, driver::cu_module *module) = 0; + // enqueue + virtual void enqueue_impl(driver::stream *stream, driver::kernel *kernel, + std::vector args, + size_t TM, size_t TN, size_t nthreads) = 0; + +public: + // constructor + base(const std::string& name); + // number of flops + virtual size_t get_nflops() const = 0; + // triton-c source + virtual void get_src(std::ostream &os) const = 0; + // comparison for maps + virtual bool operator<(const base& other) const = 0; + // enqueue + void enqueue(driver::stream* stream, std::vector args); + +private: + std::string name_; +}; + +} +} + +#endif diff --git a/include/triton/dnn/shift.h b/include/triton/dnn/shift.h index e9bd921df..8683c9879 100644 --- a/include/triton/dnn/shift.h +++ b/include/triton/dnn/shift.h @@ -28,13 +28,15 @@ #include #include #include +#include "triton/dnn/base.h" #include "triton/driver/stream.h" #include "triton/driver/kernel.h" +#include "triton/runtime/jit.h" namespace triton{ namespace dnn{ -class shift { +class shift: public base { public: enum type { @@ -44,8 +46,14 @@ public: }; private: + // leading dimensions void set_ld(const std::vector& shapes, std::vector& ld); + // initialize and enqueue + void init_impl(driver::stream *stream, driver::cu_module *module); + void enqueue_impl(driver::stream *stream, driver::kernel *kernel, + std::vector args, + size_t TM, size_t TN, size_t nthreads); public: @@ -60,26 +68,18 @@ public: // look-up table void build_deltas(); void build_masks(); - // accessors size_t a_size(); size_t b_size(); size_t c_size(); std::vector c_shapes(); - - // device function - void init(driver::stream *stream, driver::cu_module *module); - void enqueue(driver::stream *stream, driver::kernel *kernel, - driver::buffer *a, driver::buffer *b, driver::buffer *c, - size_t TM, size_t TN, size_t nthreads); - - // utils - size_t get_nflops(); - + // number of flops + size_t get_nflops() const; // source - void src(std::ostream &os); - - // cpu_ref + void get_src(std::ostream &os) const; + // comparison + bool operator<(const base& other) const; + // cpu reference template void cpu_ref(OUT_DTYPE* O, const IN_DTYPE* I, diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp new file mode 100644 index 000000000..7bd5a9a0a --- /dev/null +++ b/lib/dnn/base.cpp @@ -0,0 +1,69 @@ +#include +#include "triton/dnn/base.h" +#include "triton/runtime/jit.h" +#include "triton/tools/bench.hpp" + +namespace triton{ +namespace dnn{ + +struct cmp_recompile{ + bool operator()(base* x, base* y) const{ + return *x < *y; + } +}; + +base::base(const std::string& 
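
The base.h listing above is a non-virtual-interface design: the public, non-virtual enqueue owns the shared policy (compilation caching, launch configuration), while concrete ops supply only the private init_impl/enqueue_impl hooks. A minimal standalone model of that shape; all names below are invented stand-ins, not the Triton classes themselves:

#include <cstdio>
#include <vector>

class op_base {
public:
  void enqueue(const std::vector<void*>& args) {  // public and non-virtual
    init_impl();                                  // e.g. upload look-up tables once
    enqueue_impl(args);                           // op-specific argument binding + launch
  }
private:
  virtual void init_impl() {}                     // optional hook
  virtual void enqueue_impl(const std::vector<void*>& args) = 0;
};

class gemm_op : public op_base {
  void enqueue_impl(const std::vector<void*>& args) override {
    std::printf("launching gemm with %zu buffers\n", args.size());
  }
};

int main() {
  gemm_op g;
  g.enqueue({nullptr, nullptr, nullptr});         // a, b, c
}
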
name) + : name_(name) { } + +void base::enqueue(driver::stream *stream, std::vector args) { + static std::map, cmp_recompile> m_jit; + bool autotune = false; + driver::context* ctx = stream->context(); + triton::jit* jit; + /* the current template has not already been compiled */ + if(m_jit.find(this) == m_jit.end()) { + jit = m_jit.emplace(this, new triton::jit(ctx)).first->second.get(); + std::ostringstream oss; + get_src(oss); + std::string src = oss.str(); + auto benchmark = [&](triton::driver::kernel* kernel, + triton::jit::launch_information info) { + // launch info + unsigned TM = info.global_range_size[0]; + unsigned TN = info.global_range_size[1]; + unsigned nthreads = info.num_threads; + init_impl(stream, (triton::driver::cu_module*)kernel->module()); + enqueue_impl(stream, kernel, args, TM, TN, nthreads); + stream->synchronize(); + double ts = triton::tools::bench([&](){ enqueue_impl(stream, kernel, args, TM, TN, nthreads); }, + [&](){ stream->synchronize(); }, ctx->device()); + return get_nflops() / ts * 1e-3; + }; + // auto-tune and save result + if(autotune) { + triton::jit::tune_res_t best = jit->autotune(name_.c_str(), src.c_str(), benchmark); + jit->add_module(name_.c_str(), src.c_str(), best.params); + } + else { + jit->add_module(name_.c_str(), src.c_str(), jit->get_valid(name_.c_str(), src.c_str())); + } + triton::driver::kernel* kernel = jit->get_function(name_.c_str()); + init_impl(stream, (triton::driver::cu_module*)kernel->module()); + } + /* retrieved compiled template */ + else + jit = m_jit.at(this).get(); + + /* get launch parameters */ + driver::kernel* kernel = jit->get_function(name_.c_str()); + triton::jit::launch_information info = jit->get_launch_info(name_.c_str()); + unsigned TM = info.global_range_size[0]; + unsigned TN = info.global_range_size[1]; + unsigned nthreads = info.num_threads; + + /* launch */ + enqueue_impl(stream, kernel, args, TM, TN, nthreads); +} + +} +} diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index 99078e0cd..cabf99ed2 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -1,5 +1,6 @@ +#include #include "triton/dnn/shift.h" - +#include "triton/tools/bench.hpp" namespace triton{ namespace dnn{ @@ -21,7 +22,8 @@ shift::shift(int B, int C, const std::vector& shift_h, const std::vector& shift_w, std::string a_ty, std::string b_ty, type ty, bool bias) - : B_(B), C_(C), + : base("shift"), + B_(B), C_(C), AD_(D), AH_(H), AW_(W), BD_(T), BH_(R), BW_(S), F_(F), @@ -118,21 +120,33 @@ std::vector shift::c_shapes(){ return shapes_c_; } -size_t shift::get_nflops() { +size_t shift::get_nflops() const { return 2.*M_*N_*K_; } +bool shift::operator <(const base& other) const{ + auto *y = dynamic_cast(&other); + if(!y) + return false; + const int32_t *x_shift_h = shift_h_.data(), *x_shift_w = shift_w_.data(); + const int32_t *y_shift_h = y->shift_h_.data(), *y_shift_w = y->shift_w_.data(); + return std::tie(B_, C_, AD_, AH_, AW_, BD_, BH_, BW_, F_, + x_shift_h, x_shift_w, ty_, bias_) + < std::tie(y->B_, y->C_, y->AD_, y->AH_, y->AW_, y->BD_, y->BH_, y->BW_, y->F_, + y_shift_h, y_shift_w, y->ty_, y->bias_); +} -void shift::init(driver::stream *stream, driver::cu_module *module) { +void shift::init_impl(driver::stream *stream, driver::cu_module *module) { triton::driver::buffer* delta = ((triton::driver::cu_module*)module)->symbol("delta"); stream->write(delta, false, 0, h_deltas_.size()*4, h_deltas_.data()); } -void shift::enqueue(driver::stream *stream, driver::kernel *kernel, - driver::buffer *a, driver::buffer *b, driver::buffer *c, 
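
base::enqueue above memoizes compiled kernels in a function-static map keyed by the configuration object, with cmp_recompile dereferencing the pointers so that each template's operator< decides when two configurations may share a binary; the extraction has dropped the map's template arguments, which were presumably std::map<base*, std::unique_ptr<triton::jit>, cmp_recompile>. Note also that shift::operator< ties the raw .data() pointers of shift_h_/shift_w_, so it compares addresses rather than the shift values themselves. A self-contained model of the caching pattern, with all names invented for illustration:

#include <cstdio>
#include <map>
#include <memory>
#include <tuple>

struct config {
  int M, N, K;
  config(int m, int n, int k) : M(m), N(n), K(k) {}
  virtual ~config() = default;
  virtual bool operator<(const config& o) const {
    return std::tie(M, N, K) < std::tie(o.M, o.N, o.K);
  }
};

struct compiled { const char* binary; };        // stand-in for triton::jit

struct cmp_recompile {                          // dereference-compare, as in dnn/base.cpp
  bool operator()(config* x, config* y) const { return *x < *y; }
};

compiled* get_or_compile(config* cfg) {
  static std::map<config*, std::unique_ptr<compiled>, cmp_recompile> cache;
  auto it = cache.find(cfg);
  if (it == cache.end()) {
    std::puts("compiling (autotune + JIT happens only here)");
    it = cache.emplace(cfg, std::unique_ptr<compiled>(new compiled{"cubin"})).first;
  }
  return it->second.get();
}

int main() {
  config a(128, 128, 64), b(128, 128, 64);      // equal keys under operator<
  get_or_compile(&a);                           // compiles once
  get_or_compile(&b);                           // cache hit: nothing printed
}
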
+void shift::enqueue_impl(driver::stream *stream, driver::kernel *kernel, + std::vector args, size_t TM, size_t TN, size_t nthreads) { int32_t lda = AT_ ? K_ : M_; int32_t ldb = BT_ ? N_ : K_; + driver::buffer *a = args[0], *b = args[1], *c = args[2]; kernel->setArg(0, a); kernel->setArg(1, b); kernel->setArg(2, c); @@ -154,7 +168,7 @@ void shift::enqueue(driver::stream *stream, driver::kernel *kernel, stream->enqueue(kernel, grid, {nthreads, 1, 1}); } -void shift::src(std::ostream &os) { +void shift::get_src(std::ostream &os) const { std::string AS0 = "TM", AS1 = "TK"; std::string BS0 = "TK", BS1 = "TN"; std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; From b7986baffae3a96210866d624890672aabee73de Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 9 Jul 2019 17:30:58 -0700 Subject: [PATCH 223/494] [dnn]: Now implementing all existing DNN routines using common base template and auto-tuner --- examples/cpp/conv.cpp | 33 ++----- examples/cpp/dot.cpp | 38 +------- examples/cpp/shift.cpp | 4 +- examples/python/pytorch/shift.cpp | 4 +- examples/python/tensorflow/batchnorm.cpp | 20 +--- examples/python/tensorflow/conv.cpp | 56 +++-------- examples/python/tensorflow/dot.cpp | 35 +------ examples/python/tensorflow/run.py | 4 +- examples/python/tensorflow/shift.cpp | 31 ++---- include/triton/dnn/base.h | 27 ++++-- include/triton/dnn/batchnorm.h | 54 +++++++---- include/triton/dnn/conv.h | 53 ++++++----- include/triton/dnn/gemm.h | 68 ++++++++++---- include/triton/dnn/shift.h | 18 ++-- lib/dnn/base.cpp | 38 ++++---- lib/dnn/batchnorm.cpp | 63 ++++++++++--- lib/dnn/conv.cpp | 70 +++++++++----- lib/dnn/gemm.cpp | 115 +++++++++++++++-------- lib/dnn/shift.cpp | 32 +++---- 19 files changed, 388 insertions(+), 375 deletions(-) diff --git a/examples/cpp/conv.cpp b/examples/cpp/conv.cpp index d5f2bba3d..2d6d7a845 100644 --- a/examples/cpp/conv.cpp +++ b/examples/cpp/conv.cpp @@ -10,7 +10,6 @@ int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); - triton::jit jit(context); triton::dnn::conv::type ty = triton::dnn::conv::FPROP; // initialization int32_t B = 16, NF = 128; @@ -19,8 +18,12 @@ int main() { int32_t pad_d = 0, pad_h = 0, pad_w = 0; int32_t stride_d = 1, stride_h = 1, stride_w = 1; int32_t upsample_d = 1, upsample_h = 1, upsample_w = 1; - triton::dnn::conv configuration(128, 256, 1, 14, 14, 1, 5, 5, 512, 1, 1, 1, 0, 0, 0, 1, 1, 1, "fp32", "fp32", triton::dnn::conv::FPROP, 0); -// triton::dnn::conv configuration(B, NC, D, H, W, T, R, S, NF, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, upsample_d, upsample_h, upsample_w, ty); +// triton::dnn::conv configuration(128, 256, 1, 14, 14, 1, 5, 5, 512, 1, 1, 1, 0, 0, 0, 1, 1, 1, "fp32", "fp32", triton::dnn::conv::FPROP, 0); + triton::dnn::conv configuration(B, NC, D, H, W, T, R, S, NF, + stride_d, stride_h, stride_w, + pad_d, pad_h, pad_w, + upsample_d, upsample_h, upsample_w, + "fp32", "fp32", ty, 0); // convolution configuration std::vector hc(configuration.c_size()); std::vector rc(configuration.c_size()); @@ -42,29 +45,7 @@ int main() { stream->write(db, true, 0, hb); stream->write(dc, true, 0, hc); stream->synchronize(); - // benchmark a given convolution kernel - auto benchmark = [&](triton::driver::kernel* kernel, - triton::jit::launch_information info) { - configuration.init(stream, (triton::driver::cu_module*)kernel->module()); - unsigned TM = info.global_range_size[0]; - unsigned TN = info.global_range_size[1]; - unsigned nthreads = info.num_threads; - unsigned GZ 
= jit.get_int("GZ"); - configuration.enqueue(stream, kernel, da, db, dc, nullptr, TM, TN, GZ, nthreads); - stream->synchronize(); - double ts = triton::tools::bench([&](){ configuration.enqueue(stream, kernel, da, db, dc, nullptr, TM, TN, GZ, nthreads); }, - [&](){ stream->synchronize(); }, nullptr); - return configuration.get_nflops() / ts * 1e-3; - }; - std::ostringstream oss; - configuration.src(oss); - std::string src = oss.str(); - triton::jit::tune_res_t best = jit.autotune("conv", src.c_str(), benchmark); - jit.add_module("conv", src.c_str(), best.params); -// jit.add_module("conv", src.c_str(), configuration.default_params()); - triton::driver::kernel* kernel = jit.get_function("conv"); - triton::jit::launch_information info = jit.get_launch_info("conv"); - std::cout << "Performance: " << benchmark(kernel, info) << " TFLOPS " << std::endl; + configuration.enqueue(stream, {da, db, dc, nullptr}); stream->read(dc, true, 0, hc); configuration.cpu_ref(rc.data(), ha.data(), hb.data()); for(size_t i = 0; i < hc.size(); i++){ diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 7ff939318..f788ba048 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -10,18 +10,14 @@ int main() { bool AT = true; bool BT = true; - // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); - triton::jit jit(context); - // matrix multiplication parameters - int32_t M = 1024, N = 1024, K = 1024; + int32_t M = 128, N = 128, K = 128; std::vector hc(M*N); std::vector rc(M*N); std::vector ha(M*K); std::vector hb(K*N); - std::vector hlocks(2048); srand(0); for(size_t i = 0; i < ha.size(); i++) ha[i] = (float)rand()/RAND_MAX; @@ -32,41 +28,15 @@ int main() { triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*4); triton::driver::buffer* da = triton::driver::buffer::create(context, ha.size()*4); triton::driver::buffer* db = triton::driver::buffer::create(context, hb.size()*4); - triton::driver::buffer* dlocks = triton::driver::buffer::create(context, hlocks.size()*4); triton::driver::stream* stream = triton::driver::stream::create(context); stream->write(da, true, 0, ha); stream->write(db, true, 0, hb); stream->write(dc, true, 0, hc); - triton::dnn::gemm::init(stream, dlocks); stream->synchronize(); - - - // benchmark a given matrix multiplication kernel - auto benchmark = [&](triton::driver::kernel* kernel, - triton::jit::launch_information info) { - unsigned TM = info.global_range_size[0]; - unsigned TN = info.global_range_size[1]; - unsigned nthreads = info.num_threads; - unsigned GZ = jit.get_int("GZ"); - std::array grid = {(M + TM - 1)/TM, (N + TN - 1)/TN, GZ}; - triton::dnn::gemm::set_arg(kernel, da, db, dc, M, N, K, dlocks, grid[0], grid[1]); - stream->enqueue(kernel, grid, {nthreads, 1, 1}); - stream->synchronize(); - double ts = triton::tools::bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});}, - [&](){ stream->synchronize(); }, context->device()); - return 2.*M*N*K / ts * 1e-3; - }; - - - // just-in-time compile source-code - std::string src = triton::dnn::gemm::src(AT, BT, "fp32", "fp32", 4, 4); -// jit.autotune("matmul",src.c_str(), benchmark); - jit.add_module("matmul", src.c_str(), {8, 16, 4, 2, 16, 8, 4, 2, 2, 4, 2, 8, 8, 1}); - triton::driver::kernel* kernel = jit.get_function("matmul"); - triton::jit::launch_information info = jit.get_launch_info("matmul"); - std::cout << "Performance: " << benchmark(kernel, info) << " TFLOPS " << std::endl; + triton::dnn::gemm gemm(M, N, K, AT, BT, "fp32", "fp32", 
4, 4); + gemm.enqueue(stream, {da, db, dc}); stream->read(dc, true, 0, hc); - triton::dnn::gemm::cpu_ref(AT, BT, rc, ha, hb, M, N, K); + gemm.cpu_ref(rc, ha, hb); for(size_t i = 0; i < M*N; i++) if(!std::isnan(hc[i]) && std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index fda28520c..a4edd38e3 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -29,7 +29,7 @@ int main() { shift_w[c] = rand() % S - S/2; } // configuration - triton::dnn::shift shift(BS, C, 1, H, W, 1, R, S, F, 1, 1, shift_h, shift_w, numeric_t_str, numeric_t_str, triton::dnn::shift::BPROP); + triton::dnn::shift shift(BS, C, 1, H, W, 1, R, S, F, 1, 1, shift_h.data(), shift_w.data(), numeric_t_str, numeric_t_str, triton::dnn::shift::BPROP); // host buffers std::vector hc(shift.c_size()); std::vector rc(shift.c_size()); @@ -53,7 +53,7 @@ int main() { stream->write(db, true, 0, hb); stream->write(dc, true, 0, hc); stream->synchronize(); - shift.enqueue(stream, da, db, dc); + shift.enqueue(stream, {da, db, dc}); // stream->read(dc, true, 0, hc); // shift.cpu_ref(rc.data(), ha.data(), hb.data()); // for(size_t i = 0; i < hc.size(); i++) diff --git a/examples/python/pytorch/shift.cpp b/examples/python/pytorch/shift.cpp index 6332e3027..df28c6ca7 100644 --- a/examples/python/pytorch/shift.cpp +++ b/examples/python/pytorch/shift.cpp @@ -72,7 +72,7 @@ torch::Tensor shift_common( if(m_shift_jit.find(key) == m_shift_jit.end()){ jit = m_shift_jit.emplace(key, new triton::jit(ctx)).first->second.get(); std::ostringstream oss; - configuration->get_src(oss); + configuration->triton_c_src(oss); std::string src = oss.str(); // benchmark a given shiftolution kernel auto benchmark = [&](triton::driver::kernel* kernel, @@ -85,7 +85,7 @@ torch::Tensor shift_common( stream->synchronize(); double ts = triton::tools::bench([&](){ configuration->enqueue_impl(stream, kernel, &a, &b, &c, TM, TN, nthreads); }, [&](){ stream->synchronize(); }, stream->context()->device()); - return configuration->get_nflops() / ts * 1e-3; + return configuration->num_flops() / ts * 1e-3; }; // auto-tune and save result if(autotune) { diff --git a/examples/python/tensorflow/batchnorm.cpp b/examples/python/tensorflow/batchnorm.cpp index 4942e25d6..3a34079e1 100644 --- a/examples/python/tensorflow/batchnorm.cpp +++ b/examples/python/tensorflow/batchnorm.cpp @@ -59,15 +59,7 @@ public: triton::driver::cu_buffer tv(ctx, (CUdeviceptr)v->flat().data(), false); // create config triton::dnn::batchnorm_forward batchnorm(C, 1, H, W, B, "fp32"); - std::ostringstream oss; - batchnorm.src(oss); - std::string src = oss.str(); - triton::jit jit(ctx); - jit.add_module("batchnorm", src.c_str(), jit.get_valid("batchnorm", src.c_str())); - triton::driver::kernel* kernel = jit.get_function("batchnorm"); - size_t TM = jit.get_int("TM"); - triton::jit::launch_information info = jit.get_launch_info("batchnorm"); - batchnorm.enqueue(stream, kernel, &ty, &tm, &tv, &tx, &tg, &tb, TM, info.num_threads); + batchnorm.enqueue(stream, {&ty, &tm, &tv, &tx, &tg, &tb}); } private: @@ -135,15 +127,7 @@ public: triton::driver::cu_buffer tdb(ctx, (CUdeviceptr)db->flat().data(), false); // create config triton::dnn::batchnorm_backward batchnorm(C, 1, H, W, B, "fp32"); - std::ostringstream oss; - batchnorm.src(oss); - std::string src = oss.str(); - triton::jit jit(ctx); - jit.add_module("batchnorm", src.c_str(), jit.get_valid("batchnorm", src.c_str())); - 
triton::driver::kernel* kernel = jit.get_function("batchnorm"); - size_t TM = jit.get_int("TM"); - triton::jit::launch_information info = jit.get_launch_info("batchnorm"); - batchnorm.enqueue(stream, kernel, &tdx, &tdg, &tdb, &tdy, &tx, &tg, &tm, &tv, TM, info.num_threads); + batchnorm.enqueue(stream, {&tdx, &tdg, &tdb, &tdy, &tx, &tg, &tm, &tv}); } private: diff --git a/examples/python/tensorflow/conv.cpp b/examples/python/tensorflow/conv.cpp index ff81e3d31..ebd60ac6d 100644 --- a/examples/python/tensorflow/conv.cpp +++ b/examples/python/tensorflow/conv.cpp @@ -50,56 +50,28 @@ public: int32_t stride_d = 1, stride_h = 1, stride_w = 1; int32_t pad_d = 0, pad_h = 0, pad_w = 0; bool has_bias = false; - - // get conv configuration - triton::dnn::conv configuration(B, C, - D, H, W, - T, R, S, - NF, - stride_d, stride_h, stride_w, - pad_d, pad_h, pad_w, - 1, 1, 1, - "fp16", "fp16", - triton::dnn::conv::FPROP, has_bias); - - // Bind memory + // wrap buffers triton::driver::cu_buffer a(ctx, (CUdeviceptr)tfa.flat().data(), false); triton::driver::cu_buffer b(ctx, (CUdeviceptr)tfb.flat().data(), false); triton::driver::buffer* bias = nullptr; - + // template + triton::dnn::conv conv(B, C, + D, H, W, + T, R, S, + NF, + stride_d, stride_h, stride_w, + pad_d, pad_h, pad_w, + 1, 1, 1, + "fp16", "fp16", + triton::dnn::conv::FPROP, has_bias); // allocate output - auto c_shapes = configuration.c_shapes(); + auto c_shapes = conv.c_shapes(); Tensor* tfc = nullptr; TensorShape out_shape({c_shapes[0], c_shapes[1], c_shapes[2], c_shapes[3]}); OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &tfc)); triton::driver::cu_buffer c(ctx, (CUdeviceptr)tfc->flat().data(), false); - - // benchmark a given convolution kernel - triton::jit jit(ctx); - auto benchmark = [&](triton::driver::kernel* kernel, - triton::jit::launch_information info) { - configuration.init(stream, (triton::driver::cu_module*)kernel->module()); - unsigned TM = info.global_range_size[0]; - unsigned TN = info.global_range_size[1]; - unsigned nthreads = info.num_threads; - unsigned GZ = jit.get_int("GZ"); - configuration.enqueue(stream, kernel, &a, &b, &c, bias, TM, TN, GZ, nthreads); - stream->synchronize(); - double ts = triton::tools::bench([&](){ configuration.enqueue(stream, kernel, &a, &b, &c, bias, TM, TN, GZ, nthreads); }, - [&](){ stream->synchronize(); }, stream->context()->device()); - return configuration.get_nflops() / ts * 1e-3; - }; - - std::ostringstream oss; - configuration.src(oss); - std::string src = oss.str(); - - triton::jit::tune_res_t best = jit.autotune("conv", src.c_str(), benchmark); - jit.add_module("conv", src.c_str(), best.params); -// jit.add_module("conv", src.c_str(), {16, 2, 32, 32, 2, 64, 2, 2, 2, 2, 8, 2, 16, 4, 1}); - triton::driver::kernel* kernel = jit.get_function("conv"); - triton::jit::launch_information info = jit.get_launch_info("conv"); - std::cout << benchmark(kernel, info) << std::endl; + // enqueue + conv.enqueue(stream, {&a, &b, &c, bias}); } }; diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index 09b9f47b4..6d5cbb414 100644 --- a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -33,7 +33,6 @@ class DotOp : public OpKernel { // get inputs const Tensor& a = context->input(0); const Tensor& b = context->input(1); - const Tensor& locks = context->input(2); // get shapes const int32_t M = a.dim_size(0); const int32_t N = b.dim_size(0); @@ -45,40 +44,13 @@ class DotOp : public OpKernel { // return early if possible if 
(out_shape.num_elements() == 0) return; - // initialize default compute device - triton::jit jit(ctx); // matrix multiplication parameters triton::driver::cu_buffer da(ctx, (CUdeviceptr)a.flat().data(), false); triton::driver::cu_buffer db(ctx, (CUdeviceptr)b.flat().data(), false); triton::driver::cu_buffer dc(ctx, (CUdeviceptr)c->flat().data(), false); - triton::driver::cu_buffer dlocks(ctx, (CUdeviceptr)locks.flat().data(), false); - // benchmark a given matrix multiplication kernel - auto benchmark = [&](triton::driver::kernel* kernel, - triton::jit::launch_information info) { - // launch info - unsigned TM = info.global_range_size[0]; - unsigned TN = info.global_range_size[1]; - unsigned nthreads = info.num_threads; - unsigned GZ = jit.get_int("GZ"); - std::array grid = {(M + TM - 1)/TM, (N + TN - 1)/TN, GZ}; - triton::dnn::gemm::set_arg(kernel, &da, &db, &dc, M, N, K, &dlocks, grid[0], grid[1]); - stream->enqueue(kernel, grid, {nthreads, 1, 1}); - stream->synchronize(); - double ts = triton::tools::bench([&](){stream->enqueue(kernel, grid, {nthreads, 1, 1});}, - [&](){ stream->synchronize(); }, ctx->device()); - return 2.*M*N*K / ts * 1e-3; - }; - std::string src = triton::dnn::gemm::src(false, true, "fp16", "fp16", 1, 1); -// just-in-time compile source-code - jit.autotune("matmul", src.c_str(), benchmark); -// jit.add_module("matmul", src.c_str(), {4, 2, 8, 4, 2, 32, 1, 4, 1, 1, 8, 8, 8, 1}); -// jit.add_module("matmul", src.c_str(), {16, 4, 128, 16, 4, 128, 2, 2, 2, 2, 8, 32, 8, 1}); -// jit.add_module("matmul", src.c_str(), {8, 8, 128, 16, 8, 128, 2, 2, 2, 2, 16, 32, 8, 1 }); -// jit.add_module("matmul", src.c_str(), {16, 4, 128, 16, 4, 128, 2, 2, 2, 2, 8, 16, 8, 1}); - jit.add_module("matmul", src.c_str(), {16, 2, 128, 32, 32, 2, 2, 2, 2, 8, 8, 4, 2, 1}); //NN - triton::driver::kernel* kernel = jit.get_function("matmul"); - triton::jit::launch_information info = jit.get_launch_info("matmul"); - std::cout << benchmark(kernel, info) << std::endl; + // template + triton::dnn::gemm dot(M, N, K, false, true, "fp16", "fp16", 4, 4); + dot.enqueue(stream, {&da, &db, &dc}); } private: @@ -88,6 +60,5 @@ REGISTER_KERNEL_BUILDER(Name("Dot").Device(DEVICE_GPU), DotOp); REGISTER_OP("Dot") .Input("a: float16") .Input("b: float16") - .Input("locks: int32") .Output("c: float32") ; diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 27cb7e5c8..1d5fba379 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -127,5 +127,5 @@ def run_batchnorm(): print(np.max(np.abs(dg_t - dg_n))) print(np.max(np.abs(db_t - db_n))) -run_shift() -#run_batchnorm() +#run_shift() +run_batchnorm() diff --git a/examples/python/tensorflow/shift.cpp b/examples/python/tensorflow/shift.cpp index b2cebbaeb..b5e0dffce 100644 --- a/examples/python/tensorflow/shift.cpp +++ b/examples/python/tensorflow/shift.cpp @@ -19,15 +19,6 @@ using namespace tensorflow; using GPUDevice = Eigen::GpuDevice; -typedef std::tuple shift_key_t; - -static std::map> m_stream; -static std::map> m_jit; -static std::map> m_config; - template class ShiftConvOp : public OpKernel { public: @@ -102,23 +93,15 @@ public: // shift offsets int32_t* shift_h_data = h_shift_h_.flat().data(); int32_t* shift_w_data = h_shift_w_.flat().data(); - std::vector shift_h(shift_h_data, shift_h_data + C); - std::vector shift_w(shift_w_data, shift_w_data + C); - shift_key_t key = {B, C, 1, H, W, 1, R_, S_, F, shift_h_data, shift_w_data, OP, has_bias}; // create configuration - triton::dnn::shift* shift; - 
if(m_config.find(key) == m_config.end()) - shift = m_config.emplace(key, new triton::dnn::shift( - B, C, D, H, W, T, R_, S_, F, - stride_h_, stride_w_, - shift_h, shift_w, - "fp32", "fp32", OP, has_bias)) - .first->second.get(); - else - shift = m_config.at(key).get(); + triton::dnn::shift shift(B, C, D, H, W, T, R_, S_, F, + stride_h_, stride_w_, + shift_h_data, shift_w_data, + "fp32", "fp32", OP, has_bias); + // shapes for c std::vector c_shapes; - for(int32_t x: shift->c_shapes()) + for(int32_t x: shift.c_shapes()) c_shapes.push_back(x); TensorShape out_shapes(c_shapes); Tensor* tf_c = nullptr; @@ -130,7 +113,7 @@ public: triton::driver::cu_buffer da(ctx, (CUdeviceptr)tf_a.flat().data(), false); triton::driver::cu_buffer db(ctx, (CUdeviceptr)tf_b.flat().data(), false); triton::driver::cu_buffer dc(ctx, (CUdeviceptr)tf_c->flat().data(), false); - shift->enqueue(stream, {&da, &db, &dc}); + shift.enqueue(stream, {&da, &db, &dc}); } private: diff --git a/include/triton/dnn/base.h b/include/triton/dnn/base.h index 236cd0c1b..e3c6ff9e1 100644 --- a/include/triton/dnn/base.h +++ b/include/triton/dnn/base.h @@ -29,7 +29,11 @@ namespace triton{ namespace dnn{ + + class base { + friend class cmp_recompile; + protected: // leading dimensions static void set_ld(const std::vector& shapes, @@ -37,21 +41,24 @@ protected: private: // initialize - virtual void init_impl(driver::stream *stream, driver::cu_module *module) = 0; + virtual void init_impl(driver::stream *, driver::cu_module *){ } // enqueue virtual void enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, - size_t TM, size_t TN, size_t nthreads) = 0; + const std::vector& ranges, + size_t nthreads) = 0; + // number of flops + virtual size_t num_flops() const = 0; + // comparison for maps + virtual bool operator<(const base& other) const = 0; public: // constructor base(const std::string& name); - // number of flops - virtual size_t get_nflops() const = 0; // triton-c source - virtual void get_src(std::ostream &os) const = 0; - // comparison for maps - virtual bool operator<(const base& other) const = 0; + virtual void triton_c_src(std::ostream &os) const = 0; + // clone + virtual base* clone() const = 0; // enqueue void enqueue(driver::stream* stream, std::vector args); @@ -59,6 +66,12 @@ private: std::string name_; }; +struct cmp_recompile{ + bool operator()(base* x, base* y) const{ + return *x < *y; + } +}; + } } diff --git a/include/triton/dnn/batchnorm.h b/include/triton/dnn/batchnorm.h index 65f71ce58..df2a2df30 100644 --- a/include/triton/dnn/batchnorm.h +++ b/include/triton/dnn/batchnorm.h @@ -28,23 +28,32 @@ #include #include #include +#include "triton/dnn/base.h" #include "triton/driver/stream.h" #include "triton/driver/kernel.h" namespace triton{ namespace dnn{ -class batchnorm_forward { +class batchnorm_forward: public base { +private: + // enqueue + void enqueue_impl(driver::stream *stream, driver::kernel *kernel, + std::vector args, + const std::vector &ranges, size_t nthreads); + // number of flops + size_t num_flops() const; + // comparison for maps + bool operator<(const base& other) const; + // clone + base* clone() const; + public: // constructor - batchnorm_forward(int C, int D, int H, int W, int B, std::string ty = "fp32"); - // enqueue - void enqueue(driver::stream *stream, driver::kernel *kernel, - driver::buffer *y, driver::buffer *m, driver::buffer *v, - driver::buffer *x, driver::buffer *g, driver::buffer *b, - size_t TM, size_t nthreads); - // triton-c source code - void src(std::ostream 
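Under the base interface introduced above, a DNN template only describes itself: enqueue_impl consumes the tile sizes chosen by the autotuner, num_flops feeds the benchmark, operator< keys the recompilation cache, and clone gives the cache an owned key; init_impl now defaults to a no-op. A minimal sketch of a conforming subclass (a hypothetical axpy op; the class, its kernel and its members are illustrative and not part of the patch):

// Hypothetical example: the smallest op satisfying the refactored interface.
class axpy: public base {
private:
  void enqueue_impl(driver::stream *stream, driver::kernel *kernel,
                    std::vector<driver::buffer*> args,
                    const std::vector<unsigned>& ranges,
                    size_t nthreads) {
    unsigned TM = ranges[0];                         // tile size chosen by the autotuner
    std::array<size_t, 3> grid = {(N_ + TM - 1)/TM, 1, 1};
    kernel->setArg(0, args[0]);
    kernel->setArg(1, args[1]);
    kernel->setArg(2, N_);
    stream->enqueue(kernel, grid, {nthreads, 1, 1});
  }
  size_t num_flops() const { return 2*(size_t)N_; }
  bool operator<(const base& other) const {
    auto *y = dynamic_cast<const axpy*>(&other);
    return y ? N_ < y->N_ : true;
  }
  base* clone() const { return new axpy(*this); }

public:
  axpy(int32_t N): base("axpy"), N_(N) { }
  void triton_c_src(std::ostream &os) const { os << "/* Triton-C kernel here */"; }

private:
  int32_t N_;
};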
&os); + batchnorm_forward(int C, int D, int H, int W, int B, + std::string ty = "fp32", float eps = 1e-5); + // triton-c source + void triton_c_src(std::ostream &os) const; private: int32_t C_; @@ -58,18 +67,25 @@ private: float rcpDHWB_; }; -class batchnorm_backward { +class batchnorm_backward: public base{ +private: + // enqueue + void enqueue_impl(driver::stream *stream, driver::kernel *kernel, + std::vector args, + const std::vector &ranges, size_t nthreads); + // number of flops + size_t num_flops() const; + // comparison for maps + bool operator<(const base& other) const; + // clone + base* clone() const; + public: // constructor - batchnorm_backward(int C, int D, int H, int W, int B, std::string ty = "fp32", float eps = 1e-5); - // enqueue - void enqueue(driver::stream *stream, driver::kernel *kernel, - driver::buffer *dx, driver::buffer *dg, driver::buffer *db, driver::buffer *dy, - driver::buffer *x, driver::buffer *g, driver::buffer *m, driver::buffer *v, - size_t TM, size_t nthreads); - // triton-c source code - void src(std::ostream &os); - + batchnorm_backward(int C, int D, int H, int W, int B, + std::string ty = "fp32", float eps = 1e-5); + // triton-c source + void triton_c_src(std::ostream &os) const; private: int32_t C_; diff --git a/include/triton/dnn/conv.h b/include/triton/dnn/conv.h index 6a590f201..67d621050 100644 --- a/include/triton/dnn/conv.h +++ b/include/triton/dnn/conv.h @@ -4,11 +4,12 @@ #include #include "triton/driver/stream.h" #include "triton/driver/kernel.h" +#include "triton/dnn/base.h" namespace triton{ namespace dnn{ -class conv { +class conv: public base{ public: enum type { FPROP, @@ -17,11 +18,29 @@ public: }; private: - void set_ld(const std::vector& shapes, - std::vector& ld); - + // initialize std::tuple unpack(int32_t ltrs, bool flip, int32_t EBD, int32_t EBH, int32_t EBW); + void build_b_deltas(); + void build_a_deltas(); + void build_masks(); + void init_impl(driver::stream *, driver::cu_module *); + + // enqueue + std::array get_grid(size_t TM, size_t TN); + void set_arg(driver::kernel *kernel, + driver::buffer *a, driver::buffer *b, driver::buffer *c, + driver::buffer *bias); + void enqueue_impl(driver::stream *stream, driver::kernel *kernel, + std::vector args, + const std::vector& ranges, + size_t nthreads); + // number of flops + size_t num_flops() const; + // comparison for maps + bool operator<(const base& other) const; + // clone + base* clone() const; public: @@ -39,35 +58,17 @@ public: size_t b_size(); size_t c_size(); std::vector c_shapes(); - - // initialize - void build_b_deltas(); - void build_deltas(); - void build_masks(); - void init(driver::stream *stream, driver::cu_module *module); - std::array get_grid(size_t TM, size_t TN); - void set_arg(driver::kernel *kernel, - driver::buffer *a, driver::buffer *b, driver::buffer *c, - driver::buffer *bias); - void enqueue(driver::stream *stream, driver::kernel *kernel, - driver::buffer *a, driver::buffer *b, driver::buffer *c, - driver::buffer *bias, - size_t TM, size_t TN, size_t GZ, size_t nthreads); - - // utilities - size_t get_nflops(); + // default params std::vector default_params(); - // source - void src(std::ostream &os); + // triton-c source code + void triton_c_src(std::ostream &os) const; - // cpu check + // cpu reference implementations template void cpu_xprop(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B); - template void cpu_wgrad(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B); - template void cpu_ref(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B); diff --git a/include/triton/dnn/gemm.h 
b/include/triton/dnn/gemm.h index bd55d030a..26ed7d68a 100644 --- a/include/triton/dnn/gemm.h +++ b/include/triton/dnn/gemm.h @@ -1,27 +1,44 @@ #include "triton/driver/stream.h" #include "triton/driver/kernel.h" +#include "triton/dnn/base.h" #include namespace triton{ namespace dnn{ -class gemm { +class gemm: public base { +private: + // initialize + void init_impl(driver::stream *, driver::cu_module *); + // enqueue + void enqueue_impl(driver::stream *stream, driver::kernel *kernel, + std::vector args, + const std::vector& ranges, + size_t nthreads); + // number of flops + size_t num_flops() const; + // comparison for maps + bool operator<(const base& other) const; + + public: - static void init(driver::stream* stream, driver::buffer* locks); + gemm(int M, int N, int K, bool AT, bool BT, + std::string a_ty, std::string b_ty, + unsigned alignment_lda, unsigned alignment_ldb); - static void set_arg(driver::kernel *kernel, - driver::buffer *a, driver::buffer *b, driver::buffer *c, - int32_t M, int32_t N, int32_t K, - driver::buffer *locks, int32_t grid_0, int32_t grid_1); + // triton-c source + void triton_c_src(std::ostream &os) const; - static std::vector default_params(bool AT, bool BT); + // clone + base* clone() const; - static std::string src(bool AT, bool BT, - std::string a_ty, std::string b_ty, - unsigned alignment_lda, unsigned alignment_ldb); + // default params + std::vector default_params(); + // CPU reference implementation template - static void cpu_ref(std::vector &c, const std::vector &a, const std::vector &b, size_t M, size_t N, size_t K){ + static void cpu_ref(std::vector &c, const std::vector &a, const std::vector &b, + size_t M, size_t N, size_t K){ for(size_t m = 0; m < M; m++) for(size_t n = 0; n < N; n++){ T acc = 0; @@ -30,18 +47,29 @@ public: c[m + n*M] = acc; } } - template - static void cpu_ref(bool AT, bool BT, std::vector &c, const std::vector &a, const std::vector &b, size_t M, size_t N, size_t K) { - if(AT && BT) - gemm::cpu_ref(c, a, b, M, N, K); - else if(AT && !BT) - gemm::cpu_ref(c, a, b, M, N, K); - else if(!AT && BT) - gemm::cpu_ref(c, a, b, M, N, K); + void cpu_ref(std::vector &c, const std::vector &a, const std::vector &b) { + if(AT_ && BT_) + gemm::cpu_ref(c, a, b, M_, N_, K_); + else if(AT_ && !BT_) + gemm::cpu_ref(c, a, b, M_, N_, K_); + else if(!AT_ && BT_) + gemm::cpu_ref(c, a, b, M_, N_, K_); else - gemm::cpu_ref(c, a, b, M, N, K); + gemm::cpu_ref(c, a, b, M_, N_, K_); } + +private: + int32_t M_; + int32_t N_; + int32_t K_; + bool AT_; + bool BT_; + std::string a_ty_; + std::string b_ty_; + unsigned align_lda_; + unsigned align_ldb_; + driver::buffer *locks_; }; } diff --git a/include/triton/dnn/shift.h b/include/triton/dnn/shift.h index 8683c9879..be30da1eb 100644 --- a/include/triton/dnn/shift.h +++ b/include/triton/dnn/shift.h @@ -46,14 +46,12 @@ public: }; private: - // leading dimensions - void set_ld(const std::vector& shapes, - std::vector& ld); // initialize and enqueue void init_impl(driver::stream *stream, driver::cu_module *module); void enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, - size_t TM, size_t TN, size_t nthreads); + const std::vector& ranges, + size_t nthreads); public: @@ -61,7 +59,7 @@ public: int D, int H, int W, int T, int R, int S, int NF, int stride_h, int stride_w, - const std::vector &shift_h, const std::vector &shift_w, + const int32_t* shift_h, const int32_t* shift_w, std::string a_ty = "fp32", std::string b_ty = "fp32", type ty = FPROP, bool bias = false); @@ -74,11 +72,13 @@ 
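Since cpu_ref is now a member that reads AT_/BT_/M_/N_/K_ from the instance, a correctness check no longer has to re-thread the layout flags through template arguments. A usage sketch (the host vectors and the blocking stream read are assumptions for illustration, not APIs shown in this patch):

// Sketch only: validate the device result against the CPU reference.
triton::dnn::gemm dot(M, N, K, false, true, "fp32", "fp32", 1, 1);
dot.enqueue(stream, {&da, &db, &dc});
std::vector<float> ha(M*K), hb(K*N);  // assumed to hold the data uploaded to da/db
std::vector<float> hc(M*N), rc(M*N);
stream->read(&dc, true, 0, hc);       // assumed blocking read, for illustration
dot.cpu_ref(rc, ha, hb);              // dispatches on AT_/BT_ internally
for(size_t i = 0; i < hc.size(); i++)
  if(std::abs(hc[i] - rc[i]) > 1e-4f)
    std::cout << "mismatch at " << i << std::endl;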
public: size_t c_size(); std::vector c_shapes(); // number of flops - size_t get_nflops() const; + size_t num_flops() const; // source - void get_src(std::ostream &os) const; + void triton_c_src(std::ostream &os) const; // comparison bool operator<(const base& other) const; + // clone + base* clone() const; // cpu reference template void cpu_ref(OUT_DTYPE* O, @@ -143,8 +143,8 @@ private: std::vector ld_b_; std::vector ld_c_; // shift values - std::vector shift_h_; - std::vector shift_w_; + const int32_t* shift_h_; + const int32_t* shift_w_; // look-up tables std::vector h_deltas_; std::vector h_masks_; diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index 7bd5a9a0a..710794925 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -6,38 +6,43 @@ namespace triton{ namespace dnn{ -struct cmp_recompile{ - bool operator()(base* x, base* y) const{ - return *x < *y; - } -}; + + + +void base::set_ld(const std::vector& shapes, + std::vector& ld) { + size_t size = shapes.size(); + ld.resize(size); + ld[size - 1] = 1; + for(int i = size - 1; i >= 1; i--) + ld[i - 1] = shapes[i] * ld[i]; +} + base::base(const std::string& name) : name_(name) { } void base::enqueue(driver::stream *stream, std::vector args) { - static std::map, cmp_recompile> m_jit; + static std::map, cmp_recompile> m_jit; bool autotune = false; driver::context* ctx = stream->context(); triton::jit* jit; /* the current template has not already been compiled */ if(m_jit.find(this) == m_jit.end()) { - jit = m_jit.emplace(this, new triton::jit(ctx)).first->second.get(); + jit = m_jit.emplace(this->clone(), new triton::jit(ctx)).first->second.get(); std::ostringstream oss; - get_src(oss); + triton_c_src(oss); std::string src = oss.str(); auto benchmark = [&](triton::driver::kernel* kernel, triton::jit::launch_information info) { // launch info - unsigned TM = info.global_range_size[0]; - unsigned TN = info.global_range_size[1]; unsigned nthreads = info.num_threads; init_impl(stream, (triton::driver::cu_module*)kernel->module()); - enqueue_impl(stream, kernel, args, TM, TN, nthreads); + enqueue_impl(stream, kernel, args, info.global_range_size, nthreads); stream->synchronize(); - double ts = triton::tools::bench([&](){ enqueue_impl(stream, kernel, args, TM, TN, nthreads); }, + double ts = triton::tools::bench([&](){ enqueue_impl(stream, kernel, args, info.global_range_size, nthreads); }, [&](){ stream->synchronize(); }, ctx->device()); - return get_nflops() / ts * 1e-3; + return num_flops() / ts * 1e-3; }; // auto-tune and save result if(autotune) { @@ -57,12 +62,9 @@ void base::enqueue(driver::stream *stream, std::vector args) { /* get launch parameters */ driver::kernel* kernel = jit->get_function(name_.c_str()); triton::jit::launch_information info = jit->get_launch_info(name_.c_str()); - unsigned TM = info.global_range_size[0]; - unsigned TN = info.global_range_size[1]; - unsigned nthreads = info.num_threads; - /* launch */ - enqueue_impl(stream, kernel, args, TM, TN, nthreads); + enqueue_impl(stream, kernel, args, + info.global_range_size, info.num_threads); } } diff --git a/lib/dnn/batchnorm.cpp b/lib/dnn/batchnorm.cpp index a8e91bf8e..3085a5b44 100644 --- a/lib/dnn/batchnorm.cpp +++ b/lib/dnn/batchnorm.cpp @@ -29,17 +29,36 @@ namespace dnn{ * Forward * --------------- */ -batchnorm_forward::batchnorm_forward(int C, int D, int H, int W, int B, std::string ty) - : C_(C), D_(D), H_(H), W_(W), B_(B), ty_(ty), eps_(1e-5) { +batchnorm_forward::batchnorm_forward(int C, int D, int H, int W, int B, std::string ty, float eps) + : 
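The important change in base.cpp above is that op descriptors become cheap value types at the call site: base::enqueue looks the descriptor up in a static map ordered by cmp_recompile and only clones it in, compiles and caches a jit when no equivalent configuration has been seen. Concretely (hypothetical shapes, sketch only):

// Two stack-allocated templates with identical parameters are equivalent
// under cmp_recompile (!(a < b) && !(b < a)), so the second enqueue is a
// cache hit; clone() keeps the map key alive after `op` goes out of scope.
{
  triton::dnn::gemm op(8192, 8192, 8192, false, true, "fp16", "fp16", 4, 4);
  op.enqueue(stream, {&da, &db, &dc});   // compiles once, cached under op.clone()
}
{
  triton::dnn::gemm op(8192, 8192, 8192, false, true, "fp16", "fp16", 4, 4);
  op.enqueue(stream, {&da, &db, &dc});   // equivalent key: no recompilation
}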
base("batchnorm"), + C_(C), D_(D), H_(H), W_(W), B_(B), ty_(ty), eps_(eps) { DHWB_ = D_*H_*W_*B_; rcpDHWB_ = (float)1 / DHWB_; } -void batchnorm_forward::enqueue(driver::stream *stream, driver::kernel *kernel, - driver::buffer *y, driver::buffer *m, driver::buffer *v, - driver::buffer *x, driver::buffer *g, driver::buffer *b, - size_t, size_t nthreads) { +size_t batchnorm_forward::num_flops() const { + return C_*DHWB_; +} +bool batchnorm_forward::operator <(const base& other) const { + auto *y = dynamic_cast(&other); + if(!y) + return true; + return std::tie(C_, D_, H_, W_, B_, ty_) + < std::tie(y->C_, y->D_, y->H_, y->W_, y->B_, y->ty_); +} + +base* batchnorm_forward::clone() const { + return new batchnorm_forward(*this); +} + +void batchnorm_forward::enqueue_impl(driver::stream *stream, driver::kernel *kernel, + std::vector args, + const std::vector&, + size_t nthreads) +{ + driver::buffer *y = args[0], *m = args[1], *v = args[2]; + driver::buffer *x = args[3], *g = args[4], *b = args[5]; std::array grid = {(size_t)C_, 1, 1}; kernel->setArg(0, y); kernel->setArg(1, m); @@ -53,7 +72,7 @@ void batchnorm_forward::enqueue(driver::stream *stream, driver::kernel *kernel, stream->enqueue(kernel, grid, {nthreads, 1, 1}); } -void batchnorm_forward::src(std::ostream &os) { +void batchnorm_forward::triton_c_src(std::ostream &os) const { os << R"( const tunable int32 TM = {32, 64, 128}; @@ -113,14 +132,32 @@ void batchnorm(fp32 *Y, fp32 *M, fp32 *V, * --------------- */ batchnorm_backward::batchnorm_backward(int C, int D, int H, int W, int B, std::string ty, float eps) - : C_(C), D_(D), H_(H), W_(W), B_(B), ty_(ty), eps_(eps) + : base("batchnorm"), + C_(C), D_(D), H_(H), W_(W), B_(B), + ty_(ty), eps_(eps) { } -void batchnorm_backward::enqueue(driver::stream *stream, driver::kernel *kernel, - driver::buffer *dx, driver::buffer *dg, driver::buffer *db, driver::buffer *dy, - driver::buffer *x, driver::buffer *g, driver::buffer *m, driver::buffer *v, - size_t, size_t nthreads) { +size_t batchnorm_backward::num_flops() const { + return C_*D_*H_*W_*B_; +} +bool batchnorm_backward::operator <(const base& other) const { + auto *y = dynamic_cast(&other); + if(!y) + return true; + return std::tie(C_, D_, H_, W_, B_, ty_) + < std::tie(y->C_, y->D_, y->H_, y->W_, y->B_, y->ty_); +} + +base* batchnorm_backward::clone() const { + return new batchnorm_backward(*this); +} + +void batchnorm_backward::enqueue_impl(driver::stream *stream, driver::kernel *kernel, + std::vector args, + const std::vector &, size_t nthreads) { + driver::buffer *dx = args[0], *dg = args[1], *db = args[2], *dy = args[3]; + driver::buffer *x = args[4], *g = args[5], *m = args[6], *v = args[7]; std::array grid = {1, (size_t)C_, 1}; kernel->setArg(0, dx); kernel->setArg(1, dg); @@ -136,7 +173,7 @@ void batchnorm_backward::enqueue(driver::stream *stream, driver::kernel *kernel, stream->enqueue(kernel, grid, {nthreads, 1, 1}); } -void batchnorm_backward::src(std::ostream &os) { +void batchnorm_backward::triton_c_src(std::ostream &os) const { os << R"( const tunable int32 TM = {32, 64, 128}; diff --git a/lib/dnn/conv.cpp b/lib/dnn/conv.cpp index c67d132c8..011cd7a53 100644 --- a/lib/dnn/conv.cpp +++ b/lib/dnn/conv.cpp @@ -4,17 +4,6 @@ namespace triton{ namespace dnn{ -void conv::set_ld(const std::vector& shapes, - std::vector& ld) { - size_t size = shapes.size(); - ld.resize(size); - ld[4] = 1; - ld[3] = shapes[4]*ld[4]; - ld[2] = shapes[3]*ld[3]; - ld[1] = shapes[2]*ld[2]; - ld[0] = shapes[1]*ld[1]; -} - conv::conv(int B, int NC, int D, int H, 
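One caveat about the comparators above: since the jit map in base::enqueue is shared by every op type, returning true whenever the dynamic_cast fails makes both x < y and y < x hold for two ops of different dynamic types, which violates the strict weak ordering std::map requires. A type-aware guard would restore the invariant (illustrative alternative, not what this patch does):

#include <typeinfo>

// Sketch: order on the dynamic type first so cross-type comparisons are
// consistent, then compare the shape tuple as above.
bool batchnorm_forward::operator<(const base& other) const {
  if(typeid(*this) != typeid(other))
    return typeid(*this).before(typeid(other));
  auto *y = static_cast<const batchnorm_forward*>(&other);
  return std::tie(C_, D_, H_, W_, B_, ty_)
       < std::tie(y->C_, y->D_, y->H_, y->W_, y->B_, y->ty_);
}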
int W, int T, int R, int S, int NF, @@ -23,7 +12,8 @@ conv::conv(int B, int NC, int upsample_d, int upsample_h, int upsample_w, std::string a_ty, std::string b_ty, type ty, bool bias) - : NB_(B), NC_(NC), AD_(D), AH_(H), AW_(W), BD_(T), BH_(R), BW_(S), NF_(NF), + : base("conv"), + NB_(B), NC_(NC), AD_(D), AH_(H), AW_(W), BD_(T), BH_(R), BW_(S), NF_(NF), stride_d_(stride_d), stride_h_(stride_h), stride_w_(stride_w), pad_d_(pad_d), pad_h_(pad_h), pad_w_(pad_w), upsample_d_(upsample_d), upsample_h_(upsample_h), upsample_w_(upsample_w), @@ -93,7 +83,7 @@ conv::conv(int B, int NC, Fs_ = K_; TK_ = 8; Luts_ = (TK_ + Fs_ - 1) / Fs_ * Fs_; - build_deltas(); + build_a_deltas(); if(b_lut_) build_b_deltas(); build_masks(); @@ -107,6 +97,28 @@ conv::conv(int B, int NC, max_grid_1_ = 256; } +// comparison for maps +bool conv::operator<(const base& other) const { + auto *y = dynamic_cast(&other); + if(!y) + return true; + return std::tie(NB_, NC_, AD_, AH_, AW_, + NF_, BD_, BH_, BW_, + pad_d_, pad_h_, pad_w_, + stride_d_, stride_h_, stride_w_, + a_ty_, b_ty_, ty_, bias_) + < std::tie(y->NB_, y->NC_, y->AD_, y->AH_, y->AW_, + y->NF_, y->BD_, y->BH_, y->BW_, + y->pad_d_, y->pad_h_, y->pad_w_, + y->stride_d_, y->stride_h_, y->stride_w_, + y->a_ty_, y->b_ty_, y->ty_, y->bias_); +} + +// clone +base* conv::clone() const { + return new conv(*this); +} + size_t conv::a_size() { return std::accumulate(shapes_a_.begin(), shapes_a_.end(), 1, std::multiplies()); } @@ -176,7 +188,7 @@ void conv::build_b_deltas(){ } } -void conv::build_deltas(){ +void conv::build_a_deltas(){ h_a_deltas_.resize(Luts_ + upsample_d_*upsample_h_*upsample_w_*Luts_); for(size_t i = 0; i < Luts_; ++i) h_a_deltas_[i] = (((i + TK_) % Luts_) - i); @@ -258,13 +270,15 @@ void conv::build_masks(){ h_masks_[i] = 0x0; } -std::array conv::get_grid(size_t TM, size_t TN) -{ return {(M_ + TM - 1)/TM, (N_ + TN - 1)/TN, 1}; } +std::array conv::get_grid(size_t TM, size_t TN){ + return {(M_ + TM - 1)/TM, (N_ + TN - 1)/TN, 1}; +} -size_t conv::get_nflops() -{ return 2.*M_*N_*K_; } +size_t conv::num_flops() const{ + return 2.*M_*N_*K_; +} -void conv::init(driver::stream *stream, triton::driver::cu_module* module) { +void conv::init_impl(driver::stream *stream, triton::driver::cu_module* module) { auto init_lut = [&](bool is_cst, const char *name, std::vector host) -> triton::driver::buffer*{ if(host.empty()) return nullptr; @@ -349,9 +363,13 @@ void conv::set_arg(driver::kernel *kernel, kernel->setArg(idx++, d_masks_); } -void conv::enqueue(driver::stream *stream, driver::kernel *kernel, - driver::buffer *a, driver::buffer *b, driver::buffer *c, driver::buffer *bias, - size_t TM, size_t TN, size_t GZ, size_t nthreads) { +void conv::enqueue_impl(driver::stream *stream, driver::kernel *kernel, + std::vector args, + const std::vector& ranges, + size_t nthreads) { + driver::buffer *a = args[0], *b = args[1], *c = args[2], *bias = args[3]; + unsigned TM = ranges[0], TN = ranges[1]; + unsigned GZ = 1; set_arg(kernel, a, b, c, bias); std::array grid = {1}; grid[0] = (M_ + TM - 1)/TM; @@ -411,6 +429,8 @@ std::vector conv::default_params() { } +/* CPU reference implementation */ + template void conv::cpu_xprop(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B) { @@ -496,7 +516,9 @@ void conv::cpu_ref(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B) cpu_wgrad(C, A, B); } -void conv::src(std::ostream &os){ +/* Triton-C source code */ + +void conv::triton_c_src(std::ostream &os) const { std::string BS = b_trans_ ? "[TN,TK]" : "[TK, TN]"; std::string bcb0 = b_trans_ ? 
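The hunk above only shows the head of init_impl's init_lut lambda. Inferred from shift::init_impl and gemm::init_impl elsewhere in this patch, such a helper plausibly binds a constant-memory symbol from the compiled module when is_cst is set, otherwise allocates a device buffer, then uploads the host-side look-up table; a paraphrased sketch (the elided body may differ):

// Paraphrased sketch only; the actual lambda body is elided by the hunk.
auto init_lut = [&](bool is_cst, const char *name,
                    const std::vector<int32_t>& host) -> triton::driver::buffer* {
  if(host.empty())
    return nullptr;
  triton::driver::buffer* buf = is_cst
      ? module->symbol(name)                                         // constant memory
      : triton::driver::buffer::create(stream->context(), host.size()*4);
  stream->write(buf, false, 0, host.size()*4, host.data());
  return buf;
};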
"[:, newaxis]" : "[newaxis, :]"; std::string bcb1 = b_trans_ ? "[newaxis, :]" : "[:, newaxis]"; @@ -526,7 +548,7 @@ void conv::src(std::ostream &os){ R"( const tunable int32 TM = {16, 32, 64}; const tunable int32 TN = {16, 32, 64}; -const tunable int32 TK = {16}; +const tunable int32 TK = {)" << TK_ << R"(}; const tunable int32 GZ = {1}; )"; if(is_a_deltas_cst) diff --git a/lib/dnn/gemm.cpp b/lib/dnn/gemm.cpp index 6aebf318e..6ea1a8c21 100644 --- a/lib/dnn/gemm.cpp +++ b/lib/dnn/gemm.cpp @@ -6,64 +6,103 @@ namespace triton{ namespace dnn{ +gemm::gemm(int M, int N, int K, + bool AT, bool BT, + std::string a_ty, std::string b_ty, + unsigned alignment_lda, unsigned alignment_ldb) + : base("matmul"), + M_(M), N_(N), K_(K), AT_(AT), BT_(BT), + a_ty_(a_ty), b_ty_(b_ty), + align_lda_(alignment_lda), align_ldb_(alignment_ldb), + locks_(nullptr) { -void gemm::init(driver::stream* stream, driver::buffer* locks) { - std::vector hlocks(2048, 0); - stream->write(locks, false, 0, hlocks); } -void gemm::set_arg(driver::kernel *kernel, - driver::buffer *a, driver::buffer *b, driver::buffer *c, - int32_t M, int32_t N, int32_t K, - driver::buffer *locks, int32_t grid_0, int32_t grid_1) { +size_t gemm::num_flops() const { + return 2.*M_*N_*K_; +} + +// comparison for maps +bool gemm::operator<(const base& other) const { + auto *y = dynamic_cast(&other); + if(!y) + return true; + return std::tie(M_, N_, K_, AT_, BT_, + a_ty_, b_ty_, align_lda_, align_ldb_) + < std::tie(y->M_, y->N_, y->K_, y->AT_, y->BT_, + y->a_ty_, y->b_ty_, y->align_lda_, y->align_ldb_); +} + +// clone +base* gemm::clone() const { + return new gemm(*this); +} + +void gemm::init_impl(driver::stream* stream, driver::cu_module *) { + std::vector hlocks(2048, 0); + if(locks_ == nullptr) + locks_ = triton::driver::buffer::create(stream->context(), hlocks.size()*4); + stream->write(locks_, false, 0, hlocks); +} + +void gemm::enqueue_impl(driver::stream *stream, driver::kernel *kernel, + std::vector args, + const std::vector& ranges, + size_t nthreads) { + driver::buffer *a = args[0], *b = args[1], *c = args[2]; + unsigned TM = ranges[0]; + unsigned TN = ranges[1]; + unsigned grid_0 = (M_ + TM - 1)/TM; + unsigned grid_1 = (N_ + TN - 1)/TN; + unsigned grid_2 = 1; + std::array grid = {grid_0, grid_1, grid_2}; kernel->setArg(0, a); kernel->setArg(1, b); kernel->setArg(2, c); - kernel->setArg(3, M); - kernel->setArg(4, N); - kernel->setArg(5, K); - kernel->setArg(6, M); - kernel->setArg(7, N); - kernel->setArg(8, M); - kernel->setArg(9, locks); + kernel->setArg(3, M_); + kernel->setArg(4, N_); + kernel->setArg(5, K_); + kernel->setArg(6, M_); + kernel->setArg(7, N_); + kernel->setArg(8, M_); + kernel->setArg(9, locks_); kernel->setArg(10, grid_0); kernel->setArg(11, grid_1); + stream->enqueue(kernel, grid, {nthreads, 1, 1}); } -std::vector gemm::default_params(bool AT, bool BT) { - if(AT && BT) +std::vector gemm::default_params() { + if(AT_ && BT_) return {32, 64, 32, 64, 16, 8, 2, 2, 4, 2, 8, 4, 2, 1}; - else if(AT && !BT) + else if(AT_ && !BT_) return {32, 64, 32, 64, 16, 8, 2, 2, 4, 2, 8, 4, 2, 1}; - else if(!AT && BT) + else if(!AT_ && BT_) return {16, 2, 64, 16, 2, 64, 16, 8, 2, 2, 8, 8, 8, 1}; else return {16, 2, 128, 32, 32, 32, 4, 2, 2, 8, 8, 4, 2, 1}; } -std::string gemm::src(bool AT, bool BT, - std::string a_ty, std::string b_ty, - unsigned align_lda, unsigned align_ldb) { +void gemm::triton_c_src(std::ostream &os) const { std::string AS0 = "TM", AS1 = "TK"; std::string BS0 = "TK", BS1 = "TN"; std::string bca0 = "[newaxis, :]", bca1 = "[:, 
newaxis]"; std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; std::string lda0 = "*lda", lda1 = ""; std::string ldb0 = "", ldb1 = "*ldb"; - std::string usea = AT ? "trans(a)" : "a"; - std::string useb = BT ? "trans(b)" : "b"; - if(AT){ + std::string usea = AT_ ? "trans(a)" : "a"; + std::string useb = BT_ ? "trans(b)" : "b"; + if(AT_){ std::swap(AS0, AS1); std::swap(bca0, bca1); std::swap(lda0, lda1); } - if(BT){ + if(BT_){ std::swap(BS0, BS1); std::swap(bcb0, bcb1); std::swap(ldb0, ldb1); } - std::string align_lda_str = "multiple_of(" + std::to_string(align_lda) + ")"; - std::string align_ldb_str = "multiple_of(" + std::to_string(align_ldb) + ")"; + std::string align_lda_str = "multiple_of(" + std::to_string(align_lda_) + ")"; + std::string align_ldb_str = "multiple_of(" + std::to_string(align_ldb_) + ")"; std::string res = R"( const tunable int32 TM = {16, 32, 64, 128}; @@ -71,8 +110,8 @@ const tunable int32 TN = {16, 32, 64, 128}; const tunable int32 TK = {8}; const tunable int32 GZ = {1}; -void matmul(restrict read_only )" + a_ty + R"( *A, - restrict read_only )" + b_ty + R"( *B, +void matmul(restrict read_only )" + a_ty_ + R"( *A, + restrict read_only )" + b_ty_ + R"( *B, fp32 *C, int32 M, int32 N, int32 K, )" + align_lda_str + R"( int32 lda, )" + align_ldb_str + R"(" int32 ldb, int32 ldc, @@ -87,10 +126,10 @@ void matmul(restrict read_only )" + a_ty + R"( *A, int32 rem = K % GZ; K = select(rz < rem, div - 1, div); int32 offk = select(rz < rem, rz*(div + 1), rz*div + rem); - )" + a_ty + R"(* pa[)" + AS0 + ", " + AS1 + "] = A + (offk + rka" + bca0 + ")" + lda0 + " + rxa" + bca1 + lda1 + R"(; - )" + b_ty + R"(* pb[)" + BS0 + ", " + BS1 + "] = B + (offk + rkb" + bcb0 + ")" + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; - )" + a_ty + R"( a[)" + AS0 + ", " + AS1 + R"(] = *pa; - )" + b_ty + R"( b[)" + BS0 + ", " + BS1 + R"(] = *pb; + )" + a_ty_ + R"(* pa[)" + AS0 + ", " + AS1 + "] = A + (offk + rka" + bca0 + ")" + lda0 + " + rxa" + bca1 + lda1 + R"(; + )" + b_ty_ + R"(* pb[)" + BS0 + ", " + BS1 + "] = B + (offk + rkb" + bcb0 + ")" + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; + )" + a_ty_ + R"( a[)" + AS0 + ", " + AS1 + R"(] = *pa; + )" + b_ty_ + R"( b[)" + BS0 + ", " + BS1 + R"(] = *pb; int32 last_a = ((M*K - 1) - (TM*TK + 1)) / lda; int32 last_b = ((K*N - 1) - (TN*TK + 1)) / ldb; last_a = last_a / TK * TK; @@ -108,10 +147,10 @@ void matmul(restrict read_only )" + a_ty + R"( *A, for(int32 k = bound; k > 0; k = k - 1){ int1 checka[TM, 1] = rxc[:, newaxis] < M; int1 checkb[TN, 1] = ryc[:, newaxis] < N; - )" + a_ty + R"(* pa[TM, 1] = A + (offk + K - k))" + lda0 + " + rxc[:, newaxis]" + lda1 + R"(; - )" + b_ty + R"(* pb[TN, 1] = B + (offk + K - k))" + ldb0 + " + ryc[:, newaxis]" + ldb1 + R"(; - )" + a_ty + R"( a[TM, 1] = checka ? *pa : 0; - )" + b_ty + R"( b[TN, 1] = checkb ? *pb : 0; + )" + a_ty_ + R"(* pa[TM, 1] = A + (offk + K - k))" + lda0 + " + rxc[:, newaxis]" + lda1 + R"(; + )" + b_ty_ + R"(* pb[TN, 1] = B + (offk + K - k))" + ldb0 + " + ryc[:, newaxis]" + ldb1 + R"(; + )" + a_ty_ + R"( a[TM, 1] = checka ? *pa : 0; + )" + b_ty_ + R"( b[TN, 1] = checkb ? 
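Two remarks on the generated source above. First, the fragment R"(" int32 ldb carries a stray double-quote that leaks a literal " into the emitted kernel between the multiple_of attribute and int32 ldb. Second, the div/rem/offk arithmetic partitions the reduction dimension into GZ contiguous slices; the offsets imply the first rem slices take div + 1 elements, so the div - 1 in the select above looks inconsistent (presumably div + 1 was intended). A host-side illustration of the partition:

#include <cstdio>

// Illustration only: the split-K partition implied by offk.
// With K = 10, GZ = 4 the slices cover [0,3), [3,6), [6,8), [8,10).
void print_splitk(int K, int GZ) {
  int div = K / GZ, rem = K % GZ;
  for(int rz = 0; rz < GZ; rz++) {
    int offk = (rz < rem) ? rz*(div + 1) : rz*div + rem;
    int len  = (rz < rem) ? div + 1      : div;
    printf("slice %d: offk = %d, len = %d\n", rz, offk, len);
  }
}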
*pb : 0; c = dot(a, trans(b), c); } int32 ridx = get_range_id(0); @@ -136,7 +175,7 @@ void matmul(restrict read_only )" + a_ty + R"( *A, __atomic_cas(plock, 1, 0); } )"; - return res; + os << res; } } diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index cabf99ed2..2934017b7 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -5,21 +5,13 @@ namespace triton{ namespace dnn{ -void shift::set_ld(const std::vector& shapes, - std::vector& ld) { - size_t size = shapes.size(); - ld.resize(size); - ld[size - 1] = 1; - for(int i = size - 1; i >= 1; i--) - ld[i - 1] = shapes[i] * ld[i]; -} shift::shift(int B, int C, int D, int H, int W, int T, int R, int S, int F, int stride_h, int stride_w, - const std::vector& shift_h, const std::vector& shift_w, + const int32_t *shift_h, const int32_t *shift_w, std::string a_ty, std::string b_ty, type ty, bool bias) : base("shift"), @@ -73,8 +65,10 @@ shift::shift(int B, int C, set_ld(shapes_a_, ld_a_); set_ld(shapes_b_, ld_b_); set_ld(shapes_c_, ld_c_); - // build LUTs - build_deltas(); +} + +base* shift::clone() const { + return new shift(*this); } void shift::build_deltas() { @@ -120,30 +114,29 @@ std::vector shift::c_shapes(){ return shapes_c_; } -size_t shift::get_nflops() const { +size_t shift::num_flops() const { return 2.*M_*N_*K_; } bool shift::operator <(const base& other) const{ auto *y = dynamic_cast(&other); if(!y) - return false; - const int32_t *x_shift_h = shift_h_.data(), *x_shift_w = shift_w_.data(); - const int32_t *y_shift_h = y->shift_h_.data(), *y_shift_w = y->shift_w_.data(); + return true; return std::tie(B_, C_, AD_, AH_, AW_, BD_, BH_, BW_, F_, - x_shift_h, x_shift_w, ty_, bias_) + shift_h_, shift_w_, ty_, bias_) < std::tie(y->B_, y->C_, y->AD_, y->AH_, y->AW_, y->BD_, y->BH_, y->BW_, y->F_, - y_shift_h, y_shift_w, y->ty_, y->bias_); + y->shift_h_, y->shift_w_, y->ty_, y->bias_); } void shift::init_impl(driver::stream *stream, driver::cu_module *module) { + build_deltas(); triton::driver::buffer* delta = ((triton::driver::cu_module*)module)->symbol("delta"); stream->write(delta, false, 0, h_deltas_.size()*4, h_deltas_.data()); } void shift::enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, - size_t TM, size_t TN, size_t nthreads) { + const std::vector &ranges, size_t nthreads) { int32_t lda = AT_ ? K_ : M_; int32_t ldb = BT_ ? 
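The locks_ buffer that gemm::init_impl zero-fills above backs one spin lock per output tile: with GZ > 1, several CTAs own slices of the same C tile and serialize their accumulation through it, releasing via the __atomic_cas(plock, 1, 0) visible at the end of the kernel source. A host-side analogue of the protocol (illustration only; the acquire side is elided by the hunk above):

#include <atomic>

// Host-side analogue of the per-tile lock (on device this is __atomic_cas
// on one int32 slot of the locks buffer, indexed by the tile's range ids).
void accumulate_with_lock(std::atomic<int>& lock) {
  int expected = 0;
  while(!lock.compare_exchange_strong(expected, 1))   // acquire: 0 -> 1
    expected = 0;
  // ... read-modify-write the output tile with this slice's partial sum ...
  lock.store(0);                                      // release: 1 -> 0
}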
N_ : K_; driver::buffer *a = args[0], *b = args[1], *c = args[2]; @@ -162,13 +155,14 @@ void shift::enqueue_impl(driver::stream *stream, driver::kernel *kernel, kernel->setArg(12, AW_); kernel->setArg(13, BH_); kernel->setArg(14, BW_); + unsigned TM = ranges[0], TN = ranges[1]; std::array grid = {(M_ + TM - 1)/TM, (N_ + TN - 1)/TN, 1}; if(ty_ == BPROP) ((driver::cu_buffer*)c)->set_zero(stream, M_*N_*4); stream->enqueue(kernel, grid, {nthreads, 1, 1}); } -void shift::get_src(std::ostream &os) const { +void shift::triton_c_src(std::ostream &os) const { std::string AS0 = "TM", AS1 = "TK"; std::string BS0 = "TK", BS1 = "TN"; std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; From 63b249c1d63f7aedc1c1806949d31677f0971f35 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 9 Jul 2019 20:59:04 -0700 Subject: [PATCH 224/494] [examples/python/pytorch] added batchnorm cpp extension --- examples/python/pytorch/CMakeLists.txt | 2 +- examples/python/pytorch/batchnorm.cpp | 72 ++++++++++ examples/python/pytorch/conv.cpp | 93 ++----------- examples/python/pytorch/shift.cpp | 164 ++++++++++++----------- examples/python/tensorflow/batchnorm.cpp | 88 ++++++------ examples/python/tensorflow/shift.cpp | 1 - 6 files changed, 214 insertions(+), 206 deletions(-) create mode 100644 examples/python/pytorch/batchnorm.cpp diff --git a/examples/python/pytorch/CMakeLists.txt b/examples/python/pytorch/CMakeLists.txt index 759a9709a..5cc6bbb4e 100644 --- a/examples/python/pytorch/CMakeLists.txt +++ b/examples/python/pytorch/CMakeLists.txt @@ -5,6 +5,6 @@ if(${TORCH_FOUND}) include_directories("${CUDA_HOME}/include") link_directories(${TORCH_LIBRARY_DIRS}) add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) - add_library(torch_triton SHARED conv.cpp shift.cpp) + add_library(torch_triton SHARED conv.cpp shift.cpp batchnorm.cpp) target_link_libraries(torch_triton torch triton) endif() diff --git a/examples/python/pytorch/batchnorm.cpp b/examples/python/pytorch/batchnorm.cpp new file mode 100644 index 000000000..2d6818e98 --- /dev/null +++ b/examples/python/pytorch/batchnorm.cpp @@ -0,0 +1,72 @@ +#include +#include +#include "ATen/cuda/CUDAContext.h" +#include "triton/driver/stream.h" +#include "triton/dnn/batchnorm.h" +#include "triton/tools/bench.hpp" + +std::vector + batchnorm_ymv(const torch::Tensor fw_x, + const torch::Tensor fw_g, + const torch::Tensor fw_b, + float eps) { + // Wrap CUDA handles + c10::DeviceIndex device = fw_x.storage().device().index(); + CUstream custream = (CUstream)at::cuda::getCurrentCUDAStream(device).stream(); + triton::driver::cu_stream stream(custream, false); + triton::driver::context* ctx = stream.context(); + // get sizes + int C = fw_x.size(0); + int H = fw_x.size(1); + int W = fw_x.size(2); + int B = fw_x.size(3); + // allocate outputs + torch::Tensor fw_y = torch::empty(fw_x.sizes()).cuda(); + torch::Tensor fw_m = torch::empty(fw_g.sizes()).cuda(); + torch::Tensor fw_v = torch::empty(fw_g.sizes()).cuda(); + triton::driver::cu_buffer x(ctx, (CUdeviceptr)fw_x.storage().data(), false); + triton::driver::cu_buffer g(ctx, (CUdeviceptr)fw_g.storage().data(), false); + triton::driver::cu_buffer b(ctx, (CUdeviceptr)fw_b.storage().data(), false); + triton::driver::cu_buffer y(ctx, (CUdeviceptr)fw_y.storage().data(), false); + triton::driver::cu_buffer m(ctx, (CUdeviceptr)fw_m.storage().data(), false); + triton::driver::cu_buffer v(ctx, (CUdeviceptr)fw_v.storage().data(), false); + // create template + triton::dnn::batchnorm_forward batchnorm(C, 1, H, W, B, "fp32", eps); + 
batchnorm.enqueue(&stream, {&y, &m, &v, &x, &g, &b}); + return {fw_y, fw_m, fw_v}; +} + +std::vector + batchnorm_dxdgdb(const torch::Tensor fw_dy, + const torch::Tensor fw_x, + const torch::Tensor fw_g, + const torch::Tensor fw_m, + const torch::Tensor fw_v, + float eps) { + // Wrap CUDA handles + c10::DeviceIndex device = fw_x.storage().device().index(); + CUstream custream = (CUstream)at::cuda::getCurrentCUDAStream(device).stream(); + triton::driver::cu_stream stream(custream, false); + triton::driver::context* ctx = stream.context(); + // get sizes + int C = fw_x.size(0); + int H = fw_x.size(1); + int W = fw_x.size(2); + int B = fw_x.size(3); + // allocate outputs + torch::Tensor fw_dx = torch::empty(fw_x.sizes()).cuda(); + torch::Tensor fw_dg = torch::empty(fw_g.sizes()).cuda(); + torch::Tensor fw_db = torch::empty(fw_g.sizes()).cuda(); + // triton handles + triton::driver::cu_buffer dy(ctx, (CUdeviceptr)fw_dy.storage().data(), false); + triton::driver::cu_buffer x(ctx, (CUdeviceptr) fw_x.storage().data(), false); + triton::driver::cu_buffer g(ctx, (CUdeviceptr) fw_g.storage().data(), false); + triton::driver::cu_buffer m(ctx, (CUdeviceptr) fw_m.storage().data(), false); + triton::driver::cu_buffer v(ctx, (CUdeviceptr) fw_v.storage().data(), false); + triton::driver::cu_buffer dx(ctx, (CUdeviceptr)fw_dx.storage().data(), false); + triton::driver::cu_buffer dg(ctx, (CUdeviceptr)fw_dg.storage().data(), false); + triton::driver::cu_buffer db(ctx, (CUdeviceptr)fw_db.storage().data(), false); + // create config + triton::dnn::batchnorm_backward batchnorm(C, 1, H, W, B, "fp32", eps); + batchnorm.enqueue(&stream, {&dx, &dg, &db, &dy, &x, &g, &m, &v}); +} diff --git a/examples/python/pytorch/conv.cpp b/examples/python/pytorch/conv.cpp index a21549c31..41c2e75e7 100644 --- a/examples/python/pytorch/conv.cpp +++ b/examples/python/pytorch/conv.cpp @@ -12,16 +12,6 @@ #define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) -typedef std::tuple conv_key_t; - -static std::map> m_stream; -static std::map> m_jit; -static std::map> m_config; - torch::Tensor conv_common( int32_t B, int32_t C, int32_t D, int32_t H, int32_t W, int32_t T, int32_t R, int32_t S, int32_t NF, @@ -31,95 +21,34 @@ torch::Tensor conv_common( torch::Tensor torcha, torch::Tensor torchb, torch::Tensor torchbias, bool autotune = false ) { - // Wrap CUDA handles c10::DeviceIndex device = torcha.storage().device().index(); - // Get stream CUstream custream = (CUstream)at::cuda::getCurrentCUDAStream(device).stream(); - triton::driver::stream* stream; - if(m_stream.find(custream) == m_stream.end()) - stream = m_stream.emplace(custream, new triton::driver::cu_stream(custream, false)).first->second.get(); - else - stream = m_stream.at(custream).get(); - - // Get context - triton::driver::context* ctx = stream->context(); - - // Get configuration + triton::driver::cu_stream stream(custream, false); + triton::driver::context* ctx = stream.context(); + // Get template bool has_bias = torchbias.storage().size() > 0; - conv_key_t key = {B, C, D, H, W, T, R, S, NF, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, ty, has_bias}; - triton::dnn::conv* configuration; - if(m_config.find(key) == m_config.end()) - configuration = m_config.emplace(key, new triton::dnn::conv( - B, C, D, H, W, T, R, S, NF, - stride_d, stride_h, stride_w, - pad_d, pad_h, pad_w, - 1, 1, 1, - "fp32", "fp32", ty, has_bias)).first->second.get(); - else - configuration = m_config.at(key).get(); 
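Note that batchnorm_dxdgdb above is declared to return std::vector<torch::Tensor> yet falls off its closing brace without a return statement, which is undefined behavior in C++. Mirroring batchnorm_ymv, the body presumably wants to end with:

// Missing final statement of batchnorm_dxdgdb (sketch of the likely intent):
return {fw_dx, fw_dg, fw_db};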
- + triton::dnn::conv conv(B, C, D, H, W, T, R, S, NF, + stride_d, stride_h, stride_w, + pad_d, pad_h, pad_w, + 1, 1, 1, + "fp32", "fp32", ty, has_bias); // Bind memory triton::driver::cu_buffer a(ctx, (CUdeviceptr)torcha.storage().data(), false); triton::driver::cu_buffer b(ctx, (CUdeviceptr)torchb.storage().data(), false); triton::driver::cu_buffer cubias(ctx, (CUdeviceptr)torchbias.storage().data(), false); triton::driver::buffer* bias = has_bias ? &cubias : nullptr; - // Allocate output - std::vector c_shapes = configuration->c_shapes(); + std::vector c_shapes = conv.c_shapes(); torch::Tensor torchc; if(ty == triton::dnn::conv::WGRAD) torchc = torch::empty({c_shapes[0], c_shapes[2], c_shapes[3], c_shapes[4]}, torch::kFloat).cuda(); else torchc = torch::empty({c_shapes[0], c_shapes[1], c_shapes[3], c_shapes[4]}, torch::kFloat).cuda(); triton::driver::cu_buffer c(ctx, (CUdeviceptr)torchc.storage().data(), false); - - // Get JIT - triton::jit* jit; - if(m_jit.find(key) == m_jit.end()){ - jit = m_jit.emplace(key, new triton::jit(ctx)).first->second.get(); - std::ostringstream oss; - configuration->src(oss); - std::string src = oss.str(); - // benchmark a given convolution kernel - auto benchmark = [&](triton::driver::kernel* kernel, - triton::jit::launch_information info) { - configuration->init(stream, (triton::driver::cu_module*)kernel->module()); - unsigned TM = info.global_range_size[0]; - unsigned TN = info.global_range_size[1]; - unsigned nthreads = info.num_threads; - unsigned GZ = jit->get_int("GZ"); - configuration->enqueue(stream, kernel, &a, &b, &c, bias, TM, TN, GZ, nthreads); - stream->synchronize(); - double ts = triton::tools::bench([&](){ configuration->enqueue(stream, kernel, &a, &b, &c, bias, TM, TN, GZ, nthreads); }, - [&](){ stream->synchronize(); }, stream->context()->device()); - return configuration->get_nflops() / ts * 1e-3; - }; - // auto-tune and save result - if(autotune) { - triton::jit::tune_res_t best = jit->autotune("conv", src.c_str(), benchmark); - jit->add_module("conv", src.c_str(), best.params); - } - else { - jit->add_module("conv", src.c_str(), configuration->default_params()); - } - triton::driver::kernel* kernel = jit->get_function("conv"); - configuration->init(stream, (triton::driver::cu_module*)kernel->module()); - } - else - jit = m_jit.at(key).get(); - - // Run - triton::driver::kernel* kernel = jit->get_function("conv"); - triton::jit::launch_information info = jit->get_launch_info("conv"); - unsigned GZ = jit->get_int("GZ"); - // launch info - unsigned TM = info.global_range_size[0]; - unsigned TN = info.global_range_size[1]; - unsigned nthreads = info.num_threads; - // enqueue - configuration->enqueue(stream, kernel, &a, &b, &c, bias, TM, TN, GZ, nthreads); + // Enqueue + conv.enqueue(&stream, {&a, &b, &c, bias}); return torchc; } diff --git a/examples/python/pytorch/shift.cpp b/examples/python/pytorch/shift.cpp index df28c6ca7..59e78e72e 100644 --- a/examples/python/pytorch/shift.cpp +++ b/examples/python/pytorch/shift.cpp @@ -12,103 +12,111 @@ #define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) -typedef std::tuple shift_key_t; - -static std::map> m_shift_stream; -static std::map> m_shift_jit; -static std::map> m_shift_config; - torch::Tensor shift_common( int32_t B, int32_t C, int32_t D, int32_t H, int32_t W, int32_t T, int32_t R, int32_t S, int32_t F, - std::vector shift_h, std::vector shift_w, + int32_t stride_h, int32_t stride_w, + int32_t* shift_h, 
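With compilation caching now internal to base::enqueue, the simplified conv_common above costs one descriptor construction per call. A hypothetical call site (all sizes illustrative only, not taken from the patch):

// Hypothetical usage: 3x3 forward convolution with stride 1 and padding 1
// on B = 32, C = 64, H = W = 56, NF = 128 filters.
torch::Tensor y = conv_common(/*B=*/32, /*C=*/64, /*D=*/1, /*H=*/56, /*W=*/56,
                              /*T=*/1, /*R=*/3, /*S=*/3, /*NF=*/128,
                              /*stride_d=*/1, /*stride_h=*/1, /*stride_w=*/1,
                              /*pad_d=*/0, /*pad_h=*/1, /*pad_w=*/1,
                              triton::dnn::conv::FPROP, a, b, bias);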
int32_t* shift_w, triton::dnn::shift::type ty, torch::Tensor torcha, torch::Tensor torchb, torch::Tensor torchbias, bool autotune = false ) { - // Wrap CUDA handles c10::DeviceIndex device = torcha.storage().device().index(); - - // Get stream CUstream custream = (CUstream)at::cuda::getCurrentCUDAStream(device).stream(); - triton::driver::stream* stream; - if(m_shift_stream.find(custream) == m_shift_stream.end()) - stream = m_shift_stream.emplace(custream, new triton::driver::cu_stream(custream, false)).first->second.get(); - else - stream = m_shift_stream.at(custream).get(); - - // Get context - triton::driver::context* ctx = stream->context(); - + triton::driver::cu_stream stream(custream, false); + triton::driver::context* ctx = stream.context(); // Get configuration bool has_bias = torchbias.storage().size() > 0; - shift_key_t key = {B, C, D, H, W, T, R, S, F, shift_h.data(), shift_w.data(), ty, has_bias}; - triton::dnn::shift* configuration; - if(m_shift_config.find(key) == m_shift_config.end()) - configuration = m_shift_config.emplace(key, new triton::dnn::shift( - B, C, D, H, W, T, R, S, F, - shift_h, shift_w, "fp32", "fp32", - ty, has_bias)).first->second.get(); - else - configuration = m_shift_config.at(key).get(); - + triton::dnn::shift shift(B, C, D, H, W, T, R, S, F, + stride_h, stride_w, + shift_h, shift_w, "fp32", "fp32", + ty, has_bias); // Bind memory triton::driver::cu_buffer a(ctx, (CUdeviceptr)torcha.storage().data(), false); triton::driver::cu_buffer b(ctx, (CUdeviceptr)torchb.storage().data(), false); triton::driver::cu_buffer cubias(ctx, (CUdeviceptr)torchbias.storage().data(), false); triton::driver::buffer* bias = has_bias ? &cubias : nullptr; - // Allocate output - std::vector c_shapes = configuration->c_shapes(); + std::vector c_shapes = shift.c_shapes(); torch::Tensor torchc = torch::empty({c_shapes[0], c_shapes[1], c_shapes[2], c_shapes[3]}).cuda(); triton::driver::cu_buffer c(ctx, (CUdeviceptr)torchc.storage().data(), false); - - // Get JIT - triton::jit* jit; - if(m_shift_jit.find(key) == m_shift_jit.end()){ - jit = m_shift_jit.emplace(key, new triton::jit(ctx)).first->second.get(); - std::ostringstream oss; - configuration->triton_c_src(oss); - std::string src = oss.str(); - // benchmark a given shiftolution kernel - auto benchmark = [&](triton::driver::kernel* kernel, - triton::jit::launch_information info) { - configuration->init_impl(stream, (triton::driver::cu_module*)kernel->module()); - unsigned TM = info.global_range_size[0]; - unsigned TN = info.global_range_size[1]; - unsigned nthreads = info.num_threads; - configuration->enqueue_impl(stream, kernel, &a, &b, &c, TM, TN, nthreads); - stream->synchronize(); - double ts = triton::tools::bench([&](){ configuration->enqueue_impl(stream, kernel, &a, &b, &c, TM, TN, nthreads); }, - [&](){ stream->synchronize(); }, stream->context()->device()); - return configuration->num_flops() / ts * 1e-3; - }; - // auto-tune and save result - if(autotune) { - triton::jit::tune_res_t best = jit->autotune("shift", src.c_str(), benchmark); - jit->add_module("shift", src.c_str(), best.params); - } - else { - jit->add_module("shift", src.c_str(), jit->get_valid("shift", src.c_str())); - } - triton::driver::kernel* kernel = jit->get_function("shift"); - configuration->init_impl(stream, (triton::driver::cu_module*)kernel->module()); - } - else - jit = m_shift_jit.at(key).get(); - - // Run - triton::driver::kernel* kernel = jit->get_function("shift"); - triton::jit::launch_information info = jit->get_launch_info("shift"); - // 
launch info - unsigned TM = info.global_range_size[0]; - unsigned TN = info.global_range_size[1]; - unsigned nthreads = info.num_threads; - // enqueue - configuration->enqueue_impl(stream, kernel, &a, &b, &c, TM, TN, nthreads); + // Enqueue + shift.enqueue(&stream, {&a, &b, &c}); return torchc; } + +torch::Tensor shift_y( + const torch::Tensor x, + const torch::Tensor w, + const torch::Tensor bias, + int32_t R, int32_t S, + int32_t stride_h, int32_t stride_w, + int32_t* shift_h, int32_t* shift_w) { + // shapes for a + int64_t Ca = x.size(0); + int64_t H = x.size(1); + int64_t W = x.size(2); + int64_t B = x.size(3); + // shapes for b + int64_t Cb = w.size(0); + int64_t F = w.size(1); + AT_CHECK(Ca == Cb, "operands must have the same number of channels"); + int64_t C = Ca; + // run + shift_common(B, C, 1, H, W, 1, R, S, F, stride_h, stride_w, shift_h, shift_w, triton::dnn::shift::FPROP, x, w, bias); +} + +torch::Tensor shift_dx( + const torch::Tensor dy, + const torch::Tensor w, + const torch::Tensor bias, + int32_t R, int32_t S, + int32_t stride_h, int32_t stride_w, + int32_t* shift_h, int32_t* shift_w) { + // shapes for a + int64_t Ca = dy.size(0); + int64_t H = dy.size(1); + int64_t W = dy.size(2); + int64_t B = dy.size(3); + H *= stride_h; + W *= stride_w; + // shapes for b + int64_t Cb = w.size(0); + int64_t F = w.size(1); + std::swap(Cb, F); + // checks + AT_CHECK(Ca == Cb, "operands must have the same number of channels"); + int64_t C = Ca; + std::swap(C, F); + // run + shift_common(B, C, 1, H, W, 1, R, S, F, stride_h, stride_w, shift_h, shift_w, triton::dnn::shift::BPROP, dy, w, bias); +} + +torch::Tensor shift_dw( + const torch::Tensor dy, + const torch::Tensor x, + const torch::Tensor bias, + int32_t R, int32_t S, + int32_t stride_h, int32_t stride_w, + int32_t* shift_h, int32_t* shift_w) { + // shapes for a + int64_t F = dy.size(0); + int64_t Ha = dy.size(1); + int64_t Wa = dy.size(2); + int64_t Ba = dy.size(3); + // shapes for b + int64_t C = x.size(0); + int64_t Hb = x.size(1); + int64_t Wb = x.size(2); + int64_t Bb = x.size(3); + // check + AT_CHECK(Ha*stride_h == Hb, "operands must have the same image height"); + AT_CHECK(Wa*stride_w == Wb, "operands must have the same image width"); + AT_CHECK(Ba == Bb, "operands must have the same batch size"); + int64_t H = Hb; + int64_t W = Wb; + int64_t B = Bb; + // run + shift_common(B, C, 1, H, W, 1, R, S, F, stride_h, stride_w, shift_h, shift_w, triton::dnn::shift::WGRAD, dy, x, bias); +} diff --git a/examples/python/tensorflow/batchnorm.cpp b/examples/python/tensorflow/batchnorm.cpp index 3a34079e1..3e50aeb8b 100644 --- a/examples/python/tensorflow/batchnorm.cpp +++ b/examples/python/tensorflow/batchnorm.cpp @@ -35,31 +35,31 @@ public: triton::driver::context* ctx = sstream.context(); triton::driver::stream* stream = &sstream; // get inputs - const Tensor& x = context->input(0); - const Tensor& g = context->input(1); - const Tensor& b = context->input(2); + const Tensor& fw_x = context->input(0); + const Tensor& fw_g = context->input(1); + const Tensor& fw_b = context->input(2); // get sizes - int C = x.dim_size(0); - int H = x.dim_size(1); - int W = x.dim_size(2); - int B = x.dim_size(3); + int C = fw_x.dim_size(0); + int H = fw_x.dim_size(1); + int W = fw_x.dim_size(2); + int B = fw_x.dim_size(3); // allocate outputs - Tensor* y = nullptr; - Tensor* m = nullptr; - Tensor* v = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, x.shape(), &y)); - OP_REQUIRES_OK(context, context->allocate_output(1, g.shape(), &m)); - 
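The same return-value omission as in batchnorm_dxdgdb affects the shift_y, shift_dx and shift_dw wrappers above: each is declared torch::Tensor but ends with a bare call to shift_common, whose result (torchc) is dropped. Each final call presumably wants a return, e.g. for shift_dw:

// Sketch of the likely intent for the last statement of shift_dw:
return shift_common(B, C, 1, H, W, 1, R, S, F, stride_h, stride_w,
                    shift_h, shift_w, triton::dnn::shift::WGRAD, dy, x, bias);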
OP_REQUIRES_OK(context, context->allocate_output(2, g.shape(), &v)); + Tensor* fw_y = nullptr; + Tensor* fw_m = nullptr; + Tensor* fw_v = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, fw_x.shape(), &fw_y)); + OP_REQUIRES_OK(context, context->allocate_output(1, fw_g.shape(), &fw_m)); + OP_REQUIRES_OK(context, context->allocate_output(2, fw_g.shape(), &fw_v)); // triton handles - triton::driver::cu_buffer tx(ctx, (CUdeviceptr)x.flat().data(), false); - triton::driver::cu_buffer tg(ctx, (CUdeviceptr)g.flat().data(), false); - triton::driver::cu_buffer tb(ctx, (CUdeviceptr)b.flat().data(), false); - triton::driver::cu_buffer ty(ctx, (CUdeviceptr)y->flat().data(), false); - triton::driver::cu_buffer tm(ctx, (CUdeviceptr)m->flat().data(), false); - triton::driver::cu_buffer tv(ctx, (CUdeviceptr)v->flat().data(), false); + triton::driver::cu_buffer x(ctx, (CUdeviceptr)fw_x.flat().data(), false); + triton::driver::cu_buffer g(ctx, (CUdeviceptr)fw_g.flat().data(), false); + triton::driver::cu_buffer b(ctx, (CUdeviceptr)fw_b.flat().data(), false); + triton::driver::cu_buffer y(ctx, (CUdeviceptr)fw_y->flat().data(), false); + triton::driver::cu_buffer m(ctx, (CUdeviceptr)fw_m->flat().data(), false); + triton::driver::cu_buffer v(ctx, (CUdeviceptr)fw_v->flat().data(), false); // create config triton::dnn::batchnorm_forward batchnorm(C, 1, H, W, B, "fp32"); - batchnorm.enqueue(stream, {&ty, &tm, &tv, &tx, &tg, &tb}); + batchnorm.enqueue(stream, {&y, &m, &v, &x, &g, &b}); } private: @@ -99,35 +99,35 @@ public: triton::driver::context* ctx = sstream.context(); triton::driver::stream* stream = &sstream; // get inputs - const Tensor& dy = context->input(0); - const Tensor& x = context->input(1); - const Tensor& g = context->input(2); - const Tensor& m = context->input(3); - const Tensor& v = context->input(4); + const Tensor& fw_dy = context->input(0); + const Tensor& fw_x = context->input(1); + const Tensor& fw_g = context->input(2); + const Tensor& fw_m = context->input(3); + const Tensor& fw_v = context->input(4); // get sizes - int C = x.dim_size(0); - int H = x.dim_size(1); - int W = x.dim_size(2); - int B = x.dim_size(3); + int C = fw_x.dim_size(0); + int H = fw_x.dim_size(1); + int W = fw_x.dim_size(2); + int B = fw_x.dim_size(3); // allocate outputs - Tensor* dx = nullptr; - Tensor* dg = nullptr; - Tensor* db = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, x.shape(), &dx)); - OP_REQUIRES_OK(context, context->allocate_output(1, g.shape(), &dg)); - OP_REQUIRES_OK(context, context->allocate_output(2, g.shape(), &db)); + Tensor* fw_dx = nullptr; + Tensor* fw_dg = nullptr; + Tensor* fw_db = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, fw_x.shape(), &fw_dx)); + OP_REQUIRES_OK(context, context->allocate_output(1, fw_g.shape(), &fw_dg)); + OP_REQUIRES_OK(context, context->allocate_output(2, fw_g.shape(), &fw_db)); // triton handles - triton::driver::cu_buffer tdy(ctx, (CUdeviceptr)dy.flat().data(), false); - triton::driver::cu_buffer tx(ctx, (CUdeviceptr)x.flat().data(), false); - triton::driver::cu_buffer tg(ctx, (CUdeviceptr)g.flat().data(), false); - triton::driver::cu_buffer tm(ctx, (CUdeviceptr)m.flat().data(), false); - triton::driver::cu_buffer tv(ctx, (CUdeviceptr)v.flat().data(), false); - triton::driver::cu_buffer tdx(ctx, (CUdeviceptr)dx->flat().data(), false); - triton::driver::cu_buffer tdg(ctx, (CUdeviceptr)dg->flat().data(), false); - triton::driver::cu_buffer tdb(ctx, (CUdeviceptr)db->flat().data(), false); + triton::driver::cu_buffer 
dy(ctx, (CUdeviceptr)fw_dy.flat().data(), false); + triton::driver::cu_buffer x(ctx, (CUdeviceptr)fw_x.flat().data(), false); + triton::driver::cu_buffer g(ctx, (CUdeviceptr)fw_g.flat().data(), false); + triton::driver::cu_buffer m(ctx, (CUdeviceptr)fw_m.flat().data(), false); + triton::driver::cu_buffer v(ctx, (CUdeviceptr)fw_v.flat().data(), false); + triton::driver::cu_buffer dx(ctx, (CUdeviceptr)fw_dx->flat().data(), false); + triton::driver::cu_buffer dg(ctx, (CUdeviceptr)fw_dg->flat().data(), false); + triton::driver::cu_buffer db(ctx, (CUdeviceptr)fw_db->flat().data(), false); // create config triton::dnn::batchnorm_backward batchnorm(C, 1, H, W, B, "fp32"); - batchnorm.enqueue(stream, {&tdx, &tdg, &tdb, &tdy, &tx, &tg, &tm, &tv}); + batchnorm.enqueue(stream, {&dx, &dg, &db, &dy, &x, &g, &m, &v}); } private: diff --git a/examples/python/tensorflow/shift.cpp b/examples/python/tensorflow/shift.cpp index b5e0dffce..bde4d1b5e 100644 --- a/examples/python/tensorflow/shift.cpp +++ b/examples/python/tensorflow/shift.cpp @@ -73,7 +73,6 @@ public: if(OP == triton::dnn::shift::BPROP) std::swap(C, F); } - } void Compute(OpKernelContext* context){ From 3b89bc8463c023e1fe39bc179785bfd947a86bc7 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 9 Jul 2019 21:54:37 -0700 Subject: [PATCH 225/494] [examples/python/pytorch] added skeleton of wrapper for shift-conv and batch-norm --- examples/python/pytorch/CMakeLists.txt | 2 +- examples/python/pytorch/batchnorm.cpp | 1 - examples/python/pytorch/bench.py | 142 ------------------------- examples/python/pytorch/conv.cpp | 3 - examples/python/pytorch/shift.cpp | 8 +- examples/python/pytorch/triton.py | 55 ++++++++++ include/triton/dnn/shift.h | 1 - include/triton/tools/bench.hpp | 2 + 8 files changed, 63 insertions(+), 151 deletions(-) delete mode 100644 examples/python/pytorch/bench.py diff --git a/examples/python/pytorch/CMakeLists.txt b/examples/python/pytorch/CMakeLists.txt index 5cc6bbb4e..f4b4df758 100644 --- a/examples/python/pytorch/CMakeLists.txt +++ b/examples/python/pytorch/CMakeLists.txt @@ -4,7 +4,7 @@ if(${TORCH_FOUND}) include_directories(${TORCH_INCLUDE_DIRS}) include_directories("${CUDA_HOME}/include") link_directories(${TORCH_LIBRARY_DIRS}) - add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) + add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1) add_library(torch_triton SHARED conv.cpp shift.cpp batchnorm.cpp) target_link_libraries(torch_triton torch triton) endif() diff --git a/examples/python/pytorch/batchnorm.cpp b/examples/python/pytorch/batchnorm.cpp index 2d6818e98..fff9039d7 100644 --- a/examples/python/pytorch/batchnorm.cpp +++ b/examples/python/pytorch/batchnorm.cpp @@ -3,7 +3,6 @@ #include "ATen/cuda/CUDAContext.h" #include "triton/driver/stream.h" #include "triton/dnn/batchnorm.h" -#include "triton/tools/bench.hpp" std::vector batchnorm_ymv(const torch::Tensor fw_x, diff --git a/examples/python/pytorch/bench.py b/examples/python/pytorch/bench.py deleted file mode 100644 index 98a782099..000000000 --- a/examples/python/pytorch/bench.py +++ /dev/null @@ -1,142 +0,0 @@ -'''Train CIFAR10 with PyTorch.''' -from __future__ import print_function - -import torch -import torch.nn as nn -import torch.optim as optim -import torch.nn.functional as F -import torch.backends.cudnn as cudnn - -import torchvision -import torchvision.transforms as transforms - -import os -import argparse - -from resnet import * -from utils import progress_bar - - -parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Training') -parser.add_argument('--lr', 
default=0.1, type=float, help='learning rate') -parser.add_argument('--resume', '-r', action='store_true', help='resume from checkpoint') -args = parser.parse_args() - -device = 'cuda' if torch.cuda.is_available() else 'cpu' -best_acc = 0 # best test accuracy -start_epoch = 0 # start from epoch 0 or last checkpoint epoch - -# Data -print('==> Preparing data..') -transform_train = transforms.Compose([ - transforms.RandomCrop(32, padding=4), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), -]) - -transform_test = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), -]) - -trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train) -trainloader = torch.utils.data.DataLoader(trainset, batch_size=128, shuffle=True, num_workers=2) - -testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test) -testloader = torch.utils.data.DataLoader(testset, batch_size=100, shuffle=False, num_workers=2) - -classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck') - -# Model -print('==> Building model..') -# net = VGG('VGG19') -net = ResNet18() -# net = PreActResNet18() -# net = GoogLeNet() -# net = DenseNet121() -# net = ResNeXt29_2x64d() -# net = MobileNet() -# net = MobileNetV2() -# net = DPN92() -# net = ShuffleNetG2() -# net = SENet18() -#net = ShuffleNetV2(1) -net = net.to(device) -if device == 'cuda': - net = torch.nn.DataParallel(net) - cudnn.benchmark = True - -if args.resume: - # Load checkpoint. - print('==> Resuming from checkpoint..') - assert os.path.isdir('checkpoint'), 'Error: no checkpoint directory found!' - checkpoint = torch.load('./checkpoint/ckpt.t7') - net.load_state_dict(checkpoint['net']) - best_acc = checkpoint['acc'] - start_epoch = checkpoint['epoch'] - -criterion = nn.CrossEntropyLoss() -optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4) - -# Training -def train(epoch): - print('\nEpoch: %d' % epoch) - net.train() - train_loss = 0 - correct = 0 - total = 0 - for batch_idx, (inputs, targets) in enumerate(trainloader): - inputs, targets = inputs.to(device), targets.to(device) - optimizer.zero_grad() - outputs = net(inputs) - loss = criterion(outputs, targets) - loss.backward() - optimizer.step() - - train_loss += loss.item() - _, predicted = outputs.max(1) - total += targets.size(0) - correct += predicted.eq(targets).sum().item() - - progress_bar(batch_idx, len(trainloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' - % (train_loss/(batch_idx+1), 100.*correct/total, correct, total)) - -def test(epoch): - global best_acc - net.eval() - test_loss = 0 - correct = 0 - total = 0 - with torch.no_grad(): - for batch_idx, (inputs, targets) in enumerate(testloader): - inputs, targets = inputs.to(device), targets.to(device) - outputs = net(inputs) - loss = criterion(outputs, targets) - - test_loss += loss.item() - _, predicted = outputs.max(1) - total += targets.size(0) - correct += predicted.eq(targets).sum().item() - - progress_bar(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)' - % (test_loss/(batch_idx+1), 100.*correct/total, correct, total)) - - # Save checkpoint. 
- acc = 100.*correct/total - if acc > best_acc: - print('Saving..') - state = { - 'net': net.state_dict(), - 'acc': acc, - 'epoch': epoch, - } - if not os.path.isdir('checkpoint'): - os.mkdir('checkpoint') - torch.save(state, './checkpoint/ckpt.t7') - best_acc = acc - - -for epoch in range(start_epoch, start_epoch+200): - train(epoch) -test(epoch) diff --git a/examples/python/pytorch/conv.cpp b/examples/python/pytorch/conv.cpp index 41c2e75e7..eab6ba9e7 100644 --- a/examples/python/pytorch/conv.cpp +++ b/examples/python/pytorch/conv.cpp @@ -1,12 +1,9 @@ #include -#include #include #include #include "ATen/cuda/CUDAContext.h" -#include "triton/runtime/jit.h" #include "triton/driver/stream.h" #include "triton/dnn/conv.h" -#include "triton/tools/bench.hpp" #define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x " must be contiguous") diff --git a/examples/python/pytorch/shift.cpp b/examples/python/pytorch/shift.cpp index 59e78e72e..e9271d37c 100644 --- a/examples/python/pytorch/shift.cpp +++ b/examples/python/pytorch/shift.cpp @@ -1,12 +1,9 @@ #include -#include #include #include #include "ATen/cuda/CUDAContext.h" -#include "triton/runtime/jit.h" #include "triton/driver/stream.h" #include "triton/dnn/shift.h" -#include "triton/tools/bench.hpp" #define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x " must be contiguous") @@ -120,3 +117,8 @@ torch::Tensor shift_dw( // run shift_common(B, C, 1, H, W, 1, R, S, F, stride_h, stride_w, shift_h, shift_w, triton::dnn::shift::WGRAD, dy, x, bias); } + +static auto registry = + torch::jit::RegisterOperators("triton::shift_conv_y", &shift_y) + .op("triton::shift_conv_dx", &shift_dx) + .op("triton::shift_conv_dw", &shift_dw); diff --git a/examples/python/pytorch/triton.py b/examples/python/pytorch/triton.py index 18f08ba44..e8a0a7ff2 100644 --- a/examples/python/pytorch/triton.py +++ b/examples/python/pytorch/triton.py @@ -4,6 +4,10 @@ import math torch.ops.load_library("/home/philippe/development/triton/build/examples/python/pytorch/libtorch_triton.so") +################################# +####### Convolutions ########## +################################# + class ConvFunction(torch.autograd.Function): @staticmethod @@ -81,3 +85,54 @@ class Conv2d(_ConvNd): super(Conv2d, self).__init__( in_channels, out_channels, kernel_size, stride, padding, dilation, False, _pair(0), groups, bias) + +################################# +#### Shift-Convolutions ####### +################################# + +class ShiftConvFunction(torch.autograd.Function): + + @staticmethod + def forward(ctx, input, weight, bias, stride, width): + if bias is None: + bias = torch.empty(0) + ctx.save_for_backward(input, weight, bias) + ctx.stride = stride + ctx.width = width + output = torch.ops.triton.shift_conv_y(input, weight, bias, width[0], width[1], stride[0], stride[1]) + return output + + @staticmethod + def backward(ctx, dy): + input, weight, bias = ctx.saved_tensors + stride = ctx.stride + width = ctx.width + dx = dw = dbias = None + if ctx.needs_input_grad[0]: + dx = torch.ops.triton.shift_conv_dx(dy, weight, bias, width[0], width[1], stride[0], stride[1]) + if ctx.needs_input_grad[1]: + dw = torch.ops.triton.shift_conv_dw(dy, input, bias, width[0], width[1], stride[0], stride[1]) + if ctx.needs_input_grad[2]: + dbias = torch.sum(dy, (1, 2, 3)) + return dx, dw, dbias, None, None + + +################################# 
+######### BatchNorm ########### +################################# + +class BatchNormFunction(torch.autograd.Function): + + @staticmethod + def forward(ctx, x, gamma, beta, eps): + ctx.eps = eps + y, mean, var = torch.ops.triton.batchnorm_ymv(x, gamma, beta, eps) + ctx.save_for_backward(x, gamma, beta, mean, var) + return y + + @staticmethod + def backward(ctx, dy): + eps = ctx.eps + x, gamma, beta, mean, var = ctx.saved_tensors + dx, dg, db = torch.ops.triton.batchnorm_dxdgdb(dy, x, gamma, mean, var, eps) + return dx, dg, db, None, None diff --git a/include/triton/dnn/shift.h b/include/triton/dnn/shift.h index be30da1eb..fdf2a0eaf 100644 --- a/include/triton/dnn/shift.h +++ b/include/triton/dnn/shift.h @@ -31,7 +31,6 @@ #include "triton/dnn/base.h" #include "triton/driver/stream.h" #include "triton/driver/kernel.h" -#include "triton/runtime/jit.h" namespace triton{ namespace dnn{ diff --git a/include/triton/tools/bench.hpp b/include/triton/tools/bench.hpp index 15c12a3fd..64c88cd64 100644 --- a/include/triton/tools/bench.hpp +++ b/include/triton/tools/bench.hpp @@ -1,6 +1,8 @@ #ifndef TRITON_TOOLS_BENCH_HPP #define TRITON_TOOLS_BENCH_HPP +#include + namespace triton{ namespace tools{ From f665c742f918d02cb04be4e1f98f8bf8b1ab486b Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 10 Jul 2019 13:33:08 -0700 Subject: [PATCH 226/494] testing a simple shiftnet --- examples/python/pytorch/batchnorm.cpp | 25 +++++- examples/python/pytorch/run.py | 120 ++++++++++++++++++++++++++ examples/python/pytorch/shift.cpp | 37 +++++--- examples/python/pytorch/triton.py | 98 +++++++++++++++++++-- examples/python/tensorflow/run.py | 5 +- lib/dnn/batchnorm.cpp | 7 +- 6 files changed, 261 insertions(+), 31 deletions(-) create mode 100644 examples/python/pytorch/run.py diff --git a/examples/python/pytorch/batchnorm.cpp b/examples/python/pytorch/batchnorm.cpp index fff9039d7..521137a9e 100644 --- a/examples/python/pytorch/batchnorm.cpp +++ b/examples/python/pytorch/batchnorm.cpp @@ -4,11 +4,18 @@ #include "triton/driver/stream.h" #include "triton/dnn/batchnorm.h" +#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + std::vector batchnorm_ymv(const torch::Tensor fw_x, const torch::Tensor fw_g, const torch::Tensor fw_b, - float eps) { + double eps) { + CHECK_INPUT(fw_x); + CHECK_INPUT(fw_g); + CHECK_INPUT(fw_b); // Wrap CUDA handles c10::DeviceIndex device = fw_x.storage().device().index(); CUstream custream = (CUstream)at::cuda::getCurrentCUDAStream(device).stream(); @@ -30,8 +37,9 @@ std::vector triton::driver::cu_buffer m(ctx, (CUdeviceptr)fw_m.storage().data(), false); triton::driver::cu_buffer v(ctx, (CUdeviceptr)fw_v.storage().data(), false); // create template - triton::dnn::batchnorm_forward batchnorm(C, 1, H, W, B, "fp32", eps); + triton::dnn::batchnorm_forward batchnorm(C, 1, H, W, B, "fp32"); batchnorm.enqueue(&stream, {&y, &m, &v, &x, &g, &b}); + stream.synchronize(); return {fw_y, fw_m, fw_v}; } @@ -41,7 +49,12 @@ std::vector const torch::Tensor fw_g, const torch::Tensor fw_m, const torch::Tensor fw_v, - float eps) { + double eps) { + CHECK_INPUT(fw_dy); + CHECK_INPUT(fw_x); + CHECK_INPUT(fw_g); + CHECK_INPUT(fw_m); + CHECK_INPUT(fw_v); // Wrap CUDA handles c10::DeviceIndex device = fw_x.storage().device().index(); CUstream custream = (CUstream)at::cuda::getCurrentCUDAStream(device).stream(); @@ -68,4 +81,10 @@ std::vector // 
create config triton::dnn::batchnorm_backward batchnorm(C, 1, H, W, B, "fp32", eps); batchnorm.enqueue(&stream, {&dx, &dg, &db, &dy, &x, &g, &m, &v}); + stream.synchronize(); + return {fw_dx, fw_dg, fw_db}; } + +static auto registry = + torch::jit::RegisterOperators("triton::batchnorm_ymv", &batchnorm_ymv) + .op("triton::batchnorm_dxdgdb", &batchnorm_dxdgdb); diff --git a/examples/python/pytorch/run.py b/examples/python/pytorch/run.py new file mode 100644 index 000000000..86f489ccc --- /dev/null +++ b/examples/python/pytorch/run.py @@ -0,0 +1,120 @@ +from __future__ import print_function +import argparse +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torchvision import datasets, transforms +import triton + +class Net(nn.Module): + def __init__(self): + super(Net, self).__init__() + self.conv1 = triton.ShiftConv2d(1, 32, 3, 2) + self.bn1 = triton.BatchNorm2d(32) + self.conv2 = triton.ShiftConv2d(32, 64, 3, 2) + self.bn2 = triton.BatchNorm2d(64) + self.fc1 = nn.Linear(64*7*7, 500) + self.fc2 = nn.Linear(500, 10) + + def forward(self, x): + x = x.permute(1, 2, 3, 0).contiguous() + x = self.conv1(x) + x = self.bn1(x) + x = F.relu(x) + x = self.conv2(x) + x = F.relu(x) + x = x.permute(3, 0, 1, 2).contiguous() + x = x.view(-1, 64*7*7) + x = F.relu(self.fc1(x)) + x = self.fc2(x) + return F.log_softmax(x, dim=1) + +def train(args, model, device, train_loader, optimizer, epoch): + model.train() + for batch_idx, (data, target) in enumerate(train_loader): + data, target = data.to(device), target.to(device) + optimizer.zero_grad() + output = model(data) + loss = F.nll_loss(output, target) + loss.backward() + optimizer.step() + if batch_idx % args.log_interval == 0: + print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + epoch, batch_idx * len(data), len(train_loader.dataset), + 100. * batch_idx / len(train_loader), loss.item())) + +def test(args, model, device, test_loader): + model.eval() + test_loss = 0 + correct = 0 + with torch.no_grad(): + for data, target in test_loader: + data, target = data.to(device), target.to(device) + output = model(data) + test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss + pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability + correct += pred.eq(target.view_as(pred)).sum().item() + + test_loss /= len(test_loader.dataset) + + print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( + test_loss, correct, len(test_loader.dataset), + 100. 
* correct / len(test_loader.dataset))) + +def main(): + # Training settings + parser = argparse.ArgumentParser(description='PyTorch MNIST Example') + parser.add_argument('--batch-size', type=int, default=64, metavar='N', + help='input batch size for training (default: 64)') + parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', + help='input batch size for testing (default: 1000)') + parser.add_argument('--epochs', type=int, default=10, metavar='N', + help='number of epochs to train (default: 10)') + parser.add_argument('--lr', type=float, default=0.01, metavar='LR', + help='learning rate (default: 0.01)') + parser.add_argument('--momentum', type=float, default=0.5, metavar='M', + help='SGD momentum (default: 0.5)') + parser.add_argument('--no-cuda', action='store_true', default=False, + help='disables CUDA training') + parser.add_argument('--seed', type=int, default=1, metavar='S', + help='random seed (default: 1)') + parser.add_argument('--log-interval', type=int, default=10, metavar='N', + help='how many batches to wait before logging training status') + + parser.add_argument('--save-model', action='store_true', default=False, + help='For Saving the current Model') + args = parser.parse_args() + use_cuda = not args.no_cuda and torch.cuda.is_available() + + torch.manual_seed(args.seed) + + device = torch.device("cuda" if use_cuda else "cpu") + + kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} + train_loader = torch.utils.data.DataLoader( + datasets.MNIST('../data', train=True, download=True, + transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])), + batch_size=args.batch_size, shuffle=True, **kwargs) + test_loader = torch.utils.data.DataLoader( + datasets.MNIST('../data', train=False, transform=transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)) + ])), + batch_size=args.test_batch_size, shuffle=True, **kwargs) + + + model = Net().to(device) + optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) + + for epoch in range(1, args.epochs + 1): + train(args, model, device, train_loader, optimizer, epoch) + test(args, model, device, test_loader) + + if (args.save_model): + torch.save(model.state_dict(),"mnist_cnn.pt") + +main() diff --git a/examples/python/pytorch/shift.cpp b/examples/python/pytorch/shift.cpp index e9271d37c..bde690b27 100644 --- a/examples/python/pytorch/shift.cpp +++ b/examples/python/pytorch/shift.cpp @@ -35,8 +35,11 @@ torch::Tensor shift_common( triton::driver::cu_buffer cubias(ctx, (CUdeviceptr)torchbias.storage().data(), false); triton::driver::buffer* bias = has_bias ? 
&cubias : nullptr; // Allocate output - std::vector c_shapes = shift.c_shapes(); - torch::Tensor torchc = torch::empty({c_shapes[0], c_shapes[1], c_shapes[2], c_shapes[3]}).cuda(); + std::vector _c_shapes = shift.c_shapes(); + std::vector c_shapes; + for(auto x: _c_shapes) + c_shapes.push_back(x); + torch::Tensor torchc = torch::empty(c_shapes).cuda(); triton::driver::cu_buffer c(ctx, (CUdeviceptr)torchc.storage().data(), false); // Enqueue shift.enqueue(&stream, {&a, &b, &c}); @@ -47,9 +50,9 @@ torch::Tensor shift_y( const torch::Tensor x, const torch::Tensor w, const torch::Tensor bias, - int32_t R, int32_t S, - int32_t stride_h, int32_t stride_w, - int32_t* shift_h, int32_t* shift_w) { + int64_t R, int64_t S, + int64_t stride_h, int64_t stride_w, + const torch::Tensor shift_h, const torch::Tensor shift_w) { // shapes for a int64_t Ca = x.size(0); int64_t H = x.size(1); @@ -61,16 +64,18 @@ torch::Tensor shift_y( AT_CHECK(Ca == Cb, "operands must have the same number of channels"); int64_t C = Ca; // run - shift_common(B, C, 1, H, W, 1, R, S, F, stride_h, stride_w, shift_h, shift_w, triton::dnn::shift::FPROP, x, w, bias); + return shift_common(B, C, 1, H, W, 1, R, S, F, stride_h, stride_w, + (int32_t*)shift_h.storage().data(), (int32_t*)shift_w.storage().data(), + triton::dnn::shift::FPROP, x, w, bias); } torch::Tensor shift_dx( const torch::Tensor dy, const torch::Tensor w, const torch::Tensor bias, - int32_t R, int32_t S, - int32_t stride_h, int32_t stride_w, - int32_t* shift_h, int32_t* shift_w) { + int64_t R, int64_t S, + int64_t stride_h, int64_t stride_w, + const torch::Tensor shift_h, const torch::Tensor shift_w) { // shapes for a int64_t Ca = dy.size(0); int64_t H = dy.size(1); @@ -87,16 +92,18 @@ torch::Tensor shift_dx( int64_t C = Ca; std::swap(C, F); // run - shift_common(B, C, 1, H, W, 1, R, S, F, stride_h, stride_w, shift_h, shift_w, triton::dnn::shift::BPROP, dy, w, bias); + return shift_common(B, C, 1, H, W, 1, R, S, F, stride_h, stride_w, + (int32_t*)shift_h.storage().data(), (int32_t*)shift_w.storage().data(), + triton::dnn::shift::BPROP, dy, w, bias); } torch::Tensor shift_dw( const torch::Tensor dy, const torch::Tensor x, const torch::Tensor bias, - int32_t R, int32_t S, - int32_t stride_h, int32_t stride_w, - int32_t* shift_h, int32_t* shift_w) { + int64_t R, int64_t S, + int64_t stride_h, int64_t stride_w, + const torch::Tensor shift_h, const torch::Tensor shift_w) { // shapes for a int64_t F = dy.size(0); int64_t Ha = dy.size(1); @@ -115,7 +122,9 @@ torch::Tensor shift_dw( int64_t W = Wb; int64_t B = Bb; // run - shift_common(B, C, 1, H, W, 1, R, S, F, stride_h, stride_w, shift_h, shift_w, triton::dnn::shift::WGRAD, dy, x, bias); + return shift_common(B, C, 1, H, W, 1, R, S, F, stride_h, stride_w, + (int32_t*)shift_h.storage().data(), (int32_t*)shift_w.storage().data(), + triton::dnn::shift::WGRAD, dy, x, bias); } static auto registry = diff --git a/examples/python/pytorch/triton.py b/examples/python/pytorch/triton.py index e8a0a7ff2..3b7b38b87 100644 --- a/examples/python/pytorch/triton.py +++ b/examples/python/pytorch/triton.py @@ -1,6 +1,7 @@ import torch -from torch.nn.modules.utils import _single, _pair, _triple import math +from torch.nn.modules.utils import _single, _pair, _triple +from torch.distributions import categorical torch.ops.load_library("/home/philippe/development/triton/build/examples/python/pytorch/libtorch_triton.so") @@ -93,13 +94,18 @@ class Conv2d(_ConvNd): class ShiftConvFunction(torch.autograd.Function): @staticmethod - def forward(ctx, 
input, weight, bias, stride, width): + def forward(ctx, input, weight, bias, stride, width, shift_h, shift_w): if bias is None: bias = torch.empty(0) ctx.save_for_backward(input, weight, bias) ctx.stride = stride ctx.width = width - output = torch.ops.triton.shift_conv_y(input, weight, bias, width[0], width[1], stride[0], stride[1]) + ctx.shift_h = shift_h + ctx.shift_w = shift_w + output = torch.ops.triton.shift_conv_y(input, weight, bias, + width[0], width[1], + stride[0], stride[1], + shift_h, shift_w) return output @staticmethod @@ -107,16 +113,70 @@ class ShiftConvFunction(torch.autograd.Function): input, weight, bias = ctx.saved_tensors stride = ctx.stride width = ctx.width + shift_h = ctx.shift_h + shift_w = ctx.shift_w dx = dw = dbias = None if ctx.needs_input_grad[0]: - dx = torch.ops.triton.shift_conv_dx(dy, weight, bias, width[0], width[1], stride[0], stride[1]) + dx = torch.ops.triton.shift_conv_dx(dy, weight, bias, width[0], width[1], stride[0], stride[1], shift_h, shift_w) if ctx.needs_input_grad[1]: - dw = torch.ops.triton.shift_conv_dw(dy, input, bias, width[0], width[1], stride[0], stride[1]) + dw = torch.ops.triton.shift_conv_dw(dy, input, bias, width[0], width[1], stride[0], stride[1], shift_h, shift_w) if ctx.needs_input_grad[2]: dbias = torch.sum(dy, (1, 2, 3)) - return dx, dw, dbias, None, None + return dx, dw, dbias, None, None, None, None +class _ShiftConvNd(torch.nn.Module): + + def __init__(self, in_channels, out_channels, kernel_size, stride, bias): + super(_ShiftConvNd, self).__init__() + # initialize + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.stride = stride + self.weight = torch.nn.Parameter(torch.Tensor(in_channels, out_channels)) + if bias: + self.bias = torch.nn.Parameter(torch.Tensor(out_channels)) + else: + self.register_parameter('bias', None) + self.shift_h = self.make_shift(kernel_size[0]) + self.shift_w = self.make_shift(kernel_size[1]) + self.reset_parameters() + + def forward(self, input): + return ShiftConvFunction.apply(input, self.weight, self.bias, self.stride, + self.kernel_size, self.shift_h, self.shift_w) + + def make_shift(self, kernel_size): + if kernel_size == 3: + p = torch.Tensor([0.3, 0.4, 0.3]) + elif kernel_size == 5: + p = torch.Tensor([0.1, 0.25, 0.3, 0.25, 0.1]) + elif kernel_size == 7: + p = torch.Tensor([0.075, 0.1, 0.175, 0.3, 0.175, 0.1, 0.075]) + elif kernel_size == 9: + p = torch.Tensor([0.05, 0.075, 0.1, 0.175, 0.2, 0.175, 0.1, 0.075, 0.05]) + else: + raise RuntimeError('Unsupported kernel size') + return categorical.Categorical(p).sample((self.in_channels,)) - (kernel_size // 2) + + def reset_parameters(self): + n = self.in_channels + for k in self.kernel_size: + n *= k + stdv = 1. 
/ math.sqrt(n) + self.weight.data.uniform_(-stdv, stdv) + if self.bias is not None: + self.bias.data.uniform_(-stdv, stdv) + +class ShiftConv2d(_ShiftConvNd): + + def __init__(self, in_channels, out_channels, kernel_size, stride=1, bias=False): + kernel_size = _pair(kernel_size) + stride = _pair(stride) + super(ShiftConv2d, self).__init__( + in_channels, out_channels, kernel_size, stride, bias) + ################################# ######### BatchNorm ########### ################################# @@ -134,5 +194,27 @@ class BatchNormFunction(torch.autograd.Function): def backward(ctx, dy): eps = ctx.eps x, gamma, beta, mean, var = ctx.saved_tensors - dx, dg, db = torch.ops.triton.batchnorm_dxdgdb(dy, x, gamma, mean, var, eps) - return dx, dg, db, None, None + dx, dg, db = torch.ops.triton.batchnorm_dxdgdb(dy.contiguous(), x, gamma, mean, var, eps) + return dx, dg, db, None + + +class _BatchNorm(torch.nn.Module): + + def __init__(self, num_features, eps=1e-5): + super(_BatchNorm, self).__init__() + self.num_features = num_features + self.eps = eps + self.weight = torch.nn.Parameter(torch.Tensor(num_features)) + self.bias = torch.nn.Parameter(torch.Tensor(num_features)) + self.reset_parameters() + + def reset_parameters(self): + torch.nn.init.uniform_(self.weight) + torch.nn.init.zeros_(self.bias) + + def forward(self, input): + return BatchNormFunction.apply(input, self.weight, self.bias, self.eps) + +class BatchNorm2d(_BatchNorm): + + pass diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 1d5fba379..c15b0e8af 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -100,7 +100,7 @@ def batch_norm_grad(op, dy, mean, var): def run_batchnorm(): - C, H, W, B = 1, 4, 4, 4 + C, H, W, B = 32, 14, 14, 64 np.random.seed(0) # Placeholders x = tf.placeholder(tf.float32, shape=[C, H, W, B]) @@ -117,7 +117,8 @@ def run_batchnorm(): sess = tf.InteractiveSession() sess.run(tf.global_variables_initializer()) result = sess.run([y, m, v], feed_dict = {x: hx, g: hg, b: hb}) - #print(result[0], result[1], result[2]) + print(result[1]) + print(np.mean(hx, (1, 2, 3))) grads = tf.test.compute_gradient([x, g, b], [(C, H, W, B), (C, ), (C, )], y, (C, H, W, B), extra_feed_dict = {x: hx, g: hg, b: hb}) dx_t, dx_n = grads[0] diff --git a/lib/dnn/batchnorm.cpp b/lib/dnn/batchnorm.cpp index 3085a5b44..54bb9c16e 100644 --- a/lib/dnn/batchnorm.cpp +++ b/lib/dnn/batchnorm.cpp @@ -59,7 +59,7 @@ void batchnorm_forward::enqueue_impl(driver::stream *stream, driver::kernel *ker { driver::buffer *y = args[0], *m = args[1], *v = args[2]; driver::buffer *x = args[3], *g = args[4], *b = args[5]; - std::array grid = {(size_t)C_, 1, 1}; + std::array grid = {1, (size_t)C_, 1}; kernel->setArg(0, y); kernel->setArg(1, m); kernel->setArg(2, v); @@ -86,7 +86,7 @@ void batchnorm(fp32 *Y, fp32 *M, fp32 *V, int32 rx[TM] = 0 ... TM; fp32 *px[TM]; fp32 x[TM]; - int32 c = get_range_id(0); + int32 c = get_range_id(1); fp32 g = *(G + c); fp32 b = *(B + c); @@ -112,7 +112,6 @@ void batchnorm(fp32 *Y, fp32 *M, fp32 *V, fp32 v = __sum(var) * rcpDHWN; fp32 *pv = V + c; *pv = v; - fp32 rstdg = 1 / sqrt(v + eps) * g; px = X + rx + c*DHWN; @@ -186,7 +185,7 @@ void batchnorm(fp32 *DX, fp32 *DG, fp32 *DB, restrict read_only fp32 *V, int32 DHWN, fp32 rcpDHWN, fp32 epsilon) { int32 rx[TM] = 0 ... 
TM; - int32 c = get_range_id(0); + int32 c = get_range_id(1); int32 offset = c*DHWN; fp32 g = *(G + c); fp32 mean = *(M + c); From 4ca83f19356fd104eb8469e7dfa74c17b2835855 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 10 Jul 2019 17:00:22 -0700 Subject: [PATCH 227/494] ugh bug in shift-conv striding --- examples/python/pytorch/run.py | 96 ++++++++++++++++++++++++++++++- examples/python/pytorch/shift.cpp | 6 ++ examples/python/pytorch/triton.py | 9 ++- examples/python/tensorflow/run.py | 12 ++-- lib/dnn/shift.cpp | 46 ++++++++++----- 5 files changed, 143 insertions(+), 26 deletions(-) diff --git a/examples/python/pytorch/run.py b/examples/python/pytorch/run.py index 86f489ccc..db7a1b152 100644 --- a/examples/python/pytorch/run.py +++ b/examples/python/pytorch/run.py @@ -6,10 +6,97 @@ import torch.nn.functional as F import torch.optim as optim from torchvision import datasets, transforms import triton +from torch.utils.cpp_extension import load +from torch.distributions import categorical -class Net(nn.Module): +shift_cuda = load( + 'shift_cuda', ['/home/philippe/development/shiftnet/kernels/shift_cuda.cpp', + '/home/philippe/development/shiftnet/kernels/shift_cuda_kernel.cu'], extra_cflags=['-O3']) + +class shift(torch.autograd.Function): + @staticmethod + def forward(ctx, x, shift): + ctx.save_for_backward(shift) + return shift_cuda.forward(x, shift) + + @staticmethod + def backward(ctx, grad_output): + shift, = ctx.saved_tensors + grad_output = shift_cuda.backward(grad_output, shift) + + return grad_output, None + + +class Shift(nn.Module): + def __init__(self, in_channels, kernel_size): + super(Shift, self).__init__() + self.channels = in_channels + self.kernel_size = kernel_size + if kernel_size == 3: + p = torch.Tensor([0., 1., 0.]) + elif kernel_size == 5: + p = torch.Tensor([0.1, 0.25, 0.3, 0.25, 0.1]) + elif kernel_size == 7: + p = torch.Tensor([0.075, 0.1, 0.175, 0.3, 0.175, 0.1, 0.075]) + elif kernel_size == 9: + p = torch.Tensor([0.05, 0.075, 0.1, 0.175, 0.2, 0.175, 0.1, 0.075, 0.05]) + else: + raise RuntimeError('Unsupported kernel size') + shift_t = categorical.Categorical(p).sample((in_channels, 2)) - (kernel_size // 2) + self.register_buffer('shift_t', shift_t.int()) + + def forward(self, x): + if x.is_cuda: + return shift.apply(x, self.shift_t) + else: + print('Shift only supports GPU for now..') + assert False + + def extra_repr(self): + s = ('{channels}, kernel_size={kernel_size}') + return s.format(**self.__dict__) + + +def ShiftConv2d(in_planes, out_planes, kernel_size=3, stride=1, groups=1, dilation=1): + return nn.Sequential( + Shift(in_planes, kernel_size), + nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, + padding=0, groups=groups, bias=False) + ) + + +class NetReference(nn.Module): def __init__(self): - super(Net, self).__init__() + super(NetReference, self).__init__() + #self.conv1 = ShiftConv2d(1, 32, 3, 2) + self.conv1 = triton.ShiftConv2d(1, 32, 3, 2) + self.bn1 = nn.BatchNorm2d(32) + #self.conv2a = ShiftConv2d(32, 32, 3, 1) + self.conv2b = triton.ShiftConv2d(32, 32, 3, 2) + #self.conv2b = ShiftConv2d(32, 32, 3, 2) + self.bn2 = nn.BatchNorm2d(32) + self.fc1 = nn.Linear(32*7*7, 500) + self.fc2 = nn.Linear(500, 10) + + def forward(self, x): + x = x.permute(1, 2, 3, 0).contiguous() + x = self.conv1(x) + x = x.permute(3, 0, 1, 2).contiguous() + x = self.bn1(x) + x = F.relu(x) + x = x.permute(1, 2, 3, 0).contiguous() + x = self.conv2b(x) + x = x.permute(3, 0, 1, 2).contiguous() + x = self.bn2(x) + x = F.relu(x) + x = x.view(-1, 32*7*7) + x 
= F.relu(self.fc1(x)) + x = self.fc2(x) + return F.log_softmax(x, dim=1) + +class NetTriton(nn.Module): + def __init__(self): + super(NetTriton, self).__init__() self.conv1 = triton.ShiftConv2d(1, 32, 3, 2) self.bn1 = triton.BatchNorm2d(32) self.conv2 = triton.ShiftConv2d(32, 64, 3, 2) @@ -23,6 +110,7 @@ class Net(nn.Module): x = self.bn1(x) x = F.relu(x) x = self.conv2(x) + x = self.bn2(x) x = F.relu(x) x = x.permute(3, 0, 1, 2).contiguous() x = x.view(-1, 64*7*7) @@ -30,6 +118,8 @@ class Net(nn.Module): x = self.fc2(x) return F.log_softmax(x, dim=1) +Net = NetReference() + def train(args, model, device, train_loader, optimizer, epoch): model.train() for batch_idx, (data, target) in enumerate(train_loader): @@ -107,7 +197,7 @@ def main(): batch_size=args.test_batch_size, shuffle=True, **kwargs) - model = Net().to(device) + model = Net.to(device) optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) for epoch in range(1, args.epochs + 1): diff --git a/examples/python/pytorch/shift.cpp b/examples/python/pytorch/shift.cpp index bde690b27..1da8f3fbd 100644 --- a/examples/python/pytorch/shift.cpp +++ b/examples/python/pytorch/shift.cpp @@ -53,6 +53,8 @@ torch::Tensor shift_y( int64_t R, int64_t S, int64_t stride_h, int64_t stride_w, const torch::Tensor shift_h, const torch::Tensor shift_w) { + CHECK_INPUT(x); + CHECK_INPUT(w); // shapes for a int64_t Ca = x.size(0); int64_t H = x.size(1); @@ -76,6 +78,8 @@ torch::Tensor shift_dx( int64_t R, int64_t S, int64_t stride_h, int64_t stride_w, const torch::Tensor shift_h, const torch::Tensor shift_w) { + CHECK_INPUT(dy); + CHECK_INPUT(w); // shapes for a int64_t Ca = dy.size(0); int64_t H = dy.size(1); @@ -104,6 +108,8 @@ torch::Tensor shift_dw( int64_t R, int64_t S, int64_t stride_h, int64_t stride_w, const torch::Tensor shift_h, const torch::Tensor shift_w) { + CHECK_INPUT(dy); + CHECK_INPUT(x); // shapes for a int64_t F = dy.size(0); int64_t Ha = dy.size(1); diff --git a/examples/python/pytorch/triton.py b/examples/python/pytorch/triton.py index 3b7b38b87..7f45daef0 100644 --- a/examples/python/pytorch/triton.py +++ b/examples/python/pytorch/triton.py @@ -1,5 +1,6 @@ import torch import math +import numpy as np from torch.nn.modules.utils import _single, _pair, _triple from torch.distributions import categorical @@ -117,11 +118,13 @@ class ShiftConvFunction(torch.autograd.Function): shift_w = ctx.shift_w dx = dw = dbias = None if ctx.needs_input_grad[0]: - dx = torch.ops.triton.shift_conv_dx(dy, weight, bias, width[0], width[1], stride[0], stride[1], shift_h, shift_w) + dx = torch.ops.triton.shift_conv_dx(dy.contiguous(), weight, bias, width[0], width[1], stride[0], stride[1], shift_h, shift_w) if ctx.needs_input_grad[1]: - dw = torch.ops.triton.shift_conv_dw(dy, input, bias, width[0], width[1], stride[0], stride[1], shift_h, shift_w) + dw = torch.ops.triton.shift_conv_dw(dy.contiguous(), input, bias, width[0], width[1], stride[0], stride[1], shift_h, shift_w) if ctx.needs_input_grad[2]: dbias = torch.sum(dy, (1, 2, 3)) + #print('dx', ctx.needs_input_grad[0], np.isnan(dx.cpu().numpy()).any()) + #print('dw', ctx.needs_input_grad[1], np.isnan(dw.cpu().numpy()).any()) return dx, dw, dbias, None, None, None, None @@ -149,7 +152,7 @@ class _ShiftConvNd(torch.nn.Module): def make_shift(self, kernel_size): if kernel_size == 3: - p = torch.Tensor([0.3, 0.4, 0.3]) + p = torch.Tensor([0., 1., 0.]) elif kernel_size == 5: p = torch.Tensor([0.1, 0.25, 0.3, 0.25, 0.1]) elif kernel_size == 7: diff --git a/examples/python/tensorflow/run.py 
b/examples/python/tensorflow/run.py index c15b0e8af..c44e0edab 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -58,8 +58,8 @@ def blocksparse_matmul_grad(op, dy): return (dx, dw) def run_shift(): - B, C, H, W = 16, 16, 4, 4 - R, S, F = 3, 3, 4 + B, C, H, W = 16, 1, 4, 4 + R, S, F = 3, 3, 32 stride_h, stride_w = 2, 2 np.random.seed(2) a = tf.placeholder(tf.float32, shape=[C, H, W, B]) @@ -68,8 +68,8 @@ def run_shift(): hshift_w = np.random.randint(- (S//2), R//2 + 1, size=C, dtype=np.int32) c = module.shift_conv(a, b, stride_h=stride_h, stride_w=stride_w, shift_h=tf.make_tensor_proto(hshift_h), shift_w=tf.make_tensor_proto(hshift_w)) # feed values - ha = np.random.rand(C, H, W, B) - hb = np.random.rand(C, F) + ha = np.ones((C, H, W, B), dtype=np.float32) + hb = np.ones((C, F), dtype=np.float32) sess = tf.InteractiveSession() # test grads = tf.test.compute_gradient([a, b], [(C, H, W, B), (C, F)], c, (F, H//stride_h, W//stride_w, B), @@ -128,5 +128,5 @@ def run_batchnorm(): print(np.max(np.abs(dg_t - dg_n))) print(np.max(np.abs(db_t - db_n))) -#run_shift() -run_batchnorm() +run_shift() +#run_batchnorm() diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index 2934017b7..cbae500ed 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -158,7 +158,7 @@ void shift::enqueue_impl(driver::stream *stream, driver::kernel *kernel, unsigned TM = ranges[0], TN = ranges[1]; std::array grid = {(M_ + TM - 1)/TM, (N_ + TN - 1)/TN, 1}; if(ty_ == BPROP) - ((driver::cu_buffer*)c)->set_zero(stream, M_*N_*4); + ((driver::cu_buffer*)c)->set_zero(stream, M_*N_*stride_h_*stride_w_*4); stream->enqueue(kernel, grid, {nthreads, 1, 1}); } @@ -217,6 +217,7 @@ void shift(restrict read_only align(16) )" << a_ty_ << R"( *a, if(ty_ == FPROP){ os << R"( int32 rawhc[TM] = rxa / ABS; + int32 rab[TM] = rxa % ABS; int32 raw[TM] = (rawhc % AW)*stride_w; int32 rahc[TM] = rawhc / AW; int32 rah[TM] = (rahc % AH)*stride_h; @@ -227,26 +228,32 @@ if(ty_ == FPROP){ int1 interior[TM, TK] = interiorh[:, newaxis] && interiorw[:, newaxis]; int32 inc_true[TM, TK] = d[newaxis, :]; int32 inc_false[TM, TK] = rka[newaxis, :] * lda; - int32 inc[TM, TK] = interior ? inc_true : inc_false;)"; + int32 inc[TM, TK] = interior ? inc_true : inc_false; + rxa = rab + raw*ABS + rah*ABS*AW; + int32 offa0[TM, TK] = rxa[:, newaxis];)"; +} +else{ + os << " int32 offa0[" << AS << "] = rxa" << bca1 << lda1 << ";" << std::endl; } if(ty_ == WGRAD){ os << R"( __constant__ int32* pd[TN] = delta + ryb; int32 d[TN] = *pd; - int32 shift[TK, TN] = d[newaxis, :];)"; + int32 shift[TK, TN] = d[newaxis, :]; + int32 rbwhc[TK] = rkb / ABS; + int32 rbw[TK] = (rbwhc % AW)*stride_w; + int32 rbhc[TK] = rbwhc / AW; + int32 rbh[TK] = (rbhc % AH)*stride_h; + )"; } os << R"( - )" << a_ty_ << "* pa[" << AS << "] = a + rxa" << bca1 << lda1 << " + " << rka << bca0 << lda0 << R"(; + )" << a_ty_ << "* pa[" << AS << "] = a + offa0 + " << rka << bca0 << lda0 << R"(; )" << b_ty_ << "* pb[" << BS << "] = b + ryb" << bcb1 << ldb1 << " + " << rkb << bcb0 << ldb0 << R"(; int1 checka[)" << AS << "] = (rka < K)" << bca0 << R"(; int1 checkb[)" << BS << "] = (rkb < K)" << bcb0 << R"(; )" << a_ty_ << " a[" << AS << R"(] = checka ? 
*pa : 0;)"; if(ty_ == WGRAD){ os << R"( - int32 rbwhc[TK] = rkb / ABS; - int32 rbw[TK] = (rbwhc % AW)*stride_w; - int32 rbhc[TK] = rbwhc / AW; - int32 rbh[TK] = (rbhc % AH)*stride_h; int1 interiorh[TK] = (rbh >= pad_h) && (rbh < (AH - pad_h)); int1 interiorw[TK] = (rbw >= pad_w) && (rbw < (AW - pad_w)); int1 interior[TK, TN] = interiorh[:, newaxis] && interiorw[:, newaxis]; @@ -301,17 +308,28 @@ else{ os << R"( } int32 rxc[TM] = get_global_range[TM](0); - int32 ryc[TN] = get_global_range[TN](1); - fp32* pc[TM, TN] = c + ryc[newaxis, :]*M + rxc[:, newaxis]; + int32 ryc[TN] = get_global_range[TN](1);)"; + if(ty_ == BPROP){ + os << R"( + int32 rcwhc[TM] = rxc / ABS; + int32 rcb[TM] = rxc % ABS; + int32 rcw[TM] = (rcwhc % AW)*stride_w; + int32 rchc[TM] = rcwhc / AW; + int32 rch[TM] = (rchc % AH)*stride_h; + rxc = rcb + rcw*ABS + rch*ABS*AW; + int32 offc0[TM, TN] = rxc[:, newaxis];)"; + } + else{ + os << R"( + int32 offc0[TM, TN] = rxc[:, newaxis];)"; + } + os << R"(" + fp32* pc[TM, TN] = c + ryc[newaxis, :]*M + offc0; int1 checkc0[TM] = rxc < M; int1 checkc1[TN] = ryc < N; int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];)"; if(ty_ == BPROP){ os << R"( - int32 rcwhc[TM] = rxc / ABS; - int32 rcw[TM] = (rcwhc % AW)*stride_w; - int32 rchc[TM] = rcwhc / AW; - int32 rch[TM] = (rchc % AH)*stride_h; int1 interiorh[TM] = (rch >= pad_h) && (rch < (AH - pad_h)); int1 interiorw[TM] = (rcw >= pad_w) && (rcw < (AW - pad_w)); int1 interior[TM, TN] = interiorh[:, newaxis] && interiorw[:, newaxis]; From 75cf2df110899efd81a09e4bc44e9b4ac2adcf2d Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 10 Jul 2019 19:49:31 -0700 Subject: [PATCH 228/494] [dnn/shift] many bugfixes in strided shift-conv --- examples/python/pytorch/run.py | 21 +++--- examples/python/pytorch/triton.py | 2 +- examples/python/tensorflow/run.py | 17 +++-- lib/dnn/shift.cpp | 117 ++++++++++++++++-------------- 4 files changed, 84 insertions(+), 73 deletions(-) diff --git a/examples/python/pytorch/run.py b/examples/python/pytorch/run.py index db7a1b152..906468fe6 100644 --- a/examples/python/pytorch/run.py +++ b/examples/python/pytorch/run.py @@ -33,7 +33,7 @@ class Shift(nn.Module): self.channels = in_channels self.kernel_size = kernel_size if kernel_size == 3: - p = torch.Tensor([0., 1., 0.]) + p = torch.Tensor([0.3, 0.4, 0.3]) elif kernel_size == 5: p = torch.Tensor([0.1, 0.25, 0.3, 0.25, 0.1]) elif kernel_size == 7: @@ -68,25 +68,24 @@ def ShiftConv2d(in_planes, out_planes, kernel_size=3, stride=1, groups=1, dilati class NetReference(nn.Module): def __init__(self): super(NetReference, self).__init__() - #self.conv1 = ShiftConv2d(1, 32, 3, 2) - self.conv1 = triton.ShiftConv2d(1, 32, 3, 2) + self.conv1 = ShiftConv2d(1, 32, 3, 2) + #self.conv1 = triton.ShiftConv2d(1, 32, 3, 2) self.bn1 = nn.BatchNorm2d(32) - #self.conv2a = ShiftConv2d(32, 32, 3, 1) - self.conv2b = triton.ShiftConv2d(32, 32, 3, 2) - #self.conv2b = ShiftConv2d(32, 32, 3, 2) + #self.conv2 = triton.ShiftConv2d(32, 32, 3, 2) + self.conv2 = ShiftConv2d(32, 32, 3, 2) self.bn2 = nn.BatchNorm2d(32) self.fc1 = nn.Linear(32*7*7, 500) self.fc2 = nn.Linear(500, 10) def forward(self, x): - x = x.permute(1, 2, 3, 0).contiguous() + #x = x.permute(1, 2, 3, 0).contiguous() x = self.conv1(x) - x = x.permute(3, 0, 1, 2).contiguous() + #x = x.permute(3, 0, 1, 2).contiguous() x = self.bn1(x) x = F.relu(x) - x = x.permute(1, 2, 3, 0).contiguous() - x = self.conv2b(x) - x = x.permute(3, 0, 1, 2).contiguous() + #x = x.permute(1, 2, 3, 0).contiguous() + x = self.conv2(x) + #x = 
x.permute(3, 0, 1, 2).contiguous() x = self.bn2(x) x = F.relu(x) x = x.view(-1, 32*7*7) diff --git a/examples/python/pytorch/triton.py b/examples/python/pytorch/triton.py index 7f45daef0..efeade389 100644 --- a/examples/python/pytorch/triton.py +++ b/examples/python/pytorch/triton.py @@ -152,7 +152,7 @@ class _ShiftConvNd(torch.nn.Module): def make_shift(self, kernel_size): if kernel_size == 3: - p = torch.Tensor([0., 1., 0.]) + p = torch.Tensor([0.3, 0.4, 0.3]) elif kernel_size == 5: p = torch.Tensor([0.1, 0.25, 0.3, 0.25, 0.1]) elif kernel_size == 7: diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index c44e0edab..ee1322d5c 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -58,24 +58,29 @@ def blocksparse_matmul_grad(op, dy): return (dx, dw) def run_shift(): - B, C, H, W = 16, 1, 4, 4 - R, S, F = 3, 3, 32 + B, C, H, W = 16, 16, 4, 4 + R, S, F = 3, 3, 16 stride_h, stride_w = 2, 2 np.random.seed(2) a = tf.placeholder(tf.float32, shape=[C, H, W, B]) b = tf.placeholder(tf.float32, shape=[C, F]) - hshift_h = np.random.randint(- (R//2), R//2 + 1, size=C, dtype=np.int32) - hshift_w = np.random.randint(- (S//2), R//2 + 1, size=C, dtype=np.int32) + #hshift_h = np.random.randint(- (R//2), R//2 + 1, size=C, dtype=np.int32) + #hshift_w = np.random.randint(- (S//2), R//2 + 1, size=C, dtype=np.int32) + hshift_h = np.zeros(C, dtype=np.int32) + hshift_w = np.zeros(C, dtype=np.int32) c = module.shift_conv(a, b, stride_h=stride_h, stride_w=stride_w, shift_h=tf.make_tensor_proto(hshift_h), shift_w=tf.make_tensor_proto(hshift_w)) # feed values - ha = np.ones((C, H, W, B), dtype=np.float32) - hb = np.ones((C, F), dtype=np.float32) + ha = np.random.rand(C, H, W, B) + hb = np.random.rand(C, F) + #ha = np.ones((C, H, W, B), dtype=np.float32) + #hb = np.ones((C, F), dtype=np.float32) sess = tf.InteractiveSession() # test grads = tf.test.compute_gradient([a, b], [(C, H, W, B), (C, F)], c, (F, H//stride_h, W//stride_w, B), extra_feed_dict = {a: ha, b: hb}) dw_t, dw_n = grads[1] dx_t, dx_n = grads[0] + print(dw_t, dw_n) print(np.max(np.abs(dw_t - dw_n))) print(np.max(np.abs(dx_t - dx_n))) # Run diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index cbae500ed..e537cc563 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -139,6 +139,13 @@ void shift::enqueue_impl(driver::stream *stream, driver::kernel *kernel, const std::vector &ranges, size_t nthreads) { int32_t lda = AT_ ? K_ : M_; int32_t ldb = BT_ ? 
N_ : K_; + int32_t ldc = M_; + if(ty_ == FPROP) + lda *= stride_h_*stride_w_; + if(ty_ == WGRAD) + ldb *= stride_h_*stride_w_; + if(ty_ == BPROP) + ldc *= stride_h_*stride_w_; driver::buffer *a = args[0], *b = args[1], *c = args[2]; kernel->setArg(0, a); kernel->setArg(1, b); @@ -150,15 +157,18 @@ void shift::enqueue_impl(driver::stream *stream, driver::kernel *kernel, kernel->setArg(7, stride_w_); kernel->setArg(8, lda); kernel->setArg(9, ldb); - kernel->setArg(10, B_); - kernel->setArg(11, AH_); - kernel->setArg(12, AW_); - kernel->setArg(13, BH_); - kernel->setArg(14, BW_); + kernel->setArg(10, ldc); + kernel->setArg(11, B_); + kernel->setArg(12, AH_); + kernel->setArg(13, AW_); + kernel->setArg(14, BH_); + kernel->setArg(15, BW_); + kernel->setArg(16, CH_); + kernel->setArg(17, CW_); unsigned TM = ranges[0], TN = ranges[1]; std::array grid = {(M_ + TM - 1)/TM, (N_ + TN - 1)/TN, 1}; if(ty_ == BPROP) - ((driver::cu_buffer*)c)->set_zero(stream, M_*N_*stride_h_*stride_w_*4); + ((driver::cu_buffer*)c)->set_zero(stream, ldc*N_*4); stream->enqueue(kernel, grid, {nthreads, 1, 1}); } @@ -205,22 +215,21 @@ void shift(restrict read_only align(16) )" << a_ty_ << R"( *a, fp32 *c, int32 M, int32 N, int32 K, int32 stride_h, int32 stride_w, - int32 lda, int32 ldb, - int32 ABS, int32 AH, int32 AW, int32 AR, int32 AS) { + int32 lda, int32 ldb, int32 ldc, + int32 NB, int32 AH, int32 AW, int32 BH, int32 BW, int32 CH, int32 CW) { int32 rxa[TM] = get_global_range[TM](0); int32 ryb[TN] = get_global_range[TN](1); int32 rka[TK] = 0 ... TK; int32 rkb[TK] = 0 ... TK; fp32 C[TM, TN] = 0; - int32 pad_h = AR / 2; - int32 pad_w = AS / 2;)"; + int32 pad_h = BH / 2; + int32 pad_w = BW / 2;)"; if(ty_ == FPROP){ os << R"( - int32 rawhc[TM] = rxa / ABS; - int32 rab[TM] = rxa % ABS; - int32 raw[TM] = (rawhc % AW)*stride_w; - int32 rahc[TM] = rawhc / AW; - int32 rah[TM] = (rahc % AH)*stride_h; + int32 rawh[TM] = rxa / NB; + int32 rab[TM] = rxa % NB; + int32 raw[TM] = (rawh % CW)*stride_w; + int32 rah[TM] = (rawh / CW)*stride_h; __constant__ int32* pd[TK] = delta + rka; multiple_of(4) int32 d[TK] = *pd; int1 interiorh[TM] = (rah >= pad_h) && (rah < (AH - pad_h)); @@ -229,43 +238,41 @@ if(ty_ == FPROP){ int32 inc_true[TM, TK] = d[newaxis, :]; int32 inc_false[TM, TK] = rka[newaxis, :] * lda; int32 inc[TM, TK] = interior ? inc_true : inc_false; - rxa = rab + raw*ABS + rah*ABS*AW; - int32 offa0[TM, TK] = rxa[:, newaxis];)"; + int32 offxa[TM] = rab + raw*NB + rah*NB*AW;)"; } else{ - os << " int32 offa0[" << AS << "] = rxa" << bca1 << lda1 << ";" << std::endl; + os << R"( + int32 offxa[TM] = rxa;)"; } if(ty_ == WGRAD){ os << R"( __constant__ int32* pd[TN] = delta + ryb; int32 d[TN] = *pd; int32 shift[TK, TN] = d[newaxis, :]; - int32 rbwhc[TK] = rkb / ABS; - int32 rbw[TK] = (rbwhc % AW)*stride_w; - int32 rbhc[TK] = rbwhc / AW; - int32 rbh[TK] = (rbhc % AH)*stride_h; - )"; -} - os << R"( - )" << a_ty_ << "* pa[" << AS << "] = a + offa0 + " << rka << bca0 << lda0 << R"(; - )" << b_ty_ << "* pb[" << BS << "] = b + ryb" << bcb1 << ldb1 << " + " << rkb << bcb0 << ldb0 << R"(; - int1 checka[)" << AS << "] = (rka < K)" << bca0 << R"(; - int1 checkb[)" << BS << "] = (rkb < K)" << bcb0 << R"(; - )" << a_ty_ << " a[" << AS << R"(] = checka ? *pa : 0;)"; -if(ty_ == WGRAD){ - os << R"( - int1 interiorh[TK] = (rbh >= pad_h) && (rbh < (AH - pad_h)); - int1 interiorw[TK] = (rbw >= pad_w) && (rbw < (AW - pad_w)); - int1 interior[TK, TN] = interiorh[:, newaxis] && interiorw[:, newaxis]; - int32 inc[TK, TN] = interior ? 
shift : 0; - )" << b_ty_ << R"(* shifted_pb[TK, TN] = pb + inc; - )" << b_ty_ << R"( b[TK, TN] = checkb ? *shifted_pb : 0;)"; + int32 rbwh[TK] = rkb / NB; + int32 rbb[TK] = rkb % NB; + int32 rbw[TK] = (rbwh % CW)*stride_w; + int32 rbh[TK] = (rbwh / CW)*stride_h; + int32 offkb[TK] = rbb + rbw*NB + rbh*NB*AW; + int1 interiorh[TK] = (rbh >= pad_h) && (rbh < (AH - pad_h)); + int1 interiorw[TK] = (rbw >= pad_w) && (rbw < (AW - pad_w)); + int1 interior[TK, TN] = interiorh[:, newaxis] && interiorw[:, newaxis]; + int32 inc[TK, TN] = interior ? shift : 0; + )" << b_ty_ << "* pb_base[" << BS << "] = b + ryb" << bcb1 << ldb1 << R"(; + )" << b_ty_ << "* pb[" << BS << "] = pb_base + offkb[:, newaxis] + inc;"; } else{ os << R"( - )" << b_ty_ << " b[" << BS << R"(] = checkb ? *pb : 0;)"; + int32 offkb[TK] = rkb; + )" << b_ty_ << "* pb[" << BS << "] = b + ryb" << bcb1 << ldb1 << " + " << "offkb" << bcb0 << ldb0 << R"(; + )"; } os << R"( + )" << a_ty_ << "* pa[" << AS << "] = a + offxa" << bca1 << lda1 << " + " << rka << bca0 << lda0 << R"(; + int1 checka[)" << AS << "] = (rka < K)" << bca0 << R"(; + int1 checkb[)" << BS << "] = (rkb < K)" << bcb0 << R"(; + )" << a_ty_ << " a[" << AS << R"(] = checka ? *pa : 0; + )" << b_ty_ << " b[" << BS << R"(] = checkb ? *pb : 0; for(int32 k = K; k > 0; k = k - TK){ C = dot()" << usea << "," << useb << R"(, C); int1 checka[)" << AS << R"(] = k > TK; @@ -287,18 +294,18 @@ else{ } if(ty_ == WGRAD){ os << R"( - pb = pb + TK)" << ldb0 << R"(; rkb = rkb + TK; - rbwhc = rkb / ABS; - rbw = (rbwhc % AW)*stride_w; - rbhc = rbwhc / AW; - rbh = (rbhc % AH)*stride_h; + rbwh = rkb / NB; + rbb = rkb % NB; + rbw = (rbwh % CW)*stride_w; + rbh = (rbwh / CW)*stride_h; + offkb = rbb + rbw*NB + rbh*NB*AW; interiorh = (rbh >= pad_h) && (rbh < (AH - pad_h)); interiorw = (rbw >= pad_w) && (rbw < (AW - pad_w)); interior = interiorh[:, newaxis] && interiorw[:, newaxis]; inc = interior ? 
shift : 0; - shifted_pb = pb + inc; - @checkb b = *shifted_pb;)"; + pb = pb_base + offkb[:, newaxis] + inc; + @checkb b = *pb;)"; } else{ os << R"( @@ -311,20 +318,20 @@ else{ int32 ryc[TN] = get_global_range[TN](1);)"; if(ty_ == BPROP){ os << R"( - int32 rcwhc[TM] = rxc / ABS; - int32 rcb[TM] = rxc % ABS; - int32 rcw[TM] = (rcwhc % AW)*stride_w; - int32 rchc[TM] = rcwhc / AW; - int32 rch[TM] = (rchc % AH)*stride_h; - rxc = rcb + rcw*ABS + rch*ABS*AW; - int32 offc0[TM, TN] = rxc[:, newaxis];)"; + int32 rcwh[TM] = rxc / NB; + int32 rcb[TM] = rxc % NB; + int32 rcw[TM] = (rcwh % CW) * stride_w; + int32 rch[TM] = (rcwh / CW) * stride_h; + int32 offxc[TM] = rcb + rcw*NB + rch*NB*AW; + )"; } else{ os << R"( - int32 offc0[TM, TN] = rxc[:, newaxis];)"; + int32 offxc[TM] = rxc; + )"; } os << R"(" - fp32* pc[TM, TN] = c + ryc[newaxis, :]*M + offc0; + fp32* pc[TM, TN] = c + ryc[newaxis, :]*ldc + offxc[:, newaxis]; int1 checkc0[TM] = rxc < M; int1 checkc1[TN] = ryc < N; int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];)"; From 207e021973e48ae0a58589f08c037ae022c32ddc Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 11 Jul 2019 16:38:58 -0700 Subject: [PATCH 229/494] [codegen/shift] substantial cleaning of triton-c shift-conv code --- examples/python/pytorch/run.py | 8 +- examples/python/tensorflow/run.py | 12 +- include/triton/dnn/shift.h | 20 +- lib/dnn/shift.cpp | 305 +++++++++++++++++------------- 4 files changed, 191 insertions(+), 154 deletions(-) diff --git a/examples/python/pytorch/run.py b/examples/python/pytorch/run.py index 906468fe6..488f547f8 100644 --- a/examples/python/pytorch/run.py +++ b/examples/python/pytorch/run.py @@ -68,11 +68,11 @@ def ShiftConv2d(in_planes, out_planes, kernel_size=3, stride=1, groups=1, dilati class NetReference(nn.Module): def __init__(self): super(NetReference, self).__init__() - self.conv1 = ShiftConv2d(1, 32, 3, 2) - #self.conv1 = triton.ShiftConv2d(1, 32, 3, 2) + #self.conv1 = ShiftConv2d(1, 32, 3, 2) + self.conv1 = triton.ShiftConv2d(1, 32, 3, 2) self.bn1 = nn.BatchNorm2d(32) - #self.conv2 = triton.ShiftConv2d(32, 32, 3, 2) - self.conv2 = ShiftConv2d(32, 32, 3, 2) + self.conv2 = triton.ShiftConv2d(32, 32, 3, 2) + #self.conv2 = ShiftConv2d(32, 32, 3, 2) self.bn2 = nn.BatchNorm2d(32) self.fc1 = nn.Linear(32*7*7, 500) self.fc2 = nn.Linear(500, 10) diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index ee1322d5c..375c45227 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -59,15 +59,15 @@ def blocksparse_matmul_grad(op, dy): def run_shift(): B, C, H, W = 16, 16, 4, 4 - R, S, F = 3, 3, 16 + R, S, F = 3, 3, 32 stride_h, stride_w = 2, 2 np.random.seed(2) a = tf.placeholder(tf.float32, shape=[C, H, W, B]) b = tf.placeholder(tf.float32, shape=[C, F]) - #hshift_h = np.random.randint(- (R//2), R//2 + 1, size=C, dtype=np.int32) - #hshift_w = np.random.randint(- (S//2), R//2 + 1, size=C, dtype=np.int32) - hshift_h = np.zeros(C, dtype=np.int32) - hshift_w = np.zeros(C, dtype=np.int32) + hshift_h = np.random.randint(- (R//2), R//2 + 1, size=C, dtype=np.int32) + hshift_w = np.random.randint(- (S//2), R//2 + 1, size=C, dtype=np.int32) + #hshift_h = np.zeros(C, dtype=np.int32) + #hshift_w = np.zeros(C, dtype=np.int32) c = module.shift_conv(a, b, stride_h=stride_h, stride_w=stride_w, shift_h=tf.make_tensor_proto(hshift_h), shift_w=tf.make_tensor_proto(hshift_w)) # feed values ha = np.random.rand(C, H, W, B) @@ -122,8 +122,6 @@ def run_batchnorm(): sess = tf.InteractiveSession() 
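# --- editor's note (illustrative, not part of the patch) ---------------------
# Both run_shift() and run_batchnorm() validate gradients the same way:
# tf.test.compute_gradient returns a (theoretical, numerical) Jacobian pair
# per input, and these scripts print the max absolute difference as the
# error metric. A minimal sketch of that check, assuming a placeholder `x`
# of shape `x_shape`, an output `y` of shape `y_shape`, and a feed value `hx`:
#
#   grads = tf.test.compute_gradient([x], [x_shape], y, y_shape,
#                                    extra_feed_dict={x: hx})
#   jacob_t, jacob_n = grads[0]
#   print(np.max(np.abs(jacob_t - jacob_n)))  # ~0 when gradients are correct
# ------------------------------------------------------------------------------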
sess.run(tf.global_variables_initializer()) result = sess.run([y, m, v], feed_dict = {x: hx, g: hg, b: hb}) - print(result[1]) - print(np.mean(hx, (1, 2, 3))) grads = tf.test.compute_gradient([x, g, b], [(C, H, W, B), (C, ), (C, )], y, (C, H, W, B), extra_feed_dict = {x: hx, g: hg, b: hb}) dx_t, dx_n = grads[0] diff --git a/include/triton/dnn/shift.h b/include/triton/dnn/shift.h index fdf2a0eaf..8386ee83e 100644 --- a/include/triton/dnn/shift.h +++ b/include/triton/dnn/shift.h @@ -63,11 +63,9 @@ public: type ty = FPROP, bool bias = false); // look-up table - void build_deltas(); + void build_delta_a(); void build_masks(); // accessors - size_t a_size(); - size_t b_size(); size_t c_size(); std::vector c_shapes(); // number of flops @@ -130,25 +128,23 @@ private: int32_t N_; int32_t K_; // shapes - std::vector shapes_a_; - std::vector shapes_b_; std::vector shapes_c_; // strides int32_t stride_d_; int32_t stride_h_; int32_t stride_w_; // memory strides - std::vector ld_a_; - std::vector ld_b_; - std::vector ld_c_; + int32_t lda_n_, lda_c_, lda_h_, lda_w_; + int32_t ldb_n_, ldb_c_, ldb_h_, ldb_w_; + int32_t ldc_n_, ldc_f_, ldc_h_, ldc_w_; // shift values const int32_t* shift_h_; const int32_t* shift_w_; // look-up tables - std::vector h_deltas_; - std::vector h_masks_; - driver::buffer* d_deltas_; - driver::buffer* d_masks_; + std::vector h_delta_a; + std::vector h_delta_b; + driver::buffer* d_delta_a; + driver::buffer* d_delta_b; // data types std::string a_ty_; std::string b_ty_; diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index e537cc563..4b65b6e8f 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -26,86 +26,92 @@ shift::shift(int B, int C, // max number of channels TK_ = 16; MAX_C_ = 8192 + TK_; - // transpose - AT_ = false; - BT_ = true; // activation sizes CD_ = AD_ / stride_d_; CH_ = AH_ / stride_h_; CW_ = AW_ / stride_w_; - // equivalent matmul + // A memory strides: [C, H, W, B] + lda_n_ = 1; + lda_w_ = B_; + lda_h_ = B_*AW_; + lda_c_ = B_*AW_*AH_; + // B memory strides: [C, F] + ldb_n_ = 1; + ldb_h_ = 1; + ldb_w_ = 1; + ldb_c_ = F_; + // C memory strides: [F, H, W, B] + ldc_n_ = 1; + ldc_w_ = B_; + ldc_h_ = B_*CW_; + ldc_f_ = B_*CW_*CH_; + // C shapes + shapes_c_ = {F, CH_, CW_, B}; + // Equivalent matmul M_ = B_*CH_*CW_; N_ = F_; K_ = C_; - // shapes - // input layout: C, H, W, B - // filter layout: C, F - // output layout: F, H, W, B - shapes_a_ = {C, AH_, AW_, B}; - shapes_b_ = {C, F}; - shapes_c_ = {F, CH_, CW_, B}; + // transpose + AT_ = false; + BT_ = true; + // Weight gradient if(ty_ == WGRAD){ - shapes_b_.swap(shapes_c_); - shapes_a_.swap(shapes_b_); + std::swap(ldb_n_, ldc_n_); + std::swap(ldb_w_, ldc_w_); + std::swap(ldb_h_, ldc_h_); + std::swap(ldb_c_, ldc_f_); + std::swap(lda_n_, ldb_n_); + std::swap(lda_w_, ldb_w_); + std::swap(lda_h_, ldb_h_); + std::swap(lda_c_, ldb_c_); + std::swap(M_, K_); + std::swap(M_, N_); AT_ = true; BT_ = false; - M_ = F_; - N_ = C_; - K_ = B_*CH_*CW_; + shapes_c_ = {C, F}; } + // Input gradient if(ty_ == BPROP){ - shapes_a_.swap(shapes_c_); + std::swap(lda_n_, ldc_n_); + std::swap(lda_w_, ldc_w_); + std::swap(lda_h_, ldc_h_); + std::swap(lda_c_, ldc_f_); + std::swap(K_, N_); AT_ = false; BT_ = false; - K_ = F_; - M_ = B_*CH_*CW_; - N_ = C_; + shapes_c_ = {C, AH_, AW_, B}; } - // memory strides - set_ld(shapes_a_, ld_a_); - set_ld(shapes_b_, ld_b_); - set_ld(shapes_c_, ld_c_); } base* shift::clone() const { return new shift(*this); } -void shift::build_deltas() { - h_deltas_.resize(MAX_C_); +void shift::build_delta_a() { + 
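// --- editor's note (illustrative, not part of the patch) ---------------------
// delta_a is a constant-memory look-up table of pointer offsets. For FPROP,
// entries [0, TK) hold the absolute offset of each of the first TK channels'
// shifted pixels, and entries [TK, TK + C) hold the increment from channel c
// to channel c + TK, so the kernel can advance its A pointers one TK-wide
// step per loop iteration. In outline (mirroring the code below):
//
//   for (unsigned c = 0; c < TK; c++)   // absolute offsets for the first tile
//     delta[c] = offset(c);
//   for (unsigned c = 0; c < C; c++)    // relative step to the next K-tile
//     delta[TK + c] = offset(c + TK) - offset(c);
// ------------------------------------------------------------------------------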
h_delta_a.resize(MAX_C_); if(ty_ == FPROP){ // compute offset auto offset = [&](unsigned c) { - return c*ld_a_[0] + shift_h_[c]*ld_a_[1] + shift_w_[c]*ld_a_[2]; + return c*lda_c_ + shift_h_[c]*lda_h_ + shift_w_[c]*lda_w_; }; // populate look-up table for(unsigned c = 0; c < TK_; c++) - h_deltas_[c] = offset(c); + h_delta_a[c] = offset(c); for(unsigned c = 0; c < C_; c++) - h_deltas_[TK_ + c] = offset(c + TK_) - offset(c); + h_delta_a[TK_ + c] = offset(c + TK_) - offset(c); } if(ty_ == BPROP){ for(unsigned c = 0; c < C_; c++){ - h_deltas_[c] = shift_h_[c]*ld_c_[1] + shift_w_[c]*ld_c_[2]; + h_delta_a[c] = shift_h_[c]*ldc_h_ + shift_w_[c]*ldc_w_; } } if(ty_ == WGRAD){ for(unsigned c = 0; c < C_; c++) - h_deltas_[c] = shift_h_[c]*ld_b_[1] + shift_w_[c]*ld_b_[2]; + h_delta_a[c] = shift_h_[c]*ldb_h_ + shift_w_[c]*ldb_w_; } } -size_t shift::a_size(){ - return std::accumulate(shapes_a_.begin(), shapes_a_.end(), - 1, std::multiplies()); -} - -size_t shift::b_size(){ - return std::accumulate(shapes_b_.begin(), shapes_b_.end(), - 1, std::multiplies()); -} - -size_t shift::c_size(){ +size_t shift::c_size() { return std::accumulate(shapes_c_.begin(), shapes_c_.end(), 1, std::multiplies()); } @@ -129,23 +135,14 @@ bool shift::operator <(const base& other) const{ } void shift::init_impl(driver::stream *stream, driver::cu_module *module) { - build_deltas(); - triton::driver::buffer* delta = ((triton::driver::cu_module*)module)->symbol("delta"); - stream->write(delta, false, 0, h_deltas_.size()*4, h_deltas_.data()); + build_delta_a(); + triton::driver::buffer* delta_a = ((triton::driver::cu_module*)module)->symbol("delta_a"); + stream->write(delta_a, false, 0, h_delta_a.size()*4, h_delta_a.data()); } void shift::enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, const std::vector &ranges, size_t nthreads) { - int32_t lda = AT_ ? K_ : M_; - int32_t ldb = BT_ ? 
N_ : K_; - int32_t ldc = M_; - if(ty_ == FPROP) - lda *= stride_h_*stride_w_; - if(ty_ == WGRAD) - ldb *= stride_h_*stride_w_; - if(ty_ == BPROP) - ldc *= stride_h_*stride_w_; driver::buffer *a = args[0], *b = args[1], *c = args[2]; kernel->setArg(0, a); kernel->setArg(1, b); @@ -155,20 +152,29 @@ void shift::enqueue_impl(driver::stream *stream, driver::kernel *kernel, kernel->setArg(5, K_); kernel->setArg(6, stride_h_); kernel->setArg(7, stride_w_); - kernel->setArg(8, lda); - kernel->setArg(9, ldb); - kernel->setArg(10, ldc); - kernel->setArg(11, B_); - kernel->setArg(12, AH_); - kernel->setArg(13, AW_); - kernel->setArg(14, BH_); - kernel->setArg(15, BW_); - kernel->setArg(16, CH_); - kernel->setArg(17, CW_); + kernel->setArg(8, lda_n_); + kernel->setArg(9, lda_w_); + kernel->setArg(10, lda_h_); + kernel->setArg(11, lda_c_); + kernel->setArg(12, ldb_n_); + kernel->setArg(13, ldb_w_); + kernel->setArg(14, ldb_h_); + kernel->setArg(15, ldb_c_); + kernel->setArg(16, ldc_n_); + kernel->setArg(17, ldc_w_); + kernel->setArg(18, ldc_h_); + kernel->setArg(19, ldc_f_); + kernel->setArg(20, B_); + kernel->setArg(21, AH_); + kernel->setArg(22, AW_); + kernel->setArg(23, BH_); + kernel->setArg(24, BW_); + kernel->setArg(25, CH_); + kernel->setArg(26, CW_); unsigned TM = ranges[0], TN = ranges[1]; std::array grid = {(M_ + TM - 1)/TM, (N_ + TN - 1)/TN, 1}; if(ty_ == BPROP) - ((driver::cu_buffer*)c)->set_zero(stream, ldc*N_*4); + ((driver::cu_buffer*)c)->set_zero(stream, AH_*AW_*B_*C_*4); stream->enqueue(kernel, grid, {nthreads, 1, 1}); } @@ -176,28 +182,16 @@ void shift::triton_c_src(std::ostream &os) const { std::string AS0 = "TM", AS1 = "TK"; std::string BS0 = "TK", BS1 = "TN"; std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; - std::string ldb0 = "", ldb1 = "*ldb"; std::string usea = AT_ ? "trans(a)" : "a"; std::string useb = BT_ ? "trans(b)" : "b"; - std::string rkb = "rkb"; - std::string rka = "rka"; std::string bca0 = "[newaxis, :]", bca1 = "[:, newaxis]"; - std::string lda0 = "*lda", lda1 = ""; - if(ty_ == FPROP){ - rka = "inc"; - bca0 = ""; - lda0 = ""; - } - if(AT_){ std::swap(AS0, AS1); std::swap(bca0, bca1); - std::swap(lda0, lda1); } if(BT_){ std::swap(BS0, BS1); std::swap(bcb0, bcb1); - std::swap(ldb0, ldb1); } std::string AS = AS0 + ", " + AS1; std::string BS = BS0 + ", " + BS1; @@ -208,90 +202,125 @@ const tunable int32 TM = {16, 32, 64, 128}; const tunable int32 TN = {16, 32, 64, 128}; const tunable int32 TK = {)" << TK_ << R"(}; -__constant__ int32* delta = alloc_const int32[)" << MAX_C_ << R"(]; +__constant__ int32* delta_a = alloc_const int32[)" << MAX_C_ << R"(]; -void shift(restrict read_only align(16) )" << a_ty_ << R"( *a, - restrict read_only align(16) )" << b_ty_ << R"( *b, - fp32 *c, +void shift(restrict read_only align(16) )" << a_ty_ << R"( *A, + restrict read_only align(16) )" << b_ty_ << R"( *B, + fp32 *C, int32 M, int32 N, int32 K, int32 stride_h, int32 stride_w, - int32 lda, int32 ldb, int32 ldc, - int32 NB, int32 AH, int32 AW, int32 BH, int32 BW, int32 CH, int32 CW) { + int32 lda_b, int32 lda_w, int32 lda_h, int32 lda_c, + int32 ldb_b, int32 ldb_w, int32 ldb_h, int32 ldb_c, + int32 ldc_b, int32 ldc_w, int32 ldc_h, int32 ldc_c, + int32 NB, int32 AH, int32 AW, + int32 BH, int32 BW, + int32 CH, int32 CW) { int32 rxa[TM] = get_global_range[TM](0); int32 ryb[TN] = get_global_range[TN](1); int32 rka[TK] = 0 ... TK; int32 rkb[TK] = 0 ... 
TK; - fp32 C[TM, TN] = 0; + fp32 c[TM, TN] = 0; int32 pad_h = BH / 2; int32 pad_w = BW / 2;)"; + +/* A offsets */ if(ty_ == FPROP){ os << R"( - int32 rawh[TM] = rxa / NB; - int32 rab[TM] = rxa % NB; - int32 raw[TM] = (rawh % CW)*stride_w; - int32 rah[TM] = (rawh / CW)*stride_h; - __constant__ int32* pd[TK] = delta + rka; + int32 rawh[TM] = rxa / NB; + int32 rab[TM] = rxa % NB; + int32 raw[TM] = (rawh % CW) * stride_w; + int32 rah[TM] = (rawh / CW) * stride_h; + int32 offxa[TM] = rab*lda_b + raw*lda_w + rah*lda_h; + int32 offa0[TM, TK] = offxa[:, newaxis]; + __constant__ int32* pd[TK] = delta_a + rka; multiple_of(4) int32 d[TK] = *pd; + int32 offa_interior[TM, TK] = d[newaxis, :]; + int32 offa_exterior[TM, TK] = rka[newaxis, :] * lda_c; int1 interiorh[TM] = (rah >= pad_h) && (rah < (AH - pad_h)); int1 interiorw[TM] = (raw >= pad_w) && (raw < (AW - pad_w)); int1 interior[TM, TK] = interiorh[:, newaxis] && interiorw[:, newaxis]; - int32 inc_true[TM, TK] = d[newaxis, :]; - int32 inc_false[TM, TK] = rka[newaxis, :] * lda; - int32 inc[TM, TK] = interior ? inc_true : inc_false; - int32 offxa[TM] = rab + raw*NB + rah*NB*AW;)"; + int32 offa1[TM, TK] = interior ? offa_interior : offa_exterior;)"; } -else{ +if(ty_ == BPROP){ os << R"( - int32 offxa[TM] = rxa;)"; + int32 rawh[TM] = rxa / NB; + int32 rab[TM] = rxa % NB; + int32 raw[TM] = (rawh % CW); + int32 rah[TM] = (rawh / CW); + int32 offxa[TM] = rab*lda_b + raw*lda_w + rah*lda_h; + int32 offa0[TM, TK] = offxa[:, newaxis]; + int32 offa1[TM, TK] = rka[newaxis, :] * lda_c;)"; } if(ty_ == WGRAD){ os << R"( - __constant__ int32* pd[TN] = delta + ryb; + int32 offa0[TK, TM] = rxa[newaxis, :] * lda_c; + int32 offa1[TK, TM] = rka[:, newaxis];)"; +} + +/* B offsets */ +if(ty_ == FPROP){ + os << R"( + int32 offb0[TN, TK] = ryb[:, newaxis]; + int32 offb1[TN, TK] = rkb[newaxis, :] * ldb_c;)"; +} +if(ty_ == BPROP){ + os << R"( + int32 offb0[TK, TN] = ryb[newaxis, :] * ldb_c; + int32 offb1[TK, TN] = rkb[:, newaxis];)"; +} +if(ty_ == WGRAD){ + os << R"( + __constant__ int32* pd[TN] = delta_a + ryb; int32 d[TN] = *pd; int32 shift[TK, TN] = d[newaxis, :]; int32 rbwh[TK] = rkb / NB; int32 rbb[TK] = rkb % NB; int32 rbw[TK] = (rbwh % CW)*stride_w; int32 rbh[TK] = (rbwh / CW)*stride_h; - int32 offkb[TK] = rbb + rbw*NB + rbh*NB*AW; + int32 offkb[TK] = rbb*ldb_b + rbw*ldb_w + rbh*ldb_h; int1 interiorh[TK] = (rbh >= pad_h) && (rbh < (AH - pad_h)); int1 interiorw[TK] = (rbw >= pad_w) && (rbw < (AW - pad_w)); int1 interior[TK, TN] = interiorh[:, newaxis] && interiorw[:, newaxis]; - int32 inc[TK, TN] = interior ? shift : 0; - )" << b_ty_ << "* pb_base[" << BS << "] = b + ryb" << bcb1 << ldb1 << R"(; - )" << b_ty_ << "* pb[" << BS << "] = pb_base + offkb[:, newaxis] + inc;"; + int32 incb[TK, TN] = interior ? shift : 0; + int32 offb0[TK, TN] = ryb[newaxis, :] * ldb_c; + int32 offb1[TK, TN] = offkb[:, newaxis] + incb;)"; } -else{ + +/* Main loop */ os << R"( - int32 offkb[TK] = rkb; - )" << b_ty_ << "* pb[" << BS << "] = b + ryb" << bcb1 << ldb1 << " + " << "offkb" << bcb0 << ldb0 << R"(; - )"; -} - os << R"( - )" << a_ty_ << "* pa[" << AS << "] = a + offxa" << bca1 << lda1 << " + " << rka << bca0 << lda0 << R"(; + )" << a_ty_ << "* pa[" << AS << R"(] = A + offa0 + offa1; + )" << b_ty_ << "* pb[" << BS << R"(] = B + offb0 + offb1; int1 checka[)" << AS << "] = (rka < K)" << bca0 << R"(; int1 checkb[)" << BS << "] = (rkb < K)" << bcb0 << R"(; )" << a_ty_ << " a[" << AS << R"(] = checka ? *pa : 0; )" << b_ty_ << " b[" << BS << R"(] = checkb ? 
*pb : 0; for(int32 k = K; k > 0; k = k - TK){ - C = dot()" << usea << "," << useb << R"(, C); + c = dot()" << usea << "," << useb << R"(, c); int1 checka[)" << AS << R"(] = k > TK; int1 checkb[)" << BS << R"(] = k > TK;)"; + +/* Increment A pointers */ if(ty_ == FPROP){ os << R"( pd = pd + TK; d = *pd; - inc_true = d[newaxis, :]; - inc_false = TK * lda; - inc = interior ? inc_true : inc_false; - pa = pa + inc; - @checka a = *pa;)"; + offa_interior = d[newaxis, :]; + offa_exterior = TK * lda_c; + int32 offa[TM, TK] = interior ? offa_interior : offa_exterior; + pa = pa + offa;)"; } -else{ +if(ty_ == BPROP){ + os << R"( + pa = pa + TK * lda_c;)"; +} +if(ty_ == WGRAD){ os << R"( - pa = pa + TK)" << lda0 << R"(; - @checka a = *pa;)"; + pa = pa + TK;)"; } + os << R"( + @checka a = *pa;)"; + +/* Increment B pointers */ if(ty_ == WGRAD){ os << R"( rkb = rkb + TK; @@ -299,39 +328,53 @@ if(ty_ == WGRAD){ rbb = rkb % NB; rbw = (rbwh % CW)*stride_w; rbh = (rbwh / CW)*stride_h; - offkb = rbb + rbw*NB + rbh*NB*AW; + offkb = rbb*ldb_b + rbw*ldb_w + rbh*ldb_h; interiorh = (rbh >= pad_h) && (rbh < (AH - pad_h)); interiorw = (rbw >= pad_w) && (rbw < (AW - pad_w)); interior = interiorh[:, newaxis] && interiorw[:, newaxis]; - inc = interior ? shift : 0; - pb = pb_base + offkb[:, newaxis] + inc; - @checkb b = *pb;)"; + incb = interior ? shift : 0; + pb = B + offb0 + offkb[:, newaxis] + incb;)"; } -else{ +if(ty_ == FPROP){ os << R"( - pb = pb + TK)" << ldb0 << R"(; - @checkb b = *pb;)"; + pb = pb + TK * ldb_c;)"; +} +if(ty_ == BPROP){ + os << R"( + pb = pb + TK;)"; } os << R"( + @checkb b = *pb; } int32 rxc[TM] = get_global_range[TM](0); int32 ryc[TN] = get_global_range[TN](1);)"; - if(ty_ == BPROP){ + +/* C offsets */ +if(ty_ == BPROP){ os << R"( int32 rcwh[TM] = rxc / NB; int32 rcb[TM] = rxc % NB; int32 rcw[TM] = (rcwh % CW) * stride_w; int32 rch[TM] = (rcwh / CW) * stride_h; - int32 offxc[TM] = rcb + rcw*NB + rch*NB*AW; + int32 offxc[TM] = rcb*ldc_b + rcw*ldc_w + rch*ldc_h; )"; } - else{ +if(ty_ == FPROP){ + os << R"( + int32 rcwh[TM] = rxc / NB; + int32 rcb[TM] = rxc % NB; + int32 rcw[TM] = (rcwh % CW); + int32 rch[TM] = (rcwh / CW); + int32 offxc[TM] = rcb*ldc_b + rcw*ldc_w + rch*ldc_h; + )"; +} +if(ty_ == WGRAD){ os << R"( int32 offxc[TM] = rxc; )"; - } +} os << R"(" - fp32* pc[TM, TN] = c + ryc[newaxis, :]*ldc + offxc[:, newaxis]; + fp32* pc[TM, TN] = C + offxc[:, newaxis] + ryc[newaxis, :]*ldc_c; int1 checkc0[TM] = rxc < M; int1 checkc1[TN] = ryc < N; int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];)"; @@ -340,15 +383,15 @@ if(ty_ == BPROP){ int1 interiorh[TM] = (rch >= pad_h) && (rch < (AH - pad_h)); int1 interiorw[TM] = (rcw >= pad_w) && (rcw < (AW - pad_w)); int1 interior[TM, TN] = interiorh[:, newaxis] && interiorw[:, newaxis]; - __constant__ int32* pd[TN] = delta + ryc; + __constant__ int32* pd[TN] = delta_a + ryc; fp32* shift_pc[TM, TN] = pc + (*pd)[newaxis, :]; pc = interior ? 
shift_pc : pc; - @checkc __atomic_add(pc, C); + @checkc __atomic_add(pc, c); )"; } else{ os << R"( - @checkc *pc = C;)"; + @checkc *pc = c;)"; } os << R"( })"; From fe8caf12f0f4f748f4a3209f8a77993d3959b823 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 11 Jul 2019 20:34:38 -0700 Subject: [PATCH 230/494] [dnn/conv]: skeleton for NCHW layout --- include/triton/dnn/shift.h | 7 +++ lib/dnn/shift.cpp | 94 ++++++++++++++++++++++++++++++-------- 2 files changed, 83 insertions(+), 18 deletions(-) diff --git a/include/triton/dnn/shift.h b/include/triton/dnn/shift.h index 8386ee83e..b85ffe299 100644 --- a/include/triton/dnn/shift.h +++ b/include/triton/dnn/shift.h @@ -44,6 +44,11 @@ public: WGRAD }; + enum layout_t { + NCHW, + CHWN + }; + private: // initialize and enqueue void init_impl(driver::stream *stream, driver::cu_module *module); @@ -154,6 +159,8 @@ private: // transpose bool AT_; bool BT_; + // layout + layout_t layout_; }; } diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index 4b65b6e8f..0bdcb49e2 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -22,7 +22,8 @@ shift::shift(int B, int C, stride_d_(1), stride_h_(stride_h), stride_w_(stride_w), shift_h_(shift_h), shift_w_(shift_w), a_ty_(a_ty), b_ty_(b_ty), - ty_(ty), bias_(bias) { + ty_(ty), bias_(bias), + layout_(CHWN){ // max number of channels TK_ = 16; MAX_C_ = 8192 + TK_; @@ -31,22 +32,48 @@ shift::shift(int B, int C, CH_ = AH_ / stride_h_; CW_ = AW_ / stride_w_; // A memory strides: [C, H, W, B] - lda_n_ = 1; - lda_w_ = B_; - lda_h_ = B_*AW_; - lda_c_ = B_*AW_*AH_; + switch(layout_){ + case CHWN: { + lda_n_ = 1; + lda_w_ = B_; + lda_h_ = B_*AW_; + lda_c_ = B_*AW_*AH_; + break; + } + case NCHW: { + lda_w_ = 1; + lda_h_ = AW_; + lda_c_ = AW_*AH_; + lda_n_ = AW_*AH_*C_; + break; + } + default: + throw std::runtime_error("unsupported input layout"); + } // B memory strides: [C, F] ldb_n_ = 1; ldb_h_ = 1; ldb_w_ = 1; ldb_c_ = F_; // C memory strides: [F, H, W, B] - ldc_n_ = 1; - ldc_w_ = B_; - ldc_h_ = B_*CW_; - ldc_f_ = B_*CW_*CH_; - // C shapes - shapes_c_ = {F, CH_, CW_, B}; + switch(layout_){ + case CHWN: { + ldc_n_ = 1; + ldc_w_ = B_; + ldc_h_ = B_*CW_; + ldc_f_ = B_*CW_*CH_; + break; + } + case NCHW: { + ldc_w_ = 1; + ldc_h_ = CW_; + ldc_f_ = CW_*CH_; + ldc_n_ = CW_*CH_*F_; + break; + } + default: + throw std::runtime_error("unsupported input layout"); + } // Equivalent matmul M_ = B_*CH_*CW_; N_ = F_; @@ -54,8 +81,15 @@ shift::shift(int B, int C, // transpose AT_ = false; BT_ = true; + // C shapes + if(layout_ == CHWN) + shapes_c_ = {F, CH_, CW_, B}; + if(layout_ == NCHW) + shapes_c_ = {B, F, CH_, CW_}; // Weight gradient if(ty_ == WGRAD){ + // b <-> c + // b <-> a std::swap(ldb_n_, ldc_n_); std::swap(ldb_w_, ldc_w_); std::swap(ldb_h_, ldc_h_); @@ -72,6 +106,7 @@ shift::shift(int B, int C, } // Input gradient if(ty_ == BPROP){ + // a <-> c std::swap(lda_n_, ldc_n_); std::swap(lda_w_, ldc_w_); std::swap(lda_h_, ldc_h_); @@ -79,7 +114,10 @@ shift::shift(int B, int C, std::swap(K_, N_); AT_ = false; BT_ = false; - shapes_c_ = {C, AH_, AW_, B}; + if(layout_ == CHWN) + shapes_c_ = {C, AH_, AW_, B}; + if(layout_ == NCHW) + shapes_c_ = {B, C, AH_, AW_}; } } @@ -251,11 +289,21 @@ if(ty_ == BPROP){ int32 offa0[TM, TK] = offxa[:, newaxis]; int32 offa1[TM, TK] = rka[newaxis, :] * lda_c;)"; } -if(ty_ == WGRAD){ +if(ty_ == WGRAD && layout_ == CHWN){ os << R"( int32 offa0[TK, TM] = rxa[newaxis, :] * lda_c; int32 offa1[TK, TM] = rka[:, newaxis];)"; } +if(ty_ == WGRAD && layout_ == NCHW){ + os << R"( + int32 offa0[TK, TM] = 
rxa[newaxis, :] * lda_c; + int32 rawh[TK] = rka / NB; + int32 rab[TK] = rka % NB; + int32 raw[TK] = (rawh % CW); + int32 rah[TK] = (rawh / CW); + int32 offxa[TK] = rab*lda_b + raw*lda_w + rah*lda_h; + int32 offa1[TK, TM] = offxa[:, newaxis];)"; +} /* B offsets */ if(ty_ == FPROP){ @@ -301,7 +349,7 @@ if(ty_ == WGRAD){ /* Increment A pointers */ if(ty_ == FPROP){ - os << R"( + os << R"( pd = pd + TK; d = *pd; offa_interior = d[newaxis, :]; @@ -311,14 +359,24 @@ if(ty_ == FPROP){ } if(ty_ == BPROP){ os << R"( - pa = pa + TK * lda_c;)"; + pa = pa + TK * lda_c;)"; } -if(ty_ == WGRAD){ - os << R"( +if(ty_ == WGRAD && layout_ == CHWN){ + os << R"( pa = pa + TK;)"; } +if(ty_ == WGRAD && layout_ == NCHW){ os << R"( - @checka a = *pa;)"; + rka = rka + TK; + rawh = rka / NB; + rab = rka % NB; + raw = (rawh % CW); + rah = (rawh / CW); + offxa = rab*lda_b + raw*lda_w + rah*lda_h; + pa = A + offa0 + offxa[:, newaxis];)"; +} + os << R"( + @checka a = *pa;)"; /* Increment B pointers */ if(ty_ == WGRAD){ From f36a646ffc78be1c35f8e972dce6231dc8dd9ba1 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 11 Jul 2019 21:00:33 -0700 Subject: [PATCH 231/494] [dnn/shift-conv] added and tested NCHW layout --- examples/python/pytorch/run.py | 4 -- examples/python/pytorch/shift.cpp | 56 +++++++++++++++++----------- examples/python/tensorflow/run.py | 8 ++-- examples/python/tensorflow/shift.cpp | 43 +++++++++++++-------- include/triton/dnn/shift.h | 2 +- lib/dnn/shift.cpp | 5 ++- 6 files changed, 70 insertions(+), 48 deletions(-) diff --git a/examples/python/pytorch/run.py b/examples/python/pytorch/run.py index 488f547f8..59f70d6c5 100644 --- a/examples/python/pytorch/run.py +++ b/examples/python/pytorch/run.py @@ -78,14 +78,10 @@ class NetReference(nn.Module): self.fc2 = nn.Linear(500, 10) def forward(self, x): - #x = x.permute(1, 2, 3, 0).contiguous() x = self.conv1(x) - #x = x.permute(3, 0, 1, 2).contiguous() x = self.bn1(x) x = F.relu(x) - #x = x.permute(1, 2, 3, 0).contiguous() x = self.conv2(x) - #x = x.permute(3, 0, 1, 2).contiguous() x = self.bn2(x) x = F.relu(x) x = x.view(-1, 32*7*7) diff --git a/examples/python/pytorch/shift.cpp b/examples/python/pytorch/shift.cpp index 1da8f3fbd..d650ca9e6 100644 --- a/examples/python/pytorch/shift.cpp +++ b/examples/python/pytorch/shift.cpp @@ -9,12 +9,34 @@ #define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) +void extract_shapes(const torch::Tensor &x, + int64_t &C, int64_t &H, int64_t &W, int64_t &B, + triton::dnn::shift::layout_t layout) { + if(layout == triton::dnn::shift::CHWN){ + C = x.size(0); + H = x.size(1); + W = x.size(2); + B = x.size(3); + } + else if(layout == triton::dnn::shift::NCHW){ + B = x.size(0); + C = x.size(1); + H = x.size(2); + W = x.size(3); + } + else{ + throw std::runtime_error("unsupported layout"); + } +} + +static const triton::dnn::shift::layout_t layout = triton::dnn::shift::NCHW; + torch::Tensor shift_common( int32_t B, int32_t C, int32_t D, int32_t H, int32_t W, int32_t T, int32_t R, int32_t S, int32_t F, int32_t stride_h, int32_t stride_w, int32_t* shift_h, int32_t* shift_w, - triton::dnn::shift::type ty, + triton::dnn::shift::type ty, triton::dnn::shift::layout_t layout, torch::Tensor torcha, torch::Tensor torchb, torch::Tensor torchbias, bool autotune = false ) { @@ -28,7 +50,7 @@ torch::Tensor shift_common( triton::dnn::shift shift(B, C, D, H, W, T, R, S, F, stride_h, stride_w, shift_h, shift_w, "fp32", "fp32", - ty, has_bias); + ty, 
has_bias, layout); // Bind memory triton::driver::cu_buffer a(ctx, (CUdeviceptr)torcha.storage().data(), false); triton::driver::cu_buffer b(ctx, (CUdeviceptr)torchb.storage().data(), false); @@ -56,10 +78,8 @@ torch::Tensor shift_y( CHECK_INPUT(x); CHECK_INPUT(w); // shapes for a - int64_t Ca = x.size(0); - int64_t H = x.size(1); - int64_t W = x.size(2); - int64_t B = x.size(3); + int64_t Ca, H, W, B; + extract_shapes(x, Ca, H, W, B, layout); // shapes for b int64_t Cb = w.size(0); int64_t F = w.size(1); @@ -68,7 +88,7 @@ torch::Tensor shift_y( // run return shift_common(B, C, 1, H, W, 1, R, S, F, stride_h, stride_w, (int32_t*)shift_h.storage().data(), (int32_t*)shift_w.storage().data(), - triton::dnn::shift::FPROP, x, w, bias); + triton::dnn::shift::FPROP, layout, x, w, bias); } torch::Tensor shift_dx( @@ -81,10 +101,8 @@ torch::Tensor shift_dx( CHECK_INPUT(dy); CHECK_INPUT(w); // shapes for a - int64_t Ca = dy.size(0); - int64_t H = dy.size(1); - int64_t W = dy.size(2); - int64_t B = dy.size(3); + int64_t Ca, H, W, B; + extract_shapes(dy, Ca, H, W, B, layout); H *= stride_h; W *= stride_w; // shapes for b @@ -98,7 +116,7 @@ torch::Tensor shift_dx( // run return shift_common(B, C, 1, H, W, 1, R, S, F, stride_h, stride_w, (int32_t*)shift_h.storage().data(), (int32_t*)shift_w.storage().data(), - triton::dnn::shift::BPROP, dy, w, bias); + triton::dnn::shift::BPROP, layout, dy, w, bias); } torch::Tensor shift_dw( @@ -111,15 +129,11 @@ torch::Tensor shift_dw( CHECK_INPUT(dy); CHECK_INPUT(x); // shapes for a - int64_t F = dy.size(0); - int64_t Ha = dy.size(1); - int64_t Wa = dy.size(2); - int64_t Ba = dy.size(3); + int64_t F, Ha, Wa, Ba; + extract_shapes(dy, F, Ha, Wa, Ba, layout); // shapes for b - int64_t C = x.size(0); - int64_t Hb = x.size(1); - int64_t Wb = x.size(2); - int64_t Bb = x.size(3); + int64_t C, Hb, Wb, Bb; + extract_shapes(x, C, Hb, Wb, Bb, layout); // check AT_CHECK(Ha*stride_h == Hb, "operands must have the same image height"); AT_CHECK(Wa*stride_w == Wb, "operands must have the same image width"); @@ -130,7 +144,7 @@ torch::Tensor shift_dw( // run return shift_common(B, C, 1, H, W, 1, R, S, F, stride_h, stride_w, (int32_t*)shift_h.storage().data(), (int32_t*)shift_w.storage().data(), - triton::dnn::shift::WGRAD, dy, x, bias); + triton::dnn::shift::WGRAD, layout, dy, x, bias); } static auto registry = diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 375c45227..9de71d8a4 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -62,7 +62,7 @@ def run_shift(): R, S, F = 3, 3, 32 stride_h, stride_w = 2, 2 np.random.seed(2) - a = tf.placeholder(tf.float32, shape=[C, H, W, B]) + a = tf.placeholder(tf.float32, shape=[B, C, H, W]) b = tf.placeholder(tf.float32, shape=[C, F]) hshift_h = np.random.randint(- (R//2), R//2 + 1, size=C, dtype=np.int32) hshift_w = np.random.randint(- (S//2), R//2 + 1, size=C, dtype=np.int32) @@ -70,13 +70,13 @@ def run_shift(): #hshift_w = np.zeros(C, dtype=np.int32) c = module.shift_conv(a, b, stride_h=stride_h, stride_w=stride_w, shift_h=tf.make_tensor_proto(hshift_h), shift_w=tf.make_tensor_proto(hshift_w)) # feed values - ha = np.random.rand(C, H, W, B) + ha = np.random.rand(B, C, H, W) hb = np.random.rand(C, F) - #ha = np.ones((C, H, W, B), dtype=np.float32) + #ha = np.ones((B, C, H, W), dtype=np.float32) #hb = np.ones((C, F), dtype=np.float32) sess = tf.InteractiveSession() # test - grads = tf.test.compute_gradient([a, b], [(C, H, W, B), (C, F)], c, (F, H//stride_h, W//stride_w, 
B), + grads = tf.test.compute_gradient([a, b], [(B, C, H, W), (C, F)], c, (B, F, H//stride_h, W//stride_w), extra_feed_dict = {a: ha, b: hb}) dw_t, dw_n = grads[1] dx_t, dx_n = grads[0] diff --git a/examples/python/tensorflow/shift.cpp b/examples/python/tensorflow/shift.cpp index bde4d1b5e..d9014795e 100644 --- a/examples/python/tensorflow/shift.cpp +++ b/examples/python/tensorflow/shift.cpp @@ -22,7 +22,7 @@ using GPUDevice = Eigen::GpuDevice; template class ShiftConvOp : public OpKernel { public: - explicit ShiftConvOp(OpKernelConstruction* context) : OpKernel(context) { + explicit ShiftConvOp(OpKernelConstruction* context) : OpKernel(context), layout_(triton::dnn::shift::NCHW) { context->GetAttr("shift_h", &h_shift_h_); context->GetAttr("shift_w", &h_shift_w_); context->GetAttr("stride_h", &stride_h_); @@ -31,20 +31,32 @@ public: S_ = 3; } + void ExtractShapes(const Tensor &x, int64_t &C, int64_t &H, int64_t &W, int64_t &B) { + if(layout_ == triton::dnn::shift::CHWN){ + C = x.dim_size(0); + H = x.dim_size(1); + W = x.dim_size(2); + B = x.dim_size(3); + } + else if(layout_ == triton::dnn::shift::NCHW){ + B = x.dim_size(0); + C = x.dim_size(1); + H = x.dim_size(2); + W = x.dim_size(3); + } + else{ + throw std::runtime_error("unsupported layout"); + } + } + void FillShapes(OpKernelContext* context, int64_t &C, int64_t &H, int64_t &W, int64_t &B, int64_t &F, const Tensor& tf_a, const Tensor& tf_b) { if(OP == triton::dnn::shift::WGRAD) { - // shapes for a - F = tf_a.dim_size(0); - int64_t Ha = tf_a.dim_size(1); - int64_t Wa = tf_a.dim_size(2); - int64_t Ba = tf_a.dim_size(3); - // shapes for b - C = tf_b.dim_size(0); - int64_t Hb = tf_b.dim_size(1); - int64_t Wb = tf_b.dim_size(2); - int64_t Bb = tf_b.dim_size(3); + int64_t Ha, Wa, Ba; + int64_t Hb, Wb, Bb; + ExtractShapes(tf_a, F, Ha, Wa, Ba); + ExtractShapes(tf_b, C, Hb, Wb, Bb); OP_REQUIRES(context, Ha*stride_h_ == Hb, tensorflow::errors::InvalidArgument("operands must have the same image height")); OP_REQUIRES(context, Wa*stride_w_ == Wb, tensorflow::errors::InvalidArgument("operands must have the same image width")); OP_REQUIRES(context, Ba == Bb, tensorflow::errors::InvalidArgument("operands must have the same batch size")); @@ -54,10 +66,8 @@ public: } else { // shapes for a - int64_t Ca = tf_a.dim_size(0); - H = tf_a.dim_size(1); - W = tf_a.dim_size(2); - B = tf_a.dim_size(3); + int64_t Ca; + ExtractShapes(tf_a, Ca, H, W, B); if(OP == triton::dnn::shift::BPROP){ H *= stride_h_; W *= stride_w_; @@ -96,7 +106,7 @@ public: triton::dnn::shift shift(B, C, D, H, W, T, R_, S_, F, stride_h_, stride_w_, shift_h_data, shift_w_data, - "fp32", "fp32", OP, has_bias); + "fp32", "fp32", OP, has_bias, layout_); // shapes for c std::vector c_shapes; @@ -122,6 +132,7 @@ private: int stride_w_; int R_; int S_; + triton::dnn::shift::layout_t layout_; }; REGISTER_KERNEL_BUILDER(Name("ShiftConv").Device(DEVICE_GPU), ShiftConvOp); diff --git a/include/triton/dnn/shift.h b/include/triton/dnn/shift.h index b85ffe299..1731508d0 100644 --- a/include/triton/dnn/shift.h +++ b/include/triton/dnn/shift.h @@ -65,7 +65,7 @@ public: int stride_h, int stride_w, const int32_t* shift_h, const int32_t* shift_w, std::string a_ty = "fp32", std::string b_ty = "fp32", - type ty = FPROP, bool bias = false); + type ty = FPROP, bool bias = false, layout_t layout = CHWN); // look-up table void build_delta_a(); diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index 0bdcb49e2..da3b5877d 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -13,7 +13,8 @@ shift::shift(int B, 
int C, int stride_h, int stride_w, const int32_t *shift_h, const int32_t *shift_w, std::string a_ty, std::string b_ty, - type ty, bool bias) + type ty, bool bias, + layout_t layout) : base("shift"), B_(B), C_(C), AD_(D), AH_(H), AW_(W), @@ -23,7 +24,7 @@ shift::shift(int B, int C, shift_h_(shift_h), shift_w_(shift_w), a_ty_(a_ty), b_ty_(b_ty), ty_(ty), bias_(bias), - layout_(CHWN){ + layout_(layout){ // max number of channels TK_ = 16; MAX_C_ = 8192 + TK_; From c1c7062914046a15c700ffa4f61a650d58763a09 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 12 Jul 2019 17:42:29 -0700 Subject: [PATCH 232/494] [dnn/shift] shift-edge handling; rename ty_ to op_ --- examples/python/pytorch/run.py | 50 ++++------- examples/python/tensorflow/run.py | 2 +- include/triton/dnn/shift.h | 6 +- lib/dnn/shift.cpp | 132 ++++++++++++++++++++---------- 4 files changed, 108 insertions(+), 82 deletions(-) diff --git a/examples/python/pytorch/run.py index 59f70d6c5..e7c10112c 100644 --- a/examples/python/pytorch/run.py +++ b/examples/python/pytorch/run.py @@ -65,55 +65,33 @@ def ShiftConv2d(in_planes, out_planes, kernel_size=3, stride=1, groups=1, dilati ) -class NetReference(nn.Module): +class Net(nn.Module): def __init__(self): - super(NetReference, self).__init__() - #self.conv1 = ShiftConv2d(1, 32, 3, 2) - self.conv1 = triton.ShiftConv2d(1, 32, 3, 2) - self.bn1 = nn.BatchNorm2d(32) - self.conv2 = triton.ShiftConv2d(32, 32, 3, 2) - #self.conv2 = ShiftConv2d(32, 32, 3, 2) - self.bn2 = nn.BatchNorm2d(32) - self.fc1 = nn.Linear(32*7*7, 500) + super(Net, self).__init__() + self.conv1 = ShiftConv2d(1, 32, 3, 1) + self.conv2 = ShiftConv2d(32, 128, 3, 1) + self.conv3 = ShiftConv2d(128, 128, 3, 2) + self.bn1 = nn.BatchNorm2d(128) + self.conv4 = ShiftConv2d(128, 256, 3, 2) + self.bn2 = nn.BatchNorm2d(256) + self.fc1 = nn.Linear(256*7*7, 500) self.fc2 = nn.Linear(500, 10) def forward(self, x): x = self.conv1(x) + x = self.conv2(x) + x = self.conv3(x) x = self.bn1(x) x = F.relu(x) - x = self.conv2(x) + x = self.conv4(x) x = self.bn2(x) x = F.relu(x) - x = x.view(-1, 32*7*7) + x = x.view(-1, 256*7*7) x = F.relu(self.fc1(x)) x = self.fc2(x) return F.log_softmax(x, dim=1) -class NetTriton(nn.Module): - def __init__(self): - super(NetTriton, self).__init__() - self.conv1 = triton.ShiftConv2d(1, 32, 3, 2) - self.bn1 = triton.BatchNorm2d(32) - self.conv2 = triton.ShiftConv2d(32, 64, 3, 2) - self.bn2 = triton.BatchNorm2d(64) - self.fc1 = nn.Linear(64*7*7, 500) - self.fc2 = nn.Linear(500, 10) - - def forward(self, x): - x = x.permute(1, 2, 3, 0).contiguous() - x = self.conv1(x) - x = self.bn1(x) - x = F.relu(x) - x = self.conv2(x) - x = self.bn2(x) - x = F.relu(x) - x = x.permute(3, 0, 1, 2).contiguous() - x = x.view(-1, 64*7*7) - x = F.relu(self.fc1(x)) - x = self.fc2(x) - return F.log_softmax(x, dim=1) - -Net = NetReference() +Net = Net() def train(args, model, device, train_loader, optimizer, epoch): model.train() diff --git a/examples/python/tensorflow/run.py index 9de71d8a4..57850de9a 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -58,7 +58,7 @@ def blocksparse_matmul_grad(op, dy): return (dx, dw) def run_shift(): - B, C, H, W = 16, 16, 4, 4 + B, C, H, W = 16, 16, 2, 2 R, S, F = 3, 3, 32 stride_h, stride_w = 2, 2 np.random.seed(2) diff --git a/include/triton/dnn/shift.h index 1731508d0..57cb5ea0a 100644 --- a/include/triton/dnn/shift.h +++ b/include/triton/dnn/shift.h @@ -62,7 +62,7 @@ public: shift(int B, int NC, int D, 
int H, int W, int T, int R, int S, int NF, - int stride_h, int stride_w, + int stride_h, int stride_w, const int32_t* shift_h, const int32_t* shift_w, std::string a_ty = "fp32", std::string b_ty = "fp32", type ty = FPROP, bool bias = false, layout_t layout = CHWN); @@ -145,6 +145,8 @@ private: // shift values const int32_t* shift_h_; const int32_t* shift_w_; + bool shift_edge_h_; + bool shift_edge_w_; // look-up tables std::vector h_delta_a; std::vector h_delta_b; @@ -154,7 +156,7 @@ private: std::string a_ty_; std::string b_ty_; // convolution type - type ty_; + type op_; bool bias_; // transpose bool AT_; diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index da3b5877d..aeaba72a4 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -23,8 +23,9 @@ shift::shift(int B, int C, stride_d_(1), stride_h_(stride_h), stride_w_(stride_w), shift_h_(shift_h), shift_w_(shift_w), a_ty_(a_ty), b_ty_(b_ty), - ty_(ty), bias_(bias), + op_(ty), bias_(bias), layout_(layout){ +// std::cout << B_ << " " << C_ << " " << F_ << " " << stride_h_ << " " << stride_w_ << " " << a_ty_ << " " << b_ty_ << " " << ty_ << " " << layout_ << std::endl; // max number of channels TK_ = 16; MAX_C_ = 8192 + TK_; @@ -51,6 +52,9 @@ shift::shift(int B, int C, default: throw std::runtime_error("unsupported input layout"); } + // Shift edge + shift_edge_h_ = (AH_ == stride_h_); + shift_edge_w_ = (AW_ == stride_w_); // B memory strides: [C, F] ldb_n_ = 1; ldb_h_ = 1; @@ -88,7 +92,7 @@ shift::shift(int B, int C, if(layout_ == NCHW) shapes_c_ = {B, F, CH_, CW_}; // Weight gradient - if(ty_ == WGRAD){ + if(op_ == WGRAD){ // b <-> c // b <-> a std::swap(ldb_n_, ldc_n_); @@ -106,7 +110,7 @@ shift::shift(int B, int C, shapes_c_ = {C, F}; } // Input gradient - if(ty_ == BPROP){ + if(op_ == BPROP){ // a <-> c std::swap(lda_n_, ldc_n_); std::swap(lda_w_, ldc_w_); @@ -128,10 +132,12 @@ base* shift::clone() const { void shift::build_delta_a() { h_delta_a.resize(MAX_C_); - if(ty_ == FPROP){ + auto shift_h = [&](int c) { return shift_edge_h_ ? std::max(0, shift_h_[c]) : shift_h_[c]; }; + auto shift_w = [&](int c) { return shift_edge_w_ ? 
std::max(0, shift_w_[c]) : shift_w_[c]; }; + if(op_ == FPROP){ // compute offset auto offset = [&](unsigned c) { - return c*lda_c_ + shift_h_[c]*lda_h_ + shift_w_[c]*lda_w_; + return c*lda_c_ + shift_h(c)*lda_h_ + shift_w(c)*lda_w_; }; // populate look-up table for(unsigned c = 0; c < TK_; c++) @@ -139,14 +145,14 @@ void shift::build_delta_a() { for(unsigned c = 0; c < C_; c++) h_delta_a[TK_ + c] = offset(c + TK_) - offset(c); } - if(ty_ == BPROP){ + if(op_ == BPROP){ for(unsigned c = 0; c < C_; c++){ - h_delta_a[c] = shift_h_[c]*ldc_h_ + shift_w_[c]*ldc_w_; + h_delta_a[c] = shift_h(c)*ldc_h_ + shift_w(c)*ldc_w_; } } - if(ty_ == WGRAD){ + if(op_ == WGRAD){ for(unsigned c = 0; c < C_; c++) - h_delta_a[c] = shift_h_[c]*ldb_h_ + shift_w_[c]*ldb_w_; + h_delta_a[c] = shift_h(c)*ldb_h_ + shift_w(c)*ldb_w_; } } @@ -167,10 +173,22 @@ bool shift::operator <(const base& other) const{ auto *y = dynamic_cast(&other); if(!y) return true; - return std::tie(B_, C_, AD_, AH_, AW_, BD_, BH_, BW_, F_, - shift_h_, shift_w_, ty_, bias_) - < std::tie(y->B_, y->C_, y->AD_, y->AH_, y->AW_, y->BD_, y->BH_, y->BW_, y->F_, - y->shift_h_, y->shift_w_, y->ty_, y->bias_); + return std::tie(B_, C_, F_, + AD_, AH_, AW_, + BD_, BH_, BW_, + CD_, CH_, CW_, + shift_h_, shift_w_, + stride_h_, stride_w_, + layout_, op_, + bias_) + < std::tie(y->B_, y->C_, y->F_, + y->AD_, y->AH_, y->AW_, + y->BD_, y->BH_, y->BW_, + y->CD_, y->CH_, y->CW_, + y->shift_h_, y->shift_w_, + y->stride_h_, y->stride_w_, + y->layout_, y->op_, + y->bias_); } void shift::init_impl(driver::stream *stream, driver::cu_module *module) { @@ -212,7 +230,7 @@ void shift::enqueue_impl(driver::stream *stream, driver::kernel *kernel, kernel->setArg(26, CW_); unsigned TM = ranges[0], TN = ranges[1]; std::array grid = {(M_ + TM - 1)/TM, (N_ + TN - 1)/TN, 1}; - if(ty_ == BPROP) + if(op_ == BPROP) ((driver::cu_buffer*)c)->set_zero(stream, AH_*AW_*B_*C_*4); stream->enqueue(kernel, grid, {nthreads, 1, 1}); } @@ -263,7 +281,7 @@ void shift(restrict read_only align(16) )" << a_ty_ << R"( *A, int32 pad_w = BW / 2;)"; /* A offsets */ -if(ty_ == FPROP){ +if(op_ == FPROP){ os << R"( int32 rawh[TM] = rxa / NB; int32 rab[TM] = rxa % NB; @@ -274,13 +292,20 @@ if(ty_ == FPROP){ __constant__ int32* pd[TK] = delta_a + rka; multiple_of(4) int32 d[TK] = *pd; int32 offa_interior[TM, TK] = d[newaxis, :]; - int32 offa_exterior[TM, TK] = rka[newaxis, :] * lda_c; - int1 interiorh[TM] = (rah >= pad_h) && (rah < (AH - pad_h)); - int1 interiorw[TM] = (raw >= pad_w) && (raw < (AW - pad_w)); + int32 offa_exterior[TM, TK] = rka[newaxis, :] * lda_c;\n)"; + if(shift_edge_h_) + os << " int1 interiorh[TM] = 1;"; + else + os << " int1 interiorh[TM] = (rah >= pad_h) && (rah < (AH - pad_h));"; + if(shift_edge_w_) + os << " int1 interiorw[TM] = 1;"; + else + os << " int1 interiorw[TM] = (raw >= pad_w) && (raw < (AW - pad_w));"; + os << R"( int1 interior[TM, TK] = interiorh[:, newaxis] && interiorw[:, newaxis]; int32 offa1[TM, TK] = interior ? 
offa_interior : offa_exterior;)"; } -if(ty_ == BPROP){ +if(op_ == BPROP){ os << R"( int32 rawh[TM] = rxa / NB; int32 rab[TM] = rxa % NB; @@ -290,12 +315,12 @@ if(ty_ == BPROP){ int32 offa0[TM, TK] = offxa[:, newaxis]; int32 offa1[TM, TK] = rka[newaxis, :] * lda_c;)"; } -if(ty_ == WGRAD && layout_ == CHWN){ +if(op_ == WGRAD && layout_ == CHWN){ os << R"( int32 offa0[TK, TM] = rxa[newaxis, :] * lda_c; int32 offa1[TK, TM] = rka[:, newaxis];)"; } -if(ty_ == WGRAD && layout_ == NCHW){ +if(op_ == WGRAD && layout_ == NCHW){ os << R"( int32 offa0[TK, TM] = rxa[newaxis, :] * lda_c; int32 rawh[TK] = rka / NB; @@ -307,17 +332,17 @@ if(ty_ == WGRAD && layout_ == NCHW){ } /* B offsets */ -if(ty_ == FPROP){ +if(op_ == FPROP){ os << R"( int32 offb0[TN, TK] = ryb[:, newaxis]; int32 offb1[TN, TK] = rkb[newaxis, :] * ldb_c;)"; } -if(ty_ == BPROP){ +if(op_ == BPROP){ os << R"( int32 offb0[TK, TN] = ryb[newaxis, :] * ldb_c; int32 offb1[TK, TN] = rkb[:, newaxis];)"; } -if(ty_ == WGRAD){ +if(op_ == WGRAD){ os << R"( __constant__ int32* pd[TN] = delta_a + ryb; int32 d[TN] = *pd; @@ -326,9 +351,16 @@ if(ty_ == WGRAD){ int32 rbb[TK] = rkb % NB; int32 rbw[TK] = (rbwh % CW)*stride_w; int32 rbh[TK] = (rbwh / CW)*stride_h; - int32 offkb[TK] = rbb*ldb_b + rbw*ldb_w + rbh*ldb_h; - int1 interiorh[TK] = (rbh >= pad_h) && (rbh < (AH - pad_h)); - int1 interiorw[TK] = (rbw >= pad_w) && (rbw < (AW - pad_w)); + int32 offkb[TK] = rbb*ldb_b + rbw*ldb_w + rbh*ldb_h;\n)"; + if(shift_edge_h_) + os << " int1 interiorh[TK] = 1;\n"; + else + os << " int1 interiorh[TK] = (rbh >= pad_h) && (rbh < (AH - pad_h));\n"; + if(shift_edge_w_) + os << " int1 interiorw[TK] = 1;\n"; + else + os << " int1 interiorw[TK] = (rbw >= pad_w) && (rbw < (AW - pad_w));\n"; + os << R"( int1 interior[TK, TN] = interiorh[:, newaxis] && interiorw[:, newaxis]; int32 incb[TK, TN] = interior ? shift : 0; int32 offb0[TK, TN] = ryb[newaxis, :] * ldb_c; @@ -349,7 +381,7 @@ if(ty_ == WGRAD){ int1 checkb[)" << BS << R"(] = k > TK;)"; /* Increment A pointers */ -if(ty_ == FPROP){ +if(op_ == FPROP){ os << R"( pd = pd + TK; d = *pd; @@ -358,15 +390,15 @@ if(ty_ == FPROP){ int32 offa[TM, TK] = interior ? offa_interior : offa_exterior; pa = pa + offa;)"; } -if(ty_ == BPROP){ +if(op_ == BPROP){ os << R"( pa = pa + TK * lda_c;)"; } -if(ty_ == WGRAD && layout_ == CHWN){ +if(op_ == WGRAD && layout_ == CHWN){ os << R"( pa = pa + TK;)"; } -if(ty_ == WGRAD && layout_ == NCHW){ +if(op_ == WGRAD && layout_ == NCHW){ os << R"( rka = rka + TK; rawh = rka / NB; @@ -380,25 +412,32 @@ if(ty_ == WGRAD && layout_ == NCHW){ @checka a = *pa;)"; /* Increment B pointers */ -if(ty_ == WGRAD){ +if(op_ == WGRAD){ os << R"( rkb = rkb + TK; rbwh = rkb / NB; rbb = rkb % NB; rbw = (rbwh % CW)*stride_w; rbh = (rbwh / CW)*stride_h; - offkb = rbb*ldb_b + rbw*ldb_w + rbh*ldb_h; - interiorh = (rbh >= pad_h) && (rbh < (AH - pad_h)); - interiorw = (rbw >= pad_w) && (rbw < (AW - pad_w)); + offkb = rbb*ldb_b + rbw*ldb_w + rbh*ldb_h;\n)"; + if(shift_edge_h_) + os << " interiorh = 1;\n"; + else + os << " interiorh = (rbh >= pad_h) && (rbh < (AH - pad_h));\n"; + if(shift_edge_w_) + os << " interiorw = 1;\n"; + else + os << " interiorw = (rbw >= pad_w) && (rbw < (AW - pad_w));\n"; + os << R"( interior = interiorh[:, newaxis] && interiorw[:, newaxis]; incb = interior ? 
shift : 0; pb = B + offb0 + offkb[:, newaxis] + incb;)"; } -if(ty_ == FPROP){ +if(op_ == FPROP){ os << R"( pb = pb + TK * ldb_c;)"; } -if(ty_ == BPROP){ +if(op_ == BPROP){ os << R"( pb = pb + TK;)"; } @@ -409,7 +448,7 @@ if(ty_ == BPROP){ int32 ryc[TN] = get_global_range[TN](1);)"; /* C offsets */ -if(ty_ == BPROP){ +if(op_ == BPROP){ os << R"( int32 rcwh[TM] = rxc / NB; int32 rcb[TM] = rxc % NB; @@ -418,7 +457,7 @@ if(ty_ == BPROP){ int32 offxc[TM] = rcb*ldc_b + rcw*ldc_w + rch*ldc_h; )"; } -if(ty_ == FPROP){ +if(op_ == FPROP){ os << R"( int32 rcwh[TM] = rxc / NB; int32 rcb[TM] = rxc % NB; @@ -427,7 +466,7 @@ if(ty_ == FPROP){ int32 offxc[TM] = rcb*ldc_b + rcw*ldc_w + rch*ldc_h; )"; } -if(ty_ == WGRAD){ +if(op_ == WGRAD){ os << R"( int32 offxc[TM] = rxc; )"; @@ -437,10 +476,17 @@ if(ty_ == WGRAD){ int1 checkc0[TM] = rxc < M; int1 checkc1[TN] = ryc < N; int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];)"; -if(ty_ == BPROP){ +if(op_ == BPROP){ + os << "\n"; + if(shift_edge_h_) + os << " int1 interiorh[TM] = 1;\n"; + else + os << " int1 interiorh[TM] = (rch >= pad_h) && (rch < (AH - pad_h));\n"; + if(shift_edge_w_) + os << " int1 interiorw[TM] = 1;\n"; + else + os << " int1 interiorw[TM] = (rcw >= pad_w) && (rcw < (AW - pad_w));\n"; os << R"( - int1 interiorh[TM] = (rch >= pad_h) && (rch < (AH - pad_h)); - int1 interiorw[TM] = (rcw >= pad_w) && (rcw < (AW - pad_w)); int1 interior[TM, TN] = interiorh[:, newaxis] && interiorw[:, newaxis]; __constant__ int32* pd[TN] = delta_a + ryc; fp32* shift_pc[TM, TN] = pc + (*pd)[newaxis, :]; From 7512c7ebed41ba69c867535948dd08201f644bad Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 12 Jul 2019 20:03:05 -0700 Subject: [PATCH 233/494] some cleaning --- examples/cpp/dot.cpp | 4 +- examples/cpp/shift.cpp | 22 +++-- include/triton/tools/bench.hpp | 20 ++-- lib/dnn/shift.cpp | 171 +++++++++++++++++---------------- 4 files changed, 114 insertions(+), 103 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index f788ba048..7612f7c16 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -8,12 +8,12 @@ int main() { - bool AT = true; + bool AT = false; bool BT = true; // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); // matrix multiplication parameters - int32_t M = 128, N = 128, K = 128; + int32_t M = 4096, N = 4096, K = 4096; std::vector hc(M*N); std::vector rc(M*N); std::vector ha(M*K); diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index a4edd38e3..d4417cd7d 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -9,17 +9,18 @@ #include "triton/external/half.hpp" int main() { - typedef half_float::half NumericT; - std::string numeric_t_str = "fp16"; + typedef float NumericT; + std::string numeric_t_str = "fp32"; // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); + auto op = triton::dnn::shift::FPROP; // initialization int32_t R = 3, S = 3; - int32_t BS = 4, F = 1024; + int32_t B = 16, F = 4096; int32_t H = 16, W = 16; - int32_t C = 1024; + int32_t C = 4096; // random shifts std::vector shift_h(C); @@ -29,12 +30,15 @@ int main() { shift_w[c] = rand() % S - S/2; } // configuration - triton::dnn::shift shift(BS, C, 1, H, W, 1, R, S, F, 1, 1, shift_h.data(), shift_w.data(), numeric_t_str, numeric_t_str, triton::dnn::shift::BPROP); + triton::dnn::shift shift(B, C, 1, H, W, 1, R, S, F, 1, 1, + shift_h.data(), shift_w.data(), + numeric_t_str, numeric_t_str, + op, false); // host buffers 
- std::vector hc(shift.c_size()); - std::vector rc(shift.c_size()); - std::vector ha(shift.a_size()); - std::vector hb(shift.b_size()); + std::vector ha(B*C*H*W); + std::vector hb(C*F); + std::vector hc(B*F*H*W); + std::vector rc(hc.size()); // device buffers triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*4); triton::driver::buffer* da = triton::driver::buffer::create(context, ha.size()*sizeof(NumericT)); diff --git a/include/triton/tools/bench.hpp b/include/triton/tools/bench.hpp index 64c88cd64..f37c04371 100644 --- a/include/triton/tools/bench.hpp +++ b/include/triton/tools/bench.hpp @@ -32,17 +32,15 @@ double bench(OP const & op, SYNC const & sync, const triton::driver::device * de double total_time = 0; op(); sync(); - while(total_time*1e-9 < 1e-3){ - float norm = 1; - // normalize clock if possible to get roughly constant result - if(auto cu_device = dynamic_cast(device)) - norm = (float)cu_device->current_sm_clock()/cu_device->max_sm_clock(); - tmr.start(); - op(); - sync(); - times.push_back(norm*tmr.get().count()); - total_time+=times.back(); - } + float norm = 1; + // normalize clock if possible to get roughly constant result + if(auto cu_device = dynamic_cast(device)) + norm = (float)cu_device->current_sm_clock()/cu_device->max_sm_clock(); + tmr.start(); + op(); + sync(); + times.push_back(norm*tmr.get().count()); + total_time+=times.back(); return *std::min_element(times.begin(), times.end()); } diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index aeaba72a4..f2502db70 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -27,7 +27,7 @@ shift::shift(int B, int C, layout_(layout){ // std::cout << B_ << " " << C_ << " " << F_ << " " << stride_h_ << " " << stride_w_ << " " << a_ty_ << " " << b_ty_ << " " << ty_ << " " << layout_ << std::endl; // max number of channels - TK_ = 16; + TK_ = (ty == FPROP && a_ty_ == "fp32") ? 8 : 16; MAX_C_ = 8192 + TK_; // activation sizes CD_ = AD_ / stride_d_; @@ -53,8 +53,8 @@ shift::shift(int B, int C, throw std::runtime_error("unsupported input layout"); } // Shift edge - shift_edge_h_ = (AH_ == stride_h_); - shift_edge_w_ = (AW_ == stride_w_); + shift_edge_h_ = (AH_ == stride_h_ && stride_h_ > 1); + shift_edge_w_ = (AW_ == stride_w_ && stride_w_ > 1); // B memory strides: [C, F] ldb_n_ = 1; ldb_h_ = 1; @@ -132,8 +132,8 @@ base* shift::clone() const { void shift::build_delta_a() { h_delta_a.resize(MAX_C_); - auto shift_h = [&](int c) { return shift_edge_h_ ? std::max(0, shift_h_[c]) : shift_h_[c]; }; - auto shift_w = [&](int c) { return shift_edge_w_ ? std::max(0, shift_w_[c]) : shift_w_[c]; }; + auto shift_h = [&](int c) { return shift_edge_h_ ? (c / AH_) % AH_ : shift_h_[c]; }; + auto shift_w = [&](int c) { return shift_edge_w_ ? 
c % AW_ : shift_w_[c]; }; if(op_ == FPROP){ // compute offset auto offset = [&](unsigned c) { @@ -253,23 +253,24 @@ void shift::triton_c_src(std::ostream &os) const { std::string AS = AS0 + ", " + AS1; std::string BS = BS0 + ", " + BS1; - os << + std::string result = R"( const tunable int32 TM = {16, 32, 64, 128}; const tunable int32 TN = {16, 32, 64, 128}; -const tunable int32 TK = {)" << TK_ << R"(}; +const tunable int32 TK = {)" + std::to_string(TK_) + R"(}; -__constant__ int32* delta_a = alloc_const int32[)" << MAX_C_ << R"(]; +__constant__ int32* delta_a = alloc_const int32[)" + std::to_string(MAX_C_) + R"(]; -void shift(restrict read_only align(16) )" << a_ty_ << R"( *A, - restrict read_only align(16) )" << b_ty_ << R"( *B, +void shift(restrict read_only align(16) )" + a_ty_ + R"( *A, + restrict read_only align(16) )" + b_ty_ + R"( *B, fp32 *C, int32 M, int32 N, int32 K, int32 stride_h, int32 stride_w, int32 lda_b, int32 lda_w, int32 lda_h, int32 lda_c, int32 ldb_b, int32 ldb_w, int32 ldb_h, int32 ldb_c, int32 ldc_b, int32 ldc_w, int32 ldc_h, int32 ldc_c, - int32 NB, int32 AH, int32 AW, + int32 NB, + int32 AH, int32 AW, int32 BH, int32 BW, int32 CH, int32 CW) { int32 rxa[TM] = get_global_range[TM](0); @@ -282,31 +283,34 @@ void shift(restrict read_only align(16) )" << a_ty_ << R"( *A, /* A offsets */ if(op_ == FPROP){ - os << R"( - int32 rawh[TM] = rxa / NB; - int32 rab[TM] = rxa % NB; - int32 raw[TM] = (rawh % CW) * stride_w; - int32 rah[TM] = (rawh / CW) * stride_h; + result += R"( + int32 rawh[TM] = rxa / NB; + int32 rab[TM] = rxa % NB; + int32 raw[TM] = rawh % CW; + int32 rah[TM] = rawh / CW; + raw = raw * stride_w; + rah = rah * stride_h; int32 offxa[TM] = rab*lda_b + raw*lda_w + rah*lda_h; int32 offa0[TM, TK] = offxa[:, newaxis]; __constant__ int32* pd[TK] = delta_a + rka; multiple_of(4) int32 d[TK] = *pd; int32 offa_interior[TM, TK] = d[newaxis, :]; - int32 offa_exterior[TM, TK] = rka[newaxis, :] * lda_c;\n)"; + int32 offa_exterior[TM, TK] = rka[newaxis, :] * lda_c; + )"; if(shift_edge_h_) - os << " int1 interiorh[TM] = 1;"; + result += " int1 interiorh[TM] = 1;\n"; else - os << " int1 interiorh[TM] = (rah >= pad_h) && (rah < (AH - pad_h));"; + result += " int1 interiorh[TM] = (rah >= pad_h) && (rah < (AH - pad_h));\n"; if(shift_edge_w_) - os << " int1 interiorw[TM] = 1;"; + result += " int1 interiorw[TM] = 1;"; else - os << " int1 interiorw[TM] = (raw >= pad_w) && (raw < (AW - pad_w));"; - os << R"( + result += " int1 interiorw[TM] = (raw >= pad_w) && (raw < (AW - pad_w));"; + result += R"( int1 interior[TM, TK] = interiorh[:, newaxis] && interiorw[:, newaxis]; int32 offa1[TM, TK] = interior ? 
offa_interior : offa_exterior;)"; } if(op_ == BPROP){ - os << R"( + result += R"( int32 rawh[TM] = rxa / NB; int32 rab[TM] = rxa % NB; int32 raw[TM] = (rawh % CW); @@ -316,12 +320,12 @@ if(op_ == BPROP){ int32 offa1[TM, TK] = rka[newaxis, :] * lda_c;)"; } if(op_ == WGRAD && layout_ == CHWN){ - os << R"( + result += R"( int32 offa0[TK, TM] = rxa[newaxis, :] * lda_c; int32 offa1[TK, TM] = rka[:, newaxis];)"; } if(op_ == WGRAD && layout_ == NCHW){ - os << R"( + result += R"( int32 offa0[TK, TM] = rxa[newaxis, :] * lda_c; int32 rawh[TK] = rka / NB; int32 rab[TK] = rka % NB; @@ -333,34 +337,37 @@ if(op_ == WGRAD && layout_ == NCHW){ /* B offsets */ if(op_ == FPROP){ - os << R"( + result += R"( int32 offb0[TN, TK] = ryb[:, newaxis]; int32 offb1[TN, TK] = rkb[newaxis, :] * ldb_c;)"; } if(op_ == BPROP){ - os << R"( + result += R"( int32 offb0[TK, TN] = ryb[newaxis, :] * ldb_c; int32 offb1[TK, TN] = rkb[:, newaxis];)"; } if(op_ == WGRAD){ - os << R"( + result += R"( __constant__ int32* pd[TN] = delta_a + ryb; int32 d[TN] = *pd; int32 shift[TK, TN] = d[newaxis, :]; int32 rbwh[TK] = rkb / NB; int32 rbb[TK] = rkb % NB; - int32 rbw[TK] = (rbwh % CW)*stride_w; - int32 rbh[TK] = (rbwh / CW)*stride_h; - int32 offkb[TK] = rbb*ldb_b + rbw*ldb_w + rbh*ldb_h;\n)"; + int32 rbw[TK] = rbwh % CW; + int32 rbh[TK] = rbwh / CW; + rbw = rbw * stride_w; + rbh = rbh * stride_h; + int32 offkb[TK] = rbb*ldb_b + rbw*ldb_w + rbh*ldb_h; + )"; if(shift_edge_h_) - os << " int1 interiorh[TK] = 1;\n"; + result += " int1 interiorh[TK] = 1;\n"; else - os << " int1 interiorh[TK] = (rbh >= pad_h) && (rbh < (AH - pad_h));\n"; + result += " int1 interiorh[TK] = (rbh >= pad_h) && (rbh < (AH - pad_h));\n"; if(shift_edge_w_) - os << " int1 interiorw[TK] = 1;\n"; + result += " int1 interiorw[TK] = 1;"; else - os << " int1 interiorw[TK] = (rbw >= pad_w) && (rbw < (AW - pad_w));\n"; - os << R"( + result += " int1 interiorw[TK] = (rbw >= pad_w) && (rbw < (AW - pad_w));"; + result += R"( int1 interior[TK, TN] = interiorh[:, newaxis] && interiorw[:, newaxis]; int32 incb[TK, TN] = interior ? shift : 0; int32 offb0[TK, TN] = ryb[newaxis, :] * ldb_c; @@ -368,21 +375,21 @@ if(op_ == WGRAD){ } /* Main loop */ - os << R"( - )" << a_ty_ << "* pa[" << AS << R"(] = A + offa0 + offa1; - )" << b_ty_ << "* pb[" << BS << R"(] = B + offb0 + offb1; - int1 checka[)" << AS << "] = (rka < K)" << bca0 << R"(; - int1 checkb[)" << BS << "] = (rkb < K)" << bcb0 << R"(; - )" << a_ty_ << " a[" << AS << R"(] = checka ? *pa : 0; - )" << b_ty_ << " b[" << BS << R"(] = checkb ? *pb : 0; + result += R"( + )" + a_ty_ + "* pa[" + AS + R"(] = A + offa0 + offa1; + )" + b_ty_ + "* pb[" + BS + R"(] = B + offb0 + offb1; + int1 checka[)" + AS + "] = (rka < K)" + bca0 + R"(; + int1 checkb[)" + BS + "] = (rkb < K)" + bcb0 + R"(; + )" + a_ty_ + " a[" + AS + R"(] = checka ? *pa : 0; + )" + b_ty_ + " b[" + BS + R"(] = checkb ? 
*pb : 0; for(int32 k = K; k > 0; k = k - TK){ - c = dot()" << usea << "," << useb << R"(, c); - int1 checka[)" << AS << R"(] = k > TK; - int1 checkb[)" << BS << R"(] = k > TK;)"; + c = dot()" + usea + "," + useb + R"(, c); + int1 checka[)" + AS + R"(] = k > TK; + int1 checkb[)" + BS + R"(] = k > TK;)"; /* Increment A pointers */ if(op_ == FPROP){ - os << R"( + result += R"( pd = pd + TK; d = *pd; offa_interior = d[newaxis, :]; @@ -391,15 +398,15 @@ if(op_ == FPROP){ pa = pa + offa;)"; } if(op_ == BPROP){ - os << R"( + result += R"( pa = pa + TK * lda_c;)"; } if(op_ == WGRAD && layout_ == CHWN){ - os << R"( + result += R"( pa = pa + TK;)"; } if(op_ == WGRAD && layout_ == NCHW){ - os << R"( + result += R"( rka = rka + TK; rawh = rka / NB; rab = rka % NB; @@ -408,40 +415,43 @@ if(op_ == WGRAD && layout_ == NCHW){ offxa = rab*lda_b + raw*lda_w + rah*lda_h; pa = A + offa0 + offxa[:, newaxis];)"; } - os << R"( + result += R"( @checka a = *pa;)"; /* Increment B pointers */ if(op_ == WGRAD){ - os << R"( + result += R"( rkb = rkb + TK; rbwh = rkb / NB; rbb = rkb % NB; - rbw = (rbwh % CW)*stride_w; - rbh = (rbwh / CW)*stride_h; - offkb = rbb*ldb_b + rbw*ldb_w + rbh*ldb_h;\n)"; + rbw = rbwh % CW; + rbh = rbwh / CW; + rbw = rbw * stride_w; + rbh = rbh * stride_h; + offkb = rbb*ldb_b + rbw*ldb_w + rbh*ldb_h; + )"; if(shift_edge_h_) - os << " interiorh = 1;\n"; + result += " interiorh = 1;\n"; else - os << " interiorh = (rbh >= pad_h) && (rbh < (AH - pad_h));\n"; + result += " interiorh = (rbh >= pad_h) && (rbh < (AH - pad_h));\n"; if(shift_edge_w_) - os << " interiorw = 1;\n"; + result += " interiorw = 1;"; else - os << " interiorw = (rbw >= pad_w) && (rbw < (AW - pad_w));\n"; - os << R"( + result += " interiorw = (rbw >= pad_w) && (rbw < (AW - pad_w));"; + result += R"( interior = interiorh[:, newaxis] && interiorw[:, newaxis]; incb = interior ? 
shift : 0; pb = B + offb0 + offkb[:, newaxis] + incb;)"; } if(op_ == FPROP){ - os << R"( + result += R"( pb = pb + TK * ldb_c;)"; } if(op_ == BPROP){ - os << R"( + result += R"( pb = pb + TK;)"; } - os << R"( + result += R"( @checkb b = *pb; } int32 rxc[TM] = get_global_range[TM](0); @@ -449,44 +459,41 @@ if(op_ == BPROP){ /* C offsets */ if(op_ == BPROP){ - os << R"( + result += R"( int32 rcwh[TM] = rxc / NB; int32 rcb[TM] = rxc % NB; int32 rcw[TM] = (rcwh % CW) * stride_w; int32 rch[TM] = (rcwh / CW) * stride_h; - int32 offxc[TM] = rcb*ldc_b + rcw*ldc_w + rch*ldc_h; - )"; + int32 offxc[TM] = rcb*ldc_b + rcw*ldc_w + rch*ldc_h;)"; } if(op_ == FPROP){ - os << R"( + result += R"( int32 rcwh[TM] = rxc / NB; int32 rcb[TM] = rxc % NB; int32 rcw[TM] = (rcwh % CW); int32 rch[TM] = (rcwh / CW); - int32 offxc[TM] = rcb*ldc_b + rcw*ldc_w + rch*ldc_h; - )"; + int32 offxc[TM] = rcb*ldc_b + rcw*ldc_w + rch*ldc_h;)"; } if(op_ == WGRAD){ - os << R"( - int32 offxc[TM] = rxc; - )"; + result += R"( + int32 offxc[TM] = rxc;)"; } - os << R"(" + result += R"(" fp32* pc[TM, TN] = C + offxc[:, newaxis] + ryc[newaxis, :]*ldc_c; int1 checkc0[TM] = rxc < M; int1 checkc1[TN] = ryc < N; int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];)"; if(op_ == BPROP){ - os << "\n"; + result += "\n"; if(shift_edge_h_) - os << " int1 interiorh[TM] = 1;\n"; + result += " int1 interiorh[TM] = 1;\n"; else - os << " int1 interiorh[TM] = (rch >= pad_h) && (rch < (AH - pad_h));\n"; + result += " int1 interiorh[TM] = (rch >= pad_h) && (rch < (AH - pad_h));\n"; if(shift_edge_w_) - os << " int1 interiorw[TM] = 1;\n"; + result += " int1 interiorw[TM] = 1;"; else - os << " int1 interiorw[TM] = (rcw >= pad_w) && (rcw < (AW - pad_w));\n"; - os << R"( + result += " int1 interiorw[TM] = (rcw >= pad_w) && (rcw < (AW - pad_w));"; + result += R"( int1 interior[TM, TN] = interiorh[:, newaxis] && interiorw[:, newaxis]; __constant__ int32* pd[TN] = delta_a + ryc; fp32* shift_pc[TM, TN] = pc + (*pd)[newaxis, :]; @@ -495,11 +502,13 @@ if(op_ == BPROP){ )"; } else{ - os << R"( + result += R"( @checkc *pc = c;)"; } - os << R"( + result += R"( })"; + + os << result; } } From 54617b4e516d5fc45a9c193db00ca5057a305d23 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 12 Jul 2019 20:10:15 -0700 Subject: [PATCH 234/494] some cleaning --- lib/dnn/shift.cpp | 154 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 119 insertions(+), 35 deletions(-) diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index f2502db70..72e8395ba 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -283,11 +283,21 @@ void shift(restrict read_only align(16) )" + a_ty_ + R"( *A, /* A offsets */ if(op_ == FPROP){ + if(true){ + result += R"( + int32 rawh[TM] = rxa / NB; + int32 rab[TM] = rxa % NB; + int32 raw[TM] = rawh % CW; + int32 rah[TM] = rawh / CW;)"; + } + else{ + result += R"( + int32 rabh[TM] = rxa / CW; + int32 raw[TM] = rxa % CW; + int32 rah[TM] = rabh % CH; + int32 rab[TM] = rabh / CH;)"; + } result += R"( - int32 rawh[TM] = rxa / NB; - int32 rab[TM] = rxa % NB; - int32 raw[TM] = rawh % CW; - int32 rah[TM] = rawh / CW; raw = raw * stride_w; rah = rah * stride_h; int32 offxa[TM] = rab*lda_b + raw*lda_w + rah*lda_h; @@ -310,11 +320,21 @@ if(op_ == FPROP){ int32 offa1[TM, TK] = interior ? 
offa_interior : offa_exterior;)"; } if(op_ == BPROP){ + if(true){ + result += R"( + int32 rawh[TM] = rxa / NB; + int32 rab[TM] = rxa % NB; + int32 raw[TM] = rawh % CW; + int32 rah[TM] = rawh / CW;)"; + } + else{ + result += R"( + int32 rabh[TM] = rxa / CW; + int32 raw[TM] = rxa % CW; + int32 rah[TM] = rabh % CH; + int32 rab[TM] = rabh / CH;)"; + } result += R"( - int32 rawh[TM] = rxa / NB; - int32 rab[TM] = rxa % NB; - int32 raw[TM] = (rawh % CW); - int32 rah[TM] = (rawh / CW); int32 offxa[TM] = rab*lda_b + raw*lda_w + rah*lda_h; int32 offa0[TM, TK] = offxa[:, newaxis]; int32 offa1[TM, TK] = rka[newaxis, :] * lda_c;)"; @@ -325,12 +345,22 @@ if(op_ == WGRAD && layout_ == CHWN){ int32 offa1[TK, TM] = rka[:, newaxis];)"; } if(op_ == WGRAD && layout_ == NCHW){ + if(true){ + result += R"( + int32 rawh[TM] = rka / NB; + int32 rab[TM] = rka % NB; + int32 raw[TM] = rawh % CW; + int32 rah[TM] = rawh / CW;)"; + } + else{ + result += R"( + int32 rabh[TM] = rka / CW; + int32 raw[TM] = rka % CW; + int32 rah[TM] = rabh % CH; + int32 rab[TM] = rabh / CH;)"; + } result += R"( int32 offa0[TK, TM] = rxa[newaxis, :] * lda_c; - int32 rawh[TK] = rka / NB; - int32 rab[TK] = rka % NB; - int32 raw[TK] = (rawh % CW); - int32 rah[TK] = (rawh / CW); int32 offxa[TK] = rab*lda_b + raw*lda_w + rah*lda_h; int32 offa1[TK, TM] = offxa[:, newaxis];)"; } @@ -347,14 +377,24 @@ if(op_ == BPROP){ int32 offb1[TK, TN] = rkb[:, newaxis];)"; } if(op_ == WGRAD){ + if(true){ + result += R"( + int32 rbwh[TM] = rkb / NB; + int32 rbb[TM] = rkb % NB; + int32 rbw[TM] = rbwh % CW; + int32 rbh[TM] = rbwh / CW;)"; + } + else{ + result += R"( + int32 rbbh[TM] = rkb / CW; + int32 rbw[TM] = rkb % CW; + int32 rbh[TM] = rbbh % CH; + int32 rbb[TM] = rbbh / CH;)"; + } result += R"( __constant__ int32* pd[TN] = delta_a + ryb; int32 d[TN] = *pd; int32 shift[TK, TN] = d[newaxis, :]; - int32 rbwh[TK] = rkb / NB; - int32 rbb[TK] = rkb % NB; - int32 rbw[TK] = rbwh % CW; - int32 rbh[TK] = rbwh / CW; rbw = rbw * stride_w; rbh = rbh * stride_h; int32 offkb[TK] = rbb*ldb_b + rbw*ldb_w + rbh*ldb_h; @@ -406,12 +446,23 @@ if(op_ == WGRAD && layout_ == CHWN){ pa = pa + TK;)"; } if(op_ == WGRAD && layout_ == NCHW){ + result += R"( + rka = rka + TK;)"; + if(true){ + result += R"( + int32 rawh[TM] = rka / NB; + int32 rab[TM] = rka % NB; + int32 raw[TM] = rawh % CW; + int32 rah[TM] = rawh / CW;)"; + } + else{ + result += R"( + int32 rabh[TM] = rka / CW; + int32 raw[TM] = rka % CW; + int32 rah[TM] = rabh % CH; + int32 rab[TM] = rabh / CH;)"; + } result += R"( - rka = rka + TK; - rawh = rka / NB; - rab = rka % NB; - raw = (rawh % CW); - rah = (rawh / CW); offxa = rab*lda_b + raw*lda_w + rah*lda_h; pa = A + offa0 + offxa[:, newaxis];)"; } @@ -420,12 +471,23 @@ if(op_ == WGRAD && layout_ == NCHW){ /* Increment B pointers */ if(op_ == WGRAD){ - result += R"( - rkb = rkb + TK; - rbwh = rkb / NB; - rbb = rkb % NB; - rbw = rbwh % CW; - rbh = rbwh / CW; + result += R"( + rkb = rkb + TK;)"; + if(true){ + result += R"( + int32 rbwh[TM] = rkb / NB; + int32 rbb[TM] = rkb % NB; + int32 rbw[TM] = rbwh % CW; + int32 rbh[TM] = rbwh / CW;)"; + } + else{ + result += R"( + int32 rbbh[TM] = rkb / CW; + int32 rbw[TM] = rkb % CW; + int32 rbh[TM] = rbbh % CH; + int32 rbb[TM] = rbbh / CH;)"; + } + result += R"( rbw = rbw * stride_w; rbh = rbh * stride_h; offkb = rbb*ldb_b + rbw*ldb_w + rbh*ldb_h; @@ -459,19 +521,41 @@ if(op_ == BPROP){ /* C offsets */ if(op_ == BPROP){ + if(true){ + result += R"( + int32 rcwh[TM] = rxc / NB; + int32 rcb[TM] = rxc % NB; + int32 rcw[TM] = rcwh % CW; + int32 
rch[TM] = rcwh / CW;)"; + } + else{ + result += R"( + int32 rcbh[TM] = rxc / CW; + int32 rcw[TM] = rxc % CW; + int32 rch[TM] = rcbh % CH; + int32 rcb[TM] = rcbh / CH;)"; + } result += R"( - int32 rcwh[TM] = rxc / NB; - int32 rcb[TM] = rxc % NB; - int32 rcw[TM] = (rcwh % CW) * stride_w; - int32 rch[TM] = (rcwh / CW) * stride_h; + rcw = rcw * stride_w; + rch = rch * stride_h; int32 offxc[TM] = rcb*ldc_b + rcw*ldc_w + rch*ldc_h;)"; } if(op_ == FPROP){ +if(true){ + result += R"( + int32 rcwh[TM] = rxc / NB; + int32 rcb[TM] = rxc % NB; + int32 rcw[TM] = rcwh % CW; + int32 rch[TM] = rcwh / CW;)"; +} +else{ + result += R"( + int32 rcbh[TM] = rxc / CW; + int32 rcw[TM] = rxc % CW; + int32 rch[TM] = rcbh % CH; + int32 rcb[TM] = rcbh / CH;)"; +} result += R"( - int32 rcwh[TM] = rxc / NB; - int32 rcb[TM] = rxc % NB; - int32 rcw[TM] = (rcwh % CW); - int32 rch[TM] = (rcwh / CW); int32 offxc[TM] = rcb*ldc_b + rcw*ldc_w + rch*ldc_h;)"; } if(op_ == WGRAD){ From fe42cb71422a052548c5cbcba790aeb593b472b2 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 12 Jul 2019 20:22:32 -0700 Subject: [PATCH 235/494] [dnn/shift] optimizations for NCHW layout --- examples/cpp/shift.cpp | 2 +- lib/dnn/base.cpp | 2 +- lib/dnn/shift.cpp | 81 +++++++++++++++++++++--------------------- 3 files changed, 43 insertions(+), 42 deletions(-) diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index d4417cd7d..4be4861cc 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -33,7 +33,7 @@ int main() { triton::dnn::shift shift(B, C, 1, H, W, 1, R, S, F, 1, 1, shift_h.data(), shift_w.data(), numeric_t_str, numeric_t_str, - op, false); + op, false, triton::dnn::shift::NCHW); // host buffers std::vector ha(B*C*H*W); std::vector hb(C*F); diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index 710794925..61ab85b60 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -24,7 +24,7 @@ base::base(const std::string& name) void base::enqueue(driver::stream *stream, std::vector args) { static std::map, cmp_recompile> m_jit; - bool autotune = false; + bool autotune = true; driver::context* ctx = stream->context(); triton::jit* jit; /* the current template has not already been compiled */ diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index 72e8395ba..6e209fef3 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -252,6 +252,7 @@ void shift::triton_c_src(std::ostream &os) const { } std::string AS = AS0 + ", " + AS1; std::string BS = BS0 + ", " + BS1; + bool is_chwn = layout_ == CHWN; std::string result = R"( @@ -283,7 +284,7 @@ void shift(restrict read_only align(16) )" + a_ty_ + R"( *A, /* A offsets */ if(op_ == FPROP){ - if(true){ + if(is_chwn){ result += R"( int32 rawh[TM] = rxa / NB; int32 rab[TM] = rxa % NB; @@ -320,7 +321,7 @@ if(op_ == FPROP){ int32 offa1[TM, TK] = interior ? 
offa_interior : offa_exterior;)"; } if(op_ == BPROP){ - if(true){ + if(is_chwn){ result += R"( int32 rawh[TM] = rxa / NB; int32 rab[TM] = rxa % NB; @@ -345,19 +346,19 @@ if(op_ == WGRAD && layout_ == CHWN){ int32 offa1[TK, TM] = rka[:, newaxis];)"; } if(op_ == WGRAD && layout_ == NCHW){ - if(true){ + if(is_chwn){ result += R"( - int32 rawh[TM] = rka / NB; - int32 rab[TM] = rka % NB; - int32 raw[TM] = rawh % CW; - int32 rah[TM] = rawh / CW;)"; + int32 rawh[TK] = rka / NB; + int32 rab[TK] = rka % NB; + int32 raw[TK] = rawh % CW; + int32 rah[TK] = rawh / CW;)"; } else{ result += R"( - int32 rabh[TM] = rka / CW; - int32 raw[TM] = rka % CW; - int32 rah[TM] = rabh % CH; - int32 rab[TM] = rabh / CH;)"; + int32 rabh[TK] = rka / CW; + int32 raw[TK] = rka % CW; + int32 rah[TK] = rabh % CH; + int32 rab[TK] = rabh / CH;)"; } result += R"( int32 offa0[TK, TM] = rxa[newaxis, :] * lda_c; @@ -377,19 +378,19 @@ if(op_ == BPROP){ int32 offb1[TK, TN] = rkb[:, newaxis];)"; } if(op_ == WGRAD){ - if(true){ + if(is_chwn){ result += R"( - int32 rbwh[TM] = rkb / NB; - int32 rbb[TM] = rkb % NB; - int32 rbw[TM] = rbwh % CW; - int32 rbh[TM] = rbwh / CW;)"; + int32 rbwh[TK] = rkb / NB; + int32 rbb[TK] = rkb % NB; + int32 rbw[TK] = rbwh % CW; + int32 rbh[TK] = rbwh / CW;)"; } else{ result += R"( - int32 rbbh[TM] = rkb / CW; - int32 rbw[TM] = rkb % CW; - int32 rbh[TM] = rbbh % CH; - int32 rbb[TM] = rbbh / CH;)"; + int32 rbbh[TK] = rkb / CW; + int32 rbw[TK] = rkb % CW; + int32 rbh[TK] = rbbh % CH; + int32 rbb[TK] = rbbh / CH;)"; } result += R"( __constant__ int32* pd[TN] = delta_a + ryb; @@ -448,19 +449,19 @@ if(op_ == WGRAD && layout_ == CHWN){ if(op_ == WGRAD && layout_ == NCHW){ result += R"( rka = rka + TK;)"; - if(true){ + if(is_chwn){ result += R"( - int32 rawh[TM] = rka / NB; - int32 rab[TM] = rka % NB; - int32 raw[TM] = rawh % CW; - int32 rah[TM] = rawh / CW;)"; + int32 rawh[TK] = rka / NB; + int32 rab[TK] = rka % NB; + int32 raw[TK] = rawh % CW; + int32 rah[TK] = rawh / CW;)"; } else{ result += R"( - int32 rabh[TM] = rka / CW; - int32 raw[TM] = rka % CW; - int32 rah[TM] = rabh % CH; - int32 rab[TM] = rabh / CH;)"; + int32 rabh[TK] = rka / CW; + int32 raw[TK] = rka % CW; + int32 rah[TK] = rabh % CH; + int32 rab[TK] = rabh / CH;)"; } result += R"( offxa = rab*lda_b + raw*lda_w + rah*lda_h; @@ -473,19 +474,19 @@ if(op_ == WGRAD && layout_ == NCHW){ if(op_ == WGRAD){ result += R"( rkb = rkb + TK;)"; - if(true){ + if(is_chwn){ result += R"( - int32 rbwh[TM] = rkb / NB; - int32 rbb[TM] = rkb % NB; - int32 rbw[TM] = rbwh % CW; - int32 rbh[TM] = rbwh / CW;)"; + int32 rbwh[TK] = rkb / NB; + int32 rbb[TK] = rkb % NB; + int32 rbw[TK] = rbwh % CW; + int32 rbh[TK] = rbwh / CW;)"; } else{ result += R"( - int32 rbbh[TM] = rkb / CW; - int32 rbw[TM] = rkb % CW; - int32 rbh[TM] = rbbh % CH; - int32 rbb[TM] = rbbh / CH;)"; + int32 rbbh[TK] = rkb / CW; + int32 rbw[TK] = rkb % CW; + int32 rbh[TK] = rbbh % CH; + int32 rbb[TK] = rbbh / CH;)"; } result += R"( rbw = rbw * stride_w; @@ -521,7 +522,7 @@ if(op_ == BPROP){ /* C offsets */ if(op_ == BPROP){ - if(true){ + if(is_chwn){ result += R"( int32 rcwh[TM] = rxc / NB; int32 rcb[TM] = rxc % NB; @@ -541,7 +542,7 @@ if(op_ == BPROP){ int32 offxc[TM] = rcb*ldc_b + rcw*ldc_w + rch*ldc_h;)"; } if(op_ == FPROP){ -if(true){ +if(is_chwn){ result += R"( int32 rcwh[TM] = rxc / NB; int32 rcb[TM] = rxc % NB; From 3e7a3ed67a14b3768b7470919fcd832b25eb0ac0 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 13 Jul 2019 21:05:34 -0700 Subject: [PATCH 236/494] [dnn/shift]: added support for fp16 
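BPROP accumulates its output with __atomic_add, and LLVM exposes no fp16 counterpart of nvvm_atomic_load_add_f32, so the half-precision path has to go through inline PTX (see the lib/codegen/selection.cpp hunk below). A minimal sketch of the two lowering paths, assuming LLVM-8-era IRBuilder/Intrinsic APIs; the helper name is illustrative, not code from the tree:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"

// Sketch only: mirrors the __atomic_add lowering in lib/codegen/selection.cpp.
llvm::Value *lower_atomic_add(llvm::IRBuilder<> &builder, llvm::LLVMContext &ctx,
                              llvm::Value *ptr, llvm::Value *val) {
  llvm::Module *module = builder.GetInsertBlock()->getModule();
  if (val->getType()->isFloatTy()) {
    // fp32: a dedicated NVVM intrinsic exists.
    llvm::Function *f = llvm::Intrinsic::getDeclaration(
        module, llvm::Intrinsic::nvvm_atomic_load_add_f32, {ptr->getType()});
    return builder.CreateCall(f, {ptr, val});
  }
  // fp16: no intrinsic, so emit the PTX instruction directly;
  // "=h,l,h" binds a 16-bit result, a 64-bit address and a 16-bit operand.
  llvm::Type *fp16 = llvm::Type::getHalfTy(ctx);
  llvm::FunctionType *fty =
      llvm::FunctionType::get(fp16, {fp16->getPointerTo(), fp16}, false);
  llvm::InlineAsm *iasm = llvm::InlineAsm::get(
      fty, "atom.relaxed.global.gpu.add.noftz.f16 $0, [$1], $2;", "=h,l,h", true);
  return builder.CreateCall(iasm, {ptr, val});
}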
--- examples/cpp/shift.cpp | 13 +++++++++++-- examples/python/pytorch/shift.cpp | 15 +++++++++++++-- examples/python/pytorch/triton.py | 2 -- examples/python/tensorflow/run.py | 23 +++++++++++++---------- examples/python/tensorflow/shift.cpp | 26 +++++++++++++------------- include/triton/dnn/base.h | 2 +- include/triton/dnn/shift.h | 1 + lib/codegen/selection.cpp | 11 ++++++++++- lib/dnn/base.cpp | 3 +-- lib/dnn/shift.cpp | 22 +++++++++++++--------- lib/runtime/jit.cpp | 1 - 11 files changed, 76 insertions(+), 43 deletions(-) diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index 4be4861cc..41c123fef 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -10,11 +10,11 @@ int main() { typedef float NumericT; - std::string numeric_t_str = "fp32"; + std::string numeric_t_str = "fp16"; // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); - auto op = triton::dnn::shift::FPROP; + auto op = triton::dnn::shift::BPROP; // initialization int32_t R = 3, S = 3; @@ -35,6 +35,15 @@ int main() { numeric_t_str, numeric_t_str, op, false, triton::dnn::shift::NCHW); // host buffers + size_t a_size = B*C*H*W; + size_t b_size = C*F; + size_t c_size = B*F*H*W; + if(op == triton::dnn::shift::BPROP) + std::swap(a_size, c_size); + if(op == triton::dnn::shift::WGRAD){ + std::swap(b_size, c_size); + std::swap(a_size, b_size); + } std::vector ha(B*C*H*W); std::vector hb(C*F); std::vector hc(B*F*H*W); diff --git a/examples/python/pytorch/shift.cpp b/examples/python/pytorch/shift.cpp index d650ca9e6..e3e968db6 100644 --- a/examples/python/pytorch/shift.cpp +++ b/examples/python/pytorch/shift.cpp @@ -45,11 +45,20 @@ torch::Tensor shift_common( CUstream custream = (CUstream)at::cuda::getCurrentCUDAStream(device).stream(); triton::driver::cu_stream stream(custream, false); triton::driver::context* ctx = stream.context(); + // Data-type + std::string dtype; + at::ScalarType type = torcha.scalar_type(); + switch(type){ + case at::ScalarType::Double: dtype = "fp64"; break; + case at::ScalarType::Float: dtype = "fp32"; break; + case at::ScalarType::Half: dtype = "fp16"; break; + default: AT_ERROR("unknown data-type for shift-conv"); + } // Get configuration bool has_bias = torchbias.storage().size() > 0; triton::dnn::shift shift(B, C, D, H, W, T, R, S, F, stride_h, stride_w, - shift_h, shift_w, "fp32", "fp32", + shift_h, shift_w, dtype, dtype, ty, has_bias, layout); // Bind memory triton::driver::cu_buffer a(ctx, (CUdeviceptr)torcha.storage().data(), false); @@ -61,7 +70,9 @@ torch::Tensor shift_common( std::vector c_shapes; for(auto x: _c_shapes) c_shapes.push_back(x); - torch::Tensor torchc = torch::empty(c_shapes).cuda(); + torch::Tensor torchc = torch::empty(c_shapes, type).cuda(); + + triton::driver::cu_buffer c(ctx, (CUdeviceptr)torchc.storage().data(), false); // Enqueue shift.enqueue(&stream, {&a, &b, &c}); diff --git a/examples/python/pytorch/triton.py b/examples/python/pytorch/triton.py index efeade389..2d78e58f7 100644 --- a/examples/python/pytorch/triton.py +++ b/examples/python/pytorch/triton.py @@ -123,8 +123,6 @@ class ShiftConvFunction(torch.autograd.Function): dw = torch.ops.triton.shift_conv_dw(dy.contiguous(), input, bias, width[0], width[1], stride[0], stride[1], shift_h, shift_w) if ctx.needs_input_grad[2]: dbias = torch.sum(dy, (1, 2, 3)) - #print('dx', ctx.needs_input_grad[0], np.isnan(dx.cpu().numpy()).any()) - #print('dw', ctx.needs_input_grad[1], np.isnan(dw.cpu().numpy()).any()) return dx, dw, dbias, None, None, None, None 
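The a_size/b_size/c_size swaps added to examples/cpp/shift.cpp above encode how the three ops reuse one kernel: BPROP exchanges activations with the output, while WGRAD rotates all three operands. A standalone restatement of that mapping, assuming the unit strides used in the example; the enum and helper below are illustrative, not code from the tree:

#include <cstddef>
#include <utility>

enum op_t { FPROP, BPROP, WGRAD };

// FPROP: a = x  [B*C*H*W], b = w [C*F],      c = y  [B*F*H*W]
// BPROP: a = dy [B*F*H*W], b = w [C*F],      c = dx [B*C*H*W]
// WGRAD: a = dy [B*F*H*W], b = x [B*C*H*W],  c = dw [C*F]
void operand_sizes(op_t op, std::size_t B, std::size_t C, std::size_t F,
                   std::size_t H, std::size_t W,
                   std::size_t &a, std::size_t &b, std::size_t &c) {
  a = B * C * H * W;  // activations
  b = C * F;          // weights
  c = B * F * H * W;  // output
  if (op == BPROP) std::swap(a, c);                       // dy in, dx out
  if (op == WGRAD) { std::swap(b, c); std::swap(a, b); }  // dy and x in, dw out
}

Within the visible hunk the host vectors are still sized with the FPROP expressions, so the computed sizes are not consumed yet.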
diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 57850de9a..971ad2898 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -58,29 +58,32 @@ def blocksparse_matmul_grad(op, dy): return (dx, dw) def run_shift(): - B, C, H, W = 16, 16, 2, 2 - R, S, F = 3, 3, 32 + B, C, H, W = 1, 16, 4, 4 + R, S, F = 3, 3, 16 stride_h, stride_w = 2, 2 np.random.seed(2) - a = tf.placeholder(tf.float32, shape=[B, C, H, W]) - b = tf.placeholder(tf.float32, shape=[C, F]) + a = tf.placeholder(tf.float16, shape=[B, C, H, W]) + b = tf.placeholder(tf.float16, shape=[C, F]) hshift_h = np.random.randint(- (R//2), R//2 + 1, size=C, dtype=np.int32) hshift_w = np.random.randint(- (S//2), R//2 + 1, size=C, dtype=np.int32) #hshift_h = np.zeros(C, dtype=np.int32) #hshift_w = np.zeros(C, dtype=np.int32) c = module.shift_conv(a, b, stride_h=stride_h, stride_w=stride_w, shift_h=tf.make_tensor_proto(hshift_h), shift_w=tf.make_tensor_proto(hshift_w)) # feed values - ha = np.random.rand(B, C, H, W) - hb = np.random.rand(C, F) - #ha = np.ones((B, C, H, W), dtype=np.float32) - #hb = np.ones((C, F), dtype=np.float32) + ha = np.random.rand(B, C, H, W)*0.1 + hb = np.random.rand(C, F)*0.1 + #ha = np.ones((B, C, H, W), dtype=np.float16) + #hb = np.ones((C, F), dtype=np.float16) sess = tf.InteractiveSession() # test grads = tf.test.compute_gradient([a, b], [(B, C, H, W), (C, F)], c, (B, F, H//stride_h, W//stride_w), - extra_feed_dict = {a: ha, b: hb}) + extra_feed_dict = {a: ha, b: hb}, delta=1e-2) dw_t, dw_n = grads[1] dx_t, dx_n = grads[0] - print(dw_t, dw_n) + #import sys + #np.set_printoptions(threshold=sys.maxsize) + print(dx_t) + print(dx_n) print(np.max(np.abs(dw_t - dw_n))) print(np.max(np.abs(dx_t - dx_n))) # Run diff --git a/examples/python/tensorflow/shift.cpp b/examples/python/tensorflow/shift.cpp index d9014795e..d844e9aa1 100644 --- a/examples/python/tensorflow/shift.cpp +++ b/examples/python/tensorflow/shift.cpp @@ -106,7 +106,7 @@ public: triton::dnn::shift shift(B, C, D, H, W, T, R_, S_, F, stride_h_, stride_w_, shift_h_data, shift_w_data, - "fp32", "fp32", OP, has_bias, layout_); + "fp16", "fp16", OP, has_bias, layout_); // shapes for c std::vector c_shapes; @@ -119,9 +119,9 @@ public: if (out_shapes.num_elements() == 0) return; // matrix multiplication parameters - triton::driver::cu_buffer da(ctx, (CUdeviceptr)tf_a.flat().data(), false); - triton::driver::cu_buffer db(ctx, (CUdeviceptr)tf_b.flat().data(), false); - triton::driver::cu_buffer dc(ctx, (CUdeviceptr)tf_c->flat().data(), false); + triton::driver::cu_buffer da(ctx, (CUdeviceptr)tf_a.flat().data(), false); + triton::driver::cu_buffer db(ctx, (CUdeviceptr)tf_b.flat().data(), false); + triton::driver::cu_buffer dc(ctx, (CUdeviceptr)tf_c->flat().data(), false); shift.enqueue(stream, {&da, &db, &dc}); } @@ -137,31 +137,31 @@ private: REGISTER_KERNEL_BUILDER(Name("ShiftConv").Device(DEVICE_GPU), ShiftConvOp); REGISTER_OP("ShiftConv") - .Input("a: float32") - .Input("b: float32") + .Input("a: float16") + .Input("b: float16") .Attr("shift_h: tensor") .Attr("shift_w: tensor") .Attr("stride_h: int") .Attr("stride_w: int") - .Output("c: float32"); + .Output("c: float16"); REGISTER_KERNEL_BUILDER(Name("ShiftConvDx").Device(DEVICE_GPU), ShiftConvOp); REGISTER_OP("ShiftConvDx") - .Input("a: float32") - .Input("b: float32") + .Input("a: float16") + .Input("b: float16") .Attr("shift_h: tensor") .Attr("shift_w: tensor") .Attr("stride_h: int") .Attr("stride_w: int") - .Output("c: float32"); + 
.Output("c: float16"); REGISTER_KERNEL_BUILDER(Name("ShiftConvDw").Device(DEVICE_GPU), ShiftConvOp); REGISTER_OP("ShiftConvDw") - .Input("a: float32") - .Input("b: float32") + .Input("a: float16") + .Input("b: float16") .Attr("shift_h: tensor") .Attr("shift_w: tensor") .Attr("stride_h: int") .Attr("stride_w: int") - .Output("c: float32"); + .Output("c: float16"); diff --git a/include/triton/dnn/base.h b/include/triton/dnn/base.h index e3c6ff9e1..7aeab2a14 100644 --- a/include/triton/dnn/base.h +++ b/include/triton/dnn/base.h @@ -60,7 +60,7 @@ public: // clone virtual base* clone() const = 0; // enqueue - void enqueue(driver::stream* stream, std::vector args); + void enqueue(driver::stream* stream, std::vector args, bool autotune = false); private: std::string name_; diff --git a/include/triton/dnn/shift.h b/include/triton/dnn/shift.h index 57cb5ea0a..ec4ffc753 100644 --- a/include/triton/dnn/shift.h +++ b/include/triton/dnn/shift.h @@ -155,6 +155,7 @@ private: // data types std::string a_ty_; std::string b_ty_; + std::string c_ty_; // convolution type type op_; bool bias_; diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 3bd010ebc..7ca8fb6ee 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -376,7 +376,15 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::function(inst)){ Value *ptr = value(ii->get_operand(0)); Value *val = value(ii->get_operand(1)); - Value *atom_f_add = Intrinsic::getDeclaration(builder.GetInsertBlock()->getModule(), Intrinsic::nvvm_atomic_load_add_f32, {ptr->getType()}); + Value *atom_f_add; + if(val->getType()->isFloatTy()) + atom_f_add = Intrinsic::getDeclaration(builder.GetInsertBlock()->getModule(), Intrinsic::nvvm_atomic_load_add_f32, {ptr->getType()}); + else if(val->getType()->isHalfTy()){ + Type *fp16 = Type::getHalfTy(ctx); + + FunctionType *atom_ty = FunctionType::get(fp16, {fp16->getPointerTo(), fp16}, false); + atom_f_add = InlineAsm::get(atom_ty, " atom.relaxed.global.gpu.add.noftz.f16 $0, [$1], $2;", "=h,l,h", true); + } Value *res = builder.CreateCall(atom_f_add, {ptr, val}); return (Instruction*)res; } @@ -1110,6 +1118,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & unsigned max_contiguous = axis_info_->get_max_contiguous(ptr); unsigned alignment = std::min(starting_multiple, max_contiguous); unsigned vector_size = std::min(result->axis(0).contiguous, alignment); + vector_size = 1; // vector_size = result->axis(0).contiguous; std::map packets; distributed_tile *TP = (distributed_tile*)tmap_.at(ld->get_pointer_operand()); diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index 61ab85b60..b3bf6c05a 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -22,9 +22,8 @@ void base::set_ld(const std::vector& shapes, base::base(const std::string& name) : name_(name) { } -void base::enqueue(driver::stream *stream, std::vector args) { +void base::enqueue(driver::stream *stream, std::vector args, bool autotune) { static std::map, cmp_recompile> m_jit; - bool autotune = true; driver::context* ctx = stream->context(); triton::jit* jit; /* the current template has not already been compiled */ diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index 6e209fef3..872189c89 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -22,7 +22,7 @@ shift::shift(int B, int C, F_(F), stride_d_(1), stride_h_(stride_h), stride_w_(stride_w), shift_h_(shift_h), shift_w_(shift_w), - a_ty_(a_ty), b_ty_(b_ty), + a_ty_(a_ty), b_ty_(b_ty), c_ty_(b_ty), op_(ty), bias_(bias), 
layout_(layout){ // std::cout << B_ << " " << C_ << " " << F_ << " " << stride_h_ << " " << stride_w_ << " " << a_ty_ << " " << b_ty_ << " " << ty_ << " " << layout_ << std::endl; @@ -230,8 +230,10 @@ void shift::enqueue_impl(driver::stream *stream, driver::kernel *kernel, kernel->setArg(26, CW_); unsigned TM = ranges[0], TN = ranges[1]; std::array grid = {(M_ + TM - 1)/TM, (N_ + TN - 1)/TN, 1}; - if(op_ == BPROP) - ((driver::cu_buffer*)c)->set_zero(stream, AH_*AW_*B_*C_*4); + if(op_ == BPROP){ + size_t c_nbytes = (c_ty_ == "fp16") ? 2 : 4; + ((driver::cu_buffer*)c)->set_zero(stream, AH_*AW_*B_*C_*c_nbytes); + } stream->enqueue(kernel, grid, {nthreads, 1, 1}); } @@ -264,7 +266,7 @@ __constant__ int32* delta_a = alloc_const int32[)" + std::to_string(MAX_C_) + R" void shift(restrict read_only align(16) )" + a_ty_ + R"( *A, restrict read_only align(16) )" + b_ty_ + R"( *B, - fp32 *C, + )" + c_ty_ + R"( *C, int32 M, int32 N, int32 K, int32 stride_h, int32 stride_w, int32 lda_b, int32 lda_w, int32 lda_h, int32 lda_c, @@ -278,7 +280,7 @@ void shift(restrict read_only align(16) )" + a_ty_ + R"( *A, int32 ryb[TN] = get_global_range[TN](1); int32 rka[TK] = 0 ... TK; int32 rkb[TK] = 0 ... TK; - fp32 c[TM, TN] = 0; + fp32 acc[TM, TN] = 0; int32 pad_h = BH / 2; int32 pad_w = BW / 2;)"; @@ -304,7 +306,7 @@ if(op_ == FPROP){ int32 offxa[TM] = rab*lda_b + raw*lda_w + rah*lda_h; int32 offa0[TM, TK] = offxa[:, newaxis]; __constant__ int32* pd[TK] = delta_a + rka; - multiple_of(4) int32 d[TK] = *pd; + int32 d[TK] = *pd; int32 offa_interior[TM, TK] = d[newaxis, :]; int32 offa_exterior[TM, TK] = rka[newaxis, :] * lda_c; )"; @@ -424,7 +426,7 @@ if(op_ == WGRAD){ )" + a_ty_ + " a[" + AS + R"(] = checka ? *pa : 0; )" + b_ty_ + " b[" + BS + R"(] = checkb ? *pb : 0; for(int32 k = K; k > 0; k = k - TK){ - c = dot()" + usea + "," + useb + R"(, c); + acc = dot()" + usea + "," + useb + R"(, acc); int1 checka[)" + AS + R"(] = k > TK; int1 checkb[)" + BS + R"(] = k > TK;)"; @@ -564,7 +566,8 @@ if(op_ == WGRAD){ int32 offxc[TM] = rxc;)"; } result += R"(" - fp32* pc[TM, TN] = C + offxc[:, newaxis] + ryc[newaxis, :]*ldc_c; + )" + c_ty_ + R"( c[TM, TN] = acc; + )" + c_ty_ + R"(* pc[TM, TN] = C + offxc[:, newaxis] + ryc[newaxis, :]*ldc_c; int1 checkc0[TM] = rxc < M; int1 checkc1[TN] = ryc < N; int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];)"; @@ -581,7 +584,7 @@ if(op_ == BPROP){ result += R"( int1 interior[TM, TN] = interiorh[:, newaxis] && interiorw[:, newaxis]; __constant__ int32* pd[TN] = delta_a + ryc; - fp32* shift_pc[TM, TN] = pc + (*pd)[newaxis, :]; + )" + c_ty_ + R"(* shift_pc[TM, TN] = pc + (*pd)[newaxis, :]; pc = interior ? 
shift_pc : pc; @checkc __atomic_add(pc, c); )"; @@ -593,6 +596,7 @@ else{ result += R"( })"; +// std::cout << result << std::endl; os << result; } diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index 30547a19e..536ad44b0 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -149,7 +149,6 @@ jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t ben std::vector> ranges; for(ir::metaparameter *mp: mps) ranges.push_back(mp->get_space()); -// std::cout << ranges.size() << std::endl; // iterate over parameters unsigned i; tune_res_t best; From 3c128fc2e2630de0fc7332e7e25c20cc116b0448 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 14 Jul 2019 21:54:57 -0700 Subject: [PATCH 237/494] [jit/autotune] added support for multi-threaded auto-tuning --- examples/cpp/shift.cpp | 8 +-- include/triton/runtime/jit.h | 9 ++- lib/runtime/jit.cpp | 122 ++++++++++++++++++++--------------- 3 files changed, 81 insertions(+), 58 deletions(-) diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index 41c123fef..6941dfa0d 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -14,13 +14,13 @@ int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); - auto op = triton::dnn::shift::BPROP; + auto op = triton::dnn::shift::FPROP; // initialization int32_t R = 3, S = 3; - int32_t B = 16, F = 4096; + int32_t B = 16, F = 512; int32_t H = 16, W = 16; - int32_t C = 4096; + int32_t C = 512; // random shifts std::vector shift_h(C); @@ -66,7 +66,7 @@ int main() { stream->write(db, true, 0, hb); stream->write(dc, true, 0, hc); stream->synchronize(); - shift.enqueue(stream, {da, db, dc}); + shift.enqueue(stream, {da, db, dc}, true); // stream->read(dc, true, 0, hc); // shift.cpu_ref(rc.data(), ha.data(), hb.data()); // for(size_t i = 0; i < hc.size(); i++) diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index ca5395893..9b0f75f96 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -28,6 +28,10 @@ namespace llvm { namespace triton { +namespace lang{ +class translation_unit; +} + namespace codegen{ class tune; } @@ -97,8 +101,9 @@ public: private: std::string compute_data_layout(bool is_64bit = true, bool use_short_pointers = true); - std::unique_ptr make_llvm_module(triton::ir::module &module, passes_wrapper &passes); - std::unique_ptr make_triton_module(const char* name, const char* src); + std::unique_ptr make_llvm_module(triton::ir::module &module, passes_wrapper &passes, llvm::LLVMContext &context, launch_information &info); + std::unique_ptr make_triton_module(const char *name, triton::ir::context &context, triton::lang::translation_unit *program); + triton::lang::translation_unit *parse_program(const char *name, const char *src); public: jit(driver::context* context); diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index 536ad44b0..51f3ed916 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -19,6 +19,8 @@ #include "llvm/IR/LegacyPassManager.h" #include "llvm/Transforms/Scalar/EarlyCSE.h" #include "llvm/Analysis/LoopPass.h" +#include "triton/tools/thread_pool.h" +#include typedef struct yy_buffer_state * YY_BUFFER_STATE; extern int yyparse(); @@ -28,14 +30,19 @@ extern triton::lang::translation_unit *ast_root; namespace triton { -void loop_nest(std::vector const & ranges, std::function const &)> const & f){ +void loop_nest(std::vector const & ranges, + std::function const &)> const & f, + size_t nthreads){ size_t D = ranges.size(); 
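  // Each parameter combination now becomes a job on a fixed-size thread
  // pool (see triton/tools/thread_pool.h, added in the next commit) instead
  // of running inline; `values` must be captured by value, since the loop
  // keeps mutating it while workers are still running earlier combinations.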
std::vector values(D, 0); + // thread pools + nbsdx::concurrent::thread_pool pool(nthreads); // Start with innermost loop size_t i = D - 1; + size_t current = 0; while(true){ //Execute function - f(values); + pool.add_job([values, &f](){ f(values); }); //Increment counters while(values[i]++ == ranges[i] - 1){ if(i == 0) @@ -47,7 +54,7 @@ void loop_nest(std::vector const & ranges, std::function -void loop_nest(std::vector> const & iterates, std::function)> const & f){ +void loop_nest(std::vector> const & iterates, std::function)> const & f, size_t nthreads){ //Ranges to iterate over std::vector ranges; for(auto const & x: iterates) @@ -60,17 +67,16 @@ void loop_nest(std::vector> const & iterates, std::function jit::make_llvm_module(ir::module &module, passes_wrapper &passes) { - llvm::Module* result = new llvm::Module(module.get_name(), llvm_context_); +std::unique_ptr jit::make_llvm_module(ir::module &module, passes_wrapper &passes, llvm::LLVMContext& llvm_context, launch_information& info) { + llvm::Module* result = new llvm::Module(module.get_name(), llvm_context); passes.selection.run(module, *result); // launch information - launch_information& info = launch_info_map_[result->getName()]; info.global_range_size.clear(); for(unsigned i = 0; i < passes.tune.get_num_global_range(); i++) info.global_range_size.push_back(passes.tune.get_global_range_size(i)); @@ -78,14 +84,18 @@ std::unique_ptr jit::make_llvm_module(ir::module &module, passes_w return std::unique_ptr(result); } -std::unique_ptr jit::make_triton_module(const char *name, const char *src) { +triton::lang::translation_unit *jit::parse_program(const char *name, const char *src) { // create AST from Triton-C source YY_BUFFER_STATE buffer = yy_scan_string(src); yyparse(); yy_delete_buffer(buffer); triton::lang::translation_unit *program = ast_root; + return program; +} + +std::unique_ptr jit::make_triton_module(const char * name, triton::ir::context &context, triton::lang::translation_unit *program) { // create Triton-IR from AST - ir::module* module = new ir::module(name, triton_context_); + ir::module* module = new ir::module(name, context); program->codegen(module); return std::unique_ptr(module); } @@ -98,7 +108,8 @@ jit::~jit(){ } std::vector jit::get_valid(const char *name, const char *src) { // find metaparameters - auto ptt_module = make_triton_module(name, src); + triton::lang::translation_unit* program = parse_program(name, src); + auto ptt_module = make_triton_module(name, triton_context_, program); ir::module &tt_module = *ptt_module; // set parameters passes_wrapper passes(target_.get()); @@ -111,6 +122,7 @@ std::vector jit::get_valid(const char *name, const char *src) { ranges.push_back(mp->get_space()); // iterate over parameters std::vector result; + size_t nthreads = 1; loop_nest(ranges, [&](const std::vector params){ if(!result.empty()) return; @@ -128,7 +140,7 @@ std::vector jit::get_valid(const char *name, const char *src) { if(!errors.empty()) return; result = params; - }); + }, nthreads); if(result.empty()) throw std::runtime_error("couldn't find valid parameters"); return result; @@ -138,72 +150,77 @@ std::vector jit::get_valid(const char *name, const char *src) { jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t benchmark) { // find metaparameters - auto ptt_module = make_triton_module(name, src); - ir::module &tt_module = *ptt_module; + triton::lang::translation_unit* program = parse_program(name, src); + auto ptt_module_0 = make_triton_module(name, triton_context_, program); + 
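  // The source is parsed to an AST once; tt_module_0 exists only to run the
  // tuner and enumerate the metaparameter space. Each candidate configuration
  // later re-generates its own ir::module from that AST into a private
  // triton::ir::context and llvm::LLVMContext, so concurrent tuning jobs
  // never share mutable compiler state.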
ir::module &tt_module_0 = *ptt_module_0; // set parameters - passes_wrapper passes(target_.get()); - passes.target_independent(tt_module); - passes.tune.run(tt_module); - auto mps = passes.tune.get_params(tt_module); + passes_wrapper passes_0(target_.get()); + passes_0.target_independent(tt_module_0); + passes_0.tune.run(tt_module_0); // create parameter ranges std::vector> ranges; + auto mps = passes_0.tune.get_params(tt_module_0); for(ir::metaparameter *mp: mps) ranges.push_back(mp->get_space()); // iterate over parameters - unsigned i; tune_res_t best; + size_t nthreads = 4; + std::mutex mutex; loop_nest(ranges, [&](const std::vector params){ std::map> errors; - i = 0; - for(ir::metaparameter *mp: mps) - mp->set_value(params[i++]); - passes.target_independent(tt_module); - passes.tune.init(tt_module); - passes.tune.check_constraints(errors); -// for(auto x: errors) -// for(auto err: x.second) -// std::cout << err << std::endl; + unsigned i = 0; + { + std::lock_guard lock(mutex); + for(ir::metaparameter *mp: mps) + mp->set_value(params[i++]); + passes_0.tune.init(tt_module_0); + passes_0.tune.check_constraints(errors); + } if(!errors.empty()) return; // Deep copy of the module and tuner - auto ptt_module = make_triton_module(name, src); - ir::module &tt_module = *ptt_module; -// for(unsigned p: params) -// std::cout << p << " " << std::flush; - passes_wrapper passes(target_.get()); - passes.target_independent(tt_module); - passes.tune.run(tt_module); + triton::ir::context triton_context; + auto ptt_module_1 = make_triton_module(name, triton_context, program); + ir::module &tt_module_1 = *ptt_module_1; + // run passes + passes_wrapper passes_1(target_.get()); + passes_1.target_independent(tt_module_1); + passes_1.tune.run(tt_module_1); i = 0; - for(ir::metaparameter* mp: passes.tune.get_params(tt_module)){ + for(ir::metaparameter* mp: passes_1.tune.get_params(tt_module_1)){ mp->set_value(params[i++]); } - passes.tune.init(tt_module); - passes.target_dependent(tt_module); + passes_1.tune.init(tt_module_1); + passes_1.target_dependent(tt_module_1); driver::device* device = driver_context_->device(); - if(passes.shmem_allocation.get_allocated_size() > device->max_shared_memory()) + if(passes_1.shmem_allocation.get_allocated_size() > device->max_shared_memory()) return; - if(passes.tune.get_num_threads() > device->max_threads_per_block()) + if(passes_1.tune.get_num_threads() > device->max_threads_per_block()) return; // Compile - auto ll_module = make_llvm_module(tt_module, passes); + launch_information info; + llvm::LLVMContext llvm_context; + auto ll_module = make_llvm_module(tt_module_1, passes_1, llvm_context, info); std::unique_ptr module(driver::module::create(driver_context_, &*ll_module)); std::unique_ptr kernel(driver::kernel::create(module.get(), name)); - launch_information info = launch_info_map_.at(name); - for(unsigned p: params) - std::cout << p << " " << std::flush; // add globals - for(auto x: tt_module.globals()) + for(auto x: tt_module_1.globals()) global_ints_[x.first] = ((ir::metaparameter*)x.second)->get_value(); modules_.insert({name, module.get()}); double perf; perf = benchmark(kernel.get(), info); - if(perf > best.perf){ - best.perf = perf; - best.params = params; + { + std::lock_guard lock(mutex); + if(perf > best.perf){ + best.perf = perf; + best.params = params; + } + for(unsigned p: params) + std::cout << p << " " << std::flush; + std::cout << perf << " [ " << best.perf << " ] " << std::endl; } - std::cout << perf << " [ " << best.perf << " ] " << std::endl; 
modules_.erase(name); - }); + }, nthreads); return best; } @@ -227,9 +244,9 @@ void jit::add_module(ir::module &tt_module, const std::vector ¶ms) if(errors.size()) throw std::runtime_error("invalid parameters"); // triton module -> llvm module - auto ll_module = make_llvm_module(tt_module, passes); - // llvm module -> machine code std::string name = tt_module.get_name(); + auto ll_module = make_llvm_module(tt_module, passes, llvm_context_, launch_info_map_[name]); + // llvm module -> machine code modules_.insert({name, driver::module::create(driver_context_, &*ll_module)}); // add globals for(auto x: tt_module.globals()) @@ -237,7 +254,8 @@ void jit::add_module(ir::module &tt_module, const std::vector ¶ms) } void jit::add_module(const char *name, const char *src, const std::vector ¶ms) { - auto ptt_module = make_triton_module(name, src); + triton::lang::translation_unit* program = parse_program(name, src); + auto ptt_module = make_triton_module(name, triton_context_, program); add_module(*ptt_module, params); } From 434f65737f4ba2d7692500a3a46edcee3ea532fe Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 15 Jul 2019 12:35:53 -0700 Subject: [PATCH 238/494] [runtime] put jit::launch_info in another file --- examples/cpp/shift.cpp | 8 +- include/triton/dnn/base.h | 4 +- include/triton/dnn/batchnorm.h | 4 +- include/triton/dnn/conv.h | 3 +- include/triton/dnn/gemm.h | 3 +- include/triton/dnn/shift.h | 3 +- include/triton/runtime/jit.h | 11 +- include/triton/runtime/launch_info.h | 19 ++++ include/triton/tools/thread_pool.h | 161 +++++++++++++++++++++++++++ lib/dnn/base.cpp | 20 ++-- lib/dnn/batchnorm.cpp | 9 +- lib/dnn/conv.cpp | 7 +- lib/dnn/gemm.cpp | 9 +- lib/dnn/shift.cpp | 6 +- lib/runtime/jit.cpp | 21 ++-- 15 files changed, 227 insertions(+), 61 deletions(-) create mode 100644 include/triton/runtime/launch_info.h create mode 100644 include/triton/tools/thread_pool.h diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index 6941dfa0d..33ded064e 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -14,13 +14,13 @@ int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); - auto op = triton::dnn::shift::FPROP; + auto op = triton::dnn::shift::WGRAD; // initialization int32_t R = 3, S = 3; - int32_t B = 16, F = 512; - int32_t H = 16, W = 16; - int32_t C = 512; + int32_t B = 32, F = 128; + int32_t H = 28, W = 28; + int32_t C = 128; // random shifts std::vector shift_h(C); diff --git a/include/triton/dnn/base.h b/include/triton/dnn/base.h index 7aeab2a14..e8ba1c47e 100644 --- a/include/triton/dnn/base.h +++ b/include/triton/dnn/base.h @@ -25,6 +25,7 @@ #include "triton/driver/stream.h" #include "triton/driver/kernel.h" +#include "triton/runtime/launch_info.h" namespace triton{ namespace dnn{ @@ -45,8 +46,7 @@ private: // enqueue virtual void enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, - const std::vector& ranges, - size_t nthreads) = 0; + triton::runtime::launch_information info) = 0; // number of flops virtual size_t num_flops() const = 0; // comparison for maps diff --git a/include/triton/dnn/batchnorm.h b/include/triton/dnn/batchnorm.h index df2a2df30..496e19ae4 100644 --- a/include/triton/dnn/batchnorm.h +++ b/include/triton/dnn/batchnorm.h @@ -40,7 +40,7 @@ private: // enqueue void enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, - const std::vector &ranges, size_t nthreads); + triton::runtime::launch_information info); // number of 
flops size_t num_flops() const; // comparison for maps @@ -72,7 +72,7 @@ private: // enqueue void enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, - const std::vector &ranges, size_t nthreads); + runtime::launch_information info); // number of flops size_t num_flops() const; // comparison for maps diff --git a/include/triton/dnn/conv.h b/include/triton/dnn/conv.h index 67d621050..1b6f2d778 100644 --- a/include/triton/dnn/conv.h +++ b/include/triton/dnn/conv.h @@ -33,8 +33,7 @@ private: driver::buffer *bias); void enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, - const std::vector& ranges, - size_t nthreads); + triton::runtime::launch_information info); // number of flops size_t num_flops() const; // comparison for maps diff --git a/include/triton/dnn/gemm.h b/include/triton/dnn/gemm.h index 26ed7d68a..8348edf3e 100644 --- a/include/triton/dnn/gemm.h +++ b/include/triton/dnn/gemm.h @@ -13,8 +13,7 @@ private: // enqueue void enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, - const std::vector& ranges, - size_t nthreads); + triton::runtime::launch_information info); // number of flops size_t num_flops() const; // comparison for maps diff --git a/include/triton/dnn/shift.h b/include/triton/dnn/shift.h index ec4ffc753..8f33aee66 100644 --- a/include/triton/dnn/shift.h +++ b/include/triton/dnn/shift.h @@ -54,8 +54,7 @@ private: void init_impl(driver::stream *stream, driver::cu_module *module); void enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, - const std::vector& ranges, - size_t nthreads); + triton::runtime::launch_information info); public: diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index 9b0f75f96..c594eccd8 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -20,6 +20,7 @@ #include "triton/codegen/alignment_info.h" #include "triton/codegen/target.h" #include "triton/codegen/vectorize.h" +#include "triton/runtime/launch_info.h" #include namespace llvm { @@ -42,12 +43,10 @@ class context; class metaparameter; } +namespace runtime{ + class jit { public: - struct launch_information{ - std::vector global_range_size; - unsigned num_threads; - }; typedef std::function benchmark_t; struct tune_res_t{ @@ -114,7 +113,6 @@ public: void add_module(const char* name, const char* src, const std::vector& params = {}); driver::kernel* get_function(const char* name); launch_information get_launch_info(const char* name); - unsigned get_int(const char* name); private: std::map modules_; @@ -122,11 +120,10 @@ private: llvm::LLVMContext llvm_context_; ir::context triton_context_; std::map launch_info_map_; - std::map global_ints_; std::shared_ptr target_; }; - +} } #endif diff --git a/include/triton/runtime/launch_info.h b/include/triton/runtime/launch_info.h new file mode 100644 index 000000000..a6a0ddb5b --- /dev/null +++ b/include/triton/runtime/launch_info.h @@ -0,0 +1,19 @@ +#ifndef TRITON_INCLUDE_RUNTIME_LAUNCH_INFO_H +#define TRITON_INCLUDE_RUNTIME_LAUNCH_INFO_H + +#include +#include + +namespace triton{ +namespace runtime{ + +struct launch_information{ + std::vector global_range_size; + unsigned num_threads; + std::map globals; +}; + +} +} + +#endif diff --git a/include/triton/tools/thread_pool.h b/include/triton/tools/thread_pool.h new file mode 100644 index 000000000..5d01511e1 --- /dev/null +++ b/include/triton/tools/thread_pool.h @@ -0,0 +1,161 @@ +#ifndef CONCURRENT_THREADPOOL_H +#define CONCURRENT_THREADPOOL_H + 
+#include +#include +#include +#include +#include +#include +#include + +namespace nbsdx { +namespace concurrent { + +/** + * Simple ThreadPool that creates `ThreadCount` threads upon its creation, + * and pulls from a queue to get new jobs. The default is 10 threads. + * + * This class requires a number of c++11 features be present in your compiler. + */ +class thread_pool { + + std::vector threads_; + std::list> queue_; + + std::atomic_int jobs_left_; + std::atomic_bool bailout_; + std::atomic_bool finished_; + std::condition_variable job_available_var_; + std::condition_variable wait_var_; + std::mutex wait_mutex_; + std::mutex queue_mutex_; + unsigned thread_count_; + + /** + * Take the next job in the queue and run it. + * Notify the main thread that a job has completed. + */ + void task() { + while( !bailout_ ) { + next_job()(); + --jobs_left_; + wait_var_.notify_one(); + } + } + + /** + * Get the next job; pop the first item in the queue, + * otherwise wait for a signal from the main thread. + */ + std::function next_job() { + std::function res; + std::unique_lock job_lock( queue_mutex_ ); + + // Wait for a job if we don't have any. + job_available_var_.wait( job_lock, [this]() ->bool { return queue_.size() || bailout_; } ); + + // Get job from the queue + if( !bailout_ ) { + res = queue_.front(); + queue_.pop_front(); + } + else { // If we're bailing out, 'inject' a job into the queue to keep jobs_left accurate. + res = []{}; + ++jobs_left_; + } + return res; + } + +public: + thread_pool(unsigned thread_count = 4) + : jobs_left_( 0 ) + , bailout_( false ) + , finished_( false ) + , thread_count_(thread_count) + { + threads_.resize(thread_count_); + for( unsigned i = 0; i < thread_count_; ++i ) + threads_[ i ] = std::thread( [this]{ this->task(); } ); + } + + /** + * JoinAll on deconstruction + */ + ~thread_pool() { + join_all(); + } + + /** + * Get the number of threads in this pool + */ + inline unsigned size() const { + return thread_count_; + } + + /** + * Get the number of jobs left in the queue. + */ + inline unsigned jobs_remaining() { + std::lock_guard guard( queue_mutex_ ); + return queue_.size(); + } + + /** + * Add a new job to the pool. If there are no jobs in the queue, + * a thread is woken up to take the job. If all threads are busy, + * the job is added to the end of the queue. + */ + void add_job( std::function job ) { + std::lock_guard guard( queue_mutex_ ); + queue_.emplace_back( job ); + ++jobs_left_; + job_available_var_.notify_one(); + } + + /** + * Join with all threads. Block until all threads have completed. + * Params: WaitForAll: If true, will wait for the queue to empty + * before joining with threads. If false, will complete + * current jobs, then inform the threads to exit. + * The queue will be empty after this call, and the threads will + * be done. After invoking `ThreadPool::JoinAll`, the pool can no + * longer be used. If you need the pool to exist past completion + * of jobs, look to use `ThreadPool::WaitAll`. + */ + void join_all( bool WaitForAll = true ) { + if( !finished_ ) { + if( WaitForAll ) { + wait_all(); + } + + // note that we're done, and wake up any thread that's + // waiting for a new job + bailout_ = true; + job_available_var_.notify_all(); + + for( auto &x : threads_ ) + if( x.joinable() ) + x.join(); + finished_ = true; + } + } + + /** + * Wait for the pool to empty before continuing. + * This does not call `std::thread::join`, it only waits until + * all jobs have finshed executing. 
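// Design note: the pool tracks two counts. queue_.size() covers only jobs
// not yet started, while jobs_left_ also includes jobs in flight; wait_all()
// blocks on the latter, and next_job() re-increments it when bailing out so
// that the matching decrement in task() keeps the count balanced.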
+ */ + void wait_all() { + if( jobs_left_ > 0 ) { + std::unique_lock lk( wait_mutex_ ); + wait_var_.wait( lk, [this]{ return this->jobs_left_ == 0; } ); + lk.unlock(); + } + } +}; + +} // namespace concurrent +} // namespace nbsdx + +#endif //CONCURRENT_THREADPOOL_H diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index b3bf6c05a..f5e2af0b2 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -23,29 +23,30 @@ base::base(const std::string& name) : name_(name) { } void base::enqueue(driver::stream *stream, std::vector args, bool autotune) { - static std::map, cmp_recompile> m_jit; + namespace rt = triton::runtime; + static std::map, cmp_recompile> m_jit; driver::context* ctx = stream->context(); - triton::jit* jit; + rt::jit* jit; /* the current template has not already been compiled */ if(m_jit.find(this) == m_jit.end()) { - jit = m_jit.emplace(this->clone(), new triton::jit(ctx)).first->second.get(); + jit = m_jit.emplace(this->clone(), new rt::jit(ctx)).first->second.get(); std::ostringstream oss; triton_c_src(oss); std::string src = oss.str(); auto benchmark = [&](triton::driver::kernel* kernel, - triton::jit::launch_information info) { + rt::launch_information info) { // launch info unsigned nthreads = info.num_threads; init_impl(stream, (triton::driver::cu_module*)kernel->module()); - enqueue_impl(stream, kernel, args, info.global_range_size, nthreads); + enqueue_impl(stream, kernel, args, info); stream->synchronize(); - double ts = triton::tools::bench([&](){ enqueue_impl(stream, kernel, args, info.global_range_size, nthreads); }, + double ts = triton::tools::bench([&](){ enqueue_impl(stream, kernel, args, info); }, [&](){ stream->synchronize(); }, ctx->device()); return num_flops() / ts * 1e-3; }; // auto-tune and save result if(autotune) { - triton::jit::tune_res_t best = jit->autotune(name_.c_str(), src.c_str(), benchmark); + rt::jit::tune_res_t best = jit->autotune(name_.c_str(), src.c_str(), benchmark); jit->add_module(name_.c_str(), src.c_str(), best.params); } else { @@ -60,10 +61,9 @@ void base::enqueue(driver::stream *stream, std::vector args, b /* get launch parameters */ driver::kernel* kernel = jit->get_function(name_.c_str()); - triton::jit::launch_information info = jit->get_launch_info(name_.c_str()); + rt::launch_information info = jit->get_launch_info(name_.c_str()); /* launch */ - enqueue_impl(stream, kernel, args, - info.global_range_size, info.num_threads); + enqueue_impl(stream, kernel, args, info); } } diff --git a/lib/dnn/batchnorm.cpp b/lib/dnn/batchnorm.cpp index 54bb9c16e..34275a931 100644 --- a/lib/dnn/batchnorm.cpp +++ b/lib/dnn/batchnorm.cpp @@ -54,8 +54,7 @@ base* batchnorm_forward::clone() const { void batchnorm_forward::enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, - const std::vector&, - size_t nthreads) + runtime::launch_information info) { driver::buffer *y = args[0], *m = args[1], *v = args[2]; driver::buffer *x = args[3], *g = args[4], *b = args[5]; @@ -69,7 +68,7 @@ void batchnorm_forward::enqueue_impl(driver::stream *stream, driver::kernel *ker kernel->setArg(6, DHWB_); kernel->setArg(7, rcpDHWB_); kernel->setArg(8, eps_); - stream->enqueue(kernel, grid, {nthreads, 1, 1}); + stream->enqueue(kernel, grid, {info.num_threads, 1, 1}); } void batchnorm_forward::triton_c_src(std::ostream &os) const { @@ -154,7 +153,7 @@ base* batchnorm_backward::clone() const { void batchnorm_backward::enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, - const std::vector &, size_t nthreads) { + 
runtime::launch_information info) { driver::buffer *dx = args[0], *dg = args[1], *db = args[2], *dy = args[3]; driver::buffer *x = args[4], *g = args[5], *m = args[6], *v = args[7]; std::array grid = {1, (size_t)C_, 1}; @@ -169,7 +168,7 @@ void batchnorm_backward::enqueue_impl(driver::stream *stream, driver::kernel *ke kernel->setArg(8, (int32_t)(D_*H_*W_*B_)); kernel->setArg(9, (float)1/(D_*H_*W_*B_)); kernel->setArg(10, eps_); - stream->enqueue(kernel, grid, {nthreads, 1, 1}); + stream->enqueue(kernel, grid, {info.num_threads, 1, 1}); } void batchnorm_backward::triton_c_src(std::ostream &os) const { diff --git a/lib/dnn/conv.cpp b/lib/dnn/conv.cpp index 011cd7a53..c20701a4b 100644 --- a/lib/dnn/conv.cpp +++ b/lib/dnn/conv.cpp @@ -365,10 +365,9 @@ void conv::set_arg(driver::kernel *kernel, void conv::enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, - const std::vector& ranges, - size_t nthreads) { + runtime::launch_information info) { driver::buffer *a = args[0], *b = args[1], *c = args[2], *bias = args[3]; - unsigned TM = ranges[0], TN = ranges[1]; + unsigned TM = info.global_range_size[0], TN = info.global_range_size[1]; unsigned GZ = 1; set_arg(kernel, a, b, c, bias); std::array grid = {1}; @@ -411,7 +410,7 @@ void conv::enqueue_impl(driver::stream *stream, driver::kernel *kernel, kernel->setArg(38, (pad_w_ + (1 - upsample_w_)*off_uw)/upsample_w_); kernel->setArg(39, (off_uh + pad_h_) % upsample_h_); kernel->setArg(40, (off_uw + pad_w_) % upsample_w_); - stream->enqueue(kernel, grid, {nthreads, 1, 1}); + stream->enqueue(kernel, grid, {info.num_threads, 1, 1}); } } diff --git a/lib/dnn/gemm.cpp b/lib/dnn/gemm.cpp index 6ea1a8c21..139062db8 100644 --- a/lib/dnn/gemm.cpp +++ b/lib/dnn/gemm.cpp @@ -47,11 +47,10 @@ void gemm::init_impl(driver::stream* stream, driver::cu_module *) { void gemm::enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, - const std::vector& ranges, - size_t nthreads) { + runtime::launch_information info) { driver::buffer *a = args[0], *b = args[1], *c = args[2]; - unsigned TM = ranges[0]; - unsigned TN = ranges[1]; + unsigned TM = info.global_range_size[0]; + unsigned TN = info.global_range_size[1]; unsigned grid_0 = (M_ + TM - 1)/TM; unsigned grid_1 = (N_ + TN - 1)/TN; unsigned grid_2 = 1; @@ -68,7 +67,7 @@ void gemm::enqueue_impl(driver::stream *stream, driver::kernel *kernel, kernel->setArg(9, locks_); kernel->setArg(10, grid_0); kernel->setArg(11, grid_1); - stream->enqueue(kernel, grid, {nthreads, 1, 1}); + stream->enqueue(kernel, grid, {info.num_threads, 1, 1}); } std::vector gemm::default_params() { diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index 872189c89..cc6dccc4d 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -199,7 +199,7 @@ void shift::init_impl(driver::stream *stream, driver::cu_module *module) { void shift::enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, - const std::vector &ranges, size_t nthreads) { + runtime::launch_information info) { driver::buffer *a = args[0], *b = args[1], *c = args[2]; kernel->setArg(0, a); kernel->setArg(1, b); @@ -228,13 +228,13 @@ void shift::enqueue_impl(driver::stream *stream, driver::kernel *kernel, kernel->setArg(24, BW_); kernel->setArg(25, CH_); kernel->setArg(26, CW_); - unsigned TM = ranges[0], TN = ranges[1]; + unsigned TM = info.global_range_size[0], TN = info.global_range_size[1]; std::array grid = {(M_ + TM - 1)/TM, (N_ + TN - 1)/TN, 1}; if(op_ == BPROP){ size_t c_nbytes = (c_ty_ == "fp16") ? 
2 : 4; ((driver::cu_buffer*)c)->set_zero(stream, AH_*AW_*B_*C_*c_nbytes); } - stream->enqueue(kernel, grid, {nthreads, 1, 1}); + stream->enqueue(kernel, grid, {info.num_threads, 1, 1}); } void shift::triton_c_src(std::ostream &os) const { diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index 51f3ed916..b55680a21 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -29,6 +29,7 @@ extern void yy_delete_buffer(YY_BUFFER_STATE buffer); extern triton::lang::translation_unit *ast_root; namespace triton { +namespace runtime{ void loop_nest(std::vector const & ranges, std::function const &)> const & f, @@ -80,6 +81,10 @@ std::unique_ptr jit::make_llvm_module(ir::module &module, passes_w info.global_range_size.clear(); for(unsigned i = 0; i < passes.tune.get_num_global_range(); i++) info.global_range_size.push_back(passes.tune.get_global_range_size(i)); + // add globals + for(auto x: module.globals()) + info.globals[x.first] = ((ir::metaparameter*)x.second)->get_value(); + // number of threads info.num_threads = passes.tune.get_num_threads(); return std::unique_ptr(result); } @@ -164,7 +169,7 @@ jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t ben ranges.push_back(mp->get_space()); // iterate over parameters tune_res_t best; - size_t nthreads = 4; + size_t nthreads = 1; std::mutex mutex; loop_nest(ranges, [&](const std::vector params){ std::map> errors; @@ -203,10 +208,6 @@ jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t ben auto ll_module = make_llvm_module(tt_module_1, passes_1, llvm_context, info); std::unique_ptr module(driver::module::create(driver_context_, &*ll_module)); std::unique_ptr kernel(driver::kernel::create(module.get(), name)); - // add globals - for(auto x: tt_module_1.globals()) - global_ints_[x.first] = ((ir::metaparameter*)x.second)->get_value(); - modules_.insert({name, module.get()}); double perf; perf = benchmark(kernel.get(), info); { @@ -219,7 +220,6 @@ jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t ben std::cout << p << " " << std::flush; std::cout << perf << " [ " << best.perf << " ] " << std::endl; } - modules_.erase(name); }, nthreads); return best; } @@ -248,9 +248,6 @@ void jit::add_module(ir::module &tt_module, const std::vector ¶ms) auto ll_module = make_llvm_module(tt_module, passes, llvm_context_, launch_info_map_[name]); // llvm module -> machine code modules_.insert({name, driver::module::create(driver_context_, &*ll_module)}); - // add globals - for(auto x: tt_module.globals()) - global_ints_[x.first] = ((ir::metaparameter*)x.second)->get_value(); } void jit::add_module(const char *name, const char *src, const std::vector ¶ms) { @@ -263,12 +260,10 @@ driver::kernel *jit::get_function(const char *name) { return driver::kernel::create(modules_.at(name), name); } -jit::launch_information jit::get_launch_info(const char *name) { +launch_information jit::get_launch_info(const char *name) { return launch_info_map_.at(name); } -unsigned jit::get_int(const char *name){ - return global_ints_.at(name); -} } +} From aa8bcf6bde843ea2ce995678afa31a34b2fe15e0 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 15 Jul 2019 21:03:58 -0700 Subject: [PATCH 239/494] [dnn/shift] added split-k for shift-conv --- examples/cpp/shift.cpp | 10 +- examples/python/tensorflow/run.py | 8 +- include/triton/dnn/base.h | 2 + include/triton/dnn/shift.h | 4 + include/triton/tools/bench.hpp | 20 +- lib/dnn/base.cpp | 18 +- lib/dnn/shift.cpp | 298 ++++++++++++------------------ 
lib/lang/expression.cpp | 4 + lib/runtime/jit.cpp | 5 +- 9 files changed, 166 insertions(+), 203 deletions(-) diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index 33ded064e..482fad6b4 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -18,8 +18,8 @@ int main() { // initialization int32_t R = 3, S = 3; - int32_t B = 32, F = 128; - int32_t H = 28, W = 28; + int32_t B = 128, F = 128; + int32_t H = 16, W = 16; int32_t C = 128; // random shifts @@ -44,9 +44,9 @@ int main() { std::swap(b_size, c_size); std::swap(a_size, b_size); } - std::vector ha(B*C*H*W); - std::vector hb(C*F); - std::vector hc(B*F*H*W); + std::vector ha(a_size); + std::vector hb(b_size); + std::vector hc(c_size); std::vector rc(hc.size()); // device buffers triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*4); diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 971ad2898..893fc5b10 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -58,9 +58,9 @@ def blocksparse_matmul_grad(op, dy): return (dx, dw) def run_shift(): - B, C, H, W = 1, 16, 4, 4 + B, C, H, W = 2, 16, 4, 4 R, S, F = 3, 3, 16 - stride_h, stride_w = 2, 2 + stride_h, stride_w = 1, 1 np.random.seed(2) a = tf.placeholder(tf.float16, shape=[B, C, H, W]) b = tf.placeholder(tf.float16, shape=[C, F]) @@ -82,8 +82,8 @@ def run_shift(): dx_t, dx_n = grads[0] #import sys #np.set_printoptions(threshold=sys.maxsize) - print(dx_t) - print(dx_n) + print(dw_t) + print(dw_n) print(np.max(np.abs(dw_t - dw_n))) print(np.max(np.abs(dx_t - dx_n))) # Run diff --git a/include/triton/dnn/base.h b/include/triton/dnn/base.h index e8ba1c47e..3045ffb49 100644 --- a/include/triton/dnn/base.h +++ b/include/triton/dnn/base.h @@ -43,6 +43,8 @@ protected: private: // initialize virtual void init_impl(driver::stream *, driver::cu_module *){ } + // deinitialize + virtual void deinit_impl(){ } // enqueue virtual void enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, diff --git a/include/triton/dnn/shift.h b/include/triton/dnn/shift.h index 8f33aee66..fbff404ca 100644 --- a/include/triton/dnn/shift.h +++ b/include/triton/dnn/shift.h @@ -52,6 +52,7 @@ public: private: // initialize and enqueue void init_impl(driver::stream *stream, driver::cu_module *module); + void deinit_impl(); void enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, triton::runtime::launch_information info); @@ -163,6 +164,9 @@ private: bool BT_; // layout layout_t layout_; + // locks + size_t max_locks_; + driver::buffer *locks_; }; } diff --git a/include/triton/tools/bench.hpp b/include/triton/tools/bench.hpp index f37c04371..3c584bb02 100644 --- a/include/triton/tools/bench.hpp +++ b/include/triton/tools/bench.hpp @@ -32,15 +32,17 @@ double bench(OP const & op, SYNC const & sync, const triton::driver::device * de double total_time = 0; op(); sync(); - float norm = 1; - // normalize clock if possible to get roughly constant result - if(auto cu_device = dynamic_cast(device)) - norm = (float)cu_device->current_sm_clock()/cu_device->max_sm_clock(); - tmr.start(); - op(); - sync(); - times.push_back(norm*tmr.get().count()); - total_time+=times.back(); +// while(total_time*1e-9 < 1e-3){ + float norm = 1; + // normalize clock if possible to get roughly constant result + if(auto cu_device = dynamic_cast(device)) + norm = (float)cu_device->current_sm_clock()/cu_device->max_sm_clock(); + tmr.start(); + op(); + sync(); + 
times.push_back(norm*tmr.get().count()); + total_time+=times.back(); +// } return *std::min_element(times.begin(), times.end()); } diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index f5e2af0b2..c4f5ace3e 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -29,19 +29,20 @@ void base::enqueue(driver::stream *stream, std::vector args, b rt::jit* jit; /* the current template has not already been compiled */ if(m_jit.find(this) == m_jit.end()) { - jit = m_jit.emplace(this->clone(), new rt::jit(ctx)).first->second.get(); + base* clone = this->clone(); + jit = m_jit.emplace(clone, new rt::jit(ctx)).first->second.get(); std::ostringstream oss; - triton_c_src(oss); + clone->triton_c_src(oss); std::string src = oss.str(); auto benchmark = [&](triton::driver::kernel* kernel, rt::launch_information info) { // launch info - unsigned nthreads = info.num_threads; - init_impl(stream, (triton::driver::cu_module*)kernel->module()); - enqueue_impl(stream, kernel, args, info); + clone->init_impl(stream, (triton::driver::cu_module*)kernel->module()); + clone->enqueue_impl(stream, kernel, args, info); stream->synchronize(); - double ts = triton::tools::bench([&](){ enqueue_impl(stream, kernel, args, info); }, + double ts = triton::tools::bench([&](){ clone->enqueue_impl(stream, kernel, args, info); }, [&](){ stream->synchronize(); }, ctx->device()); + clone->deinit_impl(); return num_flops() / ts * 1e-3; }; // auto-tune and save result @@ -53,7 +54,7 @@ void base::enqueue(driver::stream *stream, std::vector args, b jit->add_module(name_.c_str(), src.c_str(), jit->get_valid(name_.c_str(), src.c_str())); } triton::driver::kernel* kernel = jit->get_function(name_.c_str()); - init_impl(stream, (triton::driver::cu_module*)kernel->module()); + clone->init_impl(stream, (triton::driver::cu_module*)kernel->module()); } /* retrieved compiled template */ else @@ -63,7 +64,8 @@ void base::enqueue(driver::stream *stream, std::vector args, b driver::kernel* kernel = jit->get_function(name_.c_str()); rt::launch_information info = jit->get_launch_info(name_.c_str()); /* launch */ - enqueue_impl(stream, kernel, args, info); + auto it = m_jit.find(this); + it->first->enqueue_impl(stream, kernel, args, info); } } diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index cc6dccc4d..87aaf32cb 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -124,6 +124,9 @@ shift::shift(int B, int C, if(layout_ == NCHW) shapes_c_ = {B, C, AH_, AW_}; } + // locks + max_locks_ = (op_ == WGRAD) ? 8192 : 0; + locks_ = nullptr; } base* shift::clone() const { @@ -195,11 +198,30 @@ void shift::init_impl(driver::stream *stream, driver::cu_module *module) { build_delta_a(); triton::driver::buffer* delta_a = ((triton::driver::cu_module*)module)->symbol("delta_a"); stream->write(delta_a, false, 0, h_delta_a.size()*4, h_delta_a.data()); + // locks + if(locks_ == nullptr && max_locks_ > 0){ + std::vector hlocks(2*max_locks_, 0); + locks_ = triton::driver::buffer::create(stream->context(), 2*max_locks_*4); + stream->write(locks_, false, 0, hlocks); + } +} + +void shift::deinit_impl() { + if(locks_ != nullptr){ + delete locks_; + locks_ = nullptr; + } } void shift::enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, runtime::launch_information info) { + unsigned TM = info.global_range_size[0], TN = info.global_range_size[1]; + unsigned grid_0 = (M_ + TM - 1)/TM; + unsigned grid_1 = (N_ + TN - 1)/TN; + unsigned num_locks = grid_0 * grid_1; + unsigned grid_2 = num_locks < max_locks_ ? 
info.globals.at("GZ") : 1; + std::array grid = {grid_0, grid_1, grid_2}; driver::buffer *a = args[0], *b = args[1], *c = args[2]; kernel->setArg(0, a); kernel->setArg(1, b); @@ -228,8 +250,9 @@ void shift::enqueue_impl(driver::stream *stream, driver::kernel *kernel, kernel->setArg(24, BW_); kernel->setArg(25, CH_); kernel->setArg(26, CW_); - unsigned TM = info.global_range_size[0], TN = info.global_range_size[1]; - std::array grid = {(M_ + TM - 1)/TM, (N_ + TN - 1)/TN, 1}; + kernel->setArg(27, (num_locks > max_locks_) ? nullptr : locks_); + kernel->setArg(28, (int32_t)grid[0]); + kernel->setArg(29, (int32_t)grid[1]); if(op_ == BPROP){ size_t c_nbytes = (c_ty_ == "fp16") ? 2 : 4; ((driver::cu_buffer*)c)->set_zero(stream, AH_*AW_*B_*C_*c_nbytes); @@ -256,12 +279,49 @@ void shift::triton_c_src(std::ostream &os) const { std::string BS = BS0 + ", " + BS1; bool is_chwn = layout_ == CHWN; + auto compute_bhw = [&](std::string rx, std::string sz, std::string rkx){ + if(is_chwn) { + return R"( + int32 )" + rx + "wh[" + sz + "] = " + rkx + R"( / NB; + int32 )" + rx + "b[" + sz + "] = " + rkx + R"( % NB; + int32 )" + rx + "w[" + sz + "] = " + rx + R"(wh % CW; + int32 )" + rx + "h[" + sz + "] = " + rx + R"(wh / CW;)"; + } + else { + return R"( + int32 )" + rx + "bh[" + sz + "] = " + rkx + R"( / CW; + int32 )" + rx + "w[" + sz + "] = " + rkx + R"( % CW; + int32 )" + rx + "h[" + sz + "] = " + rx + R"(bh % CH; + int32 )" + rx + "b[" + sz + "] = " + rx + R"(bh / CH;)"; + } + }; + + auto compute_interior = [&](std::string rx, std::string sz0, std::string sz1) { + std::string result; + if(shift_edge_h_) + result += "int1 interiorh[" + sz0 + "] = 1;\n "; + else + result += "int1 interiorh[" + sz0 + "] = (" + rx + "h >= pad_h) && (" + rx + "h < (AH - pad_h));\n "; + if(shift_edge_w_) + result += "int1 interiorw[" + sz0 + "] = 1;"; + else + result += "int1 interiorw[" + sz0 + "] = (" + rx + "w >= pad_w) && (" + rx + "w < (AW - pad_w));"; + result += R"( + int1 interior[)" + sz0 + ", " + sz1 + "] = interiorh[:, newaxis] && interiorw[:, newaxis];"; + return result; + }; + std::string result = R"( const tunable int32 TM = {16, 32, 64, 128}; const tunable int32 TN = {16, 32, 64, 128}; -const tunable int32 TK = {)" + std::to_string(TK_) + R"(}; +const tunable int32 TK = {)" + std::to_string(TK_) + "};"; +if(op_ == WGRAD) + result += "const tunable int32 GZ = {1, 4, 16};"; +else + result += "const tunable int32 GZ = {1};"; +result += R"( __constant__ int32* delta_a = alloc_const int32[)" + std::to_string(MAX_C_) + R"(]; void shift(restrict read_only align(16) )" + a_ty_ + R"( *A, @@ -275,32 +335,32 @@ void shift(restrict read_only align(16) )" + a_ty_ + R"( *A, int32 NB, int32 AH, int32 AW, int32 BH, int32 BW, - int32 CH, int32 CW) { + int32 CH, int32 CW, + int32* locks, int32 grid0, int32 grid1) { int32 rxa[TM] = get_global_range[TM](0); int32 ryb[TN] = get_global_range[TN](1); + int32 rz = get_global_range[1](2); int32 rka[TK] = 0 ... TK; int32 rkb[TK] = 0 ... 
TK; fp32 acc[TM, TN] = 0; int32 pad_h = BH / 2; - int32 pad_w = BW / 2;)"; + int32 pad_w = BW / 2; + int32 split = select(locks == 0, 1, GZ); + int32 div = K / split; + int32 rem = K % split; + K = select(rz < rem, div - 1, div); + int32 offk = select(rz < rem, rz*(div + 1), rz*div + rem);)"; +if(op_ == WGRAD){ + result += R"( + rka = rka + offk; + rkb = rkb + offk; + )"; +} /* A offsets */ if(op_ == FPROP){ - if(is_chwn){ - result += R"( - int32 rawh[TM] = rxa / NB; - int32 rab[TM] = rxa % NB; - int32 raw[TM] = rawh % CW; - int32 rah[TM] = rawh / CW;)"; - } - else{ - result += R"( - int32 rabh[TM] = rxa / CW; - int32 raw[TM] = rxa % CW; - int32 rah[TM] = rabh % CH; - int32 rab[TM] = rabh / CH;)"; - } - result += R"( + result += + compute_bhw("ra", "TM", "rxa") + R"( raw = raw * stride_w; rah = rah * stride_h; int32 offxa[TM] = rab*lda_b + raw*lda_w + rah*lda_h; @@ -309,35 +369,12 @@ if(op_ == FPROP){ int32 d[TK] = *pd; int32 offa_interior[TM, TK] = d[newaxis, :]; int32 offa_exterior[TM, TK] = rka[newaxis, :] * lda_c; - )"; - if(shift_edge_h_) - result += " int1 interiorh[TM] = 1;\n"; - else - result += " int1 interiorh[TM] = (rah >= pad_h) && (rah < (AH - pad_h));\n"; - if(shift_edge_w_) - result += " int1 interiorw[TM] = 1;"; - else - result += " int1 interiorw[TM] = (raw >= pad_w) && (raw < (AW - pad_w));"; - result += R"( - int1 interior[TM, TK] = interiorh[:, newaxis] && interiorw[:, newaxis]; + )" + compute_interior("ra", "TM", "TK") + R"( int32 offa1[TM, TK] = interior ? offa_interior : offa_exterior;)"; } if(op_ == BPROP){ - if(is_chwn){ - result += R"( - int32 rawh[TM] = rxa / NB; - int32 rab[TM] = rxa % NB; - int32 raw[TM] = rawh % CW; - int32 rah[TM] = rawh / CW;)"; - } - else{ - result += R"( - int32 rabh[TM] = rxa / CW; - int32 raw[TM] = rxa % CW; - int32 rah[TM] = rabh % CH; - int32 rab[TM] = rabh / CH;)"; - } - result += R"( + result += + compute_bhw("ra", "TM", "rxa") + R"( int32 offxa[TM] = rab*lda_b + raw*lda_w + rah*lda_h; int32 offa0[TM, TK] = offxa[:, newaxis]; int32 offa1[TM, TK] = rka[newaxis, :] * lda_c;)"; @@ -348,21 +385,8 @@ if(op_ == WGRAD && layout_ == CHWN){ int32 offa1[TK, TM] = rka[:, newaxis];)"; } if(op_ == WGRAD && layout_ == NCHW){ - if(is_chwn){ - result += R"( - int32 rawh[TK] = rka / NB; - int32 rab[TK] = rka % NB; - int32 raw[TK] = rawh % CW; - int32 rah[TK] = rawh / CW;)"; - } - else{ - result += R"( - int32 rabh[TK] = rka / CW; - int32 raw[TK] = rka % CW; - int32 rah[TK] = rabh % CH; - int32 rab[TK] = rabh / CH;)"; - } - result += R"( + result += + compute_bhw("ra", "TK", "rka") + R"( int32 offa0[TK, TM] = rxa[newaxis, :] * lda_c; int32 offxa[TK] = rab*lda_b + raw*lda_w + rah*lda_h; int32 offa1[TK, TM] = offxa[:, newaxis];)"; @@ -380,38 +404,15 @@ if(op_ == BPROP){ int32 offb1[TK, TN] = rkb[:, newaxis];)"; } if(op_ == WGRAD){ - if(is_chwn){ - result += R"( - int32 rbwh[TK] = rkb / NB; - int32 rbb[TK] = rkb % NB; - int32 rbw[TK] = rbwh % CW; - int32 rbh[TK] = rbwh / CW;)"; - } - else{ - result += R"( - int32 rbbh[TK] = rkb / CW; - int32 rbw[TK] = rkb % CW; - int32 rbh[TK] = rbbh % CH; - int32 rbb[TK] = rbbh / CH;)"; - } - result += R"( + result += + compute_bhw("rb", "TK", "rkb") + R"( __constant__ int32* pd[TN] = delta_a + ryb; int32 d[TN] = *pd; int32 shift[TK, TN] = d[newaxis, :]; rbw = rbw * stride_w; rbh = rbh * stride_h; int32 offkb[TK] = rbb*ldb_b + rbw*ldb_w + rbh*ldb_h; - )"; - if(shift_edge_h_) - result += " int1 interiorh[TK] = 1;\n"; - else - result += " int1 interiorh[TK] = (rbh >= pad_h) && (rbh < (AH - pad_h));\n"; - if(shift_edge_w_) 
- result += " int1 interiorw[TK] = 1;"; - else - result += " int1 interiorw[TK] = (rbw >= pad_w) && (rbw < (AW - pad_w));"; - result += R"( - int1 interior[TK, TN] = interiorh[:, newaxis] && interiorw[:, newaxis]; + )" + compute_interior("rb", "TK", "TN") + R"( int32 incb[TK, TN] = interior ? shift : 0; int32 offb0[TK, TN] = ryb[newaxis, :] * ldb_c; int32 offb1[TK, TN] = offkb[:, newaxis] + incb;)"; @@ -421,8 +422,8 @@ if(op_ == WGRAD){ result += R"( )" + a_ty_ + "* pa[" + AS + R"(] = A + offa0 + offa1; )" + b_ty_ + "* pb[" + BS + R"(] = B + offb0 + offb1; - int1 checka[)" + AS + "] = (rka < K)" + bca0 + R"(; - int1 checkb[)" + BS + "] = (rkb < K)" + bcb0 + R"(; + int1 checka[)" + AS + "] = (rka < K + offk)" + bca0 + R"(; + int1 checkb[)" + BS + "] = (rkb < K + offk)" + bcb0 + R"(; )" + a_ty_ + " a[" + AS + R"(] = checka ? *pa : 0; )" + b_ty_ + " b[" + BS + R"(] = checkb ? *pb : 0; for(int32 k = K; k > 0; k = k - TK){ @@ -450,22 +451,8 @@ if(op_ == WGRAD && layout_ == CHWN){ } if(op_ == WGRAD && layout_ == NCHW){ result += R"( - rka = rka + TK;)"; - if(is_chwn){ - result += R"( - int32 rawh[TK] = rka / NB; - int32 rab[TK] = rka % NB; - int32 raw[TK] = rawh % CW; - int32 rah[TK] = rawh / CW;)"; - } - else{ - result += R"( - int32 rabh[TK] = rka / CW; - int32 raw[TK] = rka % CW; - int32 rah[TK] = rabh % CH; - int32 rab[TK] = rabh / CH;)"; - } - result += R"( + rka = rka + TK;)" + + compute_bhw("ra", "TK", "rka") + R"( offxa = rab*lda_b + raw*lda_w + rah*lda_h; pa = A + offa0 + offxa[:, newaxis];)"; } @@ -475,36 +462,12 @@ if(op_ == WGRAD && layout_ == NCHW){ /* Increment B pointers */ if(op_ == WGRAD){ result += R"( - rkb = rkb + TK;)"; - if(is_chwn){ - result += R"( - int32 rbwh[TK] = rkb / NB; - int32 rbb[TK] = rkb % NB; - int32 rbw[TK] = rbwh % CW; - int32 rbh[TK] = rbwh / CW;)"; - } - else{ - result += R"( - int32 rbbh[TK] = rkb / CW; - int32 rbw[TK] = rkb % CW; - int32 rbh[TK] = rbbh % CH; - int32 rbb[TK] = rbbh / CH;)"; - } - result += R"( + rkb = rkb + TK;)" + + compute_bhw("rb", "TK", "rkb") + R"( rbw = rbw * stride_w; rbh = rbh * stride_h; offkb = rbb*ldb_b + rbw*ldb_w + rbh*ldb_h; - )"; - if(shift_edge_h_) - result += " interiorh = 1;\n"; - else - result += " interiorh = (rbh >= pad_h) && (rbh < (AH - pad_h));\n"; - if(shift_edge_w_) - result += " interiorw = 1;"; - else - result += " interiorw = (rbw >= pad_w) && (rbw < (AW - pad_w));"; - result += R"( - interior = interiorh[:, newaxis] && interiorw[:, newaxis]; + )" + compute_interior("rb", "TK", "TN") + R"( incb = interior ? 
shift : 0; pb = B + offb0 + offkb[:, newaxis] + incb;)"; } @@ -524,41 +487,15 @@ if(op_ == BPROP){ /* C offsets */ if(op_ == BPROP){ - if(is_chwn){ - result += R"( - int32 rcwh[TM] = rxc / NB; - int32 rcb[TM] = rxc % NB; - int32 rcw[TM] = rcwh % CW; - int32 rch[TM] = rcwh / CW;)"; - } - else{ - result += R"( - int32 rcbh[TM] = rxc / CW; - int32 rcw[TM] = rxc % CW; - int32 rch[TM] = rcbh % CH; - int32 rcb[TM] = rcbh / CH;)"; - } - result += R"( + result += + compute_bhw("rc", "TM", "rxc") + R"( rcw = rcw * stride_w; rch = rch * stride_h; int32 offxc[TM] = rcb*ldc_b + rcw*ldc_w + rch*ldc_h;)"; } if(op_ == FPROP){ -if(is_chwn){ - result += R"( - int32 rcwh[TM] = rxc / NB; - int32 rcb[TM] = rxc % NB; - int32 rcw[TM] = rcwh % CW; - int32 rch[TM] = rcwh / CW;)"; -} -else{ - result += R"( - int32 rcbh[TM] = rxc / CW; - int32 rcw[TM] = rxc % CW; - int32 rch[TM] = rcbh % CH; - int32 rcb[TM] = rcbh / CH;)"; -} - result += R"( + result += + compute_bhw("rc", "TM", "rxc") + R"( int32 offxc[TM] = rcb*ldc_b + rcw*ldc_w + rch*ldc_h;)"; } if(op_ == WGRAD){ @@ -572,17 +509,8 @@ if(op_ == WGRAD){ int1 checkc1[TN] = ryc < N; int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];)"; if(op_ == BPROP){ - result += "\n"; - if(shift_edge_h_) - result += " int1 interiorh[TM] = 1;\n"; - else - result += " int1 interiorh[TM] = (rch >= pad_h) && (rch < (AH - pad_h));\n"; - if(shift_edge_w_) - result += " int1 interiorw[TM] = 1;"; - else - result += " int1 interiorw[TM] = (rcw >= pad_w) && (rcw < (AW - pad_w));"; - result += R"( - int1 interior[TM, TN] = interiorh[:, newaxis] && interiorw[:, newaxis]; + result += R"( + )" + compute_interior("rc", "TM", "TN") + R"( __constant__ int32* pd[TN] = delta_a + ryc; )" + c_ty_ + R"(* shift_pc[TM, TN] = pc + (*pd)[newaxis, :]; pc = interior ? 
shift_pc : pc; @@ -591,12 +519,32 @@ if(op_ == BPROP){ } else{ result += R"( - @checkc *pc = c;)"; + int1 has_lock = (GZ > 1) && (locks != 0); + if(has_lock){ + int32 ridx = get_range_id(0); + int32 ridy = get_range_id(1); + int32 *plock = locks + ridx + ridy*grid0; + while(__atomic_cas(plock, 0, 1)); + int32 *pcount = plock + grid0*grid1; + int32 count = *pcount; + int32 countp1 = select(count == split - 1, 0, count + 1); + if(count == 0) { + @checkc *pc = c; + *pcount = countp1; + } + else { + @checkc *pc = c + *pc; + *pcount = countp1; + } + *plock = 0; + } + else{ + @checkc *pc = c; + })"; } result += R"( })"; -// std::cout << result << std::endl; os << result; } diff --git a/lib/lang/expression.cpp b/lib/lang/expression.cpp index 85e98a771..1e0536801 100644 --- a/lib/lang/expression.cpp +++ b/lib/lang/expression.cpp @@ -73,10 +73,14 @@ ir::value *binary_expression::llvm_op(ir::module *mod, ir::builder &builder, ir: return builder.create_icmpSGE(lhs, rhs, name); if(op_ == GE && is_int && !is_signed) return builder.create_icmpUGE(lhs, rhs, name); + if(op_ == EQ && is_ptr) + return builder.create_icmpEQ(lhs, rhs, name); if(op_ == EQ && is_float) return builder.create_fcmpOEQ(lhs, rhs, name); if(op_ == EQ && is_int) return builder.create_icmpEQ(lhs, rhs, name); + if(op_ == NE && is_ptr) + return builder.create_icmpNE(lhs, rhs, name); if(op_ == NE && is_float) return builder.create_fcmpONE(lhs, rhs, name); if(op_ == NE && is_int) diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index b55680a21..15d33b029 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -43,7 +43,8 @@ void loop_nest(std::vector const & ranges, size_t current = 0; while(true){ //Execute function - pool.add_job([values, &f](){ f(values); }); +// pool.add_job([values, &f](){ f(values); }); + f(values); //Increment counters while(values[i]++ == ranges[i] - 1){ if(i == 0) @@ -169,7 +170,7 @@ jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t ben ranges.push_back(mp->get_space()); // iterate over parameters tune_res_t best; - size_t nthreads = 1; + size_t nthreads = 4; std::mutex mutex; loop_nest(ranges, [&](const std::vector params){ std::map> errors; From f50d7a420a5ac4a4788a5cb1fb891663810c5f6f Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 15 Jul 2019 21:16:50 -0700 Subject: [PATCH 240/494] [runtime/jit] fixed bug in multi-threaded auto-tuning --- lib/dnn/shift.cpp | 2 +- lib/runtime/jit.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index 87aaf32cb..d06366550 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -317,7 +317,7 @@ const tunable int32 TM = {16, 32, 64, 128}; const tunable int32 TN = {16, 32, 64, 128}; const tunable int32 TK = {)" + std::to_string(TK_) + "};"; if(op_ == WGRAD) - result += "const tunable int32 GZ = {1, 4, 16};"; + result += "const tunable int32 GZ = {1};"; else result += "const tunable int32 GZ = {1};"; diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index 15d33b029..fa970653a 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -43,8 +43,8 @@ void loop_nest(std::vector const & ranges, size_t current = 0; while(true){ //Execute function -// pool.add_job([values, &f](){ f(values); }); - f(values); + pool.add_job([values, &f](){ f(values); }); +// f(values); //Increment counters while(values[i]++ == ranges[i] - 1){ if(i == 0) @@ -210,9 +210,9 @@ jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t ben std::unique_ptr 
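The expression.cpp hunk just above fills a gap in binary_expression::llvm_op: EQ and NE on pointer operands previously matched no case, which presumably broke comparisons such as locks == 0 in the kernel source above. Pointers now take the integer-comparison path, checked alongside the float and int cases. A self-contained analogue of the dispatch (C++; the enum and helper are illustrative stand-ins for the real lowering):

#include <cstdio>
enum Kind { PTR, INT, FLOAT };
const char* cmp_instruction(Kind k, bool eq) {
  if (k == FLOAT) return eq ? "fcmp.oeq" : "fcmp.one";  // ordered float compares
  return eq ? "icmp.eq" : "icmp.ne";                    // ints and pointers share icmp
}
int main() { std::puts(cmp_instruction(PTR, true)); }   // prints icmp.eq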
module(driver::module::create(driver_context_, &*ll_module)); std::unique_ptr kernel(driver::kernel::create(module.get(), name)); double perf; - perf = benchmark(kernel.get(), info); { std::lock_guard lock(mutex); + perf = benchmark(kernel.get(), info); if(perf > best.perf){ best.perf = perf; best.params = params; From 7d1797cd3271bc53d7a93461cae1369c3f894461 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 16 Jul 2019 12:59:27 -0700 Subject: [PATCH 241/494] ugh --- examples/cpp/dot.cpp | 2 +- examples/cpp/shift.cpp | 2 +- examples/python/tensorflow/run.py | 6 +----- lib/codegen/selection.cpp | 2 -- lib/dnn/shift.cpp | 28 +++++++++++++++------------- lib/runtime/jit.cpp | 15 +++++++++------ 6 files changed, 27 insertions(+), 28 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 7612f7c16..cada07cd4 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -13,7 +13,7 @@ int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); // matrix multiplication parameters - int32_t M = 4096, N = 4096, K = 4096; + int32_t M = 131072, N = 128, K = 128; std::vector hc(M*N); std::vector rc(M*N); std::vector ha(M*K); diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index 482fad6b4..020dba23a 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -14,7 +14,7 @@ int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); - auto op = triton::dnn::shift::WGRAD; + auto op = triton::dnn::shift::FPROP; // initialization int32_t R = 3, S = 3; diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 893fc5b10..665ec8cd0 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -66,16 +66,12 @@ def run_shift(): b = tf.placeholder(tf.float16, shape=[C, F]) hshift_h = np.random.randint(- (R//2), R//2 + 1, size=C, dtype=np.int32) hshift_w = np.random.randint(- (S//2), R//2 + 1, size=C, dtype=np.int32) - #hshift_h = np.zeros(C, dtype=np.int32) - #hshift_w = np.zeros(C, dtype=np.int32) c = module.shift_conv(a, b, stride_h=stride_h, stride_w=stride_w, shift_h=tf.make_tensor_proto(hshift_h), shift_w=tf.make_tensor_proto(hshift_w)) # feed values ha = np.random.rand(B, C, H, W)*0.1 hb = np.random.rand(C, F)*0.1 - #ha = np.ones((B, C, H, W), dtype=np.float16) - #hb = np.ones((C, F), dtype=np.float16) sess = tf.InteractiveSession() - # test + # check gradients grads = tf.test.compute_gradient([a, b], [(B, C, H, W), (C, F)], c, (B, F, H//stride_h, W//stride_w), extra_feed_dict = {a: ha, b: hb}, delta=1e-2) dw_t, dw_n = grads[1] diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 7ca8fb6ee..f83eea5c7 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -1118,8 +1118,6 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & unsigned max_contiguous = axis_info_->get_max_contiguous(ptr); unsigned alignment = std::min(starting_multiple, max_contiguous); unsigned vector_size = std::min(result->axis(0).contiguous, alignment); - vector_size = 1; -// vector_size = result->axis(0).contiguous; std::map packets; distributed_tile *TP = (distributed_tile*)tmap_.at(ld->get_pointer_operand()); result->for_each([&](indices_t idx){ diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index d06366550..b9e580506 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -223,6 +223,7 @@ void shift::enqueue_impl(driver::stream *stream, 
driver::kernel *kernel, unsigned grid_2 = num_locks < max_locks_ ? info.globals.at("GZ") : 1; std::array grid = {grid_0, grid_1, grid_2}; driver::buffer *a = args[0], *b = args[1], *c = args[2]; +// std::cout << op_ << " " << M_ << " " << N_ << " " << K_ << std::endl; kernel->setArg(0, a); kernel->setArg(1, b); kernel->setArg(2, c); @@ -253,6 +254,9 @@ void shift::enqueue_impl(driver::stream *stream, driver::kernel *kernel, kernel->setArg(27, (num_locks > max_locks_) ? nullptr : locks_); kernel->setArg(28, (int32_t)grid[0]); kernel->setArg(29, (int32_t)grid[1]); + kernel->setArg(30, (int32_t)grid[2]); + if(locks_) + ((driver::cu_buffer*)locks_)->set_zero(stream, 2*max_locks_*4); if(op_ == BPROP){ size_t c_nbytes = (c_ty_ == "fp16") ? 2 : 4; ((driver::cu_buffer*)c)->set_zero(stream, AH_*AW_*B_*C_*c_nbytes); @@ -317,7 +321,7 @@ const tunable int32 TM = {16, 32, 64, 128}; const tunable int32 TN = {16, 32, 64, 128}; const tunable int32 TK = {)" + std::to_string(TK_) + "};"; if(op_ == WGRAD) - result += "const tunable int32 GZ = {1};"; + result += "const tunable int32 GZ = {1, 4, 16};"; else result += "const tunable int32 GZ = {1};"; @@ -329,14 +333,14 @@ void shift(restrict read_only align(16) )" + a_ty_ + R"( *A, )" + c_ty_ + R"( *C, int32 M, int32 N, int32 K, int32 stride_h, int32 stride_w, - int32 lda_b, int32 lda_w, int32 lda_h, int32 lda_c, - int32 ldb_b, int32 ldb_w, int32 ldb_h, int32 ldb_c, - int32 ldc_b, int32 ldc_w, int32 ldc_h, int32 ldc_c, + multiple_of(4) int32 lda_b, multiple_of(4) int32 lda_w, multiple_of(4) int32 lda_h, multiple_of(4) int32 lda_c, + multiple_of(4) int32 ldb_b, multiple_of(4) int32 ldb_w, multiple_of(4) int32 ldb_h, multiple_of(4) int32 ldb_c, + multiple_of(4) int32 ldc_b, multiple_of(4) int32 ldc_w, multiple_of(4) int32 ldc_h, multiple_of(4) int32 ldc_c, int32 NB, int32 AH, int32 AW, int32 BH, int32 BW, int32 CH, int32 CW, - int32* locks, int32 grid0, int32 grid1) { + int32* locks, int32 grid0, int32 grid1, int32 grid2) { int32 rxa[TM] = get_global_range[TM](0); int32 ryb[TN] = get_global_range[TN](1); int32 rz = get_global_range[1](2); @@ -345,9 +349,8 @@ void shift(restrict read_only align(16) )" + a_ty_ + R"( *A, fp32 acc[TM, TN] = 0; int32 pad_h = BH / 2; int32 pad_w = BW / 2; - int32 split = select(locks == 0, 1, GZ); - int32 div = K / split; - int32 rem = K % split; + int32 div = K / grid2; + int32 rem = K % grid2; K = select(rz < rem, div - 1, div); int32 offk = select(rz < rem, rz*(div + 1), rz*div + rem);)"; if(op_ == WGRAD){ @@ -366,7 +369,7 @@ if(op_ == FPROP){ int32 offxa[TM] = rab*lda_b + raw*lda_w + rah*lda_h; int32 offa0[TM, TK] = offxa[:, newaxis]; __constant__ int32* pd[TK] = delta_a + rka; - int32 d[TK] = *pd; + multiple_of(4) int32 d[TK] = *pd; int32 offa_interior[TM, TK] = d[newaxis, :]; int32 offa_exterior[TM, TK] = rka[newaxis, :] * lda_c; )" + compute_interior("ra", "TM", "TK") + R"( @@ -524,18 +527,17 @@ else{ int32 ridx = get_range_id(0); int32 ridy = get_range_id(1); int32 *plock = locks + ridx + ridy*grid0; - while(__atomic_cas(plock, 0, 1)); int32 *pcount = plock + grid0*grid1; + while(__atomic_cas(plock, 0, 1) == 1); int32 count = *pcount; - int32 countp1 = select(count == split - 1, 0, count + 1); + int32 countp1 = select(count == grid2 - 1, 0, count + 1); if(count == 0) { @checkc *pc = c; - *pcount = countp1; } else { @checkc *pc = c + *pc; - *pcount = countp1; } + *pcount = countp1; *plock = 0; } else{ diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index fa970653a..ec38b7fa1 100644 --- a/lib/runtime/jit.cpp +++ 
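The shift.cpp hunk above tightens the cross-group reduction: the pcount pointer is computed before acquiring the lock, the spin explicitly retries while the CAS returns 1 (lock held), and the counter is written exactly once after the store/accumulate branch. A host-side C++ analogue of the protocol, with std::atomic standing in for the kernel builtins (a sketch under that substitution, not the kernel itself):

#include <atomic>
// One representative per group calls this; lock and count start at zero.
void reduce_into(float* pc, float c, std::atomic<int>& lock, int& count, int ngroups) {
  int expected = 0;
  while (!lock.compare_exchange_strong(expected, 1, std::memory_order_acquire))
    expected = 0;                                  // CAS(0 -> 1) until we own the lock
  if (count == 0) *pc = c;                         // first group initializes the tile
  else            *pc += c;                        // later groups accumulate into it
  count = (count == ngroups - 1) ? 0 : count + 1;  // single write; wraps for reuse
  lock.store(0, std::memory_order_release);        // release
}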
b/lib/runtime/jit.cpp @@ -37,20 +37,24 @@ void loop_nest(std::vector const & ranges, size_t D = ranges.size(); std::vector values(D, 0); // thread pools - nbsdx::concurrent::thread_pool pool(nthreads); +// nbsdx::concurrent::thread_pool pool(nthreads); // Start with innermost loop size_t i = D - 1; - size_t current = 0; +// size_t current = 0; while(true){ //Execute function - pool.add_job([values, &f](){ f(values); }); -// f(values); +// pool.add_job([values, &f](){ f(values); }); + f(values); //Increment counters while(values[i]++ == ranges[i] - 1){ if(i == 0) return; values[i--] = 0; } +// if(current++ >= 1024){ +// current = 0; +// pool.join_all(); +// } i = D - 1; } } @@ -128,7 +132,6 @@ std::vector jit::get_valid(const char *name, const char *src) { ranges.push_back(mp->get_space()); // iterate over parameters std::vector result; - size_t nthreads = 1; loop_nest(ranges, [&](const std::vector params){ if(!result.empty()) return; @@ -146,7 +149,7 @@ std::vector jit::get_valid(const char *name, const char *src) { if(!errors.empty()) return; result = params; - }, nthreads); + }, 1); if(result.empty()) throw std::runtime_error("couldn't find valid parameters"); return result; From 28959fe1652fd07654cf575de9d6d4010184c335 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 16 Jul 2019 14:41:38 -0700 Subject: [PATCH 242/494] [runtime/jit] made auto-tuning silent --- examples/cpp/dot.cpp | 6 +- examples/python/pytorch/shift.cpp | 2 +- include/triton/runtime/jit.h | 3 +- include/triton/tools/thread_pool.h | 239 +++++++++++------------------ lib/codegen/tune.cpp | 2 +- lib/runtime/jit.cpp | 25 ++- 6 files changed, 105 insertions(+), 172 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index cada07cd4..25cf1c0a7 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -13,7 +13,7 @@ int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); // matrix multiplication parameters - int32_t M = 131072, N = 128, K = 128; + int32_t M = 32768, N = 128, K = 128; std::vector hc(M*N); std::vector rc(M*N); std::vector ha(M*K); @@ -33,8 +33,8 @@ int main() { stream->write(db, true, 0, hb); stream->write(dc, true, 0, hc); stream->synchronize(); - triton::dnn::gemm gemm(M, N, K, AT, BT, "fp32", "fp32", 4, 4); - gemm.enqueue(stream, {da, db, dc}); + triton::dnn::gemm gemm(M, N, K, AT, BT, "fp16", "fp16", 4, 4); + gemm.enqueue(stream, {da, db, dc}, true); stream->read(dc, true, 0, hc); gemm.cpu_ref(rc, ha, hb); for(size_t i = 0; i < M*N; i++) diff --git a/examples/python/pytorch/shift.cpp b/examples/python/pytorch/shift.cpp index e3e968db6..7efe0198b 100644 --- a/examples/python/pytorch/shift.cpp +++ b/examples/python/pytorch/shift.cpp @@ -75,7 +75,7 @@ torch::Tensor shift_common( triton::driver::cu_buffer c(ctx, (CUdeviceptr)torchc.storage().data(), false); // Enqueue - shift.enqueue(&stream, {&a, &b, &c}); + shift.enqueue(&stream, {&a, &b, &c}, true); return torchc; } diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index c594eccd8..8c2fa41b8 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -105,7 +105,7 @@ private: triton::lang::translation_unit *parse_program(const char *name, const char *src); public: - jit(driver::context* context); + jit(driver::context* context, unsigned nthreads = 4); ~jit(); std::vector get_valid(const char *name, const char *src); tune_res_t autotune(const char* name, const char* src, benchmark_t benchmark); @@ -121,6 +121,7 @@ private: 
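loop_nest, which these patches toggle between threaded and serial execution several times, enumerates the whole tuning space with a mixed-radix odometer: the innermost digit is incremented after each visit, and every digit that overflows resets and carries leftward. A self-contained serial sketch of the traversal (C++):

#include <functional>
#include <vector>
void visit_all(const std::vector<size_t>& ranges,
               const std::function<void(const std::vector<size_t>&)>& f) {
  if (ranges.empty()) return;
  size_t D = ranges.size();
  std::vector<size_t> values(D, 0);
  size_t i = D - 1;                         // start at the innermost digit
  while (true) {
    f(values);
    while (values[i]++ == ranges[i] - 1) {  // did this digit overflow?
      if (i == 0) return;                   // leftmost digit overflowed: done
      values[i--] = 0;                      // reset and carry left
    }
    i = D - 1;                              // next increment is innermost again
  }
}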
ir::context triton_context_; std::map launch_info_map_; std::shared_ptr target_; + unsigned nthreads_; }; } diff --git a/include/triton/tools/thread_pool.h b/include/triton/tools/thread_pool.h index 5d01511e1..0475bdb24 100644 --- a/include/triton/tools/thread_pool.h +++ b/include/triton/tools/thread_pool.h @@ -1,161 +1,98 @@ -#ifndef CONCURRENT_THREADPOOL_H -#define CONCURRENT_THREADPOOL_H +#ifndef THREAD_POOL_H +#define THREAD_POOL_H -#include +#include +#include +#include #include #include -#include -#include -#include #include +#include +#include +#include -namespace nbsdx { -namespace concurrent { - -/** - * Simple ThreadPool that creates `ThreadCount` threads upon its creation, - * and pulls from a queue to get new jobs. The default is 10 threads. - * - * This class requires a number of c++11 features be present in your compiler. - */ -class thread_pool { - - std::vector threads_; - std::list> queue_; - - std::atomic_int jobs_left_; - std::atomic_bool bailout_; - std::atomic_bool finished_; - std::condition_variable job_available_var_; - std::condition_variable wait_var_; - std::mutex wait_mutex_; - std::mutex queue_mutex_; - unsigned thread_count_; - - /** - * Take the next job in the queue and run it. - * Notify the main thread that a job has completed. - */ - void task() { - while( !bailout_ ) { - next_job()(); - --jobs_left_; - wait_var_.notify_one(); - } - } - - /** - * Get the next job; pop the first item in the queue, - * otherwise wait for a signal from the main thread. - */ - std::function next_job() { - std::function res; - std::unique_lock job_lock( queue_mutex_ ); - - // Wait for a job if we don't have any. - job_available_var_.wait( job_lock, [this]() ->bool { return queue_.size() || bailout_; } ); - - // Get job from the queue - if( !bailout_ ) { - res = queue_.front(); - queue_.pop_front(); - } - else { // If we're bailing out, 'inject' a job into the queue to keep jobs_left accurate. - res = []{}; - ++jobs_left_; - } - return res; - } - +class ThreadPool { public: - thread_pool(unsigned thread_count = 4) - : jobs_left_( 0 ) - , bailout_( false ) - , finished_( false ) - , thread_count_(thread_count) - { - threads_.resize(thread_count_); - for( unsigned i = 0; i < thread_count_; ++i ) - threads_[ i ] = std::thread( [this]{ this->task(); } ); - } + ThreadPool(size_t); + template + auto enqueue(F&& f, Args&&... args) + -> std::future::type>; + ~ThreadPool(); +private: + // need to keep track of threads so we can join them + std::vector< std::thread > workers; + // the task queue + std::queue< std::function > tasks; - /** - * JoinAll on deconstruction - */ - ~thread_pool() { - join_all(); - } - - /** - * Get the number of threads in this pool - */ - inline unsigned size() const { - return thread_count_; - } - - /** - * Get the number of jobs left in the queue. - */ - inline unsigned jobs_remaining() { - std::lock_guard guard( queue_mutex_ ); - return queue_.size(); - } - - /** - * Add a new job to the pool. If there are no jobs in the queue, - * a thread is woken up to take the job. If all threads are busy, - * the job is added to the end of the queue. - */ - void add_job( std::function job ) { - std::lock_guard guard( queue_mutex_ ); - queue_.emplace_back( job ); - ++jobs_left_; - job_available_var_.notify_one(); - } - - /** - * Join with all threads. Block until all threads have completed. - * Params: WaitForAll: If true, will wait for the queue to empty - * before joining with threads. If false, will complete - * current jobs, then inform the threads to exit. 
- * The queue will be empty after this call, and the threads will - * be done. After invoking `ThreadPool::JoinAll`, the pool can no - * longer be used. If you need the pool to exist past completion - * of jobs, look to use `ThreadPool::WaitAll`. - */ - void join_all( bool WaitForAll = true ) { - if( !finished_ ) { - if( WaitForAll ) { - wait_all(); - } - - // note that we're done, and wake up any thread that's - // waiting for a new job - bailout_ = true; - job_available_var_.notify_all(); - - for( auto &x : threads_ ) - if( x.joinable() ) - x.join(); - finished_ = true; - } - } - - /** - * Wait for the pool to empty before continuing. - * This does not call `std::thread::join`, it only waits until - * all jobs have finshed executing. - */ - void wait_all() { - if( jobs_left_ > 0 ) { - std::unique_lock lk( wait_mutex_ ); - wait_var_.wait( lk, [this]{ return this->jobs_left_ == 0; } ); - lk.unlock(); - } - } + // synchronization + std::mutex queue_mutex; + std::condition_variable condition; + bool stop; }; -} // namespace concurrent -} // namespace nbsdx +// the constructor just launches some amount of workers +inline ThreadPool::ThreadPool(size_t threads) + : stop(false) +{ + for(size_t i = 0;i task; -#endif //CONCURRENT_THREADPOOL_H + { + std::unique_lock lock(this->queue_mutex); + this->condition.wait(lock, + [this]{ return this->stop || !this->tasks.empty(); }); + if(this->stop && this->tasks.empty()) + return; + task = std::move(this->tasks.front()); + this->tasks.pop(); + } + + task(); + } + } + ); +} + +// add new work item to the pool +template +auto ThreadPool::enqueue(F&& f, Args&&... args) + -> std::future::type> +{ + using return_type = typename std::result_of::type; + + auto task = std::make_shared< std::packaged_task >( + std::bind(std::forward(f), std::forward(args)...) 
+ ); + + std::future res = task->get_future(); + { + std::unique_lock lock(queue_mutex); + + // don't allow enqueueing after stopping the pool + if(stop) + throw std::runtime_error("enqueue on stopped ThreadPool"); + + tasks.emplace([task](){ (*task)(); }); + } + condition.notify_one(); + return res; +} + +// the destructor joins all threads +inline ThreadPool::~ThreadPool() +{ + { + std::unique_lock lock(queue_mutex); + stop = true; + } + condition.notify_all(); + for(std::thread &worker: workers) + worker.join(); +} + +#endif diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 3b4fc0492..e7273d5d5 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -223,7 +223,7 @@ void tune::run(ir::module &mod) { } else { ir::metaparameter *fpw = ir::metaparameter::create(ctx, ty, 2, 2); - ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 1, 4); + ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 2, 4); connected_components(node, {fpw, wpt}, {"fpw", "wpt"}, nodes_, dependencies_, group_id++); } } diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index ec38b7fa1..779e24a3f 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -37,24 +37,18 @@ void loop_nest(std::vector const & ranges, size_t D = ranges.size(); std::vector values(D, 0); // thread pools -// nbsdx::concurrent::thread_pool pool(nthreads); + ThreadPool pool(nthreads); // Start with innermost loop size_t i = D - 1; // size_t current = 0; while(true){ //Execute function -// pool.add_job([values, &f](){ f(values); }); - f(values); - //Increment counters + pool.enqueue([values, &f](){ f(values); }); while(values[i]++ == ranges[i] - 1){ if(i == 0) return; values[i--] = 0; } -// if(current++ >= 1024){ -// current = 0; -// pool.join_all(); -// } i = D - 1; } } @@ -111,8 +105,9 @@ std::unique_ptr jit::make_triton_module(const char * name, triton::i } -jit::jit(driver::context *context): driver_context_(context), - target_(context->device()->make_target()) { } +jit::jit(driver::context *context, unsigned nthreads): driver_context_(context), + target_(context->device()->make_target()), + nthreads_(nthreads) { } jit::~jit(){ } @@ -173,7 +168,6 @@ jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t ben ranges.push_back(mp->get_space()); // iterate over parameters tune_res_t best; - size_t nthreads = 4; std::mutex mutex; loop_nest(ranges, [&](const std::vector params){ std::map> errors; @@ -220,11 +214,12 @@ jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t ben best.perf = perf; best.params = params; } - for(unsigned p: params) - std::cout << p << " " << std::flush; - std::cout << perf << " [ " << best.perf << " ] " << std::endl; +// for(unsigned p: params) +// std::cout << p << " " << std::flush; +// std::cout << perf << " [ " << best.perf << " ] " << std::endl; } - }, nthreads); + }, nthreads_); + std::cout << "Autotuning done - Best performance: " << best.perf << std::endl; return best; } From 164d85077f3ba5c7797c0e5ac88601c7d6f2c957 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 16 Jul 2019 15:03:53 -0700 Subject: [PATCH 243/494] more stuff --- examples/cpp/dot.cpp | 2 +- lib/dnn/base.cpp | 2 +- lib/dnn/gemm.cpp | 2 +- lib/runtime/jit.cpp | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 25cf1c0a7..c19d43e2e 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -13,7 +13,7 @@ int main() { // initialize default compute device auto context = 
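The rewritten thread_pool.h above is the familiar futures-based design (it closely matches the widely circulated progschj/ThreadPool): workers drain a mutex-protected queue of std::packaged_task jobs, and enqueue hands the caller a std::future for each result. A minimal usage sketch against that header (assuming the include path below):

#include "triton/tools/thread_pool.h"
#include <future>
#include <vector>
int main() {
  ThreadPool pool(4);                       // spawns 4 worker threads
  std::vector<std::future<int>> results;
  for (int i = 0; i < 8; ++i)
    results.emplace_back(pool.enqueue([i] { return i * i; }));
  int sum = 0;
  for (auto& r : results)
    sum += r.get();                         // blocks until each task completes
  return sum == 140 ? 0 : 1;                // 0 + 1 + 4 + ... + 49 == 140
}

Unlike the old nbsdx pool, completion tracking moves to the caller: the pool no longer counts jobs_left, the returned futures do.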
triton::driver::backend::contexts::get_default(); // matrix multiplication parameters - int32_t M = 32768, N = 128, K = 128; + int32_t M = 8192, N = 8192, K = 8192; std::vector hc(M*N); std::vector rc(M*N); std::vector ha(M*K); diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index c4f5ace3e..a3a3ce403 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -30,7 +30,7 @@ void base::enqueue(driver::stream *stream, std::vector args, b /* the current template has not already been compiled */ if(m_jit.find(this) == m_jit.end()) { base* clone = this->clone(); - jit = m_jit.emplace(clone, new rt::jit(ctx)).first->second.get(); + jit = m_jit.emplace(clone, std::unique_ptr(new rt::jit(ctx))).first->second.get(); std::ostringstream oss; clone->triton_c_src(oss); std::string src = oss.str(); diff --git a/lib/dnn/gemm.cpp b/lib/dnn/gemm.cpp index 139062db8..4e79b5b7a 100644 --- a/lib/dnn/gemm.cpp +++ b/lib/dnn/gemm.cpp @@ -106,7 +106,7 @@ void gemm::triton_c_src(std::ostream &os) const { R"( const tunable int32 TM = {16, 32, 64, 128}; const tunable int32 TN = {16, 32, 64, 128}; -const tunable int32 TK = {8}; +const tunable int32 TK = {16}; const tunable int32 GZ = {1}; void matmul(restrict read_only )" + a_ty_ + R"( *A, diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index 779e24a3f..14bec7172 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -214,9 +214,9 @@ jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t ben best.perf = perf; best.params = params; } -// for(unsigned p: params) -// std::cout << p << " " << std::flush; -// std::cout << perf << " [ " << best.perf << " ] " << std::endl; + for(unsigned p: params) + std::cout << p << " " << std::flush; + std::cout << perf << " [ " << best.perf << " ] " << std::endl; } }, nthreads_); std::cout << "Autotuning done - Best performance: " << best.perf << std::endl; From 5f6dd23fc2dfcdd4caddf8c7c10ff680e4e3fd6c Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 16 Jul 2019 16:14:58 -0700 Subject: [PATCH 244/494] [dnn/dot] reverted back to peak tensorcores performance --- examples/python/tensorflow/run.py | 17 ++++++++--------- include/triton/tools/bench.hpp | 4 ++-- lib/codegen/tune.cpp | 6 +++--- lib/dnn/base.cpp | 2 +- lib/dnn/gemm.cpp | 19 +++---------------- 5 files changed, 17 insertions(+), 31 deletions(-) diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 665ec8cd0..809967f84 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -9,25 +9,23 @@ library_dir = os.path.dirname(os.path.realpath(__file__)) module = tf.load_op_library(os.path.join(library_dir, 'libtf_blocksparse.so')) def run_dot(): - M, N, K = 128,128,128 + M, N, K = 8192, 8192, 8192 a = tf.placeholder(tf.float16, shape=[M, K]) b = tf.placeholder(tf.float16, shape=[N, K]) - locks = tf.placeholder(tf.int32, shape=[4096]) # c = tf.matmul(a, b, transpose_a=True) - c = module.dot(a, b, locks) + c = module.dot(a, b) # Reference ha = np.random.rand(M, K).astype(np.float16) hb = np.random.rand(N, K).astype(np.float16) # Run sess = tf.InteractiveSession() sess.run(tf.global_variables_initializer()) - result = sess.run([c], feed_dict = {locks: np.zeros(4096), - a: ha, + result = sess.run([c], feed_dict = {a: ha, b: hb})[0] # Test - hresult = np.dot(ha.T, hb).T - dif = np.abs(result - hresult) - print("dif: %f" % np.max(dif)) + #hresult = np.dot(ha.T, hb).T + #dif = np.abs(result - hresult) + #print("dif: %f" % np.max(dif)) def run_conv(): B, C, H, W = 16, 32, 32, 32 @@ 
-130,5 +128,6 @@ def run_batchnorm(): print(np.max(np.abs(dg_t - dg_n))) print(np.max(np.abs(db_t - db_n))) -run_shift() +run_dot() +#run_shift() #run_batchnorm() diff --git a/include/triton/tools/bench.hpp b/include/triton/tools/bench.hpp index 3c584bb02..64c88cd64 100644 --- a/include/triton/tools/bench.hpp +++ b/include/triton/tools/bench.hpp @@ -32,7 +32,7 @@ double bench(OP const & op, SYNC const & sync, const triton::driver::device * de double total_time = 0; op(); sync(); -// while(total_time*1e-9 < 1e-3){ + while(total_time*1e-9 < 1e-3){ float norm = 1; // normalize clock if possible to get roughly constant result if(auto cu_device = dynamic_cast(device)) @@ -42,7 +42,7 @@ double bench(OP const & op, SYNC const & sync, const triton::driver::device * de sync(); times.push_back(norm*tmr.get().count()); total_time+=times.back(); -// } + } return *std::min_element(times.begin(), times.end()); } diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index e7273d5d5..e78440f52 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -237,13 +237,13 @@ void tune::run(ir::module &mod) { continue; if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 2, 2)); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 4, 4)); *params_.at(i).at("nts.d0") = *tmp; } if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 2, 2)); - std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 2, 2)); + std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 4, 4)); + std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 4, 4)); *params_.at(i).at("nts.d0") = *tmp1; *params_.at(i).at("nts.d1") = *tmp2; } diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index a3a3ce403..9798d8cb3 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -51,7 +51,7 @@ void base::enqueue(driver::stream *stream, std::vector args, b jit->add_module(name_.c_str(), src.c_str(), best.params); } else { - jit->add_module(name_.c_str(), src.c_str(), jit->get_valid(name_.c_str(), src.c_str())); + jit->add_module(name_.c_str(), src.c_str(), {16, 4, 128, 16, 4, 128, 2, 2, 2, 2, 8, 16, 8, 1}); } triton::driver::kernel* kernel = jit->get_function(name_.c_str()); clone->init_impl(stream, (triton::driver::cu_module*)kernel->module()); diff --git a/lib/dnn/gemm.cpp b/lib/dnn/gemm.cpp index 4e79b5b7a..eb0042901 100644 --- a/lib/dnn/gemm.cpp +++ b/lib/dnn/gemm.cpp @@ -109,8 +109,8 @@ const tunable int32 TN = {16, 32, 64, 128}; const tunable int32 TK = {16}; const tunable int32 GZ = {1}; -void matmul(restrict read_only )" + a_ty_ + R"( *A, - restrict read_only )" + b_ty_ + R"( *B, +void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, + restrict read_only align(16) )" + b_ty_ + R"( *B, fp32 *C, int32 M, int32 N, int32 K, )" + align_lda_str + R"( int32 lda, )" + align_ldb_str + R"(" int32 ldb, int32 ldc, @@ -158,20 +158,7 @@ void matmul(restrict read_only )" + a_ty_ + R"( *A, int1 checkc1[TN] = ryc < N; int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - int32 *plock = locks + ridx + ridy*grid0; - while(__atomic_cas(plock, 0, 1)); - int32 *pcount = plock + grid0*grid1; - int32 count = *pcount; - int32 countp1 = select(count == GZ - 1, 0, count + 1); - if(count == 0) { - @checkc *pc = c; - *pcount = countp1; - } - else { - @checkc *pc = c + *pc; - 
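Re-enabling the loop in bench() (tools/bench.hpp) restores repeat-until-budget timing: after a warm-up run, the op is replayed until roughly one millisecond of accumulated time (total_time is in nanoseconds) and the minimum sample is reported, the usual way to suppress warm-up and scheduling noise. A condensed host-only sketch of the pattern (C++; the real version also normalizes by the GPU clock when it can):

#include <algorithm>
#include <chrono>
#include <vector>
template <class Op, class Sync>
double bench_min(Op op, Sync sync, double budget_ns = 1e6) {
  op(); sync();                                   // warm-up, not timed
  std::vector<double> times;
  double total = 0;
  while (total < budget_ns) {
    auto t0 = std::chrono::high_resolution_clock::now();
    op(); sync();
    double ns = std::chrono::duration<double, std::nano>(
                    std::chrono::high_resolution_clock::now() - t0).count();
    times.push_back(ns);
    total += ns;
  }
  return *std::min_element(times.begin(), times.end());
}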
*pcount = countp1; - } - __atomic_cas(plock, 1, 0); + @checkc *pc = c; } )"; os << res; From ec24e1e7df15572f34454b270797fa7f5e812c9e Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 16 Jul 2019 18:47:50 -0700 Subject: [PATCH 245/494] trying to remove interior logic --- examples/cpp/shift.cpp | 6 +++--- examples/python/tensorflow/run.py | 4 ++-- lib/codegen/tune.cpp | 2 +- lib/dnn/base.cpp | 2 +- lib/dnn/shift.cpp | 36 +++++++++++-------------------- 5 files changed, 20 insertions(+), 30 deletions(-) diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index 020dba23a..754853a8e 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -14,13 +14,13 @@ int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); - auto op = triton::dnn::shift::FPROP; + auto op = triton::dnn::shift::BPROP; // initialization int32_t R = 3, S = 3; - int32_t B = 128, F = 128; + int32_t B = 16, F = 4096; int32_t H = 16, W = 16; - int32_t C = 128; + int32_t C = 4096; // random shifts std::vector shift_h(C); diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 809967f84..5fb1d9314 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -128,6 +128,6 @@ def run_batchnorm(): print(np.max(np.abs(dg_t - dg_n))) print(np.max(np.abs(db_t - db_n))) -run_dot() -#run_shift() +#run_dot() +run_shift() #run_batchnorm() diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index e78440f52..f18afeeac 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -237,7 +237,7 @@ void tune::run(ir::module &mod) { continue; if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 4, 4)); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 2, 4)); *params_.at(i).at("nts.d0") = *tmp; } if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index 9798d8cb3..a3a3ce403 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -51,7 +51,7 @@ void base::enqueue(driver::stream *stream, std::vector args, b jit->add_module(name_.c_str(), src.c_str(), best.params); } else { - jit->add_module(name_.c_str(), src.c_str(), {16, 4, 128, 16, 4, 128, 2, 2, 2, 2, 8, 16, 8, 1}); + jit->add_module(name_.c_str(), src.c_str(), jit->get_valid(name_.c_str(), src.c_str())); } triton::driver::kernel* kernel = jit->get_function(name_.c_str()); clone->init_impl(stream, (triton::driver::cu_module*)kernel->module()); diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index b9e580506..0691a5980 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -80,7 +80,9 @@ shift::shift(int B, int C, throw std::runtime_error("unsupported input layout"); } // Equivalent matmul - M_ = B_*CH_*CW_; + M_ = B_*(CH_ - BH_ / 2)*(CW_ - BW_/2); + if(M_ == 0) + throw std::runtime_error("unsupported input shapes - no interior !"); N_ = F_; K_ = C_; // transpose @@ -288,14 +290,14 @@ void shift::triton_c_src(std::ostream &os) const { return R"( int32 )" + rx + "wh[" + sz + "] = " + rkx + R"( / NB; int32 )" + rx + "b[" + sz + "] = " + rkx + R"( % NB; - int32 )" + rx + "w[" + sz + "] = " + rx + R"(wh % CW; - int32 )" + rx + "h[" + sz + "] = " + rx + R"(wh / CW;)"; + int32 )" + rx + "w[" + sz + "] = " + rx + R"(wh % CW + pad_w; + int32 )" + rx + "h[" + sz + "] = " + rx + R"(wh / CW + pad_h;)"; } else { return R"( int32 )" + rx + "bh[" + sz + "] = " + rkx + R"( / CW; - int32 )" + rx + 
"w[" + sz + "] = " + rkx + R"( % CW; - int32 )" + rx + "h[" + sz + "] = " + rx + R"(bh % CH; + int32 )" + rx + "w[" + sz + "] = " + rkx + R"( % CW + pad_w; + int32 )" + rx + "h[" + sz + "] = " + rx + R"(bh % CH + pad_h; int32 )" + rx + "b[" + sz + "] = " + rx + R"(bh / CH;)"; } }; @@ -370,10 +372,7 @@ if(op_ == FPROP){ int32 offa0[TM, TK] = offxa[:, newaxis]; __constant__ int32* pd[TK] = delta_a + rka; multiple_of(4) int32 d[TK] = *pd; - int32 offa_interior[TM, TK] = d[newaxis, :]; - int32 offa_exterior[TM, TK] = rka[newaxis, :] * lda_c; - )" + compute_interior("ra", "TM", "TK") + R"( - int32 offa1[TM, TK] = interior ? offa_interior : offa_exterior;)"; + int32 offa1[TM, TK] = d[newaxis, :];)"; } if(op_ == BPROP){ result += @@ -415,10 +414,8 @@ if(op_ == WGRAD){ rbw = rbw * stride_w; rbh = rbh * stride_h; int32 offkb[TK] = rbb*ldb_b + rbw*ldb_w + rbh*ldb_h; - )" + compute_interior("rb", "TK", "TN") + R"( - int32 incb[TK, TN] = interior ? shift : 0; int32 offb0[TK, TN] = ryb[newaxis, :] * ldb_c; - int32 offb1[TK, TN] = offkb[:, newaxis] + incb;)"; + int32 offb1[TK, TN] = offkb[:, newaxis] + shift;)"; } /* Main loop */ @@ -439,10 +436,7 @@ if(op_ == FPROP){ result += R"( pd = pd + TK; d = *pd; - offa_interior = d[newaxis, :]; - offa_exterior = TK * lda_c; - int32 offa[TM, TK] = interior ? offa_interior : offa_exterior; - pa = pa + offa;)"; + pa = pa + d[newaxis, :];)"; } if(op_ == BPROP){ result += R"( @@ -470,9 +464,7 @@ if(op_ == WGRAD){ rbw = rbw * stride_w; rbh = rbh * stride_h; offkb = rbb*ldb_b + rbw*ldb_w + rbh*ldb_h; - )" + compute_interior("rb", "TK", "TN") + R"( - incb = interior ? shift : 0; - pb = B + offb0 + offkb[:, newaxis] + incb;)"; + pb = B + offb0 + offkb[:, newaxis] + shift;)"; } if(op_ == FPROP){ result += R"( @@ -513,11 +505,9 @@ if(op_ == WGRAD){ int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];)"; if(op_ == BPROP){ result += R"( - )" + compute_interior("rc", "TM", "TN") + R"( __constant__ int32* pd[TN] = delta_a + ryc; - )" + c_ty_ + R"(* shift_pc[TM, TN] = pc + (*pd)[newaxis, :]; - pc = interior ? 
shift_pc : pc; - @checkc __atomic_add(pc, c); + pc = pc + (*pd)[newaxis, :]; + @checkc *pc = c; )"; } else{ From 07c964919cf78c86514e39ceaeefe7e4568eb317 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 16 Jul 2019 20:18:48 -0700 Subject: [PATCH 246/494] [dnn/shift] now strictly only shifting the interior --- examples/python/pytorch/shift.cpp | 2 +- examples/python/tensorflow/shift.cpp | 2 +- include/triton/dnn/shift.h | 8 +++++ lib/codegen/tune.cpp | 4 +-- lib/dnn/shift.cpp | 50 +++++++++++----------------- lib/runtime/jit.cpp | 3 +- 6 files changed, 34 insertions(+), 35 deletions(-) diff --git a/examples/python/pytorch/shift.cpp b/examples/python/pytorch/shift.cpp index 7efe0198b..d25ed588f 100644 --- a/examples/python/pytorch/shift.cpp +++ b/examples/python/pytorch/shift.cpp @@ -75,7 +75,7 @@ torch::Tensor shift_common( triton::driver::cu_buffer c(ctx, (CUdeviceptr)torchc.storage().data(), false); // Enqueue - shift.enqueue(&stream, {&a, &b, &c}, true); + shift.enqueue(&stream, {&a, &b, &c}, false); return torchc; } diff --git a/examples/python/tensorflow/shift.cpp b/examples/python/tensorflow/shift.cpp index d844e9aa1..1834cadaf 100644 --- a/examples/python/tensorflow/shift.cpp +++ b/examples/python/tensorflow/shift.cpp @@ -122,7 +122,7 @@ public: triton::driver::cu_buffer da(ctx, (CUdeviceptr)tf_a.flat().data(), false); triton::driver::cu_buffer db(ctx, (CUdeviceptr)tf_b.flat().data(), false); triton::driver::cu_buffer dc(ctx, (CUdeviceptr)tf_c->flat().data(), false); - shift.enqueue(stream, {&da, &db, &dc}); + shift.enqueue(stream, {&da, &db, &dc}, false); } private: diff --git a/include/triton/dnn/shift.h b/include/triton/dnn/shift.h index fbff404ca..84c6ccda7 100644 --- a/include/triton/dnn/shift.h +++ b/include/triton/dnn/shift.h @@ -128,6 +128,14 @@ private: int32_t CD_; int32_t CH_; int32_t CW_; + // interior image size + int32_t IAD_; + int32_t IAH_; + int32_t IAW_; + // interior activation size + int32_t ICD_; + int32_t ICH_; + int32_t ICW_; // equivalent matmul int32_t M_; int32_t N_; diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index f18afeeac..6c9522f03 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -223,7 +223,7 @@ void tune::run(ir::module &mod) { } else { ir::metaparameter *fpw = ir::metaparameter::create(ctx, ty, 2, 2); - ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 2, 4); + ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 1, 4); connected_components(node, {fpw, wpt}, {"fpw", "wpt"}, nodes_, dependencies_, group_id++); } } @@ -237,7 +237,7 @@ void tune::run(ir::module &mod) { continue; if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 2, 4)); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 2, 2)); *params_.at(i).at("nts.d0") = *tmp; } if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index 0691a5980..49212619d 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -79,10 +79,15 @@ shift::shift(int B, int C, default: throw std::runtime_error("unsupported input layout"); } + IAD_ = AD_ - 2*(BD_/2); + IAH_ = AH_ - 2*(BH_/2); + IAW_ = AW_ - 2*(BW_/2); + ICD_ = IAD_ / stride_d_; + ICH_ = IAH_ / stride_h_; + ICW_ = IAW_ / stride_w_; + // Equivalent matmul - M_ = B_*(CH_ - BH_ / 2)*(CW_ - BW_/2); - if(M_ == 0) - throw std::runtime_error("unsupported input shapes - no interior !"); + M_ = B_*ICH_*ICW_; N_ = F_; K_ = C_; // transpose @@ -247,21 
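PATCH 246 shrinks the equivalent GEMM to the image interior: a window of height BH can displace a read by up to BH/2 rows, so only pixels at least that far from every border can be shifted without bounds checks, and M drops from B*CH*CW to B*ICH*ICW. The derived sizes as a small sketch (C++; names hypothetical):

// Strip a halo of window/2 on each side, then apply the stride.
int interior(int extent, int window) { return extent - 2 * (window / 2); }
int out_dim(int extent, int window, int stride) { return interior(extent, window) / stride; }
// e.g. AH = AW = 16, R = S = 3, stride 1  ->  ICH = ICW = 14, so M = B * 196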
+252,21 @@ void shift::enqueue_impl(driver::stream *stream, driver::kernel *kernel, kernel->setArg(18, ldc_h_); kernel->setArg(19, ldc_f_); kernel->setArg(20, B_); - kernel->setArg(21, AH_); - kernel->setArg(22, AW_); + kernel->setArg(21, IAH_); + kernel->setArg(22, IAW_); kernel->setArg(23, BH_); kernel->setArg(24, BW_); - kernel->setArg(25, CH_); - kernel->setArg(26, CW_); + kernel->setArg(25, ICH_); + kernel->setArg(26, ICW_); kernel->setArg(27, (num_locks > max_locks_) ? nullptr : locks_); kernel->setArg(28, (int32_t)grid[0]); kernel->setArg(29, (int32_t)grid[1]); kernel->setArg(30, (int32_t)grid[2]); if(locks_) ((driver::cu_buffer*)locks_)->set_zero(stream, 2*max_locks_*4); - if(op_ == BPROP){ + if(op_ == FPROP || op_ == BPROP){ size_t c_nbytes = (c_ty_ == "fp16") ? 2 : 4; - ((driver::cu_buffer*)c)->set_zero(stream, AH_*AW_*B_*C_*c_nbytes); + ((driver::cu_buffer*)c)->set_zero(stream, c_size()*c_nbytes); } stream->enqueue(kernel, grid, {info.num_threads, 1, 1}); } @@ -290,33 +295,18 @@ void shift::triton_c_src(std::ostream &os) const { return R"( int32 )" + rx + "wh[" + sz + "] = " + rkx + R"( / NB; int32 )" + rx + "b[" + sz + "] = " + rkx + R"( % NB; - int32 )" + rx + "w[" + sz + "] = " + rx + R"(wh % CW + pad_w; - int32 )" + rx + "h[" + sz + "] = " + rx + R"(wh / CW + pad_h;)"; + int32 )" + rx + "w[" + sz + "] = (" + rx + R"(wh % CW) + pad_w; + int32 )" + rx + "h[" + sz + "] = (" + rx + R"(wh / CW) + pad_h;)"; } else { return R"( int32 )" + rx + "bh[" + sz + "] = " + rkx + R"( / CW; - int32 )" + rx + "w[" + sz + "] = " + rkx + R"( % CW + pad_w; - int32 )" + rx + "h[" + sz + "] = " + rx + R"(bh % CH + pad_h; + int32 )" + rx + "w[" + sz + "] = (" + rkx + R"( % CW) + pad_w; + int32 )" + rx + "h[" + sz + "] = (" + rx + R"(bh % CH) + pad_h; int32 )" + rx + "b[" + sz + "] = " + rx + R"(bh / CH;)"; } }; - auto compute_interior = [&](std::string rx, std::string sz0, std::string sz1) { - std::string result; - if(shift_edge_h_) - result += "int1 interiorh[" + sz0 + "] = 1;\n "; - else - result += "int1 interiorh[" + sz0 + "] = (" + rx + "h >= pad_h) && (" + rx + "h < (AH - pad_h));\n "; - if(shift_edge_w_) - result += "int1 interiorw[" + sz0 + "] = 1;"; - else - result += "int1 interiorw[" + sz0 + "] = (" + rx + "w >= pad_w) && (" + rx + "w < (AW - pad_w));"; - result += R"( - int1 interior[)" + sz0 + ", " + sz1 + "] = interiorh[:, newaxis] && interiorw[:, newaxis];"; - return result; - }; - std::string result = R"( const tunable int32 TM = {16, 32, 64, 128}; @@ -506,8 +496,8 @@ if(op_ == WGRAD){ if(op_ == BPROP){ result += R"( __constant__ int32* pd[TN] = delta_a + ryc; - pc = pc + (*pd)[newaxis, :]; - @checkc *pc = c; + )" + c_ty_ + R"(* shift_pc[TM, TN] = pc + (*pd)[newaxis, :]; + @checkc *shift_pc = c; )"; } else{ diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index 14bec7172..e42c534b6 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -43,7 +43,8 @@ void loop_nest(std::vector const & ranges, // size_t current = 0; while(true){ //Execute function - pool.enqueue([values, &f](){ f(values); }); +// pool.enqueue([values, &f](){ f(values); }); + f(values); while(values[i]++ == ranges[i] - 1){ if(i == 0) return; From a55b098e8880ae29f82b219bf4372588c3ff1324 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 16 Jul 2019 21:05:21 -0700 Subject: [PATCH 247/494] [dnn/shift] now using constant divisions --- examples/cpp/shift.cpp | 2 +- lib/codegen/tune.cpp | 4 ++-- lib/dnn/shift.cpp | 20 ++++++++++++-------- lib/runtime/jit.cpp | 4 ++-- 4 files changed, 17 insertions(+), 
13 deletions(-) diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index 754853a8e..982085b10 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -14,7 +14,7 @@ int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); - auto op = triton::dnn::shift::BPROP; + auto op = triton::dnn::shift::FPROP; // initialization int32_t R = 3, S = 3; diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 6c9522f03..f18afeeac 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -223,7 +223,7 @@ void tune::run(ir::module &mod) { } else { ir::metaparameter *fpw = ir::metaparameter::create(ctx, ty, 2, 2); - ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 1, 4); + ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 2, 4); connected_components(node, {fpw, wpt}, {"fpw", "wpt"}, nodes_, dependencies_, group_id++); } } @@ -237,7 +237,7 @@ void tune::run(ir::module &mod) { continue; if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 2, 2)); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 2, 4)); *params_.at(i).at("nts.d0") = *tmp; } if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index 49212619d..bf3fde138 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -291,19 +291,23 @@ void shift::triton_c_src(std::ostream &os) const { bool is_chwn = layout_ == CHWN; auto compute_bhw = [&](std::string rx, std::string sz, std::string rkx){ + std::string B = std::to_string(B_); + std::string CW = std::to_string(CW_); + std::string CH = std::to_string(CH_); + if(is_chwn) { return R"( - int32 )" + rx + "wh[" + sz + "] = " + rkx + R"( / NB; - int32 )" + rx + "b[" + sz + "] = " + rkx + R"( % NB; - int32 )" + rx + "w[" + sz + "] = (" + rx + R"(wh % CW) + pad_w; - int32 )" + rx + "h[" + sz + "] = (" + rx + R"(wh / CW) + pad_h;)"; + int32 )" + rx + "wh[" + sz + "] = " + rkx + " / " + B + R"(; + int32 )" + rx + "b[" + sz + "] = " + rkx + " % " + B + R"(); + int32 )" + rx + "w[" + sz + "] = (" + rx + "(wh % " + CW + R"() + pad_w; + int32 )" + rx + "h[" + sz + "] = (" + rx + "(wh / " + CW + R"() + pad_h;)"; } else { return R"( - int32 )" + rx + "bh[" + sz + "] = " + rkx + R"( / CW; - int32 )" + rx + "w[" + sz + "] = (" + rkx + R"( % CW) + pad_w; - int32 )" + rx + "h[" + sz + "] = (" + rx + R"(bh % CH) + pad_h; - int32 )" + rx + "b[" + sz + "] = " + rx + R"(bh / CH;)"; + int32 )" + rx + "bh[" + sz + "] = " + rkx + " / " + CW + R"(; + int32 )" + rx + "w[" + sz + "] = (" + rkx + " % " + CW + R"() + pad_w; + int32 )" + rx + "h[" + sz + "] = (" + rx + "bh % " + CH + R"() + pad_h; + int32 )" + rx + "b[" + sz + "] = " + rx + "bh / " + CH + ";"; } }; diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index e42c534b6..cb07bd0b3 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -43,8 +43,8 @@ void loop_nest(std::vector const & ranges, // size_t current = 0; while(true){ //Execute function -// pool.enqueue([values, &f](){ f(values); }); - f(values); + pool.enqueue([values, &f](){ f(values); }); +// f(values); while(values[i]++ == ranges[i] - 1){ if(i == 0) return; From 791c91ee6398896ef0962014270ce1dc2631a30e Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 17 Jul 2019 11:39:17 -0700 Subject: [PATCH 248/494] [dnn/shift] bugfix in static shape division --- lib/codegen/tune.cpp | 4 ++-- lib/dnn/shift.cpp | 4 ++-- 2 files changed, 4 
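The point of PATCH 247 ("constant divisions") is visible in the compute_bhw change above: dividing by a kernel argument such as CW forces a generic runtime division, whereas splicing the literal value into the generated Triton-C source lets the backend strength-reduce the division and modulo into shifts and multiplies. A minimal sketch of the string-baking pattern (C++):

#include <string>
// Emit "idx / 14" rather than "idx / CW": a literal divisor is visible to
// the compiler, a kernel argument is not.
std::string emit_div(const std::string& idx, int divisor) {
  return idx + " / " + std::to_string(divisor);
}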
insertions(+), 4 deletions(-) diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index f18afeeac..6c9522f03 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -223,7 +223,7 @@ void tune::run(ir::module &mod) { } else { ir::metaparameter *fpw = ir::metaparameter::create(ctx, ty, 2, 2); - ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 2, 4); + ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 1, 4); connected_components(node, {fpw, wpt}, {"fpw", "wpt"}, nodes_, dependencies_, group_id++); } } @@ -237,7 +237,7 @@ void tune::run(ir::module &mod) { continue; if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 2, 4)); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 2, 2)); *params_.at(i).at("nts.d0") = *tmp; } if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index bf3fde138..c4f1b6dc6 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -292,8 +292,8 @@ void shift::triton_c_src(std::ostream &os) const { auto compute_bhw = [&](std::string rx, std::string sz, std::string rkx){ std::string B = std::to_string(B_); - std::string CW = std::to_string(CW_); - std::string CH = std::to_string(CH_); + std::string CW = std::to_string(ICW_); + std::string CH = std::to_string(ICH_); if(is_chwn) { return R"( From d2e116d05729b0228aaa5f6ad761c3db0231fc8b Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 17 Jul 2019 12:38:30 -0700 Subject: [PATCH 249/494] testing GEMM --- examples/python/tensorflow/run.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 5fb1d9314..5fc0e9988 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -9,7 +9,7 @@ library_dir = os.path.dirname(os.path.realpath(__file__)) module = tf.load_op_library(os.path.join(library_dir, 'libtf_blocksparse.so')) def run_dot(): - M, N, K = 8192, 8192, 8192 + M, N, K = 128, 128, 128 a = tf.placeholder(tf.float16, shape=[M, K]) b = tf.placeholder(tf.float16, shape=[N, K]) # c = tf.matmul(a, b, transpose_a=True) @@ -23,9 +23,11 @@ def run_dot(): result = sess.run([c], feed_dict = {a: ha, b: hb})[0] # Test - #hresult = np.dot(ha.T, hb).T - #dif = np.abs(result - hresult) - #print("dif: %f" % np.max(dif)) + hresult = np.dot(ha.T, hb).T + dif = np.abs(result - hresult) + print(hresult) + print(result) + print("dif: %f" % np.max(dif)) def run_conv(): B, C, H, W = 16, 32, 32, 32 @@ -128,6 +130,6 @@ def run_batchnorm(): print(np.max(np.abs(dg_t - dg_n))) print(np.max(np.abs(db_t - db_n))) -#run_dot() -run_shift() +run_dot() +#run_shift() #run_batchnorm() From bfa39b8992fee00a92703aaf786c1b4639813b11 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 17 Jul 2019 13:20:33 -0700 Subject: [PATCH 250/494] preparing the field for tensor cores transposes --- examples/cpp/dot.cpp | 2 +- examples/python/tensorflow/dot.cpp | 2 +- examples/python/tensorflow/run.py | 2 +- include/triton/ir/instructions.h | 1 + include/triton/runtime/jit.h | 2 +- lib/codegen/optimize_dot.cpp | 68 +++++++++++++++++++++++------- lib/codegen/tune.cpp | 6 +-- lib/ir/instructions.cpp | 8 ++++ lib/runtime/jit.cpp | 5 --- 9 files changed, 67 insertions(+), 29 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index c19d43e2e..c0b0ae52d 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -9,7 +9,7 
@@ int main() { bool AT = false; - bool BT = true; + bool BT = false; // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); // matrix multiplication parameters diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index 6d5cbb414..9bd25eeb3 100644 --- a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -49,7 +49,7 @@ class DotOp : public OpKernel { triton::driver::cu_buffer db(ctx, (CUdeviceptr)b.flat().data(), false); triton::driver::cu_buffer dc(ctx, (CUdeviceptr)c->flat().data(), false); // template - triton::dnn::gemm dot(M, N, K, false, true, "fp16", "fp16", 4, 4); + triton::dnn::gemm dot(M, N, K, false, false, "fp16", "fp16", 4, 4); dot.enqueue(stream, {&da, &db, &dc}); } diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 5fc0e9988..9824bcea4 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -23,7 +23,7 @@ def run_dot(): result = sess.run([c], feed_dict = {a: ha, b: hb})[0] # Test - hresult = np.dot(ha.T, hb).T + hresult = np.dot(ha.T, hb.T).T dif = np.abs(result - hresult) print(hresult) print(result) diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index 9828406af..99ef1d1be 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -550,6 +550,7 @@ private: std::string repr_impl() const { return std::string("dot.") + ((AT_==NoTrans)?"n":"t") + ((BT_==NoTrans)?"n":"t"); } public: + static instruction *create(value *A, value *B, value *C, bool AT, bool BT, const std::string &name = "", instruction *next = nullptr); static instruction* create_nn(value *A, value *B, value *C, const std::string &name = "", instruction *next = nullptr); static instruction* create_nt(value *A, value *B, value *C, const std::string &name = "", instruction *next = nullptr); static instruction* create_tn(value *A, value *B, value *C, const std::string &name = "", instruction *next = nullptr); diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index 8c2fa41b8..d3088d73b 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -68,9 +68,9 @@ public: target_(target) { } void target_independent(ir::module &module) { -// ir::print(module, std::cout); optimize_dot.run(module); optimize_trans.run(module); +// ir::print(module, std::cout); } void target_dependent(ir::module &module) { diff --git a/lib/codegen/optimize_dot.cpp b/lib/codegen/optimize_dot.cpp index 67e3f8569..ee59145c7 100644 --- a/lib/codegen/optimize_dot.cpp +++ b/lib/codegen/optimize_dot.cpp @@ -11,6 +11,21 @@ inline bool is_trans(ir::value *v){ return dynamic_cast(v) != nullptr; } +inline bool is_hmma(ir::value *v){ + bool result = false; + if(auto *x = dynamic_cast(v)){ + ir::value *a = x->get_operand(0); + ir::type *a_ty = a->get_type(); + ir::value *b = x->get_operand(1); + ir::type *b_ty = b->get_type(); + // inputs have to be FP16 + result = a_ty->get_scalar_ty()->is_half_ty() && b_ty->get_scalar_ty()->is_half_ty(); + // reduction has to be multiple of 4 + result = result && ((a_ty->get_tile_shapes()[1]->get_value() % 4) == 0); + } + return result; +} + void optimize_dot::run(ir::module &mod) { ir::builder &builder = mod.get_builder(); std::vector to_delete; @@ -19,26 +34,47 @@ void optimize_dot::run(ir::module &mod) { for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i: block->get_inst_list()) if(auto dot = dynamic_cast(i)) - 
if(dot->get_operand(1)->get_type()->get_tile_shapes()[1]->get_value() != 1) - if(!dot->is_a_trans() && !dot->is_b_trans()){ + if(dot->get_operand(1)->get_type()->get_tile_shapes()[1]->get_value() != 1){ builder.set_insert_point(i); ir::value *A = dot->get_operand(0); ir::value *B = dot->get_operand(1); ir::value *D = dot->get_operand(2); - // dot(op(a), trans(b)) - if(is_trans(B)){ - ir::value* BN = ((ir::trans_inst*)B)->get_operand(0); - ir::instruction *NT = builder.insert(ir::dot_inst::create_nt(A, BN, D)); - dot->replace_all_uses_with(NT); - to_delete.push_back((ir::instruction*)B); - to_delete.push_back(dot); - } - // dot(op(a), b) - if(!is_trans(B)){ - ir::value* BT = builder.create_trans(B); - ir::instruction *NT = builder.insert(ir::dot_inst::create_nt(A, BT, D)); - dot->replace_all_uses_with(NT); - to_delete.push_back(dot); + bool trans_a = is_trans(A); + bool trans_b = is_trans(B); + + if(!dot->is_a_trans() && !dot->is_b_trans()){ + if(is_hmma(dot)){ + ir::value *AA = A; + ir::value *BB = B; + if(trans_a){ + AA = ((ir::trans_inst*)A)->get_operand(0); + to_delete.push_back((ir::instruction*)A); + } + if(trans_b){ + BB = ((ir::trans_inst*)B)->get_operand(0); + to_delete.push_back((ir::instruction*)B); + } + ir::instruction *dot_atbt = builder.insert(ir::dot_inst::create(AA, BB, D, trans_a, trans_b)); + dot->replace_all_uses_with(dot_atbt); + to_delete.push_back(dot); + } + else{ + // dot(op(a), trans(b)) + if(trans_b){ + ir::value* BB = ((ir::trans_inst*)B)->get_operand(0); + ir::instruction *NT = builder.insert(ir::dot_inst::create_nt(A, BB, D)); + dot->replace_all_uses_with(NT); + to_delete.push_back((ir::instruction*)B); + to_delete.push_back(dot); + } + // dot(op(a), b) + if(!trans_b){ + ir::value* BB = builder.create_trans(B); + ir::instruction *NT = builder.insert(ir::dot_inst::create_nt(A, BB, D)); + dot->replace_all_uses_with(NT); + to_delete.push_back(dot); + } + } } } diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 6c9522f03..7baf54fc8 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -22,10 +22,8 @@ bool is_hmma(ir::value *v){ ir::type *a_ty = a->get_type(); ir::value *b = x->get_operand(1); ir::type *b_ty = b->get_type(); - // only NT supported - result = !x->is_a_trans() && x->is_b_trans(); // inputs have to be FP16 - result = result && a_ty->get_scalar_ty()->is_half_ty() && b_ty->get_scalar_ty()->is_half_ty(); + result = a_ty->get_scalar_ty()->is_half_ty() && b_ty->get_scalar_ty()->is_half_ty(); // reduction has to be multiple of 4 result = result && ((a_ty->get_tile_shapes()[1]->get_value() % 4) == 0); } @@ -223,7 +221,7 @@ void tune::run(ir::module &mod) { } else { ir::metaparameter *fpw = ir::metaparameter::create(ctx, ty, 2, 2); - ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 1, 4); + ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 2, 4); connected_components(node, {fpw, wpt}, {"fpw", "wpt"}, nodes_, dependencies_, group_id++); } } diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 27efc0838..e3ac042d1 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -527,6 +527,14 @@ dot_inst::dot_inst(value *A, value *B, value *C, TransT AT, TransT BT, set_operand(2, C); } +instruction *dot_inst::create(value *A, value *B, value *C, + bool AT, bool BT, + const std::string &name, instruction *next) { + TransT OPA = AT ? Trans : NoTrans; + TransT OPB = BT ? 
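The rewrite above canonicalizes dot layouts per backend path: when the HMMA test passes, explicit trans() operands are stripped and their effect is recorded in the AT/BT flags of the new dot_inst::create, while the FP32 path keeps requiring NT form and materializes a trans(B) when one is missing. The eligibility test itself, now duplicated in optimize_dot.cpp and tune.cpp, reduces to two conditions, restated here as a self-contained sketch (C++; struct and names hypothetical):

struct DotDesc { bool a_is_fp16, b_is_fp16; int k; };
bool is_hmma(const DotDesc& d) {
  return d.a_is_fp16 && d.b_is_fp16   // both inputs must be half precision
      && d.k % 4 == 0;                // reduction must tile the m8n8k4 MMA shape
}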
Trans : NoTrans; + return new dot_inst(A, B, C, OPA, OPB, name, next); +} + instruction *dot_inst::create_nn(value *A, value *B, value *C, const std::string &name, instruction *next) { return new dot_inst(A, B, C, NoTrans, NoTrans, name, next); diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index cb07bd0b3..2f4ddd710 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -135,13 +135,8 @@ std::vector<unsigned> jit::get_valid(const char *name, const char *src) { unsigned i = 0; for(ir::metaparameter *mp: mps) mp->set_value(params[i++]); - passes.target_independent(tt_module); passes.tune.init(tt_module); passes.tune.check_constraints(errors); -// for(auto e: errors) -// for(auto x: e.second) -// std::cout << x << std::endl; -// std::cout << "-----" << std::endl; if(!errors.empty()) return; result = params; From 2f0817b2cd52e267f60a693410a7d0c94bde496d Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 17 Jul 2019 17:20:38 -0700 Subject: [PATCH 251/494] [codegen/selection] tensor cores now used for transposed layouts --- examples/cpp/dot.cpp | 2 +- examples/python/tensorflow/run.py | 1 + lib/codegen/selection.cpp | 45 +++++++++++++++++++++++++------ lib/codegen/shmem_allocation.cpp | 19 ++++++++++--- lib/codegen/tune.cpp | 4 +-- 5 files changed, 57 insertions(+), 14 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index c0b0ae52d..c19d43e2e 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -9,7 +9,7 @@ int main() { bool AT = false; - bool BT = false; + bool BT = true; // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); // matrix multiplication parameters diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 9824bcea4..88fe7ef3d 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -25,6 +25,7 @@ def run_dot(): # Test hresult = np.dot(ha.T, hb.T).T dif = np.abs(result - hresult) + np.savetxt('dif.dat', dif, '%2.4f') print(hresult) print(result) print("dif: %f" % np.max(dif)) diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index f83eea5c7..72ca66ad1 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -538,8 +538,11 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id /* intra warp offset */ // offset of quad in pair - Value *in_pair_off_a = builder.CreateMul(builder.CreateUDiv(builder.CreateAnd(u_thread_id, _16), builder.getInt32(4)), builder.getInt32(fpw_0 * pack_size_0_)); - Value *in_pair_off_b = builder.CreateMul(builder.CreateUDiv(builder.CreateAnd(u_thread_id, _16), builder.getInt32(4)), builder.getInt32(fpw_1 * pack_size_1_)); + Value *in_pair_off_a = builder.CreateMul(builder.CreateUDiv(builder.CreateAnd(u_thread_id, _16), builder.getInt32(4)), + builder.getInt32(fpw_0 * pack_size_0_)); + Value *in_pair_off_b = builder.CreateMul(builder.CreateUDiv(builder.CreateAnd(u_thread_id, _16), builder.getInt32(4)), + builder.getInt32(fpw_1 * pack_size_1_)); + // Quad pair id Value *pair_a_id = builder.CreateUDiv(builder.CreateURem(u_thread_id, _16), _4); Value *pair_b_id = builder.CreateUDiv(builder.CreateURem(u_thread_id, _16), _4); @@ -559,15 +562,17 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id // a offset offset_a_i_ = builder.CreateAdd(warp_offset_i, builder.CreateAdd(pair_a_off, in_pair_off_a)); offset_a_k_ = builder.CreateAnd(u_thread_id, _3); -// // b offsets + // b offsets offset_b_j_ = 
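/* The IR built in init_axes encodes each lane's coordinates in the m8n8k4
   fragment. The same arithmetic in plain integer form (lane = thread id within
   the warp; fpw and pack are the fragments-per-warp and pack-size
   metaparameters), shown as a sketch only:

     in_pair_off(lane) = ((lane & 16) / 4) * (fpw * pack);  // offset of quad in pair
     pair_id(lane)     = (lane % 16) / 4;                   // quad-pair id
     k_off(lane)       = lane & 3;                          // offset along K

   which is exactly the CreateAnd/CreateUDiv/CreateMul chains above. */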
builder.CreateAdd(warp_offset_j, builder.CreateAdd(pair_b_off, in_pair_off_b)); offset_b_k_ = builder.CreateAnd(u_thread_id, _3); + // c offsets Value *offset_c_i = builder.CreateAdd(builder.CreateAnd(u_thread_id, _1), offset_a_i_); Value *offset_c_j = builder.CreateAdd(builder.CreateAnd(u_thread_id, _2), builder.CreateAdd(warp_offset_j, pair_b_off)); + /* indices */ // i indices std::vector idx_i; @@ -1026,7 +1031,25 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & Type *fp32_pack8_ty = StructType::get(ctx, {fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}); FunctionType *mma_ty = FunctionType::get(fp32_pack8_ty, {fp16x2_ty, fp16x2_ty, fp16x2_ty, fp16x2_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}, false); - InlineAsm *mma_fn = InlineAsm::get(mma_ty, " mma.sync.aligned.m8n8k4.col.row.f32.f16.f16.f32 " + Value *offset_a_i = offset_a_i_; + Value *offset_a_k = offset_a_k_; + Value *offset_b_j = offset_b_j_; + Value *offset_b_k = offset_b_k_; + + Value* u_thread_id = tgt_->get_local_id(builder.GetInsertBlock()->getModule(), builder, 0); + if(dot->is_a_trans()){ + offset_a_i = builder.CreateAdd(offset_a_i, builder.CreateURem(u_thread_id, builder.getInt32(4))); + offset_a_k = builder.getInt32(0); + } + if(!dot->is_b_trans()){ + offset_b_j = builder.CreateAdd(offset_b_j, builder.CreateURem(u_thread_id, builder.getInt32(4))); + offset_b_k = builder.getInt32(0); + } + + std::string op_a = dot->is_a_trans() ? "row" : "col"; + std::string op_b = dot->is_b_trans() ? "row" : "col"; + + InlineAsm *mma_fn = InlineAsm::get(mma_ty, " mma.sync.aligned.m8n8k4." + op_a + "." + op_b + ".f32.f16.f16.f32 " "{$0, $1, $2, $3, $4, $5, $6, $7}, " "{$8, $9}, " "{$10, $11}, " @@ -1046,10 +1069,16 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & for(unsigned pack_j = 0; pack_j < num_packs_1_; pack_j++){ for(unsigned K = 0; K < NK; K += 4){ Value *_K = builder.getInt32(K); - Value *current_offset_a_i = builder.CreateAdd(offset_a_i_, builder.getInt32(pack_i*stride_rep_i*pack_size_0_)); - Value *current_offset_b_i = builder.CreateAdd(offset_b_j_, builder.getInt32(pack_j*stride_rep_j*pack_size_1_)); - Value *ha = TA->get_value({current_offset_a_i, builder.CreateAdd(offset_a_k_, _K)}); - Value *hb = TB->get_value({current_offset_b_i, builder.CreateAdd(offset_b_k_, _K)}); + Value *current_offset_a_i = builder.CreateAdd(offset_a_i, builder.getInt32(pack_i*stride_rep_i*pack_size_0_)); + Value *current_offset_b_i = builder.CreateAdd(offset_b_j, builder.getInt32(pack_j*stride_rep_j*pack_size_1_)); + indices_t idx_a = {current_offset_a_i, builder.CreateAdd(offset_a_k, _K)}; + indices_t idx_b = {current_offset_b_i, builder.CreateAdd(offset_b_k, _K)}; + if(dot->is_a_trans()) + std::swap(idx_a[0], idx_a[1]); + if(!dot->is_b_trans()) + std::swap(idx_b[0], idx_b[1]); + Value *ha = TA->get_value(idx_a); + Value *hb = TB->get_value(idx_b); for(unsigned ii = 0; ii < pack_size_0_; ii++) for(unsigned jj = 0; jj < pack_size_1_; jj++){ Value *ha0 = builder.CreateExtractElement(ha, builder.getInt32(ii*pack_size_0_ + 0)); diff --git a/lib/codegen/shmem_allocation.cpp b/lib/codegen/shmem_allocation.cpp index 469524b07..eb65b224f 100644 --- a/lib/codegen/shmem_allocation.cpp +++ b/lib/codegen/shmem_allocation.cpp @@ -15,9 +15,22 @@ unsigned shmem_allocation::is_ld_padded(ir::value *x) { if(dynamic_cast(x)) return 4; for(ir::user* user: x->get_users()) - if(dynamic_cast(user)) - if(params_->get_fragment(user, 0) == 
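/* A single code path now emits all four transpose variants: the mma.sync
   opcode string is assembled from the operand layouts. Equivalent
   free-standing helper (a sketch, not part of the patch):

     std::string mma_opcode(bool a_trans, bool b_trans) {
       std::string op_a = a_trans ? "row" : "col";
       std::string op_b = b_trans ? "row" : "col";
       return "mma.sync.aligned.m8n8k4." + op_a + "." + op_b + ".f32.f16.f16.f32";
     }

   and the std::swap calls on idx_a/idx_b in the same hunk reorder the
   shared-memory indices so each tile is read in the layout that opcode
   expects. */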
tune::HMMA_FRAGMENT_C){ - return 16; + if(auto dot = dynamic_cast(user)){ + bool is_hmma = params_->get_fragment(user, 0) == tune::HMMA_FRAGMENT_C; + bool is_op_0 = x == dot->get_operand(0); + bool is_op_1 = x == dot->get_operand(1); + if(is_hmma && is_op_0){ + if(dot->is_a_trans()) + return 20; + else + return 16; + } + if(is_hmma && is_op_1){ + if(!dot->is_b_trans()) + return 20; + else + return 16; + } } if(auto* phi = dynamic_cast(x)) { unsigned result = 0; diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 7baf54fc8..293ebf053 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -221,7 +221,7 @@ void tune::run(ir::module &mod) { } else { ir::metaparameter *fpw = ir::metaparameter::create(ctx, ty, 2, 2); - ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 2, 4); + ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 1, 4); connected_components(node, {fpw, wpt}, {"fpw", "wpt"}, nodes_, dependencies_, group_id++); } } @@ -235,7 +235,7 @@ void tune::run(ir::module &mod) { continue; if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 2, 2)); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 4, 4)); *params_.at(i).at("nts.d0") = *tmp; } if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ From 86f70f8224a4acb1b4509642e56713e5dbdcc198 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 17 Jul 2019 21:46:23 -0700 Subject: [PATCH 252/494] [codegen/selection] performance fix-up when A is transposed for hmma --- examples/cpp/dot.cpp | 6 +++--- lib/codegen/selection.cpp | 4 +++- lib/codegen/shmem_allocation.cpp | 4 ++-- lib/codegen/tune.cpp | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index c19d43e2e..43903c592 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -8,12 +8,12 @@ int main() { - bool AT = false; - bool BT = true; + bool AT = true; + bool BT = false; // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); // matrix multiplication parameters - int32_t M = 8192, N = 8192, K = 8192; + int32_t M = 2048, N = 2048, K = 2048; std::vector hc(M*N); std::vector rc(M*N); std::vector ha(M*K); diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 72ca66ad1..04a413b32 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -984,7 +984,8 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & distributed_tile *TC = (distributed_tile*)tmap_.at(C); Type *c_ty = llvm_type(C->get_type()->get_scalar_ty(), ctx); Function *f_mul_add = Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {c_ty}); - unsigned NK = A->get_type()->get_tile_shapes()[1]->get_value(); + size_t red_axis = dot->is_a_trans() ? 
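/* Reduction-axis selection for the K loop: when A is stored transposed its
   tile is K-major, so K becomes axis 0 of the shape instead of axis 1. A
   one-line model of the select being built across these lines:

     size_t reduction_axis(bool a_trans) { return a_trans ? 0 : 1; }

   NK = shapes[reduction_axis]->get_value() then sets the number of K steps. */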
0 : 1; + unsigned NK = A->get_type()->get_tile_shapes()[red_axis]->get_value(); if(NK != 1) { shared_tile *TA = (shared_tile*)tmap_.at(A); @@ -1147,6 +1148,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & unsigned max_contiguous = axis_info_->get_max_contiguous(ptr); unsigned alignment = std::min(starting_multiple, max_contiguous); unsigned vector_size = std::min(result->axis(0).contiguous, alignment); +// vector_size = result->axis(0).contiguous; std::map packets; distributed_tile *TP = (distributed_tile*)tmap_.at(ld->get_pointer_operand()); result->for_each([&](indices_t idx){ diff --git a/lib/codegen/shmem_allocation.cpp b/lib/codegen/shmem_allocation.cpp index eb65b224f..4031864c2 100644 --- a/lib/codegen/shmem_allocation.cpp +++ b/lib/codegen/shmem_allocation.cpp @@ -21,13 +21,13 @@ unsigned shmem_allocation::is_ld_padded(ir::value *x) { bool is_op_1 = x == dot->get_operand(1); if(is_hmma && is_op_0){ if(dot->is_a_trans()) - return 20; + return 4; else return 16; } if(is_hmma && is_op_1){ if(!dot->is_b_trans()) - return 20; + return 4; else return 16; } diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 293ebf053..fcb519c4a 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -235,7 +235,7 @@ void tune::run(ir::module &mod) { continue; if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 4, 4)); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 2, 4)); *params_.at(i).at("nts.d0") = *tmp; } if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ From f0d8306437e9a4c2cc2fb2d2e62bfff70fd8ef36 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 18 Jul 2019 16:12:06 -0700 Subject: [PATCH 253/494] [codegen/alignment_info] better handling of constants --- examples/cpp/dot.cpp | 18 +++++++++--------- examples/cpp/shift.cpp | 6 +++--- examples/python/pytorch/shift.cpp | 2 +- include/triton/ir/constant.h | 2 ++ include/triton/runtime/jit.h | 1 - lib/codegen/alignment_info.cpp | 30 +++++++++++++++++++++++++----- lib/codegen/tune.cpp | 4 ++-- lib/dnn/gemm.cpp | 13 ++++--------- lib/dnn/shift.cpp | 6 +++--- lib/ir/constant.cpp | 7 +++++++ lib/runtime/jit.cpp | 1 + 11 files changed, 57 insertions(+), 33 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 43903c592..2e790c2f5 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -8,12 +8,12 @@ int main() { - bool AT = true; + bool AT = false; bool BT = false; // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); // matrix multiplication parameters - int32_t M = 2048, N = 2048, K = 2048; + int32_t M = 1024, N = 1024, K = 1024; std::vector hc(M*N); std::vector rc(M*N); std::vector ha(M*K); @@ -35,12 +35,12 @@ int main() { stream->synchronize(); triton::dnn::gemm gemm(M, N, K, AT, BT, "fp16", "fp16", 4, 4); gemm.enqueue(stream, {da, db, dc}, true); - stream->read(dc, true, 0, hc); - gemm.cpu_ref(rc, ha, hb); - for(size_t i = 0; i < M*N; i++) - if(!std::isnan(hc[i]) && std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ - std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; - exit(EXIT_FAILURE); - } +// stream->read(dc, true, 0, hc); +// gemm.cpu_ref(rc, ha, hb); +// for(size_t i = 0; i < M*N; i++) +// if(!std::isnan(hc[i]) && std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ +// std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; +// exit(EXIT_FAILURE); +// } std::cout << 
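/* The is_ld_padded values tuned above pad the leading dimension of HMMA
   operand tiles held in shared memory: transposed operands get a small pad
   (4 here) instead of 16, presumably to steer strided fragment reads away
   from shared-memory bank conflicts while keeping vectorizable alignment.
   Illustrative effect on addressing (sketch only):

     // element (i, k) of a tile whose leading dimension is padded by `pad`
     offset = k * (ld + pad) + i;
*/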
"Pass!" << std::endl; } diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index 982085b10..482fad6b4 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -14,13 +14,13 @@ int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); - auto op = triton::dnn::shift::FPROP; + auto op = triton::dnn::shift::WGRAD; // initialization int32_t R = 3, S = 3; - int32_t B = 16, F = 4096; + int32_t B = 128, F = 128; int32_t H = 16, W = 16; - int32_t C = 4096; + int32_t C = 128; // random shifts std::vector shift_h(C); diff --git a/examples/python/pytorch/shift.cpp b/examples/python/pytorch/shift.cpp index d25ed588f..7efe0198b 100644 --- a/examples/python/pytorch/shift.cpp +++ b/examples/python/pytorch/shift.cpp @@ -75,7 +75,7 @@ torch::Tensor shift_common( triton::driver::cu_buffer c(ctx, (CUdeviceptr)torchc.storage().data(), false); // Enqueue - shift.enqueue(&stream, {&a, &b, &c}, false); + shift.enqueue(&stream, {&a, &b, &c}, true); return torchc; } diff --git a/include/triton/ir/constant.h b/include/triton/ir/constant.h index 43aa41c6d..49f11a1aa 100644 --- a/include/triton/ir/constant.h +++ b/include/triton/ir/constant.h @@ -67,6 +67,8 @@ class constant_range: public constant{ public: static constant *get(constant_int *first, constant_int *last); + const constant_int* get_first() const; + const constant_int* get_last() const; private: constant_int* first_; diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index d3088d73b..a88cb2ddf 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -70,7 +70,6 @@ public: void target_independent(ir::module &module) { optimize_dot.run(module); optimize_trans.run(module); -// ir::print(module, std::cout); } void target_dependent(ir::module &module) { diff --git a/lib/codegen/alignment_info.cpp b/lib/codegen/alignment_info.cpp index 5a7dc5fcd..5b7564479 100644 --- a/lib/codegen/alignment_info.cpp +++ b/lib/codegen/alignment_info.cpp @@ -34,6 +34,8 @@ bool alignment_info::populate_is_constant(ir::value *v) { if(is_first_axis_unit(op)) return cache(true); } + if(auto *x = dynamic_cast(v)) + return cache(true); if(auto *x = dynamic_cast(v)){ bool lhs = populate_is_constant(x->get_operand(0)); bool rhs = populate_is_constant(x->get_operand(1)); @@ -138,6 +140,18 @@ unsigned alignment_info::populate_max_contiguous(ir::value *v){ return cache(1); } +inline int gcd(int a, int b) { + if (a == 0) + return b; + if (b == 0) + return a; + if (a == b) + return a; + if (a > b) + return gcd(a-b, b); + return gcd(a, b-a); +} + unsigned alignment_info::populate_starting_multiple(ir::value *v){ if(starting_multiple_.find(v) != starting_multiple_.end()) return starting_multiple_.at(v); @@ -168,7 +182,7 @@ unsigned alignment_info::populate_starting_multiple(ir::value *v){ if(x->is_int_mult()) return cache(lhs * rhs); if(x->is_int_add_sub()) - return cache(std::min(lhs, rhs)); + return cache(gcd(lhs, rhs)); if(x->is_int_div()) return cache(std::max(lhs / rhs, 1)); if(x->is_int_rem()) @@ -178,10 +192,15 @@ unsigned alignment_info::populate_starting_multiple(ir::value *v){ if(x->is_shr()) return cache(std::max(lhs >> rhs, 1)); } + if(auto *x = dynamic_cast(v)) + return cache(x->get_value()); + if(auto *x = dynamic_cast(v)){ + return cache(x->get_first()->get_value()); + } if(auto *x = dynamic_cast(v)){ int lhs = populate_starting_multiple(x->get_operand(0)); int rhs = populate_starting_multiple(x->get_operand(1)); - return cache(std::min(lhs, rhs)); + return 
cache(gcd(lhs, rhs)); } if(auto *x = dynamic_cast(v)){ int op = populate_starting_multiple(x->get_operand(0)); @@ -193,7 +212,7 @@ unsigned alignment_info::populate_starting_multiple(ir::value *v){ if(auto *x = dynamic_cast(v)){ int value_true = populate_starting_multiple(x->get_value_true()); int value_false = populate_starting_multiple(x->get_value_false()); - return cache(std::min(value_true, value_false)); + return cache(gcd(value_true, value_false)); } if(auto *x = dynamic_cast(v)){ // put a conservative initial value in phi node to avoid infinite recursion @@ -207,7 +226,7 @@ unsigned alignment_info::populate_starting_multiple(ir::value *v){ // recurse for(unsigned n = 0; n < x->get_num_incoming(); n++){ ir::value* inc = x->get_incoming_value(n); - result = std::min(result, populate_starting_multiple(inc)); + result = gcd(result, populate_starting_multiple(inc)); } return cache(result); } @@ -230,7 +249,7 @@ unsigned alignment_info::get_max_contiguous(ir::value* v) const { return max_contiguous_.at(v); } - +///TODO: This doesn't seem to work in DOT-NN, DOT-TT, DOT-TN void alignment_info::run(ir::module &mod) { // populate constant for(ir::function *fn: mod.get_function_list()) @@ -251,6 +270,7 @@ void alignment_info::run(ir::module &mod) { for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i: block->get_inst_list()){ populate_max_contiguous(i); +// std::cout << i->get_name() << " " << max_contiguous_.at(i) << " " << starting_multiple_.at(i) << std::endl; } } diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index fcb519c4a..2812d00a2 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -221,7 +221,7 @@ void tune::run(ir::module &mod) { } else { ir::metaparameter *fpw = ir::metaparameter::create(ctx, ty, 2, 2); - ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 1, 4); + ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 2, 4); connected_components(node, {fpw, wpt}, {"fpw", "wpt"}, nodes_, dependencies_, group_id++); } } @@ -235,7 +235,7 @@ void tune::run(ir::module &mod) { continue; if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 2, 4)); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 4, 4)); *params_.at(i).at("nts.d0") = *tmp; } if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ diff --git a/lib/dnn/gemm.cpp b/lib/dnn/gemm.cpp index eb0042901..05a47e41f 100644 --- a/lib/dnn/gemm.cpp +++ b/lib/dnn/gemm.cpp @@ -117,16 +117,11 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, int32 *locks, int32 grid0, int32 grid1) { int32 rxa[TM] = get_global_range[TM](0); int32 ryb[TN] = get_global_range[TN](1); - int32 rz = get_global_range[1](2); int32 rka[TK] = 0 ... TK; int32 rkb[TK] = 0 ... 
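/* Phi nodes get a conservative seed before recursion (note the early cache()
   call above): without it, a loop-carried pointer such as p = phi(p0, p + TK)
   would recurse forever. The seed is then refined with gcd over all incoming
   values, so a pointer advanced by TK each iteration keeps
   gcd(multiple(p0), multiple(TK)) as its guaranteed starting multiple. */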
TK; fp32 c[TM, TN] = 0; - int32 div = K / GZ; - int32 rem = K % GZ; - K = select(rz < rem, div - 1, div); - int32 offk = select(rz < rem, rz*(div + 1), rz*div + rem); - )" + a_ty_ + R"(* pa[)" + AS0 + ", " + AS1 + "] = A + (offk + rka" + bca0 + ")" + lda0 + " + rxa" + bca1 + lda1 + R"(; - )" + b_ty_ + R"(* pb[)" + BS0 + ", " + BS1 + "] = B + (offk + rkb" + bcb0 + ")" + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; + )" + a_ty_ + R"(* pa[)" + AS0 + ", " + AS1 + "] = A + rka" + bca0 + lda0 + " + rxa" + bca1 + lda1 + R"(; + )" + b_ty_ + R"(* pb[)" + BS0 + ", " + BS1 + "] = B + rkb" + bcb0 + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; )" + a_ty_ + R"( a[)" + AS0 + ", " + AS1 + R"(] = *pa; )" + b_ty_ + R"( b[)" + BS0 + ", " + BS1 + R"(] = *pb; int32 last_a = ((M*K - 1) - (TM*TK + 1)) / lda; @@ -146,8 +141,8 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, for(int32 k = bound; k > 0; k = k - 1){ int1 checka[TM, 1] = rxc[:, newaxis] < M; int1 checkb[TN, 1] = ryc[:, newaxis] < N; - )" + a_ty_ + R"(* pa[TM, 1] = A + (offk + K - k))" + lda0 + " + rxc[:, newaxis]" + lda1 + R"(; - )" + b_ty_ + R"(* pb[TN, 1] = B + (offk + K - k))" + ldb0 + " + ryc[:, newaxis]" + ldb1 + R"(; + )" + a_ty_ + R"(* pa[TM, 1] = A + (K - k))" + lda0 + " + rxc[:, newaxis]" + lda1 + R"(; + )" + b_ty_ + R"(* pb[TN, 1] = B + (K - k))" + ldb0 + " + ryc[:, newaxis]" + ldb1 + R"(; )" + a_ty_ + R"( a[TM, 1] = checka ? *pa : 0; )" + b_ty_ + R"( b[TN, 1] = checkb ? *pb : 0; c = dot(a, trans(b), c); diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index c4f1b6dc6..47e283769 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -298,9 +298,9 @@ void shift::triton_c_src(std::ostream &os) const { if(is_chwn) { return R"( int32 )" + rx + "wh[" + sz + "] = " + rkx + " / " + B + R"(; - int32 )" + rx + "b[" + sz + "] = " + rkx + " % " + B + R"(); - int32 )" + rx + "w[" + sz + "] = (" + rx + "(wh % " + CW + R"() + pad_w; - int32 )" + rx + "h[" + sz + "] = (" + rx + "(wh / " + CW + R"() + pad_h;)"; + int32 )" + rx + "b[" + sz + "] = " + rkx + " % " + B + R"(; + int32 )" + rx + "w[" + sz + "] = (" + rx + "wh % " + CW + R"() + pad_w; + int32 )" + rx + "h[" + sz + "] = (" + rx + "wh / " + CW + R"() + pad_h;)"; } else { return R"( diff --git a/lib/ir/constant.cpp b/lib/ir/constant.cpp index 5df644842..a2341f52f 100644 --- a/lib/ir/constant.cpp +++ b/lib/ir/constant.cpp @@ -71,6 +71,13 @@ constant *constant_range::get(constant_int *first, constant_int *last) { return new constant_range(ty, first, last); } +const constant_int* constant_range::get_first() const { + return first_; +} + +const constant_int* constant_range::get_last() const { + return last_; +} // constant_fp // FIXME use something like APFloat diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index 2f4ddd710..d32c8722e 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -51,6 +51,7 @@ void loop_nest(std::vector const & ranges, values[i--] = 0; } i = D - 1; + std::this_thread::sleep_for(std::chrono::microseconds(1)); } } From 71594da66ff6c41d7c6c27605b4627a5957efd9c Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 18 Jul 2019 16:35:48 -0700 Subject: [PATCH 254/494] [dnn/gemm]: fixed leading dimension in transposed variants --- examples/cpp/dot.cpp | 22 +++++++++++----------- lib/codegen/selection.cpp | 1 + lib/dnn/gemm.cpp | 9 ++++++--- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 2e790c2f5..5068dfbde 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -8,12 +8,12 @@ int main() 
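/* The lines deleted from the matmul source above implemented split-K: the
   reduction was partitioned over GZ groups, the first K % GZ groups taking
   one extra element. A host-side sketch of the intended partition
   (illustrative only; the kernel-side details differ):

     void k_slice(int K, int GZ, int rz, int *len, int *off) {
       int div = K / GZ, rem = K % GZ;
       *len = (rz < rem) ? div + 1 : div;
       *off = (rz < rem) ? rz * (div + 1) : rz * div + rem;
     }

   matching the select(rz < rem, ...) expressions that computed offk. */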
{ - bool AT = false; + bool AT = true; bool BT = false; // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); // matrix multiplication parameters - int32_t M = 1024, N = 1024, K = 1024; + int32_t M = 64, N = 128, K = 128; std::vector hc(M*N); std::vector rc(M*N); std::vector ha(M*K); @@ -33,14 +33,14 @@ int main() { stream->write(db, true, 0, hb); stream->write(dc, true, 0, hc); stream->synchronize(); - triton::dnn::gemm gemm(M, N, K, AT, BT, "fp16", "fp16", 4, 4); - gemm.enqueue(stream, {da, db, dc}, true); -// stream->read(dc, true, 0, hc); -// gemm.cpu_ref(rc, ha, hb); -// for(size_t i = 0; i < M*N; i++) -// if(!std::isnan(hc[i]) && std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ -// std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; -// exit(EXIT_FAILURE); -// } + triton::dnn::gemm gemm(M, N, K, AT, BT, "fp32", "fp32", 4, 4); + gemm.enqueue(stream, {da, db, dc}, false); + stream->read(dc, true, 0, hc); + gemm.cpu_ref(rc, ha, hb); + for(size_t i = 0; i < M*N; i++) + if(std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ + std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; + exit(EXIT_FAILURE); + } std::cout << "Pass!" << std::endl; } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 04a413b32..3238efdfa 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -1149,6 +1149,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & unsigned alignment = std::min(starting_multiple, max_contiguous); unsigned vector_size = std::min(result->axis(0).contiguous, alignment); // vector_size = result->axis(0).contiguous; +// vector_size = 1; std::map packets; distributed_tile *TP = (distributed_tile*)tmap_.at(ld->get_pointer_operand()); result->for_each([&](indices_t idx){ diff --git a/lib/dnn/gemm.cpp b/lib/dnn/gemm.cpp index 05a47e41f..222173c61 100644 --- a/lib/dnn/gemm.cpp +++ b/lib/dnn/gemm.cpp @@ -54,6 +54,9 @@ void gemm::enqueue_impl(driver::stream *stream, driver::kernel *kernel, unsigned grid_0 = (M_ + TM - 1)/TM; unsigned grid_1 = (N_ + TN - 1)/TN; unsigned grid_2 = 1; + int32_t lda = AT_ ? K_ : M_; + int32_t ldb = BT_ ? 
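/* Leading dimensions for the transposed variants, as fixed by this patch:
   with column-major storage the leading dimension is the length of the first
   stored axis, so

     lda = AT ? K : M;   ldb = BT ? N : K;   ldc = M;

   Previously M, N, M were passed unconditionally, which was only correct for
   the NT case. */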
N_ : K_; + int32_t ldc = M_; std::array<size_t, 3> grid = {grid_0, grid_1, grid_2}; kernel->setArg(0, a); @@ -61,9 +64,9 @@ void gemm::enqueue_impl(driver::stream *stream, driver::kernel *kernel, kernel->setArg(3, M_); kernel->setArg(4, N_); kernel->setArg(5, K_); - kernel->setArg(6, M_); - kernel->setArg(7, N_); - kernel->setArg(8, M_); + kernel->setArg(6, lda); + kernel->setArg(7, ldb); + kernel->setArg(8, ldc); kernel->setArg(9, locks_); kernel->setArg(10, grid_0); kernel->setArg(11, grid_1); From 5215fb0424ecc0b482dd040d7c8fce7d29d5befe Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 18 Jul 2019 19:39:40 -0700 Subject: [PATCH 255/494] [codegen] some more optimizations --- examples/cpp/dot.cpp | 43 +++++++++++++++++--------------- examples/cpp/shift.cpp | 10 ++++---- include/triton/driver/dispatch.h | 6 +++++ lib/codegen/selection.cpp | 14 ++++++++--- lib/codegen/shmem_allocation.cpp | 4 +-- lib/codegen/target.cpp | 38 +++++++++++++++++++++++++--- lib/codegen/tune.cpp | 8 +++--- lib/dnn/base.cpp | 5 ++-- lib/dnn/gemm.cpp | 29 ++++----------------- lib/driver/dispatch.cpp | 6 +++++ lib/driver/kernel.cpp | 1 + lib/runtime/jit.cpp | 17 ++++++------- 12 files changed, 108 insertions(+), 73 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 5068dfbde..b5af64615 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -10,37 +10,40 @@ int main() { bool AT = true; bool BT = false; + typedef float T; + std::string ty = "fp16"; + size_t dt_nbytes = sizeof(T); // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); // matrix multiplication parameters - int32_t M = 64, N = 128, K = 128; - std::vector<float> hc(M*N); - std::vector<float> rc(M*N); - std::vector<float> ha(M*K); - std::vector<float> hb(K*N); + int32_t M = 65536, N = 2048, K = 2048; + std::vector<T> hc(M*N); + std::vector<T> rc(M*N); + std::vector<T> ha(M*K); + std::vector<T> hb(K*N); srand(0); for(size_t i = 0; i < ha.size(); i++) - ha[i] = (float)rand()/RAND_MAX; + ha[i] = (T)rand()/RAND_MAX; for(size_t i = 0; i < hb.size(); i++) - hb[i] = (float)rand()/RAND_MAX; + hb[i] = (T)rand()/RAND_MAX; for(size_t i = 0; i < hc.size(); i++) hc[i] = 0; - triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*4); - triton::driver::buffer* da = triton::driver::buffer::create(context, ha.size()*4); - triton::driver::buffer* db = triton::driver::buffer::create(context, hb.size()*4); + triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*dt_nbytes); + triton::driver::buffer* da = triton::driver::buffer::create(context, ha.size()*dt_nbytes); + triton::driver::buffer* db = triton::driver::buffer::create(context, hb.size()*dt_nbytes); triton::driver::stream* stream = triton::driver::stream::create(context); stream->write(da, true, 0, ha); stream->write(db, true, 0, hb); stream->write(dc, true, 0, hc); stream->synchronize(); - triton::dnn::gemm gemm(M, N, K, AT, BT, "fp32", "fp32", 4, 4); - gemm.enqueue(stream, {da, db, dc}, false); - stream->read(dc, true, 0, hc); - gemm.cpu_ref(rc, ha, hb); - for(size_t i = 0; i < M*N; i++) - if(std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ - std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; - exit(EXIT_FAILURE); - } - std::cout << "Pass!" 
<< std::endl; + triton::dnn::gemm gemm(M, N, K, AT, BT, ty, ty, 4, 4); + gemm.enqueue(stream, {da, db, dc}, true); +// stream->read(dc, true, 0, hc); +// gemm.cpu_ref(rc, ha, hb); +// for(size_t i = 0; i < M*N; i++) +// if(std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ +// std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; +// exit(EXIT_FAILURE); +// } +// std::cout << "Pass!" << std::endl; } diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index 482fad6b4..739b35117 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -18,22 +18,22 @@ int main() { // initialization int32_t R = 3, S = 3; - int32_t B = 128, F = 128; + int32_t B = 16, F = 4096; int32_t H = 16, W = 16; - int32_t C = 128; + int32_t C = 4096; // random shifts std::vector shift_h(C); std::vector shift_w(C); for(int32_t c = 0; c < C; c++){ - shift_h[c] = rand() % R - R/2; - shift_w[c] = rand() % S - S/2; + shift_h[c] = 0; + shift_w[c] = 0; } // configuration triton::dnn::shift shift(B, C, 1, H, W, 1, R, S, F, 1, 1, shift_h.data(), shift_w.data(), numeric_t_str, numeric_t_str, - op, false, triton::dnn::shift::NCHW); + op, false, triton::dnn::shift::CHWN); // host buffers size_t a_size = B*C*H*W; size_t b_size = C*F; diff --git a/include/triton/driver/dispatch.h b/include/triton/driver/dispatch.h index 0e1db604b..c1f4f01f9 100755 --- a/include/triton/driver/dispatch.h +++ b/include/triton/driver/dispatch.h @@ -159,6 +159,9 @@ public: static CUresult cuPointerGetAttribute(void * data, CUpointer_attribute attribute, CUdeviceptr ptr); static CUresult cuCtxGetDevice(CUdevice* result); static CUresult cuMemsetD8Async(CUdeviceptr dst, unsigned char x, size_t N, CUstream stream); + static CUresult cuFuncGetAttribute(int* pi, CUfunction_attribute attrib, CUfunction hfunc); + static CUresult cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value); + static CUresult cuFuncSetCacheConfig (CUfunction hfunc, CUfunc_cache config); // NVML static nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2( const char* pciBusId, nvmlDevice_t* device); static nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); @@ -252,6 +255,9 @@ private: static void* cuMemsetD8Async_; static void* cuCtxPushCurrent_v2_; static void* cuCtxPopCurrent_v2_; + static void* cuFuncGetAttribute_; + static void* cuFuncSetAttribute_; + static void* cuFuncSetCacheConfig_; // NVML static void* nvmlInit_v2_; static void* nvmlDeviceGetHandleByPciBusId_v2_; diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 3238efdfa..b5cd54a8b 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -529,8 +529,8 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id unsigned num_rep_0 = shapes[0]->get_value() / hmma_bts_0; unsigned num_rep_1 = shapes[1]->get_value() / hmma_bts_1; // size of each pack (interleaving) - pack_size_0_ = std::min(num_rep_0, 1); - pack_size_1_ = std::min(num_rep_1, 1); + pack_size_0_ = std::min(num_rep_0, 2); + pack_size_1_ = std::min(num_rep_1, 2); // number of packs (interleaving) num_packs_0_ = num_rep_0 / pack_size_0_; num_packs_1_ = num_rep_1 / pack_size_1_; @@ -1148,7 +1148,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & unsigned max_contiguous = axis_info_->get_max_contiguous(ptr); unsigned alignment = std::min(starting_multiple, max_contiguous); unsigned vector_size = std::min(result->axis(0).contiguous, alignment); -// vector_size = 
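/* Vectorized loads: the width is capped by both the contiguity of the tile
   axis and the pointer's provable alignment, which alignment_info reports as
   min(starting multiple, max contiguous). In plain form (sketch):

     unsigned vector_size(unsigned contiguous, unsigned multiple, unsigned max_contig) {
       unsigned alignment = std::min(multiple, max_contig);
       return std::min(contiguous, alignment);
     }

   The line being edited here overrides the alignment cap and trusts
   contiguity alone. */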
result->axis(0).contiguous; + vector_size = result->axis(0).contiguous; // vector_size = 1; std::map packets; distributed_tile *TP = (distributed_tile*)tmap_.at(ld->get_pointer_operand()); @@ -1251,6 +1251,14 @@ void selection::run(ir::module &src, Module &dst) { dst_fn->addAttribute(id, llvm_attr(dst_ctx, attr)); } tgt_->set_kernel(dst_builder, dst_ctx, &dst, dst_fn); + // set metadata + Metadata *md_args[] = { + ValueAsMetadata::get(dst_fn), + MDString::get(dst_ctx, "maxntidx"), + ValueAsMetadata::get(dst_builder.getInt32(params_->get_num_threads())) + }; + dst.getOrInsertNamedMetadata("nvvm.annotations")->addOperand(MDNode::get(dst_ctx, md_args)); + // map parameters for(unsigned i = 0; i < fn->args().size(); i++) diff --git a/lib/codegen/shmem_allocation.cpp b/lib/codegen/shmem_allocation.cpp index 4031864c2..699406f08 100644 --- a/lib/codegen/shmem_allocation.cpp +++ b/lib/codegen/shmem_allocation.cpp @@ -21,13 +21,13 @@ unsigned shmem_allocation::is_ld_padded(ir::value *x) { bool is_op_1 = x == dot->get_operand(1); if(is_hmma && is_op_0){ if(dot->is_a_trans()) - return 4; + return 8; else return 16; } if(is_hmma && is_op_1){ if(!dot->is_b_trans()) - return 4; + return 8; else return 16; } diff --git a/lib/codegen/target.cpp b/lib/codegen/target.cpp index 2554bf5c3..2e20839d9 100644 --- a/lib/codegen/target.cpp +++ b/lib/codegen/target.cpp @@ -77,14 +77,44 @@ Value* nvidia_cu_target::get_global_offset(Module *module, IRBuilder<>& builder, } Value* nvidia_cu_target::get_block_id(Module *module, IRBuilder<>& builder, unsigned ax) { - static std::array ids = { + static std::array cta_ids = { Intrinsic::nvvm_read_ptx_sreg_ctaid_x, Intrinsic::nvvm_read_ptx_sreg_ctaid_y, Intrinsic::nvvm_read_ptx_sreg_ctaid_z }; - Value* get_group_id = Intrinsic::getDeclaration(module, ids[ax]); - Value* group_id = builder.CreateCall(get_group_id, {}); - return group_id; + bool z_order = true; + if(z_order && ax < 2){ + static std::array n_cta_ids = { + Intrinsic::nvvm_read_ptx_sreg_nctaid_x, + Intrinsic::nvvm_read_ptx_sreg_nctaid_y, + Intrinsic::nvvm_read_ptx_sreg_nctaid_z + }; + Value* cta_id_0 = builder.CreateIntrinsic(cta_ids[0], {}, {}); + Value* cta_id_1 = builder.CreateIntrinsic(cta_ids[1], {}, {}); + Value* n_cta_id_0 = builder.CreateIntrinsic(n_cta_ids[0], {}, {}); + Value* n_cta_id_1 = builder.CreateIntrinsic(n_cta_ids[1], {}, {}); + // global block ID + Value* bid = builder.CreateAdd(cta_id_0, builder.CreateMul(cta_id_1, n_cta_id_0)); + // helper for minimum + auto Min = [&](Value *x, Value *y){ + return builder.CreateSelect(builder.CreateICmpSGE(x, y), y, x); + }; + // super-tile size + Value* sts = Min(builder.getInt32(16), n_cta_id_1); + // number of CTAs per super-block + Value *nscta = builder.CreateMul(n_cta_id_0, sts); + Value *bid0 = builder.CreateURem(builder.CreateUDiv(bid, sts), n_cta_id_0); + Value *bid1 = builder.CreateAdd(builder.CreateMul(builder.CreateUDiv(bid, nscta), sts),builder.CreateURem(bid, sts)); + if(ax == 0) + return bid0; + else + return bid1; + } + else{ + Value* get_cta_id = Intrinsic::getDeclaration(module, cta_ids[ax]); + Value* cta_id = builder.CreateCall(get_cta_id, {}); + return cta_id; + } } Value* nvidia_cu_target::get_local_id(Module *module, IRBuilder<>& builder, unsigned ax) { diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 2812d00a2..1d24b9548 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -215,7 +215,7 @@ void tune::run(ir::module &mod) { node_t node = *nodes_.begin(); if(fragments_[node] == STRIDED_SCAN) { ir::metaparameter 
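/* The z_order remapping above folds linear CTA ids into 16-high "super-tiles"
   so that neighbouring CTAs reuse the same rows of A and columns of B while
   they are hot in L2. Host-side sketch of the same index arithmetic (n0, n1
   are the grid dimensions, bid = cta0 + cta1 * n0):

     int sts   = std::min(16, n1);        // super-tile height
     int nscta = n0 * sts;                // CTAs per super-block
     int bid0  = (bid / sts) % n0;
     int bid1  = (bid / nscta) * sts + bid % sts;

   mirroring the CreateUDiv/CreateURem sequence emitted in target.cpp. */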
*nts = ir::metaparameter::create(ctx, ty, 1, 1); - ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 4, 32); + ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 2, 64); connected_components(node, {nts, mts}, {"nts", "mts"}, nodes_, dependencies_, group_id++); nts->set_value(1); } @@ -235,13 +235,13 @@ void tune::run(ir::module &mod) { continue; if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 4, 4)); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 8, 8)); *params_.at(i).at("nts.d0") = *tmp; } if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 4, 4)); - std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 4, 4)); + std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 8, 8)); + std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 8, 8)); *params_.at(i).at("nts.d0") = *tmp1; *params_.at(i).at("nts.d1") = *tmp2; } diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index a3a3ce403..efca7bec3 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -30,7 +30,7 @@ void base::enqueue(driver::stream *stream, std::vector args, b /* the current template has not already been compiled */ if(m_jit.find(this) == m_jit.end()) { base* clone = this->clone(); - jit = m_jit.emplace(clone, std::unique_ptr(new rt::jit(ctx))).first->second.get(); + jit = m_jit.emplace(clone, std::unique_ptr(new rt::jit(ctx, 8))).first->second.get(); std::ostringstream oss; clone->triton_c_src(oss); std::string src = oss.str(); @@ -51,7 +51,8 @@ void base::enqueue(driver::stream *stream, std::vector args, b jit->add_module(name_.c_str(), src.c_str(), best.params); } else { - jit->add_module(name_.c_str(), src.c_str(), jit->get_valid(name_.c_str(), src.c_str())); +// jit->add_module(name_.c_str(), src.c_str(), jit->get_valid(name_.c_str(), src.c_str())); + jit->add_module(name_.c_str(), src.c_str(), {32, 128, 16, 128, 2, 2, 2, 2, 4, 4, 32, 8, 4, 1}); } triton::driver::kernel* kernel = jit->get_function(name_.c_str()); clone->init_impl(stream, (triton::driver::cu_module*)kernel->module()); diff --git a/lib/dnn/gemm.cpp b/lib/dnn/gemm.cpp index 222173c61..6a9bace7d 100644 --- a/lib/dnn/gemm.cpp +++ b/lib/dnn/gemm.cpp @@ -49,8 +49,8 @@ void gemm::enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, runtime::launch_information info) { driver::buffer *a = args[0], *b = args[1], *c = args[2]; - unsigned TM = info.global_range_size[0]; - unsigned TN = info.global_range_size[1]; + unsigned TM = info.globals.at("TM"); + unsigned TN = info.globals.at("TN"); unsigned grid_0 = (M_ + TM - 1)/TM; unsigned grid_1 = (N_ + TN - 1)/TN; unsigned grid_2 = 1; @@ -109,7 +109,7 @@ void gemm::triton_c_src(std::ostream &os) const { R"( const tunable int32 TM = {16, 32, 64, 128}; const tunable int32 TN = {16, 32, 64, 128}; -const tunable int32 TK = {16}; +const tunable int32 TK = {32}; const tunable int32 GZ = {1}; void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, @@ -127,12 +127,7 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, )" + b_ty_ + R"(* pb[)" + BS0 + ", " + BS1 + "] = B + rkb" + bcb0 + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; )" + a_ty_ + R"( a[)" + AS0 + ", " + AS1 + R"(] = *pa; )" + b_ty_ + R"( b[)" + BS0 + ", " + BS1 + R"(] = *pb; - int32 last_a = ((M*K - 1) - (TM*TK + 1)) / lda; - int32 last_b = ((K*N - 1) - (TN*TK + 1)) / ldb; - last_a = last_a 
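/* Launch-grid sizing in enqueue_impl uses the usual ceiling division so that
   partial tiles at the matrix edges are still covered:

     unsigned cdiv(unsigned x, unsigned y) { return (x + y - 1) / y; }
     // grid = { cdiv(M, TM), cdiv(N, TN), 1 }

   with TM and TN now read from info.globals, i.e. the tuned tile sizes,
   instead of the former global_range metadata. */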
/ TK * TK; - last_b = last_b / TK * TK; - int32 bound = K - max(last_a, last_b); - for(int32 k = K; k > bound; k = k - TK){ + for(int32 k = K; k > TK; k = k - TK){ c = dot()" + usea + ", " + useb + R"(, c); pa = pa + TK)" + lda0 + R"(; pb = pb + TK)" + ldb0 + R"(; @@ -141,22 +136,8 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, } int32 rxc[TM] = get_global_range[TM](0); int32 ryc[TN] = get_global_range[TN](1); - for(int32 k = bound; k > 0; k = k - 1){ - int1 checka[TM, 1] = rxc[:, newaxis] < M; - int1 checkb[TN, 1] = ryc[:, newaxis] < N; - )" + a_ty_ + R"(* pa[TM, 1] = A + (K - k))" + lda0 + " + rxc[:, newaxis]" + lda1 + R"(; - )" + b_ty_ + R"(* pb[TN, 1] = B + (K - k))" + ldb0 + " + ryc[:, newaxis]" + ldb1 + R"(; - )" + a_ty_ + R"( a[TM, 1] = checka ? *pa : 0; - )" + b_ty_ + R"( b[TN, 1] = checkb ? *pb : 0; - c = dot(a, trans(b), c); - } - int32 ridx = get_range_id(0); - int32 ridy = get_range_id(1); - int1 checkc0[TM] = rxc < M; - int1 checkc1[TN] = ryc < N; - int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - @checkc *pc = c; + *pc = c; } )"; os << res; diff --git a/lib/driver/dispatch.cpp b/lib/driver/dispatch.cpp index e0f75a586..ee5a36b85 100755 --- a/lib/driver/dispatch.cpp +++ b/lib/driver/dispatch.cpp @@ -175,6 +175,9 @@ CUDA_DEFINE1(CUresult, cuCtxSetCurrent, CUcontext) CUDA_DEFINE4(CUresult, cuMemsetD8Async, CUdeviceptr, unsigned char, size_t, CUstream) CUDA_DEFINE1(CUresult, cuCtxPushCurrent_v2, CUcontext) CUDA_DEFINE1(CUresult, cuCtxPopCurrent_v2, CUcontext*) +CUDA_DEFINE3(CUresult, cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction) +CUDA_DEFINE3(CUresult, cuFuncSetAttribute, CUfunction, CUfunction_attribute, int) +CUDA_DEFINE2(CUresult, cuFuncSetCacheConfig, CUfunction, CUfunc_cache) NVML_DEFINE2(nvmlReturn_t, nvmlDeviceGetHandleByPciBusId_v2, const char *, nvmlDevice_t*) NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*) @@ -316,6 +319,9 @@ void* dispatch::cuCtxGetDevice_; void* dispatch::cuMemsetD8Async_; void* dispatch::cuCtxPushCurrent_v2_; void* dispatch::cuCtxPopCurrent_v2_; +void* dispatch::cuFuncGetAttribute_; +void* dispatch::cuFuncSetAttribute_; +void* dispatch::cuFuncSetCacheConfig_; void* dispatch::nvmlInit_v2_; void* dispatch::nvmlDeviceGetHandleByPciBusId_v2_; diff --git a/lib/driver/kernel.cpp b/lib/driver/kernel.cpp index a16e3e6f9..e4b5ac76c 100755 --- a/lib/driver/kernel.cpp +++ b/lib/driver/kernel.cpp @@ -124,6 +124,7 @@ cu_kernel::cu_kernel(driver::module *program, const char * name) : kernel(progra cu_params_store_.reserve(64); cu_params_.reserve(64); dispatch::cuModuleGetFunction(&*cu_, *program->cu(), name); +// dispatch::cuFuncSetCacheConfig(*cu_, CU_FUNC_CACHE_PREFER_SHARED); } void cu_kernel::setArg(unsigned int index, std::size_t size, void* ptr){ diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index d32c8722e..6e0f72334 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -40,17 +40,16 @@ void loop_nest(std::vector const & ranges, ThreadPool pool(nthreads); // Start with innermost loop size_t i = D - 1; -// size_t current = 0; while(true){ - //Execute function - pool.enqueue([values, &f](){ f(values); }); -// f(values); + // Execute function + pool.enqueue(f,values); while(values[i]++ == ranges[i] - 1){ if(i == 0) return; values[i--] = 0; } i = D - 1; + // Small sleep so that the thread pool doesn't grow too big std::this_thread::sleep_for(std::chrono::microseconds(1)); } } @@ 
-201,19 +200,19 @@ jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t ben launch_information info; llvm::LLVMContext llvm_context; auto ll_module = make_llvm_module(tt_module_1, passes_1, llvm_context, info); - std::unique_ptr module(driver::module::create(driver_context_, &*ll_module)); - std::unique_ptr kernel(driver::kernel::create(module.get(), name)); double perf; { std::lock_guard lock(mutex); + std::unique_ptr module(driver::module::create(driver_context_, &*ll_module)); + std::unique_ptr kernel(driver::kernel::create(module.get(), name)); perf = benchmark(kernel.get(), info); if(perf > best.perf){ best.perf = perf; best.params = params; } - for(unsigned p: params) - std::cout << p << " " << std::flush; - std::cout << perf << " [ " << best.perf << " ] " << std::endl; + for(size_t i = 0; i < params.size(); i++) + std::cout << ((i==0)?"":", ") << params[i] << std::flush; + std::cout << ", " << perf << " [ " << best.perf << " ] " << std::endl; } }, nthreads_); std::cout << "Autotuning done - Best performance: " << best.perf << std::endl; From 28c250216cae840d3c59f55cb61cdece833e3e90 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 19 Jul 2019 21:32:55 -0700 Subject: [PATCH 256/494] [dnn/gemm] added some bounds checking --- examples/cpp/dot.cpp | 6 ++-- include/triton/dnn/gemm.h | 3 -- lib/codegen/tune.cpp | 2 +- lib/dnn/gemm.cpp | 60 ++++++++++++++++++++------------------- lib/runtime/jit.cpp | 2 +- 5 files changed, 36 insertions(+), 37 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index b5af64615..720c872f2 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -8,15 +8,15 @@ int main() { - bool AT = true; - bool BT = false; + bool AT = false; + bool BT = true; typedef float T; std::string ty = "fp16"; size_t dt_nbytes = sizeof(T); // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); // matrix multiplication parameters - int32_t M = 65536, N = 2048, K = 2048; + int32_t M = 4096, N = 4096, K = 4096; std::vector hc(M*N); std::vector rc(M*N); std::vector ha(M*K); diff --git a/include/triton/dnn/gemm.h b/include/triton/dnn/gemm.h index 8348edf3e..f43370606 100644 --- a/include/triton/dnn/gemm.h +++ b/include/triton/dnn/gemm.h @@ -31,9 +31,6 @@ public: // clone base* clone() const; - // default params - std::vector default_params(); - // CPU reference implementation template static void cpu_ref(std::vector &c, const std::vector &a, const std::vector &b, diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 1d24b9548..2d104d8d6 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -235,7 +235,7 @@ void tune::run(ir::module &mod) { continue; if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 8, 8)); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 4, 8)); *params_.at(i).at("nts.d0") = *tmp; } if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ diff --git a/lib/dnn/gemm.cpp b/lib/dnn/gemm.cpp index 6a9bace7d..82fdb431b 100644 --- a/lib/dnn/gemm.cpp +++ b/lib/dnn/gemm.cpp @@ -51,6 +51,7 @@ void gemm::enqueue_impl(driver::stream *stream, driver::kernel *kernel, driver::buffer *a = args[0], *b = args[1], *c = args[2]; unsigned TM = info.globals.at("TM"); unsigned TN = info.globals.at("TN"); + unsigned TK = info.globals.at("TK"); unsigned grid_0 = (M_ + TM - 1)/TM; unsigned grid_1 = (N_ + TN - 1)/TN; unsigned grid_2 = 1; @@ -67,23 +68,13 @@ void 
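/* The autotune change above moves driver::module and driver::kernel creation
   inside the mutex: compiling the generated PTX goes through the shared CUDA
   context, so only the pure LLVM-IR generation is left to run in parallel
   across the tuning threads (an inference from the lock placement, not stated
   in the commit message). */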
gemm::enqueue_impl(driver::stream *stream, driver::kernel *kernel, kernel->setArg(6, lda); kernel->setArg(7, ldb); kernel->setArg(8, ldc); - kernel->setArg(9, locks_); - kernel->setArg(10, grid_0); - kernel->setArg(11, grid_1); + kernel->setArg(9, TK); + kernel->setArg(10, locks_); + kernel->setArg(11, grid_0); + kernel->setArg(12, grid_1); stream->enqueue(kernel, grid, {info.num_threads, 1, 1}); } -std::vector gemm::default_params() { - if(AT_ && BT_) - return {32, 64, 32, 64, 16, 8, 2, 2, 4, 2, 8, 4, 2, 1}; - else if(AT_ && !BT_) - return {32, 64, 32, 64, 16, 8, 2, 2, 4, 2, 8, 4, 2, 1}; - else if(!AT_ && BT_) - return {16, 2, 64, 16, 2, 64, 16, 8, 2, 2, 8, 8, 8, 1}; - else - return {16, 2, 128, 32, 32, 32, 4, 2, 2, 8, 8, 4, 2, 1}; -} - void gemm::triton_c_src(std::ostream &os) const { std::string AS0 = "TM", AS1 = "TK"; std::string BS0 = "TK", BS1 = "TN"; @@ -103,12 +94,14 @@ void gemm::triton_c_src(std::ostream &os) const { std::swap(bcb0, bcb1); std::swap(ldb0, ldb1); } + std::string AS = AS0 + ", " + AS1; + std::string BS = BS0 + ", " + BS1; std::string align_lda_str = "multiple_of(" + std::to_string(align_lda_) + ")"; std::string align_ldb_str = "multiple_of(" + std::to_string(align_ldb_) + ")"; std::string res = R"( -const tunable int32 TM = {16, 32, 64, 128}; -const tunable int32 TN = {16, 32, 64, 128}; +const tunable int32 TM = {32, 64, 128, 256}; +const tunable int32 TN = {32, 64, 128, 256}; const tunable int32 TK = {32}; const tunable int32 GZ = {1}; @@ -117,27 +110,36 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, fp32 *C, int32 M, int32 N, int32 K, )" + align_lda_str + R"( int32 lda, )" + align_ldb_str + R"(" int32 ldb, int32 ldc, - int32 *locks, int32 grid0, int32 grid1) { - int32 rxa[TM] = get_global_range[TM](0); - int32 ryb[TN] = get_global_range[TN](1); + int32 bound, int32 *locks, int32 grid0, int32 grid1) { + int32 ridx = get_range_id(0); + int32 ridy = get_range_id(1); + int32 rxa[TM] = ridx*TM + (0 ... TM); + int32 ryb[TN] = ridy*TN + (0 ... TN); int32 rka[TK] = 0 ... TK; int32 rkb[TK] = 0 ... TK; fp32 c[TM, TN] = 0; - )" + a_ty_ + R"(* pa[)" + AS0 + ", " + AS1 + "] = A + rka" + bca0 + lda0 + " + rxa" + bca1 + lda1 + R"(; - )" + b_ty_ + R"(* pb[)" + BS0 + ", " + BS1 + "] = B + rkb" + bcb0 + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; - )" + a_ty_ + R"( a[)" + AS0 + ", " + AS1 + R"(] = *pa; - )" + b_ty_ + R"( b[)" + BS0 + ", " + BS1 + R"(] = *pb; - for(int32 k = K; k > TK; k = k - TK){ + )" + a_ty_ + R"(* pa[)" + AS + "] = A + rka" + bca0 + lda0 + " + rxa" + bca1 + lda1 + R"(; + )" + b_ty_ + R"(* pb[)" + BS + "] = B + rkb" + bcb0 + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; + int1 checka[)" + AS + R"(] = (rka < K))" + bca0 + " && (rxa < M)" + bca1 + R"(; + int1 checkb[)" + BS + R"(] = (rkb < K))" + bcb0 + " && (ryb < N)" + bcb1 + R"(; + )" + a_ty_ + R"( a[)" + AS + R"(] = checka ? *pa : 0; + )" + b_ty_ + R"( b[)" + BS + R"(] = checkb ? *pb : 0; + for(int32 k = K; k > 0; k = k - TK){ c = dot()" + usea + ", " + useb + R"(, c); pa = pa + TK)" + lda0 + R"(; pb = pb + TK)" + ldb0 + R"(; - a = *pa; - b = *pb; + int1 checka[)" + AS + R"(] = k > bound; + int1 checkb[)" + BS + R"(] = k > bound; + @checka a = *pa; + @checkb b = *pb; } - int32 rxc[TM] = get_global_range[TM](0); - int32 ryc[TN] = get_global_range[TN](1); + int32 rxc[TM] = ridx*TM + (0 ... TM); + int32 ryc[TN] = ridy*TN + (0 ... 
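/* The bounds checking added here is predication rather than a scalar cleanup
   loop: masked loads return 0, which is neutral for the FMA accumulation, and
   the final store is guarded by checkc. Reference semantics for one output
   element, as a plain C sketch of what the checka/checkb guards compute for
   the non-transposed layout:

     float acc = 0.f;
     for (int k = 0; k < K; ++k) {
       float a = (row < M && k < K) ? A[(size_t)k * lda + row] : 0.f;  // checka
       float b = (col < N && k < K) ? B[(size_t)k * ldb + col] : 0.f;  // checkb
       acc += a * b;
     }
     if (row < M && col < N) C[(size_t)col * ldc + row] = acc;         // checkc
*/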
TN); + int1 checkc0[TM] = rxc < M; + int1 checkc1[TN] = ryc < N; + int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - *pc = c; + @checkc *pc = c; } )"; os << res; diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index 6e0f72334..c925a690c 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -49,7 +49,7 @@ void loop_nest(std::vector const & ranges, values[i--] = 0; } i = D - 1; - // Small sleep so that the thread pool doesn't grow too big + // Short sleep so that the thread pool doesn't grow too big std::this_thread::sleep_for(std::chrono::microseconds(1)); } } From d159455f7bbf4a32c5fa0ae526a4aa83587b2481 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 20 Jul 2019 21:44:18 -0700 Subject: [PATCH 257/494] [codegen/alignment_info] better alignment information --- examples/cpp/dot.cpp | 6 +- include/triton/codegen/alignment_info.h | 9 +- include/triton/runtime/jit.h | 1 + lib/codegen/alignment_info.cpp | 121 +++++++++++++++--------- lib/codegen/tune.cpp | 13 ++- lib/dnn/base.cpp | 3 +- lib/dnn/gemm.cpp | 4 +- lib/dnn/shift.cpp | 117 +++++++++-------------- lib/ir/instructions.cpp | 2 +- 9 files changed, 145 insertions(+), 131 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 720c872f2..09483116e 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -8,8 +8,8 @@ int main() { - bool AT = false; - bool BT = true; + bool AT = true; + bool BT = false; typedef float T; std::string ty = "fp16"; size_t dt_nbytes = sizeof(T); @@ -37,7 +37,7 @@ int main() { stream->write(dc, true, 0, hc); stream->synchronize(); triton::dnn::gemm gemm(M, N, K, AT, BT, ty, ty, 4, 4); - gemm.enqueue(stream, {da, db, dc}, true); + gemm.enqueue(stream, {da, db, dc}, false); // stream->read(dc, true, 0, hc); // gemm.cpu_ref(rc, ha, hb); // for(size_t i = 0; i < M*N; i++) diff --git a/include/triton/codegen/alignment_info.h b/include/triton/codegen/alignment_info.h index b90263dbe..d2d72e176 100644 --- a/include/triton/codegen/alignment_info.h +++ b/include/triton/codegen/alignment_info.h @@ -14,12 +14,17 @@ namespace ir { namespace codegen{ class alignment_info { + struct cst_info { + unsigned num_cst; + unsigned value; + }; + private: // helpers bool is_first_axis_unit(ir::value *v); // populate maps - bool populate_is_constant(ir::value *v); + cst_info populate_is_constant(ir::value *v); unsigned populate_max_contiguous(ir::value *v); unsigned populate_starting_multiple(ir::value *v); @@ -29,7 +34,7 @@ public: unsigned get_max_contiguous(ir::value* v) const; private: - std::map is_constant_; + std::map is_constant_; std::map max_contiguous_; std::map starting_multiple_; }; diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index a88cb2ddf..d3088d73b 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -70,6 +70,7 @@ public: void target_independent(ir::module &module) { optimize_dot.run(module); optimize_trans.run(module); +// ir::print(module, std::cout); } void target_dependent(ir::module &module) { diff --git a/lib/codegen/alignment_info.cpp b/lib/codegen/alignment_info.cpp index 5b7564479..b7e0b3641 100644 --- a/lib/codegen/alignment_info.cpp +++ b/lib/codegen/alignment_info.cpp @@ -9,6 +9,18 @@ namespace triton { namespace codegen{ +inline int gcd(int a, int b) { + if (a == 0) + return b; + if (b == 0) + return a; + if (a == b) + return a; + if (a > b) + return gcd(a-b, b); + return gcd(a, b-a); +} + template inline T 
add_to_cache(ir::value *i, T value, std::map &map) { return map[i] = value; @@ -22,50 +34,69 @@ bool alignment_info::is_first_axis_unit(ir::value *x){ return true; } -bool alignment_info::populate_is_constant(ir::value *v) { +alignment_info::cst_info alignment_info::populate_is_constant(ir::value *v) { if(is_constant_.find(v) != is_constant_.end()) return is_constant_.at(v); // helper for the cache - auto cache = [this,v](bool value){ return add_to_cache(v, value, is_constant_); }; + auto cache = [this,v](cst_info value){ + return add_to_cache(v, value, is_constant_); } + ; // populate if(auto *x = dynamic_cast(v)){ ir::value *op = x->get_operand(0); - populate_is_constant(op); - if(is_first_axis_unit(op)) - return cache(true); + auto op_cst = populate_is_constant(op); + if(is_first_axis_unit(op)){ + unsigned num_cst = x->get_type()->get_tile_shapes()[0]->get_value(); + return cache({num_cst, op_cst.value}); + } } if(auto *x = dynamic_cast(v)) - return cache(true); + return cache({true, (unsigned)x->get_value()}); if(auto *x = dynamic_cast(v)){ - bool lhs = populate_is_constant(x->get_operand(0)); - bool rhs = populate_is_constant(x->get_operand(1)); - return cache(lhs && rhs); + ir::value* lhs_op = x->get_operand(0); + ir::value* rhs_op = x->get_operand(1); + cst_info lhs = populate_is_constant(lhs_op); + cst_info rhs = populate_is_constant(rhs_op); + if(lhs.num_cst==0 && rhs.value && x->is_int_div()){ + unsigned max_contiguous = populate_max_contiguous(lhs_op); + unsigned starting_multiple = populate_starting_multiple(lhs_op); + return cache({gcd(max_contiguous, rhs.value) - (starting_multiple % rhs.value), 0}); + } + return cache({std::min(lhs.num_cst, rhs.num_cst), 0}); + } + if(auto *x = dynamic_cast(v)){ + ir::value* lhs_op = x->get_operand(0); + ir::value* rhs_op = x->get_operand(1); + cst_info lhs = populate_is_constant(lhs_op); + cst_info rhs = populate_is_constant(rhs_op); + return cache({std::min(lhs.num_cst, rhs.num_cst), 0}); } if(auto *x = dynamic_cast(v)){ - bool value_true = populate_is_constant(x->get_value_true()); - bool value_false = populate_is_constant(x->get_value_false()); - return cache(value_true && value_false); + cst_info value_true = populate_is_constant(x->get_value_true()); + cst_info value_false = populate_is_constant(x->get_value_false()); + return cache({std::min(value_true.num_cst, value_false.num_cst), 0}); } if(v->get_type()->is_tile_ty()) - return cache(false); + return cache({0, 0}); if(auto *x = dynamic_cast(v)){ // put a conservative initial value in phi node to avoid infinite recursion - bool result = true; + unsigned result = 1; for(unsigned n = 0; n < x->get_num_incoming(); n++){ ir::value* inc = x->get_incoming_value(n); if(is_constant_.find(inc) != is_constant_.end()) - result = is_constant_.at(inc); + result = is_constant_.at(inc).num_cst; } - cache(result); + cache({result, 0}); // recurse for(unsigned n = 0; n < x->get_num_incoming(); n++){ ir::value* inc = x->get_incoming_value(n); - result = result && populate_is_constant(inc); + result = std::min(result, populate_is_constant(inc).num_cst); } - return cache(result); + return cache({result, 0}); } // scalars are always constant in the contiguous dimension - return cache(true); + // but value is not known at compile-time + return cache({1, 0}); } unsigned alignment_info::populate_max_contiguous(ir::value *v){ @@ -95,13 +126,21 @@ unsigned alignment_info::populate_max_contiguous(ir::value *v){ ir::value* rhs = x->get_operand(1); unsigned lhs_max_contiguous = populate_max_contiguous(lhs); 
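/* cst_info generalizes the old boolean "is constant" fact: num_cst records
   how many consecutive elements along the contiguous axis are provably equal,
   and value holds the shared compile-time value when it is known (0
   otherwise). Example: for r = 0 ... 16, the tile r / 4 is not a single
   constant but is constant over runs of 4 consecutive elements, which is what
   the gcd(max_contiguous, rhs.value) case above computes. */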
unsigned rhs_max_contiguous = populate_max_contiguous(rhs); - bool lhs_has_cst = populate_is_constant(lhs); - bool rhs_has_cst = populate_is_constant(rhs); - if(x->is_int_add_sub()){ - if(lhs_has_cst) - return cache(rhs_max_contiguous); - if(rhs_has_cst) + cst_info lhs_cst_info = populate_is_constant(lhs); + cst_info rhs_cst_info = populate_is_constant(rhs); + if(x->is_int_rem() && rhs_cst_info.value > 0) + return cache(std::min(lhs_max_contiguous, rhs_cst_info.value)); + if(x->is_int_mult()){ + if(rhs_cst_info.value == 1) return cache(lhs_max_contiguous); + if(lhs_cst_info.value == 1) + return cache(rhs_max_contiguous); + } + if(x->is_int_add_sub()){ + if(lhs_cst_info.num_cst) + return cache(gcd(rhs_max_contiguous, lhs_cst_info.num_cst)); + if(rhs_cst_info.num_cst) + return cache(gcd(lhs_max_contiguous, rhs_cst_info.num_cst)); } } if(auto *x = dynamic_cast(v)){ @@ -114,11 +153,11 @@ unsigned alignment_info::populate_max_contiguous(ir::value *v){ ir::value* rhs = x->get_operand(1); unsigned lhs_max_contiguous = populate_max_contiguous(lhs); unsigned rhs_max_contiguous = populate_max_contiguous(rhs); - bool lhs_has_cst = populate_is_constant(lhs); - bool rhs_has_cst = populate_is_constant(rhs); - if(lhs_has_cst) + auto lhs_cst_info = populate_is_constant(lhs); + auto rhs_cst_info = populate_is_constant(rhs); + if(lhs_cst_info.num_cst) return cache(rhs_max_contiguous); - if(rhs_has_cst) + if(rhs_cst_info.num_cst) return cache(lhs_max_contiguous); } if(auto *x = dynamic_cast(v)){ @@ -140,22 +179,12 @@ unsigned alignment_info::populate_max_contiguous(ir::value *v){ return cache(1); } -inline int gcd(int a, int b) { - if (a == 0) - return b; - if (b == 0) - return a; - if (a == b) - return a; - if (a > b) - return gcd(a-b, b); - return gcd(a, b-a); -} - unsigned alignment_info::populate_starting_multiple(ir::value *v){ if(starting_multiple_.find(v) != starting_multiple_.end()) return starting_multiple_.at(v); - auto cache = [this,v](unsigned value){ return add_to_cache(v, value, starting_multiple_); }; + auto cache = [this,v](unsigned value){ + return add_to_cache(v, value, starting_multiple_); + }; // has metadata if(auto *x = dynamic_cast(v)){ unsigned multiple_of = x->get_metadata(ir::metadata::multiple_of); @@ -185,15 +214,16 @@ unsigned alignment_info::populate_starting_multiple(ir::value *v){ return cache(gcd(lhs, rhs)); if(x->is_int_div()) return cache(std::max(lhs / rhs, 1)); - if(x->is_int_rem()) - return cache(std::max(lhs % rhs, 1)); + if(x->is_int_rem() && rhs > 1) + return cache(gcd(lhs, rhs)); if(x->is_shl()) return cache(lhs << rhs); if(x->is_shr()) return cache(std::max(lhs >> rhs, 1)); } - if(auto *x = dynamic_cast(v)) + if(auto *x = dynamic_cast(v)){ return cache(x->get_value()); + } if(auto *x = dynamic_cast(v)){ return cache(x->get_first()->get_value()); } @@ -270,7 +300,6 @@ void alignment_info::run(ir::module &mod) { for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i: block->get_inst_list()){ populate_max_contiguous(i); -// std::cout << i->get_name() << " " << max_contiguous_.at(i) << " " << starting_multiple_.at(i) << std::endl; } } diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 2d104d8d6..47b3f05fa 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -233,10 +233,15 @@ void tune::run(ir::module &mod) { for(ir::instruction *i : block->get_inst_list()){ if(fragments_.find({i, 0}) != fragments_.end() && fragments_.at({i, 0}) != STRIDED_SCAN) continue; - if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ - ir::type *ty = 
mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 4, 8)); - *params_.at(i).at("nts.d0") = *tmp; + if(auto *ld = dynamic_cast(i)) + if(i->get_type()->is_tile_ty()){ + ir::type *ptr_ty = ld->get_pointer_operand()->get_type()->get_scalar_ty(); + size_t addr_space = ptr_ty->get_pointer_address_space(); + if(addr_space < 4){ + ir::type *ty = mod.get_builder().get_int32_ty(); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 8, 8)); + *params_.at(i).at("nts.d0") = *tmp; + } } if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index efca7bec3..73bb474b8 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -51,8 +51,7 @@ void base::enqueue(driver::stream *stream, std::vector args, b jit->add_module(name_.c_str(), src.c_str(), best.params); } else { -// jit->add_module(name_.c_str(), src.c_str(), jit->get_valid(name_.c_str(), src.c_str())); - jit->add_module(name_.c_str(), src.c_str(), {32, 128, 16, 128, 2, 2, 2, 2, 4, 4, 32, 8, 4, 1}); + jit->add_module(name_.c_str(), src.c_str(), jit->get_valid(name_.c_str(), src.c_str())); } triton::driver::kernel* kernel = jit->get_function(name_.c_str()); clone->init_impl(stream, (triton::driver::cu_module*)kernel->module()); diff --git a/lib/dnn/gemm.cpp b/lib/dnn/gemm.cpp index 82fdb431b..42c7793c2 100644 --- a/lib/dnn/gemm.cpp +++ b/lib/dnn/gemm.cpp @@ -113,8 +113,8 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, int32 bound, int32 *locks, int32 grid0, int32 grid1) { int32 ridx = get_range_id(0); int32 ridy = get_range_id(1); - int32 rxa[TM] = ridx*TM + (0 ... TM); - int32 ryb[TN] = ridy*TN + (0 ... TN); + int32 rxa[TM] = ridx * TM + (0 ... TM); + int32 ryb[TN] = ridy * TN + (0 ... TN); int32 rka[TK] = 0 ... TK; int32 rkb[TK] = 0 ... TK; fp32 c[TM, TN] = 0; diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index 47e283769..844c982e7 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -27,7 +27,7 @@ shift::shift(int B, int C, layout_(layout){ // std::cout << B_ << " " << C_ << " " << F_ << " " << stride_h_ << " " << stride_w_ << " " << a_ty_ << " " << b_ty_ << " " << ty_ << " " << layout_ << std::endl; // max number of channels - TK_ = (ty == FPROP && a_ty_ == "fp32") ? 8 : 16; + TK_ = (ty == FPROP && a_ty_ == "fp32") ? 8 : 32; MAX_C_ = 8192 + TK_; // activation sizes CD_ = AD_ / stride_d_; @@ -223,7 +223,7 @@ void shift::deinit_impl() { void shift::enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, runtime::launch_information info) { - unsigned TM = info.global_range_size[0], TN = info.global_range_size[1]; + unsigned TM = info.globals.at("TM"), TN = info.globals.at("TN"); unsigned grid_0 = (M_ + TM - 1)/TM; unsigned grid_1 = (N_ + TN - 1)/TN; unsigned num_locks = grid_0 * grid_1; @@ -278,6 +278,8 @@ void shift::triton_c_src(std::ostream &os) const { std::string usea = AT_ ? "trans(a)" : "a"; std::string useb = BT_ ? "trans(b)" : "b"; std::string bca0 = "[newaxis, :]", bca1 = "[:, newaxis]"; + std::string stride_h = std::to_string(stride_h_); + std::string stride_w = std::to_string(stride_w_); if(AT_){ std::swap(AS0, AS1); std::swap(bca0, bca1); @@ -290,6 +292,11 @@ void shift::triton_c_src(std::ostream &os) const { std::string BS = BS0 + ", " + BS1; bool is_chwn = layout_ == CHWN; + std::string lda_b = is_chwn ? "1" : "lda_b"; + std::string ldb_b = is_chwn ? "1" : "ldb_b"; + std::string ldc_b = is_chwn ? 
"1" : "ldc_b"; + + auto compute_bhw = [&](std::string rx, std::string sz, std::string rkx){ std::string B = std::to_string(B_); std::string CW = std::to_string(ICW_); @@ -317,7 +324,7 @@ const tunable int32 TM = {16, 32, 64, 128}; const tunable int32 TN = {16, 32, 64, 128}; const tunable int32 TK = {)" + std::to_string(TK_) + "};"; if(op_ == WGRAD) - result += "const tunable int32 GZ = {1, 4, 16};"; + result += "const tunable int32 GZ = {1};"; else result += "const tunable int32 GZ = {1};"; @@ -329,30 +336,27 @@ void shift(restrict read_only align(16) )" + a_ty_ + R"( *A, )" + c_ty_ + R"( *C, int32 M, int32 N, int32 K, int32 stride_h, int32 stride_w, - multiple_of(4) int32 lda_b, multiple_of(4) int32 lda_w, multiple_of(4) int32 lda_h, multiple_of(4) int32 lda_c, - multiple_of(4) int32 ldb_b, multiple_of(4) int32 ldb_w, multiple_of(4) int32 ldb_h, multiple_of(4) int32 ldb_c, - multiple_of(4) int32 ldc_b, multiple_of(4) int32 ldc_w, multiple_of(4) int32 ldc_h, multiple_of(4) int32 ldc_c, + multiple_of(8) int32 lda_b, multiple_of(8) int32 lda_w, multiple_of(8) int32 lda_h, multiple_of(8) int32 lda_c, + multiple_of(8) int32 ldb_b, multiple_of(8) int32 ldb_w, multiple_of(8) int32 ldb_h, multiple_of(8) int32 ldb_c, + multiple_of(8) int32 ldc_b, multiple_of(8) int32 ldc_w, multiple_of(8) int32 ldc_h, multiple_of(8) int32 ldc_c, int32 NB, int32 AH, int32 AW, int32 BH, int32 BW, int32 CH, int32 CW, int32* locks, int32 grid0, int32 grid1, int32 grid2) { - int32 rxa[TM] = get_global_range[TM](0); - int32 ryb[TN] = get_global_range[TN](1); - int32 rz = get_global_range[1](2); + int32 ridx = get_range_id(0); + int32 ridy = get_range_id(1); + int32 rz = get_range_id(2); + int32 rxa[TM] = ridx*TM + (0 ... TM); + int32 ryb[TN] = ridy*TN + (0 ... TN); int32 rka[TK] = 0 ... TK; int32 rkb[TK] = 0 ... 
TK; fp32 acc[TM, TN] = 0; int32 pad_h = BH / 2; - int32 pad_w = BW / 2; - int32 div = K / grid2; - int32 rem = K % grid2; - K = select(rz < rem, div - 1, div); - int32 offk = select(rz < rem, rz*(div + 1), rz*div + rem);)"; + int32 pad_w = BW / 2;)"; if(op_ == WGRAD){ result += R"( - rka = rka + offk; - rkb = rkb + offk; + )"; } @@ -360,31 +364,26 @@ if(op_ == WGRAD){ if(op_ == FPROP){ result += compute_bhw("ra", "TM", "rxa") + R"( - raw = raw * stride_w; - rah = rah * stride_h; - int32 offxa[TM] = rab*lda_b + raw*lda_w + rah*lda_h; + raw = raw * )" + stride_w + R"(; + rah = rah * )" + stride_h + R"(; + int32 offxa[TM] = rab*)" + lda_b + R"( + raw*lda_w + rah*lda_h; int32 offa0[TM, TK] = offxa[:, newaxis]; __constant__ int32* pd[TK] = delta_a + rka; - multiple_of(4) int32 d[TK] = *pd; + multiple_of(8) int32 d[TK] = *pd; int32 offa1[TM, TK] = d[newaxis, :];)"; } if(op_ == BPROP){ result += compute_bhw("ra", "TM", "rxa") + R"( - int32 offxa[TM] = rab*lda_b + raw*lda_w + rah*lda_h; + int32 offxa[TM] = rab*)" + lda_b + R"( + raw*lda_w + rah*lda_h; int32 offa0[TM, TK] = offxa[:, newaxis]; int32 offa1[TM, TK] = rka[newaxis, :] * lda_c;)"; } -if(op_ == WGRAD && layout_ == CHWN){ - result += R"( - int32 offa0[TK, TM] = rxa[newaxis, :] * lda_c; - int32 offa1[TK, TM] = rka[:, newaxis];)"; -} -if(op_ == WGRAD && layout_ == NCHW){ +if(op_ == WGRAD){ result += compute_bhw("ra", "TK", "rka") + R"( int32 offa0[TK, TM] = rxa[newaxis, :] * lda_c; - int32 offxa[TK] = rab*lda_b + raw*lda_w + rah*lda_h; + int32 offxa[TK] = rab*)" + lda_b + R"( + raw*lda_w + rah*lda_h; int32 offa1[TK, TM] = offxa[:, newaxis];)"; } @@ -403,11 +402,11 @@ if(op_ == WGRAD){ result += compute_bhw("rb", "TK", "rkb") + R"( __constant__ int32* pd[TN] = delta_a + ryb; - int32 d[TN] = *pd; - int32 shift[TK, TN] = d[newaxis, :]; - rbw = rbw * stride_w; - rbh = rbh * stride_h; - int32 offkb[TK] = rbb*ldb_b + rbw*ldb_w + rbh*ldb_h; + multiple_of(8) int32 d[TN] = *pd; + multiple_of(8) int32 shift[TK, TN] = d[newaxis, :]; + rbw = rbw * )" + stride_w + R"(; + rbh = rbh * )" + stride_h + R"(; + int32 offkb[TK] = rbb*)" + ldb_b + R"( + rbw*ldb_w + rbh*ldb_h; int32 offb0[TK, TN] = ryb[newaxis, :] * ldb_c; int32 offb1[TK, TN] = offkb[:, newaxis] + shift;)"; } @@ -416,8 +415,8 @@ if(op_ == WGRAD){ result += R"( )" + a_ty_ + "* pa[" + AS + R"(] = A + offa0 + offa1; )" + b_ty_ + "* pb[" + BS + R"(] = B + offb0 + offb1; - int1 checka[)" + AS + "] = (rka < K + offk)" + bca0 + R"(; - int1 checkb[)" + BS + "] = (rkb < K + offk)" + bcb0 + R"(; + int1 checka[)" + AS + "] = (rka < K)" + bca0 + R"(; + int1 checkb[)" + BS + "] = (rkb < K)" + bcb0 + R"(; )" + a_ty_ + " a[" + AS + R"(] = checka ? *pa : 0; )" + b_ty_ + " b[" + BS + R"(] = checkb ? 
*pb : 0;
   for(int32 k = K; k > 0; k = k - TK){
@@ -436,15 +435,11 @@ if(op_ == BPROP){
   result += R"(
     pa = pa + TK * lda_c;)";
 }
-if(op_ == WGRAD && layout_ == CHWN){
-  result += R"(
-    pa = pa + TK;)";
-}
-if(op_ == WGRAD && layout_ == NCHW){
+if(op_ == WGRAD){
   result += R"(
     rka = rka + TK;)" + compute_bhw("ra", "TK", "rka") + R"(
-    offxa = rab*lda_b + raw*lda_w + rah*lda_h;
+    offxa = rab*)" + lda_b + R"( + raw*lda_w + rah*lda_h;
     pa = A + offa0 + offxa[:, newaxis];)";
 }
 result += R"(
@@ -455,9 +450,9 @@ if(op_ == WGRAD){
   result += R"(
     rkb = rkb + TK;)" + compute_bhw("rb", "TK", "rkb") + R"(
-    rbw = rbw * stride_w;
-    rbh = rbh * stride_h;
-    offkb = rbb*ldb_b + rbw*ldb_w + rbh*ldb_h;
+    rbw = rbw * )" + stride_w + R"(;
+    rbh = rbh * )" + stride_h + R"(;
+    offkb = rbb*)" + ldb_b + R"( + rbw*ldb_w + rbh*ldb_h;
     pb = B + offb0 + offkb[:, newaxis] + shift;)";
 }
 if(op_ == FPROP){
   result += R"(
@@ -471,21 +466,21 @@ if(op_ == BPROP){
   result += R"(
     @checkb b = *pb;
   }
-  int32 rxc[TM] = get_global_range[TM](0);
-  int32 ryc[TN] = get_global_range[TN](1);)";
+  int32 rxc[TM] = ridx*TM + (0 ... TM);
+  int32 ryc[TN] = ridy*TN + (0 ... TN);)";

 /* C offsets */
 if(op_ == BPROP){
   result += compute_bhw("rc", "TM", "rxc") + R"(
-    rcw = rcw * stride_w;
-    rch = rch * stride_h;
-    int32 offxc[TM] = rcb*ldc_b + rcw*ldc_w + rch*ldc_h;)";
+    rcw = rcw * )" + stride_w + R"(;
+    rch = rch * )" + stride_h + R"(;
+    int32 offxc[TM] = rcb*)" + ldc_b + R"( + rcw*ldc_w + rch*ldc_h;)";
 }
 if(op_ == FPROP){
   result += compute_bhw("rc", "TM", "rxc") + R"(
-    int32 offxc[TM] = rcb*ldc_b + rcw*ldc_w + rch*ldc_h;)";
+    int32 offxc[TM] = rcb*)" + ldc_b + R"( + rcw*ldc_w + rch*ldc_h;)";
 }
 if(op_ == WGRAD){
   result += R"(
@@ -506,27 +501,7 @@ if(op_ == BPROP){
 }
 else{
   result += R"(
-    int1 has_lock = (GZ > 1) && (locks != 0);
-    if(has_lock){
-      int32 ridx = get_range_id(0);
-      int32 ridy = get_range_id(1);
-      int32 *plock = locks + ridx + ridy*grid0;
-      int32 *pcount = plock + grid0*grid1;
-      while(__atomic_cas(plock, 0, 1) == 1);
-      int32 count = *pcount;
-      int32 countp1 = select(count == grid2 - 1, 0, count + 1);
-      if(count == 0) {
-        @checkc *pc = c;
-      }
-      else {
-        @checkc *pc = c + *pc;
-      }
-      *pcount = countp1;
-      *plock = 0;
-    }
-    else{
-      @checkc *pc = c;
-    })";
+    @checkc *pc = c;)";
 }
 result += R"(
 })";
diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp
index e3ac042d1..063dbffc9 100644
--- a/lib/ir/instructions.cpp
+++ b/lib/ir/instructions.cpp
@@ -130,7 +130,7 @@ bool binary_operator::is_int_mult() const {
 }

 bool binary_operator::is_int_add_sub() const {
-  return op_ == llop::Add || llop::Sub;
+  return op_ == llop::Add || op_ == llop::Sub;
 }
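The one-line instructions.cpp change above deserves a note: in `op_ == llop::Add || llop::Sub`, the right-hand side of `||` is the bare enumerator, not a comparison, so the predicate is true whenever `llop::Sub` is nonzero — every binary operator was being classified as an integer add/sub. A minimal reproduction of the pitfall (the enumerator values are illustrative, unrelated to Triton's actual ones):

#include <cstdio>

enum llop { Add = 8, Sub = 10, Mul = 12 };

// buggy: parses as (op == Add) || Sub, and Sub != 0 is always true
bool is_add_sub_buggy(llop op) { return op == Add || Sub; }
// fixed: both alternatives compare against op
bool is_add_sub_fixed(llop op) { return op == Add || op == Sub; }

int main() {
    printf("buggy(Mul) = %d\n", is_add_sub_buggy(Mul)); // 1 -- misclassified
    printf("fixed(Mul) = %d\n", is_add_sub_fixed(Mul)); // 0
}

Clang's -Wconstant-logical-operand flags exactly this pattern, which is a cheap way to catch it before it skews an analysis pass.

From 484e3871cfa96747eaa004f694dbcd3af92b95ce Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Sat, 20 Jul 2019 22:05:16 -0700
Subject: [PATCH 258/494] [dnn/shift] added base pointer for a, b

---
 examples/cpp/dot.cpp           |  8 ++++----
 examples/cpp/shift.cpp         |  8 ++++----
 lib/codegen/alignment_info.cpp |  6 ++++--
 lib/codegen/selection.cpp      |  2 +-
 lib/dnn/shift.cpp              | 23 +++++++++++++----------
 5 files changed, 26 insertions(+), 21 deletions(-)

diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp
index 09483116e..0eac9c046 100644
--- a/examples/cpp/dot.cpp
+++ b/examples/cpp/dot.cpp
@@ -8,15 +8,15 @@

 int main() {
-  bool AT = true;
-  bool BT = false;
+  bool AT = false;
+  bool BT = true;
   typedef float T;
   std::string ty = "fp16";
   size_t dt_nbytes = sizeof(T);
   // initialize default compute device
   auto context = triton::driver::backend::contexts::get_default();
   // matrix multiplication parameters
-  int32_t M = 4096, N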
= 4096, K = 4096; + int32_t M = 65536, N = 2048, K = 2048; std::vector hc(M*N); std::vector rc(M*N); std::vector ha(M*K); @@ -37,7 +37,7 @@ int main() { stream->write(dc, true, 0, hc); stream->synchronize(); triton::dnn::gemm gemm(M, N, K, AT, BT, ty, ty, 4, 4); - gemm.enqueue(stream, {da, db, dc}, false); + gemm.enqueue(stream, {da, db, dc}, true); // stream->read(dc, true, 0, hc); // gemm.cpu_ref(rc, ha, hb); // for(size_t i = 0; i < M*N; i++) diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index 739b35117..3dabddfe2 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -14,13 +14,13 @@ int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); - auto op = triton::dnn::shift::WGRAD; + auto op = triton::dnn::shift::FPROP; // initialization int32_t R = 3, S = 3; - int32_t B = 16, F = 4096; - int32_t H = 16, W = 16; - int32_t C = 4096; + int32_t B = 64, F = 2048; + int32_t H = 32, W = 32; + int32_t C = 2048; // random shifts std::vector shift_h(C); diff --git a/lib/codegen/alignment_info.cpp b/lib/codegen/alignment_info.cpp index b7e0b3641..ccd9778d1 100644 --- a/lib/codegen/alignment_info.cpp +++ b/lib/codegen/alignment_info.cpp @@ -59,8 +59,9 @@ alignment_info::cst_info alignment_info::populate_is_constant(ir::value *v) { cst_info rhs = populate_is_constant(rhs_op); if(lhs.num_cst==0 && rhs.value && x->is_int_div()){ unsigned max_contiguous = populate_max_contiguous(lhs_op); - unsigned starting_multiple = populate_starting_multiple(lhs_op); - return cache({gcd(max_contiguous, rhs.value) - (starting_multiple % rhs.value), 0}); + // todo might not be entirely true + unsigned num_constants = gcd(max_contiguous, rhs.value); + return cache({num_constants, 0}); } return cache({std::min(lhs.num_cst, rhs.num_cst), 0}); } @@ -300,6 +301,7 @@ void alignment_info::run(ir::module &mod) { for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i: block->get_inst_list()){ populate_max_contiguous(i); +// std::cout << i->get_name() << " " << is_constant_.at(i).num_cst << " " << max_contiguous_.at(i) << " " << starting_multiple_.at(i) << std::endl; } } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index b5cd54a8b..4e4741658 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -1148,7 +1148,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & unsigned max_contiguous = axis_info_->get_max_contiguous(ptr); unsigned alignment = std::min(starting_multiple, max_contiguous); unsigned vector_size = std::min(result->axis(0).contiguous, alignment); - vector_size = result->axis(0).contiguous; +// vector_size = result->axis(0).contiguous; // vector_size = 1; std::map packets; distributed_tile *TP = (distributed_tile*)tmap_.at(ld->get_pointer_operand()); diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index 844c982e7..adc36740c 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -354,11 +354,6 @@ void shift(restrict read_only align(16) )" + a_ty_ + R"( *A, fp32 acc[TM, TN] = 0; int32 pad_h = BH / 2; int32 pad_w = BW / 2;)"; -if(op_ == WGRAD){ - result += R"( - - )"; -} /* A offsets */ if(op_ == FPROP){ @@ -408,13 +403,21 @@ if(op_ == WGRAD){ rbh = rbh * )" + stride_h + R"(; int32 offkb[TK] = rbb*)" + ldb_b + R"( + rbw*ldb_w + rbh*ldb_h; int32 offb0[TK, TN] = ryb[newaxis, :] * ldb_c; - int32 offb1[TK, TN] = offkb[:, newaxis] + shift;)"; + int32 offb1[TK, TN] = offkb[:, newaxis]; + )" + a_ty_ + "* pa_base[" + AS + R"(] = A + offa0; + )" + b_ty_ + "* 
pb_base[" + BS + R"(] = B + offb0 + shift; + )" + a_ty_ + "* pa[" + AS + R"(] = pa_base + offa1; + )" + b_ty_ + "* pb[" + BS + R"(] = pb_base + offb1;)"; +} +else{ + result += R"( + )" + a_ty_ + "* pa[" + AS + R"(] = A + offa0 + offa1; + )" + b_ty_ + "* pb[" + BS + R"(] = B + offb0 + offb1;)"; } /* Main loop */ +/* Increment A pointers */ result += R"( - )" + a_ty_ + "* pa[" + AS + R"(] = A + offa0 + offa1; - )" + b_ty_ + "* pb[" + BS + R"(] = B + offb0 + offb1; int1 checka[)" + AS + "] = (rka < K)" + bca0 + R"(; int1 checkb[)" + BS + "] = (rkb < K)" + bcb0 + R"(; )" + a_ty_ + " a[" + AS + R"(] = checka ? *pa : 0; @@ -440,7 +443,7 @@ if(op_ == WGRAD){ rka = rka + TK;)" + compute_bhw("ra", "TK", "rka") + R"( offxa = rab*)" + lda_b + R"( + raw*lda_w + rah*lda_h; - pa = A + offa0 + offxa[:, newaxis];)"; + pa = pa_base + offxa[:, newaxis];)"; } result += R"( @checka a = *pa;)"; @@ -453,7 +456,7 @@ if(op_ == WGRAD){ rbw = rbw * )" + stride_w + R"(; rbh = rbh * )" + stride_h + R"(; offkb = rbb*)" + ldb_b + R"( + rbw*ldb_w + rbh*ldb_h; - pb = B + offb0 + offkb[:, newaxis] + shift;)"; + pb = pb_base + offkb[:, newaxis];)"; } if(op_ == FPROP){ result += R"( From b1d81a5802fc5bcb4fc235e6d8b99c213d80eee8 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 21 Jul 2019 18:11:54 -0700 Subject: [PATCH 259/494] more work on heuristics --- examples/cpp/dot.cpp | 59 +++++++++---- examples/cpp/shift.cpp | 44 +++++----- examples/python/pytorch/shift.cpp | 2 +- examples/python/tensorflow/dot.cpp | 2 +- examples/python/tensorflow/shift.cpp | 2 +- include/triton/dnn/base.h | 12 ++- include/triton/dnn/gemm.h | 16 ++-- include/triton/dnn/shift.h | 7 +- include/triton/driver/dispatch.h | 2 + include/triton/runtime/jit.h | 2 +- include/triton/tools/bench.hpp | 11 ++- lib/codegen/selection.cpp | 4 +- lib/codegen/tune.cpp | 4 +- lib/dnn/base.cpp | 27 ++++-- lib/dnn/gemm.cpp | 119 ++++++++++++++++++++++++--- lib/dnn/shift.cpp | 12 ++- lib/runtime/jit.cpp | 42 ++++++---- 17 files changed, 268 insertions(+), 99 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 0eac9c046..4c9f51960 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -6,19 +6,21 @@ #include "triton/dnn/gemm.h" #include "triton/tools/bench.hpp" +template +void diff(const std::vector& x, const std::vector& y){ + for(size_t i = 0; i < x.size(); i++) + if(std::isnan(x[i]) || std::abs(x[i] - y[i])/std::max(x[i], y[i]) > 1e-4){ + std::cout << i << " " << x[i] << " " << y[i] << std::endl; + exit(EXIT_FAILURE); + } + std::cout << "Pass!" 
<< std::endl; +} -int main() { - bool AT = false; - bool BT = true; +double bench(triton::driver::context* context, bool AT, bool BT, int32_t M, int32_t N, int32_t K){ typedef float T; std::string ty = "fp16"; size_t dt_nbytes = sizeof(T); - // initialize default compute device - auto context = triton::driver::backend::contexts::get_default(); - // matrix multiplication parameters - int32_t M = 65536, N = 2048, K = 2048; std::vector hc(M*N); - std::vector rc(M*N); std::vector ha(M*K); std::vector hb(K*N); srand(0); @@ -36,14 +38,35 @@ int main() { stream->write(db, true, 0, hb); stream->write(dc, true, 0, hc); stream->synchronize(); - triton::dnn::gemm gemm(M, N, K, AT, BT, ty, ty, 4, 4); - gemm.enqueue(stream, {da, db, dc}, true); -// stream->read(dc, true, 0, hc); -// gemm.cpu_ref(rc, ha, hb); -// for(size_t i = 0; i < M*N; i++) -// if(std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ -// std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; -// exit(EXIT_FAILURE); -// } -// std::cout << "Pass!" << std::endl; + triton::dnn::dot dot(M, N, K, AT, BT, ty, ty, 8, 8); + double result = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::PARTIAL_TUNING);}, stream); + delete dc; + delete da; + delete db; + return result; +} + +int main() { + struct config_t{ + bool AT; + bool BT; + int32_t M; + int32_t N; + int32_t K; + }; + // shapes to benchmark + std::vector configs = { + {false, false, 4096, 4096, 4096}, + {false, true, 4096, 4096, 4096}, + {true, false, 4096, 4096, 4096}, + {true, true, 4096, 4096, 4096} + }; + // initialize default compute device + auto context = triton::driver::backend::contexts::get_default(); + // does the work + for(config_t c: configs){ + double tns = bench(context, c.AT, c.BT, c.M, c.N, c.K); + double tflops = 2.*c.M*c.N*c.K / tns * 1e-3; + std::cout << c.AT << ", " << c.BT << ", " << c.M << ", " << c.N << ", " << c.K << ", " << tflops << std::endl; + } } diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index 3dabddfe2..388523de2 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -8,31 +8,23 @@ #include "triton/dnn/shift.h" #include "triton/external/half.hpp" -int main() { +double do_bench(triton::driver::context* context, + int32_t R, int32_t S, int32_t B, int32_t F, int32_t H, int32_t W, int32_t C, + triton::dnn::shift::op_t op, triton::dnn::shift::layout_t layout, + std::string numeric_t) { typedef float NumericT; - std::string numeric_t_str = "fp16"; - - // initialize default compute device - auto context = triton::driver::backend::contexts::get_default(); - auto op = triton::dnn::shift::FPROP; - - // initialization - int32_t R = 3, S = 3; - int32_t B = 64, F = 2048; - int32_t H = 32, W = 32; - int32_t C = 2048; // random shifts std::vector shift_h(C); std::vector shift_w(C); for(int32_t c = 0; c < C; c++){ - shift_h[c] = 0; - shift_w[c] = 0; + shift_h[c] = rand() % R - R / 2; + shift_w[c] = rand() % S - S / 2; } // configuration triton::dnn::shift shift(B, C, 1, H, W, 1, R, S, F, 1, 1, shift_h.data(), shift_w.data(), - numeric_t_str, numeric_t_str, + numeric_t, numeric_t, op, false, triton::dnn::shift::CHWN); // host buffers size_t a_size = B*C*H*W; @@ -67,13 +59,19 @@ int main() { stream->write(dc, true, 0, hc); stream->synchronize(); shift.enqueue(stream, {da, db, dc}, true); -// stream->read(dc, true, 0, hc); -// shift.cpu_ref(rc.data(), ha.data(), hb.data()); -// for(size_t i = 0; i < hc.size(); i++) -// if(std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], 
rc[i]) > 1e-4){ -// std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; -// exit(EXIT_FAILURE); -// } -// std::cout << "Pass!" << std::endl; + double tns = triton::tools::bench([&]() { shift.enqueue(stream, {da, db, dc}, true);}, stream); + std::cout << tns << std::endl; +} + +int main() { + // initialize default compute device + auto context = triton::driver::backend::contexts::get_default(); + // shapes + int32_t R = 3, S = 3; + int32_t B = 16, F = 4096; + int32_t H = 32, W = 32; + int32_t C = 4096; + // benchmark + do_bench(context, R, S, B, F, H, W, C, triton::dnn::shift::FPROP, triton::dnn::shift::CHWN, "fp16"); } diff --git a/examples/python/pytorch/shift.cpp b/examples/python/pytorch/shift.cpp index 7efe0198b..a16c2922e 100644 --- a/examples/python/pytorch/shift.cpp +++ b/examples/python/pytorch/shift.cpp @@ -36,7 +36,7 @@ torch::Tensor shift_common( int32_t T, int32_t R, int32_t S, int32_t F, int32_t stride_h, int32_t stride_w, int32_t* shift_h, int32_t* shift_w, - triton::dnn::shift::type ty, triton::dnn::shift::layout_t layout, + triton::dnn::shift::op_t ty, triton::dnn::shift::layout_t layout, torch::Tensor torcha, torch::Tensor torchb, torch::Tensor torchbias, bool autotune = false ) { diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index 9bd25eeb3..84f67664c 100644 --- a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -49,7 +49,7 @@ class DotOp : public OpKernel { triton::driver::cu_buffer db(ctx, (CUdeviceptr)b.flat().data(), false); triton::driver::cu_buffer dc(ctx, (CUdeviceptr)c->flat().data(), false); // template - triton::dnn::gemm dot(M, N, K, false, false, "fp16", "fp16", 4, 4); + triton::dnn::dot dot(M, N, K, false, false, "fp16", "fp16", 8, 8); dot.enqueue(stream, {&da, &db, &dc}); } diff --git a/examples/python/tensorflow/shift.cpp b/examples/python/tensorflow/shift.cpp index 1834cadaf..1a9ebbe59 100644 --- a/examples/python/tensorflow/shift.cpp +++ b/examples/python/tensorflow/shift.cpp @@ -19,7 +19,7 @@ using namespace tensorflow; using GPUDevice = Eigen::GpuDevice; -template +template class ShiftConvOp : public OpKernel { public: explicit ShiftConvOp(OpKernelConstruction* context) : OpKernel(context), layout_(triton::dnn::shift::NCHW) { diff --git a/include/triton/dnn/base.h b/include/triton/dnn/base.h index 3045ffb49..1fbded42c 100644 --- a/include/triton/dnn/base.h +++ b/include/triton/dnn/base.h @@ -31,6 +31,13 @@ namespace triton{ namespace dnn{ +enum autotuning_t{ + FULL_TUNING, + PARTIAL_TUNING, + NO_TUNING +}; + +typedef std::vector params_t; class base { friend class cmp_recompile; @@ -53,6 +60,9 @@ private: virtual size_t num_flops() const = 0; // comparison for maps virtual bool operator<(const base& other) const = 0; + // default parameters + virtual std::vector search_space() const; + virtual params_t heuristics() const; public: // constructor @@ -62,7 +72,7 @@ public: // clone virtual base* clone() const = 0; // enqueue - void enqueue(driver::stream* stream, std::vector args, bool autotune = false); + void enqueue(driver::stream* stream, std::vector args, autotuning_t autotune = PARTIAL_TUNING); private: std::string name_; diff --git a/include/triton/dnn/gemm.h b/include/triton/dnn/gemm.h index f43370606..1e581f6a1 100644 --- a/include/triton/dnn/gemm.h +++ b/include/triton/dnn/gemm.h @@ -6,7 +6,7 @@ namespace triton{ namespace dnn{ -class gemm: public base { +class dot: public base { private: // initialize void init_impl(driver::stream *, driver::cu_module *); @@ -18,10 
+18,12 @@ private: size_t num_flops() const; // comparison for maps bool operator<(const base& other) const; - + // default parameters + virtual std::vector search_space() const; + virtual params_t heuristics() const; public: - gemm(int M, int N, int K, bool AT, bool BT, + dot(int M, int N, int K, bool AT, bool BT, std::string a_ty, std::string b_ty, unsigned alignment_lda, unsigned alignment_ldb); @@ -46,13 +48,13 @@ public: template void cpu_ref(std::vector &c, const std::vector &a, const std::vector &b) { if(AT_ && BT_) - gemm::cpu_ref(c, a, b, M_, N_, K_); + dot::cpu_ref(c, a, b, M_, N_, K_); else if(AT_ && !BT_) - gemm::cpu_ref(c, a, b, M_, N_, K_); + dot::cpu_ref(c, a, b, M_, N_, K_); else if(!AT_ && BT_) - gemm::cpu_ref(c, a, b, M_, N_, K_); + dot::cpu_ref(c, a, b, M_, N_, K_); else - gemm::cpu_ref(c, a, b, M_, N_, K_); + dot::cpu_ref(c, a, b, M_, N_, K_); } private: diff --git a/include/triton/dnn/shift.h b/include/triton/dnn/shift.h index 84c6ccda7..59d26ab44 100644 --- a/include/triton/dnn/shift.h +++ b/include/triton/dnn/shift.h @@ -38,7 +38,7 @@ namespace dnn{ class shift: public base { public: - enum type { + enum op_t { FPROP, BPROP, WGRAD @@ -56,6 +56,7 @@ private: void enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, triton::runtime::launch_information info); + std::vector default_params() const; public: @@ -65,7 +66,7 @@ public: int stride_h, int stride_w, const int32_t* shift_h, const int32_t* shift_w, std::string a_ty = "fp32", std::string b_ty = "fp32", - type ty = FPROP, bool bias = false, layout_t layout = CHWN); + op_t ty = FPROP, bool bias = false, layout_t layout = CHWN); // look-up table void build_delta_a(); @@ -165,7 +166,7 @@ private: std::string b_ty_; std::string c_ty_; // convolution type - type op_; + op_t op_; bool bias_; // transpose bool AT_; diff --git a/include/triton/driver/dispatch.h b/include/triton/driver/dispatch.h index c1f4f01f9..9803a163e 100755 --- a/include/triton/driver/dispatch.h +++ b/include/triton/driver/dispatch.h @@ -167,6 +167,8 @@ public: static nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock); static nvmlReturn_t nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int mem_clock, unsigned int sm_clock); + + // SPIR-V libraries static int initializeLLVMToSPIRVPass(llvm::PassRegistry &); static bool writeSpirv(llvm::Module *M, std::ostream &OS, std::string &ErrMsg); diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index d3088d73b..f1da2a5a2 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -108,7 +108,7 @@ public: jit(driver::context* context, unsigned nthreads = 4); ~jit(); std::vector get_valid(const char *name, const char *src); - tune_res_t autotune(const char* name, const char* src, benchmark_t benchmark); + tune_res_t autotune(const char* name, const char* src, benchmark_t benchmark, const std::vector > &targets = {}); void add_module(ir::module &module, const std::vector& params = {}); void add_module(const char* name, const char* src, const std::vector& params = {}); driver::kernel* get_function(const char* name); diff --git a/include/triton/tools/bench.hpp b/include/triton/tools/bench.hpp index 64c88cd64..6ac72fec7 100644 --- a/include/triton/tools/bench.hpp +++ b/include/triton/tools/bench.hpp @@ -2,6 +2,9 @@ #define TRITON_TOOLS_BENCH_HPP #include +#include +#include 
"triton/driver/device.h" +#include "triton/driver/stream.h" namespace triton{ namespace tools{ @@ -24,14 +27,14 @@ private: high_resolution_clock::time_point _start; }; -template -double bench(OP const & op, SYNC const & sync, const triton::driver::device * device) +inline double bench(std::function const & op, driver::stream * stream) { + const driver::device * device = stream->context()->device(); timer tmr; std::vector times; double total_time = 0; op(); - sync(); + stream->synchronize(); while(total_time*1e-9 < 1e-3){ float norm = 1; // normalize clock if possible to get roughly constant result @@ -39,7 +42,7 @@ double bench(OP const & op, SYNC const & sync, const triton::driver::device * de norm = (float)cu_device->current_sm_clock()/cu_device->max_sm_clock(); tmr.start(); op(); - sync(); + stream->synchronize(); times.push_back(norm*tmr.get().count()); total_time+=times.back(); } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 4e4741658..8a6e74f33 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -529,8 +529,8 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id unsigned num_rep_0 = shapes[0]->get_value() / hmma_bts_0; unsigned num_rep_1 = shapes[1]->get_value() / hmma_bts_1; // size of each pack (interleaving) - pack_size_0_ = std::min(num_rep_0, 2); - pack_size_1_ = std::min(num_rep_1, 2); + pack_size_0_ = std::min(num_rep_0, 1); + pack_size_1_ = std::min(num_rep_1, 1); // number of packs (interleaving) num_packs_0_ = num_rep_0 / pack_size_0_; num_packs_1_ = num_rep_1 / pack_size_1_; diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 47b3f05fa..3f5119577 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -221,7 +221,7 @@ void tune::run(ir::module &mod) { } else { ir::metaparameter *fpw = ir::metaparameter::create(ctx, ty, 2, 2); - ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 2, 4); + ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 1, 4); connected_components(node, {fpw, wpt}, {"fpw", "wpt"}, nodes_, dependencies_, group_id++); } } @@ -239,7 +239,7 @@ void tune::run(ir::module &mod) { size_t addr_space = ptr_ty->get_pointer_address_space(); if(addr_space < 4){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 8, 8)); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 4, 8)); *params_.at(i).at("nts.d0") = *tmp; } } diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index 73bb474b8..8224dc846 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -7,8 +7,6 @@ namespace triton{ namespace dnn{ - - void base::set_ld(const std::vector& shapes, std::vector& ld) { size_t size = shapes.size(); @@ -22,7 +20,15 @@ void base::set_ld(const std::vector& shapes, base::base(const std::string& name) : name_(name) { } -void base::enqueue(driver::stream *stream, std::vector args, bool autotune) { +std::vector base::search_space() const { + return {}; +} + +params_t base::heuristics() const { + return *search_space().begin(); +} + +void base::enqueue(driver::stream *stream, std::vector args, autotuning_t autotune) { namespace rt = triton::runtime; static std::map, cmp_recompile> m_jit; driver::context* ctx = stream->context(); @@ -30,7 +36,7 @@ void base::enqueue(driver::stream *stream, std::vector args, b /* the current template has not already been compiled */ if(m_jit.find(this) == m_jit.end()) { base* clone = this->clone(); - jit = m_jit.emplace(clone, std::unique_ptr(new rt::jit(ctx, 
8))).first->second.get(); + jit = m_jit.emplace(clone, std::unique_ptr(new rt::jit(ctx))).first->second.get(); std::ostringstream oss; clone->triton_c_src(oss); std::string src = oss.str(); @@ -40,18 +46,21 @@ void base::enqueue(driver::stream *stream, std::vector args, b clone->init_impl(stream, (triton::driver::cu_module*)kernel->module()); clone->enqueue_impl(stream, kernel, args, info); stream->synchronize(); - double ts = triton::tools::bench([&](){ clone->enqueue_impl(stream, kernel, args, info); }, - [&](){ stream->synchronize(); }, ctx->device()); + double ts = triton::tools::bench([&](){ clone->enqueue_impl(stream, kernel, args, info); }, stream); clone->deinit_impl(); return num_flops() / ts * 1e-3; }; // auto-tune and save result - if(autotune) { - rt::jit::tune_res_t best = jit->autotune(name_.c_str(), src.c_str(), benchmark); + if(autotune != NO_TUNING) { + std::vector space = {}; + if(autotune == PARTIAL_TUNING) + space = search_space(); + rt::jit::tune_res_t best = jit->autotune(name_.c_str(), src.c_str(), benchmark, space); jit->add_module(name_.c_str(), src.c_str(), best.params); } else { - jit->add_module(name_.c_str(), src.c_str(), jit->get_valid(name_.c_str(), src.c_str())); + params_t params = heuristics(); + jit->add_module(name_.c_str(), src.c_str(), params); } triton::driver::kernel* kernel = jit->get_function(name_.c_str()); clone->init_impl(stream, (triton::driver::cu_module*)kernel->module()); diff --git a/lib/dnn/gemm.cpp b/lib/dnn/gemm.cpp index 42c7793c2..43fc9f173 100644 --- a/lib/dnn/gemm.cpp +++ b/lib/dnn/gemm.cpp @@ -6,7 +6,7 @@ namespace triton{ namespace dnn{ -gemm::gemm(int M, int N, int K, +dot::dot(int M, int N, int K, bool AT, bool BT, std::string a_ty, std::string b_ty, unsigned alignment_lda, unsigned alignment_ldb) @@ -18,13 +18,13 @@ gemm::gemm(int M, int N, int K, } -size_t gemm::num_flops() const { +size_t dot::num_flops() const { return 2.*M_*N_*K_; } // comparison for maps -bool gemm::operator<(const base& other) const { - auto *y = dynamic_cast(&other); +bool dot::operator<(const base& other) const { + auto *y = dynamic_cast(&other); if(!y) return true; return std::tie(M_, N_, K_, AT_, BT_, @@ -34,18 +34,18 @@ bool gemm::operator<(const base& other) const { } // clone -base* gemm::clone() const { - return new gemm(*this); +base* dot::clone() const { + return new dot(*this); } -void gemm::init_impl(driver::stream* stream, driver::cu_module *) { +void dot::init_impl(driver::stream* stream, driver::cu_module *) { std::vector hlocks(2048, 0); if(locks_ == nullptr) locks_ = triton::driver::buffer::create(stream->context(), hlocks.size()*4); stream->write(locks_, false, 0, hlocks); } -void gemm::enqueue_impl(driver::stream *stream, driver::kernel *kernel, +void dot::enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, runtime::launch_information info) { driver::buffer *a = args[0], *b = args[1], *c = args[2]; @@ -75,7 +75,7 @@ void gemm::enqueue_impl(driver::stream *stream, driver::kernel *kernel, stream->enqueue(kernel, grid, {info.num_threads, 1, 1}); } -void gemm::triton_c_src(std::ostream &os) const { +void dot::triton_c_src(std::ostream &os) const { std::string AS0 = "TM", AS1 = "TK"; std::string BS0 = "TK", BS1 = "TN"; std::string bca0 = "[newaxis, :]", bca1 = "[:, newaxis]"; @@ -100,8 +100,8 @@ void gemm::triton_c_src(std::ostream &os) const { std::string align_ldb_str = "multiple_of(" + std::to_string(align_ldb_) + ")"; std::string res = R"( -const tunable int32 TM = {32, 64, 128, 256}; -const tunable int32 TN = 
{32, 64, 128, 256}; +const tunable int32 TM = {16, 32, 64, 128}; +const tunable int32 TN = {16, 32, 64, 128}; const tunable int32 TK = {32}; const tunable int32 GZ = {1}; @@ -145,5 +145,102 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, os << res; } +// small search space for partial auto-tuning +std::vector dot::search_space() const { + typedef std::vector params_t; + typedef std::tuple key_t; + static std::vector keys = { + {16, 16}, {16, 32}, {16, 64}, {16, 128}, + {32, 16}, {32, 32}, {32, 64}, {32, 128}, + {64, 16}, {64, 32}, {64, 64}, {64, 128}, + {128, 16},{128, 32},{128, 64},{128, 128} + }; + static std::vector space_nn = { + {4, 4, 16, 8, 16, 2, 2, 1, 1, 8, 32, 4, 8, 1}, + {2, 8, 16, 8, 32, 2, 2, 1, 1, 16, 32, 4, 8, 1}, + {4, 4, 16, 4, 64, 2, 2, 1, 1, 8, 32, 8, 4, 1}, + {4, 4, 16, 16, 128, 2, 2, 1, 2, 16, 32, 4, 8, 1}, + {4, 8, 32, 8, 16, 2, 2, 1, 1, 8, 32, 4, 8, 1}, + {4, 8, 32, 8, 32, 2, 2, 1, 1, 8, 32, 4, 8, 1}, + {8, 4, 32, 8, 64, 2, 2, 1, 1, 4, 32, 4, 8, 1}, + {8, 4, 32, 16, 128, 2, 2, 1, 4, 16, 32, 8, 4, 1}, + {8, 8, 64, 4, 16, 2, 2, 1, 1, 4, 32, 8, 4, 1}, + {8, 8, 64, 8, 32, 2, 2, 1, 1, 4, 32, 4, 8, 1}, + {8, 8, 64, 16, 64, 2, 2, 2, 1, 8, 32, 4, 8, 1}, + {16, 4, 64, 16, 128, 2, 2, 2, 2, 8, 32, 8, 4, 1}, + {8, 8, 128, 8, 16, 2, 2, 2, 1, 8, 32, 8, 4, 1}, + {8, 8, 128, 16, 32, 2, 2, 2, 1, 8, 32, 4, 8, 1}, + {8, 8, 128, 32, 64, 2, 2, 2, 2, 16, 32, 4, 8, 1}, + {8, 8, 128, 32, 128, 2, 2, 1, 4, 16, 32, 4, 8, 1}, + }; + static std::vector space_nt = { + {4, 4, 16, 2, 8, 16, 2, 2, 1, 1, 8, 32, 16, 1}, + {4, 4, 16, 4, 8, 32, 2, 2, 1, 1, 8, 32, 8, 1}, + {4, 4, 16, 8, 8, 64, 2, 2, 1, 4, 32, 32, 16, 1}, + {4, 4, 16, 32, 4, 128, 2, 2, 1, 2, 16, 32, 2, 1}, + {8, 4, 32, 2, 8, 16, 2, 2, 1, 1, 4, 32, 16, 1}, + {4, 8, 32, 4, 8, 32, 2, 2, 1, 1, 8, 32, 8, 1}, + {16, 8, 128, 4, 4, 64, 2, 2, 1, 4, 8, 32, 32, 1}, + {4, 8, 32, 8, 8, 128, 2, 2, 1, 2, 16, 32, 8, 1}, + {8, 8, 64, 2, 8, 16, 2, 2, 1, 1, 4, 32, 16, 1}, + {8, 8, 64, 4, 8, 32, 2, 2, 1, 1, 4, 32, 8, 1}, + {8, 8, 64, 8, 8, 64, 2, 2, 1, 2, 8, 32, 8, 1}, + {8, 8, 64, 16, 8, 128, 2, 2, 1, 4, 16, 32, 8, 1}, + {8, 8, 128, 2, 8, 16, 2, 2, 2, 1, 8, 32, 32, 1}, + {16, 8, 128, 4, 8, 32, 2, 2, 2, 1, 4, 32, 16, 1}, + {8, 8, 128, 8, 8, 64, 2, 2, 2, 2, 16, 32, 16, 1}, + {8, 8, 128, 8, 8, 128, 2, 2, 4, 1, 16, 32, 16, 1}, + }; + static std::vector space_tn = { + {8, 16, 16, 16, 2, 2, 1, 1, 4, 8, 32, 2, 8, 1}, + {4, 16, 8, 32, 2, 2, 1, 1, 8, 4, 32, 4, 8, 1}, + {4, 16, 4, 64, 2, 2, 1, 1, 8, 4, 32, 8, 4, 1}, + {16, 16, 16, 128, 2, 2, 1, 2, 4, 8, 32, 4, 8, 1}, + {4, 32, 8, 16, 2, 2, 1, 1, 8, 4, 32, 4, 8, 1}, + {8, 32, 8, 32, 2, 2, 1, 1, 4, 8, 32, 4, 8, 1}, + {8, 32, 8, 64, 2, 2, 1, 1, 4, 8, 32, 4, 8, 1}, + {32, 32, 64, 128, 2, 2, 2, 2, 4, 8, 32, 2, 8, 1}, + {8, 64, 8, 16, 2, 2, 1, 1, 4, 8, 32, 4, 8, 1}, + {8, 64, 8, 32, 2, 2, 1, 1, 4, 8, 32, 4, 8, 1}, + {16, 64, 16, 64, 2, 2, 2, 1, 4, 8, 32, 4, 8, 1}, + {32, 64, 16, 128, 2, 2, 2, 2, 4, 8, 32, 8, 4, 1}, + {16, 128, 16, 16, 2, 2, 2, 1, 4, 8, 32, 4, 8, 1}, + {32, 128, 32, 32, 2, 2, 4, 1, 4, 8, 32, 4, 8, 1}, + {32, 128, 32, 64, 2, 2, 4, 1, 4, 8, 32, 4, 8, 1}, + {32, 128, 32, 128, 2, 2, 4, 1, 4, 8, 32, 4, 8, 1}, + }; + static std::vector space_tt = { + {4, 16, 2, 8, 16, 2, 2, 1, 1, 8, 4, 32, 16, 1}, + {8, 16, 4, 8, 32, 2, 2, 1, 1, 4, 8, 32, 8, 1}, + {16, 16, 4, 8, 64, 2, 2, 1, 4, 8, 4, 32, 32, 1}, + {16, 16, 8, 4, 128, 2, 2, 1, 2, 4, 8, 32, 8, 1}, + {4, 32, 2, 8, 16, 2, 2, 1, 1, 8, 4, 32, 16, 1}, + {8, 32, 4, 8, 32, 2, 2, 1, 1, 4, 8, 32, 8, 1}, + {16, 64, 4, 8, 64, 2, 2, 2, 1, 4, 8, 32, 16, 1}, + 
{32, 32, 8, 8, 128, 2, 2, 1, 4, 4, 8, 32, 16, 1},
+    {8, 64, 2, 8, 16, 2, 2, 1, 1, 4, 8, 32, 16, 1},
+    {8, 64, 4, 8, 32, 2, 2, 1, 1, 4, 8, 32, 8, 1},
+    {16, 64, 8, 8, 64, 2, 2, 2, 1, 4, 8, 32, 8, 1},
+    {32, 64, 8, 8, 128, 2, 2, 1, 4, 4, 8, 32, 16, 1},
+    {16, 128, 2, 8, 16, 2, 2, 2, 1, 4, 8, 32, 32, 1},
+    {32, 128, 8, 4, 32, 2, 2, 4, 1, 4, 8, 32, 16, 1},
+    {32, 128, 16, 4, 64, 2, 2, 4, 1, 4, 8, 32, 8, 1},
+    {32, 128, 8, 8, 128, 2, 2, 4, 1, 4, 8, 32, 16, 1}
+  };
+  if(!AT_ && !BT_)
+    return space_nn;
+  else if(!AT_ && BT_)
+    return space_nt;
+  else if(AT_ && !BT_)
+    return space_tn;
+  else
+    return space_tt;
+}
+
+// simple parameter heuristics
+params_t dot::heuristics() const {
+  return search_space().back();
+}
+
 }
 }
diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp
index adc36740c..e8a4f3584 100644
--- a/lib/dnn/shift.cpp
+++ b/lib/dnn/shift.cpp
@@ -13,7 +13,7 @@ shift::shift(int B, int C,
              int stride_h, int stride_w,
              const int32_t *shift_h, const int32_t *shift_w,
              std::string a_ty, std::string b_ty,
-             type ty, bool bias,
+             op_t ty, bool bias,
              layout_t layout)
   : base("shift"),
     B_(B), C_(C),
@@ -512,5 +512,15 @@ else{
   os << result;
 }

+
+// simple parameter heuristics
+std::vector<unsigned> shift::default_params() const {
+  typedef std::vector<unsigned> params_t;
+  std::map, params_t> params = {
+    {{}, {}}
+  };
+}
+
+
 }
 }
diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp
index c925a690c..6f25de8da 100644
--- a/lib/runtime/jit.cpp
+++ b/lib/runtime/jit.cpp
@@ -31,7 +31,7 @@ extern triton::lang::translation_unit *ast_root;
 namespace triton {
 namespace runtime{

-void loop_nest(std::vector<size_t> const & ranges,
+void parallel_loop_nest(std::vector<size_t> const & ranges,
                std::function<void(std::vector<size_t> const &)> const & f,
                size_t nthreads){
   size_t D = ranges.size();
@@ -55,7 +55,7 @@ void loop_nest(std::vector<size_t> const & ranges,
 }

 template<class T>
-void loop_nest(std::vector<std::vector<T>> const & iterates, std::function<void(std::vector<T>)> const & f, size_t nthreads){
+void parallel_loop_nest(std::vector<std::vector<T>> const & iterates, std::function<void(std::vector<T>)> const & f, size_t nthreads){
   //Ranges to iterate over
   std::vector<size_t> ranges;
   for(auto const & x: iterates)
@@ -68,10 +68,14 @@ void loop_nest(std::vector<std::vector<T>> const & iterates, std::function<void(std::vector<T>)> const & f, size_t nthreads)
 {
+  ThreadPool pool(nthreads);
+  for(const std::vector<T>& values: iterates)
+    pool.enqueue(f, values);
+}

 std::unique_ptr<llvm::Module> jit::make_llvm_module(ir::module &module, passes_wrapper &passes, llvm::LLVMContext& llvm_context, launch_information& info) {
@@ -128,7 +132,7 @@ std::vector<unsigned> jit::get_valid(const char *name, const char *src) {
     ranges.push_back(mp->get_space());
   // iterate over parameters
   std::vector<unsigned> result;
-  loop_nest(ranges, [&](const std::vector<unsigned> params){
+  parallel_loop_nest(ranges, [&](const std::vector<unsigned> params){
     if(!result.empty())
       return;
     std::map<ir::value*, std::vector<std::string>> errors;
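Both the get_valid search above and the autotune loop below follow the same pattern, now factored behind parallel_loop_nest/parallel_for_each: candidate parameter vectors are enumerated on a thread pool, compilation can proceed concurrently, and the timed benchmark plus the best-so-far update are serialized behind a mutex. A standalone sketch of that pattern, using plain std::thread rather than Triton's ThreadPool (names are illustrative):

#include <atomic>
#include <functional>
#include <mutex>
#include <thread>
#include <vector>

struct tune_res_t { double perf = 0; std::vector<unsigned> params; };

tune_res_t parallel_search(const std::vector<std::vector<unsigned>>& targets,
                           const std::function<double(const std::vector<unsigned>&)>& bench,
                           size_t nthreads) {
  tune_res_t best;
  std::mutex mutex;
  std::atomic<size_t> next{0};            // work-stealing index over candidates
  std::vector<std::thread> workers;
  for(size_t t = 0; t < nthreads; t++)
    workers.emplace_back([&]{
      for(size_t i = next++; i < targets.size(); i = next++){
        // compilation of candidate i could happen here, concurrently;
        // only the timed run and the best-so-far update need the lock
        std::lock_guard<std::mutex> lock(mutex);
        double perf = bench(targets[i]);
        if(perf > best.perf){
          best.perf = perf;
          best.params = targets[i];
        }
      }
    });
  for(std::thread &w: workers)
    w.join();
  return best;
}

Keeping only the timed region under the lock lets compilation overlap across threads while timings stay free of interference.

@@ -148,7 +152,7 @@

-jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t benchmark) {
+jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t benchmark, const std::vector<std::vector<unsigned>> & targets) {
   // find metaparameters
   triton::lang::translation_unit* program = parse_program(name, src);
   auto ptt_module_0 = make_triton_module(name, triton_context_, program);
@@ -157,15 +161,12 @@ jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t ben
   passes_wrapper passes_0(target_.get());
   passes_0.target_independent(tt_module_0);
   passes_0.tune.run(tt_module_0);
-  // create parameter ranges
-  std::vector<std::vector<unsigned>> ranges;
   auto mps =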
passes_0.tune.get_params(tt_module_0); - for(ir::metaparameter *mp: mps) - ranges.push_back(mp->get_space()); // iterate over parameters tune_res_t best; std::mutex mutex; - loop_nest(ranges, [&](const std::vector params){ + // update_best + auto update_best = [&](const std::vector params){ std::map> errors; unsigned i = 0; { @@ -200,10 +201,10 @@ jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t ben launch_information info; llvm::LLVMContext llvm_context; auto ll_module = make_llvm_module(tt_module_1, passes_1, llvm_context, info); + std::unique_ptr module(driver::module::create(driver_context_, &*ll_module)); double perf; { std::lock_guard lock(mutex); - std::unique_ptr module(driver::module::create(driver_context_, &*ll_module)); std::unique_ptr kernel(driver::kernel::create(module.get(), name)); perf = benchmark(kernel.get(), info); if(perf > best.perf){ @@ -214,8 +215,21 @@ jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t ben std::cout << ((i==0)?"":", ") << params[i] << std::flush; std::cout << ", " << perf << " [ " << best.perf << " ] " << std::endl; } - }, nthreads_); - std::cout << "Autotuning done - Best performance: " << best.perf << std::endl; + }; + + + if(targets.empty()) { + // create parameter ranges + std::vector> ranges; + for(ir::metaparameter *mp: mps) + ranges.push_back(mp->get_space()); + parallel_loop_nest(ranges, update_best, nthreads_); + } + else { + parallel_for_each(targets, update_best, nthreads_); + } + +// std::cout << "Autotuning done - Best performance: " << best.perf << std::endl; return best; } From ead368d1edfdb71dce6befcbca3cc296d9676df9 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 21 Jul 2019 20:17:56 -0700 Subject: [PATCH 260/494] [general] a bunch of fixes in anticipation of proper triton vs cudnn benchmarks * DNN: Added partial auto-tuning mode and skeleton for heuristics * Examples: Moduralized benchmarking and now evaluating ResNet-18 shapes --- examples/cpp/dot.cpp | 27 +++++--- examples/cpp/shift.cpp | 65 ++++++++++++++---- examples/python/pytorch/shift.cpp | 21 +++--- include/triton/dnn/gemm.h | 5 +- include/triton/dnn/heuristics.h | 109 ++++++++++++++++++++++++++++++ include/triton/dnn/shift.h | 27 ++++---- lib/dnn/base.cpp | 3 +- lib/dnn/gemm.cpp | 92 +------------------------ lib/dnn/shift.cpp | 13 ++-- lib/runtime/jit.cpp | 6 +- 10 files changed, 221 insertions(+), 147 deletions(-) create mode 100644 include/triton/dnn/heuristics.h diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 4c9f51960..a8723f2e2 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -1,4 +1,5 @@ #include +#include #include #include "triton/runtime/jit.h" #include "triton/driver/backend.h" @@ -16,7 +17,7 @@ void diff(const std::vector& x, const std::vector& y){ std::cout << "Pass!" 
<< std::endl;
 }

-double bench(triton::driver::context* context, bool AT, bool BT, int32_t M, int32_t N, int32_t K){
+double do_bench(triton::driver::context* context, bool AT, bool BT, int32_t M, int32_t N, int32_t K){
   typedef float T;
   std::string ty = "fp16";
   size_t dt_nbytes = sizeof(T);
@@ -39,11 +40,11 @@ double bench(triton::driver::context* context, bool AT, bool BT, int32_t M, int3
   stream->write(dc, true, 0, hc);
   stream->synchronize();
   triton::dnn::dot dot(M, N, K, AT, BT, ty, ty, 8, 8);
-  double result = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::PARTIAL_TUNING);}, stream);
+  double nanosec = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::PARTIAL_TUNING);}, stream);
   delete dc;
   delete da;
   delete db;
-  return result;
+  return dot.num_flops() / nanosec * 1e-3;
 }

 int main() {
@@ -53,20 +54,28 @@ int main() {
     int32_t M;
     int32_t N;
     int32_t K;
+
+    std::string repr() {
+      std::ostringstream oss;
+      oss << AT << " " << BT << " " << M << " " << N << " " << K;
+      return oss.str();
+    }
+
+    double perf(triton::driver::context *context){
+      return do_bench(context, AT, BT, M, N, K);
+    }
   };
   // shapes to benchmark
   std::vector<config_t> configs = {
     {false, false, 4096, 4096, 4096},
-    {false, true, 4096, 4096, 4096},
-    {true, false, 4096, 4096, 4096},
-    {true, true, 4096, 4096, 4096}
+    {false, true,  4096, 4096, 4096},
+    {true,  false, 4096, 4096, 4096},
+    {true,  true,  4096, 4096, 4096}
   };
   // initialize default compute device
   auto context = triton::driver::backend::contexts::get_default();
   // does the work
   for(config_t c: configs){
-    double tns = bench(context, c.AT, c.BT, c.M, c.N, c.K);
-    double tflops = 2.*c.M*c.N*c.K / tns * 1e-3;
-    std::cout << c.AT << ", " << c.BT << ", " << c.M << ", " << c.N << ", " << c.K << ", " << tflops << std::endl;
+    std::cout << c.repr() << ", " << c.perf(context) << std::endl;
   }
 }
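With this refactor, per-config performance is computed inside do_bench itself: triton::tools::bench returns a time in nanoseconds, num_flops() is 2·M·N·K for a GEMM, and one flop per nanosecond equals one GFLOP/s, so the trailing 1e-3 rescales the quotient to TFLOP/s. A quick sanity check of the arithmetic (plain C++, no Triton dependency; the 2 ms timing is made up):

#include <cstdio>

int main() {
    double M = 4096, N = 4096, K = 4096;
    double flops = 2.0 * M * N * K;   // one multiply-accumulate = 2 flops
    double nanosec = 2.0e6;           // hypothetical: 2 ms per 4096^3 GEMM
    // 1 flop/ns = 1e9 flop/s = 1 GFLOP/s, hence the 1e-3 for TFLOP/s
    printf("%.1f TFLOPS\n", flops / nanosec * 1e-3); // ~68.7
}

diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp
index 388523de2..c4074c722 100644
--- a/examples/cpp/shift.cpp
+++ b/examples/cpp/shift.cpp
@@ -10,7 +10,7 @@

 double do_bench(triton::driver::context* context,
                 int32_t R, int32_t S, int32_t B, int32_t F, int32_t H, int32_t W, int32_t C,
-                triton::dnn::shift::op_t op, triton::dnn::shift::layout_t layout,
+                triton::dnn::op_t op, triton::dnn::layout_t layout,
                 std::string numeric_t) {
   typedef float NumericT;
@@ -25,14 +25,14 @@ double do_bench(triton::driver::context* context,
   triton::dnn::shift shift(B, C, 1, H, W, 1, R, S, F, 1, 1,
                            shift_h.data(), shift_w.data(),
                            numeric_t, numeric_t,
-                           op, false, triton::dnn::shift::CHWN);
+                           op, false, layout);
   // host buffers
   size_t a_size = B*C*H*W;
   size_t b_size = C*F;
   size_t c_size = B*F*H*W;
-  if(op == triton::dnn::shift::BPROP)
+  if(op == triton::dnn::BPROP)
     std::swap(a_size, c_size);
-  if(op == triton::dnn::shift::WGRAD){
+  if(op == triton::dnn::WGRAD){
     std::swap(b_size, c_size);
     std::swap(a_size, b_size);
   }
@@ -58,20 +58,57 @@ double do_bench(triton::driver::context* context,
   stream->write(db, true, 0, hb);
   stream->write(dc, true, 0, hc);
   stream->synchronize();
-  shift.enqueue(stream, {da, db, dc}, true);
-  double tns = triton::tools::bench([&]() { shift.enqueue(stream, {da, db, dc}, true);}, stream);
-  std::cout << tns << std::endl;
+  double nanosec = triton::tools::bench([&]() { shift.enqueue(stream, {da, db, dc});}, stream);
+  return shift.num_flops() / nanosec * 1e-3;
 }

 int main() {
+  using triton::dnn::op_t;
+  using triton::dnn::layout_t;
+
+  struct config_t{
+    int32_t B;
+    int32_t C;
+    int32_t H;
+    int32_t W;
+    int32_t R;
+    int32_t S;
+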
int32_t F; + int32_t stride_h; + int32_t stride_w; + op_t op; + layout_t layout; + std::string ty; + + std::string repr() { + std::ostringstream oss; + oss << B << ", " << C << ", " << H << ", " << W << ", " << R << ", " << S << ", " << F << ", " << op << ", " << layout << ", " << ty; + return oss.str(); + } + + double perf(triton::driver::context *context){ + return do_bench(context, R, S, B, F, H, W, C, op, layout, ty); + } + }; + // shapes to benchmark + std::vector configs; + std::vector resnet18 = { + {128, 128, 32, 32, 3, 3, 128, 1, 1}, + {128, 128, 32, 32, 3, 3, 256, 2, 2}, + {128, 256, 16, 16, 3, 3, 256, 1, 1}, + {128, 256, 16, 16, 3, 3, 512, 2, 2}, + {128, 512, 8, 8, 3, 3, 512, 1, 1}, + {128, 512, 8, 8, 3, 3, 1024, 1, 1}, + {128, 1024, 8, 8, 3, 3, 1024, 1, 1} + }; + for(config_t c: resnet18){ + for(op_t op: {op_t::FPROP, op_t::BPROP, op_t::WGRAD}) + configs.push_back({c.B, c.C, c.H, c.W, c.R, c.S, c.F, c.stride_h, c.stride_w, op, layout_t::CHWN, "fp16"}); + } + // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); - // shapes - int32_t R = 3, S = 3; - int32_t B = 16, F = 4096; - int32_t H = 32, W = 32; - int32_t C = 4096; - // benchmark - do_bench(context, R, S, B, F, H, W, C, triton::dnn::shift::FPROP, triton::dnn::shift::CHWN, "fp16"); + for(config_t c: configs) + std::cout << c.repr() << ", " << c.perf(context) << std::endl; } diff --git a/examples/python/pytorch/shift.cpp b/examples/python/pytorch/shift.cpp index a16c2922e..7c86b227e 100644 --- a/examples/python/pytorch/shift.cpp +++ b/examples/python/pytorch/shift.cpp @@ -11,14 +11,14 @@ void extract_shapes(const torch::Tensor &x, int64_t &C, int64_t &H, int64_t &W, int64_t &B, - triton::dnn::shift::layout_t layout) { - if(layout == triton::dnn::shift::CHWN){ + triton::dnn::layout_t layout) { + if(layout == triton::dnn::CHWN){ C = x.size(0); H = x.size(1); W = x.size(2); B = x.size(3); } - else if(layout == triton::dnn::shift::NCHW){ + else if(layout == triton::dnn::NCHW){ B = x.size(0); C = x.size(1); H = x.size(2); @@ -29,14 +29,14 @@ void extract_shapes(const torch::Tensor &x, } } -static const triton::dnn::shift::layout_t layout = triton::dnn::shift::NCHW; +static const triton::dnn::layout_t layout = triton::dnn::NCHW; torch::Tensor shift_common( int32_t B, int32_t C, int32_t D, int32_t H, int32_t W, int32_t T, int32_t R, int32_t S, int32_t F, int32_t stride_h, int32_t stride_w, int32_t* shift_h, int32_t* shift_w, - triton::dnn::shift::op_t ty, triton::dnn::shift::layout_t layout, + triton::dnn::op_t op, triton::dnn::layout_t layout, torch::Tensor torcha, torch::Tensor torchb, torch::Tensor torchbias, bool autotune = false ) { @@ -59,7 +59,7 @@ torch::Tensor shift_common( triton::dnn::shift shift(B, C, D, H, W, T, R, S, F, stride_h, stride_w, shift_h, shift_w, dtype, dtype, - ty, has_bias, layout); + op, has_bias, layout); // Bind memory triton::driver::cu_buffer a(ctx, (CUdeviceptr)torcha.storage().data(), false); triton::driver::cu_buffer b(ctx, (CUdeviceptr)torchb.storage().data(), false); @@ -74,8 +74,9 @@ torch::Tensor shift_common( triton::driver::cu_buffer c(ctx, (CUdeviceptr)torchc.storage().data(), false); + std::cout << B << ", " << C << ", " << H << ", " << W << ", " << T << ", " << R << ", " << S << ", " << F << ", " << stride_h << ", " << stride_w << ", " << op << ", " << layout << std::endl; // Enqueue - shift.enqueue(&stream, {&a, &b, &c}, true); + shift.enqueue(&stream, {&a, &b, &c}, triton::dnn::NO_TUNING); return torchc; } @@ -99,7 +100,7 @@ torch::Tensor 
shift_y( // run return shift_common(B, C, 1, H, W, 1, R, S, F, stride_h, stride_w, (int32_t*)shift_h.storage().data(), (int32_t*)shift_w.storage().data(), - triton::dnn::shift::FPROP, layout, x, w, bias); + triton::dnn::FPROP, layout, x, w, bias); } torch::Tensor shift_dx( @@ -127,7 +128,7 @@ torch::Tensor shift_dx( // run return shift_common(B, C, 1, H, W, 1, R, S, F, stride_h, stride_w, (int32_t*)shift_h.storage().data(), (int32_t*)shift_w.storage().data(), - triton::dnn::shift::BPROP, layout, dy, w, bias); + triton::dnn::BPROP, layout, dy, w, bias); } torch::Tensor shift_dw( @@ -155,7 +156,7 @@ torch::Tensor shift_dw( // run return shift_common(B, C, 1, H, W, 1, R, S, F, stride_h, stride_w, (int32_t*)shift_h.storage().data(), (int32_t*)shift_w.storage().data(), - triton::dnn::shift::WGRAD, layout, dy, x, bias); + triton::dnn::WGRAD, layout, dy, x, bias); } static auto registry = diff --git a/include/triton/dnn/gemm.h b/include/triton/dnn/gemm.h index 1e581f6a1..3df8a13a6 100644 --- a/include/triton/dnn/gemm.h +++ b/include/triton/dnn/gemm.h @@ -14,8 +14,6 @@ private: void enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, triton::runtime::launch_information info); - // number of flops - size_t num_flops() const; // comparison for maps bool operator<(const base& other) const; // default parameters @@ -27,6 +25,9 @@ public: std::string a_ty, std::string b_ty, unsigned alignment_lda, unsigned alignment_ldb); + // number of flops + size_t num_flops() const; + // triton-c source void triton_c_src(std::ostream &os) const; diff --git a/include/triton/dnn/heuristics.h b/include/triton/dnn/heuristics.h new file mode 100644 index 000000000..4962c55f0 --- /dev/null +++ b/include/triton/dnn/heuristics.h @@ -0,0 +1,109 @@ +#ifndef TRITON_DNN_HEURISTICS_H +#define TRITON_DNN_HEURISTICS_H + +#include +#include "triton/dnn/base.h" + +namespace triton{ +namespace dnn{ + +typedef std::vector params_t; +typedef std::tuple trans_key_t; +typedef std::tuple size_key_t; +static const std::map> params = { + /* NN */ + {trans_key_t(false, false), std::map{ + {size_key_t(16, 16), {4, 4, 16, 8, 16, 2, 2, 1, 1, 8, 32, 4, 8, 1}}, + {size_key_t(16, 32), {2, 8, 16, 8, 32, 2, 2, 1, 1, 16, 32, 4, 8, 1}}, + {size_key_t(16, 64), {4, 4, 16, 4, 64, 2, 2, 1, 1, 8, 32, 8, 4, 1}}, + {size_key_t(16, 128), {4, 4, 16, 16, 128, 2, 2, 1, 2, 16, 32, 4, 8, 1}}, + {size_key_t(32, 16), {4, 8, 32, 8, 16, 2, 2, 1, 1, 8, 32, 4, 8, 1}}, + {size_key_t(32, 32), {4, 8, 32, 8, 32, 2, 2, 1, 1, 8, 32, 4, 8, 1}}, + {size_key_t(32, 64), {8, 4, 32, 8, 64, 2, 2, 1, 1, 4, 32, 4, 8, 1}}, + {size_key_t(32, 128), {8, 4, 32, 16, 128, 2, 2, 1, 4, 16, 32, 8, 4, 1}}, + {size_key_t(64, 16), {8, 8, 64, 4, 16, 2, 2, 1, 1, 4, 32, 8, 4, 1}}, + {size_key_t(64, 32), {8, 8, 64, 8, 32, 2, 2, 1, 1, 4, 32, 4, 8, 1}}, + {size_key_t(64, 64), {8, 8, 64, 16, 64, 2, 2, 2, 1, 8, 32, 4, 8, 1}}, + {size_key_t(64, 128), {16, 4, 64, 16, 128, 2, 2, 2, 2, 8, 32, 8, 4, 1}}, + {size_key_t(128, 16), {8, 8, 128, 8, 16, 2, 2, 2, 1, 8, 32, 8, 4, 1}}, + {size_key_t(128, 32), {8, 8, 128, 16, 32, 2, 2, 2, 1, 8, 32, 4, 8, 1}}, + {size_key_t(128, 64), {8, 8, 128, 32, 64, 2, 2, 2, 2, 16, 32, 4, 8, 1}}, + {size_key_t(128, 128), {8, 8, 128, 32, 128, 2, 2, 1, 4, 16, 32, 4, 8, 1}} + }}, + /* NT */ + {trans_key_t(false, true), std::map{ + {size_key_t(16, 16), {4, 4, 16, 2, 8, 16, 2, 2, 1, 1, 8, 32, 16, 1}}, + {size_key_t(16, 32), {4, 4, 16, 4, 8, 32, 2, 2, 1, 1, 8, 32, 8, 1}}, + {size_key_t(16, 64), {4, 4, 16, 8, 8, 64, 2, 2, 1, 4, 32, 32, 16, 1}}, + {size_key_t(16, 
128), {4, 4, 16, 32, 4, 128, 2, 2, 1, 2, 16, 32, 2, 1}}, + {size_key_t(32, 16), {8, 4, 32, 2, 8, 16, 2, 2, 1, 1, 4, 32, 16, 1}}, + {size_key_t(32, 32), {4, 8, 32, 4, 8, 32, 2, 2, 1, 1, 8, 32, 8, 1}}, + {size_key_t(32, 64), {16, 8, 128, 4, 4, 64, 2, 2, 1, 4, 8, 32, 32, 1}}, + {size_key_t(32, 128), {4, 8, 32, 8, 8, 128, 2, 2, 1, 2, 16, 32, 8, 1}}, + {size_key_t(64, 16), {8, 8, 64, 2, 8, 16, 2, 2, 1, 1, 4, 32, 16, 1}}, + {size_key_t(64, 32), {8, 8, 64, 4, 8, 32, 2, 2, 1, 1, 4, 32, 8, 1}}, + {size_key_t(64, 64), {8, 8, 64, 8, 8, 64, 2, 2, 1, 2, 8, 32, 8, 1}}, + {size_key_t(64, 128), {8, 8, 64, 16, 8, 128, 2, 2, 1, 4, 16, 32, 8, 1}}, + {size_key_t(128, 16), {8, 8, 128, 2, 8, 16, 2, 2, 2, 1, 8, 32, 32, 1}}, + {size_key_t(128, 32), {16, 8, 128, 4, 8, 32, 2, 2, 2, 1, 4, 32, 16, 1}}, + {size_key_t(128, 64), {8, 8, 128, 8, 8, 64, 2, 2, 2, 2, 16, 32, 16, 1}}, + {size_key_t(128, 128), {8, 8, 128, 8, 8, 128, 2, 2, 4, 1, 16, 32, 16, 1}} + }}, + /* TN */ + {trans_key_t(true, false), std::map{ + {size_key_t(16, 16), {8, 16, 16, 16, 2, 2, 1, 1, 4, 8, 32, 2, 8, 1}}, + {size_key_t(16, 32), {4, 16, 8, 32, 2, 2, 1, 1, 8, 4, 32, 4, 8, 1}}, + {size_key_t(16, 64), {4, 16, 4, 64, 2, 2, 1, 1, 8, 4, 32, 8, 4, 1}}, + {size_key_t(16, 128), {16, 16, 16, 128, 2, 2, 1, 2, 4, 8, 32, 4, 8, 1}}, + {size_key_t(32, 16), {4, 32, 8, 16, 2, 2, 1, 1, 8, 4, 32, 4, 8, 1}}, + {size_key_t(32, 32), {8, 32, 8, 32, 2, 2, 1, 1, 4, 8, 32, 4, 8, 1}}, + {size_key_t(32, 64), {8, 32, 8, 64, 2, 2, 1, 1, 4, 8, 32, 4, 8, 1}}, + {size_key_t(32, 128), {32, 32, 64, 128, 2, 2, 2, 2, 4, 8, 32, 2, 8, 1}}, + {size_key_t(64, 16), {8, 64, 8, 16, 2, 2, 1, 1, 4, 8, 32, 4, 8, 1}}, + {size_key_t(64, 32), {8, 64, 8, 32, 2, 2, 1, 1, 4, 8, 32, 4, 8, 1}}, + {size_key_t(64, 64), {16, 64, 16, 64, 2, 2, 2, 1, 4, 8, 32, 4, 8, 1}}, + {size_key_t(64, 128), {32, 64, 16, 128, 2, 2, 2, 2, 4, 8, 32, 8, 4, 1}}, + {size_key_t(128, 16), {16, 128, 16, 16, 2, 2, 2, 1, 4, 8, 32, 4, 8, 1}}, + {size_key_t(128, 32), {32, 128, 32, 32, 2, 2, 4, 1, 4, 8, 32, 4, 8, 1}}, + {size_key_t(128, 64), {32, 128, 32, 64, 2, 2, 4, 1, 4, 8, 32, 4, 8, 1}}, + {size_key_t(128, 128), {32, 128, 32, 128, 2, 2, 4, 1, 4, 8, 32, 4, 8, 1}}, + }}, + /* TT */ + {trans_key_t(true, true), std::map{ + {size_key_t(16, 16), {4, 16, 2, 8, 16, 2, 2, 1, 1, 8, 4, 32, 16, 1}}, + {size_key_t(16, 32), {8, 16, 4, 8, 32, 2, 2, 1, 1, 4, 8, 32, 8, 1}}, + {size_key_t(16, 64), {16, 16, 4, 8, 64, 2, 2, 1, 4, 8, 4, 32, 32, 1}}, + {size_key_t(16, 128), {16, 16, 8, 4, 128, 2, 2, 1, 2, 4, 8, 32, 8, 1}}, + {size_key_t(32, 16), {4, 32, 2, 8, 16, 2, 2, 1, 1, 8, 4, 32, 16, 1}}, + {size_key_t(32, 32), {8, 32, 4, 8, 32, 2, 2, 1, 1, 4, 8, 32, 8, 1}}, + {size_key_t(32, 64), {16, 64, 4, 8, 64, 2, 2, 2, 1, 4, 8, 32, 16, 1}}, + {size_key_t(32, 128), {32, 32, 8, 8, 128, 2, 2, 1, 4, 4, 8, 32, 16, 1}}, + {size_key_t(64, 16), {8, 64, 2, 8, 16, 2, 2, 1, 1, 4, 8, 32, 16, 1}}, + {size_key_t(64, 32), {8, 64, 4, 8, 32, 2, 2, 1, 1, 4, 8, 32, 8, 1}}, + {size_key_t(64, 64), {16, 64, 8, 8, 64, 2, 2, 2, 1, 4, 8, 32, 8, 1}}, + {size_key_t(64, 128), {32, 64, 8, 8, 128, 2, 2, 1, 4, 4, 8, 32, 16, 1}}, + {size_key_t(128, 16), {16, 128, 2, 8, 16, 2, 2, 2, 1, 4, 8, 32, 32, 1}}, + {size_key_t(128, 32), {32, 128, 8, 4, 32, 2, 2, 4, 1, 4, 8, 32, 16, 1}}, + {size_key_t(128, 64), {32, 128, 16, 4, 64, 2, 2, 4, 1, 4, 8, 32, 8, 1}}, + {size_key_t(128, 128), {32, 128, 8, 8, 128, 2, 2, 4, 1, 4, 8, 32, 16, 1}} + }} +}; + +// small search space for partial auto-tuning +inline std::vector dot_search_space(bool AT, bool BT) { + std::vector result; + for(auto x: 
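
For orientation, the tables above are keyed twice: first by the transposition pair (AT, BT), then by the launch tile (TM, TN); each leaf is a params_t whose individual integers are internal JIT metaparameters and are left opaque here. A minimal sketch of consuming them through the two accessors this header defines:

    #include "triton/dnn/heuristics.h"
    using namespace triton::dnn;

    // every pre-tuned candidate for a non-transposed x non-transposed (NN) kernel
    std::vector<params_t> candidates = dot_search_space(false, false);
    // one-shot guess; as written it ignores M/N/K and returns the 128x128 entry
    params_t guess = dot_heuristics(false, false, 4096, 4096, 4096);
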
params.at(trans_key_t{AT, BT})) + result.push_back(x.second); + return result; +} + +// simple parameter heuristics +inline params_t dot_heuristics(bool AT, bool BT, size_t M, size_t N, size_t K) { + size_t TM = 128; + size_t TN = 128; + return params.at(trans_key_t{AT, BT}).at(size_key_t{TM, TN}); +} + +} +} +#endif diff --git a/include/triton/dnn/shift.h b/include/triton/dnn/shift.h index 59d26ab44..d1f1bc972 100644 --- a/include/triton/dnn/shift.h +++ b/include/triton/dnn/shift.h @@ -35,20 +35,18 @@ namespace triton{ namespace dnn{ +enum op_t { + FPROP, + BPROP, + WGRAD +}; + +enum layout_t { + NCHW, + CHWN +}; + class shift: public base { - -public: - enum op_t { - FPROP, - BPROP, - WGRAD - }; - - enum layout_t { - NCHW, - CHWN - }; - private: // initialize and enqueue void init_impl(driver::stream *stream, driver::cu_module *module); @@ -56,7 +54,8 @@ private: void enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, triton::runtime::launch_information info); - std::vector default_params() const; + std::vector search_space() const; + params_t heuristics() const; public: diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index 8224dc846..befb7c842 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -66,8 +66,9 @@ void base::enqueue(driver::stream *stream, std::vector args, a clone->init_impl(stream, (triton::driver::cu_module*)kernel->module()); } /* retrieved compiled template */ - else + else{ jit = m_jit.at(this).get(); + } /* get launch parameters */ driver::kernel* kernel = jit->get_function(name_.c_str()); diff --git a/lib/dnn/gemm.cpp b/lib/dnn/gemm.cpp index 43fc9f173..33f8273bf 100644 --- a/lib/dnn/gemm.cpp +++ b/lib/dnn/gemm.cpp @@ -1,6 +1,7 @@ #include "triton/driver/stream.h" #include "triton/driver/kernel.h" #include "triton/dnn/gemm.h" +#include "triton/dnn/heuristics.h" #include namespace triton{ @@ -147,99 +148,12 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, // small search space for partial auto-tuning std::vector dot::search_space() const { - typedef std::vector params_t; - typedef std::tuple key_t; - static std::vector keys = { - {16, 16}, {16, 32}, {16, 64}, {16, 128}, - {32, 16}, {32, 32}, {32, 64}, {32, 128}, - {64, 16}, {64, 32}, {64, 64}, {64, 128}, - {128, 16},{128, 32},{128, 64},{128, 128} - }; - static std::vector space_nn = { - {4, 4, 16, 8, 16, 2, 2, 1, 1, 8, 32, 4, 8, 1}, - {2, 8, 16, 8, 32, 2, 2, 1, 1, 16, 32, 4, 8, 1}, - {4, 4, 16, 4, 64, 2, 2, 1, 1, 8, 32, 8, 4, 1}, - {4, 4, 16, 16, 128, 2, 2, 1, 2, 16, 32, 4, 8, 1}, - {4, 8, 32, 8, 16, 2, 2, 1, 1, 8, 32, 4, 8, 1}, - {4, 8, 32, 8, 32, 2, 2, 1, 1, 8, 32, 4, 8, 1}, - {8, 4, 32, 8, 64, 2, 2, 1, 1, 4, 32, 4, 8, 1}, - {8, 4, 32, 16, 128, 2, 2, 1, 4, 16, 32, 8, 4, 1}, - {8, 8, 64, 4, 16, 2, 2, 1, 1, 4, 32, 8, 4, 1}, - {8, 8, 64, 8, 32, 2, 2, 1, 1, 4, 32, 4, 8, 1}, - {8, 8, 64, 16, 64, 2, 2, 2, 1, 8, 32, 4, 8, 1}, - {16, 4, 64, 16, 128, 2, 2, 2, 2, 8, 32, 8, 4, 1}, - {8, 8, 128, 8, 16, 2, 2, 2, 1, 8, 32, 8, 4, 1}, - {8, 8, 128, 16, 32, 2, 2, 2, 1, 8, 32, 4, 8, 1}, - {8, 8, 128, 32, 64, 2, 2, 2, 2, 16, 32, 4, 8, 1}, - {8, 8, 128, 32, 128, 2, 2, 1, 4, 16, 32, 4, 8, 1}, - }; - static std::vector space_nt = { - {4, 4, 16, 2, 8, 16, 2, 2, 1, 1, 8, 32, 16, 1}, - {4, 4, 16, 4, 8, 32, 2, 2, 1, 1, 8, 32, 8, 1}, - {4, 4, 16, 8, 8, 64, 2, 2, 1, 4, 32, 32, 16, 1}, - {4, 4, 16, 32, 4, 128, 2, 2, 1, 2, 16, 32, 2, 1}, - {8, 4, 32, 2, 8, 16, 2, 2, 1, 1, 4, 32, 16, 1}, - {4, 8, 32, 4, 8, 32, 2, 2, 1, 1, 8, 32, 8, 1}, - {16, 8, 128, 4, 4, 64, 2, 2, 1, 4, 8, 32, 32, 1}, - {4, 8, 
32, 8, 8, 128, 2, 2, 1, 2, 16, 32, 8, 1}, - {8, 8, 64, 2, 8, 16, 2, 2, 1, 1, 4, 32, 16, 1}, - {8, 8, 64, 4, 8, 32, 2, 2, 1, 1, 4, 32, 8, 1}, - {8, 8, 64, 8, 8, 64, 2, 2, 1, 2, 8, 32, 8, 1}, - {8, 8, 64, 16, 8, 128, 2, 2, 1, 4, 16, 32, 8, 1}, - {8, 8, 128, 2, 8, 16, 2, 2, 2, 1, 8, 32, 32, 1}, - {16, 8, 128, 4, 8, 32, 2, 2, 2, 1, 4, 32, 16, 1}, - {8, 8, 128, 8, 8, 64, 2, 2, 2, 2, 16, 32, 16, 1}, - {8, 8, 128, 8, 8, 128, 2, 2, 4, 1, 16, 32, 16, 1}, - }; - static std::vector space_tn = { - {8, 16, 16, 16, 2, 2, 1, 1, 4, 8, 32, 2, 8, 1}, - {4, 16, 8, 32, 2, 2, 1, 1, 8, 4, 32, 4, 8, 1}, - {4, 16, 4, 64, 2, 2, 1, 1, 8, 4, 32, 8, 4, 1}, - {16, 16, 16, 128, 2, 2, 1, 2, 4, 8, 32, 4, 8, 1}, - {4, 32, 8, 16, 2, 2, 1, 1, 8, 4, 32, 4, 8, 1}, - {8, 32, 8, 32, 2, 2, 1, 1, 4, 8, 32, 4, 8, 1}, - {8, 32, 8, 64, 2, 2, 1, 1, 4, 8, 32, 4, 8, 1}, - {32, 32, 64, 128, 2, 2, 2, 2, 4, 8, 32, 2, 8, 1}, - {8, 64, 8, 16, 2, 2, 1, 1, 4, 8, 32, 4, 8, 1}, - {8, 64, 8, 32, 2, 2, 1, 1, 4, 8, 32, 4, 8, 1}, - {16, 64, 16, 64, 2, 2, 2, 1, 4, 8, 32, 4, 8, 1}, - {32, 64, 16, 128, 2, 2, 2, 2, 4, 8, 32, 8, 4, 1}, - {16, 128, 16, 16, 2, 2, 2, 1, 4, 8, 32, 4, 8, 1}, - {32, 128, 32, 32, 2, 2, 4, 1, 4, 8, 32, 4, 8, 1}, - {32, 128, 32, 64, 2, 2, 4, 1, 4, 8, 32, 4, 8, 1}, - {32, 128, 32, 128, 2, 2, 4, 1, 4, 8, 32, 4, 8, 1}, - }; - static std::vector space_tt = { - {4, 16, 2, 8, 16, 2, 2, 1, 1, 8, 4, 32, 16, 1}, - {8, 16, 4, 8, 32, 2, 2, 1, 1, 4, 8, 32, 8, 1}, - {16, 16, 4, 8, 64, 2, 2, 1, 4, 8, 4, 32, 32, 1}, - {16, 16, 8, 4, 128, 2, 2, 1, 2, 4, 8, 32, 8, 1}, - {4, 32, 2, 8, 16, 2, 2, 1, 1, 8, 4, 32, 16, 1}, - {8, 32, 4, 8, 32, 2, 2, 1, 1, 4, 8, 32, 8, 1}, - {16, 64, 4, 8, 64, 2, 2, 2, 1, 4, 8, 32, 16, 1}, - {32, 32, 8, 8, 128, 2, 2, 1, 4, 4, 8, 32, 16, 1}, - {8, 64, 2, 8, 16, 2, 2, 1, 1, 4, 8, 32, 16, 1}, - {8, 64, 4, 8, 32, 2, 2, 1, 1, 4, 8, 32, 8, 1}, - {16, 64, 8, 8, 64, 2, 2, 2, 1, 4, 8, 32, 8, 1}, - {32, 64, 8, 8, 128, 2, 2, 1, 4, 4, 8, 32, 16, 1}, - {16, 128, 2, 8, 16, 2, 2, 2, 1, 4, 8, 32, 32, 1}, - {32, 128, 8, 4, 32, 2, 2, 4, 1, 4, 8, 32, 16, 1}, - {32, 128, 16, 4, 64, 2, 2, 4, 1, 4, 8, 32, 8, 1}, - {32, 128, 8, 8, 128, 2, 2, 4, 1, 4, 8, 32, 16, 1} - }; - if(!AT_ && !BT_) - return space_nn; - else if(!AT_ && BT_) - return space_nt; - else if(AT_ && !BT_) - return space_tn; - else - return space_tt; + return dot_search_space(AT_, BT_); } // simple parameter heuristics params_t dot::heuristics() const { - return search_space().back(); + return dot_heuristics(AT_, BT_, M_, N_, K_); } } diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index e8a4f3584..e09ce0a58 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -1,5 +1,6 @@ #include #include "triton/dnn/shift.h" +#include "triton/dnn/heuristics.h" #include "triton/tools/bench.hpp" namespace triton{ @@ -513,12 +514,14 @@ else{ } +// small search space for partial auto-tuning +std::vector shift::search_space() const { + return dot_search_space(AT_, BT_); +} + // simple parameter heuristics -std::vector shift::default_params() const { - typedef std::vector params_t; - std::map, params_t> params = { - {{}, {}} - }; +params_t shift::heuristics() const { + return dot_heuristics(AT_, BT_, M_, N_, K_); } diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index 6f25de8da..90f9a0e4c 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -211,9 +211,9 @@ jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t ben best.perf = perf; best.params = params; } - for(size_t i = 0; i < params.size(); i++) - std::cout << ((i==0)?"":", ") << params[i] << 
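
The autotuner whose logging is silenced here is a plain argmax loop: compile and time each candidate parameter vector, keep the best. A condensed sketch of that pattern (tune_res_t with perf/params fields is taken from the jit.cpp diff; benchmark() stands in for the compile-and-time step and is hypothetical):

    triton::runtime::jit::tune_res_t best;
    for (const params_t &params : candidates) {
      double perf = benchmark(params);  // hypothetical: JIT-compile, launch, time
      if (perf > best.perf) {
        best.perf = perf;
        best.params = params;
      }
    }
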
std::flush; - std::cout << ", " << perf << " [ " << best.perf << " ] " << std::endl; +// for(size_t i = 0; i < params.size(); i++) +// std::cout << ((i==0)?"":", ") << params[i] << std::flush; +// std::cout << ", " << perf << " [ " << best.perf << " ] " << std::endl; } }; From c448876178d217f321fe6bf73384b2e2e59d504b Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 22 Jul 2019 19:26:12 -0700 Subject: [PATCH 261/494] better benchmarking --- examples/cpp/CMakeLists.txt | 2 +- examples/cpp/dot.cpp | 69 ++++++++++++++++++++++++--------- examples/cpp/shift.cpp | 53 ++++++++++++++++++++----- include/triton/dnn/heuristics.h | 7 ++-- include/triton/dnn/shift.h | 9 +++++ lib/codegen/selection.cpp | 17 +++++++- lib/codegen/tune.cpp | 4 +- lib/dnn/base.cpp | 2 +- lib/dnn/gemm.cpp | 6 +-- lib/dnn/shift.cpp | 28 +++++++++++-- lib/driver/module.cpp | 2 +- lib/runtime/jit.cpp | 8 ++-- 12 files changed, 159 insertions(+), 48 deletions(-) diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index db1e5421f..3366ba591 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -2,5 +2,5 @@ foreach(PROG dot conv shift) add_executable(${PROG} ${PROG}.cpp) set_target_properties(${PROG} PROPERTIES OUTPUT_NAME ${PROG}) include_directories(/usr/local/cuda/include/) - target_link_libraries(${PROG} triton) + target_link_libraries(${PROG} triton cublas) endforeach(PROG) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index a8723f2e2..2e73a17b0 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -6,6 +6,7 @@ #include "triton/driver/stream.h" #include "triton/dnn/gemm.h" #include "triton/tools/bench.hpp" +#include "cuda.h" template void diff(const std::vector& x, const std::vector& y){ @@ -17,34 +18,63 @@ void diff(const std::vector& x, const std::vector& y){ std::cout << "Pass!" 
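
The diff() helper above is the correctness gate that runs before any timing. A self-contained sketch of the same idea, assuming an elementwise comparison with a relative tolerance (the 1e-2 threshold is illustrative, not from the patch):

    #include <cmath>
    #include <cstdlib>
    #include <iostream>
    #include <vector>

    template <class T>
    void check_close(const std::vector<T> &x, const std::vector<T> &y) {
      for (size_t i = 0; i < x.size(); ++i)
        if (std::fabs(double(x[i] - y[i])) > 1e-2 * std::fabs(double(y[i]))) {
          std::cout << "Fail at " << i << ": " << x[i] << " vs " << y[i] << std::endl;
          std::exit(EXIT_FAILURE);
        }
      std::cout << "Pass!" << std::endl;
    }
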
<< std::endl; } -double do_bench(triton::driver::context* context, bool AT, bool BT, int32_t M, int32_t N, int32_t K){ - typedef float T; +struct perf_t { + double triton; + double cublas; +}; + + +perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K){ + typedef float NumericT; std::string ty = "fp16"; - size_t dt_nbytes = sizeof(T); - std::vector hc(M*N); - std::vector ha(M*K); - std::vector hb(K*N); + size_t dt_nbytes = sizeof(NumericT); + triton::driver::context* context = stream->context(); + std::vector hc(M*N); + std::vector ha(M*K); + std::vector hb(K*N); srand(0); for(size_t i = 0; i < ha.size(); i++) - ha[i] = (T)rand()/RAND_MAX; + ha[i] = (NumericT)rand()/RAND_MAX; for(size_t i = 0; i < hb.size(); i++) - hb[i] = (T)rand()/RAND_MAX; + hb[i] = (NumericT)rand()/RAND_MAX; for(size_t i = 0; i < hc.size(); i++) hc[i] = 0; triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*dt_nbytes); triton::driver::buffer* da = triton::driver::buffer::create(context, ha.size()*dt_nbytes); triton::driver::buffer* db = triton::driver::buffer::create(context, hb.size()*dt_nbytes); - triton::driver::stream* stream = triton::driver::stream::create(context); stream->write(da, true, 0, ha); stream->write(db, true, 0, hb); stream->write(dc, true, 0, hc); stream->synchronize(); triton::dnn::dot dot(M, N, K, AT, BT, ty, ty, 8, 8); - double nanosec = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::PARTIAL_TUNING);}, stream); + // benchmark triton + double triton_ns = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::PARTIAL_TUNING);}, stream); + // benchmark cublas + NumericT alpha = 1; + NumericT beta = 0; + int32_t lda = AT ? K : M; + int32_t ldb = BT ? 
N : K; + int32_t ldc = M; + cublasGemmAlgo_t fastest; +// cublasGemm(HALF_TYPE, stream, AT, BT, M, N, K, +// &alpha, da, lda, +// db, ldb, &beta, +// dc, ldc, &fastest); + double cublas_ns = triton::tools::bench([&]() { cublasGemm(HALF_TYPE, stream, AT, BT, M, N, K, + &alpha, da, lda, + db, ldb, &beta, + dc, ldc, nullptr, CUBLAS_GEMM_DEFAULT_TENSOR_OP); }, stream); + // result + auto tflops = [&](double nanosec) { return dot.num_flops() / nanosec * 1e-3; }; + + perf_t result; + result.cublas = tflops(cublas_ns); + result.triton = tflops(triton_ns); + // clean-up delete dc; delete da; delete db; - return dot.num_flops() / nanosec * 1e-3; + return result; } int main() { @@ -61,21 +91,24 @@ int main() { return oss.str(); } - double perf(triton::driver::context *context){ - return do_bench(context, AT, BT, M, N, K); + perf_t perf(triton::driver::stream *stream){ + return do_bench(stream, AT, BT, M, N, K); } }; // shapes to benchmark std::vector configs = { - {false, false, 4096, 4096, 4096}, - {false, true, 4096, 4096, 4096}, - {true, false, 4096, 4096, 4096}, - {true, true, 4096, 4096, 4096} +// {false, false, 8192, 512, 512}, + {false, true, 8192, 8192, 8192}, + {false, true, 32768, 256, 512} +// {true, false, 8192, 512, 512}, +// {true, true, 8192, 512, 512} }; // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); + triton::driver::stream* stream = triton::driver::stream::create(context); // does the work for(config_t c: configs){ - std::cout << c.repr() << ", " << c.perf(context) << std::endl; + perf_t perf = c.perf(stream); + std::cout << c.repr() << ", " << perf.triton << ", " << perf.cublas << std::endl; } } diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index c4074c722..fc10d4316 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -1,6 +1,7 @@ #include #include #include +#include "cuda.h" #include "triton/runtime/jit.h" #include "triton/driver/backend.h" #include "triton/driver/stream.h" @@ -8,12 +9,20 @@ #include "triton/dnn/shift.h" #include "triton/external/half.hpp" -double do_bench(triton::driver::context* context, +struct perf_t { + double triton; + double cublas; +}; + +perf_t do_bench(triton::driver::stream *stream, int32_t R, int32_t S, int32_t B, int32_t F, int32_t H, int32_t W, int32_t C, triton::dnn::op_t op, triton::dnn::layout_t layout, std::string numeric_t) { typedef float NumericT; + // driver variables + triton::driver::context* context = stream->context(); + // random shifts std::vector shift_h(C); std::vector shift_w(C); @@ -44,7 +53,6 @@ double do_bench(triton::driver::context* context, triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*4); triton::driver::buffer* da = triton::driver::buffer::create(context, ha.size()*sizeof(NumericT)); triton::driver::buffer* db = triton::driver::buffer::create(context, hb.size()*sizeof(NumericT)); - triton::driver::stream* stream = triton::driver::stream::create(context); // initialize host srand(0); for(size_t i = 0; i < ha.size(); i++) @@ -58,8 +66,29 @@ double do_bench(triton::driver::context* context, stream->write(db, true, 0, hb); stream->write(dc, true, 0, hc); stream->synchronize(); - double nanosec = triton::tools::bench([&]() { shift.enqueue(stream, {da, db, dc});}, stream); - return shift.num_flops() / nanosec * 1e-3; + // benchmark triton + double triton_ns = triton::tools::bench([&]() { shift.enqueue(stream, {da, db, dc}, triton::dnn::FULL_TUNING);}, stream); + // benchmark cublas + NumericT alpha = 1; + NumericT 
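
A quick sanity check on the tflops lambda above: num_flops() counts total floating-point operations (2*M*N*K for a GEMM) and bench() returns nanoseconds, so flops/ns is GFLOP/s and the extra 1e-3 yields TFLOP/s. For the 8192^3 config in this file, 2 * 8192^3 is roughly 1.1e12 flops; a run taking 20 ms (2e7 ns, an illustrative figure) would therefore report about 1.1e12 / 2e7 * 1e-3 = 55 TFLOP/s.
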
beta = 0; + cublasGemmAlgo_t fastest; + cublasGemm(HALF_TYPE, stream, shift.AT(), shift.BT(), shift.M(), shift.N(), shift.K(), + &alpha, da, shift.lda(), + db, shift.ldb(), &beta, + dc, shift.ldc(), &fastest); + double cublas_ns = triton::tools::bench([&]() { cublasGemm(HALF_TYPE, stream, shift.AT(), shift.BT(), shift.M(), shift.N(), shift.K(), + &alpha, da, shift.lda(), + db, shift.ldb(), + &beta, dc, shift.ldc(), nullptr, fastest); }, stream); + // result + auto tflops = [&](double nanosec) { return shift.num_flops() / nanosec * 1e-3; }; + perf_t result; + result.cublas = tflops(cublas_ns); + result.triton = tflops(triton_ns); + delete da; + delete db; + delete dc; + return result; } int main() { @@ -86,13 +115,15 @@ int main() { return oss.str(); } - double perf(triton::driver::context *context){ - return do_bench(context, R, S, B, F, H, W, C, op, layout, ty); + perf_t perf(triton::driver::stream *stream){ + return do_bench(stream, R, S, B, F, H, W, C, op, layout, ty); } }; // shapes to benchmark std::vector configs; - std::vector resnet18 = { + std::vector resnet18 = + { + {128, 128, 32, 32, 3, 3, 128, 1, 1}, {128, 128, 32, 32, 3, 3, 128, 1, 1}, {128, 128, 32, 32, 3, 3, 256, 2, 2}, {128, 256, 16, 16, 3, 3, 256, 1, 1}, @@ -108,7 +139,11 @@ int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); - for(config_t c: configs) - std::cout << c.repr() << ", " << c.perf(context) << std::endl; + triton::driver::stream *stream = triton::driver::stream::create(context); + for(config_t c: configs){ + std::string repr = c.repr(); + perf_t perf = c.perf(stream); + std::cout << repr << ", " << perf.triton << ", " << perf.cublas << std::endl; + } } diff --git a/include/triton/dnn/heuristics.h b/include/triton/dnn/heuristics.h index 4962c55f0..31e38841d 100644 --- a/include/triton/dnn/heuristics.h +++ b/include/triton/dnn/heuristics.h @@ -99,9 +99,10 @@ inline std::vector dot_search_space(bool AT, bool BT) { // simple parameter heuristics inline params_t dot_heuristics(bool AT, bool BT, size_t M, size_t N, size_t K) { - size_t TM = 128; - size_t TN = 128; - return params.at(trans_key_t{AT, BT}).at(size_key_t{TM, TN}); +// size_t TM = 128; +// size_t TN = 128; + return {4, 8, 256, 8, 8, 64, 2, 2, 2, 2, 32, 32, 16, 1}; +// return params.at(trans_key_t{AT, BT}).at(size_key_t{TM, TN}); } } diff --git a/include/triton/dnn/shift.h b/include/triton/dnn/shift.h index d1f1bc972..25b9547f3 100644 --- a/include/triton/dnn/shift.h +++ b/include/triton/dnn/shift.h @@ -73,6 +73,15 @@ public: // accessors size_t c_size(); std::vector c_shapes(); + // equivalent GEMM + bool AT() const; + bool BT() const; + size_t M() const; + size_t N() const; + size_t K() const; + size_t lda() const; + size_t ldb() const; + size_t ldc() const; // number of flops size_t num_flops() const; // source diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 8a6e74f33..e55e55c87 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -781,9 +781,24 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & if(auto *x = dynamic_cast(ins)) { distributed_tile* ptr = (distributed_tile*)tmap_.at(x->get_pointer_operand()); tile *value = tmap_.at(x->get_value_operand()); + distributed_tile *mask_tile; + if(mask) + mask_tile = (distributed_tile*)tmap_.at(ins->get_mask_pred()); ptr->for_each([&](indices_t idx){ set_mask_insert_pt(idx); - StoreInst *store = new StoreInst(value->get_value(idx), ptr->get_value(idx)); + Value *ptr_value 
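
Worth noting in the shift.cpp hunk above: cuBLAS is invoked twice. The first cublasGemm call passes &fastest so the wrapper can probe and record the fastest algorithm outside the timed region; the benchmark loop then replays only that algorithm. Schematically (cublasGemm is the project's own wrapper, with the argument order shown in the diff, not the raw cuBLAS API):

    cublasGemmAlgo_t fastest;
    // probe once, outside the timed region; fills `fastest`
    cublasGemm(HALF_TYPE, stream, AT, BT, M, N, K,
               &alpha, da, lda, db, ldb, &beta, dc, ldc, &fastest);
    // time only the replay of the selected algorithm
    double cublas_ns = triton::tools::bench([&]() {
      cublasGemm(HALF_TYPE, stream, AT, BT, M, N, K,
                 &alpha, da, lda, db, ldb, &beta, dc, ldc, nullptr, fastest);
    }, stream);
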
= ptr->get_value(idx); + Value *value_value = value->get_value(idx); + Instruction *store; +// if(mask){ +// Value *pred_value = mask_tile->get_value(idx); +// value_value = builder.CreateVectorSplat(1, value_value); +// pred_value = builder.CreateVectorSplat(1, pred_value); +// Type *ptr_ty = PointerType::get(value_value->getType(), ptr_value->getType()->getPointerAddressSpace()); +// ptr_value = builder.CreateBitCast(ptr_value, ptr_ty); +// store = builder.CreateMaskedStore(value_value, ptr_value, 1, pred_value); +// } +// else + store = new StoreInst(value_value, ptr_value); builder.Insert(store); }); } diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 3f5119577..b649a4d7c 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -215,7 +215,7 @@ void tune::run(ir::module &mod) { node_t node = *nodes_.begin(); if(fragments_[node] == STRIDED_SCAN) { ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 1, 1); - ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 2, 64); + ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 4, 32); connected_components(node, {nts, mts}, {"nts", "mts"}, nodes_, dependencies_, group_id++); nts->set_value(1); } @@ -239,7 +239,7 @@ void tune::run(ir::module &mod) { size_t addr_space = ptr_ty->get_pointer_address_space(); if(addr_space < 4){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 4, 8)); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 8, 8)); *params_.at(i).at("nts.d0") = *tmp; } } diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index befb7c842..1058e751e 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -36,7 +36,7 @@ void base::enqueue(driver::stream *stream, std::vector args, a /* the current template has not already been compiled */ if(m_jit.find(this) == m_jit.end()) { base* clone = this->clone(); - jit = m_jit.emplace(clone, std::unique_ptr(new rt::jit(ctx))).first->second.get(); + jit = m_jit.emplace(clone, std::unique_ptr(new rt::jit(ctx, 8))).first->second.get(); std::ostringstream oss; clone->triton_c_src(oss); std::string src = oss.str(); diff --git a/lib/dnn/gemm.cpp b/lib/dnn/gemm.cpp index 33f8273bf..a2d636129 100644 --- a/lib/dnn/gemm.cpp +++ b/lib/dnn/gemm.cpp @@ -129,10 +129,8 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, c = dot()" + usea + ", " + useb + R"(, c); pa = pa + TK)" + lda0 + R"(; pb = pb + TK)" + ldb0 + R"(; - int1 checka[)" + AS + R"(] = k > bound; - int1 checkb[)" + BS + R"(] = k > bound; - @checka a = *pa; - @checkb b = *pb; + a = *pa; + b = *pb; } int32 rxc[TM] = ridx*TM + (0 ... TM); int32 ryc[TN] = ridy*TN + (0 ... TN); diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index e09ce0a58..fd45ea805 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -180,6 +180,30 @@ size_t shift::num_flops() const { return 2.*M_*N_*K_; } +bool shift::AT() const +{ return AT_; } + +bool shift::BT() const +{ return BT_; } + +size_t shift::M() const +{ return M_; } + +size_t shift::N() const +{ return N_; } + +size_t shift::K() const +{ return K_; } + +size_t shift::lda() const +{ return AT_ ? K_ : M_; } + +size_t shift::ldb() const +{ return BT_ ? 
N_ : K_; } + +size_t shift::ldc() const +{ return M_; } + bool shift::operator <(const base& other) const{ auto *y = dynamic_cast(&other); if(!y) @@ -265,10 +289,6 @@ void shift::enqueue_impl(driver::stream *stream, driver::kernel *kernel, kernel->setArg(30, (int32_t)grid[2]); if(locks_) ((driver::cu_buffer*)locks_)->set_zero(stream, 2*max_locks_*4); - if(op_ == FPROP || op_ == BPROP){ - size_t c_nbytes = (c_ty_ == "fp16") ? 2 : 4; - ((driver::cu_buffer*)c)->set_zero(stream, c_size()*c_nbytes); - } stream->enqueue(kernel, grid, {info.num_threads, 1, 1}); } diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 4ff863666..08c41a7b8 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -255,7 +255,7 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ -// std::cout << source << std::endl; + // std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index 90f9a0e4c..f39f6c397 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -164,8 +164,8 @@ jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t ben auto mps = passes_0.tune.get_params(tt_module_0); // iterate over parameters tune_res_t best; - std::mutex mutex; // update_best + std::mutex mutex; auto update_best = [&](const std::vector params){ std::map> errors; unsigned i = 0; @@ -211,9 +211,9 @@ jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t ben best.perf = perf; best.params = params; } -// for(size_t i = 0; i < params.size(); i++) -// std::cout << ((i==0)?"":", ") << params[i] << std::flush; -// std::cout << ", " << perf << " [ " << best.perf << " ] " << std::endl; + for(size_t i = 0; i < params.size(); i++) + std::cout << ((i==0)?"":", ") << params[i] << std::flush; + std::cout << ", " << perf << " [ " << best.perf << " ] " << std::endl; } }; From 38b3771c26911eff6c903fad66fc7263cd826a39 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 23 Jul 2019 14:43:18 -0700 Subject: [PATCH 262/494] some reassociation --- examples/cpp/dot.cpp | 6 +- include/triton/codegen/reassociate.h | 35 +++++ include/triton/dnn/heuristics.h | 8 +- include/triton/ir/builder.h | 4 +- include/triton/ir/instructions.h | 80 ++++++----- include/triton/runtime/jit.h | 6 +- lib/codegen/alignment_info.cpp | 30 ++--- lib/codegen/reassociate.cpp | 185 +++++++++++++++++++++++++ lib/codegen/selection.cpp | 194 ++++++++++----------------- lib/dnn/base.cpp | 1 + lib/dnn/gemm.cpp | 12 +- lib/driver/module.cpp | 2 +- lib/ir/builder.cpp | 12 +- lib/ir/instructions.cpp | 66 +++++---- lib/ir/print.cpp | 8 +- lib/lang/expression.cpp | 63 ++++----- lib/lang/statement.cpp | 42 +++--- 17 files changed, 476 insertions(+), 278 deletions(-) create mode 100644 include/triton/codegen/reassociate.h create mode 100644 lib/codegen/reassociate.cpp diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 2e73a17b0..6ec396a1a 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -48,7 +48,7 @@ perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int stream->synchronize(); triton::dnn::dot dot(M, N, K, 
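
The accessors added just above expose shift as an equivalent GEMM so it can be benchmarked head-to-head with cuBLAS. Under the column-major convention implied here, a leading dimension is the row count of the matrix as stored, which is why it flips with the transposition flag; condensing the definitions from the diff:

    size_t lda = AT ? K : M;  // A is M x K untransposed, K x M transposed
    size_t ldb = BT ? N : K;  // B is K x N untransposed, N x K transposed
    size_t ldc = M;           // C is always M x N
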
AT, BT, ty, ty, 8, 8); // benchmark triton - double triton_ns = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::PARTIAL_TUNING);}, stream); + double triton_ns = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::NO_TUNING);}, stream); // benchmark cublas NumericT alpha = 1; NumericT beta = 0; @@ -98,8 +98,8 @@ int main() { // shapes to benchmark std::vector configs = { // {false, false, 8192, 512, 512}, - {false, true, 8192, 8192, 8192}, - {false, true, 32768, 256, 512} + {false, true, 8192, 8192, 8192} +// {false, true, 32768, 256, 512} // {true, false, 8192, 512, 512}, // {true, true, 8192, 512, 512} }; diff --git a/include/triton/codegen/reassociate.h b/include/triton/codegen/reassociate.h new file mode 100644 index 000000000..3360a15fe --- /dev/null +++ b/include/triton/codegen/reassociate.h @@ -0,0 +1,35 @@ +#ifndef TDL_INCLUDE_IR_CODEGEN_REASSOCIATE_H +#define TDL_INCLUDE_IR_CODEGEN_REASSOCIATE_H + +#include +#include +#include +#include + +namespace triton { + +// forward declaration +namespace ir { +class module; +class value; +class builder; +class instruction; +} + +namespace codegen{ + +class reassociate { +private: + ir::instruction* is_bin_add(ir::value *x); + ir::value *reorder_op(ir::value *value, ir::builder &builder, std::vector& to_delete, ir::value *&noncst, ir::value *&cst); + +public: + reassociate(); + void run(ir::module& module); +}; + +} + +} + +#endif diff --git a/include/triton/dnn/heuristics.h b/include/triton/dnn/heuristics.h index 31e38841d..bd9bc50aa 100644 --- a/include/triton/dnn/heuristics.h +++ b/include/triton/dnn/heuristics.h @@ -99,10 +99,10 @@ inline std::vector dot_search_space(bool AT, bool BT) { // simple parameter heuristics inline params_t dot_heuristics(bool AT, bool BT, size_t M, size_t N, size_t K) { -// size_t TM = 128; -// size_t TN = 128; - return {4, 8, 256, 8, 8, 64, 2, 2, 2, 2, 32, 32, 16, 1}; -// return params.at(trans_key_t{AT, BT}).at(size_key_t{TM, TN}); + size_t TM = 128; + size_t TN = 128; +// return {4, 8, 256, 8, 8, 64, 2, 2, 2, 2, 32, 32, 16, 1}; + return params.at(trans_key_t{AT, BT}).at(size_key_t{TM, TN}); } } diff --git a/include/triton/ir/builder.h b/include/triton/ir/builder.h index 8dd60fdff..598c82454 100644 --- a/include/triton/ir/builder.h +++ b/include/triton/ir/builder.h @@ -57,8 +57,8 @@ public: value* create_cond_br(value *cond, basic_block* if_dest, basic_block* else_dest); value* create_ret_void(); // Tile-level control flow - value *create_mask(value *pred, const std::string &name = ""); - value *create_merge(value *mask_true, value *value_true, value *mask_false, value *value_false, const std::string &name = ""); +// value *create_mask(value *pred, const std::string &name = ""); +// value *create_merge(value *mask_true, value *value_true, value *mask_false, value *value_false, const std::string &name = ""); // Cast instructions value *create_cast(cast_inst::op_t op, value *v, type *dst_ty, const std::string &name = ""); value* create_si_to_fp(value *src, type *dst_ty, const std::string &name = ""); diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index 99ef1d1be..a3d56309c 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -20,10 +20,10 @@ class context; class result_reference; class instruction: public user{ public: - struct mask_info_t { - value *pred; - value *else_value; - }; +// struct mask_info_t { +// value *pred; +// value *else_value; +// }; virtual std::string repr_impl() const = 
0; @@ -37,11 +37,11 @@ public: const basic_block *get_parent() const { return parent_; } basic_block *get_parent() { return parent_; } void erase_from_parent(); - // mask - void set_mask_pred(value *pred) { resize_hidden(1); set_operand(get_num_operands(), pred); } - value* get_mask_pred() const { if(get_num_hidden() == 0) return nullptr; return get_operand(get_num_operands()); } - void set_mask_else(value *x) { resize_hidden(2); set_operand(get_num_operands() + 1, x); } - value* get_mask_else() const { if(get_num_hidden() < 2) return nullptr; return get_operand(get_num_operands() + 1); } +// // mask +// void set_mask_pred(value *pred) { resize_hidden(1); set_operand(get_num_operands(), pred); } +// value* get_mask_pred() const { if(get_num_hidden() == 0) return nullptr; return get_operand(get_num_operands()); } +// void set_mask_else(value *x) { resize_hidden(2); set_operand(get_num_operands() + 1, x); } +// value* get_mask_else() const { if(get_num_hidden() < 2) return nullptr; return get_operand(get_num_operands() + 1); } // helpers bool has_tile_result_or_op(); // repr @@ -55,8 +55,8 @@ public: unsigned get_metadata(ir::metadata::kind_t kind) { return metadatas_[kind];} private: basic_block *parent_; - value *pred_; - value *mask_pred_; +// value *pred_; +// value *mask_pred_; std::vector results_; std::map metadatas_; }; @@ -335,34 +335,34 @@ public: const std::string &name = "", instruction *next = nullptr); }; -// mask -class mask_inst: public instruction { -private: - std::string repr_impl() const { return "mask"; } - mask_inst(ir::value *pred, const std::string &name, instruction *next); +//// mask +//class mask_inst: public instruction { +//private: +// std::string repr_impl() const { return "mask"; } +// mask_inst(ir::value *pred, const std::string &name, instruction *next); -public: - static mask_inst* create(ir::value *pred, const std::string &name = "", instruction *next = nullptr); -}; +//public: +// static mask_inst* create(ir::value *pred, const std::string &name = "", instruction *next = nullptr); +//}; -// merge -class psi_inst: public instruction { -private: - std::string repr_impl() const { return "merge"; } - psi_inst(ir::value *mask_true, ir::value *value_true, - ir::value *mask_false, ir::value *value_false, - const std::string &name, instruction *next); +//// merge +//class psi_inst: public instruction { +//private: +// std::string repr_impl() const { return "merge"; } +// psi_inst(ir::value *mask_true, ir::value *value_true, +// ir::value *mask_false, ir::value *value_false, +// const std::string &name, instruction *next); -public: - static psi_inst* create(ir::value *mask_true, ir::value *value_true, - ir::value *mask_false, ir::value *value_false, - const std::string &name = "", instruction *next = nullptr); - ir::value *get_mask_true() { return get_operand(0); } - ir::value *get_value_true() { return get_operand(1); } - ir::value *get_mask_false() { return get_operand(2); } - ir::value *get_value_false() { return get_operand(3); } +//public: +// static psi_inst* create(ir::value *mask_true, ir::value *value_true, +// ir::value *mask_false, ir::value *value_false, +// const std::string &name = "", instruction *next = nullptr); +// ir::value *get_mask_true() { return get_operand(0); } +// ir::value *get_value_true() { return get_operand(1); } +// ir::value *get_mask_false() { return get_operand(2); } +// ir::value *get_value_false() { return get_operand(3); } -}; +//}; //===----------------------------------------------------------------------===// // 
getelementptr_inst classes @@ -408,9 +408,14 @@ private: public: // accessors value *get_pointer_operand() { return get_operand(0); } + value *get_mask() const; + value *set_mask(value *mask); // factory method static load_inst* create(value *ptr, const std::string &name = "", instruction *next = nullptr); + +private: + value *mask_; }; class store_inst: public instruction{ @@ -421,9 +426,14 @@ private: public: value *get_pointer_operand() { return get_operand(0); } value *get_value_operand() { return get_operand(1); } + value *get_mask() const; + value *set_mask(value *mask); // factory method static store_inst* create(value* ptr, value *v, const std::string &name = "", instruction *next = nullptr); + +private: + ir::value *mask_; }; //===----------------------------------------------------------------------===// diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index f1da2a5a2..f3a054716 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -18,6 +18,7 @@ #include "triton/codegen/shmem_info.h" #include "triton/codegen/shmem_barriers.h" #include "triton/codegen/alignment_info.h" +#include "triton/codegen/reassociate.h" #include "triton/codegen/target.h" #include "triton/codegen/vectorize.h" #include "triton/runtime/launch_info.h" @@ -70,10 +71,12 @@ public: void target_independent(ir::module &module) { optimize_dot.run(module); optimize_trans.run(module); -// ir::print(module, std::cout); + ir::print(module, std::cout); + reassociate_.run(module); } void target_dependent(ir::module &module) { + ir::print(module, std::cout); alignment_info.run(module); if(target_->is_gpu()){ shmem_info.run(module); @@ -95,6 +98,7 @@ public: codegen::optimize_cse optimize_cse; codegen::optimize_trans optimize_trans; codegen::alignment_info alignment_info; + codegen::reassociate reassociate_; codegen::target* target_; }; diff --git a/lib/codegen/alignment_info.cpp b/lib/codegen/alignment_info.cpp index ccd9778d1..8c330a13f 100644 --- a/lib/codegen/alignment_info.cpp +++ b/lib/codegen/alignment_info.cpp @@ -72,11 +72,11 @@ alignment_info::cst_info alignment_info::populate_is_constant(ir::value *v) { cst_info rhs = populate_is_constant(rhs_op); return cache({std::min(lhs.num_cst, rhs.num_cst), 0}); } - if(auto *x = dynamic_cast(v)){ - cst_info value_true = populate_is_constant(x->get_value_true()); - cst_info value_false = populate_is_constant(x->get_value_false()); - return cache({std::min(value_true.num_cst, value_false.num_cst), 0}); - } +// if(auto *x = dynamic_cast(v)){ +// cst_info value_true = populate_is_constant(x->get_value_true()); +// cst_info value_false = populate_is_constant(x->get_value_false()); +// return cache({std::min(value_true.num_cst, value_false.num_cst), 0}); +// } if(v->get_type()->is_tile_ty()) return cache({0, 0}); if(auto *x = dynamic_cast(v)){ @@ -144,11 +144,11 @@ unsigned alignment_info::populate_max_contiguous(ir::value *v){ return cache(gcd(lhs_max_contiguous, rhs_cst_info.num_cst)); } } - if(auto *x = dynamic_cast(v)){ - int value_true = populate_max_contiguous(x->get_value_true()); - int value_false = populate_max_contiguous(x->get_value_false()); - return cache(std::min(value_true, value_false)); - } +// if(auto *x = dynamic_cast(v)){ +// int value_true = populate_max_contiguous(x->get_value_true()); +// int value_false = populate_max_contiguous(x->get_value_false()); +// return cache(std::min(value_true, value_false)); +// } if(auto *x = dynamic_cast(v)){ ir::value* lhs = x->get_operand(0); ir::value* rhs = 
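
As rewired in jit.h above, reassociation now runs as the final target-independent pass, so the alignment analysis and the GPU-only shared-memory passes see its output (the ir::print calls added alongside it read like temporary debugging). Condensed pass order, with names from the header:

    // target-independent
    optimize_dot.run(module);
    optimize_trans.run(module);
    reassociate_.run(module);
    // target-dependent
    alignment_info.run(module);
    if (target_->is_gpu()) {
      shmem_info.run(module);
      // ... remaining shared-memory and vectorization passes
    }
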
x->get_operand(1); @@ -240,11 +240,11 @@ unsigned alignment_info::populate_starting_multiple(ir::value *v){ if(auto *x = dynamic_cast(v)){ return cache(v->get_type()->get_tile_shapes()[0]->get_value()); } - if(auto *x = dynamic_cast(v)){ - int value_true = populate_starting_multiple(x->get_value_true()); - int value_false = populate_starting_multiple(x->get_value_false()); - return cache(gcd(value_true, value_false)); - } +// if(auto *x = dynamic_cast(v)){ +// int value_true = populate_starting_multiple(x->get_value_true()); +// int value_false = populate_starting_multiple(x->get_value_false()); +// return cache(gcd(value_true, value_false)); +// } if(auto *x = dynamic_cast(v)){ // put a conservative initial value in phi node to avoid infinite recursion unsigned result = 1; diff --git a/lib/codegen/reassociate.cpp b/lib/codegen/reassociate.cpp new file mode 100644 index 000000000..a8794d422 --- /dev/null +++ b/lib/codegen/reassociate.cpp @@ -0,0 +1,185 @@ +#include +#include "triton/codegen/reassociate.h" +#include "triton/ir/module.h" +#include "triton/ir/function.h" +#include "triton/ir/basic_block.h" +#include "triton/ir/instructions.h" +#include "triton/ir/cfg.h" + +namespace triton { +namespace codegen{ + +//inline Constant *get_gep_cst_offset(GetElementPtrInst *gep){ +// std::vector idx_vals; +// std::transform(gep->idx_begin(), gep->idx_end(), +// std::back_inserter(idx_vals), +// [](Value* x){ return x;}); +// if(idx_vals.size() > 1) +// return nullptr; +// Value *idx = idx_vals[0]; +// if(isa(idx)) +// return idx; +// if(Instruction *BinOp = is_bin_add(idx)){ +// Value *LHS = BinOp->getOperand(0); +// Value *RHS = BinOp->getOperand(1); +// if(Constant* Res = dyn_cast(LHS)) +// return Res; +// if(Constant* Res = dyn_cast(RHS)) +// return Res; +// } +// return nullptr; +//} + + +inline ir::instruction* reassociate::is_bin_add(ir::value *x) { + ir::binary_operator *bin_op = dynamic_cast(x); + bool is_bin_add = bin_op && bin_op->get_op()==llvm::Instruction::Add; + if(is_bin_add) + return (ir::instruction*)x; + return nullptr; +} + +inline bool is_cst(ir::value *x) { + if(dynamic_cast(x)) + return true; + if(auto *v = dynamic_cast(x)) + return is_cst(v->get_operand(0)); + return false; +} + + +inline ir::value *reassociate::reorder_op(ir::value *old_value, + ir::builder &builder, + std::vector& to_delete, + ir::value *&noncst, + ir::value *&cst){ + // value doesn't change by default + ir::value* new_value = old_value; + cst = nullptr; + noncst = old_value; + + // handle retiling + if(ir::instruction* op = dynamic_cast(old_value)){ + auto shapes = op->get_type()->get_tile_shapes(); + ir::value *old_arg = op->get_operand(0); + ir::value *new_arg = reorder_op(old_arg, builder, to_delete, noncst, cst); + // retile(x + y) = retile(x) + retile(y) + if(ir::instruction* bin_add = is_bin_add(new_arg)) + if(cst){ + ir::value *old_lhs = bin_add->get_operand(0); + ir::value *old_rhs = bin_add->get_operand(1); + if(dynamic_cast(op)){ + builder.set_insert_point(op); + ir::value *new_lhs = builder.create_reshape(old_lhs, shapes); + ir::value *new_rhs = builder.create_reshape(old_rhs, shapes); + new_value = builder.create_add(new_lhs, new_rhs, op->get_name()); + } + if(dynamic_cast(op)){ + builder.set_insert_point(op); + ir::value *new_lhs = builder.create_broadcast(old_lhs, shapes); + ir::value *new_rhs = builder.create_broadcast(old_rhs, shapes); + new_value = builder.create_add(new_lhs, new_rhs, op->get_name()); + } + if(dynamic_cast(op)){ + builder.set_insert_point(op); + ir::value *new_lhs = 
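
reorder_op above encodes two rewrite families. Retiling distributes over addition: reshape(x + y) becomes reshape(x) + reshape(y), and likewise for broadcast and splat. That exposes the addends to constant hoisting, where (cst + x) + y is rebracketed as cst + (x + y) so the constant part of an index floats to the top. On plain integers the second identity is just:

    // before: the constant is buried in the left subtree
    int before = (4 + i) + j;
    // after reassociation: same value, constant isolated for the GEP offset
    int after = 4 + (i + j);
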
builder.create_splat(old_lhs, shapes); + ir::value *new_rhs = builder.create_splat(old_rhs, shapes); + new_value = builder.create_add(new_lhs, new_rhs, op->get_name()); + } + to_delete.push_back(op); + } + } + + // handle binary addition + if(ir::instruction* op = is_bin_add(old_value)){ + builder.set_insert_point(op); + std::string name = op->get_name(); + ir::value *lhs = reorder_op(op->get_operand (0), builder, to_delete, noncst, cst); + ir::value *rhs = reorder_op(op->get_operand(1), builder, to_delete, noncst, cst); + builder.set_insert_point(op); + // (x + y) + z + if(ir::instruction* bin_lhs = is_bin_add(lhs)){ + ir::value *llhs = bin_lhs->get_operand(0); + ir::value *rlhs = bin_lhs->get_operand(1); + // (cst + x) + y -> cst + (x + y) + if(is_cst(llhs)) + new_value = builder.create_add(llhs, builder.create_add(rlhs, rhs), name); + // (x + cst) + y -> cst + (x + y) + if(is_cst(rlhs)) + new_value = builder.create_add(rlhs, builder.create_add(llhs, rhs), name); + if(new_value != op) + to_delete.push_back(bin_lhs); + } + // x + (y + z) + if(ir::instruction* bin_rhs = is_bin_add(rhs)){ + ir::value *lrhs = bin_rhs->get_operand(0); + ir::value *rrhs = bin_rhs->get_operand(1); + // x + (cst + y) -> cst + (x + y) + if(is_cst(lrhs)) + new_value = builder.create_add(lrhs, builder.create_add(rrhs, lhs), name, cst); + // x + (y + cst) -> cst + (x + y) + if(is_cst(rrhs)) + new_value = builder.create_add(rrhs, builder.create_add(lrhs, lhs), name, cst); + if(new_value != op) + to_delete.push_back(bin_rhs); + } + } + + // extract constant and non-constant + if(ir::instruction *bin_add = is_bin_add(new_value)){ + ir::value *new_lhs = bin_add->get_operand(0); + ir::value *new_rhs = bin_add->get_operand(1); + if(is_cst(new_lhs)){ + cst = new_lhs; + noncst = new_rhs; + } + if(is_cst(new_rhs)){ + cst = new_rhs; + noncst = new_lhs; + } + } + + // clean-up if some re-ordering happened + if(old_value != new_value){ + old_value->replace_all_uses_with(new_value); + if(auto *x = dynamic_cast(old_value)) + to_delete.push_back(x); + } + + return new_value; +} + +reassociate::reassociate() { + +} + +void reassociate::run(ir::module &mod) { + ir::builder &builder = mod.get_builder(); + std::vector to_delete; + for(ir::function *fn: mod.get_function_list()){ + std::vector rpo = ir::cfg::reverse_post_order(fn); + bool done = false; + do{ + // iterate through blocks + for(ir::basic_block *block: rpo){ + // iterate through instruction + for(ir::instruction *i: block->get_inst_list()){ + if(auto *gep = dynamic_cast(i)){ + std::vector idxs(gep->idx_begin(), gep->idx_end()); + ir::value *cst = nullptr; + ir::value *noncst = idxs[0]; + reorder_op(noncst, builder, to_delete, noncst, cst); +// std::cout << gep->get_name() << " " << noncst << " " << cst << std::endl; + } + } + done = true; + } + }while(!done); + } + // erase dead code + for(ir::instruction* i: to_delete) + i->erase_from_parent(); +} + +} +} diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index e55e55c87..1f573c3f6 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -236,39 +236,6 @@ Constant *selection::llvm_constant(ir::constant *cst, LLVMContext &ctx) { throw std::runtime_error("unknown conversion from ir::constant to Constant"); } -inline Value *Reassociate(Value *V, IRBuilder<> &Builder){ - BinaryOperator *BinOp = dyn_cast(V); - if(BinOp) - if(BinOp->getOpcode()==BinaryOperator::BinaryOps::Add){ - Value *LHS = Reassociate(BinOp->getOperand(0), Builder); - Value *RHS = Reassociate(BinOp->getOperand(1), Builder); - 
if(BinaryOperator *BinLHS = dyn_cast(LHS)) - if(BinLHS->getOpcode()==BinaryOperator::BinaryOps::Add){ - Value *LLHS = BinLHS->getOperand(0); - Value *RLHS = BinLHS->getOperand(1); - // (cst + x) + y -> cst + (x + y) - if(isa(LLHS)) - return Builder.CreateAdd(LLHS, Builder.CreateAdd(RLHS, RHS)); - // (x + cst) + y -> cst + (x + y) - if(isa(RLHS)) - return Builder.CreateAdd(RLHS, Builder.CreateAdd(LLHS, RHS)); - } - if(BinaryOperator *BinRHS = dyn_cast(RHS)) - if(BinRHS->getOpcode()==BinaryOperator::BinaryOps::Add){ - Value *LRHS = BinRHS->getOperand(0); - Value *RRHS = BinRHS->getOperand(1); - // x + (cst + y) -> cst + (x + y) - if(isa(LRHS)) - return Builder.CreateAdd(LRHS, Builder.CreateAdd(RRHS, LHS)); - // x + (cst + y) -> cst + (x + y) - if(isa(LRHS)) - return Builder.CreateAdd(RRHS, Builder.CreateAdd(LRHS, LHS)); - } - return BinOp; - } - return V; -} - /* convert ir::instruction to llvm::Instruction */ Instruction *selection::llvm_inst(ir::instruction *inst, std::function value, IRBuilder<> &builder) { LLVMContext & ctx = builder.getContext(); @@ -320,13 +287,14 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::functionget_op(), arg, dst_ty)); } if(auto* ii = dynamic_cast(inst)){ + // get pointer + Value *ptr = value(ii->get_operand(0)); + // reassociate first index std::vector idx_vals; std::transform(ii->idx_begin(), ii->idx_end(), std::back_inserter(idx_vals), [&value](ir::value* x){ return value(x);}); Type *source_ty = type(ii->get_source_elt_ty()->get_scalar_ty()); - idx_vals[0] = Reassociate(idx_vals[0], builder); - Value *arg = value(ii->get_operand(0)); - return builder.Insert(GetElementPtrInst::CreateInBounds(source_ty, arg, idx_vals)); + return builder.Insert(GetElementPtrInst::CreateInBounds(source_ty, ptr, idx_vals)); } if(ir::load_inst* ii = dynamic_cast(inst)){ Value *ptr = value(ii->get_pointer_operand()); @@ -612,7 +580,7 @@ void selection::create_grids(std::vector &grids, std::function bind_references = [&](ir::value *v) { // skip - if(!v->get_type()->is_tile_ty() || !seen.insert(v).second || dynamic_cast(v)) + if(!v->get_type()->is_tile_ty() || !seen.insert(v).second) return; // recurse if(auto *user = dynamic_cast(v)) @@ -767,40 +735,32 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & Module *module = block->getModule(); LLVMContext &ctx = builder.getContext(); Function *fn = block->getParent(); - ir::value *mask = ins->get_mask_pred(); - BasicBlock *last_block = nullptr; - auto set_mask_insert_pt = [&](indices_t idx){ - if(mask){ - distributed_tile *mask_tile = (distributed_tile*)tmap_.at(ins->get_mask_pred()); - BasicBlock *block = pmap_.at({mask_tile, idx}); - builder.SetInsertPoint(block->getTerminator()); - last_block = last_block_.at({mask_tile, idx}); - } - }; // store if(auto *x = dynamic_cast(ins)) { distributed_tile* ptr = (distributed_tile*)tmap_.at(x->get_pointer_operand()); tile *value = tmap_.at(x->get_value_operand()); - distributed_tile *mask_tile; - if(mask) - mask_tile = (distributed_tile*)tmap_.at(ins->get_mask_pred()); - ptr->for_each([&](indices_t idx){ - set_mask_insert_pt(idx); - Value *ptr_value = ptr->get_value(idx); - Value *value_value = value->get_value(idx); - Instruction *store; -// if(mask){ -// Value *pred_value = mask_tile->get_value(idx); -// value_value = builder.CreateVectorSplat(1, value_value); -// pred_value = builder.CreateVectorSplat(1, pred_value); -// Type *ptr_ty = PointerType::get(value_value->getType(), ptr_value->getType()->getPointerAddressSpace()); -// ptr_value = 
builder.CreateBitCast(ptr_value, ptr_ty); -// store = builder.CreateMaskedStore(value_value, ptr_value, 1, pred_value); -// } -// else - store = new StoreInst(value_value, ptr_value); - builder.Insert(store); - }); + ir::value *mask = x->get_mask(); + if(mask) { + distributed_tile* preds = (distributed_tile*)tmap_.at(mask); + ptr->for_each([&](indices_t idx){ + BasicBlock *mask_then_bb = BasicBlock::Create(ctx, "mask_then", fn); + BasicBlock *mask_done_bb = BasicBlock::Create(ctx, "mask_done", fn); + builder.CreateCondBr(preds->get_value(idx), mask_then_bb, mask_done_bb); + builder.SetInsertPoint(mask_then_bb); + builder.CreateStore(value->get_value(idx), ptr->get_value(idx)); + builder.CreateBr(mask_done_bb); + builder.SetInsertPoint(mask_done_bb); + }); + } + else { + ptr->for_each([&](indices_t idx){ + if(GetElementPtrInst *gep = dyn_cast(ptr->get_value(idx))) + if(BinaryOperator *binop = dyn_cast(*gep->idx_begin())){ + std::cout << isa(binop->getOperand(0)) << " " << isa(binop->getOperand(1)) << std::endl; + } + builder.CreateStore(value->get_value(idx), ptr->get_value(idx)); + }); + } } else { if(auto *x = dynamic_cast(ins)){ @@ -875,49 +835,49 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & result->set_value(idx, builder.CreateAdd(bin, offset)); }); } - // mask - else if(dynamic_cast(ins)) { - distributed_tile* pred = (distributed_tile*)tmap_.at(ins->get_operand(0)); - distributed_tile* mask_tile_true = (distributed_tile*)tmap_.at(ins->get_result(0)); - distributed_tile* mask_tile_false = (distributed_tile*)tmap_.at(ins->get_result(1)); - pred->for_each([&](indices_t idx){ - BasicBlock *mask_then_bb = BasicBlock::Create(ctx, "mask_then", fn); - BasicBlock* mask_else_bb = BasicBlock::Create(ctx, "mask_else", fn); - BasicBlock *mask_done_bb = BasicBlock::Create(ctx, "mask_done", fn); - builder.CreateCondBr(pred->get_value(idx), mask_then_bb, mask_else_bb); - builder.SetInsertPoint(mask_then_bb); - builder.CreateBr(mask_done_bb); - builder.SetInsertPoint(mask_else_bb); - builder.CreateBr(mask_done_bb); - builder.SetInsertPoint(mask_done_bb); - pmap_.insert({{mask_tile_true, idx}, mask_then_bb}); - pmap_.insert({{mask_tile_false, idx}, mask_else_bb}); - last_block_.insert({{mask_tile_true, idx}, mask_done_bb}); - last_block_.insert({{mask_tile_false, idx}, mask_done_bb}); - }); - } - // merge - else if(auto *merge = dynamic_cast(ins)) { - distributed_tile* mask_tile_true = (distributed_tile*)tmap_.at(merge->get_mask_true()); - distributed_tile *value_tile_true = (distributed_tile*)tmap_.at(merge->get_value_true()); - distributed_tile* mask_tile_false = (distributed_tile*)tmap_.at(merge->get_mask_false()); - distributed_tile *value_tile_false = (distributed_tile*)tmap_.at(merge->get_value_false()); - result->for_each([&](indices_t idx){ - BasicBlock *block_true = pmap_.at({mask_tile_true, idx}); - Value *value_true = value_tile_true->get_value(idx); - BasicBlock *block_false = pmap_.at({mask_tile_false, idx}); - Value *value_false = value_tile_false->get_value(idx); - BasicBlock *block_done = last_block_.at({mask_tile_true, idx}); - if(block_done->getTerminator()) - builder.SetInsertPoint(block_done->getTerminator()); - else - builder.SetInsertPoint(block_done); - PHINode *phi = builder.CreatePHI(value_true->getType(), 2); - phi->addIncoming(value_true, block_true); - phi->addIncoming(value_false,block_false); - result->set_value(idx, phi); - }); - } +// // mask +// else if(dynamic_cast(ins)) { +// distributed_tile* pred = 
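
Note the lowering strategy visible above: instead of LLVM's masked-store intrinsic (left commented out), each scalar store is guarded by its own branch. Condensed from the loop body, the per-element sequence is:

    // one tile index; pred/val/ptr are the element's predicate, value, pointer
    BasicBlock *mask_then_bb = BasicBlock::Create(ctx, "mask_then", fn);
    BasicBlock *mask_done_bb = BasicBlock::Create(ctx, "mask_done", fn);
    builder.CreateCondBr(pred, mask_then_bb, mask_done_bb);
    builder.SetInsertPoint(mask_then_bb);
    builder.CreateStore(val, ptr);         // only reached when pred is true
    builder.CreateBr(mask_done_bb);
    builder.SetInsertPoint(mask_done_bb);  // both paths rejoin here
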
(distributed_tile*)tmap_.at(ins->get_operand(0)); +// distributed_tile* mask_tile_true = (distributed_tile*)tmap_.at(ins->get_result(0)); +// distributed_tile* mask_tile_false = (distributed_tile*)tmap_.at(ins->get_result(1)); +// pred->for_each([&](indices_t idx){ +// BasicBlock *mask_then_bb = BasicBlock::Create(ctx, "mask_then", fn); +// BasicBlock* mask_else_bb = BasicBlock::Create(ctx, "mask_else", fn); +// BasicBlock *mask_done_bb = BasicBlock::Create(ctx, "mask_done", fn); +// builder.CreateCondBr(pred->get_value(idx), mask_then_bb, mask_else_bb); +// builder.SetInsertPoint(mask_then_bb); +// builder.CreateBr(mask_done_bb); +// builder.SetInsertPoint(mask_else_bb); +// builder.CreateBr(mask_done_bb); +// builder.SetInsertPoint(mask_done_bb); +// pmap_.insert({{mask_tile_true, idx}, mask_then_bb}); +// pmap_.insert({{mask_tile_false, idx}, mask_else_bb}); +// last_block_.insert({{mask_tile_true, idx}, mask_done_bb}); +// last_block_.insert({{mask_tile_false, idx}, mask_done_bb}); +// }); +// } +// // merge +// else if(auto *merge = dynamic_cast(ins)) { +// distributed_tile* mask_tile_true = (distributed_tile*)tmap_.at(merge->get_mask_true()); +// distributed_tile *value_tile_true = (distributed_tile*)tmap_.at(merge->get_value_true()); +// distributed_tile* mask_tile_false = (distributed_tile*)tmap_.at(merge->get_mask_false()); +// distributed_tile *value_tile_false = (distributed_tile*)tmap_.at(merge->get_value_false()); +// result->for_each([&](indices_t idx){ +// BasicBlock *block_true = pmap_.at({mask_tile_true, idx}); +// Value *value_true = value_tile_true->get_value(idx); +// BasicBlock *block_false = pmap_.at({mask_tile_false, idx}); +// Value *value_false = value_tile_false->get_value(idx); +// BasicBlock *block_done = last_block_.at({mask_tile_true, idx}); +// if(block_done->getTerminator()) +// builder.SetInsertPoint(block_done->getTerminator()); +// else +// builder.SetInsertPoint(block_done); +// PHINode *phi = builder.CreatePHI(value_true->getType(), 2); +// phi->addIncoming(value_true, block_true); +// phi->addIncoming(value_false,block_false); +// result->set_value(idx, phi); +// }); +// } // reshape else if(dynamic_cast(ins)) { ir::value* in = ins->get_operand(0); @@ -934,7 +894,6 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & // splat else if(dynamic_cast(ins)) { result->for_each([&](indices_t idx) { - set_mask_insert_pt(idx); result->set_value(idx, llvm_value(ins->get_operand(0), builder)); }); } @@ -1163,12 +1122,9 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & unsigned max_contiguous = axis_info_->get_max_contiguous(ptr); unsigned alignment = std::min(starting_multiple, max_contiguous); unsigned vector_size = std::min(result->axis(0).contiguous, alignment); -// vector_size = result->axis(0).contiguous; -// vector_size = 1; std::map packets; distributed_tile *TP = (distributed_tile*)tmap_.at(ld->get_pointer_operand()); result->for_each([&](indices_t idx){ - set_mask_insert_pt(idx); unsigned linear = result->get_linear_index(idx); unsigned id = linear / vector_size; if(linear % vector_size == 0){ @@ -1189,20 +1145,14 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & else return llvm_value(x, builder); }; - set_mask_insert_pt(idx); result->set_value(idx, llvm_inst(ins, value, builder)); }); } } - if(mask){ - builder.SetInsertPoint(block); - if(last_block) - builder.SetInsertPoint(last_block); - } } void selection::lower_instruction(ir::instruction *src, 
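
The load path just above derives its vectorization width from the alignment pass: the widest legal vector is bounded both by how many consecutive elements a thread owns (axis contiguity) and by what alignment can be proven for the pointer. Condensing the computation (get_starting_multiple is inferred from the pass's populate_starting_multiple and is not shown in this hunk):

    unsigned starting_multiple = axis_info_->get_starting_multiple(ptr);
    unsigned max_contiguous    = axis_info_->get_max_contiguous(ptr);
    unsigned alignment   = std::min(starting_multiple, max_contiguous);
    unsigned vector_size = std::min(result->axis(0).contiguous, alignment);
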
IRBuilder<> &builder) { - if(src->has_tile_result_or_op() || (src->get_mask_pred() && src->get_mask_pred()->get_type()->is_tile_ty())) { + if(src->has_tile_result_or_op()) { lower_tile_instruction(src, builder); } else { @@ -1310,7 +1260,7 @@ void selection::run(ir::module &src, Module &dst) { dst_builder.SetInsertPoint(parent); for(ir::instruction *i: block->get_inst_list()){ BasicBlock *current = dst_builder.GetInsertBlock(); - bool phi_inserted = (dynamic_cast(i) || dynamic_cast(i)) && !current->empty(); + bool phi_inserted = (dynamic_cast(i)) && !current->empty(); if(phi_inserted && current->getFirstNonPHI()) dst_builder.SetInsertPoint(&*current->getFirstNonPHI()); lower_instruction(i, dst_builder); diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index 1058e751e..d7ffc11d2 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -60,6 +60,7 @@ void base::enqueue(driver::stream *stream, std::vector args, a } else { params_t params = heuristics(); +// params_t params = jit->get_valid(name_.c_str(), src.c_str()); jit->add_module(name_.c_str(), src.c_str(), params); } triton::driver::kernel* kernel = jit->get_function(name_.c_str()); diff --git a/lib/dnn/gemm.cpp b/lib/dnn/gemm.cpp index a2d636129..23e62ae76 100644 --- a/lib/dnn/gemm.cpp +++ b/lib/dnn/gemm.cpp @@ -123,8 +123,8 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, )" + b_ty_ + R"(* pb[)" + BS + "] = B + rkb" + bcb0 + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; int1 checka[)" + AS + R"(] = (rka < K))" + bca0 + " && (rxa < M)" + bca1 + R"(; int1 checkb[)" + BS + R"(] = (rkb < K))" + bcb0 + " && (ryb < N)" + bcb1 + R"(; - )" + a_ty_ + R"( a[)" + AS + R"(] = checka ? *pa : 0; - )" + b_ty_ + R"( b[)" + BS + R"(] = checkb ? *pb : 0; + )" + a_ty_ + R"( a[)" + AS + R"(] = *pa; + )" + b_ty_ + R"( b[)" + BS + R"(] = *pb; for(int32 k = K; k > 0; k = k - TK){ c = dot()" + usea + ", " + useb + R"(, c); pa = pa + TK)" + lda0 + R"(; @@ -132,15 +132,17 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, a = *pa; b = *pb; } - int32 rxc[TM] = ridx*TM + (0 ... TM); - int32 ryc[TN] = ridy*TN + (0 ... TN); + int32 rxc[TM] = ridx * TM + (0 ... TM); + int32 ryc[TN] = ridy * TN + (0 ... 
TN); int1 checkc0[TM] = rxc < M; int1 checkc1[TN] = ryc < N; int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - @checkc *pc = c; + *pc = c; } )"; + + std::cout << res << std::endl; os << res; } diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 08c41a7b8..551a55a20 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -255,7 +255,7 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ - // std::cout << source << std::endl; + std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index 54321bd81..bff68e083 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -90,13 +90,13 @@ value *builder::create_ret_void() { // tile-level control-flow instructions //===----------------------------------------------------------------------===// -value *builder::create_mask(value *pred, const std::string &name){ - return insert(mask_inst::create(pred, name)); -} +//value *builder::create_mask(value *pred, const std::string &name){ +// return insert(mask_inst::create(pred, name)); +//} -value *builder::create_merge(value *mask_true, value *value_true, value *mask_false, value *value_false, const std::string &name) { - return insert(psi_inst::create(mask_true, value_true, mask_false, value_false, name)); -} +//value *builder::create_merge(value *mask_true, value *value_true, value *mask_false, value *value_false, const std::string &name) { +// return insert(psi_inst::create(mask_true, value_true, mask_false, value_false, name)); +//} //===----------------------------------------------------------------------===// diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 063dbffc9..3968ce0e6 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -349,31 +349,31 @@ cond_branch_inst::cond_branch_inst(basic_block *if_dst, basic_block *else_dst, v } // mask_inst -mask_inst::mask_inst(value *pred, const std::string &name, instruction *next) - : instruction(pred->get_type(), 1, 2, name, next) { - set_operand(0, pred); -} +//mask_inst::mask_inst(value *pred, const std::string &name, instruction *next) +// : instruction(pred->get_type(), 1, 2, name, next) { +// set_operand(0, pred); +//} -mask_inst* mask_inst::create(value *pred, const std::string &name, instruction *next) { - return new mask_inst(pred, name, next); -} +//mask_inst* mask_inst::create(value *pred, const std::string &name, instruction *next) { +// return new mask_inst(pred, name, next); +//} -// merge_inst -psi_inst::psi_inst(value *mask_true, value *value_true, - value *mask_false, value *value_false, - const std::string &name, instruction *next) - : instruction(value_true->get_type(), 4, 1, name, next) { - set_operand(0, mask_true); - set_operand(1, value_true); - set_operand(2, mask_false); - set_operand(3, value_false); -} +//// merge_inst +//psi_inst::psi_inst(value *mask_true, value *value_true, +// value *mask_false, value *value_false, +// const std::string &name, instruction *next) +// : instruction(value_true->get_type(), 4, 1, name, next) { +// 
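
For reference while the mask/merge machinery is being commented out: the two guarded forms that the GEMM template gives up above, written in the Triton-C syntax used throughout this series (shapes and broadcast order illustrative), are the ones a later patch in the series reintroduces as explicit masked loads/stores:

    int1 checka[TM, TK] = (rka < K)[newaxis, :] && (rxa < M)[:, newaxis];
    fp32 a[TM, TK] = checka ? *pa : 0;   // guarded tile load
    @checkc *pc = c;                     // predicated tile store
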
set_operand(0, mask_true); +// set_operand(1, value_true); +// set_operand(2, mask_false); +// set_operand(3, value_false); +//} -psi_inst* psi_inst::create(value *mask_true, value *value_true, - value *mask_false, value *value_false, - const std::string &name, instruction *next) { - return new psi_inst(mask_true, value_true, mask_false, value_false, name, next); -} +//psi_inst* psi_inst::create(value *mask_true, value *value_true, +// value *mask_false, value *value_false, +// const std::string &name, instruction *next) { +// return new psi_inst(mask_true, value_true, mask_false, value_false, name, next); +//} @@ -449,7 +449,16 @@ type *load_inst::get_pointee_type(type *ty) { } load_inst::load_inst(value *ptr, const std::string &name, instruction *next) - : unary_inst(get_pointee_type(ptr->get_type()), ptr, name, next) { + : unary_inst(get_pointee_type(ptr->get_type()), ptr, name, next), mask_(nullptr){ +} + +value *load_inst::get_mask() const { + return mask_; +} + +value *load_inst::set_mask(value *mask) { + mask_ = mask; + return this; } load_inst* load_inst::create(value *ptr, const std::string &name, instruction *next) { @@ -458,11 +467,20 @@ load_inst* load_inst::create(value *ptr, const std::string &name, instruction *n // store store_inst::store_inst(value *ptr, value *v, const std::string &name, instruction *next) - : instruction(type::get_void_ty(ptr->get_type()->get_context()), 2, 1, name, next) { + : instruction(type::get_void_ty(ptr->get_type()->get_context()), 2, 1, name, next), mask_(nullptr) { set_operand(0, ptr); set_operand(1, v); } +value *store_inst::get_mask() const { + return mask_; +} + +value *store_inst::set_mask(value *mask) { + mask_ = mask; + return this; +} + store_inst* store_inst::create(value *ptr, value *v, const std::string &name, instruction *next) { return new store_inst(ptr, v, name, next); } diff --git a/lib/ir/print.cpp b/lib/ir/print.cpp index f4f117ff7..b1e43c0e2 100644 --- a/lib/ir/print.cpp +++ b/lib/ir/print.cpp @@ -35,8 +35,12 @@ void print(module &mod, std::ostream& os) { os << std::endl; for(ir::instruction *inst: block->get_inst_list()){ os << " "; - if(ir::value *pred = inst->get_mask_pred()) - os << "@" << get_name(pred, cnt++) << " "; + if(auto *x = dynamic_cast(inst)) + if(ir::value *mask = x->get_mask()) + os << "@" << get_name(mask, cnt++) << " "; + if(auto *x = dynamic_cast(inst)) + if(ir::value *mask = x->get_mask()) + os << "@" << get_name(mask, cnt++) << " "; unsigned num_results = inst->get_num_results(); for(unsigned i = 0; i < num_results; i++){ os << get_name(inst->get_result(i), cnt++); diff --git a/lib/lang/expression.cpp b/lib/lang/expression.cpp index 1e0536801..b21a3b4c7 100644 --- a/lib/lang/expression.cpp +++ b/lib/lang/expression.cpp @@ -255,37 +255,38 @@ ir::value* cast_expression::codegen(ir::module *mod) const{ /* Conditional expression */ ir::value *conditional_expression::codegen(ir::module *mod) const{ - ir::builder &builder = mod->get_builder(); - ir::basic_block::inst_list_t &instructions = builder.get_insert_block()->get_inst_list(); - ir::value *pred = cond_->codegen(mod); - ir::instruction *mask = (ir::instruction*)builder.create_mask(pred); - /* true value */ - ir::value *true_mask = mask->get_result(0); - auto it_true_begin = instructions.end(); - it_true_begin--; - ir::value *true_value = true_value_->codegen(mod); - implicit_broadcast(mod, pred, true_value); - it_true_begin++; - auto it_true_end = instructions.end(); - for(auto it = it_true_begin; it != it_true_end; it++) -// if(!dynamic_cast(*it)) - 
(*it)->set_mask_pred(true_mask); - /* false value */ - ir::value *false_mask = mask->get_result(1); - auto it_false_begin = instructions.end(); - it_false_begin--; - ir::value *false_value = false_value_->codegen(mod); - implicit_broadcast(mod, pred, false_value); - bool is_float, is_ptr, is_int, is_signed; - implicit_cast(builder, true_value, false_value, is_float, is_ptr, is_int, is_signed); - it_false_begin++; - auto it_false_end = instructions.end(); - for(auto it = it_false_begin; it != it_false_end; it++) -// if(!dynamic_cast(*it)) - (*it)->set_mask_pred(false_mask); - /* psi */ - ir::value *result = builder.create_merge(true_mask, true_value, false_mask, false_value); - return result; + throw std::runtime_error("not implemented"); +// ir::builder &builder = mod->get_builder(); +// ir::basic_block::inst_list_t &instructions = builder.get_insert_block()->get_inst_list(); +// ir::value *pred = cond_->codegen(mod); +// ir::instruction *mask = (ir::instruction*)builder.create_mask(pred); +// /* true value */ +// ir::value *true_mask = mask->get_result(0); +// auto it_true_begin = instructions.end(); +// it_true_begin--; +// ir::value *true_value = true_value_->codegen(mod); +// implicit_broadcast(mod, pred, true_value); +// it_true_begin++; +// auto it_true_end = instructions.end(); +// for(auto it = it_true_begin; it != it_true_end; it++) +//// if(!dynamic_cast(*it)) +// (*it)->set_mask_pred(true_mask); +// /* false value */ +// ir::value *false_mask = mask->get_result(1); +// auto it_false_begin = instructions.end(); +// it_false_begin--; +// ir::value *false_value = false_value_->codegen(mod); +// implicit_broadcast(mod, pred, false_value); +// bool is_float, is_ptr, is_int, is_signed; +// implicit_cast(builder, true_value, false_value, is_float, is_ptr, is_int, is_signed); +// it_false_begin++; +// auto it_false_end = instructions.end(); +// for(auto it = it_false_begin; it != it_false_end; it++) +//// if(!dynamic_cast(*it)) +// (*it)->set_mask_pred(false_mask); +// /* psi */ +// ir::value *result = builder.create_merge(true_mask, true_value, false_mask, false_value); +// return result; } /* Assignment expression */ diff --git a/lib/lang/statement.cpp b/lib/lang/statement.cpp index 2e6460620..ab0a55828 100644 --- a/lib/lang/statement.cpp +++ b/lib/lang/statement.cpp @@ -29,34 +29,22 @@ ir::value* compound_statement::codegen(ir::module* mod) const{ /* Expression statement */ ir::value* expression_statement::codegen(ir::module *mod) const{ ir::builder &builder = mod->get_builder(); - ir::basic_block *block = builder.get_insert_block(); - if(pred_) { - // generate mask - ir::value *pred = pred_->codegen(mod); - ir::mask_inst *mask = (ir::mask_inst*)builder.create_mask(pred); - // generate expression - unsigned szbegin = block->get_inst_list().size(); - ir::value *expr = expr_->codegen(mod); - ir::basic_block::iterator begin = block->begin(); - std::advance(begin, szbegin); - // set mask - ir::type *ty = expr->get_type(); - for(auto it = begin; it != builder.get_insert_point(); it++) - (*it)->set_mask_pred(mask->get_result(0)); -// if(auto *itn = dynamic_cast(expr)) -// itn->set_mask_pred(mask->get_result(0)); - if(ty->is_void_ty()) - return expr; - // merge with psi - ir::psi_inst *psi = (ir::psi_inst*)builder.create_merge(mask->get_result(0), expr, - mask->get_result(1), ir::undef_value::get(ty)); - if(assignment_expression *assignment = dynamic_cast(expr_)){ - std::string name = ((named_expression*)assignment->lvalue())->id()->name(); - mod->set_value(name, psi); - } - return psi; + 
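
The replacement that follows collapses all of the above into one rule: if the predicated expression is a load or store, attach the predicate to it directly; otherwise fall back to a select against undef. Condensed into a hedged sketch (the set_mask accessors are the ones added to load_inst and store_inst in this patch; the wrapper function itself is illustrative):

    ir::value *lower_predicated(ir::builder &b, ir::value *pred, ir::value *expr) {
      if (auto *x = dynamic_cast<ir::load_inst*>(expr))
        return x->set_mask(pred);                        // v = @pred *ptr
      if (auto *x = dynamic_cast<ir::store_inst*>(expr))
        return x->set_mask(pred);                        // @pred *ptr = v
      return b.create_select(pred, expr,                 // @pred x = y
                             ir::undef_value::get(expr->get_type()));
    }
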
ir::value *expr = expr_->codegen(mod); + if(pred_ == nullptr) + return expr; + ir::value *pred = pred_->codegen(mod); + if(auto *x = dynamic_cast(expr)) + x->set_mask(pred); + else if(auto *x = dynamic_cast(expr)) + x->set_mask(pred); + else + expr = builder.create_select(pred, expr, ir::undef_value::get(expr->get_type())); + if(assignment_expression *assignment = dynamic_cast(expr_)) + if(auto *named = dynamic_cast(assignment)){ + std::string name = named->lvalue()->id()->name(); + mod->set_value(name, expr); } - return expr_->codegen(mod); + return expr; } /* For statement */ From 397d76156b2efeca5f8cdce6f94e3e9cb2b16c8e Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 23 Jul 2019 17:21:24 -0700 Subject: [PATCH 263/494] progress on re-association --- include/triton/codegen/alignment_info.h | 1 + include/triton/codegen/reassociate.h | 7 ++- include/triton/codegen/tune.h | 2 +- include/triton/ir/instructions.h | 26 +++++++++++ include/triton/runtime/jit.h | 9 ++-- lib/codegen/alignment_info.cpp | 14 +++++- lib/codegen/reassociate.cpp | 62 ++++++++++++++++++++----- lib/codegen/selection.cpp | 22 ++++++++- lib/codegen/tune.cpp | 20 +++++--- lib/ir/constant.cpp | 2 + lib/ir/instructions.cpp | 28 ++++++++++- 11 files changed, 167 insertions(+), 26 deletions(-) diff --git a/include/triton/codegen/alignment_info.h b/include/triton/codegen/alignment_info.h index d2d72e176..92a15efeb 100644 --- a/include/triton/codegen/alignment_info.h +++ b/include/triton/codegen/alignment_info.h @@ -32,6 +32,7 @@ public: void run(ir::module &mod); unsigned get_starting_multiple(ir::value* v) const; unsigned get_max_contiguous(ir::value* v) const; + void copy(ir::value *dst, ir::value *src); private: std::map is_constant_; diff --git a/include/triton/codegen/reassociate.h b/include/triton/codegen/reassociate.h index 3360a15fe..9be8ed6bd 100644 --- a/include/triton/codegen/reassociate.h +++ b/include/triton/codegen/reassociate.h @@ -18,14 +18,19 @@ class instruction; namespace codegen{ +class tune; + class reassociate { private: ir::instruction* is_bin_add(ir::value *x); ir::value *reorder_op(ir::value *value, ir::builder &builder, std::vector& to_delete, ir::value *&noncst, ir::value *&cst); public: - reassociate(); + reassociate(tune *params); void run(ir::module& module); + +private: + tune* params_; }; } diff --git a/include/triton/codegen/tune.h b/include/triton/codegen/tune.h index 098106149..7f393a3a0 100644 --- a/include/triton/codegen/tune.h +++ b/include/triton/codegen/tune.h @@ -44,7 +44,7 @@ public: ir::metaparameter* get_param(ir::value *value, const std::string &key) { return params_[value][key]; } fragment_t get_fragment(ir::value *value, unsigned ax) { return fragments_.at({value, ax}); } unsigned get_param_group(ir::value *value, unsigned ax); - void copy(ir::value *dst, ir::value *src) { params_[dst] = params_[src]; groups_[dst] = groups_[src]; } + void copy(ir::value *dst, ir::value *src); bool check_constraints(std::map> &errors); void run(ir::module &mod); void init(ir::module &mod); diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index a3d56309c..1bce5bd47 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -2,6 +2,7 @@ #define TDL_INCLUDE_IR_INSTRUCTIONS_H #include +#include "triton/ir/constant.h" #include "triton/ir/value.h" #include "triton/ir/type.h" #include "triton/ir/metadata.h" @@ -651,6 +652,31 @@ public: static vectorize_inst* create(value *arg, const std::string &name = "", instruction *next = nullptr); 
}; +// On NVIDIA, implementation is such that +// constant_range = nv_dynamic_range_idx + nv_static_range_idx +// so as to enable re-association on nv_static_range_idx which is constant +class nv_dynamic_range_idx_inst: public instruction { +private: + nv_dynamic_range_idx_inst(type *ty, const std::string &name, instruction *next); + std::string repr_impl() const { return "nv_dynamic_range_idx"; } + +public: + static nv_dynamic_range_idx_inst* create(type *ty, const std::string &name = "", instruction *next = nullptr); +}; + +class nv_static_range_idx: public constant { +private: + nv_static_range_idx(constant_range *range); + +public: + static nv_static_range_idx *get(constant_range* range); + constant_range* get_range() const; + +private: + constant_range *range_; +}; + + } } diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index f3a054716..8a1157940 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -66,18 +66,19 @@ public: optimize_cse(), optimize_trans(), alignment_info(), + reassociate(&tune), target_(target) { } void target_independent(ir::module &module) { optimize_dot.run(module); optimize_trans.run(module); - ir::print(module, std::cout); - reassociate_.run(module); +// ir::print(module, std::cout); } void target_dependent(ir::module &module) { - ir::print(module, std::cout); alignment_info.run(module); + reassociate.run(module); + ir::print(module, std::cout); if(target_->is_gpu()){ shmem_info.run(module); shmem_liveness.run(module); @@ -98,7 +99,7 @@ public: codegen::optimize_cse optimize_cse; codegen::optimize_trans optimize_trans; codegen::alignment_info alignment_info; - codegen::reassociate reassociate_; + codegen::reassociate reassociate; codegen::target* target_; }; diff --git a/lib/codegen/alignment_info.cpp b/lib/codegen/alignment_info.cpp index 8c330a13f..7c40229a2 100644 --- a/lib/codegen/alignment_info.cpp +++ b/lib/codegen/alignment_info.cpp @@ -228,6 +228,12 @@ unsigned alignment_info::populate_starting_multiple(ir::value *v){ if(auto *x = dynamic_cast(v)){ return cache(x->get_first()->get_value()); } + if(auto *x = dynamic_cast(v)){ + return cache(128); + } + if(auto *x = dynamic_cast(v)){ + return cache(x->get_range()->get_first()->get_value()); + } if(auto *x = dynamic_cast(v)){ int lhs = populate_starting_multiple(x->get_operand(0)); int rhs = populate_starting_multiple(x->get_operand(1)); @@ -280,6 +286,12 @@ unsigned alignment_info::get_max_contiguous(ir::value* v) const { return max_contiguous_.at(v); } +void alignment_info::copy(ir::value *dst, ir::value *src) { + starting_multiple_[dst] = starting_multiple_[src]; + max_contiguous_[dst] = max_contiguous_[src]; + is_constant_[dst] = is_constant_[src]; +} + ///TODO: This doesn't seem to work in DOT-NN, DOT-TT, DOT-TN void alignment_info::run(ir::module &mod) { // populate constant @@ -301,7 +313,7 @@ void alignment_info::run(ir::module &mod) { for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i: block->get_inst_list()){ populate_max_contiguous(i); -// std::cout << i->get_name() << " " << is_constant_.at(i).num_cst << " " << max_contiguous_.at(i) << " " << starting_multiple_.at(i) << std::endl; + std::cout << i->get_name() << " " << is_constant_.at(i).num_cst << " " << max_contiguous_.at(i) << " " << starting_multiple_.at(i) << std::endl; } } diff --git a/lib/codegen/reassociate.cpp b/lib/codegen/reassociate.cpp index a8794d422..2ca8828d7 100644 --- a/lib/codegen/reassociate.cpp +++ b/lib/codegen/reassociate.cpp @@ -5,6 +5,7 @@ #include 
"triton/ir/basic_block.h" #include "triton/ir/instructions.h" #include "triton/ir/cfg.h" +#include "triton/codegen/tune.h" namespace triton { namespace codegen{ @@ -68,25 +69,32 @@ inline ir::value *reassociate::reorder_op(ir::value *old_value, if(cst){ ir::value *old_lhs = bin_add->get_operand(0); ir::value *old_rhs = bin_add->get_operand(1); + ir::value *new_lhs = nullptr; + ir::value *new_rhs = nullptr; if(dynamic_cast(op)){ builder.set_insert_point(op); - ir::value *new_lhs = builder.create_reshape(old_lhs, shapes); - ir::value *new_rhs = builder.create_reshape(old_rhs, shapes); + new_lhs = builder.create_reshape(old_lhs, shapes); + new_rhs = builder.create_reshape(old_rhs, shapes); new_value = builder.create_add(new_lhs, new_rhs, op->get_name()); } if(dynamic_cast(op)){ builder.set_insert_point(op); - ir::value *new_lhs = builder.create_broadcast(old_lhs, shapes); - ir::value *new_rhs = builder.create_broadcast(old_rhs, shapes); + new_lhs = builder.create_broadcast(old_lhs, shapes); + new_rhs = builder.create_broadcast(old_rhs, shapes); new_value = builder.create_add(new_lhs, new_rhs, op->get_name()); } if(dynamic_cast(op)){ builder.set_insert_point(op); - ir::value *new_lhs = builder.create_splat(old_lhs, shapes); - ir::value *new_rhs = builder.create_splat(old_rhs, shapes); + new_lhs = builder.create_splat(old_lhs, shapes); + new_rhs = builder.create_splat(old_rhs, shapes); new_value = builder.create_add(new_lhs, new_rhs, op->get_name()); } - to_delete.push_back(op); + if(new_value != old_value){ + params_->copy(new_value, old_value); + params_->copy(new_lhs, old_value); + params_->copy(new_rhs, old_value); + to_delete.push_back(op); + } } } @@ -107,8 +115,9 @@ inline ir::value *reassociate::reorder_op(ir::value *old_value, // (x + cst) + y -> cst + (x + y) if(is_cst(rlhs)) new_value = builder.create_add(rlhs, builder.create_add(llhs, rhs), name); - if(new_value != op) + if(new_value != old_value){ to_delete.push_back(bin_lhs); + } } // x + (y + z) if(ir::instruction* bin_rhs = is_bin_add(rhs)){ @@ -123,6 +132,11 @@ inline ir::value *reassociate::reorder_op(ir::value *old_value, if(new_value != op) to_delete.push_back(bin_rhs); } + if(new_value != old_value){ + params_->copy(new_value, old_value); + params_->copy(((ir::instruction*)new_value)->get_operand(0), old_value); + params_->copy(((ir::instruction*)new_value)->get_operand(1), old_value); + } } // extract constant and non-constant @@ -149,13 +163,39 @@ inline ir::value *reassociate::reorder_op(ir::value *old_value, return new_value; } -reassociate::reassociate() { - -} +reassociate::reassociate(tune* params) + : params_(params) +{ } void reassociate::run(ir::module &mod) { ir::builder &builder = mod.get_builder(); std::vector to_delete; + + // constant_range -> nv_dynamic_range_idx + nv_static_range_idx + for(ir::function *fn: mod.get_function_list()){ + std::vector ranges; + std::vector rpo = ir::cfg::reverse_post_order(fn); + for(ir::basic_block *block: rpo){ + // iterate through instruction + for(ir::instruction *i: block->get_inst_list()) + for(ir::value* op: i->ops()) + if(auto *range = dynamic_cast(op)) + ranges.push_back(range); + } + + builder.set_insert_point(rpo.front()->get_first_non_phi()); + for(ir::constant_range* old_range: ranges){ + ir::value* dyn_range = builder.insert(ir::nv_dynamic_range_idx_inst::create(old_range->get_type())); + ir::value* static_range = ir::nv_static_range_idx::get(old_range); + ir::value* new_range = builder.create_add(dyn_range, static_range); + 
old_range->replace_all_uses_with(new_range); + params_->copy(dyn_range, old_range); + params_->copy(static_range, old_range); + params_->copy(new_range, old_range); + } + } + + // reassociate for(ir::function *fn: mod.get_function_list()){ std::vector rpo = ir::cfg::reverse_post_order(fn); bool done = false; diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 1f573c3f6..caf666bfd 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -690,12 +690,22 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, distributed_tile *T = new distributed_tile(ty, shapes, axes, builder, vectorize); tmap_.insert({v, T}); // constant range - if(dynamic_cast(v) && !dynamic_cast(v)){ + if(dynamic_cast(v)){ T->for_each([&](indices_t idx){ assert(idx.size() == 1); T->set_value(idx, idx[0]); }); } + if(dynamic_cast(v)){ + T->for_each([&](indices_t idx){ + assert(idx.size() == 1); + BinaryOperator *bin_add = dyn_cast(idx[0]); + assert(bin_add); + Value *res = bin_add->getOperand(1); + assert(isa(res)); + T->set_value(idx, res); + }); + } } } @@ -835,6 +845,16 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & result->set_value(idx, builder.CreateAdd(bin, offset)); }); } + // nv_dynamic_range_idx_inst + if(dynamic_cast(ins)){ + result->for_each([&](indices_t idx){ + assert(idx.size() == 1); + BinaryOperator *bin_add = dyn_cast(idx[0]); + assert(bin_add); + Value *res = bin_add->getOperand(0); + result->set_value(idx, res); + }); + } // // mask // else if(dynamic_cast(ins)) { // distributed_tile* pred = (distributed_tile*)tmap_.at(ins->get_operand(0)); diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index b649a4d7c..288eb4204 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -133,7 +133,7 @@ tune::fragment_t tune::get_fragmentation_type(node_t x, graph_t &graph){ } void tune::connected_components(node_t x, const std::vector mps, const std::vector prefixes, std::set &nodes, graph_t &graph, unsigned group_id) { - groups_[x.first][x.second] = group_id; + groups_[x.first].insert({x.second, group_id}); if(nodes.find(x) != nodes.end()){ nodes.erase(x); std::string suffix = ".d" + std::to_string(x.second); @@ -145,11 +145,11 @@ void tune::connected_components(node_t x, const std::vector if(auto mp = dynamic_cast(shape)) params_[x.first].insert({"shape" + suffix, mp}); } - if(auto range = dynamic_cast(x.first)){ - unsigned ax = range->get_axis(); - global_range_sizes_[ax] = params_[x.first].at("shape.d0"); - num_global_ranges_ = std::max(num_global_ranges_, ax + 1); - } +// if(auto range = dynamic_cast(x.first)){ +// unsigned ax = range->get_axis(); +// global_range_sizes_[ax] = params_[x.first].at("shape.d0"); +// num_global_ranges_ = std::max(num_global_ranges_, ax + 1); +// } if(static_params_.find(x) != static_params_.end()){ for(ir::metaparameter *mp: mps) mp->set_value(static_params_.at(x)); @@ -190,6 +190,14 @@ unsigned tune::get_param_group(ir::value *value, unsigned ax) { return result; } +//TODO: This shouldn't exist! 
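
Concretely, for a range r = [0 ... 128] feeding a pointer computation, the loop above rewrites the IR along these lines (a hedged sketch; the printed spelling is illustrative):

    // before:  r   = constant_range [0 ... 128]
    //          ptr = getelementptr A, r
    //
    // after:   dyn = nv_dynamic_range_idx            ; opaque lane index
    //          sta = nv_static_range_idx [0 ... 128] ; compile-time constant
    //          r2  = add dyn, sta                    ; replaces all uses of r
    //          ptr = getelementptr A, r2

Because sta is a constant, the reassociation that follows can float it to the top of the address arithmetic and fold it into a static pointer offset.
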
+void tune::copy(ir::value *dst, ir::value *src) { + params_[dst] = params_[src]; + groups_[dst] = groups_[src]; + fragments_[{dst, 0}] = fragments_[{src, 0}]; +} + + void tune::run(ir::module &mod) { ir::context &ctx = mod.get_context(); // Create metaparameters diff --git a/lib/ir/constant.cpp b/lib/ir/constant.cpp index a2341f52f..ddc10028d 100644 --- a/lib/ir/constant.cpp +++ b/lib/ir/constant.cpp @@ -59,6 +59,8 @@ constant_int *constant_int::get(type *ty, uint64_t value) { // constant_range // FIXME use something like APInt +//"[" + std::to_string(first->get_value()) + " ... " + std::to_string(ty->get_tile_shapes()[0]->get_value()) + "]" + constant_range::constant_range(type *ty, constant_int *first, constant_int *last) : constant(ty, 0), first_(first), last_(last){ } diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 3968ce0e6..b7743c7d5 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -688,22 +688,48 @@ instruction* atomic_add_inst::create(value *ptr, value *val, const std::string & //===----------------------------------------------------------------------===// // intrinsic instructions //===----------------------------------------------------------------------===// +// copy to shared copy_to_shared_inst* copy_to_shared_inst::create(value *arg, const std::string &name, instruction *next) { return new copy_to_shared_inst(arg->get_type(), arg, name, next); } +// vectorize vectorize_inst* vectorize_inst::create(value *arg, const std::string &name, instruction *next) { return new vectorize_inst(arg->get_type(), arg, name, next); } +// barrier barrier_inst::barrier_inst(context &ctx, const std::string &name, instruction *next) - : instruction(type::get_void_ty(ctx), 0, 0, name, next){ } + : instruction(type::get_void_ty(ctx), 0, 0, name, next) { } barrier_inst* barrier_inst::create(context &ctx, const std::string &name, instruction *next) { return new barrier_inst(ctx, name, next); } +// nv_dynamic_range_idx +nv_dynamic_range_idx_inst::nv_dynamic_range_idx_inst(type *ty, const std::string &name, instruction *next) + : instruction(ty, 0, 1, name, next) { } + +nv_dynamic_range_idx_inst* nv_dynamic_range_idx_inst::create(type *ty, const std::string &name, instruction *next) { + return new nv_dynamic_range_idx_inst(ty, name, next); +} + +// nv_static_range_idx +nv_static_range_idx::nv_static_range_idx(constant_range *range) + : constant(range->get_type(), 0), range_(range) { } + +constant_range* nv_static_range_idx::get_range() const +{ return range_; } + +nv_static_range_idx* nv_static_range_idx::get(constant_range* range) { + static std::map cache; + if(cache.find(range) == cache.end()) + cache.insert({range, new nv_static_range_idx(range)}); + return cache.at(range); +} + + } } From b7fadb9986237b6c3b800e628420009e7874a151 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 23 Jul 2019 21:22:47 -0700 Subject: [PATCH 264/494] more stuff --- include/triton/codegen/reassociate.h | 9 ++- include/triton/ir/instructions.h | 1 + include/triton/runtime/jit.h | 1 + lib/codegen/reassociate.cpp | 111 +++++++++++++++++++++------ 4 files changed, 97 insertions(+), 25 deletions(-) diff --git a/include/triton/codegen/reassociate.h b/include/triton/codegen/reassociate.h index 9be8ed6bd..f6d30ea72 100644 --- a/include/triton/codegen/reassociate.h +++ b/include/triton/codegen/reassociate.h @@ -14,6 +14,7 @@ class module; class value; class builder; class instruction; +class getelementptr_inst; } namespace codegen{ @@ -21,9 +22,15 @@ namespace codegen{ class 
tune; class reassociate { + struct cst_info { + ir::value* sta; + ir::value* dyn; + }; + private: ir::instruction* is_bin_add(ir::value *x); - ir::value *reorder_op(ir::value *value, ir::builder &builder, std::vector& to_delete, ir::value *&noncst, ir::value *&cst); + ir::value *reassociate_idx(ir::value *value, ir::builder &builder, std::vector& to_delete, ir::value *&noncst, ir::value *&cst); + ir::value *reassociate_ptr(ir::getelementptr_inst* pz, ir::builder &builder, std::map &offsets); public: reassociate(tune *params); diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index 1bce5bd47..29b2678a3 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -384,6 +384,7 @@ public: type *get_source_elt_ty() { return source_elt_ty; } op_iterator idx_begin() { return op_begin() + 1; } op_iterator idx_end() { return op_end(); } + value *get_pointer_operand() { return *op_begin(); } // factory methods static getelementptr_inst* create(value *ptr, const std::vector &idx, diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index 8a1157940..a1dcfc578 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -79,6 +79,7 @@ public: alignment_info.run(module); reassociate.run(module); ir::print(module, std::cout); + //exit(EXIT_FAILURE); if(target_->is_gpu()){ shmem_info.run(module); shmem_liveness.run(module); diff --git a/lib/codegen/reassociate.cpp b/lib/codegen/reassociate.cpp index 2ca8828d7..a06141d0e 100644 --- a/lib/codegen/reassociate.cpp +++ b/lib/codegen/reassociate.cpp @@ -49,11 +49,34 @@ inline bool is_cst(ir::value *x) { } -inline ir::value *reassociate::reorder_op(ir::value *old_value, - ir::builder &builder, - std::vector& to_delete, - ir::value *&noncst, - ir::value *&cst){ +// reassociate pointer +// pz = py + a = (px + (cst + b)) + a -> (px + b) + (cst + a) +ir::value *reassociate::reassociate_ptr(ir::getelementptr_inst* pz, + ir::builder &builder, + std::map &info) { + ir::value *a = *pz->idx_begin(); + ir::value *vpy = pz->get_pointer_operand(); + if(info.find(vpy) == info.end()) + return nullptr; + ir::getelementptr_inst *py = (ir::getelementptr_inst*)vpy; + ir::value *px = py->get_pointer_operand(); + ir::value *cst = info.at(py).sta; + ir::value *b = info.at(py).dyn; + ir::value *new_py = builder.create_gep(px, {b}); + ir::value *new_a = builder.create_add(cst, a); + ir::value *new_pz = builder.create_gep(new_py, {new_a}); + params_->copy(new_pz, pz); + params_->copy(new_py, vpy); + params_->copy(new_a, a); + pz->replace_all_uses_with(new_pz); + return pz; +} + +ir::value *reassociate::reassociate_idx(ir::value *old_value, + ir::builder &builder, + std::vector& to_delete, + ir::value *&noncst, + ir::value *&cst){ // value doesn't change by default ir::value* new_value = old_value; cst = nullptr; @@ -63,7 +86,7 @@ inline ir::value *reassociate::reorder_op(ir::value *old_value, if(ir::instruction* op = dynamic_cast(old_value)){ auto shapes = op->get_type()->get_tile_shapes(); ir::value *old_arg = op->get_operand(0); - ir::value *new_arg = reorder_op(old_arg, builder, to_delete, noncst, cst); + ir::value *new_arg = reassociate_idx(old_arg, builder, to_delete, noncst, cst); // retile(x + y) = retile(x) + retile(y) if(ir::instruction* bin_add = is_bin_add(new_arg)) if(cst){ @@ -102,8 +125,8 @@ inline ir::value *reassociate::reorder_op(ir::value *old_value, if(ir::instruction* op = is_bin_add(old_value)){ builder.set_insert_point(op); std::string name = op->get_name(); - 
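
The retiling case handled first in reassociate_idx rests on one algebraic fact: reshape, broadcast and splat all distribute over addition, so retile(x + cst) can be rebuilt as retile(x) + retile(cst) with the constant part still visible afterwards. A hedged example with illustrative shapes:

    // before:  off1 = rx + 3           ; shape [128]
    //          off2 = broadcast(off1)  ; shape [128, 64]; constant hidden
    //
    // after:   off2 = broadcast(rx) + broadcast(3)
    //                                  ; constant survives the retile
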
ir::value *lhs = reorder_op(op->get_operand (0), builder, to_delete, noncst, cst); - ir::value *rhs = reorder_op(op->get_operand(1), builder, to_delete, noncst, cst); + ir::value *lhs = reassociate_idx(op->get_operand (0), builder, to_delete, noncst, cst); + ir::value *rhs = reassociate_idx(op->get_operand(1), builder, to_delete, noncst, cst); builder.set_insert_point(op); // (x + y) + z if(ir::instruction* bin_lhs = is_bin_add(lhs)){ @@ -167,6 +190,8 @@ reassociate::reassociate(tune* params) : params_(params) { } + +/* run */ void reassociate::run(ir::module &mod) { ir::builder &builder = mod.get_builder(); std::vector to_delete; @@ -196,25 +221,63 @@ void reassociate::run(ir::module &mod) { } // reassociate + std::map infos; + std::map> re_ordered; + for(ir::function *fn: mod.get_function_list()){ std::vector rpo = ir::cfg::reverse_post_order(fn); - bool done = false; - do{ - // iterate through blocks - for(ir::basic_block *block: rpo){ - // iterate through instruction - for(ir::instruction *i: block->get_inst_list()){ - if(auto *gep = dynamic_cast(i)){ - std::vector idxs(gep->idx_begin(), gep->idx_end()); - ir::value *cst = nullptr; - ir::value *noncst = idxs[0]; - reorder_op(noncst, builder, to_delete, noncst, cst); -// std::cout << gep->get_name() << " " << noncst << " " << cst << std::endl; - } - } - done = true; + // iterate through blocks + for(ir::basic_block *block: rpo){ + // iterate through instruction + for(ir::instruction *i: block->get_inst_list()){ + // getelementptr instruction + if(ir::getelementptr_inst *pz = dynamic_cast(i)){ + + // pz = py + offset + // tries to achieve pz = py + (cst + a) + // by modifying py and/or offset + ir::value* py = pz->get_pointer_operand(); + ir::value* offset = *pz->idx_begin(); + + // reassociate index + ir::value *sta = nullptr; + ir::value *dyn = offset; + reassociate_idx(pz, builder, to_delete, dyn, sta); + if(sta){ + infos[pz] = {sta, dyn}; + re_ordered[block].insert(pz); } - }while(!done); + +// // reassociate pointer +// reassociate_ptr(pz, builder, offsets); + +// // reassociate phi-node +// if(ir::phi_node* phi = dynamic_cast(py)){ +// // only optimize the case where py = phi pa, pz +// std::vector ops = phi->ops(); +// if(!(ops.size() == 2 && (ops[0] == pz || ops[1] == pz))) +// continue; +// size_t idx_z = (ops[0] == pz) ? 
0 : 1; +// size_t idx_a = (idx_z + 1) % 2; +// ir::value *vpa = phi->get_incoming_value(idx_a); +// ir::value *block_a = phi->get_incoming_block(idx_a); +// ir::value *block_z = phi->get_incoming_value(idx_z); +// auto it = infos.find(vpa); +// if(it == infos.end()) +// continue; +// ir::value *b = it->a; +// // pa = px + (cst + b) +// ir::getelementptr_inst *pa = (ir::getelementptr_inst*)vpa; +// ir::getelementptr_inst *px = pa->get_pointer_operand(); +// // new_pa = px + b +// ir::getelementptr_inst *new_pa = builder.create_gep(px, {b}); +// // new_pz = py + (offset + a) +// ir::getelementptr_inst *new_offset = builder.create_add(it->cst, dyn); +// ir::getelementptr_inst *new_pz = builder.create_gep(pz->get_pointer_operand(), {new_offset}); +// } + } + } + } } // erase dead code for(ir::instruction* i: to_delete) From 6ce82dfcdb027de79480a4c73b1cd5292d810a24 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 23 Jul 2019 22:19:57 -0700 Subject: [PATCH 265/494] FINALLY --- include/triton/codegen/reassociate.h | 10 +- include/triton/runtime/jit.h | 4 +- lib/codegen/reassociate.cpp | 134 ++++++++++++--------------- 3 files changed, 68 insertions(+), 80 deletions(-) diff --git a/include/triton/codegen/reassociate.h b/include/triton/codegen/reassociate.h index f6d30ea72..3c9cc813b 100644 --- a/include/triton/codegen/reassociate.h +++ b/include/triton/codegen/reassociate.h @@ -20,24 +20,26 @@ class getelementptr_inst; namespace codegen{ class tune; +class alignment_info; class reassociate { struct cst_info { - ir::value* sta; - ir::value* dyn; + ir::getelementptr_inst* dyn_ptr; + ir::getelementptr_inst* sta_ptr; }; private: ir::instruction* is_bin_add(ir::value *x); - ir::value *reassociate_idx(ir::value *value, ir::builder &builder, std::vector& to_delete, ir::value *&noncst, ir::value *&cst); + ir::value *reassociate_idx(ir::value *value, ir::builder &builder, ir::value *&noncst, ir::value *&cst); ir::value *reassociate_ptr(ir::getelementptr_inst* pz, ir::builder &builder, std::map &offsets); public: - reassociate(tune *params); + reassociate(tune *params, alignment_info *align); void run(ir::module& module); private: tune* params_; + alignment_info* align_; }; } diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index a1dcfc578..aa7a930bb 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -66,7 +66,7 @@ public: optimize_cse(), optimize_trans(), alignment_info(), - reassociate(&tune), + reassociate(&tune, &alignment_info), target_(target) { } void target_independent(ir::module &module) { @@ -79,7 +79,7 @@ public: alignment_info.run(module); reassociate.run(module); ir::print(module, std::cout); - //exit(EXIT_FAILURE); +// exit(EXIT_FAILURE); if(target_->is_gpu()){ shmem_info.run(module); shmem_liveness.run(module); diff --git a/lib/codegen/reassociate.cpp b/lib/codegen/reassociate.cpp index a06141d0e..fa7c256fd 100644 --- a/lib/codegen/reassociate.cpp +++ b/lib/codegen/reassociate.cpp @@ -1,5 +1,6 @@ #include #include "triton/codegen/reassociate.h" +#include "triton/codegen/alignment_info.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" @@ -48,33 +49,8 @@ inline bool is_cst(ir::value *x) { return false; } - -// reassociate pointer -// pz = py + a = (px + (cst + b)) + a -> (px + b) + (cst + a) -ir::value *reassociate::reassociate_ptr(ir::getelementptr_inst* pz, - ir::builder &builder, - std::map &info) { - ir::value *a = *pz->idx_begin(); - ir::value *vpy = pz->get_pointer_operand(); - 
if(info.find(vpy) == info.end()) - return nullptr; - ir::getelementptr_inst *py = (ir::getelementptr_inst*)vpy; - ir::value *px = py->get_pointer_operand(); - ir::value *cst = info.at(py).sta; - ir::value *b = info.at(py).dyn; - ir::value *new_py = builder.create_gep(px, {b}); - ir::value *new_a = builder.create_add(cst, a); - ir::value *new_pz = builder.create_gep(new_py, {new_a}); - params_->copy(new_pz, pz); - params_->copy(new_py, vpy); - params_->copy(new_a, a); - pz->replace_all_uses_with(new_pz); - return pz; -} - ir::value *reassociate::reassociate_idx(ir::value *old_value, ir::builder &builder, - std::vector& to_delete, ir::value *&noncst, ir::value *&cst){ // value doesn't change by default @@ -86,7 +62,7 @@ ir::value *reassociate::reassociate_idx(ir::value *old_value, if(ir::instruction* op = dynamic_cast(old_value)){ auto shapes = op->get_type()->get_tile_shapes(); ir::value *old_arg = op->get_operand(0); - ir::value *new_arg = reassociate_idx(old_arg, builder, to_delete, noncst, cst); + ir::value *new_arg = reassociate_idx(old_arg, builder, noncst, cst); // retile(x + y) = retile(x) + retile(y) if(ir::instruction* bin_add = is_bin_add(new_arg)) if(cst){ @@ -116,7 +92,6 @@ ir::value *reassociate::reassociate_idx(ir::value *old_value, params_->copy(new_value, old_value); params_->copy(new_lhs, old_value); params_->copy(new_rhs, old_value); - to_delete.push_back(op); } } } @@ -125,8 +100,8 @@ ir::value *reassociate::reassociate_idx(ir::value *old_value, if(ir::instruction* op = is_bin_add(old_value)){ builder.set_insert_point(op); std::string name = op->get_name(); - ir::value *lhs = reassociate_idx(op->get_operand (0), builder, to_delete, noncst, cst); - ir::value *rhs = reassociate_idx(op->get_operand(1), builder, to_delete, noncst, cst); + ir::value *lhs = reassociate_idx(op->get_operand (0), builder, noncst, cst); + ir::value *rhs = reassociate_idx(op->get_operand(1), builder, noncst, cst); builder.set_insert_point(op); // (x + y) + z if(ir::instruction* bin_lhs = is_bin_add(lhs)){ @@ -138,9 +113,6 @@ ir::value *reassociate::reassociate_idx(ir::value *old_value, // (x + cst) + y -> cst + (x + y) if(is_cst(rlhs)) new_value = builder.create_add(rlhs, builder.create_add(llhs, rhs), name); - if(new_value != old_value){ - to_delete.push_back(bin_lhs); - } } // x + (y + z) if(ir::instruction* bin_rhs = is_bin_add(rhs)){ @@ -152,8 +124,6 @@ ir::value *reassociate::reassociate_idx(ir::value *old_value, // x + (y + cst) -> cst + (x + y) if(is_cst(rrhs)) new_value = builder.create_add(rrhs, builder.create_add(lrhs, lhs), name, cst); - if(new_value != op) - to_delete.push_back(bin_rhs); } if(new_value != old_value){ params_->copy(new_value, old_value); @@ -179,22 +149,19 @@ ir::value *reassociate::reassociate_idx(ir::value *old_value, // clean-up if some re-ordering happened if(old_value != new_value){ old_value->replace_all_uses_with(new_value); - if(auto *x = dynamic_cast(old_value)) - to_delete.push_back(x); } return new_value; } -reassociate::reassociate(tune* params) - : params_(params) +reassociate::reassociate(tune* params, alignment_info* align) + : params_(params), align_(align) { } /* run */ void reassociate::run(ir::module &mod) { ir::builder &builder = mod.get_builder(); - std::vector to_delete; // constant_range -> nv_dynamic_range_idx + nv_static_range_idx for(ir::function *fn: mod.get_function_list()){ @@ -232,56 +199,75 @@ void reassociate::run(ir::module &mod) { for(ir::instruction *i: block->get_inst_list()){ // getelementptr instruction if(ir::getelementptr_inst *pz = 
dynamic_cast(i)){ - - // pz = py + offset - // tries to achieve pz = py + (cst + a) - // by modifying py and/or offset + // unpack GEP instruction ir::value* py = pz->get_pointer_operand(); ir::value* offset = *pz->idx_begin(); - // reassociate index ir::value *sta = nullptr; ir::value *dyn = offset; - reassociate_idx(pz, builder, to_delete, dyn, sta); + reassociate_idx(offset, builder, dyn, sta); if(sta){ - infos[pz] = {sta, dyn}; - re_ordered[block].insert(pz); + builder.set_insert_point(pz); + ir::value *dyn_ptr = builder.create_gep(py, {dyn}); + ir::value *sta_ptr = builder.create_gep(dyn_ptr, {sta}); + params_->copy(dyn_ptr, pz); + params_->copy(sta_ptr, pz); + align_->copy(sta_ptr, pz); + pz->replace_all_uses_with(sta_ptr); + infos[sta_ptr].dyn_ptr = (ir::getelementptr_inst*)dyn_ptr; + infos[sta_ptr].sta_ptr = (ir::getelementptr_inst*)sta_ptr; + } + // reassociate phi-node pointer + if(ir::phi_node* phi = dynamic_cast(py)){ + // only optimize the case where py = phi pa, pz for now + std::vector ops = phi->ops(); + if(ops.size() != 2) + continue; + if(ops[0] != pz && ops[1] != pz) + continue; + // grab incoming + size_t idx_z = (ops[0] == pz) ? 0 : 1; + size_t idx_a = (ops[0] == pz) ? 1 : 0; + // check if pa is known to have constant offset + ir::value *vpa = phi->get_incoming_value(idx_a); + auto it = infos.find(vpa); + if(it == infos.end()) + continue; + ir::getelementptr_inst *pa = (ir::getelementptr_inst*)vpa; + // unpack dynamically/statically offset pointer + ir::getelementptr_inst *dyn_ptr = it->second.dyn_ptr; + ir::getelementptr_inst *sta_ptr = it->second.sta_ptr; + // we take static offset out of the phi function + builder.set_insert_point(phi); + ir::phi_node *new_phi = builder.create_phi(phi->get_type(), 2); + // new pz for phi has the same offsets + builder.set_insert_point(pz); + std::vector idxs(pz->idx_begin(), pz->idx_end()); + ir::value *new_phi_pz = builder.create_gep(new_phi, idxs); + // fold the static offset into the new pz value + ir::value *new_pz = builder.create_gep(new_phi_pz, {*sta_ptr->idx_begin()}); + // populate incoming values + new_phi->add_incoming(dyn_ptr, phi->get_incoming_block(idx_a)); + new_phi->add_incoming(new_phi_pz, phi->get_incoming_block(idx_z)); + // replace phi uses + phi->replace_all_uses_with(new_phi); + // replace pz uses + pz->replace_all_uses_with(new_pz); + // copy params + params_->copy(new_phi_pz, pz); + params_->copy(new_phi, phi); + params_->copy(new_pz, pz); + align_->copy(new_pz, pz); } // // reassociate pointer // reassociate_ptr(pz, builder, offsets); -// // reassociate phi-node -// if(ir::phi_node* phi = dynamic_cast(py)){ -// // only optimize the case where py = phi pa, pz -// std::vector ops = phi->ops(); -// if(!(ops.size() == 2 && (ops[0] == pz || ops[1] == pz))) -// continue; -// size_t idx_z = (ops[0] == pz) ? 
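
The phi case above is the loop-induction pattern: a pointer that is incremented through a phi whose other incoming value was already split into a dynamic/static pair. Pulling the static offset out of the recurrence means the in-loop pointer update never re-adds the constant. Roughly (a sketch; names follow the patch):

    // before:  py  = phi [pa, preheader], [pz, latch]
    //          pz  = gep py, step             ; loop-carried update
    //          pa  = gep (gep px, dyn), cst   ; dyn_ptr / sta_ptr pair
    //
    // after:   phi' = phi [dyn_ptr, preheader], [pz', latch]
    //          pz'  = gep phi', step          ; carries no constant
    //          pz   = gep pz', cst            ; constant applied once,
    //                                         ; outside the recurrence
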
0 : 1; -// size_t idx_a = (idx_z + 1) % 2; -// ir::value *vpa = phi->get_incoming_value(idx_a); -// ir::value *block_a = phi->get_incoming_block(idx_a); -// ir::value *block_z = phi->get_incoming_value(idx_z); -// auto it = infos.find(vpa); -// if(it == infos.end()) -// continue; -// ir::value *b = it->a; -// // pa = px + (cst + b) -// ir::getelementptr_inst *pa = (ir::getelementptr_inst*)vpa; -// ir::getelementptr_inst *px = pa->get_pointer_operand(); -// // new_pa = px + b -// ir::getelementptr_inst *new_pa = builder.create_gep(px, {b}); -// // new_pz = py + (offset + a) -// ir::getelementptr_inst *new_offset = builder.create_add(it->cst, dyn); -// ir::getelementptr_inst *new_pz = builder.create_gep(pz->get_pointer_operand(), {new_offset}); -// } + } } } } - // erase dead code - for(ir::instruction* i: to_delete) - i->erase_from_parent(); } } From 2a377bc8b143f37dc30f62865305adf3aa3620fb Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 25 Jul 2019 15:06:15 -0700 Subject: [PATCH 266/494] [ir] deleted mask/merge instructions; will be replaced by masked_load/store and select --- examples/cpp/dot.cpp | 6 +- examples/cpp/shift.cpp | 2 +- .../{optimize_cse.h => optimize_dce.h} | 4 +- include/triton/codegen/optimize_trans.h | 2 +- include/triton/codegen/selection.h | 11 - include/triton/dnn/heuristics.h | 2 +- include/triton/ir/builder.h | 7 +- include/triton/ir/instructions.h | 131 ++++------ include/triton/lang/expression.h | 10 - include/triton/lang/parser.y | 5 +- include/triton/lang/scanner.l | 1 - include/triton/runtime/jit.h | 11 +- lib/codegen/alignment_info.cpp | 11 - lib/codegen/optimize_cse.cpp | 14 - lib/codegen/optimize_dce.cpp | 60 +++++ lib/codegen/optimize_trans.cpp | 11 +- lib/codegen/reassociate.cpp | 7 - lib/codegen/selection.cpp | 243 +++++++++++------- lib/codegen/tune.cpp | 4 +- lib/dnn/gemm.cpp | 13 +- lib/driver/module.cpp | 2 +- lib/ir/builder.cpp | 30 +-- lib/ir/instructions.cpp | 104 +++----- lib/ir/print.cpp | 6 - lib/lang/declaration.cpp | 2 +- lib/lang/expression.cpp | 55 ++-- lib/lang/statement.cpp | 40 ++- 27 files changed, 387 insertions(+), 407 deletions(-) rename include/triton/codegen/{optimize_cse.h => optimize_dce.h} (87%) delete mode 100644 lib/codegen/optimize_cse.cpp create mode 100644 lib/codegen/optimize_dce.cpp diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 6ec396a1a..efebc102e 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -48,14 +48,14 @@ perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int stream->synchronize(); triton::dnn::dot dot(M, N, K, AT, BT, ty, ty, 8, 8); // benchmark triton - double triton_ns = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::NO_TUNING);}, stream); + double triton_ns = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::FULL_TUNING);}, stream); // benchmark cublas NumericT alpha = 1; NumericT beta = 0; int32_t lda = AT ? K : M; int32_t ldb = BT ? 
N : K; int32_t ldc = M; - cublasGemmAlgo_t fastest; +// cublasGemmAlgo_t fastest; // cublasGemm(HALF_TYPE, stream, AT, BT, M, N, K, // &alpha, da, lda, // db, ldb, &beta, @@ -109,6 +109,6 @@ int main() { // does the work for(config_t c: configs){ perf_t perf = c.perf(stream); - std::cout << c.repr() << ", " << perf.triton << ", " << perf.cublas << std::endl; + std::cout << "// " << c.repr() << ", " << perf.triton << ", " << perf.cublas << std::endl; } } diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index fc10d4316..91ed2daaa 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -144,6 +144,6 @@ int main() { for(config_t c: configs){ std::string repr = c.repr(); perf_t perf = c.perf(stream); - std::cout << repr << ", " << perf.triton << ", " << perf.cublas << std::endl; + std::cout << "// " << repr << ", " << perf.triton << ", " << perf.cublas << std::endl; } } diff --git a/include/triton/codegen/optimize_cse.h b/include/triton/codegen/optimize_dce.h similarity index 87% rename from include/triton/codegen/optimize_cse.h rename to include/triton/codegen/optimize_dce.h index d718f318e..e40bafef5 100644 --- a/include/triton/codegen/optimize_cse.h +++ b/include/triton/codegen/optimize_dce.h @@ -14,9 +14,9 @@ namespace ir { namespace codegen{ class tune; -class optimize_cse { +class optimize_dce { public: - optimize_cse() {} + optimize_dce() {} void run(ir::module &mod); }; diff --git a/include/triton/codegen/optimize_trans.h b/include/triton/codegen/optimize_trans.h index beaace2a5..c6ec73b4d 100644 --- a/include/triton/codegen/optimize_trans.h +++ b/include/triton/codegen/optimize_trans.h @@ -19,7 +19,7 @@ namespace codegen{ class optimize_trans { private: - ir::value *replace_phi(ir::value* value, std::vector& to_delete, ir::builder &builder); + ir::value *replace_phi(ir::value* value, ir::builder &builder); public: optimize_trans() {} diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index 4355bfce6..e1d2dbf0b 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -104,19 +104,10 @@ private: }; -// Fragmented tile -class fragmented_tile: public tile{ -public: - -private: - -}; - // Selection pass class selection{ typedef std::map vmap_t; typedef std::map tmap_t; - typedef std::map, llvm::BasicBlock*> pmap_t; private: // utils @@ -152,8 +143,6 @@ public: private: vmap_t vmap_; tmap_t tmap_; - pmap_t pmap_; - pmap_t last_block_; shmem_allocation *alloc_; tune *params_; target *tgt_; diff --git a/include/triton/dnn/heuristics.h b/include/triton/dnn/heuristics.h index bd9bc50aa..e2efe6df2 100644 --- a/include/triton/dnn/heuristics.h +++ b/include/triton/dnn/heuristics.h @@ -101,7 +101,7 @@ inline std::vector dot_search_space(bool AT, bool BT) { inline params_t dot_heuristics(bool AT, bool BT, size_t M, size_t N, size_t K) { size_t TM = 128; size_t TN = 128; -// return {4, 8, 256, 8, 8, 64, 2, 2, 2, 2, 32, 32, 16, 1}; +// return {4, 4, 128, 8, 4, 128, 2, 2, 2, 2, 32, 32, 16, 1}; return params.at(trans_key_t{AT, BT}).at(size_key_t{TM, TN}); } diff --git a/include/triton/ir/builder.h b/include/triton/ir/builder.h index 598c82454..9cee12c68 100644 --- a/include/triton/ir/builder.h +++ b/include/triton/ir/builder.h @@ -50,15 +50,14 @@ public: block_->get_inst_list().insert(insert_point_, inst); inst->set_parent(block_); inst->set_name(name); +// for(ir::value* op: inst->ops()) +// op->add_use(inst); return inst; } // terminator instructions value* create_br(basic_block *dest); value* 
create_cond_br(value *cond, basic_block* if_dest, basic_block* else_dest); value* create_ret_void(); - // Tile-level control flow -// value *create_mask(value *pred, const std::string &name = ""); -// value *create_merge(value *mask_true, value *value_true, value *mask_false, value *value_false, const std::string &name = ""); // Cast instructions value *create_cast(cast_inst::op_t op, value *v, type *dst_ty, const std::string &name = ""); value* create_si_to_fp(value *src, type *dst_ty, const std::string &name = ""); @@ -120,6 +119,8 @@ public: // Input/Output value *create_load(value *arg, const std::string &name = ""); value *create_store(value *ptr, value *val, const std::string &name = ""); + value *create_masked_load(value *arg, value *mask, value *false_value, const std::string &name = ""); + value *create_masked_store(value *ptr, value *val, value *mask, const std::string &name = ""); // Tile instruction value *create_splat(value *arg, const type::tile_shapes_t &shapes, const std::string &name = ""); value *create_reshape(value *arg, const type::tile_shapes_t &shapes, const std::string &name = ""); diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index 29b2678a3..d76ebf719 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -21,11 +21,6 @@ class context; class result_reference; class instruction: public user{ public: -// struct mask_info_t { -// value *pred; -// value *else_value; -// }; - virtual std::string repr_impl() const = 0; protected: @@ -38,11 +33,6 @@ public: const basic_block *get_parent() const { return parent_; } basic_block *get_parent() { return parent_; } void erase_from_parent(); -// // mask -// void set_mask_pred(value *pred) { resize_hidden(1); set_operand(get_num_operands(), pred); } -// value* get_mask_pred() const { if(get_num_hidden() == 0) return nullptr; return get_operand(get_num_operands()); } -// void set_mask_else(value *x) { resize_hidden(2); set_operand(get_num_operands() + 1, x); } -// value* get_mask_else() const { if(get_num_hidden() < 2) return nullptr; return get_operand(get_num_operands() + 1); } // helpers bool has_tile_result_or_op(); // repr @@ -56,8 +46,6 @@ public: unsigned get_metadata(ir::metadata::kind_t kind) { return metadatas_[kind];} private: basic_block *parent_; -// value *pred_; -// value *mask_pred_; std::vector results_; std::map metadatas_; }; @@ -336,35 +324,6 @@ public: const std::string &name = "", instruction *next = nullptr); }; -//// mask -//class mask_inst: public instruction { -//private: -// std::string repr_impl() const { return "mask"; } -// mask_inst(ir::value *pred, const std::string &name, instruction *next); - -//public: -// static mask_inst* create(ir::value *pred, const std::string &name = "", instruction *next = nullptr); -//}; - -//// merge -//class psi_inst: public instruction { -//private: -// std::string repr_impl() const { return "merge"; } -// psi_inst(ir::value *mask_true, ir::value *value_true, -// ir::value *mask_false, ir::value *value_false, -// const std::string &name, instruction *next); - -//public: -// static psi_inst* create(ir::value *mask_true, ir::value *value_true, -// ir::value *mask_false, ir::value *value_false, -// const std::string &name = "", instruction *next = nullptr); -// ir::value *get_mask_true() { return get_operand(0); } -// ir::value *get_value_true() { return get_operand(1); } -// ir::value *get_mask_false() { return get_operand(2); } -// ir::value *get_value_false() { return get_operand(3); } - -//}; - 
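
Taken together, masked_load_inst and masked_store_inst give the IR a single-instruction encoding of guarded memory access, in place of the mask/psi pseudo-control-flow deleted above. A small usage sketch against the new builder entry points (ptr, mask, out and zero are stand-ins):

    // v = mask ? *ptr : 0
    ir::value *v = builder.create_masked_load(ptr, mask, zero);
    // @mask *out = v
    builder.create_masked_store(out, v, mask);
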
//===----------------------------------------------------------------------===// // getelementptr_inst classes //===----------------------------------------------------------------------===// @@ -399,43 +358,78 @@ private: // load_inst/store_inst classes //===----------------------------------------------------------------------===// -class load_inst: public unary_inst{ -private: - std::string repr_impl() const { return "load"; } - load_inst(value *ptr, const std::string &name, instruction *next); +class io_inst: public instruction { +protected: + io_inst(type *ty, unsigned num_ops, unsigned num_results = 1, const std::string &name = "", instruction *next = nullptr); +public: +// value *get_mask() const; +// value *get_false_value() const; +}; + +class load_inst: public io_inst{ +protected: + load_inst(value *ptr, unsigned num_extra_ops, const std::string &name, instruction *next); private: + std::string repr_impl() const { return "load"; } static type *get_pointee_type(type *ty); public: // accessors value *get_pointer_operand() { return get_operand(0); } - value *get_mask() const; - value *set_mask(value *mask); // factory method - static load_inst* create(value *ptr, const std::string &name = "", + static load_inst* create(value *ptr, + const std::string &name = "", instruction *next = nullptr); - -private: - value *mask_; }; -class store_inst: public instruction{ +class masked_load_inst: public load_inst{ private: - std::string repr_impl() const { return "store"; } - store_inst(value *ptr, value *v, const std::string &name, instruction *next); + std::string repr_impl() const { return "masked_load"; } + masked_load_inst(value *ptr, value *mask, value *false_value, + const std::string &name, instruction *next); public: - value *get_pointer_operand() { return get_operand(0); } - value *get_value_operand() { return get_operand(1); } - value *get_mask() const; - value *set_mask(value *mask); + // accessors + value *get_mask_operand() { return get_operand(1); } + value *get_false_value_operand() { return get_operand(2); } // factory method - static store_inst* create(value* ptr, value *v, const std::string &name = "", - instruction *next = nullptr); + static masked_load_inst* create(value *ptr, value *mask, value *false_value, + const std::string &name = "", + instruction *next = nullptr); +}; + +class store_inst: public io_inst{ +protected: + store_inst(value *ptr, value *v, unsigned num_extra_ops, + const std::string &name, instruction *next); private: - ir::value *mask_; + std::string repr_impl() const { return "store"; } + +public: + // accessors + value *get_pointer_operand() { return get_operand(0); } + value *get_value_operand() { return get_operand(1); } + // factory method + static store_inst* create(value* ptr, value *v, + const std::string &name = "", + instruction *next = nullptr); +}; + +class masked_store_inst: public store_inst{ +private: + std::string repr_impl() const { return "masked_store"; } + masked_store_inst(value *ptr, value *v, value *mask, + const std::string &name, instruction *next); + +public: + // accessors + value *get_mask_operand() { return get_operand(2); } + // factory method + static masked_store_inst* create(value *ptr, value *v, value *mask, + const std::string &name = "", + instruction *next = nullptr); }; //===----------------------------------------------------------------------===// @@ -507,21 +501,6 @@ protected: using instruction::instruction; }; -class get_global_range_inst: public builtin_inst { -private: - get_global_range_inst(type *ty, 
unsigned axis, const std::string &name, instruction *next); - std::string repr_impl() const { return "get_global_range(" + std::to_string(axis_) + ")"; } - -public: - static instruction* create(context &ctx, unsigned axis, type::tile_shapes_t::value_type size, - const std::string &name = "", - instruction *next = nullptr); - unsigned get_axis() const { return axis_; } - -private: - unsigned axis_; -}; - class get_range_id_inst: public builtin_inst { private: get_range_id_inst(type *ty, unsigned axis, const std::string &name, instruction *next); diff --git a/include/triton/lang/expression.h b/include/triton/lang/expression.h index 3d894c802..538485366 100644 --- a/include/triton/lang/expression.h +++ b/include/triton/lang/expression.h @@ -71,16 +71,6 @@ private: const constant* size_; }; -class get_global_range_expression: public builtin_expression{ -public: - get_global_range_expression(node *size, node *axis): size_((constant*)size), axis_((constant*)axis) { } - ir::value* codegen(ir::module *) const; - -private: - const constant* size_; - const constant* axis_; -}; - class get_range_id_expression: public builtin_expression{ public: get_range_id_expression(node *axis): axis_((constant*)axis) { } diff --git a/include/triton/lang/parser.y b/include/triton/lang/parser.y index 32b3c5ed4..645b0b51f 100644 --- a/include/triton/lang/parser.y +++ b/include/triton/lang/parser.y @@ -55,7 +55,7 @@ STORAGE_SPEC_T get_storage_spec(node *op) { return ((token*)op)->storage_spec;} %token VOID UINT1 UINT8 UINT16 UINT32 UINT64 INT1 INT8 INT16 INT32 INT64 FP16 FP32 FP64 %token IF ELSE FOR CONTINUE WHILE %token NEWAXIS ELLIPSIS AT -%token GET_GLOBAL_RANGE GET_RANGE_ID DOT SQRT REDUCE_SUM TRANS MAX MIN SELECT ATOMIC_CAS ATOMIC_EXCHG ATOMIC_ADD ALLOC_CONST +%token GET_RANGE_ID DOT SQRT REDUCE_SUM TRANS MAX MIN SELECT ATOMIC_CAS ATOMIC_EXCHG ATOMIC_ADD ALLOC_CONST %start translation_unit %% @@ -120,8 +120,7 @@ identifier /* Built-in */ builtin_expression - : GET_GLOBAL_RANGE '[' primary_expression ']' '(' constant ')' { $$ = new get_global_range_expression($3, $6); } - | GET_RANGE_ID '(' constant ')' { $$ = new get_range_id_expression($3); } + : GET_RANGE_ID '(' constant ')' { $$ = new get_range_id_expression($3); } | DOT '(' expression ',' expression ',' expression ')' { $$ = new matmul_expression($3, $5, $7); } | SQRT '(' expression ')' { $$ = new sqrt_expression($3); } | ALLOC_CONST type_specifier '[' constant ']' { $$ = new alloc_const_expression(new typed_declaration_specifier(get_type_spec($2)), $4); } diff --git a/include/triton/lang/scanner.l b/include/triton/lang/scanner.l index 0fbaa52d2..83d11035d 100644 --- a/include/triton/lang/scanner.l +++ b/include/triton/lang/scanner.l @@ -44,7 +44,6 @@ using triton::lang::return_void; "fp32" { return return_impl(FP32, yytext); } "fp64" { return return_impl(FP64, yytext); } "..." 
{ return return_impl(ELLIPSIS, yytext); } -"get_global_range" { return return_impl(GET_GLOBAL_RANGE, yytext); } "get_range_id" { return return_impl(GET_RANGE_ID, yytext); } "__atomic_cas" { return return_impl(ATOMIC_CAS, yytext); } "__atomic_exchg" { return return_impl(ATOMIC_EXCHG, yytext); } diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index aa7a930bb..8f0f1ef73 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -11,7 +11,7 @@ #include "triton/codegen/selection.h" #include "triton/codegen/tune.h" #include "triton/codegen/optimize_dot.h" -#include "triton/codegen/optimize_cse.h" +#include "triton/codegen/optimize_dce.h" #include "triton/codegen/optimize_trans.h" #include "triton/codegen/shmem_allocation.h" #include "triton/codegen/shmem_liveness.h" @@ -63,7 +63,7 @@ public: vectorize(&tune), selection(&shmem_allocation, &tune, &shmem_info, &alignment_info, target), optimize_dot(&tune), - optimize_cse(), + optimize_dce(), optimize_trans(), alignment_info(), reassociate(&tune, &alignment_info), @@ -72,14 +72,11 @@ public: void target_independent(ir::module &module) { optimize_dot.run(module); optimize_trans.run(module); -// ir::print(module, std::cout); } void target_dependent(ir::module &module) { alignment_info.run(module); reassociate.run(module); - ir::print(module, std::cout); -// exit(EXIT_FAILURE); if(target_->is_gpu()){ shmem_info.run(module); shmem_liveness.run(module); @@ -87,6 +84,8 @@ public: shmem_barriers.run(module); } vectorize.run(module); + optimize_dce.run(module); +// ir::print(module, std::cout); } codegen::tune tune; @@ -97,7 +96,7 @@ public: codegen::vectorize vectorize; codegen::selection selection; codegen::optimize_dot optimize_dot; - codegen::optimize_cse optimize_cse; + codegen::optimize_dce optimize_dce; codegen::optimize_trans optimize_trans; codegen::alignment_info alignment_info; codegen::reassociate reassociate; diff --git a/lib/codegen/alignment_info.cpp b/lib/codegen/alignment_info.cpp index 7c40229a2..87df925df 100644 --- a/lib/codegen/alignment_info.cpp +++ b/lib/codegen/alignment_info.cpp @@ -109,8 +109,6 @@ unsigned alignment_info::populate_max_contiguous(ir::value *v){ if(!v->get_type()->is_tile_ty()) return cache(1); auto shapes = v->get_type()->get_tile_shapes(); - if(dynamic_cast(v)) - return cache(shapes[0]->get_value()); if(dynamic_cast(v)) return cache(shapes[0]->get_value()); if(auto *x = dynamic_cast(v)){ @@ -243,14 +241,6 @@ unsigned alignment_info::populate_starting_multiple(ir::value *v){ int op = populate_starting_multiple(x->get_operand(0)); return cache(op); } - if(auto *x = dynamic_cast(v)){ - return cache(v->get_type()->get_tile_shapes()[0]->get_value()); - } -// if(auto *x = dynamic_cast(v)){ -// int value_true = populate_starting_multiple(x->get_value_true()); -// int value_false = populate_starting_multiple(x->get_value_false()); -// return cache(gcd(value_true, value_false)); -// } if(auto *x = dynamic_cast(v)){ // put a conservative initial value in phi node to avoid infinite recursion unsigned result = 1; @@ -313,7 +303,6 @@ void alignment_info::run(ir::module &mod) { for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i: block->get_inst_list()){ populate_max_contiguous(i); - std::cout << i->get_name() << " " << is_constant_.at(i).num_cst << " " << max_contiguous_.at(i) << " " << starting_multiple_.at(i) << std::endl; } } diff --git a/lib/codegen/optimize_cse.cpp b/lib/codegen/optimize_cse.cpp deleted file mode 100644 index b0c07a99e..000000000 --- 
a/lib/codegen/optimize_cse.cpp +++ /dev/null @@ -1,14 +0,0 @@ -#include "triton/ir/function.h" -#include "triton/ir/basic_block.h" -#include "triton/ir/module.h" -#include "triton/codegen/optimize_cse.h" - -namespace triton { -namespace codegen{ - - -void optimize_cse::run(ir::module &mod) { -} - -} -} diff --git a/lib/codegen/optimize_dce.cpp b/lib/codegen/optimize_dce.cpp new file mode 100644 index 000000000..d30bf4c1d --- /dev/null +++ b/lib/codegen/optimize_dce.cpp @@ -0,0 +1,60 @@ +#include "triton/ir/function.h" +#include "triton/ir/basic_block.h" +#include "triton/ir/module.h" +#include "triton/ir/cfg.h" +#include "triton/codegen/optimize_dce.h" + +namespace triton { +namespace codegen{ + + +void optimize_dce::run(ir::module &mod) { + std::list work_list; + std::set marked; + + // initialize work-list + for(ir::function *fn: mod.get_function_list()){ + std::vector rpo = ir::cfg::reverse_post_order(fn); + // iterate through blocks + for(ir::basic_block *block: rpo) + for(ir::instruction *i: block->get_inst_list()){ + if(dynamic_cast(i) || dynamic_cast(i) || dynamic_cast(i) + || dynamic_cast(i) || dynamic_cast(i)){ + work_list.push_back(i); + marked.insert(i); + } + } + } + + // mark -- ignore branches + while(!work_list.empty()){ + ir::instruction* current = work_list.back(); + work_list.pop_back(); + // mark instruction operands + for(ir::value* op: current->ops()) { + if(auto *i = dynamic_cast(op)) + if(marked.insert(i).second) + work_list.push_back(i); + } + // TODO: mark last intstruction of current's reverse-dominance frontier + } + + // sweep -- delete non-branch unmarked instructions + std::vector to_delete; + for(ir::function *fn: mod.get_function_list()){ + std::vector rpo = ir::cfg::reverse_post_order(fn); + // iterate through blocks + for(ir::basic_block *block: rpo) + for(ir::instruction *i: block->get_inst_list()){ + if(marked.find(i) == marked.end()) + to_delete.push_back(i); + } + } + + // delete + for(ir::instruction* i: to_delete) + i->erase_from_parent(); +} + +} +} diff --git a/lib/codegen/optimize_trans.cpp b/lib/codegen/optimize_trans.cpp index b6ad7cfd2..0fb96ac96 100644 --- a/lib/codegen/optimize_trans.cpp +++ b/lib/codegen/optimize_trans.cpp @@ -7,20 +7,18 @@ namespace codegen{ ir::value* optimize_trans::replace_phi(ir::value* value, - std::vector& to_delete, ir::builder& builder){ if(auto phi = dynamic_cast(value)) { // transpose operands std::vector incs; for(unsigned n = 0; n < phi->get_num_incoming(); n++) - incs.push_back(replace_phi(phi->get_incoming_value(n), to_delete, builder)); + incs.push_back(replace_phi(phi->get_incoming_value(n), builder)); // create phi for transposed values builder.set_insert_point(phi); ir::phi_node* result = builder.create_phi(incs[0]->get_type(), incs.size(), phi->get_name()); for(unsigned n = 0; n < phi->get_num_incoming(); n++) result->add_incoming(incs[n], phi->get_incoming_block(n)); phi->replace_all_uses_with(result); - to_delete.push_back(phi); return result; } else if(auto i = dynamic_cast(value)){ @@ -39,7 +37,6 @@ ir::value* optimize_trans::replace_phi(ir::value* value, void optimize_trans::run(ir::module &mod) { ir::builder &builder = mod.get_builder(); - std::vector to_delete; // iterate for(ir::function *fn: mod.get_function_list()) for(ir::basic_block *block: fn->blocks()) @@ -56,15 +53,11 @@ void optimize_trans::run(ir::module &mod) { // trans(phi) -> phi(trans(), trans()...) 
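The optimize_dce pass introduced above is a standard mark-and-sweep: instructions with observable effects (stores, returns, branches) seed a work-list, liveness propagates backwards through operand edges, and whatever remains unmarked is erased. A self-contained sketch of the same scheme on a toy instruction type; these are illustrative structs, not Triton's ir classes:

```cpp
// Mark-and-sweep dead-code elimination in miniature.
#include <cassert>
#include <list>
#include <set>
#include <vector>

struct Instr {
  bool is_root;                 // store/return/branch-like side effects
  std::vector<Instr*> operands; // values this instruction reads
};

static std::set<Instr*> mark_live(const std::vector<Instr*> &all) {
  std::list<Instr*> work;
  std::set<Instr*> marked;
  // seed the work-list with side-effecting roots
  for (Instr *i : all)
    if (i->is_root && marked.insert(i).second)
      work.push_back(i);
  // mark: anything an already-live instruction reads is live too
  while (!work.empty()) {
    Instr *cur = work.back();
    work.pop_back();
    for (Instr *op : cur->operands)
      if (marked.insert(op).second)
        work.push_back(op);
  }
  return marked; // sweep erases every instruction not in this set
}

int main() {
  Instr dead{false, {}};      // no users, no effects: swept
  Instr addr{false, {}};      // kept alive through the store
  Instr store{true, {&addr}}; // root
  std::vector<Instr*> all{&dead, &addr, &store};
  std::set<Instr*> live = mark_live(all);
  assert(live.count(&store) && live.count(&addr) && !live.count(&dead));
  return 0;
}
```

As the TODO in the pass notes, the marking is conservative until instructions on the reverse-dominance frontier are handled as well.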
if(dynamic_cast(op)){ - ir::value* new_phi = replace_phi(op, to_delete, builder); - to_delete.push_back(trans); + ir::value* new_phi = replace_phi(op, builder); trans->replace_all_uses_with(new_phi); } } } - // erase dead code - for(ir::instruction* i: to_delete) - i->erase_from_parent(); } } diff --git a/lib/codegen/reassociate.cpp b/lib/codegen/reassociate.cpp index fa7c256fd..bf36b2033 100644 --- a/lib/codegen/reassociate.cpp +++ b/lib/codegen/reassociate.cpp @@ -189,8 +189,6 @@ void reassociate::run(ir::module &mod) { // reassociate std::map infos; - std::map> re_ordered; - for(ir::function *fn: mod.get_function_list()){ std::vector rpo = ir::cfg::reverse_post_order(fn); // iterate through blocks @@ -259,11 +257,6 @@ void reassociate::run(ir::module &mod) { params_->copy(new_pz, pz); align_->copy(new_pz, pz); } - -// // reassociate pointer -// reassociate_ptr(pz, builder, offsets); - - } } } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index caf666bfd..b4e40a3f2 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -32,7 +32,7 @@ void distributed_tile::init_indices() { current.push_back(axes_[d].values[id[d]]); size_t sz = indices_.size(); indices_[current] = sz; - values_[current] = UndefValue::get(ty_); + values_[current] = nullptr; ordered_indices_.push_back(current); id[0]++; while(id[k] == axes_[k].values.size()){ @@ -57,12 +57,17 @@ distributed_tile::distributed_tile(Type *ty, const shapes_t &shapes, const axes_ init_indices(); } -void distributed_tile::set_value(indices_t idx, Value *v) { - values_[idx] = v; +void distributed_tile::set_value(indices_t idx, Value *x) { + assert(x->getType() == ty_ && "cannot set a value of different type"); + Value *&result = values_[idx]; + assert(!result && "value cannot be set twice"); + result = x; } Value* distributed_tile::get_value(indices_t idx) { - return values_[idx]; + Value *result = values_.at(idx); + assert(result && "value has not been set"); + return result; } unsigned distributed_tile::get_linear_index(indices_t idx) { @@ -688,15 +693,15 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, } bool vectorize = dynamic_cast(v); distributed_tile *T = new distributed_tile(ty, shapes, axes, builder, vectorize); - tmap_.insert({v, T}); + bool is_inserted = tmap_.insert({v, T}).second; // constant range - if(dynamic_cast(v)){ + if(is_inserted && dynamic_cast(v)){ T->for_each([&](indices_t idx){ assert(idx.size() == 1); T->set_value(idx, idx[0]); }); } - if(dynamic_cast(v)){ + if(is_inserted && dynamic_cast(v)){ T->for_each([&](indices_t idx){ assert(idx.size() == 1); BinaryOperator *bin_add = dyn_cast(idx[0]); @@ -746,31 +751,41 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & LLVMContext &ctx = builder.getContext(); Function *fn = block->getParent(); // store - if(auto *x = dynamic_cast(ins)) { - distributed_tile* ptr = (distributed_tile*)tmap_.at(x->get_pointer_operand()); - tile *value = tmap_.at(x->get_value_operand()); - ir::value *mask = x->get_mask(); - if(mask) { - distributed_tile* preds = (distributed_tile*)tmap_.at(mask); - ptr->for_each([&](indices_t idx){ - BasicBlock *mask_then_bb = BasicBlock::Create(ctx, "mask_then", fn); - BasicBlock *mask_done_bb = BasicBlock::Create(ctx, "mask_done", fn); - builder.CreateCondBr(preds->get_value(idx), mask_then_bb, mask_done_bb); - builder.SetInsertPoint(mask_then_bb); - builder.CreateStore(value->get_value(idx), ptr->get_value(idx)); - builder.CreateBr(mask_done_bb); - 
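Earlier in this selection.cpp hunk, distributed_tile::set_value and get_value were tightened from silently defaulting to UndefValue into a write-once, read-after-write discipline, which is what surfaces the stale get_value uses fixed later in this file. A stand-alone illustration of the new contract, with toy types standing in for llvm::Value; a sketch, not the real tile class:

```cpp
#include <cassert>
#include <map>
#include <vector>

using indices_t = std::vector<int>;

struct toy_tile {
  std::map<indices_t, int*> values_;
  // the real set_value additionally asserts that the LLVM type of `x`
  // matches the tile's element type
  void set_value(const indices_t &idx, int *x) {
    int *&slot = values_[idx];
    assert(!slot && "value cannot be set twice");
    slot = x;
  }
  int *get_value(const indices_t &idx) {
    int *v = values_.at(idx); // throws if the index was never allocated
    assert(v && "value has not been set");
    return v;
  }
};
```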
builder.SetInsertPoint(mask_done_bb); - }); - } - else { - ptr->for_each([&](indices_t idx){ - if(GetElementPtrInst *gep = dyn_cast(ptr->get_value(idx))) - if(BinaryOperator *binop = dyn_cast(*gep->idx_begin())){ - std::cout << isa(binop->getOperand(0)) << " " << isa(binop->getOperand(1)) << std::endl; - } - builder.CreateStore(value->get_value(idx), ptr->get_value(idx)); - }); - } + if(auto *x = dynamic_cast(ins)){ + distributed_tile* ptrs = (distributed_tile*)tmap_.at(x->get_pointer_operand()); + tile *scalars = tmap_.at(x->get_value_operand()); + ir::value *mask = x->get_mask_operand(); + distributed_tile* preds = (distributed_tile*)tmap_.at(mask); + ptrs->for_each([&](indices_t idx){ + Value *scalar = scalars->get_value(idx); + Value *ptr = ptrs->get_value(idx); + Value *pred = preds->get_value(idx); +// std::string offset = ""; +// if(GetElementPtrInst *gep = dyn_cast(ptr)) +// if(gep->getNumIndices() == 1) +// if(ConstantInt *cst = dyn_cast(gep->idx_begin())){ +// offset = " + " + std::to_string(cst->getValue().getSExtValue()*4); +// } +// FunctionType *ty = FunctionType::get(Type::getVoidTy(ctx), {pred->getType(), ptr->getType(), scalar->getType()}, false); +// std::string asm_str = "@$0 st.global.b32 [$1" + offset + "], $2;"; +// InlineAsm *iasm = InlineAsm::get(ty, asm_str, "b,l,f", true); +// builder.CreateCall(iasm, {pred, ptr, scalar}); + + BasicBlock *mask_then_bb = BasicBlock::Create(ctx, "mask_then", fn); + BasicBlock *mask_done_bb = BasicBlock::Create(ctx, "mask_done", fn); + builder.CreateCondBr(pred, mask_then_bb, mask_done_bb); + builder.SetInsertPoint(mask_then_bb); + builder.CreateStore(scalar, ptr); + builder.CreateBr(mask_done_bb); + builder.SetInsertPoint(mask_done_bb); + }); + } + else if(auto *x = dynamic_cast(ins)) { + distributed_tile* ptrs = (distributed_tile*)tmap_.at(x->get_pointer_operand()); + tile *scalars = tmap_.at(x->get_value_operand()); + ptrs->for_each([&](indices_t idx){ + builder.CreateStore(scalars->get_value(idx), ptrs->get_value(idx)); + }); } else { if(auto *x = dynamic_cast(ins)){ @@ -837,14 +852,6 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & if(!ins->get_type()->is_tile_ty()) return; const auto& shapes = ins->get_type()->get_tile_shapes(); - // global_range - if(auto *x = dynamic_cast(ins)) { - Value *offset = tgt_->get_global_offset(module, builder, shapes[0]->get_value(), x->get_axis()); - result->for_each([&](indices_t idx){ - BinaryOperator *bin = static_cast(idx[0]); - result->set_value(idx, builder.CreateAdd(bin, offset)); - }); - } // nv_dynamic_range_idx_inst if(dynamic_cast(ins)){ result->for_each([&](indices_t idx){ @@ -855,49 +862,6 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & result->set_value(idx, res); }); } -// // mask -// else if(dynamic_cast(ins)) { -// distributed_tile* pred = (distributed_tile*)tmap_.at(ins->get_operand(0)); -// distributed_tile* mask_tile_true = (distributed_tile*)tmap_.at(ins->get_result(0)); -// distributed_tile* mask_tile_false = (distributed_tile*)tmap_.at(ins->get_result(1)); -// pred->for_each([&](indices_t idx){ -// BasicBlock *mask_then_bb = BasicBlock::Create(ctx, "mask_then", fn); -// BasicBlock* mask_else_bb = BasicBlock::Create(ctx, "mask_else", fn); -// BasicBlock *mask_done_bb = BasicBlock::Create(ctx, "mask_done", fn); -// builder.CreateCondBr(pred->get_value(idx), mask_then_bb, mask_else_bb); -// builder.SetInsertPoint(mask_then_bb); -// builder.CreateBr(mask_done_bb); -// builder.SetInsertPoint(mask_else_bb); -// 
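The masked-store lowering just above wraps every scalar store in its own branch-and-join diamond rather than relying on the deleted mask/psi instructions. The skeleton in isolation, using LLVM's IRBuilder as the pass does; the free function and its name are illustrative:

```cpp
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"

// Emit `if (pred) *ptr = scalar;` as control flow instead of a select,
// so that masked-off lanes perform no memory access at all.
void emit_predicated_store(llvm::IRBuilder<> &builder, llvm::Function *fn,
                           llvm::Value *pred, llvm::Value *scalar,
                           llvm::Value *ptr) {
  llvm::LLVMContext &ctx = builder.getContext();
  llvm::BasicBlock *then_bb = llvm::BasicBlock::Create(ctx, "mask_then", fn);
  llvm::BasicBlock *done_bb = llvm::BasicBlock::Create(ctx, "mask_done", fn);
  builder.CreateCondBr(pred, then_bb, done_bb); // skip when masked off
  builder.SetInsertPoint(then_bb);
  builder.CreateStore(scalar, ptr);             // guarded store
  builder.CreateBr(done_bb);
  builder.SetInsertPoint(done_bb);              // lowering continues here
}
```

The masked-load lowering later in this hunk reuses the same diamond, loading in the "then" block and merging with a splat of the false value through a PHI node in the join block.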
builder.CreateBr(mask_done_bb); -// builder.SetInsertPoint(mask_done_bb); -// pmap_.insert({{mask_tile_true, idx}, mask_then_bb}); -// pmap_.insert({{mask_tile_false, idx}, mask_else_bb}); -// last_block_.insert({{mask_tile_true, idx}, mask_done_bb}); -// last_block_.insert({{mask_tile_false, idx}, mask_done_bb}); -// }); -// } -// // merge -// else if(auto *merge = dynamic_cast(ins)) { -// distributed_tile* mask_tile_true = (distributed_tile*)tmap_.at(merge->get_mask_true()); -// distributed_tile *value_tile_true = (distributed_tile*)tmap_.at(merge->get_value_true()); -// distributed_tile* mask_tile_false = (distributed_tile*)tmap_.at(merge->get_mask_false()); -// distributed_tile *value_tile_false = (distributed_tile*)tmap_.at(merge->get_value_false()); -// result->for_each([&](indices_t idx){ -// BasicBlock *block_true = pmap_.at({mask_tile_true, idx}); -// Value *value_true = value_tile_true->get_value(idx); -// BasicBlock *block_false = pmap_.at({mask_tile_false, idx}); -// Value *value_false = value_tile_false->get_value(idx); -// BasicBlock *block_done = last_block_.at({mask_tile_true, idx}); -// if(block_done->getTerminator()) -// builder.SetInsertPoint(block_done->getTerminator()); -// else -// builder.SetInsertPoint(block_done); -// PHINode *phi = builder.CreatePHI(value_true->getType(), 2); -// phi->addIncoming(value_true, block_true); -// phi->addIncoming(value_false,block_false); -// result->set_value(idx, phi); -// }); -// } // reshape else if(dynamic_cast(ins)) { ir::value* in = ins->get_operand(0); @@ -939,9 +903,10 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & in->for_each([&](indices_t idx){ unsigned linear = in->get_linear_index(idx); unsigned id = linear / vector_size; + Value *in_value = in->get_value(idx); if(linear % vector_size == 0) - packets[id] = result->get_value(idx); - packets[id] = builder.CreateInsertElement(packets.at(id), in->get_value(idx), linear % vector_size); + packets[id] = UndefValue::get(VectorType::get(in_value->getType(), vector_size)); + packets[id] = builder.CreateInsertElement(packets.at(id), in_value, linear % vector_size); }); result->for_each([&](indices_t idx){ unsigned linear = in->get_linear_index(idx); @@ -1017,8 +982,10 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & TB->set_return_mode(true); std::vector fc; + result->for_each([&](indices_t idx){ - fc.push_back(result->get_value(idx)); + fc.push_back(TC->get_value(idx)); +// fc.push_back(UndefValue::get(TC->get_value(idx)->getType())); }); Type *fp32_ty = builder.getFloatTy(); @@ -1076,10 +1043,10 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & Value *hb = TB->get_value(idx_b); for(unsigned ii = 0; ii < pack_size_0_; ii++) for(unsigned jj = 0; jj < pack_size_1_; jj++){ - Value *ha0 = builder.CreateExtractElement(ha, builder.getInt32(ii*pack_size_0_ + 0)); - Value *ha1 = builder.CreateExtractElement(ha, builder.getInt32(ii*pack_size_0_ + 1)); - Value *hb0 = builder.CreateExtractElement(hb, builder.getInt32(jj*pack_size_0_ + 0)); - Value *hb1 = builder.CreateExtractElement(hb, builder.getInt32(jj*pack_size_0_ + 1)); + Value *ha0 = builder.CreateBitCast(builder.CreateExtractElement(ha, builder.getInt32(ii*pack_size_0_ + 0)), fp16x2_ty); + Value *ha1 = builder.CreateBitCast(builder.CreateExtractElement(ha, builder.getInt32(ii*pack_size_0_ + 1)), fp16x2_ty); + Value *hb0 = builder.CreateBitCast(builder.CreateExtractElement(hb, builder.getInt32(jj*pack_size_0_ + 0)), fp16x2_ty); + 
Value *hb1 = builder.CreateBitCast(builder.CreateExtractElement(hb, builder.getInt32(jj*pack_size_0_ + 1)), fp16x2_ty); std::vector idx = { (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 0)*ld_fc, (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 1)*ld_fc, @@ -1136,24 +1103,106 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & }); } } - else if(auto *ld = dynamic_cast(ins)){ + else if(auto *ld = dynamic_cast(ins)){ + // find vector size ir::value *ptr = ld->get_pointer_operand(); unsigned starting_multiple = axis_info_->get_starting_multiple(ptr); unsigned max_contiguous = axis_info_->get_max_contiguous(ptr); unsigned alignment = std::min(starting_multiple, max_contiguous); unsigned vector_size = std::min(result->axis(0).contiguous, alignment); + distributed_tile *pointers = (distributed_tile*)tmap_.at(ptr); + distributed_tile *masks = (distributed_tile*)tmap_.at(ld->get_mask_operand()); + distributed_tile *false_values = (distributed_tile*)tmap_.at(ld->get_false_value_operand()); std::map packets; - distributed_tile *TP = (distributed_tile*)tmap_.at(ld->get_pointer_operand()); result->for_each([&](indices_t idx){ unsigned linear = result->get_linear_index(idx); unsigned id = linear / vector_size; - if(linear % vector_size == 0){ - Value *ptr = TP->get_value(idx); - ptr= builder.CreateBitCast(ptr, PointerType::get(VectorType::get(result->get_ty(), vector_size), - ptr->getType()->getPointerAddressSpace())); + if(linear % vector_size == 0) { + Value *ptr = pointers->get_value(idx); + ConstantInt *cst = nullptr; + if(GetElementPtrInst *gep = dyn_cast(ptr)) + if(gep->getNumIndices() == 1){ + cst = dyn_cast(gep->idx_begin()); + } + + ptr = builder.CreateBitCast(ptr, PointerType::get(VectorType::get(result->get_ty(), vector_size), + ptr->getType()->getPointerAddressSpace())); + Value *mask = masks->get_value(idx); + BasicBlock *current_bb = builder.GetInsertBlock(); + BasicBlock *mask_then_bb = BasicBlock::Create(ctx, "mask_then", fn); + BasicBlock *mask_done_bb = BasicBlock::Create(ctx, "mask_done", fn); + builder.CreateCondBr(mask, mask_then_bb, mask_done_bb); + builder.SetInsertPoint(mask_then_bb); + Value *result_then = builder.CreateLoad(ptr); + builder.CreateBr(mask_done_bb); + builder.SetInsertPoint(mask_done_bb); + Value *result = nullptr; + if(false_values){ + result = builder.CreatePHI(result_then->getType(), 2); + ((PHINode*)result)->addIncoming(result_then, mask_then_bb); + Value *result_false = false_values->get_value(idx); + if(vector_size > 1) + result_false = builder.CreateVectorSplat(vector_size, result_false); + ((PHINode*)result)->addIncoming(result_false, current_bb); + } + else + result = result_then; + +// std::string offset = ""; +// if(cst) +// offset = " + " + std::to_string(cst->getValue().getSExtValue()*2*vector_size); +// Type *fp16x2_ty = VectorType::get(builder.getHalfTy(), 2); +// Type *fp16x2_pack4_ty = StructType::get(ctx, {fp16x2_ty, fp16x2_ty, fp16x2_ty, fp16x2_ty}); +// FunctionType *ty = FunctionType::get(fp16x2_pack4_ty, {mask->getType(), ptr->getType()}, false); +// std::string asm_str = "@$0 ld.global.nc.v4.b32 {$1, $2, $3, $4}, [$5" + offset + "];"; +// if(false_value) +// asm_str += "\n\t@!$0 mov.v4.b32 {$1, $2, $3, $4}, {0, 0, 0, 0};"; +// InlineAsm *iasm = InlineAsm::get(ty, asm_str, "b,=r,=r,=r,=r,l", true); +// Value *result = builder.CreateCall(iasm, {mask, ptr}); + + packets[id] = result; + } + }); + // extract result element + result->for_each([&](indices_t idx){ + unsigned 
linear = result->get_linear_index(idx); + unsigned id = linear / vector_size; +// Value *tmp = builder.CreateExtractValue(packets.at(id), {(linear % vector_size) / 2}); +// Value *res = builder.CreateExtractElement(tmp, (linear % vector_size) % 2); +// result->set_value(idx, res); + result->set_value(idx, builder.CreateExtractElement(packets.at(id), linear % vector_size)); + }); + } + else if(auto *ld = dynamic_cast(ins)){ + // find vector size + ir::value *ptr = ld->get_pointer_operand(); + unsigned starting_multiple = axis_info_->get_starting_multiple(ptr); + unsigned max_contiguous = axis_info_->get_max_contiguous(ptr); + unsigned alignment = std::min(starting_multiple, max_contiguous); + unsigned vector_size = std::min(result->axis(0).contiguous, alignment); + distributed_tile *pointers = (distributed_tile*)tmap_.at(ptr); + // vector loads + std::map packets; + result->for_each([&](indices_t idx){ + unsigned linear = result->get_linear_index(idx); + unsigned id = linear / vector_size; + if(linear % vector_size == 0) { + Value *ptr = pointers->get_value(idx); + ConstantInt *cst = nullptr; + if(GetElementPtrInst *gep = dyn_cast(ptr)) + if(gep->getNumIndices() == 1){ + cst = dyn_cast(gep->idx_begin()); + } + ptr = builder.CreateBitCast(ptr, PointerType::get(VectorType::get(result->get_ty(), vector_size), + ptr->getType()->getPointerAddressSpace())); packets[id] = builder.CreateLoad(ptr); } - result->set_value(idx, builder.CreateExtractElement(packets.at(id), linear % vector_size)); + }); + // extract result element + result->for_each([&](indices_t idx){ + unsigned linear = result->get_linear_index(idx); + unsigned id = linear / vector_size; +// result->set_value(idx, builder.CreateExtractElement(packets.at(id), linear % vector_size)); }); } // element-wise diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 288eb4204..1da6240dd 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -106,9 +106,9 @@ void tune::init_c_graph(ir::instruction *v) { for(unsigned k = 0; k < v->get_num_results(); k++){ ir::value *result = v->get_result(k); for(unsigned i = 0; i < shapes.size(); i ++){ - for(ir::value* op: v->ops()){ + std::vector ops = v->ops(); + for(ir::value* op: ops) add_constraint({result, i}, {op, i}); - } } } } diff --git a/lib/dnn/gemm.cpp b/lib/dnn/gemm.cpp index 23e62ae76..897a26402 100644 --- a/lib/dnn/gemm.cpp +++ b/lib/dnn/gemm.cpp @@ -123,14 +123,16 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, )" + b_ty_ + R"(* pb[)" + BS + "] = B + rkb" + bcb0 + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; int1 checka[)" + AS + R"(] = (rka < K))" + bca0 + " && (rxa < M)" + bca1 + R"(; int1 checkb[)" + BS + R"(] = (rkb < K))" + bcb0 + " && (ryb < N)" + bcb1 + R"(; - )" + a_ty_ + R"( a[)" + AS + R"(] = *pa; - )" + b_ty_ + R"( b[)" + BS + R"(] = *pb; + )" + a_ty_ + R"( a[)" + AS + R"(] = checka ? *pa : 0; + )" + b_ty_ + R"( b[)" + BS + R"(] = checkb ? *pb : 0; for(int32 k = K; k > 0; k = k - TK){ c = dot()" + usea + ", " + useb + R"(, c); pa = pa + TK)" + lda0 + R"(; pb = pb + TK)" + ldb0 + R"(; - a = *pa; - b = *pb; + int1 checka[)" + AS + R"(] = k > TK; + int1 checkb[)" + BS + R"(] = k > TK; + a = checka ? *pa : 0; + b = checkb ? *pb : 0; } int32 rxc[TM] = ridx * TM + (0 ... TM); int32 ryc[TN] = ridy * TN + (0 ... 
TN); @@ -138,11 +140,10 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, int1 checkc1[TN] = ryc < N; int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - *pc = c; + @checkc *pc = c; } )"; - std::cout << res << std::endl; os << res; } diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 551a55a20..4ff863666 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -255,7 +255,7 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ - std::cout << source << std::endl; +// std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index bff68e083..e58fd9924 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -85,20 +85,6 @@ value *builder::create_ret_void() { return insert(return_inst::create(ctx_)); } - -//===----------------------------------------------------------------------===// -// tile-level control-flow instructions -//===----------------------------------------------------------------------===// - -//value *builder::create_mask(value *pred, const std::string &name){ -// return insert(mask_inst::create(pred, name)); -//} - -//value *builder::create_merge(value *mask_true, value *value_true, value *mask_false, value *value_false, const std::string &name) { -// return insert(psi_inst::create(mask_true, value_true, mask_false, value_false, name)); -//} - - //===----------------------------------------------------------------------===// // cast instructions //===----------------------------------------------------------------------===// @@ -264,14 +250,22 @@ DEFINE_FCMP_INSTR(ONE, llvm::FCmpInst::FCMP_ONE) // load/store instructions //===----------------------------------------------------------------------===// -value *builder::create_load(value *arg, const std::string &name){ - return insert(load_inst::create(arg, name)); +value *builder::create_load(value *ptr, const std::string &name){ + return insert(load_inst::create(ptr, name)); } value *builder::create_store(value *ptr, value *val, const std::string &name){ return insert(store_inst::create(ptr, val, name)); } +value *builder::create_masked_load(value *ptr, value *mask, value *false_value, const std::string &name){ + return insert(masked_load_inst::create(ptr, mask, false_value, name)); +} + +value *builder::create_masked_store(value *ptr, value *val, value *mask, const std::string &name){ + return insert(masked_store_inst::create(ptr, val, mask, name)); +} + //===----------------------------------------------------------------------===// // tile instructions //===----------------------------------------------------------------------===// @@ -296,10 +290,6 @@ value *builder::create_downcast(value *arg, const std::string &name) { // built-in instructions //===----------------------------------------------------------------------===// -value *builder::create_get_global_range(unsigned axis, type::tile_shapes_t::value_type size, const std::string &name) { - return insert(get_global_range_inst::create(ctx_, axis, size, name)); -} - value 
*builder::create_get_range_id(unsigned axis, const std::string &name) { return insert(get_range_id_inst::create(ctx_, axis, name)); } diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index b7743c7d5..9537336fb 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -270,6 +270,7 @@ std::string cast_inst::repr_impl() const { } // TODO bool cast_inst::is_valid(op_t op, value *arg, type *ty) { + assert(arg->get_type()->is_tile_ty() == ty->is_tile_ty()); return true; } @@ -348,34 +349,6 @@ cond_branch_inst::cond_branch_inst(basic_block *if_dst, basic_block *else_dst, v set_operand(2, cond); } -// mask_inst -//mask_inst::mask_inst(value *pred, const std::string &name, instruction *next) -// : instruction(pred->get_type(), 1, 2, name, next) { -// set_operand(0, pred); -//} - -//mask_inst* mask_inst::create(value *pred, const std::string &name, instruction *next) { -// return new mask_inst(pred, name, next); -//} - -//// merge_inst -//psi_inst::psi_inst(value *mask_true, value *value_true, -// value *mask_false, value *value_false, -// const std::string &name, instruction *next) -// : instruction(value_true->get_type(), 4, 1, name, next) { -// set_operand(0, mask_true); -// set_operand(1, value_true); -// set_operand(2, mask_false); -// set_operand(3, value_false); -//} - -//psi_inst* psi_inst::create(value *mask_true, value *value_true, -// value *mask_false, value *value_false, -// const std::string &name, instruction *next) { -// return new psi_inst(mask_true, value_true, mask_false, value_false, name, next); -//} - - //===----------------------------------------------------------------------===// // getelementptr_inst classes @@ -440,6 +413,13 @@ getelementptr_inst *getelementptr_inst::create(value *ptr, const std::vectorget_scalar_ty(); type *pointee_ty = scalar_ty->get_pointer_element_ty(); @@ -448,43 +428,52 @@ type *load_inst::get_pointee_type(type *ty) { return pointee_ty; } -load_inst::load_inst(value *ptr, const std::string &name, instruction *next) - : unary_inst(get_pointee_type(ptr->get_type()), ptr, name, next), mask_(nullptr){ -} - -value *load_inst::get_mask() const { - return mask_; -} - -value *load_inst::set_mask(value *mask) { - mask_ = mask; - return this; +load_inst::load_inst(value *ptr, unsigned num_extra_ops, const std::string &name, instruction *next) + : io_inst(get_pointee_type(ptr->get_type()), 1 + num_extra_ops, 1, name, next) { + set_operand(0, ptr); } load_inst* load_inst::create(value *ptr, const std::string &name, instruction *next) { - return new load_inst(ptr, name, next); + return new load_inst(ptr, 0, name, next); } +// masked load +masked_load_inst::masked_load_inst(value *ptr, value *mask, value *false_value, + const std::string &name, instruction *next) + : load_inst(ptr, 2, name, next) { + set_operand(1, mask); + set_operand(2, false_value); +} + +masked_load_inst* masked_load_inst::create(value *ptr, value *mask, value *false_value, + const std::string &name, instruction *next) { + return new masked_load_inst(ptr, mask, false_value, name, next); +} + + // store -store_inst::store_inst(value *ptr, value *v, const std::string &name, instruction *next) - : instruction(type::get_void_ty(ptr->get_type()->get_context()), 2, 1, name, next), mask_(nullptr) { +store_inst::store_inst(value *ptr, value *val, unsigned num_extra_ops, + const std::string &name, instruction *next) + : io_inst(type::get_void_ty(ptr->get_type()->get_context()), 2 + num_extra_ops, 1, name, next) { set_operand(0, ptr); - set_operand(1, v); + set_operand(1, 
val); } -value *store_inst::get_mask() const { - return mask_; +store_inst* store_inst::create(value *ptr, value *val, + const std::string &name, instruction *next) { + return new store_inst(ptr, val, 0, name, next); } -value *store_inst::set_mask(value *mask) { - mask_ = mask; - return this; +// masked store +masked_store_inst::masked_store_inst(value *ptr, value *val, value *mask, + const std::string &name, instruction *next) + : store_inst(ptr, val, 1, name, next) { + set_operand(2, mask); } -store_inst* store_inst::create(value *ptr, value *v, const std::string &name, instruction *next) { - return new store_inst(ptr, v, name, next); +masked_store_inst* masked_store_inst::create(value *ptr, value *val, value *mask, const std::string &name, instruction *next) { + return new masked_store_inst(ptr, val, mask, name, next); } - //===----------------------------------------------------------------------===// // retile_inst classes //===----------------------------------------------------------------------===// @@ -636,19 +625,6 @@ instruction* select_inst::create(value *pred, value *if_value, value *else_value // builtin instructions //===----------------------------------------------------------------------===// -// get_global_range -get_global_range_inst::get_global_range_inst(type *ty, unsigned axis, - const std::string &name, instruction *next) - : builtin_inst(ty, 0, 1, name, next), axis_(axis) { - -} - -instruction* get_global_range_inst::create(context &ctx, unsigned axis, type::tile_shapes_t::value_type size, - const std::string &name, instruction *next) { - type *int_ty = type::get_int32_ty(ctx); - type *tile_ty = tile_type::get(int_ty, {size}); - return new get_global_range_inst(tile_ty, axis, name, next); -} // get_range_id get_range_id_inst::get_range_id_inst(type *ty, unsigned axis, const std::string &name, instruction *next) diff --git a/lib/ir/print.cpp b/lib/ir/print.cpp index b1e43c0e2..4b7248bc6 100644 --- a/lib/ir/print.cpp +++ b/lib/ir/print.cpp @@ -35,12 +35,6 @@ void print(module &mod, std::ostream& os) { os << std::endl; for(ir::instruction *inst: block->get_inst_list()){ os << " "; - if(auto *x = dynamic_cast(inst)) - if(ir::value *mask = x->get_mask()) - os << "@" << get_name(mask, cnt++) << " "; - if(auto *x = dynamic_cast(inst)) - if(ir::value *mask = x->get_mask()) - os << "@" << get_name(mask, cnt++) << " "; unsigned num_results = inst->get_num_results(); for(unsigned i = 0; i < num_results; i++){ os << get_name(inst->get_result(i), cnt++); diff --git a/lib/lang/declaration.cpp b/lib/lang/declaration.cpp index 64f238171..dba439ce1 100644 --- a/lib/lang/declaration.cpp +++ b/lib/lang/declaration.cpp @@ -151,7 +151,7 @@ ir::value* initializer::codegen(ir::module * mod) const{ } else if(expr_){ value = expr_->codegen(mod); - value = explicit_cast(mod->get_builder(), value, ty); + value = explicit_cast(mod->get_builder(), value, ty->get_scalar_ty()); implicit_broadcast(mod, ty, value); } value->set_name(name); diff --git a/lib/lang/expression.cpp b/lib/lang/expression.cpp index b21a3b4c7..6baa1f3b2 100644 --- a/lib/lang/expression.cpp +++ b/lib/lang/expression.cpp @@ -115,12 +115,6 @@ ir::value* alloc_const_expression::codegen(ir::module *mod) const { return res; } -// get_global_range -ir::value* get_global_range_expression::codegen(ir::module *mod) const { - ir::builder &builder = mod->get_builder(); - return builder.create_get_global_range(axis_->value(), (ir::constant_int*)size_->codegen(mod)); -} - // get_range_id ir::value* 
get_range_id_expression::codegen(ir::module *mod) const { return mod->get_builder().create_get_range_id(axis_->value()); @@ -254,39 +248,24 @@ ir::value* cast_expression::codegen(ir::module *mod) const{ } /* Conditional expression */ -ir::value *conditional_expression::codegen(ir::module *mod) const{ +ir::value *conditional_expression::codegen(ir::module *mod) const { + ir::builder &builder = mod->get_builder(); + ir::value *mask = cond_->codegen(mod); + ir::value *true_value = true_value_->codegen(mod); + ir::value *false_value = false_value_->codegen(mod); + bool is_float, is_ptr, is_int, is_signed; + implicit_cast(builder, true_value, false_value, is_float, is_ptr, is_int, is_signed); + implicit_broadcast(mod, mask, true_value); + implicit_broadcast(mod, mask, false_value); + if(ir::load_inst* load = dynamic_cast(true_value)){ + load->erase_from_parent(); + return builder.create_masked_load(load->get_pointer_operand(), mask, false_value); + } + if(ir::load_inst* load = dynamic_cast(false_value)){ + load->erase_from_parent(); + return builder.create_masked_load(load->get_pointer_operand(), mask, true_value); + } throw std::runtime_error("not implemented"); -// ir::builder &builder = mod->get_builder(); -// ir::basic_block::inst_list_t &instructions = builder.get_insert_block()->get_inst_list(); -// ir::value *pred = cond_->codegen(mod); -// ir::instruction *mask = (ir::instruction*)builder.create_mask(pred); -// /* true value */ -// ir::value *true_mask = mask->get_result(0); -// auto it_true_begin = instructions.end(); -// it_true_begin--; -// ir::value *true_value = true_value_->codegen(mod); -// implicit_broadcast(mod, pred, true_value); -// it_true_begin++; -// auto it_true_end = instructions.end(); -// for(auto it = it_true_begin; it != it_true_end; it++) -//// if(!dynamic_cast(*it)) -// (*it)->set_mask_pred(true_mask); -// /* false value */ -// ir::value *false_mask = mask->get_result(1); -// auto it_false_begin = instructions.end(); -// it_false_begin--; -// ir::value *false_value = false_value_->codegen(mod); -// implicit_broadcast(mod, pred, false_value); -// bool is_float, is_ptr, is_int, is_signed; -// implicit_cast(builder, true_value, false_value, is_float, is_ptr, is_int, is_signed); -// it_false_begin++; -// auto it_false_end = instructions.end(); -// for(auto it = it_false_begin; it != it_false_end; it++) -//// if(!dynamic_cast(*it)) -// (*it)->set_mask_pred(false_mask); -// /* psi */ -// ir::value *result = builder.create_merge(true_mask, true_value, false_mask, false_value); -// return result; } /* Assignment expression */ diff --git a/lib/lang/statement.cpp b/lib/lang/statement.cpp index ab0a55828..a768bf7b4 100644 --- a/lib/lang/statement.cpp +++ b/lib/lang/statement.cpp @@ -29,21 +29,35 @@ ir::value* compound_statement::codegen(ir::module* mod) const{ /* Expression statement */ ir::value* expression_statement::codegen(ir::module *mod) const{ ir::builder &builder = mod->get_builder(); - ir::value *expr = expr_->codegen(mod); - if(pred_ == nullptr) - return expr; - ir::value *pred = pred_->codegen(mod); - if(auto *x = dynamic_cast(expr)) - x->set_mask(pred); - else if(auto *x = dynamic_cast(expr)) - x->set_mask(pred); - else - expr = builder.create_select(pred, expr, ir::undef_value::get(expr->get_type())); + // get name if applicable + std::string name = ""; + ir::value *current = nullptr; if(assignment_expression *assignment = dynamic_cast(expr_)) - if(auto *named = dynamic_cast(assignment)){ - std::string name = named->lvalue()->id()->name(); - mod->set_value(name, 
expr); + if(const named_expression* named = dynamic_cast(assignment->lvalue())){ + name = named->id()->name(); + current = mod->get_value(name); } + // lower expression + ir::value *expr = expr_->codegen(mod); + // modify expression if predicated + if(pred_) { + ir::value *pred = pred_->codegen(mod); + if(!current) + current = ir::undef_value::get(expr->get_type()); + if(auto *x = dynamic_cast(expr)){ + x->erase_from_parent(); + expr = builder.create_masked_load(x->get_pointer_operand(), pred, current); + } + else if(auto *x = dynamic_cast(expr)){ + x->erase_from_parent(); + expr =builder.create_masked_store(x->get_pointer_operand(), x->get_value_operand(), pred); + } + else + expr = builder.create_select(pred, expr, current); + } + // update symbols table + if(!name.empty()) + mod->set_value(name, expr); return expr; } From 17cb2db356a2840488732145dbf72615d9bb25ee Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 27 Jul 2019 21:21:36 -0700 Subject: [PATCH 267/494] [dnn/blocksparse/dot] prototype version seems to pass basic test --- examples/cpp/dot.cpp | 9 +- examples/cpp/shift.cpp | 29 +-- examples/python/tensorflow/CMakeLists.txt | 2 +- examples/python/tensorflow/blocksparse.cpp | 206 ++++++++++++--------- examples/python/tensorflow/conv.cpp | 1 - examples/python/tensorflow/dot.cpp | 2 +- examples/python/tensorflow/shift.cpp | 26 +-- include/triton/dnn/blocksparse/dot.h | 42 +++++ include/triton/dnn/{gemm.h => dot.h} | 0 include/triton/dnn/heuristics.h | 142 +++++++------- include/triton/runtime/jit.h | 1 + lib/codegen/selection.cpp | 5 +- lib/codegen/tune.cpp | 6 +- lib/dnn/base.cpp | 4 +- lib/dnn/blocksparse/dot.cpp | 109 +++++++++++ lib/dnn/{gemm.cpp => dot.cpp} | 6 +- lib/dnn/shift.cpp | 4 +- lib/runtime/jit.cpp | 13 +- 18 files changed, 402 insertions(+), 205 deletions(-) create mode 100644 include/triton/dnn/blocksparse/dot.h rename include/triton/dnn/{gemm.h => dot.h} (100%) create mode 100644 lib/dnn/blocksparse/dot.cpp rename lib/dnn/{gemm.cpp => dot.cpp} (97%) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index efebc102e..771e44c1f 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -4,7 +4,7 @@ #include "triton/runtime/jit.h" #include "triton/driver/backend.h" #include "triton/driver/stream.h" -#include "triton/dnn/gemm.h" +#include "triton/dnn/dot.h" #include "triton/tools/bench.hpp" #include "cuda.h" @@ -48,7 +48,7 @@ perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int stream->synchronize(); triton::dnn::dot dot(M, N, K, AT, BT, ty, ty, 8, 8); // benchmark triton - double triton_ns = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::FULL_TUNING);}, stream); + double triton_ns = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::PARTIAL_TUNING);}, stream); // benchmark cublas NumericT alpha = 1; NumericT beta = 0; @@ -98,8 +98,9 @@ int main() { // shapes to benchmark std::vector configs = { // {false, false, 8192, 512, 512}, - {false, true, 8192, 8192, 8192} -// {false, true, 32768, 256, 512} +// {false, true, 8192, 8192, 8192} + {false, true, 32768, 256, 256}, + {false, true, 32768, 256, 512} // {true, false, 8192, 512, 512}, // {true, true, 8192, 512, 512} }; diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index 91ed2daaa..38e0e37bf 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -67,23 +67,23 @@ perf_t do_bench(triton::driver::stream *stream, stream->write(dc, true, 0, hc); stream->synchronize(); // benchmark triton - double 
triton_ns = triton::tools::bench([&]() { shift.enqueue(stream, {da, db, dc}, triton::dnn::FULL_TUNING);}, stream); + double triton_ns = triton::tools::bench([&]() { shift.enqueue(stream, {da, db, dc}, triton::dnn::NO_TUNING);}, stream); // benchmark cublas - NumericT alpha = 1; - NumericT beta = 0; - cublasGemmAlgo_t fastest; - cublasGemm(HALF_TYPE, stream, shift.AT(), shift.BT(), shift.M(), shift.N(), shift.K(), - &alpha, da, shift.lda(), - db, shift.ldb(), &beta, - dc, shift.ldc(), &fastest); - double cublas_ns = triton::tools::bench([&]() { cublasGemm(HALF_TYPE, stream, shift.AT(), shift.BT(), shift.M(), shift.N(), shift.K(), - &alpha, da, shift.lda(), - db, shift.ldb(), - &beta, dc, shift.ldc(), nullptr, fastest); }, stream); +// NumericT alpha = 1; +// NumericT beta = 0; +// cublasGemmAlgo_t fastest; +// cublasGemm(HALF_TYPE, stream, shift.AT(), shift.BT(), shift.M(), shift.N(), shift.K(), +// &alpha, da, shift.lda(), +// db, shift.ldb(), &beta, +// dc, shift.ldc(), &fastest); +// double cublas_ns = triton::tools::bench([&]() { cublasGemm(HALF_TYPE, stream, shift.AT(), shift.BT(), shift.M(), shift.N(), shift.K(), +// &alpha, da, shift.lda(), +// db, shift.ldb(), +// &beta, dc, shift.ldc(), nullptr, fastest); }, stream); // result auto tflops = [&](double nanosec) { return shift.num_flops() / nanosec * 1e-3; }; perf_t result; - result.cublas = tflops(cublas_ns); +// result.cublas = tflops(cublas_ns); result.triton = tflops(triton_ns); delete da; delete db; @@ -133,8 +133,9 @@ int main() { {128, 1024, 8, 8, 3, 3, 1024, 1, 1} }; for(config_t c: resnet18){ - for(op_t op: {op_t::FPROP, op_t::BPROP, op_t::WGRAD}) + for(op_t op: {op_t::FPROP, op_t::BPROP, op_t::WGRAD}){ configs.push_back({c.B, c.C, c.H, c.W, c.R, c.S, c.F, c.stride_h, c.stride_w, op, layout_t::CHWN, "fp16"}); + } } // initialize default compute device diff --git a/examples/python/tensorflow/CMakeLists.txt b/examples/python/tensorflow/CMakeLists.txt index 5c151f19b..0dad37f19 100644 --- a/examples/python/tensorflow/CMakeLists.txt +++ b/examples/python/tensorflow/CMakeLists.txt @@ -5,7 +5,7 @@ if(${TensorFlow_FOUND}) include_directories("${CUDA_HOME}/include") link_directories(${TF_LIB}) add_definitions(-D_GLIBCXX_USE_CXX11_ABI=${TF_ABI}) - add_library(tf_blocksparse SHARED dot.cpp conv.cpp shift.cpp batchnorm.cpp) + add_library(tf_blocksparse SHARED blocksparse.cpp dot.cpp conv.cpp shift.cpp batchnorm.cpp) target_link_libraries(tf_blocksparse tensorflow_framework triton) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/run.py ${CMAKE_CURRENT_BINARY_DIR}/run.py diff --git a/examples/python/tensorflow/blocksparse.cpp b/examples/python/tensorflow/blocksparse.cpp index 85e73d033..b86c6bcab 100644 --- a/examples/python/tensorflow/blocksparse.cpp +++ b/examples/python/tensorflow/blocksparse.cpp @@ -3,7 +3,8 @@ #include "triton/driver/buffer.h" #include "triton/driver/backend.h" #include "triton/driver/stream.h" -#include "triton/jit.h" +#include "triton/runtime/jit.h" +#include "triton/dnn/blocksparse/dot.h" #define EIGEN_USE_GPU #include "tensorflow/core/framework/op.h" @@ -20,106 +21,88 @@ using shape_inference::InferenceContext; using shape_inference::ShapeHandle; using GPUDevice = Eigen::GpuDevice; - -const char* src = -R"( -const tunable int32 TM = {16, 32, 64, 128}; -const tunable int32 TN = {16, 32, 64, 128}; -const tunable int32 TK = {8}; -const tunable int32 GZ = {1}; - -void bsmm (restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C, - int32 M, int32 N, int32 K, - int32 lda, int32 ldb, int32 ldc, - int32 *locks, 
int32 grid0, int32 grid1) { - -} -)"; - Status XpropShape(InferenceContext* ctx) { - int K; TF_RETURN_IF_ERROR(ctx->GetAttr( "K", &K)); - int axis; TF_RETURN_IF_ERROR(ctx->GetAttr("axis", &axis)); + int K; TF_RETURN_IF_ERROR(ctx->GetAttr( "K", &K)); + int axis; TF_RETURN_IF_ERROR(ctx->GetAttr("axis", &axis)); - // C ==> K - ShapeHandle x = ctx->input(0); - int rank = ctx->Rank(x); - //printf("XpropShape: %d\n", rank); - if (rank > 0) - { - std::vector shape; - shape.reserve(rank); - for (int i = 0; i < rank; i++) - shape.push_back(i == axis ? ctx->MakeDim(K) : ctx->Dim(x, i)); - - ctx->set_output(0, ctx->MakeShape(shape)); - } - else - ctx->set_output(0, ctx->UnknownShape()); - ctx->set_output(1, ctx->UnknownShape()); - return Status::OK(); + // C ==> K + ShapeHandle x = ctx->input(0); + int rank = ctx->Rank(x); + //printf("XpropShape: %d\n", rank); + if (rank > 0) + { + std::vector shape; + shape.reserve(rank); + for (int i = 0; i < rank; i++) + shape.push_back(i == axis ? ctx->MakeDim(K) : ctx->Dim(x, i)); + ctx->set_output(0, ctx->MakeShape(shape)); + } + else + ctx->set_output(0, ctx->UnknownShape()); + ctx->set_output(1, ctx->UnknownShape()); + return Status::OK(); } -REGISTER_OP("BlocksparseMatmul") - .Input("x: T") - .Input("w: T") - .Input("lut: int64") - .Input("lut_dx: int64") - .Input("lut_dw: int64") - .Input("gate: ngate * float") - .Output("y: T") - .Output("temp: int32") - .Attr("T: {half, float, bfloat16}") - .Attr("blocks: int >=0") - .Attr("bsize: int") - .Attr("segments: int = 0") - .Attr("segments_dx: int = 0") - .Attr("locks: int = 0") - .Attr("locks_dx: int = 0") - .Attr("axis: int = 1") - .Attr("C: int >=0") - .Attr("K: int >=0") - .Attr("shared: int = 0") - .Attr("shared_dx: int = 0") - .Attr("alpha: float = 1.0") - .Attr("beta: float = 0.0") - .Attr("gated_dw: bool = false") - .Attr("gate_grad: bool = false") - .Attr("bench: int = 0") - .Attr("ngate: int >= 0") - .SetShapeFn(XpropShape) - .Doc(R"doc( -Multiply the matrix "a" by the blocksparse matrix "b". -)doc"); +REGISTER_OP("TritonBlocksparseMatmul") +.Input("x: T") +.Input("w: T") +.Input("lut: int64") +.Input("lut_dx: int64") +.Input("lut_dw: int64") +.Input("gate: ngate * float") +.Output("y: T") +.Output("temp: int32") +.Attr("T: {half, float, bfloat16}") +.Attr("blocks: int >=0") +.Attr("bsize: int") +.Attr("segments: int = 0") +.Attr("segments_dx: int = 0") +.Attr("locks: int = 0") +.Attr("locks_dx: int = 0") +.Attr("axis: int = 1") +.Attr("C: int >=0") +.Attr("K: int >=0") +.Attr("shared: int = 0") +.Attr("shared_dx: int = 0") +.Attr("alpha: float = 1.0") +.Attr("beta: float = 0.0") +.Attr("gated_dw: bool = false") +.Attr("gate_grad: bool = false") +.Attr("bench: int = 0") +.Attr("ngate: int >= 0") +.SetShapeFn(XpropShape) +.Doc(R"doc( + Multiply the matrix "a" by the blocksparse matrix "b". 
+ )doc"); typedef struct bsmm_params { - const int* Lut; - const float* Gate; - int* Lock; - //float4* Scratch; - int blocks; - int bsize; - int segments; - int locks; - int C; - int K; - int N; - int shared; - int pcount; - uint blk_a; - uint blk_A; - uint blk_b; - uint blk_B; - float alpha; - float beta; - CUstream stream; + const int* Lut; + const float* Gate; + int* Lock; + int blocks; + int bsize; + int segments; + int locks; + int C; + int K; + int N; + int shared; + int pcount; + uint blk_a; + uint blk_A; + uint blk_b; + uint blk_B; + float alpha; + float beta; + CUstream stream; } bsmm_params; class BlocksparseMatmulOp : public OpKernel { - public: +public: explicit BlocksparseMatmulOp(OpKernelConstruction* ctx) : OpKernel(ctx) { OP_REQUIRES_OK(ctx, ctx->GetAttr("segments", ¶ms_.segments)); OP_REQUIRES_OK(ctx, ctx->GetAttr("locks", ¶ms_.locks )); @@ -147,6 +130,51 @@ class BlocksparseMatmulOp : public OpKernel { } void Compute(OpKernelContext* context){ + // get device/stream + GPUDevice device = context->eigen_device(); + triton::driver::cu_stream sstream(device.stream(), false); + triton::driver::context* ctx = sstream.context(); + triton::driver::stream* stream = &sstream; + // get inputs + const Tensor& a = context->input(0); + const Tensor& b = context->input(1); + const Tensor& lut = context->input(2); + // allocate c + TensorShape shape_c; + int N = 1; + int rank_a = a.dims(); + for (int i = 0; i < rank_a; i++) + if (i != axis_) { + shape_c.AddDim(a.dim_size(i)); + N *= a.dim_size(i); + } + else + shape_c.AddDim(params_.K); + Tensor* c = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, shape_c, &c)); + // grid and block + int blkN = 128, gridN = (N + 127)/128, modN128 = N & 127; + if (axis_ == 1 || (modN128 > 0 && modN128 <= 64) || gridN * params_.segments < SMs_*4){ + blkN = 64; + gridN = (N + 63)/64; + } + // allocate locks + Tensor* locks; + TensorShape shape_l; + if (params_.locks > 0) + shape_l.AddDim(gridN * params_.locks * 2); + OP_REQUIRES_OK(context, context->allocate_output(1, shape_l, &locks)); + // initialize default compute device + triton::runtime::jit jit(ctx); + // matrix multiplication parameters + triton::driver::cu_buffer da(ctx, (CUdeviceptr)a.flat().data(), false); + triton::driver::cu_buffer db(ctx, (CUdeviceptr)b.flat().data(), false); + triton::driver::cu_buffer dc(ctx, (CUdeviceptr)c->flat().data(), false); +// triton::driver::cu_buffer dlocks(ctx, (CUdeviceptr)locks->flat().data(), false); + triton::driver::cu_buffer dlut(ctx, (CUdeviceptr)lut.flat().data(), false); + // blocksparse matmul + triton::dnn::blocksparse::dot dot(N, params_.K, params_.C); + dot.enqueue(stream, {&da, &db, &dc, &dlut}, triton::dnn::NO_TUNING); } private: @@ -157,4 +185,4 @@ private: char bench_string_[256]; }; -REGISTER_KERNEL_BUILDER(Name("BlocksparseMatmul").Device(DEVICE_GPU).TypeConstraint("T"), BlocksparseMatmulOp); +REGISTER_KERNEL_BUILDER(Name("TritonBlocksparseMatmul").Device(DEVICE_GPU).TypeConstraint("T"), BlocksparseMatmulOp); diff --git a/examples/python/tensorflow/conv.cpp b/examples/python/tensorflow/conv.cpp index ebd60ac6d..4ed457021 100644 --- a/examples/python/tensorflow/conv.cpp +++ b/examples/python/tensorflow/conv.cpp @@ -5,7 +5,6 @@ #include "triton/driver/stream.h" #include "triton/runtime/jit.h" #include "triton/tools/bench.hpp" -#include "triton/dnn/gemm.h" #include "triton/dnn/conv.h" #define EIGEN_USE_GPU diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index 84f67664c..368ef8be3 100644 --- 
a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -5,7 +5,7 @@ #include "triton/driver/stream.h" #include "triton/runtime/jit.h" #include "triton/tools/bench.hpp" -#include "triton/dnn/gemm.h" +#include "triton/dnn/dot.h" #define EIGEN_USE_GPU #include "tensorflow/core/framework/op.h" diff --git a/examples/python/tensorflow/shift.cpp b/examples/python/tensorflow/shift.cpp index 1a9ebbe59..2a3973fbb 100644 --- a/examples/python/tensorflow/shift.cpp +++ b/examples/python/tensorflow/shift.cpp @@ -19,10 +19,10 @@ using namespace tensorflow; using GPUDevice = Eigen::GpuDevice; -template +template class ShiftConvOp : public OpKernel { public: - explicit ShiftConvOp(OpKernelConstruction* context) : OpKernel(context), layout_(triton::dnn::shift::NCHW) { + explicit ShiftConvOp(OpKernelConstruction* context) : OpKernel(context), layout_(triton::dnn::NCHW) { context->GetAttr("shift_h", &h_shift_h_); context->GetAttr("shift_w", &h_shift_w_); context->GetAttr("stride_h", &stride_h_); @@ -32,13 +32,13 @@ public: } void ExtractShapes(const Tensor &x, int64_t &C, int64_t &H, int64_t &W, int64_t &B) { - if(layout_ == triton::dnn::shift::CHWN){ + if(layout_ == triton::dnn::CHWN){ C = x.dim_size(0); H = x.dim_size(1); W = x.dim_size(2); B = x.dim_size(3); } - else if(layout_ == triton::dnn::shift::NCHW){ + else if(layout_ == triton::dnn::NCHW){ B = x.dim_size(0); C = x.dim_size(1); H = x.dim_size(2); @@ -52,7 +52,7 @@ public: void FillShapes(OpKernelContext* context, int64_t &C, int64_t &H, int64_t &W, int64_t &B, int64_t &F, const Tensor& tf_a, const Tensor& tf_b) { - if(OP == triton::dnn::shift::WGRAD) { + if(OP == triton::dnn::WGRAD) { int64_t Ha, Wa, Ba; int64_t Hb, Wb, Bb; ExtractShapes(tf_a, F, Ha, Wa, Ba); @@ -68,19 +68,19 @@ public: // shapes for a int64_t Ca; ExtractShapes(tf_a, Ca, H, W, B); - if(OP == triton::dnn::shift::BPROP){ + if(OP == triton::dnn::BPROP){ H *= stride_h_; W *= stride_w_; } // shapes for b int64_t Cb = tf_b.dim_size(0); F = tf_b.dim_size(1); - if(OP == triton::dnn::shift::BPROP) + if(OP == triton::dnn::BPROP) std::swap(Cb, F); // checks OP_REQUIRES(context, Ca == Cb, tensorflow::errors::InvalidArgument("operands must have the same number of channels")); C = Ca; - if(OP == triton::dnn::shift::BPROP) + if(OP == triton::dnn::BPROP) std::swap(C, F); } } @@ -122,7 +122,7 @@ public: triton::driver::cu_buffer da(ctx, (CUdeviceptr)tf_a.flat().data(), false); triton::driver::cu_buffer db(ctx, (CUdeviceptr)tf_b.flat().data(), false); triton::driver::cu_buffer dc(ctx, (CUdeviceptr)tf_c->flat().data(), false); - shift.enqueue(stream, {&da, &db, &dc}, false); + shift.enqueue(stream, {&da, &db, &dc}, triton::dnn::PARTIAL_TUNING); } private: @@ -132,10 +132,10 @@ private: int stride_w_; int R_; int S_; - triton::dnn::shift::layout_t layout_; + triton::dnn::layout_t layout_; }; -REGISTER_KERNEL_BUILDER(Name("ShiftConv").Device(DEVICE_GPU), ShiftConvOp); +REGISTER_KERNEL_BUILDER(Name("ShiftConv").Device(DEVICE_GPU), ShiftConvOp); REGISTER_OP("ShiftConv") .Input("a: float16") .Input("b: float16") @@ -145,7 +145,7 @@ REGISTER_OP("ShiftConv") .Attr("stride_w: int") .Output("c: float16"); -REGISTER_KERNEL_BUILDER(Name("ShiftConvDx").Device(DEVICE_GPU), ShiftConvOp); +REGISTER_KERNEL_BUILDER(Name("ShiftConvDx").Device(DEVICE_GPU), ShiftConvOp); REGISTER_OP("ShiftConvDx") .Input("a: float16") .Input("b: float16") @@ -155,7 +155,7 @@ REGISTER_OP("ShiftConvDx") .Attr("stride_w: int") .Output("c: float16"); -REGISTER_KERNEL_BUILDER(Name("ShiftConvDw").Device(DEVICE_GPU), 
ShiftConvOp); +REGISTER_KERNEL_BUILDER(Name("ShiftConvDw").Device(DEVICE_GPU), ShiftConvOp); REGISTER_OP("ShiftConvDw") .Input("a: float16") .Input("b: float16") diff --git a/include/triton/dnn/blocksparse/dot.h b/include/triton/dnn/blocksparse/dot.h new file mode 100644 index 000000000..fbd388937 --- /dev/null +++ b/include/triton/dnn/blocksparse/dot.h @@ -0,0 +1,42 @@ +#include "triton/driver/stream.h" +#include "triton/driver/kernel.h" +#include "triton/dnn/base.h" +#include + +namespace triton{ +namespace dnn{ +namespace blocksparse{ + + +class dot: public base { +private: + void enqueue_impl(driver::stream *stream, driver::kernel *kernel, + std::vector args, + triton::runtime::launch_information info); + // number of flops + virtual size_t num_flops() const; + // comparison for maps + virtual bool operator<(const base& other) const; + // default parameters + virtual std::vector search_space() const; + virtual params_t heuristics() const; + +public: + // constructor + dot(int32_t M, int32_t N, int32_t K); + // triton-c source + virtual void triton_c_src(std::ostream &os) const; + // clone + virtual base* clone() const; + +private: + std::string ab_ty_; + std::string c_ty_; + int32_t M_; + int32_t N_; + int32_t K_; +}; + +} +} +} diff --git a/include/triton/dnn/gemm.h b/include/triton/dnn/dot.h similarity index 100% rename from include/triton/dnn/gemm.h rename to include/triton/dnn/dot.h diff --git a/include/triton/dnn/heuristics.h b/include/triton/dnn/heuristics.h index e2efe6df2..ab8af7d32 100644 --- a/include/triton/dnn/heuristics.h +++ b/include/triton/dnn/heuristics.h @@ -13,79 +13,95 @@ typedef std::tuple size_key_t; static const std::map> params = { /* NN */ {trans_key_t(false, false), std::map{ - {size_key_t(16, 16), {4, 4, 16, 8, 16, 2, 2, 1, 1, 8, 32, 4, 8, 1}}, - {size_key_t(16, 32), {2, 8, 16, 8, 32, 2, 2, 1, 1, 16, 32, 4, 8, 1}}, + {size_key_t(16, 16), {2, 8, 16, 4, 16, 2, 2, 1, 1, 16, 32, 8, 4, 1}}, + {size_key_t(16, 32), {4, 4, 16, 4, 32, 2, 2, 1, 1, 8, 32, 8, 4, 1}}, {size_key_t(16, 64), {4, 4, 16, 4, 64, 2, 2, 1, 1, 8, 32, 8, 4, 1}}, - {size_key_t(16, 128), {4, 4, 16, 16, 128, 2, 2, 1, 2, 16, 32, 4, 8, 1}}, - {size_key_t(32, 16), {4, 8, 32, 8, 16, 2, 2, 1, 1, 8, 32, 4, 8, 1}}, - {size_key_t(32, 32), {4, 8, 32, 8, 32, 2, 2, 1, 1, 8, 32, 4, 8, 1}}, - {size_key_t(32, 64), {8, 4, 32, 8, 64, 2, 2, 1, 1, 4, 32, 4, 8, 1}}, - {size_key_t(32, 128), {8, 4, 32, 16, 128, 2, 2, 1, 4, 16, 32, 8, 4, 1}}, - {size_key_t(64, 16), {8, 8, 64, 4, 16, 2, 2, 1, 1, 4, 32, 8, 4, 1}}, + {size_key_t(16, 128), {2, 8, 16, 8, 128, 2, 2, 1, 1, 16, 32, 4, 8, 1}}, + {size_key_t(32, 16), {8, 4, 32, 8, 16, 2, 2, 1, 1, 4, 32, 4, 8, 1}}, + {size_key_t(32, 32), {4, 8, 32, 4, 32, 2, 2, 1, 1, 8, 32, 8, 4, 1}}, + {size_key_t(32, 64), {8, 4, 32, 4, 64, 2, 2, 1, 1, 4, 32, 8, 4, 1}}, + {size_key_t(32, 128), {8, 4, 32, 32, 128, 2, 2, 2, 2, 16, 32, 4, 4, 1}}, + {size_key_t(32, 256), {4, 8, 32, 32, 256, 2, 2, 1, 4, 32, 32, 4, 8, 1}}, + {size_key_t(64, 16), {8, 8, 64, 8, 16, 2, 2, 1, 1, 4, 32, 4, 8, 1}}, {size_key_t(64, 32), {8, 8, 64, 8, 32, 2, 2, 1, 1, 4, 32, 4, 8, 1}}, - {size_key_t(64, 64), {8, 8, 64, 16, 64, 2, 2, 2, 1, 8, 32, 4, 8, 1}}, - {size_key_t(64, 128), {16, 4, 64, 16, 128, 2, 2, 2, 2, 8, 32, 8, 4, 1}}, - {size_key_t(128, 16), {8, 8, 128, 8, 16, 2, 2, 2, 1, 8, 32, 8, 4, 1}}, - {size_key_t(128, 32), {8, 8, 128, 16, 32, 2, 2, 2, 1, 8, 32, 4, 8, 1}}, - {size_key_t(128, 64), {8, 8, 128, 32, 64, 2, 2, 2, 2, 16, 32, 4, 8, 1}}, - {size_key_t(128, 128), {8, 8, 128, 32, 128, 2, 2, 1, 4, 16, 32, 4, 8, 1}} + 
{size_key_t(64, 64), {8, 8, 64, 16, 64, 2, 2, 1, 2, 8, 32, 4, 8, 1}}, + {size_key_t(64, 128), {16, 4, 64, 32, 128, 2, 2, 1, 4, 8, 32, 4, 8, 1}}, + {size_key_t(128, 16), {8, 8, 128, 16, 16, 2, 2, 2, 1, 8, 32, 4, 8, 1}}, + {size_key_t(128, 32), {32, 4, 128, 16, 32, 2, 2, 2, 1, 2, 32, 4, 8, 1}}, + {size_key_t(128, 64), {16, 8, 128, 16, 64, 2, 2, 2, 2, 8, 32, 8, 4, 1}}, + {size_key_t(128, 128), {8, 8, 128, 32, 128, 2, 2, 2, 2, 16, 32, 4, 8, 1}}, + {size_key_t(256, 16), {32, 8, 256, 16, 16, 2, 2, 4, 1, 4, 32, 8, 4, 1}}, + {size_key_t(256, 32), {32, 8, 256, 16, 32, 2, 2, 4, 1, 4, 32, 8, 4, 1}}, + {size_key_t(256, 64), {16, 8, 256, 32, 64, 2, 2, 4, 1, 8, 32, 4, 8, 1}} }}, /* NT */ {trans_key_t(false, true), std::map{ - {size_key_t(16, 16), {4, 4, 16, 2, 8, 16, 2, 2, 1, 1, 8, 32, 16, 1}}, - {size_key_t(16, 32), {4, 4, 16, 4, 8, 32, 2, 2, 1, 1, 8, 32, 8, 1}}, - {size_key_t(16, 64), {4, 4, 16, 8, 8, 64, 2, 2, 1, 4, 32, 32, 16, 1}}, - {size_key_t(16, 128), {4, 4, 16, 32, 4, 128, 2, 2, 1, 2, 16, 32, 2, 1}}, - {size_key_t(32, 16), {8, 4, 32, 2, 8, 16, 2, 2, 1, 1, 4, 32, 16, 1}}, - {size_key_t(32, 32), {4, 8, 32, 4, 8, 32, 2, 2, 1, 1, 8, 32, 8, 1}}, - {size_key_t(32, 64), {16, 8, 128, 4, 4, 64, 2, 2, 1, 4, 8, 32, 32, 1}}, - {size_key_t(32, 128), {4, 8, 32, 8, 8, 128, 2, 2, 1, 2, 16, 32, 8, 1}}, - {size_key_t(64, 16), {8, 8, 64, 2, 8, 16, 2, 2, 1, 1, 4, 32, 16, 1}}, - {size_key_t(64, 32), {8, 8, 64, 4, 8, 32, 2, 2, 1, 1, 4, 32, 8, 1}}, - {size_key_t(64, 64), {8, 8, 64, 8, 8, 64, 2, 2, 1, 2, 8, 32, 8, 1}}, - {size_key_t(64, 128), {8, 8, 64, 16, 8, 128, 2, 2, 1, 4, 16, 32, 8, 1}}, - {size_key_t(128, 16), {8, 8, 128, 2, 8, 16, 2, 2, 2, 1, 8, 32, 32, 1}}, - {size_key_t(128, 32), {16, 8, 128, 4, 8, 32, 2, 2, 2, 1, 4, 32, 16, 1}}, - {size_key_t(128, 64), {8, 8, 128, 8, 8, 64, 2, 2, 2, 2, 16, 32, 16, 1}}, - {size_key_t(128, 128), {8, 8, 128, 8, 8, 128, 2, 2, 4, 1, 16, 32, 16, 1}} - }}, + {size_key_t(16, 16), {2, 4, 16, 2, 8, 16, 2, 2, 1, 1, 16, 32, 16, 1}}, + {size_key_t(16, 32), {4, 4, 16, 8, 4, 32, 2, 2, 1, 1, 8, 32, 4, 1}}, + {size_key_t(16, 64), {2, 4, 16, 2, 8, 64, 2, 2, 1, 1, 16, 32, 16, 1}}, + {size_key_t(16, 128), {2, 8, 16, 8, 8, 128, 2, 2, 1, 1, 16, 32, 4, 1}}, + {size_key_t(32, 16), {8, 4, 32, 2, 8, 16, 2, 2, 1, 1, 4, 32, 16, 1}}, + {size_key_t(32, 32), {4, 8, 32, 8, 4, 32, 2, 2, 1, 1, 8, 32, 4, 1}}, + {size_key_t(32, 64), {16, 4, 64, 16, 4, 64, 2, 2, 4, 1, 8, 32, 8, 1}}, + {size_key_t(32, 128), {4, 8, 32, 16, 4, 128, 2, 2, 1, 2, 16, 32, 4, 1}}, + {size_key_t(32, 256), {4, 8, 32, 64, 4, 256, 2, 2, 1, 4, 32, 32, 2, 1}}, + {size_key_t(64, 16), {8, 8, 64, 2, 8, 16, 2, 2, 1, 1, 4, 32, 16, 1}}, + {size_key_t(64, 32), {16, 4, 64, 4, 4, 32, 2, 2, 1, 1, 2, 32, 8, 1}}, + {size_key_t(64, 64), {8, 8, 64, 8, 8, 64, 2, 2, 2, 1, 8, 32, 8, 1}}, + {size_key_t(64, 128), {4, 4, 64, 8, 8, 128, 2, 2, 1, 4, 32, 32, 16, 1}}, + {size_key_t(64, 256), {8, 8, 64, 8, 8, 256, 2, 2, 1, 4, 16, 32, 16, 1}}, + {size_key_t(128, 16), {16, 8, 128, 2, 8, 16, 2, 2, 1, 1, 2, 32, 16, 1}}, + {size_key_t(128, 32), {32, 4, 128, 4, 8, 32, 2, 2, 2, 1, 2, 32, 16, 1}}, + {size_key_t(128, 64), {8, 8, 128, 8, 8, 64, 2, 2, 4, 1, 16, 32, 16, 1}}, + {size_key_t(128, 128), {8, 8, 128, 16, 8, 128, 2, 2, 2, 2, 16, 32, 8, 1}}, + {size_key_t(256, 16), {32, 4, 256, 4, 4, 16, 2, 2, 4, 1, 4, 32, 32, 1}}, + {size_key_t(256, 32), {16, 8, 256, 8, 4, 32, 2, 2, 4, 1, 8, 32, 16, 1}}, + {size_key_t(256, 64), {8, 8, 256, 8, 8, 64, 2, 2, 4, 1, 16, 32, 16, 1}} + }}, /* TN */ {trans_key_t(true, false), std::map{ - {size_key_t(16, 16), {8, 16, 16, 16, 2, 2, 1, 1, 4, 8, 
32, 2, 8, 1}}, - {size_key_t(16, 32), {4, 16, 8, 32, 2, 2, 1, 1, 8, 4, 32, 4, 8, 1}}, - {size_key_t(16, 64), {4, 16, 4, 64, 2, 2, 1, 1, 8, 4, 32, 8, 4, 1}}, - {size_key_t(16, 128), {16, 16, 16, 128, 2, 2, 1, 2, 4, 8, 32, 4, 8, 1}}, - {size_key_t(32, 16), {4, 32, 8, 16, 2, 2, 1, 1, 8, 4, 32, 4, 8, 1}}, - {size_key_t(32, 32), {8, 32, 8, 32, 2, 2, 1, 1, 4, 8, 32, 4, 8, 1}}, - {size_key_t(32, 64), {8, 32, 8, 64, 2, 2, 1, 1, 4, 8, 32, 4, 8, 1}}, - {size_key_t(32, 128), {32, 32, 64, 128, 2, 2, 2, 2, 4, 8, 32, 2, 8, 1}}, - {size_key_t(64, 16), {8, 64, 8, 16, 2, 2, 1, 1, 4, 8, 32, 4, 8, 1}}, - {size_key_t(64, 32), {8, 64, 8, 32, 2, 2, 1, 1, 4, 8, 32, 4, 8, 1}}, - {size_key_t(64, 64), {16, 64, 16, 64, 2, 2, 2, 1, 4, 8, 32, 4, 8, 1}}, - {size_key_t(64, 128), {32, 64, 16, 128, 2, 2, 2, 2, 4, 8, 32, 8, 4, 1}}, - {size_key_t(128, 16), {16, 128, 16, 16, 2, 2, 2, 1, 4, 8, 32, 4, 8, 1}}, - {size_key_t(128, 32), {32, 128, 32, 32, 2, 2, 4, 1, 4, 8, 32, 4, 8, 1}}, - {size_key_t(128, 64), {32, 128, 32, 64, 2, 2, 4, 1, 4, 8, 32, 4, 8, 1}}, - {size_key_t(128, 128), {32, 128, 32, 128, 2, 2, 4, 1, 4, 8, 32, 4, 8, 1}}, + {size_key_t(16, 16), {4, 16, 4, 16, 2, 2, 1, 1, 8, 4, 32, 8, 4, 1}}, + {size_key_t(16, 32), {8, 16, 8, 32, 2, 2, 1, 1, 4, 4, 32, 4, 4, 1}}, + {size_key_t(16, 64), {4, 16, 8, 64, 2, 2, 1, 1, 8, 4, 32, 4, 8, 1}}, + {size_key_t(16, 128), {4, 16, 8, 128, 2, 2, 1, 1, 8, 4, 32, 4, 8, 1}}, + {size_key_t(32, 16), {4, 32, 8, 16, 2, 2, 1, 1, 8, 4, 32, 4, 8, 1}}, + {size_key_t(32, 32), {4, 32, 4, 32, 2, 2, 1, 1, 8, 4, 32, 8, 4, 1}}, + {size_key_t(32, 64), {4, 32, 4, 64, 2, 2, 1, 1, 8, 4, 32, 8, 4, 1}}, + {size_key_t(32, 128), {8, 32, 8, 128, 2, 2, 1, 1, 4, 8, 32, 4, 8, 1}}, + {size_key_t(32, 256), {32, 32, 32, 256, 2, 2, 1, 4, 4, 8, 32, 4, 8, 1}}, + {size_key_t(64, 16), {4, 64, 8, 16, 2, 2, 1, 1, 8, 4, 32, 4, 8, 1}}, + {size_key_t(64, 32), {4, 64, 4, 32, 2, 2, 1, 1, 8, 4, 32, 8, 4, 1}}, + {size_key_t(64, 64), {8, 64, 16, 64, 2, 2, 2, 1, 8, 4, 32, 4, 8, 1}}, + {size_key_t(64, 128), {16, 64, 32, 128, 2, 2, 1, 4, 8, 4, 32, 4, 8, 1}}, + {size_key_t(128, 16), {8, 128, 8, 16, 2, 2, 1, 1, 4, 8, 32, 4, 8, 1}}, + {size_key_t(128, 32), {16, 128, 16, 32, 2, 2, 4, 1, 8, 4, 32, 8, 4, 1}}, + {size_key_t(128, 64), {32, 128, 32, 64, 2, 2, 2, 2, 4, 8, 32, 4, 8, 1}}, + {size_key_t(128, 128), {32, 128, 32, 128, 2, 2, 1, 4, 4, 8, 32, 4, 8, 1}}, + {size_key_t(256, 16), {16, 256, 16, 16, 2, 2, 2, 1, 4, 8, 32, 4, 8, 1}}, + {size_key_t(256, 32), {16, 256, 32, 32, 2, 2, 4, 1, 8, 4, 32, 4, 8, 1}}, }}, /* TT */ {trans_key_t(true, true), std::map{ - {size_key_t(16, 16), {4, 16, 2, 8, 16, 2, 2, 1, 1, 8, 4, 32, 16, 1}}, - {size_key_t(16, 32), {8, 16, 4, 8, 32, 2, 2, 1, 1, 4, 8, 32, 8, 1}}, - {size_key_t(16, 64), {16, 16, 4, 8, 64, 2, 2, 1, 4, 8, 4, 32, 32, 1}}, - {size_key_t(16, 128), {16, 16, 8, 4, 128, 2, 2, 1, 2, 4, 8, 32, 8, 1}}, - {size_key_t(32, 16), {4, 32, 2, 8, 16, 2, 2, 1, 1, 8, 4, 32, 16, 1}}, - {size_key_t(32, 32), {8, 32, 4, 8, 32, 2, 2, 1, 1, 4, 8, 32, 8, 1}}, - {size_key_t(32, 64), {16, 64, 4, 8, 64, 2, 2, 2, 1, 4, 8, 32, 16, 1}}, - {size_key_t(32, 128), {32, 32, 8, 8, 128, 2, 2, 1, 4, 4, 8, 32, 16, 1}}, - {size_key_t(64, 16), {8, 64, 2, 8, 16, 2, 2, 1, 1, 4, 8, 32, 16, 1}}, - {size_key_t(64, 32), {8, 64, 4, 8, 32, 2, 2, 1, 1, 4, 8, 32, 8, 1}}, - {size_key_t(64, 64), {16, 64, 8, 8, 64, 2, 2, 2, 1, 4, 8, 32, 8, 1}}, - {size_key_t(64, 128), {32, 64, 8, 8, 128, 2, 2, 1, 4, 4, 8, 32, 16, 1}}, - {size_key_t(128, 16), {16, 128, 2, 8, 16, 2, 2, 2, 1, 4, 8, 32, 32, 1}}, - {size_key_t(128, 32), {32, 128, 8, 4, 32, 2, 2, 4, 1, 4, 8, 
32, 16, 1}}, - {size_key_t(128, 64), {32, 128, 16, 4, 64, 2, 2, 4, 1, 4, 8, 32, 8, 1}}, - {size_key_t(128, 128), {32, 128, 8, 8, 128, 2, 2, 4, 1, 4, 8, 32, 16, 1}} + {size_key_t(16, 16), {8, 16, 4, 4, 16, 2, 2, 1, 1, 4, 8, 32, 8, 1}}, + {size_key_t(16, 32), {8, 16, 8, 4, 32, 2, 2, 1, 1, 4, 8, 32, 4, 1}}, + {size_key_t(16, 64), {16, 16, 4, 8, 64, 2, 2, 1, 4, 8, 4, 32, 32, 1}}, + {size_key_t(16, 128), {16, 16, 8, 8, 128, 2, 2, 1, 1, 2, 4, 32, 4, 1}}, + {size_key_t(32, 16), {4, 32, 4, 4, 16, 2, 2, 1, 1, 8, 4, 32, 8, 1}}, + {size_key_t(32, 32), {8, 32, 8, 4, 32, 2, 2, 1, 1, 4, 8, 32, 4, 1}}, + {size_key_t(32, 64), {64, 128, 8, 4, 64, 2, 2, 4, 1, 2, 8, 32, 16, 1}}, + {size_key_t(32, 128), {16, 32, 32, 4, 128, 2, 2, 1, 2, 4, 8, 32, 2, 1}}, + {size_key_t(32, 256), {32, 32, 32, 4, 256, 2, 2, 1, 4, 4, 8, 32, 4, 1}}, + {size_key_t(64, 16), {4, 64, 2, 8, 16, 2, 2, 1, 1, 8, 4, 32, 16, 1}}, + {size_key_t(64, 32), {4, 64, 8, 4, 32, 2, 2, 1, 1, 8, 4, 32, 4, 1}}, + {size_key_t(64, 64), {16, 64, 8, 8, 64, 2, 2, 2, 1, 4, 8, 32, 8, 1}}, + {size_key_t(64, 128), {32, 64, 8, 8, 128, 2, 2, 1, 4, 4, 4, 32, 16, 1}}, + {size_key_t(64, 256), {64, 64, 8, 8, 256, 2, 2, 1, 4, 2, 8, 32, 16}}, + {size_key_t(128, 16), {8, 128, 2, 8, 16, 2, 2, 1, 1, 4, 8, 32, 16, 1}}, + {size_key_t(128, 32), {16, 128, 8, 4, 32, 2, 2, 4, 1, 8, 4, 32, 16, 1}}, + {size_key_t(128, 64), {32, 128, 8, 8, 64, 2, 2, 4, 1, 4, 8, 32, 16, 1}}, + {size_key_t(128, 128), {32, 128, 16, 8, 128, 2, 2, 2, 2, 4, 8, 32, 8, 1}}, + {size_key_t(256, 16), {32, 256, 4, 4, 16, 2, 2, 4, 1, 4, 8, 32, 32, 1}}, + {size_key_t(256, 32), {32, 256, 8, 4, 32, 2, 2, 4, 1, 4, 8, 32, 16, 1}} }} }; diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index 8f0f1ef73..de84d1788 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -72,6 +72,7 @@ public: void target_independent(ir::module &module) { optimize_dot.run(module); optimize_trans.run(module); +// optimize_dce.run(module); } void target_dependent(ir::module &module) { diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index b4e40a3f2..5ab9c55f8 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -1190,9 +1190,8 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & Value *ptr = pointers->get_value(idx); ConstantInt *cst = nullptr; if(GetElementPtrInst *gep = dyn_cast(ptr)) - if(gep->getNumIndices() == 1){ + if(gep->getNumIndices() == 1) cst = dyn_cast(gep->idx_begin()); - } ptr = builder.CreateBitCast(ptr, PointerType::get(VectorType::get(result->get_ty(), vector_size), ptr->getType()->getPointerAddressSpace())); packets[id] = builder.CreateLoad(ptr); @@ -1202,7 +1201,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & result->for_each([&](indices_t idx){ unsigned linear = result->get_linear_index(idx); unsigned id = linear / vector_size; -// result->set_value(idx, builder.CreateExtractElement(packets.at(id), linear % vector_size)); + result->set_value(idx, builder.CreateExtractElement(packets.at(id), linear % vector_size)); }); } // element-wise diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 1da6240dd..820db29b3 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -247,14 +247,14 @@ void tune::run(ir::module &mod) { size_t addr_space = ptr_ty->get_pointer_address_space(); if(addr_space < 4){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 8, 8)); + std::unique_ptr 
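// NOTE (usage sketch, assuming the typedefs at the top of heuristics.h): the
// heuristics tables above are keyed first by the (a-transposed, b-transposed)
// pair and then by the (TM, TN) tile shape, so an NT kernel tiled 64x64 would
// fetch its launch parameters roughly as:
//   params_t p = params.at(trans_key_t(false, true)).at(size_key_t(64, 64));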
tmp(ir::metaparameter::create(ctx, ty, 4, 8)); *params_.at(i).at("nts.d0") = *tmp; } } if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 8, 8)); - std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 8, 8)); + std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 4, 8)); + std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 4, 8)); *params_.at(i).at("nts.d0") = *tmp1; *params_.at(i).at("nts.d1") = *tmp2; } diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index d7ffc11d2..72e0d340e 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -59,8 +59,8 @@ void base::enqueue(driver::stream *stream, std::vector args, a jit->add_module(name_.c_str(), src.c_str(), best.params); } else { - params_t params = heuristics(); -// params_t params = jit->get_valid(name_.c_str(), src.c_str()); +// params_t params = heuristics(); + params_t params = jit->get_valid(name_.c_str(), src.c_str()); jit->add_module(name_.c_str(), src.c_str(), params); } triton::driver::kernel* kernel = jit->get_function(name_.c_str()); diff --git a/lib/dnn/blocksparse/dot.cpp b/lib/dnn/blocksparse/dot.cpp new file mode 100644 index 000000000..9ddb2514b --- /dev/null +++ b/lib/dnn/blocksparse/dot.cpp @@ -0,0 +1,109 @@ +#include "triton/dnn/blocksparse/dot.h" + +namespace triton{ +namespace dnn{ +namespace blocksparse{ + + +size_t dot::num_flops() const { + +} + +bool dot::operator <(const base& other) const { + auto *y = dynamic_cast(&other); + if(!y) + return true; + return std::tie(M_, N_, K_) + < std::tie(y->M_, y->N_, y->K_); +} + +std::vector dot::search_space() const { + +} + +params_t dot::heuristics() const { + +} + +base * dot::clone() const { + return new dot(*this); +} + +dot::dot(int32_t M, int32_t N, int32_t K): + base("bsdot"), M_(M), N_(N), K_(K) { + ab_ty_ = "fp32"; + c_ty_ = "fp32"; +} + +void dot::enqueue_impl(driver::stream *stream, driver::kernel *kernel, + std::vector args, runtime::launch_information info) { + driver::buffer *a = args[0]; + driver::buffer *b = args[1]; + driver::buffer *c = args[2]; + driver::buffer *lut = args[3]; + int32_t lda = M_; + int32_t ldc = M_; + kernel->setArg(0, a); + kernel->setArg(1, b); + kernel->setArg(2, c); + kernel->setArg(3, lda); + kernel->setArg(4, ldc); + kernel->setArg(5, lut); + int32_t TM = info.globals["TM"]; + int32_t TN = info.globals["TN"]; + size_t grid_0 = (M_ + TM - 1) / TM; + size_t grid_1 = (N_ + TN - 1) / TN; + stream->enqueue(kernel, {grid_0, grid_1, 1}, {info.num_threads, 1, 1}); + stream->synchronize(); +} + +void dot::triton_c_src(std::ostream &os) const { + std::string result = + + R"( + const tunable int32 TM = {64, 128}; + const tunable int32 TN = {32}; + const tunable int32 TK = {32}; + + void bsdot(restrict read_only align(16) )" + ab_ty_ + R"( *A, + restrict read_only align(16) )" + ab_ty_ + R"( *B, + fp32* C, + int32 lda, int32 ldc, + int32* lut_base){ + int32 ridx = get_range_id(0); + int32 ridy = get_range_id(1); + fp32 c[TM, TN] = 0; + int32 rka[TK] = 0 ... TK; + int32 rkb[TK] = 0 ... TK; + int32 rxa[TM] = ridx * TM + (0 ... TM); + int32 ryb[TN] = 0 ... 
TN;
+ int32 offa[TM, TK] = rxa[:, newaxis] + rka[newaxis, :]*lda;
+ int32 offb[TK, TN] = ryb[newaxis, :] + rkb[:, newaxis]*TK;
+ int32 *header = lut_base + ridy * 4;
+ int32 offset = *(header + 0);
+ int32 K = *(header + 1);
+ int32 h2 = *(header + 2);
+ int32 h3 = *(header + 3);
+ int32 *lut = lut_base + offset*2;
+ for(int32 k = K; k > 0; k = k - 1){
+ int32 ak = *(lut + 0);
+ int32 bk = *(lut + 1);
+ fp32* pa[TM, TK] = A + offa + ak * TK * lda;
+ fp32* pb[TK, TN] = B + offb + bk * TK * TN;
+ fp32 a[TM, TK] = *pa;
+ fp32 b[TK, TN] = *pb;
+ c = dot(a, b, c);
+ lut = lut + 2;
+ }
+ int32 rxc[TM] = ridx * TM + (0 ... TM);
+ int32 ryc[TN] = ridy * TN + (0 ... TN);
+ fp32* pc[TM, TN] = C + rxc[:, newaxis] + ryc[newaxis, :]*ldc;
+ *pc = c;
+ })";
+
+ os << result;
+}
+
+}
+}
+}
diff --git a/lib/dnn/gemm.cpp b/lib/dnn/dot.cpp
similarity index 97%
rename from lib/dnn/gemm.cpp
rename to lib/dnn/dot.cpp
index 897a26402..114ec7450 100644
--- a/lib/dnn/gemm.cpp
+++ b/lib/dnn/dot.cpp
@@ -1,6 +1,6 @@
 #include "triton/driver/stream.h"
 #include "triton/driver/kernel.h"
-#include "triton/dnn/gemm.h"
+#include "triton/dnn/dot.h"
 #include "triton/dnn/heuristics.h"
 #include
@@ -101,8 +101,8 @@ void dot::triton_c_src(std::ostream &os) const {
 std::string align_ldb_str = "multiple_of(" + std::to_string(align_ldb_) + ")";
 std::string res =
 R"(
-const tunable int32 TM = {16, 32, 64, 128};
-const tunable int32 TN = {16, 32, 64, 128};
+const tunable int32 TM = {16, 32, 64, 128, 256};
+const tunable int32 TN = {16, 32, 64, 128, 256};
 const tunable int32 TK = {32};
 const tunable int32 GZ = {1};
diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp
index fd45ea805..58c62dd46 100644
--- a/lib/dnn/shift.cpp
+++ b/lib/dnn/shift.cpp
@@ -467,7 +467,7 @@ if(op_ == WGRAD){
 pa = pa_base + offxa[:, newaxis];)";
 }
 result += R"(
- @checka a = *pa;)";
+ a = checka ? *pa : 0;)";
 /* Increment B pointers */
 if(op_ == WGRAD){
@@ -488,7 +488,7 @@ if(op_ == BPROP){
 pb = pb + TK;)";
 }
 result += R"(
- @checkb b = *pb;
+ b = checkb ? *pb : 0;
 }
 int32 rxc[TM] = ridx*TM + (0 ...
TN);)"; diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index f39f6c397..6fa727406 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -37,12 +37,13 @@ void parallel_loop_nest(std::vector const & ranges, size_t D = ranges.size(); std::vector values(D, 0); // thread pools - ThreadPool pool(nthreads); +// ThreadPool pool(nthreads); // Start with innermost loop size_t i = D - 1; while(true){ // Execute function - pool.enqueue(f,values); +// pool.enqueue(f,values); + f(values); while(values[i]++ == ranges[i] - 1){ if(i == 0) return; @@ -50,7 +51,7 @@ void parallel_loop_nest(std::vector const & ranges, } i = D - 1; // Short sleep so that the thread pool doesn't grow too big - std::this_thread::sleep_for(std::chrono::microseconds(1)); +// std::this_thread::sleep_for(std::chrono::microseconds(1)); } } @@ -211,9 +212,9 @@ jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t ben best.perf = perf; best.params = params; } - for(size_t i = 0; i < params.size(); i++) - std::cout << ((i==0)?"":", ") << params[i] << std::flush; - std::cout << ", " << perf << " [ " << best.perf << " ] " << std::endl; +// for(size_t i = 0; i < params.size(); i++) +// std::cout << ((i==0)?"":", ") << params[i] << std::flush; +// std::cout << ", " << perf << " [ " << best.perf << " ] " << std::endl; } }; From dc11f70fad73776f666395d973f6ba1271126b01 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 29 Jul 2019 17:06:20 -0700 Subject: [PATCH 268/494] [dnn/blocksparse] FPROP test passes! --- examples/python/tensorflow/blocksparse.cpp | 28 +++-- include/triton/codegen/target.h | 8 ++ include/triton/dnn/base.h | 20 +++- include/triton/dnn/batchnorm.h | 7 ++ include/triton/dnn/blocksparse/dot.h | 25 +++-- include/triton/dnn/conv.h | 1 + include/triton/dnn/dot.h | 2 + include/triton/ir/builder.h | 3 +- include/triton/ir/instructions.h | 22 ++++ include/triton/lang/expression.h | 20 ++++ include/triton/lang/parser.y | 4 +- include/triton/lang/scanner.l | 3 +- lib/codegen/optimize_dce.cpp | 3 +- lib/codegen/selection.cpp | 42 ++++++-- lib/codegen/target.cpp | 102 ++++++++++++------ lib/dnn/base.cpp | 26 +++-- lib/dnn/blocksparse/dot.cpp | 119 ++++++++++++++------- lib/ir/builder.cpp | 8 ++ lib/ir/instructions.cpp | 23 ++++ lib/lang/expression.cpp | 12 +++ 20 files changed, 360 insertions(+), 118 deletions(-) diff --git a/examples/python/tensorflow/blocksparse.cpp b/examples/python/tensorflow/blocksparse.cpp index b86c6bcab..d6b305fcf 100644 --- a/examples/python/tensorflow/blocksparse.cpp +++ b/examples/python/tensorflow/blocksparse.cpp @@ -101,6 +101,7 @@ typedef struct bsmm_params CUstream stream; } bsmm_params; +template class BlocksparseMatmulOp : public OpKernel { public: explicit BlocksparseMatmulOp(OpKernelConstruction* ctx) : OpKernel(ctx) { @@ -152,29 +153,23 @@ public: shape_c.AddDim(params_.K); Tensor* c = nullptr; OP_REQUIRES_OK(context, context->allocate_output(0, shape_c, &c)); - // grid and block - int blkN = 128, gridN = (N + 127)/128, modN128 = N & 127; - if (axis_ == 1 || (modN128 > 0 && modN128 <= 64) || gridN * params_.segments < SMs_*4){ - blkN = 64; - gridN = (N + 63)/64; - } // allocate locks + int gridN = (N + 63)/64; Tensor* locks; TensorShape shape_l; if (params_.locks > 0) shape_l.AddDim(gridN * params_.locks * 2); OP_REQUIRES_OK(context, context->allocate_output(1, shape_l, &locks)); - // initialize default compute device - triton::runtime::jit jit(ctx); - // matrix multiplication parameters - triton::driver::cu_buffer da(ctx, (CUdeviceptr)a.flat().data(), 
false); - triton::driver::cu_buffer db(ctx, (CUdeviceptr)b.flat().data(), false); - triton::driver::cu_buffer dc(ctx, (CUdeviceptr)c->flat().data(), false); -// triton::driver::cu_buffer dlocks(ctx, (CUdeviceptr)locks->flat().data(), false); + // wrap tensorflow handles + triton::driver::cu_buffer da(ctx, (CUdeviceptr)a.flat().data(), false); + triton::driver::cu_buffer db(ctx, (CUdeviceptr)b.flat().data(), false); + triton::driver::cu_buffer dc(ctx, (CUdeviceptr)c->flat().data(), false); triton::driver::cu_buffer dlut(ctx, (CUdeviceptr)lut.flat().data(), false); + triton::driver::cu_buffer dlocks(ctx, (CUdeviceptr)locks->flat().data(), false); + // create profile + triton::dnn::blocksparse::dot dot(N, params_.K, params_.segments, params_.C, "fp32", params_.bsize, params_.locks); // blocksparse matmul - triton::dnn::blocksparse::dot dot(N, params_.K, params_.C); - dot.enqueue(stream, {&da, &db, &dc, &dlut}, triton::dnn::NO_TUNING); + dot.enqueue(stream, {&da, &db, &dc, &dlut, &dlocks}, triton::dnn::NO_TUNING); } private: @@ -185,4 +180,5 @@ private: char bench_string_[256]; }; -REGISTER_KERNEL_BUILDER(Name("TritonBlocksparseMatmul").Device(DEVICE_GPU).TypeConstraint("T"), BlocksparseMatmulOp); +REGISTER_KERNEL_BUILDER(Name("TritonBlocksparseMatmul").Device(DEVICE_GPU).TypeConstraint("T"), BlocksparseMatmulOp); +REGISTER_KERNEL_BUILDER(Name("TritonBlocksparseMatmul").Device(DEVICE_GPU).TypeConstraint("T"), BlocksparseMatmulOp); diff --git a/include/triton/codegen/target.h b/include/triton/codegen/target.h index 118ee919f..c080d1c07 100644 --- a/include/triton/codegen/target.h +++ b/include/triton/codegen/target.h @@ -23,9 +23,11 @@ public: virtual ~target() {} virtual void set_kernel(llvm::IRBuilder<>& builder, llvm::LLVMContext &ctx, llvm::Module *module, llvm::Function* fn) = 0; virtual llvm::Instruction* add_barrier(llvm::Module *module, llvm::IRBuilder<>& builder) = 0; + virtual llvm::Instruction* add_memfence(llvm::Module *module, llvm::IRBuilder<>& builder) = 0; virtual llvm::Value* get_global_offset(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned stride, unsigned ax) = 0; virtual llvm::Value* get_local_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax) = 0; virtual llvm::Value* get_block_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax) = 0; + virtual llvm::Value* get_num_blocks(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax) = 0; bool is_gpu() const; private: @@ -37,9 +39,11 @@ public: amd_cl_target(): target(true){} void set_kernel(llvm::IRBuilder<>& builder, llvm::LLVMContext &ctx, llvm::Module *module, llvm::Function* fn); llvm::Instruction* add_barrier(llvm::Module *module, llvm::IRBuilder<>& builder); + llvm::Instruction* add_memfence(llvm::Module *module, llvm::IRBuilder<>& builder); llvm::Value* get_global_offset(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned stride, unsigned ax); llvm::Value* get_local_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax); llvm::Value* get_block_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax); + llvm::Value* get_num_blocks(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax); }; class nvidia_cu_target: public target { @@ -47,9 +51,11 @@ public: nvidia_cu_target(): target(true){} void set_kernel(llvm::IRBuilder<>& builder, llvm::LLVMContext &ctx, llvm::Module *module, llvm::Function* fn); llvm::Instruction* add_barrier(llvm::Module *module, llvm::IRBuilder<>& builder); + llvm::Instruction* add_memfence(llvm::Module *module, 
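// NOTE: add_memfence complements add_barrier. The selection pass in this
// patch emits the pair around the inter-CTA lock: the fence publishes
// global-memory writes to other blocks, while the barrier only synchronizes
// threads within one block; the NVIDIA target lowers the pair to
// membar.gl followed by bar.sync.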
llvm::IRBuilder<>& builder); llvm::Value* get_global_offset(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned stride, unsigned ax); llvm::Value* get_local_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax); llvm::Value* get_block_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax); + llvm::Value* get_num_blocks(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax); }; class cpu_target: public target { @@ -57,9 +63,11 @@ public: cpu_target(): target(false){} void set_kernel(llvm::IRBuilder<>& builder, llvm::LLVMContext &ctx, llvm::Module *module, llvm::Function* fn); llvm::Instruction* add_barrier(llvm::Module *module, llvm::IRBuilder<>& builder); + llvm::Instruction* add_memfence(llvm::Module *module, llvm::IRBuilder<>& builder); llvm::Value* get_global_offset(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned stride, unsigned ax); llvm::Value* get_local_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax); llvm::Value* get_block_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax); + llvm::Value* get_num_blocks(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax); }; } diff --git a/include/triton/dnn/base.h b/include/triton/dnn/base.h index 1fbded42c..266f29803 100644 --- a/include/triton/dnn/base.h +++ b/include/triton/dnn/base.h @@ -28,6 +28,11 @@ #include "triton/runtime/launch_info.h" namespace triton{ + +namespace runtime{ + class jit; +} + namespace dnn{ @@ -37,6 +42,13 @@ enum autotuning_t{ NO_TUNING }; +class base; +struct launch_context_t{ + base *op; + driver::kernel* kernel; + triton::runtime::launch_information info; +}; + typedef std::vector params_t; class base { @@ -49,9 +61,9 @@ protected: private: // initialize - virtual void init_impl(driver::stream *, driver::cu_module *){ } + virtual void init_impl(driver::stream *, driver::cu_module *) = 0; // deinitialize - virtual void deinit_impl(){ } + virtual void deinit_impl() = 0; // enqueue virtual void enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, @@ -63,6 +75,8 @@ private: // default parameters virtual std::vector search_space() const; virtual params_t heuristics() const; + // obtain execution jit + std::pair get_profile_impl(driver::stream *stream, std::vector args, autotuning_t autotune); public: // constructor @@ -73,6 +87,8 @@ public: virtual base* clone() const = 0; // enqueue void enqueue(driver::stream* stream, std::vector args, autotuning_t autotune = PARTIAL_TUNING); + // get profile + launch_context_t get_launch_context(driver::stream *stream, std::vector args, autotuning_t autotune = PARTIAL_TUNING); private: std::string name_; diff --git a/include/triton/dnn/batchnorm.h b/include/triton/dnn/batchnorm.h index 496e19ae4..8f9053225 100644 --- a/include/triton/dnn/batchnorm.h +++ b/include/triton/dnn/batchnorm.h @@ -37,6 +37,10 @@ namespace dnn{ class batchnorm_forward: public base { private: + // init + void init_impl(driver::stream *, driver::cu_module *) { } + void deinit_impl() { } + // enqueue void enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, @@ -69,6 +73,9 @@ private: class batchnorm_backward: public base{ private: + // init + void init_impl(driver::stream *, driver::cu_module *) { } + void deinit_impl() { } // enqueue void enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, diff --git a/include/triton/dnn/blocksparse/dot.h b/include/triton/dnn/blocksparse/dot.h index fbd388937..a1df146fe 100644 --- 
a/include/triton/dnn/blocksparse/dot.h +++ b/include/triton/dnn/blocksparse/dot.h @@ -14,27 +14,34 @@ private: std::vector args, triton::runtime::launch_information info); // number of flops - virtual size_t num_flops() const; + size_t num_flops() const; // comparison for maps - virtual bool operator<(const base& other) const; + bool operator<(const base& other) const; // default parameters - virtual std::vector search_space() const; - virtual params_t heuristics() const; - + std::vector search_space() const; + params_t heuristics() const; + // init + void init_impl(driver::stream *stream, driver::cu_module *module); + // deinit + void deinit_impl(); public: // constructor - dot(int32_t M, int32_t N, int32_t K); + dot(int32_t N, int32_t K, int32_t S, int32_t C, const std::string &ty, int32_t BS, int32_t nlocks); // triton-c source - virtual void triton_c_src(std::ostream &os) const; + void triton_c_src(std::ostream &os) const; // clone - virtual base* clone() const; + base* clone() const; private: std::string ab_ty_; std::string c_ty_; - int32_t M_; int32_t N_; + int32_t S_; + int32_t C_; int32_t K_; + int32_t BS_; + int32_t nlocks_; + driver::buffer *locks_; }; } diff --git a/include/triton/dnn/conv.h b/include/triton/dnn/conv.h index 1b6f2d778..d81ff872d 100644 --- a/include/triton/dnn/conv.h +++ b/include/triton/dnn/conv.h @@ -25,6 +25,7 @@ private: void build_a_deltas(); void build_masks(); void init_impl(driver::stream *, driver::cu_module *); + void deinit_impl() { } // enqueue std::array get_grid(size_t TM, size_t TN); diff --git a/include/triton/dnn/dot.h b/include/triton/dnn/dot.h index 3df8a13a6..6ba3f0b24 100644 --- a/include/triton/dnn/dot.h +++ b/include/triton/dnn/dot.h @@ -10,6 +10,8 @@ class dot: public base { private: // initialize void init_impl(driver::stream *, driver::cu_module *); + void deinit_impl() { } + // enqueue void enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, diff --git a/include/triton/ir/builder.h b/include/triton/ir/builder.h index 9cee12c68..1921814c9 100644 --- a/include/triton/ir/builder.h +++ b/include/triton/ir/builder.h @@ -126,9 +126,10 @@ public: value *create_reshape(value *arg, const type::tile_shapes_t &shapes, const std::string &name = ""); value *create_broadcast(value *arg, const type::tile_shapes_t &shapes, const std::string &name = ""); // Built-in instruction - value *create_get_global_range(unsigned axis, type::tile_shapes_t::value_type size, const std::string &name = ""); value *create_get_range_id(unsigned axis, const std::string &name = ""); + value *create_get_num_program(unsigned axis, const std::string &name = ""); value *create_atomic_cas(value *ptr, value *cmp, value *val, const std::string &name = ""); + value *create_atomic_exch(value *ptr, value *val, const std::string &name = ""); value *create_atomic_add(value *ptr, value *val, const std::string &name = ""); value *create_dot(value *A, value *B, value *C, const std::string &name = ""); value *create_trans(value *A, const std::string &name = ""); diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index d76ebf719..37692d617 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -514,6 +514,19 @@ private: unsigned axis_; }; +class get_num_program_inst: public builtin_inst { +private: + get_num_program_inst(type *ty, unsigned axis, const std::string &name, instruction *next); + std::string repr_impl() const { return "get_num_program(" + std::to_string(axis_) + ")"; } + +public: + 
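// NOTE (sketch of how this builtin surfaces at each layer of the patch):
//   Triton-C : int32 np = get_num_program(0);
//   IR       : builder.create_get_num_program(0);
//   PTX      : a read of %nctaid.x via llvm.nvvm.read.ptx.sreg.nctaid.x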
static instruction* create(context &ctx, unsigned axis, const std::string &name = "", instruction *next = nullptr); + unsigned get_axis() const { return axis_; } + +private: + unsigned axis_; +}; + class atomic_cas_inst: public builtin_inst { private: atomic_cas_inst(value *ptr, value *cmp, value *val, const std::string &name, instruction *next); @@ -523,6 +536,15 @@ public: static instruction* create(value *ptr, value *cmp, value *val, const std::string &name = "", instruction *next = nullptr); }; +class atomic_exch_inst: public builtin_inst { +private: + atomic_exch_inst(value *ptr, value *val, const std::string &name = "", instruction *next = nullptr); + std::string repr_impl() const { return "atomic_exch"; } + +public: + static instruction* create(value *ptr, value *val, const std::string &name = "", instruction *next = nullptr); +}; + class atomic_add_inst: public builtin_inst { private: atomic_add_inst(value *ptr, value *val, const std::string &name = "", instruction *next = nullptr); diff --git a/include/triton/lang/expression.h b/include/triton/lang/expression.h index 538485366..13894d18a 100644 --- a/include/triton/lang/expression.h +++ b/include/triton/lang/expression.h @@ -80,6 +80,15 @@ private: const constant* axis_; }; +class get_num_program_expression: public builtin_expression{ +public: + get_num_program_expression(node *axis): axis_((constant*)axis) { } + ir::value* codegen(ir::module *mod) const; + +private: + const constant* axis_; +}; + class atomic_cas_expression: public builtin_expression{ public: atomic_cas_expression(node *ptr, node *cmp, node *val): ptr_(ptr), cmp_(cmp), val_(val) { } @@ -91,6 +100,17 @@ private: const node *val_; }; +class atomic_exch_expression: public builtin_expression{ +public: + atomic_exch_expression(node *ptr, node *val): ptr_(ptr), val_(val) { } + ir::value* codegen(ir::module *) const; + +private: + const node *ptr_; + const node *val_; +}; + + class atomic_add_expression: public builtin_expression{ public: atomic_add_expression(node *ptr, node *val): ptr_(ptr), val_(val) { } diff --git a/include/triton/lang/parser.y b/include/triton/lang/parser.y index 645b0b51f..cd2c8941b 100644 --- a/include/triton/lang/parser.y +++ b/include/triton/lang/parser.y @@ -55,7 +55,7 @@ STORAGE_SPEC_T get_storage_spec(node *op) { return ((token*)op)->storage_spec;} %token VOID UINT1 UINT8 UINT16 UINT32 UINT64 INT1 INT8 INT16 INT32 INT64 FP16 FP32 FP64 %token IF ELSE FOR CONTINUE WHILE %token NEWAXIS ELLIPSIS AT -%token GET_RANGE_ID DOT SQRT REDUCE_SUM TRANS MAX MIN SELECT ATOMIC_CAS ATOMIC_EXCHG ATOMIC_ADD ALLOC_CONST +%token GET_NUM_PROGRAM GET_RANGE_ID DOT SQRT REDUCE_SUM TRANS MAX MIN SELECT ATOMIC_CAS ATOMIC_EXCH ATOMIC_ADD ALLOC_CONST %start translation_unit %% @@ -121,6 +121,7 @@ identifier /* Built-in */ builtin_expression : GET_RANGE_ID '(' constant ')' { $$ = new get_range_id_expression($3); } + | GET_NUM_PROGRAM '(' constant ')' { $$ = new get_num_program_expression($3); } | DOT '(' expression ',' expression ',' expression ')' { $$ = new matmul_expression($3, $5, $7); } | SQRT '(' expression ')' { $$ = new sqrt_expression($3); } | ALLOC_CONST type_specifier '[' constant ']' { $$ = new alloc_const_expression(new typed_declaration_specifier(get_type_spec($2)), $4); } @@ -130,6 +131,7 @@ builtin_expression | MIN '(' expression ',' expression ')' { $$ = new min_expression($3, $5); } | SELECT '(' expression ',' expression ',' expression ')' { $$ = new select_expression($3, $5, $7); } | ATOMIC_CAS '(' expression ',' expression ',' expression ')' { $$ = 
new atomic_cas_expression($3, $5, $7); } + | ATOMIC_EXCH '(' expression ',' expression ')' { $$ = new atomic_exch_expression($3, $5); } | ATOMIC_ADD '(' expression ',' expression ')' { $$ = new atomic_add_expression($3, $5); } ; diff --git a/include/triton/lang/scanner.l b/include/triton/lang/scanner.l index 83d11035d..af691349d 100644 --- a/include/triton/lang/scanner.l +++ b/include/triton/lang/scanner.l @@ -45,8 +45,9 @@ using triton::lang::return_void; "fp64" { return return_impl(FP64, yytext); } "..." { return return_impl(ELLIPSIS, yytext); } "get_range_id" { return return_impl(GET_RANGE_ID, yytext); } +"get_num_program" { return return_impl(GET_NUM_PROGRAM, yytext); } "__atomic_cas" { return return_impl(ATOMIC_CAS, yytext); } -"__atomic_exchg" { return return_impl(ATOMIC_EXCHG, yytext); } +"__atomic_exch" { return return_impl(ATOMIC_EXCH, yytext); } "__atomic_add" { return return_impl(ATOMIC_ADD, yytext); } "__sum" { return return_impl(REDUCE_SUM, yytext); } "sqrt" { return return_impl(SQRT, yytext); } diff --git a/lib/codegen/optimize_dce.cpp b/lib/codegen/optimize_dce.cpp index d30bf4c1d..9508cfa2e 100644 --- a/lib/codegen/optimize_dce.cpp +++ b/lib/codegen/optimize_dce.cpp @@ -19,7 +19,8 @@ void optimize_dce::run(ir::module &mod) { for(ir::basic_block *block: rpo) for(ir::instruction *i: block->get_inst_list()){ if(dynamic_cast(i) || dynamic_cast(i) || dynamic_cast(i) - || dynamic_cast(i) || dynamic_cast(i)){ + || dynamic_cast(i) || dynamic_cast(i) + || dynamic_cast(i) || dynamic_cast(i) || dynamic_cast(i) ){ work_list.push_back(i); marked.insert(i); } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 5ab9c55f8..ad7e395b1 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -319,8 +319,12 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::function(inst)){ - Value *offset = tgt_->get_block_id(builder.GetInsertBlock()->getModule(), builder, ii->get_axis()); - return (Instruction*)offset; + Value *result = tgt_->get_block_id(builder.GetInsertBlock()->getModule(), builder, ii->get_axis()); + return (Instruction*)result; + } + if(ir::get_num_program_inst* ii = dynamic_cast(inst)){ + Value *result = tgt_->get_num_blocks(builder.GetInsertBlock()->getModule(), builder, ii->get_axis()); + return (Instruction*)result; } if(ir::atomic_cas_inst* ii = dynamic_cast(inst)){ BasicBlock *current = builder.GetInsertBlock(); @@ -331,6 +335,7 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::functiongetParent()); Value *ptr = builder.CreateGEP(sh_mem_ptr_, builder.getInt32(alloc_->get_offset(ii))); ptr = builder.CreateBitCast(ptr, PointerType::get(builder.getInt32Ty(), ptr->getType()->getPointerAddressSpace())); + tgt_->add_memfence(module, builder); tgt_->add_barrier(module, builder); builder.CreateCondBr(pred, tid_0_bb, tid_0_done_bb); builder.SetInsertPoint(tid_0_bb); @@ -342,10 +347,29 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::functionadd_memfence(module, builder); tgt_->add_barrier(module, builder); Value *res = builder.CreateLoad(ptr); return (Instruction*)res; } + if(ir::atomic_exch_inst* ii = dynamic_cast(inst)){ + BasicBlock *current = builder.GetInsertBlock(); + Module *module = current->getModule(); + Value *rmw_ptr = value(ii->get_operand(0)); + Value *rmw_val = value(ii->get_operand(1)); + Value *tid = tgt_->get_local_id(module, builder, 0); + Value *pred = builder.CreateICmpEQ(tid, builder.getInt32(0)); + BasicBlock *tid_0_bb = BasicBlock::Create(ctx, "tid_0", current->getParent()); + 
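// NOTE (sketch of the control flow this lowering emits, in CUDA-like
// pseudocode; names are illustrative only):
//   membar.gl; bar.sync;                        // flush, then line everyone up
//   if (tid == 0) old = atomicExch(ptr, val);   // "tid_0" block
//   /* all threads reconverge here */           // "tid_0_done" block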
BasicBlock *tid_0_done_bb = BasicBlock::Create(ctx, "tid_0_done", current->getParent()); + tgt_->add_memfence(module, builder); + tgt_->add_barrier(module, builder); + builder.CreateCondBr(pred, tid_0_bb, tid_0_done_bb); + builder.SetInsertPoint(tid_0_bb); + Value *res = builder.CreateAtomicRMW(AtomicRMWInst::Xchg, rmw_ptr, rmw_val, AtomicOrdering::Monotonic, SyncScope::System); + builder.CreateBr(tid_0_done_bb); + builder.SetInsertPoint(tid_0_done_bb); + return (Instruction*)res; + } if(ir::atomic_add_inst* ii = dynamic_cast(inst)){ Value *ptr = value(ii->get_operand(0)); Value *val = value(ii->get_operand(1)); @@ -1136,17 +1160,17 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & Value *result_then = builder.CreateLoad(ptr); builder.CreateBr(mask_done_bb); builder.SetInsertPoint(mask_done_bb); - Value *result = nullptr; + Value *current_result = nullptr; if(false_values){ - result = builder.CreatePHI(result_then->getType(), 2); - ((PHINode*)result)->addIncoming(result_then, mask_then_bb); + current_result = builder.CreatePHI(result_then->getType(), 2); + ((PHINode*)current_result)->addIncoming(result_then, mask_then_bb); Value *result_false = false_values->get_value(idx); - if(vector_size > 1) + if(result_then->getType()->isVectorTy()) result_false = builder.CreateVectorSplat(vector_size, result_false); - ((PHINode*)result)->addIncoming(result_false, current_bb); + ((PHINode*)current_result)->addIncoming(result_false, current_bb); } else - result = result_then; + current_result = result_then; // std::string offset = ""; // if(cst) @@ -1160,7 +1184,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & // InlineAsm *iasm = InlineAsm::get(ty, asm_str, "b,=r,=r,=r,=r,l", true); // Value *result = builder.CreateCall(iasm, {mask, ptr}); - packets[id] = result; + packets[id] = current_result; } }); // extract result element diff --git a/lib/codegen/target.cpp b/lib/codegen/target.cpp index 2e20839d9..4116bcca7 100644 --- a/lib/codegen/target.cpp +++ b/lib/codegen/target.cpp @@ -32,6 +32,11 @@ Value* amd_cl_target::get_global_offset(Module *module, IRBuilder<>& builder, un return result; } +Instruction* amd_cl_target::add_memfence(Module *module, IRBuilder<>& builder) { + throw std::runtime_error("not implemented"); +} + + Value* amd_cl_target::get_block_id(Module *module, IRBuilder<>& builder, unsigned ax) { static std::array ids = { Intrinsic::amdgcn_workgroup_id_x, @@ -43,6 +48,16 @@ Value* amd_cl_target::get_block_id(Module *module, IRBuilder<>& builder, unsigne return group_id; } +Value* amd_cl_target::get_num_blocks(Module *module, IRBuilder<>& builder, unsigned ax) { + static std::array ids = { + Intrinsic::r600_read_ngroups_x, + Intrinsic::r600_read_ngroups_y, + Intrinsic::r600_read_ngroups_z + }; + Value* get_num_group = Intrinsic::getDeclaration(module, ids[ax]); + return builder.CreateCall(get_num_group, {}); +} + Value* amd_cl_target::get_local_id(Module *module, IRBuilder<>& builder, unsigned ax) { static std::array ids = { Intrinsic::amdgcn_workitem_id_x, @@ -70,6 +85,12 @@ Instruction* nvidia_cu_target::add_barrier(Module *module, IRBuilder<>& builder) return builder.CreateCall(barrier, {}); } +Instruction* nvidia_cu_target::add_memfence(Module *module, IRBuilder<>& builder) { + Function *barrier = Intrinsic::getDeclaration(module, Intrinsic::nvvm_membar_gl); + return builder.CreateCall(barrier, {}); +} + + Value* nvidia_cu_target::get_global_offset(Module *module, IRBuilder<>& builder, unsigned stride, unsigned ax) 
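// NOTE: equivalent to the CUDA expression blockIdx.<ax> * stride, i.e. the
// offset of the first element owned by this CTA along axis ax.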
{ Value* group_id = get_block_id(module, builder, ax); Value* result = builder.CreateMul(builder.getInt32(stride), group_id); @@ -82,39 +103,39 @@ Value* nvidia_cu_target::get_block_id(Module *module, IRBuilder<>& builder, unsi Intrinsic::nvvm_read_ptx_sreg_ctaid_y, Intrinsic::nvvm_read_ptx_sreg_ctaid_z }; - bool z_order = true; - if(z_order && ax < 2){ - static std::array n_cta_ids = { - Intrinsic::nvvm_read_ptx_sreg_nctaid_x, - Intrinsic::nvvm_read_ptx_sreg_nctaid_y, - Intrinsic::nvvm_read_ptx_sreg_nctaid_z - }; - Value* cta_id_0 = builder.CreateIntrinsic(cta_ids[0], {}, {}); - Value* cta_id_1 = builder.CreateIntrinsic(cta_ids[1], {}, {}); - Value* n_cta_id_0 = builder.CreateIntrinsic(n_cta_ids[0], {}, {}); - Value* n_cta_id_1 = builder.CreateIntrinsic(n_cta_ids[1], {}, {}); - // global block ID - Value* bid = builder.CreateAdd(cta_id_0, builder.CreateMul(cta_id_1, n_cta_id_0)); - // helper for minimum - auto Min = [&](Value *x, Value *y){ - return builder.CreateSelect(builder.CreateICmpSGE(x, y), y, x); - }; - // super-tile size - Value* sts = Min(builder.getInt32(16), n_cta_id_1); - // number of CTAs per super-block - Value *nscta = builder.CreateMul(n_cta_id_0, sts); - Value *bid0 = builder.CreateURem(builder.CreateUDiv(bid, sts), n_cta_id_0); - Value *bid1 = builder.CreateAdd(builder.CreateMul(builder.CreateUDiv(bid, nscta), sts),builder.CreateURem(bid, sts)); - if(ax == 0) - return bid0; - else - return bid1; - } - else{ +// bool z_order = true; +// if(z_order && ax < 2){ +// static std::array n_cta_ids = { +// Intrinsic::nvvm_read_ptx_sreg_nctaid_x, +// Intrinsic::nvvm_read_ptx_sreg_nctaid_y, +// Intrinsic::nvvm_read_ptx_sreg_nctaid_z +// }; +// Value* cta_id_0 = builder.CreateIntrinsic(cta_ids[0], {}, {}); +// Value* cta_id_1 = builder.CreateIntrinsic(cta_ids[1], {}, {}); +// Value* n_cta_id_0 = builder.CreateIntrinsic(n_cta_ids[0], {}, {}); +// Value* n_cta_id_1 = builder.CreateIntrinsic(n_cta_ids[1], {}, {}); +// // global block ID +// Value* bid = builder.CreateAdd(cta_id_0, builder.CreateMul(cta_id_1, n_cta_id_0)); +// // helper for minimum +// auto Min = [&](Value *x, Value *y){ +// return builder.CreateSelect(builder.CreateICmpSGE(x, y), y, x); +// }; +// // super-tile size +// Value* sts = Min(builder.getInt32(16), n_cta_id_1); +// // number of CTAs per super-block +// Value *nscta = builder.CreateMul(n_cta_id_0, sts); +// Value *bid0 = builder.CreateURem(builder.CreateUDiv(bid, sts), n_cta_id_0); +// Value *bid1 = builder.CreateAdd(builder.CreateMul(builder.CreateUDiv(bid, nscta), sts),builder.CreateURem(bid, sts)); +// if(ax == 0) +// return bid0; +// else +// return bid1; +// } +// else{ Value* get_cta_id = Intrinsic::getDeclaration(module, cta_ids[ax]); Value* cta_id = builder.CreateCall(get_cta_id, {}); return cta_id; - } +// } } Value* nvidia_cu_target::get_local_id(Module *module, IRBuilder<>& builder, unsigned ax) { @@ -127,6 +148,16 @@ Value* nvidia_cu_target::get_local_id(Module *module, IRBuilder<>& builder, unsi return builder.CreateCall(get_local_id, {}); } +Value* nvidia_cu_target::get_num_blocks(Module *module, IRBuilder<>& builder, unsigned ax) { + static std::array ids = { + Intrinsic::nvvm_read_ptx_sreg_nctaid_x, + Intrinsic::nvvm_read_ptx_sreg_nctaid_y, + Intrinsic::nvvm_read_ptx_sreg_nctaid_z + }; + Value* get_nctaid = Intrinsic::getDeclaration(module, ids[ax]); + return builder.CreateCall(get_nctaid, {}); +} + // CPU void cpu_target::set_kernel(IRBuilder<>& builder, LLVMContext &ctx, Module *module, Function* fn) { @@ -138,6 +169,12 @@ Instruction* 
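// NOTE: the CPU target stubs add_barrier and add_memfence below with a dummy
// `add i32 0, 0` so callers still receive an Instruction*; the single-threaded
// reference path has nothing to synchronize.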
cpu_target::add_barrier(Module *module, IRBuilder<>& builder) { return (Instruction*)builder.CreateAdd(builder.getInt32(0), builder.getInt32(0)); } +Instruction* cpu_target::add_memfence(Module *module, IRBuilder<>& builder) { + // no barrier on CPU + return (Instruction*)builder.CreateAdd(builder.getInt32(0), builder.getInt32(0)); +} + + Value* cpu_target::get_block_id(Module *module, llvm::IRBuilder<> &builder, unsigned ax) { const Function *fn = builder.GetInsertBlock()->getParent(); size_t num_params = fn->getFunctionType()->getNumParams(); @@ -149,6 +186,11 @@ Value* cpu_target::get_block_id(Module *module, llvm::IRBuilder<> &builder, unsi return (Argument*)ids[ax]; } +Value* cpu_target::get_num_blocks(Module *module, IRBuilder<>& builder, unsigned ax) { + throw std::runtime_error("not implemented"); +} + + Value* cpu_target::get_global_offset(Module *module, IRBuilder<>& builder, unsigned stride, unsigned ax) { Value* result = builder.CreateMul(builder.getInt32(stride), get_block_id(module, builder, ax)); return result; diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index 72e0d340e..e5aa7ad45 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -6,6 +6,8 @@ namespace triton{ namespace dnn{ +namespace rt = triton::runtime; + void base::set_ld(const std::vector& shapes, std::vector& ld) { @@ -28,8 +30,7 @@ params_t base::heuristics() const { return *search_space().begin(); } -void base::enqueue(driver::stream *stream, std::vector args, autotuning_t autotune) { - namespace rt = triton::runtime; +std::pair base::get_profile_impl(driver::stream *stream, std::vector args, autotuning_t autotune) { static std::map, cmp_recompile> m_jit; driver::context* ctx = stream->context(); rt::jit* jit; @@ -67,16 +68,23 @@ void base::enqueue(driver::stream *stream, std::vector args, a clone->init_impl(stream, (triton::driver::cu_module*)kernel->module()); } /* retrieved compiled template */ - else{ + else { jit = m_jit.at(this).get(); } - - /* get launch parameters */ - driver::kernel* kernel = jit->get_function(name_.c_str()); - rt::launch_information info = jit->get_launch_info(name_.c_str()); - /* launch */ auto it = m_jit.find(this); - it->first->enqueue_impl(stream, kernel, args, info); + return {it->first, jit}; +} + +void base::enqueue(driver::stream *stream, std::vector args, autotuning_t autotune) { + launch_context_t info = get_launch_context(stream, args, autotune); + info.op->enqueue_impl(stream, info.kernel, args, info.info); +} + +launch_context_t base::get_launch_context(driver::stream *stream, std::vector args, autotuning_t autotune) { + std::pair profile = get_profile_impl(stream, args, autotune); + driver::kernel* kernel = profile.second->get_function(name_.c_str()); + rt::launch_information info = profile.second->get_launch_info(name_.c_str()); + return {profile.first, kernel, info}; } } diff --git a/lib/dnn/blocksparse/dot.cpp b/lib/dnn/blocksparse/dot.cpp index 9ddb2514b..46a706498 100644 --- a/lib/dnn/blocksparse/dot.cpp +++ b/lib/dnn/blocksparse/dot.cpp @@ -13,26 +13,42 @@ bool dot::operator <(const base& other) const { auto *y = dynamic_cast(&other); if(!y) return true; - return std::tie(M_, N_, K_) - < std::tie(y->M_, y->N_, y->K_); + return std::tie(N_, S_, C_, BS_, nlocks_, ab_ty_, c_ty_) + < std::tie(y->N_, y->S_, y->C_, y->BS_, y->nlocks_, y->ab_ty_, y->c_ty_); } std::vector dot::search_space() const { - + throw std::runtime_error("not implemented"); } params_t dot::heuristics() const { - + throw std::runtime_error("not implemented"); } base * dot::clone() const { 
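// NOTE: clone() is the virtual-copy idiom that base::enqueue relies on: the
// base class caches one JIT per profile (keyed by the operator< defined above)
// and stores a clone so the cached entry outlives stack-allocated ops, e.g.:
//   triton::dnn::blocksparse::dot op(N, K, segments, C, "fp32", bsize, nlocks);
//   op.enqueue(stream, {&da, &db, &dc, &dlut, &dlocks}, triton::dnn::NO_TUNING);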
return new dot(*this); } -dot::dot(int32_t M, int32_t N, int32_t K): - base("bsdot"), M_(M), N_(N), K_(K) { - ab_ty_ = "fp32"; - c_ty_ = "fp32"; +dot::dot(int32_t N, int32_t K, int32_t S, int32_t C, + const std::string& ty, int32_t BS, int32_t nlocks): + base("bsdot"), + N_(N), K_(K), S_(S), C_(C), + ab_ty_(ty), c_ty_(ty), + BS_(BS), nlocks_(nlocks) { +} + +void dot::init_impl(driver::stream *stream, driver::cu_module *module) { +// int32_t TM = info.globals["TM"]; +// size_t grid_0 = (N_ + TM - 1) / TM; +// if(nlocks_){ +// locks_ = triton::driver::buffer::create(stream->context(), grid_0 * nlocks_ * 2 * 4); +// ((driver::cu_buffer*)locks_)->set_zero(stream, grid_0 * nlocks_ * 2 * 4); +// } +} + +void dot::deinit_impl() { +// if(locks_) +// delete locks_; } void dot::enqueue_impl(driver::stream *stream, driver::kernel *kernel, @@ -41,64 +57,89 @@ void dot::enqueue_impl(driver::stream *stream, driver::kernel *kernel, driver::buffer *b = args[1]; driver::buffer *c = args[2]; driver::buffer *lut = args[3]; - int32_t lda = M_; - int32_t ldc = M_; + driver::buffer *locks = args[4]; + int32_t lda = N_; + int32_t ldc = N_; kernel->setArg(0, a); kernel->setArg(1, b); kernel->setArg(2, c); kernel->setArg(3, lda); kernel->setArg(4, ldc); - kernel->setArg(5, lut); + kernel->setArg(5, N_); + kernel->setArg(6, lut); + kernel->setArg(7, locks); + kernel->setArg(8, nlocks_); int32_t TM = info.globals["TM"]; - int32_t TN = info.globals["TN"]; - size_t grid_0 = (M_ + TM - 1) / TM; - size_t grid_1 = (N_ + TN - 1) / TN; + size_t grid_0 = (N_ + TM - 1) / TM; + size_t grid_1 = S_; + std::cout << N_ << " " << grid_0 << std::endl; + if(nlocks_){ +// locks_ = triton::driver::buffer::create(stream->context(), grid_0 * nlocks_ * 2 * 4); + ((driver::cu_buffer*)locks)->set_zero(stream, grid_0 * nlocks_ * 2 * 4); + } stream->enqueue(kernel, {grid_0, grid_1, 1}, {info.num_threads, 1, 1}); - stream->synchronize(); } void dot::triton_c_src(std::ostream &os) const { std::string result = R"( - const tunable int32 TM = {64, 128}; - const tunable int32 TN = {32}; - const tunable int32 TK = {32}; + const tunable int32 TM = {64}; + const tunable int32 TN = {)" + std::to_string(BS_) + R"(}; + const tunable int32 TK = {)" + std::to_string(BS_) + R"(}; void bsdot(restrict read_only align(16) )" + ab_ty_ + R"( *A, restrict read_only align(16) )" + ab_ty_ + R"( *B, - fp32* C, - int32 lda, int32 ldc, - int32* lut_base){ + )" + c_ty_ + R"(* C, + int32 lda, int32 ldc, int32 N, + int32* lut, int32* locks, int32 nlocks){ int32 ridx = get_range_id(0); int32 ridy = get_range_id(1); - fp32 c[TM, TN] = 0; - int32 rka[TK] = 0 ... TK; - int32 rkb[TK] = 0 ... TK; + fp32 acc[TM, TN] = 0; int32 rxa[TM] = ridx * TM + (0 ... TM); int32 ryb[TN] = 0 ... TN; + int32 rka[TK] = 0 ... TK; + int32 rkb[TK] = 0 ... 
TK;
 int32 offa[TM, TK] = rxa[:, newaxis] + rka[newaxis, :]*lda;
 int32 offb[TK, TN] = ryb[newaxis, :] + rkb[:, newaxis]*TK;
- int32 *header = lut_base + ridy * 4;
+ int32 *header = lut + ridy * 4;
 int32 offset = *(header + 0);
 int32 K = *(header + 1);
- int32 h2 = *(header + 2);
- int32 h3 = *(header + 3);
- int32 *lut = lut_base + offset*2;
+ int32 column = *(header + 2);
+ int32 lockid = *(header + 3);
+ int32 *plut = lut + offset * 2;
 for(int32 k = K; k > 0; k = k - 1){
- int32 ak = *(lut + 0);
- int32 bk = *(lut + 1);
- fp32* pa[TM, TK] = A + offa + ak * TK * lda;
- fp32* pb[TK, TN] = B + offb + bk * TK * TN;
- fp32 a[TM, TK] = *pa;
- fp32 b[TK, TN] = *pb;
- c = dot(a, b, c);
- lut = lut + 2;
+ int32 ak = *(plut + 0);
+ int32 bk = *(plut + 1);
+ )" + ab_ty_ + R"(* pa[TM, TK] = A + offa + ak * TK * lda;
+ )" + ab_ty_ + R"(* pb[TK, TN] = B + offb + bk * TK * TN;
+ )" + ab_ty_ + R"( a[TM, TK] = *pa;
+ )" + ab_ty_ + R"( b[TK, TN] = *pb;
+ acc = dot(a, b, acc);
+ plut = plut + 2;
 }
 int32 rxc[TM] = ridx * TM + (0 ... TM);
- int32 ryc[TN] = ridy * TN + (0 ... TN);
- fp32* pc[TM, TN] = C + rxc[:, newaxis] + ryc[newaxis, :]*ldc;
- *pc = c;
+ int32 ryc[TN] = column * TN + (0 ... TN);
+ )" + c_ty_ + R"( c[TM, TN] = acc;
+ )" + c_ty_ + R"(* pc[TM, TN] = C + rxc[:, newaxis] + ryc[newaxis, :]*ldc;
+ int1 checkc[TM, TN] = (rxc < N)[:, newaxis];
+ if(lockid == 0){
+ @checkc *pc = c;
+ }
+ else{
+ int32 *plock = locks + ridx*nlocks + lockid - 1;
+ int32 *pcount = plock + get_num_program(0)*nlocks;
+ while(__atomic_cas(plock, 0, 1));
+ int32 count = *pcount;
+ if(count == 0) {
+ @checkc *pc = c;
+ }
+ else {
+ @checkc *pc = c + *pc;
+ }
+ *pcount = 1;
+ __atomic_exch(plock, 0);
+ }
 })";

 os << result;
diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp
index e58fd9924..77c099827 100644
--- a/lib/ir/builder.cpp
+++ b/lib/ir/builder.cpp
@@ -294,10 +294,18 @@ value *builder::create_get_range_id(unsigned axis, const std::string &name) {
 return insert(get_range_id_inst::create(ctx_, axis, name));
 }
+value *builder::create_get_num_program(unsigned axis, const std::string &name) {
+ return insert(get_num_program_inst::create(ctx_, axis, name));
+}
+
 value *builder::create_atomic_cas(value *ptr, value *cmp, value *val, const std::string &name){
 return insert(atomic_cas_inst::create(ptr, cmp, val, name));
 }
+value *builder::create_atomic_exch(value *ptr, value *val, const std::string &name){
+ return insert(atomic_exch_inst::create(ptr, val, name));
+}
+
 value *builder::create_atomic_add(value *ptr, value *val, const std::string &name){
 return insert(atomic_add_inst::create(ptr, val, name));
 }
diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp
index 9537336fb..a29c11914 100644
--- a/lib/ir/instructions.cpp
+++ b/lib/ir/instructions.cpp
@@ -636,6 +636,17 @@ instruction* get_range_id_inst::create(context &ctx, unsigned axis, const std::s
 return new get_range_id_inst(type::get_int32_ty(ctx), axis, name, next);
 }
+// get_num_program
+get_num_program_inst::get_num_program_inst(type *ty, unsigned axis, const std::string &name, instruction *next)
+ : builtin_inst(ty, 0, 1, name, next), axis_(axis){
+
+}
+
+instruction* get_num_program_inst::create(context &ctx, unsigned axis, const std::string &name, instruction *next) {
+ return new get_num_program_inst(type::get_int32_ty(ctx), axis, name, next);
+}
+
+
 // atomic cas
 atomic_cas_inst::atomic_cas_inst(value *ptr, value *cmp, value *val, const std::string &name, instruction *next)
@@ -649,6 +660,18 @@ instruction* atomic_cas_inst::create(value
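// NOTE: __atomic_cas and __atomic_exch are the two halves of the spin-lock in
// the blocksparse kernel above:
//   while(__atomic_cas(plock, 0, 1));  // acquire: spin until the 0 -> 1 swap succeeds
//   ...                                // first writer stores, later writers accumulate
//   __atomic_exch(plock, 0);           // release the lock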
*cmp, value *val, const s return new atomic_cas_inst(ptr, cmp, val, name, next); } +// atomic exch + +atomic_exch_inst::atomic_exch_inst(value *ptr, value *val, const std::string &name, instruction *next) + : builtin_inst(ptr->get_type()->get_pointer_element_ty(), 2, 1, name, next) { + set_operand(0, ptr); + set_operand(1, val); +} + +instruction* atomic_exch_inst::create(value *ptr, value *val, const std::string &name, instruction *next) { + return new atomic_exch_inst(ptr, val, name, next); +} + // atomic add atomic_add_inst::atomic_add_inst(value *ptr, value *val, const std::string &name, instruction *next) diff --git a/lib/lang/expression.cpp b/lib/lang/expression.cpp index 6baa1f3b2..15e66607a 100644 --- a/lib/lang/expression.cpp +++ b/lib/lang/expression.cpp @@ -120,6 +120,11 @@ ir::value* get_range_id_expression::codegen(ir::module *mod) const { return mod->get_builder().create_get_range_id(axis_->value()); } +// get_num_program +ir::value* get_num_program_expression::codegen(ir::module *mod) const { + return mod->get_builder().create_get_num_program(axis_->value()); +} + // atomic cas ir::value* atomic_cas_expression::codegen(ir::module *mod) const { ir::value *ptr = ptr_->codegen(mod); @@ -128,6 +133,13 @@ ir::value* atomic_cas_expression::codegen(ir::module *mod) const { return mod->get_builder().create_atomic_cas(ptr, cmp, val); } +// atomic exch +ir::value* atomic_exch_expression::codegen(ir::module *mod) const { + ir::value *ptr = ptr_->codegen(mod); + ir::value *val = val_->codegen(mod); + return mod->get_builder().create_atomic_exch(ptr, val); +} + // atomic add ir::value* atomic_add_expression::codegen(ir::module *mod) const { ir::value *ptr = ptr_->codegen(mod); From 080bf1af88258bc3fb25d5fda1568df953b794b2 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 30 Jul 2019 11:42:31 -0700 Subject: [PATCH 269/494] [dnn/blocksparse/dot]: BlocksparseDx also working --- examples/python/tensorflow/blocksparse.cpp | 105 +++++++++++++-------- include/triton/dnn/blocksparse/dot.h | 8 +- lib/codegen/tune.cpp | 6 +- lib/dnn/blocksparse/dot.cpp | 59 ++++++------ 4 files changed, 109 insertions(+), 69 deletions(-) diff --git a/examples/python/tensorflow/blocksparse.cpp b/examples/python/tensorflow/blocksparse.cpp index d6b305fcf..5da231a12 100644 --- a/examples/python/tensorflow/blocksparse.cpp +++ b/examples/python/tensorflow/blocksparse.cpp @@ -45,39 +45,6 @@ Status XpropShape(InferenceContext* ctx) } -REGISTER_OP("TritonBlocksparseMatmul") -.Input("x: T") -.Input("w: T") -.Input("lut: int64") -.Input("lut_dx: int64") -.Input("lut_dw: int64") -.Input("gate: ngate * float") -.Output("y: T") -.Output("temp: int32") -.Attr("T: {half, float, bfloat16}") -.Attr("blocks: int >=0") -.Attr("bsize: int") -.Attr("segments: int = 0") -.Attr("segments_dx: int = 0") -.Attr("locks: int = 0") -.Attr("locks_dx: int = 0") -.Attr("axis: int = 1") -.Attr("C: int >=0") -.Attr("K: int >=0") -.Attr("shared: int = 0") -.Attr("shared_dx: int = 0") -.Attr("alpha: float = 1.0") -.Attr("beta: float = 0.0") -.Attr("gated_dw: bool = false") -.Attr("gate_grad: bool = false") -.Attr("bench: int = 0") -.Attr("ngate: int >= 0") -.SetShapeFn(XpropShape) -.Doc(R"doc( - Multiply the matrix "a" by the blocksparse matrix "b". 
- )doc"); - - typedef struct bsmm_params { const int* Lut; @@ -101,7 +68,7 @@ typedef struct bsmm_params CUstream stream; } bsmm_params; -template +template class BlocksparseMatmulOp : public OpKernel { public: explicit BlocksparseMatmulOp(OpKernelConstruction* ctx) : OpKernel(ctx) { @@ -167,7 +134,7 @@ public: triton::driver::cu_buffer dlut(ctx, (CUdeviceptr)lut.flat().data(), false); triton::driver::cu_buffer dlocks(ctx, (CUdeviceptr)locks->flat().data(), false); // create profile - triton::dnn::blocksparse::dot dot(N, params_.K, params_.segments, params_.C, "fp32", params_.bsize, params_.locks); + triton::dnn::blocksparse::dot dot(N, params_.K, params_.segments, params_.C, "fp32", params_.bsize, params_.locks, OP); // blocksparse matmul dot.enqueue(stream, {&da, &db, &dc, &dlut, &dlocks}, triton::dnn::NO_TUNING); } @@ -180,5 +147,69 @@ private: char bench_string_[256]; }; -REGISTER_KERNEL_BUILDER(Name("TritonBlocksparseMatmul").Device(DEVICE_GPU).TypeConstraint("T"), BlocksparseMatmulOp); -REGISTER_KERNEL_BUILDER(Name("TritonBlocksparseMatmul").Device(DEVICE_GPU).TypeConstraint("T"), BlocksparseMatmulOp); +REGISTER_OP("TritonBlocksparseMatmul") +.Input("x: T") +.Input("w: T") +.Input("lut: int64") +.Input("lut_dx: int64") +.Input("lut_dw: int64") +.Input("gate: ngate * float") +.Output("y: T") +.Output("temp: int32") +.Attr("T: {half, float, bfloat16}") +.Attr("blocks: int >=0") +.Attr("bsize: int") +.Attr("segments: int = 0") +.Attr("segments_dx: int = 0") +.Attr("locks: int = 0") +.Attr("locks_dx: int = 0") +.Attr("axis: int = 1") +.Attr("C: int >=0") +.Attr("K: int >=0") +.Attr("shared: int = 0") +.Attr("shared_dx: int = 0") +.Attr("alpha: float = 1.0") +.Attr("beta: float = 0.0") +.Attr("gated_dw: bool = false") +.Attr("gate_grad: bool = false") +.Attr("bench: int = 0") +.Attr("ngate: int >= 0") +.SetShapeFn(XpropShape) +.Doc(R"doc( + Multiply the matrix "a" by the blocksparse matrix "b". + )doc"); + +REGISTER_KERNEL_BUILDER(Name("TritonBlocksparseMatmul").Device(DEVICE_GPU).TypeConstraint("T"), BlocksparseMatmulOp); +REGISTER_KERNEL_BUILDER(Name("TritonBlocksparseMatmul").Device(DEVICE_GPU).TypeConstraint("T"), BlocksparseMatmulOp); + + +REGISTER_OP("TritonBlocksparseMatmulDX") + .Input("dy: T") + .Input("w: T") + .Input("lut: int64") + .Input("gate: ngate * float") + .Output("dx: T") + .Output("temp: int32") + .Attr("T: {half, float, bfloat16}") + .Attr("blocks: int >=0") + .Attr("bsize: int") + .Attr("segments: int = 0") + .Attr("locks: int = 0") + .Attr("axis: int = 1") + .Attr("C: int >=0") + .Attr("K: int >=0") + .Attr("shared: int = 0") + .Attr("alpha: float = 1.0") + .Attr("beta: float = 0.0") + .Attr("gated_dw: bool = false") + .Attr("gate_grad: bool = false") + .Attr("bench: int = 0") + .Attr("ngate: int >= 0") + .SetShapeFn(XpropShape) + .Doc(R"doc( +Multiply the matrix "a" by the blocksparse matrix "b". 
+)doc"); + +REGISTER_KERNEL_BUILDER(Name("TritonBlocksparseMatmulDX").Device(DEVICE_GPU).TypeConstraint("T"),BlocksparseMatmulOp); +REGISTER_KERNEL_BUILDER(Name("TritonBlocksparseMatmulDX").Device(DEVICE_GPU).TypeConstraint("T"),BlocksparseMatmulOp); + diff --git a/include/triton/dnn/blocksparse/dot.h b/include/triton/dnn/blocksparse/dot.h index a1df146fe..01c94a2fe 100644 --- a/include/triton/dnn/blocksparse/dot.h +++ b/include/triton/dnn/blocksparse/dot.h @@ -7,6 +7,11 @@ namespace triton{ namespace dnn{ namespace blocksparse{ +enum op_t{ + FPROP, + BPROP, + WGRAD +}; class dot: public base { private: @@ -26,7 +31,7 @@ private: void deinit_impl(); public: // constructor - dot(int32_t N, int32_t K, int32_t S, int32_t C, const std::string &ty, int32_t BS, int32_t nlocks); + dot(int32_t N, int32_t K, int32_t S, int32_t C, const std::string &ty, int32_t BS, int32_t nlocks, op_t op = FPROP); // triton-c source void triton_c_src(std::ostream &os) const; // clone @@ -42,6 +47,7 @@ private: int32_t BS_; int32_t nlocks_; driver::buffer *locks_; + op_t op_; }; } diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 820db29b3..1bb2701bc 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -247,14 +247,14 @@ void tune::run(ir::module &mod) { size_t addr_space = ptr_ty->get_pointer_address_space(); if(addr_space < 4){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 4, 8)); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 2, 2)); *params_.at(i).at("nts.d0") = *tmp; } } if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 4, 8)); - std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 4, 8)); + std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 2, 2)); + std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 2, 2)); *params_.at(i).at("nts.d0") = *tmp1; *params_.at(i).at("nts.d1") = *tmp2; } diff --git a/lib/dnn/blocksparse/dot.cpp b/lib/dnn/blocksparse/dot.cpp index 46a706498..2f69677a8 100644 --- a/lib/dnn/blocksparse/dot.cpp +++ b/lib/dnn/blocksparse/dot.cpp @@ -13,8 +13,8 @@ bool dot::operator <(const base& other) const { auto *y = dynamic_cast(&other); if(!y) return true; - return std::tie(N_, S_, C_, BS_, nlocks_, ab_ty_, c_ty_) - < std::tie(y->N_, y->S_, y->C_, y->BS_, y->nlocks_, y->ab_ty_, y->c_ty_); + return std::tie(N_, S_, C_, BS_, nlocks_, ab_ty_, c_ty_, op_) + < std::tie(y->N_, y->S_, y->C_, y->BS_, y->nlocks_, y->ab_ty_, y->c_ty_, y->op_); } std::vector dot::search_space() const { @@ -30,11 +30,11 @@ base * dot::clone() const { } dot::dot(int32_t N, int32_t K, int32_t S, int32_t C, - const std::string& ty, int32_t BS, int32_t nlocks): + const std::string& ty, int32_t BS, int32_t nlocks, op_t op): base("bsdot"), N_(N), K_(K), S_(S), C_(C), ab_ty_(ty), c_ty_(ty), - BS_(BS), nlocks_(nlocks) { + BS_(BS), nlocks_(nlocks), op_(op){ } void dot::init_impl(driver::stream *stream, driver::cu_module *module) { @@ -72,27 +72,33 @@ void dot::enqueue_impl(driver::stream *stream, driver::kernel *kernel, int32_t TM = info.globals["TM"]; size_t grid_0 = (N_ + TM - 1) / TM; size_t grid_1 = S_; - std::cout << N_ << " " << grid_0 << std::endl; - if(nlocks_){ -// locks_ = triton::driver::buffer::create(stream->context(), grid_0 * nlocks_ * 2 * 4); + if(nlocks_) ((driver::cu_buffer*)locks)->set_zero(stream, grid_0 * nlocks_ * 2 * 4); - } stream->enqueue(kernel, {grid_0, grid_1, 1}, {info.num_threads, 1, 1}); } 
void dot::triton_c_src(std::ostream &os) const { + std::string usea = (op_ == WGRAD) ? "trans(a)" : "a"; + std::string useb = (op_ == FPROP) ? "trans(b)" : "b"; + std::string sizea = "TM, TK"; + std::string sizeb = (op_ == FPROP) ? "TN, TK" : "TK, TN"; + std::string bca0 = ":, newaxis"; + std::string bca1 = "newaxis, :"; + std::string bcb0 = (op_ == FPROP) ? ":, newaxis" : "newaxis, :"; + std::string bcb1 = (op_ == FPROP) ? "newaxis, :" : ":, newaxis"; + std::string ldb0 = (op_ == FPROP) ? "1" : "TK"; + std::string ldb1 = (op_ == FPROP) ? "TK" : "1" ; std::string result = - R"( const tunable int32 TM = {64}; const tunable int32 TN = {)" + std::to_string(BS_) + R"(}; const tunable int32 TK = {)" + std::to_string(BS_) + R"(}; void bsdot(restrict read_only align(16) )" + ab_ty_ + R"( *A, - restrict read_only align(16) )" + ab_ty_ + R"( *B, - )" + c_ty_ + R"(* C, - int32 lda, int32 ldc, int32 N, - int32* lut, int32* locks, int32 nlocks){ + restrict read_only align(16) )" + ab_ty_ + R"( *B, + )" + c_ty_ + R"(* C, + int32 lda, int32 ldc, int32 N, + int32* lut, int32* locks, int32 nlocks){ int32 ridx = get_range_id(0); int32 ridy = get_range_id(1); fp32 acc[TM, TN] = 0; @@ -100,22 +106,22 @@ void dot::triton_c_src(std::ostream &os) const { int32 ryb[TN] = 0 ... TN; int32 rka[TK] = 0 ... TK; int32 rkb[TK] = 0 ... TK; - int32 offa[TM, TK] = rxa[:, newaxis] + rka[newaxis, :]*lda; - int32 offb[TK, TN] = ryb[newaxis, :] + rkb[:, newaxis]*TK; + int32 offa[)" + sizea + "] = rxa[" + bca0 + "] + rka[" + bca1 + R"(]*lda; + int32 offb[)" + sizeb + "] = ryb[" + bcb0 + "]*" + ldb0 + " + rkb[" + bcb1 + "]*" + ldb1 + R"(; int32 *header = lut + ridy * 4; int32 offset = *(header + 0); int32 K = *(header + 1); int32 column = *(header + 2); - int32 lockid = *(header + 3); + int32 lockid = *(header + 3); int32 *plut = lut + offset * 2; for(int32 k = K; k > 0; k = k - 1){ int32 ak = *(plut + 0); int32 bk = *(plut + 1); - )" + ab_ty_ + R"(* pa[TM, TK] = A + offa + ak * TK * lda; - )" + ab_ty_ + R"(* pb[TK, TN] = B + offb + bk * TK * TN; - )" + ab_ty_ + R"( a[TM, TK] = *pa; - )" + ab_ty_ + R"( b[TK, TN] = *pb; - acc = dot(a, b, acc); + )" + ab_ty_ + "* pa[" + sizea + R"(] = A + offa + ak * TK * lda; + )" + ab_ty_ + "* pb[" + sizeb + R"(] = B + offb + bk * TK * TN; + )" + ab_ty_ + " a[" + sizea + R"(] = *pa; + )" + ab_ty_ + " b[" + sizeb + R"(] = *pb; + acc = dot()" + usea + ", " + useb + R"(, acc); plut = plut + 2; } int32 rxc[TM] = ridx * TM + (0 ... 
TM); @@ -123,20 +129,17 @@ void dot::triton_c_src(std::ostream &os) const { )" + c_ty_ + R"(" c[TM, TN] = acc; )" + c_ty_ + R"(* pc[TM, TN] = C + rxc[:, newaxis] + ryc[newaxis, :]*ldc; int1 checkc[TM, TN] = (rxc < N)[:, newaxis]; - if(lockid == 0){ + if(lockid == 0) @checkc *pc = c; - } - else{ + else { int32 *plock = locks + ridx*nlocks + lockid - 1; int32 *pcount = plock + get_num_program(0)*nlocks; while(__atomic_cas(plock, 0, 1)); int32 count = *pcount; - if(count == 0) { + if(count == 0) @checkc *pc = c; - } - else { + else @checkc *pc = c + *pc; - } *pcount = 1; __atomic_exch(plock, 0); } From 5af7e5adac58d0234129d44c8a1088cd39fb8699 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 30 Jul 2019 20:02:16 -0700 Subject: [PATCH 270/494] Made sure it works for FP16 --- examples/python/tensorflow/batchnorm.cpp | 28 ++++++++--------- examples/python/tensorflow/blocksparse.cpp | 27 ++++++++-------- examples/python/tensorflow/conv.cpp | 6 ++-- examples/python/tensorflow/dot.cpp | 6 ++-- examples/python/tensorflow/shift.cpp | 6 ++-- include/triton/dnn/base.h | 4 +-- include/triton/dnn/batchnorm.h | 4 +-- include/triton/dnn/blocksparse/dot.h | 9 ++++-- include/triton/dnn/conv.h | 2 +- include/triton/dnn/dot.h | 2 +- include/triton/dnn/shift.h | 2 +- include/triton/driver/buffer.h | 10 +++--- include/triton/tools/bench.hpp | 4 +-- lib/dnn/base.cpp | 9 ++++-- lib/dnn/blocksparse/dot.cpp | 36 ++++++++++++---------- lib/dnn/conv.cpp | 18 ++++++----- lib/dnn/dot.cpp | 2 +- lib/dnn/shift.cpp | 2 +- lib/driver/buffer.cpp | 26 +++++++++------- lib/driver/module.cpp | 2 +- lib/runtime/jit.cpp | 14 ++++----- 21 files changed, 118 insertions(+), 101 deletions(-) diff --git a/examples/python/tensorflow/batchnorm.cpp b/examples/python/tensorflow/batchnorm.cpp index 3e50aeb8b..137a84809 100644 --- a/examples/python/tensorflow/batchnorm.cpp +++ b/examples/python/tensorflow/batchnorm.cpp @@ -51,12 +51,12 @@ public: OP_REQUIRES_OK(context, context->allocate_output(1, fw_g.shape(), &fw_m)); OP_REQUIRES_OK(context, context->allocate_output(2, fw_g.shape(), &fw_v)); // triton handles - triton::driver::cu_buffer x(ctx, (CUdeviceptr)fw_x.flat().data(), false); - triton::driver::cu_buffer g(ctx, (CUdeviceptr)fw_g.flat().data(), false); - triton::driver::cu_buffer b(ctx, (CUdeviceptr)fw_b.flat().data(), false); - triton::driver::cu_buffer y(ctx, (CUdeviceptr)fw_y->flat().data(), false); - triton::driver::cu_buffer m(ctx, (CUdeviceptr)fw_m->flat().data(), false); - triton::driver::cu_buffer v(ctx, (CUdeviceptr)fw_v->flat().data(), false); + triton::driver::cu_buffer x(ctx, fw_x.tensor_data().size(), (CUdeviceptr)fw_x.tensor_data().data(), false); + triton::driver::cu_buffer g(ctx, fw_g.tensor_data().size(), (CUdeviceptr)fw_g.tensor_data().data(), false); + triton::driver::cu_buffer b(ctx, fw_b.tensor_data().size(), (CUdeviceptr)fw_b.tensor_data().data(), false); + triton::driver::cu_buffer y(ctx, fw_y->tensor_data().size(), (CUdeviceptr)fw_y->tensor_data().data(), false); + triton::driver::cu_buffer m(ctx, fw_m->tensor_data().size(), (CUdeviceptr)fw_m->tensor_data().data(), false); + triton::driver::cu_buffer v(ctx, fw_v->tensor_data().size(), (CUdeviceptr)fw_v->tensor_data().data(), false); // create config triton::dnn::batchnorm_forward batchnorm(C, 1, H, W, B, "fp32"); batchnorm.enqueue(stream, {&y, &m, &v, &x, &g, &b}); @@ -117,14 +117,14 @@ public: OP_REQUIRES_OK(context, context->allocate_output(1, fw_g.shape(), &fw_dg)); OP_REQUIRES_OK(context, context->allocate_output(2, fw_g.shape(), &fw_db)); // 
triton handles - triton::driver::cu_buffer dy(ctx, (CUdeviceptr)fw_dy.flat().data(), false); - triton::driver::cu_buffer x(ctx, (CUdeviceptr)fw_x.flat().data(), false); - triton::driver::cu_buffer g(ctx, (CUdeviceptr)fw_g.flat().data(), false); - triton::driver::cu_buffer m(ctx, (CUdeviceptr)fw_m.flat().data(), false); - triton::driver::cu_buffer v(ctx, (CUdeviceptr)fw_v.flat().data(), false); - triton::driver::cu_buffer dx(ctx, (CUdeviceptr)fw_dx->flat().data(), false); - triton::driver::cu_buffer dg(ctx, (CUdeviceptr)fw_dg->flat().data(), false); - triton::driver::cu_buffer db(ctx, (CUdeviceptr)fw_db->flat().data(), false); + triton::driver::cu_buffer dy(ctx, fw_dy.tensor_data().size(), (CUdeviceptr)fw_dy.tensor_data().data(), false); + triton::driver::cu_buffer x(ctx, fw_x.tensor_data().size(), (CUdeviceptr)fw_x.tensor_data().data(), false); + triton::driver::cu_buffer g(ctx, fw_g.tensor_data().size(), (CUdeviceptr)fw_g.tensor_data().data(), false); + triton::driver::cu_buffer m(ctx, fw_m.tensor_data().size(), (CUdeviceptr)fw_m.tensor_data().data(), false); + triton::driver::cu_buffer v(ctx, fw_v.tensor_data().size(), (CUdeviceptr)fw_v.tensor_data().data(), false); + triton::driver::cu_buffer dx(ctx, fw_dx->tensor_data().size(), (CUdeviceptr)fw_dx->tensor_data().data(), false); + triton::driver::cu_buffer dg(ctx, fw_dg->tensor_data().size(), (CUdeviceptr)fw_dg->tensor_data().data(), false); + triton::driver::cu_buffer db(ctx, fw_db->tensor_data().size(), (CUdeviceptr)fw_db->tensor_data().data(), false); // create config triton::dnn::batchnorm_backward batchnorm(C, 1, H, W, B, "fp32"); batchnorm.enqueue(stream, {&dx, &dg, &db, &dy, &x, &g, &m, &v}); diff --git a/examples/python/tensorflow/blocksparse.cpp b/examples/python/tensorflow/blocksparse.cpp index 5da231a12..0d37d382d 100644 --- a/examples/python/tensorflow/blocksparse.cpp +++ b/examples/python/tensorflow/blocksparse.cpp @@ -14,6 +14,7 @@ #include "tensorflow/core/util/padding.h" #include "tensorflow/core/util/tensor_format.h" #include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/allocation_description.pb.h" using namespace tensorflow; using shape_inference::DimensionHandle; @@ -21,6 +22,7 @@ using shape_inference::InferenceContext; using shape_inference::ShapeHandle; using GPUDevice = Eigen::GpuDevice; + Status XpropShape(InferenceContext* ctx) { int K; TF_RETURN_IF_ERROR(ctx->GetAttr( "K", &K)); @@ -120,23 +122,20 @@ public: shape_c.AddDim(params_.K); Tensor* c = nullptr; OP_REQUIRES_OK(context, context->allocate_output(0, shape_c, &c)); - // allocate locks - int gridN = (N + 63)/64; - Tensor* locks; - TensorShape shape_l; - if (params_.locks > 0) - shape_l.AddDim(gridN * params_.locks * 2); - OP_REQUIRES_OK(context, context->allocate_output(1, shape_l, &locks)); // wrap tensorflow handles - triton::driver::cu_buffer da(ctx, (CUdeviceptr)a.flat().data(), false); - triton::driver::cu_buffer db(ctx, (CUdeviceptr)b.flat().data(), false); - triton::driver::cu_buffer dc(ctx, (CUdeviceptr)c->flat().data(), false); - triton::driver::cu_buffer dlut(ctx, (CUdeviceptr)lut.flat().data(), false); - triton::driver::cu_buffer dlocks(ctx, (CUdeviceptr)locks->flat().data(), false); + triton::driver::cu_buffer da(ctx, a.tensor_data().size(), (CUdeviceptr)a.tensor_data().data(), false); + triton::driver::cu_buffer db(ctx, b.tensor_data().size(), (CUdeviceptr)b.tensor_data().data(), false); + triton::driver::cu_buffer dc(ctx, c->tensor_data().size(), (CUdeviceptr)c->tensor_data().data(), false); + 
triton::driver::cu_buffer dlut(ctx, lut.tensor_data().size(), (CUdeviceptr)lut.tensor_data().data(), false); // create profile - triton::dnn::blocksparse::dot dot(N, params_.K, params_.segments, params_.C, "fp32", params_.bsize, params_.locks, OP); + triton::dnn::blocksparse::dot dot(N, params_.K, params_.segments, params_.C, "fp16", params_.bsize, params_.locks, params_.blocks, OP); // blocksparse matmul - dot.enqueue(stream, {&da, &db, &dc, &dlut, &dlocks}, triton::dnn::NO_TUNING); + triton::dnn::base* op = dot.enqueue(stream, {&da, &db, &dc, &dlut}, triton::dnn::FULL_TUNING); + triton::driver::buffer* locks_buffer = ((triton::dnn::blocksparse::dot*)op)->get_locks(); + Tensor *tmp = nullptr; + TensorShape tmp_shapes; + tmp_shapes.AddDim(locks_buffer->size() / 4); + OP_REQUIRES_OK(context, context->allocate_output(1, tmp_shapes, &tmp)); } private: diff --git a/examples/python/tensorflow/conv.cpp b/examples/python/tensorflow/conv.cpp index 4ed457021..f06bf679c 100644 --- a/examples/python/tensorflow/conv.cpp +++ b/examples/python/tensorflow/conv.cpp @@ -50,8 +50,8 @@ public: int32_t pad_d = 0, pad_h = 0, pad_w = 0; bool has_bias = false; // wrap buffers - triton::driver::cu_buffer a(ctx, (CUdeviceptr)tfa.flat().data(), false); - triton::driver::cu_buffer b(ctx, (CUdeviceptr)tfb.flat().data(), false); + triton::driver::cu_buffer a(ctx, tfa.tensor_data().size(), (CUdeviceptr)tfa.tensor_data().data(), false); + triton::driver::cu_buffer b(ctx, tfb.tensor_data().size(), (CUdeviceptr)tfb.tensor_data().data(), false); triton::driver::buffer* bias = nullptr; // template triton::dnn::conv conv(B, C, @@ -68,7 +68,7 @@ public: Tensor* tfc = nullptr; TensorShape out_shape({c_shapes[0], c_shapes[1], c_shapes[2], c_shapes[3]}); OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &tfc)); - triton::driver::cu_buffer c(ctx, (CUdeviceptr)tfc->flat().data(), false); + triton::driver::cu_buffer c(ctx, tfc->tensor_data().size(), (CUdeviceptr)tfc->tensor_data().data(), false); // enqueue conv.enqueue(stream, {&a, &b, &c, bias}); } diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index 368ef8be3..7acedb7e9 100644 --- a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -45,9 +45,9 @@ class DotOp : public OpKernel { if (out_shape.num_elements() == 0) return; // matrix multiplication parameters - triton::driver::cu_buffer da(ctx, (CUdeviceptr)a.flat().data(), false); - triton::driver::cu_buffer db(ctx, (CUdeviceptr)b.flat().data(), false); - triton::driver::cu_buffer dc(ctx, (CUdeviceptr)c->flat().data(), false); + triton::driver::cu_buffer da(ctx, a.tensor_data().size(), (CUdeviceptr)a.tensor_data().data(), false); + triton::driver::cu_buffer db(ctx, b.tensor_data().size(), (CUdeviceptr)b.tensor_data().data(), false); + triton::driver::cu_buffer dc(ctx, c->tensor_data().size(), (CUdeviceptr)c->tensor_data().data(), false); // template triton::dnn::dot dot(M, N, K, false, false, "fp16", "fp16", 8, 8); dot.enqueue(stream, {&da, &db, &dc}); diff --git a/examples/python/tensorflow/shift.cpp b/examples/python/tensorflow/shift.cpp index 2a3973fbb..28e10b679 100644 --- a/examples/python/tensorflow/shift.cpp +++ b/examples/python/tensorflow/shift.cpp @@ -119,9 +119,9 @@ public: if (out_shapes.num_elements() == 0) return; // matrix multiplication parameters - triton::driver::cu_buffer da(ctx, (CUdeviceptr)tf_a.flat().data(), false); - triton::driver::cu_buffer db(ctx, (CUdeviceptr)tf_b.flat().data(), false); - triton::driver::cu_buffer dc(ctx, 
(CUdeviceptr)tf_c->flat().data(), false); + triton::driver::cu_buffer da(ctx, tf_a.tensor_data().size(), (CUdeviceptr)tf_a.tensor_data().data(), false); + triton::driver::cu_buffer db(ctx, tf_b.tensor_data().size(), (CUdeviceptr)tf_b.tensor_data().data(), false); + triton::driver::cu_buffer dc(ctx, tf_c->tensor_data().size(), (CUdeviceptr)tf_c->tensor_data().data(), false); shift.enqueue(stream, {&da, &db, &dc}, triton::dnn::PARTIAL_TUNING); } diff --git a/include/triton/dnn/base.h b/include/triton/dnn/base.h index 266f29803..b9e2b886b 100644 --- a/include/triton/dnn/base.h +++ b/include/triton/dnn/base.h @@ -61,7 +61,7 @@ protected: private: // initialize - virtual void init_impl(driver::stream *, driver::cu_module *) = 0; + virtual void init_impl(driver::stream *, driver::cu_module *, triton::runtime::launch_information) = 0; // deinitialize virtual void deinit_impl() = 0; // enqueue @@ -86,7 +86,7 @@ public: // clone virtual base* clone() const = 0; // enqueue - void enqueue(driver::stream* stream, std::vector args, autotuning_t autotune = PARTIAL_TUNING); + base* enqueue(driver::stream* stream, std::vector args, autotuning_t autotune = PARTIAL_TUNING); // get profile launch_context_t get_launch_context(driver::stream *stream, std::vector args, autotuning_t autotune = PARTIAL_TUNING); diff --git a/include/triton/dnn/batchnorm.h b/include/triton/dnn/batchnorm.h index 8f9053225..32c006b99 100644 --- a/include/triton/dnn/batchnorm.h +++ b/include/triton/dnn/batchnorm.h @@ -38,7 +38,7 @@ namespace dnn{ class batchnorm_forward: public base { private: // init - void init_impl(driver::stream *, driver::cu_module *) { } + void init_impl(driver::stream *, driver::cu_module *, triton::runtime::launch_information) { } void deinit_impl() { } // enqueue @@ -74,7 +74,7 @@ private: class batchnorm_backward: public base{ private: // init - void init_impl(driver::stream *, driver::cu_module *) { } + void init_impl(driver::stream *, driver::cu_module *, triton::runtime::launch_information) { } void deinit_impl() { } // enqueue void enqueue_impl(driver::stream *stream, driver::kernel *kernel, diff --git a/include/triton/dnn/blocksparse/dot.h b/include/triton/dnn/blocksparse/dot.h index 01c94a2fe..98a1ce6fa 100644 --- a/include/triton/dnn/blocksparse/dot.h +++ b/include/triton/dnn/blocksparse/dot.h @@ -26,14 +26,16 @@ private: std::vector search_space() const; params_t heuristics() const; // init - void init_impl(driver::stream *stream, driver::cu_module *module); + void init_impl(driver::stream *stream, driver::cu_module *module, triton::runtime::launch_information info); // deinit void deinit_impl(); public: // constructor - dot(int32_t N, int32_t K, int32_t S, int32_t C, const std::string &ty, int32_t BS, int32_t nlocks, op_t op = FPROP); + dot(int32_t N, int32_t K, int32_t S, int32_t C, const std::string &ty, int32_t BS, int32_t nlocks, int32_t nblocks, op_t op = FPROP); // triton-c source void triton_c_src(std::ostream &os) const; + // locks + driver::buffer* get_locks() const; // clone base* clone() const; @@ -46,7 +48,8 @@ private: int32_t K_; int32_t BS_; int32_t nlocks_; - driver::buffer *locks_; + int32_t nblocks_; + std::shared_ptr locks_; op_t op_; }; diff --git a/include/triton/dnn/conv.h b/include/triton/dnn/conv.h index d81ff872d..2745d72bc 100644 --- a/include/triton/dnn/conv.h +++ b/include/triton/dnn/conv.h @@ -24,7 +24,7 @@ private: void build_b_deltas(); void build_a_deltas(); void build_masks(); - void init_impl(driver::stream *, driver::cu_module *); + void init_impl(driver::stream 
*stream, driver::cu_module *module, triton::runtime::launch_information info); void deinit_impl() { } // enqueue diff --git a/include/triton/dnn/dot.h b/include/triton/dnn/dot.h index 6ba3f0b24..30836357f 100644 --- a/include/triton/dnn/dot.h +++ b/include/triton/dnn/dot.h @@ -9,7 +9,7 @@ namespace dnn{ class dot: public base { private: // initialize - void init_impl(driver::stream *, driver::cu_module *); + void init_impl(driver::stream *, driver::cu_module *, triton::runtime::launch_information); void deinit_impl() { } // enqueue diff --git a/include/triton/dnn/shift.h b/include/triton/dnn/shift.h index 25b9547f3..35ad312e0 100644 --- a/include/triton/dnn/shift.h +++ b/include/triton/dnn/shift.h @@ -49,7 +49,7 @@ enum layout_t { class shift: public base { private: // initialize and enqueue - void init_impl(driver::stream *stream, driver::cu_module *module); + void init_impl(driver::stream *stream, driver::cu_module *module, triton::runtime::launch_information info); void deinit_impl(); void enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, diff --git a/include/triton/driver/buffer.h b/include/triton/driver/buffer.h index 0502f1ff4..a0502f789 100755 --- a/include/triton/driver/buffer.h +++ b/include/triton/driver/buffer.h @@ -36,14 +36,16 @@ class stream; // Base class buffer : public polymorphic_resource { public: - buffer(driver::context* ctx, CUdeviceptr cl, bool take_ownership); - buffer(driver::context* ctx, cl_mem cl, bool take_ownership); - buffer(driver::context* ctx, host_buffer_t hst, bool take_ownership); + buffer(driver::context* ctx, size_t size, CUdeviceptr cl, bool take_ownership); + buffer(driver::context* ctx, size_t size, cl_mem cl, bool take_ownership); + buffer(driver::context* ctx, size_t size, host_buffer_t hst, bool take_ownership); static buffer* create(driver::context* ctx, size_t size); driver::context* context(); + size_t size(); protected: driver::context* context_; + size_t size_; }; // CPU @@ -65,7 +67,7 @@ class cu_buffer: public buffer { public: cu_buffer(driver::context* context, size_t size); - cu_buffer(driver::context* context, CUdeviceptr cu, bool take_ownership); + cu_buffer(driver::context* context, size_t size, CUdeviceptr cu, bool take_ownership); void set_zero(triton::driver::stream *queue, size_t size); }; diff --git a/include/triton/tools/bench.hpp b/include/triton/tools/bench.hpp index 6ac72fec7..6d71d27ae 100644 --- a/include/triton/tools/bench.hpp +++ b/include/triton/tools/bench.hpp @@ -38,8 +38,8 @@ inline double bench(std::function const & op, driver::stream * stream) while(total_time*1e-9 < 1e-3){ float norm = 1; // normalize clock if possible to get roughly constant result - if(auto cu_device = dynamic_cast(device)) - norm = (float)cu_device->current_sm_clock()/cu_device->max_sm_clock(); +// if(auto cu_device = dynamic_cast(device)) +// norm = (float)cu_device->current_sm_clock()/cu_device->max_sm_clock(); tmr.start(); op(); stream->synchronize(); diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index e5aa7ad45..a75334b90 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -44,11 +44,12 @@ std::pair base::get_profile_impl(driver::stream *stream, std::v auto benchmark = [&](triton::driver::kernel* kernel, rt::launch_information info) { // launch info - clone->init_impl(stream, (triton::driver::cu_module*)kernel->module()); + clone->init_impl(stream, (triton::driver::cu_module*)kernel->module(), info); clone->enqueue_impl(stream, kernel, args, info); stream->synchronize(); double ts = 
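// NOTE (editor's annotation, not from the original commit): tools::bench
// reports the average time per call in nanoseconds (bench.hpp accumulates
// `total_time*1e-9` seconds), so the `num_flops() / ts * 1e-3` returned a
// few lines below comes out in TFLOP/s, the figure the auto-tuner maximizes.
// Units are inferred from the surrounding code; treat this as a reading aid.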
triton::tools::bench([&](){ clone->enqueue_impl(stream, kernel, args, info); }, stream); clone->deinit_impl(); +// std::cout << ts * 1e-6 << std::endl; return num_flops() / ts * 1e-3; }; // auto-tune and save result @@ -65,7 +66,8 @@ std::pair base::get_profile_impl(driver::stream *stream, std::v jit->add_module(name_.c_str(), src.c_str(), params); } triton::driver::kernel* kernel = jit->get_function(name_.c_str()); - clone->init_impl(stream, (triton::driver::cu_module*)kernel->module()); + rt::launch_information info = jit->get_launch_info(name_.c_str()); + clone->init_impl(stream, (triton::driver::cu_module*)kernel->module(), info); } /* retrieved compiled template */ else { @@ -75,9 +77,10 @@ std::pair base::get_profile_impl(driver::stream *stream, std::v return {it->first, jit}; } -void base::enqueue(driver::stream *stream, std::vector args, autotuning_t autotune) { +base* base::enqueue(driver::stream *stream, std::vector args, autotuning_t autotune) { launch_context_t info = get_launch_context(stream, args, autotune); info.op->enqueue_impl(stream, info.kernel, args, info.info); + return info.op; } launch_context_t base::get_launch_context(driver::stream *stream, std::vector args, autotuning_t autotune) { diff --git a/lib/dnn/blocksparse/dot.cpp b/lib/dnn/blocksparse/dot.cpp index 2f69677a8..ff021cca8 100644 --- a/lib/dnn/blocksparse/dot.cpp +++ b/lib/dnn/blocksparse/dot.cpp @@ -6,7 +6,7 @@ namespace blocksparse{ size_t dot::num_flops() const { - + return 2.*nblocks_*BS_*BS_*N_; } bool dot::operator <(const base& other) const { @@ -30,25 +30,23 @@ base * dot::clone() const { } dot::dot(int32_t N, int32_t K, int32_t S, int32_t C, - const std::string& ty, int32_t BS, int32_t nlocks, op_t op): + const std::string& ty, int32_t BS, int32_t nlocks, int32_t nblocks, op_t op): base("bsdot"), N_(N), K_(K), S_(S), C_(C), ab_ty_(ty), c_ty_(ty), - BS_(BS), nlocks_(nlocks), op_(op){ + BS_(BS), nlocks_(nlocks), nblocks_(nblocks), op_(op){ } -void dot::init_impl(driver::stream *stream, driver::cu_module *module) { -// int32_t TM = info.globals["TM"]; -// size_t grid_0 = (N_ + TM - 1) / TM; -// if(nlocks_){ -// locks_ = triton::driver::buffer::create(stream->context(), grid_0 * nlocks_ * 2 * 4); -// ((driver::cu_buffer*)locks_)->set_zero(stream, grid_0 * nlocks_ * 2 * 4); -// } +void dot::init_impl(driver::stream *stream, driver::cu_module *module, triton::runtime::launch_information info) { + int32_t TM = info.globals["TM"]; + size_t grid_0 = (N_ + TM - 1) / TM; + if(nlocks_ && !locks_){ + locks_.reset(triton::driver::buffer::create(stream->context(), grid_0 * nlocks_ * 2 * 4)); + ((driver::cu_buffer*)locks_.get())->set_zero(stream, grid_0 * nlocks_ * 2 * 4); + } } void dot::deinit_impl() { -// if(locks_) -// delete locks_; } void dot::enqueue_impl(driver::stream *stream, driver::kernel *kernel, @@ -57,7 +55,6 @@ void dot::enqueue_impl(driver::stream *stream, driver::kernel *kernel, driver::buffer *b = args[1]; driver::buffer *c = args[2]; driver::buffer *lut = args[3]; - driver::buffer *locks = args[4]; int32_t lda = N_; int32_t ldc = N_; kernel->setArg(0, a); @@ -67,16 +64,20 @@ void dot::enqueue_impl(driver::stream *stream, driver::kernel *kernel, kernel->setArg(4, ldc); kernel->setArg(5, N_); kernel->setArg(6, lut); - kernel->setArg(7, locks); + kernel->setArg(7, locks_.get()); kernel->setArg(8, nlocks_); int32_t TM = info.globals["TM"]; size_t grid_0 = (N_ + TM - 1) / TM; size_t grid_1 = S_; if(nlocks_) - ((driver::cu_buffer*)locks)->set_zero(stream, grid_0 * nlocks_ * 2 * 4); + 
((driver::cu_buffer*)locks_.get())->set_zero(stream, grid_0 * nlocks_ * 2 * 4); stream->enqueue(kernel, {grid_0, grid_1, 1}, {info.num_threads, 1, 1}); } +driver::buffer* dot::get_locks() const { + return locks_.get(); +} + void dot::triton_c_src(std::ostream &os) const { std::string usea = (op_ == WGRAD) ? "trans(a)" : "a"; std::string useb = (op_ == FPROP) ? "trans(b)" : "b"; @@ -90,7 +91,7 @@ void dot::triton_c_src(std::ostream &os) const { std::string ldb1 = (op_ == FPROP) ? "TK" : "1" ; std::string result = R"( - const tunable int32 TM = {64}; + const tunable int32 TM = {32, 64, 128}; const tunable int32 TN = {)" + std::to_string(BS_) + R"(}; const tunable int32 TK = {)" + std::to_string(BS_) + R"(}; @@ -106,6 +107,7 @@ void dot::triton_c_src(std::ostream &os) const { int32 ryb[TN] = 0 ... TN; int32 rka[TK] = 0 ... TK; int32 rkb[TK] = 0 ... TK; + int1 checka[TM, TK] = (rxa < N)[:, newaxis]; int32 offa[)" + sizea + "] = rxa[" + bca0 + "] + rka[" + bca1 + R"(]*lda; int32 offb[)" + sizeb + "] = ryb[" + bcb0 + "]*" + ldb0 + " + rkb[" + bcb1 + "]*" + ldb1 + R"(; int32 *header = lut + ridy * 4; @@ -119,7 +121,7 @@ void dot::triton_c_src(std::ostream &os) const { int32 bk = *(plut + 1); )" + ab_ty_ + "* pa[" + sizea + R"(] = A + offa + ak * TK * lda; )" + ab_ty_ + "* pb[" + sizeb + R"(] = B + offb + bk * TK * TN; - )" + ab_ty_ + " a[" + sizea + R"(] = *pa; + )" + ab_ty_ + " a[" + sizea + R"(] = checka ? *pa : 0; )" + ab_ty_ + " b[" + sizeb + R"(] = *pb; acc = dot()" + usea + ", " + useb + R"(, acc); plut = plut + 2; diff --git a/lib/dnn/conv.cpp b/lib/dnn/conv.cpp index c20701a4b..f54c63560 100644 --- a/lib/dnn/conv.cpp +++ b/lib/dnn/conv.cpp @@ -278,7 +278,7 @@ size_t conv::num_flops() const{ return 2.*M_*N_*K_; } -void conv::init_impl(driver::stream *stream, triton::driver::cu_module* module) { +void conv::init_impl(driver::stream *stream, triton::driver::cu_module* module, triton::runtime::launch_information info) { auto init_lut = [&](bool is_cst, const char *name, std::vector host) -> triton::driver::buffer*{ if(host.empty()) return nullptr; @@ -293,12 +293,16 @@ void conv::init_impl(driver::stream *stream, triton::driver::cu_module* module) stream->write(buffer, false, 0, nbytes, host.data()); return buffer; }; - - d_a_deltas_ = init_lut(is_a_deltas_cst, "delta", h_a_deltas_); - d_b_deltas_ = init_lut(is_b_deltas_cst_, "b_delta", h_b_deltas_); - d_masks_ = init_lut(is_mask_cst_, "masks", h_masks_); - d_locks_ = triton::driver::buffer::create(stream->context(), max_grid_0_*max_grid_1_*4*2); - ((triton::driver::cu_buffer*)d_locks_)->set_zero(stream, max_grid_0_*max_grid_1_*4*2); + if(d_a_deltas_ == nullptr) + d_a_deltas_ = init_lut(is_a_deltas_cst, "delta", h_a_deltas_); + if(d_b_deltas_ == nullptr) + d_b_deltas_ = init_lut(is_b_deltas_cst_, "b_delta", h_b_deltas_); + if(d_masks_ == nullptr) + d_masks_ = init_lut(is_mask_cst_, "masks", h_masks_); + if(d_locks_ == nullptr){ + d_locks_ = triton::driver::buffer::create(stream->context(), max_grid_0_*max_grid_1_*4*2); + ((triton::driver::cu_buffer*)d_locks_)->set_zero(stream, max_grid_0_*max_grid_1_*4*2); + } } void conv::set_arg(driver::kernel *kernel, diff --git a/lib/dnn/dot.cpp b/lib/dnn/dot.cpp index 114ec7450..1b5e061d3 100644 --- a/lib/dnn/dot.cpp +++ b/lib/dnn/dot.cpp @@ -39,7 +39,7 @@ base* dot::clone() const { return new dot(*this); } -void dot::init_impl(driver::stream* stream, driver::cu_module *) { +void dot::init_impl(driver::stream* stream, driver::cu_module *, runtime::launch_information) { std::vector hlocks(2048, 0); 
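// NOTE (editor's annotation, not from the original commit): only the
// signature changes in this hunk, threading launch_information through to
// every init_impl override; the surrounding context lines keep the
// 2048-entry (8 KB) lock buffer cached across runs, allocating it from the
// zero-filled host vector above only while locks_ is still null.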
if(locks_ == nullptr) locks_ = triton::driver::buffer::create(stream->context(), hlocks.size()*4); diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index 58c62dd46..3bf5e1035 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -226,7 +226,7 @@ bool shift::operator <(const base& other) const{ y->bias_); } -void shift::init_impl(driver::stream *stream, driver::cu_module *module) { +void shift::init_impl(driver::stream *stream, driver::cu_module *module, triton::runtime::launch_information info) { build_delta_a(); triton::driver::buffer* delta_a = ((triton::driver::cu_module*)module)->symbol("delta_a"); stream->write(delta_a, false, 0, h_delta_a.size()*4, h_delta_a.data()); diff --git a/lib/driver/buffer.cpp b/lib/driver/buffer.cpp index cf96aa115..111091fdf 100755 --- a/lib/driver/buffer.cpp +++ b/lib/driver/buffer.cpp @@ -36,20 +36,24 @@ namespace driver // -buffer::buffer(driver::context* ctx, CUdeviceptr cu, bool take_ownership) - : polymorphic_resource(cu, take_ownership), context_(ctx) { } +buffer::buffer(driver::context* ctx, size_t size, CUdeviceptr cu, bool take_ownership) + : polymorphic_resource(cu, take_ownership), context_(ctx), size_(size) { } -buffer::buffer(driver::context* ctx, cl_mem cl, bool take_ownership) - : polymorphic_resource(cl, take_ownership), context_(ctx) { } +buffer::buffer(driver::context* ctx, size_t size, cl_mem cl, bool take_ownership) + : polymorphic_resource(cl, take_ownership), context_(ctx), size_(size) { } -buffer::buffer(driver::context* ctx, host_buffer_t hst, bool take_ownership) - : polymorphic_resource(hst, take_ownership), context_(ctx) { } +buffer::buffer(driver::context* ctx, size_t size, host_buffer_t hst, bool take_ownership) + : polymorphic_resource(hst, take_ownership), context_(ctx), size_(size) { } driver::context* buffer::context() { return context_; } +size_t buffer::size() { + return size_; +} + buffer* buffer::create(driver::context* ctx, size_t size) { switch(ctx->backend()){ case CUDA: return new cu_buffer(ctx, size); @@ -62,14 +66,14 @@ buffer* buffer::create(driver::context* ctx, size_t size) { // host_buffer::host_buffer(driver::context *context, size_t size) - : buffer(context, host_buffer_t(), true){ + : buffer(context, size, host_buffer_t(), true){ hst_->data = new char[size]; } // ocl_buffer::ocl_buffer(driver::context* context, size_t size) - : buffer(context, cl_mem(), true){ + : buffer(context, size, cl_mem(), true){ cl_int err; *cl_ = dispatch::clCreateBuffer(*context->cl(), CL_MEM_READ_WRITE, size, NULL, &err); check(err); @@ -79,13 +83,13 @@ ocl_buffer::ocl_buffer(driver::context* context, size_t size) // cu_buffer::cu_buffer(driver::context* context, size_t size) - : buffer(context, CUdeviceptr(), true) { + : buffer(context, size, CUdeviceptr(), true) { cu_context::context_switcher ctx_switch(*context_); dispatch::cuMemAlloc(&*cu_, size); } -cu_buffer::cu_buffer(driver::context* context, CUdeviceptr cu, bool take_ownership) - : buffer(context, cu, take_ownership){ +cu_buffer::cu_buffer(driver::context* context, size_t size, CUdeviceptr cu, bool take_ownership) + : buffer(context, size, cu, take_ownership){ } void cu_buffer::set_zero(driver::stream* queue, size_t size) diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 4ff863666..d2c31fadd 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -275,7 +275,7 @@ cu_buffer* cu_module::symbol(const char *name) const{ CUdeviceptr handle; size_t size; dispatch::cuModuleGetGlobal_v2(&handle, &size, *cu_, name); - return new cu_buffer(ctx_, 
handle, false); + return new cu_buffer(ctx_, size, handle, false); } diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index 6fa727406..12eebdd3c 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -37,13 +37,13 @@ void parallel_loop_nest(std::vector const & ranges, size_t D = ranges.size(); std::vector values(D, 0); // thread pools -// ThreadPool pool(nthreads); + ThreadPool pool(nthreads); // Start with innermost loop size_t i = D - 1; while(true){ // Execute function -// pool.enqueue(f,values); - f(values); + pool.enqueue(f,values); +// f(values); while(values[i]++ == ranges[i] - 1){ if(i == 0) return; @@ -51,7 +51,7 @@ void parallel_loop_nest(std::vector const & ranges, } i = D - 1; // Short sleep so that the thread pool doesn't grow too big -// std::this_thread::sleep_for(std::chrono::microseconds(1)); + std::this_thread::sleep_for(std::chrono::microseconds(1)); } } @@ -212,9 +212,9 @@ jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t ben best.perf = perf; best.params = params; } -// for(size_t i = 0; i < params.size(); i++) -// std::cout << ((i==0)?"":", ") << params[i] << std::flush; -// std::cout << ", " << perf << " [ " << best.perf << " ] " << std::endl; + for(size_t i = 0; i < params.size(); i++) + std::cout << ((i==0)?"":", ") << params[i] << std::flush; + std::cout << ", " << perf << " [ " << best.perf << " ] " << std::endl; } }; From bb32ac56c92a97a46f58c168ef77451742a8433d Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 31 Jul 2019 15:11:10 -0700 Subject: [PATCH 271/494] [codegen/optimize_dce.cpp] fixed bugs whereby barriers were removed by DCE --- examples/python/tensorflow/blocksparse.cpp | 2 +- include/triton/dnn/heuristics.h | 40 ++++++++++++++++++++-- lib/codegen/alignment_info.cpp | 1 + lib/codegen/optimize_dce.cpp | 3 +- lib/codegen/selection.cpp | 2 ++ lib/codegen/tune.cpp | 6 ++-- lib/dnn/base.cpp | 4 +-- lib/dnn/blocksparse/dot.cpp | 19 ++++++---- 8 files changed, 60 insertions(+), 17 deletions(-) diff --git a/examples/python/tensorflow/blocksparse.cpp b/examples/python/tensorflow/blocksparse.cpp index 0d37d382d..38b335689 100644 --- a/examples/python/tensorflow/blocksparse.cpp +++ b/examples/python/tensorflow/blocksparse.cpp @@ -130,7 +130,7 @@ public: // create profile triton::dnn::blocksparse::dot dot(N, params_.K, params_.segments, params_.C, "fp16", params_.bsize, params_.locks, params_.blocks, OP); // blocksparse matmul - triton::dnn::base* op = dot.enqueue(stream, {&da, &db, &dc, &dlut}, triton::dnn::FULL_TUNING); + triton::dnn::base* op = dot.enqueue(stream, {&da, &db, &dc, &dlut}, triton::dnn::NO_TUNING); triton::driver::buffer* locks_buffer = ((triton::dnn::blocksparse::dot*)op)->get_locks(); Tensor *tmp = nullptr; TensorShape tmp_shapes; diff --git a/include/triton/dnn/heuristics.h b/include/triton/dnn/heuristics.h index ab8af7d32..d9bd01e75 100644 --- a/include/triton/dnn/heuristics.h +++ b/include/triton/dnn/heuristics.h @@ -7,10 +7,12 @@ namespace triton{ namespace dnn{ +/* Dense matrix multiplication */ + typedef std::vector params_t; typedef std::tuple trans_key_t; typedef std::tuple size_key_t; -static const std::map> params = { +static const std::map> dot_params = { /* NN */ {trans_key_t(false, false), std::map{ {size_key_t(16, 16), {2, 8, 16, 4, 16, 2, 2, 1, 1, 16, 32, 8, 4, 1}}, @@ -108,7 +110,7 @@ static const std::map> params = { // small search space for partial auto-tuning inline std::vector dot_search_space(bool AT, bool BT) { std::vector result; - for(auto x: params.at(trans_key_t{AT, 
BT})) + for(auto x: dot_params.at(trans_key_t{AT, BT})) result.push_back(x.second); return result; } @@ -118,9 +120,41 @@ inline params_t dot_heuristics(bool AT, bool BT, size_t M, size_t N, size_t K) { size_t TM = 128; size_t TN = 128; // return {4, 4, 128, 8, 4, 128, 2, 2, 2, 2, 32, 32, 16, 1}; - return params.at(trans_key_t{AT, BT}).at(size_key_t{TM, TN}); + return dot_params.at(trans_key_t{AT, BT}).at(size_key_t{TM, TN}); } + +/* Block-sparse matrix multiplication */ + +static const std::map, std::map> bsdot_params = { + /* 32x32 */ + {{true, 32}, std::map{ + {32, {2, 2, 32, 32, 2, 2, 4, 8, 32, 32, 8, 4, 16}}, + {64, {2, 2, 64, 32, 2, 1, 16, 4, 4, 32, 16, 2, 4}}, + {128, {2, 2, 128, 32, 4, 1, 32, 4, 4, 32, 8, 4, 16}} + }}, + {{false, 32}, std::map{ + {32, {2, 2, 32, 32, 1, 1, 8, 4, 4, 32, 8, 4, 8}}, + {64, {2, 2, 64, 32, 2, 1, 16, 4, 4, 32, 16, 4, 8}}, + {128, {2, 2, 128, 32, 4, 1, 32, 4, 4, 32, 32, 4, 8}} + }} +}; + +// small search space for partial auto-tuning +inline std::vector bsdot_search_space(bool is_fprop, size_t block_size) { + std::vector result; + for(auto x: bsdot_params.at({is_fprop, block_size})) + result.push_back(x.second); + return result; +} + +// simple parameter heuristics +inline params_t bsdot_heuristics(bool is_fprop, size_t block_size, size_t N, size_t S) { + return bsdot_params.at({is_fprop,block_size}).at(128); +} + + } } + #endif diff --git a/lib/codegen/alignment_info.cpp b/lib/codegen/alignment_info.cpp index 87df925df..ed20e01fc 100644 --- a/lib/codegen/alignment_info.cpp +++ b/lib/codegen/alignment_info.cpp @@ -303,6 +303,7 @@ void alignment_info::run(ir::module &mod) { for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i: block->get_inst_list()){ populate_max_contiguous(i); +// std::cout << i->get_name() << " " << is_constant_.at(i).num_cst << " " << starting_multiple_.at(i) << " " << max_contiguous_.at(i) << std::endl; } } diff --git a/lib/codegen/optimize_dce.cpp b/lib/codegen/optimize_dce.cpp index 9508cfa2e..ec42729ec 100644 --- a/lib/codegen/optimize_dce.cpp +++ b/lib/codegen/optimize_dce.cpp @@ -20,7 +20,8 @@ void optimize_dce::run(ir::module &mod) { for(ir::instruction *i: block->get_inst_list()){ if(dynamic_cast(i) || dynamic_cast(i) || dynamic_cast(i) || dynamic_cast(i) || dynamic_cast(i) - || dynamic_cast(i) || dynamic_cast(i) || dynamic_cast(i) ){ + || dynamic_cast(i) || dynamic_cast(i) || dynamic_cast(i) + || dynamic_cast(i)){ work_list.push_back(i); marked.insert(i); } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index ad7e395b1..a57713f38 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -368,6 +368,8 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::functionadd_memfence(module, builder); + tgt_->add_barrier(module, builder); return (Instruction*)res; } if(ir::atomic_add_inst* ii = dynamic_cast(inst)){ diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 1bb2701bc..a28fd827e 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -247,14 +247,14 @@ void tune::run(ir::module &mod) { size_t addr_space = ptr_ty->get_pointer_address_space(); if(addr_space < 4){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 2, 2)); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 2, 8)); *params_.at(i).at("nts.d0") = *tmp; } } if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 2, 2)); - 
std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 2, 2)); + std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 2, 8)); + std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 2, 8)); *params_.at(i).at("nts.d0") = *tmp1; *params_.at(i).at("nts.d1") = *tmp2; } diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index a75334b90..033e2497c 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -61,8 +61,8 @@ std::pair base::get_profile_impl(driver::stream *stream, std::v jit->add_module(name_.c_str(), src.c_str(), best.params); } else { -// params_t params = heuristics(); - params_t params = jit->get_valid(name_.c_str(), src.c_str()); + params_t params = heuristics(); +// params_t params = jit->get_valid(name_.c_str(), src.c_str()); jit->add_module(name_.c_str(), src.c_str(), params); } triton::driver::kernel* kernel = jit->get_function(name_.c_str()); diff --git a/lib/dnn/blocksparse/dot.cpp b/lib/dnn/blocksparse/dot.cpp index ff021cca8..f38030366 100644 --- a/lib/dnn/blocksparse/dot.cpp +++ b/lib/dnn/blocksparse/dot.cpp @@ -1,3 +1,4 @@ +#include "triton/dnn/heuristics.h" #include "triton/dnn/blocksparse/dot.h" namespace triton{ @@ -18,11 +19,11 @@ bool dot::operator <(const base& other) const { } std::vector dot::search_space() const { - throw std::runtime_error("not implemented"); + return bsdot_search_space(op_ == FPROP, BS_); } params_t dot::heuristics() const { - throw std::runtime_error("not implemented"); + return bsdot_heuristics(op_ == FPROP, BS_, N_, S_); } base * dot::clone() const { @@ -116,7 +117,8 @@ void dot::triton_c_src(std::ostream &os) const { int32 column = *(header + 2); int32 lockid = *(header + 3); int32 *plut = lut + offset * 2; - for(int32 k = K; k > 0; k = k - 1){ + for(int32 k = K; k > 0; k = k - 1) + { int32 ak = *(plut + 0); int32 bk = *(plut + 1); )" + ab_ty_ + "* pa[" + sizea + R"(] = A + offa + ak * TK * lda; @@ -133,16 +135,19 @@ void dot::triton_c_src(std::ostream &os) const { int1 checkc[TM, TN] = (rxc < N)[:, newaxis]; if(lockid == 0) @checkc *pc = c; - else { + else + { int32 *plock = locks + ridx*nlocks + lockid - 1; int32 *pcount = plock + get_num_program(0)*nlocks; while(__atomic_cas(plock, 0, 1)); int32 count = *pcount; - if(count == 0) + if(count == 0){ @checkc *pc = c; - else + __atomic_exch(pcount, 1); + } + else{ @checkc *pc = c + *pc; - *pcount = 1; + } __atomic_exch(plock, 0); } })"; From f7bd976fc7abb23680abe9c9a675c30136289f0f Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 31 Jul 2019 17:12:36 -0700 Subject: [PATCH 272/494] [dnn/blocksparse] added heuristics for block-sparse dot --- include/triton/dnn/heuristics.h | 28 +++++++++++++++++++++++++++- lib/codegen/tune.cpp | 2 +- lib/dnn/blocksparse/dot.cpp | 4 ++-- lib/runtime/jit.cpp | 2 ++ 4 files changed, 32 insertions(+), 4 deletions(-) diff --git a/include/triton/dnn/heuristics.h b/include/triton/dnn/heuristics.h index d9bd01e75..56c23642b 100644 --- a/include/triton/dnn/heuristics.h +++ b/include/triton/dnn/heuristics.h @@ -127,16 +127,42 @@ inline params_t dot_heuristics(bool AT, bool BT, size_t M, size_t N, size_t K) { /* Block-sparse matrix multiplication */ static const std::map, std::map> bsdot_params = { - /* 32x32 */ + /* FPROP */ {{true, 32}, std::map{ {32, {2, 2, 32, 32, 2, 2, 4, 8, 32, 32, 8, 4, 16}}, {64, {2, 2, 64, 32, 2, 1, 16, 4, 4, 32, 16, 2, 4}}, {128, {2, 2, 128, 32, 4, 1, 32, 4, 4, 32, 8, 4, 16}} }}, + + {{true, 16}, std::map{ + {32, {4, 1, 32, 16, 1, 1, 8, 4, 4, 16, 4, 4, 8}}, + {64, {4, 1, 64, 16, 2, 2, 8, 8, 16, 16, 8, 2, 16}}, + {128, {4, 1, 128, 
16, 4, 1, 16, 8, 8, 16, 8, 2, 16}} + }}, + + {{true, 8}, std::map{ + {32, {4, 1, 32, 8, 1, 1, 4, 8, 8, 8, 4, 2, 8}}, + {64, {4, 1, 64, 8, 1, 1, 8, 8, 4, 8, 4, 2, 8}}, + {128, {4, 1, 128, 8, 1, 1, 4, 8, 8, 8, 4, 2, 8}} + }}, + + /* BPROP */ {{false, 32}, std::map{ {32, {2, 2, 32, 32, 1, 1, 8, 4, 4, 32, 8, 4, 8}}, {64, {2, 2, 64, 32, 2, 1, 16, 4, 4, 32, 16, 4, 8}}, {128, {2, 2, 128, 32, 4, 1, 32, 4, 4, 32, 32, 4, 8}} + }}, + + {{false, 16}, std::map{ + {32, {4, 1, 32, 16, 1, 2, 4, 8, 16, 16, 16, 4, 4}}, + {64, {4, 1, 64, 16, 2, 1, 8, 8, 8, 16, 16, 4, 4}}, + {128, {4, 1, 128, 16, 2, 2, 32, 4, 4, 16, 16, 8, 2}} + }}, + + {{false, 8}, std::map{ + {32, {4, 1, 32, 8, 1, 1, 4, 8, 8, 8, 8, 4, 2}}, + {64, {4, 1, 64, 8, 1, 1, 8, 8, 4, 8, 8, 4, 2}}, + {128, {4, 1, 128, 8, 1, 1, 8, 8, 4, 8, 8, 4, 2}} }} }; diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index a28fd827e..8c351be1c 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -228,7 +228,7 @@ void tune::run(ir::module &mod) { nts->set_value(1); } else { - ir::metaparameter *fpw = ir::metaparameter::create(ctx, ty, 2, 2); + ir::metaparameter *fpw = ir::metaparameter::create(ctx, ty, 1, 4); ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 1, 4); connected_components(node, {fpw, wpt}, {"fpw", "wpt"}, nodes_, dependencies_, group_id++); } diff --git a/lib/dnn/blocksparse/dot.cpp b/lib/dnn/blocksparse/dot.cpp index f38030366..3ea79bc78 100644 --- a/lib/dnn/blocksparse/dot.cpp +++ b/lib/dnn/blocksparse/dot.cpp @@ -92,7 +92,7 @@ void dot::triton_c_src(std::ostream &os) const { std::string ldb1 = (op_ == FPROP) ? "TK" : "1" ; std::string result = R"( - const tunable int32 TM = {32, 64, 128}; + const tunable int32 TM = {16, 32, 64, 128}; const tunable int32 TN = {)" + std::to_string(BS_) + R"(}; const tunable int32 TK = {)" + std::to_string(BS_) + R"(}; @@ -143,11 +143,11 @@ void dot::triton_c_src(std::ostream &os) const { int32 count = *pcount; if(count == 0){ @checkc *pc = c; - __atomic_exch(pcount, 1); } else{ @checkc *pc = c + *pc; } + __atomic_exch(pcount, 1); __atomic_exch(plock, 0); } })"; diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index 12eebdd3c..928ec0812 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -230,6 +230,8 @@ jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t ben parallel_for_each(targets, update_best, nthreads_); } + if(best.params.empty()) + throw std::runtime_error("auto-tuning didn't find valid parameters"); // std::cout << "Autotuning done - Best performance: " << best.perf << std::endl; return best; } From 3b92ddf7e60a0ea55d9dcfc5e3abaeca653aa152 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 31 Jul 2019 18:41:56 -0700 Subject: [PATCH 273/494] [codegen/reassociation] now recursively takes pointer arguments into account as well --- examples/python/tensorflow/blocksparse.cpp | 2 +- include/triton/runtime/jit.h | 1 + lib/codegen/reassociate.cpp | 15 +++++++++++++++ lib/codegen/tune.cpp | 8 ++++---- lib/dnn/blocksparse/dot.cpp | 6 +++--- 5 files changed, 24 insertions(+), 8 deletions(-) diff --git a/examples/python/tensorflow/blocksparse.cpp b/examples/python/tensorflow/blocksparse.cpp index 38b335689..0d37d382d 100644 --- a/examples/python/tensorflow/blocksparse.cpp +++ b/examples/python/tensorflow/blocksparse.cpp @@ -130,7 +130,7 @@ public: // create profile triton::dnn::blocksparse::dot dot(N, params_.K, params_.segments, params_.C, "fp16", params_.bsize, params_.locks, params_.blocks, OP); // blocksparse matmul - triton::dnn::base* op = 
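// NOTE (editor's annotation, not from the original commit): enqueue()
// returns the op instance that actually ran, a cached clone of `dot` rather
// than `dot` itself, so the get_locks() call just below can retrieve the
// lock buffer that the clone allocated lazily inside its init_impl.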
dot.enqueue(stream, {&da, &db, &dc, &dlut}, triton::dnn::NO_TUNING); + triton::dnn::base* op = dot.enqueue(stream, {&da, &db, &dc, &dlut}, triton::dnn::FULL_TUNING); triton::driver::buffer* locks_buffer = ((triton::dnn::blocksparse::dot*)op)->get_locks(); Tensor *tmp = nullptr; TensorShape tmp_shapes; diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index de84d1788..19fde0e84 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -77,6 +77,7 @@ public: void target_dependent(ir::module &module) { alignment_info.run(module); +// ir::print(module, std::cout); reassociate.run(module); if(target_->is_gpu()){ shmem_info.run(module); diff --git a/lib/codegen/reassociate.cpp b/lib/codegen/reassociate.cpp index bf36b2033..d0a54ec31 100644 --- a/lib/codegen/reassociate.cpp +++ b/lib/codegen/reassociate.cpp @@ -215,6 +215,21 @@ void reassociate::run(ir::module &mod) { infos[sta_ptr].dyn_ptr = (ir::getelementptr_inst*)dyn_ptr; infos[sta_ptr].sta_ptr = (ir::getelementptr_inst*)sta_ptr; } + // reassociate pointer argument + if(ir::getelementptr_inst* gepy = dynamic_cast(py)) + if(infos.find(gepy) != infos.end()){ + builder.set_insert_point(pz); + ir::getelementptr_inst *sta = infos[gepy].sta_ptr; + ir::getelementptr_inst *dyn = infos[gepy].dyn_ptr; + ir::value *cst = *sta->idx_begin(); + ir::value *off = *pz->idx_begin(); + ir::value *new_dyn = builder.create_gep(dyn, {off}); + ir::value *new_pz = builder.create_gep(new_dyn, {cst}, pz->get_name()); + params_->copy(new_dyn, pz); + params_->copy(new_pz, pz); + align_->copy(new_pz, pz); + pz->replace_all_uses_with(new_pz); + } // reassociate phi-node pointer if(ir::phi_node* phi = dynamic_cast(py)){ // only optimize the case where py = phi pa, pz for now diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 8c351be1c..820db29b3 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -228,7 +228,7 @@ void tune::run(ir::module &mod) { nts->set_value(1); } else { - ir::metaparameter *fpw = ir::metaparameter::create(ctx, ty, 1, 4); + ir::metaparameter *fpw = ir::metaparameter::create(ctx, ty, 2, 2); ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 1, 4); connected_components(node, {fpw, wpt}, {"fpw", "wpt"}, nodes_, dependencies_, group_id++); } @@ -247,14 +247,14 @@ void tune::run(ir::module &mod) { size_t addr_space = ptr_ty->get_pointer_address_space(); if(addr_space < 4){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 2, 8)); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 4, 8)); *params_.at(i).at("nts.d0") = *tmp; } } if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 2, 8)); - std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 2, 8)); + std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 4, 8)); + std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 4, 8)); *params_.at(i).at("nts.d0") = *tmp1; *params_.at(i).at("nts.d1") = *tmp2; } diff --git a/lib/dnn/blocksparse/dot.cpp b/lib/dnn/blocksparse/dot.cpp index 3ea79bc78..c7e3a9a85 100644 --- a/lib/dnn/blocksparse/dot.cpp +++ b/lib/dnn/blocksparse/dot.cpp @@ -88,8 +88,8 @@ void dot::triton_c_src(std::ostream &os) const { std::string bca1 = "newaxis, :"; std::string bcb0 = (op_ == FPROP) ? ":, newaxis" : "newaxis, :"; std::string bcb1 = (op_ == FPROP) ? "newaxis, :" : ":, newaxis"; - std::string ldb0 = (op_ == FPROP) ? 
"1" : "TK"; - std::string ldb1 = (op_ == FPROP) ? "TK" : "1" ; + std::string ldb0 = (op_ == FPROP) ? "" : "*TK"; + std::string ldb1 = (op_ == FPROP) ? "*TK" : "" ; std::string result = R"( const tunable int32 TM = {16, 32, 64, 128}; @@ -110,7 +110,7 @@ void dot::triton_c_src(std::ostream &os) const { int32 rkb[TK] = 0 ... TK; int1 checka[TM, TK] = (rxa < N)[:, newaxis]; int32 offa[)" + sizea + "] = rxa[" + bca0 + "] + rka[" + bca1 + R"(]*lda; - int32 offb[)" + sizeb + "] = ryb[" + bcb0 + "]*" + ldb0 + " + rkb[" + bcb1 + "]*" + ldb1 + R"(; + int32 offb[)" + sizeb + "] = ryb[" + bcb0 + "]" + ldb0 + " + rkb[" + bcb1 + "]" + ldb1 + R"(; int32 *header = lut + ridy * 4; int32 offset = *(header + 0); int32 K = *(header + 1); From d9945692a9d61561411a120792448f8e80c7d8a3 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 2 Aug 2019 17:42:48 -0700 Subject: [PATCH 274/494] [dnn] better specification of recompilation key --- examples/cpp/conv.cpp | 4 +- examples/cpp/dot.cpp | 4 +- examples/cpp/shift.cpp | 2 +- examples/python/pytorch/batchnorm.cpp | 4 +- examples/python/pytorch/conv.cpp | 2 +- examples/python/pytorch/shift.cpp | 6 +- examples/python/tensorflow/batchnorm.cpp | 4 +- examples/python/tensorflow/blocksparse.cpp | 4 +- examples/python/tensorflow/conv.cpp | 2 +- examples/python/tensorflow/dot.cpp | 2 +- examples/python/tensorflow/run.py | 6 +- examples/python/tensorflow/shift.cpp | 2 +- include/triton/codegen/selection.h | 1 + include/triton/dnn/base.h | 20 +- include/triton/dnn/batchnorm.h | 12 +- include/triton/dnn/blocksparse/dot.h | 4 +- include/triton/dnn/conv.h | 6 +- include/triton/dnn/dot.h | 7 +- include/triton/dnn/shift.h | 6 +- include/triton/lang/scanner.l | 25 ++- include/triton/runtime/jit.h | 4 +- include/triton/tools/bench.hpp | 2 +- lib/codegen/selection.cpp | 30 +-- lib/codegen/tune.cpp | 20 +- lib/dnn/base.cpp | 3 +- lib/dnn/batchnorm.cpp | 137 +++++++------- lib/dnn/blocksparse/dot.cpp | 68 ++++--- lib/dnn/conv.cpp | 208 ++++++++++----------- lib/dnn/dot.cpp | 69 ++++--- lib/dnn/shift.cpp | 169 ++++++++--------- lib/runtime/jit.cpp | 13 +- 31 files changed, 418 insertions(+), 428 deletions(-) diff --git a/examples/cpp/conv.cpp b/examples/cpp/conv.cpp index 2d6d7a845..dbe0591f0 100644 --- a/examples/cpp/conv.cpp +++ b/examples/cpp/conv.cpp @@ -18,12 +18,12 @@ int main() { int32_t pad_d = 0, pad_h = 0, pad_w = 0; int32_t stride_d = 1, stride_h = 1, stride_w = 1; int32_t upsample_d = 1, upsample_h = 1, upsample_w = 1; -// triton::dnn::conv configuration(128, 256, 1, 14, 14, 1, 5, 5, 512, 1, 1, 1, 0, 0, 0, 1, 1, 1, "fp32", "fp32", triton::dnn::conv::FPROP, 0); +// triton::dnn::conv configuration(128, 256, 1, 14, 14, 1, 5, 5, 512, 1, 1, 1, 0, 0, 0, 1, 1, 1, "float", "float", triton::dnn::conv::FPROP, 0); triton::dnn::conv configuration(B, NC, D, H, W, T, R, S, NF, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, upsample_d, upsample_h, upsample_w, - "fp32", "fp32", ty, 0); + "float", "float", ty, 0); // convolution configuration std::vector hc(configuration.c_size()); std::vector rc(configuration.c_size()); diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 771e44c1f..591237fbe 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -26,7 +26,7 @@ struct perf_t { perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K){ typedef float NumericT; - std::string ty = "fp16"; + std::string ty = "half"; size_t dt_nbytes = sizeof(NumericT); triton::driver::context* context = stream->context(); std::vector hc(M*N); 
@@ -46,7 +46,7 @@ perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int stream->write(db, true, 0, hb); stream->write(dc, true, 0, hc); stream->synchronize(); - triton::dnn::dot dot(M, N, K, AT, BT, ty, ty, 8, 8); + triton::dnn::dot dot(M, N, K, AT, BT, ty, ty, 8, 8, 8); // benchmark triton double triton_ns = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::PARTIAL_TUNING);}, stream); // benchmark cublas diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp index 38e0e37bf..1495de3c4 100644 --- a/examples/cpp/shift.cpp +++ b/examples/cpp/shift.cpp @@ -134,7 +134,7 @@ int main() { }; for(config_t c: resnet18){ for(op_t op: {op_t::FPROP, op_t::BPROP, op_t::WGRAD}){ - configs.push_back({c.B, c.C, c.H, c.W, c.R, c.S, c.F, c.stride_h, c.stride_w, op, layout_t::CHWN, "fp16"}); + configs.push_back({c.B, c.C, c.H, c.W, c.R, c.S, c.F, c.stride_h, c.stride_w, op, layout_t::CHWN, "half"}); } } diff --git a/examples/python/pytorch/batchnorm.cpp b/examples/python/pytorch/batchnorm.cpp index 521137a9e..64559e197 100644 --- a/examples/python/pytorch/batchnorm.cpp +++ b/examples/python/pytorch/batchnorm.cpp @@ -37,7 +37,7 @@ std::vector triton::driver::cu_buffer m(ctx, (CUdeviceptr)fw_m.storage().data(), false); triton::driver::cu_buffer v(ctx, (CUdeviceptr)fw_v.storage().data(), false); // create template - triton::dnn::batchnorm_forward batchnorm(C, 1, H, W, B, "fp32"); + triton::dnn::batchnorm_forward batchnorm(C, 1, H, W, B, "float"); batchnorm.enqueue(&stream, {&y, &m, &v, &x, &g, &b}); stream.synchronize(); return {fw_y, fw_m, fw_v}; @@ -79,7 +79,7 @@ std::vector triton::driver::cu_buffer dg(ctx, (CUdeviceptr)fw_dg.storage().data(), false); triton::driver::cu_buffer db(ctx, (CUdeviceptr)fw_db.storage().data(), false); // create config - triton::dnn::batchnorm_backward batchnorm(C, 1, H, W, B, "fp32", eps); + triton::dnn::batchnorm_backward batchnorm(C, 1, H, W, B, "float", eps); batchnorm.enqueue(&stream, {&dx, &dg, &db, &dy, &x, &g, &m, &v}); stream.synchronize(); return {fw_dx, fw_dg, fw_db}; diff --git a/examples/python/pytorch/conv.cpp b/examples/python/pytorch/conv.cpp index eab6ba9e7..91cef5441 100644 --- a/examples/python/pytorch/conv.cpp +++ b/examples/python/pytorch/conv.cpp @@ -30,7 +30,7 @@ torch::Tensor conv_common( stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, 1, 1, 1, - "fp32", "fp32", ty, has_bias); + "float", "float", ty, has_bias); // Bind memory triton::driver::cu_buffer a(ctx, (CUdeviceptr)torcha.storage().data(), false); triton::driver::cu_buffer b(ctx, (CUdeviceptr)torchb.storage().data(), false); diff --git a/examples/python/pytorch/shift.cpp b/examples/python/pytorch/shift.cpp index 7c86b227e..bd80d73d9 100644 --- a/examples/python/pytorch/shift.cpp +++ b/examples/python/pytorch/shift.cpp @@ -49,9 +49,9 @@ torch::Tensor shift_common( std::string dtype; at::ScalarType type = torcha.scalar_type(); switch(type){ - case at::ScalarType::Double: dtype = "fp64"; break; - case at::ScalarType::Float: dtype = "fp32"; break; - case at::ScalarType::Half: dtype = "fp16"; break; + case at::ScalarType::Double: dtype = "double"; break; + case at::ScalarType::Float: dtype = "float"; break; + case at::ScalarType::Half: dtype = "half"; break; default: AT_ERROR("unknown data-type for shift-conv"); } // Get configuration diff --git a/examples/python/tensorflow/batchnorm.cpp b/examples/python/tensorflow/batchnorm.cpp index 137a84809..956ecef24 100644 --- a/examples/python/tensorflow/batchnorm.cpp +++ 
b/examples/python/tensorflow/batchnorm.cpp @@ -58,7 +58,7 @@ public: triton::driver::cu_buffer m(ctx, fw_m->tensor_data().size(), (CUdeviceptr)fw_m->tensor_data().data(), false); triton::driver::cu_buffer v(ctx, fw_v->tensor_data().size(), (CUdeviceptr)fw_v->tensor_data().data(), false); // create config - triton::dnn::batchnorm_forward batchnorm(C, 1, H, W, B, "fp32"); + triton::dnn::batchnorm_forward batchnorm(C, 1, H, W, B, "float", triton::dnn::FULL_TUNING); batchnorm.enqueue(stream, {&y, &m, &v, &x, &g, &b}); } @@ -126,7 +126,7 @@ public: triton::driver::cu_buffer dg(ctx, fw_dg->tensor_data().size(), (CUdeviceptr)fw_dg->tensor_data().data(), false); triton::driver::cu_buffer db(ctx, fw_db->tensor_data().size(), (CUdeviceptr)fw_db->tensor_data().data(), false); // create config - triton::dnn::batchnorm_backward batchnorm(C, 1, H, W, B, "fp32"); + triton::dnn::batchnorm_backward batchnorm(C, 1, H, W, B, "float", triton::dnn::FULL_TUNING); batchnorm.enqueue(stream, {&dx, &dg, &db, &dy, &x, &g, &m, &v}); } diff --git a/examples/python/tensorflow/blocksparse.cpp b/examples/python/tensorflow/blocksparse.cpp index 0d37d382d..3a6a2505c 100644 --- a/examples/python/tensorflow/blocksparse.cpp +++ b/examples/python/tensorflow/blocksparse.cpp @@ -128,9 +128,9 @@ public: triton::driver::cu_buffer dc(ctx, c->tensor_data().size(), (CUdeviceptr)c->tensor_data().data(), false); triton::driver::cu_buffer dlut(ctx, lut.tensor_data().size(), (CUdeviceptr)lut.tensor_data().data(), false); // create profile - triton::dnn::blocksparse::dot dot(N, params_.K, params_.segments, params_.C, "fp16", params_.bsize, params_.locks, params_.blocks, OP); + triton::dnn::blocksparse::dot dot(N, params_.K, params_.segments, params_.C, "half", params_.bsize, params_.locks, params_.blocks, OP); // blocksparse matmul - triton::dnn::base* op = dot.enqueue(stream, {&da, &db, &dc, &dlut}, triton::dnn::FULL_TUNING); + triton::dnn::base* op = dot.enqueue(stream, {&da, &db, &dc, &dlut}, triton::dnn::NO_TUNING); triton::driver::buffer* locks_buffer = ((triton::dnn::blocksparse::dot*)op)->get_locks(); Tensor *tmp = nullptr; TensorShape tmp_shapes; diff --git a/examples/python/tensorflow/conv.cpp b/examples/python/tensorflow/conv.cpp index f06bf679c..00bf05473 100644 --- a/examples/python/tensorflow/conv.cpp +++ b/examples/python/tensorflow/conv.cpp @@ -61,7 +61,7 @@ public: stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, 1, 1, 1, - "fp16", "fp16", + "half", "half", triton::dnn::conv::FPROP, has_bias); // allocate output auto c_shapes = conv.c_shapes(); diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index 7acedb7e9..553ad11fa 100644 --- a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -49,7 +49,7 @@ class DotOp : public OpKernel { triton::driver::cu_buffer db(ctx, b.tensor_data().size(), (CUdeviceptr)b.tensor_data().data(), false); triton::driver::cu_buffer dc(ctx, c->tensor_data().size(), (CUdeviceptr)c->tensor_data().data(), false); // template - triton::dnn::dot dot(M, N, K, false, false, "fp16", "fp16", 8, 8); + triton::dnn::dot dot(M, N, K, false, false, "half", "half", 8, 8, 8); dot.enqueue(stream, {&da, &db, &dc}); } diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 88fe7ef3d..8dbc6ac55 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -105,7 +105,7 @@ def batch_norm_grad(op, dy, mean, var): def run_batchnorm(): - C, H, W, B = 32, 14, 14, 64 + C, H, W, B = 8, 4, 4, 32 
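# Editorial note (not part of the original patch): the batchnorm test sizes are
# reduced here, presumably to keep the numerical check against the TensorFlow
# reference fast while this path is being exercised; the script entry point is
# also flipped from run_dot() to run_batchnorm() later in this diff.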
np.random.seed(0) # Placeholders x = tf.placeholder(tf.float32, shape=[C, H, W, B]) @@ -131,6 +131,6 @@ def run_batchnorm(): print(np.max(np.abs(dg_t - dg_n))) print(np.max(np.abs(db_t - db_n))) -run_dot() +#run_dot() #run_shift() -#run_batchnorm() +run_batchnorm() diff --git a/examples/python/tensorflow/shift.cpp b/examples/python/tensorflow/shift.cpp index 28e10b679..cb28ce281 100644 --- a/examples/python/tensorflow/shift.cpp +++ b/examples/python/tensorflow/shift.cpp @@ -106,7 +106,7 @@ public: triton::dnn::shift shift(B, C, D, H, W, T, R_, S_, F, stride_h_, stride_w_, shift_h_data, shift_w_data, - "fp16", "fp16", OP, has_bias, layout_); + "half", "half", OP, has_bias, layout_); // shapes for c std::vector c_shapes; diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index e1d2dbf0b..317fc7f2b 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -91,6 +91,7 @@ public: void set_value(indices_t idx, llvm::Value *v); llvm::Value* get_value(indices_t idx); unsigned get_linear_index(indices_t idx); + indices_t get_ordered_indices(unsigned id); void for_each(std::function fn); const distributed_axis &axis(unsigned dim) { return axes_.at(dim); } diff --git a/include/triton/dnn/base.h b/include/triton/dnn/base.h index b9e2b886b..b991c3726 100644 --- a/include/triton/dnn/base.h +++ b/include/triton/dnn/base.h @@ -52,12 +52,15 @@ struct launch_context_t{ typedef std::vector params_t; class base { - friend class cmp_recompile; + friend class recompile_hash; + friend class recompile_equal; protected: // leading dimensions static void set_ld(const std::vector& shapes, std::vector& ld); + // list of retuning parameters + virtual std::vector retune_params() const = 0; private: // initialize @@ -70,8 +73,6 @@ private: triton::runtime::launch_information info) = 0; // number of flops virtual size_t num_flops() const = 0; - // comparison for maps - virtual bool operator<(const base& other) const = 0; // default parameters virtual std::vector search_space() const; virtual params_t heuristics() const; @@ -94,12 +95,21 @@ private: std::string name_; }; -struct cmp_recompile{ + +struct recompile_equal{ bool operator()(base* x, base* y) const{ - return *x < *y; + return typeid(*x) == typeid(*y) && + x->retune_params() == y->retune_params(); } }; +struct recompile_hash{ + unsigned operator()(base* x) const{ + return x->retune_params()[0]; + } +}; + + } } diff --git a/include/triton/dnn/batchnorm.h b/include/triton/dnn/batchnorm.h index 32c006b99..204ab631b 100644 --- a/include/triton/dnn/batchnorm.h +++ b/include/triton/dnn/batchnorm.h @@ -47,15 +47,15 @@ private: triton::runtime::launch_information info); // number of flops size_t num_flops() const; - // comparison for maps - bool operator<(const base& other) const; + // retuning parameters + std::vector retune_params() const; // clone base* clone() const; public: // constructor batchnorm_forward(int C, int D, int H, int W, int B, - std::string ty = "fp32", float eps = 1e-5); + std::string ty = "float", float eps = 1e-5); // triton-c source void triton_c_src(std::ostream &os) const; @@ -82,15 +82,15 @@ private: runtime::launch_information info); // number of flops size_t num_flops() const; - // comparison for maps - bool operator<(const base& other) const; + // retuning parameters + std::vector retune_params() const; // clone base* clone() const; public: // constructor batchnorm_backward(int C, int D, int H, int W, int B, - std::string ty = "fp32", float eps = 1e-5); + std::string ty = 
"float", float eps = 1e-5); // triton-c source void triton_c_src(std::ostream &os) const; diff --git a/include/triton/dnn/blocksparse/dot.h b/include/triton/dnn/blocksparse/dot.h index 98a1ce6fa..488c26c31 100644 --- a/include/triton/dnn/blocksparse/dot.h +++ b/include/triton/dnn/blocksparse/dot.h @@ -20,8 +20,8 @@ private: triton::runtime::launch_information info); // number of flops size_t num_flops() const; - // comparison for maps - bool operator<(const base& other) const; + // retuning parameters + std::vector retune_params() const; // default parameters std::vector search_space() const; params_t heuristics() const; diff --git a/include/triton/dnn/conv.h b/include/triton/dnn/conv.h index 2745d72bc..5a167531d 100644 --- a/include/triton/dnn/conv.h +++ b/include/triton/dnn/conv.h @@ -37,8 +37,8 @@ private: triton::runtime::launch_information info); // number of flops size_t num_flops() const; - // comparison for maps - bool operator<(const base& other) const; + // retuning parameters + std::vector retune_params() const; // clone base* clone() const; @@ -50,7 +50,7 @@ public: int stride_d, int stride_h, int stride_w, int pad_d, int pad_h, int pad_w, int upsample_d, int upsample_h, int upsample_w, - std::string a_ty = "fp32", std::string b_ty = "fp32", + std::string a_ty = "float", std::string b_ty = "float", type ty = FPROP, bool bias = false); // accessors diff --git a/include/triton/dnn/dot.h b/include/triton/dnn/dot.h index 30836357f..c655d12b5 100644 --- a/include/triton/dnn/dot.h +++ b/include/triton/dnn/dot.h @@ -16,8 +16,8 @@ private: void enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, triton::runtime::launch_information info); - // comparison for maps - bool operator<(const base& other) const; + // retuning parameters + std::vector retune_params() const; // default parameters virtual std::vector search_space() const; virtual params_t heuristics() const; @@ -25,7 +25,7 @@ private: public: dot(int M, int N, int K, bool AT, bool BT, std::string a_ty, std::string b_ty, - unsigned alignment_lda, unsigned alignment_ldb); + unsigned align_lda, unsigned align_ldb, unsigned align_ldc); // number of flops size_t num_flops() const; @@ -70,6 +70,7 @@ private: std::string b_ty_; unsigned align_lda_; unsigned align_ldb_; + unsigned align_ldc_; driver::buffer *locks_; }; diff --git a/include/triton/dnn/shift.h b/include/triton/dnn/shift.h index 35ad312e0..4590c476e 100644 --- a/include/triton/dnn/shift.h +++ b/include/triton/dnn/shift.h @@ -64,7 +64,7 @@ public: int T, int R, int S, int NF, int stride_h, int stride_w, const int32_t* shift_h, const int32_t* shift_w, - std::string a_ty = "fp32", std::string b_ty = "fp32", + std::string a_ty = "float", std::string b_ty = "float", op_t ty = FPROP, bool bias = false, layout_t layout = CHWN); // look-up table @@ -86,8 +86,8 @@ public: size_t num_flops() const; // source void triton_c_src(std::ostream &os) const; - // comparison - bool operator<(const base& other) const; + // retuning parameters + std::vector retune_params() const; // clone base* clone() const; // cpu reference diff --git a/include/triton/lang/scanner.l b/include/triton/lang/scanner.l index af691349d..fc791ae94 100644 --- a/include/triton/lang/scanner.l +++ b/include/triton/lang/scanner.l @@ -30,19 +30,18 @@ using triton::lang::return_void; "for" { return return_impl(FOR, yytext); } "while" { return return_impl(WHILE, yytext); } "void" { return return_impl(VOID, yytext); } -"uint1" { return return_impl(UINT1, yytext); } -"uint8" { return 
return_impl(UINT8, yytext); } -"uint16" { return return_impl(UINT16, yytext); } -"uint32" { return return_impl(UINT32, yytext); } -"uint64" { return return_impl(UINT64, yytext); } -"int1" { return return_impl(INT1, yytext); } -"int8" { return return_impl(INT8, yytext); } -"int16" { return return_impl(INT16, yytext); } -"int32" { return return_impl(INT32, yytext); } -"int64" { return return_impl(INT64, yytext); } -"fp16" { return return_impl(FP16, yytext); } -"fp32" { return return_impl(FP32, yytext); } -"fp64" { return return_impl(FP64, yytext); } +"uchar" { return return_impl(UCHAR, yytext); } +"ushort" { return return_impl(USHORT, yytext); } +"uint" { return return_impl(UINT, yytext); } +"ulong" { return return_impl(ULONG, yytext); } +"bool" { return return_impl(BOOL, yytext); } +"char" { return return_impl(CHAR, yytext); } +"short" { return return_impl(SHORT, yytext); } +"int" { return return_impl(INT, yytext); } +"long" { return return_impl(LONG, yytext); } +"half" { return return_impl(HALF, yytext); } +"float" { return return_impl(FLOAT, yytext); } +"double" { return return_impl(DOUBLE, yytext); } "..." { return return_impl(ELLIPSIS, yytext); } "get_range_id" { return return_impl(GET_RANGE_ID, yytext); } "get_num_program" { return return_impl(GET_NUM_PROGRAM, yytext); } diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index 19fde0e84..939aebbfe 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -78,7 +78,7 @@ public: void target_dependent(ir::module &module) { alignment_info.run(module); // ir::print(module, std::cout); - reassociate.run(module); +// reassociate.run(module); if(target_->is_gpu()){ shmem_info.run(module); shmem_liveness.run(module); @@ -86,7 +86,7 @@ public: shmem_barriers.run(module); } vectorize.run(module); - optimize_dce.run(module); +// optimize_dce.run(module); // ir::print(module, std::cout); } diff --git a/include/triton/tools/bench.hpp b/include/triton/tools/bench.hpp index 6d71d27ae..74053b717 100644 --- a/include/triton/tools/bench.hpp +++ b/include/triton/tools/bench.hpp @@ -37,7 +37,7 @@ inline double bench(std::function const & op, driver::stream * stream) stream->synchronize(); while(total_time*1e-9 < 1e-3){ float norm = 1; - // normalize clock if possible to get roughly constant result + // normalize clock if possible to reduce noise in auto-tuning // if(auto cu_device = dynamic_cast(device)) // norm = (float)cu_device->current_sm_clock()/cu_device->max_sm_clock(); tmr.start(); diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index a57713f38..84326529d 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -74,6 +74,11 @@ unsigned distributed_tile::get_linear_index(indices_t idx) { return indices_[idx]; } +indices_t distributed_tile::get_ordered_indices(unsigned id) { + return ordered_indices_.at(id); +} + + void distributed_tile::for_each(std::function fn) { for(unsigned i = 0; i < ordered_indices_.size(); i++) if(i % vector_size_ == 0) @@ -779,13 +784,21 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & // store if(auto *x = dynamic_cast(ins)){ distributed_tile* ptrs = (distributed_tile*)tmap_.at(x->get_pointer_operand()); - tile *scalars = tmap_.at(x->get_value_operand()); + distributed_tile* scalars = (distributed_tile*)tmap_.at(x->get_value_operand()); ir::value *mask = x->get_mask_operand(); distributed_tile* preds = (distributed_tile*)tmap_.at(mask); ptrs->for_each([&](indices_t idx){ Value *scalar = scalars->get_value(idx); 
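// Editorial note (not part of the original patch): the block moved up by this
// hunk lowers a masked tile store to explicit control flow -- a conditional
// branch that skips a plain scalar store when the predicate is false. The
// emitted CFG, sketched as pseudo LLVM IR:
//
//   br i1 %pred, label %mask_then, label %mask_done
//   mask_then:  store %scalar, %ptr ; br label %mask_done
//   mask_done:  ...
//
// The commented-out block retained below shows the alternative of predicating
// the store with inline PTX ("@$0 st.global.b32 [$1], $2;") rather than
// branching.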
Value *ptr = ptrs->get_value(idx); Value *pred = preds->get_value(idx); + BasicBlock *mask_then_bb = BasicBlock::Create(ctx, "mask_then", fn); + BasicBlock *mask_done_bb = BasicBlock::Create(ctx, "mask_done", fn); + builder.CreateCondBr(pred, mask_then_bb, mask_done_bb); + builder.SetInsertPoint(mask_then_bb); + builder.CreateStore(scalar, ptr); + builder.CreateBr(mask_done_bb); + builder.SetInsertPoint(mask_done_bb); + // std::string offset = ""; // if(GetElementPtrInst *gep = dyn_cast(ptr)) // if(gep->getNumIndices() == 1) @@ -796,14 +809,6 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & // std::string asm_str = "@$0 st.global.b32 [$1" + offset + "], $2;"; // InlineAsm *iasm = InlineAsm::get(ty, asm_str, "b,l,f", true); // builder.CreateCall(iasm, {pred, ptr, scalar}); - - BasicBlock *mask_then_bb = BasicBlock::Create(ctx, "mask_then", fn); - BasicBlock *mask_done_bb = BasicBlock::Create(ctx, "mask_done", fn); - builder.CreateCondBr(pred, mask_then_bb, mask_done_bb); - builder.SetInsertPoint(mask_then_bb); - builder.CreateStore(scalar, ptr); - builder.CreateBr(mask_done_bb); - builder.SetInsertPoint(mask_done_bb); }); } else if(auto *x = dynamic_cast(ins)) { @@ -893,11 +898,8 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & ir::value* in = ins->get_operand(0); distributed_tile *in_tile = (distributed_tile*)tmap_.at(in); result->for_each([&](indices_t out_idx){ - indices_t in_idx; - for(size_t k = 0; k < shapes.size(); k++){ - if(shapes[k]->get_value() > 1) - in_idx.push_back(out_idx[k]); - } + unsigned pos = result->get_linear_index(out_idx); + indices_t in_idx = in_tile->get_ordered_indices(pos); result->set_value(out_idx, in_tile->get_value(in_idx)); }); } diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 820db29b3..9cdf2767d 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -63,14 +63,19 @@ void tune::init_c_graph(ir::instruction *v) { else shapes = v->get_type()->get_tile_shapes(); // Reshape - if(dynamic_cast(v)){ + if(dynamic_cast(v)) { ir::value *op = v->get_operand(0); unsigned current = 0; + bool is_skewed = false; for(unsigned i = 0; i < shapes.size(); i ++){ - if(shapes[i] == one) + bool is_one = shapes[i] == one; + bool is_same = shapes[i] == op->get_type()->get_tile_shapes()[current]; + if(is_one) static_params_.insert({{v, i}, 1}); - else + else if(!is_skewed && is_same) add_constraint({v, i}, {op, current++}); + else + is_skewed = true; } } // Splat @@ -81,9 +86,8 @@ void tune::init_c_graph(ir::instruction *v) { else if(dynamic_cast(v)){ ir::value *op = v->get_operand(0); size_t n_shapes = shapes.size(); - for(unsigned i = 0; i < n_shapes; i++){ + for(unsigned i = 0; i < n_shapes; i++) add_constraint({v, (i + 1) % n_shapes}, {op, i}); - } } // Broadcast else if(dynamic_cast(v)){ @@ -247,14 +251,14 @@ void tune::run(ir::module &mod) { size_t addr_space = ptr_ty->get_pointer_address_space(); if(addr_space < 4){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 4, 8)); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 1, 8)); *params_.at(i).at("nts.d0") = *tmp; } } if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 4, 8)); - std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 4, 8)); + std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 1, 8)); + std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 1, 
8)); *params_.at(i).at("nts.d0") = *tmp1; *params_.at(i).at("nts.d1") = *tmp2; } diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index 033e2497c..1c1ee8ceb 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -1,4 +1,5 @@ #include +#include #include "triton/dnn/base.h" #include "triton/runtime/jit.h" #include "triton/tools/bench.hpp" @@ -31,7 +32,7 @@ params_t base::heuristics() const { } std::pair base::get_profile_impl(driver::stream *stream, std::vector args, autotuning_t autotune) { - static std::map, cmp_recompile> m_jit; + static std::unordered_map, recompile_hash, recompile_equal> m_jit; driver::context* ctx = stream->context(); rt::jit* jit; /* the current template has not already been compiled */ diff --git a/lib/dnn/batchnorm.cpp b/lib/dnn/batchnorm.cpp index 34275a931..dcc9d6a4e 100644 --- a/lib/dnn/batchnorm.cpp +++ b/lib/dnn/batchnorm.cpp @@ -30,7 +30,7 @@ namespace dnn{ * --------------- */ batchnorm_forward::batchnorm_forward(int C, int D, int H, int W, int B, std::string ty, float eps) - : base("batchnorm"), + : base("batchnorm_forward"), C_(C), D_(D), H_(H), W_(W), B_(B), ty_(ty), eps_(eps) { DHWB_ = D_*H_*W_*B_; rcpDHWB_ = (float)1 / DHWB_; @@ -40,12 +40,9 @@ size_t batchnorm_forward::num_flops() const { return C_*DHWB_; } -bool batchnorm_forward::operator <(const base& other) const { - auto *y = dynamic_cast(&other); - if(!y) - return true; - return std::tie(C_, D_, H_, W_, B_, ty_) - < std::tie(y->C_, y->D_, y->H_, y->W_, y->B_, y->ty_); + +std::vector batchnorm_forward::retune_params() const { + return {C_, D_, H_, W_, B_}; } base* batchnorm_forward::clone() const { @@ -74,50 +71,50 @@ void batchnorm_forward::enqueue_impl(driver::stream *stream, driver::kernel *ker void batchnorm_forward::triton_c_src(std::ostream &os) const { os << R"( -const tunable int32 TM = {32, 64, 128}; +const tunable int TM = {32, 64, 128}; -void batchnorm(fp32 *Y, fp32 *M, fp32 *V, - restrict read_only fp32 *X, - restrict read_only fp32 *G, - restrict read_only fp32 *B, - int32 DHWN, - fp32 rcpDHWN, fp32 eps) { - int32 rx[TM] = 0 ... TM; - fp32 *px[TM]; - fp32 x[TM]; - int32 c = get_range_id(1); - fp32 g = *(G + c); - fp32 b = *(B + c); +void batchnorm_forward(float *Y, float *M, float *V, + restrict read_only float *X, + restrict read_only float *G, + restrict read_only float *B, + int DHWN, + float rcpDHWN, float eps) { + int rx[TM] = 0 ... 
TM; + float *px[TM]; + float x[TM] = 0; + int c = get_range_id(1); + float g = *(G + c); + float b = *(B + c); - fp32 mean[TM] = 0; + float mean[TM] = 0; px = X + rx + c*DHWN; - for(int32 i = 0; i < DHWN; i = i + TM){ + for(int i = 0; i < DHWN; i = i + TM){ x = *px; mean = mean + x; px = px + TM; } - fp32 *pm = M + c; - fp32 m = __sum(mean) * rcpDHWN; + float *pm = M + c; + float m = __sum(mean) * rcpDHWN; *pm = m; - fp32 var[TM] = 0; + float var[TM] = 0; px = X + rx + c*DHWN; - for(int32 i = 0; i < DHWN; i = i + TM){ + for(int i = 0; i < DHWN; i = i + TM){ x = *px; x = x - m; var = var + x*x; px = px + TM; } - fp32 v = __sum(var) * rcpDHWN; - fp32 *pv = V + c; + float v = __sum(var) * rcpDHWN; + float *pv = V + c; *pv = v; - fp32 rstdg = 1 / sqrt(v + eps) * g; + float rstdg = 1 / sqrt(v + eps) * g; px = X + rx + c*DHWN; - fp32* py[TM] = Y + rx + c*DHWN; - for(int32 i = 0; i < DHWN; i = i + TM){ + float* py[TM] = Y + rx + c*DHWN; + for(int i = 0; i < DHWN; i = i + TM){ x = *px; - fp32 y[TM] = (x - m)*rstdg + b; + float y[TM] = (x - m)*rstdg + b; *py = y; px = px + TM; py = py + TM; @@ -130,7 +127,7 @@ void batchnorm(fp32 *Y, fp32 *M, fp32 *V, * --------------- */ batchnorm_backward::batchnorm_backward(int C, int D, int H, int W, int B, std::string ty, float eps) - : base("batchnorm"), + : base("batchnorm_backward"), C_(C), D_(D), H_(H), W_(W), B_(B), ty_(ty), eps_(eps) { } @@ -139,12 +136,8 @@ size_t batchnorm_backward::num_flops() const { return C_*D_*H_*W_*B_; } -bool batchnorm_backward::operator <(const base& other) const { - auto *y = dynamic_cast(&other); - if(!y) - return true; - return std::tie(C_, D_, H_, W_, B_, ty_) - < std::tie(y->C_, y->D_, y->H_, y->W_, y->B_, y->ty_); +std::vector batchnorm_backward::retune_params() const { + return {C_, D_, H_, W_, B_}; } base* batchnorm_backward::clone() const { @@ -174,54 +167,54 @@ void batchnorm_backward::enqueue_impl(driver::stream *stream, driver::kernel *ke void batchnorm_backward::triton_c_src(std::ostream &os) const { os << R"( -const tunable int32 TM = {32, 64, 128}; +const tunable int TM = {32, 64, 128}; -void batchnorm(fp32 *DX, fp32 *DG, fp32 *DB, - restrict read_only fp32 *DY, - restrict read_only fp32 *X, - restrict read_only fp32 *G, - restrict read_only fp32 *M, - restrict read_only fp32 *V, - int32 DHWN, fp32 rcpDHWN, fp32 epsilon) { - int32 rx[TM] = 0 ... TM; - int32 c = get_range_id(1); - int32 offset = c*DHWN; - fp32 g = *(G + c); - fp32 mean = *(M + c); - fp32 var = *(V + c); - fp32 rstd = 1 / sqrt(var + epsilon); - fp32* px[TM]; - fp32* pdx[TM]; - fp32* pdy[TM]; +void batchnorm_backward(float *DX, float *DG, float *DB, + restrict read_only float *DY, + restrict read_only float *X, + restrict read_only float *G, + restrict read_only float *M, + restrict read_only float *V, + int DHWN, float rcpDHWN, float epsilon) { + int rx[TM] = 0 ... 
TM; + int c = get_range_id(1); + int offset = c*DHWN; + float g = *(G + c); + float mean = *(M + c); + float var = *(V + c); + float rstd = 1 / sqrt(var + epsilon); + float* px[TM]; + float* pdx[TM]; + float* pdy[TM]; px = X + rx + offset; pdy = DY + rx + offset; - fp32 dg[TM] = 0; - fp32 db[TM] = 0; - for(int32 i = 0; i < DHWN; i = i + TM){ - fp32 x[TM] = *px; - fp32 dy[TM] = *pdy; + float dg[TM] = 0; + float db[TM] = 0; + for(int i = 0; i < DHWN; i = i + TM){ + float x[TM] = *px; + float dy[TM] = *pdy; dg = dg + dy*(x - mean)*rstd; db = db + dy; px = px + TM; pdy = pdy + TM; } - fp32 sdg = __sum(dg); - fp32 sdb = __sum(db); - fp32 *pdg = DG + c; - fp32 *pdb = DB + c; + float sdg = __sum(dg); + float sdb = __sum(db); + float *pdg = DG + c; + float *pdb = DB + c; *pdg = sdg; *pdb = sdb; px = X + rx + offset; pdy = DY + rx + offset; pdx = DX + rx + offset; - for(int32 i = 0; i < DHWN; i = i + TM){ - fp32 x[TM] = *px; - fp32 dy[TM] = *pdy; - fp32 xhat[TM] = (x - mean) * rstd; - fp32 xtmp[TM] = (xhat * dg + db) * rcpDHWN; - fp32 dx[TM] = (dy - xtmp) * rstd * g; + for(int i = 0; i < DHWN; i = i + TM){ + float x[TM] = *px; + float dy[TM] = *pdy; + float xhat[TM] = (x - mean) * rstd; + float xtmp[TM] = (xhat * dg + db) * rcpDHWN; + float dx[TM] = (dy - xtmp) * rstd * g; *pdx = dx; px = px + TM; pdy = pdy + TM; diff --git a/lib/dnn/blocksparse/dot.cpp b/lib/dnn/blocksparse/dot.cpp index c7e3a9a85..9c7fd95d9 100644 --- a/lib/dnn/blocksparse/dot.cpp +++ b/lib/dnn/blocksparse/dot.cpp @@ -10,12 +10,8 @@ size_t dot::num_flops() const { return 2.*nblocks_*BS_*BS_*N_; } -bool dot::operator <(const base& other) const { - auto *y = dynamic_cast(&other); - if(!y) - return true; - return std::tie(N_, S_, C_, BS_, nlocks_, ab_ty_, c_ty_, op_) - < std::tie(y->N_, y->S_, y->C_, y->BS_, y->nlocks_, y->ab_ty_, y->c_ty_, y->op_); +std::vector dot::retune_params() const{ + return {N_, S_, C_, BS_, nlocks_, op_}; } std::vector dot::search_space() const { @@ -92,35 +88,35 @@ void dot::triton_c_src(std::ostream &os) const { std::string ldb1 = (op_ == FPROP) ? "*TK" : "" ; std::string result = R"( - const tunable int32 TM = {16, 32, 64, 128}; - const tunable int32 TN = {)" + std::to_string(BS_) + R"(}; - const tunable int32 TK = {)" + std::to_string(BS_) + R"(}; + const tunable int TM = {16, 32, 64, 128}; + const tunable int TN = {)" + std::to_string(BS_) + R"(}; + const tunable int TK = {)" + std::to_string(BS_) + R"(}; void bsdot(restrict read_only align(16) )" + ab_ty_ + R"( *A, restrict read_only align(16) )" + ab_ty_ + R"( *B, )" + c_ty_ + R"(* C, - int32 lda, int32 ldc, int32 N, - int32* lut, int32* locks, int32 nlocks){ - int32 ridx = get_range_id(0); - int32 ridy = get_range_id(1); - fp32 acc[TM, TN] = 0; - int32 rxa[TM] = ridx * TM + (0 ... TM); - int32 ryb[TN] = 0 ... TN; - int32 rka[TK] = 0 ... TK; - int32 rkb[TK] = 0 ... TK; - int1 checka[TM, TK] = (rxa < N)[:, newaxis]; - int32 offa[)" + sizea + "] = rxa[" + bca0 + "] + rka[" + bca1 + R"(]*lda; - int32 offb[)" + sizeb + "] = ryb[" + bcb0 + "]" + ldb0 + " + rkb[" + bcb1 + "]" + ldb1 + R"(; - int32 *header = lut + ridy * 4; - int32 offset = *(header + 0); - int32 K = *(header + 1); - int32 column = *(header + 2); - int32 lockid = *(header + 3); - int32 *plut = lut + offset * 2; - for(int32 k = K; k > 0; k = k - 1) + int lda, int ldc, int N, + int* lut, int* locks, int nlocks){ + int ridx = get_range_id(0); + int ridy = get_range_id(1); + float acc[TM, TN] = 0; + int rxa[TM] = ridx * TM + (0 ... TM); + int ryb[TN] = 0 ... TN; + int rka[TK] = 0 ... 
TK; + int rkb[TK] = 0 ... TK; + bool checka[TM, TK] = (rxa < N)[:, newaxis]; + int offa[)" + sizea + "] = rxa[" + bca0 + "] + rka[" + bca1 + R"(]*lda; + int offb[)" + sizeb + "] = ryb[" + bcb0 + "]" + ldb0 + " + rkb[" + bcb1 + "]" + ldb1 + R"(; + int *header = lut + ridy * 4; + int offset = *(header + 0); + int K = *(header + 1); + int column = *(header + 2); + int lockid = *(header + 3); + int *plut = lut + offset * 2; + for(int k = K; k > 0; k = k - 1) { - int32 ak = *(plut + 0); - int32 bk = *(plut + 1); + int ak = *(plut + 0); + int bk = *(plut + 1); )" + ab_ty_ + "* pa[" + sizea + R"(] = A + offa + ak * TK * lda; )" + ab_ty_ + "* pb[" + sizeb + R"(] = B + offb + bk * TK * TN; )" + ab_ty_ + " a[" + sizea + R"(] = checka ? *pa : 0; @@ -128,19 +124,19 @@ void dot::triton_c_src(std::ostream &os) const { acc = dot()" + usea + ", " + useb + R"(, acc); plut = plut + 2; } - int32 rxc[TM] = ridx * TM + (0 ... TM); - int32 ryc[TN] = column * TN + (0 ... TN); + int rxc[TM] = ridx * TM + (0 ... TM); + int ryc[TN] = column * TN + (0 ... TN); )" + c_ty_ + R"(" c[TM, TN] = acc; )" + c_ty_ + R"(* pc[TM, TN] = C + rxc[:, newaxis] + ryc[newaxis, :]*ldc; - int1 checkc[TM, TN] = (rxc < N)[:, newaxis]; + bool checkc[TM, TN] = (rxc < N)[:, newaxis]; if(lockid == 0) @checkc *pc = c; else { - int32 *plock = locks + ridx*nlocks + lockid - 1; - int32 *pcount = plock + get_num_program(0)*nlocks; + int *plock = locks + ridx*nlocks + lockid - 1; + int *pcount = plock + get_num_program(0)*nlocks; while(__atomic_cas(plock, 0, 1)); - int32 count = *pcount; + int count = *pcount; if(count == 0){ @checkc *pc = c; } diff --git a/lib/dnn/conv.cpp b/lib/dnn/conv.cpp index f54c63560..0f32455ea 100644 --- a/lib/dnn/conv.cpp +++ b/lib/dnn/conv.cpp @@ -98,20 +98,12 @@ conv::conv(int B, int NC, } // comparison for maps -bool conv::operator<(const base& other) const { - auto *y = dynamic_cast(&other); - if(!y) - return true; - return std::tie(NB_, NC_, AD_, AH_, AW_, - NF_, BD_, BH_, BW_, - pad_d_, pad_h_, pad_w_, - stride_d_, stride_h_, stride_w_, - a_ty_, b_ty_, ty_, bias_) - < std::tie(y->NB_, y->NC_, y->AD_, y->AH_, y->AW_, - y->NF_, y->BD_, y->BH_, y->BW_, - y->pad_d_, y->pad_h_, y->pad_w_, - y->stride_d_, y->stride_h_, y->stride_w_, - y->a_ty_, y->b_ty_, y->ty_, y->bias_); +std::vector conv::retune_params() const { + return {NB_, NC_, AD_, AH_, AW_, + NF_, BD_, BH_, BW_, + pad_d_, pad_h_, pad_w_, + stride_d_, stride_h_, stride_w_, + ty_, bias_}; } // clone @@ -549,114 +541,114 @@ void conv::triton_c_src(std::ostream &os) const { os << R"( -const tunable int32 TM = {16, 32, 64}; -const tunable int32 TN = {16, 32, 64}; -const tunable int32 TK = {)" << TK_ << R"(}; -const tunable int32 GZ = {1}; +const tunable int TM = {16, 32, 64}; +const tunable int TN = {16, 32, 64}; +const tunable int TK = {)" << TK_ << R"(}; +const tunable int GZ = {1}; )"; if(is_a_deltas_cst) - os << "__constant__ int32* delta = alloc_const int32[" + std::to_string(h_a_deltas_.size()) + "];\n"; + os << "__constant__ int* delta = alloc_const int[" + std::to_string(h_a_deltas_.size()) + "];\n"; if(b_lut_ && is_b_deltas_cst_) - os << "__constant__ int32* b_delta = alloc_const int32[" + std::to_string(h_b_deltas_.size()) + "];\n"; + os << "__constant__ int* b_delta = alloc_const int[" + std::to_string(h_b_deltas_.size()) + "];\n"; if(is_mask_cst_) - os << "__constant__ int32* masks = alloc_const int32[" + std::to_string(h_masks_.size()) + "];\n"; + os << "__constant__ int* masks = alloc_const int[" + std::to_string(h_masks_.size()) + "];\n"; os << R"( 
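/* Editorial note (not part of the original patch): from here the hunk rewrites
   the generated conv kernel's types from fp32/int32 to the C-like float/int
   names introduced by this commit's scanner.l change; the kernel logic is
   otherwise untouched. */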
void conv(read_only restrict )" << a_ty_ << R"( *a, read_only restrict )" << b_ty_ << R"( *b, - fp32 *c, - fp32 *bias, - int32 M, int32 N, int32 K, - int32 AH, int32 AW, - int32 BH, int32 BW, - int32 CH, int32 CW, - int32 NC, - int32 lda_n, int32 lda_c, int32 lda_d, int32 lda_h, int32 lda_w, - int32 ldb_c, int32 ldb_t, int32 ldb_r, int32 ldb_s, int32 ldb_k, - int32 ldc_n, int32 ldc_k, int32 ldc_m, int32 ldc_p, int32 ldc_q, - int32 pad_h, int32 pad_w, - int32 stride_h, int32 stride_w, - int32 upsample_h, int32 upsample_w, - int32 off_uh, int32 off_uw, - int32 off_uah, int32 off_uaw, - int32 off_uch, int32 off_ucw, - int32 *locks, int32 grid0, int32 grid1)"; + float *c, + float *bias, + int M, int N, int K, + int AH, int AW, + int BH, int BW, + int CH, int CW, + int NC, + int lda_n, int lda_c, int lda_d, int lda_h, int lda_w, + int ldb_c, int ldb_t, int ldb_r, int ldb_s, int ldb_k, + int ldc_n, int ldc_k, int ldc_m, int ldc_p, int ldc_q, + int pad_h, int pad_w, + int stride_h, int stride_w, + int upsample_h, int upsample_w, + int off_uh, int off_uw, + int off_uah, int off_uaw, + int off_uch, int off_ucw, + int *locks, int grid0, int grid1)"; if(!is_a_deltas_cst) - os << ", int32* delta"; + os << ", int* delta"; if(b_lut_ && !is_b_deltas_cst_) - os << ", int32* b_delta"; + os << ", int* b_delta"; if(!is_mask_cst_) - os << ", int32* masks"; + os << ", int* masks"; os << R"(){ - int32 rxa[TM] = get_global_range[TM](0); - int32 rb0[TN] = get_global_range[TN](1); - int32 rz = get_global_range[1](2); - int32 rka[TK] = 0 ... TK; - int32 rkb[TK] = 0 ... TK; - fp32 C[TM, TN] = 0; - int32 ldlut = )" + std::to_string(Luts_) + R"(; - int32 div = K / GZ; - int32 rem = K % GZ; + int rxa[TM] = get_global_range[TM](0); + int rb0[TN] = get_global_range[TN](1); + int rz = get_global_range[1](2); + int rka[TK] = 0 ... TK; + int rkb[TK] = 0 ... 
TK; + float C[TM, TN] = 0; + int ldlut = )" + std::to_string(Luts_) + R"(; + int div = K / GZ; + int rem = K % GZ; K = select(rz < rem, div, div + rem); - int32 offk = rz*div; + int offk = rz*div; rka = rka + offk; rkb = rkb + offk; - int32 rabh[TM] = rxa / CW; - int32 raw[TM] = rxa % CW; - int32 rab[TM] = rabh / CH; - int32 rah[TM] = rabh % CH; + int rabh[TM] = rxa / CW; + int raw[TM] = rxa % CW; + int rab[TM] = rabh / CH; + int rah[TM] = rabh % CH; rah = rah)" + upaw + R"( - off_uah; raw = raw)" + upah + R"( - off_uaw; - int32 ra0[TM] = rab*lda_n + rah*lda_h + raw*lda_w; - int32 ra)" + ax[0] + ax[1] + "[TK] = rka / " + redax[2] + R"(; - int32 ra)" + ax[2] + "[TK] = rka % " + redax[2] + R"(; - int32 ra)" + ax[0] + "[TK] = ra" + ax[0] + ax[1] + " / " + redax[1] + R"(; - int32 ra)" + ax[1] + "[TK] = ra" + ax[0] + ax[1] + " % " + redax[1] + R"(; + int ra0[TM] = rab*lda_n + rah*lda_h + raw*lda_w; + int ra)" + ax[0] + ax[1] + "[TK] = rka / " + redax[2] + R"(; + int ra)" + ax[2] + "[TK] = rka % " + redax[2] + R"(; + int ra)" + ax[0] + "[TK] = ra" + ax[0] + ax[1] + " / " + redax[1] + R"(; + int ra)" + ax[1] + "[TK] = ra" + ax[0] + ax[1] + " % " + redax[1] + R"(; rar = )" + flipr + R"( rar; ras = )" + flips + R"( ras; rar = )" + upar + R"( rar; ras = )" + upas + R"( ras; - int32 ra1[TK] = rac*lda_c + rar*lda_h + ras*lda_w; + int ra1[TK] = rac*lda_c + rar*lda_h + ras*lda_w; )" << a_ty_ << R"(* pa[TM, TK] = a + ra1[newaxis, :] + ra0[:, newaxis];)"; if(b_lut_){ os << R"( - int32 rb)" + ax[0] + ax[1] + "[TK] = rkb / " + redax[2] + R"(; - int32 rb)" + ax[2] + "[TK] = rkb % " + redax[2] + R"(; - int32 rb)" + ax[0] + "[TK] = rb" + ax[0] + ax[1] + " / " + redax[1] + R"(; - int32 rb)" + ax[1] + "[TK] = rb" + ax[0] + ax[1] + " % " + redax[1] + R"(; + int rb)" + ax[0] + ax[1] + "[TK] = rkb / " + redax[2] + R"(; + int rb)" + ax[2] + "[TK] = rkb % " + redax[2] + R"(; + int rb)" + ax[0] + "[TK] = rb" + ax[0] + ax[1] + " / " + redax[1] + R"(; + int rb)" + ax[1] + "[TK] = rb" + ax[0] + ax[1] + " % " + redax[1] + R"(; rbr = rbr*upsample_h + off_uh; rbs = rbs*upsample_w + off_uw; - int32 offdb[TK] = rkb % ldlut; - int32 rb1[TK] = rbc*ldb_c + rbr*ldb_r + rbs*ldb_s; - )" + b_delta_mem + R"( int32* pdb[TK] = b_delta + offdb + off_uw*ldlut + off_uh*ldlut*upsample_w; - int32 db[TK] = *pdb;)"; + int offdb[TK] = rkb % ldlut; + int rb1[TK] = rbc*ldb_c + rbr*ldb_r + rbs*ldb_s; + )" + b_delta_mem + R"( int* pdb[TK] = b_delta + offdb + off_uw*ldlut + off_uh*ldlut*upsample_w; + int db[TK] = *pdb;)"; } else{ os << R"( - int32 rb1[TK] = rkb)" + ldb0 + ";"; + int rb1[TK] = rkb)" + ldb0 + ";"; } os << R"( )" << b_ty_ << R"(* pb)" + BS + " = b + rb1" + bcb1 + " + rb0" + bcb0 + R"(*ldb_k; - int32 offda[TK] = rka % ldlut; - )" + a_delta_mem + R"( int32* pincd[TK] = delta + offda; - )" + a_delta_mem + R"( int32* pda[TK] = delta + ldlut + offda + off_uw*ldlut + off_uh*ldlut*upsample_w; - int32 da[TK] = *pda; - int32 incd[TK] = *pincd; - int32 maskh[TM] = pad_h + min(rah, 0) + max(rah + BH - AH, 0); - int32 maskw[TM] = pad_w + min(raw, 0) + max(raw + BW - AW, 0); - int32 offma = offk % ldlut; - )" + masks_mem + R"( int32* pm[TM] = masks + ldlut + offma + maskw*ldlut + maskh*ldlut*(2*pad_w + 1) + off_uw*ldlut*(2*pad_w+1)*(2*pad_h+1) + off_uh*ldlut*(2*pad_w+1)*(2*pad_h+1)*upsample_w; - )" + a_delta_mem + R"( int32* pincm[TM] = delta + offma; - int32 incm[TM] = *pincm; - int32 maska0[TM] = *pm; - int32 maska1[TK] = 1 << (0 ... 
TK); - int1 checka[TM, TK] = (maska0[:, newaxis] & maska1[newaxis, :]) > 0; - int1 checkb0[TN] = rb0 < N; - int1 checkb)" + BS + " = checkb0" + bcb0 + R"(; + int offda[TK] = rka % ldlut; + )" + a_delta_mem + R"( int* pincd[TK] = delta + offda; + )" + a_delta_mem + R"( int* pda[TK] = delta + ldlut + offda + off_uw*ldlut + off_uh*ldlut*upsample_w; + int da[TK] = *pda; + int incd[TK] = *pincd; + int maskh[TM] = pad_h + min(rah, 0) + max(rah + BH - AH, 0); + int maskw[TM] = pad_w + min(raw, 0) + max(raw + BW - AW, 0); + int offma = offk % ldlut; + )" + masks_mem + R"( int* pm[TM] = masks + ldlut + offma + maskw*ldlut + maskh*ldlut*(2*pad_w + 1) + off_uw*ldlut*(2*pad_w+1)*(2*pad_h+1) + off_uh*ldlut*(2*pad_w+1)*(2*pad_h+1)*upsample_w; + )" + a_delta_mem + R"( int* pincm[TM] = delta + offma; + int incm[TM] = *pincm; + int maska0[TM] = *pm; + int maska1[TK] = 1 << (0 ... TK); + bool checka[TM, TK] = (maska0[:, newaxis] & maska1[newaxis, :]) > 0; + bool checkb0[TN] = rb0 < N; + bool checkb)" + BS + " = checkb0" + bcb0 + R"(; )" << a_ty_ << R"( a[TM, TK] = checka ? *pa : 0; )" << b_ty_ << R"( b)" + BS + R"( = checkb ? *pb : 0; - int32 rkamin[TK] = rka - offk + TK; - for(int32 k = K; k > 0; k = k - TK){ + int rkamin[TK] = rka - offk + TK; + for(int k = K; k > 0; k = k - TK){ C = dot(a, )" + useb + R"(, C); pa = pa + da[newaxis, :]; pb = pb + )" + inc_pb + R"(; @@ -673,7 +665,7 @@ if(b_lut_){ pm = pm + incm; pincm = pincm + incm; incm = *pincm; - int1 checka1[TK] = (rkamin < k); + bool checka1[TK] = (rkamin < k); maska0 = *pm; checka = (maska0[:, newaxis] & maska1[newaxis, :]) > 0; checka = checka && checka1[newaxis,:]; @@ -681,31 +673,31 @@ if(b_lut_){ checkb = checkb && (k > TK); @checkb b = *pb; } - int32 rxc[TM] = get_global_range[TM](0); - int32 rc1[TN] = get_global_range[TN](1); - int32 rcn[TM] = rxc / (CH*CW); - int32 rcpq[TM] = rxc % (CH*CW); - int32 rcp[TM] = rcpq / CW; - int32 rcq[TM] = rcpq % CW; + int rxc[TM] = get_global_range[TM](0); + int rc1[TN] = get_global_range[TN](1); + int rcn[TM] = rxc / (CH*CW); + int rcpq[TM] = rxc % (CH*CW); + int rcp[TM] = rcpq / CW; + int rcq[TM] = rcpq % CW; rcp = rcp * upsample_h + off_uch; rcq = rcq * upsample_w + off_ucw; - int1 checkc1[TN] = rc1 < N; - int32 rc0[TM] = rcn * ldc_n + rcp * ldc_p + rcq * ldc_q; - fp32* pc[TM, TN] = c + rc1[newaxis, :]*ldc_k + rc0[:, newaxis]; - int1 checkc0[TM] = rxc < M; - int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - int32 ridx = get_range_id(0); - int32 ridy = get_range_id(1); - int32 *plock = locks + ridx + ridy*grid0; + bool checkc1[TN] = rc1 < N; + int rc0[TM] = rcn * ldc_n + rcp * ldc_p + rcq * ldc_q; + float* pc[TM, TN] = c + rc1[newaxis, :]*ldc_k + rc0[:, newaxis]; + bool checkc0[TM] = rxc < M; + bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; + int ridx = get_range_id(0); + int ridy = get_range_id(1); + int *plock = locks + ridx + ridy*grid0; while(__atomic_cas(plock, 0, 1) == 1); - int32 *pcount = plock + grid0*grid1; - int32 count = *pcount; - int32 countp1 = select(count == GZ - 1, 0, count + 1); + int *pcount = plock + grid0*grid1; + int count = *pcount; + int countp1 = select(count == GZ - 1, 0, count + 1); if(count == 0) {)"; if(bias_ && ty_==FPROP){ os << R"( - fp32* pbias[TN] = bias + rc1; - fp32 bias[TN] = checkc1 ? *pbias : 0; + float* pbias[TN] = bias + rc1; + float bias[TN] = checkc1 ? 
*pbias : 0; C = C + bias[newaxis, :];)"; } os << R"( diff --git a/lib/dnn/dot.cpp b/lib/dnn/dot.cpp index 1b5e061d3..3b9a2e300 100644 --- a/lib/dnn/dot.cpp +++ b/lib/dnn/dot.cpp @@ -10,11 +10,11 @@ namespace dnn{ dot::dot(int M, int N, int K, bool AT, bool BT, std::string a_ty, std::string b_ty, - unsigned alignment_lda, unsigned alignment_ldb) + unsigned align_lda, unsigned align_ldb, unsigned align_ldc) : base("matmul"), M_(M), N_(N), K_(K), AT_(AT), BT_(BT), a_ty_(a_ty), b_ty_(b_ty), - align_lda_(alignment_lda), align_ldb_(alignment_ldb), + align_lda_(align_lda), align_ldb_(align_ldb), align_ldc_(align_ldc), locks_(nullptr) { } @@ -23,15 +23,10 @@ size_t dot::num_flops() const { return 2.*M_*N_*K_; } -// comparison for maps -bool dot::operator<(const base& other) const { - auto *y = dynamic_cast(&other); - if(!y) - return true; - return std::tie(M_, N_, K_, AT_, BT_, - a_ty_, b_ty_, align_lda_, align_ldb_) - < std::tie(y->M_, y->N_, y->K_, y->AT_, y->BT_, - y->a_ty_, y->b_ty_, y->align_lda_, y->align_ldb_); +// retune parameters +std::vector dot::retune_params() const { + return {M_, N_, K_, AT_, BT_, + (int)align_lda_, (int)align_ldb_}; } // clone @@ -101,45 +96,45 @@ void dot::triton_c_src(std::ostream &os) const { std::string align_ldb_str = "multiple_of(" + std::to_string(align_ldb_) + ")"; std::string res = R"( -const tunable int32 TM = {16, 32, 64, 128, 256}; -const tunable int32 TN = {16, 32, 64, 128, 256}; -const tunable int32 TK = {32}; -const tunable int32 GZ = {1}; +const tunable int TM = {16, 32, 64, 128}; +const tunable int TN = {16, 32, 64, 128}; +const tunable int TK = {32}; +const tunable int GZ = {1}; void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, restrict read_only align(16) )" + b_ty_ + R"( *B, - fp32 *C, - int32 M, int32 N, int32 K, - )" + align_lda_str + R"( int32 lda, )" + align_ldb_str + R"(" int32 ldb, int32 ldc, - int32 bound, int32 *locks, int32 grid0, int32 grid1) { - int32 ridx = get_range_id(0); - int32 ridy = get_range_id(1); - int32 rxa[TM] = ridx * TM + (0 ... TM); - int32 ryb[TN] = ridy * TN + (0 ... TN); - int32 rka[TK] = 0 ... TK; - int32 rkb[TK] = 0 ... TK; - fp32 c[TM, TN] = 0; + restrict read_only align(16) float *C, + int M, int N, int K, + )" + align_lda_str + R"( int lda, )" + align_ldb_str + R"(" int ldb, int ldc, + int bound, int *locks, int grid0, int grid1) { + int ridx = get_range_id(0); + int ridy = get_range_id(1); + int rxa[TM] = ridx * TM + (0 ... TM); + int ryb[TN] = ridy * TN + (0 ... TN); + int rka[TK] = 0 ... TK; + int rkb[TK] = 0 ... TK; + float c[TM, TN] = 0; )" + a_ty_ + R"(* pa[)" + AS + "] = A + rka" + bca0 + lda0 + " + rxa" + bca1 + lda1 + R"(; )" + b_ty_ + R"(* pb[)" + BS + "] = B + rkb" + bcb0 + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; - int1 checka[)" + AS + R"(] = (rka < K))" + bca0 + " && (rxa < M)" + bca1 + R"(; - int1 checkb[)" + BS + R"(] = (rkb < K))" + bcb0 + " && (ryb < N)" + bcb1 + R"(; + bool checka[)" + AS + R"(] = (rka < K))" + bca0 + " && (rxa < M)" + bca1 + R"(; + bool checkb[)" + BS + R"(] = (rkb < K))" + bcb0 + " && (ryb < N)" + bcb1 + R"(; )" + a_ty_ + R"( a[)" + AS + R"(] = checka ? *pa : 0; )" + b_ty_ + R"( b[)" + BS + R"(] = checkb ? *pb : 0; - for(int32 k = K; k > 0; k = k - TK){ + for(int k = K; k > 0; k = k - TK){ c = dot()" + usea + ", " + useb + R"(, c); pa = pa + TK)" + lda0 + R"(; pb = pb + TK)" + ldb0 + R"(; - int1 checka[)" + AS + R"(] = k > TK; - int1 checkb[)" + BS + R"(] = k > TK; + bool checka[)" + AS + R"(] = k > TK; + bool checkb[)" + BS + R"(] = k > TK; a = checka ? 
*pa : 0; b = checkb ? *pb : 0; } - int32 rxc[TM] = ridx * TM + (0 ... TM); - int32 ryc[TN] = ridy * TN + (0 ... TN); - int1 checkc0[TM] = rxc < M; - int1 checkc1[TN] = ryc < N; - int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - fp32* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; + int rxc[TM] = ridx * TM + (0 ... TM); + int ryc[TN] = ridy * TN + (0 ... TN); + bool checkc0[TM] = rxc < M; + bool checkc1[TN] = ryc < N; + bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; + float* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; @checkc *pc = c; } )"; diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index 3bf5e1035..5b50a73b4 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -28,7 +28,7 @@ shift::shift(int B, int C, layout_(layout){ // std::cout << B_ << " " << C_ << " " << F_ << " " << stride_h_ << " " << stride_w_ << " " << a_ty_ << " " << b_ty_ << " " << ty_ << " " << layout_ << std::endl; // max number of channels - TK_ = (ty == FPROP && a_ty_ == "fp32") ? 8 : 32; + TK_ = (ty == FPROP && a_ty_ == "float") ? 8 : 32; MAX_C_ = 8192 + TK_; // activation sizes CD_ = AD_ / stride_d_; @@ -204,26 +204,15 @@ size_t shift::ldb() const size_t shift::ldc() const { return M_; } -bool shift::operator <(const base& other) const{ - auto *y = dynamic_cast(&other); - if(!y) - return true; - return std::tie(B_, C_, F_, - AD_, AH_, AW_, - BD_, BH_, BW_, - CD_, CH_, CW_, - shift_h_, shift_w_, - stride_h_, stride_w_, - layout_, op_, - bias_) - < std::tie(y->B_, y->C_, y->F_, - y->AD_, y->AH_, y->AW_, - y->BD_, y->BH_, y->BW_, - y->CD_, y->CH_, y->CW_, - y->shift_h_, y->shift_w_, - y->stride_h_, y->stride_w_, - y->layout_, y->op_, - y->bias_); +std::vector shift::retune_params() const { + return {B_, C_, F_, + AD_, AH_, AW_, + BD_, BH_, BW_, + CD_, CH_, CW_, + (int64_t)shift_h_, (int64_t)shift_w_, + stride_h_, stride_w_, + layout_, op_, + bias_}; } void shift::init_impl(driver::stream *stream, driver::cu_module *module, triton::runtime::launch_information info) { @@ -325,56 +314,56 @@ void shift::triton_c_src(std::ostream &os) const { if(is_chwn) { return R"( - int32 )" + rx + "wh[" + sz + "] = " + rkx + " / " + B + R"(; - int32 )" + rx + "b[" + sz + "] = " + rkx + " % " + B + R"(; - int32 )" + rx + "w[" + sz + "] = (" + rx + "wh % " + CW + R"() + pad_w; - int32 )" + rx + "h[" + sz + "] = (" + rx + "wh / " + CW + R"() + pad_h;)"; + int )" + rx + "wh[" + sz + "] = " + rkx + " / " + B + R"(; + int )" + rx + "b[" + sz + "] = " + rkx + " % " + B + R"(; + int )" + rx + "w[" + sz + "] = (" + rx + "wh % " + CW + R"() + pad_w; + int )" + rx + "h[" + sz + "] = (" + rx + "wh / " + CW + R"() + pad_h;)"; } else { return R"( - int32 )" + rx + "bh[" + sz + "] = " + rkx + " / " + CW + R"(; - int32 )" + rx + "w[" + sz + "] = (" + rkx + " % " + CW + R"() + pad_w; - int32 )" + rx + "h[" + sz + "] = (" + rx + "bh % " + CH + R"() + pad_h; - int32 )" + rx + "b[" + sz + "] = " + rx + "bh / " + CH + ";"; + int )" + rx + "bh[" + sz + "] = " + rkx + " / " + CW + R"(; + int )" + rx + "w[" + sz + "] = (" + rkx + " % " + CW + R"() + pad_w; + int )" + rx + "h[" + sz + "] = (" + rx + "bh % " + CH + R"() + pad_h; + int )" + rx + "b[" + sz + "] = " + rx + "bh / " + CH + ";"; } }; std::string result = R"( -const tunable int32 TM = {16, 32, 64, 128}; -const tunable int32 TN = {16, 32, 64, 128}; -const tunable int32 TK = {)" + std::to_string(TK_) + "};"; +const tunable int TM = {16, 32, 64, 128}; +const tunable int TN = {16, 32, 64, 128}; +const tunable int TK = {)" + 
std::to_string(TK_) + "};"; if(op_ == WGRAD) - result += "const tunable int32 GZ = {1};"; + result += "const tunable int GZ = {1};"; else - result += "const tunable int32 GZ = {1};"; + result += "const tunable int GZ = {1};"; result += R"( -__constant__ int32* delta_a = alloc_const int32[)" + std::to_string(MAX_C_) + R"(]; +__constant__ int* delta_a = alloc_const int[)" + std::to_string(MAX_C_) + R"(]; void shift(restrict read_only align(16) )" + a_ty_ + R"( *A, restrict read_only align(16) )" + b_ty_ + R"( *B, )" + c_ty_ + R"( *C, - int32 M, int32 N, int32 K, - int32 stride_h, int32 stride_w, - multiple_of(8) int32 lda_b, multiple_of(8) int32 lda_w, multiple_of(8) int32 lda_h, multiple_of(8) int32 lda_c, - multiple_of(8) int32 ldb_b, multiple_of(8) int32 ldb_w, multiple_of(8) int32 ldb_h, multiple_of(8) int32 ldb_c, - multiple_of(8) int32 ldc_b, multiple_of(8) int32 ldc_w, multiple_of(8) int32 ldc_h, multiple_of(8) int32 ldc_c, - int32 NB, - int32 AH, int32 AW, - int32 BH, int32 BW, - int32 CH, int32 CW, - int32* locks, int32 grid0, int32 grid1, int32 grid2) { - int32 ridx = get_range_id(0); - int32 ridy = get_range_id(1); - int32 rz = get_range_id(2); - int32 rxa[TM] = ridx*TM + (0 ... TM); - int32 ryb[TN] = ridy*TN + (0 ... TN); - int32 rka[TK] = 0 ... TK; - int32 rkb[TK] = 0 ... TK; - fp32 acc[TM, TN] = 0; - int32 pad_h = BH / 2; - int32 pad_w = BW / 2;)"; + int M, int N, int K, + int stride_h, int stride_w, + multiple_of(8) int lda_b, multiple_of(8) int lda_w, multiple_of(8) int lda_h, multiple_of(8) int lda_c, + multiple_of(8) int ldb_b, multiple_of(8) int ldb_w, multiple_of(8) int ldb_h, multiple_of(8) int ldb_c, + multiple_of(8) int ldc_b, multiple_of(8) int ldc_w, multiple_of(8) int ldc_h, multiple_of(8) int ldc_c, + int NB, + int AH, int AW, + int BH, int BW, + int CH, int CW, + int* locks, int grid0, int grid1, int grid2) { + int ridx = get_range_id(0); + int ridy = get_range_id(1); + int rz = get_range_id(2); + int rxa[TM] = ridx*TM + (0 ... TM); + int ryb[TN] = ridy*TN + (0 ... TN); + int rka[TK] = 0 ... TK; + int rkb[TK] = 0 ... 
TK; + float acc[TM, TN] = 0; + int pad_h = BH / 2; + int pad_w = BW / 2;)"; /* A offsets */ if(op_ == FPROP){ @@ -382,49 +371,49 @@ if(op_ == FPROP){ compute_bhw("ra", "TM", "rxa") + R"( raw = raw * )" + stride_w + R"(; rah = rah * )" + stride_h + R"(; - int32 offxa[TM] = rab*)" + lda_b + R"( + raw*lda_w + rah*lda_h; - int32 offa0[TM, TK] = offxa[:, newaxis]; - __constant__ int32* pd[TK] = delta_a + rka; - multiple_of(8) int32 d[TK] = *pd; - int32 offa1[TM, TK] = d[newaxis, :];)"; + int offxa[TM] = rab*)" + lda_b + R"( + raw*lda_w + rah*lda_h; + int offa0[TM, TK] = offxa[:, newaxis]; + __constant__ int* pd[TK] = delta_a + rka; + multiple_of(8) int d[TK] = *pd; + int offa1[TM, TK] = d[newaxis, :];)"; } if(op_ == BPROP){ result += compute_bhw("ra", "TM", "rxa") + R"( - int32 offxa[TM] = rab*)" + lda_b + R"( + raw*lda_w + rah*lda_h; - int32 offa0[TM, TK] = offxa[:, newaxis]; - int32 offa1[TM, TK] = rka[newaxis, :] * lda_c;)"; + int offxa[TM] = rab*)" + lda_b + R"( + raw*lda_w + rah*lda_h; + int offa0[TM, TK] = offxa[:, newaxis]; + int offa1[TM, TK] = rka[newaxis, :] * lda_c;)"; } if(op_ == WGRAD){ result += compute_bhw("ra", "TK", "rka") + R"( - int32 offa0[TK, TM] = rxa[newaxis, :] * lda_c; - int32 offxa[TK] = rab*)" + lda_b + R"( + raw*lda_w + rah*lda_h; - int32 offa1[TK, TM] = offxa[:, newaxis];)"; + int offa0[TK, TM] = rxa[newaxis, :] * lda_c; + int offxa[TK] = rab*)" + lda_b + R"( + raw*lda_w + rah*lda_h; + int offa1[TK, TM] = offxa[:, newaxis];)"; } /* B offsets */ if(op_ == FPROP){ result += R"( - int32 offb0[TN, TK] = ryb[:, newaxis]; - int32 offb1[TN, TK] = rkb[newaxis, :] * ldb_c;)"; + int offb0[TN, TK] = ryb[:, newaxis]; + int offb1[TN, TK] = rkb[newaxis, :] * ldb_c;)"; } if(op_ == BPROP){ result += R"( - int32 offb0[TK, TN] = ryb[newaxis, :] * ldb_c; - int32 offb1[TK, TN] = rkb[:, newaxis];)"; + int offb0[TK, TN] = ryb[newaxis, :] * ldb_c; + int offb1[TK, TN] = rkb[:, newaxis];)"; } if(op_ == WGRAD){ result += compute_bhw("rb", "TK", "rkb") + R"( - __constant__ int32* pd[TN] = delta_a + ryb; - multiple_of(8) int32 d[TN] = *pd; - multiple_of(8) int32 shift[TK, TN] = d[newaxis, :]; + __constant__ int* pd[TN] = delta_a + ryb; + multiple_of(8) int d[TN] = *pd; + multiple_of(8) int shift[TK, TN] = d[newaxis, :]; rbw = rbw * )" + stride_w + R"(; rbh = rbh * )" + stride_h + R"(; - int32 offkb[TK] = rbb*)" + ldb_b + R"( + rbw*ldb_w + rbh*ldb_h; - int32 offb0[TK, TN] = ryb[newaxis, :] * ldb_c; - int32 offb1[TK, TN] = offkb[:, newaxis]; + int offkb[TK] = rbb*)" + ldb_b + R"( + rbw*ldb_w + rbh*ldb_h; + int offb0[TK, TN] = ryb[newaxis, :] * ldb_c; + int offb1[TK, TN] = offkb[:, newaxis]; )" + a_ty_ + "* pa_base[" + AS + R"(] = A + offa0; )" + b_ty_ + "* pb_base[" + BS + R"(] = B + offb0 + shift; )" + a_ty_ + "* pa[" + AS + R"(] = pa_base + offa1; @@ -439,14 +428,14 @@ else{ /* Main loop */ /* Increment A pointers */ result += R"( - int1 checka[)" + AS + "] = (rka < K)" + bca0 + R"(; - int1 checkb[)" + BS + "] = (rkb < K)" + bcb0 + R"(; + bool checka[)" + AS + "] = (rka < K)" + bca0 + R"(; + bool checkb[)" + BS + "] = (rkb < K)" + bcb0 + R"(; )" + a_ty_ + " a[" + AS + R"(] = checka ? *pa : 0; )" + b_ty_ + " b[" + BS + R"(] = checkb ? 
*pb : 0; - for(int32 k = K; k > 0; k = k - TK){ + for(int k = K; k > 0; k = k - TK){ acc = dot()" + usea + "," + useb + R"(, acc); - int1 checka[)" + AS + R"(] = k > TK; - int1 checkb[)" + BS + R"(] = k > TK;)"; + bool checka[)" + AS + R"(] = k > TK; + bool checkb[)" + BS + R"(] = k > TK;)"; /* Increment A pointers */ if(op_ == FPROP){ @@ -490,8 +479,8 @@ if(op_ == BPROP){ result += R"( b = checkb ? *pb : 0; } - int32 rxc[TM] = ridx*TM + (0 ... TM); - int32 ryc[TN] = ridy*TN + (0 ... TN);)"; + int rxc[TM] = ridx*TM + (0 ... TM); + int ryc[TN] = ridy*TN + (0 ... TN);)"; /* C offsets */ if(op_ == BPROP){ @@ -499,26 +488,26 @@ if(op_ == BPROP){ compute_bhw("rc", "TM", "rxc") + R"( rcw = rcw * )" + stride_w + R"(; rch = rch * )" + stride_h + R"(; - int32 offxc[TM] = rcb*)" + ldc_b + R"( + rcw*ldc_w + rch*ldc_h;)"; + int offxc[TM] = rcb*)" + ldc_b + R"( + rcw*ldc_w + rch*ldc_h;)"; } if(op_ == FPROP){ result += compute_bhw("rc", "TM", "rxc") + R"( - int32 offxc[TM] = rcb*)" + ldc_b + R"( + rcw*ldc_w + rch*ldc_h;)"; + int offxc[TM] = rcb*)" + ldc_b + R"( + rcw*ldc_w + rch*ldc_h;)"; } if(op_ == WGRAD){ result += R"( - int32 offxc[TM] = rxc;)"; + int offxc[TM] = rxc;)"; } result += R"(" )" + c_ty_ + R"( c[TM, TN] = acc; )" + c_ty_ + R"(* pc[TM, TN] = C + offxc[:, newaxis] + ryc[newaxis, :]*ldc_c; - int1 checkc0[TM] = rxc < M; - int1 checkc1[TN] = ryc < N; - int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];)"; + bool checkc0[TM] = rxc < M; + bool checkc1[TN] = ryc < N; + bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];)"; if(op_ == BPROP){ result += R"( - __constant__ int32* pd[TN] = delta_a + ryc; + __constant__ int* pd[TN] = delta_a + ryc; )" + c_ty_ + R"(* shift_pc[TM, TN] = pc + (*pd)[newaxis, :]; @checkc *shift_pc = c; )"; diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index 928ec0812..1ce0a77e6 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -174,8 +174,15 @@ jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t ben std::lock_guard lock(mutex); for(ir::metaparameter *mp: mps) mp->set_value(params[i++]); +// for(size_t i = 0; i < params.size(); i++) +// std::cout << ((i==0)?"":", ") << params[i] << std::flush; +// std::cout << std::endl; passes_0.tune.init(tt_module_0); passes_0.tune.check_constraints(errors); +// for(auto x: errors) +// for(auto e: x.second){ +// std::cout << x.first->get_name() << ": " << e << std::endl; +// } } if(!errors.empty()) return; @@ -212,9 +219,9 @@ jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t ben best.perf = perf; best.params = params; } - for(size_t i = 0; i < params.size(); i++) - std::cout << ((i==0)?"":", ") << params[i] << std::flush; - std::cout << ", " << perf << " [ " << best.perf << " ] " << std::endl; +// for(size_t i = 0; i < params.size(); i++) +// std::cout << ((i==0)?"":", ") << params[i] << std::flush; +// std::cout << ", " << perf << " [ " << best.perf << " ] " << std::endl; } }; From 6be532c6a20e69fee6c1fb232984c9a1852939ac Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 2 Aug 2019 20:56:40 -0700 Subject: [PATCH 275/494] [codegen][selection] adding support for reduction along arbitrary axis --- include/triton/codegen/selection.h | 3 +- lib/codegen/selection.cpp | 73 +++++++++++++++++++++--------- lib/codegen/shmem_allocation.cpp | 20 +++++--- lib/codegen/tune.cpp | 4 +- lib/runtime/jit.cpp | 6 +-- 5 files changed, 73 insertions(+), 33 deletions(-) diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h 
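The patch below generalizes reductions from whole-tile-to-scalar to a sum along one chosen axis: the reduced axis is erased from the operand shape and the surviving dimensions describe the result. A standalone sketch of that shape rule (types simplified; reduced_shape is a name invented here):

#include <cassert>
#include <vector>

// [TM, TN] reduced along axis 0 becomes [TN]; reducing a 1-D tile leaves a
// single element, matching the shapes.push_back(1) fallback in the patch.
std::vector<unsigned> reduced_shape(std::vector<unsigned> shapes, unsigned axis) {
  assert(axis < shapes.size());
  shapes.erase(shapes.begin() + axis);
  if (shapes.empty())
    shapes.push_back(1);
  return shapes;
}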
index 317fc7f2b..3f7e5686c 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -43,6 +43,7 @@ public: virtual void set_value(indices_t idx, llvm::Value *v) = 0; virtual llvm::Value* get_value(indices_t idx) = 0; llvm::Type *get_ty() const { return ty_; } + shapes_t get_shapes() const { return shapes_; } protected: llvm::Type *ty_; @@ -54,7 +55,6 @@ private: void extract_constant(llvm::Value *arg, llvm::Value *&non_cst, llvm::Value *&cst); void extract_constant(const indices_t &arg_idx, indices_t &non_cst_idx, indices_t &cst_idx); - llvm::Value* shared_offset(indices_t idx); public: shared_tile(llvm::Type* ty, const shapes_t &shapes, llvm::Value* ptr, llvm::IRBuilder<> &builder, llvm::Value* offset = nullptr); @@ -65,6 +65,7 @@ public: llvm::Value* get_value(indices_t idx); llvm::Value* get_pointer() { return ptr_; } llvm::Value* get_offset() { return offset_; } + static llvm::Value* shared_offset(llvm::IRBuilder<>& builder, const shapes_t& shapes, indices_t idx); private: llvm::Value *ptr_; diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 84326529d..7e981672a 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -131,11 +131,11 @@ void shared_tile::extract_constant(const indices_t &arg_idx, indices_t &non_cst_ } -Value* shared_tile::shared_offset(indices_t idx) { - Value *result = builder_.getInt32(0); - result = builder_.CreateAdd(result, idx[0]); +Value* shared_tile::shared_offset(llvm::IRBuilder<> &builder, const shapes_t& shapes, indices_t idx) { + Value *result = builder.getInt32(0); + result = builder.CreateAdd(result, idx[0]); for(size_t i = 1; i < idx.size(); i++) - result = builder_.CreateAdd(result, builder_.CreateMul(idx[i], builder_.getInt32(shapes_[i-1]))); + result = builder.CreateAdd(result, builder.CreateMul(idx[i], builder.getInt32(shapes[i-1]))); return result; } @@ -145,7 +145,7 @@ shared_tile::shared_tile(Type *ty, const shapes_t &shapes, Value *ptr, llvm::IRB } void shared_tile::set_value(indices_t idx, Value *value) { - Value *ptr = builder_.CreateGEP(ptr_, shared_offset(idx)); + Value *ptr = builder_.CreateGEP(ptr_, shared_offset(builder_, shapes_, idx)); unsigned addr_space = ptr->getType()->getPointerAddressSpace(); ptr = builder_.CreateBitCast(ptr, value->getType()->getPointerTo(addr_space)); builder_.CreateStore(value, ptr); @@ -176,7 +176,7 @@ Value* shared_tile::get_value(indices_t idx) { // if(isa<Instruction>(non_cst_idx.front())){ // builder_.SetInsertPoint((Instruction*)non_cst_idx.front()); // } - base_ptr = builder_.CreateGEP(ptr_, shared_offset(non_cst_idx)); + base_ptr = builder_.CreateGEP(ptr_, shared_offset(builder_, shapes_, non_cst_idx)); if(vector_size_ > 1){ Type *vec_ty = VectorType::get(ty, vector_size); Type *vec_ptr_ty = PointerType::get(vec_ty, base_ptr->getType()->getPointerAddressSpace()); @@ -184,7 +184,7 @@ Value* shared_tile::get_value(indices_t idx) { } // builder_.SetInsertPoint(store); } - Value *offset = shared_offset(cst_idx); + Value *offset = shared_offset(builder_, shapes_, cst_idx); Value *div = offset; if(vector_size_ > 1) div = builder_.CreateUDiv(offset, builder_.getInt32(vector_size_)); @@ -824,23 +824,39 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & return; } if(auto *x = dynamic_cast<ir::reduce_inst*>(ins)){ - Value *partial = nullptr; + std::map<indices_t, Value*> partial; distributed_tile* op = (distributed_tile*)tmap_.at(ins->get_operand(0)); + size_t axis = 0; + unsigned num_warps = params_->get_num_threads() / 32; + std::vector<unsigned> shapes =
op->get_shapes(); + shapes.erase(shapes.begin() + axis); + if(shapes.empty()) + shapes.push_back(1); + // reduce within thread op->for_each([&](indices_t idx){ + indices_t pidx = idx; + pidx.erase(pidx.begin() + axis); + if(pidx.empty()) + pidx.push_back(builder.getInt32(0)); Value *current = op->get_value(idx); - if(partial == nullptr) - partial = current; + if(partial.find(pidx) == partial.end()) + partial[pidx] = current; else - partial = builder.CreateFAdd(partial, current); + partial[pidx] = builder.CreateFAdd(partial[pidx], current); }); + // reduce within warp Value *shfl = Intrinsic::getDeclaration(builder.GetInsertBlock()->getModule(), Intrinsic::nvvm_shfl_sync_bfly_f32); - for (int i = 16; i > 0; i >>= 1){ - Value *rhs = builder.CreateCall(shfl, {builder.getInt32(0xffffffff), partial, - builder.getInt32(i), builder.getInt32(0x1f)}); - partial = builder.CreateFAdd(partial, rhs); + for (int i = 16; i > 0; i >>= 1) + for(auto& x: partial) + { + Value *rhs = builder.CreateCall(shfl, {builder.getInt32(0xffffffff), x.second, + builder.getInt32(i), + builder.getInt32(0x1f)}); + x.second = builder.CreateFAdd(x.second, rhs); } + // reduce within block Value *tid = tgt_->get_local_id(module, builder, 0); BasicBlock *partial_reduce_do = BasicBlock::Create(ctx, "partial_reduce_do", fn); @@ -853,10 +869,15 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & unsigned addr_space = sh_mem_ptr_->getType()->getPointerAddressSpace(); Type *ptr_ty = PointerType::get(builder.getFloatTy(), addr_space); Value *sh_mem_ptr = builder.CreateBitCast(sh_mem_ptr_, ptr_ty); - Value *write_ptr = builder.CreateGEP(sh_mem_ptr, warp_id); - builder.CreateStore(partial, write_ptr); + for(auto& x: partial){ + Value *offset = shared_tile::shared_offset(builder, shapes, x.first); + offset = builder.CreateAdd(offset, builder.CreateMul(warp_id, builder.getInt32(shapes[0]))); + Value *write_ptr = builder.CreateGEP(sh_mem_ptr, offset); + builder.CreateStore(x.second, write_ptr); + } builder.CreateBr(partial_reduce_done); builder.SetInsertPoint(partial_reduce_done); + // Final reduction with the first warp tgt_->add_barrier(module, builder); BasicBlock *final_reduce_do = BasicBlock::Create(ctx, "final_reduce_do", fn); @@ -865,11 +886,21 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & final_reduce_do, final_reduce_done); builder.SetInsertPoint(final_reduce_do); Value *read_ptr = builder.CreateGEP(sh_mem_ptr, tid); - Value *result = builder.CreateLoad(read_ptr); + BasicBlock *read_shmem_do = BasicBlock::Create(ctx, "read_shmem_do", fn); + BasicBlock *read_shmem_done = BasicBlock::Create(ctx, "read_shmem_done", fn); + builder.CreateCondBr(builder.CreateICmpULT(id_in_warp, builder.getInt32(num_warps)), + read_shmem_do, read_shmem_done); + builder.SetInsertPoint(read_shmem_do); + Value *loaded= builder.CreateLoad(read_ptr); + builder.CreateBr(read_shmem_done); + builder.SetInsertPoint(read_shmem_done); + Value *result = builder.CreatePHI(loaded->getType(), 2); + ((PHINode*)result)->addIncoming(ConstantFP::get(loaded->getType(), (double)0), final_reduce_do); + ((PHINode*)result)->addIncoming(loaded, read_shmem_do); for (int i = params_->get_num_threads() / 64; i > 0; i >>= 1){ - Value *rhs = builder.CreateCall(shfl, {result, builder.getInt32(i), - builder.getInt32(0x1f), builder.getInt32(0xffffffff)}); - builder.CreateFAdd(result, rhs); + Value *rhs = builder.CreateCall(shfl, {builder.getInt32(0xffffffff), result, + builder.getInt32(i), builder.getInt32(0x1f)}); 
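The nvvm_shfl_sync_bfly_f32 calls in this hunk implement butterfly reductions: at each step every lane exchanges its partial with the lane whose id differs in one bit, so after five steps all 32 lanes of a warp hold the full sum. A plain C++ model of the data movement (illustration only; on the device this happens in registers, and butterfly_sum is a name invented here):

#include <array>

std::array<float, 32> butterfly_sum(std::array<float, 32> lanes) {
  for (int i = 16; i > 0; i >>= 1) {
    std::array<float, 32> next{};
    for (int l = 0; l < 32; ++l)
      next[l] = lanes[l] + lanes[l ^ i]; // shfl.bfly partner = lane id XOR i
    lanes = next;
  }
  return lanes; // every lane now holds the same total
}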
+ result = builder.CreateFAdd(result, rhs); } builder.CreateStore(result, read_ptr); builder.CreateBr(final_reduce_done); diff --git a/lib/codegen/shmem_allocation.cpp b/lib/codegen/shmem_allocation.cpp index 699406f08..b4a903c1a 100644 --- a/lib/codegen/shmem_allocation.cpp +++ b/lib/codegen/shmem_allocation.cpp @@ -42,17 +42,25 @@ unsigned shmem_allocation::is_ld_padded(ir::value *x) { } unsigned shmem_allocation::get_num_bytes(ir::value *x) { - if(dynamic_cast(x)) - return 32; - unsigned result = x->get_type()->get_primitive_size_in_bits() / 8; + unsigned num_bytes = x->get_type()->get_primitive_size_in_bits() / 8; + if(dynamic_cast(x)){ + size_t shape = 1; + if(x->get_type()->is_tile_ty()){ + auto shapes = x->get_type()->get_tile_shapes(); + for(auto x: shapes) + shape *= x->get_value(); + } + size_t n_warps = params_->get_num_threads() / 32; + return shape * num_bytes * n_warps; + } unsigned pad = is_ld_padded(x); if(pad > 0){ unsigned ld = x->get_type()->get_tile_shapes()[0]->get_value(); - result += pad * result / ld; + num_bytes += pad * num_bytes / ld; } if(buffer_info_->is_double(x)) - result *= 2; - return result; + num_bytes *= 2; + return num_bytes; } void shmem_allocation::run(){ diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 9cdf2767d..1a7ec94e5 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -227,7 +227,7 @@ void tune::run(ir::module &mod) { node_t node = *nodes_.begin(); if(fragments_[node] == STRIDED_SCAN) { ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 1, 1); - ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 4, 32); + ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 2, 64); connected_components(node, {nts, mts}, {"nts", "mts"}, nodes_, dependencies_, group_id++); nts->set_value(1); } @@ -381,7 +381,7 @@ bool tune::check_constraints(std::map> &er errors[i].push_back("HMMA must have only 4 fragments per warp"); } int num_threads = get_req_num_threads(i); - if(num_threads % 32 != 0) + if(num_threads % 64 != 0) errors[i].push_back("number of threads per block (" + to_string(num_threads) + ") must be multiple of warp size"); if(num_threads != num_threads_) errors[i].push_back("Number of threads must be the same for all tiles (" + to_string(num_threads_) + ")"); diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index 1ce0a77e6..1f6a60ccd 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -219,9 +219,9 @@ jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t ben best.perf = perf; best.params = params; } -// for(size_t i = 0; i < params.size(); i++) -// std::cout << ((i==0)?"":", ") << params[i] << std::flush; -// std::cout << ", " << perf << " [ " << best.perf << " ] " << std::endl; + for(size_t i = 0; i < params.size(); i++) + std::cout << ((i==0)?"":", ") << params[i] << std::flush; + std::cout << ", " << perf << " [ " << best.perf << " ] " << std::endl; } }; From d869d9a924840bac6b49a541e796e4f32e9b06a1 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 4 Aug 2019 16:34:36 -0700 Subject: [PATCH 276/494] [codegen][selection] more flexible instruction selection for reduce_inst --- include/triton/codegen/selection.h | 1 + include/triton/ir/builder.h | 2 +- include/triton/ir/instructions.h | 11 ++- include/triton/lang/expression.h | 13 ++- include/triton/lang/parser.y | 5 +- include/triton/lang/scanner.l | 25 +++--- lib/codegen/selection.cpp | 133 +++++++++++++---------------- lib/codegen/shmem_allocation.cpp | 19 +++-- lib/codegen/tune.cpp | 19 ++++- 
lib/dnn/batchnorm.cpp | 12 +-- lib/ir/builder.cpp | 4 +- lib/ir/instructions.cpp | 19 ++++- lib/lang/expression.cpp | 17 +++- lib/runtime/jit.cpp | 6 +- 14 files changed, 167 insertions(+), 119 deletions(-) diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index 3f7e5686c..b480da5f0 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -32,6 +32,7 @@ typedef std::vector indices_t; struct distributed_axis { size_t contiguous; std::vector values; + llvm::Value* thread_id; }; class tile { diff --git a/include/triton/ir/builder.h b/include/triton/ir/builder.h index 1921814c9..079a79e40 100644 --- a/include/triton/ir/builder.h +++ b/include/triton/ir/builder.h @@ -134,7 +134,7 @@ public: value *create_dot(value *A, value *B, value *C, const std::string &name = ""); value *create_trans(value *A, const std::string &name = ""); value *create_sqrt(value *A, const std::string &name = ""); - value *create_reduce(value *A, const std::string &name = ""); + value *create_reduce(value *A, unsigned axis, const std::string &name = ""); value *create_select(value *pred, value *if_value, value *else_value, const std::string &name = ""); // Intrinsics value *create_copy_to_shared(value *arg, const std::string &name = ""); diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index 37692d617..3cc86da26 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -605,11 +605,18 @@ public: class reduce_inst: public builtin_inst { private: - reduce_inst(value* arg, const std::string& name, instruction* next); + static type* get_type(value *arg, unsigned axis); + +private: + reduce_inst(value* arg, unsigned axis, const std::string& name, instruction* next); std::string repr_impl() const { return "reduce"; } public: - static instruction* create(value *arg, const std::string &name = "", instruction *next = nullptr); + static instruction* create(value *arg, unsigned axis, const std::string &name = "", instruction *next = nullptr); + unsigned get_axis() const { return axis_; } + +private: + unsigned axis_; }; class select_inst: public builtin_inst { diff --git a/include/triton/lang/expression.h b/include/triton/lang/expression.h index 13894d18a..f0dac3bc9 100644 --- a/include/triton/lang/expression.h +++ b/include/triton/lang/expression.h @@ -134,6 +134,16 @@ private: const expression *C_; }; +class reshape_expression: public builtin_expression{ +public: + reshape_expression(node *arg, node *shapes): arg_(arg), shapes_((list*)shapes) { } + ir::value* codegen(ir::module *) const; + +private: + const node *arg_; + const list* shapes_; +}; + class max_expression: public builtin_expression{ public: max_expression(node* x, node* y) @@ -188,11 +198,12 @@ private: class reduce_expression: public builtin_expression{ public: - reduce_expression(node *arg): arg_(arg) {} + reduce_expression(node *arg, node *axis): arg_(arg), axis_((constant*)axis) {} ir::value* codegen(ir::module *mod) const; private: node* arg_; + constant* axis_; }; class indexing_expression: public postfix_expression{ diff --git a/include/triton/lang/parser.y b/include/triton/lang/parser.y index cd2c8941b..1a4b26633 100644 --- a/include/triton/lang/parser.y +++ b/include/triton/lang/parser.y @@ -55,7 +55,7 @@ STORAGE_SPEC_T get_storage_spec(node *op) { return ((token*)op)->storage_spec;} %token VOID UINT1 UINT8 UINT16 UINT32 UINT64 INT1 INT8 INT16 INT32 INT64 FP16 FP32 FP64 %token IF ELSE FOR CONTINUE WHILE %token NEWAXIS ELLIPSIS AT 
-%token GET_NUM_PROGRAM GET_RANGE_ID DOT SQRT REDUCE_SUM TRANS MAX MIN SELECT ATOMIC_CAS ATOMIC_EXCH ATOMIC_ADD ALLOC_CONST +%token GET_NUM_PROGRAM GET_RANGE_ID DOT SQRT REDUCE_SUM TRANS MAX MIN SELECT ATOMIC_CAS ATOMIC_EXCH ATOMIC_ADD ALLOC_CONST RESHAPE %start translation_unit %% @@ -126,13 +126,14 @@ builtin_expression | SQRT '(' expression ')' { $$ = new sqrt_expression($3); } | ALLOC_CONST type_specifier '[' constant ']' { $$ = new alloc_const_expression(new typed_declaration_specifier(get_type_spec($2)), $4); } | TRANS '(' expression ')' { $$ = new trans_expression($3); } - | REDUCE_SUM '(' expression ')' { $$ = new reduce_expression($3);} + | REDUCE_SUM '(' expression ',' constant ')' { $$ = new reduce_expression($3, $5);} | MAX '(' expression ',' expression ')' { $$ = new max_expression($3, $5); } | MIN '(' expression ',' expression ')' { $$ = new min_expression($3, $5); } | SELECT '(' expression ',' expression ',' expression ')' { $$ = new select_expression($3, $5, $7); } | ATOMIC_CAS '(' expression ',' expression ',' expression ')' { $$ = new atomic_cas_expression($3, $5, $7); } | ATOMIC_EXCH '(' expression ',' expression ')' { $$ = new atomic_exch_expression($3, $5); } | ATOMIC_ADD '(' expression ',' expression ')' { $$ = new atomic_add_expression($3, $5); } + | RESHAPE '(' expression ',' primary_expression_list ')' { $$ = new reshape_expression($3, $5); } ; /* Primary */ diff --git a/include/triton/lang/scanner.l b/include/triton/lang/scanner.l index fc791ae94..1aaf40a57 100644 --- a/include/triton/lang/scanner.l +++ b/include/triton/lang/scanner.l @@ -30,18 +30,18 @@ using triton::lang::return_void; "for" { return return_impl(FOR, yytext); } "while" { return return_impl(WHILE, yytext); } "void" { return return_impl(VOID, yytext); } -"uchar" { return return_impl(UCHAR, yytext); } -"ushort" { return return_impl(USHORT, yytext); } -"uint" { return return_impl(UINT, yytext); } -"ulong" { return return_impl(ULONG, yytext); } -"bool" { return return_impl(BOOL, yytext); } -"char" { return return_impl(CHAR, yytext); } -"short" { return return_impl(SHORT, yytext); } -"int" { return return_impl(INT, yytext); } -"long" { return return_impl(LONG, yytext); } -"half" { return return_impl(HALF, yytext); } -"float" { return return_impl(FLOAT, yytext); } -"double" { return return_impl(DOUBLE, yytext); } +"uchar" { return return_impl(UINT8, yytext); } +"ushort" { return return_impl(UINT16, yytext); } +"uint" { return return_impl(UINT32, yytext); } +"ulong" { return return_impl(UINT64, yytext); } +"bool" { return return_impl(INT1, yytext); } +"char" { return return_impl(INT8, yytext); } +"short" { return return_impl(INT16, yytext); } +"int" { return return_impl(INT32, yytext); } +"long" { return return_impl(INT64, yytext); } +"half" { return return_impl(FP16, yytext); } +"float" { return return_impl(FP32, yytext); } +"double" { return return_impl(FP64, yytext); } "..." 
{ return return_impl(ELLIPSIS, yytext); } "get_range_id" { return return_impl(GET_RANGE_ID, yytext); } "get_num_program" { return return_impl(GET_NUM_PROGRAM, yytext); } @@ -49,6 +49,7 @@ using triton::lang::return_void; "__atomic_exch" { return return_impl(ATOMIC_EXCH, yytext); } "__atomic_add" { return return_impl(ATOMIC_ADD, yytext); } "__sum" { return return_impl(REDUCE_SUM, yytext); } +"__reshape" { return return_impl(RESHAPE, yytext); } "sqrt" { return return_impl(SQRT, yytext); } "dot" { return return_impl(DOT, yytext); } "max" { return return_impl(MAX, yytext); } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 7e981672a..dc8980a28 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -80,9 +80,10 @@ indices_t distributed_tile::get_ordered_indices(unsigned id) { void distributed_tile::for_each(std::function fn) { - for(unsigned i = 0; i < ordered_indices_.size(); i++) + for(unsigned i = 0; i < ordered_indices_.size(); i++){ if(i % vector_size_ == 0) fn(ordered_indices_[i]); + } } /* Shared Tile */ @@ -498,15 +499,15 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id Value *warp_size_k = builder.getInt32(warp_size[k]); Value *contiguous_k = builder.getInt32(contiguous[k]); Value *thread_id = builder.CreateAdd(thread_id_in_warp[k], builder.CreateMul(warp_id[k], warp_size_k)); - thread_id = builder.CreateMul(thread_id, contiguous_k); + Value *scaled_thread_id = builder.CreateMul(thread_id, contiguous_k); unsigned per_block = contiguous[k] * warp_size[k] * n_warps[k]; unsigned per_thread = contiguous[k] * shapes[k]->get_value() / per_block; std::vector idx_list(per_thread); for(unsigned n = 0 ; n < per_thread; n++){ unsigned offset = n / contiguous[k] * per_block + n % contiguous[k]; - idx_list[n] = builder.CreateAdd(thread_id, builder.getInt32(offset), "idx_" + str_k + "_" + std::to_string(n)); + idx_list[n] = builder.CreateAdd(scaled_thread_id, builder.getInt32(offset), "idx_" + str_k + "_" + std::to_string(n)); } - axes_[params_->get_param_group(v, k)] = distributed_axis{contiguous[k], idx_list}; + axes_[params_->get_param_group(v, k)] = distributed_axis{contiguous[k], idx_list, thread_id}; } } else { @@ -671,7 +672,7 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, shapes[0] += pad; Type* ty = llvm_type(v->get_type()->get_scalar_ty(), ctx); // create shared tile - if(buffer_info_->is_shared(v)){ + if(buffer_info_->is_shared(v) && !dynamic_cast(v)){ // shared copy PointerType *ptr_ty = ty->getPointerTo(sh_mem_ptr->getType()->getPointerAddressSpace()); // phi-node (double-buffering) @@ -825,88 +826,72 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & } if(auto *x = dynamic_cast(ins)){ std::map partial; - distributed_tile* op = (distributed_tile*)tmap_.at(ins->get_operand(0)); - size_t axis = 0; - unsigned num_warps = params_->get_num_threads() / 32; - std::vector shapes = op->get_shapes(); - shapes.erase(shapes.begin() + axis); - if(shapes.empty()) - shapes.push_back(1); + ir::value *op = ins->get_operand(0); + distributed_tile* op_tile = (distributed_tile*)tmap_.at(op); + unsigned axis = x->get_axis(); // reduce within thread - op->for_each([&](indices_t idx){ + op_tile->for_each([&](indices_t idx) { indices_t pidx = idx; pidx.erase(pidx.begin() + axis); - if(pidx.empty()) - pidx.push_back(builder.getInt32(0)); - Value *current = op->get_value(idx); + Value *current = op_tile->get_value(idx); + // current partial result is not initialized -- create 
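The rewrite in this hunk keys the per-thread partials by the surviving, non-reduced indices instead of folding everything into a single scalar, which is what makes arbitrary-axis reduction possible. A self-contained model of that step (types simplified; the real code keys on LLVM values):

#include <map>
#include <vector>

using indices = std::vector<int>;

// Fold every element that shares the same non-reduced coordinates into one
// partial sum, mirroring the pidx/partial logic of the lowering pass.
std::map<indices, float> thread_partials(const std::map<indices, float>& elems,
                                         unsigned axis) {
  std::map<indices, float> partial;
  for (const auto& [idx, val] : elems) {
    indices pidx = idx;
    pidx.erase(pidx.begin() + axis);
    auto [it, inserted] = partial.emplace(pidx, val);
    if (!inserted)
      it->second += val; // already initialized: accumulate
  }
  return partial;
}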
if(partial.find(pidx) == partial.end()) partial[pidx] = current; + // current partial result is initialized -- accumulate else partial[pidx] = builder.CreateFAdd(partial[pidx], current); }); - // reduce within warp - Value *shfl = Intrinsic::getDeclaration(builder.GetInsertBlock()->getModule(), Intrinsic::nvvm_shfl_sync_bfly_f32); - for (int i = 16; i > 0; i >>= 1) - for(auto& x: partial) - { - Value *rhs = builder.CreateCall(shfl, {builder.getInt32(0xffffffff), x.second, - builder.getInt32(i), - builder.getInt32(0x1f)}); - x.second = builder.CreateFAdd(x.second, rhs); - } - - // reduce within block - Value *tid = tgt_->get_local_id(module, builder, 0); - BasicBlock *partial_reduce_do = BasicBlock::Create(ctx, "partial_reduce_do", fn); - BasicBlock *partial_reduce_done = BasicBlock::Create(ctx, "partial_reduce_done", fn); - Value *id_in_warp = builder.CreateURem(tid, builder.getInt32(32)); - Value *warp_id = builder.CreateUDiv(tid, builder.getInt32(32)); - builder.CreateCondBr(builder.CreateICmpEQ(id_in_warp, builder.getInt32(0)), - partial_reduce_do, partial_reduce_done); - builder.SetInsertPoint(partial_reduce_do); + // reduce within blocks unsigned addr_space = sh_mem_ptr_->getType()->getPointerAddressSpace(); - Type *ptr_ty = PointerType::get(builder.getFloatTy(), addr_space); - Value *sh_mem_ptr = builder.CreateBitCast(sh_mem_ptr_, ptr_ty); - for(auto& x: partial){ - Value *offset = shared_tile::shared_offset(builder, shapes, x.first); - offset = builder.CreateAdd(offset, builder.CreateMul(warp_id, builder.getInt32(shapes[0]))); - Value *write_ptr = builder.CreateGEP(sh_mem_ptr, offset); - builder.CreateStore(x.second, write_ptr); - } - builder.CreateBr(partial_reduce_done); - builder.SetInsertPoint(partial_reduce_done); + Type *res_ty = builder.getFloatTy(); + Value *base_ptr = builder.CreateBitCast(sh_mem_ptr_, PointerType::get(res_ty, addr_space)); + unsigned depth = params_->get_param(op, "mts.d" + std::to_string(axis))->get_value(); + for(auto& x: partial) { + // current element being computed + Value *lane = axes_.at(params_->get_param_group(op, axis)).thread_id; + Value *&result = x.second; + indices_t write_idx = x.first; + write_idx.insert(write_idx.begin() + axis, lane); + // shared memory write pointer + Value *write_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), write_idx); + Value *write_ptr = builder.CreateGEP(base_ptr, write_offset); + // initialize shared memory + builder.CreateStore(result, write_ptr); + // build result + for(unsigned i = depth/2; i > 0; i >>= 1){ + // current indices + indices_t current(write_idx.size(), builder.getInt32(0)); + current[axis] = builder.getInt32(i); + // shared memory offset + Value *read_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), current); + Value *is_active = builder.CreateICmpULT(lane, builder.getInt32(i)); + read_offset = builder.CreateSelect(is_active, read_offset, builder.getInt32(0)); + // shared memory read pointer + Value *read_ptr = builder.CreateGEP(write_ptr, read_offset); + tgt_->add_barrier(module, builder); + Value *next = builder.CreateLoad(read_ptr); + // accumulate + result = builder.CreateFAdd(result, next); + // write back + builder.CreateStore(result, write_ptr); + } - // Final reduction with the first warp - tgt_->add_barrier(module, builder); - BasicBlock *final_reduce_do = BasicBlock::Create(ctx, "final_reduce_do", fn); - BasicBlock *final_reduce_done = BasicBlock::Create(ctx, "final_reduce_done", fn); - builder.CreateCondBr(builder.CreateICmpEQ(warp_id, 
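In place of the old two-level warp shuffle, partials now go through shared memory and are combined in log2(depth) halving steps with a barrier between them. A C++ model of that tree reduction for one output element (illustration only; depth is the mts parameter along the reduced axis and is assumed to be a power of two):

#include <vector>

float tree_reduce(std::vector<float> smem) { // smem.size() == depth
  for (unsigned i = smem.size() / 2; i > 0; i >>= 1) {
    // a barrier separates the steps on the device
    for (unsigned lane = 0; lane < i; ++lane)
      smem[lane] += smem[lane + i]; // active lanes add the partner i slots away
  }
  return smem[0]; // lane 0 ends up with the sum
}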
builder.getInt32(0)), - final_reduce_do, final_reduce_done); - builder.SetInsertPoint(final_reduce_do); - Value *read_ptr = builder.CreateGEP(sh_mem_ptr, tid); - BasicBlock *read_shmem_do = BasicBlock::Create(ctx, "read_shmem_do", fn); - BasicBlock *read_shmem_done = BasicBlock::Create(ctx, "read_shmem_done", fn); - builder.CreateCondBr(builder.CreateICmpULT(id_in_warp, builder.getInt32(num_warps)), - read_shmem_do, read_shmem_done); - builder.SetInsertPoint(read_shmem_do); - Value *loaded= builder.CreateLoad(read_ptr); - builder.CreateBr(read_shmem_done); - builder.SetInsertPoint(read_shmem_done); - Value *result = builder.CreatePHI(loaded->getType(), 2); - ((PHINode*)result)->addIncoming(ConstantFP::get(loaded->getType(), (double)0), final_reduce_do); - ((PHINode*)result)->addIncoming(loaded, read_shmem_do); - for (int i = params_->get_num_threads() / 64; i > 0; i >>= 1){ - Value *rhs = builder.CreateCall(shfl, {builder.getInt32(0xffffffff), result, - builder.getInt32(i), builder.getInt32(0x1f)}); - result = builder.CreateFAdd(result, rhs); + // result is on the first lane of shared memory + indices_t final = write_idx; + final[axis] = builder.getInt32(0); + Value *read_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), final); + Value *read_ptr = builder.CreateGEP(base_ptr, read_offset); + tgt_->add_barrier(module, builder); + result = builder.CreateLoad(read_ptr); + if(tmap_.find(ins) == tmap_.end()) + vmap_[ins] = result; + else{ + distributed_tile *ti = (distributed_tile*)tmap_[ins]; + ti->set_value(x.first, result); + } } - builder.CreateStore(result, read_ptr); - builder.CreateBr(final_reduce_done); - builder.SetInsertPoint(final_reduce_done); - tgt_->add_barrier(module, builder); - vmap_[ins] = builder.CreateLoad(sh_mem_ptr); return; } tile *ti = tmap_[ins]; diff --git a/lib/codegen/shmem_allocation.cpp b/lib/codegen/shmem_allocation.cpp index b4a903c1a..641170215 100644 --- a/lib/codegen/shmem_allocation.cpp +++ b/lib/codegen/shmem_allocation.cpp @@ -43,15 +43,16 @@ unsigned shmem_allocation::is_ld_padded(ir::value *x) { unsigned shmem_allocation::get_num_bytes(ir::value *x) { unsigned num_bytes = x->get_type()->get_primitive_size_in_bits() / 8; - if(dynamic_cast(x)){ - size_t shape = 1; - if(x->get_type()->is_tile_ty()){ - auto shapes = x->get_type()->get_tile_shapes(); - for(auto x: shapes) - shape *= x->get_value(); - } - size_t n_warps = params_->get_num_threads() / 32; - return shape * num_bytes * n_warps; + if(auto *red = dynamic_cast(x)){ + size_t axis = red->get_axis(); + ir::value *op = red->get_operand(0); + auto shapes = op->get_type()->get_tile_shapes(); + shapes.erase(shapes.begin() + axis); + size_t num_elements = 1; + for(auto x: shapes) + num_elements *= x->get_value(); + size_t depth = params_->get_param(op, "mts.d" + std::to_string(axis))->get_value(); + return num_elements * num_bytes * depth; } unsigned pad = is_ld_padded(x); if(pad > 0){ diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 1a7ec94e5..db3ed1c81 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -58,8 +58,19 @@ void tune::init_c_graph(ir::instruction *v) { shapes = atom->get_operand(0)->get_type()->get_tile_shapes(); else if(auto *downcast = dynamic_cast(v)) return; - else if(auto *reduce = dynamic_cast(v)) + else if(auto *reduce = dynamic_cast(v)) { + unsigned axis = reduce->get_axis(); + ir::value *arg = reduce->get_operand(0); + auto in_shapes = arg->get_type()->get_tile_shapes(); + unsigned current = 0; + for(unsigned i = 0; i < in_shapes.size(); 
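The shmem_allocation change above sizes the reduction scratch space to match: one slot per surviving element for each of the depth lanes cooperating along the reduced axis. The same arithmetic as a standalone sketch (reduce_scratch_bytes is a name invented here):

#include <cstddef>
#include <vector>

std::size_t reduce_scratch_bytes(std::vector<std::size_t> shapes, std::size_t axis,
                                 std::size_t elt_bytes, std::size_t depth) {
  shapes.erase(shapes.begin() + axis); // the reduced axis needs no output slots
  std::size_t num_elements = 1;
  for (std::size_t s : shapes)
    num_elements *= s;
  return num_elements * elt_bytes * depth;
}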
i++){ + if(i == axis) + continue; +// std::cout << arg->get_name() << " " << v->get_name() << std::endl; + add_constraint({reduce, current++}, {arg, i}); + } return; + } else shapes = v->get_type()->get_tile_shapes(); // Reshape @@ -74,8 +85,10 @@ void tune::init_c_graph(ir::instruction *v) { static_params_.insert({{v, i}, 1}); else if(!is_skewed && is_same) add_constraint({v, i}, {op, current++}); - else + else{ is_skewed = true; + add_constraint({v, i}, {v, i}); + } } } // Splat @@ -137,6 +150,7 @@ tune::fragment_t tune::get_fragmentation_type(node_t x, graph_t &graph){ } void tune::connected_components(node_t x, const std::vector mps, const std::vector prefixes, std::set &nodes, graph_t &graph, unsigned group_id) { +// std::cout << "connected component: " << x.first->get_name() << " " << x.second << std::endl; groups_[x.first].insert({x.second, group_id}); if(nodes.find(x) != nodes.end()){ nodes.erase(x); @@ -190,6 +204,7 @@ std::map tune::get_params(ir::instruction* i) } unsigned tune::get_param_group(ir::value *value, unsigned ax) { +// std::cout << "group? " << value->get_name() << " " << ax << std::endl; unsigned result = groups_.at(value).at(ax); return result; } diff --git a/lib/dnn/batchnorm.cpp b/lib/dnn/batchnorm.cpp index dcc9d6a4e..e5143755e 100644 --- a/lib/dnn/batchnorm.cpp +++ b/lib/dnn/batchnorm.cpp @@ -71,7 +71,7 @@ void batchnorm_forward::enqueue_impl(driver::stream *stream, driver::kernel *ker void batchnorm_forward::triton_c_src(std::ostream &os) const { os << R"( -const tunable int TM = {32, 64, 128}; +const tunable int TM = {128}; void batchnorm_forward(float *Y, float *M, float *V, restrict read_only float *X, @@ -94,7 +94,7 @@ void batchnorm_forward(float *Y, float *M, float *V, px = px + TM; } float *pm = M + c; - float m = __sum(mean) * rcpDHWN; + float m = __sum(mean, 0) * rcpDHWN; *pm = m; float var[TM] = 0; @@ -105,7 +105,7 @@ void batchnorm_forward(float *Y, float *M, float *V, var = var + x*x; px = px + TM; } - float v = __sum(var) * rcpDHWN; + float v = __sum(var, 0) * rcpDHWN; float *pv = V + c; *pv = v; float rstdg = 1 / sqrt(v + eps) * g; @@ -167,7 +167,7 @@ void batchnorm_backward::enqueue_impl(driver::stream *stream, driver::kernel *ke void batchnorm_backward::triton_c_src(std::ostream &os) const { os << R"( -const tunable int TM = {32, 64, 128}; +const tunable int TM = {128}; void batchnorm_backward(float *DX, float *DG, float *DB, restrict read_only float *DY, @@ -199,8 +199,8 @@ void batchnorm_backward(float *DX, float *DG, float *DB, px = px + TM; pdy = pdy + TM; } - float sdg = __sum(dg); - float sdb = __sum(db); + float sdg = __sum(dg, 0); + float sdb = __sum(db, 0); float *pdg = DG + c; float *pdb = DB + c; *pdg = sdg; diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index 77c099827..a76c5e593 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -322,8 +322,8 @@ value *builder::create_sqrt(value *A, const std::string &name) { return insert(sqrt_inst::create(A, name)); } -value *builder::create_reduce(value *A, const std::string &name) { - return insert(reduce_inst::create(A, name)); +value *builder::create_reduce(value *A, unsigned axis, const std::string &name) { + return insert(reduce_inst::create(A, axis, name)); } value *builder::create_select(value *pred, value *if_value, value *else_value, const std::string &name){ diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index a29c11914..e6e85ff85 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -597,13 +597,24 @@ instruction* sqrt_inst::create(value 
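For the tuner, the add_constraint loop above couples each output dimension of a reduction with the corresponding input dimension once the reduced axis is skipped. A sketch of the resulting dimension map (illustration only; reduce_dim_map is a hypothetical helper):

#include <utility>
#include <vector>

// A rank-3 input reduced along axis 1 yields {(0,0), (1,2)}: output dim 0
// shares a layout group with input dim 0, output dim 1 with input dim 2.
std::vector<std::pair<unsigned, unsigned>> reduce_dim_map(unsigned rank, unsigned axis) {
  std::vector<std::pair<unsigned, unsigned>> out_to_in;
  unsigned current = 0;
  for (unsigned i = 0; i < rank; ++i) {
    if (i == axis)
      continue;
    out_to_in.emplace_back(current++, i);
  }
  return out_to_in;
}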
*arg, const std::string &name, instruction //===----------------------------------------------------------------------===// // reduce instructions //===----------------------------------------------------------------------===// -reduce_inst::reduce_inst(value *arg, const std::string &name, instruction *next) - : builtin_inst(arg->get_type()->get_scalar_ty(), 1, 1, name, next) { +type* reduce_inst::get_type(value *arg, unsigned axis) { + ir::tile_type::tile_shapes_t shapes = arg->get_type()->get_tile_shapes(); + shapes.erase(shapes.begin() + axis); + type *scalar_ty = arg->get_type()->get_scalar_ty(); + if(shapes.size() == 0) + return scalar_ty; + else + return tile_type::get(scalar_ty, shapes); +} + +reduce_inst::reduce_inst(value *arg, unsigned axis, const std::string &name, instruction *next) + : builtin_inst(get_type(arg, axis), 1, 1, name, next), + axis_(axis){ set_operand(0, arg); } -instruction* reduce_inst::create(value *arg, const std::string &name, instruction *next) { - return new reduce_inst(arg, name, next); +instruction* reduce_inst::create(value *arg, unsigned axis, const std::string &name, instruction *next) { + return new reduce_inst(arg, axis, name, next); } diff --git a/lib/lang/expression.cpp b/lib/lang/expression.cpp index 15e66607a..470f0b3cd 100644 --- a/lib/lang/expression.cpp +++ b/lib/lang/expression.cpp @@ -161,6 +161,21 @@ ir::value* matmul_expression::codegen(ir::module *mod) const { return mod->get_builder().create_dot(A, B, C); } +// reshape +ir::value* reshape_expression::codegen(ir::module *mod) const { + // arg + ir::value *arg = arg_->codegen(mod); + // shapes + ir::type::tile_shapes_t shapes; + for(expression *expr: shapes_->values()){ + ir::constant_int *shape = dynamic_cast(expr->codegen(mod)); + assert(shape); + shapes.push_back(shape); + } + // return + return mod->get_builder().create_reshape(arg, shapes); +} + // min ir::value* min_expression::codegen(ir::module *mod) const { ir::value* cmp = binary_expression(LT, (node*)x_, (node*)y_).codegen(mod); @@ -198,7 +213,7 @@ ir::value* sqrt_expression::codegen(ir::module *mod) const { // reduce ir::value* reduce_expression::codegen(ir::module *mod) const { - return mod->get_builder().create_reduce(arg_->codegen(mod)); + return mod->get_builder().create_reduce(arg_->codegen(mod), axis_->value()); } /* Postfix expression */ diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index 1f6a60ccd..86102a460 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -37,13 +37,13 @@ void parallel_loop_nest(std::vector const & ranges, size_t D = ranges.size(); std::vector values(D, 0); // thread pools - ThreadPool pool(nthreads); +// ThreadPool pool(nthreads); // Start with innermost loop size_t i = D - 1; while(true){ // Execute function - pool.enqueue(f,values); -// f(values); +// pool.enqueue(f,values); + f(values); while(values[i]++ == ranges[i] - 1){ if(i == 0) return; From 899b2b72e176c618200dc227d48ff7e70afbd9e2 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 5 Aug 2019 13:06:56 -0700 Subject: [PATCH 277/494] simple constexpr --- include/triton/ir/constant.h | 29 +++++++++++++++++++++++-- include/triton/ir/context_impl.h | 4 ++++ include/triton/ir/instructions.h | 1 - include/triton/lang/parser.y | 15 +++++++------ include/triton/runtime/jit.h | 4 ++-- lib/ir/builder.cpp | 20 +++++++++++++----- lib/ir/constant.cpp | 36 ++++++++++++++++++++++++++++++++ lib/lang/declaration.cpp | 3 ++- lib/lang/expression.cpp | 5 +++-- 9 files changed, 96 insertions(+), 21 deletions(-) diff --git 
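The jit.cpp hunk above leaves parallel_loop_nest running serially; the traversal itself is an odometer walk over an N-dimensional range, with the innermost index spinning fastest. The same walk as a standalone function (simplified from the code above, thread pool removed):

#include <cstddef>
#include <functional>
#include <vector>

void loop_nest(const std::vector<std::size_t>& ranges,
               const std::function<void(const std::vector<std::size_t>&)>& f) {
  std::size_t D = ranges.size();
  std::vector<std::size_t> values(D, 0);
  std::size_t i = D - 1;
  while (true) {
    f(values); // visit the current point
    while (values[i]++ == ranges[i] - 1) { // carry into the next dimension
      if (i == 0)
        return;
      values[i--] = 0;
    }
    i = D - 1; // resume spinning the innermost index
  }
}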
a/include/triton/ir/constant.h b/include/triton/ir/constant.h index 49f11a1aa..ca44c6227 100644 --- a/include/triton/ir/constant.h +++ b/include/triton/ir/constant.h @@ -3,6 +3,7 @@ #include "value.h" #include +#include "llvm/IR/Instructions.h" namespace triton{ namespace ir{ @@ -36,14 +37,14 @@ protected: constant_int(type *ty, uint64_t value); public: - uint64_t get_value() const { return value_; } + virtual uint64_t get_value() const { return value_; } static constant_int *get(type *ty, uint64_t value); protected: uint64_t value_; }; -/* Metaparameter int */ +/* Metaparameter (int) */ class metaparameter: public constant_int { private: metaparameter(type *ty, const std::vector& space); @@ -55,12 +56,36 @@ public: bool has_value() { return has_value_; } const std::vector& get_space() { return space_; } void set_space(const std::vector &space) { space_ = space; } + uint64_t get_value() const { assert(has_value_); return value_; } private: std::vector space_; bool has_value_; }; +class constant_expression: public constant_int { + typedef llvm::BinaryOperator::BinaryOps op_t; + using llop = llvm::BinaryOperator::BinaryOps; + +private: + constant_expression(op_t op, constant_int* lhs, constant_int* rhs); + +public: + uint64_t get_value() const; + // Wraps + void set_has_no_unsigned_wrap(bool b = true) { has_no_unsigned_wrap_ = b; } + void set_has_no_signed_wrap(bool b = true) { has_no_signed_wrap_ = b; } + // Factory + static constant_expression *create(op_t op, constant_int* lhs, constant_int* rhs); + +private: + op_t op_; + constant_int* lhs_; + constant_int* rhs_; + bool has_no_unsigned_wrap_; + bool has_no_signed_wrap_; +}; + /* constant range */ class constant_range: public constant{ constant_range(type *ty, constant_int* first, constant_int* last); diff --git a/include/triton/ir/context_impl.h b/include/triton/ir/context_impl.h index 54e109862..290d20cc7 100644 --- a/include/triton/ir/context_impl.h +++ b/include/triton/ir/context_impl.h @@ -9,6 +9,8 @@ namespace triton{ namespace ir{ class context; +class constant; +class constant_expression; class constant_int; class constant_fp; class undef_value; @@ -36,6 +38,8 @@ public: std::map uv_constants_; // Metaparameters std::vector mp_constants_; + // Expr constants + std::map, constant_expression*> expr_constants_; }; } diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index 3cc86da26..9886a8a0e 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -93,7 +93,6 @@ private: //===----------------------------------------------------------------------===// // binary_operator classes //===----------------------------------------------------------------------===// - class binary_operator: public instruction{ public: typedef llvm::BinaryOperator::BinaryOps op_t; diff --git a/include/triton/lang/parser.y b/include/triton/lang/parser.y index 1a4b26633..0df37673b 100644 --- a/include/triton/lang/parser.y +++ b/include/triton/lang/parser.y @@ -93,7 +93,7 @@ abstract_declarator ; direct_abstract_declarator - : '[' primary_expression_list ']' { $$ = new tile(nullptr, $1); } + : '[' constant_expression_list ']' { $$ = new tile(nullptr, $2); } type_name : declaration_specifiers { $$ = new type_name($1, nullptr); } @@ -133,7 +133,7 @@ builtin_expression | ATOMIC_CAS '(' expression ',' expression ',' expression ')' { $$ = new atomic_cas_expression($3, $5, $7); } | ATOMIC_EXCH '(' expression ',' expression ')' { $$ = new atomic_exch_expression($3, $5); } | ATOMIC_ADD '(' expression ',' 
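constant_expression makes get_value() virtual and re-derives its result from its operands on every call, so a metaparameter can be re-assigned during autotuning and every constant expression built on top of it stays consistent. A minimal model of that design (class names invented for the sketch):

#include <cstdint>

struct cexpr {
  virtual ~cexpr() = default;
  virtual uint64_t get_value() const = 0; // recomputed on demand, never cached
};

struct cint : cexpr {
  uint64_t v;
  explicit cint(uint64_t v) : v(v) {}
  uint64_t get_value() const override { return v; }
};

struct cadd : cexpr {
  const cexpr *lhs, *rhs;
  cadd(const cexpr* l, const cexpr* r) : lhs(l), rhs(r) {}
  uint64_t get_value() const override { return lhs->get_value() + rhs->get_value(); }
};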
expression ')' { $$ = new atomic_add_expression($3, $5); } - | RESHAPE '(' expression ',' primary_expression_list ')' { $$ = new reshape_expression($3, $5); } + | RESHAPE '(' expression ',' constant_expression_list ')' { $$ = new reshape_expression($3, $5); } ; /* Primary */ @@ -146,11 +146,6 @@ primary_expression | '(' expression ')' { $$ = $2; } ; -primary_expression_list - : primary_expression { $$ = new list((expression*)$1); } - | primary_expression_list ',' primary_expression { $$ = append_ptr_list($1, $3); } - ; - /* Postfix */ slice : ':' { $$ = new slice(triton::lang::ALL); } @@ -279,6 +274,10 @@ expression : assignment_expression { $$ = $1; } ; +constant_expression_list + : expression { $$ = new list((expression*)$1); } + | constant_expression_list ',' expression { $$ = append_ptr_list($1, $3); } + /* Initialization */ initialization_expression : assignment_expression { $$ = $1; } @@ -338,7 +337,7 @@ jump_statement direct_declarator : identifier { $$ = $1; } - | identifier '[' primary_expression_list ']' { $$ = new tile($1, $3); } + | identifier '[' constant_expression_list ']' { $$ = new tile($1, $3); } | identifier '(' parameter_list ')' { $$ = new function($1, $3); } | identifier '(' ')' { $$ = new function($1, nullptr); } ; diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index 939aebbfe..16f56c0e5 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -72,7 +72,7 @@ public: void target_independent(ir::module &module) { optimize_dot.run(module); optimize_trans.run(module); -// optimize_dce.run(module); + optimize_dce.run(module); } void target_dependent(ir::module &module) { @@ -86,7 +86,7 @@ public: shmem_barriers.run(module); } vectorize.run(module); -// optimize_dce.run(module); + optimize_dce.run(module); // ir::print(module, std::cout); } diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index a76c5e593..b10488161 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -148,10 +148,20 @@ DEFINE_UNARY_FLOAT(fneg) value* builder::create_insert_nuwnswb_binop(binary_operator::op_t op, value *lhs, value *rhs, const std::string &name, bool has_nuw, bool has_nsw) { - binary_operator* result = insert(binary_operator::create(op, lhs, rhs), name); - if (has_nuw) result->set_has_no_unsigned_wrap(); - if (has_nsw) result->set_has_no_signed_wrap(); - return result; + if(auto *clhs = dynamic_cast(lhs)){ + if(auto *crhs = dynamic_cast(rhs)){ + constant_expression* result = constant_expression::create(op, clhs, crhs); + if (has_nuw) result->set_has_no_unsigned_wrap(); + if (has_nsw) result->set_has_no_signed_wrap(); + return result; + } + } + else { + binary_operator* result = insert(binary_operator::create(op, lhs, rhs), name); + if (has_nuw) result->set_has_no_unsigned_wrap(); + if (has_nsw) result->set_has_no_signed_wrap(); + return result; + } } #define DEFINE_NOWRAP_BINARY(SUFFIX, OPCODE)\ @@ -161,7 +171,7 @@ value* builder::create_insert_nuwnswb_binop(binary_operator::op_t op, value *lhs #define DEFINE_BINARY_INT(SUFFIX, OPCODE)\ value *builder::create_ ## SUFFIX(value *lhs, value *rhs, const std::string &name){\ - return insert(binary_operator::create(OPCODE, lhs, rhs), name);\ + return create_insert_nuwnswb_binop(OPCODE, lhs, rhs, name, false, false);\ } #define DEFINE_UNARY_INT(SUFFIX)\ diff --git a/lib/ir/constant.cpp b/lib/ir/constant.cpp index ddc10028d..d28da5efe 100644 --- a/lib/ir/constant.cpp +++ b/lib/ir/constant.cpp @@ -127,6 +127,42 @@ metaparameter* metaparameter::create(context &ctx, type *ty, const 
std::vector<unsigned>& space) { +constant_expression::constant_expression(op_t op, constant_int* lhs, constant_int* rhs) + : constant_int(lhs->get_type(), 0), + op_(op), lhs_(lhs), rhs_(rhs) { } + + +constant_expression *constant_expression::create(op_t op, constant_int* lhs, constant_int* rhs) { + context_impl *impl = lhs->get_type()->get_context().p_impl.get(); + constant_expression *& result = impl->expr_constants_[std::make_tuple((int)op, lhs, rhs)]; + if(!result) + result = new constant_expression(op, lhs, rhs); + return result; +} + +uint64_t constant_expression::get_value() const { + uint64_t lhs = lhs_->get_value(); + uint64_t rhs = rhs_->get_value(); + switch(op_) { + case llop::Add : return lhs + rhs; + case llop::Sub : return lhs - rhs; + case llop::Mul : return lhs * rhs; + case llop::UDiv : return lhs / rhs; + case llop::SDiv : return lhs / rhs; + case llop::URem : return lhs % rhs; + case llop::SRem : return lhs % rhs; + case llop::Shl : return lhs << rhs; + case llop::LShr : return lhs >> rhs; + case llop::AShr : return lhs >> rhs; + case llop::And : return lhs && rhs; + case llop::Or : return lhs || rhs; + case llop::Xor : return lhs ^ rhs; + default: throw std::runtime_error("unsupported constexpr binary operator"); + } +} + + // undef value undef_value::undef_value(type *ty) : constant(ty, 0) { } diff --git a/lib/lang/declaration.cpp b/lib/lang/declaration.cpp index dba439ce1..6e5c3204f 100644 --- a/lib/lang/declaration.cpp +++ b/lib/lang/declaration.cpp @@ -79,7 +79,8 @@ ir::type* tile::type_impl(ir::module *mod, ir::type *type, storage_spec_vec_cons ir::type::tile_shapes_t shapes; for(expression *expr: shapes_->values()){ ir::constant_int *shape = dynamic_cast<ir::constant_int*>(expr->codegen(mod)); - assert(shape); + if(shape == nullptr) + throw std::runtime_error("tile shapes must be constant expressions"); shapes.push_back(shape); } return ir::tile_type::get(type, shapes); diff --git a/lib/lang/expression.cpp b/lib/lang/expression.cpp index 470f0b3cd..355a2a369 100644 --- a/lib/lang/expression.cpp +++ b/lib/lang/expression.cpp @@ -101,6 +101,7 @@ ir::value *binary_expression::llvm_op(ir::module *mod, ir::builder &builder, ir: ir::value* binary_expression::codegen(ir::module *mod) const{ ir::value *lhs = lhs_->codegen(mod); ir::value *rhs = rhs_->codegen(mod); + std::cout << " " << typeid(*lhs_).name() << " " << typeid(*rhs_).name() << std::endl; ir::value *result = llvm_op(mod, mod->get_builder(), lhs, rhs, ""); return result; } @@ -169,7 +170,8 @@ ir::value* reshape_expression::codegen(ir::module *mod) const { ir::type::tile_shapes_t shapes; for(expression *expr: shapes_->values()){ ir::constant_int *shape = dynamic_cast<ir::constant_int*>(expr->codegen(mod)); - assert(shape); + if(shape == nullptr) + throw std::runtime_error("tile shapes must be constant expressions"); shapes.push_back(shape); } // return @@ -210,7 +212,6 @@ ir::value* sqrt_expression::codegen(ir::module *mod) const { return mod->get_builder().create_sqrt(arg_->codegen(mod)); } - // reduce ir::value* reduce_expression::codegen(ir::module *mod) const { return mod->get_builder().create_reduce(arg_->codegen(mod), axis_->value()); From d62e581ab3fe30b8b7b1e32b607ab0ab60387995 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 5 Aug 2019 19:33:28 -0700 Subject: [PATCH 278/494] basic split-k across warps working for GEMM --- examples/cpp/dot.cpp | 40 +++++++++++++++++++----------- examples/python/tensorflow/dot.cpp | 2 +- examples/python/tensorflow/run.py | 6 ++--- include/triton/runtime/jit.h | 2 +- lib/codegen/optimize_dot.cpp | 3 +-- lib/codegen/selection.cpp | 22 +++++++++------- lib/codegen/shmem_allocation.cpp | 3 ++- lib/codegen/tune.cpp | 37
+++++++++++++++++++-------- lib/dnn/base.cpp | 7 +++--- lib/dnn/dot.cpp | 25 +++++++++++-------- lib/ir/builder.cpp | 14 +++++------ lib/lang/expression.cpp | 1 - 12 files changed, 99 insertions(+), 63 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 591237fbe..d4f5adb6e 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -26,7 +26,7 @@ struct perf_t { perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K){ typedef float NumericT; - std::string ty = "half"; + std::string ty = "float"; size_t dt_nbytes = sizeof(NumericT); triton::driver::context* context = stream->context(); std::vector hc(M*N); @@ -48,28 +48,40 @@ perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int stream->synchronize(); triton::dnn::dot dot(M, N, K, AT, BT, ty, ty, 8, 8, 8); // benchmark triton - double triton_ns = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::PARTIAL_TUNING);}, stream); + double triton_ns = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::NO_TUNING);}, stream); // benchmark cublas - NumericT alpha = 1; - NumericT beta = 0; - int32_t lda = AT ? K : M; - int32_t ldb = BT ? N : K; - int32_t ldc = M; +// NumericT alpha = 1; +// NumericT beta = 0; +// int32_t lda = AT ? K : M; +// int32_t ldb = BT ? N : K; +// int32_t ldc = M; // cublasGemmAlgo_t fastest; // cublasGemm(HALF_TYPE, stream, AT, BT, M, N, K, // &alpha, da, lda, // db, ldb, &beta, // dc, ldc, &fastest); - double cublas_ns = triton::tools::bench([&]() { cublasGemm(HALF_TYPE, stream, AT, BT, M, N, K, - &alpha, da, lda, - db, ldb, &beta, - dc, ldc, nullptr, CUBLAS_GEMM_DEFAULT_TENSOR_OP); }, stream); +// double cublas_ns = triton::tools::bench([&]() { cublasGemm(HALF_TYPE, stream, AT, BT, M, N, K, +// &alpha, da, lda, +// db, ldb, &beta, +// dc, ldc, nullptr, CUBLAS_GEMM_DEFAULT_TENSOR_OP); }, stream); // result auto tflops = [&](double nanosec) { return dot.num_flops() / nanosec * 1e-3; }; perf_t result; - result.cublas = tflops(cublas_ns); +// result.cublas = tflops(cublas_ns); result.triton = tflops(triton_ns); + + // test + stream->read(dc, true, 0, hc); + std::vector rc(hc.size()); + dot.cpu_ref(rc, ha, hb); + for(size_t i = 0; i < M*N; i++) + if(!std::isnan(hc[i]) && std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ + std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; + exit(EXIT_FAILURE); + } + std::cout << "Pass!" 
<< std::endl; + // clean-up delete dc; delete da; @@ -99,8 +111,8 @@ int main() { std::vector configs = { // {false, false, 8192, 512, 512}, // {false, true, 8192, 8192, 8192} - {false, true, 32768, 256, 256}, - {false, true, 32768, 256, 512} + {false, true, 128, 128, 128}, +// {false, true, 32768, 256, 512} // {true, false, 8192, 512, 512}, // {true, true, 8192, 512, 512} }; diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index 553ad11fa..bdcb5c62c 100644 --- a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -49,7 +49,7 @@ class DotOp : public OpKernel { triton::driver::cu_buffer db(ctx, b.tensor_data().size(), (CUdeviceptr)b.tensor_data().data(), false); triton::driver::cu_buffer dc(ctx, c->tensor_data().size(), (CUdeviceptr)c->tensor_data().data(), false); // template - triton::dnn::dot dot(M, N, K, false, false, "half", "half", 8, 8, 8); + triton::dnn::dot dot(M, N, K, false, true, "half", "half", 8, 8, 8); dot.enqueue(stream, {&da, &db, &dc}); } diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 8dbc6ac55..4b1f7ac53 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -23,7 +23,7 @@ def run_dot(): result = sess.run([c], feed_dict = {a: ha, b: hb})[0] # Test - hresult = np.dot(ha.T, hb.T).T + hresult = np.dot(ha.T, hb).T dif = np.abs(result - hresult) np.savetxt('dif.dat', dif, '%2.4f') print(hresult) @@ -131,6 +131,6 @@ def run_batchnorm(): print(np.max(np.abs(dg_t - dg_n))) print(np.max(np.abs(db_t - db_n))) -#run_dot() +run_dot() #run_shift() -run_batchnorm() +#run_batchnorm() diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index 16f56c0e5..fffec7794 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -73,11 +73,11 @@ public: optimize_dot.run(module); optimize_trans.run(module); optimize_dce.run(module); +// ir::print(module, std::cout); } void target_dependent(ir::module &module) { alignment_info.run(module); -// ir::print(module, std::cout); // reassociate.run(module); if(target_->is_gpu()){ shmem_info.run(module); diff --git a/lib/codegen/optimize_dot.cpp b/lib/codegen/optimize_dot.cpp index ee59145c7..8688e918e 100644 --- a/lib/codegen/optimize_dot.cpp +++ b/lib/codegen/optimize_dot.cpp @@ -33,8 +33,7 @@ void optimize_dot::run(ir::module &mod) { for(ir::function *fn: mod.get_function_list()) for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i: block->get_inst_list()) - if(auto dot = dynamic_cast(i)) - if(dot->get_operand(1)->get_type()->get_tile_shapes()[1]->get_value() != 1){ + if(auto dot = dynamic_cast(i)){ builder.set_insert_point(i); ir::value *A = dot->get_operand(0); ir::value *B = dot->get_operand(1); diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index dc8980a28..e419f5a8d 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -135,8 +135,12 @@ void shared_tile::extract_constant(const indices_t &arg_idx, indices_t &non_cst_ Value* shared_tile::shared_offset(llvm::IRBuilder<> &builder, const shapes_t& shapes, indices_t idx) { Value *result = builder.getInt32(0); result = builder.CreateAdd(result, idx[0]); - for(size_t i = 1; i < idx.size(); i++) - result = builder.CreateAdd(result, builder.CreateMul(idx[i], builder.getInt32(shapes[i-1]))); + Value *ld = builder.getInt32(shapes[0]); + for(size_t i = 1; i < idx.size(); i++) { + result = builder.CreateAdd(result, builder.CreateMul(idx[i], ld)); + if(i < idx.size() - 1) + ld = 
builder.CreateMul(ld, builder.getInt32(shapes[i])); + } return result; } @@ -854,10 +858,13 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & Value *&result = x.second; indices_t write_idx = x.first; write_idx.insert(write_idx.begin() + axis, lane); + // shared memory write pointer Value *write_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), write_idx); Value *write_ptr = builder.CreateGEP(base_ptr, write_offset); + // initialize shared memory + tgt_->add_barrier(module, builder); builder.CreateStore(result, write_ptr); // build result for(unsigned i = depth/2; i > 0; i >>= 1){ @@ -993,15 +1000,14 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & { shared_tile *TA = (shared_tile*)tmap_.at(A); shared_tile *TB = (shared_tile*)tmap_.at(B); - if(params_->get_fragment(ins, 0) == tune::STRIDED_SCAN) - { + if(params_->get_fragment(ins, 0) == tune::STRIDED_SCAN) { TA->set_vector_size(TC->axis(0).contiguous); TB->set_vector_size(TC->axis(1).contiguous); result->for_each([&](indices_t idx){ Value *res = TC->get_value(idx); for(unsigned K = 0; K < NK; ++K){ - indices_t a_idx = {idx[0], builder.getInt32(K)}; - indices_t b_idx = {builder.getInt32(K), idx[1]}; + indices_t a_idx = {idx[0], builder.getInt32(K), idx[2]}; + indices_t b_idx = {builder.getInt32(K), idx[1], idx[2]}; if(AT) std::swap(a_idx[0], a_idx[1]); if(BT) @@ -1013,13 +1019,11 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & if(b->getType() != c_ty) b = builder.CreateFPCast(b, c_ty); res = builder.CreateCall(f_mul_add, {a, b, res}); - } result->set_value(idx, res); }); } - else - { + else { TA->set_vector_size(4*pack_size_0_); TB->set_vector_size(4*pack_size_1_); TA->set_return_mode(true); diff --git a/lib/codegen/shmem_allocation.cpp b/lib/codegen/shmem_allocation.cpp index 641170215..6e9bf86ff 100644 --- a/lib/codegen/shmem_allocation.cpp +++ b/lib/codegen/shmem_allocation.cpp @@ -42,8 +42,8 @@ unsigned shmem_allocation::is_ld_padded(ir::value *x) { } unsigned shmem_allocation::get_num_bytes(ir::value *x) { - unsigned num_bytes = x->get_type()->get_primitive_size_in_bits() / 8; if(auto *red = dynamic_cast(x)){ + unsigned num_bytes = x->get_type()->get_scalar_ty()->get_primitive_size_in_bits() / 8; size_t axis = red->get_axis(); ir::value *op = red->get_operand(0); auto shapes = op->get_type()->get_tile_shapes(); @@ -54,6 +54,7 @@ unsigned shmem_allocation::get_num_bytes(ir::value *x) { size_t depth = params_->get_param(op, "mts.d" + std::to_string(axis))->get_value(); return num_elements * num_bytes * depth; } + unsigned num_bytes = x->get_type()->get_primitive_size_in_bits() / 8; unsigned pad = is_ld_padded(x); if(pad > 0){ unsigned ld = x->get_type()->get_tile_shapes()[0]->get_value(); diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index db3ed1c81..35445a72d 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -24,8 +24,7 @@ bool is_hmma(ir::value *v){ ir::type *b_ty = b->get_type(); // inputs have to be FP16 result = a_ty->get_scalar_ty()->is_half_ty() && b_ty->get_scalar_ty()->is_half_ty(); - // reduction has to be multiple of 4 - result = result && ((a_ty->get_tile_shapes()[1]->get_value() % 4) == 0); + // reduction has to be multiple of 4: TODO } return result; } @@ -66,9 +65,10 @@ void tune::init_c_graph(ir::instruction *v) { for(unsigned i = 0; i < in_shapes.size(); i++){ if(i == axis) continue; -// std::cout << arg->get_name() << " " << v->get_name() << std::endl; add_constraint({reduce, 
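The shared_offset hunk above fixes the linearization for tiles of rank three and higher: the multiplier ld now accumulates the running product of the leading extents instead of reusing a single extent, giving the usual dimension-0-contiguous layout. The intended arithmetic as a standalone sketch (linear_offset is a name invented here):

#include <cstddef>
#include <vector>

// offset = idx[0] + idx[1]*s0 + idx[2]*s0*s1 + ...
std::size_t linear_offset(const std::vector<std::size_t>& idx,
                          const std::vector<std::size_t>& shapes) {
  std::size_t offset = idx[0];
  std::size_t ld = 1;
  for (std::size_t i = 1; i < idx.size(); ++i) {
    ld *= shapes[i - 1]; // running product of leading extents
    offset += idx[i] * ld;
  }
  return offset;
}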
current++}, {arg, i}); } +// add_constraint({reduce, 0}, {arg, 0}); +// add_constraint({reduce, 1}, {arg, 1}); return; } else @@ -81,8 +81,10 @@ void tune::init_c_graph(ir::instruction *v) { for(unsigned i = 0; i < shapes.size(); i ++){ bool is_one = shapes[i] == one; bool is_same = shapes[i] == op->get_type()->get_tile_shapes()[current]; - if(is_one) + if(is_one){ static_params_.insert({{v, i}, 1}); + add_constraint({v, i}, {v, i}); + } else if(!is_skewed && is_same) add_constraint({v, i}, {op, current++}); else{ @@ -114,9 +116,17 @@ void tune::init_c_graph(ir::instruction *v) { } // Matrix multiplication else if(dynamic_cast(v)){ + ir::value *A = v->get_operand(0); + ir::value *B = v->get_operand(1); ir::value *D = v->get_operand(2); - add_constraint({v, 0}, {D, 0}); - add_constraint({v, 1}, {D, 1}); + for(unsigned i = 0; i < shapes.size(); i++) + add_constraint({v, i}, {D, i}); + for(unsigned i = 2; i < shapes.size(); i++){ + if(shapes[i] == one) + static_params_.insert({{v, i}, 1}); + add_constraint({v, i}, {A, i}); + add_constraint({v, i}, {B, i}); + } } // Element-wise else if(dynamic_cast(v)) { @@ -242,7 +252,7 @@ void tune::run(ir::module &mod) { node_t node = *nodes_.begin(); if(fragments_[node] == STRIDED_SCAN) { ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 1, 1); - ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 2, 64); + ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 4, 32); connected_components(node, {nts, mts}, {"nts", "mts"}, nodes_, dependencies_, group_id++); nts->set_value(1); } @@ -266,14 +276,14 @@ void tune::run(ir::module &mod) { size_t addr_space = ptr_ty->get_pointer_address_space(); if(addr_space < 4){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 1, 8)); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 1, 1)); *params_.at(i).at("nts.d0") = *tmp; } } if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 1, 8)); - std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 1, 8)); + std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 1, 1)); + std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 1, 1)); *params_.at(i).at("nts.d0") = *tmp1; *params_.at(i).at("nts.d1") = *tmp2; } @@ -365,6 +375,7 @@ bool tune::check_constraints(std::map> &er // check constraints for(ir::instruction *i: grids_){ +// std::cout << i->get_name() << std::endl; ir::type *ty = i->get_type(); const auto &shapes = ty->get_tile_shapes(); // for each dimension, the product of layout components @@ -396,11 +407,15 @@ bool tune::check_constraints(std::map> &er errors[i].push_back("HMMA must have only 4 fragments per warp"); } int num_threads = get_req_num_threads(i); - if(num_threads % 64 != 0) + if(num_threads % 32 != 0) errors[i].push_back("number of threads per block (" + to_string(num_threads) + ") must be multiple of warp size"); if(num_threads != num_threads_) errors[i].push_back("Number of threads must be the same for all tiles (" + to_string(num_threads_) + ")"); } +// for(auto x: errors) +// for(auto e: x.second) +// std::cout << x.first->get_name() << ": " << e << std::endl; +// exit(EXIT_SUCCESS); return errors.empty(); } diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index 1c1ee8ceb..ebbe699c1 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -54,16 +54,17 @@ std::pair base::get_profile_impl(driver::stream *stream, std::v return num_flops() / ts * 1e-3; }; // 
auto-tune and save result - if(autotune != NO_TUNING) { + if(autotune == FULL_TUNING || autotune == PARTIAL_TUNING) { std::vector space = {}; if(autotune == PARTIAL_TUNING) space = search_space(); rt::jit::tune_res_t best = jit->autotune(name_.c_str(), src.c_str(), benchmark, space); jit->add_module(name_.c_str(), src.c_str(), best.params); } - else { - params_t params = heuristics(); + else{ +// params_t params = heuristics(); // params_t params = jit->get_valid(name_.c_str(), src.c_str()); + params_t params = {4, 1, 32, 4, 1, 32, 4, 4, 4, 1, 1, 16, 32, 16, 4, 4, 1}; jit->add_module(name_.c_str(), src.c_str(), params); } triton::driver::kernel* kernel = jit->get_function(name_.c_str()); diff --git a/lib/dnn/dot.cpp b/lib/dnn/dot.cpp index 3b9a2e300..7cc7563dc 100644 --- a/lib/dnn/dot.cpp +++ b/lib/dnn/dot.cpp @@ -74,12 +74,14 @@ void dot::enqueue_impl(driver::stream *stream, driver::kernel *kernel, void dot::triton_c_src(std::ostream &os) const { std::string AS0 = "TM", AS1 = "TK"; std::string BS0 = "TK", BS1 = "TN"; + std::string XAS0 = "TM", XAS1 = "TK/4", XAS2 = "4"; + std::string XBS0 = "TN", XBS1 = "TK/4", XBS2 = "4"; std::string bca0 = "[newaxis, :]", bca1 = "[:, newaxis]"; std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; std::string lda0 = "*lda", lda1 = ""; std::string ldb0 = "", ldb1 = "*ldb"; - std::string usea = AT_ ? "trans(a)" : "a"; - std::string useb = BT_ ? "trans(b)" : "b"; + std::string usea = AT_ ? "trans(xa)" : "xa"; + std::string useb = BT_ ? "trans(xb)" : "xb"; if(AT_){ std::swap(AS0, AS1); std::swap(bca0, bca1); @@ -92,12 +94,15 @@ void dot::triton_c_src(std::ostream &os) const { } std::string AS = AS0 + ", " + AS1; std::string BS = BS0 + ", " + BS1; + std::string XAS = XAS0 + ", " + XAS1 + ", " + XAS2; + std::string XBS = XBS0 + ", " + XBS1 + ", " + XBS2; + std::string XCS = "TM, TN, 4"; std::string align_lda_str = "multiple_of(" + std::to_string(align_lda_) + ")"; std::string align_ldb_str = "multiple_of(" + std::to_string(align_ldb_) + ")"; std::string res = R"( -const tunable int TM = {16, 32, 64, 128}; -const tunable int TN = {16, 32, 64, 128}; +const tunable int TM = {32}; +const tunable int TN = {32}; const tunable int TK = {32}; const tunable int GZ = {1}; @@ -113,7 +118,7 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, int ryb[TN] = ridy * TN + (0 ... TN); int rka[TK] = 0 ... TK; int rkb[TK] = 0 ... TK; - float c[TM, TN] = 0; + float xc[)" + XCS + R"(] = 0; )" + a_ty_ + R"(* pa[)" + AS + "] = A + rka" + bca0 + lda0 + " + rxa" + bca1 + lda1 + R"(; )" + b_ty_ + R"(* pb[)" + BS + "] = B + rkb" + bcb0 + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; bool checka[)" + AS + R"(] = (rka < K))" + bca0 + " && (rxa < M)" + bca1 + R"(; @@ -121,7 +126,9 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, )" + a_ty_ + R"( a[)" + AS + R"(] = checka ? *pa : 0; )" + b_ty_ + R"( b[)" + BS + R"(] = checkb ? *pb : 0; for(int k = K; k > 0; k = k - TK){ - c = dot()" + usea + ", " + useb + R"(, c); + )" + a_ty_ + R"( xa[)" + XAS + "] = __reshape(a, " + XAS + R"(); + )" + b_ty_ + R"( xb[)" + XBS + "] = __reshape(b, " + XBS + R"(); + xc = dot()" + usea + ", " + useb + R"(, xc); pa = pa + TK)" + lda0 + R"(; pb = pb + TK)" + ldb0 + R"(; bool checka[)" + AS + R"(] = k > TK; @@ -131,11 +138,9 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, } int rxc[TM] = ridx * TM + (0 ... TM); int ryc[TN] = ridy * TN + (0 ... 
TN); - bool checkc0[TM] = rxc < M; - bool checkc1[TN] = ryc < N; - bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; float* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - @checkc *pc = c; + float c[TM, TN] = __sum(xc, 2); + *pc = c; } )"; diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index b10488161..d47fbbaa5 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -148,13 +148,13 @@ DEFINE_UNARY_FLOAT(fneg) value* builder::create_insert_nuwnswb_binop(binary_operator::op_t op, value *lhs, value *rhs, const std::string &name, bool has_nuw, bool has_nsw) { - if(auto *clhs = dynamic_cast(lhs)){ - if(auto *crhs = dynamic_cast(rhs)){ - constant_expression* result = constant_expression::create(op, clhs, crhs); - if (has_nuw) result->set_has_no_unsigned_wrap(); - if (has_nsw) result->set_has_no_signed_wrap(); - return result; - } + auto *clhs = dynamic_cast(lhs); + auto *crhs = dynamic_cast(rhs); + if(clhs && crhs){ + constant_expression* result = constant_expression::create(op, clhs, crhs); + if (has_nuw) result->set_has_no_unsigned_wrap(); + if (has_nsw) result->set_has_no_signed_wrap(); + return result; } else { binary_operator* result = insert(binary_operator::create(op, lhs, rhs), name); diff --git a/lib/lang/expression.cpp b/lib/lang/expression.cpp index 355a2a369..c54179943 100644 --- a/lib/lang/expression.cpp +++ b/lib/lang/expression.cpp @@ -101,7 +101,6 @@ ir::value *binary_expression::llvm_op(ir::module *mod, ir::builder &builder, ir: ir::value* binary_expression::codegen(ir::module *mod) const{ ir::value *lhs = lhs_->codegen(mod); ir::value *rhs = rhs_->codegen(mod); - std::cout << " " << typeid(*lhs_).name() << " " << typeid(*rhs_).name() << std::endl; ir::value *result = llvm_op(mod, mod->get_builder(), lhs, rhs, ""); return result; } From 26c984946286177a0ff4f3dd1a2d5757a6255eb5 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 5 Aug 2019 21:19:13 -0700 Subject: [PATCH 279/494] [ir][instructions] added permutations option for trans --- include/triton/codegen/optimize_trans.h | 3 ++- include/triton/ir/builder.h | 2 +- include/triton/ir/instructions.h | 9 +++++++-- lib/codegen/optimize_dot.cpp | 8 +++++++- lib/codegen/optimize_trans.cpp | 9 +++++---- lib/codegen/selection.cpp | 8 +++++--- lib/codegen/tune.cpp | 8 ++++---- lib/dnn/dot.cpp | 5 ++++- lib/ir/builder.cpp | 4 ++-- lib/ir/instructions.cpp | 24 +++++++++++++++++++++--- 10 files changed, 58 insertions(+), 22 deletions(-) diff --git a/include/triton/codegen/optimize_trans.h b/include/triton/codegen/optimize_trans.h index c6ec73b4d..8af45205d 100644 --- a/include/triton/codegen/optimize_trans.h +++ b/include/triton/codegen/optimize_trans.h @@ -13,13 +13,14 @@ namespace ir { class instruction; class trans_inst; class builder; + class constant_int; } namespace codegen{ class optimize_trans { private: - ir::value *replace_phi(ir::value* value, ir::builder &builder); + ir::value *replace_phi(ir::value* value, ir::builder &builder, const std::vector &perm); public: optimize_trans() {} diff --git a/include/triton/ir/builder.h b/include/triton/ir/builder.h index 079a79e40..d3f5e7be4 100644 --- a/include/triton/ir/builder.h +++ b/include/triton/ir/builder.h @@ -132,7 +132,7 @@ public: value *create_atomic_exch(value *ptr, value *val, const std::string &name = ""); value *create_atomic_add(value *ptr, value *val, const std::string &name = ""); value *create_dot(value *A, value *B, value *C, const std::string &name = ""); - value *create_trans(value *A, const std::string &name = ""); + value 
*create_trans(value *A, const std::vector &perm = {}, const std::string &name = ""); value *create_sqrt(value *A, const std::string &name = ""); value *create_reduce(value *A, unsigned axis, const std::string &name = ""); value *create_select(value *pred, value *if_value, value *else_value, const std::string &name = ""); diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index 9886a8a0e..c0b176ebf 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -585,13 +585,18 @@ private: class trans_inst: public builtin_inst { public: ir::type* get_res_ty(ir::type* in); + std::vector get_default_perm(ir::type* ty); private: - trans_inst(value *arg, const std::string& name, instruction* next); + trans_inst(value *arg, const std::vector& perm, const std::string& name, instruction* next); std::string repr_impl() const { return "trans"; } public: - static instruction* create(value *arg, const std::string &name = "", instruction *next = nullptr); + static instruction* create(value *arg, const std::vector& perm = {}, const std::string &name = "", instruction *next = nullptr); + const std::vector get_perm() const; + +private: + std::vector perm_; }; class sqrt_inst: public builtin_inst { diff --git a/lib/codegen/optimize_dot.cpp b/lib/codegen/optimize_dot.cpp index 8688e918e..e3ebfbcdb 100644 --- a/lib/codegen/optimize_dot.cpp +++ b/lib/codegen/optimize_dot.cpp @@ -68,7 +68,13 @@ void optimize_dot::run(ir::module &mod) { } // dot(op(a), b) if(!trans_b){ - ir::value* BB = builder.create_trans(B); + size_t size = B->get_type()->get_tile_shapes().size(); + std::vector perm(size); + ir::type *int32_ty = ir::type::get_int32_ty(B->get_type()->get_context()); + for(size_t i = 0; i < size; i++) + perm[i] = ir::constant_int::get(int32_ty, i); + std::swap(perm[0], perm[1]); + ir::value* BB = builder.create_trans(B, perm); ir::instruction *NT = builder.insert(ir::dot_inst::create_nt(A, BB, D)); dot->replace_all_uses_with(NT); to_delete.push_back(dot); diff --git a/lib/codegen/optimize_trans.cpp b/lib/codegen/optimize_trans.cpp index 0fb96ac96..16c4605d7 100644 --- a/lib/codegen/optimize_trans.cpp +++ b/lib/codegen/optimize_trans.cpp @@ -7,12 +7,13 @@ namespace codegen{ ir::value* optimize_trans::replace_phi(ir::value* value, - ir::builder& builder){ + ir::builder& builder, + const std::vector &perm){ if(auto phi = dynamic_cast(value)) { // transpose operands std::vector incs; for(unsigned n = 0; n < phi->get_num_incoming(); n++) - incs.push_back(replace_phi(phi->get_incoming_value(n), builder)); + incs.push_back(replace_phi(phi->get_incoming_value(n), builder, perm)); // create phi for transposed values builder.set_insert_point(phi); ir::phi_node* result = builder.create_phi(incs[0]->get_type(), incs.size(), phi->get_name()); @@ -26,7 +27,7 @@ ir::value* optimize_trans::replace_phi(ir::value* value, auto it = std::find(block->begin(), block->end(), i); it++; builder.set_insert_point(it); - ir::instruction *trans = (ir::instruction*)builder.create_trans(i); + ir::instruction *trans = (ir::instruction*)builder.create_trans(i, perm); i->replace_all_uses_with(trans); trans->set_operand(0, i); return trans; @@ -53,7 +54,7 @@ void optimize_trans::run(ir::module &mod) { // trans(phi) -> phi(trans(), trans()...) 
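// editorial note: a minimal sketch of what the rewrite below does, assuming
// a tile %x defined by a loop-carried phi; the value names are hypothetical
// and only illustrate the transformation:
//   before:  %x = phi [%a, ^preheader], [%b, ^latch]
//            %y = trans(%x, perm)
//   after:   %ta = trans(%a, perm)   // emitted where %a is defined
//            %tb = trans(%b, perm)   // emitted where %b is defined
//            %y  = phi [%ta, ^preheader], [%tb, ^latch]
// replace_phi recurses through nested phis and re-creates each trans with
// the same permutation, so the layout change happens where each incoming
// value is produced rather than at the merge point.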
if(dynamic_cast(op)){ - ir::value* new_phi = replace_phi(op, builder); + ir::value* new_phi = replace_phi(op, builder, trans->get_perm()); trans->replace_all_uses_with(new_phi); } } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index e419f5a8d..3b973de71 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -974,11 +974,13 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & }); } // trans - else if(dynamic_cast(ins)) { + else if(auto* x = dynamic_cast(ins)) { distributed_tile* in = (distributed_tile*)tmap_.at(ins->get_operand(0)); + auto perm = x->get_perm(); in->for_each([&](indices_t idx){ - indices_t out_idx = idx; - std::rotate(out_idx.begin(), out_idx.begin() + 1, out_idx.end()); + indices_t out_idx(idx.size()); + for(size_t i = 0; i < idx.size(); i++) + out_idx[i] = idx[perm[i]->get_value()]; ti->set_value(out_idx, in->get_value(idx)); }); } diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 35445a72d..09017e978 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -98,11 +98,11 @@ void tune::init_c_graph(ir::instruction *v) { } // Trans - else if(dynamic_cast(v)){ + else if(auto *x = dynamic_cast(v)){ ir::value *op = v->get_operand(0); - size_t n_shapes = shapes.size(); - for(unsigned i = 0; i < n_shapes; i++) - add_constraint({v, (i + 1) % n_shapes}, {op, i}); + auto perm = x->get_perm(); + for(unsigned i = 0; i < perm.size(); i++) + add_constraint({v, perm[i]->get_value()}, {op, i}); } // Broadcast else if(dynamic_cast(v)){ diff --git a/lib/dnn/dot.cpp b/lib/dnn/dot.cpp index 7cc7563dc..a43ea1ca4 100644 --- a/lib/dnn/dot.cpp +++ b/lib/dnn/dot.cpp @@ -75,7 +75,7 @@ void dot::triton_c_src(std::ostream &os) const { std::string AS0 = "TM", AS1 = "TK"; std::string BS0 = "TK", BS1 = "TN"; std::string XAS0 = "TM", XAS1 = "TK/4", XAS2 = "4"; - std::string XBS0 = "TN", XBS1 = "TK/4", XBS2 = "4"; + std::string XBS0 = "TK/4", XBS1 = "TN", XBS2 = "4"; std::string bca0 = "[newaxis, :]", bca1 = "[:, newaxis]"; std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; std::string lda0 = "*lda", lda1 = ""; @@ -84,11 +84,13 @@ void dot::triton_c_src(std::ostream &os) const { std::string useb = BT_ ? 
"trans(xb)" : "xb"; if(AT_){ std::swap(AS0, AS1); + std::swap(XAS0, XAS1); std::swap(bca0, bca1); std::swap(lda0, lda1); } if(BT_){ std::swap(BS0, BS1); + std::swap(XBS0, XBS1); std::swap(bcb0, bcb1); std::swap(ldb0, ldb1); } @@ -144,6 +146,7 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, } )"; + std::cout << res << std::endl; os << res; } diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index d47fbbaa5..1f6aa7c54 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -324,8 +324,8 @@ value *builder::create_dot(value *A, value *B, value *C, const std::string &name return insert(dot_inst::create_nn(A, B, C, name)); } -value *builder::create_trans(value *A, const std::string &name) { - return insert(trans_inst::create(A, name)); +value *builder::create_trans(value *A, const std::vector& perm, const std::string &name) { + return insert(trans_inst::create(A, perm, name)); } value *builder::create_sqrt(value *A, const std::string &name) { diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index e6e85ff85..58a81cd3b 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -572,13 +572,31 @@ ir::type* trans_inst::get_res_ty(ir::type* ty) { return tile_type::get(ty->get_scalar_ty(), shapes); } -trans_inst::trans_inst(value *arg, const std::string &name, instruction *next) +std::vector trans_inst::get_default_perm(ir::type* ty) { + auto size = ty->get_tile_shapes().size(); + ir::type* int32_ty = type::get_int32_ty(ty->get_context()); + std::vector result; + for(size_t i = 0; i < size; i++) + result.push_back(ir::constant_int::get(int32_ty, i + 1 % size)); + return result; +} + +trans_inst::trans_inst(value *arg, const std::vector& perm, const std::string &name, instruction *next) : builtin_inst(get_res_ty(arg->get_type()), 1, 1, name, next) { + perm_ = perm; + if(perm_.empty()) + perm_ = get_default_perm(arg->get_type()); + auto size = arg->get_type()->get_tile_shapes().size(); + assert(perm_.size() == size); set_operand(0, arg); } -instruction* trans_inst::create(value *arg, const std::string &name, instruction *next) { - return new trans_inst(arg, name, next); +instruction* trans_inst::create(value *arg, const std::vector &perm, const std::string &name, instruction *next) { + return new trans_inst(arg, perm, name, next); +} + +const std::vector trans_inst::get_perm() const { + return perm_; } //===----------------------------------------------------------------------===// From 5efdb7978e41d482f724fba9417c178a3451ad98 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 6 Aug 2019 16:21:20 -0700 Subject: [PATCH 280/494] more improvements and regressions --- examples/cpp/dot.cpp | 6 ++-- include/triton/ir/constant.h | 3 +- include/triton/ir/instructions.h | 14 +++++++--- include/triton/lang/expression.h | 3 +- include/triton/lang/parser.y | 3 +- lib/codegen/optimize_dot.cpp | 24 ++++++++-------- lib/codegen/optimize_trans.cpp | 18 +++++++++--- lib/codegen/selection.cpp | 16 +++++++++-- lib/codegen/tune.cpp | 48 +++++++++++++++++++++----------- lib/dnn/base.cpp | 4 ++- lib/dnn/dot.cpp | 14 ++++++---- lib/ir/instructions.cpp | 36 ++++++++++++++---------- lib/lang/expression.cpp | 12 +++++++- lib/runtime/jit.cpp | 6 ++-- 14 files changed, 138 insertions(+), 69 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index d4f5adb6e..87bb739e2 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -26,7 +26,7 @@ struct perf_t { perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t 
K){ typedef float NumericT; - std::string ty = "float"; + std::string ty = "half"; size_t dt_nbytes = sizeof(NumericT); triton::driver::context* context = stream->context(); std::vector hc(M*N); @@ -48,7 +48,7 @@ perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int stream->synchronize(); triton::dnn::dot dot(M, N, K, AT, BT, ty, ty, 8, 8, 8); // benchmark triton - double triton_ns = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::NO_TUNING);}, stream); + double triton_ns = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::FULL_TUNING);}, stream); // benchmark cublas // NumericT alpha = 1; // NumericT beta = 0; @@ -111,7 +111,7 @@ int main() { std::vector configs = { // {false, false, 8192, 512, 512}, // {false, true, 8192, 8192, 8192} - {false, true, 128, 128, 128}, + {true, true, 128, 128, 128}, // {false, true, 32768, 256, 512} // {true, false, 8192, 512, 512}, // {true, true, 8192, 512, 512} diff --git a/include/triton/ir/constant.h b/include/triton/ir/constant.h index ca44c6227..ce618d998 100644 --- a/include/triton/ir/constant.h +++ b/include/triton/ir/constant.h @@ -38,6 +38,7 @@ protected: public: virtual uint64_t get_value() const { return value_; } + virtual std::string repr() const { return std::to_string(get_value()); } static constant_int *get(type *ty, uint64_t value); protected: @@ -57,7 +58,7 @@ public: const std::vector& get_space() { return space_; } void set_space(const std::vector &space) { space_ = space; } uint64_t get_value() const { assert(has_value_); return value_; } - + std::string repr() const { return has_value_? std::to_string(value_) : "?" ;} private: std::vector space_; bool has_value_; diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index c0b176ebf..8bb46eb2a 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -584,12 +584,18 @@ private: class trans_inst: public builtin_inst { public: - ir::type* get_res_ty(ir::type* in); - std::vector get_default_perm(ir::type* ty); + ir::type* get_res_ty(ir::type* in, std::vector perm); + std::vector init_perm(ir::type* ty, const std::vector& perm); private: trans_inst(value *arg, const std::vector& perm, const std::string& name, instruction* next); - std::string repr_impl() const { return "trans"; } + std::string repr_impl() const { + std::string res = "trans<"; + for(ir::constant_int *x: perm_) + res += x->repr() + ","; + res[res.size()-1] = '>'; + return res; + } public: static instruction* create(value *arg, const std::vector& perm = {}, const std::string &name = "", instruction *next = nullptr); @@ -609,7 +615,7 @@ public: class reduce_inst: public builtin_inst { private: - static type* get_type(value *arg, unsigned axis); + static type* get_res_type(value *arg, unsigned axis); private: reduce_inst(value* arg, unsigned axis, const std::string& name, instruction* next); diff --git a/include/triton/lang/expression.h b/include/triton/lang/expression.h index f0dac3bc9..6823e8988 100644 --- a/include/triton/lang/expression.h +++ b/include/triton/lang/expression.h @@ -180,11 +180,12 @@ private: class trans_expression: public builtin_expression{ public: - trans_expression(node *arg): arg_(arg) {} + trans_expression(node *arg, node *perm): arg_(arg), perm_((list*)perm) {} ir::value* codegen(ir::module *mod) const; private: node* arg_; + const list* perm_; }; class sqrt_expression: public builtin_expression{ diff --git a/include/triton/lang/parser.y 
b/include/triton/lang/parser.y index 0df37673b..c44a619e8 100644 --- a/include/triton/lang/parser.y +++ b/include/triton/lang/parser.y @@ -125,7 +125,8 @@ builtin_expression | DOT '(' expression ',' expression ',' expression ')' { $$ = new matmul_expression($3, $5, $7); } | SQRT '(' expression ')' { $$ = new sqrt_expression($3); } | ALLOC_CONST type_specifier '[' constant ']' { $$ = new alloc_const_expression(new typed_declaration_specifier(get_type_spec($2)), $4); } - | TRANS '(' expression ')' { $$ = new trans_expression($3); } + | TRANS '(' expression ',' constant_expression_list ')' { $$ = new trans_expression($3, $5); } + | TRANS '(' expression ')' { $$ = new trans_expression($3, nullptr); } | REDUCE_SUM '(' expression ',' constant ')' { $$ = new reduce_expression($3, $5);} | MAX '(' expression ',' expression ')' { $$ = new max_expression($3, $5); } | MIN '(' expression ',' expression ')' { $$ = new min_expression($3, $5); } diff --git a/lib/codegen/optimize_dot.cpp b/lib/codegen/optimize_dot.cpp index e3ebfbcdb..904b1c6c3 100644 --- a/lib/codegen/optimize_dot.cpp +++ b/lib/codegen/optimize_dot.cpp @@ -8,7 +8,17 @@ namespace triton { namespace codegen{ inline bool is_trans(ir::value *v){ - return dynamic_cast(v) != nullptr; + auto *x = dynamic_cast(v); + if(!x) + return false; + std::vector perm = x->get_perm(); + std::vector ref; + ir::type *int32_ty = ir::type::get_int32_ty(v->get_type()->get_context()); + for(size_t i = 0; i < perm.size(); i++) + ref.push_back(ir::constant_int::get(int32_ty, i)); + std::swap(ref[0], ref[1]); + // true if perm == ref + return std::equal(perm.begin(), perm.end(), ref.begin()); } inline bool is_hmma(ir::value *v){ @@ -28,7 +38,6 @@ inline bool is_hmma(ir::value *v){ void optimize_dot::run(ir::module &mod) { ir::builder &builder = mod.get_builder(); - std::vector to_delete; // iterate for(ir::function *fn: mod.get_function_list()) for(ir::basic_block *block: fn->blocks()) @@ -47,15 +56,12 @@ void optimize_dot::run(ir::module &mod) { ir::value *BB = B; if(trans_a){ AA = ((ir::trans_inst*)A)->get_operand(0); - to_delete.push_back((ir::instruction*)A); } if(trans_b){ BB = ((ir::trans_inst*)B)->get_operand(0); - to_delete.push_back((ir::instruction*)B); } ir::instruction *dot_atbt = builder.insert(ir::dot_inst::create(AA, BB, D, trans_a, trans_b)); dot->replace_all_uses_with(dot_atbt); - to_delete.push_back(dot); } else{ // dot(op(a), trans(b)) @@ -63,28 +69,24 @@ void optimize_dot::run(ir::module &mod) { ir::value* BB = ((ir::trans_inst*)B)->get_operand(0); ir::instruction *NT = builder.insert(ir::dot_inst::create_nt(A, BB, D)); dot->replace_all_uses_with(NT); - to_delete.push_back((ir::instruction*)B); - to_delete.push_back(dot); } // dot(op(a), b) if(!trans_b){ + // create permutations size_t size = B->get_type()->get_tile_shapes().size(); std::vector perm(size); ir::type *int32_ty = ir::type::get_int32_ty(B->get_type()->get_context()); for(size_t i = 0; i < size; i++) perm[i] = ir::constant_int::get(int32_ty, i); std::swap(perm[0], perm[1]); + // replace NN -> NT (trans) ir::value* BB = builder.create_trans(B, perm); ir::instruction *NT = builder.insert(ir::dot_inst::create_nt(A, BB, D)); dot->replace_all_uses_with(NT); - to_delete.push_back(dot); } } } } - - for(ir::instruction* i: to_delete) - i->erase_from_parent(); } } diff --git a/lib/codegen/optimize_trans.cpp b/lib/codegen/optimize_trans.cpp index 16c4605d7..b1e0dc4b9 100644 --- a/lib/codegen/optimize_trans.cpp +++ b/lib/codegen/optimize_trans.cpp @@ -42,22 +42,32 @@ void 
optimize_trans::run(ir::module &mod) { for(ir::function *fn: mod.get_function_list()) for(ir::basic_block *block: fn->blocks()) for(ir::instruction* i: block->get_inst_list()){ - // filter transposition + // transposition if(auto trans = dynamic_cast(i)) { auto users = trans->get_users(); auto ops = trans->ops(); if(users.size() > 1 || ops.size() > 1) continue; ir::value* op = *ops.begin(); - // chains of transpositions - // TODO - + // todo: chains of transpositions // trans(phi) -> phi(trans(), trans()...) if(dynamic_cast(op)){ ir::value* new_phi = replace_phi(op, builder, trans->get_perm()); trans->replace_all_uses_with(new_phi); } } + // reductions + if(auto x = dynamic_cast(i)) { + ir::constant_int *one = ir::constant_int::get(ir::type::get_int32_ty(i->get_type()->get_context()), 1); + ir::value *arg = x->get_operand(0); + auto shapes = arg->get_type()->get_tile_shapes(); + if(shapes[x->get_axis()] == one){ + builder.set_insert_point(x); + ir::value* new_red = builder.create_reshape(arg, x->get_type()->get_tile_shapes()); + x->replace_all_uses_with(new_red); + } + } + } } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 3b973de71..8ec454842 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -996,8 +996,9 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & distributed_tile *TC = (distributed_tile*)tmap_.at(C); Type *c_ty = llvm_type(C->get_type()->get_scalar_ty(), ctx); Function *f_mul_add = Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {c_ty}); + auto A_shapes = A->get_type()->get_tile_shapes(); size_t red_axis = dot->is_a_trans() ? 0 : 1; - unsigned NK = A->get_type()->get_tile_shapes()[red_axis]->get_value(); + unsigned NK = A_shapes[red_axis]->get_value(); if(NK != 1) { shared_tile *TA = (shared_tile*)tmap_.at(A); @@ -1008,18 +1009,27 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & result->for_each([&](indices_t idx){ Value *res = TC->get_value(idx); for(unsigned K = 0; K < NK; ++K){ - indices_t a_idx = {idx[0], builder.getInt32(K), idx[2]}; - indices_t b_idx = {builder.getInt32(K), idx[1], idx[2]}; + // input indices + indices_t a_idx = {idx[0], builder.getInt32(K)}; + indices_t b_idx = {builder.getInt32(K), idx[1]}; if(AT) std::swap(a_idx[0], a_idx[1]); if(BT) std::swap(b_idx[0], b_idx[1]); + // add batching dimension + for(size_t i = 2; i < idx.size(); i++){ + a_idx.insert(a_idx.end(), idx[i]); + b_idx.insert(b_idx.end(), idx[i]); + } + // load value Value *a = TA->get_value(a_idx); Value *b = TB->get_value(b_idx); if(a->getType() != c_ty) a = builder.CreateFPCast(a, c_ty); if(b->getType() != c_ty) b = builder.CreateFPCast(b, c_ty); +// a = ConstantFP::get(builder.getFloatTy(), 1); +// b = ConstantFP::get(builder.getFloatTy(), 1); res = builder.CreateCall(f_mul_add, {a, b, res}); } result->set_value(idx, res); diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 09017e978..bc4c7118d 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -67,8 +67,6 @@ void tune::init_c_graph(ir::instruction *v) { continue; add_constraint({reduce, current++}, {arg, i}); } -// add_constraint({reduce, 0}, {arg, 0}); -// add_constraint({reduce, 1}, {arg, 1}); return; } else @@ -115,7 +113,7 @@ void tune::init_c_graph(ir::instruction *v) { } } // Matrix multiplication - else if(dynamic_cast(v)){ + else if(auto *x = dynamic_cast(v)){ ir::value *A = v->get_operand(0); ir::value *B = v->get_operand(1); ir::value *D = v->get_operand(2); @@ -124,8 +122,8 @@ void 
tune::init_c_graph(ir::instruction *v) { for(unsigned i = 2; i < shapes.size(); i++){ if(shapes[i] == one) static_params_.insert({{v, i}, 1}); - add_constraint({v, i}, {A, i}); - add_constraint({v, i}, {B, i}); +// add_constraint({v, i}, {A, i}); +// add_constraint({v, i}, {B, i}); } } // Element-wise @@ -268,35 +266,53 @@ void tune::run(ir::module &mod) { for(ir::function *fn: mod.get_function_list()) for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i : block->get_inst_list()){ + + if(fragments_.find({i, 0}) != fragments_.end() && fragments_.at({i, 0}) != STRIDED_SCAN) continue; - if(auto *ld = dynamic_cast(i)) + + if(auto *x = dynamic_cast(i)) if(i->get_type()->is_tile_ty()){ - ir::type *ptr_ty = ld->get_pointer_operand()->get_type()->get_scalar_ty(); + ir::type *ptr_ty = x->get_pointer_operand()->get_type()->get_scalar_ty(); size_t addr_space = ptr_ty->get_pointer_address_space(); if(addr_space < 4){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 1, 1)); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 2, 4)); *params_.at(i).at("nts.d0") = *tmp; } } if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 1, 1)); - std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 1, 1)); + std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 2, 4)); + std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 2, 4)); *params_.at(i).at("nts.d0") = *tmp1; *params_.at(i).at("nts.d1") = *tmp2; } } + + // initialize grids + for(ir::function *fn: mod.get_function_list()){ + std::map references; + create_grids(grids_, references, fn); + } + + for(ir::instruction *i: grids_){ + auto shapes = i->get_type()->get_tile_shapes(); + for(size_t k = 0; k < shapes.size(); k++) + if(shapes[k]->get_value() == 1) { + if(fragments_.at({i, k}) == STRIDED_SCAN){ + params_.at(i).at("nts.d" + std::to_string(k))->set_value(1); + params_.at(i).at("mts.d" + std::to_string(k))->set_value(1); + } + if(fragments_.at({i, k}) == HMMA_FRAGMENT_C){ + params_.at(i).at("fpw.d" + std::to_string(k))->set_value(1); + params_.at(i).at("wpt.d" + std::to_string(k))->set_value(1); + } + } + } } void tune::init(ir::module &mod) { - for(ir::function *fn: mod.get_function_list()){ - // initialize grids - std::map references; - create_grids(grids_, references, fn); - } - // number of threads num_threads_ = get_req_num_threads(grids_.front()); } diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index ebbe699c1..1ad741240 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -64,7 +64,9 @@ std::pair base::get_profile_impl(driver::stream *stream, std::v else{ // params_t params = heuristics(); // params_t params = jit->get_valid(name_.c_str(), src.c_str()); - params_t params = {4, 1, 32, 4, 1, 32, 4, 4, 4, 1, 1, 16, 32, 16, 4, 4, 1}; +// params_t params = {4, 1, 32, 4, 1, 32, 4, 4, 4, 1, 1, 16, 32, 16, 4, 4, 4, 4, 1}; //NT +// params_t params = {4, 1, 32, 4, 32, 4, 4, 4, 1, 1, 16, 32, 16, 1, 4, 4, 4, 4, 4, 1}; //NN + params_t params = {4, 32, 4, 1, 32, 4, 4, 4, 1, 1, 16, 1, 32, 16, 4, 4, 4, 4, 4, 1}; // TT jit->add_module(name_.c_str(), src.c_str(), params); } triton::driver::kernel* kernel = jit->get_function(name_.c_str()); diff --git a/lib/dnn/dot.cpp b/lib/dnn/dot.cpp index a43ea1ca4..83798921a 100644 --- a/lib/dnn/dot.cpp +++ b/lib/dnn/dot.cpp @@ -74,22 +74,24 @@ void dot::enqueue_impl(driver::stream *stream, driver::kernel *kernel, void 
dot::triton_c_src(std::ostream &os) const { std::string AS0 = "TM", AS1 = "TK"; std::string BS0 = "TK", BS1 = "TN"; - std::string XAS0 = "TM", XAS1 = "TK/4", XAS2 = "4"; - std::string XBS0 = "TK/4", XBS1 = "TN", XBS2 = "4"; + std::string XAS0 = "TM", XAS1 = "TK/1", XAS2 = "1"; + std::string XBS0 = "TK/1", XBS1 = "1", XBS2 = "TN"; std::string bca0 = "[newaxis, :]", bca1 = "[:, newaxis]"; std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; std::string lda0 = "*lda", lda1 = ""; std::string ldb0 = "", ldb1 = "*ldb"; - std::string usea = AT_ ? "trans(xa)" : "xa"; - std::string useb = BT_ ? "trans(xb)" : "xb"; + std::string usea = AT_ ? "trans(xa, 0, 2, 1)" : "xa"; + std::string useb = BT_ ? "trans(xb, 1, 0, 2)" : "trans(xb, 0, 2, 1)"; if(AT_){ std::swap(AS0, AS1); std::swap(XAS0, XAS1); + std::swap(XAS1, XAS2); std::swap(bca0, bca1); std::swap(lda0, lda1); } if(BT_){ std::swap(BS0, BS1); + std::swap(XBS1, XBS2); std::swap(XBS0, XBS1); std::swap(bcb0, bcb1); std::swap(ldb0, ldb1); @@ -98,7 +100,7 @@ void dot::triton_c_src(std::ostream &os) const { std::string BS = BS0 + ", " + BS1; std::string XAS = XAS0 + ", " + XAS1 + ", " + XAS2; std::string XBS = XBS0 + ", " + XBS1 + ", " + XBS2; - std::string XCS = "TM, TN, 4"; + std::string XCS = "TM, TN, 1"; std::string align_lda_str = "multiple_of(" + std::to_string(align_lda_) + ")"; std::string align_ldb_str = "multiple_of(" + std::to_string(align_ldb_) + ")"; std::string res = @@ -146,7 +148,7 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, } )"; - std::cout << res << std::endl; +// std::cout << res << std::endl; os << res; } diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 58a81cd3b..7ae5b73ec 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -482,7 +482,8 @@ std::string retile_inst::shape_suffix(ir::type* ty){ std::string res = "["; const auto& shapes = ty->get_tile_shapes(); for(unsigned i = 0; i < shapes.size(); i++){ - res += std::to_string(ty->get_tile_shapes()[i]->get_value()); + ir::constant_int *shape_i = ty->get_tile_shapes()[i]; + res += shape_i->repr(); if(i < shapes.size() - 1) res += ", "; } @@ -566,26 +567,33 @@ instruction *dot_inst::create_tt(value *A, value *B, value *C, // trans instructions //===----------------------------------------------------------------------===// -ir::type* trans_inst::get_res_ty(ir::type* ty) { - auto shapes = ty->get_tile_shapes(); - std::rotate(shapes.begin(), shapes.begin() + 1, shapes.end()); - return tile_type::get(ty->get_scalar_ty(), shapes); +ir::type* trans_inst::get_res_ty(ir::type* ty, std::vector perm) { + // get argument shapes + ir::tile_type::tile_shapes_t arg_shapes = ty->get_tile_shapes(); + // permutate argument shapes + perm = init_perm(ty, perm); + ir::tile_type::tile_shapes_t res_shapes = arg_shapes; + for(int i = 0; i < perm.size(); i++) + res_shapes[i] = arg_shapes[perm[i]->get_value()]; + // construct type + return tile_type::get(ty->get_scalar_ty(), res_shapes); } -std::vector trans_inst::get_default_perm(ir::type* ty) { +std::vector trans_inst::init_perm(ir::type* ty, const std::vector& perm) { + if(!perm.empty()) + return perm; auto size = ty->get_tile_shapes().size(); ir::type* int32_ty = type::get_int32_ty(ty->get_context()); std::vector result; - for(size_t i = 0; i < size; i++) - result.push_back(ir::constant_int::get(int32_ty, i + 1 % size)); + result.push_back(ir::constant_int::get(int32_ty, size - 1)); + for(int i = 0; i < size - 1; i++) + result.push_back(ir::constant_int::get(int32_ty, i)); return result; } 
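// editorial note: a small worked example of init_perm/get_res_ty above,
// based only on this hunk. With no explicit permutation, init_perm returns
// the cyclic shift [size-1, 0, 1, ..., size-2], and get_res_ty applies it
// as res_shapes[i] = arg_shapes[perm[i]->get_value()]:
//   2-D tile [TM, TK]    -> default perm {1, 0}    -> result type [TK, TM]
//   3-D tile [TM, TK, 4] -> default perm {2, 0, 1} -> result type [4, TM, TK]
// so the 2-D default is the ordinary transpose, while an explicit
// permutation such as trans(x, 0, 2, 1) only swaps the two trailing axes.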
trans_inst::trans_inst(value *arg, const std::vector& perm, const std::string &name, instruction *next) - : builtin_inst(get_res_ty(arg->get_type()), 1, 1, name, next) { - perm_ = perm; - if(perm_.empty()) - perm_ = get_default_perm(arg->get_type()); + : builtin_inst(get_res_ty(arg->get_type(), perm), 1, 1, name, next) { + perm_ = init_perm(arg->get_type(), perm); auto size = arg->get_type()->get_tile_shapes().size(); assert(perm_.size() == size); set_operand(0, arg); @@ -615,7 +623,7 @@ instruction* sqrt_inst::create(value *arg, const std::string &name, instruction //===----------------------------------------------------------------------===// // reduce instructions //===----------------------------------------------------------------------===// -type* reduce_inst::get_type(value *arg, unsigned axis) { +type* reduce_inst::get_res_type(value *arg, unsigned axis) { ir::tile_type::tile_shapes_t shapes = arg->get_type()->get_tile_shapes(); shapes.erase(shapes.begin() + axis); type *scalar_ty = arg->get_type()->get_scalar_ty(); @@ -626,7 +634,7 @@ type* reduce_inst::get_type(value *arg, unsigned axis) { } reduce_inst::reduce_inst(value *arg, unsigned axis, const std::string &name, instruction *next) - : builtin_inst(get_type(arg, axis), 1, 1, name, next), + : builtin_inst(get_res_type(arg, axis), 1, 1, name, next), axis_(axis){ set_operand(0, arg); } diff --git a/lib/lang/expression.cpp b/lib/lang/expression.cpp index c54179943..acbfaf6f6 100644 --- a/lib/lang/expression.cpp +++ b/lib/lang/expression.cpp @@ -203,7 +203,17 @@ ir::value* select_expression::codegen(ir::module *mod) const { // trans ir::value* trans_expression::codegen(ir::module *mod) const { - return mod->get_builder().create_trans(arg_->codegen(mod)); + // shapes + std::vector perm; + if(perm_) { + for(expression *expr: perm_->values()){ + ir::constant_int *shape = dynamic_cast(expr->codegen(mod)); + if(shape == nullptr) + throw std::runtime_error("tile shapes must be constant expressions"); + perm.push_back(shape); + } + } + return mod->get_builder().create_trans(arg_->codegen(mod), perm); } // sqrt diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index 86102a460..1f6a60ccd 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -37,13 +37,13 @@ void parallel_loop_nest(std::vector const & ranges, size_t D = ranges.size(); std::vector values(D, 0); // thread pools -// ThreadPool pool(nthreads); + ThreadPool pool(nthreads); // Start with innermost loop size_t i = D - 1; while(true){ // Execute function -// pool.enqueue(f,values); - f(values); + pool.enqueue(f,values); +// f(values); while(values[i]++ == ranges[i] - 1){ if(i == 0) return; From cf256a636c17b551259df97aa51addf8c807f151 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 6 Aug 2019 16:44:16 -0700 Subject: [PATCH 281/494] fixup --- examples/cpp/dot.cpp | 17 +++++++-------- include/triton/dnn/dot.h | 4 ++-- include/triton/runtime/jit.h | 1 - lib/codegen/tune.cpp | 40 +++++++++++++++++++----------------- lib/dnn/base.cpp | 4 ++-- lib/dnn/dot.cpp | 9 ++++---- lib/runtime/jit.cpp | 6 +++--- 7 files changed, 42 insertions(+), 39 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 87bb739e2..3f04d01ad 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -6,6 +6,7 @@ #include "triton/driver/stream.h" #include "triton/dnn/dot.h" #include "triton/tools/bench.hpp" +#include "triton/external/half.hpp" #include "cuda.h" template @@ -25,7 +26,7 @@ struct perf_t { perf_t do_bench(triton::driver::stream* stream, bool AT, bool 
BT, int32_t M, int32_t N, int32_t K){ - typedef float NumericT; + typedef half NumericT; std::string ty = "half"; size_t dt_nbytes = sizeof(NumericT); triton::driver::context* context = stream->context(); @@ -34,11 +35,11 @@ perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int std::vector hb(K*N); srand(0); for(size_t i = 0; i < ha.size(); i++) - ha[i] = (NumericT)rand()/RAND_MAX; + ha[i] = static_cast((double)rand()/RAND_MAX); for(size_t i = 0; i < hb.size(); i++) - hb[i] = (NumericT)rand()/RAND_MAX; + hb[i] = static_cast((double)rand()/RAND_MAX); for(size_t i = 0; i < hc.size(); i++) - hc[i] = 0; + hc[i] = static_cast((double)0); triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*dt_nbytes); triton::driver::buffer* da = triton::driver::buffer::create(context, ha.size()*dt_nbytes); triton::driver::buffer* db = triton::driver::buffer::create(context, hb.size()*dt_nbytes); @@ -48,7 +49,7 @@ perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int stream->synchronize(); triton::dnn::dot dot(M, N, K, AT, BT, ty, ty, 8, 8, 8); // benchmark triton - double triton_ns = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::FULL_TUNING);}, stream); + double triton_ns = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::NO_TUNING);}, stream); // benchmark cublas // NumericT alpha = 1; // NumericT beta = 0; @@ -73,10 +74,10 @@ perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int // test stream->read(dc, true, 0, hc); - std::vector rc(hc.size()); + std::vector rc(hc.size()); dot.cpu_ref(rc, ha, hb); for(size_t i = 0; i < M*N; i++) - if(!std::isnan(hc[i]) && std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ + if(std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; exit(EXIT_FAILURE); } @@ -111,7 +112,7 @@ int main() { std::vector configs = { // {false, false, 8192, 512, 512}, // {false, true, 8192, 8192, 8192} - {true, true, 128, 128, 128}, + {false, true, 128, 128, 128}, // {false, true, 32768, 256, 512} // {true, false, 8192, 512, 512}, // {true, true, 8192, 512, 512} diff --git a/include/triton/dnn/dot.h b/include/triton/dnn/dot.h index c655d12b5..2beeede7b 100644 --- a/include/triton/dnn/dot.h +++ b/include/triton/dnn/dot.h @@ -42,9 +42,9 @@ public: size_t M, size_t N, size_t K){ for(size_t m = 0; m < M; m++) for(size_t n = 0; n < N; n++){ - T acc = 0; + T acc = static_cast((double)0); for(size_t k = 0; k < K; k++) - acc += (AT?a[k + m*K]:a[m + k*M]) * (BT?b[n + k*N]:b[k + n*K]); + acc = acc + (AT?a[k + m*K]:a[m + k*M]) * (BT?b[n + k*N]:b[k + n*K]); c[m + n*M] = acc; } } diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index fffec7794..ae227b135 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -73,7 +73,6 @@ public: optimize_dot.run(module); optimize_trans.run(module); optimize_dce.run(module); -// ir::print(module, std::cout); } void target_dependent(ir::module &module) { diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index bc4c7118d..b05f7e79e 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -194,7 +194,6 @@ std::vector tune::get_params(ir::module &mod) { for(ir::instruction *i : block->get_inst_list()) for(auto &x: params_[i]) if(seen.insert(x.second).second && !x.second->has_value()){ -// std::cout << i->get_name() << " " << x.first << std::endl; result.push_back(x.second); 
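// editorial note on get_params, based only on the loop above: `seen` dedups
// metaparameter pointers because instructions in the same connected component
// of the layout-constraint graph share a single metaparameter object, and the
// has_value() filter drops parameters that are already fixed, so the autotuner
// only enumerates the remaining free tuning knobs.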
} @@ -291,28 +290,29 @@ void tune::run(ir::module &mod) { } // initialize grids + +// for(ir::instruction *i: grids_){ +// auto shapes = i->get_type()->get_tile_shapes(); +// for(size_t k = 0; k < shapes.size(); k++) +// if(shapes[k]->get_value() == 1) { +// if(fragments_.at({i, k}) == STRIDED_SCAN){ +// params_.at(i).at("nts.d" + std::to_string(k))->set_value(1); +// params_.at(i).at("mts.d" + std::to_string(k))->set_value(1); +// } +// if(fragments_.at({i, k}) == HMMA_FRAGMENT_C){ +// params_.at(i).at("fpw.d" + std::to_string(k))->set_value(1); +// params_.at(i).at("wpt.d" + std::to_string(k))->set_value(1); +// } +// } +// } +} + +void tune::init(ir::module &mod) { for(ir::function *fn: mod.get_function_list()){ std::map references; create_grids(grids_, references, fn); } - for(ir::instruction *i: grids_){ - auto shapes = i->get_type()->get_tile_shapes(); - for(size_t k = 0; k < shapes.size(); k++) - if(shapes[k]->get_value() == 1) { - if(fragments_.at({i, k}) == STRIDED_SCAN){ - params_.at(i).at("nts.d" + std::to_string(k))->set_value(1); - params_.at(i).at("mts.d" + std::to_string(k))->set_value(1); - } - if(fragments_.at({i, k}) == HMMA_FRAGMENT_C){ - params_.at(i).at("fpw.d" + std::to_string(k))->set_value(1); - params_.at(i).at("wpt.d" + std::to_string(k))->set_value(1); - } - } - } -} - -void tune::init(ir::module &mod) { num_threads_ = get_req_num_threads(grids_.front()); } @@ -407,7 +407,9 @@ bool tune::check_constraints(std::map> &er else { ir::metaparameter *fpw = params_[i]["fpw.d" + strk]; ir::metaparameter *wpt = params_[i]["wpt.d" + strk]; - multiple = fpw->get_value()*wpt->get_value()*8; + multiple = fpw->get_value()*wpt->get_value(); + if(k < 2) + multiple *= 8; } if(shapes[k]->get_value() % multiple != 0) errors[i].push_back("for dim " + strk + ": shape (" + to_string(shapes[k]->get_value()) + ")" diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index 1ad741240..8c482b0b6 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -62,11 +62,11 @@ std::pair base::get_profile_impl(driver::stream *stream, std::v jit->add_module(name_.c_str(), src.c_str(), best.params); } else{ -// params_t params = heuristics(); + params_t params = heuristics(); // params_t params = jit->get_valid(name_.c_str(), src.c_str()); // params_t params = {4, 1, 32, 4, 1, 32, 4, 4, 4, 1, 1, 16, 32, 16, 4, 4, 4, 4, 1}; //NT // params_t params = {4, 1, 32, 4, 32, 4, 4, 4, 1, 1, 16, 32, 16, 1, 4, 4, 4, 4, 4, 1}; //NN - params_t params = {4, 32, 4, 1, 32, 4, 4, 4, 1, 1, 16, 1, 32, 16, 4, 4, 4, 4, 4, 1}; // TT +// params_t params = {4, 32, 4, 1, 32, 4, 4, 4, 1, 1, 16, 1, 32, 16, 4, 4, 4, 4, 4, 1}; // TT jit->add_module(name_.c_str(), src.c_str(), params); } triton::driver::kernel* kernel = jit->get_function(name_.c_str()); diff --git a/lib/dnn/dot.cpp b/lib/dnn/dot.cpp index 83798921a..65395695c 100644 --- a/lib/dnn/dot.cpp +++ b/lib/dnn/dot.cpp @@ -74,8 +74,8 @@ void dot::enqueue_impl(driver::stream *stream, driver::kernel *kernel, void dot::triton_c_src(std::ostream &os) const { std::string AS0 = "TM", AS1 = "TK"; std::string BS0 = "TK", BS1 = "TN"; - std::string XAS0 = "TM", XAS1 = "TK/1", XAS2 = "1"; - std::string XBS0 = "TK/1", XBS1 = "1", XBS2 = "TN"; + std::string XAS0 = "TM", XAS1 = "TK", XAS2 = "1"; + std::string XBS0 = "TK", XBS1 = "1", XBS2 = "TN"; std::string bca0 = "[newaxis, :]", bca1 = "[:, newaxis]"; std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; std::string lda0 = "*lda", lda1 = ""; @@ -105,11 +105,12 @@ void dot::triton_c_src(std::ostream &os) const { std::string align_ldb_str 
= "multiple_of(" + std::to_string(align_ldb_) + ")"; std::string res = R"( -const tunable int TM = {32}; -const tunable int TN = {32}; +const tunable int TM = {16, 32, 64, 128}; +const tunable int TN = {16, 32, 64, 128}; const tunable int TK = {32}; const tunable int GZ = {1}; + void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, restrict read_only align(16) )" + b_ty_ + R"( *B, restrict read_only align(16) float *C, diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index 1f6a60ccd..86102a460 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -37,13 +37,13 @@ void parallel_loop_nest(std::vector const & ranges, size_t D = ranges.size(); std::vector values(D, 0); // thread pools - ThreadPool pool(nthreads); +// ThreadPool pool(nthreads); // Start with innermost loop size_t i = D - 1; while(true){ // Execute function - pool.enqueue(f,values); -// f(values); +// pool.enqueue(f,values); + f(values); while(values[i]++ == ranges[i] - 1){ if(i == 0) return; From 6c39cdbace56909199e521a4eabccb5b409ab239 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 6 Aug 2019 16:48:53 -0700 Subject: [PATCH 282/494] making sure changes didn't break HMMA --- lib/dnn/dot.cpp | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/lib/dnn/dot.cpp b/lib/dnn/dot.cpp index 65395695c..3b9a2e300 100644 --- a/lib/dnn/dot.cpp +++ b/lib/dnn/dot.cpp @@ -74,33 +74,24 @@ void dot::enqueue_impl(driver::stream *stream, driver::kernel *kernel, void dot::triton_c_src(std::ostream &os) const { std::string AS0 = "TM", AS1 = "TK"; std::string BS0 = "TK", BS1 = "TN"; - std::string XAS0 = "TM", XAS1 = "TK", XAS2 = "1"; - std::string XBS0 = "TK", XBS1 = "1", XBS2 = "TN"; std::string bca0 = "[newaxis, :]", bca1 = "[:, newaxis]"; std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; std::string lda0 = "*lda", lda1 = ""; std::string ldb0 = "", ldb1 = "*ldb"; - std::string usea = AT_ ? "trans(xa, 0, 2, 1)" : "xa"; - std::string useb = BT_ ? "trans(xb, 1, 0, 2)" : "trans(xb, 0, 2, 1)"; + std::string usea = AT_ ? "trans(a)" : "a"; + std::string useb = BT_ ? "trans(b)" : "b"; if(AT_){ std::swap(AS0, AS1); - std::swap(XAS0, XAS1); - std::swap(XAS1, XAS2); std::swap(bca0, bca1); std::swap(lda0, lda1); } if(BT_){ std::swap(BS0, BS1); - std::swap(XBS1, XBS2); - std::swap(XBS0, XBS1); std::swap(bcb0, bcb1); std::swap(ldb0, ldb1); } std::string AS = AS0 + ", " + AS1; std::string BS = BS0 + ", " + BS1; - std::string XAS = XAS0 + ", " + XAS1 + ", " + XAS2; - std::string XBS = XBS0 + ", " + XBS1 + ", " + XBS2; - std::string XCS = "TM, TN, 1"; std::string align_lda_str = "multiple_of(" + std::to_string(align_lda_) + ")"; std::string align_ldb_str = "multiple_of(" + std::to_string(align_ldb_) + ")"; std::string res = @@ -110,7 +101,6 @@ const tunable int TN = {16, 32, 64, 128}; const tunable int TK = {32}; const tunable int GZ = {1}; - void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, restrict read_only align(16) )" + b_ty_ + R"( *B, restrict read_only align(16) float *C, @@ -123,7 +113,7 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, int ryb[TN] = ridy * TN + (0 ... TN); int rka[TK] = 0 ... TK; int rkb[TK] = 0 ... 
TK; - float xc[)" + XCS + R"(] = 0; + float c[TM, TN] = 0; )" + a_ty_ + R"(* pa[)" + AS + "] = A + rka" + bca0 + lda0 + " + rxa" + bca1 + lda1 + R"(; )" + b_ty_ + R"(* pb[)" + BS + "] = B + rkb" + bcb0 + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; bool checka[)" + AS + R"(] = (rka < K))" + bca0 + " && (rxa < M)" + bca1 + R"(; @@ -131,9 +121,7 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, )" + a_ty_ + R"( a[)" + AS + R"(] = checka ? *pa : 0; )" + b_ty_ + R"( b[)" + BS + R"(] = checkb ? *pb : 0; for(int k = K; k > 0; k = k - TK){ - )" + a_ty_ + R"( xa[)" + XAS + "] = __reshape(a, " + XAS + R"(); - )" + b_ty_ + R"( xb[)" + XBS + "] = __reshape(b, " + XBS + R"(); - xc = dot()" + usea + ", " + useb + R"(, xc); + c = dot()" + usea + ", " + useb + R"(, c); pa = pa + TK)" + lda0 + R"(; pb = pb + TK)" + ldb0 + R"(; bool checka[)" + AS + R"(] = k > TK; @@ -143,13 +131,14 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, } int rxc[TM] = ridx * TM + (0 ... TM); int ryc[TN] = ridy * TN + (0 ... TN); + bool checkc0[TM] = rxc < M; + bool checkc1[TN] = ryc < N; + bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; float* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - float c[TM, TN] = __sum(xc, 2); - *pc = c; + @checkc *pc = c; } )"; -// std::cout << res << std::endl; os << res; } From 0e201e18ff90f4ce937e0777e9883da35b497ec3 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 6 Aug 2019 17:14:16 -0700 Subject: [PATCH 283/494] fixed simple FP16 test --- examples/cpp/dot.cpp | 4 ++-- examples/python/tensorflow/dot.cpp | 4 ++-- examples/python/tensorflow/run.py | 2 +- include/triton/dnn/dot.h | 9 ++++---- lib/dnn/dot.cpp | 35 ++++++++++++++++++++---------- 5 files changed, 33 insertions(+), 21 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 3f04d01ad..6d4c728a8 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -47,7 +47,7 @@ perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int stream->write(db, true, 0, hb); stream->write(dc, true, 0, hc); stream->synchronize(); - triton::dnn::dot dot(M, N, K, AT, BT, ty, ty, 8, 8, 8); + triton::dnn::dot dot(M, N, K, AT, BT, ty, ty, ty, 8, 8, 8); // benchmark triton double triton_ns = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::NO_TUNING);}, stream); // benchmark cublas @@ -77,7 +77,7 @@ perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int std::vector rc(hc.size()); dot.cpu_ref(rc, ha, hb); for(size_t i = 0; i < M*N; i++) - if(std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ + if(std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-2){ std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; exit(EXIT_FAILURE); } diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp index bdcb5c62c..453bb87cb 100644 --- a/examples/python/tensorflow/dot.cpp +++ b/examples/python/tensorflow/dot.cpp @@ -49,8 +49,8 @@ class DotOp : public OpKernel { triton::driver::cu_buffer db(ctx, b.tensor_data().size(), (CUdeviceptr)b.tensor_data().data(), false); triton::driver::cu_buffer dc(ctx, c->tensor_data().size(), (CUdeviceptr)c->tensor_data().data(), false); // template - triton::dnn::dot dot(M, N, K, false, true, "half", "half", 8, 8, 8); - dot.enqueue(stream, {&da, &db, &dc}); + triton::dnn::dot dot(M, N, K, false, false, "half", "half", "float", 8, 8, 8); + dot.enqueue(stream, {&da, &db, &dc}, 
triton::dnn::autotuning_t::NO_TUNING); } private: diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py index 4b1f7ac53..ffdde3f76 100644 --- a/examples/python/tensorflow/run.py +++ b/examples/python/tensorflow/run.py @@ -23,7 +23,7 @@ def run_dot(): result = sess.run([c], feed_dict = {a: ha, b: hb})[0] # Test - hresult = np.dot(ha.T, hb).T + hresult = np.dot(ha.T, hb.T).T dif = np.abs(result - hresult) np.savetxt('dif.dat', dif, '%2.4f') print(hresult) diff --git a/include/triton/dnn/dot.h b/include/triton/dnn/dot.h index 2beeede7b..f36d05db5 100644 --- a/include/triton/dnn/dot.h +++ b/include/triton/dnn/dot.h @@ -24,7 +24,7 @@ private: public: dot(int M, int N, int K, bool AT, bool BT, - std::string a_ty, std::string b_ty, + std::string a_ty, std::string b_ty, std::string c_ty, unsigned align_lda, unsigned align_ldb, unsigned align_ldc); // number of flops @@ -42,10 +42,10 @@ public: size_t M, size_t N, size_t K){ for(size_t m = 0; m < M; m++) for(size_t n = 0; n < N; n++){ - T acc = static_cast((double)0); + float acc = 0; for(size_t k = 0; k < K; k++) - acc = acc + (AT?a[k + m*K]:a[m + k*M]) * (BT?b[n + k*N]:b[k + n*K]); - c[m + n*M] = acc; + acc = acc + (AT ? a[k + m*K] : a[m + k*M]) * (BT ? b[n + k*N] : b[k + n*K]); + c[m + n*M] = static_cast(acc); } } template @@ -68,6 +68,7 @@ private: bool BT_; std::string a_ty_; std::string b_ty_; + std::string c_ty_; unsigned align_lda_; unsigned align_ldb_; unsigned align_ldc_; diff --git a/lib/dnn/dot.cpp b/lib/dnn/dot.cpp index 3b9a2e300..ddad107f0 100644 --- a/lib/dnn/dot.cpp +++ b/lib/dnn/dot.cpp @@ -9,11 +9,11 @@ namespace dnn{ dot::dot(int M, int N, int K, bool AT, bool BT, - std::string a_ty, std::string b_ty, + std::string a_ty, std::string b_ty, std::string c_ty, unsigned align_lda, unsigned align_ldb, unsigned align_ldc) : base("matmul"), M_(M), N_(N), K_(K), AT_(AT), BT_(BT), - a_ty_(a_ty), b_ty_(b_ty), + a_ty_(a_ty), b_ty_(b_ty), c_ty_(c_ty), align_lda_(align_lda), align_ldb_(align_ldb), align_ldc_(align_ldc), locks_(nullptr) { @@ -74,24 +74,33 @@ void dot::enqueue_impl(driver::stream *stream, driver::kernel *kernel, void dot::triton_c_src(std::ostream &os) const { std::string AS0 = "TM", AS1 = "TK"; std::string BS0 = "TK", BS1 = "TN"; + std::string XAS0 = "TM", XAS1 = "TK", XAS2 = "1"; + std::string XBS0 = "TK", XBS1 = "1", XBS2 = "TN"; std::string bca0 = "[newaxis, :]", bca1 = "[:, newaxis]"; std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; std::string lda0 = "*lda", lda1 = ""; std::string ldb0 = "", ldb1 = "*ldb"; - std::string usea = AT_ ? "trans(a)" : "a"; - std::string useb = BT_ ? "trans(b)" : "b"; + std::string usea = AT_ ? "trans(xa, 0, 2, 1)" : "xa"; + std::string useb = BT_ ? 
"trans(xb, 1, 0, 2)" : "trans(xb, 0, 2, 1)"; if(AT_){ std::swap(AS0, AS1); + std::swap(XAS0, XAS1); + std::swap(XAS1, XAS2); std::swap(bca0, bca1); std::swap(lda0, lda1); } if(BT_){ std::swap(BS0, BS1); + std::swap(XBS1, XBS2); + std::swap(XBS0, XBS1); std::swap(bcb0, bcb1); std::swap(ldb0, ldb1); } std::string AS = AS0 + ", " + AS1; std::string BS = BS0 + ", " + BS1; + std::string XAS = XAS0 + ", " + XAS1 + ", " + XAS2; + std::string XBS = XBS0 + ", " + XBS1 + ", " + XBS2; + std::string XCS = "TM, TN, 1"; std::string align_lda_str = "multiple_of(" + std::to_string(align_lda_) + ")"; std::string align_ldb_str = "multiple_of(" + std::to_string(align_ldb_) + ")"; std::string res = @@ -101,9 +110,10 @@ const tunable int TN = {16, 32, 64, 128}; const tunable int TK = {32}; const tunable int GZ = {1}; + void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, restrict read_only align(16) )" + b_ty_ + R"( *B, - restrict read_only align(16) float *C, + restrict read_only align(16) )" + c_ty_ + R"( *C, int M, int N, int K, )" + align_lda_str + R"( int lda, )" + align_ldb_str + R"(" int ldb, int ldc, int bound, int *locks, int grid0, int grid1) { @@ -113,7 +123,7 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, int ryb[TN] = ridy * TN + (0 ... TN); int rka[TK] = 0 ... TK; int rkb[TK] = 0 ... TK; - float c[TM, TN] = 0; + float xc[)" + XCS + R"(] = 0; )" + a_ty_ + R"(* pa[)" + AS + "] = A + rka" + bca0 + lda0 + " + rxa" + bca1 + lda1 + R"(; )" + b_ty_ + R"(* pb[)" + BS + "] = B + rkb" + bcb0 + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; bool checka[)" + AS + R"(] = (rka < K))" + bca0 + " && (rxa < M)" + bca1 + R"(; @@ -121,7 +131,9 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, )" + a_ty_ + R"( a[)" + AS + R"(] = checka ? *pa : 0; )" + b_ty_ + R"( b[)" + BS + R"(] = checkb ? *pb : 0; for(int k = K; k > 0; k = k - TK){ - c = dot()" + usea + ", " + useb + R"(, c); + )" + a_ty_ + R"( xa[)" + XAS + "] = __reshape(a, " + XAS + R"(); + )" + b_ty_ + R"( xb[)" + XBS + "] = __reshape(b, " + XBS + R"(); + xc = dot()" + usea + ", " + useb + R"(, xc); pa = pa + TK)" + lda0 + R"(; pb = pb + TK)" + ldb0 + R"(; bool checka[)" + AS + R"(] = k > TK; @@ -131,14 +143,13 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, } int rxc[TM] = ridx * TM + (0 ... TM); int ryc[TN] = ridy * TN + (0 ... 
TN); - bool checkc0[TM] = rxc < M; - bool checkc1[TN] = ryc < N; - bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - float* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - @checkc *pc = c; + )" + c_ty_ + R"(* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; + )" + c_ty_ + R"( c[TM, TN] = __sum(xc, 2); + *pc = c; } )"; +// std::cout << res << std::endl; os << res; } From 46e9863ebeadd9c2e8e3b9c611156b47197cc23a Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 6 Aug 2019 17:19:13 -0700 Subject: [PATCH 284/494] better fp16 support for dot --- examples/cpp/dot.cpp | 10 +++++++--- lib/dnn/base.cpp | 4 ++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 6d4c728a8..0a64729e9 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -26,8 +26,8 @@ struct perf_t { perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K){ - typedef half NumericT; - std::string ty = "half"; + typedef float NumericT; + std::string ty = "float"; size_t dt_nbytes = sizeof(NumericT); triton::driver::context* context = stream->context(); std::vector hc(M*N); @@ -112,7 +112,11 @@ int main() { std::vector configs = { // {false, false, 8192, 512, 512}, // {false, true, 8192, 8192, 8192} - {false, true, 128, 128, 128}, +// {false, true, 128, 128, 128}, +// {false, false, 128, 128, 128}, +// {true, false, 128, 128, 128}, + {true, true, 128, 128, 128} + // {false, true, 32768, 256, 512} // {true, false, 8192, 512, 512}, // {true, true, 8192, 512, 512} diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index 8c482b0b6..394084395 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -62,11 +62,11 @@ std::pair base::get_profile_impl(driver::stream *stream, std::v jit->add_module(name_.c_str(), src.c_str(), best.params); } else{ - params_t params = heuristics(); +// params_t params = heuristics(); // params_t params = jit->get_valid(name_.c_str(), src.c_str()); // params_t params = {4, 1, 32, 4, 1, 32, 4, 4, 4, 1, 1, 16, 32, 16, 4, 4, 4, 4, 1}; //NT // params_t params = {4, 1, 32, 4, 32, 4, 4, 4, 1, 1, 16, 32, 16, 1, 4, 4, 4, 4, 4, 1}; //NN -// params_t params = {4, 32, 4, 1, 32, 4, 4, 4, 1, 1, 16, 1, 32, 16, 4, 4, 4, 4, 4, 1}; // TT + params_t params = {4, 16, 4, 2, 16, 4, 8, 2, 2, 8, 2, 32, 8, 1}; // TT jit->add_module(name_.c_str(), src.c_str(), params); } triton::driver::kernel* kernel = jit->get_function(name_.c_str()); From 494bfa7671961008c266b01754c1b4aac9f85cc2 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 6 Aug 2019 17:34:00 -0700 Subject: [PATCH 285/494] didn't break correctness of existing HMMA --- examples/cpp/dot.cpp | 10 +++++----- lib/codegen/selection.cpp | 1 + lib/dnn/base.cpp | 4 ++-- lib/dnn/dot.cpp | 3 +-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 0a64729e9..9e883c792 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -26,8 +26,8 @@ struct perf_t { perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K){ - typedef float NumericT; - std::string ty = "float"; + typedef half NumericT; + std::string ty = "half"; size_t dt_nbytes = sizeof(NumericT); triton::driver::context* context = stream->context(); std::vector hc(M*N); @@ -112,9 +112,9 @@ int main() { std::vector configs = { // {false, false, 8192, 512, 512}, // {false, true, 8192, 8192, 8192} -// {false, true, 128, 128, 128}, -// {false, false, 128, 128, 128}, -// {true, 
false, 128, 128, 128}, + {false, true, 128, 128, 128}, + {false, false, 128, 128, 128}, + {true, false, 128, 128, 128}, {true, true, 128, 128, 128} // {false, true, 32768, 256, 512} diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 8ec454842..1b93765db 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -999,6 +999,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & auto A_shapes = A->get_type()->get_tile_shapes(); size_t red_axis = dot->is_a_trans() ? 0 : 1; unsigned NK = A_shapes[red_axis]->get_value(); +// std::cout << red_axis << " " << NK << std::endl; if(NK != 1) { shared_tile *TA = (shared_tile*)tmap_.at(A); diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index 394084395..6293139e2 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -62,11 +62,11 @@ std::pair base::get_profile_impl(driver::stream *stream, std::v jit->add_module(name_.c_str(), src.c_str(), best.params); } else{ -// params_t params = heuristics(); + params_t params = heuristics(); // params_t params = jit->get_valid(name_.c_str(), src.c_str()); // params_t params = {4, 1, 32, 4, 1, 32, 4, 4, 4, 1, 1, 16, 32, 16, 4, 4, 4, 4, 1}; //NT // params_t params = {4, 1, 32, 4, 32, 4, 4, 4, 1, 1, 16, 32, 16, 1, 4, 4, 4, 4, 4, 1}; //NN - params_t params = {4, 16, 4, 2, 16, 4, 8, 2, 2, 8, 2, 32, 8, 1}; // TT +// params_t params = {4, 16, 4, 2, 16, 4, 8, 2, 2, 8, 2, 32, 8, 1}; // TT jit->add_module(name_.c_str(), src.c_str(), params); } triton::driver::kernel* kernel = jit->get_function(name_.c_str()); diff --git a/lib/dnn/dot.cpp b/lib/dnn/dot.cpp index ddad107f0..d310ad5fb 100644 --- a/lib/dnn/dot.cpp +++ b/lib/dnn/dot.cpp @@ -80,7 +80,7 @@ void dot::triton_c_src(std::ostream &os) const { std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; std::string lda0 = "*lda", lda1 = ""; std::string ldb0 = "", ldb1 = "*ldb"; - std::string usea = AT_ ? "trans(xa, 0, 2, 1)" : "xa"; + std::string usea = AT_ ? "trans(xa, 2, 0, 1)" : "xa"; std::string useb = BT_ ? 
"trans(xb, 1, 0, 2)" : "trans(xb, 0, 2, 1)"; if(AT_){ std::swap(AS0, AS1); @@ -149,7 +149,6 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, } )"; -// std::cout << res << std::endl; os << res; } From 7b75b68edc0f8c9fa2c2e59e7622071ebf4e0d5d Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 6 Aug 2019 21:07:13 -0700 Subject: [PATCH 286/494] dirty but working warp-splitting --- examples/cpp/dot.cpp | 6 +- include/triton/codegen/selection.h | 4 +- include/triton/runtime/jit.h | 4 +- lib/codegen/optimize_dce.cpp | 2 +- lib/codegen/optimize_dot.cpp | 18 +++++ lib/codegen/selection.cpp | 122 +++++++++++++++++------------ lib/codegen/shmem_allocation.cpp | 12 ++- lib/codegen/tune.cpp | 2 + lib/codegen/vectorize.cpp | 13 ++- lib/dnn/base.cpp | 4 +- lib/dnn/dot.cpp | 7 +- lib/driver/module.cpp | 1 - lib/runtime/jit.cpp | 6 +- 13 files changed, 132 insertions(+), 69 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 9e883c792..231e98f88 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -113,9 +113,9 @@ int main() { // {false, false, 8192, 512, 512}, // {false, true, 8192, 8192, 8192} {false, true, 128, 128, 128}, - {false, false, 128, 128, 128}, - {true, false, 128, 128, 128}, - {true, true, 128, 128, 128} +// {false, false, 128, 128, 128}, +// {true, false, 128, 128, 128}, +// {true, true, 128, 128, 128} // {false, true, 32768, 256, 512} // {true, false, 8192, 512, 512}, diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index b480da5f0..654faf95b 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -153,8 +153,8 @@ private: alignment_info *axis_info_; std::map axes_; llvm::Value *sh_mem_ptr_; - llvm::Value *offset_a_i_, *offset_a_k_; - llvm::Value *offset_b_j_, *offset_b_k_; + llvm::Value *offset_a_i_, *offset_a_k_, *offset_a_z_; + llvm::Value *offset_b_j_, *offset_b_k_, *offset_b_z_; unsigned num_packs_0_, num_packs_1_; unsigned pack_size_0_, pack_size_1_; }; diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index ae227b135..8758cb78d 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -77,7 +77,7 @@ public: void target_dependent(ir::module &module) { alignment_info.run(module); -// reassociate.run(module); + reassociate.run(module); if(target_->is_gpu()){ shmem_info.run(module); shmem_liveness.run(module); @@ -86,7 +86,7 @@ public: } vectorize.run(module); optimize_dce.run(module); -// ir::print(module, std::cout); + ir::print(module, std::cout); } codegen::tune tune; diff --git a/lib/codegen/optimize_dce.cpp b/lib/codegen/optimize_dce.cpp index ec42729ec..8caf22a62 100644 --- a/lib/codegen/optimize_dce.cpp +++ b/lib/codegen/optimize_dce.cpp @@ -18,7 +18,7 @@ void optimize_dce::run(ir::module &mod) { // iterate through blocks for(ir::basic_block *block: rpo) for(ir::instruction *i: block->get_inst_list()){ - if(dynamic_cast(i) || dynamic_cast(i) || dynamic_cast(i) + if(dynamic_cast(i) || dynamic_cast(i) || dynamic_cast(i) || dynamic_cast(i) || dynamic_cast(i) || dynamic_cast(i) || dynamic_cast(i) || dynamic_cast(i)){ diff --git a/lib/codegen/optimize_dot.cpp b/lib/codegen/optimize_dot.cpp index 904b1c6c3..9e2cc6b9e 100644 --- a/lib/codegen/optimize_dot.cpp +++ b/lib/codegen/optimize_dot.cpp @@ -57,9 +57,27 @@ void optimize_dot::run(ir::module &mod) { if(trans_a){ AA = ((ir::trans_inst*)A)->get_operand(0); } + else{ + if(auto *T = dynamic_cast(A)){ + std::vector perm(T->get_perm()); + std::swap(perm[0], perm[1]); 
+ AA = builder.create_trans(T->get_operand(0), perm); + T->replace_all_uses_with(AA); + trans_a = true; + } + } if(trans_b){ BB = ((ir::trans_inst*)B)->get_operand(0); } + else{ + if(auto *T = dynamic_cast(A)){ + std::vector perm(T->get_perm()); + std::swap(perm[0], perm[1]); + AA = builder.create_trans(T->get_operand(0), perm); + T->replace_all_uses_with(AA); + trans_a = true; + } + } ir::instruction *dot_atbt = builder.insert(ir::dot_inst::create(AA, BB, D, trans_a, trans_b)); dot->replace_all_uses_with(dot_atbt); } diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 1b93765db..89fedab7f 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -138,8 +138,9 @@ Value* shared_tile::shared_offset(llvm::IRBuilder<> &builder, const shapes_t& sh Value *ld = builder.getInt32(shapes[0]); for(size_t i = 1; i < idx.size(); i++) { result = builder.CreateAdd(result, builder.CreateMul(idx[i], ld)); - if(i < idx.size() - 1) + if(i < idx.size() - 1){ ld = builder.CreateMul(ld, builder.getInt32(shapes[i])); + } } return result; } @@ -525,18 +526,23 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id // fragments per warp unsigned fpw_0 = params_->get_param(v, "fpw.d0")->get_value(); unsigned fpw_1 = params_->get_param(v, "fpw.d1")->get_value(); + unsigned fpw_2 = params_->get_param(v, "fpw.d2")->get_value(); // warps per tile unsigned wpt_0 = params_->get_param(v, "wpt.d0")->get_value(); unsigned wpt_1 = params_->get_param(v, "wpt.d1")->get_value(); + unsigned wpt_2 = params_->get_param(v, "wpt.d2")->get_value(); // hmma warp tile size unsigned hmma_wts_0 = fpw_0 * 8; unsigned hmma_wts_1 = fpw_1 * 8; + unsigned hmma_wts_2 = 1; // hmma block tile size unsigned hmma_bts_0 = hmma_wts_0 * wpt_0; unsigned hmma_bts_1 = hmma_wts_1 * wpt_1; + unsigned hmma_bts_2 = hmma_wts_2 * wpt_2; // number of repetition unsigned num_rep_0 = shapes[0]->get_value() / hmma_bts_0; unsigned num_rep_1 = shapes[1]->get_value() / hmma_bts_1; + unsigned num_rep_2 = shapes[2]->get_value() / hmma_bts_2; // size of each pack (interleaving) pack_size_0_ = std::min(num_rep_0, 1); pack_size_1_ = std::min(num_rep_1, 1); @@ -563,7 +569,9 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id /* inter warp offset */ Value *warp_id_0 = builder.CreateURem(u_warp_id, builder.getInt32(wpt_0)); - Value *warp_id_1 = builder.CreateUDiv(u_warp_id, builder.getInt32(wpt_0)); + Value *warp_id_12 = builder.CreateUDiv(u_warp_id, builder.getInt32(wpt_0)); + Value *warp_id_1 = builder.CreateURem(warp_id_12, builder.getInt32(wpt_1)); + Value *warp_id_2 = builder.CreateUDiv(warp_id_12, builder.getInt32(wpt_1)); Value *warp_offset_i = builder.CreateMul(warp_id_0, builder.getInt32(hmma_wts_0 * pack_size_0_)); Value *warp_offset_j = builder.CreateMul(warp_id_1, builder.getInt32(hmma_wts_1 * pack_size_1_)); @@ -571,9 +579,11 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id // a offset offset_a_i_ = builder.CreateAdd(warp_offset_i, builder.CreateAdd(pair_a_off, in_pair_off_a)); offset_a_k_ = builder.CreateAnd(u_thread_id, _3); + offset_a_z_ = warp_id_2; // b offsets offset_b_j_ = builder.CreateAdd(warp_offset_j, builder.CreateAdd(pair_b_off, in_pair_off_b)); offset_b_k_ = builder.CreateAnd(u_thread_id, _3); + offset_b_z_ = warp_id_2; // c offsets @@ -598,10 +608,16 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id idx_j.push_back(builder.CreateAdd(offset_c_j, 
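// The column indices being built here are offset by the per-warp
// offsets computed above, where the linear warp id is decomposed onto a
// (wpt_0, wpt_1, wpt_2) grid. As plain scalar arithmetic the
// decomposition is:
//   unsigned warp_id_0  = u_warp_id % wpt_0;
//   unsigned warp_id_12 = u_warp_id / wpt_0;
//   unsigned warp_id_1  = warp_id_12 % wpt_1;
//   unsigned warp_id_2  = warp_id_12 / wpt_1;
// which is exactly what the CreateURem/CreateUDiv calls above express
// in IR form.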
builder.getInt32(pack*hmma_bts_1*pack_size_1_ + jj*4 + j*4*fpw_1*pack_size_1_))); idx_j.push_back(builder.CreateAdd(offset_c_j, builder.getInt32(pack*hmma_bts_1*pack_size_1_ + jj*4 + j*4*fpw_1*pack_size_1_ + 1))); } + // z indices + std::vector idx_z; + for(unsigned pack = 0; pack < num_rep_2; pack++) + idx_z.push_back(builder.CreateAdd(warp_id_2, builder.getInt32(pack*hmma_bts_2))); + /* axes */ - axes_[params_->get_param_group(v, 0)] = distributed_axis{1, idx_i}; - axes_[params_->get_param_group(v, 1)] = distributed_axis{1, idx_j}; + axes_[params_->get_param_group(v, 0)] = distributed_axis{1, idx_i, warp_id_0}; + axes_[params_->get_param_group(v, 1)] = distributed_axis{1, idx_j, warp_id_1}; + axes_[params_->get_param_group(v, 2)] = distributed_axis{1, idx_z, warp_id_2}; } } @@ -851,7 +867,6 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & unsigned addr_space = sh_mem_ptr_->getType()->getPointerAddressSpace(); Type *res_ty = builder.getFloatTy(); Value *base_ptr = builder.CreateBitCast(sh_mem_ptr_, PointerType::get(res_ty, addr_space)); - unsigned depth = params_->get_param(op, "mts.d" + std::to_string(axis))->get_value(); for(auto& x: partial) { // current element being computed Value *lane = axes_.at(params_->get_param_group(op, axis)).thread_id; @@ -867,6 +882,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & tgt_->add_barrier(module, builder); builder.CreateStore(result, write_ptr); // build result + unsigned depth = params_->get_param(op, "wpt.d" + std::to_string(axis))->get_value(); for(unsigned i = depth/2; i > 0; i >>= 1){ // current indices indices_t current(write_idx.size(), builder.getInt32(0)); @@ -999,6 +1015,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & auto A_shapes = A->get_type()->get_tile_shapes(); size_t red_axis = dot->is_a_trans() ? 
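// Reduction-axis selection: a non-transposed A tile is stored [TM, TK],
// so the K axis is axis 1; a transposed A is stored [TK, TM] and the K
// axis becomes axis 0. NK, read off A's shape just below, is the K
// extent that the HMMA loop then steps through.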
0 : 1; unsigned NK = A_shapes[red_axis]->get_value(); + // std::cout << red_axis << " " << NK << std::endl; if(NK != 1) { @@ -1042,10 +1059,10 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & TA->set_return_mode(true); TB->set_return_mode(true); - std::vector fc; + std::map> fcs; result->for_each([&](indices_t idx){ - fc.push_back(TC->get_value(idx)); + fcs[idx[2]].push_back(TC->get_value(idx)); // fc.push_back(UndefValue::get(TC->get_value(idx)->getType())); }); @@ -1088,53 +1105,62 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & unsigned stride_rep_j = wpt_1 * wts_1; unsigned num_rep_i = shapes[0]->get_value() / stride_rep_i; unsigned ld_fc = num_rep_i * 2; - for(unsigned pack_i = 0; pack_i < num_packs_0_; pack_i++) - for(unsigned pack_j = 0; pack_j < num_packs_1_; pack_j++){ - for(unsigned K = 0; K < NK; K += 4){ - Value *_K = builder.getInt32(K); - Value *current_offset_a_i = builder.CreateAdd(offset_a_i, builder.getInt32(pack_i*stride_rep_i*pack_size_0_)); - Value *current_offset_b_i = builder.CreateAdd(offset_b_j, builder.getInt32(pack_j*stride_rep_j*pack_size_1_)); - indices_t idx_a = {current_offset_a_i, builder.CreateAdd(offset_a_k, _K)}; - indices_t idx_b = {current_offset_b_i, builder.CreateAdd(offset_b_k, _K)}; - if(dot->is_a_trans()) - std::swap(idx_a[0], idx_a[1]); - if(!dot->is_b_trans()) - std::swap(idx_b[0], idx_b[1]); - Value *ha = TA->get_value(idx_a); - Value *hb = TB->get_value(idx_b); - for(unsigned ii = 0; ii < pack_size_0_; ii++) - for(unsigned jj = 0; jj < pack_size_1_; jj++){ - Value *ha0 = builder.CreateBitCast(builder.CreateExtractElement(ha, builder.getInt32(ii*pack_size_0_ + 0)), fp16x2_ty); - Value *ha1 = builder.CreateBitCast(builder.CreateExtractElement(ha, builder.getInt32(ii*pack_size_0_ + 1)), fp16x2_ty); - Value *hb0 = builder.CreateBitCast(builder.CreateExtractElement(hb, builder.getInt32(jj*pack_size_0_ + 0)), fp16x2_ty); - Value *hb1 = builder.CreateBitCast(builder.CreateExtractElement(hb, builder.getInt32(jj*pack_size_0_ + 1)), fp16x2_ty); - std::vector idx = { - (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 0)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 1)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 0)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 1)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 2)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 3)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 2)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 3)*ld_fc - }; - Value *nc = builder.CreateCall(mma_fn, {ha0, ha1, hb0, hb1, fc[idx[0]], fc[idx[1]], fc[idx[2]], fc[idx[3]], fc[idx[4]], fc[idx[5]], fc[idx[6]], fc[idx[7]]}); - fc[idx[0]] = builder.CreateExtractValue(nc, {0}); - fc[idx[1]] = builder.CreateExtractValue(nc, {1}); - fc[idx[2]] = builder.CreateExtractValue(nc, {2}); - fc[idx[3]] = builder.CreateExtractValue(nc, {3}); - fc[idx[4]] = builder.CreateExtractValue(nc, {4}); - fc[idx[5]] = builder.CreateExtractValue(nc, {5}); - fc[idx[6]] = builder.CreateExtractValue(nc, {6}); - fc[idx[7]] = builder.CreateExtractValue(nc, {7}); + + + for(auto& x: fcs){ + std::vector& fc = x.second; + for(unsigned pack_i = 0; pack_i < num_packs_0_; pack_i++) + for(unsigned pack_j = 0; pack_j < num_packs_1_; pack_j++){ + for(unsigned K = 0; K < NK; K += 4){ + 
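// The K += 4 stride matches the four half values consumed per operand
// per HMMA call: ha0/ha1 and hb0/hb1 below are bitcasts of vector
// extracts into fp16x2_ty, i.e. two <2 x half> registers per operand.
// Each call updates eight fp32 accumulators (fc entries), matching the
// fp32_pack8_ty return type declared above; that this corresponds to an
// m8n8k4-style mma is an inference, not stated in the patch.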
Value *_K = builder.getInt32(K); + Value *current_offset_a_i = builder.CreateAdd(offset_a_i, builder.getInt32(pack_i*stride_rep_i*pack_size_0_)); + Value *current_offset_b_i = builder.CreateAdd(offset_b_j, builder.getInt32(pack_j*stride_rep_j*pack_size_1_)); + indices_t idx_a = {current_offset_a_i, builder.CreateAdd(offset_a_k, _K)}; + indices_t idx_b = {current_offset_b_i, builder.CreateAdd(offset_b_k, _K)}; + if(dot->is_a_trans()) + std::swap(idx_a[0], idx_a[1]); + if(!dot->is_b_trans()) + std::swap(idx_b[0], idx_b[1]); + idx_a.push_back(x.first); + idx_b.push_back(x.first); + Value *ha = TA->get_value(idx_a); + Value *hb = TB->get_value(idx_b); + for(unsigned ii = 0; ii < pack_size_0_; ii++) + for(unsigned jj = 0; jj < pack_size_1_; jj++){ + Value *ha0 = builder.CreateBitCast(builder.CreateExtractElement(ha, builder.getInt32(ii*pack_size_0_ + 0)), fp16x2_ty); + Value *ha1 = builder.CreateBitCast(builder.CreateExtractElement(ha, builder.getInt32(ii*pack_size_0_ + 1)), fp16x2_ty); + Value *hb0 = builder.CreateBitCast(builder.CreateExtractElement(hb, builder.getInt32(jj*pack_size_0_ + 0)), fp16x2_ty); + Value *hb1 = builder.CreateBitCast(builder.CreateExtractElement(hb, builder.getInt32(jj*pack_size_0_ + 1)), fp16x2_ty); + std::vector idx = { + (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 0)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 1)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 0)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 1)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 2)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 3)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 2)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 3)*ld_fc + }; + Value *nc = builder.CreateCall(mma_fn, {ha0, ha1, hb0, hb1, fc[idx[0]], fc[idx[1]], fc[idx[2]], fc[idx[3]], fc[idx[4]], fc[idx[5]], fc[idx[6]], fc[idx[7]]}); + fc[idx[0]] = builder.CreateExtractValue(nc, {0}); + fc[idx[1]] = builder.CreateExtractValue(nc, {1}); + fc[idx[2]] = builder.CreateExtractValue(nc, {2}); + fc[idx[3]] = builder.CreateExtractValue(nc, {3}); + fc[idx[4]] = builder.CreateExtractValue(nc, {4}); + fc[idx[5]] = builder.CreateExtractValue(nc, {5}); + fc[idx[6]] = builder.CreateExtractValue(nc, {6}); + fc[idx[7]] = builder.CreateExtractValue(nc, {7}); + } + } } - } } // write back unsigned i = 0; result->for_each([&](indices_t idx){ - result->set_value(idx, fc[i++]); + if(i >= fcs.at(idx[2]).size()) + i = 0; + result->set_value(idx, fcs.at(idx[2])[i++]); }); TA->set_return_mode(false); diff --git a/lib/codegen/shmem_allocation.cpp b/lib/codegen/shmem_allocation.cpp index 6e9bf86ff..042f9a19d 100644 --- a/lib/codegen/shmem_allocation.cpp +++ b/lib/codegen/shmem_allocation.cpp @@ -12,8 +12,10 @@ namespace triton{ namespace codegen{ unsigned shmem_allocation::is_ld_padded(ir::value *x) { - if(dynamic_cast(x)) - return 4; + if(auto *trans = dynamic_cast(x)){ + if(trans->get_perm()[0]->get_value() != 0) + return 4; + } for(ir::user* user: x->get_users()) if(auto dot = dynamic_cast(user)){ bool is_hmma = params_->get_fragment(user, 0) == tune::HMMA_FRAGMENT_C; @@ -51,7 +53,11 @@ unsigned shmem_allocation::get_num_bytes(ir::value *x) { size_t num_elements = 1; for(auto x: shapes) num_elements *= x->get_value(); - size_t depth = params_->get_param(op, "mts.d" + std::to_string(axis))->get_value(); + 
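// Scratch sizing for a tile reduction: one copy of the tile is kept per
// participating unit along the reduced axis, so the function returns
// num_elements * num_bytes * depth. For HMMA fragments the reduction is
// split across warps and depth comes from "wpt.d<axis>"; the strided-
// scan layout splits across threads and uses "mts.d<axis>". E.g. a
// 128x128 fp32 tile reduced with depth 2 would reserve
//   128 * 128 * 4 * 2 = 131072 bytes
// of shared memory (numbers chosen for illustration).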
size_t depth; + if(params_->get_fragment(x, 0) == tune::HMMA_FRAGMENT_C) + depth = params_->get_param(op, "wpt.d" + std::to_string(axis))->get_value(); + else + depth = params_->get_param(op, "mts.d" + std::to_string(axis))->get_value(); return num_elements * num_bytes * depth; } unsigned num_bytes = x->get_type()->get_primitive_size_in_bits() / 8; diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index b05f7e79e..26283b536 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -255,6 +255,8 @@ void tune::run(ir::module &mod) { } else { ir::metaparameter *fpw = ir::metaparameter::create(ctx, ty, 2, 2); + if(node.second == 2) + fpw->set_value(1); ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 1, 4); connected_components(node, {fpw, wpt}, {"fpw", "wpt"}, nodes_, dependencies_, group_id++); } diff --git a/lib/codegen/vectorize.cpp b/lib/codegen/vectorize.cpp index e1319634b..8ec5a99f6 100644 --- a/lib/codegen/vectorize.cpp +++ b/lib/codegen/vectorize.cpp @@ -13,7 +13,17 @@ void vectorize::run(ir::module &mod) { ir::builder &builder = mod.get_builder(); for(ir::function *fn: mod.get_function_list()) for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *i: block->get_inst_list()) + for(ir::instruction *i: block->get_inst_list()){ + if(auto *trans = dynamic_cast(i)){ + ir::value *x = i->get_operand(0); + if(trans->get_perm()[0]->get_value() != 0) + continue; + builder.set_insert_point(i); + ir::instruction *rx = (ir::instruction*)builder.create_vectorize(x); + x->replace_all_uses_with(rx); + rx->set_operand(0, x); + params_->copy(rx, x); + } if(dynamic_cast(i)){ ir::value *x = i->get_operand(0); if(params_->get_param(x, "nts.d0")->get_value() == 1) @@ -24,6 +34,7 @@ void vectorize::run(ir::module &mod) { rx->set_operand(0, x); params_->copy(rx, x); } + } } } diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index 6293139e2..4751f1033 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -62,11 +62,11 @@ std::pair base::get_profile_impl(driver::stream *stream, std::v jit->add_module(name_.c_str(), src.c_str(), best.params); } else{ - params_t params = heuristics(); +// params_t params = heuristics(); // params_t params = jit->get_valid(name_.c_str(), src.c_str()); // params_t params = {4, 1, 32, 4, 1, 32, 4, 4, 4, 1, 1, 16, 32, 16, 4, 4, 4, 4, 1}; //NT // params_t params = {4, 1, 32, 4, 32, 4, 4, 4, 1, 1, 16, 32, 16, 1, 4, 4, 4, 4, 4, 1}; //NN -// params_t params = {4, 16, 4, 2, 16, 4, 8, 2, 2, 8, 2, 32, 8, 1}; // TT + params_t params = {4, 2, 16, 4, 2, 16, 2, 2, 1, 1, 2, 16, 32, 16, 4, 4, 4, 4, 1}; // TT jit->add_module(name_.c_str(), src.c_str(), params); } triton::driver::kernel* kernel = jit->get_function(name_.c_str()); diff --git a/lib/dnn/dot.cpp b/lib/dnn/dot.cpp index d310ad5fb..496ddb74f 100644 --- a/lib/dnn/dot.cpp +++ b/lib/dnn/dot.cpp @@ -72,10 +72,11 @@ void dot::enqueue_impl(driver::stream *stream, driver::kernel *kernel, } void dot::triton_c_src(std::ostream &os) const { + std::string ZS = "4"; std::string AS0 = "TM", AS1 = "TK"; std::string BS0 = "TK", BS1 = "TN"; - std::string XAS0 = "TM", XAS1 = "TK", XAS2 = "1"; - std::string XBS0 = "TK", XBS1 = "1", XBS2 = "TN"; + std::string XAS0 = "TM", XAS1 = "TK / " + ZS, XAS2 = ZS; + std::string XBS0 = "TK / " + ZS, XBS1 = ZS, XBS2 = "TN"; std::string bca0 = "[newaxis, :]", bca1 = "[:, newaxis]"; std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; std::string lda0 = "*lda", lda1 = ""; @@ -100,7 +101,7 @@ void dot::triton_c_src(std::ostream &os) const { std::string BS = BS0 + ", " + BS1; 
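// The kernel text is assembled from these shape strings, so one routine
// covers NN/NT/TN/TT: the swaps above turn AS from "TM, TK" into
// "TK, TM" when A is transposed, and likewise for BS. With ZS = "4" the
// rank-3 shapes come out as, e.g. for the NN layout,
//   XAS = "TM, TK / 4, 4"   XBS = "TK / 4, 4, TN"   XCS = "TM, TN, 4"
// so dot() accumulates four independent TK/4 slices that the final
// __sum(xc, 2) folds back together -- a split-K-style scheme.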
std::string XAS = XAS0 + ", " + XAS1 + ", " + XAS2; std::string XBS = XBS0 + ", " + XBS1 + ", " + XBS2; - std::string XCS = "TM, TN, 1"; + std::string XCS = "TM, TN, " + ZS; std::string align_lda_str = "multiple_of(" + std::to_string(align_lda_) + ")"; std::string align_ldb_str = "multiple_of(" + std::to_string(align_ldb_) + ")"; std::string res = diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index d2c31fadd..6378f9593 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -255,7 +255,6 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ -// std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index 86102a460..a854497ef 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -174,9 +174,9 @@ jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t ben std::lock_guard lock(mutex); for(ir::metaparameter *mp: mps) mp->set_value(params[i++]); -// for(size_t i = 0; i < params.size(); i++) -// std::cout << ((i==0)?"":", ") << params[i] << std::flush; -// std::cout << std::endl; + for(size_t i = 0; i < params.size(); i++) + std::cout << ((i==0)?"":", ") << params[i] << std::flush; + std::cout << std::endl; passes_0.tune.init(tt_module_0); passes_0.tune.check_constraints(errors); // for(auto x: errors) From 392b55280da336dede18309726c38fcee9996201 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 7 Aug 2019 11:08:04 -0700 Subject: [PATCH 287/494] [codegen] some cleaning for batched matmul --- examples/cpp/dot.cpp | 10 +++--- include/triton/codegen/selection.h | 4 +-- include/triton/runtime/jit.h | 3 +- include/triton/tools/bench.hpp | 4 +-- lib/codegen/optimize_dot.cpp | 18 +++++----- lib/codegen/selection.cpp | 33 +++++++++--------- lib/codegen/tune.cpp | 54 ++++++++++++++---------------- lib/dnn/base.cpp | 6 +--- lib/dnn/dot.cpp | 17 ++++++---- lib/driver/module.cpp | 1 + lib/runtime/jit.cpp | 12 +++---- 11 files changed, 82 insertions(+), 80 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 231e98f88..50e59e0fa 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -40,7 +40,7 @@ perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int hb[i] = static_cast((double)rand()/RAND_MAX); for(size_t i = 0; i < hc.size(); i++) hc[i] = static_cast((double)0); - triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*dt_nbytes); + triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*4); triton::driver::buffer* da = triton::driver::buffer::create(context, ha.size()*dt_nbytes); triton::driver::buffer* db = triton::driver::buffer::create(context, hb.size()*dt_nbytes); stream->write(da, true, 0, ha); @@ -49,7 +49,7 @@ perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int stream->synchronize(); triton::dnn::dot dot(M, N, K, AT, BT, ty, ty, ty, 8, 8, 8); // benchmark triton - double triton_ns = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::NO_TUNING);}, stream); + double triton_ns = triton::tools::bench([&]() { 
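// triton::tools::bench re-runs this lambda until roughly 1ms of wall
// time has accumulated (see bench.hpp below), optionally normalizing by
// current_sm_clock/max_sm_clock to damp DVFS noise while auto-tuning.
// Throughput then follows from the returned nanoseconds in the usual
// way, e.g.:
//   double tflops = 2.0 * M * N * K / triton_ns * 1e-3;
// (2*M*N*K flops per GEMM; 1e-3 converts flop/ns to Tflop/s).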
dot.enqueue(stream, {da, db, dc}, triton::dnn::PARTIAL_TUNING);}, stream); // benchmark cublas // NumericT alpha = 1; // NumericT beta = 0; @@ -77,7 +77,7 @@ perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int std::vector rc(hc.size()); dot.cpu_ref(rc, ha, hb); for(size_t i = 0; i < M*N; i++) - if(std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-2){ + if(std::isinf(hc[i]) || std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-2){ std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; exit(EXIT_FAILURE); } @@ -111,8 +111,8 @@ int main() { // shapes to benchmark std::vector configs = { // {false, false, 8192, 512, 512}, -// {false, true, 8192, 8192, 8192} - {false, true, 128, 128, 128}, + {false, true, 128, 128, 128} +// {false, true, 128, 128, 128}, // {false, false, 128, 128, 128}, // {true, false, 128, 128, 128}, // {true, true, 128, 128, 128} diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index 654faf95b..b480da5f0 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -153,8 +153,8 @@ private: alignment_info *axis_info_; std::map axes_; llvm::Value *sh_mem_ptr_; - llvm::Value *offset_a_i_, *offset_a_k_, *offset_a_z_; - llvm::Value *offset_b_j_, *offset_b_k_, *offset_b_z_; + llvm::Value *offset_a_i_, *offset_a_k_; + llvm::Value *offset_b_j_, *offset_b_k_; unsigned num_packs_0_, num_packs_1_; unsigned pack_size_0_, pack_size_1_; }; diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index 8758cb78d..563d5863a 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -71,6 +71,7 @@ public: void target_independent(ir::module &module) { optimize_dot.run(module); + optimize_dce.run(module); optimize_trans.run(module); optimize_dce.run(module); } @@ -86,7 +87,7 @@ public: } vectorize.run(module); optimize_dce.run(module); - ir::print(module, std::cout); +// ir::print(module, std::cout); } codegen::tune tune; diff --git a/include/triton/tools/bench.hpp b/include/triton/tools/bench.hpp index 74053b717..0ebf7f360 100644 --- a/include/triton/tools/bench.hpp +++ b/include/triton/tools/bench.hpp @@ -38,8 +38,8 @@ inline double bench(std::function const & op, driver::stream * stream) while(total_time*1e-9 < 1e-3){ float norm = 1; // normalize clock if possible to reduce noise in auto-tuning -// if(auto cu_device = dynamic_cast(device)) -// norm = (float)cu_device->current_sm_clock()/cu_device->max_sm_clock(); + if(auto cu_device = dynamic_cast(device)) + norm = (float)cu_device->current_sm_clock()/cu_device->max_sm_clock(); tmr.start(); op(); stream->synchronize(); diff --git a/lib/codegen/optimize_dot.cpp b/lib/codegen/optimize_dot.cpp index 9e2cc6b9e..00893c1a5 100644 --- a/lib/codegen/optimize_dot.cpp +++ b/lib/codegen/optimize_dot.cpp @@ -30,8 +30,8 @@ inline bool is_hmma(ir::value *v){ ir::type *b_ty = b->get_type(); // inputs have to be FP16 result = a_ty->get_scalar_ty()->is_half_ty() && b_ty->get_scalar_ty()->is_half_ty(); - // reduction has to be multiple of 4 - result = result && ((a_ty->get_tile_shapes()[1]->get_value() % 4) == 0); +// reduction has to be multiple of 4 +// result = result && ((a_ty->get_tile_shapes()[1]->get_value() % 4) == 0); } return result; } @@ -70,13 +70,13 @@ void optimize_dot::run(ir::module &mod) { BB = ((ir::trans_inst*)B)->get_operand(0); } else{ - if(auto *T = dynamic_cast(A)){ - std::vector perm(T->get_perm()); - std::swap(perm[0], perm[1]); - AA = 
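// The block removed here is the patch-286 rewrite that folded a
// transpose into an existing trans by swapping the first two entries of
// its permutation (an encoded {1, 0, 2} becoming {0, 1, 2}), so the dot
// could be emitted in AT/BT form for HMMA. Note the second copy of the
// block guarded the B operand yet still tested A and set trans_a; that
// copy-paste slip is presumably why both copies are kept commented out
// below rather than fixed in this cleanup commit.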
builder.create_trans(T->get_operand(0), perm); - T->replace_all_uses_with(AA); - trans_a = true; - } +// if(auto *T = dynamic_cast(A)){ +// std::vector perm(T->get_perm()); +// std::swap(perm[0], perm[1]); +// AA = builder.create_trans(T->get_operand(0), perm); +// T->replace_all_uses_with(AA); +// trans_a = true; +// } } ir::instruction *dot_atbt = builder.insert(ir::dot_inst::create(AA, BB, D, trans_a, trans_b)); dot->replace_all_uses_with(dot_atbt); diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 89fedab7f..86bee2932 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -516,6 +516,10 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id } } else { + if(shapes.size() > 3) + throw std::runtime_error("unsupported"); + bool is_batched = shapes.size() >= 3; + Value *_1 = builder.getInt32(1); Value *_2 = builder.getInt32(2); Value *_3 = builder.getInt32(3); @@ -526,23 +530,23 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id // fragments per warp unsigned fpw_0 = params_->get_param(v, "fpw.d0")->get_value(); unsigned fpw_1 = params_->get_param(v, "fpw.d1")->get_value(); - unsigned fpw_2 = params_->get_param(v, "fpw.d2")->get_value(); + unsigned fpw_2 = is_batched ? params_->get_param(v, "fpw.d2")->get_value() : 1; // warps per tile unsigned wpt_0 = params_->get_param(v, "wpt.d0")->get_value(); unsigned wpt_1 = params_->get_param(v, "wpt.d1")->get_value(); - unsigned wpt_2 = params_->get_param(v, "wpt.d2")->get_value(); + unsigned wpt_2 = is_batched ? params_->get_param(v, "wpt.d2")->get_value() : 1; // hmma warp tile size unsigned hmma_wts_0 = fpw_0 * 8; unsigned hmma_wts_1 = fpw_1 * 8; - unsigned hmma_wts_2 = 1; + unsigned hmma_wts_2 = is_batched ? fpw_2 : 1; // hmma block tile size unsigned hmma_bts_0 = hmma_wts_0 * wpt_0; unsigned hmma_bts_1 = hmma_wts_1 * wpt_1; - unsigned hmma_bts_2 = hmma_wts_2 * wpt_2; + unsigned hmma_bts_2 = is_batched ? hmma_wts_2 * wpt_2 : 1; // number of repetition unsigned num_rep_0 = shapes[0]->get_value() / hmma_bts_0; unsigned num_rep_1 = shapes[1]->get_value() / hmma_bts_1; - unsigned num_rep_2 = shapes[2]->get_value() / hmma_bts_2; + unsigned num_rep_2 = is_batched ? 
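// HMMA tile arithmetic, worked for values inside the metaparameter
// ranges set in tune.cpp (fpw = 2, wpt = 2 are assumptions, not the
// only choices) on a 128-wide tile:
//   hmma_wts = fpw * 8    -> 16  (rows covered by one warp's fragments)
//   hmma_bts = wts * wpt  -> 32  (rows covered by the whole warp grid)
//   num_rep  = 128 / 32   -> 4   (repetitions needed to tile the shape)
// The batch axis follows the same pattern without the factor of 8
// (hmma_wts_2 = fpw_2) and all three quantities degenerate to 1 when
// the tile is only 2D.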
shapes[2]->get_value() / hmma_bts_2 : 1; // size of each pack (interleaving) pack_size_0_ = std::min(num_rep_0, 1); pack_size_1_ = std::min(num_rep_1, 1); @@ -579,19 +583,15 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id // a offset offset_a_i_ = builder.CreateAdd(warp_offset_i, builder.CreateAdd(pair_a_off, in_pair_off_a)); offset_a_k_ = builder.CreateAnd(u_thread_id, _3); - offset_a_z_ = warp_id_2; // b offsets offset_b_j_ = builder.CreateAdd(warp_offset_j, builder.CreateAdd(pair_b_off, in_pair_off_b)); offset_b_k_ = builder.CreateAnd(u_thread_id, _3); - offset_b_z_ = warp_id_2; - // c offsets Value *offset_c_i = builder.CreateAdd(builder.CreateAnd(u_thread_id, _1), offset_a_i_); Value *offset_c_j = builder.CreateAdd(builder.CreateAnd(u_thread_id, _2), builder.CreateAdd(warp_offset_j, pair_b_off)); - /* indices */ // i indices std::vector idx_i; @@ -617,7 +617,8 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id /* axes */ axes_[params_->get_param_group(v, 0)] = distributed_axis{1, idx_i, warp_id_0}; axes_[params_->get_param_group(v, 1)] = distributed_axis{1, idx_j, warp_id_1}; - axes_[params_->get_param_group(v, 2)] = distributed_axis{1, idx_z, warp_id_2}; + if(is_batched) + axes_[params_->get_param_group(v, 2)] = distributed_axis{1, idx_z, warp_id_2}; } } @@ -1062,10 +1063,10 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & std::map> fcs; result->for_each([&](indices_t idx){ - fcs[idx[2]].push_back(TC->get_value(idx)); -// fc.push_back(UndefValue::get(TC->get_value(idx)->getType())); + fcs[{builder.getInt32(0)}].push_back(TC->get_value(idx)); }); + Type *fp32_ty = builder.getFloatTy(); Type *fp16x2_ty = VectorType::get(builder.getHalfTy(), 2); Type *fp32_pack8_ty = StructType::get(ctx, {fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}); @@ -1121,8 +1122,8 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & std::swap(idx_a[0], idx_a[1]); if(!dot->is_b_trans()) std::swap(idx_b[0], idx_b[1]); - idx_a.push_back(x.first); - idx_b.push_back(x.first); +// idx_a.push_back(builder.getInt32(0)); +// idx_b.push_back(builder.getInt32(0)); Value *ha = TA->get_value(idx_a); Value *hb = TB->get_value(idx_b); for(unsigned ii = 0; ii < pack_size_0_; ii++) @@ -1158,9 +1159,9 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & // write back unsigned i = 0; result->for_each([&](indices_t idx){ - if(i >= fcs.at(idx[2]).size()) + if(i >= fcs.at({builder.getInt32(0)}).size()) i = 0; - result->set_value(idx, fcs.at(idx[2])[i++]); + result->set_value(idx, fcs.at({builder.getInt32(0)})[i++]); }); TA->set_return_mode(false); diff --git a/lib/codegen/tune.cpp b/lib/codegen/tune.cpp index 26283b536..39dea9f12 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/tune.cpp @@ -257,56 +257,54 @@ void tune::run(ir::module &mod) { ir::metaparameter *fpw = ir::metaparameter::create(ctx, ty, 2, 2); if(node.second == 2) fpw->set_value(1); - ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 1, 4); + ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 2, 4); connected_components(node, {fpw, wpt}, {"fpw", "wpt"}, nodes_, dependencies_, group_id++); } } } - // Simplify metaparameters for(ir::function *fn: mod.get_function_list()) for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i : block->get_inst_list()){ - - - if(fragments_.find({i, 0}) != fragments_.end() && fragments_.at({i, 0}) != STRIDED_SCAN) + 
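// This loop narrows the per-instruction metaparameter search space:
// tile-typed loads from generic address spaces get a wider nts.d0
// range (metaparameter::create(ctx, ty, 4, 8) -- the two integers
// bound the tuning range, an inference from how they are used), and a
// dot's accumulator gets fresh nts.d0/nts.d1 ranges when laid out as a
// strided scan, while the HMMA layout keeps the fpw/wpt parameters
// created during connected_components above. The commented-out code
// below sketches how batch axes (d2) would be tied to tile shapes.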
if(!i->get_type()->is_tile_ty()) continue; + auto shapes = i->get_type()->get_tile_shapes(); - if(auto *x = dynamic_cast(i)) - if(i->get_type()->is_tile_ty()){ + if(auto *x = dynamic_cast(i)){ ir::type *ptr_ty = x->get_pointer_operand()->get_type()->get_scalar_ty(); size_t addr_space = ptr_ty->get_pointer_address_space(); if(addr_space < 4){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 2, 4)); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 4, 8)); *params_.at(i).at("nts.d0") = *tmp; } } if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 2, 4)); - std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 2, 4)); - *params_.at(i).at("nts.d0") = *tmp1; - *params_.at(i).at("nts.d1") = *tmp2; +// std::unique_ptr mts_2(ir::metaparameter::create(ctx, ty, 1, 4)); +// *params_.at(i->get_operand(0)).at("mts.d2") = *mts_2; +// *params_.at(i->get_operand(1)).at("mts.d2") = *mts_2; + if(fragments_.at({i, 0}) == STRIDED_SCAN){ + std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 4, 8)); + std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 4, 8)); + *params_.at(i).at("nts.d0") = *tmp1; + *params_.at(i).at("nts.d1") = *tmp2; +// for(size_t k = 2; k < shapes.size(); k++) +// if(auto *x = dynamic_cast(shapes[k])) +// *params_.at(i).at("mts.d" + std::to_string(k)) = *x; +// else +// params_.at(i).at("mts.d" + std::to_string(k))->set_value(shapes[k]->get_value()); + } + else{ +// for(size_t k = 2; k < shapes.size(); k++) +// if(auto *x = dynamic_cast(shapes[k])) +// *params_.at(i).at("wpt.d" + std::to_string(k)) = *x; +// else +// params_.at(i).at("wpt.d" + std::to_string(k))->set_value(shapes[k]->get_value()); + } } } - // initialize grids - -// for(ir::instruction *i: grids_){ -// auto shapes = i->get_type()->get_tile_shapes(); -// for(size_t k = 0; k < shapes.size(); k++) -// if(shapes[k]->get_value() == 1) { -// if(fragments_.at({i, k}) == STRIDED_SCAN){ -// params_.at(i).at("nts.d" + std::to_string(k))->set_value(1); -// params_.at(i).at("mts.d" + std::to_string(k))->set_value(1); -// } -// if(fragments_.at({i, k}) == HMMA_FRAGMENT_C){ -// params_.at(i).at("fpw.d" + std::to_string(k))->set_value(1); -// params_.at(i).at("wpt.d" + std::to_string(k))->set_value(1); -// } -// } -// } } void tune::init(ir::module &mod) { diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index 4751f1033..86d031564 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -62,11 +62,7 @@ std::pair base::get_profile_impl(driver::stream *stream, std::v jit->add_module(name_.c_str(), src.c_str(), best.params); } else{ -// params_t params = heuristics(); -// params_t params = jit->get_valid(name_.c_str(), src.c_str()); -// params_t params = {4, 1, 32, 4, 1, 32, 4, 4, 4, 1, 1, 16, 32, 16, 4, 4, 4, 4, 1}; //NT -// params_t params = {4, 1, 32, 4, 32, 4, 4, 4, 1, 1, 16, 32, 16, 1, 4, 4, 4, 4, 4, 1}; //NN - params_t params = {4, 2, 16, 4, 2, 16, 2, 2, 1, 1, 2, 16, 32, 16, 4, 4, 4, 4, 1}; // TT + params_t params = heuristics(); jit->add_module(name_.c_str(), src.c_str(), params); } triton::driver::kernel* kernel = jit->get_function(name_.c_str()); diff --git a/lib/dnn/dot.cpp b/lib/dnn/dot.cpp index 496ddb74f..5465b26b9 100644 --- a/lib/dnn/dot.cpp +++ b/lib/dnn/dot.cpp @@ -72,11 +72,11 @@ void dot::enqueue_impl(driver::stream *stream, driver::kernel *kernel, } void dot::triton_c_src(std::ostream &os) const { - std::string ZS = "4"; + std::string ZS = "1"; 
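// With ZS = "1" the reshapes only append a unit axis: XAS becomes
// "TM, TK, 1", XBS becomes "TK, 1, TN", and xc has shape [TM, TN, 1],
// so the final __sum(xc, 2) folds a single slice and is effectively a
// no-op. A larger constant (ZS = "4" in the previous commit) would
// instead split TK into ZS independent partial products.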
std::string AS0 = "TM", AS1 = "TK"; std::string BS0 = "TK", BS1 = "TN"; - std::string XAS0 = "TM", XAS1 = "TK / " + ZS, XAS2 = ZS; - std::string XBS0 = "TK / " + ZS, XBS1 = ZS, XBS2 = "TN"; + std::string XAS0 = "TM", XAS1 = "TK", XAS2 = ZS; + std::string XBS0 = "TK", XBS1 = ZS, XBS2 = "TN"; std::string bca0 = "[newaxis, :]", bca1 = "[:, newaxis]"; std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; std::string lda0 = "*lda", lda1 = ""; @@ -131,9 +131,9 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, bool checkb[)" + BS + R"(] = (rkb < K))" + bcb0 + " && (ryb < N)" + bcb1 + R"(; )" + a_ty_ + R"( a[)" + AS + R"(] = checka ? *pa : 0; )" + b_ty_ + R"( b[)" + BS + R"(] = checkb ? *pb : 0; + )" + a_ty_ + R"( xa[)" + XAS + "] = __reshape(a, " + XAS + R"(); + )" + b_ty_ + R"( xb[)" + XBS + "] = __reshape(b, " + XBS + R"(); for(int k = K; k > 0; k = k - TK){ - )" + a_ty_ + R"( xa[)" + XAS + "] = __reshape(a, " + XAS + R"(); - )" + b_ty_ + R"( xb[)" + XBS + "] = __reshape(b, " + XBS + R"(); xc = dot()" + usea + ", " + useb + R"(, xc); pa = pa + TK)" + lda0 + R"(; pb = pb + TK)" + ldb0 + R"(; @@ -141,12 +141,17 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, bool checkb[)" + BS + R"(] = k > TK; a = checka ? *pa : 0; b = checkb ? *pb : 0; + xa = __reshape(a, )" + XAS + R"(); + xb = __reshape(b, )" + XBS + R"(); } int rxc[TM] = ridx * TM + (0 ... TM); int ryc[TN] = ridy * TN + (0 ... TN); )" + c_ty_ + R"(* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; )" + c_ty_ + R"( c[TM, TN] = __sum(xc, 2); - *pc = c; + bool checkc0[TM] = rxc < M; + bool checkc1[TN] = ryc < N; + bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; + @checkc *pc = c; } )"; diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 6378f9593..d2c31fadd 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -255,6 +255,7 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ +// std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index a854497ef..1f6a60ccd 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -37,13 +37,13 @@ void parallel_loop_nest(std::vector const & ranges, size_t D = ranges.size(); std::vector values(D, 0); // thread pools -// ThreadPool pool(nthreads); + ThreadPool pool(nthreads); // Start with innermost loop size_t i = D - 1; while(true){ // Execute function -// pool.enqueue(f,values); - f(values); + pool.enqueue(f,values); +// f(values); while(values[i]++ == ranges[i] - 1){ if(i == 0) return; @@ -174,9 +174,9 @@ jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t ben std::lock_guard lock(mutex); for(ir::metaparameter *mp: mps) mp->set_value(params[i++]); - for(size_t i = 0; i < params.size(); i++) - std::cout << ((i==0)?"":", ") << params[i] << std::flush; - std::cout << std::endl; +// for(size_t i = 0; i < params.size(); i++) +// std::cout << ((i==0)?"":", ") << params[i] << std::flush; +// std::cout << std::endl; passes_0.tune.init(tt_module_0); passes_0.tune.check_constraints(errors); // for(auto x: errors) From 
7578c27d3d7e5cf6117ef79ea2ab169ac3a89176 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 7 Aug 2019 21:15:54 -0700 Subject: [PATCH 288/494] [general][filesystem] added structure and namespace to code generation files --- examples/cpp/dot.cpp | 4 +- examples/python/tensorflow/blocksparse.cpp | 2 +- .../alignment.h} | 2 + .../shmem/allocation.h} | 20 +++++--- .../{shmem_info.h => analysis/shmem/info.h} | 7 ++- .../shmem/liveness.h} | 13 +++-- include/triton/codegen/{ => analysis}/tune.h | 2 + .../codegen/{ => selection}/selection.h | 26 ++++++---- .../triton/codegen/{ => selection}/target.h | 0 .../{optimize_dce.h => transform/dce.h} | 4 +- .../{optimize_dot.h => transform/dot.h} | 10 ++-- .../codegen/{ => transform}/reassociate.h | 12 +++-- .../shmem/barriers.h} | 19 +++++-- .../{optimize_trans.h => transform/trans.h} | 2 + .../codegen/{ => transform}/vectorize.h | 12 +++-- include/triton/runtime/jit.h | 50 ++++++++++--------- include/triton/tools/bench.hpp | 4 +- .../alignment.cpp} | 4 +- .../shmem/allocation.cpp} | 18 ++++--- .../shmem/info.cpp} | 19 ++++--- .../shmem/liveness.cpp} | 11 ++-- lib/codegen/{ => analysis}/tune.cpp | 9 ++-- lib/codegen/{ => selection}/selection.cpp | 39 ++++++++------- lib/codegen/{ => selection}/target.cpp | 2 +- .../{optimize_dce.cpp => transform/dce.cpp} | 4 +- .../{optimize_dot.cpp => transform/dot.cpp} | 6 ++- lib/codegen/{ => transform}/reassociate.cpp | 10 ++-- .../shmem/barriers.cpp} | 8 +-- .../trans.cpp} | 4 +- lib/codegen/{ => transform}/vectorize.cpp | 6 ++- lib/dnn/base.cpp | 3 +- lib/dnn/blocksparse/dot.cpp | 11 ++-- lib/dnn/dot.cpp | 24 ++++----- lib/driver/device.cpp | 2 +- lib/runtime/jit.cpp | 2 +- 35 files changed, 224 insertions(+), 147 deletions(-) rename include/triton/codegen/{alignment_info.h => analysis/alignment.h} (97%) rename include/triton/codegen/{shmem_allocation.h => analysis/shmem/allocation.h} (76%) rename include/triton/codegen/{shmem_info.h => analysis/shmem/info.h} (92%) rename include/triton/codegen/{shmem_liveness.h => analysis/shmem/liveness.h} (90%) rename include/triton/codegen/{ => analysis}/tune.h (99%) rename include/triton/codegen/{ => selection}/selection.h (92%) rename include/triton/codegen/{ => selection}/target.h (100%) rename include/triton/codegen/{optimize_dce.h => transform/dce.h} (93%) rename include/triton/codegen/{optimize_dot.h => transform/dot.h} (70%) rename include/triton/codegen/{ => transform}/reassociate.h (82%) rename include/triton/codegen/{shmem_barriers.h => transform/shmem/barriers.h} (78%) rename include/triton/codegen/{optimize_trans.h => transform/trans.h} (95%) rename include/triton/codegen/{ => transform}/vectorize.h (62%) rename lib/codegen/{alignment_info.cpp => analysis/alignment.cpp} (99%) rename lib/codegen/{shmem_allocation.cpp => analysis/shmem/allocation.cpp} (93%) rename lib/codegen/{shmem_info.cpp => analysis/shmem/info.cpp} (91%) rename lib/codegen/{shmem_liveness.cpp => analysis/shmem/liveness.cpp} (84%) rename lib/codegen/{ => analysis}/tune.cpp (98%) rename lib/codegen/{ => selection}/selection.cpp (98%) rename lib/codegen/{ => selection}/target.cpp (99%) rename lib/codegen/{optimize_dce.cpp => transform/dce.cpp} (96%) rename lib/codegen/{optimize_dot.cpp => transform/dot.cpp} (97%) rename lib/codegen/{ => transform}/reassociate.cpp (97%) rename lib/codegen/{shmem_barriers.cpp => transform/shmem/barriers.cpp} (96%) rename lib/codegen/{optimize_trans.cpp => transform/trans.cpp} (97%) rename lib/codegen/{ => transform}/vectorize.cpp (91%) diff --git 
a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 50e59e0fa..ef73a7581 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -40,7 +40,7 @@ perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int hb[i] = static_cast((double)rand()/RAND_MAX); for(size_t i = 0; i < hc.size(); i++) hc[i] = static_cast((double)0); - triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*4); + triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*dt_nbytes); triton::driver::buffer* da = triton::driver::buffer::create(context, ha.size()*dt_nbytes); triton::driver::buffer* db = triton::driver::buffer::create(context, hb.size()*dt_nbytes); stream->write(da, true, 0, ha); @@ -49,7 +49,7 @@ perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int stream->synchronize(); triton::dnn::dot dot(M, N, K, AT, BT, ty, ty, ty, 8, 8, 8); // benchmark triton - double triton_ns = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::PARTIAL_TUNING);}, stream); + double triton_ns = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::FULL_TUNING);}, stream); // benchmark cublas // NumericT alpha = 1; // NumericT beta = 0; diff --git a/examples/python/tensorflow/blocksparse.cpp b/examples/python/tensorflow/blocksparse.cpp index 3a6a2505c..e2a0b5144 100644 --- a/examples/python/tensorflow/blocksparse.cpp +++ b/examples/python/tensorflow/blocksparse.cpp @@ -130,7 +130,7 @@ public: // create profile triton::dnn::blocksparse::dot dot(N, params_.K, params_.segments, params_.C, "half", params_.bsize, params_.locks, params_.blocks, OP); // blocksparse matmul - triton::dnn::base* op = dot.enqueue(stream, {&da, &db, &dc, &dlut}, triton::dnn::NO_TUNING); + triton::dnn::base* op = dot.enqueue(stream, {&da, &db, &dc, &dlut}, triton::dnn::PARTIAL_TUNING); triton::driver::buffer* locks_buffer = ((triton::dnn::blocksparse::dot*)op)->get_locks(); Tensor *tmp = nullptr; TensorShape tmp_shapes; diff --git a/include/triton/codegen/alignment_info.h b/include/triton/codegen/analysis/alignment.h similarity index 97% rename from include/triton/codegen/alignment_info.h rename to include/triton/codegen/analysis/alignment.h index 92a15efeb..1d0c4b191 100644 --- a/include/triton/codegen/alignment_info.h +++ b/include/triton/codegen/analysis/alignment.h @@ -12,6 +12,7 @@ namespace ir { } namespace codegen{ +namespace analysis{ class alignment_info { struct cst_info { @@ -41,6 +42,7 @@ private: }; +} } } diff --git a/include/triton/codegen/shmem_allocation.h b/include/triton/codegen/analysis/shmem/allocation.h similarity index 76% rename from include/triton/codegen/shmem_allocation.h rename to include/triton/codegen/analysis/shmem/allocation.h index 0f36ec154..024c3cf68 100644 --- a/include/triton/codegen/shmem_allocation.h +++ b/include/triton/codegen/analysis/shmem/allocation.h @@ -13,16 +13,18 @@ namespace ir{ } namespace codegen{ +namespace analysis{ -class layout; -class target_tuner; -class shmem_liveness; -class shmem_info; class tune; -class shmem_allocation { +namespace shmem{ + +class liveness; +class info; + +class allocation { public: - shmem_allocation(shmem_liveness *live, shmem_info *buffer_info, tune *params) + allocation(liveness *live, info *buffer_info, tune *params) : liveness_(live), buffer_info_(buffer_info), params_(params){ } // utilities @@ -41,11 +43,13 @@ private: std::map num_bytes_; size_t allocated_size_; // dependences - shmem_liveness *liveness_; - shmem_info *buffer_info_; 
+ liveness *liveness_; + info *buffer_info_; tune *params_; }; +} +} } } diff --git a/include/triton/codegen/shmem_info.h b/include/triton/codegen/analysis/shmem/info.h similarity index 92% rename from include/triton/codegen/shmem_info.h rename to include/triton/codegen/analysis/shmem/info.h index f8325d00b..689516cb2 100644 --- a/include/triton/codegen/shmem_info.h +++ b/include/triton/codegen/analysis/shmem/info.h @@ -14,8 +14,10 @@ namespace ir { } namespace codegen{ +namespace analysis{ +namespace shmem{ -class shmem_info { +class info { public: void run(ir::module &mod); // queries @@ -33,7 +35,8 @@ private: std::map refs_; }; - +} +} } } diff --git a/include/triton/codegen/shmem_liveness.h b/include/triton/codegen/analysis/shmem/liveness.h similarity index 90% rename from include/triton/codegen/shmem_liveness.h rename to include/triton/codegen/analysis/shmem/liveness.h index 69210d03f..bec0303c0 100644 --- a/include/triton/codegen/shmem_liveness.h +++ b/include/triton/codegen/analysis/shmem/liveness.h @@ -12,10 +12,12 @@ namespace ir{ } namespace codegen{ +namespace analysis{ +namespace shmem{ typedef unsigned slot_index; -class shmem_info; +class info; struct segment { slot_index start; @@ -30,7 +32,7 @@ struct segment { } }; -class shmem_liveness { +class liveness { private: typedef std::map indices_map_t; typedef std::map intervals_map_t; @@ -43,7 +45,7 @@ public: public: // constructor - shmem_liveness(shmem_info *info): info_(info){ } + liveness(info *info): info_(info){ } // accessors const intervals_map_t& intervals() const { return intervals_; } @@ -53,7 +55,7 @@ public: void run(ir::module &mod); private: - shmem_info *info_; + info *info_; has_storage_map_t has_dedicated_storage_; indices_map_t indices_; intervals_map_t intervals_; @@ -61,5 +63,8 @@ private: } } +} +} + #endif diff --git a/include/triton/codegen/tune.h b/include/triton/codegen/analysis/tune.h similarity index 99% rename from include/triton/codegen/tune.h rename to include/triton/codegen/analysis/tune.h index 7f393a3a0..54dae5524 100644 --- a/include/triton/codegen/tune.h +++ b/include/triton/codegen/analysis/tune.h @@ -16,6 +16,7 @@ namespace ir{ } namespace codegen{ +namespace analysis{ class tune { typedef std::pair node_t; @@ -67,6 +68,7 @@ private: }; +} } } diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection/selection.h similarity index 92% rename from include/triton/codegen/selection.h rename to include/triton/codegen/selection/selection.h index b480da5f0..2bc49f72e 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection/selection.h @@ -7,7 +7,7 @@ #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/type.h" -#include "triton/codegen/shmem_info.h" +#include "triton/codegen/analysis/shmem/info.h" namespace llvm{ @@ -21,12 +21,20 @@ namespace llvm{ namespace triton{ namespace codegen{ -class shmem_allocation; +namespace analysis{ + class tune; -class shmem_info; -class target; class alignment_info; +namespace shmem{ + +class allocation; +class info; + +} +} +class target; + typedef std::vector indices_t; struct distributed_axis { @@ -138,7 +146,7 @@ private: void lower_tile_instruction(ir::instruction *src, llvm::IRBuilder<> &builder); public: - selection(shmem_allocation *alloc, tune *params, shmem_info *buffer_info, alignment_info *ax_info, target *tgt) + selection(analysis::shmem::allocation *alloc, analysis::tune *params, analysis::shmem::info *buffer_info, analysis::alignment_info *ax_info, target *tgt) : 
alloc_(alloc), params_(params), buffer_info_(buffer_info), axis_info_(ax_info), tgt_(tgt){ } void run(ir::module &src, llvm::Module &dst); @@ -146,11 +154,11 @@ public: private: vmap_t vmap_; tmap_t tmap_; - shmem_allocation *alloc_; - tune *params_; + analysis::shmem::allocation *alloc_; + analysis::tune *params_; target *tgt_; - shmem_info *buffer_info_; - alignment_info *axis_info_; + analysis::shmem::info *buffer_info_; + analysis::alignment_info *axis_info_; std::map axes_; llvm::Value *sh_mem_ptr_; llvm::Value *offset_a_i_, *offset_a_k_; diff --git a/include/triton/codegen/target.h b/include/triton/codegen/selection/target.h similarity index 100% rename from include/triton/codegen/target.h rename to include/triton/codegen/selection/target.h diff --git a/include/triton/codegen/optimize_dce.h b/include/triton/codegen/transform/dce.h similarity index 93% rename from include/triton/codegen/optimize_dce.h rename to include/triton/codegen/transform/dce.h index e40bafef5..169363752 100644 --- a/include/triton/codegen/optimize_dce.h +++ b/include/triton/codegen/transform/dce.h @@ -12,7 +12,7 @@ namespace ir { } namespace codegen{ -class tune; +namespace transform{ class optimize_dce { public: @@ -20,7 +20,7 @@ public: void run(ir::module &mod); }; - +} } } diff --git a/include/triton/codegen/optimize_dot.h b/include/triton/codegen/transform/dot.h similarity index 70% rename from include/triton/codegen/optimize_dot.h rename to include/triton/codegen/transform/dot.h index 76d8368dc..15612e2f0 100644 --- a/include/triton/codegen/optimize_dot.h +++ b/include/triton/codegen/transform/dot.h @@ -13,18 +13,22 @@ namespace ir { namespace codegen{ +namespace analysis{ class tune; +} + +namespace transform{ class optimize_dot { public: - optimize_dot(tune* params): params_(params) {} + optimize_dot(analysis::tune* params): params_(params) {} void run(ir::module &mod); private: - tune* params_; + analysis::tune* params_; }; - +} } } diff --git a/include/triton/codegen/reassociate.h b/include/triton/codegen/transform/reassociate.h similarity index 82% rename from include/triton/codegen/reassociate.h rename to include/triton/codegen/transform/reassociate.h index 3c9cc813b..66d95eb44 100644 --- a/include/triton/codegen/reassociate.h +++ b/include/triton/codegen/transform/reassociate.h @@ -19,8 +19,12 @@ class getelementptr_inst; namespace codegen{ +namespace analysis{ class tune; class alignment_info; +} + +namespace transform{ class reassociate { struct cst_info { @@ -34,16 +38,18 @@ private: ir::value *reassociate_ptr(ir::getelementptr_inst* pz, ir::builder &builder, std::map &offsets); public: - reassociate(tune *params, alignment_info *align); + reassociate(analysis::tune *params, analysis::alignment_info *align); void run(ir::module& module); private: - tune* params_; - alignment_info* align_; + analysis::tune* params_; + analysis::alignment_info* align_; }; } } +} + #endif diff --git a/include/triton/codegen/shmem_barriers.h b/include/triton/codegen/transform/shmem/barriers.h similarity index 78% rename from include/triton/codegen/shmem_barriers.h rename to include/triton/codegen/transform/shmem/barriers.h index 271b745cc..d03360690 100644 --- a/include/triton/codegen/shmem_barriers.h +++ b/include/triton/codegen/transform/shmem/barriers.h @@ -17,8 +17,16 @@ namespace ir { namespace codegen{ -class shmem_allocation; -class shmem_info; +namespace analysis{ +namespace shmem{ + +class allocation; +class info; + +} +} + +namespace transform{ class shmem_barriers { private: @@ -36,15 +44,16 @@ private: 
std::pair transfer(ir::basic_block *block, const interval_vec_t &written_to, const interval_vec_t &read_from, std::set &insert_loc); public: - shmem_barriers(shmem_allocation *alloc, shmem_info *buffer_info): alloc_(alloc), buffer_info_(buffer_info) {} + shmem_barriers(analysis::shmem::allocation *alloc, analysis::shmem::info *buffer_info): alloc_(alloc), buffer_info_(buffer_info) {} void run(ir::module &mod); private: - shmem_allocation *alloc_; - shmem_info *buffer_info_; + analysis::shmem::allocation *alloc_; + analysis::shmem::info *buffer_info_; }; +} } } diff --git a/include/triton/codegen/optimize_trans.h b/include/triton/codegen/transform/trans.h similarity index 95% rename from include/triton/codegen/optimize_trans.h rename to include/triton/codegen/transform/trans.h index 8af45205d..4bdb62157 100644 --- a/include/triton/codegen/optimize_trans.h +++ b/include/triton/codegen/transform/trans.h @@ -17,6 +17,7 @@ namespace ir { } namespace codegen{ +namespace transform{ class optimize_trans { private: @@ -28,6 +29,7 @@ public: }; +} } } diff --git a/include/triton/codegen/vectorize.h b/include/triton/codegen/transform/vectorize.h similarity index 62% rename from include/triton/codegen/vectorize.h rename to include/triton/codegen/transform/vectorize.h index fe6df9dcf..09fb48000 100644 --- a/include/triton/codegen/vectorize.h +++ b/include/triton/codegen/transform/vectorize.h @@ -9,18 +9,22 @@ namespace ir { namespace codegen{ -class tune; +namespace analysis{ + class tune; +} + +namespace transform{ class vectorize { public: - vectorize(tune *params): params_(params){} + vectorize(analysis::tune *params): params_(params){} void run(ir::module &mod); private: - tune *params_; + analysis::tune *params_; }; - +} } } diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index 563d5863a..1cc8c929a 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -8,19 +8,19 @@ #include "triton/ir/print.h" #include "triton/driver/module.h" #include "triton/driver/kernel.h" -#include "triton/codegen/selection.h" -#include "triton/codegen/tune.h" -#include "triton/codegen/optimize_dot.h" -#include "triton/codegen/optimize_dce.h" -#include "triton/codegen/optimize_trans.h" -#include "triton/codegen/shmem_allocation.h" -#include "triton/codegen/shmem_liveness.h" -#include "triton/codegen/shmem_info.h" -#include "triton/codegen/shmem_barriers.h" -#include "triton/codegen/alignment_info.h" -#include "triton/codegen/reassociate.h" -#include "triton/codegen/target.h" -#include "triton/codegen/vectorize.h" +#include "triton/codegen/selection/selection.h" +#include "triton/codegen/selection/target.h" +#include "triton/codegen/analysis/tune.h" +#include "triton/codegen/analysis/shmem/allocation.h" +#include "triton/codegen/analysis/shmem/liveness.h" +#include "triton/codegen/analysis/shmem/info.h" +#include "triton/codegen/analysis/alignment.h" +#include "triton/codegen/transform/dot.h" +#include "triton/codegen/transform/dce.h" +#include "triton/codegen/transform/trans.h" +#include "triton/codegen/transform/shmem/barriers.h" +#include "triton/codegen/transform/reassociate.h" +#include "triton/codegen/transform/vectorize.h" #include "triton/runtime/launch_info.h" #include @@ -35,8 +35,10 @@ class translation_unit; } namespace codegen{ +namespace analysis{ class tune; } +} namespace ir { class module; @@ -90,18 +92,18 @@ public: // ir::print(module, std::cout); } - codegen::tune tune; - codegen::shmem_info shmem_info; - codegen::shmem_liveness shmem_liveness; - 
codegen::shmem_allocation shmem_allocation; - codegen::shmem_barriers shmem_barriers; - codegen::vectorize vectorize; codegen::selection selection; - codegen::optimize_dot optimize_dot; - codegen::optimize_dce optimize_dce; - codegen::optimize_trans optimize_trans; - codegen::alignment_info alignment_info; - codegen::reassociate reassociate; + codegen::analysis::tune tune; + codegen::analysis::shmem::info shmem_info; + codegen::analysis::shmem::liveness shmem_liveness; + codegen::analysis::shmem::allocation shmem_allocation; + codegen::analysis::alignment_info alignment_info; + codegen::transform::shmem_barriers shmem_barriers; + codegen::transform::vectorize vectorize; + codegen::transform::optimize_dot optimize_dot; + codegen::transform::optimize_dce optimize_dce; + codegen::transform::optimize_trans optimize_trans; + codegen::transform::reassociate reassociate; codegen::target* target_; }; diff --git a/include/triton/tools/bench.hpp b/include/triton/tools/bench.hpp index 0ebf7f360..74053b717 100644 --- a/include/triton/tools/bench.hpp +++ b/include/triton/tools/bench.hpp @@ -38,8 +38,8 @@ inline double bench(std::function const & op, driver::stream * stream) while(total_time*1e-9 < 1e-3){ float norm = 1; // normalize clock if possible to reduce noise in auto-tuning - if(auto cu_device = dynamic_cast(device)) - norm = (float)cu_device->current_sm_clock()/cu_device->max_sm_clock(); +// if(auto cu_device = dynamic_cast(device)) +// norm = (float)cu_device->current_sm_clock()/cu_device->max_sm_clock(); tmr.start(); op(); stream->synchronize(); diff --git a/lib/codegen/alignment_info.cpp b/lib/codegen/analysis/alignment.cpp similarity index 99% rename from lib/codegen/alignment_info.cpp rename to lib/codegen/analysis/alignment.cpp index ed20e01fc..3ed74f7a3 100644 --- a/lib/codegen/alignment_info.cpp +++ b/lib/codegen/analysis/alignment.cpp @@ -1,4 +1,4 @@ -#include "triton/codegen/alignment_info.h" +#include "triton/codegen/analysis/alignment.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" @@ -7,6 +7,7 @@ namespace triton { namespace codegen{ +namespace analysis{ inline int gcd(int a, int b) { @@ -310,3 +311,4 @@ void alignment_info::run(ir::module &mod) { } } +} diff --git a/lib/codegen/shmem_allocation.cpp b/lib/codegen/analysis/shmem/allocation.cpp similarity index 93% rename from lib/codegen/shmem_allocation.cpp rename to lib/codegen/analysis/shmem/allocation.cpp index 042f9a19d..1a2d69536 100644 --- a/lib/codegen/shmem_allocation.cpp +++ b/lib/codegen/analysis/shmem/allocation.cpp @@ -1,7 +1,7 @@ -#include "triton/codegen/shmem_allocation.h" -#include "triton/codegen/shmem_liveness.h" -#include "triton/codegen/shmem_info.h" -#include "triton/codegen/tune.h" +#include "triton/codegen/analysis/shmem/allocation.h" +#include "triton/codegen/analysis/shmem/liveness.h" +#include "triton/codegen/analysis/shmem/info.h" +#include "triton/codegen/analysis/tune.h" #include "triton/ir/basic_block.h" #include "triton/ir/type.h" #include "triton/ir/value.h" @@ -10,8 +10,10 @@ namespace triton{ namespace codegen{ +namespace analysis{ +namespace shmem{ -unsigned shmem_allocation::is_ld_padded(ir::value *x) { +unsigned allocation::is_ld_padded(ir::value *x) { if(auto *trans = dynamic_cast(x)){ if(trans->get_perm()[0]->get_value() != 0) return 4; @@ -43,7 +45,7 @@ unsigned shmem_allocation::is_ld_padded(ir::value *x) { return 0; } -unsigned shmem_allocation::get_num_bytes(ir::value *x) { +unsigned allocation::get_num_bytes(ir::value *x) { if(auto *red 
= dynamic_cast(x)){ unsigned num_bytes = x->get_type()->get_scalar_ty()->get_primitive_size_in_bits() / 8; size_t axis = red->get_axis(); @@ -71,7 +73,7 @@ unsigned shmem_allocation::get_num_bytes(ir::value *x) { return num_bytes; } -void shmem_allocation::run(){ +void allocation::run(){ using std::max; using std::min; typedef std::multimap triples_map_type; @@ -174,3 +176,5 @@ void shmem_allocation::run(){ } } +} +} diff --git a/lib/codegen/shmem_info.cpp b/lib/codegen/analysis/shmem/info.cpp similarity index 91% rename from lib/codegen/shmem_info.cpp rename to lib/codegen/analysis/shmem/info.cpp index 659afaa4a..63dbfb93f 100644 --- a/lib/codegen/shmem_info.cpp +++ b/lib/codegen/analysis/shmem/info.cpp @@ -1,4 +1,4 @@ -#include "triton/codegen/shmem_info.h" +#include "triton/codegen/analysis/shmem/info.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" @@ -8,10 +8,11 @@ namespace triton { namespace codegen{ - +namespace analysis{ +namespace shmem{ // run pass on module -bool shmem_info::is_loop_latch(ir::phi_node *phi, ir::instruction *terminator){ +bool info::is_loop_latch(ir::phi_node *phi, ir::instruction *terminator){ if(phi->get_parent() != terminator->get_parent()) return false; if(auto *br = dynamic_cast(terminator)) @@ -23,7 +24,7 @@ bool shmem_info::is_loop_latch(ir::phi_node *phi, ir::instruction *terminator){ throw std::runtime_error("unreachable"); } -void shmem_info::replace(ir::value* before, ir::value *after) { +void info::replace(ir::value* before, ir::value *after) { shared_.erase(before); shared_.insert(after); if(refs_.find(before) != refs_.end()){ @@ -70,7 +71,7 @@ void add_copy(ir::value *x, ir::builder &builder) { } } -void shmem_info::run(ir::module &mod) { +void info::run(ir::module &mod) { // Add shared copies for(ir::function *fn: mod.get_function_list()){ ir::builder builder(mod.get_context()); @@ -120,18 +121,20 @@ void shmem_info::run(ir::module &mod) { } // query double-buffered status -bool shmem_info::is_double(ir::value *x) +bool info::is_double(ir::value *x) { return double_.find(x) != double_.end(); } // query shared status -bool shmem_info::is_shared(ir::value *x) +bool info::is_shared(ir::value *x) { return shared_.find(x) != shared_.end(); } // get reference if any -ir::value *shmem_info::get_reference(ir::value *x) +ir::value *info::get_reference(ir::value *x) { return refs_[x]; } +} +} } } diff --git a/lib/codegen/shmem_liveness.cpp b/lib/codegen/analysis/shmem/liveness.cpp similarity index 84% rename from lib/codegen/shmem_liveness.cpp rename to lib/codegen/analysis/shmem/liveness.cpp index 4d8e9c66b..617a764ed 100644 --- a/lib/codegen/shmem_liveness.cpp +++ b/lib/codegen/analysis/shmem/liveness.cpp @@ -1,5 +1,5 @@ -#include "triton/codegen/shmem_liveness.h" -#include "triton/codegen/shmem_info.h" +#include "triton/codegen/analysis/shmem/liveness.h" +#include "triton/codegen/analysis/shmem/info.h" #include "triton/ir/basic_block.h" #include "triton/ir/function.h" #include "triton/ir/module.h" @@ -8,10 +8,11 @@ namespace triton{ namespace codegen{ - +namespace analysis{ +namespace shmem{ // Entry point -void shmem_liveness::run(ir::module &mod) { +void liveness::run(ir::module &mod) { for(ir::function *fn: mod.get_function_list()){ // Assigns index to each instruction slot_index index = 0; @@ -39,3 +40,5 @@ void shmem_liveness::run(ir::module &mod) { } } +} +} diff --git a/lib/codegen/tune.cpp b/lib/codegen/analysis/tune.cpp similarity index 98% rename from lib/codegen/tune.cpp rename to 
lib/codegen/analysis/tune.cpp index 39dea9f12..7fba702cc 100644 --- a/lib/codegen/tune.cpp +++ b/lib/codegen/analysis/tune.cpp @@ -1,4 +1,4 @@ -#include "triton/codegen/tune.h" +#include "triton/codegen/analysis/tune.h" #include "triton/ir/instructions.h" #include "triton/ir/type.h" #include "triton/ir/module.h" @@ -12,6 +12,7 @@ namespace triton{ namespace codegen{ +namespace analysis{ tune::tune(): num_global_ranges_(0){ } @@ -257,7 +258,7 @@ void tune::run(ir::module &mod) { ir::metaparameter *fpw = ir::metaparameter::create(ctx, ty, 2, 2); if(node.second == 2) fpw->set_value(1); - ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 2, 4); + ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 1, 4); connected_components(node, {fpw, wpt}, {"fpw", "wpt"}, nodes_, dependencies_, group_id++); } } @@ -270,7 +271,8 @@ void tune::run(ir::module &mod) { continue; auto shapes = i->get_type()->get_tile_shapes(); - if(auto *x = dynamic_cast(i)){ + if(auto *x = dynamic_cast(i)) + if(fragments_.at({i, 0}) == STRIDED_SCAN){ ir::type *ptr_ty = x->get_pointer_operand()->get_type()->get_scalar_ty(); size_t addr_space = ptr_ty->get_pointer_address_space(); if(addr_space < 4){ @@ -452,3 +454,4 @@ unsigned tune::get_num_threads() { } } +} diff --git a/lib/codegen/selection.cpp b/lib/codegen/selection/selection.cpp similarity index 98% rename from lib/codegen/selection.cpp rename to lib/codegen/selection/selection.cpp index 86bee2932..491ff870f 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection/selection.cpp @@ -1,15 +1,15 @@ -#include "triton/codegen/selection.h" -#include "triton/codegen/tune.h" -#include "triton/codegen/shmem_allocation.h" -#include "triton/codegen/target.h" -#include "triton/codegen/alignment_info.h" -#include "llvm/IR/InstrTypes.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/IRBuilder.h" +#include "triton/codegen/selection/selection.h" +#include "triton/codegen/analysis/tune.h" +#include "triton/codegen/analysis/shmem/allocation.h" +#include "triton/codegen/selection/target.h" +#include "triton/codegen/analysis/alignment.h" #include "triton/ir/context.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/type.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/Transforms/Scalar/EarlyCSE.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -485,7 +485,7 @@ inline void to_warps(const std::vector &bs, std::vector &nw, void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { const auto& shapes = v->get_type()->get_tile_shapes(); size_t dim = shapes.size(); - if(params_->get_fragment(v, 0) == tune::STRIDED_SCAN){ + if(params_->get_fragment(v, 0) == analysis::tune::STRIDED_SCAN){ std::vector contiguous(dim); std::vector block_size(dim); std::vector warp_size(dim); @@ -1022,7 +1022,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & { shared_tile *TA = (shared_tile*)tmap_.at(A); shared_tile *TB = (shared_tile*)tmap_.at(B); - if(params_->get_fragment(ins, 0) == tune::STRIDED_SCAN) { + if(params_->get_fragment(ins, 0) == analysis::tune::STRIDED_SCAN) { TA->set_vector_size(TC->axis(0).contiguous); TB->set_vector_size(TC->axis(1).contiguous); result->for_each([&](indices_t idx){ @@ -1047,8 +1047,6 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & a = builder.CreateFPCast(a, c_ty); if(b->getType() != c_ty) b = 
builder.CreateFPCast(b, c_ty); -// a = ConstantFP::get(builder.getFloatTy(), 1); -// b = ConstantFP::get(builder.getFloatTy(), 1); res = builder.CreateCall(f_mul_add, {a, b, res}); } result->set_value(idx, res); @@ -1060,13 +1058,14 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & TA->set_return_mode(true); TB->set_return_mode(true); - std::map> fcs; + std::map, std::vector> fcs; result->for_each([&](indices_t idx){ - fcs[{builder.getInt32(0)}].push_back(TC->get_value(idx)); + std::vector key(idx.size() - 2); + std::copy(idx.begin() + 2, idx.end(), key.begin()); + fcs[key].push_back(TC->get_value(idx)); }); - Type *fp32_ty = builder.getFloatTy(); Type *fp16x2_ty = VectorType::get(builder.getHalfTy(), 2); Type *fp32_pack8_ty = StructType::get(ctx, {fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}); @@ -1122,8 +1121,8 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & std::swap(idx_a[0], idx_a[1]); if(!dot->is_b_trans()) std::swap(idx_b[0], idx_b[1]); -// idx_a.push_back(builder.getInt32(0)); -// idx_b.push_back(builder.getInt32(0)); + idx_a.insert(idx_a.end(), x.first.begin(), x.first.end()); + idx_b.insert(idx_b.end(), x.first.begin(), x.first.end()); Value *ha = TA->get_value(idx_a); Value *hb = TB->get_value(idx_b); for(unsigned ii = 0; ii < pack_size_0_; ii++) @@ -1159,9 +1158,11 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & // write back unsigned i = 0; result->for_each([&](indices_t idx){ - if(i >= fcs.at({builder.getInt32(0)}).size()) + std::vector key(idx.size() - 2); + std::copy(idx.begin() + 2, idx.end(), key.begin()); + if(i >= fcs.at(key).size()) i = 0; - result->set_value(idx, fcs.at({builder.getInt32(0)})[i++]); + result->set_value(idx, fcs.at(key)[i++]); }); TA->set_return_mode(false); diff --git a/lib/codegen/target.cpp b/lib/codegen/selection/target.cpp similarity index 99% rename from lib/codegen/target.cpp rename to lib/codegen/selection/target.cpp index 4116bcca7..3a5e35aa1 100644 --- a/lib/codegen/target.cpp +++ b/lib/codegen/selection/target.cpp @@ -1,4 +1,4 @@ -#include "triton/codegen/target.h" +#include "triton/codegen/selection/target.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Function.h" #include "llvm/IR/Intrinsics.h" diff --git a/lib/codegen/optimize_dce.cpp b/lib/codegen/transform/dce.cpp similarity index 96% rename from lib/codegen/optimize_dce.cpp rename to lib/codegen/transform/dce.cpp index 8caf22a62..d11caf55f 100644 --- a/lib/codegen/optimize_dce.cpp +++ b/lib/codegen/transform/dce.cpp @@ -2,10 +2,11 @@ #include "triton/ir/basic_block.h" #include "triton/ir/module.h" #include "triton/ir/cfg.h" -#include "triton/codegen/optimize_dce.h" +#include "triton/codegen/transform/dce.h" namespace triton { namespace codegen{ +namespace transform{ void optimize_dce::run(ir::module &mod) { @@ -60,3 +61,4 @@ void optimize_dce::run(ir::module &mod) { } } +} diff --git a/lib/codegen/optimize_dot.cpp b/lib/codegen/transform/dot.cpp similarity index 97% rename from lib/codegen/optimize_dot.cpp rename to lib/codegen/transform/dot.cpp index 00893c1a5..fa1a542f0 100644 --- a/lib/codegen/optimize_dot.cpp +++ b/lib/codegen/transform/dot.cpp @@ -1,11 +1,12 @@ #include "triton/ir/function.h" #include "triton/ir/basic_block.h" #include "triton/ir/module.h" -#include "triton/codegen/optimize_dot.h" -#include "triton/codegen/tune.h" +#include "triton/codegen/transform/dot.h" +#include "triton/codegen/analysis/tune.h" namespace triton { namespace 
codegen{ +namespace transform{ inline bool is_trans(ir::value *v){ auto *x = dynamic_cast(v); @@ -109,3 +110,4 @@ void optimize_dot::run(ir::module &mod) { } } +} diff --git a/lib/codegen/reassociate.cpp b/lib/codegen/transform/reassociate.cpp similarity index 97% rename from lib/codegen/reassociate.cpp rename to lib/codegen/transform/reassociate.cpp index d0a54ec31..a1594fb17 100644 --- a/lib/codegen/reassociate.cpp +++ b/lib/codegen/transform/reassociate.cpp @@ -1,15 +1,16 @@ #include -#include "triton/codegen/reassociate.h" -#include "triton/codegen/alignment_info.h" +#include "triton/codegen/transform/reassociate.h" +#include "triton/codegen/analysis/alignment.h" +#include "triton/codegen/analysis/tune.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" #include "triton/ir/instructions.h" #include "triton/ir/cfg.h" -#include "triton/codegen/tune.h" namespace triton { namespace codegen{ +namespace transform{ //inline Constant *get_gep_cst_offset(GetElementPtrInst *gep){ // std::vector idx_vals; @@ -154,7 +155,7 @@ ir::value *reassociate::reassociate_idx(ir::value *old_value, return new_value; } -reassociate::reassociate(tune* params, alignment_info* align) +reassociate::reassociate(analysis::tune* params, analysis::alignment_info* align) : params_(params), align_(align) { } @@ -280,3 +281,4 @@ void reassociate::run(ir::module &mod) { } } +} diff --git a/lib/codegen/shmem_barriers.cpp b/lib/codegen/transform/shmem/barriers.cpp similarity index 96% rename from lib/codegen/shmem_barriers.cpp rename to lib/codegen/transform/shmem/barriers.cpp index 717b927fd..be0875b96 100644 --- a/lib/codegen/shmem_barriers.cpp +++ b/lib/codegen/transform/shmem/barriers.cpp @@ -1,7 +1,7 @@ #include -#include "triton/codegen/shmem_barriers.h" -#include "triton/codegen/shmem_allocation.h" -#include "triton/codegen/shmem_info.h" +#include "triton/codegen/transform/shmem/barriers.h" +#include "triton/codegen/analysis/shmem/allocation.h" +#include "triton/codegen/analysis/shmem/info.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" @@ -11,6 +11,7 @@ namespace triton { namespace codegen{ +namespace transform{ bool shmem_barriers::intersect(const interval_vec_t &X, interval_t x) { return std::any_of(X.begin(), X.end(), [&](const interval_t &y){ @@ -137,3 +138,4 @@ void shmem_barriers::run(ir::module &mod) { } } +} diff --git a/lib/codegen/optimize_trans.cpp b/lib/codegen/transform/trans.cpp similarity index 97% rename from lib/codegen/optimize_trans.cpp rename to lib/codegen/transform/trans.cpp index b1e0dc4b9..43cba99b7 100644 --- a/lib/codegen/optimize_trans.cpp +++ b/lib/codegen/transform/trans.cpp @@ -1,9 +1,10 @@ #include "triton/ir/module.h" #include "triton/ir/function.h" -#include "triton/codegen/optimize_trans.h" +#include "triton/codegen/transform/trans.h" namespace triton { namespace codegen{ +namespace transform{ ir::value* optimize_trans::replace_phi(ir::value* value, @@ -73,3 +74,4 @@ void optimize_trans::run(ir::module &mod) { } } +} diff --git a/lib/codegen/vectorize.cpp b/lib/codegen/transform/vectorize.cpp similarity index 91% rename from lib/codegen/vectorize.cpp rename to lib/codegen/transform/vectorize.cpp index 8ec5a99f6..dbf7ee7f1 100644 --- a/lib/codegen/vectorize.cpp +++ b/lib/codegen/transform/vectorize.cpp @@ -1,5 +1,5 @@ -#include "triton/codegen/vectorize.h" -#include "triton/codegen/tune.h" +#include "triton/codegen/transform/vectorize.h" +#include "triton/codegen/analysis/tune.h" 
#include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" @@ -8,6 +8,7 @@ namespace triton { namespace codegen{ +namespace transform{ void vectorize::run(ir::module &mod) { ir::builder &builder = mod.get_builder(); @@ -39,3 +40,4 @@ void vectorize::run(ir::module &mod) { } } +} diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index 86d031564..dae023eef 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -62,7 +62,8 @@ std::pair base::get_profile_impl(driver::stream *stream, std::v jit->add_module(name_.c_str(), src.c_str(), best.params); } else{ - params_t params = heuristics(); +// params_t params = heuristics(); + params_t params = {4, 2, 16, 4, 4, 16, 2, 2, 1, 1, 1, 8, 64, 8, 8, 1, 4, 2, 1}; jit->add_module(name_.c_str(), src.c_str(), params); } triton::driver::kernel* kernel = jit->get_function(name_.c_str()); diff --git a/lib/dnn/blocksparse/dot.cpp b/lib/dnn/blocksparse/dot.cpp index 9c7fd95d9..054904f27 100644 --- a/lib/dnn/blocksparse/dot.cpp +++ b/lib/dnn/blocksparse/dot.cpp @@ -96,7 +96,7 @@ void dot::triton_c_src(std::ostream &os) const { restrict read_only align(16) )" + ab_ty_ + R"( *B, )" + c_ty_ + R"(* C, int lda, int ldc, int N, - int* lut, int* locks, int nlocks){ + int* lut, int* locks, int nlocks) { int ridx = get_range_id(0); int ridy = get_range_id(1); float acc[TM, TN] = 0; @@ -129,10 +129,10 @@ void dot::triton_c_src(std::ostream &os) const { )" + c_ty_ + R"(" c[TM, TN] = acc; )" + c_ty_ + R"(* pc[TM, TN] = C + rxc[:, newaxis] + ryc[newaxis, :]*ldc; bool checkc[TM, TN] = (rxc < N)[:, newaxis]; - if(lockid == 0) + if(lockid == 0) { @checkc *pc = c; - else - { + } + else { int *plock = locks + ridx*nlocks + lockid - 1; int *pcount = plock + get_num_program(0)*nlocks; while(__atomic_cas(plock, 0, 1)); @@ -147,10 +147,11 @@ void dot::triton_c_src(std::ostream &os) const { __atomic_exch(plock, 0); } })"; - os << result; } + + } } } diff --git a/lib/dnn/dot.cpp b/lib/dnn/dot.cpp index 5465b26b9..22198b7af 100644 --- a/lib/dnn/dot.cpp +++ b/lib/dnn/dot.cpp @@ -75,14 +75,14 @@ void dot::triton_c_src(std::ostream &os) const { std::string ZS = "1"; std::string AS0 = "TM", AS1 = "TK"; std::string BS0 = "TK", BS1 = "TN"; - std::string XAS0 = "TM", XAS1 = "TK", XAS2 = ZS; - std::string XBS0 = "TK", XBS1 = ZS, XBS2 = "TN"; + std::string XAS0 = "TM", XAS1 = "TK / " + ZS, XAS2 = ZS; + std::string XBS0 = "TK / " + ZS, XBS1 = ZS, XBS2 = "TN"; std::string bca0 = "[newaxis, :]", bca1 = "[:, newaxis]"; std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; std::string lda0 = "*lda", lda1 = ""; std::string ldb0 = "", ldb1 = "*ldb"; - std::string usea = AT_ ? "trans(xa, 2, 0, 1)" : "xa"; - std::string useb = BT_ ? "trans(xb, 1, 0, 2)" : "trans(xb, 0, 2, 1)"; + std::string usea = AT_ ? "trans(a)" : "a"; + std::string useb = BT_ ? 
"trans(b)" : "b"; if(AT_){ std::swap(AS0, AS1); std::swap(XAS0, XAS1); @@ -99,15 +99,15 @@ void dot::triton_c_src(std::ostream &os) const { } std::string AS = AS0 + ", " + AS1; std::string BS = BS0 + ", " + BS1; - std::string XAS = XAS0 + ", " + XAS1 + ", " + XAS2; - std::string XBS = XBS0 + ", " + XBS1 + ", " + XBS2; - std::string XCS = "TM, TN, " + ZS; +// std::string XAS = XAS0 + ", " + XAS1 + ", " + XAS2; +// std::string XBS = XBS0 + ", " + XBS1 + ", " + XBS2; + std::string XCS = "TM, TN"; std::string align_lda_str = "multiple_of(" + std::to_string(align_lda_) + ")"; std::string align_ldb_str = "multiple_of(" + std::to_string(align_ldb_) + ")"; std::string res = R"( -const tunable int TM = {16, 32, 64, 128}; -const tunable int TN = {16, 32, 64, 128}; +const tunable int TM = {32}; +const tunable int TN = {32}; const tunable int TK = {32}; const tunable int GZ = {1}; @@ -131,8 +131,6 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, bool checkb[)" + BS + R"(] = (rkb < K))" + bcb0 + " && (ryb < N)" + bcb1 + R"(; )" + a_ty_ + R"( a[)" + AS + R"(] = checka ? *pa : 0; )" + b_ty_ + R"( b[)" + BS + R"(] = checkb ? *pb : 0; - )" + a_ty_ + R"( xa[)" + XAS + "] = __reshape(a, " + XAS + R"(); - )" + b_ty_ + R"( xb[)" + XBS + "] = __reshape(b, " + XBS + R"(); for(int k = K; k > 0; k = k - TK){ xc = dot()" + usea + ", " + useb + R"(, xc); pa = pa + TK)" + lda0 + R"(; @@ -141,13 +139,11 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, bool checkb[)" + BS + R"(] = k > TK; a = checka ? *pa : 0; b = checkb ? *pb : 0; - xa = __reshape(a, )" + XAS + R"(); - xb = __reshape(b, )" + XBS + R"(); } int rxc[TM] = ridx * TM + (0 ... TM); int ryc[TN] = ridy * TN + (0 ... TN); )" + c_ty_ + R"(* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - )" + c_ty_ + R"( c[TM, TN] = __sum(xc, 2); + )" + c_ty_ + R"( c[TM, TN] = xc; bool checkc0[TM] = rxc < M; bool checkc1[TN] = ryc < N; bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; diff --git a/lib/driver/device.cpp b/lib/driver/device.cpp index ae66c50c8..41a9561eb 100755 --- a/lib/driver/device.cpp +++ b/lib/driver/device.cpp @@ -28,7 +28,7 @@ #include "triton/driver/helpers/CL/infos.hpp" #include "triton/driver/device.h" #include "triton/driver/context.h" -#include "triton/codegen/target.h" +#include "triton/codegen/selection/target.h" namespace triton { diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index 1f6a60ccd..141bb8054 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -1,6 +1,6 @@ #include #include "triton/lang/lang.h" -#include "triton/codegen/target.h" +#include "triton/codegen/selection/target.h" #include "triton/ir/context.h" #include "triton/ir/context_impl.h" #include "triton/driver/device.h" From f93099bda13b698272cceca9820ba4821d3c2e42 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 7 Aug 2019 21:50:16 -0700 Subject: [PATCH 289/494] [codegen][transform][trans] fixed incorrect replace_all_uses_with --- examples/cpp/dot.cpp | 2 +- include/triton/runtime/jit.h | 11 +++++------ lib/codegen/transform/trans.cpp | 4 +--- lib/dnn/base.cpp | 3 +-- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index ef73a7581..20b5bc72f 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -49,7 +49,7 @@ perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int stream->synchronize(); triton::dnn::dot dot(M, N, K, AT, BT, ty, ty, ty, 8, 8, 8); // benchmark triton - double triton_ns = 
triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::FULL_TUNING);}, stream); + double triton_ns = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::NO_TUNING);}, stream); // benchmark cublas // NumericT alpha = 1; // NumericT beta = 0; diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index 1cc8c929a..f43f94e8f 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -65,7 +65,7 @@ public: vectorize(&tune), selection(&shmem_allocation, &tune, &shmem_info, &alignment_info, target), optimize_dot(&tune), - optimize_dce(), + dce(), optimize_trans(), alignment_info(), reassociate(&tune, &alignment_info), @@ -73,9 +73,9 @@ public: void target_independent(ir::module &module) { optimize_dot.run(module); - optimize_dce.run(module); optimize_trans.run(module); - optimize_dce.run(module); + dce.run(module); +// ir::print(module, std::cout); } void target_dependent(ir::module &module) { @@ -88,8 +88,7 @@ public: shmem_barriers.run(module); } vectorize.run(module); - optimize_dce.run(module); -// ir::print(module, std::cout); + dce.run(module); } codegen::selection selection; @@ -101,7 +100,7 @@ public: codegen::transform::shmem_barriers shmem_barriers; codegen::transform::vectorize vectorize; codegen::transform::optimize_dot optimize_dot; - codegen::transform::optimize_dce optimize_dce; + codegen::transform::optimize_dce dce; codegen::transform::optimize_trans optimize_trans; codegen::transform::reassociate reassociate; codegen::target* target_; diff --git a/lib/codegen/transform/trans.cpp b/lib/codegen/transform/trans.cpp index 43cba99b7..4edfa6a59 100644 --- a/lib/codegen/transform/trans.cpp +++ b/lib/codegen/transform/trans.cpp @@ -17,10 +17,9 @@ ir::value* optimize_trans::replace_phi(ir::value* value, incs.push_back(replace_phi(phi->get_incoming_value(n), builder, perm)); // create phi for transposed values builder.set_insert_point(phi); - ir::phi_node* result = builder.create_phi(incs[0]->get_type(), incs.size(), phi->get_name()); + ir::phi_node* result = builder.create_phi(incs[0]->get_type(), incs.size()); for(unsigned n = 0; n < phi->get_num_incoming(); n++) result->add_incoming(incs[n], phi->get_incoming_block(n)); - phi->replace_all_uses_with(result); return result; } else if(auto i = dynamic_cast(value)){ @@ -29,7 +28,6 @@ ir::value* optimize_trans::replace_phi(ir::value* value, it++; builder.set_insert_point(it); ir::instruction *trans = (ir::instruction*)builder.create_trans(i, perm); - i->replace_all_uses_with(trans); trans->set_operand(0, i); return trans; } diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp index dae023eef..86d031564 100644 --- a/lib/dnn/base.cpp +++ b/lib/dnn/base.cpp @@ -62,8 +62,7 @@ std::pair base::get_profile_impl(driver::stream *stream, std::v jit->add_module(name_.c_str(), src.c_str(), best.params); } else{ -// params_t params = heuristics(); - params_t params = {4, 2, 16, 4, 4, 16, 2, 2, 1, 1, 1, 8, 64, 8, 8, 1, 4, 2, 1}; + params_t params = heuristics(); jit->add_module(name_.c_str(), src.c_str(), params); } triton::driver::kernel* kernel = jit->get_function(name_.c_str()); From fd49cdc92be0f2c582fa7c2273c24e2ac16e1284 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 8 Aug 2019 19:14:30 -0700 Subject: [PATCH 290/494] [dnn][blocksparse] added dw code --- examples/cpp/dot.cpp | 4 +- examples/python/tensorflow/blocksparse.cpp | 146 +++++++++++++++---- include/triton/dnn/blocksparse/dot.h | 3 + include/triton/runtime/jit.h | 2 +- lib/codegen/analysis/tune.cpp | 
14 +- lib/codegen/selection/selection.cpp | 3 +- lib/dnn/blocksparse/dot.cpp | 157 ++++++++++++++++----- lib/dnn/dot.cpp | 4 +- lib/runtime/jit.cpp | 6 +- 9 files changed, 256 insertions(+), 83 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 20b5bc72f..e9ad43f71 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -49,7 +49,7 @@ perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int stream->synchronize(); triton::dnn::dot dot(M, N, K, AT, BT, ty, ty, ty, 8, 8, 8); // benchmark triton - double triton_ns = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::NO_TUNING);}, stream); + double triton_ns = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::FULL_TUNING);}, stream); // benchmark cublas // NumericT alpha = 1; // NumericT beta = 0; @@ -111,7 +111,7 @@ int main() { // shapes to benchmark std::vector configs = { // {false, false, 8192, 512, 512}, - {false, true, 128, 128, 128} + {false, true, 64, 64, 128} // {false, true, 128, 128, 128}, // {false, false, 128, 128, 128}, // {true, false, 128, 128, 128}, diff --git a/examples/python/tensorflow/blocksparse.cpp b/examples/python/tensorflow/blocksparse.cpp index e2a0b5144..1ff5e9f6f 100644 --- a/examples/python/tensorflow/blocksparse.cpp +++ b/examples/python/tensorflow/blocksparse.cpp @@ -46,6 +46,19 @@ Status XpropShape(InferenceContext* ctx) return Status::OK(); } +Status UpdatShape(InferenceContext* ctx) +{ + //printf("UpdatShape: %d\n", ctx->Rank(ctx->input(0))); + + int blocks, bsize; + TF_RETURN_IF_ERROR(ctx->GetAttr("blocks", &blocks)); + TF_RETURN_IF_ERROR(ctx->GetAttr("bsize", &bsize)); + + // (blocks, block_size, block_size) + DimensionHandle bsize_dim = ctx->MakeDim(bsize); + ctx->set_output(0, ctx->MakeShape({ ctx->MakeDim(blocks), bsize_dim, bsize_dim })); + return Status::OK(); +} typedef struct bsmm_params { @@ -72,34 +85,46 @@ typedef struct bsmm_params template class BlocksparseMatmulOp : public OpKernel { -public: - explicit BlocksparseMatmulOp(OpKernelConstruction* ctx) : OpKernel(ctx) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("segments", ¶ms_.segments)); - OP_REQUIRES_OK(ctx, ctx->GetAttr("locks", ¶ms_.locks )); - OP_REQUIRES_OK(ctx, ctx->GetAttr("blocks", ¶ms_.blocks )); - OP_REQUIRES_OK(ctx, ctx->GetAttr("bsize", ¶ms_.bsize )); - OP_REQUIRES_OK(ctx, ctx->GetAttr("C", ¶ms_.C )); - OP_REQUIRES_OK(ctx, ctx->GetAttr("K", ¶ms_.K )); - OP_REQUIRES_OK(ctx, ctx->GetAttr("shared", ¶ms_.shared )); - OP_REQUIRES_OK(ctx, ctx->GetAttr("alpha", ¶ms_.alpha )); - OP_REQUIRES_OK(ctx, ctx->GetAttr("beta", ¶ms_.beta )); - OP_REQUIRES_OK(ctx, ctx->GetAttr("gated_dw", &gated_dw_ )); - OP_REQUIRES_OK(ctx, ctx->GetAttr("axis", &axis_ )); - OP_REQUIRES_OK(ctx, ctx->GetAttr("bench", &bench_)); - OP_REQUIRES(ctx, params_.K < params_.bsize*65536, errors::InvalidArgument("K < bsize*65536")); - OP_REQUIRES(ctx, params_.C < params_.bsize*65536, errors::InvalidArgument("C < bsize*65536")); - params_.pcount = 1; - params_.blk_A = 0; - is_gpu_ = ctx->device_type() == DEVICE_GPU; - if (bench_) { - repeat_ = bench_; - flops_ = (float)(params_.blocks * params_.bsize*params_.bsize); - const char* op = "FPROP"; - sprintf(bench_string_, "%s %02d-%d C:%05d K:%05d blks:%d", op, params_.bsize, axis_, params_.C, params_.K, params_.blocks); - } +private: + void ComputeDw(OpKernelContext* context){ + // get device/stream + GPUDevice device = context->eigen_device(); + triton::driver::cu_stream sstream(device.stream(), false); + triton::driver::context* 
ctx = sstream.context(); + triton::driver::stream* stream = &sstream; + // extract input + OpInputList x, dy, gate; + context->input_list( "x", &x); + context->input_list( "dy", &dy); + context->input_list("gate", &gate); + // sanity checks + params_.pcount = x.size(); + if (params_.pcount > 1) + errors::Internal("No more than 1 input allowed."); + if (params_.beta != 0.0f || params_.alpha != 1.0f) + errors::Internal("Not supported yet"); + // N + int N = 1; + int rank = x[0].dims(); + for (int i = 0; i < rank; i++) + if (i != axis_) + N *= x[0].dim_size(i); + // allocate output + Tensor* C; + TensorShape shapeC({ params_.blocks, params_.bsize, params_.bsize }); + OP_REQUIRES_OK(context, context->allocate_output(0, shapeC, &C)); + // wrap tensorflow handles + triton::driver::cu_buffer da(ctx, x[0].tensor_data().size(), (CUdeviceptr)x[0].tensor_data().data(), false); + triton::driver::cu_buffer db(ctx, dy[0].tensor_data().size(), (CUdeviceptr)dy[0].tensor_data().data(), false); + triton::driver::cu_buffer dc(ctx, C->tensor_data().size(), (CUdeviceptr)C->tensor_data().data(), false); + triton::driver::cu_buffer dlut(ctx, context->input(params_.pcount*2).tensor_data().size(), (CUdeviceptr)context->input(params_.pcount*2).tensor_data().data(), false); + // create profile + triton::dnn::blocksparse::dot dot(N, params_.K, params_.segments, params_.C, "half", params_.bsize, params_.locks, params_.blocks, OP); + // enqueue + dot.enqueue(stream, {&da, &db, &dc, &dlut}, triton::dnn::FULL_TUNING); } - void Compute(OpKernelContext* context){ + void ComputeYDx(OpKernelContext* context){ // get device/stream GPUDevice device = context->eigen_device(); triton::driver::cu_stream sstream(device.stream(), false); @@ -129,8 +154,8 @@ public: triton::driver::cu_buffer dlut(ctx, lut.tensor_data().size(), (CUdeviceptr)lut.tensor_data().data(), false); // create profile triton::dnn::blocksparse::dot dot(N, params_.K, params_.segments, params_.C, "half", params_.bsize, params_.locks, params_.blocks, OP); - // blocksparse matmul - triton::dnn::base* op = dot.enqueue(stream, {&da, &db, &dc, &dlut}, triton::dnn::PARTIAL_TUNING); + // enqueue + triton::dnn::base* op = dot.enqueue(stream, {&da, &db, &dc, &dlut}, triton::dnn::NO_TUNING); triton::driver::buffer* locks_buffer = ((triton::dnn::blocksparse::dot*)op)->get_locks(); Tensor *tmp = nullptr; TensorShape tmp_shapes; @@ -138,6 +163,41 @@ public: OP_REQUIRES_OK(context, context->allocate_output(1, tmp_shapes, &tmp)); } +public: + + explicit BlocksparseMatmulOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("segments", ¶ms_.segments)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("locks", ¶ms_.locks )); + OP_REQUIRES_OK(ctx, ctx->GetAttr("blocks", ¶ms_.blocks )); + OP_REQUIRES_OK(ctx, ctx->GetAttr("bsize", ¶ms_.bsize )); + OP_REQUIRES_OK(ctx, ctx->GetAttr("C", ¶ms_.C )); + OP_REQUIRES_OK(ctx, ctx->GetAttr("K", ¶ms_.K )); + OP_REQUIRES_OK(ctx, ctx->GetAttr("shared", ¶ms_.shared )); + OP_REQUIRES_OK(ctx, ctx->GetAttr("alpha", ¶ms_.alpha )); + OP_REQUIRES_OK(ctx, ctx->GetAttr("beta", ¶ms_.beta )); + OP_REQUIRES_OK(ctx, ctx->GetAttr("gated_dw", &gated_dw_ )); + OP_REQUIRES_OK(ctx, ctx->GetAttr("axis", &axis_ )); + OP_REQUIRES_OK(ctx, ctx->GetAttr("bench", &bench_)); + OP_REQUIRES(ctx, params_.K < params_.bsize*65536, errors::InvalidArgument("K < bsize*65536")); + OP_REQUIRES(ctx, params_.C < params_.bsize*65536, errors::InvalidArgument("C < bsize*65536")); + params_.pcount = 1; + params_.blk_A = 0; + is_gpu_ = ctx->device_type() == DEVICE_GPU; + 
if (bench_) { + repeat_ = bench_; + flops_ = (float)(params_.blocks * params_.bsize*params_.bsize); + const char* op = "FPROP"; + sprintf(bench_string_, "%s %02d-%d C:%05d K:%05d blks:%d", op, params_.bsize, axis_, params_.C, params_.K, params_.blocks); + } + } + + void Compute(OpKernelContext* context) override{ + if(OP == triton::dnn::blocksparse::WGRAD) + ComputeDw(context); + else + ComputeYDx(context); + } + private: bsmm_params params_; int axis_, bench_, repeat_, SMs_, major_, grid_n_; @@ -212,3 +272,33 @@ Multiply the matrix "a" by the blocksparse matrix "b". REGISTER_KERNEL_BUILDER(Name("TritonBlocksparseMatmulDX").Device(DEVICE_GPU).TypeConstraint("T"),BlocksparseMatmulOp); REGISTER_KERNEL_BUILDER(Name("TritonBlocksparseMatmulDX").Device(DEVICE_GPU).TypeConstraint("T"),BlocksparseMatmulOp); + +REGISTER_OP("TritonBlocksparseMatmulDW") + .Input("x: params * T") + .Input("dy: params * T") + .Input("lut: int64") + .Input("gate: ngate * float") + .Output("dw: T") + .Attr("T: {half, float, bfloat16}") + .Attr("params: int") + .Attr("blocks: int >=0") + .Attr("bsize: int") + .Attr("segments: int = 0") + .Attr("locks: int = 0") + .Attr("axis: int = 1") + .Attr("C: int >=0") + .Attr("K: int >=0") + .Attr("shared: int = 0") + .Attr("alpha: float = 1.0") + .Attr("beta: float = 0.0") + .Attr("gated_dw: bool = false") + .Attr("gate_grad: bool = false") + .Attr("bench: int = 0") + .Attr("ngate: int >= 0") + .SetShapeFn(UpdatShape) + .Doc(R"doc( +Multiply the matrix "a" by the blocksparse matrix "b". +)doc"); + +REGISTER_KERNEL_BUILDER(Name("TritonBlocksparseMatmulDW").Device(DEVICE_GPU).TypeConstraint("T"),BlocksparseMatmulOp); +REGISTER_KERNEL_BUILDER(Name("TritonBlocksparseMatmulDW").Device(DEVICE_GPU).TypeConstraint("T"),BlocksparseMatmulOp); diff --git a/include/triton/dnn/blocksparse/dot.h b/include/triton/dnn/blocksparse/dot.h index 488c26c31..f42d5b9d8 100644 --- a/include/triton/dnn/blocksparse/dot.h +++ b/include/triton/dnn/blocksparse/dot.h @@ -29,6 +29,9 @@ private: void init_impl(driver::stream *stream, driver::cu_module *module, triton::runtime::launch_information info); // deinit void deinit_impl(); + // source + std::string triton_c_src_ydx() const; + std::string triton_c_src_dw() const; public: // constructor dot(int32_t N, int32_t K, int32_t S, int32_t C, const std::string &ty, int32_t BS, int32_t nlocks, int32_t nblocks, op_t op = FPROP); diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index f43f94e8f..0fbd21938 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -75,7 +75,6 @@ public: optimize_dot.run(module); optimize_trans.run(module); dce.run(module); -// ir::print(module, std::cout); } void target_dependent(ir::module &module) { @@ -89,6 +88,7 @@ public: } vectorize.run(module); dce.run(module); +// ir::print(module, std::cout); } codegen::selection selection; diff --git a/lib/codegen/analysis/tune.cpp b/lib/codegen/analysis/tune.cpp index 7fba702cc..bd2b0cbce 100644 --- a/lib/codegen/analysis/tune.cpp +++ b/lib/codegen/analysis/tune.cpp @@ -250,15 +250,15 @@ void tune::run(ir::module &mod) { node_t node = *nodes_.begin(); if(fragments_[node] == STRIDED_SCAN) { ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 1, 1); - ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 4, 32); + ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 1, 8); connected_components(node, {nts, mts}, {"nts", "mts"}, nodes_, dependencies_, group_id++); nts->set_value(1); } else { - ir::metaparameter *fpw = 
ir::metaparameter::create(ctx, ty, 2, 2); + ir::metaparameter *fpw = ir::metaparameter::create(ctx, ty, 1, 1); if(node.second == 2) fpw->set_value(1); - ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 1, 4); + ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 1, 1); connected_components(node, {fpw, wpt}, {"fpw", "wpt"}, nodes_, dependencies_, group_id++); } } @@ -277,7 +277,7 @@ void tune::run(ir::module &mod) { size_t addr_space = ptr_ty->get_pointer_address_space(); if(addr_space < 4){ ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 4, 8)); + std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 1, 1)); *params_.at(i).at("nts.d0") = *tmp; } } @@ -287,8 +287,8 @@ void tune::run(ir::module &mod) { // *params_.at(i->get_operand(0)).at("mts.d2") = *mts_2; // *params_.at(i->get_operand(1)).at("mts.d2") = *mts_2; if(fragments_.at({i, 0}) == STRIDED_SCAN){ - std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 4, 8)); - std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 4, 8)); + std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 1, 1)); + std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 1, 1)); *params_.at(i).at("nts.d0") = *tmp1; *params_.at(i).at("nts.d1") = *tmp2; // for(size_t k = 2; k < shapes.size(); k++) @@ -423,7 +423,7 @@ bool tune::check_constraints(std::map> &er for(size_t k = 0; k < shapes.size(); k++){ prod *= params_[i]["fpw.d" + std::to_string(k)]->get_value(); } - if(prod != 4) + if(prod > 4) errors[i].push_back("HMMA must have only 4 fragments per warp"); } int num_threads = get_req_num_threads(i); diff --git a/lib/codegen/selection/selection.cpp b/lib/codegen/selection/selection.cpp index 491ff870f..43e08ee1e 100644 --- a/lib/codegen/selection/selection.cpp +++ b/lib/codegen/selection/selection.cpp @@ -554,7 +554,6 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id num_packs_0_ = num_rep_0 / pack_size_0_; num_packs_1_ = num_rep_1 / pack_size_1_; - /* intra warp offset */ // offset of quad in pair Value *in_pair_off_a = builder.CreateMul(builder.CreateUDiv(builder.CreateAnd(u_thread_id, _16), builder.getInt32(4)), @@ -566,7 +565,7 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id Value *pair_a_id = builder.CreateUDiv(builder.CreateURem(u_thread_id, _16), _4); Value *pair_b_id = builder.CreateUDiv(builder.CreateURem(u_thread_id, _16), _4); pair_a_id = builder.CreateURem(pair_a_id, builder.getInt32(fpw_0)); - pair_b_id = builder.CreateUDiv(pair_b_id, builder.getInt32(fpw_0)); + pair_b_id = builder.CreateURem(builder.CreateUDiv(pair_b_id, builder.getInt32(fpw_0)), builder.getInt32(fpw_1)); // Quad pair offset Value *pair_a_off = builder.CreateMul(pair_a_id, builder.getInt32(4 * pack_size_0_)); Value *pair_b_off = builder.CreateMul(pair_b_id, builder.getInt32(4 * pack_size_1_)); diff --git a/lib/dnn/blocksparse/dot.cpp b/lib/dnn/blocksparse/dot.cpp index 054904f27..97823e309 100644 --- a/lib/dnn/blocksparse/dot.cpp +++ b/lib/dnn/blocksparse/dot.cpp @@ -52,40 +52,56 @@ void dot::enqueue_impl(driver::stream *stream, driver::kernel *kernel, driver::buffer *b = args[1]; driver::buffer *c = args[2]; driver::buffer *lut = args[3]; - int32_t lda = N_; - int32_t ldc = N_; kernel->setArg(0, a); kernel->setArg(1, b); kernel->setArg(2, c); - kernel->setArg(3, lda); - kernel->setArg(4, ldc); - kernel->setArg(5, N_); - kernel->setArg(6, lut); - kernel->setArg(7, locks_.get()); - kernel->setArg(8, nlocks_); - int32_t TM = 
info.globals["TM"]; - size_t grid_0 = (N_ + TM - 1) / TM; - size_t grid_1 = S_; - if(nlocks_) - ((driver::cu_buffer*)locks_.get())->set_zero(stream, grid_0 * nlocks_ * 2 * 4); - stream->enqueue(kernel, {grid_0, grid_1, 1}, {info.num_threads, 1, 1}); + if(op_ == FPROP || op_ == BPROP){ + kernel->setArg(3, N_); + kernel->setArg(4, BS_); + kernel->setArg(5, N_); + } + else{ + kernel->setArg(3, N_); + kernel->setArg(4, N_); + kernel->setArg(5, BS_); + } + kernel->setArg(6, N_); + kernel->setArg(7, lut); + kernel->setArg(8, locks_.get()); + kernel->setArg(9, nlocks_); + if(op_ == FPROP || op_ == BPROP){ + int32_t TM = info.globals["TM"]; + size_t grid_0 = (N_ + TM - 1) / TM; + size_t grid_1 = S_; + if(nlocks_) + ((driver::cu_buffer*)locks_.get())->set_zero(stream, grid_0 * nlocks_ * 2 * 4); + stream->enqueue(kernel, {grid_0, grid_1, 1}, {info.num_threads, 1, 1}); + } + else{ + size_t grid_0 = nblocks_; + stream->enqueue(kernel, {grid_0, 1, 1}, {info.num_threads, 1, 1}); + } } driver::buffer* dot::get_locks() const { return locks_.get(); } -void dot::triton_c_src(std::ostream &os) const { - std::string usea = (op_ == WGRAD) ? "trans(a)" : "a"; - std::string useb = (op_ == FPROP) ? "trans(b)" : "b"; +std::string dot::triton_c_src_ydx() const { + bool AT = (op_ == WGRAD); + bool BT = (op_ == FPROP); + std::string usea = AT ? "trans(a)" : "a"; + std::string useb = BT ? "trans(b)" : "b"; std::string sizea = "TM, TK"; - std::string sizeb = (op_ == FPROP) ? "TN, TK" : "TK, TN"; + std::string sizeb = BT ? "TN, TK" : "TK, TN"; std::string bca0 = ":, newaxis"; std::string bca1 = "newaxis, :"; - std::string bcb0 = (op_ == FPROP) ? ":, newaxis" : "newaxis, :"; - std::string bcb1 = (op_ == FPROP) ? "newaxis, :" : ":, newaxis"; - std::string ldb0 = (op_ == FPROP) ? "" : "*TK"; - std::string ldb1 = (op_ == FPROP) ? "*TK" : "" ; + std::string bcb0 = BT ? ":, newaxis" : "newaxis, :"; + std::string bcb1 = BT ? "newaxis, :" : ":, newaxis"; + std::string lda0 = AT ? "*lda" : ""; + std::string lda1 = AT ? "" : "*lda"; + std::string ldb0 = BT ? "" : "*ldb"; + std::string ldb1 = BT ? "*ldb" : "" ; std::string result = R"( const tunable int TM = {16, 32, 64, 128}; @@ -95,26 +111,25 @@ void dot::triton_c_src(std::ostream &os) const { void bsdot(restrict read_only align(16) )" + ab_ty_ + R"( *A, restrict read_only align(16) )" + ab_ty_ + R"( *B, )" + c_ty_ + R"(* C, - int lda, int ldc, int N, - int* lut, int* locks, int nlocks) { + int lda, int ldb, int ldc, + int N, int* lut, + int* locks, int nlocks) { int ridx = get_range_id(0); - int ridy = get_range_id(1); float acc[TM, TN] = 0; - int rxa[TM] = ridx * TM + (0 ... TM); - int ryb[TN] = 0 ... TN; int rka[TK] = 0 ... TK; int rkb[TK] = 0 ... TK; - bool checka[TM, TK] = (rxa < N)[:, newaxis]; - int offa[)" + sizea + "] = rxa[" + bca0 + "] + rka[" + bca1 + R"(]*lda; - int offb[)" + sizeb + "] = ryb[" + bcb0 + "]" + ldb0 + " + rkb[" + bcb1 + "]" + ldb1 + R"(; - int *header = lut + ridy * 4; + int *header = lut + get_range_id(1) * 4; int offset = *(header + 0); int K = *(header + 1); int column = *(header + 2); int lockid = *(header + 3); + int rxa[TM] = ridx * TM + (0 ... TM); + int ryb[TN] = 0 ... 
TN; int *plut = lut + offset * 2; - for(int k = K; k > 0; k = k - 1) - { + int offa[)" + sizea + "] = rxa[" + bca0 + "]" + lda0 + " + rka[" + bca1 + "]" + lda1 + R"(; + int offb[)" + sizeb + "] = ryb[" + bcb0 + "]" + ldb0 + " + rkb[" + bcb1 + "]" + ldb1 + R"(; + bool checka[TM, TK] = (rxa < N)[:, newaxis]; + for(int k = K; k > 0; k = k - 1) { int ak = *(plut + 0); int bk = *(plut + 1); )" + ab_ty_ + "* pa[" + sizea + R"(] = A + offa + ak * TK * lda; @@ -137,17 +152,83 @@ void dot::triton_c_src(std::ostream &os) const { int *pcount = plock + get_num_program(0)*nlocks; while(__atomic_cas(plock, 0, 1)); int count = *pcount; - if(count == 0){ + if(count == 0) @checkc *pc = c; - } - else{ + else @checkc *pc = c + *pc; - } __atomic_exch(pcount, 1); __atomic_exch(plock, 0); } })"; - os << result; + + return result; +} + +std::string dot::triton_c_src_dw() const { + bool AT = (op_ == WGRAD); + bool BT = (op_ == FPROP); + std::string usea = AT ? "trans(a)" : "a"; + std::string useb = BT ? "trans(b)" : "b"; + std::string sizea = AT ? "TK, TM" : "TM, TK"; + std::string sizeb = BT ? "TN, TK" : "TK, TN"; + std::string bca0 = AT ? "newaxis, :" : ":, newaxis"; + std::string bca1 = AT ? ":, newaxis" : "newaxis, :"; + std::string bcb0 = BT ? ":, newaxis" : "newaxis, :"; + std::string bcb1 = BT ? "newaxis, :" : ":, newaxis"; + std::string lda0 = AT ? "*lda" : ""; + std::string lda1 = AT ? "" : "*lda"; + std::string ldb0 = BT ? "" : "*ldb"; + std::string ldb1 = BT ? "*ldb" : "" ; + std::string result = + R"( + const tunable int TM = {)" + std::to_string(BS_) + R"(}; + const tunable int TN = {)" + std::to_string(BS_) + R"(}; + const tunable int TK = {32}; + + void bsdot(restrict read_only align(16) )" + ab_ty_ + R"( *A, + restrict read_only align(16) )" + ab_ty_ + R"( *B, + )" + c_ty_ + R"(* C, + int lda, int ldb, int ldc, + int N, int* lut, + int* locks, int nlocks) { + int ridx = get_range_id(0); + float acc[TM, TN] = 0; + int rka[TK] = 0 ... TK; + int rkb[TK] = 0 ... TK; + int *header = lut + ridx * 2; + int offx = *(header + 0); + int offy = *(header + 1); + int rxa[TM] = offx*TM + (0 ... TM); + int ryb[TN] = offy*TN + (0 ... TN); + bool checka[TK, TM] = (rka < N)[:, newaxis]; + bool checkb[TK, TN] = (rkb < N)[:, newaxis]; + int offa[)" + sizea + "] = rxa[" + bca0 + "]" + lda0 + " + rka[" + bca1 + "]" + lda1 + R"(; + int offb[)" + sizeb + "] = ryb[" + bcb0 + "]" + ldb0 + " + rkb[" + bcb1 + "]" + ldb1 + R"(; + )" + ab_ty_ + " * pa[" + sizea + R"(] = A + offa; + )" + ab_ty_ + " * pb[" + sizeb + R"(] = B + offb; + )" + ab_ty_ + " a[" + sizea + R"(] = checka ? *pa : 0; + )" + ab_ty_ + " b[" + sizeb + R"(] = checkb ? *pb : 0; + for(int k = N; k > 0; k = k - TK) { + acc = dot()" + usea + ", " + useb + R"(, acc); + pa = pa + TK)" + lda1 + R"(; + pb = pb + TK)" + ldb1 + R"(; + a = checka ? *pa : 0; + b = checkb ? *pb : 0; + } + int rxc[TM] = (0 ... TM); + int ryc[TN] = (0 ... 
TN); + )" + c_ty_ + R"( c[TM, TN] = acc; + )" + c_ty_ + R"(* pc[TM, TN] = C + rxc[:, newaxis]*TM + ryc[newaxis, :] + ridx*TM*TN; + *pc = c; + })"; + + return result; +} +void dot::triton_c_src(std::ostream &os) const { + if(op_ == FPROP || op_ == BPROP) + os << triton_c_src_ydx(); + else + os << triton_c_src_dw(); } diff --git a/lib/dnn/dot.cpp b/lib/dnn/dot.cpp index 22198b7af..30cec06a4 100644 --- a/lib/dnn/dot.cpp +++ b/lib/dnn/dot.cpp @@ -106,8 +106,8 @@ void dot::triton_c_src(std::ostream &os) const { std::string align_ldb_str = "multiple_of(" + std::to_string(align_ldb_) + ")"; std::string res = R"( -const tunable int TM = {32}; -const tunable int TN = {32}; +const tunable int TM = {8}; +const tunable int TN = {8}; const tunable int TK = {32}; const tunable int GZ = {1}; diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index 141bb8054..4c7a030f8 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -37,13 +37,13 @@ void parallel_loop_nest(std::vector const & ranges, size_t D = ranges.size(); std::vector values(D, 0); // thread pools - ThreadPool pool(nthreads); +// ThreadPool pool(nthreads); // Start with innermost loop size_t i = D - 1; while(true){ // Execute function - pool.enqueue(f,values); -// f(values); +// pool.enqueue(f,values); + f(values); while(values[i]++ == ranges[i] - 1){ if(i == 0) return; From 1400d960a65dafb8e7ad195d170e516857231ef3 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 9 Aug 2019 16:57:18 -0700 Subject: [PATCH 291/494] [auto-tuning] much smaller parameters space --- examples/cpp/dot.cpp | 2 +- include/triton/codegen/analysis/tune.h | 12 +- include/triton/codegen/transform/dce.h | 4 +- .../triton/codegen/transform/reassociate.h | 5 +- include/triton/codegen/transform/trans.h | 11 +- include/triton/runtime/jit.h | 17 +- include/triton/runtime/launch_info.h | 1 - lib/codegen/analysis/tune.cpp | 298 +++++++----------- lib/codegen/selection/selection.cpp | 12 +- lib/codegen/transform/dce.cpp | 2 +- lib/codegen/transform/reassociate.cpp | 187 ++++++----- lib/codegen/transform/trans.cpp | 214 +++++++++++-- lib/dnn/conv.cpp | 2 +- lib/dnn/dot.cpp | 18 +- lib/driver/backend.cpp | 28 +- lib/ir/constant.cpp | 2 +- lib/ir/instructions.cpp | 4 +- lib/ir/type.cpp | 8 + lib/ir/value.cpp | 6 +- lib/runtime/jit.cpp | 4 - 20 files changed, 470 insertions(+), 367 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index e9ad43f71..ef73a7581 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -111,7 +111,7 @@ int main() { // shapes to benchmark std::vector configs = { // {false, false, 8192, 512, 512}, - {false, true, 64, 64, 128} + {false, true, 128, 128, 128} // {false, true, 128, 128, 128}, // {false, false, 128, 128, 128}, // {true, false, 128, 128, 128}, diff --git a/include/triton/codegen/analysis/tune.h b/include/triton/codegen/analysis/tune.h index 54dae5524..4ab07c974 100644 --- a/include/triton/codegen/analysis/tune.h +++ b/include/triton/codegen/analysis/tune.h @@ -13,6 +13,7 @@ namespace ir{ class instruction; class function; class metaparameter; + class constant_int; } namespace codegen{ @@ -34,7 +35,9 @@ private: void init_c_graph(ir::instruction *v); fragment_t get_fragmentation_type(node_t x, graph_t &graph); void connected_components(node_t x, const std::vector mps, const std::vector prefixes, std::set &nodes, graph_t &graph, unsigned group_id); - void create_grids(std::vector &grids, std::map &references, ir::function *fn); + void create_grids(std::vector &grids, + std::map &references, + ir::function 
*fn); unsigned get_req_num_threads(ir::instruction *i); @@ -49,8 +52,6 @@ public: bool check_constraints(std::map> &errors); void run(ir::module &mod); void init(ir::module &mod); - unsigned get_num_global_range(); - unsigned get_global_range_size(unsigned axis); unsigned get_num_threads(); private: @@ -61,10 +62,9 @@ private: std::map static_params_; std::map> params_; std::map global_range_sizes_; - unsigned num_global_ranges_; - unsigned num_threads_; - std::vector grids_; + std::vector grids_; std::map> groups_; + ir::metaparameter* num_warps_; }; diff --git a/include/triton/codegen/transform/dce.h b/include/triton/codegen/transform/dce.h index 169363752..dea50996d 100644 --- a/include/triton/codegen/transform/dce.h +++ b/include/triton/codegen/transform/dce.h @@ -14,9 +14,9 @@ namespace ir { namespace codegen{ namespace transform{ -class optimize_dce { +class dce { public: - optimize_dce() {} + dce() {} void run(ir::module &mod); }; diff --git a/include/triton/codegen/transform/reassociate.h b/include/triton/codegen/transform/reassociate.h index 66d95eb44..5f639d23c 100644 --- a/include/triton/codegen/transform/reassociate.h +++ b/include/triton/codegen/transform/reassociate.h @@ -28,7 +28,7 @@ namespace transform{ class reassociate { struct cst_info { - ir::getelementptr_inst* dyn_ptr; + ir::value* dyn_ptr; ir::getelementptr_inst* sta_ptr; }; @@ -38,12 +38,11 @@ private: ir::value *reassociate_ptr(ir::getelementptr_inst* pz, ir::builder &builder, std::map &offsets); public: - reassociate(analysis::tune *params, analysis::alignment_info *align); + reassociate(analysis::tune *params); void run(ir::module& module); private: analysis::tune* params_; - analysis::alignment_info* align_; }; } diff --git a/include/triton/codegen/transform/trans.h b/include/triton/codegen/transform/trans.h index 4bdb62157..73eedd6e4 100644 --- a/include/triton/codegen/transform/trans.h +++ b/include/triton/codegen/transform/trans.h @@ -19,12 +19,17 @@ namespace ir { namespace codegen{ namespace transform{ -class optimize_trans { +class peephole { +private: + bool rewrite_trans_phi(ir::instruction* value, ir::builder &builder); + bool rewrite_dot(ir::instruction *value, ir::builder& builder); + bool rewrite_unit_red(ir::instruction *value, ir::builder& builder); + bool rewrite_gep_ptr_min_off_plus_off(ir::instruction *value, ir::builder& builder); + private: - ir::value *replace_phi(ir::value* value, ir::builder &builder, const std::vector &perm); public: - optimize_trans() {} + peephole() {} void run(ir::module &mod); }; diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index 0fbd21938..20cf39691 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -66,29 +66,30 @@ public: selection(&shmem_allocation, &tune, &shmem_info, &alignment_info, target), optimize_dot(&tune), dce(), - optimize_trans(), + peephole(), alignment_info(), - reassociate(&tune, &alignment_info), + reassociate(&tune), target_(target) { } void target_independent(ir::module &module) { - optimize_dot.run(module); - optimize_trans.run(module); + ir::print(module, std::cout); + peephole.run(module); dce.run(module); } void target_dependent(ir::module &module) { - alignment_info.run(module); reassociate.run(module); + peephole.run(module); if(target_->is_gpu()){ shmem_info.run(module); shmem_liveness.run(module); shmem_allocation.run(); shmem_barriers.run(module); } + alignment_info.run(module); vectorize.run(module); dce.run(module); -// ir::print(module, std::cout); + ir::print(module, std::cout); 
} codegen::selection selection; @@ -100,8 +101,8 @@ public: codegen::transform::shmem_barriers shmem_barriers; codegen::transform::vectorize vectorize; codegen::transform::optimize_dot optimize_dot; - codegen::transform::optimize_dce dce; - codegen::transform::optimize_trans optimize_trans; + codegen::transform::dce dce; + codegen::transform::peephole peephole; codegen::transform::reassociate reassociate; codegen::target* target_; }; diff --git a/include/triton/runtime/launch_info.h b/include/triton/runtime/launch_info.h index a6a0ddb5b..06e79d4e4 100644 --- a/include/triton/runtime/launch_info.h +++ b/include/triton/runtime/launch_info.h @@ -8,7 +8,6 @@ namespace triton{ namespace runtime{ struct launch_information{ - std::vector global_range_size; unsigned num_threads; std::map globals; }; diff --git a/lib/codegen/analysis/tune.cpp b/lib/codegen/analysis/tune.cpp index bd2b0cbce..21d08d5ea 100644 --- a/lib/codegen/analysis/tune.cpp +++ b/lib/codegen/analysis/tune.cpp @@ -14,7 +14,8 @@ namespace triton{ namespace codegen{ namespace analysis{ -tune::tune(): num_global_ranges_(0){ } +tune::tune() { +} bool is_hmma(ir::value *v){ bool result = false; @@ -123,8 +124,8 @@ void tune::init_c_graph(ir::instruction *v) { for(unsigned i = 2; i < shapes.size(); i++){ if(shapes[i] == one) static_params_.insert({{v, i}, 1}); -// add_constraint({v, i}, {A, i}); -// add_constraint({v, i}, {B, i}); + add_constraint({v, i}, {A, i}); + add_constraint({v, i}, {B, i}); } } // Element-wise @@ -172,11 +173,6 @@ void tune::connected_components(node_t x, const std::vector if(auto mp = dynamic_cast(shape)) params_[x.first].insert({"shape" + suffix, mp}); } -// if(auto range = dynamic_cast(x.first)){ -// unsigned ax = range->get_axis(); -// global_range_sizes_[ax] = params_[x.first].at("shape.d0"); -// num_global_ranges_ = std::max(num_global_ranges_, ax + 1); -// } if(static_params_.find(x) != static_params_.end()){ for(ir::metaparameter *mp: mps) mp->set_value(static_params_.at(x)); @@ -189,21 +185,13 @@ void tune::connected_components(node_t x, const std::vector std::vector tune::get_params(ir::module &mod) { std::vector result; std::set seen; - - for(ir::function *fn: mod.get_function_list()) - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *i : block->get_inst_list()) - for(auto &x: params_[i]) - if(seen.insert(x.second).second && !x.second->has_value()){ - result.push_back(x.second); - } - - for(auto x: mod.globals()){ + for(auto x: mod.globals()) { if(auto mp = dynamic_cast(x.second)) if(seen.insert(mp).second && !mp->has_value()) result.push_back(mp); } - + num_warps_ = ir::metaparameter::create(mod.get_context(), mod.get_builder().get_int32_ty(), 4, 4); + result.push_back(num_warps_); return result; } @@ -212,7 +200,6 @@ std::map tune::get_params(ir::instruction* i) } unsigned tune::get_param_group(ir::value *value, unsigned ax) { -// std::cout << "group? 
" << value->get_name() << " " << ax << std::endl; unsigned result = groups_.at(value).at(ax); return result; } @@ -229,139 +216,164 @@ void tune::run(ir::module &mod) { ir::context &ctx = mod.get_context(); // Create metaparameters for(ir::function *fn: mod.get_function_list()){ + // Build constraints graph for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i : block->get_inst_list()) - if(i->has_tile_result_or_op()){ + if(i->has_tile_result_or_op()) init_c_graph(i); - } + // Build phi constraints for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i : block->get_inst_list()) if(i->has_tile_result_or_op()) init_c_phi(i); + // Layout parameters unsigned group_id = 0; - for(auto x: nodes_){ + for(auto x: nodes_) fragments_[x] = get_fragmentation_type(x, dependencies_); - } while(!nodes_.empty()) { ir::type *ty = mod.get_builder().get_int32_ty(); node_t node = *nodes_.begin(); if(fragments_[node] == STRIDED_SCAN) { ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 1, 1); - ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 1, 8); + ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 1, 1); connected_components(node, {nts, mts}, {"nts", "mts"}, nodes_, dependencies_, group_id++); - nts->set_value(1); } else { ir::metaparameter *fpw = ir::metaparameter::create(ctx, ty, 1, 1); - if(node.second == 2) - fpw->set_value(1); ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 1, 1); connected_components(node, {fpw, wpt}, {"fpw", "wpt"}, nodes_, dependencies_, group_id++); } } } - - for(ir::function *fn: mod.get_function_list()) - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *i : block->get_inst_list()){ - if(!i->get_type()->is_tile_ty()) - continue; - auto shapes = i->get_type()->get_tile_shapes(); - - if(auto *x = dynamic_cast(i)) - if(fragments_.at({i, 0}) == STRIDED_SCAN){ - ir::type *ptr_ty = x->get_pointer_operand()->get_type()->get_scalar_ty(); - size_t addr_space = ptr_ty->get_pointer_address_space(); - if(addr_space < 4){ - ir::type *ty = mod.get_builder().get_int32_ty(); - std::unique_ptr tmp(ir::metaparameter::create(ctx, ty, 1, 1)); - *params_.at(i).at("nts.d0") = *tmp; - } - } - if(dynamic_cast(i) && i->get_type()->is_tile_ty()){ - ir::type *ty = mod.get_builder().get_int32_ty(); -// std::unique_ptr mts_2(ir::metaparameter::create(ctx, ty, 1, 4)); -// *params_.at(i->get_operand(0)).at("mts.d2") = *mts_2; -// *params_.at(i->get_operand(1)).at("mts.d2") = *mts_2; - if(fragments_.at({i, 0}) == STRIDED_SCAN){ - std::unique_ptr tmp1(ir::metaparameter::create(ctx, ty, 1, 1)); - std::unique_ptr tmp2(ir::metaparameter::create(ctx, ty, 1, 1)); - *params_.at(i).at("nts.d0") = *tmp1; - *params_.at(i).at("nts.d1") = *tmp2; -// for(size_t k = 2; k < shapes.size(); k++) -// if(auto *x = dynamic_cast(shapes[k])) -// *params_.at(i).at("mts.d" + std::to_string(k)) = *x; -// else -// params_.at(i).at("mts.d" + std::to_string(k))->set_value(shapes[k]->get_value()); - } - else{ -// for(size_t k = 2; k < shapes.size(); k++) -// if(auto *x = dynamic_cast(shapes[k])) -// *params_.at(i).at("wpt.d" + std::to_string(k)) = *x; -// else -// params_.at(i).at("wpt.d" + std::to_string(k))->set_value(shapes[k]->get_value()); - } - } - } - } void tune::init(ir::module &mod) { for(ir::function *fn: mod.get_function_list()){ - std::map references; + std::map references; create_grids(grids_, references, fn); } - num_threads_ = get_req_num_threads(grids_.front()); -} + int num_threads = get_num_threads(); + int num_warps = num_warps_->get_value(); + 
auto clamp = [&](int x, int lo, int hi) { return std::min(std::max(x, lo), hi); }; -unsigned tune::get_req_num_threads(ir::instruction *i){ - if(fragments_.at({i, 0}) == STRIDED_SCAN) { - unsigned result = 1; - for(unsigned k = 0; k < i->get_type()->get_tile_shapes().size(); k++){ - std::string suffix = ".d" + std::to_string(k); - result *= params_.at(i).at("mts" + suffix)->get_value(); + for(ir::value *i: grids_){ + if(!i->get_type()->is_tile_ty()) + continue; + auto shapes = i->get_type()->get_tile_shapes(); + int shape_0 = shapes[0]->get_value(); + int shape_1 = shapes[1]->get_value(); + int size = i->get_type()->get_tile_num_elements(); + /* HMMA parameters*/ + if(fragments_.at({i, 0}) == HMMA_FRAGMENT_C){ + + /* fragments per warp */ + // try to make things as square as possible to maximize data re-use + std::vector fpw = {1, 1, 1}; + std::vector fpw_nm1; + int num_fragments = std::min((shape_0/8)*(shape_1/8), 4); + do { + fpw_nm1 = fpw; + if(fpw[0]*fpw[1] < num_fragments) + fpw[0] = clamp(fpw[0]*2, 1, shape_0 / 8); + if(fpw[0]*fpw[1] < num_fragments) + fpw[1] = clamp(fpw[1]*2, 1, shape_1 / 8); + }while(fpw_nm1 != fpw); + // store parameters + for(int d = 0; d < shapes.size(); d++) + params_.at(i).at("fpw.d" + std::to_string(d))->set_value(fpw[d]); + + /* warps per tile */ + // try to make things as square as possible to maximize data re-use + std::vector wpt = {1, 1, 1}; + std::vector wpt_nm1; + do{ + wpt_nm1 = wpt; + if(wpt[0] * wpt[1] * wpt[2] < num_warps) + wpt[0] = clamp(wpt[0]*2, 1, shape_0 / (fpw[0]*8)); + if(wpt[0] * wpt[1] * wpt[2] < num_warps) + wpt[1] = clamp(wpt[1]*2, 1, shape_1 / (fpw[1]*8)); + }while(wpt_nm1 != wpt); + // store parameters + for(int d = 0; d < shapes.size(); d++) + params_.at(i).at("wpt.d" + std::to_string(d))->set_value(wpt[d]); + + /* sanity check */ + unsigned effective_num_warps = 1; + for(size_t d = 0; d < shapes.size(); d++){ + std::string str_d = std::to_string(d); + effective_num_warps *= params_.at(i).at("wpt.d" + str_d)->get_value(); + } + assert(num_warps == effective_num_warps); } - return result; - } - else { - unsigned result = 32; - for(unsigned k = 0; k < i->get_type()->get_tile_shapes().size(); k++){ - std::string suffix = ".d" + std::to_string(k); - result *= params_.at(i).at("wpt" + suffix)->get_value(); + + /* Scan-line */ + else{ + int shape = shapes[0]->get_value(); + int current = num_threads; + params_.at(i).at("nts.d0")->set_value(clamp(size / num_threads, 1, 8)); + params_.at(i).at("mts.d0")->set_value(clamp(current, 1, shape / params_.at(i).at("nts.d0")->get_value())); + current = current / params_.at(i).at("mts.d0")->get_value(); + for(size_t d = 1; d < shapes.size(); d++){ + std::string str_d = std::to_string(d); + shape = shapes[d]->get_value(); + params_.at(i).at("nts.d" + str_d)->set_value(1); + params_.at(i).at("mts.d" + str_d)->set_value(clamp(current, 1, shape)); + current = current / params_.at(i).at("mts.d" + str_d)->get_value(); + } + /* sanity check */ + unsigned effective_num_threads = 1; + for(size_t d = 0; d < shapes.size(); d++){ + std::string str_d = std::to_string(d); + effective_num_threads *= params_.at(i).at("mts.d" + str_d)->get_value(); + } + assert(num_threads == effective_num_threads); } - return result; } } -void tune::create_grids(std::vector &grids, - std::map &references, - ir::function *fn) { + +void tune::create_grids(std::vector &grids, + std::map &references, + ir::function *fn) { // get number of dimensions greater than 1 auto get_tile_gt1_dim = [&](ir::value *v){ unsigned result = 0; - auto one 
= ir::tile_type::make_one(fn->get_fn_type()->get_context());
-    for(ir::constant_int *shape: v->get_type()->get_tile_shapes()) {
-      result += (shape != one);
+    for(ir::constant_int* shape: v->get_type()->get_tile_shapes()) {
+      result += (shape->get_value() > 1)?shape->get_value():0;
     }
     return result;
   };
   // bind references
-  for(ir::basic_block *block: fn->blocks())
-  for(ir::instruction *i: block->get_inst_list()){
-    if(!i->get_type()->is_tile_ty())
-      continue;
-    for(auto &param: params_.at(i)){
-      if(param.second->get_value() == 1)
+  std::set<ir::value*> seen;
+  std::function<void(ir::value*)> bind_references = [&](ir::value *v)
+  {
+    // skip
+    if(!v->get_type()->is_tile_ty() || !seen.insert(v).second)
+      return;
+    // recurse
+    if(auto *user = dynamic_cast<ir::user*>(v))
+      for(ir::value *op: user->ops())
+        bind_references(op);
+    // bind
+    const auto& shapes = v->get_type()->get_tile_shapes();
+    for(size_t d = 0; d < shapes.size(); d++){
+      if(shapes[d]->get_value() == 1)
        continue;
-      ir::instruction *&r = references[param.second];
-      if(!r || get_tile_gt1_dim(i) > get_tile_gt1_dim(r))
-        r = i;
+      unsigned x = get_param_group(v, d);
+      ir::value *&r = references[x];
+      if(!r || get_tile_gt1_dim(v) > get_tile_gt1_dim(r))
+        r = v;
     }
-  }
+  };
+
+  for(ir::basic_block *block: fn->blocks())
+  for(ir::instruction *i: block->get_inst_list())
+    bind_references(i);
+
   // create grid
   for(auto &ref: references)
     if(std::find(grids.begin(), grids.end(), ref.second) == grids.end())
@@ -370,85 +382,11 @@ void tune::create_grids(std::vector &grids,

 bool tune::check_constraints(std::map<ir::value *, std::vector<std::string>> &errors) {
-  using std::to_string;
-
-  auto get_num_warps = [&](ir::instruction *i, unsigned axis) {
-    std::string strk = to_string(axis);
-    if(fragments_.at({i, axis}) == STRIDED_SCAN){
-      unsigned mts = params_[i]["mts.d" + strk]->get_value();
-      unsigned nts = params_[i]["nts.d" + strk]->get_value();
-      unsigned shape = i->get_type()->get_tile_shapes()[axis]->get_value();
-      return shape / (mts * nts);
-    }
-    else{
-      return (unsigned)params_[i]["wpt.d" + strk]->get_value();
-    }
-  };
-
-  // number of warps
-  ir::instruction *first = grids_.front();
-  int num_warps = 1;
-  for(size_t k = 0; k < first->get_type()->get_tile_shapes().size(); k++)
-    num_warps *= get_num_warps(first, k);
-
-  // check constraints
-  for(ir::instruction *i: grids_){
-//    std::cout << i->get_name() << std::endl;
-    ir::type *ty = i->get_type();
-    const auto &shapes = ty->get_tile_shapes();
-    // for each dimension, the product of layout components
-    // must divide the shape
-    for(size_t k = 0; k < shapes.size(); k++) {
-      std::string strk = to_string(k);
-      unsigned multiple;
-      if(fragments_.at({i, 0}) == STRIDED_SCAN) {
-        ir::metaparameter *mts = params_[i]["mts.d" + strk];
-        ir::metaparameter *nts = params_[i]["nts.d" + strk];
-        multiple = mts->get_value()*nts->get_value();
-      }
-      else {
-        ir::metaparameter *fpw = params_[i]["fpw.d" + strk];
-        ir::metaparameter *wpt = params_[i]["wpt.d" + strk];
-        multiple = fpw->get_value()*wpt->get_value();
-        if(k < 2)
-          multiple *= 8;
-      }
-      if(shapes[k]->get_value() % multiple != 0)
-        errors[i].push_back("for dim " + strk + ": shape (" + to_string(shapes[k]->get_value()) + ")"
-                            " is not a multiple of layout (" + to_string(multiple) + ")");
-    }
-    // the product of mma fragments per warp must be 4
-    if(fragments_.at({i, 0}) == HMMA_FRAGMENT_C){
-      unsigned prod = 1;
-      for(size_t k = 0; k < shapes.size(); k++){
-        prod *= params_[i]["fpw.d" + std::to_string(k)]->get_value();
-      }
-      if(prod > 4)
-        errors[i].push_back("HMMA must have only 4 fragments per warp");
-    }
-    int num_threads 
= get_req_num_threads(i); - if(num_threads % 32 != 0) - errors[i].push_back("number of threads per block (" + to_string(num_threads) + ") must be multiple of warp size"); - if(num_threads != num_threads_) - errors[i].push_back("Number of threads must be the same for all tiles (" + to_string(num_threads_) + ")"); - } -// for(auto x: errors) -// for(auto e: x.second) -// std::cout << x.first->get_name() << ": " << e << std::endl; -// exit(EXIT_SUCCESS); return errors.empty(); } -unsigned tune::get_num_global_range() { - return num_global_ranges_; -} - -unsigned tune::get_global_range_size(unsigned axis) { - return global_range_sizes_.at(axis)->get_value(); -} - unsigned tune::get_num_threads() { - return num_threads_; + return num_warps_->get_value()*32; } diff --git a/lib/codegen/selection/selection.cpp b/lib/codegen/selection/selection.cpp index 43e08ee1e..d76ea5071 100644 --- a/lib/codegen/selection/selection.cpp +++ b/lib/codegen/selection/selection.cpp @@ -243,7 +243,7 @@ Type *selection::llvm_type(ir::type *ty, LLVMContext &ctx) { /* convert ir::constant to Constant */ Constant *selection::llvm_constant(ir::constant *cst, LLVMContext &ctx) { - Type *dst_ty = llvm_type(cst->get_type(), ctx); + Type *dst_ty = llvm_type(cst->get_type()->get_scalar_ty(), ctx); if(auto* cc = dynamic_cast(cst)) return ConstantInt::get(dst_ty, cc->get_value()); if(auto* cc = dynamic_cast(cst)) @@ -478,8 +478,9 @@ inline void to_warps(const std::vector &bs, std::vector &nw, nw[i] = ceil(nthreads, nwarps*warp_size); nwarps *= nw[i]; } - for(size_t i = 0; i < bs.size(); ++i) + for(size_t i = 0; i < bs.size(); ++i){ ws[i] = bs[i] / nw[i]; + } } void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { @@ -565,7 +566,8 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id Value *pair_a_id = builder.CreateUDiv(builder.CreateURem(u_thread_id, _16), _4); Value *pair_b_id = builder.CreateUDiv(builder.CreateURem(u_thread_id, _16), _4); pair_a_id = builder.CreateURem(pair_a_id, builder.getInt32(fpw_0)); - pair_b_id = builder.CreateURem(builder.CreateUDiv(pair_b_id, builder.getInt32(fpw_0)), builder.getInt32(fpw_1)); + pair_b_id = builder.CreateUDiv(pair_b_id, builder.getInt32(fpw_0)); + pair_b_id = builder.CreateURem(pair_b_id, builder.getInt32(fpw_1)); // Quad pair offset Value *pair_a_off = builder.CreateMul(pair_a_id, builder.getInt32(4 * pack_size_0_)); Value *pair_b_off = builder.CreateMul(pair_b_id, builder.getInt32(4 * pack_size_1_)); @@ -1296,7 +1298,9 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & else { result->for_each([&](indices_t idx){ auto value = [&](ir::value *x) { - if(x->get_type()->is_tile_ty()) + if(auto *cst = dynamic_cast(x)) + return (Value*)llvm_constant(cst, ctx); + else if(x->get_type()->is_tile_ty()) return tmap_.at(x)->get_value(idx); else return llvm_value(x, builder); diff --git a/lib/codegen/transform/dce.cpp b/lib/codegen/transform/dce.cpp index d11caf55f..404eaa521 100644 --- a/lib/codegen/transform/dce.cpp +++ b/lib/codegen/transform/dce.cpp @@ -9,7 +9,7 @@ namespace codegen{ namespace transform{ -void optimize_dce::run(ir::module &mod) { +void dce::run(ir::module &mod) { std::list work_list; std::set marked; diff --git a/lib/codegen/transform/reassociate.cpp b/lib/codegen/transform/reassociate.cpp index a1594fb17..4473fe84a 100644 --- a/lib/codegen/transform/reassociate.cpp +++ b/lib/codegen/transform/reassociate.cpp @@ -155,8 +155,8 @@ ir::value 
*reassociate::reassociate_idx(ir::value *old_value, return new_value; } -reassociate::reassociate(analysis::tune* params, analysis::alignment_info* align) - : params_(params), align_(align) +reassociate::reassociate(analysis::tune* params) + : params_(params) { } @@ -190,93 +190,108 @@ void reassociate::run(ir::module &mod) { // reassociate std::map infos; - for(ir::function *fn: mod.get_function_list()){ - std::vector rpo = ir::cfg::reverse_post_order(fn); - // iterate through blocks - for(ir::basic_block *block: rpo){ - // iterate through instruction - for(ir::instruction *i: block->get_inst_list()){ - // getelementptr instruction - if(ir::getelementptr_inst *pz = dynamic_cast(i)){ - // unpack GEP instruction - ir::value* py = pz->get_pointer_operand(); - ir::value* offset = *pz->idx_begin(); - // reassociate index - ir::value *sta = nullptr; - ir::value *dyn = offset; - reassociate_idx(offset, builder, dyn, sta); - if(sta){ - builder.set_insert_point(pz); - ir::value *dyn_ptr = builder.create_gep(py, {dyn}); - ir::value *sta_ptr = builder.create_gep(dyn_ptr, {sta}); - params_->copy(dyn_ptr, pz); - params_->copy(sta_ptr, pz); - align_->copy(sta_ptr, pz); - pz->replace_all_uses_with(sta_ptr); - infos[sta_ptr].dyn_ptr = (ir::getelementptr_inst*)dyn_ptr; - infos[sta_ptr].sta_ptr = (ir::getelementptr_inst*)sta_ptr; - } - // reassociate pointer argument - if(ir::getelementptr_inst* gepy = dynamic_cast(py)) - if(infos.find(gepy) != infos.end()){ - builder.set_insert_point(pz); - ir::getelementptr_inst *sta = infos[gepy].sta_ptr; - ir::getelementptr_inst *dyn = infos[gepy].dyn_ptr; - ir::value *cst = *sta->idx_begin(); - ir::value *off = *pz->idx_begin(); - ir::value *new_dyn = builder.create_gep(dyn, {off}); - ir::value *new_pz = builder.create_gep(new_dyn, {cst}, pz->get_name()); - params_->copy(new_dyn, pz); - params_->copy(new_pz, pz); - align_->copy(new_pz, pz); - pz->replace_all_uses_with(new_pz); - } - // reassociate phi-node pointer - if(ir::phi_node* phi = dynamic_cast(py)){ - // only optimize the case where py = phi pa, pz for now - std::vector ops = phi->ops(); - if(ops.size() != 2) + std::set replaced; + size_t n_replaced; + do{ + n_replaced = replaced.size(); + for(ir::function *fn: mod.get_function_list()){ + std::vector rpo = ir::cfg::reverse_post_order(fn); + // iterate through blocks + for(ir::basic_block *block: rpo){ + // iterate through instruction + for(ir::instruction *i: block->get_inst_list()){ + // getelementptr instruction + if(ir::getelementptr_inst *pz = dynamic_cast(i)){ + if(replaced.find(pz) != replaced.end()) continue; - if(ops[0] != pz && ops[1] != pz) - continue; - // grab incoming - size_t idx_z = (ops[0] == pz) ? 0 : 1; - size_t idx_a = (ops[0] == pz) ? 
1 : 0; - // check if pa is known to have constant offset - ir::value *vpa = phi->get_incoming_value(idx_a); - auto it = infos.find(vpa); - if(it == infos.end()) - continue; - ir::getelementptr_inst *pa = (ir::getelementptr_inst*)vpa; - // unpack dynamically/statically offset pointer - ir::getelementptr_inst *dyn_ptr = it->second.dyn_ptr; - ir::getelementptr_inst *sta_ptr = it->second.sta_ptr; - // we take static offset out of the phi function - builder.set_insert_point(phi); - ir::phi_node *new_phi = builder.create_phi(phi->get_type(), 2); - // new pz for phi has the same offsets - builder.set_insert_point(pz); - std::vector idxs(pz->idx_begin(), pz->idx_end()); - ir::value *new_phi_pz = builder.create_gep(new_phi, idxs); - // fold the static offset into the new pz value - ir::value *new_pz = builder.create_gep(new_phi_pz, {*sta_ptr->idx_begin()}); - // populate incoming values - new_phi->add_incoming(dyn_ptr, phi->get_incoming_block(idx_a)); - new_phi->add_incoming(new_phi_pz, phi->get_incoming_block(idx_z)); - // replace phi uses - phi->replace_all_uses_with(new_phi); - // replace pz uses - pz->replace_all_uses_with(new_pz); - // copy params - params_->copy(new_phi_pz, pz); - params_->copy(new_phi, phi); - params_->copy(new_pz, pz); - align_->copy(new_pz, pz); + // unpack GEP instruction + ir::value* py = pz->get_pointer_operand(); + ir::value* offset = *pz->idx_begin(); + // reassociate index + ir::value *sta = nullptr; + ir::value *dyn = offset; + reassociate_idx(offset, builder, dyn, sta); + if(sta){ + builder.set_insert_point(pz); + ir::value *dyn_ptr = builder.create_gep(py, {dyn}); + ir::value *sta_ptr = builder.create_gep(dyn_ptr, {sta}); + params_->copy(dyn_ptr, pz); + params_->copy(sta_ptr, pz); + pz->replace_all_uses_with(sta_ptr); + infos[sta_ptr].dyn_ptr = dyn_ptr; + infos[sta_ptr].sta_ptr = (ir::getelementptr_inst*)sta_ptr; + replaced.insert(pz); + } + // reassociate pointer argument + if(infos.find(py) != infos.end()){ + builder.set_insert_point(pz); + ir::getelementptr_inst *sta = infos[py].sta_ptr; + ir::value *dyn = infos[py].dyn_ptr; + ir::value *cst = *sta->idx_begin(); + ir::value *off = *pz->idx_begin(); + ir::value *pz_dyn = builder.create_gep(dyn, {off}); + ir::value *pz_sta = builder.create_gep(pz_dyn, {cst}, pz->get_name()); + params_->copy(pz_dyn, pz); + params_->copy(pz_sta, pz); + pz->replace_all_uses_with(pz_sta); + infos[pz_sta].dyn_ptr = pz_dyn; + infos[pz_sta].sta_ptr = (ir::getelementptr_inst*)pz_sta; + replaced.insert(pz); + } + // reassociate phi-node pointer + if(ir::phi_node* phi = dynamic_cast(py)){ + // only optimize the case where py = phi pa, pz for now + std::vector ops = phi->ops(); + if(ops.size() != 2) + continue; + if(ops[0] != pz && ops[1] != pz) + continue; + // grab incoming + size_t idx_z = (ops[0] == pz) ? 0 : 1; + size_t idx_a = (ops[0] == pz) ? 
1 : 0;
+            // check if pa is known to have constant offset
+            ir::value *vpa = phi->get_incoming_value(idx_a);
+            auto it_a = infos.find(vpa);
+            if(it_a == infos.end())
+              continue;
+            // unpack dynamically/statically offset pointer
+            ir::value *pa_dyn = it_a->second.dyn_ptr;
+            ir::getelementptr_inst *pa_sta = it_a->second.sta_ptr;
+            ir::value *pz = phi->get_incoming_value(idx_z);
+            // extract offset
+            ir::value *off = *pa_sta->idx_begin();
+            builder.set_insert_point(phi);
+            ir::phi_node *phi_dyn = builder.create_phi(phi->get_type(), 2);
+            phi_dyn->add_incoming(pa_dyn, phi->get_incoming_block(idx_a));
+            builder.set_insert_point(phi->get_parent()->get_first_non_phi());
+            // re-add the offset
+            ir::value *phi_sta = builder.create_gep(phi_dyn, {off}, phi->get_name() + "_sta");
+            phi->replace_all_uses_with(phi_sta);
+            // remove offset from pz
+            if(auto *x = dynamic_cast<ir::instruction*>(pz)){
+              auto insts = x->get_parent()->get_inst_list();
+              auto it = std::find(insts.begin(), insts.end(), x);
+              it++;
+              builder.set_insert_point(*it);
+            }
+            ir::value *neg_off = builder.create_neg(off);
+            ir::value *pz_dyn = builder.create_gep(pz, {neg_off});
+            phi_dyn->add_incoming(pz_dyn, phi->get_incoming_block(idx_z));
+            // copy parameters
+            params_->copy(pz_dyn, pz);
+            params_->copy(((ir::instruction*)neg_off)->get_operand(0), off);
+            params_->copy(neg_off, off);
+            params_->copy(phi_dyn, phi);
+            params_->copy(phi_sta, phi);
+            infos[phi_sta].dyn_ptr = phi_dyn;
+            infos[phi_sta].sta_ptr = (ir::getelementptr_inst*)phi_sta;
+            replaced.insert(phi);
+          }
        }
+      }
    }
-  }
-  }
-  }
+  }while(replaced.size() != n_replaced);
}

}
diff --git a/lib/codegen/transform/trans.cpp b/lib/codegen/transform/trans.cpp
index 4edfa6a59..946fbb0fd 100644
--- a/lib/codegen/transform/trans.cpp
+++ b/lib/codegen/transform/trans.cpp
@@ -7,14 +7,42 @@ namespace codegen{
 namespace transform{
-ir::value* optimize_trans::replace_phi(ir::value* value,
-                                       ir::builder& builder,
-                                       const std::vector<ir::constant_int*> &perm){
+inline bool is_trans(ir::value *v){
+  auto *x = dynamic_cast<ir::trans_inst *>(v);
+  if(!x)
+    return false;
+  std::vector<ir::constant_int*> perm = x->get_perm();
+  std::vector<ir::constant_int*> ref;
+  ir::type *int32_ty = ir::type::get_int32_ty(v->get_type()->get_context());
+  for(size_t i = 0; i < perm.size(); i++)
+    ref.push_back(ir::constant_int::get(int32_ty, i));
+  std::swap(ref[0], ref[1]);
+  // true if perm == ref
+  return std::equal(perm.begin(), perm.end(), ref.begin());
+}
+
+inline bool is_hmma(ir::value *v){
+  bool result = false;
+  if(auto *x = dynamic_cast<ir::dot_inst*>(v)){
+    ir::value *a = x->get_operand(0);
+    ir::type *a_ty = a->get_type();
+    ir::value *b = x->get_operand(1);
+    ir::type *b_ty = b->get_type();
+    // inputs have to be FP16
+    result = a_ty->get_scalar_ty()->is_half_ty() && b_ty->get_scalar_ty()->is_half_ty();
+//    reduction has to be multiple of 4
+//    result = result && ((a_ty->get_tile_shapes()[1]->get_value() % 4) == 0);
+  }
+  return result;
+}
+
+ir::value* rewrite_trans_phi_impl(ir::value *value, ir::builder &builder,
+                                  const std::vector<ir::constant_int*>& perm) {
   if(auto phi = dynamic_cast<ir::phi_node*>(value)) {
     // transpose operands
     std::vector<ir::value*> incs;
     for(unsigned n = 0; n < phi->get_num_incoming(); n++)
-      incs.push_back(replace_phi(phi->get_incoming_value(n), builder, perm));
+      incs.push_back(rewrite_trans_phi_impl(phi->get_incoming_value(n), builder, perm));
     // create phi for transposed values
     builder.set_insert_point(phi);
     ir::phi_node* result = builder.create_phi(incs[0]->get_type(), incs.size());
@@ -31,43 +59,159 @@ ir::value* optimize_trans::replace_phi(ir::value* value,
     trans->set_operand(0, i);
     return trans;
   }
-  throw std::runtime_error("cannot transpose phi");
}
+bool peephole::rewrite_trans_phi(ir::instruction* value, ir::builder& builder) {
+  auto trans = dynamic_cast<ir::trans_inst*>(value);
+  if(!trans)
+    return false;
+  auto users = trans->get_users();
+  auto ops = trans->ops();
+  if(users.size() > 1 || ops.size() > 1)
+    return false;
+  ir::value* op = *ops.begin();
+  auto* phi = dynamic_cast<ir::phi_node*>(op);
+  if(!phi)
+    return false;
+  ir::value* new_phi = rewrite_trans_phi_impl(op, builder, trans->get_perm());
+  trans->replace_all_uses_with(new_phi);
+  return true;
+}
-void optimize_trans::run(ir::module &mod) {
-  ir::builder &builder = mod.get_builder();
-  // iterate
-  for(ir::function *fn: mod.get_function_list())
-  for(ir::basic_block *block: fn->blocks())
-  for(ir::instruction* i: block->get_inst_list()){
-    // transposition
-    if(auto trans = dynamic_cast<ir::trans_inst*>(i)) {
-      auto users = trans->get_users();
-      auto ops = trans->ops();
-      if(users.size() > 1 || ops.size() > 1)
-        continue;
-      ir::value* op = *ops.begin();
-      // todo: chains of transpositions
-      // trans(phi) -> phi(trans(), trans()...)
-      if(dynamic_cast<ir::phi_node*>(op)){
-        ir::value* new_phi = replace_phi(op, builder, trans->get_perm());
-        trans->replace_all_uses_with(new_phi);
+bool peephole::rewrite_dot(ir::instruction *value, ir::builder& builder){
+  if(auto dot = dynamic_cast<ir::dot_inst*>(value)){
+    builder.set_insert_point(value);
+    ir::value *A = dot->get_operand(0);
+    ir::value *B = dot->get_operand(1);
+    ir::value *D = dot->get_operand(2);
+    bool trans_a = is_trans(A);
+    bool trans_b = is_trans(B);
+    // NN
+    if(!dot->is_a_trans() && !dot->is_b_trans()){
+      if(is_hmma(dot)) {
+        ir::value *AA = A;
+        ir::value *BB = B;
+        if(trans_a){
+          AA = ((ir::trans_inst*)A)->get_operand(0);
+        }
+        else{
+          if(auto *T = dynamic_cast<ir::trans_inst*>(A)){
+            std::vector<ir::constant_int*> perm(T->get_perm());
+            std::swap(perm[0], perm[1]);
+            AA = builder.create_trans(T->get_operand(0), perm);
+            T->replace_all_uses_with(AA);
+            trans_a = true;
+          }
+        }
+        if(trans_b){
+          BB = ((ir::trans_inst*)B)->get_operand(0);
+        }
+        else{
+          if(auto *T = dynamic_cast<ir::trans_inst*>(B)){
+            std::vector<ir::constant_int*> perm(T->get_perm());
+            std::swap(perm[0], perm[1]);
+            BB = builder.create_trans(T->get_operand(0), perm);
+            T->replace_all_uses_with(BB);
+            trans_b = true;
+          }
+        }
+        ir::instruction *dot_atbt = builder.insert(ir::dot_inst::create(AA, BB, D, trans_a, trans_b));
+        dot->replace_all_uses_with(dot_atbt);
+        return true;
+      }
+      else{
+        // dot(op(a), trans(b))
+        if(trans_b){
+          ir::value* BB = ((ir::trans_inst*)B)->get_operand(0);
+          ir::instruction *NT = builder.insert(ir::dot_inst::create_nt(A, BB, D));
+          dot->replace_all_uses_with(NT);
+          return true;
+        }
+        // dot(op(a), b)
+        if(!trans_b){
+          // create permutations
+          size_t size = B->get_type()->get_tile_shapes().size();
+          std::vector<ir::constant_int*> perm(size);
+          ir::type *int32_ty = ir::type::get_int32_ty(B->get_type()->get_context());
+          for(size_t i = 0; i < size; i++)
+            perm[i] = ir::constant_int::get(int32_ty, i);
+          std::swap(perm[0], perm[1]);
+          // replace NN -> NT (trans)
+          ir::value* BB = builder.create_trans(B, perm);
+          ir::instruction *NT = builder.insert(ir::dot_inst::create_nt(A, BB, D));
+          dot->replace_all_uses_with(NT);
+          return true;
+        }
+      }
+    }
-    // reductions
-    if(auto x = dynamic_cast<ir::reduce_inst*>(i)) {
-      ir::constant_int *one = ir::constant_int::get(ir::type::get_int32_ty(i->get_type()->get_context()), 1);
-      ir::value *arg = x->get_operand(0);
-      auto shapes = arg->get_type()->get_tile_shapes();
-      if(shapes[x->get_axis()] == one){
-        builder.set_insert_point(x);
-        ir::value* new_red = builder.create_reshape(arg, x->get_type()->get_tile_shapes());
-        
x->replace_all_uses_with(new_red); - } - } - } + return false; +} + +bool peephole::rewrite_unit_red(ir::instruction *value, ir::builder& builder){ + auto x = dynamic_cast(value); + if(!x) + return false; + ir::constant_int *one = ir::constant_int::get(ir::type::get_int32_ty(value->get_type()->get_context()), 1); + ir::value *arg = x->get_operand(0); + auto shapes = arg->get_type()->get_tile_shapes(); + if(shapes[x->get_axis()] == one){ + builder.set_insert_point(x); + ir::value* new_red = builder.create_reshape(arg, x->get_type()->get_tile_shapes()); + x->replace_all_uses_with(new_red); + return true; + } + return false; +} + +bool peephole::rewrite_gep_ptr_min_off_plus_off(ir::instruction *value, ir::builder& builder) { + auto x = dynamic_cast(value); + if(!x) + return false; + auto y = dynamic_cast(x->get_pointer_operand()); + if(!y) + return false; + auto idx = *y->idx_begin(); + auto z = dynamic_cast(idx); + if(!z) + return false; + bool is_sub = z->get_op() == ir::binary_operator::llop::Sub; + auto *lhs = dynamic_cast(z->get_operand(0)); + bool is_lhs_0 = lhs && (lhs->get_value()==0); + bool is_rhs_eq_x_rhs = z->get_operand(1) == *x->idx_begin(); + if(is_sub && is_lhs_0 && is_rhs_eq_x_rhs){ + x->replace_all_uses_with(y->get_pointer_operand()); + return true; + } + return false; +} + + +void peephole::run(ir::module &mod) { + ir::builder &builder = mod.get_builder(); + // keep track of whether any modification was made + bool was_modified = false; + + // rewrite dots first + do{ + was_modified = false; + for(ir::function *fn: mod.get_function_list()) + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction* i: block->get_inst_list()) + rewrite_dot(i, builder); + }while(was_modified); + + // rewrite other ops + do{ + was_modified = false; + for(ir::function *fn: mod.get_function_list()) + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction* i: block->get_inst_list()){ + was_modified = was_modified || rewrite_trans_phi(i, builder); + was_modified = was_modified || rewrite_unit_red(i, builder); + was_modified = was_modified || rewrite_gep_ptr_min_off_plus_off(i, builder); + } + }while(was_modified); } } diff --git a/lib/dnn/conv.cpp b/lib/dnn/conv.cpp index 0f32455ea..63503c70f 100644 --- a/lib/dnn/conv.cpp +++ b/lib/dnn/conv.cpp @@ -363,7 +363,7 @@ void conv::enqueue_impl(driver::stream *stream, driver::kernel *kernel, std::vector args, runtime::launch_information info) { driver::buffer *a = args[0], *b = args[1], *c = args[2], *bias = args[3]; - unsigned TM = info.global_range_size[0], TN = info.global_range_size[1]; + unsigned TM = info.globals["TM"], TN = info.globals["TN"]; unsigned GZ = 1; set_arg(kernel, a, b, c, bias); std::array grid = {1}; diff --git a/lib/dnn/dot.cpp b/lib/dnn/dot.cpp index 30cec06a4..4ea355170 100644 --- a/lib/dnn/dot.cpp +++ b/lib/dnn/dot.cpp @@ -106,11 +106,9 @@ void dot::triton_c_src(std::ostream &os) const { std::string align_ldb_str = "multiple_of(" + std::to_string(align_ldb_) + ")"; std::string res = R"( -const tunable int TM = {8}; -const tunable int TN = {8}; +const tunable int TM = {128}; +const tunable int TN = {128}; const tunable int TK = {32}; -const tunable int GZ = {1}; - void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, restrict read_only align(16) )" + b_ty_ + R"( *B, @@ -127,18 +125,14 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, float xc[)" + XCS + R"(] = 0; )" + a_ty_ + R"(* pa[)" + AS + "] = A + rka" + bca0 + lda0 + " + rxa" + bca1 + lda1 + R"(; )" + b_ty_ + R"(* pb[)" + BS + "] = B + 
rkb" + bcb0 + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; - bool checka[)" + AS + R"(] = (rka < K))" + bca0 + " && (rxa < M)" + bca1 + R"(; - bool checkb[)" + BS + R"(] = (rkb < K))" + bcb0 + " && (ryb < N)" + bcb1 + R"(; - )" + a_ty_ + R"( a[)" + AS + R"(] = checka ? *pa : 0; - )" + b_ty_ + R"( b[)" + BS + R"(] = checkb ? *pb : 0; + )" + a_ty_ + R"( a[)" + AS + R"(] = *pa; + )" + b_ty_ + R"( b[)" + BS + R"(] = *pb; for(int k = K; k > 0; k = k - TK){ xc = dot()" + usea + ", " + useb + R"(, xc); pa = pa + TK)" + lda0 + R"(; pb = pb + TK)" + ldb0 + R"(; - bool checka[)" + AS + R"(] = k > TK; - bool checkb[)" + BS + R"(] = k > TK; - a = checka ? *pa : 0; - b = checkb ? *pb : 0; + a = *pa; + b = *pb; } int rxc[TM] = ridx * TM + (0 ... TM); int ryc[TN] = ridy * TN + (0 ... TN); diff --git a/lib/driver/backend.cpp b/lib/driver/backend.cpp index 6f98be75c..aa90fcdc4 100755 --- a/lib/driver/backend.cpp +++ b/lib/driver/backend.cpp @@ -48,20 +48,20 @@ void backend::platforms::init() { if(dispatch::cuinit()){ cache_.push_back(new cu_platform()); } - //if OpenCL is here - if(dispatch::clinit()){ - cl_uint num_platforms; - dispatch::clGetPlatformIDs(0, nullptr, &num_platforms); - std::vector ids(num_platforms); - dispatch::clGetPlatformIDs(num_platforms, ids.data(), nullptr); - for(cl_platform_id id: ids) - cache_.push_back(new cl_platform(id)); - } - //if host is here - bool host_visible = true; - if(host_visible){ - cache_.push_back(new host_platform()); - } +// //if OpenCL is here +// if(dispatch::clinit()){ +// cl_uint num_platforms; +// dispatch::clGetPlatformIDs(0, nullptr, &num_platforms); +// std::vector ids(num_platforms); +// dispatch::clGetPlatformIDs(num_platforms, ids.data(), nullptr); +// for(cl_platform_id id: ids) +// cache_.push_back(new cl_platform(id)); +// } +// //if host is here +// bool host_visible = true; +// if(host_visible){ +// cache_.push_back(new host_platform()); +// } if(cache_.empty()) throw std::runtime_error("Triton: No backend available. 
Make sure CUDA is available in your library path"); } diff --git a/lib/ir/constant.cpp b/lib/ir/constant.cpp index d28da5efe..6493f23b4 100644 --- a/lib/ir/constant.cpp +++ b/lib/ir/constant.cpp @@ -12,7 +12,7 @@ namespace ir{ constant *constant::get_null_value(type *ty) { context &ctx = ty->get_context(); - switch (ty->get_type_id()) { + switch (ty->get_scalar_ty()->get_type_id()) { case type::IntegerTyID: return constant_int::get(ty, 0); case type::HalfTyID: diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 7ae5b73ec..5b49e240e 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -147,13 +147,13 @@ binary_operator *binary_operator::create(op_t op, value *lhs, value *rhs, const } binary_operator *binary_operator::create_fneg(value *arg, const std::string &name, instruction *next){ - assert(arg->get_type()->is_floating_point_ty()); + assert(arg->get_type()->get_scalar_ty()->is_floating_point_ty()); value *zero = constant_fp::get_zero_value_for_negation(arg->get_type()); return binary_operator::create(llvm::Instruction::FSub, zero, arg, name, next); } binary_operator *binary_operator::create_neg(value *arg, const std::string &name, instruction *next){ - assert(arg->get_type()->is_integer_ty()); + assert(arg->get_type()->get_scalar_ty()->is_integer_ty()); value *zero = constant_fp::get_zero_value_for_negation(arg->get_type()); return binary_operator::create(llvm::Instruction::Sub, zero, arg, name, next); } diff --git a/lib/ir/type.cpp b/lib/ir/type.cpp index e192b7431..e07782ffd 100644 --- a/lib/ir/type.cpp +++ b/lib/ir/type.cpp @@ -73,6 +73,14 @@ const type::tile_shapes_t &type::get_tile_shapes() const { return ((tile_type*)this)->get_shapes(); } +unsigned type::get_tile_num_elements() const { + const tile_shapes_t& shapes = get_tile_shapes(); + unsigned result = 1; + for(ir::constant_int *x: shapes) + result *= x->get_value(); + return result; +} + // composite predicates bool type::is_int_or_tileint_ty() diff --git a/lib/ir/value.cpp b/lib/ir/value.cpp index b404e5eea..0797d0441 100644 --- a/lib/ir/value.cpp +++ b/lib/ir/value.cpp @@ -57,10 +57,8 @@ unsigned user::get_num_hidden() const { } void user::replace_all_uses_with(value *target) { - for(auto it = users_.begin(); it != users_.end();){ + for(auto it = users_.begin(); it != users_.end(); it++){ (*it)->replace_uses_of_with(this, target); - target->add_use(*it); - erase_use(*it++); } } @@ -68,6 +66,8 @@ void user::replace_uses_of_with(value *before, value *after) { for(size_t i = 0; i < ops_.size(); i++) if(ops_[i] == before) ops_[i] = after; + after->add_use(this); + erase_use(this); } } diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp index 4c7a030f8..ae9e1c783 100644 --- a/lib/runtime/jit.cpp +++ b/lib/runtime/jit.cpp @@ -82,10 +82,6 @@ void parallel_for_each(std::vector> const & iterates, std: std::unique_ptr jit::make_llvm_module(ir::module &module, passes_wrapper &passes, llvm::LLVMContext& llvm_context, launch_information& info) { llvm::Module* result = new llvm::Module(module.get_name(), llvm_context); passes.selection.run(module, *result); - // launch information - info.global_range_size.clear(); - for(unsigned i = 0; i < passes.tune.get_num_global_range(); i++) - info.global_range_size.push_back(passes.tune.get_global_range_size(i)); // add globals for(auto x: module.globals()) info.globals[x.first] = ((ir::metaparameter*)x.second)->get_value(); From 4bc5758a22a53f0ff6f30c747b209b5ae4747933 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 12 Aug 2019 21:09:47 -0700 Subject: 
[PATCH 292/494] [general] some cleaning: * trans/dot -> peephole * isel -> added function for tile-level lowering --- include/triton/codegen/selection/selection.h | 64 +- include/triton/codegen/transform/dot.h | 35 - .../codegen/transform/{trans.h => peephole.h} | 3 + include/triton/runtime/jit.h | 7 +- lib/codegen/selection/selection.cpp | 968 +++++++++--------- lib/codegen/transform/dot.cpp | 113 -- .../transform/{trans.cpp => peephole.cpp} | 182 ++-- 7 files changed, 660 insertions(+), 712 deletions(-) delete mode 100644 include/triton/codegen/transform/dot.h rename include/triton/codegen/transform/{trans.h => peephole.h} (71%) delete mode 100644 lib/codegen/transform/dot.cpp rename lib/codegen/transform/{trans.cpp => peephole.cpp} (59%) diff --git a/include/triton/codegen/selection/selection.h b/include/triton/codegen/selection/selection.h index 2bc49f72e..5c5c6ae2e 100644 --- a/include/triton/codegen/selection/selection.h +++ b/include/triton/codegen/selection/selection.h @@ -120,36 +120,66 @@ class selection{ typedef std::map vmap_t; typedef std::map tmap_t; + typedef llvm::LLVMContext LLVMContext; + typedef llvm::IRBuilder<> Builder; + typedef llvm::Type Type; + typedef llvm::Value Value; + typedef llvm::Module Module; + typedef llvm::Instruction Instruction; + typedef llvm::Constant Constant; + typedef llvm::ArrayType ArrayType; + typedef llvm::Function Function; + private: // utils - llvm::Type *make_vector_ty(llvm::Type *ty, size_t vector_size); + Type *make_vector_ty(Type *ty, size_t vector_size); std::vector extract_shapes(ir::value *v); // LLVM conversions - llvm::Type* llvm_type(ir::type *ty, llvm::LLVMContext &ctx); - llvm::Value* llvm_value(ir::value *v, llvm::IRBuilder<> &builder); - llvm::Instruction* llvm_inst(ir::instruction *inst, std::function value, llvm::IRBuilder<> &builder); - llvm::Constant* llvm_constant(ir::constant *cst, llvm::LLVMContext &ctx); - llvm::Value* llvm_alloc_const(ir::alloc_const *v, llvm::Module *module, llvm::IRBuilder<> &builder); - llvm::ArrayType* llvm_linearized_tile_type(ir::type *ty, llvm::LLVMContext &ctx); + Type* llvm_type(ir::type *ty, LLVMContext &ctx); + Value* llvm_value(ir::value *v, Builder &builder); + Instruction* llvm_inst(ir::instruction *inst, std::function value, Builder &builder); + Constant* llvm_constant(ir::constant *cst, LLVMContext &ctx); + Value* llvm_alloc_const(ir::alloc_const *v, Module *module, Builder &builder); + ArrayType* llvm_linearized_tile_type(ir::type *ty, LLVMContext &ctx); // grid construction void create_grids(std::vector &grids, std::map &references, ir::function *fn); - void create_tile(ir::value *v, llvm::IRBuilder<> &builder, const std::map &references, std::set &seen, llvm::Value *sh_mem_ptr); - void init_axes(ir::value *i, llvm::IRBuilder<> &builder, llvm::Value *u_thread_id, llvm::Value *u_warp_id); - void init_grids(ir::function *fn, llvm::IRBuilder<> &builder, llvm::Value *sh_mem_ptr); + void create_tile(ir::value *v, Builder &builder, const std::map &references, std::set &seen, Value *sh_mem_ptr); + void init_axes(ir::value *i, Builder &builder, Value *u_thread_id, Value *u_warp_id); + void init_grids(ir::function *fn, Builder &builder, Value *sh_mem_ptr); + + // lower scalar instruction + void lower_instruction(ir::instruction *src, Builder &builder); + // lower tile instruction + void lower_masked_store(ir::masked_store_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); + void lower_store(ir::store_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); + void 
lower_downcast(ir::downcast_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); + void lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); + void lower_dynamic_range_idx(ir::nv_dynamic_range_idx_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); + void lower_reshape(ir::reshape_inst* x, LLVMContext &ctx, Function *fn, Builder &builder); + void lower_splat(ir::splat_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); + void lower_broadcast(ir::broadcast_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); + void lower_vectorize(ir::vectorize_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); + void lower_copy_to_shared(ir::copy_to_shared_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); + void lower_trans(ir::trans_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); + void lower_hmma_dot(ir::dot_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); + void lower_scalar_dot(ir::dot_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); + void lower_dot(ir::dot_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); + void lower_masked_load(ir::masked_load_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); + void lower_load(ir::load_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); + void lower_elementwise(ir::instruction *x, LLVMContext &ctx, Function *fn, Builder &builder); + void lower_tile_instruction(ir::instruction *src, Builder &builder); + - // lowering - void lower_instruction(ir::instruction *src, llvm::IRBuilder<> &builder); - void lower_tile_instruction(ir::instruction *src, llvm::IRBuilder<> &builder); public: selection(analysis::shmem::allocation *alloc, analysis::tune *params, analysis::shmem::info *buffer_info, analysis::alignment_info *ax_info, target *tgt) : alloc_(alloc), params_(params), buffer_info_(buffer_info), axis_info_(ax_info), tgt_(tgt){ } - void run(ir::module &src, llvm::Module &dst); + void run(ir::module &src, Module &dst); private: vmap_t vmap_; @@ -160,9 +190,9 @@ private: analysis::shmem::info *buffer_info_; analysis::alignment_info *axis_info_; std::map axes_; - llvm::Value *sh_mem_ptr_; - llvm::Value *offset_a_i_, *offset_a_k_; - llvm::Value *offset_b_j_, *offset_b_k_; + Value *sh_mem_ptr_; + Value *offset_a_i_, *offset_a_k_; + Value *offset_b_j_, *offset_b_k_; unsigned num_packs_0_, num_packs_1_; unsigned pack_size_0_, pack_size_1_; }; diff --git a/include/triton/codegen/transform/dot.h b/include/triton/codegen/transform/dot.h deleted file mode 100644 index 15612e2f0..000000000 --- a/include/triton/codegen/transform/dot.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef TDL_INCLUDE_CODEGEN_OPTIMIZE_DOT_H -#define TDL_INCLUDE_CODEGEN_OPTIMIZE_DOT_H - -#include -#include -#include - -namespace triton { - -namespace ir { - class module; -} - -namespace codegen{ - -namespace analysis{ -class tune; -} - -namespace transform{ - -class optimize_dot { -public: - optimize_dot(analysis::tune* params): params_(params) {} - void run(ir::module &mod); - -private: - analysis::tune* params_; -}; - -} -} -} - -#endif diff --git a/include/triton/codegen/transform/trans.h b/include/triton/codegen/transform/peephole.h similarity index 71% rename from include/triton/codegen/transform/trans.h rename to include/triton/codegen/transform/peephole.h index 73eedd6e4..acd11ecd6 100644 --- a/include/triton/codegen/transform/trans.h +++ b/include/triton/codegen/transform/peephole.h @@ -14,6 +14,7 @@ namespace ir { class trans_inst; class builder; class constant_int; + 
class dot_inst; } namespace codegen{ @@ -22,6 +23,8 @@ namespace transform{ class peephole { private: bool rewrite_trans_phi(ir::instruction* value, ir::builder &builder); + bool rewrite_dot_fp32(ir::dot_inst *dot, ir::builder& builder, bool trans_a, bool trans_b, ir::value *A, ir::value *B, ir::value *D); + bool rewrite_dot_hmma(ir::dot_inst *dot, ir::builder& builder, bool trans_a, bool trans_b, ir::value *A, ir::value *B, ir::value *D); bool rewrite_dot(ir::instruction *value, ir::builder& builder); bool rewrite_unit_red(ir::instruction *value, ir::builder& builder); bool rewrite_gep_ptr_min_off_plus_off(ir::instruction *value, ir::builder& builder); diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index 20cf39691..869d68ac5 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -15,9 +15,8 @@ #include "triton/codegen/analysis/shmem/liveness.h" #include "triton/codegen/analysis/shmem/info.h" #include "triton/codegen/analysis/alignment.h" -#include "triton/codegen/transform/dot.h" #include "triton/codegen/transform/dce.h" -#include "triton/codegen/transform/trans.h" +#include "triton/codegen/transform/peephole.h" #include "triton/codegen/transform/shmem/barriers.h" #include "triton/codegen/transform/reassociate.h" #include "triton/codegen/transform/vectorize.h" @@ -64,7 +63,6 @@ public: shmem_barriers(&shmem_allocation, &shmem_info), vectorize(&tune), selection(&shmem_allocation, &tune, &shmem_info, &alignment_info, target), - optimize_dot(&tune), dce(), peephole(), alignment_info(), @@ -72,7 +70,6 @@ public: target_(target) { } void target_independent(ir::module &module) { - ir::print(module, std::cout); peephole.run(module); dce.run(module); } @@ -89,7 +86,6 @@ public: alignment_info.run(module); vectorize.run(module); dce.run(module); - ir::print(module, std::cout); } codegen::selection selection; @@ -100,7 +96,6 @@ public: codegen::analysis::alignment_info alignment_info; codegen::transform::shmem_barriers shmem_barriers; codegen::transform::vectorize vectorize; - codegen::transform::optimize_dot optimize_dot; codegen::transform::dce dce; codegen::transform::peephole peephole; codegen::transform::reassociate reassociate; diff --git a/lib/codegen/selection/selection.cpp b/lib/codegen/selection/selection.cpp index d76ea5071..96b25539a 100644 --- a/lib/codegen/selection/selection.cpp +++ b/lib/codegen/selection/selection.cpp @@ -798,30 +798,22 @@ void selection::init_grids(ir::function *fn, IRBuilder<> &builder, Value *sh_mem } } - -void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> &builder) { - BasicBlock *block = builder.GetInsertBlock(); - Module *module = block->getModule(); - LLVMContext &ctx = builder.getContext(); - Function *fn = block->getParent(); - // store - if(auto *x = dynamic_cast(ins)){ - distributed_tile* ptrs = (distributed_tile*)tmap_.at(x->get_pointer_operand()); - distributed_tile* scalars = (distributed_tile*)tmap_.at(x->get_value_operand()); - ir::value *mask = x->get_mask_operand(); - distributed_tile* preds = (distributed_tile*)tmap_.at(mask); - ptrs->for_each([&](indices_t idx){ - Value *scalar = scalars->get_value(idx); - Value *ptr = ptrs->get_value(idx); - Value *pred = preds->get_value(idx); - BasicBlock *mask_then_bb = BasicBlock::Create(ctx, "mask_then", fn); - BasicBlock *mask_done_bb = BasicBlock::Create(ctx, "mask_done", fn); - builder.CreateCondBr(pred, mask_then_bb, mask_done_bb); - builder.SetInsertPoint(mask_then_bb); - builder.CreateStore(scalar, ptr); - 
builder.CreateBr(mask_done_bb); - builder.SetInsertPoint(mask_done_bb); - +void selection::lower_masked_store(ir::masked_store_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { + distributed_tile* ptrs = (distributed_tile*)tmap_.at(x->get_pointer_operand()); + distributed_tile* scalars = (distributed_tile*)tmap_.at(x->get_value_operand()); + ir::value *mask = x->get_mask_operand(); + distributed_tile* preds = (distributed_tile*)tmap_.at(mask); + ptrs->for_each([&](indices_t idx){ + Value *scalar = scalars->get_value(idx); + Value *ptr = ptrs->get_value(idx); + Value *pred = preds->get_value(idx); + BasicBlock *mask_then_bb = BasicBlock::Create(ctx, "mask_then", fn); + BasicBlock *mask_done_bb = BasicBlock::Create(ctx, "mask_done", fn); + builder.CreateCondBr(pred, mask_then_bb, mask_done_bb); + builder.SetInsertPoint(mask_then_bb); + builder.CreateStore(scalar, ptr); + builder.CreateBr(mask_done_bb); + builder.SetInsertPoint(mask_done_bb); // std::string offset = ""; // if(GetElementPtrInst *gep = dyn_cast(ptr)) // if(gep->getNumIndices() == 1) @@ -832,356 +824,229 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & // std::string asm_str = "@$0 st.global.b32 [$1" + offset + "], $2;"; // InlineAsm *iasm = InlineAsm::get(ty, asm_str, "b,l,f", true); // builder.CreateCall(iasm, {pred, ptr, scalar}); - }); + }); +} + +void selection::lower_store(ir::store_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { + distributed_tile* ptrs = (distributed_tile*)tmap_.at(x->get_pointer_operand()); + tile *scalars = tmap_.at(x->get_value_operand()); + ptrs->for_each([&](indices_t idx){ + builder.CreateStore(scalars->get_value(idx), ptrs->get_value(idx)); + }); +} + +void selection::lower_downcast(ir::downcast_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { + vmap_[x] = tmap_[x->get_operand(0)]->get_value({builder.getInt32(0)}); +} + +void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { + ir::instruction *ins = (ir::instruction*)x; + Module *module = fn->getParent(); + std::map partial; + ir::value *op = x->get_operand(0); + distributed_tile* op_tile = (distributed_tile*)tmap_.at(op); + unsigned axis = x->get_axis(); + + // reduce within thread + op_tile->for_each([&](indices_t idx) { + indices_t pidx = idx; + pidx.erase(pidx.begin() + axis); + Value *current = op_tile->get_value(idx); + // current partial result is not initialized -- create + if(partial.find(pidx) == partial.end()) + partial[pidx] = current; + // current partial result is initialized -- accumulate + else + partial[pidx] = builder.CreateFAdd(partial[pidx], current); + }); + + // reduce within blocks + unsigned addr_space = sh_mem_ptr_->getType()->getPointerAddressSpace(); + Type *res_ty = builder.getFloatTy(); + Value *base_ptr = builder.CreateBitCast(sh_mem_ptr_, PointerType::get(res_ty, addr_space)); + for(auto& x: partial) { + // current element being computed + Value *lane = axes_.at(params_->get_param_group(op, axis)).thread_id; + Value *&result = x.second; + indices_t write_idx = x.first; + write_idx.insert(write_idx.begin() + axis, lane); + + // shared memory write pointer + Value *write_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), write_idx); + Value *write_ptr = builder.CreateGEP(base_ptr, write_offset); + + // initialize shared memory + tgt_->add_barrier(module, builder); + builder.CreateStore(result, write_ptr); + // build result + unsigned depth = params_->get_param(op, "wpt.d" 
+ std::to_string(axis))->get_value(); + for(unsigned i = depth/2; i > 0; i >>= 1){ + // current indices + indices_t current(write_idx.size(), builder.getInt32(0)); + current[axis] = builder.getInt32(i); + // shared memory offset + Value *read_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), current); + Value *is_active = builder.CreateICmpULT(lane, builder.getInt32(i)); + read_offset = builder.CreateSelect(is_active, read_offset, builder.getInt32(0)); + // shared memory read pointer + Value *read_ptr = builder.CreateGEP(write_ptr, read_offset); + tgt_->add_barrier(module, builder); + Value *next = builder.CreateLoad(read_ptr); + // accumulate + result = builder.CreateFAdd(result, next); + // write back + builder.CreateStore(result, write_ptr); + } + + // result is on the first lane of shared memory + indices_t final = write_idx; + final[axis] = builder.getInt32(0); + Value *read_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), final); + Value *read_ptr = builder.CreateGEP(base_ptr, read_offset); + tgt_->add_barrier(module, builder); + result = builder.CreateLoad(read_ptr); + if(tmap_.find(ins) == tmap_.end()) + vmap_[ins] = result; + else{ + distributed_tile *ti = (distributed_tile*)tmap_[ins]; + ti->set_value(x.first, result); + } } - else if(auto *x = dynamic_cast(ins)) { - distributed_tile* ptrs = (distributed_tile*)tmap_.at(x->get_pointer_operand()); - tile *scalars = tmap_.at(x->get_value_operand()); - ptrs->for_each([&](indices_t idx){ - builder.CreateStore(scalars->get_value(idx), ptrs->get_value(idx)); - }); - } - else { - if(auto *x = dynamic_cast(ins)){ - vmap_[x] = tmap_[x->get_operand(0)]->get_value({builder.getInt32(0)}); - return; - } - if(auto *x = dynamic_cast(ins)){ - std::map partial; - ir::value *op = ins->get_operand(0); - distributed_tile* op_tile = (distributed_tile*)tmap_.at(op); - unsigned axis = x->get_axis(); +} - // reduce within thread - op_tile->for_each([&](indices_t idx) { - indices_t pidx = idx; - pidx.erase(pidx.begin() + axis); - Value *current = op_tile->get_value(idx); - // current partial result is not initialized -- create - if(partial.find(pidx) == partial.end()) - partial[pidx] = current; - // current partial result is initialized -- accumulate - else - partial[pidx] = builder.CreateFAdd(partial[pidx], current); - }); +void selection::lower_dynamic_range_idx(ir::nv_dynamic_range_idx_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { + distributed_tile* result = (distributed_tile*)tmap_.at(x); + result->for_each([&](indices_t idx){ + assert(idx.size() == 1); + BinaryOperator *bin_add = dyn_cast(idx[0]); + assert(bin_add); + Value *res = bin_add->getOperand(0); + result->set_value(idx, res); + }); +} - // reduce within blocks - unsigned addr_space = sh_mem_ptr_->getType()->getPointerAddressSpace(); - Type *res_ty = builder.getFloatTy(); - Value *base_ptr = builder.CreateBitCast(sh_mem_ptr_, PointerType::get(res_ty, addr_space)); - for(auto& x: partial) { - // current element being computed - Value *lane = axes_.at(params_->get_param_group(op, axis)).thread_id; - Value *&result = x.second; - indices_t write_idx = x.first; - write_idx.insert(write_idx.begin() + axis, lane); +void selection::lower_reshape(ir::reshape_inst* x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { + distributed_tile* result = (distributed_tile*)tmap_.at(x); + ir::value* in = x->get_operand(0); + distributed_tile *in_tile = (distributed_tile*)tmap_.at(in); + result->for_each([&](indices_t out_idx){ + unsigned pos = 
result->get_linear_index(out_idx); + indices_t in_idx = in_tile->get_ordered_indices(pos); + result->set_value(out_idx, in_tile->get_value(in_idx)); + }); +} - // shared memory write pointer - Value *write_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), write_idx); - Value *write_ptr = builder.CreateGEP(base_ptr, write_offset); +void selection::lower_splat(ir::splat_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { + distributed_tile* result = (distributed_tile*)tmap_.at(x); + result->for_each([&](indices_t idx) { + result->set_value(idx, llvm_value(x->get_operand(0), builder)); + }); +} - // initialize shared memory - tgt_->add_barrier(module, builder); - builder.CreateStore(result, write_ptr); - // build result - unsigned depth = params_->get_param(op, "wpt.d" + std::to_string(axis))->get_value(); - for(unsigned i = depth/2; i > 0; i >>= 1){ - // current indices - indices_t current(write_idx.size(), builder.getInt32(0)); - current[axis] = builder.getInt32(i); - // shared memory offset - Value *read_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), current); - Value *is_active = builder.CreateICmpULT(lane, builder.getInt32(i)); - read_offset = builder.CreateSelect(is_active, read_offset, builder.getInt32(0)); - // shared memory read pointer - Value *read_ptr = builder.CreateGEP(write_ptr, read_offset); - tgt_->add_barrier(module, builder); - Value *next = builder.CreateLoad(read_ptr); - // accumulate - result = builder.CreateFAdd(result, next); - // write back - builder.CreateStore(result, write_ptr); - } +void selection::lower_broadcast(ir::broadcast_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { + distributed_tile* result = (distributed_tile*)tmap_.at(x); + ir::value* in = x->get_operand(0); + const auto& in_shapes = in->get_type()->get_tile_shapes(); + distributed_tile *in_tile = (distributed_tile*)tmap_.at(in); + result->for_each([&](indices_t out_idx){ + indices_t in_idx = out_idx; + for(size_t k = 0; k < in_idx.size(); k++){ + if(in_shapes[k]->get_value() == 1) + in_idx[k] = builder.getInt32(0); + } + result->set_value(out_idx, in_tile->get_value(in_idx)); + }); +} - // result is on the first lane of shared memory - indices_t final = write_idx; - final[axis] = builder.getInt32(0); - Value *read_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), final); - Value *read_ptr = builder.CreateGEP(base_ptr, read_offset); - tgt_->add_barrier(module, builder); - result = builder.CreateLoad(read_ptr); - if(tmap_.find(ins) == tmap_.end()) - vmap_[ins] = result; - else{ - distributed_tile *ti = (distributed_tile*)tmap_[ins]; - ti->set_value(x.first, result); - } - } - return; - } - tile *ti = tmap_[ins]; - distributed_tile* result = (distributed_tile*)ti; - if(!ins->get_type()->is_tile_ty()) - return; - const auto& shapes = ins->get_type()->get_tile_shapes(); - // nv_dynamic_range_idx_inst - if(dynamic_cast(ins)){ - result->for_each([&](indices_t idx){ - assert(idx.size() == 1); - BinaryOperator *bin_add = dyn_cast(idx[0]); - assert(bin_add); - Value *res = bin_add->getOperand(0); - result->set_value(idx, res); - }); - } - // reshape - else if(dynamic_cast(ins)) { - ir::value* in = ins->get_operand(0); - distributed_tile *in_tile = (distributed_tile*)tmap_.at(in); - result->for_each([&](indices_t out_idx){ - unsigned pos = result->get_linear_index(out_idx); - indices_t in_idx = in_tile->get_ordered_indices(pos); - result->set_value(out_idx, in_tile->get_value(in_idx)); - }); - } - // splat - else 
if(dynamic_cast(ins)) { - result->for_each([&](indices_t idx) { - result->set_value(idx, llvm_value(ins->get_operand(0), builder)); - }); - } - // broadcast - else if(dynamic_cast(ins)) { - ir::value* in = ins->get_operand(0); - const auto& in_shapes = in->get_type()->get_tile_shapes(); - distributed_tile *in_tile = (distributed_tile*)tmap_.at(in); - result->for_each([&](indices_t out_idx){ - indices_t in_idx = out_idx; - for(size_t k = 0; k < in_idx.size(); k++){ - if(in_shapes[k]->get_value() == 1) - in_idx[k] = builder.getInt32(0); - } - result->set_value(out_idx, in_tile->get_value(in_idx)); - }); - } - // vectorize - else if(dynamic_cast(ins)) { - distributed_tile* in = (distributed_tile*)tmap_.at(ins->get_operand(0)); - unsigned vector_size = result->axis(0).contiguous; - std::map packets; - in->for_each([&](indices_t idx){ - unsigned linear = in->get_linear_index(idx); - unsigned id = linear / vector_size; - Value *in_value = in->get_value(idx); - if(linear % vector_size == 0) - packets[id] = UndefValue::get(VectorType::get(in_value->getType(), vector_size)); - packets[id] = builder.CreateInsertElement(packets.at(id), in_value, linear % vector_size); - }); - result->for_each([&](indices_t idx){ - unsigned linear = in->get_linear_index(idx); - unsigned id = linear / vector_size; - if(linear % vector_size == 0) - result->set_value(idx, packets[id]); - }); - } - // copy to shared - else if(dynamic_cast(ins)) { - distributed_tile* in = (distributed_tile*)tmap_.at(ins->get_operand(0)); - in->for_each([&](indices_t idx){ - ti->set_value(idx, in->get_value(idx)); - }); - } - // trans - else if(auto* x = dynamic_cast(ins)) { - distributed_tile* in = (distributed_tile*)tmap_.at(ins->get_operand(0)); - auto perm = x->get_perm(); - in->for_each([&](indices_t idx){ - indices_t out_idx(idx.size()); - for(size_t i = 0; i < idx.size(); i++) - out_idx[i] = idx[perm[i]->get_value()]; - ti->set_value(out_idx, in->get_value(idx)); - }); - } - else if(buffer_info_->is_shared(ins)) - return; - // dot - else if(auto dot = dynamic_cast(ins)) { - ir::value *A = ins->get_operand(0); - ir::value *B = ins->get_operand(1); - ir::value *C = ins->get_operand(2); - bool AT = dot->is_a_trans(); - bool BT = dot->is_b_trans(); - distributed_tile *TC = (distributed_tile*)tmap_.at(C); - Type *c_ty = llvm_type(C->get_type()->get_scalar_ty(), ctx); - Function *f_mul_add = Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {c_ty}); - auto A_shapes = A->get_type()->get_tile_shapes(); - size_t red_axis = dot->is_a_trans() ? 
0 : 1; - unsigned NK = A_shapes[red_axis]->get_value(); +void selection::lower_vectorize(ir::vectorize_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { + distributed_tile* result = (distributed_tile*)tmap_.at(x); + distributed_tile* in = (distributed_tile*)tmap_.at(x->get_operand(0)); + unsigned vector_size = result->axis(0).contiguous; + std::map packets; + in->for_each([&](indices_t idx){ + unsigned linear = in->get_linear_index(idx); + unsigned id = linear / vector_size; + Value *in_value = in->get_value(idx); + if(linear % vector_size == 0) + packets[id] = UndefValue::get(VectorType::get(in_value->getType(), vector_size)); + packets[id] = builder.CreateInsertElement(packets.at(id), in_value, linear % vector_size); + }); + result->for_each([&](indices_t idx){ + unsigned linear = in->get_linear_index(idx); + unsigned id = linear / vector_size; + if(linear % vector_size == 0) + result->set_value(idx, packets[id]); + }); +} + +void selection::lower_copy_to_shared(ir::copy_to_shared_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { + shared_tile* result = (shared_tile*)tmap_.at(x); + distributed_tile* in = (distributed_tile*)tmap_.at(x->get_operand(0)); + in->for_each([&](indices_t idx){ + result->set_value(idx, in->get_value(idx)); + }); +} + +void selection::lower_trans(ir::trans_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { + shared_tile* result = (shared_tile*)tmap_.at(x); + distributed_tile* in = (distributed_tile*)tmap_.at(x->get_operand(0)); + auto perm = x->get_perm(); + in->for_each([&](indices_t idx){ + indices_t out_idx(idx.size()); + for(size_t i = 0; i < idx.size(); i++) + out_idx[i] = idx[perm[i]->get_value()]; + result->set_value(out_idx, in->get_value(idx)); + }); +} + +void selection::lower_hmma_dot(ir::dot_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { + +} + +void selection::lower_scalar_dot(ir::dot_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { + +} + +void selection::lower_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { + const auto& shapes = dot->get_type()->get_tile_shapes(); + distributed_tile* result = (distributed_tile*)tmap_.at(dot); + Module *module = fn->getParent(); + ir::value *A = dot->get_operand(0); + ir::value *B = dot->get_operand(1); + ir::value *C = dot->get_operand(2); + bool AT = dot->is_a_trans(); + bool BT = dot->is_b_trans(); + distributed_tile *TC = (distributed_tile*)tmap_.at(C); + Type *c_ty = llvm_type(C->get_type()->get_scalar_ty(), ctx); + Function *f_mul_add = Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {c_ty}); + auto A_shapes = A->get_type()->get_tile_shapes(); + size_t red_axis = dot->is_a_trans() ? 
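+  // the reduction (K) axis of A is 0 when A is transposed and 1 otherwise; NK is
+  // its extent, and NK == 1 falls through to the outer-product path further down.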
0 : 1; + unsigned NK = A_shapes[red_axis]->get_value(); // std::cout << red_axis << " " << NK << std::endl; - if(NK != 1) - { - shared_tile *TA = (shared_tile*)tmap_.at(A); - shared_tile *TB = (shared_tile*)tmap_.at(B); - if(params_->get_fragment(ins, 0) == analysis::tune::STRIDED_SCAN) { - TA->set_vector_size(TC->axis(0).contiguous); - TB->set_vector_size(TC->axis(1).contiguous); - result->for_each([&](indices_t idx){ - Value *res = TC->get_value(idx); - for(unsigned K = 0; K < NK; ++K){ - // input indices - indices_t a_idx = {idx[0], builder.getInt32(K)}; - indices_t b_idx = {builder.getInt32(K), idx[1]}; - if(AT) - std::swap(a_idx[0], a_idx[1]); - if(BT) - std::swap(b_idx[0], b_idx[1]); - // add batching dimension - for(size_t i = 2; i < idx.size(); i++){ - a_idx.insert(a_idx.end(), idx[i]); - b_idx.insert(b_idx.end(), idx[i]); - } - // load value - Value *a = TA->get_value(a_idx); - Value *b = TB->get_value(b_idx); - if(a->getType() != c_ty) - a = builder.CreateFPCast(a, c_ty); - if(b->getType() != c_ty) - b = builder.CreateFPCast(b, c_ty); - res = builder.CreateCall(f_mul_add, {a, b, res}); - } - result->set_value(idx, res); - }); - } - else { - TA->set_vector_size(4*pack_size_0_); - TB->set_vector_size(4*pack_size_1_); - TA->set_return_mode(true); - TB->set_return_mode(true); - - std::map, std::vector> fcs; - - result->for_each([&](indices_t idx){ - std::vector key(idx.size() - 2); - std::copy(idx.begin() + 2, idx.end(), key.begin()); - fcs[key].push_back(TC->get_value(idx)); - }); - - Type *fp32_ty = builder.getFloatTy(); - Type *fp16x2_ty = VectorType::get(builder.getHalfTy(), 2); - Type *fp32_pack8_ty = StructType::get(ctx, {fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}); - FunctionType *mma_ty = FunctionType::get(fp32_pack8_ty, {fp16x2_ty, fp16x2_ty, fp16x2_ty, fp16x2_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}, false); - - Value *offset_a_i = offset_a_i_; - Value *offset_a_k = offset_a_k_; - Value *offset_b_j = offset_b_j_; - Value *offset_b_k = offset_b_k_; - - Value* u_thread_id = tgt_->get_local_id(builder.GetInsertBlock()->getModule(), builder, 0); - if(dot->is_a_trans()){ - offset_a_i = builder.CreateAdd(offset_a_i, builder.CreateURem(u_thread_id, builder.getInt32(4))); - offset_a_k = builder.getInt32(0); - } - if(!dot->is_b_trans()){ - offset_b_j = builder.CreateAdd(offset_b_j, builder.CreateURem(u_thread_id, builder.getInt32(4))); - offset_b_k = builder.getInt32(0); - } - - std::string op_a = dot->is_a_trans() ? "row" : "col"; - std::string op_b = dot->is_b_trans() ? "row" : "col"; - - InlineAsm *mma_fn = InlineAsm::get(mma_ty, " mma.sync.aligned.m8n8k4." + op_a + "." 
+ op_b + ".f32.f16.f16.f32 " - "{$0, $1, $2, $3, $4, $5, $6, $7}, " - "{$8, $9}, " - "{$10, $11}, " - "{$0, $1, $2, $3, $4, $5, $6, $7};", "=f,=f,=f,=f,=f,=f,=f,=f,r,r,r,r,0,1,2,3,4,5,6,7", false); - - unsigned fpw_0 = params_->get_param(dot, "fpw.d0")->get_value(); - unsigned fpw_1 = params_->get_param(dot, "fpw.d1")->get_value(); - unsigned wts_0 = fpw_0 * 8; - unsigned wts_1 = fpw_1 * 8; - unsigned wpt_0 = params_->get_param(dot, "wpt.d0")->get_value(); - unsigned wpt_1 = params_->get_param(dot, "wpt.d1")->get_value(); - unsigned stride_rep_i = wpt_0 * wts_0; - unsigned stride_rep_j = wpt_1 * wts_1; - unsigned num_rep_i = shapes[0]->get_value() / stride_rep_i; - unsigned ld_fc = num_rep_i * 2; - - - for(auto& x: fcs){ - std::vector& fc = x.second; - for(unsigned pack_i = 0; pack_i < num_packs_0_; pack_i++) - for(unsigned pack_j = 0; pack_j < num_packs_1_; pack_j++){ - for(unsigned K = 0; K < NK; K += 4){ - Value *_K = builder.getInt32(K); - Value *current_offset_a_i = builder.CreateAdd(offset_a_i, builder.getInt32(pack_i*stride_rep_i*pack_size_0_)); - Value *current_offset_b_i = builder.CreateAdd(offset_b_j, builder.getInt32(pack_j*stride_rep_j*pack_size_1_)); - indices_t idx_a = {current_offset_a_i, builder.CreateAdd(offset_a_k, _K)}; - indices_t idx_b = {current_offset_b_i, builder.CreateAdd(offset_b_k, _K)}; - if(dot->is_a_trans()) - std::swap(idx_a[0], idx_a[1]); - if(!dot->is_b_trans()) - std::swap(idx_b[0], idx_b[1]); - idx_a.insert(idx_a.end(), x.first.begin(), x.first.end()); - idx_b.insert(idx_b.end(), x.first.begin(), x.first.end()); - Value *ha = TA->get_value(idx_a); - Value *hb = TB->get_value(idx_b); - for(unsigned ii = 0; ii < pack_size_0_; ii++) - for(unsigned jj = 0; jj < pack_size_1_; jj++){ - Value *ha0 = builder.CreateBitCast(builder.CreateExtractElement(ha, builder.getInt32(ii*pack_size_0_ + 0)), fp16x2_ty); - Value *ha1 = builder.CreateBitCast(builder.CreateExtractElement(ha, builder.getInt32(ii*pack_size_0_ + 1)), fp16x2_ty); - Value *hb0 = builder.CreateBitCast(builder.CreateExtractElement(hb, builder.getInt32(jj*pack_size_0_ + 0)), fp16x2_ty); - Value *hb1 = builder.CreateBitCast(builder.CreateExtractElement(hb, builder.getInt32(jj*pack_size_0_ + 1)), fp16x2_ty); - std::vector idx = { - (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 0)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 1)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 0)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 1)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 2)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 3)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 2)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 3)*ld_fc - }; - Value *nc = builder.CreateCall(mma_fn, {ha0, ha1, hb0, hb1, fc[idx[0]], fc[idx[1]], fc[idx[2]], fc[idx[3]], fc[idx[4]], fc[idx[5]], fc[idx[6]], fc[idx[7]]}); - fc[idx[0]] = builder.CreateExtractValue(nc, {0}); - fc[idx[1]] = builder.CreateExtractValue(nc, {1}); - fc[idx[2]] = builder.CreateExtractValue(nc, {2}); - fc[idx[3]] = builder.CreateExtractValue(nc, {3}); - fc[idx[4]] = builder.CreateExtractValue(nc, {4}); - fc[idx[5]] = builder.CreateExtractValue(nc, {5}); - fc[idx[6]] = builder.CreateExtractValue(nc, {6}); - fc[idx[7]] = builder.CreateExtractValue(nc, {7}); - } - } - } - } - - // write back - unsigned i = 0; - 
result->for_each([&](indices_t idx){ - std::vector key(idx.size() - 2); - std::copy(idx.begin() + 2, idx.end(), key.begin()); - if(i >= fcs.at(key).size()) - i = 0; - result->set_value(idx, fcs.at(key)[i++]); - }); - - TA->set_return_mode(false); - TB->set_return_mode(false); - } - } - else - { - distributed_tile *TA = (distributed_tile*)tmap_.at(A); - distributed_tile *TB = (distributed_tile*)tmap_.at(B); - result->for_each([&](indices_t idx){ - Value *res = TC->get_value(idx); - indices_t a_idx = {idx[0], builder.getInt32(0)}; - indices_t b_idx = {builder.getInt32(0), idx[1]}; + if(NK != 1) + { + shared_tile *TA = (shared_tile*)tmap_.at(A); + shared_tile *TB = (shared_tile*)tmap_.at(B); + if(params_->get_fragment(dot, 0) == analysis::tune::STRIDED_SCAN) { + TA->set_vector_size(TC->axis(0).contiguous); + TB->set_vector_size(TC->axis(1).contiguous); + result->for_each([&](indices_t idx){ + Value *res = TC->get_value(idx); + for(unsigned K = 0; K < NK; ++K){ + // input indices + indices_t a_idx = {idx[0], builder.getInt32(K)}; + indices_t b_idx = {builder.getInt32(K), idx[1]}; if(AT) std::swap(a_idx[0], a_idx[1]); if(BT) std::swap(b_idx[0], b_idx[1]); + // add batching dimension + for(size_t i = 2; i < idx.size(); i++){ + a_idx.insert(a_idx.end(), idx[i]); + b_idx.insert(b_idx.end(), idx[i]); + } + // load value Value *a = TA->get_value(a_idx); Value *b = TB->get_value(b_idx); if(a->getType() != c_ty) @@ -1189,54 +1054,196 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & if(b->getType() != c_ty) b = builder.CreateFPCast(b, c_ty); res = builder.CreateCall(f_mul_add, {a, b, res}); - result->set_value(idx, res); - }); - } + } + result->set_value(idx, res); + }); } - else if(auto *ld = dynamic_cast(ins)){ - // find vector size - ir::value *ptr = ld->get_pointer_operand(); - unsigned starting_multiple = axis_info_->get_starting_multiple(ptr); - unsigned max_contiguous = axis_info_->get_max_contiguous(ptr); - unsigned alignment = std::min(starting_multiple, max_contiguous); - unsigned vector_size = std::min(result->axis(0).contiguous, alignment); - distributed_tile *pointers = (distributed_tile*)tmap_.at(ptr); - distributed_tile *masks = (distributed_tile*)tmap_.at(ld->get_mask_operand()); - distributed_tile *false_values = (distributed_tile*)tmap_.at(ld->get_false_value_operand()); - std::map packets; - result->for_each([&](indices_t idx){ - unsigned linear = result->get_linear_index(idx); - unsigned id = linear / vector_size; - if(linear % vector_size == 0) { - Value *ptr = pointers->get_value(idx); - ConstantInt *cst = nullptr; - if(GetElementPtrInst *gep = dyn_cast(ptr)) - if(gep->getNumIndices() == 1){ - cst = dyn_cast(gep->idx_begin()); - } + else { + TA->set_vector_size(4*pack_size_0_); + TB->set_vector_size(4*pack_size_1_); + TA->set_return_mode(true); + TB->set_return_mode(true); - ptr = builder.CreateBitCast(ptr, PointerType::get(VectorType::get(result->get_ty(), vector_size), - ptr->getType()->getPointerAddressSpace())); - Value *mask = masks->get_value(idx); - BasicBlock *current_bb = builder.GetInsertBlock(); - BasicBlock *mask_then_bb = BasicBlock::Create(ctx, "mask_then", fn); - BasicBlock *mask_done_bb = BasicBlock::Create(ctx, "mask_done", fn); - builder.CreateCondBr(mask, mask_then_bb, mask_done_bb); - builder.SetInsertPoint(mask_then_bb); - Value *result_then = builder.CreateLoad(ptr); - builder.CreateBr(mask_done_bb); - builder.SetInsertPoint(mask_done_bb); - Value *current_result = nullptr; - if(false_values){ - current_result = 
builder.CreatePHI(result_then->getType(), 2); - ((PHINode*)current_result)->addIncoming(result_then, mask_then_bb); - Value *result_false = false_values->get_value(idx); - if(result_then->getType()->isVectorTy()) - result_false = builder.CreateVectorSplat(vector_size, result_false); - ((PHINode*)current_result)->addIncoming(result_false, current_bb); + std::map, std::vector> fcs; + + result->for_each([&](indices_t idx){ + std::vector key(idx.size() - 2); + std::copy(idx.begin() + 2, idx.end(), key.begin()); + fcs[key].push_back(TC->get_value(idx)); + }); + + Type *fp32_ty = builder.getFloatTy(); + Type *fp16x2_ty = VectorType::get(builder.getHalfTy(), 2); + Type *fp32_pack8_ty = StructType::get(ctx, {fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}); + FunctionType *mma_ty = FunctionType::get(fp32_pack8_ty, {fp16x2_ty, fp16x2_ty, fp16x2_ty, fp16x2_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}, false); + + Value *offset_a_i = offset_a_i_; + Value *offset_a_k = offset_a_k_; + Value *offset_b_j = offset_b_j_; + Value *offset_b_k = offset_b_k_; + + Value* u_thread_id = tgt_->get_local_id(builder.GetInsertBlock()->getModule(), builder, 0); + if(dot->is_a_trans()){ + offset_a_i = builder.CreateAdd(offset_a_i, builder.CreateURem(u_thread_id, builder.getInt32(4))); + offset_a_k = builder.getInt32(0); + } + if(!dot->is_b_trans()){ + offset_b_j = builder.CreateAdd(offset_b_j, builder.CreateURem(u_thread_id, builder.getInt32(4))); + offset_b_k = builder.getInt32(0); + } + + std::string op_a = dot->is_a_trans() ? "row" : "col"; + std::string op_b = dot->is_b_trans() ? "row" : "col"; + + InlineAsm *mma_fn = InlineAsm::get(mma_ty, " mma.sync.aligned.m8n8k4." + op_a + "." + op_b + ".f32.f16.f16.f32 " + "{$0, $1, $2, $3, $4, $5, $6, $7}, " + "{$8, $9}, " + "{$10, $11}, " + "{$0, $1, $2, $3, $4, $5, $6, $7};", "=f,=f,=f,=f,=f,=f,=f,=f,r,r,r,r,0,1,2,3,4,5,6,7", false); + + unsigned fpw_0 = params_->get_param(dot, "fpw.d0")->get_value(); + unsigned fpw_1 = params_->get_param(dot, "fpw.d1")->get_value(); + unsigned wts_0 = fpw_0 * 8; + unsigned wts_1 = fpw_1 * 8; + unsigned wpt_0 = params_->get_param(dot, "wpt.d0")->get_value(); + unsigned wpt_1 = params_->get_param(dot, "wpt.d1")->get_value(); + unsigned stride_rep_i = wpt_0 * wts_0; + unsigned stride_rep_j = wpt_1 * wts_1; + unsigned num_rep_i = shapes[0]->get_value() / stride_rep_i; + unsigned ld_fc = num_rep_i * 2; + + + for(auto& x: fcs){ + std::vector& fc = x.second; + for(unsigned pack_i = 0; pack_i < num_packs_0_; pack_i++) + for(unsigned pack_j = 0; pack_j < num_packs_1_; pack_j++){ + for(unsigned K = 0; K < NK; K += 4){ + Value *_K = builder.getInt32(K); + Value *current_offset_a_i = builder.CreateAdd(offset_a_i, builder.getInt32(pack_i*stride_rep_i*pack_size_0_)); + Value *current_offset_b_i = builder.CreateAdd(offset_b_j, builder.getInt32(pack_j*stride_rep_j*pack_size_1_)); + indices_t idx_a = {current_offset_a_i, builder.CreateAdd(offset_a_k, _K)}; + indices_t idx_b = {current_offset_b_i, builder.CreateAdd(offset_b_k, _K)}; + if(dot->is_a_trans()) + std::swap(idx_a[0], idx_a[1]); + if(!dot->is_b_trans()) + std::swap(idx_b[0], idx_b[1]); + idx_a.insert(idx_a.end(), x.first.begin(), x.first.end()); + idx_b.insert(idx_b.end(), x.first.begin(), x.first.end()); + Value *ha = TA->get_value(idx_a); + Value *hb = TB->get_value(idx_b); + for(unsigned ii = 0; ii < pack_size_0_; ii++) + for(unsigned jj = 0; jj < pack_size_1_; jj++){ + Value *ha0 = builder.CreateBitCast(builder.CreateExtractElement(ha, 
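+          // each mma.sync m8n8k4 call consumes two packed f16x2 operands per
+          // input matrix and updates eight f32 accumulators in place, so the
+          // vector loads ha/hb are unpacked into half2 registers here.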
builder.getInt32(ii*pack_size_0_ + 0)), fp16x2_ty); + Value *ha1 = builder.CreateBitCast(builder.CreateExtractElement(ha, builder.getInt32(ii*pack_size_0_ + 1)), fp16x2_ty); + Value *hb0 = builder.CreateBitCast(builder.CreateExtractElement(hb, builder.getInt32(jj*pack_size_0_ + 0)), fp16x2_ty); + Value *hb1 = builder.CreateBitCast(builder.CreateExtractElement(hb, builder.getInt32(jj*pack_size_0_ + 1)), fp16x2_ty); + std::vector idx = { + (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 0)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 1)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 0)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 1)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 2)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 3)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 2)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 3)*ld_fc + }; + Value *nc = builder.CreateCall(mma_fn, {ha0, ha1, hb0, hb1, fc[idx[0]], fc[idx[1]], fc[idx[2]], fc[idx[3]], fc[idx[4]], fc[idx[5]], fc[idx[6]], fc[idx[7]]}); + fc[idx[0]] = builder.CreateExtractValue(nc, {0}); + fc[idx[1]] = builder.CreateExtractValue(nc, {1}); + fc[idx[2]] = builder.CreateExtractValue(nc, {2}); + fc[idx[3]] = builder.CreateExtractValue(nc, {3}); + fc[idx[4]] = builder.CreateExtractValue(nc, {4}); + fc[idx[5]] = builder.CreateExtractValue(nc, {5}); + fc[idx[6]] = builder.CreateExtractValue(nc, {6}); + fc[idx[7]] = builder.CreateExtractValue(nc, {7}); } - else - current_result = result_then; + } + } + } + + // write back + unsigned i = 0; + result->for_each([&](indices_t idx){ + std::vector key(idx.size() - 2); + std::copy(idx.begin() + 2, idx.end(), key.begin()); + if(i >= fcs.at(key).size()) + i = 0; + result->set_value(idx, fcs.at(key)[i++]); + }); + + TA->set_return_mode(false); + TB->set_return_mode(false); + } + } + else + { + distributed_tile *TA = (distributed_tile*)tmap_.at(A); + distributed_tile *TB = (distributed_tile*)tmap_.at(B); + result->for_each([&](indices_t idx){ + Value *res = TC->get_value(idx); + indices_t a_idx = {idx[0], builder.getInt32(0)}; + indices_t b_idx = {builder.getInt32(0), idx[1]}; + if(AT) + std::swap(a_idx[0], a_idx[1]); + if(BT) + std::swap(b_idx[0], b_idx[1]); + Value *a = TA->get_value(a_idx); + Value *b = TB->get_value(b_idx); + if(a->getType() != c_ty) + a = builder.CreateFPCast(a, c_ty); + if(b->getType() != c_ty) + b = builder.CreateFPCast(b, c_ty); + res = builder.CreateCall(f_mul_add, {a, b, res}); + result->set_value(idx, res); + }); + } +} + +void selection::lower_masked_load(ir::masked_load_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { + // find vector size + distributed_tile* result = (distributed_tile*)tmap_.at(x); + ir::value *ptr = x->get_pointer_operand(); + unsigned starting_multiple = axis_info_->get_starting_multiple(ptr); + unsigned max_contiguous = axis_info_->get_max_contiguous(ptr); + unsigned alignment = std::min(starting_multiple, max_contiguous); + unsigned vector_size = std::min(result->axis(0).contiguous, alignment); + distributed_tile *pointers = (distributed_tile*)tmap_.at(ptr); + distributed_tile *masks = (distributed_tile*)tmap_.at(x->get_mask_operand()); + distributed_tile *false_values = (distributed_tile*)tmap_.at(x->get_false_value_operand()); + std::map packets; + result->for_each([&](indices_t idx){ + 
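+    // one predicated vector load covers `vector_size` consecutive lanes: the
+    // load itself sits in mask_then, and mask_done merges it with the false
+    // value through a PHI whenever a false operand is present.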
unsigned linear = result->get_linear_index(idx); + unsigned id = linear / vector_size; + if(linear % vector_size == 0) { + Value *ptr = pointers->get_value(idx); + ConstantInt *cst = nullptr; + if(GetElementPtrInst *gep = dyn_cast(ptr)) + if(gep->getNumIndices() == 1){ + cst = dyn_cast(gep->idx_begin()); + } + + ptr = builder.CreateBitCast(ptr, PointerType::get(VectorType::get(result->get_ty(), vector_size), + ptr->getType()->getPointerAddressSpace())); + Value *mask = masks->get_value(idx); + BasicBlock *current_bb = builder.GetInsertBlock(); + BasicBlock *mask_then_bb = BasicBlock::Create(ctx, "mask_then", fn); + BasicBlock *mask_done_bb = BasicBlock::Create(ctx, "mask_done", fn); + builder.CreateCondBr(mask, mask_then_bb, mask_done_bb); + builder.SetInsertPoint(mask_then_bb); + Value *result_then = builder.CreateLoad(ptr); + builder.CreateBr(mask_done_bb); + builder.SetInsertPoint(mask_done_bb); + Value *current_result = nullptr; + if(false_values){ + current_result = builder.CreatePHI(result_then->getType(), 2); + ((PHINode*)current_result)->addIncoming(result_then, mask_then_bb); + Value *result_false = false_values->get_value(idx); + if(result_then->getType()->isVectorTy()) + result_false = builder.CreateVectorSplat(vector_size, result_false); + ((PHINode*)current_result)->addIncoming(result_false, current_bb); + } + else + current_result = result_then; // std::string offset = ""; // if(cst) @@ -1250,65 +1257,102 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & // InlineAsm *iasm = InlineAsm::get(ty, asm_str, "b,=r,=r,=r,=r,l", true); // Value *result = builder.CreateCall(iasm, {mask, ptr}); - packets[id] = current_result; - } - }); - // extract result element - result->for_each([&](indices_t idx){ - unsigned linear = result->get_linear_index(idx); - unsigned id = linear / vector_size; + packets[id] = current_result; + } + }); + // extract result element + result->for_each([&](indices_t idx){ + unsigned linear = result->get_linear_index(idx); + unsigned id = linear / vector_size; // Value *tmp = builder.CreateExtractValue(packets.at(id), {(linear % vector_size) / 2}); // Value *res = builder.CreateExtractElement(tmp, (linear % vector_size) % 2); // result->set_value(idx, res); - result->set_value(idx, builder.CreateExtractElement(packets.at(id), linear % vector_size)); - }); + result->set_value(idx, builder.CreateExtractElement(packets.at(id), linear % vector_size)); + }); +} + +void selection::lower_load(ir::load_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { + distributed_tile* result = (distributed_tile*)tmap_.at(x); + // find vector size + ir::value *ptr = x->get_pointer_operand(); + unsigned starting_multiple = axis_info_->get_starting_multiple(ptr); + unsigned max_contiguous = axis_info_->get_max_contiguous(ptr); + unsigned alignment = std::min(starting_multiple, max_contiguous); + unsigned vector_size = std::min(result->axis(0).contiguous, alignment); + distributed_tile *pointers = (distributed_tile*)tmap_.at(ptr); + // vector loads + std::map packets; + result->for_each([&](indices_t idx){ + unsigned linear = result->get_linear_index(idx); + unsigned id = linear / vector_size; + if(linear % vector_size == 0) { + Value *ptr = pointers->get_value(idx); + ConstantInt *cst = nullptr; + if(GetElementPtrInst *gep = dyn_cast(ptr)) + if(gep->getNumIndices() == 1) + cst = dyn_cast(gep->idx_begin()); + ptr = builder.CreateBitCast(ptr, PointerType::get(VectorType::get(result->get_ty(), vector_size), + 
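+      // cast the scalar pointer to a vector pointer so that `vector_size`
+      // contiguous elements load at once; vector_size is capped by the proven
+      // alignment (e.g. alignment 4 across 8 contiguous lanes -> two 4-wide loads).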
ptr->getType()->getPointerAddressSpace())); + packets[id] = builder.CreateLoad(ptr); } - else if(auto *ld = dynamic_cast(ins)){ - // find vector size - ir::value *ptr = ld->get_pointer_operand(); - unsigned starting_multiple = axis_info_->get_starting_multiple(ptr); - unsigned max_contiguous = axis_info_->get_max_contiguous(ptr); - unsigned alignment = std::min(starting_multiple, max_contiguous); - unsigned vector_size = std::min(result->axis(0).contiguous, alignment); - distributed_tile *pointers = (distributed_tile*)tmap_.at(ptr); - // vector loads - std::map packets; - result->for_each([&](indices_t idx){ - unsigned linear = result->get_linear_index(idx); - unsigned id = linear / vector_size; - if(linear % vector_size == 0) { - Value *ptr = pointers->get_value(idx); - ConstantInt *cst = nullptr; - if(GetElementPtrInst *gep = dyn_cast(ptr)) - if(gep->getNumIndices() == 1) - cst = dyn_cast(gep->idx_begin()); - ptr = builder.CreateBitCast(ptr, PointerType::get(VectorType::get(result->get_ty(), vector_size), - ptr->getType()->getPointerAddressSpace())); - packets[id] = builder.CreateLoad(ptr); - } - }); - // extract result element - result->for_each([&](indices_t idx){ - unsigned linear = result->get_linear_index(idx); - unsigned id = linear / vector_size; - result->set_value(idx, builder.CreateExtractElement(packets.at(id), linear % vector_size)); - }); - } - // element-wise - else { - result->for_each([&](indices_t idx){ - auto value = [&](ir::value *x) { - if(auto *cst = dynamic_cast(x)) - return (Value*)llvm_constant(cst, ctx); - else if(x->get_type()->is_tile_ty()) - return tmap_.at(x)->get_value(idx); - else - return llvm_value(x, builder); - }; - result->set_value(idx, llvm_inst(ins, value, builder)); - }); - } - } + }); + // extract result element + result->for_each([&](indices_t idx){ + unsigned linear = result->get_linear_index(idx); + unsigned id = linear / vector_size; + result->set_value(idx, builder.CreateExtractElement(packets.at(id), linear % vector_size)); + }); +} + +void selection::lower_elementwise(ir::instruction *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { + distributed_tile* result = (distributed_tile*)tmap_.at(x); + result->for_each([&](indices_t idx){ + auto value = [&](ir::value *v) { + if(auto *cst = dynamic_cast(v)) + return (Value*)llvm_constant(cst, ctx); + else if(v->get_type()->is_tile_ty()) + return tmap_.at(v)->get_value(idx); + else + return llvm_value(v, builder); + }; + result->set_value(idx, llvm_inst(x, value, builder)); + }); +} + +void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> &builder) { + BasicBlock *block = builder.GetInsertBlock(); + LLVMContext &ctx = builder.getContext(); + Function *fn = block->getParent(); + if(auto *x = dynamic_cast(ins)) + lower_masked_store(x, ctx, fn, builder); + else if(auto *x = dynamic_cast(ins)) + lower_store(x, ctx, fn, builder); + else if(auto *x = dynamic_cast(ins)) + lower_downcast(x, ctx, fn, builder); + else if(auto *x = dynamic_cast(ins)) + lower_reduce(x, ctx, fn, builder); + else if(auto *x = dynamic_cast(ins)) + lower_dynamic_range_idx(x, ctx, fn, builder); + else if(auto *x = dynamic_cast(ins)) + lower_reshape(x, ctx, fn, builder); + else if(auto *x = dynamic_cast(ins)) + lower_splat(x, ctx, fn, builder); + else if(auto *x = dynamic_cast(ins)) + lower_broadcast(x, ctx, fn, builder); + else if(auto *x = dynamic_cast(ins)) + lower_vectorize(x, ctx, fn, builder); + else if(auto *x = dynamic_cast(ins)) + lower_copy_to_shared(x, ctx, fn, builder); + else if(auto* x 
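+  // single dispatch point for tile-level lowering: each dynamic_cast in this
+  // chain targets the ir:: instruction type named by the lower_* routine it
+  // guards, and anything left that is not a shared-memory value is lowered
+  // element-wise.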
= dynamic_cast(ins)) + lower_trans(x, ctx, fn, builder); + else if(auto x = dynamic_cast(ins)) + lower_dot(x, ctx, fn, builder); + else if(auto *x = dynamic_cast(ins)) + lower_masked_load(x, ctx, fn, builder); + else if(auto *x = dynamic_cast(ins)) + lower_load(x, ctx, fn, builder); + else if(!buffer_info_->is_shared(ins)) + lower_elementwise(ins, ctx, fn, builder); } void selection::lower_instruction(ir::instruction *src, IRBuilder<> &builder) { diff --git a/lib/codegen/transform/dot.cpp b/lib/codegen/transform/dot.cpp deleted file mode 100644 index fa1a542f0..000000000 --- a/lib/codegen/transform/dot.cpp +++ /dev/null @@ -1,113 +0,0 @@ -#include "triton/ir/function.h" -#include "triton/ir/basic_block.h" -#include "triton/ir/module.h" -#include "triton/codegen/transform/dot.h" -#include "triton/codegen/analysis/tune.h" - -namespace triton { -namespace codegen{ -namespace transform{ - -inline bool is_trans(ir::value *v){ - auto *x = dynamic_cast(v); - if(!x) - return false; - std::vector perm = x->get_perm(); - std::vector ref; - ir::type *int32_ty = ir::type::get_int32_ty(v->get_type()->get_context()); - for(size_t i = 0; i < perm.size(); i++) - ref.push_back(ir::constant_int::get(int32_ty, i)); - std::swap(ref[0], ref[1]); - // true is perm == ref - return std::equal(perm.begin(), perm.end(), ref.begin()); -} - -inline bool is_hmma(ir::value *v){ - bool result = false; - if(auto *x = dynamic_cast(v)){ - ir::value *a = x->get_operand(0); - ir::type *a_ty = a->get_type(); - ir::value *b = x->get_operand(1); - ir::type *b_ty = b->get_type(); - // inputs have to be FP16 - result = a_ty->get_scalar_ty()->is_half_ty() && b_ty->get_scalar_ty()->is_half_ty(); -// reduction has to be multiple of 4 -// result = result && ((a_ty->get_tile_shapes()[1]->get_value() % 4) == 0); - } - return result; -} - -void optimize_dot::run(ir::module &mod) { - ir::builder &builder = mod.get_builder(); - // iterate - for(ir::function *fn: mod.get_function_list()) - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *i: block->get_inst_list()) - if(auto dot = dynamic_cast(i)){ - builder.set_insert_point(i); - ir::value *A = dot->get_operand(0); - ir::value *B = dot->get_operand(1); - ir::value *D = dot->get_operand(2); - bool trans_a = is_trans(A); - bool trans_b = is_trans(B); - - if(!dot->is_a_trans() && !dot->is_b_trans()){ - if(is_hmma(dot)){ - ir::value *AA = A; - ir::value *BB = B; - if(trans_a){ - AA = ((ir::trans_inst*)A)->get_operand(0); - } - else{ - if(auto *T = dynamic_cast(A)){ - std::vector perm(T->get_perm()); - std::swap(perm[0], perm[1]); - AA = builder.create_trans(T->get_operand(0), perm); - T->replace_all_uses_with(AA); - trans_a = true; - } - } - if(trans_b){ - BB = ((ir::trans_inst*)B)->get_operand(0); - } - else{ -// if(auto *T = dynamic_cast(A)){ -// std::vector perm(T->get_perm()); -// std::swap(perm[0], perm[1]); -// AA = builder.create_trans(T->get_operand(0), perm); -// T->replace_all_uses_with(AA); -// trans_a = true; -// } - } - ir::instruction *dot_atbt = builder.insert(ir::dot_inst::create(AA, BB, D, trans_a, trans_b)); - dot->replace_all_uses_with(dot_atbt); - } - else{ - // dot(op(a), trans(b)) - if(trans_b){ - ir::value* BB = ((ir::trans_inst*)B)->get_operand(0); - ir::instruction *NT = builder.insert(ir::dot_inst::create_nt(A, BB, D)); - dot->replace_all_uses_with(NT); - } - // dot(op(a), b) - if(!trans_b){ - // create permutations - size_t size = B->get_type()->get_tile_shapes().size(); - std::vector perm(size); - ir::type *int32_ty = 
ir::type::get_int32_ty(B->get_type()->get_context()); - for(size_t i = 0; i < size; i++) - perm[i] = ir::constant_int::get(int32_ty, i); - std::swap(perm[0], perm[1]); - // replace NN -> NT (trans) - ir::value* BB = builder.create_trans(B, perm); - ir::instruction *NT = builder.insert(ir::dot_inst::create_nt(A, BB, D)); - dot->replace_all_uses_with(NT); - } - } - } - } -} - -} -} -} diff --git a/lib/codegen/transform/trans.cpp b/lib/codegen/transform/peephole.cpp similarity index 59% rename from lib/codegen/transform/trans.cpp rename to lib/codegen/transform/peephole.cpp index 946fbb0fd..6140e686c 100644 --- a/lib/codegen/transform/trans.cpp +++ b/lib/codegen/transform/peephole.cpp @@ -1,6 +1,6 @@ #include "triton/ir/module.h" #include "triton/ir/function.h" -#include "triton/codegen/transform/trans.h" +#include "triton/codegen/transform/peephole.h" namespace triton { namespace codegen{ @@ -70,84 +70,96 @@ bool peephole::rewrite_trans_phi(ir::instruction* value, ir::builder& builder) { if(users.size() > 1 || ops.size() > 1) return false; ir::value* op = *ops.begin(); + // trans(phi) -> phi(trans(), trans()...) auto* phi = dynamic_cast(op); if(!phi) return false; - ir::value* new_phi = rewrite_trans_phi_impl(op, builder, trans->get_perm()); + ir::value* new_phi = rewrite_trans_phi_impl(phi, builder, trans->get_perm()); trans->replace_all_uses_with(new_phi); + return true; } -bool peephole::rewrite_dot(ir::instruction *value, ir::builder& builder){ - if(auto dot = dynamic_cast(value)){ - builder.set_insert_point(value); - ir::value *A = dot->get_operand(0); - ir::value *B = dot->get_operand(1); - ir::value *D = dot->get_operand(2); - bool trans_a = is_trans(A); - bool trans_b = is_trans(B); - // NN - if(!dot->is_a_trans() && !dot->is_b_trans()){ - if(is_hmma(dot)) { - ir::value *AA = A; - ir::value *BB = B; - if(trans_a){ - AA = ((ir::trans_inst*)A)->get_operand(0); - } - else{ - if(auto *T = dynamic_cast(A)){ - std::vector perm(T->get_perm()); - std::swap(perm[0], perm[1]); - AA = builder.create_trans(T->get_operand(0), perm); - T->replace_all_uses_with(AA); - trans_a = true; - } - } - if(trans_b){ - BB = ((ir::trans_inst*)B)->get_operand(0); - } - else{ - if(auto *T = dynamic_cast(A)){ - std::vector perm(T->get_perm()); - std::swap(perm[0], perm[1]); - AA = builder.create_trans(T->get_operand(0), perm); - T->replace_all_uses_with(AA); - trans_a = true; - } - } - ir::instruction *dot_atbt = builder.insert(ir::dot_inst::create(AA, BB, D, trans_a, trans_b)); - dot->replace_all_uses_with(dot_atbt); - return true; - } - else{ - // dot(op(a), trans(b)) - if(trans_b){ - ir::value* BB = ((ir::trans_inst*)B)->get_operand(0); - ir::instruction *NT = builder.insert(ir::dot_inst::create_nt(A, BB, D)); - dot->replace_all_uses_with(NT); - return true; - } - // dot(op(a), b) - if(!trans_b){ - // create permutations - size_t size = B->get_type()->get_tile_shapes().size(); - std::vector perm(size); - ir::type *int32_ty = ir::type::get_int32_ty(B->get_type()->get_context()); - for(size_t i = 0; i < size; i++) - perm[i] = ir::constant_int::get(int32_ty, i); - std::swap(perm[0], perm[1]); - // replace NN -> NT (trans) - ir::value* BB = builder.create_trans(B, perm); - ir::instruction *NT = builder.insert(ir::dot_inst::create_nt(A, BB, D)); - dot->replace_all_uses_with(NT); - return true; - } - } +bool peephole::rewrite_dot_hmma(ir::dot_inst *dot, ir::builder& builder, bool trans_a, bool trans_b, + ir::value *A, ir::value *B, ir::value *D){ + ir::value *AA = A; + ir::value *BB = B; + if(trans_a){ + AA = 
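+  // HMMA takes layout flags rather than explicit trans instructions: strip a
+  // trans feeding A, or synthesize one with a swapped permutation, and fold the
+  // result into trans_a. The mirrored B branch below still inspects A, which
+  // appears to be a copy-paste slip inherited from the old optimize_dot pass.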
((ir::trans_inst*)A)->get_operand(0); + } + else{ + if(auto *T = dynamic_cast(A)){ + std::vector perm(T->get_perm()); + std::swap(perm[0], perm[1]); + AA = builder.create_trans(T->get_operand(0), perm); + T->replace_all_uses_with(AA); + trans_a = true; } } + if(trans_b){ + BB = ((ir::trans_inst*)B)->get_operand(0); + } + else{ + if(auto *T = dynamic_cast(A)){ + std::vector perm(T->get_perm()); + std::swap(perm[0], perm[1]); + AA = builder.create_trans(T->get_operand(0), perm); + T->replace_all_uses_with(AA); + trans_a = true; + } + } + ir::instruction *dot_atbt = builder.insert(ir::dot_inst::create(AA, BB, D, trans_a, trans_b)); + dot->replace_all_uses_with(dot_atbt); + return true; +} + +bool peephole::rewrite_dot_fp32(ir::dot_inst *dot, ir::builder& builder, bool trans_a, bool trans_b, + ir::value *A, ir::value *B, ir::value *D){ + // dot(op(a), trans(b)) + if(trans_b){ + ir::value* BB = ((ir::trans_inst*)B)->get_operand(0); + ir::instruction *NT = builder.insert(ir::dot_inst::create_nt(A, BB, D)); + dot->replace_all_uses_with(NT); + return true; + } + // dot(op(a), b) + if(!trans_b){ + // create permutations + size_t size = B->get_type()->get_tile_shapes().size(); + std::vector perm(size); + ir::type *int32_ty = ir::type::get_int32_ty(B->get_type()->get_context()); + for(size_t i = 0; i < size; i++) + perm[i] = ir::constant_int::get(int32_ty, i); + std::swap(perm[0], perm[1]); + // replace NN -> NT (trans) + ir::value* BB = builder.create_trans(B, perm); + ir::instruction *NT = builder.insert(ir::dot_inst::create_nt(A, BB, D)); + dot->replace_all_uses_with(NT); + return true; + } return false; } +bool peephole::rewrite_dot(ir::instruction *value, ir::builder& builder){ + auto dot = dynamic_cast(value); + if(!dot) + return false; + builder.set_insert_point(value); + ir::value *A = dot->get_operand(0); + ir::value *B = dot->get_operand(1); + ir::value *D = dot->get_operand(2); + bool trans_a = is_trans(A); + bool trans_b = is_trans(B); + // only consider dot-nn + if(dot->is_a_trans() || dot->is_b_trans()) + return false; + // hmma + if(is_hmma(dot)) + return rewrite_dot_hmma(dot, builder, trans_a, trans_b, A, B, D); + else + return rewrite_dot_fp32(dot, builder, trans_a, trans_b, A, B, D); +} + bool peephole::rewrite_unit_red(ir::instruction *value, ir::builder& builder){ auto x = dynamic_cast(value); if(!x) @@ -190,28 +202,40 @@ bool peephole::rewrite_gep_ptr_min_off_plus_off(ir::instruction *value, ir::buil void peephole::run(ir::module &mod) { ir::builder &builder = mod.get_builder(); // keep track of whether any modification was made - bool was_modified = false; + std::set seen; + size_t n_seen; // rewrite dots first do{ - was_modified = false; - for(ir::function *fn: mod.get_function_list()) - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction* i: block->get_inst_list()) - rewrite_dot(i, builder); - }while(was_modified); - - // rewrite other ops - do{ - was_modified = false; + n_seen = seen.size(); for(ir::function *fn: mod.get_function_list()) for(ir::basic_block *block: fn->blocks()) for(ir::instruction* i: block->get_inst_list()){ + if(seen.find(i) != seen.end()) + continue; + bool was_modified = rewrite_dot(i, builder); + if(was_modified) + seen.insert(i); + } + }while(seen.size() != n_seen); + + // rewrite other ops + seen.clear(); + do{ + n_seen = seen.size(); + for(ir::function *fn: mod.get_function_list()) + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction* i: block->get_inst_list()){ + if(seen.find(i) != seen.end()) + continue; + bool was_modified 
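+        // fixed-point iteration without revisits: `seen` records instructions
+        // already rewritten, and each do/while stops once a full sweep grows
+        // `seen` by nothing (seen.size() == n_seen).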
= false; was_modified = was_modified || rewrite_trans_phi(i, builder); was_modified = was_modified || rewrite_unit_red(i, builder); was_modified = was_modified || rewrite_gep_ptr_min_off_plus_off(i, builder); + if(was_modified) + seen.insert(i); } - }while(was_modified); + }while(seen.size() != n_seen); } } From b8cd63e0dac6f4ff2aceb53278fa70c66659d359 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 12 Aug 2019 21:48:30 -0700 Subject: [PATCH 293/494] [codegen] separated lower_dot_inst into lower_outer_dot || lower_hmma_dot || lower_scanline_dot --- include/triton/codegen/selection/selection.h | 13 +- lib/codegen/selection/selection.cpp | 369 ++++++++++--------- 2 files changed, 198 insertions(+), 184 deletions(-) diff --git a/include/triton/codegen/selection/selection.h b/include/triton/codegen/selection/selection.h index 5c5c6ae2e..3191fd60f 100644 --- a/include/triton/codegen/selection/selection.h +++ b/include/triton/codegen/selection/selection.h @@ -165,11 +165,20 @@ private: void lower_vectorize(ir::vectorize_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); void lower_copy_to_shared(ir::copy_to_shared_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); void lower_trans(ir::trans_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); - void lower_hmma_dot(ir::dot_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); - void lower_scalar_dot(ir::dot_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); + // matrix multiply + void lower_hmma_dot(ir::dot_inst *x, LLVMContext &ctx, Function *fn, Builder &builder, + distributed_tile *TC, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK); + void lower_scanline_dot(ir::dot_inst *x, LLVMContext &ctx, Function *fn, Builder &builder, + distributed_tile *TC, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK, + Type *c_ty, Function *f_mul_add); + void lower_outer_dot(ir::dot_inst *x, LLVMContext &ctx, Function *fn, Builder &builder, + distributed_tile *TC, distributed_tile *TA, distributed_tile *TB, distributed_tile *TD, + Type *c_ty, Function *f_mul_add); void lower_dot(ir::dot_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); + // load void lower_masked_load(ir::masked_load_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); void lower_load(ir::load_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); + // element-wise void lower_elementwise(ir::instruction *x, LLVMContext &ctx, Function *fn, Builder &builder); void lower_tile_instruction(ir::instruction *src, Builder &builder); diff --git a/lib/codegen/selection/selection.cpp b/lib/codegen/selection/selection.cpp index 96b25539a..81ed60e70 100644 --- a/lib/codegen/selection/selection.cpp +++ b/lib/codegen/selection/selection.cpp @@ -999,194 +999,148 @@ void selection::lower_trans(ir::trans_inst *x, LLVMContext &ctx, Function *fn, I }); } -void selection::lower_hmma_dot(ir::dot_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { +void selection::lower_hmma_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn, IRBuilder<> &builder, + distributed_tile *TC, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK) { -} - -void selection::lower_scalar_dot(ir::dot_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { - -} - -void selection::lower_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { const auto& shapes = dot->get_type()->get_tile_shapes(); - distributed_tile* result = (distributed_tile*)tmap_.at(dot); - Module *module = 
fn->getParent(); - ir::value *A = dot->get_operand(0); - ir::value *B = dot->get_operand(1); - ir::value *C = dot->get_operand(2); - bool AT = dot->is_a_trans(); - bool BT = dot->is_b_trans(); - distributed_tile *TC = (distributed_tile*)tmap_.at(C); - Type *c_ty = llvm_type(C->get_type()->get_scalar_ty(), ctx); - Function *f_mul_add = Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {c_ty}); - auto A_shapes = A->get_type()->get_tile_shapes(); - size_t red_axis = dot->is_a_trans() ? 0 : 1; - unsigned NK = A_shapes[red_axis]->get_value(); -// std::cout << red_axis << " " << NK << std::endl; - if(NK != 1) - { - shared_tile *TA = (shared_tile*)tmap_.at(A); - shared_tile *TB = (shared_tile*)tmap_.at(B); - if(params_->get_fragment(dot, 0) == analysis::tune::STRIDED_SCAN) { - TA->set_vector_size(TC->axis(0).contiguous); - TB->set_vector_size(TC->axis(1).contiguous); - result->for_each([&](indices_t idx){ - Value *res = TC->get_value(idx); - for(unsigned K = 0; K < NK; ++K){ - // input indices - indices_t a_idx = {idx[0], builder.getInt32(K)}; - indices_t b_idx = {builder.getInt32(K), idx[1]}; - if(AT) - std::swap(a_idx[0], a_idx[1]); - if(BT) - std::swap(b_idx[0], b_idx[1]); - // add batching dimension - for(size_t i = 2; i < idx.size(); i++){ - a_idx.insert(a_idx.end(), idx[i]); - b_idx.insert(b_idx.end(), idx[i]); - } - // load value - Value *a = TA->get_value(a_idx); - Value *b = TB->get_value(b_idx); - if(a->getType() != c_ty) - a = builder.CreateFPCast(a, c_ty); - if(b->getType() != c_ty) - b = builder.CreateFPCast(b, c_ty); - res = builder.CreateCall(f_mul_add, {a, b, res}); - } - result->set_value(idx, res); - }); + TA->set_vector_size(4*pack_size_0_); + TB->set_vector_size(4*pack_size_1_); + TA->set_return_mode(true); + TB->set_return_mode(true); + + std::map, std::vector> fcs; + + TC->for_each([&](indices_t idx){ + std::vector key(idx.size() - 2); + std::copy(idx.begin() + 2, idx.end(), key.begin()); + fcs[key].push_back(TD->get_value(idx)); + }); + + Type *fp32_ty = builder.getFloatTy(); + Type *fp16x2_ty = VectorType::get(builder.getHalfTy(), 2); + Type *fp32_pack8_ty = StructType::get(ctx, {fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}); + FunctionType *mma_ty = FunctionType::get(fp32_pack8_ty, {fp16x2_ty, fp16x2_ty, fp16x2_ty, fp16x2_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}, false); + + Value *offset_a_i = offset_a_i_; + Value *offset_a_k = offset_a_k_; + Value *offset_b_j = offset_b_j_; + Value *offset_b_k = offset_b_k_; + + Value* u_thread_id = tgt_->get_local_id(builder.GetInsertBlock()->getModule(), builder, 0); + if(dot->is_a_trans()){ + offset_a_i = builder.CreateAdd(offset_a_i, builder.CreateURem(u_thread_id, builder.getInt32(4))); + offset_a_k = builder.getInt32(0); + } + if(!dot->is_b_trans()){ + offset_b_j = builder.CreateAdd(offset_b_j, builder.CreateURem(u_thread_id, builder.getInt32(4))); + offset_b_k = builder.getInt32(0); + } + + std::string op_a = dot->is_a_trans() ? "row" : "col"; + std::string op_b = dot->is_b_trans() ? "row" : "col"; + + InlineAsm *mma_fn = InlineAsm::get(mma_ty, " mma.sync.aligned.m8n8k4." + op_a + "." 
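+  // Volta-style m8n8k4 HMMA: the row/col suffixes come from the transpose
+  // flags, and the tied "0".."7" constraints make the eight f32 accumulators
+  // read-modify-write across the call.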
+ op_b + ".f32.f16.f16.f32 " + "{$0, $1, $2, $3, $4, $5, $6, $7}, " + "{$8, $9}, " + "{$10, $11}, " + "{$0, $1, $2, $3, $4, $5, $6, $7};", "=f,=f,=f,=f,=f,=f,=f,=f,r,r,r,r,0,1,2,3,4,5,6,7", false); + + unsigned fpw_0 = params_->get_param(dot, "fpw.d0")->get_value(); + unsigned fpw_1 = params_->get_param(dot, "fpw.d1")->get_value(); + unsigned wts_0 = fpw_0 * 8; + unsigned wts_1 = fpw_1 * 8; + unsigned wpt_0 = params_->get_param(dot, "wpt.d0")->get_value(); + unsigned wpt_1 = params_->get_param(dot, "wpt.d1")->get_value(); + unsigned stride_rep_i = wpt_0 * wts_0; + unsigned stride_rep_j = wpt_1 * wts_1; + unsigned num_rep_i = shapes[0]->get_value() / stride_rep_i; + unsigned ld_fc = num_rep_i * 2; + + + for(auto& x: fcs){ + std::vector& fc = x.second; + for(unsigned pack_i = 0; pack_i < num_packs_0_; pack_i++) + for(unsigned pack_j = 0; pack_j < num_packs_1_; pack_j++){ + for(unsigned K = 0; K < NK; K += 4){ + Value *_K = builder.getInt32(K); + Value *current_offset_a_i = builder.CreateAdd(offset_a_i, builder.getInt32(pack_i*stride_rep_i*pack_size_0_)); + Value *current_offset_b_i = builder.CreateAdd(offset_b_j, builder.getInt32(pack_j*stride_rep_j*pack_size_1_)); + indices_t idx_a = {current_offset_a_i, builder.CreateAdd(offset_a_k, _K)}; + indices_t idx_b = {current_offset_b_i, builder.CreateAdd(offset_b_k, _K)}; + if(dot->is_a_trans()) + std::swap(idx_a[0], idx_a[1]); + if(!dot->is_b_trans()) + std::swap(idx_b[0], idx_b[1]); + idx_a.insert(idx_a.end(), x.first.begin(), x.first.end()); + idx_b.insert(idx_b.end(), x.first.begin(), x.first.end()); + Value *ha = TA->get_value(idx_a); + Value *hb = TB->get_value(idx_b); + for(unsigned ii = 0; ii < pack_size_0_; ii++) + for(unsigned jj = 0; jj < pack_size_1_; jj++){ + Value *ha0 = builder.CreateBitCast(builder.CreateExtractElement(ha, builder.getInt32(ii*pack_size_0_ + 0)), fp16x2_ty); + Value *ha1 = builder.CreateBitCast(builder.CreateExtractElement(ha, builder.getInt32(ii*pack_size_0_ + 1)), fp16x2_ty); + Value *hb0 = builder.CreateBitCast(builder.CreateExtractElement(hb, builder.getInt32(jj*pack_size_0_ + 0)), fp16x2_ty); + Value *hb1 = builder.CreateBitCast(builder.CreateExtractElement(hb, builder.getInt32(jj*pack_size_0_ + 1)), fp16x2_ty); + std::vector idx = { + (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 0)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 1)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 0)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 1)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 2)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 3)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 2)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 3)*ld_fc + }; + Value *nc = builder.CreateCall(mma_fn, {ha0, ha1, hb0, hb1, fc[idx[0]], fc[idx[1]], fc[idx[2]], fc[idx[3]], fc[idx[4]], fc[idx[5]], fc[idx[6]], fc[idx[7]]}); + fc[idx[0]] = builder.CreateExtractValue(nc, {0}); + fc[idx[1]] = builder.CreateExtractValue(nc, {1}); + fc[idx[2]] = builder.CreateExtractValue(nc, {2}); + fc[idx[3]] = builder.CreateExtractValue(nc, {3}); + fc[idx[4]] = builder.CreateExtractValue(nc, {4}); + fc[idx[5]] = builder.CreateExtractValue(nc, {5}); + fc[idx[6]] = builder.CreateExtractValue(nc, {6}); + fc[idx[7]] = builder.CreateExtractValue(nc, {7}); + } } - else { - TA->set_vector_size(4*pack_size_0_); - 
TB->set_vector_size(4*pack_size_1_); - TA->set_return_mode(true); - TB->set_return_mode(true); - - std::map, std::vector> fcs; - - result->for_each([&](indices_t idx){ - std::vector key(idx.size() - 2); - std::copy(idx.begin() + 2, idx.end(), key.begin()); - fcs[key].push_back(TC->get_value(idx)); - }); - - Type *fp32_ty = builder.getFloatTy(); - Type *fp16x2_ty = VectorType::get(builder.getHalfTy(), 2); - Type *fp32_pack8_ty = StructType::get(ctx, {fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}); - FunctionType *mma_ty = FunctionType::get(fp32_pack8_ty, {fp16x2_ty, fp16x2_ty, fp16x2_ty, fp16x2_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}, false); - - Value *offset_a_i = offset_a_i_; - Value *offset_a_k = offset_a_k_; - Value *offset_b_j = offset_b_j_; - Value *offset_b_k = offset_b_k_; - - Value* u_thread_id = tgt_->get_local_id(builder.GetInsertBlock()->getModule(), builder, 0); - if(dot->is_a_trans()){ - offset_a_i = builder.CreateAdd(offset_a_i, builder.CreateURem(u_thread_id, builder.getInt32(4))); - offset_a_k = builder.getInt32(0); - } - if(!dot->is_b_trans()){ - offset_b_j = builder.CreateAdd(offset_b_j, builder.CreateURem(u_thread_id, builder.getInt32(4))); - offset_b_k = builder.getInt32(0); - } - - std::string op_a = dot->is_a_trans() ? "row" : "col"; - std::string op_b = dot->is_b_trans() ? "row" : "col"; - - InlineAsm *mma_fn = InlineAsm::get(mma_ty, " mma.sync.aligned.m8n8k4." + op_a + "." + op_b + ".f32.f16.f16.f32 " - "{$0, $1, $2, $3, $4, $5, $6, $7}, " - "{$8, $9}, " - "{$10, $11}, " - "{$0, $1, $2, $3, $4, $5, $6, $7};", "=f,=f,=f,=f,=f,=f,=f,=f,r,r,r,r,0,1,2,3,4,5,6,7", false); - - unsigned fpw_0 = params_->get_param(dot, "fpw.d0")->get_value(); - unsigned fpw_1 = params_->get_param(dot, "fpw.d1")->get_value(); - unsigned wts_0 = fpw_0 * 8; - unsigned wts_1 = fpw_1 * 8; - unsigned wpt_0 = params_->get_param(dot, "wpt.d0")->get_value(); - unsigned wpt_1 = params_->get_param(dot, "wpt.d1")->get_value(); - unsigned stride_rep_i = wpt_0 * wts_0; - unsigned stride_rep_j = wpt_1 * wts_1; - unsigned num_rep_i = shapes[0]->get_value() / stride_rep_i; - unsigned ld_fc = num_rep_i * 2; - - - for(auto& x: fcs){ - std::vector& fc = x.second; - for(unsigned pack_i = 0; pack_i < num_packs_0_; pack_i++) - for(unsigned pack_j = 0; pack_j < num_packs_1_; pack_j++){ - for(unsigned K = 0; K < NK; K += 4){ - Value *_K = builder.getInt32(K); - Value *current_offset_a_i = builder.CreateAdd(offset_a_i, builder.getInt32(pack_i*stride_rep_i*pack_size_0_)); - Value *current_offset_b_i = builder.CreateAdd(offset_b_j, builder.getInt32(pack_j*stride_rep_j*pack_size_1_)); - indices_t idx_a = {current_offset_a_i, builder.CreateAdd(offset_a_k, _K)}; - indices_t idx_b = {current_offset_b_i, builder.CreateAdd(offset_b_k, _K)}; - if(dot->is_a_trans()) - std::swap(idx_a[0], idx_a[1]); - if(!dot->is_b_trans()) - std::swap(idx_b[0], idx_b[1]); - idx_a.insert(idx_a.end(), x.first.begin(), x.first.end()); - idx_b.insert(idx_b.end(), x.first.begin(), x.first.end()); - Value *ha = TA->get_value(idx_a); - Value *hb = TB->get_value(idx_b); - for(unsigned ii = 0; ii < pack_size_0_; ii++) - for(unsigned jj = 0; jj < pack_size_1_; jj++){ - Value *ha0 = builder.CreateBitCast(builder.CreateExtractElement(ha, builder.getInt32(ii*pack_size_0_ + 0)), fp16x2_ty); - Value *ha1 = builder.CreateBitCast(builder.CreateExtractElement(ha, builder.getInt32(ii*pack_size_0_ + 1)), fp16x2_ty); - Value *hb0 = builder.CreateBitCast(builder.CreateExtractElement(hb, 
builder.getInt32(jj*pack_size_0_ + 0)), fp16x2_ty); - Value *hb1 = builder.CreateBitCast(builder.CreateExtractElement(hb, builder.getInt32(jj*pack_size_0_ + 1)), fp16x2_ty); - std::vector idx = { - (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 0)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 1)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 0)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 1)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 2)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 3)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 2)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 3)*ld_fc - }; - Value *nc = builder.CreateCall(mma_fn, {ha0, ha1, hb0, hb1, fc[idx[0]], fc[idx[1]], fc[idx[2]], fc[idx[3]], fc[idx[4]], fc[idx[5]], fc[idx[6]], fc[idx[7]]}); - fc[idx[0]] = builder.CreateExtractValue(nc, {0}); - fc[idx[1]] = builder.CreateExtractValue(nc, {1}); - fc[idx[2]] = builder.CreateExtractValue(nc, {2}); - fc[idx[3]] = builder.CreateExtractValue(nc, {3}); - fc[idx[4]] = builder.CreateExtractValue(nc, {4}); - fc[idx[5]] = builder.CreateExtractValue(nc, {5}); - fc[idx[6]] = builder.CreateExtractValue(nc, {6}); - fc[idx[7]] = builder.CreateExtractValue(nc, {7}); - } - } - } - } - - // write back - unsigned i = 0; - result->for_each([&](indices_t idx){ - std::vector key(idx.size() - 2); - std::copy(idx.begin() + 2, idx.end(), key.begin()); - if(i >= fcs.at(key).size()) - i = 0; - result->set_value(idx, fcs.at(key)[i++]); - }); - - TA->set_return_mode(false); - TB->set_return_mode(false); } } - else - { - distributed_tile *TA = (distributed_tile*)tmap_.at(A); - distributed_tile *TB = (distributed_tile*)tmap_.at(B); - result->for_each([&](indices_t idx){ - Value *res = TC->get_value(idx); - indices_t a_idx = {idx[0], builder.getInt32(0)}; - indices_t b_idx = {builder.getInt32(0), idx[1]}; - if(AT) + + // write back + unsigned i = 0; + TC->for_each([&](indices_t idx){ + std::vector key(idx.size() - 2); + std::copy(idx.begin() + 2, idx.end(), key.begin()); + if(i >= fcs.at(key).size()) + i = 0; + TC->set_value(idx, fcs.at(key)[i++]); + }); + + TA->set_return_mode(false); + TB->set_return_mode(false); +} + +void selection::lower_scanline_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn, IRBuilder<> &builder, + distributed_tile *TC, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK, + Type *c_ty, Function *f_mul_add) { + TA->set_vector_size(TC->axis(0).contiguous); + TB->set_vector_size(TC->axis(1).contiguous); + TC->for_each([&](indices_t idx){ + Value *res = TC->get_value(idx); + for(unsigned K = 0; K < NK; ++K){ + // input indices + indices_t a_idx = {idx[0], builder.getInt32(K)}; + indices_t b_idx = {builder.getInt32(K), idx[1]}; + if(dot->is_a_trans()) std::swap(a_idx[0], a_idx[1]); - if(BT) + if(dot->is_b_trans()) std::swap(b_idx[0], b_idx[1]); + // add batching dimension + for(size_t i = 2; i < idx.size(); i++){ + a_idx.insert(a_idx.end(), idx[i]); + b_idx.insert(b_idx.end(), idx[i]); + } + // load value Value *a = TA->get_value(a_idx); Value *b = TB->get_value(b_idx); if(a->getType() != c_ty) @@ -1194,8 +1148,59 @@ void selection::lower_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn, IRB if(b->getType() != c_ty) b = builder.CreateFPCast(b, c_ty); res = builder.CreateCall(f_mul_add, {a, b, res}); - result->set_value(idx, 
res); - }); + } + TC->set_value(idx, res); + }); +} + +void selection::lower_outer_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn, IRBuilder<> &builder, + distributed_tile *TC, distributed_tile *TA, distributed_tile *TB, distributed_tile *TD, + Type *c_ty, Function *f_mul_add) { + TC->for_each([&](indices_t idx){ + Value *res = TD->get_value(idx); + indices_t a_idx = {idx[0], builder.getInt32(0)}; + indices_t b_idx = {builder.getInt32(0), idx[1]}; + if(dot->is_a_trans()) + std::swap(a_idx[0], a_idx[1]); + if(dot->is_b_trans()) + std::swap(b_idx[0], b_idx[1]); + Value *a = TA->get_value(a_idx); + Value *b = TB->get_value(b_idx); + if(a->getType() != c_ty) + a = builder.CreateFPCast(a, c_ty); + if(b->getType() != c_ty) + b = builder.CreateFPCast(b, c_ty); + res = builder.CreateCall(f_mul_add, {a, b, res}); + TC->set_value(idx, res); + }); +} + +void selection::lower_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { + distributed_tile* TC = (distributed_tile*)tmap_.at(dot); + Module *module = fn->getParent(); + ir::value *A = dot->get_operand(0); + ir::value *B = dot->get_operand(1); + ir::value *D = dot->get_operand(2); + + distributed_tile *TD = (distributed_tile*)tmap_.at(D); + Type *c_ty = llvm_type(D->get_type()->get_scalar_ty(), ctx); + Function *f_mul_add = Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {c_ty}); + auto A_shapes = A->get_type()->get_tile_shapes(); + size_t red_axis = dot->is_a_trans() ? 0 : 1; + unsigned NK = A_shapes[red_axis]->get_value(); + + if(NK != 1) { + shared_tile *TA = (shared_tile*)tmap_.at(A); + shared_tile *TB = (shared_tile*)tmap_.at(B); + if(params_->get_fragment(dot, 0) == analysis::tune::STRIDED_SCAN) + lower_scanline_dot(dot, ctx, fn, builder, TC, TA, TB, TD, NK, c_ty, f_mul_add); + else + lower_hmma_dot(dot, ctx, fn, builder, TC, TA, TB, TD, NK); + } + else { + distributed_tile *TA = (distributed_tile*)tmap_.at(A); + distributed_tile *TB = (distributed_tile*)tmap_.at(B); + lower_outer_dot(dot, ctx, fn, builder, TC, TA, TB, TD, c_ty, f_mul_add); } } From 38a8b0ab199d7f7e2f5f088836a5498af980ef84 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 14 Aug 2019 15:43:50 -0700 Subject: [PATCH 294/494] [runtime] overhaul of the run-time API --- examples/cpp/dot.cpp | 174 +++++++++--- include/triton/codegen/analysis/tune.h | 8 +- include/triton/codegen/selection/selection.h | 6 +- include/triton/ir/type.h | 22 +- include/triton/runtime/arg.h | 80 ++++++ include/triton/runtime/function.h | 113 ++++++++ include/triton/runtime/jit.h | 3 +- include/triton/tools/bench.hpp | 3 +- lib/codegen/analysis/tune.cpp | 36 ++- lib/codegen/selection/selection.cpp | 8 +- lib/driver/buffer.cpp | 1 + lib/runtime/arg.cpp | 0 lib/runtime/function.cpp | 265 +++++++++++++++++++ 13 files changed, 633 insertions(+), 86 deletions(-) create mode 100644 include/triton/runtime/arg.h create mode 100644 include/triton/runtime/function.h create mode 100644 lib/runtime/arg.cpp create mode 100644 lib/runtime/function.cpp diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index ef73a7581..e93a09994 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -1,12 +1,12 @@ -#include +#include #include #include -#include "triton/runtime/jit.h" #include "triton/driver/backend.h" #include "triton/driver/stream.h" #include "triton/dnn/dot.h" #include "triton/tools/bench.hpp" #include "triton/external/half.hpp" +#include "triton/runtime/function.h" #include "cuda.h"
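A recap of the dispatch that the refactoring above introduces: lower_dot now routes a dot instruction to one of three lowerings, namely the HMMA tensor-core path, a strided-scan FMA loop (lower_scanline_dot), or a rank-1 update when the reduction depth NK is 1 (lower_outer_dot). A host-side model of the two FMA-based cases, under the column-major layout the tiles use (a toy sketch, not Triton code):

// Host-side model of the two FMA lowerings: lower_scanline_dot is a K-deep
// fused multiply-add chain per output element; lower_outer_dot is the NK == 1
// degenerate case, a single rank-1 update with no K loop.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const size_t M = 2, N = 2, K = 3;
  std::vector<float> a(M * K, 1.0f), b(K * N, 2.0f), c(M * N, 0.0f);
  // scanline: res = sum_k a[i][k] * b[k][j], accumulated into D (here zeros)
  for (size_t i = 0; i < M; i++)
    for (size_t j = 0; j < N; j++) {
      float res = c[i + j * M];
      for (size_t k = 0; k < K; k++)
        res = std::fma(a[i + k * M], b[k + j * K], res);
      c[i + j * M] = res;
    }
  std::printf("scanline c[0][0] = %f\n", c[0]);  // 1*2 summed over K=3 -> 6
  // outer product (NK == 1): every element takes exactly one fma
  std::vector<float> u(M, 1.0f), v(N, 3.0f), d(M * N, 0.0f);
  for (size_t i = 0; i < M; i++)
    for (size_t j = 0; j < N; j++)
      d[i + j * M] = std::fma(u[i], v[j], d[i + j * M]);
  std::printf("outer d[0][0] = %f\n", d[0]);     // 1*3 -> 3
  return 0;
}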
template void diff(const std::vector& x, const std::vector& y){ std::cout << "Pass!" << std::endl; } +template +static void cpu_ref(std::vector &c, const std::vector &a, const std::vector &b, + size_t M, size_t N, size_t K){ + for(size_t m = 0; m < M; m++) + for(size_t n = 0; n < N; n++){ + float acc = 0; + for(size_t k = 0; k < K; k++) + acc = acc + (AT ? a[k + m*K] : a[m + k*M]) * (BT ? b[n + k*N] : b[k + n*K]); + c[m + n*M] = static_cast(acc); + } +} + +template +void cpu_ref(bool AT_, bool BT_, size_t M, size_t N, size_t K, + std::vector &c, const std::vector &a, const std::vector &b) { + if(AT_ && BT_) + cpu_ref(c, a, b, M, N, K); + else if(AT_ && !BT_) + cpu_ref(c, a, b, M, N, K); + else if(!AT_ && BT_) + cpu_ref(c, a, b, M, N, K); + else + cpu_ref(c, a, b, M, N, K); +} + + + +std::string src(bool AT, bool BT, std::string a_ty, std::string b_ty, std::string c_ty, int align_lda, int align_ldb) { + std::string ZS = "1"; + std::string AS0 = "TM", AS1 = "TK"; + std::string BS0 = "TK", BS1 = "TN"; + std::string XAS0 = "TM", XAS1 = "TK / " + ZS, XAS2 = ZS; + std::string XBS0 = "TK / " + ZS, XBS1 = ZS, XBS2 = "TN"; + std::string bca0 = "[newaxis, :]", bca1 = "[:, newaxis]"; + std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; + std::string lda0 = "*lda", lda1 = ""; + std::string ldb0 = "", ldb1 = "*ldb"; + std::string usea = AT ? "trans(a)" : "a"; + std::string useb = BT ? "trans(b)" : "b"; + if(AT){ + std::swap(AS0, AS1); + std::swap(XAS0, XAS1); + std::swap(XAS1, XAS2); + std::swap(bca0, bca1); + std::swap(lda0, lda1); + } + if(BT){ + std::swap(BS0, BS1); + std::swap(XBS1, XBS2); + std::swap(XBS0, XBS1); + std::swap(bcb0, bcb1); + std::swap(ldb0, ldb1); + } + std::string AS = AS0 + ", " + AS1; + std::string BS = BS0 + ", " + BS1; + std::string XCS = "TM, TN"; + std::string align_lda_str = "multiple_of(" + std::to_string(align_lda) + ")"; + std::string align_ldb_str = "multiple_of(" + std::to_string(align_ldb) + ")"; + std::string res = +R"( +const tunable int TM = {128}; +const tunable int TN = {128}; +const tunable int TK = {32}; + +void matmul(restrict read_only align(16) )" + a_ty + R"( *A, + restrict read_only align(16) )" + b_ty + R"( *B, + restrict read_only align(16) )" + c_ty + R"( *C, + int M, int N, int K, + )" + align_lda_str + R"( int lda, )" + align_ldb_str + R"( int ldb, int ldc) { + int ridx = get_range_id(0); + int ridy = get_range_id(1); + int rxa[TM] = ridx * TM + (0 ... TM); + int ryb[TN] = ridy * TN + (0 ... TN); + int rka[TK] = 0 ... TK; + int rkb[TK] = 0 ... TK; + float xc[)" + XCS + R"(] = 0; + )" + a_ty + R"(* pa[)" + AS + "] = A + rka" + bca0 + lda0 + " + rxa" + bca1 + lda1 + R"(; + )" + b_ty + R"(* pb[)" + BS + "] = B + rkb" + bcb0 + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; + )" + a_ty + R"( a[)" + AS + R"(] = *pa; + )" + b_ty + R"( b[)" + BS + R"(] = *pb; + for(int k = K; k > 0; k = k - TK){ + xc = dot()" + usea + ", " + useb + R"(, xc); + pa = pa + TK)" + lda0 + R"(; + pb = pb + TK)" + ldb0 + R"(; + a = *pa; + b = *pb; + } + int rxc[TM] = ridx * TM + (0 ... TM); + int ryc[TN] = ridy * TN + (0 ... 
TN); + )" + c_ty + R"(* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; + )" + c_ty + R"( c[TM, TN] = xc; + bool checkc0[TM] = rxc < M; + bool checkc1[TN] = ryc < N; + bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; + @checkc *pc = c; +} +)"; + return res; +} + struct perf_t { double triton; double cublas; }; +namespace drv = triton::driver; +namespace rt = triton::runtime; -perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K){ +perf_t do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K){ typedef half NumericT; std::string ty = "half"; size_t dt_nbytes = sizeof(NumericT); - triton::driver::context* context = stream->context(); + drv::context* context = stream->context(); std::vector hc(M*N); std::vector ha(M*K); std::vector hb(K*N); + int32_t lda = AT ? K : M; + int32_t ldb = BT ? N : K; + int32_t ldc = M; srand(0); for(size_t i = 0; i < ha.size(); i++) ha[i] = static_cast((double)rand()/RAND_MAX); @@ -40,54 +145,40 @@ perf_t do_bench(triton::driver::stream* stream, bool AT, bool BT, int32_t M, int hb[i] = static_cast((double)rand()/RAND_MAX); for(size_t i = 0; i < hc.size(); i++) hc[i] = static_cast((double)0); - triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*dt_nbytes); - triton::driver::buffer* da = triton::driver::buffer::create(context, ha.size()*dt_nbytes); - triton::driver::buffer* db = triton::driver::buffer::create(context, hb.size()*dt_nbytes); + drv::buffer* dc = drv::buffer::create(context, hc.size()*dt_nbytes); + drv::buffer* da = drv::buffer::create(context, ha.size()*dt_nbytes); + drv::buffer* db = drv::buffer::create(context, hb.size()*dt_nbytes); stream->write(da, true, 0, ha); stream->write(db, true, 0, hb); stream->write(dc, true, 0, hc); stream->synchronize(); - triton::dnn::dot dot(M, N, K, AT, BT, ty, ty, ty, 8, 8, 8); - // benchmark triton - double triton_ns = triton::tools::bench([&]() { dot.enqueue(stream, {da, db, dc}, triton::dnn::FULL_TUNING);}, stream); - // benchmark cublas -// NumericT alpha = 1; -// NumericT beta = 0; -// int32_t lda = AT ? K : M; -// int32_t ldb = BT ? N : K; -// int32_t ldc = M; -// cublasGemmAlgo_t fastest; -// cublasGemm(HALF_TYPE, stream, AT, BT, M, N, K, -// &alpha, da, lda, -// db, ldb, &beta, -// dc, ldc, &fastest); -// double cublas_ns = triton::tools::bench([&]() { cublasGemm(HALF_TYPE, stream, AT, BT, M, N, K, -// &alpha, da, lda, -// db, ldb, &beta, -// dc, ldc, nullptr, CUBLAS_GEMM_DEFAULT_TENSOR_OP); }, stream); - // result - auto tflops = [&](double nanosec) { return dot.num_flops() / nanosec * 1e-3; }; + // run + rt::function function(src(AT, BT, ty, ty, ty, 8, 8)); + auto ceil = [](size_t x, size_t y) { return (x + y - 1) / y; }; + auto grid = [&](const rt::params_t& x) { return rt::grid_t{ceil(M, x.at("TM")), ceil(N, x.at("TN")), 1}; }; - perf_t result; -// result.cublas = tflops(cublas_ns); - result.triton = tflops(triton_ns); + auto tflops = [&](double nanosec) { return 2.*M*N*K / nanosec * 1e-3; }; + perf_t res; + res.triton = tflops(triton::tools::bench([&]() { function({da, db, dc, M, N, K, lda, ldb, ldc}, grid, stream);}, stream)); + res.cublas = 0; // test - stream->read(dc, true, 0, hc); - std::vector rc(hc.size()); - dot.cpu_ref(rc, ha, hb); - for(size_t i = 0; i < M*N; i++) - if(std::isinf(hc[i]) || std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-2){ - std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; - exit(EXIT_FAILURE); - } - std::cout << "Pass!" 
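One detail of do_bench worth making explicit: triton::tools::bench returns a time in nanoseconds here, and a GEMM performs 2*M*N*K floating-point operations, so flops-per-nanosecond equals GFLOP/s and the extra 1e-3 factor converts to TFLOP/s. A quick check of the units (the timing value is made up):

// Throughput arithmetic used by do_bench, checked on the host.
#include <cstdio>

int main() {
  double M = 8192, N = 8192, K = 8192;
  // A GEMM performs 2*M*N*K flops; flops per nanosecond is GFLOP/s,
  // so multiplying by 1e-3 yields TFLOP/s.
  double nanosec = 1.1e7;                 // hypothetical measured time
  double tflops = 2. * M * N * K / nanosec * 1e-3;
  std::printf("%.1f TFLOP/s\n", tflops);  // ~100 for these numbers
  return 0;
}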
<< std::endl; +// stream->synchronize(); +// stream->read(dc, true, 0, hc); +// std::vector rc(hc.size()); +// cpu_ref(AT, BT, M, N, K, rc, ha, hb); +// for(size_t i = 0; i < M*N; i++) +// if(std::isinf(hc[i]) || std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-2){ +// std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; +// exit(EXIT_FAILURE); +// } +// std::cout << "Pass!" << std::endl; // clean-up delete dc; delete da; delete db; - return result; + return res; } int main() { @@ -111,12 +202,11 @@ int main() { // shapes to benchmark std::vector configs = { // {false, false, 8192, 512, 512}, - {false, true, 128, 128, 128} + {false, true, 8192, 8192, 8192} // {false, true, 128, 128, 128}, // {false, false, 128, 128, 128}, // {true, false, 128, 128, 128}, // {true, true, 128, 128, 128} - // {false, true, 32768, 256, 512} // {true, false, 8192, 512, 512}, // {true, true, 8192, 512, 512} diff --git a/include/triton/codegen/analysis/tune.h b/include/triton/codegen/analysis/tune.h index 4ab07c974..63bd2bcc3 100644 --- a/include/triton/codegen/analysis/tune.h +++ b/include/triton/codegen/analysis/tune.h @@ -38,16 +38,14 @@ private: void create_grids(std::vector &grids, std::map &references, ir::function *fn); - unsigned get_req_num_threads(ir::instruction *i); public: - tune(); + tune(size_t num_warps); std::vector get_params(ir::module& mod); - std::map get_params(ir::instruction* i); ir::metaparameter* get_param(ir::value *value, const std::string &key) { return params_[value][key]; } - fragment_t get_fragment(ir::value *value, unsigned ax) { return fragments_.at({value, ax}); } unsigned get_param_group(ir::value *value, unsigned ax); + fragment_t get_fragment(ir::value *value, unsigned ax) { return fragments_.at({value, ax}); } void copy(ir::value *dst, ir::value *src); bool check_constraints(std::map> &errors); void run(ir::module &mod); @@ -64,7 +62,7 @@ private: std::map global_range_sizes_; std::vector grids_; std::map> groups_; - ir::metaparameter* num_warps_; + size_t num_warps_; }; diff --git a/include/triton/codegen/selection/selection.h b/include/triton/codegen/selection/selection.h index 3191fd60f..d17c0607b 100644 --- a/include/triton/codegen/selection/selection.h +++ b/include/triton/codegen/selection/selection.h @@ -185,8 +185,8 @@ private: public: - selection(analysis::shmem::allocation *alloc, analysis::tune *params, analysis::shmem::info *buffer_info, analysis::alignment_info *ax_info, target *tgt) - : alloc_(alloc), params_(params), buffer_info_(buffer_info), axis_info_(ax_info), tgt_(tgt){ } + selection(analysis::shmem::allocation *alloc, analysis::tune *params, analysis::shmem::info *buffer_info, analysis::alignment_info *alignment, target *tgt) + : alloc_(alloc), params_(params), buffer_info_(buffer_info), alignment_(alignment), tgt_(tgt){ } void run(ir::module &src, Module &dst); @@ -197,7 +197,7 @@ private: analysis::tune *params_; target *tgt_; analysis::shmem::info *buffer_info_; - analysis::alignment_info *axis_info_; + analysis::alignment_info *alignment_; std::map axes_; Value *sh_mem_ptr_; Value *offset_a_i_, *offset_a_k_; diff --git a/include/triton/ir/type.h b/include/triton/ir/type.h index 04da05b60..13ead1959 100644 --- a/include/triton/ir/type.h +++ b/include/triton/ir/type.h @@ -66,16 +66,18 @@ public: type *get_pointer_element_ty() const; // primitive predicates - bool is_void_ty() const { return id_ == VoidTyID; } - bool is_half_ty() const { return id_ == HalfTyID; } - bool is_float_ty() const { return id_ == FloatTyID; } - 
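The validation block above (disabled in this commit) accepts results within a 1e-2 relative error of the fp32 CPU reference, a reasonable bar for fp16 inputs accumulated in fp32, and rejects NaN/Inf outright. The same check as a self-contained snippet with fabricated data:

// Relative-error acceptance test mirroring the commented-out block above:
// reject NaN/Inf outright, otherwise require |x - ref| / max(x, ref) <= 1e-2.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> hc = {1.000f, 2.010f}, rc = {1.001f, 2.000f};
  for (size_t i = 0; i < hc.size(); i++) {
    bool bad = std::isinf(hc[i]) || std::isnan(hc[i]) ||
               std::abs(hc[i] - rc[i]) / std::max(hc[i], rc[i]) > 1e-2;
    if (bad) {
      std::printf("%zu %f %f\n", i, hc[i], rc[i]);
      return 1;
    }
  }
  std::printf("Pass!\n");
  return 0;
}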
bool is_double_ty() const { return id_ == DoubleTyID; } - bool is_label_ty() const { return id_ == LabelTyID;} - bool is_metadata_ty() const { return id_ == MetadataTyID; } - bool is_token_ty() const { return id_ == TokenTyID; } - bool is_integer_ty() const { return id_ == IntegerTyID; } - bool is_pointer_ty() const { return id_ == PointerTyID; } - bool is_tile_ty() const { return id_ == TileTyID; } + bool is_void_ty() const { return id_ == VoidTyID; } + bool is_half_ty() const { return id_ == HalfTyID; } + bool is_float_ty() const { return id_ == FloatTyID; } + bool is_double_ty() const { return id_ == DoubleTyID; } + bool is_label_ty() const { return id_ == LabelTyID;} + bool is_metadata_ty() const { return id_ == MetadataTyID; } + bool is_token_ty() const { return id_ == TokenTyID; } + bool is_integer_ty() const { return id_ == IntegerTyID; } + bool is_integer_ty(unsigned bitwidth) { return is_integer_ty() && + get_integer_bitwidth() == bitwidth;} + bool is_pointer_ty() const { return id_ == PointerTyID; } + bool is_tile_ty() const { return id_ == TileTyID; } // Composite predicates bool is_int_or_tileint_ty(); diff --git a/include/triton/runtime/arg.h b/include/triton/runtime/arg.h new file mode 100644 index 000000000..3f7131fbc --- /dev/null +++ b/include/triton/runtime/arg.h @@ -0,0 +1,80 @@ +#ifndef TDL_INCLUDE_ARG_H +#define TDL_INCLUDE_ARG_H + +#include +#include + +namespace triton{ + +namespace driver{ + class buffer; +} + +namespace runtime { + +enum arg_type { + INT1_T, + INT8_T, + INT16_T, + INT32_T, + INT64_T, + HALF_T, + FLOAT_T, + DOUBLE_T, + BUFFER_T +}; + +size_t size_of(arg_type ty){ + switch(ty){ + case INT1_T: return 1; + case INT8_T: return 1; + case INT16_T: return 2; + case INT32_T: return 4; + case INT64_T: return 8; + case HALF_T: return 2; + case FLOAT_T: return 4; + case DOUBLE_T: return 8; + case BUFFER_T: return 8; + default: throw std::runtime_error("unknown type"); + } +} + +bool is_int_type(arg_type ty){ + return ty == INT1_T || ty == INT8_T || ty == INT16_T || + ty == INT32_T || ty == INT64_T; +} + +class arg { +private: + union value_t { + bool int1; + int8_t int8; + int16_t int16; + int32_t int32; + int64_t int64; + float fp32; + double fp64; + driver::buffer* buf; + }; + +public: + // construct from primitive types + arg(int32_t x): ty_(INT32_T) { val_.int32 = x; } + arg(int64_t x): ty_(INT64_T) { val_.int64 = x; } + arg(float x): ty_(FLOAT_T) { val_.fp32 = x; } + arg(double x): ty_(DOUBLE_T) { val_.fp64 = x; } + arg(driver::buffer* x): ty_(BUFFER_T) { val_.buf = x; } + // accessors + arg_type type() const { return ty_; } + void* data() const { return (void*)&val_; } + + +private: + arg_type ty_; + value_t val_; +}; + +} +} + +#endif diff --git a/include/triton/runtime/function.h b/include/triton/runtime/function.h new file mode 100644 index 000000000..e43301bfc --- /dev/null +++ b/include/triton/runtime/function.h @@ -0,0 +1,113 @@ +#ifndef TDL_INCLUDE_FUNCTION_H +#define TDL_INCLUDE_FUNCTION_H + +#include +#include +#include +#include +#include +#include "arg.h" +// codegen +#include "triton/codegen/selection/selection.h" +#include "triton/codegen/selection/target.h" +#include "triton/codegen/analysis/tune.h" +#include "triton/codegen/analysis/shmem/allocation.h" +#include "triton/codegen/analysis/shmem/liveness.h" +#include "triton/codegen/analysis/shmem/info.h" +#include "triton/codegen/analysis/alignment.h" +#include "triton/codegen/transform/dce.h" +#include "triton/codegen/transform/peephole.h" +#include 
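The new runtime/arg.h models a kernel argument as a type tag plus an 8-byte union, so the launcher can forward any scalar as size_of(ty) raw bytes through a single setArg call, with no templates in the hot path. A stripped-down model of the same tag-plus-union idea (this is a sketch, not the Triton class):

// Minimal model of runtime::arg: a type tag plus a union, where a launcher
// only ever needs data() and the byte size implied by the tag.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>

enum arg_type { INT32_T, INT64_T, FLOAT_T };

struct arg {
  arg(int32_t x) : ty(INT32_T) { val.int32 = x; }
  arg(float x)   : ty(FLOAT_T) { val.fp32 = x; }
  arg_type ty;
  union { int32_t int32; int64_t int64; float fp32; } val;
  const void* data() const { return &val; }
  size_t size() const { return ty == INT64_T ? 8 : 4; }
};

int main() {
  arg a(int32_t(8192)), b(0.5f);
  int32_t m; std::memcpy(&m, a.data(), a.size());  // what setArg would copy
  float f;   std::memcpy(&f, b.data(), b.size());
  std::printf("%d %f\n", m, f);
  return 0;
}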
"triton/codegen/transform/shmem/barriers.h" +#include "triton/codegen/transform/reassociate.h" +#include "triton/codegen/transform/vectorize.h" + +namespace llvm { + class Module; + class LLVMContext; +} + +namespace triton { + +namespace driver{ + class module; + class stream; + class kernel; + class context; + class device; +} + +namespace lang{ +class translation_unit; +} + +namespace codegen{ +namespace analysis{ +class tune; +} +} + +namespace ir { +class module; +class function; +class context; +class metaparameter; +} + +namespace runtime{ + + +typedef std::array grid_t; +typedef std::map params_t; + +struct options { + size_t num_warps; + params_t params; +}; + + +class function { +public: + typedef std::function grid_fn_ty; + +private: + class caller { + public: + caller(ir::function *ir, std::shared_ptr program, size_t n_threads); + void operator()(driver::stream *stream, const std::array& grid, const std::vector& args) const; + + private: + std::shared_ptr parent_; + std::shared_ptr bin_; + std::vector param_tys_; + size_t n_threads_; + }; + +private: + typedef std::pair> cache_key_t; + typedef std::pair cache_val_t; + +private: + triton::lang::translation_unit *make_ast(const char *src); + std::unique_ptr make_ir(triton::lang::translation_unit *program); + options autotune(lang::translation_unit *ast, driver::stream *stream, const grid_fn_ty& grid, const std::vector &args); + std::unique_ptr make_bin(ir::module &function, driver::context *context, const options &opt); + + +public: + function(const std::string& src); + void operator()(const std::vector& args, const std::array& grid, driver::stream* stream); + void operator()(const std::vector& args, const grid_fn_ty& grid, driver::stream *stream); + +private: + // execution context + ir::context ctx_; + // program representations + std::string src_; + lang::translation_unit *ast_; + std::map cache_; +}; + +} +} + +#endif diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index 869d68ac5..24eaa836d 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -58,7 +58,8 @@ public: struct passes_wrapper { passes_wrapper(codegen::target* target) - : shmem_liveness(&shmem_info), + : tune(0), + shmem_liveness(&shmem_info), shmem_allocation(&shmem_liveness, &shmem_info, &tune), shmem_barriers(&shmem_allocation, &shmem_info), vectorize(&tune), diff --git a/include/triton/tools/bench.hpp b/include/triton/tools/bench.hpp index 74053b717..b2492c31f 100644 --- a/include/triton/tools/bench.hpp +++ b/include/triton/tools/bench.hpp @@ -3,6 +3,7 @@ #include #include +#include #include "triton/driver/device.h" #include "triton/driver/stream.h" @@ -29,7 +30,7 @@ private: inline double bench(std::function const & op, driver::stream * stream) { - const driver::device * device = stream->context()->device(); +// const driver::device * device = stream->context()->device(); timer tmr; std::vector times; double total_time = 0; diff --git a/lib/codegen/analysis/tune.cpp b/lib/codegen/analysis/tune.cpp index 21d08d5ea..5caf9533a 100644 --- a/lib/codegen/analysis/tune.cpp +++ b/lib/codegen/analysis/tune.cpp @@ -14,7 +14,7 @@ namespace triton{ namespace codegen{ namespace analysis{ -tune::tune() { +tune::tune(size_t num_warps): num_warps_(num_warps){ } bool is_hmma(ir::value *v){ @@ -183,20 +183,17 @@ void tune::connected_components(node_t x, const std::vector } std::vector tune::get_params(ir::module &mod) { - std::vector result; - std::set seen; - for(auto x: mod.globals()) { - if(auto mp = dynamic_cast(x.second)) 
- if(seen.insert(mp).second && !mp->has_value()) - result.push_back(mp); - } - num_warps_ = ir::metaparameter::create(mod.get_context(), mod.get_builder().get_int32_ty(), 4, 4); - result.push_back(num_warps_); - return result; -} - -std::map tune::get_params(ir::instruction* i) { - return params_.at(i); + throw std::runtime_error("remove me"); +// std::vector result; +// std::set seen; +// for(auto x: mod.globals()) { +// if(auto mp = dynamic_cast(x.second)) +// if(seen.insert(mp).second && !mp->has_value()) +// result.push_back(mp); +// } +// num_warps_ = ir::metaparameter::create(mod.get_context(), mod.get_builder().get_int32_ty(), 4, 4); +// result.push_back(num_warps_); +// return result; } unsigned tune::get_param_group(ir::value *value, unsigned ax) { @@ -257,7 +254,6 @@ void tune::init(ir::module &mod) { } int num_threads = get_num_threads(); - int num_warps = num_warps_->get_value(); auto clamp = [&](int x, int lo, int hi) { return std::min(std::max(x, lo), hi); }; for(ir::value *i: grids_){ @@ -292,9 +288,9 @@ void tune::init(ir::module &mod) { std::vector wpt_nm1; do{ wpt_nm1 = wpt; - if(wpt[0] * wpt[1] * wpt[2] < num_warps) + if(wpt[0] * wpt[1] * wpt[2] < num_warps_) wpt[0] = clamp(wpt[0]*2, 1, shape_0 / (fpw[0]*8)); - if(wpt[0] * wpt[1] * wpt[2] < num_warps) + if(wpt[0] * wpt[1] * wpt[2] < num_warps_) wpt[1] = clamp(wpt[1]*2, 1, shape_1 / (fpw[1]*8)); }while(wpt_nm1 != wpt); // store parameters @@ -307,7 +303,7 @@ void tune::init(ir::module &mod) { std::string str_d = std::to_string(d); effective_num_warps *= params_.at(i).at("wpt.d" + str_d)->get_value(); } - assert(num_warps == effective_num_warps); + assert(num_warps_ == effective_num_warps); } /* Scan-line */ @@ -386,7 +382,7 @@ bool tune::check_constraints(std::map> &er } unsigned tune::get_num_threads() { - return num_warps_->get_value()*32; + return num_warps_*32; } diff --git a/lib/codegen/selection/selection.cpp b/lib/codegen/selection/selection.cpp index 81ed60e70..3b92311fc 100644 --- a/lib/codegen/selection/selection.cpp +++ b/lib/codegen/selection/selection.cpp @@ -1208,8 +1208,8 @@ void selection::lower_masked_load(ir::masked_load_inst *x, LLVMContext &ctx, Fun // find vector size distributed_tile* result = (distributed_tile*)tmap_.at(x); ir::value *ptr = x->get_pointer_operand(); - unsigned starting_multiple = axis_info_->get_starting_multiple(ptr); - unsigned max_contiguous = axis_info_->get_max_contiguous(ptr); + unsigned starting_multiple = alignment_->get_starting_multiple(ptr); + unsigned max_contiguous = alignment_->get_max_contiguous(ptr); unsigned alignment = std::min(starting_multiple, max_contiguous); unsigned vector_size = std::min(result->axis(0).contiguous, alignment); distributed_tile *pointers = (distributed_tile*)tmap_.at(ptr); @@ -1280,8 +1280,8 @@ void selection::lower_load(ir::load_inst *x, LLVMContext &ctx, Function *fn, IRB distributed_tile* result = (distributed_tile*)tmap_.at(x); // find vector size ir::value *ptr = x->get_pointer_operand(); - unsigned starting_multiple = axis_info_->get_starting_multiple(ptr); - unsigned max_contiguous = axis_info_->get_max_contiguous(ptr); + unsigned starting_multiple = alignment_->get_starting_multiple(ptr); + unsigned max_contiguous = alignment_->get_max_contiguous(ptr); unsigned alignment = std::min(starting_multiple, max_contiguous); unsigned vector_size = std::min(result->axis(0).contiguous, alignment); distributed_tile *pointers = (distributed_tile*)tmap_.at(ptr); diff --git a/lib/driver/buffer.cpp b/lib/driver/buffer.cpp index 111091fdf..53f9d4e07 
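The tune pass above distributes num_warps_ across the two tile dimensions by repeatedly doubling wpt until the product of warps per tile reaches the target, clamping each dimension to what the tile shape allows. That fixed-point loop in isolation (a sketch; the shape and fragments-per-warp values are made up):

// The warps-per-tile fixed-point loop from tune::init, extracted: double
// wpt.d0 then wpt.d1 until wpt[0]*wpt[1]*wpt[2] covers num_warps, clamping
// so a dimension never exceeds shape / (fpw * 8).
#include <algorithm>
#include <array>
#include <cstdio>

int main() {
  const int num_warps = 8;
  const int shape_0 = 128, shape_1 = 128;
  const std::array<int, 3> fpw = {2, 2, 1};  // hypothetical fragments/warp
  std::array<int, 3> wpt = {1, 1, 1};
  auto clamp = [](int x, int lo, int hi) { return std::min(std::max(x, lo), hi); };
  std::array<int, 3> wpt_nm1;
  do {
    wpt_nm1 = wpt;
    if (wpt[0] * wpt[1] * wpt[2] < num_warps)
      wpt[0] = clamp(wpt[0] * 2, 1, shape_0 / (fpw[0] * 8));
    if (wpt[0] * wpt[1] * wpt[2] < num_warps)
      wpt[1] = clamp(wpt[1] * 2, 1, shape_1 / (fpw[1] * 8));
  } while (wpt_nm1 != wpt);
  std::printf("wpt = {%d, %d, %d}\n", wpt[0], wpt[1], wpt[2]);  // {4, 2, 1}
  return 0;
}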
100755 --- a/lib/driver/buffer.cpp +++ b/lib/driver/buffer.cpp @@ -54,6 +54,7 @@ size_t buffer::size() { return size_; } + buffer* buffer::create(driver::context* ctx, size_t size) { switch(ctx->backend()){ case CUDA: return new cu_buffer(ctx, size); diff --git a/lib/runtime/arg.cpp b/lib/runtime/arg.cpp new file mode 100644 index 000000000..e69de29bb diff --git a/lib/runtime/function.cpp b/lib/runtime/function.cpp new file mode 100644 index 000000000..fde0c7972 --- /dev/null +++ b/lib/runtime/function.cpp @@ -0,0 +1,265 @@ +#include +#include +#include +#include "triton/codegen/selection/selection.h" +#include "triton/runtime/function.h" +#include "triton/lang/lang.h" +#include "triton/driver/device.h" +#include "triton/driver/stream.h" +#include "triton/driver/kernel.h" +#include "triton/driver/module.h" +#include "triton/ir/module.h" +#include "triton/ir/function.h" +#include "triton/tools/bench.hpp" +#include "llvm/IR/Module.h" + + +typedef struct yy_buffer_state * YY_BUFFER_STATE; +extern int yyparse(); +extern YY_BUFFER_STATE yy_scan_string(const char * str); +extern void yy_delete_buffer(YY_BUFFER_STATE buffer); +extern triton::lang::translation_unit *ast_root; + +namespace triton{ +namespace runtime { + + +// helpers +void _parallel_loop_nest(std::vector const & ranges, + std::function const &)> const & f, + size_t nthreads){ + size_t D = ranges.size(); + std::vector values(D, 0); + // Start with innermost loop + size_t i = D - 1; + while(true){ + // Execute function + f(values); + while(values[i]++ == ranges[i] - 1){ + if(i == 0) + return; + values[i--] = 0; + } + i = D - 1; + } +} + +template +void _parallel_loop_nest(std::vector> const & iterates, std::function)> const & f, size_t nthreads){ + //Ranges to iterate over + std::vector ranges; + for(auto const & x: iterates) + ranges.push_back(x.size()); + //Proxy function + auto proxy = [&](std::vector const & idx){ + std::vector x(iterates.size()); + for(size_t i = 0; i < x.size(); ++i) + x[i] = iterates[i][idx[i]]; + f(x); + }; + //Iterate + _parallel_loop_nest(ranges, proxy, nthreads); +} + +// caller + +arg_type convert(ir::type *ty) { + if(ty->is_integer_ty(1)) + return INT1_T; + if(ty->is_integer_ty(8)) + return INT8_T; + if(ty->is_integer_ty(16)) + return INT16_T; + if(ty->is_integer_ty(32)) + return INT32_T; + if(ty->is_integer_ty(64)) + return INT64_T; + if(ty->is_half_ty()) + return HALF_T; + if(ty->is_float_ty()) + return FLOAT_T; + if(ty->is_double_ty()) + return DOUBLE_T; + if(ty->is_pointer_ty()) + return BUFFER_T; + throw std::runtime_error("unknown type"); +} + +function::caller::caller(ir::function *ir, std::shared_ptr parent, size_t n_threads) + : bin_(driver::kernel::create(&*parent, ir->get_name().c_str())), n_threads_(n_threads), parent_(parent) { + // extract signature + ir::function_type* ty = ir->get_fn_type(); + for(int i = 0; i < ty->get_num_params(); i++) + param_tys_.push_back(convert(ty->get_param_ty(i))); +} + + +void function::caller::operator ()(driver::stream *stream, const std::array& grid, const std::vector& args) const { + if(args.size() != param_tys_.size()) + throw std::runtime_error("invalid number of arguments"); + for(size_t i = 0; i < args.size(); i++){ + arg arg_i = args.at(i); + arg_type ty = arg_i.type(); + if(ty != param_tys_.at(i)) + throw std::runtime_error("invalid type"); + if(ty == BUFFER_T) + bin_->setArg(i, *((driver::buffer**)arg_i.data())); + else + bin_->setArg(i, size_of(ty), arg_i.data()); + } + stream->enqueue(&*bin_, grid, {n_threads_, 1, 1}); +} + + + +// module 
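_parallel_loop_nest above walks an arbitrary-rank index space like an odometer: increment the innermost digit, carry on overflow, and call f at every combination, which is exactly what an exhaustive autotuner needs. The same loop with a printing payload (the nthreads parameter is dropped here since, as in the original, it is not yet used):

// Odometer-style iteration over a rank-N space, as in _parallel_loop_nest:
// visits every combination of indices in {0..r_i-1} for ranges r.
#include <cstdio>
#include <functional>
#include <vector>

void loop_nest(const std::vector<size_t>& ranges,
               const std::function<void(const std::vector<size_t>&)>& f) {
  size_t D = ranges.size();
  std::vector<size_t> values(D, 0);
  size_t i = D - 1;
  while (true) {
    f(values);
    while (values[i]++ == ranges[i] - 1) {  // carry out of digit i
      if (i == 0)
        return;
      values[i--] = 0;
    }
    i = D - 1;                              // restart at innermost digit
  }
}

int main() {
  loop_nest({2, 3}, [](const std::vector<size_t>& v) {
    std::printf("(%zu, %zu)\n", v[0], v[1]);  // six combinations in order
  });
  return 0;
}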
+triton::lang::translation_unit *function::make_ast(const char *src) { + YY_BUFFER_STATE buffer = yy_scan_string(src); + yyparse(); + yy_delete_buffer(buffer); + triton::lang::translation_unit *program = ast_root; + return program; +} + +std::unique_ptr function::make_ir(triton::lang::translation_unit *program) { + // create Triton-IR from AST + ir::module* module = new ir::module("", ctx_); + program->codegen(module); + return std::unique_ptr(module); +} + +options function::autotune(lang::translation_unit *ast, driver::stream* stream, const grid_fn_ty& grid_fn, const std::vector& args) { + std::unique_ptr ir = make_ir(ast); + // extract tunable values + std::vector> values; + for(auto it: ir->globals()) + if(auto *mp = dynamic_cast(it.second)) + values.push_back({it.first, mp}); + // extract search space + std::vector> space; + space.push_back({1, 2, 4, 8}); // num warps + for(auto it: values) + space.push_back(it.second->get_space()); + // exhaustive search + struct profile_t{ + double ts; + std::vector params; + }; + profile_t best = { INFINITY }; + std::function)> benchmark = + [&](std::vector params) { + // options + options opt; + unsigned i = 0; + opt.num_warps = params[i++]; + for(auto it: values) + opt.params[it.first] = params[i++]; + // make binary + auto ir = make_ir(ast); + auto bin = make_bin(*ir, stream->context(), opt); + // benchmark + ir::function *tmp = ir->get_function_list()[0]; + caller fn(tmp, std::move(bin), opt.num_warps * 32); + double ts = tools::bench([&]() { fn(stream, grid_fn(opt.params), args); }, stream); + if(ts < best.ts) + best = {ts, params}; + }; + _parallel_loop_nest(space, benchmark, 1); + // populate options + unsigned current = 0; + options opt; + opt.num_warps = best.params[current++]; + for(auto it: values) + opt.params[it.first] = best.params[current++]; + return opt; +} + + +std::unique_ptr function::make_bin(ir::module &module, driver::context *context, const options& opt) { + std::unique_ptr target = context->device()->make_target(); + // update metaparameter values + for(auto x: opt.params) + if(auto* mp = dynamic_cast(module.globals().at(x.first))) + mp->set_value(x.second); + // create passes + codegen::analysis::tune tune(opt.num_warps); + codegen::analysis::shmem::info shmem_info; + codegen::analysis::shmem::liveness shmem_liveness(&shmem_info); + codegen::analysis::shmem::allocation shmem_allocation(&shmem_liveness, &shmem_info, &tune); + codegen::analysis::alignment_info alignment_info; + codegen::transform::shmem_barriers shmem_barriers(&shmem_allocation, &shmem_info); + codegen::transform::vectorize vectorize(&tune); + codegen::transform::dce dce; + codegen::transform::peephole peephole; + codegen::transform::reassociate reassociate(&tune); + codegen::selection selection(&shmem_allocation, &tune, &shmem_info, &alignment_info, target.get()); + // run passes + peephole.run(module); + dce.run(module); + tune.run(module); + tune.init(module); + reassociate.run(module); + peephole.run(module); + if(target->is_gpu()){ + shmem_info.run(module); + shmem_liveness.run(module); + shmem_allocation.run(); + shmem_barriers.run(module); + } + alignment_info.run(module); + vectorize.run(module); + dce.run(module); + // generate llvm code + llvm::LLVMContext ctx; + std::unique_ptr llvm(new llvm::Module(module.get_name(), ctx)); + selection.run(module, *llvm); + // return binary + std::unique_ptr res(driver::module::create(context, llvm.get())); + return res; +} + + +function::function(const std::string &src): src_(src) { + // src -> ast + ast_ = 
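function::autotune above therefore visits the full cartesian product (the num-warps choices times each metaparameter's space), compiles and times a kernel at every point, and keeps the fastest profile. Its selection logic, reduced to the essentials (the candidate vectors and timings are fabricated stand-ins for tools::bench):

// Core of the exhaustive autotuner: benchmark each candidate parameter
// vector, remember the one with the smallest time.
#include <cmath>
#include <cstdio>
#include <vector>

struct profile_t {
  double ts;
  std::vector<unsigned> params;
};

int main() {
  std::vector<std::vector<unsigned>> candidates = {{4, 128, 128}, {8, 128, 64}};
  std::vector<double> fake_ts = {1.2e7, 0.9e7};  // stand-ins for tools::bench
  profile_t best = {INFINITY, {}};
  for (size_t i = 0; i < candidates.size(); i++)
    if (fake_ts[i] < best.ts)
      best = {fake_ts[i], candidates[i]};
  std::printf("best: num_warps=%u TM=%u TN=%u (%.1e ns)\n",
              best.params[0], best.params[1], best.params[2], best.ts);
  return 0;
}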
make_ast(src_.c_str()); +} + +void function::operator()(const std::vector& args, const grid_fn_ty& grid_fn, driver::stream *stream) { + /* determine if should re-tune or not */ + cache_key_t key; + // re-tune if the device is different + key.first = stream->context()->device(); + // re-tune if any int argument is different + for(size_t i = 0; i < args.size(); i++){ + arg_type ty = args.at(i).type(); + if(is_int_type(ty)){ + long val = 0; + std::memcpy((void*)&val, args.at(i).data(), size_of(ty)); + key.second.push_back(val); + } + } + + /* find existing configuration */ + auto it = cache_.find(key); + if(it != cache_.end()){ + it->second.second(stream, grid_fn(it->second.first.params), args); + return; + } + + /* re-tune and re-compile */ + options opt = autotune(ast_, stream, grid_fn, args); + std::unique_ptr ir = make_ir(ast_); + std::unique_ptr bin = make_bin(*ir, stream->context(), opt); + ir::function* fn = ir->get_function_list().front(); + const caller& run = cache_.insert({key, cache_val_t{opt, caller(fn, std::move(bin), opt.num_warps*32)}}).first->second.second; + run(stream, grid_fn(opt.params), args); +} + +void function::operator()(const std::vector& args, const grid_t& grid, driver::stream *stream) { + return this->operator()(args, [&grid](const params_t&){ return grid; }, stream); +} + + + +} +} From 3ece461ce203d70bec25ae6b9a9ab4885e94274f Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 15 Aug 2019 15:59:53 -0700 Subject: [PATCH 295/494] added tensorflow code generator --- examples/cpp/dot.cpp | 5 +- include/triton/codegen/selection/selection.h | 2 +- include/triton/runtime/function.h | 1 + lib/runtime/function.cpp | 168 +++++++++++++++ 4 files changed, 174 insertions(+), 2 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index e93a09994..b56f6f6e1 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -154,6 +154,9 @@ perf_t do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int stream->synchronize(); // run rt::function function(src(AT, BT, ty, ty, ty, 8, 8)); + std::cout << function.make_tensorflow_src({2}, "(M + #TM - 1)/#TM, (N + #TN - 1)/#TN, 1") << std::endl; + exit(EXIT_FAILURE); + auto ceil = [](size_t x, size_t y) { return (x + y - 1) / y; }; auto grid = [&](const rt::params_t& x) { return rt::grid_t{ceil(M, x.at("TM")), ceil(N, x.at("TN")), 1}; }; @@ -202,7 +205,7 @@ int main() { // shapes to benchmark std::vector configs = { // {false, false, 8192, 512, 512}, - {false, true, 8192, 8192, 8192} + {false, true, 128, 128, 128} // {false, true, 128, 128, 128}, // {false, false, 128, 128, 128}, // {true, false, 128, 128, 128}, // {true, true, 128, 128, 128} diff --git a/include/triton/codegen/selection/selection.h b/include/triton/codegen/selection/selection.h index d17c0607b..785e32179 100644 --- a/include/triton/codegen/selection/selection.h +++ b/include/triton/codegen/selection/selection.h @@ -1,7 +1,6 @@ #ifndef TDL_INCLUDE_CODEGEN_SELECTION_H #define TDL_INCLUDE_CODEGEN_SELECTION_H -#include "llvm/IR/Module.h" #include "llvm/IR/IRBuilder.h" #include "triton/ir/context.h" #include "triton/ir/module.h" @@ -16,6 +15,7 @@ namespace llvm{ class Instruction; class Constant; class LLVMContext; + class Module; } namespace triton{ diff --git a/include/triton/runtime/function.h b/include/triton/runtime/function.h index e43301bfc..2cbd65fd4 100644 --- a/include/triton/runtime/function.h +++ b/include/triton/runtime/function.h @@ -97,6 +97,7 @@ public: function(const std::string& src); void operator()(const std::vector& args, const std::array& 
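On the caching policy in operator() above: compiled kernels are memoized on the device plus the values of all integer arguments, on the assumption that shapes may change the winning configuration while buffer pointers never do. A toy version of that key (the device handle value is hypothetical):

// Toy version of the compilation cache key: the device identity plus all
// integer arguments, so new shapes trigger re-tuning but new buffers reuse
// the cached binary.
#include <cstdio>
#include <map>
#include <string>
#include <utility>
#include <vector>

using cache_key_t = std::pair<const void*, std::vector<long>>;

int main() {
  std::map<cache_key_t, std::string> cache;
  const void* device = reinterpret_cast<const void*>(0x1);  // hypothetical handle
  cache[{device, {8192, 8192, 8192}}] = "binary-A";
  cache[{device, {128, 128, 128}}] = "binary-B";
  // Same device + same integer args -> cache hit, no recompilation.
  std::printf("%s\n", cache.at({device, {128, 128, 128}}).c_str());
  return 0;
}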
grid, driver::stream* stream); void operator()(const std::vector& args, const grid_fn_ty& grid, driver::stream *stream); + std::string make_tensorflow_src(const std::vector &outputs, const std::string ¯o); private: // execution context diff --git a/lib/runtime/function.cpp b/lib/runtime/function.cpp index fde0c7972..24e66397b 100644 --- a/lib/runtime/function.cpp +++ b/lib/runtime/function.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include "triton/codegen/selection/selection.h" #include "triton/runtime/function.h" @@ -259,6 +260,173 @@ void function::operator()(const std::vector& args, const grid_t& grid, driv return this->operator()(args, [&grid](const params_t&){ return grid; }, stream); } +std::string to_tf_ty(ir::type *ty) { + if(ty->is_integer_ty(1)) + return "bool"; + if(ty->is_integer_ty(8)) + return "int8"; + if(ty->is_integer_ty(16)) + return "int16"; + if(ty->is_integer_ty(32)) + return "int32"; + if(ty->is_integer_ty(64)) + return "int64"; + if(ty->is_half_ty()) + return "float16"; + if(ty->is_float_ty()) + return "float32"; + if(ty->is_double_ty()) + return "float64"; + if(ty->is_pointer_ty()) + return "Tensor"; + throw std::runtime_error("unknown type"); +} + +std::string ref_to_tf_ty(ir::type *ty) { + std::string res = to_tf_ty(ty); + if(ty->is_pointer_ty()) + res = "const " + res + "&"; + return res; +} + + +std::string function::make_tensorflow_src(const std::vector& outputs, const std::string& macro) { + std::unique_ptr ir = make_ir(ast_); + // extract function signature + ir::function* fn = ir->get_function_list().front(); + ir::function_type* fn_ty = fn->get_fn_type(); + // numberof arguments + size_t n_args = fn_ty->get_num_params(); + size_t n_outputs = outputs.size(); + // extract function name + std::string name = fn->get_name(); + std::string classname = name + "Op"; + // extract argument name + std::vector arg_names; + for(ir::argument *arg: fn->args()) + arg_names.push_back(arg->get_name()); + // cached int to str + std::vector str_i; + for(size_t i = 0; i < fn_ty->get_num_params(); i++) + str_i.push_back(std::to_string(i)); + // index of tensors + std::vector ptr_idx; + for(unsigned i = 0; i < fn_ty->get_num_params(); i++) + if(fn_ty->get_param_ty(i)->is_pointer_ty()) + ptr_idx.push_back(i); + // extract tensorflow types + std::vector tf_tys; + std::transform(fn_ty->params_begin(), fn_ty->params_end(), std::back_inserter(tf_tys), to_tf_ty); + std::vector tf_cref_tys; + std::transform(fn_ty->params_begin(), fn_ty->params_end(), std::back_inserter(tf_cref_tys), ref_to_tf_ty); + + std::ostringstream oss; + + std::string result = R"( +#include "triton/driver/buffer.h" +#include "triton/driver/backend.h" +#include "triton/driver/stream.h" +#include "triton/runtime/function.h" + +#define EIGEN_USE_GPU +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/util/tensor_format.h" +#include "tensorflow/core/framework/common_shape_fns.h" + +using namespace tensorflow; +using GPUDevice = Eigen::GpuDevice; +namespace rt = triton::runtime; +namespace drv = triton::driver; + +std::string src = R"TTKERNSRC( )" + src_ + ")TTKERNSRC\";" + R"( + +class )" + classname + R"(: public OpKernel { + public: + explicit )" + classname + R"((OpKernelConstruction* context) + : OpKernel(context), fn_(src) { } + + void Compute(OpKernelContext* context){ + + // get 
device/stream + GPUDevice device = context->eigen_device(); + drv::cu_stream sstream(device.stream(), false); + drv::context* ctx = sstream.context(); + drv::stream* stream = &sstream; + + // extract inputs)"; +for(unsigned i = 0; i < n_args; i++){ + std::string suffix = ""; + std::string ty = tf_cref_tys[i]; + if(!fn_ty->get_param_ty(i)->is_pointer_ty()) + suffix = ".scalar<" + ty + ">()()"; + result += R"( + )" + ty + " " + arg_names[i] + " = context->input(" + str_i[i] + ")" + suffix + ";"; +} + +result += R"( + + // extract outputs)"; +for(unsigned i = 0; i < n_outputs; i++) + result += R"( + context->set_output()" + str_i[i] + ", " + arg_names[outputs[i]] + ");"; + +result += R"( + + // wrap tensors)"; +for(size_t i: ptr_idx) +result += R"( + drv::cu_buffer cu_)" + arg_names[i] + "(ctx, " + arg_names[i] + ".tensor_data().size(), (CUdeviceptr)" + arg_names[i] + R"(.tensor_data().data(), false);)"; + + +std::regex regex("#([a-zA-Z]([a-zA-Z]|[0-9])*)"); +std::string grid_str = std::regex_replace(macro, regex, "x.at(\"$1\")"); + +result += R"( + + // create launch grid; + auto grid = [&](const rt::params_t& x) { return rt::grid_t{)" + grid_str + R"(}; };)"; + +result += R"( + + // execute function + fn_({ + )"; +for(unsigned i = 0; i < n_args; i++){ + std::string arg = arg_names[i]; + if(fn_ty->get_param_ty(i)->is_pointer_ty()) + arg = "&cu_" + arg; + if(i > 0) + result += ", "; + result += arg; +} +result += R"( + }, grid, stream); + + } + +private: + rt::function fn_; +}; + +REGISTER_KERNEL_BUILDER(Name(")" + name + "\").Device(DEVICE_GPU), " + classname + R"(); + +REGISTER_OP(")" + name + "\")\n"; +for(size_t i = 0; i < tf_tys.size(); i++){ + bool is_output = std::find(outputs.begin(), outputs.end(), i) != outputs.end(); + std::string mode = is_output ? "Output" : "Input" ; + result += " ." 
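The grid macro passed to make_tensorflow_src spells parameters as #NAME, and the std::regex above rewrites each occurrence into x.at("NAME") so the generated op queries the tuned parameter map at launch time. The rewrite, runnable in isolation with the macro string from dot.cpp:

// The #NAME -> x.at("NAME") rewrite used when generating the TensorFlow op.
#include <cstdio>
#include <regex>
#include <string>

int main() {
  std::string macro = "(M + #TM - 1)/#TM, (N + #TN - 1)/#TN, 1";
  std::regex re("#([a-zA-Z]([a-zA-Z]|[0-9])*)");
  std::string grid_str = std::regex_replace(macro, re, "x.at(\"$1\")");
  std::printf("%s\n", grid_str.c_str());
  // (M + x.at("TM") - 1)/x.at("TM"), (N + x.at("TN") - 1)/x.at("TN"), 1
  return 0;
}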
+ mode + "(\"" + arg_names[i] + ": " + tf_tys[i] + "\")\n"; +} +result += ";\n"; + + + return result; +} + + } From 4de22df9306b74cc19ebcf002e4589fccdfc783d Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 15 Aug 2019 20:50:10 -0700 Subject: [PATCH 296/494] [python] added skeleton for python interface --- CMakeLists.txt | 27 +- examples/CMakeLists.txt | 1 - examples/python/CMakeLists.txt | 2 - examples/python/pytorch/CMakeLists.txt | 10 - examples/python/pytorch/batchnorm.cpp | 90 - examples/python/pytorch/conv.cpp | 148 -- examples/python/pytorch/run.py | 183 -- examples/python/pytorch/shift.cpp | 165 -- examples/python/pytorch/test.py | 30 - examples/python/pytorch/triton.py | 221 -- examples/python/tensorflow/CMakeLists.txt | 13 - examples/python/tensorflow/batchnorm.cpp | 157 -- examples/python/tensorflow/blocksparse.cpp | 304 --- examples/python/tensorflow/conv.cpp | 82 - examples/python/tensorflow/dot.cpp | 64 - examples/python/tensorflow/run.py | 136 -- examples/python/tensorflow/shift.cpp | 167 -- include/triton/runtime/arg.h | 4 +- lib/runtime/function.cpp | 170 +- python/dist/triton-0.1-py3.6-linux-x86_64.egg | Bin 0 -> 709047 bytes python/examples/dot.py | 42 + python/setup.py | 76 + python/src/pybind11/attr.h | 493 ++++ python/src/pybind11/buffer_info.h | 108 + python/src/pybind11/cast.h | 2128 ++++++++++++++++ python/src/pybind11/chrono.h | 162 ++ python/src/pybind11/common.h | 2 + python/src/pybind11/complex.h | 65 + python/src/pybind11/detail/class.h | 623 +++++ python/src/pybind11/detail/common.h | 807 ++++++ python/src/pybind11/detail/descr.h | 100 + python/src/pybind11/detail/init.h | 335 +++ python/src/pybind11/detail/internals.h | 291 +++ python/src/pybind11/detail/typeid.h | 55 + python/src/pybind11/eigen.h | 607 +++++ python/src/pybind11/embed.h | 200 ++ python/src/pybind11/eval.h | 117 + python/src/pybind11/functional.h | 94 + python/src/pybind11/iostream.h | 207 ++ python/src/pybind11/numpy.h | 1610 ++++++++++++ python/src/pybind11/operators.h | 168 ++ python/src/pybind11/options.h | 65 + python/src/pybind11/pybind11.h | 2162 +++++++++++++++++ python/src/pybind11/pytypes.h | 1471 +++++++++++ python/src/pybind11/stl.h | 386 +++ python/src/pybind11/stl_bind.h | 630 +++++ python/src/tensorflow.cpp | 224 ++ 47 files changed, 13251 insertions(+), 1951 deletions(-) delete mode 100644 examples/python/CMakeLists.txt delete mode 100644 examples/python/pytorch/CMakeLists.txt delete mode 100644 examples/python/pytorch/batchnorm.cpp delete mode 100644 examples/python/pytorch/conv.cpp delete mode 100644 examples/python/pytorch/run.py delete mode 100644 examples/python/pytorch/shift.cpp delete mode 100644 examples/python/pytorch/test.py delete mode 100644 examples/python/pytorch/triton.py delete mode 100644 examples/python/tensorflow/CMakeLists.txt delete mode 100644 examples/python/tensorflow/batchnorm.cpp delete mode 100644 examples/python/tensorflow/blocksparse.cpp delete mode 100644 examples/python/tensorflow/conv.cpp delete mode 100644 examples/python/tensorflow/dot.cpp delete mode 100644 examples/python/tensorflow/run.py delete mode 100644 examples/python/tensorflow/shift.cpp create mode 100644 python/dist/triton-0.1-py3.6-linux-x86_64.egg create mode 100644 python/examples/dot.py create mode 100644 python/setup.py create mode 100644 python/src/pybind11/attr.h create mode 100644 python/src/pybind11/buffer_info.h create mode 100644 python/src/pybind11/cast.h create mode 100644 python/src/pybind11/chrono.h create mode 100644 python/src/pybind11/common.h create mode 
100644 python/src/pybind11/complex.h create mode 100644 python/src/pybind11/detail/class.h create mode 100644 python/src/pybind11/detail/common.h create mode 100644 python/src/pybind11/detail/descr.h create mode 100644 python/src/pybind11/detail/init.h create mode 100644 python/src/pybind11/detail/internals.h create mode 100644 python/src/pybind11/detail/typeid.h create mode 100644 python/src/pybind11/eigen.h create mode 100644 python/src/pybind11/embed.h create mode 100644 python/src/pybind11/eval.h create mode 100644 python/src/pybind11/functional.h create mode 100644 python/src/pybind11/iostream.h create mode 100644 python/src/pybind11/numpy.h create mode 100644 python/src/pybind11/operators.h create mode 100644 python/src/pybind11/options.h create mode 100644 python/src/pybind11/pybind11.h create mode 100644 python/src/pybind11/pytypes.h create mode 100644 python/src/pybind11/stl.h create mode 100644 python/src/pybind11/stl_bind.h create mode 100644 python/src/tensorflow.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 2bece7b6f..5b252c520 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,10 @@ project(triton) include(CTest) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") +# Options +option(BUILD_EXAMPLES "Build C++ Triton examples" ON) +option(BUILD_PYTHON_MODULE "Build Python Triton bindings" OFF) + # FLEX/YACC find_package(BISON) find_package(FLEX) @@ -35,15 +39,24 @@ add_custom_target( ALL SOURCES ${ALL_SRC} ) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${LLVM_CXXFLAGS} -std=c++11") +# Examples +if(BUILD_EXAMPLES) + message(STATUS "Adding C++ examples") + add_subdirectory(examples) +endif() + +# Python module +if(BUILD_PYTHON_MODULE) + message(STATUS "Adding Python module") + file(GLOB_RECURSE PYTHON_SRC python/src/*.cpp) + include_directories(python/src/ ${PYTHON_INCLUDE_DIRS}) + set(PYTHON_LIBS ) +endif() + + # Triton file(GLOB_RECURSE LIBTRITON_SRC lib/*.cpp) -add_library(triton SHARED ${LIBTRITON_SRC} ${BISON_Parser_OUTPUTS} ${FLEX_Lexer_OUTPUTS}) +add_library(triton SHARED ${LIBTRITON_SRC} ${PYTHON_SRC} ${BISON_Parser_OUTPUTS} ${FLEX_Lexer_OUTPUTS}) target_link_libraries(triton LLVM) -# Examples -add_subdirectory(examples) - - - - diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 8277f0611..2322a85f7 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,2 +1 @@ add_subdirectory(cpp) -add_subdirectory(python) diff --git a/examples/python/CMakeLists.txt b/examples/python/CMakeLists.txt deleted file mode 100644 index a73011f48..000000000 --- a/examples/python/CMakeLists.txt +++ /dev/null @@ -1,2 +0,0 @@ -add_subdirectory(tensorflow) -add_subdirectory(pytorch) diff --git a/examples/python/pytorch/CMakeLists.txt b/examples/python/pytorch/CMakeLists.txt deleted file mode 100644 index f4b4df758..000000000 --- a/examples/python/pytorch/CMakeLists.txt +++ /dev/null @@ -1,10 +0,0 @@ -find_package(Torch) -if(${TORCH_FOUND}) - set(CUDA_HOME "/usr/local/cuda") - include_directories(${TORCH_INCLUDE_DIRS}) - include_directories("${CUDA_HOME}/include") - link_directories(${TORCH_LIBRARY_DIRS}) - add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1) - add_library(torch_triton SHARED conv.cpp shift.cpp batchnorm.cpp) - target_link_libraries(torch_triton torch triton) -endif() diff --git a/examples/python/pytorch/batchnorm.cpp b/examples/python/pytorch/batchnorm.cpp deleted file mode 100644 index 64559e197..000000000 --- a/examples/python/pytorch/batchnorm.cpp +++ /dev/null @@ 
-1,90 +0,0 @@ -#include -#include -#include "ATen/cuda/CUDAContext.h" -#include "triton/driver/stream.h" -#include "triton/dnn/batchnorm.h" - -#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x " must be contiguous") -#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) - -std::vector - batchnorm_ymv(const torch::Tensor fw_x, - const torch::Tensor fw_g, - const torch::Tensor fw_b, - double eps) { - CHECK_INPUT(fw_x); - CHECK_INPUT(fw_g); - CHECK_INPUT(fw_b); - // Wrap CUDA handles - c10::DeviceIndex device = fw_x.storage().device().index(); - CUstream custream = (CUstream)at::cuda::getCurrentCUDAStream(device).stream(); - triton::driver::cu_stream stream(custream, false); - triton::driver::context* ctx = stream.context(); - // get sizes - int C = fw_x.size(0); - int H = fw_x.size(1); - int W = fw_x.size(2); - int B = fw_x.size(3); - // allocate outputs - torch::Tensor fw_y = torch::empty(fw_x.sizes()).cuda(); - torch::Tensor fw_m = torch::empty(fw_g.sizes()).cuda(); - torch::Tensor fw_v = torch::empty(fw_g.sizes()).cuda(); - triton::driver::cu_buffer x(ctx, (CUdeviceptr)fw_x.storage().data(), false); - triton::driver::cu_buffer g(ctx, (CUdeviceptr)fw_g.storage().data(), false); - triton::driver::cu_buffer b(ctx, (CUdeviceptr)fw_b.storage().data(), false); - triton::driver::cu_buffer y(ctx, (CUdeviceptr)fw_y.storage().data(), false); - triton::driver::cu_buffer m(ctx, (CUdeviceptr)fw_m.storage().data(), false); - triton::driver::cu_buffer v(ctx, (CUdeviceptr)fw_v.storage().data(), false); - // create template - triton::dnn::batchnorm_forward batchnorm(C, 1, H, W, B, "float"); - batchnorm.enqueue(&stream, {&y, &m, &v, &x, &g, &b}); - stream.synchronize(); - return {fw_y, fw_m, fw_v}; -} - -std::vector - batchnorm_dxdgdb(const torch::Tensor fw_dy, - const torch::Tensor fw_x, - const torch::Tensor fw_g, - const torch::Tensor fw_m, - const torch::Tensor fw_v, - double eps) { - CHECK_INPUT(fw_dy); - CHECK_INPUT(fw_x); - CHECK_INPUT(fw_g); - CHECK_INPUT(fw_m); - CHECK_INPUT(fw_v); - // Wrap CUDA handles - c10::DeviceIndex device = fw_x.storage().device().index(); - CUstream custream = (CUstream)at::cuda::getCurrentCUDAStream(device).stream(); - triton::driver::cu_stream stream(custream, false); - triton::driver::context* ctx = stream.context(); - // get sizes - int C = fw_x.size(0); - int H = fw_x.size(1); - int W = fw_x.size(2); - int B = fw_x.size(3); - // allocate outputs - torch::Tensor fw_dx = torch::empty(fw_x.sizes()).cuda(); - torch::Tensor fw_dg = torch::empty(fw_g.sizes()).cuda(); - torch::Tensor fw_db = torch::empty(fw_g.sizes()).cuda(); - // triton handles - triton::driver::cu_buffer dy(ctx, (CUdeviceptr)fw_dy.storage().data(), false); - triton::driver::cu_buffer x(ctx, (CUdeviceptr) fw_x.storage().data(), false); - triton::driver::cu_buffer g(ctx, (CUdeviceptr) fw_g.storage().data(), false); - triton::driver::cu_buffer m(ctx, (CUdeviceptr) fw_m.storage().data(), false); - triton::driver::cu_buffer v(ctx, (CUdeviceptr) fw_v.storage().data(), false); - triton::driver::cu_buffer dx(ctx, (CUdeviceptr)fw_dx.storage().data(), false); - triton::driver::cu_buffer dg(ctx, (CUdeviceptr)fw_dg.storage().data(), false); - triton::driver::cu_buffer db(ctx, (CUdeviceptr)fw_db.storage().data(), false); - // create config - triton::dnn::batchnorm_backward batchnorm(C, 1, H, W, B, "float", eps); - batchnorm.enqueue(&stream, {&dx, &dg, &db, &dy, &x, &g, &m, &v}); - stream.synchronize(); - return {fw_dx, fw_dg, 
fw_db}; -} - -static auto registry = - torch::jit::RegisterOperators("triton::batchnorm_ymv", &batchnorm_ymv) - .op("triton::batchnorm_dxdgdb", &batchnorm_dxdgdb); diff --git a/examples/python/pytorch/conv.cpp b/examples/python/pytorch/conv.cpp deleted file mode 100644 index 91cef5441..000000000 --- a/examples/python/pytorch/conv.cpp +++ /dev/null @@ -1,148 +0,0 @@ -#include -#include -#include -#include "ATen/cuda/CUDAContext.h" -#include "triton/driver/stream.h" -#include "triton/dnn/conv.h" - -#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x " must be contiguous") -#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) - -torch::Tensor conv_common( - int32_t B, int32_t C, int32_t D, int32_t H, int32_t W, - int32_t T, int32_t R, int32_t S, int32_t NF, - int32_t stride_d, int32_t stride_h, int32_t stride_w, - int32_t pad_d, int32_t pad_h, int32_t pad_w, - triton::dnn::conv::type ty, - torch::Tensor torcha, torch::Tensor torchb, torch::Tensor torchbias, - bool autotune = false - ) { - // Wrap CUDA handles - c10::DeviceIndex device = torcha.storage().device().index(); - // Get stream - CUstream custream = (CUstream)at::cuda::getCurrentCUDAStream(device).stream(); - triton::driver::cu_stream stream(custream, false); - triton::driver::context* ctx = stream.context(); - // Get template - bool has_bias = torchbias.storage().size() > 0; - triton::dnn::conv conv(B, C, D, H, W, T, R, S, NF, - stride_d, stride_h, stride_w, - pad_d, pad_h, pad_w, - 1, 1, 1, - "float", "float", ty, has_bias); - // Bind memory - triton::driver::cu_buffer a(ctx, (CUdeviceptr)torcha.storage().data(), false); - triton::driver::cu_buffer b(ctx, (CUdeviceptr)torchb.storage().data(), false); - triton::driver::cu_buffer cubias(ctx, (CUdeviceptr)torchbias.storage().data(), false); - triton::driver::buffer* bias = has_bias ? 
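Every deleted PyTorch binding above follows the same pattern: fetch ATen's current CUDA stream, wrap it and each tensor's device pointer in non-owning Triton driver objects, enqueue, synchronize. A schematic of the non-owning wrapper idea only (plain C++, no real CUDA or ATen calls; cu_buffer here is a stand-in, not the Triton class):

// Schematic of the non-owning wrapper pattern in the deleted bindings: the
// framework owns the memory; the binding only passes the raw handle through.
#include <cstdint>
#include <cstdio>

using CUdeviceptr = std::uintptr_t;  // stand-in for the CUDA driver typedef

struct cu_buffer {
  // take_ownership=false mirrors triton::driver::cu_buffer(ctx, ptr, false):
  // the destructor must not free framework-owned memory.
  cu_buffer(CUdeviceptr ptr, bool take_ownership)
      : ptr_(ptr), owns_(take_ownership) {}
  ~cu_buffer() { if (owns_) { /* would free ptr_ here */ } }
  CUdeviceptr ptr_;
  bool owns_;
};

int main() {
  CUdeviceptr tensor_data = 0xdeadbeef;  // hypothetical torch storage pointer
  cu_buffer x(tensor_data, false);       // wrap without taking ownership
  std::printf("wrapped 0x%zx (owns=%d)\n", (size_t)x.ptr_, (int)x.owns_);
  return 0;
}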
&cubias : nullptr; - // Allocate output - std::vector c_shapes = conv.c_shapes(); - torch::Tensor torchc; - if(ty == triton::dnn::conv::WGRAD) - torchc = torch::empty({c_shapes[0], c_shapes[2], c_shapes[3], c_shapes[4]}, torch::kFloat).cuda(); - else - torchc = torch::empty({c_shapes[0], c_shapes[1], c_shapes[3], c_shapes[4]}, torch::kFloat).cuda(); - triton::driver::cu_buffer c(ctx, (CUdeviceptr)torchc.storage().data(), false); - // Enqueue - conv.enqueue(&stream, {&a, &b, &c, bias}); - return torchc; -} - -torch::Tensor conv_fprop( - const torch::Tensor data, - const torch::Tensor weight, - const torch::Tensor bias, - int64_t stride_h, int64_t stride_w, - int64_t pad_h, int64_t pad_w) { - // Check - CHECK_INPUT(data); - CHECK_INPUT(weight); - // Unpack data shapes - const int32_t B = data.size(0); - const int32_t Ci = data.size(1); - const int32_t D = 1; - const int32_t H = data.size(2); - const int32_t W = data.size(3); - // Unpack weight shapes - const int32_t Cf = weight.size(0); - const int32_t T = 1; - const int32_t R = weight.size(1); - const int32_t S = weight.size(2); - const int32_t NF = weight.size(3); - // Configuration - const int32_t stride_d = 1; - const int32_t pad_d = 0; - // Check - AT_CHECK(Ci == Cf, "Number of channels in data and weights must match"); - return conv_common(B, Ci, D, H, W, T, R, S, NF, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, triton::dnn::conv::FPROP, data, weight, bias); -} - -torch::Tensor conv_bprop( - const torch::Tensor derror, - const torch::Tensor weight, - const torch::Tensor bias, - int64_t H, int64_t W, - int64_t stride_h, int64_t stride_w, - int64_t pad_h, int64_t pad_w){ - // Check - CHECK_INPUT(derror); - CHECK_INPUT(weight); - // Unpack data shapes - const int32_t B = derror.size(0); - const int32_t Ki = derror.size(1); - const int32_t M = 1; - const int32_t P = derror.size(2); - const int32_t Q = derror.size(3); - // Unpack weight shapes - const int32_t C = weight.size(0); - const int32_t T = 1; - const int32_t R = weight.size(1); - const int32_t S = weight.size(2); - const int32_t Kw = weight.size(3); - // Compute M, P, Q - const int32_t stride_d = 1; - int32_t pad_d = 0; - int32_t D = 1; - // Check - AT_CHECK(Ki == Kw, "Number of channels in error and weights must match"); - return conv_common(B, C, D, H, W, T, R, S, Kw, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, triton::dnn::conv::BPROP, derror, weight, bias); -} - -torch::Tensor conv_wgrad( - const torch::Tensor data, - const torch::Tensor derror, - const torch::Tensor bias, - int64_t R, int64_t S, - int64_t stride_h, int64_t stride_w, - int64_t pad_h, int64_t pad_w - ){ - // Check - CHECK_INPUT(data); - CHECK_INPUT(derror); - // Unpack data shapes - const int32_t Ba = data.size(0); - const int32_t C = data.size(1); - const int32_t D = 1; - const int32_t H = data.size(2); - const int32_t W = data.size(3); - // Unpack error shapes - const int32_t Bb = derror.size(0); - const int32_t K = derror.size(1); - const int32_t M = 1; - const int32_t P = derror.size(2); - const int32_t Q = derror.size(3); - // Compute M, P, Q - const int32_t upsample_d = 1, upsample_h = 1, upsample_w = 1; - const int32_t stride_d = 1; - const int32_t pad_d = 0; - const int32_t T = 1; - // Check - AT_CHECK(Ba == Bb, "Number of channels in error and weights must match"); - return conv_common(Ba, C, D, H, W, T, R, S, K, stride_d, stride_h, stride_w, pad_d, pad_h, pad_w, triton::dnn::conv::WGRAD, data, derror, bias); -} - -static auto registry = - torch::jit::RegisterOperators("triton::conv_fprop", 
&conv_fprop) - .op("triton::conv_bprop", &conv_bprop) - .op("triton::conv_wgrad", &conv_wgrad); diff --git a/examples/python/pytorch/run.py b/examples/python/pytorch/run.py deleted file mode 100644 index e7c10112c..000000000 --- a/examples/python/pytorch/run.py +++ /dev/null @@ -1,183 +0,0 @@ -from __future__ import print_function -import argparse -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.optim as optim -from torchvision import datasets, transforms -import triton -from torch.utils.cpp_extension import load -from torch.distributions import categorical - -shift_cuda = load( - 'shift_cuda', ['/home/philippe/development/shiftnet/kernels/shift_cuda.cpp', - '/home/philippe/development/shiftnet/kernels/shift_cuda_kernel.cu'], extra_cflags=['-O3']) - -class shift(torch.autograd.Function): - @staticmethod - def forward(ctx, x, shift): - ctx.save_for_backward(shift) - return shift_cuda.forward(x, shift) - - @staticmethod - def backward(ctx, grad_output): - shift, = ctx.saved_tensors - grad_output = shift_cuda.backward(grad_output, shift) - - return grad_output, None - - -class Shift(nn.Module): - def __init__(self, in_channels, kernel_size): - super(Shift, self).__init__() - self.channels = in_channels - self.kernel_size = kernel_size - if kernel_size == 3: - p = torch.Tensor([0.3, 0.4, 0.3]) - elif kernel_size == 5: - p = torch.Tensor([0.1, 0.25, 0.3, 0.25, 0.1]) - elif kernel_size == 7: - p = torch.Tensor([0.075, 0.1, 0.175, 0.3, 0.175, 0.1, 0.075]) - elif kernel_size == 9: - p = torch.Tensor([0.05, 0.075, 0.1, 0.175, 0.2, 0.175, 0.1, 0.075, 0.05]) - else: - raise RuntimeError('Unsupported kernel size') - shift_t = categorical.Categorical(p).sample((in_channels, 2)) - (kernel_size // 2) - self.register_buffer('shift_t', shift_t.int()) - - def forward(self, x): - if x.is_cuda: - return shift.apply(x, self.shift_t) - else: - print('Shift only supports GPU for now..') - assert False - - def extra_repr(self): - s = ('{channels}, kernel_size={kernel_size}') - return s.format(**self.__dict__) - - -def ShiftConv2d(in_planes, out_planes, kernel_size=3, stride=1, groups=1, dilation=1): - return nn.Sequential( - Shift(in_planes, kernel_size), - nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, - padding=0, groups=groups, bias=False) - ) - - -class Net(nn.Module): - def __init__(self): - super(Net, self).__init__() - self.conv1 = ShiftConv2d(1, 32, 3, 1) - self.conv2 = ShiftConv2d(32, 128, 3, 1) - self.conv3 = ShiftConv2d(128, 128, 3, 2) - self.bn1 = nn.BatchNorm2d(128) - self.conv4 = ShiftConv2d(128, 256, 3, 2) - self.bn2 = nn.BatchNorm2d(256) - self.fc1 = nn.Linear(256*7*7, 500) - self.fc2 = nn.Linear(500, 10) - - def forward(self, x): - x = self.conv1(x) - x = self.conv2(x) - x = self.conv3(x) - x = self.bn1(x) - x = F.relu(x) - x = self.conv4(x) - x = self.bn2(x) - x = F.relu(x) - x = x.view(-1, 256*7*7) - x = F.relu(self.fc1(x)) - x = self.fc2(x) - return F.log_softmax(x, dim=1) - -Net = Net() - -def train(args, model, device, train_loader, optimizer, epoch): - model.train() - for batch_idx, (data, target) in enumerate(train_loader): - data, target = data.to(device), target.to(device) - optimizer.zero_grad() - output = model(data) - loss = F.nll_loss(output, target) - loss.backward() - optimizer.step() - if batch_idx % args.log_interval == 0: - print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( - epoch, batch_idx * len(data), len(train_loader.dataset), - 100. 
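The Shift module above draws one (row, column) offset per channel, once, at construction time; the categorical tables put most of their mass at zero, so the majority of channels shift by at most one pixel. A minimal sketch of that sampling, for the kernel_size == 3 case:

    import torch
    from torch.distributions import categorical

    def sample_shifts(in_channels, kernel_size=3):
        p = torch.Tensor([0.3, 0.4, 0.3])   # mass on offsets -1, 0, +1
        offsets = categorical.Categorical(p).sample((in_channels, 2))
        return (offsets - kernel_size // 2).int()   # shape (in_channels, 2)

    print(sample_shifts(8))   # per-channel (dy, dx) offsets in {-1, 0, 1}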
* batch_idx / len(train_loader), loss.item())) - -def test(args, model, device, test_loader): - model.eval() - test_loss = 0 - correct = 0 - with torch.no_grad(): - for data, target in test_loader: - data, target = data.to(device), target.to(device) - output = model(data) - test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss - pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability - correct += pred.eq(target.view_as(pred)).sum().item() - - test_loss /= len(test_loader.dataset) - - print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( - test_loss, correct, len(test_loader.dataset), - 100. * correct / len(test_loader.dataset))) - -def main(): - # Training settings - parser = argparse.ArgumentParser(description='PyTorch MNIST Example') - parser.add_argument('--batch-size', type=int, default=64, metavar='N', - help='input batch size for training (default: 64)') - parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', - help='input batch size for testing (default: 1000)') - parser.add_argument('--epochs', type=int, default=10, metavar='N', - help='number of epochs to train (default: 10)') - parser.add_argument('--lr', type=float, default=0.01, metavar='LR', - help='learning rate (default: 0.01)') - parser.add_argument('--momentum', type=float, default=0.5, metavar='M', - help='SGD momentum (default: 0.5)') - parser.add_argument('--no-cuda', action='store_true', default=False, - help='disables CUDA training') - parser.add_argument('--seed', type=int, default=1, metavar='S', - help='random seed (default: 1)') - parser.add_argument('--log-interval', type=int, default=10, metavar='N', - help='how many batches to wait before logging training status') - - parser.add_argument('--save-model', action='store_true', default=False, - help='For Saving the current Model') - args = parser.parse_args() - use_cuda = not args.no_cuda and torch.cuda.is_available() - - torch.manual_seed(args.seed) - - device = torch.device("cuda" if use_cuda else "cpu") - - kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} - train_loader = torch.utils.data.DataLoader( - datasets.MNIST('../data', train=True, download=True, - transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])), - batch_size=args.batch_size, shuffle=True, **kwargs) - test_loader = torch.utils.data.DataLoader( - datasets.MNIST('../data', train=False, transform=transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,)) - ])), - batch_size=args.test_batch_size, shuffle=True, **kwargs) - - - model = Net.to(device) - optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) - - for epoch in range(1, args.epochs + 1): - train(args, model, device, train_loader, optimizer, epoch) - test(args, model, device, test_loader) - - if (args.save_model): - torch.save(model.state_dict(),"mnist_cnn.pt") - -main() diff --git a/examples/python/pytorch/shift.cpp b/examples/python/pytorch/shift.cpp deleted file mode 100644 index bd80d73d9..000000000 --- a/examples/python/pytorch/shift.cpp +++ /dev/null @@ -1,165 +0,0 @@ -#include -#include -#include -#include "ATen/cuda/CUDAContext.h" -#include "triton/driver/stream.h" -#include "triton/dnn/shift.h" - -#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) AT_CHECK(x.is_contiguous(), #x " must be contiguous") -#define CHECK_INPUT(x) CHECK_CUDA(x); 
CHECK_CONTIGUOUS(x) - -void extract_shapes(const torch::Tensor &x, - int64_t &C, int64_t &H, int64_t &W, int64_t &B, - triton::dnn::layout_t layout) { - if(layout == triton::dnn::CHWN){ - C = x.size(0); - H = x.size(1); - W = x.size(2); - B = x.size(3); - } - else if(layout == triton::dnn::NCHW){ - B = x.size(0); - C = x.size(1); - H = x.size(2); - W = x.size(3); - } - else{ - throw std::runtime_error("unsupported layout"); - } -} - -static const triton::dnn::layout_t layout = triton::dnn::NCHW; - -torch::Tensor shift_common( - int32_t B, int32_t C, int32_t D, int32_t H, int32_t W, - int32_t T, int32_t R, int32_t S, int32_t F, - int32_t stride_h, int32_t stride_w, - int32_t* shift_h, int32_t* shift_w, - triton::dnn::op_t op, triton::dnn::layout_t layout, - torch::Tensor torcha, torch::Tensor torchb, torch::Tensor torchbias, - bool autotune = false - ) { - // Wrap CUDA handles - c10::DeviceIndex device = torcha.storage().device().index(); - CUstream custream = (CUstream)at::cuda::getCurrentCUDAStream(device).stream(); - triton::driver::cu_stream stream(custream, false); - triton::driver::context* ctx = stream.context(); - // Data-type - std::string dtype; - at::ScalarType type = torcha.scalar_type(); - switch(type){ - case at::ScalarType::Double: dtype = "double"; break; - case at::ScalarType::Float: dtype = "float"; break; - case at::ScalarType::Half: dtype = "half"; break; - default: AT_ERROR("unknown data-type for shift-conv"); - } - // Get configuration - bool has_bias = torchbias.storage().size() > 0; - triton::dnn::shift shift(B, C, D, H, W, T, R, S, F, - stride_h, stride_w, - shift_h, shift_w, dtype, dtype, - op, has_bias, layout); - // Bind memory - triton::driver::cu_buffer a(ctx, (CUdeviceptr)torcha.storage().data(), false); - triton::driver::cu_buffer b(ctx, (CUdeviceptr)torchb.storage().data(), false); - triton::driver::cu_buffer cubias(ctx, (CUdeviceptr)torchbias.storage().data(), false); - triton::driver::buffer* bias = has_bias ? 
&cubias : nullptr; - // Allocate output - std::vector _c_shapes = shift.c_shapes(); - std::vector c_shapes; - for(auto x: _c_shapes) - c_shapes.push_back(x); - torch::Tensor torchc = torch::empty(c_shapes, type).cuda(); - - - triton::driver::cu_buffer c(ctx, (CUdeviceptr)torchc.storage().data(), false); - std::cout << B << ", " << C << ", " << H << ", " << W << ", " << T << ", " << R << ", " << S << ", " << F << ", " << stride_h << ", " << stride_w << ", " << op << ", " << layout << std::endl; - // Enqueue - shift.enqueue(&stream, {&a, &b, &c}, triton::dnn::NO_TUNING); - return torchc; -} - -torch::Tensor shift_y( - const torch::Tensor x, - const torch::Tensor w, - const torch::Tensor bias, - int64_t R, int64_t S, - int64_t stride_h, int64_t stride_w, - const torch::Tensor shift_h, const torch::Tensor shift_w) { - CHECK_INPUT(x); - CHECK_INPUT(w); - // shapes for a - int64_t Ca, H, W, B; - extract_shapes(x, Ca, H, W, B, layout); - // shapes for b - int64_t Cb = w.size(0); - int64_t F = w.size(1); - AT_CHECK(Ca == Cb, "operands must have the same number of channels"); - int64_t C = Ca; - // run - return shift_common(B, C, 1, H, W, 1, R, S, F, stride_h, stride_w, - (int32_t*)shift_h.storage().data(), (int32_t*)shift_w.storage().data(), - triton::dnn::FPROP, layout, x, w, bias); -} - -torch::Tensor shift_dx( - const torch::Tensor dy, - const torch::Tensor w, - const torch::Tensor bias, - int64_t R, int64_t S, - int64_t stride_h, int64_t stride_w, - const torch::Tensor shift_h, const torch::Tensor shift_w) { - CHECK_INPUT(dy); - CHECK_INPUT(w); - // shapes for a - int64_t Ca, H, W, B; - extract_shapes(dy, Ca, H, W, B, layout); - H *= stride_h; - W *= stride_w; - // shapes for b - int64_t Cb = w.size(0); - int64_t F = w.size(1); - std::swap(Cb, F); - // checks - AT_CHECK(Ca == Cb, "operands must have the same number of channels"); - int64_t C = Ca; - std::swap(C, F); - // run - return shift_common(B, C, 1, H, W, 1, R, S, F, stride_h, stride_w, - (int32_t*)shift_h.storage().data(), (int32_t*)shift_w.storage().data(), - triton::dnn::BPROP, layout, dy, w, bias); -} - -torch::Tensor shift_dw( - const torch::Tensor dy, - const torch::Tensor x, - const torch::Tensor bias, - int64_t R, int64_t S, - int64_t stride_h, int64_t stride_w, - const torch::Tensor shift_h, const torch::Tensor shift_w) { - CHECK_INPUT(dy); - CHECK_INPUT(x); - // shapes for a - int64_t F, Ha, Wa, Ba; - extract_shapes(dy, F, Ha, Wa, Ba, layout); - // shapes for b - int64_t C, Hb, Wb, Bb; - extract_shapes(x, C, Hb, Wb, Bb, layout); - // check - AT_CHECK(Ha*stride_h == Hb, "operands must have the same image height"); - AT_CHECK(Wa*stride_w == Wb, "operands must have the same image width"); - AT_CHECK(Ba == Bb, "operands must have the same batch size"); - int64_t H = Hb; - int64_t W = Wb; - int64_t B = Bb; - // run - return shift_common(B, C, 1, H, W, 1, R, S, F, stride_h, stride_w, - (int32_t*)shift_h.storage().data(), (int32_t*)shift_w.storage().data(), - triton::dnn::WGRAD, layout, dy, x, bias); -} - -static auto registry = - torch::jit::RegisterOperators("triton::shift_conv_y", &shift_y) - .op("triton::shift_conv_dx", &shift_dx) - .op("triton::shift_conv_dw", &shift_dw); diff --git a/examples/python/pytorch/test.py b/examples/python/pytorch/test.py deleted file mode 100644 index 4c80fd187..000000000 --- a/examples/python/pytorch/test.py +++ /dev/null @@ -1,30 +0,0 @@ -import torch -import triton - -torch.manual_seed(0) -torch.set_printoptions(precision=4) - -x = torch.autograd.Variable(torch.randn(64, 3, 8, 8).cuda(), 
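Note the shape bookkeeping in shift_dx above: the strided output height and width are multiplied back up to the input resolution, and the channel roles of the (C, F) weight matrix are swapped before the kernel runs. A shape-only sketch of that logic, assuming the NCHW layout used here and made-up sizes:

    def shift_dx_shapes(dy_shape, w_shape, stride_h, stride_w):
        B, F, Hs, Ws = dy_shape   # dy comes out of the strided forward pass
        C, Fw = w_shape           # forward weight is (C, F)
        assert F == Fw, "operands must have the same number of channels"
        return (B, C, Hs * stride_h, Ws * stride_w)   # dx at full input resolution

    print(shift_dx_shapes((64, 16, 4, 4), (8, 16), 2, 2))   # -> (64, 8, 8, 8)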
requires_grad=True) -bias = torch.autograd.Variable(torch.randn(6).cuda(), requires_grad=True) -w = torch.autograd.Variable(torch.randn(3, 3, 3, 6).cuda(), requires_grad=True) -cuw = torch.autograd.Variable(w.permute(3,0,1,2).cuda(), requires_grad=True) -y_target = torch.autograd.Variable(torch.randn(64, 6, 8, 8).cuda(), requires_grad=True) - -def run(x, w, conv): - y = conv(x, w) - loss = (y - y_target).norm(2) - loss.backward() - return loss, y.clone(), x.grad.clone(), w.grad.clone(), bias.grad.clone() - -ttyloss, tty, ttdx, ttdw, ttbias = run(x, w, lambda x, w: triton.ConvFunction.apply(x, w, bias, (1,1), (1,1))) -x.grad.zero_() -w.grad.zero_() -bias.grad.zero_() -culoss, cuy, cudx, cudw, cubias = run(x, cuw, lambda x, w: torch.nn.functional.conv2d(x, w, bias=bias, stride=1, padding=1)) - -print(ttdx[0,0,:,:]) -print(cudx[0,0,:,:]) -print((tty - cuy).norm(2)) -print((ttdx - cudx).norm(2)) -print((ttdw.permute(3,0,1,2) - cudw).norm(2)) -print((ttbias - cubias).norm(2)) diff --git a/examples/python/pytorch/triton.py b/examples/python/pytorch/triton.py deleted file mode 100644 index 2d78e58f7..000000000 --- a/examples/python/pytorch/triton.py +++ /dev/null @@ -1,221 +0,0 @@ -import torch -import math -import numpy as np -from torch.nn.modules.utils import _single, _pair, _triple -from torch.distributions import categorical - -torch.ops.load_library("/home/philippe/development/triton/build/examples/python/pytorch/libtorch_triton.so") - -################################# -####### Convolutions ########## -################################# - -class ConvFunction(torch.autograd.Function): - - @staticmethod - def forward(ctx, input, weight, bias, stride, padding): - if bias is None: - bias = torch.empty(0) - ctx.save_for_backward(input, weight, bias) - ctx.stride = stride - ctx.padding = padding - output = torch.ops.triton.conv_fprop(input, weight, bias, stride[0], stride[1], padding[0], padding[1]) - return output - - @staticmethod - def backward(ctx, grad_output): - input, weight, bias = ctx.saved_tensors - stride = ctx.stride - padding = ctx.padding - grad_input = grad_weight = grad_bias = None - if ctx.needs_input_grad[0]: - grad_input = torch.ops.triton.conv_bprop(grad_output, weight, bias, input.shape[2], input.shape[3], stride[0], stride[1], padding[0], padding[1]) - if ctx.needs_input_grad[1]: - grad_weight = torch.ops.triton.conv_wgrad(input, grad_output, bias, weight.shape[1], weight.shape[2], stride[0], stride[1], padding[0], padding[1]) - if ctx.needs_input_grad[2]: - grad_bias = torch.sum(grad_output, (0, 2, 3)) - return grad_input, grad_weight, grad_bias, None, None - - -class _ConvNd(torch.nn.Module): - - def __init__(self, in_channels, out_channels, kernel_size, stride, - padding, dilation, transposed, output_padding, groups, bias): - super(_ConvNd, self).__init__() - # not everything is supported by Triton - assert all(x==1 or x==2 for x in stride) - assert all(x==1 for x in dilation) - assert transposed == False - assert all(x==0 for x in output_padding) - assert groups == 1 - # initialize - self.in_channels = in_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.stride = stride - self.padding = padding - self.weight = torch.nn.Parameter(torch.Tensor( - in_channels, kernel_size[0], kernel_size[1], out_channels)) - if bias: - self.bias = torch.nn.Parameter(torch.Tensor(out_channels)) - else: - self.register_parameter('bias', None) - self.reset_parameters() - - def forward(self, input): - return ConvFunction.apply(input, self.weight, self.bias, 
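A usage sketch for the ConvFunction defined above, with hypothetical sizes; it assumes libtorch_triton.so has already been loaded so that the triton::conv_* operators are registered:

    import torch

    x = torch.randn(16, 8, 32, 32, device='cuda', requires_grad=True)
    w = torch.randn(8, 3, 3, 32, device='cuda', requires_grad=True)   # (C, R, S, F)
    bias = torch.randn(32, device='cuda', requires_grad=True)
    y = ConvFunction.apply(x, w, bias, (1, 1), (1, 1))   # stride, padding
    y.sum().backward()   # conv_bprop fills x.grad; conv_wgrad fills w.grad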
self.stride, self.padding) - - def reset_parameters(self): - n = self.in_channels - for k in self.kernel_size: - n *= k - stdv = 1. / math.sqrt(n) - self.weight.data.uniform_(-stdv, stdv) - if self.bias is not None: - self.bias.data.uniform_(-stdv, stdv) - - - -class Conv2d(_ConvNd): - - def __init__(self, in_channels, out_channels, kernel_size, stride=1, - padding=0, dilation=1, groups=1, bias=True): - kernel_size = _pair(kernel_size) - stride = _pair(stride) - padding = _pair(padding) - dilation = _pair(dilation) - super(Conv2d, self).__init__( - in_channels, out_channels, kernel_size, stride, padding, dilation, - False, _pair(0), groups, bias) - -################################# -#### Shift-Convolutions ####### -################################# - -class ShiftConvFunction(torch.autograd.Function): - - @staticmethod - def forward(ctx, input, weight, bias, stride, width, shift_h, shift_w): - if bias is None: - bias = torch.empty(0) - ctx.save_for_backward(input, weight, bias) - ctx.stride = stride - ctx.width = width - ctx.shift_h = shift_h - ctx.shift_w = shift_w - output = torch.ops.triton.shift_conv_y(input, weight, bias, - width[0], width[1], - stride[0], stride[1], - shift_h, shift_w) - return output - - @staticmethod - def backward(ctx, dy): - input, weight, bias = ctx.saved_tensors - stride = ctx.stride - width = ctx.width - shift_h = ctx.shift_h - shift_w = ctx.shift_w - dx = dw = dbias = None - if ctx.needs_input_grad[0]: - dx = torch.ops.triton.shift_conv_dx(dy.contiguous(), weight, bias, width[0], width[1], stride[0], stride[1], shift_h, shift_w) - if ctx.needs_input_grad[1]: - dw = torch.ops.triton.shift_conv_dw(dy.contiguous(), input, bias, width[0], width[1], stride[0], stride[1], shift_h, shift_w) - if ctx.needs_input_grad[2]: - dbias = torch.sum(dy, (1, 2, 3)) - return dx, dw, dbias, None, None, None, None - - -class _ShiftConvNd(torch.nn.Module): - - def __init__(self, in_channels, out_channels, kernel_size, stride, bias): - super(_ShiftConvNd, self).__init__() - # initialize - self.in_channels = in_channels - self.out_channels = out_channels - self.kernel_size = kernel_size - self.stride = stride - self.weight = torch.nn.Parameter(torch.Tensor(in_channels, out_channels)) - if bias: - self.bias = torch.nn.Parameter(torch.Tensor(out_channels)) - else: - self.register_parameter('bias', None) - self.shift_h = self.make_shift(kernel_size[0]) - self.shift_w = self.make_shift(kernel_size[1]) - self.reset_parameters() - - def forward(self, input): - return ShiftConvFunction.apply(input, self.weight, self.bias, self.stride, - self.kernel_size, self.shift_h, self.shift_w) - - def make_shift(self, kernel_size): - if kernel_size == 3: - p = torch.Tensor([0.3, 0.4, 0.3]) - elif kernel_size == 5: - p = torch.Tensor([0.1, 0.25, 0.3, 0.25, 0.1]) - elif kernel_size == 7: - p = torch.Tensor([0.075, 0.1, 0.175, 0.3, 0.175, 0.1, 0.075]) - elif kernel_size == 9: - p = torch.Tensor([0.05, 0.075, 0.1, 0.175, 0.2, 0.175, 0.1, 0.075, 0.05]) - else: - raise RuntimeError('Unsupported kernel size') - return categorical.Categorical(p).sample((self.in_channels,)) - (kernel_size // 2) - - def reset_parameters(self): - n = self.in_channels - for k in self.kernel_size: - n *= k - stdv = 1. 
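The reset_parameters here uses the classic fan-in rule: weights are drawn uniformly from (-1/sqrt(n), 1/sqrt(n)) with n = in_channels times the product of the kernel dims. For example:

    import math

    def fan_in_bound(in_channels, kernel_size):
        n = in_channels
        for k in kernel_size:
            n *= k
        return 1.0 / math.sqrt(n)

    print(fan_in_bound(32, (3, 3)))   # ~0.059 for C=32 with a 3x3 kernel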
/ math.sqrt(n) - self.weight.data.uniform_(-stdv, stdv) - if self.bias is not None: - self.bias.data.uniform_(-stdv, stdv) - -class ShiftConv2d(_ShiftConvNd): - - def __init__(self, in_channels, out_channels, kernel_size, stride=1, bias=False): - kernel_size = _pair(kernel_size) - stride = _pair(stride) - super(ShiftConv2d, self).__init__( - in_channels, out_channels, kernel_size, stride, bias) - -################################# -######### BatchNorm ########### -################################# - -class BatchNormFunction(torch.autograd.Function): - - @staticmethod - def forward(ctx, x, gamma, beta, eps): - ctx.eps = eps - y, mean, var = torch.ops.triton.batchnorm_ymv(x, gamma, beta, eps) - ctx.save_for_backward(x, gamma, beta, mean, var) - return y - - @staticmethod - def backward(ctx, dy): - eps = ctx.eps - x, gamma, beta, mean, var = ctx.saved_tensors - dx, dg, db = torch.ops.triton.batchnorm_dxdgdb(dy.contiguous(), x, gamma, mean, var, eps) - return dx, dg, db, None - - -class _BatchNorm(torch.nn.Module): - - def __init__(self, num_features, eps=1e-5): - super(_BatchNorm, self).__init__() - self.num_features = num_features - self.eps = eps - self.weight = torch.nn.Parameter(torch.Tensor(num_features)) - self.bias = torch.nn.Parameter(torch.Tensor(num_features)) - self.reset_parameters() - - def reset_parameters(self): - torch.nn.init.uniform_(self.weight) - torch.nn.init.zeros_(self.bias) - - def forward(self, input): - return BatchNormFunction.apply(input, self.weight, self.bias, self.eps) - -class BatchNorm2d(_BatchNorm): - - pass diff --git a/examples/python/tensorflow/CMakeLists.txt b/examples/python/tensorflow/CMakeLists.txt deleted file mode 100644 index 0dad37f19..000000000 --- a/examples/python/tensorflow/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -find_package(TensorFlow) -if(${TensorFlow_FOUND}) - set(CUDA_HOME "/usr/local/cuda") - include_directories("${TF_INC}/tensorflow/include") - include_directories("${CUDA_HOME}/include") - link_directories(${TF_LIB}) - add_definitions(-D_GLIBCXX_USE_CXX11_ABI=${TF_ABI}) - add_library(tf_blocksparse SHARED blocksparse.cpp dot.cpp conv.cpp shift.cpp batchnorm.cpp) - target_link_libraries(tf_blocksparse tensorflow_framework triton) - configure_file(${CMAKE_CURRENT_SOURCE_DIR}/run.py - ${CMAKE_CURRENT_BINARY_DIR}/run.py - COPYONLY) -endif() diff --git a/examples/python/tensorflow/batchnorm.cpp b/examples/python/tensorflow/batchnorm.cpp deleted file mode 100644 index 956ecef24..000000000 --- a/examples/python/tensorflow/batchnorm.cpp +++ /dev/null @@ -1,157 +0,0 @@ -#include - -#include "triton/driver/buffer.h" -#include "triton/driver/backend.h" -#include "triton/driver/stream.h" -#include "triton/runtime/jit.h" -#include "triton/tools/bench.hpp" -#include "triton/dnn/batchnorm.h" - -#define EIGEN_USE_GPU -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/util/cuda_kernel_helper.h" -#include "tensorflow/core/util/padding.h" -#include "tensorflow/core/util/tensor_format.h" -#include "tensorflow/core/framework/common_shape_fns.h" - -using namespace tensorflow; -using shape_inference::DimensionHandle; -using shape_inference::InferenceContext; -using shape_inference::ShapeHandle; -using GPUDevice = Eigen::GpuDevice; - -class BatchnormForwardOp : public OpKernel { -public: - explicit BatchnormForwardOp(OpKernelConstruction* context): OpKernel(context) { - context->GetAttr("eps", &eps_); - } - - void 
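For reference, a sketch of the per-channel statistics that batchnorm_ymv returns and that the backward op consumes. This is not the Triton kernel itself, and the channel-first reduction is an assumption, matching the (C, H, W, B) layout the TensorFlow batchnorm ops in this file unpack:

    import torch

    def batchnorm_ymv_ref(x, g, b, eps=1e-5):
        dims = list(range(1, x.dim()))           # reduce over everything but C
        mean = x.mean(dim=dims)
        var = x.var(dim=dims, unbiased=False)
        shape = (-1,) + (1,) * (x.dim() - 1)     # broadcast back over (C, ...)
        y = (x - mean.reshape(shape)) / torch.sqrt(var.reshape(shape) + eps)
        return g.reshape(shape) * y + b.reshape(shape), mean, var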
Compute(OpKernelContext* context){ - // get device/stream - GPUDevice device = context->eigen_device(); - triton::driver::cu_stream sstream(device.stream(), false); - triton::driver::context* ctx = sstream.context(); - triton::driver::stream* stream = &sstream; - // get inputs - const Tensor& fw_x = context->input(0); - const Tensor& fw_g = context->input(1); - const Tensor& fw_b = context->input(2); - // get sizes - int C = fw_x.dim_size(0); - int H = fw_x.dim_size(1); - int W = fw_x.dim_size(2); - int B = fw_x.dim_size(3); - // allocate outputs - Tensor* fw_y = nullptr; - Tensor* fw_m = nullptr; - Tensor* fw_v = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, fw_x.shape(), &fw_y)); - OP_REQUIRES_OK(context, context->allocate_output(1, fw_g.shape(), &fw_m)); - OP_REQUIRES_OK(context, context->allocate_output(2, fw_g.shape(), &fw_v)); - // triton handles - triton::driver::cu_buffer x(ctx, fw_x.tensor_data().size(), (CUdeviceptr)fw_x.tensor_data().data(), false); - triton::driver::cu_buffer g(ctx, fw_g.tensor_data().size(), (CUdeviceptr)fw_g.tensor_data().data(), false); - triton::driver::cu_buffer b(ctx, fw_b.tensor_data().size(), (CUdeviceptr)fw_b.tensor_data().data(), false); - triton::driver::cu_buffer y(ctx, fw_y->tensor_data().size(), (CUdeviceptr)fw_y->tensor_data().data(), false); - triton::driver::cu_buffer m(ctx, fw_m->tensor_data().size(), (CUdeviceptr)fw_m->tensor_data().data(), false); - triton::driver::cu_buffer v(ctx, fw_v->tensor_data().size(), (CUdeviceptr)fw_v->tensor_data().data(), false); - // create config - triton::dnn::batchnorm_forward batchnorm(C, 1, H, W, B, "float", triton::dnn::FULL_TUNING); - batchnorm.enqueue(stream, {&y, &m, &v, &x, &g, &b}); - } - -private: - float eps_; -}; - - -REGISTER_KERNEL_BUILDER(Name("BatchnormForward").Device(DEVICE_GPU), BatchnormForwardOp); -REGISTER_OP("BatchnormForward") - .Input("x: T") - .Input("g: float") - .Input("b: float") - .Output("y: T") - .Output("m: float") - .Output("v: float") - .Attr("T: {float}") - .Attr("eps: float") - .SetShapeFn([](InferenceContext* ctx) { - ctx->set_output(0, ctx->input(0)); - ctx->set_output(1, ctx->input(1)); - ctx->set_output(2, ctx->input(1)); - return Status::OK(); - }) -; - - -class BatchnormBackwardOp : public OpKernel { -public: - explicit BatchnormBackwardOp(OpKernelConstruction* context): OpKernel(context) { - context->GetAttr("eps", &eps_); - } - - void Compute(OpKernelContext* context){ - // get device/stream - GPUDevice device = context->eigen_device(); - triton::driver::cu_stream sstream(device.stream(), false); - triton::driver::context* ctx = sstream.context(); - triton::driver::stream* stream = &sstream; - // get inputs - const Tensor& fw_dy = context->input(0); - const Tensor& fw_x = context->input(1); - const Tensor& fw_g = context->input(2); - const Tensor& fw_m = context->input(3); - const Tensor& fw_v = context->input(4); - // get sizes - int C = fw_x.dim_size(0); - int H = fw_x.dim_size(1); - int W = fw_x.dim_size(2); - int B = fw_x.dim_size(3); - // allocate outputs - Tensor* fw_dx = nullptr; - Tensor* fw_dg = nullptr; - Tensor* fw_db = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, fw_x.shape(), &fw_dx)); - OP_REQUIRES_OK(context, context->allocate_output(1, fw_g.shape(), &fw_dg)); - OP_REQUIRES_OK(context, context->allocate_output(2, fw_g.shape(), &fw_db)); - // triton handles - triton::driver::cu_buffer dy(ctx, fw_dy.tensor_data().size(), (CUdeviceptr)fw_dy.tensor_data().data(), false); - triton::driver::cu_buffer x(ctx, 
fw_x.tensor_data().size(), (CUdeviceptr)fw_x.tensor_data().data(), false); - triton::driver::cu_buffer g(ctx, fw_g.tensor_data().size(), (CUdeviceptr)fw_g.tensor_data().data(), false); - triton::driver::cu_buffer m(ctx, fw_m.tensor_data().size(), (CUdeviceptr)fw_m.tensor_data().data(), false); - triton::driver::cu_buffer v(ctx, fw_v.tensor_data().size(), (CUdeviceptr)fw_v.tensor_data().data(), false); - triton::driver::cu_buffer dx(ctx, fw_dx->tensor_data().size(), (CUdeviceptr)fw_dx->tensor_data().data(), false); - triton::driver::cu_buffer dg(ctx, fw_dg->tensor_data().size(), (CUdeviceptr)fw_dg->tensor_data().data(), false); - triton::driver::cu_buffer db(ctx, fw_db->tensor_data().size(), (CUdeviceptr)fw_db->tensor_data().data(), false); - // create config - triton::dnn::batchnorm_backward batchnorm(C, 1, H, W, B, "float", triton::dnn::FULL_TUNING); - batchnorm.enqueue(stream, {&dx, &dg, &db, &dy, &x, &g, &m, &v}); - } - -private: - float eps_; -}; - - -REGISTER_KERNEL_BUILDER(Name("BatchnormBackward").Device(DEVICE_GPU), BatchnormBackwardOp); -REGISTER_OP("BatchnormBackward") - .Input("dy: TY") - .Input("x: TX") - .Input("g: float") - .Input("m: float") - .Input("v: float") - .Output("dx: TY") - .Output("dg: float") - .Output("db: float") - .Attr("TX: {float}") - .Attr("TY: {float}") - .Attr("eps: float") - .SetShapeFn([](InferenceContext* ctx) { - ctx->set_output(0, ctx->input(1)); - ctx->set_output(1, ctx->input(2)); - ctx->set_output(2, ctx->input(2)); - return Status::OK(); - }) -; diff --git a/examples/python/tensorflow/blocksparse.cpp b/examples/python/tensorflow/blocksparse.cpp deleted file mode 100644 index 1ff5e9f6f..000000000 --- a/examples/python/tensorflow/blocksparse.cpp +++ /dev/null @@ -1,304 +0,0 @@ -#include - -#include "triton/driver/buffer.h" -#include "triton/driver/backend.h" -#include "triton/driver/stream.h" -#include "triton/runtime/jit.h" -#include "triton/dnn/blocksparse/dot.h" - -#define EIGEN_USE_GPU -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/util/cuda_kernel_helper.h" -#include "tensorflow/core/util/padding.h" -#include "tensorflow/core/util/tensor_format.h" -#include "tensorflow/core/framework/common_shape_fns.h" -#include "tensorflow/core/framework/allocation_description.pb.h" - -using namespace tensorflow; -using shape_inference::DimensionHandle; -using shape_inference::InferenceContext; -using shape_inference::ShapeHandle; -using GPUDevice = Eigen::GpuDevice; - - -Status XpropShape(InferenceContext* ctx) -{ - int K; TF_RETURN_IF_ERROR(ctx->GetAttr( "K", &K)); - int axis; TF_RETURN_IF_ERROR(ctx->GetAttr("axis", &axis)); - - // C ==> K - ShapeHandle x = ctx->input(0); - int rank = ctx->Rank(x); - //printf("XpropShape: %d\n", rank); - if (rank > 0) - { - std::vector shape; - shape.reserve(rank); - for (int i = 0; i < rank; i++) - shape.push_back(i == axis ? 
ctx->MakeDim(K) : ctx->Dim(x, i)); - ctx->set_output(0, ctx->MakeShape(shape)); - } - else - ctx->set_output(0, ctx->UnknownShape()); - ctx->set_output(1, ctx->UnknownShape()); - return Status::OK(); -} - -Status UpdatShape(InferenceContext* ctx) -{ - //printf("UpdatShape: %d\n", ctx->Rank(ctx->input(0))); - - int blocks, bsize; - TF_RETURN_IF_ERROR(ctx->GetAttr("blocks", &blocks)); - TF_RETURN_IF_ERROR(ctx->GetAttr("bsize", &bsize)); - - // (blocks, block_size, block_size) - DimensionHandle bsize_dim = ctx->MakeDim(bsize); - ctx->set_output(0, ctx->MakeShape({ ctx->MakeDim(blocks), bsize_dim, bsize_dim })); - return Status::OK(); -} - -typedef struct bsmm_params -{ - const int* Lut; - const float* Gate; - int* Lock; - int blocks; - int bsize; - int segments; - int locks; - int C; - int K; - int N; - int shared; - int pcount; - uint blk_a; - uint blk_A; - uint blk_b; - uint blk_B; - float alpha; - float beta; - CUstream stream; -} bsmm_params; - -template -class BlocksparseMatmulOp : public OpKernel { -private: - void ComputeDw(OpKernelContext* context){ - // get device/stream - GPUDevice device = context->eigen_device(); - triton::driver::cu_stream sstream(device.stream(), false); - triton::driver::context* ctx = sstream.context(); - triton::driver::stream* stream = &sstream; - // extract input - OpInputList x, dy, gate; - context->input_list( "x", &x); - context->input_list( "dy", &dy); - context->input_list("gate", &gate); - // sanity checks - params_.pcount = x.size(); - if (params_.pcount > 1) - errors::Internal("No more than 1 input allowed."); - if (params_.beta != 0.0f || params_.alpha != 1.0f) - errors::Internal("Not supported yet"); - // N - int N = 1; - int rank = x[0].dims(); - for (int i = 0; i < rank; i++) - if (i != axis_) - N *= x[0].dim_size(i); - // allocate output - Tensor* C; - TensorShape shapeC({ params_.blocks, params_.bsize, params_.bsize }); - OP_REQUIRES_OK(context, context->allocate_output(0, shapeC, &C)); - // wrap tensorflow handles - triton::driver::cu_buffer da(ctx, x[0].tensor_data().size(), (CUdeviceptr)x[0].tensor_data().data(), false); - triton::driver::cu_buffer db(ctx, dy[0].tensor_data().size(), (CUdeviceptr)dy[0].tensor_data().data(), false); - triton::driver::cu_buffer dc(ctx, C->tensor_data().size(), (CUdeviceptr)C->tensor_data().data(), false); - triton::driver::cu_buffer dlut(ctx, context->input(params_.pcount*2).tensor_data().size(), (CUdeviceptr)context->input(params_.pcount*2).tensor_data().data(), false); - // create profile - triton::dnn::blocksparse::dot dot(N, params_.K, params_.segments, params_.C, "half", params_.bsize, params_.locks, params_.blocks, OP); - // enqueue - dot.enqueue(stream, {&da, &db, &dc, &dlut}, triton::dnn::FULL_TUNING); - } - - void ComputeYDx(OpKernelContext* context){ - // get device/stream - GPUDevice device = context->eigen_device(); - triton::driver::cu_stream sstream(device.stream(), false); - triton::driver::context* ctx = sstream.context(); - triton::driver::stream* stream = &sstream; - // get inputs - const Tensor& a = context->input(0); - const Tensor& b = context->input(1); - const Tensor& lut = context->input(2); - // allocate c - TensorShape shape_c; - int N = 1; - int rank_a = a.dims(); - for (int i = 0; i < rank_a; i++) - if (i != axis_) { - shape_c.AddDim(a.dim_size(i)); - N *= a.dim_size(i); - } - else - shape_c.AddDim(params_.K); - Tensor* c = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, shape_c, &c)); - // wrap tensorflow handles - triton::driver::cu_buffer da(ctx, 
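Stated compactly, the two shape functions above implement these rules (a Python sketch with made-up sizes):

    def xprop_shape(x_shape, K, axis):
        # output keeps every input dim except `axis`, which becomes K
        return tuple(K if i == axis else d for i, d in enumerate(x_shape))

    def updat_shape(blocks, bsize):
        # the weight gradient is one (bsize, bsize) tile per nonzero block
        return (blocks, bsize, bsize)

    print(xprop_shape((64, 512, 16), K=1024, axis=1))   # -> (64, 1024, 16)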
a.tensor_data().size(), (CUdeviceptr)a.tensor_data().data(), false); - triton::driver::cu_buffer db(ctx, b.tensor_data().size(), (CUdeviceptr)b.tensor_data().data(), false); - triton::driver::cu_buffer dc(ctx, c->tensor_data().size(), (CUdeviceptr)c->tensor_data().data(), false); - triton::driver::cu_buffer dlut(ctx, lut.tensor_data().size(), (CUdeviceptr)lut.tensor_data().data(), false); - // create profile - triton::dnn::blocksparse::dot dot(N, params_.K, params_.segments, params_.C, "half", params_.bsize, params_.locks, params_.blocks, OP); - // enqueue - triton::dnn::base* op = dot.enqueue(stream, {&da, &db, &dc, &dlut}, triton::dnn::NO_TUNING); - triton::driver::buffer* locks_buffer = ((triton::dnn::blocksparse::dot*)op)->get_locks(); - Tensor *tmp = nullptr; - TensorShape tmp_shapes; - tmp_shapes.AddDim(locks_buffer->size() / 4); - OP_REQUIRES_OK(context, context->allocate_output(1, tmp_shapes, &tmp)); - } - -public: - - explicit BlocksparseMatmulOp(OpKernelConstruction* ctx) : OpKernel(ctx) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("segments", ¶ms_.segments)); - OP_REQUIRES_OK(ctx, ctx->GetAttr("locks", ¶ms_.locks )); - OP_REQUIRES_OK(ctx, ctx->GetAttr("blocks", ¶ms_.blocks )); - OP_REQUIRES_OK(ctx, ctx->GetAttr("bsize", ¶ms_.bsize )); - OP_REQUIRES_OK(ctx, ctx->GetAttr("C", ¶ms_.C )); - OP_REQUIRES_OK(ctx, ctx->GetAttr("K", ¶ms_.K )); - OP_REQUIRES_OK(ctx, ctx->GetAttr("shared", ¶ms_.shared )); - OP_REQUIRES_OK(ctx, ctx->GetAttr("alpha", ¶ms_.alpha )); - OP_REQUIRES_OK(ctx, ctx->GetAttr("beta", ¶ms_.beta )); - OP_REQUIRES_OK(ctx, ctx->GetAttr("gated_dw", &gated_dw_ )); - OP_REQUIRES_OK(ctx, ctx->GetAttr("axis", &axis_ )); - OP_REQUIRES_OK(ctx, ctx->GetAttr("bench", &bench_)); - OP_REQUIRES(ctx, params_.K < params_.bsize*65536, errors::InvalidArgument("K < bsize*65536")); - OP_REQUIRES(ctx, params_.C < params_.bsize*65536, errors::InvalidArgument("C < bsize*65536")); - params_.pcount = 1; - params_.blk_A = 0; - is_gpu_ = ctx->device_type() == DEVICE_GPU; - if (bench_) { - repeat_ = bench_; - flops_ = (float)(params_.blocks * params_.bsize*params_.bsize); - const char* op = "FPROP"; - sprintf(bench_string_, "%s %02d-%d C:%05d K:%05d blks:%d", op, params_.bsize, axis_, params_.C, params_.K, params_.blocks); - } - } - - void Compute(OpKernelContext* context) override{ - if(OP == triton::dnn::blocksparse::WGRAD) - ComputeDw(context); - else - ComputeYDx(context); - } - -private: - bsmm_params params_; - int axis_, bench_, repeat_, SMs_, major_, grid_n_; - float flops_; - bool gated_dw_, is_gpu_; - char bench_string_[256]; -}; - -REGISTER_OP("TritonBlocksparseMatmul") -.Input("x: T") -.Input("w: T") -.Input("lut: int64") -.Input("lut_dx: int64") -.Input("lut_dw: int64") -.Input("gate: ngate * float") -.Output("y: T") -.Output("temp: int32") -.Attr("T: {half, float, bfloat16}") -.Attr("blocks: int >=0") -.Attr("bsize: int") -.Attr("segments: int = 0") -.Attr("segments_dx: int = 0") -.Attr("locks: int = 0") -.Attr("locks_dx: int = 0") -.Attr("axis: int = 1") -.Attr("C: int >=0") -.Attr("K: int >=0") -.Attr("shared: int = 0") -.Attr("shared_dx: int = 0") -.Attr("alpha: float = 1.0") -.Attr("beta: float = 0.0") -.Attr("gated_dw: bool = false") -.Attr("gate_grad: bool = false") -.Attr("bench: int = 0") -.Attr("ngate: int >= 0") -.SetShapeFn(XpropShape) -.Doc(R"doc( - Multiply the matrix "a" by the blocksparse matrix "b". 
- )doc"); - -REGISTER_KERNEL_BUILDER(Name("TritonBlocksparseMatmul").Device(DEVICE_GPU).TypeConstraint("T"), BlocksparseMatmulOp); -REGISTER_KERNEL_BUILDER(Name("TritonBlocksparseMatmul").Device(DEVICE_GPU).TypeConstraint("T"), BlocksparseMatmulOp); - - -REGISTER_OP("TritonBlocksparseMatmulDX") - .Input("dy: T") - .Input("w: T") - .Input("lut: int64") - .Input("gate: ngate * float") - .Output("dx: T") - .Output("temp: int32") - .Attr("T: {half, float, bfloat16}") - .Attr("blocks: int >=0") - .Attr("bsize: int") - .Attr("segments: int = 0") - .Attr("locks: int = 0") - .Attr("axis: int = 1") - .Attr("C: int >=0") - .Attr("K: int >=0") - .Attr("shared: int = 0") - .Attr("alpha: float = 1.0") - .Attr("beta: float = 0.0") - .Attr("gated_dw: bool = false") - .Attr("gate_grad: bool = false") - .Attr("bench: int = 0") - .Attr("ngate: int >= 0") - .SetShapeFn(XpropShape) - .Doc(R"doc( -Multiply the matrix "a" by the blocksparse matrix "b". -)doc"); - -REGISTER_KERNEL_BUILDER(Name("TritonBlocksparseMatmulDX").Device(DEVICE_GPU).TypeConstraint("T"),BlocksparseMatmulOp); -REGISTER_KERNEL_BUILDER(Name("TritonBlocksparseMatmulDX").Device(DEVICE_GPU).TypeConstraint("T"),BlocksparseMatmulOp); - - -REGISTER_OP("TritonBlocksparseMatmulDW") - .Input("x: params * T") - .Input("dy: params * T") - .Input("lut: int64") - .Input("gate: ngate * float") - .Output("dw: T") - .Attr("T: {half, float, bfloat16}") - .Attr("params: int") - .Attr("blocks: int >=0") - .Attr("bsize: int") - .Attr("segments: int = 0") - .Attr("locks: int = 0") - .Attr("axis: int = 1") - .Attr("C: int >=0") - .Attr("K: int >=0") - .Attr("shared: int = 0") - .Attr("alpha: float = 1.0") - .Attr("beta: float = 0.0") - .Attr("gated_dw: bool = false") - .Attr("gate_grad: bool = false") - .Attr("bench: int = 0") - .Attr("ngate: int >= 0") - .SetShapeFn(UpdatShape) - .Doc(R"doc( -Multiply the matrix "a" by the blocksparse matrix "b". 
-)doc"); - -REGISTER_KERNEL_BUILDER(Name("TritonBlocksparseMatmulDW").Device(DEVICE_GPU).TypeConstraint("T"),BlocksparseMatmulOp); -REGISTER_KERNEL_BUILDER(Name("TritonBlocksparseMatmulDW").Device(DEVICE_GPU).TypeConstraint("T"),BlocksparseMatmulOp); diff --git a/examples/python/tensorflow/conv.cpp b/examples/python/tensorflow/conv.cpp deleted file mode 100644 index 00bf05473..000000000 --- a/examples/python/tensorflow/conv.cpp +++ /dev/null @@ -1,82 +0,0 @@ -#include - -#include "triton/driver/buffer.h" -#include "triton/driver/backend.h" -#include "triton/driver/stream.h" -#include "triton/runtime/jit.h" -#include "triton/tools/bench.hpp" -#include "triton/dnn/conv.h" - -#define EIGEN_USE_GPU -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/util/cuda_kernel_helper.h" -#include "tensorflow/core/util/padding.h" -#include "tensorflow/core/util/tensor_format.h" -#include "tensorflow/core/framework/common_shape_fns.h" - -using namespace tensorflow; -using GPUDevice = Eigen::GpuDevice; - -class Conv2dOp : public OpKernel { -public: - explicit Conv2dOp(OpKernelConstruction* context) : OpKernel(context) { - } - - void Compute(OpKernelContext* context){ - // get device/stream - GPUDevice device = context->eigen_device(); - triton::driver::cu_stream sstream(device.stream(), false); - triton::driver::context* ctx = sstream.context(); - triton::driver::stream* stream = &sstream; - // get inputs - const Tensor& tfa = context->input(0); - const Tensor& tfb = context->input(1); - // get shapes - int32_t B = tfa.dim_size(0); - int32_t Ca = tfa.dim_size(1); - int32_t D = 1; - int32_t H = tfa.dim_size(2); - int32_t W = tfa.dim_size(3); - int32_t Cb = tfb.dim_size(0); - int32_t T = 1; - int32_t R = tfb.dim_size(1); - int32_t S = tfb.dim_size(2); - int32_t NF = tfb.dim_size(3); - assert(Ca == Cb); - int32_t C = Ca; - int32_t stride_d = 1, stride_h = 1, stride_w = 1; - int32_t pad_d = 0, pad_h = 0, pad_w = 0; - bool has_bias = false; - // wrap buffers - triton::driver::cu_buffer a(ctx, tfa.tensor_data().size(), (CUdeviceptr)tfa.tensor_data().data(), false); - triton::driver::cu_buffer b(ctx, tfb.tensor_data().size(), (CUdeviceptr)tfb.tensor_data().data(), false); - triton::driver::buffer* bias = nullptr; - // template - triton::dnn::conv conv(B, C, - D, H, W, - T, R, S, - NF, - stride_d, stride_h, stride_w, - pad_d, pad_h, pad_w, - 1, 1, 1, - "half", "half", - triton::dnn::conv::FPROP, has_bias); - // allocate output - auto c_shapes = conv.c_shapes(); - Tensor* tfc = nullptr; - TensorShape out_shape({c_shapes[0], c_shapes[1], c_shapes[2], c_shapes[3]}); - OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &tfc)); - triton::driver::cu_buffer c(ctx, tfc->tensor_data().size(), (CUdeviceptr)tfc->tensor_data().data(), false); - // enqueue - conv.enqueue(stream, {&a, &b, &c, bias}); - } -}; - -REGISTER_KERNEL_BUILDER(Name("Conv2d").Device(DEVICE_GPU), Conv2dOp); -REGISTER_OP("Conv2d") - .Input("a: float16") - .Input("b: float16") - .Output("c: float32") -; diff --git a/examples/python/tensorflow/dot.cpp b/examples/python/tensorflow/dot.cpp deleted file mode 100644 index 453bb87cb..000000000 --- a/examples/python/tensorflow/dot.cpp +++ /dev/null @@ -1,64 +0,0 @@ -#include - -#include "triton/driver/buffer.h" -#include "triton/driver/backend.h" -#include "triton/driver/stream.h" -#include "triton/runtime/jit.h" -#include "triton/tools/bench.hpp" -#include "triton/dnn/dot.h" - 
-#define EIGEN_USE_GPU -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/util/cuda_kernel_helper.h" -#include "tensorflow/core/util/padding.h" -#include "tensorflow/core/util/tensor_format.h" -#include "tensorflow/core/framework/common_shape_fns.h" - -using namespace tensorflow; -using GPUDevice = Eigen::GpuDevice; - -class DotOp : public OpKernel { - public: - explicit DotOp(OpKernelConstruction* context) : OpKernel(context) { - } - - void Compute(OpKernelContext* context){ - // get device/stream - GPUDevice device = context->eigen_device(); - triton::driver::cu_stream sstream(device.stream(), false); - triton::driver::context* ctx = sstream.context(); - triton::driver::stream* stream = &sstream; - // get inputs - const Tensor& a = context->input(0); - const Tensor& b = context->input(1); - // get shapes - const int32_t M = a.dim_size(0); - const int32_t N = b.dim_size(0); - const int32_t K = a.dim_size(1); - // allocate output - Tensor* c = nullptr; - TensorShape out_shape({(int64)M, (int64)N}); - OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &c)); - // return early if possible - if (out_shape.num_elements() == 0) - return; - // matrix multiplication parameters - triton::driver::cu_buffer da(ctx, a.tensor_data().size(), (CUdeviceptr)a.tensor_data().data(), false); - triton::driver::cu_buffer db(ctx, b.tensor_data().size(), (CUdeviceptr)b.tensor_data().data(), false); - triton::driver::cu_buffer dc(ctx, c->tensor_data().size(), (CUdeviceptr)c->tensor_data().data(), false); - // template - triton::dnn::dot dot(M, N, K, false, false, "half", "half", "float", 8, 8, 8); - dot.enqueue(stream, {&da, &db, &dc}, triton::dnn::autotuning_t::NO_TUNING); - } - -private: -}; - -REGISTER_KERNEL_BUILDER(Name("Dot").Device(DEVICE_GPU), DotOp); -REGISTER_OP("Dot") - .Input("a: float16") - .Input("b: float16") - .Output("c: float32") -; diff --git a/examples/python/tensorflow/run.py b/examples/python/tensorflow/run.py deleted file mode 100644 index ffdde3f76..000000000 --- a/examples/python/tensorflow/run.py +++ /dev/null @@ -1,136 +0,0 @@ -import os -import tensorflow as tf -from tensorflow.python.framework import ops -import numpy as np -from time import time - -data_files_path = tf.resource_loader.get_data_files_path() -library_dir = os.path.dirname(os.path.realpath(__file__)) -module = tf.load_op_library(os.path.join(library_dir, 'libtf_blocksparse.so')) - -def run_dot(): - M, N, K = 128, 128, 128 - a = tf.placeholder(tf.float16, shape=[M, K]) - b = tf.placeholder(tf.float16, shape=[N, K]) - # c = tf.matmul(a, b, transpose_a=True) - c = module.dot(a, b) - # Reference - ha = np.random.rand(M, K).astype(np.float16) - hb = np.random.rand(N, K).astype(np.float16) - # Run - sess = tf.InteractiveSession() - sess.run(tf.global_variables_initializer()) - result = sess.run([c], feed_dict = {a: ha, - b: hb})[0] - # Test - hresult = np.dot(ha.T, hb.T).T - dif = np.abs(result - hresult) - np.savetxt('dif.dat', dif, '%2.4f') - print(hresult) - print(result) - print("dif: %f" % np.max(dif)) - -def run_conv(): - B, C, H, W = 16, 32, 32, 32 - R, S, NF = 3, 3, 32 - a = tf.placeholder(tf.float32, shape=[B, C, H, W]) - b = tf.placeholder(tf.float32, shape=[C, R, S, NF]) - c = module.conv2d(a, b) - # Reference - ha = np.random.rand(B, C, H, W) - hb = np.random.rand(C, R, S, NF) - # Run - sess = tf.InteractiveSession() - sess.run(tf.global_variables_initializer()) - result = 
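The Dot op above reads M and K off operand a and N off operand b, so both operands carry K on their second axis, and the (M, N) output implies the second operand is contracted as if transposed. A NumPy stand-in for that shape contract (a sketch, not the kernel):

    import numpy as np

    M, N, K = 4, 5, 3                     # made-up sizes
    a = np.random.rand(M, K).astype(np.float16)
    b = np.random.rand(N, K).astype(np.float16)
    c = a.astype(np.float32) @ b.astype(np.float32).T   # float32 accumulate
    print(c.shape)                        # (M, N)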
sess.run([c], feed_dict = {a: ha, - b: hb})[0] - - -@ops.RegisterGradient('ShiftConv') -def blocksparse_matmul_grad(op, dy): - shift_h = op.get_attr('shift_h') - shift_w = op.get_attr('shift_w') - stride_h = op.get_attr('stride_h') - stride_w = op.get_attr('stride_w') - x = op.inputs[0] - w = op.inputs[1] - dx = module.shift_conv_dx(dy, w, stride_h=stride_h, stride_w=stride_w, shift_h=shift_h, shift_w=shift_w) - dw = module.shift_conv_dw(dy, x, stride_h=stride_h, stride_w=stride_w, shift_h=shift_h, shift_w=shift_w) - return (dx, dw) - -def run_shift(): - B, C, H, W = 2, 16, 4, 4 - R, S, F = 3, 3, 16 - stride_h, stride_w = 1, 1 - np.random.seed(2) - a = tf.placeholder(tf.float16, shape=[B, C, H, W]) - b = tf.placeholder(tf.float16, shape=[C, F]) - hshift_h = np.random.randint(- (R//2), R//2 + 1, size=C, dtype=np.int32) - hshift_w = np.random.randint(- (S//2), R//2 + 1, size=C, dtype=np.int32) - c = module.shift_conv(a, b, stride_h=stride_h, stride_w=stride_w, shift_h=tf.make_tensor_proto(hshift_h), shift_w=tf.make_tensor_proto(hshift_w)) - # feed values - ha = np.random.rand(B, C, H, W)*0.1 - hb = np.random.rand(C, F)*0.1 - sess = tf.InteractiveSession() - # check gradients - grads = tf.test.compute_gradient([a, b], [(B, C, H, W), (C, F)], c, (B, F, H//stride_h, W//stride_w), - extra_feed_dict = {a: ha, b: hb}, delta=1e-2) - dw_t, dw_n = grads[1] - dx_t, dx_n = grads[0] - #import sys - #np.set_printoptions(threshold=sys.maxsize) - print(dw_t) - print(dw_n) - print(np.max(np.abs(dw_t - dw_n))) - print(np.max(np.abs(dx_t - dx_n))) - # Run - sess.run(tf.global_variables_initializer()) - result = sess.run([c], feed_dict = {a: ha, - b: hb})[0] - #print(result) - - -def batch_norm(x, g, b, epsilon=1e-6): - shape = x.shape - C = int(shape[1]) - assert g.get_shape().num_elements() == C - assert b.get_shape().num_elements() == C - return module.batchnorm_forward(x, g, b, eps=epsilon) - -@ops.RegisterGradient("BatchnormForward") -def batch_norm_grad(op, dy, mean, var): - eps = op.get_attr("eps") - return module.batchnorm_backward(dy, op.inputs[0], op.inputs[1], - op.outputs[1], op.outputs[2], eps=eps) - - -def run_batchnorm(): - C, H, W, B = 8, 4, 4, 32 - np.random.seed(0) - # Placeholders - x = tf.placeholder(tf.float32, shape=[C, H, W, B]) - g = tf.placeholder(tf.float32, shape=[C]) - b = tf.placeholder(tf.float32, shape=[C]) - # Feed values - hx = np.random.rand(C, H, W, B) - hg = np.random.rand(C) - hb = np.random.rand(C) - # batchnorm - y, m, v = module.batchnorm_forward(x, g, b, eps=1e-5) - loss = np.sum(y) - # Run - sess = tf.InteractiveSession() - sess.run(tf.global_variables_initializer()) - result = sess.run([y, m, v], feed_dict = {x: hx, g: hg, b: hb}) - grads = tf.test.compute_gradient([x, g, b], [(C, H, W, B), (C, ), (C, )], y, (C, H, W, B), - extra_feed_dict = {x: hx, g: hg, b: hb}) - dx_t, dx_n = grads[0] - dg_t, dg_n = grads[1] - db_t, db_n = grads[2] - print(np.max(np.abs(dx_t - dx_n))) - print(np.max(np.abs(dg_t - dg_n))) - print(np.max(np.abs(db_t - db_n))) - -run_dot() -#run_shift() -#run_batchnorm() diff --git a/examples/python/tensorflow/shift.cpp b/examples/python/tensorflow/shift.cpp deleted file mode 100644 index cb28ce281..000000000 --- a/examples/python/tensorflow/shift.cpp +++ /dev/null @@ -1,167 +0,0 @@ -#include - -#include "triton/driver/buffer.h" -#include "triton/driver/backend.h" -#include "triton/driver/stream.h" -#include "triton/runtime/jit.h" -#include "triton/tools/bench.hpp" -#include "triton/dnn/shift.h" - -#define EIGEN_USE_GPU -#include 
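The decorators above are TensorFlow 1.x's custom-gradient hook: the string must match the registered op name, the function receives the op node plus one incoming gradient per op output (hence batch_norm_grad's (op, dy, mean, var) signature), and it returns one gradient per op input. A self-contained sketch for a hypothetical op named "ScaleByGain", for illustration only:

    from tensorflow.python.framework import ops

    @ops.RegisterGradient('ScaleByGain')   # hypothetical op name
    def _scale_by_gain_grad(op, dy):
        gain = op.get_attr('gain')         # attrs ride along on the op node
        return dy * gain                   # one gradient per op input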
"tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/util/cuda_kernel_helper.h" -#include "tensorflow/core/util/padding.h" -#include "tensorflow/core/util/tensor_format.h" -#include "tensorflow/core/framework/common_shape_fns.h" - -using namespace tensorflow; -using GPUDevice = Eigen::GpuDevice; - -template -class ShiftConvOp : public OpKernel { -public: - explicit ShiftConvOp(OpKernelConstruction* context) : OpKernel(context), layout_(triton::dnn::NCHW) { - context->GetAttr("shift_h", &h_shift_h_); - context->GetAttr("shift_w", &h_shift_w_); - context->GetAttr("stride_h", &stride_h_); - context->GetAttr("stride_w", &stride_w_); - R_ = 3; - S_ = 3; - } - - void ExtractShapes(const Tensor &x, int64_t &C, int64_t &H, int64_t &W, int64_t &B) { - if(layout_ == triton::dnn::CHWN){ - C = x.dim_size(0); - H = x.dim_size(1); - W = x.dim_size(2); - B = x.dim_size(3); - } - else if(layout_ == triton::dnn::NCHW){ - B = x.dim_size(0); - C = x.dim_size(1); - H = x.dim_size(2); - W = x.dim_size(3); - } - else{ - throw std::runtime_error("unsupported layout"); - } - } - - void FillShapes(OpKernelContext* context, - int64_t &C, int64_t &H, int64_t &W, int64_t &B, int64_t &F, - const Tensor& tf_a, const Tensor& tf_b) { - if(OP == triton::dnn::WGRAD) { - int64_t Ha, Wa, Ba; - int64_t Hb, Wb, Bb; - ExtractShapes(tf_a, F, Ha, Wa, Ba); - ExtractShapes(tf_b, C, Hb, Wb, Bb); - OP_REQUIRES(context, Ha*stride_h_ == Hb, tensorflow::errors::InvalidArgument("operands must have the same image height")); - OP_REQUIRES(context, Wa*stride_w_ == Wb, tensorflow::errors::InvalidArgument("operands must have the same image width")); - OP_REQUIRES(context, Ba == Bb, tensorflow::errors::InvalidArgument("operands must have the same batch size")); - H = Hb; - W = Wb; - B = Bb; - } - else { - // shapes for a - int64_t Ca; - ExtractShapes(tf_a, Ca, H, W, B); - if(OP == triton::dnn::BPROP){ - H *= stride_h_; - W *= stride_w_; - } - // shapes for b - int64_t Cb = tf_b.dim_size(0); - F = tf_b.dim_size(1); - if(OP == triton::dnn::BPROP) - std::swap(Cb, F); - // checks - OP_REQUIRES(context, Ca == Cb, tensorflow::errors::InvalidArgument("operands must have the same number of channels")); - C = Ca; - if(OP == triton::dnn::BPROP) - std::swap(C, F); - } - } - - void Compute(OpKernelContext* context){ - // get device/stream - GPUDevice device = context->eigen_device(); - triton::driver::cu_stream sstream(device.stream(), false); - triton::driver::context* ctx = sstream.context(); - triton::driver::stream* stream = &sstream; - // get inputs - const Tensor& tf_a = context->input(0); - const Tensor& tf_b = context->input(1); - // shapes - int64_t C, H, W, B, F; - FillShapes(context, C, H, W, B, F, tf_a, tf_b); - int64_t D = 1, T = 1; - bool has_bias = false; - // shift offsets - int32_t* shift_h_data = h_shift_h_.flat().data(); - int32_t* shift_w_data = h_shift_w_.flat().data(); - // create configuration - triton::dnn::shift shift(B, C, D, H, W, T, R_, S_, F, - stride_h_, stride_w_, - shift_h_data, shift_w_data, - "half", "half", OP, has_bias, layout_); - - // shapes for c - std::vector c_shapes; - for(int32_t x: shift.c_shapes()) - c_shapes.push_back(x); - TensorShape out_shapes(c_shapes); - Tensor* tf_c = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, out_shapes, &tf_c)); - // return early if possible - if (out_shapes.num_elements() == 0) - return; - // matrix multiplication parameters - triton::driver::cu_buffer 
da(ctx, tf_a.tensor_data().size(), (CUdeviceptr)tf_a.tensor_data().data(), false); - triton::driver::cu_buffer db(ctx, tf_b.tensor_data().size(), (CUdeviceptr)tf_b.tensor_data().data(), false); - triton::driver::cu_buffer dc(ctx, tf_c->tensor_data().size(), (CUdeviceptr)tf_c->tensor_data().data(), false); - shift.enqueue(stream, {&da, &db, &dc}, triton::dnn::PARTIAL_TUNING); - } - -private: - Tensor h_shift_h_; - Tensor h_shift_w_; - int stride_h_; - int stride_w_; - int R_; - int S_; - triton::dnn::layout_t layout_; -}; - -REGISTER_KERNEL_BUILDER(Name("ShiftConv").Device(DEVICE_GPU), ShiftConvOp); -REGISTER_OP("ShiftConv") - .Input("a: float16") - .Input("b: float16") - .Attr("shift_h: tensor") - .Attr("shift_w: tensor") - .Attr("stride_h: int") - .Attr("stride_w: int") - .Output("c: float16"); - -REGISTER_KERNEL_BUILDER(Name("ShiftConvDx").Device(DEVICE_GPU), ShiftConvOp); -REGISTER_OP("ShiftConvDx") - .Input("a: float16") - .Input("b: float16") - .Attr("shift_h: tensor") - .Attr("shift_w: tensor") - .Attr("stride_h: int") - .Attr("stride_w: int") - .Output("c: float16"); - -REGISTER_KERNEL_BUILDER(Name("ShiftConvDw").Device(DEVICE_GPU), ShiftConvOp); -REGISTER_OP("ShiftConvDw") - .Input("a: float16") - .Input("b: float16") - .Attr("shift_h: tensor") - .Attr("shift_w: tensor") - .Attr("stride_h: int") - .Attr("stride_w: int") - .Output("c: float16"); - diff --git a/include/triton/runtime/arg.h b/include/triton/runtime/arg.h index 3f7131fbc..af55f4014 100644 --- a/include/triton/runtime/arg.h +++ b/include/triton/runtime/arg.h @@ -24,7 +24,7 @@ enum arg_type { BUFFER_T }; -size_t size_of(arg_type ty){ +inline size_t size_of(arg_type ty){ switch(ty){ case INT1_T: return 1; case INT8_T: return 1; @@ -39,7 +39,7 @@ size_t size_of(arg_type ty){ } } -bool is_int_type(arg_type ty){ +inline bool is_int_type(arg_type ty){ return ty == INT1_T || ty == INT8_T || ty == INT16_T || ty == INT32_T || ty == INT64_T; } diff --git a/lib/runtime/function.cpp b/lib/runtime/function.cpp index 24e66397b..034738c93 100644 --- a/lib/runtime/function.cpp +++ b/lib/runtime/function.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include "triton/codegen/selection/selection.h" #include "triton/runtime/function.h" #include "triton/lang/lang.h" @@ -260,174 +261,5 @@ void function::operator()(const std::vector& args, const grid_t& grid, driv return this->operator()(args, [&grid](const params_t&){ return grid; }, stream); } -std::string to_tf_ty(ir::type *ty) { - if(ty->is_integer_ty(1)) - return "bool"; - if(ty->is_integer_ty(8)) - return "int8"; - if(ty->is_integer_ty(16)) - return "int16"; - if(ty->is_integer_ty(32)) - return "int32"; - if(ty->is_integer_ty(64)) - return "int64"; - if(ty->is_half_ty()) - return "float16"; - if(ty->is_float_ty()) - return "float32"; - if(ty->is_double_ty()) - return "float64"; - if(ty->is_pointer_ty()) - return "Tensor"; - throw std::runtime_error("unknown type"); -} - -std::string ref_to_tf_ty(ir::type *ty) { - std::string res = to_tf_ty(ty); - if(ty->is_pointer_ty()) - res = "const " + res + "&"; - return res; -} - - -std::string function::make_tensorflow_src(const std::vector& outputs, const std::string& macro) { - std::unique_ptr ir = make_ir(ast_); - // extract function signature - ir::function* fn = ir->get_function_list().front(); - ir::function_type* fn_ty = fn->get_fn_type(); - // numberof arguments - size_t n_args = fn_ty->get_num_params(); - size_t n_outputs = outputs.size(); - // extract function name - std::string name = fn->get_name(); - std::string classname = 
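Two notes on the library changes above: the arg.h hunk marks the header-defined helpers inline, so including the header from several translation units no longer produces duplicate-symbol link errors; and to_tf_ty is a straight type table, rendered here in Python for readability (pointer-typed kernel arguments surface as TensorFlow Tensors, everything else as a scalar dtype):

    TRITON_TO_TF = {
        'int1':   'bool',
        'int8':   'int8',
        'int16':  'int16',
        'int32':  'int32',
        'int64':  'int64',
        'half':   'float16',
        'float':  'float32',
        'double': 'float64',
        'pointer': 'Tensor',
    }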
name + "Op"; - // extract argument name - std::vector arg_names; - for(ir::argument *arg: fn->args()) - arg_names.push_back(arg->get_name()); - // cached int to str - std::vector str_i; - for(size_t i = 0; i < fn_ty->get_num_params(); i++) - str_i.push_back(std::to_string(i)); - // index of tensors - std::vector ptr_idx; - for(unsigned i = 0; i < fn_ty->get_num_params(); i++) - if(fn_ty->get_param_ty(i)->is_pointer_ty()) - ptr_idx.push_back(i); - // extract tensorflow types - std::vector tf_tys; - std::transform(fn_ty->params_begin(), fn_ty->params_end(), std::back_inserter(tf_tys), to_tf_ty); - std::vector tf_cref_tys; - std::transform(fn_ty->params_begin(), fn_ty->params_end(), std::back_inserter(tf_cref_tys), ref_to_tf_ty); - - std::ostringstream oss; - - std::string result = R"( -#include "triton/driver/buffer.h" -#include "triton/driver/backend.h" -#include "triton/driver/stream.h" -#include "triton/runtime/function.h" - -#define EIGEN_USE_GPU -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/util/cuda_kernel_helper.h" -#include "tensorflow/core/util/padding.h" -#include "tensorflow/core/util/tensor_format.h" -#include "tensorflow/core/framework/common_shape_fns.h" - -using namespace tensorflow; -using GPUDevice = Eigen::GpuDevice; -namespace rt = triton::runtime; -namespace drv = triton::driver; - -std::string src = R"TTKERNSRC( )" + src_ + ")TTKERNSRC\";" + R"( - -class )" + classname + R"(: public OpKernel { - public: - explicit )" + classname + R"((OpKernelConstruction* context) - : OpKernel(context), fn_(src) { } - - void Compute(OpKernelContext* context){ - - // get device/stream - GPUDevice device = context->eigen_device(); - drv::cu_stream sstream(device.stream(), false); - drv::context* ctx = sstream.context(); - drv::stream* stream = &sstream; - - // extract inputs)"; -for(unsigned i = 0; i < n_args; i++){ - std::string suffix = ""; - std::string ty = tf_cref_tys[i]; - if(!fn_ty->get_param_ty(i)->is_pointer_ty()) - suffix = ".scalar<" + ty + ">()()"; - result += R"( - )" + ty + " " + arg_names[i] + " = context->input(" + str_i[i] + ")" + suffix + ";"; -} - -result += R"( - - // extract outputs)"; -for(unsigned i = 0; i < n_outputs; i++) - result += R"( - context->set_output()" + str_i[i] + ", " + arg_names[outputs[i]] + ");"; - -result += R"( - - // wrap tensors)"; -for(size_t i: ptr_idx) -result += R"( - drv::cu_buffer cu_)" + arg_names[i] + "(ctx, " + arg_names[i] + ".tensor_data().size(), (CUdeviceptr)" + arg_names[i] + R"(.tensor_data().data(), false);)"; - - -std::regex regex("#([a-zA-Z]([a-zA-Z]|[0-9])*)"); -std::string grid_str = std::regex_replace(macro, regex, "x.at(\"$1\")"); - -result += R"( - - // create launch grid; - auto grid = [&](const rt::params_t& x) { return rt::grid_t{)" + grid_str + R"(}; };)"; - -result += R"( - - // execute function - fn_({ - )"; -for(unsigned i = 0; i < n_args; i++){ - std::string arg = arg_names[i]; - if(fn_ty->get_param_ty(i)->is_pointer_ty()) - arg = "&cu_" + arg; - if(i > 0) - result += ", "; - result += arg; -} -result += R"( - }, grid, stream); - - } - -private: - rt::function fn_; -}; - -REGISTER_KERNEL_BUILDER(Name(")" + name + "\").Device(DEVICE_GPU), " + classname + R"(); - -REGISTER_OP(")" + name + "\")\n"; -for(size_t i = 0; i < tf_tys.size(); i++){ - bool is_output = std::find(outputs.begin(), outputs.end(), i) != outputs.end(); - std::string mode = is_output ? 
"Output" : "Input" ; - result += " ." + mode + "(\"" + arg_names[i] + ": " + tf_tys[i] + "\")\n"; -} -result += ";\n"; - - - return result; -} - - - - } } diff --git a/python/dist/triton-0.1-py3.6-linux-x86_64.egg b/python/dist/triton-0.1-py3.6-linux-x86_64.egg new file mode 100644 index 0000000000000000000000000000000000000000..87a0f96634233209f5f9fbc5ec0380012688b069 GIT binary patch literal 709047 zcmXVVV{j#1!|hBsv7JdK&crq*wr$(C%@fp;gY7H9U8D@jg9y zzP*1=*<>9pysbM-buv0!wLWEVuIgQw3KY)>{(3+rKo1N)=NNvUVn{AyWtn}?CM{tk z+I@Dxm3x!Xkh}Hi>No;?s37;gOmz{hh54<7`22KQiyox#8PbQfPMJ=7vs0lm&uDyaW&D4%IB1G)w1;Y z7bo`DQ-n|<&V%>YlAYPA_-A567f&@87Q0X2xRdrju0go65SsHx^G)+3x5?%*;}Mdr zOri@xyMM<-LlB0DxNUeLjnOgjaClCVG-1dF{d8vff-$2hMFc%@mxQG>xKPH}5mi0i zCi@n#RQ$w&>s-7^8KgTxQAm2`#pV%q?17|eB7zoJcy>_yMa0yb!k8z+b)&j9D6X0W zYXM<)xBkVc#gau526pvouWBQv3lE#p=+z>N`orwZ5J7EZ zi3NH>L`lMv@pC_5VhaqPB{}RkSY4yFAhUH??v6>3JUUtn!{#zForeoUg~^f+=-R0+ zccJSo;47h_E6!L|0_d4fh1*aCu)m;#sA!@FE700%Lcx;7NRhGArH|{)?lCjK-Kdj} z1Xn$s17iBn6ijxk^es=+9YuO?x2H+K#+i{NGxW^RF;wsaMtVq=%qhh$kUuug)i6}^qVcF^)Tu2>`eax z^oU{h-FEa1C+UpuLsvGi6MH7d_@V6ZtGWv@6(uEsl>VN`feH(t0yug9LB`*d)Db;S z!^Ynt2|_B8^_L(*MivxpDb?iTiJzLuFBOR)3~mtAjz!Nj`lnDRGP#Y*2gr{F(MgO5 ztSsz~anY+I40`QX)tZl+Vk(?VZFGVqG^E4i2a*I(juJF!iow^j1PjL(&9M`<9NuAi zBJB{;v|W;2_ z%0}1B!W?Tewo-Mu>I6H2ko6=E*JJUgrTpWU{s)FzgokuvveCL*8e$g`s# zgq}$kRS>QZTV_iu8e@w|&1adaA2=3|8cXVaoUa^6X=6${(J&a+cN904LTbpF;~|b` zyyvTY5~oPGEk4Z#o7{n!G_G9>wlge5)j-I^5mL;{ClEWZee=? zum6b1kv#B#G<)L>OE|hAz-D>R4pE*og1mZS4;~Ji6k~vc)~icT0@j2L1y4mPA^`r| zKe|c=BacLiUsQd?k!A+j2y21fQ>|&F7E-7(RM=cOp$)x!L7Fgb%;D(}Swa{!Vl2^c z&i1z*E=lCoFJiFWxIx@x8lWZ+w=aq&!!>}JonX*+tQl=8KL5tj!s4V+5*B-zy;gru zHdkS7weGmm2$@D158LhFRC0ye5a=1_D<_x`ww^+TA;g_WGNdQg-+T&<#peu5jlC3d zeO8NrF&pBYM*Vm_q#AZPO`Z$WeUNd!lu%El|)FD8DAO z;)Y`P==OtBtN2H_?DiCc5t|XPrQ0xF{)zTywlGNt{~`m-{3Baj)YS72Mhnvu51obr zTx-S&g0scAOCr@{ib&&4Qe=`J*cO@vQj!w8b78B*jwvQ@(*;&W)__x)WV+pjQ6ZU< z*8I|9AzcvJOr~3BV&f;32<*Ya3Ok`G=#~-P1HkgSuY;{iH>NmhQ%~~y$5?0nt0%dq zkxM%E@-xH;5Hi&y6^J0%`fVoEEJb=eP8uti@{bMq3sxX-@|2J)&=Ors5SEo4Emsz^ zudou)Y2?mG^@Q=3tEGjpq-Hockkmami8d_nj_%`m3PweoI$oHlfm&6Rnbcl#Mbq68 zq5KDaoLJrQ?sEku=i~ITNOkn{eV~v3O!OfqqRULX-4$4XCNkeh4D0N(+1U>;5SziL${um6kWQKs}$# z#FJn(k>Rq#0u3Wbww%IC=#|qvu#j@xD>wqR%5#kbgA3G`% z>Y$eRSW=F3h~bJZV&G;crY@fdAMPjtNvIf*r}oBhcbowt1V0HbkuWusl)_{uXsf`| zBbH1ehx}|@>Pg7zMk7o^VAm6LF8VweIK+T{i=k^46ikd^aLsl{MMzjD+7mFcD|SM{ zo7|PsMjB@jp$54@BZKXcceapfOJtY27ijQIc&|>7&cqlI{2RGzgEt z@Xi85D5}n4%1B|os$)_xy$qq~zp6aGWROEB*b@#nm$rh=gfjzcQU4pusu+U7$lVf@ zR#UY$%T!;Qr2&FX%;3;z7IktQB0@4r8K(;?d-D7{BldokK7ky+o9NK8&<%Pd`+IED zz>CqXv0Gv_(%?Wmkrd??ed1J*q_LgVg?A`HMPwCf%CPBx+dP95LP9D7IIDC*|N89V z@5e_D6xBh;L;E@C9!Y~N4I{GA1 zCRVxJ>0oINlXbC91$GDtbiFZ79p{9B4%KEV`4B~ zw}^50b#xMVqKqy==NR+Bk=0BRCL4lN*ke38h6I&L&XDf5uI(LJNq=LsvpAL~F?*u< zm*3Mf%p!>O=v#!V6k5j9KE~Gt z^v4tg4##?wgpylZyCxAxa!P(O6Oxc9BVy=PRu7mG2mt6(G$UzRMy5qe^l%>bhx8@d zn>frQaPIQyrcMYCVf|!3ZO?|6^)1my3}+}+DcC4m%2o3k3kB229r7p{y&3YeCJAx( zN(PR3=}}tUzPNrBeVSM{6j74fU6T`;1FNcha(A0A@y<6i%8JZBO|6Y zDaZekLe~=50Hpd$1d|?4AVm>qV!MZ?#t4|uDG92@1)`q_>z~}_HELqO8AG=XqcyRH z47b*V515`d!Gyly;oeoo{_Ti;lkkrrsY>pq5}hB8cQnp6EUry~<<<1Z<-<%3#cK7P zup{Jh%L|b%p%6J_Y&3kB#nSvMA=gIEVFep;9nLmrNRi!l9hG__u1ORI-yBV2f{Y%f zCy>?zb4Q|*{5BrO)g;X$CP8wZ=acnrWF{z`mxRX8l!tkXo2Nd-H+67)jAU${h1L_Q zE2HWZ(n!}|gFB}!3jNWx8%hrk9kS6I8Kxh`AnXm(O&lBQPb})k^wHHN5t%odFXB!6 zLoEi>O%{ZuQ5&li+MEu4FaW|7B+K6xl9a98b@J6>D_%AWq=*jGxYv>n5~iCfarsBE z{_ckuczA7=!ZEdR&@dyJPsH1u#RLqtQt zC7;DWht(({v~N(h?&k?Tl>_6x&3@mb;fnBay;D8)igYk_4Gxrh2pA@K zhU)q>e={kdVz_;O*y4Z~)%SISjrEtK!#4bfzW)yoRoGkTr;;oRRw~kGeJ6Tio2XaH zk3n7NxgJZ 
diff --git a/python/dist/triton-0.1-py3.6-linux-x86_64.egg b/python/dist/triton-0.1-py3.6-linux-x86_64.egg
new file mode 100644
index 0000000000000000000000000000000000000000..87a0f96634233209f5f9fbc5ec0380012688b069
GIT binary patch
literal 709047
(base85 payload of the 709047-byte built .egg artifact omitted)
zwL061I*eAD%&kP*c1ks~%|yp!yJDWTki75Xsh9X3clNx^#jh`F0qG*2X)d#CJl~c4 z)qXk{V}(0o0TMX7+?lr@0_Hkjt=3E~(7_Ysb{PT+OP^i^6TNpc%rCp-yG^M}VApQNh zU=*V9wK6+%W+R_~QBdBKoL*P9Ai;`%ZRGRMRfBG)tL@V|Jy}k!tA0Jd|8;+LwDT<6 zx+8ITOf%P&$mfOc$QxIiIN=2m%7r|^-pmuWHraX+smz?()lQXNO%`(WJeP*|PE3$|13N;#Xaw9rX2X zTQhp^x{CI9pH!Jy@+S_&f3*O zXmy17=bqZ$##+VnzL)VtrO~E^_R#cxMnH_Jt>CoEcG_IQpio_%?Bawg?MSH_SB~$8 z+t@x$m7lg1<{RG#l78m5_4#C(_k9DU#up6EmDc?v=%j~z6K5KY=ufmwvuPP;rp|no zxiqj;kio$$45#HccV(p_TAxFOznXKom!_dMe7~&nGe3-U%2SF^{NXtgEizVn@;UMs z(a&o-t>+n*;dQs=R~9?vG}73yS)XI|%pP;h_*lO^EYo8~ zrH?g=al%@zWWZ}$aKZ40(es=sYB6v=ELKRZjz@HQLVum{-jes6l7`Ed{EtHkevCCk z4_=Nr_i^~FR6Z0&%rzbS(VF(G%_!L+-~P3nQDR#vfMnj481(w|Ec7hukJqh3>J(mE z;8{%sPT5I5!SVie1H5-|8UdK z-VZC^R!z zu0P0;TqD!2ly2p=!g5OxMg3WxVOI25FAS?lnzYkZKw;LcXW0ZPR*F3dD4~J=_LouWjn*&d#a=_Gug{_ zgcG}kbKY7H@%^!R(Cy6fvY*c=*6O>eerArts7iO8ufx1#o|(?Ru#w?hHCaAz!)gDM z$-w-W*uI9Q!K^(b%!abpcfxpQT+&J>hVzd_&2~w>dca$`K3T!s`-lv=h5KsTujs2EZ<#LdAXJ5R$)PK_GZSB^Tl}Cg55%! zB2%kID@U5IU6oT&{+WZvq|?++sR)>g-&kIiTJ_KL*URU|F&<7#B4%Iu(liMnZ07=F zmKXGr$4gAS^=F)>dGB5VFPk(z7eA4=iFIH9O{7km{(P#XG!&L*Jc)?Z33L8j_|11f z%gDiRnKVX`B1DCaE=Gku#`UB*vyLbf*T(7E9!YO~_jqAtFE!qKKWa@XOrDX1tGcxQU&tPub6xI}OSu@&%jFOKs#K*S zk2CwI@MO~10gqZsPTM%hZq(iX+$m}k@R@7g@Rap|t3E(Y3wxfX^n326s`6aEUov8! z@GaNaToO!ic37;J@4K&jaLwB0*_L!xmAPIZ-B&dyxP&Q6TdA;V`jk~p@wRQ?#PX8w zKWms!HH& z!~UJqMTo~&R$%4s&bYG%r|3sjwpZWgoVLGcX#MSsnjH@5h$^L>fiB8N`%cge7&~U! zuzpcBkD^y>dHpP&uIcmKaA@LD7q=U$X1& zF0gjS+|N&rg-LN$84w1@{wKjXYWk#O>2P zDqP21sA9Wf+$88U_@l%mKEzVxj9bzV5C?YJ7tGpQ)botJHAr-$NKN66Sho&0tDrmN z^6xSa>XCWZewM$WkO2|slu9v5VyBE)6ES)!S^h8T5sibDBXyDBtkK)Yj(cCadV^nX z)bK`7=|&eDZXW zojT>5D^E;%s+R$`zf{`b;6kCU_pC^HaE5r-{?-|-i?%>Yh%D2w5_M*`Z)9jORc^R1 zmxMvoOYgo7x9W}#LQ|>X^D@-Al15<4pBe+@haPu_%*l^uhiocMI3R;%-c0G@c3N-N_dm;x=GI`rsb?)!MFUW}(qwS*^% zzT0iw>A)jVW@)KK?9q5ewN&B}%ZYP&aXb%dHM$@&$9}(h>6utl8F7m%%*DvHKHab) ztVU#oKJ(up!%873&E2bt#@m7XvA;^^ID8`%2pzFR4_YLO=8oJs zyN&-h?91K?t7SY{fX%4n=h6bsx{h~F4x!VpI8?%@ZTC(6ToD{G{gyG}2jf3~55IZ= zk$Oav^nCHK_I1i>c;BmfzM5;avB*)1gS+8729&HYRW{o_(3KW+wyf4s-A(?|ev(MC zW>VPYfOqKZ)%WeuF?N9rt-AAeXzlIQ}= zIG&fwYSAd>{V+1TGnyxF3EKT(^q{U&B=SY`>z0@1S&r0kidEbh{r-6`++Bx;S=}z0 zLx(HVif#BG#6&+(^eH@iN>e*j7wR=D7AdSkGwgpi3ka{Sz>!|#l&xH3{dVHmR8SIw zuWy>{^Rif9(QKo#MMf0Lt*NFP7U5sDN3L1;iwo*7J0_*58TO)X%Nk8I<&PYCKWL-Z zZLsR{$hB`c1W|iv;$<>mP>Jk#3}k&)soZLCQF zb;&L(Thmt)-iT*-NJojZRHxL=X48Z6zSUA#%DX!E)-pN2-sFuo7W&w98gSe5jiBAi zqI;TvAFG~Z>zgPYy^}??Q>_0@Fx`OF$@(x-E~;C&%R(l=D*tDZ)VZS8mq8@ zyt8_AUisuen=I7&mXu!?opsI+S3FREj1P?pC#4c%!o8z{Z_L18murmDkw! 
zU`V;p-qkO1qp;}aEz6GrBDX9wTBAKW@~5XP$07GExwGw@LU#5y8y*;huWKiys`1kToPf$2s|8wRtZ19hwJ4&|Cm zX^G|{xebw7qA;BcopBKw$Gxph6V?`x?8RM%*E(pjF1~~v*Ip}*86GPUuQ*##yaqO^ zTI@jfnV0X%PJ=90ws&gpj9_oB&094*yW4E4K%OwSl|Z&F=*Ro28IF0)x}rq>Yg!uj z%sQ9pHx8Pl5TSW>+3)jWLnhN4Gd}gRWhimx>Uy|G+?|yMQ!qUDcz8MT zE7L>?b$oI2!=k!+*eZNzZxamA1{34$}H=bXoygAp{-$YC6G%bs68b}~wCSSnKf={p^9Vw1=1 zlK!vgh0=JUwklTf;TcWb?53LRf_<`qN7qnAisPVE)&%`^26Io+GT+i}L{aM8I*R%5 zw0Ha9t;VOHQ(#~XwI5?L+X4YgY9CMDZ0Y39&Ncm`VPN)A8~z34asihl<1`fRUFEj0i8iN~1-5R|zWO{G81>V?93*09Tr`oWd_{brsnDW%`( zDn{;E84mZ$C!NkNe`qD%x$`^CwcxkiomYd0%S=+utGAhqEpa^$@OSM-)J+{D$cv6< z3QTB9lao0gz3`TAkIxSBYm+XU3U#N1htjdQ=+!w7!cHFwKROsNiD2iqc3&ZWh4PaV zhzv0ke-Rfnde>>^I5v{JF!H|QpOQjHp@rBTnHC59MfRsbb4JTeZ!-D%24$Rr);n0v zP8Q#%*H-uV-b~o_SZ0qm$p1O0_%=WI!e;Z7s6_awu3S~achF$dN-K%CRV44ZZEL7y z=)!xwOUlHP)j#W$X4|4$<0HRZmv}rf@M6#luNNDCaLX0jvy;>)UMi2Tp}n|Hd8@7D&2X z$Jc1`wcMx1NxwPf7@nRMJ`hQC_RRNCz)G# zVs}hfb3f6x;_K1XZsSPR98X~V7`*OwtCr{UmZcig!S9BH7q8eq^gk3mPe_0ieiXIh zKOK^{+RJ254`!Nek#>8*MdLG<}PokvYOQP5_{FO+UbwWcvsWtxK)?&ty@F1 z=#S&c0Y6#lR)t;19i2=5eb&j2-sz#*cx<~{{%0>Dn`OgxDvnP=hr;irwJNq7GZQB@ z=WwO}iJS58e1WlPj5!_>-v#EeT8S(9(5h|HY0cM_rzBZfq>SxfN;dOUsgE81^W3@1 zIL+(zrBiG2BP`}m1)MH~0WmjLyH7dAtmyrO#}zSmk9_!SKIUZLU6YF=YPF<~rP6d- zt7&N2?3K{+m&=Ln5EJRl>I>=d*+moQL7Mxl{>eqpD}LUdGPQFmHVmZwP(7nXZG%u7 z-=~RN6W~=Oe#@sva-s4d9c735`}c$J4^v_1V$Nn4Uv5ZLS@yl2!SL9Tx&`q5BF9qL zGi(~h(%r*Lo^`{;^(??4b$-3(w6 z>h$|6la*|Tqh3ZZk7x67i>Xcum+A}SS?dhe=2fy?Hb@;_`McQQ?vr$oY497iKT=v*Hs+LG|JHFGeVbhWV~C-`z@A(TT~*%+D6<6AG2` z-~KB1_`xd7%^;3z_+F(f9b7AxB6@I!uRXUac~GJ@s!CkCSJj}Hj?jABGYOhflfaRy zz@7U(<;bYKHn+x=*F{7`aAIzHA<5$_%9-QOs`7Szzh6W& zT6K$zCZV6i;R%g6|LbO#>Pj$Y9GGC;mG7l#PUOGu6>dqlFdlJOx@oSBYQgAp~BYcUf zj!l@pa>n~Ls4rNPz$684TVfa{mamzvf4?~Qp5D7L^+WEA?D72v0s3$2M@ORy$re4|sbVB4`ft?$>-_MCq;1QKSXLb#ZnscBIQibmLEEKav6tB z{I8$2B5?TA4Bgcgvn%qY~%wlUBMwOOo~uz97&bvh9JMIC=z zHAk20aO5twS}7z=*}<1uzBeu+v4!j8==PVt^A}&g^^e?I zK3OnGs44oy%KiL`V)U-?Oi}vhmgJ&`>zhZ+W{FSE(p#v_XtT`vJk$EnRio(5uV)g} z-RjW8buA{QJc|Uelw4fa*tHyuAfRH$xVAN zC*YxD`DjyT^w6BQ_A2q`!f$8M+x-Ub9ja1~d{vuUVm_)WvRkhw253wk4kRjZAMM}Q zI5wp8N#D+>-%I-%Yfk3?;!OeVA`QzgWu8kcwZ6NIWB`dm%&;vOU-a?nX7@-`?*Gdd z?$MwV=m`u|c_JPq=M#gKxCq_mRWBGx!>x}GT%4OHwLvvAAbQ0owFTND3~>CHlUAJO zD|15*XrLzUdVO3p^3d~W#QZ`Xgi$L#nJNIk-4w8DBhAHuO*SBH#2xff!%1?3~23N z>eZVZFyiz`adLzz7WCzm8%6U?9^Q@SeZv|Iy@i@EU&ZMDHAjqm@W1O*Of?_cu1$!o zY}Y1kIay+c(jjbuVPrSGvH-oBkzwu!8xTS+7~y~*Dzl;m z8iKq64%rB>h3zzixY+hXB5XqYAtAQ79ZraG!i)66NqtNa=ihOpIMzv6JKM8sN-X;% zbP)aO4W0P(&l?OP#^6OzI(e)37l`j6rLEuZ;K_r;N+W=VDi<%@m0;npYz?2SO zbK}%q638P2{2!D7Ar5>CD|rJGMvaob@oHpy2?2hl3UDg;0Y*6pCW3m}4pT-*6SN7S zG70cf!H2}y-|gfCup6xAv>4MMb2^MdkTeO7x*f)c@+O4nBF+ih1X1S%%l9$FLGXK+ zmqAcNL>vKblN6`e4tVa8VDYukAiU)??z|n7dGpb4 z8b^pTHoPLofQIHl?RdkRWB;3D{D00vZ$7HyW9{(-P-TQ?5L6AoiDL|glVA)e1K1Ge zyZ}w{-)`a`P6A;-!1+-A1bCic7%>WJi~tBh6$~f$QAQjR2IwJZ2ymrD01Lqk7|x*F z`+!dY6MG5pA?+Xnj9xH^5R-c2(KHeOX>oflA@&WTAdn8|r-%T@aN|!9-;F+wULWY3-2lMgoYhr8xm|>p{0SuxU1pU!|PH1)yLx4ylh1^1A z62T}>MQ>n2ZeYS6V44Z=_W+of5lXDWPrG0M42TYJDd;``6M8161h@dke*;vwW0N?j zJnBT7pbbbYLdAO+-5`JpcMwn&M3FmqzM}?cUM6N> z@a9cffOY02Zdk=G(SI+)FqwG}LbIm7=w3ijfPnqbj0J^Z7-Y0~>L?(1+Fos9?14=z zAR?ItG62=nOPE2#sF~$eAR7yw_JWD95$)u6@mk-Z|AKE{zr-!Pz~6=OoX}j{!rp6V zGiA68_q)m)1x+qc0WF|uqxgkqx#qM1U!H+05yU87BItEmu)LK8n9wZaUpI3|@C=%Kkdy*Hv9*SiEz4wQJi zb_8sMpyfo@aTZ8s;yKzUI!1UVPH<-}O%AdvF8wm_cX1)T5%_QBlB^T*tk|3RyJgBEB-hRUV$*>)Geso$UljG7?8 z1JMRFo#IA2bJ_u~f-d2o;kQtD4k!HMz#Ez3GUg@6WX>txYZtZKT_b?XW||XVVSUqm zggC48YjTWW$OwBo-pdwGh|{}ib!Y>$!cPt2h_SE4uV_$ab=o8-r<;~|&W$)g!OU+O zfChmMAZlAHV(lOx$q8`V&JYSz?#NBIDmPb*+)^ANfIQ6{fk;+_-@uiw@)c>9Ccz1` 
zm)jnj2SL-?&BM3xm4Ia8sl>|5iLk>UhGYI9yIo=iLR?&%Y7I09zZe$@=s%nu3#0^# z1@DcV_e&CBAUAe-S6#@okI1iM0dMY--iUtI95je6@@SbH)1UhQ2v_}Qd72N6B)Q=P zBhz}Ugjnv5%|JjhcGkt(p&0xs;N+2M_Z)%%?1&5wa6uRB?(y>=@0*HaHjRU+bi<@? z!X0&^?dAhFvOnRhK#rJe*#Mr7^L@pF&`FYLKph4u^T+Y}+jp=TqiBSEw(A(7 zQz2=>7-dSxLYFNGW$ zqnhclrj$6qE^tTg`~O&As(w|EuoJ>i{a;X@8_mN=MU7EW^w>gDV5~#HaY7hQN?b50 z)UiYR+VJLDJxa5B1OTXv6gm?Esz3mv@X}+`NkP|vsb}E-615$G8arlQStCt}n+(al zq!Hd9`fp&;7|mzi#25S?6-)}V>i|Xxr^j~0y+5(~-~N{dH2;?%npbRf{5Sl-)pcwApCs0W*(Xf=mz>$8_qVz6ey4d=%}YvL4k^^S0|<|{ zFs9hN?qmXH#b18o&;gj`Htavq8wmkb{fDfa>i(E!`fKd30w90>SLWbyYDBT_bH_q! zP#UhR<*dZ(!*o*EReK_iI2HyJ2-@6okRIYt9YYiNs~w=0fBg{Vx{MKO-vMd>_o#y` zdd#myqdxHmaLK|kO2MEM6dsvG3QYUu&*7k_M;x-Hs8enTWqq)JN#uhMwsJ&#ZfKpc~m&;7931&?2jz5R{wCR!QNo3%-6ezAhgFr||*zP*Srk?n;<> zb|IEM))31X=-6m)yB1NTjrW}cg|#cr%_257wQErhxb2P}KZF(@C60X?tLtUm17=z* zX&l+5V}{C~HqyesT)s&{7cP*Umo*YY4azrUn%xz5Q7~ZzL=5IOBo0*~lUs_}#X?SN z?;Pk!-#M@up$>qwW+P$U@clndjj>CZ7e}S?pFWmnC^mS*o}heB%wj>&#}nb;-U}gl z%a5XcKHdAF{ujo?xvxYKGUiy@peVGP4*lN>$^C&dO4&A(+7RS?isM4mE&2YuXo zj3t(H(8D+8NF%QqMSn9bMoE}uppT*rY+IfpYZ=Q&e5`shB>C$ z9vg?ONf*tGheJMdZs{=p7aJ`$EvH?Bz5_l3Q=$Idwdu$1ONeDjVO^R9ai}zsJI;Do zyQf)vISJhs%>c?M#xs@=`2_YbNi1ifr=#obS|pH5H@!i7DA@+G>Z8+aTjY@`jG{gP zA4U6pRC}Dn&C<{tH#m1A2GJL$a^dh^vH+3&dX)gOWm%Vxq77~zfy;ZBLj5}hiT4Rx z(?{I-_eZ3#CLmr&-o9#p-8`}K1lc)jKfKdDP!uON@_ZdQ{K7jY`@E-yJ8RyzqZGdI6*rlh~a_kZrRuig?E@~X9u?G4a!Wf_ZSZ%(L`?o&cfB{!_ zO^jgxb{b^UfGLSW7-0;77!sT-$gJ-|6ya-a8ms%SIpRdd9Bhtf8az1k=$fiwxrh5(B#= z`L|vHsoYC-P}@s$u+F?`vE%-#Cn}CqxiwPnFKi3L)yJNx7#<3gU5m=gDN^}%6L+1!j zxlRCg?*NLNxu`R3P4P__T)FjE2g4^&c`tYqD~(`q&)P`>GJ1gQd^wuIM6rRvu0;r7 zZh0R-?@aOkPb|U=?i>Jd9r4lYagtb$2gnS>QsVrgye})~G&x!itiDhMHkH%kR%A#r8l>xy&1hkwp%?t^Bo*xQ7Aw|k(- z>$~PDBXdRmpe{Ur6(_h!r&yJP zPD$4AD;EXG)8(MUp;N~rcfZ_@a)gWvW_WEXw%*7558L`vFF!?DEe4#zCXC4(;Z6ei zeeHm;2dI;|XYf7HQ`9V)VU-)xdltH)KAywhseX~Q?!&$Lod|s9O^>fWXA5|FRN4bW z%RaydU+RZLwaln|?w==R?2?^dMoZhYgdt&8cqUv8atTwo4AN#^l;elNMoOQa{*p$L z@=_nzFdnYRK)K2xSgnN-=1sR9)!R5ec<(1HYwPKwe#~n=$j1 zOTUF=Atf=2Cgu9rTfOAy&}`o5dR>qls#JC)8&j-UzS8_2ifUwrF+gX$#3BEzU!ld8 zwP)+s)HmsnqHpj~Xz6{?ZqT|+0fsdzTpYOZsE!7q6-rNJ&Z1V0lk|JHL;aD^bL}gb z8Pe@Iu0!P}ZSps?|dd8+j3@OYA9`P~g@s}|B*pD#F zLRZ~X44#{^((mKcBPYI`eWUa_PaD##0VUVFk^7f7g(MZIHs{tYAf$4U%?`9jP#h?{ zP0`Y}E!@a`#>4)bYO6IeiH}aRX%R2p;gU>+u&~&PC@*ke!c58(0H{-1DZ|tM@EOET^CWnbLT4Q|43A z{n0blEs97@M(~i&tDa_wa-Fff-^ZngNgOE8Ksm8(;l25F@KjJ2FJb0u zQP}2|+c=GW)aGsFe7^dsA%_wa1TR;^D$mfZf>F)7actIC_raCd5j5V|2k@><#egrn z9cV#^a{8mdfIgf;!2jUw+rOH)*1v1DQbkQIXjCpa)wVQJO)JN8&z#m;+ER^5Ewu<~ z>+L9NL_}^wMny$INv#)Z1=7|FZK*^LDhMINEkvP`R@791Oi08i5hg=~Ovq&BeRBG) zcYW9U2Yi3%vTN@>d-i@VpXWM6Bv(KSY^P)fyHJVGl$j%@4v|*$J@hwy9xW|J^0{*2 zY_LM&lB3@{oV2+7E4Lp8zlurUDz$E5-Zp40@Qv_c(n^X_24X&8zSPB1o{Jo<>$F;P zGpj8p=~JVer9tfM(`Vt@q#$V<7N4?=&I4;0u-4}&{{eSczl#mgOmNNIiQI3uiWuoo z?jGckI+K~9?64o@&$fT>8@AkqTwnNI#!$|Je#Jw_uFRxH$_mrdaokPoiqWfo_zI;>8A2=bcS!*lU1eeX+HH$Z5Vu{o>BGhA z%psA|eV!CnC**i!%aNofM^7My`uqEdRNP1W!2R6F8rgYex=cb?y?eTr4i+}=UeBY73SG;9S6uDso| zmUlopn0GQw(y)Jco3Tb3^oFWr6)SX~wG)3-RW>?z8Hc0CL&upA2~z#2PFd>OEVi0Q zMlmxvre+>BQX^6A+@E_T6Fkd4TBuc-rVirFwv9 zYoFa+RkM=Kvxe+wc(%J;bO|$BB4krlkJ4wRV_o$ZmDdV3%WSdbSodZ6$q7;y7q$Fd zP0qXX24^Y>so)J2e-%4JcgDV3J%RhvJ!52z=seR_)nP&hHk=^tWxSmgV6Au=Z*pzD zf<2qx9$v4UHMpta`{Bd~SU+*K_kiGO%mle)B&Qv*){QThj7)4tp2*jfZ*qqKC9p5C zw;B!%cSrM15&VXe!y#j_TFHoWy7J$7mh6wob7+}o;Etf7Xm~^}p;ng`9Ko8%_~?;1 z@+^aIE$9mV!!6{32HW;ICNi6q7W*rm&I{tl+7r@JH#XfV01>E+ESUaOm5M_DzxUdGa@nE#z6_ z`>u>_yclFfitGZj``S7xEV<@@(&Bi1P|{E~Y*8;n1;Bg3L&qHUyyXn-93_Thes+G& zC5!Y{L#_~MW(CYR-Qw4~*ek~2++V%4batIpr@Zfyfh>#c*&kC_wCapQRkD)JaVOso 
zSpl41Cl>VC#u{2mhBfWZMAZd`s*3+9_6NfXb~5mYS5=}DysG+QEa}$V3{biMMWPZCdP5DckG*cY_b~~z5kz%#>(Lrma z4dugl?X(w&4tXN8&Uw`&cO%PP=c#E!i8;RghP1$`c(XN*nKqE*Hy8>yCHMk+%=

    >uWK5dXTlo z*o!@OJ}>RwJ>Bwdj-X%HEzw#HZXtHeHnMPzo&Sb(KVpvBE>@&`;yPR<`HER7`jOcn z4l(P}n4L_P?#$oHbJg-L2-~;A?#Rx^**oy(na`A6t~^7kko(pBAg$q9R6F)KeJ)!! zxJ6x-^Vco14Adv}W=-*EzVQ=R$6%Z`JQL%gb&fBd$nJCtvHHQ+hO9Y0{n3D?nUPid z1s2CV0FQ!U?io7FR618_jILAkj)6^0+VS!$p3xrV$b0SB6Y?LR+lPjWqU43-V%-w5Ce&%wf_KS8%h0IdjkM_LZ zOWX>eX2Q?B1JC$Rj>?0(Pt)s$(hm{+?A9yc&pvCHTo7ML<6Q9Y(II`lsNY>}Pxq9W zvCt{y{^ctN^aBQY5aHKvAQuC(|Em&hCi8p4>Wt>od?RCC_HIOXC~1Y5AF@onsqwkDvfW`eE@+ zXFPe1nFnCCAlYz`w8^KT__Gwf!kOKzAF7CRZ30H&O>@%`j<|rwq#fN_>lzFj||zsb#WIA zt5{xDcq_6K6-Wi5e&+Gf6GYSK%g!YYhle*5hndN5WTD`4yf{#8{J{0PHQC|0-7Vz4 z1#6cEmP^*7I}FmX%vBbIor}HTTp^cQo_@)0_vSe3Yfo#XuIENeS?LODSCic5zV({t{nYjAMz!qd3V z$BKaWGON>{YaDq%mv!pV#0%^uO~o?$W3Za>VDAZ1&Zj5c^FjCPgXq^w z^X@@k8tiOF6(d-GKY};x~;jXnuq#I&etvQgSC@VRncom|CFNyTMxS5&EEKzn* zNS2@_e79zTyu`hyueTUC5w5@joE+n$dAh^D8=S+YkouGWqb{%;t*w_)&ZyD>P*=?>Xfoxj%icRKE!2DU+zPYl#B#5o@Q{ zYox2`H>?g*e641R^sXy|o>=AAl;~_2PiJ>3n~2?nOpiMz+kauj=EP%(0c^EfW?J`~ zlCD|nT!>1n<4t9qSc`9-Seuz?43aM}To&NB(tfV|xnL;wus*ZrCGKWk4h0R+t)h%x zKe@`Xhh8{ZX^M|nsNhFS0`TV1V7AmvnWd4&0=z9dwNdgK@qzl5^-Tt~g|M$$ufHJJ zoI3@-pHWY5EbcHxkExFkI;!4&w^tX8m)mX^C0-96M;JY2_KSm}vM5ta-oiG;HMyBL zmU~pTF}4+IJxK>;Hu1~!`v9|*z>X0#xQTm8UhQv?nCrWVBE2bMBn!taS-;>0`g5fX ze(N_8>RN1y#jiB{kaQ50BNZ^SWvD=be2LFy-y%wanYrD^>E5B5gX9x(F`3OZQ@hYZ zdLcb|D4_t~it#l+yXTvFtKwRaXW6r*w{i~2hU9DMlkj%J9bFZzd(TbywI&8U$mR$t8s%8;cmG3?-ky+vKkzxMS*~l&o7O$d08HS1UcE zC#1K>4q`U;XIjvp=u3*8R!j`k2FQzS2kkm0xkYjX5#aZn7fqS>1&0*FQRa0&E6eDy z(YU_wdfqGYc~}h}xtaYY-8X7A`P9m-Ukn9w3flN-EW+mFsdSvUASaN>-y?hKnSk8& z1lp&f3Z6XLdVM0&<2=7nJx%0v`f_C+o;PaFYzv^a4{vFb?FgJHcmyHrYtFq*nY+ps zi`#ccXeu4DY{lL2kPf!?hD_k<-TTZPk>&doSHz~=TER5=HOo2roD5i>LtMsY# zXjNjfDlH|PJ#8WB$tiJay=wuzd6Z{b4LIhE!1;oHj{rZQmp10!<~Bsn#9Cax)ANdR zH>&+rX7m(5I9B<4)-p&urtB4LgT9Y5jui@WDCQ$HOyw(o2l9kuT4@mWVD~@RbNUW? zi$*sY?{fBCo04_a_5+n%=5s)jmLias4E~E*)E$y_-R&{sj)pJjpNcC?T~9-FymWcA z{8y4}G-OKim}NRY>SZg>B&s#OM6|n~lQ-zK^vA_+Q^1bWY0Oru&KOba?2V~4T%9_U z9LT*wEGZOOQ97VjAI%mcmME$EzO9^kaHM8E6c%29bUb7$Hb6dN!ECY)()qR7J4Bid zJ5;|zCN(Jx5=zZUKF(opo+HwmmC|Qzwt34oZQk-hxJ6sDp%ZJfc}o>TNBGP2rQH8m z`DF04L;3$!X0HDqD~+ck|67bmZ;K9tq{;32MOMUe)SKOswH1+_J<2esJe)cbUbA7i zUlaR)`Wg+h23S)69Qw6Qb&U(eyXCK-+pKp@;oXvmZ98P=maIxCX^==!0Tye5|sRBf62p zqIK9j&2ihC_TC=ZRK%Ef@pR%ZVDsa;k7p?@f6|_zc}KBrm_I86Fiux%=&CArOKM)# zdhw5tRG4F^{-gYQYo0IVe#$i&c9zG;FWK_!c|GBi@J`zy)j-`8^%2LKqpBv?Z%v|% z)-{pgr$uY9U+lJ$*UyId@R@NVWc_))YT zzAtgd6*gOi=k8Gbh#uE%(p*W2hg1zw$la5FfgbVv`y(^Io)^s4LJ|II@sd7weff)+F>MnQ+@5JtUEdz0G>kpuiQJXq zC;mMRMHeZneee#~E2!e$F60AL00se&R)nsR(@`-1v z@?mX^aiH$IBm8B8p6EBQ0A)o{#=pS#;`-zx^firSw)D&B!i+a^kbd4s!*P_I_cQVu zy^FB4)qU7-Vz_bzB+RCkVMU~kUTl1rHwn3wwoOgs#n=Nb7W`HTJ}1(h&8O4Zg%(i) zl1pisyr#q}o{z9cU7gj43w@H#dxrA_gOo=OjhMHcIIwrK3P@C0_VyUWf+h@O`| zPM<3O4Nn*%hL}&l&n2TZ4TC$FELF6zkNN!G;DF}@SK^6z=FmG`zDSEa3=JUCKW=~! z1H0_>K?~gh77h?r+PZs=?dh{E?ScV1I)&7P)46L)CNevnsz=vX0Y-eFA!kXuoy*M? 
zF7ao~3;OY%ojK9{mC50Im5-TNCctp4&#|^*3F1u=N`8^$(JPdeS;YMg+o6&O)pr>e zAZAcK(}2;GKcExDHRkT?#?zitBOkY`26J|!vkjBXxzXf{#2x3JOZq(!%J^JV(p%lx zc5NB&AofVT=3a;G;t8zRDKs^()6D#+h53@S%rZh^IVs@JL^1H`|vDncE*f z7yZUx#b%J!tO|WE-CNatUb#kemPyvcO;HZgi`Ywe1zkC+GLyALnU3;^=})YaEFIZF z)Qn`GCXW;n9_z#v7Ux)=&N{|MFq^HiuTxJsr&zK{y;gQ7N@Y?Y8YWl~f zhW}A7FdhRV;M|~Izwr^XUVUNQZ&pr>?a&@-y6&DQ&}J<5l2_Wu?{HclA|I)M<> zyUZ)rWQ%HzB!W#6^G&f;B~92)^d%1bQcH45$&s3kP~2{qEmCLhfTCb(n|qP*tuRMq z7}UZG3w)ddM@sB5G?JYbJetw)s=UyB#Z**_ysY_oP}1R6=S!T~KJc;WD=DTr8JcO} zn-t&%V%yz&ZHw>86H#AqUlYMz$js~jjg)fvvd4K-luy*pGhewcQ)dG+TjH*ZUnEZB z-KOkI@{Q{0NVn@f`k%V{)JTxCDWtY4(lSVYQCtF4%v>Q)uoe*TKP%-KknxJ|3~edd z9W+@-)(B=G?KuY&cIPzwcAnBcSt+5W24=RY?Ahb6TWpKZM5p=xE7$-7-Wls>tz-_H ztT>G6$gD82k#PRMQbHZJMz}q!*?qrQivWruEo3z248N9S zpV+;CUQp$3<4p2q`T&HZMN!)iDXqDB>d#`LFBM8(eVXub_yhO+6d5;>D4A%Rq4^bu zkB2)2jf!eSfOJvsqvLg{_S2m{jRmwO01Z{*_DNN7Z6)cki{v$yJG@m!xeTp^dR;n- z__7CVMU=Zo=||ky{XBcx^9^krMNE1(D`Gyk7MahMRzOf9&&euqe#ER*D(oSpssq?y zgs+A-@~a=$>e>pa`3gXG<*S-A`ZaXtoyz9XaoGLrbM(q8Nh`54Z(K?rR1%21YDzFD zXyjk+{SY&75&f7gZwlT^O=qM47ajF7o~GFCvJmvzn`|MZ>_Te6HwDL$zjctSdD%851 zX@TQZ=b_^2(=9k4HM2+Q%|=-^+uJo7!hSO-VO}#>OqDKX*lsd@>uvkk0N&LLiYY@U z(auGklBt@#@GFAOsN%+mGC&r6gUn)xBL?n7tRuFa8E_&XN|^YaRoLHWerq($$poz+?9z+=0coVO^-03nmnx1T%Um0o2~J0Abl!3&Sy$!9ov|qB7ly(V zhMZoCxL>XlT~h`ky}RcaACtcFGwx-5B@qHs+JhNlqA#<%To*?N7Su$i9LW?dGOf-k zb9`?fpC?_VGZ{a06+rH;hK96#nsmD&tnr01`V3oaYKi6O|7haIi+1%~T(?8i8dop! z(*&m+Bu(;VbZW6}gL>$2Q^UUDfCu{H)L6(w=lpi=o~>AFVO|;w8=2EyQzr_Q$13sh zf&uhZW`$MJw^$?j!xFtPAK&5@D8H_Mk*jo{-(h?wQ<{KE#G9tezjd6VkX8D4ssJ)H z#AD|i<$!z-%^M0itlWg39*Pk#e6eH!TRs@*$Pka$|Clqcza$-ch{uaJ`LahHFWDtf zI#CT6*U?oiN2Wn&XXY?sZ zJDl0UdEKz%`MFaa*0PN$Mf#n+Z+rDhe#CK}vi4vvV}E4S(yw(Ybsg=urc?!%C*XU%2X_Z{TaksHx^S&Y^y4rOw)N&`sF2PeZ`fOL>FUja&G*RiB2Rn`N622n?~JX>R`+CE02(qnJG7!v$dNVaR6499Pc@Ecqxfhh$`O0(&@Vbe8#Aw#o2=2r`65fiG;nzR&~nQQ z^eRow4ctZ^a829s?mW#;j`eoGJlS;3HQUGbjzLvh;ts(pjHr{>D!9~pNV(P1R8q~F zj^C`|<2NZW^M!T%CG!4yLwMgNuoVJS_7C~NG%b?OjbnVRhx_!k3W4SqTNJgV zr=WdUqnv|vj-=7&#Y*$iYvJ~UQs)8X7G#FJ(w1f?)kE~E(L_^Gq-r9WCPw5cNP6{; zhAtZF+TfmmYvr4n7o8Q&Jag`+kKAW{YJUin-p%l%2Zj<4D$UF|=AVGt0o`GbwARgy zI?9z~Ujuw}sAZrEL&4n5j3N4^;;sXRzZPnt9nopZG&Rtx#ott?jKp;&y`sS#?@{4Q zP?J8>I7fbDa4H&a_3g_XbMF)NYnIR#JAIo~J3WiV#_vEYrTv)IqWquFn|+ z)OZ2?&K6{UVMwAC?+<^ly%pcx@O%hUEEE@mHHeaAJ(lzDjYS`j6kwYX{tN% zmgGsA)oK#axrJa9f6;OSbzd@b48QbA+%bm)h2iFKS9Fn}!@9Xmu(O;8GV+Rv8+P(B zJ)4Av5J^Yzn8p`k>9+{qBdRA*O)IiIvp6G={qkNdgm(JfYk9~mIl{anhRWd{PYnG) z6B2~o&OQTZjx`oF4ADbFioUc`X#u{t@r8?7dksN&Pg)8STimr*^@K@c#<%k?iKxzN zn7!tD@p(FBDEA;~aWA&(P@Zp!H7ihFOCHtY{T72MY;9t{`YV^`lhtjI*>s+AUh%Ti5=57#=fsE$n6 z@g6dXrt|lF)QxG`jI3C1E?S|PDeZ8sH8~peme@{PBfg8BDAs#s+XK+zQAdPeywXg+ zsVTPwQm@^t5pdO>%TH@A?SoLM)wP^9%Lyt;&9f+?6>|^+V8|LV_FBp#(Emzz7`WlR z)O+3W^!or!eSsCT1r~CV=78l=RuS;_YNR}xJ?1`Owyn?va1D-Fdx0zHX^XCI=v&VZ z!PmG6*dJgGf3dYwBelp0vu=&Ve(d4u;*#senGn53p;6Ag^HHDfZn|z1@7t8k>AFkx z`S{Z;h@c!<)Ob&?u4ebz=eT-H4&5+X-G@XI-TKYBJdv~EcUlK&B1yFLMWVw~a38Ts zJN?=aD=R%BeVX->e5&F66y$r$2}|mAa z9*o^^Dd}BSKQrP=c~0Hwd5c<7Dk+wcb(5HEL#8R3@(eat+Jl)eLV8g86^F3DI*R_< zWFe}~bPaMkes$iy5_+vdu)Y3ynghXl*ZJd?nI?zS9X>r<#m3-sZ7#_~l&aue-4 zI^IHb%k<*;%K$<65u`uSmRD&H2+}nrCF2M6X={1hMMDtC+I1^3L58r+rDOWoe9rmXUXwFXlx?l@)bLg zsiToS9nNW%Q1}aKGBY`iJz*12L^SsjVzAC()E3pE(%)0%0H zF<6}ST!4ImvSN}d6CH>3roG0*k_eqK6qZl4LobymgLuT{)V*4BpJh}c2c}k7V%BrB zY~dy8>~r#HovX2fVD4z9jCDzsyW=f5g;7aB{0xHC{CAJmk~_nWJ*lx2b7N z3XxQx>ht)>YGU(6JJc_WGQ_XS=Nc*>42$*V?kH>$(kTB_-RaWPTf4#i9!lIpew>#I z86Gjhe&YzCLdsOd((QSTaZ&7{#Oulhw!BAf_G@N_!PlgSOgT($Z|6ucG}#>3NgIP1*D4_aOBTjSO1OlJ}hSCcBUs=hU}`??9r| 
zIyRk2bzl7)ul~uVq9#GZP)&p=2)k$fjQON9DNTnybtTk3&8)R{>^;o4z+*XUGOxVE%KbAM@#i{)g4w} zbftBAAM#gutI{DaMjpuz((iVPnk&-ELn-p9vk_cjKh9|V3=JZ=NBs_8_B^R*4Eak{ zOyq}yGu+vV?CSj|q_}1?6P@?lxF=B&sKgo2z~AIV&>@9oSO3(S+jAEZ(&%< zx72NKJQ`z}F&nbStJ@&TJPvtggEzT6)!U&6jql#p+-u&{@jP!bxg9e3xr>ZvplQ*$ zN&V%MriM2xuXSu3*w$7=*IRcY!n4d<%Az!W$~_bIOnflWk!iE{+H+98(w)N+&ZvxS zJyv#Sgx2?CYNuhbC8-W7GfmDdeYw{KK|^oJW@yejvlXP~2l;Yqbl-Nr)deamEfjZo z+II-*1E`W--7KrM=7-t1apY2M$Gs$~riYv9Tr3={ZJ85Pd#h!6&$b8kJ8G6bX&Zc< zY{Ju#sXg0%AG2SErdzSbUmb0F2b)a!K!urm|7$J%R&m0MFlxX^C9!qj}kQSal?`uhP_;b(BP!^*hy#In6GzrM^cpk-HOMR*%C= z9iJUieN4?o7prICw_R`1Ki$KatxjHR;xksk%Vg)XMc{)j2vRi0_mJ;pYU-A2uF^&* zxjC<}mQZ)Fg!u&Q>Rng;GWtdJY*9vVv|MP#XGrrcSNeFh;yKde!$I^HSs|Jup5yda zVv#8|y)>8^Z-~tf4lL0b`XW5#0=21O9DZv|p{OQamg?E5=mn3F9b??}aU8R-YF!xno98IKXf)j16{+7wUx0RPMO(D}mVG(% zZ(S30OGpfkH0Rnc+EbV<-hAl;ma5xPx{pVT$Bb_v2d5qr$`3w1MV zHXCjrO&yUuA3=-tmsZ8Ol+}Gps{|Cevvdxc41tO4DWce}Fypy6m53J$NA82MycWFtTn6I>U)Has8SmXvE$Wt%)}7 zRa)Fzs7hCTG94UP5kUBnP0r?A>SUhmdHD}E9H4Z0Q`~q~V8AWz73vqsYSxk)(Ro1d z7~P6ahlDEdDwwNxd}R-ICbdbgdGhh?9-Z!(Y^u1KNl-UlVtWUhXwyFN!x+T5n}6q?1776%QkVvVx3< zl~15{|FuRMAnT8LX6lOMHDuKfVoXhE`lT3a#h2_Jte9_(OHjYa{Q(YVRA9vf;udt+ zeO@d*C@3hg@ahG8_IoGBWECeIP+8gev~$R5OS@5d&#{r?X|BSVTil4a!zyJah%vvE zrfzmk${WNsoy?v$nTC%n{J19Pe)C8m9)J)Thv?fwe*2WI(4(Kb zHtrzO_nh(hA!NrZ@QH{BZSQWhYf>pQxUL z?6ZAtab5N(2ol}N{2NeZjvJxE=$NClU@Lsq_6Bi8c^E#b?7K*R3KUF$yALnT^qhv0>Gx!QSoU$ z=Q`{Tp|n`PqjxGRv$j~U;kPfQQ zQZb5I@jkvcqIz9DmHQs=E+*Hz1>~@2I~_D6DZ8sEHb8)vERQTR)Lcb+!K?-a*V7fY>&rA=0$j@-fWkGF>R#LC6l@yTuW6PMuaF#6^Qjgo1aCDBcVmdxu z)*OC~wS?3!4hm26E3{N^h z6en6FV}NSrsCUu*7Aaj8hzBDjV_kcQrLGT{?dnO&K3c>c^l(`Ly3JavTO&h%^(g6B zaCcR&s+NQP9+36res6c8$~p4m9%Yu&)*p+AxdD{jr{7%A%^g+_)g>|_2s*KEWs}&< zZiN1_kKAl@NL@$NkwXiO`4 zne#)Mobba`{pr_CHTSuEV$dUyn%MpH1SLkfZ`SxBU|E=dkX+UzCGI07#uUg6Vz5QA zg9V%mS?`<*l^O`#>P1uK74H2OWdt{lxWQuT6KDq$gZ{n~-3&U>azfQAXsWblcXF+m zS-c`2X;0fi74+0h!2fjd=y77stVm-%VA9l+pqo_MJ5oVkFUIZ7;FYO4~bYVVg*oXuYQA2Acy(r+oq*e>8+f}{Rf3basp`&-2NjNA$GB0 z^65m;PUh?3i;eNr$LKY;)sn8}EZsZ6F2w@9NHv!$?~#n>?u;Vr9Y}haZSHbDGtSDH z{CYNx$G*4JmAchCh7XAjY2Y)KxlNH{7#=fDH z+oP0ublk19FQBT=JpsC##z%G5g3}mXnn$0>Lc|Xa zT#)jzE|asHB#-L&eUiVpe8D!4PErj9@s>{50Aj#kWEzc7q&W`tU_ObzAf$^Q`CIUQ7;5mOyOpU zbVyUET>>(bT(xcnJG;M<`oM`dgAyvRPOy_=bujn5tPyVwpj7tP(IakAUr7|_$9-?P zVovpfz!KMPl=om#5PhN4x*UBQ6zN9~LeB?kj_>ukZClzf(2|+Xh8dqcPs?1IZdMln zpP}S2%V$`KH2P&`zSG?-08hc0Ub>UXRX;Dg41(P}Leg(Ku|{C74|>f3`J>vQ3_5>^ z+ru?lW%dXVMO`FYB(jdhF^ddXUwsq@_S5;JCAF3iv-2Q3&rKp*v9@~q>!6+48l^#R zaj29!u&&d9bZ545Xt)V)JV5KASrJbYY-BzKor?tRP2A1!k*-lhQ%~}mOTg7WUxhVE zYFJA>ybpis5T%eBf?hDBJS=;}Hqh8s(8dSsWi`OtwJJx~*8qzwYvuZJv!_(QkQ&g;2!0c#9+%IRtzMJ%O{m0G`MYKEML zuvRLbTGm?-$W@T>%>ljK1Tr1@7rIPd@vGfg#vKuL!q>Ls!<@V33@{Ql;O6Q`*;IP* z5Wqy8{w?{BWO6ePAbteILiSU6P2W|G^?AGrkPdZo6d%lBd@Zs-e8*Er5!dyvh})fR zlWxqIq&AG|o#Qp9K$>sDqZRkrrtaRFnH&^6{H5)A4fdHpC;0syMM=Hv9nEip0(7%; zYD>{?Tvh|QW>kHhnv1n}p^7<41EJm{1wI;3i(qU{ot zH(jG94^R>JFjWxy(Yc$8EaDtTLc$N9iAuSW*` zq@Ffv@ixws3U?EvGkoHK^T~HAkMdq9AnnyEt~tH|aI+hr5WL$>!m z_n3)A=&!}xn5#F7iesz5lMIE%6$zXX8wO(tny7f>UUd-J?9nOHJ<)6-S=|x`&VKlR zJQe~Pjk(8+54v<$ag{9c>p-q;MqZV!+9r$=GqWVKCpstasTxKPJ^U6Be|)9&48Wv zMsPY4T-@jftj;+STai_5k!BUR32$`RlnGu|Tpi2z8Y;<7E(xuJ?Y<-5ZsMnLp=h}t z$EVOQfRHBNtb7q^2kCfu2)Ye`by-INAeIB2XM;1*;V$=OHiM7d3$iQ(e9`4_y2Rap zk%Ml9eL6hf3t#!93^5qW130~G!>Ae%*zWt>VWc%XRhO9N5osz$1Vjf5RT{vRX>wJr z-AgkP^eqp}zGW-NWb)f;kt|N<1QGFovmvv);_V(_Edqj%Tm{K%W-*3@x4D6P+*e?F zGr3L7i$Ji-9xjk;1F#GhOUAR&aMA`Kp-98|N*{n`x51HXE_w0|JLC_j@xV{jbuNJY 
z23&RvEc>9CGjmrw7FQ-a-W!WaZ&Sq5iQqKf>uUjta6(lDcTlN3bj4>JF@+Km5X240Zt68;Fro#geuzz^e*x?-oxG6LqdFBvlHOycgO;zu>ta0XaPmQBLQjQOS?P*`*tg0($@p7pM}(O|61-4 zd`JO(M?leN5H!IUI}fk}VHfK*H&?*@MOQf=ybDC<3s`X%s7fZIjnVcFu#@Q#NHQ(1 z;-$T4c-xYH?f#3zGz}qjm4pH(s zAj?s-d_EJ^jbu$VR8GX;*yJEs*9^FI!=OM7Tp~;$sXsXossiPFHHDz(e^Me;(3upi z*+Y~~gKxZ$R%qO-0wH5H0M9qJ+X~`q4>!1n>jfCDbt_F50cD=Rf19o^ zJfD?M;w>fpIVUZ?fn2loFYXDE!G?*8(Ml=_bRY;oM4uoTfJSWyG6038mw*xw3VH($ z8922v83c-oK>;%l^l7#iazN|785H>TvW{|t=8Ol9`&{H8uOMJvvSJID*BgNKi=a*y za-AEmdDW?ZS$)bOEkqsy0{bZ|0+mnbl70|&9taTvN`n0kfFK&kUcjCN(GbBq6iFO4 z0_rUVtt+YH!r1+wko)vRpWS8Szxm zuYzcl>T$aL&hZ>{4KOyk7d(Mxko7`%_bVU_ArH8GgoK!^!^5M%)IPA8P{j8=zG>Yt+m)-!_Q=oRxMDPUi(y%))Mt@=_kVaeF@$tGeN`Az%ze zI!MoT!_)jR=ttTBFj@jWO&}cSSD@|`C)Of`;HGBSv#pui;sJZXEnza;aABZ=!fP%& zAo4f}OWy&DLe#NADu+jrA~GQ6&`9;a1h>!qH9ZYro}--Lwh2ILv4>1Jq5uvrA-Q1Pn8G>6xT7D89|E@!-m3)q zNljiQ2b3D`wKCu@;*z|d8^beMguOh7Avg%ZpI~OwAu#Mt<_<|o)=GzfoZni@6=60; zqHdz3AZlIvFfVF67?ILJo!qk+&~HVC$Co_|7HM~TrX}q9!dQB%0XOA=B8PS^0n{7i zz8%k=2fED$W~cUWQ&ky2*=2D01Tf{ucDDd&bmAt7i{&$+fU~#3C4CD_m?N1h#s@*J zFM?dCK&Zk2$h`@>`9Qt;9r7>iCCS`A5u6Rd!63s8IIV}UOTaT${UqOQ@ddt<>6}1) z-zB5tN(!66zd>^+Tj%R!Jl%>Vfc`&HNmGfQvnBU*V!QDL-3>g;BqZb!F+&TrfhWkz$GPK z^H1LbNY{HI-OK`Ovh^Q8q397~hOwWcSOitjVm4q4AdB=2=Cn66)* z_UmQGnYraB&AUE$6#jlpnD)=7KNn?vP}p_4RO&}P^REaKFyouWUWOs&q>;v+q#Ig^1o`R;Q=byl&N6e^*@0E833t+35-i zb?F9TP7|iQGpxwkT3AxV%rnAN987&gO+JRTo=s4H{ZK&t?9ca(eL@N5a);v1Wo5yb z-CyP1=}m1YO!1ac^I`dtRq8D;)to9nnpnhOA5$g5g@>7WDc_jy&f^YQ&u6_4vpxQ* z>F{8PHuVD*Zc2Rt4_2>I?}n-0{Z)Q2HBWmU#<~*J+hOdAzv2ipul#%S2)w)ET$Tq; zd{b!r2b_3SAvX(_zpV}UfJNFyBx!44{eFK=2yV?yIU;i>lvO0uwx{ytRCrbTgLGVPhwi}47|CqW4n}sE)lVR$JesKRL%GI$~3re}(NQ35^T>do>;G z0aqJ>g>bbvSU62+41=jp3Y~>;r!sQ`&^zvYZ4`{HFLZkEdCZ&!%UcrEGFX1epX9?U z|EYC*ulo^2z;ga_&tf+2A(yw7^Ebf3MITb*;0ed77%}X!y^xy?PlRZFyw_b$y$e&- ztI!>=+3)@WFFogJufkYY0{U+lyW%ea>HybT{>wGm{$H+%=fX~_3jfQ{@62cCa;%q( zAa&*6n9srybQQFhjA=sg-vX)lTOf5_;roc11lylYK>rpLVDNw<_js*f*7V7b}hz0CQnov_6nf29DncumU(QUTKrz(J0$LRZ36 ztG_hRg#{Q8?J^kjTZMiEgS5ZI%Y_niy!W0L=>NdJL90-&fPLpL^`0h5>lF>f$7x;} zz2PsN0b_Hu39!7Nf*ye7UkNp@!wLUgs5%NeeQUPCY54DObEEyK~@A6aTC$L3A0-6h3 zobV^Sr;5~OdIk1lD%pFN1T^y@mjMD&;}z8psaP1aB*e^vJ=PYI-ea9HyWv=wA5tEe zTDOY%7i{&|AMpzKeC;5N%}Zcbz}O*wJUV$4 zd)9f{1{mu~h(X|`UljgV0+&%b7)w~ic!lkuKQ;%}3AIWun?9ke$1o`{&G|Re_^{BD zz-)kpwf>m*Snp`1@GkNjv&?(lb96Uc_fH|a5Vrrc&<#h6yFj~PY`QST%g7il0-WH* zQwrE=Ndl7qJALEN1p<@o!%~ZME~mGQRn&fY?1Wb!)gKnt&n+)8U)k`8DF$te*mg1! zUjX{HJSnA9b{I6S>$4Mo8@nqr9x?|(>ms&YjKCLy-YriG=u{mB`RqEi$5A;KB=pJ* z6H|Do0+x)6z~3|rg(-`)uYm%lJ@{Jox5c}5?~L>%P{N7@QCU9 zv_B7I3)FA}+)8&L%PSl~tJEk=edn)w2`(3<4gFgUy%yt!Klcg{;r+q_pbwaZk6|Y; z3tj;D!(ZhUkyo{quw0~mYRI~JgX;p0{%=`4W$XWv#e4Q&i)Z=Yw0MX9mn@#kYw-}I z3M)(n$>*P=_kcWP3xmDT901=6Bb$$P_J0l`U?VJ z;X7I#Oo1L+ywZ7&-VfXVDHMCH+oy%LzxD7Oj7=Ao|E-51Ua^d)gs{_+1k~$$edDj3 z>D9w;rIxOFoWcxx=#}7i3-w-mTw?Yb%=84*i%F&aO0V)qYd3j;=3^=nros|XuhVtJ zKiqp_36%|FU8`cehGR>iUIANtYxV_Fg;dbZaFCCLnm1tT>q1d6gLvH(uc4VF)VvCV zXki5q1l$y_Gq)}Q^*U$Mgqj$b`lL_=rvW!5)O)A%wchR57plB~dd&O_EN@9bGhq28 zf2r3W{HN9crvW!51Eze>$d9TN4B{1$Q}DJIgc|P!Qwl49boyn~QCO3{3iV>rQ-1<5 zDJE1~<24+gP#0k;Y*mjJ8e_HpwP5koGuYx6p(Yl#__&bt(x}wzH5?)5=_;5yEYw5~ zV_pI2_TKs&T@Hhku-wbmg<2;ZORbK@EDcjBV+j?JEEAGSktO?*8YE@OCsdXiX+}cB#LSrGcOBp7`#k?V ze?HGY_v^mTeeQGK*LB|4b-m9$W5BIF%F#%fh9qE7b-PQQhql|w(a1@r3Es#GTgDsc z4rSpWAne(MLuj_QrT}=22G#(J@mA1?$^syPvY-T$LRmNtU57T|{zrh>B0FsueJ~1? z1;n2Bn{bFd3avTN)@RRkg|^B~OFHI0l?KKXJ!IVgARWrlh)Q+|9;j&Eqh=@wB2aZj zERbt0MXP1Y$cBzV%F##`_$7EivV$|>6o|B8cp^eM7nM4l)){2eA;u!I z$)0-vMtQc0h$z0d6^m{i3__}vxoL@k>C~0b&)%MkGIZJ|B1*%0tz<;l!whe@C1%f! 
z_%Vtk5n35BJFXdYWNjiMSn91FFd!`20NjE_%K{jCl=(<8gd{Yh+HjYO7V@?-AIVrXpr%ot{;6@uwTRjTe7c4BQyRWLx_w1aSYIvXxoE`?j7s-Gi}F z5{`|&AQgn63#BMcuiWJJ(y%U;j7UXHU)Xmg-Bu~6P#`uYZF~a zNQ}Ie_iaBlx(73_WW0gXVvSihjp2|n+e$8}$ezM#nX+=D6EmV7<>f+XV~x=_YOivzfDWyXFu6~RDpb{6fj~TN!j!b)aLUwk_rr<=q&jQrfRmebNf5rcjQ30lmUvG`IL zem%pKE@UCKxQ5fhr|9g*)jZk%C#grU;R`uj6I>i0 z2pG2pfwz6*17RUIFx0@WJDS8fEe@3G1Gt(ytV;;AEleQE*a{#OyRJn4v7;J*6($4; zX3V;{B3M)sD}XNKEoK|T&pAh=TEV%<=yKk-5b6YwK_T~F-Zuut2~Kz90!2)zmjRQl z;X35#3OF@06ymger@)d5E9L_Jlhnlr2s8$PUclpOI^@EQnZ%@dCHH@L-y*3(Kn9C! z7~h%diX2S?p!apL2r~xUe}^T9Fry`tEL+ZLDWt6I!MGoGUCsNpkxByK`(45PGE@uX zXzott%>G3&3Gi+VYY$zhNGt;wlEex^j&_HBXQ)cZ(cJCG(WqN&_!?$RK^z%8r^~QI zj>ZAmPg1=RX6`O#3@@Gx#OX6>>Bb8j!5v=UQl;pBIW3`-LO4~77Ui^PQ8W-}mKS12 z^D^GI!&FB=cA!Cq)8a)bg?ni=>p3l!@gAh;3u_qCh#t{moEAMw-~pU&R`^OLF@9c2 z9d52-d_`ec0u+>D81!Id;R7~iOlrI%EX(b1afEsl;H+o(AbK=o5IulJq>YF)yBR`= z(tHu*X)Me-A=l!YCs@plyr=lsx$0S8L4L zk(Io!y?jbNOliwiK*?hgNMgwqL&<{(Ahj48;CuNInDH>R5l**hLkIKUV2W~(5dw`2 zyv}l@3rUE~!1%(9Xhdo36}~^Lp*f#h{ivT8^gADt#B0SBk#XtvnoTh&#}nB9>{%_ zvgTVMB`3ZB(n#9m(fJR6y}$Ei+zog7TlOu5(c`i3JOC67ebk&Mu-{H zF17&XYX)h6>o+cdlMI8fbfI<;lp_;rG>`$1Bf)^R$mzDQzT1Lu$N(}p;*ntsFHf)+ z^6D0pNNksw0!I7BcLNnS1g(Hv9pAmg6o~wi?Z(oa7IDgLAhLoiM6n%pC(3@ZOF?-6 z(c2i3fF6Ygw9N|N%_JtwDv`bn49GFpbysp)R#L_wHQ)(A z@rrm265j`$p@ND`DR@Ai5e~pbJzx$7#Stix8NLh=r4;#@p@k}AE*QU)ig<044IVv0 z17-^H(yU$YzL+LgKZfu3suMkUSJAVG!b1!U1D&+zjB*X6&PnUYl3yf@NC8 zFhY4GR~Ut)B+4V%5R7N2naI&iP)8Y*3V??UM<)WePtfT}kl}0s$ zy7US^1rpTw5^8Ya1VwxaHIN<>G=!EW1G2Kx#!`q=L&oJL)ZpHo)CEKjSFq&>6-Xnz zgAe-TDg|qT=%E1g_#n}+nWOW77e4@Nn;Q<~nT#t&xvv2-qZS1V@a~60Ml6ogfev@V z_ae~MVB5`bRc4H3d~qJOA&n)5gcA`o*)nQ3;xrd>g0~n^nCAe=&mVyv0b+zur4Xly zOZD6XjLT%HBG5CiW`>Lb1bPIJ?xY4HSGz*=9H9mRek<-)#OY>I^a~sV1Inq+pk$Z4@5EC*-0C`LjwVTGW8 zBrRnTNDl)d7C?ymED&yvqGSmB!&J~^t_NgbF*T|G0-+U?uOZoY6>(b@e6)-T830lr zNR5hkA-q5cXaY*tTLc4(rXRE(6l)x){cuADgu2ZRSx9fD)H(35EdfH?GtLuE&17K3 zFHj*(f>wo8c#@h3_8c^XK;~0G@DJ^C{L|WlBt!O+?1S1-qF@7ysm6eaFOic4pgz=I zIF*-OD(hu%x~&Suqb}&(LP`gceFSA7Oq@N?YQLc{sxT0xz>T>ywR0V1zBE)Z0PB4Y}H&V@D9q#D4z z!`Z;{LQ2?D>QhIl57k{^#xTU-j@(teVn`y8lDoDwLFSmC^Z`Z84S}g+@fENjR)#>4 zc*+8pl^ebpIoccGC4>v3T8^alVMEBjEtCcD!L4vv1bPPS?vrVm%M%y@=w%2r4&1Fz zl|i7P9M{C^LUR8^VIxOJgHpUp!2pM@h0CG32wF{DQGIDoV1jZ84y;^8m8T2QL~vk~ zAJrm{DE@_9vjq_vDWjYk2)y%@6g(ma^ivYUfyJg&B}9$}=(dMq2{bU{ZbFzdz>+l9 zd9-81!6NLZdZSv-l>tVdT^B9OafG@E3kr;032;TBJJdxpI2CCI&-+j_P+jDLeo>A0 zKskhQ5!JmNi1%46O2_iuVZA0X0xjk`f3m%&7tSaGo-= zAEB;Cly-;MFr_*GCzPVWKYJ*UNCCAmLamBC-2j`+777+vbc=;Ys)-1Q9ywhbFcBC) zw1k2?mE1v&>r+*brxAHJFxsJY#|Q+23e_4;w~v53*HU3qrmipo+1ZW?eAH0ng9{ZY zWR&?>dl6^>?4>?a$avyP*w}#z*c(^91bS(o>#+nn1A*ql&M}?>Jd?5If>QU41D=6a zm4*-$8(4C+3vx9P0&_ogX35pdQSuOPSjQPPUlc>4!HAIp$P`L|y=n|wIF;i=di+Fb z1|KGbW00#C0NJ%cFi5ft_*h8!ivn>7ShbPr0SmIf0UV}BSp@D{6aL|J0;Q7>rTMTk zn4kzDN)sRfj>Qv!o0<#|07@INkXH;DN0B_m!pxf(ut6h<=>Vb9SRgfH;sr3>HU=5( z2?79RpH&M~n5*+j^S-vIn)}C9U!9DDDtXm6{_DmJ*cE&g)mZK*R;{GI%270IoBKZ{ z+oum^uPNCkye)UVy27++_F8pC;q5m6Xedrw$!t)UdxRUG{C^SGqb7@DLI5MC26SRy zd<;TAv=oq+!J(Noutye)zYMF9TLZrFjKd>x4?z@ur{tk5VhlpEg@O{%ts1Ceym(tM z&p!PdzelhG3c->TL2@ry7s&XM;tgJo)dezsqIiRs6Ii=YeI!E}T}y=*5E-Hv@Pl#? 
z1W(3B4CKl+jBO~1kRfLQP~!((YI2z4Pb58(5D>UZRESqR}rl1N++z+$ceQv81|t~LLE z5!cod1O;Yu?7UJg1X?5&_G_*d*`RcssaZf&JYgLYB4}qT&hSU^*aYc9oPw%}IRSQ1 zmhpQZAo*m2Qr{?Kj%Cy*z+|~5D9@iMeGiCwu!Il-l9UC=13BUFqNOhmmN48f2(oi( zJS-v978qf*=tU$06yX$iPGEHg`xWnx~gA!Zc{tuz~zex!Z#=*7bWl>fcIj zp!Lz-u2CpGe378_{oVHVnM>_tUoq~FFrsv=fm=y0zQ0g6TRB<3-m`hQ?DO>C$%lTy zwI6>c=L}d&MeyDplK!2Xbg5l^?AIstjNzbH;nVnx$*N#~-W->X<3wM$+ z8WWSjCwadh4zHRQM66o0P3MOl6vWtnZ$89!+$p&G4(H5iKO|6@Xg3O0@7+}X{~xf6?~Hv~Fme{3`ODXM}CDLM@%9ioc3jKfOq1*U`a#)h z|Kq#d)+l?_x+xy%O zwB}1Zr zom8l4|Qf+o=x^g8ll_J~Z;9+4nzop4G=;sXuVKJ|>e9l?)R1cfgnmBVjM z61L&}Q}fhkh0R7Cd@AvewU~`9GD%G_#D6V1kM;fxs=O#U^*pll9P>kQ+f$tAb5{%1 z=_kd$Yb>=_1y`PyRC{kAWw!g#+j9n?m7mfNyA+9Xj{P_IE5wrT_0{_%r`e@aDq()R#kxDavYd|MlNCw#$5y{|COLm}aU3WgDIb-}xcIp3pX;20 zF4bd?7`N8HbRju=KPgdDNyC!9?7$C?ZS_eAy1m|vZ7cKc#WBZ%wJ&Fl>eQk{z7O0m zF_Jkj>Tyrrq;Em$%U0a6dCCce0P6)aQ)w6Dyv5kZ8@@LjT9JoKmNzCA*anSOjBnNN z>n^=;)O5c@MEx4NN{se3d--cJiSpdHI|wExT{ekQWZGn|c@B2pmfBA%Xnu3^=ygG| zFejPgl&xgN+995_;Q<-X8j7tImgGUK*HK6aU^c_FVB84-eYGCQ=aQS zjkgz`QGG39-;Yd*-)Wladz}B-t=BNrBXO$0W%t!Lk_x)V9<2{Onc=uo>R^(SW?EgT z^>l&Dg@m=C=M+5*H2+f@$)g*bNIJ0kZ{B!Q)r*wrKdmZXI%U+H&A&2Y9QSd|oyJHC zac65w@4EQ-YguZSd_Q+U^Zu}bR1~fxo47gZTy5!8<*_dT;?Kn_e1FHCyE}T>tRQev zYoph`&ZrL>rUNefS6jHuse71ee%Us2u4aEU?&i7lMdvedCDrG|{Z%YfGkS9uyHus3 z#46w)jlo2PvGm0+rsB_W7OLsJ*BT0X7Y<8{q0#&=#Rc40`RvOeUpP!#QjO*UVWalh z`~iAMZC&s47p`WV)){kqy9;-ZsOsfB2AoX|cSb*e* zi($T%m9O`R`v-6(!-q}nd-wB%{tK)Q&LSKKFk%CDxnvja~=%w**4W4D~zP{!M4^1EN| zlFP5x-=1pS(KCIzzU(YXk)oh^y}?D!bpPX$#0N)&dvD`SJ4MR+MKa!r^y-_2Ke$7IH(&dmMzY!%8#GqxH1O^=GB;>^WWn#Zz4Tq4qcEd*H6hVkbFfflqo4H`uP1yUnw8?exTysCT(mSDtx>rEcrRo1Si= z^*y6~Z`?eku+7%8&5P?g88dIkl;#+xhC>QWpN*6*j=3Z{LLs}5Gg4u<_Bcj=diW!*%=iAN?wZE|*A zai!HRCTSfavrX4UCU=FFncdKhL7sAS!VjtAO@E2R7OG1*td1N_2#~}3jhew2a*ui}KuF8pDyN=5>5n>i^tS@7Dq>IRDnidEjAa=Mr(0s(H zdxUQ*YUs1WP>exY!stI(j zh}qS5dzN{TPbtg(pYToUn)Qfh8i!`SRG0y1W9{;|2!NOj~ zN%WNYA7{O|jLriD%?%^exgApP_r!&*Yu(l~q#xIv9uvL;^QAbV$Ng19a)jI}M*9(K z!{os4K@CHi$$@1z$6B|vx1W^p_%O!Ozd1Hzp4O?7>FR&H*WhtP8zM6U0u)9b(c#&_L(SKu_xK-*QZhgOn&JsBXgD&-#;e(D~Nl>*!Wq! 
zmyBckXjgmDEW7$dGI4AB-15~XgY_NUPUY-5hI?p%y#VTD%p>(snjs-d!R{OL~yu zJyP>Vw6?CNHtbT4j`!=8+q?>4&R<80OP7mv!9RQ_ zlB91M@dC_~t7>}9Gl_jXw9O=BT*S?5wO+CU#l|vp`na6z50MZRQR=k~>2}9v%;HvT zecfHYPqrtmB5BF_UG?kV-{Ze>OJ*lK?rHZLG>1~?aD&SHKhY&nQ)^2|5(y}qP8|O8$ zw*T*|=O?TF9zUWUxUw^X?(r5c@cDWAXXxzl7hf)_o_Jk-ZDl%i?tFeSas2p;TY)QA zJKj^YQKQ`S+T(cy_hxy?)?XIVRjqS>I=7!WW_slCQFee7FLtrd%bRb36O#{aVn@?#J35O&G%LNz?~s;Svr+5#9<};GJ3nV*Pw49OCf)zhxK`D^`{}KIEjcUF38ZRO!fE#joSlPLEu=)De zNt24ks?6Kf8mHVId8xVV7QH@hjYcc_e8#dFZ*&92b(g#jtVh@Hu|-;ACWiwkPI$MdAH){^88?6Hn7J)5UDdHT$S%I<@;| zwUA@Xt4`inpB^^2(cjtqq~+7|5ov0fkGy1E60AFOo*F(HcSg{T-)*m4Z$G}~Nc!Gm z5muIA&_<2C*6Hayk9%>08&wRbo@V+zQw|6kEp=UK2QW zTdAQ`WAC&P;gNaYMFp2XUsk6F`NeCiNt7#os$75Msg)(R@Lgr4<5R27JTuqMN8=V~ zDY`#jTX^__@HYKi->kliopq-@@*W0F`u+^^o)Z9b&3otRtbTla;@K|R3&obo>}8#Q z$X`#Tn)DV9KI}QgJ%Srckf89aD-{|!5>@v;rAG|Wv5%E$tNx_r-7Z?+=H30)v%kaB zII{bqRak`8!}}Hs$0nNz?|mYy%KzS*X>Q+h$#}>8h1FZCsrPpER<9kO+*Us(bou+6 z@Xw~BFR#U%Z_XJvl4`u*Q!t`^>Kfi&G_v??M8@V}p^95?jiUofyH96+n%o=t=8VRD z5$a^|-pg2{m1Rx(3d7@eVSiK-r#+h9*q^*IerVHQgTLpU96QJNbU&|@nX581{}ia8 zVZ9}Y@Ln^*D(&yRh}kU;%?G&QXDW%#{ejy*1#Zz8isK|z_G?U4SM2)w=qV@X@yOAm zJRAD6jDQ<-E2n1r8skq-r~J;o^r_$9KV$0Q=r7^?Yry~0ak{&FD3;KkZy<1amVW7P-@60Qz9srK}vzYEQ(ygnbkHE-kfBY&wAcK`Wbc^RN^cv$vm z)!$R_XLFu!l>g=IxuKu-%q3>h&L~gD^SJF!rqr?jrn-(B=JkC2m`*-2o%o`Ue_QPDS`XR8ltR2|bWx>_hzAknFBDI(OKDTJ%D?{~WPbZl1LwY@QXq+ z_>xiOz@O{Rgl7l1NR&*t+1lYds~;YF(|)AZqUO$mly64IVVk>mWj4M3h&S1_obpWg z@fm~8M~;pA{vo|Rob&dsRHtF4-hty64@$dr*w@^#SG#N2YS(e;Soyt6-?T8~0L_6} zzq~q@bs?v!G(!BkZ-ln~JqtCVE%mv#US=x`PoI)C{O0S_nfWx`-a)POq(J$#pj9VL zz}WhcnDBJe$^riNaQEelyFSeYlz2y`_|5;Qq2>*? zq6>xQPmkWyb4C|R&MJE8s(SD3I?DR54t`8ka7aD9EnVZr6IP98LhNYTzD=de3~v2L zKL1V4`4F-Fh}A!06#S)okCV3*bQ+xgC+RD$B3J*^7MCrH`?yi%NyFv+5emK zQzAmjXqa31D!9(c>6_lLtNW{gH0M3LyQpF-UtY-fn~Z0!|NC=Ux8yJJmAQNWOI&A| z8=9K%$Xa`Z=yGD%rdOzlWuI$e>&;thSX8mNHMj5cjEpSC?~s|jlC^F+B7yuOY;?v;F!kU?nEnqha`65^tid*N4&COkWv<)bafy>`JuBWW z+0tuoJ*H>wgxOmew-(KPbk1fC&Lam(f(f6a+0aPNn3klA1x%qiE=C8-T63u zCr7V|?)02Tg~eZhu1MxY6J)=Wmxzg^moYPugHfCqHL)CvR_WpH56vIJUn}^We&TYI%OWiHZNB zgT+NR*~CO#g_(wW{|h*jcqY2L5)#!SgBX%k^w!p8=Ef ztrX5Jb7Ag?x*TCQ-Sqb1>xcsl{8Y<6;h_4%L0oE6!r-m3*%M~mShYy;puH|k73$$C zx&uYxbH1Y}7UnU_?%v#(pnDf?GxVVl-+742-w}`v-cMmJSt#0tNh&Gl_5NJx^b}`Y zMi1K#!o#E9@GxQNxizS9NXSqqt$p)R&&DmWZ5)Sv{ZXPr&I5+~pZ^`UdNY0Q0{QO- z;;Z0(og?RWm;Pxw9CLo+@xFWMZ`K(q1>wrfy5Cl5{K>xk-0hp5!sXzrG}bJ0SvRA# z>{T!!^X02x*BS>@nx6nRa|Eje!6E~!iD&OTOGISJGvy_cUuF~UQ~uxuaf~+h?=@_; z3bE+}*_RcpK#cApkN%LxF6o${NJr)9y|CI_F^3IyDsZtAT2=sJbNnp z_O9qXHcbu>4sF)h-^Z!$(X%o1d8_=CUH)t$y2PjN^b51**sm>{c~aA!RrG-i;(ooQ z-_y&W@$z*wj@D2Vy4HjR>NG`4#~e^+7Xj+mpusF}sid%%6-UTsv{!Q9!4M_s=#kB6 zoZ`WJk2j6`y2pAck9R)0aZ|<=hYr?cSz>;>46Y8Sk!V1k&o4In&OOo;W z7kB*LZ!N{4%d2?GIFD0J_i;*dvgWz* z!+$RvdQH~ivbPeazjvGS{e12)cFab6WcQ`uma{9Ah`DX#Cj@?B`#8;#%ic^Z`Pr?) z=ceJp{?Gyl0T-}4rD=6TvMGB#eyp*Q7-y8$ek}h|NqCj6_^8mYgm$6+3>+4}xpsfo zPV9bOIu=v-QK>(>p>X!7+#|0QF3FkR*nI=RgTgtJdVWIh|H@RQoVh&TzbohxZ7<2o ze`fs22`%27mecXMz^AM^d;U-hS$bTwYu&i%rwAcp*SGFK_CoU?+P{3mkzHSv79Vrp zk9AwHf;*1t9d7zV`W#(hJ=Oc!ZostfqLj;NYfFhLy34sfnQE8EYJsISmu`}8d}22? zBuMei1n~RfZb8=<7et+s-xxlvXU%PrZ6*Do-Qeoo$C1iLTe+DOvg0gmJHH_ZHv|8( zoclSY`|{e+oLJ~%eUGm8>Tv(6Jx$xUi+8PmbGE76>*7;d*=uIaZUfvmAL0)Yk7)R% zW;x0ExP~=)Jl4Le+|Q{v+p+G0{F~pq6Dc(RL62{Qs2D9T@@$c6w^_YP`?|=+^c>0l``igt zJt>;)D0Qp9BzkB#7an2?&%A%p|HHOfr`ziTx$3fouQi{!RVHb!NVRe9y>luqYOTc> zcj7>9uk&qO44@Qn$6fQji)^)!RFyk{{ zTey}G&+(c6mGWoiic){pJY)C0^oYE{)FakKRDq3AwP8ESb48=yk7VO`3<>u<1XmT! 
zFxOb{atE;vgU-jb+gA!?q5=OZnHd;gdZA|A7E`^1AsO9J)p0UmfeHOaFoDDgJe>3F0W`B- zdTBEE^oeww5k=}X7;q>P`UFP#tpTJV=TN8pzN@iwMa{WC({YW27{%);?7qZD2=)0T z)Z%?GgJtn9gnC;0gBr1}Sqt?c4^5@xc<&=E(bEzma4|fvG(&G#{kSczwre9ycV18r z{<^#bsqvDJ>THp(?5!9UP6@2E5S-;Ri^E{1cIPqH9bD9+8$c_@`FTN9{7-Yr0=SuG z07E!N_i(rq64N1_Yz#-+RInp~8O!B?^yqg>?|3;0-*|lU{ zyEnM3nH4=bSLBaY;Z(l}EDkY}1f462UC2225o9%@{(tww#=yb%48UZG<^hS>5$BbS|g{%Z2{O|I z2R^gb-nArky5|I$JpgfhrjD;p}u~G%nvgzIh8oVW=Wyz zR=-{Y?A&K)?1D+z;i+Intt-sD-Vx-_a#XQBpoo7wiooR3X9w9VG4$LNdTyZHY=%Hc zRg0vJ^`YynVs0gy=ZX^HmkuEVdWi|-iIJ!wF1LjLHG9_AYp>*(e{oW5u6 z*}I6GFVaBlPIo98ocGdEZ^`rZZ>EgC5X1RQrY07$Ol=DU8(RxbQ*18+$6m+=mvbkQ zQVDO#?w9JnmQSIsta0NXw!}+~Kx9lQ2kOm#e}UhU{hl`IxTwj$a8qgMXI1_zHW#;b zE&<{;?4-N=ToI1Ih8#RL2nDY_ldsfGo({>uu={YM)cH8ncX+Yz_)ptv+tg5VcAsq{ zYk}S8!MJ(q>hCw+nItYm=-dE5l+Wa5c_|9;vqjP#=TMuNmzq7Fxqj|jD$Ul`llYDt z;L9N!5;@d(z(IP{d6?f0Vc64p2gm#PFdb0jd_K)*zROhlKo)Lz*2A%y@keot-;gM z8B~VhRqjOC+3?(a19k+>R>$p^-cfD!zu7xyu6VI%Uw5yms&3t0WBuUXim=+_?rwq) zWPTu@Nt`RXjT=^O8PCBrzMT%bE!C1atJ25r%e-{Hm))oH!i;8HG-!kk8Jo%-2z)p^ z$Cr@m5+Ij1!Wu*LdmEL09?@#!_ov3K%K-D*F8!xM2sk$(!r`6y<1^L?A>VP&U z8fCC|NkrHX>bA3x|RZI@1Rh6hH``@G%=4sbnziz^IgZzb)WUg4Zj04E3k$1 z&N-LSu0pl<>n)DMl$n*vcb$PfMnfjro2*59j6}3+={^OGhGsBQh#ib%VNq*%T$G3! zx!2Gp#|Uk5mb!bR?(5Kj_5wy5DFG}y9z$SRnnwulbaX}hHZB0>@@R^=6-}|0hTc)5 zFU)MI+8$KBc$bKH+V*L)D!Sxn&y4V{w?Cm#A?qoOQ37vOOWE5YFfU$3R@oz~(3Z&` zrN67D-FeJKbP%heYrR?cJlp+_Z0z~^&}`9irR?pXQr<$MP_?+|kd;1;whOsu5~DX{ zw8MrVK;MPwThoCW)jgpxOcY)3go!rh!@T+k`>0qA?0u5a&OsaP9I#89j~H}qhpxe1 zk2k}eXor%%BE>??lXF+8gPd%6ZVyr4_3mipiYK#BHgw#$qudEWCU;_1RK2==jQ%xz zndjM2V@(t15Iz%jF%Z-WhNQ3Y=&%PJ=2$}<{gN674xKx(&G;}-7Lx&)oHu!)`&9j< z8rq9SFLdy!@eVYa=_k7LL&nVPGJt7U)%1xRd!)t3%>t^AfbN{g7pTGbI@KT|#y#7GAI|=kd1664lp6u}yHFi%i)h)cqr-`+l-6NgS&W$n-QMour@>rBR?({1< zu_by(&_hRKW-4>ge&6plTjxy7aMB|~eQadLVb^p1#>_{i8CqD5RJZQ)x1`+8JotOE zzUS_F{kOAH*gZ7RwItc^NX_-so7X*K_l&zVMm!tZ;_&cQuTBT| zq}6E7b_8^KO!B!nyGJRu9_GiCF0367HV)B4gJ~}f))X49I-BS1@%#TO^V( zTf=U5-p6dEZ~aIk(0LNb#5vq7dXkw4!a0v9=uti9JDaD3CTRL#08%#3;3Axlu&+8+ z_W!^)V`s{L;+2r3H)iG_192zD#F4{zAZ797$tCBJ+iPLuPL;+b=lzRJZ&`AlQCdNe zamdIb`lKCsyqSU^SRB8+z!+r5N@xy$>|)HcMyd>-BK-@nzHxcpO%j%Y)2N&! 
zq*YC5Lz$usCIR4@#kksQTFnuWlE7n^q+;F41@!!_K|_at}2K{ae-f>yEyuy6gP zPLj5}zn5_cxdq0uf7ZST;Kys>yt&~c;A!s)?IhVO&CDqehJnKxhoHHORyU6K2wd_& zdN6=%j(7r7vuQQZ?2np5sCqDnN5de8aBg55Kt8s#TDUDgzynU_!S}TF7h!01e=vj{ z14HOE_rS>bWgFSM7<)=^)xC>14ww4L3QenJ%v87 zT8|VUlD0KAXyn0m7Z~kXs_1Ho&3|DSr=eDKXD@2-1u46$sSU1!_P8+9TM4LfFKQfFdKCv+ zH0&SY8QR5{_b^5(U&isqA#+cNw=ul2C+aCVH^~)dGiSKB)FPF}CP9Zvl0mw6utZ## zHf@>cIuMNR2)}`(?TNbwkK3U@DEM_aq2fUesGsvh24dRg1}S|BaHvy(e!8^r#d*aP^=0 zA_6_)k)UHPFJ)m)4$JARfRBy%Zy-+^()MnPNIgW2W0UEeRryMtq$7u0vb1_D&O<>IO?zFqy zlupvL&F3N&Y~;}so$MYgr3ym(;_pLqd0E$--;?;tZ-eJ+jL<>*?ReGqMe1NG4Ud!k z@3b*9>OgMahnw5FCw`h5Cyjh<9J>a$ESzi~dOtS0X6XIcA034JWV`F$lA7?@?|GZD z{MCNqGrl4@`xy?HuZFUN;NzCbP1S=inXLC*_M-baI0(OpFtpu<<+RQj+4jbA8=xN6 zN7$r=U7E|@8F+$ejLV z*N3r!BN2Be2=tDeLNglPqiu5zChDj+ysm%T1*-O-clTuVa`d{s^O~ztH;KRJ7O2`z zLKk%7oq7POcJeByT6=WBUzLq{&}$r9cprROT#@y;+wpNlR(j%cqk&ajYeJ;4de}PK z2T)QL4QdTHOEjbR?$6G`!S%!D#v0aIq;KVszLh}-D|Cv=xVq+ zk6!W$ud0Dj!i2)L`}p=YcQB2b-_EdmY=ciUCeT3VU)8F*sFK4Son$PbS>X!UmDV)2v z{1BSf&Ow&P(?Dl_DB~faUg-c9HoshuIUI%#INuq)G}u17?0eT_iAc@vMb~T5d+`rvkrvKEIqd24_Ja7^Ba0vjmwy4?Z+-SI(3yLT4Yg&G zC&dL^XY^{_`Z-1>e9iy`kLEw&0qiZ^4t3 zHTbiz3tC|dl?LMkFr4w?4CZHlJnS2{l@Q{};!`~6(>wxEGPA1*md{|KCg4HSug zMUAuBxoz_T2*lT%W(BPN)U(ZVc^Z0?f`QT0=*8m%S?y*(8y#1_%naRql<%YS{U&=78e(C-he|2`oG?Kg;cC6=lC zt^9J(e&VK6R#!+Btea^miJm4D!Kt>hhA-E<=n4rbjlbp6!lDdn_RDhn+;mxeD|K03 zSd6R^#@K&{@H7!4oNFRftRO@Cx^-dIH;s#=+rhnm?qO=1GUVonEZ@1HW9x17o+(AE zHefF0H;<}uzp4nVCU6#>skR8;L_ay+ylBEQm>~&5CIpKXzp({eq8+c;mD8-j6Dt&$kI$5HM;-X<9ya4S zLY4xhyF}}M7HO)2V-v*Z%+lg3f~j}S$JibP+&2#e|4ewu@x~rEk4j%j_gOrogHyd3SGwp^QXo!aV_*#$?w@7@3Xn!aG=xJTbrUd}t51II&^ zH~H?nAbwWEfe+uNx8h)K@x?z1B*$@XYzY-X+X)tyybivkk8@xvsR$lnTf$OM^pIG?tX3A(&Ho;4h~+6+3Je|!TEzd1HpcQ)(*!#I&zJPg(U8Mv87y^iojpuEuXBm2sS_DzPU?GnDyAf^GN0I`6zH5C%(v$BBv1_ z{a^l&ME<}y{;xk)3i!w$@Mi@v4~ZTV9x6PNiXdbhTfz+^Yb*+6J_`zo2IH`*o@vBd zCeeeVbXbu4Zq$R_tg)(Y&3yiRVBZyqu=mp}#p2S&7g3+qUu930*V$a0>eu;mNP4JQ z!&C-mA)MAHdUk(>?nqNp?epQ@a~^@Net{JoO^tf0rYo{jm6u=2yP}j>eKI^dZcHcm zg;5yG`RFoHyR@RjTes&PZP}@`ULtsJ{XNqFpHVl~%e*%dJwYVxx9nB5wiic!+Rb4& z#;Mya_C{rNd^z`B=lkLMvBd4|M~@zmYfEtF-W=9`LNC9PRx_r2E&Do~Z#K~0 zcz%pJEz!zaF2p$GG4(s><4|vSsV>>ZImYQ#>?$Fq(vNsgAAh#TN6vwv@0SfQ?EYL2 z&rUKm-?8*^l1{Wlg-1|yy{C5~&Gua*?xTk4?k9hKy+~WP_w`w~TgSrjB5Qt)88&YC zKwGP^=9;|E=Z#h!-q+*BtVb21#WX3OUN~00xfPg0YCThNVyi6GaHnwH2Of)=iX~Ie zNm27RZXZ0XZGfXR{1uDDiihFj)MgHx8&UKv3S0sD-2I+pZ20S+n`~{=eWQM8*)2PoV35SY5HP}G!sW#zpOK;v%^zDUMBhX>c4kPG@8cT@Z9_47xlMUhHaU_ zQ|W77QF*@GSM-+p&DXqtA-T9Rc6By8iyrTEb-rfeRK*wZ9p(WdVfV^BF5L~QX zxiiF$aR-Sy{1FXZzJfbrVzJ&1zVp%IcuLP^wMOqbBZc2L2F^G)ZS*~6`u0}nr7wH& zIXZzW)$}%Bia!|iyTYJKYtR(;!a&>koo98rg<-aL=nD^Bb-%W6Y56YaUWT=usrRfN zF2e@a+RKDiZaBO1`3Z6N%tKe{GTz^0@@F3E-i&qUuwRvZaMyLuE{g3+y*YPNg7zen z?Q%H#08Nx;>_#r4t9;R@Je1usU9(ncZbYjBL4n&>$@?zi5yOn`=64ZGrY;bl;U!uO#|)S-RCUBJllUheBJx+p&NU7 ztXDPGPC2c;)Gpuu2$okb^R6qtI7J(l9n(3ZIwiTf(CvMGUC|D4_g$pPhp&s}H{OxS zGEdN4-J8E=!^TTbE1c8PX7)Nat6{Rro(TVXA}l3!1V`|AF>ZG1kYj&NMPBBNXa4q_ z=>vz}1=uSp%U-!NI4djuFG;3lL0tQ2p0`E9uUozMzw{afY`PnW{hPq=vM}X2O)vcX zd%m;&Ws=`z6~}8Tjzi|YW0a~M-7dWPz)p*^+a11S{T~2UK&ijZW|y;?mU#<%Id4XJ zn{HL*-5rVYikhj&|t&d0}GMz1IGD zIO}5jg=55xs(wa~hV%G3sNGBYpDpjoK%SI} zVI;+IWSrx$am0_NrlTSzwi&s3wB<4zN9*?|PPkmo{-PstKiW_Is7J#fneQfw%i~WM zJ^nNtND-ALB;Ft-7MT(s?TD?Gaw71t2m^7b0d zc$F>eBjYxXvW+#U8IAYF5e=VOE2DSAV0}HcqCVz#FzX}0tv<5btNO@k=UyK{vOe;A zyVXaKE%%WwW_{e%!J|Hs11$9s!1dwpYUcW&s&IDK4UB;zxM3jn73%x9haR&FZxxrj z)U%>}i(R;xCcD%njJLm{*l@VO9v3b8A-(YugYfd7#US{QIK3Q` zxBlMj`ba(m9u?JTye~M#v0wBrgijIAogu~|yo=5v+$-)QMjo?xP~1zGi_zn^X48R= z3p3h~Mf?;=le6_k7^cm}rDBB3TelEHV}m#;O=@Ux--VmA0MTc;H>0_!twWtP?dVSB 
zc0G%ypLXu|gCR$#rJ4?zat;V@qyA7cHk~{6lDJR(T`W#c;_ea^wc6js9;7U}Q468t z;{}fvy)^$rF_6<~J2koY#8aJT1eo_x>;3A_j_&273*)ZsBtNTP?tKvB*yiHFfpcRM9;h#|xAXZ{gKW7es0M{Rj+gZ%vB4BI0M z#RI|sL(Em$kqTU^X^DI@7i*orn^C07AZR3R-W#jV-_)c0TdZ52P@DV<6sF09XBSx^S(;(m3)|4M)6n2;LgM=rB$eil7e1kUzi@Bil z+!O9cKctTiF~;%HA?DFj_~;%m6%A5>5;W0T1vgK_L zl-E>NH(gXQ8@OK;HN`fnaWw``+vrVn-cD$Wt{vP9swM^dj+h1AbCmkgLAtl5g(cV) zZ+5u!g6!y?Qa7_bqe3g_u=1&vH@U=1$q)N$E>^eMt069zVBc^|bWKJ?N9J zhP~o45?u|a+F{FE`-fSy0Ap!@1-SbsIIZPlySs80z)Km31bb&~rAr z%{OP(Te{U^bB=k+A1CSkxn}SGy!h#5N8}#sGUi1Db$2_VME&Iaui#P%_ZU|xyD;9 zZc)yR6K>B@kD7v`PBgbP3Ee>=;kvY)=L|HQ<~1>$wOld%9NfW_dn)GBhUf~So#>pM z@EAW-U+nd)@~uNkrFg2Mou(rlqa#J*Ip3jg!Q3paY$}ZNy~A>H7fF-zBaNc-82j{^Y?D){OVG(NU*FXC4;_U{pwv~vx%g?;7x`H1C;?dNQ2r`KUbp!e1Lhwdtf6G65% z)aQwt3+4ZE1#~}dsetaCJSJYb579Wb?-<>HztGd1xBWQ(`+&Un+8(5|1D@k7lGm*B z9z9L#OqX^lc}}0ErsP?Ju;$XoUYM$iTSLX&MrJB*Di>G8%IZNU7Nt$+dEk@)*^_NK zgWB=b2$#QM+bN$|Xee}cIEcC*vd;O7P&vd!jetSjd0bXv2rkIRhL4O3K5wS&G(jJ9 z^)uFpi&k{#1%K)qw&nS^npw5xvfuG`b$=K)mNO{ zSL!j*v*isrVutKaZSweW*){EE*R&gn_02BK?Zwq{*cpC+TJ5JcW~sWe)nRaS)jaj zJb$z0z1LPgD@-|Iovv~>vgKs7;Z;BRbT=4FV7e!X3$Kk_lBC}B3-_iy7~_s&0%H7Z zKHQ44#9$`>z3$?6bd96N-M^}`jcsRE)<73JmUGvU7R&aCpDvr~=C)NB+*5;m>E?af z$d-Dbz3S#oemsYlWslTl-{kCn0KM4!b%@c4e@!z7=iSHU{59r??Ch)QTpP1&)Cl&h zKGXnx)p>G%XvW$@R>vVV&flr&);T|EmNTGTdj7i){h+}nZdS5O(10mQAmny~E-^L8 zq+YQ%Pqf4Z)XZ_tnNKvlY_?dBhW@ryS-M6ByVuB(25}?WFK*Vhd~sJ5rNAQ33Hq_d zIZaKxHi>akm}q}-y>rkBTBF_P+=>CM-dT5$9+GT%jeG%!4q0`=47HV3{63dkk7-r+ z>o1y-F4zB?W|vVZ?s4Pt=~pvC{6 zF$r|qIdar-rdo6i7Qbltj>7l(LqBGim)qIsM1Adrc2(cv*1x6UXF6aFrWbaqZX>TI zYY@a{1ylbS*;cjdF03gXLA#!F6z%%ypRzyvKtX1STXf-F5kby?M*Q;K-@Qv=%X_1l z<2{&*{^(vB6rZ-?)Ac?{Uhf-@wsaG?-BNW?zT5z8dBa@v_=jc;ccB~Hc0}oR&ZX;- zCVGq4Id7KT-1$9s`?w=5DZ!QkDESadGA&9P8@_Yv=kjGSHNZfN0)tWor}6ZGsH&@5|XmGCaM?r%EEf2+yyr@iRL0DJDQ6YYi9 z1i8JBv-A77$HCvU^gb>nZnpdTxKFaCq{Ph$o0V>N>}=tETuNNF`+Knsw!Eh9G(5}> zvpqy#KPqBH+6%?q>=ygbQ#rN#=k9nJH_<(>94q%OEcu=}JB41L^xAqgy+Fa2sP8eZ zvg!9M$nh!c^+JOD{T{mH&y&B;nr|-@!{%m}C{y7CQS%?uH?W7%AJ`H_?ACDBUzb0t z1C{;WP4&%3bDURLt+%0XKeO&sz*9%FOay`>c zze9RSmV3Ym%kS?~zyH{l{|4P83&%$0&ZakEf%0wArKxzEbjzh&8Ah^>n75}by+zt5 zMu(9fP;rdUw$R&wjdJ~q)@Mf1L)lT|@DFqk8He{jchh%CIjYrr&cAq%bSNuPzDKH^ z(WBu4&-X|+{T_)H>4g6mqWx{^RZ^pkUL~bRj?CTIhKdlwMaw}hh-e)AF&J){o(+R; z^PzuP+b7zqeadUmK9QRf2Ko7^iZLoTsS)H}zEO_d&Ru_SzmZGtm&vkWz}RGL`vxo3 zC@jqrwly|m3D^&F*?Ex*X^z}b<-Qk3erpIl@{A6mX-ZJi`u$-TB}aCok?y;Op7K4( z$L2cM+_v(X)$8U|ma6Y~-5eJYPqW_qj*7Gg1Lgr^FbcSzVhK9^8(J6_cd_-h{O9GU znzVjj*l2nXd{jPT$O(um!sn9vU-a84;mTw<@^Jq2Gog@&J;&{Ns=Mx!7g<69~%A-dv5|BMU^!S zr;`8;f)$WJRF+0srz5BdZYcwrKmyfRtvHC_C>l{Qqo6VAh9DqIcL!?9R%TqsaUUHA zmr=)YS4e6gaksmzjMyLRb8D0o#p-B`Tys8pVy&N%dLCQJ?GrB zpA@S#Z?ipqmAJvV;+NE+Ax{{ylkf8&+%cDaRfPHLt$y$~a1prY+z-y+^z=3a0=fnW zTH5dE?_(T9J-1px6#14A4n9|FPWCW~#y@iYYVlu9iSgrbz4>^g3){thTtBt!1z6QzXNSU8}bu| z`sKT9`O2NH+?sroNIoW)2VegqNo@Un+OFT*(+V$H{7DGhPHP1IFO)4Y7x}SnEE0IU zyUi+}qg75ybBo`Q673y|Cas#|#saHIp%(nVv9q}fM0E(Bg9RcRlyUtHJR z7V2mh%G^j+E)Mlw9-J>R6P`RlqHz{n3#@^wLNI_BDt0yu(Z|J^iBz zCZQ~YTqg6J=rf_Y-cEDf17M2HwUB%uyYVo5vl`}e54mJg>5DVriwA^2QzA=&9rdG$ zJVMzsU5&woz$Wz1DVwQE2V2Xuy?ib1wrdHPBLlnf&thx&3f6)cTjG^PSkDz4UsGf{ zXR3a%ZUP}|**9IxwJ9qRi(z@!klM4$jmz_hHDt=4&`Q1_xGV%8r=zX8L;xg#hlJpd zHr$KdxEJqK!P+zUzU>i$b;2(x%kCwuokIjg=TO=OY- z4-CP-f|2|4XeL)@rob0`XX#VO>XC2SpM=22t>ocy(|ado?}EFiHvPo0t(@|!8Mu$^ z?hM>?)|%j6gkRpMe?wos6TZCLH$*?*7PD^xH;7H(VRqgVZ^RkY;(eUL&e+RWCR+Rn z_+*OmHJy)3&`_p)gRj8L&9i^Po9I(8UmmOnx!;i9i`jSPw0X6by$au@&G9Wj?(84W zV<`ex5>_7tfG8)}uZ=tv7q5+^Uq1qIrwM_I2zM_6>jN-qEl0RJA5>DL8T~p^(Qk0* z7oCie3c`+PmwBj8w9XU#3~0&+7`#st`ifzM{tS5)j(}Q6T=zkYz%iNb`dtRoeUO-_ 
z$#VxLu@l21Ruj3g7@eJmOi0;f-jKn?r02zy;;r+CePva!S^+MoxEp6AfkV0Z;O>BYui}#`Oe-oN&l>_=L z3o@OpzmjCcx%}L^yy>*2ANiX4#;vJ0`sh}Yv*DYt0-r(m11FFb{Eb}=5teKM@NPt( zyy{CK>sls9ty%rs8IAF>)oAR!P!@eKX5w+Ow=zsti~t%wYr%gQWCeYDGcLF>d>{sA zjGO+Ltw;#!s4IsW+N25!%Vl&S@jXa?NiO9hr6Sun>uhylw=p+l9v-YYzc3R z(NS$e93A}(+}>+7;HDznZl_kbqyF?#uLNx@D5csv??sTG>Juq!V+&x318s@8yP#4t7_BetJsJvM zx|}re^ftqVcLUz*V<}|$dwnIHIzN&}hmXD$N2jV3gj;_>+$!gfecQ@j8+ela{Xq-+ z+t(IHxKgr&&SQXMbNGHyNtN`TKGj_^Q@Az899%E^D;iQs1j!zXRZUiIJiv+ zJjo1NPmLrKRYjH&@ce2^#57U8iz^8&WeZ+NwZN6HaNfN!2-TWzkpc%@aO0dhV_^*v z9Ehi>`3^W}9?k_O^f?2RjHq3&9fqUE9a`}MjX%LrljA_Vt;uncGN(jfWNkZ!6Om}b zJF(a)#A-((6L*4J;74W@jwQg#9O0Q7eQlx{XO`MKo0hfza${NRPw-{c2!Wo&8KP8g zh4S92w4K#Rc#qf8ghE(3F)x&3b1Ul*>Sy3;NnefG+w$V=ZL)aC+ao`FEKuyLFpbZr z@YSZ}vCu^1rLEsbqIujST9cDOoU$e-i*Za87D720?$8YmDfHQTI(c&%+{)M-_)l`z zjGchbX&-&df$~rf29z`c${+(MsRmGT#jz}%&2Hh#Mo<8kO)wB`>0c=GVEH8kua7S? z;Pt|B6JEhL=#|vE^XZu<%xC7YXP5wE+HT>jFM<`a`~(S&tdyUyKdHm~gw;t29pop> z<@pKY&meAAL|>WL4HGB###@H4<5$}}0*oDvYAe=6nDAJwozZHBkw$aj??1%1xO zV_tn;dLulA_N;?A>pZ2AO*|DRzAxP0j4Q1%=dbr@KYuQi9n`+qfFT}b*yO;b&UB*m z>>&Fghscv%7(`5aQVcF1wux8e!6V1C*&FaY3t~BJ8=A`s5i5#lTS9rx0{Ekl;RV;)sXC*A{kBSC!b)E!uD_y?K625eMhNJI#F@>v{=5| z21&gxjol|9L%m^wq&!65LHVPIbwR==dpP^hEW0c=_|DNsqS8A!!G9u22&|&;yc%uC zT0G_(gur093EW66box&u`?evl07gU|+f4ELexM_dc+{PO{vBEm@gW3lEiHuwzR3gDRUnaaBG^-ci-J3o)m(Y_BA7TKycqFW8x1E zweM?~H#v~~L0qNHgS!Y@!h-V8roYMewwQbm%X*T}hVwnQy1=Dfy%A}rMjHb(8j`vN%K>o(_i9DU z+*olusp(LzQWM*r;8!L#|M{B^v3VsS@HH$KVrh>~vH9iQX5g+Bo8RecV)K&}$bOix z8I66xZ+f@EW*le0SDm18zVe`p$C_5_BU+7kM=SpRE~!ntF03+Fc*aMPoHMm$x#6$FQKdXimQIR^ZKtLROF31;LN+Q(nucnrA674*bohMw~L z#?C*LNr><`u~iIx_G2lr%r$PM2{scyg^_4oVxQ7}ZTU{VGTWs;-DrI0a?5w9p}OTe zbI-!>yve?UL2GxPg-h}%5x01NFKNP=vF}J|=Z$E)*IHIpGnOp9Ec0?G#V~yXfEZ}F zChOoAPjb^s$O8x9JL;`RXQ!;}D(M&A9orw%-}$jMZw~o!pqe7X0PbaHM~$`Jgl8ix z2(L>hq;Z?eHkMRW>K4Bi0zJB!)e=&el6=R`NL)%B_o4iu4+5D}&@wV*0nb0K2aP;S zAXy8GJh46lFE%CDE5VVY?_S;`=*`?9RM?@6jB-sFpfI}W%jdcKN;5|KzXy7c@XSe;*&2M53{}@!0?W(Ls)*05?ZJt(?4W5F zxX_&KwQ-vP6TPOpi7%$VEy^aDwWC8^dA(npGY9abnNg<@xXYN|Ehl4v@86MulgyuJ z^8cX+r#Oq-=HYd0erToukXVsLq_bdBsU&XZo-%ZBKD*t3A9t4c-%Yi60Wm-ZgZ5fs z4>>L0-OtLN4kIn6)x>DICf|jufLNS9A0h;Fm{e%xA}PJc5mU(O~8<}Shm5j6BaqUt|vQxdUlp^ zNoZZu(?IK(SGK#ks9WMV%3pdI7_v?XZ0^Aw?6ekyMV}unSRxzC@hkmJj)GVM06zp+ z9OL0@Pq*;!hpXbxihQOP-SroigL)+wTAuvW)P)YzN1~w13=lP6k8s-bW$9I7l{W>~ zDr)l%r)w-14LNW4Jrg#V)%f$U8Euk$zL_gi_eICBDd&6b7PnAV1yT&@+llnx>U zCii4U#J)$#l%TEoiiLUHXh*5A0po&QCc-a8_;_`TjR4BVW#RNPObzsA3 z4Di2gWWjbfq*6#ksSwDa;OZz7N!NU5h@?gi-VnN7quqy-dGxT6znAV7C&9}iigC)f z_;MQCf+gQ^ldR0cc-b&1d>9uF(;vkYE)#isTZOvC2NA>6`9%PKpHs)IU8{d-oW*_T zrQnDfw>lZsPxK#-Bmbc>un-twYa_-DAt&cC{f?t%{_lq-H4i!vO9fV{w6|h+k`EA_ zJAt~-q3r7cJpZE#EqQP?j^26hDXMaqcY4<+gtkhUALe@-4LK#(lts^29i!~n(Ib=% zt9Pc6`3C7NU5va(YaZ%_o#de!yEaHK?`rrx$64O%(aMW@T#E`8-!FI% zH>68LG`f@o4-_gq*^IcwcqaesXIA1SIi+k^A1j69(HFelq;P#2{#1QMsD};K*LN}f zOJgl~lz5WmxiB*Z&TA~;(e!27+-GoK%;F`(<|kOIm$=~DZK{{-?1S+i)#zM;>Er{y z^^w+--iW#Br;+>~)%rdO!&bB=N&Y&UTLE$8-5!LO#O#1Ct0QYze4Fr=mKz#tlf_z? zVDUcOwE_dP3bbAyQLwk-5(2|?#!lD10@O7~M~;Ct`#ss88C)`I0p#_c#&Jo1Z_c62 z{QltE6pDNT7?Qr?n8`59^* z{Tu0?N8HLfp{$`LmQJ`Cc3QDBcQzbxCcsLCjEx8Q#!Hp%!q+eM}OG z!KYE~hC&29e%d*}1~lJ*f?+!OHhAn?jgP$(`dlzMe73jUOAi8N;YlMqJ3v8srcdLe`2$)FzUwzd*JP3g|&0ScsJj#8tCS2dsu? 
zgR;{ma{aH!dWj%hdw}N0;e^04u2Z=LbiO@|T~7r>+4P#?GbQ=G}LZ>BdA2q)8bR3ME`X5Hy8l5|nnm*h<< zU3?*hw&J=9i$UIOpkyn>InunnDjLpo#!;L{VdQMYbdH82aD7A3)Zn}bZv%0*Je+xE z;FN}WTm~Pev|WI+TxChI)Qkvj!I1BKA!*mp2a^{J7X&aYYxVLzfb zR$qW?WwWPStj_gM8jRSYWcl=a96Y>>$@VpPjJZP!%|`tp7?+T;cOeGN$jziDb}1~k zq?622+f2S3%ZSg?LaH)dNvgeH?#Y)Y2kSDV>F1DQ0E9jWa`jB^$RL zn7aq_^z0)Jr|mkEP8O=!=zo~U4&e(jka$Mo$P|>=Xw#E6e5sSd*2U(3JKMVXmDtRu zQ#p?3z&WyZooX1ArW*#6A2-Bml;5Rk5{z7|x=azjWpWcDK7U-ZuseR71y5BxF5gM0 zD(l%xnK#JeTgWVjFO9$`Yxek~{3_TcJQ8yIfTJ*5kb8rNiwpu|FxXq$h^XY|&-Frf z-J{%~hp`J)Z4bNjQO+D`kNKm5M;oz+IYNRms{qgX1){r!Jt?{k*-6Jfi%@ODr|gz* zlcJj<*x}0{H*=&=FU!}t{LcFMQai^K)qIZ(Rgt!-W<3|ytg*00iB50$-Ue=O1KwNw zF`^I;D==<%<*$#veF)Pdh^qt9m+KNucs06i2Tp!k4Ud~0Dr^07qGy=#nV>`>@5o=p z9{U4_`YwyIi3r9WTI)!ItY#x0!M8ZWK^cZ`MPH+XM}9e^T3|LQzN4SuzK7f=_;C8{ za$=z3=EzzUniF{{TAl?gLWewqoZL|*vVe`aEzYxm+0l!InVW`rCI(g*pT|gG!09ng zKL)|jt(!pm*zH0-7LIWGucezPD+7D@EvJG9Ebc8NQ5Ag_30lM{tnC=g_r&usMYqKh z7H%9~FNOXI5RYzeNc!JVHFCyY;X?0EhYgXKg;LdYz0yUDOiy&d4$Siq#VB)Tzr)Jq zM7;mQqnjntdq;CIQNZpP&3oERFV~RK@l>6oWNfO=;h(@Ac$57p{7NnaB=G+QQ$KaY zA+(y<_O#}QuzBx2qB$z_bW&7gCR&L9IT!u|{UR5>d{tW;Ycq}<)$|MlFT`HL!-3~0gM~Hy zW67@{e>;q9>wh_(^tc=1KPR4ayq*?M`oqZoSUl-}VZ8lm{$#xU+PmXv@%BrMXV!T8 zxyD0Fy#2Grvy^!I>HKMP@%DG5G^fSe%PGxi@%FhX&1v!WZBv?$7H_ZWSHYB9DcYAx zMa=C3;hbBpYE?=_T}!#5A;Ska{Kqx01A3lP@m)*p3&mbuqHXf-8QY`8yRk%=L$Gb#>^f3-KB0Idh$4>G#=tHw6WP0Nh{@`6ZHURt= zTAa}G2W-ivmK&kvI(>`_T3pc5vQX|e_EK+yH2pCfQ7Xs^Ag}jqI5lX$+S`D$nQ%w3 z_u%-O`7ExhyuGZ%yKG!2db^3Y-i8Tf-icY`b)WYWdPUeP@cc3F#O(1A{yetDd@jvE z*5{p=IpHtS0&^S~R|@7L>wBD!JF%NI{k8<=qNe_@ivA_up_$V31lmU&GNxVkwVGRn zAeMs-E&MmYe>40SVU~afJHIv5 z*yqjMk4xSKQT4HavnTkp{YvfzFsYP^!!42FqE+=rOjM~<{M1shZ-@=Vc;G<}o(MyY zOmeX2Ceg*Bn@~qF9|OJe$Ctokm-*PmM)Sw#$iYfCs1*Lb#s$F|Hkg|Ha}>buS(w|w zks4R#1H@_Sxe5$Vh=*WZ3;#K2vH82ef9?L_{KoGmKljspG?Pc^NYs4#IX#ua6X>dy zwdFn0ay{myDv!OEmY+q@zT)uI_Hy}kzk?OMKx;Y`WyD!vZ`l~xTU~!RQ{-kzyb|Yu zcxl4IMn`&>>pTL25Cz{sV(_0N{dCQr|DY@26H)}PbxRL&X}?jwD~J7U2y$uA$Cbjt z9%p1&OE8AM@%KWty`^@#c5No81P(<*g*|mbDfeZSo>`!jFEvGSlq6a&o9boKbi75a zR4=Kc@O5#?b!k|9mLw}4b=r5s2XnP+G8GK_@q&KwBhfHOvxDY6JSnu31e0A3Ryg=M zcKi422g*f9n487~QS>o|`PV>ZW8VI0}}N#F+sft&k@ zT(K=fjtu+K!F8XmpkDjz`+Zt(58jw?;Qaz;^ctRzmfIasH)4pHn;+C{X+|p~z5D~@ zilD@(72joo>Ji=Fka{0`Zk(gbWdX|LWf<1TCg`3h$ac6GA;iMM`plf48idC z0K^^e6CrNij|SrQf0-k08yrL26WPoop72TZ>w6pD59If8QDv!aP2Z`;pkMlz-%`8q%Or%Pvv?0+()FNM^}tViW2G3A1{PgrU?r#( zO3zZ`(9&&sk%y6mp4n0MF5-II^qcM6)OFPv$_*R+Aa^jhW&0xh>iH$w*i5?f?cF4W z?j^zp5;=r;fT<(jrInveYef9h?uB)JiS%?e9!$x(U*q`*P=gvNSqY2sIS2xvi-fw7 z-L#0qYY2@K0JUL>1z^hqn~7=Wu0>b@NiHRVl4xP9@+NK_xG|i;guT7< zYvPGw?_w!RIQogq6!1lDKLedTgF`-1BJCM zg-`=$qi?{h9;xy2-eH~sUVR6*CPL9G1KrX5gb`Qi_lSxE`GO)RTFzXavDunTu4FS0 zlL{!1KkyPQKia~C3$TyKFwa0V02w+D=ulu|GwZqi5wRYT0#6E!!}YvyJ*?->)b$kJ z?Uo*cZ$SqEN1n8^$24ot-aenI1y+ zz0%Z755G6S`}xo1e1yCGo}8R?`#B7^-@RAh!xPc-hvV5Xly9X%xfM%osoCnay=rbH zTCCQK4Zmx2vElwU{1hX1GOA~M7@D9R09QpbM2k0hz2X0{_vZ0Y6&MXDb?!ZNs_v7ZWH| z+Gm|4?E?>hv=+ZOai2r3QsVN8Q6#b5zY%gbb^UpA>!DcEdKH9M@d=MaV}-C3KKTQm z>VuFE_IVn|wP2KZ{FGi;?vhQelTB`*L>OxRR$QVJEe2(v_%NknnMY^fc~^d1tc8SO z*iJPYVpM>LVPNhlkJ9gQ+*sz2vCi!JDD=@oPQ1m~V7~QRn!VhU;c#?|)PAW2MrBYk z7Uh4@;TmvmlW=F&wBti~{8uT|ib?RoE{SB9TK2E24>cu?{F@EpZ&wOv$|SA!Tyk(K zxA?@^A6xr%9e@_2bX|<=v^?$jo{{uNf zIrAxNMf5o2!^Zb`Irg}jCVL$An-TrpKmA1eyUObCU|M2-FHSpA|IuaET>ta?o4O1w zfEL&tcYYm@S@f5NUf&i&reMiHEP?$g*_Vsb)5 z#;jscoE?j%ksbTjy=@6B(UR##SKYWmRD7$e;uq53lHTu3{h4-)&Jlv=37gh|TbJF2 z8{jqMXIgURyGitXPiOLaS>803*UmR(oeqbfSkVp4ukIDcP%gf6cZkyr$jSA21iJu5Na_nKZMQk{#sn zLsj8FM&)7|n(kl;o@(~;@4@dt{OOruI^XRpVVTE#6)ueuU)yfd4YM8{R$m0GFB95n 
z;^7l>4sW}n&-UxJc`rv{zJmAQQYhc^hp2s?^0#)|{6x%G#@J@wa;Us);z{4-lx?9_=k%BcF0zI`Vm>&(QYeV zZ57uYu@zs^DvM8PrIjjrNAo(`$no(*K6^Z=SvmbYb-bc=J;vL#ZsdbmPBCi;UTFSw zty~=xE@YGW*$i?jFPLOayxg=EFRXrJC1&9@aVuzi=P5u9&Z8@m|6-U zbq@OB;p}l<2bWHtOse+>>j(k1koEwn9%3^8`!1M;gXtUH@KCwayYjv$zEBb7GVdy*Tm+qR5jYcgBzcxv?J7u} zf*bLSuHXYX{=pA;e1@qv6!J4Fz#Ng z*sVS?6UU5<~H&mNQ~|6%Y8kGEW$Li?$Z4)e1W77~3=nC69U!AzQCK3^IG z=vT>yJewc#*XESFVWx1e%$}S9uZGt@XXF1cgwLJ_wKGFW$-R~+_?8j%Es8^ zu2dfPaC+S2uU?e#KYHGc*7M$>1t8Ssrzn!930thpOW_!#(K1MA<+!k&ofX=_4lHXl=Xlh*k_c7aN)Bg>E zC>yEOz>Zi}LvNgh4fRv*0PcJ)nkrg+jMhR^!K|E95o_%usa9*Rpw{+EQ%A!d91VLX zE29CX5m6!3q?)nOAP-L6d zq0nbF3ib609Uj;a89?tFbMlq!{O+yjiwYea4c^BNDy!UORf5%z;OZA~^>tkRZ6{iN zCi~y!cRr~9t4`~$qx!YlE|AOjhOaGtYf)VbtZSv5^F=t59Ropyx7f3o6rR=^%QJhx zLruBUhprbq`&-4=@T|k?d{16?dF;AN;JRP7(SGKLh#T(#6~6i^IPbhI$^IimzYyp< ze&a9FDm7g6n1kapAf#x8D3>M)|fXt;+t$xAvNgx_JpGExCSreN1|ILYJ%0tb6|fi%$%k4J!b_u&&|IA6*9#Tto#bz zEePoh?5;Q)_p5{L)6FB4-UWL-o4>h&6LY?*{2DGV;mb#J-cV(xx#!|G>L|EHbk+d# zvMVT5Je+Q>=p$~Ad;FZI?MgbdKZ%E*@Z!B@zIo8Ih2}-t%wK%QJs^&2_ z##}R&^u5GM9&#RrCqDV9BN_N1;BX`XKP2A%w7A_miPB@v9#3hG*mc-r|9Ol`O1Q*n zesK|%GkA!WW4fN*C|~HZBMCfa22% z@qCf}>)6iZ9pwoZEQpIifR0V6km(GtaKS=Xk@15s`zt-W6;jf{nOSo)3*c&MQOd7wX1lJ^QH7_|YdW@aMf-WW1-%gLo;X+3ZE%@p_k@_X!-f<<*|rsONpI zJ+;>B+)VqM`D%`Mw8wFuz>m#gn}!`k`Sxq~q@S$UY)sL5rw@RPPG4TjtetRhLe2K0 zdZ;hv8R{vY=qaD;c|VG`ulN}}*%%NUpKe^A4i3qg#DICW2t-%>QWp;NrJ(f_VFDthU0?4_N<4i*lpFSYvVVqZ-?^wTAxzg$ImO*3}-sh|9z zbnGX8$~u2agD>yLitfRSjZ16Wm!fM0Ao}%C^G>rqz&-3Y!_>b~&-?NGVMaH2>6UU!A%;i*E8HSg$ zV25{|Vg5Ovtj@DY?x;A$Z?r|8wD3>c%xWT3?(l)F!!%N=@7MWseAcb`WQqs)jQ>(@ z?E)y)skk!i69(MJ2K)?jSUznfpUroj;xoPqr}1+Jf{X9lfwL+&8IE?J?5H@?{K<?rj=wKumsYz3wRV=XDA?84ke&q!1lz?rZ(`_%cxTZc+d^ge7VLV>= zHeS;oys|}axSWf458wAx2Jd^B6yqBO$@+H*>wcif zxWfr~Mv3{BbUek?q2c%X)@)A;zha#iDCG6kv!Cv7__yK4uaNYEXcE{r*H|C!(qRj0 zH(Jr|m*I?|DL1v@X3NhFg3$(>HVT7v>t(`U`xRm=FUx|)?=$_PC=!9d-e1_uepFbs zW9<*$4)2%)Fb4g32kE>(e{hh0xA=n(XZm1MsWAG0hs~U1|)`M5OA+<0MK(X`rT}HsAomk(ckpb2|!uiW(Vi4_n zKTgQPbAYHpcbjH}G zBpX)S@?U*<+bSkQ?JZN=Xd`H>IkHk||A0s-`CGB}!^_5M`*v|j)XtjVO!-iHq z4=1mF87`;s1bw|TTz$lf9~fT?;#hoN?vH#>daU&o+R!4&`br~efcZW`bN?mRS0-8O zE3i^3Mttq>G90|}Vp}OF*5%$ia@nBFn(v7J7ue1*;RMhUt@alg6kf)8ngLtL8F)rY z6ZMmW_Z`wR4q(lclSvC{+i(f(ryIBm_rXT6>+mdi zX|K|)?d2vwvdDH!7^Aop{I`*pPy7Z&k4*oyG<&Qg{*>YM>r24o!C?`wZFEPm%Wr&3 zJD#3?0j7;DA=jdth~34Hokl{&1!2th6>j5W@woC=7*df z_wp$@(7}(P5Dh7JSIit*YUHO2S>HqlBS=YMMWI(|=F9Erg6)Wa9q9*$F!H!ucdU3P zAS5)342>*+eRUuPISCmr#jRCfacBB<)`DXu^j&DYyP<#soCSP+pmOMDT0s~=o7#%KxDSG`$e>=MjeaBkMW{3Ate zMaB_eD_qy2yhCDwD)@d5d{6dadI7p~8N~Pu(H?*HkNTX$z*~C)AJ?42$iKVMC=|l@ zK>T<|$EHsaH?~gzg7nOMfE z)0;PBTVptiFREPgyld_Ip6Geg0rYoP8=<{(oV59P9$3x5cTp-WbJE@iqO?j2ER1$o z;dfiqZ3Jb9z~G91UyES_icVSjVyF}aI%uc_y5TYMLh-_%9>-N-QczUGWu$`qloEPf{9Ebkk{bDR070=H$#w*<@ zn|c2gI22*%4D0Y`x9X#4ERKdC_b3>O+G9h-_ci93MngE8&-WDlA;yY$Hwl699dAW2 zWt{QaVdU@r;TPm7&p4C%_)<}BjzGr;ls2Y|jEAn=5NQN|dB#+k)*~-rIP*)9MS5sd zj{)5s<_lL*$ntY@j7Rv{KM_9nf9OwtQ}Vy>PwyW4|NGPbi~jT(xBkEBPuJb@|3!Zq zz*HkDp;U#8&yo>qG$j;Dd@n55x|k8)6o%mX*x^Fy-V zAwCX9RBK!g)%r2X)DLj<`6+QNeEf`#SiimpX&u6TU5mG=l45*Cl@!i#7oFkOY7eqf z9=`n8ol)*a(S%g{^R3Pp=*!;ao6}BJ{0I2WX~!yhj4_&_j<;}NJOH-iNvx#jF>Obd zb#AtJlovt*T`t3;=&tPw^(Sx_TNkgW2z=hBIL5zpbAlP0S>xYL2$v)@; zeQ@7tzz?)F*|-%XN_L9T!f%bS)|<5X{{)Oyz`}#U>36|P%ul`0Jr$2e{v@Bvz+;Z% z3+(YB;m{>VJg!4NHPMucX)|ts_e2|@IBTM*6{poi(<()iN1KmBbf;#$6I!y4??W$-=$vn>Bx;xc+7rZuAA~-m#M} zuM@Jma3QdCfgYOe1h;3q`P*UAmKQp818&~RCt_E@#1M`)2^Kmduf+25>mpwY$6#MC z;TQzOd)CZxw{>8kJJj!hP%5x}?O^+qhQV74cIonZFwq2XmJ-W@YEn3CH`o?LnGU`g z&uQ455Yq+}sTI~<)0zs#{cJ0iJsM-#pRqZ(6sPb*vLI7C<_}HZkQwPu`6!>9&)Wul 
zRnd>uW9ca3?NnG9K^cUv>x9VVOXrh6mMNY}UwI~Xr8{OAV(;A@9Zj%_et0LWk$rH6 z=;ujS&>UDecNbhkPTxyI+B#6Pt;=y z*;P+@PdBb}h6*|w#yr~&Z^=OYLqkI|kLu50MAcXq>^lcKM!eo@_M`xRZE@du#)LWI zxtb~BomRP*^vGu402HlfZ#4J8s!FiP5ve=Ie)l!Is`TKQ2}fwtk=SW|$e9RDboqmB z40Qy{GV_JrNIWGTgbn{pf-PAfC)YYR+d_o}4p!U>`pu_h15oxW%ragR>7ky79Xl_( zxRdj;eZlK;=#*>!5tUKNhyxj@kK8 z9&(Rw$Cm2b?<$P>2;nwuf<$|Y}P8yeEP_qi4MR4*(GjjVS1RjH@ z5_T?lvL3wC=?h)wG9E*ELoIHiMto^&I^z8_?^^ma?ZGZ+F3#$%nBfaOgzB7waN7s7 za?BpZq|x2vd)ltgk1?w?k?!_=HRbv5!KWFM1p*qp=eyj3SJV&nn(yThToSh?UyDJH zUw}S6hnE$FN>PW#fDXZQv&ZO|4&kw#LWfQ@mz_-y`n9uZ`KF(kPtdtpqVO3itc)Pe zEA$-AJ`sBp8V?3QjAXC@_6^%1nA1QwKG|#a{MpDo+GNqhh=LR%gF;FNpsnR%ODO z311we<8cODE2gp3aBKbR+Q}1Zdg;Q4c(&6eF3@w&Uu4l39w%6O_Z>Nqp2=(j}&4wuW{4-)n;V>*tdOHor$#!UC6B>z_(#+1pZ*J_?&1$#a zf;*uAsx!x1ztBq(an_oVWusU3aYK(odaVhJ>M$ z8-#*VSIHO50-HH>;c=ziu4uXo{@C*p0dW}p^(ei-NHX`QBxxy)U zOk5KA!My4+qSMyu;?-!ZGat?6O%>1|7PG4#q|bnJzB#*044N$SK)lsh=nx+^_o1+H zSiKQ)Le%xmj9b4L>Zld&!w%dKc@^B@53O2)AIr)iE#dUr<%!L>`Ga8id^zR?t#ZRu zUGxfF9=y=}r*Ln?Oe-`hTX6Lo%Q8jz7+79v)DMj~{R@~YZiaz%|H-&CCmaTjv*DqJ zKcRHCDmwaDyw5kUle|CJGSo3fObwA~;}(nQxhL~_->q;~Qjns7pkp9fJj%&T|5P~d_3sp=N1NCh}b;I=3XKXNk;Ssfl zkW5=cEd=536zoU+`*Y~+*RT!XGX+}O1&>Nx)_RFr=5_t*20R)TyP*R2YFvL3MbFz$ zfx`e_D#i`Ug-d9@as#Mb#)Ll)b!;rGbHMiIg?NhD#=<`X$YK$Vw?gz(=n#j(6YBA&?C=|xn2+GRHDl&uL(!x`?OseQG6KbP|0F@u zvI8Z!V@|zSd@aZAPbL2?e?odS(cv*qf;r^7#mM-#j}CR#FMTOsvW;j zPXhE)@U?OTAuX$dk33VGcND<%p;fMY=XV{>IqXX8vjfXbc5V))4q# z3!A7Tw!mO|NDqlQR4All*(&%jCl!D&%kU2HY7a$vmL{K%hlo|ed|mqA3+sq2u?lU$ z4-`|>-+Y9&+US-At>kOgii**W1F-FIeupqz%NlXh=7*i;)rbrTWXAdfq&IV^m{fu^ z#oCnlzx{A9eJ~$4WZH-ytLk_}>Y=MLW5e;Rd0;8=-6A~rfK&1@@GFiitv?kDHOt0P zE^4^HUt0$a{$NIz=Z4xxqxm&H){N_)#oCLXg%ORDS}?QMxhzxRHN~7HgU!aXF(x3H zdBc_Bh1qeK{fyA4shfs6%)X-_{;SnyfEOIfcuABA(pB)$kTFesp!3;~Q6|2UN@Uz2 zzZw~1#n*<6o3NmIw4mJ(yX-ptY7RqS#HgKr1lIjdejh$Y^n}VkKTAjyY{h)$(5f69 z{Re%37BCj(;PAQw>_v8qUel^U){PPH;@BSw*(^1!V(d>zmt)_HYc#Lr(zqWa$9+!R zxGzYvkNaF2_g?$BUz7GT#{C<(eFVx3Qqj22O&Iq%H14%PI;`iWw_EGS6blVh^Y8+j z4lS$oi~hzyxFC9&op~#QbV4l%fLcfn%_Pt`WKTWk7e)*Ta3}-W1Velaf!|XE*%xRO zXmIGlFUHt5z!wD{?J z%t(p_c|=*zt51ph!l&Bym0#7`PDiN}o97P^v~7n8zlU}cj*;vzx19pLw~&0B0KIGQ zT@_1KSa~6ucs9~-N%&B!eZCf2=C$}eLHxYLk92sUSK&9E_~9i%9E{;VwJY<#g*bKs zn_x@eXo^<53?|bfE=10`u|a3WQx?Mb*_b}S!NVf_pE!P*D!)0v?}wqtFY`G3u)Q@( zdy@@YVfpdXen831R$6(|9;6EKxO)Z2?#azyq+R8VkWzR9xT-U6)y89MZF!Krf3z(>G?) 
zdN`m`sZ@`f0h=D;p(9hQArXYL+CbHf8C^V~4`TXk+P^1k@}UesMcaxYewlp`DV4fH_s}d z%d)M@Mn7zS{i{ARw(Iqi@O?e{X=3NKzsvg+L7cVFngtk3@r50!h6Ja*)UR{*cXODh z495DxC*z)n>$vUGVSyR=ylnAAf1<}i#sSY;sQm`&86;!~e~x2&9d6wz4KiN@F29)p z7yXSY?G2T_I4JfU$z#F{B2n3?P}wXw--fw%p|O$DA3npZVo+Gff2rA3252(lh_d7} z2}1SxPHV?ZSn+Vf)?nE``HgRjj2*n4SH;k{L&p9LSrD)evctT72p&&ht&=qPvJd$o z>RXhuUe9ZhYbsN+a^#wdyP_AKZV)y9I0*0chai(OrF9HMLib?e8J=hRtvToniiWr; z{gB>UKlEs0-+He()knTHW#fWheZ5Apv0U61Cp6Iw5J0~%+u2a)=BXfI^8*JF7dSCT z{N;*LFZMv~i)ffzl6sMR>M^plHn)cCdZk8gq`UR)?E%GHfR55k8h zdMn1-17A0sZ(sGHz*nZ3)`PUWtvjXYo$QCe*8(Uqsi<-{CP&65_l8;7SFc%LW?%Ao zbr9_Tlgj-8N{>OBAI>B0yEX1y4mCy0pkN;PUfFNlXB+~3$~Qg)efoq#>iV2rw3om| zK1HJ_1dr=KPAlpQk35Hi;>J#&u}dn|;5en)i?N}-X2u|1St}6TbTO$Tq5tSMm^Py4m&J| zcoyxfGSLO+Sp^QlVn>qrwOjzTZG)Mf#kuLQAX;%<+~Q~Rh-0mD@VaLPoIk3rZ&1>{ zvgXO1B3JEEqZ5t=%tqv8Da8HM9=@58wSV>4dzcsCX%X}T+nAkhTL-lk=Xab(_rkio z2?}1t*+xfs-SZG^X(t1d*uBG@af_Y>a?$fwS&8a@ika>t1#Rz2>faaDzm)W!%D|U< z4j+{mO_^|zP&qKZQjV(>8)F$zcAKwsCB}d4N&)55D4=YQC(kER?W=^D~?7fv(vr; zyV5)&Uf+>rysHbAHJV{4Tf=t9zSlXPT>`mxgChB{_UtvQqETE7TGEATUDSmZg$DYK z@1SG+!E>Owp91YMqE&uRsTs2!!w&jtHst7`z8lcpA0p%w#ZA73Ld5Th&t~|9)5K4X z?eZQT)1{5~)wXT?j(+ve#Mh^RuqT4A-;|LQU;pn61z$fNPWb`IF77vQ7&>O#jp5&A z$o%dbZMhVLt2|gSKDJ43JK=O7p)h!l6Aw&)jY=S+qh-d$`kL=jMMpy#`Bpt8Vs^BP z&O)4Se8QO}k3V<`?CugvoiUpH*#~{b0pFS0UpnpBLFwNd8=t-` z0iS+b0zUmDwM1CS6$$wCJM8%M{&sviB=sH-pMECd)7`$%)j5b!&-+Nc&o53RjQZd- z!l<7N81?rTKv5>S*Q93Ei~_uDzRfy1jjaVxQN@Fa5ZGKX9{2gI&VVh(3W z4#zy->HqHY%3j4>g8%V(WjFsA?f4nzm2EsD;k>eo9skdtS5}pf+$P>UaNLYPnAL*A zxiE#6Y>R!tfzTe&y~6e~=mw819+!!j$;eO{7ovXdi8e#e9JWaeD@+D#)DKlU5xoXk zA-=p1D?g#&9vH{KCVw!Ox2q0z2P{u8zg4Ss;?8`R=)rqib=*I@+^OdssyMHD^dR)8 zbcFc_&Fz;^vNUX9MmgLSIQiU+zI#V+1Th`OJfaR4@7_HjD_3~wxUnc6yPF-=vs>X7 zII{}WC2wQJHDEq&1Gl2reBrp6kBhgv_NMq&`>s~o8&NI~Vn*3zR*vNKxTG@nLkfp| zAK8TOU2y6#EMU6&VhsD~HR1=dzYv@pkO=FD%oEq+*$8;tfqBiyHU!SxAk3jTM*N8U zXx@B)?*JLUMeA&ak3zeu;AMVQbt)8+IR<^#Uln%Y-hz7F^w>EDXqE}#1E0ThN?Y>$0^Cybi?()D|RMwwg3cTiUmRM%j~XDHQFyb|IY|55MKI+RZzmMmbe>KQWXKE1 zNw;5EXMgYxF&REAW;GyRhgSOnXiiPL2D54*AD)_3(=O!u(t+{~m@i%<#$-V9xgEhm zuTkKIrKN@f%GqrwKmm<3p8<#H&BX1iU_g!5gLk=n#@8TU;4ahvf0jpRKu_raYk~d1 zAz`x(883wC4jq!a)!c)VjDpO_KVo!v--jM>58GtcrBf2F8*!or2>dOgMjOlku8aYc zGg#opL&vPRo_II_Z~<)_W3La+GJnVecBe2i{H&Ev9|GeZnho{k$ok-E`*7C3lT)nw zVPJeZyax*1;d5hgM0`&DAP%(2Y}yw^1$`zQ#rI%ITeQQxaX4vhO&EPia~u?og~HIN z>%S2`%+5^wJ`TQrjakj>%()k{|L>ri$A(GGA%iwZ2SEN70zx(S(Z%8gMoVauL;fl+IEfaPSHpDozU-*NcLLQq?CGlA) z-r2)+qe!TjkSxzp0Y3aVqvv{`(IjrI+^_)BwEcJ}Xdj%V+$_4_4t3U0O%vRv$P+Nn z3jI1qZ@40dhS#^@Vz3+yr%*R`?1&;R8$vDS+KZ?MPrZmbEwv+Rdp`&(j~E18(}503 zk|%FIdj1Gux(Xlb`)}41LoTngccf|k< zVQe!4gW=IN3;x6tkKMW>^U6@qJ#c`TV0$+{BS_Us(G8DYQ;;b>&9{MPB*%`=hwX{J ztZ)I>t9OwO&ugU#=q>PU{g^6GMxb(3;(P{hUb-=ju>OUPieBJQlX$u@WHTMl zl0paX&fiPpsCc{h49#|#*Y~GvMNQOab{|F$^CC&jW9pBG1a(&qJ=HNe^i+o&!FjVe zoPQ4#zhkW{T@bv&qbe zi-yoePU|8+Z7p92n>N7K*0Nhls8xVF$Pc=3A_cLrj*6_3VVmGz{+nd|0l+XPE?HfR zPgcKuH;h+$enZ4xj`$lc{&L}2COJO`pXX<>Bc=3UxA;?~Fs5vKlKJ8({zi?dQyiYJEuN5-!;f(<{-Uy6 z@sNh&GtTcA@4}lPq)6BA*IXxUaCUb{Ho2?2B{JBr4O)u~_G`B^3dIqZ0V_YqB<{J4 zXrx!GyW@32xh?}P9oL{bd0;n;J&>C(`nHTm2cEAE-}8f_{wu+Xz=kA3bINUhW%4h- z<725#v6H1L`y`$!Wx68K4mA3ro$rF9xL>WtZf`a@oI8)SlpnrTAKsr=7QXD2AAAmP@FEK@8TR`bjQ7HE?JaEgKfdghzZ{0I z%K}N@ha;YX3jvD1R+$yz|D>5RJ5sLEum-P|OlVnf%m*m4#}(i!xpnmxx#Y-IKrm&d z1g&@YH^4;;!&8$HN)yqjCsTPjSGf8Okq@KNbr~*$23d~QFYM1A@>nd)3s-u6z3hdH zuyCpvSt;ms8XBYdC(x45fL_HC+4!Ocbe#n%>+D>45VVILW&Z--g^wKkNSuS5To~?h z5q!sE8ZPL<^551!2B~V^>UnB2`$CJB-24)vYsIsZ+q_l@^M!K^T$Sqq_~bf)yq?){ zyBJs4`48Ek$zG#ZL&8W1t&KBj9>x_()CH)|~fKP)P>-Y37x>%H>LLPV9q`&L_6$mYWL 
z^imXq+yIxafsBs*^`{U7Y`d=5KlFJYC%=N6O)-36L~HY_f49#rh;~71{d^8_I%tt9Vn;1mxzuxp*8ovshB?S$N*82nvLNuen zjRoxDEITe227=!pltW()EUD;05~DN z1O^g_%Gn7fy5=phfd4eG`)REo@K&;KBQ58gy!JNQFq{fY%Wgv6Cmm1-!Bi;c7kF#l zBGN;kfzg9!#J!DaG>$D>0%Pv4A-7CSL1KmS)9>Oa!XLZMCv@fPEmEDlkLgSo+NfQ! zm1Tw?Uh4URQ-om}Y#I%H`fY0lr+gx?yffFh`S|Y*$u3O$!mLxHmJSbJqJ-BvNxnaRDxcS_Uoi1%r}qYvYGL6ucBuZK7GrXeU;+}#U$_SkP1SV zz>@gN=*;}xcz9xppy7SEA|uFy@v;H-~RQ!2fG!`0oa7j4vw`mPClsSv5)E!IztE-mA^ zb~dh?38*=-svV#2*8tq2_xEe&%uAWvtvC$gNTVupWsXUsyb*9ZwzZ{0#5=3{xj1HB znXCRBaNfTOMpM%rf-Pb}^K>&`LyTMyRZUBEt?#C3ar$wx`?*>vWpsma9b-l!sVeaB zB|O^dLy_?*gB#rb-RfYT&*AAnBYFs+@8{d~OTvG98yYS_fk>Va!P7WMS-?`lH{)y9 zO+stC+8}7^TMJ%G>Ki>PSZZz%>qOTxab6PJJetAn;rSKtqRZnXL)#DEzu?_>hV8Uz zYvs$6q^oq`Fr6m>#_CEkCFRh{Azaxsos`Y*V;*$%1iGANqY(#GlvI>Op5e);$f(!3?yN= zI_Yiug*~=a%9YrqvMh4A&%+X=O?zS+BNu;cs}o1rkzG!0z1fd)^1Yyd)Vb+BC#A*z zjd7oMji~wPIU)s@)y^j`fx~s0bdpuh=FdQDCU;Ba9gKqjijf1HyF-q@gntpOo4?3> zLi1a2Y29PMLHR=daPu2+Pf2n{1LZYXkdi7YwHJaHW8Ss^XYxvMi6{-UesOPcPKd~F z*-+%UvR}`tIVUW6HqAc8&n^;D5lbmZKeJP*I%nRFs z^eKYa`~sSauix7>of{Y^j7Og)LF>~Oxd6rjb6Y9t=nml8+BkrHzJ}9U;GwAC#aJFn=%4l4q3Q{ z7j?w~wM)Z7y?aBmZ-L+<+!n!fVg6p*k6ydthYdM&*rN^`f$Y)+t_tszCkDuz-oGz@ z0sofpd)2T=vsW>u24M6`5AhQ1aU7Y5b!S_~g0UtDv^X}8IjX?9r8w1yo~eN3jX?$t zBJ4wn4P&2zgFIM4c;dW^#T>JKE7dWFCH$FSmj{&0<=*&YGE!>UegZW@TcU8KWBfM~ z>{@K6;lLMvKPa+J)E#-oUbC+0pvs1oDrxIYwZsoj2p0sQrAd>nLE{3Lr5bHp{x{*O z{9jPUtk9#~G0f0~eTL-<@`%LIOzK>m#u()|7ccyWk?_9_!$~e@tuZd}stwM_1}3nU zFMn^4vl+QST!@bArH4?B&COi8_41Xvki}i7!fj=l*J*KXY{{*|yO1GYsHW9^20tEv zzj-~3<#FqExCUT+Y}i48dQEq2c~Rpz^|RJ;)^(w)ntqzpbGqBzyo#15A%kS$fa8h|{M&2K574oNTk!GnV(X?K=%gIYL z;n1Jqx_3&H@~Ui|09iu>aoC{n@dr?_=!(SJP!E97+n1qyyZFvHRfJ`4D%LX`krf4S z-?_^O!))5%nRE1X`3!^F!>(TmYfdAamAh+@Geu=G<(3>)muTNcnVMg27^a)QO7z~) zFIv(LE9LtW%o9rG*B6*ea5>A6YX!PhSu`sP^9_1qr(L4o)|_*~*G@N^al-7>Dod8*D;YLPJk-9J^`P3z#HFa%fL&NWaqRUrPM7 zlrQ9nZs&NjUbBz{XTRk=RK8YvkNV8ue274B)lss3Zn45rGJNksMz01(jRNe#D^*xc( zbaJhjOZ^zUF^8}%thu)E#Z7PAhS`aqj!2k?^nIxH%e9PUp)I{U*V}kwx8gp2=#zjA z>AWJCU?q6Zq{`Vjb+WnQj_PotjRx$i(g{k5@{V+iu8#{R-0%qsCu{d#AA_F$=+soW zVdG=fyK#$^sL)xLy;Nx;Gs*e_3u};Q!5ec26fK^Cdms4N#y`^bPlZ3a+% zU|AnMyxzRVp*1=oTi69PL&|#i&({~>BY{~A%2e$g?wU?Z$Qxas-0}kVqT@<(`W?~= z^AOV+zLW=J9v2o;cKG)v?5Os4zkgqA6dN9jtlw4K|I=bbaAjoRD; z$vAHLyWYkP|0y%7ZB}U2OuT4pCtkk2K8SYVu(cMad8ut>I!$v=(zGpYGAyI=z5*r< znI||^NZ2BhO2;~qsUxma1eg{0>N-TT?Es9k@b_1==x!5Sskpw`W{qD>lrp4}N@KIl zH0E$UQ|at}-1ea0Lwnlu@7*HKsE#_im1AX#2ZOB+&9qe&2Ltw~Cm9XeMTzhWhd#+N zG_K)^S>QBX2%5a91r(;{lIp2QD!hM+i%obE!Vsw1BNHX~-utb|N;7j3DewcUH2`WeoEkr>Q?*)_O>|KBm3 zAdWxOOdQO}x#yQn=g<5bh+BM!`;BenJ)bPft8#A4*<*oIs(LcO^me%Ef!|&`=#{|} zzX~l>vm!uCk5IFVyo_riN(N6#aWXc(nwX@N7+`#*9*flQaA4mXPtv&AbA@m%J3i%c zeSE}O)-q|)FzHY}*XOLdrrCvG+53{g^YwE7-e%_ba90d5DgrOPL6GUI=JiQEfs|Mn zSx}->P$Fpl>HZ_irvW0dAF7_b2IBPjg|7E#>QfkbNn^@+aqSER2Y3;oaN$1|Hd(e?%?ItiS@e$TYZ11F{w*H8Y;-T~O^kP>qWKZJL=CG^fxya2-zITOQrR+>vi2w=z3}No@TyKpw(Qd0ab@ zA{gLMCvBrrXqqn#mpHN^auSx#Uc;?})S4T}0!pfI_=`;Id*$NTg2#fznQ?^J#{2(56W0fZsZ*^E_(Tr>^E8jC!F%Vo4x?QURbz4B$)xHrjHzWy@>(D~pdR ze^Vs9n5ePBYg@CZX;rAU39Gidk|)N=nvAz)8Coo4lSTO$|N@^k(|$@ zFHpu}T;W~Lr;Lg#rS2qi8@xVZW`$k{TLAu=SHWm%E4S=)78+a8`WINb>TvlDWh`4+ znn$#&VofxgQ=}AE1630ulLD|% zee0nBaAp838b^TZ`r_Pw^0=`ERrtmI_fj(Nl3IMQ@ooGeo5o@{r~zh5b|FT1>X)>3 zmHX)Wp?NCsV=R8pIJ}#&B}4L!sX$tUI&O_rNj6 zZpg<9(aITZXraV)5uWp*k7IRxXfoT!PKoc_4DI9FF#MgkSo}H2lMvLHa_(b**r@Ud zW5TjAaMVhuJMn;Zg7#a{?B2>+DlHm5IBF~Fnky}|bYON*f#<@vBQ1|!Dv$%B!kB$NYQM@OVubeX~1)$n1djz|cp6P^!wjPqy8+5PG%b!3^+RXLv1X|Pg#i%O9K zU%WiudZ);JU&%Cl+uAa&9bbI4G>HXYyhO?*AVt!OF=1##IdK^t?h*phG3CiV1Ev|T z^xs(zZ?xh`$hHxtkrTatfupO=Ct2beV=T`J6EN 
z>i0Bm{2^Fsu?(3=k}zz~8Ej(Kt%-@?0)e|v@@zD{@zSdwZDrf1*K^N(x@td`RBX%2 zC@Lo+G8$T9R^O{xn|770Ea9vD5LABuEDs14_@h>rs_?_Y#xWhz*X|~|DRkNpxkqk- zWiM?^VZNvu`sTPz2zAK-^?E~hU_l!^;ZFPtmu4eB`u~762{nJm2Ye{q(C;<<;W6DWZRf0H^%0*VTYFrI`N;tJ0&<)Nc3ihg~`w~ea8>91Rg{oBZ z3DPVnnz{0N`W8$|K!?;48EI##`L&eQVdY>!oW6M)=>fYC5uT`$2k8p6V%b94Bw$Fc zFPV7~Am4*`A#Lox6qp)hfkA2YPnm#P=O0Sw!;h5nP|_K6YiR~WpZ+BaKsC$VR(*~L zDyu31j(eK;!Lnk|Jx$zbbqaKep9IOWXwW@n9H{{q>TH~JFPIq)^ocI%$#T6D*NG^p zdyNkC$XLF%RgxfQk_5*xZP0v%1TIioZSxnm0UlrDCEPIDwFoNoku*($1wB9{svo;z zXThl@X}0!-S5Rl?QMvQ;{S=D#k~L`{S_M0PmrTsQlPgD*Xyv~s6!Xy@?T}AI34R2Y zL;%(E0O^V(#e&t6F4I#7b03sbZ)qLEi3-7vn38Z~Tk?4Y>54dDr%f3jFjsVgQhVpY z{|!Z3lUw%)u-CBS-f&7A<*=JbhW!ELo{bw0|F!?uNRH9(4FmBz; zcg1|2{7B+;9sk}8tSwag<>Y0vtoS*>V1BJQa0D$4viDxy!~lql>ys4OLGzqdkL70g zySjGUCkeh}R1ic$9EfRgxK!;5tW0hFk`XmSze7zks$LJ!Ms*xSt@g7YaMxqB4Zm$B z_mtj}Y*C^o2XV^e=A!btQ!V(Kw_^HMU&aC0>GCyb^XYPlS^^&m67W!d?$SE=R8{35 zMhmKXVa&l+^y!`aqC9A|hZK+YLiRcCN-nb1fe&Tek&&nhj!&rmOz_~zPYTR>qg;82 zK1RH}Yw+M91_VlV$-3|miv@N?w&5aS^d{LzQSwCt??lJpL3^^2CE$s}vkZ;E2WiFW z!b>ge?Aq?`2loza5d#fJzF1^k7(zQOTckj#WNQHE*5sX}?1S%(>dHgC6-4*OTk_7Y z`RbkiF7I{=59I-BDYHX{G-kjoDH8$Tb4 z=f2B#64qeny6qsgAswL4WmU(h!zonR4PF9)?~OobxS%dc>R?D&c3U~1^Jf%L;A4>2_XTu-@^+J#0y^*t(M&CQ=%x;SA`S)FE3PwCaoK_T{=;lK3&_ zQOcU7&AV_}#Y8)CI}($-fE$SX!=-~%*%&?3l+j{Eu;;uUgK4dvpHb@}HPB0a4wR6` z@DrR}#TmcIy!&L2jas(RJanJ=js;4!v5-U$`c6x$^A>G3J2=RIcSW5d^!70T@gwkcCTXp zoGCdXjDX6L{dl82r)P?u6V;KEljf8fW0!kWLz6Vjv7x68s+yrEO8PToqe9skk~+5N zl{tGQCso>QV_p#ME7Huk-$$ndXmTvZ`0ktI+zZhlcki&**ihV<;mL5^SATDmg)m}G z*$zwo(4TX=wqx(L>^J$FeqqS5m!TjsC5l0V(VQ>b6~nHG@alXp+7nIzqg zdzQdlh{@bHdw{v5XPEj$n96N)5CL~jM3l_zNGbY7Sbw+PMn34+;MuzV@lut1abFy3nPj8Ws$@k z%pk`yZ*x?#;%4KH{f@Vbi7N*<%dv_sC*ytpy4gbN{nDhd+Og;)A#Y9z+g5<(ZtSwU zkx#-B9PjH~ubxgPktB)=_y|t>x?$7_2*+-|oZ&yi8t@IcAY6OzNfOTB;ZczQ*K;zEwfAx-=hQ0m9CdNu62M1yc8op(%v-g+)g*dy=R*6%<$H;q71+gXF{EemXvgkf z)rTc0V8H45?OWNQc&REEU0UzYZFO*$vL1Yw3LB7y1SUQkSQCWqYydDR3M5Qtl9LT;4RSlO10gT@d7zs^Vggy>qt7Y+Rm z?-ziP{w1oWY%~vp(Q7~4eM#9Pt1SxBq#0^UEh{FCg}$f(QiLIig;${wA6%@mvh0$)a5hlmoGS|(R{G}k9VaKY0m= z;3{15=D9>UdmAj)FB-q+8!6#R+d8S2CVeio&0%<)Xms z-XzkD+k)Jck+uVNzV&KNlC>W+i2k&NDhFDe(Kc(3EW$7s2d)!8{<61|Y|cu$jeS)L zwgXnuem8MX5Zol<9CF}^MMc@gyN!ABFz3pB-z_d=Qg<6;N$R!^Y+A3AK4i*x1DBZV zon@|Lo{+dsEUKoTU^UR(4P=Phqe;WGHJ2>^DOel3L!)KrCgp}7*iy64O}}L_mCtA)BMlkN`W9{fw`7GM2bve4*V`=$v(;XoRoYbKN zzeyB9S6TzHZLw)F^Q0s;^S&_$hQ))j)#$nHuZ~)N-0Q*Ha|FBE=8OukqaNgBYMI37 zy?MSCb?(=48*}}m0RL^X7EmLM>isG4Ki~^`sE4ZOw%hfPTl$Gk_k)*`t{k463a{@?xn>pDHzbug??rXgITqA#{(*ZcCbEYlSu;IUBM$x>X zs;yB>EA;AQCE1y3@nOU77L)sjhF#CeD+V%tn+hx-WM&bGit>)fU+$rn`1vgUXPbR_ z$W44jM*GOVIs$F;)|+h&6wB+(ty^>f4fyS;&&h}U+qC`^*006tgYv=AEe2z&G0y0| zP|1$_$bjqT4XfsfZDo{E{wN0J-RL}U^E;xHOMi!E3r|`{K}^T~7+{X6=8XXa4v8$X zCrq0)P(=oN|AknNIIxHcu&oDvpIcTrwSeib7yk?w32AyaL~sV%qj|GVBHEjP&*g)7 zHrt`mi{3VC!1@OqT8=oh&}v@IUlzsMbYNEVBwQ9<(5Svn+%i#8n^#gh*C4Lgr@9i8 zgM4e21@WmikbM<+cJ7q)U94=wj{7_DGUVlt0D!IAV7-f4q_yxG>W6lTkgVYqA zGk;pnhnzDYI~AO<(iP$VTt}k*ZF#>4SA@^B3R%65qKezYvrZl}B_UiTox8b?y5a3R z(Kh9H68hI$M~<*FZbYv<3wR*Zp{^JY!|`QbuK*VVD~}a>e{`Q+9P7{_qF-dqm=hOH z@wDeedkM+L_dldN`jRu@v*-e@{M%eZi_d1dA45{sr-tFhU)cZG*_XV2p?_Fle{t{k zIJKJbVt28C*#se7S(V+n{Rihu&z;C{VtoKJqRo1%>tCJFz-!pSS0fz~Sfx)Ys+K16 z4~X}ejyfAP_^b0@S0?huH2muhYy64IA9-16=p1f*hMA#2c|vj)ToFt@Yxd?; z`eu?1YmD{S=2qg*)_zqrckQ$>PD>^vu?c(rV)w(7Ovx<0bMLi1e|h!l@GOO)qg1*gxXbEypvQP!Or(wa2@DfQ6KTSN*hSsFcaKBLMhZc0`1TgIi*(xpnpQ}imNq$Q9$ zZz+^<<;sEAGSUeWs+5AyN&h=8PbjAt7Z3(2rBEUd0?kj3xzInIgLCjxl{WPb*-^8(GmK-QwuVt)`-#NIR@9llE7 zwL@X5vQ-%7wFBV1n{p3L!cF<3~*L)yM`o97(j|5IZ;03&}&rAY2-su 
ziSDE1em)D%W!SHVhSwd+k5$KQEclX6MHvH2%14L`GjiWX`%IDWpoZ6ptu&2gOANL- z_+p}9ON5}4FlC@1ON6l9`AXRjJ=l>_GoI!ctg2nnhr|z+|`6AEJ{>(peb?4GIiS%PyL?)X@%RiH{VC z=sC72VXfH$oS?EDY8;!~fK6VA=#9_vjlMup{inu#|u z9Wk>y%mF&9wNw`x@i#AN{st*?nLZ_j7NsZ2ySt@<*wR#Qv!`+I2k|~Fv5eFeheuV} zhozXwyY5h(x@5B86pptMLv0+paddILNqey@3Hjp;@pPre0E>pXen+^i)|9E0+;rhl zAL+yj4BA)i9=xH+l(sZU6>L|wdstHzX7JzYSv#DpL$+Lz111M_I`|?q!8~?X2 zR%?lV)ZPb!2amxfLS^@3bjnUtfzc?hm~)Rn%BDe`D15-zqVW=uvipVVgU5G@r!U5p z2S^p4cd38Um9z(sV1m@{^%A6_ZVHNTg#1B{{j=qFis0DYD#X%t8aTNK`~$B4gm?G# z2S~-wfyVq>u#>E{e-j^@5;yaZDsTfkRfzfRR3|{*Or+p0(e&Ha{AbHR%BaXbJtEnc zp#OxYs6Hhk*_PWj;dJWwuVnQ|_JiIlWcynAPSP4|`_Uq@9*uBU!RHlXc1!ZDjDO}5 z3A;u4PJ(~dm;ajd|C$|f+vPd$l|Z-duRq9Rh(+9#P?Nt#)rb>;?0QJ)Eb)9P-%Vi; z+Wxb#{IjwBvvKhKYi7w&i>2R60O>a3_NoZfQ9gmBxtu$W|5&{Y{=wPG4qh~`YAWq6 zH`%?In`1(vE}y09iM(#eaBv>)uupW_2sg1pf>khDgGwR0?sEW7AA@ik>St_^$=;d! zDb|$x#>jB^>lIJ}LVG?769%33bV1CiH0x8MVFl8uAkA4Hp*!(2{E)ONy%Y@PGa5{> zK}sIu;%KrQ*J;K(@z(0EPTK4Py}k@8utn<6bigZJ6qv68WtMcpk}i|dU$QaEcuEU) zVfk?XgqrIpX_Y>T*w!ZQQvF3`;kBXv#Ml21HufK^F1hrN67j>2e^e4UBme1&n1J-+ z7DGJ3|KP8%|4~*N{o|ic`p@4P_n$7v*Z99MQ~&XLfc)oYAuZ$YgF26QMf#t=+&_Qn zce?*>`R+|D(c6`4`$5z~tM)vPmbb zC?zxHljaUm`!}I^b#xm`AvOY-QQU)Cwc>Gbz>~vDgz}r7KVU zks@Aa6fjp@jypQJS36gHu0np+2l(Rn3z*_xnQ1FN2cawrs?E@5O&%T1v0_T{UXgsm z<6qIFZ7atab0YW57<-Hv@5?!d*M3AwXvwjnO_Is2&6H=|qb=yD=(T6H-Kl*miM+}= zm!uubsU;COCS0b zD6G{JT7l4hybz$qf(KAFpEoJ9{u+aG;q`rLYy%p1yw^-`C?x5V^C+Ib~=$P~zzwJmo<@S5JU z1bZ|U;N=T72XSjkrN{ALrpH4_ zE!~eIm-wB4%M?h^Wl(V4j~-gJ^g)@ct=rm@BB;Plai!} z0t|8R^J@34^~>H@a5q#+@1^TUV_d}xncopdq^cY(qw+&->ssGLbq{86{czH}d)l0N zm`SOWq{$t;udW4e&7!AIrns^fbZKZQb+RE~2FfaCna*|CQ@G;3T$!_B)TP_ z9(HF!uDm%Ll}$ZYb&{r*La?f&UrdEt+}Q~CU$HuReDkn^)ump3{rk{)o^>Dpb?lDi z+y^fv1$-2LzL{c7X-0~T_ZDj_b!BOXrZfJ@yC*H9Gls2g``WrmHT)8=Jr)-H5m00d z@DDU6Rt0f2^9wIDn|MuKR6fq_{A^dX*7bls;9_2q9BiEtVjJZ67&Jz&ick)TBv`rC zq?Y((Mzp;y@&Z!}VD>;TuGAo-MdJg#Qq?A)WjT@|XmB%4Ozt{pfOMV#{qmnc@`pD$ z&@}CNgH?jzhVox5UqaqJ-hH&N2UZslS=R*-GwA}G0#lxQUZp#pXpRX)z(=B=tD!jy z^LK@8xpa8Gby0a&gr7wF07D)Lhy!mh5i&6jrl($MrKisN7p$rZfLTh$ z3xOfBO*`5kw`jc74U%Cp3uCgXBKBKMhSFa0INjNl)t;x4l&Gj9R$7c0vn`T zVkTSh2$eG?BIp_BzvLO1Y@>H=;i|BCNVpgG72e_M7nwWyE2vFkT67vZ>=$^=0veaU zQ7(NPxka#3w#WG*c{V1<1cq_$Z`55lKcZ9 zVcaMfVl}JRz>VENBtAdbRTdUb{@i?Bww`S^*ept_Evy`+cRX>JUe!Ih|HzMN0{{xg z5FxK(d#i2(gZT0kI)A81h)3ZKNLTu0g{8nn(;RSFL36+tKosraI-F44H^BhQQ^4-? 
zBVOH05H%TK;S!e}%#;5fzggxLK&j_TYH|&-{4m>jQGukvLKuD?VS6JtY6Ue5gcn>` zE3*=_0%M@Q*Kd_6)sCVjznBqVR|4?>iyj$8>jD)m}-P(Bd*?{kKrmA00pcDGySBD>F(=XxscvIDgJKKboQN<&@Y3P63+A*mpXFR!3vi| z9lJ9fMSa6^n^r6PtO`-C!Q5*YCw{V}R8_WUmlWn)Q*4-+;<%n;VY(pbAOhu+}u zRksowId^$$R_g8YwDY)+zY#>VxQvE;y^6PrRYL8(GOMtoDx5v1{an|&)B^jMg6qFT zJ-@v|Ys^s3j-Cj@w^##fFX3GDC$2;gQM?l;S`X6Qp2HEp#4zmyjd;W3h9x@VYin(8 zp@WQkQ}6`PZbRD+Xk8>!C?3#63&Y`&pJ5Kav{N?^%3i$c`5a{Ht(0cmeVa`Rw(4=teIvbK?-4i7&})a~C`(JT7owG!>Qo27{}*H25B<8iSdAm?Z9q`^Sxc$1a3e z?7eb_*|&~UBr5Gk3_W*RpSy!;l;GSM#dlh)RYH^(jlLbHnqR zOmxHPB2Ge`t?-dKd1hRNsc`=0Z?1c$70>6p6JZ;;YZGDuHflu7WzybI;RdV{J8aX` zDIK^}EQJFU^|*e}e082XN+)okbngTnz((s#bNJ-xzKa-ed*32Ir}oY~);}axpjcEo zmaxdi;FCA61A{{85QEFpzQ5h;LzxrgN|Gq`3NP{Ijpw>07W}ReTRPFA{X+0)u@EY~ z^#jXaXKSdYp2~b(-UKsvF1JRQN>0JnA%-RKl32Zel^ya{)=qQ|56kKxXZb zUh`ADgyUuIHn0UVzd5$hWPI+Lm?`M5r&FP`jh>cLT428Ix7sRtDNe9n1Vboz!#W=b z`ertqkeRENjujQQzC$n9>UZymQ!t~Slq*lX*yembkw-{y_4im%jKW|_qU`$fvp%lJE7a!?XKCsU@4LsGuO-{Yx{VB^ zY81!f=Zdw}{yhW@-k$IA>(7LhA^rXM=Y_T$0@*;-O8GB{Ke+?#bmOH3Cdc9Ycu0_V z$-=QXBQkxn6DAcbn+Yv5XTN%Y1q+Q=24;L*8o^qdhH^lhEjW!)-f^6%O+1CiASgw` zH-jq*&mHSfCq3}9vHguhx~z~JyO~RFv*P#Rny@)7vJ{|Q>NG9^aoOW#nbU1u+7ED1 zlv6)#V|%Yct9vnehAtsWn}{2;7><;LdG<8)XApHnd0@g!*}W07MX?5%)evDF23-o@ zVxW0mQ2H`Mx2v(1_)m0Ukg-2LyCX9KpsKKgV9D!2gFwV5{piR| z7nB4KE4{f!Wycg++=8{Au25Rk;|fx%8tTz&pyrKII&TF%mr|W zM|qM~_MguGxD8K3Lg%3A32XBLXhmQuf@;6es8_aM9F9}1)9C6}{8uJgawuqndEA=l z8PQZzen4cZfn^xm_ei5?;~&A$z*uD-6HTKyFi7TSQ_WjU4cN4v3!!aa378X>!*?T0 zLlDMy7@k;CS7qg z)w*G0$@%s1lQ*%)@hug)M$ZAnlOlMr6p$1&Ak3NDw=SCf+{DK|9 z5gQO$dxbLDfeFeZlD2^a8v6!s37Kr-+f6G?BhY}ak}0Y)rgME$q1+PMZ^d0tO7BJ` zWQ@lBJ-c!nGP=`gzkO-rc%3H}Y6DSa;@ibo&9caEK1exv{+>E8Nhu|wn}8lF_GW6z zGI)TXOT0z)V^hug_e>%?wQKAyhx!vJoh`6ze-OadTQNtBg3R-$@N?(VCB|6Al~vZQ zijy(=spZc^j%I4;UvRN92+y^HwK$msiWz%Kosor*Zzv$13!9f4DwJ(1q(v{0 zGEyosI`tuZps6XzSe#c2omcBGbTK@Az)9!iVQpATeO)$_{kS_$p0|<4K`C*KL@RN} zrhb%!pY1A?BFx6amT+oSnqIMjLbmf)K%ImLy|)1)4+( zexQ_*nb;t4de);od+_=$jYY8?ah*D0e3R`*w+DG&0AeRK5HgQowDr9>pka0$iEC?W zkzMAmjDSmYi(JVAfcfoQ%NV;Cia!|d+f(T{@^{juSl@`ws?=araN1Or)BFP_9n-f9 zXP*OzztnlYi-kP&mPjP49CNThimrP5Y!g77-V1)v=9k)~in12-qVwNRIWN+$p1;cgz8NNF24)|l)b3=jb9!+oX`l4? zWov$mdnBOggK#cI<7Zl%W)5CM{GA&x!;1A}dg-3o8aXsPxG}$3uyKp=K*<}=J^w9^ zwALcU7v_m|iu`7;%+L;%g=)FR?LJBQ1u~;<0XvUU-~d;uhI~r8#KA?1XS01v0i z@yu~Vx!6|_>o;L=9*=J^%GE#dW`ALP^H6@_&3-ShXpXs--1+RKgdppH=iT86+XR8NeY2#mW0n6hFo=!b;sqgqn{+2n3G0)pUfM%PD!mF|^ z6E$?ka9#Oo#JyR^a`0o2OKY1ak(Kh~%Y>088YtB}7prhjW%D*xtcP$bl~H6KE7gk489HT4(6eiuhz4Z!m2Kuopkfy#4+ zZsf;iWGrP8&7LiqH@M^tukvZnn>Vlo>7Taum=c={;(6cG@7XofuqRk0b8G_BM{41r zyhW_))2@hl@Q+VaUp5xZOKl*-J=wKa{3Fi^di9HE8n(i}IfD!G47^X3I)#xAo^*X9 zz6K+I)U84cmfa%D!>0axO;v#7$eT&J;aHwO&YYi_pKoACz`?Y*J7Sh{G}$|L`4~K; zZKT2-&~O6;Om*s&dL`EHRp`fkOsZv_=IUau8w7t6{(KbW$V-VYqhBu=Eh!E*if}cz z2~lREgeG$?kEe28Gi#&Ldu7x#_6$g`n<%=se?j5<0s4a>bguHET~m6uoYMT)i0W963tVL+k=%l3>01X#*mFD8wke4*;^?;?my#?Gc!hsDH> z?Za)FryqM&Te^d`1Y{c-))Vkj)MB!lUrR6cF!Oi#D3Qty${bdgG zco%eJuCv{&+7JozQ}yjBD#O+eqYig>9<@Ry#4ZR<_kHy>HcY-_No6t&MLI{ zSMnI178CFqU^9p3Wr%_v$VeH=r_M<^i03r)9MBZG`?)`;0QJ#xu29H#%=*y-BWmAe zH(5;lnQg6{{!f!@D^{y_zc6h4xxE@=|EVv1+7&+U8zEzT13?A+ThmfX3YizFDWUn? 
z9$uc8wLFy)IfXG&(s~yVv$AMndH4j@{IK?6^tE&RwEyU7we?U;i_Fg)i1^jft9Jmu zne!Ha^P;h3i0(qUTqwJfBHb1Qt710fn{Tq&b|Q@{#@La%y>foFT{1TJh-o9Qfs&3x z;e|;$_M)6BbSA}+$3qVGhshnIb1~bf_#z_&+JZ-BRvIGb>EsOanncNW#H1-!6?}gu z_X7DfF3sv(PE~sTV}w~4;Y`kOX>Tw~+6l6*f)?;tElhjB8%9gTmo4Xh6X9}RkvyEAa}Hb`dPEfVhy z0Ks0mL5#-u6>@3jeUJ?d=LGeOqSeezSbp2?4O!Qf!;{?)QFJbtpx2o zMmI~9b@+epmSXYW$Ff?9tXd)waa(C|cb1kk-WzC^aEA2mAbn}(cB^1N<2H4jePvoA z*Urw*c0U^4D(jvtMbND5KU2k?EmMH>s1;kD!`82sA5}&Rj)TrOpQ#_5z?7a>zf?Xo z3mb1_{~MQvyYJqGU60cZ(p&eIaj%v^OPU`nFROcRmOs3+>n_ES@1litjCU%?U~JLk zKhp|*@}KU;eeOupCl!jooZ~Y5($S_-LCp7`#1~F{4zT!io9x9seVAn?GQdfz89v^{ z@oN5_1t!*F+FX& z+O}=mwr#t6+O}=mwr$(Cef$0HpLLR*ot0CSR6RSVQhQ}yD5do`!afL`^OP@DTSo1I z_xHSwtYHin16uog6%5z{PkY;IQ@$j}-`-n3%*h-xIdX*9L$kpEJs}mwxFQo#ifY2r zCoGIWkVb36g_Z58$xRP-V!Q9Q6Z5!=Y0=99LdCJ2qYoDoWm2r|qnE%Kz{yX}8TKxk(Fp;NM_O77T3C_Bq{#0K9!ugjI z{E<4a*nu_6uN1^k%~!cHf+{8n)SpQL2J9+qQ5wZjnzc#yz&e4QZEc zFz+6b(T8G^+^>(%N_}{{EW~ISp=L@QS0nu}ndM9fYN_dQB!|agrEH0SH%{B|D(+Lw zIo8KhV`O%wb=hni?KvqgV{8?#r%b^H^=!+-x4l9i`KxWx-uD} zY4mz)E^Uu0_%(+eD!R4#NDQO~6X%V+H?+%E57~&PiI3=p;~dpXHzEGZ(XZB0mSm8C zEzRijMJX>eEVwrWFR9C;+6`Om_7Kdu$(=`YUyof94PJ2?eXr238t21K0P7XLfDUzF zm+blUdQ!uwg2BUKL21S@v(&l5K=JTZ3+sD&P{lskq;)#^bRP4(w^o?me zsW|NEXm~&t;U+0pShk6agwvxqCTi z(w2JT%bH(Cr zQ#YWr|6rjWZQtugzH4w zz}Mf#yb!Xmwm%hKGlp&4C6z846R3>h$Pu=bbXv=?koV|%pE8m) z9glHtcp5)^n?aZjfP)U$1Xaq;3o0);xf`o(6!!|RFYLmTEC_(h2|Bj>gJ?*^7!6d$ zrfZptu&0!`I|x7^^_7OqQrMhv;EP7?37l9cl<*>HFYMYkEWJMAFTH*w>ZRCgdwI~_ zAjz`NKdB>=;p29BSdrON$4+e8COgwdxJiy7$shLl_T*x_e$)w~2+Tc3ovj^*p<0B@ z0!7rol(;baQ|o!qTldF*1)04Tnm91> z==@2C?_!XIp;iJIyk-!7y=Q=#zINRp^mCt=i!J-3#aJDz{i)9MR(U~d#k z)KVPxcN44rGkq-v;yOU$g5&sf^z~PSQRSc7G-=CnF}zC9BbA^3SCBlh_^oJs{Ev5R zV9m64O(J9v_sP&-l^e_u9s8wsK;IFcd9r|Z)6?k-LSN@(bl_|M4P<36q!3XAi77MG zX$Zk%{TCM-PH^3X6oLwy8auU%1}Q%WhG8kUnE;%wD54(JS|zRF(&k}y!xiG!G?woQ6L(wDo1i_ z$W$rUoIa{whL+L~44f}pfv<)H?TIk3v=HfukjNT&Q!iQ3%_3|IOBw{&>k7Yp3+L&I z?{jxlDVNB3%c!&?07R~uRD>T|l4|p*qKki6kQA@$y>;B?TiejbTJp(kZi9fC>%I6g z;qymGg~>K|;moak_tH;+JJf(UwYEb^72L|Cs0#t=+9)VJw6z-TiHV^jxv%S}X7jM~pOo-t6Gz zZ(4&HD???Q=81ve_bUFB!GMP^!LNqFp?U zP{`E@d;Rx7=Y8V{W(CZA;KVqe<-v>et?fOr6UKC2w zgP?dVxJwN^WZw0PZUy<}A1+)0T zXIx>~Z5>mP4XNK;DqRsn$O+`C@?9A)HTix_>c4cQBpS9cUKv9SZA?{gPmn9Sf3H#S zax~P#Hz?Jk=O_a^Ug!gPCClU#wXRWnhi^e!nlJMY=UR;|$Hy)O8J0Idr&zW=CH=Nv ze+2041G4FPv4J00eRE>I)-WN+C}g7LNcAlji}WL4@rEQL@DA-}SKK5yixKrTW#}Sa zV{7Z`q?Ee^;$0cp7dyfm>XxblAWb89mHzwFE>ysIr z$&=+*D37_e&<#bw>?fH4z5r}9IH3i(uqBK$)}*anRpCQyPt41~@H+uLZ?-(H{d5>? z2AzOWFb$6|2|vp@I&;E|Q1pN z3%(f;1FKdgN4Obs!jTwP9BPr(l(6@L=cG@*sExu%0lN&{3zjI#{)xJ#JaTUou)#%8 z_p1}6;#pOwt?VkUfAjt1`^pMhQZo0h164U|4GNerUPA>S@0*@usCy@o`K<6qDmp?! 
z;_`O@b^bGn!?)&W#a#*HHx?>Yh|cKYp;bTm4M38L7g-|XBwn@RJpwt_Wo!E=!Bpit zGm$LRT$4=ufy|}WnF4@DaCn*wtT3%uREW@V;~L*BzkT7H-{pdR?n;_bO~J54v{g#_iwA1x;zdQ)5*+$#_~F>OS~Ty z@9KT~iLl_Cz=P%O{6f-4$N$TkZD6kZ-ZArw1nLcp{*%q`%Vjt_<;y;gM`@;e2PmgI z(70H@5~f2dCFaYZ$@*a(^po{_`oU!!FX_tx`mI(0R$EyO^Mh-K+lGGL+i@e*tQvO> zk|X5_OFHNC4W@P*t$VOAa^o4tTZhQ6;K?&s&WARCtcV*M5_;}24v za*DRL{htGIR_z24h3}IEFf37}toY$@#f9Maw602&bDD+L@X>5H6b>Yw1e{8X zSOVChw?uV-P*G3LHo46OY=g04X<_WbDiOFa;N+DVBD%NFhwDavEW?y4g5LA>34Bv0^BSNyOQm#YOl!N9(w7 ze8K-j5(S1YvT+In;i(QxRtKy5TZ-4~<@9UoHx5t#c*UA#sc~gSJrP=rx%-Q~y8kPE z4c=me=;I+HNd0a;tS9S@0j+02uL#Zu+FzW{9Rg^cw>}Ye&xihKMOU$8)T;g44jG41 zl@RQL&jHi;75yW~Q;s*&WN&Ps1z3*ZY37gXvTUs9kM1`%?@z|S$a}IJ@0CICrbi!@ z&x4K6@3Ui`DjYALYYjz-mceH#Y{=KgbG?lCRWxG7FF`)jNHlVb5)*yx~;~V zG>xTZmR= zh1S)UHvQ<-P-C;r7c=Uc#av?CKK;s{S)Pk88h@R{pL7}VexR{#AM7-+XYB$DlSUQn<*&dcRc-&ST%|A46 z;H3AOH2ukslriEEh{L`CsSj>dgTftI!X1Njx8y<{s1dFt=(pOm`-iZDqL7iIB$%-% zLxN*NqB5AV0O5jS1AI4H`9}ECWcX5c;y6@-Ne28(eZe$T#L`~`O6alfn6YL0_>sM$ ziNXck!Uba~q5`z?f$*n_1V1Q7t$jm9)=j&ebLBFr>4h<@pioYhx#u*=h=1KEOszxZ z6t4L*H3kC^^goAr7We5_<>_bJ?ED9iE)fy`l_HG<%nVWFZg`tmX3rY)8=1f zd}i2*A;}*O=o?uqKm=Re2@r6G85^p2hNffgZ)|<+j{p)>x)=OE=jq^*E#bS=z*h*i zy{+-PsF6sFv*#rrU{C-|TRw@Ps-KMLBp4^9Y7~)a@gD(jv2x?IbrIjbbm(YJ2 z74%CI_kS?t}em<3J-_-=NyWhWjLhyZi}jehafO`#tF@Yda>X zf|u2!vZtaAC!%22eW|j--s7-oE38uXL83+003Ur))^v-LMX3};!<_^Cwfq``P+Elj zwLH3q#>lf$%iE;08YElu$P*sPH$|QjOUHLGWRL%{Er5&Lq{!0}eHD_&T4E@G)ACYL zC?46st&p7UB#82(IbsG)O6AH4CRn*s0x7f*i1hf6@`Am44j{`8+_GfsZ#SRQMI^x@ z5D5uk0Gms-SHKt+uvu>>Pvz|1q21_OnE`~u&BWiR)r_jyo=OcXD^NDN6_fo)RVl0I z!>tmT(;CE$6NPwWa+yb#SpobyrXuAlwy6Ei-s+m1QNPcnPj)TBW)v}gO5yDqmNCzL zDQU)bV&zfZaZ~TLrTu=8(Wq@Uz#Ur*Skp}oKR*CSrTLHNdJP)!>DYCFQ9g$}7f|!C z=ZfIg0Hj`>e%*%M82<%xF_Cgrx$)WnkEfuD_LPD-^$w;VJR~1caz2PNI8=cV9?}$u z7i(Z7;d;o|QtuLbzubAE_Mr>s%VDLR&E~goIBNA4ZVP)o3hG<0j{>pB_;5|hTs}-8 z4m`V*%^uX+A#^VwVHqQvIRp*6m-t|Fl0PA_y{?N2sWigY=(@9^iv$*8lSi*vK~=YT zK9+QJE4V>mpEv)FlK{$ODDuYXc~gZYlRo}?UnI=~(aO+&jmVX3+vM*Je!$P^r0Lw+ zVdaQ8v*4z{;WoYg0cPWGUPgqAK8KgTd}h`HU@&vb>$l1r=37swExm`f@dNDP!+YQO}Xc^TT4!5^Lxhy=k#VrMeM#vh`M|XKUd98WGIEI)%{!fjZv`LVw_GQ zax_0gTUBO(&Hxy8%GtfQCbB|b5sf9BXKNlgSE{1z&*!`$MbA${@)Vw90E}YlEokyt zl@8y8T}fG@9771(VBJzhHVkACYl~N|PDioC)))LmOOaXX*+d=qt{q}HqhifxDcFBL z;NO*m8=fCoH-10D-hN57LP6E~gS3d3mLrggi8`-Q^m;$f+~50AOX)T;L*C6yq20Ny zHS#t?U2B}Ym6K5^MIU$V8z=JrbolQ5X>}!N**$T2XLx#3_kKY2;n#-(qTx@+LwbKY z#sfCI6~pVr?NCdHF#TFCZ>R^1v&I9TGz8O_T_d1+mt1V{J33|E=;DRKZOK(H?g;_h;X)VeiH}Z2-?lu-m%86~2whf4F6X{C!D&a%E1_ z$uw|m>PEN5^fQ(_+!Cl6tgY7@vJ*Wm189--N1VPe4d=c9qndze?zlpfr~L%!kN+mr zj6D$BMI=GO0u%3VKiL!OcYm5f(k2HSn2gdDk+tFWjW56kD@GRIzsCYp2sxsW z5;cf|8)$wbB)K&E?7Hmt-=Z3R>iLQg5{#pLmCFeY8hmp#r7r8&5lDRd1rGf*L#Ml= ztlP=6C*c)Oe|Og%^*mI%-cV+BI3Yo*&NOkS8t-$5rtzmjnb(2P-LyMn%@ZM@v1fN}WWki35G4D(0+)@>egjc^j;bIAF zg!%zFkHO$&8pyJrJDqA)N*vc=>r2sL!IU?+B+JK)Pv6*zADYk#uQh^&BH(Bh>_el z400G@$R&Y;=-o^(&iIDS>52uCAgn%$a7YHc7j*;bKPJkRaf#dQZg;A-(>z7X2#l=SQhP(U27IYz( z5Dw=byUvfobi9)@UU(OiGy}kGv4%UxYt2C^23ra;R%PXyaA{>=th!Ki|FqHadeGB5b{oO??43VVBBhiRI&13P{=Xc z|16M%3wKfMoy=Qxyg<|4yuRl4-mf}wdcuEB+Yp_&*-ZEh7hPgUD15@w5Z8-#yB1M_ za@Bgk%cvFd+;LMU_+Hl{*R9=$myO?e@h57^P&nVhcHL8uh`Cq?CAViJ$atC7{_a3z z-0xKAm1}Q?I-mp1f;B0By%L*CU>}IXt`|kCO&F+14jjP2YvL!8NqReKUHq6!A!_&q zPVg+Q-3rKk41@FW&#DIFZsN_eB&0#)K64UzRXD@VAvBh==B4ugF3r)n zE58YNL#zysR0}_Awg^qm&$0zS0Z-P-vMt>QPe#qM70cGiePq%{eqkcfOJ2fzO)OoU z+u(4YOV)z$*;J?iLM`7?{~SBlUrg7~c|?&7ZUbs42upU*lf$sswk>q4YS0DNcWYx_ zZsmjMc8}}&b5#{Mw^8nKCb5GAjH;hkDP?%0#13HcEk&!NR#mRg=%{Q&lL1rIn}6qdj_G?k4aX1CViC z%UfKsj7sJPYOc+OR^fl&;e99ywSn4Nm>tJ*gq`kueVC(nAEf&$W=Q%T9*&>MrX}vX 
zF6~(EF1f@FciN`JjsA-Ir(({QFhYgUI7Apn7MqQLESpE?o~J0P7~+rD3EXK(UhA2@ zz!EUVx?o;iY~< z(fmMK8xSw)qZDbBX;gIhw0(K*>1Od1l&ei9Kd&5%@xS=@%^kZDkBppd!*%HdcP}y% z)`PdV36pkVeO>079gKt3GL-)X&_1_gt13ktcli^)v;X8Vp0D`JE7_h@S&Ld&;AZsaVFzKtV$MRQ|TU4f9jE261dC73a8k`6>&+b>`&``+5GSb z`DN-&S-!;LY^t_X2F@D-X#e&Qf)Wn-f=WqwgA~xgL^+Co?hCUAu?>N9wY!%r`Xy@l zTw<{J?~asSPq4x;EU*UR8r~@PM7vb&LRZoOJWRJYSJB|vjC!u3& zZ8%fSEM@tJG8%~;2u$4wdp5nhoe@C^oYgG(%B!lyKhDB8NHN|<4{#^lrh_vDXEOqY z&Sud)zoJ=)ieB}~})PGIWY}B^HFjUJZMSARaNKBKl zhZiGqgU5EQ;H={frt+9P71TGo0nAntZ2GlHwJr6<%FusgC){VU)(8MQ%s^L%{}>5$ zfxu4Al|0P7VOA_aG$(&U3-=TPbg65&f`KPMn_wVLUW36NK%34@RT3%=%sj1@pD(Q% z?#}HsJd&LkcbN~$%xG+xD6m@BNS6xUPW1mJ!m;Hv2)@USHc2b@8+MeME}RlCF~9{+ z@tonymzdjQSNT==*;v5-syT(O^`=oeUR!t9RU}=>uIF<-Ga%HbR}oqJl7{1HLKYeH=enqA z^~m)@SGIuF1yiXQQYmHqWD56y|N3ev4`RLUv>3uUe5?8MoAa`xBvtP(X}1S^55S6@ zvIDg`IzLZ4;ZN=(Hke@Qzh8UgmiZjO(;n2-$yNq0XaRb8#<73(VePa7Yw6a8ZvHU` zi1bOfNmxhgwmklgdZX&+tvo7jqGsO?eF3sAybUtO1I4Q+2e^?G^)qJzgD`T{|kz|^= za&yADG?j|j*V5#LU_49|29s!xq{Z4;R<_P74G@&G2T z;{dMX8-6o^d}^X!umn}6;{2kjZH{7YLk`Yz)hj3==fXjP18IKUrs}xrAxYVR8GzWF zsVJ|%Fwfp@_?L?LVAv@VMK=ZoriEGqd`&;k398DbkRq?KY z*L1}dV}9FwoQjK>Cg{H6KjyFE7Ko+CVxBt)6sH?QS3dauN=R4TY+wtVvf8znSs~^L zRs*c6W#9m(O0HYDjR;%N#J%?!@<=CNl6k;DhMZx%g@?>=8+3D%LS;0BH_VF*s* zH?|doI{6t8F%9VP?qhuSS)(5#BQ{xLMbS;CbrGGe^UG%B?|E6)DmvV@XEX!eW+43I zMWbv46R5xQctdC**p_scdRMO7ICvyysuBk5OD$dO?gWcz+}-_HjE@m^T(%Ft3lP_j z78jTcZvyh+-gahJKJ`#ucstau&tM=mcbzia>x-7kDa-mQ&if^#v>H;*-1=y*Mr@G= zeu4*5mqlndMs3^&jI$aK%lMA5bi;tUuXDV%GGqc?152oVKI0;0;7-x!fkbGflVRjpMW{j@!Q6A6yC7N zk(AvDvP7Jq|IFUDQl!%vqM=8#KM8AT=TyiUeN)hDs~&C?hpd-;Ch8>VG94sfsGZnc zy{TmqLZj^IqttNKzWl%MI{S9LRpok6mswhCMnjJcJCR$jXK7e{|4C*{2eLKnyj$uh zGsepW*K6o4JFqpk%b?6>kmHZ&+>MB+cn2rRDM$nLRH9>YhGP=8V+q#d!QoLiGs&q% zx+Pu7sWbdtg%+A#4AmOJ9W}KH14MOqSiq>aOV*1gYWP2B z#^DI$d#YjV1Muk|cg*Y$CN8!g>jt;z$~*1|%3tqySkd!%1k%4*(qW1gliGD~uCQ`L=DHp>@R4We`7 z)8oR1V@M=ewm(3|zpDhekqKw5`&&>>qbL|8Vg?ac)Q*kon^#b3-9V{*UNy%cGgDlW zwl*wuABMY)jFisjY~`TXeC!)FB{s~cXazukET8`??!{N-%{1{+u1xBT4GRUrFXlcS zO8Zn?s9;8CsT4`Nv|28AN|gjhl@*n|^k#@KPxdc$PJdnWW%`?~&Y@E6{nphregW@h zo1(kCGdMj5z;VVoBj-m%Z-N|ATSfe}mQ8~%ZYcsS&M65VI3O!Om|#mQ?-xmNSrUjg zEJs0iMt|q7c&E`2kCr$um^fLIVOQc#PhRXx9%nYV&z+WG<`<|w9e1=5FG6X;Fz$l^ zLAmd4V6_w|7(84+5Py0zFlz@rzgt_QsY@huHAMd!P0r@Gh8|HiB6w)5C}dZn3`3+S zL_A)o%ta%;h6#<2 zT}UHZ`-!k@GZ3tH5H1NOPj6&Hi|j(Roj?;vt?6g%Q=&SFv0(6e?Hm;~dl4-4npVZb zP_{#a$LrWP^S!*eA&=>RL#6&6(^14B3HS95Sgz5P8)8dLQTFmzb{4R$5`?N@GH_5s zrVK&@(y~u&(;Sh&Mh7>3fWu^6bq)Nz|oPIV5YBkftocc{C8SHduj@&WdguzrpH1?F)o*q zFN;#BmpZQDO%pX2T1F2xJcdINjsJNsf`~|To&O_9U52Y~oVe)dZL2_Hz-lDMq6nB6 znypPtt!$)SH;tQ%iBU^RoO1lRgHd_B7(3A+`nXOM!bxI;OID&c070!2hG^1X--`zP z)hVK}-VG<`w7C_x4+J9C(XsIUQo&@~&@M*dV_PSIU6BEk%RARC# zkb;xCyP?$(!`R@MovuV7$Uxp9ywyQbPU3n{;ygN{h4 zj*5_}KeJ(8ls}SV02@{*CLWrI^dP-jRs>G+pta2rE~}JMQWl^(l&emcGB{`0x;CKT z1e8?H?v%buvJ|K^aVOYJi`78+vna$wN@F%y+2>ND22qld(+Vh0hwMS?H4E!IZYBAZ zEb_|W`p|>TdkZNuh{q#|vw{$7Hgsz=dni?C$B~myD0RjiFPKhAQSUHChH20#yHS$T zrQ{o|DQS$8ep-E&V6FOCtQcYFP^9=ZHg1N2Bl?YDdH z-On12VfWtr^}e~mPSA*h){g*|=iDa=jSR0j^Qjz@d?eTvfxMVt_VJ^rzET{dsOH7G zGmb+?ci{cxXv!A8os%{4K~zmWSw^bJ{p6H2!Dz~pHTaMhFhNv?DZ@bH1}pkOY*aN8 zXiWin%#al<{YemC?D~4aO^B9(JJUe#5qgb&JdhV!K}>(YC~LZDE97oWe7X8+E`O{Q zDS95DJ}4`sL1sTSOWOW4P@iFaJ%oPs|Ew}P2opnWFiT3TrpYK|(59QHxkGE>o&s zATpzM=Ytht$P<2;jd)g?;NpTHjGG$f9W#00HMA(6l#_m5B_hL3m~*X~C;BiWQT{jK zMowS7`o z!Q8)9UU6?#(zd+X%m-+%IGs;guiZrf{)^T>hvtc0&r z(5^-_zn)Wzo8SAB^cp5ZV(HhoQSwAh7IR0I>q_QTiud=x6`d=9FJ}j)AD@3irL@ar^+Ct zWY_bE>3U1Q61Kjo-TW9GPDn;4rypeE(zAz~2w8ZZt;MXptG44dJ~ZoyT3^|0B<%0& z9e==KBI)4B|p3V0LlPt73BQ 
z&$;|ac-xS=ht6XG$M&o;iJ@zJfx_X~?220X65TL#(#{I-;du7pHk(mMQ;37-Fq;8R z9WYpESP%J?MW#!V?L>bdqq$ zazc!cFkLC{^cCa07o{Koaz)^C+^_@rNWf-^ow46&1z9NJ+?9b0iW%*iIBDzH?C+#|Pk_>fl3Ij@j*>1eDDSQbBYr5cT z|Hr@WSNfmw>_1wc6S#obK)N{e;QjFWujC07O&_@n%U%kv$EX8Q5;`?3g4EFQ){(vg zyJGM?r;GRi0(oSDR6ADl>5uGb*aA@pW*s;V9^3UBZBP8drQ}r3C=c7wRJ$wRT}GvgC_#aW?sYzaibxpTLm+JZLG0~Tg4kHwu_#pnKB(7cNdp4B8?lU7 z+@rrmq7MA115@x50-6f3mkt~y8{-C?b1$zk-gfc8M-%^cOC7UuZ)$RJ&x87Sq8o`E zq_jEreYhE7h$dg(Jk!uyeyO$dPe4KOEG1cr=Uk(AJB#;>o()%)XLB{gUb|jR`^|qJ zD*%~@IauSsaS+XXGkSS#T#Bs6Nr4w+WLb*;TR#ge2L5WI@=ZR}N#7VB(@fVjGW%{U zN=9CbIhtkz@g=F0x-$BdP2Lz^lmCG?6D@n1^VGHj?InMw(awtX^EeMRg0mm+{||~6 zh@R_UmWZG0n*M{N&b}rXw_m8@^f# zNDf%-HXprf?S`qBp-IvU6K%=t_S>nT(#FvtxFxTo9H;GeVz=|IRNXhAgKP!fKiNH0 zy0G5D4PHVGVy$QGEvazRbua#c{1gZg0+kPcAu6vKtRqBP;3)l1!IT{g3{?X`pUC=A z7%E19G=^v+^{^o?*n+6^Qs#j?R1d_UXw>;N+?V*C5dlFiIM#gDW6qt_=F$)%U?^&=HAQR0Ica<*Y z?<#s-9v|W)hZ^wdHv8n*tNWf3l&?$Zuj(h4HIVMv+D5s}q|Dp)Q_KCYW8dJNdhIzb zEs5GPmR-_QE@Y@3lkzvEP#V3o>LHdr4X~<^v=vK0?;(27dAF*%66i~$7}20ljQvgF zLk7Ny3>D&x*pQZ-{pv#dwLP3r7APNTy3|3J$RCYEeE=v#6v;}^rf;tRwJ9&we_7%i zt*~_O$!TT^_{uB7WSl6`ypTf~(}Wbze>$Wgd?Gn=qMQc$s7|l>U5!NM9>931gx&}G zCL@LAU#4+@9KV9?o*l#zubFsXW~5xPfl2d~mDQggcRz)vZA{O@%5cIQ;PhEkHVwwT zV4Ecms|F@z={PFvw^7@xh~0#mtv5{rG0Pvkl(mg48@oosrVREv6S7Tlb^|K|9TEJJDEK71a$x{Z6zXc#b=ontN|ru5(x^dZVF z!08t7j`M>okg*fysa%{s&gncHu-WjwKiPLu#P)dn^tD^x$UlV=0j-NGnF&q%{lKVx z8oKJrBSvN$GWje>G^;>-0)F$UQH#~{A;yy5mC#e!p zgjh~TI9SJextSChsO+Q%_BD%lGfXFFeZIcQ95z<1wnqv}BuhcrR-mD4|M7{&6eRVZ zc^1Q)RAp^9BX>3h8O|o{-o@V2bdi^yt%;#|c+-;llm8jIKJ=y9<`Pvs&Pi37yA{@S ze%oLZITO*_CB|BGJLSCSH-7Q7WCHiVDZ}K0%%YyV7n+7)eq{J0L6>LMXT}(b0!{XJ zRX7=r1-FSTclb(2LtMe}(nqSEFGlKGp6hy+8aI#Fv)f!j1nt#jSXTFHOST;B@TJVG zf!%K191!k?FS3^|Lo*^5xAj+uvv&q5_Z4U8)Yl3Ij?0kX_oH0iOA8eV`)z*83n0qF zFK^q&bQ0{DQb@!rg%oFl8Ck2lRfD^Qjj)X)3LB6Zp?#;8=V`mb__nsh+JWXewiMtB zkMz2Lfg@6r4~Bu5MB*oo**+Ue@IIT(9ZJCh1^5)}vlp0YqFQ=sNh5q)rwZNMhlxA! 
zusi(Mu90`_wNc5H(-+p{l}d^O0E)1Fqpaf6)fMi$8;G4f!PIEK4c~8?9p4{A52pWd zUX8>CsD>(Ag-`py9z~bDTZaFPkZ~uJ_rm&w9O;d%VvTn4&MD^!fR3<^*hdW}%lAjP zlXyhoaNT{d_3nbSy^rWfwc3K+rD2HLmod+~DP{`5#eiqM1LkdYIPKBHv2ctVhd1x{ z<>tlh@K*56p{cdfd@R!O&NI#+OHa24ER$~eEf;z>Nh8MXx=PP;^uy%(XQPoJ9X0;9 z=yTAs;Iyjs+_(tG=rvgJ=k3<%6$eAbF(1-E1{l1{mO{hxUCKRf&sW^)*PZG`btrXR z_m+2C`lnMm|C5TSz`KQL#OLF{72>K+)=zeVuQF37)yryN8>0H|yCve9ZkVprjjy#g zW0&`H`x>o==U;H0s?Euf&a;T7yGl3f=eT`$?B=LR&OGhRwVaYK>~`(n2j$kg1vp$f zLzTnB%(iB zNZ8V!dqDNy|BNFDyk0OmkGa@y7AS|)eWD;s&T)(VqhfoJj;nI69Y5Q@?JdPD2ZXE4 zy!!O!7o@CM^YRKgZ5W}m<*)nKz5$!S^K6%;iycu@r|c&4+M=BZ3Sfd~(9`JtaWQMz z4hb-r8QYj9%2nZN-|z02I;*nh0j5$q=D!~hZhA75SoI5>sQF3kk)!TRgHefaHqjs4 z?`AU%k{{@;l5|2M5z0^{lE9{@2s%)BLXX zu1zIAI>in(eXl=eH&3f|;qTspeBU0(suuhpBk55xGad(QGK1MrE znGG|qjhAASBopj4@V~0Y8${F}%J9^~`#@$ODk|2m$y3UDPxm%^@n&-4q-oT$U`N&< zo~(H_=To;;Z2KrUT8lRkoZ1?G?bV8$TvvwWu|dO)0od>Rg9}`+6zPY zCp4Cuzd@f5P!cn3A_;#+r!I6&G~O1YEPbOr1^sQ0>96Xk z3B_?n1GmFJ1z9Q2QliuHJ7tlxF-Kw3(t)_~6jal1QWRf-Lufqch5s0>v zvkq{Cx_LGV*Joy(co5BIQ!jMm6`+S0-WTNyVKMjO{}H%jx$AYG?`?SNe&xrFe8%g% zxHY;;e+iMaweM0Y?fs2)hBjs|k%&&Vdu>bXaBus=gm!;g@yl7r<_?3K)jM6Bqm8)de=O%L^!|}uxh>*guX=v z_wh=~^ZYq5P3D(_I&e>L3&;tERC^WWd=9UhvN=A+vBmCPE>%c3b*=?1BU{5iJHwM> z(O`cd1osG1(H7leAf+KZMW6XHQti9CD?*KPN$&r#YGvK?&t2Jbn%^*=T;{XvzlcZ& z&w8p+vn@5Foiju<#zT^?gn5og~CkR zDfS%th~)rHeM;z`w+!pVoxW$SabsVIuIyTRbBhH7*EFLob?(QL+vSL-|17?tsQ~dk z1j`o%x;{5X;s{`YXQwYP4Ekd?ErBcU2!bbm{ve01*=Qf}kgL~hTTW1|Vskw4c)?sB znyhpVHvi=svjACXYB)8U*VydskYm8DV@QJ4-fN(16w=y0|Lp}*UxxxG4shGxCz$8A zR(Q*|^eew|R8_2921NtUPlVCQoK-Q={EF`@nT_zTZ=_&Xe()Not4S;|Uf~9&lKxjh zQ7;2x*3U!Em0cBfAhu!L0W2RFX~OMPjqK~(S;mfcxiZfVFs;9bFeS}xUT6K%+3xua zG4)hAo+vTGS4A8rK)YJKz$N{TPpp7WSQLjYiyGfyCu#9kjJO5XCGSc>MEfbOO$PcmMit3^jkF#@_Mx<-r z{pB}SSz!pWQkx2LA$?w|z@l~1nXQ?p{b7T|C?AQ&g0S`XpWG)(#==HFd1Sq^n4fEo zt7?2ub`U#CJpJu;x6` z%bZ-Vmhx$&`r#CH`&su{06b2{916SAj1eH&EHk{Rqpokg?KBtOOscsU_6Iw&McC^1 zmC1&p;=~y8lq+b`zJt%RJ1rp;tMYNdr~SEZz`^ z&sTS>N5oX_iDD5QdHWp&xct%@tO$bPP43}(XytG#^F*8IG~MD)c!gdporRKq78Xzbvt<1l0;f3H4JY z*@g>;KOAYTIb+Hwk{eCQm!_<{xnyZeHo^L{wFkbK<_c!8ooM?UyKklpzo%}ePO&QO z;=nah*cOIfKtttQ1zMf$dSB_eLfr_3eNFI6j#F$a+i4&L1liVO;bz}S=YMVdX3 zz=$=b3bx~B-xlV2Um}uOxG9X=vQuJ+hC0m}`bE)*z8P7^iw24--vRQ*8IV(YmK+5g z*|`kkJ6Wihq8VNDsI45tf+YKYTljS@oy(BIL0V%sE?4x@GfW#^!CbCTZcR#uyo?gA zFIA+a9`7-(6+UE=FmL6MH@)H`-i@4?pN%=-kaHi8PJ)jov==^Hdn;s(bY{0Fa54{1 zrCQaEIQ}sm_@iQ$viFFBd8HeF51;&7i?4ai#Or6qv^TUZNe9>n;EZ1NZgQ~fCFn$d zB~u-&2S(Xc=*C=Md%=BSk<9ODq2muof?}_H9MeKs75!4XEBUt$hBKBfd9VHlP^o-j zTsR{}h0;V!j9X}gex}Wt6(#I(IBbGV55p_BxCUpt0y`Pu0%+Tut;)zKRFP`1*i!G; zNT~82@q}wsn-6K)v`EqTjIv0(tQS)^4LsCno~=XBTzm*Z0}ux{gc6!^46iU()>8=# z0Zyl`-`+Y?^g*rZQFa|Y>8`P7vSHG+5W9jtYKo%2H1-p1

    |Aeimwj1YDWS&kOxv zoxMS#b?(8*UTVtHE~y_XK2lUwGh|%fbrJNB_eiaZ#aS~;J%KLCatq0X(wLz0fJ4bp zll3%zfMl8p`$54;0pDEkBRu&W+cQ=1O!sV* zbQ|xV*FjZ}eT5U=pef`6^ zWJdR=|D()Vy9!_JXy8R!jhk=|^3;NEF7e;r^yVeMP%vi{v^%(uQ6o4$E%-ZRn5Z*E zXLcdWR2q#G5XMKcUF?W${702G<(9;+Cvj7`>?qcFi6Ysgw2sYk`;Wg@mVG%{TUj%= z+dK}OYh4Z!?U&M1B$2V#gO7bb;1AKCboNSpxCN z9x{vL^r0L5A5_v098-R;1Vf?+&$mYOT{nXNeBYU{=*6Rxg%h5W?IVRrDZHu~fe{DM zA`6RPwda9Cj3`h${UpCq+$pX|8o9|NFmJmM1uZU;BIJAt=2Ip63O+>DAFsdxXN@sS zGg=gEN&2%b5Go0D&7A|Ts;vlLdqYw(Di{ZXGzSy{ArkMJBv>wC7slgvYJg_9jX zqQA$V{6hJ}#EQzj-qPLP55LX`4G#r>&&yV1N)C2f1T$auxjqY!xjBzNQh|{1ug4g4 zzfBdwmfsZ~KTdhRDWO<5 zLvoD#ugBU-@pCW)G_a!^{clO2&L4YYWHSM?PQyrY=zH(Oyb6Xx9jxj>1L~f%C1tCo ziXT3{wWn=^viKq~m!7BjoU3Cs^iRfFRXOeU4z-Qg*wv?Tg4aaf!Vgk@j4!V>!)pTo zM}IsH%PvDsRqpLOGDIYWizwQbR%lThph$IK*#2VipU&g@sSJ-zTM3~H(*2+$dx#jb z<)r0*HcX^sY(wN*me%`T&OXtDw5z1xGCSdr+s9-8Eqoba@*X_z2*sEtt!duxJ&_9P zS=gVlwxX-is$^@z@{e#?ZRW{pr5e@czfc)*BNIl$tkBV5=?>2$R~k-ZE( zMwN6dqyO1trT7#$mzSLj3BG^{&E?fQvJ%iC3nuE1M^6G9o5Xc=m$qCDFgB@(4nQJX zkZ*Ga7-E}8R3K(YBH9MfO$WS`Gm0304wf=ffog*$DQd+dCZ(!3-a5oL8tU`mRu51H z)jhiq{#bj36{n^8Q)eC46JgAgBprMQ=HU=io17ZI7k?>I{vDZR78U5#qn2=haJWd+SpYeTFfPjnm*JHH)oxF$;gegvb)KTlB|RqF zJ4UZ8HD*&aEJ#}AAtsN{D*K{kvwp1YXfeQB|4zV3(2XfHZY0rfIs+ z#gL04tO9+*0E_&0k^YqTSsH z@3r&Nz1or-uE|3h$5GW`K9xR-ETQ^$Za82gXXWzP@THHPN#wd-09h)~2w;l9gk ztg_=;o%JZ#Pn-7r3AAI_>0yoEbr-ROT3Jcheh?NiF*Es~C3qp7dUK=3I943x_J~6n zq0uF`x}gIelD*CnV+PH=^w4lT*DT|;-xS8QDuHNc{kXDFnR(ZX=cBwzH?VpMxTEqR zsL-!#F;YMVM<40A27q-(1H-i-bK)=JqN{fp#B4>Y7bD)Sw_f7s;v1KSx@rWEXY8y4 z6q?Lie&;|pWFlvdBYE8`o@SRDe-O0MWOoTVWAj-ke)037;ZIO6@;xV|g(&vHQ&pn5 z0Vgm^nZE$yn7m=$?|sFOcNOF!q2}VPjo&^1-!C)^Mq9ZA$B$$RMmqsEkr=!cZW)d$ z6!_sTkU8wv^i~YSWj`TaB9!WdDFa6{rXJgw+%x zl53I%_5xP;d`m^bD#HDr_(<}fF8X`lXnUdjH&)NS=!as&OQlGmR&SkNWAP)X^*8^4J8bX98_N&D{-~LpN&edWt zP{IIhpea2uh8$`ZG)i4hpnzME<@sl@Fxmm?!u@8X#IvH5mjeN(zE5^YJ`VS63t8%2 zdvj-yi+Dl7%u$y2LW{U730anFv{JV*+T` zqzEMA8o@|!=x0q`dM0Kcal+`;@%1fwIDUe6lc@LQXoZyLEfgADjh81~g!pj) z!S|i_#__DQ<-?*~y-O&dKUNCs)o%_h^5wpncbwx!E7W5+-8jP}@3^?hBKDAyp_vky zn5)w`(1Zz>laSJsbWjHR2t4M3k2t&ZtyYR}V%Tuj2<9Q}E=t>76M1X&G2qpE13%VC zE4$cJE`3%k<~0z0@)U}DZy$;#>?azBDT}klpQ^3NM>_Nv0#LH}$qLA%d%O#o(x${8h%IH~81XFm`%t-mi$t z)?2H-u&j?&``rarLr4j)kqi4i0z`)-;lSaODY@j4tw6ZLHVqw^8UrY)C>0NX@80h{`}vi=0M*o!ZXasQ#HlGTy;%N*!LamK`eEie1F@RZ&b;-P$j zFHyP&`Kd)%Wu>;Wp!RM!ogA>PeiP~u{(Gs05E`~eF4%FVXK2;DMYiHSZ>@iC=tb^Z z!S{XsSN?j?dg7zoNc=WP@tdBP37!xch%zjAR2{e?74?zCuzBJ;-zcxes29&E(n|*a zPYngw${%YK^{ZyhO_I+SMOWUd;ho?qLiCCACL{yoS zG`uV$JLIM|Z*IUu<>`BXH?tp;8o@i6j@MP0n1e78{F4dRVTO|CEp4BePpkI9v;m?~ zI3WBXF&%jF(sdd-u%%ZA@82e==o)d~to_rp{(^9hB0aZxK&*`iu}nVJ4l2X>?D!9! 
zLj1?ZtLM(fs~hWBy<=ew%Qm^9ID0ta%2j0wC;u?w6e+)1qPlKsvC`QZ1ATG_$3POk z`MRS@oc`sxOA=QqCx_XV{BCBK=u4x0DYzM(>jqhW4N8ZkSi0zs?zbB(Sw5o9LW~`L7-D$L(De^J>nnSO(!<%Qg9CKQ)ly(RdwjKe}WvLatl)4|OA)XErlhFG;zyIzL zZDp0e9sZgb4gq24;MIPI-(rvSwx!c|v@rf!1_ ze{XhezL8t-i~TGLaNd1gA@%*NWqWd3%RH&qrpHnqfTO%31<|k5mSGl{d`IGc-I(z# z&R+_Fj;^BNnBB0E573YM$Se6MJNji-*!vb!u%8S%wJZ`qO}*nd1{rE_^GLAmAZ0}y zH9SoeSTwu(K4kau2_YDL2QIK0b)e@nDyslvkdY!5*gB?DmFyo?UnE zVv*D>!dN$0pkl~&NUDsdd5e-V3O#VSb1&TmO+(}rF6p`4r~Nv$89r%|G>S;(!RyNx zNjgTu+5*Sev2CIf>2&Bi_>dKX;mdGxRW%GbO=fQkAVE4s~kcDl^i0mq@E zdB-`fKkkB+OgU%D)fhA=Ot)`)uV?5-1nA)Af#Kgy!JTQZKpBDL0ee-8~wm z?l^_E?2c4yX}_#4Ngx%h2{n46<6i}^b(icchW%8qccgv+{T(FHR93v0NM(z|5iMsB zg4+!2*NOprh_;Zp`FTJ$C)8zeVsY*0Q`Y?W;DOJBhMnE)fsg8t8G8>I=Pt$zf5+)* zrwC!^#~RsV6iyrT75$YEZoL`LyPs?A-4<;Vp=loS2#DB<%#gHty2%ngoIco7*u6K@ z?#K_1e8+SDW#jT_|4ClLHDtK7K}|(82frL>2SjFjo8}F0AGKcSVeIy_o4#r!19N43 za(AvWAm)`ghkx-D&fL81vQuN7{vBHcqUJ`DxlA!95Dr5Dmm-fXEfFTak-0&A_Hyi5 zspLt-_)xMCCuI5b?>w8fdWs1;wLp5ZCHTi!!u(75#$iG1VJ#S4+o-18)&6wR`~b(K zF&gC{_0h~eFie#~sa%Rxc{-j=;-dGyi~WL(d6}0E-iiEs*iz7B)%|3cfpgd>u&XMF zeI;A3ZU?W@CwesYN)srQy~_zS8oK85uQT@q?^W-(;-V&L)CV+7a?UWB$naerzgw?O zdGHoPvfa#a7({x3eGJ+qemxhDJZ^8Sg_d-2s%)MGGsaXKXc*HuM`RYX9*QOX-%63z zts1y?L)x>1By%X~RIpJiS-a5|zAX5L_%9Gc%yD5rr%5>?--I? z6YWscQ5=%R;5vE0Y`s8j5#LjHUA<2By@guB$2`G03xRrKw*+~(fjCW;VO|c%LV?G? z<^Lh1tBA&0mZLP>I)6Mzi!fBQG3%<#eGm*4KJZIx$cKx2OU!pn>J_iC*Y1d|xgnxUw2BHuaOFeZ&hCym9d_hbh} zk9;Nz6@%%jU25COr+8WXn>Dgr>Z&ev{XpSndTEa~?DJeGU`V@R7W|qFxD1l}fKFs7 zR#DW%#lb1cgBK?q72J|zK6$&IwXaqUKnzHt;f< z*{`F7p*^-u&7*$6TI7eDg<}Q3oTu;;Gr^~Z_&1%KuN2tR94`H<%!~-}rZ=&C9hM)) z_pgZI1wz9kA4RtA`Jua^@@k(iD z$_EaUCFX@;C)pOLDNs+S2)-`sng!KdTsk1C^sq4aN9Hg z-i=?RM8BCO!*#+aUZG>2{xg7dGrK#=wW=&z=IKEg;#4lvHB6yHjQ-R-*EJV6RSf-9 zE<{HA)~BS(i-zEID?}r~skVwvEAsD|j=1Z}U{aU}KcSDTzglwUX=5GSl;&j)Z$B1T zTAI`EiP(975Vy+6f>7ujO>2}j!FIA)rD%N{ZhrJtwh_F(xb7XN2|ny^#s1rsA1}Oh zu)jH3cF!`tyIKC;O+!*+oTjYMvQoK(=;4Autrg1n9s&o_?K?H)uzNs(JMO@#yN))g z<_^goV-s3Ypc@FqX`UJ+)%zLdT716PlOjp{{kS<`%F<@!56^l7z|N|lDzCmhsR31_ z_!w3cW~deZ>}68TJC=FQap!2hotK;YxP>cSWXhr4faBN^`^48K6e4+R?6ORQpE^Btrp%ptT#9wm-m+4W*l`;?kv zQ{p-UpyjlTOWAlBeNiH4z5q&otw&gMmpNq1)_QOv2AtiICRo(?A) zj-IPBbSgVmF)S4oQleTohp(l!{c+A#(U zqU#ha8uF2F0fa(v9P$brkwqfoai;g~z{>XwC!ddAh;6rtHHf-56$(+5W+`I6Z;C^t zyZYgP%fXoeqHT)GF58QYh^}?jya*JN13sw5dX}NBk5rMqM@fYQndUnr9O#Dg{8q-0 zSYU(ma05zYlS#7jpB+|i*dU%N6c{lYAtoK*PuJz7_8M2miLd#qallm3m)uC>XPUjc zF9=uLUpwIiwXtchOF!`fy*FSycqn~*R7eAfRU|S?wi>eJ__mPSFxK?gMFLX@+{tUo zMKDWfsr~1Q#~rYKWQQ$t!BLiei8hJ9|-BciCoa;xNrmkVw{ zds-vV5PD>)E62H!R|kbfNl`0@&NYYcpM;uD18;%+9OP`P5XIOOvDHvV%j5no|om_qjWS6p3KPl!i1u zxrj(Kw{|{_RDoJ-U_es{__SuvC|RKMxLM9^HP6(7^E{I&?U&=MD2m=oe~HN*b7nUE z;SpSW`pLTRY6>V*3mRzo)3@#@CImD6(3;Us`FaoM@Sy9V6jeaVgL z8o?M)$Cj}*``oRIH|%GV98j<%31FhMh=H<_E~|EWq|z{LuJaFeB}aNC#9`l9E8Mv)vFLjefZBRQ z;_tDNK1Ib#&eSh^*&|a&y4g!#7qDk}S3QLV!I@3GW)I07v!1+mxM5Grtv@=2rI+n6 zpDmc4&Bzh=b*E)hI>d>{`WTYcb{oWsB;y1*t(>FXgdL+$XKaO{M}|EXDR!Jjm1tf)g=Bd5t(|Wv*4j7Kyv* zyH;SiMkbZ<8U`bAIkVqZNfVHXwQ(t;x>lI+aj*~)xTnF2x;~fjPk^B-A;zL}1s);< zp^Rf=kvD|=1}g-IhbWUyP9;*xCE9Phq*7nkSY|+6E0T8cG34^kB{|u}xIcMFW5UnR zB0br?AUXLtD+<7!xkot#_0AdK;pNXL3Hx`TVSIcLcVQ~&s_Z&=EQA|`I(V-2v!tTs z1B;Ffhzbs6&~v2C;`pv4BsW9K>2Q1$Fnkwyg?B^f3y_4j8Ay-{5fWwMMks`i<~g?t zJeaO9*OEn1%(o$@C?)L3%4S~u_cZ3-7<#9>%cnPWwG7M(4(U*H#U+g1DKOsmsb$?h zQ3VnDv&CLWv&F>o`$0Ag%o#+f3JDrz(vm2Gj+Qxe3Ot%wCqHeAMlMvmHEskdhrJrt zCw_#Sst(;(G|rP7mXA!|={RPr)xs7@yXoIoVBOctFN7O2oREr%lxzscBVz;jyBN4L zh!7Q!b<^=h?>z&Zf3fc_niNIYr{NdgSD1M(k_)z^HfY@BqNT6aDOl!Y&_0ADU zKMb#Q&>tabpTiQ&f)wnKUm!XoJ^4v}^37+U|Hj$LG6#(>n)Qdfx!1Qf+z8h36=&}- 
z=O1|d2c`|Ep<2EpC~7)sKFt=UI2(jREF~ToOIX(|k@tNpgU(Hk3&UTJm+VTybrQ*6 zJWgk1ieCb1qS0WY(bGJd(~@c%D#Kb($jU^?WHSoU7PWJHv}x|&=Dz)|B!MXhs_^N= zc44YZW`}KP6&lVKYXl(k6|}{#GjKHP;w@Ur>bugQy5cY^uDcZ$gjlmtK5`a-``6Yy znCH-CQqtk1!_mv9oVfnkH~;y~B5IrjUr7S#$Pl z6k)G9lb>bS% z_wdoeElzAvA%0Z{$(LBj^-ne@f`g@FWxWVxI2S8m7ECZ@$+V5V=tHl-m=zFI=m>w0 z9=^=JXcN>kKFgOIL1=lhfM7T?%LngDBW{PIXA!hDIBWZlol=CF`LEvAaQT$KtanhW zi_kxOS+T?A)BobH#NlVP2-Xv9JD(Ya?Dt;<3G4ukuYxRjl^;Sa3Kx>lzyS7&L~mQr z+?j&8MMe0N`x-C}5kEknzp09l+bo#gNP)naAwTc7U{q zI9XJbebL3CStvsg{enxJul1}QYj%y!u%T3@K*rfP1jD+oa#ta7JJ&F?pq{>e=hS+a zF_`*#&zgj4^WC6K?y9_0A%JN=7R$p@@=VjfGHZ9 zF}*(-W|49E#J{Lsze>dP*RR79KJ3xBkrx1~SW0VmZR@V{iNe#+#%Zsq?gJNWvLZMR z>Oj`bOm@3lx;^q@NNhkLbxEYO$cyMDHxVe&;iB^aY^{MeB|I#Yaz~_{b1;(?-e5_zVmG{&Lf|yh8KNC#H^|6WotKcKHouDVf zA@Pg6jzB-n{q~oWh@hwWfibZLGEwdGHjvXm(6ev1_q!9yN5&rjH^T>vHw}%jW9Hk6 z9jR~F`qRfp;b~j0BB)8oF{IApNA#@ghWA7I2Ns_)>$|nJ|3=n5Hb?lhvBEZn9q_X+2%})YUXKcp)z8tI&(Z6J@J;+?<;#}pb*~T0 z1yFLf6guRY7+oJJJl(97BWxZzY#vDU3b5WTMap*f_{0L5@ZPRHtK_#V4;w$&*f{`Y z|4G(8qJDWlI#fqc_+%9>H#Z;=j;oxn=|pS|{tHI`u{B&ORS z%Y~oc{$-fqdMm?x_s;)-6j1s<#`hnS|BuN;2mV{f?mw#jA8`H)u4Ljhc4mZpIsbOw z9rho@{tI!I|Mn1Y%2@Ev*lH~PrQ833=6}r@{%!6}{ST0TP>Vume~qSsKSbfnsOuRN zC|gaOn+o_pG~^T4To3=e;0fnn!2XYsL1iuv^PLF&2bTW<+&}PjG5<2_&j+x7TR0$m z8AJTAO0;pP4mmP#GQu~rLH&GqZNcgYXz!^Bb3_tL9UisY96VpwJ9lr(lT5d$u^a4F zu$;5dJwN;U(`v4Di+j!=bEt7MR&unf(NEb})_cl3Xj$&4{3nxH;VaW;ZSyNr+C$%N z6Yrm|fc7tAX`NGcn?4!A%57idsQrtO8kE&#OQu@XR3*qF|5LANrMY?8p{=&2`R}ju zHNErc?thXTp>&-9Mw)bAZm;Q)=XCBwzr9a62_c0c05@+<0*uDxZ@@8+L*3}{AfTM5 z#u&ad_iy|SbBzV3LTAW=5w^`bYYlTbB8ClThC62YyZRzG@xt^W;47AtelvX?*`O6! z!J#+k^9_%MKh}c(u8ndzrfWun9)X@f0HftcPa@s zCWuZHF+vB?LIy!LaOILQ9Iy+~11%I@GyQNgsF)vXm~7r;=CQ-Ea~lDZo{+6<#t0*w z8C1?|&A|^J@iyfO=&KkUGv`v!ppG{-pE#3kQ;cxjFhUa}fWsUulKDoKOF^Y76!Ftx zfHsdqfpr^nnc^XUTT%$ROt9aE*yVnJ5u@IrMz~T^K-dOr1{Mg6&)=Vkwz^ThQ8+4( z^r2mQnAx>R7a}(GpfuVp#jXnenw%E- z+pYL-LO04dpQ5YdSeu9V9a030Z^LaH<0jgZRzqeCNSKWVFDHtm>y`KJOP1~PBSIP+ zT3SdZ-(g*Wb2ceS*!!xhtnqxt5(BC)YfN59V%Ynh+CL1_>_D#fH%EP*Es%e(O;sBK zIo5%_{9e(;W>6BMwJH?>w< zAC$)#)EePgIPo!Z z!3m`m&H7s~J}qKVhm#ujNP|dp=9-P$-Zks$VEK{Uze@}a_#{E$J zTKknbEN1PdNmSZ@;d-$F-)$;oKg+g96YBG;xjOk+)Ea>w3CYaj*|{;?6suk9`MWg# zi_<_%?w*q2Vjf;uK~mcx--evTYF`W_LHs*I?nWbZ>w@Hi9#(KGREF^iLw7A!nxrc8 zF0$0q2u+T3`ekzIkVm5YB+6&0t^Fx;H`xm*C3MzrN!fA3L`G@eEl;6kVWyV49F{`) z_{QlyA$9f*tD_Ra^zqJO^?{xz11iN!jIa-O_SdbSB#M&!ywZMTxj>ZTlp2@g>YW&@ z*81W-Gqo2#_g!W6sqhEo27a#r)YGvG2n+~VFi|9wK#ag`$Jbm}2`%>{n(gFd=PNE+ z*hSyD!-K=uZygVf@-b3eBIWr-TLcz}y!2NGNtec>Hj2_ER;C7QP_np`ah3gWXrs(j z^^IDa-C^(6M->Bvyzk-sVb~+LArh5W(RmN|1gCN$!nt!`BMMZOmi5`wwRr$b8*hN2 zZ|s|SixI20nKl$${@z|7dKvWoC=nlOvK=H9(VKDxeuJp8H0-Fy35njLBMlyiZu4Jl zxRy?GOmHg>P;A+)r_t>Xid!H}&U#_77<0`;%@%=;|Nhgd3LmutVJcBb^;fhSE$7v8 zV5BsC*g+7^no~~M)b9Z|3%ATSjLH+4Gf&J1{ zze7D)9C2n3owzvL;?IIaS__%=d z65;CDhE;Sj<7URn7iqLt2cu9+W~z50;X#Q zABHf!$2V7KZg@4I4>0lzmUsBJ&^9*!-ePqQq`*B8K|?tkc7;mjza+}K zA-R$x(+hjfGT`yKoyJs0pWxcCH72(+zlK|!PB5?2yMT&pl15$w(_#SyZ5>bVOtXmN zg~Kr!!XL)B*#-gfA`G9Xy<<(ej1Dh~TPTR=zfRyXRFt>SmWi$U0@yBdL+|o)6ln@S z4>FyAte<1YfF_*ACR11ON8^TX@6^Q+U4Pt|i%IWbl~6T0i$$lWELS5hMb^GImNm(j zFl83gepgQ~tW?8h)9-&)YoZnXc&|I!p3f!kQ*FAPw->bzT&RY-JHKK zk2~uH;K%ntqRL-_O7AR^H}S&Ii_b$EX~eOQA4R5fpBy6I+){P#7%#=FVW!1u#A!dj z(xdw{ZqeOP5j`Y%v}WJ{_A%@vJxbf)vDVEw{k2$S7V}pYDAHAr?iiX{*}|!5JY)1^8yRq2vqm?%-lpnf zJky*4!I*)s5BMah&}!nHBW?$EvV3-T;QP%j{#KV0_L%Ir=)$x5YpYB!n71d8Kr(}T zdQ{X*W_scrfH2tvdVBCKPtY~5T>}BBoxg5@sgyv=hrqRJ#8fc{NjwJQ|YJ6_&hd6r`P#Ew4 z=wbX!c-Btmi+HN5twz^pF$Q0G>4?J(Q(|!jo?wMI_3*q=A1n7bTQOGA<*=Jhr1*Y# z8AK83DbVh*bw3UlYU$u?qvr4XjMD;LzeQejBP}AkybK48W`*23;*6{9uv;j{H&!;V 
zk7Ek{dCP4l0irvvQ#feHbL{9ZE+7Bsd)i`h`1_f-aQU25V#uq0qMu=vdp!`Ys+n#{ zJGGUSBzg3FMf|hVc_UagqLBHiA8BL6mx`795+KbBi)cRr)sfg_(%dUQf%foqw3_39 zit|{(ri^ND7n+MyaC<3e*UWBK+rtN%p9H$gT2!`N?40Fj4z@;H+q}iGj+zfaIZ8x5 z#K`PM4CRGN8-7pL-5)gqpl4o)dHH4nVySW^ooF)KH0%=UtC=7CVwQKVYgB^6x-d-lxph74H_K!CuKAnG+S=cXL*ZDp#o zhNZg-vlJk%4`QD0^I zXp)tA=7=s7v~N}nLr5IM9k0B9U(;8%lX>M4!vNUFlw{gBsWi!)QW6wBBXe4oMh)k* zpP(McWD_#v6KvCLd^`6B=C}^#x-+_eP7Fz4X(WS!GZBb7y(~=E;>0=Ex;|ws54A_p?z_yu%w?{cAawbCB zV-uey_TvnirrP0HR|v>gAf;+~j|>DDu-O#p;tW!Q3{0XctP`Sn71dP~6Vl5zL7gU; z_cC=N9$^<*R;(K$R96htV<^1y%ff}M^;_opGZ>vr7GW|o+uVWZBcmFo z$(yGkJY=ZR^0klY&;@a!eC0u444#5Hv~p5YRWHPQ1|IcewNk`4&WfgIIB7!C7jZRx z?1nS1eWs*_E3rjPGgXfxTs-Mden)aupwTN8f3vA-1Ob7-!PRwEjZk)8RnDV!s|gV7 z@lF_Q=OcpgJXORXc<&Qr>%4zHpb(bfouizWDCnxJob~Jd_8Gb-n`r2%R;i~n$ZHbX zoRB|iAVk#A#Nk?e-y`iACfV4*S8*bWie72)O0_(urc_&1!*gCs_Anu7>9G|jL5-YV z|1kA~hPP5zsDWJC=4Yy#g|OrVJ+efT14h*O#EJtT>n7E*~)85Z2=k0me7N;Yoh172*hz&gD0RA~0y>ZhlKrqMY zWekERFoYDgCHNyrg9w>t0J}qY{R6{aKs}k1QHQeJfp$C6ZLyrqctbX!IDp## zZO)u#KiL#+*6h8?W%ZP~4Hs>EE&LNKO!)*h-ul?_(IS@50*K~;&#mCYO zw$-3Fi_%K2N?(TSa4vl8SD~J@tMkWIiqQN+ysF4-+vZk68kO+C9s07FE~y>r(36oM zxEpEf=SpGPIuhgBrxSl;n9=RGnC$~sksL|h!kbuGoRIzlpT3b+(o{VGGws25#2Wl> z>@fB1pu!w)M_54$;NyfQRBZtSE}dnp|c@UnV%t=FK^*TDg7kKift(uG3XqkY6;@fgPNzys)ANaw1EwtWkwb1%xM~jNLXLl zMy&5<-k?Y$A^w5A+|P#Og8ZMIcBwY+hugrz%z9duZ^=$3O|QRI?BR+hIf}xLN8Pm^ zomsh3V-+FI)V~c$@q}y+IsTwi_@j|kpXp$-|E!Pm$T|mg;pHG*9p>@jjDCmojV+=W zbP`ROF!|dBs4tsObR|2>%g;X}p$gqWldt6i^$8+SORP-r9Ln}IR|Vej->T7yAhq_O zP8g7&%uEg`s(Z|P8NsYgQc)aC^Q{T1^naU3KE>kFHw@2W4MY5aM#bCCUyVQ=Hw(S3 zhJp3i)jplJtA@eMKpE#o!P7$v4`F~QoStJ!fc=g_2IwSqK_S=ZhtbqQMMD~*J~uM2 zhSAJI$#Nh8c!}XqHIhg`?=nm3)4F1lFZRJKv5L8Ag@!8MTwZ{dPDiVRz!rLg7gn>t zaO$G`en_&6ab)$=8D!c6X*lbdlxO|#gEhDj`Uf*M0|d&Gc6d;pmCuw|5Hmd8bTU$t z%rqdXQSHoDsv_`z$Gg2Bz#ri?7OfH#kCIZZ44Yy&s0HO{Bid#oyjp{BuZ3sIYV2Fq zxAis0I41kOu9iKlcmYNb!J@_jI0u1Q{380RGs(u%bI&eAqHzg;?9v;fsnlh$|3lI> z21nL)-HB~mGqG(aGqG*kwmrd2Y}vr9>&)V0y zSFh28_Far+*ZlTf(~F)Vb8U_U$9=@u$^skm(Tv7dw%{NMblx*`t9qp z1TIWrL(SDZeqEf{`txf!T|X)2@aogPYNLb(Cg|$|NIOw4DEI`Qo;N6X1T`w^@bA9w zU#lsbJ~4*F*$_pIe#fR$Wy%>SG=h|bZMW~uO-z(LO~m4z8>Z20ZN(edrE{OeQijv9 z>Cy!iJ}^=wgZBW9?AE+6G+l5E>KFeK-(FL~@gG52_zLkz)^DaRjYIQKlkcx7!`JSg z*~MsB8&CU(iZ#ORskhCctfhHlI@^%G$0jH$*kVTSC zv~xqSB^{S-zp4usITAW>lmR=c*AkT<)pAIs(M(!yBK1pQ{ zUc+~HH9lJS!wCPjj^{G!vR4X6ojim#Qoomrra*J2K72pObMs>rT}5A92QZ|A;0~M+ zT7)}F#x(gaF_Q5pI6MRbfjvC`Ke}6?0I%ru`PzSq2kiaQ1z(88+&oa43XOzwsgB8k zv_v$ZS)H@&0tdmdR|I9-ar_Ov=8leW)>n=`l^<_{EV?gmiUG{~I49E-`5%Uc!Oud1 zgxV-22EN>*OcU|@<4_`O$m#EoioDkJpEV^3u8XxFHew5Ns8>EH;YEc*yPqGzmhWFp zKP`FjgZ&*6NbuRS-YZ4_g&cMRep>P}O1>!&!mSwe-#%dMT>GASdh%z+hmG7mn5gs5 z6+O%#u*IeMOO)jjpRDXN@5THBg}=ZQ`2PsI77Bg4vX}AnKgP_Ee+vXnNNz`LQU$+p z8X13L?A%}ol|AJ@*7{`rW7Un6##_gfFWx{XCpJ@ntp2;an4;cZMpK|(&(U10=1%gv zOwAglrC4oC09UE9%dw83TG^AiKpoHdE1Ewt6tAA&`vPO3uh}YJOONNuzJpq8d_KNxcBOo_>KEX( z0vM`UJp**K4dbNQqZr2q*I>yMya&1S0Dc z0TFxR|Hh>8q5dv=6^dOR1($f4JO%6qFaJONw*lxRa@sYQ4|FKVv)Ve&n*v0A%@&H` zIEpNE&51O&<^A`TrMTAFT#(6KHnWL?_}N0{P*Qq-qAogZRzx^HeoR_rK6&|7dghE= z5InfNSozoPAF2EI%*^Dj7WO;)Z}tBZc=F;CLUHo*;Q`@y6P5~1&NQj~h7ro-21$OU zm-|PeHK381%jfpAT!n0FB>hvVgZD^TLpf`Wr!Yi5IO%dq5dgEJ1P0f+X2cx}<) z0?T6Dh?U11ZgzDoz=)85X#&5QfwAtaM-#39ri1S-9N2JDsoz5qfie+iPh#L}D)8PQ zdIA*-To>=#Ldm-qVh#E>4H&aOPchLZd9NGz+i^K}=D+@s2ab?TNKY4%LUy};Ci6NK zPe@O?Z^M<2z{+pa)Ov>nl{5`Z?wI|4G${PAC^Vg8O%XMqwOL8$KV0p<0+jX#_J#j+ z38O9gYqL@)Q70Uu2?w|A&zL!eee-|~RBE*T$=TZCy=ZfYa&C5}V^O&qTzGc+HfR3c zB4el{&>sEG(R=0mjt1ir=KKOGiT3*A$H^M6f|Y-s(O>60vW|axZ{r2 znVR(@OYwy@yKT)}NY*c$5C}u3qcb4c)-1E&t}92hxfoP00)0OWFi$-a8C^zirekQfFaZqBO2;~91JZOO7a=|jk*T6aF|qSK`x 
zWet2%FCQ$>!$4D7JPKVv?gu=?k1OKxhYOrw3gSh;p=CtYpMw)9+v{T-^%m04ym1lJ z;`V)H)zfx38IPRY(>gQNcnQw+6kkGzf>=J3@*leqX(1FUzUpz31dNQVm%tOFKYp8A zu1zRwyfw40sC)CAXI98!4+5QDJ_*WhBfFVo9EGx;y>WlX*LT*M!XPY709DVTR=_Gc zG?F)DvV+m?ugZJRZp}_;BA%zv_$6Cy-Dh44ho*VhDdS0so|S=nI96sm?`Rc9P3^ue z-mX8ets^0!o#Aj1rFvGb*)b-~r(j!HDE_&TBT8cy;VIn>afsOV|1^d z!is}&43dS@JPcX^KEa%URAg%j)mJVyxvQ|8*_ZwJaz0HFs|b_``-4#R;b=mqOV z{5mo~UlU^sv~62?U(@n`+$A!KWUA(BT312mFJPWJ-hA-{(f0V_35HiKCf&(u`V9&u zV?TN&0yXU^!#6^XKqj2lzd*XR#&7e5IHu8RfdG*XPN)w&L4~eU5zumR>h21=$-PKy ziz;S@zhZN}o(=i0Cn>y7`13s@O`m)NPoMS5 z?bV&x=7Cme_Io+S5o`B*^l$Uu zA&lcV)|XDtw2TI@2O}`!sg?>ZUIM&&NQH``YLPSAO@DM_VEx9>0DhbNAxLy#(?+#B z9a)58p%**ftY9M-?fzl`4{a{D>Y5Em%#|W#! zC42~r2jm6&iY1yWQ|MS^dOx?>0X|uR>SdzKb@#LuY03d6)3)d+c90H+=Qps;zqPsm_u?8Hbfl{T~~xU>Nuv9^cWYW|B*g0JEfu z1KGl)Zyidna@JNzI^Wdw^vZ=%=l zRr7sSpLeJeo!C7o!ZHQo%E~eX@+plZU-$_bZdEBR&^>%uOVIrd7b5;kB;!+0i6>Gb zKCuOUxX-pZzow`MJPw?FhB_!bE4S!b;i4XgNW7jlsOP^xdEx|(A&SKF0yiOZezbV` z(ZuUVZ#y_4MGVKgLj(=I)5cf)jPT7d(L2VThnpnt-wMVm*Xx&YYkbB{wqqp`0@-tr zoQh43JkG#zkWd|_O^yKUxH98kYI@d!3H{{Dpwbgf8c1JlTeTCWcIZbZX5ef@{QsCW zlM(=4Ot8N{`@lDoa*ZR$K=B+^3K@#XUlJ|{tD06qr9Uo_)RDfrAvcp=rVxQ4Gz@6i zX(yEOLTM$yDQdoYXKIf8&{X#VrpSw6@6UGEb!E0PxO#E5s`UCU+G8A={A)=#*xZ*d za;Vv~p}!2bZ`4i@lZZ2G#lxqu(}9d#TolZbJn7`fl61C1vJlB>gvv=?l{JztXOT8U_T`nlG6DG}FP}$1!Lz6Y;~nGtY74*<(Pg4$o<8l$@|IW=A7;1yP$0*2B z4%1j)H!@(D^=GA~SfKySc7x)~8c1;#{&;_%StgQMDfQMT7iJ|>rwl@8XAGL=eE3#; zmM75rm%$jEYZbAxaaE-8ZP?W$1qFz1l;UBDW1oVuMQEP_>f+g;SJ?Q9ay!2wp>CR8 z-*+pOL9MSFo97y7a)5tPv)U_4;5)*#yc*Ox)K@{Ml|A1o6ihs9r!We`fy|n;_?H93 za|+_q02sbrD-U?Ae=N(~C^x;U1MA!`LUic^op^B3O*Ass?%F*z^y~7$OJ~95Yc11N zEmz@`c1SuMQ(Sx33pa-EFD?>_9nP6`KG=Ai*hCX<`_3KtQWi=hmHE1z1`y|;vnW7? zI;~XRt$9gPhw<5FAyZ2`q^i|73VbL5shDE^O^EFkhW$orc#fR;;)s0gh2~rAhUf2w ze4X{r?6|6jQSn1nQoUM<_BkBo##9H~IcI72+YtYL)iMn_9!sKC_OBR2R5%KED25GX zskLpBs&O->ln6_A)C*3$lAGr2N6T8MAQ2xwOF$B@5@Ffr+yQazb6i1Ru*w;pNd>Tj zHIBeeh@nU>PR1$(U(57cXh56)a;h&*vR6`WCx4agHvGjB!9;e?l2h+ zJKzx}uyGZuM`HhU@gJ}*M-cSnoKnPvyZ8;V@T(Kp&w4Cz>a=im={%rI#SbC;Wz<9^ zwF3WpHw6XuH%<@--4_8!YBl6cZY4<)Qp8n@LS_Y^CjU43O^QjAB&f)(ddldRw~-z9 z-{Hk@ihn_FJHNC6jO0>61~`9;KDBY;ie_rrqM9rMke<#ljW2D!{#Wd&qEZR9ff8c& z?1K?~6f$Vru}oAm-n3L&_R^|{v(BU5yNKnSHs~cUXJLZoUZF!^IebxZySp{}Qc0Rv zyw@$kfOa!-!^g=r7!W21XIyeEbH{!zi7eh?D-yWfE;Hc--9|P8v0H)7JCZZqoEyh_ zZJfKu%{5Wn-a@r-T;4j%Y1A+Kg0mLOPaTQ=@ZAoGSWiOiyUXN$UtQI!hRP%qrjJk| z+xo)bP#nSOZC75l zs~ww_`CTu;|H@${G!b-3@gDqJz)eoGSku2{O))zC9dZDx3Qj3{VCL`CLT7HD_&c|X zi!fwSw5$~uzU-o(93&;smzEGC8^BA5v8EAWQZTHxllIk)c4yUSs3P)>%|lK6>v@ob zYP3RCblDbkb#&XRj2@O!vVIqB0d1_d*{<1I{Cvf%yU`L;5IzC5*VRiLt;bt=0QLiW}*!!RSKQZ$Y7n*Dm1Ua8;Xq429LtE5Wox4(Unp7b?@|le~px z%t7dH^Ug#E&PrO>T$4Jbg~p5yu5-%UbOgU;0B>7;N_qruLPGZ>e_MY5o_nQp3Swn{ zTYfACSq&(PDp zg`*p+IIKdj>CFI!f)aMwHj^LkAH)G5TK|=uO9BnnT5aqZ#Z#0C*@aE+)e1fHjEmvE z<)0Je(_0!}g5fPjY8WhBdDjCwJ&*)E@;X}>-J9X_(6+Mp){`cUH(-zTLzo)(GiG3y z1PA+5OlBae@-jtIyxZJr5c%u*T|t|-y_I<1?Y=ULh#arHMOL2-1$=u`@lIIQzFx4% z1m%t24Rwhr*E_GLqXLQ_c6}+x_X}i%Ft{V4aCOH>F{Z=cOcLNMi0*JhWbbxR!2HFC z(*ov&LF~TDUQDhW{tQX?23ywq%qXwsQ*g7xZbnbOo$j_$Y~DpMH<&kWelP}&fXLjH zn_COYqcN3gqmtkI-H6>@`okVCuiG+=7=o9o+>`kuKV5&=g98{cT0=laiou{LPI*gOl6Icvu6}L9H<~H7q(26SL%FjxQ)6bL97XWX=M{OK; zK4W=Y)z_Fj_4z_%$>E_wSPeozP2IK%*=zL?Jg*7ht2ntm7@6qnJO7XR|3~Ko|D*10 z6_$M2uT{rq#$c@3bjNuDi2tMe|0fh&Q(lc(5hk`#*P7nj7u~EM>pb7e`)@DR{Cfsw z!rpYpX3k$MxZe23W~TWSFrM0DPZ@Z~a+t1M7=>1DsR%(=w-)}yhZY+C#3SFz?w^hb zQoae`?FL%xJq%~Z0>ZMTKc7NN{sz31!;YU?OEn?!DB{~Yuzxq0%~ zJ2E<)Wnx*FWUJ!u-*wyTkL+%@H1fp%wY3s&5}8I}R2`(zEI<+>S#<^S9}d*{X>mV7 zR!EUd)vG$Y1r-V4qszD)hx8aO<*|1 zP{qOX5*zZ45F*)#crKZgRiXSeT&NX{?k&mn!;F(V^8W&ZC+gqzF*~YF1@Q@%Z+_Z< 
zL)c#%WC>UCW;+iK*xplbAKx^X!*c^LX1>}yp#nv}9Et<_E?@ks!EJl^)|XLhYS0Z1 zrv}_xm?u64XE5H*CdqH${Y$QI<^2WwXVCpC-h3C{^oaZk*5)e$j4cI(%#oB~#7+a+ z$3{&Bg0cIpk`E!12vVJ~Pw`qt8zk2O-x_?U5pkTs^(*ZpJ&^{)kY~88Nl1M;2E?W} zxvT}*`1eQs3ptk&QFzWlWd|8MWH?Zq)+> zParV(2SYc?{4ByFBDZ3J%541N$aahRTmxd_tCDu)eeWwbtXSO4_N2j3Y5WpT@vh*C(>YQK5n5b>UEYc&=C;&O&#-7>48!B?WhhiqyN3kFp4=P z*W-pwi=O~zCwGO}j69WadIqiv`f|@g$X|a_fkcR8?+DtzCxHA-v>xH4OmmQhQCR0x z)amjS4JSHtt2E4*yTSjihm_Y3?@iprrs!#%DNVF0JgWb_bp#mK|6cc$jAg0f-qm_s@!3!%jE7J|5LBN**N z*cNe-Zd9qICu?VeAZw4Gv^h-J@Xi8G_&RagVaQ@ur^shPIJKu**^qhcQ2~D>PnEQ7 zsND`8nzN1K$aL}MKoEmgrGM+D28r{*CW2vVx_7Xeww#?V@3BV6(c~l+n`)TSw6pP_ zPcgduzs){;tSs$!CRpA@pDEZ5}{s82C^OC zCdSDNJ)bckzc8x`0D6{i;3mJ6rzqC zx4IY|#`nyT3){G2L3l_Y@2x(YhNJ(aKMHKeWh$Q+WmT#$;7f z$%h(}!dK+!hgg(YVdzc`Cb;La@~hNYZ<9+_barMbywtTR!M1ZSuZ`BMSJM4L8hYxf?wm5| zQ6mV1iyrE7T9tvqg@hBS{mH9P^(%%YH#yMQmnWLG^Yz~j^Pe>FLVMk>Z8#zbiIb9x z5i^?NL~DV|O0#O6%B{+si~1VNj+_)$NT}e{YTuInHYRP-1$44c^bqLtn|01m+J#}7 zZU6cc2?si6ClJyeBcvD&ccuMaA%);fm*ggG{wHHiN81th8cQTI`T9rRfstkdi{eEq zx2oCjh){OHQ(`t0r}XnAbfFGrm`G{E3IPfFXf*7Vs?4?Fb?(03Z^95kWg*p6H_5IM z`r@7z+4!xb;<9z5o5d)DO3VThRP@oGRFK~RN%zg{CE_b#F?Vyty<-ThK6LDN`x6q! z3TO**+3iu0HU;&ur=`H8baIVuIh1Sqf;JkqN*ce;DOeaeIT|(`CGYI1NJ55D?#AeF zS9Iv%lC<{mNo4|+%V6r4?Wkn{sgAHb?{P&%Jb6_xH^f-@Jf%-f#+t!cPN{i`5Of!t zfejG2kr4pEMg9C9Csf(L_)ts_fW0*6znRoYES|?q>iPsv#3;$506U_fl?)iN+7p!D z5639q@wN6$v-AXKr_Y#b6EdOROyY&7Q$fp=*ZXoI&tr8XQ~EuuI4iCQtV#6jPCp6m zu=)D^vM8aAh-lH0N27U1TgoCX^jk}&RqV4U|6^Jzi;u$%Ll3K1jwo5g#Wz!gL&dk; z?)7&DaTM3Ib5P{I`s4xz@NVBJ6!ozRi>RD2>IbeMidU}Szf*MaD_^I@F?J#4Lg2>z zCEkMktR?;-(2hl)#YyVXhQY;yAuK~d|rvtOH z?GntPa{nC;MR6-kh;H^{&eqdlIhFMiJi@WMPSWRAO*Lv$yLQkMKvRtKK<;Asus)@0 zJb|OSTP_^MQZQ()w2REha$@%&M%g3+JSu~B{47LwK~NK|v7*u_;L663GzE7jAEK^rUik z)t@)}JU%Ufnri7!dU}g|MjWHyh)0qi3&{k$8LDmQ!n>I-!7766P{Dk$+;5D1p{tentP4@@1zIcPcuQpZ3eB2BFW}$IduawUR*EXKPBq3=@0nC$@Rug;16um38 ztnvy(^Tnk`#Yp63DU|7e7?oTqjsy%6c&{7TA(%-#wZ$ zE<6&&a~SWmt}*hEjq0ybVfWZVMqsk%VPa>(d!Go$6_>w?sRqh?5epk6Nebe(%>kW2O{;5^3_V%Y7%T9ULn?L zBlP*^z(32Y$}AdoTUN#>5=__@Yp@ceqy3hurB`sEZfnpdPp8D6TP~`!h!;FpzYa?Y zEBa`WG6xZ`CL!TjUzUrbu?H1F6%`IDC?7pI*sFO1BJWoB1Nc!r1fJXSO6Q9z)O0~M zfkFwwR>qjcP&$q&c0<4~!7o(DL5-K?kmJMo@A3q_h1x69-yTetPm~d`?@Q0zi zvR+&RhnJ4z=_Z^ro`gg*mfK?3x8X+>8rRA*0sZ~dWXd(4_cSZYRLjN!g$Ra7A=k}LbLbHfpPnuq zy;08cnL#?aYx12`)%Bh-y(j%oW!{@BStpTMtb|XM^<}kQ0Jj2+RMJ*B&ZMOnQnC8n z2ek!c>>%3?)l31569pT{8e03m$@RTfCPdD5EvxgT%j%guR7LQI9R7?irL7#Jr&iguwj@%9rnKvS_x+(wy=@%vIG%c>=a9E zJC?$r@HDA?_HClYa}N!ub}ecW<|Z}?TEI8_YnN@S58IU_ocdz)VkH18^8##t^5yx5;5AOu&w%ESYJWt19|(jBdKm`y ze?c?F5iO3fUC4lDEBFd;~EF5FQ%AmF6t z>$Md=a_8X*Jv=x-6YmIUi>;IkNGk_x9vgj~+U@BM&IR(56qF%l;cqTC@UojzPG-Aw z<5}M}(bRrBd}p^()Rx6*L-C~4P`rA0dWOI-e%TYxrCN}@&5zKBtQ<*#r$k}GRi1W6l@2N zbacWZjNhwtlv_*v+e)GxJfzp}JGD!)_+S&E&Y-?PU7aD{Q>dmwl-V?e-5POWoU0pR z&@D(8fv2J_!`A3XPHU?HYh$=oZw}&GF$6eo`4jki{E%gD9<{JGy3L*+7SUxqroqr( z8e1aSA2+XSW6~01(g*-8j5p-KskX6~zMa`RBMMnSt8Ar3oG$zfnOD`WsU_(c_e;#` zUyT76^JfPg9m7oinWPFR(1v}%cqhkmXOo|`cOG^G7IJ$SB_{csIIa^vloFqItNV4GMEy_6oHjji_*vOVSx!Ck$%0y-C`Aevz(A&V$cwyhhR>UWImqk-UQ?XrwLSH1BueL2hRXa$*&)>F%wSbq)9Ffj<=KJq5qmw2f zj&-{Twy4a8;m}UFbmV8Y+?%H?llXY z8)yrdpQokDlFiMNz0YUY{Y@oy#VHK1@$oUi{_$9qlrAvbC3EsJjLI7B z!w#x{s@V<(TjDC;Jtp&5&!;4NZt|oiD@X37s%(MMM@BEZ_;K-hX}z(n6*4}9=q|qo zOeT-$6LIy;wTReG)Q5=Gn%mEI@u1v<0DtC)_6(-y<0M&cqr-RJGfDs_Mx~FC`H2CTGkZac>Oar;pm&I$IB6sJzrR>JItd<0i&}#!Vefi7oZRt)@2Wbnn+FiQE6^M4EsA0d2CA8lx%V~rrEJE z7@o&4^`%}L-{?Me;gIQU3&t9l$kahxKR|xu&zD7p@lh%WfJ=BNR8 zGL~stG6dmZah?0w3uRWAeFldOi{>>(*T}^}W}(UAjzm8ZEG_IH7#j7o@V^2I^p7ZY z2;ej7W`afCGHn1qaRGDYRF6!UWu|9W5WPqA6vk4r8*k9*50!Lz4Ohv%6NBuMciD`T 
z;LUFvo&Hpfr5E5U6taR3!|yehG_0O(Jd)6@8|9MFIKPKRx09uD@v)tjUqfzGWBB^xJcso}cd57yG{t$e>t2CFI_Iu1N8|&3} zJT-IoxM|A4c9gy~EgCSws+K5}9?^b(eXn1OvTE{9G}3)%GCs@kpX?xVSz;s(e95&AOUoGSQtCOL12gK`L!D?mVkIC3hhi-Yw5@j) z7G%Lw*Z5d)OL$cIz_zWuZt^lTFN{$m-VSB0 zlIaE`-H@Xa`JUU-HZnU8VV=3Km57M|KCyI8X7z#+Lbp(!Rr^}b^iyH^1*ysSX7uRo z97Q`_Cec;>kgT=Go#lf@uMl;g;!2XiO|Y#;msx6sNokRpWHqgs`E@HN)G5a$vuR^A zVCPQPg!ir$`O>se{c?4amF=!*srH3JYj0-V!3h*K2l!dI^gL!nW9!I!GRDGa4n2kX zG4X5bB!RO*X?})Tw==W3%Mx~mcXX!$ox;Hwv)8YUkA;$L*B>h$~ zS!KLdh9ReQcR~A1T^YQ_u)Q=pT~Em^2@Qi~5osSon$6I(Uv9;9KTrzo+A~dtJQKw_ zj1?{jFsz&c#Fo>I{_vTgQc?hS@G^og;%8X>m|rPRRa(P`>(DkL^78eB#R7aFH1(?Z zf94_llZ%RY=BDAmw`bzw6P*^G-6dZf z#1D6guee2Lwr>P>?G<9UkNvq|?J+=)spZR2w1L)~Z0`o|u7f#?pyI$^VcHEnalaz< z{{tXagBA_@D#ENs4#_dD5yzWW#u{JKGTlzpoS^~5yWnS<&QRSs-eKAr%;wlCM>UB5 z!|$l^!pTQBVYryN(x$#cxz;9%d8PTJk`1VM5i$~6Y$o$pSk-h#4gVc3Nq(1sntY8I zJ!se`PbkJBF*uwN!{EwKk<7QgCSc+Si`xq8TRt}D?r`*kID>0`!vk zh6tq(Mv|Dske7Q2iT8OnD!&|hc<#1&7H5x8@zMsK>yrk?>N^O@$zU*Mj8-)DkmrJU zd?#I=A_OBWM~$92#>otF#cU0c*Z3pyUkHuKB4l$03Z;?zNkzGYplq0G+#@bTW7&i^ z?fms(*z#mzoWf5{e(5pBk)^g!wl!I2?Jx_tWzSCi{3Lo;se1gRdSCc?`Tp;N<~$it z8t`KO5KtNr{4@aA6w3D6QMgwyq1ws|IJupVeJL&5`FLMQ*{TW{&ysmxh$}63o3f~> zew&9s$&F537$6yWuf0=9roy{d$dpw1ygN7!{IL3WocEsS!7i1{1ohhUKSdPM+OQ>{4V(@Kd}t?D7RgVeJvFz9A&)3#i|O>7C0GxU!dcUSSt*^SD04f zy2$muA|(7Td5rCF`gDbv*=bSgdnx^!MewM==S&@8$Mcn;xO^;q{L8+Ud|$xeDPoL2 zz`a+X;Te8U>;PegU~&G_4fiO(g8?ob3Y!p5LA%J2S=XLLB!x@z6`g4k(%p9danAxX z%7*emjY3Id>>K!=P%D3p+MN5(UQg_OlF@=8^NzyZV61fHZJK%&=EC`Tql)OlrMc;c z0qz1G5MI7eVS>`nUodl5aMka4LiUvEMS|BSsF!J%_Vdg#E6E6A6=9}2c;=yw>a&5E zxsBKyd!6i^5kwJ`EN>y1VRn{){3s%0r%FOk=evabC4wZaYYSmy8>j1NcJUesF+u|B z{k$1^1=k?X7K%~l1Q`(O6yi_MRXDo+Xgk{dXfu94&|uqPP%I`A2+6dRe>9qAS_jzy z`zF4uxfe_{i`fO2`*4^yf8-KLjYIpS?m#s6aZb$*J-Vazxe=bJ(4*k>KMImasXfpA zvc_%hWp5twq`e@?A7LbEl&QB&yz)```Bx5q1XO{sVx%=<1-c>tH}a1}qGEI|oU^KKy!oXhFMk?}MGgUfOm5aEaDQ zbhH|5J17v40rRIeq7jM6R>?Nkkw4sluyfEn&+BrMko6+t-ZI_qR)wo1kK&(>4h+x7 zj>-{!17i;xK|A`M4Y4_G@`;;pk)YM$7}MZsIyuVu!+(efceeAKJ{uxw&PNmYo9rY= zHed+J&4CLDvSp=hqxDQR3uhMKt?XjFvZq9?t77oyPCl%n@?wUa!Fm;@3LjKeZX3V& za_ySzS=c3OfVEXwOd_ymO#Yy?OluEuGN_HS`n9`l5Ao0wL5qSSY zZGVQC3B#h%rqMX=Xn~8S1bq}=Y-K2B|1o!;jnqgIW_A{BtLQk?H^UyeG$iRUQfBM=qXWX8AT^l$2z z!aYxr&V9f=hd(T~+h73ilV?tvy_JD{NiH0ct6Q8Ik4M+zervi``oZVbGV?ktyTzA$ zW<>jt*i-6U8N-HYd_)6aVG2S066RX}%jueCJ1XekKX8Lv{@8fR$)4;G4D*$V{$#jD z`lfm`5T=;c4|iT!40DKvMXl23kn-POF8CyNTB7kSVtiXFZ;ia5g=~&P@KsJWL+})p z#yz=Y7wnf_Z(F^M1H$I(@0k{|l#{(_5L7!eqXRy|oaI#prq*mfOinLR43v^B3z~2;OBZ~*5^2B zm{7#+D>B|yK*d!+8^`as2k*<{uOxJr>+hh)ONF-?f}56qj(?gL%BhT`3PHTetNp`g zx`375WE1~YDcm5>XQE%THLa`NlSfHgy;L`(mV51H*ti^`dkvGgu|WcD*%VZS>ZuRx zQ$+(UL|Z|1?r}9a8~EC2Git$x&<6G6OiixL3Nr#UzwvFlyx~ehBV4b2=*lCc8rmXu zc$1sb&hA2E;7SR%YK==sT`Yggjjf(#juRiOC${pAtXkezU@)#_q9)qOq}ZiAG(faW zjMYV67Yz9c5`OuLu2R#tp{7oYYB~l`Ga6U1C#F#H7O@)QA^hocJK#4>V<0(I`_i`8 zCKU!^&vFH!=)ff-m?NC^5HN;JsxFQ4MKmN%o6~Ugs-<_KR=At60qhNC6b8yBv9%j$U)7_T{$36~90j zE6-2xHsuM)jFwaC4@8MIab0%!b>M$Vys(-F810(dVy^M04_pZeRZty-7LmL78pKE> z_~5N(`Aqc73Wz}AMqbNVWAQ@=HfdOT5Ku}A=@Xm%md4gW7v42{1hN+X zENifS5Qn1AyGpMsOJvHJ8Z2(;kn!KC>)U9sPWi{;n!WN{s9(-NQ?`lMdZu!}Z$1?b zmAG#Ln)m|WT4XyYR1shAU3#EAMG`+rk~Rf;qZhn2J2=?%Ml`RuT{!<1)LU$Xk1-L> z23ML#^P&$C7H+a_ZZt$qH)=J+Q_BeJky^j3IhjdLhbL3EjG1oK=R8=FR*;0a20d=U z(@JOUQe^$7)8wZD9JBuT5~f>jU^q_>$K-_d0O4&)jgk9cc+{`#)-jassOp6!Wvg_& zv5|Zl?F~6mOO9XC;p}MIR4JC|uSSeCbSL_v;9$PC_iRmoy-RPV5YA`Xm=i9g$U40t5`}5L|vZ>`+ZEK>Z7R#fg zI$te+JBFq7>ld`IMx!}+?7k$R2bXs*Ysog>${42ojEk-y7~|2&a{}DK$^725siT?1 z-g@by;8L+$x1~}`frnumQb)nnnP1up0?8JHaCk?zzi~{=bPs<#58j%MEA^V5)G^b1 
zoLESnZAqyQMLsR-&Fv4}+&Ahe<~bTgZW;EmXZrBS=I`s9M+ba6G2y76L|nPZFm!k+ zxxK~8TgVBCpt`u;^j~ujl4l=lHT^b*6FUrB_wXpUw45o{mf|;c-(<$oh5>J#E48AGf)ZA{Fr1oJhgu)tHvrsmd5RxHF=tyxRnD-2ixUqh z>OXI|^!nY#f4QSRD=1TKNVHnNQ?*Mu@aNC19~$eV zh#9b}*ky>>Lz!H8Ks>$w!?p8c0?m{S8<5_R+qJS`jSAq=AS0y@aNgmV$p-&Pb5fZY zr-2P=z1`r<9pS^OA1d(o#1_2T2mYxx5q@=q; zx;utOLb_{6>5d_0=Dd8qpXYghe|@jt`_Ig|u32mCd*5rFv*+x!_uBl2mVJi|a1+9w z$~9J|ONK!lnm76ZtKX3p)NW%_Q055wzu@d3%W}6|rwvO_Emm@;tfr#IIVfrgQ-H@T z7E)ijg8TigQjetY=xDLyNtux71op`2Uk$e|6qV_adx?hnW6LB??Jdb7``Nq&*6#UC z!<>1C=ywKpMXzHEmUqT0>|=KPA%@S|nP!f~JDER+B^m{AD%O;?znr3>$NJp?XS=`@;Cnz@%>Ez^F)0x-me;cUsi^Ywtc2@t#WTwQD_zw1)uU57;*GUZrr2I}^$$ zs1Rgx%st;R-H^={Uhx!%oq-s~(MMD$E2b+eMsR?EiE_pr>&-(?Ih`k?L1swiY(vIj z_fbiEf#`S(hT=`#gzR)|TsKR-wsBsGm+@UXx+~h7fu)`1&vUqI+EJOBUE)9Lr3f5o zu(B&DxCSY`bG!f88a=6P(P&zG#nvu&c(lu)tUTOX^nK;zZGlGX!v#D|{+s)vj?B|G zVutD@_0S(0)w>1XB$+tv)Kw?O_IxX?=QVFKvh&ncUCrt(!QL7^f+q8@^h%qu4}n$d zPe^YTJt>I4k9sx7CU}azmrfd=BkrUJKO)JSbKsTk1Xt!ZIW6!7LlymBqkB>oad^e zl$Yd7_87C*5GR_e%k|!~nsmABq>sFa{Jw1uL+TXI*e!jZ9$w>;a_FV&&ZD&k2~f(sySH@oSjLt+kxYN z2C`p^-R%JPm0fFqdlkFswTMk`X(sRGUe)Z5=O$A?&VPXA#R%3(d5JepaiXLnh?NVj zm4CD6UdzaiKwGF(t4V>82rT6GVHy-hcHG$Ax4{tQejb&04SD4S(4rwu9B8yb?Dbt; zAYv3U3g2!^v*e?5&;LB>xb{hBYPprE5<`vU@I3Co(oLhsN~nlnb}@jvNgd*#1__md z>?%Nri*8V_e{RoSzP}N94l{?}ZM!r7*mkeZje#2|K$wLFkFBYP9WtWfEea5aUpH== zX9jgBtQ+n^K*UCYLtJq{)C_G2 zYf(qJ7u-^xWfpFN6$ilD1Hg|FDYT1y9Pn}w_A?rq69eUEgQmf|QB;r{>Ap|63!oo| z?MZbrWgZq@GOc}*)$2{Nen@u`0GsY7c(VX0EBg$ZX=4AI7$l^>C#h2{{pJ>ju|ymz zq84Krt}<+r@Wja@-@TCOq@8N*?F!q&n7WrPQm5X8W*Ehl;y+ zZ6{*K$XyzT*kD2?!9cI=I1?D8&DGN1p@1>H3+*yEF+wO|v)lLLKt^#vCAC2lbC>%5 zWE6+6U220PbXa7gN&L4|&tVo4TbC`}5B#bhKdXlSJ!RM+c9KL5)40Zd(h*vm`tvPe zG*bC3$ZW%!KceIi)CkD}vs~6%w^Sbc2CdV677i`;co;c}5Zk6=iXLs)*!Tmj5;v{~ zGP$ERkKPV#l$Uz;Aa5rDR>e#OzqkazSA_SKkA`217 zR}z7eB;Lfg>-N1PXc*QF5YVoh>2!?M`bbA2I!#qwhI|$0y;3gV*c(t|dbAZS@R2_K z=Ycv6vXYfz)42R&vd${upOOG}*BZS#M0gP&PGy;H!8tX4QZ)8z1j!XvrPm~$)SJ5) zZxNqAb_d5vb%G9ave-0{Mj2zsbz%xpvLEo@h#H__c^D0LHopTxj?fPw1T5v_LhafZ z4V#$ZLlA~Xgyo8PWzmjRM^TyFI0m31+{%YIFDV=3m&^7!pK7}E`wNkmzAj1O%zF8% zURQ@S^ux!`7ujj{TNh9ErG66i2Qy1!kEmhLtj&r>Ku2c| zGk&&plgvjZ+x~||94t;b-`A5NSicP)5=OiG)FY2vmvlW(=w@zO%!xMC4xXJGEJ)_Q zbnQP_lWpK%C$&vENFL6#@G$&KKwO$+)=ubc+?CmV%(vChb1dx{rXeTutp=s8ER0?e!=d)z3*NRGA`@@C%#4uv%50y;x#e9r20iUiqUoZUh zT`S-%W=@bkQ*G>!@k@UuCY~9j;uo0liZ;3DqWCaQUY`9&3$03GnKuumaWU~aKfmbX zL>0DnT-p!ow13w-1i0B#mALE4ZbGqtQ$z*sez*O&ecxm}* z&rpWere3;*pzm9|T9U{U70tP)-!}#XhcH4M>N|8sGeS(m`m ze`6sF*IRZ%Yi2wqJwx^%yng-9tx+5rou!UuGKasi z(0QL|7Vh6i=k0C2#DwlM3|n={hY7oV!)u&gztVn8G3z8Xxz1QId|%c&0r$?>>pPQK zU88JJiihCa0)G6sU8P2wXwZe7;0uRWPw;wE5_{YHEaFJrmog0OAATE8puK$f(D3SK zqO4||fZ68>R{B@rmx)geets?DzSq;LMw38wKnghWFoon-zM2!yGusXwgymPn%A zcBZ~GjoqnIETrp`Zk;N#u$BNl;3Ff{&u?xuv`kXOABO_}9a6G77o`=u?tGb^&(3JA zi4#O&OEcsxjYp)dIhPtmC3q=R7E$m`zcBTXv9a)@PyOdA%}=Qp&d(cG3f+z$;&@^{ zLF&y>?uvruW85nrsYFU`=i?9j^;cPB^o!D#YVh-Yx-ujB(uu$3=13sXN#6B_XsDSC zj-6-)@+zXI{mm4FoV`R^YC~?;9-x;~zk%<E{9DPyz71?=g#(P5!?}QaBijmlp21YSN3(`Nj!ILtOh*yzhUJSa<&_|j5782~ zuS$C1QCF#zGu7WNorqdZ@Hk4ru;NtSKJppI53ZDNC}B;5p;H=Ja<#jI`Z|}}uA0Pz=)2@qBO-f~|JcG^beTGLjC^YJaJPrt?h@il98aTZ zFH*Pe*Y=jDfP%>xHyKJ12_Wtl3MF-G{flEb-WciBtgya=cJb({0o5*W+gfG@cl>Lb zlt)(*u!%F5CL-@P4>r2B{$&RH7+BXgbiz9ET8-mRy)(C~&>zH|$lfZmP%2Wy?bxWw zb&Gf{dW*T1E6T3x)rT;X#i-N=drQWbTl5n;3f?dkxyAfZUcf?1~ z+s{E-DV=8>-SJCHP3hq+4$W&fzb0^nf$t70;Q`9umF{1c{A#!u_-USC^w$>mKhTde ze*g|Pf6!k(7(J*PIhJI^^hn>^*{@ew$j+i~;lu-+T1`kjVo^)Ws5_TbPLiz0M6YoC z4%#{@(?yjP`E`^WO)Kh@$#`9|*t}}2DV?@Yx~=~lXVm4xoIq`gBbELVqNKkEZ@%kw zRsR?G1t{xMTe(W( 
zW5XbAlZIJxqru2}Pz~$W=dxc_J+)o0;(L7=OWGSfYV6@$pCCf-^Md(-dH-OOD+}&8 z-lf4Su_MROqsDHzp>VD__K#~b#KcdFZyicb1*nnpGTsnf}Qt4}}L-mvnv${IvEgEyU897_7P+Iwl3p-&^ zyokBy4JuuVzHN4*E4^pp-zJSy!-V#>Qyt1szxa1P%tc}Iq1_MqmHN4>m~bPZlaO~{ znqi1-1o3CE3VTd7D#3&N?sUHcXhsPy!N(x194j4=4xiP`Iwm&a$5l)bnJ~&Lmb^a^ zhy6e8?~lxq(X3^fWYIHFmBX_JA9+ezg0EH-wQsemf1(baMd{M=&PCxZL0T6s@B2U~FI3N@$Vgbr#PTE;^~J=$?y?4fUB?7#7@LopO;L*% zlzUUnWKzF+wULUGUe&O}~!X=nLx zbba+pJ%4V;vIh?jNDzDB%65*n@6ViqUE{U@{3ldN!j5tlBs>%ZLqO1n$+<&_{IWAE^f@^(Ja{K47AW+s=7QN~PKE-pNev zBpauBU~{T@z+JPNhuwN&_Lr9I4WT!-6h>@SFgAJq4DFmSw2CW<_S)i=Tgo~vGY733sn~UNt2#3&G!#1Cbrg$l zzJhVBGKgdE76;lZWjC`?vW%EKtfle!55 zYrsmAYUS_hZl;uw=u6x8h1i#f^nAB{1-+{;vR}7 zNH@gA}j9__UpKNzGHsi7T6 zxBN_B8AlxX>A}Xurelhhp2F9_3Sj87phd(-%_$EO{vy{XEJI?g1Ai-IyR$l|Tfj*nV?|)l4nV>(M)onYpKE z`C!t9F0d5|)Er>V^UZ=*qj+jrkC#VMd}a&o%Qjv`xSQ4UT&4;x2c{^DzA0u_bx>MrXg&d`A^dX7K*le19# zX5t>@qI+S6;}2Hshjhd=I2xKzh%wDj*YOT8m=ZY=nfesyAgNvYI6VzKR-C;P1j;rg zrV2A1KC{^QhH)6_MNLf}+&<=g6*toHJquv}N$}{?`!>1FwAz3&W|BjKfY%n9@I9K#2d7=g5si%8J6JM@f;pi-9=3#!Fe$aAj$B*& zx_8|8SGi+@pp|>epSZe?1cjlMd210POiR%ej%@1p3Td;^g11)hjExx}ZVIdH9}y-vrAhC9RVtg0>U4m7%yLidUF=+|DL=hl-D$d)9w(#>4J1#1yrU5k@jzjy0UF`n;k4a81Zs-#EZKH{L*? zq(C0?ej8ObapYhx@%8Gjr2ter>0VQvhe_|gvXd{*Bu(pMOJhHK@b`v4T$w?WRkmlb zm3oiE9c4yhzaK~SB2JCtJW_?6$Tkuu9V2`L5OoEW+e*c1?f%*~sfdL5TLO^(J(h*%HoF)pnvH8IkL{ z8I96TD>;@-tHaqwdIq>`8F7xhr?))EYAopld;}xirI~uXyT~6OYyJs8HBO20==W9f zfLU>@Gb}Kk3QnGUc-AUC#;hD?!rsbGSh4@QOYF6N&(P=&M$gw>aqI8N-)3i~IEl(# zwJQB3S{Jn@u_3?W!@$WW4tg@z4wEMpY)HjPQs6|J6MZ#J3fwt7L+eI@*%W#KTDaz& z8)|iHnbNWZ#_r*>tN~w8LM0AbF3bo)N3ct#+zbJc;!fk#6nL@P;$LM4d!VBE7ImK z8JfXO8@UU@f%X|ag8VV2@69rvYT%`v|Iykg={^32oO7gJDeIO9HDO;5w+4w4nDuuA>~ij(OZN)=_0|7mb2J;UMM#KqHws=u};@vnavpb9n#gnR-vP4+2+$epo&*NKby zVPv`UEH%wSrQJST7jF!A{HjQGXgMH_b26FF*)jk5vBzJ^$4Q6G-}r5QJ*YS>t1_*l zxEt7G>4r%pNmNn!C0TB~^UyN?6KK^L^sbaU4;L+&8-!)?XG>Qs6-mp}k1tjm$_Nq^9&NJPf_ACeHG-_zb@j)myo0q&fpQYRvG1yR~n+%`{W zAKvN_ufK!_JK8I0IoY$Np<+ob!-LMfp0E3miN*AE^W#`1YxjzDsR$dnTW=`#X6SGP z@DAgdjp{60*p40soZq&z0<(_J-tFw6w}5*sI@1v%k$4M@toae-@*@HPxFbWa5<*jN za=*6Ix3(FD;FS|k;BUelk!?&Qm)WIkmw(5&ln?67-g@XT>0-8Jl9qoN!;{K{o7ed_ z6gu5M*u52Tp#2_IpaBF6dds2sx-c>E5qiV82kiP}Ez`Lr43OengMASA;+6?MzMHj`#4I3Lboo#iMZwQo3I0;d%Z#WkYe|r0k3AwL}Tow+gzoJEs`Z zcKo_Wr+=JS+FcyT+6#{A?&3ck3^7cO! 
zHkZ)AJ(HZ<%$|L)VF@&ncU=$z(5_NdbSh`Dj^}FhRhV6)Je*u93h`c$G6s}o?ml%Z zzYEoVvjj7Fzr=ups1FUeb7Z4*AC<*%hYdoz;4zw{fR?2rl&j*sBeD(`I1;+PhJ}Ve z|F(s|t0d`w`}rPzE9B^@tj+cIJ=4@F{{8*rI9Jr2e_6h+SS@8*D8Fr$r+45YkvBy1^l%l;wALbOat;iVm?(BNd=^IlQUXBK#t=@2W!9+t<66HN_)LNKmL$h5 z$x1zH$B;odb1cxKf_wFnsbrmd{SqF&XVUFWyq5kteJ}IhfgofSnh*AK{|xF$T}8Th z#Nr-HMmA_aSOF;)p-rauU~zlks6D{b7@(}Xb@;OnK2WV~0J)JKuN0v@Ccm%7r-dLe z%PV-_P@VRXt;AM*IgR=e((roCNHnq^%=ycMTx3P?p+}uZWO>2CdKIA z_orcfF&?r~TBnRQGV35xg~CTD%$vSRzixYO$Q;4Dr?huFfm$+Vb~+9w&V9Ltf0_*M zo@5&$RCTs)LpBFPrJKmMi=6f3$`1_V_77-cN~+ms^FGDcc*(Fy(7Ztx{62rEKTydi zf<9G?DO{qB#Rd4QJ`PFfu?P@6iHP@Dq}@=^+Dg9_A51;vbO2sIKdzWJ6L+2$eM=_Y*jlGt+*%VWx1z)O>#P{9W5B&)JV!*QHUGHoZ7<#e4(jdVWPZ@; z72yU6AXY(wW`AuV)#^)UpCg}k8H49@BJ$Wcgj;z0ivHm-aW2-Q_7=8>U=lKM3E8aF z$k!Jzm3XY{`{8~2kbb+iMFwC`aYPI1^fnz4_vmFh@_Z#75t!9Z`y(>a6a4_Nxt~ak z+>=-u14L;<9F}x@gsd3X#1_)MB}7nUZD zc2kZwS9~^5m0t8av!>zssFnCN_g?R~P^_hzlcQ&3G+5nlXtK)hc@(|$2HeLEX&|)$ zmw$t1_|p&5(e~sK@92lkD*Eptd;{8ACgq?KVx+&o0|ESZY!c`QA{u~fM2&E}lm88x zEe$jG`Wg`y#_zvzfGdo;q}z@Ksk=wM>Y_INb}Vp47dgDOk@>*q$8o{w=#Pc_F=xE- z)&D&F8Z)Dy@nm0zhZ$@Hzb53Sn*$ETj7nFnWHx z-~G;0=#8Z3d24a2YKD|2zGFzroM7svFEWPn7=g=rJgS4PZPn4wZG}g?IjQK;tFerN z|Aw7KNHtZTc_PW%Zi{ne!_e`(Pqy@5E5_UO$;T-|7`Fz!5;jxx3kKa@XKXf)6wkn6 z6LnAuR=i90N)%u}hQ>W6<~_SI;mtykkEs#vbedTMk83}&Isp0lciWQQ#_f7{Qp2$) zj}v|+n9F!`B_IRMI}hzthz8Yv3{_Q>e{txakriP)`?3#~i+nA-uJDrXt+TFx$eDgL z{fZ*A2)HKMF324TZ`A|Fcd*W04!rkdO#QO~dwBM;S6Mu?+hlAV#-WyhsG=t~ZxVk& zt?en%lDK)(e84;4LPCA{9`y?O`RoG8-6$@WDdTB;d3I{Tpq>Fze}j&#JJipH9Hw-?g#$z24(~(`xV$;?IyF&?WiJby!e5gzSEdyKP(|~iNk{v2GT-bnQ zB)%JGyZ)SWM3COn%c_WEbfTc5Z*5**orze@PcVCzi_&ga9hkU`s@M#Fw}cWt^8Lwq zBwz^a|3up~+K?T+9{q_1cVC)x->(wf2YO)zfX6chtqru$G>|Ja29Q1M=^ju*vGJ36~_WAB|R> zy#U-Npd7nUs#yU12IbIo3AAR58^~uID*fU4)j%REda~d=x62Wn{n7Nk2&>3ymifwM zugJ;Ep`|}RX*EKn&qVTaD_`HJF`?!Rc8_@orJHh^3{_`)&_`+HcVdI_ZeCn+edl%R zeFC1v;Vx0x;oD=gDy~Fqw3g}G76fa2A0(pewz2kM`s*WmzYYqgtiuj+O&;`(^B+{2 zRB*3lu%BK#1n4utPp)}Q%W_OnD4{TbY{pI+XMc^1(q>ec5dcw7G5%>|dRs3@4}DM1 zcLL3Pn`Zgh84!}HmHojo0JzzpdDnbgC~jc%mN7vtGA|KO0Ow@XNn)OSdKZ_)r`+G~ zn#bbj^4fJqIrBO{Y5SiheI%e02AF#*r12%4`~l~#mkgfDJYL^_e|<~vrmNvj@@acN zE*!yseH$ev+ca^sx7r75Nm zVUi`tFm0H#)`qiTyL2tnxlIneZ()(MxSVrqSo=pa)GF50l<|8_svqr!qz0b)X0|`9Y1XRb7K=&pP#R@jxS( z?vx@4n;P$p0rc6!A6$j7Z(5O50l3&uG~Da1XAG^2DB#ZGh5 zFD1_PSWogMtz6mQMnW@X!Z{+_Vv$UQP~|T0=f^i-RVP5UJJVeV z0aaOU_1)>?XAGK2lD<-LKt?&ca_S&0Z5n zsrJk#J05w&I1W~a)tF#(;hsun!$iaUZ+|`X4L=d!>(F_3{|}FIt%c~BN;v+~1pB)cy7{YZzX4g>52h^TzKu3(XBk9tI{mLWGPBfF>rV zAo7u9Rg#h3e(f|;ddn%?>~>SA*~>ZVg=1Zr72?>0K*zPfBH{xMi$_Y_{nCL;%H7HZ zuZdN;0W3{A&MK(o6wd2p=OHe=rQp}Nsk<&iT)Ioi4bs(=LmU-LF*-~RHgMaOIB=bh z3#Nzn>rDY`>zf-AxoQ3yXrYiftT3TMTGUYdWAb?QGw~a|%)AbPOa9`3?pP&{7=&PM zd2rd_naslIBtD^Gj!4Z5Dgu~sP&~dj&!~k2@bL=e@VWsvdhvqux|iu>z)$Tr(1~~Q zNlb#)hwfy_J@qW!=N4}!io?lVXpPHd&ZTMoc*&VAQnd?mnALq81nA}un`haDx_2!3Ck%dWt3AuWa zO$FXU#Kl+JeFMsH?NqcXk_o9D($3toh7tdc^u2$J_IQZf(&E^dm2r*z~RY zvz){v)K8`!1YJksa?S9!Enzg#Tf^}aG2+Ob9D2JrbAy=Ebm18c;OcnM5hjKUs2KQLz_K6FSIFGh=5R}G-_WF^^(vbHvguObZ z&9|;J5j?m}5a&VJW4O%-q-92(X!=F)2bz9)TkdD}m?EXZc)e8iPD})t(|j!j<1H;# z{v-bBr^{U7nvkosb>&*F%{j$h1kHpwv(c2EZ)7i3l;LDQAu zs$1xUMsv9!?i{BZyf}xD!M5ne3gY)i(q>I%7_0-Fh?*Wd7S_ZJlEz%WyRY#r+)i-W zKOev8rX(_i-*uP<-l?pk!!9o4et}KgES8vxmioVJjJgO%D;VCNWmrDoh2vQ~h!Sy1 z{D_^dtM`sCL7vxLr-^M{LZJ~vL@H#1S_* z0>3+7wdEjAcXp>m@2Bn#<14)88{{|0hk-4)PRAS;;=K2#>tsD{C|=UB+8%Am+c6Ns zU5a=x;1hr6y>35`*<>y=?Mtb2#m|W_TQ`pJTOt$-gWznUq6a}?jhaO~a8kF*XRGA} z&V{7zi~>1jW<~|lE5SgK<1FjZMEP%q<<73Eg|nq_>;1S?kJBu)>kAOGf54) zg`>~J;#(d$5ohrgJzIy+>7PD}zvEaSjf>>R4#k!M4PY$Ib0WNZKmix5nzn?O^ZU+X z@!l_;sMQxe^7LtyDSs>W 
zFev=})G#+>9EdIq90Y|l9y>?Fm^lw}8ED)?2A5+REjVLpa-v_1$s!wd*2ncYY3 z)M~}eobEd7=%Bd0#g!ZYOWV^rs>@f%Nld=2jp6Z8`|$IDeeEOs+PB;8U8trn#f;`- znOMKo&j9JdG_e)HZt5)BhZ?(VaOaJ9e%yLx1@Y6f{zQElh5Ux(s zA_?MSWJ|>_j@8$PD4Df-l7{%R5zlS9GG zE7iEO%q;D6?IZDZbVf7*>E$Nq6$zo%k#SEO3t^Lcn48u21rbC|7lZ5)JUX;ZvvjB- z%gA26sbVZ4tvwEL^~Uc`b1YUz`{=L-#q0@d>k$OMhdZV$gw8>DSd>;zNts+M^JCx75>9^b;`1edv8)-4@U z_;xp;isz2MuELD}TKi=z+0K)8?I!7vJH9nW?)vrI7vbL7gD4ie?)KRYUDDzR2{2(gF zy!k8?2y&3`?P+r zEm3gWko^u-ulpVP%JWm=AnKu$Nj{oS>1C(duNxvx#JhD!U?Z5!pnJ4*W^*Cj;K;Nf z{_cK7YO(1W`|^#0sR{0`6f-I|TH;yWDBkX1H)@hFz*UhMRT+KsJkJ|%_a!q(UQU&I z;zr6VJKC5%Zx&~_Ovsg}2)ucu^ze)f$};aKXoH^fV7f zJ#iOsLCVaHzpE(Kydd}1n0fhw9GESyA9q(o>eh}K)gEokm50FFt&wUzt9fM1oc=U# zGN4vk4na1dioZ)L<(2l2F*4@14|2zp6PAIlTeg7fec=$Q38O&Q$mk57yltFaHYu;N zXdbc&@4#ABx!}#~LL2>AXcinT`bBsWm#+O;R5;1 zlgp%9Q&?pC@;3=tSrb#qd!;t4i*nIh91O4Tkr}3v73H}?LRp!1`LXbC?V7US z%TPb$|37#^Jr3AI-@-`9p85X^c-xr&?JMX1gSfArY&sIlRbCcc#}}QE|4&~R-7YAx zE`%ADZ6qs7bE95m-5vZxhs=Md=VNFqerPmt-_=k5cGZ6NA7(cG!%X!*%pBG1-m^iy z`wufit{27sF@Bl-Z%>yA1|f-<0YM$f4t;-=ypao_QUn!FV=;H=zS10=jCSV+y6Hxin9X_lDre+sdQtB zyF1#1J2CexV*E>sYfEH%*Ehz(`vmY#yxRpSma9q1|Ahh(^n+Kt6L;Ps{+PU#Ezy3! z%S=O4ubFj+?%XvBX;;XiesSTEGfKKMMA00-Bl5^dV~5S;Xli<;B#NIX#Lju>XML-I55xDM{cvp`1G3z0{@$x;nuAlWQMytYr=1H5B3 z6t_e<95}8mQYP{jdyhaW+1unOTl}|>zzpS!^he7LT0b*)N*;Utl#1fGrT1%B$~Fki zU>RczG_DTXq~6iP^Ti$~f81=8UBT+*=Eqn#b{-13GXB5ni1z>1`ld^aNtWe{3x;>9 zBs*UN+hca7F}>2HMmcWh|NdX_#~>Lb}-kLSfjjH*1u zFC%}3rCkf0y{U`I!42z2(22b8V3{DHtnz z7e&4kBgK1@^uN^mmffShZ=M?WU&mWKYO9tSWp8F3^Hrp{Z~cG3AO4TuRsQ35%K!EI zoj&e8*!Xw6WdDL6dhBI!?{`=EtUD^MGjfpPzfre{`7iNl{~=!Ce~Cx1;r}=76|GGU zuLMb6jh)kX4fy{OC;T7cqU8RExaCCF-xXszWNmU;{Vy(ja(Kyi;{V~D&F{z{@3-p3 zOF!ehv5ioB>VG5N=|2(A6oY&OQp`Sn+FX(U-)M8leg+&!_t1H3!9<_?qBx zK$2Q&;g9*hp^4_Fu4L~ujsw7!RRz5P&0VXT0m^4%?sb@76BYPYQPGUNdBM255>m~v(e~VVskenz;2X@= z1cpTV`rewLVdq|{8Tqr+6b6lhTRC#qp!@w%RC+6TD9fH=FJ)!H#-XJEAuL5{JqKI< z#XK#Sh2e)EEPk3DedY&W46;)WVjJ`H`mP^Wziw-#QE`eE9I(Ky!bl{R&Z>Du-~NT` z;G4mrbZ@O8R*HlI&bzg@{dO;A~ZfmW#b!NG}N2W7h}wiiRg1 z`htunzQQ<0Y?yw%HA>CS>?*pg^Z3WH{nr7H)Jx_@9tp~dAIHY|jrGndn1#=0dkfdy zPD^%0R{lnMT5od&cK^LS^v?cuo9wI78YF_5aW~*n<2rUrTgmnXHCO);B|eR;m{m

    n-(m;J;%(|7o=`SnEbP)hDZuQ(s_ADn0vMy^}$!Am#gP?5?s*&tl+6%#w zOf-;pt5=h`XL&kdpaWiJK_}dXie54O%B)vF1?=r@A4TY+@@{6MJXxkkLHCN~ZE}}2 zff4gt-+i^fKkt4)l#b!8kmeF?O~T?nt025ze-!kJLL80)!YVanSUcEOG*^{jUMHw6 zuj{l7gkRVg(G8Ku^QWez1G0wm=e1gmsB8G#5Ayk-mc*d6%kuUXa}NjgyG9@q zmhJl~B|G9J7prQ-H>&SK;Mgc`XCXU z_o64KaN_&RK5lv-8A!MlIT`1s1AM7`kiq8%q%PKI{0-Rslw~UR2JHCWxEK?_-59t5 zJAvgxK9s_=FI^CYZ}D|S&x9f{(wOb3U0Rqq`U9$S-J+^%d*>FyYI`2gxm~zJ-3)Y{rQ2rRMex+_9s(E zB+q)y9yCYg1W)Z(1QmIapn_%1!gX%6buJ6B&=SJg>gLbJ3!%#>-c+>1UBA>#hM}op zJGiXl>=0pT_SE_Y{VHP+q0P$$Z!#$IQMJuJTNXfoHQMn9Q<1#!$E90+Tl-NkqnbDMX}yDpc5bb9UfY5*27&E+*|KsVlLwwUfHLZWpfCp?U4bjknO|_V4}R1k5Pt%kFAMgr#i@4M5Yq!9>;r z7CgJ|$6^GMP!5QO<#_~R)A)Xpd z)-N=x!HP`kW_V~_Be>)Nj_z`|=atp#ajN_3eX7s6kBZT%rBp&wen+A6>x?}a!s5~_ z{AVh9O9Fb&y7@!cAT^C^1dc6*4g}^a4o=%rTV8$q8cj9n%lzzQdw(aq^S8c^ZhmQL zaPbMU44+Ow`txg`Su1LnnB#|-Dq*yq)amY@#JT{3r+OwuipNJUOOHdhija_cOHTy<)q{M5t4IRtnw3~*wi4h zxe)1*B1#qymQf%oP3G7=&Qll2;1aEQE;w5A!C>XZ_PfvZG4v~>RnZpBfA~eo_Ie0Y zS=D?sl5YtlAG>V@yU6=84r40c{L!0aM(HCN7EBw#85zwgbK>O6f)1kX6zn=diAnRO z%}UWe`IEJm4k;jw_2{=Rsg}B> zMhj*`+#wB$yW>Q=zqmI8YWV7FBF7Y5D)ek89B0BUeW<<I-ZhVaWlS(!^>{RP7KpbqTRw~`Ri6ippfgW#l^p^Q(o3)c1#=})H zb|t=VcS<;wGfFd&`46iO>x16c5Em3yrfG=p4)Yc24@eb0dsmcW+2Hd z@yqBx{9V?GM31_}cQR}lWt9#j7g5@k6bv!816bA@Pk7^B5m|?>2*-N)w*w zwHl@pfBw_^mw_q0Qqh!_cyR_D;)>^L#G|P&Mwibx+@>w4Q~r4$@-Ka_-hj5 zp^d8%it&SHkK1)dD@J;}lqXt_PX$$2sZu$r@0BDLqt!+(Zc9wuX6Hp%{wYTFWe+@k z_5T1vK)k>A!G+XgPG3XJBlGCx_V6+ZaavA>wHu*piL)cl;@Gf9_kmo17oZXPYdWxm|^Z`#y&E`<6?SOVhh;=+JBL>-XEE=Z;zOGG`?FyBTZ1fw!*X+8;uuw#B_mwGVLB$5__6r z;sqBEJpHNu+1Idfc_%R2>BUMdFe*f_{Rq@jX$}-fBg&CYw}sI_3Un! z&aclzOL8(vw9*Vt75O?tk>PnD#^D{ylwmkP-Qg9A3(&IDtnJS?^_HJ%&9Bwpjp7BO?#-G+9lFtrOjW{rYXJ}aR! z*d@pVaImtWxH8voX{XK8mDT?)sQ*qWKq9?>=aKl_?m@li1o<*3-MiSOe$Z8*r;HA| zTosEw;zQ;H+t&LpLK+J_iJ80Qmepx7Roc#ET_%$vR&A-SX9T015A_|HunW{A!AIR4+O<<4XB!|ZwSLHNYLEbMcWzzewFj-3GGfAkG-RTbwkgF)avK-hS6 z9y6qq^5y4Q9?Y_|hAhM6;^(O){L22y2R|Vz`QRt?@UAKT#~GxU$aCp{3zbZtlVc$j z`i>3b_gR7Aj2Cr})s}N4YqqRU@0f&MIdn46KRatMTUmG#Y~_fO+!S*N_CJ=@?QCB_ zErQjtc|y3ENQ2fB&TJm+{;|quaazZ(x>_(e@i4~0cr3Fy)HSo$t(YHs7g{;fFZfpR zt8zvTG_47MW|O@5Q<6G%vbJC7hVn2Y(=j?wp3>q#dE##_Puu7JeR(GSf7E>md{jm9 z@JtRECHMveiHb6~qb3}hD6AQQHIW3~;6zalqln^i7&!!C1`tG!nE{^9r*Rit)|C}k z*~J?b5YRwEAPFKE4mm__gdT@ zAq1cD7dSpSFNNb{3BxBbh!1doKNI1TU?uRt?05G{LUbh094IePO0WV(i8@ARogeT# zF|g*oXR+Ip?EK$2K8Znm9>-hh1NdldKLNd2E{U`9b>avZZNRT(u1?q@HIL2yoi{rN zn)QE5c+L*;Bqu|e{|z>0yjVa^vMGP#h(9$$aAk=8P5SK~6qyd~857z04sDD@*=mQm zOUxagRM$yQ=z($_&%38ftG)+S3bm9|I855~G?t3yxf8Tc*WSDUQKd? zG^7_$OIqa>6jK7fs7Sj}R&HS7L_qXW*ay>!7g~(I2Z%$Fj{dBU{cWP1C z{eq-ZS-IkKg7}(Sr z$i-L&F2=G0uMK+&hP> zD}xO4NlSnxGm7J2gh&sZ4*Eizl?=bpX!t3`XdTH8STWQ62_T=c)tTf4>StG_kavMrRNj7PG+q0X?*xBl(y-`!cBEGiOTh1(2Qf;(BGTw_xg;Y^4rbGNV^# zmBZG}DtYj=U41giZu=3mL%Y;xmuPdc1u^Qv+q~HU(lB|SJo^`FwjF%IW~EoYh7YCG zTOiH+0&k)e;y|pR_9#u_++ksD=W)ghHA42%cx{{-YIEiZ{(~X%IYC9q zwdf$=o0{5y-&3L8u>_s6+h}iHsRj@utammO;;F3}Z>J5>syfIeboh#`DUsXThvw z&SyzyzBLE>T1`RxMg9-+xV{|1|GPQ=Uj~sIRWxY)2>Bn&JE1;HtiRq6N6M|>|K5WC zdvpFTaR+#xrh+@j|FLZ5>u64Il+`iAK(6G+FHMQ_a$S7@=${7rm!N3^g2X-5$>BLj{zovPMuTGPOu4+^G#98 zbd-7>^9@xTRs|Cx#*$RDCoQ<~(BRd2TC$^K2 zR3^3wl?$MX$!`1>ERL!2>_P*{pm%4{*s3@L!b?K^0aAbu;RSf(XeJm5HObo*N=3oh zhh7o(^{9YA-^8S;WV`D8Kc1FT;D5fvZ$F3pe@LvL?DM7EW*F@xGs0iY@0Z1c8y5zs zeL)1)f?RYCp4i1*9pjp~9=#Rp%&WMv3fyrG`H^bVtB|;BiH-< z(F}Ox-|OF<7e&9zP%aD_kfn{a8Y#KJX(01>n0I}1X-IK$6jsOCfw$)Eq4`svt^CB= ze*jao@Ta)-76V@vjAvM>MgTs1G<1K*p-f1m{hi>wRj5BG{lPeP0lT^Aj0^4zTn97k z_#re2?gu`24rlehX=Uy`483>21DE=#J(JiG8rI&!`l)Teu;@LtpVH*_B4ew3_yWFOq*N=1c(I! 
zOzAhNML#X>l&tUZz+p=hlHPn4f%||n1=YC2D>be3aT%G z8qq!V2N*wFmGhce_jPi%V=?cNCUp%ceC8p6eD0({0P1LSE4-Ipr`p#Kn9Kq>f`a7&qb+k)$32!>(9~a+e7PfwWUViLW+OuWWe+S zI*XN}-LvTC8*9x)oqd?ycEEWr@bIwxw90>qc!$^rV@#-0v?zCvL(cx1*4!d9RB|1o z>gdrn;RqMfe2%{wz(?mT!EOgFn&rL?bB+|Yb)WKYCbscC?i&I#PHZ~O$)$J?+%i$l zp-Vd}L3p|cCAiFi=ZV7q$6=5>Q)$D519?i#(!Jw@OwJWLoQHh_aMNG-#r5_9p#~0Lf>Xf5V>QmNXZQ$ zlstkp1B6T>K^6b}Y3NFukrprr%HAS_!NP9;SnPmmGlZAHXZ=@|kdJXzpeYI9OlyKJ zn!%!XwF$gS4An;k9jB2#%7HwUK{{=quF1JfhxA9^!(kK(_D6^zMQ?&u^yZ-VsF4j{XRG&iaR-=bjH6&~w+4|F7r?#B`|706{x|9qzmv z=ym_@PN*={IwuRh51+@L0IWaaJRWMGc#$^@ddHx@2{DcKRQ3sJ3JdU zCs*rz9`?T+eLg-DJ|A=RxZ|LwFLXi1-5BR2eba~t0rvV#bm^NN$@qD?gPOOHes_D7%Jbo`HzO?Oe6#-lM zccS9>d|>@{sFO&kOwd)yzY4~3*e<({3dVA_M!OE}L*{sz+u1Ne-K=()h1tqTvur$z z#FvfbdvIA7^+B3n*9rTzDqz3<5uWFTy}-`H?fnN6B=58<)c-TpFWFcQpT4^Rj;z$DO#fkv4BQCp17Rb4YA1Ocguyx zVqxB-R>E-0Mn8M|QS{H?Cars8QaC?X1n7DXTDNNjqjr#$GDmWy9hkYWHr4@c+lr)l zXKzZ4Qu5TgiSJjJ}MEPMO?u4y6*~&BzW!8IeLt;R z=WL~?)vm==+4;^B{ISzcbvUvdPvo5~{=z)DWK%Vc$K$$OQE;oV1nOeXS*i6t48Med z82tdoWBfUCzi?KsH#*{hfaVoNLf)M=v3p*^cwi{YX24u!GH5bv02qht=85p_k%y5jfFoo*yu?t_Xuu?c7^H3l3vI5-04^kCLZ{ z?YI4qBX+hr(*9`toFAIL-&Qx18>fSx4s0NOHKOVJZBq_3*l#=i2;Xme?-c=+N{-O@ z)BBpX-!@i2v~gHqzb$!0$bMVoepZCPx(W&SJ-Ms_#vGvJgfO4AoLa#1r>ODwfdk$9+d_j>S0%o&+k0}<(M)i7sdnQJP`Y4cEoB4{H}I`fYseP?F(}6NB24$AnRFyFj!Zz7Is(q$%9_sKy)jhWTCly}@0P zKcj4@?Jr7A2SvQrxt+ZNg2Dd{JVN-_;|AR^RE_LPvFi~8;H|!>x_9^PL|wc4P07s- zYp*9Bs$#E?$RRTxr9`K{;F3|y=Dgm3Zqj4e!bQJ3bU##9 z&U14jsX-Wk+Sh_xJrs`sgJfoRfGAmy8y##T5z3WAPVg)&I}jHoa^RXKifthnmNufn zy{_hbT`6L|hXQWfmvT6*iicL!2ltmlmg=4I?9zL4b3MKS^XQ=JO|j z&|B6#BBc|Eqauo^Vr#w4NJ4rOO0nHBL5f+Vc?mH(#omk&6!Tb@(A388uaB&Y;5(<6 zHXsE#GZ$xOf1pZ%pbAo`RR0dhl!?#1Iz2ts<)YYOoseT;(uNW;`z=n$7UA^L>G_Ii zw9tRGuK$J6Q&14f?G>z3Du6gllNV zstQBu2c>{vL81KZ*yn?Il*0#ucqo7CY^B!!Ek0SqpSaG->eobQ)VXEbA()Xf231iQ zU~UF*77v2ZTon1;TMt-#d2&^QfW_-s<{-Xq^c=(k7TZw3BE~}=VoCg&>oiXPl;HF? 
zQ)lrH?+kKJKo;@Fw;azFG%u!xQ!zbmRXlZ&YYJ z$|&hv!5SWg_!A00hn9~g7*GZd0O7Yf7T+vPtHB-fOCeG(t2DL1u7UlctadRduL($ zr)7M8-l43tP_HZd6ES^IMoYm-W9#Eq%{*d-SSyt-ijMXTs*LI`{ zRE#{i^JV1HkLMFE!_(Gs&Hp6E1uYwhuXqpYO3sF-@VJPW_lMYSIysS#6hIGdXW3l> zQt+TdCEQtG2QD~maIuEjHyqD$0GHfv;u3r%0Sm39kfywnxDaOF2&j&bbER5YxtBTy zdnyB25az87#&MJ6dG;Dy`X;4d9n>UP62-Msd0Lkzr8pA-x~Uv+_IFRn{!^AV%IN5^ zbP?g^&4Z6@;Ic>ybwwwy3si~~4ps@dF6OE@BCIj}c&KIQ6J)6HgS?n+wDfiaixUf;owSyN4B zP(v}OR5h$as?c#DhdL~g?Ruq390@8zQOMDuhE+pYy-YP1RN<94&iWW<6rM4$A`oGj zJY;0*!^HT!cB@&_OlD^rm;uCgYHR#0c~>A=)QPC4F_v9b?|ZkF$48;1sVyu+T5DQ0 zJRFT21~Z4pErQ6YLCyv$U>Vj0^zg?3@F#r}F|Ir>gou*&QyN*KTncBtS;W`Lc(rfWJUmLv$!mt3cPff1YRgXVWGa>)K6=FhU%##!5qnc zbfY=c35jCN?OI%WG48-}K;1l=jJvNP<*`&=^X0hxtKk2Zc$CBflfRI~%F?HW@|u8< z>R2jk3EVT5Txi!E-{u7O*X}pTonvJ8hBTA`#$wsF-E~U!qV&sDPSF4t9*uJs7>&hT z46x+c=o?#_M-<^CAorz`Q|LO6{6+S56+?QPJ59Bn5ac#by}GWLvSzml-R1%BLl(GGu@FVJP29b{A54 z<8Bbxu2Zs+%XMBHC{MrC#k5qUZeDv20UDPsEBnwgEj%2HbW{6)C1+=NMaHcZNk^j# zl4(BYHj;hnLRyE!VGYo}>Q^Eu>Q^oiQs(6`RrNX2s@~?ud$yD24uo0mW{K|V?x1qP zcWKVF7Wky6yE;ai_go9&3oh3RoS-U@_>;A%pvNsPpi`2SEVnNLbFE$LlW13-wAhuw zB;CkKA<7FWT`bl{T%_)QJWSew$+fsTMA}sgaC`Iwr9&H)Vz>P$c@h(FhiEbvbtnsa zkr~Sii}{WcZ$gT*1;s5LVjs>VA%tQqwQ_wLt#+Yv08Q<`fsK{A8Rc1}FoEOqsFsd+ zGGUAYeI*I>$$hHGQyK=osIO1ABS zPsNe;;$m(Gutp4j- z=Fldj2vGw{5b`da^=?Knl}oWwK)x^LCx(}<4ypy&%4xsE?F)5;6;FmcAIs*}2UW7$ z0O$>Nsn7QqtlNr?*r3_?(ol*qY47MvYNj29Uj59OX82;jDB#aB73M9N`T?ww*svK- z=l;iG<8w>QLsbU^Kb$#s^}!^&?OR#-Mvo1?8DoQKLx{u@yNWu6*^Iwj9kP|9_T+Eu zvoB%S(rc%Y^0O~d_boD&GH-#T>66aa4FVv<5D{hvR7v8N3U8IG2{fLY7LzyvgUC$wc292hEYG4YkPX*QrCFGs z2KC64STIspDJUHt8OWZRKej5KjS4URfKtcZlvK2%KL4>0*WsN{Gz(Aa(zhlr;r)(z!&=AmuXW>7X_ZDqT>aE(X1Zw`4aYaNAWue#P31JVCg)TM@HAN`8#ol53`O}1QTMES1`qv60OB^#rQ+QhcFxw&- zNr6#)eEntq`^4g#1&i-cJhrVbzJd@4=yp&WB4Ns_zEMS6r&#Z5e9Os&l1A~Hf31&Z}k zdmd)f9wGlXs6VtBO5_xz3^hS?1CTSiL+!HHj!_d_$uIU=c+rlcD1yY6)9Fi1*l~F} z8=exgy46EhFyc7R&G}O!JQpc@oPrIQ*b347*lWVkUYjvgORC80Uj5XN-p z*htzVpwc?iNg*cOZt^L{+HJ*>=OyTgjHY=gT+v$+m{)AuEMb`$e2)tIBrt%Ir(Ych zx%xS}TnAB`6E$k&wL13T)%k^MQEzp7D}*!^OZVEq$5Z z8g)$XTNm{_dJEADpf27^gn~sFNsrS91tc3eg^vY1fMt0~4m9q(i1rb!2@RGgu{8O< ztO~}tk*b3_T|IkL#@0f05>Pz2{_%r2?b7YZU=hB0`e1#+_=lG)vh5e=wZad?)cXQP zEd^BYr>@{r;9XzJFFG8fr11n|hSaK0if>MJ06q*~sjY-=riR5c1(rBGWNPYZmOM}5 zF)C&Rg3c{Lu*O~pkC%Ed$)Vn#`W8V+uD zxPX&OQ^D>lkb7~JV<12&_cGA=q~7qokabW>e1EFu_NgujC6DnkmoBTv!th+U#QBHM zrS9%381n(GWwmz>u$V(wuIfs%V>LO+eH1kB1Zczp84dnti zK#cfu9hRh%vIc2Yy2wxX`&ch>j=uR-H0S7#;!LmS{@^NuYo3q;L&rp4(xh0lW;<9}J7 zpuCYHeSWrq3jURNYN{0e_=eTjb60B#yO)lsA95&{+4_k%(cz*0ES8)rHnbOE!>^0! 
zh!2`WIp0VMRLfr?p1JW^TWYq**OGtY+#52LBlvCx#xvX>p`8=7C`DD9f z0XclJ^)#EXcH5rlY^?C_LAZ7ZA%=l9+~;+ObsLQZF8jXYIgk*d^RRIH){{!fWg3x0 z@_bYm6kD~=LYaTm@wT9u{O5Ty^Xmfi`2ZgI--8Od0@L>a6N;!6Zg4#8#y#PZrzhqv z%LN6^cb6bVr}`lG2>?2a3`euD^PB4etjqC{Jl*)_h2h=3)ZfDo2_ih*49x{n%Sq5Vnf`C8hW)Nwk6o#Y&a4vC|0g>a~^yKDgz;_ z^)}nxb&^9jV1Ca^ar8Rcz7y68N@Q%_^HVc~Vj*98V9vj6pi`comy=BROErDW!H1IPd$d3lF*MxF z3|TOm^S;E;(eA}+L|0pxUaK1oa;?6)^e11P)(58R$}+CsVlg)INs8-|R(Wxs%Au_s zR;%lDhFMEdr>nDTFQt`uM+98mV{5s}KWizf%{3)nr+DQq1w~~l{4lu>|1xXo3-Imm zd}_DE`(mK_EK$A0>j^xYA9(gk;8|Ya+XC^8?R^x4*D3-|NC!La`tvsVqVon461oS1 zp{?OAHO;5;7iYx7*YDEzrId{{#_o+Ne1P}dL#mqz7hUgY+C&(T4n>mWHHf~Md+`DHG|nugfxWeb zDa7-C<=fI^z{%tcF%cSNqg17zAC`c`T-P3Ix=mRwdslT#U@~;HK`&)8lP!xgL0;oHx8aUpU+SST2nIBeC)|# zyJT0spZuWewGwAof@Domtl}kYXP;!z)xiCUmsvDRw%z4TSB7-OJE@TA+Jr-raGe+&FK;Bqr%{ z;Zd+d;;w_ajXv~zBqvL|J(#}&+9jfEUmeCdta+?NJZRmKp)Rl@n?VOGq=V71yn|cq z>Iz<1Yx5WACr0lF7fB>EE4vL{`X4ji{vso9pNJ3b+p+!SMr?n^620+{V|e39*f^C8 zY4}=Mow0~byDc;(GE*711p4Dk?m<>Ng0FVnhXWC>1%1d)VOsQ_S-R1p&4pl&#i8XU zVs!tVS*aoCiL~jZVCk<#1!EmLrtMN@Ju4=4c!&uj<_~}*J}iulIv1%H-{oUm+s8PI z(2bc|S;Flf@(`T^hE5FbtC{t${A9nWHsBvIaNSqzt|j4=tm{`*uVyEr6aZ5xV-0Wd z6wSGl-pv`(f&!eXshdX|>iehVJRVR!V7;dxqWBdbbRrl?6uO zyA!f)F9=m<64@a=_?l26@^HC}EKC9dVyyE~y}W+Hp_1zrXs6k=J`?z?Kf)iPk!g1q z_jlN;;NiFQ&`Yvh%}=it${1Bxes=@MNO5cf$4DT~-+}?=UQ`5pV}s{w_%dl%X|)yR zN$L(Brlb4MIF0gu`5B93rP!HhFFYAT(;OC8@2fFEKMe9UZC*$ol(=H@cLm|&OR5dtPo{cL z>(b<5Bk!Wzpi31JXVA+H4{1HuhxPP}QTHS3lT-9HwkFV5VSniBi5h{2x2N9EXJpoZ zjNzT6h)&u<2eE+;CK<(fL=-Th?d6vPZI8sZms}1kMil7+0-pBVQbOzWR*1-Kv9ylgzBY>PD$lS`0#x)!u)W(lhWD&@OXVI8maDUbV-`T`u3sp`z;|6WKBRPURDgX8HGKEMTXxs& zRk}V&#cnLR}apSvF8?Sv(tj zCl`7D?8c7+sPS(tZ6Fs5AL50Q2apW{9ifbKs7p~oLal6DjYFw+-egzr3(BmTJFsvy zJg}hMwjT*C99Z$}&>d04;x~6hibctT+{iwI7ruMB+2Wrqte>$@z{BTI#$|?BJQN{S zBI5y@N3iN-tSVE18H?z=gIix_b_0pyxj?{xlf z1TUP(drzXS_uv3O055!7>vZ`$C^Qy}8$tDG1c3mUeLArv_rd+#(ByI6L(A3opWv|6 z^YFUn?A0!$l3FTI$6vbqB7S_lyn@6^9z9$+fqpIQUqMo&>}Ro{tQkIj#K^AW|2!J? 
zzzVLx!AS4PSTp%<+G{C5=a{mR{Y>?+X}gwX#xTOfk;9!`M`bAvYx|6~DR+@#xmOgU z&ia^9oYkTGsl=O95!~RQe){0Dz8c!Qh8Cc07fD6+uCCB+nt}?7C0ha?&y1RC8fC(n(coJt^>de&$>MS2=5S$$c>0SQv3Qk@AaI<)LQ6Ck0`bB3{#q&`* zv1iD;5^w&ZlL5o~ioNp%5ocW#n;@0NUWK0V`MZp~Nb&yxWk*W(=3S%^v?b*J#5+LI zCq13X%3=ZplNg+nfxFUa2R#1u_wc~*49ruNsE@H4Km9$5@=O=4vun|Y(Of1SMm!Hg zwY2UhFEul)#RY^tIF`nRH!+Q2KX^3h_GWabXh}QnG=jJ4!Xzq_4)p#ye$fRkFD-2$4)~sA ze$p!Azy5%q!ZzzvrNM|_`ayuI*cZ$1j%Z#l$lc`hSyh>W7;k)_3BlOl7th_&#XlkUyslCmvWiaE)gYBT;#b3##xbajG zn9ZyLwQyB7Q$UM%Gt5pd!nB+UwCA5kGtS+s=1n?vd|Mo563W07Y3Ye#;?MUEqxQYj zKD9KJx0Ij4TS_Bom0`j|w+17MbQGB?$zOhASIkuwcl9X~ZTo;3+y^Y+ zbZW=j9zOcBuRtYx{4rS*oy4+fgh)P0NcPX0z+TD6HrOAo$FEq2C0@@DUKkof8Iu6_ zYRS>PJyxJAP(`e>Qa6JMl)9M!R}%ke3V$iFB4;g+)*NI=84Yh~ULCEM&keMwva@L*7{(T!w zf+-*Ku4K#)>rnEseb$eku8;i$p=nt|ML6%%zB&fu>(d1|R#^})rcMl22$^phCS6@j z9e+uYCh?_jb3ulC9tJLl5ya&j1(ISjZXl^OfyLcMU(x|8T$$JkN*WYDaw#PZdU7eY zM1qb3HP{@Gi~=ZXbTKuW#sgjWa+7ZfTLHR^YhcsuGd8s8g85lGuf(ylyX(+*No4nY zY^1Xl8GL6KMzys=>t=A+%H@&*G|?SW-*V)y3AiDz6xxl&`s6c~IHxs_B&p8vVOujDe{e_6<Ihos{P13(CCeA0-OqzD$1whb1O z_-j85nHYR}60dK`;m1VYed5YY_a$E+m%S6FOrC#=>-T>rsEXJ-LmG;Tu7BVs<2ISA z>ivsw|KNG_0in3SK(RaSqL^zR-yy(=TP`_CB?|U`@|G*BL|bkNszmNhW(S=tS#?WL zCE|JeBu@ua>=)k>va6FUY=ifNutOxeBqB0<5%zb?Rr+$n7Lr8;VphmlwZEC-gcg#h z#SBdUTF?onVLy}P>1srAYZQyiN~)kOE~uhd$TgV#Zmqi&I{5d>>L0^J?}4^K(fdw2 zqHk(}?A~1e^CuXouHN*~0zBMTDp!@sT7M${)qmu}5R-=IcOU0t(f0Asv5>qI(A1yA zH;QyrGoXF{_eh8uGN@*lFbw`5DL5pDiLkD9qv+rL0V$(=~lj)7e~(O8Y|eXRp1H%H(smwU=50GozmS6Io}BYSfP(%#=IR z8;}mSOLGs!lA+K2t=-q21LX<(W7Txx*K0;mskxRov~jZ>DuD^cEmc>%WF2zV5EFa| zeb8L5TQphANVCjZj^HmLWnRge1(vVFcv#%{P#>e;_^`B^_N4M+^gADl`I)!o^Rx1> zszb}i4N!+Naz0)E0JdE^(PAZLB`onyfWMQ*J#`Om>tz)8ZIeO?;Q_QSB`~uo z{}DZ$F)JtlR`MrH?CxSSca*++Gr7KS1Txj7i8OX&0Itsb1c^9hcvq0-c)V7c_oG3l z`JY46%QIkz*=L(!8cP$o$x?Uli#K2*t^t*4-PJ_54dP7D8+5?;0h1&7u&jLJNIojt z%4DfenceoSH1Abfzgs>G&1I}kAdmD3iihJ~98S=DbA71PyLBX&i2R);?A;nt*|JLm zH+X3+AB$ehSf z=h%8)znX6l&61TtdBH0_{>2-hmul85U*B4s@7dntb~-PIPFw$W{EZimc;13y{@Jkb z#m2(I_r|>!sw$rsNzRlnLdAS&jsd@jyYxwCSK)ftOAX_Z=tN^VemxB-#H9LW!D}N z2{*^mk-U)nowLaQ5?Rc9O=z(|{?3EgC4-%D*nk7cLOx18KU3 z$y#sWO?~ltQuS{72cGwzo~9=eaOusp7tOKW6ySSkh_B!ERbla7@kic_0#7%+f&Cpt z12K97S6v^TfAF1X^)GbM@z?8b=qc)NP7JUAbhP>_^!h{e`g8R9N<{sR(ds+(`ZwzJ z@7L?wBm4hrBlK-gGci8@@8R@qt2h6(MIWxCJurXX09oyT?v-}Pl@DDby4>AP&rdKU zBA+EsWHLSpFI%bMX+oY#3pu}ij(wAc5b&e2a^U>7hbx=xMf}9`+j8*yb}oPBI&BA$ zpy$E9E|@#S*SB4x_jLtRc>yqHx=u${x>+eh*+yLNC_m;M{x-Bv+S?qkOCLtbsZL6A zsADYIwyJF9LblZBB%Xd~P3y{8oT6`bp)(HGq3)Di&SU;&Ch&N_7ngerrhWu%CZ<+m z5dWv?o=Us2&ou~>a(PuKlAT;7V{dx_z*llq!vWI%?okBV(YwwL*F+&dN_p!>260M2SC4zD|MtT2Qy}7(x{$ zt^odrZlCSMDT`-S;hE%}Z;Zh3)tiI+-&nuW0s6~bR4p9Rb%q9zV*V^3hDGvJ8aOfb zb-FS!CRhkGfNE!QfEexV$*1kJYj_u1u)Wze260~IF!u2!O|V(4 z|F%9ZH24XK^JXbIfp9M6gWt72xZe+wCv`JQW7KdjIyAiK^mO}aacf#WT6bPsDR2yqjgt0-Y} z+ENfc-O{idT{3iXcX}F0)6oXjdSF$34%EJs*U9j>xZy;!VyCPdJ!nTStJL`il)a$- zC!!aMCu55=958itcIR$Zo*yxmHV@uGL80|DIBeC@-1m@eOG>IquGO zfd$3Ot#u8daQ;C`CIekc&YmSD56Y90hc1zlM=qC=$AMIHl}X7n4&$Xr4oxE$gBJf@ zReY%T!;8SdI;D-UN|<|u@(-@VKWjxG1-!SA9ecbSBcaG*A=4e;F7e{+iYdG$^q{co zF>FYCz$_0hm)&(nS!v<2f#DlxolkM5*c&i+Ptp4%ylAyJF4Z7;KO#5!$Z{Zf;JlT) z+B9(pve}LKfqp~4G`ZBXk-6$oqRh@q;*FyjV zf*?c9Y*Pa-YWyqMx%dY6FLA}~G^Ln|ixd;=RZ!_Ofi>g6kcjc13fHn-WxRn>Q{pkF zLo9(T&Ve-l+XU?@yi0aS5ql|~XI~Ay4+xJqRu}heoq~5LW0+iV-)o`6c574bfCXH6 z)Lb+wBHx-23#3w|(M==`%)g6ms_HSN8O+O96|(#@t=7~t{N_J3{@@H=Z4K-yq!Bcb32&_+dgz2SOyFuWoj z`(@I`K5N76hBWpe9Sn#cIIoi-8?)1kRC#hq?;PY-1TLrAT$~Qd7mQ(^Z zt+tXRb|RO8nnMD4Q;?lp?f`jnBOXFQkpn6KX0OVI!VbJd26rgD`ybT08M*YyOaSR) ztPPN^r!^>qq}RT#3!dW%v$-FNypP18bpZj&CQEja*z%To9v8JHD5thgIhvKq&ktW8 
zkWrg_=viUyW66faVR+ie1=Ra3_zH90Ot1WYQaLs;Fe&HQLo1gqbZQc%TI(oc_H!C?wSF80W1@4DEnM1(icUktFM0)e zb}aF%F~Fl^1&>Z79%c5<)4UDOBZy;)=a}bK7{s(zw7 z$g_b{+Gho@m>|`Hs4zet$tR)UKC6K@0q{7JG zN&yVYLs}OTXW7@U!UYrTFrbW8ZcZr5s447I z@{wbb76dIdm4lXyY^+XC!s~Fsm>dBAhNd~x>=XxDQeLQemyFkbse=G%7Y+}M@C?pm zjwv`c)7*V=Y^JB+*o-#x&lo(~kUoD#fn5N8K!Lx#H)8-x#)Sph!o4wt{rCW%?;Uiw zWHphs<8aXq>Qob3l)`01S!Ch>SBD1nLOFJg{O5ZSb>sZYm3%++9Q#WqNAJz6ssdd3 zF>c&4j~2Z{>{ zZ<|aP6jq_WeLn>?f$VwKLaL#xJz-4;k%?CnvQH>t4s!|&Fk^zpaZf+{99@r%3vO|y zb)bRL9Lnk}OrI(Dh+hbEL-Kx1g78IaqQN-@KTQVbcpUuE0ihk>_e@M9qF_&hD@Z+9Il%}uxxSIz&@Sb3H{+8; zi~e{x{O{QW`QMY|KlJ|_-#ev>>uC0X^jKy!v#3Joc)Re#(2+F9$_xe=rfLop!~JJA=ga zd&B8R*-3o^fdlh`zx@U5DOy)3cTW-9Vc>9fO6QclD<0E-{?F)n_Doagx##&S(i3#7 zgM`-c>vVc{5^&DB?{}f+XQ1OP&6`e7sWOC~b6^=lIDhyPr{~dFou2Xlj#{BGo$JG-~V(Ahh=bVApd)3Sgf0z|NW%~`QJZk$w%XOc_oKx zKShVfkA1Oe`QJb4$RXMo6UhJm_nOLJ{`YWw;Th8ax}3BUNtP+Cx+DM#`WJ{|?Yi;> z_o4F2fLk(CsT*xz2U59Ha_jVlhCB(SgM(Wu4=A$iub4D9)&Djpc7$|kW!Q`X=# z$#akv&|WV+M-vP0;-t7jTC=8``0b_X6fdb@Agz9%b6oHi=ld|}bOL~-U*LRgkE-AW z&V7>jj}$icd?mlix#DzXePk|r+AN@4^hLq>bH5%1J-|%Bcz1AMFy}F*_o2vMOu#eH znkaej=nRj3hUb)aVU##egiR1GR3(6rz(r&MaQB)vjzW@e;UO6&|L5%4pDTr3Uh+ct zjk!;C=L3C@A%JTUd;E4C*IB?${?dm2i@<)POm_fq6!2bLHF&{@R8d%7Q-^H^DQ-qeWh&mK6;PwLD}jQvt1nw3hslJaOErxshc6K z8VtH_GW0foAb`CORcG62*VmY6+YCBO0x8UNZSf)28Rv)^-*aZr#~&tCCGRJE6y>Np zejF~Rd}Wxf4(dvgkliOBTkCsT_mA`Vj8WGI)1#u5eH}2@`b2!=z<2@FwsOrB?|BxX z?`@LZz0IH|OfZxA&*ql%AqE+6<1gGW9}dPG-lsW`pKuzW-x<+hZ-!lejwsUUC5+zy z(V+Yk;E`C?{v;U}mefjSyq4>3zoa!Vg+lgUE<k0TP35-`YDwKWEF!T^Cla^z@a=ooyFd zF7J-2ka9fSuisg(e~Vt}d6^dVT)|QUB=mqW=4#^@VKIJ|#nwR_nLvtIh8m z0u#xM=cal&)UIDlMoD<d=sCrx3@q{X#m73D!^r1`x@ozuZiQglxjhX7u-X*xj z1?R=Rr=nt$D=Qz)H;9c*?-{woO{pwMn#97dMf$H2_mbe(Wx=n@gI`w%zm|!w4uv{} z54?!M*%E~oTAJ*8Py%&%3@aIBzQA2eTMeE+yOZ5|f##h5EWE?s_<2@|964X*1s>E@ zkTlEXA%^AjEB**hVRcM8W>nRe1n(K=O)_2oaFV_6LkI6TGsUY_pptvWe~{Kbi?%yw4iP{?Yb5#6r-F-=R~xn; zwqM2Xbpe*cacE#t_jnt}MDolcdhAC>_}zd5bhjAGW86@Kl9SsTa?%i*Z7JNzL3O8EUe2OXWi6t|EBT=}aE z+?6kOw#kwIIhCnr_qPsiac3*HIn?umVl7XF8im>3%6?jVBTr-a&3PQ#kFm3CW38YSJV09d zO7%DjvF|+~uDGNeH10U7469oo1QJ*w@!6G)!nlu&eaIr=7#JCtLt!_XRT3nk*k7}th`tSu<7jJ$`kG5l^F z-3Wei`4#ZHvpfR7n;YPl!11dc_h0ba_)`$Sz2_qE)3OqIYwOPGnuG|i@qGkO!sEfv z-mQ$7lAyXjgqMwCTJYq9GH{^I@pM11)-A~K< zFbV9&5@2JT3e99S#;!EOD?}~&E*75v6pK}RNK66@$FeKi9?~-7P1(sO9JV9QR5YJB+n zw3_huIX?t{3+qf)E{XtE;P5FXx{K{tgD8e`7#pSaHsglzxxk)ZG5VY-X6`A!FhgJ& z%I4Qm;w6=<(cOK_!u7q>9mD~(15ztFDel9KuCU>4Ko`n41VjErD}gE4a1OjKP<5GE32lOb zuoy;oE;|&&Q&zJ~JiU`$;TNJY`-w_CO+G7I10W)eH`DVX9d1?r4+9;Q+{6883pW&>WJ|UzVC$ZhhF)8GQRJ1 z<@X4v@1`ri?~3nhtFHLI3%)-P`Civgvz1LWSd^`r1`I|EJ*~fHz&>piIt`urfF+s2 z2JET>IAC)_2kdOffPHW*a=_ThXw$I4fOR|+Wxzf<$p`GhX|C_^^?w@$_-y8f1e7X{ zZ4*)bflzfeZy<3H)n3iRP`&xbMtO7n(MVKlPc#YDKOT&NYT*fv>Q$#JL(V4wzL@Hb z^qp;l@cLE;9?tZu#bq6Ey z?Rl(8__}_Gg0Df~E3?M&m3|Ap?xRuheI+Ig-*I~y#rK*9_&Sa@319ENDEOL>a(r`6 zhU05jcIyF{Tt5E_WDgRuyUU^?`?fI**;%_AMYiLCNMxTn(j;UT?Tv!$AC7Qj$DF_w zINJT&S;|&!k1AVm@20_KjdzEF2M#FU_&rkPB+o*8ueP21Fgd$B4DDnp6$ zzg;PIy5Rj_{=WVRe2>=SPaXJ;kU4nNVF1F4ff|*pS8Zi|mgfxSJ8xb*YY9>&%X3`v z9-w|4v=0+mmq;3kh9Mj0VGJz>en~DnBsRdZ%rs~Y zwCQme&r#%5H!F=+sebwp-?@4Jc*y#SJ2qL;s?8Y>!{5@N zwndv0>syh(FtI2so-%5A%1ZTG)5~KrIQCT2%hP7nolP%KY2oV4QOiTAEKCtAieicc zwo5OX*C2j0vV3bS|6bF}Ux($LO)uXT%V##dJm$w>ZJJ)bJ(fQ%HM#svSboJ7%Ktv{ z?vFP|)kZ(19j<`9%iG?lyi461DeuPr*d%$kY-bdCclVE6-c3IeMc(~S@qOcLRQbOC zau~i#${NKtV^1W$Gxs+M-_<*!;M;pY$5%ZZ1>XpJL;ej0$F)^aQU3N)7|I`i*C@&l z?v6zHl^>d9aBSKh1?B!faFiDv3SYl}FZ(|w`YOKfmoSvqZfo5B|Gt6!zprWbe_1s9 zf1j}b4>qv>$$npata1B&#+BdSj_-$F`TcG9zSouC--_>Vy7GIBNcGhoz2f^n;QIqt zet!$T&%g3}3*mp|_Zl$#4rEa7KLBB 
z?&196`mq7OVBR5L*WFj77ltJJT3-1*9xw2n`TmOUF_g@=^~&#YW5DIN}YUgf`9HfokM;oLF7iRspQHJQ{ z@A(kz0BJTVuK|9*h@dk6DiqVsM@6yY#V{0O8b|TnvPcw<>}mpv?W3XipIscqbNebI z&wED0*P#@m+WxC>UT`2P&Rs8r;e5^JMsfc1yGWeR>}-MuK50u7oWIz~alX7ad_Df1 z@NIK2D!#oc!tlMhv{8K5Z;QmYc1IKNP5UMazC}AYzAgSczQ2~A7qyGJyYleQVYuE^ z(kQMww?^XHYDW|J*|9kau6wp~Ts!P(5AiNrUxtO@vzEs26})ps1<`@aA0 z`2O1Ur-c+1SFZDGX0bo`t9J)K*;8;9*TnB<}+dV zeXyZX{2tgGiJ!8y3HWW?7zMwFwsQRbwj&I`-(@{L6;%cuI332O>k1oX)2B)!ab5gv z6V{i#MNx1a{w>G#!|lKO`m*}#sJNc@hvB-Tpix{Wl|`zwLMD=d_d2^?zTu{@*yR?{195^+;(`^#6ir`hTg= z|F{1B>*@5JQIYM&!jNs7-zc)HiXxG{P|^fs?_3`R*|jAc*_f>j?YZAHoNFqgV))3( zFbwbhx={>wZHUCMbyG0RUKa(!y~P|u3y9?3Ykpq%E-I4aPJ|)(;8%?zd9E-L$*#ps zn4h`%QIM?I#F6Z=rSbXsYx#Bmil~TsehSzB);6yHH9$0TQ&aT6ucPUI0?|jliRyp& zwfKFwJSu+g9S_6LyQWe6ZYzkyZ^XtX;P>`dQSj@uk>fXUb3nh{@mux3f1Zk_{~Zh0 z|5i7y|E-TiO(|-M{*Wn=2CHMS<^6b;h5Ek(7vEo4kz^sIW zh5zCAypZpULcT8v`MxaV`|^su;V>a2vNpJ0 z-hN8f+Ly8MM+MWw%F0{RFCF>UX7LJdQPY%y5>H+C$2^p>x)|Ybs2>*yL=kj{_BC%@ z;hjA#Kzh>Td7)?o(047vDc0zR5*psILE6MiA74X$f_ zRdIUWOLr08uIbE@)z0F8fo$_)H?HHDT7sZ+GZS4%*#&B0fI6UgCmpzOmtbWb64zI< z?P~&2@;oDwi6!xrO&a!3NyRilK055#!GJXgInN-ZmD;x&cdFB5_c@c?120k)Z4-w} z9)3?L-hIt)z-xwQBZuK>4m`M-M#-f9P1#DvT9X{gS%0E_u1M`|UZdxWZAZK411xsi z_Y+T0*4NE=BFX33EP21c#8I|W(j1=8?2~0QN}uq9A{-Wn?TquqK)VB>e++E*?r?Uu zYYwZ)FVTJQ+hz3;y!^o_$vqyoOg;-g6EPv?c|NGNLT59%$77aB$qmR%T*JxS0ZAO7 zfqN5>`;1EsklCA0Y2=&;0-hoUVA7B>=`@qCu`7d3cBPM*{gaj#!CTPseb{FyfiL(~ z%Z0uVOzs$r$L@h6%od;+upO9kPOr}9tmiXgFU{1uy9r59E_u7+JqWWm;uH=&JHz?R zyB~%_@_Zm}^)=m@%3maj#9}=q|(NR{4)1_6lveMroD?hNSaog1m3~fY& zlYGO%^F7&~u;x0KV%$N;a~Sat^zLu*-9_}#aZLy2npJHS+{9Ls(xAWWUrhs`ea;JX zgZcY8js(qHJi{PehoA7A7!;WkXXJ82phz4Q!G{GjI0I@$jz5vVi;ig_y8@SbAE&!C zoVwxo;cyq`o;n5 zG_t-)9)~DFJAw*kMtlGdj~qNr9<{XX|?53g_lC#NeECrRkNePpe)~ z*?)$;@dYj%MeJ2_{~uz5KM2lKc0G0wvF^p|fNLI(6Xr8Fv)%kDUDN|pP4Ye^viX?l z^_O9<&64*H@tP8lvqfRA6D4o0;C&0dej)6&Me=?tUMJD($HQJHN#1|ZRWD(ip^@rbeiD8i8=dFgX`M+}Biqoqqa14(Vt1qUuLGt4jMfT(i z_QIN2yL+SY0WINnS?hU~Y`Z);OHSS*Z}(%mNzaacTEb;n%GxK#4mEJGvcX<>90b$P zvhAQW_XMph#ggYu5(;1}lpGG>sWfaL$?;?jbHZ~>*w%xW#npb4NJA8YdLF6#U z4rTy!D1O$5LWdHpm|KTZIl7X3<3~SFYAcXAl zmq=$9(0QDRE-M=f>QGjq9M5&mEK5Hv^AFYVMW&2qS!sbc=?#V?ldyEI#zZd3dt~Dw zra@Q!Wo+&q{d`g&pMk8dr*n#G^ZNck@qE%4QJuoy>Pi$)?z0K*x>nM>e-n?kUHCHQ z94M5B$9*mv%jysF{&xC!=^Bf7wF!WUG z{&X-$LZ6S3;%)!agTg|SQh1Jp^fi@wLWGs)xn-086-wF=e-Y)s85GFqj5-lzbneeH zI_F?U=i8)Y%J3{DYdo>@pjlG#&^#%52B#d(59eB7-hCbecy=ccfb;pAo8rs~i?3 z$uk{F0;~^M;BBci7d4CZ$fy80o(JAhSr2bsjWPV*h-rxiSQ;^uaIZtDKxnSA+jdFbt!SfgP4S3BspRcU#qAHb zw;Jjid3)-ZBpK5!YeZVSaf3zVyM_J=CqE+hSX>UZoIP}y;18bl-(4yogFpQdjWTKO zT1rPS9WS)lzz0tvsX^i56}~jQ@nU}-C{P@zW1r!WjY;AF0WjW4DB$J?NVj3T=))oM zHEd$_6!|qKiLNpid1ZpMejWGY-T8hrOqXa)d%d&;{5NGwT;P}ChewNgF>@Z#WTZzv~x?2F&>wS3Xi@0Yd z0Oxhl;Pn4QKuVrR>+$;HLy~to$@ojY{`FyPcJw_`AIOs&+OFggH%$iHJ}lZk0xkPY z)ZQ`wzeH*z!*V|VFQKH80zdbC)(+F`` z@W(GI8|2@;%p)&<`fpBIytDBX`DOooh5(s@eUOv%SIY5bpHTgC9mA4-C521wG?d&u zm+F4l5W!vYF=rgzYlHxQ8L zl5EShOggKJ^YeDw0m+kqQp`5SGI<6`7S@wO0&*=5ytgGN!|k?nlV8jRyEZYFOYNKK zJ}Yz#*zj7U4U&6-u@O=IJ->fZh-x40hf_R`$C?B!8XfBJB(zDN#xZv&ha32*CjHUg z<1MQN$~xOqzWyTKwKR8Jh#YolLi+wQt$gX8N))GHT5Xl)_6w1wHZD!SBx&lm+ukDjOP&>6oU&~c zwEu&wk|6c3AC>1&r-Ar{;UekzI(qd&AiBXf$H4Cy3xlgoL%94WIZ%@487@9uuc7F~ zt6xJKyva90+;0%lp5*CVFYW{8Z5|bE9z&@~m!JTXekig*e)9om{?^NBe+BoBc8N)x z@~bBC5jqzi|M?$&<0Q@?r7Hf-TTwK=8=-|ozi08eccRSZU*6{kWqb;=IjDbK47G># zDZD%er|{R-Wx68vB2D4T{u>29O>E}qk()H7b=W-ak!zWJB4i#Pl02C>3wn&POf*aL z7T_f7F(yfJzl_>h8glfA&0=0Fxk+T7o6m03i%{2jp36#g>FlGhG%k044>#c59Qp6cpDka^#W2H zI`SLIc*>=R)(cS310u~yf_GD{M*2u|+Q;y@w>+?@zi-5RIa8V;B_`e&m~E2h6J0DK zb8ekWdK4b_ZV%F=KT?>Pi*7p@qf5`6YwaC 
ztzmc)5(t|;C`eS)sH02-HBpcZizJYc9+<%>tAGMRFhGD9W&j~M3nA$~PsBMO)1Xp_>TufP0VERqHoSY}5rC9hgjn>Q-xi>)V8LcN2 zgT=g;D;$W^=8`z~6J&Kr=E67XBn4BPP*FqC*kq1({4L~o2_C1E2Nvhc^Ovtd&fIyB zyY(yCo?;Urmxx>xDX|k7A(N-tAnjNA(YfB2Pb!ekB@&%g@d-Waz>O_j0q(XM$NY7P zyPZ3r=5C%yuorn(|G1I}Y1TgfDtPjRpMWM$u;$Ic`TChj1|)MYRd}4&WPMkX5L}@R z64maI`%f;YM=tjf=$V!y&LUp8;Q0jk?L`LLo!G&lMEL13(+|&LEWYC1CC9VF}UB$R&9Q5MD9cRwM}Q<#fWEYM^NlSW_xi9BS7H^%oe-ansm0NM0aU1`dCq^w_}zMN zws{Uz$iSc8g}`5%=>z_Sw-9)5JOZD68v$SNi4XX{j}q_^p8@|Qd4SPH#S*w324?~o zo4(5gcY&>)Iy0LmUv|H%;{~dM1xjpO5cv(Yk+jz2v|+t@%kG&Yp}Rh~&%c3>*ue`f z+7)q81h^F#-VL)0;V#H#KE7H`X2yh%So?}p(^F1S38k7OBMy}jX0GViTm4kn_oN$l zH~mW;?}EvS6p|JDF*NgbnoPOo#J$pa`)I!Ny@pgNPNJ}aV%&Z3x(d;JUre-OReZqh z{}s)63GTB;KiT2GC{$0skM-x?4nh^}3s#;BV*BG0vZo~7 zxSEhN2gmsb|0k?A44|iEsflOEJqm}Kz=;LP4Ca_&I}3KgguxJRcrX(osLf(Lnx%vI}OBJX^x-${&?O3*^4|G$A-LBKZlyhoVa^ofKfLlHY!EfpA8 zf%mwcJpedCcx#)hTVRB1%QlqXzJmpL%z!J3WqecE&#ce1#6J^%@!pPlVBs&0NTE~2 z89u)(D%ElZ_C%7nBJhygED)bgsN_5O4hhim;R=UBF7=Wq1MSek`9uiG3+Wyp6?-IX z3))~yuF@L6g}40dxD&TBb~3Cn0TM`aqqyNkITsiE*odyKN8PrlC?$sC-I#u17oP3G z?4RI*d*MZi$WYg4g*;wc2VgWswj9O0cR{d?HyI<&*dwl9gMA$^Skv%a4p)i>U zvgl?-Z!dxEK=F}M+B=Mau#yQ9!}=M}bFE5h0(!}+WYBzZ#(IQQn>Ke( zH2pnOeEuCbZr86*V6ceI-lH9EvnLaL$*W(*`2`*(`#aM1DS`UPJH(O}eTY2E`=^?J z{3W>$kyJrZo|}}QiXq3CE)(BJT7HYf= zyNE|y`rjE%B#f-CqT#YxT@0Q|;SAEX$MA&tcX3NvgTB=x$2_w=_k09Ck71_hCDev! z93}01m(_BH@~HKAG9m8{ZW$g_>ESfzw&RLE#OKU7F8@tgkHSM&;;J8N>V@G6sjeYx z5tZ76F-#N9G4t*mP})I5HKba8;@t0mTt|JjCFHMp2IEiG_%!`%#_+$kW08Ilis0moXqq|Hd8mFirC68 zzZnb-_dF?7=s=o66W8_Eg6Nl<(g;S#yy{bUItr_ry2MnWbf^* z@I|Hr@4r^Bia+g2W<&LO0#20_%SJqLuxtxz#``^an7ii`sLPu@U*k)lv*!bR#I#Gl zyh+0Cu6JvQPX7Bf%Jz&#lHaDdwb_+7V&=}*7DNNmYvT-Z^3L1;Y$$mr$eUt0KWJd% z3)Z9N=ICvc0)D!>{>EW939K9JLoEFxuz6rp9Wge&!oSmv5O9>X`9gy#M$;#+l?Hr8uOWQ1+J%6x zw+g;NVZir5cO*SX&^kGEY*c4TMM3AO8z~sl7CC|;CXIIAUx0L!{eu~s^m!*0;zR`mON=sg)yPv-L+7G z_YYTSd065KEf4>v^M9=PZ=FZKHYcp}=o9`GbRPYu7yhrDM~5?yc&Pxdpe-^$hU0!l zg`}KDUX`kHf%Gbiw_rtC3p`jZGBxZ1!i0Q;e7cxcW6=`X#@e13w_;L@;+j%DpRenNMw z!xST4T_by-g+nerfmH!`p_=UPLU%^u-jUMza94B+sJz0=7%9C8Uy&Xw{f(3UnyDtt z@|j5$>B-Fu=$zn%bk;%o7+xr%jjp0n`kt5SWk`SN9-}Zbt0Fxof?m;1PX#(}q(|Q- zobVlZ#w&{6W&fvL^CIQWXOVK{vuL$aV++=RORr6+tWx>xnDQtpHEOYjfx*+lfOPl` zmSaT`x?bpOqw*LlJysOOv8M z+8!Q3*Hc-$S=erl+-^nDETzH%<#DmEZZUf@Qr&m4?x#xKFO>?bmB&(F-7V}Xj_Ouo z-D65!QK@iFd90Nmg`(YrVhn8RyNmcpBrE%WGBs$#7qO8(0u7HO%Fy?-BC)-=NMFC6lM9^oTYpj zi!SFPJgVg`%AhY7D2=}EtMsYz@=N7$i?6&=d3j8Eto4-x6w*th@@Q5b6O_j!Q za;<=NU(#NX@{L{e6JN=&PG9s>X=6d&Hur2QAU~bP_hX}^e2M;bqQzqD?C3~OQGp^i zUD1hJ?T^=QKSHnl1iAf=O8cH$3e?9Viajm;e`7?q`+(8%JN13QbI;cV-3L4@zw^5f zSWa*C?gM_}R~&R7aIs%;(0#y(e#Jre0sH$EhjSm04T4&YRE^Rbk$#kB<715s9G|~~ z#^+xVf#dT;gl~Mfg6kq6EijG|SFqIpM^PA-sL$l%IL^UA>tkhu24m%cST(NsW=J6n z6S;%<4CtY3ltwSC!KnG|?ZPGRZd{-6ny1%J4;+H9Nr^#k6B8HGPE9-(*E?>4Gfywd z>x>l`sla*C3dF==r~WQ2pj7Qu3y`#q_G}7}?p6%1E9te+Ze#h)|HAPZlB0}I^SQy}Q=O+VKF9Kc z$7g$<*7&T-YjAvC%WH6aX5}?FK2PK|I6mF;{vQ}0=gZ3YOq>%uK11ecj8F2M;PL4^ zM{9g;oYUa=T$EhI(6wttoYKY$qpFQD#Y!84{n}W8)}&Mg?*&R#rc73< z+~HrPR8{3mRh1F>N|m!S{i@(WGWUuFQq5$!N;kOq$8;gA#xS{A{B|bUQJ+%-j&`gG z(Ae;D-|O_!>19ohQ9+TRf@0rX1&S;`DDG=>iPYaNE# z!BfEb;7K6r-a%!hO4+BrvS=z>td!;W$~-f$EJG(%WQu+n?T=rMEZ$H>sSm_s&ooctQE5;e7h` zL8)N=UJ;-A79GQ*tKu=wJ>;`ju2bSUIlBM36RZ6-$O?zNua&^@X>sH7n))>1Twu=j zw?KW9^R+{AdNQOauO3x|_&R0=E`6@7>%U@Pui?9d7^bKb? 
z1fC@pFTX~I?&GYTvVlQFw>byct)u8ibnd)@&b`y4;_J)z`TBqg8?H5BuN0qO6NaNZ zAN+jzXCqiisZvi4`BssyxB>Hz`D*5i^yeHE)Jg@;d*Ki!=cUqmz%}`AF9^52}XBfN@$l8l}7hWUaU3@GAuHxgj@f$8Tu~~lS7L|Z0#wJaF zS5LSO4C8hdj$D(s6~qav44ssxniAN$?;?ux-T*&%Q{<&k6*esF;+E;KVX-*^BBJm; zvQ05gfk_Rn;C?){_gxZRi25!WB^Kkyj)&j~Z4)>`i|38;15bo>4^M8#k_~*n7aUpq z54?8|W@d}K1l*ls$D*2g&u(DL?p}RIMzdi&!3O%-andpFua6OdV7ZB%a+H;Z>*eO@ z_(EB4&GAL_?f{#;^VNvkLxW}z24S+t-0PzxwPK0X9nFj&OaS%bCK)Zg=bdLlW&7tqg(j+omSyI zzo9PLy5Vnza}BrjWxR3QhAWtj50Az0?kAWEu?{Nsc+ALtF%{!;__g6gG#{~(y^M7} zcJ@;^XN^A(U|942 zj^qj;h7Nmb!WH~PPZ3wJn3inDb;9U^!aJqJ31i`Qw)=@# ziU%WAbYxpCl~~WQ)#}QqwF*Wn-m!SC6cCw!^dy50mYB-@dw3KV}ZJzq$bfX z=YGQM{cJ)p`U&Y_tPb7I?QvF_7K|)|K#sH=J)e>YqZgkv%C(PMHYw3Y2?0|Pu z&%uo{ylN|4^z^~MJ3W8ElAs0|&fd>7&Y!eB8{gph#KW1&mfyK)q0c97pVqkZiLab# zxc^bA{&NER_id35off!7nmSFLU-*A+fARFv|JMHEKSqbOzu43rw7)og>i@NPS3diK zNBaV9@;FRMT)`l8->VIb1g_wWzeA9jA}<6v4Lb%o4L1cDA!CD_hIN36YYh1bfjvXy zTr@idqS@iVvbv|5t&9s}JclPh(vw}$Ws1+PIL3?=yHF?I zF@@ZH{{g!u31aga!MM5G$|R>*c0P8%HpqxMm+doRSQ{tp!fEl>Y-r<*6e$I>SGi^- zF{rUA6YtLQwX}D*($dGOme9%(U&)r{_A$87!z5j#)S!{sVN!!<|d7A<{o2)B^pQk>ZEvjDDbPv$gfH#;pDIhUTQE&`~n+1 z12;E?Y__Bdzr2dJB(G^p%KR%J_+0du<4j?7l^l~4Q9CFn%^V|nH|{q}*L~Qe!->`7 z8Ju=WA!c-Rk$!c>Vb zqQ5#4uW`0y9Tl!bZK{E@CDTVMpcGBiWAsxdk~s7+dLWr+*hH=PlRkPS{MiV4wVcoZ zy{;Rg0Qq}X5WRL~Y0+yqQd=eW{es6coBuAVW|DdGyg$M!}Wfbj206_)sjxt***gD10HBxwJLA1r6LlI{? zLt>4%AE;@mcJMr2+Y)M!-w&g`h*~x!`n$VyvNKU=bw^#YVq18B8gG&MBTIOKnEKJX zp_s4O40?oTgtYI2_u72dm6_=E} zB)Tu<9&h5nX|M)PZD7J=402R~0sG=mr~dPt6dRCt7bYVNCo2j~N@za_11L@u*bH_L zF!JNhLDXGdXz8@1aI2zBi~AbcI!K~eem$k$r^BXJc~l~^Zdc-nW}X|FGU`d$uFbE- zcztb65WP>>Xm#bat>uNhHnqHaP%H)O+Y`Z*xb6AJYT?`4+26#i!mj)cZ0w_YS~f7X zt9x#hGQV9dJu`7E==Y~lPE%p2Lu3H9HGLYnc#zR?SlF1teV5v{Zi0~7rVc_YfZfGF zKnPM?sco;af~($d#NMNV;v1pf6ytgohARKm-T9YWaYfIdXR7r=3&ael<;S&sk&oC` z|GRXrb(nF@*y)VS>BgrgI>t>99&KBmexrjI+QZ=oI0aFj-d<9t%fNmZW`wemvE#?* zj-H+~b}dxqg;~am?bbe^G zu{=E%KTGtxpQ!$Ry5{#08s9r7!uJz$w7%a{7ySM72EKn!<9mRIQD~gj_s<4>pR#(E z0|$^*%bbuZq_>57O|f2kHkbxauR5O|j_v0-asi-;&@TI?WU2!kKglt0B58s!E4DoO zW`ifD&PUIQQyuVq&U!~K0W(1Wr4pw|3mgLWedRSdz-T(0V0CwGVU0FOUV!7nL8`Rt zhf=s}#V_%=;eB*`jeN~`@IS~OFzOk9e}FubTjgJF!xhb^ne2Tya3<3{o(9k{kRlO{ zM;Xbk2NIM?LemkqDP&?T$xDB^6#hI=4MW5u>BC_6B;UqnL0fSSPG*A<2W#5aV(RO3cm4U?PO2S%5A8hu?>~4A3IeV zeqm-1wTBGCIgY_wM!$gb{Om)%U@X^$;5>ioPn_pRD`}oz8ta?q*>BQ3cZ?_h0P*Ym z^)SohxuV@s@|0)O+}0&<*`E$t?$?Aey>e`m+tr4^dG7Jy(WD+>3*+G|0`E?zt?R4N8RtW>Hk*H_b3Omr?YvR z8KBR3re2YCf`t@$SWT_ioEjcpNLofILgN>la~{(dKK(LW7;E&bvtMdkLcb~p#w;X3?sUx26G1+Ky-Q}#Z=6^w{pddad#ZF)! zh_5`sOjyft=Rs!lj95*t3*VO~;MaX3__GPfq{&1f5?PU~Wd__Dj>`FK1 z&oZ`7;a=I4_>0%%>mSo-i4)KXk&K^kuaxGOMWi&}1O=@J)zn+ca(_;x$bDtSLk7W? 
z$~NBy{tUDD1l^XObR^!OY=BB^zSy&3pkF>T-i0UqA%%V0b4*`%p^qV7ygt7+f_t^J zO#aM^!3`N?^qeB_g!`<| z*VLzCL?ZE_ku~-CbB(PXc4pG@E@nns&NKL3ZFtw?=KR^LpbPJQEXLbjj_=Z(&)>+J zx0>^xV}ni03cbK2e5f4ZKSlE=?5>y+k5Rz0`Af242C(ZwdP8^5RCCM z(z9=d#{SDSX;(ptClVZ+J{=FN!zivEX`nzOyl_dJIe>VQ%fMUGVrY}b=FT%2G4Pia zt3{c-ZDD`OJ<1fpTgt4$VJ;s7hqC(JpuP+Da9mv~G44tnI(pDG1cd$8T_c?*>e15# z=RZ)7Of+3}R8;Tt77&n-?nX))q+3J*3F!uxkVaBUU{?tN>69)3=}w7-rKAz1B$kwh zrEBAN{d~{wAI@RVv-8e8^US-u_uiQ|W=*a{50BknRl!3>e8UQ^I1=p2V?1WO8=Dzh zJ(08gyH1j#qmrU=Ny9!o5=%v~@yXGRTZWhh2Qll=uPUrH4&HCr;!{7hJD$I+Xmc$( zTf3(wdVF-DJLwd5jW_T2ER*{0y99a3jjsdXu-&pV{XwGvse3{GwGN2G)HRB{StMbU zvSZ_Kxsid(10N59Q`#2&f4BW^ao>J_y(Y)r@yW>yZSo}G=6yPwMz&+c((*aVlS z*HNsZR2IZ}bYVHVrgI~B%X{9~u4@~@eAi4AN_#PBcO60K!M*7~mn*FnhXm=Oz|j|u zcuZf`&${NVtF95Qr|+6JGi`H2dOXI~zLW7Ny;fn=Mmx*rE*mD{d0}qJSRu-t2^agQ zdxswHLXD$VZuEbxOg@h<w<#}}14mYY|{G{;fpO~GNZY=Eh4(*+mPY=Jgdb{l5fA8p5eiM|9=%85WCWSGF zjP?fbHlQ}myw!sa?-jNgc&akzpNVL|v zj3P(Yo5jn-eU|?SkN?}rHM-MjO)Y7|pzDS2)^GR9fJz@!%-61TcBZ9AZmCRf=j~Qi z^nvDGvEWQ<&vn~)uF9+Tyu!~(riYYwIa7*<)7;2-bW;4LPEuo-bOjiOuFdRwRK?VDn z!xJOpQF=ATK~XoDeYU6Z*seCyP=uZQRIW>O{gZbal>sq3ZMZr_ma1lN?F7Dhcob<0 z-AD%dUaqyz zFBl~ze3mur=THyi!Z&_zx38wU(DLU<-;3^qk{(dX?JPAog{2rn0!dZWha=QC$JccZ zO17g5I4t3d4@zvk(`{Z`~QO*-xnjD37y zuPZC_#@i*R?hI`H)5z9aQXiYEx-L5EXJ)URZ|mV$ncV|g12=A!* zQJg3v_c3m*?@FaBU#I00@V4m&;!-3gzgC-8grI#syhDri!7&FhtZtMehcFr~D%=BU zyk^1T{D%g~PaYk;QhlV@A;|(GI95FeUyZ`_@Iep0J*JS^zi{)`?$W?Yw=U1os(T}1 z>WLZWp{ya{V|Qs-Ta3Wp3UAV9Bvjs}@w(!y6Png|nZVjGKq9)KQ0{&IXDYEO$;pU; z#6_PP;%H2xAau+kPoPmhE0qi7Jn04h+I{nAo76h%VPj$cTato((a$qPAH+0k?)$x$ zC#Z5qTzR;JK$wF)rjX$0pH5Naw_v;BY6Qx?l(+27h67W=p)}ct2Kh`cc}_6Zvdh;a zYx%`Rh{)jOEJG|-ij&C5)P&F_Yl5rWAamlUP~dt&pXaJ>@kK@XmrpJ!8RB{hLVk2# z42|BHFnsCunQs&MFzPqRsQN=AV&!q}s&ncQD_vQ_Cjz*df)S!Cj~XwSb&UHO&oCSd zq(snNa>;hhMkclK)X7zZrIT+XzN9cZ41}^NJnwv>4E&fg zRDxvJqL(Xf^M>Dn%`Cr}KuKpIfF*nTsx}R-=PL69|Xk zR6|_%t%EO$S|7!q`Tp7fg`@zX0+f&?xBKUNSB1Cxk~e$s{Ya=GB>pu@y#i9QWCBfL zMbB{}FZMiw@9!tv4Vpk3S<%4TH!cw0Zf^*YS2t_IGob&L_h?@*!GQl+^EuyB(?{(p z-R>!^Mq-Q>&RjD;<-sejZsno)9bdCWocU$jXOU0y;0h?0&4LzfgO`6vn{HFF8dt!h1O);O6VwVxRd1m&CMB$hh(jUcbLN>4+D_BDD`zwK zNyqW@6M{uetJ)^oDpcp6$fzJrg&)wuMfk5jsIHlZcZ|^HLiAD6ZT6oiv{3BKK`rIQ z+)c-)TZ^=3sF{mjN5>x~Zy^?=WoeR0wZ^CfelrnGIn?VIi%24Rlpwp$l@WrRM9ZG5?9I+NrVoE3Wo$ppobp)IQc|RDfkJW7U>C3p8kCHI<>&v4>Zp453 z8sg3jDB4BwRBNZ9Wb9Q7@UqNQ-cu@|lNi)VeLEr1rvZMeLed@#B3}~XE6T6ziH!LR zB@%Sc5{g}JSh>JHar%Uuq7Gu*^4>VP_*5_f1Y2B5@ptttZmYy7JvxM0i9#8?IZ5iA^TQeL4)S4*l_X z;d$w%p4WJodRa5Sk)m3&&I$MQdpL-Z^-a-1Fj-yus+ivF#<9UZM+qDxPxstWXw{`I zz=w7?h&=b4n$SESb;g<0n^Lq^Ah7dtqfUXVhsF5B(G;!)JLDKqQq)LM!eo>>dt|@Y z-(JW%oU^{WTPSKDAzIAnd^31|VOc9tiM-GM`>?9T+iERaqR!04k?enBh#O3Bc zL#H7(sy6Zm&tw=$C7GyVhP8;EBP;B*yMom3k9!Z69)?Cer)`W9;{-Fy7uG818XQYe zTmEb`xf>q=sgf0ky}#Qx_=r3sov)*PUMi<7#2?tQ>^QSl05>gLs7$y&CUvTmNU1z; zLn%3*sW;+uN}2a6>Im`h5qs@Z`jHya#N(&0qx>}kUP^}`b6H=FbLRY2D#}dwJp{Sb zyo&U{wYk0WoJqW^uC^kT64_)-Df>l}aR1Lx19;>XCP7WKYY|NJ*?;^qdB=o_Am3G@ zI>Q@=*B`}yf9Za}va)!@gXTP|+&`)hsIlTW((w54%?|Rjedn6_}qTS?ds-++483ncppE+k>Hjz zi0^KkeNYUXu@WG(zt9u#I>7W4a(T|EGYiUEV%=bMhP^t~OTPb1@w94T``f0?(-a#p ze1`rf`mY@#p_&6TUv9YO+IDYWsuU&n_4!4d^VI~}0JAdT(wa$XYtm$2&06v(k{sC> zbR8Et0(!xOsim^v>3gtUBI2(}&>nqQvL4upv=!d6nh?|y6R7oPu*WXS-V?p_{&KD9 z5{4d6vlq{M}#Dr>D22>B-j{E;maTuD$AbnLBaJ68u`J z6J3cviZVd_wkAXGEV4a-zhft>F~>f#h>XTqD|k@ej=d-+;41=Yzn&yz{U+QT@FTIlu?8NVKQ zS#jq2{`BCJa_?nNlN+z#YgfG#41B(9bH|gGc zCzib41m|-z@$pkXjk44|m?cl*2a|H9D;X~^qolA@eL3BEe55Rf3be6I&Yd9i*5DlL zpsTfeLC+#FN~J>>_)GUF439bzV*?5NbadX|^|AZBn{&42$yscR5|Wf;FFqk)MCk3q zo|^R~;&Em3)Dju>1P!+C*%`-#mNnZc0AJ*4BZVDSV%T*{pctT&9);y3?*|x(}pSK5p 
[GIT binary patch data elided]
z0lhYB4dPPG-mI;`JnB;lhbZ&9cPc4ZGc_~Pm-Qn67TtSM|IMWZpG-Nmq_%q}4Zd+1 zd;@c@H2ioe*40|G&+TfQw6>_WbCRABJ_f_N*t<#9cT%3Y(BMNjJ~zMXrgyP~Xy6V~ zUXHNY;c>50WuCpHHVr<}9KOQpKL`DlVZPLQCE&uS%hx*g2bjKnUNKs?Q< z?Z3wOU!c`a&Gn6{uUGstX&E;yxYWi7Irm$1`&8arq55`Ssy?CSE>-Po#c>oSe$c5r zv>c-tFw!M-Cx)8eK@LMpfcMl~TZ^tKy4KTAndb#?9SL(PjRDU!$q{}V-%0XQ`Fzqg zpndkj+B|sCyP7$uV5S zmoSbZ6!!zoLjOr(sPZIUp%3-WR)aNF`l#&Q#J1Ir%C4YzFS8ai2K)raWS6^xWeyCl zaqFwxxeb8*rktE9j&=cpMTbJ&Nx5o6B4c zp3X%e&OYRzFo{XoIm+S@*%cgKj2bnW|4`(7h69*fE|@b>n8NJDI3F2H ztm11pWo1S^C9A|};-0A#OG~4V`mQVq`J;6b7aLU67O2Q_WaPk1j)ur6bq? zKv;kiZf<5NHU}Tjs4#IP@o}nrRE}goRf81;=z>ED1cDX7>#YV#VVCg9zPrI{wD^}O zi_gN;dCI&lYVZU&Y5i;6*(Zy=Uoc53IvC{y8C%3?3XmgMo;b&2fB=(?hUrm0;DN??qk zQEV;%#$1L~mn(s9k?t+_WLE^S?0ikN_~uijCHgWYaIW;OEk{T$ce9Q`?aY;o?*=Fx zUyUAB*L@*31CI~+@7CaH%7SB5*Mt4@=MpYmUU99lw=+J%YFtS-;S*Sa=@L zBif|!-DZAu2|_SAjBMU4#(d5JMkbh}+bAFEugIX%Bs6r_3*t!#a+C1!7XDyQvwB){ z_!5(!z=C!=zOTNRimO14@q6lfKBkE}P##Qla#loeXNZ%5{1z`|xxyqx150lSlq zJT3y%5vTShGi*zy*^>nVI|a{4UK^ITP?^`0S?!W?J21dTdb!btL#|l`#2ReZL2gK2 zi~Y#??`kdA?Zw+Q`m_JaC|s7T&>2W;Ww^`HIM@dtK~^=v&)yyt2OFX>eW@&T*?BY? z(Z<8c>@tf!NUJfrGeZrAbn+947E8d(fH%jZkes;Agnjq8QB?F?s3C zFdtb5soc1`Q{JtJFRF|$VOG{$VYq9P+JLHfv?JVg$h+0?g@?ONdAG(A2kFnIH3=Go z(y}Y?EK5GCP2MFLaf#3BTngMRl-o86c!@>*$|B-d6%OwDLtZrqIS;~3*MFLR}2X{ zR)zx+*=Rc9J{Ea=&GU$8 z`2U$BY2B5i{yguOi# z;cR=`d~OJT;=CQ?n{krVIv}vx*-oN_uciFhtQhF61coEu$I31kD(fh)iwJt1g(f=q z*D3IedO?ssH{26ToZYv`#2_8a;!HV+G< z&5%R%HR_tZ@Srn1P-ebOLwz3wwM}l6%G|NO}#wcwCEEjXn>#T1@}1$k3{Q&>=K zpVT`tz?8=e`{yZtX(;SpZC9RI>R;nM4H&l+WZX6v1+T7zm#f5ytD;hY>VL*&=za7r zN$x-w5f(d23r1IKq3)ln!C^HXmj>*n+Igim{U99(eQ=FuAdJQl?rPW%^))azkT-MR zcZV*wyF*vT!d*~8*wak!o@Ra5yF9UMi4wqUjLEs6Ou0CVg7`=Y<;F^nSWz4VwqQ+` zIQYKgka@>`M9Bgr4A{8~42v6DJV07^ePM)7_yuyaLYZeXgHrycXsqZO z31LOoh9AchtM5ec1K0xyR04ysbZiYC%8m1*roUYNsyGFnc6jEx=Kla)K%&1r=GssR z{Mras$zTIhJ!f7Urh4>|nd&R+|I1W&%vA3}5^4xLN5fQiV@jJu(5ai1wm9KNiaS@f zN0M91lv|YiU^3CS&pR>`?fvIrndrt3ndM#lA(P#jHca%i1SWd?+ec!eJ+j9!QjqSu zEY3vt*cNA^-6@iZHtG@K`CXeR9JeD5BV!y)WUCKz4*w_Fb~=9gj*^|0hcbzs zrf$~R;_NgY;(FP>h7U1v(_-odlYM^SjiX|p|MBu++2=#6c*}RMYDXAW0@H28J|_so zrhO6F-R(bg%s=0g$UpzS1OEAyPp$m(x1S{N&*vhkcD;Ti{`q1#jq_JA^4PQZu>7-W zFX7^-2LEhSykP5d{wlPWWd1sP$@*0WA3Zs>9X@*IH*7Dtw&=QKdr5SvBRn>>E&tq( zy(F`Zy<`i_`-U5tdmB(h_L3q88$e)HRkNCzG?~cqzj=6d4Ovbs`da<~vfXSb$&MRJ zW^YO~lweZB^0MF+jxsFiutwTRE{@wt`gfX=9X=~1p&mYDX`%Mwkea9 z#ehe&;5B&&$6Bxm`0-^j1s1{$`(+33slrk}%Di!Q_~*Cc%_R&2`!|vw1zu+DLGR^K z8)hqVOEW@K!VPC>!2x9^qJq+bL&}2`r%miSnhUaN8Kk*@sxY*KX~RK$vdhK69lNIHJjEMw;&p;`>#I!SLx&Ldih_`$KUqP|B4PB ziw^%^>*H_VV2To_zIEoaI;@{TCU$@dKPlTG<>f^&N97&#dN6HhgNqOSMH;zW0&<{!% zED|ZkIubAfd12K9cSO=e2!566TctFyUCaFjU51tSmM2IPb-+exz5^glv`FtOcj#u8 zCeT%)s2St%Ho+x1VD(K>%R9oW+ws4$>f?X$exL@gw^0fnQJ1G}g6 zRrR0LHG9iKX~5$;CufEGrI>p9kJU9Xc;G7P?^I@pfL{ojbQh z9Yh`Yf#=5e9l+Ji^pLOt<1~D@DNSL$Q**7r0MQ-dgqPU~t1Ams7Fp_6kfq$8*-T;T zOIo0Igek@U4Ru}W2>&JSfBP}Ol2u*+s8T^*xF;6=6Eu=;-hsG$I|}7nbPKe)UETVc zX|q{MPy;I+zBtCoN-4HW&0i}%e!T<9H{(@6IPTyr7U!h;g9-W8P<~lP(o5pp z0@<;49m^)MjF{|L7WWU`sQ9lx$g-6)JkG4sVeuUE#68E{rsvomaoX!0%6(UpleFPE z*2JD;2+3u05a;ahIBA7=al?vL2y_Q)%FA2_y_w`iR)_K*yrB4B>OlVc2g`qvf0~|I z+hG8=u8ziTG>}tKMR?SHi!J9DvgI6$UReX#E6Ys@iRIzae2q@7SM=9DKC&HN20uUT?g{soeQk;-e2BLkoV3&d2jDyN0Rp@JkP@) z`#d9!cV6gN-m_qbb;-w`g*=YML~1t`{)+ckl1KjvB~WjVhf}60{^vSzIHli_ffpHy zKSp=b75^o4r;FnM(19176@PYTjt51FLvo00ufhWDp3X@;;yjhCul_b+U9`-^58rfjAHE~` z!?zzw{_yWC@gi+OCRYyU#FG<-lad57YacnP;d~uT8qQa3)?wwrc>FdFWP=iT6ahaE z#PIM!N(Mv&Do9+P(|RWksOabhG%R^QTAKk~Y>i)iWEwLS>79EgeEuugo?50MM0scN z17kFA{^_Vjv*?dWqglN_c{K0tx8}F!{lf7yB<{}}UC-cw_!9?m*3k{*!sLNma3Fag 
zS!ifm_%>|H8T*a%jRRTNBVizWlg{%?4+~Vh7KQ7_(XPwsm zI{fD7hH-ZCFwSZ-jLiMk{3?kg?oNl$-(-B<@CM>*#5|a1XO4payKgGyEB_`MV?_Dw zUMc^(L$6>zW|Th0L2pOKc!2C}966A11&}R|#F+pt{*l8D0x*Lt@#X8$7?WX#64*)U z=eAS4`xwmMNvhEu#(Ig~_&bDNq5699@xuo99~?m<8?tnrZ5$JyP|t&OD&-Dou71TP z`Ryu7KTkuUWcIQQj*i*o^-_npn;99L&m$iY?Qk2@W5PsHBWc?I)HfAWg8Zi+aFn!cNN!ej33 zz2_6yt3nHVWd)Lq8Sv55S}EU52m9ueUtKsizHc+%OWrq&==GbqmThyT7~3ZLb4NVB zyXIQ9D@!tYPKu6(HOo_fhl38!hz*_Hp;+I86SNnGp{tm45+$%2q7{2_~p2T_vZTi<_|I?xW{N0h~ zXdL3I#YT@W?Y7`U@k*m{|6#Z<&UxMVc|ZRA@bAXYarpC@2PJD6XB!f^ON5Er|JG8{nkDp3<98AP6_r+pKpI7`?h%~X)Rk2u11}1iXkdKcY{)Puo zlzP8+SESgMq~rRmUY9%@#ox0{cWLc>bY~mBVf~@Dt=@0bYJNMdUTD+mPwYQpaOb|| z(-J}=YVzwv^`j(hOj7ktvP#Q&A!hjJE#hO4g~#B;PUR}IL=|e{taS2S-lkhEwpGG#2otV-AJ2a@wY3K=N=Vdgm6&d z#TatCc%+G-T+^2el}NA8KspBILyJca{zC)K5)a=G`#TIP|9+vH&Msa5$Sd;J<8A9@ zp?wSWY6&tA3LUVvo`ryI-a~A2f)e0*Mgh*i@q!eA$@Q$%AtUtz&GIH7=YW9{3%P)! zF3TSgeo@0sb`*6@er4!RcW|mueLFJU!3lus-m8>vXW{s_qBt1jdjsA>Y1c?EHdMHS zW3kvvCgVscmzM={E`c(!P#rK)Rs7?yvp~$#6-{v4iT8s?z9v{y z_B!ZAEla+jr!P6dG-O#dGS!%-sx{-qJ@cR_S~n_Q?3m<%KS&Tbj; zK=}wqtm)9??8(*LHl7NF=Nn^L@Uj(GEB?y5k9Uuv^ufgWTn2-MI!SQ)QY;4p&oTj;8t3#TamG6gcj2I62h&2O zmggDo@Ggf>p}Kzk9$N(H8yPRr^^+(UuGu0^dw`Gnsz;*M{4D)$HydwQ8#Rc+&jDlS ze+=+ze}dPm^nV}bIx!4A!3bR@EGVH0q$r`#vOuiM~PlfnnB-6XVIQ|WH3vV zl6owNSE6I#T`bW?7`7Skckq}py#lmZV_M4 z;#*gJz+BAm4{&Q?UEuCQR)F>!ns$aQRu1Q1;(Q+|G@E>xat#{k2HwdtR;7@(fO9Q1 zQ8huiZfg9pJbqabzpRX3R>d!?-z3ebPD5G31+T7-scU{z zL+3P$ZOA_xr>GR0soc7X{}kFeJ(W(wFo>M~3TZ#}N6PtE4gVrLhOZvT2+4M$WxmO< z!>wBCpfrcv9|m}x?%-sgl;w6+AIZ{>;=hS-{cbG0m%)_Y*9g~MB4U-O-zEhb%D5KKiTp0Iprfm^S3h#OxpSoo6_ z{~Neo#DZ1KmB(_uxXW*lkP+DhQmah@mh{@dA(%bxMvlcDe4eE%R&(P}-ITv6NcpG? z35&H48V%nOkfngPh;xnMFBjiW=VF~-P3OVm1kF!XIt*TgHP~GxJ)JKkd1`5?}fL6Oj@w0*gb_OF^Ht$sJji%f_*Z~~jecdC!m3X`me`CF| znr?BHt*F3P(8cq8FeOwI^Tq7r3w*7eJQu3IrFmEeYJpLT&4P`4*$ErB+LG(h4tDbQ zmL`4ASu?lI_ac8ult-jII0=y1uD=5JbrIGEMwGon`2HY;)$p?iWkJ*xV#+jLk^-IXb zH7Pq2y6j#{Vyi>)BDxp{yB@h8ZsDZFFThSjKEMu#*Ymee7*8+I(=TQacF#*hEQDPb zQS%j|JN%6Vll?al^xCQJI>*LUT6-#fbg%I3>!tX2(M6`>@1MdCJ9*AF+KSLV$hT*rGpzzW%>{qNTo-R zDx2NwD5 zhS1R)ww22yK8rB|b?w+Eo}6u&`v=cjCyaAVOPK4^3rw-+ijGE*DE9o>Ym9)Iqc-0^ z(fWONoA0M5e7`q*KRhv(oOjXEAKROkKWpi9&od@0$W>?h4^V;2pJB7ecFHUuC?YV` zT>F>bg)d^dC6P5LO1Vep8L~OYKam_)GD+IMb^b1}aQTyXr*h?^HSj3X9Qi1+AD*PNiF+6ECvhA`36_-~yQck%g+B3Y4hO?1b}Ur+>HgbXem~`SoNU3}zVDCD!;M z7@z0Pd#_0t+sWqG{^_%fP2%VCmLT^g9(g`#d@u?eqS@TgPo-<^j-!sB18&jupGg?@ zZFMH77@!+q7QCHRw81=rM=1Au)wfSk^-oYCH=zW6*fEQl_ue99Uia);nU4Jvl;{UAE?Vx3#uD>M=)aJ6f^VyeF>&s*&rFzLL4G~uk44xdCfKe200M3*<=7gE zC&Vbm&_{JGQ)XPplB383Qg>PCoGhSp`M`q^H=ji~LA-Y#ZrEg}#1o;LW3sx&p0DAJ zP4p4c%Y@Rt<>NJqGcG&G-)={2WF|R+mIX70h5DYa273aR04!X_@L*aFa58q))$F~b zEU?itMNK6EyCu?@;4M(^8AlD8N*XyhF4(n64P5~*cW=^y-C-4egtymtJ{zr!J$B^M z6q}L@(qF{Ma3CI83Yfbf&Fdsp(gv~YG5FmY^K7Q+C;`m&CbmBjHRMYmdJcdmam;k4 z^9AvJ9sv8ST9^z%R#-Cf`$O!2n1zQLI!G5EPUExN17tTb7dP?bnrN&p4j-_GUx;4! 
z5FQYhKgg{guf}hts~{#^rv`1yEbSBOe^yZPjf6c{0jM#A5$$MWFy}O;jMwx7@EJ}2 zNYlS40~*kTyo;|@p_-e(=_@m^;wBc=Q3A_s!|HL-mG}^p z<)I&k>EQrFdtZ8R{`eQ(piImInH5&q?n_$7fkX>MWUcrcawnldSKkU}Dg3lag3g=T zX8b2lWvqL%A0^P`krsr$j@G~J(vRXW%`CtyQolb6cOgu>a8Shp=0A&}_wo7z#h2?( zdB17>$JuH90o@oI(3qBr>H;GzLVcd9>R~naOLh8Q>b4hLV7JPHm>*VO37b;; zGIcbi5Vcz$k#S^S)oJ#QUXsBs3t%U;e9NT&cGwS2pB+P!V7?{ zM7Il`GT{ZbMixt8cnOi_dR=6#)jlyuy9t2i=V=Keh4XD@1P@F4oDWi=80aBD+xB~S zXWE~mF*E6p8oVSq=Z_j3;!vA1m_;7qpuhUSY=)cQ5GUL;!+_x&rDClim+AmMmvGBH zlXxQ5yfJHeajGoz5n!W9CZ#tCLEQDy)bNrzH&_hvCak6E&|E7$7iu98mU0_lqz&Oz z)z@GjU8grPuC2ER`@W;P8XE7P4c!0|egnytw*|U@p^nQ`L-WyPObjF)0%qjjZroaR zTa@x-t2WPxtY8HAxrO+)f!VS$uZ=2{Z5CyJ1D@wAvH4zn%1sebUL;%dLR3xU4VXKP z=_4z%4j3hppCvc3z}N6K-=L?qosJ@z7mm+P-<=L%p5Dikgx=9q^6VvqPD7XNKr|Qh zX1YUnsUn4}ZP!=0gFr)|x4}3~1pVB{Nx7GWY5Lv4iC8tjlPNy=0}pC7{KW%#f?MAo z`Jj&M2Yk!BO(hjZDW%&#Lp=8fmOP%M$GbhJx%FQNtiv@HeiielaRcuLjk0ROSbD(B z8h{P5c_zqNEvI>)y7rHw5g{WPhK8j)=q;k;O=Hl-z~kLXuGJ@tQ{|w0;@`kFPft~k zL_RR=9yY$~{jp71y`;!h?+jlkNoe@^tq1y_WLDhdL(mw;J z$!QwO$5Cn-6HabsJ}Eol86PTrDBzi)S)}0^mnRmWKXhnfp=dxXA1bbBDwdw7#g+1} zbaDhxF0O+AtHtN!hv&q@3NY7*j1uNGCcDZc*tsTO+{JM_s>kpg-E%*OF2H$v1s`j! zt)3E!FQEDlzCo}3NU)9|y|c&%o!Mne*hD<|o-R9mId)F|i@43{)}NP;38Kd0`<~LB zzPk{j2(=KF1!&GtgPme*PRlgWPl^5tdl2npER-Xi+Xy~NjeQ^K@Xlu@8Uy-+rL^fI zyM`aIQ9LJ)6Gkv7;g$2-d4)E%m;gaO%MACG`M2EqrZ8pX#Y0_6A^C%z4dZ;~;MBt@ zMxHLSTLWN9x!-Py{T{A8`I?ErcS}9WI1Kp`KfK1L`Q%B)?Mi@(avNU z*z|kAbIU<+rS*sA^fWeB>VWX)d%Wb4w^0+c+RBe;dJMssbi^aC()8UL0P*y#xUC6l z6jU$g*B@;{o&Tttvnut9M(5Rdyu@p$R0r~2y~LKnXR8W0dR8v?ZWeXq2skFYoP?ZE z9XCqFJlW*k8+y<1xSvLUpO(RGv)D1%67w@h9^D#p_Lk!@LLUrAZ~Tj{n!+(1@$0rm zc1^Z%-7MtloTd32O>1W!aw(C!O?mBWPr2HJCT`mj#2I2OxzhKTxC5k9Ecb{UZIy#f zsJ^8DBXDkey#3SZHX$D1ZO%(LS);NmG~aSNaHgm#L_V^@lTWwR0c5d*1M97&Dw5JV zJ{J%Iz5>m?pBwRJ!9Hcdu?|Y@C9iO=wq;sZQS-WTv0_D|38 zE8aiTdGZZ%Y$h~yFX@4!=cQ{ov_N z`6xEviiumsV8)AYygQ8MpS(!=1iAg8$-tqE;2_966q23v4B>=CX7~lrLWNi8L9KjH zC|3WKH>(OaYoHN^fhph_CSi+R-;O^uPQM1wc$OmvSpLe5N zbTJ7<<&iP5&72W0~HFRQ?kqk&sSqR9~QxSaqYNxScvZ;=r8< z{5fi{%CV#c@Q>JEhZy@OsTL_9GtrNzy1n0P2*-G6KD~i#S1N^S_zEO?%TirJ@l$;P z%=Y6B-UU?t`WzFdRI{N<7pi@or*j$7uHUC{;|jtYbw*i%3T4JMW;uapQ;c!~vL00> zHmPD2)ns7nJ7JI*A<1B2raB<~qD`j*ndXvnGp0%8n8~5D!Adw96@Z_5XA?8Az{c^F zRT#8aY3NhR;Lv3vLb$99-2>eEk7ceMnlj`IlnT3PC?vV{-DGdZIM%Y<9oqC5$%Ib0 zu%W2As6=lCS@UhT{sZ;~!sbTr8>*{!wr3cy(K%(VgKmAFW(b-Kfu z+*WORD|V{PL~RrNppK6z(dz+w=&KMHqnM-V--=>%GNf2R4nG7+Q>gt}Y1YSlvA(?xFU9%a0#A7p}(xu5)m}jSj zM@~yGE@Y;V0(Bd?lItCJGo$fWR8+Z0-d&;iU!V?A?`FUlhDgStOYy&PFosl43A{v+ z8iV?e0Fm{>U^E_&N}eAMkf&#iH?WmXutn~iMtXntH%GFBfET`>JREH3N(Unj&oTWc zZ$k@=(9jBwqKLvnqx~yNY!Lq*HnT|ufhBu>e=@2;=+l5la>R@GBaGq{a=Zvy`7t$jB}ZW^Gs#tkBC(YR)8M*6^wFEb zF}Y44*AYx9{=Z{pB(9h}=$R{FZ}R`h)&ipV9QC@Cgw1p@dJN;qf8G<75zv4OuaGyN zB<*68xaV5_?SlKFFGwk3=FN;XXMp$-d9g0Zf9weQt+iJD7UeQT{ni(n{xh(j;+D~d zimOSgxR#nKu2xoYjkG1HxB$tO1t}>3D!Fzm{tcw$TJ9zcLrSjBWx?^8C>#$%T@>I4 zw`;#ezcq#QTRGENzva%|-?4t{XXwd^QuI8xD?3~9k0Uj}sBGBT4H%OjM`fIROst(| zAf8q&cb!B$Q!^#fQ69X&juH?ep4u~sl^ID~P@&pXO5Q0=rL4?pDrFUnJ~9hk7qI^d zv?NX-Dal%x}T>Ot?5TPkD zS4}(?o0MJRj#JDHa_6s7gd;{gWU(tpxuu!u5+s8R@o$y%EHS)+#GqS&Z4S+tMss)^ zrvgt4Q{%m;mg`F6I2dMOLNtjH#eWd9XitDt(}(ZoHTPz1(w@|mg|amlPb&Y>a_SbYvebl@i*7WA=bv< ze8SRtj%+=bTRUXy*_O`dTbi#-BP9VWPBVt~(4M5j8#DLwS0< zhgSJPTRMHPLIQe~*<@CBO*%Kpk$>~JX|4RgGgw!Rg(#Qt` zVgCudy{LlOJVJK`&|oDu5WoF9wMY2DpD+AAN*?6Z5~hIm3)_QCJt^H{jq3VX@n1sV zJrua8)I>$Nfe+CH)4d7x;YGo|b5+;MB!&0}mfTj{!c0=0tP8b}zj;XXCLPDZ<~06! 
zmrg%d$)DSQ7p1%&G_^H-ryNyAuZv$U$FIvmWuamaS{tx!rsA)Z!cK)7utak@ zvl4}fTkGYIEQ3D;N>K%LzA6h7Zb$*kq2}&D0kWYrok+)s40!2XX}0bJ>#fW<(E$Uv z1w9GaPC^~2dDxli3KI@i{Au_eYQZDt$vl^sJLvOt;zWAZUgdP_H#;a@5Qyt&i2^Qi zOt!;w8!C3Ta!uh>chCzA;H^MBe{$QuLumn+l;3xv-Y7v)#rPL-Qwqs|Da!9O8O&7R zFP)Ti1`X=sqUo$s0sb--bKSuF2Vmk+|MlrMBZJW*S|E22CjwvGsoI77;<0w4QK&Lr zH_yR*yYHZas-a3{Mm{lI2Vt5MnBmH(_45t{jm3X8dnNrI^vkEG+ZxkyfCCx{Y?Rj_ zMv%d@-@Z#zYztCor^(`Acl(k0-!;bSEyGUpb{_ATtliOhY1e=lL`h!S`OQvt6yVA%CY zb(OOZ?me(C;XL}_U;zgB8#aIJK-{i608MX3;%z0+3}6S;+hnuy_b546gyJ+XEpf>EJU_Oq)nmgxspnc3`Aq$Uac{TSgztK z*!Ei)8&$Q%MuskC{U3NAs!VKzwb5Zya9si5a#v_J44|$ks>3%);T3ErG znG|8G7T^Uru#-Zj|E)5N;tLCZCa`Av@?J3)!-#dX)B8afhL z4=Z*X2#{i*L^*l*DvA;A&}`zQp@(~o9s-sgPBeS4o^x12I_mqcVHfukD%%`v#s3%@ z5x#^L;Ri{?U%`v~qV#M~Zj?N#JOpC>6$JHoKE#z8U6Ug4B48&Y%WyxQ#(!qw>Uyx0pFUUYC23bzpbl;%oSD#LdZuXm4W3=z5 z@yY+qG59%*ZZAMb84wdZ15iXLL(8yLMxb++EFUC0%xr+Joakd>c*h(=`3Fj!sEL%f zo+v?h9@O$any)clTV$Yb6cGKDjI16b%smRVvKz!Ogpv@FnBYhZmdT@Hzmk~_mS(ww zDQeIu9wA+8DQv~lq$bGUO=%FViUQsYlJqTl1S4F?9h^H&a{Eo33Cw6wXStqZ(K=K~ ztpmmR0DW_Vs8`R5iWW<=CuFnP)XXp>vxvk^)Zth(0PB)<7%fsXB|SsPH&^N~ zcE)uWHbce^y=mw$oKjFGl|#s9C_b#pgDi_=UMQQZ+&q(24SAN<1(w!UB>}Wnh z+KkA*LP!mZZ|6c|a}5TcozwrM?2~b7;aOsv`(kKGI^j zZnb={R`yoMALR9(m5uqMuxn=qKb|j})pN62xxhl+-kBah2dGwox5UUQi=JRJ9;6(+ zy?Cfa*5547f7C>&_;W5{TWJVnFa(iQbe_nVoNh}?i|X2|_+8YlUCeg0dXEpUN*KX; zev7tavzS_AorSOs<$=&te$ZLlv7ZL9FiW4=>p_^O8IUgTybmlMv$#PspZ*dlGRFt&Jrfw4s@mf%xo_@7Gd zakH@BY2h9>iwo|EbB~SVEZpN}(F=Ao+(hv@K&oSFafZtMmjh=J?qT>xh2A2qbW^-G zl~o&U)iGls5j2D5n0Ww=w14a;$?*YVDRHoF)MCw)}LPP3mt-=_1e5o#gBLkm`CIaCy^Z(O4Z)FlHu$d19uYMO@Ax_!cJ- zMG-m3DtC9#TR@_36KlkcndKPQf!(Bjl|PMKgm6UjjubeTQ~-sg0bJ8d_5{*YbN`f-Wa{rOF7o#ABfY7#I4W>=AXune=PWp#kRsZ9*sf z@Xb7a;6tIQP2A4Sp0PAT4tps#BL$N?_(D2DB1w6EG{;W^Nejd}4C5)njK#&oZ%*am z{%(@gqgb6*e1m2-AVbxKdDJ(@aT{lX6Bg!22y8LpsYH>wm0W9LMdOEo)m@&NzErK9BEJyqLFhWXX=mHTj^gIbMce+v1c#8-# z8glz-!SN)5q|AuM%7S-e1iyF;)~Zb34k8FvhT0%T-wlg!1?Q*QyNxQui6z_^Takqi|;o|hsA zjF5pg;R2gw{)K4F;w3-WE$y*!&#?>@p)Y-*5$Im1!IQB9ld+ z%QS_bVM2)H;4?AyQk?GaLX1N}x2P?eUN24>%U0(+QmX;|)L9pvM~4}mnluztQ&8@G zHM^2jYF~Sb3~&3hE2C{3*L#Uuhx351$G>o3k0-}&>gszlf{FfR~hKGSbPx`B#mh7+E zU5&E8s`0u=QE^kACW#(LoP!66WOaFlej50F+2Wqapx!cbm|u|t#3<@G}?=Q7W3oXa6YaqO=$dj~H}(Lypu zH6c@PCSpH#=&l^GXcT_%`T`@$r^Fb^xT}qk#G`yp65EFGNHz`OD4!k_<->l+qM2{P zxk6*ZFQR@3pR$}Kj_GExY-{Lmd`@nw8yh;2f)`p+l0_ zsoxL_R-8t?j>)c0=<Nb5U z{{EO&^aa`P)7*>V??=`CB9Y~M#?Wu}f`hTx(P0{QXi{OS*03e5%)UeNyTQAhINlP9 z=feDxw4WS*MVk%NJ4u`(dAuH_5>LXu$^G0RY^4%UGG0*p&ynqISzhEhnRR2kT<|jh zql)~2Sbxx@m!BcM+(mJ#+tuk?*Y9s1i&N&89B<<_&a zQM0Awf&kzsKtZ>jm3~BS*As0e!7N9-gx=VpQiq%Kem#ZTJz5!?mW{#PiFSNwR)DEO zu|7~ze)L4os6dTWR?W5ujPotc$tfxn$YNzlyG1Ch;-(>bg#x@DPt)l{MV)j|XjgpG zWlDGHRb`ds#JED7b%1qwk4jG_r$vXCX}Qj`crwj39Dt<0C5Mya&F$D1%G3f{ouA+p zL!nn0*8&=jP%=TS)%6KpFua&*EFUr4pbTdxQWSA>!;9$&R@lw4w6yv=;cm%}=UO^m z*@4p&G@p~?7S)0O)4$>9l;k-D-!$}i?fE~oQD+7W-Ceb%^IA*uCrmG<*=bUL$Eusq zd#rqVfwZ0$y&-8^ExNmv@`KI%LBorwz@jWvrGn1t@B~e3 zGrgFeCVivWeIXl4obZeXvR?dtIyGf9uwMKyh}Hb%YUpjg_mw>Hnd0Kxn58a9LsA77 zu!zTTmAMXJC`+~C&%iwNp%K~D`Sn;Xtb2BNiOs0%g{b>a*gE)6Jd$N+|C1sjR>>GA zwW(M}NESO8pam)ckj9^CqT)+|e^q7(2j#UA4L6e3M6-XW>c^=1a1d|rbZWr|nDd)Q zA&mM8E%z7IwbOH)s(%bfOiaq7m)!O+zJY5}gSnD0lBmqJ1?|~2o&j{*EEeHfh8`nI zz!hWhM+-exD;MohfNYWuc-8y9#=768gkQjHXuoZ+Y1m$k;5msD+Ig1v zfkb)jn9JIe*B%|n@>*zMJMvo1z{ARGnG^0OFA^!IOp@|ip5i}fV`$0y56Ek?ZE<;R zXjxl%ZM;}BfaSGS19(&JG3B*)21t4B>$dXRQv+CDdxUgrvxp4>k*z!PMF{*#U>B>* z=cKo-!;K%T;O@OM6XdtSQhsydy5z*=w;VNDep5S?-^NJ!Z4=0EAGVd>DD#1Ex#J+} z#%N%Y-IME_*@ZE*i1$Zz>o zalxZU|No5qc73~Y$vZAyi`QJq$Mr9+cICGRTs(HC>_16<8%Oe+8oDz_9JmaT)WiaG 
zX0V*I9pX*ZIA_0;=M3eyb6tGSPIt9)&YUjeoT0}{MA+!}wj;=IFJBs8#*Ye)QhrOn zxV?3nS-|Twqab;myah*;-|qcGdtDBcUG{C$TA*-_Bw9rjMHDBjqQb>Q z@G6QX5soohr|Ya-XTS~2r)(l%9)@a>HKIe=i#o^>>R!<4m_?mDqCO-b8$v`2I=? z8+79P8)INZuss8GSga>X{H}{jVsBm0} zg7f|H-*kAu++uN3MZ8I*JhaEs_doo7jV3NKH}ONCWD^u*UWO+=FbSI4Px1T~kd*wV zb7-I>C0Du|l&~XS6F;~$U>`6(6bh^P{Q`))`S>buEpflJrO`Yh@MYKue=T-c<&R|P^c%JFY zdfgf$X)}}JbJW1AC`+Gn(rBiVxpe*O8q;r`tFC0uTrYN+e&;1WQB1{2^PZ4l-ZPH- z)EVsJWX-9juXU@F68>6eu&0|EZ|GIT4ECIiCJbb}+6QFr^C#9g4eW??2HR|nqx=6f zF!`A&JJ5}F8rb#d3^wxLqF3SGq>0Hv-1q2vB%XD%yP?xC9yutillLZVRSt@W&t(RC zoM9@n*AS_#xr&tu4xP6yH71!aImUeH?U~P^$em6>?pN|Vu@UVj_A8}*3-$XCue_g;g2HtkwuAp z;76X$p5_BzI&+JRYRp1MD0zD}UV=OjpW)RWkDi^y(>c0JqW7VqxcG zw*NVgFV1I2NrxBW9q;%n-f>uyWaD(%*wphH(k|oA`#Zd2PVc5#JzKVVMx$08$yUw0 zkKfWo=OLNDiQI&N z68v7eZ*i2%2jOt;M?VQrRxP$l(`(?K&U% zx*Dvo`8;(~H1rg*dD8!!Bl*al%^LBMBhTU4k2q(q;jxs$M>_1`r<)TWxwZ*D5<7HohdLi= z58Xr&H-MKMm|&-wVg%7R9SP_Gk-VO2+y^AkvEUiM5DopJ{FHkzcNf_=#VX;cgkbQT zr71aEp~NVK5~CFUm7wkSOcu0__9g{wKA4Hm=GE@}OG#V1)02`mpAG$fNNN@=ccR1$ zo#fG949om^#@oo}tibT>W|

    +(MfPDRdK$O=gK3)( z-$He-Y(&Ji!-VR-e%plVHeUDVs7|Jw)r}=R^?qn{qLT#%Qry;&0P&#p{=zcaM!PCz+^j`Z6J*Mq!L+`aqh50l?=|uc3vtVN2i3Afa!+(b5 z65nJuluNvy{TFhH7p_V&)YG#6XXFy)7c_Mhbj@Zcbjogc6=Y^NDVNxt^*7`aUX)AR zOsH)_9)aA<$DWp_t+rr=c9gW81< z>&y)lk&I!slrh*t96DIGwdy)kTerxz*0OlyN}`+lG{4>ukL*8<#3Q5LC-KOdS6Mu= zn|4QeY?ko(>3J-&y`YOL8HZS4zqvbPTP1TI^ctMyIaPOSH%5v zGAv}s9)aufd^CWkBq))4;lFlDI^2)ecddSSu)}__Qat`dbyA+(07yW$zdYee%{Soh zIr)^};j}>WefEjX>}{K;3`vP(KJ>0R9>3N%<$c5X{MGZ{=CPUHTJzY3E)DW?!Fvz9=ky8mjhNqs z&h41kBxs6WL>_?E>+{}=(2l3hf70|pX9Gq>|L<^V;3SM)FvjpF$SoaXn%(#kQ~ldg z>!`5zn7c>ZmlNuB?%L-hG~!@d0=89+CugJ3IZxPSD_ zGMpC1Y+hqwdEjst<`6D_wmaqyeiiTnrTmZ zd!E>=$Aiwi;nkG=qqot{XrtyWXrp7I$U(d6)UM_&m{>)HnO)_0IH!F17;09 zH~iO?TX!|rV(*)cx~oG6RYCYVtWSw z$>kTqxJuZT?AWN03nx+c2jp;4(wc}X2eO`mRuKQAYmvqP_y)H@ zCx^2pD5b-9*)QpEAL0-h)FNG_hg#yX%lu~vUZo>p4{pNzEWnZ71hiM53D0sZC>xQd z5*eJi9rmZat&d_5+S_7a0}K+62@r=U==lM>Z5|&OOg1y52+9;FlZR-?Xm~Dy=OXz$ z4xUHD^Jw{8tVGf=ogM~RFlCx3g#X3@>nj4Sk^##Xix=A)iKovyoo15%GS1{aCx<7* zrE>T)5uXsP-12lLiY`BcXodKXjq)d&N??fM zS?)hMUTDy#!?RS+a3#=zPTM9iV(i|8-hlsi$|5&?UMXLNMOpqfN<>i<&q{mf8j?9p zq?i`6vFmrcsFH({tP$$-BI@*Vc5;Pc?HUB@ zL3EL+Lc$(4ChQ9gt{4<9pqZ7(a6nxDlK7%!k^obJL$X|ev}~5Z7!2m%ve-lCSxNAb zz!A_`pi0M7npj@}0$`(M2xI04iqs*FblS`L2iR}oWmt$?qqq=HiIPE-vrO7YrH(3c zUyi6g6}Qt6K=~+`01`_ZnQw%x2F64i0stDBN75snL7p*OI|X_-H)u}m5E#Wt9Sj+j zKD>o6yrqxf@V*eWa75!D5vDy+qpJZ+DJ6=2W{f!`z=o1FXdKP=Nw*Mdy- zEbZSq5oBTh@nxv2F20l@T||uBwWu@#TrIj~a`C+da$MRTjHyKan*dgslD5Le&RB2s z;A><=7S|dAf5l1AaxBXnf9n7G`t`iK!}SBC8J(CV zlidMrvGY+g@hY=EeA%8ZioB+%>C8``Bx=NEP$TIknz0aQvcL$U88fw~=|h|8!*0fh zo$E;FF=3@V>$On7xMzPuX0>|ve)#l>$l{<)3DZ-W0FS`+Z^d-9*a z|NN3afq(f6Fs6Py`W8<$f`9VEm+`|>@k6v$1{Pv$tgPMML2 z`{%^WDmHGxQ;+xTJQ9%up`Pzj`epiG#Bc`Jaby)@Y4<)tEbVts7%c6zW|o$C*TyWZ zqNn~umiA)=9#qTve~zWi_!qOTKMhP}X|D)z`8NZJrTrtwEbW4)nS~{`_5(s6edbT^ zZV~NIVj}q^i_X%vVU~9GnM}{6@U!ZF!p~l7=4Z)iYl6c*ot+zqrzi!GhVyE7+i=s_XI5LJ96M$)5_2%NxC_NkVbsQwIx2340GI6JTPySoR`;chwj;iNiV2}#K2;EDTqlcC3yKSHoC*9P#3~H+#35-rG&>p8|E_uL^t5RO$23q(0@^C-^?< zmG|G%rzF=)X6~Kpjr8zFrr1z)LJ~qi^m;IU4E=b}=h;tAL#d>`J=6}6OJVM=-1YHp z6jy(}KV8v!!Z~ts%BXF0bK65ZES%raruIbE`dFIR6A!?hv{vaklnDYS?1pqyWQ7Mp zn*|GP7LNJhKobTI&FJ)61-751a9V+171&Ci4Su+Ymvg393im92IE--i${@AA=+HJc zWG=FQRXl6`r;)vFC0Rn_$Fua0(RnqzOO1B6F>Cu=3&g5`iE%bQ>oKApoH!pRQV&j; z9Hk>>NQAp2lKDB1zRyRohl>Nkq3wqxNH$UeCnSwtn?|Hj&D8D$%p!45U&i)DP5E0H z42-1(e^s2M4K9b(othGlG}_*xR#swi2s#LqbL3ljHo7XFz4jUZhN8k;`2Q}9Z5w(8 zzbLEDfJ-^{2!;_k1Qdmgi23K^nPg*@WL_JVO#VUi@ql^%L^RY<$8*CG&-pv-W4S|Q zcx#DYu4)~KlhVX#*2kE!$wND5;ZBI0>y3=Joj`fQiV{5(2}dZQYueC_=o%DqQp?11 zq*E~Pxb<;wq)A&cD5a1sE4bScV6CpD%;kxFgxAS!eHOt$`y5~sID_|W(5ol2a3 zc`uiD?M=k#@Ozm!b!jHzG|0qh=X-!Sosv$fuzQFwwTH^+)iftsWs&ZxCOEOR)$jFc zw~(t!y`8O)QmIK@TH2UEoe0RHWiwqJs zcPH|%&2-!gQ(-Z;VK6IHKNz|8)Q4dy?nl{<6rLJElcoWLyk-_w{5$f3~NSY5g??`FK zvo!Do-En>S?6lS+ltgfxaK*fQ+YH*+> zAjH%CVWaJ)8v=c5%gqa>2{a8_ELQYmS^a+N74i(VlNjDY4l@w-CEk?4)4Z9myV;Xn1>QrG?W9G0z?( ze0m>6P9Lm$rp@oy*aOrlo=#)Wv`Cut7bMTL67v5{lm3E@9TUci? 
zwcx@QJ~+lLlv+$JWHoMKmTcj9a|>D6!f4sT19}S$R0^s7QOW%t+^NuJCyLh(*IlSp zX2TtSdqzqLoQmJlQ%Z8|p-a*mo{Dc9or+#)*dCeWBvHeQbOy%r8E9i=3e-skcn=a1 z3^V3y+>A1v?*OE>Kc$}Rkz89-vt&h>-UY?&){xtA2EID*PJ_~)N zF>&eH=6szdEB1?k{&fird%_^%a4F&;)XIWPl;j@2B)_@nwb z-iep-hhOuDfnQ{zmefIXZ=!$gn5lnfO8z(0<1|2?DtRm8qOnlR$uzZYN3pDp0e0>83XY-AjcfO=OwzxmHZ zD``wMgMDKj8WDx>F^%mu_Eg7j0WqA1*nG0=J4+Os`_96?2eJXF3#-oNzOzJuxo_8B z_ubaqw=1RZgGXv*gvp&pOntks?>R>r_3aYnM@)U^{dM2Z9x?Tur}sS=`~IiwJ5NlK zeP4uq=V9MxnEN)W#vA0X%M%yM;Z!t;{x-g(Y5a*42%*m{nGKQUMVSqe<)|jZ6zg++K~>8&A(3-Zyn+Ry>f_`v;&!cOS)wy7n1nDRgS2u zPum|`b39f4vt1~1T)%S?(#U84Q!yq_qLE#!=RmljiIKTPD><@sIkMAeWXXI%@JiJ4 z__iu+$ReA7>fOZm^;~^PJzr3rDL@~v1+i`vpPAx6@H>fMPdL7^_Dzj%g6!a?`V)>% zF^|t-9$#0{OICNw@tum}bKv+UQfiM)qNN<&HXUK7jIKZ*-I+~{u9Ntoj+-y9t8HM` z{~4DAb}K|3rwr;jS?h^9)1aIXsxu0vug~q5(=H z{9tQNA$Ud;Cw z@mVDE2!Nex0;;05=5*$@6xF#m12-X#+E{N~Ph4I~KBsk16q{t+Z_sLNvq4|OWeYvt zn*}@uDkfwK;3$)4@N`x8I zv40)s+(1jJdAmNgw>BYWkU!31u}OOARRSu~?e(rK7^P|lP}=qrY{tlyPL*TQPPVC@ zin}jo|6w#ijGxN$Kru4XNF?M-RG}3ARNR%geSW9phn@|A-AR4~{g#6jHm|2<%BN~% z6hOWT_8`LA!X&hLgu6DwoA z-Fk(tO;m{cS2UW*iul)TAHt7M#6OTf?O_E)R^wCDMI}J?DcUil{EGJ3Ctjb!woKaQ zhMBP8Aeiw>J1DB$BwDovHJ{lOvLdQjcT9%Ct+S3TEu4JZb(q~r8Ab`a7 zqHJK)TlD;uG(4tf}N6`C665tm)3E=xsQz$l^gfnV0-j7xYC+lGZ zzt+w06-@N4jkYPlGW1AxU~YRhXgZO9RMNhDe^SL8nW&I<@hTLBVrqG94GGY*fuA{U zwLkI;O2+Zb3m=Y0!`QDAawv}`DFUU*salD4&~#7i@{d=ggE_0Y(VoVu0V7*%LrZ!k zlKK8kXi`z=M0@_b>8FECz>>z!O48Ulj}4wj=JWJS`Z4HgDeTLW$I-q#V9_=7pPfiU z|6)=@AAh_=iA>2-wcTp=LAC67x-WW{6_8Wfh-k)zWYzeyMT*)pXv5#pE-fj*Ry^hQ z85}mJKSnOMi1XU8Eerq=b53U0C@Y08+Pl)2muIiPm6E*|g(v$q;NIz{mTgTJbvu#o z=!aTyHPUQE^p11uz1rPyXH8IG-cBV`LczsD2WaRe`LUZh4|=Me|Fh#w>-mow_l-Au zuM#Y`VfV_)9o}f~mFUI}L-QQGr>ElCZ4XUOu#E6x7-;MkU2eMocVMigA(x_0htsEf zloO! zMX;7WRG+dr6jy16etB?2y^%gI-q78$uK;~I@{HyXiROE?gY4I#uTz1&El~6Lfg*iF zu5c?V2hGBN;5%~R!J^UR;DAM=^`dcDG|pH!EGkBuBeY8+hoN-xcfSc(gM;MmbHBR? 
zPcNbNn`4aH^9?O#j!BCdxz&a~1W5l)pcozlo*skvCmcnC2c%UI$@#QfY{5loWG6+m z3`SNWQuUFd_Muhw@QoCzsIV9s0>E50rn^OoY96;r6T7j{x-{0Zhl&W-Oe+@eIQYiB z2I@~=P)dg(e3)%D*#kjJ`T?{xLAZ`1g}^%AqM$Dfs@+24iIo_No(g+PsrUQ&bEh;0 zK?M}=7Tr!wT#iaHw60)%s7-E*En~YAXc3Qhy^DW4FpNnSkto4Z7)v3Bc=dOr8J%pN z(T|jbwVVQ#;LDG!EK#!we`~fy=zj|L3iHT@3U`uJ%S$E`Z0`*WvpwuhCk^D(6at^< zAxzg~s|luA^#!5dfEr`Ble{EppGJdfZMBCnAx_dZO}0b1p&x`d2D~4&G=z6>2-<1E z^kb1EUC*}bKw~_$qn1JzWySU!PuiDIV&ob6a^Gbo;7ml=oKLVQRJhJ((tn%W&}0ag z*gMCJe6oDYe$v?5Y*5T^LuCP3xWl@_5d4ZHa8s3pMOHD$}gRwg27Elk=kT!k46O*Dq8{X+5?PlY|yiByy~ z!fkpNV9$Nt$S{{Tay~`riK+Hs-z(u|_V8EKOSPhlTI^38#?iC5#fGvHOiZeJ#ySBn zr!m!V7D+{lvygJY)^W6Mm zp>Dm>^Tufp&$Kp};p1JB^dZ)VJy9t(*I~8Pk&?Yh>PTl>2pDh&9kehoG!!YH02q^8 z(kw>=>kMFYRbDt9!h9{|$)(D992nZnQ+7lqE z2oQ0%&R647o`(#6{}cwFo91BFu3HO+Y`Fa9|u{ARKTCYvjkN&>#=?Ixk*65QPsui@Nyd%~AP9N+5l z$jeA<2Aoi2g3ITr^J)8i_F>zljCC{F^Y0PM57bI=WIb|Bsp7~v;nJxekT~)nN?|_E z4;@ssUjw5^5{U`geA&mQblJPyVmqpSMr?JNU=3|9=Jlys+=Tgnw?__g}<6 zE%yH%{FA=#|G)UBa&PTl;h#Th|J(S7WRd{$*C?cHc)h=eH~KNcs<=#5S~g2o8JJpyvUbZV6!KSAnPTdt zRce(vrYf^cRfd|XtX-EHrRo12@tIQM0fwfMpGS5`yT~n2*pu|Hnf4r z5@)W%L0{|6BfE$cHRtn&jYiUxvy*sgEtA-fu4NmIc4VW0B~!U1u$v?|`$h3up3`G6 z1wD!)PQe^R@rr%PDdeq9S)i$=D#fNMpKMH4$xN-Hn5yKNsywhGS!LBPDQoJqCfC$v zZM3E#J2Ft$=bP6wM{l-=soDE}X|$%rza-a`qNmfEUQS}JSI|@*MoIcjvmMGv3J7-} z>y@EG7t!q(2KXu51KPK2iB}nUoHq#zzX)tcH3bdQ8@?Xn0ydr0vLIgUz{lNbY;Fv{>_qq*^8)c zejfZ*WElzOD)L8LrjNUNIy{e)<;C*vB>8u$yi{y*d@@o2BYQ_91AkbR7ASWOV`64+ z_GTq`*sf@+F=ft_q40pBE=yo?fqia!_e_Tp>Fj1P_*h#-hSSm@Fw%9IY42tg3p%Q-X7xxb@=l!G7k>dY0`Ai8Ts+hYn{jvKX?{j zqsl`&(Wf&6G;MesQJm3Y_~Sp*o*gE|a<*@}J$qD%J$vjdd-g4J?Adn$VHK#dXHTm( zfij#2-=e$CurA(c)J44gQdXf?mhmX7YbwL|0++fn?1o>s$V&DS1#MZQ*SZ1f;Vk0E z!}(NWOAUoc2SzXHV)m?)hBBW+YMqtQ)OiglgnOyZvfI09IR{!)Zj4XYZxrDjaQ zwPP0x^}AYA{0hTxu2DANPte39I-uR@X_omc5UAMqw*9$9)7M#0xH=e+uYK;M^yrmU z(+{_>KXN`iFD#}>vbsu4$AyeeC66*~QF7nYUPWqkd;PZu@aoGx6f7$)i=|Uo`pmaF zr@>c)bNK7Qc?2<}%G0chHaJThDn+Xg5<_6r=FpQQA+}f%Yyi6sA*_$qiP6b1f*H!w zGtfFpNAZr_RpRcY)QYMtv4_5uVo`?@TnyNJ6JVnyt8nanJkS>L9ec#&#kC|9z7(V8 z279@!fmWn-AZZj}m2+;mqC1BniuP%VLkJdHtfDhY8e}*5WFXkft>}6@v61W_>z4xG z<8P%zGA}IX&ibcl=Hf5#exf|~ZF+RW8Fk3%hJ%?2X;LnJawGUU z92<#$`c(AqPD=8RR+GF<)qZNQ+&@?0i?~dM+A?h*~4BdUnX0=MI-{K zqcU+h{96H|F81Pr=ZSYw^taEpH-Urh?~c__Fwp^4n1;B6622L!x*LE9+MWHgKKjd! z7^CmVqaVbhx5?4h;GU$oDYTCEa1n`;jkS#BOFbdYP!zPb_&Tl%KaXnFH2Hgy=Et5@JyE^v~_bLc!68#z*PMRE%v|DWcDGnYM-Fg`tC z`ZRW&EzbN48iD)J#SCuG0RVSMHkG?R_RukI{U1;ipGEV>@rZ(!F#lEDb$q5_zmfUL zrFh8JD3R$H20BsQAb#aA+rjC`q)u1tLu)WjBz&Hj<+QSXpY{38Dfl1`>bHS3D7;}~ zvyhREX3oEe{KjKexSmSj`Mrx|JVULy*!5j4$HKh%$J&G*@A81O6A+0Ngp&eiK*K{W z!B#K?91OHXUgD;>Fp&<{0T{ot$QD>h*pKZJbUx0e2iokYf8SNq{6H>S&}5*7);==L zl6#O+mqpu`)9H$qBiN@^DV2qS@o!-@{q;~-bnGJb-VdhxqhWj(TG~yPR6`e=*`B+w z$Q!w3DvGZ%WdM(97V#EYqfWDlm-&Z7JWW5wI=zuQ(J{k+p+1j(Vk(=i+QdZu;b`Jy z1sA*6ZyK#sFyOcnnd%e+x%F{k>-V*WB}-@V%l9nzU-3OtlATZhe>uyKkiX*MKrCq- zu~?5=X&8Bk!9UQ_VxY8g5k^cz4)|!Tz_D;>|D7mE+i>`Fi*YdkXW^(o#h?-?c2d4V z%*R=Q9qh)d6b{H@R5j>2@dwd2+y*y^^PW+FFjlrnvwU@mq=H|}pm-cx@M?+lcH;@E ztCengN&NR6yLM+8dOvD9ki(8kivgX@_zd>Pt`(-5Z~;^0=f7j{RRaZWIGA0t_Li2t$VGKy;=z_vXULZS{Sz# z#bK)Un-cB+i@I`W?kZ`2v*yhio>G-VGNPHjJ~c#bI1D~?(GeU<1xp}|uz zUPDj#4W!pK6uhAeJOD#+jRPOr!h;JQocNFdKU}08iuW}3f88pjn$wII>!jdWR=`rQbhBNT@43cO9sBRY&yzA(E8g)(U4Pr4y$@5`TgdIr@lkuZvOP7~-nZ%8 zUd0dQ_KqNDf~{Tpt)yhr{mmsJg@cyc66l8BbQ|}RdPf7fU1dvLo`Ws=f8hHgY5xfP zYL^te;-rW}{tP9U4s6>Hd96A8XOcZQ>J5WEfr?s!ga8J+-2`|OvEPk&sF1lp-8zFw47Jz$`C8`fWZ! 
zAv)kiEEsT{8-jVKw1xR;0Sg1aHj>ynw%%J&X+pW}>Kdcm{8V^fzo993UyU6vIriu9 z&iJ0;&GEAzbT&or@SxvOYb;nllS!8{{Tue&v%{?>>f z8nI#j+*ylJJy1UnYtR`9!7oeMoKn?*Q;p&D_lPrr2&Je!@f!8UhWxtfAqL7P1Ix?gCt+pNQKm7tS@zodYlr+w3zQCBvaMm*X4& zYB#liQn4;iY!+o-!>Atnv=N=F^=3NvvK{dLWem@R+Ancq<1e4c=Pyy-7XJw`(w#?n zK%9O<2oZ$uCp%&U(6D%8-L-Et*(SGV)tRBowwp|h%KKq`p_Xid>21Na!GvK z_@nqB{wRnaq2?iW1^$EP4dRK$pZAuNzXCr$j_Bcb(??E9XBlgUEwsTFeDo5H%O;`f z%wdwA>C!Z|u6h%ACt?@LzJqFIx-LMjv(F2eY1HNK0zQdIR>ggU0oi&f?HbWuJ|C?# zlQ+2EkLM<0bC2ozu!F;M()=y=CHY$*!(%gc8u}Xr&5+FW@qxcZABS5=`uP49l0Lq> z#R=)-(Jdr>d})jSkUs9xLej_OWoG*L%Q7bLJ}zsLK7PLJ#PqSmmqktDOQp$D87*fp z4dFf#IkXrPyP%BhxyLlLw?=v9PdXE&)rptjRH2X&&?2G7?V<%zyFgGY+R4d4c>Z%S zsXc|j8{xnRc$BU_8Tk$a;z>Uj()V%Pw6;SW@l~nuDhxIU?xcrya+sV!DcEFcZfR(l z|3Z{tX0Q-zqZoC74R59GbuWKnBU=dslZ)~-)Ma~}aS!pc=dpNC*+5RF7@(pRrZ^tXKpqt+krpY$zhE-~(Bx)E%|lSR5Jp0(0RCOV;5`KtVqolD^VCt;Xh8TZAUb*L-m&cndg z(f0XDG^4~DbuT8n4RQCk=#>ssa~$s0V0kfY?f=4|IOkjAPy{~U<^rFfdk<22CmSJ3 zaRc_bUgpQAZmZ&UhR@9>kmb~6fcmyZ4$3UE&iTmhv09qhH9eSs5$E$`K8 zj5t7V5`X+1K_!g0x_ZXgQAXWi{@j`n-1)(lV_Hs#CShW_LyQRE%kmi<4%2^QK$ zM7@1%6x?_th9Yar0gUnJoca;jY^&BEjbKNe7T{jK)8exp!x%agr>W84<$X`l(vtS} z%KW6e!E*ZyUm_*emzqe%`Z}NR9l|fSGzmeB3y!-aa+L#xhuIsHY@nrLO4;FbrR-2T z?#~;5Y*I0-(k9>)P2LSih z*};DfoOVLEfTEdaHH0w6WJMjBwO5Ms-UkZen*D$fOW|1l@2&s(|6u(;_{;VGpz->D z{eN}+Rqde3#Yb9ATuR$OTZ#k5bYv1P-w<4SC*e|Kf25S*f^r>kEjeY3rGQpGtr|6A znCj75sb;T5fWexi0lW}OE);&CBL&6ovyrV%o7pV4v3UuuKVBLgnrQ>5dOg4U+j_N1 z2CTSeEv`VR&su@&Og!C|`3y4#*fd82ZKe({vNiKY`=jHJB{nQ+_hZ_~74{jFM>2dc z@Uu^=Vfazjgz+UQ`8GJRm<_jCWHb=j)BBxmf2LHiF2(_E{;W3fRrA~ImBd|a607Fn zMXuIS2qLsa%BUiUzK#QmF?}vM{-6)ZvkRCAK-)453rw2`)7?hfHxuG_){VI+kPb@sC^ra-Vall=pMyQ* zJu77V;IQr|+}lkAij0Y|i#jHAed%XHD(s5p79uV%8U&LULXNvk*fu+<_(I7gS0 z%)ZZ=q<|VIh=o{O zc*wMfN2?jS`OJ|^@G1n}9|=1k(7iyPVG=~Q4&I?D;PoPSJ4*28b7Ffvt|EbV34u2s zNHu#1RWBkP7C-iW(oX`Y&(jo&&WlqKH~)l)gA%axca!`O`l-n1M2xyCmap5Fk&_V- zNU?_~zFf+ZtI=s>uj-1~rT>J{kN=;-BKuIdk0d>-_mdy%t6RCFNLeQYEG5<$S+BlTQu;EF-r$#l-EkoOJXK{1O&p5&;p@`3~i|CIWhMH}2+u2rW55SDhCedm$ zf`O7~pw9ATnPb*Q!Qk&9h&(3CkcE1TCxmPirEiY*dFw2zQl65=)oLu{!kisQe*0ba_&MWm@697^K$(8ZFDU?=B}6BkST)R*BSpN+}-I3 zD=EVlqgQM6b2mEesnIK`XL8^(^?a-BnajE7aYoPIxJ!(l4-!507$qR4p4+>Pj^CR_ z9e1s_5;c4AR`VVoNgdB=(D528cRa_`aj7*4S1OoiO#vpsxWNe#8$?rwqsJUB#qq1b zN?_2DH*70Q2_nE@dp&hfdgNm+&7o2~j{87ZHIoCL@85z{V_`_Tr><~ zHcDm6_F3|k4PViIHeQA46;f^A@FC(i_Rz}+(Oc>5ofI1@y!>cFo>PuYgLDNE0%c4H zNK}ZSNW1F9fQ}}S9$;LGCv7x)Hlgjy7sMxjP6#?_gMtxt?BO{6d}s+G^Cs-C^jZcZxB~MMEfiD(y>P6xZ`6j8kgg7i#>KU8K)#u>q275y|q>n zOJ4jFAK)xBXTh@Q2?gITk?&3Wj^8gAS@?dje4k@%SmV7d`95`ruMoR{42Un4ugpi{C$S1&{UBeN z&(KfeOY#^cLnMR}b92V+>iJa4jmd5nHH(T+#9^L0|6g+XyO z%uC+K=9lJ}Mh$?YfnPYAH`oJnQUequ z;bBvP4>~P=PTvz9V&vmg&^|fxpiMj-C-ThSod$+cA7B_OX&B!bLUE%mTVyT#ndB(Q zCxpnRtHX!q5Z`rAM^^P-_o1QcUA>jZzy9OK_7BHK<^u7#UuwV^5%^_i?T{a`0hU7dho;}M?zv$PpoA~qkR5w_wH{?vgpW`FYmkFZgT zdqnQXwR&RBjxhN6dR?C!9q=}Aa!snm#LKU7dy&l7v%6bJmYOKZA*KbxH~@c0yBuh! 
z&4f+SxOX!-EnqzHt5uXL?=Wuhpoe1 z9dz*1XnsPIEb;v(WR*GdC~;DKno-l+#k0h8wMSdbOe)$WK7E1hHy@*MaYXS3+c3P# z-?j;ZNOPJ}v}06~cFiS9<@Ee?Cc$If$X<8Azj2BDo0zd_x=DjQnjH`7Al|_;b+j4cPC|jWRF!Vo|UK7Ij6#cpexC zEjr*%_?AV0V)-j*tlxcg58Tq_ltQQ=@r>KNrrY&3{qclr+8G*DgBi$A(ZYJQH)!n| zuP@-AY=mQwuI*^%;kC4yc*}_6AMr~5D|qNsOv7+wP%C0;#@2q0QM$cnc}Z#WP6AX=#F3r?ksMYzE;$|R7(jv z5iup6Hm~IAR=JWH))TGdiXEoYImDG`NnORtAYQiIn|@%Q>(ubTj`GtI=kZhXUulW6 z`00g5h;+*lj*ob?tyE5?&V34F0D)hAW7-P~hQ;{!?t7MxZ`5Krp(*kS&+q6)=eJll zE-_7F8U7N#6v|tT4{+)%COq9_~S#V1$Fu?*;G3^DBq`77hJ1 z{Q&<4eoZHrhT(&b55G<7{GL`?cH&VRwCnM!1$L+LlsrM0FQ!Y&jtWkrTN?V(&UP0UA= zDX+GuAXq&JAk)kn_5N((*iK5cebrE{9t~u|s~)&g)#8QPX7yVsp_yK=0Eb+Nf(p!& zToBx0^=k2<+Dc{hmO>QraH)V&F;pv4R#&Ux)6_@{|4609QcS0*YSuuh66rWpJH)eJ zU5(<5x+8pgRVQF@*ixyq($2Q{y| zvogO_+N+|oO*BJbCJ|)pJmkn%iu!@d43dufFGOvUvap^j*eRy7r)mnSX%ua#Vjs2$ za2Bk9Xr`1^r<1&#={=0Yxld%2q5EDhHw#m;veMUL zwNeM`u+$fJsP@9e!Na%;feN%JS1EN!g$-Sa_gD+a7l?w)1-^#QRISDyI+<)%SE+TY zlopG;u$uR~JL7UL!f;1Gur{N>8tB0v?Y&6pNj`}OADF=ArKctJTfs_e@W>$lu;5~= z65MU|t7^od{mN#bs&)wWj=K7HG@@!jJEvQfFYJ@sCIKU{1QuXglfv8tv=aCV2aW+g zcUuErA};Ksp!+@L$3=WZ3HS5}Mg@AXD?MIafnm=5Aw#Jnv4ybX$0 zqh=pdvg-}~-+t<&HNd2T^?yt4kJJ;6tYg!TG2b$x#74a6EXC2SmZ;VWpLMIhIqClZ z`+1J|2YNMFxx-SPzl`mO;5{X!36{a-R8Wr?bPI+wpqyW2cvQ+nx-WYF65s*%`3I1j z-67Nbv7L`_K*@Hz)XrXL2Q|Uj8 zx>-+ZQf~;?1-X8Nmj!-iy=-Cb!NS}>$Q8x#hleqFR@+193^$*2w=KyZ4vY8NLsuKc z{aD|~tK*H{>X4jaQN?f+tex91*rr$Op;Qj%{fQJ&OQwiysfxDVtZ1`UTgi2CkNEmi zX+v7hc@7bqbb5(+KUB2!dL8u)Tg_^=_HNR$JzT+oY+1?HkLep!0%;T;;wY(n?4i$e z1<(k%-7iqIkv0@_*+XxW>&0Z;Av`%p{OKe2}CJ9HCU| z$_U=#v{y~~vTrL`=DJDh%K%G2w7<&5_aBo8A=5_svXg!weOce;6!r>lX)%K9OMMx| zpMN2#FT>{+|7N=-7X~Py6s5W@64hm6%B3<=Ny3SBk(i@U!YRidYIOoQ9j6&KrGr#- z9i)PZqXde^Hogd?(8l8?`JPEnhNO(>KfhLzqLF@VPd`$sull=Vf#b=2nj8Zw#o+ao zw7GSC*gc|Vo}qK=m-K%dX;QoS$XB!+<5qf(!fFnDk%_)XUeoQzQMKh@)fNsZc1J>w zcWm^p7)rV0QWus~p!JJCF4QOGZY>vGeg^*HN!l;)7?_dVGO?)%8BMwA_@@7J_;3FI z=kVY2|8K^BX8WFEqSGl3=N0a}p)-fNdjZfWsLl0o6q>8rkCCgK-pGS+SPl#YP6Xo} zcnNT7b|oy^a<5jeVxH0JbX8lSWH0q)S9(1)KCRyV@Tcjhc**FfM3q5^5bY{}`cN}p zwEcHTm6ja>;w*NNS3~N>x)iD9?4^3L3|1a%rT4;;=jZ#Pm)Co>wb*YBo?1Bi?H4K8 z%aMw{LS5L9euf3RI1MSC7?5N;hic2z)qfEE%%vv!*%v-jjkL6f5{N5w8agfTjW?Z) z9fl3^g%7HBwE}i)Gw8E|4VsXsya={mL2%n3HM`ZH&D)}Y^G?siqj057 z&HfG{g&dL97rlwSm0akl1mxb2P&ap(x-w3FGG$gYpR{KWt5H4QP1%D^^TBP$y`F7q z)O&#Zl>PzRvMt8K?Fql(4!Vp2<92}uCpoMTS1wukl=I=T*1L z+ijEueS11nAX_xXXik^LEtccC%k=ZZ{z@(Abu}VrZLf+xOUwKVye+u?Z$|w=RNrSo z&!q}Ip5GumRp)x~S`(K%i$o9m+k;K)=Qq}lVgl$VP=8*Fb<+t zmAj1j16cWh&9fBKU;(|V{iv+orqr!byX7GHIS2rRA8hw5MSOw*`|UGHFG*4Hf#k$% z9|_ckW0vs`+0W^vr!7@iS9@DD!wx$@hcK9J=E6Xnn2%-g?@N@g zk}TMM6_uV<-t1BpZMfdYC@vm21f{BdNQG)GwLb+fs_b)1?K4I(gJutV(fd7`Z(_Vy zSuLLa3>mJzfTHmN%dS;D`!Orba-th>ZEp;*sTz>|Yt`ueZ7e=yGl}Nufjw)#!^DWR zkDr{x&oCPmjx>tQS9v2{(H~5!Lhs$WDsPL8-fnBva9iMBDqxn1);v(9;ynwSp`|U=WHt^Iq7bbnJme!=qoe<05agbtPgm!c$fELK~&|%=LQRMJQJDMge=` zx3SL$@=6fkU?hUgM0>d2zgi|q^R(7?SS+hTGhfTx`TUTR#i>(F0%G~(<07G~2MHFmT;aP!aMLsL=Tm;WDuJfoOcpeRZ zN5kK7@OPXKWrttF)s9h+is7PH=IJKyUTw24Tc|)zdF>yUni7q6D+P>Tg{_Vz?{mCS zZ#laA>2_xT&v(c!=X@j7O|Y~We$S!bmHD$Iol~r6lb%;1lV`!-SxV(#D)&Y%e^-f2 zde^HJy{l-G--YKn@H_{e=fLwEs6S7(vsC^?BNaku(};hBaJSPv(~;mEtbkUF0KQiOY=<$-wW3+@ z{4PAdE1v<@R{~6j&5_Rl>nrEM^E~-HFLGsR^)_P3a<+MFpnecNHIX}E>-&>HvI?zOVuivf}s$YdYo>pB34Fr z2NlI?@OZI3^cFl(E3sdXndP~Q;Qb+ZKjxVJzTl`SUSKNkXQKobp*0pWXM>)QNKpKQ z=QrAmOve#*PMVF3(+*M^qoTwWL=&!2IZDLmP-Hd>)DMg)%q{gseU5lXxQ}4_0x=u0 zVCoGN(jM~nS0V){NsV}vx_xkde!>*Z;e!GA3(T07t7twTdSPT9i$4$QjL=#yQITjk z6eur2R?|y{ESn!iL_XGrtXyL~0#t}U0r!pVbK~rq)u3 zY(*=~5>G8=Pr)6F*j8cH6X>tZw(mcq?fZVZgnj_(#Km0mRrN|_ifAgyHqmz!_VWAh 
zY7?PSOv$x|^lh8PKgru&plSNuHi~4EesQ!l?VF1u8P#Y+a1p@^vw);ik(Q+dhd8a2 zH^Z#V4iqG}<6_Tb|6yoC@ zd^o}yTSH8PzhdelhD=o%LuTINrhGW)zjqXw)trX@>sjQ#u7~j4iqj{rbAZa+Ac~OM zr&G|QJ)*~F2^ruEBf_i_&pjYzbc^kwV-~6IH>_Y{Gx04nj4{qQc=xc^q=+iR?H`gF z2Sd?DJ28M(4**q;DPBV4^nbKOt!KJ5@M+SnBL246@gRBw;oZPh#NjW1Qc0(V` z#x@zRL~9c)dK>Z?iQe}AK%zH1E|i*kKpp=UYyn*6dj3q8ry!ufE)CH!0V_s6&?Wae*9f%KmG>JHa03j>U)IrLfG)}nJNB|rlddvv8f?SzX8v; z>JE4RQ#jjx1A3-+xA<>^AMI@LV-kBI=97x|HBdE$=Q(u$t;Md#EX~Bq#n`oUwjOCtXY0}$HyMQYonh$yo3x2*dQbrA#}R$LaG&mY z%Q3`O&}YkZ%YjmbZAC6$2%GI?RP=w8YJWz_(HgH%<_k)9iXz?%Z;LR-n@ItKJ-jwO zsUtA->(R^~mhl?e#6q%+rof6cJk$%ufouqs0_F{86ZLue`P2=d+uNYIg=ml~ZbidE zZPEOG7IvTbWuM)V`LE|J77743IA@=|?O+#B&*;cx!H3cCkZ)x&d@A3KY4Bk&ynDEW zzj=5my?z)eSN>vfo)Ud{mVAq4_?IkFDjycfuO`7)Yvor{`K`;UzkT>!n)AaBelafj zqMBb&WL529PE+;pOnzc3@{`3Aa6rWSAD~R@Oxzt4|52ML&_ZqsIvYuk%)=i+*iE5k zWQf08_(&Q)3X|smnG_jtU>IsvN~+M$WR;j0PU@a0&Muv?XrF%2j(B~(Il>`YBqR?X zWuNiGsXI20t!N5mZI|fzb)qSSQBOuH3rJVfrsG0v`&qK1*A()u*1~L6-%~961?@}1 zbL8WTXzUX?4NBOO{cHRqhLSx^kd5FoyWq4Ume`|)Jv5rX;8Z46TIo9aXgIIdldLG($_qkvXK8!ZUj^ ze5Ld|-J1!xIoyYAEBmaj;KB9%%fTJ^B1wS^u{IoECmLT{V|n#LR z5|wjFp5nmYU*qradbqwC_k)3fpsV_M9HXx$q+37z>wSZ|7_*u%V2CzYA77lQi#`wY zKxhHx&1-7QL4&V_XiRg;VH_9CH4GW2euehn+*7Y)PqnM4r&GD7EjzfU?;c_s^jG1Z z*d+<0kqe*^@=4`({&>%5=RIy`6>EB5lI;Yko$}Cne@h;9+B^;(ayV$1&2ECz8sC5< z*S7ef2Jzrz(!q-1HmjI&IFaDPjC7QEp!IE{FIjW- z3(i23aXqphw=X63>lOqk#~c@3E~)^eVXQr`W`XJDJq!PR98rQ~9Rt-E{-{rXx5=oBF1%`aR-;1lv5il88m@ ziAV$c|F(5~;NBCP@q6wz#d9QhR&o%|F|lv&YiMuZkpCAi)xi;qC{tkzp@D?(MecA< zQtN7A?SEz|6eWR(z%gIr1;#yomgJvWkOBtcAI& ze6Y2AYGf)hpjthBwAUWqjm>(F`Lsp;tk~}sY2WONTtjWG)j#`=+S*_bFG3DJ(#{uN z6<9|ufRT|l3ZL0IetvR}Y9!yGMso0l|4db@@p-CHd$Po)Cu@-B(5qFb+EUDKhI}z} z=8InYvqjZbshH5JEZ)+v3pk`iJYG*_;DF*;>Mv5l2mKwhL};k{53`qXlT z@1N)x{ejy!_#~K@J;PK>yq=ZzFlOP1-AejiZ1233_U6g<($(lQ?&$=iDt}v5rZ$cA zjX$SoOXHhW?Q?n?-`Z3>8)8s~*}M@K18sTGgt25ue2uUgm#Ljhy1smMf7F0OS?^Udi}I>3p3<6dW=d zMEhBUk>96?^U=(=u;u80OYl!+<^=e|)O2Ps{FJo8W$@G9HuFaKQ<*uA%BK&4f*Y{C zq3E6y?QqU{OFK45_m>uteTX|YqDaWX(*SfNr$8GwKeQ{*I=3{I zi;}bjcOhO%$8ZYM2c8`O3aG-WYJcDbe8%f^cZ=n7@Or)$5Tjws1N@`KaDgM&MAE)Y z8YDhMCO9Q3YL*hLA83Dsgh-{WY>zDW%|d4Z4Ww=ZxRzf<|qmIFw?0oBbS5G zHpI*+Y=u+vVS;^M`n!`plP}%FH^dgnrRB3-b~&44O}mFnUApt;B5^wu#lI$^0fRkW z2QYgMPJMirVUH~JSE_b^yv~Vb(a;$;GBCya9PQ~bVnk+m0gly+7FA4PD)yoEqU}wb zyN=9fFo>2&+=y3wp-qj9b*hm&U5Y1e52cY=^PA*|=#Ad5$0@y+;#TxW3sIUfOeLjS%zL3dI(Q5a<&_q(4FlrO0csX&ICWfOW^{2 z-}>l457_p#)iS?@h4A~nw@D1EN4)o+h9?x?x6$9*pDXPQXV?BrS%H0S;nlgjFm=4i zDZhe!S1rNn<1LmJ%q+YbKI?+s+tVcwY71(_?1N2 z(~nHEhc;#4kM8#H`I!iy;gluQNnX2`MBMd+!ke|hRWqeKU7O6{x6^g}^OB3YyTusHqopeFwF`D`MmsTR3^6^8(ND@n7D#SHQ6XYX4 zi>u)RZ~qF5zHyh*q3qD%1sRnrAx`)6l4g;e)^TypO|=v@z$TrdEs~eAiN&+otCdRn z-H+K0h5c?W!*-K?(T65;=`4%c*Lln2+GON{hMR}CTGHWquKajvme*r6 z%4tqa050162>bm@Koa8%w8Cp(IFsxB~ho6AWL>ZIC%TFSb8HF(nZa4+iCWYga%KY}Y{FV8g@eitd6kBD! zgVRCoLE%BP!;&`v`@iN-a>@Hz5rY2ajRdI^o?%b0@UhkixoO38mKRcFjXoEA0k8<( zK_T(*ojr6G1-2r8-JHN1pzHUKP$060YQH=aTIxb?|hVU*$%d&?)KoNLD9#`L& zaE_NI&oOB=y;{65_Yajaj)dqdxW7l9gLff^G{Ara)qF!`S#x)>tuDOd)9Zyj61{}t zV7p>yO9k{6DhsD5kvQIjty6vY2w?`;6%Dyqftn=~!4d~Dtmup-I@ zQ%uveWYZ)~S_(;%w7YbZ#wLX#;IdzB*6tU(cheN82ATjdL{ULe@jVot51*juPw@-2 z$hTh<#E<7ED(Ef+p`!gD!2ZuUGjs3UyZ7#HioPHJ_aiXfd+*GdIp@roGiPSb%=o4b zy7%O^e6qkN0B274*tf`fz9qsXuIn!R`&04^U|-yy}U)O zXS5afuHW;=`JUaEpH!=8i}4~~Y%1O-XCyJLltc(#LsU#gjg;O^K5}PW?CTV8u@Q5Ql8AlRj=I3DghXpm=r!2|JOZ z7C*3qql!+XkYy~l;j_hh_hWHMz?jl>O{|5^CATY%8_;GY`VaJ-U#-1?JnmJ}iWkJB+5bo;-!hB zR9q8pu3-|vlmO=vQ2IQV6vp+5dYqHaA9IkEr=JgjP~=H2)@R9-20fc{P3*+&rc8BT z4*J72xru$Kb3c1vZJi23Z|Yy3%bh;WClsH{?faymbFV%ME1+-0)7b+`^ftPY4(jE! 
z7y100Yw}H$q3#r3`=Wsd>d|Rv8@Nu)4vKw4?zeQB@b)^<3Fl_EK#v7g4kO0CMXpIU zp0Re&Teb^73EpQJ=ZLJ?&gue9U47~T2KC6&EAB(fxagkIw$U#${qgC4XSyq&l;jWM z7eRdf6Nkk7@4Z$0?kMP{>1xH7WuJH7v{v8jojxmf%T0Xoa)E1N-%+gJU^hk@eMeq3 z{Jdw{{_-V;$D09U)-WOPNZKm3QywY@p&zesFdbzew>-Skh^;qmNu&d`^ z_@ydtM!)!-BD3u3FX3_}0HRH1GRHi4bkrSH+%)^{cZy6i-Zn` ze7=)u*6^8P6`@7#OtW`K?MyRIbPlH3TcQ=1X6|S6?>@lK4ZEE9_uTEwWWCrjD7pRH zSfLHLEIgkrW83GA^~jItJbb&kA}5%$EHtE>59$r+hJ&Vtbk#xQd^MpN9>M00N+JKJ zS;PMyF95;3+Hk!T5y~&cCp@C2zMAACB^8cq2vFQzJ?D3~^qqfOmuvEcD!N2{-y6jF zo%i9K2%ayUeL+-o|5OaV*cdz(Q)i?eJ8Rd(u?Gvf@vxe_659Uz3ryIX*z0fj>sxQa z>F4=p0AT*-2k`v3nYN$txT>4bK6qn2*$2IJ+)k#JtcU!!4;lh|agppF`Y$2-uYAj) zwpV?fH6cQ8yIRyE{^36TsIvex2HvznWhw{~Np~3z}lC_a;8KVpRsKBXdv&$Fb}U zDL6%!f)gYucmpr}KXIYxF=raS8!l2>Y{yORKOlzAu;&$S1$Is1I(YtO#$)PiuYXyu zi79hUTuQ5pYd!>GojdcBlxH3*E1zc(Bgmk)V+6Yy!!`MhS4`aw9`h*LGPbNge?aOI zX4>|C9vj??D9v#{J2j0R({RF{KF=q)4*I4vuxS1*j5A#aa|@ub{Pvd%!fSys^Rbr% zGaq`%VCLQMoIe}UUG*|s!Z4&C51tVRN2Nn{0&!u83B>1L;sTNXx0kpiR9NRH@f?N= zwi5KU+~FX!u{(K`+-M)q{Wi!BcCr_MM{?f@(s5BUYwia|Hb;4 zr*F*~eEQ-nJBH!sP+b1AFBhLLIrRcf*6x00i*NdigH@KRKs@d}t|ee|^@BKpLhM3? zsLl7%O?Y?}$aG&_|4d}jfh+Nnmuju1e>$=VU0=ze`*fNqbU(ZA6m0P|zNsworxte& zK)h!PXAvgo7>>?d|8#48_7^xSu@p@_8UTsie8AHI zqF>~h2T$5O-&8YMQJ$&15BG)qMLUU)IKEFeo#Wn#zD=wTrAez=-%M|9EnBvqy2rC5 z4_Ik$q2v$Jk|aXBQ9$6i#*+uUGflPVb$c+kFd(eSdA)QVMlEfoszoYmT_@tkAz;Z8 zP8;-)wetCHUbZd`?8F;Hpx$!d{539c;SxQax-s=Y|I{zNOJ4Ne^WwaIJj-D|H6_Rn zCr=x2&<{myHSEV@p7Tz3i#z396S$qNe|lXPjx`?g9(>+Ay{-e#HbzfQWHFsi2ZOfh zOKSJM9oI+MI@#m&`4U;p_1!z7!0}^q3y{Gd5 zI=@0Y#k0CjYsE{G_PO|dPkUh4FnCDA^P98#J*zWy{Cqta>BDgAMBJ$1nSyM9f&HCN zWj{ycGe*D|+F*jW^J&-BC!j3QO3bvMvEL(^>u0jf##?OmH}y|{j{Hbw`l|ZT%(<3G z`krgzn+h2>5c+FTu)XLNvwKg~fh$PBaZ;Nt6nbaQeB6gZjE3f7c2xA`xaji=O`hg{ z`5s(SyRX*Q8LqF({*G2|cIT>)2sV&XYgWR z*mUnL=&bS4%}7IBq0?%u*fPGq@!iZ`xgf;+AD-0H|I$dm@5e*Ar~jL)^>KFJ^hd6v zcK%~>nY%izCE?Rqj#EV7jfh=e>J5WXY>avgf zrqAxjjF=nMBe{z�jeNK(sg3xep-gpy9e*cl9;n7Vf>d-A`1pb3t$~EM6#pL_bOm zZ1ya0l@5UJX1^Ak=YqA z^)6eqkM=Dd=xeUVhzEOW85crz*%hAYI3(G=2|(E%>hR1Q`%5~Ui_NAU_#956F^@jbl%8gcPt+^9 z?32vHs}6gVEu+i_Yo&u%sdr`vrE7*BF~5{>g5EZNJ)SV|>vxivFTNGT{NfLhIoFNP z!ygiJx=t6ve%Nnz``kwoFH50=)A7C^D&(s#j zhTuUTiofDuvBNigGPZ(#oVT4OrF}HDf>Y7|0p(vCW3t~r^#fVGe*Ib-&Dg zV3^0=sd1tXJh0I{nCG$YJRL4PYrzEAJ`Ec$o>#po6s(E zI4f`l_nQ1Jcjf`C^8#*6%=>U;ux}$(_%ok@N=|;TP$+r%?-y{~XC!!r68uS2F%FKN z`84G7jt`pjqj%~c9ouz)mWQvYwj4@6R?WO>hrvpE`D$j6HRHxqeF0r;!`I#bpBJ$; zMx4)M*VD842qp>>^^5rPfv)qbaDoDTV_lPDF07%L0ipAo>Bk3seoPMnIDE%+ji zGN)s#FBZ$1_%gy@2KNmc{{J|A46{0F%+=$SRb z7QONYWPW^%W>4no7rA)O^%<0oyj>>+i=|b#`aO5BO{|J8bVJezb0ff?If4?a`pI z1E=ihSl-oNA#9l>+{Dg(oML6UCi)iW7UA2NMaZE5`Z^Ac7*AhIM7Sne0buvxvjLC! 
zih@TkG3xlaO5Jx;i=)$(e4X|6uB+&jP^fz>wtwnR-h2KupSyHiw1{ef?oxqDc*1q{ z^Dt=IFx~s-MQ&OxZ{~Q&RJbNCo5wId8p&nEEHGh?^0SWat{++iQsBDldyd3xoX3ed zy;>%Qsg3Eg#QAHFdfsrzbL!!V$2_jyd+?;L-S^=Nt?FA16i&7j*ebyyYIXc@<_Qf~ zEO#qa{dkAdgRZN(=bQNO0I)z2%;?C!4cGj6zsH8Rxj$aZ=kXbgYvR-@T82~gB-ysy zU)Hgm3HpKAEC%)3BG2xIs_ehg@&R70*DS!GUFFX0+Fw8vz)EJvBt!l!Z?I$cwMUg^ zH*#6AS*UTDXE>&#ef?nhq+c1=m^pEN%-buG;_qy6y}zS3~1isj(fToeXu`LDmk&)aZ6nSV@P zzq#j+^N`N^OW17E2u*L_$HqXece2$Y(xFS4=Ji$m5i({QGj3u+3w?lZ>Q(~9UlM3M z$^s7~yw`Qrt+f1g@7bPv_AT(t)cl6k{+@h;C*X^^U# zpCtkVlBw@`4(|27;jg|^_jo5;>%FeN@6mx5)!9ebJPuyojBwe9`k^tN#7|f1Ku?3* z;f>7Atz#OG)&V9E((_KPmpi1X>xrf*MhK=tCgyKEEbMIuxqkbLVtv#;eg&^boOEOg z`||ohG}QmkS3xqqdRSghM7hB643L8NAW$}6VQ0V3jH$VIj_l_hNPdq5CsbNNJ1#fS z!_0um6RKH%4}JXP+dw_srm}CmS;>lmBL<&3q~5v9=$$>%=Rwf}Py7`B{aXLK9%Ku* za#n~ve(Q7ZL1(Lj2>1j~gX5?zxd#TZO0LyYV~tEta4Ef(y`-`LaMa^0I;M`KI+}(Jp>LxPKpO_7}Wi>=g~8BLWKZ z=|gONQ|R9g(!c$HPk!u>p+CXia9#ZoTzzD%W8x1l6$(OdNm_Fc(}?*@?CrkG`yO^v z!ckSesk`~X;Kj4wR;Jx}Y#BpLXPW1B#Org4A_ga~rTx(>Cz(2n=pxxp^@DHY0ve#Iwt})(zsb z7~B~HzI9vh`yQ6&Vs>&1tN#3alBBVbT+LsYl5U_Sa_BLqzL(qcbfNG9-q&`qa2%qy z)K-t{y?eyiokqz&UC64R28TGSc;7#mw=nvBGy|D&GA)sV&6 zhwIsK|M}nioegiG#dV#3Tdix-4dAy`d1l%Izr`ju|BMe;v7V9Kns2Xv9?Nv~J&&s9 ztTYG4J%Q{r_z?dR!htiGAgHSuu( zU41a!w&re}T{?Eta%fI9_1uT^&)rjd+w4x?v!$A~c}DJYxu3q*j+|3$F?@qPk$>&m&v`wat)rp9C6v?T0q*}qO&ege6Y+}X zDXqRXH_)}ezz)`!Ier1+_w8h%;C}2o{P_Reaqma|cOB(hJwYhDuKdvg)?=6v9i_6f zcph$U;icpuviDkckJl{iOyi@UyO;;)Dc7;M-|<~*&8+V!^t$f4`26nOPgdog!h$=W zK)FJ-!`2|Vqtetn_v^c)@4->hZlxMsimyoXIikytQ#ikoWQ5Os_40x(-!Ix_$M^s2 zWnV9L=nDA?6XMXP;OdF7ui6}CI&uP6sj6~E17+UM`&_%vVf$Kt6J@;k$ryCEb&8IWis<&=@ed~a<>J86(a`QlYqf&K4>yps7MfmZGEcAE zrr661gV{J!|7Z8F_8qH_Ysl)F9{%+r)7A069AuvHq{HcKy_4&+d-Ina5YDJG*Rx6? z`w!PCF9H=!XQzs3DA&Z7X)J{6gAeB3+|H}^z3{PX;#tiFV)s?vJTDxj*7_;_+CRF# zAP?oJXSN$Rd8C08I=y>m&aL&%j4l!k$_=F$8Ede&d2o+VKxBH%uqyG*EbfGI*%4V& zm$N!K#9!+0g`BPNsAV!#r*7xBX5P(=2%Aa!udo!Ly6zpbZ}HoA+O_zvcCoM5Io6=l zWtqGhB^KniB#HEirg_X7K$`JpfJ4xVUR0>pW=x%UOiZO`u}&U~uu6DHxjV1Ne7?d#J91A%3`P2<`I5}r zsk~ax!ExRY7P%%<iXPIl9%|=&w{?Y4>HbIJxkruyTVjvHIp26{XzX%)Gks}RmwMNNZxrL`a8Dn z{FN2#B->hF=e116a1MwrOmJOwES1<@O-cJZ?{QuA@FAggL{GZ=u0y#0@nL@H_mX>% zvM&tpFMOwg(|&3r^MFy=9!7RE_>xC+Q^Q3P?3#E#kx>)KNz)`FGF} ziN6-e^DB0yX-sW*_iL_!I!m+q%x^zj;C^}}fm5@&OJG=+AAX)r6)2j>oLqevT|DDT zzU45P6ia08-gSkzTmt6^fi3x;k*@`*30%2kjV6C+mPtOnklWSt{1?BBCRp1W!F0zO zg^I5Y0oiT)3c3La(Rk`Us)4(=6$Me@aw8GxrnvM8%tR&dDh@f zRj#W(NBXvHC6xKizjC(z26@4@2lA^)iq>suKCv6p}C)mz2(&7&^Ok&CTJt^ z4Db4Yf zM-z9z(qwgx%6|@|xgV8R&eu+ecNcz!S$)UHvbqa?b5}q!$LZk<@&(#5){W)R^x8Iv z$&X<(zNm%hJ?kn^lfS=&90cjeJjZDa&H_4u-OF}Nczw(%^7jpF8*pc2zhMT@0d%Ix z3Sm+PH!zkj^f9n%#Q8^QT!pW%&%0i!GjhoN@BLD@beVASH9G%a5+09>W%?iR6J{QTD%tmQ+vYdgp*LK8 z&ge`XVLD$=wmxNg7*XfVC9B(#aRk3_Z zYuJ2pYjV+3I1m2^Y1)!yV9W2@92(8tG-r}Kqup4PVJkAbR-SHxc)Bc-z zALA*4p4~6jWEZ1SXj%-7W# z6s1>L`uwL^i+r?0KcsOT^kj5R7jI+j_?}_q=eNJ)(7o(!)1}lrqwuvhX+}X@FgFVQ zcJ5OUlaC!{ggwCBoA1rP>#(^9uP7qkhB)rCWCjt#ZADWsV2J?(bE~k3s+Z*=Ea!de zXO=X0rWbgoZ==pd-xA`ZJAI(1oVjRGH*>?ShSAjYMT>kht@nC6f2+NlJc?0@>hK0K z8fi?g&V3rsC8UcdwH2UZ!LadM?CE?|JKc--opiQ%X4cka|7BmF@TUcwQ_pMdo=z}X zzL)(AULFFuXTQS4V=I>g&vaL_*>SEw=cuEvQ(+V2+l$wqJO~CYuG?$;%=W@w?j1dh zb{kD|RFnRoGF+4BdbIDObiTUDC(olhjhau|YU|uz(C-5k9%PVxvbM;J?3-WIy~ys* z-7$py*Jg_o*+>Z|vK%>)^_hIgo>L4TvSs2P2{>~v;v3yXf0A_#9hj8TJtQDCcMn&7%QTZOG}IdTOUBx& z@W7v^tGfKMga7|Bh{xr7b==bjh z=Bf41MYlex9>GJr^rn4I69pv1ovo#vzLfVzt=#2iof(=BKuw|x<9 zWe|JZGcpB&f5Izw)GN8eE9CR3+y@FDW0u#wsNc^7#XvPM^$T<|Hl1Iz-g17?DY?JU zO4Dob{G!dx%mC}hU&r%{3SY80zX!(*=N3yc-S?T=TS9yMs?vMQvc^lH=qghR` zZG)gGuKT>CDRyuCM`((fqYX{5l9>%#mj^%0pttiQxo4JR 
z2NbTa?Q=iN_J!ciC$=vHCtKJA5)PVs7HRW*(+%Bt-A^-4qTo2lJA;E;Y0l3#bvt<| z?9K1~eN}e8mkoZo_rf%kIo-1euQg&jXgut`laKKY|2HelQ}e?5Mf(rs@t6(~a1(h43ArU3a~!`M|v}*!l{*?AbSO_aCb!e#MT3>ORm5CZ22J>1sC1 zM1B-Fze3J`j8b%AxskH*G<&DpY3WK3Gx2ctEuQJ)AMi|X!V8&xO6stRp#f>}Vhfs9 zo?Up3R?8ko|!+Eppv;a`oPQ4E2dVbfR7N zIcle4FaqO;%AK1+@tv-LCifE(0LBfVWFtt1yk34bbJnNljkI5ewa%ZoCNYVp^GB$K zTz&3`v2FD0htEMGU*|7flfSPb@$pVQtTp5xuhP$zvn0EhC38*U^ko0^8$fB0cB4)R zx7O5V-?*>yIy`C42QrK2$*u5#oOV8(-3K!DDP%tp?V2F8X4X8s6Zijx_&#y;e>jZ( zRJ`f-&zy<0d-2w;d-ffz&)-iS_D*-A+WET=Z&JN*0S=`1K%?ddwj<@JbB=io(CDA4 z%00alr@_8JC(Lypm_$6PU6Vhi@mU|9>o`bb@dH|oclUxS@ALq>vt(x7BEIzoBBs|Z z^30sMmyS8OyuL#V@J(MGx!9xJXnGJ&p%+He_qAoR(X=Rkg5ayLo7g#2+;)1}vm3Ij zWfNi`M;+N)y}Osw#N9gF;{|FDMFmH*T*@a;66d+<4TE{N-BfLOx49<&h6APo-mf$D zFtM_3!&sew)4eyQ-oXo%zP1rpSBj=N0ox5`-a|u5g<&HMT@N-p7o6ZrT zI@e_8DAw#yH%gYlJveKB#eJwoWxqK?esS|H5Pd&+WivOU+{(`)#`j{2ASYz#c&0DI z2Ei^moUZw~cj_@}6`y6T!pF8VLer?%_mQ)9FpYCd2gzcu<`>oI>!OnUyCz(8hYq!YKZ789rpKA7PI~cTws#$E zUg%v1YCRxv9*{gPb*{-vNNcw4rA=*G{UB&T<|*8fJMiN|foIcIBzHRiEg^SzC`EGj z3B@dTD#@LSayOX$3`b`HD@sgoEP!5y26yYA7e;Z+pTp(PtS26GP5uh^<|6xClY7x0 zglURj&{PcBVQfL{HpN&F+a%w|L0*__T+l#n^K6PO_x4*!CZG=_ned=YWK!OP&$A2v zrfYsCWP*8w&?zq-+E!Rk<+6Q+Z=pzeUm@VUfSTe#)MsvY0iAe4KBy6~Ws&H*BZCh|A|`J=iyM{9YWdZSmsF(>?o+G3*^WZxfOKJKZdO=6r?hxs$zAF_=Loz@b+# zxg8v{*EfBfZ|V)WQLj1o*lq9d4i}mDMi=J$*Wu$pj5qo zaZTKUhX+F+gWc22N6GL7Ltr94U%$G~H+@uo7tZ;!j?Iv7-oH_!6ZHzlNHunIHP>1D zH(w3w-%i%c$NFq$^{M)(KCzp_`c!kT)U*0jMSbQcg?}}+2Hvhb$gDB80Xp}Cn{lS$ z0X)*yIL-;8al9~!2ito1!Kti?F=OOGdgoifp{49wV$j!4PE!9t&jU>F(n+mgkFay3 z4&(ma1?~cpHPAmjNbO}Wq#--wEOcz>eAEXW>XN*Bp4bQAn?BE+?#(QnYqD;h+^*J1 ziusiMXtjW6=iIWUMyD-7y?8qhW$~yicB|bOj6#pQ4F%#4xsUz@I>-;Af4dmR*5h7y z*gyLCtgpRdzxA)NzP5p7pIn5Eh2K=^5x0mxxSqx{`)au#>jT&t z@%&$J=YHSxO4r1r+`c-&OFE5Zu7O$C3MSf0-&A{U{b^{T-GRg0s_{_sbKcHpw57hO zKa$>F%}tMaMWzQ`SKIlU?5}9d`@Vzh{L~ju-CxM359<3SYZfEEuA`aG1@mX(O|)=k zQoe`bQ~td#;@TbX=@D{7ko6}RUoYyeTgU9h<~s8cz1gE*ulK=f*6dyRPqF*^ywJgj z(~`ycTE3Z+_liT4AJ{h^^v%>Op6RprdC8#do>JHt_N^XXKAmyw(vM`DYTb)sJat^| zUr*g{tb*^pjWze>FJsXUf<|*qK8MENTe;qY9x1?@(`Kz&Zrk>yT2&nt~n@Fucl<%Yo;pb@+Eoh;< zmjJ$sSmhH1eBMpKr_%E@J-59g`4&uFgWvW|=N7z)rlze%InH4Gs-8KBr)a_AC^j&# zS<0WrlYNTgZx<6px8SPr)VqqcB&CP5^JZbDlW@nq@!RFP^A<#uMY5I{a@{jlgei>(XSad;;+tUn^$WG@s z?oV5a^M`DDhR^jvJkIeldl(vi5B0;-$KHvvUY!qVD~t&ioZo)|OrEKq;&gWIZESsI zP#sV7WpD`YZow@CcOLG+CAhmg!5)N%LvRQd+})jrySoH;cZbLF``7M%*xD~s)6+dw zcc!Q2p4)xyJzx3zpQr^Z4aQDt{T9xJe8F7RzSDS(V$nS1S%Ev-&Mq)F8hauVQZeH{ z)i>AI6u@`ix#sKmas`%tjsp&jg3`Uyo)@FjqefV`nd_OaW{pQMp3q!v0MdQbWma?`jA-c57L-TK(=IXfn>QRqrpm?6E zHf#0$*#D)BrIK&l&2R}rz`N55%;BWbU-!~ayP2wQ6gtmlz;gi+*Ki;ij8uCN-DmvX zx2VEq({T_OwxH0ou;VLxs9d@CW8Z;NKkMGYkEhmQ^4mH@ z@L~6(hV5U58UQi%42X5Rt}{93`{TJ^3umWir{3ocrkGUZ&I(5f!k(Ac^Rf;tHpq71 zT&V`scObdd;*1JQs#SQQfY`8v6f$+aFhd8TN4CTttw_1m`7ht7hgfb>sxrYcqv5NSD$a0z0!>E2EHA%bTp>kzc=O zOjBvTX()*V&mN0f&6Bp&Xp;oqP;w6()5Ko>q41rW%bgP=GDLoJ7Hdq(RELu?fg*)m z(MEDE^;jB)oqeAg4-!&G*}Y-+g%k-S1>OClj}pwy68~Eo8;ZqR{VxNUA`zd64VQkf z2?o8#-}Os^uNgKsL&noA{2wFzP-wzI9qxHY`qY)9#dye=%qN+HZ$~nMd_*ccnC8wu zmeW?C<}EARMHq2qK}oRlr`ybc(QJ=Sb<9u3Y@d0=Gd6Z zSjrVTeP9++RLf^>RO>So_lwQ0Sn}o>c-24Ax2SOK;%5+Hg=%QG_y?g&?)-*px{@$)AE-xl=4`&+CivN55rTYKIny3M#=p34(yJq;6eJ63X4tFP zp9+2%#YoK%8YQ)IWk!{$#FE-uGUnRIx)b=r2^Y*DyZ`LOzC^Gl0Ho7Jd3`^|Io~WXoCXT1iNW1eKy%I52h4-g+8g)lJ)}rfWLf`DW$DPY1{O(=6_HG^-4ldGE2Ksvlr?i>?p~1~(ql%*Rw{j)L06A-gQ&mvH!E`c~wQAqV9&bH0%js1BH#04H;NQ?A0zGdSupqnyV0532w zOB#-TmNK0FqZq4W)Ayi5*B<|@PkEd&rql6j+~eFfeJWywE=^SRg|qy2k=x$<$kz1s zsP}WG)QQYagS;ipZOuO;C)=qapm%2HyY=&!hluRk%Ge=?Qskk;iegH8GX0%av7r^w z*c9tb`I2pP8sBk>rql)^3W;$>Ng2MW$tCc&@KF8og#)@Yhp#WmLWRxa`TiG)tdMp{ 
z&K=|-aW||g_xa6G5zx3Rclw|}9~&n5fsEg;0;a6`pQ<`@6oIn8QAh&)y6-%(lx4x1 zsW5g%?~{S}!a?{nM~2o!8~t6oQw>IKw^t42$nTyrmbcU9`%PiOsnrvUw4%SMGTv!! zmz63p{BRNJ4R8Npj?3h^{k*yZ#?A$z?lE!ToDlOPXkbV*Jw8SK@gvom5jK?Hoz#ez zpKkh+Nz)#ieecDd5q)$Ma`txGVD@vZygj!-qEk==IPm_+<{{M1 z;VRo|6}ZudP8h6sUGzb-?pChL~ZYqJd8=M;VZaEBPp?-su29V#9bYy`L*RO_6U` z0X@2y*HHh=JB~jPe(T`%E#cA!8>DpQ+1`Fjjp$fne8EFrMV}RNnYy$55`Ovi9m8o9 zfxn0z0udP@M=|>{W-qf@_a#H7e%lhcU-dno&bcWs`&X<3%>~h>V9DOS6F)zZY?Jt% z1oV|~!n(~>6GEMC(=Rm1{st@@{M0&@!1Cm|+HfnAbVE}=WchZXctr`5*dZ`hVm@B9 zv{5zI5PQ?_ieWdwF3rl~vZE{XJ7Y%kIh_ zI@;fk*!t|xsd?hjMBlBBoxrV99b;=VHMVy>!Fj`0KTzT8r(~enZ^bJGppTq-HK)F( zt$6t-jAT9E6}hJ4mF|PxV0Bexf)JFgh9&HZWZKNptQRLK&sK8lWqwp0vzs)b>b3!yQbN{z#V3NNM z9~X^!GGAaL^GuXqW&5P{<|>FKGt^QvuxX8)FBfAxBh{D1E?Y=ky-QC6%HjyWoWG^~ zEF1cgq9>)-g%a><{`S{$ufQ9##*;K76_=FAqj`$NPm!6HVc+E~)=HlOnU5OgCbcvNn}_>gjK z&l0jk&7W#cD%WJ$5d`N`f9Qa6n}Dds5jTnpJzU^3-_cExdlf!N93(n3L*MjE;k9!I z<`l#JSx-_w{;Ii*>2B1|H`^~(`V~0kp`CN^SKy^W*v!J zSXoZepvIZ3kT|VOQ;#l1)8sHN!5P*hlM$r4eVkL1vw_u3q+#0Il6*n0NZV3mL8qJo z_pe-ROT77URPjET3x8RS|=Dw%QO@cK=pDS?!YS9-Z^-(dEUX+O(I`2~wTQ@h$b z+5Y#i`cGWdmlIM8`vC>z%U3wXk6YvlTT7>5Qm;)v49$xX3u}5-m5nQsw|`0LO_3n* zelfc6u450KM4W9~n6ghqi`a4$^+qOhZV-_)jY1s*R1tYS#|2oZ0)OBZuiuSq9@Trv zDe?SL*wP@VcZ2s1hfaXc_9pC(^iA!tEKNd07q}JiXkh*m{H~)>IlL}?GTTu*WK`z$ zAxuo#|2fV4n}4H>BTM&Zw)PxZQZnTbQEOC3We>_uZlPZIQ^G_OpYNi0g@e%e^z_85 zTPbXTOqH8(%-s_dN&({$Wk1?1TEsT9qf4?Od>CR}go-w(Bi1&E z$LMT>5*XcB4AC`CA~ke@{5;ao2)uzd5mbxgyMkRD0|dWkbntqw7)8TpKz~@KuR@!- z9w$|I>#uNiEg6S884F@Pn|Ml?bkgST7ig32W@u!VB`Y%Twh4Drzk{V5a!}9jX-c7C zJ#>#7IYshTOl4z&VP4W7#gzp?b~C8TW_d)THvKi1e1t21jW3bBE%fOxB=Y8Z6xPgd zuv(m97HN*GxFxV?+`NM3pd4d&e*Jcirt12Ic%K#UZ7#8(41UXsYgp){nwXe(NmXMr z^op1ur_iejhg@!;_6o$;DH$OhBGJ45h^s2dEnT=8rWQ>vBtH*t>@viVqwhjiGXxdB zixXdZOzm1qG>?Z}Fr~9Z{z9ZcYue6LO+*pG{Yh?$1BJQ8NFyOF>=ZFdk^}$h^A72d zhfJd}*K0=b8)BzDLc=NE-2^g<9bK&&N`Xb5wXxY0&Jp()BtXmeUi96$Fq;ZYNlitD zCMV9?JGWnag5tKT1FDO~?M@SU`l^{A#>xbYD(tTaKWY;_0*dAjIOY~3c@b^j) zDK3la89ZcT;iTO;P2=C6I>ykY%!!D9%6)(2P2p?^ZfxuA&uRN3Xo)!t0-ZubY(Bvf zesWx14N_!TEuV6)qm{AvKK0GkEJ;W?zc!RdhF`aK35BULaSz{(d0sIkc_HhbsII-! z4A)#GRZJfBfhXd(h|O!fA0WSF4FW00Bg~U^uX4kxrEty7gJ8Vq(C3}@zcNzEPPK1i|4puuU zO9R?vZfDzm7%&s|LIO-;8JyybcTKxlVc(6`DbKb-HmsRMXyCvDZ2(bYxrD|iTxXQl zd^Yz=1-g(azE;I;R=n0V(rB=G*#xupgH>O(3PkLCLxAj%h3gaMOHGLO-el`Ry*%4! 
z%ov!60lnxs<>|N&tok_W+t3IQfM}m6EyQ?SQ{8lh16?*23YJEH|BKN%s#5!%;SXgP z0a3k{KPwqA0Im#IVz+OKmo%Cp#B2-}Y`E0we(oGq?bPtdyd>9(@=%U|==A@6-;pzK zoh*5+#JWw{0mt&i#U#D&ZAzwBNO&$!vz~xm>?cbg62<@1a;m$o!h(!8yDxWXiGfSo z`61!WR9jhAH;T8DaGfM?im&*W9=D4>HqQ>L68sNF0Lu>a<}l&+_KWYMmj!kd0$!r{ z{>+vzr4bzve0Qx8c$~-8E5ij**8;;$Kqbz@jix|0&O7XcqRS~P>yF+CyjzdqrCvzN zljWWt&EDv1!d-FcN79vcDF0EQ4T6_#YQZxm-7}m6G=cyG>AB)Q9fj4Z=2|HkAn+nR z#R?E;AQyO33rGDi0*N!;9{~^+CZZlnfU|29N6N}z+Y?@s>!30Zo0v9!L*#C=N)syu ziXggO0?Wz;w@tX6iRa7(?}IOgy>n z-3+pFRh9UveU{g9Jz4rbV81EfK&}l;s?Cv~X6>C!@+Sn+5!Y72zQ*o5JfIi6)>9KA zP5#8}-ObuXn_)C$jGd6s-1OHT0e_zW6y%5tAz*!d_}A2VZIX59d~Z45Vc}8HD9@=Z z`(^(f1Z=!I*^o>X$=H54NzZ+?o8}-1w{AY9vA5_!?3~cd5I$suQ^_FXAv7nsF>H>l zfNYDpp$?=~b)qoy7IVWZFf}U5&JScWkUof_Ec>BI!4S@R?rH!N>Q&1f=p!!ftN)ze zMC2}97dP~)2nm)J@!Dh;9jY;tT+*f84#r*)aFa<#4U%&6!J<}Y8*-4%4ww#j{c}hV zAL^onMYy~43CGnJ*EeWFU9;YU85aV^r5i0|hx*4tgLQn-FPos-^js%M zC?r6QG-M5p7B1$JtL&UBHLU+hsoj>FY;6Rqw@9bFHt*omhKdXt>|?4AhH{S)haurt zMP8@4UFXcsxv$YLxgmH|;T64sb!||#7($vCaGFhH?qN;hS9~P>sGo`FI7C_we@buP z+Oxff=!DxZ+kVQJ?0ckz%?CR5^JAr~4LOGk`&m=c=?k}K5Y~y zywu@O&|lA~I@ny1A%ZFg1D+$1Fx3!5MwY!;Hka&YiWueIBo>kIf0~P*M810zZ{Gbu z%e6&%UA5w58_>3rU=|I*GK+#0koZNaIvyOXU_8%k@x?T(K~Wte{4;y*FKchbS=W%i z9Q-OnhQ|V`-V?ct+9M;gHCATV4AmA@Sgoz?5m8ecBm^Qf4k^}%hn%oxL}QN<*g4;n z=ytHD)IPUsJE*cFO*$E!A0nXK!ffY(6sU}8zf_OnVkzhld=8VwMzx;v8JE9g`4-}M zS-$|28i)3$ndiguDl(H|C7noi6sVS`{ zRtl-$$mA`}NkO#-gcXq}w@dZC<>B*$qR94qZZIh0ufpPT;8#j%gO`!jN)_kvi9rX7f76~i8Z}})Z?p8Qh&!a zoM~(qEvs7)mN$Ux*F#uRMheBpnp)Juz`Owq3aO|*dy_AU*F&7vdq{VjZbw_7d5RY(&n$!%q7JOEh zxo!Eg4aJT03dJNWn{lk03H2A2nnHFj?*!hzm1-AJC{x^EvX3C z{?ta%ad}IRYUBO1F%?cs*QKt#YFYpk{R`1A^D*qj@JT%`gkI*^vbFV?-p_NZOc@hi z5^CL`Wz9&TB}(2cc7RIEY$EM;*MuJRPP{tP` zL6>7fhN`P)kGQ-J4h^JL4*FdBgN8(1L@|-pno+FXDN?CDzQE|uTKQWb(yxJQG|6N8 z3FSM{&r6H6d5ZIf!Q(YXP2Dz;u_RGniH(We^B(P^pzR7J#*9~_f512sW}$E zr;q&_mpVX-5k2?L$gQN8Y^1ysZa>Lgi({zu`Z?k>_02RA(&wv_qIxRNI5`{nllMx` z9HvU=x-{t;<0fK(kH10ekw{)x^Hl@sVFjyN^Ywd?C7klXO)*oq@}5OlMv1Ykg3hk;f!Qf_g*709a)VqMiE<025t50r6~puW>zyJ zm*tl;aUiac6wT`BSQ`;NVAWnhS~v;%9(^aKUbbGW%~N3`ApjVX1V2E?LodIqYhHr4 ziY^x$y|;2EPQPBD(Jmm`mCoJuJ9yvB;m~7dLitKYa>CQ0QD(ZbSRWaepSX##8U%n% zq3?Yg`KD;gz~1}?KSv~JKG*7)riq*01gkOJzQNp-x+WbR6zT@~A+pgNx(3b0F=QSz zPRa?D5CPBoEUY(5`wt5!28@<()i|`yVck2hL3pgxLaNf)G;fyM#037dm(durRdNAt zgTMhPS5+EgekR|MUi(+t4qeTD(0K)3G=p4|Oa=^-);}Oy8&2Lkj;#s1uRfudN{Aki z-M0ef!!7?fQ92Jg`~Tv&oqBNY2QbRg@EGYXi=YH(LPp(>)PXy4)=wo#*R??xsv{qg zjJ5(^8oyHZ41GNr$#<$nerB#t03>$J4>mqn{02$CwCpzrlV&=9vbWDea4u`Jdi5@= z5;ijrQ+7^YS79d#USTpuE>xo$rE4`bP7PJ;15@7Bk=wf@Lz+h7K0LF*epW^(_PZ_h zqbdZhHU>oM%K%B*1~bCh5OcrQJqP?aO4<5Acq4`ls(Qd3tl#YKHu-`X#~a5!#)JQZ z;@qz%tDmcX9I+mxYgOz0dIbiQ~^*_|Bxvr(!IR?;<`49IcyB8;r6?+6UC2N65R?T4?la%P* ztP3_i0Nt}vi?0B2YsMS>N=6cdvHxGET+)WQIJm1r^-+~uq~Nk(r842|%SgYn+yh?{ zYf;>jB^{vPHYNd8djgX1Wcx9d*Om=2AF#3vLfjO{I`41xrZE!oRT7K>(0Vq1&9ob8 z#k#zHX4+5{1>&9pF^WQxH(*u16f7atOd00|Z5Z$LNScUpb6~CMKnPlM@0MIXE!@q^ zH4>?BlC!c(x-(Dnz`~|yn0W9GMDY%QM(_#UnAs11m`&9T8PQ0FkVK;J;Sstpvzsx* zGg_Ph{2oG&#zaqkHD4`Vqw5fZ=ZF@!s~sFXiHxpFTh_D#q&qDfTzyT{?Wnr@)ndHi z1)i0CB@eP14yLsvJ*raH&S{FwTzYXUE4+4%yIP&}%Q;kO9UXxI6-OpI#0=~iHN7r1 z4kb&~%RF*VgPZnRSPOye$k}mR=35+Q69#}9%z`eo5gB3H(&R=ybr4vWF`EF$B)nT( zt_F2&p~U>GO*kx4UT%dD{=>ENSv28K6=?t;km~6^8_rVNny^}CbRxU19z|xxB;i-< z7o^DQ`8-Ne>UW;UCY8NAIWh}@p@ZMYRJ>x7E`IGqQrT3VD$zzQiU-b`hqqrc`$|S( z_`z`(>F0cvrN%+Vv`S94b&L!t{M2V;=I5=NIn4diYvkg;nm&h}mzWMFnlxq3@bjq4 z9kBZ^Kb}x%;AYgHzJo6^^2Q2;Fk3}f8_>Yv$2k1dTmin!&bM!v%8URajb*l;-{aCn zq|eLFP(YgiUiX^P&3nva2Z`X#2ac!wH5o+HD zA#E$Ax!gxAl}M+{-ZzCP7SLOtTOIqEzmcXvG6+q;y$Bi;=R{(>WkH(($mgsc`?od&E`uOBJZR(FmA 
zS@<`q^xv#ockb+65An&ZkaH8TcAb2~i1-nWN;lo5gv7%MuI^4X= zwe0Qi1;+afiLL8-Xk7QuioTBx_W*vd<#kvrK&LDG7o_s{E%p>F*7*xbQfj)aFb($u;=@ScBso-#gjP8bqUj-GUMR#Xk1pK;>raaQs<+fT3b^~zu@N!x@ zHtk0r+{tGKINLlxSm~i9h6pM+7pWXF|7V3Zp-V5SW^-*QBL*(KC?cugl}fl+{h?0c z#nydEoaAx0#$I9d--H;JlVtT2^Oqxu%x=+1d9^6hZV(trkA~8#qcqT=Ch1h$ztmcH zc3eN2V~vc>xAheLH};Ysqo;(BY4+%}B-lbcL=;ajCjOwgK+=W1@^XRopWiC5f1-X< zraGe$1E^ez;IqPjQ9$=t&1Fm7c)cNl+TZ))($WS$BF+Q}51OSKzYYgY?(}SLTIGf; zaNnScH39;x86aiUcUeo7BjNRW3GT1E3DbT2?OEI;j1H4*=`1~Ax-MLJh~>2A;mwaW zRg3`2l==4?uF|=RQ~D zd^*|~S3~ii%TbY-V{V{>Ebxio+;S1uM6~sab-dV4R$FHHMc?_^Jo&5zM%TG#{7Df1^Z!^sn4 z{q9yVr0k{ngRrs^PFZ_6N7YB3H$LekpVH@*sr;}q&H5K}5l2m12g_&I&i^Y{6&^e6 zwY_rnO9+DAoR?mw|7LoZWY@H_Gqg1V6^JMOXLZ5t2Cn(q2B6N}`GF+|5m4!Mv<2&> zim-5HvHQXbLYuhN7+ zy%O-qs-S~e2)UgC>V*mH|1=$HjO(I>?)@r1Gx{V4CI7w%=quV@rFa<&W4`53rl9Q> z=pa|f3T52qiP3|J2RW1jfKra4I3B`G3;P(lhU35XZV7BqQEbxr-YzhMK)F`fIb6G1 zcr~@OfHj;}5W#-7SCnbGJep=A{E`H33}oz-V`9Tqu(O`jpxRI+gyb6vTEe;@oznQt z$kHoqhPhdOLm5YW0LdJKzidOd-QXCPNbIGy90|Gw>KM5rpvjsVAU?Lb1Mz)m5Ut2ai}vq$)|vHgo%9EA3C{#b9(J(j)4pO<#z$(ze3(6~a@ANGl`PsKPObpL44 z+?U@YpdExkT0 zZ4UoA)bF)Vq8|2aHL1d1C=IqPZL|^cg8fopX zFJuMX-W#atHsI7PdQp)zCEtkgO|<4l#o}+t#a-~**!~C(>81P4Nbl;VoB`{!^L-s~ zG5+P}eT>8~8QqS#BVnq(bT8vy(mh35BE9Y4^_neKkFwmWe@L|lbI9jJTSEU4(mhd* z!7I=v_ff#f&_FxJ1u~2yE!q>F4u8vun)JyQ7D)bH=330V<{^;I)0&8TpBN!PMc7Lu z6{r&hBAv`gejaHY)QeiumyAqSO%X%1X0}DdC$Ti7+r(+FA}x1X)NJ)|ZOa8IAKznT zS-t$T!Fi`bX^A!3HO0rM>SWn0k7nXaBBP0%Y8pmq^%egdBZW&w?X_`}9pIlVeSCan z(a-ubsSr(Ww4Ex4RZ(*Su=jD>Zq2i;MIuKO=`?mcXX5w^9Wdq33bx#o#)%N+wv%O1 z{Jk>cf0lDiZi~_FrF;ELZ3micoV2mQq)E`}?TXWf=6~WzV2s(cI!oY#iXHn@BHyuV zeonn0ZQfZjyB<@f26?~Iy#?jK>(CO2MW3N%cmtphaCt|-25y{u3X!B>SGAY_bRbGH z`QE%SSRY_F#!j9rE?N5ZJvL$6c8Gpo^C>2gJVC)IEE&oq|tB zZk=BWnR!uYM%?N~4MJ|!(K?nSh7*Nbrg*ofe3z7x8Xn67D>R03dkT)NIv77!Ty+kf z*(*93ahX_DXsfaKI~~1aS#$hZm>rR1Nza4_YM~rB?oH$mVV$~ z3Bn*+#n*ms9Sk=mv{8=S#~VupA7y=-uR|Poe}gh5Rs3x%`zm(CQn#5X*C6mZ(=g0D zud=?cGmlWS$Q^k!SJ7h`W`sKEy0 z!J51m1o#k7$08s309pfm%CxD~;BSV+EFDzfdebD)9!D_6RKQdlv3XQ+XA>$uiO-ft zqg)R$P_gG)#7CdiAu)ZWN%vkOnKs)>$csUWRsUvc6Qa?7*{!)(ay2 zlQ8D;dZU)K-PtNNT#%Hp=(JcOR4^4Y2d-965}fX-F5Fldt!t0P{-)S7jY1(AreTk4 ztwDQSNB))`T>gsY<_`VTKR+g!)wG>O*-XFCOR2QD_GpoBTYy*&X67vL-zOfeF~$|g zP47x4roH zmI{vzjm`Cm5XL&nhvPpck(9UBiJI5Zp}*pn+%bOTc5OgX=5je80 zaWEVB>n^QXjY*{cY;V|dT-ECSQ+Kn7>T!ow54ikNOT+DllJdXmf38jEpV*qF1F)YB*E|_8sZnC3=^KO*s9cj_ z0}|A7s5k)DF|wb;cgH4iz3u0USx#@SNXDiE>diKf(nho3{*6@xLYCu*Ind%?xguBirNj3maPBQ~?!bnEGN zmhQfGjb9QL#v~N*KarUTM|8yfG9D19UaR}X+BFt8y(UyYTsTTNA$Pj+8g|J|{JB6Q z9OC_`JyiU{kt}sDIIpHdV*mMtSSa%P1bwv`MGxX)l$lwp_p0r!`4$D`G#yt6t*N=+ zV(eRg2kAWy3OCrc`nG$Jf6@0-X&?(u*noBOA{{ON$1tjB=@&T;5DXP=7QL;Q zwOeCMhM(lWO9=)&Y%kqo13EcEhl5TaA(z3}Uh?|C>9&3-_O^v@-jw=u6=z{I`jYXa zCP#hsw3X<_0p;Z_Yp2QgO7(XxiayZZnVFi;+pv=#&=aM$+cBCScGT~BK%$w0tp3e$ z$B&=8=tD@tU7q@j_isPY{oouVk7Jdo*9*J-@J~?Zzl@inb3}i(`03T(45sTnF{X*} z@NN3ae+gq*H<;to?1t4AmtNRXrrjC%FtdN!^u(jA3nS^y9EQZ_Y}W2C@EPeu1%9JjoM*4hT1nk)*)`$WS|Z;+_eXP@STlE#rZoPvo7&Xl zp+J_YD8B7|a6A|{J%hWL{otkN|0pw)%*Vsnk@wh_my(o6& z#!Ezt#%yHw@-2)HX57l-DDiP1%fA^;1s+MxtR9|}IC;*5T9g9^ei1G`uI|Do#=^>e zzr}3L5IJwAtl6Aj9`01hCd>S^yTXMdk;aov&4pLMFaD4vNMZJp!gP=4sMBcM#*+D` z*9^~%o|*Q2^b~S+ngc&JaZI6cp$u9OfJm*#pS9U51XRa_uLBweaoedH-gKEEx;_%X z`M1UyhiEfYkpZVXpF9rLusYW&uGr<{jyTj|&imYYK@H**UUDs%Jzy~%zT1Zg{lPGC z+eRpT+A%zfdA74M#3EF+hmgFp1?K3U)5^>?AeL8bK&wSAX*b!Woo9R>5<{)8SYoLQ zZ#o?QieJ1*whPR_aBmJpPP}@?isUu=N=8j9PcA>@h&&R04*}?>>bDZLqS8hHGRKdh za2&PWpc#21%0p*#*G@SX_x(f*=*|{E4#k7_d{6+!rgeQ-{6775$!;wi<1Sl4g(pEx z&rY|#jP+m%;JU`E^XtB1-Zft$S0nMNzuVnCzcCV&=2=JNs51|1`W$I3Uohy^=?reBR*e{7?a6;Y}4LtX{ 
zPWh9$C9gmYGp%t-3FxVPQoz3*I*8kR0^1{N&v*u$^cL+ZISZa2UJ-)_Karl+wgZS2 zDiXIUHIEoNwR2G6%tX@-#{l#az4=S05!#X09oS#&IFjFMKB+fWrZNWZW$k&DbxoP# zlEH;k3bnhN+*7CPowG@|X%>N*m1>J;LIZEssx5wAgCs*;d9wuv)4Q1S^XBv8rM#f8 zOU@|*=c%UXE{WsUhZeUtkdH=rPf(`kD|B89 z1DD=cQ^Oaip|ha`S&b3f^&nWyt@A1FhT!q1?pk<-`_R5r%!dZ0$)XuAZ(i>=K^^oD zQ`oZLZUak+k=*M>;$yXv%;M#B+6h)l3(l<6-2FSuZV3yjPaY&+cO&}K4N>cKX0K#f zF}}%9F`Pm*0O()lYm}2oaE5MgeJWJTTa<5p^LLx*C1Y6J6Sv>gx?Ypfu=FTM-4~Usu>ip4mj=BHktw1vfvzVe`57isz9RZTjA5mLaE6caG^x?O!5bwW!KSY=dll};{W1(xafytG!B=c_{sNU1g zdJ8h$h^aJz?0fz#vXu1jmwCxg_&v8hF$o5krOYPSh(uWZZShZAz68$LNuBd&6s^v< zE;|sMPp!DJWTFQ)jO&B#M>4}V5Kl?Ymzz$f-`)5VC!w7jwJ_>Cl4+heVr#7GGv>J1 z+}$e~ARY_QOy{2Jm*+pxoIry(5{(sFqit)`#aIC9Rd8oxO{Nm?@al$u>>xkt*_9u< z3)Me=c(FcVkwzU`cxC|>`Zu{hkbNoQB09ydFQq2w4(XN+=sbaF!4!{#ELa>**Tw5G z7s@9VqxMEuaTb*eSQhJ6+1h)B+a*NK8#Q*tJ5q3&vdmDIKh|GxbaY339{G=EjAQ^& zFWZLl@%^;eE@yYtzUjgn(-OrVwlkA=d0Dv3mB;_-vu|6{QaB6ovT@vOHnY?61@RX( zv=E%5-*nJN9pFzJj`TRjmXbmZBd;T?k<=ktY5j4AlgFQ{j~SOGT8}kr@;=Ru-{75e z_bP*$Fb~xo#4L=50pPv6F^E!J>64nG4hEPk2!j9AKQ|D0w0|2Z4PnwoJ~{Vn^y`RB z8>o{#7q6u(WWAMtJt^$3i#sZ@_ylGdcV!hPKt}v&%@w^qAMOf|qJa~@LXtOv3GCTf z8#>O+GpotNYA1+m0jtFe-Ba_I5tc*z&nh*}EB@vY`GuMd-}EKxcPmH?hvn1f#TSl5 znGmZuE9HU4Y*;=M!qZ{y@FRF^!t?$+0l#X+k20RX&qY(|6Z z0kh+Be5%w`ysqA#kTEIoAZE6q1L`Q(0Z9`XkeA$&f2OfhUzcyPO;THwDx(5 z=F9(n@N*iQ>u8I#T{*n~ci`r=_$1Tbh+Sd-)DwU&_dj*dm1kJk=39Ief1U!VE<^>i zhi<_(DW-_n0?pI+w}vsJhd4Wzg?noB(|9IfRh_p77jGX-NMq94f0IYnx~Ec!|FN(t z{I>*)_8ke-$Sad&&VM-kuJOrCPUrpEoXMf!2Ghyg)WELNYjK#p30jWUMGKRC6m%?i z3P^3MAQ#ph#kWn`7KkwEs^nl&mWuO=J}Y+3pJZurgCMvmjCg%Ckte>;*4;9{C%S%3 zT5C;gftq!ge-xIzvHwZDq2H95_B3vu|4XdmJWsB@Eb=nI8!+xm8?8Q5R3GmujFALx z0##&PyxNv^loU08pN%cFd9vHeZ2*S$A{Nd4>-cd(3#7TRgg-B^KU+F;#cofu8SV-~ zUN(Jt@7iSg@u0o8qC2w00%ipyxN^GX>w#du>b%*t02B4obQP;l69n75M60nbvQ^g1 zBmF=2?ODc^G>0HdLD{o^zRIPSW^)SS`~;JTuich6?{Iv$_WCK=bBO!l>Sj8g5@m;` zu@@U_Lh!vYl!)gfBNty`B#&rvYXmaT1RlBlo51^K?&NJw_+X0-Nb(bU@~dK}|_ zzMbGcKc$4F|HJPeYW6hUXI!Nif&Qt5hchRgLY?SZ*4jNAON&MbiS#@M8jY&yCDby5f)Ohjn>7iz@s_F`2FvslL{-}sr6$E%y-tOZDz zDI=4%x%wB=w6Xck3l*KuB(s@jp9F$Yi+i-zJE9Z3F{*HZ`d=)>@tgmaMjN#u;i+rK zKH;Y9K1e@1GfT*^)$5GEzY^r5dQZmrY?qS44V$bP`O=pc)(*R}q`4QbDhl(gcaAdP zBIubjiF>@D;m-lI=-}eYy^JYi# z{=F$!3U|wzog@?vDLXPLIj*vLwWv1%@8sW6bF>9TRjs{U*g>2 z$F}h(_Y#l4jo7w4msu^{&)&U)>mhn=UA6>Ht?pk7j4yc#(;N4amaG{<2LdESmb8j( ze3~{tVkC-48WM+^jjLZl{+^}eRdv*UW$Gy^+AO7%xF68d_i-VT>ThqkKU@tNr^He~ z6By|~nMWNDFA(+>mZXVwzDbLS^XYkOa$hi^0u6yvqLy=uHO6Wf{ZVlt5^T^Xl68P= zAF)q78X&niQG(^5me*wWr<|}o^mu}kWcc}-OUZ!bO5p6+s{77a8%N?|2=Boh{bAFu zmZ$HuORCbvnt`QoT+^ZYVNt|mhB=sfQod*Eu?^c5J>x4doMm01sX30sLg1CMn6s{# z?cM`>mecEQ>s@;jc~tPeAut3&e|S`~>NjfU<2DmGT@PkQOd9e2NiAJM`frtNG7gTO z^CL!-LrvKzH`~XIKyXQf`Y0Cex%4{(MX_=U``RrebJAtj{KExng#;T3sN`#=Ytxby zWxvv4uvzQCY{f=tS^#-*s-HHjQM_yQ?4urofV5y(&CkK=)3ikD^bh{wi$OkaSCVii z7hk^QrI)p3Y2?t2cb<-9H0YnQj!L0Bmm{;q$)pDEPU@5sPYz=h|6^I49BE>^Sl~*GUwgFHLVrDAV_Y-j=>fR34Jl zhb3gT2TvTBde0Dq{P+oF$GBIUQaoAwu4@T6F(?jFTs2!;fW`f3uiHqX` z)3+A>okaH!;!E@wh5g`74vnlLe%hG61>Vqy`c$&`B~ks#mS3m@A<~&eNCQ7bp*^{! 
zjLe^E*D;evXYKG8G5LJo<nW@})5i`s|q_iOVk?cI_G6Bb~yC7zUN{=y@-nsS7mbpfKu3BUhD9xc1EqEr~2 z@IC9J7@}d-@U+9wFftB~x1kJx-6pp!?3870&YHH8rb2nAA}R}kP??oHwUAUUwhCLy z>G;(#rGkv54Tmv}Me&6pB|kMD0);zzWKDZCGN-NfMwKpsPDdwfw8Q+WbDKISxHTa2 z5AKUZeSegz~jVFv5WLor~x|b?=v1SU-+(`aTk=g zr0N_a$;vC6dc48vesuCAW-&KoD1r8~eX3xg=IdbF7Tpo8ld&TR6%Ggi2 zahkY(>uObXrkI;RrwwP>_VKWz#^&wzXH(Hl=t9vD6J7?*G|pIk0+sz-mE6%Ayg$Vx z{<6#%Bp-zYKAy1pTEg__ZCXM5&pli|UW&q2Jv=F^(~Tm60NySq7nKD5Z5XgBzHIa~ zUea6NA6YW*EGi1Wp&u;?ACiyP!8@Cs(Og(Bpr9tI5X=T>4$WVnf`96lPE;^chN)`- zY&8y@^|Q3yOGkUG2kVRd0c_DwPd!;|5xkJiTDk8Y=ttb5W8w5GoOgG3&}AfO)$;R= zwyYom1#LS}PI|&KME<3M^M{=2-rAZL;BZ zN9?}abaZ)>^^M5Z6`f$?tBatms4w4HfJRxNhxbG~vieO=xI=ipUQI>_l25$Z;^h-* zgz1SSSW-rS_%labN?GX<&Rda7@@pP-QNk~S+#%M*A7k_DQ7jz+A(4yraqBQW`DDmX zlrNU7x0>*gKqh(mJz!l$F08D2@~K>(cN_DGHT;E?Q|9F_Z!{_pxXI!g4(%4&iQb7| zQGdB$-bGkv_|3@fiY&|S5D;>7ZnTVNx_tk&HEGT|WX4oIsx%3hsRDFnWvH(IO+LQj zasTN$L1q;D0i4v#gg@8$dhRTK<9)h#%djx`rgW`QMwv=_oiIDw1AR&D;d`TGI_=%i za;R*B$vmQsMP5{#X1?|bkQ`@OEOt(WWsNKGF3o}VgEQqD=R58%njP$Yd($|cegR`I z(ImbS>wFn}eMxkFjoxj1v3Dtw30mwZ;cBpSqlyGBr#!x6-27;r*Nk2A`Vp~*vW>sd z{avMRYBogv4Q*0HL*gv;?H4yk;d?WPQ!2oW2Y6CA=BZ90|IgoNK=g}Fj5&WZqN*d1 zwZ<(Mrnr5jn)JM&&L$=mSz%*{6PoPEz&;EgJ^%1AC2Txo2e^T|O@eX))bzLx3cX{o zKRsSXbX%?d`=sVYljSB<*%HPhCZRCmfjGP$=fE|0jI_BCV2O3=9uL+2^Dkm%|JiKug1p`C?`bBw3*=L%~{;|H|%dS%Zz-jhCa?=AZFm zj_gi>`O?^f!je0A<{b$VwJ**a?30x-VSkeF23SYxq=nudaYn44F{!F(GzmT& z)|UmHNF22+szFVN&Vj621Qeg>1j3t0&xxSwCS?(DreOu6!BSYm99d^Ac3vWVwxhA` zo%)0+nyU{J0m75sZoL{!5BtGDxo*`7zB;0K`N%F7(#Xi0@tTQ0Sp7-m+(Y(lCmu6I zK#aJ4u8ok%)B}bAmd8UL$!fE%6YB$G2uAlW#Y6Ii^+kF+#qsgOll0R!wAxXO)1r8t za$2OORdKC5yd>aW+dQt9qKpUV6fEfCHejy{`;+t%C-;_WemK86e^XBR`R~v63pk_$ zUc3t_t-VhSznGn%yE}?v4rWYNf9cTAA^?AVIF>SL`TJ-(FMPgEiy4Fa{YQ-%wPKMHRA)weH<>&-B5Scbl`pS*LRhFe5k4x z0G+X$%f0=VAAU#@8p9l!b! z7O!p*=Zy`QPi2#ob#B;91ueb*^VJLG$bC6@O@EnwXTl8q@(d#HE%#!XM}}p2I>i(E^-W{jS~15XH*NcVR?$b~)|zLC20R<6e0bK?(=!Dl;ee-~SO)lSmpAcW zhlgFXfgEV5C23}3$@-YJwS(b`_TXeqsuIvi@c2nsx<-sKCn{X?yPc%j(K`ONMQ>5= zlDk)X;Xb$4*UzE>MG}{cCYc{x3uiOsV#9H`IsGndwZrx3Iu8^5CC8P^busgGxHb6; zI>KFc?@E-vs&;vY-rc=XvH1iB9u4noMfS7S*Sh5Gq@mAEkIss9=FZ$Z<2>^~r+{DY z(+e2vZjhvM2g!#fZ{8WONPm^EsH=8Y+Wf6o@=n%}l^|atfA3Vkv}7TtpU8kHSR#Cv zN7Wh{x=ZYM>PdHZRC%ueuNect1Yo-x8`w4j#2}01z+9+vlgvQ=)*K|Ibjv#$z{qu0 z=f1(y`OUV|!qZB?8ste zH2_MLYDz+eivJ1ADP|I~L0fg=88zFa+iRRThq7V;4ry#sUH#KCXah7=#RgOHs^iyu z)wDn*aH!<9n7Dt_b8t0iJDr{`iVrYh@prtQnh- zAK6!0)*~z%j{P8_5q{2MyYxHv&si-zwP{_@b!5QZ8TB?VqS(JJqhc0Qsv>zI8;wi6Okhob}fe7qbL}$pcJ!z=?@GlQZ_j`>20`j0RV; zg*CmyFq)omeo2^2_;!6yeceU$?iWveuDP&8lq0h?X3ul&J2(Cyb$3ax8R;r_^Z>4 z2`pN0Mb`9Uj-&fKz%-63JhnS?((BaGyRg8{I)Ro&_&VWkb+v7u;CEu=h5Ys$9x%h7 z!l>t|PHVMH`Q`ybez5QfCLRWI7yTX=2_}i+ zdf6LH0(7b;iw-ps42%P9y;*3P8Ua2QzNH`*QpErC-suB%V@u!%S>L5uW$Fb`PxTK? zi1P*0m$(9t-H=S|2dgAQWBa}dfKoOZiB@Dy>KDy~nQ%XXggfQ3FLSsK&b zu;$Iq0>Tfwl}vG7F{Nfr8@!G3Sq(E`hRd65w0JNc(1{hd(Y=!N>|1qJ(~3AETw~tu z1h_Qx$a2>BiG@7TnJkr-ZT;Xr&i=-{e4dGf=&~(y(l;*~n-Wm!587|ueZa+L*@YAC z7i)Xi8bo~+($gl8F9g~Mwy?oH7NPGiwm3nm=VwscbS$p&9GD^dn`{a7;q|XWB$KF@ zTGAF@liO>p#j_D~>jhq!C`>!r^QDW1V-~NrjSX-j47S+w%-)Uu{P+nU`pDXMcmrv+ zym)e;=F_waP<)s2LYt{HcPkx1EckeW)b&EnjwePsNH@&+YAJyR9Hk{^ySR4-97mvM z2l%7Z{+8+U>w)6Hj}Lmb?1nxX+PYRvM7zCu4>w1g&%d5!CX2y{nqioAhw>I-*E(hP!?|hz_gWv@mZF;jqDx@U(*X@?i|Ll&+gO z7#n6lT6Ej8oE{ZDgN~Yc-(nI0>}Jm&jYGJOtSEzjoX0#IxHwhuSyz&BMNm%(y|V*! 
zT|7XzoAmS5nb5@9rKSHyB=y4Sn265ca=I#D!v4EBX5Y?3~Bcg>W9Rd zIiG-Ku#>C%1_pbKX_TPhl*OCE&3QjUQj|fyEGy$RGZ0AJ7wZSwYr&@JXyjJWg5X~1 zdIJbQ-+J{rMTf!G=J6epo;e=6QSbopy25r$bB+-s+PmVHEKQ3`C=i;!*3O>`cEs6e ztH~to1)xXuA=X9%<9r8OmZ-TUg-w0({G??ZNf~?lDCtC$&Tt~PaMb_|_DooF!(NA3 zQd|E2+6isgMeYvHaqh>v0T+~nMOM5@cb?guVSf4Hh)pXg@)ywm&f=k+RFb#SCQg_6 z1YWbAy4R{EXZBlXg0!HBC)B9qXTrl;)r4G@SS|eTO+fu`cq+1Nq==6Vj2TXVXOJDuw)x zlQ~~bq#V0XRQ!%x{E0s)Y*AmnkDGAs`_3F_%mLBFbX?TI>e}DrTOE8i7<=y2Ym$Yr zs6j)%Rq!j**lcdPI01|(M;C-!%NL$XX`3*>2<|3wU&CCc(RxCcG2C}kR&g~yoG;u1 z8TZqo3EP0!AeNBKwMRI*e{@hGd6IzUIY9ow1fK5(Su>;uJm>Y_dKgUubt`nvFE-)i zxb-0w$XV#1w^Lebzm89lY*UqI-fTl?A_Dno#4WAt&2W@w2v%;*n(k@JuIn??AC zbqfkt_Om`p0&^UD0jlY^%SybCJ+K43;20LokEp^b;kPRjBiDs73zEAnd74PGxqMrb$kADZTCNeY1bMQt;k3v)Q2SMwB}Vr%E*$h z#a?mRL1Q)Px`>)6{Ry3j>-(}{ly%OUKWpZN2b-+lUVx26>+5t$=N^?}2O3u-6*{$Xe`G+Fec&h|QLufX6XL7{zOD&f<}cgM%I36lp|*14toJ4zxVp>>iXE%7n}2}b`0kQpKH!2J9=zj+ zJEt5>$(wT@T_-dy7oH8en5}5g-+lSO%)OjNC~ydQCs|K^M&3~1uMsT|0A&ZMWZm65 zSwuey)Q-#g@`I}zsvU>Yd_Nts$ zjON26dI@T8V}sd82T+RONoGg~InC8f4SGC|7a2s~7ejX)Altlv>O?KoCnLL3+Q~(h zrl>~ixv^>&COXuZ*&o5;FF2}at`tsfm`RH={!ZDz{!*u3zqa^n%bdK)7b+vAIULCm z2fcgF1Kou!{}k#(b2{h4k7MhGf2R=%e&~W>tUKtN!&K zY+^W=YVClV)Fj2thnM`Z_iAti7r|`Er~s(dz5P+AI(6D-eaWm%rJus9)K@`Qtt{=DosB|I{N4dTnLk5(}=Pb8 zP|ZHE1mhGa0Ch`P=0`b84$Z|q@PK`oJVx;j;pHsXzYn>M-8>Td(qn1tRqAi!w}6Zg zBmN!n4egnG3hHxNEjeDXBZ`5Pc-(&-N#;C)V8x%Mf>eH4QDMooC2=t+;XF*Qk37T2 zvAV1I!?yY6g+36;^%Zo}?pfP!LLV7 z9F%@G4R%&FmS9FpCZX3^#t+7BM1IKA_(MkjzzZbz#DRx}+wA2xKkJg0 zf(<>1tSau&6_+&&zenw?G(%7KLwy54`Taz`{P5qKQ^j-hXyPu^kT*Z5uD=%7?A0i2 zi&ktD#3;{+=A4XfJ>`SxYAF1DJQ6#|zq^*Gk=&{SQj6xX`n)DAV+`#y7bwb!d zpSOU4q2`(*gpuu5lDb+cf)4OYQTyI|-u2D{5D(uc?33&_9xOQT6q z*e6!Yfzz%bLn+Ct3II8 zmXJdY1j`!=v6d>M_SQ6=N*2HdZYS7y5u+2>Ug{9~Or3jNa&Nq1uau@Os&)u~B6mep zMkO4K0cVB(<4eLcrpef0c+k(HbSgqQND7lQ#`3E1z1HvCk5^nHR znUstW0+6?OVTgOSnxV@9=JV5Z>oYn0I`J*D-4G{uH^%i_3w02$NQJ`h8>~JJ7Mw>|z#K!LEOA=ZFhN|~D0XNT z_Nd@@Y5kIO2xK4_v|3C*^jsypz>2Vj;HEFJ1nby3hY9(&qHY=p`6p87U1Z8#=K&6JI8tcZ;K5Z z;2#X%3n>Ryq$oPcKl^>Hl5vhJVq2{?#&xm9+B)X>k(CWgBHr5q15l$Vph`oaaD z;W`VZCWC$I-_34VbP7$`z8P|9_^m+yc~E}MpPjS%hatAtK0)2tvR2#2&d#qZ%2@-~ zEU`gpZ$*ZfMEb=TW9v@Hz5F`TK4rXB-UoOZM{w10`YP$~uCO<&>3T-t^Y8c;PbU6| zf;t1B6-&N8a{FthAiGTf6hGz`12`7Y_(y?iPDkMV^Rmq`FQyPwegn3v$r2}=f_;PH{5)Ek%7yzXB@Pu*juDKEOu zG0Z-K6YGs~W2me#m3g2G4T&Y?<+)RfSi6^RP1~^v=iJ-}ayCVW-x6+iaKAvl0&yt) zy@t#o4&&XuvU+9u~eY75N}H)s+1=gtN_ z3`ZXgC?-q*a?)oB*P^_YF0ZsCdlw!E(F&JWCA{nbN<-+EJaTkv$&SAu!_vZc)0?qkB)35*7s`^JDSA#2E?I&ycUu6IZQH-mnWUC?XoaB+PF49h?Qw<+k$2|_ z@!_XX`nXnsR_i=yh8RAIiTPDF>F!cLUN;CMl{7E#HxAOq2OP|>VUv%so3l$oo0U88 zg*V%+w{784`eR5Df3eH^$W8^HFUS3VVKIz;?WfpeUYH(YoZ1fC@C!*UKxqzK=@NIT zf87`XtEug&l@I)U&>=j(WQR-^H=7ca7}BUKgh09H9%MX-H_Kd9BQO`JV>VOd;KhveK1*4En5#Plzdl!w|daPJv2_)7VDtmBi1-719neI97)clyb zWq?<9D2jQ>s37h&u5XTe=wOZ(8WP_i3@04Hb`=S{iixDmx`IScWZoNJ+GL$>mPAJQmSzzJ zM_j4_C9IQ>;swMdVz2d358#%A^v?b%?H~A0g~7pK;KNMEmjIfw;()}8ut^o6?kUR@ zGFQVya_?Hdn$07kv<9ZhT6Ess_)8$rxX`M38w}CBAv9rbp`L%Hj$Ki5SxTbur1sy} zD3tc9;BqnO3;TN{l{jsV^NPna>Qkg8ri{v)$Bx0i7)TkEu+3 zYo6(d9;g2Mg{kIhtt9{qM)9q0V*KbVu^*3ps7%*Bx%J)lM_g8L#kf&7DJwl3rM z!7@~s8NYw3247jl8ngCTVO?Y89EdRX$Sj%ncZS-joRaEOZ5OBoGtm(JMH(ud*tV+l z?dIWDNEL#=>>%eX>V?B^O=XKVKl}V6N}rKMb;Wcp?Cm0lUBwP?m*dezTBc`I5=B8) z(VlSqP~4qXw)kshn}E57OkcBJJ-?E@JFObL*3@zcyY7Gc(VClk$VUWnA%ZT%g8USx zGgRq?HKcmiq{*pUL)*%2tb4qIzf8gS^Rv7ys}kd_ng-zt6t6)-QVj zINNj`INg@$BjE-2ri+toJTs`VVZi$t$0WXm0lTzXib)Z~g(X4H%AR8GLr49crtpN} zNKz$sUUfsqO4saRI0~lURG#!WH3U3PF5xxn@_43>>w*V>|IJIIMbZjm$GcnGTiTw?cPcl*dCHS*;rL1QO zkiQ%#%wC6nZ}Rp5kw-AMI{nyh_ffbPLQ8NS=sy&N&Bot8T!>i~w(t05c22G=M_ak` 
zEN!E~pI*!7zHO4&8uzoRwDb3FoZD*@cxHmff1-Wfbd3S+C-XUgdJ}W@) zPX$bJBP47o#*h7^Vq2x3PZwM5@gUt0qWNF1*5UH(1Fq9S6!xKexQ)`KF7L#S;XB^1 zp|;ax)`gxp{}a!cOH6N;x}pdR1rgC-c|?COuGPD?k}b%amk~RHZu8E|fNeE$gsE2g z1Da)HagXhmkQiI``8nGG2c?%i5e{pyYcW{)$eH~&*voHSIKs!?pU~u{h`aRYPT{$n z-AI2<&IxtyPJ+FD?B_%ZZTz{g5FQ(YhJMA*vCi_mL95q@rG`hYpE>sS;Nz!@c2s)^ zXI#5x3-t!BS@_Eb-H)?f)nEE^MBD5>*5B0V;-vSy245NOM$1Qdr|w&}YWM3`k~FOm z5k6s2bw|uf^F}Gbh_jd>y_xR}sV1`<(9)F5` z+5)Q_a(gCcE3EFvh(tB&`N>Sjyyh6ZiPXx)q5~CJ7z#XIYmla0=44mq{C*;O!5`SC zw&7>rQ@lvisY_bCr~iQiIjgofwY_6ZZ>|ZE`2ft?lGct~Am3wP?Y=q%-efM@e`%ch zhL6Y~UXy*^-GyAZ3OT}|g+`)u4XI_Qq-Ft<;bxOZjDZ0hRm&U~mwDZ!z2(qKytpBh zn++~n2%LUaBHRymONgS*+AjZxptLD?C2^q9H&x%Si(&SxGi+9c5(i_7wT%=o3Z;yO zz_%Q7E&}hd$Fd_D7;HkkX^_QgFUs;)1ap^SyL6VJ#*c=Yv3w@<=@9 zL@SdX?WcavNCol!ClY=pCNmK9J)ZS>7iCO*fj)b7h6cXbT$y<;!U4sGY3RU4oOip1 zhc!iVGd$LkV*Xy<;lncK19|mtpJZ=*sX=_#yc+!D@nP-_!iMj|C6s9f0@!;4Dv6t6 z8h$ru5p9QE0R?_BF2PslGV@LE=NM0=*mJmzjOLSGOo{1Kz!UZ|ApIr4sVPJmdXq5b@DXIu0l<+ZVB%IzfTrd= zzmOhpURl<-h&e_Io;)(Ar|o`%@>}dH{n#&cYSk=%bA!vF3b%g*`MSwmORyQ|l%r;lnf^B?q-JinwWoNQwEup|>NlaVV$tKk-h^cg+IumM6knv-vG8^!v88$* zwilB&>!+oQ7{BdL;N6j6(7OMwXJXk9e!qE0VTUKp1J7N#?&whL4~vfs`Q3l@Qd$qS zv6yhvY`pOqr~|b8hjX$Qj^xl!aeR%JYF!XDgWI&-@M{Sx{ttJRC}gaYF3y!aDQkw;2&~iOW;bOm0y%5Go0qDC z{i)k1#oqeXdo)!%07?bzm~C7z%(3D>^-KEd>!uTg$jy(#>aUYx?%`Q>-8yQ7>o{SG zLR<|(ZIid#zBCrN5iuF>$tO{8bEN`Jk~X@KNK!GUP8D+yqRC)6V=Yt#bK&_fsCs+` z_@DOAx^GdDXbzajJ9%aAODM;5Z%^_sZ$hp1No||873TBfAl-Su7w|dGVa+F-^B*PP zGJH^t!zFI_dV59L9nP||S)lht>j}JX!)M~C2VEpU z;@`l?!mHnSciZ>5DAsxAyGsY0i=}%4)AZLWuPZLS6O2yuA{`S=yqnpz10g>!dePJF z&YV65;FJW)4J&qUpt9`@I7o>wStRlr?j7*n?Fhkev)@I^O`_V_jq@xHlTGO*_DR!H zBf(+;viWo`=-P_M@}7iO26j_!O&(p4_XonW9im${I&{c1x*7@e7khDvBH^dxrxSR$ zSxDGEHN-HaeD+u?_s_N=`9zW^3WvU(U3gJ7f^w*0?!e#eE>_-& z=t9|1SP6ExL`D_MYIgzHyWPoo-g8-cND89s5&U!bS*1PAw_mWWzkcRx(Q{ zF(BEsLU9k65oi0!rq?7RK&9kk2BW{sYECd14*RA3AKR{4ge)1J7Qh!d-Yuc{2VZ#o z_lkV2m|yxzJY;JT@vfG*XSH{E!B!pbW?U3KZ%M2*o7591g4q`Z-v^L%SKsbhtCaKv zW>n&&QJ)q{cMD$^#m@B-ZV{{G#<6yb$$Q|ar719C|0%Vvw#_f|t}F*8P8)9PsxP(S zo>ycvpY?aZc~Cyc(FyHq+KN1v?yRi6Y=x+|w(!?ZR!=Wpkp+Q<|AXK8*sO?k|C|B3 zL2tRe?SoO#WuxM}r=1V~oO#cCq4CLgs@93Q(u2Jm(Q@=tf7PMMjadUNC3WdKTy`_w zjigGe)T2Qn!={fkdw%?yRTvqx;9Rdh(+aq8<-nkap*a`6|CF_3%;V)Jv5;%N`&Md)nSi8tgQ6sHqz9(fLR{DvOy3DqrzwoU& zgeffVlfAe3yCE*`1_EYhG4A7uKIp48uwAzSplOED1=mPz3+*E1I*=ia!zv!~j6@(0 zAjtc89r^ca_I0KU8oeJ*3?^O7&*yM9&-g=$ zH=q(lK^Zw$VV{4v48#oi=ZGHiL{1%O4?kI!J|nwJ&b>;5_B+5U1Kwu}E4*yw_P_5* zhz6!Z|^`T5!RO<8_T66>8V!sft(=DK_=o zp@apNw3opy<>JWVTB>|z7|k7{PmByfki5H3_sg#9q+`3(Hi$vEd5Nj=!`qUt+k6gn zf&D0Pf!(J;Qu@I!5#!8itzrGDiZ!9yq`y6Uu0YG(kK&KNKrF_?odFnvupT zCuuDIOnC3*3FhHTt3jPP4qHJMMZOle`mC|+U1r~Brc@wR4JH1zL_Zn7ZBWF6M? 
zf?>SHOeHkFdoVoPe;d>IXSallQ{7$=eXz$KnBCud{=hm`atC4y=6?<7*Qy7H3yMcM z6#6XSmxT65hLA&=Uwr@xXywz|h8fa|mrk%Bjv|HgQ5|jLw9&vj)Reru>+1}UZ7`DEsNGfLr*B{%W5d}GJ<{@Vu#xwzf_6jH8Po1uDwdV|??^Ak1Me#ZQa zHu3rPc+#kwkb2pF5=9JvU>z#2(_;75?#FJe0R>Khrbuy)uuo|w>I!(9G~;;QI{>;g zb0SSPqR>e(`B)!hQfT< z$VY=A8rmg+Dc&NYc7iB_19N?urMPI?t`l_}@XF z>(hg|Ye_m0)CVVh4c<{hWv#m&^QSJ$>F-J0rDLD`sq(QjbynV|<1-Z>wWJ&O(a zlE?62g!~ z0#Fkx&!OW6ZSaM>Qul+Y4+O!)!_X3l!Ghj*6!|zyzpk77{|Qbz;v=*xofbi&C>7&F zsQRbsez9dI4iK?XqvR@&^7EZ(P$bn44a4O2*-h|PD~YNL)AM(0B_bDcmb1UCdrR0) z#0;z<{s(e%iwK$$B6(9=wH{!MN6?`_`8kQC(#11HG3F}Jg(3>Pf%?}x0dFIqjqDe8 z)daxTTY52FcbpwU=O44+J6vBI`&ZsaaLa+Hl%tG9u%19HrA?hTV+E2VoQ;k$YuD@5 z?>bTx0ZE*}R<`I3(Gmm8JC8_Oh-J;aoQbz0nV=VyPiW_c?EoYkzOu*eO_&f^^Eq9& zwgE|}xd&W(YCDo{wj``tS$)$^`U6;X_*1W6!!`FnycF4r5CImr>!6!|dOu;^_1ydX zK(coKt5M7A?Xf{?ng$P1p_+e5J$wCrP?I^f7yjC1^)^)?2zF1cBC6ods@I@`C@q=pFj<1g$OS9b?B z->5&q3E%FGpX&ObK-g2qno?|#0mw5thapl5>e#Pgsv8>A6G|rbsnsRCTd@7sY6=K^ z;30J^=nSpZQVT}m(+)ev~WVX|m>6NslL5 z=0L@dAyrbqY}lm*yU?lXb+ze_Q=6h0qRGPC+dlE)qTgpl<6Ip0`giz;NXml`mpcmb zbk<=WQx9hfhXPK#dlfSM1}W^*W>idH>1oaRr&9r@mNkC;I-otn{(^t#@TaGILu&g+ zROBH>myU=EQ6u@xOI0E&CO_(@?GbDZjQ-NnJd~MFVV;G9Q`h}K4cv!MLU!sVe~ z)bnT>b6Fh=l%qixCgS64HPq)k$Zi_Y@QP_a-bP4VWf8ccKbnK34^@O@?D)4DX~jl! z-P}*-!K?s^7!UE}smPHUF~xgq%XyS(4L{ugnES=ef?H5Qo*qm9&VP6BHP2uVO|8$d zw1y)KsSYo+Nb!*CA6sobT-e1Vw~W?fmF`fc|7y-q08F(mD&X~WTm6@)V?Qxrb2eWx zZm}3S;@jNF0ZJ-ew~%8fFs^0!OUtR(DRSXF8irNxQ9Ojlx^r)B-xoyof65ZaWN`0# zNTtRLaW(KD_B{NX-mG}cq?nvENe4j%87@3bFf^b|V*SajxQO}!#!=cLNLN5x`Sdeq zqz}BHGn8E+%u3Tkd;_IG5rklSCxkNM44nml;EyU=TPE=2pOjU(f2CJc^!V}fV|Sz% zfWceq&C<#_9>8{pr<7ism#hSKK~#Ok!dueA`KGC@la`KY?EK*5R#i-m{nS=uEcH_+ zQWSbQ@#INK->}c2rHN+G?iz`mJMBfXf$aE@aN3d{bI@n)+y?7kgrtWSAUoBQD`0lE z&%NR_N*DN=#^hEtfW`6BBNu|tdGoX=BI=XC@Li=52n*#3UEfyP$Ab3ax9b1UjqEb< z+N;={I~aI!8HTRFFgcD`wcVhSU4!YK8#M7(cpeZA(>x6n^bA&pv)l~OysU46!$j`P z+-0B4!+}?k3{Vlg)?^YtPTD+%GNwNn5M;5G;u&_3sM}@Ao>w#`_xvz2Wo~tK?1Kxz zEfdDwF*z^K?^CjK8K9YnvpL-t$=aL79H z51z1m!Uj+arxu@svtX>~Y_kbbcmm^+0+3|g;4nN9!YuVIvwzCm-?!a)PdVm!U583% zXk>dF+WRu&l0bU(tYmOb2w4p1gqG5x4uNyXBY|#ky*c)ZY-}5Nm?IXEnr?uXCW~6-rq+_uCBUlvgp-h>OZX;Lj?XJ4D%mY zIw(ui`&RcEdlbU!jU-^7r-Aj|63(0p_`zbJx9~8W)+Y6m0K;q}n6Tp7;zY1lL8NA) z4itlj1Pe)M@(6LCQ;D1bjQRjeHwo}S;Vvrx8g8`_nZ!DwgSzMU>Y688&R*%=Z||?_ z%o92;W`jLCOY^y)zTLB=4r)NW!~_kngGyUYF;^9!<_FB82hE0=pxn)PX||O8BqKG6 zDYf~a6wro)jVYo%Ste1(-Vl$PK?M6|$0)UB!$deiVw7+RxiY)(06T*D9hjyBQ9`jz z72x~J(8Nq1)CF?jI(Hv? 
z{2&wye8hV8Ef(j{0AyX>!GRb3>&)KMlQCCNPjO)O56mwg!jB?#flV6oAL4#57R467 zl+-h6bU4WPAvjWx?1~m^fc1zzqi<;eUnOqW<&xp)u)WXP~iML`hlOfVYrWTg+qJY%E3!w6}||6olrn4V1gQRP%7oYHu-)|k;NU`*{kbSxt?L;|)(-u55h zhNELr3BT*V z5>$;Dgy+zEY$Ueun=B!ACJLLjxd@-`P(cIM@zePjcaSkokJ%5%!ZJELNI?)m9c&ob zYNa8T`rrq*f+IM&Lq3}x`v}<*jz(>e_xC}P5aZ~|=EYblkXe7WBF`Lm8l6E8Tx6 zXcyu~h$*b9wX_~?NChgeVVh=e=f4Ou-F=z7a98#`mX*S!Euwqj@IK`PZ{{qha&{cW zaJh!KWHzF|Nqd0m#_kk!)d3B0EFs3Eo`fUgNfhddc-|dh)~B*qP}>@{(wE5v_dqMR zY@&tsjNon9oqGx>Ybr0D$@wtkB*jLM9ZveV?!r`=2TdH{5!rMzhNr8}(4o@X5CF3K z82mu#ZKeU@BsB;Reroq*fIEcH^N5YI|2Er1b6O?84O)WVBcm)k;+rDMa7_~c=Ec78 zkga+sbtM;pGAVrkfTu6$ujEn9eC?4|!Jo@O2f9GTU((TtEI&{I%L6AwA_h-N_exOw zS?Nl`K|3hPpnT1HBas{9a6NCBg2Vp2FZH&n!CN-*P$Ubr`mGwZOh8J2miy zijEH%JO(kP-q01hor{xZeGP;?%9M*8X1Hl1kij3e<=n;LduQ6+-V!gf$1uY|YSK4&Gii zdpPBTTJ=6RX@PFUXW<0jz=j1YaT_Sq-BL0mH!#Mz#}d$i#5Enqm~nECL$r$o;Z93T z>H;fODJ0z@wz}mf20V7~3G-kQRND0fV>$_cRE%g|DfdM=BQHn{UT-T?o%>hxXn6dWN&s@Qd|puSEp zaa72u^d&M(WRNY|3qIX+-Iv`A?m(Ddji3~x(ot*z>M5}iwKA|twX#T`S{2BGX{jTfKg>i27QJsKoc9l+TxkGfUx$!ZsCcz% z#=oHzDpSy}4nw8pRE_;XXzubH9AFRseh{)u{okN6$0;8yOApgr zDQv4z;v}rF0lYvt&Pvx(vFqX9WNE=ZAd#)Fn-36sIBD2-gd6IvcLbO&4Q&#%qC%rW z$UVNy81Q3ik_5Z6$C|k0li7>GQf&}f?ENn!KWe@7j~2-VNSG*lK>7d^vgby{pQv)E zMqUdFY?u)yW`L=87Gm#mPBo9&qpT{OqRp1kfOy7927BHt+i!$6@;6f#1yM)&=n4i8 z#J>R>Q~!bxyX>pMLu6N|2|D;Td`Ps4R{gyU!4*?Rw5Do2%mx)B?*Gc3~HvSRA&$<$ZeM6vvb6c-9maxqEf31MuPUAjhpVm zR0lf(3WKWfOxMx)^NLU4rb0A*U(QKk5uE03hDE=14Qid#&Ol)!s{-3OxvS@}*AR;6 z+k)R6-Vy*F_|mnZVVOQkz6Luy8^%D`oW6$1*e#2r%q?*-ZyL@BZVEs%bDQb`qU{Yy} z$aww9MW9K*RLUvSTbAI z9)8_kN!v(=n>}vy25<((x^2%+Ng8f=;}-|if-HZfqhQ2FN20#g}5aq-+x^7JEQ^5xTENY%mxHv)W3)1jVoyrL^olvr~)I=ZCg~_x??Lb^s^9b zb6^|09qsUtK$v%Bt_wDQCCxXE@3T0Y0;+n;LE0nso4`+2BgFOr3Z3H1d@bFFxi(kQ zD&hcqiwUqY8x3UhPqNLS76CQ1%2qdrDSv3Kx(osCVSl47lGr^_XpO|Y z7Hs%qPs|vly}4*=+p)QM#_n)2+cwH$=A^%EC9CQdUqs)}kiRUSLX`c!4#wvcD`@}e z!UU;m7s-!>)vCI?yL%u=(|282L9BAnCGPkOO;<#hha|7|Uc5W47W$s_pW(az^bzg_ z$~-n>tq2mWM{WL{VnNqEx4AnnDqD0Eh!Hp6i3QGbY}b40#?w^PmW1l!=P#ZZJ28i) z?s0Zu(1{NS8F067#FCj@4luGlL^q&Y>w?VNA|F_oGM#+7)`U`ui zZzOTux`~4(5Z_HXf`(}MWZGO+HuqQr+y3roj79OZ^NzESveaV+iuciYk*`ex8l83C z!4WwIsJ?c!NYcL9UHUex2X;pgK7y(NEye%!Xdo>U_XZVB;! 
z*`6-1dA)YS!MFF}Q@vkaX?>wU-g@b!3-*3io$nheW2&|VP+s(s##@5a0#0wxmCY5H zn=C$nLf)y|rOB^mez5f&?*qLxOE#E(`*E~CDch|Uy*2CX^sBF~Ixm!Mh*$lwtfa@` z2gp4BUneE|oSnYR9X09Q#B&iAlyKh@rqZR3H>{6R@M)gR*y*#Mov)Cu&1E;TM>1>s zM}@$~cGD3}#ltshJ${m>Op|QY{x+Tpy8JU#@+2|M{J$7CX0=SO#G!;)NYtVT> zdflpdg`$yW&C|S<3qpcUiSkE}R{9DL{t!iv#`)=c-|q0vN`H1DXSW2fG$Mzkg3QSW z!7FJ63JwJw&sZ;fEEKSKd>vlKQo*AP*FkOv47f zkUmAAExY|iOr8UZ`^bRISNG!;i&~1P3I6<22aI>*JLtQ;y{12H&J4T8#WuEpju~T| zU`lE?z`ZKBTr{#JGKZ?&Vpi(hDLMe?wep4#q`C+uVSqj)Sv zggqU?r1$4-zFxSfcGq3$g`BoR!$O+@v}^yB3edWEa#`uoz;kPTu_LF_8+M3Yv8s3D z>w`yga9nmZ4R}1i%N@l@?&G_vm&VVo5U1GwjP*?ipMpif)UNj3v)!&|5 zB@`1S_d|iPiN{p$fa+zRZa2f|t0>)ZGrOYYBB|CTrE>9Kr)Si*{6?c`RCSXC z(*sxN4}9&Xc_1WWEb@RS5-*+Mt#@Z7F%)A-{3D|EozJP~$v1I*Re!PBzjspZRodJi zF3IN}Ol6U{bTOJorGvVq&n4o=r_0AgC1T2sKPO$76&0Sw_Nb+(eb_Wni+?vnfqNEx zF$xW^h~KxLbh+TUSGDoEpor_hSIbz}wbV(qacbW``Kr+-c-)2_U~Bv-M!d+T^+9cy z?$isF@#A8HJ2$WNS;UQ3RrJm}x_(uGjQ?;!DM+5TwGfs?|&wt+-geWDS)v%p8eotqy zkM(46LwJ08H9Nr(0Q=1eX`anL7l%{2oraS;|Y(JMMQADsCQsaq;FQ|3059$IkI3uZ;6Bmiot&i?pQdNRWhs2F9@X-&;8e)dng?I?1>Y`xnr=rsrF8_dH2f&*K|FRhNX#| zWdrfyoG4wLkWCK*=RIf|F7e7qL%@*2Kw!Mv^V$mmJvqM5r+fBu!c>7t6Lo&Qx}%YV z8wE{uJJOd?T+H8J+Sjrlqw_L6`!=>ukva!2_1c8BG`@~Zbh?VsY&sX4U;g=HeQDs4 z_wE&GZ|n4h`^NXV*M0oRMHl7UQVCzb8tC;U-#vtsFZJIkw{e@= zLU%Z9{Ip>%ES%Q;vcsBxQYv(SO76>5^?5&GsiuLwE#;!%^nRkMuW(By^iA^5qjgJp zERWKP#iJyT*Fk>K-Cjv4Z$$7?W=ZB^$Hyk>RNt1fxl41+#v6D(*n&k)@80izR_bCr z-k@nAb^P{&+c#eS`TOCe#A8JBoBRnq(=g^=&8pXV_hiin_%J(TuxnvhOR2CV<$R)D zcU7i&LG_kAur&F_mw`8(7jH=yqdPck2Y!NFfu$Ok_2P`Vh`ENIZ&f>;#A=- z(imcOFeMTF7XPwK<7qkA!MY)a3oac}ySRvmo=FF5C%+dn&O12EEmTGhYnZty%FSJq zGPM_0E}i=2Tf(4j;#y_gEg{hv<9~$=7?Z+;A~ zpP~B;83N?A`xEGlf*12Vd$yX2@n~R&ES{hCa;MablBBw$&)G(lVK%|&fVcs1!`HaO zxzJie2f5jS#nT?01J6E1HhUn~0-V)x^q-3%pDK@l|Li?fn*0XK_`s#*@wUuAUjgKZ z#w1%kt!ZA`-4iaaSD80`(WosDzR&s%=*%2?dZboUsyskH{>a@58^=3ahf@Z`8$7iM z1GFjwr>a5MM%9$VaDz8=`Sv%TWiYwcivE!uXK9HZ7ptT)LO^1b)_VeL*8?Q)w=@a7 zWlWv=EKP9shlHq!yo;$nW3y?R+^ALfXPVFESl(z%3ReQ@U4XE$xonYMxu0$2=I4}a ze>;for_V6d_kH@tx{KL2cT=&~rfOcMMTt8nTO2Jl^hOHaKj`z* zfGx=hGLFgD)Ez+;B6NPOK3Q<3o4pVBG-Q^ciB*-LSL&VX@qu5Ckvh10PYwCoaLIZH zZy$&AFP;tl%hLF--r`2Xe1)8K%1w$-%&fo>)elFqhZ{wfvpnkSwzMaS%T;x~?j7S5 zns<8V?B6Hr8mIj9WxUCI;tb8VVtSlzyTG`QEch#B8Jyrok+uxA_;a)*d{XZD5zDqy ze@^|s_fOkE#T02+Z8)ZMIiQ#_FAJ-U`oEi(V6~qxrAq;Bl==Uwxe|b*%xl7GV=$%v z|EAy9`U`t#NtSgxXn^9v#V%P!s4|4`0edj+mFk#i_;P?aqbvgBx7R2Dn{2~tQtX9M zxa9zQMp-n*Z@*CyHra)5`2RYiDf5!B+Rqq;<$(XwwEPxg_dVd&T)1HO50J1C%TAu^ zQ@)Ds`ad^Dl*O|x&v;O_?fAs~#v8g>z(q5LhX~58L1A0f;{r6)yeQ;J+>K-beXs)Q zV+Z9$y(v>kjI60n@tRwHVrHsnsBV+(XeP0AK>FYoUF=j}W}NJgvQiBdO~FG7QB zei-n;T9kDEX~&z7FSj0KG03UERW2Qm7sPu`BM9DY<>FJiZ77fioLes zDt^2>a;hd^BjHY8DD4~!Gx*Z=Cx6woe{XjmdZbBJZU067s*QeNUv9h**cXwg% zw#W;yG%a4A(>3bC21{vcMDc6>@%Oc!U%I1xu0dY*r{XSG=fUdF&i|+dnO3`1n)ERF zMg5>C0hwqOm*j+J-eLy1mZ#(kzmXxpDnE$teZ#1{wCoHK0@|;u9oE?pn@=bwYPVLU znwVa@aIO8jLAbf6NnzH&o@dFQ+r0r6(4+8@YlG{n_dFF7oqcd?D<2*wz2Nb1U-TUh z74(ZRMXA2+yOjFSZcACgFXnCO^sA%ef*DWXY8Qjo#mnlBow}nlE!!^YdVlB3Gvtk) zGk)?>I*yb-ZPoi92HlUcE~uPqR)>sJwbKO){_fs?@LS`Jn%3YSZiEGvu=z*zzl4yo z+lW}(3>kapOLab6sNFx1%RGZI{!K_VL_owmW$113e2?H3rETGjW;nbYoKd%|BrvbF zj&OxEs`7O04}m8!?k2V&!SjOx4vLinbDuwUd{`(gvwzhP15<65VZA-AE5iaVvcOiqK|!sRCpTax z|9syINN&%%nyKC7oSgL7L;Gu@WC|ZTR7ot@xMA8c=Eu%I>ca@7C9V6^ukvd#O0O6v zd8+@`-T==>R&~2BToyW)wY?*zeA)d_yKgZ#cpX*_R)<6DpX8(Wykl71dZ z9zSj0#+MpveW)t*F+)65VdKWk#dOhg*DH{(hxYIJ%1=VMs`^_g&2MUtjzGrF z5AJfg`z!5D4^EvRugYES6N6llfZTH2?omJVnHffp497OKs~Res=RjKeWZN~vu(zDv zyV_e_v_4E+l(czsQLi_ih9m!=Z4+~SvWvMf?E^}{>O zaK(P`O8O;J4elH3`@b=`zdDmEV<1Nl<;HO>?LO$Q%|635n)CN4)18L=-YCh--m6ga 
zk!G1?3)5dw8g(X{SVJM_P5Wpkfdiw8wEyOgUq7?#q^9^z7D2c^?Y6wb(9bs0QK}Qn z4eY@F=%kkWI4Yrj^ZQW3ilejmJ=PMn67K7mlze^W8)fwCk+qCTzoEKr|B%~Le_&!o zcxx;;i+3z&Tf5OY4DHp(1`#s=L`4hJpbFw0v@@1%42`xqB+-K%fik6blVeL zqUmqbUwF1Vn0FS7xlJ8>7R;xoGd2wMZB#NHK%(Y`+9uJ_OaM|e#GQz z6>iES)AZreo1<%AbG+AI`Y$tw&&Z8Oa*T{p#Kmv_a{Vv<%HN}~=vh-gM2fhU=^?!< z`;$7Vbo?X#5X{#2^X69mU~7)%rDxh3CVAHvwl8ykuJW+&y7tZWv>nC)nq_=`LooGq zAa+cJT)>n!qcZM0iumy9`S7iuK2`t8%TT2K1ixbkX!L7zOW|X9w0YUy)*7EfhE3p) zDujX*W1C1!n%{~NJ4|O)*vd(a%+(mb9t1BuYhGB6?rA&WsG$DdU)L~$kGSKuKAZG+ zQmLQ`NZ?v6^VM*%G%+flthkvdUoCQ0!t3fYQK!NoT7a|rsb$9XEtm*{=%f%YmJE9dT^7YBpp9~>Y4Mb$3Xz3BQv%q8E|v<9qgqr z1;*HYcQd&X`H0GB~k6KVel#hVS`{%+8ZY^28Q%!1_wU7?Qci#Tj@9)%q zyoWP9%^X7<<(YiT@a{Df-XBfw@>WX?;Yq_|)RPTO%f?Tjea>WIweGW{#7ZXn_}Y zua?ZX$<ha$hl{(gk)(^jbix&r>F6aQ=2wW^~z*|PT(@1(RH(QG>D z)9U=f>2DLy`q(@7-MS`G$+OOPx*i2;ZPzBtKiXMeu3dXSWMATQ+PlOoeDmGyRf^`B z9{;Asnq2Z?uH*Jm^5F0<+24Zs-kJh2XJ}bBcjmPY4Ie2g=oFX2zDfVhxL-U=e0z$6 zer~}WXoaeCamN%^TY?S+nt4ysTuhSzb`SpYrG4J_T|SopuaJ4&J(|b6P;=5kSIv(h z^zz^7qc4xSV9)9v#$7)1xVY`rOdT(1U!W5D&|lP4q9JR7=Rv!>{Mr=%RQ)XpWw>t# zQDEf5ZO?!nAE3z%QV`Zz8NV9ZTUUNqY4sw~&R6JDClOJ6~-pMrPMGQut~_}+6T z(K&80saaV0+{%ASMnA0ee3LO-4|Lk+1S1Lyj4vkX=*6lm&y=(TisE% z7wo2K7dE|5O{JW~Owc~t2s+)%XjRz$V!8YrQ-R*}PfeS`syuxG6}1hBC1zx2eGjbP z=}fvRy0yGHHfy|tFUiovzSS(b!CJ>xfM0$ZxE&dF(0jJ2?WxOHy+d2X3Xg;2*^|}k-jDt z#^%;GPQhPfx3iPGpMJ>jQ>8+R-*T+jb{HR#IZW!>gx$A5FP z3C>ldZ3dbR6H6@$gF4j?uSvm=RhcfbfpiX+ipX1crv2M%vG|!hJ+#H&kme4#6=qLt^#NUsTfDA`6LsImPTpDX}5P^gVVxxNio+Jwh{x z{nkEv4gC8B<_SXBXVj$vAozsMAb#L76P2^E@ejNBsraZlVXiA1I$RTq03TP4M=7Qs z(h~fC4y-{P%`kk}U~6-bYKE^w>Nru$A-9T7`6r@dhjb#TWA!A=oc&^wNr_%mE~Ao@ z`v{}b?}j4^m>kKZMX={GMy+DC#5yX|PH}Wap5jYwG5@dE^Llr&KkkAo08}}kOM*KK zDQUu9yAM*hRxJ6Q6qgA-CO^SYVSW>N2cWq=Bam`CTlZmwqSGUQ)L@KjwwsB8KbW%x)Ay zd<9AK72r$tB+XY()_nD3%~#K(`AWNNEZ&v~-rg*+hcendOabkjU#GB1q#_~y2G$x_ zf}rO3w~^3=*$-KusP+)-lhEWV1Dp;nO!-;y2N4%;aFL|+nIK8A;C(~!vT4I zU6qVvJy@{#@m5P(GQ;Et)Y*%Z|FNugu!m>dd?y)myi3;b3@(? 
zh*X&Lq-v)%4rl#fy~KXxuzC8B85(djML~fmQdRZsKFbmOxL8D?YLVSA3>zf#ZG9$e zP9-U1r0p#hF0+B|&Nq4?sIawDjZ^H#D{PCdL&;%ik2fqlftsP8892Im)Z~Ta2raGf zT{o9ruZO+8VT&oo;Ck5D+{C7tIHkuP+pVr2S=s3hGYF+_qEoHrN}Rw`WxU^Ni`>qB znd@Lu6cjHZyf~r4V9>-T@Oi>(Ff;_4Swp*yH^`$SHx(rb=4aI!T6bM_e~d@t8Pq&$ z+e7URp#(Wap&2cBamEgiF2;QTM5CV1>8mT5kcN3FLum^9JmTktF3iK=YHA=f;jrQ@ zJ>g{fyv47ar`Z9Fng`4WS{Tz}D7akP^4$_|3Dxl1iw(cD*I5o6X}&+F9RWBW z6GQe8eE5#WW-34O~vLvC`JTx)}xVdWa zP*1l(J63#84}tZA;ofvV4uKiJS;j%ri$VUDX`Z>y*IH8`BOEGN6C7)RmAgzLqW8EF zu8DNM$`%b%l&}|C?cSjSsk+_Kq{D=BHoOYnpRu4B=`t}{B?+V<22Iq!huV@DiykH| zIxYZdC0KAi`KHs5nox5}NK-r({(Vj6%dF-1lFd_v%(;*vxZ0WxZZ(+{~3 z{SZ?Kaf_d&lSGIvmTJJpQ0Fc6`&^x`m~2oSR;Skv&q8C7NLZj_xAHRl!H z!UAy7C7}Si0bp$Sh`o!T(&#{DFHKsW%aKXvMl8@LXcv~XAvc;->sJi}wLId71}o)4z`%+`Uf z?9$ULy^@RIs6>GGsL_0M!P+C47V2m-f$&Kr2^`H8@wSiIEt614_A4IlL9{(K zo9lq#zdHty=Cl`vE&pn-JKfj!5;K{ZXz*syT!cfBO0$#KiKtur2m#h8qo=`UsW)Vz zwL6St+MKF+#(pqeo4A7up8vo*vny_Bn(mK&O6VXYn)oY`)$()hm8y&)+hApIy)to>jQ?e(Pc8(o6eOw68t-bcxq5cG6DES6AZVg)`PngZh}(N zc2;2g+o}Osx7;F31v?G96`@4#7mp9M*-m1M64GsoWa>M>!NWQm@(h$aji%*-un z;nRyuxOzQyjBWIohTjPK4fgZfC_&29QlZNX55Sln%(G~0?%6Z|7RjcHld3{CKL!z` z9|XpJ%}c%*0ftL{ahO!I+=NMvhF(c7NNKe^)9D z%u|BsEK-LqLB8QHXA;xJP)>B3huLBIo9x!@h8#@C@E!bWZxq^o(kr9;3&P_1jZp~y z1OH4pR4rSXS*u(keS3Y@jXJuYXwxmiKtavm62UI zlJO>mol+0x4O1MMGU29hgL%b~JgWC>&zZkAxQE{-%dARArRmTlHYySDU@qlsiiCLB zJI&t;LU0=3U$AXdTZSXT)08Nbl188=osKHLag}k5D=CrQ{*y5vW|YjCXOp?44aeD-h}yQ$#zAS zYf?a0a|h|sQD&|Lb1voPn{Wzoe(s7rkh&6X1PC0{iw0Oz4q^6qL1bUgsgP;Pp0kgq zMST*CCm706G80K+RO|Ga05!>s0_EdtCL6kwJQLg<%`o6U=i%bW_|uG>Vt2%JH|f*F zhEq!KsFdzgyv193YpLYOg5B74-0=u=-x!&4%4{F=d1nS03=hu$9w%16$ivhy&(u{5 zA0r}N%C~kdD=q|uEO;>_C1j<~3mC*w{O`pbic)yBHD)ee$+vl+4aI~kVV4MQbr&Q% z($oE4I;}_i$NA!=l>Rs#9x8BfUmOQRAQ-T`j6C$>?wCD=ttlV9Y;3!w#Qd>+VWZff zBVvP-V6YlzO@C{O9rKWkzP(`^Fbf%*{UY%#?uqhL>Pq2vuj;wO5*bFj$+PTn4=&Xd z68jQhC1}iW6~t2{`q9|Vj<^*;uM?0pjySP*tIO(>4$t_=%M+61?)+g^fK;1Z6kr!Z zy4@bUCXfh65}ac1AazrG@;G37FLdgl4d3Z|9k0xc>RCV2(!;_Os|)CK0}SJ#dbGvc zWRFI(8gQ4new*)Vn-RCu3a>z*9u`Ck_<}AaBd>9Ys#P`mYT$gL83ZjUfhvSQ3~X@r zEOuP?hFzddkj`~WJ+6m5;Orkg4;ZLCfH$&D?j3S+heJ;d&)``G3+lI-)kv~#Hlinm~qY&vD$MR=>z3gd`mhSdjIFfvs-=4q#@bp3cH z*L#8|a7y5x`DR??g(8a#jw5SuGaFC@;oMSH#+KTGT>d^;BUOkGu^IMw6%6xubfY05 zG>b_@b`XsBdqPp^fj~C8mfsHcdVmaKO6tx#nZ{=aq3@h5W_n?y^UXY3E-$6PrXKy; zOc%u-FpNS^%(-K8$(F|JVj9*TL86gax7X&HW9g&|9Ul~vWA@@>W=63O`^?tS&0*cl zVsi>I&2%SIZ?-qZp^{jk0z&n`8QufH`fY6*rmm4|DF`ej&|S29Qz_LV-RqWzhkkcH zYz^e#XMUqyn~b8^fU=JrZ^KMyEy@v3N8>R$Ydm1=u8Ry9J5<5Uy%6l7kk+CP+HEyv zW^KHogbRxRLqOHV&lWuvSczkS2T^`DJrY>)J>xuJM|Zg28po*>W3`+rf?6T{VqYpD z84e!0)X<72x5<*xgE!~5o26pGbU0+M2B)@sncJY0!C-oAYm-^)HqjSuQgSq2wx-a& z(|w$FYclCkf*&M#1uE2*v7iuOV$cCkvuf1q*Qe93jLZ-@vY&O(VJNHi%w+7$me85q zz}aYQgr1os7#uY9aCq~;aK+e)aRCwsPwYD!tT@x-GINxUUuMBEqjNR{yMZ!*4n&Lj zbv6mT-8rBhz2-rRM`e>Mmkya!qV*=(&yjiH$*T z?DyFu3Qh$@!WA(rfnBFXd5RoYDhMt06%xM3awu3Ws7krjQyLzFd7&FJ0#$Qr*r@}C zTE8Fkkpt5o2H{lMaRc_afu}Pm9-^B&$d>C8#BS{w`S>ue@W?b*ISo;M1jjkQ1kE^` zgR~TN1a66Q(`JwBikX83Np$O!6Cwz^#8k>;Fid9`We-OT3^2e+cZ1LTCknyZZSHceaMk0pp)(CxS#w^$-~$o23$a-)@YR--UYFgbqeoe8()Lin7yX!a3=5eN8<=zZybtpyp=Cfxf=lI zu}UC~2K9DuU(W67Ij-PQ29L0D7jIs97$GgvbB!f&oKYeJF;5WAk}{3*-ePaWU!~RG z8)2;Cx)WV@5}2+*r4-m4v&({nyUABpB#YZSfI`{A?-%irY%s7SOzSszoeS(a_9?-d z@HZMvJ^`BV;kM#GD|Aojl=*XZ`>i_VFQ++_L}e_yT=F z#iypZ4;KeO0b5FM`iBut?x&dyNqc-gFoIrILThMy^M>BhR9)d0=&) zpJ?0bakp^K=kQrTOM1O)%!SxYg*UmKS(AGNObM+C_`D4r>o}trI&K~+CWy!uZCWYH zy*a@@`o7-S zI>mJiHW6}4BExZ-GaSN2w-nt(rP2-b2_cJ-4xhlVK~30My+Ml^0thHiDxwk8&r2)T z97!}aZ)AyU@swz;xRz#O>mHB&1tPAy`nA=efGVg?0Fc&+u1)O~ zRw`=Pa^mX|RvzF`ohG*m_jl#S>V7y5ItZ7AS%Q|_P!J5=(ubp7o+wCq%F-G=GXMiN 
z6ir*50UW74D5dC69aY7I&YMj9(*e`TnR$QL#j?CfYYz%7`264ui=#me>mEe)TF4Gw zWHZPAM#oCkkAUY8)I%OQc8F9dzLy6fqG$uj^-J{~T}bsrkl)rJ^A7=~XP>*ytZ0S-fy}PVv(HGv zht55xo5=L8i;k-GIaC!)?arh(W=ECX$*%kBG_e^m07$pNp{*%JC9tMixpXuFAPsH2 z{O1;cE(qmluMD*)I`%Vc#w>@U;@&1~Hr%G%OEPO;$fQl+DjekDcN!q0VZ>9Ig>9ohBPZZxfhAN7*BhX@<>nSj!ASe5b zlIjMvqp4__nD}MnLP~^Ef~2as>gMQbFlnw^E5!5{?&qpI(xyh7Ou(-JF%4$Y@26pZ zqRg?ScIa$yt+h8AroAjE2EjZj4lsVSAHk-NBpa(gvvhlPHjFUD?D!w_rZzkx553m; zW<~1(f}-%+I*O2kZp=ZecfCAluZ3YSX!qyYR0DB*$CctczNSf+j}a6Wj}ZWfwzUlh zkQY1APUSo`GJ%5-K=I`%{Kab?0)nw7KIPzEauP2MK?i|-;grPUNh+O>bKXf!l)pvu^Kjb+K()%ptg9OpQjs3C9)t z9^)(QN}Y#i*oxp;+2%z<>^mDw$pYV+tU0)bs`3^0ctv?5iXn?-=FKhS#l@d?8d8)f z0V8I^5E<}SvqUG`XlIrlrEF+F$yJaUz*?se=jyFHc0vEZiRSn3V&^e_xTjghr)lRod1S1(YDViaxR!(d|vOVqrPuz>JRVHJ`JS zmbaN1)4xo<7o_^}a}0Jn$kc_NkM3RQK{&v|yPJ=@dW6gh&L0I`bX79rrvMtU!+>~Kso$YrgZU{9|aF#!yY%dhaV zL}rAI5{{t)9Yf5ITHPs>;WfCe5b;;YYiFr}HYT(JEV{v@W34$hI^rhw&xP!aAY)k{ zjpr%et+hf=i}c1j+AU5eb9TZ&Sei{p?~om#Pz4lvTimH)f9Fi8np7zz%UMbnC0O(5>^$ER)Y7FJHY5*=}mOX`Rv694dJ_a zUVBRftlJ&Veq^{+Oe@^YjO4o!@{yx9v%+@}Rx0|u|4Uas&z@m z$m(WP8~_1sNd?DBxLCryUfATjx^M~531^utft=MHq><-T(y->A$;K?RjSo_SWA@3(xT-KaA^FX@_BQ@+UPqC!4*T> zT!G^%`muv$FWc}g0d*Vuf&h(iJuYcVfwA2o-*q&BfN;))6*i%c&M>bdj%aQvf+xwQ zKZy@+#E#MY)jGr@9c^$l6NtG94f^fgYl2L>D-Or<;x{4nMa*Wpn2$W~2CnvlLKMg2vL6qX{=^`6+Xdch^eQ)E|p1wm%Fpahv^7eZ*$ zvad&$)jDz|p_sPX6)9D80b0?k;q~N*kgVvnhYR>mdTYSt8UyAHq35p@v|8Iz&4UGT zs7$ZwZYl?+WDC_=v*`#hG-SsqI;<-@`vkB?IkvHdsct`W^@WKfK)E?F4ym+;ie+$Q z4>2i8Zdk^7eb{^`^sIvWVrS-oh|q3hqb{sp;zAQnm85V2#jK6^A~AuRmo!E#p5R1c zj!+&CwO+5f=P~c{L4#)zUjL z)y|9zeYtyLdg!4VXC&!RijF^KnZR1?`knnNG5Sh9nGws0BbUfk^_W}=^6$X_(Fx>`O9P{{1%cUdts*_O z2WZ6EcEl)RN~u3dMg>Ar&?qCqtTE_|_mGPv6Mi`wprHsiHNyAOEDSJtIORwuCNu}~ zMoJ82a1k{y@7Bmuu{G$zSU8gvJ6O92@ZAlt`<9kHUP&Z?#Ql90i>3;^SyB zF+}d_O{UCx9iuj`veICS5{ghGCrN@UrGGY4;&D!;&m52$TYP8;2ov1E{=SIkpzz*# zCb8ZQG~wd}vodRr5rflO4-BxEU=Au?GxE&I@EqUtqL#vIx)5&!?FRGS>Oh>CH$jAj zTA*0B9Ub_j7PS^SdI3}dD{*!Lv{;Z6pu&TYlAw9gA;~^SPAA4GQNRSkE~A+e`ic&b zLT5Y^CrBX-BDJ|?iQO%8WN_$_VoY)0@Vr1dv47MAN-;nA)k2IQ8L8SU=+ zGyo3muFeUE7o?BeMME+0Ism~9Xv#L>cD5*5G zzZ%+Wyed*Xj0zqDKYDwgY59J@fvYBVED=6t3y0lEQ&65eOZ;5*>65}PZ2!l(-xa=_ zO%E>j1KEVOW_?P(AM7y|&ul!9BLnt9=9o-vp_yX|j*t`?I|6%7u{*jtjB}e=2sX+2 zN+?NO+d6NnU`w<1NfbJHO0T8*g*e8~_6zh6-M@lq=-A;ZozT%52AD(sungph(~+-r zn9Yj(DxJiz0lSH$j|*?3EueWCrh<)XDf7`=n6wE?G;3v&cHYNYg9(qM<|>*{+Bet@ zQr)_=aAeschnTkW`mJ#oba#3pYpre`sYOsZ?8AI{aO4cxRu3%_#kTe=BCZgxlWORe z{J1A9xqB5J6=K4xXQt{OxRtLSU6vLjq7>(q}t@HYq%Bd)l8S(6P5=oyqH9 zI4|L%gqoPGCv4<^d##>P2%eg|)H`(MRD#0)XPW73E8NG%VX@^CrQ@`AomzXCf-k4t zt(8MHVHL0YYFkVRJ<4MW<&;#EawU6RM}y0Dgnpf$bHu3%?I7e0MeMy4l_?* zcswkkdeyK6Ec8h)r!f)N`s`6dVLl{xbx=6+QzF$Ch&5%ZSI;T7?voN_KeO%wjiP5F zxX-bhPRAT$=Z;Ekd@iCkPRka`Ed@8i9OZ;2^NUzNZr~Ta;a+bTkO%9ye7o#WI}WXI z{U`*uk>XuJqp)1Tk3<~F4Wk>t(@SW4*z=IROwdL}p^b1oR)#@5(VPRclsMwRNmGAc z(4~b(q)Q{eM($*-W!#j=7H{+;5m3rRI~m>Za@6DIY=&2`+XjmHWsM(3cCS+%w62M| zVs2cA+mp zIG7A{xK>I#^<#j2ra3w?(Op8_MbuPMq_OihmCS1vsA6KPeAohFvo$}(bvJ2-a_%K) zHZJouXJE}8S01neT(-ob6e@(gv8d+!!feoH(uMs_4X{BXOAvtp^dSB~4D&c>pV-1p z@zqKP%rZ23hG&F1b7j9cMTb+J1e{zu`ZgDZ_}?%lr>Oi>se#5SXtUdvoIlM-kWEJ; zV62(5JsnJn9#E1kc2=Q-k7 z$U2wU*0g29-9j&fejObChoKB;`EFD!#f{u;0c0t@tTjtl>!9H$#st;s=W0W}@qdgh5x{$Vuum+{* z()EUA7{D5~9l|RCv{M!hO0y~x*QeJuc+yGNj9@ZFO8$5*(r_1j8~E6eW9YqC^97|)IyU)~Pi{^{^bl%8 zCsrnUX7O`E`;T@(_mM%a z`2(UmBGQl+#gN)HWW_EtY)K3Z(DOl{XQ8~x)9n$*6&=Q-6nlE{o@Q&Ah3f`)Ru$P4 zdK@p4KvZWdLr|k3(Gew@oJrd$?2N{7!ZkUHLFmG?zI+Q+*y;$bBO{Cpd@pl+o9#s9 z?m#Jw!0pN@PpA~YC+F_}u1>SjhQPn4##<}&hRvGaxagm5obyhwiDd3q22@@_2CyTe zIScpKXOjtl;|^VkH!m?+&)k!Xh}~vu!}~{H3Fg#8TGy{jaRWvQxa>Eb;_z_L=P)%e 
zRdkiRW||IbnYwpP5TdN^MWLWnw)q8La6}Vqo)#PCt8W}N+_K?gl%AhURCB3C_EW>qIg+-rLFiJ0uLudm zdUquTtAsA#O62-AJKGL`7$l|i?z#J7QmxUdJ1a*bv-GRr*WZp9kVf0C#ysp0p4j% zT%-PFXIr7#zihz(G}GIWKO4^4u)X|tum@R`6TIHA4n6>805g(OqTtOS%d@1#+S!SY znBmz{R$|Ah*W*?(e%!epv7>AjrPQN3u3unm(@sV1R z261bUl$5s#16X;^dj(ulMC{f)+mM)I4m9$poY*vb3&!N)Ab}x|f>A1}{QAbClx-wa zTwBY52{CMMRbvI!X&Y5eTk7V*xHaiy;RBVzShET^kwG;2y4*8U-nyJ}1dN28(UoAB zNjD-%{~-u@Aq8WVNjv1uhD@7CtZ-R0=4n(H`NOnjt@OTZ`YG6v+(D2pQQ9Em=!wj# z`J;shv-NNZY3d_wwNN|74A_rE!NlEO@`&L#w2~H=NA?`w9}Usp==Uu!2{x&0j;2W` zebgMYj@*DaOBOc-oewEX#qNk{BQ1nzx-1+-_T)hZL9iqT2ayz{JKAN7A5#-2K{@t8 zwK|Uo09zOEEh{~W0U>~O2k=#II+Y;=vfw92?vOQ8^61SpMa|p@wah7~rBd@mW^+^s zr0QN}lYo1bj{t7IViG!q9G9Pw>)gcr4x}#1j!-7Gkrala2qnRWNrqokxsZ#(7vb*b zcT-|)uu}Uabz_y#{c=j;*l-KZk#P%ZuWg5U?u{XvY(NG`CMMdHGW^Flg$u8V2*WfJ zAO*gp@;R>3Q)cH#L8L}FaSun8^lXSTrAW8Ve|N-xnCr;}>k)JG_Z381#5uSB5b$&gIyy>L%pME-d8A z=W>9Ps)nMZ&he#<%?2lh5TNmi&FyapMm=u>BI>+?20Z4IvV%8<%c-1mtnn^qgf)rMQIATJn}wt&r3WBPYXYRAVgle9OQr zs}Ycl70CiP=WjKau&+{S9V&6g>7J>v0r9$yEg>s;$_9l^+(DJh&6KaNkZ&KG^D1Zk%C9L#tA@H)2?g< zt-GBf8ueEK3X^m?7TspD@nHWPe*^&E08j&6l;i_5B?2aL?6D+eMY1AIIhzqA5ldn> zDz%U8v9L=X#4W^lT+V?s)o8{{?#v`l6GVl*cd~ zs6n*cUKe}~q)2YS`4)6{1W`swGQO6dHawGcGp+*ZO9-2GM&mhq znTC)3NM_9A9!EtDmOn%1eQPJJu1sc(T&7e+&gsw2VE}HE zvsq`yFJm4p?b%TN{}eMoDVto+T=^(UMxNki|Dd_4a|<_5A2Lh1 zxp_Ef9zv(prb6BmXX}goZYu|;kZ{GjmNGG0$e+h-G#rb?8LDD90nyuQK94s?d`P553 zIwSXgRoMp}<)K7#EL0A5);j+b-x<85lFe9~?DB3v(Jnhdm(qAAjYwMW?_nu5(zR;Y z2R=t8`a^lVMzF&vp?3zG4Clr|vM3MP9;%C-s$!?_@#ZWSHn!kRWy3@*v~uEN+zG9X zNI*(47d#9^=wI`*YZ3K=C?wVoH*2X_5kNb`K&d*B->d{1%c;TJ^g7H?}mRj zsrBXdf=N*YL=zH~hJ&8ml$dCQTan#=iJ+m``$~4290>Stu!G}VYYybhY;<+LkC_TJ z4ELx&%)1tJX2>OzgEeMxO@jqB7{`wJX|UP_1K0Kh7e_#FTLl-I*{|&OvuwZmc)mhT zA@KSZ1BbX@&>@AnvBY|#`Y6-dTq$x>N6HqJ9?W{x;`X%M0oj8h70@+Td+Z=V$2x;D zA-4K#KPuq4OyCoXw5=U{k^(0V1x|nf^_WRd7Yr%oto^msE)vGoqDpcB-3%h*gnG!; z12YoMd%5Ygm4{mfX&DBR((v8VDctB@1{;$acrv{OEzB@WX?8J_<-Fb<`NP&^GP+7( zu44-*DE;Yk#}iUJSyxkNb8Mj6gg#1Ymlhi_OH$Y&HxsQr>X3?+*UvT`9BcO#GT5r& zG#ra0VQNC69NQ9bXzh%)n~kl@O>EF07&G8sTbLl35Fh-g9-*b7MeWx1Xh##ldh%Hi zjd*dfE2Vb-(steMSiwvn=nJSQ{We(M;Mot9VhFZrAoI}~&uzW*Or9^v`bZ(+;))1q zrxY+ozC34UvLedBh#H9iZ47`qNC+9%_v8M)#Oa#RLZpmo>%2WgGUB)5%N4TEmJi@2 zW+^OO&-u2BwE?->Gj}(sXKoZ~QQeB=S2vC=$gSkOW>2Sg*ZB!lZuC?F_hTYRUqOgQ zt>oyY9B!gxwSS_|c)kc{X>qkHk}7iJJ85#QWi#p0*yPHbPYJ?>2o^E3h~ zq3dS`NtEv2f;?aWh6=JTZF-{`27Aatm|8SljEkm^6rx{?h~9RN9!eLJPo#Ccvj>Ef zwIV{lJNzAmbCKGY$;r4pu2Ld{p`GH7a&Nj&m-qtWKxYbkx z-+ihyUlWyA5Y6ldt!)rAKcj>&$wUEg6kxH;1Ye8^(kbiBIxw}ZU7~Rl8B)-*3V+{b zT~ycEZMtotsvj$NG+^p=#`~zTSb}}1q6QRmNb78osve30C3g;zp>_zMN-d_9;g`6? z`h66xzIuxqBvP$vsgpyTGVwA@>%2jk=&;HMP>}5)URl5Wn|TjLPIJj?>6T&XEo z(ukY_o_Vn229`>-*^sR$Kc0;GcKwD0SPde)pt&cIR>^h9Is<5e&#r|2)!uYp=a*LA zgq?M>jx}((+IsVJNMr?|zjy+V*oXbrI2R&Pot-F+2@E9NxT48mAW_2Y1#7t7-m`j~ z6D*kz#)MwntOr|pSB?EI43?u&9KFUF%A1>;R-#nesYd0c-ikYeUShV*b6Gsxn|t5q z%Uu`;LZS_5M$7=TPv$3Dyomn}+5Q1z!M-v=O6Gbn3?kbxcXbZm%E2f;^h-p^(Y&ja zkf2luVu6_Ob`o~KF=>fP0yDysa-K@0063AJ<$b7!jLE2go~v0mtMjo4b5%G1ihb(% z5!nev6Vhl}YBJ@7Esn)^b0rL>Ym+_dcP->AAw5wR>4`DA2)!f=;AkMv>&T^?7K1M5 zqb+0_7R$wN$~KbQXMFx&{8etH^TtY99l-1{vOKVSBVS{~V{HkZWXPp2!l=fj)~rlO z4*~Rate3sXSX3V)WLMZaN!yb^>q#E6ywR-lS22Uh`fQbC(1nGRDMP9g%|G4a?v6kpEnMPZ(wYe;D zNm})Z0y{OMa~7J|C6^KbO}YAH>U18a(#Ic$qwI^R6`e|#g(X`kzY%U9;(D&YJVW05dww5JWX$Ewga&S z9<5x&23F98+|0!Z7nuvt!s9&gkEXx|6GJcRK`)gPBs25U*}GuTeZm=~o2+EcW*<{4 zpy9|C7|m=kEO87KPl1JI2?^KSOuBkw}#>vr_r>+(tBhMWk z=*_t|nMB7TT8(yq#$l7`k3!zkxlAYp38q?)ovXY8 zfZwnA8w6nY?!ix&=Bf7-o_gqd0QK#MOx+%O{$W;CNi$i?>Zv1JC9wf9P3&oM#KfMw z#d5*|midW1jP~&&w5cATgzbU*ecDVOFC6H_TV`PEb0?W&u3!m)uzo}mB8T~J5JyM? 
z)JhoF#G;_u(^2Jud$cAg@|5WW^iG|l>4RZ7n(X$0C_gs4<_!jej*W9Eb+z;oREUu9g|;Sd^3Q_6tvCR9e4%czLaEQOaqzG%rS{j zF%^klhzbC=f>!qwr=qwq@M_r&OgH&m2am=GeIAt@#ORR=!PS)MjT@bONej2uQ zV*ai3S`xWAKhH+Jj=^s(In?_{^>YrrsPzx_apQC-%$NropmCSl;?{gnXG%VjC0Z|W ziX9w8oV>X$BRRj4tM1Av%;)f$FmrZmW_Z}^_F8^NG}1syRx!xqVMfyQgT+>4i@vmi z*wsVq3KfHygO+RH3`{%i*Ga%W42GQ`Mr>=c90R?x+8S3xH+MRs!))izz`dmdjf|M+ zx)SDfnw*v|3!$8P%Bjtn{i`h~dG3W%JCC(yw`tk^cyILC8ulalmYNU$j|LrSsOkvc zAY?MO7VqW?P?n=)=|iTb4UmZ?SniiF(AAsY6cl>^UI^O(IQcyjlc81V9+ zYq_bAlN7L#_aGAn<`DQOj9mCl$2?J3hIzu4Y3HF`6syP>k&2D4XJA*=E*CC0ckW&c zW?C0W;O#kE&ByC1dY1%!e1O#s0Mo1tPI;OLjyl{ix8Lh_0|R+e9>d1r?ob3Y5N*w^ z)ns}DVxXmDSI)DOCkoergS8pe!eyXcl%6Fvu$4=R0QSZG^weG4H$~nw5^GYCeVwx= zAz_j=^%G;OMH+}Qy`Fe+Zd!$O6dhBq?ih%XSDi$t+69aI6&s?i_6EJV&o?Iz+&?(bX`4^6{h@bm6=W0n8XR1mx@|q3iVR#ir9>;$@R4m$M>#zO}PQZ;X%`>u3tJ} zGsHt2e!%R2W^)-f*hON)&Pv!|HSmT8+esx2^vXEmmRs^4|5L3aY=+)B+3dv;#^Z;P{qHwdtw$$MQW#x;iB16U$ z3S>x$j>6uQ!#E%W#xxJ{3TVaHCfAlD3Aj#Ha z9%2=)N(^C9F^VS&g5qKpP#Dq*1%E1MZ*?N-7WIr?#P+p2HZpqM^fA)Wd1(mv6)!$D zGYt?CtT3toeT(&-D?1-%nh%;3@Ye(0)6ocwx493n$S_GY6820b zqP|Ye=5n~3H4S}2V?6)qJ$cZ-*Gz>y3ty(5rkD@RUT92A|$RV3_c7k%uPLS^Y*Zn+c#JZYBl=QwV za%CV<5>JSbOy<~t=Rwxt9lFEWW zrPByhIwOEeiRBA9^j2~Y4`G!bQ^;1t;HAdefYgO@G!q_^;s zSV^ET@x);`#7O>b67^7ahDPeY4O285itcMlAwWcAWfF@Od>Qp+J<>ob&;lt<0zz*x znG5&%#d3TxxNp|#8w4wLiC{(JA=Y@UG-KR#xUW{(P(sr|#2GAfshS%WQ&ST@Rn6vLRsVKyUqx3(zj9y!_o?CArf6*CmbKb z>BReRXx>W`k{8W&j@3H9p4ed}C+7oTxi&kg$)=wwQxS1wyudVb!0GY zrwDf*%ilz4r0rYFi5Tx+J0;9KH*iDgH7qnut~h|c>lffE;qeC|4$DIZK_#3Kqn?O~ z+409K;@C)Dk{+x;uWg=2u`MK-TG0lP9U;Dn)B~qx9yqnkeN`h0(Jes@4YynJvGzRAeMTy^L z837wa0&`ZY1U~Bz5YHjUe&A>lUF9GJwD(L&$l&Fv&vJ^iRVdw_+4sX{ zgB$VE%Uh}7O{I*e097P$oEMWmhVy2DJCqRy>~?0tz(2MKKvIOX7$uJcH^Nc5ju)|h zBzj_^u+WAtvwWoETLeUDZ_14OGJzE04pu+Jp!|v)v08>D$zq?-1 zg2Iua1+tk-1lLZv7XmzW95vBP?5>w1(Qa318TNpnO55FqPLdYLt+X_$!BmCDs={>W z8YiXD2&~!(#Vvx7*+3RX8loVufBXH_AphF!T$#2Q84;01E|2;VhFi9B;t|AjT~ni? 
zgHCG*ePVA`zF+eBWpc9Z2JrtLm;}HXW!_~C{@x9CG1r)@d?}WCwM}2Lku#5q_|h!X zL~IAs2+N^6*gyy^H)&n<$JjES5AjS(Xg)H|0%as?)#@a&=^TfH6uLosw%1cOrkxfkLMXUoAYkR0 z*7>rOLtz_RJc~_^+wNwZjwxvf%4i|Yq==dvp}3w$ewrgul*)n8jmSxklF;Qysvmcl z(1hWr(_?2xtrPMd*}2;trAIv0{J12>sjlJ9VMq7fo&CK=gu)V-RoLUM4+SzMe&a8clh_S^IjCKRE>fkx=-atUwZ@v8mdtpq;J03L>J%j3lB0iJdAX zf@hNbKtald*`Up&>D&mr+AlF9kYN?1LDoi}dX|!2*xvfc-g74tm*i9%&8gMaz`?QR~}Mw7c-L94=Nh!o#tNKt5Bu5eIs%8(f2?i&|5Q^mE>Z zcvMdNqw``5i)}l}>XiG+#CJtzMWHjECAVP94IngkM;z{~^g={)=8Jcxq`vxRf+=r- z21bc*C^Mreva@_s4a&`0;P%(eO3u4G3(hfVsJwZjvjU5qmt86JqjQ+x{&r5?WmESlXZ*GMZKK0 zU$Y5q%;SKo8y+9Kj?RW&(KCixtVr=y0nQ!p`d7J0tR8gIqGZ{H2v%k$A&ea^Qx;QK zZj3E<$x+(nX^elVmH%0!mRfg4Inl|Eq!3%EP9QR`w}8|`Vg{^H>N^zIM3;PV=AasG zq^sL?$MDS3XipLBeNh7hV{gywWKBh517Pw6EE8}{<*ZXNB8`cOZdC?W%Z_Y1=bcVe z=WE!$j%xcRf=-dPMADZr2C6)rjhW+cmthu>OSvQiM-+2B#S1Bfm1S5*XQY_oDIQTc zlW8BFiMV$2G(T9le@ULs2l)|$yy0;advS}JMapi#HHGMKk=f3`p5A5pI=rH=zpjo8 zn5&ek%uz423h~riJfbvv(RS=pnr$W4eWEG9coVi<9g2$u2`CfI4U-kNpP-Z>P9NAC z`AE@}g}e#_)MB=sO$l;C|H5QjQCW-15f(zXJ|Z$_=v(H4Qz8*L6L2F|Gwn7LXuknq z|GNeQ&od%Tkuq>KFqq3LkKyG}8FIX31CDB>T=965da3w-g0!E=n#p$X2EcsIWCwVG z51dNyY2kZAXhoiI(lW0|^ak4v zctInvVWV=-k={sCQj0LPq-~%U z#jTgHB^+ngHi~ss@1M_J3BKwnji1|4RDWU>J=C#(Is$d{5M**oKocXb5bh~1h0UW# zbts$y=MuQI6%3SYVHx$9U*5Ue8yX!BO~!}o6m?2Lqn-~203oXJ_&Yyl?WaiG${@Qq z;O1smD^O;f*h445m@PsQd1yi?ST{b7xq+ZC|7|w+l}#Lt0{cZ`obQc5ur74&TB;+sqTN>R`(xACTtpgrMjHM zR*IY@RxE*&R)!50F&RO6U@=D!B2J+0O7d`u>!>EtJKj~!CJ>PU*FBiv=sxLwn9mru9CJQ9~)WzZXh{q?nYYB zL|(#@f8q36&N}4$T-jnn#0=Kk7RNsbj%_*W#de_(ixM4*$q!j-Xd8d-dl3ehpfg#S%5gRx; zwgyhs`J4?h4ALms+~V+X(2ooBAZ!$+DzU(bT8(!KKBK26i3cPBx)R| zyy7kos_e!o#O0*ZFkEF(DYua%iYjDAwggUDLZav8HE=76oI~kGI2(^gn3&F@W=@NI z>XW)y$vKXWP1tnQH^^rsuaJo+ya)$D=FN*3uqTJDw?a`V>1=EY=Qy084ZXf-%z#Pl zVkHIhxh6gE%GQl0h>o=hqG8`aI!|l9E$5geLb*J-gH;wIR@7od=G`Be)_Bu?W~s4q zy}5%yt>WH*pmLI-rU#XC(lIWjXmrI{>C7Na5OuLB@<{@TVmi>KZahOw6VBio=~4;l zw$e0?9AT+r-!O<6@{IBk`oGHTP8O<$idnc&IjUz`0nZeBQ;6Dz?R|Ct^@Gb?^b%?t z24IO$C~h?m`0}8= z7KXu~-Je@9=FRyIC_$eYP>pFY$P5w~VA1Rf{4^4rR777g8a%NuMv0cMPp@q=yO0XB z;rE!ex63wTGl$GZR{eN-O|JJ^z*ofRGU*1<^eT0RE#2Q|!`QgWs5-MKGqsK~)sHX6 ze*BU2T0PQhc%&iZasWVKQJUJ?({U9kdMT#p{+d$%IbP|P_2Ov?%t4x&l2ZNR&#?ARf-rI#8 zj2pcU^ijzU1sNf99v)eSwT?;0d3Nlv3Fh^Ghu(!vO!0U6spnp;(?8t@c0?TwnA-Sb zBZFpQXrY;qY)v%+bQ9)~9nz9)wV8F;^*SEgi(!tVm|~B5V`E#KLr$iO&9egVLXG+$ zOt`sUI0bWbibx&V#I=0(>9yX}2zqd;HE7Et`4Xk5`HyJBnA@MQM1e(hKh_zd!wvK?At2jTAGz4hZ(M*y;1#y%V~z=K@_AozqD`uNO$gV_h4Yl%&c=?R%O^&*3gK85 zik;m(r#1=LH$yaM+0LF&*TsUJ{K-lQ(CQkIja8#lqd@$kX-OpjCKZb0a@Msfhb+Xa zJ$0Pfor5-*QktELesUq$4_WC zoCR^)oiuWntVyz!o!H;5XS|RJvvslVZy=*=wTJPIEZ#6ra!<|&40<7+$RaLa)XAGA z!N>kA>a8MIE*`mR%kUdBh)HV9UA{7d zIq8Gv4V|c$WF~<$i~N|-vbGvuepwVR@ib~^1{ccVes6c`p8EgWdy^Q+wlqyBG8?M0 zh14YKMU}chDg+A)&sa8m^DuIN00Tk!|&v4K1pTP!?7cCLWmmcBn zHr;#9fBxYcZm2LJ4JCSF_g0B3YZ5Uc4J3tk`&>^;Wbe!WXO{oT%e?42!+>>=~JYqb|U*g zyff&IfAx_MY0L|@P#sK2ZU)yU$0ckT)xXjmJir`0(!#3yucUrXGh>OboZMNrJPqyh zhCus7g|RHey@N>8kXW+zGMCs<#4sYey2B#x>FR#)I5RK+M(d#&Z9Py8LEOq;teAA~ zx<^9SIf`rvB87$%U5Ob$fM#9@95?#|vcQZ}5m;@Zpl=NUeLD^Esox;r5qBS$L{t?P9N!R zi#g?B>6ugo*9x0NjDS|`V6_6!0dS$ib8uU5XpX0{W>ro0-mU0tFw?CCNO59bY&nwG z1MI-qpZ!)LGIz54WU(Elkl_D(@XN#BhIl zdnRT6x-3?+a$VczTHtL5+A41Pe!v1=GR-6h+Y}bkM2X-KOGUr7tu5e!2gD2ZuuH`b z14ACl~tT+>`KvNZJ^ExKdc1 zM%6)e8MXLxBC#iimv=iRx`VdwQ98=r7)l%>C6YqLbtkX$r*cw`^085r=+@t^sWSeWT ztg?Q70qoi0x9t3d`Cy3??Hf=qVgI{nSVMyBMA2XMLABFJ^Vn$ZlW)2F&qMK`GFI-4 z2Q?#00w-&94TV@cf3@A_cT@J}(1bBmm$b(l4jTAwj;RUJrljj7;_@oLHrwlrzWgm? 
zUVpz@Y?gJG+8TK2tq>WmBlg7mG80cEGf^0AqidPNOzhUo#1oB~7`8M1;I}ijz$%h^ zMT`UDmT`$}?u0r28E-cH3i8@|&i_)09zQpZITwp9K{5jHEdUFQf1bsiUk1_<$s;>V zyqZYrAa^(OPxHm|+&%MYe*t>De$)|aga{cqiT9towKYaPkCN2r?Up(YJf=d2XrB5L zUPEo!ZZgn&QTOuGW+X-okid@w?F19~G-0uB4j-9>&N$elO3;^I0<@sKh}(Cgaj|xH zOAHGnKNwFEsY}K0cxQw1S2XBp=a#;qAQ`L)IC{pe2K2md!JXk^`j*QGhytQTt*^jJb?d89V?E(Q^f#>l{MC;#q<;Fj|#X#6NkJfb@77vF(^H z59OS#F9d;`pRq7j0yAoHS0B$k7^sfc>y>N4K!~~3RQ#aie1z_VTiBeFUoS_iF`tyb zKl9@s%q|`pwtu@`ya5e_?eOwxX0LvBHqOTeAStF8FA>>*{D#$eF<}L1x!&@CY6aJJ zv?ztn_FIX zFPP8NZmyJjgkmoN8cr4^m#7fK{}GCZ^)%9CWzJcq1ZuZbngf*1>L<8y@l7Xs4KCMr8J6<8_XGKY79g{Xtwt z3--h5Xjw-~#P`MsC^8G2A%EJ>HR&SPPg5Wwo8%E~ci=wxz(KnB(+8 zu!TBmOW?r7;u3veLYo~(S?&W7>FDr@E;Lol}#jY!I7p z(^03q-8$2nZ;#Sm!uD6n8kL3XF5HUKd;i*DSHSLgbTe(}5%Usp)FO9{Rb9ajtF za{@5pJ`h|Mo7<@^Fhu7pePQ(0YFf7PkwSr=`M_GKh%R2UskM2-69zjWul*Yb+lQG~ zmyBpSXVb-VvEncc)o*U#Klw@{(KTrAR`dRk)`PYLY@*0@RMN zqD-I_4(!`R2Zy&iu=h1)WpixW6$X`iI9CK-YH1GnNFJMwWHQ6uJGS{s zI^LPunkw&hxaJf}rLLY|R^BKk510Am;UHAAt#;4yTx@gfZO#+$fs{Fsx)|fBv54WF zwb-RMJfFEf>c#BctTmw8zQGUQV`bCwVKFVT?y7iRVOIzvhoGC~j{4LU=t|2l8tNXI zU|@UY`L&E{PlZH>niuFDMXcC&wuiT&XehodaG5OlJ@|@1{gi+?#3dd-u-zC#g>5|k?n*0miRmOh^uiNp60NY*wc;Le&+PHo2=%CL@WVLEpc;Kx zE#@#4OA&HCf%s!2td4F8T)0=m?U|aB0y-yQQm4R6Jm{o$1;PaN5`t>!r#RaW6M%`- z)bV(c#et_&@+3iKS^Fr=GEyc&91dTREcO`fP~s;<*41pZ*oK&uj_{m^s-Gm{MnViO z11D(KC@TyIt77J4+5%{?DfL`C*@Z#NI`rMC+klCV1?&l&jj(O{_TnAy>#+><=Ao=O zW)Hs=?<-4qy3tUsD_ONFG`E(N^sKQ#Cs`_4lY{qkdNVI48rPWL zL#UDyL`zcB03Oz;L$-jaWGg_GInl{++Nz4AIvzc2U-Qf&H4A>K}IE7*@~CfvvILVDhHjn(o9gfQLG(RcoC+bzGk1LS>D;z*&CHk;(S?`i}k5d9Fra@1kexyPPCw z)FvD)FyY`b&y#ZI)an9V$=*4J|Kaxac3o8YX?2aC^K?GBW~tgMw!+3U>&kn3xMn#h z;@x*ZMEG9kA-FK=7>+Q5f`hj>Sjm@BgI0M#O|SOh|ExczQk zLWE-%WH5)nO!#&2=65l1X`g|Pn(qt6H}lV4$zB(gXobRWd?4j9S4lW2yb63NNGT5u zww|^cvq}Py=tCV0Y7nJUznKTY%N21)t9RQ*z1zdL3HVc+T_wcC%_->P%SxJRhCd+X z!Y;@dK5;_99fQEQC|N}xlYqC`W_lcUpsIO*?YtCV>Sy6$_Sc>7;wBv5+dp8ivB z%HG_ZT~U1DJDjaLW}Mo-L!l4pLP?qN)^DZ{LLUcXhFyG21Tl9n+1%|cr=5A7 zW2@(KxO>qmyhnAo6&7g5foW%=bCIgP-ESQtvMH%vRg+@^S=%WBHNNORRUZg zvArwUORE&uzOEj>ghT+&?u?9xe;?XKNCH%ejVw3e$F~FFUTUNf_nOWg{K1BzExFJ`SS`Y#TA{`VQS$u zaPR9BmfY!FR;056?TQ~G*mH_ z5EiW6nr+GZy=8}+N*zEwnQBs)T18X|r5dW8Th77Nhi|oUJ0fx5mHyFmHP zIt19W(kxqO_*6G+DFPkEqpoFntnZ0^G`cOOv|U6ep|svdg9yKh2dUyYf@0gO_$G-mQl!Be2G3=~8b<6Z-U<`g^3LZaV8RUTem zpXZ+T#nnwtEfds>ei%*f$W;<0zxmxIJ!yq(Gn;cXkBnPPZayM5jaYe}Z~D5IZO__h z*Sg+_b~w7!n0J#m^r)lHk5O3ld{jZ&`*1!T-B&nbZkgXo@g5KeV5zq)y|olczJBPaN#?&QnxB^AS{+1I8=*oc8Exhif)EZU5VTTd#+qQ%v^Zhg-<|F6Ok9&Jhn7GTBrqEyeF6#4ns?dodp8kjt)C?2S5ltB0so;#%5n7n?O1 z`;ygH+^A>f5W8m1=6+smL;roeXUR;!W`cNmE~;C-B$>jl&@v?Aa?AutDoB^^puF&Ckz2`st< z1u0^Na9=bWNeLQ6L{x8(u7yOjZ7RafU?lAiyZED>9F$lfFMBR}5}u?ajEL4dMvCxzVF^pXVVZPelBFPjR`YaF{tG zKuwS8_vLHJ=bg0-S}<$RD3G_O$5R_hSj{bM;3+qUI~FIllWtOd z{q{>CJflPxN>!a?O&Ipka1EMx4N$NK%((=Mt#XKdUtzJ)5N?mt#p2WG0o)&^kqNG> zLyn^Z9Y<9en~c6}wtuvj2w2Qeq1#5Zr!l-6ap?NzRe4U5lKA=oWID5z~g#r;tF zqer?SJ4$BGM>ASYI|>VRY2l>mQAm^FI&|PD#6te2fXIh)b735h=Vd)s0h-r9;w&Sk zl6&X;nKp9@n?&|}CXp;*S*`Mq+NF-%|LRdQe9T*23c?fzSst+AwP18>?>Dtkm?_^pkb^7x1_q1IdTWIW~d0N1dr zF6Kj#=$d)&qq)$Ak8S~{>GAZnE;VsZxUOoH%FA8=4yU0aHYpcMkYYMcz-*cU({r_Y z^QYOgCZO2eThkq}%f1C~Nq2XM-o1`EEavw)PbEk^Zg#@+nOoR9>rv$fsV~Xvp@S6EBEd>2hi>A*G}t_q4W6xT?HkRprNbH%G53 zDP=DYDvlQK+;c37imYOH;CnP3kTYzqM-Kx^>7Jet7+%J%SfGgtQzEVVf4rc_vjl3Q z$8&^uM!9|^QqRzBk-i+VnIT=a4SghYd1#WgPGJb4Q1l;QyFE!{95w3GO=Vr`p&HQm z=B8$MID$H<0ho6Gk2lN|Y{~fihizgGDhoR@#bv(`An3^>?Ekg@*himElD(C z)`HQ`yx>{^>O^$4;jcs81CNjqb@4pD7++4ZIxvYzQffUBDJrnN7MnFi);tuw;Ff}( z;wOpIa)ddiCq@XCahC_zsxuz3=gei|Mjg{(*<1VV8m2+nQA~qW?^1Zof=VWPV;D8T 
[base85 binary patch data omitted]

diff --git a/python/examples/dot.py b/python/examples/dot.py
new file mode 100644
index 000000000..e7c7b1664
--- /dev/null
+++ b/python/examples/dot.py
@@ -0,0 +1,42 @@
+import libtriton
+
+src = """
+const tunable int TM = {128};
+const tunable int TN = {128};
+const tunable int TK = {32};
+
+void matmul(restrict read_only align(16) half *A,
+            restrict read_only align(16) half *B,
+            restrict align(16) half *C,
+            int M, int N, int K,
+            multiple_of(8) int lda, multiple_of(8) int ldb, int ldc) {
+  int ridx = get_range_id(0);
+  int ridy = get_range_id(1);
+  int rxa[TM] = ridx * TM + (0 ... TM);
+  int ryb[TN] = ridy * TN + (0 ... TN);
+  int rka[TK] = 0 ... TK;
+  int rkb[TK] = 0 ... TK;
+  float xc[TM, TN] = 0;
+  half* pa[TM, TK] = A + rka[newaxis, :]*lda + rxa[:, newaxis];
+  half* pb[TN, TK] = B + rkb[newaxis, :]*ldb + ryb[:, newaxis];
+  half a[TM, TK] = *pa;
+  half b[TN, TK] = *pb;
+  for(int k = K; k > 0; k = k - TK){
+    xc = dot(a, trans(b), xc);
+    pa = pa + TK*lda;
+    pb = pb + TK*ldb;
+    a = *pa;
+    b = *pb;
+  }
+  int rxc[TM] = ridx * TM + (0 ... TM);
+  int ryc[TN] = ridy * TN + (0 ... TN);
+  half* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis];
+  half c[TM, TN] = xc;
+  bool checkc0[TM] = rxc < M;
+  bool checkc1[TN] = ryc < N;
+  bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];
+  @checkc *pc = c;
+}
+"""
+
+print(libtriton.make_tensorflow_src(src, [2], '(M + #TM - 1)/#TM, (N + #TN - 1)/#TN, 1'))
\ No newline at end of file
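Stripped of its tiling, the kernel above computes C = A * B^T: pa walks
A[m + k*lda], pb walks B[n + k*ldb], and pc stores to C[m + n*ldc]. A scalar
reference loop for checking outputs follows (a sketch only, with float standing
in for half; matmul_ref is not part of this patch):

// Scalar reference matching the kernel's pointer arithmetic.
void matmul_ref(const float *A, const float *B, float *C,
                int M, int N, int K, int lda, int ldb, int ldc) {
    for (int m = 0; m < M; m++)
        for (int n = 0; n < N; n++) {
            float acc = 0.0f;
            for (int k = 0; k < K; k++)
                acc += A[m + k*lda] * B[n + k*ldb]; // the dot(a, trans(b), xc) accumulation
            C[m + n*ldc] = acc;                     // masked by checkc in the kernel
        }
}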
diff --git a/python/setup.py b/python/setup.py
new file mode 100644
index 000000000..057362b0f
--- /dev/null
+++ b/python/setup.py
@@ -0,0 +1,76 @@
+import os
+import re
+import sys
+import sysconfig
+import platform
+import subprocess
+import distutils
+
+from distutils.version import LooseVersion
+from setuptools import setup, Extension, find_packages
+from setuptools.command.build_ext import build_ext
+from setuptools.command.test import test as TestCommand
+
+class CMakeExtension(Extension):
+    def __init__(self, name, sourcedir=''):
+        Extension.__init__(self, name, sources=[])
+        self.sourcedir = os.path.abspath(sourcedir)
+
+
+class CMakeBuild(build_ext):
+    def run(self):
+        try:
+            out = subprocess.check_output(['cmake', '--version'])
+        except OSError:
+            raise RuntimeError("CMake must be installed to build the following extensions: " +
+                               ", ".join(e.name for e in self.extensions))
+
+        if platform.system() == "Windows":
+            cmake_version = LooseVersion(re.search(r'version\s*([\d.]+)', out.decode()).group(1))
+            if cmake_version < '3.1.0':
+                raise RuntimeError("CMake >= 3.1.0 is required on Windows")
+
+        for ext in self.extensions:
+            self.build_extension(ext)
+
+    def build_extension(self, ext):
+        extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))
+        python_include_dirs = distutils.sysconfig.get_python_inc()
+        python_lib_dirs = distutils.sysconfig.get_config_var('LIBDIR')
+        cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir,
+                      '-DBUILD_EXAMPLES=OFF',
+                      '-DBUILD_PYTHON_MODULE=ON',
+                      '-DPYTHON_INCLUDE_DIRS=' + python_include_dirs]
+
+        cfg = 'Debug' if self.debug else 'Release'
+        build_args = ['--config', cfg]
+
+        if platform.system() == "Windows":
+            cmake_args += ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format(cfg.upper(), extdir)]
+            if sys.maxsize > 2**32:
+                cmake_args += ['-A', 'x64']
+            build_args += ['--', '/m']
+        else:
+            cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg]
+            build_args += ['--', '-j4']
+
+        env = os.environ.copy()
+        env['CXXFLAGS'] = '{} -DVERSION_INFO=\\"{}\\"'.format(env.get('CXXFLAGS', ''),
+                                                              self.distribution.get_version())
+        if not os.path.exists(self.build_temp):
+            os.makedirs(self.build_temp)
+        sourcedir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
+        subprocess.check_call(['cmake', sourcedir] + cmake_args, cwd=self.build_temp, env=env)
+        subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp)
+
+setup(
+    name='triton',
+    version='0.1',
+    author='Philippe Tillet',
+    author_email='ptillet@g.harvard.edu',
+    description='A language and compiler for custom Deep Learning operations',
+    long_description='',
+    ext_modules=[CMakeExtension('triton')],
+    cmdclass=dict(build_ext=CMakeBuild),
+    zip_safe=False,
+)
diff --git a/python/src/pybind11/attr.h b/python/src/pybind11/attr.h
new file mode 100644
index 000000000..6962d6fc5
--- /dev/null
+++ b/python/src/pybind11/attr.h
@@ -0,0 +1,493 @@
+/*
+    pybind11/attr.h: Infrastructure for processing custom
+    type and function attributes
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "cast.h"
+
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+/// \addtogroup annotations
+/// @{
+
+/// Annotation for methods
+struct is_method { handle class_; is_method(const handle &c) : class_(c) { } };
+
+/// Annotation for operators
+struct is_operator { };
+
+/// Annotation for parent scope
+struct scope { handle value; scope(const handle &s) : value(s) { } };
+
+/// Annotation for documentation
+struct doc { const char *value; doc(const char *value) : value(value) { } };
+
+/// Annotation for function names
+struct name { const char *value; name(const char *value) : value(value) { } };
+
+/// Annotation indicating that a function is an overload associated with a given "sibling"
+struct sibling { handle value; sibling(const handle &value) : value(value.ptr()) { } };
+
+/// Annotation indicating that a class derives from another given type
+template <typename T> struct base {
+    PYBIND11_DEPRECATED("base<T>() was deprecated in favor of specifying 'T' as a template argument to class_")
+    base() { }
+};
+
+/// Keep patient alive while nurse lives
+template <size_t Nurse, size_t Patient> struct keep_alive { };
+
+/// Annotation indicating that a class is involved in a multiple inheritance relationship
+struct multiple_inheritance { };
+
+/// Annotation which enables dynamic attributes, i.e. adds `__dict__` to a class
+struct dynamic_attr { };
+
+/// Annotation which enables the buffer protocol for a type
+struct buffer_protocol { };
+
+/// Annotation which requests that a special metaclass is created for a type
+struct metaclass {
+    handle value;
+
+    PYBIND11_DEPRECATED("py::metaclass() is no longer required. It's turned on by default now.")
+    metaclass() {}
+
+    /// Override pybind11's default metaclass
+    explicit metaclass(handle value) : value(value) { }
+};
+
+/// Annotation that marks a class as local to the module:
+struct module_local { const bool value; constexpr module_local(bool v = true) : value(v) { } };
+
+/// Annotation to mark enums as an arithmetic type
+struct arithmetic { };
+
+/** \rst
+    A call policy which places one or more guard variables (``Ts...``) around the function call.
+
+    For example, this definition:
+
+    .. code-block:: cpp
+
+        m.def("foo", foo, py::call_guard<T>());
+
+    is equivalent to the following pseudocode:
+
+    .. code-block:: cpp
+
+        m.def("foo", [](args...) {
+            T scope_guard;
+            return foo(args...); // forwarded arguments
+        });
+ \endrst */
+template <typename... Ts> struct call_guard;
+
+template <> struct call_guard<> { using type = detail::void_type; };
+
+template <typename T>
+struct call_guard<T> {
+    static_assert(std::is_default_constructible<T>::value,
+                  "The guard type must be default constructible");
+
+    using type = T;
+};
+
+template <typename T, typename... Ts>
+struct call_guard<T, Ts...> {
+    struct type {
+        T guard{}; // Compose multiple guard types with left-to-right default-constructor order
+        typename call_guard<Ts...>::type next{};
+    };
+};
+
+/// @} annotations
+
+NAMESPACE_BEGIN(detail)
+/* Forward declarations */
+enum op_id : int;
+enum op_type : int;
+struct undefined_t;
+template <op_id id, op_type ot, typename L = undefined_t, typename R = undefined_t> struct op_;
+inline void keep_alive_impl(size_t Nurse, size_t Patient, function_call &call, handle ret);
+
+/// Internal data structure which holds metadata about a keyword argument
+struct argument_record {
+    const char *name;  ///< Argument name
+    const char *descr; ///< Human-readable version of the argument value
+    handle value;      ///< Associated Python object
+    bool convert : 1;  ///< True if the argument is allowed to convert when loading
+    bool none : 1;     ///< True if None is allowed when loading
+
+    argument_record(const char *name, const char *descr, handle value, bool convert, bool none)
+        : name(name), descr(descr), value(value), convert(convert), none(none) { }
+};
+
+/// Internal data structure which holds metadata about a bound function (signature, overloads, etc.)
+struct function_record {
+    function_record()
+        : is_constructor(false), is_new_style_constructor(false), is_stateless(false),
+          is_operator(false), has_args(false), has_kwargs(false), is_method(false) { }
+
+    /// Function name
+    char *name = nullptr; /* why no C++ strings? They generate heavier code.. */
+
+    // User-specified documentation string
+    char *doc = nullptr;
+
+    /// Human-readable version of the function signature
+    char *signature = nullptr;
+
+    /// List of registered keyword arguments
+    std::vector<argument_record> args;
+
+    /// Pointer to lambda function which converts arguments and performs the actual call
+    handle (*impl) (function_call &) = nullptr;
+
+    /// Storage for the wrapped function pointer and captured data, if any
+    void *data[3] = { };
+
+    /// Pointer to custom destructor for 'data' (if needed)
+    void (*free_data) (function_record *ptr) = nullptr;
+
+    /// Return value policy associated with this function
+    return_value_policy policy = return_value_policy::automatic;
+
+    /// True if name == '__init__'
+    bool is_constructor : 1;
+
+    /// True if this is a new-style `__init__` defined in `detail/init.h`
+    bool is_new_style_constructor : 1;
+
+    /// True if this is a stateless function pointer
+    bool is_stateless : 1;
+
+    /// True if this is an operator (__add__), etc.
+ bool is_operator : 1; + + /// True if the function has a '*args' argument + bool has_args : 1; + + /// True if the function has a '**kwargs' argument + bool has_kwargs : 1; + + /// True if this is a method + bool is_method : 1; + + /// Number of arguments (including py::args and/or py::kwargs, if present) + std::uint16_t nargs; + + /// Python method object + PyMethodDef *def = nullptr; + + /// Python handle to the parent scope (a class or a module) + handle scope; + + /// Python handle to the sibling function representing an overload chain + handle sibling; + + /// Pointer to next overload + function_record *next = nullptr; +}; + +/// Special data structure which (temporarily) holds metadata about a bound class +struct type_record { + PYBIND11_NOINLINE type_record() + : multiple_inheritance(false), dynamic_attr(false), buffer_protocol(false), + default_holder(true), module_local(false) { } + + /// Handle to the parent scope + handle scope; + + /// Name of the class + const char *name = nullptr; + + // Pointer to RTTI type_info data structure + const std::type_info *type = nullptr; + + /// How large is the underlying C++ type? + size_t type_size = 0; + + /// What is the alignment of the underlying C++ type? + size_t type_align = 0; + + /// How large is the type's holder? + size_t holder_size = 0; + + /// The global operator new can be overridden with a class-specific variant + void *(*operator_new)(size_t) = nullptr; + + /// Function pointer to class_<..>::init_instance + void (*init_instance)(instance *, const void *) = nullptr; + + /// Function pointer to class_<..>::dealloc + void (*dealloc)(detail::value_and_holder &) = nullptr; + + /// List of base classes of the newly created type + list bases; + + /// Optional docstring + const char *doc = nullptr; + + /// Custom metaclass (optional) + handle metaclass; + + /// Multiple inheritance marker + bool multiple_inheritance : 1; + + /// Does the class manage a __dict__? + bool dynamic_attr : 1; + + /// Does the class implement the buffer protocol? + bool buffer_protocol : 1; + + /// Is the default (unique_ptr) holder type used? + bool default_holder : 1; + + /// Is the class definition local to the module shared object? + bool module_local : 1; + + PYBIND11_NOINLINE void add_base(const std::type_info &base, void *(*caster)(void *)) { + auto base_info = detail::get_type_info(base, false); + if (!base_info) { + std::string tname(base.name()); + detail::clean_type_id(tname); + pybind11_fail("generic_type: type \"" + std::string(name) + + "\" referenced unknown base type \"" + tname + "\""); + } + + if (default_holder != base_info->default_holder) { + std::string tname(base.name()); + detail::clean_type_id(tname); + pybind11_fail("generic_type: type \"" + std::string(name) + "\" " + + (default_holder ? "does not have" : "has") + + " a non-default holder type while its base \"" + tname + "\" " + + (base_info->default_holder ? "does not" : "does")); + } + + bases.append((PyObject *) base_info->type); + + if (base_info->type->tp_dictoffset != 0) + dynamic_attr = true; + + if (caster) + base_info->implicit_casts.emplace_back(type, caster); + } +}; + +inline function_call::function_call(const function_record &f, handle p) : + func(f), parent(p) { + args.reserve(f.nargs); + args_convert.reserve(f.nargs); +} + +/// Tag for a new-style `__init__` defined in `detail/init.h` +struct is_new_style_constructor { }; + +/** + * Partial template specializations to process custom attributes provided to + * cpp_function_ and class_. 
These are either used to initialize the respective + * fields in the type_record and function_record data structures or executed at + * runtime to deal with custom call policies (e.g. keep_alive). + */ +template struct process_attribute; + +template struct process_attribute_default { + /// Default implementation: do nothing + static void init(const T &, function_record *) { } + static void init(const T &, type_record *) { } + static void precall(function_call &) { } + static void postcall(function_call &, handle) { } +}; + +/// Process an attribute specifying the function's name +template <> struct process_attribute : process_attribute_default { + static void init(const name &n, function_record *r) { r->name = const_cast(n.value); } +}; + +/// Process an attribute specifying the function's docstring +template <> struct process_attribute : process_attribute_default { + static void init(const doc &n, function_record *r) { r->doc = const_cast(n.value); } +}; + +/// Process an attribute specifying the function's docstring (provided as a C-style string) +template <> struct process_attribute : process_attribute_default { + static void init(const char *d, function_record *r) { r->doc = const_cast(d); } + static void init(const char *d, type_record *r) { r->doc = const_cast(d); } +}; +template <> struct process_attribute : process_attribute { }; + +/// Process an attribute indicating the function's return value policy +template <> struct process_attribute : process_attribute_default { + static void init(const return_value_policy &p, function_record *r) { r->policy = p; } +}; + +/// Process an attribute which indicates that this is an overloaded function associated with a given sibling +template <> struct process_attribute : process_attribute_default { + static void init(const sibling &s, function_record *r) { r->sibling = s.value; } +}; + +/// Process an attribute which indicates that this function is a method +template <> struct process_attribute : process_attribute_default { + static void init(const is_method &s, function_record *r) { r->is_method = true; r->scope = s.class_; } +}; + +/// Process an attribute which indicates the parent scope of a method +template <> struct process_attribute : process_attribute_default { + static void init(const scope &s, function_record *r) { r->scope = s.value; } +}; + +/// Process an attribute which indicates that this function is an operator +template <> struct process_attribute : process_attribute_default { + static void init(const is_operator &, function_record *r) { r->is_operator = true; } +}; + +template <> struct process_attribute : process_attribute_default { + static void init(const is_new_style_constructor &, function_record *r) { r->is_new_style_constructor = true; } +}; + +/// Process a keyword argument attribute (*without* a default value) +template <> struct process_attribute : process_attribute_default { + static void init(const arg &a, function_record *r) { + if (r->is_method && r->args.empty()) + r->args.emplace_back("self", nullptr, handle(), true /*convert*/, false /*none not allowed*/); + r->args.emplace_back(a.name, nullptr, handle(), !a.flag_noconvert, a.flag_none); + } +}; + +/// Process a keyword argument attribute (*with* a default value) +template <> struct process_attribute : process_attribute_default { + static void init(const arg_v &a, function_record *r) { + if (r->is_method && r->args.empty()) + r->args.emplace_back("self", nullptr /*descr*/, handle() /*parent*/, true /*convert*/, false /*none not allowed*/); + + if (!a.value) { 
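+            // Reached when the default value could not be converted to a Python
+            // object at binding time, typically because its C++ type has not yet
+            // been registered with pybind11 when def() runs.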
+#if !defined(NDEBUG) + std::string descr("'"); + if (a.name) descr += std::string(a.name) + ": "; + descr += a.type + "'"; + if (r->is_method) { + if (r->name) + descr += " in method '" + (std::string) str(r->scope) + "." + (std::string) r->name + "'"; + else + descr += " in method of '" + (std::string) str(r->scope) + "'"; + } else if (r->name) { + descr += " in function '" + (std::string) r->name + "'"; + } + pybind11_fail("arg(): could not convert default argument " + + descr + " into a Python object (type not registered yet?)"); +#else + pybind11_fail("arg(): could not convert default argument " + "into a Python object (type not registered yet?). " + "Compile in debug mode for more information."); +#endif + } + r->args.emplace_back(a.name, a.descr, a.value.inc_ref(), !a.flag_noconvert, a.flag_none); + } +}; + +/// Process a parent class attribute. Single inheritance only (class_ itself already guarantees that) +template +struct process_attribute::value>> : process_attribute_default { + static void init(const handle &h, type_record *r) { r->bases.append(h); } +}; + +/// Process a parent class attribute (deprecated, does not support multiple inheritance) +template +struct process_attribute> : process_attribute_default> { + static void init(const base &, type_record *r) { r->add_base(typeid(T), nullptr); } +}; + +/// Process a multiple inheritance attribute +template <> +struct process_attribute : process_attribute_default { + static void init(const multiple_inheritance &, type_record *r) { r->multiple_inheritance = true; } +}; + +template <> +struct process_attribute : process_attribute_default { + static void init(const dynamic_attr &, type_record *r) { r->dynamic_attr = true; } +}; + +template <> +struct process_attribute : process_attribute_default { + static void init(const buffer_protocol &, type_record *r) { r->buffer_protocol = true; } +}; + +template <> +struct process_attribute : process_attribute_default { + static void init(const metaclass &m, type_record *r) { r->metaclass = m.value; } +}; + +template <> +struct process_attribute : process_attribute_default { + static void init(const module_local &l, type_record *r) { r->module_local = l.value; } +}; + +/// Process an 'arithmetic' attribute for enums (does nothing here) +template <> +struct process_attribute : process_attribute_default {}; + +template +struct process_attribute> : process_attribute_default> { }; + +/** + * Process a keep_alive call policy -- invokes keep_alive_impl during the + * pre-call handler if both Nurse, Patient != 0 and use the post-call handler + * otherwise + */ +template struct process_attribute> : public process_attribute_default> { + template = 0> + static void precall(function_call &call) { keep_alive_impl(Nurse, Patient, call, handle()); } + template = 0> + static void postcall(function_call &, handle) { } + template = 0> + static void precall(function_call &) { } + template = 0> + static void postcall(function_call &call, handle ret) { keep_alive_impl(Nurse, Patient, call, ret); } +}; + +/// Recursively iterate over variadic template arguments +template struct process_attributes { + static void init(const Args&... args, function_record *r) { + int unused[] = { 0, (process_attribute::type>::init(args, r), 0) ... }; + ignore_unused(unused); + } + static void init(const Args&... args, type_record *r) { + int unused[] = { 0, (process_attribute::type>::init(args, r), 0) ... 
}; + ignore_unused(unused); + } + static void precall(function_call &call) { + int unused[] = { 0, (process_attribute::type>::precall(call), 0) ... }; + ignore_unused(unused); + } + static void postcall(function_call &call, handle fn_ret) { + int unused[] = { 0, (process_attribute::type>::postcall(call, fn_ret), 0) ... }; + ignore_unused(unused); + } +}; + +template +using is_call_guard = is_instantiation; + +/// Extract the ``type`` from the first `call_guard` in `Extras...` (or `void_type` if none found) +template +using extract_guard_t = typename exactly_one_t, Extra...>::type; + +/// Check the number of named arguments at compile time +template ::value...), + size_t self = constexpr_sum(std::is_same::value...)> +constexpr bool expected_num_args(size_t nargs, bool has_args, bool has_kwargs) { + return named == 0 || (self + named + has_args + has_kwargs) == nargs; +} + +NAMESPACE_END(detail) +NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/python/src/pybind11/buffer_info.h b/python/src/pybind11/buffer_info.h new file mode 100644 index 000000000..9f072fa73 --- /dev/null +++ b/python/src/pybind11/buffer_info.h @@ -0,0 +1,108 @@ +/* + pybind11/buffer_info.h: Python buffer object interface + + Copyright (c) 2016 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. +*/ + +#pragma once + +#include "detail/common.h" + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) + +/// Information record describing a Python buffer object +struct buffer_info { + void *ptr = nullptr; // Pointer to the underlying storage + ssize_t itemsize = 0; // Size of individual items in bytes + ssize_t size = 0; // Total number of entries + std::string format; // For homogeneous buffers, this should be set to format_descriptor::format() + ssize_t ndim = 0; // Number of dimensions + std::vector shape; // Shape of the tensor (1 entry per dimension) + std::vector strides; // Number of entries between adjacent entries (for each per dimension) + + buffer_info() { } + + buffer_info(void *ptr, ssize_t itemsize, const std::string &format, ssize_t ndim, + detail::any_container shape_in, detail::any_container strides_in) + : ptr(ptr), itemsize(itemsize), size(1), format(format), ndim(ndim), + shape(std::move(shape_in)), strides(std::move(strides_in)) { + if (ndim != (ssize_t) shape.size() || ndim != (ssize_t) strides.size()) + pybind11_fail("buffer_info: ndim doesn't match shape and/or strides length"); + for (size_t i = 0; i < (size_t) ndim; ++i) + size *= shape[i]; + } + + template + buffer_info(T *ptr, detail::any_container shape_in, detail::any_container strides_in) + : buffer_info(private_ctr_tag(), ptr, sizeof(T), format_descriptor::format(), static_cast(shape_in->size()), std::move(shape_in), std::move(strides_in)) { } + + buffer_info(void *ptr, ssize_t itemsize, const std::string &format, ssize_t size) + : buffer_info(ptr, itemsize, format, 1, {size}, {itemsize}) { } + + template + buffer_info(T *ptr, ssize_t size) + : buffer_info(ptr, sizeof(T), format_descriptor::format(), size) { } + + explicit buffer_info(Py_buffer *view, bool ownview = true) + : buffer_info(view->buf, view->itemsize, view->format, view->ndim, + {view->shape, view->shape + view->ndim}, {view->strides, view->strides + view->ndim}) { + this->view = view; + this->ownview = ownview; + } + + buffer_info(const buffer_info &) = delete; + buffer_info& operator=(const buffer_info &) = delete; + + buffer_info(buffer_info &&other) { + (*this) = std::move(other); + } + + buffer_info& 
operator=(buffer_info &&rhs) { + ptr = rhs.ptr; + itemsize = rhs.itemsize; + size = rhs.size; + format = std::move(rhs.format); + ndim = rhs.ndim; + shape = std::move(rhs.shape); + strides = std::move(rhs.strides); + std::swap(view, rhs.view); + std::swap(ownview, rhs.ownview); + return *this; + } + + ~buffer_info() { + if (view && ownview) { PyBuffer_Release(view); delete view; } + } + +private: + struct private_ctr_tag { }; + + buffer_info(private_ctr_tag, void *ptr, ssize_t itemsize, const std::string &format, ssize_t ndim, + detail::any_container &&shape_in, detail::any_container &&strides_in) + : buffer_info(ptr, itemsize, format, ndim, std::move(shape_in), std::move(strides_in)) { } + + Py_buffer *view = nullptr; + bool ownview = false; +}; + +NAMESPACE_BEGIN(detail) + +template struct compare_buffer_info { + static bool compare(const buffer_info& b) { + return b.format == format_descriptor::format() && b.itemsize == (ssize_t) sizeof(T); + } +}; + +template struct compare_buffer_info::value>> { + static bool compare(const buffer_info& b) { + return (size_t) b.itemsize == sizeof(T) && (b.format == format_descriptor::value || + ((sizeof(T) == sizeof(long)) && b.format == (std::is_unsigned::value ? "L" : "l")) || + ((sizeof(T) == sizeof(size_t)) && b.format == (std::is_unsigned::value ? "N" : "n"))); + } +}; + +NAMESPACE_END(detail) +NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/python/src/pybind11/cast.h b/python/src/pybind11/cast.h new file mode 100644 index 000000000..8d0fd5d90 --- /dev/null +++ b/python/src/pybind11/cast.h @@ -0,0 +1,2128 @@ +/* + pybind11/cast.h: Partial template specializations to cast between + C++ and Python types + + Copyright (c) 2016 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. +*/ + +#pragma once + +#include "pytypes.h" +#include "detail/typeid.h" +#include "detail/descr.h" +#include "detail/internals.h" +#include +#include +#include +#include + +#if defined(PYBIND11_CPP17) +# if defined(__has_include) +# if __has_include() +# define PYBIND11_HAS_STRING_VIEW +# endif +# elif defined(_MSC_VER) +# define PYBIND11_HAS_STRING_VIEW +# endif +#endif +#ifdef PYBIND11_HAS_STRING_VIEW +#include +#endif + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +NAMESPACE_BEGIN(detail) + +/// A life support system for temporary objects created by `type_caster::load()`. +/// Adding a patient will keep it alive up until the enclosing function returns. +class loader_life_support { +public: + /// A new patient frame is created when a function is entered + loader_life_support() { + get_internals().loader_patient_stack.push_back(nullptr); + } + + /// ... and destroyed after it returns + ~loader_life_support() { + auto &stack = get_internals().loader_patient_stack; + if (stack.empty()) + pybind11_fail("loader_life_support: internal error"); + + auto ptr = stack.back(); + stack.pop_back(); + Py_CLEAR(ptr); + + // A heuristic to reduce the stack's capacity (e.g. after long recursive calls) + if (stack.capacity() > 16 && stack.size() != 0 && stack.capacity() / stack.size() > 2) + stack.shrink_to_fit(); + } + + /// This can only be used inside a pybind11-bound function, either by `argument_loader` + /// at argument preparation time or by `py::cast()` at execution time. 
+ PYBIND11_NOINLINE static void add_patient(handle h) { + auto &stack = get_internals().loader_patient_stack; + if (stack.empty()) + throw cast_error("When called outside a bound function, py::cast() cannot " + "do Python -> C++ conversions which require the creation " + "of temporary values"); + + auto &list_ptr = stack.back(); + if (list_ptr == nullptr) { + list_ptr = PyList_New(1); + if (!list_ptr) + pybind11_fail("loader_life_support: error allocating list"); + PyList_SET_ITEM(list_ptr, 0, h.inc_ref().ptr()); + } else { + auto result = PyList_Append(list_ptr, h.ptr()); + if (result == -1) + pybind11_fail("loader_life_support: error adding patient"); + } + } +}; + +// Gets the cache entry for the given type, creating it if necessary. The return value is the pair +// returned by emplace, i.e. an iterator for the entry and a bool set to `true` if the entry was +// just created. +inline std::pair all_type_info_get_cache(PyTypeObject *type); + +// Populates a just-created cache entry. +PYBIND11_NOINLINE inline void all_type_info_populate(PyTypeObject *t, std::vector &bases) { + std::vector check; + for (handle parent : reinterpret_borrow(t->tp_bases)) + check.push_back((PyTypeObject *) parent.ptr()); + + auto const &type_dict = get_internals().registered_types_py; + for (size_t i = 0; i < check.size(); i++) { + auto type = check[i]; + // Ignore Python2 old-style class super type: + if (!PyType_Check((PyObject *) type)) continue; + + // Check `type` in the current set of registered python types: + auto it = type_dict.find(type); + if (it != type_dict.end()) { + // We found a cache entry for it, so it's either pybind-registered or has pre-computed + // pybind bases, but we have to make sure we haven't already seen the type(s) before: we + // want to follow Python/virtual C++ rules that there should only be one instance of a + // common base. + for (auto *tinfo : it->second) { + // NB: Could use a second set here, rather than doing a linear search, but since + // having a large number of immediate pybind11-registered types seems fairly + // unlikely, that probably isn't worthwhile. + bool found = false; + for (auto *known : bases) { + if (known == tinfo) { found = true; break; } + } + if (!found) bases.push_back(tinfo); + } + } + else if (type->tp_bases) { + // It's some python type, so keep follow its bases classes to look for one or more + // registered types + if (i + 1 == check.size()) { + // When we're at the end, we can pop off the current element to avoid growing + // `check` when adding just one base (which is typical--i.e. when there is no + // multiple inheritance) + check.pop_back(); + i--; + } + for (handle parent : reinterpret_borrow(type->tp_bases)) + check.push_back((PyTypeObject *) parent.ptr()); + } + } +} + +/** + * Extracts vector of type_info pointers of pybind-registered roots of the given Python type. Will + * be just 1 pybind type for the Python type of a pybind-registered class, or for any Python-side + * derived class that uses single inheritance. Will contain as many types as required for a Python + * class that uses multiple inheritance to inherit (directly or indirectly) from multiple + * pybind-registered classes. Will be empty if neither the type nor any base classes are + * pybind-registered. + * + * The value is cached for the lifetime of the Python type. 
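+ *
+ * Example (a sketch): for a Python class `D(A, B)` deriving from two
+ * pybind11-registered C++ bases, all_type_info(Py_TYPE(obj)) yields one
+ * entry per registered root; plain single inheritance yields exactly one.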
+ */ +inline const std::vector &all_type_info(PyTypeObject *type) { + auto ins = all_type_info_get_cache(type); + if (ins.second) + // New cache entry: populate it + all_type_info_populate(type, ins.first->second); + + return ins.first->second; +} + +/** + * Gets a single pybind11 type info for a python type. Returns nullptr if neither the type nor any + * ancestors are pybind11-registered. Throws an exception if there are multiple bases--use + * `all_type_info` instead if you want to support multiple bases. + */ +PYBIND11_NOINLINE inline detail::type_info* get_type_info(PyTypeObject *type) { + auto &bases = all_type_info(type); + if (bases.size() == 0) + return nullptr; + if (bases.size() > 1) + pybind11_fail("pybind11::detail::get_type_info: type has multiple pybind11-registered bases"); + return bases.front(); +} + +inline detail::type_info *get_local_type_info(const std::type_index &tp) { + auto &locals = registered_local_types_cpp(); + auto it = locals.find(tp); + if (it != locals.end()) + return it->second; + return nullptr; +} + +inline detail::type_info *get_global_type_info(const std::type_index &tp) { + auto &types = get_internals().registered_types_cpp; + auto it = types.find(tp); + if (it != types.end()) + return it->second; + return nullptr; +} + +/// Return the type info for a given C++ type; on lookup failure can either throw or return nullptr. +PYBIND11_NOINLINE inline detail::type_info *get_type_info(const std::type_index &tp, + bool throw_if_missing = false) { + if (auto ltype = get_local_type_info(tp)) + return ltype; + if (auto gtype = get_global_type_info(tp)) + return gtype; + + if (throw_if_missing) { + std::string tname = tp.name(); + detail::clean_type_id(tname); + pybind11_fail("pybind11::detail::get_type_info: unable to find type info for \"" + tname + "\""); + } + return nullptr; +} + +PYBIND11_NOINLINE inline handle get_type_handle(const std::type_info &tp, bool throw_if_missing) { + detail::type_info *type_info = get_type_info(tp, throw_if_missing); + return handle(type_info ? ((PyObject *) type_info->type) : nullptr); +} + +struct value_and_holder { + instance *inst = nullptr; + size_t index = 0u; + const detail::type_info *type = nullptr; + void **vh = nullptr; + + // Main constructor for a found value/holder: + value_and_holder(instance *i, const detail::type_info *type, size_t vpos, size_t index) : + inst{i}, index{index}, type{type}, + vh{inst->simple_layout ? inst->simple_value_holder : &inst->nonsimple.values_and_holders[vpos]} + {} + + // Default constructor (used to signal a value-and-holder not found by get_value_and_holder()) + value_and_holder() {} + + // Used for past-the-end iterator + value_and_holder(size_t index) : index{index} {} + + template V *&value_ptr() const { + return reinterpret_cast(vh[0]); + } + // True if this `value_and_holder` has a non-null value pointer + explicit operator bool() const { return value_ptr(); } + + template H &holder() const { + return reinterpret_cast(vh[1]); + } + bool holder_constructed() const { + return inst->simple_layout + ? inst->simple_holder_constructed + : inst->nonsimple.status[index] & instance::status_holder_constructed; + } + void set_holder_constructed(bool v = true) { + if (inst->simple_layout) + inst->simple_holder_constructed = v; + else if (v) + inst->nonsimple.status[index] |= instance::status_holder_constructed; + else + inst->nonsimple.status[index] &= (uint8_t) ~instance::status_holder_constructed; + } + bool instance_registered() const { + return inst->simple_layout + ? 
inst->simple_instance_registered + : inst->nonsimple.status[index] & instance::status_instance_registered; + } + void set_instance_registered(bool v = true) { + if (inst->simple_layout) + inst->simple_instance_registered = v; + else if (v) + inst->nonsimple.status[index] |= instance::status_instance_registered; + else + inst->nonsimple.status[index] &= (uint8_t) ~instance::status_instance_registered; + } +}; + +// Container for accessing and iterating over an instance's values/holders +struct values_and_holders { +private: + instance *inst; + using type_vec = std::vector; + const type_vec &tinfo; + +public: + values_and_holders(instance *inst) : inst{inst}, tinfo(all_type_info(Py_TYPE(inst))) {} + + struct iterator { + private: + instance *inst = nullptr; + const type_vec *types = nullptr; + value_and_holder curr; + friend struct values_and_holders; + iterator(instance *inst, const type_vec *tinfo) + : inst{inst}, types{tinfo}, + curr(inst /* instance */, + types->empty() ? nullptr : (*types)[0] /* type info */, + 0, /* vpos: (non-simple types only): the first vptr comes first */ + 0 /* index */) + {} + // Past-the-end iterator: + iterator(size_t end) : curr(end) {} + public: + bool operator==(const iterator &other) { return curr.index == other.curr.index; } + bool operator!=(const iterator &other) { return curr.index != other.curr.index; } + iterator &operator++() { + if (!inst->simple_layout) + curr.vh += 1 + (*types)[curr.index]->holder_size_in_ptrs; + ++curr.index; + curr.type = curr.index < types->size() ? (*types)[curr.index] : nullptr; + return *this; + } + value_and_holder &operator*() { return curr; } + value_and_holder *operator->() { return &curr; } + }; + + iterator begin() { return iterator(inst, &tinfo); } + iterator end() { return iterator(tinfo.size()); } + + iterator find(const type_info *find_type) { + auto it = begin(), endit = end(); + while (it != endit && it->type != find_type) ++it; + return it; + } + + size_t size() { return tinfo.size(); } +}; + +/** + * Extracts C++ value and holder pointer references from an instance (which may contain multiple + * values/holders for python-side multiple inheritance) that match the given type. Throws an error + * if the given type (or ValueType, if omitted) is not a pybind11 base of the given instance. If + * `find_type` is omitted (or explicitly specified as nullptr) the first value/holder are returned, + * regardless of type (and the resulting .type will be nullptr). + * + * The returned object should be short-lived: in particular, it must not outlive the called-upon + * instance. 
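+ *
+ * Typical use (a sketch): `auto v_h = inst->get_value_and_holder(tinfo);`
+ * followed by `v_h.value_ptr<T>()` to reach the stored C++ object.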
+ */ +PYBIND11_NOINLINE inline value_and_holder instance::get_value_and_holder(const type_info *find_type /*= nullptr default in common.h*/, bool throw_if_missing /*= true in common.h*/) { + // Optimize common case: + if (!find_type || Py_TYPE(this) == find_type->type) + return value_and_holder(this, find_type, 0, 0); + + detail::values_and_holders vhs(this); + auto it = vhs.find(find_type); + if (it != vhs.end()) + return *it; + + if (!throw_if_missing) + return value_and_holder(); + +#if defined(NDEBUG) + pybind11_fail("pybind11::detail::instance::get_value_and_holder: " + "type is not a pybind11 base of the given instance " + "(compile in debug mode for type details)"); +#else + pybind11_fail("pybind11::detail::instance::get_value_and_holder: `" + + std::string(find_type->type->tp_name) + "' is not a pybind11 base of the given `" + + std::string(Py_TYPE(this)->tp_name) + "' instance"); +#endif +} + +PYBIND11_NOINLINE inline void instance::allocate_layout() { + auto &tinfo = all_type_info(Py_TYPE(this)); + + const size_t n_types = tinfo.size(); + + if (n_types == 0) + pybind11_fail("instance allocation failed: new instance has no pybind11-registered base types"); + + simple_layout = + n_types == 1 && tinfo.front()->holder_size_in_ptrs <= instance_simple_holder_in_ptrs(); + + // Simple path: no python-side multiple inheritance, and a small-enough holder + if (simple_layout) { + simple_value_holder[0] = nullptr; + simple_holder_constructed = false; + simple_instance_registered = false; + } + else { // multiple base types or a too-large holder + // Allocate space to hold: [v1*][h1][v2*][h2]...[bb...] where [vN*] is a value pointer, + // [hN] is the (uninitialized) holder instance for value N, and [bb...] is a set of bool + // values that tracks whether each associated holder has been initialized. Each [block] is + // padded, if necessary, to an integer multiple of sizeof(void *). + size_t space = 0; + for (auto t : tinfo) { + space += 1; // value pointer + space += t->holder_size_in_ptrs; // holder instance + } + size_t flags_at = space; + space += size_in_ptrs(n_types); // status bytes (holder_constructed and instance_registered) + + // Allocate space for flags, values, and holders, and initialize it to 0 (flags and values, + // in particular, need to be 0). Use Python's memory allocation functions: in Python 3.6 + // they default to using pymalloc, which is designed to be efficient for small allocations + // like the one we're doing here; in earlier versions (and for larger allocations) they are + // just wrappers around malloc. 
+#if PY_VERSION_HEX >= 0x03050000 + nonsimple.values_and_holders = (void **) PyMem_Calloc(space, sizeof(void *)); + if (!nonsimple.values_and_holders) throw std::bad_alloc(); +#else + nonsimple.values_and_holders = (void **) PyMem_New(void *, space); + if (!nonsimple.values_and_holders) throw std::bad_alloc(); + std::memset(nonsimple.values_and_holders, 0, space * sizeof(void *)); +#endif + nonsimple.status = reinterpret_cast(&nonsimple.values_and_holders[flags_at]); + } + owned = true; +} + +PYBIND11_NOINLINE inline void instance::deallocate_layout() { + if (!simple_layout) + PyMem_Free(nonsimple.values_and_holders); +} + +PYBIND11_NOINLINE inline bool isinstance_generic(handle obj, const std::type_info &tp) { + handle type = detail::get_type_handle(tp, false); + if (!type) + return false; + return isinstance(obj, type); +} + +PYBIND11_NOINLINE inline std::string error_string() { + if (!PyErr_Occurred()) { + PyErr_SetString(PyExc_RuntimeError, "Unknown internal error occurred"); + return "Unknown internal error occurred"; + } + + error_scope scope; // Preserve error state + + std::string errorString; + if (scope.type) { + errorString += handle(scope.type).attr("__name__").cast(); + errorString += ": "; + } + if (scope.value) + errorString += (std::string) str(scope.value); + + PyErr_NormalizeException(&scope.type, &scope.value, &scope.trace); + +#if PY_MAJOR_VERSION >= 3 + if (scope.trace != nullptr) + PyException_SetTraceback(scope.value, scope.trace); +#endif + +#if !defined(PYPY_VERSION) + if (scope.trace) { + PyTracebackObject *trace = (PyTracebackObject *) scope.trace; + + /* Get the deepest trace possible */ + while (trace->tb_next) + trace = trace->tb_next; + + PyFrameObject *frame = trace->tb_frame; + errorString += "\n\nAt:\n"; + while (frame) { + int lineno = PyFrame_GetLineNumber(frame); + errorString += + " " + handle(frame->f_code->co_filename).cast() + + "(" + std::to_string(lineno) + "): " + + handle(frame->f_code->co_name).cast() + "\n"; + frame = frame->f_back; + } + } +#endif + + return errorString; +} + +PYBIND11_NOINLINE inline handle get_object_handle(const void *ptr, const detail::type_info *type ) { + auto &instances = get_internals().registered_instances; + auto range = instances.equal_range(ptr); + for (auto it = range.first; it != range.second; ++it) { + for (auto vh : values_and_holders(it->second)) { + if (vh.type == type) + return handle((PyObject *) it->second); + } + } + return handle(); +} + +inline PyThreadState *get_thread_state_unchecked() { +#if defined(PYPY_VERSION) + return PyThreadState_GET(); +#elif PY_VERSION_HEX < 0x03000000 + return _PyThreadState_Current; +#elif PY_VERSION_HEX < 0x03050000 + return (PyThreadState*) _Py_atomic_load_relaxed(&_PyThreadState_Current); +#elif PY_VERSION_HEX < 0x03050200 + return (PyThreadState*) _PyThreadState_Current.value; +#else + return _PyThreadState_UncheckedGet(); +#endif +} + +// Forward declarations +inline void keep_alive_impl(handle nurse, handle patient); +inline PyObject *make_new_instance(PyTypeObject *type); + +class type_caster_generic { +public: + PYBIND11_NOINLINE type_caster_generic(const std::type_info &type_info) + : typeinfo(get_type_info(type_info)), cpptype(&type_info) { } + + type_caster_generic(const type_info *typeinfo) + : typeinfo(typeinfo), cpptype(typeinfo ? 
typeinfo->cpptype : nullptr) { } + + bool load(handle src, bool convert) { + return load_impl(src, convert); + } + + PYBIND11_NOINLINE static handle cast(const void *_src, return_value_policy policy, handle parent, + const detail::type_info *tinfo, + void *(*copy_constructor)(const void *), + void *(*move_constructor)(const void *), + const void *existing_holder = nullptr) { + if (!tinfo) // no type info: error will be set already + return handle(); + + void *src = const_cast(_src); + if (src == nullptr) + return none().release(); + + auto it_instances = get_internals().registered_instances.equal_range(src); + for (auto it_i = it_instances.first; it_i != it_instances.second; ++it_i) { + for (auto instance_type : detail::all_type_info(Py_TYPE(it_i->second))) { + if (instance_type && same_type(*instance_type->cpptype, *tinfo->cpptype)) + return handle((PyObject *) it_i->second).inc_ref(); + } + } + + auto inst = reinterpret_steal(make_new_instance(tinfo->type)); + auto wrapper = reinterpret_cast(inst.ptr()); + wrapper->owned = false; + void *&valueptr = values_and_holders(wrapper).begin()->value_ptr(); + + switch (policy) { + case return_value_policy::automatic: + case return_value_policy::take_ownership: + valueptr = src; + wrapper->owned = true; + break; + + case return_value_policy::automatic_reference: + case return_value_policy::reference: + valueptr = src; + wrapper->owned = false; + break; + + case return_value_policy::copy: + if (copy_constructor) + valueptr = copy_constructor(src); + else + throw cast_error("return_value_policy = copy, but the " + "object is non-copyable!"); + wrapper->owned = true; + break; + + case return_value_policy::move: + if (move_constructor) + valueptr = move_constructor(src); + else if (copy_constructor) + valueptr = copy_constructor(src); + else + throw cast_error("return_value_policy = move, but the " + "object is neither movable nor copyable!"); + wrapper->owned = true; + break; + + case return_value_policy::reference_internal: + valueptr = src; + wrapper->owned = false; + keep_alive_impl(inst, parent); + break; + + default: + throw cast_error("unhandled return_value_policy: should not happen!"); + } + + tinfo->init_instance(wrapper, existing_holder); + + return inst.release(); + } + + // Base methods for generic caster; there are overridden in copyable_holder_caster + void load_value(value_and_holder &&v_h) { + auto *&vptr = v_h.value_ptr(); + // Lazy allocation for unallocated values: + if (vptr == nullptr) { + auto *type = v_h.type ? 
v_h.type : typeinfo; + if (type->operator_new) { + vptr = type->operator_new(type->type_size); + } else { + #if defined(PYBIND11_CPP17) + if (type->type_align > __STDCPP_DEFAULT_NEW_ALIGNMENT__) + vptr = ::operator new(type->type_size, + (std::align_val_t) type->type_align); + else + #endif + vptr = ::operator new(type->type_size); + } + } + value = vptr; + } + bool try_implicit_casts(handle src, bool convert) { + for (auto &cast : typeinfo->implicit_casts) { + type_caster_generic sub_caster(*cast.first); + if (sub_caster.load(src, convert)) { + value = cast.second(sub_caster.value); + return true; + } + } + return false; + } + bool try_direct_conversions(handle src) { + for (auto &converter : *typeinfo->direct_conversions) { + if (converter(src.ptr(), value)) + return true; + } + return false; + } + void check_holder_compat() {} + + PYBIND11_NOINLINE static void *local_load(PyObject *src, const type_info *ti) { + auto caster = type_caster_generic(ti); + if (caster.load(src, false)) + return caster.value; + return nullptr; + } + + /// Try to load with foreign typeinfo, if available. Used when there is no + /// native typeinfo, or when the native one wasn't able to produce a value. + PYBIND11_NOINLINE bool try_load_foreign_module_local(handle src) { + constexpr auto *local_key = PYBIND11_MODULE_LOCAL_ID; + const auto pytype = src.get_type(); + if (!hasattr(pytype, local_key)) + return false; + + type_info *foreign_typeinfo = reinterpret_borrow(getattr(pytype, local_key)); + // Only consider this foreign loader if actually foreign and is a loader of the correct cpp type + if (foreign_typeinfo->module_local_load == &local_load + || (cpptype && !same_type(*cpptype, *foreign_typeinfo->cpptype))) + return false; + + if (auto result = foreign_typeinfo->module_local_load(src.ptr(), foreign_typeinfo)) { + value = result; + return true; + } + return false; + } + + // Implementation of `load`; this takes the type of `this` so that it can dispatch the relevant + // bits of code between here and copyable_holder_caster where the two classes need different + // logic (without having to resort to virtual inheritance). + template + PYBIND11_NOINLINE bool load_impl(handle src, bool convert) { + if (!src) return false; + if (!typeinfo) return try_load_foreign_module_local(src); + if (src.is_none()) { + // Defer accepting None to other overloads (if we aren't in convert mode): + if (!convert) return false; + value = nullptr; + return true; + } + + auto &this_ = static_cast(*this); + this_.check_holder_compat(); + + PyTypeObject *srctype = Py_TYPE(src.ptr()); + + // Case 1: If src is an exact type match for the target type then we can reinterpret_cast + // the instance's value pointer to the target type: + if (srctype == typeinfo->type) { + this_.load_value(reinterpret_cast(src.ptr())->get_value_and_holder()); + return true; + } + // Case 2: We have a derived class + else if (PyType_IsSubtype(srctype, typeinfo->type)) { + auto &bases = all_type_info(srctype); + bool no_cpp_mi = typeinfo->simple_type; + + // Case 2a: the python type is a Python-inherited derived class that inherits from just + // one simple (no MI) pybind11 class, or is an exact match, so the C++ instance is of + // the right type and we can use reinterpret_cast. 
+ // (This is essentially the same as case 2b, but because not using multiple inheritance + // is extremely common, we handle it specially to avoid the loop iterator and type + // pointer lookup overhead) + if (bases.size() == 1 && (no_cpp_mi || bases.front()->type == typeinfo->type)) { + this_.load_value(reinterpret_cast(src.ptr())->get_value_and_holder()); + return true; + } + // Case 2b: the python type inherits from multiple C++ bases. Check the bases to see if + // we can find an exact match (or, for a simple C++ type, an inherited match); if so, we + // can safely reinterpret_cast to the relevant pointer. + else if (bases.size() > 1) { + for (auto base : bases) { + if (no_cpp_mi ? PyType_IsSubtype(base->type, typeinfo->type) : base->type == typeinfo->type) { + this_.load_value(reinterpret_cast(src.ptr())->get_value_and_holder(base)); + return true; + } + } + } + + // Case 2c: C++ multiple inheritance is involved and we couldn't find an exact type match + // in the registered bases, above, so try implicit casting (needed for proper C++ casting + // when MI is involved). + if (this_.try_implicit_casts(src, convert)) + return true; + } + + // Perform an implicit conversion + if (convert) { + for (auto &converter : typeinfo->implicit_conversions) { + auto temp = reinterpret_steal(converter(src.ptr(), typeinfo->type)); + if (load_impl(temp, false)) { + loader_life_support::add_patient(temp); + return true; + } + } + if (this_.try_direct_conversions(src)) + return true; + } + + // Failed to match local typeinfo. Try again with global. + if (typeinfo->module_local) { + if (auto gtype = get_global_type_info(*typeinfo->cpptype)) { + typeinfo = gtype; + return load(src, false); + } + } + + // Global typeinfo has precedence over foreign module_local + return try_load_foreign_module_local(src); + } + + + // Called to do type lookup and wrap the pointer and type in a pair when a dynamic_cast + // isn't needed or can't be used. If the type is unknown, sets the error and returns a pair + // with .second = nullptr. (p.first = nullptr is not an error: it becomes None). + PYBIND11_NOINLINE static std::pair src_and_type( + const void *src, const std::type_info &cast_type, const std::type_info *rtti_type = nullptr) { + if (auto *tpi = get_type_info(cast_type)) + return {src, const_cast(tpi)}; + + // Not found, set error: + std::string tname = rtti_type ? rtti_type->name() : cast_type.name(); + detail::clean_type_id(tname); + std::string msg = "Unregistered type : " + tname; + PyErr_SetString(PyExc_TypeError, msg.c_str()); + return {nullptr, nullptr}; + } + + const type_info *typeinfo = nullptr; + const std::type_info *cpptype = nullptr; + void *value = nullptr; +}; + +/** + * Determine suitable casting operator for pointer-or-lvalue-casting type casters. The type caster + * needs to provide `operator T*()` and `operator T&()` operators. + * + * If the type supports moving the value away via an `operator T&&() &&` method, it should use + * `movable_cast_op_type` instead. + */ +template +using cast_op_type = + conditional_t>::value, + typename std::add_pointer>::type, + typename std::add_lvalue_reference>::type>; + +/** + * Determine suitable casting operator for a type caster with a movable value. Such a type caster + * needs to provide `operator T*()`, `operator T&()`, and `operator T&&() &&`. The latter will be + * called in appropriate contexts where the value can be moved rather than copied. + * + * These operator are automatically provided when using the PYBIND11_TYPE_CASTER macro. 
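+ *
+ * As an illustrative note (assuming some movable bound type `T`): `movable_cast_op_type<T&&>`
+ * resolves to `T&&`, `movable_cast_op_type<T*>` to `T*`, and everything else
+ * (`T`, `T&`, `const T&`) to `T&`.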
+ */ +template +using movable_cast_op_type = + conditional_t::type>::value, + typename std::add_pointer>::type, + conditional_t::value, + typename std::add_rvalue_reference>::type, + typename std::add_lvalue_reference>::type>>; + +// std::is_copy_constructible isn't quite enough: it lets std::vector (and similar) through when +// T is non-copyable, but code containing such a copy constructor fails to actually compile. +template struct is_copy_constructible : std::is_copy_constructible {}; + +// Specialization for types that appear to be copy constructible but also look like stl containers +// (we specifically check for: has `value_type` and `reference` with `reference = value_type&`): if +// so, copy constructability depends on whether the value_type is copy constructible. +template struct is_copy_constructible, + std::is_same + >::value>> : is_copy_constructible {}; + +#if !defined(PYBIND11_CPP17) +// Likewise for std::pair before C++17 (which mandates that the copy constructor not exist when the +// two types aren't themselves copy constructible). +template struct is_copy_constructible> + : all_of, is_copy_constructible> {}; +#endif + +NAMESPACE_END(detail) + +// polymorphic_type_hook::get(src, tinfo) determines whether the object pointed +// to by `src` actually is an instance of some class derived from `itype`. +// If so, it sets `tinfo` to point to the std::type_info representing that derived +// type, and returns a pointer to the start of the most-derived object of that type +// (in which `src` is a subobject; this will be the same address as `src` in most +// single inheritance cases). If not, or if `src` is nullptr, it simply returns `src` +// and leaves `tinfo` at its default value of nullptr. +// +// The default polymorphic_type_hook just returns src. A specialization for polymorphic +// types determines the runtime type of the passed object and adjusts the this-pointer +// appropriately via dynamic_cast. This is what enables a C++ Animal* to appear +// to Python as a Dog (if Dog inherits from Animal, Animal is polymorphic, Dog is +// registered with pybind11, and this Animal is in fact a Dog). +// +// You may specialize polymorphic_type_hook yourself for types that want to appear +// polymorphic to Python but do not use C++ RTTI. (This is a not uncommon pattern +// in performance-sensitive applications, used most notably in LLVM.) +template +struct polymorphic_type_hook +{ + static const void *get(const itype *src, const std::type_info*&) { return src; } +}; +template +struct polymorphic_type_hook::value>> +{ + static const void *get(const itype *src, const std::type_info*& type) { + type = src ? 
&typeid(*src) : nullptr; + return dynamic_cast(src); + } +}; + +NAMESPACE_BEGIN(detail) + +/// Generic type caster for objects stored on the heap +template class type_caster_base : public type_caster_generic { + using itype = intrinsic_t; + +public: + static constexpr auto name = _(); + + type_caster_base() : type_caster_base(typeid(type)) { } + explicit type_caster_base(const std::type_info &info) : type_caster_generic(info) { } + + static handle cast(const itype &src, return_value_policy policy, handle parent) { + if (policy == return_value_policy::automatic || policy == return_value_policy::automatic_reference) + policy = return_value_policy::copy; + return cast(&src, policy, parent); + } + + static handle cast(itype &&src, return_value_policy, handle parent) { + return cast(&src, return_value_policy::move, parent); + } + + // Returns a (pointer, type_info) pair taking care of necessary type lookup for a + // polymorphic type (using RTTI by default, but can be overridden by specializing + // polymorphic_type_hook). If the instance isn't derived, returns the base version. + static std::pair src_and_type(const itype *src) { + auto &cast_type = typeid(itype); + const std::type_info *instance_type = nullptr; + const void *vsrc = polymorphic_type_hook::get(src, instance_type); + if (instance_type && !same_type(cast_type, *instance_type)) { + // This is a base pointer to a derived type. If the derived type is registered + // with pybind11, we want to make the full derived object available. + // In the typical case where itype is polymorphic, we get the correct + // derived pointer (which may be != base pointer) by a dynamic_cast to + // most derived type. If itype is not polymorphic, we won't get here + // except via a user-provided specialization of polymorphic_type_hook, + // and the user has promised that no this-pointer adjustment is + // required in that case, so it's OK to use static_cast. + if (const auto *tpi = get_type_info(*instance_type)) + return {vsrc, tpi}; + } + // Otherwise we have either a nullptr, an `itype` pointer, or an unknown derived pointer, so + // don't do a cast + return type_caster_generic::src_and_type(src, cast_type, instance_type); + } + + static handle cast(const itype *src, return_value_policy policy, handle parent) { + auto st = src_and_type(src); + return type_caster_generic::cast( + st.first, policy, parent, st.second, + make_copy_constructor(src), make_move_constructor(src)); + } + + static handle cast_holder(const itype *src, const void *holder) { + auto st = src_and_type(src); + return type_caster_generic::cast( + st.first, return_value_policy::take_ownership, {}, st.second, + nullptr, nullptr, holder); + } + + template using cast_op_type = detail::cast_op_type; + + operator itype*() { return (type *) value; } + operator itype&() { if (!value) throw reference_cast_error(); return *((itype *) value); } + +protected: + using Constructor = void *(*)(const void *); + + /* Only enabled when the types are {copy,move}-constructible *and* when the type + does not have a private operator new implementation. 
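+       (Concretely: the `decltype(new T(*x), Constructor{})` return type below SFINAEs the
+       overload away when `T` cannot be copy- or move-constructed, or has no accessible
+       `operator new`, leaving the variadic fallbacks that return a null Constructor.)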
*/ + template ::value>> + static auto make_copy_constructor(const T *x) -> decltype(new T(*x), Constructor{}) { + return [](const void *arg) -> void * { + return new T(*reinterpret_cast(arg)); + }; + } + + template ::value>> + static auto make_move_constructor(const T *x) -> decltype(new T(std::move(*const_cast(x))), Constructor{}) { + return [](const void *arg) -> void * { + return new T(std::move(*const_cast(reinterpret_cast(arg)))); + }; + } + + static Constructor make_copy_constructor(...) { return nullptr; } + static Constructor make_move_constructor(...) { return nullptr; } +}; + +template class type_caster : public type_caster_base { }; +template using make_caster = type_caster>; + +// Shortcut for calling a caster's `cast_op_type` cast operator for casting a type_caster to a T +template typename make_caster::template cast_op_type cast_op(make_caster &caster) { + return caster.operator typename make_caster::template cast_op_type(); +} +template typename make_caster::template cast_op_type::type> +cast_op(make_caster &&caster) { + return std::move(caster).operator + typename make_caster::template cast_op_type::type>(); +} + +template class type_caster> { +private: + using caster_t = make_caster; + caster_t subcaster; + using subcaster_cast_op_type = typename caster_t::template cast_op_type; + static_assert(std::is_same::type &, subcaster_cast_op_type>::value, + "std::reference_wrapper caster requires T to have a caster with an `T &` operator"); +public: + bool load(handle src, bool convert) { return subcaster.load(src, convert); } + static constexpr auto name = caster_t::name; + static handle cast(const std::reference_wrapper &src, return_value_policy policy, handle parent) { + // It is definitely wrong to take ownership of this pointer, so mask that rvp + if (policy == return_value_policy::take_ownership || policy == return_value_policy::automatic) + policy = return_value_policy::automatic_reference; + return caster_t::cast(&src.get(), policy, parent); + } + template using cast_op_type = std::reference_wrapper; + operator std::reference_wrapper() { return subcaster.operator subcaster_cast_op_type&(); } +}; + +#define PYBIND11_TYPE_CASTER(type, py_name) \ + protected: \ + type value; \ + public: \ + static constexpr auto name = py_name; \ + template >::value, int> = 0> \ + static handle cast(T_ *src, return_value_policy policy, handle parent) { \ + if (!src) return none().release(); \ + if (policy == return_value_policy::take_ownership) { \ + auto h = cast(std::move(*src), policy, parent); delete src; return h; \ + } else { \ + return cast(*src, policy, parent); \ + } \ + } \ + operator type*() { return &value; } \ + operator type&() { return value; } \ + operator type&&() && { return std::move(value); } \ + template using cast_op_type = pybind11::detail::movable_cast_op_type + + +template using is_std_char_type = any_of< + std::is_same, /* std::string */ + std::is_same, /* std::u16string */ + std::is_same, /* std::u32string */ + std::is_same /* std::wstring */ +>; + +template +struct type_caster::value && !is_std_char_type::value>> { + using _py_type_0 = conditional_t; + using _py_type_1 = conditional_t::value, _py_type_0, typename std::make_unsigned<_py_type_0>::type>; + using py_type = conditional_t::value, double, _py_type_1>; +public: + + bool load(handle src, bool convert) { + py_type py_value; + + if (!src) + return false; + + if (std::is_floating_point::value) { + if (convert || PyFloat_Check(src.ptr())) + py_value = (py_type) PyFloat_AsDouble(src.ptr()); + else + return 
false; + } else if (PyFloat_Check(src.ptr())) { + return false; + } else if (std::is_unsigned::value) { + py_value = as_unsigned(src.ptr()); + } else { // signed integer: + py_value = sizeof(T) <= sizeof(long) + ? (py_type) PyLong_AsLong(src.ptr()) + : (py_type) PYBIND11_LONG_AS_LONGLONG(src.ptr()); + } + + bool py_err = py_value == (py_type) -1 && PyErr_Occurred(); + if (py_err || (std::is_integral::value && sizeof(py_type) != sizeof(T) && + (py_value < (py_type) std::numeric_limits::min() || + py_value > (py_type) std::numeric_limits::max()))) { + bool type_error = py_err && PyErr_ExceptionMatches( +#if PY_VERSION_HEX < 0x03000000 && !defined(PYPY_VERSION) + PyExc_SystemError +#else + PyExc_TypeError +#endif + ); + PyErr_Clear(); + if (type_error && convert && PyNumber_Check(src.ptr())) { + auto tmp = reinterpret_steal(std::is_floating_point::value + ? PyNumber_Float(src.ptr()) + : PyNumber_Long(src.ptr())); + PyErr_Clear(); + return load(tmp, false); + } + return false; + } + + value = (T) py_value; + return true; + } + + template + static typename std::enable_if::value, handle>::type + cast(U src, return_value_policy /* policy */, handle /* parent */) { + return PyFloat_FromDouble((double) src); + } + + template + static typename std::enable_if::value && std::is_signed::value && (sizeof(U) <= sizeof(long)), handle>::type + cast(U src, return_value_policy /* policy */, handle /* parent */) { + return PYBIND11_LONG_FROM_SIGNED((long) src); + } + + template + static typename std::enable_if::value && std::is_unsigned::value && (sizeof(U) <= sizeof(unsigned long)), handle>::type + cast(U src, return_value_policy /* policy */, handle /* parent */) { + return PYBIND11_LONG_FROM_UNSIGNED((unsigned long) src); + } + + template + static typename std::enable_if::value && std::is_signed::value && (sizeof(U) > sizeof(long)), handle>::type + cast(U src, return_value_policy /* policy */, handle /* parent */) { + return PyLong_FromLongLong((long long) src); + } + + template + static typename std::enable_if::value && std::is_unsigned::value && (sizeof(U) > sizeof(unsigned long)), handle>::type + cast(U src, return_value_policy /* policy */, handle /* parent */) { + return PyLong_FromUnsignedLongLong((unsigned long long) src); + } + + PYBIND11_TYPE_CASTER(T, _::value>("int", "float")); +}; + +template struct void_caster { +public: + bool load(handle src, bool) { + if (src && src.is_none()) + return true; + return false; + } + static handle cast(T, return_value_policy /* policy */, handle /* parent */) { + return none().inc_ref(); + } + PYBIND11_TYPE_CASTER(T, _("None")); +}; + +template <> class type_caster : public void_caster {}; + +template <> class type_caster : public type_caster { +public: + using type_caster::cast; + + bool load(handle h, bool) { + if (!h) { + return false; + } else if (h.is_none()) { + value = nullptr; + return true; + } + + /* Check if this is a capsule */ + if (isinstance(h)) { + value = reinterpret_borrow(h); + return true; + } + + /* Check if this is a C++ type */ + auto &bases = all_type_info((PyTypeObject *) h.get_type().ptr()); + if (bases.size() == 1) { // Only allowing loading from a single-value type + value = values_and_holders(reinterpret_cast(h.ptr())).begin()->value_ptr(); + return true; + } + + /* Fail */ + return false; + } + + static handle cast(const void *ptr, return_value_policy /* policy */, handle /* parent */) { + if (ptr) + return capsule(ptr).release(); + else + return none().inc_ref(); + } + + template using cast_op_type = void*&; + operator void *&() { 
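+        // cast_op_type is `void *&`: expose the pointer loaded above by reference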
return value; } + static constexpr auto name = _("capsule"); +private: + void *value = nullptr; +}; + +template <> class type_caster : public void_caster { }; + +template <> class type_caster { +public: + bool load(handle src, bool convert) { + if (!src) return false; + else if (src.ptr() == Py_True) { value = true; return true; } + else if (src.ptr() == Py_False) { value = false; return true; } + else if (convert || !strcmp("numpy.bool_", Py_TYPE(src.ptr())->tp_name)) { + // (allow non-implicit conversion for numpy booleans) + + Py_ssize_t res = -1; + if (src.is_none()) { + res = 0; // None is implicitly converted to False + } + #if defined(PYPY_VERSION) + // On PyPy, check that "__bool__" (or "__nonzero__" on Python 2.7) attr exists + else if (hasattr(src, PYBIND11_BOOL_ATTR)) { + res = PyObject_IsTrue(src.ptr()); + } + #else + // Alternate approach for CPython: this does the same as the above, but optimized + // using the CPython API so as to avoid an unneeded attribute lookup. + else if (auto tp_as_number = src.ptr()->ob_type->tp_as_number) { + if (PYBIND11_NB_BOOL(tp_as_number)) { + res = (*PYBIND11_NB_BOOL(tp_as_number))(src.ptr()); + } + } + #endif + if (res == 0 || res == 1) { + value = (bool) res; + return true; + } + } + return false; + } + static handle cast(bool src, return_value_policy /* policy */, handle /* parent */) { + return handle(src ? Py_True : Py_False).inc_ref(); + } + PYBIND11_TYPE_CASTER(bool, _("bool")); +}; + +// Helper class for UTF-{8,16,32} C++ stl strings: +template struct string_caster { + using CharT = typename StringType::value_type; + + // Simplify life by being able to assume standard char sizes (the standard only guarantees + // minimums, but Python requires exact sizes) + static_assert(!std::is_same::value || sizeof(CharT) == 1, "Unsupported char size != 1"); + static_assert(!std::is_same::value || sizeof(CharT) == 2, "Unsupported char16_t size != 2"); + static_assert(!std::is_same::value || sizeof(CharT) == 4, "Unsupported char32_t size != 4"); + // wchar_t can be either 16 bits (Windows) or 32 (everywhere else) + static_assert(!std::is_same::value || sizeof(CharT) == 2 || sizeof(CharT) == 4, + "Unsupported wchar_t size != 2/4"); + static constexpr size_t UTF_N = 8 * sizeof(CharT); + + bool load(handle src, bool) { +#if PY_MAJOR_VERSION < 3 + object temp; +#endif + handle load_src = src; + if (!src) { + return false; + } else if (!PyUnicode_Check(load_src.ptr())) { +#if PY_MAJOR_VERSION >= 3 + return load_bytes(load_src); +#else + if (sizeof(CharT) == 1) { + return load_bytes(load_src); + } + + // The below is a guaranteed failure in Python 3 when PyUnicode_Check returns false + if (!PYBIND11_BYTES_CHECK(load_src.ptr())) + return false; + + temp = reinterpret_steal(PyUnicode_FromObject(load_src.ptr())); + if (!temp) { PyErr_Clear(); return false; } + load_src = temp; +#endif + } + + object utfNbytes = reinterpret_steal(PyUnicode_AsEncodedString( + load_src.ptr(), UTF_N == 8 ? "utf-8" : UTF_N == 16 ? 
"utf-16" : "utf-32", nullptr)); + if (!utfNbytes) { PyErr_Clear(); return false; } + + const CharT *buffer = reinterpret_cast(PYBIND11_BYTES_AS_STRING(utfNbytes.ptr())); + size_t length = (size_t) PYBIND11_BYTES_SIZE(utfNbytes.ptr()) / sizeof(CharT); + if (UTF_N > 8) { buffer++; length--; } // Skip BOM for UTF-16/32 + value = StringType(buffer, length); + + // If we're loading a string_view we need to keep the encoded Python object alive: + if (IsView) + loader_life_support::add_patient(utfNbytes); + + return true; + } + + static handle cast(const StringType &src, return_value_policy /* policy */, handle /* parent */) { + const char *buffer = reinterpret_cast(src.data()); + ssize_t nbytes = ssize_t(src.size() * sizeof(CharT)); + handle s = decode_utfN(buffer, nbytes); + if (!s) throw error_already_set(); + return s; + } + + PYBIND11_TYPE_CASTER(StringType, _(PYBIND11_STRING_NAME)); + +private: + static handle decode_utfN(const char *buffer, ssize_t nbytes) { +#if !defined(PYPY_VERSION) + return + UTF_N == 8 ? PyUnicode_DecodeUTF8(buffer, nbytes, nullptr) : + UTF_N == 16 ? PyUnicode_DecodeUTF16(buffer, nbytes, nullptr, nullptr) : + PyUnicode_DecodeUTF32(buffer, nbytes, nullptr, nullptr); +#else + // PyPy seems to have multiple problems related to PyUnicode_UTF*: the UTF8 version + // sometimes segfaults for unknown reasons, while the UTF16 and 32 versions require a + // non-const char * arguments, which is also a nuisance, so bypass the whole thing by just + // passing the encoding as a string value, which works properly: + return PyUnicode_Decode(buffer, nbytes, UTF_N == 8 ? "utf-8" : UTF_N == 16 ? "utf-16" : "utf-32", nullptr); +#endif + } + + // When loading into a std::string or char*, accept a bytes object as-is (i.e. + // without any encoding/decoding attempt). For other C++ char sizes this is a no-op. + // which supports loading a unicode from a str, doesn't take this path. + template + bool load_bytes(enable_if_t src) { + if (PYBIND11_BYTES_CHECK(src.ptr())) { + // We were passed a Python 3 raw bytes; accept it into a std::string or char* + // without any encoding attempt. + const char *bytes = PYBIND11_BYTES_AS_STRING(src.ptr()); + if (bytes) { + value = StringType(bytes, (size_t) PYBIND11_BYTES_SIZE(src.ptr())); + return true; + } + } + + return false; + } + + template + bool load_bytes(enable_if_t) { return false; } +}; + +template +struct type_caster, enable_if_t::value>> + : string_caster> {}; + +#ifdef PYBIND11_HAS_STRING_VIEW +template +struct type_caster, enable_if_t::value>> + : string_caster, true> {}; +#endif + +// Type caster for C-style strings. We basically use a std::string type caster, but also add the +// ability to use None as a nullptr char* (which the string caster doesn't allow). 
+template struct type_caster::value>> {
+    using StringType = std::basic_string;
+    using StringCaster = type_caster;
+    StringCaster str_caster;
+    bool none = false;
+    CharT one_char = 0;
+public:
+    bool load(handle src, bool convert) {
+        if (!src) return false;
+        if (src.is_none()) {
+            // Defer accepting None to other overloads (if we aren't in convert mode):
+            if (!convert) return false;
+            none = true;
+            return true;
+        }
+        return str_caster.load(src, convert);
+    }
+
+    static handle cast(const CharT *src, return_value_policy policy, handle parent) {
+        if (src == nullptr) return pybind11::none().inc_ref();
+        return StringCaster::cast(StringType(src), policy, parent);
+    }
+
+    static handle cast(CharT src, return_value_policy policy, handle parent) {
+        if (std::is_same::value) {
+            handle s = PyUnicode_DecodeLatin1((const char *) &src, 1, nullptr);
+            if (!s) throw error_already_set();
+            return s;
+        }
+        return StringCaster::cast(StringType(1, src), policy, parent);
+    }
+
+    operator CharT*() { return none ? nullptr : const_cast(static_cast(str_caster).c_str()); }
+    operator CharT&() {
+        if (none)
+            throw value_error("Cannot convert None to a character");
+
+        auto &value = static_cast(str_caster);
+        size_t str_len = value.size();
+        if (str_len == 0)
+            throw value_error("Cannot convert empty string to a character");
+
+        // If we're in UTF-8 mode, we have two possible failures: one for a unicode character that
+        // is too high, and one for multiple unicode characters (caught later), so we need to figure
+        // out how long the first encoded character is in bytes to distinguish between these two
+        // errors. We also want to allow unicode characters U+0080 through U+00FF, as those
+        // can fit into a single char value.
+        if (StringCaster::UTF_N == 8 && str_len > 1 && str_len <= 4) {
+            unsigned char v0 = static_cast(value[0]);
+            size_t char0_bytes = !(v0 & 0x80) ? 1 : // low bits only: 0-127
+                (v0 & 0xE0) == 0xC0 ? 2 : // 0b110xxxxx - start of 2-byte sequence
+                (v0 & 0xF0) == 0xE0 ? 3 : // 0b1110xxxx - start of 3-byte sequence
+                4; // 0b11110xxx - start of 4-byte sequence
+
+            if (char0_bytes == str_len) {
+                // If we have a 128-255 value, we can decode it into a single char:
+                if (char0_bytes == 2 && (v0 & 0xFC) == 0xC0) { // 0x110000xx 0x10xxxxxx
+                    one_char = static_cast(((v0 & 3) << 6) + (static_cast(value[1]) & 0x3F));
+                    return one_char;
+                }
+                // Otherwise we have a single character, but it's > U+00FF
+                throw value_error("Character code point not in range(0x100)");
+            }
+        }
+
+        // UTF-16 is much easier: we can only have a surrogate pair for values above U+FFFF, thus a
+        // surrogate pair with total length 2 instantly indicates a range error (but not a "your
+        // string was too long" error).
+        else if (StringCaster::UTF_N == 16 && str_len == 2) {
+            one_char = static_cast(value[0]);
+            if (one_char >= 0xD800 && one_char < 0xE000)
+                throw value_error("Character code point not in range(0x10000)");
+        }
+
+        if (str_len != 1)
+            throw value_error("Expected a character, but multi-character string found");
+
+        one_char = value[0];
+        return one_char;
+    }
+
+    static constexpr auto name = _(PYBIND11_STRING_NAME);
+    template using cast_op_type = pybind11::detail::cast_op_type<_T>;
+};
+
+// Base implementation for std::tuple and std::pair
+template class Tuple, typename...
Ts> class tuple_caster { + using type = Tuple; + static constexpr auto size = sizeof...(Ts); + using indices = make_index_sequence; +public: + + bool load(handle src, bool convert) { + if (!isinstance(src)) + return false; + const auto seq = reinterpret_borrow(src); + if (seq.size() != size) + return false; + return load_impl(seq, convert, indices{}); + } + + template + static handle cast(T &&src, return_value_policy policy, handle parent) { + return cast_impl(std::forward(src), policy, parent, indices{}); + } + + static constexpr auto name = _("Tuple[") + concat(make_caster::name...) + _("]"); + + template using cast_op_type = type; + + operator type() & { return implicit_cast(indices{}); } + operator type() && { return std::move(*this).implicit_cast(indices{}); } + +protected: + template + type implicit_cast(index_sequence) & { return type(cast_op(std::get(subcasters))...); } + template + type implicit_cast(index_sequence) && { return type(cast_op(std::move(std::get(subcasters)))...); } + + static constexpr bool load_impl(const sequence &, bool, index_sequence<>) { return true; } + + template + bool load_impl(const sequence &seq, bool convert, index_sequence) { + for (bool r : {std::get(subcasters).load(seq[Is], convert)...}) + if (!r) + return false; + return true; + } + + /* Implementation: Convert a C++ tuple into a Python tuple */ + template + static handle cast_impl(T &&src, return_value_policy policy, handle parent, index_sequence) { + std::array entries{{ + reinterpret_steal(make_caster::cast(std::get(std::forward(src)), policy, parent))... + }}; + for (const auto &entry: entries) + if (!entry) + return handle(); + tuple result(size); + int counter = 0; + for (auto & entry: entries) + PyTuple_SET_ITEM(result.ptr(), counter++, entry.release().ptr()); + return result.release(); + } + + Tuple...> subcasters; +}; + +template class type_caster> + : public tuple_caster {}; + +template class type_caster> + : public tuple_caster {}; + +/// Helper class which abstracts away certain actions. Users can provide specializations for +/// custom holders, but it's only necessary if the type has a non-standard interface. +template +struct holder_helper { + static auto get(const T &p) -> decltype(p.get()) { return p.get(); } +}; + +/// Type caster for holder types like std::shared_ptr, etc. 
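+// (Illustrative sketch, not part of the sources: a holder with a non-standard accessor only
+// needs a holder_helper specialization, e.g. for a hypothetical `MyHolder<T>` exposing `raw()`:
+//
+//     template <typename T> struct holder_helper<MyHolder<T>> {
+//         static const T *get(const MyHolder<T> &h) { return h.raw(); }
+//     };
+//
+// used together with PYBIND11_DECLARE_HOLDER_TYPE(T, MyHolder<T>).)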
+template +struct copyable_holder_caster : public type_caster_base { +public: + using base = type_caster_base; + static_assert(std::is_base_of>::value, + "Holder classes are only supported for custom types"); + using base::base; + using base::cast; + using base::typeinfo; + using base::value; + + bool load(handle src, bool convert) { + return base::template load_impl>(src, convert); + } + + explicit operator type*() { return this->value; } + explicit operator type&() { return *(this->value); } + explicit operator holder_type*() { return std::addressof(holder); } + + // Workaround for Intel compiler bug + // see pybind11 issue 94 + #if defined(__ICC) || defined(__INTEL_COMPILER) + operator holder_type&() { return holder; } + #else + explicit operator holder_type&() { return holder; } + #endif + + static handle cast(const holder_type &src, return_value_policy, handle) { + const auto *ptr = holder_helper::get(src); + return type_caster_base::cast_holder(ptr, &src); + } + +protected: + friend class type_caster_generic; + void check_holder_compat() { + if (typeinfo->default_holder) + throw cast_error("Unable to load a custom holder type from a default-holder instance"); + } + + bool load_value(value_and_holder &&v_h) { + if (v_h.holder_constructed()) { + value = v_h.value_ptr(); + holder = v_h.template holder(); + return true; + } else { + throw cast_error("Unable to cast from non-held to held instance (T& to Holder) " +#if defined(NDEBUG) + "(compile in debug mode for type information)"); +#else + "of type '" + type_id() + "''"); +#endif + } + } + + template ::value, int> = 0> + bool try_implicit_casts(handle, bool) { return false; } + + template ::value, int> = 0> + bool try_implicit_casts(handle src, bool convert) { + for (auto &cast : typeinfo->implicit_casts) { + copyable_holder_caster sub_caster(*cast.first); + if (sub_caster.load(src, convert)) { + value = cast.second(sub_caster.value); + holder = holder_type(sub_caster.holder, (type *) value); + return true; + } + } + return false; + } + + static bool try_direct_conversions(handle) { return false; } + + + holder_type holder; +}; + +/// Specialize for the common std::shared_ptr, so users don't need to +template +class type_caster> : public copyable_holder_caster> { }; + +template +struct move_only_holder_caster { + static_assert(std::is_base_of, type_caster>::value, + "Holder classes are only supported for custom types"); + + static handle cast(holder_type &&src, return_value_policy, handle) { + auto *ptr = holder_helper::get(src); + return type_caster_base::cast_holder(ptr, std::addressof(src)); + } + static constexpr auto name = type_caster_base::name; +}; + +template +class type_caster> + : public move_only_holder_caster> { }; + +template +using type_caster_holder = conditional_t::value, + copyable_holder_caster, + move_only_holder_caster>; + +template struct always_construct_holder { static constexpr bool value = Value; }; + +/// Create a specialization for custom holder types (silently ignores std::shared_ptr) +#define PYBIND11_DECLARE_HOLDER_TYPE(type, holder_type, ...) 
\
+    namespace pybind11 { namespace detail { \
+        template \
+        struct always_construct_holder : always_construct_holder { }; \
+        template \
+        class type_caster::value>> \
+            : public type_caster_holder { }; \
+    }}
+
+// PYBIND11_DECLARE_HOLDER_TYPE holder types:
+template struct is_holder_type :
+    std::is_base_of, detail::type_caster> {};
+// Specialization for always-supported unique_ptr holders:
+template struct is_holder_type> :
+    std::true_type {};
+
+template struct handle_type_name { static constexpr auto name = _(); };
+template <> struct handle_type_name { static constexpr auto name = _(PYBIND11_BYTES_NAME); };
+template <> struct handle_type_name { static constexpr auto name = _("*args"); };
+template <> struct handle_type_name { static constexpr auto name = _("**kwargs"); };
+
+template
+struct pyobject_caster {
+    template ::value, int> = 0>
+    bool load(handle src, bool /* convert */) { value = src; return static_cast(value); }
+
+    template ::value, int> = 0>
+    bool load(handle src, bool /* convert */) {
+        if (!isinstance(src))
+            return false;
+        value = reinterpret_borrow(src);
+        return true;
+    }
+
+    static handle cast(const handle &src, return_value_policy /* policy */, handle /* parent */) {
+        return src.inc_ref();
+    }
+    PYBIND11_TYPE_CASTER(type, handle_type_name::name);
+};
+
+template
+class type_caster::value>> : public pyobject_caster { };
+
+// Our conditions for enabling moving are quite restrictive:
+// At compile time:
+// - T needs to be a non-const, non-pointer, non-reference type
+// - type_caster::operator T&() must exist
+// - the type must be move constructible (obviously)
+// At run-time:
+// - if the type is non-copy-constructible, the object must be the sole owner of the type (i.e. it
+//   must have ref_count() == 1)
+// If any of the above are not satisfied, we fall back to copying.
+template using move_is_plain_type = satisfies_none_of;
+template struct move_always : std::false_type {};
+template struct move_always,
+    negation>,
+    std::is_move_constructible,
+    std::is_same>().operator T&()), T&>
+>::value>> : std::true_type {};
+template struct move_if_unreferenced : std::false_type {};
+template struct move_if_unreferenced,
+    negation>,
+    std::is_move_constructible,
+    std::is_same>().operator T&()), T&>
+>::value>> : std::true_type {};
+template using move_never = none_of, move_if_unreferenced>;
+
+// Detect whether returning a `type` from a cast on type's type_caster is going to result in a
+// reference or pointer to a local variable of the type_caster. Basically, only
+// non-reference/pointer `type`s and reference/pointers from a type_caster_generic are safe;
+// everything else returns a reference/pointer to a local variable.
+template using cast_is_temporary_value_reference = bool_constant<
+    (std::is_reference::value || std::is_pointer::value) &&
+    !std::is_base_of>::value &&
+    !std::is_same, void>::value
+>;
+
+// When a value returned from a C++ function is being cast back to Python, we almost always want to
+// force `policy = move`, regardless of the return value policy the function/method was declared
+// with.
+template struct return_value_policy_override {
+    static return_value_policy policy(return_value_policy p) { return p; }
+};
+
+template struct return_value_policy_override>::value, void>> {
+    static return_value_policy policy(return_value_policy p) {
+        return !std::is_lvalue_reference::value &&
+               !std::is_pointer::value
+                   ?
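+                   // a plain by-value return is always safe to move into Python;
+                   // lvalue references and pointers keep the declared policy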
return_value_policy::move : p; + } +}; + +// Basic python -> C++ casting; throws if casting fails +template type_caster &load_type(type_caster &conv, const handle &handle) { + if (!conv.load(handle, true)) { +#if defined(NDEBUG) + throw cast_error("Unable to cast Python instance to C++ type (compile in debug mode for details)"); +#else + throw cast_error("Unable to cast Python instance of type " + + (std::string) str(handle.get_type()) + " to C++ type '" + type_id() + "'"); +#endif + } + return conv; +} +// Wrapper around the above that also constructs and returns a type_caster +template make_caster load_type(const handle &handle) { + make_caster conv; + load_type(conv, handle); + return conv; +} + +NAMESPACE_END(detail) + +// pytype -> C++ type +template ::value, int> = 0> +T cast(const handle &handle) { + using namespace detail; + static_assert(!cast_is_temporary_value_reference::value, + "Unable to cast type to reference: value is local to type caster"); + return cast_op(load_type(handle)); +} + +// pytype -> pytype (calls converting constructor) +template ::value, int> = 0> +T cast(const handle &handle) { return T(reinterpret_borrow(handle)); } + +// C++ type -> py::object +template ::value, int> = 0> +object cast(const T &value, return_value_policy policy = return_value_policy::automatic_reference, + handle parent = handle()) { + if (policy == return_value_policy::automatic) + policy = std::is_pointer::value ? return_value_policy::take_ownership : return_value_policy::copy; + else if (policy == return_value_policy::automatic_reference) + policy = std::is_pointer::value ? return_value_policy::reference : return_value_policy::copy; + return reinterpret_steal(detail::make_caster::cast(value, policy, parent)); +} + +template T handle::cast() const { return pybind11::cast(*this); } +template <> inline void handle::cast() const { return; } + +template +detail::enable_if_t::value, T> move(object &&obj) { + if (obj.ref_count() > 1) +#if defined(NDEBUG) + throw cast_error("Unable to cast Python instance to C++ rvalue: instance has multiple references" + " (compile in debug mode for details)"); +#else + throw cast_error("Unable to move from Python " + (std::string) str(obj.get_type()) + + " instance to C++ " + type_id() + " instance: instance has multiple references"); +#endif + + // Move into a temporary and return that, because the reference may be a local value of `conv` + T ret = std::move(detail::load_type(obj).operator T&()); + return ret; +} + +// Calling cast() on an rvalue calls pybind::cast with the object rvalue, which does: +// - If we have to move (because T has no copy constructor), do it. This will fail if the moved +// object has multiple references, but trying to copy will fail to compile. +// - If both movable and copyable, check ref count: if 1, move; otherwise copy +// - Otherwise (not movable), copy. 
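+// (For illustration, assuming a movable bound type `Widget` held in `py::object o`:
+// `std::move(o).cast<Widget>()` moves the C++ instance out when `o` holds the only
+// reference, copies otherwise, and throws cast_error for a non-copyable `Widget` with
+// multiple references.)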
+template detail::enable_if_t::value, T> cast(object &&object) {
+    return move(std::move(object));
+}
+template detail::enable_if_t::value, T> cast(object &&object) {
+    if (object.ref_count() > 1)
+        return cast(object);
+    else
+        return move(std::move(object));
+}
+template detail::enable_if_t::value, T> cast(object &&object) {
+    return cast(object);
+}
+
+template T object::cast() const & { return pybind11::cast(*this); }
+template T object::cast() && { return pybind11::cast(std::move(*this)); }
+template <> inline void object::cast() const & { return; }
+template <> inline void object::cast() && { return; }
+
+NAMESPACE_BEGIN(detail)
+
+// Declared in pytypes.h:
+template ::value, int>>
+object object_or_cast(T &&o) { return pybind11::cast(std::forward(o)); }
+
+struct overload_unused {}; // Placeholder type for the unneeded (and dead code) static variable in the OVERLOAD_INT macro
+template using overload_caster_t = conditional_t<
+    cast_is_temporary_value_reference::value, make_caster, overload_unused>;
+
+// Trampoline use: for reference/pointer types to value-converted values, we do a value cast, then
+// store the result in the given variable. For other types, this is a no-op.
+template enable_if_t::value, T> cast_ref(object &&o, make_caster &caster) {
+    return cast_op(load_type(caster, o));
+}
+template enable_if_t::value, T> cast_ref(object &&, overload_unused &) {
+    pybind11_fail("Internal error: cast_ref fallback invoked"); }
+
+// Trampoline use: having a pybind11::cast with an invalid reference type is going to static_assert,
+// even if it's in dead code, so we provide a "trampoline" to pybind11::cast that only does anything
+// in cases where pybind11::cast is valid.
+template enable_if_t::value, T> cast_safe(object &&o) {
+    return pybind11::cast(std::move(o)); }
+template enable_if_t::value, T> cast_safe(object &&) {
+    pybind11_fail("Internal error: cast_safe fallback invoked"); }
+template <> inline void cast_safe(object &&) {}
+
+NAMESPACE_END(detail)
+
+template
+tuple make_tuple() { return tuple(0); }
+
+template tuple make_tuple(Args&&... args_) {
+    constexpr size_t size = sizeof...(Args);
+    std::array args {
+        { reinterpret_steal(detail::make_caster::cast(
+            std::forward(args_), policy, nullptr))... }
+    };
+    for (size_t i = 0; i < args.size(); i++) {
+        if (!args[i]) {
+#if defined(NDEBUG)
+            throw cast_error("make_tuple(): unable to convert arguments to Python object (compile in debug mode for details)");
+#else
+            std::array argtypes { {type_id()...} };
+            throw cast_error("make_tuple(): unable to convert argument of type '" +
+                argtypes[i] + "' to Python object");
+#endif
+        }
+    }
+    tuple result(size);
+    int counter = 0;
+    for (auto &arg_value : args)
+        PyTuple_SET_ITEM(result.ptr(), counter++, arg_value.release().ptr());
+    return result;
+}
+
+/// \ingroup annotations
+/// Annotation for arguments
+struct arg {
+    /// Constructs an argument with the name of the argument; if null or omitted, this is a positional argument.
+    constexpr explicit arg(const char *name = nullptr) : name(name), flag_noconvert(false), flag_none(true) { }
+    /// Assign a value to this argument
+    template arg_v operator=(T &&value) const;
+    /// Indicate that the type should not be converted in the type caster
+    arg &noconvert(bool flag = true) { flag_noconvert = flag; return *this; }
+    /// Indicates that the argument should/shouldn't allow None (e.g.
for nullable pointer args) + arg &none(bool flag = true) { flag_none = flag; return *this; } + + const char *name; ///< If non-null, this is a named kwargs argument + bool flag_noconvert : 1; ///< If set, do not allow conversion (requires a supporting type caster!) + bool flag_none : 1; ///< If set (the default), allow None to be passed to this argument +}; + +/// \ingroup annotations +/// Annotation for arguments with values +struct arg_v : arg { +private: + template + arg_v(arg &&base, T &&x, const char *descr = nullptr) + : arg(base), + value(reinterpret_steal( + detail::make_caster::cast(x, return_value_policy::automatic, {}) + )), + descr(descr) +#if !defined(NDEBUG) + , type(type_id()) +#endif + { } + +public: + /// Direct construction with name, default, and description + template + arg_v(const char *name, T &&x, const char *descr = nullptr) + : arg_v(arg(name), std::forward(x), descr) { } + + /// Called internally when invoking `py::arg("a") = value` + template + arg_v(const arg &base, T &&x, const char *descr = nullptr) + : arg_v(arg(base), std::forward(x), descr) { } + + /// Same as `arg::noconvert()`, but returns *this as arg_v&, not arg& + arg_v &noconvert(bool flag = true) { arg::noconvert(flag); return *this; } + + /// Same as `arg::nonone()`, but returns *this as arg_v&, not arg& + arg_v &none(bool flag = true) { arg::none(flag); return *this; } + + /// The default value + object value; + /// The (optional) description of the default value + const char *descr; +#if !defined(NDEBUG) + /// The C++ type name of the default value (only available when compiled in debug mode) + std::string type; +#endif +}; + +template +arg_v arg::operator=(T &&value) const { return {std::move(*this), std::forward(value)}; } + +/// Alias for backward compatibility -- to be removed in version 2.0 +template using arg_t = arg_v; + +inline namespace literals { +/** \rst + String literal version of `arg` + \endrst */ +constexpr arg operator"" _a(const char *name, size_t) { return arg(name); } +} + +NAMESPACE_BEGIN(detail) + +// forward declaration (definition in attr.h) +struct function_record; + +/// Internal data associated with a single function call +struct function_call { + function_call(const function_record &f, handle p); // Implementation in attr.h + + /// The function data: + const function_record &func; + + /// Arguments passed to the function: + std::vector args; + + /// The `convert` value the arguments should be loaded with + std::vector args_convert; + + /// Extra references for the optional `py::args` and/or `py::kwargs` arguments (which, if + /// present, are also in `args` but without a reference). 
+ object args_ref, kwargs_ref; + + /// The parent, if any + handle parent; + + /// If this is a call to an initializer, this argument contains `self` + handle init_self; +}; + + +/// Helper class which loads arguments for C++ functions called from Python +template +class argument_loader { + using indices = make_index_sequence; + + template using argument_is_args = std::is_same, args>; + template using argument_is_kwargs = std::is_same, kwargs>; + // Get args/kwargs argument positions relative to the end of the argument list: + static constexpr auto args_pos = constexpr_first() - (int) sizeof...(Args), + kwargs_pos = constexpr_first() - (int) sizeof...(Args); + + static constexpr bool args_kwargs_are_last = kwargs_pos >= - 1 && args_pos >= kwargs_pos - 1; + + static_assert(args_kwargs_are_last, "py::args/py::kwargs are only permitted as the last argument(s) of a function"); + +public: + static constexpr bool has_kwargs = kwargs_pos < 0; + static constexpr bool has_args = args_pos < 0; + + static constexpr auto arg_names = concat(type_descr(make_caster::name)...); + + bool load_args(function_call &call) { + return load_impl_sequence(call, indices{}); + } + + template + enable_if_t::value, Return> call(Func &&f) && { + return std::move(*this).template call_impl(std::forward(f), indices{}, Guard{}); + } + + template + enable_if_t::value, void_type> call(Func &&f) && { + std::move(*this).template call_impl(std::forward(f), indices{}, Guard{}); + return void_type(); + } + +private: + + static bool load_impl_sequence(function_call &, index_sequence<>) { return true; } + + template + bool load_impl_sequence(function_call &call, index_sequence) { + for (bool r : {std::get(argcasters).load(call.args[Is], call.args_convert[Is])...}) + if (!r) + return false; + return true; + } + + template + Return call_impl(Func &&f, index_sequence, Guard &&) { + return std::forward(f)(cast_op(std::move(std::get(argcasters)))...); + } + + std::tuple...> argcasters; +}; + +/// Helper class which collects only positional arguments for a Python function call. +/// A fancier version below can collect any argument, but this one is optimal for simple calls. +template +class simple_collector { +public: + template + explicit simple_collector(Ts &&...values) + : m_args(pybind11::make_tuple(std::forward(values)...)) { } + + const tuple &args() const & { return m_args; } + dict kwargs() const { return {}; } + + tuple args() && { return std::move(m_args); } + + /// Call a Python function and pass the collected arguments + object call(PyObject *ptr) const { + PyObject *result = PyObject_CallObject(ptr, m_args.ptr()); + if (!result) + throw error_already_set(); + return reinterpret_steal(result); + } + +private: + tuple m_args; +}; + +/// Helper class which collects positional, keyword, * and ** arguments for a Python function call +template +class unpacking_collector { +public: + template + explicit unpacking_collector(Ts &&...values) { + // Tuples aren't (easily) resizable so a list is needed for collection, + // but the actual function call strictly requires a tuple. + auto args_list = list(); + int _[] = { 0, (process(args_list, std::forward(values)), 0)... 
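+            // (expander trick: the braced-init-list guarantees process() runs once per argument,
+            //  left to right; a C++17 fold expression would express the same thing)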
}; + ignore_unused(_); + + m_args = std::move(args_list); + } + + const tuple &args() const & { return m_args; } + const dict &kwargs() const & { return m_kwargs; } + + tuple args() && { return std::move(m_args); } + dict kwargs() && { return std::move(m_kwargs); } + + /// Call a Python function and pass the collected arguments + object call(PyObject *ptr) const { + PyObject *result = PyObject_Call(ptr, m_args.ptr(), m_kwargs.ptr()); + if (!result) + throw error_already_set(); + return reinterpret_steal(result); + } + +private: + template + void process(list &args_list, T &&x) { + auto o = reinterpret_steal(detail::make_caster::cast(std::forward(x), policy, {})); + if (!o) { +#if defined(NDEBUG) + argument_cast_error(); +#else + argument_cast_error(std::to_string(args_list.size()), type_id()); +#endif + } + args_list.append(o); + } + + void process(list &args_list, detail::args_proxy ap) { + for (const auto &a : ap) + args_list.append(a); + } + + void process(list &/*args_list*/, arg_v a) { + if (!a.name) +#if defined(NDEBUG) + nameless_argument_error(); +#else + nameless_argument_error(a.type); +#endif + + if (m_kwargs.contains(a.name)) { +#if defined(NDEBUG) + multiple_values_error(); +#else + multiple_values_error(a.name); +#endif + } + if (!a.value) { +#if defined(NDEBUG) + argument_cast_error(); +#else + argument_cast_error(a.name, a.type); +#endif + } + m_kwargs[a.name] = a.value; + } + + void process(list &/*args_list*/, detail::kwargs_proxy kp) { + if (!kp) + return; + for (const auto &k : reinterpret_borrow(kp)) { + if (m_kwargs.contains(k.first)) { +#if defined(NDEBUG) + multiple_values_error(); +#else + multiple_values_error(str(k.first)); +#endif + } + m_kwargs[k.first] = k.second; + } + } + + [[noreturn]] static void nameless_argument_error() { + throw type_error("Got kwargs without a name; only named arguments " + "may be passed via py::arg() to a python function call. " + "(compile in debug mode for details)"); + } + [[noreturn]] static void nameless_argument_error(std::string type) { + throw type_error("Got kwargs without a name of type '" + type + "'; only named " + "arguments may be passed via py::arg() to a python function call. 
"); + } + [[noreturn]] static void multiple_values_error() { + throw type_error("Got multiple values for keyword argument " + "(compile in debug mode for details)"); + } + + [[noreturn]] static void multiple_values_error(std::string name) { + throw type_error("Got multiple values for keyword argument '" + name + "'"); + } + + [[noreturn]] static void argument_cast_error() { + throw cast_error("Unable to convert call argument to Python object " + "(compile in debug mode for details)"); + } + + [[noreturn]] static void argument_cast_error(std::string name, std::string type) { + throw cast_error("Unable to convert call argument '" + name + + "' of type '" + type + "' to Python object"); + } + +private: + tuple m_args; + dict m_kwargs; +}; + +/// Collect only positional arguments for a Python function call +template ...>::value>> +simple_collector collect_arguments(Args &&...args) { + return simple_collector(std::forward(args)...); +} + +/// Collect all arguments, including keywords and unpacking (only instantiated when needed) +template ...>::value>> +unpacking_collector collect_arguments(Args &&...args) { + // Following argument order rules for generalized unpacking according to PEP 448 + static_assert( + constexpr_last() < constexpr_first() + && constexpr_last() < constexpr_first(), + "Invalid function call: positional args must precede keywords and ** unpacking; " + "* unpacking must precede ** unpacking" + ); + return unpacking_collector(std::forward(args)...); +} + +template +template +object object_api::operator()(Args &&...args) const { + return detail::collect_arguments(std::forward(args)...).call(derived().ptr()); +} + +template +template +object object_api::call(Args &&...args) const { + return operator()(std::forward(args)...); +} + +NAMESPACE_END(detail) + +#define PYBIND11_MAKE_OPAQUE(...) \ + namespace pybind11 { namespace detail { \ + template<> class type_caster<__VA_ARGS__> : public type_caster_base<__VA_ARGS__> { }; \ + }} + +/// Lets you pass a type containing a `,` through a macro parameter without needing a separate +/// typedef, e.g.: `PYBIND11_OVERLOAD(PYBIND11_TYPE(ReturnType), PYBIND11_TYPE(Parent), f, arg)` +#define PYBIND11_TYPE(...) __VA_ARGS__ + +NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/python/src/pybind11/chrono.h b/python/src/pybind11/chrono.h new file mode 100644 index 000000000..95ada76e0 --- /dev/null +++ b/python/src/pybind11/chrono.h @@ -0,0 +1,162 @@ +/* + pybind11/chrono.h: Transparent conversion between std::chrono and python's datetime + + Copyright (c) 2016 Trent Houliston and + Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. 
+*/
+
+#pragma once
+
+#include "pybind11.h"
+#include <cmath>
+#include <ctime>
+#include <chrono>
+#include <datetime.h>
+
+// Backport the PyDateTime_DELTA functions from Python3.3 if required
+#ifndef PyDateTime_DELTA_GET_DAYS
+#define PyDateTime_DELTA_GET_DAYS(o)         (((PyDateTime_Delta*)o)->days)
+#endif
+#ifndef PyDateTime_DELTA_GET_SECONDS
+#define PyDateTime_DELTA_GET_SECONDS(o)      (((PyDateTime_Delta*)o)->seconds)
+#endif
+#ifndef PyDateTime_DELTA_GET_MICROSECONDS
+#define PyDateTime_DELTA_GET_MICROSECONDS(o) (((PyDateTime_Delta*)o)->microseconds)
+#endif
+
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+NAMESPACE_BEGIN(detail)
+
+template <typename type> class duration_caster {
+public:
+    typedef typename type::rep rep;
+    typedef typename type::period period;
+
+    typedef std::chrono::duration<uint_fast32_t, std::ratio<86400>> days;
+
+    bool load(handle src, bool) {
+        using namespace std::chrono;
+
+        // Lazy initialise the PyDateTime import
+        if (!PyDateTimeAPI) { PyDateTime_IMPORT; }
+
+        if (!src) return false;
+        // If invoked with datetime.delta object
+        if (PyDelta_Check(src.ptr())) {
+            value = type(duration_cast<duration<rep, period>>(
+                  days(PyDateTime_DELTA_GET_DAYS(src.ptr()))
+                + seconds(PyDateTime_DELTA_GET_SECONDS(src.ptr()))
+                + microseconds(PyDateTime_DELTA_GET_MICROSECONDS(src.ptr()))));
+            return true;
+        }
+        // If invoked with a float we assume it is seconds and convert
+        else if (PyFloat_Check(src.ptr())) {
+            value = type(duration_cast<duration<rep, period>>(duration<double>(PyFloat_AsDouble(src.ptr()))));
+            return true;
+        }
+        else return false;
+    }
+
+    // If this is a duration just return it back
+    static const std::chrono::duration<rep, period>& get_duration(const std::chrono::duration<rep, period> &src) {
+        return src;
+    }
+
+    // If this is a time_point get the time_since_epoch
+    template <typename Clock> static std::chrono::duration<rep, period> get_duration(const std::chrono::time_point<Clock, std::chrono::duration<rep, period>> &src) {
+        return src.time_since_epoch();
+    }
+
+    static handle cast(const type &src, return_value_policy /* policy */, handle /* parent */) {
+        using namespace std::chrono;
+
+        // Use overloaded function to get our duration from our source
+        // Works out if it is a duration or time_point and get the duration
+        auto d = get_duration(src);
+
+        // Lazy initialise the PyDateTime import
+        if (!PyDateTimeAPI) { PyDateTime_IMPORT; }
+
+        // Declare these special duration types so the conversions happen with the correct primitive types (int)
+        using dd_t = duration<int, std::ratio<86400>>;
+        using ss_t = duration<int, std::ratio<1>>;
+        using us_t = duration<int, std::micro>;
+
+        auto dd = duration_cast<dd_t>(d);
+        auto subd = d - dd;
+        auto ss = duration_cast<ss_t>(subd);
+        auto us = duration_cast<us_t>(subd - ss);
+        return PyDelta_FromDSU(dd.count(), ss.count(), us.count());
+    }
+
+    PYBIND11_TYPE_CASTER(type, _("datetime.timedelta"));
+};
+
+// This is for casting times on the system clock into datetime.datetime instances
+template <typename Duration> class type_caster<std::chrono::time_point<std::chrono::system_clock, Duration>> {
+public:
+    typedef std::chrono::time_point<std::chrono::system_clock, Duration> type;
+    bool load(handle src, bool) {
+        using namespace std::chrono;
+
+        // Lazy initialise the PyDateTime import
+        if (!PyDateTimeAPI) { PyDateTime_IMPORT; }
+
+        if (!src) return false;
+        if (PyDateTime_Check(src.ptr())) {
+            std::tm cal;
+            cal.tm_sec   = PyDateTime_DATE_GET_SECOND(src.ptr());
+            cal.tm_min   = PyDateTime_DATE_GET_MINUTE(src.ptr());
+            cal.tm_hour  = PyDateTime_DATE_GET_HOUR(src.ptr());
+            cal.tm_mday  = PyDateTime_GET_DAY(src.ptr());
+            cal.tm_mon   = PyDateTime_GET_MONTH(src.ptr()) - 1;
+            cal.tm_year  = PyDateTime_GET_YEAR(src.ptr()) - 1900;
+            cal.tm_isdst = -1;
+
+            value = system_clock::from_time_t(std::mktime(&cal)) + microseconds(PyDateTime_DATE_GET_MICROSECOND(src.ptr()));
+            return true;
+        }
+        else return false;
+    }
+
+    static handle cast(const std::chrono::time_point<std::chrono::system_clock, Duration> &src, return_value_policy /* policy */, handle /* parent */) {
+        using namespace std::chrono;
+
+        // Lazy initialise the PyDateTime import
+        if (!PyDateTimeAPI) { PyDateTime_IMPORT; }
+
+        std::time_t tt = system_clock::to_time_t(src);
+        // this function uses static memory so it's best to copy it out asap just in case
+        // otherwise other code that is using localtime may break this (not just python code)
+        std::tm localtime = *std::localtime(&tt);
+
+        // Declare these special duration types so the conversions happen with the correct primitive types (int)
+        using us_t = duration<int, std::micro>;
+
+        return PyDateTime_FromDateAndTime(localtime.tm_year + 1900,
+                                          localtime.tm_mon + 1,
+                                          localtime.tm_mday,
+                                          localtime.tm_hour,
+                                          localtime.tm_min,
+                                          localtime.tm_sec,
+                                          (duration_cast<us_t>(src.time_since_epoch() % seconds(1))).count());
+    }
+    PYBIND11_TYPE_CASTER(type, _("datetime.datetime"));
+};
+
+// Other clocks that are not the system clock are not measured as datetime.datetime objects
+// since they are not measured on calendar time. So instead we just make them timedeltas
+// Or if they have passed us a time as a float we convert that
+template <typename Clock, typename Duration> class type_caster<std::chrono::time_point<Clock, Duration>>
+: public duration_caster<std::chrono::time_point<Clock, Duration>> {
+};
+
+template <typename Rep, typename Period> class type_caster<std::chrono::duration<Rep, Period>>
+: public duration_caster<std::chrono::duration<Rep, Period>> {
+};
+
+NAMESPACE_END(detail)
+NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/python/src/pybind11/common.h b/python/src/pybind11/common.h
new file mode 100644
index 000000000..6c8a4f1e8
--- /dev/null
+++ b/python/src/pybind11/common.h
@@ -0,0 +1,2 @@
+#include "detail/common.h"
+#warning "Including 'common.h' is deprecated. It will be removed in v3.0. Use 'pybind11.h'."
diff --git a/python/src/pybind11/complex.h b/python/src/pybind11/complex.h
new file mode 100644
index 000000000..3f8963857
--- /dev/null
+++ b/python/src/pybind11/complex.h
@@ -0,0 +1,65 @@
+/*
+    pybind11/complex.h: Complex number support
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/ + +#pragma once + +#include "pybind11.h" +#include + +/// glibc defines I as a macro which breaks things, e.g., boost template names +#ifdef I +# undef I +#endif + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) + +template struct format_descriptor, detail::enable_if_t::value>> { + static constexpr const char c = format_descriptor::c; + static constexpr const char value[3] = { 'Z', c, '\0' }; + static std::string format() { return std::string(value); } +}; + +#ifndef PYBIND11_CPP17 + +template constexpr const char format_descriptor< + std::complex, detail::enable_if_t::value>>::value[3]; + +#endif + +NAMESPACE_BEGIN(detail) + +template struct is_fmt_numeric, detail::enable_if_t::value>> { + static constexpr bool value = true; + static constexpr int index = is_fmt_numeric::index + 3; +}; + +template class type_caster> { +public: + bool load(handle src, bool convert) { + if (!src) + return false; + if (!convert && !PyComplex_Check(src.ptr())) + return false; + Py_complex result = PyComplex_AsCComplex(src.ptr()); + if (result.real == -1.0 && PyErr_Occurred()) { + PyErr_Clear(); + return false; + } + value = std::complex((T) result.real, (T) result.imag); + return true; + } + + static handle cast(const std::complex &src, return_value_policy /* policy */, handle /* parent */) { + return PyComplex_FromDoubles((double) src.real(), (double) src.imag()); + } + + PYBIND11_TYPE_CASTER(std::complex, _("complex")); +}; +NAMESPACE_END(detail) +NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/python/src/pybind11/detail/class.h b/python/src/pybind11/detail/class.h new file mode 100644 index 000000000..b1916fcd0 --- /dev/null +++ b/python/src/pybind11/detail/class.h @@ -0,0 +1,623 @@ +/* + pybind11/detail/class.h: Python C API implementation details for py::class_ + + Copyright (c) 2017 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. +*/ + +#pragma once + +#include "../attr.h" +#include "../options.h" + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +NAMESPACE_BEGIN(detail) + +#if PY_VERSION_HEX >= 0x03030000 +# define PYBIND11_BUILTIN_QUALNAME +# define PYBIND11_SET_OLDPY_QUALNAME(obj, nameobj) +#else +// In pre-3.3 Python, we still set __qualname__ so that we can produce reliable function type +// signatures; in 3.3+ this macro expands to nothing: +# define PYBIND11_SET_OLDPY_QUALNAME(obj, nameobj) setattr((PyObject *) obj, "__qualname__", nameobj) +#endif + +inline PyTypeObject *type_incref(PyTypeObject *type) { + Py_INCREF(type); + return type; +} + +#if !defined(PYPY_VERSION) + +/// `pybind11_static_property.__get__()`: Always pass the class instead of the instance. +extern "C" inline PyObject *pybind11_static_get(PyObject *self, PyObject * /*ob*/, PyObject *cls) { + return PyProperty_Type.tp_descr_get(self, cls, cls); +} + +/// `pybind11_static_property.__set__()`: Just like the above `__get__()`. +extern "C" inline int pybind11_static_set(PyObject *self, PyObject *obj, PyObject *value) { + PyObject *cls = PyType_Check(obj) ? obj : (PyObject *) Py_TYPE(obj); + return PyProperty_Type.tp_descr_set(self, cls, value); +} + +/** A `static_property` is the same as a `property` but the `__get__()` and `__set__()` + methods are modified to always use the object type instead of a concrete instance. + Return value: New reference. 
*/ +inline PyTypeObject *make_static_property_type() { + constexpr auto *name = "pybind11_static_property"; + auto name_obj = reinterpret_steal(PYBIND11_FROM_STRING(name)); + + /* Danger zone: from now (and until PyType_Ready), make sure to + issue no Python C API calls which could potentially invoke the + garbage collector (the GC will call type_traverse(), which will in + turn find the newly constructed type in an invalid state) */ + auto heap_type = (PyHeapTypeObject *) PyType_Type.tp_alloc(&PyType_Type, 0); + if (!heap_type) + pybind11_fail("make_static_property_type(): error allocating type!"); + + heap_type->ht_name = name_obj.inc_ref().ptr(); +#ifdef PYBIND11_BUILTIN_QUALNAME + heap_type->ht_qualname = name_obj.inc_ref().ptr(); +#endif + + auto type = &heap_type->ht_type; + type->tp_name = name; + type->tp_base = type_incref(&PyProperty_Type); + type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE; + type->tp_descr_get = pybind11_static_get; + type->tp_descr_set = pybind11_static_set; + + if (PyType_Ready(type) < 0) + pybind11_fail("make_static_property_type(): failure in PyType_Ready()!"); + + setattr((PyObject *) type, "__module__", str("pybind11_builtins")); + PYBIND11_SET_OLDPY_QUALNAME(type, name_obj); + + return type; +} + +#else // PYPY + +/** PyPy has some issues with the above C API, so we evaluate Python code instead. + This function will only be called once so performance isn't really a concern. + Return value: New reference. */ +inline PyTypeObject *make_static_property_type() { + auto d = dict(); + PyObject *result = PyRun_String(R"(\ + class pybind11_static_property(property): + def __get__(self, obj, cls): + return property.__get__(self, cls, cls) + + def __set__(self, obj, value): + cls = obj if isinstance(obj, type) else type(obj) + property.__set__(self, cls, value) + )", Py_file_input, d.ptr(), d.ptr() + ); + if (result == nullptr) + throw error_already_set(); + Py_DECREF(result); + return (PyTypeObject *) d["pybind11_static_property"].cast().release().ptr(); +} + +#endif // PYPY + +/** Types with static properties need to handle `Type.static_prop = x` in a specific way. + By default, Python replaces the `static_property` itself, but for wrapped C++ types + we need to call `static_property.__set__()` in order to propagate the new value to + the underlying C++ data structure. */ +extern "C" inline int pybind11_meta_setattro(PyObject* obj, PyObject* name, PyObject* value) { + // Use `_PyType_Lookup()` instead of `PyObject_GetAttr()` in order to get the raw + // descriptor (`property`) instead of calling `tp_descr_get` (`property.__get__()`). + PyObject *descr = _PyType_Lookup((PyTypeObject *) obj, name); + + // The following assignment combinations are possible: + // 1. `Type.static_prop = value` --> descr_set: `Type.static_prop.__set__(value)` + // 2. `Type.static_prop = other_static_prop` --> setattro: replace existing `static_prop` + // 3. `Type.regular_attribute = value` --> setattro: regular attribute assignment + const auto static_prop = (PyObject *) get_internals().static_property_type; + const auto call_descr_set = descr && PyObject_IsInstance(descr, static_prop) + && !PyObject_IsInstance(value, static_prop); + if (call_descr_set) { + // Call `static_property.__set__()` instead of replacing the `static_property`. 
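+        // Illustrative sketch (hypothetical binding, not part of this header): after
+        //     py::class_<Cat>(m, "Cat").def_readwrite_static("age", &Cat::age);
+        // the Python assignment `Cat.age = 3` takes this branch and is routed
+        // through static_property.__set__(), so the underlying C++ Cat::age is
+        // updated rather than the descriptor being shadowed by a plain attribute.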
+#if !defined(PYPY_VERSION) + return Py_TYPE(descr)->tp_descr_set(descr, obj, value); +#else + if (PyObject *result = PyObject_CallMethod(descr, "__set__", "OO", obj, value)) { + Py_DECREF(result); + return 0; + } else { + return -1; + } +#endif + } else { + // Replace existing attribute. + return PyType_Type.tp_setattro(obj, name, value); + } +} + +#if PY_MAJOR_VERSION >= 3 +/** + * Python 3's PyInstanceMethod_Type hides itself via its tp_descr_get, which prevents aliasing + * methods via cls.attr("m2") = cls.attr("m1"): instead the tp_descr_get returns a plain function, + * when called on a class, or a PyMethod, when called on an instance. Override that behaviour here + * to do a special case bypass for PyInstanceMethod_Types. + */ +extern "C" inline PyObject *pybind11_meta_getattro(PyObject *obj, PyObject *name) { + PyObject *descr = _PyType_Lookup((PyTypeObject *) obj, name); + if (descr && PyInstanceMethod_Check(descr)) { + Py_INCREF(descr); + return descr; + } + else { + return PyType_Type.tp_getattro(obj, name); + } +} +#endif + +/** This metaclass is assigned by default to all pybind11 types and is required in order + for static properties to function correctly. Users may override this using `py::metaclass`. + Return value: New reference. */ +inline PyTypeObject* make_default_metaclass() { + constexpr auto *name = "pybind11_type"; + auto name_obj = reinterpret_steal(PYBIND11_FROM_STRING(name)); + + /* Danger zone: from now (and until PyType_Ready), make sure to + issue no Python C API calls which could potentially invoke the + garbage collector (the GC will call type_traverse(), which will in + turn find the newly constructed type in an invalid state) */ + auto heap_type = (PyHeapTypeObject *) PyType_Type.tp_alloc(&PyType_Type, 0); + if (!heap_type) + pybind11_fail("make_default_metaclass(): error allocating metaclass!"); + + heap_type->ht_name = name_obj.inc_ref().ptr(); +#ifdef PYBIND11_BUILTIN_QUALNAME + heap_type->ht_qualname = name_obj.inc_ref().ptr(); +#endif + + auto type = &heap_type->ht_type; + type->tp_name = name; + type->tp_base = type_incref(&PyType_Type); + type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE; + + type->tp_setattro = pybind11_meta_setattro; +#if PY_MAJOR_VERSION >= 3 + type->tp_getattro = pybind11_meta_getattro; +#endif + + if (PyType_Ready(type) < 0) + pybind11_fail("make_default_metaclass(): failure in PyType_Ready()!"); + + setattr((PyObject *) type, "__module__", str("pybind11_builtins")); + PYBIND11_SET_OLDPY_QUALNAME(type, name_obj); + + return type; +} + +/// For multiple inheritance types we need to recursively register/deregister base pointers for any +/// base classes with pointers that are difference from the instance value pointer so that we can +/// correctly recognize an offset base class pointer. This calls a function with any offset base ptrs. 
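+// A minimal sketch of why offset pointers arise (types hypothetical):
+//     struct A { int a; };  struct B { int b; };  struct C : A, B {};
+// Upcasting a C* to B* shifts the address past the A subobject, so the same
+// instance must also be registered under that shifted pointer for lookups
+// through B to succeed. The recursion below walks tp_bases and hands each
+// distinct offset parent pointer to `f`.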
+inline void traverse_offset_bases(void *valueptr, const detail::type_info *tinfo, instance *self, + bool (*f)(void * /*parentptr*/, instance * /*self*/)) { + for (handle h : reinterpret_borrow(tinfo->type->tp_bases)) { + if (auto parent_tinfo = get_type_info((PyTypeObject *) h.ptr())) { + for (auto &c : parent_tinfo->implicit_casts) { + if (c.first == tinfo->cpptype) { + auto *parentptr = c.second(valueptr); + if (parentptr != valueptr) + f(parentptr, self); + traverse_offset_bases(parentptr, parent_tinfo, self, f); + break; + } + } + } + } +} + +inline bool register_instance_impl(void *ptr, instance *self) { + get_internals().registered_instances.emplace(ptr, self); + return true; // unused, but gives the same signature as the deregister func +} +inline bool deregister_instance_impl(void *ptr, instance *self) { + auto ®istered_instances = get_internals().registered_instances; + auto range = registered_instances.equal_range(ptr); + for (auto it = range.first; it != range.second; ++it) { + if (Py_TYPE(self) == Py_TYPE(it->second)) { + registered_instances.erase(it); + return true; + } + } + return false; +} + +inline void register_instance(instance *self, void *valptr, const type_info *tinfo) { + register_instance_impl(valptr, self); + if (!tinfo->simple_ancestors) + traverse_offset_bases(valptr, tinfo, self, register_instance_impl); +} + +inline bool deregister_instance(instance *self, void *valptr, const type_info *tinfo) { + bool ret = deregister_instance_impl(valptr, self); + if (!tinfo->simple_ancestors) + traverse_offset_bases(valptr, tinfo, self, deregister_instance_impl); + return ret; +} + +/// Instance creation function for all pybind11 types. It allocates the internal instance layout for +/// holding C++ objects and holders. Allocation is done lazily (the first time the instance is cast +/// to a reference or pointer), and initialization is done by an `__init__` function. +inline PyObject *make_new_instance(PyTypeObject *type) { +#if defined(PYPY_VERSION) + // PyPy gets tp_basicsize wrong (issue 2482) under multiple inheritance when the first inherited + // object is a a plain Python type (i.e. not derived from an extension type). Fix it. + ssize_t instance_size = static_cast(sizeof(instance)); + if (type->tp_basicsize < instance_size) { + type->tp_basicsize = instance_size; + } +#endif + PyObject *self = type->tp_alloc(type, 0); + auto inst = reinterpret_cast(self); + // Allocate the value/holder internals: + inst->allocate_layout(); + + inst->owned = true; + + return self; +} + +/// Instance creation function for all pybind11 types. It only allocates space for the +/// C++ object, but doesn't call the constructor -- an `__init__` function must do that. +extern "C" inline PyObject *pybind11_object_new(PyTypeObject *type, PyObject *, PyObject *) { + return make_new_instance(type); +} + +/// An `__init__` function constructs the C++ object. Users should provide at least one +/// of these using `py::init` or directly with `.def(__init__, ...)`. Otherwise, the +/// following default function will be used which simply throws an exception. 
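+// For instance (hypothetical user code), a binding such as
+//     py::class_<Point>(m, "Point").def(py::init<int, int>());
+// installs a real __init__ that replaces this fallback; without any py::init,
+// calling Point() from Python reaches the function below and raises TypeError.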
+extern "C" inline int pybind11_object_init(PyObject *self, PyObject *, PyObject *) { + PyTypeObject *type = Py_TYPE(self); + std::string msg; +#if defined(PYPY_VERSION) + msg += handle((PyObject *) type).attr("__module__").cast() + "."; +#endif + msg += type->tp_name; + msg += ": No constructor defined!"; + PyErr_SetString(PyExc_TypeError, msg.c_str()); + return -1; +} + +inline void add_patient(PyObject *nurse, PyObject *patient) { + auto &internals = get_internals(); + auto instance = reinterpret_cast(nurse); + instance->has_patients = true; + Py_INCREF(patient); + internals.patients[nurse].push_back(patient); +} + +inline void clear_patients(PyObject *self) { + auto instance = reinterpret_cast(self); + auto &internals = get_internals(); + auto pos = internals.patients.find(self); + assert(pos != internals.patients.end()); + // Clearing the patients can cause more Python code to run, which + // can invalidate the iterator. Extract the vector of patients + // from the unordered_map first. + auto patients = std::move(pos->second); + internals.patients.erase(pos); + instance->has_patients = false; + for (PyObject *&patient : patients) + Py_CLEAR(patient); +} + +/// Clears all internal data from the instance and removes it from registered instances in +/// preparation for deallocation. +inline void clear_instance(PyObject *self) { + auto instance = reinterpret_cast(self); + + // Deallocate any values/holders, if present: + for (auto &v_h : values_and_holders(instance)) { + if (v_h) { + + // We have to deregister before we call dealloc because, for virtual MI types, we still + // need to be able to get the parent pointers. + if (v_h.instance_registered() && !deregister_instance(instance, v_h.value_ptr(), v_h.type)) + pybind11_fail("pybind11_object_dealloc(): Tried to deallocate unregistered instance!"); + + if (instance->owned || v_h.holder_constructed()) + v_h.type->dealloc(v_h); + } + } + // Deallocate the value/holder layout internals: + instance->deallocate_layout(); + + if (instance->weakrefs) + PyObject_ClearWeakRefs(self); + + PyObject **dict_ptr = _PyObject_GetDictPtr(self); + if (dict_ptr) + Py_CLEAR(*dict_ptr); + + if (instance->has_patients) + clear_patients(self); +} + +/// Instance destructor function for all pybind11 types. It calls `type_info.dealloc` +/// to destroy the C++ object itself, while the rest is Python bookkeeping. +extern "C" inline void pybind11_object_dealloc(PyObject *self) { + clear_instance(self); + + auto type = Py_TYPE(self); + type->tp_free(self); + + // `type->tp_dealloc != pybind11_object_dealloc` means that we're being called + // as part of a derived type's dealloc, in which case we're not allowed to decref + // the type here. For cross-module compatibility, we shouldn't compare directly + // with `pybind11_object_dealloc`, but with the common one stashed in internals. + auto pybind11_object_type = (PyTypeObject *) get_internals().instance_base; + if (type->tp_dealloc == pybind11_object_type->tp_dealloc) + Py_DECREF(type); +} + +/** Create the type which can be used as a common base for all classes. This is + needed in order to satisfy Python's requirements for multiple inheritance. + Return value: New reference. 
*/ +inline PyObject *make_object_base_type(PyTypeObject *metaclass) { + constexpr auto *name = "pybind11_object"; + auto name_obj = reinterpret_steal(PYBIND11_FROM_STRING(name)); + + /* Danger zone: from now (and until PyType_Ready), make sure to + issue no Python C API calls which could potentially invoke the + garbage collector (the GC will call type_traverse(), which will in + turn find the newly constructed type in an invalid state) */ + auto heap_type = (PyHeapTypeObject *) metaclass->tp_alloc(metaclass, 0); + if (!heap_type) + pybind11_fail("make_object_base_type(): error allocating type!"); + + heap_type->ht_name = name_obj.inc_ref().ptr(); +#ifdef PYBIND11_BUILTIN_QUALNAME + heap_type->ht_qualname = name_obj.inc_ref().ptr(); +#endif + + auto type = &heap_type->ht_type; + type->tp_name = name; + type->tp_base = type_incref(&PyBaseObject_Type); + type->tp_basicsize = static_cast(sizeof(instance)); + type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE; + + type->tp_new = pybind11_object_new; + type->tp_init = pybind11_object_init; + type->tp_dealloc = pybind11_object_dealloc; + + /* Support weak references (needed for the keep_alive feature) */ + type->tp_weaklistoffset = offsetof(instance, weakrefs); + + if (PyType_Ready(type) < 0) + pybind11_fail("PyType_Ready failed in make_object_base_type():" + error_string()); + + setattr((PyObject *) type, "__module__", str("pybind11_builtins")); + PYBIND11_SET_OLDPY_QUALNAME(type, name_obj); + + assert(!PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC)); + return (PyObject *) heap_type; +} + +/// dynamic_attr: Support for `d = instance.__dict__`. +extern "C" inline PyObject *pybind11_get_dict(PyObject *self, void *) { + PyObject *&dict = *_PyObject_GetDictPtr(self); + if (!dict) + dict = PyDict_New(); + Py_XINCREF(dict); + return dict; +} + +/// dynamic_attr: Support for `instance.__dict__ = dict()`. +extern "C" inline int pybind11_set_dict(PyObject *self, PyObject *new_dict, void *) { + if (!PyDict_Check(new_dict)) { + PyErr_Format(PyExc_TypeError, "__dict__ must be set to a dictionary, not a '%.200s'", + Py_TYPE(new_dict)->tp_name); + return -1; + } + PyObject *&dict = *_PyObject_GetDictPtr(self); + Py_INCREF(new_dict); + Py_CLEAR(dict); + dict = new_dict; + return 0; +} + +/// dynamic_attr: Allow the garbage collector to traverse the internal instance `__dict__`. +extern "C" inline int pybind11_traverse(PyObject *self, visitproc visit, void *arg) { + PyObject *&dict = *_PyObject_GetDictPtr(self); + Py_VISIT(dict); + return 0; +} + +/// dynamic_attr: Allow the GC to clear the dictionary. +extern "C" inline int pybind11_clear(PyObject *self) { + PyObject *&dict = *_PyObject_GetDictPtr(self); + Py_CLEAR(dict); + return 0; +} + +/// Give instances of this type a `__dict__` and opt into garbage collection. 
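+// Requested per class with the py::dynamic_attr tag, e.g. (hypothetical):
+//     py::class_<Pet>(m, "Pet", py::dynamic_attr());
+// after which Python code may attach arbitrary attributes:
+//     p = Pet(); p.nickname = "Rex"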
+inline void enable_dynamic_attributes(PyHeapTypeObject *heap_type) { + auto type = &heap_type->ht_type; +#if defined(PYPY_VERSION) + pybind11_fail(std::string(type->tp_name) + ": dynamic attributes are " + "currently not supported in " + "conjunction with PyPy!"); +#endif + type->tp_flags |= Py_TPFLAGS_HAVE_GC; + type->tp_dictoffset = type->tp_basicsize; // place dict at the end + type->tp_basicsize += (ssize_t)sizeof(PyObject *); // and allocate enough space for it + type->tp_traverse = pybind11_traverse; + type->tp_clear = pybind11_clear; + + static PyGetSetDef getset[] = { + {const_cast("__dict__"), pybind11_get_dict, pybind11_set_dict, nullptr, nullptr}, + {nullptr, nullptr, nullptr, nullptr, nullptr} + }; + type->tp_getset = getset; +} + +/// buffer_protocol: Fill in the view as specified by flags. +extern "C" inline int pybind11_getbuffer(PyObject *obj, Py_buffer *view, int flags) { + // Look for a `get_buffer` implementation in this type's info or any bases (following MRO). + type_info *tinfo = nullptr; + for (auto type : reinterpret_borrow(Py_TYPE(obj)->tp_mro)) { + tinfo = get_type_info((PyTypeObject *) type.ptr()); + if (tinfo && tinfo->get_buffer) + break; + } + if (view == nullptr || !tinfo || !tinfo->get_buffer) { + if (view) + view->obj = nullptr; + PyErr_SetString(PyExc_BufferError, "pybind11_getbuffer(): Internal error"); + return -1; + } + std::memset(view, 0, sizeof(Py_buffer)); + buffer_info *info = tinfo->get_buffer(obj, tinfo->get_buffer_data); + view->obj = obj; + view->ndim = 1; + view->internal = info; + view->buf = info->ptr; + view->itemsize = info->itemsize; + view->len = view->itemsize; + for (auto s : info->shape) + view->len *= s; + if ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) + view->format = const_cast(info->format.c_str()); + if ((flags & PyBUF_STRIDES) == PyBUF_STRIDES) { + view->ndim = (int) info->ndim; + view->strides = &info->strides[0]; + view->shape = &info->shape[0]; + } + Py_INCREF(view->obj); + return 0; +} + +/// buffer_protocol: Release the resources of the buffer. +extern "C" inline void pybind11_releasebuffer(PyObject *, Py_buffer *view) { + delete (buffer_info *) view->internal; +} + +/// Give this type a buffer interface. +inline void enable_buffer_protocol(PyHeapTypeObject *heap_type) { + heap_type->ht_type.tp_as_buffer = &heap_type->as_buffer; +#if PY_MAJOR_VERSION < 3 + heap_type->ht_type.tp_flags |= Py_TPFLAGS_HAVE_NEWBUFFER; +#endif + + heap_type->as_buffer.bf_getbuffer = pybind11_getbuffer; + heap_type->as_buffer.bf_releasebuffer = pybind11_releasebuffer; +} + +/** Create a brand new Python type according to the `type_record` specification. + Return value: New reference. */ +inline PyObject* make_new_python_type(const type_record &rec) { + auto name = reinterpret_steal(PYBIND11_FROM_STRING(rec.name)); + + auto qualname = name; + if (rec.scope && !PyModule_Check(rec.scope.ptr()) && hasattr(rec.scope, "__qualname__")) { +#if PY_MAJOR_VERSION >= 3 + qualname = reinterpret_steal( + PyUnicode_FromFormat("%U.%U", rec.scope.attr("__qualname__").ptr(), name.ptr())); +#else + qualname = str(rec.scope.attr("__qualname__").cast() + "." + rec.name); +#endif + } + + object module; + if (rec.scope) { + if (hasattr(rec.scope, "__module__")) + module = rec.scope.attr("__module__"); + else if (hasattr(rec.scope, "__name__")) + module = rec.scope.attr("__name__"); + } + + auto full_name = c_str( +#if !defined(PYPY_VERSION) + module ? str(module).cast() + "." 
+ rec.name : +#endif + rec.name); + + char *tp_doc = nullptr; + if (rec.doc && options::show_user_defined_docstrings()) { + /* Allocate memory for docstring (using PyObject_MALLOC, since + Python will free this later on) */ + size_t size = strlen(rec.doc) + 1; + tp_doc = (char *) PyObject_MALLOC(size); + memcpy((void *) tp_doc, rec.doc, size); + } + + auto &internals = get_internals(); + auto bases = tuple(rec.bases); + auto base = (bases.size() == 0) ? internals.instance_base + : bases[0].ptr(); + + /* Danger zone: from now (and until PyType_Ready), make sure to + issue no Python C API calls which could potentially invoke the + garbage collector (the GC will call type_traverse(), which will in + turn find the newly constructed type in an invalid state) */ + auto metaclass = rec.metaclass.ptr() ? (PyTypeObject *) rec.metaclass.ptr() + : internals.default_metaclass; + + auto heap_type = (PyHeapTypeObject *) metaclass->tp_alloc(metaclass, 0); + if (!heap_type) + pybind11_fail(std::string(rec.name) + ": Unable to create type object!"); + + heap_type->ht_name = name.release().ptr(); +#ifdef PYBIND11_BUILTIN_QUALNAME + heap_type->ht_qualname = qualname.inc_ref().ptr(); +#endif + + auto type = &heap_type->ht_type; + type->tp_name = full_name; + type->tp_doc = tp_doc; + type->tp_base = type_incref((PyTypeObject *)base); + type->tp_basicsize = static_cast(sizeof(instance)); + if (bases.size() > 0) + type->tp_bases = bases.release().ptr(); + + /* Don't inherit base __init__ */ + type->tp_init = pybind11_object_init; + + /* Supported protocols */ + type->tp_as_number = &heap_type->as_number; + type->tp_as_sequence = &heap_type->as_sequence; + type->tp_as_mapping = &heap_type->as_mapping; + + /* Flags */ + type->tp_flags |= Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE; +#if PY_MAJOR_VERSION < 3 + type->tp_flags |= Py_TPFLAGS_CHECKTYPES; +#endif + + if (rec.dynamic_attr) + enable_dynamic_attributes(heap_type); + + if (rec.buffer_protocol) + enable_buffer_protocol(heap_type); + + if (PyType_Ready(type) < 0) + pybind11_fail(std::string(rec.name) + ": PyType_Ready failed (" + error_string() + ")!"); + + assert(rec.dynamic_attr ? PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC) + : !PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC)); + + /* Register type with the parent scope */ + if (rec.scope) + setattr(rec.scope, rec.name, (PyObject *) type); + else + Py_INCREF(type); // Keep it alive forever (reference leak) + + if (module) // Needed by pydoc + setattr((PyObject *) type, "__module__", module); + + PYBIND11_SET_OLDPY_QUALNAME(type, qualname); + + return (PyObject *) type; +} + +NAMESPACE_END(detail) +NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/python/src/pybind11/detail/common.h b/python/src/pybind11/detail/common.h new file mode 100644 index 000000000..bec8ccf3b --- /dev/null +++ b/python/src/pybind11/detail/common.h @@ -0,0 +1,807 @@ +/* + pybind11/detail/common.h -- Basic macros + + Copyright (c) 2016 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. +*/ + +#pragma once + +#if !defined(NAMESPACE_BEGIN) +# define NAMESPACE_BEGIN(name) namespace name { +#endif +#if !defined(NAMESPACE_END) +# define NAMESPACE_END(name) } +#endif + +// Robust support for some features and loading modules compiled against different pybind versions +// requires forcing hidden visibility on pybind code, so we enforce this by setting the attribute on +// the main `pybind11` namespace. 
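+// Under GCC/Clang the macro below therefore makes
+//     NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+// expand to roughly
+//     namespace pybind11 __attribute__((visibility("hidden"))) {
+// so pybind11 symbols default to hidden visibility in each extension module.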
+#if !defined(PYBIND11_NAMESPACE) +# ifdef __GNUG__ +# define PYBIND11_NAMESPACE pybind11 __attribute__((visibility("hidden"))) +# else +# define PYBIND11_NAMESPACE pybind11 +# endif +#endif + +#if !(defined(_MSC_VER) && __cplusplus == 199711L) && !defined(__INTEL_COMPILER) +# if __cplusplus >= 201402L +# define PYBIND11_CPP14 +# if __cplusplus >= 201703L +# define PYBIND11_CPP17 +# endif +# endif +#elif defined(_MSC_VER) && __cplusplus == 199711L +// MSVC sets _MSVC_LANG rather than __cplusplus (supposedly until the standard is fully implemented) +// Unless you use the /Zc:__cplusplus flag on Visual Studio 2017 15.7 Preview 3 or newer +# if _MSVC_LANG >= 201402L +# define PYBIND11_CPP14 +# if _MSVC_LANG > 201402L && _MSC_VER >= 1910 +# define PYBIND11_CPP17 +# endif +# endif +#endif + +// Compiler version assertions +#if defined(__INTEL_COMPILER) +# if __INTEL_COMPILER < 1700 +# error pybind11 requires Intel C++ compiler v17 or newer +# endif +#elif defined(__clang__) && !defined(__apple_build_version__) +# if __clang_major__ < 3 || (__clang_major__ == 3 && __clang_minor__ < 3) +# error pybind11 requires clang 3.3 or newer +# endif +#elif defined(__clang__) +// Apple changes clang version macros to its Xcode version; the first Xcode release based on +// (upstream) clang 3.3 was Xcode 5: +# if __clang_major__ < 5 +# error pybind11 requires Xcode/clang 5.0 or newer +# endif +#elif defined(__GNUG__) +# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 8) +# error pybind11 requires gcc 4.8 or newer +# endif +#elif defined(_MSC_VER) +// Pybind hits various compiler bugs in 2015u2 and earlier, and also makes use of some stl features +// (e.g. std::negation) added in 2015u3: +# if _MSC_FULL_VER < 190024210 +# error pybind11 requires MSVC 2015 update 3 or newer +# endif +#endif + +#if !defined(PYBIND11_EXPORT) +# if defined(WIN32) || defined(_WIN32) +# define PYBIND11_EXPORT __declspec(dllexport) +# else +# define PYBIND11_EXPORT __attribute__ ((visibility("default"))) +# endif +#endif + +#if defined(_MSC_VER) +# define PYBIND11_NOINLINE __declspec(noinline) +#else +# define PYBIND11_NOINLINE __attribute__ ((noinline)) +#endif + +#if defined(PYBIND11_CPP14) +# define PYBIND11_DEPRECATED(reason) [[deprecated(reason)]] +#else +# define PYBIND11_DEPRECATED(reason) __attribute__((deprecated(reason))) +#endif + +#define PYBIND11_VERSION_MAJOR 2 +#define PYBIND11_VERSION_MINOR 3 +#define PYBIND11_VERSION_PATCH 0 + +/// Include Python header, disable linking to pythonX_d.lib on Windows in debug mode +#if defined(_MSC_VER) +# if (PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION < 4) +# define HAVE_ROUND 1 +# endif +# pragma warning(push) +# pragma warning(disable: 4510 4610 4512 4005) +# if defined(_DEBUG) +# define PYBIND11_DEBUG_MARKER +# undef _DEBUG +# endif +#endif + +#include +#include +#include + +#if defined(_WIN32) && (defined(min) || defined(max)) +# error Macro clash with min and max -- define NOMINMAX when compiling your program on Windows +#endif + +#if defined(isalnum) +# undef isalnum +# undef isalpha +# undef islower +# undef isspace +# undef isupper +# undef tolower +# undef toupper +#endif + +#if defined(_MSC_VER) +# if defined(PYBIND11_DEBUG_MARKER) +# define _DEBUG +# undef PYBIND11_DEBUG_MARKER +# endif +# pragma warning(pop) +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if PY_MAJOR_VERSION >= 3 /// Compatibility macros for various Python versions +#define PYBIND11_INSTANCE_METHOD_NEW(ptr, class_) 
PyInstanceMethod_New(ptr) +#define PYBIND11_INSTANCE_METHOD_CHECK PyInstanceMethod_Check +#define PYBIND11_INSTANCE_METHOD_GET_FUNCTION PyInstanceMethod_GET_FUNCTION +#define PYBIND11_BYTES_CHECK PyBytes_Check +#define PYBIND11_BYTES_FROM_STRING PyBytes_FromString +#define PYBIND11_BYTES_FROM_STRING_AND_SIZE PyBytes_FromStringAndSize +#define PYBIND11_BYTES_AS_STRING_AND_SIZE PyBytes_AsStringAndSize +#define PYBIND11_BYTES_AS_STRING PyBytes_AsString +#define PYBIND11_BYTES_SIZE PyBytes_Size +#define PYBIND11_LONG_CHECK(o) PyLong_Check(o) +#define PYBIND11_LONG_AS_LONGLONG(o) PyLong_AsLongLong(o) +#define PYBIND11_LONG_FROM_SIGNED(o) PyLong_FromSsize_t((ssize_t) o) +#define PYBIND11_LONG_FROM_UNSIGNED(o) PyLong_FromSize_t((size_t) o) +#define PYBIND11_BYTES_NAME "bytes" +#define PYBIND11_STRING_NAME "str" +#define PYBIND11_SLICE_OBJECT PyObject +#define PYBIND11_FROM_STRING PyUnicode_FromString +#define PYBIND11_STR_TYPE ::pybind11::str +#define PYBIND11_BOOL_ATTR "__bool__" +#define PYBIND11_NB_BOOL(ptr) ((ptr)->nb_bool) +#define PYBIND11_PLUGIN_IMPL(name) \ + extern "C" PYBIND11_EXPORT PyObject *PyInit_##name() + +#else +#define PYBIND11_INSTANCE_METHOD_NEW(ptr, class_) PyMethod_New(ptr, nullptr, class_) +#define PYBIND11_INSTANCE_METHOD_CHECK PyMethod_Check +#define PYBIND11_INSTANCE_METHOD_GET_FUNCTION PyMethod_GET_FUNCTION +#define PYBIND11_BYTES_CHECK PyString_Check +#define PYBIND11_BYTES_FROM_STRING PyString_FromString +#define PYBIND11_BYTES_FROM_STRING_AND_SIZE PyString_FromStringAndSize +#define PYBIND11_BYTES_AS_STRING_AND_SIZE PyString_AsStringAndSize +#define PYBIND11_BYTES_AS_STRING PyString_AsString +#define PYBIND11_BYTES_SIZE PyString_Size +#define PYBIND11_LONG_CHECK(o) (PyInt_Check(o) || PyLong_Check(o)) +#define PYBIND11_LONG_AS_LONGLONG(o) (PyInt_Check(o) ? (long long) PyLong_AsLong(o) : PyLong_AsLongLong(o)) +#define PYBIND11_LONG_FROM_SIGNED(o) PyInt_FromSsize_t((ssize_t) o) // Returns long if needed. +#define PYBIND11_LONG_FROM_UNSIGNED(o) PyInt_FromSize_t((size_t) o) // Returns long if needed. +#define PYBIND11_BYTES_NAME "str" +#define PYBIND11_STRING_NAME "unicode" +#define PYBIND11_SLICE_OBJECT PySliceObject +#define PYBIND11_FROM_STRING PyString_FromString +#define PYBIND11_STR_TYPE ::pybind11::bytes +#define PYBIND11_BOOL_ATTR "__nonzero__" +#define PYBIND11_NB_BOOL(ptr) ((ptr)->nb_nonzero) +#define PYBIND11_PLUGIN_IMPL(name) \ + static PyObject *pybind11_init_wrapper(); \ + extern "C" PYBIND11_EXPORT void init##name() { \ + (void)pybind11_init_wrapper(); \ + } \ + PyObject *pybind11_init_wrapper() +#endif + +#if PY_VERSION_HEX >= 0x03050000 && PY_VERSION_HEX < 0x03050200 +extern "C" { + struct _Py_atomic_address { void *value; }; + PyAPI_DATA(_Py_atomic_address) _PyThreadState_Current; +} +#endif + +#define PYBIND11_TRY_NEXT_OVERLOAD ((PyObject *) 1) // special failure return code +#define PYBIND11_STRINGIFY(x) #x +#define PYBIND11_TOSTRING(x) PYBIND11_STRINGIFY(x) +#define PYBIND11_CONCAT(first, second) first##second + +#define PYBIND11_CHECK_PYTHON_VERSION \ + { \ + const char *compiled_ver = PYBIND11_TOSTRING(PY_MAJOR_VERSION) \ + "." 
PYBIND11_TOSTRING(PY_MINOR_VERSION); \ + const char *runtime_ver = Py_GetVersion(); \ + size_t len = std::strlen(compiled_ver); \ + if (std::strncmp(runtime_ver, compiled_ver, len) != 0 \ + || (runtime_ver[len] >= '0' && runtime_ver[len] <= '9')) { \ + PyErr_Format(PyExc_ImportError, \ + "Python version mismatch: module was compiled for Python %s, " \ + "but the interpreter version is incompatible: %s.", \ + compiled_ver, runtime_ver); \ + return nullptr; \ + } \ + } + +#define PYBIND11_CATCH_INIT_EXCEPTIONS \ + catch (pybind11::error_already_set &e) { \ + PyErr_SetString(PyExc_ImportError, e.what()); \ + return nullptr; \ + } catch (const std::exception &e) { \ + PyErr_SetString(PyExc_ImportError, e.what()); \ + return nullptr; \ + } \ + +/** \rst + ***Deprecated in favor of PYBIND11_MODULE*** + + This macro creates the entry point that will be invoked when the Python interpreter + imports a plugin library. Please create a `module` in the function body and return + the pointer to its underlying Python object at the end. + + .. code-block:: cpp + + PYBIND11_PLUGIN(example) { + pybind11::module m("example", "pybind11 example plugin"); + /// Set up bindings here + return m.ptr(); + } +\endrst */ +#define PYBIND11_PLUGIN(name) \ + PYBIND11_DEPRECATED("PYBIND11_PLUGIN is deprecated, use PYBIND11_MODULE") \ + static PyObject *pybind11_init(); \ + PYBIND11_PLUGIN_IMPL(name) { \ + PYBIND11_CHECK_PYTHON_VERSION \ + try { \ + return pybind11_init(); \ + } PYBIND11_CATCH_INIT_EXCEPTIONS \ + } \ + PyObject *pybind11_init() + +/** \rst + This macro creates the entry point that will be invoked when the Python interpreter + imports an extension module. The module name is given as the fist argument and it + should not be in quotes. The second macro argument defines a variable of type + `py::module` which can be used to initialize the module. + + .. code-block:: cpp + + PYBIND11_MODULE(example, m) { + m.doc() = "pybind11 example module"; + + // Add bindings here + m.def("foo", []() { + return "Hello, World!"; + }); + } +\endrst */ +#define PYBIND11_MODULE(name, variable) \ + static void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &); \ + PYBIND11_PLUGIN_IMPL(name) { \ + PYBIND11_CHECK_PYTHON_VERSION \ + auto m = pybind11::module(PYBIND11_TOSTRING(name)); \ + try { \ + PYBIND11_CONCAT(pybind11_init_, name)(m); \ + return m.ptr(); \ + } PYBIND11_CATCH_INIT_EXCEPTIONS \ + } \ + void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &variable) + + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) + +using ssize_t = Py_ssize_t; +using size_t = std::size_t; + +/// Approach used to cast a previously unknown C++ instance into a Python object +enum class return_value_policy : uint8_t { + /** This is the default return value policy, which falls back to the policy + return_value_policy::take_ownership when the return value is a pointer. + Otherwise, it uses return_value::move or return_value::copy for rvalue + and lvalue references, respectively. See below for a description of what + all of these different policies do. */ + automatic = 0, + + /** As above, but use policy return_value_policy::reference when the return + value is a pointer. This is the default conversion policy for function + arguments when calling Python functions manually from C++ code (i.e. via + handle::operator()). You probably won't need to use this. */ + automatic_reference, + + /** Reference an existing object (i.e. do not create a new copy) and take + ownership. 
Python will call the destructor and delete operator when the + object’s reference count reaches zero. Undefined behavior ensues when + the C++ side does the same.. */ + take_ownership, + + /** Create a new copy of the returned object, which will be owned by + Python. This policy is comparably safe because the lifetimes of the two + instances are decoupled. */ + copy, + + /** Use std::move to move the return value contents into a new instance + that will be owned by Python. This policy is comparably safe because the + lifetimes of the two instances (move source and destination) are + decoupled. */ + move, + + /** Reference an existing object, but do not take ownership. The C++ side + is responsible for managing the object’s lifetime and deallocating it + when it is no longer used. Warning: undefined behavior will ensue when + the C++ side deletes an object that is still referenced and used by + Python. */ + reference, + + /** This policy only applies to methods and properties. It references the + object without taking ownership similar to the above + return_value_policy::reference policy. In contrast to that policy, the + function or property’s implicit this argument (called the parent) is + considered to be the the owner of the return value (the child). + pybind11 then couples the lifetime of the parent to the child via a + reference relationship that ensures that the parent cannot be garbage + collected while Python is still using the child. More advanced + variations of this scheme are also possible using combinations of + return_value_policy::reference and the keep_alive call policy */ + reference_internal +}; + +NAMESPACE_BEGIN(detail) + +inline static constexpr int log2(size_t n, int k = 0) { return (n <= 1) ? k : log2(n >> 1, k + 1); } + +// Returns the size as a multiple of sizeof(void *), rounded up. +inline static constexpr size_t size_in_ptrs(size_t s) { return 1 + ((s - 1) >> log2(sizeof(void *))); } + +/** + * The space to allocate for simple layout instance holders (see below) in multiple of the size of + * a pointer (e.g. 2 means 16 bytes on 64-bit architectures). The default is the minimum required + * to holder either a std::unique_ptr or std::shared_ptr (which is almost always + * sizeof(std::shared_ptr)). + */ +constexpr size_t instance_simple_holder_in_ptrs() { + static_assert(sizeof(std::shared_ptr) >= sizeof(std::unique_ptr), + "pybind assumes std::shared_ptrs are at least as big as std::unique_ptrs"); + return size_in_ptrs(sizeof(std::shared_ptr)); +} + +// Forward declarations +struct type_info; +struct value_and_holder; + +struct nonsimple_values_and_holders { + void **values_and_holders; + uint8_t *status; +}; + +/// The 'instance' type which needs to be standard layout (need to be able to use 'offsetof') +struct instance { + PyObject_HEAD + /// Storage for pointers and holder; see simple_layout, below, for a description + union { + void *simple_value_holder[1 + instance_simple_holder_in_ptrs()]; + nonsimple_values_and_holders nonsimple; + }; + /// Weak references + PyObject *weakrefs; + /// If true, the pointer is owned which means we're free to manage it with a holder. + bool owned : 1; + /** + * An instance has two possible value/holder layouts. + * + * Simple layout (when this flag is true), means the `simple_value_holder` is set with a pointer + * and the holder object governing that pointer, i.e. [val1*][holder]. 
This layout is applied + * whenever there is no python-side multiple inheritance of bound C++ types *and* the type's + * holder will fit in the default space (which is large enough to hold either a std::unique_ptr + * or std::shared_ptr). + * + * Non-simple layout applies when using custom holders that require more space than `shared_ptr` + * (which is typically the size of two pointers), or when multiple inheritance is used on the + * python side. Non-simple layout allocates the required amount of memory to have multiple + * bound C++ classes as parents. Under this layout, `nonsimple.values_and_holders` is set to a + * pointer to allocated space of the required space to hold a sequence of value pointers and + * holders followed `status`, a set of bit flags (1 byte each), i.e. + * [val1*][holder1][val2*][holder2]...[bb...] where each [block] is rounded up to a multiple of + * `sizeof(void *)`. `nonsimple.status` is, for convenience, a pointer to the + * beginning of the [bb...] block (but not independently allocated). + * + * Status bits indicate whether the associated holder is constructed (& + * status_holder_constructed) and whether the value pointer is registered (& + * status_instance_registered) in `registered_instances`. + */ + bool simple_layout : 1; + /// For simple layout, tracks whether the holder has been constructed + bool simple_holder_constructed : 1; + /// For simple layout, tracks whether the instance is registered in `registered_instances` + bool simple_instance_registered : 1; + /// If true, get_internals().patients has an entry for this object + bool has_patients : 1; + + /// Initializes all of the above type/values/holders data (but not the instance values themselves) + void allocate_layout(); + + /// Destroys/deallocates all of the above + void deallocate_layout(); + + /// Returns the value_and_holder wrapper for the given type (or the first, if `find_type` + /// omitted). Returns a default-constructed (with `.inst = nullptr`) object on failure if + /// `throw_if_missing` is false. 
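+    // Sketch of the two layouts (illustrative, single-inheritance case assumed):
+    // with one bound base and a std::unique_ptr<T> holder, simple_layout is true
+    // and simple_value_holder holds [T* value][unique_ptr<T> holder] directly;
+    // with python-side MI, each bound base contributes its own value/holder pair
+    // in the nonsimple block and is retrieved by passing that base's type_info
+    // to the accessor below.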
+ value_and_holder get_value_and_holder(const type_info *find_type = nullptr, bool throw_if_missing = true); + + /// Bit values for the non-simple status flags + static constexpr uint8_t status_holder_constructed = 1; + static constexpr uint8_t status_instance_registered = 2; +}; + +static_assert(std::is_standard_layout::value, "Internal error: `pybind11::detail::instance` is not standard layout!"); + +/// from __cpp_future__ import (convenient aliases from C++14/17) +#if defined(PYBIND11_CPP14) && (!defined(_MSC_VER) || _MSC_VER >= 1910) +using std::enable_if_t; +using std::conditional_t; +using std::remove_cv_t; +using std::remove_reference_t; +#else +template using enable_if_t = typename std::enable_if::type; +template using conditional_t = typename std::conditional::type; +template using remove_cv_t = typename std::remove_cv::type; +template using remove_reference_t = typename std::remove_reference::type; +#endif + +/// Index sequences +#if defined(PYBIND11_CPP14) +using std::index_sequence; +using std::make_index_sequence; +#else +template struct index_sequence { }; +template struct make_index_sequence_impl : make_index_sequence_impl { }; +template struct make_index_sequence_impl <0, S...> { typedef index_sequence type; }; +template using make_index_sequence = typename make_index_sequence_impl::type; +#endif + +/// Make an index sequence of the indices of true arguments +template struct select_indices_impl { using type = ISeq; }; +template struct select_indices_impl, I, B, Bs...> + : select_indices_impl, index_sequence>, I + 1, Bs...> {}; +template using select_indices = typename select_indices_impl, 0, Bs...>::type; + +/// Backports of std::bool_constant and std::negation to accommodate older compilers +template using bool_constant = std::integral_constant; +template struct negation : bool_constant { }; + +template struct void_t_impl { using type = void; }; +template using void_t = typename void_t_impl::type; + +/// Compile-time all/any/none of that check the boolean value of all template types +#if defined(__cpp_fold_expressions) && !(defined(_MSC_VER) && (_MSC_VER < 1916)) +template using all_of = bool_constant<(Ts::value && ...)>; +template using any_of = bool_constant<(Ts::value || ...)>; +#elif !defined(_MSC_VER) +template struct bools {}; +template using all_of = std::is_same< + bools, + bools>; +template using any_of = negation...>>; +#else +// MSVC has trouble with the above, but supports std::conjunction, which we can use instead (albeit +// at a slight loss of compilation efficiency). +template using all_of = std::conjunction; +template using any_of = std::disjunction; +#endif +template using none_of = negation>; + +template class... Predicates> using satisfies_all_of = all_of...>; +template class... Predicates> using satisfies_any_of = any_of...>; +template class... 
Predicates> using satisfies_none_of = none_of...>; + +/// Strip the class from a method type +template struct remove_class { }; +template struct remove_class { typedef R type(A...); }; +template struct remove_class { typedef R type(A...); }; + +/// Helper template to strip away type modifiers +template struct intrinsic_type { typedef T type; }; +template struct intrinsic_type { typedef typename intrinsic_type::type type; }; +template struct intrinsic_type { typedef typename intrinsic_type::type type; }; +template struct intrinsic_type { typedef typename intrinsic_type::type type; }; +template struct intrinsic_type { typedef typename intrinsic_type::type type; }; +template struct intrinsic_type { typedef typename intrinsic_type::type type; }; +template struct intrinsic_type { typedef typename intrinsic_type::type type; }; +template using intrinsic_t = typename intrinsic_type::type; + +/// Helper type to replace 'void' in some expressions +struct void_type { }; + +/// Helper template which holds a list of types +template struct type_list { }; + +/// Compile-time integer sum +#ifdef __cpp_fold_expressions +template constexpr size_t constexpr_sum(Ts... ns) { return (0 + ... + size_t{ns}); } +#else +constexpr size_t constexpr_sum() { return 0; } +template +constexpr size_t constexpr_sum(T n, Ts... ns) { return size_t{n} + constexpr_sum(ns...); } +#endif + +NAMESPACE_BEGIN(constexpr_impl) +/// Implementation details for constexpr functions +constexpr int first(int i) { return i; } +template +constexpr int first(int i, T v, Ts... vs) { return v ? i : first(i + 1, vs...); } + +constexpr int last(int /*i*/, int result) { return result; } +template +constexpr int last(int i, int result, T v, Ts... vs) { return last(i + 1, v ? i : result, vs...); } +NAMESPACE_END(constexpr_impl) + +/// Return the index of the first type in Ts which satisfies Predicate. Returns sizeof...(Ts) if +/// none match. +template class Predicate, typename... Ts> +constexpr int constexpr_first() { return constexpr_impl::first(0, Predicate::value...); } + +/// Return the index of the last type in Ts which satisfies Predicate, or -1 if none match. +template class Predicate, typename... Ts> +constexpr int constexpr_last() { return constexpr_impl::last(0, -1, Predicate::value...); } + +/// Return the Nth element from the parameter pack +template +struct pack_element { using type = typename pack_element::type; }; +template +struct pack_element<0, T, Ts...> { using type = T; }; + +/// Return the one and only type which matches the predicate, or Default if none match. +/// If more than one type matches the predicate, fail at compile-time. +template class Predicate, typename Default, typename... Ts> +struct exactly_one { + static constexpr auto found = constexpr_sum(Predicate::value...); + static_assert(found <= 1, "Found more than one type matching the predicate"); + + static constexpr auto index = found ? constexpr_first() : 0; + using type = conditional_t::type, Default>; +}; +template class P, typename Default> +struct exactly_one { using type = Default; }; + +template class Predicate, typename Default, typename... Ts> +using exactly_one_t = typename exactly_one::type; + +/// Defer the evaluation of type T until types Us are instantiated +template struct deferred_type { using type = T; }; +template using deferred_t = typename deferred_type::type; + +/// Like is_base_of, but requires a strict base (i.e. 
`is_strict_base_of::value == false`, +/// unlike `std::is_base_of`) +template using is_strict_base_of = bool_constant< + std::is_base_of::value && !std::is_same::value>; + +/// Like is_base_of, but also requires that the base type is accessible (i.e. that a Derived pointer +/// can be converted to a Base pointer) +template using is_accessible_base_of = bool_constant< + std::is_base_of::value && std::is_convertible::value>; + +template class Base> +struct is_template_base_of_impl { + template static std::true_type check(Base *); + static std::false_type check(...); +}; + +/// Check if a template is the base of a type. For example: +/// `is_template_base_of` is true if `struct T : Base {}` where U can be anything +template class Base, typename T> +#if !defined(_MSC_VER) +using is_template_base_of = decltype(is_template_base_of_impl::check((intrinsic_t*)nullptr)); +#else // MSVC2015 has trouble with decltype in template aliases +struct is_template_base_of : decltype(is_template_base_of_impl::check((intrinsic_t*)nullptr)) { }; +#endif + +/// Check if T is an instantiation of the template `Class`. For example: +/// `is_instantiation` is true if `T == shared_ptr` where U can be anything. +template class Class, typename T> +struct is_instantiation : std::false_type { }; +template class Class, typename... Us> +struct is_instantiation> : std::true_type { }; + +/// Check if T is std::shared_ptr where U can be anything +template using is_shared_ptr = is_instantiation; + +/// Check if T looks like an input iterator +template struct is_input_iterator : std::false_type {}; +template +struct is_input_iterator()), decltype(++std::declval())>> + : std::true_type {}; + +template using is_function_pointer = bool_constant< + std::is_pointer::value && std::is_function::type>::value>; + +template struct strip_function_object { + using type = typename remove_class::type; +}; + +// Extracts the function signature from a function, function pointer or lambda. +template > +using function_signature_t = conditional_t< + std::is_function::value, + F, + typename conditional_t< + std::is_pointer::value || std::is_member_pointer::value, + std::remove_pointer, + strip_function_object + >::type +>; + +/// Returns true if the type looks like a lambda: that is, isn't a function, pointer or member +/// pointer. Note that this can catch all sorts of other things, too; this is intended to be used +/// in a place where passing a lambda makes sense. +template using is_lambda = satisfies_none_of, + std::is_function, std::is_pointer, std::is_member_pointer>; + +/// Ignore that a variable is unused in compiler warnings +inline void ignore_unused(const int *) { } + +/// Apply a function over each element of a parameter pack +#ifdef __cpp_fold_expressions +#define PYBIND11_EXPAND_SIDE_EFFECTS(PATTERN) (((PATTERN), void()), ...) 
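+// Hypothetical usage: given a parameter pack `args`, a call such as
+//     PYBIND11_EXPAND_SIDE_EFFECTS(process(args));
+// expands under C++17 folds to (((process(args)), void()), ...), evaluating
+// process() once per pack element, left to right.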
+#else +using expand_side_effects = bool[]; +#define PYBIND11_EXPAND_SIDE_EFFECTS(PATTERN) pybind11::detail::expand_side_effects{ ((PATTERN), void(), false)..., false } +#endif + +NAMESPACE_END(detail) + +/// C++ bindings of builtin Python exceptions +class builtin_exception : public std::runtime_error { +public: + using std::runtime_error::runtime_error; + /// Set the error using the Python C API + virtual void set_error() const = 0; +}; + +#define PYBIND11_RUNTIME_EXCEPTION(name, type) \ + class name : public builtin_exception { public: \ + using builtin_exception::builtin_exception; \ + name() : name("") { } \ + void set_error() const override { PyErr_SetString(type, what()); } \ + }; + +PYBIND11_RUNTIME_EXCEPTION(stop_iteration, PyExc_StopIteration) +PYBIND11_RUNTIME_EXCEPTION(index_error, PyExc_IndexError) +PYBIND11_RUNTIME_EXCEPTION(key_error, PyExc_KeyError) +PYBIND11_RUNTIME_EXCEPTION(value_error, PyExc_ValueError) +PYBIND11_RUNTIME_EXCEPTION(type_error, PyExc_TypeError) +PYBIND11_RUNTIME_EXCEPTION(cast_error, PyExc_RuntimeError) /// Thrown when pybind11::cast or handle::call fail due to a type casting error +PYBIND11_RUNTIME_EXCEPTION(reference_cast_error, PyExc_RuntimeError) /// Used internally + +[[noreturn]] PYBIND11_NOINLINE inline void pybind11_fail(const char *reason) { throw std::runtime_error(reason); } +[[noreturn]] PYBIND11_NOINLINE inline void pybind11_fail(const std::string &reason) { throw std::runtime_error(reason); } + +template struct format_descriptor { }; + +NAMESPACE_BEGIN(detail) +// Returns the index of the given type in the type char array below, and in the list in numpy.h +// The order here is: bool; 8 ints ((signed,unsigned)x(8,16,32,64)bits); float,double,long double; +// complex float,double,long double. Note that the long double types only participate when long +// double is actually longer than double (it isn't under MSVC). +// NB: not only the string below but also complex.h and numpy.h rely on this order. +template struct is_fmt_numeric { static constexpr bool value = false; }; +template struct is_fmt_numeric::value>> { + static constexpr bool value = true; + static constexpr int index = std::is_same::value ? 0 : 1 + ( + std::is_integral::value ? detail::log2(sizeof(T))*2 + std::is_unsigned::value : 8 + ( + std::is_same::value ? 1 : std::is_same::value ? 
2 : 0)); +}; +NAMESPACE_END(detail) + +template struct format_descriptor::value>> { + static constexpr const char c = "?bBhHiIqQfdg"[detail::is_fmt_numeric::index]; + static constexpr const char value[2] = { c, '\0' }; + static std::string format() { return std::string(1, c); } +}; + +#if !defined(PYBIND11_CPP17) + +template constexpr const char format_descriptor< + T, detail::enable_if_t::value>>::value[2]; + +#endif + +/// RAII wrapper that temporarily clears any Python error state +struct error_scope { + PyObject *type, *value, *trace; + error_scope() { PyErr_Fetch(&type, &value, &trace); } + ~error_scope() { PyErr_Restore(type, value, trace); } +}; + +/// Dummy destructor wrapper that can be used to expose classes with a private destructor +struct nodelete { template void operator()(T*) { } }; + +// overload_cast requires variable templates: C++14 +#if defined(PYBIND11_CPP14) +#define PYBIND11_OVERLOAD_CAST 1 + +NAMESPACE_BEGIN(detail) +template +struct overload_cast_impl { + constexpr overload_cast_impl() {} // MSVC 2015 needs this + + template + constexpr auto operator()(Return (*pf)(Args...)) const noexcept + -> decltype(pf) { return pf; } + + template + constexpr auto operator()(Return (Class::*pmf)(Args...), std::false_type = {}) const noexcept + -> decltype(pmf) { return pmf; } + + template + constexpr auto operator()(Return (Class::*pmf)(Args...) const, std::true_type) const noexcept + -> decltype(pmf) { return pmf; } +}; +NAMESPACE_END(detail) + +/// Syntax sugar for resolving overloaded function pointers: +/// - regular: static_cast(&Class::func) +/// - sweet: overload_cast(&Class::func) +template +static constexpr detail::overload_cast_impl overload_cast = {}; +// MSVC 2015 only accepts this particular initialization syntax for this variable template. + +/// Const member function selector for overload_cast +/// - regular: static_cast(&Class::func) +/// - sweet: overload_cast(&Class::func, const_) +static constexpr auto const_ = std::true_type{}; + +#else // no overload_cast: providing something that static_assert-fails: +template struct overload_cast { + static_assert(detail::deferred_t::value, + "pybind11::overload_cast<...> requires compiling in C++14 mode"); +}; +#endif // overload_cast + +NAMESPACE_BEGIN(detail) + +// Adaptor for converting arbitrary container arguments into a vector; implicitly convertible from +// any standard container (or C-style array) supporting std::begin/std::end, any singleton +// arithmetic type (if T is arithmetic), or explicitly constructible from an iterator pair. +template +class any_container { + std::vector v; +public: + any_container() = default; + + // Can construct from a pair of iterators + template ::value>> + any_container(It first, It last) : v(first, last) { } + + // Implicit conversion constructor from any arbitrary container type with values convertible to T + template ())), T>::value>> + any_container(const Container &c) : any_container(std::begin(c), std::end(c)) { } + + // initializer_list's aren't deducible, so don't get matched by the above template; we need this + // to explicitly allow implicit conversion from one: + template ::value>> + any_container(const std::initializer_list &c) : any_container(c.begin(), c.end()) { } + + // Avoid copying if given an rvalue vector of the correct type. 
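+    // Hypothetical call sites this adaptor accepts:
+    //     any_container<ssize_t> a(std::vector<ssize_t>{2, 3}); // moved in below
+    //     any_container<ssize_t> b({2, 3});                     // initializer_list
+    //     int shape[2] = {2, 3};
+    //     any_container<ssize_t> c(shape);                      // C array via std::begin/end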
+ any_container(std::vector &&v) : v(std::move(v)) { } + + // Moves the vector out of an rvalue any_container + operator std::vector &&() && { return std::move(v); } + + // Dereferencing obtains a reference to the underlying vector + std::vector &operator*() { return v; } + const std::vector &operator*() const { return v; } + + // -> lets you call methods on the underlying vector + std::vector *operator->() { return &v; } + const std::vector *operator->() const { return &v; } +}; + +NAMESPACE_END(detail) + + + +NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/python/src/pybind11/detail/descr.h b/python/src/pybind11/detail/descr.h new file mode 100644 index 000000000..8d404e534 --- /dev/null +++ b/python/src/pybind11/detail/descr.h @@ -0,0 +1,100 @@ +/* + pybind11/detail/descr.h: Helper type for concatenating type signatures at compile time + + Copyright (c) 2016 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. +*/ + +#pragma once + +#include "common.h" + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +NAMESPACE_BEGIN(detail) + +#if !defined(_MSC_VER) +# define PYBIND11_DESCR_CONSTEXPR static constexpr +#else +# define PYBIND11_DESCR_CONSTEXPR const +#endif + +/* Concatenate type signatures at compile time */ +template +struct descr { + char text[N + 1]; + + constexpr descr() : text{'\0'} { } + constexpr descr(char const (&s)[N+1]) : descr(s, make_index_sequence()) { } + + template + constexpr descr(char const (&s)[N+1], index_sequence) : text{s[Is]..., '\0'} { } + + template + constexpr descr(char c, Chars... cs) : text{c, static_cast(cs)..., '\0'} { } + + static constexpr std::array types() { + return {{&typeid(Ts)..., nullptr}}; + } +}; + +template +constexpr descr plus_impl(const descr &a, const descr &b, + index_sequence, index_sequence) { + return {a.text[Is1]..., b.text[Is2]...}; +} + +template +constexpr descr operator+(const descr &a, const descr &b) { + return plus_impl(a, b, make_index_sequence(), make_index_sequence()); +} + +template +constexpr descr _(char const(&text)[N]) { return descr(text); } +constexpr descr<0> _(char const(&)[1]) { return {}; } + +template struct int_to_str : int_to_str { }; +template struct int_to_str<0, Digits...> { + static constexpr auto digits = descr(('0' + Digits)...); +}; + +// Ternary description (like std::conditional) +template +constexpr enable_if_t> _(char const(&text1)[N1], char const(&)[N2]) { + return _(text1); +} +template +constexpr enable_if_t> _(char const(&)[N1], char const(&text2)[N2]) { + return _(text2); +} + +template +constexpr enable_if_t _(const T1 &d, const T2 &) { return d; } +template +constexpr enable_if_t _(const T1 &, const T2 &d) { return d; } + +template auto constexpr _() -> decltype(int_to_str::digits) { + return int_to_str::digits; +} + +template constexpr descr<1, Type> _() { return {'%'}; } + +constexpr descr<0> concat() { return {}; } + +template +constexpr descr concat(const descr &descr) { return descr; } + +template +constexpr auto concat(const descr &d, const Args &...args) + -> decltype(std::declval>() + concat(args...)) { + return d + _(", ") + concat(args...); +} + +template +constexpr descr type_descr(const descr &descr) { + return _("{") + descr + _("}"); +} + +NAMESPACE_END(detail) +NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/python/src/pybind11/detail/init.h b/python/src/pybind11/detail/init.h new file mode 100644 index 000000000..acfe00bdb --- /dev/null +++ b/python/src/pybind11/detail/init.h @@ -0,0 +1,335 @@ +/* + 
pybind11/detail/init.h: init factory function implementation and support code.
+
+    Copyright (c) 2017 Jason Rhinelander <jason@imaginary.ca>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "class.h"
+
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+NAMESPACE_BEGIN(detail)
+
+template <>
+class type_caster<value_and_holder> {
+public:
+    bool load(handle h, bool) {
+        value = reinterpret_cast<value_and_holder *>(h.ptr());
+        return true;
+    }
+
+    template <typename> using cast_op_type = value_and_holder &;
+    operator value_and_holder &() { return *value; }
+    static constexpr auto name = _<value_and_holder>();
+
+private:
+    value_and_holder *value = nullptr;
+};
+
+NAMESPACE_BEGIN(initimpl)
+
+inline void no_nullptr(void *ptr) {
+    if (!ptr) throw type_error("pybind11::init(): factory function returned nullptr");
+}
+
+// Implementing functions for all forms of py::init<...> and py::init(...)
+template <typename Class> using Cpp = typename Class::type;
+template <typename Class> using Alias = typename Class::type_alias;
+template <typename Class> using Holder = typename Class::holder_type;
+
+template <typename Class> using is_alias_constructible = std::is_constructible<Alias<Class>, Cpp<Class> &&>;
+
+// Takes a Cpp pointer and returns true if it actually is a polymorphic Alias instance.
+template <typename Class, enable_if_t<Class::has_alias, int> = 0>
+bool is_alias(Cpp<Class> *ptr) {
+    return dynamic_cast<Alias<Class> *>(ptr) != nullptr;
+}
+// Failing fallback version of the above for a no-alias class (always returns false)
+template <typename /*Class*/>
+constexpr bool is_alias(void *) { return false; }
+
+// Constructs and returns a new object; if the given arguments don't map to a constructor, we fall
+// back to brace aggregate initialization so that aggregate initialization can be used with
+// py::init, e.g. `py::init<int, int>` to initialize a `struct T { int a; int b; }`. For
+// non-aggregate types, we need to use an ordinary T(...) constructor (invoking as `T{...}` usually
+// works, but will not do the expected thing when `T` has an `initializer_list` constructor).
+template <typename Class, typename... Args, detail::enable_if_t<std::is_constructible<Class, Args...>::value, int> = 0>
+inline Class *construct_or_initialize(Args &&...args) { return new Class(std::forward<Args>(args)...); }
+template <typename Class, typename... Args, detail::enable_if_t<!std::is_constructible<Class, Args...>::value, int> = 0>
+inline Class *construct_or_initialize(Args &&...args) { return new Class{std::forward<Args>(args)...}; }
+
+// Attempts to construct an alias using an `Alias(Cpp &&)` constructor. This allows types with
+// an alias to provide only a single Cpp factory function as long as the Alias can be
+// constructed from an rvalue reference of the base Cpp type. This means that Alias classes
+// can, when appropriate, simply define an `Alias(Cpp &&)` constructor rather than needing to
+// inherit all the base class constructors.
+template <typename Class>
+void construct_alias_from_cpp(std::true_type /*is_alias_constructible*/,
+                              value_and_holder &v_h, Cpp<Class> &&base) {
+    v_h.value_ptr() = new Alias<Class>(std::move(base));
+}
+template <typename Class>
+[[noreturn]] void construct_alias_from_cpp(std::false_type /*!is_alias_constructible*/,
+                                           value_and_holder &, Cpp<Class> &&) {
+    throw type_error("pybind11::init(): unable to convert returned instance to required "
+                     "alias class: no `Alias<Class>(Class &&)` constructor available");
+}
+
+// Error-generating fallback for factories that don't match one of the below construction
+// mechanisms.
+template <typename Class>
+void construct(...) {
+    static_assert(!std::is_same<Class, Class>::value /* always false */,
+                  "pybind11::init(): init function must return a compatible pointer, "
+                  "holder, or value");
+}
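+
+// A minimal usage sketch (not part of the upstream header; `Aggregate` and `m`
+// are hypothetical): the construct_or_initialize() pair above is what lets
+// py::init<...> bind a type with no user-declared constructor, by falling back
+// to brace initialization when no matching T(...) constructor exists:
+//
+//     struct Aggregate { int a; int b; };          // aggregate, no constructor
+//
+//     py::class_<Aggregate>(m, "Aggregate")
+//         .def(py::init<int, int>());              // performs new Aggregate{a, b}
+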
+// Pointer return v1: the factory function returns a class pointer for a registered class.
+// If we don't need an alias (because this class doesn't have one, or because the final type
+// isn't inherited on the Python side) we can simply take over ownership. Otherwise we need
+// to try to construct an Alias from the returned base instance.
+template <typename Class>
+void construct(value_and_holder &v_h, Cpp<Class> *ptr, bool need_alias) {
+    no_nullptr(ptr);
+    if (Class::has_alias && need_alias && !is_alias<Class>(ptr)) {
+        // We're going to try to construct an alias by moving the cpp type. Whether or not
+        // that succeeds, we still need to destroy the original cpp pointer (either the
+        // moved away leftover, if the alias construction works, or the value itself if we
+        // throw an error), but we can't just call `delete ptr`: it might have a special
+        // deleter, or might be shared_from_this. So we construct a holder around it as if
+        // it was a normal instance, then steal the holder away into a local variable; thus
+        // the holder and destruction happens when we leave the C++ scope, and the holder
+        // class gets to handle the destruction however it likes.
+        v_h.value_ptr() = ptr;
+        v_h.set_instance_registered(true); // To prevent init_instance from registering it
+        v_h.type->init_instance(v_h.inst, nullptr); // Set up the holder
+        Holder<Class> temp_holder(std::move(v_h.holder<Holder<Class>>())); // Steal the holder
+        v_h.type->dealloc(v_h); // Destroys the moved-out holder remains, resets value ptr to null
+        v_h.set_instance_registered(false);
+
+        construct_alias_from_cpp<Class>(is_alias_constructible<Class>{}, v_h, std::move(*ptr));
+    } else {
+        // Otherwise the type isn't inherited, so we don't need an Alias
+        v_h.value_ptr() = ptr;
+    }
+}
+
+// Pointer return v2: a factory that always returns an alias instance ptr. We simply take over
+// ownership of the pointer.
+template <typename Class, enable_if_t<Class::has_alias, int> = 0>
+void construct(value_and_holder &v_h, Alias<Class> *alias_ptr, bool) {
+    no_nullptr(alias_ptr);
+    v_h.value_ptr() = static_cast<Cpp<Class> *>(alias_ptr);
+}
+
+// Holder return: copy its pointer, and move or copy the returned holder into the new instance's
+// holder. This also handles types like std::shared_ptr<T> and std::unique_ptr<T> where T is a
+// derived type (through those holder's implicit conversion from derived class holder constructors).
+template <typename Class>
+void construct(value_and_holder &v_h, Holder<Class> holder, bool need_alias) {
+    auto *ptr = holder_helper<Holder<Class>>::get(holder);
+    // If we need an alias, check that the held pointer is actually an alias instance
+    if (Class::has_alias && need_alias && !is_alias<Class>(ptr))
+        throw type_error("pybind11::init(): construction failed: returned holder-wrapped instance "
+                         "is not an alias instance");
+
+    v_h.value_ptr() = ptr;
+    v_h.type->init_instance(v_h.inst, &holder);
+}
+
+// return-by-value version 1: returning a cpp class by value. If the class has an alias and an
+// alias is required the alias must have an `Alias(Cpp &&)` constructor so that we can construct
+// the alias from the base when needed (i.e. because of Python-side inheritance). When we don't
+// need it, we simply move-construct the cpp value into a new instance.
+template <typename Class>
+void construct(value_and_holder &v_h, Cpp<Class> &&result, bool need_alias) {
+    static_assert(std::is_move_constructible<Cpp<Class>>::value,
+        "pybind11::init() return-by-value factory function requires a movable class");
+    if (Class::has_alias && need_alias)
+        construct_alias_from_cpp<Class>(is_alias_constructible<Class>{}, v_h, std::move(result));
+    else
+        v_h.value_ptr() = new Cpp<Class>(std::move(result));
+}
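+
+// A minimal usage sketch (not part of the upstream header; `Widget` and the
+// factory lambda are hypothetical) of the holder-return path above: the factory
+// returns a std::shared_ptr, and construct() stores its raw pointer and
+// initializes the new instance's holder from it.
+//
+//     py::class_<Widget, std::shared_ptr<Widget>>(m, "Widget")
+//         .def(py::init([](int size) {
+//             return std::make_shared<Widget>(size);  // Holder<Class> overload is selected
+//         }));
+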
+// return-by-value version 2: returning a value of the alias type itself. We move-construct an
+// Alias instance (even if no Python-side inheritance is involved). This is intended for
+// cases where Alias initialization is always desired.
+template <typename Class>
+void construct(value_and_holder &v_h, Alias<Class> &&result, bool) {
+    static_assert(std::is_move_constructible<Alias<Class>>::value,
+        "pybind11::init() return-by-alias-value factory function requires a movable alias class");
+    v_h.value_ptr() = new Alias<Class>(std::move(result));
+}
+
+// Implementing class for py::init<...>()
+template <typename... Args>
+struct constructor {
+    template <typename Class, typename... Extra, enable_if_t<!Class::has_alias, int> = 0>
+    static void execute(Class &cl, const Extra&... extra) {
+        cl.def("__init__", [](value_and_holder &v_h, Args... args) {
+            v_h.value_ptr() = construct_or_initialize<Cpp<Class>>(std::forward<Args>(args)...);
+        }, is_new_style_constructor(), extra...);
+    }
+
+    template <typename Class, typename... Extra,
+              enable_if_t<Class::has_alias &&
+                          std::is_constructible<Cpp<Class>, Args...>::value, int> = 0>
+    static void execute(Class &cl, const Extra&... extra) {
+        cl.def("__init__", [](value_and_holder &v_h, Args... args) {
+            if (Py_TYPE(v_h.inst) == v_h.type->type)
+                v_h.value_ptr() = construct_or_initialize<Cpp<Class>>(std::forward<Args>(args)...);
+            else
+                v_h.value_ptr() = construct_or_initialize<Alias<Class>>(std::forward<Args>(args)...);
+        }, is_new_style_constructor(), extra...);
+    }
+
+    template <typename Class, typename... Extra,
+              enable_if_t<Class::has_alias &&
+                          !std::is_constructible<Cpp<Class>, Args...>::value, int> = 0>
+    static void execute(Class &cl, const Extra&... extra) {
+        cl.def("__init__", [](value_and_holder &v_h, Args... args) {
+            v_h.value_ptr() = construct_or_initialize<Alias<Class>>(std::forward<Args>(args)...);
+        }, is_new_style_constructor(), extra...);
+    }
+};
+
+// Implementing class for py::init_alias<...>()
+template <typename... Args> struct alias_constructor {
+    template <typename Class, typename... Extra,
+              enable_if_t<Class::has_alias && std::is_constructible<Alias<Class>, Args...>::value, int> = 0>
+    static void execute(Class &cl, const Extra&... extra) {
+        cl.def("__init__", [](value_and_holder &v_h, Args... args) {
+            v_h.value_ptr() = construct_or_initialize<Alias<Class>>(std::forward<Args>(args)...);
+        }, is_new_style_constructor(), extra...);
+    }
+};
+
+// Implementation class for py::init(Func) and py::init(Func, AliasFunc)
+template <typename CFunc, typename AFunc = void_type (*)(),
+          typename = function_signature_t<CFunc>, typename = function_signature_t<AFunc>>
+struct factory;
+
+// Specialization for py::init(Func)
+template <typename Func, typename Return, typename... Args>
+struct factory<Func, void_type (*)(), Return(Args...)> {
+    remove_reference_t<Func> class_factory;
+
+    factory(Func &&f) : class_factory(std::forward<Func>(f)) { }
+
+    // The given class either has no alias or has no separate alias factory;
+    // this always constructs the class itself. If the class is registered with an alias
+    // type and an alias instance is needed (i.e. because the final type is a Python class
+    // inheriting from the C++ type) the returned value needs to either already be an alias
+    // instance, or the alias needs to be constructible from a `Class &&` argument.
+    template <typename Class, typename... Extra>
+    void execute(Class &cl, const Extra &...extra) && {
+        #if defined(PYBIND11_CPP14)
+        cl.def("__init__", [func = std::move(class_factory)]
+        #else
+        auto &func = class_factory;
+        cl.def("__init__", [func]
+        #endif
+                (value_and_holder &v_h, Args... args) {
+            construct<Class>(v_h, func(std::forward<Args>(args)...),
+                             Py_TYPE(v_h.inst) != v_h.type->type);
+        }, is_new_style_constructor(), extra...);
+    }
+};
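+
+// A minimal usage sketch (not part of the upstream header; `Database` and
+// `Database::connect` are hypothetical) of the single-factory specialization
+// above: py::init() wraps an arbitrary callable, and whatever that callable
+// returns (a value, a raw pointer, or a holder) is dispatched to the matching
+// construct() overload earlier in this file.
+//
+//     py::class_<Database>(m, "Database")
+//         .def(py::init([](const std::string &uri) {
+//             return Database::connect(uri);
+//         }));
+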
+// Specialization for py::init(Func, AliasFunc)
+template <typename CFunc, typename AFunc,
+          typename CReturn, typename... CArgs, typename AReturn, typename... AArgs>
+struct factory<CFunc, AFunc, CReturn(CArgs...), AReturn(AArgs...)> {
+    static_assert(sizeof...(CArgs) == sizeof...(AArgs),
+                  "pybind11::init(class_factory, alias_factory): class and alias factories "
+                  "must have identical argument signatures");
+    static_assert(all_of<std::is_same<CArgs, AArgs>...>::value,
+                  "pybind11::init(class_factory, alias_factory): class and alias factories "
+                  "must have identical argument signatures");
+
+    remove_reference_t<CFunc> class_factory;
+    remove_reference_t<AFunc> alias_factory;
+
+    factory(CFunc &&c, AFunc &&a)
+        : class_factory(std::forward<CFunc>(c)), alias_factory(std::forward<AFunc>(a)) { }
+
+    // The class factory is called when the `self` type passed to `__init__` is the direct
+    // class (i.e. not inherited), the alias factory when `self` is a Python-side subtype.
+    template <typename Class, typename... Extra>
+    void execute(Class &cl, const Extra&... extra) && {
+        static_assert(Class::has_alias, "The two-argument version of `py::init()` can "
+                                        "only be used if the class has an alias");
+        #if defined(PYBIND11_CPP14)
+        cl.def("__init__", [class_func = std::move(class_factory), alias_func = std::move(alias_factory)]
+        #else
+        auto &class_func = class_factory;
+        auto &alias_func = alias_factory;
+        cl.def("__init__", [class_func, alias_func]
+        #endif
+                (value_and_holder &v_h, CArgs... args) {
+            if (Py_TYPE(v_h.inst) == v_h.type->type)
+                // If the instance type equals the registered type we don't have inheritance, so
+                // don't need the alias and can construct using the class function:
+                construct<Class>(v_h, class_func(std::forward<CArgs>(args)...), false);
+            else
+                construct<Class>(v_h, alias_func(std::forward<CArgs>(args)...), true);
+        }, is_new_style_constructor(), extra...);
+    }
+};
+
+/// Set just the C++ state. Same as `__init__`.
+template +void setstate(value_and_holder &v_h, T &&result, bool need_alias) { + construct(v_h, std::forward(result), need_alias); +} + +/// Set both the C++ and Python states +template ::value, int> = 0> +void setstate(value_and_holder &v_h, std::pair &&result, bool need_alias) { + construct(v_h, std::move(result.first), need_alias); + setattr((PyObject *) v_h.inst, "__dict__", result.second); +} + +/// Implementation for py::pickle(GetState, SetState) +template , typename = function_signature_t> +struct pickle_factory; + +template +struct pickle_factory { + static_assert(std::is_same, intrinsic_t>::value, + "The type returned by `__getstate__` must be the same " + "as the argument accepted by `__setstate__`"); + + remove_reference_t get; + remove_reference_t set; + + pickle_factory(Get get, Set set) + : get(std::forward(get)), set(std::forward(set)) { } + + template + void execute(Class &cl, const Extra &...extra) && { + cl.def("__getstate__", std::move(get)); + +#if defined(PYBIND11_CPP14) + cl.def("__setstate__", [func = std::move(set)] +#else + auto &func = set; + cl.def("__setstate__", [func] +#endif + (value_and_holder &v_h, ArgState state) { + setstate(v_h, func(std::forward(state)), + Py_TYPE(v_h.inst) != v_h.type->type); + }, is_new_style_constructor(), extra...); + } +}; + +NAMESPACE_END(initimpl) +NAMESPACE_END(detail) +NAMESPACE_END(pybind11) diff --git a/python/src/pybind11/detail/internals.h b/python/src/pybind11/detail/internals.h new file mode 100644 index 000000000..f1dd38764 --- /dev/null +++ b/python/src/pybind11/detail/internals.h @@ -0,0 +1,291 @@ +/* + pybind11/detail/internals.h: Internal data structure and related functions + + Copyright (c) 2017 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. +*/ + +#pragma once + +#include "../pytypes.h" + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +NAMESPACE_BEGIN(detail) +// Forward declarations +inline PyTypeObject *make_static_property_type(); +inline PyTypeObject *make_default_metaclass(); +inline PyObject *make_object_base_type(PyTypeObject *metaclass); + +// The old Python Thread Local Storage (TLS) API is deprecated in Python 3.7 in favor of the new +// Thread Specific Storage (TSS) API. +#if PY_VERSION_HEX >= 0x03070000 +# define PYBIND11_TLS_KEY_INIT(var) Py_tss_t *var = nullptr +# define PYBIND11_TLS_GET_VALUE(key) PyThread_tss_get((key)) +# define PYBIND11_TLS_REPLACE_VALUE(key, value) PyThread_tss_set((key), (value)) +# define PYBIND11_TLS_DELETE_VALUE(key) PyThread_tss_set((key), nullptr) +#else + // Usually an int but a long on Cygwin64 with Python 3.x +# define PYBIND11_TLS_KEY_INIT(var) decltype(PyThread_create_key()) var = 0 +# define PYBIND11_TLS_GET_VALUE(key) PyThread_get_key_value((key)) +# if PY_MAJOR_VERSION < 3 +# define PYBIND11_TLS_DELETE_VALUE(key) \ + PyThread_delete_key_value(key) +# define PYBIND11_TLS_REPLACE_VALUE(key, value) \ + do { \ + PyThread_delete_key_value((key)); \ + PyThread_set_key_value((key), (value)); \ + } while (false) +# else +# define PYBIND11_TLS_DELETE_VALUE(key) \ + PyThread_set_key_value((key), nullptr) +# define PYBIND11_TLS_REPLACE_VALUE(key, value) \ + PyThread_set_key_value((key), (value)) +# endif +#endif + +// Python loads modules by default with dlopen with the RTLD_LOCAL flag; under libc++ and possibly +// other STLs, this means `typeid(A)` from one module won't equal `typeid(A)` from another module +// even when `A` is the same, non-hidden-visibility type (e.g. from a common include). 
Under +// libstdc++, this doesn't happen: equality and the type_index hash are based on the type name, +// which works. If not under a known-good stl, provide our own name-based hash and equality +// functions that use the type name. +#if defined(__GLIBCXX__) +inline bool same_type(const std::type_info &lhs, const std::type_info &rhs) { return lhs == rhs; } +using type_hash = std::hash; +using type_equal_to = std::equal_to; +#else +inline bool same_type(const std::type_info &lhs, const std::type_info &rhs) { + return lhs.name() == rhs.name() || std::strcmp(lhs.name(), rhs.name()) == 0; +} + +struct type_hash { + size_t operator()(const std::type_index &t) const { + size_t hash = 5381; + const char *ptr = t.name(); + while (auto c = static_cast(*ptr++)) + hash = (hash * 33) ^ c; + return hash; + } +}; + +struct type_equal_to { + bool operator()(const std::type_index &lhs, const std::type_index &rhs) const { + return lhs.name() == rhs.name() || std::strcmp(lhs.name(), rhs.name()) == 0; + } +}; +#endif + +template +using type_map = std::unordered_map; + +struct overload_hash { + inline size_t operator()(const std::pair& v) const { + size_t value = std::hash()(v.first); + value ^= std::hash()(v.second) + 0x9e3779b9 + (value<<6) + (value>>2); + return value; + } +}; + +/// Internal data structure used to track registered instances and types. +/// Whenever binary incompatible changes are made to this structure, +/// `PYBIND11_INTERNALS_VERSION` must be incremented. +struct internals { + type_map registered_types_cpp; // std::type_index -> pybind11's type information + std::unordered_map> registered_types_py; // PyTypeObject* -> base type_info(s) + std::unordered_multimap registered_instances; // void * -> instance* + std::unordered_set, overload_hash> inactive_overload_cache; + type_map> direct_conversions; + std::unordered_map> patients; + std::forward_list registered_exception_translators; + std::unordered_map shared_data; // Custom data to be shared across extensions + std::vector loader_patient_stack; // Used by `loader_life_support` + std::forward_list static_strings; // Stores the std::strings backing detail::c_str() + PyTypeObject *static_property_type; + PyTypeObject *default_metaclass; + PyObject *instance_base; +#if defined(WITH_THREAD) + PYBIND11_TLS_KEY_INIT(tstate); + PyInterpreterState *istate = nullptr; +#endif +}; + +/// Additional type information which does not fit into the PyTypeObject. +/// Changes to this struct also require bumping `PYBIND11_INTERNALS_VERSION`. 
+struct type_info { + PyTypeObject *type; + const std::type_info *cpptype; + size_t type_size, type_align, holder_size_in_ptrs; + void *(*operator_new)(size_t); + void (*init_instance)(instance *, const void *); + void (*dealloc)(value_and_holder &v_h); + std::vector implicit_conversions; + std::vector> implicit_casts; + std::vector *direct_conversions; + buffer_info *(*get_buffer)(PyObject *, void *) = nullptr; + void *get_buffer_data = nullptr; + void *(*module_local_load)(PyObject *, const type_info *) = nullptr; + /* A simple type never occurs as a (direct or indirect) parent + * of a class that makes use of multiple inheritance */ + bool simple_type : 1; + /* True if there is no multiple inheritance in this type's inheritance tree */ + bool simple_ancestors : 1; + /* for base vs derived holder_type checks */ + bool default_holder : 1; + /* true if this is a type registered with py::module_local */ + bool module_local : 1; +}; + +/// Tracks the `internals` and `type_info` ABI version independent of the main library version +#define PYBIND11_INTERNALS_VERSION 3 + +#if defined(_DEBUG) +# define PYBIND11_BUILD_TYPE "_debug" +#else +# define PYBIND11_BUILD_TYPE "" +#endif + +#if defined(WITH_THREAD) +# define PYBIND11_INTERNALS_KIND "" +#else +# define PYBIND11_INTERNALS_KIND "_without_thread" +#endif + +#define PYBIND11_INTERNALS_ID "__pybind11_internals_v" \ + PYBIND11_TOSTRING(PYBIND11_INTERNALS_VERSION) PYBIND11_INTERNALS_KIND PYBIND11_BUILD_TYPE "__" + +#define PYBIND11_MODULE_LOCAL_ID "__pybind11_module_local_v" \ + PYBIND11_TOSTRING(PYBIND11_INTERNALS_VERSION) PYBIND11_INTERNALS_KIND PYBIND11_BUILD_TYPE "__" + +/// Each module locally stores a pointer to the `internals` data. The data +/// itself is shared among modules with the same `PYBIND11_INTERNALS_ID`. +inline internals **&get_internals_pp() { + static internals **internals_pp = nullptr; + return internals_pp; +} + +/// Return a reference to the current `internals` data +PYBIND11_NOINLINE inline internals &get_internals() { + auto **&internals_pp = get_internals_pp(); + if (internals_pp && *internals_pp) + return **internals_pp; + + constexpr auto *id = PYBIND11_INTERNALS_ID; + auto builtins = handle(PyEval_GetBuiltins()); + if (builtins.contains(id) && isinstance(builtins[id])) { + internals_pp = static_cast(capsule(builtins[id])); + + // We loaded builtins through python's builtins, which means that our `error_already_set` + // and `builtin_exception` may be different local classes than the ones set up in the + // initial exception translator, below, so add another for our local exception classes. 
+ // + // libstdc++ doesn't require this (types there are identified only by name) +#if !defined(__GLIBCXX__) + (*internals_pp)->registered_exception_translators.push_front( + [](std::exception_ptr p) -> void { + try { + if (p) std::rethrow_exception(p); + } catch (error_already_set &e) { e.restore(); return; + } catch (const builtin_exception &e) { e.set_error(); return; + } + } + ); +#endif + } else { + if (!internals_pp) internals_pp = new internals*(); + auto *&internals_ptr = *internals_pp; + internals_ptr = new internals(); +#if defined(WITH_THREAD) + PyEval_InitThreads(); + PyThreadState *tstate = PyThreadState_Get(); + #if PY_VERSION_HEX >= 0x03070000 + internals_ptr->tstate = PyThread_tss_alloc(); + if (!internals_ptr->tstate || PyThread_tss_create(internals_ptr->tstate)) + pybind11_fail("get_internals: could not successfully initialize the TSS key!"); + PyThread_tss_set(internals_ptr->tstate, tstate); + #else + internals_ptr->tstate = PyThread_create_key(); + if (internals_ptr->tstate == -1) + pybind11_fail("get_internals: could not successfully initialize the TLS key!"); + PyThread_set_key_value(internals_ptr->tstate, tstate); + #endif + internals_ptr->istate = tstate->interp; +#endif + builtins[id] = capsule(internals_pp); + internals_ptr->registered_exception_translators.push_front( + [](std::exception_ptr p) -> void { + try { + if (p) std::rethrow_exception(p); + } catch (error_already_set &e) { e.restore(); return; + } catch (const builtin_exception &e) { e.set_error(); return; + } catch (const std::bad_alloc &e) { PyErr_SetString(PyExc_MemoryError, e.what()); return; + } catch (const std::domain_error &e) { PyErr_SetString(PyExc_ValueError, e.what()); return; + } catch (const std::invalid_argument &e) { PyErr_SetString(PyExc_ValueError, e.what()); return; + } catch (const std::length_error &e) { PyErr_SetString(PyExc_ValueError, e.what()); return; + } catch (const std::out_of_range &e) { PyErr_SetString(PyExc_IndexError, e.what()); return; + } catch (const std::range_error &e) { PyErr_SetString(PyExc_ValueError, e.what()); return; + } catch (const std::exception &e) { PyErr_SetString(PyExc_RuntimeError, e.what()); return; + } catch (...) { + PyErr_SetString(PyExc_RuntimeError, "Caught an unknown exception!"); + return; + } + } + ); + internals_ptr->static_property_type = make_static_property_type(); + internals_ptr->default_metaclass = make_default_metaclass(); + internals_ptr->instance_base = make_object_base_type(internals_ptr->default_metaclass); + } + return **internals_pp; +} + +/// Works like `internals.registered_types_cpp`, but for module-local registered types: +inline type_map ®istered_local_types_cpp() { + static type_map locals{}; + return locals; +} + +/// Constructs a std::string with the given arguments, stores it in `internals`, and returns its +/// `c_str()`. Such strings objects have a long storage duration -- the internal strings are only +/// cleared when the program exits or after interpreter shutdown (when embedding), and so are +/// suitable for c-style strings needed by Python internals (such as PyTypeObject's tp_name). +template +const char *c_str(Args &&...args) { + auto &strings = get_internals().static_strings; + strings.emplace_front(std::forward(args)...); + return strings.front().c_str(); +} + +NAMESPACE_END(detail) + +/// Returns a named pointer that is shared among all extension modules (using the same +/// pybind11 version) running in the current interpreter. Names starting with underscores +/// are reserved for internal usage. 
Returns `nullptr` if no matching entry was found. +inline PYBIND11_NOINLINE void *get_shared_data(const std::string &name) { + auto &internals = detail::get_internals(); + auto it = internals.shared_data.find(name); + return it != internals.shared_data.end() ? it->second : nullptr; +} + +/// Set the shared data that can be later recovered by `get_shared_data()`. +inline PYBIND11_NOINLINE void *set_shared_data(const std::string &name, void *data) { + detail::get_internals().shared_data[name] = data; + return data; +} + +/// Returns a typed reference to a shared data entry (by using `get_shared_data()`) if +/// such entry exists. Otherwise, a new object of default-constructible type `T` is +/// added to the shared data under the given name and a reference to it is returned. +template +T &get_or_create_shared_data(const std::string &name) { + auto &internals = detail::get_internals(); + auto it = internals.shared_data.find(name); + T *ptr = (T *) (it != internals.shared_data.end() ? it->second : nullptr); + if (!ptr) { + ptr = new T(); + internals.shared_data[name] = ptr; + } + return *ptr; +} + +NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/python/src/pybind11/detail/typeid.h b/python/src/pybind11/detail/typeid.h new file mode 100644 index 000000000..9c8a4fc69 --- /dev/null +++ b/python/src/pybind11/detail/typeid.h @@ -0,0 +1,55 @@ +/* + pybind11/detail/typeid.h: Compiler-independent access to type identifiers + + Copyright (c) 2016 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. +*/ + +#pragma once + +#include +#include + +#if defined(__GNUG__) +#include +#endif + +#include "common.h" + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +NAMESPACE_BEGIN(detail) +/// Erase all occurrences of a substring +inline void erase_all(std::string &string, const std::string &search) { + for (size_t pos = 0;;) { + pos = string.find(search, pos); + if (pos == std::string::npos) break; + string.erase(pos, search.length()); + } +} + +PYBIND11_NOINLINE inline void clean_type_id(std::string &name) { +#if defined(__GNUG__) + int status = 0; + std::unique_ptr res { + abi::__cxa_demangle(name.c_str(), nullptr, nullptr, &status), std::free }; + if (status == 0) + name = res.get(); +#else + detail::erase_all(name, "class "); + detail::erase_all(name, "struct "); + detail::erase_all(name, "enum "); +#endif + detail::erase_all(name, "pybind11::"); +} +NAMESPACE_END(detail) + +/// Return a string representation of a C++ type +template static std::string type_id() { + std::string name(typeid(T).name()); + detail::clean_type_id(name); + return name; +} + +NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/python/src/pybind11/eigen.h b/python/src/pybind11/eigen.h new file mode 100644 index 000000000..d963d9650 --- /dev/null +++ b/python/src/pybind11/eigen.h @@ -0,0 +1,607 @@ +/* + pybind11/eigen.h: Transparent conversion for dense and sparse Eigen matrices + + Copyright (c) 2016 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. 
+*/ + +#pragma once + +#include "numpy.h" + +#if defined(__INTEL_COMPILER) +# pragma warning(disable: 1682) // implicit conversion of a 64-bit integral type to a smaller integral type (potential portability problem) +#elif defined(__GNUG__) || defined(__clang__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wconversion" +# pragma GCC diagnostic ignored "-Wdeprecated-declarations" +# ifdef __clang__ +// Eigen generates a bunch of implicit-copy-constructor-is-deprecated warnings with -Wdeprecated +// under Clang, so disable that warning here: +# pragma GCC diagnostic ignored "-Wdeprecated" +# endif +# if __GNUC__ >= 7 +# pragma GCC diagnostic ignored "-Wint-in-bool-context" +# endif +#endif + +#if defined(_MSC_VER) +# pragma warning(push) +# pragma warning(disable: 4127) // warning C4127: Conditional expression is constant +# pragma warning(disable: 4996) // warning C4996: std::unary_negate is deprecated in C++17 +#endif + +#include +#include + +// Eigen prior to 3.2.7 doesn't have proper move constructors--but worse, some classes get implicit +// move constructors that break things. We could detect this an explicitly copy, but an extra copy +// of matrices seems highly undesirable. +static_assert(EIGEN_VERSION_AT_LEAST(3,2,7), "Eigen support in pybind11 requires Eigen >= 3.2.7"); + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) + +// Provide a convenience alias for easier pass-by-ref usage with fully dynamic strides: +using EigenDStride = Eigen::Stride; +template using EigenDRef = Eigen::Ref; +template using EigenDMap = Eigen::Map; + +NAMESPACE_BEGIN(detail) + +#if EIGEN_VERSION_AT_LEAST(3,3,0) +using EigenIndex = Eigen::Index; +#else +using EigenIndex = EIGEN_DEFAULT_DENSE_INDEX_TYPE; +#endif + +// Matches Eigen::Map, Eigen::Ref, blocks, etc: +template using is_eigen_dense_map = all_of, std::is_base_of, T>>; +template using is_eigen_mutable_map = std::is_base_of, T>; +template using is_eigen_dense_plain = all_of>, is_template_base_of>; +template using is_eigen_sparse = is_template_base_of; +// Test for objects inheriting from EigenBase that aren't captured by the above. This +// basically covers anything that can be assigned to a dense matrix but that don't have a typical +// matrix data layout that can be copied from their .data(). For example, DiagonalMatrix and +// SelfAdjointView fall into this category. +template using is_eigen_other = all_of< + is_template_base_of, + negation, is_eigen_dense_plain, is_eigen_sparse>> +>; + +// Captures numpy/eigen conformability status (returned by EigenProps::conformable()): +template struct EigenConformable { + bool conformable = false; + EigenIndex rows = 0, cols = 0; + EigenDStride stride{0, 0}; // Only valid if negativestrides is false! + bool negativestrides = false; // If true, do not use stride! + + EigenConformable(bool fits = false) : conformable{fits} {} + // Matrix type: + EigenConformable(EigenIndex r, EigenIndex c, + EigenIndex rstride, EigenIndex cstride) : + conformable{true}, rows{r}, cols{c} { + // TODO: when Eigen bug #747 is fixed, remove the tests for non-negativity. http://eigen.tuxfamily.org/bz/show_bug.cgi?id=747 + if (rstride < 0 || cstride < 0) { + negativestrides = true; + } else { + stride = {EigenRowMajor ? rstride : cstride /* outer stride */, + EigenRowMajor ? cstride : rstride /* inner stride */ }; + } + } + // Vector type: + EigenConformable(EigenIndex r, EigenIndex c, EigenIndex stride) + : EigenConformable(r, c, r == 1 ? c*stride : stride, c == 1 ? 
r : r*stride) {} + + template bool stride_compatible() const { + // To have compatible strides, we need (on both dimensions) one of fully dynamic strides, + // matching strides, or a dimension size of 1 (in which case the stride value is irrelevant) + return + !negativestrides && + (props::inner_stride == Eigen::Dynamic || props::inner_stride == stride.inner() || + (EigenRowMajor ? cols : rows) == 1) && + (props::outer_stride == Eigen::Dynamic || props::outer_stride == stride.outer() || + (EigenRowMajor ? rows : cols) == 1); + } + operator bool() const { return conformable; } +}; + +template struct eigen_extract_stride { using type = Type; }; +template +struct eigen_extract_stride> { using type = StrideType; }; +template +struct eigen_extract_stride> { using type = StrideType; }; + +// Helper struct for extracting information from an Eigen type +template struct EigenProps { + using Type = Type_; + using Scalar = typename Type::Scalar; + using StrideType = typename eigen_extract_stride::type; + static constexpr EigenIndex + rows = Type::RowsAtCompileTime, + cols = Type::ColsAtCompileTime, + size = Type::SizeAtCompileTime; + static constexpr bool + row_major = Type::IsRowMajor, + vector = Type::IsVectorAtCompileTime, // At least one dimension has fixed size 1 + fixed_rows = rows != Eigen::Dynamic, + fixed_cols = cols != Eigen::Dynamic, + fixed = size != Eigen::Dynamic, // Fully-fixed size + dynamic = !fixed_rows && !fixed_cols; // Fully-dynamic size + + template using if_zero = std::integral_constant; + static constexpr EigenIndex inner_stride = if_zero::value, + outer_stride = if_zero::value; + static constexpr bool dynamic_stride = inner_stride == Eigen::Dynamic && outer_stride == Eigen::Dynamic; + static constexpr bool requires_row_major = !dynamic_stride && !vector && (row_major ? inner_stride : outer_stride) == 1; + static constexpr bool requires_col_major = !dynamic_stride && !vector && (row_major ? outer_stride : inner_stride) == 1; + + // Takes an input array and determines whether we can make it fit into the Eigen type. If + // the array is a vector, we attempt to fit it into either an Eigen 1xN or Nx1 vector + // (preferring the latter if it will fit in either, i.e. for a fully dynamic matrix type). + static EigenConformable conformable(const array &a) { + const auto dims = a.ndim(); + if (dims < 1 || dims > 2) + return false; + + if (dims == 2) { // Matrix type: require exact match (or dynamic) + + EigenIndex + np_rows = a.shape(0), + np_cols = a.shape(1), + np_rstride = a.strides(0) / static_cast(sizeof(Scalar)), + np_cstride = a.strides(1) / static_cast(sizeof(Scalar)); + if ((fixed_rows && np_rows != rows) || (fixed_cols && np_cols != cols)) + return false; + + return {np_rows, np_cols, np_rstride, np_cstride}; + } + + // Otherwise we're storing an n-vector. Only one of the strides will be used, but whichever + // is used, we want the (single) numpy stride value. + const EigenIndex n = a.shape(0), + stride = a.strides(0) / static_cast(sizeof(Scalar)); + + if (vector) { // Eigen type is a compile-time vector + if (fixed && size != n) + return false; // Vector size mismatch + return {rows == 1 ? 1 : n, cols == 1 ? 1 : n, stride}; + } + else if (fixed) { + // The type has a fixed size, but is not a vector: abort + return false; + } + else if (fixed_cols) { + // Since this isn't a vector, cols must be != 1. We allow this only if it exactly + // equals the number of elements (rows is Dynamic, and so 1 row is allowed). 
+ if (cols != n) return false; + return {1, n, stride}; + } + else { + // Otherwise it's either fully dynamic, or column dynamic; both become a column vector + if (fixed_rows && rows != n) return false; + return {n, 1, stride}; + } + } + + static constexpr bool show_writeable = is_eigen_dense_map::value && is_eigen_mutable_map::value; + static constexpr bool show_order = is_eigen_dense_map::value; + static constexpr bool show_c_contiguous = show_order && requires_row_major; + static constexpr bool show_f_contiguous = !show_c_contiguous && show_order && requires_col_major; + + static constexpr auto descriptor = + _("numpy.ndarray[") + npy_format_descriptor::name + + _("[") + _(_<(size_t) rows>(), _("m")) + + _(", ") + _(_<(size_t) cols>(), _("n")) + + _("]") + + // For a reference type (e.g. Ref) we have other constraints that might need to be + // satisfied: writeable=True (for a mutable reference), and, depending on the map's stride + // options, possibly f_contiguous or c_contiguous. We include them in the descriptor output + // to provide some hint as to why a TypeError is occurring (otherwise it can be confusing to + // see that a function accepts a 'numpy.ndarray[float64[3,2]]' and an error message that you + // *gave* a numpy.ndarray of the right type and dimensions. + _(", flags.writeable", "") + + _(", flags.c_contiguous", "") + + _(", flags.f_contiguous", "") + + _("]"); +}; + +// Casts an Eigen type to numpy array. If given a base, the numpy array references the src data, +// otherwise it'll make a copy. writeable lets you turn off the writeable flag for the array. +template handle eigen_array_cast(typename props::Type const &src, handle base = handle(), bool writeable = true) { + constexpr ssize_t elem_size = sizeof(typename props::Scalar); + array a; + if (props::vector) + a = array({ src.size() }, { elem_size * src.innerStride() }, src.data(), base); + else + a = array({ src.rows(), src.cols() }, { elem_size * src.rowStride(), elem_size * src.colStride() }, + src.data(), base); + + if (!writeable) + array_proxy(a.ptr())->flags &= ~detail::npy_api::NPY_ARRAY_WRITEABLE_; + + return a.release(); +} + +// Takes an lvalue ref to some Eigen type and a (python) base object, creating a numpy array that +// reference the Eigen object's data with `base` as the python-registered base class (if omitted, +// the base will be set to None, and lifetime management is up to the caller). The numpy array is +// non-writeable if the given type is const. +template +handle eigen_ref_array(Type &src, handle parent = none()) { + // none here is to get past array's should-we-copy detection, which currently always + // copies when there is no base. Setting the base to None should be harmless. + return eigen_array_cast(src, parent, !std::is_const::value); +} + +// Takes a pointer to some dense, plain Eigen type, builds a capsule around it, then returns a numpy +// array that references the encapsulated data with a python-side reference to the capsule to tie +// its destruction to that of any dependent python objects. Const-ness is determined by whether or +// not the Type of the pointer given is const. +template ::value>> +handle eigen_encapsulate(Type *src) { + capsule base(src, [](void *o) { delete static_cast(o); }); + return eigen_ref_array(*src, base); +} + +// Type caster for regular, dense matrix types (e.g. MatrixXd), but not maps/refs/etc. of dense +// types. 
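+//
+// A minimal usage sketch (not part of this header; `m` and `scale` are
+// hypothetical): with the caster below in scope, a bound function can take and
+// return dense Eigen matrices by value; numpy arrays are converted to Eigen
+// matrices on the way in and new ndarrays are produced on the way out.
+//
+//     #include <pybind11/eigen.h>
+//
+//     m.def("scale", [](Eigen::MatrixXd mat, double f) -> Eigen::MatrixXd {
+//         return mat * f;  // loaded via this caster; returned as a new ndarray
+//     });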
+template +struct type_caster::value>> { + using Scalar = typename Type::Scalar; + using props = EigenProps; + + bool load(handle src, bool convert) { + // If we're in no-convert mode, only load if given an array of the correct type + if (!convert && !isinstance>(src)) + return false; + + // Coerce into an array, but don't do type conversion yet; the copy below handles it. + auto buf = array::ensure(src); + + if (!buf) + return false; + + auto dims = buf.ndim(); + if (dims < 1 || dims > 2) + return false; + + auto fits = props::conformable(buf); + if (!fits) + return false; + + // Allocate the new type, then build a numpy reference into it + value = Type(fits.rows, fits.cols); + auto ref = reinterpret_steal(eigen_ref_array(value)); + if (dims == 1) ref = ref.squeeze(); + else if (ref.ndim() == 1) buf = buf.squeeze(); + + int result = detail::npy_api::get().PyArray_CopyInto_(ref.ptr(), buf.ptr()); + + if (result < 0) { // Copy failed! + PyErr_Clear(); + return false; + } + + return true; + } + +private: + + // Cast implementation + template + static handle cast_impl(CType *src, return_value_policy policy, handle parent) { + switch (policy) { + case return_value_policy::take_ownership: + case return_value_policy::automatic: + return eigen_encapsulate(src); + case return_value_policy::move: + return eigen_encapsulate(new CType(std::move(*src))); + case return_value_policy::copy: + return eigen_array_cast(*src); + case return_value_policy::reference: + case return_value_policy::automatic_reference: + return eigen_ref_array(*src); + case return_value_policy::reference_internal: + return eigen_ref_array(*src, parent); + default: + throw cast_error("unhandled return_value_policy: should not happen!"); + }; + } + +public: + + // Normal returned non-reference, non-const value: + static handle cast(Type &&src, return_value_policy /* policy */, handle parent) { + return cast_impl(&src, return_value_policy::move, parent); + } + // If you return a non-reference const, we mark the numpy array readonly: + static handle cast(const Type &&src, return_value_policy /* policy */, handle parent) { + return cast_impl(&src, return_value_policy::move, parent); + } + // lvalue reference return; default (automatic) becomes copy + static handle cast(Type &src, return_value_policy policy, handle parent) { + if (policy == return_value_policy::automatic || policy == return_value_policy::automatic_reference) + policy = return_value_policy::copy; + return cast_impl(&src, policy, parent); + } + // const lvalue reference return; default (automatic) becomes copy + static handle cast(const Type &src, return_value_policy policy, handle parent) { + if (policy == return_value_policy::automatic || policy == return_value_policy::automatic_reference) + policy = return_value_policy::copy; + return cast(&src, policy, parent); + } + // non-const pointer return + static handle cast(Type *src, return_value_policy policy, handle parent) { + return cast_impl(src, policy, parent); + } + // const pointer return + static handle cast(const Type *src, return_value_policy policy, handle parent) { + return cast_impl(src, policy, parent); + } + + static constexpr auto name = props::descriptor; + + operator Type*() { return &value; } + operator Type&() { return value; } + operator Type&&() && { return std::move(value); } + template using cast_op_type = movable_cast_op_type; + +private: + Type value; +}; + +// Base class for casting reference/map/block/etc. objects back to python. 
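+//
+// A usage sketch (not part of this header; `MyStore` and `data` are
+// hypothetical): the map caster below is what lets a function return
+// Eigen::Ref / Eigen::Map, exposing existing C++ memory to numpy without a
+// copy; lifetime must be managed explicitly, e.g. via reference_internal:
+//
+//     m.def("view", [](MyStore &s) -> Eigen::Ref<Eigen::VectorXd> {
+//         return s.data;  // ndarray aliasing s.data, no copy
+//     }, py::return_value_policy::reference_internal);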
+template struct eigen_map_caster { +private: + using props = EigenProps; + +public: + + // Directly referencing a ref/map's data is a bit dangerous (whatever the map/ref points to has + // to stay around), but we'll allow it under the assumption that you know what you're doing (and + // have an appropriate keep_alive in place). We return a numpy array pointing directly at the + // ref's data (The numpy array ends up read-only if the ref was to a const matrix type.) Note + // that this means you need to ensure you don't destroy the object in some other way (e.g. with + // an appropriate keep_alive, or with a reference to a statically allocated matrix). + static handle cast(const MapType &src, return_value_policy policy, handle parent) { + switch (policy) { + case return_value_policy::copy: + return eigen_array_cast(src); + case return_value_policy::reference_internal: + return eigen_array_cast(src, parent, is_eigen_mutable_map::value); + case return_value_policy::reference: + case return_value_policy::automatic: + case return_value_policy::automatic_reference: + return eigen_array_cast(src, none(), is_eigen_mutable_map::value); + default: + // move, take_ownership don't make any sense for a ref/map: + pybind11_fail("Invalid return_value_policy for Eigen Map/Ref/Block type"); + } + } + + static constexpr auto name = props::descriptor; + + // Explicitly delete these: support python -> C++ conversion on these (i.e. these can be return + // types but not bound arguments). We still provide them (with an explicitly delete) so that + // you end up here if you try anyway. + bool load(handle, bool) = delete; + operator MapType() = delete; + template using cast_op_type = MapType; +}; + +// We can return any map-like object (but can only load Refs, specialized next): +template struct type_caster::value>> + : eigen_map_caster {}; + +// Loader for Ref<...> arguments. See the documentation for info on how to make this work without +// copying (it requires some extra effort in many cases). +template +struct type_caster< + Eigen::Ref, + enable_if_t>::value> +> : public eigen_map_caster> { +private: + using Type = Eigen::Ref; + using props = EigenProps; + using Scalar = typename props::Scalar; + using MapType = Eigen::Map; + using Array = array_t; + static constexpr bool need_writeable = is_eigen_mutable_map::value; + // Delay construction (these have no default constructor) + std::unique_ptr map; + std::unique_ptr ref; + // Our array. When possible, this is just a numpy array pointing to the source data, but + // sometimes we can't avoid copying (e.g. input is not a numpy array at all, has an incompatible + // layout, or is an array of a type that needs to be converted). Using a numpy temporary + // (rather than an Eigen temporary) saves an extra copy when we need both type conversion and + // storage order conversion. (Note that we refuse to use this temporary copy when loading an + // argument for a Ref with M non-const, i.e. a read-write reference). + Array copy_or_ref; +public: + bool load(handle src, bool convert) { + // First check whether what we have is already an array of the right type. If not, we can't + // avoid a copy (because the copy is also going to do type conversion). 
+ bool need_copy = !isinstance(src); + + EigenConformable fits; + if (!need_copy) { + // We don't need a converting copy, but we also need to check whether the strides are + // compatible with the Ref's stride requirements + Array aref = reinterpret_borrow(src); + + if (aref && (!need_writeable || aref.writeable())) { + fits = props::conformable(aref); + if (!fits) return false; // Incompatible dimensions + if (!fits.template stride_compatible()) + need_copy = true; + else + copy_or_ref = std::move(aref); + } + else { + need_copy = true; + } + } + + if (need_copy) { + // We need to copy: If we need a mutable reference, or we're not supposed to convert + // (either because we're in the no-convert overload pass, or because we're explicitly + // instructed not to copy (via `py::arg().noconvert()`) we have to fail loading. + if (!convert || need_writeable) return false; + + Array copy = Array::ensure(src); + if (!copy) return false; + fits = props::conformable(copy); + if (!fits || !fits.template stride_compatible()) + return false; + copy_or_ref = std::move(copy); + loader_life_support::add_patient(copy_or_ref); + } + + ref.reset(); + map.reset(new MapType(data(copy_or_ref), fits.rows, fits.cols, make_stride(fits.stride.outer(), fits.stride.inner()))); + ref.reset(new Type(*map)); + + return true; + } + + operator Type*() { return ref.get(); } + operator Type&() { return *ref; } + template using cast_op_type = pybind11::detail::cast_op_type<_T>; + +private: + template ::value, int> = 0> + Scalar *data(Array &a) { return a.mutable_data(); } + + template ::value, int> = 0> + const Scalar *data(Array &a) { return a.data(); } + + // Attempt to figure out a constructor of `Stride` that will work. + // If both strides are fixed, use a default constructor: + template using stride_ctor_default = bool_constant< + S::InnerStrideAtCompileTime != Eigen::Dynamic && S::OuterStrideAtCompileTime != Eigen::Dynamic && + std::is_default_constructible::value>; + // Otherwise, if there is a two-index constructor, assume it is (outer,inner) like + // Eigen::Stride, and use it: + template using stride_ctor_dual = bool_constant< + !stride_ctor_default::value && std::is_constructible::value>; + // Otherwise, if there is a one-index constructor, and just one of the strides is dynamic, use + // it (passing whichever stride is dynamic). + template using stride_ctor_outer = bool_constant< + !any_of, stride_ctor_dual>::value && + S::OuterStrideAtCompileTime == Eigen::Dynamic && S::InnerStrideAtCompileTime != Eigen::Dynamic && + std::is_constructible::value>; + template using stride_ctor_inner = bool_constant< + !any_of, stride_ctor_dual>::value && + S::InnerStrideAtCompileTime == Eigen::Dynamic && S::OuterStrideAtCompileTime != Eigen::Dynamic && + std::is_constructible::value>; + + template ::value, int> = 0> + static S make_stride(EigenIndex, EigenIndex) { return S(); } + template ::value, int> = 0> + static S make_stride(EigenIndex outer, EigenIndex inner) { return S(outer, inner); } + template ::value, int> = 0> + static S make_stride(EigenIndex outer, EigenIndex) { return S(outer); } + template ::value, int> = 0> + static S make_stride(EigenIndex, EigenIndex inner) { return S(inner); } + +}; + +// type_caster for special matrix types (e.g. DiagonalMatrix), which are EigenBase, but not +// EigenDense (i.e. they don't have a data(), at least not with the usual matrix layout). +// load() is not supported, but we can cast them into the python domain by first copying to a +// regular Eigen::Matrix, then casting that. 
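+//
+// For intuition (a sketch, not part of this header; `m` is hypothetical): a
+// special Eigen type such as DiagonalMatrix has no plain data() buffer, so the
+// caster below densifies it on the way out:
+//
+//     m.def("diag", [](const Eigen::VectorXd &v) {
+//         Eigen::DiagonalMatrix<double, Eigen::Dynamic> d;
+//         d.diagonal() = v;
+//         return d;  // returned to Python as a dense 2-D ndarray copy
+//     });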
+template +struct type_caster::value>> { +protected: + using Matrix = Eigen::Matrix; + using props = EigenProps; +public: + static handle cast(const Type &src, return_value_policy /* policy */, handle /* parent */) { + handle h = eigen_encapsulate(new Matrix(src)); + return h; + } + static handle cast(const Type *src, return_value_policy policy, handle parent) { return cast(*src, policy, parent); } + + static constexpr auto name = props::descriptor; + + // Explicitly delete these: support python -> C++ conversion on these (i.e. these can be return + // types but not bound arguments). We still provide them (with an explicitly delete) so that + // you end up here if you try anyway. + bool load(handle, bool) = delete; + operator Type() = delete; + template using cast_op_type = Type; +}; + +template +struct type_caster::value>> { + typedef typename Type::Scalar Scalar; + typedef remove_reference_t().outerIndexPtr())> StorageIndex; + typedef typename Type::Index Index; + static constexpr bool rowMajor = Type::IsRowMajor; + + bool load(handle src, bool) { + if (!src) + return false; + + auto obj = reinterpret_borrow(src); + object sparse_module = module::import("scipy.sparse"); + object matrix_type = sparse_module.attr( + rowMajor ? "csr_matrix" : "csc_matrix"); + + if (!obj.get_type().is(matrix_type)) { + try { + obj = matrix_type(obj); + } catch (const error_already_set &) { + return false; + } + } + + auto values = array_t((object) obj.attr("data")); + auto innerIndices = array_t((object) obj.attr("indices")); + auto outerIndices = array_t((object) obj.attr("indptr")); + auto shape = pybind11::tuple((pybind11::object) obj.attr("shape")); + auto nnz = obj.attr("nnz").cast(); + + if (!values || !innerIndices || !outerIndices) + return false; + + value = Eigen::MappedSparseMatrix( + shape[0].cast(), shape[1].cast(), nnz, + outerIndices.mutable_data(), innerIndices.mutable_data(), values.mutable_data()); + + return true; + } + + static handle cast(const Type &src, return_value_policy /* policy */, handle /* parent */) { + const_cast(src).makeCompressed(); + + object matrix_type = module::import("scipy.sparse").attr( + rowMajor ? "csr_matrix" : "csc_matrix"); + + array data(src.nonZeros(), src.valuePtr()); + array outerIndices((rowMajor ? src.rows() : src.cols()) + 1, src.outerIndexPtr()); + array innerIndices(src.nonZeros(), src.innerIndexPtr()); + + return matrix_type( + std::make_tuple(data, innerIndices, outerIndices), + std::make_pair(src.rows(), src.cols()) + ).release(); + } + + PYBIND11_TYPE_CASTER(Type, _<(Type::IsRowMajor) != 0>("scipy.sparse.csr_matrix[", "scipy.sparse.csc_matrix[") + + npy_format_descriptor::name + _("]")); +}; + +NAMESPACE_END(detail) +NAMESPACE_END(PYBIND11_NAMESPACE) + +#if defined(__GNUG__) || defined(__clang__) +# pragma GCC diagnostic pop +#elif defined(_MSC_VER) +# pragma warning(pop) +#endif diff --git a/python/src/pybind11/embed.h b/python/src/pybind11/embed.h new file mode 100644 index 000000000..72655885e --- /dev/null +++ b/python/src/pybind11/embed.h @@ -0,0 +1,200 @@ +/* + pybind11/embed.h: Support for embedding the interpreter + + Copyright (c) 2017 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. 
+*/ + +#pragma once + +#include "pybind11.h" +#include "eval.h" + +#if defined(PYPY_VERSION) +# error Embedding the interpreter is not supported with PyPy +#endif + +#if PY_MAJOR_VERSION >= 3 +# define PYBIND11_EMBEDDED_MODULE_IMPL(name) \ + extern "C" PyObject *pybind11_init_impl_##name() { \ + return pybind11_init_wrapper_##name(); \ + } +#else +# define PYBIND11_EMBEDDED_MODULE_IMPL(name) \ + extern "C" void pybind11_init_impl_##name() { \ + pybind11_init_wrapper_##name(); \ + } +#endif + +/** \rst + Add a new module to the table of builtins for the interpreter. Must be + defined in global scope. The first macro parameter is the name of the + module (without quotes). The second parameter is the variable which will + be used as the interface to add functions and classes to the module. + + .. code-block:: cpp + + PYBIND11_EMBEDDED_MODULE(example, m) { + // ... initialize functions and classes here + m.def("foo", []() { + return "Hello, World!"; + }); + } + \endrst */ +#define PYBIND11_EMBEDDED_MODULE(name, variable) \ + static void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &); \ + static PyObject PYBIND11_CONCAT(*pybind11_init_wrapper_, name)() { \ + auto m = pybind11::module(PYBIND11_TOSTRING(name)); \ + try { \ + PYBIND11_CONCAT(pybind11_init_, name)(m); \ + return m.ptr(); \ + } catch (pybind11::error_already_set &e) { \ + PyErr_SetString(PyExc_ImportError, e.what()); \ + return nullptr; \ + } catch (const std::exception &e) { \ + PyErr_SetString(PyExc_ImportError, e.what()); \ + return nullptr; \ + } \ + } \ + PYBIND11_EMBEDDED_MODULE_IMPL(name) \ + pybind11::detail::embedded_module name(PYBIND11_TOSTRING(name), \ + PYBIND11_CONCAT(pybind11_init_impl_, name)); \ + void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &variable) + + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +NAMESPACE_BEGIN(detail) + +/// Python 2.7/3.x compatible version of `PyImport_AppendInittab` and error checks. +struct embedded_module { +#if PY_MAJOR_VERSION >= 3 + using init_t = PyObject *(*)(); +#else + using init_t = void (*)(); +#endif + embedded_module(const char *name, init_t init) { + if (Py_IsInitialized()) + pybind11_fail("Can't add new modules after the interpreter has been initialized"); + + auto result = PyImport_AppendInittab(name, init); + if (result == -1) + pybind11_fail("Insufficient memory to add a new module"); + } +}; + +NAMESPACE_END(detail) + +/** \rst + Initialize the Python interpreter. No other pybind11 or CPython API functions can be + called before this is done; with the exception of `PYBIND11_EMBEDDED_MODULE`. The + optional parameter can be used to skip the registration of signal handlers (see the + `Python documentation`_ for details). Calling this function again after the interpreter + has already been initialized is a fatal error. + + If initializing the Python interpreter fails, then the program is terminated. (This + is controlled by the CPython runtime and is an exception to pybind11's normal behavior + of throwing exceptions on errors.) + + .. _Python documentation: https://docs.python.org/3/c-api/init.html#c.Py_InitializeEx + \endrst */ +inline void initialize_interpreter(bool init_signal_handlers = true) { + if (Py_IsInitialized()) + pybind11_fail("The interpreter is already running"); + + Py_InitializeEx(init_signal_handlers ? 1 : 0); + + // Make .py files in the working directory available by default + module::import("sys").attr("path").cast().append("."); +} + +/** \rst + Shut down the Python interpreter. 
No pybind11 or CPython API functions can be called
+    after this. In addition, pybind11 objects must not outlive the interpreter:
+
+    .. code-block:: cpp
+
+        { // BAD
+            py::initialize_interpreter();
+            auto hello = py::str("Hello, World!");
+            py::finalize_interpreter();
+        } // <-- BOOM, hello's destructor is called after interpreter shutdown
+
+        { // GOOD
+            py::initialize_interpreter();
+            { // scoped
+                auto hello = py::str("Hello, World!");
+            } // <-- OK, hello is cleaned up properly
+            py::finalize_interpreter();
+        }
+
+        { // BETTER
+            py::scoped_interpreter guard{};
+            auto hello = py::str("Hello, World!");
+        }
+
+    .. warning::
+
+        The interpreter can be restarted by calling `initialize_interpreter` again.
+        Modules created using pybind11 can be safely re-initialized. However, Python
+        itself cannot completely unload binary extension modules and there are several
+        caveats with regard to interpreter restarting. All the details can be found
+        in the CPython documentation. In short, not all interpreter memory may be
+        freed, either due to reference cycles or user-created global data.
+
+ \endrst */
+inline void finalize_interpreter() {
+    handle builtins(PyEval_GetBuiltins());
+    const char *id = PYBIND11_INTERNALS_ID;
+
+    // Get the internals pointer (without creating it if it doesn't exist). It's possible for the
+    // internals to be created during Py_Finalize() (e.g. if a py::capsule calls `get_internals()`
+    // during destruction), so we get the pointer-pointer here and check it after Py_Finalize().
+    detail::internals **internals_ptr_ptr = detail::get_internals_pp();
+    // It could also be stashed in builtins, so look there too:
+    if (builtins.contains(id) && isinstance<capsule>(builtins[id]))
+        internals_ptr_ptr = capsule(builtins[id]);
+
+    Py_Finalize();
+
+    if (internals_ptr_ptr) {
+        delete *internals_ptr_ptr;
+        *internals_ptr_ptr = nullptr;
+    }
+}
+
+/** \rst
+    Scope guard version of `initialize_interpreter` and `finalize_interpreter`.
+    This is a move-only guard, and only a single instance can exist.
+
+    .. code-block:: cpp
+
+        #include <pybind11/embed.h>
+
+        int main() {
+            py::scoped_interpreter guard{};
+            py::print("Hello, World!");
+        } // <-- interpreter shutdown
+ \endrst */
+class scoped_interpreter {
+public:
+    scoped_interpreter(bool init_signal_handlers = true) {
+        initialize_interpreter(init_signal_handlers);
+    }
+
+    scoped_interpreter(const scoped_interpreter &) = delete;
+    scoped_interpreter(scoped_interpreter &&other) noexcept { other.is_valid = false; }
+    scoped_interpreter &operator=(const scoped_interpreter &) = delete;
+    scoped_interpreter &operator=(scoped_interpreter &&) = delete;
+
+    ~scoped_interpreter() {
+        if (is_valid)
+            finalize_interpreter();
+    }
+
+private:
+    bool is_valid = true;
+};
+
+NAMESPACE_END(PYBIND11_NAMESPACE)
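+
+// A minimal end-to-end sketch of the API above (illustrative only, not part of
+// the original header): an embedded module plus a scoped interpreter.
+//
+//     #include <pybind11/embed.h>
+//     namespace py = pybind11;
+//
+//     PYBIND11_EMBEDDED_MODULE(fast_calc, m) {
+//         m.def("add", [](int i, int j) { return i + j; });
+//     }
+//
+//     int main() {
+//         py::scoped_interpreter guard{};              // start the interpreter
+//         auto calc = py::module::import("fast_calc"); // resolves the embedded module
+//         int n = calc.attr("add")(1, 2).cast<int>();  // n == 3
+//     }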
diff --git a/python/src/pybind11/eval.h b/python/src/pybind11/eval.h
new file mode 100644
index 000000000..ea85ba1db
--- /dev/null
+++ b/python/src/pybind11/eval.h
@@ -0,0 +1,117 @@
+/*
+    pybind11/eval.h: Support for evaluating Python expressions and statements
+    from strings and files
+
+    Copyright (c) 2016 Klemens Morgenstern <klemens.morgenstern@ed-chemnitz.de> and
+                       Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "pybind11.h"
+
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+enum eval_mode {
+    /// Evaluate a string containing an isolated expression
+    eval_expr,
+
+    /// Evaluate a string containing a single statement. Returns \c none
+    eval_single_statement,
+
+    /// Evaluate a string containing a sequence of statements. Returns \c none
+    eval_statements
+};
+
+template <eval_mode mode = eval_expr>
+object eval(str expr, object global = globals(), object local = object()) {
+    if (!local)
+        local = global;
+
+    /* PyRun_String does not accept a PyObject / encoding specifier,
+       this seems to be the only alternative */
+    std::string buffer = "# -*- coding: utf-8 -*-\n" + (std::string) expr;
+
+    int start;
+    switch (mode) {
+        case eval_expr:             start = Py_eval_input;   break;
+        case eval_single_statement: start = Py_single_input; break;
+        case eval_statements:       start = Py_file_input;   break;
+        default: pybind11_fail("invalid evaluation mode");
+    }
+
+    PyObject *result = PyRun_String(buffer.c_str(), start, global.ptr(), local.ptr());
+    if (!result)
+        throw error_already_set();
+    return reinterpret_steal<object>(result);
+}
+
+template <eval_mode mode = eval_expr, size_t N>
+object eval(const char (&s)[N], object global = globals(), object local = object()) {
+    /* Support raw string literals by removing common leading whitespace */
+    auto expr = (s[0] == '\n') ? str(module::import("textwrap").attr("dedent")(s))
+                               : str(s);
+    return eval<mode>(expr, global, local);
+}
+
+inline void exec(str expr, object global = globals(), object local = object()) {
+    eval<eval_statements>(expr, global, local);
+}
+
+template <size_t N>
+void exec(const char (&s)[N], object global = globals(), object local = object()) {
+    eval<eval_statements>(s, global, local);
+}
+
+template <eval_mode mode = eval_statements>
+object eval_file(str fname, object global = globals(), object local = object()) {
+    if (!local)
+        local = global;
+
+    int start;
+    switch (mode) {
+        case eval_expr:             start = Py_eval_input;   break;
+        case eval_single_statement: start = Py_single_input; break;
+        case eval_statements:       start = Py_file_input;   break;
+        default: pybind11_fail("invalid evaluation mode");
+    }
+
+    int closeFile = 1;
+    std::string fname_str = (std::string) fname;
+#if PY_VERSION_HEX >= 0x03040000
+    FILE *f = _Py_fopen_obj(fname.ptr(), "r");
+#elif PY_VERSION_HEX >= 0x03000000
+    FILE *f = _Py_fopen(fname.ptr(), "r");
+#else
+    /* No unicode support in open() :( */
+    auto fobj = reinterpret_steal<object>(PyFile_FromString(
+        const_cast<char *>(fname_str.c_str()),
+        const_cast<char *>("r")));
+    FILE *f = nullptr;
+    if (fobj)
+        f = PyFile_AsFile(fobj.ptr());
+    closeFile = 0;
+#endif
+    if (!f) {
+        PyErr_Clear();
+        pybind11_fail("File \"" + fname_str + "\" could not be opened!");
+    }
+
+#if PY_VERSION_HEX < 0x03000000 && defined(PYPY_VERSION)
+    PyObject *result = PyRun_File(f, fname_str.c_str(), start, global.ptr(),
+                                  local.ptr());
+    (void) closeFile;
+#else
+    PyObject *result = PyRun_FileEx(f, fname_str.c_str(), start, global.ptr(),
+                                    local.ptr(), closeFile);
+#endif
+
+    if (!result)
+        throw error_already_set();
+    return reinterpret_steal<object>(result);
+}
+
+NAMESPACE_END(PYBIND11_NAMESPACE)
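+
+// Illustrative usage of the helpers above (not part of the original header);
+// assumes a running interpreter:
+//
+//     auto locals = py::dict();
+//     py::exec("x = 6 * 7", py::globals(), locals);              // eval_statements
+//     int x = py::eval("x", py::globals(), locals).cast<int>();  // eval_expr; x == 42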
diff --git a/python/src/pybind11/functional.h b/python/src/pybind11/functional.h
new file mode 100644
index 000000000..7a0988ab0
--- /dev/null
+++ b/python/src/pybind11/functional.h
@@ -0,0 +1,94 @@
+/*
+    pybind11/functional.h: std::function<> support
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "pybind11.h"
+#include <functional>
+
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+NAMESPACE_BEGIN(detail)
+
+template <typename Return, typename... Args>
+struct type_caster<std::function<Return(Args...)>> {
+    using type = std::function<Return(Args...)>;
+    using retval_type = conditional_t<std::is_same<Return, void>::value, void_type, Return>;
+    using function_type = Return (*) (Args...);
+
+public:
+    bool load(handle src, bool convert) {
+        if (src.is_none()) {
+            // Defer accepting None to other overloads (if we aren't in convert mode):
+            if (!convert) return false;
+            return true;
+        }
+
+        if (!isinstance<function>(src))
+            return false;
+
+        auto func = reinterpret_borrow<function>(src);
+
+        /*
+           When passing a C++ function as an argument to another C++
+           function via Python, every function call would normally involve
+           a full C++ -> Python -> C++ roundtrip, which can be prohibitive.
+           Here, we try to at least detect the case where the function is
+           stateless (i.e. function pointer or lambda function without
+           captured variables), in which case the roundtrip can be avoided.
+         */
+        if (auto cfunc = func.cpp_function()) {
+            auto c = reinterpret_borrow<capsule>(PyCFunction_GET_SELF(cfunc.ptr()));
+            auto rec = (function_record *) c;
+
+            if (rec && rec->is_stateless &&
+                    same_type(typeid(function_type), *reinterpret_cast<const std::type_info *>(rec->data[1]))) {
+                struct capture { function_type f; };
+                value = ((capture *) &rec->data)->f;
+                return true;
+            }
+        }
+
+        // ensure GIL is held during functor destruction
+        struct func_handle {
+            function f;
+            func_handle(function&& f_) : f(std::move(f_)) {}
+            func_handle(const func_handle&) = default;
+            ~func_handle() {
+                gil_scoped_acquire acq;
+                function kill_f(std::move(f));
+            }
+        };
+
+        value = [hfunc = func_handle(std::move(func))](Args... args) -> Return {
+            gil_scoped_acquire acq;
+            object retval(hfunc.f(std::forward<Args>(args)...));
+            /* Visual studio 2015 parser issue: need parentheses around this expression */
+            return (retval.template cast<Return>());
+        };
+        return true;
+    }
+
+    template <typename Func>
+    static handle cast(Func &&f_, return_value_policy policy, handle /* parent */) {
+        if (!f_)
+            return none().inc_ref();
+
+        auto result = f_.template target<function_type>();
+        if (result)
+            return cpp_function(*result, policy).release();
+        else
+            return cpp_function(std::forward<Func>(f_), policy).release();
+    }
+
+    PYBIND11_TYPE_CASTER(type, _("Callable[[") + concat(make_caster<Args>::name...) + _("], ")
+                               + make_caster<retval_type>::name + _("]"));
+};
+
+NAMESPACE_END(detail)
+NAMESPACE_END(PYBIND11_NAMESPACE)
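+
+// Illustrative sketch (not part of the original header) of what the caster above
+// enables; `m` is a py::module being initialized:
+//
+//     m.def("apply_twice", [](const std::function<int(int)> &f, int x) {
+//         return f(f(x));
+//     });
+//
+// A Python callable passed as `f` is wrapped in the GIL-acquiring lambda above;
+// a stateless C++ function that roundtrips through Python is unwrapped again,
+// avoiding the Python-call overhead.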
diff --git a/python/src/pybind11/iostream.h b/python/src/pybind11/iostream.h
new file mode 100644
index 000000000..72baef8fd
--- /dev/null
+++ b/python/src/pybind11/iostream.h
@@ -0,0 +1,207 @@
+/*
+    pybind11/iostream.h -- Tools to assist with redirecting cout and cerr to Python
+
+    Copyright (c) 2017 Henry F. Schreiner
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "pybind11.h"
+
+#include <streambuf>
+#include <ostream>
+#include <string>
+#include <memory>
+#include <iostream>
+
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+NAMESPACE_BEGIN(detail)
+
+// Buffer that writes to Python instead of C++
+class pythonbuf : public std::streambuf {
+private:
+    using traits_type = std::streambuf::traits_type;
+
+    const size_t buf_size;
+    std::unique_ptr<char[]> d_buffer;
+    object pywrite;
+    object pyflush;
+
+    int overflow(int c) {
+        if (!traits_type::eq_int_type(c, traits_type::eof())) {
+            *pptr() = traits_type::to_char_type(c);
+            pbump(1);
+        }
+        return sync() == 0 ? traits_type::not_eof(c) : traits_type::eof();
+    }
+
+    int sync() {
+        if (pbase() != pptr()) {
+            // This subtraction cannot be negative, so dropping the sign
+            str line(pbase(), static_cast<size_t>(pptr() - pbase()));
+
+            {
+                gil_scoped_acquire tmp;
+                pywrite(line);
+                pyflush();
+            }
+
+            setp(pbase(), epptr());
+        }
+        return 0;
+    }
+
+public:
+
+    pythonbuf(object pyostream, size_t buffer_size = 1024)
+        : buf_size(buffer_size),
+          d_buffer(new char[buf_size]),
+          pywrite(pyostream.attr("write")),
+          pyflush(pyostream.attr("flush")) {
+        setp(d_buffer.get(), d_buffer.get() + buf_size - 1);
+    }
+
+    /// Sync before destroy
+    ~pythonbuf() {
+        sync();
+    }
+};
+
+NAMESPACE_END(detail)
+
+
+/** \rst
+    This is a move-only guard that redirects output.
+
+    .. code-block:: cpp
+
+        #include <pybind11/iostream.h>
+
+        ...
+
+        {
+            py::scoped_ostream_redirect output;
+            std::cout << "Hello, World!"; // Python stdout
+        } // <-- return std::cout to normal
+
+    You can explicitly pass the C++ stream and the Python object,
+    for example to guard stderr instead.
+
+    .. code-block:: cpp
+
+        {
+            py::scoped_ostream_redirect output{std::cerr, py::module::import("sys").attr("stderr")};
+            std::cerr << "Hello, World!";
+        }
+ \endrst */
+class scoped_ostream_redirect {
+protected:
+    std::streambuf *old;
+    std::ostream &costream;
+    detail::pythonbuf buffer;
+
+public:
+    scoped_ostream_redirect(
+            std::ostream &costream = std::cout,
+            object pyostream = module::import("sys").attr("stdout"))
+        : costream(costream), buffer(pyostream) {
+        old = costream.rdbuf(&buffer);
+    }
+
+    ~scoped_ostream_redirect() {
+        costream.rdbuf(old);
+    }
+
+    scoped_ostream_redirect(const scoped_ostream_redirect &) = delete;
+    scoped_ostream_redirect(scoped_ostream_redirect &&other) = default;
+    scoped_ostream_redirect &operator=(const scoped_ostream_redirect &) = delete;
+    scoped_ostream_redirect &operator=(scoped_ostream_redirect &&) = delete;
+};
+
+
+/** \rst
+    Like `scoped_ostream_redirect`, but redirects cerr by default. This class
+    is provided primarily to make ``py::call_guard`` easier to use.
+
+    .. code-block:: cpp
+
+        m.def("noisy_func", &noisy_func,
+              py::call_guard<scoped_estream_redirect>());
+
+\endrst */
+class scoped_estream_redirect : public scoped_ostream_redirect {
+public:
+    scoped_estream_redirect(
+            std::ostream &costream = std::cerr,
+            object pyostream = module::import("sys").attr("stderr"))
+        : scoped_ostream_redirect(costream, pyostream) {}
+};
+
+
+NAMESPACE_BEGIN(detail)
+
+// Class to redirect output as a context manager. C++ backend.
+class OstreamRedirect {
+    bool do_stdout_;
+    bool do_stderr_;
+    std::unique_ptr<scoped_ostream_redirect> redirect_stdout;
+    std::unique_ptr<scoped_estream_redirect> redirect_stderr;
+
+public:
+    OstreamRedirect(bool do_stdout = true, bool do_stderr = true)
+        : do_stdout_(do_stdout), do_stderr_(do_stderr) {}
+
+    void enter() {
+        if (do_stdout_)
+            redirect_stdout.reset(new scoped_ostream_redirect());
+        if (do_stderr_)
+            redirect_stderr.reset(new scoped_estream_redirect());
+    }
+
+    void exit() {
+        redirect_stdout.reset();
+        redirect_stderr.reset();
+    }
+};
+
+NAMESPACE_END(detail)
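+
+// Illustrative note (not part of the original header): detail::OstreamRedirect is
+// the C++ backend for the Python context manager registered by add_ostream_redirect()
+// below; the guards live in unique_ptrs so the actual stream redirection is deferred
+// until __enter__ and undone on __exit__. Hypothetical binding code:
+//
+//     py::add_ostream_redirect(m, "ostream_redirect");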
+
+/** \rst
+    This is a helper function to add a C++ redirect context manager to Python
+    instead of using a C++ guard. To use it, add the following to your binding code:
+
+    .. code-block:: cpp
+
+        #include <pybind11/iostream.h>
+
+        ...
+
+        py::add_ostream_redirect(m, "ostream_redirect");
+
+    You now have a Python context manager that redirects your output:
+
+    .. code-block:: python
+
+        with m.ostream_redirect():
+            m.print_to_cout_function()
+
+    This manager can optionally be told which streams to operate on:
+
+    .. code-block:: python
+
+        with m.ostream_redirect(stdout=True, stderr=True):
+            m.noisy_function_with_error_printing()
+
+ \endrst */
+inline class_<detail::OstreamRedirect> add_ostream_redirect(module m, std::string name = "ostream_redirect") {
+    return class_<detail::OstreamRedirect>(m, name.c_str(), module_local())
+        .def(init<bool, bool>(), arg("stdout")=true, arg("stderr")=true)
+        .def("__enter__", &detail::OstreamRedirect::enter)
+        .def("__exit__", [](detail::OstreamRedirect &self_, args) { self_.exit(); });
+}
+
+NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/python/src/pybind11/numpy.h b/python/src/pybind11/numpy.h
new file mode 100644
index 000000000..b2a02e024
--- /dev/null
+++ b/python/src/pybind11/numpy.h
@@ -0,0 +1,1610 @@
+/*
+    pybind11/numpy.h: Basic NumPy support, vectorize() wrapper
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "pybind11.h"
+#include "complex.h"
+#include <numeric>
+#include <algorithm>
+#include <array>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <initializer_list>
+#include <functional>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <typeindex>
+#include <utility>
+#include <vector>
+
+#if defined(_MSC_VER)
+#  pragma warning(push)
+#  pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
+#endif
+
+/* This will be true on all flat address space platforms and allows us to reduce the
+   whole npy_intp / ssize_t / Py_intptr_t business down to just ssize_t for all size
+   and dimension types (e.g. shape, strides, indexing), instead of inflicting this
+   upon the library user. */
+static_assert(sizeof(ssize_t) == sizeof(Py_intptr_t), "ssize_t != Py_intptr_t");
+
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+class array; // Forward declaration
+
+NAMESPACE_BEGIN(detail)
+template <typename type, typename SFINAE = void> struct npy_format_descriptor;
+
+struct PyArrayDescr_Proxy {
+    PyObject_HEAD
+    PyObject *typeobj;
+    char kind;
+    char type;
+    char byteorder;
+    char flags;
+    int type_num;
+    int elsize;
+    int alignment;
+    char *subarray;
+    PyObject *fields;
+    PyObject *names;
+};
+
+struct PyArray_Proxy {
+    PyObject_HEAD
+    char *data;
+    int nd;
+    ssize_t *dimensions;
+    ssize_t *strides;
+    PyObject *base;
+    PyObject *descr;
+    int flags;
+};
+
+struct PyVoidScalarObject_Proxy {
+    PyObject_VAR_HEAD
+    char *obval;
+    PyArrayDescr_Proxy *descr;
+    int flags;
+    PyObject *base;
+};
+
+struct numpy_type_info {
+    PyObject* dtype_ptr;
+    std::string format_str;
+};
+
+struct numpy_internals {
+    std::unordered_map<std::type_index, numpy_type_info> registered_dtypes;
+
+    numpy_type_info *get_type_info(const std::type_info& tinfo, bool throw_if_missing = true) {
+        auto it = registered_dtypes.find(std::type_index(tinfo));
+        if (it != registered_dtypes.end())
+            return &(it->second);
+        if (throw_if_missing)
+            pybind11_fail(std::string("NumPy type info missing for ") + tinfo.name());
+        return nullptr;
+    }
+
+    template<typename T> numpy_type_info *get_type_info(bool throw_if_missing = true) {
+        return get_type_info(typeid(typename std::remove_cv<T>::type), throw_if_missing);
+    }
+};
+
+inline PYBIND11_NOINLINE void load_numpy_internals(numpy_internals* &ptr) {
+    ptr = &get_or_create_shared_data<numpy_internals>("_numpy_internals");
+}
+
+inline numpy_internals& get_numpy_internals() {
+    static numpy_internals* ptr = nullptr;
+    if (!ptr)
+        load_numpy_internals(ptr);
+    return *ptr;
+}
+
+struct npy_api {
+    enum constants {
+        NPY_ARRAY_C_CONTIGUOUS_ = 0x0001,
+        NPY_ARRAY_F_CONTIGUOUS_ = 0x0002,
+        NPY_ARRAY_OWNDATA_ = 0x0004,
+        NPY_ARRAY_FORCECAST_ = 0x0010,
+        NPY_ARRAY_ENSUREARRAY_ = 0x0040,
+        NPY_ARRAY_ALIGNED_ = 0x0100,
+        NPY_ARRAY_WRITEABLE_ = 0x0400,
+        NPY_BOOL_ = 0,
+        NPY_BYTE_, NPY_UBYTE_,
+        NPY_SHORT_, NPY_USHORT_,
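+        // (Note, not in the original header: these constants mirror NumPy's NPY_TYPES
+        // enum from numpy/ndarraytypes.h; the explicit `= 17` below pins NPY_OBJECT_
+        // to its NumPy value.)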
+ NPY_INT_, NPY_UINT_, + NPY_LONG_, NPY_ULONG_, + NPY_LONGLONG_, NPY_ULONGLONG_, + NPY_FLOAT_, NPY_DOUBLE_, NPY_LONGDOUBLE_, + NPY_CFLOAT_, NPY_CDOUBLE_, NPY_CLONGDOUBLE_, + NPY_OBJECT_ = 17, + NPY_STRING_, NPY_UNICODE_, NPY_VOID_ + }; + + typedef struct { + Py_intptr_t *ptr; + int len; + } PyArray_Dims; + + static npy_api& get() { + static npy_api api = lookup(); + return api; + } + + bool PyArray_Check_(PyObject *obj) const { + return (bool) PyObject_TypeCheck(obj, PyArray_Type_); + } + bool PyArrayDescr_Check_(PyObject *obj) const { + return (bool) PyObject_TypeCheck(obj, PyArrayDescr_Type_); + } + + unsigned int (*PyArray_GetNDArrayCFeatureVersion_)(); + PyObject *(*PyArray_DescrFromType_)(int); + PyObject *(*PyArray_NewFromDescr_) + (PyTypeObject *, PyObject *, int, Py_intptr_t *, + Py_intptr_t *, void *, int, PyObject *); + PyObject *(*PyArray_DescrNewFromType_)(int); + int (*PyArray_CopyInto_)(PyObject *, PyObject *); + PyObject *(*PyArray_NewCopy_)(PyObject *, int); + PyTypeObject *PyArray_Type_; + PyTypeObject *PyVoidArrType_Type_; + PyTypeObject *PyArrayDescr_Type_; + PyObject *(*PyArray_DescrFromScalar_)(PyObject *); + PyObject *(*PyArray_FromAny_) (PyObject *, PyObject *, int, int, int, PyObject *); + int (*PyArray_DescrConverter_) (PyObject *, PyObject **); + bool (*PyArray_EquivTypes_) (PyObject *, PyObject *); + int (*PyArray_GetArrayParamsFromObject_)(PyObject *, PyObject *, char, PyObject **, int *, + Py_ssize_t *, PyObject **, PyObject *); + PyObject *(*PyArray_Squeeze_)(PyObject *); + int (*PyArray_SetBaseObject_)(PyObject *, PyObject *); + PyObject* (*PyArray_Resize_)(PyObject*, PyArray_Dims*, int, int); +private: + enum functions { + API_PyArray_GetNDArrayCFeatureVersion = 211, + API_PyArray_Type = 2, + API_PyArrayDescr_Type = 3, + API_PyVoidArrType_Type = 39, + API_PyArray_DescrFromType = 45, + API_PyArray_DescrFromScalar = 57, + API_PyArray_FromAny = 69, + API_PyArray_Resize = 80, + API_PyArray_CopyInto = 82, + API_PyArray_NewCopy = 85, + API_PyArray_NewFromDescr = 94, + API_PyArray_DescrNewFromType = 9, + API_PyArray_DescrConverter = 174, + API_PyArray_EquivTypes = 182, + API_PyArray_GetArrayParamsFromObject = 278, + API_PyArray_Squeeze = 136, + API_PyArray_SetBaseObject = 282 + }; + + static npy_api lookup() { + module m = module::import("numpy.core.multiarray"); + auto c = m.attr("_ARRAY_API"); +#if PY_MAJOR_VERSION >= 3 + void **api_ptr = (void **) PyCapsule_GetPointer(c.ptr(), NULL); +#else + void **api_ptr = (void **) PyCObject_AsVoidPtr(c.ptr()); +#endif + npy_api api; +#define DECL_NPY_API(Func) api.Func##_ = (decltype(api.Func##_)) api_ptr[API_##Func]; + DECL_NPY_API(PyArray_GetNDArrayCFeatureVersion); + if (api.PyArray_GetNDArrayCFeatureVersion_() < 0x7) + pybind11_fail("pybind11 numpy support requires numpy >= 1.7.0"); + DECL_NPY_API(PyArray_Type); + DECL_NPY_API(PyVoidArrType_Type); + DECL_NPY_API(PyArrayDescr_Type); + DECL_NPY_API(PyArray_DescrFromType); + DECL_NPY_API(PyArray_DescrFromScalar); + DECL_NPY_API(PyArray_FromAny); + DECL_NPY_API(PyArray_Resize); + DECL_NPY_API(PyArray_CopyInto); + DECL_NPY_API(PyArray_NewCopy); + DECL_NPY_API(PyArray_NewFromDescr); + DECL_NPY_API(PyArray_DescrNewFromType); + DECL_NPY_API(PyArray_DescrConverter); + DECL_NPY_API(PyArray_EquivTypes); + DECL_NPY_API(PyArray_GetArrayParamsFromObject); + DECL_NPY_API(PyArray_Squeeze); + DECL_NPY_API(PyArray_SetBaseObject); +#undef DECL_NPY_API + return api; + } +}; + +inline PyArray_Proxy* array_proxy(void* ptr) { + return reinterpret_cast(ptr); +} + +inline const PyArray_Proxy* 
array_proxy(const void* ptr) { + return reinterpret_cast(ptr); +} + +inline PyArrayDescr_Proxy* array_descriptor_proxy(PyObject* ptr) { + return reinterpret_cast(ptr); +} + +inline const PyArrayDescr_Proxy* array_descriptor_proxy(const PyObject* ptr) { + return reinterpret_cast(ptr); +} + +inline bool check_flags(const void* ptr, int flag) { + return (flag == (array_proxy(ptr)->flags & flag)); +} + +template struct is_std_array : std::false_type { }; +template struct is_std_array> : std::true_type { }; +template struct is_complex : std::false_type { }; +template struct is_complex> : std::true_type { }; + +template struct array_info_scalar { + typedef T type; + static constexpr bool is_array = false; + static constexpr bool is_empty = false; + static constexpr auto extents = _(""); + static void append_extents(list& /* shape */) { } +}; +// Computes underlying type and a comma-separated list of extents for array +// types (any mix of std::array and built-in arrays). An array of char is +// treated as scalar because it gets special handling. +template struct array_info : array_info_scalar { }; +template struct array_info> { + using type = typename array_info::type; + static constexpr bool is_array = true; + static constexpr bool is_empty = (N == 0) || array_info::is_empty; + static constexpr size_t extent = N; + + // appends the extents to shape + static void append_extents(list& shape) { + shape.append(N); + array_info::append_extents(shape); + } + + static constexpr auto extents = _::is_array>( + concat(_(), array_info::extents), _() + ); +}; +// For numpy we have special handling for arrays of characters, so we don't include +// the size in the array extents. +template struct array_info : array_info_scalar { }; +template struct array_info> : array_info_scalar> { }; +template struct array_info : array_info> { }; +template using remove_all_extents_t = typename array_info::type; + +template using is_pod_struct = all_of< + std::is_standard_layout, // since we're accessing directly in memory we need a standard layout type +#if !defined(__GNUG__) || defined(_LIBCPP_VERSION) || defined(_GLIBCXX_USE_CXX11_ABI) + // _GLIBCXX_USE_CXX11_ABI indicates that we're using libstdc++ from GCC 5 or newer, independent + // of the actual compiler (Clang can also use libstdc++, but it always defines __GNUC__ == 4). + std::is_trivially_copyable, +#else + // GCC 4 doesn't implement is_trivially_copyable, so approximate it + std::is_trivially_destructible, + satisfies_any_of, +#endif + satisfies_none_of +>; + +template ssize_t byte_offset_unsafe(const Strides &) { return 0; } +template +ssize_t byte_offset_unsafe(const Strides &strides, ssize_t i, Ix... index) { + return i * strides[Dim] + byte_offset_unsafe(strides, index...); +} + +/** + * Proxy class providing unsafe, unchecked const access to array data. This is constructed through + * the `unchecked()` method of `array` or the `unchecked()` method of `array_t`. `Dims` + * will be -1 for dimensions determined at runtime. + */ +template +class unchecked_reference { +protected: + static constexpr bool Dynamic = Dims < 0; + const unsigned char *data_; + // Storing the shape & strides in local variables (i.e. 
these arrays) allows the compiler to + // make large performance gains on big, nested loops, but requires compile-time dimensions + conditional_t> + shape_, strides_; + const ssize_t dims_; + + friend class pybind11::array; + // Constructor for compile-time dimensions: + template + unchecked_reference(const void *data, const ssize_t *shape, const ssize_t *strides, enable_if_t) + : data_{reinterpret_cast(data)}, dims_{Dims} { + for (size_t i = 0; i < (size_t) dims_; i++) { + shape_[i] = shape[i]; + strides_[i] = strides[i]; + } + } + // Constructor for runtime dimensions: + template + unchecked_reference(const void *data, const ssize_t *shape, const ssize_t *strides, enable_if_t dims) + : data_{reinterpret_cast(data)}, shape_{shape}, strides_{strides}, dims_{dims} {} + +public: + /** + * Unchecked const reference access to data at the given indices. For a compile-time known + * number of dimensions, this requires the correct number of arguments; for run-time + * dimensionality, this is not checked (and so is up to the caller to use safely). + */ + template const T &operator()(Ix... index) const { + static_assert(ssize_t{sizeof...(Ix)} == Dims || Dynamic, + "Invalid number of indices for unchecked array reference"); + return *reinterpret_cast(data_ + byte_offset_unsafe(strides_, ssize_t(index)...)); + } + /** + * Unchecked const reference access to data; this operator only participates if the reference + * is to a 1-dimensional array. When present, this is exactly equivalent to `obj(index)`. + */ + template > + const T &operator[](ssize_t index) const { return operator()(index); } + + /// Pointer access to the data at the given indices. + template const T *data(Ix... ix) const { return &operator()(ssize_t(ix)...); } + + /// Returns the item size, i.e. sizeof(T) + constexpr static ssize_t itemsize() { return sizeof(T); } + + /// Returns the shape (i.e. size) of dimension `dim` + ssize_t shape(ssize_t dim) const { return shape_[(size_t) dim]; } + + /// Returns the number of dimensions of the array + ssize_t ndim() const { return dims_; } + + /// Returns the total number of elements in the referenced array, i.e. the product of the shapes + template + enable_if_t size() const { + return std::accumulate(shape_.begin(), shape_.end(), (ssize_t) 1, std::multiplies()); + } + template + enable_if_t size() const { + return std::accumulate(shape_, shape_ + ndim(), (ssize_t) 1, std::multiplies()); + } + + /// Returns the total number of bytes used by the referenced data. Note that the actual span in + /// memory may be larger if the referenced array has non-contiguous strides (e.g. for a slice). + ssize_t nbytes() const { + return size() * itemsize(); + } +}; + +template +class unchecked_mutable_reference : public unchecked_reference { + friend class pybind11::array; + using ConstBase = unchecked_reference; + using ConstBase::ConstBase; + using ConstBase::Dynamic; +public: + /// Mutable, unchecked access to data at the given indices. + template T& operator()(Ix... index) { + static_assert(ssize_t{sizeof...(Ix)} == Dims || Dynamic, + "Invalid number of indices for unchecked array reference"); + return const_cast(ConstBase::operator()(index...)); + } + /** + * Mutable, unchecked access data at the given index; this operator only participates if the + * reference is to a 1-dimensional array (or has runtime dimensions). When present, this is + * exactly equivalent to `obj(index)`. 
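+     *
+     * (Illustrative example, not in the original: for a 1-D proxy `r`,
+     * `r[i] = 42;` behaves exactly like `r(i) = 42;`.)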
+ */ + template > + T &operator[](ssize_t index) { return operator()(index); } + + /// Mutable pointer access to the data at the given indices. + template T *mutable_data(Ix... ix) { return &operator()(ssize_t(ix)...); } +}; + +template +struct type_caster> { + static_assert(Dim == 0 && Dim > 0 /* always fail */, "unchecked array proxy object is not castable"); +}; +template +struct type_caster> : type_caster> {}; + +NAMESPACE_END(detail) + +class dtype : public object { +public: + PYBIND11_OBJECT_DEFAULT(dtype, object, detail::npy_api::get().PyArrayDescr_Check_); + + explicit dtype(const buffer_info &info) { + dtype descr(_dtype_from_pep3118()(PYBIND11_STR_TYPE(info.format))); + // If info.itemsize == 0, use the value calculated from the format string + m_ptr = descr.strip_padding(info.itemsize ? info.itemsize : descr.itemsize()).release().ptr(); + } + + explicit dtype(const std::string &format) { + m_ptr = from_args(pybind11::str(format)).release().ptr(); + } + + dtype(const char *format) : dtype(std::string(format)) { } + + dtype(list names, list formats, list offsets, ssize_t itemsize) { + dict args; + args["names"] = names; + args["formats"] = formats; + args["offsets"] = offsets; + args["itemsize"] = pybind11::int_(itemsize); + m_ptr = from_args(args).release().ptr(); + } + + /// This is essentially the same as calling numpy.dtype(args) in Python. + static dtype from_args(object args) { + PyObject *ptr = nullptr; + if (!detail::npy_api::get().PyArray_DescrConverter_(args.ptr(), &ptr) || !ptr) + throw error_already_set(); + return reinterpret_steal(ptr); + } + + /// Return dtype associated with a C++ type. + template static dtype of() { + return detail::npy_format_descriptor::type>::dtype(); + } + + /// Size of the data type in bytes. + ssize_t itemsize() const { + return detail::array_descriptor_proxy(m_ptr)->elsize; + } + + /// Returns true for structured data types. + bool has_fields() const { + return detail::array_descriptor_proxy(m_ptr)->names != nullptr; + } + + /// Single-character type code. + char kind() const { + return detail::array_descriptor_proxy(m_ptr)->kind; + } + +private: + static object _dtype_from_pep3118() { + static PyObject *obj = module::import("numpy.core._internal") + .attr("_dtype_from_pep3118").cast().release().ptr(); + return reinterpret_borrow(obj); + } + + dtype strip_padding(ssize_t itemsize) { + // Recursively strip all void fields with empty names that are generated for + // padding fields (as of NumPy v1.11). 
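+        // (Illustrative note, not in the original: NumPy represents such padding as
+        // fields with an empty name and a void format, e.g. ('', '|V4') for four
+        // padding bytes; the loop below skips exactly those entries.)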
+ if (!has_fields()) + return *this; + + struct field_descr { PYBIND11_STR_TYPE name; object format; pybind11::int_ offset; }; + std::vector field_descriptors; + + for (auto field : attr("fields").attr("items")()) { + auto spec = field.cast(); + auto name = spec[0].cast(); + auto format = spec[1].cast()[0].cast(); + auto offset = spec[1].cast()[1].cast(); + if (!len(name) && format.kind() == 'V') + continue; + field_descriptors.push_back({(PYBIND11_STR_TYPE) name, format.strip_padding(format.itemsize()), offset}); + } + + std::sort(field_descriptors.begin(), field_descriptors.end(), + [](const field_descr& a, const field_descr& b) { + return a.offset.cast() < b.offset.cast(); + }); + + list names, formats, offsets; + for (auto& descr : field_descriptors) { + names.append(descr.name); + formats.append(descr.format); + offsets.append(descr.offset); + } + return dtype(names, formats, offsets, itemsize); + } +}; + +class array : public buffer { +public: + PYBIND11_OBJECT_CVT(array, buffer, detail::npy_api::get().PyArray_Check_, raw_array) + + enum { + c_style = detail::npy_api::NPY_ARRAY_C_CONTIGUOUS_, + f_style = detail::npy_api::NPY_ARRAY_F_CONTIGUOUS_, + forcecast = detail::npy_api::NPY_ARRAY_FORCECAST_ + }; + + array() : array({{0}}, static_cast(nullptr)) {} + + using ShapeContainer = detail::any_container; + using StridesContainer = detail::any_container; + + // Constructs an array taking shape/strides from arbitrary container types + array(const pybind11::dtype &dt, ShapeContainer shape, StridesContainer strides, + const void *ptr = nullptr, handle base = handle()) { + + if (strides->empty()) + *strides = c_strides(*shape, dt.itemsize()); + + auto ndim = shape->size(); + if (ndim != strides->size()) + pybind11_fail("NumPy: shape ndim doesn't match strides ndim"); + auto descr = dt; + + int flags = 0; + if (base && ptr) { + if (isinstance(base)) + /* Copy flags from base (except ownership bit) */ + flags = reinterpret_borrow(base).flags() & ~detail::npy_api::NPY_ARRAY_OWNDATA_; + else + /* Writable by default, easy to downgrade later on if needed */ + flags = detail::npy_api::NPY_ARRAY_WRITEABLE_; + } + + auto &api = detail::npy_api::get(); + auto tmp = reinterpret_steal(api.PyArray_NewFromDescr_( + api.PyArray_Type_, descr.release().ptr(), (int) ndim, shape->data(), strides->data(), + const_cast(ptr), flags, nullptr)); + if (!tmp) + throw error_already_set(); + if (ptr) { + if (base) { + api.PyArray_SetBaseObject_(tmp.ptr(), base.inc_ref().ptr()); + } else { + tmp = reinterpret_steal(api.PyArray_NewCopy_(tmp.ptr(), -1 /* any order */)); + } + } + m_ptr = tmp.release().ptr(); + } + + array(const pybind11::dtype &dt, ShapeContainer shape, const void *ptr = nullptr, handle base = handle()) + : array(dt, std::move(shape), {}, ptr, base) { } + + template ::value && !std::is_same::value>> + array(const pybind11::dtype &dt, T count, const void *ptr = nullptr, handle base = handle()) + : array(dt, {{count}}, ptr, base) { } + + template + array(ShapeContainer shape, StridesContainer strides, const T *ptr, handle base = handle()) + : array(pybind11::dtype::of(), std::move(shape), std::move(strides), ptr, base) { } + + template + array(ShapeContainer shape, const T *ptr, handle base = handle()) + : array(std::move(shape), {}, ptr, base) { } + + template + explicit array(ssize_t count, const T *ptr, handle base = handle()) : array({count}, {}, ptr, base) { } + + explicit array(const buffer_info &info) + : array(pybind11::dtype(info), info.shape, info.strides, info.ptr) { } + + /// Array descriptor 
(dtype) + pybind11::dtype dtype() const { + return reinterpret_borrow(detail::array_proxy(m_ptr)->descr); + } + + /// Total number of elements + ssize_t size() const { + return std::accumulate(shape(), shape() + ndim(), (ssize_t) 1, std::multiplies()); + } + + /// Byte size of a single element + ssize_t itemsize() const { + return detail::array_descriptor_proxy(detail::array_proxy(m_ptr)->descr)->elsize; + } + + /// Total number of bytes + ssize_t nbytes() const { + return size() * itemsize(); + } + + /// Number of dimensions + ssize_t ndim() const { + return detail::array_proxy(m_ptr)->nd; + } + + /// Base object + object base() const { + return reinterpret_borrow(detail::array_proxy(m_ptr)->base); + } + + /// Dimensions of the array + const ssize_t* shape() const { + return detail::array_proxy(m_ptr)->dimensions; + } + + /// Dimension along a given axis + ssize_t shape(ssize_t dim) const { + if (dim >= ndim()) + fail_dim_check(dim, "invalid axis"); + return shape()[dim]; + } + + /// Strides of the array + const ssize_t* strides() const { + return detail::array_proxy(m_ptr)->strides; + } + + /// Stride along a given axis + ssize_t strides(ssize_t dim) const { + if (dim >= ndim()) + fail_dim_check(dim, "invalid axis"); + return strides()[dim]; + } + + /// Return the NumPy array flags + int flags() const { + return detail::array_proxy(m_ptr)->flags; + } + + /// If set, the array is writeable (otherwise the buffer is read-only) + bool writeable() const { + return detail::check_flags(m_ptr, detail::npy_api::NPY_ARRAY_WRITEABLE_); + } + + /// If set, the array owns the data (will be freed when the array is deleted) + bool owndata() const { + return detail::check_flags(m_ptr, detail::npy_api::NPY_ARRAY_OWNDATA_); + } + + /// Pointer to the contained data. If index is not provided, points to the + /// beginning of the buffer. May throw if the index would lead to out of bounds access. + template const void* data(Ix... index) const { + return static_cast(detail::array_proxy(m_ptr)->data + offset_at(index...)); + } + + /// Mutable pointer to the contained data. If index is not provided, points to the + /// beginning of the buffer. May throw if the index would lead to out of bounds access. + /// May throw if the array is not writeable. + template void* mutable_data(Ix... index) { + check_writeable(); + return static_cast(detail::array_proxy(m_ptr)->data + offset_at(index...)); + } + + /// Byte offset from beginning of the array to a given index (full or partial). + /// May throw if the index would lead to out of bounds access. + template ssize_t offset_at(Ix... index) const { + if ((ssize_t) sizeof...(index) > ndim()) + fail_dim_check(sizeof...(index), "too many indices for an array"); + return byte_offset(ssize_t(index)...); + } + + ssize_t offset_at() const { return 0; } + + /// Item count from beginning of the array to a given index (full or partial). + /// May throw if the index would lead to out of bounds access. + template ssize_t index_at(Ix... index) const { + return offset_at(index...) / itemsize(); + } + + /** + * Returns a proxy object that provides access to the array's data without bounds or + * dimensionality checking. Will throw if the array is missing the `writeable` flag. Use with + * care: the array must not be destroyed or reshaped for the duration of the returned object, + * and the caller must take care not to access invalid dimensions or dimension indices. 
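+     *
+     * A short illustrative sketch (not in the original; assumes `arr` is a
+     * 2-D NumPy array of doubles):
+     *
+     *     auto r = arr.mutable_unchecked<double, 2>();
+     *     for (ssize_t i = 0; i < r.shape(0); i++)
+     *         for (ssize_t j = 0; j < r.shape(1); j++)
+     *             r(i, j) *= 2;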
+     */
+    template <typename T, ssize_t Dims = -1> detail::unchecked_mutable_reference<T, Dims> mutable_unchecked() & {
+        if (Dims >= 0 && ndim() != Dims)
+            throw std::domain_error("array has incorrect number of dimensions: " + std::to_string(ndim()) +
+                    "; expected " + std::to_string(Dims));
+        return detail::unchecked_mutable_reference<T, Dims>(mutable_data(), shape(), strides(), ndim());
+    }
+
+    /**
+     * Returns a proxy object that provides const access to the array's data without bounds or
+     * dimensionality checking. Unlike `mutable_unchecked()`, this does not require that the
+     * underlying array have the `writeable` flag. Use with care: the array must not be destroyed or
+     * reshaped for the duration of the returned object, and the caller must take care not to access
+     * invalid dimensions or dimension indices.
+     */
+    template <typename T, ssize_t Dims = -1> detail::unchecked_reference<T, Dims> unchecked() const & {
+        if (Dims >= 0 && ndim() != Dims)
+            throw std::domain_error("array has incorrect number of dimensions: " + std::to_string(ndim()) +
+                    "; expected " + std::to_string(Dims));
+        return detail::unchecked_reference<T, Dims>(data(), shape(), strides(), ndim());
+    }
+
+    /// Return a new view with all of the dimensions of length 1 removed
+    array squeeze() {
+        auto& api = detail::npy_api::get();
+        return reinterpret_steal<array>(api.PyArray_Squeeze_(m_ptr));
+    }
+
+    /// Resize array to given shape.
+    /// If refcheck is true and more than one reference exists to this array,
+    /// then resize will succeed only if it makes a reshape, i.e. the original size doesn't change.
+    void resize(ShapeContainer new_shape, bool refcheck = true) {
+        detail::npy_api::PyArray_Dims d = {
+            new_shape->data(), int(new_shape->size())
+        };
+        // try to resize; the ordering param is set to -1 because it is not used anyway
+        object new_array = reinterpret_steal<object>(
+            detail::npy_api::get().PyArray_Resize_(m_ptr, &d, int(refcheck), -1)
+        );
+        if (!new_array) throw error_already_set();
+        if (isinstance<array>(new_array)) { *this = std::move(new_array); }
+    }
+
+    /// Ensure that the argument is a NumPy array.
+    /// In case of an error, nullptr is returned and the Python error is cleared.
+    static array ensure(handle h, int ExtraFlags = 0) {
+        auto result = reinterpret_steal<array>(raw_array(h.ptr(), ExtraFlags));
+        if (!result)
+            PyErr_Clear();
+        return result;
+    }
+
+protected:
+    template<typename, typename> friend struct detail::npy_format_descriptor;
+
+    void fail_dim_check(ssize_t dim, const std::string& msg) const {
+        throw index_error(msg + ": " + std::to_string(dim) +
+                          " (ndim = " + std::to_string(ndim()) + ")");
+    }
+
+    template<typename... Ix> ssize_t byte_offset(Ix... index) const {
+        check_dimensions(index...);
+        return detail::byte_offset_unsafe(strides(), ssize_t(index)...);
+    }
+
+    void check_writeable() const {
+        if (!writeable())
+            throw std::domain_error("array is not writeable");
+    }
+
+    // Default, C-style strides
+    static std::vector<ssize_t> c_strides(const std::vector<ssize_t> &shape, ssize_t itemsize) {
+        auto ndim = shape.size();
+        std::vector<ssize_t> strides(ndim, itemsize);
+        if (ndim > 0)
+            for (size_t i = ndim - 1; i > 0; --i)
+                strides[i - 1] = strides[i] * shape[i];
+        return strides;
+    }
+
+    // F-style strides; default when constructing an array_t with `ExtraFlags & f_style`
+    static std::vector<ssize_t> f_strides(const std::vector<ssize_t> &shape, ssize_t itemsize) {
+        auto ndim = shape.size();
+        std::vector<ssize_t> strides(ndim, itemsize);
+        for (size_t i = 1; i < ndim; ++i)
+            strides[i] = strides[i - 1] * shape[i - 1];
+        return strides;
+    }
+
+    template<typename... Ix> void check_dimensions(Ix... 
index) const { + check_dimensions_impl(ssize_t(0), shape(), ssize_t(index)...); + } + + void check_dimensions_impl(ssize_t, const ssize_t*) const { } + + template void check_dimensions_impl(ssize_t axis, const ssize_t* shape, ssize_t i, Ix... index) const { + if (i >= *shape) { + throw index_error(std::string("index ") + std::to_string(i) + + " is out of bounds for axis " + std::to_string(axis) + + " with size " + std::to_string(*shape)); + } + check_dimensions_impl(axis + 1, shape + 1, index...); + } + + /// Create array from any object -- always returns a new reference + static PyObject *raw_array(PyObject *ptr, int ExtraFlags = 0) { + if (ptr == nullptr) { + PyErr_SetString(PyExc_ValueError, "cannot create a pybind11::array from a nullptr"); + return nullptr; + } + return detail::npy_api::get().PyArray_FromAny_( + ptr, nullptr, 0, 0, detail::npy_api::NPY_ARRAY_ENSUREARRAY_ | ExtraFlags, nullptr); + } +}; + +template class array_t : public array { +private: + struct private_ctor {}; + // Delegating constructor needed when both moving and accessing in the same constructor + array_t(private_ctor, ShapeContainer &&shape, StridesContainer &&strides, const T *ptr, handle base) + : array(std::move(shape), std::move(strides), ptr, base) {} +public: + static_assert(!detail::array_info::is_array, "Array types cannot be used with array_t"); + + using value_type = T; + + array_t() : array(0, static_cast(nullptr)) {} + array_t(handle h, borrowed_t) : array(h, borrowed_t{}) { } + array_t(handle h, stolen_t) : array(h, stolen_t{}) { } + + PYBIND11_DEPRECATED("Use array_t::ensure() instead") + array_t(handle h, bool is_borrowed) : array(raw_array_t(h.ptr()), stolen_t{}) { + if (!m_ptr) PyErr_Clear(); + if (!is_borrowed) Py_XDECREF(h.ptr()); + } + + array_t(const object &o) : array(raw_array_t(o.ptr()), stolen_t{}) { + if (!m_ptr) throw error_already_set(); + } + + explicit array_t(const buffer_info& info) : array(info) { } + + array_t(ShapeContainer shape, StridesContainer strides, const T *ptr = nullptr, handle base = handle()) + : array(std::move(shape), std::move(strides), ptr, base) { } + + explicit array_t(ShapeContainer shape, const T *ptr = nullptr, handle base = handle()) + : array_t(private_ctor{}, std::move(shape), + ExtraFlags & f_style ? f_strides(*shape, itemsize()) : c_strides(*shape, itemsize()), + ptr, base) { } + + explicit array_t(size_t count, const T *ptr = nullptr, handle base = handle()) + : array({count}, {}, ptr, base) { } + + constexpr ssize_t itemsize() const { + return sizeof(T); + } + + template ssize_t index_at(Ix... index) const { + return offset_at(index...) / itemsize(); + } + + template const T* data(Ix... index) const { + return static_cast(array::data(index...)); + } + + template T* mutable_data(Ix... index) { + return static_cast(array::mutable_data(index...)); + } + + // Reference to element at a given index + template const T& at(Ix... index) const { + if ((ssize_t) sizeof...(index) != ndim()) + fail_dim_check(sizeof...(index), "index dimension mismatch"); + return *(static_cast(array::data()) + byte_offset(ssize_t(index)...) / itemsize()); + } + + // Mutable reference to element at a given index + template T& mutable_at(Ix... index) { + if ((ssize_t) sizeof...(index) != ndim()) + fail_dim_check(sizeof...(index), "index dimension mismatch"); + return *(static_cast(array::mutable_data()) + byte_offset(ssize_t(index)...) / itemsize()); + } + + /** + * Returns a proxy object that provides access to the array's data without bounds or + * dimensionality checking. 
Will throw if the array is missing the `writeable` flag. Use with + * care: the array must not be destroyed or reshaped for the duration of the returned object, + * and the caller must take care not to access invalid dimensions or dimension indices. + */ + template detail::unchecked_mutable_reference mutable_unchecked() & { + return array::mutable_unchecked(); + } + + /** + * Returns a proxy object that provides const access to the array's data without bounds or + * dimensionality checking. Unlike `unchecked()`, this does not require that the underlying + * array have the `writable` flag. Use with care: the array must not be destroyed or reshaped + * for the duration of the returned object, and the caller must take care not to access invalid + * dimensions or dimension indices. + */ + template detail::unchecked_reference unchecked() const & { + return array::unchecked(); + } + + /// Ensure that the argument is a NumPy array of the correct dtype (and if not, try to convert + /// it). In case of an error, nullptr is returned and the Python error is cleared. + static array_t ensure(handle h) { + auto result = reinterpret_steal(raw_array_t(h.ptr())); + if (!result) + PyErr_Clear(); + return result; + } + + static bool check_(handle h) { + const auto &api = detail::npy_api::get(); + return api.PyArray_Check_(h.ptr()) + && api.PyArray_EquivTypes_(detail::array_proxy(h.ptr())->descr, dtype::of().ptr()); + } + +protected: + /// Create array from any object -- always returns a new reference + static PyObject *raw_array_t(PyObject *ptr) { + if (ptr == nullptr) { + PyErr_SetString(PyExc_ValueError, "cannot create a pybind11::array_t from a nullptr"); + return nullptr; + } + return detail::npy_api::get().PyArray_FromAny_( + ptr, dtype::of().release().ptr(), 0, 0, + detail::npy_api::NPY_ARRAY_ENSUREARRAY_ | ExtraFlags, nullptr); + } +}; + +template +struct format_descriptor::value>> { + static std::string format() { + return detail::npy_format_descriptor::type>::format(); + } +}; + +template struct format_descriptor { + static std::string format() { return std::to_string(N) + "s"; } +}; +template struct format_descriptor> { + static std::string format() { return std::to_string(N) + "s"; } +}; + +template +struct format_descriptor::value>> { + static std::string format() { + return format_descriptor< + typename std::remove_cv::type>::type>::format(); + } +}; + +template +struct format_descriptor::is_array>> { + static std::string format() { + using namespace detail; + static constexpr auto extents = _("(") + array_info::extents + _(")"); + return extents.text + format_descriptor>::format(); + } +}; + +NAMESPACE_BEGIN(detail) +template +struct pyobject_caster> { + using type = array_t; + + bool load(handle src, bool convert) { + if (!convert && !type::check_(src)) + return false; + value = type::ensure(src); + return static_cast(value); + } + + static handle cast(const handle &src, return_value_policy /* policy */, handle /* parent */) { + return src.inc_ref(); + } + PYBIND11_TYPE_CASTER(type, handle_type_name::name); +}; + +template +struct compare_buffer_info::value>> { + static bool compare(const buffer_info& b) { + return npy_api::get().PyArray_EquivTypes_(dtype::of().ptr(), dtype(b).ptr()); + } +}; + +template +struct npy_format_descriptor_name; + +template +struct npy_format_descriptor_name::value>> { + static constexpr auto name = _::value>( + _("bool"), _::value>("int", "uint") + _() + ); +}; + +template +struct npy_format_descriptor_name::value>> { + static constexpr auto name = _::value || 
std::is_same::value>( + _("float") + _(), _("longdouble") + ); +}; + +template +struct npy_format_descriptor_name::value>> { + static constexpr auto name = _::value + || std::is_same::value>( + _("complex") + _(), _("longcomplex") + ); +}; + +template +struct npy_format_descriptor::value>> + : npy_format_descriptor_name { +private: + // NB: the order here must match the one in common.h + constexpr static const int values[15] = { + npy_api::NPY_BOOL_, + npy_api::NPY_BYTE_, npy_api::NPY_UBYTE_, npy_api::NPY_SHORT_, npy_api::NPY_USHORT_, + npy_api::NPY_INT_, npy_api::NPY_UINT_, npy_api::NPY_LONGLONG_, npy_api::NPY_ULONGLONG_, + npy_api::NPY_FLOAT_, npy_api::NPY_DOUBLE_, npy_api::NPY_LONGDOUBLE_, + npy_api::NPY_CFLOAT_, npy_api::NPY_CDOUBLE_, npy_api::NPY_CLONGDOUBLE_ + }; + +public: + static constexpr int value = values[detail::is_fmt_numeric::index]; + + static pybind11::dtype dtype() { + if (auto ptr = npy_api::get().PyArray_DescrFromType_(value)) + return reinterpret_borrow(ptr); + pybind11_fail("Unsupported buffer format!"); + } +}; + +#define PYBIND11_DECL_CHAR_FMT \ + static constexpr auto name = _("S") + _(); \ + static pybind11::dtype dtype() { return pybind11::dtype(std::string("S") + std::to_string(N)); } +template struct npy_format_descriptor { PYBIND11_DECL_CHAR_FMT }; +template struct npy_format_descriptor> { PYBIND11_DECL_CHAR_FMT }; +#undef PYBIND11_DECL_CHAR_FMT + +template struct npy_format_descriptor::is_array>> { +private: + using base_descr = npy_format_descriptor::type>; +public: + static_assert(!array_info::is_empty, "Zero-sized arrays are not supported"); + + static constexpr auto name = _("(") + array_info::extents + _(")") + base_descr::name; + static pybind11::dtype dtype() { + list shape; + array_info::append_extents(shape); + return pybind11::dtype::from_args(pybind11::make_tuple(base_descr::dtype(), shape)); + } +}; + +template struct npy_format_descriptor::value>> { +private: + using base_descr = npy_format_descriptor::type>; +public: + static constexpr auto name = base_descr::name; + static pybind11::dtype dtype() { return base_descr::dtype(); } +}; + +struct field_descriptor { + const char *name; + ssize_t offset; + ssize_t size; + std::string format; + dtype descr; +}; + +inline PYBIND11_NOINLINE void register_structured_dtype( + any_container fields, + const std::type_info& tinfo, ssize_t itemsize, + bool (*direct_converter)(PyObject *, void *&)) { + + auto& numpy_internals = get_numpy_internals(); + if (numpy_internals.get_type_info(tinfo, false)) + pybind11_fail("NumPy: dtype is already registered"); + + list names, formats, offsets; + for (auto field : *fields) { + if (!field.descr) + pybind11_fail(std::string("NumPy: unsupported field dtype: `") + + field.name + "` @ " + tinfo.name()); + names.append(PYBIND11_STR_TYPE(field.name)); + formats.append(field.descr); + offsets.append(pybind11::int_(field.offset)); + } + auto dtype_ptr = pybind11::dtype(names, formats, offsets, itemsize).release().ptr(); + + // There is an existing bug in NumPy (as of v1.11): trailing bytes are + // not encoded explicitly into the format string. This will supposedly + // get fixed in v1.12; for further details, see these: + // - https://github.com/numpy/numpy/issues/7797 + // - https://github.com/numpy/numpy/pull/7798 + // Because of this, we won't use numpy's logic to generate buffer format + // strings and will just do it ourselves. 
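+    // (Illustrative note, not in the original: for a struct with an int32 field `x`
+    // at offset 0 and a double field `y` at offset 8, the code below would emit the
+    // buffer format string "^T{i:x:4xd:y:}" -- 'i' and 'd' are the field formats,
+    // "4x" encodes the 4 bytes of padding between them, and '^' forces unaligned mode.)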
+ std::vector ordered_fields(std::move(fields)); + std::sort(ordered_fields.begin(), ordered_fields.end(), + [](const field_descriptor &a, const field_descriptor &b) { return a.offset < b.offset; }); + ssize_t offset = 0; + std::ostringstream oss; + // mark the structure as unaligned with '^', because numpy and C++ don't + // always agree about alignment (particularly for complex), and we're + // explicitly listing all our padding. This depends on none of the fields + // overriding the endianness. Putting the ^ in front of individual fields + // isn't guaranteed to work due to https://github.com/numpy/numpy/issues/9049 + oss << "^T{"; + for (auto& field : ordered_fields) { + if (field.offset > offset) + oss << (field.offset - offset) << 'x'; + oss << field.format << ':' << field.name << ':'; + offset = field.offset + field.size; + } + if (itemsize > offset) + oss << (itemsize - offset) << 'x'; + oss << '}'; + auto format_str = oss.str(); + + // Sanity check: verify that NumPy properly parses our buffer format string + auto& api = npy_api::get(); + auto arr = array(buffer_info(nullptr, itemsize, format_str, 1)); + if (!api.PyArray_EquivTypes_(dtype_ptr, arr.dtype().ptr())) + pybind11_fail("NumPy: invalid buffer descriptor!"); + + auto tindex = std::type_index(tinfo); + numpy_internals.registered_dtypes[tindex] = { dtype_ptr, format_str }; + get_internals().direct_conversions[tindex].push_back(direct_converter); +} + +template struct npy_format_descriptor { + static_assert(is_pod_struct::value, "Attempt to use a non-POD or unimplemented POD type as a numpy dtype"); + + static constexpr auto name = make_caster::name; + + static pybind11::dtype dtype() { + return reinterpret_borrow(dtype_ptr()); + } + + static std::string format() { + static auto format_str = get_numpy_internals().get_type_info(true)->format_str; + return format_str; + } + + static void register_dtype(any_container fields) { + register_structured_dtype(std::move(fields), typeid(typename std::remove_cv::type), + sizeof(T), &direct_converter); + } + +private: + static PyObject* dtype_ptr() { + static PyObject* ptr = get_numpy_internals().get_type_info(true)->dtype_ptr; + return ptr; + } + + static bool direct_converter(PyObject *obj, void*& value) { + auto& api = npy_api::get(); + if (!PyObject_TypeCheck(obj, api.PyVoidArrType_Type_)) + return false; + if (auto descr = reinterpret_steal(api.PyArray_DescrFromScalar_(obj))) { + if (api.PyArray_EquivTypes_(dtype_ptr(), descr.ptr())) { + value = ((PyVoidScalarObject_Proxy *) obj)->obval; + return true; + } + } + return false; + } +}; + +#ifdef __CLION_IDE__ // replace heavy macro with dummy code for the IDE (doesn't affect code) +# define PYBIND11_NUMPY_DTYPE(Type, ...) ((void)0) +# define PYBIND11_NUMPY_DTYPE_EX(Type, ...) ((void)0) +#else + +#define PYBIND11_FIELD_DESCRIPTOR_EX(T, Field, Name) \ + ::pybind11::detail::field_descriptor { \ + Name, offsetof(T, Field), sizeof(decltype(std::declval().Field)), \ + ::pybind11::format_descriptor().Field)>::format(), \ + ::pybind11::detail::npy_format_descriptor().Field)>::dtype() \ + } + +// Extract name, offset and format descriptor for a struct field +#define PYBIND11_FIELD_DESCRIPTOR(T, Field) PYBIND11_FIELD_DESCRIPTOR_EX(T, Field, #Field) + +// The main idea of this macro is borrowed from https://github.com/swansontec/map-macro +// (C) William Swanson, Paul Fultz +#define PYBIND11_EVAL0(...) __VA_ARGS__ +#define PYBIND11_EVAL1(...) PYBIND11_EVAL0 (PYBIND11_EVAL0 (PYBIND11_EVAL0 (__VA_ARGS__))) +#define PYBIND11_EVAL2(...) 
PYBIND11_EVAL1 (PYBIND11_EVAL1 (PYBIND11_EVAL1 (__VA_ARGS__))) +#define PYBIND11_EVAL3(...) PYBIND11_EVAL2 (PYBIND11_EVAL2 (PYBIND11_EVAL2 (__VA_ARGS__))) +#define PYBIND11_EVAL4(...) PYBIND11_EVAL3 (PYBIND11_EVAL3 (PYBIND11_EVAL3 (__VA_ARGS__))) +#define PYBIND11_EVAL(...) PYBIND11_EVAL4 (PYBIND11_EVAL4 (PYBIND11_EVAL4 (__VA_ARGS__))) +#define PYBIND11_MAP_END(...) +#define PYBIND11_MAP_OUT +#define PYBIND11_MAP_COMMA , +#define PYBIND11_MAP_GET_END() 0, PYBIND11_MAP_END +#define PYBIND11_MAP_NEXT0(test, next, ...) next PYBIND11_MAP_OUT +#define PYBIND11_MAP_NEXT1(test, next) PYBIND11_MAP_NEXT0 (test, next, 0) +#define PYBIND11_MAP_NEXT(test, next) PYBIND11_MAP_NEXT1 (PYBIND11_MAP_GET_END test, next) +#ifdef _MSC_VER // MSVC is not as eager to expand macros, hence this workaround +#define PYBIND11_MAP_LIST_NEXT1(test, next) \ + PYBIND11_EVAL0 (PYBIND11_MAP_NEXT0 (test, PYBIND11_MAP_COMMA next, 0)) +#else +#define PYBIND11_MAP_LIST_NEXT1(test, next) \ + PYBIND11_MAP_NEXT0 (test, PYBIND11_MAP_COMMA next, 0) +#endif +#define PYBIND11_MAP_LIST_NEXT(test, next) \ + PYBIND11_MAP_LIST_NEXT1 (PYBIND11_MAP_GET_END test, next) +#define PYBIND11_MAP_LIST0(f, t, x, peek, ...) \ + f(t, x) PYBIND11_MAP_LIST_NEXT (peek, PYBIND11_MAP_LIST1) (f, t, peek, __VA_ARGS__) +#define PYBIND11_MAP_LIST1(f, t, x, peek, ...) \ + f(t, x) PYBIND11_MAP_LIST_NEXT (peek, PYBIND11_MAP_LIST0) (f, t, peek, __VA_ARGS__) +// PYBIND11_MAP_LIST(f, t, a1, a2, ...) expands to f(t, a1), f(t, a2), ... +#define PYBIND11_MAP_LIST(f, t, ...) \ + PYBIND11_EVAL (PYBIND11_MAP_LIST1 (f, t, __VA_ARGS__, (), 0)) + +#define PYBIND11_NUMPY_DTYPE(Type, ...) \ + ::pybind11::detail::npy_format_descriptor::register_dtype \ + (::std::vector<::pybind11::detail::field_descriptor> \ + {PYBIND11_MAP_LIST (PYBIND11_FIELD_DESCRIPTOR, Type, __VA_ARGS__)}) + +#ifdef _MSC_VER +#define PYBIND11_MAP2_LIST_NEXT1(test, next) \ + PYBIND11_EVAL0 (PYBIND11_MAP_NEXT0 (test, PYBIND11_MAP_COMMA next, 0)) +#else +#define PYBIND11_MAP2_LIST_NEXT1(test, next) \ + PYBIND11_MAP_NEXT0 (test, PYBIND11_MAP_COMMA next, 0) +#endif +#define PYBIND11_MAP2_LIST_NEXT(test, next) \ + PYBIND11_MAP2_LIST_NEXT1 (PYBIND11_MAP_GET_END test, next) +#define PYBIND11_MAP2_LIST0(f, t, x1, x2, peek, ...) \ + f(t, x1, x2) PYBIND11_MAP2_LIST_NEXT (peek, PYBIND11_MAP2_LIST1) (f, t, peek, __VA_ARGS__) +#define PYBIND11_MAP2_LIST1(f, t, x1, x2, peek, ...) \ + f(t, x1, x2) PYBIND11_MAP2_LIST_NEXT (peek, PYBIND11_MAP2_LIST0) (f, t, peek, __VA_ARGS__) +// PYBIND11_MAP2_LIST(f, t, a1, a2, ...) expands to f(t, a1, a2), f(t, a3, a4), ... +#define PYBIND11_MAP2_LIST(f, t, ...) \ + PYBIND11_EVAL (PYBIND11_MAP2_LIST1 (f, t, __VA_ARGS__, (), 0)) + +#define PYBIND11_NUMPY_DTYPE_EX(Type, ...) 
\ + ::pybind11::detail::npy_format_descriptor::register_dtype \ + (::std::vector<::pybind11::detail::field_descriptor> \ + {PYBIND11_MAP2_LIST (PYBIND11_FIELD_DESCRIPTOR_EX, Type, __VA_ARGS__)}) + +#endif // __CLION_IDE__ + +template +using array_iterator = typename std::add_pointer::type; + +template +array_iterator array_begin(const buffer_info& buffer) { + return array_iterator(reinterpret_cast(buffer.ptr)); +} + +template +array_iterator array_end(const buffer_info& buffer) { + return array_iterator(reinterpret_cast(buffer.ptr) + buffer.size); +} + +class common_iterator { +public: + using container_type = std::vector; + using value_type = container_type::value_type; + using size_type = container_type::size_type; + + common_iterator() : p_ptr(0), m_strides() {} + + common_iterator(void* ptr, const container_type& strides, const container_type& shape) + : p_ptr(reinterpret_cast(ptr)), m_strides(strides.size()) { + m_strides.back() = static_cast(strides.back()); + for (size_type i = m_strides.size() - 1; i != 0; --i) { + size_type j = i - 1; + value_type s = static_cast(shape[i]); + m_strides[j] = strides[j] + m_strides[i] - strides[i] * s; + } + } + + void increment(size_type dim) { + p_ptr += m_strides[dim]; + } + + void* data() const { + return p_ptr; + } + +private: + char* p_ptr; + container_type m_strides; +}; + +template class multi_array_iterator { +public: + using container_type = std::vector; + + multi_array_iterator(const std::array &buffers, + const container_type &shape) + : m_shape(shape.size()), m_index(shape.size(), 0), + m_common_iterator() { + + // Manual copy to avoid conversion warning if using std::copy + for (size_t i = 0; i < shape.size(); ++i) + m_shape[i] = shape[i]; + + container_type strides(shape.size()); + for (size_t i = 0; i < N; ++i) + init_common_iterator(buffers[i], shape, m_common_iterator[i], strides); + } + + multi_array_iterator& operator++() { + for (size_t j = m_index.size(); j != 0; --j) { + size_t i = j - 1; + if (++m_index[i] != m_shape[i]) { + increment_common_iterator(i); + break; + } else { + m_index[i] = 0; + } + } + return *this; + } + + template T* data() const { + return reinterpret_cast(m_common_iterator[K].data()); + } + +private: + + using common_iter = common_iterator; + + void init_common_iterator(const buffer_info &buffer, + const container_type &shape, + common_iter &iterator, + container_type &strides) { + auto buffer_shape_iter = buffer.shape.rbegin(); + auto buffer_strides_iter = buffer.strides.rbegin(); + auto shape_iter = shape.rbegin(); + auto strides_iter = strides.rbegin(); + + while (buffer_shape_iter != buffer.shape.rend()) { + if (*shape_iter == *buffer_shape_iter) + *strides_iter = *buffer_strides_iter; + else + *strides_iter = 0; + + ++buffer_shape_iter; + ++buffer_strides_iter; + ++shape_iter; + ++strides_iter; + } + + std::fill(strides_iter, strides.rend(), 0); + iterator = common_iter(buffer.ptr, strides, shape); + } + + void increment_common_iterator(size_t dim) { + for (auto &iter : m_common_iterator) + iter.increment(dim); + } + + container_type m_shape; + container_type m_index; + std::array m_common_iterator; +}; + +enum class broadcast_trivial { non_trivial, c_trivial, f_trivial }; + +// Populates the shape and number of dimensions for the set of buffers. 
Returns a broadcast_trivial +// enum value indicating whether the broadcast is "trivial"--that is, has each buffer being either a +// singleton or a full-size, C-contiguous (`c_trivial`) or Fortran-contiguous (`f_trivial`) storage +// buffer; returns `non_trivial` otherwise. +template +broadcast_trivial broadcast(const std::array &buffers, ssize_t &ndim, std::vector &shape) { + ndim = std::accumulate(buffers.begin(), buffers.end(), ssize_t(0), [](ssize_t res, const buffer_info &buf) { + return std::max(res, buf.ndim); + }); + + shape.clear(); + shape.resize((size_t) ndim, 1); + + // Figure out the output size, and make sure all input arrays conform (i.e. are either size 1 or + // the full size). + for (size_t i = 0; i < N; ++i) { + auto res_iter = shape.rbegin(); + auto end = buffers[i].shape.rend(); + for (auto shape_iter = buffers[i].shape.rbegin(); shape_iter != end; ++shape_iter, ++res_iter) { + const auto &dim_size_in = *shape_iter; + auto &dim_size_out = *res_iter; + + // Each input dimension can either be 1 or `n`, but `n` values must match across buffers + if (dim_size_out == 1) + dim_size_out = dim_size_in; + else if (dim_size_in != 1 && dim_size_in != dim_size_out) + pybind11_fail("pybind11::vectorize: incompatible size/dimension of inputs!"); + } + } + + bool trivial_broadcast_c = true; + bool trivial_broadcast_f = true; + for (size_t i = 0; i < N && (trivial_broadcast_c || trivial_broadcast_f); ++i) { + if (buffers[i].size == 1) + continue; + + // Require the same number of dimensions: + if (buffers[i].ndim != ndim) + return broadcast_trivial::non_trivial; + + // Require all dimensions be full-size: + if (!std::equal(buffers[i].shape.cbegin(), buffers[i].shape.cend(), shape.cbegin())) + return broadcast_trivial::non_trivial; + + // Check for C contiguity (but only if previous inputs were also C contiguous) + if (trivial_broadcast_c) { + ssize_t expect_stride = buffers[i].itemsize; + auto end = buffers[i].shape.crend(); + for (auto shape_iter = buffers[i].shape.crbegin(), stride_iter = buffers[i].strides.crbegin(); + trivial_broadcast_c && shape_iter != end; ++shape_iter, ++stride_iter) { + if (expect_stride == *stride_iter) + expect_stride *= *shape_iter; + else + trivial_broadcast_c = false; + } + } + + // Check for Fortran contiguity (if previous inputs were also F contiguous) + if (trivial_broadcast_f) { + ssize_t expect_stride = buffers[i].itemsize; + auto end = buffers[i].shape.cend(); + for (auto shape_iter = buffers[i].shape.cbegin(), stride_iter = buffers[i].strides.cbegin(); + trivial_broadcast_f && shape_iter != end; ++shape_iter, ++stride_iter) { + if (expect_stride == *stride_iter) + expect_stride *= *shape_iter; + else + trivial_broadcast_f = false; + } + } + } + + return + trivial_broadcast_c ? broadcast_trivial::c_trivial : + trivial_broadcast_f ? broadcast_trivial::f_trivial : + broadcast_trivial::non_trivial; +} + +template +struct vectorize_arg { + static_assert(!std::is_rvalue_reference::value, "Functions with rvalue reference arguments cannot be vectorized"); + // The wrapped function gets called with this type: + using call_type = remove_reference_t; + // Is this a vectorized argument? 
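In practice the trait that follows answers this as: arithmetic, complex, and POD values (taken by value or by const lvalue reference) are vectorized; pointers, C arrays, std::arrays, enums, and non-const references are passed through untouched. A usage sketch of the resulting behavior (scaled_hypot and the module name are hypothetical):

#include <cmath>
#include <pybind11/numpy.h>

namespace py = pybind11;

double scaled_hypot(double x, double y, double s) { return s * std::hypot(x, y); }

PYBIND11_MODULE(example, m) {
    // All three double parameters are vectorizable, so each may be passed as a
    // scalar or an array; shapes are merged by broadcast() above, where every
    // dimension must either be 1 or agree across the inputs.
    m.def("scaled_hypot", py::vectorize(scaled_hypot));
}

From Python, scaled_hypot(xs.reshape(3, 1), ys.reshape(1, 4), 2.0) yields a (3, 4) array, while scaled_hypot(3.0, 4.0, 1.0) takes the zero-dimensional fast path further down and returns a plain float.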
+    static constexpr bool vectorize =
+        satisfies_any_of<remove_cv_t<call_type>, std::is_arithmetic, is_complex, std::is_pod>::value &&
+        satisfies_none_of<remove_cv_t<call_type>, std::is_pointer, std::is_array, is_std_array, std::is_enum>::value &&
+        (!std::is_reference<T>::value ||
+         (std::is_lvalue_reference<T>::value && std::is_const<remove_reference_t<T>>::value));
+    // Accept this type: an array for vectorized types, otherwise the type as-is:
+    using type = conditional_t<vectorize, array_t<remove_cv_t<call_type>, array::forcecast>, T>;
+};
+
+template <typename Func, typename Return, typename... Args>
+struct vectorize_helper {
+private:
+    static constexpr size_t N = sizeof...(Args);
+    static constexpr size_t NVectorized = constexpr_sum(vectorize_arg<Args>::vectorize...);
+    static_assert(NVectorized >= 1,
+        "pybind11::vectorize(...) requires a function with at least one vectorizable argument");
+
+public:
+    template <typename T>
+    explicit vectorize_helper(T &&f) : f(std::forward<T>(f)) { }
+
+    object operator()(typename vectorize_arg<Args>::type... args) {
+        return run(args...,
+                   make_index_sequence<N>(),
+                   select_indices<vectorize_arg<Args>::vectorize...>(),
+                   make_index_sequence<NVectorized>());
+    }
+
+private:
+    remove_reference_t<Func> f;
+
+    // Internal compiler error in MSVC 19.16.27025.1 (Visual Studio 2017 15.9.4), when compiling with "/permissive-" flag
+    // when arg_call_types is manually inlined.
+    using arg_call_types = std::tuple<typename vectorize_arg<Args>::call_type...>;
+    template <size_t Index> using param_n_t = typename std::tuple_element<Index, arg_call_types>::type;
+
+    // Runs a vectorized function given arguments tuple and three index sequences:
+    //   - Index is the full set of 0 ... (N-1) argument indices;
+    //   - VIndex is the subset of argument indices with vectorized parameters, letting us access
+    //     vectorized arguments (anything not in this sequence is passed through)
+    //   - BIndex is an incremental sequence (beginning at 0) of the same size as VIndex, so that
+    //     we can store vectorized buffer_infos in an array (argument VIndex has its buffer at
+    //     index BIndex in the array)
+    template <size_t... Index, size_t... VIndex, size_t... BIndex> object run(
+            typename vectorize_arg<Args>::type &...args,
+            index_sequence<Index...> i_seq, index_sequence<VIndex...> vi_seq, index_sequence<BIndex...> bi_seq) {
+
+        // Pointers to values the function was called with; the vectorized ones set here will start
+        // out as array_t<T> pointers, but they will be changed to T pointers before we call
+        // the wrapped function. Non-vectorized pointers are left as-is.
+        std::array<void *, N> params{{ &args... }};
+
+        // The array of `buffer_info`s of vectorized arguments:
+        std::array<buffer_info, NVectorized> buffers{{ reinterpret_cast<array *>(params[VIndex])->request()... }};
+
+        /* Determine dimensions parameters of output array */
+        ssize_t nd = 0;
+        std::vector<ssize_t> shape(0);
+        auto trivial = broadcast(buffers, nd, shape);
+        size_t ndim = (size_t) nd;
+
+        size_t size = std::accumulate(shape.begin(), shape.end(), (size_t) 1, std::multiplies<size_t>());
+
+        // If all arguments are 0-dimension arrays (i.e. single values) return a plain value (i.e.
+        // not wrapped in an array).
+ if (size == 1 && ndim == 0) { + PYBIND11_EXPAND_SIDE_EFFECTS(params[VIndex] = buffers[BIndex].ptr); + return cast(f(*reinterpret_cast *>(params[Index])...)); + } + + array_t result; + if (trivial == broadcast_trivial::f_trivial) result = array_t(shape); + else result = array_t(shape); + + if (size == 0) return std::move(result); + + /* Call the function */ + if (trivial == broadcast_trivial::non_trivial) + apply_broadcast(buffers, params, result, i_seq, vi_seq, bi_seq); + else + apply_trivial(buffers, params, result.mutable_data(), size, i_seq, vi_seq, bi_seq); + + return std::move(result); + } + + template + void apply_trivial(std::array &buffers, + std::array ¶ms, + Return *out, + size_t size, + index_sequence, index_sequence, index_sequence) { + + // Initialize an array of mutable byte references and sizes with references set to the + // appropriate pointer in `params`; as we iterate, we'll increment each pointer by its size + // (except for singletons, which get an increment of 0). + std::array, NVectorized> vecparams{{ + std::pair( + reinterpret_cast(params[VIndex] = buffers[BIndex].ptr), + buffers[BIndex].size == 1 ? 0 : sizeof(param_n_t) + )... + }}; + + for (size_t i = 0; i < size; ++i) { + out[i] = f(*reinterpret_cast *>(params[Index])...); + for (auto &x : vecparams) x.first += x.second; + } + } + + template + void apply_broadcast(std::array &buffers, + std::array ¶ms, + array_t &output_array, + index_sequence, index_sequence, index_sequence) { + + buffer_info output = output_array.request(); + multi_array_iterator input_iter(buffers, output.shape); + + for (array_iterator iter = array_begin(output), end = array_end(output); + iter != end; + ++iter, ++input_iter) { + PYBIND11_EXPAND_SIDE_EFFECTS(( + params[VIndex] = input_iter.template data() + )); + *iter = f(*reinterpret_cast *>(std::get(params))...); + } + } +}; + +template +vectorize_helper +vectorize_extractor(const Func &f, Return (*) (Args ...)) { + return detail::vectorize_helper(f); +} + +template struct handle_type_name> { + static constexpr auto name = _("numpy.ndarray[") + npy_format_descriptor::name + _("]"); +}; + +NAMESPACE_END(detail) + +// Vanilla pointer vectorizer: +template +detail::vectorize_helper +vectorize(Return (*f) (Args ...)) { + return detail::vectorize_helper(f); +} + +// lambda vectorizer: +template ::value, int> = 0> +auto vectorize(Func &&f) -> decltype( + detail::vectorize_extractor(std::forward(f), (detail::function_signature_t *) nullptr)) { + return detail::vectorize_extractor(std::forward(f), (detail::function_signature_t *) nullptr); +} + +// Vectorize a class method (non-const): +template ())), Return, Class *, Args...>> +Helper vectorize(Return (Class::*f)(Args...)) { + return Helper(std::mem_fn(f)); +} + +// Vectorize a class method (const): +template ())), Return, const Class *, Args...>> +Helper vectorize(Return (Class::*f)(Args...) const) { + return Helper(std::mem_fn(f)); +} + +NAMESPACE_END(PYBIND11_NAMESPACE) + +#if defined(_MSC_VER) +#pragma warning(pop) +#endif diff --git a/python/src/pybind11/operators.h b/python/src/pybind11/operators.h new file mode 100644 index 000000000..b3dd62c3b --- /dev/null +++ b/python/src/pybind11/operators.h @@ -0,0 +1,168 @@ +/* + pybind11/operator.h: Metatemplates for operator overloading + + Copyright (c) 2016 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. 
+*/
+
+#pragma once
+
+#include "pybind11.h"
+
+#if defined(__clang__) && !defined(__INTEL_COMPILER)
+#  pragma clang diagnostic ignored "-Wunsequenced" // multiple unsequenced modifications to 'self' (when using def(py::self OP Type()))
+#elif defined(_MSC_VER)
+#  pragma warning(push)
+#  pragma warning(disable: 4127) // warning C4127: Conditional expression is constant
+#endif
+
+NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+NAMESPACE_BEGIN(detail)
+
+/// Enumeration with all supported operator types
+enum op_id : int {
+    op_add, op_sub, op_mul, op_div, op_mod, op_divmod, op_pow, op_lshift,
+    op_rshift, op_and, op_xor, op_or, op_neg, op_pos, op_abs, op_invert,
+    op_int, op_long, op_float, op_str, op_cmp, op_gt, op_ge, op_lt, op_le,
+    op_eq, op_ne, op_iadd, op_isub, op_imul, op_idiv, op_imod, op_ilshift,
+    op_irshift, op_iand, op_ixor, op_ior, op_complex, op_bool, op_nonzero,
+    op_repr, op_truediv, op_itruediv, op_hash
+};
+
+enum op_type : int {
+    op_l, /* base type on left */
+    op_r, /* base type on right */
+    op_u  /* unary operator */
+};
+
+struct self_t { };
+static const self_t self = self_t();
+
+/// Type for an unused type slot
+struct undefined_t { };
+
+/// Don't warn about an unused variable
+inline self_t __self() { return self; }
+
+/// base template of operator implementations
+template <op_id, op_type, typename B, typename L, typename R> struct op_impl { };
+
+/// Operator implementation generator
+template <op_id id, op_type ot, typename L, typename R> struct op_ {
+    template <typename Class, typename... Extra> void execute(Class &cl, const Extra&... extra) const {
+        using Base = typename Class::type;
+        using L_type = conditional_t<std::is_same<L, self_t>::value, Base, L>;
+        using R_type = conditional_t<std::is_same<R, self_t>::value, Base, R>;
+        using op = op_impl<id, ot, Base, L_type, R_type>;
+        cl.def(op::name(), &op::execute, is_operator(), extra...);
+        #if PY_MAJOR_VERSION < 3
+        if (id == op_truediv || id == op_itruediv)
+            cl.def(id == op_itruediv ? "__idiv__" : ot == op_l ? "__div__" : "__rdiv__",
+                   &op::execute, is_operator(), extra...);
+        #endif
+    }
+    template <typename Class, typename... Extra> void execute_cast(Class &cl, const Extra&... extra) const {
+        using Base = typename Class::type;
+        using L_type = conditional_t<std::is_same<L, self_t>::value, Base, L>;
+        using R_type = conditional_t<std::is_same<R, self_t>::value, Base, R>;
+        using op = op_impl<id, ot, Base, L_type, R_type>;
+        cl.def(op::name(), &op::execute_cast, is_operator(), extra...);
+        #if PY_MAJOR_VERSION < 3
+        if (id == op_truediv || id == op_itruediv)
+            cl.def(id == op_itruediv ? "__idiv__" : ot == op_l ?
"__div__" : "__rdiv__", + &op::execute, is_operator(), extra...); + #endif + } +}; + +#define PYBIND11_BINARY_OPERATOR(id, rid, op, expr) \ +template struct op_impl { \ + static char const* name() { return "__" #id "__"; } \ + static auto execute(const L &l, const R &r) -> decltype(expr) { return (expr); } \ + static B execute_cast(const L &l, const R &r) { return B(expr); } \ +}; \ +template struct op_impl { \ + static char const* name() { return "__" #rid "__"; } \ + static auto execute(const R &r, const L &l) -> decltype(expr) { return (expr); } \ + static B execute_cast(const R &r, const L &l) { return B(expr); } \ +}; \ +inline op_ op(const self_t &, const self_t &) { \ + return op_(); \ +} \ +template op_ op(const self_t &, const T &) { \ + return op_(); \ +} \ +template op_ op(const T &, const self_t &) { \ + return op_(); \ +} + +#define PYBIND11_INPLACE_OPERATOR(id, op, expr) \ +template struct op_impl { \ + static char const* name() { return "__" #id "__"; } \ + static auto execute(L &l, const R &r) -> decltype(expr) { return expr; } \ + static B execute_cast(L &l, const R &r) { return B(expr); } \ +}; \ +template op_ op(const self_t &, const T &) { \ + return op_(); \ +} + +#define PYBIND11_UNARY_OPERATOR(id, op, expr) \ +template struct op_impl { \ + static char const* name() { return "__" #id "__"; } \ + static auto execute(const L &l) -> decltype(expr) { return expr; } \ + static B execute_cast(const L &l) { return B(expr); } \ +}; \ +inline op_ op(const self_t &) { \ + return op_(); \ +} + +PYBIND11_BINARY_OPERATOR(sub, rsub, operator-, l - r) +PYBIND11_BINARY_OPERATOR(add, radd, operator+, l + r) +PYBIND11_BINARY_OPERATOR(mul, rmul, operator*, l * r) +PYBIND11_BINARY_OPERATOR(truediv, rtruediv, operator/, l / r) +PYBIND11_BINARY_OPERATOR(mod, rmod, operator%, l % r) +PYBIND11_BINARY_OPERATOR(lshift, rlshift, operator<<, l << r) +PYBIND11_BINARY_OPERATOR(rshift, rrshift, operator>>, l >> r) +PYBIND11_BINARY_OPERATOR(and, rand, operator&, l & r) +PYBIND11_BINARY_OPERATOR(xor, rxor, operator^, l ^ r) +PYBIND11_BINARY_OPERATOR(eq, eq, operator==, l == r) +PYBIND11_BINARY_OPERATOR(ne, ne, operator!=, l != r) +PYBIND11_BINARY_OPERATOR(or, ror, operator|, l | r) +PYBIND11_BINARY_OPERATOR(gt, lt, operator>, l > r) +PYBIND11_BINARY_OPERATOR(ge, le, operator>=, l >= r) +PYBIND11_BINARY_OPERATOR(lt, gt, operator<, l < r) +PYBIND11_BINARY_OPERATOR(le, ge, operator<=, l <= r) +//PYBIND11_BINARY_OPERATOR(pow, rpow, pow, std::pow(l, r)) +PYBIND11_INPLACE_OPERATOR(iadd, operator+=, l += r) +PYBIND11_INPLACE_OPERATOR(isub, operator-=, l -= r) +PYBIND11_INPLACE_OPERATOR(imul, operator*=, l *= r) +PYBIND11_INPLACE_OPERATOR(itruediv, operator/=, l /= r) +PYBIND11_INPLACE_OPERATOR(imod, operator%=, l %= r) +PYBIND11_INPLACE_OPERATOR(ilshift, operator<<=, l <<= r) +PYBIND11_INPLACE_OPERATOR(irshift, operator>>=, l >>= r) +PYBIND11_INPLACE_OPERATOR(iand, operator&=, l &= r) +PYBIND11_INPLACE_OPERATOR(ixor, operator^=, l ^= r) +PYBIND11_INPLACE_OPERATOR(ior, operator|=, l |= r) +PYBIND11_UNARY_OPERATOR(neg, operator-, -l) +PYBIND11_UNARY_OPERATOR(pos, operator+, +l) +PYBIND11_UNARY_OPERATOR(abs, abs, std::abs(l)) +PYBIND11_UNARY_OPERATOR(hash, hash, std::hash()(l)) +PYBIND11_UNARY_OPERATOR(invert, operator~, (~l)) +PYBIND11_UNARY_OPERATOR(bool, operator!, !!l) +PYBIND11_UNARY_OPERATOR(int, int_, (int) l) +PYBIND11_UNARY_OPERATOR(float, float_, (double) l) + +#undef PYBIND11_BINARY_OPERATOR +#undef PYBIND11_INPLACE_OPERATOR +#undef PYBIND11_UNARY_OPERATOR +NAMESPACE_END(detail) + +using 
detail::self; + +NAMESPACE_END(PYBIND11_NAMESPACE) + +#if defined(_MSC_VER) +# pragma warning(pop) +#endif diff --git a/python/src/pybind11/options.h b/python/src/pybind11/options.h new file mode 100644 index 000000000..cc1e1f6f0 --- /dev/null +++ b/python/src/pybind11/options.h @@ -0,0 +1,65 @@ +/* + pybind11/options.h: global settings that are configurable at runtime. + + Copyright (c) 2016 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. +*/ + +#pragma once + +#include "detail/common.h" + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) + +class options { +public: + + // Default RAII constructor, which leaves settings as they currently are. + options() : previous_state(global_state()) {} + + // Class is non-copyable. + options(const options&) = delete; + options& operator=(const options&) = delete; + + // Destructor, which restores settings that were in effect before. + ~options() { + global_state() = previous_state; + } + + // Setter methods (affect the global state): + + options& disable_user_defined_docstrings() & { global_state().show_user_defined_docstrings = false; return *this; } + + options& enable_user_defined_docstrings() & { global_state().show_user_defined_docstrings = true; return *this; } + + options& disable_function_signatures() & { global_state().show_function_signatures = false; return *this; } + + options& enable_function_signatures() & { global_state().show_function_signatures = true; return *this; } + + // Getter methods (return the global state): + + static bool show_user_defined_docstrings() { return global_state().show_user_defined_docstrings; } + + static bool show_function_signatures() { return global_state().show_function_signatures; } + + // This type is not meant to be allocated on the heap. + void* operator new(size_t) = delete; + +private: + + struct state { + bool show_user_defined_docstrings = true; //< Include user-supplied texts in docstrings. + bool show_function_signatures = true; //< Include auto-generated function signatures in docstrings. + }; + + static state &global_state() { + static state instance; + return instance; + } + + state previous_state; +}; + +NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/python/src/pybind11/pybind11.h b/python/src/pybind11/pybind11.h new file mode 100644 index 000000000..f1d91c788 --- /dev/null +++ b/python/src/pybind11/pybind11.h @@ -0,0 +1,2162 @@ +/* + pybind11/pybind11.h: Main header file of the C++11 python + binding generator library + + Copyright (c) 2016 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. 
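The options class just shown is a scope-bound RAII guard over two process-global flags: whatever a scope flips is rolled back by the destructor. A sketch of the intended use (the module and function are made up):

#include <pybind11/pybind11.h>

namespace py = pybind11;

PYBIND11_MODULE(example, m) {
    {
        // Suppress the auto-generated "(arg0: int, arg1: int) -> int"
        // signature line for everything defined while `local` is alive.
        py::options local;
        local.disable_function_signatures();
        m.def("add", [](int a, int b) { return a + b; }, "Adds two integers.");
    }
    // The guard has been destroyed here, so later definitions get full
    // docstrings with signatures again.
}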
+*/ + +#pragma once + +#if defined(__INTEL_COMPILER) +# pragma warning push +# pragma warning disable 68 // integer conversion resulted in a change of sign +# pragma warning disable 186 // pointless comparison of unsigned integer with zero +# pragma warning disable 878 // incompatible exception specifications +# pragma warning disable 1334 // the "template" keyword used for syntactic disambiguation may only be used within a template +# pragma warning disable 1682 // implicit conversion of a 64-bit integral type to a smaller integral type (potential portability problem) +# pragma warning disable 1786 // function "strdup" was declared deprecated +# pragma warning disable 1875 // offsetof applied to non-POD (Plain Old Data) types is nonstandard +# pragma warning disable 2196 // warning #2196: routine is both "inline" and "noinline" +#elif defined(_MSC_VER) +# pragma warning(push) +# pragma warning(disable: 4100) // warning C4100: Unreferenced formal parameter +# pragma warning(disable: 4127) // warning C4127: Conditional expression is constant +# pragma warning(disable: 4512) // warning C4512: Assignment operator was implicitly defined as deleted +# pragma warning(disable: 4800) // warning C4800: 'int': forcing value to bool 'true' or 'false' (performance warning) +# pragma warning(disable: 4996) // warning C4996: The POSIX name for this item is deprecated. Instead, use the ISO C and C++ conformant name +# pragma warning(disable: 4702) // warning C4702: unreachable code +# pragma warning(disable: 4522) // warning C4522: multiple assignment operators specified +#elif defined(__GNUG__) && !defined(__clang__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wunused-but-set-parameter" +# pragma GCC diagnostic ignored "-Wunused-but-set-variable" +# pragma GCC diagnostic ignored "-Wmissing-field-initializers" +# pragma GCC diagnostic ignored "-Wstrict-aliasing" +# pragma GCC diagnostic ignored "-Wattributes" +# if __GNUC__ >= 7 +# pragma GCC diagnostic ignored "-Wnoexcept-type" +# endif +#endif + +#if defined(__GNUG__) && !defined(__clang__) + #include +#endif + + +#include "attr.h" +#include "options.h" +#include "detail/class.h" +#include "detail/init.h" + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) + +/// Wraps an arbitrary C++ function/method/lambda function/.. into a callable Python object +class cpp_function : public function { +public: + cpp_function() { } + cpp_function(std::nullptr_t) { } + + /// Construct a cpp_function from a vanilla function pointer + template + cpp_function(Return (*f)(Args...), const Extra&... extra) { + initialize(f, f, extra...); + } + + /// Construct a cpp_function from a lambda function (possibly with internal state) + template ::value>> + cpp_function(Func &&f, const Extra&... extra) { + initialize(std::forward(f), + (detail::function_signature_t *) nullptr, extra...); + } + + /// Construct a cpp_function from a class method (non-const) + template + cpp_function(Return (Class::*f)(Arg...), const Extra&... extra) { + initialize([f](Class *c, Arg... args) -> Return { return (c->*f)(args...); }, + (Return (*) (Class *, Arg...)) nullptr, extra...); + } + + /// Construct a cpp_function from a class method (const) + template + cpp_function(Return (Class::*f)(Arg...) const, const Extra&... extra) { + initialize([f](const Class *c, Arg... 
args) -> Return { return (c->*f)(args...); }, + (Return (*)(const Class *, Arg ...)) nullptr, extra...); + } + + /// Return the function name + object name() const { return attr("__name__"); } + +protected: + /// Space optimization: don't inline this frequently instantiated fragment + PYBIND11_NOINLINE detail::function_record *make_function_record() { + return new detail::function_record(); + } + + /// Special internal constructor for functors, lambda functions, etc. + template + void initialize(Func &&f, Return (*)(Args...), const Extra&... extra) { + using namespace detail; + struct capture { remove_reference_t f; }; + + /* Store the function including any extra state it might have (e.g. a lambda capture object) */ + auto rec = make_function_record(); + + /* Store the capture object directly in the function record if there is enough space */ + if (sizeof(capture) <= sizeof(rec->data)) { + /* Without these pragmas, GCC warns that there might not be + enough space to use the placement new operator. However, the + 'if' statement above ensures that this is the case. */ +#if defined(__GNUG__) && !defined(__clang__) && __GNUC__ >= 6 +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wplacement-new" +#endif + new ((capture *) &rec->data) capture { std::forward(f) }; +#if defined(__GNUG__) && !defined(__clang__) && __GNUC__ >= 6 +# pragma GCC diagnostic pop +#endif + if (!std::is_trivially_destructible::value) + rec->free_data = [](function_record *r) { ((capture *) &r->data)->~capture(); }; + } else { + rec->data[0] = new capture { std::forward(f) }; + rec->free_data = [](function_record *r) { delete ((capture *) r->data[0]); }; + } + + /* Type casters for the function arguments and return value */ + using cast_in = argument_loader; + using cast_out = make_caster< + conditional_t::value, void_type, Return> + >; + + static_assert(expected_num_args(sizeof...(Args), cast_in::has_args, cast_in::has_kwargs), + "The number of argument annotations does not match the number of function arguments"); + + /* Dispatch code which converts function arguments and performs the actual function call */ + rec->impl = [](function_call &call) -> handle { + cast_in args_converter; + + /* Try to cast the function arguments into the C++ domain */ + if (!args_converter.load_args(call)) + return PYBIND11_TRY_NEXT_OVERLOAD; + + /* Invoke call policy pre-call hook */ + process_attributes::precall(call); + + /* Get a pointer to the capture object */ + auto data = (sizeof(capture) <= sizeof(call.func.data) + ? 
&call.func.data : call.func.data[0]); + capture *cap = const_cast(reinterpret_cast(data)); + + /* Override policy for rvalues -- usually to enforce rvp::move on an rvalue */ + return_value_policy policy = return_value_policy_override::policy(call.func.policy); + + /* Function scope guard -- defaults to the compile-to-nothing `void_type` */ + using Guard = extract_guard_t; + + /* Perform the function call */ + handle result = cast_out::cast( + std::move(args_converter).template call(cap->f), policy, call.parent); + + /* Invoke call policy post-call hook */ + process_attributes::postcall(call, result); + + return result; + }; + + /* Process any user-provided function attributes */ + process_attributes::init(extra..., rec); + + /* Generate a readable signature describing the function's arguments and return value types */ + static constexpr auto signature = _("(") + cast_in::arg_names + _(") -> ") + cast_out::name; + PYBIND11_DESCR_CONSTEXPR auto types = decltype(signature)::types(); + + /* Register the function with Python from generic (non-templated) code */ + initialize_generic(rec, signature.text, types.data(), sizeof...(Args)); + + if (cast_in::has_args) rec->has_args = true; + if (cast_in::has_kwargs) rec->has_kwargs = true; + + /* Stash some additional information used by an important optimization in 'functional.h' */ + using FunctionType = Return (*)(Args...); + constexpr bool is_function_ptr = + std::is_convertible::value && + sizeof(capture) == sizeof(void *); + if (is_function_ptr) { + rec->is_stateless = true; + rec->data[1] = const_cast(reinterpret_cast(&typeid(FunctionType))); + } + } + + /// Register a function call with Python (generic non-templated code goes here) + void initialize_generic(detail::function_record *rec, const char *text, + const std::type_info *const *types, size_t args) { + + /* Create copies of all referenced C-style strings */ + rec->name = strdup(rec->name ? rec->name : ""); + if (rec->doc) rec->doc = strdup(rec->doc); + for (auto &a: rec->args) { + if (a.name) + a.name = strdup(a.name); + if (a.descr) + a.descr = strdup(a.descr); + else if (a.value) + a.descr = strdup(a.value.attr("__repr__")().cast().c_str()); + } + + rec->is_constructor = !strcmp(rec->name, "__init__") || !strcmp(rec->name, "__setstate__"); + +#if !defined(NDEBUG) && !defined(PYBIND11_DISABLE_NEW_STYLE_INIT_WARNING) + if (rec->is_constructor && !rec->is_new_style_constructor) { + const auto class_name = std::string(((PyTypeObject *) rec->scope.ptr())->tp_name); + const auto func_name = std::string(rec->name); + PyErr_WarnEx( + PyExc_FutureWarning, + ("pybind11-bound class '" + class_name + "' is using an old-style " + "placement-new '" + func_name + "' which has been deprecated. See " + "the upgrade guide in pybind11's docs. This message is only visible " + "when compiled in debug mode.").c_str(), 0 + ); + } +#endif + + /* Generate a proper function signature */ + std::string signature; + size_t type_index = 0, arg_index = 0; + for (auto *pc = text; *pc != '\0'; ++pc) { + const auto c = *pc; + + if (c == '{') { + // Write arg name for everything except *args and **kwargs. + if (*(pc + 1) == '*') + continue; + + if (arg_index < rec->args.size() && rec->args[arg_index].name) { + signature += rec->args[arg_index].name; + } else if (arg_index == 0 && rec->is_method) { + signature += "self"; + } else { + signature += "arg" + std::to_string(arg_index - (rec->is_method ? 1 : 0)); + } + signature += ": "; + } else if (c == '}') { + // Write default value if available. 
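To make the expansion concrete, consider a hypothetical binding

    m.def("add", [](int a, int b) { return a + b; },
          py::arg("a"), py::arg("b") = 1);

Its compile-time text is "({%}, {%}) -> %": each '{' emits the argument name (or "self"/"argN" when none was given), each '}' appends the captured default, and each '%' is replaced by the Python-visible name of the corresponding C++ type, so the stored signature becomes "(a: int, b: int = 1) -> int". The default-value branch that performs the '}' step follows.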
+ if (arg_index < rec->args.size() && rec->args[arg_index].descr) { + signature += " = "; + signature += rec->args[arg_index].descr; + } + arg_index++; + } else if (c == '%') { + const std::type_info *t = types[type_index++]; + if (!t) + pybind11_fail("Internal error while parsing type signature (1)"); + if (auto tinfo = detail::get_type_info(*t)) { + handle th((PyObject *) tinfo->type); + signature += + th.attr("__module__").cast() + "." + + th.attr("__qualname__").cast(); // Python 3.3+, but we backport it to earlier versions + } else if (rec->is_new_style_constructor && arg_index == 0) { + // A new-style `__init__` takes `self` as `value_and_holder`. + // Rewrite it to the proper class type. + signature += + rec->scope.attr("__module__").cast() + "." + + rec->scope.attr("__qualname__").cast(); + } else { + std::string tname(t->name()); + detail::clean_type_id(tname); + signature += tname; + } + } else { + signature += c; + } + } + if (arg_index != args || types[type_index] != nullptr) + pybind11_fail("Internal error while parsing type signature (2)"); + +#if PY_MAJOR_VERSION < 3 + if (strcmp(rec->name, "__next__") == 0) { + std::free(rec->name); + rec->name = strdup("next"); + } else if (strcmp(rec->name, "__bool__") == 0) { + std::free(rec->name); + rec->name = strdup("__nonzero__"); + } +#endif + rec->signature = strdup(signature.c_str()); + rec->args.shrink_to_fit(); + rec->nargs = (std::uint16_t) args; + + if (rec->sibling && PYBIND11_INSTANCE_METHOD_CHECK(rec->sibling.ptr())) + rec->sibling = PYBIND11_INSTANCE_METHOD_GET_FUNCTION(rec->sibling.ptr()); + + detail::function_record *chain = nullptr, *chain_start = rec; + if (rec->sibling) { + if (PyCFunction_Check(rec->sibling.ptr())) { + auto rec_capsule = reinterpret_borrow(PyCFunction_GET_SELF(rec->sibling.ptr())); + chain = (detail::function_record *) rec_capsule; + /* Never append a method to an overload chain of a parent class; + instead, hide the parent's overloads in this case */ + if (!chain->scope.is(rec->scope)) + chain = nullptr; + } + // Don't trigger for things like the default __init__, which are wrapper_descriptors that we are intentionally replacing + else if (!rec->sibling.is_none() && rec->name[0] != '_') + pybind11_fail("Cannot overload existing non-function object \"" + std::string(rec->name) + + "\" with a function of the same name"); + } + + if (!chain) { + /* No existing overload was found, create a new function object */ + rec->def = new PyMethodDef(); + std::memset(rec->def, 0, sizeof(PyMethodDef)); + rec->def->ml_name = rec->name; + rec->def->ml_meth = reinterpret_cast(reinterpret_cast(*dispatcher)); + rec->def->ml_flags = METH_VARARGS | METH_KEYWORDS; + + capsule rec_capsule(rec, [](void *ptr) { + destruct((detail::function_record *) ptr); + }); + + object scope_module; + if (rec->scope) { + if (hasattr(rec->scope, "__module__")) { + scope_module = rec->scope.attr("__module__"); + } else if (hasattr(rec->scope, "__name__")) { + scope_module = rec->scope.attr("__name__"); + } + } + + m_ptr = PyCFunction_NewEx(rec->def, rec_capsule.ptr(), scope_module.ptr()); + if (!m_ptr) + pybind11_fail("cpp_function::cpp_function(): Could not allocate function object"); + } else { + /* Append at the end of the overload chain */ + m_ptr = rec->sibling.ptr(); + inc_ref(); + chain_start = chain; + if (chain->is_method != rec->is_method) + pybind11_fail("overloading a method with both static and instance methods is not supported; " + #if defined(NDEBUG) + "compile in debug mode for more details" + #else + "error while 
attempting to bind " + std::string(rec->is_method ? "instance" : "static") + " method " + + std::string(pybind11::str(rec->scope.attr("__name__"))) + "." + std::string(rec->name) + signature + #endif + ); + while (chain->next) + chain = chain->next; + chain->next = rec; + } + + std::string signatures; + int index = 0; + /* Create a nice pydoc rec including all signatures and + docstrings of the functions in the overload chain */ + if (chain && options::show_function_signatures()) { + // First a generic signature + signatures += rec->name; + signatures += "(*args, **kwargs)\n"; + signatures += "Overloaded function.\n\n"; + } + // Then specific overload signatures + bool first_user_def = true; + for (auto it = chain_start; it != nullptr; it = it->next) { + if (options::show_function_signatures()) { + if (index > 0) signatures += "\n"; + if (chain) + signatures += std::to_string(++index) + ". "; + signatures += rec->name; + signatures += it->signature; + signatures += "\n"; + } + if (it->doc && strlen(it->doc) > 0 && options::show_user_defined_docstrings()) { + // If we're appending another docstring, and aren't printing function signatures, we + // need to append a newline first: + if (!options::show_function_signatures()) { + if (first_user_def) first_user_def = false; + else signatures += "\n"; + } + if (options::show_function_signatures()) signatures += "\n"; + signatures += it->doc; + if (options::show_function_signatures()) signatures += "\n"; + } + } + + /* Install docstring */ + PyCFunctionObject *func = (PyCFunctionObject *) m_ptr; + if (func->m_ml->ml_doc) + std::free(const_cast(func->m_ml->ml_doc)); + func->m_ml->ml_doc = strdup(signatures.c_str()); + + if (rec->is_method) { + m_ptr = PYBIND11_INSTANCE_METHOD_NEW(m_ptr, rec->scope.ptr()); + if (!m_ptr) + pybind11_fail("cpp_function::cpp_function(): Could not allocate instance method object"); + Py_DECREF(func); + } + } + + /// When a cpp_function is GCed, release any memory allocated by pybind11 + static void destruct(detail::function_record *rec) { + while (rec) { + detail::function_record *next = rec->next; + if (rec->free_data) + rec->free_data(rec); + std::free((char *) rec->name); + std::free((char *) rec->doc); + std::free((char *) rec->signature); + for (auto &arg: rec->args) { + std::free(const_cast(arg.name)); + std::free(const_cast(arg.descr)); + arg.value.dec_ref(); + } + if (rec->def) { + std::free(const_cast(rec->def->ml_doc)); + delete rec->def; + } + delete rec; + rec = next; + } + } + + /// Main dispatch logic for calls to functions bound using pybind11 + static PyObject *dispatcher(PyObject *self, PyObject *args_in, PyObject *kwargs_in) { + using namespace detail; + + /* Iterator over the list of potentially admissible overloads */ + const function_record *overloads = (function_record *) PyCapsule_GetPointer(self, nullptr), + *it = overloads; + + /* Need to know how many arguments + keyword arguments there are to pick the right overload */ + const size_t n_args_in = (size_t) PyTuple_GET_SIZE(args_in); + + handle parent = n_args_in > 0 ? PyTuple_GET_ITEM(args_in, 0) : nullptr, + result = PYBIND11_TRY_NEXT_OVERLOAD; + + auto self_value_and_holder = value_and_holder(); + if (overloads->is_constructor) { + const auto tinfo = get_type_info((PyTypeObject *) overloads->scope.ptr()); + const auto pi = reinterpret_cast(parent.ptr()); + self_value_and_holder = pi->get_value_and_holder(tinfo, false); + + if (!self_value_and_holder.type || !self_value_and_holder.inst) { + PyErr_SetString(PyExc_TypeError, "__init__(self, ...) 
called with invalid `self` argument"); + return nullptr; + } + + // If this value is already registered it must mean __init__ is invoked multiple times; + // we really can't support that in C++, so just ignore the second __init__. + if (self_value_and_holder.instance_registered()) + return none().release().ptr(); + } + + try { + // We do this in two passes: in the first pass, we load arguments with `convert=false`; + // in the second, we allow conversion (except for arguments with an explicit + // py::arg().noconvert()). This lets us prefer calls without conversion, with + // conversion as a fallback. + std::vector second_pass; + + // However, if there are no overloads, we can just skip the no-convert pass entirely + const bool overloaded = it != nullptr && it->next != nullptr; + + for (; it != nullptr; it = it->next) { + + /* For each overload: + 1. Copy all positional arguments we were given, also checking to make sure that + named positional arguments weren't *also* specified via kwarg. + 2. If we weren't given enough, try to make up the omitted ones by checking + whether they were provided by a kwarg matching the `py::arg("name")` name. If + so, use it (and remove it from kwargs; if not, see if the function binding + provided a default that we can use. + 3. Ensure that either all keyword arguments were "consumed", or that the function + takes a kwargs argument to accept unconsumed kwargs. + 4. Any positional arguments still left get put into a tuple (for args), and any + leftover kwargs get put into a dict. + 5. Pack everything into a vector; if we have py::args or py::kwargs, they are an + extra tuple or dict at the end of the positional arguments. + 6. Call the function call dispatcher (function_record::impl) + + If one of these fail, move on to the next overload and keep trying until we get a + result other than PYBIND11_TRY_NEXT_OVERLOAD. + */ + + const function_record &func = *it; + size_t pos_args = func.nargs; // Number of positional arguments that we need + if (func.has_args) --pos_args; // (but don't count py::args + if (func.has_kwargs) --pos_args; // or py::kwargs) + + if (!func.has_args && n_args_in > pos_args) + continue; // Too many arguments for this overload + + if (n_args_in < pos_args && func.args.size() < pos_args) + continue; // Not enough arguments given, and not enough defaults to fill in the blanks + + function_call call(func, parent); + + size_t args_to_copy = std::min(pos_args, n_args_in); + size_t args_copied = 0; + + // 0. Inject new-style `self` argument + if (func.is_new_style_constructor) { + // The `value` may have been preallocated by an old-style `__init__` + // if it was a preceding candidate for overload resolution. + if (self_value_and_holder) + self_value_and_holder.type->dealloc(self_value_and_holder); + + call.init_self = PyTuple_GET_ITEM(args_in, 0); + call.args.push_back(reinterpret_cast(&self_value_and_holder)); + call.args_convert.push_back(false); + ++args_copied; + } + + // 1. Copy any position arguments given. + bool bad_arg = false; + for (; args_copied < args_to_copy; ++args_copied) { + const argument_record *arg_rec = args_copied < func.args.size() ? &func.args[args_copied] : nullptr; + if (kwargs_in && arg_rec && arg_rec->name && PyDict_GetItemString(kwargs_in, arg_rec->name)) { + bad_arg = true; + break; + } + + handle arg(PyTuple_GET_ITEM(args_in, args_copied)); + if (arg_rec && !arg_rec->none && arg.is_none()) { + bad_arg = true; + break; + } + call.args.push_back(arg); + call.args_convert.push_back(arg_rec ? 
arg_rec->convert : true); + } + if (bad_arg) + continue; // Maybe it was meant for another overload (issue #688) + + // We'll need to copy this if we steal some kwargs for defaults + dict kwargs = reinterpret_borrow(kwargs_in); + + // 2. Check kwargs and, failing that, defaults that may help complete the list + if (args_copied < pos_args) { + bool copied_kwargs = false; + + for (; args_copied < pos_args; ++args_copied) { + const auto &arg = func.args[args_copied]; + + handle value; + if (kwargs_in && arg.name) + value = PyDict_GetItemString(kwargs.ptr(), arg.name); + + if (value) { + // Consume a kwargs value + if (!copied_kwargs) { + kwargs = reinterpret_steal(PyDict_Copy(kwargs.ptr())); + copied_kwargs = true; + } + PyDict_DelItemString(kwargs.ptr(), arg.name); + } else if (arg.value) { + value = arg.value; + } + + if (value) { + call.args.push_back(value); + call.args_convert.push_back(arg.convert); + } + else + break; + } + + if (args_copied < pos_args) + continue; // Not enough arguments, defaults, or kwargs to fill the positional arguments + } + + // 3. Check everything was consumed (unless we have a kwargs arg) + if (kwargs && kwargs.size() > 0 && !func.has_kwargs) + continue; // Unconsumed kwargs, but no py::kwargs argument to accept them + + // 4a. If we have a py::args argument, create a new tuple with leftovers + if (func.has_args) { + tuple extra_args; + if (args_to_copy == 0) { + // We didn't copy out any position arguments from the args_in tuple, so we + // can reuse it directly without copying: + extra_args = reinterpret_borrow(args_in); + } else if (args_copied >= n_args_in) { + extra_args = tuple(0); + } else { + size_t args_size = n_args_in - args_copied; + extra_args = tuple(args_size); + for (size_t i = 0; i < args_size; ++i) { + extra_args[i] = PyTuple_GET_ITEM(args_in, args_copied + i); + } + } + call.args.push_back(extra_args); + call.args_convert.push_back(false); + call.args_ref = std::move(extra_args); + } + + // 4b. If we have a py::kwargs, pass on any remaining kwargs + if (func.has_kwargs) { + if (!kwargs.ptr()) + kwargs = dict(); // If we didn't get one, send an empty one + call.args.push_back(kwargs); + call.args_convert.push_back(false); + call.kwargs_ref = std::move(kwargs); + } + + // 5. Put everything in a vector. Not technically step 5, we've been building it + // in `call.args` all along. + #if !defined(NDEBUG) + if (call.args.size() != func.nargs || call.args_convert.size() != func.nargs) + pybind11_fail("Internal error: function call dispatcher inserted wrong number of arguments!"); + #endif + + std::vector second_pass_convert; + if (overloaded) { + // We're in the first no-convert pass, so swap out the conversion flags for a + // set of all-false flags. If the call fails, we'll swap the flags back in for + // the conversion-allowed call below. + second_pass_convert.resize(func.nargs, false); + call.args_convert.swap(second_pass_convert); + } + + // 6. Call the function. + try { + loader_life_support guard{}; + result = func.impl(call); + } catch (reference_cast_error &) { + result = PYBIND11_TRY_NEXT_OVERLOAD; + } + + if (result.ptr() != PYBIND11_TRY_NEXT_OVERLOAD) + break; + + if (overloaded) { + // The (overloaded) call failed; if the call has at least one argument that + // permits conversion (i.e. it hasn't been explicitly specified `.noconvert()`) + // then add this call to the list of second pass overloads to try. + for (size_t i = func.is_method ? 
1 : 0; i < pos_args; i++) { + if (second_pass_convert[i]) { + // Found one: swap the converting flags back in and store the call for + // the second pass. + call.args_convert.swap(second_pass_convert); + second_pass.push_back(std::move(call)); + break; + } + } + } + } + + if (overloaded && !second_pass.empty() && result.ptr() == PYBIND11_TRY_NEXT_OVERLOAD) { + // The no-conversion pass finished without success, try again with conversion allowed + for (auto &call : second_pass) { + try { + loader_life_support guard{}; + result = call.func.impl(call); + } catch (reference_cast_error &) { + result = PYBIND11_TRY_NEXT_OVERLOAD; + } + + if (result.ptr() != PYBIND11_TRY_NEXT_OVERLOAD) { + // The error reporting logic below expects 'it' to be valid, as it would be + // if we'd encountered this failure in the first-pass loop. + if (!result) + it = &call.func; + break; + } + } + } + } catch (error_already_set &e) { + e.restore(); + return nullptr; +#if defined(__GNUG__) && !defined(__clang__) + } catch ( abi::__forced_unwind& ) { + throw; +#endif + } catch (...) { + /* When an exception is caught, give each registered exception + translator a chance to translate it to a Python exception + in reverse order of registration. + + A translator may choose to do one of the following: + + - catch the exception and call PyErr_SetString or PyErr_SetObject + to set a standard (or custom) Python exception, or + - do nothing and let the exception fall through to the next translator, or + - delegate translation to the next translator by throwing a new type of exception. */ + + auto last_exception = std::current_exception(); + auto ®istered_exception_translators = get_internals().registered_exception_translators; + for (auto& translator : registered_exception_translators) { + try { + translator(last_exception); + } catch (...) { + last_exception = std::current_exception(); + continue; + } + return nullptr; + } + PyErr_SetString(PyExc_SystemError, "Exception escaped from default exception translator!"); + return nullptr; + } + + auto append_note_if_missing_header_is_suspected = [](std::string &msg) { + if (msg.find("std::") != std::string::npos) { + msg += "\n\n" + "Did you forget to `#include `? Or ,\n" + ", , etc. Some automatic\n" + "conversions are optional and require extra headers to be included\n" + "when compiling your pybind11 module."; + } + }; + + if (result.ptr() == PYBIND11_TRY_NEXT_OVERLOAD) { + if (overloads->is_operator) + return handle(Py_NotImplemented).inc_ref().ptr(); + + std::string msg = std::string(overloads->name) + "(): incompatible " + + std::string(overloads->is_constructor ? "constructor" : "function") + + " arguments. The following argument types are supported:\n"; + + int ctr = 0; + for (const function_record *it2 = overloads; it2 != nullptr; it2 = it2->next) { + msg += " "+ std::to_string(++ctr) + ". "; + + bool wrote_sig = false; + if (overloads->is_constructor) { + // For a constructor, rewrite `(self: Object, arg0, ...) 
-> NoneType` as `Object(arg0, ...)` + std::string sig = it2->signature; + size_t start = sig.find('(') + 7; // skip "(self: " + if (start < sig.size()) { + // End at the , for the next argument + size_t end = sig.find(", "), next = end + 2; + size_t ret = sig.rfind(" -> "); + // Or the ), if there is no comma: + if (end >= sig.size()) next = end = sig.find(')'); + if (start < end && next < sig.size()) { + msg.append(sig, start, end - start); + msg += '('; + msg.append(sig, next, ret - next); + wrote_sig = true; + } + } + } + if (!wrote_sig) msg += it2->signature; + + msg += "\n"; + } + msg += "\nInvoked with: "; + auto args_ = reinterpret_borrow(args_in); + bool some_args = false; + for (size_t ti = overloads->is_constructor ? 1 : 0; ti < args_.size(); ++ti) { + if (!some_args) some_args = true; + else msg += ", "; + msg += pybind11::repr(args_[ti]); + } + if (kwargs_in) { + auto kwargs = reinterpret_borrow(kwargs_in); + if (kwargs.size() > 0) { + if (some_args) msg += "; "; + msg += "kwargs: "; + bool first = true; + for (auto kwarg : kwargs) { + if (first) first = false; + else msg += ", "; + msg += pybind11::str("{}={!r}").format(kwarg.first, kwarg.second); + } + } + } + + append_note_if_missing_header_is_suspected(msg); + PyErr_SetString(PyExc_TypeError, msg.c_str()); + return nullptr; + } else if (!result) { + std::string msg = "Unable to convert function return value to a " + "Python type! The signature was\n\t"; + msg += it->signature; + append_note_if_missing_header_is_suspected(msg); + PyErr_SetString(PyExc_TypeError, msg.c_str()); + return nullptr; + } else { + if (overloads->is_constructor && !self_value_and_holder.holder_constructed()) { + auto *pi = reinterpret_cast(parent.ptr()); + self_value_and_holder.type->init_instance(pi, nullptr); + } + return result.ptr(); + } + } +}; + +/// Wrapper for Python extension modules +class module : public object { +public: + PYBIND11_OBJECT_DEFAULT(module, object, PyModule_Check) + + /// Create a new top-level Python module with the given name and docstring + explicit module(const char *name, const char *doc = nullptr) { + if (!options::show_user_defined_docstrings()) doc = nullptr; +#if PY_MAJOR_VERSION >= 3 + PyModuleDef *def = new PyModuleDef(); + std::memset(def, 0, sizeof(PyModuleDef)); + def->m_name = name; + def->m_doc = doc; + def->m_size = -1; + Py_INCREF(def); + m_ptr = PyModule_Create(def); +#else + m_ptr = Py_InitModule3(name, nullptr, doc); +#endif + if (m_ptr == nullptr) + pybind11_fail("Internal error in module::module()"); + inc_ref(); + } + + /** \rst + Create Python binding for a new function within the module scope. ``Func`` + can be a plain C++ function, a function pointer, or a lambda function. For + details on the ``Extra&& ... extra`` argument, see section :ref:`extras`. + \endrst */ + template + module &def(const char *name_, Func &&f, const Extra& ... extra) { + cpp_function func(std::forward(f), name(name_), scope(*this), + sibling(getattr(*this, name_, none())), extra...); + // NB: allow overwriting here because cpp_function sets up a chain with the intention of + // overwriting (and has already checked internally that it isn't overwriting non-functions). + add_object(name_, func, true /* overwrite */); + return *this; + } + + /** \rst + Create and return a new Python submodule with the given name and docstring. + This also works recursively, i.e. + + .. 
code-block:: cpp + + py::module m("example", "pybind11 example plugin"); + py::module m2 = m.def_submodule("sub", "A submodule of 'example'"); + py::module m3 = m2.def_submodule("subsub", "A submodule of 'example.sub'"); + \endrst */ + module def_submodule(const char *name, const char *doc = nullptr) { + std::string full_name = std::string(PyModule_GetName(m_ptr)) + + std::string(".") + std::string(name); + auto result = reinterpret_borrow(PyImport_AddModule(full_name.c_str())); + if (doc && options::show_user_defined_docstrings()) + result.attr("__doc__") = pybind11::str(doc); + attr(name) = result; + return result; + } + + /// Import and return a module or throws `error_already_set`. + static module import(const char *name) { + PyObject *obj = PyImport_ImportModule(name); + if (!obj) + throw error_already_set(); + return reinterpret_steal(obj); + } + + /// Reload the module or throws `error_already_set`. + void reload() { + PyObject *obj = PyImport_ReloadModule(ptr()); + if (!obj) + throw error_already_set(); + *this = reinterpret_steal(obj); + } + + // Adds an object to the module using the given name. Throws if an object with the given name + // already exists. + // + // overwrite should almost always be false: attempting to overwrite objects that pybind11 has + // established will, in most cases, break things. + PYBIND11_NOINLINE void add_object(const char *name, handle obj, bool overwrite = false) { + if (!overwrite && hasattr(*this, name)) + pybind11_fail("Error during initialization: multiple incompatible definitions with name \"" + + std::string(name) + "\""); + + PyModule_AddObject(ptr(), name, obj.inc_ref().ptr() /* steals a reference */); + } +}; + +/// \ingroup python_builtins +/// Return a dictionary representing the global variables in the current execution frame, +/// or ``__main__.__dict__`` if there is no frame (usually when the interpreter is embedded). +inline dict globals() { + PyObject *p = PyEval_GetGlobals(); + return reinterpret_borrow(p ? p : module::import("__main__").attr("__dict__").ptr()); +} + +NAMESPACE_BEGIN(detail) +/// Generic support for creating new Python heap types +class generic_type : public object { + template friend class class_; +public: + PYBIND11_OBJECT_DEFAULT(generic_type, object, PyType_Check) +protected: + void initialize(const type_record &rec) { + if (rec.scope && hasattr(rec.scope, rec.name)) + pybind11_fail("generic_type: cannot initialize type \"" + std::string(rec.name) + + "\": an object with that name is already defined"); + + if (rec.module_local ? 
get_local_type_info(*rec.type) : get_global_type_info(*rec.type)) + pybind11_fail("generic_type: type \"" + std::string(rec.name) + + "\" is already registered!"); + + m_ptr = make_new_python_type(rec); + + /* Register supplemental type information in C++ dict */ + auto *tinfo = new detail::type_info(); + tinfo->type = (PyTypeObject *) m_ptr; + tinfo->cpptype = rec.type; + tinfo->type_size = rec.type_size; + tinfo->type_align = rec.type_align; + tinfo->operator_new = rec.operator_new; + tinfo->holder_size_in_ptrs = size_in_ptrs(rec.holder_size); + tinfo->init_instance = rec.init_instance; + tinfo->dealloc = rec.dealloc; + tinfo->simple_type = true; + tinfo->simple_ancestors = true; + tinfo->default_holder = rec.default_holder; + tinfo->module_local = rec.module_local; + + auto &internals = get_internals(); + auto tindex = std::type_index(*rec.type); + tinfo->direct_conversions = &internals.direct_conversions[tindex]; + if (rec.module_local) + registered_local_types_cpp()[tindex] = tinfo; + else + internals.registered_types_cpp[tindex] = tinfo; + internals.registered_types_py[(PyTypeObject *) m_ptr] = { tinfo }; + + if (rec.bases.size() > 1 || rec.multiple_inheritance) { + mark_parents_nonsimple(tinfo->type); + tinfo->simple_ancestors = false; + } + else if (rec.bases.size() == 1) { + auto parent_tinfo = get_type_info((PyTypeObject *) rec.bases[0].ptr()); + tinfo->simple_ancestors = parent_tinfo->simple_ancestors; + } + + if (rec.module_local) { + // Stash the local typeinfo and loader so that external modules can access it. + tinfo->module_local_load = &type_caster_generic::local_load; + setattr(m_ptr, PYBIND11_MODULE_LOCAL_ID, capsule(tinfo)); + } + } + + /// Helper function which tags all parents of a type using mult. inheritance + void mark_parents_nonsimple(PyTypeObject *value) { + auto t = reinterpret_borrow(value->tp_bases); + for (handle h : t) { + auto tinfo2 = get_type_info((PyTypeObject *) h.ptr()); + if (tinfo2) + tinfo2->simple_type = false; + mark_parents_nonsimple((PyTypeObject *) h.ptr()); + } + } + + void install_buffer_funcs( + buffer_info *(*get_buffer)(PyObject *, void *), + void *get_buffer_data) { + PyHeapTypeObject *type = (PyHeapTypeObject*) m_ptr; + auto tinfo = detail::get_type_info(&type->ht_type); + + if (!type->ht_type.tp_as_buffer) + pybind11_fail( + "To be able to register buffer protocol support for the type '" + + std::string(tinfo->type->tp_name) + + "' the associated class<>(..) invocation must " + "include the pybind11::buffer_protocol() annotation!"); + + tinfo->get_buffer = get_buffer; + tinfo->get_buffer_data = get_buffer_data; + } + + // rec_func must be set for either fget or fset. + void def_property_static_impl(const char *name, + handle fget, handle fset, + detail::function_record *rec_func) { + const auto is_static = rec_func && !(rec_func->is_method && rec_func->scope); + const auto has_doc = rec_func && rec_func->doc && pybind11::options::show_user_defined_docstrings(); + auto property = handle((PyObject *) (is_static ? get_internals().static_property_type + : &PyProperty_Type)); + attr(name) = property(fget.ptr() ? fget : none(), + fset.ptr() ? fset : none(), + /*deleter*/none(), + pybind11::str(has_doc ? rec_func->doc : "")); + } +}; + +/// Set the pointer to operator new if it exists. The cast is needed because it can be overloaded. +template (T::operator new))>> +void set_operator_new(type_record *r) { r->operator_new = &T::operator new; } + +template void set_operator_new(...) 
{ } + +template struct has_operator_delete : std::false_type { }; +template struct has_operator_delete(T::operator delete))>> + : std::true_type { }; +template struct has_operator_delete_size : std::false_type { }; +template struct has_operator_delete_size(T::operator delete))>> + : std::true_type { }; +/// Call class-specific delete if it exists or global otherwise. Can also be an overload set. +template ::value, int> = 0> +void call_operator_delete(T *p, size_t, size_t) { T::operator delete(p); } +template ::value && has_operator_delete_size::value, int> = 0> +void call_operator_delete(T *p, size_t s, size_t) { T::operator delete(p, s); } + +inline void call_operator_delete(void *p, size_t s, size_t a) { + (void)s; (void)a; +#if defined(PYBIND11_CPP17) + if (a > __STDCPP_DEFAULT_NEW_ALIGNMENT__) + ::operator delete(p, s, std::align_val_t(a)); + else + ::operator delete(p, s); +#else + ::operator delete(p); +#endif +} + +NAMESPACE_END(detail) + +/// Given a pointer to a member function, cast it to its `Derived` version. +/// Forward everything else unchanged. +template +auto method_adaptor(F &&f) -> decltype(std::forward(f)) { return std::forward(f); } + +template +auto method_adaptor(Return (Class::*pmf)(Args...)) -> Return (Derived::*)(Args...) { + static_assert(detail::is_accessible_base_of::value, + "Cannot bind an inaccessible base class method; use a lambda definition instead"); + return pmf; +} + +template +auto method_adaptor(Return (Class::*pmf)(Args...) const) -> Return (Derived::*)(Args...) const { + static_assert(detail::is_accessible_base_of::value, + "Cannot bind an inaccessible base class method; use a lambda definition instead"); + return pmf; +} + +template +class class_ : public detail::generic_type { + template using is_holder = detail::is_holder_type; + template using is_subtype = detail::is_strict_base_of; + template using is_base = detail::is_strict_base_of; + // struct instead of using here to help MSVC: + template struct is_valid_class_option : + detail::any_of, is_subtype, is_base> {}; + +public: + using type = type_; + using type_alias = detail::exactly_one_t; + constexpr static bool has_alias = !std::is_void::value; + using holder_type = detail::exactly_one_t, options...>; + + static_assert(detail::all_of...>::value, + "Unknown/invalid class_ template parameters provided"); + + static_assert(!has_alias || std::is_polymorphic::value, + "Cannot use an alias class with a non-polymorphic type"); + + PYBIND11_OBJECT(class_, generic_type, PyType_Check) + + template + class_(handle scope, const char *name, const Extra &... extra) { + using namespace detail; + + // MI can only be specified via class_ template options, not constructor parameters + static_assert( + none_of...>::value || // no base class arguments, or: + ( constexpr_sum(is_pyobject::value...) == 1 && // Exactly one base + constexpr_sum(is_base::value...) 
== 0 && // no template option bases + none_of...>::value), // no multiple_inheritance attr + "Error: multiple inheritance bases must be specified via class_ template options"); + + type_record record; + record.scope = scope; + record.name = name; + record.type = &typeid(type); + record.type_size = sizeof(conditional_t); + record.type_align = alignof(conditional_t&); + record.holder_size = sizeof(holder_type); + record.init_instance = init_instance; + record.dealloc = dealloc; + record.default_holder = detail::is_instantiation::value; + + set_operator_new(&record); + + /* Register base classes specified via template arguments to class_, if any */ + PYBIND11_EXPAND_SIDE_EFFECTS(add_base(record)); + + /* Process optional arguments, if any */ + process_attributes::init(extra..., &record); + + generic_type::initialize(record); + + if (has_alias) { + auto &instances = record.module_local ? registered_local_types_cpp() : get_internals().registered_types_cpp; + instances[std::type_index(typeid(type_alias))] = instances[std::type_index(typeid(type))]; + } + } + + template ::value, int> = 0> + static void add_base(detail::type_record &rec) { + rec.add_base(typeid(Base), [](void *src) -> void * { + return static_cast(reinterpret_cast(src)); + }); + } + + template ::value, int> = 0> + static void add_base(detail::type_record &) { } + + template + class_ &def(const char *name_, Func&& f, const Extra&... extra) { + cpp_function cf(method_adaptor(std::forward(f)), name(name_), is_method(*this), + sibling(getattr(*this, name_, none())), extra...); + attr(cf.name()) = cf; + return *this; + } + + template class_ & + def_static(const char *name_, Func &&f, const Extra&... extra) { + static_assert(!std::is_member_function_pointer::value, + "def_static(...) called with a non-static member function pointer"); + cpp_function cf(std::forward(f), name(name_), scope(*this), + sibling(getattr(*this, name_, none())), extra...); + attr(cf.name()) = staticmethod(cf); + return *this; + } + + template + class_ &def(const detail::op_ &op, const Extra&... extra) { + op.execute(*this, extra...); + return *this; + } + + template + class_ & def_cast(const detail::op_ &op, const Extra&... extra) { + op.execute_cast(*this, extra...); + return *this; + } + + template + class_ &def(const detail::initimpl::constructor &init, const Extra&... extra) { + init.execute(*this, extra...); + return *this; + } + + template + class_ &def(const detail::initimpl::alias_constructor &init, const Extra&... extra) { + init.execute(*this, extra...); + return *this; + } + + template + class_ &def(detail::initimpl::factory &&init, const Extra&... extra) { + std::move(init).execute(*this, extra...); + return *this; + } + + template + class_ &def(detail::initimpl::pickle_factory &&pf, const Extra &...extra) { + std::move(pf).execute(*this, extra...); + return *this; + } + + template class_& def_buffer(Func &&func) { + struct capture { Func func; }; + capture *ptr = new capture { std::forward(func) }; + install_buffer_funcs([](PyObject *obj, void *ptr) -> buffer_info* { + detail::make_caster caster; + if (!caster.load(obj, false)) + return nullptr; + return new buffer_info(((capture *) ptr)->func(caster)); + }, ptr); + return *this; + } + + template + class_ &def_buffer(Return (Class::*func)(Args...)) { + return def_buffer([func] (type &obj) { return (obj.*func)(); }); + } + + template + class_ &def_buffer(Return (Class::*func)(Args...) 
const) { + return def_buffer([func] (const type &obj) { return (obj.*func)(); }); + } + + template + class_ &def_readwrite(const char *name, D C::*pm, const Extra&... extra) { + static_assert(std::is_same::value || std::is_base_of::value, "def_readwrite() requires a class member (or base class member)"); + cpp_function fget([pm](const type &c) -> const D &{ return c.*pm; }, is_method(*this)), + fset([pm](type &c, const D &value) { c.*pm = value; }, is_method(*this)); + def_property(name, fget, fset, return_value_policy::reference_internal, extra...); + return *this; + } + + template + class_ &def_readonly(const char *name, const D C::*pm, const Extra& ...extra) { + static_assert(std::is_same::value || std::is_base_of::value, "def_readonly() requires a class member (or base class member)"); + cpp_function fget([pm](const type &c) -> const D &{ return c.*pm; }, is_method(*this)); + def_property_readonly(name, fget, return_value_policy::reference_internal, extra...); + return *this; + } + + template + class_ &def_readwrite_static(const char *name, D *pm, const Extra& ...extra) { + cpp_function fget([pm](object) -> const D &{ return *pm; }, scope(*this)), + fset([pm](object, const D &value) { *pm = value; }, scope(*this)); + def_property_static(name, fget, fset, return_value_policy::reference, extra...); + return *this; + } + + template + class_ &def_readonly_static(const char *name, const D *pm, const Extra& ...extra) { + cpp_function fget([pm](object) -> const D &{ return *pm; }, scope(*this)); + def_property_readonly_static(name, fget, return_value_policy::reference, extra...); + return *this; + } + + /// Uses return_value_policy::reference_internal by default + template + class_ &def_property_readonly(const char *name, const Getter &fget, const Extra& ...extra) { + return def_property_readonly(name, cpp_function(method_adaptor(fget)), + return_value_policy::reference_internal, extra...); + } + + /// Uses cpp_function's return_value_policy by default + template + class_ &def_property_readonly(const char *name, const cpp_function &fget, const Extra& ...extra) { + return def_property(name, fget, nullptr, extra...); + } + + /// Uses return_value_policy::reference by default + template + class_ &def_property_readonly_static(const char *name, const Getter &fget, const Extra& ...extra) { + return def_property_readonly_static(name, cpp_function(fget), return_value_policy::reference, extra...); + } + + /// Uses cpp_function's return_value_policy by default + template + class_ &def_property_readonly_static(const char *name, const cpp_function &fget, const Extra& ...extra) { + return def_property_static(name, fget, nullptr, extra...); + } + + /// Uses return_value_policy::reference_internal by default + template + class_ &def_property(const char *name, const Getter &fget, const Setter &fset, const Extra& ...extra) { + return def_property(name, fget, cpp_function(method_adaptor(fset)), extra...); + } + template + class_ &def_property(const char *name, const Getter &fget, const cpp_function &fset, const Extra& ...extra) { + return def_property(name, cpp_function(method_adaptor(fget)), fset, + return_value_policy::reference_internal, extra...); + } + + /// Uses cpp_function's return_value_policy by default + template + class_ &def_property(const char *name, const cpp_function &fget, const cpp_function &fset, const Extra& ...extra) { + return def_property_static(name, fget, fset, is_method(*this), extra...); + } + + /// Uses return_value_policy::reference by default + template + class_ 
&def_property_static(const char *name, const Getter &fget, const cpp_function &fset, const Extra& ...extra) { + return def_property_static(name, cpp_function(fget), fset, return_value_policy::reference, extra...); + } + + /// Uses cpp_function's return_value_policy by default + template + class_ &def_property_static(const char *name, const cpp_function &fget, const cpp_function &fset, const Extra& ...extra) { + static_assert( 0 == detail::constexpr_sum(std::is_base_of::value...), + "Argument annotations are not allowed for properties"); + auto rec_fget = get_function_record(fget), rec_fset = get_function_record(fset); + auto *rec_active = rec_fget; + if (rec_fget) { + char *doc_prev = rec_fget->doc; /* 'extra' field may include a property-specific documentation string */ + detail::process_attributes::init(extra..., rec_fget); + if (rec_fget->doc && rec_fget->doc != doc_prev) { + free(doc_prev); + rec_fget->doc = strdup(rec_fget->doc); + } + } + if (rec_fset) { + char *doc_prev = rec_fset->doc; + detail::process_attributes::init(extra..., rec_fset); + if (rec_fset->doc && rec_fset->doc != doc_prev) { + free(doc_prev); + rec_fset->doc = strdup(rec_fset->doc); + } + if (! rec_active) rec_active = rec_fset; + } + def_property_static_impl(name, fget, fset, rec_active); + return *this; + } + +private: + /// Initialize holder object, variant 1: object derives from enable_shared_from_this + template + static void init_holder(detail::instance *inst, detail::value_and_holder &v_h, + const holder_type * /* unused */, const std::enable_shared_from_this * /* dummy */) { + try { + auto sh = std::dynamic_pointer_cast( + v_h.value_ptr()->shared_from_this()); + if (sh) { + new (std::addressof(v_h.holder())) holder_type(std::move(sh)); + v_h.set_holder_constructed(); + } + } catch (const std::bad_weak_ptr &) {} + + if (!v_h.holder_constructed() && inst->owned) { + new (std::addressof(v_h.holder())) holder_type(v_h.value_ptr()); + v_h.set_holder_constructed(); + } + } + + static void init_holder_from_existing(const detail::value_and_holder &v_h, + const holder_type *holder_ptr, std::true_type /*is_copy_constructible*/) { + new (std::addressof(v_h.holder())) holder_type(*reinterpret_cast(holder_ptr)); + } + + static void init_holder_from_existing(const detail::value_and_holder &v_h, + const holder_type *holder_ptr, std::false_type /*is_copy_constructible*/) { + new (std::addressof(v_h.holder())) holder_type(std::move(*const_cast(holder_ptr))); + } + + /// Initialize holder object, variant 2: try to construct from existing holder object, if possible + static void init_holder(detail::instance *inst, detail::value_and_holder &v_h, + const holder_type *holder_ptr, const void * /* dummy -- not enable_shared_from_this) */) { + if (holder_ptr) { + init_holder_from_existing(v_h, holder_ptr, std::is_copy_constructible()); + v_h.set_holder_constructed(); + } else if (inst->owned || detail::always_construct_holder::value) { + new (std::addressof(v_h.holder())) holder_type(v_h.value_ptr()); + v_h.set_holder_constructed(); + } + } + + /// Performs instance initialization including constructing a holder and registering the known + /// instance. Should be called as soon as the `type` value_ptr is set for an instance. Takes an + /// optional pointer to an existing holder to use; if not specified and the instance is + /// `.owned`, a new holder will be constructed to manage the value pointer. 
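+    /// A sketch of how a custom holder comes into play from user code (the
+    /// `Example` class and module handle `m` below are illustrative, not part
+    /// of this header):
+    ///
+    ///     py::class_<Example, std::shared_ptr<Example>>(m, "Example");
+    ///
+    /// With `std::shared_ptr<Example>` as the holder type, `init_holder` above
+    /// prefers `shared_from_this()` when `Example` derives from
+    /// `std::enable_shared_from_this<Example>`, and otherwise constructs a
+    /// fresh holder around the raw value pointer.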
+ static void init_instance(detail::instance *inst, const void *holder_ptr) { + auto v_h = inst->get_value_and_holder(detail::get_type_info(typeid(type))); + if (!v_h.instance_registered()) { + register_instance(inst, v_h.value_ptr(), v_h.type); + v_h.set_instance_registered(); + } + init_holder(inst, v_h, (const holder_type *) holder_ptr, v_h.value_ptr()); + } + + /// Deallocates an instance; via holder, if constructed; otherwise via operator delete. + static void dealloc(detail::value_and_holder &v_h) { + if (v_h.holder_constructed()) { + v_h.holder().~holder_type(); + v_h.set_holder_constructed(false); + } + else { + detail::call_operator_delete(v_h.value_ptr(), + v_h.type->type_size, + v_h.type->type_align + ); + } + v_h.value_ptr() = nullptr; + } + + static detail::function_record *get_function_record(handle h) { + h = detail::get_function(h); + return h ? (detail::function_record *) reinterpret_borrow(PyCFunction_GET_SELF(h.ptr())) + : nullptr; + } +}; + +/// Binds an existing constructor taking arguments Args... +template detail::initimpl::constructor init() { return {}; } +/// Like `init()`, but the instance is always constructed through the alias class (even +/// when not inheriting on the Python side). +template detail::initimpl::alias_constructor init_alias() { return {}; } + +/// Binds a factory function as a constructor +template > +Ret init(Func &&f) { return {std::forward(f)}; } + +/// Dual-argument factory function: the first function is called when no alias is needed, the second +/// when an alias is needed (i.e. due to python-side inheritance). Arguments must be identical. +template > +Ret init(CFunc &&c, AFunc &&a) { + return {std::forward(c), std::forward(a)}; +} + +/// Binds pickling functions `__getstate__` and `__setstate__` and ensures that the type +/// returned by `__getstate__` is the same as the argument accepted by `__setstate__`. 
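+/// A minimal usage sketch of the pickling support declared below (the `Pet`
+/// class, its `name` member, and the module handle `m` are illustrative):
+///
+///     py::class_<Pet>(m, "Pet")
+///         .def(py::init<std::string>())
+///         .def(py::pickle(
+///             [](const Pet &p) {                        // __getstate__
+///                 return py::make_tuple(p.name);
+///             },
+///             [](py::tuple t) {                         // __setstate__
+///                 return Pet(t[0].cast<std::string>());
+///             }));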
+template +detail::initimpl::pickle_factory pickle(GetState &&g, SetState &&s) { + return {std::forward(g), std::forward(s)}; +} + +NAMESPACE_BEGIN(detail) +struct enum_base { + enum_base(handle base, handle parent) : m_base(base), m_parent(parent) { } + + PYBIND11_NOINLINE void init(bool is_arithmetic, bool is_convertible) { + m_base.attr("__entries") = dict(); + auto property = handle((PyObject *) &PyProperty_Type); + auto static_property = handle((PyObject *) get_internals().static_property_type); + + m_base.attr("__repr__") = cpp_function( + [](handle arg) -> str { + handle type = arg.get_type(); + object type_name = type.attr("__name__"); + dict entries = type.attr("__entries"); + for (const auto &kv : entries) { + object other = kv.second[int_(0)]; + if (other.equal(arg)) + return pybind11::str("{}.{}").format(type_name, kv.first); + } + return pybind11::str("{}.???").format(type_name); + }, is_method(m_base) + ); + + m_base.attr("name") = property(cpp_function( + [](handle arg) -> str { + dict entries = arg.get_type().attr("__entries"); + for (const auto &kv : entries) { + if (handle(kv.second[int_(0)]).equal(arg)) + return pybind11::str(kv.first); + } + return "???"; + }, is_method(m_base) + )); + + m_base.attr("__doc__") = static_property(cpp_function( + [](handle arg) -> std::string { + std::string docstring; + dict entries = arg.attr("__entries"); + if (((PyTypeObject *) arg.ptr())->tp_doc) + docstring += std::string(((PyTypeObject *) arg.ptr())->tp_doc) + "\n\n"; + docstring += "Members:"; + for (const auto &kv : entries) { + auto key = std::string(pybind11::str(kv.first)); + auto comment = kv.second[int_(1)]; + docstring += "\n\n " + key; + if (!comment.is_none()) + docstring += " : " + (std::string) pybind11::str(comment); + } + return docstring; + } + ), none(), none(), ""); + + m_base.attr("__members__") = static_property(cpp_function( + [](handle arg) -> dict { + dict entries = arg.attr("__entries"), m; + for (const auto &kv : entries) + m[kv.first] = kv.second[int_(0)]; + return m; + }), none(), none(), "" + ); + + #define PYBIND11_ENUM_OP_STRICT(op, expr, strict_behavior) \ + m_base.attr(op) = cpp_function( \ + [](object a, object b) { \ + if (!a.get_type().is(b.get_type())) \ + strict_behavior; \ + return expr; \ + }, \ + is_method(m_base)) + + #define PYBIND11_ENUM_OP_CONV(op, expr) \ + m_base.attr(op) = cpp_function( \ + [](object a_, object b_) { \ + int_ a(a_), b(b_); \ + return expr; \ + }, \ + is_method(m_base)) + + if (is_convertible) { + PYBIND11_ENUM_OP_CONV("__eq__", !b.is_none() && a.equal(b)); + PYBIND11_ENUM_OP_CONV("__ne__", b.is_none() || !a.equal(b)); + + if (is_arithmetic) { + PYBIND11_ENUM_OP_CONV("__lt__", a < b); + PYBIND11_ENUM_OP_CONV("__gt__", a > b); + PYBIND11_ENUM_OP_CONV("__le__", a <= b); + PYBIND11_ENUM_OP_CONV("__ge__", a >= b); + PYBIND11_ENUM_OP_CONV("__and__", a & b); + PYBIND11_ENUM_OP_CONV("__rand__", a & b); + PYBIND11_ENUM_OP_CONV("__or__", a | b); + PYBIND11_ENUM_OP_CONV("__ror__", a | b); + PYBIND11_ENUM_OP_CONV("__xor__", a ^ b); + PYBIND11_ENUM_OP_CONV("__rxor__", a ^ b); + } + } else { + PYBIND11_ENUM_OP_STRICT("__eq__", int_(a).equal(int_(b)), return false); + PYBIND11_ENUM_OP_STRICT("__ne__", !int_(a).equal(int_(b)), return true); + + if (is_arithmetic) { + #define PYBIND11_THROW throw type_error("Expected an enumeration of matching type!"); + PYBIND11_ENUM_OP_STRICT("__lt__", int_(a) < int_(b), PYBIND11_THROW); + PYBIND11_ENUM_OP_STRICT("__gt__", int_(a) > int_(b), PYBIND11_THROW); + PYBIND11_ENUM_OP_STRICT("__le__", int_(a) 
<= int_(b), PYBIND11_THROW); + PYBIND11_ENUM_OP_STRICT("__ge__", int_(a) >= int_(b), PYBIND11_THROW); + #undef PYBIND11_THROW + } + } + + #undef PYBIND11_ENUM_OP_CONV + #undef PYBIND11_ENUM_OP_STRICT + + object getstate = cpp_function( + [](object arg) { return int_(arg); }, is_method(m_base)); + + m_base.attr("__getstate__") = getstate; + m_base.attr("__hash__") = getstate; + } + + PYBIND11_NOINLINE void value(char const* name_, object value, const char *doc = nullptr) { + dict entries = m_base.attr("__entries"); + str name(name_); + if (entries.contains(name)) { + std::string type_name = (std::string) str(m_base.attr("__name__")); + throw value_error(type_name + ": element \"" + std::string(name_) + "\" already exists!"); + } + + entries[name] = std::make_pair(value, doc); + m_base.attr(name) = value; + } + + PYBIND11_NOINLINE void export_values() { + dict entries = m_base.attr("__entries"); + for (const auto &kv : entries) + m_parent.attr(kv.first) = kv.second[int_(0)]; + } + + handle m_base; + handle m_parent; +}; + +NAMESPACE_END(detail) + +/// Binds C++ enumerations and enumeration classes to Python +template class enum_ : public class_ { +public: + using Base = class_; + using Base::def; + using Base::attr; + using Base::def_property_readonly; + using Base::def_property_readonly_static; + using Scalar = typename std::underlying_type::type; + + template + enum_(const handle &scope, const char *name, const Extra&... extra) + : class_(scope, name, extra...), m_base(*this, scope) { + constexpr bool is_arithmetic = detail::any_of...>::value; + constexpr bool is_convertible = std::is_convertible::value; + m_base.init(is_arithmetic, is_convertible); + + def(init([](Scalar i) { return static_cast(i); })); + def("__int__", [](Type value) { return (Scalar) value; }); + #if PY_MAJOR_VERSION < 3 + def("__long__", [](Type value) { return (Scalar) value; }); + #endif + cpp_function setstate( + [](Type &value, Scalar arg) { value = static_cast(arg); }, + is_method(*this)); + attr("__setstate__") = setstate; + } + + /// Export enumeration entries into the parent scope + enum_& export_values() { + m_base.export_values(); + return *this; + } + + /// Add an enumeration entry + enum_& value(char const* name, Type value, const char *doc = nullptr) { + m_base.value(name, pybind11::cast(value, return_value_policy::copy), doc); + return *this; + } + +private: + detail::enum_base m_base; +}; + +NAMESPACE_BEGIN(detail) + + +inline void keep_alive_impl(handle nurse, handle patient) { + if (!nurse || !patient) + pybind11_fail("Could not activate keep_alive!"); + + if (patient.is_none() || nurse.is_none()) + return; /* Nothing to keep alive or nothing to be kept alive by */ + + auto tinfo = all_type_info(Py_TYPE(nurse.ptr())); + if (!tinfo.empty()) { + /* It's a pybind-registered type, so we can store the patient in the + * internal list. */ + add_patient(nurse.ptr(), patient.ptr()); + } + else { + /* Fall back to clever approach based on weak references taken from + * Boost.Python. This is not used for pybind-registered types because + * the objects can be destroyed out-of-order in a GC pass. 
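+
+       A binding that typically exercises this code path looks like the
+       following sketch (`Container::append` is illustrative): the call policy
+       `py::keep_alive<1, 2>()` keeps the appended element (argument 2) alive
+       at least as long as the container (argument 1, i.e. `self`):
+
+           py::class_<Container>(m, "Container")
+               .def("append", &Container::append, py::keep_alive<1, 2>());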
+         */
+        cpp_function disable_lifesupport(
+            [patient](handle weakref) { patient.dec_ref(); weakref.dec_ref(); });
+
+        weakref wr(nurse, disable_lifesupport);
+
+        patient.inc_ref(); /* reference patient and leak the weak reference */
+        (void) wr.release();
+    }
+}
+
+PYBIND11_NOINLINE inline void keep_alive_impl(size_t Nurse, size_t Patient, function_call &call, handle ret) {
+    auto get_arg = [&](size_t n) {
+        if (n == 0)
+            return ret;
+        else if (n == 1 && call.init_self)
+            return call.init_self;
+        else if (n <= call.args.size())
+            return call.args[n - 1];
+        return handle();
+    };
+
+    keep_alive_impl(get_arg(Nurse), get_arg(Patient));
+}
+
+inline std::pair<decltype(internals::registered_types_py)::iterator, bool> all_type_info_get_cache(PyTypeObject *type) {
+    auto res = get_internals().registered_types_py
+#ifdef __cpp_lib_unordered_map_try_emplace
+        .try_emplace(type);
+#else
+        .emplace(type, std::vector<type_info *>());
+#endif
+    if (res.second) {
+        // New cache entry created; set up a weak reference to automatically remove it if the type
+        // gets destroyed:
+        weakref((PyObject *) type, cpp_function([type](handle wr) {
+            get_internals().registered_types_py.erase(type);
+            wr.dec_ref();
+        })).release();
+    }
+
+    return res;
+}
+
+template <typename Iterator, typename Sentinel, bool KeyIterator, return_value_policy Policy>
+struct iterator_state {
+    Iterator it;
+    Sentinel end;
+    bool first_or_done;
+};
+
+NAMESPACE_END(detail)
+
+/// Makes a Python iterator from a first and past-the-end C++ InputIterator.
+template <return_value_policy Policy = return_value_policy::reference_internal,
+          typename Iterator,
+          typename Sentinel,
+          typename ValueType = decltype(*std::declval<Iterator>()),
+          typename... Extra>
+iterator make_iterator(Iterator first, Sentinel last, Extra &&... extra) {
+    typedef detail::iterator_state<Iterator, Sentinel, false, Policy> state;
+
+    if (!detail::get_type_info(typeid(state), false)) {
+        class_<state>(handle(), "iterator", pybind11::module_local())
+            .def("__iter__", [](state &s) -> state& { return s; })
+            .def("__next__", [](state &s) -> ValueType {
+                if (!s.first_or_done)
+                    ++s.it;
+                else
+                    s.first_or_done = false;
+                if (s.it == s.end) {
+                    s.first_or_done = true;
+                    throw stop_iteration();
+                }
+                return *s.it;
+            }, std::forward<Extra>(extra)..., Policy);
+    }
+
+    return cast(state{first, last, true});
+}
+
+/// Makes a Python iterator over the keys (`.first`) of an iterator over pairs from a
+/// first and past-the-end InputIterator.
+template <return_value_policy Policy = return_value_policy::reference_internal,
+          typename Iterator,
+          typename Sentinel,
+          typename KeyType = decltype((*std::declval<Iterator>()).first),
+          typename... Extra>
+iterator make_key_iterator(Iterator first, Sentinel last, Extra &&... extra) {
+    typedef detail::iterator_state<Iterator, Sentinel, true, Policy> state;
+
+    if (!detail::get_type_info(typeid(state), false)) {
+        class_<state>(handle(), "iterator", pybind11::module_local())
+            .def("__iter__", [](state &s) -> state& { return s; })
+            .def("__next__", [](state &s) -> KeyType {
+                if (!s.first_or_done)
+                    ++s.it;
+                else
+                    s.first_or_done = false;
+                if (s.it == s.end) {
+                    s.first_or_done = true;
+                    throw stop_iteration();
+                }
+                return (*s.it).first;
+            }, std::forward<Extra>(extra)..., Policy);
+    }
+
+    return cast(state{first, last, true});
+}
+
+/// Makes an iterator over values of an STL container or other container supporting
+/// `std::begin()`/`std::end()`
+template <return_value_policy Policy = return_value_policy::reference_internal,
+          typename Type, typename... Extra> iterator make_iterator(Type &value, Extra&&... extra) {
+    return make_iterator(std::begin(value), std::end(value), extra...);
+}
+
+/// Makes an iterator over the keys (`.first`) of an STL map-like container supporting
+/// `std::begin()`/`std::end()`
+template <return_value_policy Policy = return_value_policy::reference_internal,
+          typename Type, typename... Extra> iterator make_key_iterator(Type &value, Extra&&...
extra) { + return make_key_iterator(std::begin(value), std::end(value), extra...); +} + +template void implicitly_convertible() { + struct set_flag { + bool &flag; + set_flag(bool &flag) : flag(flag) { flag = true; } + ~set_flag() { flag = false; } + }; + auto implicit_caster = [](PyObject *obj, PyTypeObject *type) -> PyObject * { + static bool currently_used = false; + if (currently_used) // implicit conversions are non-reentrant + return nullptr; + set_flag flag_helper(currently_used); + if (!detail::make_caster().load(obj, false)) + return nullptr; + tuple args(1); + args[0] = obj; + PyObject *result = PyObject_Call((PyObject *) type, args.ptr(), nullptr); + if (result == nullptr) + PyErr_Clear(); + return result; + }; + + if (auto tinfo = detail::get_type_info(typeid(OutputType))) + tinfo->implicit_conversions.push_back(implicit_caster); + else + pybind11_fail("implicitly_convertible: Unable to find type " + type_id()); +} + +template +void register_exception_translator(ExceptionTranslator&& translator) { + detail::get_internals().registered_exception_translators.push_front( + std::forward(translator)); +} + +/** + * Wrapper to generate a new Python exception type. + * + * This should only be used with PyErr_SetString for now. + * It is not (yet) possible to use as a py::base. + * Template type argument is reserved for future use. + */ +template +class exception : public object { +public: + exception() = default; + exception(handle scope, const char *name, PyObject *base = PyExc_Exception) { + std::string full_name = scope.attr("__name__").cast() + + std::string(".") + name; + m_ptr = PyErr_NewException(const_cast(full_name.c_str()), base, NULL); + if (hasattr(scope, name)) + pybind11_fail("Error during initialization: multiple incompatible " + "definitions with name \"" + std::string(name) + "\""); + scope.attr(name) = *this; + } + + // Sets the current python exception to this exception object with the given message + void operator()(const char *message) { + PyErr_SetString(m_ptr, message); + } +}; + +NAMESPACE_BEGIN(detail) +// Returns a reference to a function-local static exception object used in the simple +// register_exception approach below. (It would be simpler to have the static local variable +// directly in register_exception, but that makes clang <3.5 segfault - issue #1349). +template +exception &get_exception_object() { static exception ex; return ex; } +NAMESPACE_END(detail) + +/** + * Registers a Python exception in `m` of the given `name` and installs an exception translator to + * translate the C++ exception to the created Python exception using the exceptions what() method. + * This is intended for simple exception translations; for more complex translation, register the + * exception object and translator directly. + */ +template +exception ®ister_exception(handle scope, + const char *name, + PyObject *base = PyExc_Exception) { + auto &ex = detail::get_exception_object(); + if (!ex) ex = exception(scope, name, base); + + register_exception_translator([](std::exception_ptr p) { + if (!p) return; + try { + std::rethrow_exception(p); + } catch (const CppException &e) { + detail::get_exception_object()(e.what()); + } + }); + return ex; +} + +NAMESPACE_BEGIN(detail) +PYBIND11_NOINLINE inline void print(tuple args, dict kwargs) { + auto strings = tuple(args.size()); + for (size_t i = 0; i < args.size(); ++i) { + strings[i] = str(args[i]); + } + auto sep = kwargs.contains("sep") ? 
kwargs["sep"] : cast(" "); + auto line = sep.attr("join")(strings); + + object file; + if (kwargs.contains("file")) { + file = kwargs["file"].cast(); + } else { + try { + file = module::import("sys").attr("stdout"); + } catch (const error_already_set &) { + /* If print() is called from code that is executed as + part of garbage collection during interpreter shutdown, + importing 'sys' can fail. Give up rather than crashing the + interpreter in this case. */ + return; + } + } + + auto write = file.attr("write"); + write(line); + write(kwargs.contains("end") ? kwargs["end"] : cast("\n")); + + if (kwargs.contains("flush") && kwargs["flush"].cast()) + file.attr("flush")(); +} +NAMESPACE_END(detail) + +template +void print(Args &&...args) { + auto c = detail::collect_arguments(std::forward(args)...); + detail::print(c.args(), c.kwargs()); +} + +#if defined(WITH_THREAD) && !defined(PYPY_VERSION) + +/* The functions below essentially reproduce the PyGILState_* API using a RAII + * pattern, but there are a few important differences: + * + * 1. When acquiring the GIL from an non-main thread during the finalization + * phase, the GILState API blindly terminates the calling thread, which + * is often not what is wanted. This API does not do this. + * + * 2. The gil_scoped_release function can optionally cut the relationship + * of a PyThreadState and its associated thread, which allows moving it to + * another thread (this is a fairly rare/advanced use case). + * + * 3. The reference count of an acquired thread state can be controlled. This + * can be handy to prevent cases where callbacks issued from an external + * thread would otherwise constantly construct and destroy thread state data + * structures. + * + * See the Python bindings of NanoGUI (http://github.com/wjakob/nanogui) for an + * example which uses features 2 and 3 to migrate the Python thread of + * execution to another thread (to run the event loop on the original thread, + * in this case). + */ + +class gil_scoped_acquire { +public: + PYBIND11_NOINLINE gil_scoped_acquire() { + auto const &internals = detail::get_internals(); + tstate = (PyThreadState *) PYBIND11_TLS_GET_VALUE(internals.tstate); + + if (!tstate) { + /* Check if the GIL was acquired using the PyGILState_* API instead (e.g. if + calling from a Python thread). Since we use a different key, this ensures + we don't create a new thread state and deadlock in PyEval_AcquireThread + below. Note we don't save this state with internals.tstate, since we don't + create it we would fail to clear it (its reference count should be > 0). 
*/ + tstate = PyGILState_GetThisThreadState(); + } + + if (!tstate) { + tstate = PyThreadState_New(internals.istate); + #if !defined(NDEBUG) + if (!tstate) + pybind11_fail("scoped_acquire: could not create thread state!"); + #endif + tstate->gilstate_counter = 0; + PYBIND11_TLS_REPLACE_VALUE(internals.tstate, tstate); + } else { + release = detail::get_thread_state_unchecked() != tstate; + } + + if (release) { + /* Work around an annoying assertion in PyThreadState_Swap */ + #if defined(Py_DEBUG) + PyInterpreterState *interp = tstate->interp; + tstate->interp = nullptr; + #endif + PyEval_AcquireThread(tstate); + #if defined(Py_DEBUG) + tstate->interp = interp; + #endif + } + + inc_ref(); + } + + void inc_ref() { + ++tstate->gilstate_counter; + } + + PYBIND11_NOINLINE void dec_ref() { + --tstate->gilstate_counter; + #if !defined(NDEBUG) + if (detail::get_thread_state_unchecked() != tstate) + pybind11_fail("scoped_acquire::dec_ref(): thread state must be current!"); + if (tstate->gilstate_counter < 0) + pybind11_fail("scoped_acquire::dec_ref(): reference count underflow!"); + #endif + if (tstate->gilstate_counter == 0) { + #if !defined(NDEBUG) + if (!release) + pybind11_fail("scoped_acquire::dec_ref(): internal error!"); + #endif + PyThreadState_Clear(tstate); + PyThreadState_DeleteCurrent(); + PYBIND11_TLS_DELETE_VALUE(detail::get_internals().tstate); + release = false; + } + } + + PYBIND11_NOINLINE ~gil_scoped_acquire() { + dec_ref(); + if (release) + PyEval_SaveThread(); + } +private: + PyThreadState *tstate = nullptr; + bool release = true; +}; + +class gil_scoped_release { +public: + explicit gil_scoped_release(bool disassoc = false) : disassoc(disassoc) { + // `get_internals()` must be called here unconditionally in order to initialize + // `internals.tstate` for subsequent `gil_scoped_acquire` calls. Otherwise, an + // initialization race could occur as multiple threads try `gil_scoped_acquire`. 
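+        // A typical use is to drop the GIL around a long-running C++ call so
+        // other Python threads can make progress (a sketch; `heavy_cpp_work`
+        // is illustrative and must not touch Python objects while unlocked):
+        //
+        //     m.def("compute", [](int n) {
+        //         py::gil_scoped_release release;
+        //         return heavy_cpp_work(n);   // GIL re-acquired on scope exit
+        //     });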
+ const auto &internals = detail::get_internals(); + tstate = PyEval_SaveThread(); + if (disassoc) { + auto key = internals.tstate; + PYBIND11_TLS_DELETE_VALUE(key); + } + } + ~gil_scoped_release() { + if (!tstate) + return; + PyEval_RestoreThread(tstate); + if (disassoc) { + auto key = detail::get_internals().tstate; + PYBIND11_TLS_REPLACE_VALUE(key, tstate); + } + } +private: + PyThreadState *tstate; + bool disassoc; +}; +#elif defined(PYPY_VERSION) +class gil_scoped_acquire { + PyGILState_STATE state; +public: + gil_scoped_acquire() { state = PyGILState_Ensure(); } + ~gil_scoped_acquire() { PyGILState_Release(state); } +}; + +class gil_scoped_release { + PyThreadState *state; +public: + gil_scoped_release() { state = PyEval_SaveThread(); } + ~gil_scoped_release() { PyEval_RestoreThread(state); } +}; +#else +class gil_scoped_acquire { }; +class gil_scoped_release { }; +#endif + +error_already_set::~error_already_set() { + if (m_type) { + error_scope scope; + gil_scoped_acquire gil; + m_type.release().dec_ref(); + m_value.release().dec_ref(); + m_trace.release().dec_ref(); + } +} + +inline function get_type_overload(const void *this_ptr, const detail::type_info *this_type, const char *name) { + handle self = detail::get_object_handle(this_ptr, this_type); + if (!self) + return function(); + handle type = self.get_type(); + auto key = std::make_pair(type.ptr(), name); + + /* Cache functions that aren't overloaded in Python to avoid + many costly Python dictionary lookups below */ + auto &cache = detail::get_internals().inactive_overload_cache; + if (cache.find(key) != cache.end()) + return function(); + + function overload = getattr(self, name, function()); + if (overload.is_cpp_function()) { + cache.insert(key); + return function(); + } + + /* Don't call dispatch code if invoked from overridden function. + Unfortunately this doesn't work on PyPy. */ +#if !defined(PYPY_VERSION) + PyFrameObject *frame = PyThreadState_Get()->frame; + if (frame && (std::string) str(frame->f_code->co_name) == name && + frame->f_code->co_argcount > 0) { + PyFrame_FastToLocals(frame); + PyObject *self_caller = PyDict_GetItem( + frame->f_locals, PyTuple_GET_ITEM(frame->f_code->co_varnames, 0)); + if (self_caller == self.ptr()) + return function(); + } +#else + /* PyPy currently doesn't provide a detailed cpyext emulation of + frame objects, so we have to emulate this using Python. This + is going to be slow..*/ + dict d; d["self"] = self; d["name"] = pybind11::str(name); + PyObject *result = PyRun_String( + "import inspect\n" + "frame = inspect.currentframe()\n" + "if frame is not None:\n" + " frame = frame.f_back\n" + " if frame is not None and str(frame.f_code.co_name) == name and " + "frame.f_code.co_argcount > 0:\n" + " self_caller = frame.f_locals[frame.f_code.co_varnames[0]]\n" + " if self_caller == self:\n" + " self = None\n", + Py_file_input, d.ptr(), d.ptr()); + if (result == nullptr) + throw error_already_set(); + if (d["self"].is_none()) + return function(); + Py_DECREF(result); +#endif + + return overload; +} + +/** \rst + Try to retrieve a python method by the provided name from the instance pointed to by the this_ptr. + + :this_ptr: The pointer to the object the overload should be retrieved for. This should be the first + non-trampoline class encountered in the inheritance chain. + :name: The name of the overloaded Python method to retrieve. + :return: The Python method by this name from the object or an empty function wrapper. 
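+
+    A hand-rolled trampoline body using this function might look as follows
+    (a sketch; `Animal::go` is illustrative -- the PYBIND11_OVERLOAD macros
+    below generate essentially this code):
+
+    .. code-block:: cpp
+
+        std::string go(int n_times) override {
+            pybind11::gil_scoped_acquire gil;
+            pybind11::function overload = pybind11::get_overload(static_cast<const Animal *>(this), "go");
+            if (overload)
+                return overload(n_times).cast<std::string>();
+            return Animal::go(n_times);
+        }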
+ \endrst */ +template function get_overload(const T *this_ptr, const char *name) { + auto tinfo = detail::get_type_info(typeid(T)); + return tinfo ? get_type_overload(this_ptr, tinfo, name) : function(); +} + +#define PYBIND11_OVERLOAD_INT(ret_type, cname, name, ...) { \ + pybind11::gil_scoped_acquire gil; \ + pybind11::function overload = pybind11::get_overload(static_cast(this), name); \ + if (overload) { \ + auto o = overload(__VA_ARGS__); \ + if (pybind11::detail::cast_is_temporary_value_reference::value) { \ + static pybind11::detail::overload_caster_t caster; \ + return pybind11::detail::cast_ref(std::move(o), caster); \ + } \ + else return pybind11::detail::cast_safe(std::move(o)); \ + } \ + } + +/** \rst + Macro to populate the virtual method in the trampoline class. This macro tries to look up a method named 'fn' + from the Python side, deals with the :ref:`gil` and necessary argument conversions to call this method and return + the appropriate type. See :ref:`overriding_virtuals` for more information. This macro should be used when the method + name in C is not the same as the method name in Python. For example with `__str__`. + + .. code-block:: cpp + + std::string toString() override { + PYBIND11_OVERLOAD_NAME( + std::string, // Return type (ret_type) + Animal, // Parent class (cname) + toString, // Name of function in C++ (name) + "__str__", // Name of method in Python (fn) + ); + } +\endrst */ +#define PYBIND11_OVERLOAD_NAME(ret_type, cname, name, fn, ...) \ + PYBIND11_OVERLOAD_INT(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), name, __VA_ARGS__) \ + return cname::fn(__VA_ARGS__) + +/** \rst + Macro for pure virtual functions, this function is identical to :c:macro:`PYBIND11_OVERLOAD_NAME`, except that it + throws if no overload can be found. +\endrst */ +#define PYBIND11_OVERLOAD_PURE_NAME(ret_type, cname, name, fn, ...) \ + PYBIND11_OVERLOAD_INT(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), name, __VA_ARGS__) \ + pybind11::pybind11_fail("Tried to call pure virtual function \"" PYBIND11_STRINGIFY(cname) "::" name "\""); + +/** \rst + Macro to populate the virtual method in the trampoline class. This macro tries to look up the method + from the Python side, deals with the :ref:`gil` and necessary argument conversions to call this method and return + the appropriate type. This macro should be used if the method name in C and in Python are identical. + See :ref:`overriding_virtuals` for more information. + + .. code-block:: cpp + + class PyAnimal : public Animal { + public: + // Inherit the constructors + using Animal::Animal; + + // Trampoline (need one for each virtual function) + std::string go(int n_times) override { + PYBIND11_OVERLOAD_PURE( + std::string, // Return type (ret_type) + Animal, // Parent class (cname) + go, // Name of function in C++ (must match Python name) (fn) + n_times // Argument(s) (...) + ); + } + }; +\endrst */ +#define PYBIND11_OVERLOAD(ret_type, cname, fn, ...) \ + PYBIND11_OVERLOAD_NAME(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), #fn, fn, __VA_ARGS__) + +/** \rst + Macro for pure virtual functions, this function is identical to :c:macro:`PYBIND11_OVERLOAD`, except that it throws + if no overload can be found. +\endrst */ +#define PYBIND11_OVERLOAD_PURE(ret_type, cname, fn, ...) 
\ + PYBIND11_OVERLOAD_PURE_NAME(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), #fn, fn, __VA_ARGS__) + +NAMESPACE_END(PYBIND11_NAMESPACE) + +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +# pragma warning(pop) +#elif defined(__GNUG__) && !defined(__clang__) +# pragma GCC diagnostic pop +#endif diff --git a/python/src/pybind11/pytypes.h b/python/src/pybind11/pytypes.h new file mode 100644 index 000000000..2d573dfad --- /dev/null +++ b/python/src/pybind11/pytypes.h @@ -0,0 +1,1471 @@ +/* + pybind11/pytypes.h: Convenience wrapper classes for basic Python types + + Copyright (c) 2016 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. +*/ + +#pragma once + +#include "detail/common.h" +#include "buffer_info.h" +#include +#include + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) + +/* A few forward declarations */ +class handle; class object; +class str; class iterator; +struct arg; struct arg_v; + +NAMESPACE_BEGIN(detail) +class args_proxy; +inline bool isinstance_generic(handle obj, const std::type_info &tp); + +// Accessor forward declarations +template class accessor; +namespace accessor_policies { + struct obj_attr; + struct str_attr; + struct generic_item; + struct sequence_item; + struct list_item; + struct tuple_item; +} +using obj_attr_accessor = accessor; +using str_attr_accessor = accessor; +using item_accessor = accessor; +using sequence_accessor = accessor; +using list_accessor = accessor; +using tuple_accessor = accessor; + +/// Tag and check to identify a class which implements the Python object API +class pyobject_tag { }; +template using is_pyobject = std::is_base_of>; + +/** \rst + A mixin class which adds common functions to `handle`, `object` and various accessors. + The only requirement for `Derived` is to implement ``PyObject *Derived::ptr() const``. +\endrst */ +template +class object_api : public pyobject_tag { + const Derived &derived() const { return static_cast(*this); } + +public: + /** \rst + Return an iterator equivalent to calling ``iter()`` in Python. The object + must be a collection which supports the iteration protocol. + \endrst */ + iterator begin() const; + /// Return a sentinel which ends iteration. + iterator end() const; + + /** \rst + Return an internal functor to invoke the object's sequence protocol. Casting + the returned ``detail::item_accessor`` instance to a `handle` or `object` + subclass causes a corresponding call to ``__getitem__``. Assigning a `handle` + or `object` subclass causes a call to ``__setitem__``. + \endrst */ + item_accessor operator[](handle key) const; + /// See above (the only difference is that they key is provided as a string literal) + item_accessor operator[](const char *key) const; + + /** \rst + Return an internal functor to access the object's attributes. Casting the + returned ``detail::obj_attr_accessor`` instance to a `handle` or `object` + subclass causes a corresponding call to ``getattr``. Assigning a `handle` + or `object` subclass causes a call to ``setattr``. + \endrst */ + obj_attr_accessor attr(handle key) const; + /// See above (the only difference is that they key is provided as a string literal) + str_attr_accessor attr(const char *key) const; + + /** \rst + Matches * unpacking in Python, e.g. to unpack arguments out of a ``tuple`` + or ``list`` for a function call. Applying another * to the result yields + ** unpacking, e.g. to unpack a dict as function keyword arguments. + See :ref:`calling_python_functions`. 
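+
+        For example (a sketch; ``f`` is assumed to be a Python callable held
+        in a ``py::function``):
+
+        .. code-block:: cpp
+
+            py::tuple args = py::make_tuple(1, 2);
+            py::dict kwargs;
+            kwargs["x"] = 3;
+            f(*args, **kwargs);   // same as f(1, 2, x=3) in Python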
+    \endrst */
+    args_proxy operator*() const;
+
+    /// Check if the given item is contained within this object, i.e. ``item in obj``.
+    template <typename T> bool contains(T &&item) const;
+
+    /** \rst
+        Assuming the Python object is a function or implements the ``__call__``
+        protocol, ``operator()`` invokes the underlying function, passing an
+        arbitrary set of parameters. The result is returned as an `object` and
+        may need to be converted back into a Python object using `handle::cast()`.
+
+        When some of the arguments cannot be converted to Python objects, the
+        function will throw a `cast_error` exception. When the Python function
+        call fails, an `error_already_set` exception is thrown.
+    \endrst */
+    template <return_value_policy policy = return_value_policy::automatic_reference, typename... Args>
+    object operator()(Args &&...args) const;
+    template <return_value_policy policy = return_value_policy::automatic_reference, typename... Args>
+    PYBIND11_DEPRECATED("call(...) was deprecated in favor of operator()(...)")
+    object call(Args&&... args) const;
+
+    /// Equivalent to ``obj is other`` in Python.
+    bool is(object_api const& other) const { return derived().ptr() == other.derived().ptr(); }
+    /// Equivalent to ``obj is None`` in Python.
+    bool is_none() const { return derived().ptr() == Py_None; }
+    /// Equivalent to obj == other in Python
+    bool equal(object_api const &other) const      { return rich_compare(other, Py_EQ); }
+    bool not_equal(object_api const &other) const  { return rich_compare(other, Py_NE); }
+    bool operator<(object_api const &other) const  { return rich_compare(other, Py_LT); }
+    bool operator<=(object_api const &other) const { return rich_compare(other, Py_LE); }
+    bool operator>(object_api const &other) const  { return rich_compare(other, Py_GT); }
+    bool operator>=(object_api const &other) const { return rich_compare(other, Py_GE); }
+
+    object operator-() const;
+    object operator~() const;
+    object operator+(object_api const &other) const;
+    object operator+=(object_api const &other) const;
+    object operator-(object_api const &other) const;
+    object operator-=(object_api const &other) const;
+    object operator*(object_api const &other) const;
+    object operator*=(object_api const &other) const;
+    object operator/(object_api const &other) const;
+    object operator/=(object_api const &other) const;
+    object operator|(object_api const &other) const;
+    object operator|=(object_api const &other) const;
+    object operator&(object_api const &other) const;
+    object operator&=(object_api const &other) const;
+    object operator^(object_api const &other) const;
+    object operator^=(object_api const &other) const;
+    object operator<<(object_api const &other) const;
+    object operator<<=(object_api const &other) const;
+    object operator>>(object_api const &other) const;
+    object operator>>=(object_api const &other) const;
+
+    PYBIND11_DEPRECATED("Use py::str(obj) instead")
+    pybind11::str str() const;
+
+    /// Get or set the object's docstring, i.e. ``obj.__doc__``.
+    str_attr_accessor doc() const;
+
+    /// Return the object's current reference count
+    int ref_count() const { return static_cast<int>(Py_REFCNT(derived().ptr())); }
+    /// Return a handle to the Python type object underlying the instance
+    handle get_type() const;
+
+private:
+    bool rich_compare(object_api const &other, int value) const;
+};
+
+NAMESPACE_END(detail)
+
+/** \rst
+    Holds a reference to a Python object (no reference counting)
+
+    The `handle` class is a thin wrapper around an arbitrary Python object (i.e. a
+    ``PyObject *`` in Python's C API). It does not perform any automatic reference
+    counting and merely provides a basic C++ interface to various Python API functions.
+
+    .. seealso::
+        The `object` class inherits from `handle` and adds automatic reference
+        counting features.
+\endrst */
+class handle : public detail::object_api<handle> {
+public:
+    /// The default constructor creates a handle with a ``nullptr``-valued pointer
+    handle() = default;
+    /// Creates a ``handle`` from the given raw Python object pointer
+    handle(PyObject *ptr) : m_ptr(ptr) { } // Allow implicit conversion from PyObject*
+
+    /// Return the underlying ``PyObject *`` pointer
+    PyObject *ptr() const { return m_ptr; }
+    PyObject *&ptr() { return m_ptr; }
+
+    /** \rst
+        Manually increase the reference count of the Python object. Usually, it is
+        preferable to use the `object` class which derives from `handle` and calls
+        this function automatically. Returns a reference to itself.
+    \endrst */
+    const handle& inc_ref() const & { Py_XINCREF(m_ptr); return *this; }
+
+    /** \rst
+        Manually decrease the reference count of the Python object. Usually, it is
+        preferable to use the `object` class which derives from `handle` and calls
+        this function automatically. Returns a reference to itself.
+    \endrst */
+    const handle& dec_ref() const & { Py_XDECREF(m_ptr); return *this; }
+
+    /** \rst
+        Attempt to cast the Python object into the given C++ type. A `cast_error`
+        will be thrown upon failure.
+    \endrst */
+    template <typename T> T cast() const;
+    /// Return ``true`` when the `handle` wraps a valid Python object
+    explicit operator bool() const { return m_ptr != nullptr; }
+    /** \rst
+        Deprecated: Check that the underlying pointers are the same.
+        Equivalent to ``obj1 is obj2`` in Python.
+    \endrst */
+    PYBIND11_DEPRECATED("Use obj1.is(obj2) instead")
+    bool operator==(const handle &h) const { return m_ptr == h.m_ptr; }
+    PYBIND11_DEPRECATED("Use !obj1.is(obj2) instead")
+    bool operator!=(const handle &h) const { return m_ptr != h.m_ptr; }
+    PYBIND11_DEPRECATED("Use handle::operator bool() instead")
+    bool check() const { return m_ptr != nullptr; }
+protected:
+    PyObject *m_ptr = nullptr;
+};
+
+/** \rst
+    Holds a reference to a Python object (with reference counting)
+
+    Like `handle`, the `object` class is a thin wrapper around an arbitrary Python
+    object (i.e. a ``PyObject *`` in Python's C API). In contrast to `handle`, it
+    optionally increases the object's reference count upon construction, and it
+    *always* decreases the reference count when the `object` instance goes out of
+    scope and is destructed. When using `object` instances consistently, it is much
+    easier to get reference counting right at the first attempt.
+\endrst */
+class object : public handle {
+public:
+    object() = default;
+    PYBIND11_DEPRECATED("Use reinterpret_borrow<object>() or reinterpret_steal<object>()")
+    object(handle h, bool is_borrowed) : handle(h) { if (is_borrowed) inc_ref(); }
+    /// Copy constructor; always increases the reference count
+    object(const object &o) : handle(o) { inc_ref(); }
+    /// Move constructor; steals the object from ``other`` and preserves its reference count
+    object(object &&other) noexcept { m_ptr = other.m_ptr; other.m_ptr = nullptr; }
+    /// Destructor; automatically calls `handle::dec_ref()`
+    ~object() { dec_ref(); }
+
+    /** \rst
+        Resets the internal pointer to ``nullptr`` without decreasing the
+        object's reference count. The function returns a raw handle to the original
+        Python object.
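+
+        This is useful when handing ownership over to a C API call that steals
+        a reference, e.g. (a sketch; ``list_ptr`` is an illustrative
+        ``PyObject *`` owning a list):
+
+        .. code-block:: cpp
+
+            py::object item = py::int_(42);
+            PyList_SET_ITEM(list_ptr, 0, item.release().ptr());  // list now owns the reference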
+    \endrst */
+    handle release() {
+        PyObject *tmp = m_ptr;
+        m_ptr = nullptr;
+        return handle(tmp);
+    }
+
+    object& operator=(const object &other) {
+        other.inc_ref();
+        dec_ref();
+        m_ptr = other.m_ptr;
+        return *this;
+    }
+
+    object& operator=(object &&other) noexcept {
+        if (this != &other) {
+            handle temp(m_ptr);
+            m_ptr = other.m_ptr;
+            other.m_ptr = nullptr;
+            temp.dec_ref();
+        }
+        return *this;
+    }
+
+    // Calling cast() on an object lvalue just copies (via handle::cast)
+    template <typename T> T cast() const &;
+    // Calling on an object rvalue does a move, if needed and/or possible
+    template <typename T> T cast() &&;
+
+protected:
+    // Tags for choosing constructors from raw PyObject *
+    struct borrowed_t { };
+    struct stolen_t { };
+
+    template <typename T> friend T reinterpret_borrow(handle);
+    template <typename T> friend T reinterpret_steal(handle);
+
+public:
+    // Only accessible from derived classes and the reinterpret_* functions
+    object(handle h, borrowed_t) : handle(h) { inc_ref(); }
+    object(handle h, stolen_t) : handle(h) { }
+};
+
+/** \rst
+    Declare that a `handle` or ``PyObject *`` is a certain type and borrow the reference.
+    The target type ``T`` must be `object` or one of its derived classes. The function
+    doesn't do any conversions or checks. It's up to the user to make sure that the
+    target type is correct.
+
+    .. code-block:: cpp
+
+        PyObject *p = PyList_GetItem(obj, index);
+        py::object o = reinterpret_borrow<py::object>(p);
+        // or
+        py::tuple t = reinterpret_borrow<py::tuple>(p); // <-- `p` must already be a `tuple`
+\endrst */
+template <typename T> T reinterpret_borrow(handle h) { return {h, object::borrowed_t{}}; }
+
+/** \rst
+    Like `reinterpret_borrow`, but steals the reference.
+
+    .. code-block:: cpp
+
+        PyObject *p = PyObject_Str(obj);
+        py::str s = reinterpret_steal<py::str>(p); // <-- `p` must already be a `str`
+\endrst */
+template <typename T> T reinterpret_steal(handle h) { return {h, object::stolen_t{}}; }
+
+NAMESPACE_BEGIN(detail)
+inline std::string error_string();
+NAMESPACE_END(detail)
+
+/// Fetch and hold an error which was already set in Python.  An instance of this is typically
+/// thrown to propagate python-side errors back through C++, where it can either be caught manually
+/// or else allowed to fall back to the function dispatcher (which then raises the captured error
+/// back to python).
+class error_already_set : public std::runtime_error {
+public:
+    /// Constructs a new exception from the current Python error indicator, if any.  The current
+    /// Python error indicator will be cleared.
+    error_already_set() : std::runtime_error(detail::error_string()) {
+        PyErr_Fetch(&m_type.ptr(), &m_value.ptr(), &m_trace.ptr());
+    }
+
+    error_already_set(const error_already_set &) = default;
+    error_already_set(error_already_set &&) = default;
+
+    inline ~error_already_set();
+
+    /// Give the currently-held error back to Python, if any.  If there is currently a Python error
+    /// already set it is cleared first.  After this call, the current object no longer stores the
+    /// error variables (but the `.what()` string is still available).
+    void restore() { PyErr_Restore(m_type.release().ptr(), m_value.release().ptr(), m_trace.release().ptr()); }
+
+    // Does nothing; provided for backwards compatibility.
+    PYBIND11_DEPRECATED("Use of error_already_set.clear() is deprecated")
+    void clear() {}
+
+    /// Check if the currently trapped error type matches the given Python exception class (or a
+    /// subclass thereof).  May also be passed a tuple to search for any exception class matches in
+    /// the given tuple.
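+    ///
+    /// A typical use (a sketch; ``widget_lib`` is an illustrative module name):
+    ///
+    ///     try {
+    ///         py::module::import("widget_lib");   // may not be installed
+    ///     } catch (py::error_already_set &e) {
+    ///         if (e.matches(PyExc_ImportError)) { /* fall back gracefully */ }
+    ///         else throw;
+    ///     }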
+ bool matches(handle exc) const { return PyErr_GivenExceptionMatches(m_type.ptr(), exc.ptr()); } + + const object& type() const { return m_type; } + const object& value() const { return m_value; } + const object& trace() const { return m_trace; } + +private: + object m_type, m_value, m_trace; +}; + +/** \defgroup python_builtins _ + Unless stated otherwise, the following C++ functions behave the same + as their Python counterparts. + */ + +/** \ingroup python_builtins + \rst + Return true if ``obj`` is an instance of ``T``. Type ``T`` must be a subclass of + `object` or a class which was exposed to Python as ``py::class_``. +\endrst */ +template ::value, int> = 0> +bool isinstance(handle obj) { return T::check_(obj); } + +template ::value, int> = 0> +bool isinstance(handle obj) { return detail::isinstance_generic(obj, typeid(T)); } + +template <> inline bool isinstance(handle obj) = delete; +template <> inline bool isinstance(handle obj) { return obj.ptr() != nullptr; } + +/// \ingroup python_builtins +/// Return true if ``obj`` is an instance of the ``type``. +inline bool isinstance(handle obj, handle type) { + const auto result = PyObject_IsInstance(obj.ptr(), type.ptr()); + if (result == -1) + throw error_already_set(); + return result != 0; +} + +/// \addtogroup python_builtins +/// @{ +inline bool hasattr(handle obj, handle name) { + return PyObject_HasAttr(obj.ptr(), name.ptr()) == 1; +} + +inline bool hasattr(handle obj, const char *name) { + return PyObject_HasAttrString(obj.ptr(), name) == 1; +} + +inline void delattr(handle obj, handle name) { + if (PyObject_DelAttr(obj.ptr(), name.ptr()) != 0) { throw error_already_set(); } +} + +inline void delattr(handle obj, const char *name) { + if (PyObject_DelAttrString(obj.ptr(), name) != 0) { throw error_already_set(); } +} + +inline object getattr(handle obj, handle name) { + PyObject *result = PyObject_GetAttr(obj.ptr(), name.ptr()); + if (!result) { throw error_already_set(); } + return reinterpret_steal(result); +} + +inline object getattr(handle obj, const char *name) { + PyObject *result = PyObject_GetAttrString(obj.ptr(), name); + if (!result) { throw error_already_set(); } + return reinterpret_steal(result); +} + +inline object getattr(handle obj, handle name, handle default_) { + if (PyObject *result = PyObject_GetAttr(obj.ptr(), name.ptr())) { + return reinterpret_steal(result); + } else { + PyErr_Clear(); + return reinterpret_borrow(default_); + } +} + +inline object getattr(handle obj, const char *name, handle default_) { + if (PyObject *result = PyObject_GetAttrString(obj.ptr(), name)) { + return reinterpret_steal(result); + } else { + PyErr_Clear(); + return reinterpret_borrow(default_); + } +} + +inline void setattr(handle obj, handle name, handle value) { + if (PyObject_SetAttr(obj.ptr(), name.ptr(), value.ptr()) != 0) { throw error_already_set(); } +} + +inline void setattr(handle obj, const char *name, handle value) { + if (PyObject_SetAttrString(obj.ptr(), name, value.ptr()) != 0) { throw error_already_set(); } +} + +inline ssize_t hash(handle obj) { + auto h = PyObject_Hash(obj.ptr()); + if (h == -1) { throw error_already_set(); } + return h; +} + +/// @} python_builtins + +NAMESPACE_BEGIN(detail) +inline handle get_function(handle value) { + if (value) { +#if PY_MAJOR_VERSION >= 3 + if (PyInstanceMethod_Check(value.ptr())) + value = PyInstanceMethod_GET_FUNCTION(value.ptr()); + else +#endif + if (PyMethod_Check(value.ptr())) + value = PyMethod_GET_FUNCTION(value.ptr()); + } + return value; +} + +// Helper 
aliases/functions to support implicit casting of values given to python accessors/methods. +// When given a pyobject, this simply returns the pyobject as-is; for other C++ type, the value goes +// through pybind11::cast(obj) to convert it to an `object`. +template ::value, int> = 0> +auto object_or_cast(T &&o) -> decltype(std::forward(o)) { return std::forward(o); } +// The following casting version is implemented in cast.h: +template ::value, int> = 0> +object object_or_cast(T &&o); +// Match a PyObject*, which we want to convert directly to handle via its converting constructor +inline handle object_or_cast(PyObject *ptr) { return ptr; } + +template +class accessor : public object_api> { + using key_type = typename Policy::key_type; + +public: + accessor(handle obj, key_type key) : obj(obj), key(std::move(key)) { } + accessor(const accessor &) = default; + accessor(accessor &&) = default; + + // accessor overload required to override default assignment operator (templates are not allowed + // to replace default compiler-generated assignments). + void operator=(const accessor &a) && { std::move(*this).operator=(handle(a)); } + void operator=(const accessor &a) & { operator=(handle(a)); } + + template void operator=(T &&value) && { + Policy::set(obj, key, object_or_cast(std::forward(value))); + } + template void operator=(T &&value) & { + get_cache() = reinterpret_borrow(object_or_cast(std::forward(value))); + } + + template + PYBIND11_DEPRECATED("Use of obj.attr(...) as bool is deprecated in favor of pybind11::hasattr(obj, ...)") + explicit operator enable_if_t::value || + std::is_same::value, bool>() const { + return hasattr(obj, key); + } + template + PYBIND11_DEPRECATED("Use of obj[key] as bool is deprecated in favor of obj.contains(key)") + explicit operator enable_if_t::value, bool>() const { + return obj.contains(key); + } + + operator object() const { return get_cache(); } + PyObject *ptr() const { return get_cache().ptr(); } + template T cast() const { return get_cache().template cast(); } + +private: + object &get_cache() const { + if (!cache) { cache = Policy::get(obj, key); } + return cache; + } + +private: + handle obj; + key_type key; + mutable object cache; +}; + +NAMESPACE_BEGIN(accessor_policies) +struct obj_attr { + using key_type = object; + static object get(handle obj, handle key) { return getattr(obj, key); } + static void set(handle obj, handle key, handle val) { setattr(obj, key, val); } +}; + +struct str_attr { + using key_type = const char *; + static object get(handle obj, const char *key) { return getattr(obj, key); } + static void set(handle obj, const char *key, handle val) { setattr(obj, key, val); } +}; + +struct generic_item { + using key_type = object; + + static object get(handle obj, handle key) { + PyObject *result = PyObject_GetItem(obj.ptr(), key.ptr()); + if (!result) { throw error_already_set(); } + return reinterpret_steal(result); + } + + static void set(handle obj, handle key, handle val) { + if (PyObject_SetItem(obj.ptr(), key.ptr(), val.ptr()) != 0) { throw error_already_set(); } + } +}; + +struct sequence_item { + using key_type = size_t; + + static object get(handle obj, size_t index) { + PyObject *result = PySequence_GetItem(obj.ptr(), static_cast(index)); + if (!result) { throw error_already_set(); } + return reinterpret_steal(result); + } + + static void set(handle obj, size_t index, handle val) { + // PySequence_SetItem does not steal a reference to 'val' + if (PySequence_SetItem(obj.ptr(), static_cast(index), val.ptr()) != 0) { + throw 
error_already_set(); + } + } +}; + +struct list_item { + using key_type = size_t; + + static object get(handle obj, size_t index) { + PyObject *result = PyList_GetItem(obj.ptr(), static_cast(index)); + if (!result) { throw error_already_set(); } + return reinterpret_borrow(result); + } + + static void set(handle obj, size_t index, handle val) { + // PyList_SetItem steals a reference to 'val' + if (PyList_SetItem(obj.ptr(), static_cast(index), val.inc_ref().ptr()) != 0) { + throw error_already_set(); + } + } +}; + +struct tuple_item { + using key_type = size_t; + + static object get(handle obj, size_t index) { + PyObject *result = PyTuple_GetItem(obj.ptr(), static_cast(index)); + if (!result) { throw error_already_set(); } + return reinterpret_borrow(result); + } + + static void set(handle obj, size_t index, handle val) { + // PyTuple_SetItem steals a reference to 'val' + if (PyTuple_SetItem(obj.ptr(), static_cast(index), val.inc_ref().ptr()) != 0) { + throw error_already_set(); + } + } +}; +NAMESPACE_END(accessor_policies) + +/// STL iterator template used for tuple, list, sequence and dict +template +class generic_iterator : public Policy { + using It = generic_iterator; + +public: + using difference_type = ssize_t; + using iterator_category = typename Policy::iterator_category; + using value_type = typename Policy::value_type; + using reference = typename Policy::reference; + using pointer = typename Policy::pointer; + + generic_iterator() = default; + generic_iterator(handle seq, ssize_t index) : Policy(seq, index) { } + + reference operator*() const { return Policy::dereference(); } + reference operator[](difference_type n) const { return *(*this + n); } + pointer operator->() const { return **this; } + + It &operator++() { Policy::increment(); return *this; } + It operator++(int) { auto copy = *this; Policy::increment(); return copy; } + It &operator--() { Policy::decrement(); return *this; } + It operator--(int) { auto copy = *this; Policy::decrement(); return copy; } + It &operator+=(difference_type n) { Policy::advance(n); return *this; } + It &operator-=(difference_type n) { Policy::advance(-n); return *this; } + + friend It operator+(const It &a, difference_type n) { auto copy = a; return copy += n; } + friend It operator+(difference_type n, const It &b) { return b + n; } + friend It operator-(const It &a, difference_type n) { auto copy = a; return copy -= n; } + friend difference_type operator-(const It &a, const It &b) { return a.distance_to(b); } + + friend bool operator==(const It &a, const It &b) { return a.equal(b); } + friend bool operator!=(const It &a, const It &b) { return !(a == b); } + friend bool operator< (const It &a, const It &b) { return b - a > 0; } + friend bool operator> (const It &a, const It &b) { return b < a; } + friend bool operator>=(const It &a, const It &b) { return !(a < b); } + friend bool operator<=(const It &a, const It &b) { return !(a > b); } +}; + +NAMESPACE_BEGIN(iterator_policies) +/// Quick proxy class needed to implement ``operator->`` for iterators which can't return pointers +template +struct arrow_proxy { + T value; + + arrow_proxy(T &&value) : value(std::move(value)) { } + T *operator->() const { return &value; } +}; + +/// Lightweight iterator policy using just a simple pointer: see ``PySequence_Fast_ITEMS`` +class sequence_fast_readonly { +protected: + using iterator_category = std::random_access_iterator_tag; + using value_type = handle; + using reference = const handle; + using pointer = arrow_proxy; + + sequence_fast_readonly(handle 
obj, ssize_t n) : ptr(PySequence_Fast_ITEMS(obj.ptr()) + n) { } + + reference dereference() const { return *ptr; } + void increment() { ++ptr; } + void decrement() { --ptr; } + void advance(ssize_t n) { ptr += n; } + bool equal(const sequence_fast_readonly &b) const { return ptr == b.ptr; } + ssize_t distance_to(const sequence_fast_readonly &b) const { return ptr - b.ptr; } + +private: + PyObject **ptr; +}; + +/// Full read and write access using the sequence protocol: see ``detail::sequence_accessor`` +class sequence_slow_readwrite { +protected: + using iterator_category = std::random_access_iterator_tag; + using value_type = object; + using reference = sequence_accessor; + using pointer = arrow_proxy; + + sequence_slow_readwrite(handle obj, ssize_t index) : obj(obj), index(index) { } + + reference dereference() const { return {obj, static_cast(index)}; } + void increment() { ++index; } + void decrement() { --index; } + void advance(ssize_t n) { index += n; } + bool equal(const sequence_slow_readwrite &b) const { return index == b.index; } + ssize_t distance_to(const sequence_slow_readwrite &b) const { return index - b.index; } + +private: + handle obj; + ssize_t index; +}; + +/// Python's dictionary protocol permits this to be a forward iterator +class dict_readonly { +protected: + using iterator_category = std::forward_iterator_tag; + using value_type = std::pair; + using reference = const value_type; + using pointer = arrow_proxy; + + dict_readonly() = default; + dict_readonly(handle obj, ssize_t pos) : obj(obj), pos(pos) { increment(); } + + reference dereference() const { return {key, value}; } + void increment() { if (!PyDict_Next(obj.ptr(), &pos, &key, &value)) { pos = -1; } } + bool equal(const dict_readonly &b) const { return pos == b.pos; } + +private: + handle obj; + PyObject *key = nullptr, *value = nullptr; + ssize_t pos = -1; +}; +NAMESPACE_END(iterator_policies) + +#if !defined(PYPY_VERSION) +using tuple_iterator = generic_iterator; +using list_iterator = generic_iterator; +#else +using tuple_iterator = generic_iterator; +using list_iterator = generic_iterator; +#endif + +using sequence_iterator = generic_iterator; +using dict_iterator = generic_iterator; + +inline bool PyIterable_Check(PyObject *obj) { + PyObject *iter = PyObject_GetIter(obj); + if (iter) { + Py_DECREF(iter); + return true; + } else { + PyErr_Clear(); + return false; + } +} + +inline bool PyNone_Check(PyObject *o) { return o == Py_None; } +#if PY_MAJOR_VERSION >= 3 +inline bool PyEllipsis_Check(PyObject *o) { return o == Py_Ellipsis; } +#endif + +inline bool PyUnicode_Check_Permissive(PyObject *o) { return PyUnicode_Check(o) || PYBIND11_BYTES_CHECK(o); } + +inline bool PyStaticMethod_Check(PyObject *o) { return o->ob_type == &PyStaticMethod_Type; } + +class kwargs_proxy : public handle { +public: + explicit kwargs_proxy(handle h) : handle(h) { } +}; + +class args_proxy : public handle { +public: + explicit args_proxy(handle h) : handle(h) { } + kwargs_proxy operator*() const { return kwargs_proxy(*this); } +}; + +/// Python argument categories (using PEP 448 terms) +template using is_keyword = std::is_base_of; +template using is_s_unpacking = std::is_same; // * unpacking +template using is_ds_unpacking = std::is_same; // ** unpacking +template using is_positional = satisfies_none_of; +template using is_keyword_or_ds = satisfies_any_of; + +// Call argument collector forward declarations +template +class simple_collector; +template +class unpacking_collector; + +NAMESPACE_END(detail) + +// TODO: After the 
deprecated constructors are removed, this macro can be simplified by +// inheriting ctors: `using Parent::Parent`. It's not an option right now because +// the `using` statement triggers the parent deprecation warning even if the ctor +// isn't even used. +#define PYBIND11_OBJECT_COMMON(Name, Parent, CheckFun) \ + public: \ + PYBIND11_DEPRECATED("Use reinterpret_borrow<"#Name">() or reinterpret_steal<"#Name">()") \ + Name(handle h, bool is_borrowed) : Parent(is_borrowed ? Parent(h, borrowed_t{}) : Parent(h, stolen_t{})) { } \ + Name(handle h, borrowed_t) : Parent(h, borrowed_t{}) { } \ + Name(handle h, stolen_t) : Parent(h, stolen_t{}) { } \ + PYBIND11_DEPRECATED("Use py::isinstance(obj) instead") \ + bool check() const { return m_ptr != nullptr && (bool) CheckFun(m_ptr); } \ + static bool check_(handle h) { return h.ptr() != nullptr && CheckFun(h.ptr()); } + +#define PYBIND11_OBJECT_CVT(Name, Parent, CheckFun, ConvertFun) \ + PYBIND11_OBJECT_COMMON(Name, Parent, CheckFun) \ + /* This is deliberately not 'explicit' to allow implicit conversion from object: */ \ + Name(const object &o) \ + : Parent(check_(o) ? o.inc_ref().ptr() : ConvertFun(o.ptr()), stolen_t{}) \ + { if (!m_ptr) throw error_already_set(); } \ + Name(object &&o) \ + : Parent(check_(o) ? o.release().ptr() : ConvertFun(o.ptr()), stolen_t{}) \ + { if (!m_ptr) throw error_already_set(); } \ + template \ + Name(const ::pybind11::detail::accessor &a) : Name(object(a)) { } + +#define PYBIND11_OBJECT(Name, Parent, CheckFun) \ + PYBIND11_OBJECT_COMMON(Name, Parent, CheckFun) \ + /* This is deliberately not 'explicit' to allow implicit conversion from object: */ \ + Name(const object &o) : Parent(o) { } \ + Name(object &&o) : Parent(std::move(o)) { } + +#define PYBIND11_OBJECT_DEFAULT(Name, Parent, CheckFun) \ + PYBIND11_OBJECT(Name, Parent, CheckFun) \ + Name() : Parent() { } + +/// \addtogroup pytypes +/// @{ + +/** \rst + Wraps a Python iterator so that it can also be used as a C++ input iterator + + Caveat: copying an iterator does not (and cannot) clone the internal + state of the Python iterable. This also applies to the post-increment + operator. This iterator should only be used to retrieve the current + value using ``operator*()``. +\endrst */ +class iterator : public object { +public: + using iterator_category = std::input_iterator_tag; + using difference_type = ssize_t; + using value_type = handle; + using reference = const handle; + using pointer = const handle *; + + PYBIND11_OBJECT_DEFAULT(iterator, object, PyIter_Check) + + iterator& operator++() { + advance(); + return *this; + } + + iterator operator++(int) { + auto rv = *this; + advance(); + return rv; + } + + reference operator*() const { + if (m_ptr && !value.ptr()) { + auto& self = const_cast(*this); + self.advance(); + } + return value; + } + + pointer operator->() const { operator*(); return &value; } + + /** \rst + The value which marks the end of the iteration. ``it == iterator::sentinel()`` + is equivalent to catching ``StopIteration`` in Python. + + .. 
code-block:: cpp + + void foo(py::iterator it) { + while (it != py::iterator::sentinel()) { + // use `*it` + ++it; + } + } + \endrst */ + static iterator sentinel() { return {}; } + + friend bool operator==(const iterator &a, const iterator &b) { return a->ptr() == b->ptr(); } + friend bool operator!=(const iterator &a, const iterator &b) { return a->ptr() != b->ptr(); } + +private: + void advance() { + value = reinterpret_steal(PyIter_Next(m_ptr)); + if (PyErr_Occurred()) { throw error_already_set(); } + } + +private: + object value = {}; +}; + +class iterable : public object { +public: + PYBIND11_OBJECT_DEFAULT(iterable, object, detail::PyIterable_Check) +}; + +class bytes; + +class str : public object { +public: + PYBIND11_OBJECT_CVT(str, object, detail::PyUnicode_Check_Permissive, raw_str) + + str(const char *c, size_t n) + : object(PyUnicode_FromStringAndSize(c, (ssize_t) n), stolen_t{}) { + if (!m_ptr) pybind11_fail("Could not allocate string object!"); + } + + // 'explicit' is explicitly omitted from the following constructors to allow implicit conversion to py::str from C++ string-like objects + str(const char *c = "") + : object(PyUnicode_FromString(c), stolen_t{}) { + if (!m_ptr) pybind11_fail("Could not allocate string object!"); + } + + str(const std::string &s) : str(s.data(), s.size()) { } + + explicit str(const bytes &b); + + /** \rst + Return a string representation of the object. This is analogous to + the ``str()`` function in Python. + \endrst */ + explicit str(handle h) : object(raw_str(h.ptr()), stolen_t{}) { } + + operator std::string() const { + object temp = *this; + if (PyUnicode_Check(m_ptr)) { + temp = reinterpret_steal(PyUnicode_AsUTF8String(m_ptr)); + if (!temp) + pybind11_fail("Unable to extract string contents! (encoding issue)"); + } + char *buffer; + ssize_t length; + if (PYBIND11_BYTES_AS_STRING_AND_SIZE(temp.ptr(), &buffer, &length)) + pybind11_fail("Unable to extract string contents! 
(invalid type)"); + return std::string(buffer, (size_t) length); + } + + template + str format(Args &&...args) const { + return attr("format")(std::forward(args)...); + } + +private: + /// Return string representation -- always returns a new reference, even if already a str + static PyObject *raw_str(PyObject *op) { + PyObject *str_value = PyObject_Str(op); +#if PY_MAJOR_VERSION < 3 + if (!str_value) throw error_already_set(); + PyObject *unicode = PyUnicode_FromEncodedObject(str_value, "utf-8", nullptr); + Py_XDECREF(str_value); str_value = unicode; +#endif + return str_value; + } +}; +/// @} pytypes + +inline namespace literals { +/** \rst + String literal version of `str` + \endrst */ +inline str operator"" _s(const char *s, size_t size) { return {s, size}; } +} + +/// \addtogroup pytypes +/// @{ +class bytes : public object { +public: + PYBIND11_OBJECT(bytes, object, PYBIND11_BYTES_CHECK) + + // Allow implicit conversion: + bytes(const char *c = "") + : object(PYBIND11_BYTES_FROM_STRING(c), stolen_t{}) { + if (!m_ptr) pybind11_fail("Could not allocate bytes object!"); + } + + bytes(const char *c, size_t n) + : object(PYBIND11_BYTES_FROM_STRING_AND_SIZE(c, (ssize_t) n), stolen_t{}) { + if (!m_ptr) pybind11_fail("Could not allocate bytes object!"); + } + + // Allow implicit conversion: + bytes(const std::string &s) : bytes(s.data(), s.size()) { } + + explicit bytes(const pybind11::str &s); + + operator std::string() const { + char *buffer; + ssize_t length; + if (PYBIND11_BYTES_AS_STRING_AND_SIZE(m_ptr, &buffer, &length)) + pybind11_fail("Unable to extract bytes contents!"); + return std::string(buffer, (size_t) length); + } +}; + +inline bytes::bytes(const pybind11::str &s) { + object temp = s; + if (PyUnicode_Check(s.ptr())) { + temp = reinterpret_steal(PyUnicode_AsUTF8String(s.ptr())); + if (!temp) + pybind11_fail("Unable to extract string contents! (encoding issue)"); + } + char *buffer; + ssize_t length; + if (PYBIND11_BYTES_AS_STRING_AND_SIZE(temp.ptr(), &buffer, &length)) + pybind11_fail("Unable to extract string contents! (invalid type)"); + auto obj = reinterpret_steal(PYBIND11_BYTES_FROM_STRING_AND_SIZE(buffer, length)); + if (!obj) + pybind11_fail("Could not allocate bytes object!"); + m_ptr = obj.release().ptr(); +} + +inline str::str(const bytes& b) { + char *buffer; + ssize_t length; + if (PYBIND11_BYTES_AS_STRING_AND_SIZE(b.ptr(), &buffer, &length)) + pybind11_fail("Unable to extract bytes contents!"); + auto obj = reinterpret_steal(PyUnicode_FromStringAndSize(buffer, (ssize_t) length)); + if (!obj) + pybind11_fail("Could not allocate string object!"); + m_ptr = obj.release().ptr(); +} + +class none : public object { +public: + PYBIND11_OBJECT(none, object, detail::PyNone_Check) + none() : object(Py_None, borrowed_t{}) { } +}; + +#if PY_MAJOR_VERSION >= 3 +class ellipsis : public object { +public: + PYBIND11_OBJECT(ellipsis, object, detail::PyEllipsis_Check) + ellipsis() : object(Py_Ellipsis, borrowed_t{}) { } +}; +#endif + +class bool_ : public object { +public: + PYBIND11_OBJECT_CVT(bool_, object, PyBool_Check, raw_bool) + bool_() : object(Py_False, borrowed_t{}) { } + // Allow implicit conversion from and to `bool`: + bool_(bool value) : object(value ? 
Py_True : Py_False, borrowed_t{}) { } + operator bool() const { return m_ptr && PyLong_AsLong(m_ptr) != 0; } + +private: + /// Return the truth value of an object -- always returns a new reference + static PyObject *raw_bool(PyObject *op) { + const auto value = PyObject_IsTrue(op); + if (value == -1) return nullptr; + return handle(value ? Py_True : Py_False).inc_ref().ptr(); + } +}; + +NAMESPACE_BEGIN(detail) +// Converts a value to the given unsigned type. If an error occurs, you get back (Unsigned) -1; +// otherwise you get back the unsigned long or unsigned long long value cast to (Unsigned). +// (The distinction is critically important when casting a returned -1 error value to some other +// unsigned type: (A)-1 != (B)-1 when A and B are unsigned types of different sizes). +template +Unsigned as_unsigned(PyObject *o) { + if (sizeof(Unsigned) <= sizeof(unsigned long) +#if PY_VERSION_HEX < 0x03000000 + || PyInt_Check(o) +#endif + ) { + unsigned long v = PyLong_AsUnsignedLong(o); + return v == (unsigned long) -1 && PyErr_Occurred() ? (Unsigned) -1 : (Unsigned) v; + } + else { + unsigned long long v = PyLong_AsUnsignedLongLong(o); + return v == (unsigned long long) -1 && PyErr_Occurred() ? (Unsigned) -1 : (Unsigned) v; + } +} +NAMESPACE_END(detail) + +class int_ : public object { +public: + PYBIND11_OBJECT_CVT(int_, object, PYBIND11_LONG_CHECK, PyNumber_Long) + int_() : object(PyLong_FromLong(0), stolen_t{}) { } + // Allow implicit conversion from C++ integral types: + template ::value, int> = 0> + int_(T value) { + if (sizeof(T) <= sizeof(long)) { + if (std::is_signed::value) + m_ptr = PyLong_FromLong((long) value); + else + m_ptr = PyLong_FromUnsignedLong((unsigned long) value); + } else { + if (std::is_signed::value) + m_ptr = PyLong_FromLongLong((long long) value); + else + m_ptr = PyLong_FromUnsignedLongLong((unsigned long long) value); + } + if (!m_ptr) pybind11_fail("Could not allocate int object!"); + } + + template ::value, int> = 0> + operator T() const { + return std::is_unsigned::value + ? detail::as_unsigned(m_ptr) + : sizeof(T) <= sizeof(long) + ? 
(T) PyLong_AsLong(m_ptr) + : (T) PYBIND11_LONG_AS_LONGLONG(m_ptr); + } +}; + +class float_ : public object { +public: + PYBIND11_OBJECT_CVT(float_, object, PyFloat_Check, PyNumber_Float) + // Allow implicit conversion from float/double: + float_(float value) : object(PyFloat_FromDouble((double) value), stolen_t{}) { + if (!m_ptr) pybind11_fail("Could not allocate float object!"); + } + float_(double value = .0) : object(PyFloat_FromDouble((double) value), stolen_t{}) { + if (!m_ptr) pybind11_fail("Could not allocate float object!"); + } + operator float() const { return (float) PyFloat_AsDouble(m_ptr); } + operator double() const { return (double) PyFloat_AsDouble(m_ptr); } +}; + +class weakref : public object { +public: + PYBIND11_OBJECT_DEFAULT(weakref, object, PyWeakref_Check) + explicit weakref(handle obj, handle callback = {}) + : object(PyWeakref_NewRef(obj.ptr(), callback.ptr()), stolen_t{}) { + if (!m_ptr) pybind11_fail("Could not allocate weak reference!"); + } +}; + +class slice : public object { +public: + PYBIND11_OBJECT_DEFAULT(slice, object, PySlice_Check) + slice(ssize_t start_, ssize_t stop_, ssize_t step_) { + int_ start(start_), stop(stop_), step(step_); + m_ptr = PySlice_New(start.ptr(), stop.ptr(), step.ptr()); + if (!m_ptr) pybind11_fail("Could not allocate slice object!"); + } + bool compute(size_t length, size_t *start, size_t *stop, size_t *step, + size_t *slicelength) const { + return PySlice_GetIndicesEx((PYBIND11_SLICE_OBJECT *) m_ptr, + (ssize_t) length, (ssize_t *) start, + (ssize_t *) stop, (ssize_t *) step, + (ssize_t *) slicelength) == 0; + } + bool compute(ssize_t length, ssize_t *start, ssize_t *stop, ssize_t *step, + ssize_t *slicelength) const { + return PySlice_GetIndicesEx((PYBIND11_SLICE_OBJECT *) m_ptr, + length, start, + stop, step, + slicelength) == 0; + } +}; + +class capsule : public object { +public: + PYBIND11_OBJECT_DEFAULT(capsule, object, PyCapsule_CheckExact) + PYBIND11_DEPRECATED("Use reinterpret_borrow() or reinterpret_steal()") + capsule(PyObject *ptr, bool is_borrowed) : object(is_borrowed ? 
object(ptr, borrowed_t{}) : object(ptr, stolen_t{})) { } + + explicit capsule(const void *value, const char *name = nullptr, void (*destructor)(PyObject *) = nullptr) + : object(PyCapsule_New(const_cast(value), name, destructor), stolen_t{}) { + if (!m_ptr) + pybind11_fail("Could not allocate capsule object!"); + } + + PYBIND11_DEPRECATED("Please pass a destructor that takes a void pointer as input") + capsule(const void *value, void (*destruct)(PyObject *)) + : object(PyCapsule_New(const_cast(value), nullptr, destruct), stolen_t{}) { + if (!m_ptr) + pybind11_fail("Could not allocate capsule object!"); + } + + capsule(const void *value, void (*destructor)(void *)) { + m_ptr = PyCapsule_New(const_cast(value), nullptr, [](PyObject *o) { + auto destructor = reinterpret_cast(PyCapsule_GetContext(o)); + void *ptr = PyCapsule_GetPointer(o, nullptr); + destructor(ptr); + }); + + if (!m_ptr) + pybind11_fail("Could not allocate capsule object!"); + + if (PyCapsule_SetContext(m_ptr, (void *) destructor) != 0) + pybind11_fail("Could not set capsule context!"); + } + + capsule(void (*destructor)()) { + m_ptr = PyCapsule_New(reinterpret_cast(destructor), nullptr, [](PyObject *o) { + auto destructor = reinterpret_cast(PyCapsule_GetPointer(o, nullptr)); + destructor(); + }); + + if (!m_ptr) + pybind11_fail("Could not allocate capsule object!"); + } + + template operator T *() const { + auto name = this->name(); + T * result = static_cast(PyCapsule_GetPointer(m_ptr, name)); + if (!result) pybind11_fail("Unable to extract capsule contents!"); + return result; + } + + const char *name() const { return PyCapsule_GetName(m_ptr); } +}; + +class tuple : public object { +public: + PYBIND11_OBJECT_CVT(tuple, object, PyTuple_Check, PySequence_Tuple) + explicit tuple(size_t size = 0) : object(PyTuple_New((ssize_t) size), stolen_t{}) { + if (!m_ptr) pybind11_fail("Could not allocate tuple object!"); + } + size_t size() const { return (size_t) PyTuple_Size(m_ptr); } + detail::tuple_accessor operator[](size_t index) const { return {*this, index}; } + detail::item_accessor operator[](handle h) const { return object::operator[](h); } + detail::tuple_iterator begin() const { return {*this, 0}; } + detail::tuple_iterator end() const { return {*this, PyTuple_GET_SIZE(m_ptr)}; } +}; + +class dict : public object { +public: + PYBIND11_OBJECT_CVT(dict, object, PyDict_Check, raw_dict) + dict() : object(PyDict_New(), stolen_t{}) { + if (!m_ptr) pybind11_fail("Could not allocate dict object!"); + } + template ...>::value>, + // MSVC workaround: it can't compile an out-of-line definition, so defer the collector + typename collector = detail::deferred_t, Args...>> + explicit dict(Args &&...args) : dict(collector(std::forward(args)...).kwargs()) { } + + size_t size() const { return (size_t) PyDict_Size(m_ptr); } + detail::dict_iterator begin() const { return {*this, 0}; } + detail::dict_iterator end() const { return {}; } + void clear() const { PyDict_Clear(ptr()); } + bool contains(handle key) const { return PyDict_Contains(ptr(), key.ptr()) == 1; } + bool contains(const char *key) const { return PyDict_Contains(ptr(), pybind11::str(key).ptr()) == 1; } + +private: + /// Call the `dict` Python type -- always returns a new reference + static PyObject *raw_dict(PyObject *op) { + if (PyDict_Check(op)) + return handle(op).inc_ref().ptr(); + return PyObject_CallFunctionObjArgs((PyObject *) &PyDict_Type, op, nullptr); + } +}; + +class sequence : public object { +public: + PYBIND11_OBJECT_DEFAULT(sequence, object, PySequence_Check) + 
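+    // Illustrative sketch (editorial note, not part of the upstream header):
+    // `sequence` wraps any object passing PySequence_Check; the construction
+    // below is unchecked, as with all PYBIND11_OBJECT converting constructors:
+    //
+    //     py::list l;
+    //     l.append(1);
+    //     l.append(2);
+    //     py::sequence s = l;      // lists satisfy the sequence protocol
+    //     py::object first = s[0]; // sequence_item -> PySequence_GetItem
+    //     size_t n = s.size();     // PySequence_Size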
size_t size() const { return (size_t) PySequence_Size(m_ptr); } + detail::sequence_accessor operator[](size_t index) const { return {*this, index}; } + detail::item_accessor operator[](handle h) const { return object::operator[](h); } + detail::sequence_iterator begin() const { return {*this, 0}; } + detail::sequence_iterator end() const { return {*this, PySequence_Size(m_ptr)}; } +}; + +class list : public object { +public: + PYBIND11_OBJECT_CVT(list, object, PyList_Check, PySequence_List) + explicit list(size_t size = 0) : object(PyList_New((ssize_t) size), stolen_t{}) { + if (!m_ptr) pybind11_fail("Could not allocate list object!"); + } + size_t size() const { return (size_t) PyList_Size(m_ptr); } + detail::list_accessor operator[](size_t index) const { return {*this, index}; } + detail::item_accessor operator[](handle h) const { return object::operator[](h); } + detail::list_iterator begin() const { return {*this, 0}; } + detail::list_iterator end() const { return {*this, PyList_GET_SIZE(m_ptr)}; } + template void append(T &&val) const { + PyList_Append(m_ptr, detail::object_or_cast(std::forward(val)).ptr()); + } +}; + +class args : public tuple { PYBIND11_OBJECT_DEFAULT(args, tuple, PyTuple_Check) }; +class kwargs : public dict { PYBIND11_OBJECT_DEFAULT(kwargs, dict, PyDict_Check) }; + +class set : public object { +public: + PYBIND11_OBJECT_CVT(set, object, PySet_Check, PySet_New) + set() : object(PySet_New(nullptr), stolen_t{}) { + if (!m_ptr) pybind11_fail("Could not allocate set object!"); + } + size_t size() const { return (size_t) PySet_Size(m_ptr); } + template bool add(T &&val) const { + return PySet_Add(m_ptr, detail::object_or_cast(std::forward(val)).ptr()) == 0; + } + void clear() const { PySet_Clear(m_ptr); } +}; + +class function : public object { +public: + PYBIND11_OBJECT_DEFAULT(function, object, PyCallable_Check) + handle cpp_function() const { + handle fun = detail::get_function(m_ptr); + if (fun && PyCFunction_Check(fun.ptr())) + return fun; + return handle(); + } + bool is_cpp_function() const { return (bool) cpp_function(); } +}; + +class staticmethod : public object { +public: + PYBIND11_OBJECT_CVT(staticmethod, object, detail::PyStaticMethod_Check, PyStaticMethod_New) +}; + +class buffer : public object { +public: + PYBIND11_OBJECT_DEFAULT(buffer, object, PyObject_CheckBuffer) + + buffer_info request(bool writable = false) { + int flags = PyBUF_STRIDES | PyBUF_FORMAT; + if (writable) flags |= PyBUF_WRITABLE; + Py_buffer *view = new Py_buffer(); + if (PyObject_GetBuffer(m_ptr, view, flags) != 0) { + delete view; + throw error_already_set(); + } + return buffer_info(view); + } +}; + +class memoryview : public object { +public: + explicit memoryview(const buffer_info& info) { + static Py_buffer buf { }; + // Py_buffer uses signed sizes, strides and shape!.. 
+ static std::vector py_strides { }; + static std::vector py_shape { }; + buf.buf = info.ptr; + buf.itemsize = info.itemsize; + buf.format = const_cast(info.format.c_str()); + buf.ndim = (int) info.ndim; + buf.len = info.size; + py_strides.clear(); + py_shape.clear(); + for (size_t i = 0; i < (size_t) info.ndim; ++i) { + py_strides.push_back(info.strides[i]); + py_shape.push_back(info.shape[i]); + } + buf.strides = py_strides.data(); + buf.shape = py_shape.data(); + buf.suboffsets = nullptr; + buf.readonly = false; + buf.internal = nullptr; + + m_ptr = PyMemoryView_FromBuffer(&buf); + if (!m_ptr) + pybind11_fail("Unable to create memoryview from buffer descriptor"); + } + + PYBIND11_OBJECT_CVT(memoryview, object, PyMemoryView_Check, PyMemoryView_FromObject) +}; +/// @} pytypes + +/// \addtogroup python_builtins +/// @{ +inline size_t len(handle h) { + ssize_t result = PyObject_Length(h.ptr()); + if (result < 0) + pybind11_fail("Unable to compute length of object"); + return (size_t) result; +} + +inline size_t len_hint(handle h) { +#if PY_VERSION_HEX >= 0x03040000 + ssize_t result = PyObject_LengthHint(h.ptr(), 0); +#else + ssize_t result = PyObject_Length(h.ptr()); +#endif + if (result < 0) { + // Sometimes a length can't be determined at all (eg generators) + // In which case simply return 0 + PyErr_Clear(); + return 0; + } + return (size_t) result; +} + +inline str repr(handle h) { + PyObject *str_value = PyObject_Repr(h.ptr()); + if (!str_value) throw error_already_set(); +#if PY_MAJOR_VERSION < 3 + PyObject *unicode = PyUnicode_FromEncodedObject(str_value, "utf-8", nullptr); + Py_XDECREF(str_value); str_value = unicode; + if (!str_value) throw error_already_set(); +#endif + return reinterpret_steal(str_value); +} + +inline iterator iter(handle obj) { + PyObject *result = PyObject_GetIter(obj.ptr()); + if (!result) { throw error_already_set(); } + return reinterpret_steal(result); +} +/// @} python_builtins + +NAMESPACE_BEGIN(detail) +template iterator object_api::begin() const { return iter(derived()); } +template iterator object_api::end() const { return iterator::sentinel(); } +template item_accessor object_api::operator[](handle key) const { + return {derived(), reinterpret_borrow(key)}; +} +template item_accessor object_api::operator[](const char *key) const { + return {derived(), pybind11::str(key)}; +} +template obj_attr_accessor object_api::attr(handle key) const { + return {derived(), reinterpret_borrow(key)}; +} +template str_attr_accessor object_api::attr(const char *key) const { + return {derived(), key}; +} +template args_proxy object_api::operator*() const { + return args_proxy(derived().ptr()); +} +template template bool object_api::contains(T &&item) const { + return attr("__contains__")(std::forward(item)).template cast(); +} + +template +pybind11::str object_api::str() const { return pybind11::str(derived()); } + +template +str_attr_accessor object_api::doc() const { return attr("__doc__"); } + +template +handle object_api::get_type() const { return (PyObject *) Py_TYPE(derived().ptr()); } + +template +bool object_api::rich_compare(object_api const &other, int value) const { + int rv = PyObject_RichCompareBool(derived().ptr(), other.derived().ptr(), value); + if (rv == -1) + throw error_already_set(); + return rv == 1; +} + +#define PYBIND11_MATH_OPERATOR_UNARY(op, fn) \ + template object object_api::op() const { \ + object result = reinterpret_steal(fn(derived().ptr())); \ + if (!result.ptr()) \ + throw error_already_set(); \ + return result; \ + } + +#define 
PYBIND11_MATH_OPERATOR_BINARY(op, fn) \ + template \ + object object_api::op(object_api const &other) const { \ + object result = reinterpret_steal( \ + fn(derived().ptr(), other.derived().ptr())); \ + if (!result.ptr()) \ + throw error_already_set(); \ + return result; \ + } + +PYBIND11_MATH_OPERATOR_UNARY (operator~, PyNumber_Invert) +PYBIND11_MATH_OPERATOR_UNARY (operator-, PyNumber_Negative) +PYBIND11_MATH_OPERATOR_BINARY(operator+, PyNumber_Add) +PYBIND11_MATH_OPERATOR_BINARY(operator+=, PyNumber_InPlaceAdd) +PYBIND11_MATH_OPERATOR_BINARY(operator-, PyNumber_Subtract) +PYBIND11_MATH_OPERATOR_BINARY(operator-=, PyNumber_InPlaceSubtract) +PYBIND11_MATH_OPERATOR_BINARY(operator*, PyNumber_Multiply) +PYBIND11_MATH_OPERATOR_BINARY(operator*=, PyNumber_InPlaceMultiply) +PYBIND11_MATH_OPERATOR_BINARY(operator/, PyNumber_TrueDivide) +PYBIND11_MATH_OPERATOR_BINARY(operator/=, PyNumber_InPlaceTrueDivide) +PYBIND11_MATH_OPERATOR_BINARY(operator|, PyNumber_Or) +PYBIND11_MATH_OPERATOR_BINARY(operator|=, PyNumber_InPlaceOr) +PYBIND11_MATH_OPERATOR_BINARY(operator&, PyNumber_And) +PYBIND11_MATH_OPERATOR_BINARY(operator&=, PyNumber_InPlaceAnd) +PYBIND11_MATH_OPERATOR_BINARY(operator^, PyNumber_Xor) +PYBIND11_MATH_OPERATOR_BINARY(operator^=, PyNumber_InPlaceXor) +PYBIND11_MATH_OPERATOR_BINARY(operator<<, PyNumber_Lshift) +PYBIND11_MATH_OPERATOR_BINARY(operator<<=, PyNumber_InPlaceLshift) +PYBIND11_MATH_OPERATOR_BINARY(operator>>, PyNumber_Rshift) +PYBIND11_MATH_OPERATOR_BINARY(operator>>=, PyNumber_InPlaceRshift) + +#undef PYBIND11_MATH_OPERATOR_UNARY +#undef PYBIND11_MATH_OPERATOR_BINARY + +NAMESPACE_END(detail) +NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/python/src/pybind11/stl.h b/python/src/pybind11/stl.h new file mode 100644 index 000000000..32f8d294a --- /dev/null +++ b/python/src/pybind11/stl.h @@ -0,0 +1,386 @@ +/* + pybind11/stl.h: Transparent conversion for STL data types + + Copyright (c) 2016 Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. +*/ + +#pragma once + +#include "pybind11.h" +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable: 4127) // warning C4127: Conditional expression is constant +#endif + +#ifdef __has_include +// std::optional (but including it in c++14 mode isn't allowed) +# if defined(PYBIND11_CPP17) && __has_include() +# include +# define PYBIND11_HAS_OPTIONAL 1 +# endif +// std::experimental::optional (but not allowed in c++11 mode) +# if defined(PYBIND11_CPP14) && (__has_include() && \ + !__has_include()) +# include +# define PYBIND11_HAS_EXP_OPTIONAL 1 +# endif +// std::variant +# if defined(PYBIND11_CPP17) && __has_include() +# include +# define PYBIND11_HAS_VARIANT 1 +# endif +#elif defined(_MSC_VER) && defined(PYBIND11_CPP17) +# include +# include +# define PYBIND11_HAS_OPTIONAL 1 +# define PYBIND11_HAS_VARIANT 1 +#endif + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +NAMESPACE_BEGIN(detail) + +/// Extracts an const lvalue reference or rvalue reference for U based on the type of T (e.g. for +/// forwarding a container element). Typically used indirect via forwarded_type(), below. +template +using forwarded_type = conditional_t< + std::is_lvalue_reference::value, remove_reference_t &, remove_reference_t &&>; + +/// Forwards a value U as rvalue or lvalue according to whether T is rvalue or lvalue; typically +/// used for forwarding a container's elements. 
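+// Illustrative sketch (editorial note, not part of the upstream header): with
+// the alias above, forward_like<T>(u) forwards ``u`` with the value category
+// of T. Typical use when draining a possibly-rvalue container (`sink` is a
+// hypothetical consumer used only for illustration):
+//
+//     template <typename T> void drain(T &&container) {
+//         for (auto &&item : container)
+//             sink(forward_like<T>(item)); // copies if T is an lvalue reference, moves otherwise
+//     }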
+template +forwarded_type forward_like(U &&u) { + return std::forward>(std::forward(u)); +} + +template struct set_caster { + using type = Type; + using key_conv = make_caster; + + bool load(handle src, bool convert) { + if (!isinstance(src)) + return false; + auto s = reinterpret_borrow(src); + value.clear(); + for (auto entry : s) { + key_conv conv; + if (!conv.load(entry, convert)) + return false; + value.insert(cast_op(std::move(conv))); + } + return true; + } + + template + static handle cast(T &&src, return_value_policy policy, handle parent) { + if (!std::is_lvalue_reference::value) + policy = return_value_policy_override::policy(policy); + pybind11::set s; + for (auto &&value : src) { + auto value_ = reinterpret_steal(key_conv::cast(forward_like(value), policy, parent)); + if (!value_ || !s.add(value_)) + return handle(); + } + return s.release(); + } + + PYBIND11_TYPE_CASTER(type, _("Set[") + key_conv::name + _("]")); +}; + +template struct map_caster { + using key_conv = make_caster; + using value_conv = make_caster; + + bool load(handle src, bool convert) { + if (!isinstance(src)) + return false; + auto d = reinterpret_borrow(src); + value.clear(); + for (auto it : d) { + key_conv kconv; + value_conv vconv; + if (!kconv.load(it.first.ptr(), convert) || + !vconv.load(it.second.ptr(), convert)) + return false; + value.emplace(cast_op(std::move(kconv)), cast_op(std::move(vconv))); + } + return true; + } + + template + static handle cast(T &&src, return_value_policy policy, handle parent) { + dict d; + return_value_policy policy_key = policy; + return_value_policy policy_value = policy; + if (!std::is_lvalue_reference::value) { + policy_key = return_value_policy_override::policy(policy_key); + policy_value = return_value_policy_override::policy(policy_value); + } + for (auto &&kv : src) { + auto key = reinterpret_steal(key_conv::cast(forward_like(kv.first), policy_key, parent)); + auto value = reinterpret_steal(value_conv::cast(forward_like(kv.second), policy_value, parent)); + if (!key || !value) + return handle(); + d[key] = value; + } + return d.release(); + } + + PYBIND11_TYPE_CASTER(Type, _("Dict[") + key_conv::name + _(", ") + value_conv::name + _("]")); +}; + +template struct list_caster { + using value_conv = make_caster; + + bool load(handle src, bool convert) { + if (!isinstance(src) || isinstance(src)) + return false; + auto s = reinterpret_borrow(src); + value.clear(); + reserve_maybe(s, &value); + for (auto it : s) { + value_conv conv; + if (!conv.load(it, convert)) + return false; + value.push_back(cast_op(std::move(conv))); + } + return true; + } + +private: + template ().reserve(0)), void>::value, int> = 0> + void reserve_maybe(sequence s, Type *) { value.reserve(s.size()); } + void reserve_maybe(sequence, void *) { } + +public: + template + static handle cast(T &&src, return_value_policy policy, handle parent) { + if (!std::is_lvalue_reference::value) + policy = return_value_policy_override::policy(policy); + list l(src.size()); + size_t index = 0; + for (auto &&value : src) { + auto value_ = reinterpret_steal(value_conv::cast(forward_like(value), policy, parent)); + if (!value_) + return handle(); + PyList_SET_ITEM(l.ptr(), (ssize_t) index++, value_.release().ptr()); // steals a reference + } + return l.release(); + } + + PYBIND11_TYPE_CASTER(Type, _("List[") + value_conv::name + _("]")); +}; + +template struct type_caster> + : list_caster, Type> { }; + +template struct type_caster> + : list_caster, Type> { }; + +template struct type_caster> + : list_caster, Type> 
{ }; + +template struct array_caster { + using value_conv = make_caster; + +private: + template + bool require_size(enable_if_t size) { + if (value.size() != size) + value.resize(size); + return true; + } + template + bool require_size(enable_if_t size) { + return size == Size; + } + +public: + bool load(handle src, bool convert) { + if (!isinstance(src)) + return false; + auto l = reinterpret_borrow(src); + if (!require_size(l.size())) + return false; + size_t ctr = 0; + for (auto it : l) { + value_conv conv; + if (!conv.load(it, convert)) + return false; + value[ctr++] = cast_op(std::move(conv)); + } + return true; + } + + template + static handle cast(T &&src, return_value_policy policy, handle parent) { + list l(src.size()); + size_t index = 0; + for (auto &&value : src) { + auto value_ = reinterpret_steal(value_conv::cast(forward_like(value), policy, parent)); + if (!value_) + return handle(); + PyList_SET_ITEM(l.ptr(), (ssize_t) index++, value_.release().ptr()); // steals a reference + } + return l.release(); + } + + PYBIND11_TYPE_CASTER(ArrayType, _("List[") + value_conv::name + _(_(""), _("[") + _() + _("]")) + _("]")); +}; + +template struct type_caster> + : array_caster, Type, false, Size> { }; + +template struct type_caster> + : array_caster, Type, true> { }; + +template struct type_caster> + : set_caster, Key> { }; + +template struct type_caster> + : set_caster, Key> { }; + +template struct type_caster> + : map_caster, Key, Value> { }; + +template struct type_caster> + : map_caster, Key, Value> { }; + +// This type caster is intended to be used for std::optional and std::experimental::optional +template struct optional_caster { + using value_conv = make_caster; + + template + static handle cast(T_ &&src, return_value_policy policy, handle parent) { + if (!src) + return none().inc_ref(); + policy = return_value_policy_override::policy(policy); + return value_conv::cast(*std::forward(src), policy, parent); + } + + bool load(handle src, bool convert) { + if (!src) { + return false; + } else if (src.is_none()) { + return true; // default-constructed value is already empty + } + value_conv inner_caster; + if (!inner_caster.load(src, convert)) + return false; + + value.emplace(cast_op(std::move(inner_caster))); + return true; + } + + PYBIND11_TYPE_CASTER(T, _("Optional[") + value_conv::name + _("]")); +}; + +#if PYBIND11_HAS_OPTIONAL +template struct type_caster> + : public optional_caster> {}; + +template<> struct type_caster + : public void_caster {}; +#endif + +#if PYBIND11_HAS_EXP_OPTIONAL +template struct type_caster> + : public optional_caster> {}; + +template<> struct type_caster + : public void_caster {}; +#endif + +/// Visit a variant and cast any found type to Python +struct variant_caster_visitor { + return_value_policy policy; + handle parent; + + using result_type = handle; // required by boost::variant in C++11 + + template + result_type operator()(T &&src) const { + return make_caster::cast(std::forward(src), policy, parent); + } +}; + +/// Helper class which abstracts away variant's `visit` function. `std::variant` and similar +/// `namespace::variant` types which provide a `namespace::visit()` function are handled here +/// automatically using argument-dependent lookup. Users can provide specializations for other +/// variant-like classes, e.g. `boost::variant` and `boost::apply_visitor`. 
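+// Illustrative sketch (editorial note, not part of the upstream header): the
+// kind of user-side specialization the comment above describes, here mapping
+// boost::variant onto boost::apply_visitor:
+//
+//     namespace pybind11 { namespace detail {
+//     template <typename... Ts>
+//     struct type_caster<boost::variant<Ts...>> : variant_caster<boost::variant<Ts...>> { };
+//
+//     template <>
+//     struct visit_helper<boost::variant> {
+//         template <typename... Args>
+//         static auto call(Args &&...args) -> decltype(boost::apply_visitor(args...)) {
+//             return boost::apply_visitor(args...);
+//         }
+//     };
+//     }} // namespace pybind11::detail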
+template class Variant> +struct visit_helper { + template + static auto call(Args &&...args) -> decltype(visit(std::forward(args)...)) { + return visit(std::forward(args)...); + } +}; + +/// Generic variant caster +template struct variant_caster; + +template class V, typename... Ts> +struct variant_caster> { + static_assert(sizeof...(Ts) > 0, "Variant must consist of at least one alternative."); + + template + bool load_alternative(handle src, bool convert, type_list) { + auto caster = make_caster(); + if (caster.load(src, convert)) { + value = cast_op(caster); + return true; + } + return load_alternative(src, convert, type_list{}); + } + + bool load_alternative(handle, bool, type_list<>) { return false; } + + bool load(handle src, bool convert) { + // Do a first pass without conversions to improve constructor resolution. + // E.g. `py::int_(1).cast>()` needs to fill the `int` + // slot of the variant. Without two-pass loading `double` would be filled + // because it appears first and a conversion is possible. + if (convert && load_alternative(src, false, type_list{})) + return true; + return load_alternative(src, convert, type_list{}); + } + + template + static handle cast(Variant &&src, return_value_policy policy, handle parent) { + return visit_helper::call(variant_caster_visitor{policy, parent}, + std::forward(src)); + } + + using Type = V; + PYBIND11_TYPE_CASTER(Type, _("Union[") + detail::concat(make_caster::name...) + _("]")); +}; + +#if PYBIND11_HAS_VARIANT +template +struct type_caster> : variant_caster> { }; +#endif + +NAMESPACE_END(detail) + +inline std::ostream &operator<<(std::ostream &os, const handle &obj) { + os << (std::string) str(obj); + return os; +} + +NAMESPACE_END(PYBIND11_NAMESPACE) + +#if defined(_MSC_VER) +#pragma warning(pop) +#endif diff --git a/python/src/pybind11/stl_bind.h b/python/src/pybind11/stl_bind.h new file mode 100644 index 000000000..1f8725260 --- /dev/null +++ b/python/src/pybind11/stl_bind.h @@ -0,0 +1,630 @@ +/* + pybind11/std_bind.h: Binding generators for STL data types + + Copyright (c) 2016 Sergey Lyskov and Wenzel Jakob + + All rights reserved. Use of this source code is governed by a + BSD-style license that can be found in the LICENSE file. 
+*/ + +#pragma once + +#include "detail/common.h" +#include "operators.h" + +#include +#include + +NAMESPACE_BEGIN(PYBIND11_NAMESPACE) +NAMESPACE_BEGIN(detail) + +/* SFINAE helper class used by 'is_comparable */ +template struct container_traits { + template static std::true_type test_comparable(decltype(std::declval() == std::declval())*); + template static std::false_type test_comparable(...); + template static std::true_type test_value(typename T2::value_type *); + template static std::false_type test_value(...); + template static std::true_type test_pair(typename T2::first_type *, typename T2::second_type *); + template static std::false_type test_pair(...); + + static constexpr const bool is_comparable = std::is_same(nullptr))>::value; + static constexpr const bool is_pair = std::is_same(nullptr, nullptr))>::value; + static constexpr const bool is_vector = std::is_same(nullptr))>::value; + static constexpr const bool is_element = !is_pair && !is_vector; +}; + +/* Default: is_comparable -> std::false_type */ +template +struct is_comparable : std::false_type { }; + +/* For non-map data structures, check whether operator== can be instantiated */ +template +struct is_comparable< + T, enable_if_t::is_element && + container_traits::is_comparable>> + : std::true_type { }; + +/* For a vector/map data structure, recursively check the value type (which is std::pair for maps) */ +template +struct is_comparable::is_vector>> { + static constexpr const bool value = + is_comparable::value; +}; + +/* For pairs, recursively check the two data types */ +template +struct is_comparable::is_pair>> { + static constexpr const bool value = + is_comparable::value && + is_comparable::value; +}; + +/* Fallback functions */ +template void vector_if_copy_constructible(const Args &...) { } +template void vector_if_equal_operator(const Args &...) { } +template void vector_if_insertion_operator(const Args &...) { } +template void vector_modifiers(const Args &...) { } + +template +void vector_if_copy_constructible(enable_if_t::value, Class_> &cl) { + cl.def(init(), "Copy constructor"); +} + +template +void vector_if_equal_operator(enable_if_t::value, Class_> &cl) { + using T = typename Vector::value_type; + + cl.def(self == self); + cl.def(self != self); + + cl.def("count", + [](const Vector &v, const T &x) { + return std::count(v.begin(), v.end(), x); + }, + arg("x"), + "Return the number of times ``x`` appears in the list" + ); + + cl.def("remove", [](Vector &v, const T &x) { + auto p = std::find(v.begin(), v.end(), x); + if (p != v.end()) + v.erase(p); + else + throw value_error(); + }, + arg("x"), + "Remove the first item from the list whose value is x. " + "It is an error if there is no such item." + ); + + cl.def("__contains__", + [](const Vector &v, const T &x) { + return std::find(v.begin(), v.end(), x) != v.end(); + }, + arg("x"), + "Return true the container contains ``x``" + ); +} + +// Vector modifiers -- requires a copyable vector_type: +// (Technically, some of these (pop and __delitem__) don't actually require copyability, but it seems +// silly to allow deletion but not insertion, so include them here too.) 
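+// Illustrative sketch (editorial note, not part of the upstream header): how
+// the is_comparable recursion above resolves for a few value types:
+//
+//     static_assert(is_comparable<std::vector<int>>::value, "int provides ==");
+//     static_assert(is_comparable<std::vector<std::pair<int, std::string>>>::value,
+//                   "pairs recurse into both members");
+//
+// For a value type without operator==, the trait yields std::false_type, so the
+// count/remove/__contains__ bindings above are silently omitted.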
+template +void vector_modifiers(enable_if_t::value, Class_> &cl) { + using T = typename Vector::value_type; + using SizeType = typename Vector::size_type; + using DiffType = typename Vector::difference_type; + + cl.def("append", + [](Vector &v, const T &value) { v.push_back(value); }, + arg("x"), + "Add an item to the end of the list"); + + cl.def(init([](iterable it) { + auto v = std::unique_ptr(new Vector()); + v->reserve(len_hint(it)); + for (handle h : it) + v->push_back(h.cast()); + return v.release(); + })); + + cl.def("extend", + [](Vector &v, const Vector &src) { + v.insert(v.end(), src.begin(), src.end()); + }, + arg("L"), + "Extend the list by appending all the items in the given list" + ); + + cl.def("extend", + [](Vector &v, iterable it) { + const size_t old_size = v.size(); + v.reserve(old_size + len_hint(it)); + try { + for (handle h : it) { + v.push_back(h.cast()); + } + } catch (const cast_error &) { + v.erase(v.begin() + static_cast(old_size), v.end()); + try { + v.shrink_to_fit(); + } catch (const std::exception &) { + // Do nothing + } + throw; + } + }, + arg("L"), + "Extend the list by appending all the items in the given list" + ); + + cl.def("insert", + [](Vector &v, SizeType i, const T &x) { + if (i > v.size()) + throw index_error(); + v.insert(v.begin() + (DiffType) i, x); + }, + arg("i") , arg("x"), + "Insert an item at a given position." + ); + + cl.def("pop", + [](Vector &v) { + if (v.empty()) + throw index_error(); + T t = v.back(); + v.pop_back(); + return t; + }, + "Remove and return the last item" + ); + + cl.def("pop", + [](Vector &v, SizeType i) { + if (i >= v.size()) + throw index_error(); + T t = v[i]; + v.erase(v.begin() + (DiffType) i); + return t; + }, + arg("i"), + "Remove and return the item at index ``i``" + ); + + cl.def("__setitem__", + [](Vector &v, SizeType i, const T &t) { + if (i >= v.size()) + throw index_error(); + v[i] = t; + } + ); + + /// Slicing protocol + cl.def("__getitem__", + [](const Vector &v, slice slice) -> Vector * { + size_t start, stop, step, slicelength; + + if (!slice.compute(v.size(), &start, &stop, &step, &slicelength)) + throw error_already_set(); + + Vector *seq = new Vector(); + seq->reserve((size_t) slicelength); + + for (size_t i=0; ipush_back(v[start]); + start += step; + } + return seq; + }, + arg("s"), + "Retrieve list elements using a slice object" + ); + + cl.def("__setitem__", + [](Vector &v, slice slice, const Vector &value) { + size_t start, stop, step, slicelength; + if (!slice.compute(v.size(), &start, &stop, &step, &slicelength)) + throw error_already_set(); + + if (slicelength != value.size()) + throw std::runtime_error("Left and right hand size of slice assignment have different sizes!"); + + for (size_t i=0; i= v.size()) + throw index_error(); + v.erase(v.begin() + DiffType(i)); + }, + "Delete the list elements at index ``i``" + ); + + cl.def("__delitem__", + [](Vector &v, slice slice) { + size_t start, stop, step, slicelength; + + if (!slice.compute(v.size(), &start, &stop, &step, &slicelength)) + throw error_already_set(); + + if (step == 1 && false) { + v.erase(v.begin() + (DiffType) start, v.begin() + DiffType(start + slicelength)); + } else { + for (size_t i = 0; i < slicelength; ++i) { + v.erase(v.begin() + DiffType(start)); + start += step - 1; + } + } + }, + "Delete list elements using a slice object" + ); + +} + +// If the type has an operator[] that doesn't return a reference (most notably std::vector), +// we have to access by copying; otherwise we return by reference. 
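+// Illustrative sketch (editorial note, not part of the upstream header): the
+// classic case for the alias just below is std::vector<bool>, whose operator[]
+// returns a proxy value rather than a bool&:
+//
+//     static_assert(vector_needs_copy<std::vector<bool>>::value,
+//                   "proxy reference => access by copy");
+//     static_assert(!vector_needs_copy<std::vector<int>>::value,
+//                   "true reference => access by reference");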
+template using vector_needs_copy = negation< + std::is_same()[typename Vector::size_type()]), typename Vector::value_type &>>; + +// The usual case: access and iterate by reference +template +void vector_accessor(enable_if_t::value, Class_> &cl) { + using T = typename Vector::value_type; + using SizeType = typename Vector::size_type; + using ItType = typename Vector::iterator; + + cl.def("__getitem__", + [](Vector &v, SizeType i) -> T & { + if (i >= v.size()) + throw index_error(); + return v[i]; + }, + return_value_policy::reference_internal // ref + keepalive + ); + + cl.def("__iter__", + [](Vector &v) { + return make_iterator< + return_value_policy::reference_internal, ItType, ItType, T&>( + v.begin(), v.end()); + }, + keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */ + ); +} + +// The case for special objects, like std::vector, that have to be returned-by-copy: +template +void vector_accessor(enable_if_t::value, Class_> &cl) { + using T = typename Vector::value_type; + using SizeType = typename Vector::size_type; + using ItType = typename Vector::iterator; + cl.def("__getitem__", + [](const Vector &v, SizeType i) -> T { + if (i >= v.size()) + throw index_error(); + return v[i]; + } + ); + + cl.def("__iter__", + [](Vector &v) { + return make_iterator< + return_value_policy::copy, ItType, ItType, T>( + v.begin(), v.end()); + }, + keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */ + ); +} + +template auto vector_if_insertion_operator(Class_ &cl, std::string const &name) + -> decltype(std::declval() << std::declval(), void()) { + using size_type = typename Vector::size_type; + + cl.def("__repr__", + [name](Vector &v) { + std::ostringstream s; + s << name << '['; + for (size_type i=0; i < v.size(); ++i) { + s << v[i]; + if (i != v.size() - 1) + s << ", "; + } + s << ']'; + return s.str(); + }, + "Return the canonical string representation of this list." 
+ ); +} + +// Provide the buffer interface for vectors if we have data() and we have a format for it +// GCC seems to have "void std::vector::data()" - doing SFINAE on the existence of data() is insufficient, we need to check it returns an appropriate pointer +template +struct vector_has_data_and_format : std::false_type {}; +template +struct vector_has_data_and_format::format(), std::declval().data()), typename Vector::value_type*>::value>> : std::true_type {}; + +// Add the buffer interface to a vector +template +enable_if_t...>::value> +vector_buffer(Class_& cl) { + using T = typename Vector::value_type; + + static_assert(vector_has_data_and_format::value, "There is not an appropriate format descriptor for this vector"); + + // numpy.h declares this for arbitrary types, but it may raise an exception and crash hard at runtime if PYBIND11_NUMPY_DTYPE hasn't been called, so check here + format_descriptor::format(); + + cl.def_buffer([](Vector& v) -> buffer_info { + return buffer_info(v.data(), static_cast(sizeof(T)), format_descriptor::format(), 1, {v.size()}, {sizeof(T)}); + }); + + cl.def(init([](buffer buf) { + auto info = buf.request(); + if (info.ndim != 1 || info.strides[0] % static_cast(sizeof(T))) + throw type_error("Only valid 1D buffers can be copied to a vector"); + if (!detail::compare_buffer_info::compare(info) || (ssize_t) sizeof(T) != info.itemsize) + throw type_error("Format mismatch (Python: " + info.format + " C++: " + format_descriptor::format() + ")"); + + auto vec = std::unique_ptr(new Vector()); + vec->reserve((size_t) info.shape[0]); + T *p = static_cast(info.ptr); + ssize_t step = info.strides[0] / static_cast(sizeof(T)); + T *end = p + info.shape[0] * step; + for (; p != end; p += step) + vec->push_back(*p); + return vec.release(); + })); + + return; +} + +template +enable_if_t...>::value> vector_buffer(Class_&) {} + +NAMESPACE_END(detail) + +// +// std::vector +// +template , typename... Args> +class_ bind_vector(handle scope, std::string const &name, Args&&... args) { + using Class_ = class_; + + // If the value_type is unregistered (e.g. 
a converting type) or is itself registered + // module-local then make the vector binding module-local as well: + using vtype = typename Vector::value_type; + auto vtype_info = detail::get_type_info(typeid(vtype)); + bool local = !vtype_info || vtype_info->module_local; + + Class_ cl(scope, name.c_str(), pybind11::module_local(local), std::forward(args)...); + + // Declare the buffer interface if a buffer_protocol() is passed in + detail::vector_buffer(cl); + + cl.def(init<>()); + + // Register copy constructor (if possible) + detail::vector_if_copy_constructible(cl); + + // Register comparison-related operators and functions (if possible) + detail::vector_if_equal_operator(cl); + + // Register stream insertion operator (if possible) + detail::vector_if_insertion_operator(cl, name); + + // Modifiers require copyable vector value type + detail::vector_modifiers(cl); + + // Accessor and iterator; return by value if copyable, otherwise we return by ref + keep-alive + detail::vector_accessor(cl); + + cl.def("__bool__", + [](const Vector &v) -> bool { + return !v.empty(); + }, + "Check whether the list is nonempty" + ); + + cl.def("__len__", &Vector::size); + + + + +#if 0 + // C++ style functions deprecated, leaving it here as an example + cl.def(init()); + + cl.def("resize", + (void (Vector::*) (size_type count)) & Vector::resize, + "changes the number of elements stored"); + + cl.def("erase", + [](Vector &v, SizeType i) { + if (i >= v.size()) + throw index_error(); + v.erase(v.begin() + i); + }, "erases element at index ``i``"); + + cl.def("empty", &Vector::empty, "checks whether the container is empty"); + cl.def("size", &Vector::size, "returns the number of elements"); + cl.def("push_back", (void (Vector::*)(const T&)) &Vector::push_back, "adds an element to the end"); + cl.def("pop_back", &Vector::pop_back, "removes the last element"); + + cl.def("max_size", &Vector::max_size, "returns the maximum possible number of elements"); + cl.def("reserve", &Vector::reserve, "reserves storage"); + cl.def("capacity", &Vector::capacity, "returns the number of elements that can be held in currently allocated storage"); + cl.def("shrink_to_fit", &Vector::shrink_to_fit, "reduces memory usage by freeing unused memory"); + + cl.def("clear", &Vector::clear, "clears the contents"); + cl.def("swap", &Vector::swap, "swaps the contents"); + + cl.def("front", [](Vector &v) { + if (v.size()) return v.front(); + else throw index_error(); + }, "access the first element"); + + cl.def("back", [](Vector &v) { + if (v.size()) return v.back(); + else throw index_error(); + }, "access the last element "); + +#endif + + return cl; +} + + + +// +// std::map, std::unordered_map +// + +NAMESPACE_BEGIN(detail) + +/* Fallback functions */ +template void map_if_insertion_operator(const Args &...) { } +template void map_assignment(const Args &...) 
//
// std::map, std::unordered_map
//

NAMESPACE_BEGIN(detail)

/* Fallback functions */
template <typename, typename, typename... Args> void map_if_insertion_operator(const Args &...) { }
template <typename, typename, typename... Args> void map_assignment(const Args &...) { }

// Map assignment when copy-assignable: just copy the value
template <typename Map, typename Class_>
void map_assignment(enable_if_t<std::is_copy_assignable<typename Map::mapped_type>::value, Class_> &cl) {
    using KeyType = typename Map::key_type;
    using MappedType = typename Map::mapped_type;

    cl.def("__setitem__",
           [](Map &m, const KeyType &k, const MappedType &v) {
               auto it = m.find(k);
               if (it != m.end()) it->second = v;
               else m.emplace(k, v);
           }
    );
}

// Not copy-assignable, but still copy-constructible: we can update the value by erasing and
// reinserting
template <typename Map, typename Class_>
void map_assignment(enable_if_t<
        !std::is_copy_assignable<typename Map::mapped_type>::value &&
        is_copy_constructible<typename Map::mapped_type>::value,
        Class_> &cl) {
    using KeyType = typename Map::key_type;
    using MappedType = typename Map::mapped_type;

    cl.def("__setitem__",
           [](Map &m, const KeyType &k, const MappedType &v) {
               // We can't use m[k] = v; because the value type might not be default constructible
               auto r = m.emplace(k, v);
               if (!r.second) {
                   // value type is not copy assignable so the only way to insert it is to erase
                   // it first...
                   m.erase(r.first);
                   m.emplace(k, v);
               }
           }
    );
}


template <typename Map, typename Class_> auto map_if_insertion_operator(Class_ &cl, std::string const &name)
-> decltype(std::declval<std::ostream&>() << std::declval<typename Map::key_type>() << std::declval<typename Map::mapped_type>(), void()) {

    cl.def("__repr__",
           [name](Map &m) {
               std::ostringstream s;
               s << name << '{';
               bool f = false;
               for (auto const &kv : m) {
                   if (f)
                       s << ", ";
                   s << kv.first << ": " << kv.second;
                   f = true;
               }
               s << '}';
               return s.str();
           },
           "Return the canonical string representation of this map."
    );
}


NAMESPACE_END(detail)

template <typename Map, typename holder_type = std::unique_ptr<Map>, typename... Args>
class_<Map, holder_type> bind_map(handle scope, const std::string &name, Args&&... args) {
    using KeyType = typename Map::key_type;
    using MappedType = typename Map::mapped_type;
    using Class_ = class_<Map, holder_type>;

    // If either type is a non-module-local bound type then make the map binding non-local as well;
    // otherwise (e.g. both types are either module-local or converting) the map will be
    // module-local.
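    // A sketch of what the locality probe below buys; this aside is not part of the
    // original header, and the types and modules named in it are assumptions:
    //
    //     // module A binds Widget with py::module_local(), then:
    //     py::bind_map<std::map<std::string, Widget>>(mA, "WidgetMap");
    //     // get_type_info(typeid(Widget)) reports module_local, so "WidgetMap" is
    //     // registered module-locally and cannot collide with a "WidgetMap" that
    //     // module B binds for its own Widget type.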
+ auto tinfo = detail::get_type_info(typeid(MappedType)); + bool local = !tinfo || tinfo->module_local; + if (local) { + tinfo = detail::get_type_info(typeid(KeyType)); + local = !tinfo || tinfo->module_local; + } + + Class_ cl(scope, name.c_str(), pybind11::module_local(local), std::forward(args)...); + + cl.def(init<>()); + + // Register stream insertion operator (if possible) + detail::map_if_insertion_operator(cl, name); + + cl.def("__bool__", + [](const Map &m) -> bool { return !m.empty(); }, + "Check whether the map is nonempty" + ); + + cl.def("__iter__", + [](Map &m) { return make_key_iterator(m.begin(), m.end()); }, + keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */ + ); + + cl.def("items", + [](Map &m) { return make_iterator(m.begin(), m.end()); }, + keep_alive<0, 1>() /* Essential: keep list alive while iterator exists */ + ); + + cl.def("__getitem__", + [](Map &m, const KeyType &k) -> MappedType & { + auto it = m.find(k); + if (it == m.end()) + throw key_error(); + return it->second; + }, + return_value_policy::reference_internal // ref + keepalive + ); + + cl.def("__contains__", + [](Map &m, const KeyType &k) -> bool { + auto it = m.find(k); + if (it == m.end()) + return false; + return true; + } + ); + + // Assignment provided only if the type is copyable + detail::map_assignment(cl); + + cl.def("__delitem__", + [](Map &m, const KeyType &k) { + auto it = m.find(k); + if (it == m.end()) + throw key_error(); + m.erase(it); + } + ); + + cl.def("__len__", &Map::size); + + return cl; +} + +NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/python/src/tensorflow.cpp b/python/src/tensorflow.cpp new file mode 100644 index 000000000..12e64fa4f --- /dev/null +++ b/python/src/tensorflow.cpp @@ -0,0 +1,224 @@ +#include +#include +#include +#include +#include +#include "triton/codegen/selection/selection.h" +#include "triton/runtime/function.h" +#include "triton/lang/lang.h" +#include "triton/driver/device.h" +#include "triton/driver/stream.h" +#include "triton/driver/kernel.h" +#include "triton/driver/module.h" +#include "triton/ir/module.h" +#include "triton/ir/function.h" +#include "triton/tools/bench.hpp" + +typedef struct yy_buffer_state * YY_BUFFER_STATE; +extern int yyparse(); +extern YY_BUFFER_STATE yy_scan_string(const char * str); +extern void yy_delete_buffer(YY_BUFFER_STATE buffer); +extern triton::lang::translation_unit *ast_root; + +using namespace triton; + +inline std::string to_tf_ty(ir::type *ty) { + if(ty->is_integer_ty(1)) + return "bool"; + if(ty->is_integer_ty(8)) + return "int8"; + if(ty->is_integer_ty(16)) + return "int16"; + if(ty->is_integer_ty(32)) + return "int32"; + if(ty->is_integer_ty(64)) + return "int64"; + if(ty->is_half_ty()) + return "float16"; + if(ty->is_float_ty()) + return "float32"; + if(ty->is_double_ty()) + return "float64"; + if(ty->is_pointer_ty()) + return "Tensor"; + throw std::runtime_error("unknown type"); +} + +inline std::string to_tf_scalar_ty(ir::type *ty) { + if(ty->is_pointer_ty()) + return to_tf_ty(ty->get_pointer_element_ty()); + else { + return to_tf_ty(ty); + } +} + +inline std::string ref_to_tf_ty(ir::type *ty) { + std::string res = to_tf_ty(ty); + if(ty->is_pointer_ty()) + res = "const " + res + "&"; + return res; +} + +inline triton::lang::translation_unit *make_ast(const char *src) { + YY_BUFFER_STATE buffer = yy_scan_string(src); + yyparse(); + yy_delete_buffer(buffer); + triton::lang::translation_unit *program = ast_root; + return program; +} + +inline std::unique_ptr make_ir(ir::context& ctx, 
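// Illustrative aside, not in the original file: make_ast above drives the generated
// Flex/Bison front-end. yy_scan_string() points the lexer at an in-memory buffer,
// yyparse() fills the global ast_root, and the buffer is then freed. A hedged sketch
// of the two-step pipeline these helpers enable, using only names declared above
// (the kernel source string is a placeholder):
//
//     triton::lang::translation_unit *ast = make_ast("void kernel(...) { ... }");
//     triton::ir::context ctx;
//     auto module = make_ir(ctx, ast);   // Triton-IR, ready for the codegen passes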
triton::lang::translation_unit *program) { + // create Triton-IR from AST + ir::module* module = new ir::module("", ctx); + program->codegen(module); + return std::unique_ptr(module); +} + +std::string make_tensorflow_src(const std::string src, + const std::vector& outputs, + const std::string& macro) { + triton::lang::translation_unit *ast = make_ast(src.c_str()); + triton::ir::context context; + std::unique_ptr ir = make_ir(context, ast); + // extract function signature + ir::function* fn = ir->get_function_list().front(); + ir::function_type* fn_ty = fn->get_fn_type(); + // numberof arguments + size_t n_args = fn_ty->get_num_params(); + size_t n_outputs = outputs.size(); + // extract function name + std::string name = fn->get_name(); + name[0] = static_cast(std::toupper(name[0])); + std::string classname = name + "Op"; + // extract argument name + std::vector arg_names; + for(ir::argument *arg: fn->args()) + arg_names.push_back(arg->get_name()); + // cached int to str + std::vector str_i; + for(size_t i = 0; i < fn_ty->get_num_params(); i++) + str_i.push_back(std::to_string(i)); + // index of tensors + std::vector ptr_idx; + for(unsigned i = 0; i < fn_ty->get_num_params(); i++) + if(fn_ty->get_param_ty(i)->is_pointer_ty()) + ptr_idx.push_back(i); + // extract tensorflow types + std::vector tf_scalar_tys; + std::transform(fn_ty->params_begin(), fn_ty->params_end(), std::back_inserter(tf_scalar_tys), to_tf_scalar_ty); + std::vector tf_cref_tys; + std::transform(fn_ty->params_begin(), fn_ty->params_end(), std::back_inserter(tf_cref_tys), ref_to_tf_ty); + + std::ostringstream oss; + + std::string result = R"( +#include "triton/driver/buffer.h" +#include "triton/driver/backend.h" +#include "triton/driver/stream.h" +#include "triton/runtime/function.h" + +#define EIGEN_USE_GPU +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/util/tensor_format.h" +#include "tensorflow/core/framework/common_shape_fns.h" + +using namespace tensorflow; +using GPUDevice = Eigen::GpuDevice; +namespace rt = triton::runtime; +namespace drv = triton::driver; + +std::string src = R"TTKERNSRC( )" + src + ")TTKERNSRC\";" + R"( + +class )" + classname + R"(: public OpKernel { + public: + explicit )" + classname + R"((OpKernelConstruction* context) + : OpKernel(context), fn_(src) { } + + void Compute(OpKernelContext* context){ + + // get device/stream + GPUDevice device = context->eigen_device(); + drv::cu_stream sstream(device.stream(), false); + drv::context* ctx = sstream.context(); + drv::stream* stream = &sstream; + + // extract inputs)"; +for(unsigned i = 0; i < n_args; i++){ + std::string suffix = ""; + std::string ty = tf_cref_tys[i]; + if(!fn_ty->get_param_ty(i)->is_pointer_ty()) + suffix = ".scalar<" + ty + ">()()"; + result += R"( + )" + ty + " " + arg_names[i] + " = context->input(" + str_i[i] + ")" + suffix + ";"; +} + +result += R"( + + // extract outputs)"; +for(unsigned i = 0; i < n_outputs; i++) + result += R"( + context->set_output()" + str_i[i] + ", " + arg_names[outputs[i]] + ");"; + +result += R"( + + // wrap tensors)"; +for(size_t i: ptr_idx) +result += R"( + drv::cu_buffer cu_)" + arg_names[i] + "(ctx, " + arg_names[i] + ".tensor_data().size(), (CUdeviceptr)" + arg_names[i] + R"(.tensor_data().data(), false);)"; + + +std::regex regex("#([a-zA-Z]([a-zA-Z]|[0-9])*)"); +std::string 
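// Illustrative aside, not in the original file: the regex above rewrites every "#NAME"
// token of the launch-grid macro into a lookup in the autotuned parameter map. For
// example, the grid macro used by the dot example earlier in this series,
//
//     "(M + #TM - 1)/#TM, (N + #TN - 1)/#TN, 1"
//
// becomes
//
//     "(M + x.at(\"TM\") - 1)/x.at(\"TM\"), (N + x.at(\"TN\") - 1)/x.at(\"TN\"), 1"
//
// which is then pasted verbatim into the generated grid lambda below as C++ source.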
grid_str = std::regex_replace(macro, regex, "x.at(\"$1\")"); + +result += R"( + + // create launch grid; + auto grid = [&](const rt::params_t& x) { return rt::grid_t{)" + grid_str + R"(}; };)"; + +result += R"( + + // execute function + fn_({ + )"; +for(unsigned i = 0; i < n_args; i++){ + std::string arg = arg_names[i]; + if(fn_ty->get_param_ty(i)->is_pointer_ty()) + arg = "&cu_" + arg; + if(i > 0) + result += ", "; + result += arg; +} +result += R"( + }, grid, stream); + + } + +private: + rt::function fn_; +}; + +REGISTER_KERNEL_BUILDER(Name(")" + name + "\").Device(DEVICE_GPU), " + classname + R"(); + +REGISTER_OP(")" + name + "\")\n"; +for(size_t i = 0; i < tf_scalar_tys.size(); i++){ + bool is_output = std::find(outputs.begin(), outputs.end(), i) != outputs.end(); + std::string mode = is_output ? "Output" : "Input" ; + std::string arg_name = arg_names[i]; + std::transform(arg_name.begin(), arg_name.end(), arg_name.begin(), [](char c) { return std::tolower(c);}); + result += " ." + mode + "(\"" + arg_name + ": " + tf_scalar_tys[i] + "\")\n"; +} +result += ";\n"; + + + return result; +} + + +PYBIND11_MODULE(libtriton, m) { + m.doc() = "Python bindings to the C++ Triton API"; + m.def("make_tensorflow_src", &make_tensorflow_src, "Creates C++ source code for a custom Tensorflow op corresponding to the specified Triton kernel"); +} From c7cb5f82ad816f8050857d8b887a66016980dcdf Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 16 Aug 2019 15:56:58 -0700 Subject: [PATCH 297/494] [general] removed LLVM #include's in all Triton headers --- examples/cpp/dot.cpp | 2 - include/triton/codegen/selection/selection.h | 87 ++++---- include/triton/codegen/selection/target.h | 90 +++++---- include/triton/ir/builder.h | 8 +- include/triton/ir/constant.h | 5 +- include/triton/ir/instructions.h | 80 ++++---- include/triton/runtime/jit.h | 1 + lib/codegen/analysis/shmem/allocation.cpp | 1 + lib/codegen/analysis/shmem/info.cpp | 1 + lib/codegen/analysis/tune.cpp | 3 +- lib/codegen/selection/selection.cpp | 94 ++++++++- lib/codegen/transform/peephole.cpp | 3 +- lib/codegen/transform/reassociate.cpp | 2 +- lib/ir/builder.cpp | 88 ++++----- lib/ir/constant.cpp | 26 +-- lib/ir/function.cpp | 1 + lib/ir/instructions.cpp | 187 +++++++++--------- lib/ir/module.cpp | 1 + lib/lang/declaration.cpp | 1 + python/dist/triton-0.1-py3.6-linux-x86_64.egg | Bin 709047 -> 0 bytes python/examples/dot.py | 57 +++++- 21 files changed, 454 insertions(+), 284 deletions(-) delete mode 100644 python/dist/triton-0.1-py3.6-linux-x86_64.egg diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index b56f6f6e1..f97cc2021 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -154,8 +154,6 @@ perf_t do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int stream->synchronize(); // run rt::function function(src(AT, BT, ty, ty, ty, 8, 8)); - std::cout << function.make_tensorflow_src({2}, "(M + #TM - 1)/#TM, (N + #TN - 1)/#TN, 1") << std::endl; - exit(EXIT_FAILURE); auto ceil = [](size_t x, size_t y) { return (x + y - 1) / y; }; auto grid = [&](const rt::params_t& x) { return rt::grid_t{ceil(M, x.at("TM")), ceil(N, x.at("TN")), 1}; }; diff --git a/include/triton/codegen/selection/selection.h b/include/triton/codegen/selection/selection.h index 785e32179..3b871dce0 100644 --- a/include/triton/codegen/selection/selection.h +++ b/include/triton/codegen/selection/selection.h @@ -1,7 +1,6 @@ #ifndef TDL_INCLUDE_CODEGEN_SELECTION_H #define TDL_INCLUDE_CODEGEN_SELECTION_H -#include "llvm/IR/IRBuilder.h" #include 
"triton/ir/context.h" #include "triton/ir/module.h" #include "triton/ir/function.h" @@ -16,6 +15,28 @@ namespace llvm{ class Constant; class LLVMContext; class Module; + class ConstantFolder; + class IRBuilderDefaultInserter; + template + class IRBuilder; + class ArrayType; + class Function; +} + +// typedefs +namespace triton{ +namespace codegen{ + typedef llvm::IRBuilder Builder; + typedef llvm::LLVMContext LLVMContext; + typedef llvm::Type Type; + typedef llvm::Value Value; + typedef llvm::Module Module; + typedef llvm::Instruction Instruction; + typedef llvm::Constant Constant; + typedef llvm::ArrayType ArrayType; + typedef llvm::Function Function; +} } namespace triton{ @@ -35,12 +56,12 @@ class info; } class target; -typedef std::vector indices_t; +typedef std::vector indices_t; struct distributed_axis { size_t contiguous; - std::vector values; - llvm::Value* thread_id; + std::vector values; + Value* thread_id; }; class tile { @@ -48,40 +69,40 @@ protected: typedef std::vector shapes_t; public: - tile(llvm::Type *ty, const shapes_t &shapes): ty_(ty), shapes_(shapes){ } - virtual void set_value(indices_t idx, llvm::Value *v) = 0; - virtual llvm::Value* get_value(indices_t idx) = 0; - llvm::Type *get_ty() const { return ty_; } + tile(Type *ty, const shapes_t &shapes): ty_(ty), shapes_(shapes){ } + virtual void set_value(indices_t idx, Value *v) = 0; + virtual Value* get_value(indices_t idx) = 0; + Type *get_ty() const { return ty_; } shapes_t get_shapes() const { return shapes_; } protected: - llvm::Type *ty_; + Type *ty_; shapes_t shapes_; }; class shared_tile: public tile { private: - void extract_constant(llvm::Value *arg, llvm::Value *&non_cst, llvm::Value *&cst); + void extract_constant(Value *arg, Value *&non_cst, Value *&cst); void extract_constant(const indices_t &arg_idx, indices_t &non_cst_idx, indices_t &cst_idx); public: - shared_tile(llvm::Type* ty, const shapes_t &shapes, llvm::Value* ptr, llvm::IRBuilder<> &builder, llvm::Value* offset = nullptr); + shared_tile(Type* ty, const shapes_t &shapes, Value* ptr, Builder &builder, Value* offset = nullptr); void set_vector_size(unsigned vector_size); void set_return_mode(bool return_vector); - void set_value(indices_t, llvm::Value *); - llvm::Value* get_ptr_to(indices_t idx); - llvm::Value* get_value(indices_t idx); - llvm::Value* get_pointer() { return ptr_; } - llvm::Value* get_offset() { return offset_; } - static llvm::Value* shared_offset(llvm::IRBuilder<>& builder, const shapes_t& shapes, indices_t idx); + void set_value(indices_t, Value *); + Value* get_ptr_to(indices_t idx); + Value* get_value(indices_t idx); + Value* get_pointer() { return ptr_; } + Value* get_offset() { return offset_; } + static Value* shared_offset(Builder& builder, const shapes_t& shapes, indices_t idx); private: - llvm::Value *ptr_; + Value *ptr_; bool return_vector_; - llvm::Value *offset_; - llvm::IRBuilder<> &builder_; - std::map ptr_cache_; + Value *offset_; + Builder &builder_; + std::map ptr_cache_; unsigned vector_size_; }; @@ -90,16 +111,16 @@ class distributed_tile: public tile{ typedef std::vector axes_t; typedef std::vector ordered_indices_vec_t; typedef std::map indices_map_t; - typedef std::map values_map_t; + typedef std::map values_map_t; private: void init_indices(); - llvm::Type *make_vector_ty(llvm::Type *ty, size_t vector_size); + Type *make_vector_ty(Type *ty, size_t vector_size); public: - distributed_tile(llvm::Type *ty, const shapes_t& shapes, const axes_t &axes, llvm::IRBuilder<> &builder, bool vectorize); - void 
set_value(indices_t idx, llvm::Value *v); - llvm::Value* get_value(indices_t idx); + distributed_tile(Type *ty, const shapes_t& shapes, const axes_t &axes, Builder &builder, bool vectorize); + void set_value(indices_t idx, Value *v); + Value* get_value(indices_t idx); unsigned get_linear_index(indices_t idx); indices_t get_ordered_indices(unsigned id); void for_each(std::function fn); @@ -111,25 +132,15 @@ private: values_map_t values_; ordered_indices_vec_t ordered_indices_; size_t vector_size_; - llvm::IRBuilder<> &builder_; + Builder &builder_; }; // Selection pass class selection{ - typedef std::map vmap_t; + typedef std::map vmap_t; typedef std::map tmap_t; - typedef llvm::LLVMContext LLVMContext; - typedef llvm::IRBuilder<> Builder; - typedef llvm::Type Type; - typedef llvm::Value Value; - typedef llvm::Module Module; - typedef llvm::Instruction Instruction; - typedef llvm::Constant Constant; - typedef llvm::ArrayType ArrayType; - typedef llvm::Function Function; - private: // utils Type *make_vector_ty(Type *ty, size_t vector_size); diff --git a/include/triton/codegen/selection/target.h b/include/triton/codegen/selection/target.h index c080d1c07..5a0a84694 100644 --- a/include/triton/codegen/selection/target.h +++ b/include/triton/codegen/selection/target.h @@ -4,14 +4,36 @@ #include #include #include -#include "llvm/IR/IRBuilder.h" namespace llvm{ -class Instruction; -class Value; -class Module; -class LLVMContext; -class Function; + class Type; + class Value; + class Instruction; + class Constant; + class LLVMContext; + class Module; + class ConstantFolder; + class IRBuilderDefaultInserter; + template + class IRBuilder; + class ArrayType; + class Function; +} + +// typedefs +namespace triton{ +namespace codegen{ + typedef llvm::IRBuilder Builder; + typedef llvm::LLVMContext LLVMContext; + typedef llvm::Type Type; + typedef llvm::Value Value; + typedef llvm::Module Module; + typedef llvm::Instruction Instruction; + typedef llvm::Constant Constant; + typedef llvm::ArrayType ArrayType; + typedef llvm::Function Function; +} } namespace triton{ @@ -21,13 +43,13 @@ class target { public: target(bool is_gpu): is_gpu_(is_gpu){} virtual ~target() {} - virtual void set_kernel(llvm::IRBuilder<>& builder, llvm::LLVMContext &ctx, llvm::Module *module, llvm::Function* fn) = 0; - virtual llvm::Instruction* add_barrier(llvm::Module *module, llvm::IRBuilder<>& builder) = 0; - virtual llvm::Instruction* add_memfence(llvm::Module *module, llvm::IRBuilder<>& builder) = 0; - virtual llvm::Value* get_global_offset(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned stride, unsigned ax) = 0; - virtual llvm::Value* get_local_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax) = 0; - virtual llvm::Value* get_block_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax) = 0; - virtual llvm::Value* get_num_blocks(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax) = 0; + virtual void set_kernel(Builder& builder, LLVMContext &ctx, Module *module, Function* fn) = 0; + virtual Instruction* add_barrier(Module *module, Builder& builder) = 0; + virtual Instruction* add_memfence(Module *module, Builder& builder) = 0; + virtual Value* get_global_offset(Module *module, Builder& builder, unsigned stride, unsigned ax) = 0; + virtual Value* get_local_id(Module *module, Builder& builder, unsigned ax) = 0; + virtual Value* get_block_id(Module *module, Builder& builder, unsigned ax) = 0; + virtual Value* get_num_blocks(Module *module, Builder& builder, unsigned ax) = 0; bool 
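  // Illustrative aside, not in the original header: `target` is the backend-dispatch
  // seam. Codegen asks it for thread/block ids, offsets and barriers, and each subclass
  // (amd_cl, nvidia_cu, cpu below) lowers them differently. A hedged sketch of how a
  // pass might consume it (the helper function is invented for illustration):
  //
  //     void emit_sync(triton::codegen::target *tgt, Module *mod, Builder &b) {
  //       if (tgt->is_gpu())
  //         tgt->add_barrier(mod, b);   // e.g. a block-wide barrier on GPU targets
  //     }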
is_gpu() const; private: @@ -37,37 +59,37 @@ private: class amd_cl_target: public target { public: amd_cl_target(): target(true){} - void set_kernel(llvm::IRBuilder<>& builder, llvm::LLVMContext &ctx, llvm::Module *module, llvm::Function* fn); - llvm::Instruction* add_barrier(llvm::Module *module, llvm::IRBuilder<>& builder); - llvm::Instruction* add_memfence(llvm::Module *module, llvm::IRBuilder<>& builder); - llvm::Value* get_global_offset(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned stride, unsigned ax); - llvm::Value* get_local_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax); - llvm::Value* get_block_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax); - llvm::Value* get_num_blocks(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax); + void set_kernel(Builder& builder, LLVMContext &ctx, Module *module, Function* fn); + Instruction* add_barrier(Module *module, Builder& builder); + Instruction* add_memfence(Module *module, Builder& builder); + Value* get_global_offset(Module *module, Builder& builder, unsigned stride, unsigned ax); + Value* get_local_id(Module *module, Builder& builder, unsigned ax); + Value* get_block_id(Module *module, Builder& builder, unsigned ax); + Value* get_num_blocks(Module *module, Builder& builder, unsigned ax); }; class nvidia_cu_target: public target { public: nvidia_cu_target(): target(true){} - void set_kernel(llvm::IRBuilder<>& builder, llvm::LLVMContext &ctx, llvm::Module *module, llvm::Function* fn); - llvm::Instruction* add_barrier(llvm::Module *module, llvm::IRBuilder<>& builder); - llvm::Instruction* add_memfence(llvm::Module *module, llvm::IRBuilder<>& builder); - llvm::Value* get_global_offset(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned stride, unsigned ax); - llvm::Value* get_local_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax); - llvm::Value* get_block_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax); - llvm::Value* get_num_blocks(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax); + void set_kernel(Builder& builder, LLVMContext &ctx, Module *module, Function* fn); + Instruction* add_barrier(Module *module, Builder& builder); + Instruction* add_memfence(Module *module, Builder& builder); + Value* get_global_offset(Module *module, Builder& builder, unsigned stride, unsigned ax); + Value* get_local_id(Module *module, Builder& builder, unsigned ax); + Value* get_block_id(Module *module, Builder& builder, unsigned ax); + Value* get_num_blocks(Module *module, Builder& builder, unsigned ax); }; class cpu_target: public target { public: cpu_target(): target(false){} - void set_kernel(llvm::IRBuilder<>& builder, llvm::LLVMContext &ctx, llvm::Module *module, llvm::Function* fn); - llvm::Instruction* add_barrier(llvm::Module *module, llvm::IRBuilder<>& builder); - llvm::Instruction* add_memfence(llvm::Module *module, llvm::IRBuilder<>& builder); - llvm::Value* get_global_offset(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned stride, unsigned ax); - llvm::Value* get_local_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax); - llvm::Value* get_block_id(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax); - llvm::Value* get_num_blocks(llvm::Module *module, llvm::IRBuilder<>& builder, unsigned ax); + void set_kernel(Builder& builder, LLVMContext &ctx, Module *module, Function* fn); + Instruction* add_barrier(Module *module, Builder& builder); + Instruction* add_memfence(Module *module, Builder& builder); + 
Value* get_global_offset(Module *module, Builder& builder, unsigned stride, unsigned ax); + Value* get_local_id(Module *module, Builder& builder, unsigned ax); + Value* get_block_id(Module *module, Builder& builder, unsigned ax); + Value* get_num_blocks(Module *module, Builder& builder, unsigned ax); }; } diff --git a/include/triton/ir/builder.h b/include/triton/ir/builder.h index d3f5e7be4..a104cc6b4 100644 --- a/include/triton/ir/builder.h +++ b/include/triton/ir/builder.h @@ -59,7 +59,7 @@ public: value* create_cond_br(value *cond, basic_block* if_dest, basic_block* else_dest); value* create_ret_void(); // Cast instructions - value *create_cast(cast_inst::op_t op, value *v, type *dst_ty, const std::string &name = ""); + value *create_cast(cast_op_t op, value *v, type *dst_ty, const std::string &name = ""); value* create_si_to_fp(value *src, type *dst_ty, const std::string &name = ""); value* create_ui_to_fp(value *src, type *dst_ty, const std::string &name = ""); value* create_fp_to_si(value *src, type *dst_ty, const std::string &name = ""); @@ -71,7 +71,7 @@ public: // Phi instruction phi_node* create_phi(type *ty, unsigned num_reserved, const std::string &name = ""); // Binary instructions - value *create_insert_nuwnswb_binop(binary_operator::op_t op, value *lhs, value *rhs, const std::string &name, bool has_nuw, bool has_nsw); + value *create_insert_nuwnswb_binop(binary_op_t op, value *lhs, value *rhs, const std::string &name, bool has_nuw, bool has_nsw); value *create_fmul(value *lhs, value *rhs, const std::string &name = ""); value *create_fdiv(value *lhs, value *rhs, const std::string &name = ""); value *create_frem(value *lhs, value *rhs, const std::string &name = ""); @@ -89,7 +89,7 @@ public: // GEP value *create_gep(value *ptr, const std::vector& idx_list, const std::string &name = ""); // Comparison (int) - value *create_icmp(cmp_inst::pred_t pred, value *lhs, value *rhs, const std::string &name = ""); + value *create_icmp(cmp_pred_t pred, value *lhs, value *rhs, const std::string &name = ""); value *create_icmpSLE(value *lhs, value *rhs, const std::string &name = ""); value *create_icmpSLT(value *lhs, value *rhs, const std::string &name = ""); value *create_icmpSGE(value *lhs, value *rhs, const std::string &name = ""); @@ -101,7 +101,7 @@ public: value *create_icmpEQ(value *lhs, value *rhs, const std::string &name = ""); value *create_icmpNE(value *lhs, value *rhs, const std::string &name = ""); // Comparison (float) - value *create_fcmp(cmp_inst::pred_t pred, value *lhs, value *rhs, const std::string &name = ""); + value *create_fcmp(cmp_pred_t pred, value *lhs, value *rhs, const std::string &name = ""); value *create_fcmpOLT(value *lhs, value *rhs, const std::string &name = ""); value *create_fcmpOGT(value *lhs, value *rhs, const std::string &name = ""); value *create_fcmpOLE(value *lhs, value *rhs, const std::string &name = ""); diff --git a/include/triton/ir/constant.h b/include/triton/ir/constant.h index ce618d998..dea139d1d 100644 --- a/include/triton/ir/constant.h +++ b/include/triton/ir/constant.h @@ -1,9 +1,9 @@ #ifndef TDL_INCLUDE_IR_CONSTANT_H #define TDL_INCLUDE_IR_CONSTANT_H +#include "enums.h" #include "value.h" #include -#include "llvm/IR/Instructions.h" namespace triton{ namespace ir{ @@ -65,8 +65,7 @@ private: }; class constant_expression: public constant_int { - typedef llvm::BinaryOperator::BinaryOps op_t; - using llop = llvm::BinaryOperator::BinaryOps; + typedef binary_op_t op_t; private: constant_expression(op_t op, constant_int* lhs, constant_int* rhs); 
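The idiom this patch applies across both headers above is worth spelling out: forward-declare the LLVM classes and alias them inside triton::codegen, so that only the .cpp files which actually build IR include the heavy LLVM headers. A minimal self-contained sketch of the same idiom, with invented names standing in for the LLVM types:

// header.h -- forward declarations only; users of this header never see biglib's headers
namespace biglib {
  class Builder;   // opaque: only pointers and references may appear in this header
  class Value;
}

class lowering {
public:
  // Fine with a forward declaration: the signature mentions only Value* and Builder&.
  biglib::Value *emit(biglib::Builder &b);
};

// impl.cpp -- the single translation unit that pays for the heavy include:
// #include <biglib/Builder.h>
// biglib::Value *lowering::emit(biglib::Builder &b) { /* full definitions visible here */ }

The IRBuilder case needs the extra ConstantFolder/IRBuilderDefaultInserter forward declarations seen above because IRBuilder is a template whose default arguments must be spelled out before a typedef can name it.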
diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index 8bb46eb2a..e9791e2a1 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -2,15 +2,19 @@ #define TDL_INCLUDE_IR_INSTRUCTIONS_H #include +#include +#include "triton/ir/enums.h" #include "triton/ir/constant.h" #include "triton/ir/value.h" #include "triton/ir/type.h" #include "triton/ir/metadata.h" -#include "llvm/IR/Instructions.h" namespace triton{ namespace ir{ +class constant_int; +class constant; +class constant_range; class basic_block; class context; @@ -95,19 +99,18 @@ private: //===----------------------------------------------------------------------===// class binary_operator: public instruction{ public: - typedef llvm::BinaryOperator::BinaryOps op_t; - using llop = llvm::BinaryOperator::BinaryOps; + typedef binary_op_t op_t; private: std::string repr_impl() const; protected: // Constructors - binary_operator(op_t op, value *lhs, value *rhs, type *ty, const std::string &name, instruction *next); + binary_operator(binary_op_t op, value *lhs, value *rhs, type *ty, const std::string &name, instruction *next); public: // Get operand - op_t get_op() const { return op_; } + binary_op_t get_op() const { return op_; } // Bool bool is_terminator() const; @@ -127,14 +130,14 @@ public: void set_has_no_signed_wrap(bool b = true) { has_no_signed_wrap_ = b; } // Factory methods - static binary_operator *create(op_t op, value *lhs, value *rhs, + static binary_operator *create(binary_op_t op, value *lhs, value *rhs, const std::string &name = "", instruction *next = nullptr); static binary_operator *create_fneg(value *arg, const std::string &name = "", instruction *next = nullptr); static binary_operator *create_neg(value *arg, const std::string &name = "", instruction *next = nullptr); static binary_operator *create_not(value *arg, const std::string &name = "", instruction *next = nullptr); public: - op_t op_; + binary_op_t op_; bool has_no_unsigned_wrap_; bool has_no_signed_wrap_; }; @@ -146,30 +149,28 @@ public: class cmp_inst: public instruction{ public: - typedef llvm::CmpInst::Predicate pred_t; - using llop = llvm::CmpInst; - + typedef cmp_pred_t pred_t; private: std::string repr_impl() const; protected: - cmp_inst(type *ty, pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next); - static bool is_fp_predicate(pred_t pred); - static bool is_int_predicate(pred_t pred); + cmp_inst(type *ty, cmp_pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next); + static bool is_fp_predicate(cmp_pred_t pred); + static bool is_int_predicate(cmp_pred_t pred); static type* make_cmp_result_type(type *ty); public: - pred_t get_pred() const { return pred_; } + cmp_pred_t get_pred() const { return pred_; } private: - pred_t pred_; + cmp_pred_t pred_; }; class icmp_inst: public cmp_inst{ using cmp_inst::cmp_inst; public: - static icmp_inst* create(pred_t pred, value *lhs, value *rhs, + static icmp_inst* create(cmp_pred_t pred, value *lhs, value *rhs, const std::string &name = "", instruction *next = nullptr); }; @@ -177,7 +178,7 @@ class fcmp_inst: public cmp_inst{ using cmp_inst::cmp_inst; public: - static fcmp_inst* create(pred_t pred, value *lhs, value *rhs, + static fcmp_inst* create(cmp_pred_t pred, value *lhs, value *rhs, const std::string &name = "", instruction *next = nullptr); }; @@ -196,33 +197,28 @@ protected: //===----------------------------------------------------------------------===// class cast_inst: public unary_inst{ - using 
ic = llvm::Instruction::CastOps; - private: std::string repr_impl() const; -public: - typedef llvm::CastInst::CastOps op_t; - protected: - cast_inst(type *ty, value *v, const std::string &name, instruction *next, op_t op) + cast_inst(type *ty, value *v, const std::string &name, instruction *next, cast_op_t op) : unary_inst(ty, v, name, next), op_(op) { } private: - static bool is_valid(op_t op, value *arg, type *ty); + static bool is_valid(cast_op_t op, value *arg, type *ty); public: // accessors - op_t get_op() const { return op_; } + cast_op_t get_op() const { return op_; } // factory methods - static cast_inst *create(op_t op, value *arg, type *ty, + static cast_inst *create(cast_op_t op, value *arg, type *ty, const std::string &name = "", instruction *next = nullptr); static cast_inst *create_integer_cast(value *arg, type *ty, bool is_signed, const std::string &name = "", instruction *next = nullptr); private: - op_t op_; + cast_op_t op_; }; #define TRITON_IR_DECLARE_CAST_INST_SIMPL(name, op) \ @@ -232,19 +228,19 @@ class name : public cast_inst{ \ : cast_inst(ty, v, name, next, op){ } \ }; -TRITON_IR_DECLARE_CAST_INST_SIMPL(trunc_inst, llvm::Instruction::CastOps::Trunc) -TRITON_IR_DECLARE_CAST_INST_SIMPL(z_ext_inst, llvm::Instruction::CastOps::ZExt) -TRITON_IR_DECLARE_CAST_INST_SIMPL(s_ext_inst, llvm::Instruction::CastOps::SExt) -TRITON_IR_DECLARE_CAST_INST_SIMPL(fp_trunc_inst, llvm::Instruction::CastOps::FPTrunc) -TRITON_IR_DECLARE_CAST_INST_SIMPL(fp_ext_inst, llvm::Instruction::CastOps::FPExt) -TRITON_IR_DECLARE_CAST_INST_SIMPL(ui_to_fp_inst, llvm::Instruction::CastOps::UIToFP) -TRITON_IR_DECLARE_CAST_INST_SIMPL(si_to_fp_inst, llvm::Instruction::CastOps::SIToFP) -TRITON_IR_DECLARE_CAST_INST_SIMPL(fp_to_ui_inst, llvm::Instruction::CastOps::FPToUI) -TRITON_IR_DECLARE_CAST_INST_SIMPL(fp_to_si_inst, llvm::Instruction::CastOps::FPToSI) -TRITON_IR_DECLARE_CAST_INST_SIMPL(ptr_to_int_inst, llvm::Instruction::CastOps::PtrToInt) -TRITON_IR_DECLARE_CAST_INST_SIMPL(int_to_ptr_inst, llvm::Instruction::CastOps::IntToPtr) -TRITON_IR_DECLARE_CAST_INST_SIMPL(bit_cast_inst, llvm::Instruction::CastOps::BitCast) -TRITON_IR_DECLARE_CAST_INST_SIMPL(addr_space_cast_inst, llvm::Instruction::CastOps::AddrSpaceCast) +TRITON_IR_DECLARE_CAST_INST_SIMPL(trunc_inst, cast_op_t::Trunc) +TRITON_IR_DECLARE_CAST_INST_SIMPL(z_ext_inst, cast_op_t::ZExt) +TRITON_IR_DECLARE_CAST_INST_SIMPL(s_ext_inst, cast_op_t::SExt) +TRITON_IR_DECLARE_CAST_INST_SIMPL(fp_trunc_inst, cast_op_t::FPTrunc) +TRITON_IR_DECLARE_CAST_INST_SIMPL(fp_ext_inst, cast_op_t::FPExt) +TRITON_IR_DECLARE_CAST_INST_SIMPL(ui_to_fp_inst, cast_op_t::UIToFP) +TRITON_IR_DECLARE_CAST_INST_SIMPL(si_to_fp_inst, cast_op_t::SIToFP) +TRITON_IR_DECLARE_CAST_INST_SIMPL(fp_to_ui_inst, cast_op_t::FPToUI) +TRITON_IR_DECLARE_CAST_INST_SIMPL(fp_to_si_inst, cast_op_t::FPToSI) +TRITON_IR_DECLARE_CAST_INST_SIMPL(ptr_to_int_inst, cast_op_t::PtrToInt) +TRITON_IR_DECLARE_CAST_INST_SIMPL(int_to_ptr_inst, cast_op_t::IntToPtr) +TRITON_IR_DECLARE_CAST_INST_SIMPL(bit_cast_inst, cast_op_t::BitCast) +TRITON_IR_DECLARE_CAST_INST_SIMPL(addr_space_cast_inst, cast_op_t::AddrSpaceCast) //===----------------------------------------------------------------------===// // terminator_inst classes @@ -591,8 +587,8 @@ private: trans_inst(value *arg, const std::vector& perm, const std::string& name, instruction* next); std::string repr_impl() const { std::string res = "trans<"; - for(ir::constant_int *x: perm_) - res += x->repr() + ","; + //for(ir::constant_int *x: perm_) + // res += x->repr() + 
","; res[res.size()-1] = '>'; return res; } diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h index 24eaa836d..a7fb5deeb 100644 --- a/include/triton/runtime/jit.h +++ b/include/triton/runtime/jit.h @@ -25,6 +25,7 @@ namespace llvm { class Module; + } namespace triton { diff --git a/lib/codegen/analysis/shmem/allocation.cpp b/lib/codegen/analysis/shmem/allocation.cpp index 1a2d69536..ead6143b3 100644 --- a/lib/codegen/analysis/shmem/allocation.cpp +++ b/lib/codegen/analysis/shmem/allocation.cpp @@ -1,3 +1,4 @@ +#include #include "triton/codegen/analysis/shmem/allocation.h" #include "triton/codegen/analysis/shmem/liveness.h" #include "triton/codegen/analysis/shmem/info.h" diff --git a/lib/codegen/analysis/shmem/info.cpp b/lib/codegen/analysis/shmem/info.cpp index 63dbfb93f..b674560bf 100644 --- a/lib/codegen/analysis/shmem/info.cpp +++ b/lib/codegen/analysis/shmem/info.cpp @@ -1,3 +1,4 @@ +#include #include "triton/codegen/analysis/shmem/info.h" #include "triton/ir/module.h" #include "triton/ir/function.h" diff --git a/lib/codegen/analysis/tune.cpp b/lib/codegen/analysis/tune.cpp index 5caf9533a..ec67ef254 100644 --- a/lib/codegen/analysis/tune.cpp +++ b/lib/codegen/analysis/tune.cpp @@ -1,3 +1,5 @@ +#include +#include #include "triton/codegen/analysis/tune.h" #include "triton/ir/instructions.h" #include "triton/ir/type.h" @@ -7,7 +9,6 @@ #include "triton/ir/constant.h" #include "triton/driver/device.h" -#include namespace triton{ diff --git a/lib/codegen/selection/selection.cpp b/lib/codegen/selection/selection.cpp index 3b92311fc..0ca17f9e0 100644 --- a/lib/codegen/selection/selection.cpp +++ b/lib/codegen/selection/selection.cpp @@ -203,6 +203,88 @@ Value* shared_tile::get_value(indices_t idx) { return result; } +llvm::Instruction::BinaryOps llvm_op(ir::binary_op_t op) { + using llop = llvm::Instruction::BinaryOps; + using ttop = ir::binary_op_t; + switch(op) { + case ttop::Add: return llop::Add; + case ttop::FAdd: return llop::FAdd; + case ttop::Sub: return llop::Sub; + case ttop::FSub: return llop::FSub; + case ttop::Mul: return llop::Mul; + case ttop::FMul: return llop::FMul; + case ttop::UDiv: return llop::UDiv; + case ttop::SDiv: return llop::SDiv; + case ttop::FDiv: return llop::FDiv; + case ttop::URem: return llop::URem; + case ttop::SRem: return llop::SRem; + case ttop::FRem: return llop::FRem; + case ttop::Shl: return llop::Shl; + case ttop::LShr: return llop::LShr; + case ttop::AShr: return llop::AShr; + case ttop::And: return llop::And; + case ttop::Or: return llop::Or; + case ttop::Xor: return llop::Xor; + } +} + +llvm::Instruction::CastOps llvm_op(ir::cast_op_t op) { + using llop = llvm::Instruction::CastOps; + using ttop = ir::cast_op_t; + switch(op){ + case ttop::Trunc: return llop::Trunc; + case ttop::ZExt: return llop::ZExt; + case ttop::SExt: return llop::SExt; + case ttop::FPTrunc: return llop::FPTrunc; + case ttop::FPExt: return llop::FPExt; + case ttop::UIToFP: return llop::UIToFP; + case ttop::SIToFP: return llop::SIToFP; + case ttop::FPToUI: return llop::FPToUI; + case ttop::FPToSI: return llop::FPToSI; + case ttop::PtrToInt: return llop::PtrToInt; + case ttop::IntToPtr: return llop::IntToPtr; + case ttop::BitCast: return llop::BitCast; + case ttop::AddrSpaceCast: return llop::AddrSpaceCast; + } +} + +llvm::CmpInst::Predicate llvm_pred(ir::cmp_pred_t pred) { + using llop = llvm::CmpInst::Predicate; + using ttop = ir::cmp_pred_t; + switch(pred){ + case ttop::FIRST_FCMP_PREDICATE: return llop::FIRST_FCMP_PREDICATE; + case ttop::FCMP_FALSE: 
return llop::FCMP_FALSE; + case ttop::FCMP_OEQ: return llop::FCMP_OEQ; + case ttop::FCMP_OGT: return llop::FCMP_OGT; + case ttop::FCMP_OGE: return llop::FCMP_OGE; + case ttop::FCMP_OLT: return llop::FCMP_OLT; + case ttop::FCMP_OLE: return llop::FCMP_OLE; + case ttop::FCMP_ONE: return llop::FCMP_ONE; + case ttop::FCMP_ORD: return llop::FCMP_ORD; + case ttop::FCMP_UNO: return llop::FCMP_UNO; + case ttop::FCMP_UEQ: return llop::FCMP_UEQ; + case ttop::FCMP_UGT: return llop::FCMP_UGT; + case ttop::FCMP_UGE: return llop::FCMP_UGE; + case ttop::FCMP_ULT: return llop::FCMP_ULT; + case ttop::FCMP_ULE: return llop::FCMP_ULE; + case ttop::FCMP_UNE: return llop::FCMP_UNE; + case ttop::FCMP_TRUE: return llop::FCMP_TRUE; + case ttop::LAST_FCMP_PREDICATE: return llop::LAST_FCMP_PREDICATE; + case ttop::FIRST_ICMP_PREDICATE: return llop::FIRST_ICMP_PREDICATE; + case ttop::ICMP_EQ: return llop::ICMP_EQ; + case ttop::ICMP_NE: return llop::ICMP_NE; + case ttop::ICMP_UGT: return llop::ICMP_UGT; + case ttop::ICMP_UGE: return llop::ICMP_UGE; + case ttop::ICMP_ULT: return llop::ICMP_ULT; + case ttop::ICMP_ULE: return llop::ICMP_ULE; + case ttop::ICMP_SGT: return llop::ICMP_SGT; + case ttop::ICMP_SGE: return llop::ICMP_SGE; + case ttop::ICMP_SLT: return llop::ICMP_SLT; + case ttop::ICMP_SLE: return llop::ICMP_SLE; + case ttop::LAST_ICMP_PREDICATE: return llop::LAST_ICMP_PREDICATE; + } +} + /* convert ir::type to Type */ Type *selection::llvm_type(ir::type *ty, LLVMContext &ctx) { // function @@ -283,24 +365,24 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::function(inst)){ Value *lhs = value(ii->get_operand(0)); Value *rhs = value(ii->get_operand(1)); - return builder.Insert(BinaryOperator::Create(ii->get_op(), lhs, rhs)); + return builder.Insert(BinaryOperator::Create(llvm_op(ii->get_op()), lhs, rhs)); } if(auto* ii = dynamic_cast(inst)){ - CmpInst::Predicate pred = ii->get_pred(); + ir::cmp_pred_t pred = ii->get_pred(); Value *lhs = value(ii->get_operand(0)); Value *rhs = value(ii->get_operand(1)); - return builder.Insert(CmpInst::Create(Instruction::ICmp, pred, lhs, rhs)); + return builder.Insert(CmpInst::Create(Instruction::ICmp, llvm_pred(pred), lhs, rhs)); } if(auto* ii = dynamic_cast(inst)){ - CmpInst::Predicate pred = ii->get_pred(); + ir::cmp_pred_t pred = ii->get_pred(); Value *lhs = value(ii->get_operand(0)); Value *rhs = value(ii->get_operand(1)); - return builder.Insert(FCmpInst::Create(Instruction::FCmp, pred, lhs, rhs)); + return builder.Insert(FCmpInst::Create(Instruction::FCmp, llvm_pred(pred), lhs, rhs)); } if(auto* ii = dynamic_cast(inst)){ Value *arg = value(ii->get_operand(0)); Type *dst_ty = type(ii->get_type()->get_scalar_ty()); - return builder.Insert(CastInst::Create(ii->get_op(), arg, dst_ty)); + return builder.Insert(CastInst::Create(llvm_op(ii->get_op()), arg, dst_ty)); } if(auto* ii = dynamic_cast(inst)){ // get pointer diff --git a/lib/codegen/transform/peephole.cpp b/lib/codegen/transform/peephole.cpp index 6140e686c..d5d678628 100644 --- a/lib/codegen/transform/peephole.cpp +++ b/lib/codegen/transform/peephole.cpp @@ -1,3 +1,4 @@ +#include #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/codegen/transform/peephole.h" @@ -187,7 +188,7 @@ bool peephole::rewrite_gep_ptr_min_off_plus_off(ir::instruction *value, ir::buil auto z = dynamic_cast(idx); if(!z) return false; - bool is_sub = z->get_op() == ir::binary_operator::llop::Sub; + bool is_sub = z->get_op() == ir::binary_op_t::Sub; auto *lhs = dynamic_cast(z->get_operand(0)); bool is_lhs_0 
= lhs && (lhs->get_value()==0); bool is_rhs_eq_x_rhs = z->get_operand(1) == *x->idx_begin(); diff --git a/lib/codegen/transform/reassociate.cpp b/lib/codegen/transform/reassociate.cpp index 4473fe84a..6893a7a10 100644 --- a/lib/codegen/transform/reassociate.cpp +++ b/lib/codegen/transform/reassociate.cpp @@ -36,7 +36,7 @@ namespace transform{ inline ir::instruction* reassociate::is_bin_add(ir::value *x) { ir::binary_operator *bin_op = dynamic_cast(x); - bool is_bin_add = bin_op && bin_op->get_op()==llvm::Instruction::Add; + bool is_bin_add = bin_op && bin_op->get_op()== ir::binary_op_t::Add; if(is_bin_add) return (ir::instruction*)x; return nullptr; diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index 1f6aa7c54..ef2d81abf 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -1,10 +1,10 @@ #include +#include #include "triton/ir/basic_block.h" #include "triton/ir/builder.h" #include "triton/ir/constant.h" #include "triton/ir/instructions.h" #include "triton/ir/type.h" -#include "llvm/IR/Instruction.h" namespace triton{ namespace ir{ @@ -93,14 +93,14 @@ value *builder::create_ret_void() { return create_cast(OPCODE, src, dst_ty, name);\ } -DEFINE_CAST_INSTR(si_to_fp, llvm::Instruction::SIToFP) -DEFINE_CAST_INSTR(ui_to_fp, llvm::Instruction::UIToFP) -DEFINE_CAST_INSTR(fp_to_si, llvm::Instruction::FPToSI) -DEFINE_CAST_INSTR(fp_to_ui, llvm::Instruction::FPToUI) -DEFINE_CAST_INSTR(fp_ext, llvm::Instruction::FPExt) -DEFINE_CAST_INSTR(fp_trunc, llvm::Instruction::FPTrunc) +DEFINE_CAST_INSTR(si_to_fp, cast_op_t::SIToFP) +DEFINE_CAST_INSTR(ui_to_fp, cast_op_t::UIToFP) +DEFINE_CAST_INSTR(fp_to_si, cast_op_t::FPToSI) +DEFINE_CAST_INSTR(fp_to_ui, cast_op_t::FPToUI) +DEFINE_CAST_INSTR(fp_ext, cast_op_t::FPExt) +DEFINE_CAST_INSTR(fp_trunc, cast_op_t::FPTrunc) -value* builder::create_cast(cast_inst::op_t op, value *v, type *dst_ty, const std::string &name){ +value* builder::create_cast(cast_op_t op, value *v, type *dst_ty, const std::string &name){ return insert(cast_inst::create(op, v, dst_ty), name); } @@ -131,11 +131,11 @@ phi_node* builder::create_phi(type *ty, unsigned num_reserved, const std::string } // Binary -DEFINE_BINARY_FLOAT(fmul, llvm::Instruction::FMul) -DEFINE_BINARY_FLOAT(fdiv, llvm::Instruction::FDiv) -DEFINE_BINARY_FLOAT(frem, llvm::Instruction::FRem) -DEFINE_BINARY_FLOAT(fadd, llvm::Instruction::FAdd) -DEFINE_BINARY_FLOAT(fsub, llvm::Instruction::FSub) +DEFINE_BINARY_FLOAT(fmul, binary_op_t::FMul) +DEFINE_BINARY_FLOAT(fdiv, binary_op_t::FDiv) +DEFINE_BINARY_FLOAT(frem, binary_op_t::FRem) +DEFINE_BINARY_FLOAT(fadd, binary_op_t::FAdd) +DEFINE_BINARY_FLOAT(fsub, binary_op_t::FSub) // Unary DEFINE_UNARY_FLOAT(fneg) @@ -145,7 +145,7 @@ DEFINE_UNARY_FLOAT(fneg) //===----------------------------------------------------------------------===// -value* builder::create_insert_nuwnswb_binop(binary_operator::op_t op, value *lhs, +value* builder::create_insert_nuwnswb_binop(binary_op_t op, value *lhs, value *rhs, const std::string &name, bool has_nuw, bool has_nsw) { auto *clhs = dynamic_cast(lhs); @@ -180,18 +180,18 @@ value* builder::create_insert_nuwnswb_binop(binary_operator::op_t op, value *lhs } // Binary -DEFINE_NOWRAP_BINARY(mul, llvm::Instruction::Mul) -DEFINE_NOWRAP_BINARY(add, llvm::Instruction::Add) -DEFINE_NOWRAP_BINARY(sub, llvm::Instruction::Sub) -DEFINE_NOWRAP_BINARY(shl, llvm::Instruction::Shl) -DEFINE_NOWRAP_BINARY(ashr, llvm::Instruction::AShr) -DEFINE_BINARY_INT(sdiv, llvm::Instruction::SDiv) -DEFINE_BINARY_INT(udiv, llvm::Instruction::UDiv) -DEFINE_BINARY_INT(srem, 
llvm::Instruction::SRem) -DEFINE_BINARY_INT(urem, llvm::Instruction::URem) -DEFINE_BINARY_INT(and, llvm::Instruction::And) -DEFINE_BINARY_INT(or, llvm::Instruction::Or) -DEFINE_BINARY_INT(xor, llvm::Instruction::Xor) +DEFINE_NOWRAP_BINARY(mul, binary_op_t::Mul) +DEFINE_NOWRAP_BINARY(add, binary_op_t::Add) +DEFINE_NOWRAP_BINARY(sub, binary_op_t::Sub) +DEFINE_NOWRAP_BINARY(shl, binary_op_t::Shl) +DEFINE_NOWRAP_BINARY(ashr, binary_op_t::AShr) +DEFINE_BINARY_INT(sdiv, binary_op_t::SDiv) +DEFINE_BINARY_INT(udiv, binary_op_t::UDiv) +DEFINE_BINARY_INT(srem, binary_op_t::SRem) +DEFINE_BINARY_INT(urem, binary_op_t::URem) +DEFINE_BINARY_INT(and, binary_op_t::And) +DEFINE_BINARY_INT(or, binary_op_t::Or) +DEFINE_BINARY_INT(xor, binary_op_t::Xor) // Unary DEFINE_UNARY_INT(neg) DEFINE_UNARY_INT(not) @@ -209,7 +209,7 @@ value* builder::create_gep(value *ptr, const std::vector& idx_list, cons // icmp instructions //===----------------------------------------------------------------------===// -value *builder::create_icmp(cmp_inst::pred_t pred, value *lhs, value *rhs, const std::string &name){ +value *builder::create_icmp(cmp_pred_t pred, value *lhs, value *rhs, const std::string &name){ return insert(icmp_inst::create(pred, lhs, rhs), name); } @@ -219,25 +219,25 @@ value *builder::create_icmp(cmp_inst::pred_t pred, value *lhs, value *rhs, const } // Signed -DEFINE_ICMP_INSTR(SLE, llvm::ICmpInst::ICMP_SLE) -DEFINE_ICMP_INSTR(SLT, llvm::ICmpInst::ICMP_SLT) -DEFINE_ICMP_INSTR(SGE, llvm::ICmpInst::ICMP_SGE) -DEFINE_ICMP_INSTR(SGT, llvm::ICmpInst::ICMP_SGT) +DEFINE_ICMP_INSTR(SLE, cmp_pred_t::ICMP_SLE) +DEFINE_ICMP_INSTR(SLT, cmp_pred_t::ICMP_SLT) +DEFINE_ICMP_INSTR(SGE, cmp_pred_t::ICMP_SGE) +DEFINE_ICMP_INSTR(SGT, cmp_pred_t::ICMP_SGT) // Unsigned -DEFINE_ICMP_INSTR(ULE, llvm::ICmpInst::ICMP_ULE) -DEFINE_ICMP_INSTR(ULT, llvm::ICmpInst::ICMP_ULT) -DEFINE_ICMP_INSTR(UGE, llvm::ICmpInst::ICMP_UGE) -DEFINE_ICMP_INSTR(UGT, llvm::ICmpInst::ICMP_UGT) +DEFINE_ICMP_INSTR(ULE, cmp_pred_t::ICMP_ULE) +DEFINE_ICMP_INSTR(ULT, cmp_pred_t::ICMP_ULT) +DEFINE_ICMP_INSTR(UGE, cmp_pred_t::ICMP_UGE) +DEFINE_ICMP_INSTR(UGT, cmp_pred_t::ICMP_UGT) // General -DEFINE_ICMP_INSTR(EQ, llvm::ICmpInst::ICMP_EQ) -DEFINE_ICMP_INSTR(NE, llvm::ICmpInst::ICMP_NE) +DEFINE_ICMP_INSTR(EQ, cmp_pred_t::ICMP_EQ) +DEFINE_ICMP_INSTR(NE, cmp_pred_t::ICMP_NE) //===----------------------------------------------------------------------===// // fcmp instructions //===----------------------------------------------------------------------===// -value *builder::create_fcmp(cmp_inst::pred_t pred, value *lhs, value *rhs, const std::string &name){ +value *builder::create_fcmp(cmp_pred_t pred, value *lhs, value *rhs, const std::string &name){ return insert(fcmp_inst::create(pred, lhs, rhs), name); } @@ -247,12 +247,12 @@ value *builder::create_fcmp(cmp_inst::pred_t pred, value *lhs, value *rhs, const } // Ordered -DEFINE_FCMP_INSTR(OLE, llvm::FCmpInst::FCMP_OLE) -DEFINE_FCMP_INSTR(OLT, llvm::FCmpInst::FCMP_OLT) -DEFINE_FCMP_INSTR(OGE, llvm::FCmpInst::FCMP_OGE) -DEFINE_FCMP_INSTR(OGT, llvm::FCmpInst::FCMP_OGT) -DEFINE_FCMP_INSTR(OEQ, llvm::FCmpInst::FCMP_OEQ) -DEFINE_FCMP_INSTR(ONE, llvm::FCmpInst::FCMP_ONE) +DEFINE_FCMP_INSTR(OLE, cmp_pred_t::FCMP_OLE) +DEFINE_FCMP_INSTR(OLT, cmp_pred_t::FCMP_OLT) +DEFINE_FCMP_INSTR(OGE, cmp_pred_t::FCMP_OGE) +DEFINE_FCMP_INSTR(OGT, cmp_pred_t::FCMP_OGT) +DEFINE_FCMP_INSTR(OEQ, cmp_pred_t::FCMP_OEQ) +DEFINE_FCMP_INSTR(ONE, cmp_pred_t::FCMP_ONE) diff --git a/lib/ir/constant.cpp b/lib/ir/constant.cpp index 6493f23b4..4b06af60e 
100644 --- a/lib/ir/constant.cpp +++ b/lib/ir/constant.cpp @@ -145,19 +145,19 @@ uint64_t constant_expression::get_value() const { uint64_t lhs = lhs_->get_value(); uint64_t rhs = rhs_->get_value(); switch(op_) { - case llop::Add : return lhs + rhs; - case llop::Sub : return lhs - rhs; - case llop::Mul : return lhs * rhs; - case llop::UDiv : return lhs / rhs; - case llop::SDiv : return lhs / rhs; - case llop::URem : return lhs % rhs; - case llop::SRem : return lhs % rhs; - case llop::Shl : return lhs << rhs; - case llop::LShr : return lhs >> rhs; - case llop::AShr : return lhs >> rhs; - case llop::And : return lhs && rhs; - case llop::Or : return lhs || rhs; - case llop::Xor : return lhs ^ rhs; + case op_t::Add : return lhs + rhs; + case op_t::Sub : return lhs - rhs; + case op_t::Mul : return lhs * rhs; + case op_t::UDiv : return lhs / rhs; + case op_t::SDiv : return lhs / rhs; + case op_t::URem : return lhs % rhs; + case op_t::SRem : return lhs % rhs; + case op_t::Shl : return lhs << rhs; + case op_t::LShr : return lhs >> rhs; + case op_t::AShr : return lhs >> rhs; + case op_t::And : return lhs && rhs; + case op_t::Or : return lhs || rhs; + case op_t::Xor : return lhs ^ rhs; default: throw std::runtime_error("unsupported constexpr binary operator"); } } diff --git a/lib/ir/function.cpp b/lib/ir/function.cpp index 5c7ca1e2a..c15440e9d 100644 --- a/lib/ir/function.cpp +++ b/lib/ir/function.cpp @@ -1,3 +1,4 @@ +#include #include "triton/ir/function.h" #include "triton/ir/type.h" #include "triton/ir/module.h" diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 5b49e240e..074b55bb8 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -1,3 +1,4 @@ +#include #include "triton/ir/context.h" #include "triton/ir/basic_block.h" #include "triton/ir/instructions.h" @@ -87,60 +88,60 @@ phi_node* phi_node::create(type *ty, unsigned num_reserved, const std::string &n std::string binary_operator::repr_impl() const { switch(op_) { - case llop::Add : return "add"; - case llop::FAdd : return "fadd"; - case llop::Sub : return "sub"; - case llop::FSub : return "fsub"; - case llop::Mul : return "mul"; - case llop::FMul : return "fmul"; - case llop::UDiv : return "udiv"; - case llop::SDiv : return "sdiv"; - case llop::FDiv : return "fdiv"; - case llop::URem : return "urem"; - case llop::SRem : return "srem"; - case llop::FRem : return "frem"; - case llop::Shl : return "shl"; - case llop::LShr : return "lshr"; - case llop::AShr : return "ashr"; - case llop::And : return "and"; - case llop::Or : return "or"; - case llop::Xor : return "xor"; + case Add : return "add"; + case FAdd : return "fadd"; + case Sub : return "sub"; + case FSub : return "fsub"; + case Mul : return "mul"; + case FMul : return "fmul"; + case UDiv : return "udiv"; + case SDiv : return "sdiv"; + case FDiv : return "fdiv"; + case URem : return "urem"; + case SRem : return "srem"; + case FRem : return "frem"; + case Shl : return "shl"; + case LShr : return "lshr"; + case AShr : return "ashr"; + case And : return "and"; + case Or : return "or"; + case Xor : return "xor"; default: throw std::runtime_error("unknown binary operator"); } } bool binary_operator::is_int_div() const { - return op_ == llop::UDiv || op_ == llop::SDiv; + return op_ == binary_op_t::UDiv || op_ == binary_op_t::SDiv; } bool binary_operator::is_int_rem() const { - return op_ == llop::URem || op_ == llop::SRem; + return op_ == binary_op_t::URem || op_ == binary_op_t::SRem; } bool binary_operator::is_shl() const { - return op_ == llop::Shl; + 
return op_ == binary_op_t::Shl; } bool binary_operator::is_shr() const { - return op_ == llop::LShr || op_ == llop::AShr; + return op_ == binary_op_t::LShr || op_ == binary_op_t::AShr; } bool binary_operator::is_int_mult() const { - return op_ == llop::Mul; + return op_ == binary_op_t::Mul; } bool binary_operator::is_int_add_sub() const { - return op_ == llop::Add || op_ == llop::Sub; + return op_ == binary_op_t::Add || op_ == binary_op_t::Sub; } -binary_operator::binary_operator(op_t op, value *lhs, value *rhs, type *ty, const std::string &name, instruction *next) +binary_operator::binary_operator(binary_op_t op, value *lhs, value *rhs, type *ty, const std::string &name, instruction *next) : instruction(ty, 2, 1, name, next), op_(op){ set_operand(0, lhs); set_operand(1, rhs); } -binary_operator *binary_operator::create(op_t op, value *lhs, value *rhs, const std::string &name, instruction *next){ +binary_operator *binary_operator::create(binary_op_t op, value *lhs, value *rhs, const std::string &name, instruction *next){ assert(lhs->get_type() == rhs->get_type() && "Cannot create binary operator with two operands of differing type!"); return new binary_operator(op, lhs, rhs, lhs->get_type(), name, next); @@ -149,19 +150,19 @@ binary_operator *binary_operator::create(op_t op, value *lhs, value *rhs, const binary_operator *binary_operator::create_fneg(value *arg, const std::string &name, instruction *next){ assert(arg->get_type()->get_scalar_ty()->is_floating_point_ty()); value *zero = constant_fp::get_zero_value_for_negation(arg->get_type()); - return binary_operator::create(llvm::Instruction::FSub, zero, arg, name, next); + return binary_operator::create(binary_op_t::FSub, zero, arg, name, next); } binary_operator *binary_operator::create_neg(value *arg, const std::string &name, instruction *next){ assert(arg->get_type()->get_scalar_ty()->is_integer_ty()); value *zero = constant_fp::get_zero_value_for_negation(arg->get_type()); - return binary_operator::create(llvm::Instruction::Sub, zero, arg, name, next); + return binary_operator::create(binary_op_t::Sub, zero, arg, name, next); } binary_operator *binary_operator::create_not(value *arg, const std::string &name, instruction *next){ assert(arg->get_type()->is_integer_ty()); constant *mask = constant::get_all_ones_value(arg->get_type()); - return binary_operator::create(llvm::Instruction::Xor, arg, mask, name, next); + return binary_operator::create(binary_op_t::Xor, arg, mask, name, next); } //===----------------------------------------------------------------------===// @@ -171,37 +172,37 @@ binary_operator *binary_operator::create_not(value *arg, const std::string &name // cmp_inst std::string cmp_inst::repr_impl() const { switch (pred_) { - case llop::FCMP_FALSE : return "false"; - case llop::FCMP_OEQ : return "fcmp_oeq"; - case llop::FCMP_OGT : return "fcmp_ogt"; - case llop::FCMP_OGE : return "fcmp_oge"; - case llop::FCMP_OLT : return "fcmp_olt"; - case llop::FCMP_OLE : return "fcmp_ole"; - case llop::FCMP_ONE : return "fcmp_one"; - case llop::FCMP_ORD : return "fcmp_ord"; - case llop::FCMP_UNO : return "fcmp_uno"; - case llop::FCMP_UEQ : return "fcmp_ueq"; - case llop::FCMP_UGT : return "fcmp_ugt"; - case llop::FCMP_UGE : return "fcmp_uge"; - case llop::FCMP_ULT : return "fcmp_ult"; - case llop::FCMP_ULE : return "fcmp_ule"; - case llop::FCMP_UNE : return "fcmp_une"; - case llop::FCMP_TRUE : return "true"; - case llop::ICMP_EQ : return "icmp_eq"; - case llop::ICMP_NE : return "icmp_ne"; - case llop::ICMP_UGT : return "icmp_ugt"; - 
case llop::ICMP_UGE : return "icmp_uge";
-  case llop::ICMP_ULT : return "icmp_ult";
-  case llop::ICMP_ULE : return "icmp_ule";
-  case llop::ICMP_SGT : return "icmp_sgt";
-  case llop::ICMP_SGE : return "icmp_sge";
-  case llop::ICMP_SLT : return "icmp_slt";
-  case llop::ICMP_SLE : return "icmp_sle";
+  case FCMP_FALSE : return "false";
+  case FCMP_OEQ : return "fcmp_oeq";
+  case FCMP_OGT : return "fcmp_ogt";
+  case FCMP_OGE : return "fcmp_oge";
+  case FCMP_OLT : return "fcmp_olt";
+  case FCMP_OLE : return "fcmp_ole";
+  case FCMP_ONE : return "fcmp_one";
+  case FCMP_ORD : return "fcmp_ord";
+  case FCMP_UNO : return "fcmp_uno";
+  case FCMP_UEQ : return "fcmp_ueq";
+  case FCMP_UGT : return "fcmp_ugt";
+  case FCMP_UGE : return "fcmp_uge";
+  case FCMP_ULT : return "fcmp_ult";
+  case FCMP_ULE : return "fcmp_ule";
+  case FCMP_UNE : return "fcmp_une";
+  case FCMP_TRUE : return "true";
+  case ICMP_EQ : return "icmp_eq";
+  case ICMP_NE : return "icmp_ne";
+  case ICMP_UGT : return "icmp_ugt";
+  case ICMP_UGE : return "icmp_uge";
+  case ICMP_ULT : return "icmp_ult";
+  case ICMP_ULE : return "icmp_ule";
+  case ICMP_SGT : return "icmp_sgt";
+  case ICMP_SGE : return "icmp_sge";
+  case ICMP_SLT : return "icmp_slt";
+  case ICMP_SLE : return "icmp_sle";
   default: throw std::runtime_error("unreachable");
   }
 }

-cmp_inst::cmp_inst(type *ty, cmp_inst::pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next)
+cmp_inst::cmp_inst(type *ty, cmp_pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next)
   : instruction(ty, 2, 1, name, next), pred_(pred) {
   set_operand(0, lhs);
   set_operand(1, rhs);
@@ -215,23 +216,23 @@ type* cmp_inst::make_cmp_result_type(type *ty){
 }

-bool cmp_inst::is_fp_predicate(pred_t pred) {
-  return pred >= llop::FIRST_FCMP_PREDICATE && pred <= llop::LAST_FCMP_PREDICATE;
+bool cmp_inst::is_fp_predicate(cmp_pred_t pred) {
+  return pred >= FIRST_FCMP_PREDICATE && pred <= LAST_FCMP_PREDICATE;
 }

-bool cmp_inst::is_int_predicate(pred_t pred) {
-  return pred >= llop::FIRST_ICMP_PREDICATE && pred <= llop::LAST_ICMP_PREDICATE;
+bool cmp_inst::is_int_predicate(cmp_pred_t pred) {
+  return pred >= FIRST_ICMP_PREDICATE && pred <= LAST_ICMP_PREDICATE;
 }

 // icmp_inst
-icmp_inst* icmp_inst::create(pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next){
+icmp_inst* icmp_inst::create(cmp_pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next){
   assert(is_int_predicate(pred));
   type *res_ty = make_cmp_result_type(lhs->get_type());
   return new icmp_inst(res_ty, pred, lhs, rhs, name, next);
 }

 // fcmp_inst
-fcmp_inst* fcmp_inst::create(pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next){
+fcmp_inst* fcmp_inst::create(cmp_pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next){
   assert(is_fp_predicate(pred));
   type *res_ty = make_cmp_result_type(lhs->get_type());
   return new fcmp_inst(res_ty, pred, lhs, rhs, name, next);
@@ -252,45 +253,45 @@ unary_inst::unary_inst(type *ty, value *v, const std::string &name, instruction

 std::string cast_inst::repr_impl() const {
   switch (op_){
-  case ic::Trunc: return "trunc";
-  case ic::ZExt: return "zext";
-  case ic::SExt: return "sext";
-  case ic::FPTrunc: return "fp_trunc";
-  case ic::FPExt: return "fp_ext";
-  case ic::UIToFP: return "ui_to_fp";
-  case ic::SIToFP: return "si_to_fp";
-  case ic::FPToUI: return "fp_to_ui";
-  case ic::FPToSI: return "fp_to_si";
-  case ic::PtrToInt: return "ptr_to_int";
-  case ic::IntToPtr: return "int_to_ptr";
-  case ic::BitCast: return "bitcast";
-  case ic::AddrSpaceCast: return "addr_space_cast";
+  case cast_op_t::Trunc: return "trunc";
+  case cast_op_t::ZExt: return "zext";
+  case cast_op_t::SExt: return "sext";
+  case cast_op_t::FPTrunc: return "fp_trunc";
+  case cast_op_t::FPExt: return "fp_ext";
+  case cast_op_t::UIToFP: return "ui_to_fp";
+  case cast_op_t::SIToFP: return "si_to_fp";
+  case cast_op_t::FPToUI: return "fp_to_ui";
+  case cast_op_t::FPToSI: return "fp_to_si";
+  case cast_op_t::PtrToInt: return "ptr_to_int";
+  case cast_op_t::IntToPtr: return "int_to_ptr";
+  case cast_op_t::BitCast: return "bitcast";
+  case cast_op_t::AddrSpaceCast: return "addr_space_cast";
   default: throw std::runtime_error("unreachable");
   }
 }

 // TODO
-bool cast_inst::is_valid(op_t op, value *arg, type *ty) {
+bool cast_inst::is_valid(cast_op_t op, value *arg, type *ty) {
   assert(arg->get_type()->is_tile_ty() == ty->is_tile_ty());
   return true;
 }

-cast_inst *cast_inst::create(op_t op, value *arg, type *ty, const std::string &name, instruction *next){
+cast_inst *cast_inst::create(cast_op_t op, value *arg, type *ty, const std::string &name, instruction *next){
   assert(is_valid(op, arg, ty) && "Invalid cast!");
   // Construct and return the appropriate CastInst subclass
   switch (op) {
-  case ic::Trunc: return new trunc_inst (ty, arg, name, next);
-  case ic::ZExt: return new z_ext_inst (ty, arg, name, next);
-  case ic::SExt: return new s_ext_inst (ty, arg, name, next);
-  case ic::FPTrunc: return new fp_trunc_inst (ty, arg, name, next);
-  case ic::FPExt: return new fp_ext_inst (ty, arg, name, next);
-  case ic::UIToFP: return new ui_to_fp_inst (ty, arg, name, next);
-  case ic::SIToFP: return new si_to_fp_inst (ty, arg, name, next);
-  case ic::FPToUI: return new fp_to_ui_inst (ty, arg, name, next);
-  case ic::FPToSI: return new fp_to_si_inst (ty, arg, name, next);
-  case ic::PtrToInt: return new ptr_to_int_inst (ty, arg, name, next);
-  case ic::IntToPtr: return new int_to_ptr_inst (ty, arg, name, next);
-  case ic::BitCast: return new bit_cast_inst (ty, arg, name, next);
-  case ic::AddrSpaceCast: return new addr_space_cast_inst (ty, arg, name, next);
+  case cast_op_t::Trunc: return new trunc_inst (ty, arg, name, next);
+  case cast_op_t::ZExt: return new z_ext_inst (ty, arg, name, next);
+  case cast_op_t::SExt: return new s_ext_inst (ty, arg, name, next);
+  case cast_op_t::FPTrunc: return new fp_trunc_inst (ty, arg, name, next);
+  case cast_op_t::FPExt: return new fp_ext_inst (ty, arg, name, next);
+  case cast_op_t::UIToFP: return new ui_to_fp_inst (ty, arg, name, next);
+  case cast_op_t::SIToFP: return new si_to_fp_inst (ty, arg, name, next);
+  case cast_op_t::FPToUI: return new fp_to_ui_inst (ty, arg, name, next);
+  case cast_op_t::FPToSI: return new fp_to_si_inst (ty, arg, name, next);
+  case cast_op_t::PtrToInt: return new ptr_to_int_inst (ty, arg, name, next);
+  case cast_op_t::IntToPtr: return new int_to_ptr_inst (ty, arg, name, next);
+  case cast_op_t::BitCast: return new bit_cast_inst (ty, arg, name, next);
+  case cast_op_t::AddrSpaceCast: return new addr_space_cast_inst (ty, arg, name, next);
   default: throw std::runtime_error("unreachable");
   }
 }
@@ -300,9 +301,9 @@ cast_inst *cast_inst::create_integer_cast(value *arg, type *ty, bool is_signed,
   assert(arg_ty->is_int_or_tileint_ty() && ty->is_int_or_tileint_ty() && "Invalid integer cast!");
   unsigned arg_bits = arg_ty->get_scalar_ty()->get_integer_bitwidth();
   unsigned dst_bits = ty->get_scalar_ty()->get_integer_bitwidth();
-  op_t op = (arg_bits == dst_bits ? ic::BitCast :
-            (arg_bits > dst_bits ? ic::Trunc :
-            (is_signed ? ic::SExt : ic::ZExt)));
+  cast_op_t op = (arg_bits == dst_bits ? cast_op_t::BitCast :
+                 (arg_bits > dst_bits ? cast_op_t::Trunc :
+                 (is_signed ? cast_op_t::SExt : cast_op_t::ZExt)));
   return create(op, arg, ty, name, next);
 }

diff --git a/lib/ir/module.cpp b/lib/ir/module.cpp
index 678c6119d..7adcbb14a 100644
--- a/lib/ir/module.cpp
+++ b/lib/ir/module.cpp
@@ -1,3 +1,4 @@
+#include
 #include "triton/ir/basic_block.h"
 #include "triton/ir/module.h"
 #include "triton/ir/type.h"
diff --git a/lib/lang/declaration.cpp b/lib/lang/declaration.cpp
index 6e5c3204f..3f706bee1 100644
--- a/lib/lang/declaration.cpp
+++ b/lib/lang/declaration.cpp
@@ -1,3 +1,4 @@
+#include
 #include "triton/lang/statement.h"
 #include "triton/lang/declaration.h"
 #include "triton/ir/function.h"
diff --git a/python/dist/triton-0.1-py3.6-linux-x86_64.egg b/python/dist/triton-0.1-py3.6-linux-x86_64.egg
deleted file mode 100644
index 87a0f96634233209f5f9fbc5ec0380012688b069..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 709047
[base85-encoded binary payload of the deleted 709047-byte egg omitted]
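For reference, the two invariants the hunks above rely on can be restated in a small standalone sketch. This is an illustrative reconstruction, not Triton's actual header: the enum layout merely mirrors the switch above (FCMP_* and ICMP_* in two contiguous runs, so is_fp_predicate / is_int_predicate reduce to range checks), and pick_integer_cast is a hypothetical free function restating the op-selection rule from create_integer_cast (same width -> BitCast, narrowing -> Trunc, widening -> SExt or ZExt by signedness).

#include <cassert>
#include <cstdio>

// Illustrative predicate enum: FCMP_* and ICMP_* occupy two contiguous
// ranges, so classifying a predicate is two integer comparisons.
enum cmp_pred_t {
  FCMP_FALSE, FCMP_OEQ, FCMP_OGT, FCMP_OGE, FCMP_OLT, FCMP_OLE,
  FCMP_ONE, FCMP_ORD, FCMP_UNO, FCMP_UEQ, FCMP_UGT, FCMP_UGE,
  FCMP_ULT, FCMP_ULE, FCMP_UNE, FCMP_TRUE,
  ICMP_EQ, ICMP_NE, ICMP_UGT, ICMP_UGE, ICMP_ULT, ICMP_ULE,
  ICMP_SGT, ICMP_SGE, ICMP_SLT, ICMP_SLE,
  FIRST_FCMP_PREDICATE = FCMP_FALSE, LAST_FCMP_PREDICATE = FCMP_TRUE,
  FIRST_ICMP_PREDICATE = ICMP_EQ,    LAST_ICMP_PREDICATE = ICMP_SLE
};

bool is_fp_predicate(cmp_pred_t p)  { return p >= FIRST_FCMP_PREDICATE && p <= LAST_FCMP_PREDICATE; }
bool is_int_predicate(cmp_pred_t p) { return p >= FIRST_ICMP_PREDICATE && p <= LAST_ICMP_PREDICATE; }

// Hypothetical restatement of the selection rule in create_integer_cast.
enum class cast_op_t { BitCast, Trunc, SExt, ZExt };

cast_op_t pick_integer_cast(unsigned arg_bits, unsigned dst_bits, bool is_signed) {
  return arg_bits == dst_bits ? cast_op_t::BitCast   // same width: reinterpret only
       : arg_bits >  dst_bits ? cast_op_t::Trunc     // narrowing: drop high bits
       : is_signed            ? cast_op_t::SExt      // widening, signed: replicate sign bit
                              : cast_op_t::ZExt;     // widening, unsigned: zero-fill
}

int main() {
  assert(is_fp_predicate(FCMP_OLT) && !is_int_predicate(FCMP_OLT));
  assert(is_int_predicate(ICMP_SGE) && !is_fp_predicate(ICMP_SGE));
  assert(pick_integer_cast(32, 32, true)  == cast_op_t::BitCast);
  assert(pick_integer_cast(64, 32, true)  == cast_op_t::Trunc);
  assert(pick_integer_cast(16, 32, true)  == cast_op_t::SExt);
  assert(pick_integer_cast(16, 32, false) == cast_op_t::ZExt);
  std::puts("ok");
}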
zPM8Vdyr>YZ2ZSAkx1FH4rBGo|?Z!}q&{89+ACh2dWF9n#I)ssL;guK`J2!T;)Z(5+q&`OvRqCe%neNoyR9 z#+!|EF(v3EqtPZwPkB(NR*kR3$&lVVUMCbwnLu%`-%eNxF6m;7C_F7bFLZg6fwVmt*>^s(g#-W$s(sai9kxnmj$yOfRtSHOEP)6N4Vj z6z^)9Ap=?2;eJ~tOk>KpK>FC%bcE&{1HS0sao!*?C3t<7Jxs&q2cS zyd|_BVXKr$+M{lNNM9Gt-zJme!hH3Yl#GYqU@ZRRO-5}iR2Zqu_ z<$f;e>L?kX9q!d`bxn)LQN#)XlGbVYR=E{1uwN&$!7vJ0AoJ3tlES!5V=R;>-#Jxl z*( z#yG-#fua@C-(*=Ey>Tly_6*wu(MYNR=m4TGp z%pFnKXpIfkyUj^w9Mf0Sd+l0RrL7;(#AR1v%y>bEmM(k6G+QU1dH6$(I($ge9{!r2 zm!LgMJ4e<~YRTAX*~;G_oi+k!)Orey2k6i`3z$HAY)u)-^yhrl&y6`X5jM+MKwGm_ zbH4PeE|R(StI$4d2=7IS>@tr$nbQRjNgVE*lZl+d>_)iAS6!5}{fX}mS4^1L1u56z zMJl2>b!$wr>iQ@Z-Rc1%l)g^iIgRP&F-dg!^7Sj5^xe~DC;f7A_Uy2=zq-8L9FqCU zCY_vs?^n*hS-)ogymiI;?OgqW_1nedu({XoS-)Ry_L`~n>y>mcvVUyUmzT=c&sVS4 zw{KX#*86voVSU5;y+Qn&mi_C|Xq)= zOI}&p7p1ziuTq6+--;^JKS`CQeUz$A`=(T!_EoAn{hM5Q+E=amv=n&->R*Z~)HR+} zqW*oS(KnQs6^v;P>aSiq8iQLX#ca<1+QiC&F1}NllHajKyl&vnOCl?u4KkA zax%n9p}3t6LM=C)5Qb`U%ACUKIckD!U799RkuOe!l934=jZN&4bd%P&)}@OOjq!3T z84mexNw3{8k=M3p+#=7U&v*DXx8rR(XYSyxQF+o!-<~9F^@fLMu3q7- zhkRVl)@1m^R$cLu6H9v!Q!q47+0-eFW!44~x#kdBGciq+0$Q<^{J993tzTcim)b5JI~$TA zWhBenM}s7)uqn#8S|1O$k7`?28*vm-yS^d#-(nU`&LfHlNv<10u-;(W-AZ-~=25iK z>Gu*L746lrB%)xra=qD^Bx{7-)n+%5dSbLx>+PkzNul4O>hv zTgjl)Y$e^KH@4cHXuZ|%4pL%XD6_0Z0^ZXN0ga8gaW*FHw-QGy4Z4PYov+My2OXM# ze7{O8bQJB?JM*oMZLwB4$B<&XTLO@^qBQi4JprTjb=8uLmG4v zJ0(%w2}3rDwv~2zt~pAod&54-sNb&D54xs3BWa0qs)0bFX9>2cSrTsBuJGGT?p?MS zg-m<3@CIpb+nMYdQ*IQF79#a>e~|R(rX4#Xv)LcTZxMhLCgD8RIg7MEGLnmKL%viP zG5Kdq3}FmZD0Y+r>%$lj`n5WQaPQoLp z>9nVaJYX>m`~#bbEDN_%&wDac-WU=kkJ*InOvT91Y9x^?tp=tr@aTAXbUZv%ZJN0l z(&&>pn3I-5=aTswwZo(}VbeA?ABd{bOzv#EO*;8#h7}lNXA~}4G&<_F_J{pm`sE}f z+EqEuXga>)x}O0ZF)&>&0o&VC7Q;1Ba1UGM1g0GlNn_@{8fkKDW@^|y0Zh@-=r!gb*Ygb`cdWMOZ6hFeB)QUTG#MN# zz1C__IYO~qtrC@ek=)G-lkr5#*x&Ms^loO6zs0Fp>mT2Un>a&b3!r_*oU)|^Y2pa#FL9MF9ZCr@?k z(mt1+pGiaAn!_$CYroYjyc&qRJkEBD;!blX>BQqBkg1xUmpSV~jC%H|1>u{^7gfYA z8`nAC%u%YLu@t+M7A;=gTyb&`zkdDhMk-r9EQH(F`7+wowSv=cISAN7aCtBqo%R~kFcrt7QLz$(#@ z?>oh$F!5xRjN&`#ct6WaO24)vHWOiIF(-@Y*@$c-chFCJV^b3|)2{P9;PFz0Z=V{t zMxNl;vu*m2bUWQT1L-YMZU>iGF(L7-VR(PlTCIbrDC>u zp0^flG&1YD+nscjf)h`59=eWJcG6yRcog>s+y{0T(DidHDKw#t)hQ*JLJRHmU=do- zGa4q{0g=d{1*GG*@C#%XZ}-vPHoOpn#0B9qg($h!(%r7MZW#< zF^qBZ&COd%7BQ1_Sd{xcXOO4_}p{0yZw!7TU+vyta=ouQ!jeTyGvuB* z^|d6MobNogtgbFrgZb3y*bV9K^KM9~fNvdB$K-h(Pse0=9aG2Tc^zZ97ThzHypE}B zcKXfs+#Sqy%GKf3Ew+nQvN;@a8A{fx}CK3TBP3coC>=t z<>4ywF_d~;Ew!$dy{Tvp*rXe4Rc#%L2~$HTTEd znxbXiNz*U8CI$9FMtXEni6H2*@YZJzkGD+evSnj*r+bQGT=7Qk)0u zyPr^A%AQc3@dN;|qf?#%XpC3d$=efWQ4KlsDe<7D-kZp(OE%yvo}M?41DMbh+TlvC zFkT)egQ4+412*sICgUc1LW5p}?@qg%vt_<~m7kLtjbV&2Z{_-Vp5+Ot0@KMN_5q&7 zVTrXPLxOk3f*Xz)bZeorSAeu)6 zD)c}6hK3)%Qp*6CXx=->!)rVABVK?-(KjI_Ji5C+&aUl*{TVy?N7RMxI+>H{o)jpT zfe}oh@yL6ePF#G_b5;s#$XEStpqIx$6TR|Xr&mW=0|Su2?S&fXmCpuh$<&*u<XfjN>nV)= z$)F)svxZisCdet8Nb9D`sId$1)LYEy zRmjF+NxRSdtbU*QS=7Dj;4DAz-N0FX__g6{7&;HcD9bvX^;!UFrHQM!vIL&iCWU2` zo3O{RH{sLnpu_Z2iP0}(FD?tsB(e5nTt+ZBYdo=qhrz03Ck&-L?B5L|jq<&`cCOuY zUE-P~@mVf$%`0&&MB*+~p(63ZRM^f(!nV31EiY7Rxvv(vxAS>}JMUFJZ&S5p!}1bU z1AT=+9t$-&s`@5C)hkKWR4pgqG`OWa8!vj?5T8I#)tPHmjEHppE3_ky)&-?RFW!EI8$aP^` zc!lA(mD>dJ8+{B!Ziws6{G|`r|5w=PW3UseF`3i^SqMYPwFie4wb+RCy2w8{XN^Sz zRut}d*g-6$))HZL~>J{i=V8r(mROT zN4+L3OjaL<*gQdOi=QIDGzY<4I=6wT5P0wbqhaatwSZ=lMYbYH3`eICRBG@yf|MAUQ<>WJvsA&P-#=Mi#eXAvz)i{=HSt+x168fhocd>i7Y@8BF>j$~s0<>j(O zD|9D%!0w3UM_R3CU2Pmz(-FIHEto~obE+xZ)lS>fT!;wTlEK7P#GR|{{ ze{=0%X0f2s&L!N%r&5o|SkpF!qIJpSmp)d`rMxESD7~3P04g2 zG>dIurKi#P7_-y&J#=(2^Lqfcd3~3rwP~nPm>O zVRa|}Wa#jQX`wmhM^EOqg(hvvpAAM#T$7zox=cn)OQZ2|%vp7YuEe^8hVj%OqbZf% 
z;`5Qy(Bg}4dg;of4GiPafbQfwdYdJkfuCuK#>CH<&QObRrfsT4EsY_5OkF%M#gBm$ z_MVkZ$sIi_Poi^?90&sg`KzH6syksS1xR6tg0q^&s>3p_(6&c^w$W6VJUEXfhfttXo#gR{oBbyct%eV$EhC~KgM!wRFFr}eapF2xaBrwzmzpvd| zAM$5|ry@^VOrDDJ-P9kPJ#pm9TU(Ac*<>w^K?bJ+%TDd-tAwVupeHiaq_X~Ls!270 z!8Wo8|95EBmcS`v$kW0?MG?u+R$GRZm{q=7@PoL_RsK^or;MU)3{U7HTi+8~NL80k%D*U`F7!>J)i+s;6rRp{jT zB+A~j^4Xa#zN4vFJatopD6FDE35HpRfrn(HO*y#gTOT3xGzJ51&=<6=hW0RWw+A>{ zTO0St+PDpVhoOzjgSx?ZDTB7Lixe|nhn?3w`}~u8+%b;A`B!H&hqQFeLOCi)m&b?i zG+Vdxx9^N^+DW4qIO{s?ofO*JbZREiVlv6K3`()+H(@||7Inb`9GP(^`-=Nh=Mx;J ziKu!>P)KhfP^SYIUI^33qP8O3Ew$CT-fRMHClGR3<&B;zc?%C#|Fml~$ClSPZg;kA;6|_&9QWR9#Q&_mTCx;x8J$k38QM z_rY-HUK;leHVb&~h7`cXaqWO+?e+w(?gE9EgmVfWJ!gU|?wF}X<})h&BbZPbnv62T zG;CQKCaQ*P)LEHHz1f^fJEEjTT)0Lh>3LWBvZwLERM2L}$*@a?Q6iqLS4h%1C5H#1 zf+RQ1=Zc6Z|4Xd!8S;XQ{?(8bMw$pOjdPx9E*3u%<53uh>GG9A&)Qf9RMwtpr)wNG z#*{41AQ+^>6u>cDFT4PfY-?$hy?0bo-PbKlM?pYQK%_nN4g%7o!~!T?K|s3nPDFZ3 zC?XHNgLFlTi1c0q(xmrZ5(vE})Bqv5hu^(-eBT)NzV8_KkBsEx?6db=Yt1!RIGnvV z{M%@85eO1!!#Kb8b>zvQ!JcI$Npy6Spie@9X{x=}_?Y-zEtWbPoh@g`BUieLC7_0aRRsD9ZCL88UXKK`~-gaO;ealUD@C>R=|mn2x1omUR7F72MB z^XkJ~tp+W~y*Kl=zsj;mFENkZVICQ-^h@?tLnp{d%E1M}g-QqIh{Q2^ zGKPjcfz4fdin0sg{;heW+J;v|crR+6`&PWCnRnpVgE^X(lVy+K(6ETMJhq2&|FCUn z`ftmya&?v+|I2i6q!R&tC!%G+-z#FbcE!9mG0P=)5%chBmm0;D3OP+eSIlIU2N~^1 zk-IiBSG5Ne(4r{KA~UHNvf@^%>X7feYSe17`wh%LwNx@s7;jF0+TX0-86A<6bS0m3 zD!OB5l|r(^@02p_^Vw9v&q*uqqgYtZPlFsj!rPG_cT_$QduZ5L`A(d^Eg-qt&Mb=) zSpTam`{~UiuRqcX@F^y(6|IZHE-hRSv4TmFgLA=jt(59$sU zGfgpd7n{{iX@ptO%xVlx-^*B1k|i(C`kC{BcSkwuAD`!JDN-j_bkZ#%?J65Y|DJ!% zch*-a#r@MKLY=JQ2G!{;EuN6olhQU_r1y)ZhEM$#w$Et%$6e`~r^9Sp(dzEak! zdF%gVolDdxSIC&QXWR+Ib&IEcmsfjztotIwq9a#==H3`}u5^T5Wjs@5tH0hXm8(2K zj`%QGDyLv2^eVICDM{9QLA0>rJTOf{;hCzg^~@8nWR17hnzXEjv?d>%N?MN(uF31P zzWNY6{)YDJ!Ri~@t=@zQ5+60-qy)>PWKgROoZx(il7U zOQ|=>%HsIjmyx_=-!C%HWyBK7B5O?N8zI72tY7_};VqO?b3n&(ddz#g@}0Yp=kIml z7n=6V%yiAsQBXC_ZG|&(Wqum=X#E+jBog-uMS))%A2tzw-^TlY{1o)Rhh}P|m3DFq zSj;J5h{y^(lihu?xwe*V^S7+q@O%4&r%^{r;eq>QGB9@M&ZGB z+HGa3CHn07O8TjV_dhNE>RYpiGG0lJcw{Ggl=or>N$pC0I%&>!h3Rek;rdIe9q9W6 z92O?)j|vy0XI7&v1%~z?CI|0&Tr3~{;Yw^it*ZO(H1$Ra-XieGu@n-qVGt4BI-Kvq z&)($Z&i#hHXunr)K6h)^gwX}9|CV8jGw{yj{D4W3^8l-@1tW1l`YgxO0)t`($-~Fi zrPMu9{SU=rpI_0^S2sO#ODp!5A7RC2IVa>GUReChcQ^VcK=Ir^u909b<;PIkfwMnLh4n=Xg z8ohe!gz^H_RGjmKO=Q_lxrDWQAJhNV9#Rcp$|ionA1Sg8CO32Ibsom2HyBPxK2H79 zK9)jzRO8qFNZVfxVAu~yx3fZWY|L_UM5O||b4j=L(5!xk-D^_~)#yn?wOY2OJymi) z@96H&C|hwGS?bHsi&NcB=~>van!yTLI#PwPreybU%z z)k#h2XVyK#anD^dvO|I0&^x@N3W$n-({DYUFMAf7dihSbRD;aLPmYX{!sqsGn$e|v zjtf(7A22OAJ!8MsF7EANOh7 zu?MKv4{|=Hh{)Git-|g^5N{aPf3tve-gcN?`Yh2*TZS zlr6@4;$<9F6?f=jmHMjuqEKIZ*N8ZCW+9|3Kd7WsEPeLr1W{WokFDjtIyLV>&%I?C zxB8;V6Q`5@1)S~@VJgu z_mTbXp!H5X<1O^It?$=Fqd}mC=Dzkv2Q8D(YCW zUzWI7CrLkw+NJ(M)<;E8nF_c5CT75EpBAP!Iq)YZPFVjD%L}%Xe>j|b-ag2+;rYyV zSj}QwX6|Sbv0eY`C??rG*_rW=lG4Nhiu+hq)U(tvW9fFp^nq5pQV%s5tzqYr0#o9X z&+4&9$@RiTtL^9NMS~`eWx8(f89m3RlN%W-gS7l0?cwJq4WKujWk*An5Qq)OL3tIo zgG)pRUlel%d1mF68v@L&ODjlOMK!L{Wo6Ymzo{q zQ%WY+;pz6TgrqS0gIZb^)i2j=%|ZP+tYRrhNN?#~$*uXkdv3My>YQW!*@1pMUFo*j z?H}wI(Od2O1|hxZdh@A0{jB>O82D|1unCDgN6Gq ze?0z^xjHk}ptC_*yjB@su;JsWq%PwNR>^BHS`C-$;J=p1b_t(` zIeQw-Vx-}bqRwVzXdxAOO436Axnxthx%3L>*L{gEaYY#Ue#N4INvrBU5hfYcPiXnD zQKjLh!uz!DuhlYQGmz+e49`9@&CoP)qn*CoFQp3pKoa-Ldt5A?IQGg*GbwrCiA~Ae z0Il0f>Zj!BC{A(PA~mCfSI6J8HYE9WlH8>>l+{|)vGbSD^|4Zwlgj9r)r8B3C*%|X z|B$Tx;>kOS`+0D9CzE)7M`!xXUShWOQgO0EM0;eY$(+Nt6lowFs}Db0JWiKnH_@)ceE6s^mw2a4V?OJ4 zgSdlwQK^Sf)d`k9Euo`e`M*Zfp9$SZw(6k7paKokQ|_A2hqhm3D<5*H5TE zbt}^T{W|V_)R1xQeDJ3)0kX@B!{*3=SfPmrZO7Msz*_iUbedaqhwUU$ejeprcRo9D_@(AMw2UNxHLnDVW8Ixu)#sO>iuIib7Sj2*=fy 
zwL061I*eAD%&kP*c1ks~%|yp!yJDWTki75Xsh9X3clNx^#jh`F0qG*2X)d#CJl~c4 z)qXk{V}(0o0TMX7+?lr@0_Hkjt=3E~(7_Ysb{PT+OP^i^6TNpc%rCp-yG^M}VApQNh zU=*V9wK6+%W+R_~QBdBKoL*P9Ai;`%ZRGRMRfBG)tL@V|Jy}k!tA0Jd|8;+LwDT<6 zx+8ITOf%P&$mfOc$QxIiIN=2m%7r|^-pmuWHraX+smz?()lQXNO%`(WJeP*|PE3$|13N;#Xaw9rX2X zTQhp^x{CI9pH!Jy@+S_&f3*O zXmy17=bqZ$##+VnzL)VtrO~E^_R#cxMnH_Jt>CoEcG_IQpio_%?Bawg?MSH_SB~$8 z+t@x$m7lg1<{RG#l78m5_4#C(_k9DU#up6EmDc?v=%j~z6K5KY=ufmwvuPP;rp|no zxiqj;kio$$45#HccV(p_TAxFOznXKom!_dMe7~&nGe3-U%2SF^{NXtgEizVn@;UMs z(a&o-t>+n*;dQs=R~9?vG}73yS)XI|%pP;h_*lO^EYo8~ zrH?g=al%@zWWZ}$aKZ40(es=sYB6v=ELKRZjz@HQLVum{-jes6l7`Ed{EtHkevCCk z4_=Nr_i^~FR6Z0&%rzbS(VF(G%_!L+-~P3nQDR#vfMnj481(w|Ec7hukJqh3>J(mE z;8{%sPT5I5!SVie1H5-|8UdK z-VZC^R!z zu0P0;TqD!2ly2p=!g5OxMg3WxVOI25FAS?lnzYkZKw;LcXW0ZPR*F3dD4~J=_LouWjn*&d#a=_Gug{_ zgcG}kbKY7H@%^!R(Cy6fvY*c=*6O>eerArts7iO8ufx1#o|(?Ru#w?hHCaAz!)gDM z$-w-W*uI9Q!K^(b%!abpcfxpQT+&J>hVzd_&2~w>dca$`K3T!s`-lv=h5KsTujs2EZ<#LdAXJ5R$)PK_GZSB^Tl}Cg55%! zB2%kID@U5IU6oT&{+WZvq|?++sR)>g-&kIiTJ_KL*URU|F&<7#B4%Iu(liMnZ07=F zmKXGr$4gAS^=F)>dGB5VFPk(z7eA4=iFIH9O{7km{(P#XG!&L*Jc)?Z33L8j_|11f z%gDiRnKVX`B1DCaE=Gku#`UB*vyLbf*T(7E9!YO~_jqAtFE!qKKWa@XOrDX1tGcxQU&tPub6xI}OSu@&%jFOKs#K*S zk2CwI@MO~10gqZsPTM%hZq(iX+$m}k@R@7g@Rap|t3E(Y3wxfX^n326s`6aEUov8! z@GaNaToO!ic37;J@4K&jaLwB0*_L!xmAPIZ-B&dyxP&Q6TdA;V`jk~p@wRQ?#PX8w zKWms!HH& z!~UJqMTo~&R$%4s&bYG%r|3sjwpZWgoVLGcX#MSsnjH@5h$^L>fiB8N`%cge7&~U! zuzpcBkD^y>dHpP&uIcmKaA@LD7q=U$X1& zF0gjS+|N&rg-LN$84w1@{wKjXYWk#O>2P zDqP21sA9Wf+$88U_@l%mKEzVxj9bzV5C?YJ7tGpQ)botJHAr-$NKN66Sho&0tDrmN z^6xSa>XCWZewM$WkO2|slu9v5VyBE)6ES)!S^h8T5sibDBXyDBtkK)Yj(cCadV^nX z)bK`7=|&eDZXW zojT>5D^E;%s+R$`zf{`b;6kCU_pC^HaE5r-{?-|-i?%>Yh%D2w5_M*`Z)9jORc^R1 zmxMvoOYgo7x9W}#LQ|>X^D@-Al15<4pBe+@haPu_%*l^uhiocMI3R;%-c0G@c3N-N_dm;x=GI`rsb?)!MFUW}(qwS*^% zzT0iw>A)jVW@)KK?9q5ewN&B}%ZYP&aXb%dHM$@&$9}(h>6utl8F7m%%*DvHKHab) ztVU#oKJ(up!%873&E2bt#@m7XvA;^^ID8`%2pzFR4_YLO=8oJs zyN&-h?91K?t7SY{fX%4n=h6bsx{h~F4x!VpI8?%@ZTC(6ToD{G{gyG}2jf3~55IZ= zk$Oav^nCHK_I1i>c;BmfzM5;avB*)1gS+8729&HYRW{o_(3KW+wyf4s-A(?|ev(MC zW>VPYfOqKZ)%WeuF?N9rt-AAeXzlIQ}= zIG&fwYSAd>{V+1TGnyxF3EKT(^q{U&B=SY`>z0@1S&r0kidEbh{r-6`++Bx;S=}z0 zLx(HVif#BG#6&+(^eH@iN>e*j7wR=D7AdSkGwgpi3ka{Sz>!|#l&xH3{dVHmR8SIw zuWy>{^Rif9(QKo#MMf0Lt*NFP7U5sDN3L1;iwo*7J0_*58TO)X%Nk8I<&PYCKWL-Z zZLsR{$hB`c1W|iv;$<>mP>Jk#3}k&)soZLCQF zb;&L(Thmt)-iT*-NJojZRHxL=X48Z6zSUA#%DX!E)-pN2-sFuo7W&w98gSe5jiBAi zqI;TvAFG~Z>zgPYy^}??Q>_0@Fx`OF$@(x-E~;C&%R(l=D*tDZ)VZS8mq8@ zyt8_AUisuen=I7&mXu!?opsI+S3FREj1P?pC#4c%!o8z{Z_L18murmDkw! 
zU`V;p-qkO1qp;}aEz6GrBDX9wTBAKW@~5XP$07GExwGw@LU#5y8y*;huWKiys`1kToPf$2s|8wRtZ19hwJ4&|Cm zX^G|{xebw7qA;BcopBKw$Gxph6V?`x?8RM%*E(pjF1~~v*Ip}*86GPUuQ*##yaqO^ zTI@jfnV0X%PJ=90ws&gpj9_oB&094*yW4E4K%OwSl|Z&F=*Ro28IF0)x}rq>Yg!uj z%sQ9pHx8Pl5TSW>+3)jWLnhN4Gd}gRWhimx>Uy|G+?|yMQ!qUDcz8MT zE7L>?b$oI2!=k!+*eZNzZxamA1{34$}H=bXoygAp{-$YC6G%bs68b}~wCSSnKf={p^9Vw1=1 zlK!vgh0=JUwklTf;TcWb?53LRf_<`qN7qnAisPVE)&%`^26Io+GT+i}L{aM8I*R%5 zw0Ha9t;VOHQ(#~XwI5?L+X4YgY9CMDZ0Y39&Ncm`VPN)A8~z34asihl<1`fRUFEj0i8iN~1-5R|zWO{G81>V?93*09Tr`oWd_{brsnDW%`( zDn{;E84mZ$C!NkNe`qD%x$`^CwcxkiomYd0%S=+utGAhqEpa^$@OSM-)J+{D$cv6< z3QTB9lao0gz3`TAkIxSBYm+XU3U#N1htjdQ=+!w7!cHFwKROsNiD2iqc3&ZWh4PaV zhzv0ke-Rfnde>>^I5v{JF!H|QpOQjHp@rBTnHC59MfRsbb4JTeZ!-D%24$Rr);n0v zP8Q#%*H-uV-b~o_SZ0qm$p1O0_%=WI!e;Z7s6_awu3S~achF$dN-K%CRV44ZZEL7y z=)!xwOUlHP)j#W$X4|4$<0HRZmv}rf@M6#luNNDCaLX0jvy;>)UMi2Tp}n|Hd8@7D&2X z$Jc1`wcMx1NxwPf7@nRMJ`hQC_RRNCz)G# zVs}hfb3f6x;_K1XZsSPR98X~V7`*OwtCr{UmZcig!S9BH7q8eq^gk3mPe_0ieiXIh zKOK^{+RJ254`!Nek#>8*MdLG<}PokvYOQP5_{FO+UbwWcvsWtxK)?&ty@F1 z=#S&c0Y6#lR)t;19i2=5eb&j2-sz#*cx<~{{%0>Dn`OgxDvnP=hr;irwJNq7GZQB@ z=WwO}iJS58e1WlPj5!_>-v#EeT8S(9(5h|HY0cM_rzBZfq>SxfN;dOUsgE81^W3@1 zIL+(zrBiG2BP`}m1)MH~0WmjLyH7dAtmyrO#}zSmk9_!SKIUZLU6YF=YPF<~rP6d- zt7&N2?3K{+m&=Ln5EJRl>I>=d*+moQL7Mxl{>eqpD}LUdGPQFmHVmZwP(7nXZG%u7 z-=~RN6W~=Oe#@sva-s4d9c735`}c$J4^v_1V$Nn4Uv5ZLS@yl2!SL9Tx&`q5BF9qL zGi(~h(%r*Lo^`{;^(??4b$-3(w6 z>h$|6la*|Tqh3ZZk7x67i>Xcum+A}SS?dhe=2fy?Hb@;_`McQQ?vr$oY497iKT=v*Hs+LG|JHFGeVbhWV~C-`z@A(TT~*%+D6<6AG2` z-~KB1_`xd7%^;3z_+F(f9b7AxB6@I!uRXUac~GJ@s!CkCSJj}Hj?jABGYOhflfaRy zz@7U(<;bYKHn+x=*F{7`aAIzHA<5$_%9-QOs`7Szzh6W& zT6K$zCZV6i;R%g6|LbO#>Pj$Y9GGC;mG7l#PUOGu6>dqlFdlJOx@oSBYQgAp~BYcUf zj!l@pa>n~Ls4rNPz$684TVfa{mamzvf4?~Qp5D7L^+WEA?D72v0s3$2M@ORy$re4|sbVB4`ft?$>-_MCq;1QKSXLb#ZnscBIQibmLEEKav6tB z{I8$2B5?TA4Bgcgvn%qY~%wlUBMwOOo~uz97&bvh9JMIC=z zHAk20aO5twS}7z=*}<1uzBeu+v4!j8==PVt^A}&g^^e?I zK3OnGs44oy%KiL`V)U-?Oi}vhmgJ&`>zhZ+W{FSE(p#v_XtT`vJk$EnRio(5uV)g} z-RjW8buA{QJc|Uelw4fa*tHyuAfRH$xVAN zC*YxD`DjyT^w6BQ_A2q`!f$8M+x-Ub9ja1~d{vuUVm_)WvRkhw253wk4kRjZAMM}Q zI5wp8N#D+>-%I-%Yfk3?;!OeVA`QzgWu8kcwZ6NIWB`dm%&;vOU-a?nX7@-`?*Gdd z?$MwV=m`u|c_JPq=M#gKxCq_mRWBGx!>x}GT%4OHwLvvAAbQ0owFTND3~>CHlUAJO zD|15*XrLzUdVO3p^3d~W#QZ`Xgi$L#nJNIk-4w8DBhAHuO*SBH#2xff!%1?3~23N z>eZVZFyiz`adLzz7WCzm8%6U?9^Q@SeZv|Iy@i@EU&ZMDHAjqm@W1O*Of?_cu1$!o zY}Y1kIay+c(jjbuVPrSGvH-oBkzwu!8xTS+7~y~*Dzl;m z8iKq64%rB>h3zzixY+hXB5XqYAtAQ79ZraG!i)66NqtNa=ihOpIMzv6JKM8sN-X;% zbP)aO4W0P(&l?OP#^6OzI(e)37l`j6rLEuZ;K_r;N+W=VDi<%@m0;npYz?2SO zbK}%q638P2{2!D7Ar5>CD|rJGMvaob@oHpy2?2hl3UDg;0Y*6pCW3m}4pT-*6SN7S zG70cf!H2}y-|gfCup6xAv>4MMb2^MdkTeO7x*f)c@+O4nBF+ih1X1S%%l9$FLGXK+ zmqAcNL>vKblN6`e4tVa8VDYukAiU)??z|n7dGpb4 z8b^pTHoPLofQIHl?RdkRWB;3D{D00vZ$7HyW9{(-P-TQ?5L6AoiDL|glVA)e1K1Ge zyZ}w{-)`a`P6A;-!1+-A1bCic7%>WJi~tBh6$~f$QAQjR2IwJZ2ymrD01Lqk7|x*F z`+!dY6MG5pA?+Xnj9xH^5R-c2(KHeOX>oflA@&WTAdn8|r-%T@aN|!9-;F+wULWY3-2lMgoYhr8xm|>p{0SuxU1pU!|PH1)yLx4ylh1^1A z62T}>MQ>n2ZeYS6V44Z=_W+of5lXDWPrG0M42TYJDd;``6M8161h@dke*;vwW0N?j zJnBT7pbbbYLdAO+-5`JpcMwn&M3FmqzM}?cUM6N> z@a9cffOY02Zdk=G(SI+)FqwG}LbIm7=w3ijfPnqbj0J^Z7-Y0~>L?(1+Fos9?14=z zAR?ItG62=nOPE2#sF~$eAR7yw_JWD95$)u6@mk-Z|AKE{zr-!Pz~6=OoX}j{!rp6V zGiA68_q)m)1x+qc0WF|uqxgkqx#qM1U!H+05yU87BItEmu)LK8n9wZaUpI3|@C=%Kkdy*Hv9*SiEz4wQJi zb_8sMpyfo@aTZ8s;yKzUI!1UVPH<-}O%AdvF8wm_cX1)T5%_QBlB^T*tk|3RyJgBEB-hRUV$*>)Geso$UljG7?8 z1JMRFo#IA2bJ_u~f-d2o;kQtD4k!HMz#Ez3GUg@6WX>txYZtZKT_b?XW||XVVSUqm zggC48YjTWW$OwBo-pdwGh|{}ib!Y>$!cPt2h_SE4uV_$ab=o8-r<;~|&W$)g!OU+O zfChmMAZlAHV(lOx$q8`V&JYSz?#NBIDmPb*+)^ANfIQ6{fk;+_-@uiw@)c>9Ccz1` 
zm)jnj2SL-?&BM3xm4Ia8sl>|5iLk>UhGYI9yIo=iLR?&%Y7I09zZe$@=s%nu3#0^# z1@DcV_e&CBAUAe-S6#@okI1iM0dMY--iUtI95je6@@SbH)1UhQ2v_}Qd72N6B)Q=P zBhz}Ugjnv5%|JjhcGkt(p&0xs;N+2M_Z)%%?1&5wa6uRB?(y>=@0*HaHjRU+bi<@? z!X0&^?dAhFvOnRhK#rJe*#Mr7^L@pF&`FYLKph4u^T+Y}+jp=TqiBSEw(A(7 zQz2=>7-dSxLYFNGW$ zqnhclrj$6qE^tTg`~O&As(w|EuoJ>i{a;X@8_mN=MU7EW^w>gDV5~#HaY7hQN?b50 z)UiYR+VJLDJxa5B1OTXv6gm?Esz3mv@X}+`NkP|vsb}E-615$G8arlQStCt}n+(al zq!Hd9`fp&;7|mzi#25S?6-)}V>i|Xxr^j~0y+5(~-~N{dH2;?%npbRf{5Sl-)pcwApCs0W*(Xf=mz>$8_qVz6ey4d=%}YvL4k^^S0|<|{ zFs9hN?qmXH#b18o&;gj`Htavq8wmkb{fDfa>i(E!`fKd30w90>SLWbyYDBT_bH_q! zP#UhR<*dZ(!*o*EReK_iI2HyJ2-@6okRIYt9YYiNs~w=0fBg{Vx{MKO-vMd>_o#y` zdd#myqdxHmaLK|kO2MEM6dsvG3QYUu&*7k_M;x-Hs8enTWqq)JN#uhMwsJ&#ZfKpc~m&;7931&?2jz5R{wCR!QNo3%-6ezAhgFr||*zP*Srk?n;<> zb|IEM))31X=-6m)yB1NTjrW}cg|#cr%_257wQErhxb2P}KZF(@C60X?tLtUm17=z* zX&l+5V}{C~HqyesT)s&{7cP*Umo*YY4azrUn%xz5Q7~ZzL=5IOBo0*~lUs_}#X?SN z?;Pk!-#M@up$>qwW+P$U@clndjj>CZ7e}S?pFWmnC^mS*o}heB%wj>&#}nb;-U}gl z%a5XcKHdAF{ujo?xvxYKGUiy@peVGP4*lN>$^C&dO4&A(+7RS?isM4mE&2YuXo zj3t(H(8D+8NF%QqMSn9bMoE}uppT*rY+IfpYZ=Q&e5`shB>C$ z9vg?ONf*tGheJMdZs{=p7aJ`$EvH?Bz5_l3Q=$Idwdu$1ONeDjVO^R9ai}zsJI;Do zyQf)vISJhs%>c?M#xs@=`2_YbNi1ifr=#obS|pH5H@!i7DA@+G>Z8+aTjY@`jG{gP zA4U6pRC}Dn&C<{tH#m1A2GJL$a^dh^vH+3&dX)gOWm%Vxq77~zfy;ZBLj5}hiT4Rx z(?{I-_eZ3#CLmr&-o9#p-8`}K1lc)jKfKdDP!uON@_ZdQ{K7jY`@E-yJ8RyzqZGdI6*rlh~a_kZrRuig?E@~X9u?G4a!Wf_ZSZ%(L`?o&cfB{!_ zO^jgxb{b^UfGLSW7-0;77!sT-$gJ-|6ya-a8ms%SIpRdd9Bhtf8az1k=$fiwxrh5(B#= z`L|vHsoYC-P}@s$u+F?`vE%-#Cn}CqxiwPnFKi3L)yJNx7#<3gU5m=gDN^}%6L+1!j zxlRCg?*NLNxu`R3P4P__T)FjE2g4^&c`tYqD~(`q&)P`>GJ1gQd^wuIM6rRvu0;r7 zZh0R-?@aOkPb|U=?i>Jd9r4lYagtb$2gnS>QsVrgye})~G&x!itiDhMHkH%kR%A#r8l>xy&1hkwp%?t^Bo*xQ7Aw|k(- z>$~PDBXdRmpe{Ur6(_h!r&yJP zPD$4AD;EXG)8(MUp;N~rcfZ_@a)gWvW_WEXw%*7558L`vFF!?DEe4#zCXC4(;Z6ei zeeHm;2dI;|XYf7HQ`9V)VU-)xdltH)KAywhseX~Q?!&$Lod|s9O^>fWXA5|FRN4bW z%RaydU+RZLwaln|?w==R?2?^dMoZhYgdt&8cqUv8atTwo4AN#^l;elNMoOQa{*p$L z@=_nzFdnYRK)K2xSgnN-=1sR9)!R5ec<(1HYwPKwe#~n=$j1 zOTUF=Atf=2Cgu9rTfOAy&}`o5dR>qls#JC)8&j-UzS8_2ifUwrF+gX$#3BEzU!ld8 zwP)+s)HmsnqHpj~Xz6{?ZqT|+0fsdzTpYOZsE!7q6-rNJ&Z1V0lk|JHL;aD^bL}gb z8Pe@Iu0!P}ZSps?|dd8+j3@OYA9`P~g@s}|B*pD#F zLRZ~X44#{^((mKcBPYI`eWUa_PaD##0VUVFk^7f7g(MZIHs{tYAf$4U%?`9jP#h?{ zP0`Y}E!@a`#>4)bYO6IeiH}aRX%R2p;gU>+u&~&PC@*ke!c58(0H{-1DZ|tM@EOET^CWnbLT4Q|43A z{n0blEs97@M(~i&tDa_wa-Fff-^ZngNgOE8Ksm8(;l25F@KjJ2FJb0u zQP}2|+c=GW)aGsFe7^dsA%_wa1TR;^D$mfZf>F)7actIC_raCd5j5V|2k@><#egrn z9cV#^a{8mdfIgf;!2jUw+rOH)*1v1DQbkQIXjCpa)wVQJO)JN8&z#m;+ER^5Ewu<~ z>+L9NL_}^wMny$INv#)Z1=7|FZK*^LDhMINEkvP`R@791Oi08i5hg=~Ovq&BeRBG) zcYW9U2Yi3%vTN@>d-i@VpXWM6Bv(KSY^P)fyHJVGl$j%@4v|*$J@hwy9xW|J^0{*2 zY_LM&lB3@{oV2+7E4Lp8zlurUDz$E5-Zp40@Qv_c(n^X_24X&8zSPB1o{Jo<>$F;P zGpj8p=~JVer9tfM(`Vt@q#$V<7N4?=&I4;0u-4}&{{eSczl#mgOmNNIiQI3uiWuoo z?jGckI+K~9?64o@&$fT>8@AkqTwnNI#!$|Je#Jw_uFRxH$_mrdaokPoiqWfo_zI;>8A2=bcS!*lU1eeX+HH$Z5Vu{o>BGhA z%psA|eV!CnC**i!%aNofM^7My`uqEdRNP1W!2R6F8rgYex=cb?y?eTr4i+}=UeBY73SG;9S6uDso| zmUlopn0GQw(y)Jco3Tb3^oFWr6)SX~wG)3-RW>?z8Hc0CL&upA2~z#2PFd>OEVi0Q zMlmxvre+>BQX^6A+@E_T6Fkd4TBuc-rVirFwv9 zYoFa+RkM=Kvxe+wc(%J;bO|$BB4krlkJ4wRV_o$ZmDdV3%WSdbSodZ6$q7;y7q$Fd zP0qXX24^Y>so)J2e-%4JcgDV3J%RhvJ!52z=seR_)nP&hHk=^tWxSmgV6Au=Z*pzD zf<2qx9$v4UHMpta`{Bd~SU+*K_kiGO%mle)B&Qv*){QThj7)4tp2*jfZ*qqKC9p5C zw;B!%cSrM15&VXe!y#j_TFHoWy7J$7mh6wob7+}o;Etf7Xm~^}p;ng`9Ko8%_~?;1 z@+^aIE$9mV!!6{32HW;ICNi6q7W*rm&I{tl+7r@JH#XfV01>E+ESUaOm5M_DzxUdGa@nE#z6_ z`>u>_yclFfitGZj``S7xEV<@@(&Bi1P|{E~Y*8;n1;Bg3L&qHUyyXn-93_Thes+G& zC5!Y{L#_~MW(CYR-Qw4~*ek~2++V%4batIpr@Zfyfh>#c*&kC_wCapQRkD)JaVOso 
zSpl41Cl>VC#u{2mhBfWZMAZd`s*3+9_6NfXb~5mYS5=}DysG+QEa}$V3{biMMWPZCdP5DckG*cY_b~~z5kz%#>(Lrma z4dugl?X(w&4tXN8&Uw`&cO%PP=c#E!i8;RghP1$`c(XN*nKqE*Hy8>yCHMk+%=

    >uWK5dXTlo z*o!@OJ}>RwJ>Bwdj-X%HEzw#HZXtHeHnMPzo&Sb(KVpvBE>@&`;yPR<`HER7`jOcn z4l(P}n4L_P?#$oHbJg-L2-~;A?#Rx^**oy(na`A6t~^7kko(pBAg$q9R6F)KeJ)!! zxJ6x-^Vco14Adv}W=-*EzVQ=R$6%Z`JQL%gb&fBd$nJCtvHHQ+hO9Y0{n3D?nUPid z1s2CV0FQ!U?io7FR618_jILAkj)6^0+VS!$p3xrV$b0SB6Y?LR+lPjWqU43-V%-w5Ce&%wf_KS8%h0IdjkM_LZ zOWX>eX2Q?B1JC$Rj>?0(Pt)s$(hm{+?A9yc&pvCHTo7ML<6Q9Y(II`lsNY>}Pxq9W zvCt{y{^ctN^aBQY5aHKvAQuC(|Em&hCi8p4>Wt>od?RCC_HIOXC~1Y5AF@onsqwkDvfW`eE@+ zXFPe1nFnCCAlYz`w8^KT__Gwf!kOKzAF7CRZ30H&O>@%`j<|rwq#fN_>lzFj||zsb#WIA zt5{xDcq_6K6-Wi5e&+Gf6GYSK%g!YYhle*5hndN5WTD`4yf{#8{J{0PHQC|0-7Vz4 z1#6cEmP^*7I}FmX%vBbIor}HTTp^cQo_@)0_vSe3Yfo#XuIENeS?LODSCic5zV({t{nYjAMz!qd3V z$BKaWGON>{YaDq%mv!pV#0%^uO~o?$W3Za>VDAZ1&Zj5c^FjCPgXq^w z^X@@k8tiOF6(d-GKY};x~;jXnuq#I&etvQgSC@VRncom|CFNyTMxS5&EEKzn* zNS2@_e79zTyu`hyueTUC5w5@joE+n$dAh^D8=S+YkouGWqb{%;t*w_)&ZyD>P*=?>Xfoxj%icRKE!2DU+zPYl#B#5o@Q{ zYox2`H>?g*e641R^sXy|o>=AAl;~_2PiJ>3n~2?nOpiMz+kauj=EP%(0c^EfW?J`~ zlCD|nT!>1n<4t9qSc`9-Seuz?43aM}To&NB(tfV|xnL;wus*ZrCGKWk4h0R+t)h%x zKe@`Xhh8{ZX^M|nsNhFS0`TV1V7AmvnWd4&0=z9dwNdgK@qzl5^-Tt~g|M$$ufHJJ zoI3@-pHWY5EbcHxkExFkI;!4&w^tX8m)mX^C0-96M;JY2_KSm}vM5ta-oiG;HMyBL zmU~pTF}4+IJxK>;Hu1~!`v9|*z>X0#xQTm8UhQv?nCrWVBE2bMBn!taS-;>0`g5fX ze(N_8>RN1y#jiB{kaQ50BNZ^SWvD=be2LFy-y%wanYrD^>E5B5gX9x(F`3OZQ@hYZ zdLcb|D4_t~it#l+yXTvFtKwRaXW6r*w{i~2hU9DMlkj%J9bFZzd(TbywI&8U$mR$t8s%8;cmG3?-ky+vKkzxMS*~l&o7O$d08HS1UcE zC#1K>4q`U;XIjvp=u3*8R!j`k2FQzS2kkm0xkYjX5#aZn7fqS>1&0*FQRa0&E6eDy z(YU_wdfqGYc~}h}xtaYY-8X7A`P9m-Ukn9w3flN-EW+mFsdSvUASaN>-y?hKnSk8& z1lp&f3Z6XLdVM0&<2=7nJx%0v`f_C+o;PaFYzv^a4{vFb?FgJHcmyHrYtFq*nY+ps zi`#ccXeu4DY{lL2kPf!?hD_k<-TTZPk>&doSHz~=TER5=HOo2roD5i>LtMsY# zXjNjfDlH|PJ#8WB$tiJay=wuzd6Z{b4LIhE!1;oHj{rZQmp10!<~Bsn#9Cax)ANdR zH>&+rX7m(5I9B<4)-p&urtB4LgT9Y5jui@WDCQ$HOyw(o2l9kuT4@mWVD~@RbNUW? zi$*sY?{fBCo04_a_5+n%=5s)jmLias4E~E*)E$y_-R&{sj)pJjpNcC?T~9-FymWcA z{8y4}G-OKim}NRY>SZg>B&s#OM6|n~lQ-zK^vA_+Q^1bWY0Oru&KOba?2V~4T%9_U z9LT*wEGZOOQ97VjAI%mcmME$EzO9^kaHM8E6c%29bUb7$Hb6dN!ECY)()qR7J4Bid zJ5;|zCN(Jx5=zZUKF(opo+HwmmC|Qzwt34oZQk-hxJ6sDp%ZJfc}o>TNBGP2rQH8m z`DF04L;3$!X0HDqD~+ck|67bmZ;K9tq{;32MOMUe)SKOswH1+_J<2esJe)cbUbA7i zUlaR)`Wg+h23S)69Qw6Qb&U(eyXCK-+pKp@;oXvmZ98P=maIxCX^==!0Tye5|sRBf62p zqIK9j&2ihC_TC=ZRK%Ef@pR%ZVDsa;k7p?@f6|_zc}KBrm_I86Fiux%=&CArOKM)# zdhw5tRG4F^{-gYQYo0IVe#$i&c9zG;FWK_!c|GBi@J`zy)j-`8^%2LKqpBv?Z%v|% z)-{pgr$uY9U+lJ$*UyId@R@NVWc_))YT zzAtgd6*gOi=k8Gbh#uE%(p*W2hg1zw$la5FfgbVv`y(^Io)^s4LJ|II@sd7weff)+F>MnQ+@5JtUEdz0G>kpuiQJXq zC;mMRMHeZneee#~E2!e$F60AL00se&R)nsR(@`-1v z@?mX^aiH$IBm8B8p6EBQ0A)o{#=pS#;`-zx^firSw)D&B!i+a^kbd4s!*P_I_cQVu zy^FB4)qU7-Vz_bzB+RCkVMU~kUTl1rHwn3wwoOgs#n=Nb7W`HTJ}1(h&8O4Zg%(i) zl1pisyr#q}o{z9cU7gj43w@H#dxrA_gOo=OjhMHcIIwrK3P@C0_VyUWf+h@O`| zPM<3O4Nn*%hL}&l&n2TZ4TC$FELF6zkNN!G;DF}@SK^6z=FmG`zDSEa3=JUCKW=~! z1H0_>K?~gh77h?r+PZs=?dh{E?ScV1I)&7P)46L)CNevnsz=vX0Y-eFA!kXuoy*M? 
zF7ao~3;OY%ojK9{mC50Im5-TNCctp4&#|^*3F1u=N`8^$(JPdeS;YMg+o6&O)pr>e zAZAcK(}2;GKcExDHRkT?#?zitBOkY`26J|!vkjBXxzXf{#2x3JOZq(!%J^JV(p%lx zc5NB&AofVT=3a;G;t8zRDKs^()6D#+h53@S%rZh^IVs@JL^1H`|vDncE*f z7yZUx#b%J!tO|WE-CNatUb#kemPyvcO;HZgi`Ywe1zkC+GLyALnU3;^=})YaEFIZF z)Qn`GCXW;n9_z#v7Ux)=&N{|MFq^HiuTxJsr&zK{y;gQ7N@Y?Y8YWl~f zhW}A7FdhRV;M|~Izwr^XUVUNQZ&pr>?a&@-y6&DQ&}J<5l2_Wu?{HclA|I)M<> zyUZ)rWQ%HzB!W#6^G&f;B~92)^d%1bQcH45$&s3kP~2{qEmCLhfTCb(n|qP*tuRMq z7}UZG3w)ddM@sB5G?JYbJetw)s=UyB#Z**_ysY_oP}1R6=S!T~KJc;WD=DTr8JcO} zn-t&%V%yz&ZHw>86H#AqUlYMz$js~jjg)fvvd4K-luy*pGhewcQ)dG+TjH*ZUnEZB z-KOkI@{Q{0NVn@f`k%V{)JTxCDWtY4(lSVYQCtF4%v>Q)uoe*TKP%-KknxJ|3~edd z9W+@-)(B=G?KuY&cIPzwcAnBcSt+5W24=RY?Ahb6TWpKZM5p=xE7$-7-Wls>tz-_H ztT>G6$gD82k#PRMQbHZJMz}q!*?qrQivWruEo3z248N9S zpV+;CUQp$3<4p2q`T&HZMN!)iDXqDB>d#`LFBM8(eVXub_yhO+6d5;>D4A%Rq4^bu zkB2)2jf!eSfOJvsqvLg{_S2m{jRmwO01Z{*_DNN7Z6)cki{v$yJG@m!xeTp^dR;n- z__7CVMU=Zo=||ky{XBcx^9^krMNE1(D`Gyk7MahMRzOf9&&euqe#ER*D(oSpssq?y zgs+A-@~a=$>e>pa`3gXG<*S-A`ZaXtoyz9XaoGLrbM(q8Nh`54Z(K?rR1%21YDzFD zXyjk+{SY&75&f7gZwlT^O=qM47ajF7o~GFCvJmvzn`|MZ>_Te6HwDL$zjctSdD%851 zX@TQZ=b_^2(=9k4HM2+Q%|=-^+uJo7!hSO-VO}#>OqDKX*lsd@>uvkk0N&LLiYY@U z(auGklBt@#@GFAOsN%+mGC&r6gUn)xBL?n7tRuFa8E_&XN|^YaRoLHWerq($$poz+?9z+=0coVO^-03nmnx1T%Um0o2~J0Abl!3&Sy$!9ov|qB7ly(V zhMZoCxL>XlT~h`ky}RcaACtcFGwx-5B@qHs+JhNlqA#<%To*?N7Su$i9LW?dGOf-k zb9`?fpC?_VGZ{a06+rH;hK96#nsmD&tnr01`V3oaYKi6O|7haIi+1%~T(?8i8dop! z(*&m+Bu(;VbZW6}gL>$2Q^UUDfCu{H)L6(w=lpi=o~>AFVO|;w8=2EyQzr_Q$13sh zf&uhZW`$MJw^$?j!xFtPAK&5@D8H_Mk*jo{-(h?wQ<{KE#G9tezjd6VkX8D4ssJ)H z#AD|i<$!z-%^M0itlWg39*Pk#e6eH!TRs@*$Pka$|Clqcza$-ch{uaJ`LahHFWDtf zI#CT6*U?oiN2Wn&XXY?sZ zJDl0UdEKz%`MFaa*0PN$Mf#n+Z+rDhe#CK}vi4vvV}E4S(yw(Ybsg=urc?!%C*XU%2X_Z{TaksHx^S&Y^y4rOw)N&`sF2PeZ`fOL>FUja&G*RiB2Rn`N622n?~JX>R`+CE02(qnJG7!v$dNVaR6499Pc@Ecqxfhh$`O0(&@Vbe8#Aw#o2=2r`65fiG;nzR&~nQQ z^eRow4ctZ^a829s?mW#;j`eoGJlS;3HQUGbjzLvh;ts(pjHr{>D!9~pNV(P1R8q~F zj^C`|<2NZW^M!T%CG!4yLwMgNuoVJS_7C~NG%b?OjbnVRhx_!k3W4SqTNJgV zr=WdUqnv|vj-=7&#Y*$iYvJ~UQs)8X7G#FJ(w1f?)kE~E(L_^Gq-r9WCPw5cNP6{; zhAtZF+TfmmYvr4n7o8Q&Jag`+kKAW{YJUin-p%l%2Zj<4D$UF|=AVGt0o`GbwARgy zI?9z~Ujuw}sAZrEL&4n5j3N4^;;sXRzZPnt9nopZG&Rtx#ott?jKp;&y`sS#?@{4Q zP?J8>I7fbDa4H&a_3g_XbMF)NYnIR#JAIo~J3WiV#_vEYrTv)IqWquFn|+ z)OZ2?&K6{UVMwAC?+<^ly%pcx@O%hUEEE@mHHeaAJ(lzDjYS`j6kwYX{tN% zmgGsA)oK#axrJa9f6;OSbzd@b48QbA+%bm)h2iFKS9Fn}!@9Xmu(O;8GV+Rv8+P(B zJ)4Av5J^Yzn8p`k>9+{qBdRA*O)IiIvp6G={qkNdgm(JfYk9~mIl{anhRWd{PYnG) z6B2~o&OQTZjx`oF4ADbFioUc`X#u{t@r8?7dksN&Pg)8STimr*^@K@c#<%k?iKxzN zn7!tD@p(FBDEA;~aWA&(P@Zp!H7ihFOCHtY{T72MY;9t{`YV^`lhtjI*>s+AUh%Ti5=57#=fsE$n6 z@g6dXrt|lF)QxG`jI3C1E?S|PDeZ8sH8~peme@{PBfg8BDAs#s+XK+zQAdPeywXg+ zsVTPwQm@^t5pdO>%TH@A?SoLM)wP^9%Lyt;&9f+?6>|^+V8|LV_FBp#(Emzz7`WlR z)O+3W^!or!eSsCT1r~CV=78l=RuS;_YNR}xJ?1`Owyn?va1D-Fdx0zHX^XCI=v&VZ z!PmG6*dJgGf3dYwBelp0vu=&Ve(d4u;*#senGn53p;6Ag^HHDfZn|z1@7t8k>AFkx z`S{Z;h@c!<)Ob&?u4ebz=eT-H4&5+X-G@XI-TKYBJdv~EcUlK&B1yFLMWVw~a38Ts zJN?=aD=R%BeVX->e5&F66y$r$2}|mAa z9*o^^Dd}BSKQrP=c~0Hwd5c<7Dk+wcb(5HEL#8R3@(eat+Jl)eLV8g86^F3DI*R_< zWFe}~bPaMkes$iy5_+vdu)Y3ynghXl*ZJd?nI?zS9X>r<#m3-sZ7#_~l&aue-4 zI^IHb%k<*;%K$<65u`uSmRD&H2+}nrCF2M6X={1hMMDtC+I1^3L58r+rDOWoe9rmXUXwFXlx?l@)bLg zsiToS9nNW%Q1}aKGBY`iJz*12L^SsjVzAC()E3pE(%)0%0H zF<6}ST!4ImvSN}d6CH>3roG0*k_eqK6qZl4LobymgLuT{)V*4BpJh}c2c}k7V%BrB zY~dy8>~r#HovX2fVD4z9jCDzsyW=f5g;7aB{0xHC{CAJmk~_nWJ*lx2b7N z3XxQx>ht)>YGU(6JJc_WGQ_XS=Nc*>42$*V?kH>$(kTB_-RaWPTf4#i9!lIpew>#I z86Gjhe&YzCLdsOd((QSTaZ&7{#Oulhw!BAf_G@N_!PlgSOgT($Z|6ucG}#>3NgIP1*D4_aOBTjSO1OlJ}hSCcBUs=hU}`??9r| 
zIyRk2bzl7)ul~uVq9#GZP)&p=2)k$fjQON9DNTnybtTk3&8)R{>^;o4z+*XUGOxVE%KbAM@#i{)g4w} zbftBAAM#gutI{DaMjpuz((iVPnk&-ELn-p9vk_cjKh9|V3=JZ=NBs_8_B^R*4Eak{ zOyq}yGu+vV?CSj|q_}1?6P@?lxF=B&sKgo2z~AIV&>@9oSO3(S+jAEZ(&%< zx72NKJQ`z}F&nbStJ@&TJPvtggEzT6)!U&6jql#p+-u&{@jP!bxg9e3xr>ZvplQ*$ zN&V%MriM2xuXSu3*w$7=*IRcY!n4d<%Az!W$~_bIOnflWk!iE{+H+98(w)N+&ZvxS zJyv#Sgx2?CYNuhbC8-W7GfmDdeYw{KK|^oJW@yejvlXP~2l;Yqbl-Nr)deamEfjZo z+II-*1E`W--7KrM=7-t1apY2M$Gs$~riYv9Tr3={ZJ85Pd#h!6&$b8kJ8G6bX&Zc< zY{Ju#sXg0%AG2SErdzSbUmb0F2b)a!K!urm|7$J%R&m0MFlxX^C9!qj}kQSal?`uhP_;b(BP!^*hy#In6GzrM^cpk-HOMR*%C= z9iJUieN4?o7prICw_R`1Ki$KatxjHR;xksk%Vg)XMc{)j2vRi0_mJ;pYU-A2uF^&* zxjC<}mQZ)Fg!u&Q>Rng;GWtdJY*9vVv|MP#XGrrcSNeFh;yKde!$I^HSs|Jup5yda zVv#8|y)>8^Z-~tf4lL0b`XW5#0=21O9DZv|p{OQamg?E5=mn3F9b??}aU8R-YF!xno98IKXf)j16{+7wUx0RPMO(D}mVG(% zZ(S30OGpfkH0Rnc+EbV<-hAl;ma5xPx{pVT$Bb_v2d5qr$`3w1MV zHXCjrO&yUuA3=-tmsZ8Ol+}Gps{|Cevvdxc41tO4DWce}Fypy6m53J$NA82MycWFtTn6I>U)Has8SmXvE$Wt%)}7 zRa)Fzs7hCTG94UP5kUBnP0r?A>SUhmdHD}E9H4Z0Q`~q~V8AWz73vqsYSxk)(Ro1d z7~P6ahlDEdDwwNxd}R-ICbdbgdGhh?9-Z!(Y^u1KNl-UlVtWUhXwyFN!x+T5n}6q?1776%QkVvVx3< zl~15{|FuRMAnT8LX6lOMHDuKfVoXhE`lT3a#h2_Jte9_(OHjYa{Q(YVRA9vf;udt+ zeO@d*C@3hg@ahG8_IoGBWECeIP+8gev~$R5OS@5d&#{r?X|BSVTil4a!zyJah%vvE zrfzmk${WNsoy?v$nTC%n{J19Pe)C8m9)J)Thv?fwe*2WI(4(Kb zHtrzO_nh(hA!NrZ@QH{BZSQWhYf>pQxUL z?6ZAtab5N(2ol}N{2NeZjvJxE=$NClU@Lsq_6Bi8c^E#b?7K*R3KUF$yALnT^qhv0>Gx!QSoU$ z=Q`{Tp|n`PqjxGRv$j~U;kPfQQ zQZb5I@jkvcqIz9DmHQs=E+*Hz1>~@2I~_D6DZ8sEHb8)vERQTR)Lcb+!K?-a*V7fY>&rA=0$j@-fWkGF>R#LC6l@yTuW6PMuaF#6^Qjgo1aCDBcVmdxu z)*OC~wS?3!4hm26E3{N^h z6en6FV}NSrsCUu*7Aaj8hzBDjV_kcQrLGT{?dnO&K3c>c^l(`Ly3JavTO&h%^(g6B zaCcR&s+NQP9+36res6c8$~p4m9%Yu&)*p+AxdD{jr{7%A%^g+_)g>|_2s*KEWs}&< zZiN1_kKAl@NL@$NkwXiO`4 zne#)Mobba`{pr_CHTSuEV$dUyn%MpH1SLkfZ`SxBU|E=dkX+UzCGI07#uUg6Vz5QA zg9V%mS?`<*l^O`#>P1uK74H2OWdt{lxWQuT6KDq$gZ{n~-3&U>azfQAXsWblcXF+m zS-c`2X;0fi74+0h!2fjd=y77stVm-%VA9l+pqo_MJ5oVkFUIZ7;FYO4~bYVVg*oXuYQA2Acy(r+oq*e>8+f}{Rf3basp`&-2NjNA$GB0 z^65m;PUh?3i;eNr$LKY;)sn8}EZsZ6F2w@9NHv!$?~#n>?u;Vr9Y}haZSHbDGtSDH z{CYNx$G*4JmAchCh7XAjY2Y)KxlNH{7#=fDH z+oP0ublk19FQBT=JpsC##z%G5g3}mXnn$0>Lc|Xa zT#)jzE|asHB#-L&eUiVpe8D!4PErj9@s>{50Aj#kWEzc7q&W`tU_ObzAf$^Q`CIUQ7;5mOyOpU zbVyUET>>(bT(xcnJG;M<`oM`dgAyvRPOy_=bujn5tPyVwpj7tP(IakAUr7|_$9-?P zVovpfz!KMPl=om#5PhN4x*UBQ6zN9~LeB?kj_>ukZClzf(2|+Xh8dqcPs?1IZdMln zpP}S2%V$`KH2P&`zSG?-08hc0Ub>UXRX;Dg41(P}Leg(Ku|{C74|>f3`J>vQ3_5>^ z+ru?lW%dXVMO`FYB(jdhF^ddXUwsq@_S5;JCAF3iv-2Q3&rKp*v9@~q>!6+48l^#R zaj29!u&&d9bZ545Xt)V)JV5KASrJbYY-BzKor?tRP2A1!k*-lhQ%~}mOTg7WUxhVE zYFJA>ybpis5T%eBf?hDBJS=;}Hqh8s(8dSsWi`OtwJJx~*8qzwYvuZJv!_(QkQ&g;2!0c#9+%IRtzMJ%O{m0G`MYKEML zuvRLbTGm?-$W@T>%>ljK1Tr1@7rIPd@vGfg#vKuL!q>Ls!<@V33@{Ql;O6Q`*;IP* z5Wqy8{w?{BWO6ePAbteILiSU6P2W|G^?AGrkPdZo6d%lBd@Zs-e8*Er5!dyvh})fR zlWxqIq&AG|o#Qp9K$>sDqZRkrrtaRFnH&^6{H5)A4fdHpC;0syMM=Hv9nEip0(7%; zYD>{?Tvh|QW>kHhnv1n}p^7<41EJm{1wI;3i(qU{ot zH(jG94^R>JFjWxy(Yc$8EaDtTLc$N9iAuSW*` zq@Ffv@ixws3U?EvGkoHK^T~HAkMdq9AnnyEt~tH|aI+hr5WL$>!m z_n3)A=&!}xn5#F7iesz5lMIE%6$zXX8wO(tny7f>UUd-J?9nOHJ<)6-S=|x`&VKlR zJQe~Pjk(8+54v<$ag{9c>p-q;MqZV!+9r$=GqWVKCpstasTxKPJ^U6Be|)9&48Wv zMsPY4T-@jftj;+STai_5k!BUR32$`RlnGu|Tpi2z8Y;<7E(xuJ?Y<-5ZsMnLp=h}t z$EVOQfRHBNtb7q^2kCfu2)Ye`by-INAeIB2XM;1*;V$=OHiM7d3$iQ(e9`4_y2Rap zk%Ml9eL6hf3t#!93^5qW130~G!>Ae%*zWt>VWc%XRhO9N5osz$1Vjf5RT{vRX>wJr z-AgkP^eqp}zGW-NWb)f;kt|N<1QGFovmvv);_V(_Edqj%Tm{K%W-*3@x4D6P+*e?F zGr3L7i$Ji-9xjk;1F#GhOUAR&aMA`Kp-98|N*{n`x51HXE_w0|JLC_j@xV{jbuNJY 
z23&RvEc>9CGjmrw7FQ-a-W!WaZ&Sq5iQqKf>uUjta6(lDcTlN3bj4>JF@+Km5X240Zt68;Fro#geuzz^e*x?-oxG6LqdFBvlHOycgO;zu>ta0XaPmQBLQjQOS?P*`*tg0($@p7pM}(O|61-4 zd`JO(M?leN5H!IUI}fk}VHfK*H&?*@MOQf=ybDC<3s`X%s7fZIjnVcFu#@Q#NHQ(1 z;-$T4c-xYH?f#3zGz}qjm4pH(s zAj?s-d_EJ^jbu$VR8GX;*yJEs*9^FI!=OM7Tp~;$sXsXossiPFHHDz(e^Me;(3upi z*+Y~~gKxZ$R%qO-0wH5H0M9qJ+X~`q4>!1n>jfCDbt_F50cD=Rf19o^ zJfD?M;w>fpIVUZ?fn2loFYXDE!G?*8(Ml=_bRY;oM4uoTfJSWyG6038mw*xw3VH($ z8922v83c-oK>;%l^l7#iazN|785H>TvW{|t=8Ol9`&{H8uOMJvvSJID*BgNKi=a*y za-AEmdDW?ZS$)bOEkqsy0{bZ|0+mnbl70|&9taTvN`n0kfFK&kUcjCN(GbBq6iFO4 z0_rUVtt+YH!r1+wko)vRpWS8Szxm zuYzcl>T$aL&hZ>{4KOyk7d(Mxko7`%_bVU_ArH8GgoK!^!^5M%)IPA8P{j8=zG>Yt+m)-!_Q=oRxMDPUi(y%))Mt@=_kVaeF@$tGeN`Az%ze zI!MoT!_)jR=ttTBFj@jWO&}cSSD@|`C)Of`;HGBSv#pui;sJZXEnza;aABZ=!fP%& zAo4f}OWy&DLe#NADu+jrA~GQ6&`9;a1h>!qH9ZYro}--Lwh2ILv4>1Jq5uvrA-Q1Pn8G>6xT7D89|E@!-m3)q zNljiQ2b3D`wKCu@;*z|d8^beMguOh7Avg%ZpI~OwAu#Mt<_<|o)=GzfoZni@6=60; zqHdz3AZlIvFfVF67?ILJo!qk+&~HVC$Co_|7HM~TrX}q9!dQB%0XOA=B8PS^0n{7i zz8%k=2fED$W~cUWQ&ky2*=2D01Tf{ucDDd&bmAt7i{&$+fU~#3C4CD_m?N1h#s@*J zFM?dCK&Zk2$h`@>`9Qt;9r7>iCCS`A5u6Rd!63s8IIV}UOTaT${UqOQ@ddt<>6}1) z-zB5tN(!66zd>^+Tj%R!Jl%>Vfc`&HNmGfQvnBU*V!QDL-3>g;BqZb!F+&TrfhWkz$GPK z^H1LbNY{HI-OK`Ovh^Q8q397~hOwWcSOitjVm4q4AdB=2=Cn66)* z_UmQGnYraB&AUE$6#jlpnD)=7KNn?vP}p_4RO&}P^REaKFyouWUWOs&q>;v+q#Ig^1o`R;Q=byl&N6e^*@0E833t+35-i zb?F9TP7|iQGpxwkT3AxV%rnAN987&gO+JRTo=s4H{ZK&t?9ca(eL@N5a);v1Wo5yb z-CyP1=}m1YO!1ac^I`dtRq8D;)to9nnpnhOA5$g5g@>7WDc_jy&f^YQ&u6_4vpxQ* z>F{8PHuVD*Zc2Rt4_2>I?}n-0{Z)Q2HBWmU#<~*J+hOdAzv2ipul#%S2)w)ET$Tq; zd{b!r2b_3SAvX(_zpV}UfJNFyBx!44{eFK=2yV?yIU;i>lvO0uwx{ytRCrbTgLGVPhwi}47|CqW4n}sE)lVR$JesKRL%GI$~3re}(NQ35^T>do>;G z0aqJ>g>bbvSU62+41=jp3Y~>;r!sQ`&^zvYZ4`{HFLZkEdCZ&!%UcrEGFX1epX9?U z|EYC*ulo^2z;ga_&tf+2A(yw7^Ebf3MITb*;0ed77%}X!y^xy?PlRZFyw_b$y$e&- ztI!>=+3)@WFFogJufkYY0{U+lyW%ea>HybT{>wGm{$H+%=fX~_3jfQ{@62cCa;%q( zAa&*6n9srybQQFhjA=sg-vX)lTOf5_;roc11lylYK>rpLVDNw<_js*f*7V7b}hz0CQnov_6nf29DncumU(QUTKrz(J0$LRZ36 ztG_hRg#{Q8?J^kjTZMiEgS5ZI%Y_niy!W0L=>NdJL90-&fPLpL^`0h5>lF>f$7x;} zz2PsN0b_Hu39!7Nf*ye7UkNp@!wLUgs5%NeeQUPCY54DObEEyK~@A6aTC$L3A0-6h3 zobV^Sr;5~OdIk1lD%pFN1T^y@mjMD&;}z8psaP1aB*e^vJ=PYI-ea9HyWv=wA5tEe zTDOY%7i{&|AMpzKeC;5N%}Zcbz}O*wJUV$4 zd)9f{1{mu~h(X|`UljgV0+&%b7)w~ic!lkuKQ;%}3AIWun?9ke$1o`{&G|Re_^{BD zz-)kpwf>m*Snp`1@GkNjv&?(lb96Uc_fH|a5Vrrc&<#h6yFj~PY`QST%g7il0-WH* zQwrE=Ndl7qJALEN1p<@o!%~ZME~mGQRn&fY?1Wb!)gKnt&n+)8U)k`8DF$te*mg1! zUjX{HJSnA9b{I6S>$4Mo8@nqr9x?|(>ms&YjKCLy-YriG=u{mB`RqEi$5A;KB=pJ* z6H|Do0+x)6z~3|rg(-`)uYm%lJ@{Jox5c}5?~L>%P{N7@QCU9 zv_B7I3)FA}+)8&L%PSl~tJEk=edn)w2`(3<4gFgUy%yt!Klcg{;r+q_pbwaZk6|Y; z3tj;D!(ZhUkyo{quw0~mYRI~JgX;p0{%=`4W$XWv#e4Q&i)Z=Yw0MX9mn@#kYw-}I z3M)(n$>*P=_kcWP3xmDT901=6Bb$$P_J0l`U?VJ z;X7I#Oo1L+ywZ7&-VfXVDHMCH+oy%LzxD7Oj7=Ao|E-51Ua^d)gs{_+1k~$$edDj3 z>D9w;rIxOFoWcxx=#}7i3-w-mTw?Yb%=84*i%F&aO0V)qYd3j;=3^=nros|XuhVtJ zKiqp_36%|FU8`cehGR>iUIANtYxV_Fg;dbZaFCCLnm1tT>q1d6gLvH(uc4VF)VvCV zXki5q1l$y_Gq)}Q^*U$Mgqj$b`lL_=rvW!5)O)A%wchR57plB~dd&O_EN@9bGhq28 zf2r3W{HN9crvW!51Eze>$d9TN4B{1$Q}DJIgc|P!Qwl49boyn~QCO3{3iV>rQ-1<5 zDJE1~<24+gP#0k;Y*mjJ8e_HpwP5koGuYx6p(Yl#__&bt(x}wzH5?)5=_;5yEYw5~ zV_pI2_TKs&T@Hhku-wbmg<2;ZORbK@EDcjBV+j?JEEAGSktO?*8YE@OCsdXiX+}cB#LSrGcOBp7`#k?V ze?HGY_v^mTeeQGK*LB|4b-m9$W5BIF%F#%fh9qE7b-PQQhql|w(a1@r3Es#GTgDsc z4rSpWAne(MLuj_QrT}=22G#(J@mA1?$^syPvY-T$LRmNtU57T|{zrh>B0FsueJ~1? z1;n2Bn{bFd3avTN)@RRkg|^B~OFHI0l?KKXJ!IVgARWrlh)Q+|9;j&Eqh=@wB2aZj zERbt0MXP1Y$cBzV%F##`_$7EivV$|>6o|B8cp^eM7nM4l)){2eA;u!I z$)0-vMtQc0h$z0d6^m{i3__}vxoL@k>C~0b&)%MkGIZJ|B1*%0tz<;l!whe@C1%f! 
z_%Vtk5n35BJFXdYWNjiMSn91FFd!`20NjE_%K{jCl=(<8gd{Yh+HjYO7V@?-AIVrXpr%ot{;6@uwTRjTe7c4BQyRWLx_w1aSYIvXxoE`?j7s-Gi}F z5{`|&AQgn63#BMcuiWJJ(y%U;j7UXHU)Xmg-Bu~6P#`uYZF~a zNQ}Ie_iaBlx(73_WW0gXVvSihjp2|n+e$8}$ezM#nX+=D6EmV7<>f+XV~x=_YOivzfDWyXFu6~RDpb{6fj~TN!j!b)aLUwk_rr<=q&jQrfRmebNf5rcjQ30lmUvG`IL zem%pKE@UCKxQ5fhr|9g*)jZk%C#grU;R`uj6I>i0 z2pG2pfwz6*17RUIFx0@WJDS8fEe@3G1Gt(ytV;;AEleQE*a{#OyRJn4v7;J*6($4; zX3V;{B3M)sD}XNKEoK|T&pAh=TEV%<=yKk-5b6YwK_T~F-Zuut2~Kz90!2)zmjRQl z;X35#3OF@06ymger@)d5E9L_Jlhnlr2s8$PUclpOI^@EQnZ%@dCHH@L-y*3(Kn9C! z7~h%diX2S?p!apL2r~xUe}^T9Fry`tEL+ZLDWt6I!MGoGUCsNpkxByK`(45PGE@uX zXzott%>G3&3Gi+VYY$zhNGt;wlEex^j&_HBXQ)cZ(cJCG(WqN&_!?$RK^z%8r^~QI zj>ZAmPg1=RX6`O#3@@Gx#OX6>>Bb8j!5v=UQl;pBIW3`-LO4~77Ui^PQ8W-}mKS12 z^D^GI!&FB=cA!Cq)8a)bg?ni=>p3l!@gAh;3u_qCh#t{moEAMw-~pU&R`^OLF@9c2 z9d52-d_`ec0u+>D81!Id;R7~iOlrI%EX(b1afEsl;H+o(AbK=o5IulJq>YF)yBR`= z(tHu*X)Me-A=l!YCs@plyr=lsx$0S8L4L zk(Io!y?jbNOliwiK*?hgNMgwqL&<{(Ahj48;CuNInDH>R5l**hLkIKUV2W~(5dw`2 zyv}l@3rUE~!1%(9Xhdo36}~^Lp*f#h{ivT8^gADt#B0SBk#XtvnoTh&#}nB9>{%_ zvgTVMB`3ZB(n#9m(fJR6y}$Ei+zog7TlOu5(c`i3JOC67ebk&Mu-{H zF17&XYX)h6>o+cdlMI8fbfI<;lp_;rG>`$1Bf)^R$mzDQzT1Lu$N(}p;*ntsFHf)+ z^6D0pNNksw0!I7BcLNnS1g(Hv9pAmg6o~wi?Z(oa7IDgLAhLoiM6n%pC(3@ZOF?-6 z(c2i3fF6Ygw9N|N%_JtwDv`bn49GFpbysp)R#L_wHQ)(A z@rrm265j`$p@ND`DR@Ai5e~pbJzx$7#Stix8NLh=r4;#@p@k}AE*QU)ig<044IVv0 z17-^H(yU$YzL+LgKZfu3suMkUSJAVG!b1!U1D&+zjB*X6&PnUYl3yf@NC8 zFhY4GR~Ut)B+4V%5R7N2naI&iP)8Y*3V??UM<)WePtfT}kl}0s$ zy7US^1rpTw5^8Ya1VwxaHIN<>G=!EW1G2Kx#!`q=L&oJL)ZpHo)CEKjSFq&>6-Xnz zgAe-TDg|qT=%E1g_#n}+nWOW77e4@Nn;Q<~nT#t&xvv2-qZS1V@a~60Ml6ogfev@V z_ae~MVB5`bRc4H3d~qJOA&n)5gcA`o*)nQ3;xrd>g0~n^nCAe=&mVyv0b+zur4Xly zOZD6XjLT%HBG5CiW`>Lb1bPIJ?xY4HSGz*=9H9mRek<-)#OY>I^a~sV1Inq+pk$Z4@5EC*-0C`LjwVTGW8 zBrRnTNDl)d7C?ymED&yvqGSmB!&J~^t_NgbF*T|G0-+U?uOZoY6>(b@e6)-T830lr zNR5hkA-q5cXaY*tTLc4(rXRE(6l)x){cuADgu2ZRSx9fD)H(35EdfH?GtLuE&17K3 zFHj*(f>wo8c#@h3_8c^XK;~0G@DJ^C{L|WlBt!O+?1S1-qF@7ysm6eaFOic4pgz=I zIF*-OD(hu%x~&Suqb}&(LP`gceFSA7Oq@N?YQLc{sxT0xz>T>ywR0V1zBE)Z0PB4Y}H&V@D9q#D4z z!`Z;{LQ2?D>QhIl57k{^#xTU-j@(teVn`y8lDoDwLFSmC^Z`Z84S}g+@fENjR)#>4 zc*+8pl^ebpIoccGC4>v3T8^alVMEBjEtCcD!L4vv1bPPS?vrVm%M%y@=w%2r4&1Fz zl|i7P9M{C^LUR8^VIxOJgHpUp!2pM@h0CG32wF{DQGIDoV1jZ84y;^8m8T2QL~vk~ zAJrm{DE@_9vjq_vDWjYk2)y%@6g(ma^ivYUfyJg&B}9$}=(dMq2{bU{ZbFzdz>+l9 zd9-81!6NLZdZSv-l>tVdT^B9OafG@E3kr;032;TBJJdxpI2CCI&-+j_P+jDLeo>A0 zKskhQ5!JmNi1%46O2_iuVZA0X0xjk`f3m%&7tSaGo-= zAEB;Cly-;MFr_*GCzPVWKYJ*UNCCAmLamBC-2j`+777+vbc=;Ys)-1Q9ywhbFcBC) zw1k2?mE1v&>r+*brxAHJFxsJY#|Q+23e_4;w~v53*HU3qrmipo+1ZW?eAH0ng9{ZY zWR&?>dl6^>?4>?a$avyP*w}#z*c(^91bS(o>#+nn1A*ql&M}?>Jd?5If>QU41D=6a zm4*-$8(4C+3vx9P0&_ogX35pdQSuOPSjQPPUlc>4!HAIp$P`L|y=n|wIF;i=di+Fb z1|KGbW00#C0NJ%cFi5ft_*h8!ivn>7ShbPr0SmIf0UV}BSp@D{6aL|J0;Q7>rTMTk zn4kzDN)sRfj>Qv!o0<#|07@INkXH;DN0B_m!pxf(ut6h<=>Vb9SRgfH;sr3>HU=5( z2?79RpH&M~n5*+j^S-vIn)}C9U!9DDDtXm6{_DmJ*cE&g)mZK*R;{GI%270IoBKZ{ z+oum^uPNCkye)UVy27++_F8pC;q5m6Xedrw$!t)UdxRUG{C^SGqb7@DLI5MC26SRy zd<;TAv=oq+!J(Noutye)zYMF9TLZrFjKd>x4?z@ur{tk5VhlpEg@O{%ts1Ceym(tM z&p!PdzelhG3c->TL2@ry7s&XM;tgJo)dezsqIiRs6Ii=YeI!E}T}y=*5E-Hv@Pl#? 
z1W(3B4CKl+jBO~1kRfLQP~!((YI2z4Pb58(5D>UZRESqR}rl1N++z+$ceQv81|t~LLE z5!cod1O;Yu?7UJg1X?5&_G_*d*`RcssaZf&JYgLYB4}qT&hSU^*aYc9oPw%}IRSQ1 zmhpQZAo*m2Qr{?Kj%Cy*z+|~5D9@iMeGiCwu!Il-l9UC=13BUFqNOhmmN48f2(oi( zJS-v978qf*=tU$06yX$iPGEHg`xWnx~gA!Zc{tuz~zex!Z#=*7bWl>fcIj zp!Lz-u2CpGe378_{oVHVnM>_tUoq~FFrsv=fm=y0zQ0g6TRB<3-m`hQ?DO>C$%lTy zwI6>c=L}d&MeyDplK!2Xbg5l^?AIstjNzbH;nVnx$*N#~-W->X<3wM$+ z8WWSjCwadh4zHRQM66o0P3MOl6vWtnZ$89!+$p&G4(H5iKO|6@Xg3O0@7+}X{~xf6?~Hv~Fme{3`ODXM}CDLM@%9ioc3jKfOq1*U`a#)h z|Kq#d)+l?_x+xy%O zwB}1Zr zom8l4|Qf+o=x^g8ll_J~Z;9+4nzop4G=;sXuVKJ|>e9l?)R1cfgnmBVjM z61L&}Q}fhkh0R7Cd@AvewU~`9GD%G_#D6V1kM;fxs=O#U^*pll9P>kQ+f$tAb5{%1 z=_kd$Yb>=_1y`PyRC{kAWw!g#+j9n?m7mfNyA+9Xj{P_IE5wrT_0{_%r`e@aDq()R#kxDavYd|MlNCw#$5y{|COLm}aU3WgDIb-}xcIp3pX;20 zF4bd?7`N8HbRju=KPgdDNyC!9?7$C?ZS_eAy1m|vZ7cKc#WBZ%wJ&Fl>eQk{z7O0m zF_Jkj>Tyrrq;Em$%U0a6dCCce0P6)aQ)w6Dyv5kZ8@@LjT9JoKmNzCA*anSOjBnNN z>n^=;)O5c@MEx4NN{se3d--cJiSpdHI|wExT{ekQWZGn|c@B2pmfBA%Xnu3^=ygG| zFejPgl&xgN+995_;Q<-X8j7tImgGUK*HK6aU^c_FVB84-eYGCQ=aQS zjkgz`QGG39-;Yd*-)Wladz}B-t=BNrBXO$0W%t!Lk_x)V9<2{Onc=uo>R^(SW?EgT z^>l&Dg@m=C=M+5*H2+f@$)g*bNIJ0kZ{B!Q)r*wrKdmZXI%U+H&A&2Y9QSd|oyJHC zac65w@4EQ-YguZSd_Q+U^Zu}bR1~fxo47gZTy5!8<*_dT;?Kn_e1FHCyE}T>tRQev zYoph`&ZrL>rUNefS6jHuse71ee%Us2u4aEU?&i7lMdvedCDrG|{Z%YfGkS9uyHus3 z#46w)jlo2PvGm0+rsB_W7OLsJ*BT0X7Y<8{q0#&=#Rc40`RvOeUpP!#QjO*UVWalh z`~iAMZC&s47p`WV)){kqy9;-ZsOsfB2AoX|cSb*e* zi($T%m9O`R`v-6(!-q}nd-wB%{tK)Q&LSKKFk%CDxnvja~=%w**4W4D~zP{!M4^1EN| zlFP5x-=1pS(KCIzzU(YXk)oh^y}?D!bpPX$#0N)&dvD`SJ4MR+MKa!r^y-_2Ke$7IH(&dmMzY!%8#GqxH1O^=GB;>^WWn#Zz4Tq4qcEd*H6hVkbFfflqo4H`uP1yUnw8?exTysCT(mSDtx>rEcrRo1Si= z^*y6~Z`?eku+7%8&5P?g88dIkl;#+xhC>QWpN*6*j=3Z{LLs}5Gg4u<_Bcj=diW!*%=iAN?wZE|*A zai!HRCTSfavrX4UCU=FFncdKhL7sAS!VjtAO@E2R7OG1*td1N_2#~}3jhew2a*ui}KuF8pDyN=5>5n>i^tS@7Dq>IRDnidEjAa=Mr(0s(H zdxUQ*YUs1WP>exY!stI(j zh}qS5dzN{TPbtg(pYToUn)Qfh8i!`SRG0y1W9{;|2!NOj~ zN%WNYA7{O|jLriD%?%^exgApP_r!&*Yu(l~q#xIv9uvL;^QAbV$Ng19a)jI}M*9(K z!{os4K@CHi$$@1z$6B|vx1W^p_%O!Ozd1Hzp4O?7>FR&H*WhtP8zM6U0u)9b(c#&_L(SKu_xK-*QZhgOn&JsBXgD&-#;e(D~Nl>*!Wq! 
zmyBckXjgmDEW7$dGI4AB-15~XgY_NUPUY-5hI?p%y#VTD%p>(snjs-d!R{OL~yu zJyP>Vw6?CNHtbT4j`!=8+q?>4&R<80OP7mv!9RQ_ zlB91M@dC_~t7>}9Gl_jXw9O=BT*S?5wO+CU#l|vp`na6z50MZRQR=k~>2}9v%;HvT zecfHYPqrtmB5BF_UG?kV-{Ze>OJ*lK?rHZLG>1~?aD&SHKhY&nQ)^2|5(y}qP8|O8$ zw*T*|=O?TF9zUWUxUw^X?(r5c@cDWAXXxzl7hf)_o_Jk-ZDl%i?tFeSas2p;TY)QA zJKj^YQKQ`S+T(cy_hxy?)?XIVRjqS>I=7!WW_slCQFee7FLtrd%bRb36O#{aVn@?#J35O&G%LNz?~s;Svr+5#9<};GJ3nV*Pw49OCf)zhxK`D^`{}KIEjcUF38ZRO!fE#joSlPLEu=)De zNt24ks?6Kf8mHVId8xVV7QH@hjYcc_e8#dFZ*&92b(g#jtVh@Hu|-;ACWiwkPI$MdAH){^88?6Hn7J)5UDdHT$S%I<@;| zwUA@Xt4`inpB^^2(cjtqq~+7|5ov0fkGy1E60AFOo*F(HcSg{T-)*m4Z$G}~Nc!Gm z5muIA&_<2C*6Hayk9%>08&wRbo@V+zQw|6kEp=UK2QW zTdAQ`WAC&P;gNaYMFp2XUsk6F`NeCiNt7#os$75Msg)(R@Lgr4<5R27JTuqMN8=V~ zDY`#jTX^__@HYKi->kliopq-@@*W0F`u+^^o)Z9b&3otRtbTla;@K|R3&obo>}8#Q z$X`#Tn)DV9KI}QgJ%Srckf89aD-{|!5>@v;rAG|Wv5%E$tNx_r-7Z?+=H30)v%kaB zII{bqRak`8!}}Hs$0nNz?|mYy%KzS*X>Q+h$#}>8h1FZCsrPpER<9kO+*Us(bou+6 z@Xw~BFR#U%Z_XJvl4`u*Q!t`^>Kfi&G_v??M8@V}p^95?jiUofyH96+n%o=t=8VRD z5$a^|-pg2{m1Rx(3d7@eVSiK-r#+h9*q^*IerVHQgTLpU96QJNbU&|@nX581{}ia8 zVZ9}Y@Ln^*D(&yRh}kU;%?G&QXDW%#{ejy*1#Zz8isK|z_G?U4SM2)w=qV@X@yOAm zJRAD6jDQ<-E2n1r8skq-r~J;o^r_$9KV$0Q=r7^?Yry~0ak{&FD3;KkZy<1amVW7P-@60Qz9srK}vzYEQ(ygnbkHE-kfBY&wAcK`Wbc^RN^cv$vm z)!$R_XLFu!l>g=IxuKu-%q3>h&L~gD^SJF!rqr?jrn-(B=JkC2m`*-2o%o`Ue_QPDS`XR8ltR2|bWx>_hzAknFBDI(OKDTJ%D?{~WPbZl1LwY@QXq+ z_>xiOz@O{Rgl7l1NR&*t+1lYds~;YF(|)AZqUO$mly64IVVk>mWj4M3h&S1_obpWg z@fm~8M~;pA{vo|Rob&dsRHtF4-hty64@$dr*w@^#SG#N2YS(e;Soyt6-?T8~0L_6} zzq~q@bs?v!G(!BkZ-ln~JqtCVE%mv#US=x`PoI)C{O0S_nfWx`-a)POq(J$#pj9VL zz}WhcnDBJe$^riNaQEelyFSeYlz2y`_|5;Qq2>*? zq6>xQPmkWyb4C|R&MJE8s(SD3I?DR54t`8ka7aD9EnVZr6IP98LhNYTzD=de3~v2L zKL1V4`4F-Fh}A!06#S)okCV3*bQ+xgC+RD$B3J*^7MCrH`?yi%NyFv+5emK zQzAmjXqa31D!9(c>6_lLtNW{gH0M3LyQpF-UtY-fn~Z0!|NC=Ux8yJJmAQNWOI&A| z8=9K%$Xa`Z=yGD%rdOzlWuI$e>&;thSX8mNHMj5cjEpSC?~s|jlC^F+B7yuOY;?v;F!kU?nEnqha`65^tid*N4&COkWv<)bafy>`JuBWW z+0tuoJ*H>wgxOmew-(KPbk1fC&Lam(f(f6a+0aPNn3klA1x%qiE=C8-T63u zCr7V|?)02Tg~eZhu1MxY6J)=Wmxzg^moYPugHfCqHL)CvR_WpH56vIJUn}^We&TYI%OWiHZNB zgT+NR*~CO#g_(wW{|h*jcqY2L5)#!SgBX%k^w!p8=Ef ztrX5Jb7Ag?x*TCQ-Sqb1>xcsl{8Y<6;h_4%L0oE6!r-m3*%M~mShYy;puH|k73$$C zx&uYxbH1Y}7UnU_?%v#(pnDf?GxVVl-+742-w}`v-cMmJSt#0tNh&Gl_5NJx^b}`Y zMi1K#!o#E9@GxQNxizS9NXSqqt$p)R&&DmWZ5)Sv{ZXPr&I5+~pZ^`UdNY0Q0{QO- z;;Z0(og?RWm;Pxw9CLo+@xFWMZ`K(q1>wrfy5Cl5{K>xk-0hp5!sXzrG}bJ0SvRA# z>{T!!^X02x*BS>@nx6nRa|Eje!6E~!iD&OTOGISJGvy_cUuF~UQ~uxuaf~+h?=@_; z3bE+}*_RcpK#cApkN%LxF6o${NJr)9y|CI_F^3IyDsZtAT2=sJbNnp z_O9qXHcbu>4sF)h-^Z!$(X%o1d8_=CUH)t$y2PjN^b51**sm>{c~aA!RrG-i;(ooQ z-_y&W@$z*wj@D2Vy4HjR>NG`4#~e^+7Xj+mpusF}sid%%6-UTsv{!Q9!4M_s=#kB6 zoZ`WJk2j6`y2pAck9R)0aZ|<=hYr?cSz>;>46Y8Sk!V1k&o4In&OOo;W z7kB*LZ!N{4%d2?GIFD0J_i;*dvgWz* z!+$RvdQH~ivbPeazjvGS{e12)cFab6WcQ`uma{9Ah`DX#Cj@?B`#8;#%ic^Z`Pr?) z=ceJp{?Gyl0T-}4rD=6TvMGB#eyp*Q7-y8$ek}h|NqCj6_^8mYgm$6+3>+4}xpsfo zPV9bOIu=v-QK>(>p>X!7+#|0QF3FkR*nI=RgTgtJdVWIh|H@RQoVh&TzbohxZ7<2o ze`fs22`%27mecXMz^AM^d;U-hS$bTwYu&i%rwAcp*SGFK_CoU?+P{3mkzHSv79Vrp zk9AwHf;*1t9d7zV`W#(hJ=Oc!ZostfqLj;NYfFhLy34sfnQE8EYJsISmu`}8d}22? zBuMei1n~RfZb8=<7et+s-xxlvXU%PrZ6*Do-Qeoo$C1iLTe+DOvg0gmJHH_ZHv|8( zoclSY`|{e+oLJ~%eUGm8>Tv(6Jx$xUi+8PmbGE76>*7;d*=uIaZUfvmAL0)Yk7)R% zW;x0ExP~=)Jl4Le+|Q{v+p+G0{F~pq6Dc(RL62{Qs2D9T@@$c6w^_YP`?|=+^c>0l``igt zJt>;)D0Qp9BzkB#7an2?&%A%p|HHOfr`ziTx$3fouQi{!RVHb!NVRe9y>luqYOTc> zcj7>9uk&qO44@Qn$6fQji)^)!RFyk{{ zTey}G&+(c6mGWoiic){pJY)C0^oYE{)FakKRDq3AwP8ESb48=yk7VO`3<>u<1XmT! 
zFxOb{atE;vgU-jb+gA!?q5=OZnHd;gdZA|A7E`^1AsO9J)p0UmfeHOaFoDDgJe>3F0W`B- zdTBEE^oeww5k=}X7;q>P`UFP#tpTJV=TN8pzN@iwMa{WC({YW27{%);?7qZD2=)0T z)Z%?GgJtn9gnC;0gBr1}Sqt?c4^5@xc<&=E(bEzma4|fvG(&G#{kSczwre9ycV18r z{<^#bsqvDJ>THp(?5!9UP6@2E5S-;Ri^E{1cIPqH9bD9+8$c_@`FTN9{7-Yr0=SuG z07E!N_i(rq64N1_Yz#-+RInp~8O!B?^yqg>?|3;0-*|lU{ zyEnM3nH4=bSLBaY;Z(l}EDkY}1f462UC2225o9%@{(tww#=yb%48UZG<^hS>5$BbS|g{%Z2{O|I z2R^gb-nArky5|I$JpgfhrjD;p}u~G%nvgzIh8oVW=Wyz zR=-{Y?A&K)?1D+z;i+Intt-sD-Vx-_a#XQBpoo7wiooR3X9w9VG4$LNdTyZHY=%Hc zRg0vJ^`YynVs0gy=ZX^HmkuEVdWi|-iIJ!wF1LjLHG9_AYp>*(e{oW5u6 z*}I6GFVaBlPIo98ocGdEZ^`rZZ>EgC5X1RQrY07$Ol=DU8(RxbQ*18+$6m+=mvbkQ zQVDO#?w9JnmQSIsta0NXw!}+~Kx9lQ2kOm#e}UhU{hl`IxTwj$a8qgMXI1_zHW#;b zE&<{;?4-N=ToI1Ih8#RL2nDY_ldsfGo({>uu={YM)cH8ncX+Yz_)ptv+tg5VcAsq{ zYk}S8!MJ(q>hCw+nItYm=-dE5l+Wa5c_|9;vqjP#=TMuNmzq7Fxqj|jD$Ul`llYDt z;L9N!5;@d(z(IP{d6?f0Vc64p2gm#PFdb0jd_K)*zROhlKo)Lz*2A%y@keot-;gM z8B~VhRqjOC+3?(a19k+>R>$p^-cfD!zu7xyu6VI%Uw5yms&3t0WBuUXim=+_?rwq) zWPTu@Nt`RXjT=^O8PCBrzMT%bE!C1atJ25r%e-{Hm))oH!i;8HG-!kk8Jo%-2z)p^ z$Cr@m5+Ij1!Wu*LdmEL09?@#!_ov3K%K-D*F8!xM2sk$(!r`6y<1^L?A>VP&U z8fCC|NkrHX>bA3x|RZI@1Rh6hH``@G%=4sbnziz^IgZzb)WUg4Zj04E3k$1 z&N-LSu0pl<>n)DMl$n*vcb$PfMnfjro2*59j6}3+={^OGhGsBQh#ib%VNq*%T$G3! zx!2Gp#|Uk5mb!bR?(5Kj_5wy5DFG}y9z$SRnnwulbaX}hHZB0>@@R^=6-}|0hTc)5 zFU)MI+8$KBc$bKH+V*L)D!Sxn&y4V{w?Cm#A?qoOQ37vOOWE5YFfU$3R@oz~(3Z&` zrN67D-FeJKbP%heYrR?cJlp+_Z0z~^&}`9irR?pXQr<$MP_?+|kd;1;whOsu5~DX{ zw8MrVK;MPwThoCW)jgpxOcY)3go!rh!@T+k`>0qA?0u5a&OsaP9I#89j~H}qhpxe1 zk2k}eXor%%BE>??lXF+8gPd%6ZVyr4_3mipiYK#BHgw#$qudEWCU;_1RK2==jQ%xz zndjM2V@(t15Iz%jF%Z-WhNQ3Y=&%PJ=2$}<{gN674xKx(&G;}-7Lx&)oHu!)`&9j< z8rq9SFLdy!@eVYa=_k7LL&nVPGJt7U)%1xRd!)t3%>t^AfbN{g7pTGbI@KT|#y#7GAI|=kd1664lp6u}yHFi%i)h)cqr-`+l-6NgS&W$n-QMour@>rBR?({1< zu_by(&_hRKW-4>ge&6plTjxy7aMB|~eQadLVb^p1#>_{i8CqD5RJZQ)x1`+8JotOE zzUS_F{kOAH*gZ7RwItc^NX_-so7X*K_l&zVMm!tZ;_&cQuTBT| zq}6E7b_8^KO!B!nyGJRu9_GiCF0367HV)B4gJ~}f))X49I-BS1@%#TO^V( zTf=U5-p6dEZ~aIk(0LNb#5vq7dXkw4!a0v9=uti9JDaD3CTRL#08%#3;3Axlu&+8+ z_W!^)V`s{L;+2r3H)iG_192zD#F4{zAZ797$tCBJ+iPLuPL;+b=lzRJZ&`AlQCdNe zamdIb`lKCsyqSU^SRB8+z!+r5N@xy$>|)HcMyd>-BK-@nzHxcpO%j%Y)2N&! 
zq*YC5Lz$usCIR4@#kksQTFnuWlE7n^q+;F41@!!_K|_at}2K{ae-f>yEyuy6gP zPLj5}zn5_cxdq0uf7ZST;Kys>yt&~c;A!s)?IhVO&CDqehJnKxhoHHORyU6K2wd_& zdN6=%j(7r7vuQQZ?2np5sCqDnN5de8aBg55Kt8s#TDUDgzynU_!S}TF7h!01e=vj{ z14HOE_rS>bWgFSM7<)=^)xC>14ww4L3QenJ%v87 zT8|VUlD0KAXyn0m7Z~kXs_1Ho&3|DSr=eDKXD@2-1u46$sSU1!_P8+9TM4LfFKQfFdKCv+ zH0&SY8QR5{_b^5(U&isqA#+cNw=ul2C+aCVH^~)dGiSKB)FPF}CP9Zvl0mw6utZ## zHf@>cIuMNR2)}`(?TNbwkK3U@DEM_aq2fUesGsvh24dRg1}S|BaHvy(e!8^r#d*aP^=0 zA_6_)k)UHPFJ)m)4$JARfRBy%Zy-+^()MnPNIgW2W0UEeRryMtq$7u0vb1_D&O<>IO?zFqy zlupvL&F3N&Y~;}so$MYgr3ym(;_pLqd0E$--;?;tZ-eJ+jL<>*?ReGqMe1NG4Ud!k z@3b*9>OgMahnw5FCw`h5Cyjh<9J>a$ESzi~dOtS0X6XIcA034JWV`F$lA7?@?|GZD z{MCNqGrl4@`xy?HuZFUN;NzCbP1S=inXLC*_M-baI0(OpFtpu<<+RQj+4jbA8=xN6 zN7$r=U7E|@8F+$ejLV z*N3r!BN2Be2=tDeLNglPqiu5zChDj+ysm%T1*-O-clTuVa`d{s^O~ztH;KRJ7O2`z zLKk%7oq7POcJeByT6=WBUzLq{&}$r9cprROT#@y;+wpNlR(j%cqk&ajYeJ;4de}PK z2T)QL4QdTHOEjbR?$6G`!S%!D#v0aIq;KVszLh}-D|Cv=xVq+ zk6!W$ud0Dj!i2)L`}p=YcQB2b-_EdmY=ciUCeT3VU)8F*sFK4Son$PbS>X!UmDV)2v z{1BSf&Ow&P(?Dl_DB~faUg-c9HoshuIUI%#INuq)G}u17?0eT_iAc@vMb~T5d+`rvkrvKEIqd24_Ja7^Ba0vjmwy4?Z+-SI(3yLT4Yg&G zC&dL^XY^{_`Z-1>e9iy`kLEw&0qiZ^4t3 zHTbiz3tC|dl?LMkFr4w?4CZHlJnS2{l@Q{};!`~6(>wxEGPA1*md{|KCg4HSug zMUAuBxoz_T2*lT%W(BPN)U(ZVc^Z0?f`QT0=*8m%S?y*(8y#1_%naRql<%YS{U&=78e(C-he|2`oG?Kg;cC6=lC zt^9J(e&VK6R#!+Btea^miJm4D!Kt>hhA-E<=n4rbjlbp6!lDdn_RDhn+;mxeD|K03 zSd6R^#@K&{@H7!4oNFRftRO@Cx^-dIH;s#=+rhnm?qO=1GUVonEZ@1HW9x17o+(AE zHefF0H;<}uzp4nVCU6#>skR8;L_ay+ylBEQm>~&5CIpKXzp({eq8+c;mD8-j6Dt&$kI$5HM;-X<9ya4S zLY4xhyF}}M7HO)2V-v*Z%+lg3f~j}S$JibP+&2#e|4ewu@x~rEk4j%j_gOrogHyd3SGwp^QXo!aV_*#$?w@7@3Xn!aG=xJTbrUd}t51II&^ zH~H?nAbwWEfe+uNx8h)K@x?z1B*$@XYzY-X+X)tyybivkk8@xvsR$lnTf$OM^pIG?tX3A(&Ho;4h~+6+3Je|!TEzd1HpcQ)(*!#I&zJPg(U8Mv87y^iojpuEuXBm2sS_DzPU?GnDyAf^GN0I`6zH5C%(v$BBv1_ z{a^l&ME<}y{;xk)3i!w$@Mi@v4~ZTV9x6PNiXdbhTfz+^Yb*+6J_`zo2IH`*o@vBd zCeeeVbXbu4Zq$R_tg)(Y&3yiRVBZyqu=mp}#p2S&7g3+qUu930*V$a0>eu;mNP4JQ z!&C-mA)MAHdUk(>?nqNp?epQ@a~^@Net{JoO^tf0rYo{jm6u=2yP}j>eKI^dZcHcm zg;5yG`RFoHyR@RjTes&PZP}@`ULtsJ{XNqFpHVl~%e*%dJwYVxx9nB5wiic!+Rb4& z#;Mya_C{rNd^z`B=lkLMvBd4|M~@zmYfEtF-W=9`LNC9PRx_r2E&Do~Z#K~0 zcz%pJEz!zaF2p$GG4(s><4|vSsV>>ZImYQ#>?$Fq(vNsgAAh#TN6vwv@0SfQ?EYL2 z&rUKm-?8*^l1{Wlg-1|yy{C5~&Gua*?xTk4?k9hKy+~WP_w`w~TgSrjB5Qt)88&YC zKwGP^=9;|E=Z#h!-q+*BtVb21#WX3OUN~00xfPg0YCThNVyi6GaHnwH2Of)=iX~Ie zNm27RZXZ0XZGfXR{1uDDiihFj)MgHx8&UKv3S0sD-2I+pZ20S+n`~{=eWQM8*)2PoV35SY5HP}G!sW#zpOK;v%^zDUMBhX>c4kPG@8cT@Z9_47xlMUhHaU_ zQ|W77QF*@GSM-+p&DXqtA-T9Rc6By8iyrTEb-rfeRK*wZ9p(WdVfV^BF5L~QX zxiiF$aR-Sy{1FXZzJfbrVzJ&1zVp%IcuLP^wMOqbBZc2L2F^G)ZS*~6`u0}nr7wH& zIXZzW)$}%Bia!|iyTYJKYtR(;!a&>koo98rg<-aL=nD^Bb-%W6Y56YaUWT=usrRfN zF2e@a+RKDiZaBO1`3Z6N%tKe{GTz^0@@F3E-i&qUuwRvZaMyLuE{g3+y*YPNg7zen z?Q%H#08Nx;>_#r4t9;R@Je1usU9(ncZbYjBL4n&>$@?zi5yOn`=64ZGrY;bl;U!uO#|)S-RCUBJllUheBJx+p&NU7 ztXDPGPC2c;)Gpuu2$okb^R6qtI7J(l9n(3ZIwiTf(CvMGUC|D4_g$pPhp&s}H{OxS zGEdN4-J8E=!^TTbE1c8PX7)Nat6{Rro(TVXA}l3!1V`|AF>ZG1kYj&NMPBBNXa4q_ z=>vz}1=uSp%U-!NI4djuFG;3lL0tQ2p0`E9uUozMzw{afY`PnW{hPq=vM}X2O)vcX zd%m;&Ws=`z6~}8Tjzi|YW0a~M-7dWPz)p*^+a11S{T~2UK&ijZW|y;?mU#<%Id4XJ zn{HL*-5rVYikhj&|t&d0}GMz1IGD zIO}5jg=55xs(wa~hV%G3sNGBYpDpjoK%SI} zVI;+IWSrx$am0_NrlTSzwi&s3wB<4zN9*?|PPkmo{-PstKiW_Is7J#fneQfw%i~WM zJ^nNtND-ALB;Ft-7MT(s?TD?Gaw71t2m^7b0d zc$F>eBjYxXvW+#U8IAYF5e=VOE2DSAV0}HcqCVz#FzX}0tv<5btNO@k=UyK{vOe;A zyVXaKE%%WwW_{e%!J|Hs11$9s!1dwpYUcW&s&IDK4UB;zxM3jn73%x9haR&FZxxrj z)U%>}i(R;xCcD%njJLm{*l@VO9v3b8A-(YugYfd7#US{QIK3Q` zxBlMj`ba(m9u?JTye~M#v0wBrgijIAogu~|yo=5v+$-)QMjo?xP~1zGi_zn^X48R= z3p3h~Mf?;=le6_k7^cm}rDBB3TelEHV}m#;O=@Ux--VmA0MTc;H>0_!twWtP?dVSB 
zT5)a71Udd}?l%jnr(`zz7&bzdHxo(F6>0zF@{^eFVK zWHwFD74815^t>v|r01@BGZ9=c#QuJ@;RiPS2-iaC$y{-7(Sg8{ZoA z+&!K`&nZU<9?MaJNy@68mpn=UsuBqy==xXFzYjS~x|X&byZ-$K7(BGYwzH@E=R{B2QZqoPV zn~$BoW3|Uh-^UFMPhbBAvb;tmX9m(28TD?=KAcexT;JN8i^Uj{k4b_rooI z8GYAp{gvqZhsvYV_vc?9fxcJGKMH++v9D?RR!FY@+vxjRKoLRTqc=K1rwVm-ce{{_7_x^!@YV{r1NbiKEB+owYnI-fwcuU;@9fc)vjh_tWG3ZrjP3=cpy=@qP-lnel#;_s1#TuVe|2 z_mk!ga9FO_mF-PQ_eJ?=j`%zJ#zHH*E{i_X=(nEw+mc`If$f+{-@>&+o;5bQ_7(|w z%IS&UQ=mpZ@=(f!>X2upjsG9~Ia~ZmH(zAJ+>oaX9_*E-;uCmfj^!1%EQy~_=92pe8F?2CUWHij9L~!h!&v@+m zM0`9(4B(VP=h?+1MP5lg&!@kNV=50+;Xgj_7AdsHhI8We!|?(Y=FItmu<4{p7vVLT z&d;8EmJLHcYLJ3w;N5x5u7+5#5OeA?54jq!3=ya}fUKGSz~9dj5ab{SogJ^Aof6er z(8mmUZ@Dwug*e9`x9nZb?+J@*!l7kS_#EVI-qqtT#Cw(nsyEIn0Z8F%pAt8w<3H&8Os$-VOf0VZA2coHYRZx(nz!S9dRN0c%?yPl!y0WBezUe}~wUmAO%{brM@M*ND z&d(NE8Xqwl9lY`W5=-M)qY=|mLE}q(EsaBr#udDA`9+q-0;BOA-Z=a`OXG<~;}g6w zqtMcb`W!V*z`6~_-eT_0=qWs3m+XC(s7wkiBksD1eMlPDKzH)@|}MCyuu9;fZAm!dK<(lZ9Ll};p4{|XHXj75zMchG&vQSkdY zVp|U^p?|zF{}|BkI3ohGauU`I_Q?J`Rr?+@#vs4P3}Hps zN-n&@J+3=q*-*LA?-|ao)FW@tkhSCGV3h+GF;v!11gs4_ssxufaDL&Dq&UHpDgV}o zTw77FD)yldU@_jwm@DeU1W!acP=+^$Vp+hhl%uAQ?MEc%q!6g#uTV;ck3WbTnJwR? zb+Ml0t@wyDY`qukM2#OP_Z0RRzKXS9HwAVf6xI;47gq?S@Jj@##V7)-`M8i|oRl1s zJx~pj5ydD@Tg>=Q*j*C9#HpT4;@Y57rSrZlJJN4+zpIW0fxB`M86C*QUjnc!%Ypw!F| za3f|2=w`Kt)AIyOax31tCh`Pq2za1jLXoJ9mQ0mtsi%bbO1#&*{G$PBpa=VG3uwp4 zp!A8DCII@6F9!lGrn}#PtGVF3<7?6^6-u+XO+|}$V+m$p>*r>d;yGflRAgS!)=(PF z^t{LLEj}g;?*B_^K43#BEF;rXjA?Xcr_`taHS1fwxjKHt{H;f+Z*~2v=?Pq$sBcyE zWxR0$*K^*B|0n8OrPgyeFi6o>mul5m(f5JvuM_^1f_6b>IOXxs!g3?2vyz*0EeX01T{m4HNe`vlF3xTaI|0v`^;tBG zwibp(HV|G;KB_tJWj=mCy4By3eMEXCjV|l&QJda6F%Z7;F@2b%#Gl}$(ojMQbNhQ% z9)BB`giGGas`k*js6?$;+Aa8MB+Oj_yp$|`nr(Z-Tvk5ip*s4K22?h&q`!a0mxLcR zm$T#V=5p|>=}Y+xULf{>RnAMu^$;R(OX1G6k?+ygM}|zJL4niUWh<5DRg;&n6yP_D zKo#lhRJ1Q(PKX_i<~t33&*Z<$K%Oo{R9{R=q)(sX7<#ReL@{DkvfJu*$J;f zhu{oSDHSSSypd9~ggnR?*gxMQSnv&-GEELHf~9{0E8Fvq*y3O%JlOA zy&uhiXYBIVlORCnA3OPu+JM@Zv@%7#EB|mOE&gz{L%0vz-|L^E$mqgQtEK1#bwr^n*d%kda%AU`8>4Oza88K^3;uRawj z=>P|s!MvZh%HIw&|7t5*iO9|GuvfE$@Dy6Q_r{o}Nx&&&vZAzrSljpC=dl8`y%JL$W_j+U{ z-x;S8tit_Ly|u78ZHV{aDIcb;N?hsWzEzd#CiW^e)~ZjK5a3*P=CNm|&& z5`8Z289VJmYV?nQE>E2xx*#I;;=TNrBR|rr6IYTiqL{1Eu|MuK|?OM>oBdYQ_eHGCk*hGls`|Y~ zn%;`^aH@m7k*)oNd#XpELy0`C1%bJy-^Ia?mTppdDh1jB_+tM{*H^#3DLH2N>}Y zuxo5Vv@wvvm`;uZiGv5=1khsdJ`qpca3cjWu!3g|0neR*p#T3lIa~Po{-3!ykN^!Q zwi>?HB)fdbEoR53u<#$moZWA2+ll2^JU5K`B;b45lo=ytr zNCXmK%bvr*#{NTy;&tz$QVCiYDU_I!$|fubkuZ@fsW5IHKd{MNB_nlS z<#8ixF;|!Zz+>{)$4&x#oQ0+io&xtKqgl^hME(QPm221qG#L`gW549k_rA=LX{gCm zRP8${yp`({BO2)wdS~zNIJ=^QUUjnpTR*6F)20!E%`K;e%%p$3EBwEc3-1AD*ecLD z`ml|kbILj@rg#G8pNydmH3Slb&!I}8bZcv=?B&iB=pa}KX6)>bngYZ~)X%hS7we2TOW?tvwscwXXq$;j6%U*XztU~}xJMvweU z*j^Ayu?4C1z9y#N>j)uSi%DMAa^bqYI|;a{@8nk%$Fjit?A45U9r0 z6MTm_@W)WWhUq8-|1;2k4dy;`(&6Svz~&KaaV-_GR-_Mp0o0HAJp=oYm&n*~oHZXT zw!6&sfxys)a)Snb4Fzo(IQ%#;ld6^6TGgWX7N(efr1?(Wlt(bRyT1FSCXf%;6MGUucJ4__O6axAQf$MSaG0qb<5_lTZVz{gqs! zM%Aa5Jw6+m1TRaMV6S{*Xug1$giv3;fKNK{`naIVaTPFnQiAhm^pNgE8?4=*KyiL@ zVVs}Hh)Io~#Tn#A!8e^IC!PF4oWdUb@NevM^oi_lQr}aw!|0r@HC!T*Tk%lNj}}Cl zKZLdZNhkvtjp;CP%sl*qKPK9Q3JiEnN4NaI&+rV07y+jiD4^kiq3m6X8=WJC zYQ69bo-q#0oKn~? 
z&e>fKULq0{r5qknh`%S&F{=M$VpEjt90zif^KzI+X3wvKU3wW=(_+9mK9Q!NY6YVe zy!;fu38+>GsGearVc#@VLD{ahS>hGa;HrA}OQBgdBYxu8;d`+azRTwle6y|3v=1ob z^^FKBrwOWYYdKLGyC8+{wH#f#q;Oxc6K)50{qUX0GpVFLmp7~TtbJ}_4eJRbqwe!sSWk2IQ z^G_c%f1Ww%@Bh{3ncvPort{2*_Os4256nwD&%9;!f8;##S3m!K|JcrdXa2nN-}lnb zfB*C!gjWN?i*OquwwJ0-eoi^S-<)UYia`A1TuODRlhC)i)Wy+w6y7LsgN+(KP^^{O zr0|18rFUbHpZ`*yww;sAt@DHiRq94E!NLO7{hVCR_M@S+3`HYGcMI%Pz=g9t6=V%) zY~R9Af%!;Js>V`afTkM8;UOFChzA;4KUeG%@tjDrmK|D-D=2kE@8E}utW5|ZyA#{q z);G zN`nuSem9SS^c9W(+ST|(+)0)8|4uQVWjCDQ1I~d`VFLetfTb$ zSTQu5mrSp}g*o1ii^Td=z~DP;ywUh;ecq{V?gux&HRFv&-wKx6Q6;}0zeUmw^w!4+ zkBacra-9_Zid2^USFMz}qO% zHw7N^b2U=0Rc)7Q|IXjWhX(q&S%^pVlB$7`B+bw=>8D)^PiTt;6Xvy=DSA@iWU$xE^lTQx4;RCSFL&^UL!w~-Z}4~IS@B&jip{GE zlvC&^wp|9p-eE%*T@}w`o*dU>bNzZr;J-ZCpj)gj^+^<4yA!%{^tS=o2+11 zWrAVioHQ^Luv_MsV94`d!NKr;;!MN-w9a>^7(e`Z>(u#PoH}3UoK!gE8F07}#%%%) zHe=iYjo`a=MasDIUr3Itcz5_!1csZ}4_5SxnCtC0h4$3=^d3wnJ7pV>Q(%YRGM7%R zqf%No(IGEI;bZ8g8H&;65jw2lGQ|q(8m6uguwG?_^%4PVuoqqsA5c|((M-nCr7W0Nfv9^Wiafye95Tj314G&J>xVe1YPn5{!T$9o3GScjXLI@~qM z;TEd^-s7--AC+dVq0@BE+)T$4=m=8yoz`>+v$V19jeZ?W&~M|eT)i7GBp3G_UK4h5_GT0Vo{TWEjXf`q?Bk za4v8?9aVIGGJYny{4USpp6;!jM-MkfpOMLh3o1kHp2L&%cj>N-8X1YO_M?9WM33cK z4IZ+C-3Du~5*KOE-G+VG=@d8i5&hs{@^~OmboYX8I3{7ZG~hLIV?2q&ad0Z;dwUcm6*#XKXTZD`TjoW&CI;Ju^!)(5>_cdu0K83av0(~E z&#O<~k37P93jU?_cptE=Cog3^CpWX6t8hJEOo#QTVm(dnxBQFt`<-cx;cD*plhc}> zM-{-0DL9Xh@UIu9#EpwD0%6aX&TaC5-k7E;IYnhnMNY%`v~ zM=s15+1AFMe3=H0t&V-z#3+xjD&V!TnXw;Y-^Vt^HpJG)9*;dL_DegjvYB`tZ7-Ny zb>v4T)1X$?-t{yJ;R$(|osMuyP*&-BfpJ?>)`N6l0Ft&y_AZq|(>n8(0Pw}E!PA9#@y&)wO>k8$7mL^_0jmqt>&Rl=7|_XQ&#(W`uYKK)Fe z{z^FlgkS6Od4G~ZxK`u7CVhG6a9JNQ0+R+N%w9<d}WczdIXkbzfK7U<2ht6~HPuAi!afS`MO_8RuE<06yjDwDb*r1mMN2l1^ z5l(jYM6T>RjF3}2FXp!|F|aX=?3^v1%IN^VjML*%s&$Y*rL|EE=l%yd-S|dxnj~x2 zVbY|G+A}bxrWR(bo_#+#IVxHUH1M;;JTrK-0jl3Hl0FjAf1RvPv9Vq+8q30?U2PpZ zksi?B82gchv0~`S^WKX08zXdd@DbltqWD4~8jAQ6DYV{+@qG9+2>9>dR6y}i;hOq{ zc~B(lGx-VJv;vc^(4JN!922OJS2f2D&YY9fgMW{(8+rmBv2p@3+NN3lo^5?@Q2~b9 z`M<;ITR^&4z88;h#q%-?{>er(lnb+&dR0;~`FW@DEP2bLZQ|vYrJD>k5ac&))2LqzN7GB7a*vl9YlQTa%}3 zICru{9wv)Ug|*Ygb&MPzLs#DCngD;o!*3^%f%Y!F&)j(O6y&$v0e)Xug97)Jp`7r0 zC}4h}!`3v;tNNQX6t?A=2+`k(?A0wc5;u^|LOVS&&_QQEyiL_|Om+<{k57-j{)C6K z`X@X&c79Tew8wVO;lWA@nN0KAlwJ1Haqh>rgGrBi3yNGoB15R{Vw$<9d;et zO}1W!@l@~^{RtvdifK^@O&%7_D;10qGta-PlKAG8sZMU{`F9l}+hu@ke`v+&ibjb% zK{skCuD^<4gqYNdA!FfRRyE2%B!ymXmHax58Eb!g8Aekk>+@ccQlA%(=fOnBb9b1~ znTXfaS6hLfrJF|N=TQNH@fUk!Ya(H;k26v9n96zR!6oqT_<+ke0*TJAmXkrWnR0bC z7Y{J8sdxY>^cEW8h4;Co2Vc&x(8|(CJdpzT^)30zUq2?ecWB8a?&|{X{A$lultC1; z{pc18(DgBS^yvc2r#aA#@BUo4|IYf&H5Q=1Nx#?$Tkw|E2(YaB4=1YP`W4Dt(zp5z zC!tN4-2}avtO2pzN^=IQxa%2CXEJEm?j&&;bk&$S?4bY!B1%DW%R)`?iR?+uQrp4% z@cc@!2sJX!VV#yq{$D8Ug_Z~N5I z$o#U5CdjL8h^?6v@NK#afM`N~5ineX0zCQ9bP^9?7d6e2%{J_e*|(FpGA zLzjgS7&rx)Xlh{KZEd*=apCMZ#hDcVEsRj~@xzmz!s8Gh3UI~`-H969VW$+n#?I|o zP{+WwXp7@|hdQLtJb^w1?4F6qJW!)Z)d)XR%fp164(*Vo(PFyY? 
zgR;{ma{aH!dWj%hdw}N0;e^04u2Z=LbiO@|T~7r>+4P#?GbQ=G}LZ>BdA2q)8bR3ME`X5Hy8l5|nnm*h<< zU3?*hw&J=9i$UIOpkyn>InunnDjLpo#!;L{VdQMYbdH82aD7A3)Zn}bZv%0*Je+xE z;FN}WTm~Pev|WI+TxChI)Qkvj!I1BKA!*mp2a^{J7X&aYYxVLzfb zR$qW?WwWPStj_gM8jRSYWcl=a96Y>>$@VpPjJZP!%|`tp7?+T;cOeGN$jziDb}1~k zq?622+f2S3%ZSg?LaH)dNvgeH?#Y)Y2kSDV>F1DQ0E9jWa`jB^$RL zn7aq_^z0)Jr|mkEP8O=!=zo~U4&e(jka$Mo$P|>=Xw#E6e5sSd*2U(3JKMVXmDtRu zQ#p?3z&WyZooX1ArW*#6A2-Bml;5Rk5{z7|x=azjWpWcDK7U-ZuseR71y5BxF5gM0 zD(l%xnK#JeTgWVjFO9$`Yxek~{3_TcJQ8yIfTJ*5kb8rNiwpu|FxXq$h^XY|&-Frf z-J{%~hp`J)Z4bNjQO+D`kNKm5M;oz+IYNRms{qgX1){r!Jt?{k*-6Jfi%@ODr|gz* zlcJj<*x}0{H*=&=FU!}t{LcFMQai^K)qIZ(Rgt!-W<3|ytg*00iB50$-Ue=O1KwNw zF`^I;D==<%<*$#veF)Pdh^qt9m+KNucs06i2Tp!k4Ud~0Dr^07qGy=#nV>`>@5o=p z9{U4_`YwyIi3r9WTI)!ItY#x0!M8ZWK^cZ`MPH+XM}9e^T3|LQzN4SuzK7f=_;C8{ za$=z3=EzzUniF{{TAl?gLWewqoZL|*vVe`aEzYxm+0l!InVW`rCI(g*pT|gG!09ng zKL)|jt(!pm*zH0-7LIWGucezPD+7D@EvJG9Ebc8NQ5Ag_30lM{tnC=g_r&usMYqKh z7H%9~FNOXI5RYzeNc!JVHFCyY;X?0EhYgXKg;LdYz0yUDOiy&d4$Siq#VB)Tzr)Jq zM7;mQqnjntdq;CIQNZpP&3oERFV~RK@l>6oWNfO=;h(@Ac$57p{7NnaB=G+QQ$KaY zA+(y<_O#}QuzBx2qB$z_bW&7gCR&L9IT!u|{UR5>d{tW;Ycq}<)$|MlFT`HL!-3~0gM~Hy zW67@{e>;q9>wh_(^tc=1KPR4ayq*?M`oqZoSUl-}VZ8lm{$#xU+PmXv@%BrMXV!T8 zxyD0Fy#2Grvy^!I>HKMP@%DG5G^fSe%PGxi@%FhX&1v!WZBv?$7H_ZWSHYB9DcYAx zMa=C3;hbBpYE?=_T}!#5A;Ska{Kqx01A3lP@m)*p3&mbuqHXf-8QY`8yRk%=L$Gb#>^f3-KB0Idh$4>G#=tHw6WP0Nh{@`6ZHURt= zTAa}G2W-ivmK&kvI(>`_T3pc5vQX|e_EK+yH2pCfQ7Xs^Ag}jqI5lX$+S`D$nQ%w3 z_u%-O`7ExhyuGZ%yKG!2db^3Y-i8Tf-icY`b)WYWdPUeP@cc3F#O(1A{yetDd@jvE z*5{p=IpHtS0&^S~R|@7L>wBD!JF%NI{k8<=qNe_@ivA_up_$V31lmU&GNxVkwVGRn zAeMs-E&MmYe>40SVU~afJHIv5 z*yqjMk4xSKQT4HavnTkp{YvfzFsYP^!!42FqE+=rOjM~<{M1shZ-@=Vc;G<}o(MyY zOmeX2Ceg*Bn@~qF9|OJe$Ctokm-*PmM)Sw#$iYfCs1*Lb#s$F|Hkg|Ha}>buS(w|w zks4R#1H@_Sxe5$Vh=*WZ3;#K2vH82ef9?L_{KoGmKljspG?Pc^NYs4#IX#ua6X>dy zwdFn0ay{myDv!OEmY+q@zT)uI_Hy}kzk?OMKx;Y`WyD!vZ`l~xTU~!RQ{-kzyb|Yu zcxl4IMn`&>>pTL25Cz{sV(_0N{dCQr|DY@26H)}PbxRL&X}?jwD~J7U2y$uA$Cbjt z9%p1&OE8AM@%KWty`^@#c5No81P(<*g*|mbDfeZSo>`!jFEvGSlq6a&o9boKbi75a zR4=Kc@O5#?b!k|9mLw}4b=r5s2XnP+G8GK_@q&KwBhfHOvxDY6JSnu31e0A3Ryg=M zcKi422g*f9n487~QS>o|`PV>ZW8VI0}}N#F+sft&k@ zT(K=fjtu+K!F8XmpkDjz`+Zt(58jw?;Qaz;^ctRzmfIasH)4pHn;+C{X+|p~z5D~@ zilD@(72joo>Ji=Fka{0`Zk(gbWdX|LWf<1TCg`3h$ac6GA;iMM`plf48idC z0K^^e6CrNij|SrQf0-k08yrL26WPoop72TZ>w6pD59If8QDv!aP2Z`;pkMlz-%`8q%Or%Pvv?0+()FNM^}tViW2G3A1{PgrU?r#( zO3zZ`(9&&sk%y6mp4n0MF5-II^qcM6)OFPv$_*R+Aa^jhW&0xh>iH$w*i5?f?cF4W z?j^zp5;=r;fT<(jrInveYef9h?uB)JiS%?e9!$x(U*q`*P=gvNSqY2sIS2xvi-fw7 z-L#0qYY2@K0JUL>1z^hqn~7=Wu0>b@NiHRVl4xP9@+NK_xG|i;guT7< zYvPGw?_w!RIQogq6!1lDKLedTgF`-1BJCM zg-`=$qi?{h9;xy2-eH~sUVR6*CPL9G1KrX5gb`Qi_lSxE`GO)RTFzXavDunTu4FS0 zlL{!1KkyPQKia~C3$TyKFwa0V02w+D=ulu|GwZqi5wRYT0#6E!!}YvyJ*?->)b$kJ z?Uo*cZ$SqEN1n8^$24ot-aenI1y+ zz0%Z755G6S`}xo1e1yCGo}8R?`#B7^-@RAh!xPc-hvV5Xly9X%xfM%osoCnay=rbH zTCCQK4Zmx2vElwU{1hX1GOA~M7@D9R09QpbM2k0hz2X0{_vZ0Y6&MXDb?!ZNs_v7ZWH| z+Gm|4?E?>hv=+ZOai2r3QsVN8Q6#b5zY%gbb^UpA>!DcEdKH9M@d=MaV}-C3KKTQm z>VuFE_IVn|wP2KZ{FGi;?vhQelTB`*L>OxRR$QVJEe2(v_%NknnMY^fc~^d1tc8SO z*iJPYVpM>LVPNhlkJ9gQ+*sz2vCi!JDD=@oPQ1m~V7~QRn!VhU;c#?|)PAW2MrBYk z7Uh4@;TmvmlW=F&wBti~{8uT|ib?RoE{SB9TK2E24>cu?{F@EpZ&wOv$|SA!Tyk(K zxA?@^A6xr%9e@_2bX|<=v^?$jo{{uNf zIrAxNMf5o2!^Zb`Irg}jCVL$An-TrpKmA1eyUObCU|M2-FHSpA|IuaET>ta?o4O1w zfEL&tcYYm@S@f5NUf&i&reMiHEP?$g*_Vsb)5 z#;jscoE?j%ksbTjy=@6B(UR##SKYWmRD7$e;uq53lHTu3{h4-)&Jlv=37gh|TbJF2 z8{jqMXIgURyGitXPiOLaS>803*UmR(oeqbfSkVp4ukIDcP%gf6cZkyr$jSA21iJu5Na_nKZMQk{#sn zLsj8FM&)7|n(kl;o@(~;@4@dt{OOruI^XRpVVTE#6)ueuU)yfd4YM8{R$m0GFB95n 
z;^7l>4sW}n&-UxJc`rv{zJmAQQYhc^hp2s?^0#)|{6x%G#@J@wa;Us);z{4-lx?9_=k%BcF0zI`Vm>&(QYeV zZ57uYu@zs^DvM8PrIjjrNAo(`$no(*K6^Z=SvmbYb-bc=J;vL#ZsdbmPBCi;UTFSw zty~=xE@YGW*$i?jFPLOayxg=EFRXrJC1&9@aVuzi=P5u9&Z8@m|6-U zbq@OB;p}l<2bWHtOse+>>j(k1koEwn9%3^8`!1M;gXtUH@KCwayYjv$zEBb7GVdy*Tm+qR5jYcgBzcxv?J7u} zf*bLSuHXYX{=pA;e1@qv6!J4Fz#Ng z*sVS?6UU5<~H&mNQ~|6%Y8kGEW$Li?$Z4)e1W77~3=nC69U!AzQCK3^IG z=vT>yJewc#*XESFVWx1e%$}S9uZGt@XXF1cgwLJ_wKGFW$-R~+_?8j%Es8^ zu2dfPaC+S2uU?e#KYHGc*7M$>1t8Ssrzn!930thpOW_!#(K1MA<+!k&ofX=_4lHXl=Xlh*k_c7aN)Bg>E zC>yEOz>Zi}LvNgh4fRv*0PcJ)nkrg+jMhR^!K|E95o_%usa9*Rpw{+EQ%A!d91VLX zE29CX5m6!3q?)nOAP-L6d zq0nbF3ib609Uj;a89?tFbMlq!{O+yjiwYea4c^BNDy!UORf5%z;OZA~^>tkRZ6{iN zCi~y!cRr~9t4`~$qx!YlE|AOjhOaGtYf)VbtZSv5^F=t59Ropyx7f3o6rR=^%QJhx zLruBUhprbq`&-4=@T|k?d{16?dF;AN;JRP7(SGKLh#T(#6~6i^IPbhI$^IimzYyp< ze&a9FDm7g6n1kapAf#x8D3>M)|fXt;+t$xAvNgx_JpGExCSreN1|ILYJ%0tb6|fi%$%k4J!b_u&&|IA6*9#Tto#bz zEePoh?5;Q)_p5{L)6FB4-UWL-o4>h&6LY?*{2DGV;mb#J-cV(xx#!|G>L|EHbk+d# zvMVT5Je+Q>=p$~Ad;FZI?MgbdKZ%E*@Z!B@zIo8Ih2}-t%wK%QJs^&2_ z##}R&^u5GM9&#RrCqDV9BN_N1;BX`XKP2A%w7A_miPB@v9#3hG*mc-r|9Ol`O1Q*n zesK|%GkA!WW4fN*C|~HZBMCfa22% z@qCf}>)6iZ9pwoZEQpIifR0V6km(GtaKS=Xk@15s`zt-W6;jf{nOSo)3*c&MQOd7wX1lJ^QH7_|YdW@aMf-WW1-%gLo;X+3ZE%@p_k@_X!-f<<*|rsONpI zJ+;>B+)VqM`D%`Mw8wFuz>m#gn}!`k`Sxq~q@S$UY)sL5rw@RPPG4TjtetRhLe2K0 zdZ;hv8R{vY=qaD;c|VG`ulN}}*%%NUpKe^A4i3qg#DICW2t-%>QWp;NrJ(f_VFDthU0?4_N<4i*lpFSYvVVqZ-?^wTAxzg$ImO*3}-sh|9z zbnGX8$~u2agD>yLitfRSjZ16Wm!fM0Ao}%C^G>rqz&-3Y!_>b~&-?NGVMaH2>6UU!A%;i*E8HSg$ zV25{|Vg5Ovtj@DY?x;A$Z?r|8wD3>c%xWT3?(l)F!!%N=@7MWseAcb`WQqs)jQ>(@ z?E)y)skk!i69(MJ2K)?jSUznfpUroj;xoPqr}1+Jf{X9lfwL+&8IE?J?5H@?{K<?rj=wKumsYz3wRV=XDA?84ke&q!1lz?rZ(`_%cxTZc+d^ge7VLV>= zHeS;oys|}axSWf458wAx2Jd^B6yqBO$@+H*>wcif zxWfr~Mv3{BbUek?q2c%X)@)A;zha#iDCG6kv!Cv7__yK4uaNYEXcE{r*H|C!(qRj0 zH(Jr|m*I?|DL1v@X3NhFg3$(>HVT7v>t(`U`xRm=FUx|)?=$_PC=!9d-e1_uepFbs zW9<*$4)2%)Fb4g32kE>(e{hh0xA=n(XZm1MsWAG0hs~U1|)`M5OA+<0MK(X`rT}HsAomk(ckpb2|!uiW(Vi4_n zKTgQPbAYHpcbjH}G zBpX)S@?U*<+bSkQ?JZN=Xd`H>IkHk||A0s-`CGB}!^_5M`*v|j)XtjVO!-iHq z4=1mF87`;s1bw|TTz$lf9~fT?;#hoN?vH#>daU&o+R!4&`br~efcZW`bN?mRS0-8O zE3i^3Mttq>G90|}Vp}OF*5%$ia@nBFn(v7J7ue1*;RMhUt@alg6kf)8ngLtL8F)rY z6ZMmW_Z`wR4q(lclSvC{+i(f(ryIBm_rXT6>+mdi zX|K|)?d2vwvdDH!7^Aop{I`*pPy7Z&k4*oyG<&Qg{*>YM>r24o!C?`wZFEPm%Wr&3 zJD#3?0j7;DA=jdth~34Hokl{&1!2th6>j5W@woC=7*df z_wp$@(7}(P5Dh7JSIit*YUHO2S>HqlBS=YMMWI(|=F9Erg6)Wa9q9*$F!H!ucdU3P zAS5)342>*+eRUuPISCmr#jRCfacBB<)`DXu^j&DYyP<#soCSP+pmOMDT0s~=o7#%KxDSG`$e>=MjeaBkMW{3Ate zMaB_eD_qy2yhCDwD)@d5d{6dadI7p~8N~Pu(H?*HkNTX$z*~C)AJ?42$iKVMC=|l@ zK>T<|$EHsaH?~gzg7nOMfE z)0;PBTVptiFREPgyld_Ip6Geg0rYoP8=<{(oV59P9$3x5cTp-WbJE@iqO?j2ER1$o z;dfiqZ3Jb9z~G91UyES_icVSjVyF}aI%uc_y5TYMLh-_%9>-N-QczUGWu$`qloEPf{9Ebkk{bDR070=H$#w*<@ zn|c2gI22*%4D0Y`x9X#4ERKdC_b3>O+G9h-_ci93MngE8&-WDlA;yY$Hwl699dAW2 zWt{QaVdU@r;TPm7&p4C%_)<}BjzGr;ls2Y|jEAn=5NQN|dB#+k)*~-rIP*)9MS5sd zj{)5s<_lL*$ntY@j7Rv{KM_9nf9OwtQ}Vy>PwyW4|NGPbi~jT(xBkEBPuJb@|3!Zq zz*HkDp;U#8&yo>qG$j;Dd@n55x|k8)6o%mX*x^Fy-V zAwCX9RBK!g)%r2X)DLj<`6+QNeEf`#SiimpX&u6TU5mG=l45*Cl@!i#7oFkOY7eqf z9=`n8ol)*a(S%g{^R3Pp=*!;ao6}BJ{0I2WX~!yhj4_&_j<;}NJOH-iNvx#jF>Obd zb#AtJlovt*T`t3;=&tPw^(Sx_TNkgW2z=hBIL5zpbAlP0S>xYL2$v)@; zeQ@7tzz?)F*|-%XN_L9T!f%bS)|<5X{{)Oyz`}#U>36|P%ul`0Jr$2e{v@Bvz+;Z% z3+(YB;m{>VJg!4NHPMucX)|ts_e2|@IBTM*6{poi(<()iN1KmBbf;#$6I!y4??W$-=$vn>Bx;xc+7rZuAA~-m#M} zuM@Jma3QdCfgYOe1h;3q`P*UAmKQp818&~RCt_E@#1M`)2^Kmduf+25>mpwY$6#MC z;TQzOd)CZxw{>8kJJj!hP%5x}?O^+qhQV74cIonZFwq2XmJ-W@YEn3CH`o?LnGU`g z&uQ455Yq+}sTI~<)0zs#{cJ0iJsM-#pRqZ(6sPb*vLI7C<_}HZkQwPu`6!>9&)Wul 
zRnd>uW9ca3?NnG9K^cUv>x9VVOXrh6mMNY}UwI~Xr8{OAV(;A@9Zj%_et0LWk$rH6 z=;ujS&>UDecNbhkPTxyI+B#6Pt;=y z*;P+@PdBb}h6*|w#yr~&Z^=OYLqkI|kLu50MAcXq>^lcKM!eo@_M`xRZE@du#)LWI zxtb~BomRP*^vGu402HlfZ#4J8s!FiP5ve=Ie)l!Is`TKQ2}fwtk=SW|$e9RDboqmB z40Qy{GV_JrNIWGTgbn{pf-PAfC)YYR+d_o}4p!U>`pu_h15oxW%ragR>7ky79Xl_( zxRdj;eZlK;=#*>!5tUKNhyxj@kK8 z9&(Rw$Cm2b?<$P>2;nwuf<$|Y}P8yeEP_qi4MR4*(GjjVS1RjH@ z5_T?lvL3wC=?h)wG9E*ELoIHiMto^&I^z8_?^^ma?ZGZ+F3#$%nBfaOgzB7waN7s7 za?BpZq|x2vd)ltgk1?w?k?!_=HRbv5!KWFM1p*qp=eyj3SJV&nn(yThToSh?UyDJH zUw}S6hnE$FN>PW#fDXZQv&ZO|4&kw#LWfQ@mz_-y`n9uZ`KF(kPtdtpqVO3itc)Pe zEA$-AJ`sBp8V?3QjAXC@_6^%1nA1QwKG|#a{MpDo+GNqhh=LR%gF;FNpsnR%ODO z311we<8cODE2gp3aBKbR+Q}1Zdg;Q4c(&6eF3@w&Uu4l39w%6O_Z>Nqp2=(j}&4wuW{4-)n;V>*tdOHor$#!UC6B>z_(#+1pZ*J_?&1$#a zf;*uAsx!x1ztBq(an_oVWusU3aYK(odaVhJ>M$ z8-#*VSIHO50-HH>;c=ziu4uXo{@C*p0dW}p^(ei-NHX`QBxxy)U zOk5KA!My4+qSMyu;?-!ZGat?6O%>1|7PG4#q|bnJzB#*044N$SK)lsh=nx+^_o1+H zSiKQ)Le%xmj9b4L>Zld&!w%dKc@^B@53O2)AIr)iE#dUr<%!L>`Ga8id^zR?t#ZRu zUGxfF9=y=}r*Ln?Oe-`hTX6Lo%Q8jz7+79v)DMj~{R@~YZiaz%|H-&CCmaTjv*DqJ zKcRHCDmwaDyw5kUle|CJGSo3fObwA~;}(nQxhL~_->q;~Qjns7pkp9fJj%&T|5P~d_3sp=N1NCh}b;I=3XKXNk;Ssfl zkW5=cEd=536zoU+`*Y~+*RT!XGX+}O1&>Nx)_RFr=5_t*20R)TyP*R2YFvL3MbFz$ zfx`e_D#i`Ug-d9@as#Mb#)Ll)b!;rGbHMiIg?NhD#=<`X$YK$Vw?gz(=n#j(6YBA&?C=|xn2+GRHDl&uL(!x`?OseQG6KbP|0F@u zvI8Z!V@|zSd@aZAPbL2?e?odS(cv*qf;r^7#mM-#j}CR#FMTOsvW;j zPXhE)@U?OTAuX$dk33VGcND<%p;fMY=XV{>IqXX8vjfXbc5V))4q# z3!A7Tw!mO|NDqlQR4All*(&%jCl!D&%kU2HY7a$vmL{K%hlo|ed|mqA3+sq2u?lU$ z4-`|>-+Y9&+US-At>kOgii**W1F-FIeupqz%NlXh=7*i;)rbrTWXAdfq&IV^m{fu^ z#oCnlzx{A9eJ~$4WZH-ytLk_}>Y=MLW5e;Rd0;8=-6A~rfK&1@@GFiitv?kDHOt0P zE^4^HUt0$a{$NIz=Z4xxqxm&H){N_)#oCLXg%ORDS}?QMxhzxRHN~7HgU!aXF(x3H zdBc_Bh1qeK{fyA4shfs6%)X-_{;SnyfEOIfcuABA(pB)$kTFesp!3;~Q6|2UN@Uz2 zzZw~1#n*<6o3NmIw4mJ(yX-ptY7RqS#HgKr1lIjdejh$Y^n}VkKTAjyY{h)$(5f69 z{Re%37BCj(;PAQw>_v8qUel^U){PPH;@BSw*(^1!V(d>zmt)_HYc#Lr(zqWa$9+!R zxGzYvkNaF2_g?$BUz7GT#{C<(eFVx3Qqj22O&Iq%H14%PI;`iWw_EGS6blVh^Y8+j z4lS$oi~hzyxFC9&op~#QbV4l%fLcfn%_Pt`WKTWk7e)*Ta3}-W1Velaf!|XE*%xRO zXmIGlFUHt5z!wD{?J z%t(p_c|=*zt51ph!l&Bym0#7`PDiN}o97P^v~7n8zlU}cj*;vzx19pLw~&0B0KIGQ zT@_1KSa~6ucs9~-N%&B!eZCf2=C$}eLHxYLk92sUSK&9E_~9i%9E{;VwJY<#g*bKs zn_x@eXo^<53?|bfE=10`u|a3WQx?Mb*_b}S!NVf_pE!P*D!)0v?}wqtFY`G3u)Q@( zdy@@YVfpdXen831R$6(|9;6EKxO)Z2?#azyq+R8VkWzR9xT-U6)y89MZF!Krf3z(>G?) 
zdN`m`sZ@`f0h=D;p(9hQArXYL+CbHf8C^V~4`TXk+P^1k@}UesMcaxYewlp`DV4fH_s}d z%d)M@Mn7zS{i{ARw(Iqi@O?e{X=3NKzsvg+L7cVFngtk3@r50!h6Ja*)UR{*cXODh z495DxC*z)n>$vUGVSyR=ylnAAf1<}i#sSY;sQm`&86;!~e~x2&9d6wz4KiN@F29)p z7yXSY?G2T_I4JfU$z#F{B2n3?P}wXw--fw%p|O$DA3npZVo+Gff2rA3252(lh_d7} z2}1SxPHV?ZSn+Vf)?nE``HgRjj2*n4SH;k{L&p9LSrD)evctT72p&&ht&=qPvJd$o z>RXhuUe9ZhYbsN+a^#wdyP_AKZV)y9I0*0chai(OrF9HMLib?e8J=hRtvToniiWr; z{gB>UKlEs0-+He()knTHW#fWheZ5Apv0U61Cp6Iw5J0~%+u2a)=BXfI^8*JF7dSCT z{N;*LFZMv~i)ffzl6sMR>M^plHn)cCdZk8gq`UR)?E%GHfR55k8h zdMn1-17A0sZ(sGHz*nZ3)`PUWtvjXYo$QCe*8(Uqsi<-{CP&65_l8;7SFc%LW?%Ao zbr9_Tlgj-8N{>OBAI>B0yEX1y4mCy0pkN;PUfFNlXB+~3$~Qg)efoq#>iV2rw3om| zK1HJ_1dr=KPAlpQk35Hi;>J#&u}dn|;5en)i?N}-X2u|1St}6TbTO$Tq5tSMm^Py4m&J| zcoyxfGSLO+Sp^QlVn>qrwOjzTZG)Mf#kuLQAX;%<+~Q~Rh-0mD@VaLPoIk3rZ&1>{ zvgXO1B3JEEqZ5t=%tqv8Da8HM9=@58wSV>4dzcsCX%X}T+nAkhTL-lk=Xab(_rkio z2?}1t*+xfs-SZG^X(t1d*uBG@af_Y>a?$fwS&8a@ika>t1#Rz2>faaDzm)W!%D|U< z4j+{mO_^|zP&qKZQjV(>8)F$zcAKwsCB}d4N&)55D4=YQC(kER?W=^D~?7fv(vr; zyV5)&Uf+>rysHbAHJV{4Tf=t9zSlXPT>`mxgChB{_UtvQqETE7TGEATUDSmZg$DYK z@1SG+!E>Owp91YMqE&uRsTs2!!w&jtHst7`z8lcpA0p%w#ZA73Ld5Th&t~|9)5K4X z?eZQT)1{5~)wXT?j(+ve#Mh^RuqT4A-;|LQU;pn61z$fNPWb`IF77vQ7&>O#jp5&A z$o%dbZMhVLt2|gSKDJ43JK=O7p)h!l6Aw&)jY=S+qh-d$`kL=jMMpy#`Bpt8Vs^BP z&O)4Se8QO}k3V<`?CugvoiUpH*#~{b0pFS0UpnpBLFwNd8=t-` z0iS+b0zUmDwM1CS6$$wCJM8%M{&sviB=sH-pMECd)7`$%)j5b!&-+Nc&o53RjQZd- z!l<7N81?rTKv5>S*Q93Ei~_uDzRfy1jjaVxQN@Fa5ZGKX9{2gI&VVh(3W z4#zy->HqHY%3j4>g8%V(WjFsA?f4nzm2EsD;k>eo9skdtS5}pf+$P>UaNLYPnAL*A zxiE#6Y>R!tfzTe&y~6e~=mw819+!!j$;eO{7ovXdi8e#e9JWaeD@+D#)DKlU5xoXk zA-=p1D?g#&9vH{KCVw!Ox2q0z2P{u8zg4Ss;?8`R=)rqib=*I@+^OdssyMHD^dR)8 zbcFc_&Fz;^vNUX9MmgLSIQiU+zI#V+1Th`OJfaR4@7_HjD_3~wxUnc6yPF-=vs>X7 zII{}WC2wQJHDEq&1Gl2reBrp6kBhgv_NMq&`>s~o8&NI~Vn*3zR*vNKxTG@nLkfp| zAK8TOU2y6#EMU6&VhsD~HR1=dzYv@pkO=FD%oEq+*$8;tfqBiyHU!SxAk3jTM*N8U zXx@B)?*JLUMeA&ak3zeu;AMVQbt)8+IR<^#Uln%Y-hz7F^w>EDXqE}#1E0ThN?Y>$0^Cybi?()D|RMwwg3cTiUmRM%j~XDHQFyb|IY|55MKI+RZzmMmbe>KQWXKE1 zNw;5EXMgYxF&REAW;GyRhgSOnXiiPL2D54*AD)_3(=O!u(t+{~m@i%<#$-V9xgEhm zuTkKIrKN@f%GqrwKmm<3p8<#H&BX1iU_g!5gLk=n#@8TU;4ahvf0jpRKu_raYk~d1 zAz`x(883wC4jq!a)!c)VjDpO_KVo!v--jM>58GtcrBf2F8*!or2>dOgMjOlku8aYc zGg#opL&vPRo_II_Z~<)_W3La+GJnVecBe2i{H&Ev9|GeZnho{k$ok-E`*7C3lT)nw zVPJeZyax*1;d5hgM0`&DAP%(2Y}yw^1$`zQ#rI%ITeQQxaX4vhO&EPia~u?og~HIN z>%S2`%+5^wJ`TQrjakj>%()k{|L>ri$A(GGA%iwZ2SEN70zx(S(Z%8gMoVauL;fl+IEfaPSHpDozU-*NcLLQq?CGlA) z-r2)+qe!TjkSxzp0Y3aVqvv{`(IjrI+^_)BwEcJ}Xdj%V+$_4_4t3U0O%vRv$P+Nn z3jI1qZ@40dhS#^@Vz3+yr%*R`?1&;R8$vDS+KZ?MPrZmbEwv+Rdp`&(j~E18(}503 zk|%FIdj1Gux(Xlb`)}41LoTngccf|k< zVQe!4gW=IN3;x6tkKMW>^U6@qJ#c`TV0$+{BS_Us(G8DYQ;;b>&9{MPB*%`=hwX{J ztZ)I>t9OwO&ugU#=q>PU{g^6GMxb(3;(P{hUb-=ju>OUPieBJQlX$u@WHTMl zl0paX&fiPpsCc{h49#|#*Y~GvMNQOab{|F$^CC&jW9pBG1a(&qJ=HNe^i+o&!FjVe zoPQ4#zhkW{T@bv&qbe zi-yoePU|8+Z7p92n>N7K*0Nhls8xVF$Pc=3A_cLrj*6_3VVmGz{+nd|0l+XPE?HfR zPgcKuH;h+$enZ4xj`$lc{&L}2COJO`pXX<>Bc=3UxA;?~Fs5vKlKJ8({zi?dQyiYJEuN5-!;f(<{-Uy6 z@sNh&GtTcA@4}lPq)6BA*IXxUaCUb{Ho2?2B{JBr4O)u~_G`B^3dIqZ0V_YqB<{J4 zXrx!GyW@32xh?}P9oL{bd0;n;J&>C(`nHTm2cEAE-}8f_{wu+Xz=kA3bINUhW%4h- z<725#v6H1L`y`$!Wx68K4mA3ro$rF9xL>WtZf`a@oI8)SlpnrTAKsr=7QXD2AAAmP@FEK@8TR`bjQ7HE?JaEgKfdghzZ{0I z%K}N@ha;YX3jvD1R+$yz|D>5RJ5sLEum-P|OlVnf%m*m4#}(i!xpnmxx#Y-IKrm&d z1g&@YH^4;;!&8$HN)yqjCsTPjSGf8Okq@KNbr~*$23d~QFYM1A@>nd)3s-u6z3hdH zuyCpvSt;ms8XBYdC(x45fL_HC+4!Ocbe#n%>+D>45VVILW&Z--g^wKkNSuS5To~?h z5q!sE8ZPL<^551!2B~V^>UnB2`$CJB-24)vYsIsZ+q_l@^M!K^T$Sqq_~bf)yq?){ zyBJs4`48Ek$zG#ZL&8W1t&KBj9>x_()CH)|~fKP)P>-Y37x>%H>LLPV9q`&L_6$mYWL 
z^imXq+yIxafsBs*^`{U7Y`d=5KlFJYC%=N6O)-36L~HY_f49#rh;~71{d^8_I%tt9Vn;1mxzuxp*8ovshB?S$N*82nvLNuen zjRoxDEITe227=!pltW()EUD;05~DN z1O^g_%Gn7fy5=phfd4eG`)REo@K&;KBQ58gy!JNQFq{fY%Wgv6Cmm1-!Bi;c7kF#l zBGN;kfzg9!#J!DaG>$D>0%Pv4A-7CSL1KmS)9>Oa!XLZMCv@fPEmEDlkLgSo+NfQ! zm1Tw?Uh4URQ-om}Y#I%H`fY0lr+gx?yffFh`S|Y*$u3O$!mLxHmJSbJqJ-BvNxnaRDxcS_Uoi1%r}qYvYGL6ucBuZK7GrXeU;+}#U$_SkP1SV zz>@gN=*;}xcz9xppy7SEA|uFy@v;H-~RQ!2fG!`0oa7j4vw`mPClsSv5)E!IztE-mA^ zb~dh?38*=-svV#2*8tq2_xEe&%uAWvtvC$gNTVupWsXUsyb*9ZwzZ{0#5=3{xj1HB znXCRBaNfTOMpM%rf-Pb}^K>&`LyTMyRZUBEt?#C3ar$wx`?*>vWpsma9b-l!sVeaB zB|O^dLy_?*gB#rb-RfYT&*AAnBYFs+@8{d~OTvG98yYS_fk>Va!P7WMS-?`lH{)y9 zO+stC+8}7^TMJ%G>Ki>PSZZz%>qOTxab6PJJetAn;rSKtqRZnXL)#DEzu?_>hV8Uz zYvs$6q^oq`Fr6m>#_CEkCFRh{Azaxsos`Y*V;*$%1iGANqY(#GlvI>Op5e);$f(!3?yN= zI_Yiug*~=a%9YrqvMh4A&%+X=O?zS+BNu;cs}o1rkzG!0z1fd)^1Yyd)Vb+BC#A*z zjd7oMji~wPIU)s@)y^j`fx~s0bdpuh=FdQDCU;Ba9gKqjijf1HyF-q@gntpOo4?3> zLi1a2Y29PMLHR=daPu2+Pf2n{1LZYXkdi7YwHJaHW8Ss^XYxvMi6{-UesOPcPKd~F z*-+%UvR}`tIVUW6HqAc8&n^;D5lbmZKeJP*I%nRFs z^eKYa`~sSauix7>of{Y^j7Og)LF>~Oxd6rjb6Y9t=nml8+BkrHzJ}9U;GwAC#aJFn=%4l4q3Q{ z7j?w~wM)Z7y?aBmZ-L+<+!n!fVg6p*k6ydthYdM&*rN^`f$Y)+t_tszCkDuz-oGz@ z0sofpd)2T=vsW>u24M6`5AhQ1aU7Y5b!S_~g0UtDv^X}8IjX?9r8w1yo~eN3jX?$t zBJ4wn4P&2zgFIM4c;dW^#T>JKE7dWFCH$FSmj{&0<=*&YGE!>UegZW@TcU8KWBfM~ z>{@K6;lLMvKPa+J)E#-oUbC+0pvs1oDrxIYwZsoj2p0sQrAd>nLE{3Lr5bHp{x{*O z{9jPUtk9#~G0f0~eTL-<@`%LIOzK>m#u()|7ccyWk?_9_!$~e@tuZd}stwM_1}3nU zFMn^4vl+QST!@bArH4?B&COi8_41Xvki}i7!fj=l*J*KXY{{*|yO1GYsHW9^20tEv zzj-~3<#FqExCUT+Y}i48dQEq2c~Rpz^|RJ;)^(w)ntqzpbGqBzyo#15A%kS$fa8h|{M&2K574oNTk!GnV(X?K=%gIYL z;n1Jqx_3&H@~Ui|09iu>aoC{n@dr?_=!(SJP!E97+n1qyyZFvHRfJ`4D%LX`krf4S z-?_^O!))5%nRE1X`3!^F!>(TmYfdAamAh+@Geu=G<(3>)muTNcnVMg27^a)QO7z~) zFIv(LE9LtW%o9rG*B6*ea5>A6YX!PhSu`sP^9_1qr(L4o)|_*~*G@N^al-7>Dod8*D;YLPJk-9J^`P3z#HFa%fL&NWaqRUrPM7 zlrQ9nZs&NjUbBz{XTRk=RK8YvkNV8ue274B)lss3Zn45rGJNksMz01(jRNe#D^*xc( zbaJhjOZ^zUF^8}%thu)E#Z7PAhS`aqj!2k?^nIxH%e9PUp)I{U*V}kwx8gp2=#zjA z>AWJCU?q6Zq{`Vjb+WnQj_PotjRx$i(g{k5@{V+iu8#{R-0%qsCu{d#AA_F$=+soW zVdG=fyK#$^sL)xLy;Nx;Gs*e_3u};Q!5ec26fK^Cdms4N#y`^bPlZ3a+% zU|AnMyxzRVp*1=oTi69PL&|#i&({~>BY{~A%2e$g?wU?Z$Qxas-0}kVqT@<(`W?~= z^AOV+zLW=J9v2o;cKG)v?5Os4zkgqA6dN9jtlw4K|I=bbaAjoRD; z$vAHLyWYkP|0y%7ZB}U2OuT4pCtkk2K8SYVu(cMad8ut>I!$v=(zGpYGAyI=z5*r< znI||^NZ2BhO2;~qsUxma1eg{0>N-TT?Es9k@b_1==x!5Sskpw`W{qD>lrp4}N@KIl zH0E$UQ|at}-1ea0Lwnlu@7*HKsE#_im1AX#2ZOB+&9qe&2Ltw~Cm9XeMTzhWhd#+N zG_K)^S>QBX2%5a91r(;{lIp2QD!hM+i%obE!Vsw1BNHX~-utb|N;7j3DewcUH2`WeoEkr>Q?*)_O>|KBm3 zAdWxOOdQO}x#yQn=g<5bh+BM!`;BenJ)bPft8#A4*<*oIs(LcO^me%Ef!|&`=#{|} zzX~l>vm!uCk5IFVyo_riN(N6#aWXc(nwX@N7+`#*9*flQaA4mXPtv&AbA@m%J3i%c zeSE}O)-q|)FzHY}*XOLdrrCvG+53{g^YwE7-e%_ba90d5DgrOPL6GUI=JiQEfs|Mn zSx}->P$Fpl>HZ_irvW0dAF7_b2IBPjg|7E#>QfkbNn^@+aqSER2Y3;oaN$1|Hd(e?%?ItiS@e$TYZ11F{w*H8Y;-T~O^kP>qWKZJL=CG^fxya2-zITOQrR+>vi2w=z3}No@TyKpw(Qd0ab@ zA{gLMCvBrrXqqn#mpHN^auSx#Uc;?})S4T}0!pfI_=`;Id*$NTg2#fznQ?^J#{2(56W0fZsZ*^E_(Tr>^E8jC!F%Vo4x?QURbz4B$)xHrjHzWy@>(D~pdR ze^Vs9n5ePBYg@CZX;rAU39Gidk|)N=nvAz)8Coo4lSTO$|N@^k(|$@ zFHpu}T;W~Lr;Lg#rS2qi8@xVZW`$k{TLAu=SHWm%E4S=)78+a8`WINb>TvlDWh`4+ znn$#&VofxgQ=}AE1630ulLD|% zee0nBaAp838b^TZ`r_Pw^0=`ERrtmI_fj(Nl3IMQ@ooGeo5o@{r~zh5b|FT1>X)>3 zmHX)Wp?NCsV=R8pIJ}#&B}4L!sX$tUI&O_rNj6 zZpg<9(aITZXraV)5uWp*k7IRxXfoT!PKoc_4DI9FF#MgkSo}H2lMvLHa_(b**r@Ud zW5TjAaMVhuJMn;Zg7#a{?B2>+DlHm5IBF~Fnky}|bYON*f#<@vBQ1|!Dv$%B!kB$NYQM@OVubeX~1)$n1djz|cp6P^!wjPqy8+5PG%b!3^+RXLv1X|Pg#i%O9K zU%WiudZ);JU&%Cl+uAa&9bbI4G>HXYyhO?*AVt!OF=1##IdK^t?h*phG3CiV1Ev|T z^xs(zZ?xh`$hHxtkrTatfupO=Ct2beV=T`J6EN 
z>i0Bm{2^Fsu?(3=k}zz~8Ej(Kt%-@?0)e|v@@zD{@zSdwZDrf1*K^N(x@td`RBX%2 zC@Lo+G8$T9R^O{xn|770Ea9vD5LABuEDs14_@h>rs_?_Y#xWhz*X|~|DRkNpxkqk- zWiM?^VZNvu`sTPz2zAK-^?E~hU_l!^;ZFPtmu4eB`u~762{nJm2Ye{q(C;<<;W6DWZRf0H^%0*VTYFrI`N;tJ0&<)Nc3ihg~`w~ea8>91Rg{oBZ z3DPVnnz{0N`W8$|K!?;48EI##`L&eQVdY>!oW6M)=>fYC5uT`$2k8p6V%b94Bw$Fc zFPV7~Am4*`A#Lox6qp)hfkA2YPnm#P=O0Sw!;h5nP|_K6YiR~WpZ+BaKsC$VR(*~L zDyu31j(eK;!Lnk|Jx$zbbqaKep9IOWXwW@n9H{{q>TH~JFPIq)^ocI%$#T6D*NG^p zdyNkC$XLF%RgxfQk_5*xZP0v%1TIioZSxnm0UlrDCEPIDwFoNoku*($1wB9{svo;z zXThl@X}0!-S5Rl?QMvQ;{S=D#k~L`{S_M0PmrTsQlPgD*Xyv~s6!Xy@?T}AI34R2Y zL;%(E0O^V(#e&t6F4I#7b03sbZ)qLEi3-7vn38Z~Tk?4Y>54dDr%f3jFjsVgQhVpY z{|!Z3lUw%)u-CBS-f&7A<*=JbhW!ELo{bw0|F!?uNRH9(4FmBz; zcg1|2{7B+;9sk}8tSwag<>Y0vtoS*>V1BJQa0D$4viDxy!~lql>ys4OLGzqdkL70g zySjGUCkeh}R1ic$9EfRgxK!;5tW0hFk`XmSze7zks$LJ!Ms*xSt@g7YaMxqB4Zm$B z_mtj}Y*C^o2XV^e=A!btQ!V(Kw_^HMU&aC0>GCyb^XYPlS^^&m67W!d?$SE=R8{35 zMhmKXVa&l+^y!`aqC9A|hZK+YLiRcCN-nb1fe&Tek&&nhj!&rmOz_~zPYTR>qg;82 zK1RH}Yw+M91_VlV$-3|miv@N?w&5aS^d{LzQSwCt??lJpL3^^2CE$s}vkZ;E2WiFW z!b>ge?Aq?`2loza5d#fJzF1^k7(zQOTckj#WNQHE*5sX}?1S%(>dHgC6-4*OTk_7Y z`RbkiF7I{=59I-BDYHX{G-kjoDH8$Tb4 z=f2B#64qeny6qsgAswL4WmU(h!zonR4PF9)?~OobxS%dc>R?D&c3U~1^Jf%L;A4>2_XTu-@^+J#0y^*t(M&CQ=%x;SA`S)FE3PwCaoK_T{=;lK3&_ zQOcU7&AV_}#Y8)CI}($-fE$SX!=-~%*%&?3l+j{Eu;;uUgK4dvpHb@}HPB0a4wR6` z@DrR}#TmcIy!&L2jas(RJanJ=js;4!v5-U$`c6x$^A>G3J2=RIcSW5d^!70T@gwkcCTXp zoGCdXjDX6L{dl82r)P?u6V;KEljf8fW0!kWLz6Vjv7x68s+yrEO8PToqe9skk~+5N zl{tGQCso>QV_p#ME7Huk-$$ndXmTvZ`0ktI+zZhlcki&**ihV<;mL5^SATDmg)m}G z*$zwo(4TX=wqx(L>^J$FeqqS5m!TjsC5l0V(VQ>b6~nHG@alXp+7nIzqg zdzQdlh{@bHdw{v5XPEj$n96N)5CL~jM3l_zNGbY7Sbw+PMn34+;MuzV@lut1abFy3nPj8Ws$@k z%pk`yZ*x?#;%4KH{f@Vbi7N*<%dv_sC*ytpy4gbN{nDhd+Og;)A#Y9z+g5<(ZtSwU zkx#-B9PjH~ubxgPktB)=_y|t>x?$7_2*+-|oZ&yi8t@IcAY6OzNfOTB;ZczQ*K;zEwfAx-=hQ0m9CdNu62M1yc8op(%v-g+)g*dy=R*6%<$H;q71+gXF{EemXvgkf z)rTc0V8H45?OWNQc&REEU0UzYZFO*$vL1Yw3LB7y1SUQkSQCWqYydDR3M5Qtl9LT;4RSlO10gT@d7zs^Vggy>qt7Y+Rm z?-ziP{w1oWY%~vp(Q7~4eM#9Pt1SxBq#0^UEh{FCg}$f(QiLIig;${wA6%@mvh0$)a5hlmoGS|(R{G}k9VaKY0m= z;3{15=D9>UdmAj)FB-q+8!6#R+d8S2CVeio&0%<)Xms z-XzkD+k)Jck+uVNzV&KNlC>W+i2k&NDhFDe(Kc(3EW$7s2d)!8{<61|Y|cu$jeS)L zwgXnuem8MX5Zol<9CF}^MMc@gyN!ABFz3pB-z_d=Qg<6;N$R!^Y+A3AK4i*x1DBZV zon@|Lo{+dsEUKoTU^UR(4P=Phqe;WGHJ2>^DOel3L!)KrCgp}7*iy64O}}L_mCtA)BMlkN`W9{fw`7GM2bve4*V`=$v(;XoRoYbKN zzeyB9S6TzHZLw)F^Q0s;^S&_$hQ))j)#$nHuZ~)N-0Q*Ha|FBE=8OukqaNgBYMI37 zy?MSCb?(=48*}}m0RL^X7EmLM>isG4Ki~^`sE4ZOw%hfPTl$Gk_k)*`t{k463a{@?xn>pDHzbug??rXgITqA#{(*ZcCbEYlSu;IUBM$x>X zs;yB>EA;AQCE1y3@nOU77L)sjhF#CeD+V%tn+hx-WM&bGit>)fU+$rn`1vgUXPbR_ z$W44jM*GOVIs$F;)|+h&6wB+(ty^>f4fyS;&&h}U+qC`^*006tgYv=AEe2z&G0y0| zP|1$_$bjqT4XfsfZDo{E{wN0J-RL}U^E;xHOMi!E3r|`{K}^T~7+{X6=8XXa4v8$X zCrq0)P(=oN|AknNIIxHcu&oDvpIcTrwSeib7yk?w32AyaL~sV%qj|GVBHEjP&*g)7 zHrt`mi{3VC!1@OqT8=oh&}v@IUlzsMbYNEVBwQ9<(5Svn+%i#8n^#gh*C4Lgr@9i8 zgM4e21@WmikbM<+cJ7q)U94=wj{7_DGUVlt0D!IAV7-f4q_yxG>W6lTkgVYqA zGk;pnhnzDYI~AO<(iP$VTt}k*ZF#>4SA@^B3R%65qKezYvrZl}B_UiTox8b?y5a3R z(Kh9H68hI$M~<*FZbYv<3wR*Zp{^JY!|`QbuK*VVD~}a>e{`Q+9P7{_qF-dqm=hOH z@wDeedkM+L_dldN`jRu@v*-e@{M%eZi_d1dA45{sr-tFhU)cZG*_XV2p?_Fle{t{k zIJKJbVt28C*#se7S(V+n{Rihu&z;C{VtoKJqRo1%>tCJFz-!pSS0fz~Sfx)Ys+K16 z4~X}ejyfAP_^b0@S0?huH2muhYy64IA9-16=p1f*hMA#2c|vj)ToFt@Yxd?; z`eu?1YmD{S=2qg*)_zqrckQ$>PD>^vu?c(rV)w(7Ovx<0bMLi1e|h!l@GOO)qg1*gxXbEypvQP!Or(wa2@DfQ6KTSN*hSsFcaKBLMhZc0`1TgIi*(xpnpQ}imNq$Q9$ zZz+^<<;sEAGSUeWs+5AyN&h=8PbjAt7Z3(2rBEUd0?kj3xzInIgLCjxl{WPb*-^8(GmK-QwuVt)`-#NIR@9llE7 zwL@X5vQ-%7wFBV1n{p3L!cF<3~*L)yM`o97(j|5IZ;03&}&rAY2-su 
ziSDE1em)D%W!SHVhSwd+k5$KQEclX6MHvH2%14L`GjiWX`%IDWpoZ6ptu&2gOANL- z_+p}9ON5}4FlC@1ON6l9`AXRjJ=l>_GoI!ctg2nnhr|z+|`6AEJ{>(peb?4GIiS%PyL?)X@%RiH{VC z=sC72VXfH$oS?EDY8;!~fK6VA=#9_vjlMup{inu#|u z9Wk>y%mF&9wNw`x@i#AN{st*?nLZ_j7NsZ2ySt@<*wR#Qv!`+I2k|~Fv5eFeheuV} zhozXwyY5h(x@5B86pptMLv0+paddILNqey@3Hjp;@pPre0E>pXen+^i)|9E0+;rhl zAL+yj4BA)i9=xH+l(sZU6>L|wdstHzX7JzYSv#DpL$+Lz111M_I`|?q!8~?X2 zR%?lV)ZPb!2amxfLS^@3bjnUtfzc?hm~)Rn%BDe`D15-zqVW=uvipVVgU5G@r!U5p z2S^p4cd38Um9z(sV1m@{^%A6_ZVHNTg#1B{{j=qFis0DYD#X%t8aTNK`~$B4gm?G# z2S~-wfyVq>u#>E{e-j^@5;yaZDsTfkRfzfRR3|{*Or+p0(e&Ha{AbHR%BaXbJtEnc zp#OxYs6Hhk*_PWj;dJWwuVnQ|_JiIlWcynAPSP4|`_Uq@9*uBU!RHlXc1!ZDjDO}5 z3A;u4PJ(~dm;ajd|C$|f+vPd$l|Z-duRq9Rh(+9#P?Nt#)rb>;?0QJ)Eb)9P-%Vi; z+Wxb#{IjwBvvKhKYi7w&i>2R60O>a3_NoZfQ9gmBxtu$W|5&{Y{=wPG4qh~`YAWq6 zH`%?In`1(vE}y09iM(#eaBv>)uupW_2sg1pf>khDgGwR0?sEW7AA@ik>St_^$=;d! zDb|$x#>jB^>lIJ}LVG?769%33bV1CiH0x8MVFl8uAkA4Hp*!(2{E)ONy%Y@PGa5{> zK}sIu;%KrQ*J;K(@z(0EPTK4Py}k@8utn<6bigZJ6qv68WtMcpk}i|dU$QaEcuEU) zVfk?XgqrIpX_Y>T*w!ZQQvF3`;kBXv#Ml21HufK^F1hrN67j>2e^e4UBme1&n1J-+ z7DGJ3|KP8%|4~*N{o|ic`p@4P_n$7v*Z99MQ~&XLfc)oYAuZ$YgF26QMf#t=+&_Qn zce?*>`R+|D(c6`4`$5z~tM)vPmbb zC?zxHljaUm`!}I^b#xm`AvOY-QQU)Cwc>Gbz>~vDgz}r7KVU zks@Aa6fjp@jypQJS36gHu0np+2l(Rn3z*_xnQ1FN2cawrs?E@5O&%T1v0_T{UXgsm z<6qIFZ7atab0YW57<-Hv@5?!d*M3AwXvwjnO_Is2&6H=|qb=yD=(T6H-Kl*miM+}= zm!uubsU;COCS0b zD6G{JT7l4hybz$qf(KAFpEoJ9{u+aG;q`rLYy%p1yw^-`C?x5V^C+Ib~=$P~zzwJmo<@S5JU z1bZ|U;N=T72XSjkrN{ALrpH4_ zE!~eIm-wB4%M?h^Wl(V4j~-gJ^g)@ct=rm@BB;Plai!} z0t|8R^J@34^~>H@a5q#+@1^TUV_d}xncopdq^cY(qw+&->ssGLbq{86{czH}d)l0N zm`SOWq{$t;udW4e&7!AIrns^fbZKZQb+RE~2FfaCna*|CQ@G;3T$!_B)TP_ z9(HF!uDm%Ll}$ZYb&{r*La?f&UrdEt+}Q~CU$HuReDkn^)ump3{rk{)o^>Dpb?lDi z+y^fv1$-2LzL{c7X-0~T_ZDj_b!BOXrZfJ@yC*H9Gls2g``WrmHT)8=Jr)-H5m00d z@DDU6Rt0f2^9wIDn|MuKR6fq_{A^dX*7bls;9_2q9BiEtVjJZ67&Jz&ick)TBv`rC zq?Y((Mzp;y@&Z!}VD>;TuGAo-MdJg#Qq?A)WjT@|XmB%4Ozt{pfOMV#{qmnc@`pD$ z&@}CNgH?jzhVox5UqaqJ-hH&N2UZslS=R*-GwA}G0#lxQUZp#pXpRX)z(=B=tD!jy z^LK@8xpa8Gby0a&gr7wF07D)Lhy!mh5i&6jrl($MrKisN7p$rZfLTh$ z3xOfBO*`5kw`jc74U%Cp3uCgXBKBKMhSFa0INjNl)t;x4l&Gj9R$7c0vn`T zVkTSh2$eG?BIp_BzvLO1Y@>H=;i|BCNVpgG72e_M7nwWyE2vFkT67vZ>=$^=0veaU zQ7(NPxka#3w#WG*c{V1<1cq_$Z`55lKcZ9 zVcaMfVl}JRz>VENBtAdbRTdUb{@i?Bww`S^*ept_Evy`+cRX>JUe!Ih|HzMN0{{xg z5FxK(d#i2(gZT0kI)A81h)3ZKNLTu0g{8nn(;RSFL36+tKosraI-F44H^BhQQ^4-? 
zBVOH05H%TK;S!e}%#;5fzggxLK&j_TYH|&-{4m>jQGukvLKuD?VS6JtY6Ue5gcn>` zE3*=_0%M@Q*Kd_6)sCVjznBqVR|4?>iyj$8>jD)m}-P(Bd*?{kKrmA00pcDGySBD>F(=XxscvIDgJKKboQN<&@Y3P63+A*mpXFR!3vi| z9lJ9fMSa6^n^r6PtO`-C!Q5*YCw{V}R8_WUmlWn)Q*4-+;<%n;VY(pbAOhu+}u zRksowId^$$R_g8YwDY)+zY#>VxQvE;y^6PrRYL8(GOMtoDx5v1{an|&)B^jMg6qFT zJ-@v|Ys^s3j-Cj@w^##fFX3GDC$2;gQM?l;S`X6Qp2HEp#4zmyjd;W3h9x@VYin(8 zp@WQkQ}6`PZbRD+Xk8>!C?3#63&Y`&pJ5Kav{N?^%3i$c`5a{Ht(0cmeVa`Rw(4=teIvbK?-4i7&})a~C`(JT7owG!>Qo27{}*H25B<8iSdAm?Z9q`^Sxc$1a3e z?7eb_*|&~UBr5Gk3_W*RpSy!;l;GSM#dlh)RYH^(jlLbHnqR zOmxHPB2Ge`t?-dKd1hRNsc`=0Z?1c$70>6p6JZ;;YZGDuHflu7WzybI;RdV{J8aX` zDIK^}EQJFU^|*e}e082XN+)okbngTnz((s#bNJ-xzKa-ed*32Ir}oY~);}axpjcEo zmaxdi;FCA61A{{85QEFpzQ5h;LzxrgN|Gq`3NP{Ijpw>07W}ReTRPFA{X+0)u@EY~ z^#jXaXKSdYp2~b(-UKsvF1JRQN>0JnA%-RKl32Zel^ya{)=qQ|56kKxXZb zUh`ADgyUuIHn0UVzd5$hWPI+Lm?`M5r&FP`jh>cLT428Ix7sRtDNe9n1Vboz!#W=b z`ertqkeRENjujQQzC$n9>UZymQ!t~Slq*lX*yembkw-{y_4im%jKW|_qU`$fvp%lJE7a!?XKCsU@4LsGuO-{Yx{VB^ zY81!f=Zdw}{yhW@-k$IA>(7LhA^rXM=Y_T$0@*;-O8GB{Ke+?#bmOH3Cdc9Ycu0_V z$-=QXBQkxn6DAcbn+Yv5XTN%Y1q+Q=24;L*8o^qdhH^lhEjW!)-f^6%O+1CiASgw` zH-jq*&mHSfCq3}9vHguhx~z~JyO~RFv*P#Rny@)7vJ{|Q>NG9^aoOW#nbU1u+7ED1 zlv6)#V|%Yct9vnehAtsWn}{2;7><;LdG<8)XApHnd0@g!*}W07MX?5%)evDF23-o@ zVxW0mQ2H`Mx2v(1_)m0Ukg-2LyCX9KpsKKgV9D!2gFwV5{piR| z7nB4KE4{f!Wycg++=8{Au25Rk;|fx%8tTz&pyrKII&TF%mr|W zM|qM~_MguGxD8K3Lg%3A32XBLXhmQuf@;6es8_aM9F9}1)9C6}{8uJgawuqndEA=l z8PQZzen4cZfn^xm_ei5?;~&A$z*uD-6HTKyFi7TSQ_WjU4cN4v3!!aa378X>!*?T0 zLlDMy7@k;CS7qg z)w*G0$@%s1lQ*%)@hug)M$ZAnlOlMr6p$1&Ak3NDw=SCf+{DK|9 z5gQO$dxbLDfeFeZlD2^a8v6!s37Kr-+f6G?BhY}ak}0Y)rgME$q1+PMZ^d0tO7BJ` zWQ@lBJ-c!nGP=`gzkO-rc%3H}Y6DSa;@ibo&9caEK1exv{+>E8Nhu|wn}8lF_GW6z zGI)TXOT0z)V^hug_e>%?wQKAyhx!vJoh`6ze-OadTQNtBg3R-$@N?(VCB|6Al~vZQ zijy(=spZc^j%I4;UvRN92+y^HwK$msiWz%Kosor*Zzv$13!9f4DwJ(1q(v{0 zGEyosI`tuZps6XzSe#c2omcBGbTK@Az)9!iVQpATeO)$_{kS_$p0|<4K`C*KL@RN} zrhb%!pY1A?BFx6amT+oSnqIMjLbmf)K%ImLy|)1)4+( zexQ_*nb;t4de);od+_=$jYY8?ah*D0e3R`*w+DG&0AeRK5HgQowDr9>pka0$iEC?W zkzMAmjDSmYi(JVAfcfoQ%NV;Cia!|d+f(T{@^{juSl@`ws?=araN1Or)BFP_9n-f9 zXP*OzztnlYi-kP&mPjP49CNThimrP5Y!g77-V1)v=9k)~in12-qVwNRIWN+$p1;cgz8NNF24)|l)b3=jb9!+oX`l4? zWov$mdnBOggK#cI<7Zl%W)5CM{GA&x!;1A}dg-3o8aXsPxG}$3uyKp=K*<}=J^w9^ zwALcU7v_m|iu`7;%+L;%g=)FR?LJBQ1u~;<0XvUU-~d;uhI~r8#KA?1XS01v0i z@yu~Vx!6|_>o;L=9*=J^%GE#dW`ALP^H6@_&3-ShXpXs--1+RKgdppH=iT86+XR8NeY2#mW0n6hFo=!b;sqgqn{+2n3G0)pUfM%PD!mF|^ z6E$?ka9#Oo#JyR^a`0o2OKY1ak(Kh~%Y>088YtB}7prhjW%D*xtcP$bl~H6KE7gk489HT4(6eiuhz4Z!m2Kuopkfy#4+ zZsf;iWGrP8&7LiqH@M^tukvZnn>Vlo>7Taum=c={;(6cG@7XofuqRk0b8G_BM{41r zyhW_))2@hl@Q+VaUp5xZOKl*-J=wKa{3Fi^di9HE8n(i}IfD!G47^X3I)#xAo^*X9 zz6K+I)U84cmfa%D!>0axO;v#7$eT&J;aHwO&YYi_pKoACz`?Y*J7Sh{G}$|L`4~K; zZKT2-&~O6;Om*s&dL`EHRp`fkOsZv_=IUau8w7t6{(KbW$V-VYqhBu=Eh!E*if}cz z2~lREgeG$?kEe28Gi#&Ldu7x#_6$g`n<%=se?j5<0s4a>bguHET~m6uoYMT)i0W963tVL+k=%l3>01X#*mFD8wke4*;^?;?my#?Gc!hsDH> z?Za)FryqM&Te^d`1Y{c-))Vkj)MB!lUrR6cF!Oi#D3Qty${bdgG zco%eJuCv{&+7JozQ}yjBD#O+eqYig>9<@Ry#4ZR<_kHy>HcY-_No6t&MLI{ zSMnI178CFqU^9p3Wr%_v$VeH=r_M<^i03r)9MBZG`?)`;0QJ#xu29H#%=*y-BWmAe zH(5;lnQg6{{!f!@D^{y_zc6h4xxE@=|EVv1+7&+U8zEzT13?A+ThmfX3YizFDWUn? 
z9$uc8wLFy)IfXG&(s~yVv$AMndH4j@{IK?6^tE&RwEyU7we?U;i_Fg)i1^jft9Jmu zne!Ha^P;h3i0(qUTqwJfBHb1Qt710fn{Tq&b|Q@{#@La%y>foFT{1TJh-o9Qfs&3x z;e|;$_M)6BbSA}+$3qVGhshnIb1~bf_#z_&+JZ-BRvIGb>EsOanncNW#H1-!6?}gu z_X7DfF3sv(PE~sTV}w~4;Y`kOX>Tw~+6l6*f)?;tElhjB8%9gTmo4Xh6X9}RkvyEAa}Hb`dPEfVhy z0Ks0mL5#-u6>@3jeUJ?d=LGeOqSeezSbp2?4O!Qf!;{?)QFJbtpx2o zMmI~9b@+epmSXYW$Ff?9tXd)waa(C|cb1kk-WzC^aEA2mAbn}(cB^1N<2H4jePvoA z*Urw*c0U^4D(jvtMbND5KU2k?EmMH>s1;kD!`82sA5}&Rj)TrOpQ#_5z?7a>zf?Xo z3mb1_{~MQvyYJqGU60cZ(p&eIaj%v^OPU`nFROcRmOs3+>n_ES@1litjCU%?U~JLk zKhp|*@}KU;eeOupCl!jooZ~Y5($S_-LCp7`#1~F{4zT!io9x9seVAn?GQdfz89v^{ z@oN5_1t!*F+FX& z+O}=mwr#t6+O}=mwr$(Cef$0HpLLR*ot0CSR6RSVQhQ}yD5do`!afL`^OP@DTSo1I z_xHSwtYHin16uog6%5z{PkY;IQ@$j}-`-n3%*h-xIdX*9L$kpEJs}mwxFQo#ifY2r zCoGIWkVb36g_Z58$xRP-V!Q9Q6Z5!=Y0=99LdCJ2qYoDoWm2r|qnE%Kz{yX}8TKxk(Fp;NM_O77T3C_Bq{#0K9!ugjI z{E<4a*nu_6uN1^k%~!cHf+{8n)SpQL2J9+qQ5wZjnzc#yz&e4QZEc zFz+6b(T8G^+^>(%N_}{{EW~ISp=L@QS0nu}ndM9fYN_dQB!|agrEH0SH%{B|D(+Lw zIo8KhV`O%wb=hni?KvqgV{8?#r%b^H^=!+-x4l9i`KxWx-uD} zY4mz)E^Uu0_%(+eD!R4#NDQO~6X%V+H?+%E57~&PiI3=p;~dpXHzEGZ(XZB0mSm8C zEzRijMJX>eEVwrWFR9C;+6`Om_7Kdu$(=`YUyof94PJ2?eXr238t21K0P7XLfDUzF zm+blUdQ!uwg2BUKL21S@v(&l5K=JTZ3+sD&P{lskq;)#^bRP4(w^o?me zsW|NEXm~&t;U+0pShk6agwvxqCTi z(w2JT%bH(Cr zQ#YWr|6rjWZQtugzH4w zz}Mf#yb!Xmwm%hKGlp&4C6z846R3>h$Pu=bbXv=?koV|%pE8m) z9glHtcp5)^n?aZjfP)U$1Xaq;3o0);xf`o(6!!|RFYLmTEC_(h2|Bj>gJ?*^7!6d$ zrfZptu&0!`I|x7^^_7OqQrMhv;EP7?37l9cl<*>HFYMYkEWJMAFTH*w>ZRCgdwI~_ zAjz`NKdB>=;p29BSdrON$4+e8COgwdxJiy7$shLl_T*x_e$)w~2+Tc3ovj^*p<0B@ z0!7rol(;baQ|o!qTldF*1)04Tnm91> z==@2C?_!XIp;iJIyk-!7y=Q=#zINRp^mCt=i!J-3#aJDz{i)9MR(U~d#k z)KVPxcN44rGkq-v;yOU$g5&sf^z~PSQRSc7G-=CnF}zC9BbA^3SCBlh_^oJs{Ev5R zV9m64O(J9v_sP&-l^e_u9s8wsK;IFcd9r|Z)6?k-LSN@(bl_|M4P<36q!3XAi77MG zX$Zk%{TCM-PH^3X6oLwy8auU%1}Q%WhG8kUnE;%wD54(JS|zRF(&k}y!xiG!G?woQ6L(wDo1i_ z$W$rUoIa{whL+L~44f}pfv<)H?TIk3v=HfukjNT&Q!iQ3%_3|IOBw{&>k7Yp3+L&I z?{jxlDVNB3%c!&?07R~uRD>T|l4|p*qKki6kQA@$y>;B?TiejbTJp(kZi9fC>%I6g z;qymGg~>K|;moak_tH;+JJf(UwYEb^72L|Cs0#t=+9)VJw6z-TiHV^jxv%S}X7jM~pOo-t6Gz zZ(4&HD???Q=81ve_bUFB!GMP^!LNqFp?U zP{`E@d;Rx7=Y8V{W(CZA;KVqe<-v>et?fOr6UKC2w zgP?dVxJwN^WZw0PZUy<}A1+)0T zXIx>~Z5>mP4XNK;DqRsn$O+`C@?9A)HTix_>c4cQBpS9cUKv9SZA?{gPmn9Sf3H#S zax~P#Hz?Jk=O_a^Ug!gPCClU#wXRWnhi^e!nlJMY=UR;|$Hy)O8J0Idr&zW=CH=Nv ze+2041G4FPv4J00eRE>I)-WN+C}g7LNcAlji}WL4@rEQL@DA-}SKK5yixKrTW#}Sa zV{7Z`q?Ee^;$0cp7dyfm>XxblAWb89mHzwFE>ysIr z$&=+*D37_e&<#bw>?fH4z5r}9IH3i(uqBK$)}*anRpCQyPt41~@H+uLZ?-(H{d5>? z2AzOWFb$6|2|vp@I&;E|Q1pN z3%(f;1FKdgN4Obs!jTwP9BPr(l(6@L=cG@*sExu%0lN&{3zjI#{)xJ#JaTUou)#%8 z_p1}6;#pOwt?VkUfAjt1`^pMhQZo0h164U|4GNerUPA>S@0*@usCy@o`K<6qDmp?! 
z;_`O@b^bGn!?)&W#a#*HHx?>Yh|cKYp;bTm4M38L7g-|XBwn@RJpwt_Wo!E=!Bpit zGm$LRT$4=ufy|}WnF4@DaCn*wtT3%uREW@V;~L*BzkT7H-{pdR?n;_bO~J54v{g#_iwA1x;zdQ)5*+$#_~F>OS~Ty z@9KT~iLl_Cz=P%O{6f-4$N$TkZD6kZ-ZArw1nLcp{*%q`%Vjt_<;y;gM`@;e2PmgI z(70H@5~f2dCFaYZ$@*a(^po{_`oU!!FX_tx`mI(0R$EyO^Mh-K+lGGL+i@e*tQvO> zk|X5_OFHNC4W@P*t$VOAa^o4tTZhQ6;K?&s&WARCtcV*M5_;}24v za*DRL{htGIR_z24h3}IEFf37}toY$@#f9Maw602&bDD+L@X>5H6b>Yw1e{8X zSOVChw?uV-P*G3LHo46OY=g04X<_WbDiOFa;N+DVBD%NFhwDavEW?y4g5LA>34Bv0^BSNyOQm#YOl!N9(w7 ze8K-j5(S1YvT+In;i(QxRtKy5TZ-4~<@9UoHx5t#c*UA#sc~gSJrP=rx%-Q~y8kPE z4c=me=;I+HNd0a;tS9S@0j+02uL#Zu+FzW{9Rg^cw>}Ye&xihKMOU$8)T;g44jG41 zl@RQL&jHi;75yW~Q;s*&WN&Ps1z3*ZY37gXvTUs9kM1`%?@z|S$a}IJ@0CICrbi!@ z&x4K6@3Ui`DjYALYYjz-mceH#Y{=KgbG?lCRWxG7FF`)jNHlVb5)*yx~;~V zG>xTZmR= zh1S)UHvQ<-P-C;r7c=Uc#av?CKK;s{S)Pk88h@R{pL7}VexR{#AM7-+XYB$DlSUQn<*&dcRc-&ST%|A46 z;H3AOH2ukslriEEh{L`CsSj>dgTftI!X1Njx8y<{s1dFt=(pOm`-iZDqL7iIB$%-% zLxN*NqB5AV0O5jS1AI4H`9}ECWcX5c;y6@-Ne28(eZe$T#L`~`O6alfn6YL0_>sM$ ziNXck!Uba~q5`z?f$*n_1V1Q7t$jm9)=j&ebLBFr>4h<@pioYhx#u*=h=1KEOszxZ z6t4L*H3kC^^goAr7We5_<>_bJ?ED9iE)fy`l_HG<%nVWFZg`tmX3rY)8=1f zd}i2*A;}*O=o?uqKm=Re2@r6G85^p2hNffgZ)|<+j{p)>x)=OE=jq^*E#bS=z*h*i zy{+-PsF6sFv*#rrU{C-|TRw@Ps-KMLBp4^9Y7~)a@gD(jv2x?IbrIjbbm(YJ2 z74%CI_kS?t}em<3J-_-=NyWhWjLhyZi}jehafO`#tF@Yda>X zf|u2!vZtaAC!%22eW|j--s7-oE38uXL83+003Ur))^v-LMX3};!<_^Cwfq``P+Elj zwLH3q#>lf$%iE;08YElu$P*sPH$|QjOUHLGWRL%{Er5&Lq{!0}eHD_&T4E@G)ACYL zC?46st&p7UB#82(IbsG)O6AH4CRn*s0x7f*i1hf6@`Am44j{`8+_GfsZ#SRQMI^x@ z5D5uk0Gms-SHKt+uvu>>Pvz|1q21_OnE`~u&BWiR)r_jyo=OcXD^NDN6_fo)RVl0I z!>tmT(;CE$6NPwWa+yb#SpobyrXuAlwy6Ei-s+m1QNPcnPj)TBW)v}gO5yDqmNCzL zDQU)bV&zfZaZ~TLrTu=8(Wq@Uz#Ur*Skp}oKR*CSrTLHNdJP)!>DYCFQ9g$}7f|!C z=ZfIg0Hj`>e%*%M82<%xF_Cgrx$)WnkEfuD_LPD-^$w;VJR~1caz2PNI8=cV9?}$u z7i(Z7;d;o|QtuLbzubAE_Mr>s%VDLR&E~goIBNA4ZVP)o3hG<0j{>pB_;5|hTs}-8 z4m`V*%^uX+A#^VwVHqQvIRp*6m-t|Fl0PA_y{?N2sWigY=(@9^iv$*8lSi*vK~=YT zK9+QJE4V>mpEv)FlK{$ODDuYXc~gZYlRo}?UnI=~(aO+&jmVX3+vM*Je!$P^r0Lw+ zVdaQ8v*4z{;WoYg0cPWGUPgqAK8KgTd}h`HU@&vb>$l1r=37swExm`f@dNDP!+YQO}Xc^TT4!5^Lxhy=k#VrMeM#vh`M|XKUd98WGIEI)%{!fjZv`LVw_GQ zax_0gTUBO(&Hxy8%GtfQCbB|b5sf9BXKNlgSE{1z&*!`$MbA${@)Vw90E}YlEokyt zl@8y8T}fG@9771(VBJzhHVkACYl~N|PDioC)))LmOOaXX*+d=qt{q}HqhifxDcFBL z;NO*m8=fCoH-10D-hN57LP6E~gS3d3mLrggi8`-Q^m;$f+~50AOX)T;L*C6yq20Ny zHS#t?U2B}Ym6K5^MIU$V8z=JrbolQ5X>}!N**$T2XLx#3_kKY2;n#-(qTx@+LwbKY z#sfCI6~pVr?NCdHF#TFCZ>R^1v&I9TGz8O_T_d1+mt1V{J33|E=;DRKZOK(H?g;_h;X)VeiH}Z2-?lu-m%86~2whf4F6X{C!D&a%E1_ z$uw|m>PEN5^fQ(_+!Cl6tgY7@vJ*Wm189--N1VPe4d=c9qndze?zlpfr~L%!kN+mr zj6D$BMI=GO0u%3VKiL!OcYm5f(k2HSn2gdDk+tFWjW56kD@GRIzsCYp2sxsW z5;cf|8)$wbB)K&E?7Hmt-=Z3R>iLQg5{#pLmCFeY8hmp#r7r8&5lDRd1rGf*L#Ml= ztlP=6C*c)Oe|Og%^*mI%-cV+BI3Yo*&NOkS8t-$5rtzmjnb(2P-LyMn%@ZM@v1fN}WWki35G4D(0+)@>egjc^j;bIAF zg!%zFkHO$&8pyJrJDqA)N*vc=>r2sL!IU?+B+JK)Pv6*zADYk#uQh^&BH(Bh>_el z400G@$R&Y;=-o^(&iIDS>52uCAgn%$a7YHc7j*;bKPJkRaf#dQZg;A-(>z7X2#l=SQhP(U27IYz( z5Dw=byUvfobi9)@UU(OiGy}kGv4%UxYt2C^23ra;R%PXyaA{>=th!Ki|FqHadeGB5b{oO??43VVBBhiRI&13P{=Xc z|16M%3wKfMoy=Qxyg<|4yuRl4-mf}wdcuEB+Yp_&*-ZEh7hPgUD15@w5Z8-#yB1M_ za@Bgk%cvFd+;LMU_+Hl{*R9=$myO?e@h57^P&nVhcHL8uh`Cq?CAViJ$atC7{_a3z z-0xKAm1}Q?I-mp1f;B0By%L*CU>}IXt`|kCO&F+14jjP2YvL!8NqReKUHq6!A!_&q zPVg+Q-3rKk41@FW&#DIFZsN_eB&0#)K64UzRXD@VAvBh==B4ugF3r)n zE58YNL#zysR0}_Awg^qm&$0zS0Z-P-vMt>QPe#qM70cGiePq%{eqkcfOJ2fzO)OoU z+u(4YOV)z$*;J?iLM`7?{~SBlUrg7~c|?&7ZUbs42upU*lf$sswk>q4YS0DNcWYx_ zZsmjMc8}}&b5#{Mw^8nKCb5GAjH;hkDP?%0#13HcEk&!NR#mRg=%{Q&lL1rIn}6qdj_G?k4aX1CViC z%UfKsj7sJPYOc+OR^fl&;e99ywSn4Nm>tJ*gq`kueVC(nAEf&$W=Q%T9*&>MrX}vX 
zF6~(EF1f@FciN`JjsA-Ir(({QFhYgUI7Apn7MqQLESpE?o~J0P7~+rD3EXK(UhA2@ zz!EUVx?o;iY~< z(fmMK8xSw)qZDbBX;gIhw0(K*>1Od1l&ei9Kd&5%@xS=@%^kZDkBppd!*%HdcP}y% z)`PdV36pkVeO>079gKt3GL-)X&_1_gt13ktcli^)v;X8Vp0D`JE7_h@S&Ld&;AZsaVFzKtV$MRQ|TU4f9jE261dC73a8k`6>&+b>`&``+5GSb z`DN-&S-!;LY^t_X2F@D-X#e&Qf)Wn-f=WqwgA~xgL^+Co?hCUAu?>N9wY!%r`Xy@l zTw<{J?~asSPq4x;EU*UR8r~@PM7vb&LRZoOJWRJYSJB|vjC!u3& zZ8%fSEM@tJG8%~;2u$4wdp5nhoe@C^oYgG(%B!lyKhDB8NHN|<4{#^lrh_vDXEOqY z&Sud)zoJ=)ieB}~})PGIWY}B^HFjUJZMSARaNKBKl zhZiGqgU5EQ;H={frt+9P71TGo0nAntZ2GlHwJr6<%FusgC){VU)(8MQ%s^L%{}>5$ zfxu4Al|0P7VOA_aG$(&U3-=TPbg65&f`KPMn_wVLUW36NK%34@RT3%=%sj1@pD(Q% z?#}HsJd&LkcbN~$%xG+xD6m@BNS6xUPW1mJ!m;Hv2)@USHc2b@8+MeME}RlCF~9{+ z@tonymzdjQSNT==*;v5-syT(O^`=oeUR!t9RU}=>uIF<-Ga%HbR}oqJl7{1HLKYeH=enqA z^~m)@SGIuF1yiXQQYmHqWD56y|N3ev4`RLUv>3uUe5?8MoAa`xBvtP(X}1S^55S6@ zvIDg`IzLZ4;ZN=(Hke@Qzh8UgmiZjO(;n2-$yNq0XaRb8#<73(VePa7Yw6a8ZvHU` zi1bOfNmxhgwmklgdZX&+tvo7jqGsO?eF3sAybUtO1I4Q+2e^?G^)qJzgD`T{|kz|^= za&yADG?j|j*V5#LU_49|29s!xq{Z4;R<_P74G@&G2T z;{dMX8-6o^d}^X!umn}6;{2kjZH{7YLk`Yz)hj3==fXjP18IKUrs}xrAxYVR8GzWF zsVJ|%Fwfp@_?L?LVAv@VMK=ZoriEGqd`&;k398DbkRq?KY z*L1}dV}9FwoQjK>Cg{H6KjyFE7Ko+CVxBt)6sH?QS3dauN=R4TY+wtVvf8znSs~^L zRs*c6W#9m(O0HYDjR;%N#J%?!@<=CNl6k;DhMZx%g@?>=8+3D%LS;0BH_VF*s* zH?|doI{6t8F%9VP?qhuSS)(5#BQ{xLMbS;CbrGGe^UG%B?|E6)DmvV@XEX!eW+43I zMWbv46R5xQctdC**p_scdRMO7ICvyysuBk5OD$dO?gWcz+}-_HjE@m^T(%Ft3lP_j z78jTcZvyh+-gahJKJ`#ucstau&tM=mcbzia>x-7kDa-mQ&if^#v>H;*-1=y*Mr@G= zeu4*5mqlndMs3^&jI$aK%lMA5bi;tUuXDV%GGqc?152oVKI0;0;7-x!fkbGflVRjpMW{j@!Q6A6yC7N zk(AvDvP7Jq|IFUDQl!%vqM=8#KM8AT=TyiUeN)hDs~&C?hpd-;Ch8>VG94sfsGZnc zy{TmqLZj^IqttNKzWl%MI{S9LRpok6mswhCMnjJcJCR$jXK7e{|4C*{2eLKnyj$uh zGsepW*K6o4JFqpk%b?6>kmHZ&+>MB+cn2rRDM$nLRH9>YhGP=8V+q#d!QoLiGs&q% zx+Pu7sWbdtg%+A#4AmOJ9W}KH14MOqSiq>aOV*1gYWP2B z#^DI$d#YjV1Muk|cg*Y$CN8!g>jt;z$~*1|%3tqySkd!%1k%4*(qW1gliGD~uCQ`L=DHp>@R4We`7 z)8oR1V@M=ewm(3|zpDhekqKw5`&&>>qbL|8Vg?ac)Q*kon^#b3-9V{*UNy%cGgDlW zwl*wuABMY)jFisjY~`TXeC!)FB{s~cXazukET8`??!{N-%{1{+u1xBT4GRUrFXlcS zO8Zn?s9;8CsT4`Nv|28AN|gjhl@*n|^k#@KPxdc$PJdnWW%`?~&Y@E6{nphregW@h zo1(kCGdMj5z;VVoBj-m%Z-N|ATSfe}mQ8~%ZYcsS&M65VI3O!Om|#mQ?-xmNSrUjg zEJs0iMt|q7c&E`2kCr$um^fLIVOQc#PhRXx9%nYV&z+WG<`<|w9e1=5FG6X;Fz$l^ zLAmd4V6_w|7(84+5Py0zFlz@rzgt_QsY@huHAMd!P0r@Gh8|HiB6w)5C}dZn3`3+S zL_A)o%ta%;h6#<2 zT}UHZ`-!k@GZ3tH5H1NOPj6&Hi|j(Roj?;vt?6g%Q=&SFv0(6e?Hm;~dl4-4npVZb zP_{#a$LrWP^S!*eA&=>RL#6&6(^14B3HS95Sgz5P8)8dLQTFmzb{4R$5`?N@GH_5s zrVK&@(y~u&(;Sh&Mh7>3fWu^6bq)Nz|oPIV5YBkftocc{C8SHduj@&WdguzrpH1?F)o*q zFN;#BmpZQDO%pX2T1F2xJcdINjsJNsf`~|To&O_9U52Y~oVe)dZL2_Hz-lDMq6nB6 znypPtt!$)SH;tQ%iBU^RoO1lRgHd_B7(3A+`nXOM!bxI;OID&c070!2hG^1X--`zP z)hVK}-VG<`w7C_x4+J9C(XsIUQo&@~&@M*dV_PSIU6BEk%RARC# zkb;xCyP?$(!`R@MovuV7$Uxp9ywyQbPU3n{;ygN{h4 zj*5_}KeJ(8ls}SV02@{*CLWrI^dP-jRs>G+pta2rE~}JMQWl^(l&emcGB{`0x;CKT z1e8?H?v%buvJ|K^aVOYJi`78+vna$wN@F%y+2>ND22qld(+Vh0hwMS?H4E!IZYBAZ zEb_|W`p|>TdkZNuh{q#|vw{$7Hgsz=dni?C$B~myD0RjiFPKhAQSUHChH20#yHS$T zrQ{o|DQS$8ep-E&V6FOCtQcYFP^9=ZHg1N2Bl?YDdH z-On12VfWtr^}e~mPSA*h){g*|=iDa=jSR0j^Qjz@d?eTvfxMVt_VJ^rzET{dsOH7G zGmb+?ci{cxXv!A8os%{4K~zmWSw^bJ{p6H2!Dz~pHTaMhFhNv?DZ@bH1}pkOY*aN8 zXiWin%#al<{YemC?D~4aO^B9(JJUe#5qgb&JdhV!K}>(YC~LZDE97oWe7X8+E`O{Q zDS95DJ}4`sL1sTSOWOW4P@iFaJ%oPs|Ew}P2opnWFiT3TrpYK|(59QHxkGE>o&s zATpzM=Ytht$P<2;jd)g?;NpTHjGG$f9W#00HMA(6l#_m5B_hL3m~*X~C;BiWQT{jK zMowS7`o z!Q8)9UU6?#(zd+X%m-+%IGs;guiZrf{)^T>hvtc0&r z(5^-_zn)Wzo8SAB^cp5ZV(HhoQSwAh7IR0I>q_QTiud=x6`d=9FJ}j)AD@3irL@ar^+Ct zWY_bE>3U1Q61Kjo-TW9GPDn;4rypeE(zAz~2w8ZZt;MXptG44dJ~ZoyT3^|0B<%0& z9e==KBI)4B|p3V0LlPt73BQ 
z&$;|ac-xS=ht6XG$M&o;iJ@zJfx_X~?220X65TL#(#{I-;du7pHk(mMQ;37-Fq;8R z9WYpESP%J?MW#!V?L>bdqq$ zazc!cFkLC{^cCa07o{Koaz)^C+^_@rNWf-^ow46&1z9NJ+?9b0iW%*iIBDzH?C+#|Pk_>fl3Ij@j*>1eDDSQbBYr5cT z|Hr@WSNfmw>_1wc6S#obK)N{e;QjFWujC07O&_@n%U%kv$EX8Q5;`?3g4EFQ){(vg zyJGM?r;GRi0(oSDR6ADl>5uGb*aA@pW*s;V9^3UBZBP8drQ}r3C=c7wRJ$wRT}GvgC_#aW?sYzaibxpTLm+JZLG0~Tg4kHwu_#pnKB(7cNdp4B8?lU7 z+@rrmq7MA115@x50-6f3mkt~y8{-C?b1$zk-gfc8M-%^cOC7UuZ)$RJ&x87Sq8o`E zq_jEreYhE7h$dg(Jk!uyeyO$dPe4KOEG1cr=Uk(AJB#;>o()%)XLB{gUb|jR`^|qJ zD*%~@IauSsaS+XXGkSS#T#Bs6Nr4w+WLb*;TR#ge2L5WI@=ZR}N#7VB(@fVjGW%{U zN=9CbIhtkz@g=F0x-$BdP2Lz^lmCG?6D@n1^VGHj?InMw(awtX^EeMRg0mm+{||~6 zh@R_UmWZG0n*M{N&b}rXw_m8@^f# zNDf%-HXprf?S`qBp-IvU6K%=t_S>nT(#FvtxFxTo9H;GeVz=|IRNXhAgKP!fKiNH0 zy0G5D4PHVGVy$QGEvazRbua#c{1gZg0+kPcAu6vKtRqBP;3)l1!IT{g3{?X`pUC=A z7%E19G=^v+^{^o?*n+6^Qs#j?R1d_UXw>;N+?V*C5dlFiIM#gDW6qt_=F$)%U?^&=HAQR0Ica<*Y z?<#s-9v|W)hZ^wdHv8n*tNWf3l&?$Zuj(h4HIVMv+D5s}q|Dp)Q_KCYW8dJNdhIzb zEs5GPmR-_QE@Y@3lkzvEP#V3o>LHdr4X~<^v=vK0?;(27dAF*%66i~$7}20ljQvgF zLk7Ny3>D&x*pQZ-{pv#dwLP3r7APNTy3|3J$RCYEeE=v#6v;}^rf;tRwJ9&we_7%i zt*~_O$!TT^_{uB7WSl6`ypTf~(}Wbze>$Wgd?Gn=qMQc$s7|l>U5!NM9>931gx&}G zCL@LAU#4+@9KV9?o*l#zubFsXW~5xPfl2d~mDQggcRz)vZA{O@%5cIQ;PhEkHVwwT zV4Ecms|F@z={PFvw^7@xh~0#mtv5{rG0Pvkl(mg48@oosrVREv6S7Tlb^|K|9TEJJDEK71a$x{Z6zXc#b=ontN|ru5(x^dZVF z!08t7j`M>okg*fysa%{s&gncHu-WjwKiPLu#P)dn^tD^x$UlV=0j-NGnF&q%{lKVx z8oKJrBSvN$GWje>G^;>-0)F$UQH#~{A;yy5mC#e!p zgjh~TI9SJextSChsO+Q%_BD%lGfXFFeZIcQ95z<1wnqv}BuhcrR-mD4|M7{&6eRVZ zc^1Q)RAp^9BX>3h8O|o{-o@V2bdi^yt%;#|c+-;llm8jIKJ=y9<`Pvs&Pi37yA{@S ze%oLZITO*_CB|BGJLSCSH-7Q7WCHiVDZ}K0%%YyV7n+7)eq{J0L6>LMXT}(b0!{XJ zRX7=r1-FSTclb(2LtMe}(nqSEFGlKGp6hy+8aI#Fv)f!j1nt#jSXTFHOST;B@TJVG zf!%K191!k?FS3^|Lo*^5xAj+uvv&q5_Z4U8)Yl3Ij?0kX_oH0iOA8eV`)z*83n0qF zFK^q&bQ0{DQb@!rg%oFl8Ck2lRfD^Qjj)X)3LB6Zp?#;8=V`mb__nsh+JWXewiMtB zkMz2Lfg@6r4~Bu5MB*oo**+Ue@IIT(9ZJCh1^5)}vlp0YqFQ=sNh5q)rwZNMhlxA! 
zusi(Mu90`_wNc5H(-+p{l}d^O0E)1Fqpaf6)fMi$8;G4f!PIEK4c~8?9p4{A52pWd zUX8>CsD>(Ag-`py9z~bDTZaFPkZ~uJ_rm&w9O;d%VvTn4&MD^!fR3<^*hdW}%lAjP zlXyhoaNT{d_3nbSy^rWfwc3K+rD2HLmod+~DP{`5#eiqM1LkdYIPKBHv2ctVhd1x{ z<>tlh@K*56p{cdfd@R!O&NI#+OHa24ER$~eEf;z>Nh8MXx=PP;^uy%(XQPoJ9X0;9 z=yTAs;Iyjs+_(tG=rvgJ=k3<%6$eAbF(1-E1{l1{mO{hxUCKRf&sW^)*PZG`btrXR z_m+2C`lnMm|C5TSz`KQL#OLF{72>K+)=zeVuQF37)yryN8>0H|yCve9ZkVprjjy#g zW0&`H`x>o==U;H0s?Euf&a;T7yGl3f=eT`$?B=LR&OGhRwVaYK>~`(n2j$kg1vp$f zLzTnB%(iB zNZ8V!dqDNy|BNFDyk0OmkGa@y7AS|)eWD;s&T)(VqhfoJj;nI69Y5Q@?JdPD2ZXE4 zy!!O!7o@CM^YRKgZ5W}m<*)nKz5$!S^K6%;iycu@r|c&4+M=BZ3Sfd~(9`JtaWQMz z4hb-r8QYj9%2nZN-|z02I;*nh0j5$q=D!~hZhA75SoI5>sQF3kk)!TRgHefaHqjs4 z?`AU%k{{@;l5|2M5z0^{lE9{@2s%)BLXX zu1zIAI>in(eXl=eH&3f|;qTspeBU0(suuhpBk55xGad(QGK1MrE znGG|qjhAASBopj4@V~0Y8${F}%J9^~`#@$ODk|2m$y3UDPxm%^@n&-4q-oT$U`N&< zo~(H_=To;;Z2KrUT8lRkoZ1?G?bV8$TvvwWu|dO)0od>Rg9}`+6zPY zCp4Cuzd@f5P!cn3A_;#+r!I6&G~O1YEPbOr1^sQ0>96Xk z3B_?n1GmFJ1z9Q2QliuHJ7tlxF-Kw3(t)_~6jal1QWRf-Lufqch5s0>v zvkq{Cx_LGV*Joy(co5BIQ!jMm6`+S0-WTNyVKMjO{}H%jx$AYG?`?SNe&xrFe8%g% zxHY;;e+iMaweM0Y?fs2)hBjs|k%&&Vdu>bXaBus=gm!;g@yl7r<_?3K)jM6Bqm8)de=O%L^!|}uxh>*guX=v z_wh=~^ZYq5P3D(_I&e>L3&;tERC^WWd=9UhvN=A+vBmCPE>%c3b*=?1BU{5iJHwM> z(O`cd1osG1(H7leAf+KZMW6XHQti9CD?*KPN$&r#YGvK?&t2Jbn%^*=T;{XvzlcZ& z&w8p+vn@5Foiju<#zT^?gn5og~CkR zDfS%th~)rHeM;z`w+!pVoxW$SabsVIuIyTRbBhH7*EFLob?(QL+vSL-|17?tsQ~dk z1j`o%x;{5X;s{`YXQwYP4Ekd?ErBcU2!bbm{ve01*=Qf}kgL~hTTW1|Vskw4c)?sB znyhpVHvi=svjACXYB)8U*VydskYm8DV@QJ4-fN(16w=y0|Lp}*UxxxG4shGxCz$8A zR(Q*|^eew|R8_2921NtUPlVCQoK-Q={EF`@nT_zTZ=_&Xe()Not4S;|Uf~9&lKxjh zQ7;2x*3U!Em0cBfAhu!L0W2RFX~OMPjqK~(S;mfcxiZfVFs;9bFeS}xUT6K%+3xua zG4)hAo+vTGS4A8rK)YJKz$N{TPpp7WSQLjYiyGfyCu#9kjJO5XCGSc>MEfbOO$PcmMit3^jkF#@_Mx<-r z{pB}SSz!pWQkx2LA$?w|z@l~1nXQ?p{b7T|C?AQ&g0S`XpWG)(#==HFd1Sq^n4fEo zt7?2ub`U#CJpJu;x6` z%bZ-Vmhx$&`r#CH`&su{06b2{916SAj1eH&EHk{Rqpokg?KBtOOscsU_6Iw&McC^1 zmC1&p;=~y8lq+b`zJt%RJ1rp;tMYNdr~SEZz`^ z&sTS>N5oX_iDD5QdHWp&xct%@tO$bPP43}(XytG#^F*8IG~MD)c!gdporRKq78Xzbvt<1l0;f3H4JY z*@g>;KOAYTIb+Hwk{eCQm!_<{xnyZeHo^L{wFkbK<_c!8ooM?UyKklpzo%}ePO&QO z;=nah*cOIfKtttQ1zMf$dSB_eLfr_3eNFI6j#F$a+i4&L1liVO;bz}S=YMVdX3 zz=$=b3bx~B-xlV2Um}uOxG9X=vQuJ+hC0m}`bE)*z8P7^iw24--vRQ*8IV(YmK+5g z*|`kkJ6Wihq8VNDsI45tf+YKYTljS@oy(BIL0V%sE?4x@GfW#^!CbCTZcR#uyo?gA zFIA+a9`7-(6+UE=FmL6MH@)H`-i@4?pN%=-kaHi8PJ)jov==^Hdn;s(bY{0Fa54{1 zrCQaEIQ}sm_@iQ$viFFBd8HeF51;&7i?4ai#Or6qv^TUZNe9>n;EZ1NZgQ~fCFn$d zB~u-&2S(Xc=*C=Md%=BSk<9ODq2muof?}_H9MeKs75!4XEBUt$hBKBfd9VHlP^o-j zTsR{}h0;V!j9X}gex}Wt6(#I(IBbGV55p_BxCUpt0y`Pu0%+Tut;)zKRFP`1*i!G; zNT~82@q}wsn-6K)v`EqTjIv0(tQS)^4LsCno~=XBTzm*Z0}ux{gc6!^46iU()>8=# z0Zyl`-`+Y?^g*rZQFa|Y>8`P7vSHG+5W9jtYKo%2H1-p1

zWoUbV8B4dck#bx1IC&Y!Fj|(kbuojmQTDgSa%-zy#eFrHYSlRZz6Hm^;3r*UJa#l8 zc;LlxV$}Hgg7#Cl8ids4>lfckvKKtGYz5=KE@DL|{T<{wqV})FUd3;O$FdoJv>~W0 zB$@63$1JwTvWZkH`0mN~7}#3x9MW*h3R$#+@7N{m83!fle6n20TEDDiz}*v|>FK6+ zRsnH%o`kL+Q9W1(iF{Fv{fh?S-jljo&x1el2V^S*(0)Ki+3oD&I@8BJZs+SCX?<#G z6n*@JOF5WGNaK4@b#P|%{z@J1sPXvAw!k5$))>)>9^{hZp**{^u$vNx6LzWxW4JfD zme`t3i}kGPBM%ufXtw`D8%vouQTI99RI{7wp)^n|^z|nt$`5IX#FEeB$uUdfb5#%a zxYA=h?AQNn_IaqezLsupA-DvYKzZ13`N|du`YlT?2G7}!iEJxLEB8yB~3DZfpF^ zb`(=zr@Ydn(|s+qZgPdm@8F7W5|>eN<-gg6FRh5l@+RSKG})a zT8whvM_BMeeTO}x(MqCC^qPmZ6qBJ@>~8B;+5x2xES7N={)nnjRNh}{5R0(EU60!R z5X*MmqpR@=bK+;MicrL%{u%0|{=zMDWYge+IC5D+c-EoZN4`g;;>4J{9$P5RykVa3 zO`{+@RKU?#z_rF!qQ*?F=?K`8U!o6%{^!Et%pJj8iiW;7JDk{e5?> z?u{iTThaFHafA8gq*G=9UYpmN=Ow`oD;!CrKo%QYWbVP+@^?~GV8~Z@2}GhJl2uSu+3BS5@i*qWc^@}{_JQa!egC&@=a2j2VKY_U@GZzQR z4{z!P{)@?-_0IT^O_^w6`UO8zL8FO>d41=L`1$@H`gsfy%T=WL&yPcGt^0}DYCNBO z6@QmXDsMW!cA?wtHbo58iyIIZKquXzRM ze^C2Hrr0Uqcyx(v0xg8n%z^PpS<}~AUi|+PGOm{FXq?)YW zZ9wMs`z${3Ij@9`Au{^ihkB4s_3rF%N$c${*!oMB!2`d33caXvaOEclY z<)`B`TlM;Z@}=^hM>b@(>Wu@%pWm|BrK){BZOaac`Qu68UIgjNw*Hu>8PAUTGI z5ZKPKa7*7yWu2nR(!-W`!FaH5!a3C2>gM&N@K715+<_Y4OT|nU+@0>fd7s^bj;bwyyE79Lw=03r_qbuQ8Jz1SkkzaF^@7?U=GWA# zfk(k+7tVbvVIv#Apc#8-7oN9yk;hjd@rOLme;g{EW3@OJd$4^gHr23RWAe0n{r+SQ zvQ{a)JJW%1=?}Vg!RpRR;b`fpBl{&bpYZ17r98)3g`fxb!|30J5}sI#8U({JxSIW=k!7=kw&8RHbq~x>Eh;-o)VvxDt7DDMn5sC0 zGoa{aBa0G)hqQUBH3Um&UWmIA(RAPw-}_5a1c?R`sw3%ylEnOUI6Cx+6Hkh!e=_;r zCYOrv)OxOdT5T_St9PA3fGw6n{usZ2fnH}-gw>wmH482uj`XFaj>FR!8DoEjgcag* zl9?A_xa%ShBQ5h@S{JOEsCMWc=4zw0*Pf6Ln+ZpUbaRB2y`bt;!0Xm-c)F-g8%3hewZS>OxMuk;xXl> z)-pQN%uFuRx0hO;u)%QNfznTdKqJM?+AFHh80)uWou1z479sRS@#M)#+HZ;0bN7mv z{spd;IEkFr`E}t_qFT1}!v&mzVZobB6geOLAP70{UV{_-mdkob@-VdO2_a()_5zow|nEm|jWl!T3qUZch zRCk~%i)N~b$RRW_PXJ6Jsz!Ka-fMUr&2Nf(&;t`N&F~)kAVCcYudjCwp zQmOrm)!_Q}!LaN&dQc*1n#9PWzz}oYFD`CqpW-_RiPoQ^Tc!Yt9SPjy=Vas*RU-W| zS`i6nElG0;%M!R<@u-DZi7<;<@1*Tp)cvJJC*ScnoWu7QQtAhne6~Im^=BF%rW$AK z7W&MeG^FQLW^@lxFEJ`N-gE3>m&74a*Pe*QxsXoDIy=u~); z;2}tz?~SYHp{^^J*B7lV%Jxm9h8TM=2JGFJ{HC@>WBO}^-A~j9MINh5UVSN(Zl}2y zsp+GwsqLk$iI>l~zqHd8xeIitaes{nU>3vrYy2|aj7Jwn{x*|I>Xqh z*|lr68AtEsHfYG1O)Oow*OLf0i|!kxUScM2=*4n2y;09^_hzyDQ(GNME+|qs#ATG> zhE3&I-f!cPK}#fsC3YrepZqYVNA%_9%XYq-Mik9(gV=$Dpr8Bj!@0eW5CQkVhrv{C z7l)rdyVhvjJ=m%CmD37de_g-B;a!ZNpkyLGK#=J?@}fAB7o&-a>HBs0xp-(YVMA~( zgKGX#GC*qYFxEb;PHj?mHvbQlNBF*{FLUQu@lNCvbVg#_&1b6tg4#H!dFPO3gyYo) zmLr>eA2!D8R`CnYI zSjICY3^d$$QYFvmaXCJiCeBD%e;Qycla%z;qhTgFFVJ6OyJ$BowLUwHre%{4F!u7P~?Vr!)E4jpQ(2RXdXu%zaMUIsMv~m*ULLTItaarbJR748$>3)rnM!Uwu=ZmZi`y#k zVq_jVNH?#srWeU%31<2NVJ#bUV*omwGhP~u;$;Ym#;X+%Q{pdY4u~%Ph@lv2lr^?# zd>Dr^s={k^vZoy0z4vQmjM`1rNpca$n~jsOfcD>X;1lJy(BNU#qp1Tu)KWbj6KBa1 z-;#`T=-IBvyOE8f*WNj8*pjTL^y%c{HodHE6#T=FX;uhb^uc)q(((alvn!QE#{^ru zflVK#w=R|xYS3yXFx@$xK!?&(VpgDYmt%__f0y@R#1Co`mr$)YzHF3O;}bnQRDFV3 zgcK{VPoey6p{01qL|@Euu2-PZma5T~Ps{0qprpY+bfX)w+uK6pdiKKgC-Y>NFu3x> z@#zJ`vGtdAL*qI4td1fSvWo#jG#2!Y)Fjv)ZzuW8tYl(iawae8=IY(7>J#LLabv%@ z`I*lLx&MrpozLgTl`+mRz~57kxE>_r%-;?|=scxe(u}Rk8=i++mEnee@`*@sx^MP& zvhOp7w$odtb2KxyA!l51_*6(lP(*#?TaYd>$kfg!E;?+kVPKr7HiJeJ(#!4t=FEN2 z&l45@gezL<5cXEX!)F*nuphNcKDK%jLz}K{Bo$zBjnva+;ZHL-7Qa>b)OY1{*)cY? 
zxPP7Dzy`7o`IOYHN#P^nFreYGxGeRl)yPpm$l+cbzy6n3KAn<@uYNiFQty-7Im9eo~tBtAvG% zWpAapNGGh&6+y#&-Wxm7l($M@7Vvd$22lk?9OJFo zx#0rmFs3CnVG;B6?-l!9_}Hjw3MF3*qc7#!=Ql0$mR!?#U!V&xfpogW;l4vpd2U9( zM5Vz=N0AeM=#^|JT!OXrGyc5QQySu7V-UO1s4STx+!snhlBnF*9(tp_>NP96GHwE^iXMooy2?@?k##4s$lUR_ zuP5l2Jw1JvIxJ*)*77+0!YF%LVvabgWb1L*9-I66hi*a=i~7v)G6{59Q^NhI>C46U zOW^DaW--$@aTko)16bTWwIa=#X*7%XcVvV;6MpPvk>eAoBF{n7EJ>^NLzuT|k9*Ed zqOsGG3PjzoLTuDH4H6l6m<9602C_UKRoi=xw z-(H-&C2*1Nc1G0>nae5T0bA=dv~P3I>xyD?wdjPqJZl`j<#7M`$8uPt^3WPvHM3Ld z$DO@Xj1O);n;uV!pz%0AV=6FW(Em1E0K~7=eRjGjrja6iESD3|*!|`&q3P}lwW4DB zcvQ<)mIv!h1dEnxthpkCWn$j2i5?i|k-BhAfp7r-2ZwJT99|3md3f|%N+wgaeV2M( z?Bf})CpI@@yq^dYbHbvP*ur<2p#sN!2dNKWqWR!DclF8)WPMfVkc`~?#-thTZ1Q0P zX?j1*JgiCWHDS=^>X0DTg&q@zOIuf(zmZGC#??7189t6amn%#MufsI%m? zEr-q@dk(Awb@sTnV$k$Hy^YV3QaEw7-*`j$4_0#U_~I?>)ysn)Esk@frDE9B)6<77sUhx<`q6a(~ls z=mW3fQi&tpkw}a3sD3u&*M-v}9t(80W0e)mFAt}P?ki>$y+s;;>p_T(36X|ik1xIBZK311F!4wD$^ZTqpizbGtkAz)q@^xq{RxCIJ_X! z<3zl85}Yg9Pf{ANl3eEEE^^-x@?4Ir(=E?3{O6LoPet(0(Lix^W+EuQo{1`zDRvk7 zO4X`=CWn97fn1*RE5P|0ohH{Qq8Ow?gBI08f75VI4#|K1?y|BUKKJupMZjeRSNwj%K-7f@WM=5fNfGMpGU0@u%&>Irn(T`CPi;*MBhl_**=D? z^c0S_NsImozoK6qZY2*DYjjfs(vMZfvwwdUDk)HBXFQyozdH^h zca_#1@!{NER;c1m9~tze;2pG9E2Ft+Nd{?ku*o;v3ysLAlJbTp!Q$<4JNIfG1NoY3 zW3?rM(rZuapd|X!)bd?4#T$AFA@5g@*fIh~A#%6fq5k|TsRm_~gVsu+zoYzwtvdVI z(W&*2x=0~CntB)fjedj*pul_>8{owGGSV0?&=_10=PHe|o`oDKDdJeik!4pqi{YVw zh=6`7;&*LHbVI;RfS883Bf!o^<~vlkHnK{$^1T%5-303rzLvZpnDQZ61<-e(mbpgV zyo9})*P%d!FB8&^&V*n2tD`#rtn>FqzdXV!4I35(iP7H$O=j|16#UgPbZL_?5=E zI`=X7U8(j~YNCIJUqG7heJOYg3?zDE+%E+LUBbieuxw-aQUx9!xKg)xVZR7Kr{PFo zM-CC}eDj0`JN`4Z;~XE0)Epm6OOrO1!%et({p_ry2igj_EV{(t4}!Q8_dfQ3YFvDuKO&TYlaOAF z@uI1rlO4s30;J%U*j(V308q&^)~a^=E7Q~i#13uOtpmw+!@X!XWPiz8b^YUPHMiKZOG^~Fz)Z%#}3~E zC)VJvVD_!t_yO$A)LDH(gCSb-c)V%=~z;8A^ z&%UZ-$eQX7)L1>vqioTe7QjcWJ&a4{%0?;lAzO$ZOOnf%N(!pjqQf;kn{nz3dua!_ zxRq@osQ$v04gHGbIzHzIx}r)tk>AW{WmKO-K<82H_Sl38OFha@w2-yxgO4%X-E2;@ zN-Y@A?$k7&#|&V$KMBl+0+$^^s{RUt@k})yB!6H906*mh4CT^^wI&u<_|&h&2;LXB`~QbgM2L^ z$bUSlAxJ5dMNF(=7N==zyI+uh^jXoY^Rz@Y>hwo7z0}qedEsQ17f$dN0*fmxdX;eb zNV^=}bu`hwgzkRPR}A}WuoBIMKe4}Q_B|_7wo;&qFDXpcN(IJ3c}chL-S;%>bW6$1 z9i7-L^eP&XLZt+nvtnP8{c(u}iS{R&{ZUeSar{x{c);3k1R4CT*!>u?Ga)piKgx6W zFpiFe0FgJ$Ax!_$1o#H&d$K=jgYMGdR=(qKjLX&Q@50r?fFl33a(F4Rcj3pE(lPu9 z4}!*=0nO)#}itsK03TfA1!(Kw|JvLVUtadC`}a zqiu}ltLbk&k(W?gq{WMX-18{1w+YJI98GzP7U3}Y2K@$jk>p$Sj0)d?_5q(HJFg-6 z_69Ns{N{G9@U=smOWElt+FInur>t+P#+|d`J*!&$9db&gE>C1e9ssVtTWq|5hi$6* zFgYf}^7{sRB9m>N$UGOpa1&~QFJFkbh>qj7>97sX{x1J&K?Yi8if7C5YVE_kWsm9Y z`Trq@pG19|C-OS$+WN;&7ZRyS=WaE8sC8aoc#?dcqx=kYT*r1 zd2jmuVb%t~O5Lcb&AiIa*eM!cqpUZq*VV@L@;ZO@wR0Di2z`Ks7Dz0bhm~|zwad4t z4_N6Ak1Fs+##8uPkrpd77Zv)bUyJn5qnIIQjjx+}U_Q@&&wPp=LVIy6_9fog$annv z+>w$zcTEZVKG&2C=7E;D;2&Aba||Z^7jR9I!x~+r+;%>RxB|Ujx2d0y9~UV^k^F6t z7VjXB*DV}a;J=RTj>Deh6AgGcO3^qxqU}K2Cw-iYaR>z)Jhx>?&O7^e9*r>?Ug9#E5$Ft)}Vc{F}MM=Hk?0{4LZyWdPYmG(>evq~lMsZ8GtfriRp`unb~&dNjoU{b57 z@4qX&glgQ%1}UiWaq^#lu0p_|;FxswG)KfC(HQYPbj30{mVcK_EH}2;n75hYW&19LxGKn$6+B<=ieeo8NvL8nSDMR{>@<_M0_=VO5iQ%}#nmn@Km}bEc zRr)Tg+z&g@KoY*D5g{jek1q_HZ}x^c+Mj> z>uQi^DYkU@eMl)6+T4-r@__XMbhyTsqy9XX1+gFIVw`=0?-bybVqY%BTdVk%e=j3Z z0D$m3`09IKclGtTyb>?aE3Lk+u$!?6(Z6l#QoiYrxltCTuo8+@Mf}@aokbn)R_~Zg z@u0r$Zso^lQLFCvv|Am_PyD-3AxpS8Jc>T{AujQ%G+g&|-HbZZ(*$lm133`bRwyGAs>5l=g zm?t}9{#V_Bl@^Z^WP~{n5S2y53&-p zO8e=?>!I-~tK2YTxA`lB{98(K(Z%FC0t}RT?&4JL=`{5nA0PbJd_3+h$US|5_C??) 
z`JVq*;M`D-^mPPjnFwR-8TJUM60{gQ(1$pH-Z>~G2EXzBj~df|RD9ldya z7N5Ak&q|>ekIib6Ud*4xVB?>ahF*-Bbu4;u^(>|rm(Oa8Uf5=}NiVWz@i99(GY!2^ zXC9kgteVMqvSMai^y0;t$EFvHMGFgNwnZ<-%{(@}C>AaBo7one2~z4P=m@ZLD>*zmq_8t?PkX>Gy#d((~$?~jNU z?w{5cy!)md8{UVC7Ch71g7=H39UI79Oc>iEZoACbR6o&VQr=)@Rd!`%<-fx@2@P6}@w&2}0rA>H0dkVvQ zuPJHZ-7@9a@UBkgecnE~EqJe_UwFU1tOgc8aUo2WUcT!vMZk=>&c;7!!$iEZYlz%7wAIiV~Dfz?BemI^#tOHLX zA%9r?g!UI(#%_JmaH5<5Tca!T2`uQLpigkGhUegO5z(kA;tZ8prr(^SHL~QN_46 z@zD$87#}@3E)70fGVa*;Xwo=7hhxUIg^vb|J2pN#N3?MIxVG?7*0^KiqhH2y3qOu+ z3m+{Xdu)94tZ3o!v2Ed_d&V9cAB_<$l#Xo+A6+^2c<|AGIK1c1NeAy^Z)12Laa#(! z%eS=&?-$(0@P77fY2e*@+p*yNz^x4Ldv0wD-q+sRCcM9UE5rNix2A#j$8J3~ywAOr z_j$&xZNdAEw;mhbFBL6Zcxzkm-sRR~!+YHrZsFH4ZNdAxF~^4Yw?qrCj%f?tmyS6$ zyw4CVOd8V`yk9%!{{eVEJS`o(myKq4pFBDR-b+Wf3GY{oW_a&AIt{$rMjs2_Ge$GK zH{H?}yl=myO?Y2<3&Z=Tx1@phmv1>Xyf3+h_xYY%+Jg59w;UVZ2Z$D~yrnI8x8HJX zc+V0o92wOXy#F-n*zmr56u0ovsJ7t!nNi1v_j^POb4RrW@3)LP9(dRJO_?AaE!aMc zI(d+yPK*pE9Dx(E5`Z~R3YKRkABxP(&8uXfHrpzSSFCN2l5^mq%VKZCwxQ~Pabz(w~AQ0SplzX(b+2ixKr|a=_ z(c`HpJ!Tnuw5j#OjXi>laa0e;XsgGI)Asn;mJKdINZ=S7MmJIpI<6yJJHUpneNtc-JD%lWy2H;?^1Dll2@{80toz?Y zMssR<*;xvB{!@Wc&n`z_C|Kn1BnJ%A9X^rrBPdlK=TWb6)FTbpiykw?|41jVGEy%J@kfu~fxfpUZ)NYv|OA9Je z>o?HU5AoC|-C>w|%$yA5mFSW6mB{oa2NpU^Qt&xC=c8@36XA}YaG8U$S@nmp?SpgH zLJ7+r$PljQ+A?+(+i(?ZLm|ELBEFJapE~U#vo`I^%wpPnejMyC=SYSktxf~1Zh1!H z_DT1@lME30hW5|k4K2(_=;eG$7&Xe~dk8o4oI8%OnZdc#-EvBfL;-O*w5}d>S?hE= zuWcv0hdBX^&DyTqiieYpOdBA--N%Mg7_lRR1>4B~pDe@u9yyo}WWf4q_aa>!>bQKzD0IxTawq#go#_W2-q4jN_PNtjKmX0^{m% zk!-{=>x%?U{x(sYixkX2C-$798d|1ZtTbuyg3=#nGFn-3dy8o9q81IQQQ+ze6FDd? z2VwOBL%fru+=%nbkNf9i1}h2ypkUD~yL$UXV~idg0f#veY^D24Xk<+ubopMR7KqM@ zanLs(W&eDF$mE&===Yz9W{ylSoV7KkWH?2%8MU_>^HI^E?u3VWHD>?D^Sp{qcJ!Ph zl$FX5O7k9$=$E4j`Bw&NII5Nc`{rTHC58)NBnGtH1zTPx<#wT#*K*5WHf#Quk}?pN z^r2=wzF;ym6MH`%|1bmBcUz18d;-2-sl_8C$Ug>V0n-8?!?1!Ra3o|GVv-0`0tvsK z>H?IBx&ViOrw7!s8+GX?$75Jt9j~QWY)p+|`3E4H*Wxt&O{VGoVN?|$jD$TNDsI<8 zKiX61$DLi$QR#Fvjq@-V=(O*4L>l)ltf#chp70E$(U^Fj=5wb7CL)J(aYj5HDd;DT zM1=0pf%|rxcHpGYbR@s}K&AVKAe3O88O-mLN>Ae~TT;E0|CaSXcieyI`cr)4zr6lK z#{L(re+obIz0S{cqZB;1!N8{h@;K-L9ufE%xARw7iMpi$Ut&Us&_^Z_H>k&LH|dCx z%zU)NhzJUuuB6uXXt{@S^k95Id^2u~6BAGFPFX9iKsk>^afOaVgDJ{9 zzk)JP$jawZ@EWs@n&SKhvl@gz&ei-uwWNL;U4(8A-)h9SKt-0AlV*7T>AL?necyn= zTFYZE9 zGSK)Z#Mfcrx!M}E!!@|GM>_T>UT4ig`4Nm9!EU!+hfP}Y7SfPVi^2cj9llI09)ru{ zyD#>%wl8hdr3(!7YnZI;EE+n5vua_A;~*U9&=@*ykxB5!cb1e(5vo$I_Z}&C*itEX z)H71<_V)iT%n@Fu1C5qhBixkQdWcn7?hfDG zr4cajNsV%3NRAx%35fqU=2h<8F9X{f@!j6=H-ELVCQxbdhO_!%=ESVaXVn_~eNTwa zF|6;)S$;e0iwXnmVt+Yf$q=jWM`d}D^nP)dmI+GnNiDIjm=41bXaq&!;x0$v!I9WI zYU@??A}eNGU#TnRsZ89YglM8tL;(w|FjXwp=Akq$;l3S~IbhnIug=Rd6)9^!vYOZn zI)idJb#At0n~E?EyeWTOJ_Z1)PK*VB&9b^FFpRy;EplYARd#+O1qr(;ltxsxii`)c z#T{5_t#(=I2_RewXv~vqTvT?=wzGh2q2&)qgvjs7a!7A=1x? 
z&u+muoLaT=x_XWYlPR%0G=lkqcVWcJi)i)@qf$LwVv0SB--UkgJs#V#^q@4|WtDP) zVdLd+$2Fdz3?ac#N+jFMJFlTzY+y7H+XY&3U^M`We{R0Jw zQDZ?4tOs0JPU*{G#Y<)9DnO{23O7dM9CoRm@blUGO(GB5hwWfY_i(m}7Rq$9%xCXU`N+|W9@eTOdTxiq6*N|^t>9cm z!RA5{ju`?B=Oc%@90FTbW>Jr#iY?N2%#v(vI5lb%?}fRLgOe&afILoG_cY@>%`iSw z!uZY<c^rdd`=peiYK&ZXx7`8YHo3UI zwb*&Ly{A-;xp-hMfB7&=%Y$F5{t@3VBqx%Yq$?VU7A8#Sga@;Aa86?Lqx9yn2oKeY z?O2SfQEZ>&=9g;ZT%gcRGy`0V&B)_?Aff_#%g*Ie`Ask>b=_FXx|H&%v)XrpjIvv+ z`g9*QsXcl{eQS}jqC}}DK9s5}MfwhzqytH2qeob$CWk$U14`*@PW^~pL0I(CI z>%tagDMaa{BYpiJ#}Ay^zOBIz(mSX$bWkcfcxF0xFxbM!nDe$?NP6#}mh(W1pFNYa z*(N1Cpm>tfBJ4hMSw}jLt~|LgyQCVmM(e0F%(m9G(z5;0m7Je8(s%YphM{|ItZyyB zgNNy3ivX_xjJpHBn&m)+nJimHkpX7-+mo4nkQjK?&aWI!q0K!{oA-BUT2;EoalMY1R5ja{nY?%u&C-#tY)=K3U+L6 z6%n$tI@vN3QuU3-MGQyR^SHz z(nLHJZ!jhoCwP#rN`Z&I^!n*X4e)T%QLfa~Jx%NF==EKP4XtlF%=PVJSEe++u?pWl z)7W@dxB&j+2cW0V{viuiVrzbdy7nRD^u4S0hjwCbs83u9e?EhEkjev`ga_s&+d1W}{T#$}pG!!-bZ0#UQ@z0M^r5h53~kUGc1|(NCkYi zl-@5y`6PBGwhxR$psADZ%s^9T|Cu<~^F&_=Af50=6YOZP!nc8V+7VUsr)p_@Kienm zZ+n1=ysE_OF$ve%cyjeReeF3bjq!Sa(lsh5jn`w6t{+RfUYm4{Rf(z+^Jj=!tD@}q z087n4YbP9D%|I%Y2Y+qiFQR{wa#NnH48#KE2VZw+1K3g z4+xQ-$P9}Y32v+3B`cG2Jj!GWwol6SL3$x|1JrTb}jh&Ck4ZhFaj`67EL_>0f zM`=>OhKFABQPfEN9dXCkg>c&bMmbaG!LiDo{jnIFR>=JILWbOWmw;SAdy%UTBRo7> z0o=vI%gUS{Q3;jS*UenDW7C3 zR?y&AEr$o$K>DF_f0&9t5G0=O9eXL|H9kmrPJE{myrS)*5OrV6rNw)N9_Owcldm1+ zoY^Rq)-W-{mgW&tJdsT>DV65Jq&Th9w!va3WGh^RrLN|5#PyLD^(6Dl6SSb{QFBK06*KD3=Cs3=nAJnqAxgnF zh?G^)5;(sq+oG#1W7{Vz<<+W2jymuO4ckP;U3lfRCG$CuKc@0fg?HGb@=M|KSzU?? z?{G-vsJe>pf8v-hEjynZF|Ka2m zND}75xR$`?klNWv6$NOSLw^Ali09K#ie-KGaE2havV!cl9%XD^)GzFE1TTx4kaH+s zfO#SQshxKKgju}E10qLc4j#FXl9c}NKSodff%9Y+9lA^Ie! zg3gaJc}PS5E}(jHr0Y@%SlqE$i)8cK*6jj6eQ;0W_&%^EUPsTnkoX{^>omW?N$=Z|u3d@O z`9x(6?fq~;;`-uL=D@(03Go~!5ZzT;-s)G z>>u9NNwZtY!u&EP_2FYU+-b~rG;|oSCJ6LgYW>$C^}m^n=Uv&u+c`G!8 z&{LA%0L_%-|1ty<@;77Zs=%*SgdlQXIPYAdmH;uGjT{fNH%H!gq%c45H&6U6U>JsJ z=qZ@nqe+$_z;}+GzMwZ&FeA#2BRgjZp?Mn`*AOJkU&+cK;HU-_0BhuOhf6H(u-}B! 
z$*q^2`+X;3?PFz+y7epuA#)VuH+VWLj$A?lrN!Oq zsC?Q;7qLY>%$j{=dx~F*9G;OUhX=!M|8{MpQ-+-Tja+#+Q+6J9OV|A-I}b>~i--g2 zWS5;CrA4Sikjif*6UB~LPeXo@@M#`rcWF^6z2KrVx*U#irA2mb@b8eF zZ`@zJnG$J{8@2l4U_3=*9G=W^n3`=cb7V5PVn>!y^-Qe3SK|^e4P z5Uzvm@C^lykx02O8y9apI>ISi)oa|mB<{fC$B=6zRtA$Ka&q+ZsA~f&^D1SO9qWo# zQhj60Cq)G_GxF*=-PF(OArRD~53{}~d^LLL*k=9dGaTBS`#j+rU}4MhFbNfKC=Rc( z)~)>J&Rylst@HxAfmL>!E8LaMnO^56SZo5E0#6u_eg~<1IkZvMi`?q9eR9gq0%_4J zxYy;c^XV4rTkYSG-|BXLj|i{9Nr#m0~$RNuXp6cY%dA6>|3HKPwXdkRIzZuMDlf6pbxq*x>aV*In6 zp2|w~h$#AR8=tG)%0MQbXYEV}C2aC_9AT&r3#6!KM0DtN^`fp__HS_sQ4RwEhu!TE zwO`234<iVNCo|JJtJU#hiVp3wf` zetKdPm2 zG(pRu2LF1mvcDw1wIqLU^h)yA5S}m;_ev`97$MaYW8Ju&vKvtXD#^zr{peCj;*ZC< z8s+u=RyJfJZP-~7tRR{0O%wVXtd`2R;0_0Vr8B@beWI^A{+|r&{s1fC3gA54${xf{ zx$E4yRbJ&g#7}PXa(CqsKvO?^rR$b^oE5%&PuS{lo;)X5);&6&@r@_k-Q&!W7EL4k zKe#oHFWk-w#20&{;N^t6ilpq^>0+9p$=2iET4^ z%bbXlTYMIo*yD3D(4%W%PS;u(5B7H;H<}Cl#Q`4uY2GLQ;B!6B zX)yeQ(qbw%OO7!w2*;Sc!vV@z88YW?Y0)zj8b-EeSDu^0rLTcGe2wJ*b6b##CrteiP>jeJHF_Qf}W^|2tA`*P8R zrY#qklPFJ|bFHs0z%S+_-JEi^UBchx6!s~`{HGanzLNr{>xw3f9Wr>VtRGdc)vdb$ zSd7Nutr@tmk{xtHQ<&oXdl9123{EsaOJctx5qL-0PDcce4 zN8_!gSVGoEkQIT%_`%ncB_VQwZb{hJlPn3>{hcfcYoBIIg6hyM38QXh!D9S*LLZAK z+Y^~yKyu#^)RMSlr5=@4r^fmx`zOZp8Sk01EAC|&bp;#zr@_rx=cm-K_b4&9bEIAN zor>LWCHo#G_T5wV9g?Co@yxz`IkWG)5~NE};)j*Zl-Rdgop5D+>j7;ejojKK$5$1N%n>8Z=Q9am@G%8=KwaGY=M3Adsd0hPzYh>G4a_$YWJTXB&ZoHKQX#qQ!-f0&7Shh49COiGWS1}X zI*)+R1$T#)`oG5E!H7rXu1#Oh&y^Ef6u*Y`@IOSVR823s!jj$>%c=U_* zIv#zo8{yIRFA^SI@;KwsXU@{`XxEX9M|+)WXf0|PNmzN8C1@^_HQJ#gcQD%C{0H4vF!w4t-UU5gd4;1f@{J)k*n zc#_3`Dld-8i=*2Q=(GW+@g-1c^?|F|0O3(~P|1RQgar+s(D`*lCtj`7Lp-ksCrGsw zoQ`-p3}&GvIfXs?L*l+-b#fc_3hRQ^o~_i&Rs^UnR}1e)L~Rn6aIff9uZ4Y6=^$+h z+Hwi;m@h2^j7|2sP+Yl@u6Sbk)9S-D-G%?79F@)QEHW}4AQBniUn~H)+3k7 zE8UI0a2;DkTwz2M7_+Lc^sX1#rQ%yz-@~w#1@WycO^Q}tDo47+bsKA?a#TFVZ4;=VFJ*^K@tzv12* z#nP!l`Zd~uiMEkaPSoYbLU-6tAfawNmweMOFUWiX`>;ta%zhQPXa7Lsx7~=ovb_+_ z8lVfj?f=!t6X;enPwW0n*QWT3NsJKvRnaHU`b&IJ?GkBf;&y>n0}Qp2r8q&|9zh*aU9nMe)k zN%}fw{$XA@T+ku39sVfa%8`huXDkKpL`A*wRm@6qtzMtkcVc0wZFa3@JUHNSMx~$^ z-YqH|W|IO~=HataT0?RP?_^hPPR&6)G6N2Ijpol$dDth~~f^l>Qslcn#7jD2*m7Y??$(e^0R{8nl4S(zL@ zd?&hsc(DS6>Q#=y(4k*{2aj{L6nZJ{4e~jb&6=K%u6d&h;)P6pkV52ksQm19j&g~_ z=9oVhx(&;=C_2?MU0IeP))oIopNq$1Qqk)!cBvdw^8h)HEOe;nWO1^cmTugkH!?mC z#eqM>9v1K%?r(>^c(|g-`L*=aa@%r$7nJ{S8%~2g4iB=ygZZ0LocKsn+3gO3ygdxi z06bc7H6r3G;J6gg1f|dkxIZNEDVM?6efFFhmz^p44ifxYl^Hga{qD#^-;$ES<|N}N zpC6F_aFY08bQ!Aoov)*Y$JbA&Nv*}O{XNz77wFp}v;GN|K?;0@bOOs_!d?{vWb~9m zlc?@T6t*U6O9i1VmueuSfd+t*9Qd@60Vo-y5T2+m%?z2UXcT$E5gcAo;q|uJeex@! 
zO`1PzabcMa)>!(ZiaYhxazL1&iX!RZnxZqlkA4@g+u;enjwb}ur=K2;m~=guJ6vKd z`bZbl;g}SK=i8jkvv$FgoRa)?B&e4J*TLNUBb-fh&T)t56hvFF1P;2`qb&9{Fq2mdPb((1Z2fGXH@qOn<81L=B#TJ!n<%CL{lYrgrND4f z`pv~dQ1b!7m{f99mWrw9FCeDFQfRhWM@&s#7dA1p>C;riIg znqI=d7sy7wKz8o&wJR>1D$1jE66Mi8UQ6AH@@V5Jw0|X~;fhQp+ILp;8QSk7(t{O& zrc9UgP_@fe?eC`1y`D(-hS}3AII%eu!x%FH)KCyzD|89VETRQ!!44i=PF+?>BT|h|(+ro2UNEr)4FfRW` zFoD_cBJur4lmdj5_0f7mzJ_prTX={adAJ8|B~tyJ^dJzQ1`wu;q^GNk&e#w|&{2!q zvwinC_pfWG|JnU}q3L(_FYB25m(|ApWwx<@8!puLFDqsLKKjGIynmlv8Q;H5+`p{j z-oLDr{mV?)zhHKKtLD#G&pHecK4)9o(qWWY$*4PHMb2j3cLgBxZtv=bAiQ&$Z4f2Is`TjUWevSIE*dH#% zTufHk8Iyt(f2Y)8d~A*f<=WU;ar_74&!n+e35(Je%6;j(5r&2GI7)E@llKud==%5h zB&G~i947Q$t5GjU?Vw2f=bK3tqvb-MJ=ptRNowlw97a%;8w9{C;(Hmdzxp9M<$N#y{%K*}}uC2KGP-{vc40R_qY{ zC5*)me?p(H8&P8%2rWT>FMy{mqDiHHF{zOZVS^%jQ$aF+q}4lQ?b3$cTK#7r+*?Qj zGOpnL%h1F95nXS@bq}vUxjwn>;ZLIO;iZoHR#6+ezb#s4z*m&tfJ|BscBt3N8^x?bS$$=Jv0XcIej z|KU&_otatyc6AS$Q6g6~@5?l)_n$+@OTAQ>R!%>Wv1&ShOwOeyoB-?7p*8gZGR#&{ zyl!M3tlvDFC!9wSTFQO_b`2oxWF>-K9w!(re=JjjyM37ickKx78od#8=^5)FXw!;n zx|Ltu&V5qw3(8zN$l(c>ILKH844ubW=~XH{(ty>JtcGfhe{0fejeEEnod@w60YL)i1yS%XeYHR`qPc^?L}M|!*+yFj$sM*x3GCIP<@wHn72&z^O@nX`b| z5qV(dkn50n)d#UqNiLL%`D@(HsW#sNQHkBE&g!MRfsb*xl?sn@2cA}Gz&D8O&cWO# zLJ#a=zjdoRx~HC;iq4+DI}LBU2IUm@V@N!6lO`s4l@(HG97#i}n^LWOE<3*xsY)=| z#@H7-^&aejePKQ|y|**u+-QEQp403Pl!iuRQ^d#{Wld#m@2E8)K1Wje5QjgN_s@41 z#Xn!FPhIAzuf_uNV(NLA14ryW6*r8}zenQy5t3|J9$dpb3nyuKyFKCUJ4@Xts7k>L zcf7vswy0ZPLZy-`E*X{jwKnr$7=`Hb(D0s^b~NlX;9Mb2~Gh%uB$N z4fyE9vlvh25uPkQg5g5ARyY=97_RKDEs#YDUWV@fUZKAy{MIN1t?*6k4deNMKHzv0 ziimF4<9X19^%72Sk-rBp*@`n5)%82Ts4goDP+e<3gCAyp8@F*E^q`hB*Ae2oq8}f) zvpXd4Z!K|u$a{Znq|fKr5>i|sjRZz6GSWTT#Nb>>L??x*`J~2OD*mQX83qWTszQwY2N7TfVRuv-JMK&ZV_23zpOo7 z0-}7<0wDZeiD^mDj^12uN>K8I{bkzIR?IP?KHi0mIt7SoCZSbm3~6$ZZA-P1+E0jE zNe8g~o$aI6V1*QFX-^c-8ov{?m`u8zl2dz1;hmmTl`^cWAAdVlBQ+L(w{(2w+r($SE`vyhbu59;d=-~J2OCe~ zGqLe9o3Nc=;}01c@14Yd?oPvherx1E%WcfHV}Wg06m60gLKtH9S*{wY zvmEi-+do`E2oU5ARexAc2s~C$2GVdo0PL$a*M8&Ov z*L$(Gj)b8CkFrm711`Y~@O-vBz#F9=tM%>U$7&$kvBkNv;hCwA(=*#}uISFq z*@km9Dm#9zY}&b+Bn4k0xVl6Jyi!5KW?bL~6ot7BIN1*3h@g0$yct`?((G&>$2g@@ z`Ma4K#u=M!*n^;*v=##EtfH3oAl9Lzp@(RLTG!$tQg4`ch;j~1pi?)L)(=nArCss( z1nrF!`x~^IC=S#{`C(?j_mj4JBC{o(4l8*QcHJb9cj6rT&RK8WrJLDGP4vGJ3+@HpC|-PA5N8I$jArY0|f zA0|$?wh_1h8_TykN@?&YuhCC+ejO?us|nV;$^sY4z;3{mB>}>f3+^ObIk6w%O5a5o z!EHkn1enYrP>@RszMo}O98_D}3cvPyQ2U4MIMli|{3PHuhvN`O|!G5yuenJ~cDXK8NKPRc`d!$YzQXwV4IY$_4BNf*9W8he% z&{mx4y(^Grr_HE{e2kpG4q2mHvzU*O@ig!$c*xE^(VIEEi1-y8g{;9@^r=SSZ$N5~ zxTlU3+(q+5_5gWAbOj*z75gW0O*lO(o> z!(h~Xh-V^nPFmr#>&MT8s%&(qa9s+B*9 zp4Ij8IIh9WPj1TKaShh^Nim*1`0$e)e2ByVIZCN1?zdQFjka$#_G^tkeJJVrsPQ@{ zy6{lab<*?qo8op!&flx=?&l_Lci%8+ySs|sI~cvEDXtGl{;s=O`>sj*E|f$0n}mZF zo{?ZXiY$*s_Z`va7y3b_b$hM}lLW`t4Mlw?h(ekUuE*gzp@T!GSG?09ey1v0c-YVv zPzwLF1otq!Gcx%Oj6@1z#76W*P`DT+?1}Ny9sZ%o5||U)0^s8y%|S(jPFVgCE&YDg#u>*;5}8r zrda8D3x>~Df(&%Q7Lb7|h|{l%RpY_AWG8>p+yI|6TvV_1L);8B8di6}lUNWw-v1+f z9O%NxS0~i)y0g~5DLR^^tS|7TTet_c9kdtwXIx%tjmt}cUxA|BN3=Nn)-$Gr%z&H$ z62l7)p`R{0p@|AKuMxTWHYwnFIqrJFsVu>`atcXb1{UnI{I^wdq z)Vf_T{lMLJ6s{eFg>QoWz8aYLtKm@jGc*kJ12Pq|kzeSyQEh*@P?4{cg7;BO&=lxg zsquz=TTE&$ZALZKtNM+$g72RM_d$?Gq*jgO>?GJHcer?qi8c%PN42)0jwWJF<#oo9 zDpp?WAzo)DX*vqL%0U>J~X*KP|>2uyfxtusLrdn**(FLCTQlyVf1P)`9vN z(uq*pJvDZR2JTRYb=1z1!{5WLL=7sCi;`3zy+w%|AjGIeKu(8}M5_131n)!k5tqZQ z3~*^_(m;mMcNE?3wG4&dN{cS1yrEY#|Bz|oG+nvB)EVv9DfBXAv~M%?R3Bs%S{gTZ zx#D>`S@8yfnbL%24U#i}^TE6#pVRkvL;?r-{C&{~=WhbAfa*7U>L`a;^cF+@)r9!V zR_**?mTmIGm;>zk6uUL`3;{%)Z)f9{JG{_BE5|B_D5)G8uXK}zur;l>L5YreXVFojeY-MX4 zJRWu#tEY|v2FR~S%m(Z4lMGr3TXy+x3`mO_2z(l%?E!pZKd6h;x-{eK#285(+Q93P 
zUy%?%{BK_e7Fx17R}J5SYqRk>JP?2ZozcM z{J)OKuSg6QMgPkG;&|mBrGG14IkLU(c;b~CI;X@dkFBlyABk7~FJ7qB(R_z$fQv7<-*zfm9J z67&D!`VhNFFY1yU&23U zHvfzGXEoZge;5B$HU0k={|s#SUHtR>;r~?p)A8_sDE`@T^1sFZH2#<1AL+gJ!3y7) zaM$n)4pX?Vf|28a8gog0MbU>Cni)Hd_p@IIUvXf$8J|h-4a5Z8J^FR@N44ufwK;zs z=Q|2=awes#6Wa2%Q~LIWzXvt=QogA?+`BT`E0WbQu)Tfms$ykzZl(G2*o)%1p;mW_ z;s!l`BcnJc7iRho=1?Ty5q0+usH|+1+`=zID)*Z;W5M6q1+$BN33?_*`tDzrWkMBW zu|4NQ%r{w`l}W1PKMP4TXBlOoWSvc*sk1B%+<^L;@;{S?vGZ_8wh7Zyp~bgW-Al2I zy&7)D5Mke9QEL z9`w=&b!_d;5r&;@>WgrP;(|~Qr@F&WWa%X)(76ChO!&N5UIO)~hb7jX6m`}T9Q@$~ z>c_fFkI7uQfz<2gj3@Q_OT&3QpZ%ntd#Z;`!e>IN|Gi^jk0004{Ku9qHPqiJQtPz( zJH=dQ$i`)BaWM84&B%g^tLRuApP*1UjFa-j*3ukNnWJ02r);aE#3%3kQrD_3+=S-6 zzJGcX<`KDK(;GOQ<@puroHcm=iqe&tCN*}1)IXRBWCn2Eu{V?A_c`6(yNBvpF(!Yh zxPC#qj{X&X8S=du^Dk`gK+8CNk-m%z*3&X>8%xVLrj(a)?v6Sw|H9tW*x$YW=Q=Gv z2zThS*SChttdzUYa-c4bA1S}b+3Vpj8(j54oSmNIrV1k_k8`{KdTb&enkd4W@vyur zNzFEbRgszEt*f!;6ZDSMXYeIT7qE{~U{KbDkJr?3p2J#nJVsEHW+$&|^zm_%xc*h#d0f-ad&3KJ zFbDl0cX%}W_Z-w*22OH^7qb6&w}XW8yLoaCY90fU<}t7gJqG4};8l)7Q&voe|E|Y5 z#cuJ9cZWwg+|I42^w*rn7ZB(1TRLZkOUaobPdKN&@D*c^xK>gMm9wk0Q8S705CHCc z3jfxg({fXww6aZ& zc0v8=P;?UmA__M)c1ibjCuy;hzdjqk?kv9kdlz)8n_|yne>!wiG!zG;@=vf+d0KHW z7ZwM@!eC5ox%84^)hdIjKd!5b*UVzu+$%rS39ULRUPa0>7d@48zb%d%2ypF^FQr)w;t~ItY2BheUca{T!>AsDS&CB@v#BFuS{@?NZ1BZm$ zH-SKk?{}FEcWu4}hXoM9fS`ee7S|;7;hTkpY+-%UN(!6yz|A^xO^7QwA@2NKr7rr$ z__414-YoRrVkVf_N;xWGyId(DAQX&)V+zm56*8#oX_jcYeH_zCM21)~{{>f;NCI{$ zLbpeq)tY`XcWp{P8BC<&qBL3R`)kuprDzjRMcG=Hkk9(xoX_W{AK!d#O9pc>pI0~j z?tFHyNjIOF8+kq-ZZXbB;NE5iuCzfq!m>rk9Du9q67na=q^ekJ{qu<)_0p-reKEw< zK&^iMjGQ{P9328Gu>>H9L%)y_cau)gab?C3L)*_DKT*rRXSC=rM2D&VoOl3Al}1pY z$Jig$eP7cQkyr=lQ)6Q@zh~0_;g3X2Vmq|>L)z1Nk%{M6dpeKzbf0#agY)El75A|3 zYIS!{-eNaa=lCsYmuZ7x-n83l&mHM^dBukGyWCmZWzPG3^s96`UA}>Ldd&}YDfzzs zJLY?+$8pbh%Gc@V``!BEns4W{^L=zZ&v)DA|C#x=KmPeXky)R1zQ25TT=VVqWxDyk z^c~N)c2nwnQ|kexWvRpz=Ys4@(^Q9ijy>gZay(0F*}mt@rsO2=>*1E#qmG%=aDSM} zC0fnC{Zz9L#~53|r*7;obtFZMAk*?_tNXmsK&z_Rqg3wL&mF82xsoECP(tN&MfnY=EFyd&LPIQaWpgGqr{;YDpM-wE+_02Kh5S{n#;R%g z6(DkiJ5>knw3vNo$&qW!Ab6|?*|19m6|QquVXXS++AIDoUZszCve8eZv=05`*zCPe zKtqAR=_tGtx@T?(!;RHpjRxZ}0v+w^nswT4ljXSrFGG9Lo>Z&8P0xJ9A+o=vN7?Sb zB#A*>n!h)?Ai~vqPAhg>s?+%-94K3{n5@IY38a{I=5UrQje4Tr z3_J6S8xaqNieeZUreul~+k0~i3`iY=J@D`x58OLj-PTmsT6!}D1s*_kKn|ucyt$i* z6GUx@>gi?(tDOZrIFNH)-aa%iMXy>^)S$%=t7g0PyBAXgWOYJBB{7*^C0Tq&i!}R`&Fm}QVRIYmH<++z-LsA z1%Vo&hoXq7oL9K-i#Otkx;}jqE-Goj8q~K=q@@MumVyh&kNQaxE}c)JaOpo;mjp^1 z<$J486mXL}`KxO&++ZFEor4TQ=L+hG+``<-*T#6|1bK6h#~DkNH(&ZilQ&~+%bP6* zc@vWj1+S+e-2yV_7M3~Ji&kf|#B)8jdOBvmjSHaN^pHHPe@ek4OWJcwE)TCC><9T7 z)=(vSy`XbAHyEw1`tz6Uh?B#s@(}IhW}(*F^^16jU?dwu`=t=2mX)IXl!5%>F}9I^7aaf;F{A7-gonT(4nZDKVH+vE)ZqfF+BCOH z!Y!D@1RcTDo4*ocjb6u|mdZat66*ufjr#P8q%TSS%>9ZnIPUlzhHM@%BpIzJ(X4eM;>SHN)3$1z`EqN3XH>q5M|$ix^6l{Ii|0iRt4%Rt!e zO2lqgsOAR7ZunrG%Dl2)1>b z6vA|3>Kn@dHJxTQ@$$Tq6Cw8KIYQ-;D9{}8$t?UjXP217IQH8 z47GRw!(ey@*j+8(pM zX)hcRn}(QRu%@j~zj1&Q;C`tmDbnFo$a3lav0nnys|9I0&;!R)DcGJiVs9(W6qn5Ut9<>jitIfuI6iM(i z0z0p6j>nIrZD&0q-e;QiA)Xc=;?kxzhj=FranFAEbUc50fysZWXrWoh*uiU?(wv*; zTHvz_Q$Fhy@3>pK&-!hF&kn`obW&dU7&zYOhNjf{iswB#0s4Bc?dKx^+M-ydg z;K|+Tp40%WM{k2C>inPKS82L;E&-W2dL_Q&eiwfAN%zh+AYak$_|AH4(VFfZjQ5E) z057Hf331Y|qzl8tq4^7UT*=2pI~8#2&x`}XKaEwyYL^=HRa9GnxM9?1ToKA`HQx)_6=dLyy$TWQ<%8hdqrZowf z{Om55Ixq{V4O7e?l%@{^>5m$&uu3OUS`nN^4d7@Lj8>wg^eT`iQJj(#gO7hp2pN~i zg^G{MI~bHXQCSz?Wlfpm0$f{-nGn@C#Y9|5>(H&rAT>=&)R)iI^yN2vWc*lLg&&vm zD%e*EefjUrzc0_HFbu3a7PKm zzaHWV-{!!eB&8m~&;`q16dq;;DQ9q?$;>&T0!*&DFI;syN`WbudepY-S;we(vLH)WhUw!iPjtA?6nf{&_@#oSw-M)PK z`^29wEFu2fbUE|qIaS&h*YX!9y`*#WzXE#``;wpL@zc-7>g|1vPu~#8>-uKi%He=5 
zVlgd0W1=Is{5#l+&6yZ#n+7{A#mrV@GZ13tnvC?y)$-5MsGdRU0CAKy(k z%L&VPmTy!g+Z)3Jt%>#1HCl6s=9<>@*qDi?NAkK#YmLUK4gHAp13afg>(kF^&8NR> z+Uk^cK979L^VzyQ#eRAm<1aXl@jvvTVf^K47o;rRby+5Rp z{It`UnZeW0PEopPeDy&cPqheVqG?q&6(bfQ#o(7!>8IA`lj8!nsc)y7-l30qdRJ8* z!(PET_C?CZNvZaVfmYp4!1?RZ)8}DgK}#*3@EFXUF9oqxR3%$HNEc(>lk+vYoj+1d zY2PiBMsi;~U*l#I>tD}f{qJR3zQ+47Mwr87EmH7PdJ7rwLZ98{e})1;TGe_~Z&Epk zAN?;feW+Epn}kh3*yUL}tLbX7bK=BM=4P!nS$;PO}l-={h5@?%MBpQ41${p0&iz#(>ZNi$xc63>4X7A5eXtyU3MK3mJ$~U+=pHTw58}PTr`XZ!5i@Ub929hG ztDd`@62%Th7MOgF%C;j2j7F`ig*5&EAnJ+aNmm)`WU!BYBZsfYx?%o03Z2HsosIs} z_&kT7hlez>^Aa1QwEQ`T)D1tg4%tR7l=l#qTaMm#2s{ZsI zFYspg>xH3uJ<7q@Dw-uUX`G)eX`&_UaVOaQJhk*^HVsn3xfn6hCO5i8V)D@(tTfWg z5MP~8Zv^4?R}P4ctLgnk09Jzy>}@fdT=RIOQs@SZfPGTP?~9YHB+(&Qer4S2VoX%U zxR&ids6Rjz>n@|d?>gY12EtLeey!|W>zjy5|6y|YMnH$Mg)Gl#=P1Tv@5*3nvEqW$ zKLD22l|$hSiVJ$U2B^}Fuo#0Ka^yyfTr+^{=hY10a_`EGIq=U0LK5uUz`UyDIwS%Q zVO{iuWRK3Rkprz}X>l!{E1aWx_Il*yh#dCM!y#WMgJ2?t?7Isq z!N6#h8SvZ>ozKXjd>2%K0tY)x47LOYTOtNqA_iLm@0Zx1U77TAWQ-pIzn5+&%Xj|* z_y_YC;!0Z=Px#HrH7-#@P=@AYtUN-^$6gY8o8Wo2=tV?C*^4a;G+yhv$-d(vfQ~qX;C(`E|R7~a`@HiW!#m^vOcJA^` zz$5bW^RV$Fmy%l~<~=O>1U;0miS*lveW~AliwjyqNjJ+J=xG0q>W%cN(bvIwz;^;R z3{5@uJi=BXcB(${Hi}V&9%Zb>9d37BWZZ>~9t9du#>%ixIqbuSiEXHaH?E#`#}&<| z0u8%-$oR;)-+Ro5)GKh5@1g%{WZcOaaeoK5bA_}R?L%shZ|hR)H*YYC?FAHpAQH8D zk{C72d5rcnPk5{igD#L)gSB_VmgCBq_h63JU!>qQl*5P24$8I`?(ag3;}*~ae+hed6+bW@C9ZHLAj2Z1#3Y3Vk*RaCmD1roi=&DFhVe2& z$>AFB1J~8*`QtEWadEN*2p`i>31COp&c?4J{REsk%HhDYYv^4F=`Nh6Uq;qb~eb%un)eOirqfTf@v@!z_n z{(vJ;-`I>$T#^%gT=z#uw+44)KK3_5>koI^tqg<3)!f;Kk;fj^+z6N0qGucO@8C9d z#dOK*RO`#^LbLHHg!x+anSTIj{%e$PRy`%?HQro|HzQX(`befpJ@;$+9me0MlYd;5 zTCMWSZ@kMNFMf*YI}#T472wj<;wZoQ1iiU2wh1pD5TDI?SNH2r$k(}EGtmPYUEoL& zP@bu)cbS0;0N$u5m;dDECu@^g-34gJXGM!vv!n%J4{S=+^=x2MesLR;p-&S zEWvMtCr~cmN1WxW@2mYBhuC0V?! z<^Q1XKcJ32gX4p+9`IlE_O|?m+Y2oGKWXnWw0^wB^^=eHn}h!!b?+YERM9*RCv8#! 
zw49(6f`U{Gk_t$xfQ?*iXbUHpY8AOCpnaYSPsJM#B!F_&q|)QzAl~tgk2kzL>Z3jt z5TpfZ0Ux!X2m%U<$~gq2K!vtI^UlofIk~im{=R>`em>HZ-JRLn?Ck99?94{=!e)4U zdSWLDy5pWOG*?&9E|LED6(s0(>dkareQ|lzyxQ6+^@e}5Q|dDxpfO7Lhwz^8E^d-C z69_-NKk9GkWQeG_gDJGTl>Ak*C^h z<`(s~YLi*r_AEO=z|nuBD1D`_r>WoQABBDXZ$jp4ER*|)Kj0Nc?1hea2e(13l#s8X zf%4Y(u}8__wlpr)1txPValw7kDI~#kGH1P#eCVxLpJtGr>Xbx~rM$cq7*+U7dPhRP zdwmbTKV?(>=C?G}cRglI^~itGRIkXQsm|=hrrPsvz5Z;b;#W4cpT6HhzhdYgwvQo= z9!P`Ca-$Vf4YdP$=#i@FdG<7aUv%ZiZ3c8V z`X5+Yw;7^c&NRC3QBQ^wYOcI&1uf!i zznmUjm~`LEC)H=3gt=R0`1L!WsxM)+<85Le2k?J9)WeBezn+e_0nPGXlwkjPDw*4V zZl?8wZ(~LC=k!h(Z5?yjW8r*My5#FmST=~6$I&LW1NuG#v0&V@4A%Ue{^Ufx=_TCo zQP$pw4e|j@9|tz77#oEMi(*<9dIZI@_=f$HH>}-eF3ApY7Alq=#tUXN0xD*RF#HJ( zk>0TL$P)hvX72Qr>;s&sB$6EA zlAH@Ys*$x(K6Bvf2=`P*^= zBjkVI{~4Um8boDWgV()-1tF&wdE@<^eVI-4))$68bBAi?4M)sS6Na*U1rVf`Rn8w} z&?Krniq1}$-!UQ@^hHa;1P=Y)8(4!6ts9}u!Nca+L&TDe)Jh4+es)C6JYvLU=u`!h zUxVj{4nFsKe_J1VT;EGeMm57RJg|pUndD~7m$}84*?SlwL`t!`e7xOwZx%(!bHQCjDHDKmW(K>90^GdZR@=d&58C@5ti6 zMHAAN=A$VXW>&F^ zAHHiyt;IJE_#of??aV&VtQo&T17{uNL8Mc+#uu6Oy}^InUki&);DgCgO#Y|yv^?Nj z%|^eo4{OSq7=D$#S$jH;e;H2^cB9SQP}nonO;wx+#rAE(%k|O64;3#nnr#MVq&~xn zhksNbsfr(?wW{(m>ThE8GxYvBefxX8P2?6Jr^Vy864)m!>+^*Qn`dETPa)JU38Pz7 z7KAnnWf=4}*w_qmHqxE2uAQi)w|mt>PuTK>b*ZB~4>Y1XKp`{$m6XyJig&pR4OCC= zDu89{t4VLufhjH)w?iq`rd~q5c=Le0Y<#ztr{l%HPdG zMOToJzFByxXJD;Y`XYOqN2r)&UU;gve~_>Q{U|5l9%)(G)l#nh&B|{|A@pjVk#D+f`N?8 zf$lNF0{5d`ZAMNaHH0e5Q~bF@dI=TxnFFfFT{~}o?!w57!1tqM%V!0OXkM7plU*GM z6*;+~NN)Ksrq>WEJV}KmRe&WaygIv5>kx@qZv_}4y3#y+;Sjt>$wD8gN9eppw4Q-6 zc0pXz#zu^I&Ru9=wM`UuR)taND#54}8$TZ_ho3hXKR4jd;l|HySolg*KvYw-Q>q@Badhjb=TFOVIA z2R3S_$xn5q8OKh;$X$@$Gtk~={Bq#ee#;Gln|?tP-3H|yyKpjppDZ{?A;IQxdb z`db_PaeX8~zR2FpxJNmQNBWcd<@9|o1Fu;I6O~k;3@~{fXdVFV79N@xYyU4Q)>SW9 zOv8nWPB|cYW-M%aQ&`lwFD=~JivYgF@~CLugsXWWZ_MoMAL^kDv6wr}sWdX#6 zhdzgI*sD|VW@i_2!nw}WK8AWkCpxRy zXv1M8cPZ%78Bg{*HQdf=JqTs9S zucU^Js`h#`-jgydiFhfQ$;ZZXNPFfbB!gPmK?ii?Gh22knd)0B+Ph3>h>??zmTtT9 zv3+oO37X7!KFy_*13o4i+Yo`1Og?tTe>ZdoM!-DZ0u1Y06K<Q&R@Fza3>I{t zYOD-}2q^sHJd5cQY@K=+6wxL|_2&ev&ZJkzL*MA)G7+rQM$=Ane-F?)br3*N?b^c_ z^PR5mKq5W3YPrD|m8o!Un{E@<**)eRo(CdCVI86FXUlP_B2$JB?zF(~fHRTw435WU zpRjJob=u-)9FFW_HZsG$hcts2A_!jUX#YmUiYF!w4aV4Nq?psM&bJ zP_km3PjZB;6I%#BX#s|=nrn%27Q&;7Xh>a+s#BE2JnL}fsH4fiAkA-cnaJt)m2rN=_nci1>IB{s4P)bc&pyTP^lE7!AA#mXgXBkS%;DTR^h_d zhS7i&H1Pbom^|zf_2z;oTaBN3ofgeq0~n?W2*=+m^%-!imT_;R0kXmC(!- zLjOquiv1M~MdC%QY`qvEp%H{FilSMYt)K2a>u}#!mva=dwR3j{;(@_JM=Yz%37|t0 zKqniZySqg}Q(FT~N&vmg09_sf+P*c=n$z*1NB*mWPL2U>*Ba>J1kjHR(92>#+qMRJ zLll(A@;(Ff$c0f*oYz$y5#k*azt)WYl(9N^}g7R93Z5 z7{iMbK)nX&5o;9G)EelojWj+D(cV9Xy+?}Hwm{xGj4-V>e4r!z658TbuFGIJt|Jdf z8Hh}_&VekSoa0qS*!lBi*x~2(B~RAs2&JAJ)bVHW`4|w%hu7})Jx$zC-IPn%wY#T$ zYz9s+w!7M5HnAqrpi#4~Y)qW)Xp_4%8q|b|>NOt_HM!dA)KnL`$eHN)5dTy25*nuEADHE-)>$e@U)&nYqX9#R;|8cwPmfV z*9Ik^xRs&Uk^JXT96uS4;zEYv)uca<;{B8Hi*_eohsgu8E%7Mi!kuu!(TBPc=*323 z-Ii#75TL5lCMTd$7^>@%{yM4)5;``VP^GMFZaYF)%`yUtA5NpM4=l}UueEvHHT~gJl&vTu* zN2*p2ze|@xxbHUiHc=G?(kvU}2W`|Z-=(Im>u59;c&m+&a^@FZ7bjR_cWQt8J%0O; z7?Pe)0Y`GyeqQ=}Jo#Klpab7)_t(j1cYVVC$6B$*SlnlC#qtEuO>~Kp-qn0=(8&mz zi%jyO9of`{JG_ky3&{^TyW>YcOgf|n%jf@6;xBbkL}iu2#0 zkz9}(9m$^YBWZ}w7aliA9#vXu-aG6(U_R%0ppS{42X-E7ObhU4_?vDk6? 
z>}`FV9yI^SaeDOavyIbbiQ{znG2__r15(^ry|#in+SQ^pWDuXoG#PUJ*JpaihyYF6 z{-g1z+G8tZ+!^Cp4MPDVC#NqxgDaCxFAZ!e7%?wPKvntHIZ+KYxmrb4cro(H)0TvO}Mp}rv73_fTV1#{h5-MicfpnF?3E2~Duol3o!k8q|Ob4zjp=<$R zoe4~LrcIlE#OMLl@hsi2rngTrq$qv7IZ|5dgo#l{oz_1A#W;pylQ|BB>5otx`Ymw~ z5EB*GYJZF8=M8V@Yx$lIQGps?@U}S5JedHx)&RXS1~jEL&|4BfA2dK4(h@+sC4lxw z03B(7u1!lgyCs7Dd^jF7)c}1kE#d5z2>NCM=+@VD(2;2g=hQ^ddlEpOGC)()63(fK zpjReX(H%z37{JcP^POsi01D9S}PcBIT} zjya2D(O)L`cZ>8kMFsu_ZQ?J9WcE9fS$`z6+=P~v>eX3)utZyGYptc$1kip4=qr{4 zQ7#d5%g^zk$6nAuODze~LL%rR2x=qHTqNd?4bVd18FV+&D*;^{c5#JKyoei|KNQSO^7am5w3ES? zeS=TRqJg_)f4mClS~BAt__M*B8@Oav7o!4F4HeKplP;MxX*+*P9Or9bW>*csky`Ty zdCPxF(EpHmUiuw%Q3ckOrnBgO5`v{P+N3EHWp zdkt0GJ{-4|%k8JzC3pNPmbKGr_VNW{SrrR><>m~ zaUH1hpJsAfT|6^uMTyzh#Fd!mC!iS2P~7wvP}F`OkD}ozc5eF%D3tF>k7QE&hK$>T z8uh)W^uc@KSGIr(@Oq%HQB~{`8Wn8?DRif^R=g5EXDbYaZkz8D%_ab3*%YSGV|Vp!L&Z z)XF|b&!z&ib97A5y3}ahG40+2R8JGCw0(7PsG^;1IUSek)8YKim{Ah7eGJv%ld)W) zE(XOu9Yrq_*N6YOGaf}-LXrN~$(B1zue0nhFYZhvQ+!RZ>S83*2mc^%${*y7+L>sZ zL3U;E0?slIqP1x=3(|5Zb ziMw6tj>KMU!FF7P?eMk8i;hgzALMn~5r2a?#OT4*b&M(K9pE>KQReX7w`8N*?p8E^ z+(w5wp<;j?A0gGDJ*4da0?kF=f_$MOwWFw{@115b1+`+aOIZpd^|>9moC!dxk4 zZV)rKGb2oG;kVKE1lhZ_`QJq7nT+UF=G(Kksa z`5VzP3_VKET}VNOR-5R7%H^XcNIx%OPZ=}lZGu;-^qH&u$v#+H@=PGEMsMa(pS#)b z@L_z~8V0lcHVy;Dz48eA^~y~)U=CGiCfL;Wg55qOY%pG?VJ>12gzLEJ^GQReLHI15ysBQy|B}iYOrF@M8k!r#K zrVpiBV+W$k|HJejk^lPizdiA>$lAR*IuVd zf4$e6Z~p7Oe)8sX_xgd-v-W!Ko|s>} zuedJ4{_9~ti|!_2QAc!%j?a7WsgGE-&q792S`!pd3vWZ|W@vL^vPpfk46wWwe&uX< zK03|utT)Btc>&+blUt~>z)e3xaQ>lDnA^^ry|}xX74H!(POn$=hpGkHm0IqZgaq`^ z)GES8VbK}t|K?bP2OAUM8&Aa~WN)Sd+EZsk3a$L0fwgjY6zNNlr2YL5VCFTo=-Je! zgoey%NJzK?N@%J0z_$&1igcq7=IWPVeSERPlMy|>=V(#J*yQ)xMR(YLp;xMBajnSD zAiB3APn+&=;LwEZO7wvn>mc&~|L}UeaT{Xs$^Sr~r2awZv0U!=(t+Nla0mX*6Ochp zH$*JAp?JuuU*o=Um|26-2{0y`8-#bCLml{DPzv^y_iMflH z(Px(r7|l9*S@q!4eN?p7Yx4P7ou)(wy$rEly_fx7P5fz3%mJtZpCrFKGvsoeSw zWHN7*vHl~iKYrWWB%2KXZ`MZcjj$iMH0 z_}`g(;_-Fe69caeekuOFsO~t5tX7%dNp;b)^vX*JJqBsYydtzcB!N2%ES8jr7k#xTvC%afF`f35{EloxfF-HTdm(F*5vhN zkE{#Je)e}ke*?91>LXH(x_KJ85qYUnsHl|kocYq-_&C~Tc(~sHOknbz7L!_w(Oq^I z#q~#&hjxWJMk8I6V0;3sP$`OtgVQ_0=gc+}_&6!!;lw|H95*n11@_ANeJ_{EUknZ1 z10UoTLh8O1{WmF=TA`y!=$LDV|1;qKtO-OfbcR;PKBFUe9uWZFV~we?E`{G__NmZI zOp|E>cHo<-&;j8Lf6W(1KQ}|U*sz5-5nnLUz7CrrcdI40uB+%&-|LEly-jN1U-c|1 zJWN5BTJsD|ft?&r^+}k5c9`mpn}TiV*q}Q)HYmKGreMUq^^xptWfc&R2@yv+j{vmM3WwX>`zU(tp5m$k#d!8wQ4G!52R#B9*%XCRR+?tBkHesX(09G z54stDOwV?*!O?z;@i(Uj9Yv;MVi<(bNj91d^b$3#5&tODVQ(4o1xNYDEkJJi0$I{gwh^OAKT z0istPrXxCR0}=$ora`ilF)HDQ0j1?)ZtlL|A_Jal+o+!Pw7|>^#rWWD826M$y%Ka+pBeV zPe!pcqCm7gN`Yu^X^MMq8*Db?jxck|!ns#9n*z7NpNJ_i9(W3I6xbNl9KE&;{!vN* z#KvOS$BOMnrMVf5CD%yP9MZIIHMuUfVsZ!25J)9B*|hjKEJb%x>=W8k^ABhbzH5r} zuTP7n9@!3&R9iSF&f{83PD5ZvDi4{n2Gbb!+VJwaEDF$EjV<#r)7!%RQT!N#QW zBC`;Bj{QywHYS%BC9!z5@H;u!XelpBHhx=zjn?uaix7I7{k8@hQ_731Lg+R2J0;lI zro1Rc2+d=^+XNeJ@p7D%=r`teA+JkqA@ zh$nN)0A4_$zbM6T1zE8`+Uhk|da5)Fd9oAkwR8y9m_0&A3%aDSd)$q)4(3Z+gFn`p zw|LxJfX|d{ixF8WeCek0R6Ba{aDTm^t~%dgHdR|3ChmyzuE~7r4&mmVOB)VNKW`5? 
z>d>Y{^mpMfTj&y`+RXD~@r&+bfg{{c^lExLez^gcy-m9vek@&0?`u07>8tVTTT6l8 ztJ%FPe?Pt2s;`UHc+08%3}1lbdrY8b7GL8R-n?j=@rxdu;Y*CwXq%_~7r*FH;@kK| zj~idcFEnW+4>dsExwiv(he_i#G<7n3?>*6Czd-h;emsV_zf?Uzd#4bkqL}rGuSjYV z7~#o_I3uheMmX@uMa0?a@Q*SdPhy-AekDAFrYWL%y%@aOf>prl z(^kP}Zuv0c{nPF8z(?V?G`$;O*@P@nx&U}qzb`pgoOqGW6{lVl%N6h1Ph7F^q9|9S zZ^jj){W;F%7{dP)oY4(U+uB{{w`pH?Cw{xGJL9(rd{!Sf?SE|k>i!OM*5~&~tbC$GLQSXz^1O7O!GcLtC(D`$1h{O z*Ma<>hqqJu@u)o5*p1KrGPVuUZ+S>9kxIzKWN`Iskz|NHl$pMf!H6{$Jp7vc?a67K zWL-auQyW4PZzp$qB#)#{3iPOAO2ZmeZ@#*e_%YfVP6-;r=#y6=*rUU%5a?j zH*#jPp{2i1I#ZR(=^~(E@2tzVu}BXXPZK?{b=S7^26kp0bRoVI8+r*o#g@7`eM-Ml zI3kHkKhHlcCZE#l*%JIS@M{n-ebT9XIbFK|0n&EiA7$t#_gYNl$;`W1$uT%Od&q@| zM_A_zn4qP>xdG&w$wP*~S#<^xTG0Cv9BgE@eW zAu(5pC7a=Mh|j$vutQX=E2dcRJZKh$bpu3JMwE_Cd7$w@bKsEzbcg(IC-gnj{hGg7 zOv1YSfC*;$e6hsZA3YDq-80}1`Tt-(a><3}E)dKy-HTT;!^F%c5d4?dz(J>rP;q{B zvZ*}11LOu9CJT!SI-v^@OVDTD8QF*K-fFBrbY+AI^|Y(8{*V8Hs(!(xIVL7Dg$!*aSo);DQ>+Q&gZ?c=c=(qH%4j5ix+yJ~$$lOBFGYZg!%Z+W38#I}P zvRlzp5e%NA#+v3}t41Dl(TBxrr4C@21pS zdFn8px{*>Z;P~&?@pt73cjyV-dBS)-p&L&asV7{-69(xCBX~kDJz*eE=w$SQCnW0$ z!+F9neP}#9;YU3omnYQf2}5|oIz3@1Pk2{P7|9b}&=ZF7gvEM70Z*7?l;ES}xQj+f zdtPr~4_@*Hy`+UFpk5S+^8C&&^yviiy+J!gK{KtP4ZWGHD3;(G+=4G~G1w3o4c974 z>m%~eP3bOVu-ns#>#T=A7&t*{(mrnD^<2#B*?F2LWblLf#0P@yMurXpB0~_?2f&)ibW<8DH=WW90hsjE{Im zj-D}(XFSIA&V#6tta&72@~~%%Xz}JdO|;* zaJ8P$mnUTE34M5iLr=JjC$!NMGI_#@27RsZgkSW8-aO$OJ>gQGut867@`Mlcgh4#v zWj$d4PgtraT*(s_=m}TwgnRUat9ZgBJ;BWre0o9}PZ+Exr1FHz^n|uNp{t%?;|VrB zp$$(sd6Flj@PuFW1S?P2t|zqP37_c+Nj%|0OaKn*K$%Oj-fu^?$^`^D{Z2di4mD4>?(o-`$2T$w>fq%)MK)&B*uJsHa?hqdNpbZ`= zyI>W|sx56uQgn%O7g4^^R-K=OR{(8%=)WFC1IdOhqQC|Jg*KAow!7dIFI%ex7e0-) zv38==BX_C>LG)7r+kTChJlw{-AWIwI@Ut9JDXfP3lvq+HJmi6+UcQ`mpMdgq`c(J> z;>9I=OoiO10J zAzw=SwF3&G={xYvvV@J*o~&nj3@DzJuiO{`5i1{^7Yj?4B%!B@8)9^0UiVh68`};O z;t4i&79RSXWUwvd6aM6`;7@{bzYx@t++PWcUIal6uSp+*zXd`?M^O>YAh&sy`^@Bt zIUm(!x3bI|%#fQDY)XRMB%X`fvjIFi39^%eP05g*tY;@NTW`#UQ;KK_Hd!FsqGu-y zp?x-%ZGmiSu*nM9Rz2GygerKp6|z%;O(~F_qGwx$&|;pQ0@-bXO>H2%jh>w%g#OO6 z+hBC9V3Q59ZF+VaA(X?jZIInI*whxX+v?eHn(M%`LHZTj1)JJIb~`=0tq}UT4a;r^ z+3kZ(?IF9pp50Cet>)Q4Nn&cSDHXC)_3ZXSXerN5h3vFoQyOHa>Dj45XbR6xgX|8$ zrVfzZLC?mJH$1xoWDAfd=y@GHqydz+p4^syupw>UBifO(w5*)byBUzyv?Viwh@zI?YOu29psEPwIkQk zd6d-i+&*W^ zZ$X+eybX|_cYReuGO!0}xFcAZgvSLJ&^;|K^^MtRiEtBncgBN)%PS4HVaT+hhuf2O z>s$O&vfHOv;0)2J8O{wKL5fGdF}CgnS0V@U$fGhaIz#5Evduyn8b^9-hBFs>%Fs`5 zu$a`${jvV0p*rFrj-jRulW8^nBV1SOk%v3cK&z~h4*g~I)GNPpJT|;YeX6{k+?!`{ zr9%l>TBKctXFM580oEcM1(#3y91G@RkqquAhIkns%H}#newErDO}}#K5G5whwyC{l z;_*qcZf}RY3sQx$t1Wc!R1eO>ES4C0nG*9c@&Nn}=2(QX2U$7RGPUn~vLb%69i3ob z)X`~k(0D~oEBgvsTr#d(%}A60OkUY6rT-1fN^P_skD>|{3Kc^kAGSZ7kIe)s8(2Vy zpf}3>z(`WCnDnTNg zv{%vb-MXBVojL0J0T7#BBip^9bFT+V=<#DTju!N9NTIYTUrzV7#Y?rXy-GK;Y|SV1 zY5nkz8#bBlgWIw#30rR$9zsXYKB=Y}O&`dt2KJX#G5gC9X3YxwOC5RG_pIc*OaSU+ zyUb4Px?QLM`u7M0wdKZkGt<~^HsgX600(Y20h4(P+iond-Bi+cv;8Kv-SC}8D8onz z3(-o4EPVW(aYXI;Qun)T*tuOkDeV>;Z9dp&yR=?ga5xg#+C=M-X2-lZ9`pZaqwW0RD>c&MzXaHl+=DXNtD+fPJCEV z7R;~O*-zTPNE@0`75>QSyV@{;#7qLoL>5&uS_jA|+CwL4E*;mwT#nw$rJHJVJMvXV zGYT|0y$$XAFtO7RJ&FrH=_{IDloh1zcVIG%+d~*AR;Un24hU_g>6|27*y(X$ z+o=X7Bf#vPgxRY_wqU1a48N*{ieup}X!j8n7p$$w3aE~|$=wjx$qd94Yp2nuYO@sK zBx?7-@GJ|gyaOZAX7Ts`IAH0#Zh&ti7plXh2UOO0$L|6FVfl`lRcoa{8njrg`*9*G|*vlWwN| zL7#MKHIY7KKeJPi^lP0w+<*BQGbCpt9655b7?aB zXH5T_ar*|MuKz+1XamsEYt(zL!3HrSDBXVNa^|FS^lkN#VZL%vVZkL>*f5+Vy{&9PL7n+hGC8}5`*HXJwhzL28NYU|8*U{1o@1DS78jB13vlvauU1p?7+vf?z;Eh zX=ozo!gT`sR* z#`g{$jE530C}pn$C^%2QzYzj^6q;@hA0i<#ebJ{V=nS;2>dj-6|2BtiPyKg@=&~GV zN}V%BM@K*JqMv%^tvu7geqL|UjD;@@MGZdlve?`YiN+ zi%$Pa+TJF#Xa~qrs1j`qaRu0D1@I-2>Wp0qJgOe8hdkVnC5uz&4F6|hOD?|WtZTtb 
zgBG&_j;Nz=Ldu4{fW|PIbC55C;ILNKM)U14Yop_>3`0i3;*i+lnEDAVj_`l=#j$lZ zTO4=A(0@tX;<&yfc5!%0;ugmhB{7TR$w{p&j_czVhkoDB*T;V6AFPk5E!GFda8vtd z!TNZ%=v?cgVd!67A2W0Q^!hO3VV-8ed})!ih?k~X@Z2dK!O6s>qlQE_IW~MOC{g4^ zTqnQzt`0|L>SI*5>u~kT@2*EwK-@(u^=v>Pb!e?b|8)#C;8mK7DUk!^> zXTJ&~yq&-)`n@>XOm?d{kq8NJs^7)roiQHWt5H{HUxPOJj|5Tanj_4O?bmm>z3|7O5*1;g{9f#(^9$3KK#mmN_v zi?Kn?h-Aqvtt^APT7<&Or}vNu@coTIcpu}wVZ{AR>(-lZ+$yPiRMK8Yb-gNI7v=y^N_5YE3VQk|VAgp@B6gAupwnnNw4ALg$coqt z%TCa$V|I=d6K+541^srvPc41*J)@<+sUOeal=3=_0ELH~F$-o{xeHEZNWj?j9NzU@ zlUFIQv)yMBoef|&DU?(n#vrLiZ^^xL*_>o~gB!7VKKBur6Z3$2{UuS^lP0!QjZoQE z^`|Qt!yg|j^K(sH{KY=EH~w)Y8_M5sqU)zq7Xss`&BS%7JqIzQpWb+EH#C&`z}kNp zP9zk;#-KkQk@0|f1#Hs9hIt&U8kh)m(o8-oqyXUIKU*7^{5)70CusBBs9u%Fx`v~~ z4plfxNAH6Qw(Id=c zV)y~&J6OIvx&g~%apB}9jD__pK17>xD~<~6J7k#8HkcU>oW(=KF=q}sM9%WbGmym# zA+&{1r)RUrYg6GbEEoS6pL9IDiF(O)BBOs92_2=w{I5*&MU#DC5`WN(HQqlw_B@Qz zKdycL2uB=joeGRqD0`JQf#?gK3ynBXgZUdBV#;3kp1>hdUW-#D4qnS=L7{9lJ`-yfiR0&xZ)5laOTuq1RZsQD#+BJAV@@qrCkv37oJyh?g3MHWHo%a`iUw(<4j0rk;bhWb9>YvEsw`Rgo7 z4R9_`66G7=jNV3+#@RQ-pZ_nJ-Vu)dfs_~>0iM@ycry09E?os5QO6JqIEsS!gK2{E^4(sWe9QJ1%au9SQo2(YUUpQxQ102ZuJl)Z1k8ccKunbwX7hMndzSj}L`Ra$c%Z6oKE; z=+wYrRHR9JCjjv|BKzXw10$2Hhu)om-r3{0BQy^jHryX(`2!? zWf-s<7^by1n)n6PFYR_ zEp1Tb5=7WQ$?Xa$Pd~-4nlc2-Anvoj4vM?T0ZLgeE#$p5Wls3WVwf;`r0NFw$;-~L zxRK*CD6yngH1AS-kO7;5^%;!KC+&kNRVLR>@JyVvL0!``-hMwAvCXzF6#LpJpwJ|0j8+OIbvXO~}0W~SPE zB#2KXHQrwJ!G7$N2cIV=vmHOuzdmukf)R`GRON;#vGT50YUU=+((hzQA3WrOeaGhO z(>sp99oaBpaKMM1j$F}Tph*nIjNF#Qu=v=^Xx@T+FcWC)I;Agcem|kCSc6&tpL7?< z&tF}P(o#tTf7!&7yTz*g$zJII6#j*1HaO32VnQc7T13W>(MO@2GD;cHA6cm~+$>5d z*UAuz~q|zY91^t6$JGg0HjlmSMCHIJqR#c15cr3xLsYC z$rKch=n^b~Mc|4NE$Kb$?(jF!_>v&D*bA}`YXgw9I(T$rTpokDUnbiF0>JoGyBGeM zlM(m6fa%-ik@jFiJO34A^Rfb`sUb-S)zU|@5PE|?QiU?S{94KUC$o1KOYwP$s|#L| z0wl)649>B)3rxg1J5sWZy(5tGu!g{kn9`@%cNnI(#p))p7e}#_IU)?`(Da*cWCZhW@?MS2P=< zDBou*lt;MYh(8B7kTk&QmC^4{HeFHsq$&(po>?Vk9uun?lf2G8Ok8;lS1fS z0EW#h#p|w|BFwKrJ(!)ve$L)4!ZE=fF46q~$)$ExNXO0;0ux?OtjQDE?v=ZE@u4)D z=Vzk-7^q>xjGNKgyK0{Wa9fhFNnUxZ%cw!jTn|l2!qD5n{{e3IkYz%d0w0C)6gx01 zuWWG!mXOmTswPt82tRGeCqrm~(-S{90HS__z0pDEGi0Te>wp@`emo=McV^-LMYwD- zBGc@VX|_m~CsGU3oP}vQ(WT^vG%;o=vH45Af#z?*C}aNK>PGX|{%xAS#V@h>d)0+6 zUUAfhZcR%@MO1dB8cFBQ4%LlCgr>o3pegBwnglhqyD{%S7PEQRE<8$8HKY9XZUBJ!~%cTS>Pjd1W5mHh8kf{Fc1mj2a0M&?_izn>gRqJ`2Mead7 z^MFN`A}RdVk+VpAXD$~ohIc@L;nYldkY0AtMLSO2c zz%KieSTgY0krpE8U|efs;wKL*=OtS2k20{B=X=hQ-;L*A4Egjpg2j^^*p{ca#q_VE z=_f|8^rOE+`X|x!pLzOsnEpyMeH%}&!gSf-#KQ||36_p9mf${G2kD+SXeFncfGMlk1v#=I@|d z{T|3X{H2=GReyg+7n#5FT8dhzss9b-fjmrkISj$xXbj!|Pzyt;!D!|U}!anTp zgR;!+aQaiy_fYb0Nb09i#6h`yuNT^=t}Wu)NT2%vEQk&l>q}%KERiGrht<;{9)yGH z7ze*R9UNUfKW4DilX+f8-y4BV{D@Nm^zTG zp7L_xfNZbH%O%RG$rJhCVzzQ%1oMjE|M6-HsY~;+*u!qT7MNpW0?wMQYzm*~g^ija zLM1uKPCC)zpJVsBPy4s3mtyO-kpCC`KJ9<(wxHF?7Ud*7hseWi^we{t&3_)8-f|!7 zjd})rH)&+ZwWJrC@Skgl{Cd8GhD9YgBAH<^>!H_{k?2-x z_bI^IB@xz8o&z0C0+I|q2rf1Vb}vO>0-DQ!M7kqm`)e{Db!X$5ZY?4e9-gH1Qp8}jMU-Z^VvsQ;%BYB2o0$!B36ki+nK#13s%jfj;>~c0gJ)#Hh*$!v zxVrl~7}fA6Odl9`R4mqyiu%249S)JUlRb2ODFt`T-+;cq=)e^rq!WhuTTIf@90&XL zTNnDp=XPHgHn%I`oTzS~J*q~PQ&E|7*x$}eYG&8~ZR$v7Z$URk>a$~^jjQ9&7blr~ zwH>+rnROyN+K?YCo7!#|ZGJwbID=3sGqOk>I3oieyU9gFuVombOSv0eVi2(wy3a6$ zwpSU4S~cAM;2fv^I~Q#N@mxH+fR^13^;t?1%6}s1W=@9J-3)zuqC4Aa7k8(FuEQra z#K*@)!gx z$OaVmXHZjmk{Y?)LO zepuIob#i*89a?dO?E}ygjqQas>g;xGEOSAmQ3sDVnH#wgMslZ5E&-nVyfX{Y1S{>L zaxJ?#^twO83u>>T%3+y;aUA@Wzv=UKW!)T;`h761n88{%i z@4~v@M@S`*fx}`M#;Edn!Zt4#R~(3-=}$aKo8_3N@$Nz$b_=@$>QKE9iQVjz-d{*$ zyoPmm@DL}^@}>*e#$ZD2nj3pAeI}G$K>AO#j8tBAE>EVjI7+9haaF^3)seLDj?+%M z`;VLpfy8P;&IS|vPlRvcmQ>8=oCe#44HF$a5k6h!XxXfoZ;^i!S%=7E6aUf+P9-k~ 
zN{o*wU95jYksMw$mlZ9={MknS^VC?(FXiQ?QMm#;6w648qI4xiQD$)vA_Ao#7xQzI zj6M{_6fM@Vv!c_m==d14(_+dl*tYku;n;eA} zlcQA?aj5ahe03OkKBrwB+KyT045Mq!9i)a(SB%5;PvU03PR3`?kxbNd;SFwJFuxYW z9}R8Uk6iT!UwtH0>F-O}_NtM~2^zPe0hco&Q~jK*9p*ZV5%QvIIQo<7=q#2lV*2Yn zx#1K3=!((+(8PE;1^_!!+vFkG5mo4j#ca5#uiK~dWV#bnErZzQi4Em$J3=zv46~?~ zU%-Wasyd?XOhyBAR2h;a4OMc$Sc(j^!m>EWxHuE@BDA#P8-5tNu$^LKaKNQa5yA z2_{;Zd({uSuub9Gc6_pNhe^RpLy)NC#k8eHk2ffm2F8dFe1@BGoW4^T`X*4ndJ~=H z__jkvEZmaf+DK39sNkuj4fl_3rC69#%e?)*& zgnO-bUu|LAGPUdpLLzje&lNP{v#m5TLTD^`(@tTvF^~Gt4F`Nv53%bE$p{s{SFrCo z{C=N(H;58%0Ic>>6k&@+w@Nmep}usZzJ`+z>p-D0iwV*wFXPTdnQ{PGqpn9djQ;KA z=Xt-Lr+)e5$r;h9y%wLJ;SJFF2ST zRP2D)y~jU7RAyx0LcgDKXAjCp+bh(Fv1Vh#&@g7KdPkZ*`*%4V{u|_OPHlv7AH~EJ zW+nQA54iYvwJk>9mNtanB|VKnHmlJO^=GY@--qmlfn+S z7mbHF9;#tpDigIzacF@(toP@N%6ONUxl61%h`4KA;dJbmI-2kTj-7r7;Kg^qbozGJ z1opG^fXa|Sx6p-~dT}b-@Uzt3q=jO;V4MtM6Uxv}Bgi4RmBuPIqV}I?5)L%a&P+e3 zGFp0iPwVpmOqDTuq2ma6_CY_mfxoG`&NxUiTpwe~iUQMbg_7O*z5` z;>Z6?I|RL*vu6Zy5D?l8lJW7Lc8nOq`>{90463G`KC)k=vR%t+hdz{$er5xE2BmN3 ze2$(BFM}*?3@Z|r)&1t^uy~bm4q~n-Bdc7&W>6`$2Rer5oJDUntxuxfihzh>Hu?*6)s>fu%^pY$v%p}O zx`F&qHOGbq>z}!NZl}|qlP~?Q4T|BHL`gR>yK*?zL&s;^)a_b?G{|r3aDh$CUK3B4 zI2AeU4dM(SWV?D*5>ok2QJ!E&6D@qpZMlwJzZCiO3PcQ?!c29?yit^;Ls26!^?=!r6AqZd4l*X znaZ&B_=btC$ML9^;AxLzn!<%Kw%>n8Lr}wPMDzGNHD3I76Kiq?cP2I7;ggGjbV^-5 zxlgQaPTg`TlOnQ230-D0DkA^AQfY>|5;F)*bgsF#lv*+KOExgh&QK{oswIVN;;I>VC+-Sx*sp)ZK%pP2Ai^|KJV!_01n zy?8fSc33D!r5=jS$3)?>jmL!X7Uwa+p4QWz45_YKKtu`Fwvcd4xOx2);qpzwLldz6 zdywG<*XWY_Mj_Oj)FXQ%Pmos8xP!2(c|+<+HWxEhXwYSq!sXWYg=CZPAliW*7baT& z1;0dH9$QF_7PB`(`-QRsC{l=)!nZqJg_&?-F`{>b*3fiOL>oNMIfREs#fCbe=i2JX zUL;eB(k7p|4kPsd3D0%9u2XK$w)yWS!SfgjBEpH|BzvN~1LB)1SlQIM@X+V*0?Tjk zN?&=+U*fPy=$Ce>D5=z|t!nTWv)ER}F|33!tkgcl!^l!mW?r_nC3Hbc2iyOIUx@CP zagj&3DSJL%pq%!~UHljNq=s-B*-zR;_ZCqYxfRwsvZugdyc{Thg3a9ttoIm9)c_~T zOk*u<1E6zQD=%|V*po}MU{(tN`f{MCho0eEyd7kIyw}OK;VzJQT$FZsr9EV?Qnt*jj)cH?`edG%lPKcW5xUdJvqQU8?*u~vI=b^^G|X>FJMQN z3hivAJ)Ep>p2b(QucA0r3CavJtz2tioZ*5sHr3vfjRF^AEZh+KnpPzk&LnV0+;a-H zG5iO|eRb4kZmY296_5{MfcR^lxAEb;9*YAwh4$DPgO5tJI$it(bWv;21uZ9S9Bz^e zFTkrhT5gP1SI{s$&|D8`4PJ!<{`SrbuBezyuc!UyiCDQwug|Ln5vC zUk)Auin*qxX-|rNnx;bqYR>tonmL#nVHfov!OWK5u2zx30xGm}-ZfaDB!7px^-!R&fy)kCRqVI|x02yz%-{ zFV;V37gK*5!7gpTrHyPJ-e%gcFGcLp7ahp5cjN63U8LY4EqpP1kAyUFmZF!PMJ z>Wszb4hz9YQ3#Rtc&mQ4jFkKK&PRJHCn=G?$j4_u>`I0z_xyI8Divi|dYO%4Rij08 zf9I9H5FTpV#$>uy@%Of^BG!bl$xf;-M~;(ns2G()r9!BR#KRN*_DE5zeJNxI7p%7V z@I(Veg$M5?$2IsOPVqa7yvnUPIM7dKp^fj@TntzgBBg{<$tx+e=yfHa2p z=L6|{4Pwwc(?|^JZzVBk1e1b9HdZg6jILoHOcqam7?SDk#>A8j^gP!^iVb8^gY&p@ zobMOYyvi7R>|JCHyHI2;e}c)n@RuA=;;Xihq58|<#e`741rIeq`^sWYNo(k@nwHCV z?)CiroQSSahT~&Ld~c+q-x6+T*pp$tgXPOn#T`{IEjI2tnd4C|?x72)0$i!_%AwgY zU0N9PZCtuZ?1~lftTZJklq7cLig}ehuQDbthv#kLd7EPLa(P}Y&x=(|i9By7&)XT3 zSH$!7@w|O8dEx^d9=`64g0-MV#A1k7n<1x8xEhpCySWIqa9)hw8st|*M%(> zJ55(E7n(PTWt(*jbb2GrWoKB;d@qY~$i;Na$uClMeKW$m&wo4k@0IKl6{K7|Jv zVre&qKuPsyHu2|yQJfBr(?vZyfkJ%jt-F3@D|1qay@lu3xR{s2^KxSHcJjPjo|hYw zw~yzEJWq_tQ+Zwy&nt?_tK)g&dEWS#yat|E%zK9S#_BY@Qwmnj=GQnG>NP3Iwddui z-)1o;K+}!)U#~gX717!(CV_BXGVJ*ANGwP9nWG$?KRAqI>r79Fyj`7fh%xq@1S!X^ zN~ew+9LTHSRaI}OT|3c;qM6qFG+H7ta;RZspik$Dm~Z{Yn~?w~4^9@E3$_8`mfAK_ zGYxItmxNq~Sv6;B$C(^u>^cWfo`7Agf3uNeSAS|sZ05o)^^x!<6r>KQo$8K9)CKl> zy*wYVv|};7{+RVT{9bsM>US_JEEwaP`vaQK5zTgWsGY9l(4gmE7Ct%d{t1&~!+JL& zz+SZHRk~lb87)%}i7gE1;k>yC<+h2;gN7YWo%!qr*v`Z*X%}7GHQ!xKc!}YfTcdyp;h}T-L87P zPo7mO%CmA4O!}E7uhUm_72%&Z$V=VlHMn?!ei8df4=ed9P zUx>ZahDYeUj)sbR)iOB9K~~@wp@Ie!wM;oSq$)WNC@;^2`rj-e)Sdh#R+_!;LujJv z&k)&rVIg#iayDkDTk1ica{m(eg+S;H>q!!5kEriqmfT|E%EmQE{TwEkIp0|-R6sL? 
zvJX(vlIW-*mxua;9-w@wU1aF>P!DO>GexCSt|%w_l)km>0RbJ>^Bi8Oz(m?hbuxcy zWyd6KN3=a&=G_8WG^iQ1od9e!Ap?+T&oR9RAGZu4bR%{iMotw|i<#ukKpAW1ZOOt1H{RgJBD_l60UQ_W#dF9J^$rlOJ;+1wRvfUpYyG*x zr+DGLs09{L&b7n*6miMip)TgANZlf>SGysy?2R5ARY$PSoS&l#Wd|@YUKXPV8Ri=4 zr)(P}o(Lj0PZC1sM~FUzvMTbJpGYG}8+^Rnhy0gN|2bV?Tq~lB4~`|!1>V9OQJ}teoh>ZXg?VDHb(k$*)XZqEz>^)zOB7$q+=JMcS2bh zYcQXQo~z=>-Ru5MC@V#M%lT(Goy^H)d1h33zW|(68-F_5KGt%ZP&Nwh8a_4f$}*;( zsImxUS#lO_rPzQ{B8v$eP*zPrN&Ln(TUmbDb`ezdgGbfpQB5;xT?Q-f)*~ zqjx93OYY!r8keBBK=rB#^=2Q|^6~Zv-+pCNg}K7>HCmpoVM4v>0rcxR4o8=nG-pQG zYff)4Vi%T7p#$kQuXM&MeTz|I$-|1M+zW(v+?N>!mQZN+27iN}J7M1$f!uD4Nvu+@ z5zQOKt*5+7es_~lk&~O#IPVTmPJ{5sC4kYBvuVK{p7KJIr+kcAs2GdNFj1;A^u0pG zV6U_%H|MLleemgP&X)OhPIoF4x94nG@Vn^VIWwpH{M_;!p`r+DEu`o_;m_!O2H;P-k8JY^n#%Yy&0$FNaNYxz7=xt-dK14Q9qTC>-Zz7`)i%4EwiHsgNo%;#?P9v)RNp$lR6+y;uuHjqfvx~`Z3`Sa*Rs1ne6%vg z?v+Poc>92mX+B0*4csIIV;UEq#*zKl0uLK8hlH9G}SngMd9K$ADMV5fTAS zT+oc5OfJ(66Nz$#1%(yCte~RI04svQnE`s+)~w>HySiTMjp9WF6i5O|5U+5`%8dv; z41q`lA`s^P-mB`Ko@A2M@Av!ve)jkK?Pn#^)m7D1uU@@+@6|ilOSG@RJ5Vl~YO~fb zs~eJ2R}a$q?1mwzkCTvr86ucHBHCd+$BZ$UHt|awGckP3)*Y*BebusQ61==isUll?#1KMEM-PB6#{MZJLOp9CqN^ z<@?e&$0nozkRAHy4fT=yebjvi=TT4d*mr@2%wnb|vH6hBO8uv^1fDbMJWBtrf9S(C z!?MR&WR6EgvP|cp8xvq}Xe(2QOdUl$*`w`Q9pTDDpmbNXV;AiB3FiG}4>v^gagqa2 zbZqFCet6ca`sj5$0M#sG6My+0Z<5mo9%Qm+O#H=eO+PSdK^W8a!-m5 z0mDfE#)2P5#RHzBfo@%x$Y0C9PcPJ|^!_XGJqa| zos%rOuWc^nr_lC?SdCIvqsCMNZ(`SK6`=+)BPp?%AWUKK@Pg~(E14Syn|C+oc}ZMF zq{+NvLNtse%3XY!sqCC986eP4@v`&CPf;5aYA{d}jzVrD0vLL~)?ffC8UU||mwrH6 zmRopfHkBrkNk%CXTlkw)jcIAf2xWTAtIqtD178)!ygCka6(|@IL-bO*IMHGlIcC)P zwFDH4a7wdruC@I7B9+#SKV;&Ah1<%C_+aM`=`r?1(5Tz-YY-IMbTW(mY4(+oh_39C z3MtNrB?AYDnyu(#!Bu+qELXH_(_A+R|(z!e$m0yvI_XY5v*vPTmKRn30Z6co@y^9TZ&a zn3i@qa|(bW9CUXY^f4`gYC>^8Cw`Iyt#c`ewB4SiD&$76lSVcYr$)%TGex)USG#>g z=!tPowPjRov2!*hGx&r}53yLAdZ~%5nsN7|ALHTf3qQug-333khP%^#Y!!EJ`>|Er zy`FysA&7pzjNYg42I=_99@8M1Hz0Hx^oUc0vY!sH>Zvpft5IDD^yvuLxd~qnn&zA> z1aD}H=zDtLI~|fN+GzC31x~ma2y_xxzT5IL4rFZdsKs{Of7DHPlQ>6hb}G3Ws;+?AKXU3$xqhRclgH*C_%zac6L0_s+JQQDdFfz=^lG;S_?dG* zuu?m_4L+hLWFvI9Y1i*ZwE;-ERhUH{NC~@bUc&Ec4r$AJGHa;m zR_-MLV?dn0c#Pj1+c^~ywc~V*b^Am6BulP-0nB16heY+E1KO*d(ctF+c!QSlXoH!| zjqk0TX$?m35np?U(A`urTkQ48jF0fR>SkVU@QZU{=)eKY*@tSAx>-l{F10`IGtt!& zY@1A%IRp&PJ_c82r;jT3cfL1OcT;uFyu!;%uy>gMGeijae#djgqbARW<<3Kkcy?Wa z|Ep%_mxUcmO(>rfny*@z!M1VwOhA_!bzoy6;>}@1r-CpJhLO`N+i?!#m^Ni!YZ!HL zPZLI&$?2rM@zAPZZ-cS1_Q7YhoiGH2i7;qBRbero+PP}+IRXtp+Uij!+EE{>p|5~G z02^IFC-S>VlmGWet)Hu7ZP9a82#fg5!yDEa8p~Z2zY+_@XwV4tX^BoQH4vFO!l;NzraEFGdU1#f*c4;s{}Vw^dY& zZs~XJF>n4oJ&`K0&HqjkKl~!v@;$BY{~qSUuQ9q>%XY_Gt(SJkBLed7)`-9ZyIUm! 
zBX_rY__=m7{u58Qj5=;zz^^D>>>5%C?9{~tog>2sH-E@9%U5(xVcrSY0l$c88%24& zSn*3Tv(WoOc;W|yhU;n}u%CY13&6{^34xCj=?PXP+b#rN$ES*Y$)a^56gqsr3w2MX zJk{`o@vf*cNEU%<)CruCfrr>HhY%pR%aza*$fp&)uS9@!+IkuQ1*z#xs&b4{b0Pa> z#ob^{f@yZu?*WK?)1LSsVi-9H zET;5oL)fD`;vwwz z9jzhk>K&~j>}xw(Mc73%MRttX3#c6HM z*Q^7((ZP!Exyh0?fxkmdiFW(fY}lj#!X9JB5%`|HE#+@ZW8O~ro^^zZFYS-CRF9KB zEXI2yMF{5L2AIDNwQA-}%dRxil@7j7`&^}`$5!s{j9wT0JDd}qSzzx<2cs(bPcA zX449N&isTz*XgcH&8N9@Ykbi!xEL?r7B?|Bqg}0hUkJQ|hSASLC!v?mwKYud)_B9b zb89Qm&x=1Sc++=ltI)S^ZAFj@?!rsWU1uY$VVe*gLv#Xwl|Ym*~FDtu0Y!oJwalmGEEC zaD6&ANq?8mZ?!&8{`b`Bwq}zUcQ$pKn$2c3C%9WSaW%9t8=Vxd+{Cu1X*T++m-s8w zZ1h)wO{R^VZr<3};f|RMugPo;C(4}$LsR%)FLxr8@t>FQpJiBFe>%b$8CYcGWnS*W zq9{BEe?y^P+JeT3jq#wdXJadCTfMQhwOz8Y)wTW0#@5zW#PeQc1JpeY*zEp>|18CS zs#+y6pW<#C0G)^4<17C>?<$F5eCSC=rb4eGl5gZBZz3=JmRH)(|88q!f3N?A{e6^| zGiIPFG@3v6JdON`TyF|q2J#ICsq6btPmCwu9wX5TT`)+xS-whMk>4_|yzxG`$=eC~ z@FlVr!2cxwVYI1!tMTfULiC5oB{paJ1qpkY$Z;ex5DC=Hwy0h?1dCBSXgl8%* zuZ;nRJKlWu-Dc|N-?eXJJhv76w((nkWBo3Lap#+F&1@$uF6VLa&FzIx+f6aG?=Uic zdt3AN{{~+w#dLUkr|{o(;jMf=d-ziKdvAEhCs z+4G!c&r_Q{k8AckEcThqPL+8;6$kz)w+}|a_MTrd9Q z4Gk2DvHzP;$NN1LRp)^`U&F2mG71O7rw_VsD!c!HvY*$v7gLb3$%(yzejkqsNb>e z`ph0KVubz9Y!NOS^Y9ee#J&!*>GGLGQCPa2;@DvPS+kewT47-yEUNe+LBfc(u->7D z2+wrzJw`;;%1N;FsNPH9EB%~eMI@m>xrflU3{)<%V7Ed6F^!MqcSq zI(;a6f@B6Px*B~y(%lBSg7C~k6!TqMtKp^XV9Hf$^9&l`WVZDfk|bq{Fs+G!^TC&?I>ikIk8jIxaKceKXn~#oP&-G$a6ZZ_3cDGqRH<({W z9cCk|NpiA{O7TLqjxbJb<{J2C-yS9u+YL9#NR8=B$7`54e&g397h}ZsTvNREVo@0l z0>=HZSvWB@n_MfVX?QabpjhGlMo|a^>B3|r=h}P_*H^F zB*7Q4Odan`r;J@CbM4X8PAnKht}|T-R?#D~Rc@B|jY3!9lB67Nh#-@QR+|P7dl5`4 zScga%lgms9sx(+Vk(oh1@Rmc(tzAxo3rr$!$5f9pksK(}aYO}aMbR9`>PA@>k$>IY&dluUq!gN#5Ew+g-3O4l&qj%@Fn5*RlU67Cn*?sCf{{xX z{*~a=1pQ$?0lmKze2Z29)k&Dan?<hKV3Hb{`;wcSm?L^NYs_23ufgVZT=UIh#s)jKKT*ss@UR{OSg@wAF11 zn^@ZHOBUzh!La%oIv9MNa9*xDB>%pF9_6YjgE4U0T#GiDtg!95G&mxRNlu`NQAovseSbW62ZBv|PC=M<@&Qb6GK~kr1dL8lpT$TxOTRCU39IT3^So zkG_*x>76F}D-!69?CIRZ)*S*gYx(qO z7uYH9Ca;RahgG$jzRbD+AY_qAHnnA~GERm%>+RtClZ3#{R{AyDqrSGDfa+xhJ|S=~ zLCcSzbqi&6dhd!m{bX*j)g}biS#eA&m^t_%?c;U0)_%4`eq177T6l0Ntq=yLewsq# ztxngzf3JB(Z#U?@oR$Sp{C4=jP^-;X#io|Tp~=x!A}xXF76*0iFc7waT<#kW9GpF8 z5nJd`FWsMn7P`a*l-Y@<`d72^C}?Bhm9_l-D4;`68Ghcma&<0{qfSJkA{%po`g96E zVU}miGhw+VaL%T6i9rJn2>x+I)zDR>n)5SHU%&3mTa8C9t_^Hkym_ByNnr9r4CLmv zgq+=5%z;iOsRrEdTG9~7a@F}(v7u339?S|69Vs6ov<&^FiL8qMp(R?@paFyS&m=NU zSY{MF%?frJ1&<|RX051wHqS&g+oSaHD6?!V?Y2A^ZL$aO^WN($&Nx}%`k6Tp>M5+r zc~A&kV8v{xbs0M_q5&m91ew3Dgd5Ah)>hy;h;E3fEFw6DUa?7iku_^d@VLUhS`07L zH&@U7%cH>r5#O`c*evWKwGbE>iD`ix9;-%yU+|1c!O%WwAmaYrqZFUg<8)}9 zKS1QAtku9n6{xwLZ73F|{Y!Lh_g*A&pXJ@G^4_JqGCC^<>m>DJ8)J>WTirl8vm7{^ z0&5eodJdeA1KlSbFgbjk&L9!I)SSN*xCS>mlBBSRKd30=v66RVbvG0zzj;F+i_ks! zx%@r^%tEc*`});J98XNV=p^>dy_(f3j`D`L!`7P}8&9^YMXITx*g39RVc{o;ysM)Q z;Kzl9uRu{=2K=8Q&;(vQ$AX|pa!qIfG83+44Z7yA?eToDiX8@xmDC`70Pw)bHzO(! 
zIFQ%uBKmJScrE4yytv+fpUvvMiRHbU0|#B`=-CF{|0mDJ^8=2u`F63SeHeF0$-}&X z@~BQ2a*yOU{f(YwFb}f~`ra{KIKuxh$;&2x1Q13jfPojWaw^^2HHmUPMYzH=7=+aW zh^!hV4k3^jTVfXin=COUKx<#IP=+Z7CT{psfil^FVXVd%pp^gy47#a6nT3%Ny(t)B z7eVGbj;85%VuW4okH`iEcS!PX-(E?XKv8l>>1BI-KvKt(+@QJj!kxZ+LbG9YV0_oPObsuUEvp z2K@huzfPsse~WpYDg@Dku$c%G>pumHU}mFCtMq(A45VRwq=e?{^Gl2MdE452ea2di z^$`N+qb(v^BVZngI$<(!ls)~lj&o?yvMGEq#$P#yYxQV4MldGNJ?+z#4dgAq6#n9} z_P~I~wcKwhqCLloeIZeqlMdZct4p={E1AIcCwd+6_VmbS!hbR8F($GXzC#kF9mZv6 z-^*XgtlEaGB+z<$01B`enWb2q%8GpUeuGr*e6F2q_A_`G~hZNYPfj@Z^kymg_uHTtfFy7Jc7n_H*Ew(iASUtn&Xf~_$)D86Z2Zfczx+j{46)_Tiw zQ|nZ0?d7dk>aCmOw80O1)TPKan#6*a$d?AXyNPAYJ*F)_i?kfkV-kJ2hj!O;h919P zj#zNOdx1y(AzX_kA}jG76_q>O*o)Jn-j}ZTbTx$mnnX^$m$I7B(zjP6MA(5Y%`*s( zD;lE}Pkd^s7!^9E{HM~(DighQecF;(Hv`HoX6nnEfj1n>$ml{9awOa#n0*2d_NY zrf1=PnS2PQQsu5R>N-&l$87LY;Kqu2BRAGv=b!Rzn5DT}1Jf%i`8ppq#^wu$E?HnU zrUL$sDXSml9N_yO6BXmxt?VQ@vg}qEyz1Ch_@QpGAL`2T$i;l;$hgoaL3S?1;5P!$ zCJvfl_om`x*@ayq%g+ukV%<`?AA!8v)At5R2AQSF9B1;M#GA+P#tUBNa)!2vC@FP4_?z4a>;gNk%bf3ND;5q(~^b?wF=+9O}dG`&%!h%Q)?=D56y!&-5 z-T~~&afUB?gN_JgoNpIwpuVux)5X+HG<<0L)O#J?% zvl$vZ{~@8_bnLMkpn-NJqQOk!{)NDoL~N!`)ZcEPx9R-tTzWer6Xb(b&0ZVDrwI&T zp}`+A{Y;F!@O3@Cjv-IY;taNP$cav&e`$S1VRK46UJ$IY;d-BU9D zA7hucJU{Z5#yLOIm$p1VE?ye%{Ajnd&GX~X`)!^d>)&tl{CNBQHqVbg zzyE*1`M!01bbVI5*K&Pce=pAZl)l$;eI9u)-ug^=ug&$zf3MB;Nqw))_37|lo9lCU z$^QlG^ZMVS>(g^d%k@cK5@&tt-)*@*JKv4BKGpBGxjt{c+vfT#dbiE>`Qy87uFueS zf7ALj3ul!hLCCx)eJDO`R1UbJ$>J8cM8|1?#6?wz>8)qgVmeSJ$5 zfByA$OZ=&NI}ZMQ@ODf5dExDN__N^cHu2}4x7)-Y@$EM8C++Pv@#nm^|7Y>1*Z$~A zzyH?1!JoQ)(aQ7RY6E|qo1+zr-fA6xC|A+5R*R)A9^w>k85R(Qu@T6YCA@`&FP3sm z1TyHi_M+&r7-+EGKNb0}I|#X6EHPC|W0hXB7?oJ;)jNt{q9n11`fMjuN4ZRz3~-s3 zYVwTrs80GX`|j`s^x+=nTD+!us5Mg$Wru}mdOfz z;TCKT@P#=#zc&KVY&79fB88~5D>=mSjbQxQAk!;HsK3r{$F^swh+^8)t4>02+c|Ul z_2+ndozF5_>1<(6r|?3$_i9m2!&!(|;!BzSMUqdlcv3&>mg*jN*4G zAf+TZwN-y*S0pwM$wl7`S96nXAQgFB`@CH|at#xVwBGAbf-1+Dv49UYT-2vuXEF~H zc;?!pcZ7Vc*nXqG$ktc+dWElICLFQtmJi5}Mv1;X&P9#@68 zzo_&SU6(!j*SX!pB|1L9M0!3F5T2ZdQ$ZIke~nFYt@r&XVywVFHnZ! zx9t*qTU15^D8G@=J@OkVtyYp7B*X*jHVKAN;My(K zDwF?{k|DaT6CUq|MNf93u~gca%h-3meIC~rl8{>&I;rC`;-g2MWitghJHZfgf}K?6 zJE+r}!`<03=ej-^|O~Gd9J_FCJwCzyW9(MGedYT!!iYgg{8ED1Zg2_yyLXoWbE=k}$j;cS2w^ zId%5(xE#Xca|mCAC&}a3Uv8s$18xN}sM3udSLWP(#wf`MpYpggNG?2d0#~q92{3N$ z6dChlnOm_XrLFe>fUdjHrlOW|OX06Nf#b-n1?Cd$K7;lFg~ zJk#r-J^rnSLe=bj3eOz6o2<9QsE#P)mEBO3NuNi5Go=8_isWI1KuO$C1on6iq)%D2q^-y4_H5HoTk& zZzi&aemZ}YykaQg>0#NdW%gnbdlAhs zB=BnI$|^6Z)f`AFU>*kYgUp zZ6J@B2>%u9;b23zp5ymKN*-VqAOt3jQ*u&&)foG;F+O_D-0q=%XZiSU6kVB*Tr50! z(OL8UtML6OVVuWgD8<{k;Bxqpu_JWiOoZVkX7MZ!Wf15H%x|0}6i1TOTNa4`&;OE z>PHze7aT*!%bKda5z5-bk^{d>m+~WV&s7&#PkAq=Y+A~7bO6&g zUaWCbcFJm6?i2tipjx%}Mm+3}LcexuCe|aIS2YL^ScERw_V6l?L;5D-;rfSa@F~-BnK=BB{BJ%ME;9^0VUj`=#_ zFKnz)&`BPRFRMBa#w?3ap&eR`2dc2Xil_5#N)3zq>ICeo^mGiET2W`hR{+v^EC=Y& zM(jeAZwH*sW;mN*AzVdpyS5>|mLtC6;^4{vzHT6VO%S@c8NOoV72H*lwXOhIMv21F zp20BYaI~uKpX}~r|VDk)0r=4>h>4h=RXM!ka!8o!TDXBrcaa6!Kj^jAKz;R&Q zNobZ0yqkkz_)^bfhE zc$CyqJZk@G3-QSNT$|$2l;-l$vex9IA#KY?CN%+oZPW-!XW!Xjx901w*#GF*`-V79 zVQCG4?kCzqop7ht6X@0p3p!I!yhA*5(g3Hn1n<}>Av=~}DpQ2sItf6RMgF78c_3cW z*@g}Q%P7>}`LK8Fl6g^d)EUV(kz{QMb#n=lpjb1H0I*hlhaT#+#DOzNQ! 
[base85-encoded GIT binary patch data omitted: binary payload, not human-readable]
ztiy&O+Er@SuGSlZNmYd&uVnVGfysYl4H=bvMxNv?CJbp=<%?Mn&Meft(E&b_Eelr} zqscELKLtA(3N!gGQrFjAD;suwm42?hXLxS781ls&6gFi%jD7F+-(w*&Jn*#?lJk8x z5W0_YEwq^&S5j;3=wI#}Pc*RI0nf@qEPaMLT{_;~m!4+p`uTzP2ERDwz*?a%TE9w; zi&z6+UmxVBeEzKYgdNA%tv@R*W_3Koc2umXjj279Ii9oZuLCRDV+N&NXWYfln_bBp~6kGI% zDmwJx>gc zbe8Mb%{1`3G4C(Q=rqqrnR-2Sk^@`h5p|?|^1G#?JuYixw>-9kF#?f-Z2NI>0 zejo2@zVG|)f1iAF{$HOUPsd)nr32oD*2MOeJH%}vukYbv-gJ_8(0zBmh+%Dg+wT49 zPUrbuinZjK{G9S-| zA+}Np@1)}0?QPiyeR*|7s9z9Bepd@Py1&nBNMajJKUO;ScMtd0ZE0o)EBIA>Id-|} z1nJF>s|MRNrsYMb*QAy0Gc+-(OcpcEyO*kKl%33@>cy-VJE-!M6&&M90nAJF)CrJd63?V$*V8jw%$|{+v%2)#~|g%&{KP>Q^~VRd^E$JsJGh`1k0p`z=DSF;dW4~r!#zH>v< zHvXFTEfd4|ySoOw?m7mL{>o7cy&Oh=(f2T5*25+a(NH_K1w?_QYz}j92q@2QQd<(G z&9hx`+=s2Iq&gz(_9wUhspV*2_qV`P;`(*bGxgo9K5%0;-3{CUp`CW|i{I7S7d{mT zSpTO49Dk=Pdw6P70nStP@#rx{a=buF92wvB7Y>N$KV+CT;QYXgcmqiXwN-;J=Sfj3 za?5#|(`qkA_|Lav-JTYDp0!_db#G}#Mv99yCvzSEl|+1bK51ej(|)FR1z zJ_`R|JiT{VQ_T}LtPct*O++9_iHHc&i%KUTB~k*U&G3W``XMe@?)>^_AHP{cqH|cj(dgUO z508_b1(H+011CrL%}>*HvNM<8Lh`nmO3^yf;;9nD?*?q{p`IjcJ3XQ`WcGsgV0k8~ zOIIWm2YjU`@05C%3T1?mohrg|oUcy~>!j*L>a^%MN+rI%H#7y`h8#cfpFS|;+HI6< zP?{+3u)$|M?#4ctEJT@|rs||iT6SKlHYaRbkbqr?qEaIdJG{^&Ff(ir5w|T`>XP9g zX)lpG;0rL6vT2e;_Fy?m*|}SIu``E<_NytRj8+CfSnzhoRfs$;1=`yce*Kw}kcs`D@HJx`z zBC1kWH}~PW@J-p6mE`74z;6S26Np_i#aA(M51X5Zv#Y1jB=k7bJN1 zlNa^?Z$E*W)5J#`PIjMh?-%b^2YY7-4m~w|-6|0&ls~sAl-{!G(kN@ektps^*E1Mv z;${78cCqhJoEJ=Qzob(g>uyj6Xt7}LZx}c5lO)66Eq*&#lb&uirMDR{;{UN>8pbG< zCOMbTMS*AQZ>0Xbu<5V83`nqfnk1QL1il9bL~}&niw007geQ`OWtU>eatZGqMSD?q zfTV;qK77@9?cpdW{M^M?Pyk)aGN~ckvJ7F@u&beDphi?y*~&;o19C|6H8jb zOH~K2JN_PH7^)rJ*WX`mS=w;w$Wwx6YH5_02;`PY%l93%YERu)U>|2cnb}^dX0rpz z9@aPOQ1W3OEFaln)kru+rg}GY;JL|)BbUJn5{QmF@YZM(#QH*0hg{BYCEk=T^DV5R;GC(>s1#CPv<1`|omKbgf4pl6ej)S|rjt$?BLz(Bs)R9GJKD zqjHm5{&^y~vgm$qza=BBYJ>dVtVPFibY9AN`#K`d#L7ncQLTx#Fm1_8=9*KzBDhXL zlY;Akhji`A^1J2LbfR!ORJVX8Q8EPE0GoSmaM?}wcmehfsx7^7$~s4Sa^|Lg_KT?- zl+$obig{w!AeZVaNbm_EE2I%cdI0a~p6&HMu9TgViRJr#8Bi$EE%TA=wwV{4;Du=E z;O_tG6}u9Pg z9{g{1ugkjJZ4U*n7;W7jwDHj+eGbLd?1YOFPBeUAVe++l9dm9qVW~t27pftB3izkZ z`frve^vr#(9{8k&pJ-t+Ktgsdb{$Hd|I@Spt`w$?wa$9)2}_^b&P8u|`ilIGkvANE z#a2N(eSNRX)HSz1jm*DS5WEaRO}xro$aN?A?ST`#iw`b6pkZxg-Vjud9caeRTubaYs*uB*jw%Hfc_rh5OS~R|vGr3rJ|k}L%Ci%^%7-#s zbk+f{yX#^vhhNnh*TQ3c5){@I?7n(sWjVcLFPeVGi+Z8I#qM~|@w;nU&LC5;Any+^ z*U+fAvh~idYFX0k*K^7|B;}xv;zo6F>ynLjNQh~ettIz1>vtcV$JOBAD2d;I;TO)$ zU)+nm-o13J`q>B(EV>^QdMZ<-HkbH%Q+NGFuq3x*j#K+{NC^0SlMicP3UA^iC}zDn zas7Q9u6&0lYB85{Xj~>T&=4U1*iRg6^}Uwr4HN*LQ_VnVNy56T=%NC?hYcM*Y~+{O zZ1k*5WlEOpy8OiyNaJpDETTHBlgR02E_-~S)-xqg)8eB>TD)w_ZRzk)Gv3Tua}p*) zyS?y-oG|hT6V5QH>-a?ae zB};~aaYyiqOouEoOIFRu(!0xKsljxvn#^u7;QUJCJc)azjWXRs{Pg>vs-U(xd@sG!abpLHp;m7qZXne_V1(%N+S!{{7wfJAByOTmyxy z`mN~hPM_9hjx4?a5twTt9*iVKURp(@OATbXwR`(LUjM*wN9j z?&qY5%04V@>HOcDh2~0Q^L+g|JE$mAGBf*K`vc(`7oE2XfegHzu|v5w?f25$bti8I z@T_;aYD8^$@3FhPgTfe{u`7p*IsG{3!=hK3&UcHZBoed(R(3WbsD?U>Pihi8&RN>J z|M*$;jvbL1o{_n`e0`flbhQGTaRG1BpYlsS16JcR$iOypEwID{_sE;j+fc^s#z#Gu z*;5jrliJ=bA`ukr|)RSHo9DQ}`!AnF) z2vpevI50;hWb<;hL9b!Jp-RLAD<$%Yof0TmLk!M>msmv!dkzNd_}*MtKdD{<1t1o- zY|vBx)`+6yAA_YVGix^OHUw*GgwvsQ&iTx$7WNQl9!E zc(I>93kVhxY^0Axd7E9vck9Nl7;>(1*KLX1+OjaRFh(@JT(ef6*=0?gwm%*#nVbh* zEQNf1JtV3O4quPx+Xfm;60s1L9*{=En9j0e-0=0Bx%)5*<(bX4pe4Y`IS1EYL@1nT>mQ@cIs$K`hfp z>g!+SRgEEk?k^)UGh-X~Tid@KXNhujx1pLhHU#zb9ULxIHyg_bWL+gG8xz|@AhI6# zhi!>I8C8W4f9HPA1wBn47x(1Ot%k&|K*_8CVa~r!n6^cEUys5+T>*gu0y@B)2jX^zm z-G&{^$YqOh(@^#OhCkf?7p6T1@YtGMu9aG}=~==0(2v)q4i2bu++6wE{Q3gy#gOI* zt#(CRTjX|M2}=7@><8XE@onlr>!(9$FV=My`eUn5es>e@AFHffxZLGhQGN1Y{3hDv z^PVcWyTejy*a7aVuK;Dq8z?}!vX$|UYWY1(z9{|GO?9wklyV`<0!MKu2? 
zA{%aA(q`|FRrAk?B~Z69sAO6&UU53re)Hbk&tG=!$&@2K8~wL?xK_H~iKc54Sp!K%*AAor>)#Bw`Fzj_}CyVkn`Aa;|)$8 zl5)wFUGT?<);{yG_Dj<|j~U3NYWwkOsq-|qncmB^XRjv(r^b>fr#WPH`YxNxUnYLb zGJIq9M%HSaW?z-%>GHVA@Jv(15=6a)-OK3bvRq@e;Mv{o*+0``UbzLVuUU;BGDybW z-_KPznr9&5sHO&}6`+(J`fYqJ*lG1@ z-0l6I%tyKAT_b!}6e6tkClA%rbc$Af__YE-Q9rK)<6cj0%^#%@G%SDTMWubc$joYi-z zEsrU$3%uCVVlQ}@^_lxXH>DP}Bg>cSQ@%B*U8>dYQdFlklJl}6`rp=bSovqw`10!fxYVX^@2pTXzr}Vg)%?~@(OMb-Rtx{#>oy)Dne*7igu3#> zWK*Q7_i9qn$BNbJJ-lbUq|S0#KE6iTgXeRn>{Bm3Ojnte&O4f&8kfEt^(9-EPQx2^ zh8N6E{Y(9HgfgCmjhE9%xZ!ewN_*g_CUX>GuYP1!8vUP{QfpTCyKTSAU{k_vBDVWS z(YiT`)BE7x>vWcEyA{RI@c7Ezqy3+gVtpsR-IBOP1-(m&(d4bIAz-dsHJ4(?58LA@ zh0*Qnn*sGnoU-M+=dyZv$y@BQ20PzBY-lMMC2#eaw2t(TjO4+$`adUuWJjID0-n{g zMDnF3i@mj4j$wj@xn2&I@DIvA)$lPBZ~9sMp}|nB`P-AjwNL#fbsJinw3{goTZx=T zKgzTtZOBj~J=nmU3+{W1qQsxXW6>ls*dZS1-SRPCIIakWuQY+i<5% zUf&2Y_yOXI}jTHdxG1c^Fd#B8uW6urS>eo4O=zpY5a8J{YWoo z!<$7stKXkx0}W4xTYi=b8rTS-9!Wj96TQKG`mn5Hm1~7bIF)b3%gR{EdOkF4Z46Yq zrJ=r2gXGK^6&$X4UX@2lPi6k>M81mo@!1Uz693A+9+9v0zSL3|41L`_B3AS?S&Nq`?*)-UcPTuHvRK#v4>bPVYi>FGWrAj14974%i7ykeI zsdXwD_o!)^VBkS)YpRkv7djzKL12|dQ<7HeamCG3KlLYDK7F}8%UOBCp<>vJzS^V+7FCZcyB32_H zVE0$ZvEHRMukwYDcbEzxQ;RcM#?DVx!rD$W2M7W)y;WKmhQV3e(xC8BJ!dGT>YHIZ zGK{R)B9hOl7|B<{ew`E427f+m_il?_s;wvo^ayA<36QP(n^PBQxWUxalNzii2o}^;sNBmvkSPcn;tQ}@;|e$^M3b0 zvgHSls9B8JR~M4|dP0(yCLuMzf4TUgx%eSr(dC6>r2=0?t1uz!A~eoL?$!xZlLU6? zroHd>y7@qeo!G@5cdfsSN-am{@k46U`sk_rT$W}aDMj8vcj%o4Y*LE zHN=-Z{NOBaa_JDB&|cT=6BUB(UZkn>5zFAU8;jQpsOkQs7zMd%Kx^98>dMQl!3()gZcRt!U5zCg7M4kjbd)$?fm*l>jwObVt z%t6+kt^6BkEdyiW#=rZ?dqD208N%;o6-+Wmuk^PW^&A9W;9Qe=*c~lH&Oc8Mv&T9L zrjF)@xQD42tB1*Dzsq2(?9bBx50q!5r7xk~UBn2p9ie^fllz~Ok)wEmN2y|C@ciBj zP_+~o-#cuH@$xxvfhMe8el5p&i;g0CYT$0Pw<=tZ9GyHJQut>c{?|9#HBx={r+Ekl zeVaH4ZQ&c^{$hbo*Hrc=%$4+HUVE{K+QY9tYYD%tCmWk#4wo*z-beQ9&u- z^|i+o;`B*vCT`v-vux*DHZtQ=MomY`Ya=5z#-&3~OKTQ)@_kz|oMgaslB(S&e!jQr=CE5Po61Ua4*>u_(88#-3+tM!}m50pFH(4BJs{$flHx z6-z~wh0^wv;qbjLTZjyzmYjL<#y_|_1N zitX;VjSU}#%zmzE#{+{YD?<3Zyvf7%{-1IAv5u|-A649E&*C+w9+jy0@zje`TMrJITo2;ypG zEKthUNAHV{QNUSUclb}a<#--iW~68yjJV@?eRDF8Rx^3|)vGammHnma^1oT*4xqWv z4!GePeLnC&64%O54PCR`e~$B;zI$gz(>|MbO^k-C(z07};r-H##W>lZ#)>$$yfK1T zp<;nM&5w~T^`F9-nP2ND|FHY}RSBLz`WH{$C7P0WQX1--3nzpTxWt`fMqb>-+j`}8 za%gp-yQ!eqIse&Hf;g+!e}HFu<=2l_#@;;p$Vl zfzvle!RdAV3)HvJLr(QDPW*Z!@F{|bD?!J)j>+*EIRuCZiE%7@32I_vc(mwaft|PH z^`thX^ka<~-fyZhK{#u^ryjWNr#{Ot)E_?*zzhvl`%LHsJNIx0IUla+~c8)?4-m3H794)I=>z^ zmN$OEZKG5%7VXREaBh*@F@hnCf z4;q^NuOhDEb-I4iSBGtXGvmfp$GO+uA$!0>smy0AD-u%Z)z9z8 z{TxN*3XbJiI&Q_U@Hig^?6)B;&#+GxVwiZNCI+>&nI-1tQR4OQh?PlQt{(dauOA$W zX>;btG4}Dg;@rPKd}jXk?4bKrJ#~48$iZG~X8%4;OLyeKbR>G(0C{ZnlHqN=+_JQM zJF;50;y5MA6aIA9bpbITs$BxnibqyzY9Cn4FW1xZy)c^{UP?t%0uP|??1M5nAdt18 zQ7}mxZQ|3Q%MAyc6F4*ngoYK%izb`kIQcwyy0egw)mV$+3pP=q6}zp(rn(*3jkkTa z8p24~j=G1WP9NJ$Y{^cY=n= zr_eN?sjY--nVN@fWP#o zlYp4}uPd$AyrC9#d06tR3Jz=jP@N@Zd^1jD0yPov0%!sp1FPHPzJ#Wr(PqdzfBK)D zXNKQKPaEc=l`JBe=mw5wT}~hQy4}zHE==Y>!MJ>z*2RB3X_ngu$5-hx+6Fnj$kol% z2rZp)ZVJ@O)gA4z@so>dH=J{qJL%fRUPBbmZr{Tul~1y>)n8)eknCKUe)C>8cll?w z#m(w>5irtbd!4S(95=_fnU+G(hFw^$Rn!;BMzFSeE;n2cSN({STG3DL{6U(@?6!d~ zWY_n$Dy^v=id zF@nlPx*vpkuuCXrOR{sxr0m4>u)RaK=J{Z_AEeE-<15|Z^FKe?jovUj-jSyhO##z9;C4G%Evm2t|sp^*mEBNU)LDU)$mg7oElH~ zFjUSu7qfLgKyVw^%=TTbo2rajvsWwkFk1I zdpyV%$zb=e!OG))=;7m2$n9-#)buWUF_7J=CbZZCs9$4JNITo-MJ@gIb{C*$A0ty` zRdHwSw+i3H=#C!)-s`cP#>}oCpi{#)u~5O}D@geHtkcAophb$I+&CB9l%Tb=+c~w( zL^Vi)(tPc?4Yf;v$sP%R;|yeVr*{(pJr9$LmZf%11L+FPlPWTcqiq&H=<7W0@|rS{ zJq6`m7M%=1kZL)vYQmjB(U*31tp4VA6vX~PHfMc+j6MSuDZApvJ^zO8pYkT4}{-(dVCDuZVXTvx?7z_xJ;{UWOLhi*8$k72J1- zkWNc=+?c+W0-!ne4EYh#mWfw6H#kllG2dwjy?I2+@$Vel$ASD 
z@{4LA=k-K`8PJ=W^dcX=qx^N7c{JOMH&-6PG0*T3OWap9>IQQPn-aSog)7<(??S_m-v(wLq!mx+b$a}f#_pp^L-O)m0p+VtEG1uuK z3R_Ur8g;HZD9Hi^@m(Ct50`|Qk2e*oc%+2Iw_v3Q6T4wbbos-mf9j&3J#_zpZ6I8qK5eSRM93j3H!F9#>>gk*$_ zTZh9aRb!Rr^NCn6lpd$HyG{?GB97qH!nt2`(5?$&73cDcFwm^aB^~_es}4Y5qYA2f z<&_>5^~+~UnkbGaRq-eQHHR(aiv;vFM;&Xnk*ie}oID8g)Vbj^E6GaKp7{uJd^i6- zhwGUCy!1bs9X2j8ebMY zLSBZAor|1)_+mCd{D3vbGvn%}YAueV&Bv2m9zvV1<^nX>9QtIQ%;$44_Fbf$g(~;v zV$BiVeKfmE<@z2fjs``#)|v`5&vr^MyR+X^86{Ra^u>bTj6_#DD(otzoC4UVzDW=G zgP*+@TDYF{ekxwV)oqIT&$72p(Q@3G9fzODLNvAX&ZW^1eEu61=2u8>&zWW-WJypq zN%}qQ$@rcIlEftIF1=GssSD>_WcBGFM*~S8o^cE~(&JH}_U7<=Pd|8auYt|dV3G9@$~U|w{lFznxCxa`=sRi}|#n{`=ppNoBE_OqU^q6%e++&vXJ zQe67xsK7^r2l?92@m6R?Uv!S@dpP0Tin#TDj&E_CdLKn7MZsN1RZLeI zxLE}BX$h;1xaY@Y<~8VH4|*#*D|Hay1i~U`zeyLY6E(PcFxL0{MhmA z!#b|{#F@-Wl=5$eT(!&U2Wve7c>86fDvYeSZPlPk7<&qhW+*E0j@IP^UKLhvbBxv{ zQxSKwxF9!M*$&3C#ao@C8@-k`JHn!ulk{JU4IV+GN0iB2@>m1JrERoT_(!xn!jB90 zzlP@gs8ue=fB3Kl)vcm+t43>n6q>7*x(JZ6&;4N8EQM?S|lS7|}Y_3!|$@ zGGezrm`Xd9)Ls1UVwHZfuFIu1n2`&S*N}+Zo&5DD7ZPbvCg;CHzjl#3kEdQ8S^Uob zR@|k`DtyMgpJul9_reQkcutxB4*IJYqW$q{yym{KlS#e>c?;okBVqr#u8PqYrRbyKM{r=sr$7| z;Z8TdaF<4GK;D&&sBbMqfKgXp$_bif`Hn24Y>hqIS0(7bgh;hky_Cy6OjHMy(o3%A+i9Jgijf5_&0MZDQq?_)>cba6PQ zI}xKngS(OBJYoJ2l=R3gJtSwmIQ>O?FEa?|mA6zZB>sUXK{ubX7??FxZ0R)me!`~T z#6P)_nEG$4P7&`VA~)(Xp_y$8GxoK+7v&j+dDwx;)oFi6#Jz-!MW?V$j)$rwH-;#O zWW@30e9+JNc$V{MpQzk>-p#wPdXH{h1EK3Y!DI^4{3=EVRSWWzj}m*--l5N!{z?zy zGtP}^Px0;YUUZ&#Y$H4JWLuc7khTazKj;9GSn4Q}lLI}9^TwdG#M@m>Ux42kE%;Bvv&X#UkBV4|fP+C45k7!2VDp(nFu=meuo5bFm*#=})CK320#v6+Rb z%a|s8hu1D4U$)+$MAerZy+IM{SMZl9=RU_lpwnn3?AyOrk55_>Dq?B&h5E^Cm0(6Z zun?4gXtU$Tu?v0KE*_^uS8WSbo#maPH3Qlc(}ls(tw2hxzM2A%-L)#~eq!3AeKi6z zf;US1Un(cf90S+YMnV=|g&zX9l}9KI``2;u)?KnU#{UR;x?lqcWol7+XpOm~$93MA zC-)q0^|8s#-e%O0__Vt6_0Z!RZ0jaDF=5v3y>SuIG}cA8hDs-g4ncdHdEpAWTT=`trFX!vjugE8b<`n~pqh z^8NGcH>1x$cdF9@$0C=dUocj2+<>Vh|xUE{aFdH(&=%-)RsX zs){x@_UVP8>8DOhFWSdHW-~~J4)5x}qTwL*$61aRcn*pKCoWwWbSLtkTOWWRPvITHRd`c78 zw!3B4zHyOgJoH??b?jparkfAb$%5&<0`cO)dots_IH<+%r4q6bP^s-A`mT(ROun{v z9rMvb^T4X+yZab0O|=GpHJHs^Lt0}jUIM%Ni**9^nl7IzJR_VZC(;J{_F5>QC)Ej; zbp{r4SC)?&|F|$7Tb_#by4^aSWW@QUQ*1UV(^5C?)*W zLWJo3@0s^kC4^VK+9JTX(#R_eoww&DKCPn(OVE5uPt-ALVy#~6c+b=bRKp!^)jQlx zFD}gHtvl_$ssthmIVQ)VnZ-Zg0MDg0JSEI_oS+E1#0XS=1S*kor&geA%0H)V)j%B; zZN+_<|Il;(*0IDAOcx*E7Ym^CLj=n24yDc-rKc}D?Ou1Ze<%{7;?2{qDKlKEJy{!Lgo!$XBqyDL885$*|p4G?PEKqr(R zrdzzVR=;!oS6Bq<30(m*@0*A;hk2U)<~z9nHEDMZ;EYz_o(GQtWo}HJoNdU;@}e62 z`0+%S@#t*v4-0$>W?>qEI_la&MCN@Ns$7h^bhR?D`u7$0?%Ns<6&~?1D``;LB!T}o z0o>SCZ@Rzl>1=NBQU=~s3*}a$%n-laSRxM7b&Kw=B%RF_-sY&x}t>I$)54 z%@s>Pw*lSi9lG^f32(0yH8Y54>LYJfHPi0{)M?H@3E1%{(II7Nx$$_Zv1k?y#q219 zsJ)&>c`8PENok0G(oZNrUzy+Xq%1x}5fxWPxNc6QC^nxX6w{N6>3WJm(F1;oQ+vrr zy`p~*${E3gu)B*YLjU?UqAtW*z=g#8vd45AVLBTyy|*D=wD=#a#0=kS+eIwl8Br7? 
zYcqtp(JB!aKRyAYEaRDcu5iq+L`-)ThT>`$p0!-ds zqXP8!In|DloO;}xPy%v4$oj>wXTCPnt>dv)IUkiOl|iCCSMU-s6%rA<5((K7pO_MC z=|n(9#HwQ0cPB87oH$Lc46P+bMH)XAFEk$QG6v%ir>OKb|C?|fdCr$IIUml#zXkaI z_4c);ZxxSWN+>J=p+QlXMa*3?RJ?CK>RZIMhikG?0W{99;OA;L822$p(&)M*dC0zEh)Z#I_?-bpi&)Q6Fz5U;ea_Ul+D((K8 zI0lPPJQgsf#e|P45ul_%2^Lk%FY(QM!i_Up?5ZGwTQd-`?%@0OS}q{jQ-%~c5ZQu} zkHf3}-;s_xmt0~r?0J`qZo(Lb)cIKpXpw;5_sx=QpucaM>E5^0`F*3*?r!~9fbh2M z6(UBLX9d#}Q;7oLz5d>w0P0Y>{Q$uUZW)hru~&>5`)Dp6@tFUsXsO-bz#We8=4-0W z|8)VBn-YD;qa}Rvltar5l!jEKN1ls8#N)0a7z&moKA4NgGypn_lVgBY7`ZQjWAXK4 zU$vCR;)KSMuKNC!qqD7wKouuzk{{x)LdHK*nuA6<@BdHT(XDGG{JnP*Dv4sb8Lf}V zd)}_gW2vJ`qpQrnX-ARwT`EAG&t)?DPYe^{k16aihGQTs_G+avO8iXh{}+(^qQFU) zCRAXx_JvLvmxOp09=!Wzf?Y%55Bc@Zu5VH>K%)kekDUAllZtVa*_E2B5&v+NZcUDE z^^q4RG4R@|XIdcZwhe}J=ln?{{6&Wjf4lHNVv>(6=yN+W+aTf0Q?|918VTsW;@*2> z4!myYvgdk`fAog@Dg~@x8xpAhCx1%p_5)eSoAb}WIM&>?xx}YR2kN) z+v`>+p!XjiV#VngU8pqnG}+VcPkR=LZ@K&ISgEVcsit=7)MIb=&)aI$RkB0&CcmZpZ1 z>AFj<0jb!_cI}@|bA3TKivV4H!3cBKQ@rCmmFnT7JBC2J_1*P2Tr?q{rV99q!FTOF z*7gQfGkw>$5x(4xuX?`uR7)AKu*zSzMZ5kwB%K2;K0_2>r$Bvc41weH)78Z|9@p+0 zW;xx!M_~v%IXMa1CCqbRn(dYJw>M1Q{!Q?6lSvqI((%>+k@2kF*&F)l?6)T~ zI_J^()jr@ZQe@-s`9!R~7~6o~)9rjVvPyls%XH}BD<1=~i-Qq;0^xJE8n3sH91KTpOzM|<~ z>h*a~ReR}&DS-=x2GeMYJqzlPJ9Avl(`XrsTpy2a`G3|C!Qe5ICiDjM=`!Er;c4R8 z_%K91O7h$2e-prk%JjmX@k(iyK3pEU{smk7*wdslHEHVcc&aUNC@Ze_c6H~kYdhM@ zCQ#k_M%D>22{u*c?_1Ah$%dmCdD!u&Lrc9WVcbuNSQLZx$vSOq*?(wq!UnquiKcjcM@sTIq_-m8OA^o2MIHxG8D8r>ur}M71RT zx7!8CtLzc<}@dd)$9(q&bh26o$3C+l6WS7#1Xbg3&pJVpIPZKelOpEXA~tEe+q{6^0JB3rl5MBm``QHX_a<9* z*u-JHy@FN;So;4!i--z|xZUvs{;*gYUQ8E&+Uu_cd)b|iy)wpEaDbU+M;V!h`vDhL zap;I~siV|#4k%oa0R~2m<8K`OfL?gsoCg+lE zmb2vd6VO!r^DP&IyxfeBLUR<>y3D%Z6o#P4V!7Db?(QVpXA(7DF>0k4WQ)7qE6%WEZLRGj^o}UKngwzx=8C9AiDZ2EQ4dbgrqlsHQZIXB%jW@%8F_ zz&@)F2V4s6e;^HGX|Q2^M}87VA!;BnYUKq@mCbRq{_ENPzy(xE=)slQTbx@I|v)5f-G|3*pEySbKc^MAjgp? 
z$n4WfloPE&wf^eMoq#1vrBx3oK9G-%Xc-GPqRR_6u4{$qh{Po*D?pGmiZEF(ydQPBY6 zu~iWABla+PmR+WLnZJhlm0s9d&lBHotcrtTHOvSn#@4Z|bi$5^obZXJ2;?u(d!)0j-tlWnc8j!ku40yL+s;+IW(z2&71+ zNu8!YdUC0-sgO9pnu`Y zbW3F*UdgwJ67{>1Zg<;7JVJ#+lqLK%POvTN-gO>N;}@VQ&F7jN|7dFK7jN}ZlrG=L zCO&CE;SBl}qC#;gX(@Pm#>E_AowmNik5-s3oV5cOQZ;9gKF#d*@#CvK#?A8{kg~s( zmBq5NQ>l-fOl*Je36Th4wTA7OP`8Z{Ax2xE$BQi`i1x#fs|M}Y#S zo8oF<-FKL|&c{S4z6jtKV(SkBzXJc3_0EZ`?C6A8UNS*3g^}3Im3uw`ocXNOI$rzr zj1~GaC`?cNj4 zuch1EH7T%HB#4~C)UR9r%BZ!PC@;afhKto+$=tLne^z*IXU(n6LyJr?p8RfPMZSDpHQ!-n)#N(8Fgv&lsgfi6 z1u2#q&BjC{cCgX{v#V9DcahE4LYx010)Abo?z-=OXVhbwiu?j1!_CEpcrOj{x=!t- zXy#+)_d0Dog1$Q(hso$X!*usxI?>0tcanlN zO?|NVuEa;E={U#AR9&Y9D4=G4lIgZWV+@?x4^j8-1Z3q?ESK zx*aNr;@=@04%Ou6Hhn!%O|j!x0BOp$KZK7qyl0H^H=|v~V|i{yn=hw&ap6+;C7gK|LRA#=nJ1qO1j77#uTUJfCBW|< zKo9zkJI5)^pRQ0fuZZ{P?@XekRk7t*BJ2EI{Nuy^P+5EfLf*j!5I=_J+ZSC}l`~CB zJJ)11TuvthRL#fQVDr%~J3y{xmB#p2*4Z&+^JAoDlHnddpmSJ~%(dU5nWX0`w!fmR z=$>{C)~_BuCh_gu2Ru@cg$#L6N0F_-3TFFF5UOb}338>OO9$6tC zXM@dUolL~MfNcw(!S7f95j^ke8+A;v)6uC>n|(8(9^jzE7OG1^^unUxbG6)CBinSrgXs-Hsu~I-~QCP?H&rq3UvMy}s9}J!SD_DBl~%APKIX z8*=TBi17j-h*;9L8^wc<;RI^3h0E|hIIr)6svVg0V0&W{VBcZTFYrNU#)P+DWWq0F zrD7Smu5TQ2to`Ohu=ZkYu_KQx=|&4UF#@)29-2)VSn&_sbgSM8f8YC##BXN|wb{+L z3BR3X5aR_yqP+=ywV@~Xr3T1Yc&ZSGth)y=AB|Lt$OE*qunhMuIlO3 zkga;V5G&`Bq!Q4jvQq7Tq46<{`_iwhk~~P+&SKc*=N%PAVU(BLy(Dk@^17LNQ10F^jW^)#$$;fg@;w3FSO;%s++zuPJ>MwQ3wu8 z(5piyxBRd~rGYs$vdaHQ*;j@&;e~xmD+s7aNQ#JbcaD%25D*w3-L0g=7>EjrNVk-< z#E1b>1EfoOz=$c`qhn*+yZ`&XU!M=pmwniMU1!(s^ZVWBu5%w-fpbJzf}iI}G!Et+ zs)UY!ms*1?BHrZGs3Hr!TNw_b7Xw0#IX0AP^xMQli}fLdGO4t6@AXJ7ytMg(Yx$oy z!h@4_^M~j2oIl&+K|9`v7~!h!qe1EA*z~W`2Z|rw4M2COnPfZk$K_M6#1{kL_Pzen z$80NFWDSS_zSeQi>2$8xtz>B>5;hI*Q0h-&U(=_gbAz7lF*fdSZ@-3@s;`fKw0fEk zjI((mUrW1%m(C@ez*>7E+KdO=Y82@|=rRgu&9!CkYRkTkxxe(mXz5eB!eYRy)gIhT z9Gmxc@7(}LLQ<8O>(}e1wz!kQAc6ttDKXC9*4kS34lNp8lfGND;hKudlRoGg6j_8a zMqk#mg zd3-pRhN|#i-)&&P%4OI)t^<+*4(Ywo!e>7y2qlDb+X?VQUHr-Ye3qVVbTS%{eE=)P zO)?kS{$R!E+#yd5{AX*0lHrXIkXY!X*doq%kSlWvs;3 zkOaGr9QQ8zX0RvR9}zz2Ryh&wU?{={@07ng5oCM`~^(Zy#HFE5?g&m@A3tTO{0G4#p(p{0HF z0BdGr*sEPGphcw?PGq-+qGpN@iv&xg@R{E}@R+l%ug_`Xc4}NIwOY=0+5B@`?BA1T zAMWMMKA3)>cAK-(y3T?V+Uw$6mgTOgew*{d#9>n@Ab6MN0M?1}U%6aaxm>zjTCDa2 z=bvBqo`e3Je)rEi{l2BR)xJe*S~bCJd58L43N)g#>+zxrK5cDkjDq=Ydd|IH zdmCFvTg_7lA4#PzcO^)~Q%7t*i4UyE2}_{_+$vwB36he7C(<_T>g72rHxAzJ*c7SI z`q#6<{Z4(qhW_Y6L<6W0Jqn`2V{U=jMJ8I3-*#RsJIEF3i@Z-{Q5lR6x;1{MtKs9X z&$lf?gIMqk|{Fe zAv<8Ej6D3Eb+(HLUIiO(HtFoVDN_UKh7Kq&=+wO^gYh9)liXUS+*C z9MFy`;iEm;Uv}q~fy8Oca`?`LY`qv^UU}w#IA^hMM;bYEHlkiS{_nVk1 zL^MzFmym0U>4w(X(TC=py0ltm!X?WRpZZI{KWr^)SZ;6Z!GV?Wo0S@D#yeVP{k~{U zx~YsOfaD4fyx*yr$-PC9n5x!)Z+99els|jza4(Z<=$Y?6sQ;lucyA;i9sD|!zO`2I zrpc0m#yW>hCfl%Xd-pN2Ec)qj zs(3Lba&{V4iVCI0M7-V92d5wJ{_@L#xr1y}>l%7E6d=|<`hITN;{m&V44PYG)K*@s z=TADn*uK!7Cbs|t&LDjyxi&OMN=B-WvnavjvmC90O5thBq^Dr%^y7dMM&dPs` z_(kQM)i@V{)i|za#h{DZ?gK4wpQ4ZnK~Zsyfwe^Q3^7jYN39%?58oj4XG+fm97RR} z;73`&5{CL!gRd`I$RCe{oPB^AJ8A98<&1>K!!Y9x*eP;HAy3Nv3@67&1?R3<-$kaK zk$M?4PXIui!`8y*+pLTj&vXV={V@C88FOabu!`q+USf?a+X{!{x~2DRj@1SrX%Zc% z4&{1SD&#kcte08Vk>*Uha>RXa$wdBkh;W6jnxm7N9tW$n_V8lTUfsm+>j$UmYn#(0 zu8lJuI9ZM+fFOnCD}7H zvD$yamh;HwZ&>+_HD4{e-6TKQoNRPLnX{VO`}842K44|GBMuJOWzcJFw@=d)&LP91wHy0ZP-_ScREe%rs zv0CSg#x`2}Dp?A)z;FOpu4dsf!XO3I>1^U#ejXPHk0`qS@>rfbt1od!(NQh*QY$e; z&$s;DqXQ@D&W2{Ymqs_PdqlMmgVFAXcpdLKvMLPXGb%thjntZ!6Pb`cfi}>_u1~#o z(4DzYpKo>FFxX71-!Mqs%=)}HSUTegNBUi*GT-ls7aIF+EkEYAZG-(+>|x7(i^;7e zd(y%Z5Lc47KG21SX%9TnvKrrP^}2J8hCiHHS#pGVv!ng9lo=lOO=$LPT-w<}479h< zdpTUpjBfg<5LHR9R?TKE$1b-!5=;`-!biXpnFM2~p_)Mo4soK?qJNX!_CU`y5Rb)BKoVEa9(oo?u5ynX<1myXWd|DKF9% 
zWK`2l<@TBvj5j|)x88z1IWqO47J>7?LXXZruDA8#rw8V!V7)fn{fsARR}u1{@J}2L z;WR_RHY#${|E&S@I~o&?gwTh7^Zs{A9S!jz3O}Z>XJ<@OxZj_LI+%<<)iCdx$g9wN$jP;M07wd`VY!Qd>*phu#(Yk<#me zo=#ZsM-Lk9Q%9;*=#%0D87AV*Z$}5E{uvDwI;G?0Q^CgpNAJ3NE=X&|gmQ+{d>gCl z`Wb8MMe}zxTQk|{rf7L{dzh_me}_M;EX4n|3aVEkPN#ZH+Y?F3ebz50TvBynbXzRWS{E`nxyUAXynP_gJQ-*bfUuv)0ekU)X8Ip>cnC7F$}FX>y@L5b_JJMI*X zb5xK~ft5e^g#KDPRZDGJ?Zpe<$k6d|=6m@vgh;}qIF&R*A)y+`3pdk$YBgWK;HD=A zjeN&#@lati$~>C+vCF&4ZE~N_0nA&TDgqxBkv31K?73NblQzo% zdL}iLFvhF_+-b<_6;KZC$(yIgXe36sf*;YfDj%S{qqG}D&$2F|;Wag{K(RrJb#UnK_)#G%)?J{LL%`$QH836q0GZn|AOn&WI= zVeavudvZN!)ugRr&Pvpo@VuQ;BINT`bbp4=U~sq35Q^}c1cJX2i{gn27f7;DaNV1! zY51#$f2U?Ss+OFjqpRnNm2-;CMbrTY`YPF#*d^-5Z+VczKbM~t;S+WSJ4QB?@T=w8Nb zzbxFN(R>=*!}I4YHpRdGbu*q}cXZEJbo`rP0PF4m7riJYu&*vcxw8Xc7<+-M#dao2 zh5uc3PNEJj64Hy14ggOn9@jr0xMW z;m!`EI2sx70YR%>Gv`J9gXE;=(d|@)foH|ElHKa~ZGOXdD{+#}{N@xhZrF}`YUi^@$SW!boRG)Q5Qry*y!d8>Fv zE3oNf>JVi}IqXH5p$qGoO~+jE*`e{^nKFyz=C7gfC|vL}JR%C?cGjuB$;RWwaPCam zxA<6qzR8bj;bW+7w3 z$;#s}X^v9gotDW>V3Dt5?DX!+ul@d#Ks}#^B_CMyZ@Q`A-*wwXF(2VFeb&O;ZN%;n z?y%R_FSZ+ug-wbM7pmy}tYn^`(tFB^`IJ$wrV9YRK%YD3_#$_Xp5in5>)wismv7aG zP9k13TSye?`*hpfx?p3)|3re5`;e`5G6yei#NP<(ob!>Px)k@OKU@T}7tHYNwN;J_ z;+*kA4{h_kW|x4?%IgCAZuy_Cboin#7CKJ?+-#%f_kq~>XgOv4^sTJfJ1NFu*F&v| z;^rwY{%z`OZ1Ot6*(CW)6Gs~)#i;^DJ`WwgN&NN2ONk_OPw}0xe54cmSwQ)Pg=1jW zCBx_9cRy%OJZN)MS1UR)l&bUQoY9N#U6jFVh$mV;amq&-xOc_~4G^lQOzm6=&Okx& zh^qWW)0!wdYfWm!Rz>ab^BbR|u&!~qC`xC}TIO}ASlg@FInsa~Tn(FFfe&P2SS{R_y&t>Lx-2s za)^lM5$LMO2F7(vil6CAq61jryPkw~vd3A&GKz~Lldub&9l%VJ4D=BeQ-eLd%d`0i zGQr9PK!m@)PxlS?kh@ew@&4M|i(+WPgXt6iZ%n(aosJW@#M=RUWL5yM$JliF4rQhO zhy@f0{k5tfiJIVEl6m1#X`fnhtM$~wQSxw9t~8_c8Pv3?sX8_A?>gPK(7w)&*D&UQ z=Y2R5=opxD3D4SeZ2%R}Cr;-o@KI0VX0`HfAC&Fsds(HfB?;=-!Q0n|UB4slJ!s5ONH_&wz@&hx(;W;ZK*stra zINVt%JKL`E5z|Du@O(wSWi`wR9LEQxWtN^|3Q=28OGSaI{ATTE8D28#ZeIh0LBr|v2bN{@4%L&NbCuD>pm7+eCGH?N(QH+ z#ojM}>Syxhz`VCvyj78VHqAj)&e(WvI#Vk+S=Rp^r0In&M~pH=O{l@Z#&%?G*F|cP zoh26X6{gELPFD(0XB>3r}gDTW5L^u6$IBoVCL9IC4e-a zDPLVw5QibZwW27aA@Au*0M!44gQr9j0RY08Jq{`IJA1!3X1uFTa)PV6Y%ug%I)Qlr z?7Ex$CtGr8*wYdGGK)A}v!Sk)yX~=kDCjApG-$`hX+myt^YaE@1Y*l}lyB*K9}qjpmP& zz3~UMLt2VK=u|KV-6Ed4>T3^wWZF;BsSu%B$}6*TWt}Dzqw`PV2!*tY0;#`NBto@3 zEVF4tF38N7ZkeMLz8%X|g(rz3wBZzQ!kuJ}C#=*?QeV9|<@c;KSTcl3{UJ3B6;X5- z#_0Y_1j#qHVYf`C80y!Yl~+u|gvU%kbK4~c?+v@cT!xV{55uY{P%7EK_pTj*?Vu1S ztn?^?ncer{nQ~bBZ%k{#T*U@RD}Rn?1x-i4()nS)ih8BDif_ky)tmk(qIi6f03Ozl zmw89;I-C-&DN)q_j!O%B`fAMHiBKmrv$nH&(zo&B*4hugp!G)h&$di9(NqUz`=szU zW`E@zgK#-h`E^$)6Q2{ptGt(gve5>ppB1X?MXo{-B4llNQX*sGfLRUX8yc_|yDRF2 z%c=44fcmX@wy(8ocU6v89FC0v2Y`a`9De9|V-BfO@^6j{1&ne#ww!_5CN$Z5N z2WC`C513ongys}~a9oa^=E=C=ugozeCR3Uh)IRLaUNYGN=Of$rXWvQLzFB{2kvgAN zClznOt^=sW$GO4AZy)r3m~O;<*l(qI$A2a^jzJtw&h}9rgS=mHi0qnN-oLhamL9O71nV`u z`5lTaZ^W%t0EPo1wG-A3V~IwW^QoxR3Y?p-l=Wf+G0Z?yrvz%?EMfw0{w z#%op)uL|n$;QGu)xmVz4VZ#zT7iEeU2U8J0yC>J#vyqU*&U;6?UrHw~Ow9dCmsz&m8U;8tMd3%44l)W2Po8 zJ==!b9kbqn(&F(NM`4Q$otG)q;KJ~gxzb{mia&EoRkp6@?B*HQ;YMi-SV&nR%=ZP> z5b4<4513D`_SZ#+-Q5k?Lv3y>Dt~53Fx|(MO^e}jpZ>z%k(|Te7ek90MMgreP_$+9 za)1{0-wO2Gd5^kn$k+2LtVHpvPhuFa<~r@b9krcwKY07!@dmWb=8U#I&)DHRkdBp4 ztD5dKhm%w2G-Pl&7FO}e z%569<=#4s&)!uGYRe3^Lz&I+IX)JhBFr<2fik;%bwj9}%5|Oke^{@?gQ4m@^TG6^V z*!RqPiTsH?iFwGU?|`=Fxu7ln*9M2lf2gOw7arej)mq|OR#}ViiG7O%W!3mkFE6@; z8vY`Ry2$I#a~JIm7w4AR-cD|sth~%KRzM^v_pDo-Ef?(L)CZZH-aQeweWZ!G5Ioji zn^9h%9rYA``OryNWzB#Y(c3Y~bx?ZU2c!MK>{$UT`O)C*Vo6S~lcd&e#EZk7EJW@p8y5V@hc~s4o0nC$9C4vOMCVtHW;} z$8*Febg9j?|DvJb#V*#t=h&FS3Lo7*3@I&+yZF(vjrHFX-VOZa()2!^-di5P4L`f> zA1#QZ@LiN#b)UL!yNp4@YqK}&Y(GtebXy#it?=zce9_8Q9}bxOzMJz{jOiNH-yoL9 
zxQ#*4OiXodO@`}jlco3Dt9Kd8d9eb;V=VrtUQa5I&{w4n->M>;H!0j=+kZ1qq3KyB zMlo5^Lm0*waYMc8%LHsQI(w@-)JYx^uNhX1HNBQ0KE0(!t#k?05IcxW8kw_a>qGuZ zGBYut-!fDdA(!7-n&!f$1>9dQ$K|PE0;IOi%iG5;65gdf9L{DZN%I9iw)>OkBD6luUt2nmFX3j@=!QY%azTvj483My38# z5~_2qd_?k81O=Nw;wFNZIN^exoDH%x7uw?0rzzJ|lTlL9iX0rgg%}=^PGme$GYXX? zU5*%g!@lW@@x-XJ)~*0N>YmyWYM4g$kGjxNS*Dp(Kn~IK%bEuod9+-M<)1o=Ig!mn z(h)%rg!J~ae}D?_o^O%|+WE2a*r+K~h;O125Nh%4=qvq(+$TRtuJML6;xF)f_=~hy>&v2qSj5isz)((Z;-;#vT%|H&4 zHar7O@Vy-_F&UBfd2v=sCs}R2lllJ7F;4Yq4QHFPKRgA>y~cR0rt?KL z@!v8wL@0Zyi0HsvX*6S3f)Y!XwEm|RW(7rK@iY`HY}Pxwh>Mnv#=oOnp&r8_^8Dr^ z+zA0?ssChS`8sDx-g8FdXaIbCGl%plI2D!}RA9qmZs`p+Sw=UZb-_wgkGy}e1QG z5_>MJC;c%NwVZQT4e5`Xoos)*(SdjivHm8s_2cORIeE89?E;q&iQV;a$LVJa{vV{K z1xgi7dSZa>%Ma~gx#+k5`eKPz&IV6a${zcUDQjaf4&obWqg^hxT;l-wm!$}zgtN%E z2H=ja)s=JY%Zge~yD`+5{X>?oBzh6Zp4Gv~`M-x)vIOk&Y{b?7+lbrEJa>m8 zM@S~7YTfI&{oIUh5LUck!p_)NdYH#=bMieHts%s=p~lr}@F_>+Ru4;&QX~9PJ2!^)T3x z_QxM#j58sDQ>IezI5FRBmMxR@nhF@|moN%|lI<`fSHf1p01 z8y+gGk0o~)gcUFQAW%a5BON0q4*GjWsp&=Q*2qm0chAKfzR|f7bGsWk=ec0!hE%=- zn?SU%yRYSVyMbtaHp?1(vUyM`gL8M#T!XCzu3CFH$1KW&{~iqxddLvQz3FSd18L)q zc6q0-;Ik0LZ41+rHm~W$P)8=y|zy(Vj6t= z#aF}DQ%^CJWw^4QDl&gesQO{ zS`AOFtVIC;-!i)(u%nE!J_+}C9hChz)}5@jH1JO?=hZ9Tx;4*BA^D)jxEW>obAsDatrShiL#)Sfmq?1$J)W>Y%x zhrNu78#zt;oA~T)hIH1K=NOkJFqU@1yEhOH}{oAhDXK8+*ln|SK#KumkowCD*A5NKDzG$q& zciR}D^i%)1qfGf>Qb^cMW1^J&R#=5E4~f5TW_pB+C==Ab&L0GO*#s^Y(2zfGPBylq z=l|M)C-Y)z{pD=q>9d(13;$dpa%n!PjCJ2JI_Us_fNZpK3)Y$Z<2oT>Nx>u8eOqIP z-R(3uGVI~q>du*cdr5k?W$tRBS5YEC2hes3jj)yT7x?WTOW2t&P;PC^Vi$DdCb*Y# z-gD}DdW!$cqi^Wkh^38>R*bCc%=+bD&_JXaxXmJCT}5ir{J8U4;6`` zO6pT@_1q>T;srx~W)?M@EhYq2tA@}#?L_~HF1(yd%~56qDY#zZRUjYB4E63jMMaF^ zYFd;$&d6{jT?!`^iidkU?OV_Qg?M}C_b273T7DV4F%%n%XbI%%7ukMOhx9K$P)sxp z?UM`(ds2|=zlHd=9BmveTk*~^R9&Z%z4AM*?e2?QM4|3CWL&ouxE~r2`5>W+JPL#- zI(&|cTi)th>4@^Cf1EH~k-N$<(Z*Pk9QG};PuC{3aCOxE+M83_7TEY5PBU({au6F+ z;3Oiv4W*C;0qPY)I$<3X+(ONM7xiip9EA*iR-b}(H-_Z| z8Wsl;ac)8d^w!m1+YI>kOe5 zQagxKBw=643iC&@Z9JVJiFnI*A0y**SCzaY-u_XG$V_IPJTQyb#{+ifRN;*NyuFYF z*h=YmOnai=ruGdh#DX%pjDEdK;rw&wZrJ6WP$5;cFE6$w`^*dH|9~{%Gn1D z@**XMBfmawqoGeGbOYS>F&u8W=K7v13Kfu&zz%o5EI5h5y(&773+L=nZ*UbHmvdlTQRmOp3cx#^JGMHz*iN77RA%2*YGpFP_PW!m|Yf7x{dX4|I=V6OZeba2w*>$>CCCOt7Pe~x?aJCBgzMbYh~ z-k1rk(=UmR!8Zl-m#b}Dt@JTcH~uWnx$Sg21Bp0)-A?_Ki0bfX#yXJp7;)W=W!bS6 z{YFK9nM>;9C-~;>rEQ00q;l?4`gk0IeU^Qsn8i^62Zu;_)!>$a_!?H$4z5N3wq$$&a-jiAb zJ9mHabdy`@C0AM$+a)q&2&&1Y=8@~Y$)75F6bDWpH?682P_S@!+7>$I-f&hgktsFG-#LB!ln~mayiv zkzBu=x*i1m6+?1z8RsVb)B5WfqVL z3Ka<1@Q9 zGpf(Vjf){+23fQnVOAooGs+0n z=Y*#lGL$$A+X&R`$59dX1KW%D$;h}s``Lu6xGuov8iXppGwB5N>T3KaRVAz!pnZ+G zO1PVle?||cJpmil#c&6z9z>xt{x20cQgM}5>iO7uMvL6oFKqB1uPeh!8YLGk^IRJt zQ%WsWy@TF^MJL=P)Tj>Hmf2E@sCvQ{1_f+JuO5V6ZbHd`~q z&7_qYI`VEY3S_~D)c=>3a8=3M*~x!X7Gv8fJdB6vV!!?3E~aHyN1vG!8e={hHtsOnQ^w@oP&2AR}EyE3P+&&591Wi8U#%?uoj>vpE z>&STs@~xp>v`OKJ?6}~)iT_8ax6tBN*!z@)e^Ngpku-|PDJyB?!~X0#)6mYZ9DPzA z4$?-pSR~~?5w=YVzl5k`#geZa3kHjYC_6dBal}V5=qqF^pJ2qvT;SgXguGDa0>W=v zu|H8D5Zj1aTDh0zT3K(+oWu1!OjLhbxYM5h6&IB?vP3(~8GDegOxgDL)aJ8&!NGpsaR)eSbDXzMtt@&lcmu2x&G%Bwo8N_q zjqxH`zO+jberl!jV>?3TA$FhcHJ}R5)p1E(f)A74#f&R@?rJPu)f@AHIj#gSX(ztR zzuZ7iZ`e1EhuWNvPlg{uPR86X^75TcwL{3v7&WKPq*`;7wQa2v8PC(jS9*Q!u8Pd5 zJ61|AwgK^FC+#KBaaWlSXN*H&75rrh>P{|oaCO;-3_3AO;4N>YkJ@hf|WJtm1?FsOPB18wpcOhZLHnoCIgiU7P*8cA3fE%_Z51}pr(8V`A7n%_Kc?E{(|vdzuf`2iyz8oluEE0wbjJ6)7=Rkt$ZdX~B< zWvhrYQ+hn21F2#Hi99!JBkc59 zrHQMOdfK~~Wd%2Nw)1) zVP!7~3L`b^wf|qLOA;*=(?_JYTKm-QGiy50XurL%-5pjlV!^x~{;0mUK(<<--R`Ax zHhqXkl{#NO^B%~u2Exn|Y6;J0W&vCJ&`P_cf1P{OC_!bjD1#gQtZT8<+*W 
zGB7dB0^S*z9Og=o3=D+1(l-Me#q8;xfsJF{^UA=cG4FY1U$W;*o`C?}OgYh%FSyng=VW@g&q?F(M<5~t9783X$`%g6U_X*nF+aPF{ zgrHroOad7D31H03AZXVj$fNH6qcMtrMs5NcYb9w|h+Vd7-`bI}x?D@31!DS01pd`J z5jNfBKOlVU@{p@~2qcLy{SQE)cHXRT;;NJ$`Ka?ZK#tl78WwWtUVC=CB36cT#Wys1 z@A4k@6A&ZDx_OIX++Kq~h^o6l{uon|k-r4)8qj!>nGM=ocP+8PX$ch6j5PJjWCT*j z^tIo@{zqLp0(CwAqb?1By72#~izZNa{xN~NEdL19T_aG}%k1MMbMLL%^3U!=nzt6C z-z9@V4y;fnBB4?`aWd%BiB(cb3b$k8Y!)afQ8lGxP9*%c<&Q`>(SLCdtz~_rG`Z!^ zNQUz#2vZe3QXy-VhUj$80{Und#tD8^|J!cV1#)E2pb1J+sPe>XVnRglkq{AZpP3Vl zkz3Y9rW0jbsR$5PKUb-|RsBk(lA`*BiYH0+Yn45+YBd!WqH1Fmi5s#Y-U9AOGogZ# zXfv(?`uAo61!>V{JO#5+W+DZR@8ORN&LiQx1=P{-rv+p0;erKy!S;miw<#4*dV)vV zc*o7TppnY0u^{j^A<7G+bXzV6yyK=`5J=@_GZ9J({gq*FNNmaP1|qg3a>Ei^^0|=^ zTT;746I)8SsS{f=y7dqv9=mZ7XH!6ph=eX6M=8XX;%;li*>sW#k!Hl%<|>usgpkMc zM_dEXxqEJb1u2wh;s+`8?Sum;6ePT;)HyWa9BFVByIE`ggB%C5! zPlbi7`lU)k`_*DexCw5f@=q3;<0*n)(s+IR9G*v1e zPTU1Yvr)R;DKMpS+bNj6?e?)?meOsbVD^q%V8JYvTXlidZ8xO?DN47V0;xN04FytE zZpS&Ihqv9_3mPfigeMTBQ1OW+>uF=pud2#>$(`YcH!^N{EAADaA8c7Vj=%i#YeFuP zVM5-Dc)TRYJNaPTUDWMQU+p#inT0J(=34s?Q|)GYSJ<(TQaz}AyrA!L@S?asIm`(& zy#$`)TCCi}?y)bxq)@h(gILv5TwucM2q`lp*QJaUe2VMRPzt`lb?GVvpX0g&OGz(r zVW^}wcajKZC=MZr;gU+PwYaeL5DaYCN6ln9HtciE0w-MvV>;^U$LjH>jiau(wj8c zZD%w6;}qE|>m2OJX#|i@5-vTeT;fz!h z1VM4XOY`7y3Uk~t|A3W;;M+>p{EE7>YCpxYe#)ld<9ud+-?TLiDYH7$Zz7j3?HjE& z-o1*K@=jYbmx9AgzXi_%$IaxF%^af%0?YV#g1}eGWjk#xP>NwrVBpe2u);i#w!H4E zO$e@_|8>VnxO#lr+3+cyl-Yv7xa$ni8ls3davUQFbm9XE0s$%bqQLmT%=}A;BG|-n zjUX_Lw`dwVRhMGu)Xy{gIo|)wH)Si|V-%*J2l-j|+xz6k%pbof9im*-g7?l*1e-aY zFRouVF?%jN8y}wqZV@Dqel%eP!j81-+lah$%=11$zb9;oFhr}o?!&?ine=z5eNsrpsfwpB>)8~00hCk~92u9uP7QqKt0-c1G zZQ_FMI{3d{IL#I*g+|VXq6G|RY!2-27Bu{f03R@Boy>#frVf|Dhv?J7( zDOWo9YMfl*hkhj%zxLGy3y#@G3I6osK_y~ynH&E5)G%fA2tEm}628o353*(s8eGe0 z6+VCrt#MY4w3Nk|W+busg_j982gk4?E9x@mHm(e;Gm4evV2*(cA(r!jZ`EDqikpN@ zXK*RH);K>)V4!FyJ}m`2VL2v zn}-{VZmq8(=H%v>`*vwVziL@EQQ8@?nqkNfKzTj;SDp*1AM;#%#Vl=jDeK2m4)0O6 zC&XTZ2Zb)&&H+>{C0(I9hk+XQJAg)xo7W3$UG1!9q@!z-yPd?|;)h0O9M%t;aEp(p zVS1H#A3L(PUNi@Jhi68U7rR|9;Rt)S`o$V?@=T5Jj7jQ=jlakIhR~-MJy5gDG$-2W zF9%zB%;|@Vk0BA3R);=zfvnQEs)8)7GTyFRNmS2l9GLvB(VMbLRH&W>k=OonuRDxW z4IXnFAsqRUgd<#ue(-y%gD?**?ehyR?h zRC6mE#sHsV{teLKdhLi6Rn|wF>T0M;>0;&la3D{Nvp{sB%EOLu_y# zuW6OR>jAd4_L1kSYISO-!rYiy`7qWx&IgS!ED; z;C<6t!|R2sTAz1NV7||$D-aQ!f)ZlAyjgh3tx)}J(_O#G*4jN|lB~1_M609;W}H7z zPP)DIu8Pe}XX6GtnajUg3;D$2-^Ry`v>kt*AUz48uqO#J4aoLRq}A~r4uiC`wIZ!L zLOR%5-3g^N&yy$kTJXD7aRQEv!}Hg^C0_@|AzzCU9j_CbzBY6@;JJhye`9K=@ox@j ztG@KGcJbqjgo;>NtFID_`?p@7@;kDv2lzXCk;55_zb+TJYA;>U5}Yy3sFJtyg0-*Q zEl&OvCp==d6_=2d-E?lWVVIb@8?tRT$adL-BtPE$it1{X{ym4 zLd!uvx3}&2kFSo&Q|5-2bV0WI*u?eGtC5jIJozSRlLa3t9OF)Z^{Kl|*k7rZlg5)s zy8O3zY7ob*{@vr=-P*l>A4i|Bebq#Xsh4wzCat8X`bv9@xKO}*32qv*DHAB4mkfSO z@BykfSjDW>`BUzt@vK!C6776qdW{QZe69ffk~m(Yp-LN=*tZgZ{KbFPp}(;icZPER z6JLrUj~i4@AJZ0kiYhflUt$^*QrO@wMJsWZ*ZW^>B-Ww(VqCfEer6uS+^vAnl5Gp8 z3+yS^{D=^GYf6x#2+t~8gvF6mQOx_%r{Aq&$#lWe0W&Bak@(0!;x*I)7mENzHLYYv z8|e_ko~mJNUJ}jI)*j|PVuMTTrlK*rs0O*c5;D!KjJ&h9prG`8UHVSO{}y5qWG2MUNdF|6 zrSp(kZq5<;E%K+Wkk_@(jC^nOF?kERH@b@a-)HFW(A)dus=_J56_d^LgN65Ia;TKRc_ zKAP?S0q z|KUs7eh#KWvGKidIo&8iN+v%W4~NqL`mI_&o2?n|&fw~-6Jt_+%g|npDBI_BH|QU; zL&b9W^oC`1kh?)E$^O5R%m3egubg~o+9>8{uuZQ&md^>|%glR=p5`)T>gMibrOFM! 
z={hQQr*{|HE+s{9 zCI0uC+5CT$?sMNMu1P>eka-nKJmq?@Q;JenOae$oZ)Yq<9s$o_56 zjpZz5+`TrCZ$9HLsg=r523bWLX5_a@i#4*?UEVHqr7O8Iknd>$UqASIX{^Vp1ck0T`n?$h=$!mZ-_r-F*vTW4IbNJp}WzqEXCw{&N328ik-G zm22m4EdE7%qyo24d%m7EXUEwa&8EjvY%7;$9E-gqOd)bV9ui}%w^f_|+Rdea{ zmGlEh-;DGZQTki6wD-WIQcO2D2BazNK8*B*RGy2nbow{F$M8%_>NzQfs};*4f1W}5 zyJAM_Ccd>rv~bi-T#XG1}hXj>{=024=nzHtkUZQ@z1~L z-8n)0^EJJfp-)Q!t>rpF`~zCAfaV5mOhBswEw(z4#@0W^=)IU3_GXx6uN9&16sk7T z;}5z)JINa-c~>g(1~V5e&kjAdT&2WWzZb|0BhIRaJkM21oV6h{tT>C-Ui-oC_|=T` z5b+RC5Oh8JgDk4op*@5 z;7hn>Mt%?EGCOxy)coHANuMwnN-6{AEi`r(f>!yP8R

    ^ycXFc*QB)9NkH=qeBxr zlB{OPDub{7P$X+LWIf5*L|$MVN;(^(1+5!qq=8_vp4FDZPVEhwNJo7xr?%U0Vz7NE zf7RQELiSw_85P&gus7!{$DAl!uReoQ3ke`Sv`)f4QTD~Y8Aq}_kmW>KUJLymA-%nR zU9kD9eac{{6TBB+3|jLIGn(%!<-poFtOi&ou=Ogeaf)Jq87Wr5*!d4y^Z~ZNzi+x& zZtVMql0Hf$V@OG_#>Roo-k`-S9`p+iiwEZ6uw-E49A@{>dpugK5ZI^+WAA@K=Oe(_ z9iKVB)L$28Hbg3aGH_X3>A#(jQ-?a4$rfXO8N?Rpp|5VC90T-FSR?l<%Lf^1IS#Fy z9LRY{R}T3#C*+Jl&Ob>`b4I9f1u>iH;b3ct^}A5N-_B^y{{w5}u+CldE}$0c1J=%A z!@#;Y%mb`fg;70X(Pjg{#;6`&9v>=}b)g=A5Y>ade+hXNH!AaomgDvHlBHe;6zhfL zwL@OqO*7Ic$va0N&oO^_hF$cIo+8h-czG_!E38%Ief*0>%WH-_k569H;^mF+qIcyK zd8aR4o_!i)$}Ka}7bGuUATLX`rV&D3{WN`_nvq_he*Nxoq2Cole9$I%FO{8t^}_Ge zDRGR$8jql0y>I=@16q-Q7P}kct$>ycTD5@Y1g$|pD*>$)v>z!nCxufAts1m00j&wN zegUl$v=ISq05q?FHV#_MZThsBZ!jO$p;7y!LT72WDeaw~zUcO@hP(lvyr{*?>xaCG z+ZFrXa%?Dht3t`MK~HgaD1N<0ATJERUivNk%^g}lTm`I~!|H+caaaqmK@RH#Ho{?j zz{WXj7}zX_d4NULYq6+VdiP6=P4YsZOn*hq4{qL*JEvuM3kBac2V-v(fV9M%eK zgu}XkjdNK4EWPif#YTWd-8qM`@gF*f1NJf1qxuh0AG%oQ{W-ev$t31@#Md{@;Z1Y|vk)nFX!+(HZI9`FukwH~+>M zWKWtC6H?D(ky)Q;fULnk&PW%EWU2R~xy;G>OUpUYv{pT8G#Uz^#|xH0kJ8Wd$Lkzb zoYibHZwm1)VVcEg@SYd=U7-NDYqNIVUkV*pwa!R~P+b4Q&$#tAwk6V`?5WDb?|hx# z?CsQ6$S!$&M*2YgEhqanbDd@567v!xt!lFOIBobFK-$5c8R;y8L0KosM^^d%?>Usl zi!^a4>vjG#RGc--)%$M}mKY8Ff6}gPqyp>s4rQJKBlS(tK zB^?p^JD7`FOz(NpY_>19VX4tD`s9qXr=nv!jsM7F1Xw?n>Cq#CtwBD>twB0{-&n>V zxBRIY>3jY*Kh?jU9Si+cfL22CPd!3i=aa3r3)EGX*T0dz1L zNMHE$jPx~q+uKi|BXz&Fn?Bc7QLg;$svGh=&&)_~Q2rkz2YjEULOj5&e^WWU;H!If zMjEHKYEKT;R^(sy&~K}pX!{Z8PjGA|c<+^R)lH;4iic5Wo8w>qgeT;_Tw<1%nTF6Qph>WZO$f|uOGO}#@!e71{8ChkJl{z>h z{lz5qP3GH8TxVSq;GEd^AIKUwC9wZcza9Xs_JbKIS@D5tTRYolU5af+L05H1TS&H9 z_M;&4s*3`)cChdNafmlQ(Ds``V67i;=l_5We6slYDfPP!$ch>cf4)5iKKpQF)@aiA zLwxb+jC6x}9(GY}G+$uWHw+g4L00Jq{8DHeqxslUqn=v@tDD9@$SU|^MmlzWvW)se z3=N3-c1&kk()KqRlD_=!pJkzTcccIOoyO4qABGx(R$~k`4GM-(vV9A5W%y9B{fB=T zmhIEpR>lEFgAM#2`PZbBbHAx3uWJ5JJBgM5$&NPJHA@L>J*@_E(ofR%W!j6^1l;vy z@cqlwANo*_$|Jlio#m?^sK*y4^?5pE^&_KtF~Gt0&{@!q<0%SX&;0nPf2YO6UQ@mD z`-e)OcLJ;6uwr11YMB;jbH5GO%k5 z24y_XLkzP)vv+$0X=;(?aFr}Cuohs6DlF!En9l<{ScTbu4gFkO-ZWrQM|$NuUVZfk z7I&n!uEoGofu*YJ53B&#q3ZerD+Tsbb^U?W9H}i|8?a_z$qHu2UoUK=3)rjFhWjO` zV?d{B4FMkLcq8XsSsY2RX$*AXO!G=n)UUogF!b6~yU~1dkl2*U=Y~uN{7?_u8-E~w zf5plEUXUfP$JSGOe>EcwQTn@(-h7wI&w(Z-nH@;e{ErzaiOxa%CP3yI^`28&n;QB) z((jy+ZX;!_5PgR*{aV|%~jwE~Odur6S+z@k-HKd=}s-3YKKE}a*cMU{?hGzR8l z7?<~GYS-~DhH7(JMe;E+C`)?s{y*>}gJ+pab|J8&aqZdhQecU|mMR#HBUK5qKE6-> zZCy~C%eT6&56?H$*itBr#D>jCzt(%w4rD0WCi(}S`pKFQU@ zCp1SWgsh|qCH|7vnb|!GxyrfQD(F5P^e33#jSqc(0KGRO$oYX}rQI9g2j0PN3w0*# zdi1^izL6`RDKJ0>j-!n%?6DELeVf|u<^6TP75q-;3|N9q&My*kpHj#wJjN@19<1zLq4h_y zIv~p%Az21Yd76%mjI094vZqHzRwHDML`c>MWYr!Q8Cj_Z!{7fRGP0^5t3N`rdLgUg z_{hkLI|SozMr35UA*(Awvf3ca{maP6@$6`U9uSwoOD9wAvt zhhluSM@Ci!WTj_DMpidu4Mj*+%#Sf&TM-#qMUa)46&YDAkkuO@S>upZ=7@}}jKdIr zXGca>Eo5~>NY(&k73D-mmMzI>n2nIEGRSJqjf|{L$Z|L%Bg=BQ(J&ezSuV(`%ZrSx zCdf+7kBqEQ$Qp={thApP4V5b+BdZ#+5(*+Cs}HieBP1*Sr$$4`s>sMHfvl+2k&)F7 zS#1%LH49m;HIb3!Ovd~$Lb4hltI-u1S;LT(u{JWYl8-PNh9e}a60&O6EnXJQ?|L9B z=_Ic-8%)-xf%C#y@Qs6SC>YDdF=Cfv@0AE6y(NKPV z_(e^MP0Oj@eh3M z3*d`82IF7Z0>(e^87^MH_y@k$1@Lu&&vnTH#y{|R7r>W%EXKb}7cl;T&sn~J@eh3C z3*Z|7U*l!r@v-%vbc}zOd)eI@LcaIs_r0VDvMg6DUY48eAF_HO>xB7yhp?Y5*h3nUD)vxzDtMD$ZgyRs;eWILc`JrCDS8rT)ZA$FS_Z#We{)}FwUuc$3bFTJkbWHL|4n;v=>h<~deK$qmvWaVt(4E3 zj2(~hU_{o7*p}$J)HT-Be62+>|CfRM!GEUKuc*(-*in3qO4Er)pc>H#%H}ux}g>tXpp27G{D}?u7LB;@NoJ%s^|0alx zU+dPP9sHIr75>M=<7ele!Jqb}a&OYr!T9OG(xbZPX{j9Puz}exm3zFhztOh?!98BY zUkUz%uV$p(#J`__pNjBcVBe%Z(+U0|@Kd{3{=PfF7o4MVqwM}`q#r~2M5_0;-DBV68^ripOXy_YNeDZC(!}7`04wc{^1OHCqKO-3bN$5<^3h$ljhMb1K%lm09 zhlQ*cI(4P)ZhYT;c2Rbtq37>`XCs;YW8m%myW&q@^Xiv5i`jp!(EcIA@()FC_j!eO 
zbC!#X*3g*u`-q@9TQI^_M1 zx_|15YVp7@#~L2Ixfkx_>GD_sHq6{4e!oFZkQQUrYR^>0o2oT2WulSb_P| zzm&fG#;&mA(kk%Bf33V-^zbfme5MnZk{xz|zw~RxuFH2V%B~aHIA%1|LSDM!`xO1J zw%X3~@84woKMV0H(w#;;QNik69#C(}rwsXY&m*5Ef2lKvxtyEBG$XGHWiPJ9HWgMsXZPPB zuZD5;+4>T{>{-Ze^~oNX3`;gU|KUVDeV+Hfdj5m$e?q>larwUW5c6}oFYb2TR|GH@ za+4o~EXVoZ-I@aRqY|`%VRA+~kmhSu6QT0SrSmF(6I?b+YZM)j*YBB;R?r^#%!$x3 zU%k=XufIm&@Wn_`&_mz!jFd()_Y5g>wY6MyELHYBMUdC!Rr<)g<6-p?w*M<1@$q?! z-*Za+Z3wdL8q?iKO@=ku{-r`Bv*58^jguKMC`j468tK_w>(#GAk7LzF z;bn#F{^M0fgRRV~IoD_hR`-y1(eE0P4c0+cYBSn!K4%Kc&6f(_Rr#a%tl>EFhWNg?4Suq!QrPoBVa#XThrk8XBM`yH2f3+fujS(Hlqw;yL^r~eZZBlJCw_5ZI zOzSaG&|@-u#AkG-?CF05j91yJ_d4z7Hgn*DOdfk*&bb!z#h1L&S)p}E_g@`n)Ez(& zQS^uTQWLUk#9uG+Z|H*Fm-zAz`X&xR1^SiA_MI;*Lwj{z z>XlZ}9dsN29x8UrwRRiLJ1pXFEA0N;bC4RWy~pJ!#C+louT*0Y*uFM)I!8|!wP_vXj6=>Is`0fM*ch;{ zL@~cpW7mg^#zbn%G05uf{_nRX`G?Yz5ud@1{y*MKWB0$qjvMauN|_YHJ@RF!K1gx? zDVGF@vzQE@QGJ*r;Hs~!Y5wcJ7tPN zOXbp+11r!;Pv=JKKy!n(!cQw_=SDM?J3rb$uLnKbPp|an)eBn14zKq8b71AbqEy&8 zurgr#DH!$Nm{aI}8n1LG#R73(aBrBe_ut#cjLd(B-`sDCx7q!lMVL=;WpD%Q+c7tv zpm|vZuwGypioa05+jFUEUPf)vjI=|(@+xnMvHMSej{rZ2@U^6iReoIzfHn%6@BT>k z{90n?H_7jxoMtq{zpgEJJg~UewYp0N7RzCFU@^edXPpayMR8auFbjuO z0W)w|J+RpwTC4?_m%}=Nc{r>O*tiN~-@nk_W5B+pdOAN3J@(q^<#h3V?Ww0D9!8nt zRC!Rlw*q5#N=|+jI*%ZCO0x6s$iw-Tc3#>F%<3{r=fJe5@XASBEL>^?0C}GfLH#$n)hkY@2f>r zZ=^3n`tvCN!#@qO-a{Y8J`D_?v!hSe`^ZW8Hx*;P4Lx-kHEqY%2pcVhYS1giE4I7Qw9Y@SjDQwD=vCcEa0y+?d7uAFvi+>N_p^fHe7UJil#a zvv0}y+Ql-89h1&N{|3*kqS$fntv}j8qyJ_YZ1i@y5wrO4yhQhQi%b@4wIJ`P&%DxF zfBTtM`p$lIgH`}qzM_96e#%zu2THw?+f+v~z2NJJ*1vx*&TTXl?Ymp^K4=oKLJrFS z=Hf6HumTP%0p{eeN?;CP>anyAn4QC#fn@+w+d~JibYM$WSTC@KPrdS+qY5?ztoc*z z7C>C-uTpfSR7HpbU|c1uSoI#m0a3#{>3zA>T>f9cR$*{AZpM$4^3>OPTgwUw&g z40L>_zq6IMYkN_iu}_sgy4O%ZAEh*7NE82=l4j<^fHd?yGU^<(2htp=-LI+MSC(q3 z;XVlE$anWR!0Y^69y1KSyVNi7U%@2DKS)#Zxpr<>1O2d%Ec2pIwkEkW zTSVpdhpY;feDZZ8XzOxd?B2y_O}$sb*K0P|(Oe=9b})?e2hg{Wzkd*FF5#rLrMpBA z`YPpepNsMN3vDb@aW3sA^J?y&sRh;ojJqFKd;bi~D=AC8X*aq^3&`~zLOr6sRO+4j zLC|`W508QFDv^FQtqE8|)|=+1*XqwtbEDmmS6Bjh|3u8W$Th%=1u-JoSts&{`+6=m z2#7f{<+lV~<|B-HYjz}K7BcF0E?fp%|3~06Yuz1@I|b_?Yk054%OV>XK9{~9d)fQ- z7U!VWPo=8v%F2@SjyfOyGj{QLlRXtfueHegI`WYpy%%adl=_irmmnskv9|?fOWJ$) z|I50KZG+K}jIrVWW8H@8-i-X);{Jc?o=|EuwD0%-Q}+U;-Mi60S5S=n+&lAGv)Up0 z?#R~vP@mxg{`JdL=1PudeR5W3&C zo6H@{g0Bz9Lq}ca-BLQ~-26@`nGQM!@qvEB`^#AdnXOBAOT{#C@0x*7a_j<{%&b6e>@|vEfFmhv}7F` zTmJ&h2AXeNXYZc_Ycm9vlj5eD3t)&y&*HLg-VTV%h`(9MkDu33%+>{3Drom>V+QrT zkMbV6Tusct+Qa}IHGp>?YUf+t4n5|hP4>}mlPuruf2i>7H9tOf|10ztd(dunj@|P2w?ymD zZqws|eL575tmV<)%l3Z(Y?vPNhwebL+z843Pc(NRe~$!sL)` zB$OEqy^wP?^_7F(3cWS~+ZYZK2h%!HIb?bc*)8Rf%<(sal#LerO=nsgjcXv+h5u&K z7l1sS2#lNomA4PFM-SaCJxsD+d^2=;A#Y2NeNhy{+AcO4(vo&d7e&!|i8n*}ESGiI zXmVIbjRn?m6nYee9yXe96MtN!_NqZ1_SD_dyVPE11k(w%SEKMuMjzx1rYd%P*qi!x zOs!_}j?ufxqa;-RwPe%Z~nFdn?xmyv=kfhpeOVm0KY^ zokMbUxbit%W&74c?>X)-$LrU6<1Rn7gI7 zNT#7bL|ZoKwk2Etg6z7q-Tw6ncJ3RpYk@sWvj300_l}G5Xdb|4pXUaSjsl8+3ig5} zQ6nZ6L`7^76O;J9rWiGEjIp4xYZAMNsHmtYYOps{R8;KPupu_=U9lTuSMJc~H}l+q zcgLNO^8UWR-{Qu zWGIKfGM5Y2LH)i0yo0lq2b@O{8$6rpTgs@^JM1}rb1jBwCWmAzySZN994Uxq8*<7u zwvOL+wJ{W_@1IdVeoeELlRVe4;z&VlvJLBX>xWpHs`SFBjU=|nR-UsRwW>hA)jhtU zHk@a+@nZT5aDMsF>bnk!6qgWL@H{2so+aGmLgH!$(0E@++?s@Yu@JcG{U3z;h;aXH zZl9XX?@7(gXRuT|kkWaN|8wcs9!{rp{;jhWca05uxG8z5&X(_M7~UMz4&?i*vdz{G z{0R2Q0#=h?4=i9!33kT<)`4JIZQuF=+;93(om?eY&Ijzz9yV<&O|9%_33q)|M3$}l zR(Wq4rcyr6&-1qzb^e>;YLfibxTdb|>ijpsS`%!939p(#hWa<1C-)}UUV@pfuLKe- z)dChwux%D;+Y>Cs0;x8r;(YD+WnA9PUbe=xPD@!4j37itpBk6>#VpzgsS=-YZ@Pd|QtLI29@LJ^Jsp}uqCL#z=J@d44zd3F=+v0EAkb3{sF!E=Y8FfRLY2AJ9 zsH*nH+|qmH1^1*G-@m1Fm6rdxbYaGNA5Q6XY)91I{|>)F-Nsu+%5SSQ&3z;57}2Vo 
zn63EmI9zJ~JMC|x)yN91=D9YeAIVp{CR^Fhd};R=eAbBk$pGu^AvBjJ*Zb(T*~%TJ znVD8FO=@Ec-f_POmw7MI^iR%Ke&90O?=Q&Kt7oOmp}FIT_7?%={;)odl%u#vFP+)? zZ4kjcH<6ub0&7h$|4rFuZ=iG{*q4;{0~1(3BfXL)utpABRU%kZ3s`-Ebt0JQ7!*#hFD>x8 z5^RtK-XMbYD`qa2Xo4kJz!C_SZUIXs*eiksn7|GY%(J-8?7f(a1gmKQdql9-7BJ0l z(!T^VeQ(d3U|lU>fduPR++2Ra1nW`Uylu3%La&>Vp6NR1Fe5!vSPa3ww16!lSO*JO z3c=c2z|sj8ZUW=IFIR^f%4_YtK8}YG2L7$ZGt8gjqba^4$6MIDUz=Rt)74}z8K>(& z>5~6Kx)@65oNS!Cwy8@89AtNnZKMSRl3fnXBOq zW`Bx*K=H@S%VC^9sL9x7d#=>|zmz6&L$>msk>*&bnQ?zaTUYPahJAF5H;nKS2=7@Q zUfw?8+r}gEhWaPG119~JZKD)|WfJULEAo}Tcd2%0RXn%ouJmrq9 zUGmOoGhWarLwkM~9{Y#gjO}YH`?bI9D)?+jgxuWL8kTzgpXgom)+v*C9QkCIE&s^y z@|l^RV|erI6{QdK)hTnh-&e4ao-4Ll{@0?^`%g%g(YfhgrT(e({*-1AwzS#8|2XQh0>(psY5)@)J#eP z`MI4IZL7b@vb{7aZv%EHPkFq3T%F&Cxk_|2G!|zwokcqfUUMS@p}KI56x?Nyxk3v6iCV zdq_0BDc{R1lR7)B=;esM=CAe{^xy;H_i8DfGR9nI5z{;(H8c2T#>igeB zt91pP@|&%G!{wP^ULMu{p>)ZVu9k_dlTNU!1p6|tJ@S2I)D+vgYCCcLQ@npg-GA-Q z10D;*qXgto9(>mt@5ztm$MSN=+Ln2nwfCf@8_Q4l^()ahntzU?tI2zQQv19Q!#r*Z zrK??8rwlRX^UgLEEVX;bB@~AJPZMZ7ul&!nkM=|>rOH1;D}iV^2mUi{?E%s1R86Pu z3$)qR#^vuj<)X>{s`k(D5J9y3YWz7Kxa}^ZbfYQVW#*w(f%W~&)>fq9xoa_D=4&x* z>w1!W5;kTVY@5=X^XXNoL;kn9gD28>T~qhB_s#Jb8cnp4-Xq(HX?<$mcZQhw41Da1 z3ZcHyI#!OB`0l7<#CzuZI^_WGFB+I)%8T_oEW)L)tnOoK>Ld@5tX^V~cV|d_s*9ky zI)&d;)YM2RsN5}G!{ryYY+7~vpG5v~J)OD_20muJ)ki&>#q)9r6qiVGXKkJTG}Bp< zbt{F+^uRg;EpzKpJ6S-v`h~8?eYU8v&GIGH<14ZW@38GvW79j=PlHu+KmW8A^E8Ke zN+q7g*%~*^&s4cspMnZeY26`OnnpS$)fTN*CTm{7@_k!tjBH2Lj-mQ$Y}D}|o8Cb; zFi&H%f64wrC&JxExCx9a1zTUt8D`$!)wesrcn=b#_Y2V}3yu21#=e3s`SulnVP5wR zwVyA^oyJ0AG?>rjv7H!0 z_4BNW&Tt;6i48uRneaJWW9tHtmU;Id63tG{b&4aeb@~)oc6H;8_4`knsbenJyMJr4 zRk&SWNiJw_H{URJ*s91{80MRYO%)K)Mwjm@ zR?VEo#`8>>6n~cD)ir^BWJ&ZlzJI{^eN47^UVd+Nd;h%bT9{_1(hH=zjiYq@7PY2o zvTARs_y12LeNFJ`R_IA@6?SW<#%~F-zUxQnyxZywb2U6`KhQC6FMbTgMN^#mZfe2f zTtBO|WwdLU-qmTmUir`1NjwhMjHUh1*=PG3$UZs<$t{)*1E{w@xzE1FcvOv~#T~uNCIrIHwy#dBk?o8Q!h8&>cPUR7Xpv z|CpGZTwh6)?johT$-MO~a7$8ztzN-4qhrL`a zJeX~`$G!dh$fhCDCBbWpmYZ)-8oJ-ceS~n-${zhD_1ka9CIx3F{p=5 zz3UuW+p_tFS*THUkOH1R&ZP9qD1Fa&ygl(vKE1B)ln*u2BwZ{ZEuEzC79e(tST^Ca z1T5;QQ@q(WI1}Jrrn>$~w3hYJ8SZs7w#{S2S0=&KJ*(f@;A_4)U)soK;lEzT3mKEkBatMi|8$^PoEQ~F!8Ia`|8 zoF?_g`8=Y0x>C9hoX?ni`BXB^hw=SM#<2s8_+1Ld4=4P+1C98N-;J-1fAh%S8KhG_ zv(o+s<`tu@sp+ix)+v;a-(a0mf%Cb&;vGI=8|!mr!F}`kP5ZpFKYNj`C_c^q-kS z%pzKG!*t3jTeQsYfBBEC9oTGF)g+@R=VzUA-Msy&who_@G1gY7Q&|4{`1_H}lYY@D zbJ$k;ba`R+_}aLht*L6KZ6jX%)3cR*#`?%xCkvLoF_}9yynjo$nZpYu&z5B8>TOx3 z>ioxi0hLDTl=bHM@=n4pO~*3E?L@fS^5cGNh8tqQjV9diQH7(MLbzG^ar4n-eVIkL zgGL+uUNDRguL!o6VDIO(<<5@m@bLH$%>AF_l|}KjxIOP*YQ3K|SACM@`A%nmVK3m9 z3&MX8n?a^S6CoAtcwo>c;gyL8rvBjka~kJLUpA@QI<%Rv>@sIj3M-`xK4*X-P)gCh3%JJa7EzyZIX~$wRGNfT`N!Dcr|N za95}5l=HUE)2r*WtY0ivX#3`CL5(dghiPdA(kCB;&D8>v<%e)v(JrR(AhvLAh0n7E z5q^8Zp9E$$=3C$NV`jc{hDK57I`R56lA?&Os9GNTU6XIqkQ1x{w^D(EMv8 z>69*s(lsks&$l8SX?-0pL_I!X^;9;u!62&RIn#}OBy7>&?jtPA(MtrpnxV5i$7;4m zyIVecy;nZzhFQpx`MpDo)}N_UdU2mgTv+h<_ejWR#5F=%Cr3< zowC82T|3>}mf*IYN_l1x{!`BLN6NFkExpyraA}H3f0vzv#cY`BJ9c91u^6qHC{L0& zk8)Jc!A!KnM;ys2P&3HOUU`;ZqZ zTIJ1wF}?a+_@u(o>qfXulWfytdoh}Dh7-;JD|(e@;JR9zRpm6PlqUU8q+$L2jM8|o zeY@_I&tDI94gSeubm4tbs{cbyRFA)qmgUuh(xy_{C`*6EU;4~cVB8+!2)A5Xw(?cM z^YZP;UzuoQ@>O-_tVOUtn`6okb1~Ra&7a39f09w{bvh-Q$Ia673L2+cxIU8PQt!A| zSM~phW<-ik>CJPdZ{`;4howqVKN|)<^SEubfS6R{+Hxz_8KHS&PV={GCYGni8#{#? 
zH-UtisR>o??rQ{GB%1RJbw`ZNJyK6>`OwVwFp_(v-8#jO*JYgxA%|na)?wGY^CicK z=Cd@NlEZd|Zcd?S!aM9TzRTM$NkGD3!+NjfnBMg5m_8`q+Pn01zBMV!!}!gezOz^g zZ{gI;igCa`$|B+;C0(Zsk_+q$yylo`Yt4~qy`nbr;HXYH!DG7ToOi6Xsn^iIw6ZxP zBv}Bzjw6>p6{%= zj_j4oI)$IQ<8y|lZK4(P)=X^^8?4O+X;DVaPMFl4rA-8MI` zTT*?p+{ucsowx5Sm1rg3)+uGVZrjd&M?0ZvmE|w2&{FG!@3hn;3y8a~Q(D{VU*=~Y zr7v8Ui59L&BAk93Ga!rt%?Qv^-7iiaUsqczyFCb_4Pmy;O*)tYVd@rhrkNnreNWUz zIy}@Vn^{h=vrNm@)Us?zvK;m5q+a`r+NQNapRW&{BPVZdbL}Q-6A{QyMX? zhO-K8SH}dktFu(Dt5oQ-BpL!Vr$93Tgwk(g?r}C&1w(n8-&w&I5PV`pl~z$8i~y~T z#EfN3C%!sl8{5^^xWCn|ys0fb$T7+*Chjk{E0$MRqIC@P$}R4P-^CTIf2iC2+t?4o zbNgY4u^+a~?Oao=d*>~ks2Q5~L+N{CH{?FtQtgIn8FPrQN_KkXILk{nv!FKdPI-lG z5U|ZbuXwP$j?OGdUM;QAYMQHCV~JLbi(Z-Q#Ah~UTDLD-x(=6kUBlw23(p&6Qu;^U zdZo2^%S*oX*$JiwvB8dV-Y7upqc_y~%9#b%Q-UOSwq6T)bE%z(W>!hP@(bIKqh=N~ zZilOP)4jufROdg5rvHv?<+2~q99IZFk2%Yqd)Qv@SKt5HMDYdKR?@*;s%Y?o1KRoM6iAYGd(vMO|S@p<(^Ad-#;K&4}ygmY;<>G z&9MKU;EM=ucK(rI2?R4eD|(e+=@xlEBiI28n05>KdloQ1f~8u}t4Xj27O0?*-NHVpX7LlVCtF1 zSJTbcddVp5EzI11wH-%LntqXby=T=-?-`4=agHMby?o>S((PFKJ=r4 zb&Jv~2iXSyW_m%suX-Ou$9LR;$-Zv{(aefA%B;flg6*t#oC9UOl}d!#Ptq%!)U~Qu z>tmSepS5z0;HLwuj9{w%qxwysY@~Dluh9u6I^o;+z74&4&x+RBnTJ`@ZO-!s_apAGUJGLkjhgD6TK(_tCV1WT0-t*RP zD|k+a=fl6UwULqaSj}DJ|Hhe*$IN#))mvABtLHpBPc3*|tblW#>irj#SHx`Ncw;O} z3gt70V8NV^+thd4n?hb;^>B%??ldHqm^q{;nb!R&g*nG#fB1}-DL02)BhuH zQpORje)IK8LoAS%;c^(XPK?Ko2UN}!YEu#1Mn9cmK7I@ryfZFuPcP@~xko_I620PO zSGc^hZRE|gIuWfz8gr`@{AM-33)#WuopwC#Q+r5VsaM8xeZ|KVbVh{l-4&trYsVXuH^tM$qc^6V> z{_ME_G^O+jl-Dlx?#9W5*;|!ltH-kri6wge>-EYPJU?EeKzc1%hrBGn54%G&{VGvg zW*@xnWb19l^vupxNq%6Rrg}#>-wR(mO~BbL`u~%A;njJ6s^e~fdZj9tY2l=T^%g0@ zgVxKGHy=0R4r<4V1?gn!7tZn}r+3dK0LkZBW4&_H zndi(V7Tni{yU1zQ&knT6Jv-2zXb%5aZ#X-UHPPnTfnkLAituXixrKv;$y>-f3bH)hiF&SwGs4yV?0cXSt!)Hcj>ZyF;Xp z7FwStiKe(mhu-?n>it&)%ORMb2`rglmG;S=@9WwgaWS4zm^vSOp)4LgQ znTKZtdzDL5^*7t*-CyW_*nl5yv~}O^VeV|=!hDu@#}B7zN@=_g>y?4#Y1n4}A}`U$ zPEB)zJoXHtG+impJ)<0~o`XMZ;hh@K#d3L5D1GhS`rNf!cWI2lk2!Xj*H;Sfoz?o(+xs^BV*Hoz>@?)OKBt;=57&%e?Z1=DVkf@mdq!a4J*fd~*2Q)w{VtUTlvI zBl$-X-c8QC_XNv(30Iq*b7A~N%wIa$Z`SUoj1y*ithoL!GW`s_`X-vDq#6Czxo=_t z$tgJ(2lL}_zU2t#D&eTM^6ya=?V+71o9etwd%ghus9t%<_(!6w*ge6#Vb#p;VY!Vy zLiQ=8v$#vDrKQ}GiGJ!)BR><~%@3C`i}^oloOAgx%2>BO)st^bQ$7Dcvdtl!NVd6J zMVZ<_P4e1e3$oUI^Usy2=iexO>*M63@VHsktjxK4FuGe5*)ac^A)x(9z1f|3(F6-8 zSP3IcJ^xLxFHiope!?=iL;N{s>XlDe24!<)AT`f@SXaIO^eDA2!uiQcpWzwby#0Je zp6XM5#Sy*-%k1QM>vk%ya#_A}5oYLb{V2adr}WCo07XE$zr6m&^CJcw$#px2=xa~w zl?%+-jy)h+2QKNAX!a|H+u+UA9!t>3N6@xk!F?u}WT&~MS0wHO zv&Nb0`*42Y<*hQ}eNz!!pM=jdzbX%A<3p>w`=>&kBJ#qlD|k#xq_knT^$Nc=q4{iF z{yrLFzCV=p(p5^cm(r+ji#Ik)6K2#)>fx%ln#pjFgEuAUaaXS-bGi@5nfARfv%dG# zqVF{^_^=%){eyd?qg8!4&U)Ydn7xNO`4y^Bzsw;TuO8?PZ#GvPXI&>V4;%CA7q(3o+xJ>eUwhtAk_kU)R{(7ueIvMr2dJckM?FlxoQ0H%C z`CV@uY&#)f*%Q*~+&6s3TJIap^6tGYX{B$3<(+>&C1Bh0zt0!X&$K35?l1MqbGrg* zshw7D)9G4l)6rmEvyJL^*=xPhf&Jg$G1lwY^gfqxdAO~mRrUQBs+$xkN7aKG{b*b3 z&NZb&*6)ODP3f1(Ifk>ZXUx*SU2kb;(D-KQn;Y%{iKg_CnjFI##HP_!?51#jt;lrQ zTFrMarE|B-QOa_^(NKMXeU54?Yi5tOwB0A=trav?PqQ^O&i*&g z3mffIuJ=fyGoAV9I@;WxjgZcnOfMSN{Rsb+Hb?2s_4mzaD>hSv*|RKR#{Hv-X9U!A z%`u$WHRttX(>&Ds4+wV=;f^-iG}iay_sn-Kg!=y3S>lh<*5kD8*gpAQ^^e$|3OY^x zn_G@jie*-2w9$TPn`=&~^7xW)ya}fw%R{n|Yq(s@^em`){}bis?Ve*;V^rz2&Rbtd zpg2E@J8A1IH@C~X7Vo0cz~9Z{oObS8QLl)%#3DILTjs6%DBEr3zD1h}QQsx9Bxz{> zBgN0RJgeSRa)M!iZj zlQ-&>yS%64zRhw*$lqJOhtWdi$)Du@tYVH5!Rf1xE=ZrX;t}oaJJz*TWh82y+1^+Ta6s$F4y_$ zQC9ZFs7=VM!>RKh=LGn_m!n`=o^Q8NFHKD9B^BkLtB8d zPpbaUc>$HaF!rHY@6fvsEl3;Zxrb5upbk_YJg?N*M%yxbGe?q}TenAB<*qjzAbvBu z^pJ4>?K@m!mfE@r9Lft1a(7&3`>vm}~eb9IJ_zCrWDm zVV9`C5UoUWA2P3fjWcauEe&TEhEbY`?uAdoe~Ka+FP1t0?2_eq??HwJH@*kE5h;Au 
zZlf1{%=d44sSX@3Gg79Hq{=kL+^xb$iSLoGz5l(U8`W#MmH3jvvouA1|LY$@jFfPk z1XCsY7Lc1ym2=Wndw}y!IOj9SaydpV>5+Q_3UBhr^a04IUjEsqaH!^kaz=bNen0wa z`Je{YNXe3#M}R06NWiJu*&@+FY;(X14sgriZ7yoMkk2CkmPo6`WmmW(<8U`vgHObE zH<+g79UM`PND&Jiai<&Hahfj8b%SND5=EbL<%q|wZYq_pwuw&$~i&C6Edup8ym}`Er~%;%YflLT`ZMn*})Mz zRKM$gfPCeAj^+;!(Ai*&DJ?k){aDY7Cpi zW=gPBe(ZcB1Qxl=v5jGUkxz~_hIp?}5*x$95})J*!)rgJ*pa0^Ne_lc0n)675K~5) z+Yr{2!Hfp5vkVgCXc>%e2v^D=!MBvd4Z$$BJWdFP6XkJQFf6Emn}Xp;1&nM04=Zkx z<~M=ORd7QSxLXakH-XdD6=`7z+g}}->Na3{~eDWj&&VD3~4~CZ?;mL+Du0AF= zf|>P^Aam+VL^Zv>L{wvWWq5xu5_4CAF^kZHk(i#`7^yHz8zU8Cry4%o7^z@a)Ud8G zB^n#T<(eA8e9lwD>q0p1v=Dq00w+R>5adk=5+9?Q5PDJ*B)*mS!J^kfP4NYeVuBqW zb`*#0kis`v_ABi1xq~>Y#mP=0+5vTrVvHjmae^t144>=BkS&gQ)e+Jh8Rvo{K5~@r z^R;|YPPoVkesjWPCwYw%0S-7Z@v}~bD*f3|r|~gDj1i~Ab!u&gHMqY7#Mom(30S4Y z=e{t)0dM-i8K>!jK)YRWm9L0&4ArQ;HsyNqem5Qp5gC z-6bfib0rw_l?oW`gM>22m&40^nOc@F>U`mm3dw6{+!m)ubHyc%I3yE^M;d&tf#GP{ z7UO%0F12Pk0Bm=}(LQk088b`34j0^50#e-qNQJoL4R5$w6ytp1s0U8-g%~d!?JF|9 zFvCYA6vLf9BC8njDlU67Jdt@-N-%hf5AN~-#Rtdwnzntv>7qZt7#T0Qz$)xXs>dE* zy1)bNVnYXT_NLz&*a+GfzJ2P^@8F0LGQ1F_V`bPRHKXuk*%0g6fPUNfO%||37$Mw$ zo0l9ZV3rY5^p4=S2p`Cpg|I@CAyjDj?+(PotldJ&K+L!v~3-E9d8Fo&scHD*z? zE@y$$1au;V`9`Ad{pnXH_KO4!%+ORcz{YUoOwo$6dtt;Y{e2^V%>tLnuw25iGTfDD zz+T9-_Zu_t>W_ycctki7>|@<=S28I|sNW3%b3`WtFRTK;!JPv7HUT)2Tfk5i`bI#% zE@Z13Qpl<<^1>TlmoP_%%xX!tbHB$Gf{vp69f$Jr5-f@Zqt2y>WZn1d8 z`wl#QX*l z1vg8K(ZC7`_sDQd!bA;hX8HbZ6mrFuJ9#lf!js5N=@~+fjPnsTYjBz!%w$e8&`7jl zZ{P;DQ3E?A%+$a`vL-aJSjI;hc+K?IY7CgZKO#5RkES5pkX{n~7aClN5RLB}T6mGs z1WOdSU%+nT5W7}%n?%4!3Aa{--=v8Y-Xvp2RhX{{0mvrvrz%Xu;_+2s8>Ud?NPC;Sg7trx2TA+1O=fW<{IBM_bxJwhrp%L~T^LY6mP2!xd-a=7u7#%ERGL1})S zT`7mxszO}(5fmo#DY7c0RpyC;wY=TnApg6_|Nh{ABiXu{&i|GLl0G{ah?F0b zY!nzN!BJ6+ayM3u-vE3l!$k?#XyBeCM{D4WT%5uqjRLAMm&9mC-MhnDJ6Ns31UuNK zk&RLMN}`DETGAZ52&L*s3Dzw?hIGTBV=x1?P$T5uQt> zcO%zNEW!@C4u!91iW5A_SfM`-USN048P?cinhV({Na3gUxX}e>XgN%y&J-6&(|$+c zoentK1DN`H$keWJ0>x4dKG48cE<~z^!M8d16!8_B@Z>%e?+chF+L208jJ>~J zFMNbB7_T9Z6%;%w@FK!S85Ip=%D9c|Ql{`LjYQ#fT%%`@BW)0L>q;5|hJ`;6aN7u~ z-VLuJX_x!c@(pZip zxfUB8;haWFc7#aky^b&qrECXyfzJeeY~VI_T#FaTG;}CVfF}+Th1WP*$?N@%(hh1I z^E9~C0d{bMd5Cz)0T$cw8pnDqE^~l`+P(m1*}=+oFgD2U74!lYigvM5-stD{)pulQ z3Ymt#QASX)Dl|0nkbgyjC&Gh<-1EkmL4$whaUe#z>nNbK^2DOz0a#W_-VodZUB zL5d^Z_k_hx3`um3kqP&L8=m)qX!rIM9#aGlc){W#?i4;zgpn>6?L}eoZMJ&BSWl$z zHcy6M^Q2ridNF|Vo-96M8wE$W8jJ|yIm-p+NWnlBfjr6;Qn10F4HF9dV2+oO9erh%1~i z&J}h#^DD&DT#3pO*Qx+lIrP-kN}bf{i|e5(==pK%+|mNiG$ISe3%9c zX@#JVT^cm%wR)oFVu7!r72t#yrdNO`UK<#+ zy*O^J0H=$;=8rq%epG}=pSl!&?88}4^(~!T9u!~PTpsTF;jQwJQu2r((7n<~NE`h< zi25ae4qptYOz=5nFsnQqF2j&BWhLToX*nj8R!$~(h8lidj${#8o>|>mo+&@5zy%y$ z5vj-|kg@V_;!A)S85g%EMRHS~+Zy)R)4!8k3BY6VZUc&{b=;eZEP!fi*M z9C+@;u``_A(^`Py%;AMDo)<%5mMe$1yLzq*g%xfb&UEv<84A1IC&~2j&;!pkgAJaT z*$mElqP`hi^qMH1HiM(Z$TwR>j0ut(~SlfdtOlkqcKg8Q%Fzdr3 zGJ$jr^vv?K5WF1wA(>NE6lr;p8tQpzg*=)D9y>SV>_yl*>aa6p_jL{75qFg-8uz4z zpm2XkK=)CN;3bbsadz&j8`9Wv-GwZ#;!)yCeb`Xq>+|&?$*01Tk6@-BQh29d@tg*r zD@FM6rQL5egbDs^R{6VcZvc-1YK)~E$`D{wS)9}m;>+TUhOn_L&T9yF%VA6-7+2ms zz5y(*fJ+)eY6XT|t`L}2A7Uy>1m9e7hoFebm2qwZcwHG6H-NcSwo9uTz>8|Qp#ely z$H+#ou{!Rp2X|`VqL1Km%?jrmz|8kKcE|hvyXu2dhv64KU}lHcE&hfW3X-UZ!|O>@ zx|n(rl{}%IM50(xPa>w*)|03dJL@sc$sc(E9BzpB8^X1QB@HU2)lby4r3K=iJ*47? 
z^Xy@SJyQ52k83y8AYttiiii>y>%uEJPGr@En~3x=!wye>0PE~I{7UPS76eNjlQi`4 z+!Z$l!6~;FoMeK>TIoR@IOElqvs^jaz>PnLx=_99BIMK?kjWg;&LLJWWb{(cMmz}9KA9zvl_y-J5uFK%~ zAiNm_YlD!;?XJfViy1i6uoI1gI+5qYdR0yp>}HpGEWsg$jnDO&tMT&x6H)4#1Y0;F z+nD0LZ|2GXWWF7Agi9Loc(-H8nT}++ttb4g_9fRl!UaBtw3-L-GsaQ8e610b=!%Rp z?O~ZF6v)mlxq`%RNABBVmi{jL{~F?lBMI@gikQTRQ&q(Ae+My&5dV;IsXe4H;%qEQ z*4q?D+`u@;|L+n1S}mOwc)*#4*zw|(Gn_{c`bc(oDWYB2Du{Q1O)h8{H>|vt@ z)7Y#Iduj(Ub}~VB*-bIbANl^P3)w*uUXSY`TNz6WCDrlV!i|Q zR3bi86DMA2;V~x3DnelYZ|`ZSyW>oRy^=eHA2Zr44UR@~DJghbg9KGoXCc-CDox`u zPm}PFhBJJlfw3CAu7M>QnZo-t48LS_V%=-g?`r9ROg_+6qhHixJ*QtJZO0Wputs}@ zxIFK?RU9t?$6Ybg2NK-ygb$o^t3!}O?s&`xW);Bu1vZpAf`@mEWT3_ot_Yr9`i-2U8Zc2VdZq>}*Zg})4Oopx z;d_V|s>4LPx)hFaz?0P>-YMX4br|c62dl#&XWU;M4!YpJ>Y%vbp6amQbuv=)T2I_o z9iDh%WOX>;g=4G3h+_DAHCSD2y-d+beoXsW$vAPd8tg8OH>$zQ(xoVz7=Q<>K}G=X ztp>4WaA!5RSVkuJ&a$|v8vIrc*Hwd4h)Gp&S~YlD1)l}NA3VEoq8d}U zTn#C5M|I|fZT3NU&5kD%boAej!_SS@ix?l|LlPl$8HJ!IRvHzLSJPmbQfhWRN^24C@Hl!u<2*5IG> zJk};L9n5q>I^n4ju*C`2mViypk0re~ zTr7$dky8{2>!cUH@P-FGVSBy=gR@F7ShaiqCEC^?Cl+=|@@6-7s;REY`E`dqa%&nT z@VT_k9TK@`Um?%L4aPZ@C0E;pp;KM)w3|5YMsc^?aI8D5a%X_a9CEQK0`Ew0QlbpV zO{CxpqxvuTOh^x9GRdWrGHj7?rwqG|bFJyz@8a8uX0_=1q&O;equ46r7!(sVNa2+l ze54V>5!WG1r$H6v`N*-Wkm1{ne)?U6XU0jupY!_VU$u1Mh(EQpEj zj8FWR5Jf)95aT5=OTu}QVMb9HeT_Pz3OB_srAS$f7o>;M1TK-qd>Qx3?0;m+?1`zv zl6l?t{yLi^c~iz!elUX+kS}aTO!b2cc6i$lez%kNl!QYL@|luw-2vZ}gt3mk6rSdU zS4zT2=hEq=;9?P+;0Gs);vrv{=z$AM!CtSP@xCy-gcM&2bUxvv*8Kvho0J?vP@cl= z$HRfZ^%`p9#Ywah&eFhSO*zBd<^YO4M_!LT9G38b9h{YMlO1f;NRREs~B{LuwX{*FjPI?LTG8m?xZZgsfnE`BX z{6Jre)d|^Ya%)nKV>Oi^FM@`fEY2cqy$t7NLxf`w+$ta=m#?-TlKVc__*IRwJSgK9 zgi#v2jIc+8v+Q64GUNa^uXD)YSUU!@Jig{(VkL2}f%`I#5adBq_@Uau)FC6uI5qfK zKA^>fzHr(BPkaT7oG_^mtaHX?-@-f>Oz8uOuK4_0h;zebePD?@?(7Uxi{RP5u%IZ8 z?*cPCFtsl%^2C!}AkGU95WE=9?FuuB<1vDJvn4u7Rl(u8CNLFZC|n6j(o*3oT=Id-?9iFeaphz(wW6Jx(f>| zql+r4t}LYMU0FPd-?3n_zhjXc`40=Xp99LX72qN9^@DP~cj%kBnB=y*#FtsS29|U7PaQ#q-^~8*U z5b1?m30@4Z4}`JBanDeg?Ts%6!juwtatOrxVALR(?2Ds@!a_fsHwb2z#Dzm(bt!z( z9~PI!_#v>tAIA>hA@%-HNG*f22f(JX_;e`jFNdoKz>e~$9||Wb;BE?6#FazfL?t{u z0MaVsg`sf13f`k|RlGsrKzuU*GOFR>p>VxAP8|ptHSi>bYvPiDaQZ!5ITY@_k6S2Q z3-v?bUTusY443QR(;@KW16(s0Zq>ybLqHdV(}uvqdU$^bjQj}q42Bo~-!1oY&IQQAhhp3(-RG8j%Z#tTE>YzSWY4lZ_Od7l4{!`FD>@Av*(+z|t~pwkC% z5tk3(!fhYG#X3HK3v_b;7bSZD7h>W-W`F)bW_-gyX8GVi7WSoqEat}pS-@ikv1n%v zVxg`Y#MQNH5LeWxL0m<52XSS-9>mo6Av?)Zj>j*CsRfN_=@TbXh+N{LM}z z+u;s7*lfq)gBE)lO8q8bj)rv9KCu?zrKS#jBqCB+oryN)cUa&ALXr-P7qTD&b%6%v zn&!tlHppXuYg+ux0aDc|()jm8g()(=KvI1)29n0SfI`Qq9wWxu!DQy-H#@U2ExwIt z5dQN|!fp~+ghG0$IcnMPy_+^!KvB%GuXH{_BO9JPn@G$O|=$lcE~v z$JCsEXM(w+f1BqlHP0J1^HjtMtlEUQ6_xk&hEpO?9{~wc;L`{)xdP++z&1H>S|5ng z1WxD!XElMNNathV;of9_1Wt&6-FAUndqa$U$ayKJ5)R$nlDRjjni~l#y%TjR;uf7I+~7 zUbzLHjDR%vz{3#`TO@D~#TE(N8UaZ~1J_5uo1%d$BOu))@J1h)?HPEX58Uw#JlO{} zdIcWt1EY%t?&$+3iUn-#1JjEKtS8Qk2dwM^@!kRP{o%5Az_k7_zeK=<{$y_kjOq_d zeQ@Cb$n?SZ0TAPd@dII-U%>0`kmDEdygO_w8Sr~|P)Y{e><+t21zhS5V@d~{>JA4= z2OQ}R6Z`}Ab%zZ9fbHF3NBq zYvIHnc?S)^ypVt=1K~tSz(`W@O;9%wRyM&gKf02dCSVy5A3?KgJvVAn{||(+_U8#VG^{_q|UIGu(Fu z2`?O{41g)^aAkiu^9gPp2y?$N`tVK{1U{2soN>ZumSCG{mxLF{v61nnC&VM}_J9L+ z8087i?QyXuEO5Y7PdMs`FFhg038#9&au;0X1@nvG3Jd9nIdvaD+Jvl3KD2e1mFDALni%A~xVlofBm?W?K^yOvmy;AAd zcI;aeTvos?;VXiwqC<2$=nfKd8IGp&lFt0sS8OFB|jiLl^d1u?-UL0g@ zj$?-S&0ZX}XNba1;Akzcyp7l5JYEZ7hPAj)3&*r(x@eJuh;vA_#|7I3tZ~BGI|NK{ z!)fybDDHT2t`JMS@X-dr{`MTQd#9C{DwgjeOWX%@wh5Twi!we&SH03XrUEBe_aV0c+frmvf2@fCd?Du=V_=R|otK)_8EFfv8J zPBI7Xp4Z!RWXjfo>j#=lrAR_x6T)Etr|Y2jMA!O0>zO{ zae!Qg^))e*5^Q}BN30Q$@II!~*Roo8d=;@&8~I|gC$%w)espzkDxqBY0O!!x?Yj6P ziC7B4m74{euZL-C1?>9}*VEULk8s910V(y7?@Qaz0B_OPtcJLOvYp)sR}$CyMm2QA 
zX>>!RQ7a-vr~yT2Oo{au9?bN{75__eKPtT2PbsNouUk9MJ*_k&c0B0}&3?@f}k!wy>bR1ychiX+i#1Mm+^iEZF~G(*vuzT&p6h-q|g zfXPq^i)C#CE|4``bFuytWuaKZ%N$(0>@f~*;YT0wI*+Cp@S3@WEfJTysH49PFd^O? z!XqCYklP~-ar2D_+vDkVvF2om)75<*$)S0V(njBghm~#E^p&9?sCUzKUZdF*oCU^S zI)dd046Co%^~F^D+6fA>^C&wz1P8JsUe)G*k=+rtJ{R-C?l~OmC`N>vz;b(bNUvHx z!RAXiZJG`CdXwkT3d!Q^P9ifBV-za{VZp)w5kROO;^sH=iOF9}E_*aUeuRl)yS-E3 zXPWgM1b}vwM2eUk4@`-HNPQ*Tizu#=G3;vRHtX#BE7Aqme4mO~mRZCyLrOQ=IcRR? z7B5m;eQka9v2!;dUJqtsTTqf8&t{Z%{qenrf^POd^g0CKFd|w)5VC@}JtgNDRrUyV zY4YF9M~=z-aeL2}(Q7bOZP%$P!lWdxMd&O2l0_Q$B^0j1xZ_h=It8UAMS(e7u@%Oa z{pF^5K#GmBiL|CXJ$dr-5#xDTmJ(=Us{ibb112%cNz1!&_ML<*B|&4zX|Ky)Xxanb zra3j}!$-04K@-OZXTkA-H-mw`mzbjL>iuTkN2N(`!W3dOPbDZvPukT6 zmXLkr5NE<$>RF+6f*4LS8b>b4K9)jZ?mZ2LxPLij`%i<@lTrpf7MItf&!YF?$n}}i z45PS$P0Jx_StbCXFcSqQs9t`7u)vij4rfA~&Q1n14nIPzH|4I$pSL{$*&nI+gF0?% zYv-fQ=ZjxnT~2ilUG&0-bB_IuQrJB+Z{7!1bK!FbTl^Z8~P%zXK(SN+Z9`m@&^vW|*wU)2-|W{>T2LseD{ zCm`;qj7E|}BK3AyB$yg^x`2_Fva`||>xQh0#LK+I3!8zMM-Yh<6{}5QPPoG1#q8az zg)QabrJi50_leO+RfIzm-V+CbLW_}edHFYdu9rHUm?%*M`?V)lTa{%zXdxOgWxC)> z4A5v_523~*#y@2?3An)1Na zhecwsiExdi4uKo1(MZRYXU-G|zLsaU8nFOS0lIw9uu#ZNDDs&gu!%9PT7HZ6-IOKc ze)|Z?4B;rR+uOYsX*Xy8dqhKlT~a;G-aAzk{{`-bA(nlhYK^8;5o;kO%y#XTXQoY* z0oT8tR#-Yf{i!)L5s&POEXcB6jTB<7hIU%3CALC&<6@_25J(Yb?hlE1)ZO0DI;_3S-8Ka>U-Vn+RzK?*u=_tq}|t zRpl9bgq*hF%ih|KD!@VddOs{Hn*JagP52BQdj8HnJ)u&|513IRWvIo z;J<~#3U?aRIK^NZ%Ro}oAIekgrMlRJ9U=X$P{JT7t#wCJS}Gfy4O(+dzI^}-pJpDB z;bce<3$Kdh%5BXjj5osmS!eb=bVOEM>5*Ee)lZ~xdLN;7$K!6z4*LxqSI0mu&tv<) z=u%4Q{x7u#v~8%IssS8NW4yRyAAirNtP@~1#%Rn`=V`oQANt*B%j#3Zs%y9%3eJ|C zis~2T&Zd7Kr0JU=&T}111)QJzCbrhEb>^d4;W)BW(#V1gV?2kaeAe%Wc(+mo5<*Xw zZMSQ-?&k3M*qLJmd6H{Ki^}<%9w(2Ba&D8h)KvblCeO*X3cKCs0B{8i^gC8ieAvux zi`6TEh88^Gau=TPF0I%hwp;!Bo3b05rohSG zTr}BG!8*Z*3mv z1#g`~D0NM_@bRcRS|AA5Q~_rerJb1A7I2H~JzaTvTK>cv7OXF`PN-(z9%d1Mjlj%@)b}Mvww14zrdF zNDhE>jM3MEJ2NrHhrLFKl~@z6%(=^wVH!L}e(m>0S3L~B=3x(P5Ue2Sc5@imw$xE*YmcxIVFPr_bs2rFY`*lI|Qj(dC!*h7bI)!TV5<&_ZvT7hE z8hy~7Lab`!)FVUhb+JBOf^H4IwdxFk-npxG>aDf{TP_*cbU!6z!6=piJ19gVB9sgb zRXV9+s7F=8la)~E^Hs05P-47-jhw6fdT(Rxtc2Um8eJ-#b0-IG0;2bZ6FbA+Q-K^S zyqq*$JqNbK!Y%u#=CsKrTy=%v@5R%==q-!$I%`$My6!S#Qz+N_&Vl;XS8$2pAy&}H zRZ@79U331?_245*=_#T|7;BZ)R2^-i;V0}g$_YDUl&ZeBP$$H`YBgn?kY@RN)a}Mr z(oVzBT0oXP<;Sp}6{$9PjTNM$HVKK~&TcUtvpR6RSgqKnHr}E{8JQj0+VFmm_3P{gk20vE2N)oE zi?JrIH1jJkRX|=v&k%Br(!rYua70-pxTcbWZ7KIA02lO&Lx^iQ>Nm;L#a--jQ_8oj zEmI|((BQ8R5)jBcZE`#7BAhtBw|AFbG&vWy*`7b;lkI#o(|wW*(_+KCy?$iK*0(7o z5{BEe(XxUI3_ zes3;o@8rH;X&NJmcF@nL5Ki1XS$pIND*TK6~jzaKon}1l$i)(zs@F(7!tv|m7 zS3OQxEu9%j8&sIeIkqx&Jz8#8vErm$Ta-3dc!hP!$W|I(ULBT+ov!W$KC7b*wPWZr zW5T654mwN+OHiNkq}EE!X7(_uIBr#{KtmyxX7}V0e#5_S_RMVl3=|{n>Y=<_x2BU@ z{26X(tlWlA1RAvtZH@7U2Yt2{R8{LS&?cR$EuS`?3bDu3r=qhabWlP8X@CgnVb?_& z1;YV8#dOi>v6#J|4lk0@T0iESbFIcY;}NS=ET$EwH&#8KAE{A^i+FOqjcgGv)<5-UMp26ncfflO?)uuiRHq1-#( za8DQ$UkS%1F#9IBsaRN8l~9snv%b7z$;`a47$lr-?1FYX8U#?gnJgTSNAo$md}ATg zIb!>>8jqGmUW{iW4MSPeo1rLU=Pn+y`TAiq{{+pSrf9tJX8Yi2NTy1!4Q6fp!G1C-f^YdI6P5|Kj!y&g;us6hk-Dd-8qk3HDOk49?$TN2n0$}+C3X} zGIYK5{i@EixmK2Du9f9npx1D_fu5-Q8jgO88AkI@$42y%=2#cFGZ>9$>5(ei_y zpwQK;UI|2$*v^R4U0Z2xarM4|f$n-W+)AIS_r>~wWxawB>gyQlR0`NPwPRzbV`gKY#lYMrDeu;JgqctSd8|rSOlvP+5$VUC9zoRpYm5CSN7O!7ii5ly zjwL%I#VBZjrhP`CdiHm+sc1-FEKyKZU4H-hGbou?prcu?xA{}UF0=m)@%QHwBnZYE-GKM7l6EBGG*D)oNw^2Q<{`8q~Og>VP_)#3@=O6 zhu(wq-bYz!?rd@ke`TjWO!M<3ZA^#KnFO%x=!9%&E;(fEV>vhYC|C8V)#b=s%?nCN z%Qyy6KdK#`-t2NZVY*jdqWB|P;@ttc&PGV`71kMRo5;a`Yd$oQ6(Sln>eU=w_8ky! 
zb}#;Yxvc2Mt$uNqe5gtIp*{K{o~&}_#r=`ww!=NvV2Y}BTurPP4EGZH@!BVAc9n}4 zcADm!=Xv#f3uVP(xtnM5ewX4lB9dSN))bac4Z8DhN7Vx&E$WDXs9x+tnEU~!YauI} z)?>LQXC}Q|NPC{;CP7*2sSl7`&|Vc|-q_2F-)=@z{@s)MyASMr4p6zKm%nFq0UzAM zcR>~de$Z^>V&C&0PU^?7AF(w(gJ~LkPX;u6x^8_5c_p8NbX{dt7JT-$-ND27? z<+JIlOigy%#5sCQw;SHh!7+*Q(IY!OSC{y+gkDc8N1U47>!GO)+_z$Ohu&K+rLGn| zB^8E%!>Ll>zzH@^^&PRxQPmrCvRf|lPGtllRWo!8fczz4~oDQ zbchdwY)!5#0NP#sm%~*A-CBu4rKh;5HfcB2rs1aAPjgeXhu^3@S=)VgJRLU`M zXqAf!OV1Nnlw9G%i+yc2VEDl&9~w5I5xCN2(xtV!l+C6VgN^#!+&!;{=MFuJ2C}$V zOttnc^Ejod?vxX$!{7!f$gxIZnBukl-?%7dh3%cP@g#BmMJ6i zmKuqbGnQ86d~w1!a=a>+F18EzO|RZJaPI#o8z(xb3@fj?tT6g&>8*YP2bc?tWsA@C z1Vr@$=*=h!E=nksKkTJC78f(ao)Opiv2u0+BLnu{Vkfa0(wz=?W<7DVhx?HTdNK8|1(~emI^1Yos=`{mtA_ zzw}LRgE*zio9^bOYoGphpQ7e=y5|W_V4GqB`{h8lI~829Q>%%nKk!rrF8|iXN@(~= zN;sbOf1+nYyD8$REI*B1L_@$43Cgmyf?EUfu}rOFbs|-gNl7cvB2j_&MMh+81A6MI zVO)~Xu0}L5)E8+Kc$E3FYrYR`qsW#?fKymU6C6d`dz%K zC?mT1m;2+*HHVW60-r$J5&#NKAtV|T{q4HOE`wJK-I2mAJ@&IYi`bmQDhp%r!H(ip zl9^s5W)V7sHuvx>J}ojvX|`~u&C##(p)mTOMiEJlk+s+ZJ2u^eGpQq-S>)@*-5n6S zJz^1EJ!5qCK3m(V;-$?KW!^AHcL?HawO-6YZdY&oNIJH6`iTe~9vFW%rd*Q1~ zV3f33F3Vmi17PbB>9Z&tEP1N++99kyM+g|R)ASKuNnRjCuwS?PY1o1B3rkN~K290V zsfANdSNxfbwu;oef;4j zAN&BYWhf&qIdpkE4bLfz)anK16wu#WEfxgB(W4rez>$hKRtWJ_nMhmv)DldHD@KZL z5S7$KSB&mHm4c{s3}u2{(p|GB-9BX(bAfhE1;HLMbjxbZ2eS13^4+=cT_kM|B8hT3 z(9g;)hFd;&%TFg0E~=QC)TE(^+$`8qA#Um;7_))W-fpMG=ae*_qBdFehNrXd;XgU9 z_h_^(oMwc>S!n*-&I$F%n7>lSn}u}xrxQWd2#R-{Tp=t{yiI#T)DT!-z?>ZV55=Tb6VQfo@GPWo`d8iT z1n-i`T@`UG@!yWD3(>@=B3_77NpNfR;W+mua80irAwFYlPO$T>Mz5B+2p0o+v}zq* zJU=~761D1!JXlk5n6_0Pr8SWWZy2AjT)!5@O_P=LNCA-3ZCSZlaBUL~@$G1O7lT8X z$Bt2!YvRdoi}l62n5maAz>9oeCKbE97&A^N=Fm&k7h;e#7|Ch{H5Bt$z2NrZ7rPgwJ?Y&wZFEu91p(c zMClNwV7gt7Kssq)ucI#Zx@MZNtc5xV4c41+0e(!@n?J)l%ZQ5QYfP;vp&~<%(xjp2 ziK9T%zB-DpM}r(oh;EO(U9JdiFn_56_<;G7qkCVM94PO6F<*C1q8Mzp5yua!#eC7qzi(zu zuA#v$6T2L(`Qq6b!St=WX6x9{t@Pt~$RrZ|vfiw$P!+bA7@DXrl0g#=Gha4~R&%ha z7*9BHPMh`3oWLvJ*vy#W%{cD`%8xaWUX^QMj_VZC78b;zLns7#UxJk2G^?Uo5z;YL z$a&h(Q#Z7p^i9W zIRS)hE?#XP@DUy>w69FHYk%{ew$Am)dq_W~OU)pfxj!(sw_a^MXXhYYz_ONKiDp-cWmMRd@fhDzRDXN->%UWj~kqtK74nB|bYs`lRuvzT=5s@jsp>#AlnvMqd zLEyR=@o6dB&pMlhWjI)x4MYS9s(I`*jF)?9S43|O>w(ILbHY&|iGNczbD}HIY3!^Q z3$Vi3=CHad0%G9K+7eXiGGXaA8+|JH-5ZR@P6*6AYOM5om9|;h*#C>F8tdr8BX|Dr zvX8;IksTZn$E{R=3!R%K*_DVZ%#|lmWf9UQ0JKnuF_z)5dS-<2(3x9yw_7abX7SOw zH^7mTc2n;#2nDh4i!D&NXIEB@GP8;$r2-RyotU~*qcrO9^b zGy54TJg|rlGbDZZs90C*g(Qo^9zT-pRXKySVV*q-qAc~qDCb71$1C2|qv+p)6DW8F zAr9ZlnA zCwR5EC51@}z?Ajqf zVnFUQ_MqYA^_$$|DsmMS${?d7~Yd5i)n8HinG}-tDNT^gaOyZ%QKA! z-%Z&M*aK$l_hxJ{r1vU!lJj&^Fg{Ow|KmIaHQ~XEg8^9G z7xO8w+6%h%Xr{Hey~4Yo_+vM6A;CIgL@?VM@7R_qNl#51(UdhFip`3Bo^o6{HzreQ z@!c|5hirGPGK~?IQZEIisC^7{(n`O*Ei(ZhR&#nt)3qS>?Zm{Z6Bbday6%exw z1ABc_5$0PGv1Fn*JxH#I=A~y^EXQ$mTiZ<)eH`l>YQo<9mR{#CJlSS_9%d&e)> ztw@MT%?4YSlV?_3z!AzG&8>xu(}L`Pz8n^0Pc(H>=MS!>z%k$XAGNM;mCt}I zYb4xb^x2EBl2=^QyHL|vY`#O)?R$g+3ksJJJiaGOtK@YCx+;DwM?9%#zr8h4%fgq2n9ut6=Qi62?Z_v8P8DIUd7r? 
zej7ZJPIZBUNah|m!3;&v-D!zUjX8bGXP}gM-mTLuI8j%R&pvl<*ZRG5RTswXga$_D zHbD5OOIRP{M8HluvP>O`V6ML0LX$@vW*!<}S1^*( z)Y>AIRal5G{Fz+AGdFeV2>;|&!D=0lmag+I{XXv5BKJ?g4umF>Ra?twz#g>r+(W~& z@)CgJ;d^8oBia7axTb0brBWdCt|P%E4I<2UndB)0DJsSg%m0M03YIKG1vyPr!1lcRs^x=MFrrUTC_3VfZxk?u|5MwYxWWE%KL%V70SBeuiWvvm~RN< z@dTW9q$uu=r{~Z|8&HV2S9i0~ov*mWlz}C1#hq!RZl`X24C4Tk73UE*7#nWiE!dhm zt>8=ileH#=URt?fj|%1Pukp9k%!Cwg4yLB;{Z#pLxqd+7VJmUi3kub!hr_WJq#6&e zMljs3WVg%|HgAb>ju3O8Dw0cw_y{y-y zz(o@BYCF4KOihW>n3_qP8EsfNnE8(I{V?Dw$Km81mFHl3GNqQc&Axd=Jpd$HJEw4f zA~PmVs4)l+@*F$npe2*Xrs!ed{7{GpJZfx`+nt*nvM+^m^)!4H^{FdF!30}THK%zC_+s{M*1~drsNo5) zbAp@%MaUlVh;yJsrw|9yB|4qA?!(F%EEp=;Aw}uv%OfZOe4IoQi!`6eqaPsYmCPZ> zL@&3?9p}JxC6teK;Ko~7uHv~>FA2^k3NYQQHbpJFp|T@(@Wg$aNlp2>7|d`mdB&sBuOJwSsiYgF>R}`SCTs{WA=?+n@tnL~FQ@+B z@IUDdIEtWe$VZfT5R}%iN0-X?!SRjP4KaHsy62Sa7aZfHgNW{I2?54(TC&5FsTocV zFcq1n!)qboTNX=-lL{#lwq}6FB<<<~wHj;Im3|C2^E37uJDtNckiAm_EbQ(TF>{I{ zH3!qtvMMGwC7|!`hLACHren2c37qr6GhsjCp%#V3x@uLZ=%Z>(z2h83_Fv(>9IQ^> zaMj7Fi|p{Yy~WR7p6H6tK@9EmX5vgBE<2yV6KlQS^zNYar32|pTNq5TmxGTsx-FkF zR#4`7w%0ef6?eQIhsvS}bpJ(S_2Gb*i*13(JUncK@TJqR z529&NBNB4N`@NGeP}YnNQq3N(^GsbE@pvDviwLBlwKQX#G+_Zk<(3la>4CAxF9p3tgCBLJtUD0&xe}5%MU+yat6BWmg=l zOl=#Y!W0(aqi{f2fa&cb!LoZpQqQ+ZSDL=O}jtD#+9b*dj6YVEhZC zI+uvts)@*v>fmhsdAL}fZB{F&2Aa;V(q;w%_TjM#HJsddtc=q&Nw~8A@H+d9! zMnT`jrr2-m>uI|!A*LS&_JE&{1>)m<0lxN(_pJhvlCj+hF4angOC^9-esl%KaO678 z!#ScuWU;7rhyj>CVcw);;liCq)7YlIVcAK3E{{mz8~hqj3^NwZ?s>(2xUMGeKZ&Pg z_i5dSC(0oJVap7o^n3o}+PFRxl{@3<8Rq&!evbXRTtB?#S=!*$443?FTmi1swFJ4| z^}6=9@x|to@$_0UEKZ?qtzX{9+3+&)^8SXE>|7CSxL8c(4C=z6q`G}rfa-L3YS`gT zVz?uH(UV!+U7X+KZAhg2&^9bnO`dJ ziwYixLk~ERTweQd0^P0?=!04@uhk?eOzk`-W4&WC*4cYV>0x|D0@UGPIgktngkevo z@)@QERcJbuLV_)8j;AgkNa}E`vrfh_+BPkD=b*s~;Y=60ld3>%8zP~g^q!37Vt8Jb zrIIQ*cqSa^btVY^>FJoVonmj9;wLB0n|IM>vF(>^3CkX8YO!(~vr^MKB)98lsG8V2 zUSJTXurnH(iw3Fr&5oCoxphI`u>plL6?keN1v{R&M@Tu?W=-q-#K@f zq9z7TpM1G0XOR4=emUD(Pa0L!Z_69Jq=epP0(ZmV1=tduA#Xct#%s`1lMpkuucbXZ zbU7|@*rn<`^2rLw98GlK2uR7a=z{+bZ#^KlBvB3Yzo4u!Dz42FQEbm zf8S;RvBwIwQ?W^H`|A8FG7->O^yONT?DiENm$;ZzK;NQEv7lL5cM}tH?u^57@E*J9 zPS%Da!`smWZ%8?@$A!l%vS!EY+0@s@@5t$>S(v7D@@$Fdbu|T8cSUIanUXYJxPmZG zRtqw(mc^KR+)1}OQ!z)Ok~^^DrxQbGvcxitec}_VM&UpMm1gYeJ>_~{H#2{pv;!(a zC5^=3+&5$A^Rf1N!8X1us_1Nu_p%RAET=?9QY8{-9RL#NYU}fSDSGIUp{+U`K5N4B zDmom8X&D1puBfUHV1>8ZfLqn!C#(_R2JHJW1p1~Lw70kVMaptnda<(Um$TVm;4Ql3 zu21r&7sBhsn#wAITOAA*&aoq_oG`rB$fd$wj zZnAKAb$#lM1P$b>Dr&0!W@y7L!1(Qq2#2GW#af6o^hverGuF3Y7DAaj+XyUR%lkQi zPB70kpl<4=*>TItKT;~inI3ac4wwRtw^=qERh4L*1ECQwsYL9gD2MP&MILF-i`6IC z5;I;y)6O>Cic3A#If|}93hY~c*6WK>6~$STe}l!8YBgpnWf6#chR2*kAD|);rVcD0 zW~tm~Ze6g0bhK{gFy(331`eBeQqbpm8KcS8L|4D$#H(@G#LFbDzP}!xv>KE{g?V)} zlj4*qxmWzi-&0G;(lqCFD&e%pnrTUA3|mO?Hb#W9h++<8f;q5R6|5Qm`DUN5pp z8^aa;aw;6rp~aa?1Pks!591e@(iWjQCF6}$wG4J9NS?OknUd7^6EXss7k zL-Pa%DRy&h-A4?G9A0mC+++#sSCXwk38YC%sx<}$)py%yvC{+LOQZFAC6kj{H$x}o zsET)Qf3sH}vQxnC#f-!%JUHkWRlt!xPD`E^pLil#pQ0bf)+odV zE%%`0n6f1=TK$0+t<6M|8jJZ34Mm*D-vE{>VMlvB4V;WG*glVFO{hkRCZbvY*_9#{P5|xQCB)lBHSygyz-d77;N_e0oyWD z;x@D)*&^g2NX(Xv-)K}2RrPLvvYx5Z)i`ihscaG!grB*|x5+VdFztDd?{$c-1xi)S68?Y*;>o(@8i2dFw7w8P@F}r}Q<~;x?r`4H zT3kD7OJtU~L>5$+$PSQ~7#M(7+###Y9CpZPTTEE+?>!fzPwH)DE$;M-@8Wy5r<4vw zxCM6|0G)sJCwntvpNtn<`Dn`C!F&X89}n=rSL!AcxNTieqT1@?D6w2smYawlSyf99 z39RMKO|4U2hIGp9Z5A%>GPXe%pEgTQglEyTfLkTjusF%zv!yyf^RinwpzB|)4(^Il zIdHFBRJdLNmg)oz*qa-+x+WmT9yqjvj+c7>-MO z9nqlB_0ur~J0oaF;c8G$VSY1JdZI~yk{Y%nJ$WqkMnVH;gXLtgVrRhVT3}oT!>8uS z*>_9+6YRX<_fZdZYel7Y-cRYwZLx<4BAvm^l_yHbGU%L83+>#>{%B(qEVV(!J&<*d5H`sJ07;+(B z*tUfyZo5G+BpC>h>Qkp)yM`@47jH*(6lKoKfFnZdcUC;=+<9>Gi^BLE{ 
z0Z6^goon;fLv?}gU!QpornmB7@>;cNIEdkvbE9&RM>!sWv`IvET$6}hv@gs)rgI(p zE@h}61)BB44k4tdyd7J21f;tf29J*_!j7&7Xd;l@_1nz?$POPu^y)pyCZML76uz<^ z0JbLMLP55Ij-x@nJXdAU1LbT(ltuVXeh(&n78dkT<+>EMjHlKyEqjl+#aRkPtS8U~ znhTLORRF-k1PxtSowX{tIo$~G6|iAMc?x+T#HbmdzwEZGM5t3UWT(W9bbKL1X>F({ z*`+f2T8-RkX2DiFuxE9Cw4RlB-tC|9H?0{z)ng$b0zNXJc$!$&*r_KCJl&Kv|4Zvh z)Ng3 z@iLt?Qhu{ctRtxW#a4v-OWJ(0{^(_Z!=>-Mgn9l)w7G>=}JXc z!W*Xn{1~A{a>cK~!DcvUsbgLPK6mk|Ilp@5d9DtAWMJ_@4go#ep;&}id41G}G z6SlsPJc04)Vo~KE**Bp1>hF4(OT&A_7>IzTg9r%8ND6^(OToxRy9Z{kPE!w+=}76B z;7GPjq{cXa6RLWAe=amp=i>h5ObP8PT)QPZfz%S8uEphK2KgfjOq%0p!Z-pCp?Se_ zF8=Mo%!bHkremhBD{495+CK9`@Shr!2Q zYG*^pXhx(5{U(Ag$&Uzc5tWU`Gy8sVy)0Kxjzin^;%a|9s}?pU3m0`8Z8v&lx~;|1 z-ONAD7teD_L}~XG5Uw=I)?j@tTjS09Vs?u93!qfEozG_8(3@EJdf_eZ4A4b-{zI?!i*YV-*a(42v3K}$@AF>FCkACy)v z;C92YjWGy;egj`58(y*x=@_28pSC;BS{KdfBdW@wT(g-P&_SgL>9=yf!(1j#sk19!KEp=9o(ehgrSFx_o^(evpqTER=4P z?Gc&DiLGOVm#` z;JRjJ$J64v)*C)77DmVzF6uUw6Gh$jR2S|l3&L_3tXvy-qI96;ptq%|Q*@&Y$iZrr zpQA4ak#Z>y&l8yah>I%vu(z|>&{;}!cX71?hsV~<+__28flP4XPMr?5s?W@#s?N7r zur24s2!!M;qS=*`LS4ZK#7Mt0D*evF>Gv)t(i=;;*tb*z?3VA-23ZmIO2n8pY9zxs z&VA(}hGIbL5-qJ+5Z%Cr(e+A`7TbWF>ushVv>dS5||=22Cxu57ce7;+HTodiUy_6GFX_mqt&X!RMebWq6^2N{lV~cHDxU@ zC(d;f1_RyQt+mB1VYGF9Ym*+cr1r}%s_|$FZ*EJduQe*e#&EBlk-^UR?PequXebsg zi7$v~D)f}5f^x2SA+ZZjV=d&laYmL~x<4}MC_1pg`xC3h zJ^H>3ceJroKZbzU&UW)VUs&^RBzC@_?UWa`Rk>`&BAaX4wy(@IJ?u0!KdvaT;t49mKDDl>t-72C-BYISaZN}FK1;cw_F-*Dj79%<~x-g??i3(gZj;f zytVAd{3Sa#QZ&0oDahICoPA!AC$IP)2r^<6R9?uBRy1mdUpic;vFr}TPf|HPGc-*G#>QJ0LOrDSz`r(cO6*AaIe+=GTv+CmGe!utk-)Ov& zu{wxJghiLtU>sr`YlfHOX6$n>8$>vqVlU}jvnAWmDx+Z*oBvr(ERQ^OJhZ98(7ofm zuYdSGsjf8IXj_i2!qo#SPnHyJCM@+V>z^5pR;%McdY(6%_8L-uc|zSugo#?&i|aNgL8HqGT+%MdA%@SGs??GD({4`2Ims-Wx3DpVqEUI zz1YhaI{((*GafVde3cO*$=1=t=C*kV4t0#lW7YmV}FNaVG7))Bo4UR?k70tf6lZb(6X zi6d7$=}N2QTsJ=V_tG{v^!N07d7kFBeADZE&mGXcD;BIjT9nvm*-O1L!LX^_m1n)k z#ck+}OJ*InBs?u9VXoq}=$c2v!VQsIp20_X8*deRRcE>?rgclbyr|c^2@TG?Umb_Q=L7QrmRLV_O8ryv#VZ%Dyq&#-#dv_ zmZ&s;GI9IqxYAeI#+1^@gqxK&GJP8&_k{g*PiJ?n*&9w$Hms;CQu0#iLsW`OF6mBh zmxR*UY*>^|dKtMupN=AVCtkcJ{=zU;T*OR4v)Q@KOV1PFGL}jxmOe)<_(N0HD*hwb^-j1>?{R27RP@PfP@QvskIYTogUvGtHT|WKSykGn&IR64v-?Z;>j7*zFyX1{?S*)>P%l-GK zr#9gpee=}0o7B}@RYe^!SV5kOgRVbx#)$_8b{barheVZXu6{s&38KAJYS1xWkGFca z4IOc5FwC~5lawaOsd^^pqRtE$&(67=w|}CEU)XQqXV!Tv@Oxb?!``7|vi0eBu|F+2x~|He40ibny9GC*`D2v!1lv z>MTSd>TTo}N^hi*T9|&5d@4hvTRRaJz_9g`TSCK@C~hhwMAf4YH?QK|Zj*hsLywiB zRivah$JtB*(x-zsgIDpA3a>or?tQ(n(bR?tPa!i=s19uvnui^YOY~*Aw7xbJs_pH5 zE(or=S?w@IWU7F#Gb*v~!7J4T4^qZta0i9qmPbi~rLj!n0FmBr+=F;4%~gM^7(jYt zd$I;2myIH?qiU_cFF>$*DS=zuTOJP63Q}_&I3wx!^CR)yK`s~B%I?PzjmxHZFA!fJ z-}A#tMw-4Q+I**JJp^$DBAy$EjpxIrnWW<)03IlsQ=g%OTu*#WIsv2>Tt{3h;CGYO z5x44>8jn_@+nUOaRw z9tfT5V$ERUr;T&jU|Cl;y9W;vXZWuaNFi||he4??M}?madDAXUv>^sU^p%%Nq>@B1 z26uu6S0mt!fO;VtPv;A?00-_IC^^U}KyX6W`f)$NEk!vTu0lC4ln%DW3TX(ijV~Ah z(`M)3wGF_TE(Z7x4{T#FX^0#=E*L9PD5ro)r}BgXI1qwQjKj{3z#nUZbifzyA17jV zvHJ=@_C*SHjyE!RVu}e0{;GH~KcT>f01^Qz+@!(za}ok1L9eDkhQehc9@NSuVu&QL z9S4FP^sNnm8i?a_LGd|-ipnDe^&I1-Zum95NH%Wr1o+8MHcv%SyR(?1Y%d~6hVzs7 z8*D(j90^)8R(PKTja#+AYj`D`5DbR{7-8QalXL+Ae1nJD0E9RA2EmvMKk79F?4=J8 zz-RajI6%X}g?u_Fix(wxv9o+USpc|PAi}Z3z-$yG2HI`LyqtIU>nkFuqW*6YjSk|0 zb;3HK`I}|_6cFDy&4&XScqtOlAA45B2NhhMB$p_QiHXUGrQT9-sY$|U5uuR~3PmXF z92|@so$XzWU7h~v97uDNduhAEj)zUDvrRysDM?Q%v~NTm)kPk93;XaPq1Jp!*~FZX z(wA|Jkn$l*H)^?RHuvSD(boxbq07HM+c;igslq;(ui7=RdAXFd#B%HCcX~0p5d`Sw zt5wz}dt_tIDOoIE0h1lqZ4HK5A4b*rJbBbN6A}8RZggey%M(Eq1I4^v%lGEQ79Fc) zjv;lc#ubO--*N6N>qq|IN*bgA}_5KY8ts7s{It<%K7LzQLpz<3DCfX<#6=4(=5RCl4r4J?pOj&5QG!&`} zQPG$PL>Mv#88)xj@W-k@z~U0hlK<4$UZ}?U66-(hdLt)RX{sZ(9D@IB6Y=vsNOyE3 
zDjI{{6oqkh`i1imf;jt_(Q{&n+btuBTyw*MD=U;6_1f?`ZE@oLMXx_k|tvKHIHHIut O=;?s!w;h@(CiXuAPatUk diff --git a/python/examples/dot.py b/python/examples/dot.py index e7c7b1664..52c7e0a2e 100644 --- a/python/examples/dot.py +++ b/python/examples/dot.py @@ -1,4 +1,10 @@ import libtriton +import tensorflow as tf +import distutils +import distutils.log +import setuptools.command.build_ext +import setuptools +import os src = """ const tunable int TM = {128}; @@ -9,7 +15,7 @@ void matmul(restrict read_only align(16) half *A, restrict read_only align(16) half *B, restrict read_only align(16) half *C, int M, int N, int K, - multiple_of(8) int lda, multiple_of(8)" int ldb, int ldc) { + multiple_of(8) int lda, multiple_of(8) int ldb, int ldc) { int ridx = get_range_id(0); int ridy = get_range_id(1); int rxa[TM] = ridx * TM + (0 ... TM); @@ -39,4 +45,51 @@ void matmul(restrict read_only align(16) half *A, } """ -print(libtriton.make_tensorflow_src(src, [2], '(M + #TM - 1)/#TM, (N + #TN - 1)/#TN, 1')) \ No newline at end of file +with open('test.cpp', 'w+') as test: + src = libtriton.make_tensorflow_src(src, [2], '(M + #TM - 1)/#TM, (N + #TN - 1)/#TN, 1') + test.writelines(src) + +triton_include_dirs = ['/home/philippe/development/triton/include'] +tensorflow_include_dirs = [tf.sysconfig.get_include()] +llvm_include_dirs = ['/usr/include/llvm-8/', '/usr/include/llvm-c-8/'] +cuda_include_dirs = ['/usr/local/cuda-10.1/targets/x86_64-linux/include/'] + +triton_library_dirs = [os.path.realpath(libtriton.__file__)] +tensorflow_library_dirs = [tf.sysconfig.get_lib()] + +include_dirs = triton_include_dirs + tensorflow_include_dirs + cuda_include_dirs +extra_compile_args = [] +extra_link_args = [] +library_dirs = tensorflow_library_dirs +libraries = ['tensorflow_framework'] + +ext = setuptools.Extension( + name = 'test', + language = 'c++', + sources = ['/home/philippe/development/triton/python/examples/test.cpp'], + include_dirs = include_dirs, + extra_compile_args = extra_compile_args, + extra_link_args = extra_link_args, + library_dirs = library_dirs, + libraries = libraries +) + +build_path = '.' 
+args = ['build_ext'] +#args.append('--build-temp=' + build_path) +#args.append('--build-lib=' + build_path) +args.append('-q') +args = dict( + name = 'test', + ext_modules = [ext], + script_args = args, + cmdclass = { + 'build_ext': setuptools.command.build_ext.build_ext + } + +) + +setuptools.setup(**args) +library_dir = os.path.dirname(os.path.realpath(__file__)) +module = tf.load_op_library(os.path.join(library_dir, 'build/lib.linux-x86_64-3.6/test.cpython-36m-x86_64-linux-gnu.so')) +print(module.matmul) \ No newline at end of file From 11a6a925986a90165b19f6bcb40abbb6a65bccab Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 16 Aug 2019 20:50:18 -0700 Subject: [PATCH 298/494] [python][tensorflow] basic op generation is working --- CMakeLists.txt | 13 ++++- examples/cpp/dot.cpp | 21 +++---- include/triton/ir/enums.h | 84 +++++++++++++++++++++++++++ python/examples/dot.py | 55 ++++++++++++++++-- python/setup.py | 12 +++- python/src/tensorflow.cpp | 19 ++++-- python/src/tensorflow/alloc_empty.cpp | 30 ++++++++++ python/triton/tools/build.py | 0 python/triton/tools/checksum.py | 0 9 files changed, 211 insertions(+), 23 deletions(-) create mode 100644 include/triton/ir/enums.h create mode 100644 python/src/tensorflow/alloc_empty.cpp create mode 100644 python/triton/tools/build.py create mode 100644 python/triton/tools/checksum.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 5b252c520..7c7a1c0ab 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,9 +48,16 @@ endif() # Python module if(BUILD_PYTHON_MODULE) message(STATUS "Adding Python module") - file(GLOB_RECURSE PYTHON_SRC python/src/*.cpp) - include_directories(python/src/ ${PYTHON_INCLUDE_DIRS}) - set(PYTHON_LIBS ) + # PyBind11 wrapper source file + file(GLOB_RECURSE PYTHON_SRC python/src/tensorflow.cpp) + # update include directory + include_directories(python/src/ ${PYTHON_INCLUDE_DIRS} ${TF_INCLUDE_DIRS}) + # update link directories + link_directories(${TF_LIB_DIRS}) + # extra tensorflow ops (e.g., alloc_empty) + file(GLOB_RECURSE EXTRA_TF_OPS_SRC python/src/tensorflow/*.cpp) + add_library(extra_tf_ops SHARED ${EXTRA_TF_OPS_SRC}) + target_link_libraries(extra_tf_ops ${TF_LIBS}) endif() diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index f97cc2021..90287f719 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -164,16 +164,17 @@ perf_t do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int res.cublas = 0; // test -// stream->synchronize(); -// stream->read(dc, true, 0, hc); -// std::vector rc(hc.size()); -// cpu_ref(AT, BT, M, N, K, rc, ha, hb); -// for(size_t i = 0; i < M*N; i++) -// if(std::isinf(hc[i]) || std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-2){ -// std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; -// exit(EXIT_FAILURE); -// } -// std::cout << "Pass!" << std::endl; + stream->synchronize(); + stream->read(dc, true, 0, hc); + std::vector rc(hc.size()); + cpu_ref(AT, BT, M, N, K, rc, ha, hb); + for(size_t i = 0; i < M*N; i++) + if(std::isinf(hc[i]) || std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-2){ + std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; + exit(EXIT_FAILURE); + } + std::cout << hc[0] << " " << std::endl; + std::cout << "Pass!" 
<< std::endl; // clean-up delete dc; diff --git a/include/triton/ir/enums.h b/include/triton/ir/enums.h new file mode 100644 index 000000000..600c83ade --- /dev/null +++ b/include/triton/ir/enums.h @@ -0,0 +1,84 @@ +#ifndef TRITON_IR_ENUMS_H +#define TRITON_IR_ENUMS_H + +namespace triton{ +namespace ir{ + + +enum binary_op_t { + Add, + FAdd, + Sub, + FSub, + Mul, + FMul, + UDiv, + SDiv, + FDiv, + URem, + SRem, + FRem, + Shl, + LShr, + AShr, + And, + Or, + Xor +}; + +enum cast_op_t { + Trunc, + ZExt, + SExt, + FPTrunc, + FPExt, + UIToFP, + SIToFP, + FPToUI, + FPToSI, + PtrToInt, + IntToPtr, + BitCast, + AddrSpaceCast +}; + +enum cmp_pred_t { + FIRST_FCMP_PREDICATE, + FCMP_FALSE, + FCMP_OEQ, + FCMP_OGT, + FCMP_OGE, + FCMP_OLT, + FCMP_OLE, + FCMP_ONE, + FCMP_ORD, + FCMP_UNO, + FCMP_UEQ, + FCMP_UGT, + FCMP_UGE, + FCMP_ULT, + FCMP_ULE, + FCMP_UNE, + FCMP_TRUE, + LAST_FCMP_PREDICATE, + FIRST_ICMP_PREDICATE, + ICMP_EQ, + ICMP_NE, + ICMP_UGT, + ICMP_UGE, + ICMP_ULT, + ICMP_ULE, + ICMP_SGT, + ICMP_SGE, + ICMP_SLT, + ICMP_SLE, + LAST_ICMP_PREDICATE +}; + + + + +} +} + +#endif diff --git a/python/examples/dot.py b/python/examples/dot.py index 52c7e0a2e..29d6f9470 100644 --- a/python/examples/dot.py +++ b/python/examples/dot.py @@ -4,6 +4,7 @@ import distutils import distutils.log import setuptools.command.build_ext import setuptools +import numpy as np import os src = """ @@ -45,23 +46,25 @@ void matmul(restrict read_only align(16) half *A, } """ +extra_ops = tf.load_op_library('/home/philippe/development/triton/python/build/lib.linux-x86_64-3.6/libextra_tf_ops.so') + + with open('test.cpp', 'w+') as test: src = libtriton.make_tensorflow_src(src, [2], '(M + #TM - 1)/#TM, (N + #TN - 1)/#TN, 1') test.writelines(src) triton_include_dirs = ['/home/philippe/development/triton/include'] tensorflow_include_dirs = [tf.sysconfig.get_include()] -llvm_include_dirs = ['/usr/include/llvm-8/', '/usr/include/llvm-c-8/'] cuda_include_dirs = ['/usr/local/cuda-10.1/targets/x86_64-linux/include/'] -triton_library_dirs = [os.path.realpath(libtriton.__file__)] +triton_library_dirs = [os.path.realpath(os.path.join(libtriton.__file__, os.path.pardir))] tensorflow_library_dirs = [tf.sysconfig.get_lib()] include_dirs = triton_include_dirs + tensorflow_include_dirs + cuda_include_dirs extra_compile_args = [] extra_link_args = [] -library_dirs = tensorflow_library_dirs -libraries = ['tensorflow_framework'] +library_dirs = triton_library_dirs + tensorflow_library_dirs +libraries = ['tensorflow_framework', 'triton'] ext = setuptools.Extension( name = 'test', @@ -92,4 +95,46 @@ args = dict( setuptools.setup(**args) library_dir = os.path.dirname(os.path.realpath(__file__)) module = tf.load_op_library(os.path.join(library_dir, 'build/lib.linux-x86_64-3.6/test.cpython-36m-x86_64-linux-gnu.so')) -print(module.matmul) \ No newline at end of file + +class dot: + + def __init__(self): + trans_a = True + trans_b = False + + def __call__(self, a, b): + shape_a = tf.shape(a) + shape_b = tf.shape(b) + M = shape_a[0] + K = shape_a[1] + N = shape_b[0] + lda = M + ldb = K + ldc = M + c = extra_ops.alloc_empty(tf.stack([M, N])) + return module.matmul(a, b, c, M, N, K, lda, ldb, ldc) + +dot_nt = dot() +def run_dot(): + M, N, K = 128, 128, 128 + a = tf.placeholder(tf.float16, shape=[M, K]) + b = tf.placeholder(tf.float16, shape=[N, K]) + # c = tf.matmul(a, b, transpose_a=True) + c = dot_nt(a, b) + # Reference + ha = np.random.rand(M, K).astype(np.float16) + hb = np.random.rand(N, K).astype(np.float16) + # Run + sess = tf.InteractiveSession() + 
sess.run(tf.global_variables_initializer()) + result = sess.run([c], feed_dict = {a: ha, + b: hb})[0] + # Test + hresult = np.dot(ha.T, hb).T + dif = np.abs(result - hresult) + np.savetxt('dif.dat', dif, '%2.4f') + print(hresult) + print(result) + print("dif: %f" % np.max(dif)) + +run_dot() \ No newline at end of file diff --git a/python/setup.py b/python/setup.py index 057362b0f..3d98218ac 100644 --- a/python/setup.py +++ b/python/setup.py @@ -35,12 +35,22 @@ class CMakeBuild(build_ext): def build_extension(self, ext): extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) + # python directors python_include_dirs = distutils.sysconfig.get_python_inc() python_lib_dirs = distutils.sysconfig.get_config_var('LIBDIR') + # tensorflow directories + import tensorflow as tf + tf_include_dirs = tf.sysconfig.get_include() + tf_lib_dirs = tf.sysconfig.get_lib() + tf_libs = 'tensorflow_framework' + cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir, '-DBUILD_EXAMPLES=OFF', '-DBUILD_PYTHON_MODULE=ON', - '-DPYTHON_INCLUDE_DIRS=' + python_include_dirs] + '-DPYTHON_INCLUDE_DIRS=' + python_include_dirs, + '-DTF_INCLUDE_DIRS=' + tf_include_dirs, + '-DTF_LIB_DIRS=' + tf_lib_dirs, + '-DTF_LIBS=' + tf_libs] cfg = 'Debug' if self.debug else 'Release' build_args = ['--config', cfg] diff --git a/python/src/tensorflow.cpp b/python/src/tensorflow.cpp index 12e64fa4f..c1c224916 100644 --- a/python/src/tensorflow.cpp +++ b/python/src/tensorflow.cpp @@ -161,7 +161,7 @@ result += R"( // extract outputs)"; for(unsigned i = 0; i < n_outputs; i++) result += R"( - context->set_output()" + str_i[i] + ", " + arg_names[outputs[i]] + ");"; + context->set_output()" + str_i[i] + ", " + arg_names[outputs[i]] + ");"; result += R"( @@ -201,15 +201,26 @@ private: rt::function fn_; }; -REGISTER_KERNEL_BUILDER(Name(")" + name + "\").Device(DEVICE_GPU), " + classname + R"(); +REGISTER_KERNEL_BUILDER(Name(")" + name + "\").Device(DEVICE_GPU)"; +for(size_t i = 0; i < tf_scalar_tys.size(); i++){ + std::string arg_name = arg_names[i]; + std::transform(arg_name.begin(), arg_name.end(), arg_name.begin(), [](char c) { return std::tolower(c);}); + if(!fn_ty->get_param_ty(i)->is_pointer_ty()) + result += ".HostMemory(\"" + arg_name + "\")"; +} +result += ", " + classname + R"(); + REGISTER_OP(")" + name + "\")\n"; for(size_t i = 0; i < tf_scalar_tys.size(); i++){ bool is_output = std::find(outputs.begin(), outputs.end(), i) != outputs.end(); - std::string mode = is_output ? "Output" : "Input" ; + std::string mode = is_output ? "Input" : "Input" ; std::string arg_name = arg_names[i]; std::transform(arg_name.begin(), arg_name.end(), arg_name.begin(), [](char c) { return std::tolower(c);}); - result += " ." 
+ mode + "(\"" + arg_name + ": " + tf_scalar_tys[i] + "\")\n"; + result += " .Input(\"" + arg_name + ": " + tf_scalar_tys[i] + "\")\n"; +} +for(size_t i = 0; i < outputs.size(); i++){ + result += " .Output(\"out: " + tf_scalar_tys[outputs[i]] + "\")\n"; } result += ";\n"; diff --git a/python/src/tensorflow/alloc_empty.cpp b/python/src/tensorflow/alloc_empty.cpp new file mode 100644 index 000000000..e60e8436c --- /dev/null +++ b/python/src/tensorflow/alloc_empty.cpp @@ -0,0 +1,30 @@ +#include "tensorflow/core/framework/op_kernel.h" + +using namespace tensorflow; + +class AllocEmptyOp : public OpKernel { + public: + explicit AllocEmptyOp(OpKernelConstruction* context) : OpKernel(context) {} + + void Compute(OpKernelContext* context) override { + // fetch input + const Tensor& x = context->input(0); + const int32* x_data = (const int32*)x.tensor_data().data(); + // allocate output + Tensor* y = NULL; + int32 x_rank = x.dims(); + OP_REQUIRES(context, x_rank == 1, errors::InvalidArgument("Input tensor must be 1D")); + int32 y_rank = x.dim_size(0); + TensorShape y_shapes; + for(size_t i = 0; i < y_rank; i++) + y_shapes.AddDim(x_data[i]); + OP_REQUIRES_OK(context, context->allocate_output(0, y_shapes, &y)); + } +}; + + +REGISTER_KERNEL_BUILDER(Name("AllocEmpty").HostMemory("x").Device(DEVICE_CPU).Device(DEVICE_GPU), AllocEmptyOp); +REGISTER_OP("AllocEmpty") + .Input("x: int32") + .Output("y: float16") +; diff --git a/python/triton/tools/build.py b/python/triton/tools/build.py new file mode 100644 index 000000000..e69de29bb diff --git a/python/triton/tools/checksum.py b/python/triton/tools/checksum.py new file mode 100644 index 000000000..e69de29bb From 078f0052fed759729dbe8afeddb434660ace5e7d Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 17 Aug 2019 16:12:17 -0700 Subject: [PATCH 299/494] more cleaning --- python/examples/dot.py | 122 +++++++++++++++++++++++++------------- python/src/tensorflow.cpp | 30 +++++++--- 2 files changed, 103 insertions(+), 49 deletions(-) diff --git a/python/examples/dot.py b/python/examples/dot.py index 29d6f9470..6c79e846c 100644 --- a/python/examples/dot.py +++ b/python/examples/dot.py @@ -6,6 +6,9 @@ import setuptools.command.build_ext import setuptools import numpy as np import os +import tempfile +import shutil +import hashlib src = """ const tunable int TM = {128}; @@ -46,55 +49,94 @@ void matmul(restrict read_only align(16) half *A, } """ + extra_ops = tf.load_op_library('/home/philippe/development/triton/python/build/lib.linux-x86_64-3.6/libextra_tf_ops.so') -with open('test.cpp', 'w+') as test: - src = libtriton.make_tensorflow_src(src, [2], '(M + #TM - 1)/#TM, (N + #TN - 1)/#TN, 1') - test.writelines(src) +def make_bindings(src, outputs, grids): + return libtriton.make_tensorflow_src(src, outputs, grids) -triton_include_dirs = ['/home/philippe/development/triton/include'] -tensorflow_include_dirs = [tf.sysconfig.get_include()] -cuda_include_dirs = ['/usr/local/cuda-10.1/targets/x86_64-linux/include/'] +def make_cache_path(src): + md5 = hashlib.sha1(src.encode()) + hexhash = md5.hexdigest() + home = os.path.expanduser('~') + cacheroot = os.path.join(home, '.triton', 'cache') + cachepath = os.path.join(cacheroot, str(hexhash)) + if not os.path.exists(cachepath): + os.makedirs(cachepath) + print(cachepath) + return cachepath -triton_library_dirs = [os.path.realpath(os.path.join(libtriton.__file__, os.path.pardir))] -tensorflow_library_dirs = [tf.sysconfig.get_lib()] +def write_bindings(src, root): + cpp = os.path.join(root, 'tensorflow.cpp') + so = 
os.path.join(root, 'tensorflow.so') + recompile = False + # recompile if .so does not exist + if not os.path.exists(cpp) or not os.path.exists(so): + recompile = True + # recompile if cpp was modified after .so + elif max(cpp, so, key=os.path.getctime) == cpp: + recompile = True + # write cpp file + if recompile: + with open(cpp, 'w+') as handle: + handle.writelines(src) + # return path of cpp file + return cpp + +def build(src, path): + # include directories + triton_include_dirs = ['/home/philippe/development/triton/include'] + tensorflow_include_dirs = [tf.sysconfig.get_include()] + cuda_include_dirs = ['/usr/local/cuda-10.1/targets/x86_64-linux/include/'] + include_dirs = triton_include_dirs + tensorflow_include_dirs + cuda_include_dirs + # library directories + triton_library_dirs = [os.path.realpath(os.path.join(libtriton.__file__, os.path.pardir))] + tensorflow_library_dirs = [tf.sysconfig.get_lib()] + library_dirs = triton_library_dirs + tensorflow_library_dirs + # libraries + libraries = ['tensorflow_framework', 'triton'] + # extra arguments + extra_compile_args = [] + extra_link_args = [] + # create extension module + ext = setuptools.Extension( + name = 'test', + language = 'c++', + sources = [src], + include_dirs = include_dirs, + extra_compile_args = extra_compile_args, + extra_link_args = extra_link_args, + library_dirs = library_dirs, + libraries = libraries + ) + # build extension module + args = ['build_ext'] + tmp = tempfile.mkdtemp() + args.append('--build-temp=' + tmp) + args.append('--build-lib=' + path) + args.append('-q') + args = dict( + name = 'test', + ext_modules = [ext], + script_args = args, + ) + setuptools.setup(**args) + shutil.rmtree(tmp) -include_dirs = triton_include_dirs + tensorflow_include_dirs + cuda_include_dirs -extra_compile_args = [] -extra_link_args = [] -library_dirs = triton_library_dirs + tensorflow_library_dirs -libraries = ['tensorflow_framework', 'triton'] +def make_tensorflow_op(src, outputs, grids): + bindings = make_bindings(src, outputs, grids) + cache_path = make_cache_path(bindings) + cpp = write_bindings(bindings, cache_path) + build(cpp, cache_path) + result = tf.load_op_library(os.path.join(cache_path, 'test.cpython-36m-x86_64-linux-gnu.so')) + return result -ext = setuptools.Extension( - name = 'test', - language = 'c++', - sources = ['/home/philippe/development/triton/python/examples/test.cpp'], - include_dirs = include_dirs, - extra_compile_args = extra_compile_args, - extra_link_args = extra_link_args, - library_dirs = library_dirs, - libraries = libraries -) -build_path = '.' 
-args = ['build_ext'] -#args.append('--build-temp=' + build_path) -#args.append('--build-lib=' + build_path) -args.append('-q') -args = dict( - name = 'test', - ext_modules = [ext], - script_args = args, - cmdclass = { - 'build_ext': setuptools.command.build_ext.build_ext - } - -) - -setuptools.setup(**args) library_dir = os.path.dirname(os.path.realpath(__file__)) -module = tf.load_op_library(os.path.join(library_dir, 'build/lib.linux-x86_64-3.6/test.cpython-36m-x86_64-linux-gnu.so')) +module = make_tensorflow_op(src, ['C'], ['(M + #TM - 1)/#TM', '(N + #TN - 1)/#TN']) +print(module.matmul) + class dot: diff --git a/python/src/tensorflow.cpp b/python/src/tensorflow.cpp index c1c224916..40810fc75 100644 --- a/python/src/tensorflow.cpp +++ b/python/src/tensorflow.cpp @@ -75,8 +75,8 @@ inline std::unique_ptr make_ir(ir::context& ctx, triton::lang::trans } std::string make_tensorflow_src(const std::string src, - const std::vector& outputs, - const std::string& macro) { + const std::vector& outputs, + const std::vector& macros) { triton::lang::translation_unit *ast = make_ast(src.c_str()); triton::ir::context context; std::unique_ptr ir = make_ir(context, ast); @@ -108,7 +108,12 @@ std::string make_tensorflow_src(const std::string src, std::transform(fn_ty->params_begin(), fn_ty->params_end(), std::back_inserter(tf_scalar_tys), to_tf_scalar_ty); std::vector tf_cref_tys; std::transform(fn_ty->params_begin(), fn_ty->params_end(), std::back_inserter(tf_cref_tys), ref_to_tf_ty); - + // output indices + std::vector out_idx; + for(const std::string &name : outputs){ + auto it = std::find(arg_names.begin(), arg_names.end(), name); + out_idx.push_back(std::distance(arg_names.begin(), it)); + } std::ostringstream oss; std::string result = R"( @@ -161,7 +166,7 @@ result += R"( // extract outputs)"; for(unsigned i = 0; i < n_outputs; i++) result += R"( - context->set_output()" + str_i[i] + ", " + arg_names[outputs[i]] + ");"; + context->set_output()" + str_i[i] + ", " + outputs[i] + ");"; result += R"( @@ -172,12 +177,21 @@ result += R"( std::regex regex("#([a-zA-Z]([a-zA-Z]|[0-9])*)"); -std::string grid_str = std::regex_replace(macro, regex, "x.at(\"$1\")"); +std::vector grids; +for(size_t i = macros.size(); i < 3; i++) + grids.push_back("1"); +std::string grid = "rt::grid_t{"; +for(size_t i = 0; i < grids.size(); i++){ + if(i > 0) + grid += ", "; + grid += std::regex_replace(grids[i], regex, "x.at(\"$1\")"); +} +grid += "}"; result += R"( // create launch grid; - auto grid = [&](const rt::params_t& x) { return rt::grid_t{)" + grid_str + R"(}; };)"; + auto grid = [&](const rt::params_t& x) { return )" + grid + R"(; };)"; result += R"( @@ -213,14 +227,12 @@ result += ", " + classname + R"(); REGISTER_OP(")" + name + "\")\n"; for(size_t i = 0; i < tf_scalar_tys.size(); i++){ - bool is_output = std::find(outputs.begin(), outputs.end(), i) != outputs.end(); - std::string mode = is_output ? 
"Input" : "Input" ; std::string arg_name = arg_names[i]; std::transform(arg_name.begin(), arg_name.end(), arg_name.begin(), [](char c) { return std::tolower(c);}); result += " .Input(\"" + arg_name + ": " + tf_scalar_tys[i] + "\")\n"; } for(size_t i = 0; i < outputs.size(); i++){ - result += " .Output(\"out: " + tf_scalar_tys[outputs[i]] + "\")\n"; + result += " .Output(\"out" + std::to_string(i) + ": " + tf_scalar_tys[out_idx[i]] + "\")\n"; } result += ";\n"; From b4a9ed9663bccfe5a6af35392bbae8be0cb56130 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 17 Aug 2019 18:18:26 -0700 Subject: [PATCH 300/494] [python] added basic tensorflow support --- examples/cpp/cuda.h | 160 +++++++++++++++++++ examples/cpp/dot.cpp | 4 +- include/triton/codegen/selection/selection.h | 2 +- include/triton/ir/builder.h | 2 +- include/triton/ir/instructions.h | 24 +-- include/triton/lang/expression.h | 4 +- include/triton/lang/parser.y | 4 +- include/triton/lang/scanner.l | 2 +- lib/codegen/analysis/alignment.cpp | 4 +- lib/codegen/selection/selection.cpp | 10 +- lib/codegen/transform/reassociate.cpp | 6 +- lib/dnn/batchnorm.cpp | 4 +- lib/dnn/blocksparse/dot.cpp | 6 +- lib/dnn/conv.cpp | 4 +- lib/dnn/dot.cpp | 4 +- lib/dnn/shift.cpp | 6 +- lib/ir/builder.cpp | 4 +- lib/ir/instructions.cpp | 28 ++-- lib/lang/expression.cpp | 6 +- python/examples/dot.py | 120 ++------------ python/setup.py | 2 + python/src/tensorflow.cpp | 4 +- python/triton/__init__.py | 1 + python/triton/ops.py | 103 ++++++++++++ 24 files changed, 341 insertions(+), 173 deletions(-) create mode 100644 examples/cpp/cuda.h create mode 100644 python/triton/__init__.py create mode 100644 python/triton/ops.py diff --git a/examples/cpp/cuda.h b/examples/cpp/cuda.h new file mode 100644 index 000000000..5f03870f5 --- /dev/null +++ b/examples/cpp/cuda.h @@ -0,0 +1,160 @@ +/* Copyright 2015-2017 Philippe Tillet +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, +* subject to the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ + +#include +#include +#include +#include "cublas_v2.h" +#include "triton/driver/buffer.h" +#include "triton/driver/stream.h" +#include "triton/driver/context.h" +#include "triton/tools/bench.hpp" + +enum cublasStrategy_t{ + CUBLAS_PREFER_FASTEST, + CUBLAS_HEURISTICS +}; + +enum DType{ + HALF_TYPE, + FLOAT_TYPE, + DOUBLE_TYPE, +}; + +inline size_t size_of(DType dtype){ + switch (dtype) { + case HALF_TYPE: return 2; + case FLOAT_TYPE: return 4; + case DOUBLE_TYPE: return 8; + default: throw; + } +} + +std::vector gather_all_algos() { + std::vector result; + // non-tensor ops + for(int i = -1; i < 24; i++) + result.push_back((cublasGemmAlgo_t)i); + // tensor ops + for(int i = 99; i < 116; i++) + result.push_back((cublasGemmAlgo_t)i); + return result; +} + +static const std::vector algorithms = gather_all_algos(); + +static const std::map cu_dtype = { + {HALF_TYPE, CUDA_R_16F}, + {FLOAT_TYPE, CUDA_R_32F}, + {DOUBLE_TYPE, CUDA_R_64F} +}; + +static const std::map cu_op = { + {false, CUBLAS_OP_N}, + {true, CUBLAS_OP_T} +}; + +inline cublasGemmAlgo_t cublasGemmFastest( + triton::driver::stream* stream, + cublasHandle_t handle, cudaDataType cudt, + cublasOperation_t AT, cublasOperation_t BT, + int32_t M, int32_t N, int32_t K, + void* alpha, CUdeviceptr A, int32_t lda, CUdeviceptr B, int32_t ldb, + void* beta, CUdeviceptr C, int32_t ldc) { + + // cache to avoid re-benchmarking + typedef std::tuple key_t; + static std::map cache; + key_t key(cudt, AT, BT, M, N, K); + // benchmark algorithms if necessary + if(cache.find(key) == cache.end()){ + std::vector times; + for(cublasGemmAlgo_t a: algorithms) { + cublasStatus_t status; + double nanosec = triton::tools::bench([&](){ status = cublasGemmEx(handle, AT, BT, + M, N, K, + alpha, (const void*)A, cudt, lda, + (const void*)B, cudt, ldb, + beta, (void*)C, cudt, ldc, cudt, + a); }, stream); + if(status != CUBLAS_STATUS_SUCCESS) + nanosec = INFINITY; + } + size_t argmin = std::min_element(times.begin(), times.end()) - times.begin(); + assert(times[argmin] != INFINITY); + cache.insert({key, algorithms[argmin]}); + } + + // return best algorithm + return cache.at(key); +} + +/* Wrapper for cublasGemmEx */ +inline cublasStatus_t cublasGemmEx(cublasHandle_t handle, cudaDataType cudt, cublasOperation_t AT, cublasOperation_t BT, int32_t M, int32_t N, int32_t K, + void* alpha, CUdeviceptr A, int32_t lda, CUdeviceptr B, int32_t ldb, + void* beta, CUdeviceptr C, int32_t ldc, cublasGemmAlgo_t algo) +{ + cublasStatus_t status = cublasGemmEx(handle, AT, BT, M, N, K, alpha, (const void*)A, cudt, lda, (const void*)B, cudt, ldb, beta, (void*)C, cudt, ldc, cudt, algo); + if(status != CUBLAS_STATUS_SUCCESS){ + std::cout << status; + exit(EXIT_FAILURE); + } +} + + +/* Get cuBLAS handle */ +cublasHandle_t cublasGetHandle(triton::driver::stream* stream) { + static std::map cache; + CUstream key = *stream->cu(); + + // create handle if necessary + if(cache.find(key) == cache.end()) { + cublasHandle_t handle; + if(cublasCreate_v2(&handle) != CUBLAS_STATUS_SUCCESS) + throw std::runtime_error("Error: could not create cuBLAS handle"); + cublasSetStream_v2(handle, key); + cache.insert({key, handle}); + } + + // return handle for the stream + return cache.at(key); +} + +/* Simplified API for default GEMM */ +inline void cublasGemm(DType dtype, triton::driver::stream* stream, bool AT, bool BT, + int32_t M, int32_t N, int32_t K, + void* alpha, triton::driver::buffer* A, int32_t lda, + triton::driver::buffer* B, int32_t ldb, + void* beta, triton::driver::buffer* C, int32_t ldc, + 
cublasGemmAlgo_t* fastest = NULL, cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT) { + triton::driver::cu_context::context_switcher scope(*stream->context()); + static cublasHandle_t handle = cublasGetHandle(stream); + if(dtype == HALF_TYPE) + cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH); + cublasStatus_t status; + if(fastest) + *fastest = cublasGemmFastest(stream, handle, cu_dtype.at(dtype), cu_op.at(AT), cu_op.at(BT), M, N, K, alpha, *A->cu(), lda, *B->cu(), ldb, beta, *C->cu(), ldc); + else + status = cublasGemmEx(handle, cu_dtype.at(dtype), cu_op.at(AT), cu_op.at(BT), M, N, K, alpha, *A->cu(), lda, *B->cu(), ldb, beta, *C->cu(), ldc, algo); +} diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 90287f719..e592da570 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -88,8 +88,8 @@ void matmul(restrict read_only align(16) )" + a_ty + R"( *A, restrict read_only align(16) )" + c_ty + R"( *C, int M, int N, int K, )" + align_lda_str + R"( int lda, )" + align_ldb_str + R"(" int ldb, int ldc) { - int ridx = get_range_id(0); - int ridy = get_range_id(1); + int ridx = get_program_id(0); + int ridy = get_program_id(1); int rxa[TM] = ridx * TM + (0 ... TM); int ryb[TN] = ridy * TN + (0 ... TN); int rka[TK] = 0 ... TK; diff --git a/include/triton/codegen/selection/selection.h b/include/triton/codegen/selection/selection.h index 3b871dce0..433633cff 100644 --- a/include/triton/codegen/selection/selection.h +++ b/include/triton/codegen/selection/selection.h @@ -169,7 +169,7 @@ private: void lower_store(ir::store_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); void lower_downcast(ir::downcast_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); void lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); - void lower_dynamic_range_idx(ir::nv_dynamic_range_idx_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); + void lower_dynamic_program_idx(ir::nv_dynamic_program_idx_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); void lower_reshape(ir::reshape_inst* x, LLVMContext &ctx, Function *fn, Builder &builder); void lower_splat(ir::splat_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); void lower_broadcast(ir::broadcast_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); diff --git a/include/triton/ir/builder.h b/include/triton/ir/builder.h index a104cc6b4..4f5f4f45b 100644 --- a/include/triton/ir/builder.h +++ b/include/triton/ir/builder.h @@ -126,7 +126,7 @@ public: value *create_reshape(value *arg, const type::tile_shapes_t &shapes, const std::string &name = ""); value *create_broadcast(value *arg, const type::tile_shapes_t &shapes, const std::string &name = ""); // Built-in instruction - value *create_get_range_id(unsigned axis, const std::string &name = ""); + value *create_get_program_id(unsigned axis, const std::string &name = ""); value *create_get_num_program(unsigned axis, const std::string &name = ""); value *create_atomic_cas(value *ptr, value *cmp, value *val, const std::string &name = ""); value *create_atomic_exch(value *ptr, value *val, const std::string &name = ""); diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index e9791e2a1..446dd871b 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -496,10 +496,10 @@ protected: using instruction::instruction; }; -class get_range_id_inst: public builtin_inst { +class get_program_id_inst: public builtin_inst { private: - get_range_id_inst(type *ty, unsigned axis, const 
std::string &name, instruction *next); - std::string repr_impl() const { return "get_range_id(" + std::to_string(axis_) + ")"; } + get_program_id_inst(type *ty, unsigned axis, const std::string &name, instruction *next); + std::string repr_impl() const { return "get_program_id(" + std::to_string(axis_) + ")"; } public: static instruction* create(context &ctx, unsigned axis, const std::string &name = "", instruction *next = nullptr); @@ -668,23 +668,23 @@ public: }; // On NVIDIA, implementation is such that -// constant_range = nv_dynamic_range_idx + nv_static_range_idx -// so as to enable re-association on nv_static_range_idx which is constant -class nv_dynamic_range_idx_inst: public instruction { +// constant_range = nv_dynamic_program_idx + nv_static_program_idx +// so as to enable re-association on nv_static_program_idx which is constant +class nv_dynamic_program_idx_inst: public instruction { private: - nv_dynamic_range_idx_inst(type *ty, const std::string &name, instruction *next); - std::string repr_impl() const { return "nv_dynamic_range_idx"; } + nv_dynamic_program_idx_inst(type *ty, const std::string &name, instruction *next); + std::string repr_impl() const { return "nv_dynamic_program_idx"; } public: - static nv_dynamic_range_idx_inst* create(type *ty, const std::string &name = "", instruction *next = nullptr); + static nv_dynamic_program_idx_inst* create(type *ty, const std::string &name = "", instruction *next = nullptr); }; -class nv_static_range_idx: public constant { +class nv_static_program_idx: public constant { private: - nv_static_range_idx(constant_range *range); + nv_static_program_idx(constant_range *range); public: - static nv_static_range_idx *get(constant_range* range); + static nv_static_program_idx *get(constant_range* range); constant_range* get_range() const; private: diff --git a/include/triton/lang/expression.h b/include/triton/lang/expression.h index 6823e8988..9d65de5c0 100644 --- a/include/triton/lang/expression.h +++ b/include/triton/lang/expression.h @@ -71,9 +71,9 @@ private: const constant* size_; }; -class get_range_id_expression: public builtin_expression{ +class get_program_id_expression: public builtin_expression{ public: - get_range_id_expression(node *axis): axis_((constant*)axis) { } + get_program_id_expression(node *axis): axis_((constant*)axis) { } ir::value* codegen(ir::module *) const; private: diff --git a/include/triton/lang/parser.y b/include/triton/lang/parser.y index c44a619e8..d67a89562 100644 --- a/include/triton/lang/parser.y +++ b/include/triton/lang/parser.y @@ -55,7 +55,7 @@ STORAGE_SPEC_T get_storage_spec(node *op) { return ((token*)op)->storage_spec;} %token VOID UINT1 UINT8 UINT16 UINT32 UINT64 INT1 INT8 INT16 INT32 INT64 FP16 FP32 FP64 %token IF ELSE FOR CONTINUE WHILE %token NEWAXIS ELLIPSIS AT -%token GET_NUM_PROGRAM GET_RANGE_ID DOT SQRT REDUCE_SUM TRANS MAX MIN SELECT ATOMIC_CAS ATOMIC_EXCH ATOMIC_ADD ALLOC_CONST RESHAPE +%token GET_NUM_PROGRAM GET_PROGRAM_ID DOT SQRT REDUCE_SUM TRANS MAX MIN SELECT ATOMIC_CAS ATOMIC_EXCH ATOMIC_ADD ALLOC_CONST RESHAPE %start translation_unit %% @@ -120,7 +120,7 @@ identifier /* Built-in */ builtin_expression - : GET_RANGE_ID '(' constant ')' { $$ = new get_range_id_expression($3); } + : GET_PROGRAM_ID '(' constant ')' { $$ = new get_program_id_expression($3); } | GET_NUM_PROGRAM '(' constant ')' { $$ = new get_num_program_expression($3); } | DOT '(' expression ',' expression ',' expression ')' { $$ = new matmul_expression($3, $5, $7); } | SQRT '(' expression ')' { $$ = new 
sqrt_expression($3); } diff --git a/include/triton/lang/scanner.l b/include/triton/lang/scanner.l index 1aaf40a57..6062a51ad 100644 --- a/include/triton/lang/scanner.l +++ b/include/triton/lang/scanner.l @@ -43,7 +43,7 @@ using triton::lang::return_void; "float" { return return_impl(FP32, yytext); } "double" { return return_impl(FP64, yytext); } "..." { return return_impl(ELLIPSIS, yytext); } -"get_range_id" { return return_impl(GET_RANGE_ID, yytext); } +"get_program_id" { return return_impl(GET_PROGRAM_ID, yytext); } "get_num_program" { return return_impl(GET_NUM_PROGRAM, yytext); } "__atomic_cas" { return return_impl(ATOMIC_CAS, yytext); } "__atomic_exch" { return return_impl(ATOMIC_EXCH, yytext); } diff --git a/lib/codegen/analysis/alignment.cpp b/lib/codegen/analysis/alignment.cpp index 3ed74f7a3..a602c87ca 100644 --- a/lib/codegen/analysis/alignment.cpp +++ b/lib/codegen/analysis/alignment.cpp @@ -227,10 +227,10 @@ unsigned alignment_info::populate_starting_multiple(ir::value *v){ if(auto *x = dynamic_cast(v)){ return cache(x->get_first()->get_value()); } - if(auto *x = dynamic_cast(v)){ + if(auto *x = dynamic_cast(v)){ return cache(128); } - if(auto *x = dynamic_cast(v)){ + if(auto *x = dynamic_cast(v)){ return cache(x->get_range()->get_first()->get_value()); } if(auto *x = dynamic_cast(v)){ diff --git a/lib/codegen/selection/selection.cpp b/lib/codegen/selection/selection.cpp index 0ca17f9e0..166b423bb 100644 --- a/lib/codegen/selection/selection.cpp +++ b/lib/codegen/selection/selection.cpp @@ -411,7 +411,7 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::functionget_operand(2)); return builder.Insert(SelectInst::Create(pred, if_value, else_value)); } - if(ir::get_range_id_inst* ii = dynamic_cast(inst)){ + if(ir::get_program_id_inst* ii = dynamic_cast(inst)){ Value *result = tgt_->get_block_id(builder.GetInsertBlock()->getModule(), builder, ii->get_axis()); return (Instruction*)result; } @@ -837,7 +837,7 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, T->set_value(idx, idx[0]); }); } - if(is_inserted && dynamic_cast(v)){ + if(is_inserted && dynamic_cast(v)){ T->for_each([&](indices_t idx){ assert(idx.size() == 1); BinaryOperator *bin_add = dyn_cast(idx[0]); @@ -996,7 +996,7 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, } } -void selection::lower_dynamic_range_idx(ir::nv_dynamic_range_idx_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { +void selection::lower_dynamic_program_idx(ir::nv_dynamic_program_idx_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { distributed_tile* result = (distributed_tile*)tmap_.at(x); result->for_each([&](indices_t idx){ assert(idx.size() == 1); @@ -1418,8 +1418,8 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & lower_downcast(x, ctx, fn, builder); else if(auto *x = dynamic_cast(ins)) lower_reduce(x, ctx, fn, builder); - else if(auto *x = dynamic_cast(ins)) - lower_dynamic_range_idx(x, ctx, fn, builder); + else if(auto *x = dynamic_cast(ins)) + lower_dynamic_program_idx(x, ctx, fn, builder); else if(auto *x = dynamic_cast(ins)) lower_reshape(x, ctx, fn, builder); else if(auto *x = dynamic_cast(ins)) diff --git a/lib/codegen/transform/reassociate.cpp b/lib/codegen/transform/reassociate.cpp index 6893a7a10..c411ccf12 100644 --- a/lib/codegen/transform/reassociate.cpp +++ b/lib/codegen/transform/reassociate.cpp @@ -164,7 +164,7 @@ reassociate::reassociate(analysis::tune* params) void reassociate::run(ir::module &mod) { 
diff --git a/lib/codegen/transform/reassociate.cpp b/lib/codegen/transform/reassociate.cpp index 6893a7a10..c411ccf12 100644 --- a/lib/codegen/transform/reassociate.cpp +++ b/lib/codegen/transform/reassociate.cpp @@ -164,7 +164,7 @@ reassociate::reassociate(analysis::tune* params) void reassociate::run(ir::module &mod) { ir::builder &builder = mod.get_builder(); - // constant_range -> nv_dynamic_range_idx + nv_static_range_idx + // constant_range -> nv_dynamic_program_idx + nv_static_program_idx for(ir::function *fn: mod.get_function_list()){ std::vector<ir::constant_range*> ranges; std::vector<ir::basic_block*> rpo = ir::cfg::reverse_post_order(fn); @@ -178,8 +178,8 @@ void reassociate::run(ir::module &mod) { builder.set_insert_point(rpo.front()->get_first_non_phi()); for(ir::constant_range* old_range: ranges){ - ir::value* dyn_range = builder.insert(ir::nv_dynamic_range_idx_inst::create(old_range->get_type())); - ir::value* static_range = ir::nv_static_range_idx::get(old_range); + ir::value* dyn_range = builder.insert(ir::nv_dynamic_program_idx_inst::create(old_range->get_type())); + ir::value* static_range = ir::nv_static_program_idx::get(old_range); ir::value* new_range = builder.create_add(dyn_range, static_range); old_range->replace_all_uses_with(new_range); params_->copy(dyn_range, old_range); diff --git a/lib/dnn/batchnorm.cpp b/lib/dnn/batchnorm.cpp index e5143755e..fe785afdd 100644 --- a/lib/dnn/batchnorm.cpp +++ b/lib/dnn/batchnorm.cpp @@ -82,7 +82,7 @@ void batchnorm_forward(float *Y, float *M, float *V, int rx[TM] = 0 ... TM; float *px[TM]; float x[TM] = 0; - int c = get_range_id(1); + int c = get_program_id(1); float g = *(G + c); float b = *(B + c); @@ -177,7 +177,7 @@ void batchnorm_backward(float *DX, float *DG, float *DB, restrict read_only float *V, int DHWN, float rcpDHWN, float epsilon) { int rx[TM] = 0 ... TM; - int c = get_range_id(1); + int c = get_program_id(1); int offset = c*DHWN; float g = *(G + c); float mean = *(M + c); diff --git a/lib/dnn/blocksparse/dot.cpp b/lib/dnn/blocksparse/dot.cpp index 97823e309..b155f9c89 100644 --- a/lib/dnn/blocksparse/dot.cpp +++ b/lib/dnn/blocksparse/dot.cpp @@ -114,11 +114,11 @@ std::string dot::triton_c_src_ydx() const { int lda, int ldb, int ldc, int N, int* lut, int* locks, int nlocks) { - int ridx = get_range_id(0); + int ridx = get_program_id(0); float acc[TM, TN] = 0; int rka[TK] = 0 ... TK; int rkb[TK] = 0 ... TK; - int *header = lut + get_range_id(1) * 4; + int *header = lut + get_program_id(1) * 4; int offset = *(header + 0); int K = *(header + 1); int column = *(header + 2); @@ -191,7 +191,7 @@ std::string dot::triton_c_src_dw() const { int lda, int ldb, int ldc, int N, int* lut, int* locks, int nlocks) { - int ridx = get_range_id(0); + int ridx = get_program_id(0); float acc[TM, TN] = 0; int rka[TK] = 0 ... TK; int rkb[TK] = 0 ... 
TK; diff --git a/lib/dnn/conv.cpp b/lib/dnn/conv.cpp index 63503c70f..381691ff0 100644 --- a/lib/dnn/conv.cpp +++ b/lib/dnn/conv.cpp @@ -686,8 +686,8 @@ if(b_lut_){ float* pc[TM, TN] = c + rc1[newaxis, :]*ldc_k + rc0[:, newaxis]; bool checkc0[TM] = rxc < M; bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - int ridx = get_range_id(0); - int ridy = get_range_id(1); + int ridx = get_program_id(0); + int ridy = get_program_id(1); int *plock = locks + ridx + ridy*grid0; while(__atomic_cas(plock, 0, 1) == 1); int *pcount = plock + grid0*grid1; diff --git a/lib/dnn/dot.cpp b/lib/dnn/dot.cpp index 4ea355170..f3d35a2f0 100644 --- a/lib/dnn/dot.cpp +++ b/lib/dnn/dot.cpp @@ -116,8 +116,8 @@ void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, int M, int N, int K, )" + align_lda_str + R"( int lda, )" + align_ldb_str + R"(" int ldb, int ldc, int bound, int *locks, int grid0, int grid1) { - int ridx = get_range_id(0); - int ridy = get_range_id(1); + int ridx = get_program_id(0); + int ridy = get_program_id(1); int rxa[TM] = ridx * TM + (0 ... TM); int ryb[TN] = ridy * TN + (0 ... TN); int rka[TK] = 0 ... TK; diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp index 5b50a73b4..93ae57cd4 100644 --- a/lib/dnn/shift.cpp +++ b/lib/dnn/shift.cpp @@ -354,9 +354,9 @@ void shift(restrict read_only align(16) )" + a_ty_ + R"( *A, int BH, int BW, int CH, int CW, int* locks, int grid0, int grid1, int grid2) { - int ridx = get_range_id(0); - int ridy = get_range_id(1); - int rz = get_range_id(2); + int ridx = get_program_id(0); + int ridy = get_program_id(1); + int rz = get_program_id(2); int rxa[TM] = ridx*TM + (0 ... TM); int ryb[TN] = ridy*TN + (0 ... TN); int rka[TK] = 0 ... TK; diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index ef2d81abf..9fe444dd1 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -300,8 +300,8 @@ value *builder::create_downcast(value *arg, const std::string &name) { // built-in instructions //===----------------------------------------------------------------------===// -value *builder::create_get_range_id(unsigned axis, const std::string &name) { - return insert(get_range_id_inst::create(ctx_, axis, name)); +value *builder::create_get_program_id(unsigned axis, const std::string &name) { + return insert(get_program_id_inst::create(ctx_, axis, name)); } value *builder::create_get_num_program(unsigned axis, const std::string &name) { diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 074b55bb8..85b6eee5c 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -664,14 +664,14 @@ instruction* select_inst::create(value *pred, value *if_value, value *else_value //===----------------------------------------------------------------------===// -// get_range_id -get_range_id_inst::get_range_id_inst(type *ty, unsigned axis, const std::string &name, instruction *next) +// get_program_id +get_program_id_inst::get_program_id_inst(type *ty, unsigned axis, const std::string &name, instruction *next) : builtin_inst(ty, 0, 1, name, next), axis_(axis){ } -instruction* get_range_id_inst::create(context &ctx, unsigned axis, const std::string &name, instruction *next) { - return new get_range_id_inst(type::get_int32_ty(ctx), axis, name, next); +instruction* get_program_id_inst::create(context &ctx, unsigned axis, const std::string &name, instruction *next) { + return new get_program_id_inst(type::get_int32_ty(ctx), axis, name, next); } // get_num_program @@ -745,25 +745,25 @@ barrier_inst* barrier_inst::create(context &ctx, const std::string 
&name, instruction *next) { return new barrier_inst(ctx, name, next); } -// nv_dynamic_range_idx -nv_dynamic_range_idx_inst::nv_dynamic_range_idx_inst(type *ty, const std::string &name, instruction *next) +// nv_dynamic_program_idx +nv_dynamic_program_idx_inst::nv_dynamic_program_idx_inst(type *ty, const std::string &name, instruction *next) : instruction(ty, 0, 1, name, next) { } -nv_dynamic_range_idx_inst* nv_dynamic_range_idx_inst::create(type *ty, const std::string &name, instruction *next) { - return new nv_dynamic_range_idx_inst(ty, name, next); +nv_dynamic_program_idx_inst* nv_dynamic_program_idx_inst::create(type *ty, const std::string &name, instruction *next) { + return new nv_dynamic_program_idx_inst(ty, name, next); } -// nv_static_range_idx -nv_static_range_idx::nv_static_range_idx(constant_range *range) +// nv_static_program_idx +nv_static_program_idx::nv_static_program_idx(constant_range *range) : constant(range->get_type(), 0), range_(range) { } -constant_range* nv_static_range_idx::get_range() const +constant_range* nv_static_program_idx::get_range() const { return range_; } -nv_static_range_idx* nv_static_range_idx::get(constant_range* range) { - static std::map<constant_range*, nv_static_range_idx*> cache; +nv_static_program_idx* nv_static_program_idx::get(constant_range* range) { + static std::map<constant_range*, nv_static_program_idx*> cache; if(cache.find(range) == cache.end()) - cache.insert({range, new nv_static_range_idx(range)}); + cache.insert({range, new nv_static_program_idx(range)}); return cache.at(range); } diff --git a/lib/lang/expression.cpp b/lib/lang/expression.cpp index acbfaf6f6..8d5288e8b 100644 --- a/lib/lang/expression.cpp +++ b/lib/lang/expression.cpp @@ -115,9 +115,9 @@ ir::value* alloc_const_expression::codegen(ir::module *mod) const { return res; } -// get_range_id -ir::value* get_range_id_expression::codegen(ir::module *mod) const { - return mod->get_builder().create_get_range_id(axis_->value()); +// get_program_id +ir::value* get_program_id_expression::codegen(ir::module *mod) const { - return mod->get_builder().create_get_program_id(axis_->value()); wait + return mod->get_builder().create_get_program_id(axis_->value()); } // get_num_program
diff --git a/python/examples/dot.py b/python/examples/dot.py index 6c79e846c..75fe931bc 100644 --- a/python/examples/dot.py +++ b/python/examples/dot.py @@ -1,14 +1,6 @@ -import libtriton +import triton import tensorflow as tf -import distutils -import distutils.log -import setuptools.command.build_ext -import setuptools import numpy as np -import os -import tempfile -import shutil -import hashlib src = """ const tunable int TM = {128}; @@ -20,8 +12,8 @@ void matmul(restrict read_only align(16) half *A, restrict read_only align(16) half *C, int M, int N, int K, multiple_of(8) int lda, multiple_of(8) int ldb, int ldc) { - int ridx = get_range_id(0); - int ridy = get_range_id(1); + int ridx = get_program_id(0); + int ridy = get_program_id(1); int rxa[TM] = ridx * TM + (0 ... TM); int ryb[TN] = ridy * TN + (0 ... TN); int rka[TK] = 0 ... TK; @@ -40,7 +32,7 @@ void matmul(restrict read_only align(16) half *A, } int rxc[TM] = ridx * TM + (0 ... TM); int ryc[TN] = ridy * TN + (0 ... TN); - half* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; + half* pc[TM, TN] = C + ryc[newaxis, :] + rxc[:, newaxis]*ldc; half c[TM, TN] = xc; bool checkc0[TM] = rxc < M; bool checkc1[TN] = ryc < N; @@ -49,100 +41,10 @@ void matmul(restrict read_only align(16) half *A, } """ - -extra_ops = tf.load_op_library('/home/philippe/development/triton/python/build/lib.linux-x86_64-3.6/libextra_tf_ops.so') - - -def make_bindings(src, outputs, grids): - return libtriton.make_tensorflow_src(src, outputs, grids) - -def make_cache_path(src): - md5 = hashlib.sha1(src.encode()) - hexhash = md5.hexdigest() - home = os.path.expanduser('~') - cacheroot = os.path.join(home, '.triton', 'cache') - cachepath = os.path.join(cacheroot, str(hexhash)) - if not os.path.exists(cachepath): - os.makedirs(cachepath) - print(cachepath) - return cachepath - -def write_bindings(src, root): - cpp = os.path.join(root, 'tensorflow.cpp') - so = os.path.join(root, 'tensorflow.so') - recompile = False - # recompile if .so does not exist - if not os.path.exists(cpp) or not os.path.exists(so): - recompile = True - # recompile if cpp was modified after .so - elif max(cpp, so, key=os.path.getctime) == cpp: - recompile = True - # write cpp file - if recompile: - with open(cpp, 'w+') as handle: - handle.writelines(src) - # return path of cpp file - return cpp - -def build(src, path): - # include directories - triton_include_dirs = ['/home/philippe/development/triton/include'] - tensorflow_include_dirs = [tf.sysconfig.get_include()] - cuda_include_dirs = ['/usr/local/cuda-10.1/targets/x86_64-linux/include/'] - include_dirs = triton_include_dirs + tensorflow_include_dirs + cuda_include_dirs - # library directories - triton_library_dirs = [os.path.realpath(os.path.join(libtriton.__file__, os.path.pardir))] - tensorflow_library_dirs = [tf.sysconfig.get_lib()] - library_dirs = triton_library_dirs + tensorflow_library_dirs - # libraries - libraries = ['tensorflow_framework', 'triton'] - # extra arguments - extra_compile_args = [] - extra_link_args = [] - # create extension module - ext = setuptools.Extension( - name = 'test', - language = 'c++', - sources = [src], - include_dirs = include_dirs, - extra_compile_args = extra_compile_args, - extra_link_args = extra_link_args, - library_dirs = library_dirs, - libraries = libraries - ) - # build extension module - args = ['build_ext'] - tmp = tempfile.mkdtemp() - args.append('--build-temp=' + tmp) - args.append('--build-lib=' + path) - args.append('-q') - args = dict( - name = 'test', - ext_modules = [ext], - script_args = args, - ) - setuptools.setup(**args) - shutil.rmtree(tmp) - -def make_tensorflow_op(src, outputs, grids): - bindings = make_bindings(src, outputs, grids) - cache_path = make_cache_path(bindings) - cpp = write_bindings(bindings, cache_path) - build(cpp, cache_path) - result = tf.load_op_library(os.path.join(cache_path, 'test.cpython-36m-x86_64-linux-gnu.so')) - return result - - -library_dir = os.path.dirname(os.path.realpath(__file__)) -module = make_tensorflow_op(src, ['C'], ['(M + #TM - 1)/#TM', '(N + #TN - 1)/#TN']) -print(module.matmul) - - class dot: def __init__(self): - trans_a = True - trans_b = False + self.matmul = triton.make_tensorflow_op(src, ['C'], ['(M + #TM - 1)/#TM', '(N + #TN - 1)/#TN']) def __call__(self, a, b): shape_a = tf.shape(a) @@ -152,17 +54,17 @@ class dot: N = shape_b[0] lda = M ldb = K - ldc = M - c = extra_ops.alloc_empty(tf.stack([M, N])) - return module.matmul(a, b, c, M, N, K, lda, ldb, ldc) + ldc = N + c = triton.empty([M, N]) + return self.matmul.matmul(a, b, c, M, N, K, lda, ldb, ldc) -dot_nt = dot() +dot_tn = dot() def run_dot(): M, N, K = 128, 128, 128 a = tf.placeholder(tf.float16, shape=[M, K]) b = tf.placeholder(tf.float16, shape=[N, K]) # c = tf.matmul(a, b, transpose_a=True) - c = dot_nt(a, b) + c = dot_tn(a, b) # Reference ha = np.random.rand(M, K).astype(np.float16) hb = np.random.rand(N, K).astype(np.float16) @@ -172,7 +74,7 @@ def run_dot(): result = sess.run([c], feed_dict = {a: ha, b: hb})[0] # Test - hresult = np.dot(ha.T, hb).T + hresult = np.dot(ha.T, hb) dif = np.abs(result - hresult) np.savetxt('dif.dat', dif, '%2.4f') print(hresult)
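The example script shrinks to a thin wrapper because the build-and-cache machinery moves into the new triton.ops module added at the end of this patch: the generated binding source is hashed, each kernel gets a directory under ~/.triton/cache, and the extension is rebuilt only when missing or newer than its shared library. A rough, runnable sketch of that caching policy (the function names here are illustrative, not the module's public API):

# Sketch of the caching scheme used by triton.ops (hypothetical names).
import hashlib, os

def cache_dir(src, root=os.path.expanduser('~/.triton/cache')):
    # ops.py hashes with sha1 (despite its local variable being named md5)
    h = hashlib.sha1(src.encode()).hexdigest()
    path = os.path.join(root, h)
    os.makedirs(path, exist_ok=True)
    return path

def needs_rebuild(cpp, so):
    if not (os.path.exists(cpp) and os.path.exists(so)):
        return True
    # rebuild when the generated source was touched after the library,
    # mirroring max(cpp, so, key=os.path.getctime) == cpp in ops.py
    return os.path.getctime(cpp) >= os.path.getctime(so)

d = cache_dir('void kernel() {}')
print(d, needs_rebuild(os.path.join(d, 'tensorflow.cpp'),
                       os.path.join(d, 'tensorflow.so')))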
diff --git a/python/setup.py b/python/setup.py index 3d98218ac..aeba8b5a6 100644 --- a/python/setup.py +++ b/python/setup.py @@ -18,6 +18,7 @@ class CMakeExtension(Extension): class CMakeBuild(build_ext): + def run(self): try: out = subprocess.check_output(['cmake', '--version']) @@ -80,6 +81,7 @@ setup( author_email='ptillet@g.harvard.edu', description='A language and compiler for custom Deep Learning operations', long_description='', + packages=['triton'], ext_modules=[CMakeExtension('triton')], cmdclass=dict(build_ext=CMakeBuild), zip_safe=False, diff --git a/python/src/tensorflow.cpp b/python/src/tensorflow.cpp index 40810fc75..0e98f6636 100644 --- a/python/src/tensorflow.cpp +++ b/python/src/tensorflow.cpp @@ -177,8 +177,8 @@ result += R"( std::regex regex("#([a-zA-Z]([a-zA-Z]|[0-9])*)"); -std::vector<std::string> grids; -for(size_t i = macros.size(); i < 3; i++) +std::vector<std::string> grids = macros; +for(size_t i = grids.size(); i < 3; i++) grids.push_back("1"); std::string grid = "rt::grid_t{"; for(size_t i = 0; i < grids.size(); i++){ diff --git a/python/triton/__init__.py b/python/triton/__init__.py new file mode 100644 index 000000000..18dff0a49 --- /dev/null +++ b/python/triton/__init__.py @@ -0,0 +1 @@ +from .ops import * \ No newline at end of file diff --git a/python/triton/ops.py b/python/triton/ops.py new file mode 100644 index 000000000..ea782ad08 --- /dev/null +++ b/python/triton/ops.py @@ -0,0 +1,103 @@ +# import for cache +import os +import tempfile +import shutil +import hashlib +import sysconfig +import sys +# import for just-in-time compilation +import distutils +import setuptools.command.build_ext +import setuptools +# triton +import libtriton +# frameworks +import tensorflow as tf + +extra_ops = tf.load_op_library('/home/philippe/development/triton/python/build/lib.linux-x86_64-3.6/libextra_tf_ops.so') + + +def make_bindings(src, outputs, grids): + return libtriton.make_tensorflow_src(src, outputs, grids) + +def make_cache_path(src): + md5 = hashlib.sha1(src.encode()) + hexhash = md5.hexdigest() + home = os.path.expanduser('~') + cacheroot = os.path.join(home, '.triton', 'cache') + cachepath = os.path.join(cacheroot, str(hexhash)) + if not os.path.exists(cachepath): + os.makedirs(cachepath) + return cachepath + +def write_bindings(src, root): + cpp = os.path.join(root, 'tensorflow.cpp') + suffix = sysconfig.get_config_var('EXT_SUFFIX') + so = os.path.join(root, 'tensorflow{suffix}'.format(suffix=suffix)) + recompile = False + # recompile if .so does not exist + if not os.path.exists(cpp) or not os.path.exists(so): + recompile = True + # recompile if cpp was modified after .so + elif max(cpp, so, key=os.path.getctime) == cpp: + recompile = True + # write cpp file + if recompile: + with open(cpp, 'w+') as handle: + handle.writelines(src) + # return path of cpp file + return (cpp, so) + +def build(src, path): + # include directories + triton_include_dirs = 
['/home/philippe/development/triton/include'] + tensorflow_include_dirs = [tf.sysconfig.get_include()] + cuda_include_dirs = ['/usr/local/cuda-10.1/targets/x86_64-linux/include/'] + include_dirs = triton_include_dirs + tensorflow_include_dirs + cuda_include_dirs + # library directories + triton_library_dirs = [os.path.realpath(os.path.join(libtriton.__file__, os.path.pardir))] + tensorflow_library_dirs = [tf.sysconfig.get_lib()] + library_dirs = triton_library_dirs + tensorflow_library_dirs + # libraries + libraries = ['tensorflow_framework', 'triton'] + # extra arguments + extra_compile_args = [] + extra_link_args = [] + # dependences + depends = [os.path.realpath(libtriton.__file__)] + # create extension module + ext = setuptools.Extension( + name = 'tensorflow', + language = 'c++', + sources = [src], + include_dirs = include_dirs, + extra_compile_args = extra_compile_args, + extra_link_args = extra_link_args, + library_dirs = library_dirs, + libraries = libraries, + depends = depends + ) + # build extension module + args = ['build_ext'] + tmp = tempfile.mkdtemp() + args.append('--build-temp=' + tmp) + args.append('--build-lib=' + path) + args.append('-q') + args = dict( + name = 'tensorflow', + ext_modules = [ext], + script_args = args, + ) + setuptools.setup(**args) + shutil.rmtree(tmp) + +def make_tensorflow_op(src, outputs, grids): + bindings = make_bindings(src, outputs, grids) + cache_path = make_cache_path(bindings) + cpp, so = write_bindings(bindings, cache_path) + build(cpp, cache_path) + result = tf.load_op_library(so) + return result + +def empty(shapes): + return extra_ops.alloc_empty(tf.stack(shapes)) From b58b0d8b27db7dea186070523d9d4798d4b2d6ce Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 18 Aug 2019 00:34:30 -0700 Subject: [PATCH 301/494] [general] removed unnecessary includes --- include/triton/codegen/analysis/alignment.h | 1 - include/triton/codegen/selection/target.h | 4 ---- include/triton/codegen/transform/dce.h | 3 --- include/triton/codegen/transform/peephole.h | 3 --- include/triton/codegen/transform/reassociate.h | 1 - include/triton/codegen/transform/shmem/barriers.h | 4 ---- include/triton/driver/kernel.h | 1 - include/triton/ir/builder.h | 1 - include/triton/ir/context_impl.h | 1 - include/triton/ir/function.h | 1 - include/triton/ir/type.h | 2 -- include/triton/ir/value.h | 1 - include/triton/lang/declaration.h | 1 - include/triton/lang/expression.h | 2 -- include/triton/runtime/launch_info.h | 1 - lib/codegen/transform/shmem/barriers.cpp | 3 +++ lib/driver/backend.cpp | 5 ++--- lib/driver/buffer.cpp | 1 - lib/driver/context.cpp | 3 --- lib/driver/dispatch.cpp | 1 - lib/driver/handle.cpp | 2 -- lib/driver/kernel.cpp | 4 +--- lib/driver/module.cpp | 2 +- lib/driver/platform.cpp | 3 +-- lib/driver/stream.cpp | 2 -- lib/ir/cfg.cpp | 3 +-- lib/ir/print.cpp | 1 + lib/ir/value.cpp | 3 +-- 28 files changed, 11 insertions(+), 49 deletions(-) diff --git a/include/triton/codegen/analysis/alignment.h b/include/triton/codegen/analysis/alignment.h index 1d0c4b191..6ef3c0f55 100644 --- a/include/triton/codegen/analysis/alignment.h +++ b/include/triton/codegen/analysis/alignment.h @@ -1,7 +1,6 @@ #ifndef TDL_INCLUDE_CODEGEN_ALIGNMENT_INFO_PASS_H #define TDL_INCLUDE_CODEGEN_ALIGNMENT_INFO_PASS_H -#include #include namespace triton { diff --git a/include/triton/codegen/selection/target.h b/include/triton/codegen/selection/target.h index 5a0a84694..f5f8e9a7c 100644 --- a/include/triton/codegen/selection/target.h +++ b/include/triton/codegen/selection/target.h @@ 
-1,10 +1,6 @@ #ifndef TDL_INCLUDE_IR_CODEGEN_TARGET_H #define TDL_INCLUDE_IR_CODEGEN_TARGET_H -#include -#include -#include - namespace llvm{ class Type; class Value; diff --git a/include/triton/codegen/transform/dce.h b/include/triton/codegen/transform/dce.h index dea50996d..8bed0afef 100644 --- a/include/triton/codegen/transform/dce.h +++ b/include/triton/codegen/transform/dce.h @@ -1,9 +1,6 @@ #ifndef TDL_INCLUDE_CODEGEN_OPTIMIZE_CSE_H #define TDL_INCLUDE_CODEGEN_OPTIMIZE_CSE_H -#include -#include -#include namespace triton { diff --git a/include/triton/codegen/transform/peephole.h b/include/triton/codegen/transform/peephole.h index acd11ecd6..691f8d0bd 100644 --- a/include/triton/codegen/transform/peephole.h +++ b/include/triton/codegen/transform/peephole.h @@ -1,9 +1,6 @@ #ifndef TDL_INCLUDE_CODEGEN_OPTIMIZE_TRANS_H #define TDL_INCLUDE_CODEGEN_OPTIMIZE_TRANS_H -#include -#include -#include namespace triton { diff --git a/include/triton/codegen/transform/reassociate.h b/include/triton/codegen/transform/reassociate.h index 5f639d23c..ce7ab476a 100644 --- a/include/triton/codegen/transform/reassociate.h +++ b/include/triton/codegen/transform/reassociate.h @@ -4,7 +4,6 @@ #include #include #include -#include namespace triton { diff --git a/include/triton/codegen/transform/shmem/barriers.h b/include/triton/codegen/transform/shmem/barriers.h index d03360690..6352fd060 100644 --- a/include/triton/codegen/transform/shmem/barriers.h +++ b/include/triton/codegen/transform/shmem/barriers.h @@ -1,10 +1,6 @@ #ifndef TDL_INCLUDE_CODEGEN_BARRIERS_H #define TDL_INCLUDE_CODEGEN_BARRIERS_H -#include -#include -#include - namespace triton { namespace ir { diff --git a/include/triton/driver/kernel.h b/include/triton/driver/kernel.h index 5d68ffd62..fafbcb0bd 100755 --- a/include/triton/driver/kernel.h +++ b/include/triton/driver/kernel.h @@ -25,7 +25,6 @@ #include "triton/driver/module.h" #include "triton/driver/handle.h" - #include namespace llvm diff --git a/include/triton/ir/builder.h b/include/triton/ir/builder.h index 4f5f4f45b..bbd015c7e 100644 --- a/include/triton/ir/builder.h +++ b/include/triton/ir/builder.h @@ -1,7 +1,6 @@ #ifndef TDL_INCLUDE_IR_BUILDER_H #define TDL_INCLUDE_IR_BUILDER_H -#include #include #include #include "instructions.h" diff --git a/include/triton/ir/context_impl.h b/include/triton/ir/context_impl.h index 290d20cc7..cd41d20db 100644 --- a/include/triton/ir/context_impl.h +++ b/include/triton/ir/context_impl.h @@ -1,7 +1,6 @@ #ifndef TDL_INCLUDE_IR_CONTEXT_IMPL_H #define TDL_INCLUDE_IR_CONTEXT_IMPL_H -#include #include #include "triton/ir/type.h" diff --git a/include/triton/ir/function.h b/include/triton/ir/function.h index c5f5f0605..bde5218b2 100644 --- a/include/triton/ir/function.h +++ b/include/triton/ir/function.h @@ -5,7 +5,6 @@ #include #include "value.h" #include "constant.h" -#include namespace triton{ namespace ir{ diff --git a/include/triton/ir/type.h b/include/triton/ir/type.h index 13ead1959..6f1df7ec7 100644 --- a/include/triton/ir/type.h +++ b/include/triton/ir/type.h @@ -2,8 +2,6 @@ #define TDL_INCLUDE_IR_TYPE_H #include -#include -#include namespace triton{ namespace ir{ diff --git a/include/triton/ir/value.h b/include/triton/ir/value.h index 08b26d715..284a0a3b3 100644 --- a/include/triton/ir/value.h +++ b/include/triton/ir/value.h @@ -3,7 +3,6 @@ #include #include -#include #include namespace triton{ diff --git a/include/triton/lang/declaration.h b/include/triton/lang/declaration.h index 7441e8449..e406f00d8 100644 --- a/include/triton/lang/declaration.h 
+++ b/include/triton/lang/declaration.h @@ -3,7 +3,6 @@ #include "node.h" #include -#include namespace triton{ diff --git a/include/triton/lang/expression.h b/include/triton/lang/expression.h index 9d65de5c0..7724fdd61 100644 --- a/include/triton/lang/expression.h +++ b/include/triton/lang/expression.h @@ -3,9 +3,7 @@ #include "lang.h" #include -#include #include -#include namespace triton{ diff --git a/include/triton/runtime/launch_info.h b/include/triton/runtime/launch_info.h index 06e79d4e4..995ed09f4 100644 --- a/include/triton/runtime/launch_info.h +++ b/include/triton/runtime/launch_info.h @@ -1,7 +1,6 @@ #ifndef TRITON_INCLUDE_RUNTIME_LAUNCH_INFO_H #define TRITON_INCLUDE_RUNTIME_LAUNCH_INFO_H -#include #include namespace triton{ diff --git a/lib/codegen/transform/shmem/barriers.cpp b/lib/codegen/transform/shmem/barriers.cpp index be0875b96..6b66ab148 100644 --- a/lib/codegen/transform/shmem/barriers.cpp +++ b/lib/codegen/transform/shmem/barriers.cpp @@ -1,4 +1,7 @@ +#include +#include #include + #include "triton/codegen/transform/shmem/barriers.h" #include "triton/codegen/analysis/shmem/allocation.h" #include "triton/codegen/analysis/shmem/info.h" diff --git a/lib/driver/backend.cpp b/lib/driver/backend.cpp index aa90fcdc4..3be4daa20 100755 --- a/lib/driver/backend.cpp +++ b/lib/driver/backend.cpp @@ -20,6 +20,8 @@ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include +#include #include "triton/driver/dispatch.h" #include "triton/driver/backend.h" #include "triton/driver/buffer.h" @@ -27,9 +29,6 @@ #include "triton/driver/stream.h" #include "triton/driver/kernel.h" -#include -#include -#include namespace triton { diff --git a/lib/driver/buffer.cpp b/lib/driver/buffer.cpp index 53f9d4e07..1f499e5f3 100755 --- a/lib/driver/buffer.cpp +++ b/lib/driver/buffer.cpp @@ -20,7 +20,6 @@ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include #include "triton/driver/stream.h" #include "triton/driver/buffer.h" #include "triton/driver/context.h" diff --git a/lib/driver/context.cpp b/lib/driver/context.cpp index f9d7d0662..473cfaac7 100755 --- a/lib/driver/context.cpp +++ b/lib/driver/context.cpp @@ -20,12 +20,9 @@ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include #include - #include "triton/driver/context.h" #include "triton/driver/module.h" - #include "triton/tools/sys/getenv.hpp" #include "triton/tools/sys/mkdir.hpp" diff --git a/lib/driver/dispatch.cpp b/lib/driver/dispatch.cpp index ee5a36b85..9b5fc5242 100755 --- a/lib/driver/dispatch.cpp +++ b/lib/driver/dispatch.cpp @@ -20,7 +20,6 @@ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include #include "triton/driver/dispatch.h" #include "triton/driver/context.h" diff --git a/lib/driver/handle.cpp b/lib/driver/handle.cpp index c698cb8b5..20ae0f90d 100755 --- a/lib/driver/handle.cpp +++ b/lib/driver/handle.cpp @@ -20,8 +20,6 @@ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include -#include #include "triton/driver/handle.h" namespace triton diff --git a/lib/driver/kernel.cpp b/lib/driver/kernel.cpp index e4b5ac76c..e8bed34bc 100755 --- a/lib/driver/kernel.cpp +++ b/lib/driver/kernel.cpp @@ -20,9 +20,7 @@ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#include -#include -#include "llvm/ExecutionEngine/GenericValue.h" +#include #include "triton/driver/kernel.h" #include "triton/driver/buffer.h" diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index d2c31fadd..8e23959c0 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -23,11 +23,11 @@ #include #include #include -#include "llvm/IR/IRBuilder.h" #include "triton/driver/module.h" #include "triton/driver/context.h" #include "triton/driver/error.h" #include "triton/tools/sys/getenv.hpp" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/Bitcode/BitcodeWriter.h" #include "llvm/IR/Verifier.h" diff --git a/lib/driver/platform.cpp b/lib/driver/platform.cpp index 93484a4ee..90cb1913b 100755 --- a/lib/driver/platform.cpp +++ b/lib/driver/platform.cpp @@ -20,11 +20,10 @@ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ - +#include #include "triton/driver/platform.h" #include "triton/driver/device.h" -#include namespace triton { diff --git a/lib/driver/stream.cpp b/lib/driver/stream.cpp index 92fed604d..2ff5746fc 100755 --- a/lib/driver/stream.cpp +++ b/lib/driver/stream.cpp @@ -20,10 +20,8 @@ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include #include #include - #include "triton/driver/backend.h" #include "triton/driver/stream.h" #include "triton/driver/context.h" diff --git a/lib/ir/cfg.cpp b/lib/ir/cfg.cpp index e1c0b6776..5b19849d4 100644 --- a/lib/ir/cfg.cpp +++ b/lib/ir/cfg.cpp @@ -1,8 +1,7 @@ +#include #include "triton/ir/cfg.h" #include "triton/ir/basic_block.h" #include "triton/ir/function.h" -#include -#include namespace triton{ namespace ir{ diff --git a/lib/ir/print.cpp b/lib/ir/print.cpp index 4b7248bc6..cf5e706e4 100644 --- a/lib/ir/print.cpp +++ b/lib/ir/print.cpp @@ -1,3 +1,4 @@ +#include #include "triton/ir/basic_block.h" #include "triton/ir/module.h" #include "triton/ir/type.h" diff --git a/lib/ir/value.cpp b/lib/ir/value.cpp index 0797d0441..3ab64b97a 100644 --- a/lib/ir/value.cpp +++ b/lib/ir/value.cpp @@ -1,7 +1,6 @@ +#include #include "triton/ir/value.h" #include "triton/ir/instructions.h" -#include -#include namespace triton{ namespace ir{ From c05445d0017b4e5727aa9dca262fc172adb583df Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 18 Aug 2019 00:36:32 -0700 Subject: [PATCH 302/494] [general] removed dnn/ module and runtime/jit.cpp --- include/triton/dnn/base.h | 116 ----- include/triton/dnn/batchnorm.h | 110 ---- include/triton/dnn/blocksparse/dot.h | 61 --- include/triton/dnn/conv.h | 155 ------ include/triton/dnn/dot.h | 79 --- include/triton/dnn/heuristics.h | 186 ------- include/triton/dnn/shift.h | 192 ------- include/triton/runtime/jit.h | 136 ----- lib/dnn/base.cpp | 94 ---- lib/dnn/batchnorm.cpp | 227 --------- lib/dnn/blocksparse/dot.cpp | 238 --------- lib/dnn/conv.cpp | 720 --------------------------- lib/dnn/dot.cpp | 162 ------ lib/dnn/shift.cpp | 538 -------------------- lib/runtime/jit.cpp | 284 ----------- 15 files changed, 3298 deletions(-) delete mode 100644 include/triton/dnn/base.h delete mode 100644 include/triton/dnn/batchnorm.h delete mode 100644 include/triton/dnn/blocksparse/dot.h delete mode 100644 include/triton/dnn/conv.h delete mode 100644 include/triton/dnn/dot.h delete mode 100644 include/triton/dnn/heuristics.h delete mode 100644 include/triton/dnn/shift.h delete mode 100644 include/triton/runtime/jit.h delete mode 100644 lib/dnn/base.cpp delete mode 100644 lib/dnn/batchnorm.cpp delete mode 100644 lib/dnn/blocksparse/dot.cpp delete mode 100644 
lib/dnn/conv.cpp delete mode 100644 lib/dnn/dot.cpp delete mode 100644 lib/dnn/shift.cpp delete mode 100644 lib/runtime/jit.cpp diff --git a/include/triton/dnn/base.h b/include/triton/dnn/base.h deleted file mode 100644 index b991c3726..000000000 --- a/include/triton/dnn/base.h +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright 2015-2017 Philippe Tillet -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files -* (the "Software"), to deal in the Software without restriction, -* including without limitation the rights to use, copy, modify, merge, -* publish, distribute, sublicense, and/or sell copies of the Software, -* and to permit persons to whom the Software is furnished to do so, -* subject to the following conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -#ifndef TDL_INCLUDE_DNN_BASE_H -#define TDL_INCLUDE_DNN_BASE_H - -#include "triton/driver/stream.h" -#include "triton/driver/kernel.h" -#include "triton/runtime/launch_info.h" - -namespace triton{ - -namespace runtime{ - class jit; -} - -namespace dnn{ - - -enum autotuning_t{ - FULL_TUNING, - PARTIAL_TUNING, - NO_TUNING -}; - -class base; -struct launch_context_t{ - base *op; - driver::kernel* kernel; - triton::runtime::launch_information info; -}; - -typedef std::vector params_t; - -class base { - friend class recompile_hash; - friend class recompile_equal; - -protected: - // leading dimensions - static void set_ld(const std::vector& shapes, - std::vector& ld); - // list of retuning parameters - virtual std::vector retune_params() const = 0; - -private: - // initialize - virtual void init_impl(driver::stream *, driver::cu_module *, triton::runtime::launch_information) = 0; - // deinitialize - virtual void deinit_impl() = 0; - // enqueue - virtual void enqueue_impl(driver::stream *stream, driver::kernel *kernel, - std::vector args, - triton::runtime::launch_information info) = 0; - // number of flops - virtual size_t num_flops() const = 0; - // default parameters - virtual std::vector search_space() const; - virtual params_t heuristics() const; - // obtain execution jit - std::pair get_profile_impl(driver::stream *stream, std::vector args, autotuning_t autotune); - -public: - // constructor - base(const std::string& name); - // triton-c source - virtual void triton_c_src(std::ostream &os) const = 0; - // clone - virtual base* clone() const = 0; - // enqueue - base* enqueue(driver::stream* stream, std::vector args, autotuning_t autotune = PARTIAL_TUNING); - // get profile - launch_context_t get_launch_context(driver::stream *stream, std::vector args, autotuning_t autotune = PARTIAL_TUNING); - -private: - std::string name_; -}; - - -struct recompile_equal{ - bool operator()(base* x, base* y) const{ - return typeid(*x) == typeid(*y) && - x->retune_params() == y->retune_params(); - } -}; - -struct recompile_hash{ - unsigned operator()(base* x) const{ - return x->retune_params()[0]; - } 
-}; - - -} -} - -#endif diff --git a/include/triton/dnn/batchnorm.h b/include/triton/dnn/batchnorm.h deleted file mode 100644 index 204ab631b..000000000 --- a/include/triton/dnn/batchnorm.h +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright 2015-2019 Philippe Tillet -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files -* (the "Software"), to deal in the Software without restriction, -* including without limitation the rights to use, copy, modify, merge, -* publish, distribute, sublicense, and/or sell copies of the Software, -* and to permit persons to whom the Software is furnished to do so, -* subject to the following conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -#ifndef TDL_INCLUDE_DNN_BATCHNORM_H -#define TDL_INCLUDE_DNN_BATCHNORM_H - -#include -#include -#include -#include -#include -#include "triton/dnn/base.h" -#include "triton/driver/stream.h" -#include "triton/driver/kernel.h" - -namespace triton{ -namespace dnn{ - -class batchnorm_forward: public base { -private: - // init - void init_impl(driver::stream *, driver::cu_module *, triton::runtime::launch_information) { } - void deinit_impl() { } - - // enqueue - void enqueue_impl(driver::stream *stream, driver::kernel *kernel, - std::vector args, - triton::runtime::launch_information info); - // number of flops - size_t num_flops() const; - // retuning parameters - std::vector retune_params() const; - // clone - base* clone() const; - -public: - // constructor - batchnorm_forward(int C, int D, int H, int W, int B, - std::string ty = "float", float eps = 1e-5); - // triton-c source - void triton_c_src(std::ostream &os) const; - -private: - int32_t C_; - int32_t D_; - int32_t H_; - int32_t W_; - int32_t B_; - std::string ty_; - float eps_; - int32_t DHWB_; - float rcpDHWB_; -}; - -class batchnorm_backward: public base{ -private: - // init - void init_impl(driver::stream *, driver::cu_module *, triton::runtime::launch_information) { } - void deinit_impl() { } - // enqueue - void enqueue_impl(driver::stream *stream, driver::kernel *kernel, - std::vector args, - runtime::launch_information info); - // number of flops - size_t num_flops() const; - // retuning parameters - std::vector retune_params() const; - // clone - base* clone() const; - -public: - // constructor - batchnorm_backward(int C, int D, int H, int W, int B, - std::string ty = "float", float eps = 1e-5); - // triton-c source - void triton_c_src(std::ostream &os) const; - -private: - int32_t C_; - int32_t D_; - int32_t H_; - int32_t W_; - int32_t B_; - std::string ty_; - float eps_; -}; - -} -} - -#endif diff --git a/include/triton/dnn/blocksparse/dot.h b/include/triton/dnn/blocksparse/dot.h deleted file mode 100644 index f42d5b9d8..000000000 --- a/include/triton/dnn/blocksparse/dot.h +++ /dev/null @@ -1,61 +0,0 @@ -#include "triton/driver/stream.h" -#include "triton/driver/kernel.h" -#include "triton/dnn/base.h" 
-#include - -namespace triton{ -namespace dnn{ -namespace blocksparse{ - -enum op_t{ - FPROP, - BPROP, - WGRAD -}; - -class dot: public base { -private: - void enqueue_impl(driver::stream *stream, driver::kernel *kernel, - std::vector args, - triton::runtime::launch_information info); - // number of flops - size_t num_flops() const; - // retuning parameters - std::vector retune_params() const; - // default parameters - std::vector search_space() const; - params_t heuristics() const; - // init - void init_impl(driver::stream *stream, driver::cu_module *module, triton::runtime::launch_information info); - // deinit - void deinit_impl(); - // source - std::string triton_c_src_ydx() const; - std::string triton_c_src_dw() const; -public: - // constructor - dot(int32_t N, int32_t K, int32_t S, int32_t C, const std::string &ty, int32_t BS, int32_t nlocks, int32_t nblocks, op_t op = FPROP); - // triton-c source - void triton_c_src(std::ostream &os) const; - // locks - driver::buffer* get_locks() const; - // clone - base* clone() const; - -private: - std::string ab_ty_; - std::string c_ty_; - int32_t N_; - int32_t S_; - int32_t C_; - int32_t K_; - int32_t BS_; - int32_t nlocks_; - int32_t nblocks_; - std::shared_ptr locks_; - op_t op_; -}; - -} -} -} diff --git a/include/triton/dnn/conv.h b/include/triton/dnn/conv.h deleted file mode 100644 index 5a167531d..000000000 --- a/include/triton/dnn/conv.h +++ /dev/null @@ -1,155 +0,0 @@ -#include -#include -#include -#include -#include "triton/driver/stream.h" -#include "triton/driver/kernel.h" -#include "triton/dnn/base.h" - -namespace triton{ -namespace dnn{ - -class conv: public base{ -public: - enum type { - FPROP, - BPROP, - WGRAD - }; - -private: - // initialize - std::tuple - unpack(int32_t ltrs, bool flip, int32_t EBD, int32_t EBH, int32_t EBW); - void build_b_deltas(); - void build_a_deltas(); - void build_masks(); - void init_impl(driver::stream *stream, driver::cu_module *module, triton::runtime::launch_information info); - void deinit_impl() { } - - // enqueue - std::array get_grid(size_t TM, size_t TN); - void set_arg(driver::kernel *kernel, - driver::buffer *a, driver::buffer *b, driver::buffer *c, - driver::buffer *bias); - void enqueue_impl(driver::stream *stream, driver::kernel *kernel, - std::vector args, - triton::runtime::launch_information info); - // number of flops - size_t num_flops() const; - // retuning parameters - std::vector retune_params() const; - // clone - base* clone() const; - -public: - - conv(int B, int NC, - int D, int H, int W, - int T, int R, int S, int NF, - int stride_d, int stride_h, int stride_w, - int pad_d, int pad_h, int pad_w, - int upsample_d, int upsample_h, int upsample_w, - std::string a_ty = "float", std::string b_ty = "float", - type ty = FPROP, bool bias = false); - - // accessors - size_t a_size(); - size_t b_size(); - size_t c_size(); - std::vector c_shapes(); - // default params - std::vector default_params(); - - // triton-c source code - void triton_c_src(std::ostream &os) const; - - // cpu reference implementations - template - void cpu_xprop(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B); - template - void cpu_wgrad(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B); - template - void cpu_ref(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B); - -private: - // image size - int32_t NB_; - int32_t NC_; - int32_t AD_; - int32_t AH_; - int32_t AW_; - // filter size - int32_t BD_; - int32_t BH_; - int32_t BW_; - int32_t NF_; - // activation size - int32_t CD_; - int32_t CH_; - int32_t CW_; - // striding - int32_t stride_d_; - 
int32_t stride_h_; - int32_t stride_w_; - // padding - int32_t pad_d_; - int32_t pad_h_; - int32_t pad_w_; - // upsampling - int32_t upsample_d_; - int32_t upsample_h_; - int32_t upsample_w_; - // equivalent matmul - int32_t M_; - int32_t N_; - int32_t K_; - // helpers - int32_t Fs_; - int32_t TK_; - int32_t Luts_; - // memory strides for A - std::vector shapes_a_; - std::vector ld_a_; - // memory strides for B - std::vector shapes_b_; - std::vector ld_b_; - // memory stride for C - std::vector shapes_c_; - std::vector ld_c_; - // constant memory - std::vector h_a_deltas_; - std::vector h_b_deltas_; - std::vector h_masks_; - driver::buffer* d_a_deltas_; - driver::buffer* d_b_deltas_; - driver::buffer* d_masks_; - driver::buffer* d_locks_; - bool is_a_deltas_cst; - bool is_b_deltas_cst_; - bool is_mask_cst_; - // data type - std::string a_ty_; - std::string b_ty_; - // conv type - type ty_; - bool bias_; - bool b_trans_; - bool b_lut_; - // axis index - int32_t a_inner_idx_; - int32_t a_outer_idx_; - int32_t a_pix_idx_; - int32_t b_inner_idx_; - int32_t b_outer_idx_; - int32_t b_pix_idx_; - int32_t c_outer_0_idx_; - int32_t c_outer_1_idx_; - int32_t c_pix_idx; - // maximum grid size for loc - int32_t max_grid_0_; - int32_t max_grid_1_; -}; - -} -} diff --git a/include/triton/dnn/dot.h b/include/triton/dnn/dot.h deleted file mode 100644 index f36d05db5..000000000 --- a/include/triton/dnn/dot.h +++ /dev/null @@ -1,79 +0,0 @@ -#include "triton/driver/stream.h" -#include "triton/driver/kernel.h" -#include "triton/dnn/base.h" -#include - -namespace triton{ -namespace dnn{ - -class dot: public base { -private: - // initialize - void init_impl(driver::stream *, driver::cu_module *, triton::runtime::launch_information); - void deinit_impl() { } - - // enqueue - void enqueue_impl(driver::stream *stream, driver::kernel *kernel, - std::vector args, - triton::runtime::launch_information info); - // retuning parameters - std::vector retune_params() const; - // default parameters - virtual std::vector search_space() const; - virtual params_t heuristics() const; - -public: - dot(int M, int N, int K, bool AT, bool BT, - std::string a_ty, std::string b_ty, std::string c_ty, - unsigned align_lda, unsigned align_ldb, unsigned align_ldc); - - // number of flops - size_t num_flops() const; - - // triton-c source - void triton_c_src(std::ostream &os) const; - - // clone - base* clone() const; - - // CPU reference implementation - template - static void cpu_ref(std::vector &c, const std::vector &a, const std::vector &b, - size_t M, size_t N, size_t K){ - for(size_t m = 0; m < M; m++) - for(size_t n = 0; n < N; n++){ - float acc = 0; - for(size_t k = 0; k < K; k++) - acc = acc + (AT ? a[k + m*K] : a[m + k*M]) * (BT ? 
b[n + k*N] : b[k + n*K]); - c[m + n*M] = static_cast(acc); - } - } - template - void cpu_ref(std::vector &c, const std::vector &a, const std::vector &b) { - if(AT_ && BT_) - dot::cpu_ref(c, a, b, M_, N_, K_); - else if(AT_ && !BT_) - dot::cpu_ref(c, a, b, M_, N_, K_); - else if(!AT_ && BT_) - dot::cpu_ref(c, a, b, M_, N_, K_); - else - dot::cpu_ref(c, a, b, M_, N_, K_); - } - -private: - int32_t M_; - int32_t N_; - int32_t K_; - bool AT_; - bool BT_; - std::string a_ty_; - std::string b_ty_; - std::string c_ty_; - unsigned align_lda_; - unsigned align_ldb_; - unsigned align_ldc_; - driver::buffer *locks_; -}; - -} -} diff --git a/include/triton/dnn/heuristics.h b/include/triton/dnn/heuristics.h deleted file mode 100644 index 56c23642b..000000000 --- a/include/triton/dnn/heuristics.h +++ /dev/null @@ -1,186 +0,0 @@ -#ifndef TRITON_DNN_HEURISTICS_H -#define TRITON_DNN_HEURISTICS_H - -#include -#include "triton/dnn/base.h" - -namespace triton{ -namespace dnn{ - -/* Dense matrix multiplication */ - -typedef std::vector params_t; -typedef std::tuple trans_key_t; -typedef std::tuple size_key_t; -static const std::map> dot_params = { - /* NN */ - {trans_key_t(false, false), std::map{ - {size_key_t(16, 16), {2, 8, 16, 4, 16, 2, 2, 1, 1, 16, 32, 8, 4, 1}}, - {size_key_t(16, 32), {4, 4, 16, 4, 32, 2, 2, 1, 1, 8, 32, 8, 4, 1}}, - {size_key_t(16, 64), {4, 4, 16, 4, 64, 2, 2, 1, 1, 8, 32, 8, 4, 1}}, - {size_key_t(16, 128), {2, 8, 16, 8, 128, 2, 2, 1, 1, 16, 32, 4, 8, 1}}, - {size_key_t(32, 16), {8, 4, 32, 8, 16, 2, 2, 1, 1, 4, 32, 4, 8, 1}}, - {size_key_t(32, 32), {4, 8, 32, 4, 32, 2, 2, 1, 1, 8, 32, 8, 4, 1}}, - {size_key_t(32, 64), {8, 4, 32, 4, 64, 2, 2, 1, 1, 4, 32, 8, 4, 1}}, - {size_key_t(32, 128), {8, 4, 32, 32, 128, 2, 2, 2, 2, 16, 32, 4, 4, 1}}, - {size_key_t(32, 256), {4, 8, 32, 32, 256, 2, 2, 1, 4, 32, 32, 4, 8, 1}}, - {size_key_t(64, 16), {8, 8, 64, 8, 16, 2, 2, 1, 1, 4, 32, 4, 8, 1}}, - {size_key_t(64, 32), {8, 8, 64, 8, 32, 2, 2, 1, 1, 4, 32, 4, 8, 1}}, - {size_key_t(64, 64), {8, 8, 64, 16, 64, 2, 2, 1, 2, 8, 32, 4, 8, 1}}, - {size_key_t(64, 128), {16, 4, 64, 32, 128, 2, 2, 1, 4, 8, 32, 4, 8, 1}}, - {size_key_t(128, 16), {8, 8, 128, 16, 16, 2, 2, 2, 1, 8, 32, 4, 8, 1}}, - {size_key_t(128, 32), {32, 4, 128, 16, 32, 2, 2, 2, 1, 2, 32, 4, 8, 1}}, - {size_key_t(128, 64), {16, 8, 128, 16, 64, 2, 2, 2, 2, 8, 32, 8, 4, 1}}, - {size_key_t(128, 128), {8, 8, 128, 32, 128, 2, 2, 2, 2, 16, 32, 4, 8, 1}}, - {size_key_t(256, 16), {32, 8, 256, 16, 16, 2, 2, 4, 1, 4, 32, 8, 4, 1}}, - {size_key_t(256, 32), {32, 8, 256, 16, 32, 2, 2, 4, 1, 4, 32, 8, 4, 1}}, - {size_key_t(256, 64), {16, 8, 256, 32, 64, 2, 2, 4, 1, 8, 32, 4, 8, 1}} - }}, - /* NT */ - {trans_key_t(false, true), std::map{ - {size_key_t(16, 16), {2, 4, 16, 2, 8, 16, 2, 2, 1, 1, 16, 32, 16, 1}}, - {size_key_t(16, 32), {4, 4, 16, 8, 4, 32, 2, 2, 1, 1, 8, 32, 4, 1}}, - {size_key_t(16, 64), {2, 4, 16, 2, 8, 64, 2, 2, 1, 1, 16, 32, 16, 1}}, - {size_key_t(16, 128), {2, 8, 16, 8, 8, 128, 2, 2, 1, 1, 16, 32, 4, 1}}, - {size_key_t(32, 16), {8, 4, 32, 2, 8, 16, 2, 2, 1, 1, 4, 32, 16, 1}}, - {size_key_t(32, 32), {4, 8, 32, 8, 4, 32, 2, 2, 1, 1, 8, 32, 4, 1}}, - {size_key_t(32, 64), {16, 4, 64, 16, 4, 64, 2, 2, 4, 1, 8, 32, 8, 1}}, - {size_key_t(32, 128), {4, 8, 32, 16, 4, 128, 2, 2, 1, 2, 16, 32, 4, 1}}, - {size_key_t(32, 256), {4, 8, 32, 64, 4, 256, 2, 2, 1, 4, 32, 32, 2, 1}}, - {size_key_t(64, 16), {8, 8, 64, 2, 8, 16, 2, 2, 1, 1, 4, 32, 16, 1}}, - {size_key_t(64, 32), {16, 4, 64, 4, 4, 32, 2, 2, 1, 1, 2, 32, 8, 1}}, - {size_key_t(64, 64), {8, 8, 64, 8, 
8, 64, 2, 2, 2, 1, 8, 32, 8, 1}}, - {size_key_t(64, 128), {4, 4, 64, 8, 8, 128, 2, 2, 1, 4, 32, 32, 16, 1}}, - {size_key_t(64, 256), {8, 8, 64, 8, 8, 256, 2, 2, 1, 4, 16, 32, 16, 1}}, - {size_key_t(128, 16), {16, 8, 128, 2, 8, 16, 2, 2, 1, 1, 2, 32, 16, 1}}, - {size_key_t(128, 32), {32, 4, 128, 4, 8, 32, 2, 2, 2, 1, 2, 32, 16, 1}}, - {size_key_t(128, 64), {8, 8, 128, 8, 8, 64, 2, 2, 4, 1, 16, 32, 16, 1}}, - {size_key_t(128, 128), {8, 8, 128, 16, 8, 128, 2, 2, 2, 2, 16, 32, 8, 1}}, - {size_key_t(256, 16), {32, 4, 256, 4, 4, 16, 2, 2, 4, 1, 4, 32, 32, 1}}, - {size_key_t(256, 32), {16, 8, 256, 8, 4, 32, 2, 2, 4, 1, 8, 32, 16, 1}}, - {size_key_t(256, 64), {8, 8, 256, 8, 8, 64, 2, 2, 4, 1, 16, 32, 16, 1}} - }}, - /* TN */ - {trans_key_t(true, false), std::map{ - {size_key_t(16, 16), {4, 16, 4, 16, 2, 2, 1, 1, 8, 4, 32, 8, 4, 1}}, - {size_key_t(16, 32), {8, 16, 8, 32, 2, 2, 1, 1, 4, 4, 32, 4, 4, 1}}, - {size_key_t(16, 64), {4, 16, 8, 64, 2, 2, 1, 1, 8, 4, 32, 4, 8, 1}}, - {size_key_t(16, 128), {4, 16, 8, 128, 2, 2, 1, 1, 8, 4, 32, 4, 8, 1}}, - {size_key_t(32, 16), {4, 32, 8, 16, 2, 2, 1, 1, 8, 4, 32, 4, 8, 1}}, - {size_key_t(32, 32), {4, 32, 4, 32, 2, 2, 1, 1, 8, 4, 32, 8, 4, 1}}, - {size_key_t(32, 64), {4, 32, 4, 64, 2, 2, 1, 1, 8, 4, 32, 8, 4, 1}}, - {size_key_t(32, 128), {8, 32, 8, 128, 2, 2, 1, 1, 4, 8, 32, 4, 8, 1}}, - {size_key_t(32, 256), {32, 32, 32, 256, 2, 2, 1, 4, 4, 8, 32, 4, 8, 1}}, - {size_key_t(64, 16), {4, 64, 8, 16, 2, 2, 1, 1, 8, 4, 32, 4, 8, 1}}, - {size_key_t(64, 32), {4, 64, 4, 32, 2, 2, 1, 1, 8, 4, 32, 8, 4, 1}}, - {size_key_t(64, 64), {8, 64, 16, 64, 2, 2, 2, 1, 8, 4, 32, 4, 8, 1}}, - {size_key_t(64, 128), {16, 64, 32, 128, 2, 2, 1, 4, 8, 4, 32, 4, 8, 1}}, - {size_key_t(128, 16), {8, 128, 8, 16, 2, 2, 1, 1, 4, 8, 32, 4, 8, 1}}, - {size_key_t(128, 32), {16, 128, 16, 32, 2, 2, 4, 1, 8, 4, 32, 8, 4, 1}}, - {size_key_t(128, 64), {32, 128, 32, 64, 2, 2, 2, 2, 4, 8, 32, 4, 8, 1}}, - {size_key_t(128, 128), {32, 128, 32, 128, 2, 2, 1, 4, 4, 8, 32, 4, 8, 1}}, - {size_key_t(256, 16), {16, 256, 16, 16, 2, 2, 2, 1, 4, 8, 32, 4, 8, 1}}, - {size_key_t(256, 32), {16, 256, 32, 32, 2, 2, 4, 1, 8, 4, 32, 4, 8, 1}}, - }}, - /* TT */ - {trans_key_t(true, true), std::map{ - {size_key_t(16, 16), {8, 16, 4, 4, 16, 2, 2, 1, 1, 4, 8, 32, 8, 1}}, - {size_key_t(16, 32), {8, 16, 8, 4, 32, 2, 2, 1, 1, 4, 8, 32, 4, 1}}, - {size_key_t(16, 64), {16, 16, 4, 8, 64, 2, 2, 1, 4, 8, 4, 32, 32, 1}}, - {size_key_t(16, 128), {16, 16, 8, 8, 128, 2, 2, 1, 1, 2, 4, 32, 4, 1}}, - {size_key_t(32, 16), {4, 32, 4, 4, 16, 2, 2, 1, 1, 8, 4, 32, 8, 1}}, - {size_key_t(32, 32), {8, 32, 8, 4, 32, 2, 2, 1, 1, 4, 8, 32, 4, 1}}, - {size_key_t(32, 64), {64, 128, 8, 4, 64, 2, 2, 4, 1, 2, 8, 32, 16, 1}}, - {size_key_t(32, 128), {16, 32, 32, 4, 128, 2, 2, 1, 2, 4, 8, 32, 2, 1}}, - {size_key_t(32, 256), {32, 32, 32, 4, 256, 2, 2, 1, 4, 4, 8, 32, 4, 1}}, - {size_key_t(64, 16), {4, 64, 2, 8, 16, 2, 2, 1, 1, 8, 4, 32, 16, 1}}, - {size_key_t(64, 32), {4, 64, 8, 4, 32, 2, 2, 1, 1, 8, 4, 32, 4, 1}}, - {size_key_t(64, 64), {16, 64, 8, 8, 64, 2, 2, 2, 1, 4, 8, 32, 8, 1}}, - {size_key_t(64, 128), {32, 64, 8, 8, 128, 2, 2, 1, 4, 4, 4, 32, 16, 1}}, - {size_key_t(64, 256), {64, 64, 8, 8, 256, 2, 2, 1, 4, 2, 8, 32, 16}}, - {size_key_t(128, 16), {8, 128, 2, 8, 16, 2, 2, 1, 1, 4, 8, 32, 16, 1}}, - {size_key_t(128, 32), {16, 128, 8, 4, 32, 2, 2, 4, 1, 8, 4, 32, 16, 1}}, - {size_key_t(128, 64), {32, 128, 8, 8, 64, 2, 2, 4, 1, 4, 8, 32, 16, 1}}, - {size_key_t(128, 128), {32, 128, 16, 8, 128, 2, 2, 2, 2, 4, 8, 32, 8, 1}}, - {size_key_t(256, 16), {32, 
256, 4, 4, 16, 2, 2, 4, 1, 4, 8, 32, 32, 1}}, - {size_key_t(256, 32), {32, 256, 8, 4, 32, 2, 2, 4, 1, 4, 8, 32, 16, 1}} - }} -}; - -// small search space for partial auto-tuning -inline std::vector dot_search_space(bool AT, bool BT) { - std::vector result; - for(auto x: dot_params.at(trans_key_t{AT, BT})) - result.push_back(x.second); - return result; -} - -// simple parameter heuristics -inline params_t dot_heuristics(bool AT, bool BT, size_t M, size_t N, size_t K) { - size_t TM = 128; - size_t TN = 128; -// return {4, 4, 128, 8, 4, 128, 2, 2, 2, 2, 32, 32, 16, 1}; - return dot_params.at(trans_key_t{AT, BT}).at(size_key_t{TM, TN}); -} - - -/* Block-sparse matrix multiplication */ - -static const std::map, std::map> bsdot_params = { - /* FPROP */ - {{true, 32}, std::map{ - {32, {2, 2, 32, 32, 2, 2, 4, 8, 32, 32, 8, 4, 16}}, - {64, {2, 2, 64, 32, 2, 1, 16, 4, 4, 32, 16, 2, 4}}, - {128, {2, 2, 128, 32, 4, 1, 32, 4, 4, 32, 8, 4, 16}} - }}, - - {{true, 16}, std::map{ - {32, {4, 1, 32, 16, 1, 1, 8, 4, 4, 16, 4, 4, 8}}, - {64, {4, 1, 64, 16, 2, 2, 8, 8, 16, 16, 8, 2, 16}}, - {128, {4, 1, 128, 16, 4, 1, 16, 8, 8, 16, 8, 2, 16}} - }}, - - {{true, 8}, std::map{ - {32, {4, 1, 32, 8, 1, 1, 4, 8, 8, 8, 4, 2, 8}}, - {64, {4, 1, 64, 8, 1, 1, 8, 8, 4, 8, 4, 2, 8}}, - {128, {4, 1, 128, 8, 1, 1, 4, 8, 8, 8, 4, 2, 8}} - }}, - - /* BPROP */ - {{false, 32}, std::map{ - {32, {2, 2, 32, 32, 1, 1, 8, 4, 4, 32, 8, 4, 8}}, - {64, {2, 2, 64, 32, 2, 1, 16, 4, 4, 32, 16, 4, 8}}, - {128, {2, 2, 128, 32, 4, 1, 32, 4, 4, 32, 32, 4, 8}} - }}, - - {{false, 16}, std::map{ - {32, {4, 1, 32, 16, 1, 2, 4, 8, 16, 16, 16, 4, 4}}, - {64, {4, 1, 64, 16, 2, 1, 8, 8, 8, 16, 16, 4, 4}}, - {128, {4, 1, 128, 16, 2, 2, 32, 4, 4, 16, 16, 8, 2}} - }}, - - {{false, 8}, std::map{ - {32, {4, 1, 32, 8, 1, 1, 4, 8, 8, 8, 8, 4, 2}}, - {64, {4, 1, 64, 8, 1, 1, 8, 8, 4, 8, 8, 4, 2}}, - {128, {4, 1, 128, 8, 1, 1, 8, 8, 4, 8, 8, 4, 2}} - }} -}; - -// small search space for partial auto-tuning -inline std::vector bsdot_search_space(bool is_fprop, size_t block_size) { - std::vector result; - for(auto x: bsdot_params.at({is_fprop, block_size})) - result.push_back(x.second); - return result; -} - -// simple parameter heuristics -inline params_t bsdot_heuristics(bool is_fprop, size_t block_size, size_t N, size_t S) { - return bsdot_params.at({is_fprop,block_size}).at(128); -} - - -} -} - -#endif diff --git a/include/triton/dnn/shift.h b/include/triton/dnn/shift.h deleted file mode 100644 index 4590c476e..000000000 --- a/include/triton/dnn/shift.h +++ /dev/null @@ -1,192 +0,0 @@ -/* Copyright 2015-2017 Philippe Tillet -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files -* (the "Software"), to deal in the Software without restriction, -* including without limitation the rights to use, copy, modify, merge, -* publish, distribute, sublicense, and/or sell copies of the Software, -* and to permit persons to whom the Software is furnished to do so, -* subject to the following conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -#ifndef TDL_INCLUDE_DNN_SHIFT_H -#define TDL_INCLUDE_DNN_SHIFT_H - -#include -#include -#include -#include -#include -#include "triton/dnn/base.h" -#include "triton/driver/stream.h" -#include "triton/driver/kernel.h" - -namespace triton{ -namespace dnn{ - -enum op_t { - FPROP, - BPROP, - WGRAD -}; - -enum layout_t { - NCHW, - CHWN -}; - -class shift: public base { -private: - // initialize and enqueue - void init_impl(driver::stream *stream, driver::cu_module *module, triton::runtime::launch_information info); - void deinit_impl(); - void enqueue_impl(driver::stream *stream, driver::kernel *kernel, - std::vector args, - triton::runtime::launch_information info); - std::vector search_space() const; - params_t heuristics() const; - -public: - - shift(int B, int NC, - int D, int H, int W, - int T, int R, int S, int NF, - int stride_h, int stride_w, - const int32_t* shift_h, const int32_t* shift_w, - std::string a_ty = "float", std::string b_ty = "float", - op_t ty = FPROP, bool bias = false, layout_t layout = CHWN); - - // look-up table - void build_delta_a(); - void build_masks(); - // accessors - size_t c_size(); - std::vector c_shapes(); - // equivalent GEMM - bool AT() const; - bool BT() const; - size_t M() const; - size_t N() const; - size_t K() const; - size_t lda() const; - size_t ldb() const; - size_t ldc() const; - // number of flops - size_t num_flops() const; - // source - void triton_c_src(std::ostream &os) const; - // retuning parameters - std::vector retune_params() const; - // clone - base* clone() const; - // cpu reference - template - void cpu_ref(OUT_DTYPE* O, - const IN_DTYPE* I, - const IN_DTYPE* F) - { - OUT_DTYPE acc; - for(int32_t p = 0; p < AH_; ++p) - for(int32_t q = 0; q < AW_; ++q) - for(int32_t bs = 0; bs < B_; ++bs) - for(int32_t k = 0; k < F_; ++k) - { - acc = 0; - for(int32_t c = 0; c < C_; ++c){ - int32_t h = p; - int32_t w = q; - if(h >= BH_/2 && h < AH_ - BH_/2 - && w >= BW_/2 && w < AW_ - BW_/2){ - h += shift_h_[c]; - w += shift_w_[c]; - } - IN_DTYPE a = I[bs + w*B_ + h*B_*AW_ + c*B_*AH_*AW_]; - IN_DTYPE b = F[k + c*F_]; - acc = std::fma(a, b, acc); - } - O[bs + q*B_ + p*B_*AW_ + k*B_*AH_*AW_] = acc; - } - } - -private: - int32_t MAX_C_; - int32_t TK_; - // image size - int32_t B_; - int32_t C_; - int32_t AD_; - int32_t AH_; - int32_t AW_; - // filter size - int32_t BD_; - int32_t BH_; - int32_t BW_; - int32_t F_; - // activation size - int32_t CD_; - int32_t CH_; - int32_t CW_; - // interior image size - int32_t IAD_; - int32_t IAH_; - int32_t IAW_; - // interior activation size - int32_t ICD_; - int32_t ICH_; - int32_t ICW_; - // equivalent matmul - int32_t M_; - int32_t N_; - int32_t K_; - // shapes - std::vector shapes_c_; - // strides - int32_t stride_d_; - int32_t stride_h_; - int32_t stride_w_; - // memory strides - int32_t lda_n_, lda_c_, lda_h_, lda_w_; - int32_t ldb_n_, ldb_c_, ldb_h_, ldb_w_; - int32_t ldc_n_, ldc_f_, ldc_h_, ldc_w_; - // shift values - const int32_t* shift_h_; - const int32_t* shift_w_; - bool shift_edge_h_; - bool shift_edge_w_; - // look-up tables - std::vector h_delta_a; - std::vector h_delta_b; - driver::buffer* d_delta_a; - driver::buffer* d_delta_b; - // data types - std::string a_ty_; - std::string b_ty_; - std::string c_ty_; - // convolution type - 
op_t op_; - bool bias_; - // transpose - bool AT_; - bool BT_; - // layout - layout_t layout_; - // locks - size_t max_locks_; - driver::buffer *locks_; -}; - -} -} - -#endif diff --git a/include/triton/runtime/jit.h b/include/triton/runtime/jit.h deleted file mode 100644 index a7fb5deeb..000000000 --- a/include/triton/runtime/jit.h +++ /dev/null @@ -1,136 +0,0 @@ -#ifndef TDL_INCLUDE_JIT_H -#define TDL_INCLUDE_JIT_H - -#include -#include -#include "llvm/IR/LLVMContext.h" -#include "triton/ir/context.h" -#include "triton/ir/print.h" -#include "triton/driver/module.h" -#include "triton/driver/kernel.h" -#include "triton/codegen/selection/selection.h" -#include "triton/codegen/selection/target.h" -#include "triton/codegen/analysis/tune.h" -#include "triton/codegen/analysis/shmem/allocation.h" -#include "triton/codegen/analysis/shmem/liveness.h" -#include "triton/codegen/analysis/shmem/info.h" -#include "triton/codegen/analysis/alignment.h" -#include "triton/codegen/transform/dce.h" -#include "triton/codegen/transform/peephole.h" -#include "triton/codegen/transform/shmem/barriers.h" -#include "triton/codegen/transform/reassociate.h" -#include "triton/codegen/transform/vectorize.h" -#include "triton/runtime/launch_info.h" -#include - -namespace llvm { - class Module; - -} - -namespace triton { - -namespace lang{ -class translation_unit; -} - -namespace codegen{ -namespace analysis{ -class tune; -} -} - -namespace ir { -class module; -class context; -class metaparameter; -} - -namespace runtime{ - -class jit { -public: - typedef std::function benchmark_t; - - struct tune_res_t{ - double perf; - std::vector params; - }; - - struct passes_wrapper { - passes_wrapper(codegen::target* target) - : tune(0), - shmem_liveness(&shmem_info), - shmem_allocation(&shmem_liveness, &shmem_info, &tune), - shmem_barriers(&shmem_allocation, &shmem_info), - vectorize(&tune), - selection(&shmem_allocation, &tune, &shmem_info, &alignment_info, target), - dce(), - peephole(), - alignment_info(), - reassociate(&tune), - target_(target) { } - - void target_independent(ir::module &module) { - peephole.run(module); - dce.run(module); - } - - void target_dependent(ir::module &module) { - reassociate.run(module); - peephole.run(module); - if(target_->is_gpu()){ - shmem_info.run(module); - shmem_liveness.run(module); - shmem_allocation.run(); - shmem_barriers.run(module); - } - alignment_info.run(module); - vectorize.run(module); - dce.run(module); - } - - codegen::selection selection; - codegen::analysis::tune tune; - codegen::analysis::shmem::info shmem_info; - codegen::analysis::shmem::liveness shmem_liveness; - codegen::analysis::shmem::allocation shmem_allocation; - codegen::analysis::alignment_info alignment_info; - codegen::transform::shmem_barriers shmem_barriers; - codegen::transform::vectorize vectorize; - codegen::transform::dce dce; - codegen::transform::peephole peephole; - codegen::transform::reassociate reassociate; - codegen::target* target_; - }; - -private: - std::string compute_data_layout(bool is_64bit = true, bool use_short_pointers = true); - std::unique_ptr make_llvm_module(triton::ir::module &module, passes_wrapper &passes, llvm::LLVMContext &context, launch_information &info); - std::unique_ptr make_triton_module(const char *name, triton::ir::context &context, triton::lang::translation_unit *program); - triton::lang::translation_unit *parse_program(const char *name, const char *src); - -public: - jit(driver::context* context, unsigned nthreads = 4); - ~jit(); - std::vector get_valid(const char 
*name, const char *src); - tune_res_t autotune(const char* name, const char* src, benchmark_t benchmark, const std::vector > &targets = {}); - void add_module(ir::module &module, const std::vector& params = {}); - void add_module(const char* name, const char* src, const std::vector& params = {}); - driver::kernel* get_function(const char* name); - launch_information get_launch_info(const char* name); - -private: - std::map modules_; - driver::context* driver_context_; - llvm::LLVMContext llvm_context_; - ir::context triton_context_; - std::map launch_info_map_; - std::shared_ptr target_; - unsigned nthreads_; -}; - -} -} - -#endif diff --git a/lib/dnn/base.cpp b/lib/dnn/base.cpp deleted file mode 100644 index 86d031564..000000000 --- a/lib/dnn/base.cpp +++ /dev/null @@ -1,94 +0,0 @@ -#include -#include -#include "triton/dnn/base.h" -#include "triton/runtime/jit.h" -#include "triton/tools/bench.hpp" - -namespace triton{ -namespace dnn{ - -namespace rt = triton::runtime; - - -void base::set_ld(const std::vector& shapes, - std::vector& ld) { - size_t size = shapes.size(); - ld.resize(size); - ld[size - 1] = 1; - for(int i = size - 1; i >= 1; i--) - ld[i - 1] = shapes[i] * ld[i]; -} - - -base::base(const std::string& name) - : name_(name) { } - -std::vector base::search_space() const { - return {}; -} - -params_t base::heuristics() const { - return *search_space().begin(); -} - -std::pair base::get_profile_impl(driver::stream *stream, std::vector args, autotuning_t autotune) { - static std::unordered_map, recompile_hash, recompile_equal> m_jit; - driver::context* ctx = stream->context(); - rt::jit* jit; - /* the current template has not already been compiled */ - if(m_jit.find(this) == m_jit.end()) { - base* clone = this->clone(); - jit = m_jit.emplace(clone, std::unique_ptr(new rt::jit(ctx, 8))).first->second.get(); - std::ostringstream oss; - clone->triton_c_src(oss); - std::string src = oss.str(); - auto benchmark = [&](triton::driver::kernel* kernel, - rt::launch_information info) { - // launch info - clone->init_impl(stream, (triton::driver::cu_module*)kernel->module(), info); - clone->enqueue_impl(stream, kernel, args, info); - stream->synchronize(); - double ts = triton::tools::bench([&](){ clone->enqueue_impl(stream, kernel, args, info); }, stream); - clone->deinit_impl(); -// std::cout << ts * 1e-6 << std::endl; - return num_flops() / ts * 1e-3; - }; - // auto-tune and save result - if(autotune == FULL_TUNING || autotune == PARTIAL_TUNING) { - std::vector space = {}; - if(autotune == PARTIAL_TUNING) - space = search_space(); - rt::jit::tune_res_t best = jit->autotune(name_.c_str(), src.c_str(), benchmark, space); - jit->add_module(name_.c_str(), src.c_str(), best.params); - } - else{ - params_t params = heuristics(); - jit->add_module(name_.c_str(), src.c_str(), params); - } - triton::driver::kernel* kernel = jit->get_function(name_.c_str()); - rt::launch_information info = jit->get_launch_info(name_.c_str()); - clone->init_impl(stream, (triton::driver::cu_module*)kernel->module(), info); - } - /* retrieved compiled template */ - else { - jit = m_jit.at(this).get(); - } - auto it = m_jit.find(this); - return {it->first, jit}; -} - -base* base::enqueue(driver::stream *stream, std::vector args, autotuning_t autotune) { - launch_context_t info = get_launch_context(stream, args, autotune); - info.op->enqueue_impl(stream, info.kernel, args, info.info); - return info.op; -} - -launch_context_t base::get_launch_context(driver::stream *stream, std::vector args, autotuning_t autotune) { - 
std::pair profile = get_profile_impl(stream, args, autotune); - driver::kernel* kernel = profile.second->get_function(name_.c_str()); - rt::launch_information info = profile.second->get_launch_info(name_.c_str()); - return {profile.first, kernel, info}; -} - -} -} diff --git a/lib/dnn/batchnorm.cpp b/lib/dnn/batchnorm.cpp deleted file mode 100644 index fe785afdd..000000000 --- a/lib/dnn/batchnorm.cpp +++ /dev/null @@ -1,227 +0,0 @@ -/* Copyright 2015-2019 Philippe Tillet -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files -* (the "Software"), to deal in the Software without restriction, -* including without limitation the rights to use, copy, modify, merge, -* publish, distribute, sublicense, and/or sell copies of the Software, -* and to permit persons to whom the Software is furnished to do so, -* subject to the following conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -#include "triton/dnn/batchnorm.h" - -namespace triton{ -namespace dnn{ - -/* --------------- - * Forward - * --------------- */ - -batchnorm_forward::batchnorm_forward(int C, int D, int H, int W, int B, std::string ty, float eps) - : base("batchnorm_forward"), - C_(C), D_(D), H_(H), W_(W), B_(B), ty_(ty), eps_(eps) { - DHWB_ = D_*H_*W_*B_; - rcpDHWB_ = (float)1 / DHWB_; -} - -size_t batchnorm_forward::num_flops() const { - return C_*DHWB_; -} - - -std::vector batchnorm_forward::retune_params() const { - return {C_, D_, H_, W_, B_}; -} - -base* batchnorm_forward::clone() const { - return new batchnorm_forward(*this); -} - -void batchnorm_forward::enqueue_impl(driver::stream *stream, driver::kernel *kernel, - std::vector args, - runtime::launch_information info) -{ - driver::buffer *y = args[0], *m = args[1], *v = args[2]; - driver::buffer *x = args[3], *g = args[4], *b = args[5]; - std::array grid = {1, (size_t)C_, 1}; - kernel->setArg(0, y); - kernel->setArg(1, m); - kernel->setArg(2, v); - kernel->setArg(3, x); - kernel->setArg(4, g); - kernel->setArg(5, b); - kernel->setArg(6, DHWB_); - kernel->setArg(7, rcpDHWB_); - kernel->setArg(8, eps_); - stream->enqueue(kernel, grid, {info.num_threads, 1, 1}); -} - -void batchnorm_forward::triton_c_src(std::ostream &os) const { - os << -R"( -const tunable int TM = {128}; - -void batchnorm_forward(float *Y, float *M, float *V, - restrict read_only float *X, - restrict read_only float *G, - restrict read_only float *B, - int DHWN, - float rcpDHWN, float eps) { - int rx[TM] = 0 ... 
TM; - float *px[TM]; - float x[TM] = 0; - int c = get_program_id(1); - float g = *(G + c); - float b = *(B + c); - - float mean[TM] = 0; - px = X + rx + c*DHWN; - for(int i = 0; i < DHWN; i = i + TM){ - x = *px; - mean = mean + x; - px = px + TM; - } - float *pm = M + c; - float m = __sum(mean, 0) * rcpDHWN; - *pm = m; - - float var[TM] = 0; - px = X + rx + c*DHWN; - for(int i = 0; i < DHWN; i = i + TM){ - x = *px; - x = x - m; - var = var + x*x; - px = px + TM; - } - float v = __sum(var, 0) * rcpDHWN; - float *pv = V + c; - *pv = v; - float rstdg = 1 / sqrt(v + eps) * g; - - px = X + rx + c*DHWN; - float* py[TM] = Y + rx + c*DHWN; - for(int i = 0; i < DHWN; i = i + TM){ - x = *px; - float y[TM] = (x - m)*rstdg + b; - *py = y; - px = px + TM; - py = py + TM; - } -})"; -} - -/* --------------- - * Backward - * --------------- */ - -batchnorm_backward::batchnorm_backward(int C, int D, int H, int W, int B, std::string ty, float eps) - : base("batchnorm_backward"), - C_(C), D_(D), H_(H), W_(W), B_(B), - ty_(ty), eps_(eps) -{ } - -size_t batchnorm_backward::num_flops() const { - return C_*D_*H_*W_*B_; -} - -std::vector batchnorm_backward::retune_params() const { - return {C_, D_, H_, W_, B_}; -} - -base* batchnorm_backward::clone() const { - return new batchnorm_backward(*this); -} - -void batchnorm_backward::enqueue_impl(driver::stream *stream, driver::kernel *kernel, - std::vector args, - runtime::launch_information info) { - driver::buffer *dx = args[0], *dg = args[1], *db = args[2], *dy = args[3]; - driver::buffer *x = args[4], *g = args[5], *m = args[6], *v = args[7]; - std::array grid = {1, (size_t)C_, 1}; - kernel->setArg(0, dx); - kernel->setArg(1, dg); - kernel->setArg(2, db); - kernel->setArg(3, dy); - kernel->setArg(4, x); - kernel->setArg(5, g); - kernel->setArg(6, m); - kernel->setArg(7, v); - kernel->setArg(8, (int32_t)(D_*H_*W_*B_)); - kernel->setArg(9, (float)1/(D_*H_*W_*B_)); - kernel->setArg(10, eps_); - stream->enqueue(kernel, grid, {info.num_threads, 1, 1}); -} - -void batchnorm_backward::triton_c_src(std::ostream &os) const { - os << -R"( -const tunable int TM = {128}; - -void batchnorm_backward(float *DX, float *DG, float *DB, - restrict read_only float *DY, - restrict read_only float *X, - restrict read_only float *G, - restrict read_only float *M, - restrict read_only float *V, - int DHWN, float rcpDHWN, float epsilon) { - int rx[TM] = 0 ... 
TM; - int c = get_program_id(1); - int offset = c*DHWN; - float g = *(G + c); - float mean = *(M + c); - float var = *(V + c); - float rstd = 1 / sqrt(var + epsilon); - float* px[TM]; - float* pdx[TM]; - float* pdy[TM]; - - px = X + rx + offset; - pdy = DY + rx + offset; - float dg[TM] = 0; - float db[TM] = 0; - for(int i = 0; i < DHWN; i = i + TM){ - float x[TM] = *px; - float dy[TM] = *pdy; - dg = dg + dy*(x - mean)*rstd; - db = db + dy; - px = px + TM; - pdy = pdy + TM; - } - float sdg = __sum(dg, 0); - float sdb = __sum(db, 0); - float *pdg = DG + c; - float *pdb = DB + c; - *pdg = sdg; - *pdb = sdb; - - px = X + rx + offset; - pdy = DY + rx + offset; - pdx = DX + rx + offset; - for(int i = 0; i < DHWN; i = i + TM){ - float x[TM] = *px; - float dy[TM] = *pdy; - float xhat[TM] = (x - mean) * rstd; - float xtmp[TM] = (xhat * dg + db) * rcpDHWN; - float dx[TM] = (dy - xtmp) * rstd * g; - *pdx = dx; - px = px + TM; - pdy = pdy + TM; - pdx = pdx + TM; - } -})"; -} - -} -} diff --git a/lib/dnn/blocksparse/dot.cpp b/lib/dnn/blocksparse/dot.cpp deleted file mode 100644 index b155f9c89..000000000 --- a/lib/dnn/blocksparse/dot.cpp +++ /dev/null @@ -1,238 +0,0 @@ -#include "triton/dnn/heuristics.h" -#include "triton/dnn/blocksparse/dot.h" - -namespace triton{ -namespace dnn{ -namespace blocksparse{ - - -size_t dot::num_flops() const { - return 2.*nblocks_*BS_*BS_*N_; -} - -std::vector dot::retune_params() const{ - return {N_, S_, C_, BS_, nlocks_, op_}; -} - -std::vector dot::search_space() const { - return bsdot_search_space(op_ == FPROP, BS_); -} - -params_t dot::heuristics() const { - return bsdot_heuristics(op_ == FPROP, BS_, N_, S_); -} - -base * dot::clone() const { - return new dot(*this); -} - -dot::dot(int32_t N, int32_t K, int32_t S, int32_t C, - const std::string& ty, int32_t BS, int32_t nlocks, int32_t nblocks, op_t op): - base("bsdot"), - N_(N), K_(K), S_(S), C_(C), - ab_ty_(ty), c_ty_(ty), - BS_(BS), nlocks_(nlocks), nblocks_(nblocks), op_(op){ -} - -void dot::init_impl(driver::stream *stream, driver::cu_module *module, triton::runtime::launch_information info) { - int32_t TM = info.globals["TM"]; - size_t grid_0 = (N_ + TM - 1) / TM; - if(nlocks_ && !locks_){ - locks_.reset(triton::driver::buffer::create(stream->context(), grid_0 * nlocks_ * 2 * 4)); - ((driver::cu_buffer*)locks_.get())->set_zero(stream, grid_0 * nlocks_ * 2 * 4); - } -} - -void dot::deinit_impl() { -} - -void dot::enqueue_impl(driver::stream *stream, driver::kernel *kernel, - std::vector args, runtime::launch_information info) { - driver::buffer *a = args[0]; - driver::buffer *b = args[1]; - driver::buffer *c = args[2]; - driver::buffer *lut = args[3]; - kernel->setArg(0, a); - kernel->setArg(1, b); - kernel->setArg(2, c); - if(op_ == FPROP || op_ == BPROP){ - kernel->setArg(3, N_); - kernel->setArg(4, BS_); - kernel->setArg(5, N_); - } - else{ - kernel->setArg(3, N_); - kernel->setArg(4, N_); - kernel->setArg(5, BS_); - } - kernel->setArg(6, N_); - kernel->setArg(7, lut); - kernel->setArg(8, locks_.get()); - kernel->setArg(9, nlocks_); - if(op_ == FPROP || op_ == BPROP){ - int32_t TM = info.globals["TM"]; - size_t grid_0 = (N_ + TM - 1) / TM; - size_t grid_1 = S_; - if(nlocks_) - ((driver::cu_buffer*)locks_.get())->set_zero(stream, grid_0 * nlocks_ * 2 * 4); - stream->enqueue(kernel, {grid_0, grid_1, 1}, {info.num_threads, 1, 1}); - } - else{ - size_t grid_0 = nblocks_; - stream->enqueue(kernel, {grid_0, 1, 1}, {info.num_threads, 1, 1}); - } -} - -driver::buffer* dot::get_locks() const { - return locks_.get(); -} 
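The generated bsdot kernel below reduces partial tiles through a per-tile spin lock: the first program to acquire the lock stores its accumulator and every later one adds to it, with the zero-initialized pcount slot recording whether a first write has happened. A minimal CPU analogue of that protocol, sketched in portable C++ with std::atomic (accumulate_tile and its signature are illustrative, not part of this patch):

    #include <atomic>
    #include <cstddef>
    #include <vector>

    // First writer of an output tile stores its partial result; later
    // writers accumulate into it. This mirrors the @checkc *pc = c vs.
    // @checkc *pc = c + *pc paths guarded by plock/pcount in the
    // Triton-C source emitted below.
    void accumulate_tile(std::vector<float> &c,
                         const std::vector<float> &partial,
                         std::atomic<int> &lock, std::atomic<int> &count) {
      // spin until the lock is free; same effect as while(__atomic_cas(plock, 0, 1));
      while (lock.exchange(1, std::memory_order_acquire) != 0) { }
      if (count.load(std::memory_order_relaxed) == 0)
        for (std::size_t i = 0; i < c.size(); ++i) c[i] = partial[i];
      else
        for (std::size_t i = 0; i < c.size(); ++i) c[i] += partial[i];
      count.store(1, std::memory_order_relaxed); // __atomic_exch(pcount, 1)
      lock.store(0, std::memory_order_release);  // __atomic_exch(plock, 0)
    }

The conv kernel in lib/dnn/conv.cpp uses the same lock but cycles the counter, countp1 = select(count == GZ - 1, 0, count + 1), so that once all GZ split-K contributions have landed the counter is back at zero for the next launch.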
- -std::string dot::triton_c_src_ydx() const { - bool AT = (op_ == WGRAD); - bool BT = (op_ == FPROP); - std::string usea = AT ? "trans(a)" : "a"; - std::string useb = BT ? "trans(b)" : "b"; - std::string sizea = "TM, TK"; - std::string sizeb = BT ? "TN, TK" : "TK, TN"; - std::string bca0 = ":, newaxis"; - std::string bca1 = "newaxis, :"; - std::string bcb0 = BT ? ":, newaxis" : "newaxis, :"; - std::string bcb1 = BT ? "newaxis, :" : ":, newaxis"; - std::string lda0 = AT ? "*lda" : ""; - std::string lda1 = AT ? "" : "*lda"; - std::string ldb0 = BT ? "" : "*ldb"; - std::string ldb1 = BT ? "*ldb" : "" ; - std::string result = - R"( - const tunable int TM = {16, 32, 64, 128}; - const tunable int TN = {)" + std::to_string(BS_) + R"(}; - const tunable int TK = {)" + std::to_string(BS_) + R"(}; - - void bsdot(restrict read_only align(16) )" + ab_ty_ + R"( *A, - restrict read_only align(16) )" + ab_ty_ + R"( *B, - )" + c_ty_ + R"(* C, - int lda, int ldb, int ldc, - int N, int* lut, - int* locks, int nlocks) { - int ridx = get_program_id(0); - float acc[TM, TN] = 0; - int rka[TK] = 0 ... TK; - int rkb[TK] = 0 ... TK; - int *header = lut + get_program_id(1) * 4; - int offset = *(header + 0); - int K = *(header + 1); - int column = *(header + 2); - int lockid = *(header + 3); - int rxa[TM] = ridx * TM + (0 ... TM); - int ryb[TN] = 0 ... TN; - int *plut = lut + offset * 2; - int offa[)" + sizea + "] = rxa[" + bca0 + "]" + lda0 + " + rka[" + bca1 + "]" + lda1 + R"(; - int offb[)" + sizeb + "] = ryb[" + bcb0 + "]" + ldb0 + " + rkb[" + bcb1 + "]" + ldb1 + R"(; - bool checka[TM, TK] = (rxa < N)[:, newaxis]; - for(int k = K; k > 0; k = k - 1) { - int ak = *(plut + 0); - int bk = *(plut + 1); - )" + ab_ty_ + "* pa[" + sizea + R"(] = A + offa + ak * TK * lda; - )" + ab_ty_ + "* pb[" + sizeb + R"(] = B + offb + bk * TK * TN; - )" + ab_ty_ + " a[" + sizea + R"(] = checka ? *pa : 0; - )" + ab_ty_ + " b[" + sizeb + R"(] = *pb; - acc = dot()" + usea + ", " + useb + R"(, acc); - plut = plut + 2; - } - int rxc[TM] = ridx * TM + (0 ... TM); - int ryc[TN] = column * TN + (0 ... TN); - )" + c_ty_ + R"(" c[TM, TN] = acc; - )" + c_ty_ + R"(* pc[TM, TN] = C + rxc[:, newaxis] + ryc[newaxis, :]*ldc; - bool checkc[TM, TN] = (rxc < N)[:, newaxis]; - if(lockid == 0) { - @checkc *pc = c; - } - else { - int *plock = locks + ridx*nlocks + lockid - 1; - int *pcount = plock + get_num_program(0)*nlocks; - while(__atomic_cas(plock, 0, 1)); - int count = *pcount; - if(count == 0) - @checkc *pc = c; - else - @checkc *pc = c + *pc; - __atomic_exch(pcount, 1); - __atomic_exch(plock, 0); - } - })"; - - return result; -} - -std::string dot::triton_c_src_dw() const { - bool AT = (op_ == WGRAD); - bool BT = (op_ == FPROP); - std::string usea = AT ? "trans(a)" : "a"; - std::string useb = BT ? "trans(b)" : "b"; - std::string sizea = AT ? "TK, TM" : "TM, TK"; - std::string sizeb = BT ? "TN, TK" : "TK, TN"; - std::string bca0 = AT ? "newaxis, :" : ":, newaxis"; - std::string bca1 = AT ? ":, newaxis" : "newaxis, :"; - std::string bcb0 = BT ? ":, newaxis" : "newaxis, :"; - std::string bcb1 = BT ? "newaxis, :" : ":, newaxis"; - std::string lda0 = AT ? "*lda" : ""; - std::string lda1 = AT ? "" : "*lda"; - std::string ldb0 = BT ? "" : "*ldb"; - std::string ldb1 = BT ? 
"*ldb" : "" ; - std::string result = - R"( - const tunable int TM = {)" + std::to_string(BS_) + R"(}; - const tunable int TN = {)" + std::to_string(BS_) + R"(}; - const tunable int TK = {32}; - - void bsdot(restrict read_only align(16) )" + ab_ty_ + R"( *A, - restrict read_only align(16) )" + ab_ty_ + R"( *B, - )" + c_ty_ + R"(* C, - int lda, int ldb, int ldc, - int N, int* lut, - int* locks, int nlocks) { - int ridx = get_program_id(0); - float acc[TM, TN] = 0; - int rka[TK] = 0 ... TK; - int rkb[TK] = 0 ... TK; - int *header = lut + ridx * 2; - int offx = *(header + 0); - int offy = *(header + 1); - int rxa[TM] = offx*TM + (0 ... TM); - int ryb[TN] = offy*TN + (0 ... TN); - bool checka[TK, TM] = (rka < N)[:, newaxis]; - bool checkb[TK, TN] = (rkb < N)[:, newaxis]; - int offa[)" + sizea + "] = rxa[" + bca0 + "]" + lda0 + " + rka[" + bca1 + "]" + lda1 + R"(; - int offb[)" + sizeb + "] = ryb[" + bcb0 + "]" + ldb0 + " + rkb[" + bcb1 + "]" + ldb1 + R"(; - )" + ab_ty_ + " * pa[" + sizea + R"(] = A + offa; - )" + ab_ty_ + " * pb[" + sizeb + R"(] = B + offb; - )" + ab_ty_ + " a[" + sizea + R"(] = checka ? *pa : 0; - )" + ab_ty_ + " b[" + sizeb + R"(] = checkb ? *pb : 0; - for(int k = N; k > 0; k = k - TK) { - acc = dot()" + usea + ", " + useb + R"(, acc); - pa = pa + TK)" + lda1 + R"(; - pb = pb + TK)" + ldb1 + R"(; - a = checka ? *pa : 0; - b = checkb ? *pb : 0; - } - int rxc[TM] = (0 ... TM); - int ryc[TN] = (0 ... TN); - )" + c_ty_ + R"( c[TM, TN] = acc; - )" + c_ty_ + R"(* pc[TM, TN] = C + rxc[:, newaxis]*TM + ryc[newaxis, :] + ridx*TM*TN; - *pc = c; - })"; - - return result; -} -void dot::triton_c_src(std::ostream &os) const { - if(op_ == FPROP || op_ == BPROP) - os << triton_c_src_ydx(); - else - os << triton_c_src_dw(); -} - - - -} -} -} diff --git a/lib/dnn/conv.cpp b/lib/dnn/conv.cpp deleted file mode 100644 index 381691ff0..000000000 --- a/lib/dnn/conv.cpp +++ /dev/null @@ -1,720 +0,0 @@ -#include -#include "triton/dnn/conv.h" - -namespace triton{ -namespace dnn{ - -conv::conv(int B, int NC, - int D, int H, int W, - int T, int R, int S, int NF, - int stride_d, int stride_h, int stride_w, - int pad_d, int pad_h, int pad_w, - int upsample_d, int upsample_h, int upsample_w, - std::string a_ty, std::string b_ty, - type ty, bool bias) - : base("conv"), - NB_(B), NC_(NC), AD_(D), AH_(H), AW_(W), BD_(T), BH_(R), BW_(S), NF_(NF), - stride_d_(stride_d), stride_h_(stride_h), stride_w_(stride_w), - pad_d_(pad_d), pad_h_(pad_h), pad_w_(pad_w), - upsample_d_(upsample_d), upsample_h_(upsample_h), upsample_w_(upsample_w), - a_ty_(a_ty), b_ty_(b_ty), - ty_(ty), bias_(bias) -{ - CD_ = (AD_*upsample_d_ - BD_ + 1 + 2*pad_d_ + stride_d_ - 1)/stride_d_; - CH_ = (AH_*upsample_h_ - BH_ + 1 + 2*pad_h_ + stride_h_ - 1)/stride_h_; - CW_ = (AW_*upsample_w_ - BW_ + 1 + 2*pad_w_ + stride_w_ - 1)/stride_w_; - // shapes - shapes_a_ = {NB_, NC_, AD_, AH_, AW_}; - shapes_b_ = {NC_, BD_, BH_, BW_, NF_}; - shapes_c_ = {NB_, NF_, CD_, CH_, CW_}; - // a layout - NCHW - a_outer_idx_ = 0; - a_inner_idx_ = 1; - a_pix_idx_ = 2; - // b layout - CRSK - b_inner_idx_ = 0; - b_pix_idx_ = 1; - b_outer_idx_ = 4; - // c layout - NKPQ - c_outer_0_idx_ = 0; - c_outer_1_idx_ = 1; - c_pix_idx = 2; - // swap a and c for bprop - if(ty_ == BPROP){ - std::swap(AD_, CD_); - std::swap(AH_, CH_); - std::swap(AW_, CW_); - shapes_a_.swap(shapes_c_); - std::swap(stride_d_, upsample_d_); - std::swap(stride_h_, upsample_h_); - std::swap(stride_w_, upsample_w_); - pad_d_ = (CD_*stride_d_ - AD_*upsample_d_ + BD_ - 1 - stride_d_ + 1)/2; - pad_h_ = 
(CH_*stride_h_ - AH_*upsample_h_ + BH_ - 1 - stride_h_ + 1)/2; - pad_w_ = (CW_*stride_w_ - AW_*upsample_w_ + BW_ - 1 - stride_w_ + 1)/2; - std::swap(b_inner_idx_, b_outer_idx_); - std::swap(NC_, NF_); - } - // swap b and c for wgrad - if(ty_ == WGRAD){ - shapes_b_.swap(shapes_c_); - std::swap(BD_, CD_); - std::swap(BH_, CH_); - std::swap(BW_, CW_); - std::swap(a_outer_idx_, a_inner_idx_); - std::swap(b_inner_idx_, c_outer_0_idx_); - std::swap(b_outer_idx_, c_outer_1_idx_); - std::swap(b_pix_idx_, c_pix_idx); - } - // leading dimensions - set_ld(shapes_a_, ld_a_); - set_ld(shapes_b_, ld_b_); - set_ld(shapes_c_, ld_c_); - // equivalent matmul - bool upsampled_b = (ty_ == BPROP) && (upsample_d_ > 1 || upsample_h_ > 1 || upsample_w_ > 1); - b_trans_ = ty_ != BPROP; - b_lut_ = ty_ == WGRAD || upsampled_b; - M_ = shapes_c_[c_outer_0_idx_]*shapes_c_[c_pix_idx]*shapes_c_[c_pix_idx+1]*shapes_c_[c_pix_idx+2]; - N_ = shapes_c_[c_outer_1_idx_]; - K_ = shapes_b_[b_inner_idx_]*BD_*BH_*BW_; - // look-up table info - if(ty_ == FPROP) - Fs_ = shapes_b_[1]*shapes_b_[2]*shapes_b_[3]; - else - Fs_ = K_; - TK_ = 8; - Luts_ = (TK_ + Fs_ - 1) / Fs_ * Fs_; - build_a_deltas(); - if(b_lut_) - build_b_deltas(); - build_masks(); - size_t cst_size = h_b_deltas_.size()*4; - is_b_deltas_cst_ = cst_size < 65536; - cst_size += h_a_deltas_.size()*4; - is_a_deltas_cst = cst_size < 65536; - cst_size += h_masks_.size()*4; - is_mask_cst_ = cst_size < 65536; - max_grid_0_ = 256; - max_grid_1_ = 256; -} - -// comparison for maps -std::vector conv::retune_params() const { - return {NB_, NC_, AD_, AH_, AW_, - NF_, BD_, BH_, BW_, - pad_d_, pad_h_, pad_w_, - stride_d_, stride_h_, stride_w_, - ty_, bias_}; -} - -// clone -base* conv::clone() const { - return new conv(*this); -} - -size_t conv::a_size() -{ return std::accumulate(shapes_a_.begin(), shapes_a_.end(), - 1, std::multiplies()); } - -size_t conv::b_size() -{ return std::accumulate(shapes_b_.begin(), shapes_b_.end(), - 1, std::multiplies()); } - -size_t conv::c_size() -{ return std::accumulate(shapes_c_.begin(), shapes_c_.end(), - 1, std::multiplies()); } - -std::vector conv::c_shapes() -{ return shapes_c_; } - - -std::tuple conv::unpack(int32_t ltrs, bool flip, int32_t EBD, int32_t EBH, int32_t EBW) { - int32_t l, t, r, s; - if(b_trans_){ - l = ltrs / (EBD*EBH*EBW); - int32_t trs = ltrs % (EBD*EBH*EBW); - int32_t tr = trs / EBW; - s = trs % EBW; - t = tr / EBH; - r = tr % EBH; - } - else{ - int32_t rs = ltrs / NC_; - l = ltrs % NC_; - r = rs / EBW; - s = rs % EBW; - } - if(flip){ - r = EBH - 1 - r; - s = EBW - 1 - s; - } - return std::make_tuple(l, t, r, s); -} - -void conv::build_b_deltas(){ - h_b_deltas_.resize(Luts_*upsample_d_*upsample_h_*upsample_w_); - - size_t Ds0 = Luts_; - size_t Ds1 = upsample_w_; - size_t Ds2 = upsample_h_; - size_t Ds3 = upsample_d_; - for(size_t ud = 0; ud < Ds3; ++ud) - for(size_t uh = 0; uh < Ds2; ++uh) - for(size_t uw = 0; uw < Ds1; ++uw) { - int32_t* deltas_ptr = &h_b_deltas_[uw*Ds0 + uh*Ds0*Ds1 + ud*Ds0*Ds1*Ds2]; - for(size_t i = 0; i < Luts_; ++i) { - int32_t EBD = 1; - int32_t EBH = ((upsample_h_ - uh - 1) + BH_) / upsample_h_; - int32_t EBW = ((upsample_w_ - uw - 1) + BW_) / upsample_w_; - if(EBD == 0 || EBH == 0 || EBW == 0) - continue; - int32_t c, t, r, s; - int32_t nextc, nextt, nextr, nexts; - std::tie(c, t, r, s) = unpack(i, false, EBD, EBH, EBW); - std::tie(nextc, nextt, nextr, nexts) = unpack(i + TK_, false, EBD, EBH, EBW); - int32_t cdiff = nextc - c; - int32_t tdiff = (nextt - t)*upsample_d_; - int32_t rdiff = (nextr - 
r)*upsample_h_; - int32_t sdiff = (nexts - s)*upsample_w_; - deltas_ptr[i] = cdiff*ld_b_[b_inner_idx_] + tdiff*ld_b_[b_pix_idx_] + rdiff*ld_b_[b_pix_idx_ + 1] + sdiff*ld_b_[b_pix_idx_ + 2]; - } - } -} - -void conv::build_a_deltas(){ - h_a_deltas_.resize(Luts_ + upsample_d_*upsample_h_*upsample_w_*Luts_); - for(size_t i = 0; i < Luts_; ++i) - h_a_deltas_[i] = (((i + TK_) % Luts_) - i); - size_t Ds0 = Luts_; - size_t Ds1 = upsample_w_; - size_t Ds2 = upsample_h_; - size_t Ds3 = upsample_d_; - for(size_t ud = 0; ud < Ds3; ++ud) - for(size_t uh = 0; uh < Ds2; ++uh) - for(size_t uw = 0; uw < Ds1; ++uw) { - int32_t* deltas_ptr = &h_a_deltas_[Luts_ + uw*Ds0 + uh*Ds0*Ds1 + ud*Ds0*Ds1*Ds2]; - // cumulative increments - for(size_t i = 0; i < Ds0; ++i) { - int32_t EBD = 1; - int32_t EBH = ((upsample_h_ - uh - 1) + BH_) / upsample_h_; - int32_t EBW = ((upsample_w_ - uw - 1) + BW_) / upsample_w_; - if(EBD == 0 || EBH == 0 || EBW == 0) - continue; - // unpack - int32_t ctrs = i; - int32_t c, t, r, s; - std::tie(c, t, r, s) = unpack(ctrs, !b_trans_, EBD, EBH, EBW); - // next indices - int32_t nextctrs = ctrs + TK_; - int32_t nextc, nextt, nextr, nexts; - std::tie(nextc, nextt, nextr, nexts) = unpack(nextctrs, !b_trans_, EBD, EBH, EBW); - // diffs - int32_t cdiff = nextc - c; - int32_t tdiff = nextt - t; - int32_t rdiff = nextr - r; - int32_t sdiff = nexts - s; - if(ty_ == WGRAD){ - tdiff = tdiff * stride_d_; - rdiff = rdiff * stride_h_; - sdiff = sdiff * stride_w_; - } - // delta pointers - deltas_ptr[i] = cdiff*ld_a_[a_inner_idx_] + tdiff*ld_a_[a_pix_idx_] + rdiff*ld_a_[a_pix_idx_ + 1] + sdiff*ld_a_[a_pix_idx_ + 2]; - } - } -} - -void conv::build_masks(){ - h_masks_.resize(Luts_ + upsample_d_*upsample_h_*upsample_w_*(2*pad_h_+1)*(2*pad_w_+1)*(2*pad_d_+1)*Luts_); - - size_t Ms0 = Luts_; - size_t Ms1 = 2*pad_w_ + 1; - size_t Ms2 = 2*pad_h_ + 1; - size_t Ms3 = 2*pad_d_ + 1; - size_t Ms4 = upsample_w_; - size_t Ms5 = upsample_h_; - size_t Ms6 = upsample_d_; - for(size_t ud = 0; ud < Ms6; ++ud) - for(size_t uh = 0; uh < Ms5; ++uh) - for(size_t uw = 0; uw < Ms4; ++uw) - for(size_t pd = 0; pd < Ms3; ++pd) - for(size_t ph = 0; ph < Ms2; ++ph) - for(size_t pw = 0; pw < Ms1; ++pw){ - int32_t* masks_ptr = &h_masks_[Luts_ + pw*Ms0 + ph*Ms0*Ms1 + pd*Ms0*Ms1*Ms2 + uw*Ms0*Ms1*Ms2*Ms3 + uh*Ms0*Ms1*Ms2*Ms3*Ms4 + ud*Ms0*Ms1*Ms2*Ms3*Ms4*Ms5]; - for(size_t i = 0; i < Ms0; ++i){ - int32_t l, t, r, s; - int32_t mask = 0x0; - for(size_t j = 0; j < TK_; ++j){ - int32_t EBD = 1; - int32_t EBH = ((upsample_h_ - uh - 1) + BH_) / upsample_h_; - int32_t EBW = ((upsample_w_ - uw - 1) + BW_) / upsample_w_; - if(EBD == 0 || EBH == 0 || EBW == 0) - continue; - std::tie(l, t, r, s) = unpack(i + j, !b_trans_, EBD, EBH, EBW); - bool in_bounds_d = (t + pd) >= pad_d_ && (t + pd) < (EBD + pad_d_); - bool in_bounds_h = (r + ph) >= pad_h_ && (r + ph) < (EBH + pad_h_); - bool in_bounds_w = (s + pw) >= pad_w_ && (s + pw) < (EBW + pad_w_); - mask |= (in_bounds_d && in_bounds_h && in_bounds_w) << j; - } - masks_ptr[i] = mask; - } - } - for(size_t i = 0; i < Luts_; ++i) - h_masks_[i] = 0x0; -} - -std::array conv::get_grid(size_t TM, size_t TN){ - return {(M_ + TM - 1)/TM, (N_ + TN - 1)/TN, 1}; -} - -size_t conv::num_flops() const{ - return 2.*M_*N_*K_; -} - -void conv::init_impl(driver::stream *stream, triton::driver::cu_module* module, triton::runtime::launch_information info) { - auto init_lut = [&](bool is_cst, const char *name, std::vector host) -> triton::driver::buffer*{ - if(host.empty()) - return nullptr; - size_t nbytes = host.size()*4; - 
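-    // NB: the is_*_cst flags were decided in the constructor, which admits each
-    // successive LUT into __constant__ memory only while the running footprint
-    // stays under the 64KB constant-memory budget (cst_size < 65536); constant
-    // LUTs are written through the module symbol, the rest through a freshly
-    // allocated global buffer.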
// get buffer - triton::driver::buffer* buffer; - if(is_cst) - buffer = module->symbol(name); - else - buffer = triton::driver::buffer::create(stream->context(), nbytes); - // copy - stream->write(buffer, false, 0, nbytes, host.data()); - return buffer; - }; - if(d_a_deltas_ == nullptr) - d_a_deltas_ = init_lut(is_a_deltas_cst, "delta", h_a_deltas_); - if(d_b_deltas_ == nullptr) - d_b_deltas_ = init_lut(is_b_deltas_cst_, "b_delta", h_b_deltas_); - if(d_masks_ == nullptr) - d_masks_ = init_lut(is_mask_cst_, "masks", h_masks_); - if(d_locks_ == nullptr){ - d_locks_ = triton::driver::buffer::create(stream->context(), max_grid_0_*max_grid_1_*4*2); - ((triton::driver::cu_buffer*)d_locks_)->set_zero(stream, max_grid_0_*max_grid_1_*4*2); - } -} - -void conv::set_arg(driver::kernel *kernel, - driver::buffer *a, driver::buffer *b, driver::buffer *c, driver::buffer *bias) -{ - kernel->setArg(0, a); - kernel->setArg(1, b); - kernel->setArg(2, c); - kernel->setArg(3, bias); - kernel->setArg(4, M_); - kernel->setArg(5, N_); - kernel->setArg(6, K_); - kernel->setArg(7, AH_); - kernel->setArg(8, AW_); - kernel->setArg(9, BH_); - kernel->setArg(10, BW_); - kernel->setArg(11, CH_); - kernel->setArg(12, CW_); - kernel->setArg(13, NC_); - // A arguments - kernel->setArg(14, ld_a_[a_outer_idx_]); - kernel->setArg(15, ld_a_[a_inner_idx_]); - kernel->setArg(16, ld_a_[2]); - kernel->setArg(17, ld_a_[3]); - kernel->setArg(18, ld_a_[4]); - // B arguments - kernel->setArg(19, ld_b_[b_inner_idx_]); - kernel->setArg(20, ld_b_[b_pix_idx_]); - kernel->setArg(21, ld_b_[b_pix_idx_+1]); - kernel->setArg(22, ld_b_[b_pix_idx_+2]); - kernel->setArg(23, ld_b_[b_outer_idx_]); - // C arguments - kernel->setArg(24, ld_c_[c_outer_0_idx_]); - kernel->setArg(25, ld_c_[c_outer_1_idx_]); - kernel->setArg(26, ld_c_[c_pix_idx]); - kernel->setArg(27, ld_c_[c_pix_idx+1]); - kernel->setArg(28, ld_c_[c_pix_idx+2]); - // pad - kernel->setArg(29, pad_h_); - kernel->setArg(30, pad_w_); - // stride - kernel->setArg(31, stride_h_); - kernel->setArg(32, stride_w_); - // dilate - kernel->setArg(33, upsample_h_); - kernel->setArg(34, upsample_w_); - kernel->setArg(35, (int32_t)0); - kernel->setArg(36, (int32_t)0); - kernel->setArg(37, pad_h_); - kernel->setArg(38, pad_w_); - kernel->setArg(39, (int32_t)0); - kernel->setArg(40, (int32_t)0); - kernel->setArg(41, d_locks_); - kernel->setArg(42, max_grid_0_); - kernel->setArg(43, max_grid_1_); - size_t idx = 44; - if(!is_a_deltas_cst) - kernel->setArg(idx++, d_a_deltas_); - if(!is_b_deltas_cst_) - kernel->setArg(idx++, d_b_deltas_); - if(!is_mask_cst_) - kernel->setArg(idx++, d_masks_); -} - -void conv::enqueue_impl(driver::stream *stream, driver::kernel *kernel, - std::vector args, - runtime::launch_information info) { - driver::buffer *a = args[0], *b = args[1], *c = args[2], *bias = args[3]; - unsigned TM = info.globals["TM"], TN = info.globals["TN"]; - unsigned GZ = 1; - set_arg(kernel, a, b, c, bias); - std::array grid = {1}; - grid[0] = (M_ + TM - 1)/TM; - grid[1] = (N_ + TN - 1)/TN; - grid[2] = GZ; - grid[0] /= upsample_h_*upsample_w_; - kernel->setArg(11, CH_/upsample_h_); - kernel->setArg(12, CW_/upsample_w_); - - // initialize to zero if necessary - bool init_zero = false; - for(int32_t off_uh = 0; off_uh < upsample_h_; off_uh++) - for(int32_t off_uw = 0; off_uw < upsample_w_; off_uw++) { - int32_t EBD = 1; - int32_t EBH = ((upsample_h_ - off_uh - 1) + BH_) / upsample_h_; - int32_t EBW = ((upsample_w_ - off_uw - 1) + BW_) / upsample_w_; - if(EBD == 0 || EBH == 0 || EBW == 0) - init_zero = 
true; - } - if(init_zero) - ((driver::cu_buffer*)c)->set_zero(stream, c_size()*4); - - for(int32_t off_uh = 0; off_uh < upsample_h_; off_uh++) - for(int32_t off_uw = 0; off_uw < upsample_w_; off_uw++) { - int32_t EBD = 1; - int32_t EBH = ((upsample_h_ - off_uh - 1) + BH_) / upsample_h_; - int32_t EBW = ((upsample_w_ - off_uw - 1) + BW_) / upsample_w_; - if(EBD == 0 || EBH == 0 || EBW == 0) - continue; - int32_t K = shapes_b_[b_inner_idx_]*EBD*EBH*EBW; - kernel->setArg(6, K); - kernel->setArg(9, EBH); - kernel->setArg(10, EBW); - kernel->setArg(29, pad_h_); - kernel->setArg(30, pad_w_); - kernel->setArg(35, off_uh); - kernel->setArg(36, off_uw); - kernel->setArg(37, (pad_h_ + (1 - upsample_h_)*off_uh)/upsample_h_); - kernel->setArg(38, (pad_w_ + (1 - upsample_w_)*off_uw)/upsample_w_); - kernel->setArg(39, (off_uh + pad_h_) % upsample_h_); - kernel->setArg(40, (off_uw + pad_w_) % upsample_w_); - stream->enqueue(kernel, grid, {info.num_threads, 1, 1}); - } -} - -std::vector conv::default_params() { - if(b_lut_){ - if(!b_trans_) - return {16, 2, 32, 16, 16, 8, 8, 2, 2, 4, 2, 8, 4, 2, 1}; - else - return {32, 2, 64, 32, 2, 64, 16, 8, 2, 2, 4, 2, 8, 1}; - } - else if(ty_ == FPROP) - return {16, 2, 64, 32, 2, 64, 16, 8, 2, 2, 8, 1, 8, 4, 1}; - else - return {16, 2, 64, 16, 16, 16, 4, 2, 2, 4, 2, 8, 4, 2, 1}; -} - - -/* CPU reference implementation */ - -template -void conv::cpu_xprop(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B) -{ - IN_DTYPE acc; - for(int32_t n = 0; n < shapes_c_[0]; ++n) - for(int32_t cf = 0; cf < shapes_c_[1] ; ++cf) - for(int32_t cd = 0 ; cd < shapes_c_[2]; ++cd) - for(int32_t ch = 0 ; ch < shapes_c_[3]; ++ch) - for(int32_t cw = 0; cw < shapes_c_[4]; ++cw) - { - acc = 0; - int32_t d = cd*stride_d_ - pad_d_; - int32_t h = ch*stride_h_ - pad_h_; - int32_t w = cw*stride_w_ - pad_w_; - for(int32_t ac = 0; ac < shapes_a_[1]; ++ac) - for(int32_t bd = 0; bd < shapes_b_[1]; ++bd) - for(int32_t bh = 0; bh < shapes_b_[2]; ++bh) - for(int32_t bw = 0; bw < shapes_b_[3]; ++bw){ - int32_t ad = d + bd; - int32_t ah = h + bh; - int32_t aw = w + bw; - bool in_bounds = (ad >= 0 && ad < shapes_a_[2] && - ah >= 0 && ah < shapes_a_[3] && - aw >= 0 && aw < shapes_a_[4]); - IN_DTYPE a = 0; - if(in_bounds) - a = A[n*ld_a_[0] + ac*ld_a_[1] + ad*ld_a_[2] + ah*ld_a_[3] + aw*ld_a_[4]]; - IN_DTYPE b; - if(b_trans_) - b = B[ac*ld_b_[0] + bd*ld_b_[1] + bh*ld_b_[2] + bw*ld_b_[3] + cf*ld_b_[4]]; - else{ - int32_t bdd = shapes_b_[1] - 1 - bd; - int32_t bhh = shapes_b_[2] - 1 - bh; - int32_t bww = shapes_b_[3] - 1 - bw; - b = B[cf*ld_b_[0] + bdd*ld_b_[1] + bhh*ld_b_[2] + bww*ld_b_[3] + ac*ld_b_[4]]; - } - acc = std::fma(a, b, acc); - } - C[n*ld_c_[0] + cf*ld_c_[1] + cd*ld_c_[2] + ch*ld_c_[3] + cw*ld_c_[4]] = acc; - } -} - -template -void conv::cpu_wgrad(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B) -{ - IN_DTYPE acc; - for(int32_t c = 0 ; c < shapes_c_[0]; ++c) - for(int32_t cd = 0; cd < shapes_c_[1]; ++cd) - for(int32_t ch = 0; ch < shapes_c_[2]; ++ch) - for(int32_t cw = 0; cw < shapes_c_[3]; ++cw) - for(int32_t k = 0 ; k < shapes_c_[4]; ++k) - { - acc = 0; - int32_t d = cd*stride_d_ - pad_d_; - int32_t h = ch*stride_h_ - pad_h_; - int32_t w = cw*stride_w_ - pad_w_; - for(int32_t n = 0; n < shapes_b_[0]; ++n) - for(int32_t bd = 0; bd < shapes_b_[2]; ++bd) - for(int32_t bh = 0; bh < shapes_b_[3]; ++bh) - for(int32_t bw = 0; bw < shapes_b_[4]; ++bw){ - int32_t ad = d + bd; - int32_t ah = h + bh; - int32_t aw = w + bw; - bool in_bounds = (ad >= 0 && ad < shapes_a_[2] && - ah >= 0 && ah < shapes_a_[3] && - aw >= 0 && aw < 
shapes_a_[4]); - IN_DTYPE a = 0; - if(in_bounds) - a = A[n*ld_a_[0] + c*ld_a_[1] + ad*ld_a_[2] + ah*ld_a_[3] + aw*ld_a_[4]]; - IN_DTYPE b = B[n*ld_b_[0] + k*ld_b_[1] + bd*ld_b_[2] + bh*ld_b_[3] + bw*ld_b_[4]]; - acc = std::fma(a, b, acc); - } - C[c*ld_c_[0] + cd*ld_c_[1] + ch*ld_c_[2] + cw*ld_c_[3] + k*ld_c_[4]] = acc; - } -} - -template -void conv::cpu_ref(OUT_DTYPE* C, IN_DTYPE* A, IN_DTYPE* B) -{ - if(ty_ == FPROP || ty_ == BPROP) - cpu_xprop(C, A, B); - else - cpu_wgrad(C, A, B); -} - -/* Triton-C source code */ - -void conv::triton_c_src(std::ostream &os) const { - std::string BS = b_trans_ ? "[TN,TK]" : "[TK, TN]"; - std::string bcb0 = b_trans_ ? "[:, newaxis]" : "[newaxis, :]"; - std::string bcb1 = b_trans_ ? "[newaxis, :]" : "[:, newaxis]"; - std::string ldb0 = b_trans_ ? "*ldb_s" : ""; - std::string useb = b_trans_ ? "trans(b)" : "b"; - std::string flipr = b_trans_ ? "" : "BH - 1 -"; - std::string flips = b_trans_ ? "" : "BW - 1 -"; - std::string upar = ty_ == WGRAD ? "stride_h * ": ""; - std::string upas = ty_ == WGRAD ? "stride_w * ": ""; - std::string upah = ty_ == WGRAD ? "": "*stride_h"; - std::string upaw = ty_ == WGRAD ? "": "*stride_w"; - std::vector crs = {"c", "r", "s"}; - std::vector rsc = {"r", "s", "c"}; - std::vector ax = b_trans_ ? crs : rsc; - std::vector redax; - if(b_trans_) - redax = {"NC", "BH", "BW"}; - else - redax = {"BH", "BW", "NC"}; - std::string inc_pb = b_lut_ ? "db" + bcb1 : "TK" + ldb0; - std::string inc_pdb = b_trans_ ? "incd" : "TK"; - std::string a_delta_mem = is_a_deltas_cst ? "__constant__" : ""; - std::string b_delta_mem = is_b_deltas_cst_? "__constant__" : ""; - std::string masks_mem = is_mask_cst_? "__constant__" : ""; - - os << - R"( -const tunable int TM = {16, 32, 64}; -const tunable int TN = {16, 32, 64}; -const tunable int TK = {)" << TK_ << R"(}; -const tunable int GZ = {1}; -)"; -if(is_a_deltas_cst) - os << "__constant__ int* delta = alloc_const int[" + std::to_string(h_a_deltas_.size()) + "];\n"; -if(b_lut_ && is_b_deltas_cst_) - os << "__constant__ int* b_delta = alloc_const int[" + std::to_string(h_b_deltas_.size()) + "];\n"; -if(is_mask_cst_) - os << "__constant__ int* masks = alloc_const int[" + std::to_string(h_masks_.size()) + "];\n"; -os << R"( - - void conv(read_only restrict )" << a_ty_ << R"( *a, - read_only restrict )" << b_ty_ << R"( *b, - float *c, - float *bias, - int M, int N, int K, - int AH, int AW, - int BH, int BW, - int CH, int CW, - int NC, - int lda_n, int lda_c, int lda_d, int lda_h, int lda_w, - int ldb_c, int ldb_t, int ldb_r, int ldb_s, int ldb_k, - int ldc_n, int ldc_k, int ldc_m, int ldc_p, int ldc_q, - int pad_h, int pad_w, - int stride_h, int stride_w, - int upsample_h, int upsample_w, - int off_uh, int off_uw, - int off_uah, int off_uaw, - int off_uch, int off_ucw, - int *locks, int grid0, int grid1)"; -if(!is_a_deltas_cst) - os << ", int* delta"; -if(b_lut_ && !is_b_deltas_cst_) - os << ", int* b_delta"; -if(!is_mask_cst_) - os << ", int* masks"; - os << R"(){ - int rxa[TM] = get_global_range[TM](0); - int rb0[TN] = get_global_range[TN](1); - int rz = get_global_range[1](2); - int rka[TK] = 0 ... TK; - int rkb[TK] = 0 ... 
TK; - float C[TM, TN] = 0; - int ldlut = )" + std::to_string(Luts_) + R"(; - int div = K / GZ; - int rem = K % GZ; - K = select(rz < rem, div, div + rem); - int offk = rz*div; - rka = rka + offk; - rkb = rkb + offk; - int rabh[TM] = rxa / CW; - int raw[TM] = rxa % CW; - int rab[TM] = rabh / CH; - int rah[TM] = rabh % CH; - rah = rah)" + upaw + R"( - off_uah; - raw = raw)" + upah + R"( - off_uaw; - int ra0[TM] = rab*lda_n + rah*lda_h + raw*lda_w; - int ra)" + ax[0] + ax[1] + "[TK] = rka / " + redax[2] + R"(; - int ra)" + ax[2] + "[TK] = rka % " + redax[2] + R"(; - int ra)" + ax[0] + "[TK] = ra" + ax[0] + ax[1] + " / " + redax[1] + R"(; - int ra)" + ax[1] + "[TK] = ra" + ax[0] + ax[1] + " % " + redax[1] + R"(; - rar = )" + flipr + R"( rar; - ras = )" + flips + R"( ras; - rar = )" + upar + R"( rar; - ras = )" + upas + R"( ras; - int ra1[TK] = rac*lda_c + rar*lda_h + ras*lda_w; - )" << a_ty_ << R"(* pa[TM, TK] = a + ra1[newaxis, :] + ra0[:, newaxis];)"; -if(b_lut_){ - os << R"( - int rb)" + ax[0] + ax[1] + "[TK] = rkb / " + redax[2] + R"(; - int rb)" + ax[2] + "[TK] = rkb % " + redax[2] + R"(; - int rb)" + ax[0] + "[TK] = rb" + ax[0] + ax[1] + " / " + redax[1] + R"(; - int rb)" + ax[1] + "[TK] = rb" + ax[0] + ax[1] + " % " + redax[1] + R"(; - rbr = rbr*upsample_h + off_uh; - rbs = rbs*upsample_w + off_uw; - int offdb[TK] = rkb % ldlut; - int rb1[TK] = rbc*ldb_c + rbr*ldb_r + rbs*ldb_s; - )" + b_delta_mem + R"( int* pdb[TK] = b_delta + offdb + off_uw*ldlut + off_uh*ldlut*upsample_w; - int db[TK] = *pdb;)"; -} -else{ -os << R"( - int rb1[TK] = rkb)" + ldb0 + ";"; -} -os << R"( - )" << b_ty_ << R"(* pb)" + BS + " = b + rb1" + bcb1 + " + rb0" + bcb0 + R"(*ldb_k; - int offda[TK] = rka % ldlut; - )" + a_delta_mem + R"( int* pincd[TK] = delta + offda; - )" + a_delta_mem + R"( int* pda[TK] = delta + ldlut + offda + off_uw*ldlut + off_uh*ldlut*upsample_w; - int da[TK] = *pda; - int incd[TK] = *pincd; - int maskh[TM] = pad_h + min(rah, 0) + max(rah + BH - AH, 0); - int maskw[TM] = pad_w + min(raw, 0) + max(raw + BW - AW, 0); - int offma = offk % ldlut; - )" + masks_mem + R"( int* pm[TM] = masks + ldlut + offma + maskw*ldlut + maskh*ldlut*(2*pad_w + 1) + off_uw*ldlut*(2*pad_w+1)*(2*pad_h+1) + off_uh*ldlut*(2*pad_w+1)*(2*pad_h+1)*upsample_w; - )" + a_delta_mem + R"( int* pincm[TM] = delta + offma; - int incm[TM] = *pincm; - int maska0[TM] = *pm; - int maska1[TK] = 1 << (0 ... TK); - bool checka[TM, TK] = (maska0[:, newaxis] & maska1[newaxis, :]) > 0; - bool checkb0[TN] = rb0 < N; - bool checkb)" + BS + " = checkb0" + bcb0 + R"(; - )" << a_ty_ << R"( a[TM, TK] = checka ? *pa : 0; - )" << b_ty_ << R"( b)" + BS + R"( = checkb ? *pb : 0; - int rkamin[TK] = rka - offk + TK; - for(int k = K; k > 0; k = k - TK){ - C = dot(a, )" + useb + R"(, C); - pa = pa + da[newaxis, :]; - pb = pb + )" + inc_pb + R"(; - pda = pda + incd;)"; -if(b_lut_){ - os << R"( - pdb = pdb + )" + inc_pdb + R"(; - db = *pdb;)"; -} - os << R"( - pincd = pincd + incd; - da = *pda; - incd = *pincd; - pm = pm + incm; - pincm = pincm + incm; - incm = *pincm; - bool checka1[TK] = (rkamin < k); - maska0 = *pm; - checka = (maska0[:, newaxis] & maska1[newaxis, :]) > 0; - checka = checka && checka1[newaxis,:]; - a = checka ? 
*pa : 0; - checkb = checkb && (k > TK); - @checkb b = *pb; - } - int rxc[TM] = get_global_range[TM](0); - int rc1[TN] = get_global_range[TN](1); - int rcn[TM] = rxc / (CH*CW); - int rcpq[TM] = rxc % (CH*CW); - int rcp[TM] = rcpq / CW; - int rcq[TM] = rcpq % CW; - rcp = rcp * upsample_h + off_uch; - rcq = rcq * upsample_w + off_ucw; - bool checkc1[TN] = rc1 < N; - int rc0[TM] = rcn * ldc_n + rcp * ldc_p + rcq * ldc_q; - float* pc[TM, TN] = c + rc1[newaxis, :]*ldc_k + rc0[:, newaxis]; - bool checkc0[TM] = rxc < M; - bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - int ridx = get_program_id(0); - int ridy = get_program_id(1); - int *plock = locks + ridx + ridy*grid0; - while(__atomic_cas(plock, 0, 1) == 1); - int *pcount = plock + grid0*grid1; - int count = *pcount; - int countp1 = select(count == GZ - 1, 0, count + 1); - if(count == 0) {)"; - if(bias_ && ty_==FPROP){ - os << R"( - float* pbias[TN] = bias + rc1; - float bias[TN] = checkc1 ? *pbias : 0; - C = C + bias[newaxis, :];)"; - } - os << R"( - @checkc *pc = C; - *pcount = countp1; - } - else { - @checkc *pc = C + *pc; - *pcount = countp1; - } - *plock = 0; -})"; -} - -template void conv::cpu_ref(float*, float*, float*); -template void conv::cpu_xprop(float*, float*, float*); -template void conv::cpu_wgrad(float*, float*, float*); - -} -} diff --git a/lib/dnn/dot.cpp b/lib/dnn/dot.cpp deleted file mode 100644 index f3d35a2f0..000000000 --- a/lib/dnn/dot.cpp +++ /dev/null @@ -1,162 +0,0 @@ -#include "triton/driver/stream.h" -#include "triton/driver/kernel.h" -#include "triton/dnn/dot.h" -#include "triton/dnn/heuristics.h" -#include - -namespace triton{ -namespace dnn{ - -dot::dot(int M, int N, int K, - bool AT, bool BT, - std::string a_ty, std::string b_ty, std::string c_ty, - unsigned align_lda, unsigned align_ldb, unsigned align_ldc) - : base("matmul"), - M_(M), N_(N), K_(K), AT_(AT), BT_(BT), - a_ty_(a_ty), b_ty_(b_ty), c_ty_(c_ty), - align_lda_(align_lda), align_ldb_(align_ldb), align_ldc_(align_ldc), - locks_(nullptr) { - -} - -size_t dot::num_flops() const { - return 2.*M_*N_*K_; -} - -// retune parameters -std::vector dot::retune_params() const { - return {M_, N_, K_, AT_, BT_, - (int)align_lda_, (int)align_ldb_}; -} - -// clone -base* dot::clone() const { - return new dot(*this); -} - -void dot::init_impl(driver::stream* stream, driver::cu_module *, runtime::launch_information) { - std::vector hlocks(2048, 0); - if(locks_ == nullptr) - locks_ = triton::driver::buffer::create(stream->context(), hlocks.size()*4); - stream->write(locks_, false, 0, hlocks); -} - -void dot::enqueue_impl(driver::stream *stream, driver::kernel *kernel, - std::vector args, - runtime::launch_information info) { - driver::buffer *a = args[0], *b = args[1], *c = args[2]; - unsigned TM = info.globals.at("TM"); - unsigned TN = info.globals.at("TN"); - unsigned TK = info.globals.at("TK"); - unsigned grid_0 = (M_ + TM - 1)/TM; - unsigned grid_1 = (N_ + TN - 1)/TN; - unsigned grid_2 = 1; - int32_t lda = AT_ ? K_ : M_; - int32_t ldb = BT_ ? 
N_ : K_; - int32_t ldc = M_; - std::array grid = {grid_0, grid_1, grid_2}; - kernel->setArg(0, a); - kernel->setArg(1, b); - kernel->setArg(2, c); - kernel->setArg(3, M_); - kernel->setArg(4, N_); - kernel->setArg(5, K_); - kernel->setArg(6, lda); - kernel->setArg(7, ldb); - kernel->setArg(8, ldc); - kernel->setArg(9, TK); - kernel->setArg(10, locks_); - kernel->setArg(11, grid_0); - kernel->setArg(12, grid_1); - stream->enqueue(kernel, grid, {info.num_threads, 1, 1}); -} - -void dot::triton_c_src(std::ostream &os) const { - std::string ZS = "1"; - std::string AS0 = "TM", AS1 = "TK"; - std::string BS0 = "TK", BS1 = "TN"; - std::string XAS0 = "TM", XAS1 = "TK / " + ZS, XAS2 = ZS; - std::string XBS0 = "TK / " + ZS, XBS1 = ZS, XBS2 = "TN"; - std::string bca0 = "[newaxis, :]", bca1 = "[:, newaxis]"; - std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; - std::string lda0 = "*lda", lda1 = ""; - std::string ldb0 = "", ldb1 = "*ldb"; - std::string usea = AT_ ? "trans(a)" : "a"; - std::string useb = BT_ ? "trans(b)" : "b"; - if(AT_){ - std::swap(AS0, AS1); - std::swap(XAS0, XAS1); - std::swap(XAS1, XAS2); - std::swap(bca0, bca1); - std::swap(lda0, lda1); - } - if(BT_){ - std::swap(BS0, BS1); - std::swap(XBS1, XBS2); - std::swap(XBS0, XBS1); - std::swap(bcb0, bcb1); - std::swap(ldb0, ldb1); - } - std::string AS = AS0 + ", " + AS1; - std::string BS = BS0 + ", " + BS1; -// std::string XAS = XAS0 + ", " + XAS1 + ", " + XAS2; -// std::string XBS = XBS0 + ", " + XBS1 + ", " + XBS2; - std::string XCS = "TM, TN"; - std::string align_lda_str = "multiple_of(" + std::to_string(align_lda_) + ")"; - std::string align_ldb_str = "multiple_of(" + std::to_string(align_ldb_) + ")"; - std::string res = -R"( -const tunable int TM = {128}; -const tunable int TN = {128}; -const tunable int TK = {32}; - -void matmul(restrict read_only align(16) )" + a_ty_ + R"( *A, - restrict read_only align(16) )" + b_ty_ + R"( *B, - restrict read_only align(16) )" + c_ty_ + R"( *C, - int M, int N, int K, - )" + align_lda_str + R"( int lda, )" + align_ldb_str + R"(" int ldb, int ldc, - int bound, int *locks, int grid0, int grid1) { - int ridx = get_program_id(0); - int ridy = get_program_id(1); - int rxa[TM] = ridx * TM + (0 ... TM); - int ryb[TN] = ridy * TN + (0 ... TN); - int rka[TK] = 0 ... TK; - int rkb[TK] = 0 ... TK; - float xc[)" + XCS + R"(] = 0; - )" + a_ty_ + R"(* pa[)" + AS + "] = A + rka" + bca0 + lda0 + " + rxa" + bca1 + lda1 + R"(; - )" + b_ty_ + R"(* pb[)" + BS + "] = B + rkb" + bcb0 + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; - )" + a_ty_ + R"( a[)" + AS + R"(] = *pa; - )" + b_ty_ + R"( b[)" + BS + R"(] = *pb; - for(int k = K; k > 0; k = k - TK){ - xc = dot()" + usea + ", " + useb + R"(, xc); - pa = pa + TK)" + lda0 + R"(; - pb = pb + TK)" + ldb0 + R"(; - a = *pa; - b = *pb; - } - int rxc[TM] = ridx * TM + (0 ... TM); - int ryc[TN] = ridy * TN + (0 ... 
TN); - )" + c_ty_ + R"(* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - )" + c_ty_ + R"( c[TM, TN] = xc; - bool checkc0[TM] = rxc < M; - bool checkc1[TN] = ryc < N; - bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - @checkc *pc = c; -} -)"; - - os << res; -} - -// small search space for partial auto-tuning -std::vector dot::search_space() const { - return dot_search_space(AT_, BT_); -} - -// simple parameter heuristics -params_t dot::heuristics() const { - return dot_heuristics(AT_, BT_, M_, N_, K_); -} - -} -} diff --git a/lib/dnn/shift.cpp b/lib/dnn/shift.cpp deleted file mode 100644 index 93ae57cd4..000000000 --- a/lib/dnn/shift.cpp +++ /dev/null @@ -1,538 +0,0 @@ -#include -#include "triton/dnn/shift.h" -#include "triton/dnn/heuristics.h" -#include "triton/tools/bench.hpp" - -namespace triton{ -namespace dnn{ - - -shift::shift(int B, int C, - int D, int H, int W, - int T, int R, int S, - int F, - int stride_h, int stride_w, - const int32_t *shift_h, const int32_t *shift_w, - std::string a_ty, std::string b_ty, - op_t ty, bool bias, - layout_t layout) - : base("shift"), - B_(B), C_(C), - AD_(D), AH_(H), AW_(W), - BD_(T), BH_(R), BW_(S), - F_(F), - stride_d_(1), stride_h_(stride_h), stride_w_(stride_w), - shift_h_(shift_h), shift_w_(shift_w), - a_ty_(a_ty), b_ty_(b_ty), c_ty_(b_ty), - op_(ty), bias_(bias), - layout_(layout){ -// std::cout << B_ << " " << C_ << " " << F_ << " " << stride_h_ << " " << stride_w_ << " " << a_ty_ << " " << b_ty_ << " " << ty_ << " " << layout_ << std::endl; - // max number of channels - TK_ = (ty == FPROP && a_ty_ == "float") ? 8 : 32; - MAX_C_ = 8192 + TK_; - // activation sizes - CD_ = AD_ / stride_d_; - CH_ = AH_ / stride_h_; - CW_ = AW_ / stride_w_; - // A memory strides: [C, H, W, B] - switch(layout_){ - case CHWN: { - lda_n_ = 1; - lda_w_ = B_; - lda_h_ = B_*AW_; - lda_c_ = B_*AW_*AH_; - break; - } - case NCHW: { - lda_w_ = 1; - lda_h_ = AW_; - lda_c_ = AW_*AH_; - lda_n_ = AW_*AH_*C_; - break; - } - default: - throw std::runtime_error("unsupported input layout"); - } - // Shift edge - shift_edge_h_ = (AH_ == stride_h_ && stride_h_ > 1); - shift_edge_w_ = (AW_ == stride_w_ && stride_w_ > 1); - // B memory strides: [C, F] - ldb_n_ = 1; - ldb_h_ = 1; - ldb_w_ = 1; - ldb_c_ = F_; - // C memory strides: [F, H, W, B] - switch(layout_){ - case CHWN: { - ldc_n_ = 1; - ldc_w_ = B_; - ldc_h_ = B_*CW_; - ldc_f_ = B_*CW_*CH_; - break; - } - case NCHW: { - ldc_w_ = 1; - ldc_h_ = CW_; - ldc_f_ = CW_*CH_; - ldc_n_ = CW_*CH_*F_; - break; - } - default: - throw std::runtime_error("unsupported input layout"); - } - IAD_ = AD_ - 2*(BD_/2); - IAH_ = AH_ - 2*(BH_/2); - IAW_ = AW_ - 2*(BW_/2); - ICD_ = IAD_ / stride_d_; - ICH_ = IAH_ / stride_h_; - ICW_ = IAW_ / stride_w_; - - // Equivalent matmul - M_ = B_*ICH_*ICW_; - N_ = F_; - K_ = C_; - // transpose - AT_ = false; - BT_ = true; - // C shapes - if(layout_ == CHWN) - shapes_c_ = {F, CH_, CW_, B}; - if(layout_ == NCHW) - shapes_c_ = {B, F, CH_, CW_}; - // Weight gradient - if(op_ == WGRAD){ - // b <-> c - // b <-> a - std::swap(ldb_n_, ldc_n_); - std::swap(ldb_w_, ldc_w_); - std::swap(ldb_h_, ldc_h_); - std::swap(ldb_c_, ldc_f_); - std::swap(lda_n_, ldb_n_); - std::swap(lda_w_, ldb_w_); - std::swap(lda_h_, ldb_h_); - std::swap(lda_c_, ldb_c_); - std::swap(M_, K_); - std::swap(M_, N_); - AT_ = true; - BT_ = false; - shapes_c_ = {C, F}; - } - // Input gradient - if(op_ == BPROP){ - // a <-> c - std::swap(lda_n_, ldc_n_); - std::swap(lda_w_, ldc_w_); - std::swap(lda_h_, ldc_h_); - 
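-    // exchanging lda_c_ with ldc_f_ completes the a <-> c stride swap for BPROP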
std::swap(lda_c_, ldc_f_); - std::swap(K_, N_); - AT_ = false; - BT_ = false; - if(layout_ == CHWN) - shapes_c_ = {C, AH_, AW_, B}; - if(layout_ == NCHW) - shapes_c_ = {B, C, AH_, AW_}; - } - // locks - max_locks_ = (op_ == WGRAD) ? 8192 : 0; - locks_ = nullptr; -} - -base* shift::clone() const { - return new shift(*this); -} - -void shift::build_delta_a() { - h_delta_a.resize(MAX_C_); - auto shift_h = [&](int c) { return shift_edge_h_ ? (c / AH_) % AH_ : shift_h_[c]; }; - auto shift_w = [&](int c) { return shift_edge_w_ ? c % AW_ : shift_w_[c]; }; - if(op_ == FPROP){ - // compute offset - auto offset = [&](unsigned c) { - return c*lda_c_ + shift_h(c)*lda_h_ + shift_w(c)*lda_w_; - }; - // populate look-up table - for(unsigned c = 0; c < TK_; c++) - h_delta_a[c] = offset(c); - for(unsigned c = 0; c < C_; c++) - h_delta_a[TK_ + c] = offset(c + TK_) - offset(c); - } - if(op_ == BPROP){ - for(unsigned c = 0; c < C_; c++){ - h_delta_a[c] = shift_h(c)*ldc_h_ + shift_w(c)*ldc_w_; - } - } - if(op_ == WGRAD){ - for(unsigned c = 0; c < C_; c++) - h_delta_a[c] = shift_h(c)*ldb_h_ + shift_w(c)*ldb_w_; - } -} - -size_t shift::c_size() { - return std::accumulate(shapes_c_.begin(), shapes_c_.end(), - 1, std::multiplies()); -} - -std::vector shift::c_shapes(){ - return shapes_c_; -} - -size_t shift::num_flops() const { - return 2.*M_*N_*K_; -} - -bool shift::AT() const -{ return AT_; } - -bool shift::BT() const -{ return BT_; } - -size_t shift::M() const -{ return M_; } - -size_t shift::N() const -{ return N_; } - -size_t shift::K() const -{ return K_; } - -size_t shift::lda() const -{ return AT_ ? K_ : M_; } - -size_t shift::ldb() const -{ return BT_ ? N_ : K_; } - -size_t shift::ldc() const -{ return M_; } - -std::vector shift::retune_params() const { - return {B_, C_, F_, - AD_, AH_, AW_, - BD_, BH_, BW_, - CD_, CH_, CW_, - (int64_t)shift_h_, (int64_t)shift_w_, - stride_h_, stride_w_, - layout_, op_, - bias_}; -} - -void shift::init_impl(driver::stream *stream, driver::cu_module *module, triton::runtime::launch_information info) { - build_delta_a(); - triton::driver::buffer* delta_a = ((triton::driver::cu_module*)module)->symbol("delta_a"); - stream->write(delta_a, false, 0, h_delta_a.size()*4, h_delta_a.data()); - // locks - if(locks_ == nullptr && max_locks_ > 0){ - std::vector hlocks(2*max_locks_, 0); - locks_ = triton::driver::buffer::create(stream->context(), 2*max_locks_*4); - stream->write(locks_, false, 0, hlocks); - } -} - -void shift::deinit_impl() { - if(locks_ != nullptr){ - delete locks_; - locks_ = nullptr; - } -} - -void shift::enqueue_impl(driver::stream *stream, driver::kernel *kernel, - std::vector args, - runtime::launch_information info) { - unsigned TM = info.globals.at("TM"), TN = info.globals.at("TN"); - unsigned grid_0 = (M_ + TM - 1)/TM; - unsigned grid_1 = (N_ + TN - 1)/TN; - unsigned num_locks = grid_0 * grid_1; - unsigned grid_2 = num_locks < max_locks_ ? 
info.globals.at("GZ") : 1; - std::array grid = {grid_0, grid_1, grid_2}; - driver::buffer *a = args[0], *b = args[1], *c = args[2]; -// std::cout << op_ << " " << M_ << " " << N_ << " " << K_ << std::endl; - kernel->setArg(0, a); - kernel->setArg(1, b); - kernel->setArg(2, c); - kernel->setArg(3, M_); - kernel->setArg(4, N_); - kernel->setArg(5, K_); - kernel->setArg(6, stride_h_); - kernel->setArg(7, stride_w_); - kernel->setArg(8, lda_n_); - kernel->setArg(9, lda_w_); - kernel->setArg(10, lda_h_); - kernel->setArg(11, lda_c_); - kernel->setArg(12, ldb_n_); - kernel->setArg(13, ldb_w_); - kernel->setArg(14, ldb_h_); - kernel->setArg(15, ldb_c_); - kernel->setArg(16, ldc_n_); - kernel->setArg(17, ldc_w_); - kernel->setArg(18, ldc_h_); - kernel->setArg(19, ldc_f_); - kernel->setArg(20, B_); - kernel->setArg(21, IAH_); - kernel->setArg(22, IAW_); - kernel->setArg(23, BH_); - kernel->setArg(24, BW_); - kernel->setArg(25, ICH_); - kernel->setArg(26, ICW_); - kernel->setArg(27, (num_locks > max_locks_) ? nullptr : locks_); - kernel->setArg(28, (int32_t)grid[0]); - kernel->setArg(29, (int32_t)grid[1]); - kernel->setArg(30, (int32_t)grid[2]); - if(locks_) - ((driver::cu_buffer*)locks_)->set_zero(stream, 2*max_locks_*4); - stream->enqueue(kernel, grid, {info.num_threads, 1, 1}); -} - -void shift::triton_c_src(std::ostream &os) const { - std::string AS0 = "TM", AS1 = "TK"; - std::string BS0 = "TK", BS1 = "TN"; - std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; - std::string usea = AT_ ? "trans(a)" : "a"; - std::string useb = BT_ ? "trans(b)" : "b"; - std::string bca0 = "[newaxis, :]", bca1 = "[:, newaxis]"; - std::string stride_h = std::to_string(stride_h_); - std::string stride_w = std::to_string(stride_w_); - if(AT_){ - std::swap(AS0, AS1); - std::swap(bca0, bca1); - } - if(BT_){ - std::swap(BS0, BS1); - std::swap(bcb0, bcb1); - } - std::string AS = AS0 + ", " + AS1; - std::string BS = BS0 + ", " + BS1; - bool is_chwn = layout_ == CHWN; - - std::string lda_b = is_chwn ? "1" : "lda_b"; - std::string ldb_b = is_chwn ? "1" : "ldb_b"; - std::string ldc_b = is_chwn ? 
"1" : "ldc_b"; - - - auto compute_bhw = [&](std::string rx, std::string sz, std::string rkx){ - std::string B = std::to_string(B_); - std::string CW = std::to_string(ICW_); - std::string CH = std::to_string(ICH_); - - if(is_chwn) { - return R"( - int )" + rx + "wh[" + sz + "] = " + rkx + " / " + B + R"(; - int )" + rx + "b[" + sz + "] = " + rkx + " % " + B + R"(; - int )" + rx + "w[" + sz + "] = (" + rx + "wh % " + CW + R"() + pad_w; - int )" + rx + "h[" + sz + "] = (" + rx + "wh / " + CW + R"() + pad_h;)"; - } - else { - return R"( - int )" + rx + "bh[" + sz + "] = " + rkx + " / " + CW + R"(; - int )" + rx + "w[" + sz + "] = (" + rkx + " % " + CW + R"() + pad_w; - int )" + rx + "h[" + sz + "] = (" + rx + "bh % " + CH + R"() + pad_h; - int )" + rx + "b[" + sz + "] = " + rx + "bh / " + CH + ";"; - } - }; - - std::string result = -R"( -const tunable int TM = {16, 32, 64, 128}; -const tunable int TN = {16, 32, 64, 128}; -const tunable int TK = {)" + std::to_string(TK_) + "};"; -if(op_ == WGRAD) - result += "const tunable int GZ = {1};"; -else - result += "const tunable int GZ = {1};"; - -result += R"( -__constant__ int* delta_a = alloc_const int[)" + std::to_string(MAX_C_) + R"(]; - -void shift(restrict read_only align(16) )" + a_ty_ + R"( *A, - restrict read_only align(16) )" + b_ty_ + R"( *B, - )" + c_ty_ + R"( *C, - int M, int N, int K, - int stride_h, int stride_w, - multiple_of(8) int lda_b, multiple_of(8) int lda_w, multiple_of(8) int lda_h, multiple_of(8) int lda_c, - multiple_of(8) int ldb_b, multiple_of(8) int ldb_w, multiple_of(8) int ldb_h, multiple_of(8) int ldb_c, - multiple_of(8) int ldc_b, multiple_of(8) int ldc_w, multiple_of(8) int ldc_h, multiple_of(8) int ldc_c, - int NB, - int AH, int AW, - int BH, int BW, - int CH, int CW, - int* locks, int grid0, int grid1, int grid2) { - int ridx = get_program_id(0); - int ridy = get_program_id(1); - int rz = get_program_id(2); - int rxa[TM] = ridx*TM + (0 ... TM); - int ryb[TN] = ridy*TN + (0 ... TN); - int rka[TK] = 0 ... TK; - int rkb[TK] = 0 ... 
TK; - float acc[TM, TN] = 0; - int pad_h = BH / 2; - int pad_w = BW / 2;)"; - -/* A offsets */ -if(op_ == FPROP){ - result += - compute_bhw("ra", "TM", "rxa") + R"( - raw = raw * )" + stride_w + R"(; - rah = rah * )" + stride_h + R"(; - int offxa[TM] = rab*)" + lda_b + R"( + raw*lda_w + rah*lda_h; - int offa0[TM, TK] = offxa[:, newaxis]; - __constant__ int* pd[TK] = delta_a + rka; - multiple_of(8) int d[TK] = *pd; - int offa1[TM, TK] = d[newaxis, :];)"; -} -if(op_ == BPROP){ - result += - compute_bhw("ra", "TM", "rxa") + R"( - int offxa[TM] = rab*)" + lda_b + R"( + raw*lda_w + rah*lda_h; - int offa0[TM, TK] = offxa[:, newaxis]; - int offa1[TM, TK] = rka[newaxis, :] * lda_c;)"; -} -if(op_ == WGRAD){ - result += - compute_bhw("ra", "TK", "rka") + R"( - int offa0[TK, TM] = rxa[newaxis, :] * lda_c; - int offxa[TK] = rab*)" + lda_b + R"( + raw*lda_w + rah*lda_h; - int offa1[TK, TM] = offxa[:, newaxis];)"; -} - -/* B offsets */ -if(op_ == FPROP){ - result += R"( - int offb0[TN, TK] = ryb[:, newaxis]; - int offb1[TN, TK] = rkb[newaxis, :] * ldb_c;)"; -} -if(op_ == BPROP){ - result += R"( - int offb0[TK, TN] = ryb[newaxis, :] * ldb_c; - int offb1[TK, TN] = rkb[:, newaxis];)"; -} -if(op_ == WGRAD){ - result += - compute_bhw("rb", "TK", "rkb") + R"( - __constant__ int* pd[TN] = delta_a + ryb; - multiple_of(8) int d[TN] = *pd; - multiple_of(8) int shift[TK, TN] = d[newaxis, :]; - rbw = rbw * )" + stride_w + R"(; - rbh = rbh * )" + stride_h + R"(; - int offkb[TK] = rbb*)" + ldb_b + R"( + rbw*ldb_w + rbh*ldb_h; - int offb0[TK, TN] = ryb[newaxis, :] * ldb_c; - int offb1[TK, TN] = offkb[:, newaxis]; - )" + a_ty_ + "* pa_base[" + AS + R"(] = A + offa0; - )" + b_ty_ + "* pb_base[" + BS + R"(] = B + offb0 + shift; - )" + a_ty_ + "* pa[" + AS + R"(] = pa_base + offa1; - )" + b_ty_ + "* pb[" + BS + R"(] = pb_base + offb1;)"; -} -else{ - result += R"( - )" + a_ty_ + "* pa[" + AS + R"(] = A + offa0 + offa1; - )" + b_ty_ + "* pb[" + BS + R"(] = B + offb0 + offb1;)"; -} - -/* Main loop */ -/* Increment A pointers */ - result += R"( - bool checka[)" + AS + "] = (rka < K)" + bca0 + R"(; - bool checkb[)" + BS + "] = (rkb < K)" + bcb0 + R"(; - )" + a_ty_ + " a[" + AS + R"(] = checka ? *pa : 0; - )" + b_ty_ + " b[" + BS + R"(] = checkb ? *pb : 0; - for(int k = K; k > 0; k = k - TK){ - acc = dot()" + usea + "," + useb + R"(, acc); - bool checka[)" + AS + R"(] = k > TK; - bool checkb[)" + BS + R"(] = k > TK;)"; - -/* Increment A pointers */ -if(op_ == FPROP){ - result += R"( - pd = pd + TK; - d = *pd; - pa = pa + d[newaxis, :];)"; -} -if(op_ == BPROP){ - result += R"( - pa = pa + TK * lda_c;)"; -} -if(op_ == WGRAD){ - result += R"( - rka = rka + TK;)" - + compute_bhw("ra", "TK", "rka") + R"( - offxa = rab*)" + lda_b + R"( + raw*lda_w + rah*lda_h; - pa = pa_base + offxa[:, newaxis];)"; -} - result += R"( - a = checka ? *pa : 0;)"; - -/* Increment B pointers */ -if(op_ == WGRAD){ - result += R"( - rkb = rkb + TK;)" - + compute_bhw("rb", "TK", "rkb") + R"( - rbw = rbw * )" + stride_w + R"(; - rbh = rbh * )" + stride_h + R"(; - offkb = rbb*)" + ldb_b + R"( + rbw*ldb_w + rbh*ldb_h; - pb = pb_base + offkb[:, newaxis];)"; -} -if(op_ == FPROP){ - result += R"( - pb = pb + TK * ldb_c;)"; -} -if(op_ == BPROP){ - result += R"( - pb = pb + TK;)"; -} - result += R"( - b = checkb ? *pb : 0; - } - int rxc[TM] = ridx*TM + (0 ... TM); - int ryc[TN] = ridy*TN + (0 ... 
TN);)"; - -/* C offsets */ -if(op_ == BPROP){ - result += - compute_bhw("rc", "TM", "rxc") + R"( - rcw = rcw * )" + stride_w + R"(; - rch = rch * )" + stride_h + R"(; - int offxc[TM] = rcb*)" + ldc_b + R"( + rcw*ldc_w + rch*ldc_h;)"; - } -if(op_ == FPROP){ - result += - compute_bhw("rc", "TM", "rxc") + R"( - int offxc[TM] = rcb*)" + ldc_b + R"( + rcw*ldc_w + rch*ldc_h;)"; -} -if(op_ == WGRAD){ - result += R"( - int offxc[TM] = rxc;)"; -} - result += R"(" - )" + c_ty_ + R"( c[TM, TN] = acc; - )" + c_ty_ + R"(* pc[TM, TN] = C + offxc[:, newaxis] + ryc[newaxis, :]*ldc_c; - bool checkc0[TM] = rxc < M; - bool checkc1[TN] = ryc < N; - bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];)"; -if(op_ == BPROP){ - result += R"( - __constant__ int* pd[TN] = delta_a + ryc; - )" + c_ty_ + R"(* shift_pc[TM, TN] = pc + (*pd)[newaxis, :]; - @checkc *shift_pc = c; - )"; -} -else{ - result += R"( - @checkc *pc = c;)"; -} - result += R"( -})"; - - os << result; -} - - -// small search space for partial auto-tuning -std::vector shift::search_space() const { - return dot_search_space(AT_, BT_); -} - -// simple parameter heuristics -params_t shift::heuristics() const { - return dot_heuristics(AT_, BT_, M_, N_, K_); -} - - -} -} diff --git a/lib/runtime/jit.cpp b/lib/runtime/jit.cpp deleted file mode 100644 index ae9e1c783..000000000 --- a/lib/runtime/jit.cpp +++ /dev/null @@ -1,284 +0,0 @@ -#include -#include "triton/lang/lang.h" -#include "triton/codegen/selection/target.h" -#include "triton/ir/context.h" -#include "triton/ir/context_impl.h" -#include "triton/driver/device.h" -#include "triton/driver/error.h" -#include "triton/runtime/jit.h" -#include "llvm/IR/IRPrintingPasses.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/PassManager.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/TargetSelect.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetOptions.h" -#include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/Transforms/Scalar/EarlyCSE.h" -#include "llvm/Analysis/LoopPass.h" -#include "triton/tools/thread_pool.h" -#include - -typedef struct yy_buffer_state * YY_BUFFER_STATE; -extern int yyparse(); -extern YY_BUFFER_STATE yy_scan_string(const char * str); -extern void yy_delete_buffer(YY_BUFFER_STATE buffer); -extern triton::lang::translation_unit *ast_root; - -namespace triton { -namespace runtime{ - -void parallel_loop_nest(std::vector const & ranges, - std::function const &)> const & f, - size_t nthreads){ - size_t D = ranges.size(); - std::vector values(D, 0); - // thread pools -// ThreadPool pool(nthreads); - // Start with innermost loop - size_t i = D - 1; - while(true){ - // Execute function -// pool.enqueue(f,values); - f(values); - while(values[i]++ == ranges[i] - 1){ - if(i == 0) - return; - values[i--] = 0; - } - i = D - 1; - // Short sleep so that the thread pool doesn't grow too big - std::this_thread::sleep_for(std::chrono::microseconds(1)); - } -} - -template -void parallel_loop_nest(std::vector> const & iterates, std::function)> const & f, size_t nthreads){ - //Ranges to iterate over - std::vector ranges; - for(auto const & x: iterates) - ranges.push_back(x.size()); - //Proxy function - auto proxy = [&](std::vector const & idx){ - std::vector x(iterates.size()); - for(size_t i = 0; i < x.size(); ++i) - x[i] = iterates[i][idx[i]]; - f(x); - }; - //Iterate - parallel_loop_nest(ranges, proxy, nthreads); -} - -void 
parallel_for_each(std::vector> const & iterates, std::function)> const & f, size_t nthreads) { - ThreadPool pool(nthreads); - for(const std::vector& values: iterates) - pool.enqueue(f, values); -} - - -std::unique_ptr jit::make_llvm_module(ir::module &module, passes_wrapper &passes, llvm::LLVMContext& llvm_context, launch_information& info) { - llvm::Module* result = new llvm::Module(module.get_name(), llvm_context); - passes.selection.run(module, *result); - // add globals - for(auto x: module.globals()) - info.globals[x.first] = ((ir::metaparameter*)x.second)->get_value(); - // number of threads - info.num_threads = passes.tune.get_num_threads(); - return std::unique_ptr(result); -} - -triton::lang::translation_unit *jit::parse_program(const char *name, const char *src) { - // create AST from Triton-C source - YY_BUFFER_STATE buffer = yy_scan_string(src); - yyparse(); - yy_delete_buffer(buffer); - triton::lang::translation_unit *program = ast_root; - return program; -} - -std::unique_ptr jit::make_triton_module(const char * name, triton::ir::context &context, triton::lang::translation_unit *program) { - // create Triton-IR from AST - ir::module* module = new ir::module(name, context); - program->codegen(module); - return std::unique_ptr(module); -} - - -jit::jit(driver::context *context, unsigned nthreads): driver_context_(context), - target_(context->device()->make_target()), - nthreads_(nthreads) { } - -jit::~jit(){ } - -std::vector jit::get_valid(const char *name, const char *src) { - // find metaparameters - triton::lang::translation_unit* program = parse_program(name, src); - auto ptt_module = make_triton_module(name, triton_context_, program); - ir::module &tt_module = *ptt_module; - // set parameters - passes_wrapper passes(target_.get()); - passes.target_independent(tt_module); - passes.tune.run(tt_module); - auto mps = passes.tune.get_params(tt_module); - // create parameter ranges - std::vector> ranges; - for(ir::metaparameter *mp: mps) - ranges.push_back(mp->get_space()); - // iterate over parameters - std::vector result; - parallel_loop_nest(ranges, [&](const std::vector params){ - if(!result.empty()) - return; - std::map> errors; - unsigned i = 0; - for(ir::metaparameter *mp: mps) - mp->set_value(params[i++]); - passes.tune.init(tt_module); - passes.tune.check_constraints(errors); - if(!errors.empty()) - return; - result = params; - }, 1); - if(result.empty()) - throw std::runtime_error("couldn't find valid parameters"); - return result; -} - - - -jit::tune_res_t jit::autotune(const char *name, const char *src, benchmark_t benchmark, const std::vector> & targets) { - // find metaparameters - triton::lang::translation_unit* program = parse_program(name, src); - auto ptt_module_0 = make_triton_module(name, triton_context_, program); - ir::module &tt_module_0 = *ptt_module_0; - // set parameters - passes_wrapper passes_0(target_.get()); - passes_0.target_independent(tt_module_0); - passes_0.tune.run(tt_module_0); - auto mps = passes_0.tune.get_params(tt_module_0); - // iterate over parameters - tune_res_t best; - // update_best - std::mutex mutex; - auto update_best = [&](const std::vector params){ - std::map> errors; - unsigned i = 0; - { - std::lock_guard lock(mutex); - for(ir::metaparameter *mp: mps) - mp->set_value(params[i++]); -// for(size_t i = 0; i < params.size(); i++) -// std::cout << ((i==0)?"":", ") << params[i] << std::flush; -// std::cout << std::endl; - passes_0.tune.init(tt_module_0); - passes_0.tune.check_constraints(errors); -// for(auto x: errors) -// 
for(auto e: x.second){ -// std::cout << x.first->get_name() << ": " << e << std::endl; -// } - } - if(!errors.empty()) - return; - // Deep copy of the module and tuner - triton::ir::context triton_context; - auto ptt_module_1 = make_triton_module(name, triton_context, program); - ir::module &tt_module_1 = *ptt_module_1; - // run passes - passes_wrapper passes_1(target_.get()); - passes_1.target_independent(tt_module_1); - passes_1.tune.run(tt_module_1); - i = 0; - for(ir::metaparameter* mp: passes_1.tune.get_params(tt_module_1)){ - mp->set_value(params[i++]); - } - passes_1.tune.init(tt_module_1); - passes_1.target_dependent(tt_module_1); - driver::device* device = driver_context_->device(); - if(passes_1.shmem_allocation.get_allocated_size() > device->max_shared_memory()) - return; - if(passes_1.tune.get_num_threads() > device->max_threads_per_block()) - return; - // Compile - launch_information info; - llvm::LLVMContext llvm_context; - auto ll_module = make_llvm_module(tt_module_1, passes_1, llvm_context, info); - std::unique_ptr module(driver::module::create(driver_context_, &*ll_module)); - double perf; - { - std::lock_guard lock(mutex); - std::unique_ptr kernel(driver::kernel::create(module.get(), name)); - perf = benchmark(kernel.get(), info); - if(perf > best.perf){ - best.perf = perf; - best.params = params; - } - for(size_t i = 0; i < params.size(); i++) - std::cout << ((i==0)?"":", ") << params[i] << std::flush; - std::cout << ", " << perf << " [ " << best.perf << " ] " << std::endl; - } - }; - - - if(targets.empty()) { - // create parameter ranges - std::vector> ranges; - for(ir::metaparameter *mp: mps) - ranges.push_back(mp->get_space()); - parallel_loop_nest(ranges, update_best, nthreads_); - } - else { - parallel_for_each(targets, update_best, nthreads_); - } - - if(best.params.empty()) - throw std::runtime_error("auto-tuning didn't find valid parameters"); -// std::cout << "Autotuning done - Best performance: " << best.perf << std::endl; - return best; -} - -void jit::add_module(ir::module &tt_module, const std::vector ¶ms) { - // set parameters - passes_wrapper passes(target_.get()); - passes.target_independent(tt_module); - passes.tune.run(tt_module); - unsigned i = 0; - for(ir::metaparameter* mp: passes.tune.get_params(tt_module)) - mp->set_value(params[i++]); - passes.tune.init(tt_module); - passes.target_dependent(tt_module); - // check constraints - std::map> errors; - passes.tune.check_constraints(errors); - for(auto x: errors){ - for(auto str: x.second) - std::cout << x.first->get_name() << ": " << str << std::endl; - } - if(errors.size()) - throw std::runtime_error("invalid parameters"); - // triton module -> llvm module - std::string name = tt_module.get_name(); - auto ll_module = make_llvm_module(tt_module, passes, llvm_context_, launch_info_map_[name]); - // llvm module -> machine code - modules_.insert({name, driver::module::create(driver_context_, &*ll_module)}); -} - -void jit::add_module(const char *name, const char *src, const std::vector ¶ms) { - triton::lang::translation_unit* program = parse_program(name, src); - auto ptt_module = make_triton_module(name, triton_context_, program); - add_module(*ptt_module, params); -} - -driver::kernel *jit::get_function(const char *name) { - return driver::kernel::create(modules_.at(name), name); -} - -launch_information jit::get_launch_info(const char *name) { - return launch_info_map_.at(name); -} - - -} -} From 81571246cf35b9d030e24d695f2ef4d9dfa5cfaf Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 18 Aug 
2019 14:08:57 -0700 Subject: [PATCH 303/494] [general] fixed some warnings --- CMakeLists.txt | 11 +- examples/cpp/CMakeLists.txt | 2 +- examples/cpp/conv.cpp | 58 ------- examples/cpp/dot.cpp | 1 - examples/cpp/shift.cpp | 150 ------------------- examples/cpp/shift.ptx | 93 ------------ include/triton/codegen/selection/selection.h | 4 +- include/triton/ir/function.h | 4 +- include/triton/lang/expression.h | 2 +- include/triton/runtime/function.h | 2 +- lib/codegen/analysis/alignment.cpp | 2 +- lib/codegen/analysis/shmem/info.cpp | 12 +- lib/codegen/analysis/tune.cpp | 34 ++--- lib/codegen/selection/selection.cpp | 21 ++- lib/codegen/transform/peephole.cpp | 3 + lib/ir/constant.cpp | 3 +- lib/ir/instructions.cpp | 14 +- lib/ir/module.cpp | 9 +- lib/lang/node.cpp | 4 +- lib/runtime/function.cpp | 4 +- python/examples/dot.py | 3 +- python/triton/ops.py | 4 +- 22 files changed, 75 insertions(+), 365 deletions(-) delete mode 100644 examples/cpp/conv.cpp delete mode 100644 examples/cpp/shift.cpp delete mode 100644 examples/cpp/shift.ptx diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c7a1c0ab..694cc5578 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,7 +25,7 @@ include_directories(${LLVM_INCLUDE_DIRS}) add_definitions(${LLVM_DEFINITIONS}) #llvm_map_components_to_libnames(llvm_libs all) -#Default build type +# Default build type if(NOT CMAKE_BUILD_TYPE) message(STATUS "Default build type: Release") set(CMAKE_BUILD_TYPE "Release") @@ -63,7 +63,14 @@ endif() # Triton file(GLOB_RECURSE LIBTRITON_SRC lib/*.cpp) -add_library(triton SHARED ${LIBTRITON_SRC} ${PYTHON_SRC} ${BISON_Parser_OUTPUTS} ${FLEX_Lexer_OUTPUTS}) +add_library(triton SHARED ${LIBTRITON_SRC} ${EIGHTCC_SRC} ${PYTHON_SRC} ${BISON_Parser_OUTPUTS} ${FLEX_Lexer_OUTPUTS}) target_link_libraries(triton LLVM) +# Warning level +if(MSVC) + target_compile_options(triton PRIVATE /W4) +else() + target_compile_options(triton PRIVATE -Wno-unused-parameter -Wall -Wextra -pedantic) +endif() + diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index 3366ba591..f5f6a40b8 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -1,4 +1,4 @@ -foreach(PROG dot conv shift) +foreach(PROG dot) add_executable(${PROG} ${PROG}.cpp) set_target_properties(${PROG} PROPERTIES OUTPUT_NAME ${PROG}) include_directories(/usr/local/cuda/include/) diff --git a/examples/cpp/conv.cpp b/examples/cpp/conv.cpp deleted file mode 100644 index dbe0591f0..000000000 --- a/examples/cpp/conv.cpp +++ /dev/null @@ -1,58 +0,0 @@ -#include -#include -#include -#include "triton/runtime/jit.h" -#include "triton/driver/backend.h" -#include "triton/driver/stream.h" -#include "triton/dnn/conv.h" -#include "triton/tools/bench.hpp" - -int main() { - // initialize default compute device - auto context = triton::driver::backend::contexts::get_default(); - triton::dnn::conv::type ty = triton::dnn::conv::FPROP; - // initialization - int32_t B = 16, NF = 128; - int32_t D = 1, H = 16, W = 16; - int32_t NC = 64, T = 1, R = 3, S = 3; - int32_t pad_d = 0, pad_h = 0, pad_w = 0; - int32_t stride_d = 1, stride_h = 1, stride_w = 1; - int32_t upsample_d = 1, upsample_h = 1, upsample_w = 1; -// triton::dnn::conv configuration(128, 256, 1, 14, 14, 1, 5, 5, 512, 1, 1, 1, 0, 0, 0, 1, 1, 1, "float", "float", triton::dnn::conv::FPROP, 0); - triton::dnn::conv configuration(B, NC, D, H, W, T, R, S, NF, - stride_d, stride_h, stride_w, - pad_d, pad_h, pad_w, - upsample_d, upsample_h, upsample_w, - "float", "float", ty, 0); - // convolution configuration - 
std::vector hc(configuration.c_size()); - std::vector rc(configuration.c_size()); - std::vector ha(configuration.a_size()); - std::vector hb(configuration.b_size()); - srand(0); - for(size_t i = 0; i < ha.size(); i++) - ha[i] = (float)rand()/RAND_MAX; - for(size_t i = 0; i < hb.size(); i++) - hb[i] = (float)rand()/RAND_MAX; - for(size_t i = 0; i < hc.size(); i++) - hc[i] = 0; - rc = hc; - triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*4); - triton::driver::buffer* da = triton::driver::buffer::create(context, ha.size()*4); - triton::driver::buffer* db = triton::driver::buffer::create(context, hb.size()*4); - triton::driver::stream* stream = triton::driver::stream::create(context); - stream->write(da, true, 0, ha); - stream->write(db, true, 0, hb); - stream->write(dc, true, 0, hc); - stream->synchronize(); - configuration.enqueue(stream, {da, db, dc, nullptr}); - stream->read(dc, true, 0, hc); - configuration.cpu_ref(rc.data(), ha.data(), hb.data()); - for(size_t i = 0; i < hc.size(); i++){ - if(std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ - std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; - exit(EXIT_FAILURE); - } - } - std::cout << "Pass!" << std::endl; -} diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index e592da570..102380036 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -3,7 +3,6 @@ #include #include "triton/driver/backend.h" #include "triton/driver/stream.h" -#include "triton/dnn/dot.h" #include "triton/tools/bench.hpp" #include "triton/external/half.hpp" #include "triton/runtime/function.h" diff --git a/examples/cpp/shift.cpp b/examples/cpp/shift.cpp deleted file mode 100644 index 1495de3c4..000000000 --- a/examples/cpp/shift.cpp +++ /dev/null @@ -1,150 +0,0 @@ -#include -#include -#include -#include "cuda.h" -#include "triton/runtime/jit.h" -#include "triton/driver/backend.h" -#include "triton/driver/stream.h" -#include "triton/tools/bench.hpp" -#include "triton/dnn/shift.h" -#include "triton/external/half.hpp" - -struct perf_t { - double triton; - double cublas; -}; - -perf_t do_bench(triton::driver::stream *stream, - int32_t R, int32_t S, int32_t B, int32_t F, int32_t H, int32_t W, int32_t C, - triton::dnn::op_t op, triton::dnn::layout_t layout, - std::string numeric_t) { - typedef float NumericT; - - // driver variables - triton::driver::context* context = stream->context(); - - // random shifts - std::vector shift_h(C); - std::vector shift_w(C); - for(int32_t c = 0; c < C; c++){ - shift_h[c] = rand() % R - R / 2; - shift_w[c] = rand() % S - S / 2; - } - // configuration - triton::dnn::shift shift(B, C, 1, H, W, 1, R, S, F, 1, 1, - shift_h.data(), shift_w.data(), - numeric_t, numeric_t, - op, false, layout); - // host buffers - size_t a_size = B*C*H*W; - size_t b_size = C*F; - size_t c_size = B*F*H*W; - if(op == triton::dnn::BPROP) - std::swap(a_size, c_size); - if(op == triton::dnn::WGRAD){ - std::swap(b_size, c_size); - std::swap(a_size, b_size); - } - std::vector ha(a_size); - std::vector hb(b_size); - std::vector hc(c_size); - std::vector rc(hc.size()); - // device buffers - triton::driver::buffer* dc = triton::driver::buffer::create(context, hc.size()*4); - triton::driver::buffer* da = triton::driver::buffer::create(context, ha.size()*sizeof(NumericT)); - triton::driver::buffer* db = triton::driver::buffer::create(context, hb.size()*sizeof(NumericT)); - // initialize host - srand(0); - for(size_t i = 0; i < ha.size(); i++) - ha[i] = (NumericT)rand() / RAND_MAX; - 
for(size_t i = 0; i < hb.size(); i++) - hb[i] = (NumericT)rand() / RAND_MAX; - for(size_t i = 0; i < hc.size(); i++) - hc[i] = 0; - // initialize device - stream->write(da, true, 0, ha); - stream->write(db, true, 0, hb); - stream->write(dc, true, 0, hc); - stream->synchronize(); - // benchmark triton - double triton_ns = triton::tools::bench([&]() { shift.enqueue(stream, {da, db, dc}, triton::dnn::NO_TUNING);}, stream); - // benchmark cublas -// NumericT alpha = 1; -// NumericT beta = 0; -// cublasGemmAlgo_t fastest; -// cublasGemm(HALF_TYPE, stream, shift.AT(), shift.BT(), shift.M(), shift.N(), shift.K(), -// &alpha, da, shift.lda(), -// db, shift.ldb(), &beta, -// dc, shift.ldc(), &fastest); -// double cublas_ns = triton::tools::bench([&]() { cublasGemm(HALF_TYPE, stream, shift.AT(), shift.BT(), shift.M(), shift.N(), shift.K(), -// &alpha, da, shift.lda(), -// db, shift.ldb(), -// &beta, dc, shift.ldc(), nullptr, fastest); }, stream); - // result - auto tflops = [&](double nanosec) { return shift.num_flops() / nanosec * 1e-3; }; - perf_t result; -// result.cublas = tflops(cublas_ns); - result.triton = tflops(triton_ns); - delete da; - delete db; - delete dc; - return result; -} - -int main() { - using triton::dnn::op_t; - using triton::dnn::layout_t; - - struct config_t{ - int32_t B; - int32_t C; - int32_t H; - int32_t W; - int32_t R; - int32_t S; - int32_t F; - int32_t stride_h; - int32_t stride_w; - op_t op; - layout_t layout; - std::string ty; - - std::string repr() { - std::ostringstream oss; - oss << B << ", " << C << ", " << H << ", " << W << ", " << R << ", " << S << ", " << F << ", " << op << ", " << layout << ", " << ty; - return oss.str(); - } - - perf_t perf(triton::driver::stream *stream){ - return do_bench(stream, R, S, B, F, H, W, C, op, layout, ty); - } - }; - // shapes to benchmark - std::vector configs; - std::vector resnet18 = - { - {128, 128, 32, 32, 3, 3, 128, 1, 1}, - {128, 128, 32, 32, 3, 3, 128, 1, 1}, - {128, 128, 32, 32, 3, 3, 256, 2, 2}, - {128, 256, 16, 16, 3, 3, 256, 1, 1}, - {128, 256, 16, 16, 3, 3, 512, 2, 2}, - {128, 512, 8, 8, 3, 3, 512, 1, 1}, - {128, 512, 8, 8, 3, 3, 1024, 1, 1}, - {128, 1024, 8, 8, 3, 3, 1024, 1, 1} - }; - for(config_t c: resnet18){ - for(op_t op: {op_t::FPROP, op_t::BPROP, op_t::WGRAD}){ - configs.push_back({c.B, c.C, c.H, c.W, c.R, c.S, c.F, c.stride_h, c.stride_w, op, layout_t::CHWN, "half"}); - } - } - - // initialize default compute device - auto context = triton::driver::backend::contexts::get_default(); - triton::driver::stream *stream = triton::driver::stream::create(context); - - for(config_t c: configs){ - std::string repr = c.repr(); - perf_t perf = c.perf(stream); - std::cout << "// " << repr << ", " << perf.triton << ", " << perf.cublas << std::endl; - } -} diff --git a/examples/cpp/shift.ptx b/examples/cpp/shift.ptx deleted file mode 100644 index 62a841909..000000000 --- a/examples/cpp/shift.ptx +++ /dev/null @@ -1,93 +0,0 @@ -// -// Generated by NVIDIA NVVM Compiler -// -// Compiler Build ID: CL-24817639 -// Cuda compilation tools, release 10.0, V10.0.130 -// Based on LLVM 3.4svn -// - -.version 6.3 -.target sm_60 -.address_size 64 - - // .globl _Z25shift_cuda_forward_kernelPKfPKiPfiiii - -.visible .entry shift( - .param .u64 _Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_0, - .param .u64 _Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_1, - .param .u64 _Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_2, - .param .u32 _Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_3, - .param .u32 
_Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_4, - .param .u32 _Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_5, - .param .u32 _Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_6 -) -{ - .reg .pred %p<10>; - .reg .f32 %f<2>; - .reg .b32 %r<31>; - .reg .b64 %rd<13>; - - - ld.param.u64 %rd1, [_Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_0]; - ld.param.u64 %rd3, [_Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_1]; - ld.param.u64 %rd2, [_Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_2]; - ld.param.u32 %r3, [_Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_3]; - ld.param.u32 %r4, [_Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_4]; - ld.param.u32 %r5, [_Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_5]; - ld.param.u32 %r6, [_Z25shift_cuda_forward_kernelPKfPKiPfiiii_param_6]; - cvta.to.global.u64 %rd4, %rd3; - mov.u32 %r7, %ntid.x; - mov.u32 %r8, %ctaid.x; - mov.u32 %r9, %tid.x; - mad.lo.s32 %r1, %r7, %r8, %r9; - mul.lo.s32 %r10, %r4, %r3; - mul.lo.s32 %r11, %r10, %r5; - mul.lo.s32 %r12, %r11, %r6; - mul.lo.s32 %r13, %r5, %r4; - mul.lo.s32 %r14, %r13, %r6; - rem.s32 %r15, %r1, %r14; - sub.s32 %r16, %r1, %r15; - mul.lo.s32 %r17, %r6, %r5; - div.s32 %r18, %r15, %r17; - mul.lo.s32 %r19, %r18, %r17; - sub.s32 %r20, %r15, %r19; - div.s32 %r21, %r20, %r5; - mul.lo.s32 %r22, %r21, %r6; - sub.s32 %r23, %r20, %r22; - shl.b32 %r24, %r18, 1; - mul.wide.s32 %rd5, %r24, 4; - add.s64 %rd6, %rd4, %rd5; - ld.global.nc.u32 %r25, [%rd6]; - add.s32 %r26, %r25, %r21; - ld.global.nc.u32 %r27, [%rd6+4]; - add.s32 %r28, %r23, %r27; - add.s32 %r29, %r16, %r19; - mad.lo.s32 %r30, %r26, %r5, %r29; - add.s32 %r2, %r30, %r28; - setp.lt.s32 %p1, %r1, %r12; - setp.gt.s32 %p2, %r26, -1; - and.pred %p3, %p1, %p2; - setp.lt.s32 %p4, %r26, %r5; - and.pred %p5, %p3, %p4; - setp.gt.s32 %p6, %r28, -1; - and.pred %p7, %p5, %p6; - setp.lt.s32 %p8, %r28, %r6; - and.pred %p9, %p7, %p8; - @!%p9 bra BB0_2; - bra.uni BB0_1; - -BB0_1: - cvta.to.global.u64 %rd7, %rd1; - mul.wide.s32 %rd8, %r1, 4; - add.s64 %rd9, %rd7, %rd8; - ld.global.nc.f32 %f1, [%rd9]; - cvta.to.global.u64 %rd10, %rd2; - mul.wide.s32 %rd11, %r2, 4; - add.s64 %rd12, %rd10, %rd11; - st.global.f32 [%rd12], %f1; - -BB0_2: - ret; -} - - diff --git a/include/triton/codegen/selection/selection.h b/include/triton/codegen/selection/selection.h index 433633cff..3f118d47a 100644 --- a/include/triton/codegen/selection/selection.h +++ b/include/triton/codegen/selection/selection.h @@ -100,8 +100,8 @@ public: private: Value *ptr_; bool return_vector_; - Value *offset_; Builder &builder_; + Value *offset_; std::map ptr_cache_; unsigned vector_size_; }; @@ -206,9 +206,9 @@ private: tmap_t tmap_; analysis::shmem::allocation *alloc_; analysis::tune *params_; - target *tgt_; analysis::shmem::info *buffer_info_; analysis::alignment_info *alignment_; + target *tgt_; std::map axes_; Value *sh_mem_ptr_; Value *offset_a_i_, *offset_a_k_; diff --git a/include/triton/ir/function.h b/include/triton/ir/function.h index bde5218b2..9cfc89931 100644 --- a/include/triton/ir/function.h +++ b/include/triton/ir/function.h @@ -47,11 +47,11 @@ public: return std::make_pair(kind_, value_) < std::make_pair(other.kind_, other.value_); } - const attribute_kind_t get_kind() const { + attribute_kind_t get_kind() const { return kind_; } - const unsigned get_value() const { + unsigned get_value() const { return value_; } diff --git a/include/triton/lang/expression.h b/include/triton/lang/expression.h index 7724fdd61..a3574f15d 100644 --- a/include/triton/lang/expression.h +++ b/include/triton/lang/expression.h 
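The field swaps in this commit (selection.h above, expression.h here, function.h below) all silence the same -Wreorder warning: C++ initializes non-static data members in declaration order, not in the order they appear in a constructor's initializer list. For reference, a minimal sketch of the hazard with invented names (this is not code from the tree):

    #include <memory>

    struct caller {
        std::shared_ptr<int> parent_;  // declared first, so initialized first
        int *bin_;
        // -Wreorder fires here: bin_ is written first in the list, but
        // parent_ is actually initialized first because declaration order
        // wins; a member whose initializer read another member this way
        // could observe an indeterminate value.
        caller(std::shared_ptr<int> p) : bin_(p.get()), parent_(p) {}
    };

    int main() { caller c(std::make_shared<int>(42)); (void)c; }

Swapping the declarations so they match the initializer list, as these hunks do, makes the textual order and the actual initialization order agree.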
@@ -344,8 +344,8 @@ public: const expression *rvalue() const { return rvalue_; } public: - ASSIGN_OP_T op_; const expression *lvalue_; + ASSIGN_OP_T op_; const expression *rvalue_; }; diff --git a/include/triton/runtime/function.h b/include/triton/runtime/function.h index 2cbd65fd4..af849448b 100644 --- a/include/triton/runtime/function.h +++ b/include/triton/runtime/function.h @@ -76,8 +76,8 @@ private: void operator()(driver::stream *stream, const std::array& grid, const std::vector& args) const; private: - std::shared_ptr parent_; std::shared_ptr bin_; + std::shared_ptr parent_; std::vector param_tys_; size_t n_threads_; }; diff --git a/lib/codegen/analysis/alignment.cpp b/lib/codegen/analysis/alignment.cpp index a602c87ca..6383ed850 100644 --- a/lib/codegen/analysis/alignment.cpp +++ b/lib/codegen/analysis/alignment.cpp @@ -227,7 +227,7 @@ unsigned alignment_info::populate_starting_multiple(ir::value *v){ if(auto *x = dynamic_cast(v)){ return cache(x->get_first()->get_value()); } - if(auto *x = dynamic_cast(v)){ + if(dynamic_cast(v)){ return cache(128); } if(auto *x = dynamic_cast(v)){ diff --git a/lib/codegen/analysis/shmem/info.cpp b/lib/codegen/analysis/shmem/info.cpp index b674560bf..8f0dac32c 100644 --- a/lib/codegen/analysis/shmem/info.cpp +++ b/lib/codegen/analysis/shmem/info.cpp @@ -19,7 +19,7 @@ bool info::is_loop_latch(ir::phi_node *phi, ir::instruction *terminator){ if(auto *br = dynamic_cast(terminator)) return br->get_true_dest() == phi->get_parent() || br->get_false_dest() == phi->get_parent(); - else if(auto *br = dynamic_cast(terminator)) + else if(dynamic_cast(terminator)) return false; else throw std::runtime_error("unreachable"); @@ -36,15 +36,15 @@ void info::replace(ir::value* before, ir::value *after) { } inline bool get_is_shared(ir::value* v) { - if(auto x = dynamic_cast(v)) + if(dynamic_cast(v)) return true; - if(auto x = dynamic_cast(v)) + if(dynamic_cast(v)) return true; - if(auto x = dynamic_cast(v)) + if(dynamic_cast(v)) return true; - if(auto x = dynamic_cast(v)) + if(dynamic_cast(v)) return true; - if(auto x = dynamic_cast(v)){ + if(auto *x = dynamic_cast(v)){ bool res = true; for(unsigned inc = 0; inc < x->get_num_incoming(); inc++) res = res && get_is_shared(x->get_incoming_value(inc)); diff --git a/lib/codegen/analysis/tune.cpp b/lib/codegen/analysis/tune.cpp index ec67ef254..c43a7126b 100644 --- a/lib/codegen/analysis/tune.cpp +++ b/lib/codegen/analysis/tune.cpp @@ -58,7 +58,7 @@ void tune::init_c_graph(ir::instruction *v) { shapes = store->get_pointer_operand()->get_type()->get_tile_shapes(); else if(auto *atom = dynamic_cast(v)) shapes = atom->get_operand(0)->get_type()->get_tile_shapes(); - else if(auto *downcast = dynamic_cast(v)) + else if(dynamic_cast(v)) return; else if(auto *reduce = dynamic_cast(v)) { unsigned axis = reduce->get_axis(); @@ -116,7 +116,7 @@ void tune::init_c_graph(ir::instruction *v) { } } // Matrix multiplication - else if(auto *x = dynamic_cast(v)){ + else if(dynamic_cast(v)){ ir::value *A = v->get_operand(0); ir::value *B = v->get_operand(1); ir::value *D = v->get_operand(2); @@ -166,7 +166,7 @@ void tune::connected_components(node_t x, const std::vector if(nodes.find(x) != nodes.end()){ nodes.erase(x); std::string suffix = ".d" + std::to_string(x.second); - for(int i = 0; i < mps.size(); i++) + for(unsigned i = 0; i < mps.size(); i++) params_[x.first].insert({prefixes[i] + suffix, mps[i]}); ir::type *ty = x.first->get_type(); if(ty->is_tile_ty()){ @@ -254,24 +254,24 @@ void tune::init(ir::module &mod) { 
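The hunk below converts the HMMA grid heuristic to unsigned arithmetic. For reference, a self-contained sketch of the doubling-and-clamp idea it implements, shown 2-D only with made-up shapes and warp count; the fpw[0]/wpt[0] updates elided by the hunk context are assumed symmetric to the visible fpw[1]/wpt[1] ones:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Grow {1, 1} toward a square, doubling one dimension at a time and
    // clamping to what the tile shape allows, until the budget is met.
    int main() {
        auto clamp = [](unsigned x, unsigned lo, unsigned hi) {
            return std::min(std::max(x, lo), hi);
        };
        unsigned shape_0 = 64, shape_1 = 64, num_warps = 4;
        // fragments per warp: budget of min((shape_0/8)*(shape_1/8), 4)
        std::vector<unsigned> fpw = {1, 1}, fpw_nm1;
        unsigned num_fragments = std::min((shape_0/8)*(shape_1/8), 4u);
        do {
            fpw_nm1 = fpw;
            if (fpw[0]*fpw[1] < num_fragments) fpw[0] = clamp(fpw[0]*2, 1, shape_0/8);
            if (fpw[0]*fpw[1] < num_fragments) fpw[1] = clamp(fpw[1]*2, 1, shape_1/8);
        } while (fpw_nm1 != fpw);
        // warps per tile: same doubling, clamped by what fpw already covers
        std::vector<unsigned> wpt = {1, 1}, wpt_nm1;
        do {
            wpt_nm1 = wpt;
            if (wpt[0]*wpt[1] < num_warps) wpt[0] = clamp(wpt[0]*2, 1, shape_0/(fpw[0]*8));
            if (wpt[0]*wpt[1] < num_warps) wpt[1] = clamp(wpt[1]*2, 1, shape_1/(fpw[1]*8));
        } while (wpt_nm1 != wpt);
        std::printf("fpw = {%u, %u}, wpt = {%u, %u}\n", fpw[0], fpw[1], wpt[0], wpt[1]);
        // prints fpw = {2, 2}, wpt = {2, 2} for these inputs: each warp
        // owns a 2x2 grid of 8x8 HMMA fragments, and warps are laid out 2x2
    }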
create_grids(grids_, references, fn); } - int num_threads = get_num_threads(); - auto clamp = [&](int x, int lo, int hi) { return std::min(std::max(x, lo), hi); }; + unsigned num_threads = get_num_threads(); + auto clamp = [&](unsigned x, unsigned lo, unsigned hi) { return std::min(std::max(x, lo), hi); }; for(ir::value *i: grids_){ if(!i->get_type()->is_tile_ty()) continue; auto shapes = i->get_type()->get_tile_shapes(); - int shape_0 = shapes[0]->get_value(); - int shape_1 = shapes[1]->get_value(); - int size = i->get_type()->get_tile_num_elements(); + unsigned shape_0 = shapes[0]->get_value(); + unsigned shape_1 = shapes[1]->get_value(); + unsigned size = i->get_type()->get_tile_num_elements(); /* HMMA parameters*/ if(fragments_.at({i, 0}) == HMMA_FRAGMENT_C){ /* fragments per warp */ // try to make things as square as possible to maximize data re-use - std::vector fpw = {1, 1, 1}; - std::vector fpw_nm1; - int num_fragments = std::min((shape_0/8)*(shape_1/8), 4); + std::vector fpw = {1, 1, 1}; + std::vector fpw_nm1; + unsigned num_fragments = std::min((shape_0/8)*(shape_1/8), 4); do { fpw_nm1 = fpw; if(fpw[0]*fpw[1] < num_fragments) @@ -280,13 +280,13 @@ void tune::init(ir::module &mod) { fpw[1] = clamp(fpw[1]*2, 1, shape_1 / 8); }while(fpw_nm1 != fpw); // store parameters - for(int d = 0; d < shapes.size(); d++) + for(unsigned d = 0; d < shapes.size(); d++) params_.at(i).at("fpw.d" + std::to_string(d))->set_value(fpw[d]); /* warps per tile */ // try to make things as square as possible to maximize data re-use - std::vector wpt = {1, 1, 1}; - std::vector wpt_nm1; + std::vector wpt = {1, 1, 1}; + std::vector wpt_nm1; do{ wpt_nm1 = wpt; if(wpt[0] * wpt[1] * wpt[2] < num_warps_) @@ -295,7 +295,7 @@ void tune::init(ir::module &mod) { wpt[1] = clamp(wpt[1]*2, 1, shape_1 / (fpw[1]*8)); }while(wpt_nm1 != wpt); // store parameters - for(int d = 0; d < shapes.size(); d++) + for(unsigned d = 0; d < shapes.size(); d++) params_.at(i).at("wpt.d" + std::to_string(d))->set_value(wpt[d]); /* sanity check */ @@ -309,8 +309,8 @@ void tune::init(ir::module &mod) { /* Scan-line */ else{ - int shape = shapes[0]->get_value(); - int current = num_threads; + unsigned shape = shapes[0]->get_value(); + unsigned current = num_threads; params_.at(i).at("nts.d0")->set_value(clamp(size / num_threads, 1, 8)); params_.at(i).at("mts.d0")->set_value(clamp(current, 1, shape / params_.at(i).at("nts.d0")->get_value())); current = current / params_.at(i).at("mts.d0")->get_value(); diff --git a/lib/codegen/selection/selection.cpp b/lib/codegen/selection/selection.cpp index 166b423bb..4b31dce52 100644 --- a/lib/codegen/selection/selection.cpp +++ b/lib/codegen/selection/selection.cpp @@ -226,6 +226,7 @@ llvm::Instruction::BinaryOps llvm_op(ir::binary_op_t op) { case ttop::Or: return llop::Or; case ttop::Xor: return llop::Xor; } + throw std::runtime_error("unknown operator"); } llvm::Instruction::CastOps llvm_op(ir::cast_op_t op) { @@ -246,6 +247,7 @@ llvm::Instruction::CastOps llvm_op(ir::cast_op_t op) { case ttop::BitCast: return llop::BitCast; case ttop::AddrSpaceCast: return llop::AddrSpaceCast; } + throw std::runtime_error("unknown operator"); } llvm::CmpInst::Predicate llvm_pred(ir::cmp_pred_t pred) { @@ -283,6 +285,7 @@ llvm::CmpInst::Predicate llvm_pred(ir::cmp_pred_t pred) { case ttop::ICMP_SLE: return llop::ICMP_SLE; case ttop::LAST_ICMP_PREDICATE: return llop::LAST_ICMP_PREDICATE; } + throw std::runtime_error("unknown operator"); } /* convert ir::type to Type */ @@ -468,7 +471,7 @@ Instruction 
*selection::llvm_inst(ir::instruction *inst, std::function(inst)){ Value *ptr = value(ii->get_operand(0)); Value *val = value(ii->get_operand(1)); - Value *atom_f_add; + Value *atom_f_add = nullptr; if(val->getType()->isFloatTy()) atom_f_add = Intrinsic::getDeclaration(builder.GetInsertBlock()->getModule(), Intrinsic::nvvm_atomic_load_add_f32, {ptr->getType()}); else if(val->getType()->isHalfTy()){ @@ -477,6 +480,8 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::functiongetPointerTo(), fp16}, false); atom_f_add = InlineAsm::get(atom_ty, " atom.relaxed.global.gpu.add.noftz.f16 $0, [$1], $2;", "=h,l,h", true); } + if(atom_f_add == nullptr) + throw std::runtime_error("unsupported atomic add"); Value *res = builder.CreateCall(atom_f_add, {ptr, val}); return (Instruction*)res; } @@ -607,7 +612,6 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id Value *_2 = builder.getInt32(2); Value *_3 = builder.getInt32(3); Value *_4 = builder.getInt32(4); - Value *_8 = builder.getInt32(8); Value *_16 = builder.getInt32(16); // fragments per warp @@ -1303,11 +1307,10 @@ void selection::lower_masked_load(ir::masked_load_inst *x, LLVMContext &ctx, Fun unsigned id = linear / vector_size; if(linear % vector_size == 0) { Value *ptr = pointers->get_value(idx); - ConstantInt *cst = nullptr; - if(GetElementPtrInst *gep = dyn_cast(ptr)) - if(gep->getNumIndices() == 1){ - cst = dyn_cast(gep->idx_begin()); - } +// ConstantInt *cst = nullptr; +// if(GetElementPtrInst *gep = dyn_cast(ptr)) +// if(gep->getNumIndices() == 1) +// cst = dyn_cast(gep->idx_begin()); ptr = builder.CreateBitCast(ptr, PointerType::get(VectorType::get(result->get_ty(), vector_size), ptr->getType()->getPointerAddressSpace())); @@ -1374,10 +1377,6 @@ void selection::lower_load(ir::load_inst *x, LLVMContext &ctx, Function *fn, IRB unsigned id = linear / vector_size; if(linear % vector_size == 0) { Value *ptr = pointers->get_value(idx); - ConstantInt *cst = nullptr; - if(GetElementPtrInst *gep = dyn_cast(ptr)) - if(gep->getNumIndices() == 1) - cst = dyn_cast(gep->idx_begin()); ptr = builder.CreateBitCast(ptr, PointerType::get(VectorType::get(result->get_ty(), vector_size), ptr->getType()->getPointerAddressSpace())); packets[id] = builder.CreateLoad(ptr); diff --git a/lib/codegen/transform/peephole.cpp b/lib/codegen/transform/peephole.cpp index d5d678628..73885c772 100644 --- a/lib/codegen/transform/peephole.cpp +++ b/lib/codegen/transform/peephole.cpp @@ -60,6 +60,7 @@ ir::value* rewrite_trans_phi_impl(ir::value *value, ir::builder &builder, trans->set_operand(0, i); return trans; } + return nullptr; } bool peephole::rewrite_trans_phi(ir::instruction* value, ir::builder& builder) { @@ -76,6 +77,8 @@ bool peephole::rewrite_trans_phi(ir::instruction* value, ir::builder& builder) { if(!phi) return false; ir::value* new_phi = rewrite_trans_phi_impl(phi, builder, trans->get_perm()); + if(!new_phi) + return false; trans->replace_all_uses_with(new_phi); return true; diff --git a/lib/ir/constant.cpp b/lib/ir/constant.cpp index 4b06af60e..5ace19a04 100644 --- a/lib/ir/constant.cpp +++ b/lib/ir/constant.cpp @@ -67,8 +67,7 @@ constant_range::constant_range(type *ty, constant_int *first, constant_int *last constant *constant_range::get(constant_int *first, constant_int *last) { assert(first->get_type()->is_integer_ty()); assert(first->get_type() == last->get_type()); - unsigned vfirst = ((constant_int*)first)->get_value(); - assert(vfirst == 0); + assert(((constant_int*)first)->get_value() == 0); type *ty = 
tile_type::get(first->get_type(), {last}); return new constant_range(ty, first, last); } diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index 85b6eee5c..bd06668e6 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -359,8 +359,11 @@ getelementptr_inst::getelementptr_inst(type *pointee_ty, value *ptr, const std:: : instruction(get_return_type(pointee_ty, ptr, idx), 1 + idx.size(), 1, name, next), source_elt_ty(pointee_ty), res_elt_ty(get_indexed_type(pointee_ty, idx)){ - type *expected_ty = ((pointer_type*)(get_type()->get_scalar_ty()))->get_element_ty(); + // sanity check + type *expected_ty = get_type()->get_scalar_ty(); + expected_ty = ((pointer_type*)expected_ty)->get_element_ty(); assert(res_elt_ty == expected_ty); + // set operands set_operand(0, ptr); for(size_t i = 0; i < idx.size(); i++) set_operand(1 + i, idx[i]); @@ -574,7 +577,7 @@ ir::type* trans_inst::get_res_ty(ir::type* ty, std::vector perm) // permutate argument shapes perm = init_perm(ty, perm); ir::tile_type::tile_shapes_t res_shapes = arg_shapes; - for(int i = 0; i < perm.size(); i++) + for(size_t i = 0; i < perm.size(); i++) res_shapes[i] = arg_shapes[perm[i]->get_value()]; // construct type return tile_type::get(ty->get_scalar_ty(), res_shapes); @@ -587,16 +590,17 @@ std::vector trans_inst::init_perm(ir::type* ty, const std::vector ir::type* int32_ty = type::get_int32_ty(ty->get_context()); std::vector result; result.push_back(ir::constant_int::get(int32_ty, size - 1)); - for(int i = 0; i < size - 1; i++) + for(size_t i = 0; i < size - 1; i++) result.push_back(ir::constant_int::get(int32_ty, i)); return result; } trans_inst::trans_inst(value *arg, const std::vector& perm, const std::string &name, instruction *next) : builtin_inst(get_res_ty(arg->get_type(), perm), 1, 1, name, next) { + // sanity check perm_ = init_perm(arg->get_type(), perm); - auto size = arg->get_type()->get_tile_shapes().size(); - assert(perm_.size() == size); + //auto size = arg->get_type()->get_tile_shapes().size(); + //assert(perm_.size() == size); set_operand(0, arg); } diff --git a/lib/ir/module.cpp b/lib/ir/module.cpp index 7adcbb14a..3d995558e 100644 --- a/lib/ir/module.cpp +++ b/lib/ir/module.cpp @@ -96,8 +96,7 @@ ir::value *module::get_value_recursive(const std::string& name, ir::basic_block bool is_const = const_.find(name) != const_.end(); auto &preds = block->get_predecessors(); ir::type *ty = get_scope().types.at(name); - if(block) - if(!is_const && sealed_blocks_.find(block) == sealed_blocks_.end()){ + if(block && !is_const && sealed_blocks_.find(block) == sealed_blocks_.end()){ incomplete_phis_[block][name] = make_phi(ty, 1, block); result = (ir::value*)incomplete_phis_[block][name]; } @@ -106,9 +105,9 @@ ir::value *module::get_value_recursive(const std::string& name, ir::basic_block result = get_value(name, has_pred?preds.front():nullptr); } else{ - result = make_phi(ty, 1, block); - set_value(name, block, result); - result = add_phi_operands(name, (ir::phi_node*&)result); + ir::phi_node* phi = make_phi(ty, 1, block); + set_value(name, block, phi); + result = add_phi_operands(name, phi); } if(auto *phi = dynamic_cast(result)) result = try_remove_trivial_phis(phi); diff --git a/lib/lang/node.cpp b/lib/lang/node.cpp index 29d61cdb8..dda7126bd 100644 --- a/lib/lang/node.cpp +++ b/lib/lang/node.cpp @@ -106,7 +106,7 @@ void node::implicit_broadcast(ir::module *mod, ir::value *&lhs, ir::value *&rhs) size_t res_size = std::max(lhs_size, rhs_size); ir::type::tile_shapes_t res_shapes(res_size); 
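// The loop below applies the usual right-aligned broadcasting rule:
// dimensions are compared from the right, and wherever one operand has
// extent `one` the other operand's extent is taken. For instance, with
// illustrative values not taken from this patch: lhs shapes [1, 8]
// against rhs shapes [4, 1] produce res_shapes = [4, 8].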
ir::type::tile_shapes_t::value_type one = ir::tile_type::make_one(mod->get_context()); - for(int i = 0; i < res_size; i++){ + for(size_t i = 0; i < res_size; i++){ if(i >= res_size - lhs_size && i >= res_size - rhs_size) res_shapes[i] = lhs_shapes[i]==one?rhs_shapes[i]:lhs_shapes[i]; else if(i >= res_size - lhs_size) @@ -147,7 +147,7 @@ void node::implicit_broadcast(ir::module *mod, ir::type *ty, ir::value *&src){ int src_dim = src_shapes.size(); // Pad int off = dst_dim - src_dim; - for(size_t i = 0; i < off; i++) + for(int i = 0; i < off; i++) src_shapes.insert(src_shapes.begin(), one); if(off > 0) src = builder.create_reshape(src, src_shapes); diff --git a/lib/runtime/function.cpp b/lib/runtime/function.cpp index 034738c93..d69049291 100644 --- a/lib/runtime/function.cpp +++ b/lib/runtime/function.cpp @@ -88,10 +88,10 @@ arg_type convert(ir::type *ty) { } function::caller::caller(ir::function *ir, std::shared_ptr parent, size_t n_threads) - : bin_(driver::kernel::create(&*parent, ir->get_name().c_str())), n_threads_(n_threads), parent_(parent) { + : bin_(driver::kernel::create(&*parent, ir->get_name().c_str())), parent_(parent), n_threads_(n_threads) { // extract signature ir::function_type* ty = ir->get_fn_type(); - for(int i = 0; i < ty->get_num_params(); i++) + for(size_t i = 0; i < ty->get_num_params(); i++) param_tys_.push_back(convert(ty->get_param_ty(i))); } diff --git a/python/examples/dot.py b/python/examples/dot.py index 75fe931bc..638d49c20 100644 --- a/python/examples/dot.py +++ b/python/examples/dot.py @@ -11,7 +11,8 @@ void matmul(restrict read_only align(16) half *A, restrict read_only align(16) half *B, restrict read_only align(16) half *C, int M, int N, int K, - multiple_of(8) int lda, multiple_of(8) int ldb, int ldc) { + multiple_of(8) int lda, multiple_of(8) int ldb, int ldc) +{ int ridx = get_program_id(0); int ridy = get_program_id(1); int rxa[TM] = ridx * TM + (0 ... 
TM); diff --git a/python/triton/ops.py b/python/triton/ops.py index ea782ad08..a10739903 100644 --- a/python/triton/ops.py +++ b/python/triton/ops.py @@ -17,8 +17,8 @@ import tensorflow as tf extra_ops = tf.load_op_library('/home/philippe/development/triton/python/build/lib.linux-x86_64-3.6/libextra_tf_ops.so') -def make_bindings(src, outputs, grids): - return libtriton.make_tensorflow_src(src, outputs, grids) +def make_bindings(src, out, grid): + return libtriton.make_tensorflow_src(src, out, grid) def make_cache_path(src): md5 = hashlib.sha1(src.encode()) From c787ebae684bf1fd7c66a155fb5eaae6cea6b133 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 18 Aug 2019 14:09:55 -0700 Subject: [PATCH 304/494] more cleaning --- include/triton/codegen/analysis/tune.h | 1 - lib/codegen/analysis/tune.cpp | 14 -------------- 2 files changed, 15 deletions(-) diff --git a/include/triton/codegen/analysis/tune.h b/include/triton/codegen/analysis/tune.h index 63bd2bcc3..66cab1285 100644 --- a/include/triton/codegen/analysis/tune.h +++ b/include/triton/codegen/analysis/tune.h @@ -42,7 +42,6 @@ private: public: tune(size_t num_warps); - std::vector get_params(ir::module& mod); ir::metaparameter* get_param(ir::value *value, const std::string &key) { return params_[value][key]; } unsigned get_param_group(ir::value *value, unsigned ax); fragment_t get_fragment(ir::value *value, unsigned ax) { return fragments_.at({value, ax}); } diff --git a/lib/codegen/analysis/tune.cpp b/lib/codegen/analysis/tune.cpp index c43a7126b..5a019a550 100644 --- a/lib/codegen/analysis/tune.cpp +++ b/lib/codegen/analysis/tune.cpp @@ -183,20 +183,6 @@ void tune::connected_components(node_t x, const std::vector } } -std::vector tune::get_params(ir::module &mod) { - throw std::runtime_error("remove me"); -// std::vector result; -// std::set seen; -// for(auto x: mod.globals()) { -// if(auto mp = dynamic_cast(x.second)) -// if(seen.insert(mp).second && !mp->has_value()) -// result.push_back(mp); -// } -// num_warps_ = ir::metaparameter::create(mod.get_context(), mod.get_builder().get_int32_ty(), 4, 4); -// result.push_back(num_warps_); -// return result; -} - unsigned tune::get_param_group(ir::value *value, unsigned ax) { unsigned result = groups_.at(value).at(ax); return result; From 457c330f158194fe68a5f52098818cee98e81acd Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 18 Aug 2019 14:20:42 -0700 Subject: [PATCH 305/494] more cleaning --- include/triton/codegen/analysis/tune.h | 1 - include/triton/lang/parser.y | 2 +- lib/codegen/analysis/tune.cpp | 3 +-- lib/runtime/function.cpp | 1 - 4 files changed, 2 insertions(+), 5 deletions(-) diff --git a/include/triton/codegen/analysis/tune.h b/include/triton/codegen/analysis/tune.h index 66cab1285..373b20c03 100644 --- a/include/triton/codegen/analysis/tune.h +++ b/include/triton/codegen/analysis/tune.h @@ -48,7 +48,6 @@ public: void copy(ir::value *dst, ir::value *src); bool check_constraints(std::map> &errors); void run(ir::module &mod); - void init(ir::module &mod); unsigned get_num_threads(); private: diff --git a/include/triton/lang/parser.y b/include/triton/lang/parser.y index d67a89562..118e26b8b 100644 --- a/include/triton/lang/parser.y +++ b/include/triton/lang/parser.y @@ -120,7 +120,7 @@ identifier /* Built-in */ builtin_expression - : GET_PROGRAM_ID '(' constant ')' { $$ = new get_program_id_expression($3); } + : GET_PROGRAM_ID '(' constant ')' { $$ = new get_program_id_expression($3); } | GET_NUM_PROGRAM '(' constant ')' { $$ = new get_num_program_expression($3); 
} | DOT '(' expression ',' expression ',' expression ')' { $$ = new matmul_expression($3, $5, $7); } | SQRT '(' expression ')' { $$ = new sqrt_expression($3); } diff --git a/lib/codegen/analysis/tune.cpp b/lib/codegen/analysis/tune.cpp index 5a019a550..2f00d0eb6 100644 --- a/lib/codegen/analysis/tune.cpp +++ b/lib/codegen/analysis/tune.cpp @@ -232,9 +232,7 @@ void tune::run(ir::module &mod) { } } } -} -void tune::init(ir::module &mod) { for(ir::function *fn: mod.get_function_list()){ std::map references; create_grids(grids_, references, fn); @@ -316,6 +314,7 @@ void tune::init(ir::module &mod) { assert(num_threads == effective_num_threads); } } + } diff --git a/lib/runtime/function.cpp b/lib/runtime/function.cpp index d69049291..250e53243 100644 --- a/lib/runtime/function.cpp +++ b/lib/runtime/function.cpp @@ -199,7 +199,6 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c peephole.run(module); dce.run(module); tune.run(module); - tune.init(module); reassociate.run(module); peephole.run(module); if(target->is_gpu()){ From 0970fe12dd9f8d4f8b1a3cd953905f18358db179 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 18 Aug 2019 15:39:36 -0700 Subject: [PATCH 306/494] [general] cleaned tensorflow source code generation --- .../codegen/analysis/shmem/allocation.h | 6 +- include/triton/codegen/analysis/tune.h | 5 +- include/triton/codegen/selection/selection.h | 6 +- .../triton/codegen/transform/reassociate.h | 6 +- include/triton/codegen/transform/vectorize.h | 6 +- include/triton/runtime/function.h | 2 +- lib/codegen/analysis/shmem/allocation.cpp | 4 +- lib/codegen/analysis/tune.cpp | 26 +- lib/codegen/selection/selection.cpp | 4 +- lib/codegen/transform/reassociate.cpp | 2 +- lib/runtime/function.cpp | 4 +- python/src/tensorflow.cpp | 243 ++++++++++-------- 12 files changed, 162 insertions(+), 152 deletions(-) diff --git a/include/triton/codegen/analysis/shmem/allocation.h b/include/triton/codegen/analysis/shmem/allocation.h index 024c3cf68..243d78352 100644 --- a/include/triton/codegen/analysis/shmem/allocation.h +++ b/include/triton/codegen/analysis/shmem/allocation.h @@ -15,7 +15,7 @@ namespace ir{ namespace codegen{ namespace analysis{ -class tune; +class grids; namespace shmem{ @@ -24,7 +24,7 @@ class info; class allocation { public: - allocation(liveness *live, info *buffer_info, tune *params) + allocation(liveness *live, info *buffer_info, grids *params) : liveness_(live), buffer_info_(buffer_info), params_(params){ } // utilities @@ -45,7 +45,7 @@ private: // dependences liveness *liveness_; info *buffer_info_; - tune *params_; + grids *params_; }; } diff --git a/include/triton/codegen/analysis/tune.h b/include/triton/codegen/analysis/tune.h index 373b20c03..26331c786 100644 --- a/include/triton/codegen/analysis/tune.h +++ b/include/triton/codegen/analysis/tune.h @@ -19,7 +19,7 @@ namespace ir{ namespace codegen{ namespace analysis{ -class tune { +class grids { typedef std::pair node_t; typedef std::map > graph_t; @@ -41,12 +41,11 @@ private: public: - tune(size_t num_warps); + grids(size_t num_warps); ir::metaparameter* get_param(ir::value *value, const std::string &key) { return params_[value][key]; } unsigned get_param_group(ir::value *value, unsigned ax); fragment_t get_fragment(ir::value *value, unsigned ax) { return fragments_.at({value, ax}); } void copy(ir::value *dst, ir::value *src); - bool check_constraints(std::map> &errors); void run(ir::module &mod); unsigned get_num_threads(); diff --git a/include/triton/codegen/selection/selection.h 
b/include/triton/codegen/selection/selection.h index 3f118d47a..2610fefc3 100644 --- a/include/triton/codegen/selection/selection.h +++ b/include/triton/codegen/selection/selection.h @@ -44,7 +44,7 @@ namespace codegen{ namespace analysis{ -class tune; +class grids; class alignment_info; namespace shmem{ @@ -196,7 +196,7 @@ private: public: - selection(analysis::shmem::allocation *alloc, analysis::tune *params, analysis::shmem::info *buffer_info, analysis::alignment_info *alignment, target *tgt) + selection(analysis::shmem::allocation *alloc, analysis::grids *params, analysis::shmem::info *buffer_info, analysis::alignment_info *alignment, target *tgt) : alloc_(alloc), params_(params), buffer_info_(buffer_info), alignment_(alignment), tgt_(tgt){ } void run(ir::module &src, Module &dst); @@ -205,7 +205,7 @@ private: vmap_t vmap_; tmap_t tmap_; analysis::shmem::allocation *alloc_; - analysis::tune *params_; + analysis::grids *params_; analysis::shmem::info *buffer_info_; analysis::alignment_info *alignment_; target *tgt_; diff --git a/include/triton/codegen/transform/reassociate.h b/include/triton/codegen/transform/reassociate.h index ce7ab476a..f7b843846 100644 --- a/include/triton/codegen/transform/reassociate.h +++ b/include/triton/codegen/transform/reassociate.h @@ -19,7 +19,7 @@ class getelementptr_inst; namespace codegen{ namespace analysis{ -class tune; +class grids; class alignment_info; } @@ -37,11 +37,11 @@ private: ir::value *reassociate_ptr(ir::getelementptr_inst* pz, ir::builder &builder, std::map &offsets); public: - reassociate(analysis::tune *params); + reassociate(analysis::grids *params); void run(ir::module& module); private: - analysis::tune* params_; + analysis::grids* params_; }; } diff --git a/include/triton/codegen/transform/vectorize.h b/include/triton/codegen/transform/vectorize.h index 09fb48000..bf08eb46f 100644 --- a/include/triton/codegen/transform/vectorize.h +++ b/include/triton/codegen/transform/vectorize.h @@ -10,18 +10,18 @@ namespace ir { namespace codegen{ namespace analysis{ - class tune; + class grids; } namespace transform{ class vectorize { public: - vectorize(analysis::tune *params): params_(params){} + vectorize(analysis::grids *params): params_(params){} void run(ir::module &mod); private: - analysis::tune *params_; + analysis::grids *params_; }; } diff --git a/include/triton/runtime/function.h b/include/triton/runtime/function.h index af849448b..2880a4e54 100644 --- a/include/triton/runtime/function.h +++ b/include/triton/runtime/function.h @@ -42,7 +42,7 @@ class translation_unit; namespace codegen{ namespace analysis{ -class tune; +class grids; } } diff --git a/lib/codegen/analysis/shmem/allocation.cpp b/lib/codegen/analysis/shmem/allocation.cpp index ead6143b3..00e90d4a6 100644 --- a/lib/codegen/analysis/shmem/allocation.cpp +++ b/lib/codegen/analysis/shmem/allocation.cpp @@ -21,7 +21,7 @@ unsigned allocation::is_ld_padded(ir::value *x) { } for(ir::user* user: x->get_users()) if(auto dot = dynamic_cast(user)){ - bool is_hmma = params_->get_fragment(user, 0) == tune::HMMA_FRAGMENT_C; + bool is_hmma = params_->get_fragment(user, 0) == grids::HMMA_FRAGMENT_C; bool is_op_0 = x == dot->get_operand(0); bool is_op_1 = x == dot->get_operand(1); if(is_hmma && is_op_0){ @@ -57,7 +57,7 @@ unsigned allocation::get_num_bytes(ir::value *x) { for(auto x: shapes) num_elements *= x->get_value(); size_t depth; - if(params_->get_fragment(x, 0) == tune::HMMA_FRAGMENT_C) + if(params_->get_fragment(x, 0) == grids::HMMA_FRAGMENT_C) depth = params_->get_param(op, 
"wpt.d" + std::to_string(axis))->get_value(); else depth = params_->get_param(op, "mts.d" + std::to_string(axis))->get_value(); diff --git a/lib/codegen/analysis/tune.cpp b/lib/codegen/analysis/tune.cpp index 2f00d0eb6..9e6c499a2 100644 --- a/lib/codegen/analysis/tune.cpp +++ b/lib/codegen/analysis/tune.cpp @@ -15,7 +15,7 @@ namespace triton{ namespace codegen{ namespace analysis{ -tune::tune(size_t num_warps): num_warps_(num_warps){ +grids::grids(size_t num_warps): num_warps_(num_warps){ } bool is_hmma(ir::value *v){ @@ -32,14 +32,14 @@ bool is_hmma(ir::value *v){ return result; } -void tune::add_constraint(node_t x, node_t y) { +void grids::add_constraint(node_t x, node_t y) { dependencies_[x].insert(y); dependencies_[y].insert(x); nodes_.insert(x); nodes_.insert(y); } -void tune::init_c_phi(ir::instruction *v) { +void grids::init_c_phi(ir::instruction *v) { // Phi Nodes: all the incoming value share the result layout if(auto *phi = dynamic_cast(v)) for(ir::value *op: phi->ops()) @@ -50,7 +50,7 @@ void tune::init_c_phi(ir::instruction *v) { } } -void tune::init_c_graph(ir::instruction *v) { +void grids::init_c_graph(ir::instruction *v) { // Reference shape ir::type::tile_shapes_t::value_type one = ir::tile_type::make_one(v->get_parent()->get_context()); ir::type::tile_shapes_t shapes; @@ -142,7 +142,7 @@ void tune::init_c_graph(ir::instruction *v) { } } -tune::fragment_t tune::get_fragmentation_type(node_t x, graph_t &graph){ +grids::fragment_t grids::get_fragmentation_type(node_t x, graph_t &graph){ std::list work; std::set seen; work.push_back(x); @@ -160,7 +160,7 @@ tune::fragment_t tune::get_fragmentation_type(node_t x, graph_t &graph){ return STRIDED_SCAN; } -void tune::connected_components(node_t x, const std::vector mps, const std::vector prefixes, std::set &nodes, graph_t &graph, unsigned group_id) { +void grids::connected_components(node_t x, const std::vector mps, const std::vector prefixes, std::set &nodes, graph_t &graph, unsigned group_id) { // std::cout << "connected component: " << x.first->get_name() << " " << x.second << std::endl; groups_[x.first].insert({x.second, group_id}); if(nodes.find(x) != nodes.end()){ @@ -183,20 +183,20 @@ void tune::connected_components(node_t x, const std::vector } } -unsigned tune::get_param_group(ir::value *value, unsigned ax) { +unsigned grids::get_param_group(ir::value *value, unsigned ax) { unsigned result = groups_.at(value).at(ax); return result; } //TODO: This shouldn't exist! 
-void tune::copy(ir::value *dst, ir::value *src) { +void grids::copy(ir::value *dst, ir::value *src) { params_[dst] = params_[src]; groups_[dst] = groups_[src]; fragments_[{dst, 0}] = fragments_[{src, 0}]; } -void tune::run(ir::module &mod) { +void grids::run(ir::module &mod) { ir::context &ctx = mod.get_context(); // Create metaparameters for(ir::function *fn: mod.get_function_list()){ @@ -318,7 +318,7 @@ void tune::run(ir::module &mod) { } -void tune::create_grids(std::vector &grids, +void grids::create_grids(std::vector &grids, std::map &references, ir::function *fn) { // get number of dimensions greater than 1 @@ -363,11 +363,7 @@ void tune::create_grids(std::vector &grids, } -bool tune::check_constraints(std::map> &errors) { - return errors.empty(); -} - -unsigned tune::get_num_threads() { +unsigned grids::get_num_threads() { return num_warps_*32; } diff --git a/lib/codegen/selection/selection.cpp b/lib/codegen/selection/selection.cpp index 4b31dce52..99b18e568 100644 --- a/lib/codegen/selection/selection.cpp +++ b/lib/codegen/selection/selection.cpp @@ -573,7 +573,7 @@ inline void to_warps(const std::vector &bs, std::vector &nw, void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { const auto& shapes = v->get_type()->get_tile_shapes(); size_t dim = shapes.size(); - if(params_->get_fragment(v, 0) == analysis::tune::STRIDED_SCAN){ + if(params_->get_fragment(v, 0) == analysis::grids::STRIDED_SCAN){ std::vector contiguous(dim); std::vector block_size(dim); std::vector warp_size(dim); @@ -1278,7 +1278,7 @@ void selection::lower_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn, IRB if(NK != 1) { shared_tile *TA = (shared_tile*)tmap_.at(A); shared_tile *TB = (shared_tile*)tmap_.at(B); - if(params_->get_fragment(dot, 0) == analysis::tune::STRIDED_SCAN) + if(params_->get_fragment(dot, 0) == analysis::grids::STRIDED_SCAN) lower_scanline_dot(dot, ctx, fn, builder, TC, TA, TB, TD, NK, c_ty, f_mul_add); else lower_hmma_dot(dot, ctx, fn, builder, TC, TA, TB, TD, NK); diff --git a/lib/codegen/transform/reassociate.cpp b/lib/codegen/transform/reassociate.cpp index c411ccf12..c5e76f18a 100644 --- a/lib/codegen/transform/reassociate.cpp +++ b/lib/codegen/transform/reassociate.cpp @@ -155,7 +155,7 @@ ir::value *reassociate::reassociate_idx(ir::value *old_value, return new_value; } -reassociate::reassociate(analysis::tune* params) +reassociate::reassociate(analysis::grids* params) : params_(params) { } diff --git a/lib/runtime/function.cpp b/lib/runtime/function.cpp index 250e53243..1e7de730b 100644 --- a/lib/runtime/function.cpp +++ b/lib/runtime/function.cpp @@ -147,7 +147,7 @@ options function::autotune(lang::translation_unit *ast, driver::stream* stream, double ts; std::vector params; }; - profile_t best = { INFINITY }; + profile_t best = { INFINITY, {} }; std::function)> benchmark = [&](std::vector params) { // options @@ -184,7 +184,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c if(auto* mp = dynamic_cast(module.globals().at(x.first))) mp->set_value(x.second); // create passes - codegen::analysis::tune tune(opt.num_warps); + codegen::analysis::grids tune(opt.num_warps); codegen::analysis::shmem::info shmem_info; codegen::analysis::shmem::liveness shmem_liveness(&shmem_info); codegen::analysis::shmem::allocation shmem_allocation(&shmem_liveness, &shmem_info, &tune); diff --git a/python/src/tensorflow.cpp b/python/src/tensorflow.cpp index 0e98f6636..ef7de24ff 100644 --- a/python/src/tensorflow.cpp +++ 
b/python/src/tensorflow.cpp @@ -74,49 +74,118 @@ inline std::unique_ptr make_ir(ir::context& ctx, triton::lang::trans return std::unique_ptr(module); } + +void gen_extract_inputs(std::ostream &os, const std::vector& args) { + for(unsigned i = 0; i < args.size(); i++){ + ir::value *arg = args[i]; + std::string suffix = ""; + ir::type *tr_ty = arg->get_type(); + std::string tf_ty = ref_to_tf_ty(tr_ty); + if(!tr_ty->is_pointer_ty()) + suffix = ".scalar<" + tf_ty + ">()()"; + os << " " << tf_ty << " " << arg->get_name() << " = context->input(" << i << ")" << suffix << ";\n "; + } +} + +void gen_set_outputs(std::ostream &os, const std::vector& outputs) { + for(unsigned i = 0; i < outputs.size(); i++) + os << " context->set_output(" << i << ", " << outputs[i] << ");\n "; +} + +void gen_make_handles(std::ostream &os, const std::vector& args) { + for(unsigned i = 0; i < args.size(); i++){ + ir::argument *arg = args[i]; + if(!arg->get_type()->is_pointer_ty()) + continue; + const std::string& name = arg->get_name(); + os << " drv::cu_buffer cu_" + name + "(ctx, " + name + ".tensor_data().size(), (CUdeviceptr)" + name + ".tensor_data().data(), false);\n "; + } +} + +void gen_make_spmd_grid(std::ostream &os, const std::vector& macros) { + std::regex regex("#([a-zA-Z]([a-zA-Z]|[0-9])*)"); + std::vector grids = macros; + for(size_t i = grids.size(); i < 3; i++) + grids.push_back("1"); + std::string grid = "rt::grid_t{"; + for(size_t i = 0; i < grids.size(); i++){ + if(i > 0) + grid += ", "; + grid += std::regex_replace(grids[i], regex, "x.at(\"$1\")"); + } + grid += "}"; + + os << " auto grid = [&](const rt::params_t& x) { return " << grid << "; };\n "; +} + +void gen_make_launch_function(std::ostream &os, const std::vector& args) { + os << " fn_({"; + for(unsigned i = 0; i < args.size() ; i++){ + ir::argument *arg = args[i]; + std::string name = arg->get_name(); + if(arg->get_type()->is_pointer_ty()) + name = "&cu_" + name; + if(i > 0) + os << ", "; + os << name; + } + os << "}, grid, stream); \n"; +} + +void gen_register_kernel_builder(std::ostream &os, const std::string &name, + const std::string &classname, + const std::vector& args){ + os << "REGISTER_KERNEL_BUILDER(Name(\"" + name + "\").Device(DEVICE_GPU)"; + for(size_t i = 0; i < args.size(); i++){ + ir::argument *arg = args[i]; + std::string name = arg->get_name(); + auto tolower = [](char c) { return std::tolower(c);}; + std::transform(name.begin(), name.end(), name.begin(), tolower); + if(!arg->get_type()->is_pointer_ty()) + os << ".HostMemory(\"" + name + "\")"; + } + os << ", " + classname << ");\n"; +} + +void gen_register_op(std::ostream &os, const std::string &name, + const std::vector& args, + const std::vector& outputs){ + os << "REGISTER_OP(\"" << name << "\")\n"; + for(size_t i = 0; i < args.size(); i++){ + ir::argument *arg = args[i]; + std::string name = arg->get_name(); + auto tolower = [](char c) { return std::tolower(c);}; + std::transform(name.begin(), name.end(), name.begin(), tolower); + os << " .Input(\"" << name << ": " << to_tf_scalar_ty(arg->get_type()) << "\")\n"; + } + for(size_t i = 0; i < outputs.size(); i++){ + std::string name = outputs[i]; + size_t idx; + for(idx = 0; idx < args.size(); idx++) + if(args[idx]->get_name() == name) + break; + if(idx == args.size()) + throw std::runtime_error("unknown output"); + os << " .Output(\"out" << i << ": " << to_tf_scalar_ty(args[idx]->get_type()) << "\")\n"; + } + os << ";\n"; +} + std::string make_tensorflow_src(const std::string src, const std::vector& outputs, const 
std::vector& macros) { triton::lang::translation_unit *ast = make_ast(src.c_str()); triton::ir::context context; std::unique_ptr ir = make_ir(context, ast); - // extract function signature + // function ir::function* fn = ir->get_function_list().front(); - ir::function_type* fn_ty = fn->get_fn_type(); - // numberof arguments - size_t n_args = fn_ty->get_num_params(); - size_t n_outputs = outputs.size(); - // extract function name std::string name = fn->get_name(); name[0] = static_cast(std::toupper(name[0])); std::string classname = name + "Op"; - // extract argument name - std::vector arg_names; - for(ir::argument *arg: fn->args()) - arg_names.push_back(arg->get_name()); - // cached int to str - std::vector str_i; - for(size_t i = 0; i < fn_ty->get_num_params(); i++) - str_i.push_back(std::to_string(i)); - // index of tensors - std::vector ptr_idx; - for(unsigned i = 0; i < fn_ty->get_num_params(); i++) - if(fn_ty->get_param_ty(i)->is_pointer_ty()) - ptr_idx.push_back(i); - // extract tensorflow types - std::vector tf_scalar_tys; - std::transform(fn_ty->params_begin(), fn_ty->params_end(), std::back_inserter(tf_scalar_tys), to_tf_scalar_ty); - std::vector tf_cref_tys; - std::transform(fn_ty->params_begin(), fn_ty->params_end(), std::back_inserter(tf_cref_tys), ref_to_tf_ty); - // output indices - std::vector out_idx; - for(const std::string &name : outputs){ - auto it = std::find(arg_names.begin(), arg_names.end(), name); - out_idx.push_back(std::distance(arg_names.begin(), it)); - } + std::ostringstream oss; - std::string result = R"( + oss << R"( #include "triton/driver/buffer.h" #include "triton/driver/backend.h" #include "triton/driver/stream.h" @@ -138,106 +207,52 @@ namespace drv = triton::driver; std::string src = R"TTKERNSRC( )" + src + ")TTKERNSRC\";" + R"( -class )" + classname + R"(: public OpKernel { +class )" << classname << R"(: public OpKernel { public: - explicit )" + classname + R"((OpKernelConstruction* context) + explicit )" << classname << R"((OpKernelConstruction* context) : OpKernel(context), fn_(src) { } void Compute(OpKernelContext* context){ - // get device/stream GPUDevice device = context->eigen_device(); drv::cu_stream sstream(device.stream(), false); drv::context* ctx = sstream.context(); drv::stream* stream = &sstream; - - // extract inputs)"; -for(unsigned i = 0; i < n_args; i++){ - std::string suffix = ""; - std::string ty = tf_cref_tys[i]; - if(!fn_ty->get_param_ty(i)->is_pointer_ty()) - suffix = ".scalar<" + ty + ">()()"; - result += R"( - )" + ty + " " + arg_names[i] + " = context->input(" + str_i[i] + ")" + suffix + ";"; -} - -result += R"( - - // extract outputs)"; -for(unsigned i = 0; i < n_outputs; i++) - result += R"( - context->set_output()" + str_i[i] + ", " + outputs[i] + ");"; - -result += R"( - - // wrap tensors)"; -for(size_t i: ptr_idx) -result += R"( - drv::cu_buffer cu_)" + arg_names[i] + "(ctx, " + arg_names[i] + ".tensor_data().size(), (CUdeviceptr)" + arg_names[i] + R"(.tensor_data().data(), false);)"; - - -std::regex regex("#([a-zA-Z]([a-zA-Z]|[0-9])*)"); -std::vector grids = macros; -for(size_t i = grids.size(); i < 3; i++) - grids.push_back("1"); -std::string grid = "rt::grid_t{"; -for(size_t i = 0; i < grids.size(); i++){ - if(i > 0) - grid += ", "; - grid += std::regex_replace(grids[i], regex, "x.at(\"$1\")"); -} -grid += "}"; - -result += R"( - - // create launch grid; - auto grid = [&](const rt::params_t& x) { return )" + grid + R"(; };)"; - -result += R"( - - // execute function - fn_({ + // extract inputs )"; -for(unsigned i = 
0; i < n_args; i++){ - std::string arg = arg_names[i]; - if(fn_ty->get_param_ty(i)->is_pointer_ty()) - arg = "&cu_" + arg; - if(i > 0) - result += ", "; - result += arg; -} -result += R"( - }, grid, stream); - +gen_extract_inputs(oss, fn->args()); +oss << R"( + // set outputs + )"; +gen_set_outputs(oss, outputs); +oss << R"( + // wrap tensors + )"; +gen_make_handles(oss, fn->args()); +oss << R"( + // create spmd grid + )"; +gen_make_spmd_grid(oss, macros); +oss << R"( + // launch function + )"; +gen_make_launch_function(oss, fn->args()); +oss << R"( } private: rt::function fn_; }; -REGISTER_KERNEL_BUILDER(Name(")" + name + "\").Device(DEVICE_GPU)"; -for(size_t i = 0; i < tf_scalar_tys.size(); i++){ - std::string arg_name = arg_names[i]; - std::transform(arg_name.begin(), arg_name.end(), arg_name.begin(), [](char c) { return std::tolower(c);}); - if(!fn_ty->get_param_ty(i)->is_pointer_ty()) - result += ".HostMemory(\"" + arg_name + "\")"; -} -result += ", " + classname + R"(); +// register kernel builder +)"; +gen_register_kernel_builder(oss, name, classname, fn->args()); +oss << R"( +// register op +)"; +gen_register_op(oss, name, fn->args(), outputs); - -REGISTER_OP(")" + name + "\")\n"; -for(size_t i = 0; i < tf_scalar_tys.size(); i++){ - std::string arg_name = arg_names[i]; - std::transform(arg_name.begin(), arg_name.end(), arg_name.begin(), [](char c) { return std::tolower(c);}); - result += " .Input(\"" + arg_name + ": " + tf_scalar_tys[i] + "\")\n"; -} -for(size_t i = 0; i < outputs.size(); i++){ - result += " .Output(\"out" + std::to_string(i) + ": " + tf_scalar_tys[out_idx[i]] + "\")\n"; -} -result += ";\n"; - - - return result; + return oss.str(); } From bc11e31419cef056efc2f311ba6dcbd2d67feb83 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 19 Aug 2019 20:56:39 -0700 Subject: [PATCH 307/494] [lang] more progress on parser --- CMakeLists.txt | 12 +- examples/cpp/dot.cpp | 24 +- include/triton/lang/parser.y | 266 +-- include/triton/lang/wgtcc/ast.h | 743 +++++++ include/triton/lang/wgtcc/code_gen.h | 274 +++ include/triton/lang/wgtcc/cpp.h | 162 ++ include/triton/lang/wgtcc/encoding.h | 20 + include/triton/lang/wgtcc/error.h | 15 + include/triton/lang/wgtcc/evaluator.h | 120 ++ include/triton/lang/wgtcc/mem_pool.h | 101 + include/triton/lang/wgtcc/parser.h | 244 +++ include/triton/lang/wgtcc/scanner.h | 84 + include/triton/lang/wgtcc/scope.h | 70 + include/triton/lang/wgtcc/token.h | 418 ++++ include/triton/lang/wgtcc/type.h | 450 +++++ include/triton/lang/wgtcc/visitor.h | 50 + lib/lang/wgtcc/ast.cc | 885 ++++++++ lib/lang/wgtcc/code_gen.cc | 1561 ++++++++++++++ lib/lang/wgtcc/cpp.cc | 886 ++++++++ lib/lang/wgtcc/encoding.cc | 42 + lib/lang/wgtcc/error.cc | 95 + lib/lang/wgtcc/evaluator.cc | 210 ++ lib/lang/wgtcc/main.cc | 253 +++ lib/lang/wgtcc/parser.cc | 2688 +++++++++++++++++++++++++ lib/lang/wgtcc/scanner.cc | 452 +++++ lib/lang/wgtcc/scope.cc | 111 + lib/lang/wgtcc/token.cc | 259 +++ lib/lang/wgtcc/type.cc | 484 +++++ lib/runtime/function.cpp | 28 +- python/src/tensorflow.cpp | 13 +- 30 files changed, 10862 insertions(+), 158 deletions(-) create mode 100644 include/triton/lang/wgtcc/ast.h create mode 100644 include/triton/lang/wgtcc/code_gen.h create mode 100644 include/triton/lang/wgtcc/cpp.h create mode 100644 include/triton/lang/wgtcc/encoding.h create mode 100644 include/triton/lang/wgtcc/error.h create mode 100644 include/triton/lang/wgtcc/evaluator.h create mode 100644 include/triton/lang/wgtcc/mem_pool.h create mode 100644 include/triton/lang/wgtcc/parser.h 
create mode 100644 include/triton/lang/wgtcc/scanner.h create mode 100644 include/triton/lang/wgtcc/scope.h create mode 100644 include/triton/lang/wgtcc/token.h create mode 100644 include/triton/lang/wgtcc/type.h create mode 100644 include/triton/lang/wgtcc/visitor.h create mode 100644 lib/lang/wgtcc/ast.cc create mode 100644 lib/lang/wgtcc/code_gen.cc create mode 100644 lib/lang/wgtcc/cpp.cc create mode 100644 lib/lang/wgtcc/encoding.cc create mode 100644 lib/lang/wgtcc/error.cc create mode 100644 lib/lang/wgtcc/evaluator.cc create mode 100644 lib/lang/wgtcc/main.cc create mode 100644 lib/lang/wgtcc/parser.cc create mode 100644 lib/lang/wgtcc/scanner.cc create mode 100644 lib/lang/wgtcc/scope.cc create mode 100644 lib/lang/wgtcc/token.cc create mode 100644 lib/lang/wgtcc/type.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 694cc5578..5faf6c8bb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -62,15 +62,15 @@ endif() # Triton -file(GLOB_RECURSE LIBTRITON_SRC lib/*.cpp) +file(GLOB_RECURSE LIBTRITON_SRC lib/*.cpp lib/*.cc) add_library(triton SHARED ${LIBTRITON_SRC} ${EIGHTCC_SRC} ${PYTHON_SRC} ${BISON_Parser_OUTPUTS} ${FLEX_Lexer_OUTPUTS}) target_link_libraries(triton LLVM) # Warning level -if(MSVC) - target_compile_options(triton PRIVATE /W4) -else() - target_compile_options(triton PRIVATE -Wno-unused-parameter -Wall -Wextra -pedantic) -endif() +#if(MSVC) +# target_compile_options(triton PRIVATE /W4) +#else() +# target_compile_options(triton PRIVATE -Wno-unused-parameter -Wall -Wextra -pedantic) +#endif() diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 102380036..55c25b575 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -78,19 +78,23 @@ std::string src(bool AT, bool BT, std::string a_ty, std::string b_ty, std::strin std::string align_ldb_str = "multiple_of(" + std::to_string(align_ldb) + ")"; std::string res = R"( -const tunable int TM = {128}; -const tunable int TN = {128}; -const tunable int TK = {32}; +#define TM 128 +#define TN 128 +#define TK 32 -void matmul(restrict read_only align(16) )" + a_ty + R"( *A, - restrict read_only align(16) )" + b_ty + R"( *B, - restrict read_only align(16) )" + c_ty + R"( *C, +extern int get_program_id(int); + +void matmul(restrict )" + a_ty + R"( * A __attribute__((readonly, aligned(16))), + restrict )" + b_ty + R"( * B __attribute__((readonly, aligned(16))), + restrict )" + c_ty + R"( * C __attribute__((aligned(16))), int M, int N, int K, - )" + align_lda_str + R"( int lda, )" + align_ldb_str + R"(" int ldb, int ldc) { + int lda __attribute__((multiple_of(8))), + int ldb __attribute__((multiple_of(8))), + int ldc) { int ridx = get_program_id(0); int ridy = get_program_id(1); - int rxa[TM] = ridx * TM + (0 ... TM); - int ryb[TN] = ridy * TN + (0 ... TN); + int rxa[{TM, TN}] = ridx * TM + 0 ... TM; + int ryb[TN] = ridy * TN + 0 ... TN; int rka[TK] = 0 ... TK; int rkb[TK] = 0 ... 
TK; float xc[)" + XCS + R"(] = 0; @@ -112,7 +116,7 @@ void matmul(restrict read_only align(16) )" + a_ty + R"( *A, bool checkc0[TM] = rxc < M; bool checkc1[TN] = ryc < N; bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - @checkc *pc = c; + *pc = c; } )"; return res; diff --git a/include/triton/lang/parser.y b/include/triton/lang/parser.y index 118e26b8b..e3c22c132 100644 --- a/include/triton/lang/parser.y +++ b/include/triton/lang/parser.y @@ -66,38 +66,38 @@ STORAGE_SPEC_T get_storage_spec(node *op) { return ((token*)op)->storage_spec;} /* -------------------------- */ type_specifier - : VOID { $$ = new token(VOID_T); } - | UINT1 { $$ = new token(UINT1_T); } - | UINT8 { $$ = new token(UINT8_T); } - | UINT16 { $$ = new token(UINT16_T); } - | UINT32 { $$ = new token(UINT32_T); } - | UINT64 { $$ = new token(UINT64_T); } - | INT1 { $$ = new token(INT1_T);} - | INT8 { $$ = new token(INT8_T); } - | INT16 { $$ = new token(INT16_T); } - | INT32 { $$ = new token(INT32_T); } - | INT64 { $$ = new token(INT64_T); } - | FP16 { $$ = new token(FLOAT16_T); } - | FP32 { $$ = new token(FLOAT32_T); } - | FP64 { $$ = new token(FLOAT64_T); } + : VOID { $$ = new token(VOID_T); } + | UINT1 { $$ = new token(UINT1_T); } + | UINT8 { $$ = new token(UINT8_T); } + | UINT16 { $$ = new token(UINT16_T); } + | UINT32 { $$ = new token(UINT32_T); } + | UINT64 { $$ = new token(UINT64_T); } + | INT1 { $$ = new token(INT1_T);} + | INT8 { $$ = new token(INT8_T); } + | INT16 { $$ = new token(INT16_T); } + | INT32 { $$ = new token(INT32_T); } + | INT64 { $$ = new token(INT64_T); } + | FP16 { $$ = new token(FLOAT16_T); } + | FP32 { $$ = new token(FLOAT32_T); } + | FP64 { $$ = new token(FLOAT64_T); } ; pointer - : '*' { $$ = new pointer(nullptr); } - | '*' pointer { $$ = new pointer($1); } + : '*' { $$ = new pointer(nullptr); } + | '*' pointer { $$ = new pointer($1); } abstract_declarator : pointer { $$ = $1; } - | pointer direct_abstract_declarator { $$ = ((declarator*)$2)->set_ptr($1); } - | direct_abstract_declarator { $$ = $1; } + | pointer direct_abstract_declarator { $$ = ((declarator*)$2)->set_ptr($1); } + | direct_abstract_declarator { $$ = $1; } ; direct_abstract_declarator - : '[' constant_expression_list ']' { $$ = new tile(nullptr, $2); } + : '[' constant_expression_list ']' { $$ = new tile(nullptr, $2); } type_name - : declaration_specifiers { $$ = new type_name($1, nullptr); } - | declaration_specifiers abstract_declarator { $$ = new type_name($1, $2); } + : declaration_specifiers { $$ = new type_name($1, nullptr); } + | declaration_specifiers abstract_declarator { $$ = new type_name($1, $2); } ; /* -------------------------- */ @@ -106,16 +106,16 @@ type_name /* Constants */ constant - : CONSTANT { $$ = new constant(atoi(yytext)); } + : CONSTANT { $$ = new constant(atoi(yytext)); } ; constant_list - : constant { $$ = new list((constant*)$1); } - | constant_list ',' constant { $$ = append_ptr_list($1, $3); } + : constant { $$ = new list((constant*)$1); } + | constant_list ',' constant { $$ = append_ptr_list($1, $3); } ; identifier - : IDENTIFIER { $$ = new identifier(yytext); } + : IDENTIFIER { $$ = new identifier(yytext); } ; /* Built-in */ @@ -139,109 +139,109 @@ builtin_expression /* Primary */ primary_expression - : identifier { $$ = new named_expression($1); } - | constant { $$ = $1; } - | primary_expression ELLIPSIS primary_expression { $$ = new constant_range($1, $3); } - | builtin_expression { $$ = $1; } - | STRING_LITERAL { $$ = new string_literal(yytext); } - | '(' expression ')' { $$ = 
$2; } + : identifier { $$ = new named_expression($1); } + | constant { $$ = $1; } + | primary_expression ELLIPSIS primary_expression { $$ = new constant_range($1, $3); } + | builtin_expression { $$ = $1; } + | STRING_LITERAL { $$ = new string_literal(yytext); } + | '(' expression ')' { $$ = $2; } ; /* Postfix */ slice - : ':' { $$ = new slice(triton::lang::ALL); } - | NEWAXIS { $$ = new slice(triton::lang::NEWAXIS); } + : ':' { $$ = new slice(triton::lang::ALL); } + | NEWAXIS { $$ = new slice(triton::lang::NEWAXIS); } slice_list - : slice { $$ = new list((slice*)$1); } - | slice_list ',' slice { $$ = append_ptr_list($1, $3); } + : slice { $$ = new list((slice*)$1); } + | slice_list ',' slice { $$ = append_ptr_list($1, $3); } postfix_expression - : primary_expression { $$ = $1;} - | primary_expression '[' slice_list ']' { $$ = new indexing_expression($1, $3);} + : primary_expression { $$ = $1;} + | primary_expression '[' slice_list ']' { $$ = new indexing_expression($1, $3);} ; /* Unary */ unary_operator - : '&' { $$ = new token(ADDR); } - | '*' { $$ = new token(DEREF); } - | '+' { $$ = new token(PLUS); } - | '-' { $$ = new token(MINUS); } - | '~' { $$ = new token(COMPL); } - | '!' { $$ = new token(NOT); } + : '&' { $$ = new token(ADDR); } + | '*' { $$ = new token(DEREF); } + | '+' { $$ = new token(PLUS); } + | '-' { $$ = new token(MINUS); } + | '~' { $$ = new token(COMPL); } + | '!' { $$ = new token(NOT); } ; unary_expression - : postfix_expression { $$ = $1; } - | INC_OP unary_expression { $$ = new unary_expression(INC, $2); } - | DEC_OP unary_expression { $$ = new unary_expression(DEC, $2); } - | unary_operator cast_expression { $$ = new unary_expression(get_unary_op($1), $2); } + : postfix_expression { $$ = $1; } + | INC_OP unary_expression { $$ = new unary_expression(INC, $2); } + | DEC_OP unary_expression { $$ = new unary_expression(DEC, $2); } + | unary_operator cast_expression { $$ = new unary_expression(get_unary_op($1), $2); } ; cast_expression : unary_expression { $$ = $1; } - | '(' type_name ')' cast_expression { $$ = new cast_expression($2, $4); } + | '(' type_name ')' cast_expression { $$ = new cast_expression($2, $4); } ; multiplicative_expression : cast_expression { $$ = $1; } - | multiplicative_expression '*' cast_expression { $$ = new binary_expression(MUL, $1, $3); } - | multiplicative_expression '/' cast_expression { $$ = new binary_expression(DIV, $1, $3); } - | multiplicative_expression '%' cast_expression { $$ = new binary_expression(MOD, $1, $3); } + | multiplicative_expression '*' cast_expression { $$ = new binary_expression(MUL, $1, $3); } + | multiplicative_expression '/' cast_expression { $$ = new binary_expression(DIV, $1, $3); } + | multiplicative_expression '%' cast_expression { $$ = new binary_expression(MOD, $1, $3); } ; additive_expression : multiplicative_expression { $$ = $1; } - | additive_expression '+' multiplicative_expression { $$ = new binary_expression(ADD, $1, $3); } - | additive_expression '-' multiplicative_expression { $$ = new binary_expression(SUB, $1, $3); } + | additive_expression '+' multiplicative_expression { $$ = new binary_expression(ADD, $1, $3); } + | additive_expression '-' multiplicative_expression { $$ = new binary_expression(SUB, $1, $3); } ; shift_expression : additive_expression { $$ = $1; } - | shift_expression LEFT_OP additive_expression { $$ = new binary_expression(LEFT_SHIFT, $1, $3); } - | shift_expression RIGHT_OP additive_expression { $$ = new binary_expression(RIGHT_SHIFT, $1, $3); } + | shift_expression LEFT_OP 
additive_expression { $$ = new binary_expression(LEFT_SHIFT, $1, $3); } + | shift_expression RIGHT_OP additive_expression { $$ = new binary_expression(RIGHT_SHIFT, $1, $3); } ; /* Comparison */ relational_expression : shift_expression { $$ = $1; } - | relational_expression '<' shift_expression { $$ = new binary_expression(LT, $1, $3); } - | relational_expression '>' shift_expression { $$ = new binary_expression(GT, $1, $3); } - | relational_expression LE_OP shift_expression { $$ = new binary_expression(LE, $1, $3); } - | relational_expression GE_OP shift_expression { $$ = new binary_expression(GE, $1, $3); } + | relational_expression '<' shift_expression { $$ = new binary_expression(LT, $1, $3); } + | relational_expression '>' shift_expression { $$ = new binary_expression(GT, $1, $3); } + | relational_expression LE_OP shift_expression { $$ = new binary_expression(LE, $1, $3); } + | relational_expression GE_OP shift_expression { $$ = new binary_expression(GE, $1, $3); } ; equality_expression : relational_expression { $$ = $1; } - | equality_expression EQ_OP relational_expression { $$ = new binary_expression(EQ, $1, $3); } - | equality_expression NE_OP relational_expression { $$ = new binary_expression(NE, $1, $3); } + | equality_expression EQ_OP relational_expression { $$ = new binary_expression(EQ, $1, $3); } + | equality_expression NE_OP relational_expression { $$ = new binary_expression(NE, $1, $3); } ; /* Binary */ and_expression : equality_expression { $$ = $1; } - | and_expression '&' equality_expression { $$ = new binary_expression(AND, $1, $3); } + | and_expression '&' equality_expression { $$ = new binary_expression(AND, $1, $3); } ; exclusive_or_expression : and_expression { $$ = $1; } - | exclusive_or_expression '^' and_expression { $$ = new binary_expression(XOR, $1, $3); } + | exclusive_or_expression '^' and_expression { $$ = new binary_expression(XOR, $1, $3); } ; inclusive_or_expression : exclusive_or_expression { $$ = $1; } - | inclusive_or_expression '|' exclusive_or_expression { $$ = new binary_expression(OR, $1, $3); } + | inclusive_or_expression '|' exclusive_or_expression { $$ = new binary_expression(OR, $1, $3); } ; /* Logical */ logical_and_expression : inclusive_or_expression { $$ = $1; } - | logical_and_expression AND_OP inclusive_or_expression { $$ = new binary_expression(LAND, $1, $3); } + | logical_and_expression AND_OP inclusive_or_expression { $$ = new binary_expression(LAND, $1, $3); } ; logical_or_expression : logical_and_expression { $$ = $1; } - | logical_or_expression OR_OP logical_and_expression { $$ = new binary_expression(LOR, $1, $3); } + | logical_or_expression OR_OP logical_and_expression { $$ = new binary_expression(LOR, $1, $3); } ; /* Conditional */ @@ -253,21 +253,21 @@ conditional_expression /* Assignment */ assignment_operator : '=' { $$ = new token(ASSIGN); } - | MUL_ASSIGN { $$ = new token(INPLACE_MUL); } - | DIV_ASSIGN { $$ = new token(INPLACE_DIV); } - | MOD_ASSIGN { $$ = new token(INPLACE_MOD); } - | ADD_ASSIGN { $$ = new token(INPLACE_ADD); } - | SUB_ASSIGN { $$ = new token(INPLACE_SUB); } - | LEFT_ASSIGN { $$ = new token(INPLACE_LSHIFT); } - | RIGHT_ASSIGN { $$ = new token(INPLACE_RSHIFT); } - | AND_ASSIGN { $$ = new token(INPLACE_AND); } - | XOR_ASSIGN { $$ = new token(INPLACE_XOR); } - | OR_ASSIGN { $$ = new token(INPLACE_OR); } + | MUL_ASSIGN { $$ = new token(INPLACE_MUL); } + | DIV_ASSIGN { $$ = new token(INPLACE_DIV); } + | MOD_ASSIGN { $$ = new token(INPLACE_MOD); } + | ADD_ASSIGN { $$ = new token(INPLACE_ADD); } + | SUB_ASSIGN { 
$$ = new token(INPLACE_SUB); } + | LEFT_ASSIGN { $$ = new token(INPLACE_LSHIFT); } + | RIGHT_ASSIGN { $$ = new token(INPLACE_RSHIFT); } + | AND_ASSIGN { $$ = new token(INPLACE_AND); } + | XOR_ASSIGN { $$ = new token(INPLACE_XOR); } + | OR_ASSIGN { $$ = new token(INPLACE_OR); } ; assignment_expression : conditional_expression { $$ = $1; } - | unary_expression assignment_operator assignment_expression { $$ = new assignment_expression($1, get_assign_op($2), $3); } + | unary_expression assignment_operator assignment_expression { $$ = new assignment_expression($1, get_assign_op($2), $3); } ; /* Expression */ @@ -276,13 +276,13 @@ expression ; constant_expression_list - : expression { $$ = new list((expression*)$1); } - | constant_expression_list ',' expression { $$ = append_ptr_list($1, $3); } + : expression { $$ = new list((expression*)$1); } + | constant_expression_list ',' expression { $$ = append_ptr_list($1, $3); } /* Initialization */ initialization_expression - : assignment_expression { $$ = $1; } - | '{' constant_list '}' { $$ = $2; } + : assignment_expression { $$ = $1; } + | '{' constant_list '}' { $$ = $2; } ; @@ -291,41 +291,41 @@ initialization_expression /* -------------------------- */ statement - : compound_statement { $$ = $1; } - | expression_statement { $$ = $1; } - | selection_statement { $$ = $1; } - | iteration_statement { $$ = $1; } - | jump_statement { $$ = $1; } + : compound_statement { $$ = $1; } + | expression_statement { $$ = $1; } + | selection_statement { $$ = $1; } + | iteration_statement { $$ = $1; } + | jump_statement { $$ = $1; } ; compound_statement - : '{' '}' { $$ = new compound_statement(nullptr); } - | '{' block_item_list '}' { $$ = new compound_statement($2); } + : '{' '}' { $$ = new compound_statement(nullptr); } + | '{' block_item_list '}' { $$ = new compound_statement($2); } block_item_list - : block_item { $$ = new list((block_item*)$1); } - | block_item_list block_item { $$ = append_ptr_list($1, $2); } + : block_item { $$ = new list((block_item*)$1); } + | block_item_list block_item { $$ = append_ptr_list($1, $2); } block_item - : declaration { $$ = $1; } - | statement { $$ = $1; } + : declaration { $$ = $1; } + | statement { $$ = $1; } expression_statement - : ';' { $$ = new no_op(); } - | expression ';' { $$ = new expression_statement($1); } - | AT primary_expression expression ';' { $$ = new expression_statement($3, $2); } + : ';' { $$ = new no_op(); } + | expression ';' { $$ = new expression_statement($1); } + | AT primary_expression expression ';' { $$ = new expression_statement($3, $2); } ; selection_statement - : IF '(' expression ')' statement { $$ = new selection_statement($3, $5); } - | IF '(' expression ')' statement ELSE statement { $$ = new selection_statement($3, $5, $7); } + : IF '(' expression ')' statement { $$ = new selection_statement($3, $5); } + | IF '(' expression ')' statement ELSE statement { $$ = new selection_statement($3, $5, $7); } ; iteration_statement : FOR '(' expression_statement expression_statement expression ')' statement { $$ = new iteration_statement($3, $4, $5, $7); } - | FOR '(' declaration expression_statement ')' statement { $$ = new iteration_statement($3, $4, nullptr, $6); } - | FOR '(' declaration expression_statement expression ')' statement { $$ = new iteration_statement($3, $4, $5, $7); } - | WHILE '(' expression ')' statement { $$ = new while_statement($3, $5); }; + | FOR '(' declaration expression_statement ')' statement { $$ = new iteration_statement($3, $4, nullptr, $6); } + | FOR '(' declaration 
expression_statement expression ')' statement { $$ = new iteration_statement($3, $4, $5, $7); } + | WHILE '(' expression ')' statement { $$ = new while_statement($3, $5); }; jump_statement : CONTINUE ';' { $$ = new continue_statement(); } @@ -338,74 +338,74 @@ jump_statement direct_declarator : identifier { $$ = $1; } - | identifier '[' constant_expression_list ']' { $$ = new tile($1, $3); } - | identifier '(' parameter_list ')' { $$ = new function($1, $3); } - | identifier '(' ')' { $$ = new function($1, nullptr); } + | identifier '[' constant_expression_list ']' { $$ = new tile($1, $3); } + | identifier '(' parameter_list ')' { $$ = new function($1, $3); } + | identifier '(' ')' { $$ = new function($1, nullptr); } ; parameter_list - : parameter_declaration { $$ = new list((parameter*)$1); } - | parameter_list ',' parameter_declaration { $$ = append_ptr_list($1, $3); } + : parameter_declaration { $$ = new list((parameter*)$1); } + | parameter_list ',' parameter_declaration { $$ = append_ptr_list($1, $3); } ; parameter_declaration - : declaration_specifiers declarator { $$ = new parameter($1, $2); } - | declaration_specifiers abstract_declarator { $$ = new parameter($1, $2); } + : declaration_specifiers declarator { $$ = new parameter($1, $2); } + | declaration_specifiers abstract_declarator { $$ = new parameter($1, $2); } ; declaration_specifiers - : type_specifier { $$ = new typed_declaration_specifier(get_type_spec($1)); } - | storage_class_specifier declaration_specifiers { $$ = new declaration_modifier($1, $2); } - | alignment_class_specifier declaration_specifiers { $$ = new declaration_modifier($1, $2); } - | multiple_of_class_specifier declaration_specifiers { $$ = new declaration_modifier($1, $2); } + : type_specifier { $$ = new typed_declaration_specifier(get_type_spec($1)); } + | storage_class_specifier declaration_specifiers { $$ = new declaration_modifier($1, $2); } + | alignment_class_specifier declaration_specifiers { $$ = new declaration_modifier($1, $2); } + | multiple_of_class_specifier declaration_specifiers { $$ = new declaration_modifier($1, $2); } ; init_declarator_list - : init_declarator { $$ = new list((initializer*)$1); } - | init_declarator_list ',' init_declarator { $$ = append_ptr_list($1, $3); } + : init_declarator { $$ = new list((initializer*)$1); } + | init_declarator_list ',' init_declarator { $$ = append_ptr_list($1, $3); } ; declaration - : declaration_specifiers ';' { $$ = new declaration($1, nullptr); } - | declaration_specifiers init_declarator_list ';' { $$ = new declaration($1, $2); } + : declaration_specifiers ';' { $$ = new declaration($1, nullptr); } + | declaration_specifiers init_declarator_list ';' { $$ = new declaration($1, $2); } ; declarator - : pointer direct_declarator { $$ = ((declarator*)$2)->set_ptr($1); } - | direct_declarator { $$ = $1; } + : pointer direct_declarator { $$ = ((declarator*)$2)->set_ptr($1); } + | direct_declarator { $$ = $1; } ; init_declarator - : declarator { $$ = new initializer($1, nullptr); } - | declarator '=' initialization_expression { $$ = new initializer($1, $3); } + : declarator { $$ = new initializer($1, nullptr); } + | declarator '=' initialization_expression { $$ = new initializer($1, $3); } ; storage_class_specifier - : CONST { $$ = new storage_specifier(CONST_T); } - | TUNABLE { $$ = new storage_specifier(TUNABLE_T); } - | KERNEL { $$ = new storage_specifier(KERNEL_T); } - | RESTRICT { $$ = new storage_specifier(RESTRICT_T); } - | READONLY { $$ = new storage_specifier(READONLY_T); } - | WRITEONLY { $$ 
= new storage_specifier(WRITEONLY_T); } - | CONSTANT_SPACE { $$ = new storage_specifier(CONSTANT_SPACE_T); } + : CONST { $$ = new storage_specifier(CONST_T); } + | TUNABLE { $$ = new storage_specifier(TUNABLE_T); } + | KERNEL { $$ = new storage_specifier(KERNEL_T); } + | RESTRICT { $$ = new storage_specifier(RESTRICT_T); } + | READONLY { $$ = new storage_specifier(READONLY_T); } + | WRITEONLY { $$ = new storage_specifier(WRITEONLY_T); } + | CONSTANT_SPACE { $$ = new storage_specifier(CONSTANT_SPACE_T); } ; alignment_class_specifier - : ALIGN '(' constant ')' { $$ = new alignment_specifier($3); } + : ALIGN '(' constant ')' { $$ = new alignment_specifier($3); } multiple_of_class_specifier - : MULTIPLE_OF '(' constant ')' { $$ = new multiple_of_specifier($3); } + : MULTIPLE_OF '(' constant ')' { $$ = new multiple_of_specifier($3); } external_declaration - : function_definition { $$ = $1; } - | declaration { $$ = $1; } + : function_definition { $$ = $1; } + | declaration { $$ = $1; } ; function_definition - : declaration_specifiers declarator compound_statement { $$ = new function_definition($1, $2, $3); } + : declaration_specifiers declarator compound_statement { $$ = new function_definition($1, $2, $3); } ; /* -------------------------- */ @@ -413,8 +413,8 @@ function_definition /* -------------------------- */ translation_unit - : external_declaration { ast_root = new translation_unit($1); $$ = ast_root; } - | translation_unit external_declaration { $$ = ((translation_unit*)($1))->add($2); } + : external_declaration { ast_root = new translation_unit($1); $$ = ast_root; } + | translation_unit external_declaration { $$ = ((translation_unit*)($1))->add($2); } ; diff --git a/include/triton/lang/wgtcc/ast.h b/include/triton/lang/wgtcc/ast.h new file mode 100644 index 000000000..3cb3257f7 --- /dev/null +++ b/include/triton/lang/wgtcc/ast.h @@ -0,0 +1,743 @@ +#ifndef _WGTCC_AST_H_ +#define _WGTCC_AST_H_ + +#include "error.h" +#include "token.h" +#include "type.h" + +#include +#include +#include +#include + + +class Visitor; +template class Evaluator; +class AddrEvaluator; +class Generator; + +class Scope; +class Parser; +class ASTNode; +class Token; +class TokenSequence; + +// Expressions +class Expr; +class BinaryOp; +class UnaryOp; +class ConditionalOp; +class FuncCall; +class TempVar; +class Constant; + +class Identifier; +class Object; +struct Initializer; +class Declaration; +class Enumerator; + +// Statements +class Stmt; +class IfStmt; +class JumpStmt; +class LabelStmt; +class EmptyStmt; +class CompoundStmt; +class FuncDef; +class TranslationUnit; + + +/* + * AST Node + */ + +class ASTNode { +public: + virtual ~ASTNode() {} + virtual void Accept(Visitor* v) = 0; + +protected: + ASTNode() {} + + MemPool* pool_ {nullptr}; +}; + +using ExtDecl = ASTNode; + + +/* + * Statements + */ + +class Stmt : public ASTNode { +public: + virtual ~Stmt() {} + +protected: + Stmt() {} +}; + + +class EmptyStmt : public Stmt { + template friend class Evaluator; + friend class AddrEvaluator; + friend class Generator; + +public: + static EmptyStmt* New(); + virtual ~EmptyStmt() {} + virtual void Accept(Visitor* v); + +protected: + EmptyStmt() {} +}; + + +class LabelStmt : public Stmt { + template friend class Evaluator; + friend class AddrEvaluator; + friend class Generator; + +public: + static LabelStmt* New(); + ~LabelStmt() {} + virtual void Accept(Visitor* v); + std::string Repr() const { return ".L" + std::to_string(tag_); } + +protected: + LabelStmt(): tag_(GenTag()) {} + +private: + static int GenTag() { + 
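+ // Each LabelStmt draws from this single monotonically increasing counter,
+ // so Repr() above yields a unique assembly label ".L1", ".L2", ... per node.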
static int tag = 0; + return ++tag; + } + + int tag_; // Use an integer tag value rather than a string directly +};
+ + +class IfStmt : public Stmt { + template friend class Evaluator; + friend class AddrEvaluator; + friend class Generator; +public: + static IfStmt* New(Expr* cond, Stmt* then, Stmt* els=nullptr); + virtual ~IfStmt() {} + virtual void Accept(Visitor* v); + +protected: + IfStmt(Expr* cond, Stmt* then, Stmt* els = nullptr) + : cond_(cond), then_(then), else_(els) {} + +private: + Expr* cond_; + Stmt* then_; + Stmt* else_; +};
+ + +class JumpStmt : public Stmt { + template friend class Evaluator; + friend class AddrEvaluator; + friend class Generator; + +public: + static JumpStmt* New(LabelStmt* label); + virtual ~JumpStmt() {} + virtual void Accept(Visitor* v); + void SetLabel(LabelStmt* label) { label_ = label; } + +protected: + JumpStmt(LabelStmt* label): label_(label) {} + +private: + LabelStmt* label_; +};
+ + +class ReturnStmt: public Stmt { + template friend class Evaluator; + friend class AddrEvaluator; + friend class Generator; + +public: + static ReturnStmt* New(Expr* expr); + virtual ~ReturnStmt() {} + virtual void Accept(Visitor* v); + +protected: + ReturnStmt(::Expr* expr): expr_(expr) {} + +private: + ::Expr* expr_; +};
+ + +using StmtList = std::list; + +class CompoundStmt : public Stmt { + template friend class Evaluator; + friend class AddrEvaluator; + friend class Generator; + +public: + static CompoundStmt* New(StmtList& stmts, ::Scope* scope=nullptr); + virtual ~CompoundStmt() {} + virtual void Accept(Visitor* v); + StmtList& Stmts() { return stmts_; } + ::Scope* Scope() { return scope_; } + +protected: + CompoundStmt(const StmtList& stmts, ::Scope* scope=nullptr) + : stmts_(stmts), scope_(scope) {} + +private: + StmtList stmts_; + ::Scope* scope_; +};
+ + +struct Initializer { + Initializer(Type* type, + int offset, + Expr* expr, + unsigned char bitFieldBegin=0, + unsigned char bitFieldWidth=0) + : type_(type), + offset_(offset), + bitFieldBegin_(bitFieldBegin), + bitFieldWidth_(bitFieldWidth), + expr_(expr) {} + + bool operator<(const Initializer& rhs) const; + + // It could be the object itself, or it could be the member + // that was initialized + Type* type_; + int offset_; + unsigned char bitFieldBegin_; + unsigned char bitFieldWidth_; + + Expr* expr_; +};
+ + +using InitList = std::set; + +class Declaration: public Stmt { + template friend class Evaluator; + friend class AddrEvaluator; + friend class Generator; + +public: + static Declaration* New(Object* obj); + virtual ~Declaration() {} + virtual void Accept(Visitor* v); + InitList& Inits() { return inits_; } + Object* Obj() { return obj_; } + void AddInit(Initializer init); + +protected: + Declaration(Object* obj): obj_(obj) {} + + Object* obj_; + InitList inits_; +};
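All of these node classes share one protocol: pool-backed static New() factories plus double dispatch through Accept(). A minimal sketch of that protocol (a hypothetical helper, not part of wgtcc; VisitIfStmt is declared on the visitor in code_gen.h further below):

#include "triton/lang/wgtcc/ast.h"

// Hypothetical: build "if (cond) ;" and hand it to a pass.
static Stmt* make_branch(Expr* cond, Visitor* pass) {
  IfStmt* s = IfStmt::New(cond, EmptyStmt::New()); // else branch defaults to nullptr
  s->Accept(pass);                                 // dispatches to pass->VisitIfStmt(s)
  return s;
}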
+ + +/* + * Expr + * BinaryOp + * UnaryOp + * ConditionalOp + * FuncCall + * Constant + * Identifier + * Object + * TempVar + */ + +class Expr : public Stmt { + template friend class Evaluator; + friend class AddrEvaluator; + friend class Generator; + friend class LValGenerator; + +public: + virtual ~Expr() {} + ::Type* Type() { return type_.GetPtr(); } + virtual bool IsLVal() = 0; + virtual void TypeChecking() = 0; + void EnsureCompatible(const QualType lhs, const QualType rhs) const; + void EnsureCompatibleOrVoidPointer(const QualType lhs, + const QualType rhs) const; + const Token* Tok() const { return tok_; } + void SetTok(const Token* tok) { tok_ = tok; } + + static Expr* MayCast(Expr* expr); + static Expr* MayCast(Expr* expr, QualType desType); + virtual bool IsNullPointerConstant() const { return false; } + bool IsConstQualified() const { return type_.IsConstQualified(); } + bool IsRestrictQualified() const { return type_.IsRestrictQualified(); } + bool IsVolatileQualified() const { return type_.IsVolatileQualified(); } + +protected: + // You can construct an expression without specifying a type, + // then the type should be evaluated in TypeChecking() + Expr(const Token* tok, QualType type): tok_(tok), type_(type) {} + + const Token* tok_; + QualType type_; +};
+ + +/* + * '+', '-', '*', '/', '%', '<', '>', '<<', '>>', '|', '&', '^' + * '=', (compound assignment operators are split into two operations) + * '==', '!=', '<=', '>=', + * '&&', '||' + * '[' (subscript operator), '.' (member operator) + * ',' (comma operator), + */ +class BinaryOp : public Expr { + template friend class Evaluator; + friend class AddrEvaluator; + friend class Generator; + friend class LValGenerator; + friend class Declaration; + +public: + static BinaryOp* New(const Token* tok, Expr* lhs, Expr* rhs); + static BinaryOp* New(const Token* tok, int op, Expr* lhs, Expr* rhs); + virtual ~BinaryOp() {} + virtual void Accept(Visitor* v); + + // Member ref operator is an lvalue + virtual bool IsLVal() { + switch (op_) { + case '.': return !Type()->ToArray() && lhs_->IsLVal(); + case ']': return !Type()->ToArray(); + default: return false; + } + } + ArithmType* Convert(); + void Broadcast(); + + virtual void TypeChecking(); + void SubScriptingOpTypeChecking(); + void MemberRefOpTypeChecking(); + void MultiOpTypeChecking(); + void AdditiveOpTypeChecking(); + void ShiftOpTypeChecking(); + void RangeOpTypeChecking(); + void RelationalOpTypeChecking(); + void EqualityOpTypeChecking(); + void BitwiseOpTypeChecking(); + void LogicalOpTypeChecking(); + void AssignOpTypeChecking(); + void CommaOpTypeChecking(); + +protected: + BinaryOp(const Token* tok, int op, Expr* lhs, Expr* rhs) + : Expr(tok, nullptr), op_(op) { + lhs_ = lhs, rhs_ = rhs; + if (op != '.') { + lhs_ = MayCast(lhs); + rhs_ = MayCast(rhs); + } + } + + int op_; + Expr* lhs_; + Expr* rhs_; +};
+ + +/* + * Unary Operator: + * '++' (prefix/postfix) + * '--' (prefix/postfix) + * '&' (ADDR) + * '*' (DEREF) + * '+' (PLUS) + * '-' (MINUS) + * '~' + * '!' + * CAST // like (int)3 + */ +class UnaryOp : public Expr { + template friend class Evaluator; + friend class AddrEvaluator; + friend class Generator; + friend class LValGenerator; + +public: + static UnaryOp* New(int op, Expr* operand, QualType type=nullptr); + virtual ~UnaryOp() {} + virtual void Accept(Visitor* v); + virtual bool IsLVal(); + ArithmType* Convert(); + void TypeChecking(); + void IncDecOpTypeChecking(); + void AddrOpTypeChecking(); + void DerefOpTypeChecking(); + void UnaryArithmOpTypeChecking(); + void CastOpTypeChecking(); + +protected: + UnaryOp(int op, Expr* operand, QualType type=nullptr) + : Expr(operand->Tok(), type), op_(op) { + operand_ = operand; + if (op_ != Token::CAST && op_ != Token::ADDR) { + operand_ = MayCast(operand); + } + } + + int op_; + Expr* operand_; +};
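BinaryOp and UnaryOp are what the parser instantiates for most expressions; note how the constructors route operands through MayCast() except for member access and address-of. A hypothetical construction for "lhs + rhs" (assuming tok points at the '+' token; the New() factories themselves live in ast.cc):

// Hypothetical: roughly the parser's work for "lhs + rhs".
static Expr* make_add(const Token* tok, Expr* lhs, Expr* rhs) {
  // Operands decay (array/function to pointer) via MayCast() in the ctor.
  return BinaryOp::New(tok, '+', lhs, rhs);
}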
+ + +// cond ? true : false +class ConditionalOp : public Expr { + template friend class Evaluator; + friend class AddrEvaluator; + friend class Generator; + +public: + static ConditionalOp* New(const Token* tok, + Expr* cond, Expr* exprTrue, Expr* exprFalse); + virtual ~ConditionalOp() {} + virtual void Accept(Visitor* v); + virtual bool IsLVal() { return false; } + ArithmType* Convert(); + virtual void TypeChecking(); + +protected: + ConditionalOp(Expr* cond, Expr* exprTrue, Expr* exprFalse) + : Expr(cond->Tok(), nullptr), cond_(MayCast(cond)), + exprTrue_(MayCast(exprTrue)), exprFalse_(MayCast(exprFalse)) {} + +private: + Expr* cond_; + Expr* exprTrue_; + Expr* exprFalse_; +};
+ + +class FuncCall : public Expr { + template friend class Evaluator; + friend class AddrEvaluator; + friend class Generator; + +public: + using ArgList = std::vector; + +public: + static FuncCall* New(Expr* designator, const ArgList& args); + ~FuncCall() {} + virtual void Accept(Visitor* v); + + // A function call is of course not an lvalue + virtual bool IsLVal() { return false; } + ArgList* Args() { return &args_; } + Expr* Designator() { return designator_; } + const std::string& Name() const { return tok_->str_; } + ::FuncType* FuncType() { return designator_->Type()->ToFunc(); } + virtual void TypeChecking(); + +protected: + FuncCall(Expr* designator, const ArgList& args) + : Expr(designator->Tok(), nullptr), + designator_(designator), args_(args) {} + + Expr* designator_; + ArgList args_; +};
+ + +class Constant: public Expr { + template friend class Evaluator; + friend class AddrEvaluator; + friend class Generator; + +public: + static Constant* New(const Token* tok, int tag, long val); + static Constant* New(const Token* tok, int tag, double val); + static Constant* New(const Token* tok, int tag, const std::string* val); + ~Constant() {} + virtual void Accept(Visitor* v); + virtual bool IsLVal() { return false; } + virtual void TypeChecking() {} + + long IVal() const { return ival_; } + double FVal() const { return fval_; } + const std::string* SVal() const { return sval_; } + std::string SValRepr() const; + std::string Repr() const { return std::string(".LC") + std::to_string(id_); } + +protected: + Constant(const Token* tok, QualType type, long val) + : Expr(tok, type), ival_(val) {} + Constant(const Token* tok, QualType type, double val) + : Expr(tok, type), fval_(val) {} + Constant(const Token* tok, QualType type, const std::string* val) + : Expr(tok, type), sval_(val) {} + + union { + long ival_; + double fval_; + struct { + long id_; + const std::string* sval_; + }; + }; +};
+ + +class TempVar : public Expr { + template friend class Evaluator; + friend class AddrEvaluator; + friend class Generator; + +public: + static TempVar* New(QualType type); + virtual ~TempVar() {} + virtual void Accept(Visitor* v); + virtual bool IsLVal() { return true; } + virtual void TypeChecking() {} + +protected: + TempVar(QualType type): Expr(nullptr, type), tag_(GenTag()) {} + +private: + static int GenTag() { + static int tag = 0; + return ++tag; + } + + int tag_; +};
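The Object class a little further down packs bit-field layout into two bytes and computes access masks purely arithmetically. A quick worked check of its BitFieldMask() formula, with illustrative values begin = 3, width = 5 (so end = 8):

// Worked check of ((~0UL << (64 - end)) >> (64 - width)) << begin:
// (0xFFFFFFFFFFFFFFFFUL << 56) >> 59 leaves five low ones (0x1F);
// shifting left by begin places them, so the mask is 0xF8, i.e. bits 3..7 set.
static_assert((((0xFFFFFFFFFFFFFFFFUL << 56) >> 59) << 3) == 0xF8UL, "bits 3..7");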
+ + +enum Linkage { + L_NONE, + L_EXTERNAL, + L_INTERNAL, +};
+ + +class Identifier: public Expr { + template friend class Evaluator; + friend class AddrEvaluator; + friend class Generator; + friend class LValGenerator; + +public: + static Identifier* New(const Token* tok, QualType type, Linkage linkage); + virtual ~Identifier() {} + virtual void Accept(Visitor* v); + virtual bool IsLVal() { return false; } + virtual Object* ToObject() { return nullptr; } + virtual Enumerator* ToEnumerator() { return nullptr; } + + // An identifier can be: + // object, struct/union/enum tag, typedef name, function, label. + Identifier* ToTypeName() { + // A typename has no linkage + // And a function has external or internal linkage + if (ToObject() || ToEnumerator() || linkage_ != L_NONE) + return nullptr; + return this; + } + virtual const std::string Name() const { return tok_->str_; } + enum Linkage Linkage() const { return linkage_; } + void SetLinkage(enum Linkage linkage) { linkage_ = linkage; } + virtual void TypeChecking() {} + +protected: + Identifier(const Token* tok, QualType type, enum Linkage linkage) + : Expr(tok, type), linkage_(linkage) {} + + // An identifier has a linkage property + enum Linkage linkage_; +};
+ + +class Enumerator: public Identifier { + template friend class Evaluator; + friend class AddrEvaluator; + friend class Generator; + +public: + static Enumerator* New(const Token* tok, int val); + virtual ~Enumerator() {} + virtual void Accept(Visitor* v); + virtual Enumerator* ToEnumerator() { return this; } + int Val() const { return cons_->IVal(); } + +protected: + Enumerator(const Token* tok, int val) + : Identifier(tok, ArithmType::New(T_INT), L_NONE), + cons_(Constant::New(tok, T_INT, (long)val)) {} + + Constant* cons_; +};
+ + +class Object : public Identifier { + template friend class Evaluator; + friend class AddrEvaluator; + friend class Generator; + friend class LValGenerator; + +public: + static Object* New(const Token* tok, + QualType type, + int storage=0, + enum Linkage linkage=L_NONE, + unsigned char bitFieldBegin=0, + unsigned char bitFieldWidth=0); + static Object* NewAnony(const Token* tok, + QualType type, + int storage=0, + enum Linkage linkage=L_NONE, + unsigned char bitFieldBegin=0, + unsigned char bitFieldWidth=0); + ~Object() {} + virtual void Accept(Visitor* v); + virtual Object* ToObject() { return this; } + virtual bool IsLVal() { + // TODO(wgtdkp): are there objects that are not lvalues? + return true; + } + bool IsStatic() const { + return (Storage() & S_STATIC) || (Linkage() != L_NONE); + } + int Storage() const { return storage_; } + void SetStorage(int storage) { storage_ = storage; } + int Align() const { return align_; } + void SetAlign(int align) { + assert(align > 0); + // Allow reducing alignment to implement __attribute__((packed)) + //if (align < align_) + // Error(this, "alignment specifier cannot reduce alignment"); + align_ = align; + } + int Offset() const { return offset_; } + void SetOffset(int offset) { offset_ = offset; } + Declaration* Decl() { return decl_; } + void SetDecl(Declaration* decl) { decl_ = decl; } + + unsigned char BitFieldBegin() const { return bitFieldBegin_; } + unsigned char BitFieldEnd() const { return bitFieldBegin_ + bitFieldWidth_; } + unsigned char BitFieldWidth() const { return bitFieldWidth_; } + static unsigned long BitFieldMask(Object* bitField) { + return BitFieldMask(bitField->bitFieldBegin_, bitField->bitFieldWidth_); + } + static unsigned long BitFieldMask(unsigned char begin, unsigned char width) { + auto end = begin + width; + return ((0xFFFFFFFFFFFFFFFFUL << (64 - end)) >> (64 - width)) << begin; + } + + bool HasInit() const { return decl_ && decl_->Inits().size(); } + bool Anonymous() const { return anonymous_; } + virtual const std::string Name() const { return Identifier::Name(); } + std::string Repr() const { + assert(IsStatic() || anonymous_); + if (anonymous_) + return "anonymous." + std::to_string(id_); + if (linkage_ == L_NONE) + return Name() + "."
+ std::to_string(id_); + return Name(); + } + +protected: + Object(const Token* tok, + QualType type, + int storage=0, + enum Linkage linkage=L_NONE, + unsigned char bitFieldBegin=0, + unsigned char bitFieldWidth=0) + : Identifier(tok, type, linkage), + storage_(storage), + offset_(0), + align_(type->Align()), + decl_(nullptr), + bitFieldBegin_(bitFieldBegin), + bitFieldWidth_(bitFieldWidth), + anonymous_(false) {} + +private: + int storage_; + int offset_; + int align_; + + Declaration* decl_; + + unsigned char bitFieldBegin_; + // 0 means it's not a bitfield + unsigned char bitFieldWidth_; + + bool anonymous_; + long id_ {0}; +}; + + +/* + * Declaration + */ + +class FuncDef : public ExtDecl { + template friend class Evaluator; + friend class AddrEvaluator; + friend class Generator; + +public: + using ParamList = std::vector; + +public: + static FuncDef* New(Identifier* ident, LabelStmt* retLabel); + virtual ~FuncDef() {} + virtual void Accept(Visitor* v); + ::FuncType* FuncType() { return ident_->Type()->ToFunc(); } + CompoundStmt* Body() { return body_; } + void SetBody(CompoundStmt* body) { body_ = body; } + std::string Name() const { return ident_->Name(); } + enum Linkage Linkage() { return ident_->Linkage(); } + +protected: + FuncDef(Identifier* ident, LabelStmt* retLabel) + : ident_(ident), retLabel_(retLabel) {} + +private: + Identifier* ident_; + LabelStmt* retLabel_; + CompoundStmt* body_; +}; + + +using ExtDeclList = std::list; + +class TranslationUnit : public ASTNode { + template friend class Evaluator; + friend class AddrEvaluator; + friend class Generator; + +public: + static TranslationUnit* New() { return new TranslationUnit();} + virtual ~TranslationUnit() {} + virtual void Accept(Visitor* v); + void Add(ExtDecl* extDecl) { extDecls_.push_back(extDecl); } + ExtDeclList& ExtDecls() { return extDecls_; } + const ExtDeclList& ExtDecls() const { return extDecls_; } + +private: + TranslationUnit() {} + + ExtDeclList extDecls_; +}; + +#endif diff --git a/include/triton/lang/wgtcc/code_gen.h b/include/triton/lang/wgtcc/code_gen.h new file mode 100644 index 000000000..31ed8fca9 --- /dev/null +++ b/include/triton/lang/wgtcc/code_gen.h @@ -0,0 +1,274 @@ +#ifndef _WGTCC_CODE_GEN_H_ +#define _WGTCC_CODE_GEN_H_ + +#include "ast.h" +#include "visitor.h" + + +class Parser; +struct Addr; +struct ROData; +template<> class Evaluator; +struct StaticInitializer; + +using TypeList = std::vector; +using LocationList = std::vector; +using RODataList = std::vector; +using StaticInitList = std::vector; + + +enum class ParamClass { + INTEGER, + SSE, + SSEUP, + X87, + X87_UP, + COMPLEX_X87, + NO_CLASS, + MEMORY +}; + +struct ParamLocations { + LocationList locs_; + size_t regCnt_; + size_t xregCnt_; +}; + +struct ROData { + ROData(long ival, int align): ival_(ival), align_(align) { + label_ = ".LC" + std::to_string(GenTag()); + } + + explicit ROData(const std::string& sval): sval_(sval), align_(1) { + label_ = ".LC" + std::to_string(GenTag()); + } + + ~ROData() {} + + std::string sval_; + long ival_; + int align_; + std::string label_; + +private: + static long GenTag() { + static long tag = 0; + return tag++; + } +}; + + +struct ObjectAddr { + explicit ObjectAddr(int offset) + : ObjectAddr("", "%rbp", offset) {} + + ObjectAddr(const std::string& label, const std::string& base, int offset) + : label_(label), base_(base), offset_(offset) {} + + std::string Repr() const; + + std::string label_; + std::string base_; + int offset_; + unsigned char bitFieldBegin_ {0}; + unsigned char bitFieldWidth_ 
{0}; +}; + + +struct StaticInitializer { + int offset_; + int width_; + long val_; + std::string label_; +}; + + +class Generator: public Visitor { + friend class Evaluator; +public: + Generator() {} + + virtual void Visit(ASTNode* node) { node->Accept(this); } + void VisitExpr(Expr* expr) { expr->Accept(this); } + void VisitStmt(Stmt* stmt) { stmt->Accept(this); } + + // Expression + virtual void VisitBinaryOp(BinaryOp* binaryOp); + virtual void VisitUnaryOp(UnaryOp* unaryOp); + virtual void VisitConditionalOp(ConditionalOp* condOp); + virtual void VisitFuncCall(FuncCall* funcCall); + virtual void VisitObject(Object* obj); + virtual void VisitEnumerator(Enumerator* enumer); + virtual void VisitIdentifier(Identifier* ident); + virtual void VisitConstant(Constant* cons); + virtual void VisitTempVar(TempVar* tempVar); + + // Statement + virtual void VisitDeclaration(Declaration* init); + virtual void VisitEmptyStmt(EmptyStmt* emptyStmt); + virtual void VisitIfStmt(IfStmt* ifStmt); + virtual void VisitJumpStmt(JumpStmt* jumpStmt); + virtual void VisitReturnStmt(ReturnStmt* returnStmt); + virtual void VisitLabelStmt(LabelStmt* labelStmt); + virtual void VisitCompoundStmt(CompoundStmt* compoundStmt); + + virtual void VisitFuncDef(FuncDef* funcDef); + virtual void VisitTranslationUnit(TranslationUnit* unit); + + + static void SetInOut(Parser* parser, FILE* outFile) { + parser_ = parser; + outFile_ = outFile; + } + + void Gen(); + +protected: + // Binary + void GenCommaOp(BinaryOp* comma); + void GenMemberRefOp(BinaryOp* binaryOp); + void GenAndOp(BinaryOp* binaryOp); + void GenOrOp(BinaryOp* binaryOp); + void GenAddOp(BinaryOp* binaryOp); + void GenSubOp(BinaryOp* binaryOp); + void GenAssignOp(BinaryOp* assign); + void GenCastOp(UnaryOp* cast); + void GenDerefOp(UnaryOp* deref); + void GenMinusOp(UnaryOp* minus); + void GenPointerArithm(BinaryOp* binary); + void GenDivOp(bool flt, bool sign, int width, int op); + void GenMulOp(int width, bool flt, bool sign); + void GenCompOp(int width, bool flt, const char* set); + void GenCompZero(Type* type); + + // Unary + void GenIncDec(Expr* operand, bool postfix, const std::string& inst); + + StaticInitializer GetStaticInit(InitList::iterator& iter, + InitList::iterator end, int offset); + + void GenStaticDecl(Declaration* decl); + + void GenSaveArea(); + void GenBuiltin(FuncCall* funcCall); + + void AllocObjects(Scope* scope, + const FuncDef::ParamList& params=FuncDef::ParamList()); + + void CopyStruct(ObjectAddr desAddr, int width); + + std::string ConsLabel(Constant* cons); + + ParamLocations GetParamLocations(const TypeList& types, bool retStruct); + void GetParamRegOffsets(int& gpOffset, int& fpOffset, + int& overflow, FuncType* funcType); + + void Emit(const std::string& str) { + fprintf(outFile_, "\t%s\n", str.c_str()); + } + + void Emit(const std::string& inst, + const std::string& src, + const std::string& des) { + Emit(inst + "\t" + src + ", " + des); + } + + void Emit(const std::string& inst, + int imm, + const std::string& reg) { + Emit(inst + "\t$" + std::to_string(imm) + ", " + reg); + } + + void Emit(const std::string& inst, + const std::string& des) { + Emit(inst + "\t" + des); + } + + void Emit(const std::string& inst, + const LabelStmt* label) { + Emit(inst + "\t" + label->Repr()); + } + + void Emit(const std::string& inst, + const ObjectAddr& src, + const ObjectAddr& des) { + Emit(inst, src.Repr(), des.Repr()); + } + + void Emit(const std::string& inst, + const std::string& src, + const ObjectAddr& des) { + Emit(inst, src, des.Repr()); 
+ } + + void Emit(const std::string& inst, + const ObjectAddr& src, + const std::string& des) { + Emit(inst, src.Repr(), des); + } + + void EmitLabel(const std::string& label); + void EmitZero(ObjectAddr addr, int width); + void EmitLoad(const std::string& addr, Type* type); + void EmitLoad(const std::string& addr, int width, bool flt); + void EmitStore(const ObjectAddr& addr, Type* type); + void EmitStore(const std::string& addr, Type* type); + void EmitStore(const std::string& addr, int width, bool flt); + void EmitLoadBitField(const std::string& addr, Object* bitField); + void EmitStoreBitField(const ObjectAddr& addr, Type* type); + void EmitLoc(Expr* expr); + + int Push(Type* type); + int Push(const std::string& reg); + int Pop(const std::string& reg); + + void Spill(bool flt); + + void Restore(bool flt); + + void Save(bool flt); + + void Exchange(bool flt); + +protected: + static const std::string* last_file; + static Parser* parser_; + static FILE* outFile_; + static RODataList rodatas_; + static int offset_; + + // The address that store the register %rdi, + // when the return value is a struct/union + static int retAddrOffset_; + static FuncDef* curFunc_; + + static std::vector staticDecls_; +}; + + +class LValGenerator: public Generator { +public: + LValGenerator() {} + + // Expression + virtual void VisitBinaryOp(BinaryOp* binaryOp); + virtual void VisitUnaryOp(UnaryOp* unaryOp); + virtual void VisitObject(Object* obj); + virtual void VisitIdentifier(Identifier* ident); + + virtual void VisitConditionalOp(ConditionalOp* condOp) { assert(false); } + virtual void VisitFuncCall(FuncCall* funcCall) { assert(false); } + virtual void VisitEnumerator(Enumerator* enumer) { assert(false); } + virtual void VisitConstant(Constant* cons) { assert(false); } + virtual void VisitTempVar(TempVar* tempVar); + + ObjectAddr GenExpr(Expr* expr) { + expr->Accept(this); + return addr_; + } + +private: + ObjectAddr addr_ {"", "", 0}; +}; + +#endif diff --git a/include/triton/lang/wgtcc/cpp.h b/include/triton/lang/wgtcc/cpp.h new file mode 100644 index 000000000..5f7a296c1 --- /dev/null +++ b/include/triton/lang/wgtcc/cpp.h @@ -0,0 +1,162 @@ +#ifndef _WGTCC_CPP_H_ +#define _WGTCC_CPP_H_ + +#include "scanner.h" + +#include +#include +#include +#include +#include +#include + +class Macro; +struct CondDirective; + +using MacroMap = std::map; +using ParamList = std::list; +using ParamMap = std::map; +using PPCondStack = std::stack; +using PathList = std::list; + + +class Macro { +public: + Macro(const TokenSequence& repSeq, bool preDef=false) + : funcLike_(false), variadic_(false), + preDef_(preDef), repSeq_(repSeq) {} + + Macro(bool variadic, ParamList& params, + TokenSequence& repSeq, bool preDef=false) + : funcLike_(true), variadic_(variadic), preDef_(preDef), + params_(params), repSeq_(repSeq) {} + + ~Macro() {} + bool FuncLike() { return funcLike_; } + bool ObjLike() { return !FuncLike(); } + bool Variadic() { return variadic_; } + bool PreDef() { return preDef_; } + ParamList& Params() { return params_; } + TokenSequence RepSeq(const std::string* filename, unsigned line); + +private: + bool funcLike_; + bool variadic_; + bool preDef_; + ParamList params_; + TokenSequence repSeq_; +}; + + +struct CondDirective { + int tag_; + bool enabled_; + bool cond_; +}; + + +class Preprocessor { +public: + Preprocessor(const std::string* str, bool isSrc = true) + : curLine_(1), lineLine_(0), curCond_(true), fName_(nullptr), fSrc_(nullptr) { + if(isSrc) + fSrc_ = str; + else + fName_ = str; + // Add predefined + 
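+    // Init() is expected to register the predefined macros (e.g. __FILE__ and
+    // __LINE__, which HandleTheFileMacro()/HandleTheLineMacro() below expand
+    // on use) and the default include search paths; see its definition.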
Init(); + } + + + ~Preprocessor() {} + void Finalize(TokenSequence os); + void Process(TokenSequence& os); + void Expand(TokenSequence& os, TokenSequence is, bool inCond=false); + void Subst(TokenSequence& os, TokenSequence is, + bool leadingWS, const HideSet& hs, ParamMap& params); + void Glue(TokenSequence& os, TokenSequence is); + void Glue(TokenSequence& os, const Token* tok); + const Token* Stringize(TokenSequence is); + void Stringize(std::string& str, TokenSequence is); + const Token* ParseActualParam(TokenSequence& is, Macro* macro, ParamMap& paramMap); + int GetDirective(TokenSequence& is); + const Token* EvalDefOp(TokenSequence& is); + void ReplaceIdent(TokenSequence& is); + void ParseDirective(TokenSequence& os, TokenSequence& is, int directive); + void ParseIf(TokenSequence ls); + void ParseIfdef(TokenSequence ls); + void ParseIfndef(TokenSequence ls); + void ParseElif(TokenSequence ls); + void ParseElse(TokenSequence ls); + void ParseEndif(TokenSequence ls); + void ParseInclude(TokenSequence& is, TokenSequence ls); + void ParseDef(TokenSequence ls); + void ParseUndef(TokenSequence ls); + void ParseLine(TokenSequence ls); + void ParseError(TokenSequence ls); + void ParsePragma(TokenSequence ls); + void IncludeSrc(TokenSequence& is, const std::string* text, const std::string* filename); + void IncludeFile(TokenSequence& is, const std::string* filename); + bool ParseIdentList(ParamList& params, TokenSequence& is); + + + Macro* FindMacro(const std::string& name) { + auto res = macroMap_.find(name); + if (res == macroMap_.end()) + return nullptr; + return &res->second; + } + + void AddMacro(const std::string& name, + std::string* text, bool preDef=false); + + void AddMacro(const std::string& name, const Macro& macro) { + auto res = macroMap_.find(name); + if (res != macroMap_.end()) { + // TODO(wgtdkp): give warning + macroMap_.erase(res); + } + macroMap_.insert(std::make_pair(name, macro)); + } + + void RemoveMacro(const std::string& name) { + auto res = macroMap_.find(name); + if (res == macroMap_.end()) + return; + if(res->second.PreDef()) // Cannot undef predefined macro + return; + macroMap_.erase(res); + } + + std::string* SearchFile(const std::string& name, + const bool libHeader, + bool next, + const std::string& curPath); + + void AddSearchPath(std::string path); + void HandleTheFileMacro(TokenSequence& os, const Token* macro); + void HandleTheLineMacro(TokenSequence& os, const Token* macro); + void UpdateFirstTokenLine(TokenSequence ts); + + bool NeedExpand() const { + if (ppCondStack_.empty()) + return true; + auto top = ppCondStack_.top(); + return top.enabled_ && top.cond_; + } + +private: + void Init(); + + PPCondStack ppCondStack_; + unsigned curLine_; + unsigned lineLine_; + bool curCond_; + + MacroMap macroMap_; + PathList searchPaths_; + const std::string* fName_; + const std::string* fSrc_; +}; + +#endif diff --git a/include/triton/lang/wgtcc/encoding.h b/include/triton/lang/wgtcc/encoding.h new file mode 100644 index 000000000..9d6c1e544 --- /dev/null +++ b/include/triton/lang/wgtcc/encoding.h @@ -0,0 +1,20 @@ +#ifndef _WGTCC_ENCODING_H_ +#define _WGTCC_ENCODING_H_ + +#include + + +enum class Encoding { + NONE, + CHAR16, + CHAR32, + UTF8, + WCHAR +}; + + +void ConvertToUTF16(std::string& str); +void ConvertToUTF32(std::string& str); +void AppendUCN(std::string& str, int c); + +#endif diff --git a/include/triton/lang/wgtcc/error.h b/include/triton/lang/wgtcc/error.h new file mode 100644 index 000000000..fdae7e060 --- /dev/null +++ 
b/include/triton/lang/wgtcc/error.h @@ -0,0 +1,15 @@ +#ifndef _WGTCC_ERROR_H_ +#define _WGTCC_ERROR_H_ + + +struct SourceLocation; +class Token; +class Expr; + + +[[noreturn]] void Error(const char* format, ...); +[[noreturn]] void Error(const SourceLocation& loc, const char* format, ...); +[[noreturn]] void Error(const Token* tok, const char* format, ...); +[[noreturn]] void Error(const Expr* expr, const char* format, ...); + +#endif diff --git a/include/triton/lang/wgtcc/evaluator.h b/include/triton/lang/wgtcc/evaluator.h new file mode 100644 index 000000000..620539169 --- /dev/null +++ b/include/triton/lang/wgtcc/evaluator.h @@ -0,0 +1,120 @@ +#ifndef _WGTCC_EVALUATOR_H_ +#define _WGTCC_EVALUATOR_H_ + +#include "ast.h" +#include "error.h" +#include "visitor.h" + + +class Expr; + +template +class Evaluator: public Visitor { +public: + Evaluator() {} + + virtual ~Evaluator() {} + + virtual void VisitBinaryOp(BinaryOp* binary); + virtual void VisitUnaryOp(UnaryOp* unary); + virtual void VisitConditionalOp(ConditionalOp* cond); + + virtual void VisitFuncCall(FuncCall* funcCall) { + Error(funcCall, "expect constant expression"); + } + virtual void VisitEnumerator(Enumerator* enumer) { + val_ = static_cast(enumer->Val()); + } + virtual void VisitIdentifier(Identifier* ident) { + Error(ident, "expect constant expression"); + } + virtual void VisitObject(Object* obj) { + Error(obj, "expect constant expression"); + } + virtual void VisitConstant(Constant* cons) { + if (cons->Type()->IsFloat()) { + val_ = static_cast(cons->FVal()); + } else if (cons->Type()->IsInteger()) { + val_ = static_cast(cons->IVal()); + } else { + assert(false); + } + } + virtual void VisitTempVar(TempVar* tempVar) { assert(false); } + + // We may should assert here + virtual void VisitDeclaration(Declaration* init) {} + virtual void VisitIfStmt(IfStmt* ifStmt) {} + virtual void VisitJumpStmt(JumpStmt* jumpStmt) {} + virtual void VisitReturnStmt(ReturnStmt* returnStmt) {} + virtual void VisitLabelStmt(LabelStmt* labelStmt) {} + virtual void VisitEmptyStmt(EmptyStmt* emptyStmt) {} + virtual void VisitCompoundStmt(CompoundStmt* compStmt) {} + virtual void VisitFuncDef(FuncDef* funcDef) {} + virtual void VisitTranslationUnit(TranslationUnit* unit) {} + + T Eval(Expr* expr) { + expr->Accept(this); + return val_; + } + +private: + T val_; +}; + + +struct Addr { + std::string label_; + int offset_; +}; + +template<> +class Evaluator: public Visitor { +public: + Evaluator() {} + virtual ~Evaluator() {} + virtual void VisitBinaryOp(BinaryOp* binary); + virtual void VisitUnaryOp(UnaryOp* unary); + virtual void VisitConditionalOp(ConditionalOp* cond); + + virtual void VisitFuncCall(FuncCall* funcCall) { + Error(funcCall, "expect constant expression"); + } + virtual void VisitEnumerator(Enumerator* enumer) { + addr_.offset_ = enumer->Val(); + } + virtual void VisitIdentifier(Identifier* ident) { + addr_.label_ = ident->Name(); + addr_.offset_ = 0; + } + virtual void VisitObject(Object* obj) { + if (!obj->IsStatic()) { + Error(obj, "expect static object"); + } + addr_.label_ = obj->Repr(); + addr_.offset_ = 0; + } + virtual void VisitConstant(Constant* cons); + virtual void VisitTempVar(TempVar* tempVar) { assert(false); } + + // We may should assert here + virtual void VisitDeclaration(Declaration* init) {} + virtual void VisitIfStmt(IfStmt* ifStmt) {} + virtual void VisitJumpStmt(JumpStmt* jumpStmt) {} + virtual void VisitReturnStmt(ReturnStmt* returnStmt) {} + virtual void VisitLabelStmt(LabelStmt* labelStmt) {} + virtual void 
VisitEmptyStmt(EmptyStmt* emptyStmt) {}
+  virtual void VisitCompoundStmt(CompoundStmt* compStmt) {}
+  virtual void VisitFuncDef(FuncDef* funcDef) {}
+  virtual void VisitTranslationUnit(TranslationUnit* unit) {}
+
+  Addr Eval(Expr* expr) {
+    expr->Accept(this);
+    return addr_;
+  }
+
+private:
+  Addr addr_;
+};
+
+#endif
diff --git a/include/triton/lang/wgtcc/mem_pool.h b/include/triton/lang/wgtcc/mem_pool.h
new file mode 100644
index 000000000..217237784
--- /dev/null
+++ b/include/triton/lang/wgtcc/mem_pool.h
@@ -0,0 +1,101 @@
+#ifndef _WGTCC_MEM_POOL_H_
+#define _WGTCC_MEM_POOL_H_
+
+#include <cstddef>
+#include <vector>
+
+
+class MemPool {
+public:
+  MemPool(): allocated_(0) {}
+  virtual ~MemPool() {}
+  MemPool(const MemPool& other) = delete;
+  MemPool& operator=(const MemPool& other) = delete;
+  virtual void* Alloc() = 0;
+  virtual void Free(void* addr) = 0;
+  virtual void Clear() = 0;
+
+protected:
+  size_t allocated_;
+};
+
+
+template <typename T>
+class MemPoolImp: public MemPool {
+public:
+  MemPoolImp() : root_(nullptr) {}
+  virtual ~MemPoolImp() {}
+  MemPoolImp(const MemPool& other) = delete;
+  MemPoolImp& operator=(MemPool& other) = delete;
+  virtual void* Alloc();
+  virtual void Free(void* addr);
+  virtual void Clear();
+
+private:
+  enum {
+    COUNT = (4 * 1024) / sizeof(T)
+  };
+
+  union Chunk {
+    Chunk* next_;
+    char mem_[sizeof(T)];
+  };
+
+  struct Block {
+    Block() {
+      for (size_t i = 0; i < COUNT - 1; ++i)
+        chunks_[i].next_ = &chunks_[i+1];
+      chunks_[COUNT-1].next_ = nullptr;
+    }
+    Chunk chunks_[COUNT];
+  };
+
+  std::vector<Block*> blocks_;
+  Chunk* root_;
+};
+
+
+template <typename T>
+void* MemPoolImp<T>::Alloc() {
+  if (nullptr == root_) { // Out of space; allocate a new block
+    auto block = new Block();
+    root_ = block->chunks_;
+    // If blocks_ were implemented as a std::list, push_back would have a
+    // larger actual overhead. This suggests that even when we do not need
+    // random access (so std::vector's copying is pure overhead), we still
+    // prefer std::vector, though its exponential capacity growth wastes memory.
+    blocks_.push_back(block);
+  }
+
+  auto ret = root_;
+  root_ = root_->next_;
+
+  ++allocated_;
+  return ret;
+}
+
+
+template <typename T>
+void MemPoolImp<T>::Free(void* addr) {
+  if (nullptr == addr)
+    return;
+
+  auto chunk = static_cast<Chunk*>(addr);
+  chunk->next_ = root_;
+  root_ = chunk;
+
+  --allocated_;
+}
+
+
+template <typename T>
+void MemPoolImp<T>::Clear() {
+  for (auto block: blocks_)
+    delete block;
+
+  blocks_.resize(0);
+  root_ = nullptr;
+  allocated_ = 0;
+}
+
+#endif
diff --git a/include/triton/lang/wgtcc/parser.h b/include/triton/lang/wgtcc/parser.h
new file mode 100644
index 000000000..c1de92491
--- /dev/null
+++ b/include/triton/lang/wgtcc/parser.h
@@ -0,0 +1,244 @@
+#ifndef _PARSER_H_
+#define _PARSER_H_
+
+#include "ast.h"
+#include "encoding.h"
+#include "error.h"
+#include "mem_pool.h"
+#include "scope.h"
+#include "token.h"
+
+#include <cassert>
+#include <memory>
+#include <stack>
+
+
+class Preprocessor;
+using TokenTypePair = std::pair<const Token*, QualType>;
+
+class Parser {
+  using LiteralList = std::vector<Constant*>;
+  using StaticObjectList = std::vector<Object*>;
+  using CaseLabelList = std::vector<std::pair<int, LabelStmt*>>;
+  using LabelJumpList = std::list<std::pair<const Token*, JumpStmt*>>;
+  using LabelMap = std::map<std::string, LabelStmt*>;
+  friend class Generator;
+
+public:
+  explicit Parser(const TokenSequence& ts)
+      : unit_(TranslationUnit::New()),
+        ts_(ts),
+        externalSymbols_(new Scope(nullptr, S_BLOCK)),
+        errTok_(nullptr),
+        curScope_(new Scope(nullptr, S_FILE)),
+        curFunc_(nullptr),
+        breakDest_(nullptr),
+        continueDest_(nullptr),
+        caseLabels_(nullptr),
+        defaultLabel_(nullptr) {
+    ts_.SetParser(this);
+  }
+
+  ~Parser() {}
+
+  Constant* ParseConstant(const Token* tok);
+  Constant* ParseFloat(const Token* tok);
+  Constant* ParseInteger(const Token* tok);
+  Constant* ParseCharacter(const
Token* tok); + Encoding ParseLiteral(std::string& str, const Token* tok); + Constant* ConcatLiterals(const Token* tok); + Expr* ParseGeneric(); + + void Parse(); + void ParseTranslationUnit(); + FuncDef* ParseFuncDef(Identifier* ident); + + + // Expressions + Expr* ParseExpr(); + Expr* ParsePrimaryExpr(); + QualType TryCompoundLiteral(); + Object* ParseCompoundLiteral(QualType type); + Expr* ParsePostfixExpr(); + Expr* ParsePostfixExprTail(Expr* primExpr); + Expr* ParseSubScripting(Expr* pointer); + BinaryOp* ParseMemberRef(const Token* tok, int op, Expr* lhs); + UnaryOp* ParsePostfixIncDec(const Token* tok, Expr* operand); + FuncCall* ParseFuncCall(Expr* caller); + + Expr* ParseUnaryExpr(); + Constant* ParseSizeof(); + Constant* ParseAlignof(); + UnaryOp* ParsePrefixIncDec(const Token* tok); + UnaryOp* ParseUnaryOp(const Token* tok, int op); + + QualType ParseTypeName(); + Expr* ParseCastExpr(); + Expr* ParseRangeExpr(); + Expr* ParseMultiplicativeExpr(); + Expr* ParseAdditiveExpr(); + Expr* ParseShiftExpr(); + Expr* ParseRelationalExpr(); + Expr* ParseEqualityExpr(); + Expr* ParseBitiwiseAndExpr(); + Expr* ParseBitwiseXorExpr(); + Expr* ParseBitwiseOrExpr(); + Expr* ParseLogicalAndExpr(); + Expr* ParseLogicalOrExpr(); + Expr* ParseConditionalExpr(); + Expr* ParseCommaExpr(); + Expr* ParseAssignExpr(); + + // Declarations + CompoundStmt* ParseDecl(); + void ParseStaticAssert(); + QualType ParseDeclSpec(int* storageSpec, int* funcSpec, int* alignSpec); + QualType ParseSpecQual(); + int ParseAlignas(); + Type* ParseStructUnionSpec(bool isStruct); + StructType* ParseStructUnionDecl(StructType* type); + void ParseBitField(StructType* structType, const Token* tok, QualType type); + Type* ParseEnumSpec(); + Type* ParseEnumerator(ArithmType* type); + int ParseQual(); + QualType ParsePointer(QualType typePointedTo); + TokenTypePair ParseDeclarator(QualType type); + QualType ParseArrayFuncDeclarator(const Token* ident, QualType base); + int ParseArrayLength(); + TileType::ShapeInt ParseTileShape(); + bool ParseParamList(FuncType::ParamList& params); + Object* ParseParamDecl(); + + QualType ParseAbstractDeclarator(QualType type); + Identifier* ParseDirectDeclarator(QualType type, + int storageSpec, + int funcSpec, + int align); + // Initializer + void ParseInitializer(Declaration* decl, + QualType type, + int offset, + bool designated=false, + bool forceBrace=false, + unsigned char bitFieldBegin=0, + unsigned char bitFieldWidth=0); + void ParseArrayInitializer(Declaration* decl, + ArrayType* type, + int offset, + bool designated); + StructType::Iterator ParseStructDesignator(StructType* type, + const std::string& name); + void ParseStructInitializer(Declaration* decl, + StructType* type, + int offset, + bool designated); + bool ParseLiteralInitializer(Declaration* init, + ArrayType* type, + int offset); + Declaration* ParseInitDeclarator(Identifier* ident); + Declaration* ParseInitDeclaratorSub(Object* obj); + + // Statements + Stmt* ParseStmt(); + CompoundStmt* ParseCompoundStmt(FuncType* funcType=nullptr); + IfStmt* ParseIfStmt(); + CompoundStmt* ParseSwitchStmt(); + CompoundStmt* ParseWhileStmt(); + CompoundStmt* ParseDoStmt(); + CompoundStmt* ParseForStmt(); + JumpStmt* ParseGotoStmt(); + JumpStmt* ParseContinueStmt(); + JumpStmt* ParseBreakStmt(); + ReturnStmt* ParseReturnStmt(); + CompoundStmt* ParseLabelStmt(const Token* label); + CompoundStmt* ParseCaseStmt(); + CompoundStmt* ParseDefaultStmt(); + Identifier* ProcessDeclarator(const Token* tok, + QualType type, + int storageSpec, + int 
funcSpec, + int align); + // GNU extensions + void TryAttributeSpecList(); + void ParseAttributeSpec(); + void ParseAttribute(); + bool IsTypeName(const Token* tok) const{ + if (tok->IsTypeSpecQual()) + return true; + + if (tok->IsIdentifier()) { + auto ident = curScope_->Find(tok); + if (ident && ident->ToTypeName()) + return true; + } + return false; + } + bool IsType(const Token* tok) const{ + if (tok->IsDecl()) + return true; + + if (tok->IsIdentifier()) { + auto ident = curScope_->Find(tok); + return (ident && ident->ToTypeName()); + } + + return false; + } + void EnsureInteger(Expr* expr) { + if (!expr->Type()->IsInteger()) { + Error(expr, "expect integer expression"); + } + } + + void EnterBlock(FuncType* funcType=nullptr); + void ExitBlock() { curScope_ = curScope_->Parent(); } + void EnterProto() { curScope_ = new Scope(curScope_, S_PROTO); } + void ExitProto() { curScope_ = curScope_->Parent(); } + FuncDef* EnterFunc(Identifier* ident); + void ExitFunc(); + + LabelStmt* FindLabel(const std::string& label) { + auto ret = curLabels_.find(label); + if (curLabels_.end() == ret) + return nullptr; + return ret->second; + } + void AddLabel(const std::string& label, LabelStmt* labelStmt) { + assert(nullptr == FindLabel(label)); + curLabels_[label] = labelStmt; + } + TranslationUnit* Unit() { return unit_; } + FuncDef* CurFunc() { return curFunc_; } + const TokenSequence& ts() const { return ts_; } + +private: + static bool IsBuiltin(FuncType* type); + static bool IsBuiltin(const std::string& name); + static Identifier* GetBuiltin(const Token* tok); + static void DefineBuiltins(); + + static FuncType* vaStartType_; + static FuncType* vaArgType_; + + // The root of the AST + TranslationUnit* unit_; + + TokenSequence ts_; + + // It is not the real scope, + // It contains all external symbols(resolved and not resolved) + Scope* externalSymbols_; + + const Token* errTok_; + Scope* curScope_; + FuncDef* curFunc_; + LabelMap curLabels_; + LabelJumpList unresolvedJumps_; + + LabelStmt* breakDest_; + LabelStmt* continueDest_; + CaseLabelList* caseLabels_; + LabelStmt* defaultLabel_; +}; + +#endif diff --git a/include/triton/lang/wgtcc/scanner.h b/include/triton/lang/wgtcc/scanner.h new file mode 100644 index 000000000..aee010638 --- /dev/null +++ b/include/triton/lang/wgtcc/scanner.h @@ -0,0 +1,84 @@ +#ifndef _WGTCC_SCANNER_H_ +#define _WGTCC_SCANNER_H_ + +#include "error.h" +#include "encoding.h" +#include "token.h" + +#include +#include + + +class Scanner { +public: + explicit Scanner(const Token* tok) + : Scanner(&tok->str_, tok->loc_) {} + Scanner(const std::string* text, const SourceLocation& loc) + : Scanner(text, loc.filename_, loc.line_, loc.column_) {} + explicit Scanner(const std::string* text, + const std::string* filename=nullptr, + unsigned line=1, unsigned column=1) + : text_(text), tok_(Token::END) { + // TODO(wgtdkp): initialization + p_ = &(*text_)[0]; + loc_ = {filename, p_, line, 1}; + } + + virtual ~Scanner() {} + Scanner(const Scanner& other) = delete; + Scanner& operator=(const Scanner& other) = delete; + + // Scan plain text and generate tokens in ts. + // The param 'ts' need not be empty, if so, the tokens + // are inserted at the *header* of 'ts'. + // The param 'ws' tells if there is leading white space + // before this token, it is only SkipComment() that will + // set this param. 
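+  // A minimal driver, assuming the usual contract of Tokenize() below
+  // (illustrative only, not part of this header):
+  //   std::string src = "int x = 0;";
+  //   Scanner scanner(&src);
+  //   TokenSequence ts;
+  //   scanner.Tokenize(ts);  // scans the whole buffer into ts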
+ Token* Scan(bool ws=false); + void Tokenize(TokenSequence& ts); + static std::string ScanHeadName(const Token* lhs, const Token* rhs); + Encoding ScanCharacter(int& val); + Encoding ScanLiteral(std::string& val); + std::string ScanIdentifier(); + +private: + Token* SkipIdentifier(); + Token* SkipNumber(); + Token* SkipLiteral(); + Token* SkipCharacter(); + Token* MakeToken(int tag); + Token* MakeNewLine(); + Encoding ScanEncoding(int c); + int ScanEscaped(); + int ScanHexEscaped(); + int ScanOctEscaped(int c); + int ScanUCN(int len); + void SkipWhiteSpace(); + void SkipComment(); + bool IsUCN(int c) { return c == '\\' && (Test('u') || Test('U')); } + bool IsOctal(int c) { return '0' <= c && c <= '7'; } + int XDigit(int c); + bool Empty() const { return *p_ == 0; } + int Peek(); + bool Test(int c) { return Peek() == c; }; + int Next(); + void PutBack(); + bool Try(int c) { + if (Peek() == c) { + Next(); + return true; + } + return false; + }; + void Mark() { tok_.loc_ = loc_; }; + + const std::string* text_; + SourceLocation loc_; + Token tok_; + const char* p_; +}; + + +std::string* ReadFile(const std::string& filename); + +#endif diff --git a/include/triton/lang/wgtcc/scope.h b/include/triton/lang/wgtcc/scope.h new file mode 100644 index 000000000..eea115bfb --- /dev/null +++ b/include/triton/lang/wgtcc/scope.h @@ -0,0 +1,70 @@ +#ifndef _WGTCC_SCOPE_H_ +#define _WGTCC_SCOPE_H_ + +#include +#include +#include +#include + + +class Identifier; +class Token; + + +enum ScopeType { + S_FILE, + S_PROTO, + S_BLOCK, + S_FUNC, +}; + + +class Scope { + friend class StructType; + using TagList = std::vector; + using IdentMap = std::map; + +public: + explicit Scope(Scope* parent, enum ScopeType type) + : parent_(parent), type_(type) {} + ~Scope() {} + Scope* Parent() { return parent_; } + void SetParent(Scope* parent) { parent_ = parent; } + enum ScopeType Type() const { return type_; } + + Identifier* Find(const Token* tok); + Identifier* FindInCurScope(const Token* tok); + Identifier* FindTag(const Token* tok); + Identifier* FindTagInCurScope(const Token* tok); + TagList AllTagsInCurScope() const; + + void Insert(Identifier* ident); + void Insert(const std::string& name, Identifier* ident); + void InsertTag(Identifier* ident); + void Print(); + bool operator==(const Scope& other) const { return type_ == other.type_; } + IdentMap::iterator begin() { return identMap_.begin(); } + IdentMap::iterator end() { return identMap_.end(); } + size_t size() const { return identMap_.size(); } + +private: + Identifier* Find(const std::string& name); + Identifier* FindInCurScope(const std::string& name); + Identifier* FindTag(const std::string& name); + Identifier* FindTagInCurScope(const std::string& name); + std::string TagName(const std::string& name) { + return name + "@:tag"; + } + static bool IsTagName(const std::string& name) { + return name.size() > 5 && name[name.size() - 5] == '@'; + } + const Scope& operator=(const Scope& other); + Scope(const Scope& scope); + + Scope* parent_; + enum ScopeType type_; + + IdentMap identMap_; +}; + +#endif diff --git a/include/triton/lang/wgtcc/token.h b/include/triton/lang/wgtcc/token.h new file mode 100644 index 000000000..391507f80 --- /dev/null +++ b/include/triton/lang/wgtcc/token.h @@ -0,0 +1,418 @@ +#ifndef _WGTCC_TOKEN_H_ +#define _WGTCC_TOKEN_H_ + +#include "error.h" + +#include +#include +#include +#include +#include +#include +#include + + +class Generator; +class Parser; +class Scanner; +class Token; +class TokenSequence; + +using HideSet = std::set; +using 
TokenList = std::list; + + +struct SourceLocation { + const std::string* filename_; + const char* lineBegin_; + unsigned line_; + unsigned column_; + + const char* Begin() const { + return lineBegin_ + column_ - 1; + } +}; + + +class Token { + friend class Scanner; +public: + enum { + // Punctuators + LPAR = '(', + RPAR = ')', + LSQB = '[', + RSQB = ']', + COLON = ':', + COMMA = ',', + SEMI = ';', + ADD = '+', + SUB = '-', + MUL = '*', + DIV = '/', + OR = '|', + AND = '&', + XOR = '^', + LESS = '<', + GREATER = '>', + EQUAL = '=', + DOT = '.', + MOD = '%', + LBRACE = '{', + RBRACE = '}', + TILDE = '~', + NOT = '!', + COND = '?', + SHARP = '#', + AT = '@', + NEW_LINE = '\n', + + DSHARP = 128, // '##' + PTR, + INC, + DEC, + LEFT, + RIGHT, + LE, + GE, + EQ, + NE, + LOGICAL_AND, + LOGICAL_OR, + + MUL_ASSIGN, + DIV_ASSIGN, + MOD_ASSIGN, + ADD_ASSIGN, + SUB_ASSIGN, + LEFT_ASSIGN, + RIGHT_ASSIGN, + AND_ASSIGN, + XOR_ASSIGN, + OR_ASSIGN, + + ELLIPSIS, + // Punctuators end + + // KEYWORD BEGIN + // TYPE QUALIFIER BEGIN + CONST, + RESTRICT, + VOLATILE, + ATOMIC, + // TYPE QUALIFIER END + + // TYPE SPECIFIER BEGIN + VOID, + CHAR, + SHORT, + INT, + LONG, + HALF, + FLOAT, + DOUBLE, + SIGNED, + UNSIGNED, + BOOL, // _Bool + COMPLEX, // _Complex + STRUCT, + UNION, + ENUM, + // TYPE SPECIFIER END + + ATTRIBUTE, // GNU extension __attribute__ + // FUNCTION SPECIFIER BEGIN + INLINE, + NORETURN, // _Noreturn + // FUNCTION SPECIFIER END + + ALIGNAS, // _Alignas + // For syntactic convenience + STATIC_ASSERT, // _Static_assert + // STORAGE CLASS SPECIFIER BEGIN + TYPEDEF, + EXTERN, + STATIC, + THREAD, // _Thread_local + AUTO, + REGISTER, + + // STORAGE CLASS SPECIFIER END + BREAK, + CASE, + CONTINUE, + DEFAULT, + DO, + ELSE, + FOR, + GOTO, + IF, + RETURN, + SIZEOF, + SWITCH, + WHILE, + ALIGNOF, // _Alignof + GENERIC, // _Generic + IMAGINARY, // _Imaginary + // KEYWORD END + + IDENTIFIER, + CONSTANT, + I_CONSTANT, + C_CONSTANT, + F_CONSTANT, + LITERAL, + + // For the parser, a identifier is a typedef name or user defined type + POSTFIX_INC, + POSTFIX_DEC, + PREFIX_INC, + PREFIX_DEC, + ADDR, // '&' + DEREF, // '*' + PLUS, + MINUS, + CAST, + + // For preprocessor + PP_IF, + PP_IFDEF, + PP_IFNDEF, + PP_ELIF, + PP_ELSE, + PP_ENDIF, + PP_INCLUDE, + PP_DEFINE, + PP_UNDEF, + PP_LINE, + PP_ERROR, + PP_PRAGMA, + PP_NONE, + PP_EMPTY, + + + IGNORE, + INVALID, + END, + NOTOK = -1, + }; + + static Token* New(int tag); + static Token* New(const Token& other); + static Token* New(int tag, + const SourceLocation& loc, + const std::string& str, + bool ws=false); + Token& operator=(const Token& other) { + tag_ = other.tag_; + ws_ = other.ws_; + loc_ = other.loc_; + str_ = other.str_; + hs_ = other.hs_ ? new HideSet(*other.hs_): nullptr; + return *this; + } + virtual ~Token() {} + + // Token::NOTOK represents not a kw. 
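+  // Tags are laid out so that classification is a range check: keywords run
+  // from CONST up to (but not including) IDENTIFIER, punctuators from 0 to
+  // ELLIPSIS (see IsKeyWord()/IsPunctuator() below). KeyWordTag() maps a
+  // lexeme to its tag, e.g.
+  //   Token::KeyWordTag("while") == Token::WHILE
+  //   Token::KeyWordTag("foo")   == Token::NOTOK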
+  static int KeyWordTag(const std::string& key) {
+    auto kwIter = kwTypeMap_.find(key);
+    if (kwTypeMap_.end() == kwIter)
+      return Token::NOTOK; // Not a keyword
+    return kwIter->second;
+  }
+  static bool IsKeyWord(const std::string& name);
+  static bool IsKeyWord(int tag) { return CONST <= tag && tag < IDENTIFIER; }
+  bool IsKeyWord() const { return IsKeyWord(tag_); }
+  bool IsPunctuator() const { return 0 <= tag_ && tag_ <= ELLIPSIS; }
+  bool IsLiteral() const { return tag_ == LITERAL; }
+  bool IsConstant() const { return CONSTANT <= tag_ && tag_ <= F_CONSTANT; }
+  bool IsIdentifier() const { return IDENTIFIER == tag_; }
+  bool IsEOF() const { return tag_ == Token::END; }
+  bool IsTypeSpecQual() const { return CONST <= tag_ && tag_ <= ENUM; }
+  bool IsDecl() const { return CONST <= tag_ && tag_ <= REGISTER; }
+  static const char* Lexeme(int tag) {
+    auto iter = tagLexemeMap_.find(tag);
+    if (iter == tagLexemeMap_.end())
+      return nullptr;
+
+    return iter->second;
+  }
+
+  int tag_;
+
+  // 'ws_' stands for whether there is preceding white space.
+  // This is to simplify the '#' operator (stringize) in macro expansion.
+  bool ws_ { false };
+  SourceLocation loc_;
+
+  std::string str_;
+  HideSet* hs_ { nullptr };
+
+private:
+  explicit Token(int tag): tag_(tag) {}
+  Token(int tag, const SourceLocation& loc,
+        const std::string& str, bool ws=false)
+      : tag_(tag), ws_(ws), loc_(loc), str_(str) {}
+
+  Token(const Token& other) {
+    *this = other;
+  }
+
+  static const std::unordered_map<std::string, int> kwTypeMap_;
+  static const std::unordered_map<int, const char*> tagLexemeMap_;
+};
+
+
+class TokenSequence {
+  friend class Preprocessor;
+
+public:
+  TokenSequence(): tokList_(new TokenList()),
+                   begin_(tokList_->begin()), end_(tokList_->end()) {}
+  // Delegate to the default constructor so tokList_/begin_/end_ are
+  // initialized before InsertBack() touches them.
+  explicit TokenSequence(Token* tok): TokenSequence() {
+    InsertBack(tok);
+  }
+  explicit TokenSequence(TokenList* tokList)
+      : tokList_(tokList),
+        begin_(tokList->begin()),
+        end_(tokList->end()) {}
+  TokenSequence(TokenList* tokList,
+                TokenList::iterator begin,
+                TokenList::iterator end)
+      : tokList_(tokList), begin_(begin), end_(end) {}
+  ~TokenSequence() {}
+  TokenSequence(const TokenSequence& other) { *this = other; }
+  const TokenSequence& operator=(const TokenSequence& other) {
+    tokList_ = other.tokList_;
+    begin_ = other.begin_;
+    end_ = other.end_;
+    return *this;
+  }
+  void Copy(const TokenSequence& other) {
+    tokList_ = new TokenList(other.begin_, other.end_);
+    begin_ = tokList_->begin();
+    end_ = tokList_->end();
+    for (auto iter = begin_; iter != end_; ++iter)
+      *iter = Token::New(**iter);
+  }
+  void UpdateHeadLocation(const SourceLocation& loc) {
+    assert(!Empty());
+    auto tok = const_cast<Token*>(Peek());
+    tok->loc_ = loc;
+  }
+  void FinalizeSubst(bool leadingWS, const HideSet& hs) {
+    auto ts = *this;
+    while (!ts.Empty()) {
+      auto tok = const_cast<Token*>(ts.Next());
+      if (!tok->hs_)
+        tok->hs_ = new HideSet(hs);
+      else
+        tok->hs_->insert(hs.begin(), hs.end());
+    }
+    // Even if the token sequence is empty
+    const_cast<Token*>(Peek())->ws_ = leadingWS;
+  }
+
+  const Token* Expect(int expect);
+  bool Try(int tag) {
+    if (Peek()->tag_ == tag) {
+      Next();
+      return true;
+    }
+    return false;
+  }
+  bool Test(int tag) { return Peek()->tag_ == tag; }
+  const Token* Next() {
+    auto ret = Peek();
+    if (!ret->IsEOF()) {
+      ++begin_;
+      Peek(); // May skip newline token, but why ?
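+      // Peek() presumably skips the NEW_LINE tokens that the preprocessor
+      // leaves in the stream (they delimit directives but are noise to the
+      // parser); PutBack() below undoes the same skipping.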
+ } else { + ++exceed_end; + } + return ret; + } + void PutBack() { + assert(begin_ != tokList_->begin()); + if (exceed_end > 0) { + --exceed_end; + } else { + --begin_; + if ((*begin_)->tag_ == Token::NEW_LINE) + PutBack(); + } + } + const Token* Peek() const; + const Token* Peek2() { + if (Empty()) + return Peek(); // Return the Token::END + Next(); + auto ret = Peek(); + PutBack(); + return ret; + } + const Token* Back() const { + auto back = end_; + return *--back; + } + void PopBack() { + assert(!Empty()); + assert(end_ == tokList_->end()); + auto size_eq1 = tokList_->back() == *begin_; + tokList_->pop_back(); + end_ = tokList_->end(); + if (size_eq1) + begin_ = end_; + } + TokenList::iterator Mark() { return begin_; } + void ResetTo(TokenList::iterator mark) { begin_ = mark; } + bool Empty() const { return Peek()->tag_ == Token::END; } + void InsertBack(TokenSequence& ts) { + auto pos = tokList_->insert(end_, ts.begin_, ts.end_); + if (begin_ == end_) { + begin_ = pos; + } + } + void InsertBack(const Token* tok) { + auto pos = tokList_->insert(end_, tok); + if (begin_ == end_) { + begin_ = pos; + } + } + + // If there is preceding newline + void InsertFront(TokenSequence& ts) { + auto pos = GetInsertFrontPos(); + begin_ = tokList_->insert(pos, ts.begin_, ts.end_); + } + void InsertFront(const Token* tok) { + auto pos = GetInsertFrontPos(); + begin_ = tokList_->insert(pos, tok); + } + bool IsBeginOfLine() const; + TokenSequence GetLine(); + void SetParser(Parser* parser) { parser_ = parser; } + void Print(FILE* fp=stdout) const; + void Print(std::string *str) const; + +private: + // Find a insert position with no preceding newline + TokenList::iterator GetInsertFrontPos() { + auto pos = begin_; + if (pos == tokList_->begin()) + return pos; + --pos; + while (pos != tokList_->begin() && (*pos)->tag_ == Token::NEW_LINE) + --pos; + return ++pos; + } + + TokenList* tokList_; + mutable TokenList::iterator begin_; + TokenList::iterator end_; + Parser* parser_ {nullptr}; + int exceed_end {0}; +}; + +#endif diff --git a/include/triton/lang/wgtcc/type.h b/include/triton/lang/wgtcc/type.h new file mode 100644 index 000000000..20c2fa898 --- /dev/null +++ b/include/triton/lang/wgtcc/type.h @@ -0,0 +1,450 @@ +#ifndef _WGTCC_TYPE_H_ +#define _WGTCC_TYPE_H_ + +#include "mem_pool.h" +#include "scope.h" + +#include +#include +#include +#include + + +class Scope; +class Token; +class Expr; + +class Type; +class QualType; +class VoidType; +class Identifier; +class Object; +class Constant; + +class ArithmType; +class DerivedType; +class ArrayType; +class TileType; +class FuncType; +class PointerType; +class StructType; +class EnumType; + + +enum { + // Storage class specifiers + S_TYPEDEF = 0x01, + S_EXTERN = 0x02, + S_STATIC = 0x04, + S_THREAD = 0x08, + S_AUTO = 0x10, + S_REGISTER = 0x20, + + // Type specifier + T_SIGNED = 0x40, + T_UNSIGNED = 0x80, + T_CHAR = 0x100, + T_SHORT = 0x200, + T_INT = 0x400, + T_LONG = 0x800, + T_VOID = 0x1000, + T_HALF = 0x2000, + T_FLOAT = 0x4000, + T_DOUBLE = 0x8000, + T_BOOL = 0x10000, + T_COMPLEX = 0x20000, + // T_ATOMIC = 0x40000, + T_STRUCT_UNION = 0x80000, + T_ENUM = 0x100000, + T_TYPEDEF_NAME = 0x200000, + + T_LLONG = 0x4000000, + + // Function specifier + F_INLINE = 0x8000000, + F_NORETURN = 0x10000000, +}; + + +struct Qualifier { + enum { + CONST = 0x01, + RESTRICT = 0x02, + VOLATILE = 0x04, + MASK = CONST | RESTRICT | VOLATILE + }; +}; + + +class QualType { +public: + QualType(Type* ptr, int quals=0x00) + : ptr_(reinterpret_cast(ptr)) { + assert((quals & 
~Qualifier::MASK) == 0); + ptr_ |= quals; + } + + operator bool() const { return !IsNull(); } + bool IsNull() const { return GetPtr() == nullptr; } + const Type* GetPtr() const { + return reinterpret_cast(ptr_ & ~Qualifier::MASK); + } + Type* GetPtr() { + return reinterpret_cast(ptr_ & ~Qualifier::MASK); + } + Type& operator*() { return *GetPtr(); } + const Type& operator*() const { return *GetPtr(); } + Type* operator->() { return GetPtr(); } + const Type* operator->() const { return GetPtr(); } + + // Indicate whether the specified types are identical(exclude qualifiers). + friend bool operator==(QualType lhs, QualType rhs) { + return lhs.operator->() == rhs.operator->(); + } + friend bool operator!=(QualType lhs, QualType rhs) { + return !(lhs == rhs); + } + + int Qual() const { return ptr_ & 0x07; } + bool IsConstQualified() const { return ptr_ & Qualifier::CONST; } + bool IsRestrictQualified() const { return ptr_ & Qualifier::RESTRICT; } + bool IsVolatileQualified() const { return ptr_ & Qualifier::VOLATILE; } + +private: + intptr_t ptr_; +}; + + +class Type { +public: + static const int intWidth_ = 4; + static const int machineWidth_ = 8; + + bool operator!=(const Type& other) const = delete; + bool operator==(const Type& other) const = delete; + + virtual bool Compatible(const Type& other) const { + return complete_ == other.complete_; + } + + virtual ~Type() {} + + // For Debugging + virtual std::string Str() const = 0; + virtual int Width() const = 0; + virtual int Align() const { return Width(); } + static int MakeAlign(int offset, int align) { + if ((offset % align) == 0) + return offset; + if (offset >= 0) + return offset + align - (offset % align); + else + return offset - align - (offset % align); + } + + static QualType MayCast(QualType type, bool inProtoScope=false); + bool Complete() const { return complete_; } + void SetComplete(bool complete) const { complete_ = complete; } + + bool IsReal() const { return IsInteger() || IsFloat(); }; + virtual bool IsScalar() const { return false; } + virtual bool IsFloat() const { return false; } + virtual bool IsInteger() const { return false; } + virtual bool IsBool() const { return false; } + virtual bool IsVoidPointer() const { return false; } + virtual bool IsUnsigned() const { return false; } + + virtual VoidType* ToVoid() { return nullptr; } + virtual const VoidType* ToVoid() const { return nullptr; } + virtual ArithmType* ToArithm() { return nullptr; } + virtual const ArithmType* ToArithm() const { return nullptr; } + virtual ArrayType* ToArray() { return nullptr; } + virtual const ArrayType* ToArray() const { return nullptr; } + virtual TileType* ToTile() { return nullptr; } + virtual const TileType* ToTile() const { return nullptr; } + virtual FuncType* ToFunc() { return nullptr; } + virtual const FuncType* ToFunc() const { return nullptr; } + virtual PointerType* ToPointer() { return nullptr; } + virtual const PointerType* ToPointer() const { return nullptr; } + virtual DerivedType* ToDerived() { return nullptr; } + virtual const DerivedType* ToDerived() const { return nullptr; } + virtual StructType* ToStruct() { return nullptr; } + virtual const StructType* ToStruct() const { return nullptr; } + +protected: + Type(MemPool* pool, bool complete) + : complete_(complete), pool_(pool) {} + + mutable bool complete_; + MemPool* pool_; +}; + + +class VoidType : public Type { +public: + static VoidType* New(); + virtual ~VoidType() {} + virtual VoidType* ToVoid() { return this; } + virtual const VoidType* ToVoid() const { return 
this; } + virtual bool Compatible(const Type& other) const { return other.ToVoid(); } + virtual int Width() const { + // Non-standard GNU extension + return 1; + } + virtual std::string Str() const { return "void:1"; } + +protected: + explicit VoidType(MemPool* pool): Type(pool, false) {} +}; + + +class ArithmType : public Type { +public: + static ArithmType* New(int typeSpec); + + virtual ~ArithmType() {} + virtual ArithmType* ToArithm() { return this; } + virtual const ArithmType* ToArithm() const { return this; } + virtual bool Compatible(const Type& other) const { + // C11 6.2.7 [1]: Two types have compatible type if their types are the same + // But I would to loose this constraints: integer and pointer are compatible + // if (IsInteger() && other.ToPointer()) + // return other.Compatible(*this); + return this == &other; + } + + virtual int Width() const; + virtual std::string Str() const; + virtual bool IsScalar() const { return true; } + virtual bool IsInteger() const { return !IsFloat() && !IsComplex(); } + virtual bool IsUnsigned() const { return tag_ & T_UNSIGNED; } + virtual bool IsFloat() const { + return (tag_ & T_FLOAT) || (tag_ & T_DOUBLE); + } + virtual bool IsBool() const { return tag_ & T_BOOL; } + bool IsComplex() const { return tag_ & T_COMPLEX; } + int Tag() const { return tag_; } + int Rank() const; + static ArithmType* IntegerPromote(ArithmType* type) { + assert(type->IsInteger()); + if (type->Rank() < ArithmType::New(T_INT)->Rank()) + return ArithmType::New(T_INT); + return type; + } + static ArithmType* MaxType(ArithmType* lhsType, + ArithmType* rhsType); + +protected: + explicit ArithmType(MemPool* pool, int spec) + : Type(pool, true), tag_(Spec2Tag(spec)) {} + +private: + static int Spec2Tag(int spec); + + int tag_; +}; + + +class DerivedType : public Type { +public: + QualType Derived() const { return derived_; } + void SetDerived(QualType derived) { derived_ = derived; } + virtual DerivedType* ToDerived() { return this; } + virtual const DerivedType* ToDerived() const { return this; } + +protected: + DerivedType(MemPool* pool, QualType derived) + : Type(pool, true), derived_(derived) {} + + QualType derived_; +}; + + +class PointerType : public DerivedType { +public: + static PointerType* New(QualType derived); + virtual ~PointerType() {} + virtual PointerType* ToPointer() { return this; } + virtual const PointerType* ToPointer() const { return this; } + virtual bool Compatible(const Type& other) const; + virtual int Width() const { return 8; } + virtual bool IsScalar() const { return true; } + virtual bool IsVoidPointer() const { return derived_->ToVoid(); } + virtual std::string Str() const { + return derived_->Str() + "*:" + std::to_string(Width()); + } + +protected: + PointerType(MemPool* pool, QualType derived): DerivedType(pool, derived) {} +}; + + +class ArrayType : public DerivedType { +public: + static ArrayType* New(int len, QualType eleType); + static ArrayType* New(Expr* expr, QualType eleType); + virtual ~ArrayType() { /*delete derived_;*/ } + + virtual ArrayType* ToArray() { return this; } + virtual const ArrayType* ToArray() const { return this; } + virtual bool Compatible(const Type& other) const; + virtual int Width() const { + return Complete() ? 
(derived_->Width() * len_): 0; + } + virtual int Align() const { return derived_->Align(); } + virtual std::string Str() const { + return derived_->Str() + "[]:" + std::to_string(Width()); + } + + int GetElementOffset(int idx) const { return derived_->Width() * idx; } + int Len() const { return len_; } + void SetLen(int len) { len_ = len; } + bool Variadic() const { return lenExpr_ != nullptr; } + +protected: + ArrayType(MemPool* pool, Expr* lenExpr, QualType derived) + : DerivedType(pool, derived), + lenExpr_(lenExpr), len_(0) { + SetComplete(false); + } + + ArrayType(MemPool* pool, int len, QualType derived) + : DerivedType(pool, derived), + lenExpr_(nullptr), len_(len) { + SetComplete(len_ >= 0); + } + const Expr* lenExpr_; + int len_; +}; + +class TileType : public DerivedType { +public: + using ShapeExpr = std::vector; + using ShapeInt = std::vector; + +public: + static TileType* New(const ShapeExpr& expr, QualType eleType); + static TileType* New(const ShapeInt& shape, QualType eleType); + virtual ~TileType() { } + + virtual TileType* toTile() { return this; } + virtual const TileType* toTile() const { return this; } + virtual bool Compatible(const Type& other) const; + virtual int Width() const { return 0; } + virtual int Align() const { return derived_->Align(); } + virtual std::string Str() const { + return derived_->Str() + "[{}]:" + std::to_string(Width()); + } + + ShapeInt Shape() { return shape_; } + +protected: + TileType(MemPool* pool, const ShapeExpr& expr, QualType derived) + : DerivedType(pool, derived), + shapeExpr_(expr) { + bool isComplete = true; + for(Expr* s: shapeExpr_) + isComplete = isComplete && !s; + SetComplete(isComplete); + } + + TileType(MemPool* pool, const ShapeInt& shape, QualType derived) + : DerivedType(pool, derived), + shape_(shape) { + bool isComplete = true; + for(int s: shape_) + isComplete = isComplete && (s>=0); + SetComplete(isComplete); + } + +protected: + ShapeExpr shapeExpr_; + ShapeInt shape_; +}; + +class FuncType : public DerivedType { +public: + using ParamList = std::vector; + +public: + static FuncType* New(QualType derived, + int funcSpec, + bool variadic, + const ParamList& params); + virtual ~FuncType() {} + virtual FuncType* ToFunc() { return this; } + virtual const FuncType* ToFunc() const { return this; } + virtual bool Compatible(const Type& other) const; + virtual int Width() const { return 1; } + virtual std::string Str() const; + const ParamList& Params() const { return params_; } + void SetParams(const ParamList& params) { params_ = params; } + bool Variadic() const { return variadic_; } + bool IsInline() const { return inlineNoReturn_ & F_INLINE; } + bool IsNoReturn() const { return inlineNoReturn_ & F_NORETURN; } + +protected: + FuncType(MemPool* pool, QualType derived, int inlineReturn, + bool variadic, const ParamList& params) + : DerivedType(pool, derived), inlineNoReturn_(inlineReturn), + variadic_(variadic), params_(params) { + SetComplete(false); + } + +private: + int inlineNoReturn_; + bool variadic_; + ParamList params_; +}; + + +class StructType : public Type { +public: + using MemberList = std::list; + using Iterator = std::list::iterator; + +public: + static StructType* New(bool isStruct, + bool hasTag, + Scope* parent); + virtual ~StructType() {} + virtual StructType* ToStruct() { return this; } + virtual const StructType* ToStruct() const { return this; } + virtual bool Compatible(const Type& other) const; + virtual int Width() const { return width_; } + virtual int Align() const { return align_; } + virtual 
std::string Str() const; + + // struct/union + void AddMember(Object* member); + void AddBitField(Object* member, int offset); + bool IsStruct() const { return isStruct_; } + Object* GetMember(const std::string& member); + Scope* MemberMap() { return memberMap_; } + MemberList& Members() { return members_; } + int Offset() const { return offset_; } + bool HasTag() const { return hasTag_; } + void MergeAnony(Object* anony); + void Finalize(); + +protected: + // Default is incomplete + StructType(MemPool* pool, bool isStruct, bool hasTag, Scope* parent); + + StructType(const StructType& other); + +private: + void CalcWidth(); + + bool isStruct_; + bool hasTag_; + Scope* memberMap_; + + MemberList members_; + int offset_; + int width_; + int align_; + int bitFieldAlign_; +}; + +#endif diff --git a/include/triton/lang/wgtcc/visitor.h b/include/triton/lang/wgtcc/visitor.h new file mode 100644 index 000000000..d3c4131a7 --- /dev/null +++ b/include/triton/lang/wgtcc/visitor.h @@ -0,0 +1,50 @@ +#ifndef _WGTCC_VISITOR_H_ +#define _WGTCC_VISITOR_H_ + + +class BinaryOp; +class UnaryOp; +class ConditionalOp; +class FuncCall; +class Identifier; +class Object; +class Enumerator; +class Constant; +class TempVar; + +class Declaration; +class IfStmt; +class JumpStmt; +class ReturnStmt; +class LabelStmt; +class EmptyStmt; +class CompoundStmt; +class FuncDef; +class TranslationUnit; + + +class Visitor { +public: + virtual ~Visitor() {} + virtual void VisitBinaryOp(BinaryOp* binary) = 0; + virtual void VisitUnaryOp(UnaryOp* unary) = 0; + virtual void VisitConditionalOp(ConditionalOp* cond) = 0; + virtual void VisitFuncCall(FuncCall* funcCall) = 0; + virtual void VisitEnumerator(Enumerator* enumer) = 0; + virtual void VisitIdentifier(Identifier* ident) = 0; + virtual void VisitObject(Object* obj) = 0; + virtual void VisitConstant(Constant* cons) = 0; + virtual void VisitTempVar(TempVar* tempVar) = 0; + + virtual void VisitDeclaration(Declaration* init) = 0; + virtual void VisitIfStmt(IfStmt* ifStmt) = 0; + virtual void VisitJumpStmt(JumpStmt* jumpStmt) = 0; + virtual void VisitReturnStmt(ReturnStmt* returnStmt) = 0; + virtual void VisitLabelStmt(LabelStmt* labelStmt) = 0; + virtual void VisitEmptyStmt(EmptyStmt* emptyStmt) = 0; + virtual void VisitCompoundStmt(CompoundStmt* compStmt) = 0; + virtual void VisitFuncDef(FuncDef* funcDef) = 0; + virtual void VisitTranslationUnit(TranslationUnit* unit) = 0; +}; + +#endif diff --git a/lib/lang/wgtcc/ast.cc b/lib/lang/wgtcc/ast.cc new file mode 100644 index 000000000..eb673584f --- /dev/null +++ b/lib/lang/wgtcc/ast.cc @@ -0,0 +1,885 @@ +#include "triton/lang/wgtcc/ast.h" + +#include "triton/lang/wgtcc/code_gen.h" +#include "triton/lang/wgtcc/error.h" +#include "triton/lang/wgtcc/evaluator.h" +#include "triton/lang/wgtcc/mem_pool.h" +#include "triton/lang/wgtcc/parser.h" +#include "triton/lang/wgtcc/token.h" + + +static MemPoolImp binaryOpPool; +static MemPoolImp conditionalOpPool; +static MemPoolImp funcCallPool; +static MemPoolImp initializationPool; +static MemPoolImp objectPool; +static MemPoolImp identifierPool; +static MemPoolImp enumeratorPool; +static MemPoolImp constantPool; +static MemPoolImp tempVarPool; +static MemPoolImp unaryOpPool; +static MemPoolImp emptyStmtPool; +static MemPoolImp ifStmtPool; +static MemPoolImp jumpStmtPool; +static MemPoolImp returnStmtPool; +static MemPoolImp labelStmtPool; +static MemPoolImp compoundStmtPool; +static MemPoolImp funcDefPool; + + +/* + * Accept + */ + +void Declaration::Accept(Visitor* v) { + 
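+  // Classic visitor double dispatch: the virtual Accept() call recovers the
+  // dynamic node type, which then selects the matching VisitXxx() overload.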
v->VisitDeclaration(this); +} + + +void EmptyStmt::Accept(Visitor* v) { + // Nothing to do +} + + +void LabelStmt::Accept(Visitor* v) { + v->VisitLabelStmt(this); +} + + +void IfStmt::Accept(Visitor* v) { + v->VisitIfStmt(this); +} + + +void JumpStmt::Accept(Visitor* v) { + v->VisitJumpStmt(this); +} + + +void ReturnStmt::Accept(Visitor* v) { + v->VisitReturnStmt(this); +} + + +void CompoundStmt::Accept(Visitor* v) { + v->VisitCompoundStmt(this); +} + + +void BinaryOp::Accept(Visitor* v) { + v->VisitBinaryOp(this); +} + + +void UnaryOp::Accept(Visitor* v) { + v->VisitUnaryOp(this); +} + + +void ConditionalOp::Accept(Visitor* v) { + v->VisitConditionalOp(this); +} + + +void FuncCall::Accept(Visitor* v) { + v->VisitFuncCall(this); +} + + +void Identifier::Accept(Visitor* v) { + v->VisitIdentifier(this); +} + + +void Object::Accept(Visitor* v) { + v->VisitObject(this); +} + + +void Constant::Accept(Visitor* v) { + v->VisitConstant(this); +} + + +void Enumerator::Accept(Visitor* v) +{ + v->VisitEnumerator(this); +} + + +void TempVar::Accept(Visitor* v) { + v->VisitTempVar(this); +} + + +void FuncDef::Accept(Visitor* v) { + v->VisitFuncDef(this); +} + + +void TranslationUnit::Accept(Visitor* v) { + v->VisitTranslationUnit(this); +} + + +// Casting array to pointer, function to pointer to function +Expr* Expr::MayCast(Expr* expr) { + auto type = Type::MayCast(expr->Type()); + // If the types are equal, no need cast + if (type != expr->Type()) { // Pointer comparison is enough + return UnaryOp::New(Token::CAST, expr, type); + } + return expr; +} + + +Expr* Expr::MayCast(Expr* expr, QualType desType) { + expr = MayCast(expr); + auto srcType = expr->Type(); + if (desType->ToPointer() && srcType->ToPointer()) + if (desType->IsVoidPointer() || srcType->IsVoidPointer()) + return expr; + if (!desType->Compatible(*expr->Type())) + expr = UnaryOp::New(Token::CAST, expr, desType); + return expr; +} + + +BinaryOp* BinaryOp::New(const Token* tok, Expr* lhs, Expr* rhs) { + return New(tok, tok->tag_, lhs, rhs); +} + + +BinaryOp* BinaryOp::New(const Token* tok, int op, Expr* lhs, Expr* rhs) { + switch (op) { + case ',': case '.': case '=': + case '*': case '/': case '%': + case '+': case '-': case '&': + case '^': case '|': case '<': + case '>': + case Token::LEFT: + case Token::RIGHT: + case Token::LE: + case Token::GE: + case Token::EQ: + case Token::NE: + case Token::LOGICAL_AND: + case Token::LOGICAL_OR: + case Token::ELLIPSIS: + break; + default: + assert(0); + } + + auto ret = new (binaryOpPool.Alloc()) BinaryOp(tok, op, lhs, rhs); + ret->pool_ = &binaryOpPool; + + ret->TypeChecking(); + return ret; +} + + +ArithmType* BinaryOp::Convert() { + // Both lhs and rhs are ensured to be have arithmetic type + auto lhsType = lhs_->Type()->ToArithm(); + auto rhsType = rhs_->Type()->ToArithm(); + assert(lhsType && rhsType); + auto type = ArithmType::MaxType(lhsType, rhsType); + if (lhsType != type) { // Pointer comparation is enough! 
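+    // Usual arithmetic conversions: MaxType() picked the common type, and any
+    // operand of a different type is wrapped in an explicit CAST node, so
+    // e.g. in "1 + 2.0" the int operand becomes CAST(double, 1).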
+    lhs_ = UnaryOp::New(Token::CAST, lhs_, type);
+  }
+  if (rhsType != type) {
+    rhs_ = UnaryOp::New(Token::CAST, rhs_, type);
+  }
+  return type;
+}
+
+void BinaryOp::Broadcast() {
+  auto lhsType = lhs_->Type()->ToTile();
+  auto rhsType = rhs_->Type()->ToTile();
+  if(!lhsType && !rhsType)
+    return;
+  else if(lhsType && !rhsType){
+    type_ = lhsType;
+    rhs_ = UnaryOp::New(Token::CAST, rhs_, type_);
+  }
+  else if(!lhsType && rhsType){
+    type_ = rhsType;
+    lhs_ = UnaryOp::New(Token::CAST, lhs_, type_);
+  }
+  else {
+    auto lhsShape = lhsType->Shape();
+    auto rhsShape = rhsType->Shape();
+    auto lhsRank = lhsShape.size();
+    auto rhsRank = rhsShape.size();
+    auto retRank = std::max(lhsRank, rhsRank);
+    // pad to the left until shapes have the same rank
+    while(lhsShape.size() < retRank)
+      lhsShape.insert(lhsShape.begin(), 1);
+    while(rhsShape.size() < retRank)
+      rhsShape.insert(rhsShape.begin(), 1);
+    // broadcast if possible
+    TileType::ShapeInt retShape(retRank);
+    for(size_t i = 0; i < retRank; i++) {
+      if(lhsShape[i] == 1)
+        retShape[i] = rhsShape[i];
+      else if(rhsShape[i] == 1)
+        retShape[i] = lhsShape[i];
+      else if(lhsShape[i] == rhsShape[i])
+        retShape[i] = lhsShape[i];
+      else
+        Error(this, "cannot broadcast dimension %d "
+              "for operands of shape %d and %d",
+              (int)i, lhsShape[i], rhsShape[i]);
+    }
+    auto eleType = lhsType->Derived();
+    type_ = TileType::New(retShape, eleType);
+    lhs_ = UnaryOp::New(Token::CAST, lhs_, type_);
+    rhs_ = UnaryOp::New(Token::CAST, rhs_, type_);
+  }
+}
+
+/*
+ * Type checking
+ */
+
+void Expr::EnsureCompatibleOrVoidPointer(const QualType lhs,
+                                         const QualType rhs) const {
+  if (lhs->ToPointer() && rhs->ToPointer() &&
+      (lhs->IsVoidPointer() || rhs->IsVoidPointer())) {
+    return;
+  }
+  EnsureCompatible(lhs, rhs);
+}
+
+
+void Expr::EnsureCompatible(const QualType lhs, const QualType rhs) const {
+  if (!lhs->Compatible(*rhs))
+    Error(this, "incompatible types");
+}
+
+
+void BinaryOp::TypeChecking() {
+  switch (op_) {
+  case '.':
+    return MemberRefOpTypeChecking();
+
+  case '*':
+  case '/':
+  case '%':
+    return MultiOpTypeChecking();
+
+  case '+':
+  case '-':
+    return AdditiveOpTypeChecking();
+
+  case Token::LEFT:
+  case Token::RIGHT:
+    return ShiftOpTypeChecking();
+
+  case '<':
+  case '>':
+  case Token::LE:
+  case Token::GE:
+    return RelationalOpTypeChecking();
+
+  case Token::EQ:
+  case Token::NE:
+    return EqualityOpTypeChecking();
+
+  case '&':
+  case '^':
+  case '|':
+    return BitwiseOpTypeChecking();
+
+  case Token::LOGICAL_AND:
+  case Token::LOGICAL_OR:
+    return LogicalOpTypeChecking();
+
+  case '=':
+    return AssignOpTypeChecking();
+
+  case ',':
+    return CommaOpTypeChecking();
+
+  case Token::ELLIPSIS:
+    return RangeOpTypeChecking();
+
+  default:
+    assert(0);
+  }
+}
+
+
+void BinaryOp::CommaOpTypeChecking() {
+  type_ = rhs_->Type();
+}
+
+
+void BinaryOp::SubScriptingOpTypeChecking() {
+  auto lhsType = lhs_->Type()->ToPointer();
+  if (!lhsType) {
+    Error(this, "a pointer expected");
+  }
+  if (!rhs_->Type()->IsInteger()) {
+    Error(this, "the operand of [] should be integer");
+  }
+
+  // The type of [] operator is the derived type
+  type_ = lhsType->Derived();
+}
+
+
+void BinaryOp::MemberRefOpTypeChecking() {
+  type_ = rhs_->Type();
+}
+
+
+void BinaryOp::MultiOpTypeChecking() {
+  if (!lhs_->Type()->ToArithm() || !rhs_->Type()->ToArithm()) {
+    Error(this, "operands should have arithmetic type");
+  }
+  if ('%' == op_ &&
+      !(lhs_->Type()->IsInteger() && rhs_->Type()->IsInteger())) {
+    Error(this, "operands of '%%' should be integers");
+  }
+  type_ = Convert();
+}
+
+
+/*
+ * Additive operator is only allowed between:
+ *  1. arithmetic types (bool, integer, floating)
+ *  2. pointer can be used:
+ *    1. lhs of MINUS operator, and rhs must be integer or pointer;
+ *    2. lhs/rhs of ADD operator, and the other operand must be integer;
+ */
+void BinaryOp::AdditiveOpTypeChecking() {
+  auto lhsType = lhs_->Type()->ToPointer();
+  auto rhsType = rhs_->Type()->ToPointer();
+  if (lhsType) {
+    if (op_ == '-') {
+      if (rhsType) {
+        if (!lhsType->Compatible(*rhsType))
+          Error(this, "invalid operands to binary -");
+        type_ = ArithmType::New(T_LONG); // ptrdiff_t
+      } else if (!rhs_->Type()->IsInteger()) {
+        Error(this, "invalid operands to binary -");
+      } else {
+        type_ = lhsType;
+      }
+    } else if (!rhs_->Type()->IsInteger()) {
+      Error(this, "invalid operands to binary +");
+    } else {
+      type_ = lhsType;
+    }
+  } else if (rhsType) {
+    if (op_ == '+' && !lhs_->Type()->IsInteger()) {
+      Error(this, "invalid operands to binary '+'");
+    } else if (op_ == '-' && !lhsType) {
+      Error(this, "invalid operands to binary '-'");
+    }
+    type_ = op_ == '-' ? ArithmType::New(T_LONG): rhs_->Type();
+    std::swap(lhs_, rhs_); // To simplify code gen
+  } else {
+    if (!lhs_->Type()->ToArithm() || !rhs_->Type()->ToArithm()) {
+      Error(this, "invalid operands to binary %s", tok_->str_.c_str());
+    }
+    type_ = Convert();
+  }
+}
+
+void BinaryOp::RangeOpTypeChecking() {
+  auto lhsType = lhs_->Type()->ToArithm();
+  auto rhsType = rhs_->Type()->ToArithm();
+  if(!lhsType || !lhsType->IsInteger() || !rhsType || !rhsType->IsInteger())
+    Error(this, "expect integers for range operator");
+  lhs_ = Expr::MayCast(lhs_, ArithmType::IntegerPromote(lhsType));
+  rhs_ = Expr::MayCast(rhs_, ArithmType::IntegerPromote(rhsType));
+  long begin = Evaluator<long>().Eval(lhs_);
+  long end = Evaluator<long>().Eval(rhs_);
+  int len = end - begin;
+  if(len < 0)
+    Error(this, "range cannot be negative");
+  type_ = TileType::New(TileType::ShapeInt{len}, lhs_->Type());
+}
+
+void BinaryOp::ShiftOpTypeChecking() {
+  auto lhsType = lhs_->Type()->ToArithm();
+  auto rhsType = rhs_->Type()->ToArithm();
+  if (!lhsType || !lhsType->IsInteger() || !rhsType || !rhsType->IsInteger())
+    Error(this, "expect integers for shift operator");
+  lhs_ = Expr::MayCast(lhs_, ArithmType::IntegerPromote(lhsType));
+  rhs_ = Expr::MayCast(rhs_, ArithmType::IntegerPromote(rhsType));
+  type_ = lhs_->Type();
+}
+
+
+void BinaryOp::RelationalOpTypeChecking() {
+  if (lhs_->Type()->ToPointer() || rhs_->Type()->ToPointer()) {
+    EnsureCompatible(lhs_->Type(), rhs_->Type());
+  } else {
+    if (!lhs_->Type()->IsReal() || !rhs_->Type()->IsReal()) {
+      Error(this, "expect real type of operands");
+    }
+    Convert();
+  }
+  type_ = ArithmType::New(T_INT);
+}
+
+
+void BinaryOp::EqualityOpTypeChecking() {
+  if (lhs_->Type()->ToPointer() || rhs_->Type()->ToPointer()) {
+    EnsureCompatibleOrVoidPointer(lhs_->Type(), rhs_->Type());
+  } else {
+    if (!lhs_->Type()->ToArithm() || !rhs_->Type()->ToArithm())
+      Error(this, "invalid operands to binary %s", tok_->str_.c_str());
+    Convert();
+  }
+  type_ = ArithmType::New(T_INT);
+}
+
+
+void BinaryOp::BitwiseOpTypeChecking() {
+  if (!lhs_->Type()->IsInteger() || !rhs_->Type()->IsInteger())
+    Error(this, "operands of '&' should be integer");
+  type_ = Convert();
+}
+
+
+void BinaryOp::LogicalOpTypeChecking() {
+  if (!lhs_->Type()->IsScalar() || !rhs_->Type()->IsScalar())
+    Error(this, "the operand should be arithmetic type or pointer");
+  type_ = ArithmType::New(T_INT);
+}
+
+
+void BinaryOp::AssignOpTypeChecking() {
+  if (lhs_->IsConstQualified()) {
+    Error(lhs_, "left operand of '=' is const 
qualified"); + } else if (!lhs_->IsLVal()) { + Error(lhs_, "lvalue expression expected"); + } + + if (!lhs_->Type()->ToArithm() || !rhs_->Type()->ToArithm()) { + EnsureCompatibleOrVoidPointer(lhs_->Type(), rhs_->Type()); + } + + // The other constraints are lefted to cast operator + rhs_ = Expr::MayCast(rhs_, lhs_->Type()); + type_ = lhs_->Type(); +} + + +/* + * Unary Operators + */ + +UnaryOp* UnaryOp::New(int op, Expr* operand, QualType type) { + auto ret = new (unaryOpPool.Alloc()) UnaryOp(op, operand, type); + ret->pool_ = &unaryOpPool; + + ret->TypeChecking(); + return ret; +} + + +bool UnaryOp::IsLVal() { + // Only deref('*') could be lvalue; + return op_ == Token::DEREF; +} + + +ArithmType* UnaryOp::Convert() { + auto arithmType = operand_->Type()->ToArithm(); + assert(arithmType); + if (arithmType->IsInteger()) + arithmType = ArithmType::IntegerPromote(arithmType); + operand_ = Expr::MayCast(operand_, arithmType); + return arithmType; +} + + +void UnaryOp::TypeChecking() { + switch (op_) { + case Token::POSTFIX_INC: + case Token::POSTFIX_DEC: + case Token::PREFIX_INC: + case Token::PREFIX_DEC: + return IncDecOpTypeChecking(); + + case Token::ADDR: + return AddrOpTypeChecking(); + + case Token::DEREF: + return DerefOpTypeChecking(); + + case Token::PLUS: + case Token::MINUS: + case '~': + case '!': + return UnaryArithmOpTypeChecking(); + + case Token::CAST: + return CastOpTypeChecking(); + + default: + assert(false); + } +} + + +void UnaryOp::IncDecOpTypeChecking() { + if (operand_->IsConstQualified()) { + Error(this, "increment/decrement of const qualified expression"); + } else if (!operand_->IsLVal()) { + Error(this, "lvalue expression expected"); + } + + if (!operand_->Type()->IsReal() && !operand_->Type()->ToPointer()) { + Error(this, "expect operand of real type or pointer"); + } + type_ = operand_->Type(); +} + + +void UnaryOp::AddrOpTypeChecking() { + auto funcType = operand_->Type()->ToFunc(); + if (funcType == nullptr && !operand_->IsLVal()) + Error(this, "expression must be an lvalue or function designator"); + type_ = PointerType::New(operand_->Type()); +} + + +void UnaryOp::DerefOpTypeChecking() { + auto pointerType = operand_->Type()->ToPointer(); + if (!pointerType) + Error(this, "pointer expected for deref operator '*'"); + type_ = pointerType->Derived(); +} + + +void UnaryOp::UnaryArithmOpTypeChecking() { + if (Token::PLUS == op_ || Token::MINUS == op_) { + if (!operand_->Type()->ToArithm()) + Error(this, "Arithmetic type expected"); + Convert(); + type_ = operand_->Type(); + } else if ('~' == op_) { + if (!operand_->Type()->IsInteger()) + Error(this, "integer expected for operator '~'"); + Convert(); + type_ = operand_->Type(); + } else if (!operand_->Type()->IsScalar()) { + Error(this, "arithmetic type or pointer expected for operator '!'"); + } else { + type_ = ArithmType::New(T_INT); + } +} + + +void UnaryOp::CastOpTypeChecking() { + auto operandType = Type::MayCast(operand_->Type()); + + // The type_ has been initiated to dest type + if (type_->ToVoid()) { + // The expression becomes a void expression + } else if (!type_->IsScalar() || !operandType->IsScalar()) { + if (!type_->Compatible(*operandType)) + Error(this, "the cast type should be arithemetic type or pointer"); + } else if (type_->IsFloat() && operandType->ToPointer()) { + Error(this, "cannot cast a pointer to floating"); + } else if (type_->ToPointer() && operandType->IsFloat()) { + Error(this, "cannot cast a floating to pointer"); + } +} + + +/* + * Conditional Operator + */ + +ConditionalOp* 
ConditionalOp::New(const Token* tok, + Expr* cond, + Expr* exprTrue, + Expr* exprFalse) { + auto ret = new (conditionalOpPool.Alloc()) + ConditionalOp(cond, exprTrue, exprFalse); + ret->pool_ = &conditionalOpPool; + + ret->TypeChecking(); + return ret; +} + + +ArithmType* ConditionalOp::Convert() { + auto lhsType = exprTrue_->Type()->ToArithm(); + auto rhsType = exprFalse_->Type()->ToArithm(); + assert(lhsType && rhsType); + auto type = ArithmType::MaxType(lhsType, rhsType); + if (lhsType != type) { // Pointer comparation is enough! + exprTrue_ = UnaryOp::New(Token::CAST, exprTrue_, type); + } + if (rhsType != type) { + exprFalse_ = UnaryOp::New(Token::CAST, exprFalse_, type); + } + + return type; +} + + +void ConditionalOp::TypeChecking() { + if (!cond_->Type()->IsScalar()) { + Error(cond_->Tok(), "scalar is required"); + } + + auto lhsType = exprTrue_->Type(); + auto rhsType = exprFalse_->Type(); + if (lhsType->ToArithm() && rhsType->ToArithm()) { + type_ = Convert(); + } else { + EnsureCompatibleOrVoidPointer(lhsType, rhsType); + type_ = lhsType; + } +} + + +/* + * Function Call + */ + +FuncCall* FuncCall::New(Expr* designator, const ArgList& args) { + auto ret = new (funcCallPool.Alloc()) FuncCall(designator, args); + ret->pool_ = &funcCallPool; + + ret->TypeChecking(); + return ret; +} + + +void FuncCall::TypeChecking() { + auto pointerType = designator_->Type()->ToPointer(); + if (pointerType) { + if (!pointerType->Derived()->ToFunc()) + Error(designator_, "called object is not a function or function pointer"); + // Convert function pointer to function type + designator_ = UnaryOp::New(Token::DEREF, designator_); + } + auto funcType = designator_->Type()->ToFunc(); + if (!funcType) { + Error(designator_, "called object is not a function or function pointer"); + } else if (!funcType->Derived()->ToVoid() && + !funcType->Derived()->Complete()) { + Error(designator_, "invalid use of incomplete return type"); + } + + auto arg = args_.begin(); + for (auto param: funcType->Params()) { + if (arg == args_.end()) + Error(this, "too few arguments for function call"); + *arg = Expr::MayCast(*arg, param->Type()); + ++arg; + } + if (arg != args_.end() && !funcType->Variadic()) + Error(this, "too many arguments for function call"); + + // C11 6.5.2.2 [6]: promote float to double if it has no prototype + while (arg != args_.end()) { + if ((*arg)->Type()->IsFloat() && (*arg)->Type()->Width() == 4) { + auto type = ArithmType::New(T_DOUBLE); + *arg = UnaryOp::New(Token::CAST, *arg, type); + } + ++arg; + } + + type_ = funcType->Derived(); +} + + +/* + * Identifier + */ + +Identifier* Identifier::New(const Token* tok, + QualType type, + enum Linkage linkage) { + auto ret = new (identifierPool.Alloc()) Identifier(tok, type, linkage); + ret->pool_ = &identifierPool; + return ret; +} + + +Enumerator* Enumerator::New(const Token* tok, int val) { + auto ret = new (enumeratorPool.Alloc()) Enumerator(tok, val); + ret->pool_ = &enumeratorPool; + return ret; +} + + +Declaration* Declaration::New(Object* obj) { + auto ret = new (initializationPool.Alloc()) Declaration(obj); + ret->pool_ = &initializationPool; + return ret; +} + +void Declaration::AddInit(Initializer init) { + init.expr_ = Expr::MayCast(init.expr_, init.type_); + + auto res = inits_.insert(init); + if (!res.second) { + inits_.erase(res.first); + inits_.insert(init); + } +} + + +/* + * Object + */ + +Object* Object::New(const Token* tok, + QualType type, + int storage, + enum Linkage linkage, + unsigned char bitFieldBegin, + unsigned char 
bitFieldWidth) { + auto ret = new (objectPool.Alloc()) + Object(tok, type, storage, linkage, bitFieldBegin, bitFieldWidth); + ret->pool_ = &objectPool; + + static long id = 0; + if (ret->IsStatic() || ret->Anonymous()) + ret->id_ = ++id; + return ret; +} + + +Object* Object::NewAnony(const Token* tok, + QualType type, + int storage, + enum Linkage linkage, + unsigned char bitFieldBegin, + unsigned char bitFieldWidth) { + auto ret = new (objectPool.Alloc()) + Object(tok, type, storage, linkage, bitFieldBegin, bitFieldWidth); + ret->pool_ = &objectPool; + ret->anonymous_ = true; + + static long id = 0; + if (ret->IsStatic() || ret->anonymous_) + ret->id_ = ++id; + return ret; +} + + +/* + * Constant + */ + +Constant* Constant::New(const Token* tok, int tag, long val) { + auto type = ArithmType::New(tag); + auto ret = new (constantPool.Alloc()) Constant(tok, type, val); + ret->pool_ = &constantPool; + return ret; +} + + +Constant* Constant::New(const Token* tok, int tag, double val) { + auto type = ArithmType::New(tag); + auto ret = new (constantPool.Alloc()) Constant(tok, type, val); + ret->pool_ = &constantPool; + return ret; +} + + +Constant* Constant::New(const Token* tok, int tag, const std::string* val) { + auto derived = ArithmType::New(tag); + auto type = ArrayType::New(val->size() / derived->Width(), derived); + + auto ret = new (constantPool.Alloc()) Constant(tok, type, val); + ret->pool_ = &constantPool; + + static long id = 0; + ret->id_ = ++id; + return ret; +} + + +std::string Constant::SValRepr() const { + std::vector buf(4 * sval_->size() + 1); + for (size_t i = 0; i < sval_->size(); ++i) { + int c = (*sval_)[i]; + sprintf(&buf[i * 4], "\\x%1x%1x", (c >> 4) & 0xf, c & 0xf); + } + return std::string(buf.begin(), buf.end() - 1); +} + + +/* + * TempVar + */ + +TempVar* TempVar::New(QualType type) { + auto ret = new (tempVarPool.Alloc()) TempVar(type); + ret->pool_ = &tempVarPool; + return ret; +} + + +/* + * Statement + */ + +EmptyStmt* EmptyStmt::New() { + auto ret = new (emptyStmtPool.Alloc()) EmptyStmt(); + ret->pool_ = &emptyStmtPool; + return ret; +} + + +// The else stmt could be null +IfStmt* IfStmt::New(Expr* cond, Stmt* then, Stmt* els) { + auto ret = new (ifStmtPool.Alloc()) IfStmt(cond, then, els); + ret->pool_ = &ifStmtPool; + return ret; +} + + +CompoundStmt* CompoundStmt::New(std::list& stmts, ::Scope* scope) { + auto ret = new (compoundStmtPool.Alloc()) CompoundStmt(stmts, scope); + ret->pool_ = &compoundStmtPool; + return ret; +} + + +JumpStmt* JumpStmt::New(LabelStmt* label) { + auto ret = new (jumpStmtPool.Alloc()) JumpStmt(label); + ret->pool_ = &jumpStmtPool; + return ret; +} + + +ReturnStmt* ReturnStmt::New(Expr* expr) { + auto ret = new (returnStmtPool.Alloc()) ReturnStmt(expr); + ret->pool_ = &returnStmtPool; + return ret; +} + + +LabelStmt* LabelStmt::New() { + auto ret = new (labelStmtPool.Alloc()) LabelStmt(); + ret->pool_ = &labelStmtPool; + return ret; +} + + +FuncDef* FuncDef::New(Identifier* ident, LabelStmt* retLabel) { + auto ret = new (funcDefPool.Alloc()) FuncDef(ident, retLabel); + ret->pool_ = &funcDefPool; + return ret; +} + + +bool Initializer::operator<(const Initializer& rhs) const { + if (offset_ < rhs.offset_) + return true; + return (offset_ == rhs.offset_ && bitFieldBegin_ < rhs.bitFieldBegin_); +} diff --git a/lib/lang/wgtcc/code_gen.cc b/lib/lang/wgtcc/code_gen.cc new file mode 100644 index 000000000..ca92d6e84 --- /dev/null +++ b/lib/lang/wgtcc/code_gen.cc @@ -0,0 +1,1561 @@ +#include "triton/lang/wgtcc/code_gen.h" + +#include 
"triton/lang/wgtcc/evaluator.h" +#include "triton/lang/wgtcc/parser.h" +#include "triton/lang/wgtcc/token.h" + +#include +#include +#include + + +extern std::string filename_in; +extern std::string filename_out; +extern bool debug; + +const std::string* Generator::last_file = nullptr; +Parser* Generator::parser_ = nullptr; +FILE* Generator::outFile_ = nullptr; +RODataList Generator::rodatas_; +std::vector Generator::staticDecls_; +int Generator::offset_ = 0; +int Generator::retAddrOffset_ = 0; +FuncDef* Generator::curFunc_ = nullptr; + + +/* + * Register usage: + * xmm0: accumulator of floating datas; + * xmm8: temp register for param passing(xmm0) + * xmm9: source operand register; + * xmm10: tmp register for floating data swap; + * rax: accumulator; + * r12, r13: temp register for rdx and rcx + * r11: source operand register; + * r10: base register when LValGenerator eval the address. + * rcx: tempvar register, like the tempvar of 'switch' + * temp register for struct copy + */ + +static std::vector regs { + "%rdi", "%rsi", "%rdx", + "%rcx", "%r8", "%r9" +}; + +static std::vector xregs { + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7" +}; + + +static ParamClass Classify(Type* paramType, int offset=0) { + if (paramType->IsInteger() || paramType->ToPointer() + || paramType->ToArray()) { + return ParamClass::INTEGER; + } + + if (paramType->ToArithm()) { + auto type = paramType->ToArithm(); + if (type->Tag() == T_FLOAT || type->Tag() == T_DOUBLE) + return ParamClass::SSE; + if (type->Tag() == (T_LONG | T_DOUBLE)) { + // TODO(wgtdkp): + return ParamClass::SSE; + assert(false); + return ParamClass::X87; + } + + // TODO(wgtdkp): + assert(false); + // It is complex + if ((type->Tag() & T_LONG) && (type->Tag() & T_DOUBLE)) + return ParamClass::COMPLEX_X87; + } + auto type = paramType->ToStruct(); + assert(type); + return ParamClass::MEMORY; + // TODO(wgtdkp): Support agrregate type + assert(false); + /* + auto type = paramType->ToStruct(); + assert(type); + + if (type->Width() > 4 * 8) + return PC_MEMORY; + + std::vector classes; + int cnt = (type->Width() + 7) / 8; + for (int i = 0; i < cnt; ++i) { + auto types = FieldsIn8Bytes(type, i); + assert(types.size() > 0); + + auto fieldClass = (types.size() == 1) + ? PC_NO_CLASS: FieldClass(types, 0); + classes.push_back(fieldClass); + + } + + bool sawX87 = false; + for (int i = 0; i < classes.size(); ++i) { + if (classes[i] == PC_MEMORY) + return PC_MEMORY; + if (classes[i] == PC_X87_UP && sawX87) + return PC_MEMORY; + if (classes[i] == PC_X87) + sawX87 = true; + } + */ + return ParamClass::NO_CLASS; // Make compiler happy +} + + +std::string Generator::ConsLabel(Constant* cons) { + if (cons->Type()->IsInteger()) { + return "$" + std::to_string(cons->IVal()); + } else if (cons->Type()->IsFloat()) { + double valsd = cons->FVal(); + float valss = valsd; + // TODO(wgtdkp): Add rodata + auto width = cons->Type()->Width(); + long val = (width == 4)? *reinterpret_cast(&valss): + *reinterpret_cast(&valsd); + const ROData& rodata = ROData(val, width); + rodatas_.push_back(rodata); + return rodata.label_; + } else { // Literal + const ROData& rodata = ROData(cons->SValRepr()); + rodatas_.push_back(rodata); + return rodata.label_; // Return address + } +} + + +static const char* GetLoad(int width, bool flt=false) { + switch (width) { + case 1: return "movzbq"; + case 2: return "movzwq"; + case 4: return !flt ? "movl": "movss"; + case 8: return !flt ? 
"movq": "movsd"; + default: assert(false); return nullptr; + } +} + + +static std::string GetInst(const std::string& inst, int width, bool flt) { + if (flt) { + return inst + (width == 4 ? "ss": "sd"); + } else { + switch (width) { + case 1: return inst + "b"; + case 2: return inst + "w"; + case 4: return inst + "l"; + case 8: return inst + "q"; + default: assert(false); + } + return inst; // Make compiler happy + } +} + + +static std::string GetInst(const std::string& inst, Type* type) { + assert(type->IsScalar()); + return GetInst(inst, type->Width(), type->IsFloat()); +} + + +static std::string GetReg(int width) { + switch (width) { + case 1: return "%al"; + case 2: return "%ax"; + case 4: return "%eax"; + case 8: return "%rax"; + default: assert(false); return ""; + } +} + + +static std::string GetDes(int width, bool flt) { + if (flt) { + return "%xmm0"; + } + return GetReg(width); +} + + +static std::string GetSrc(int width, bool flt) { + if (flt) { + return "%xmm9"; + } + switch (width) { + case 1: return "%r11b"; + case 2: return "%r11w"; + case 4: return "%r11d"; + case 8: return "%r11"; + default: assert(false); return ""; + } +} + + +// The 'reg' always be 8 bytes +int Generator::Push(const std::string& reg) { + offset_ -= 8; + auto mov = reg[1] == 'x' ? "movsd": "movq"; + Emit(mov, reg, ObjectAddr(offset_)); + return offset_; +} + + +int Generator::Push(Type* type) { + if (type->IsFloat()) { + return Push("%xmm0"); + } else if (type->IsScalar()) { + return Push("%rax"); + } else { + offset_ -= type->Width(); + offset_ = Type::MakeAlign(offset_, 8); + CopyStruct({"", "%rbp", offset_}, type->Width()); + return offset_; + } +} + + +// The 'reg' must be 8 bytes +int Generator::Pop(const std::string& reg) { + auto mov = reg[1] == 'x' ? "movsd": "movq"; + Emit(mov, ObjectAddr(offset_), reg); + offset_ += 8; + return offset_; +} + + +void Generator::Spill(bool flt) { + Push(flt ? "%xmm0": "%rax"); +} + + +void Generator::Restore(bool flt) { + const auto& src = GetSrc(8, flt); + const auto& des = GetDes(8, flt); + const auto& inst = GetInst("mov", 8, flt); + Emit(inst, des, src); + Pop(des); +} + + +void Generator::Save(bool flt) { + if (flt) { + Emit("movsd", "%xmm0", "%xmm9"); + } else { + Emit("movq", "%rax", "%r11"); + } +} + + +/* + * Operator/Instruction mapping: + * + add + * - sub + * * mul + * / div + * % div + * << sal + * >> sar + * | or + * & and + * ^ xor + * = mov + * < cmp, setl, movzbq + * > cmp, setg, movzbq + * <= cmp, setle, movzbq + * >= cmp, setle, movzbq + * == cmp, sete, movzbq + * != cmp, setne, movzbq + * && GenAndOp + * || GenOrOp + * ] GenSubScriptingOp + * . GenMemberRefOp + */ +void Generator::VisitBinaryOp(BinaryOp* binary) { + EmitLoc(binary); + auto op = binary->op_; + + if (op == '=') + return GenAssignOp(binary); + if (op == Token::LOGICAL_AND) + return GenAndOp(binary); + if (op == Token::LOGICAL_OR) + return GenOrOp(binary); + if (op == '.') + return GenMemberRefOp(binary); + if (op == ',') + return GenCommaOp(binary); + // Why lhs_->Type() ? 
+ // Because, the type of pointer subtraction is arithmetic type + if (binary->lhs_->Type()->ToPointer() && + (op == '+' || op == '-')) { + return GenPointerArithm(binary); + } + + // Careful: for compare operator, the type of the expression + // is always integer, while the type of lhs and rhs could be float + // After convertion, lhs and rhs always has the same type + auto type = binary->lhs_->Type(); + auto width = type->Width(); + auto flt = type->IsFloat(); + auto sign = !type->IsUnsigned(); + + Visit(binary->lhs_); + Spill(flt); + Visit(binary->rhs_); + Restore(flt); + + const char* inst = nullptr; + + switch (op) { + case '*': return GenMulOp(width, flt, sign); + case '/': case '%': return GenDivOp(flt, sign, width, op); + case '<': + return GenCompOp(width, flt, (flt || !sign) ? "setb": "setl"); + case '>': + return GenCompOp(width, flt, (flt || !sign) ? "seta": "setg"); + case Token::LE: + return GenCompOp(width, flt, (flt || !sign) ? "setbe": "setle"); + case Token::GE: + return GenCompOp(width, flt, (flt || !sign) ? "setae": "setge"); + case Token::EQ: + return GenCompOp(width, flt, "sete"); + case Token::NE: + return GenCompOp(width, flt, "setne"); + + case '+': inst = "add"; break; + case '-': inst = "sub"; break; + case '|': inst = "or"; break; + case '&': inst = "and"; break; + case '^': inst = "xor"; break; + case Token::LEFT: case Token::RIGHT: + inst = op == Token::LEFT ? "sal": (sign ? "sar": "shr"); + Emit("movq %r11, %rcx"); + Emit(GetInst(inst, width, flt), "%cl", GetDes(width, flt)); + return; + } + Emit(GetInst(inst, width, flt), GetSrc(width, flt), GetDes(width, flt)); +} + + +void Generator::GenCommaOp(BinaryOp* comma) { + VisitExpr(comma->lhs_); + VisitExpr(comma->rhs_); +} + + +void Generator::GenMulOp(int width, bool flt, bool sign) { + auto inst = flt ? "mul": (sign ? "imul": "mul"); + + if (flt) { + Emit(GetInst(inst, width, flt), "%xmm9", "%xmm0"); + } else { + Emit(GetInst(inst, width, flt), GetSrc(width, flt)); + } +} + + +void Generator::GenCompZero(Type* type) { + auto width = type->Width(); + auto flt = type->IsFloat(); + + if (!flt) { + Emit("cmp", "$0", GetReg(width)); + } else { + Emit("pxor", "%xmm9", "%xmm9"); + auto cmp = width == 8 ? 
"ucomisd": "ucomiss"; + Emit(cmp, "%xmm9", "%xmm0"); + } +} + + +void Generator::GenAndOp(BinaryOp* andOp) { + VisitExpr(andOp->lhs_); + GenCompZero(andOp->lhs_->Type()); + + auto labelFalse = LabelStmt::New(); + Emit("je", labelFalse); + + VisitExpr(andOp->rhs_); + GenCompZero(andOp->rhs_->Type()); + + Emit("je", labelFalse); + + Emit("movq", "$1", "%rax"); + auto labelTrue = LabelStmt::New(); + Emit("jmp", labelTrue); + EmitLabel(labelFalse->Repr()); + Emit("xorq", "%rax", "%rax"); // Set %rax to 0 + EmitLabel(labelTrue->Repr()); +} + + +void Generator::GenOrOp(BinaryOp* orOp) { + VisitExpr(orOp->lhs_); + GenCompZero(orOp->lhs_->Type()); + + auto labelTrue = LabelStmt::New(); + Emit("jne", labelTrue); + + VisitExpr(orOp->rhs_); + GenCompZero(orOp->rhs_->Type()); + + Emit("jne", labelTrue); + + Emit("xorq", "%rax", "%rax"); // Set %rax to 0 + auto labelFalse = LabelStmt::New(); + Emit("jmp", labelFalse); + EmitLabel(labelTrue->Repr()); + Emit("movq", "$1", "%rax"); + EmitLabel(labelFalse->Repr()); +} + + +void Generator::GenMemberRefOp(BinaryOp* ref) { + // As the lhs will always be struct/union + auto addr = LValGenerator().GenExpr(ref->lhs_); + const auto& name = ref->rhs_->Tok()->str_; + auto structType = ref->lhs_->Type()->ToStruct(); + auto member = structType->GetMember(name); + + addr.offset_ += member->Offset(); + + if (!ref->Type()->IsScalar()) { + Emit("leaq", addr, "%rax"); + } else { + if (member->BitFieldWidth()) { + EmitLoadBitField(addr.Repr(), member); + } else { + EmitLoad(addr.Repr(), ref->Type()); + } + } +} + + +void Generator::EmitLoadBitField(const std::string& addr, Object* bitField) { + auto type = bitField->Type()->ToArithm(); + assert(type && type->IsInteger()); + + EmitLoad(addr, type); + Emit("andq", Object::BitFieldMask(bitField), "%rax"); + + auto shiftRight = (type->Tag() & T_UNSIGNED) ? "shrq": "sarq"; + auto left = 64 - bitField->bitFieldBegin_ - bitField->bitFieldWidth_; + auto right = 64 - bitField->bitFieldWidth_; + Emit("salq", left, "%rax"); + Emit(shiftRight, right, "%rax"); +} + + +// FIXME(wgtdkp): for combined assignment operator, if the rvalue expr +// has some side-effect, the rvalue will be evaluated twice! 
+void Generator::GenAssignOp(BinaryOp* assign) { + // The base register of addr is %r10, %rip, %rbp + auto addr = LValGenerator().GenExpr(assign->lhs_); + // Base register of static object maybe %rip + // Visit rhs_ may changes r10 + if (addr.base_ == "%r10") + Push(addr.base_); + VisitExpr(assign->rhs_); + if (addr.base_ == "%r10") + Pop(addr.base_); + + if (assign->Type()->IsScalar()) { + EmitStore(addr, assign->Type()); + } else { + // struct/union type + // The address of rhs is in %rax + CopyStruct(addr, assign->Type()->Width()); + } +} + + +void Generator::EmitStoreBitField(const ObjectAddr& addr, Type* type) { + auto arithmType = type->ToArithm(); + assert(arithmType && arithmType->IsInteger()); + + // The value to be stored is in %rax now + auto mask = Object::BitFieldMask(addr.bitFieldBegin_, addr.bitFieldWidth_); + + Emit("salq", addr.bitFieldBegin_, "%rax"); + Emit("andq", mask, "%rax"); + Emit("movq", "%rax", "%r11"); + EmitLoad(addr.Repr(), arithmType); + Emit("andq", ~mask, "%rax"); + Emit("orq", "%r11", "%rax"); + + EmitStore(addr.Repr(), type); +} + + +void Generator::CopyStruct(ObjectAddr desAddr, int width) { + int units[] = {8, 4, 2, 1}; + Emit("movq", "%rax", "%rcx"); + ObjectAddr srcAddr = {"", "%rcx", 0}; + for (auto unit: units) { + while (width >= unit) { + EmitLoad(srcAddr.Repr(), unit, false); + EmitStore(desAddr.Repr(), unit, false); + desAddr.offset_ += unit; + srcAddr.offset_ += unit; + width -= unit; + } + } +} + + +void Generator::GenCompOp(int width, bool flt, const char* set) { + std::string cmp; + if (flt) { + cmp = width == 8 ? "ucomisd": "ucomiss"; + } else { + cmp = GetInst("cmp", width, flt); + } + + Emit(cmp, GetSrc(width, flt), GetDes(width, flt)); + Emit(set, "%al"); + Emit("movzbq", "%al", "%rax"); +} + + +void Generator::GenDivOp(bool flt, bool sign, int width, int op) { + if (flt) { + auto inst = width == 4 ? "divss": "divsd"; + Emit(inst, "%xmm9", "%xmm0"); + return; + } + if (!sign) { + Emit("xor", "%rdx", "%rdx"); + Emit(GetInst("div", width, flt), GetSrc(width, flt)); + } else { + Emit(width == 4 ? "cltd": "cqto"); + Emit(GetInst("idiv", width, flt), GetSrc(width, flt)); + } + if (op == '%') + Emit("movq", "%rdx", "%rax"); +} + + +void Generator::GenPointerArithm(BinaryOp* binary) { + assert(binary->op_ == '+' || binary->op_ == '-'); + // For '+', we have swapped lhs_ and rhs_ to ensure that + // the pointer is at lhs. + Visit(binary->lhs_); + Spill(false); + Visit(binary->rhs_); + Restore(false); + + auto type = binary->lhs_->Type()->ToPointer()->Derived(); + auto width = type->Width(); + if (binary->op_ == '+') { + if (width > 1) + Emit("imulq", width, "%r11"); + Emit("addq", "%r11", "%rax"); + } else { + Emit("subq", "%r11", "%rax"); + if (width > 1) { + Emit("movq", width, "%r11"); + GenDivOp(false, true, 8, '/'); + } + } +} + + +// Only objects Allocated on stack +void Generator::VisitObject(Object* obj) { + EmitLoc(obj); + auto addr = LValGenerator().GenExpr(obj).Repr(); + + if (!obj->Type()->IsScalar()) { + // Return the address of the object in rax + Emit("leaq", addr, "%rax"); + } else { + EmitLoad(addr, obj->Type()); + } +} + + +void Generator::GenCastOp(UnaryOp* cast) { + auto desType = cast->Type(); + auto srcType = cast->operand_->Type(); + + if (srcType->IsFloat() && desType->IsFloat()) { + if (srcType->Width() == desType->Width()) + return; + auto inst = srcType->Width() == 4 ? 
"cvtss2sd": "cvtsd2ss"; + Emit(inst, "%xmm0", "%xmm0"); + } else if (srcType->IsFloat()) { + // Handle bool + if (desType->IsBool()) { + Emit("pxor", "%xmm9", "%xmm9"); + GenCompOp(srcType->Width(), true, "setne"); + } else { + auto inst = srcType->Width() == 4 ? "cvttss2si": "cvttsd2si"; + Emit(inst, "%xmm0", "%rax"); + } + } else if (desType->IsFloat()) { + auto inst = desType->Width() == 4 ? "cvtsi2ss": "cvtsi2sd"; + Emit(inst, "%rax", "%xmm0"); + } else if (srcType->ToPointer() + || srcType->ToFunc() + || srcType->ToArray()) { + // Handle bool + if (desType->IsBool()) { + Emit("testq", "%rax", "%rax"); + Emit("setne", "%al"); + } + } else { + assert(srcType->ToArithm()); + int width = srcType->Width(); + auto sign = !srcType->IsUnsigned(); + const char* inst; + switch (width) { + case 1: + inst = sign ? "movsbq": "movzbq"; + Emit(inst, GetReg(width), "%rax"); + break; + case 2: + inst = sign ? "movswq": "movzwq"; + Emit(inst, GetReg(width), "%rax"); + break; + case 4: inst = "movl"; + if (desType->Width() == 8) + Emit("cltq"); + break; + case 8: break; + } + // Handle bool + if (desType->IsBool()) { + Emit("testq", "%rax", "%rax"); + Emit("setne", "%al"); + } + } +} + + +void Generator::VisitUnaryOp(UnaryOp* unary) { + EmitLoc(unary); + switch (unary->op_) { + case Token::PREFIX_INC: + return GenIncDec(unary->operand_, false, "add"); + case Token::PREFIX_DEC: + return GenIncDec(unary->operand_, false, "sub"); + case Token::POSTFIX_INC: + return GenIncDec(unary->operand_, true, "add"); + case Token::POSTFIX_DEC: + return GenIncDec(unary->operand_, true, "sub"); + case Token::ADDR: { + auto addr = LValGenerator().GenExpr(unary->operand_).Repr(); + Emit("leaq", addr, "%rax"); + } return; + case Token::DEREF: + return GenDerefOp(unary); + case Token::PLUS: + return VisitExpr(unary->operand_); + case Token::MINUS: + return GenMinusOp(unary); + case '~': + VisitExpr(unary->operand_); + return Emit("notq", "%rax"); + case '!': + VisitExpr(unary->operand_); + GenCompZero(unary->operand_->Type()); + Emit("sete", "%al"); + Emit("movzbl", "%al", "%eax"); // Type of !operator is int + return; + case Token::CAST: + Visit(unary->operand_); + GenCastOp(unary); + return; + default: assert(false); + } +} + + +void Generator::GenDerefOp(UnaryOp* deref) { + VisitExpr(deref->operand_); + if (deref->Type()->IsScalar()) { + ObjectAddr addr {"", "%rax", 0}; + EmitLoad(addr.Repr(), deref->Type()); + } else { + // Just let it go! 
+  }
+}
+
+
+void Generator::GenMinusOp(UnaryOp* minus) {
+  auto width = minus->Type()->Width();
+  auto flt = minus->Type()->IsFloat();
+
+  VisitExpr(minus->operand_);
+
+  if (flt) {
+    Emit("pxor", "%xmm9", "%xmm9");
+    Emit(GetInst("sub", width, flt), "%xmm0", "%xmm9");
+    Emit(GetInst("mov", width, flt), "%xmm9", "%xmm0");
+  } else {
+    Emit(GetInst("neg", width, flt), GetDes(width, flt));
+  }
+}
+
+
+void Generator::GenIncDec(Expr* operand,
+                          bool postfix,
+                          const std::string& inst) {
+  auto width = operand->Type()->Width();
+  auto flt = operand->Type()->IsFloat();
+
+  auto addr = LValGenerator().GenExpr(operand).Repr();
+  EmitLoad(addr, operand->Type());
+  if (postfix) Save(flt);
+
+  Constant* cons;
+  auto pointerType = operand->Type()->ToPointer();
+  if (pointerType) {
+    long width = pointerType->Derived()->Width();
+    cons = Constant::New(operand->Tok(), T_LONG, width);
+  } else if (operand->Type()->IsInteger()) {
+    cons = Constant::New(operand->Tok(), T_LONG, 1L);
+  } else {
+    if (width == 4)
+      cons = Constant::New(operand->Tok(), T_FLOAT, 1.0f);
+    else
+      cons = Constant::New(operand->Tok(), T_DOUBLE, 1.0);
+  }
+
+  Emit(GetInst(inst, operand->Type()), ConsLabel(cons), GetDes(width, flt));
+  EmitStore(addr, operand->Type());
+  if (postfix && flt) {
+    Emit("movsd", "%xmm9", "%xmm0");
+  } else if (postfix) {
+    Emit("mov", "%r11", "%rax");
+  }
+}
+
+
+void Generator::VisitConditionalOp(ConditionalOp* condOp) {
+  EmitLoc(condOp);
+  auto ifStmt = IfStmt::New(condOp->cond_,
+      condOp->exprTrue_, condOp->exprFalse_);
+  VisitIfStmt(ifStmt);
+}
+
+
+void Generator::VisitEnumerator(Enumerator* enumer) {
+  EmitLoc(enumer);
+  auto cons = Constant::New(enumer->Tok(), T_INT, (long)enumer->Val());
+  Visit(cons);
+}
+
+
+// The identifier must be a function
+void Generator::VisitIdentifier(Identifier* ident) {
+  EmitLoc(ident);
+  Emit("leaq", ident->Name(), "%rax");
+}
+
+
+void Generator::VisitConstant(Constant* cons) {
+  EmitLoc(cons);
+  auto label = ConsLabel(cons);
+
+  if (!cons->Type()->IsScalar()) {
+    Emit("leaq", label, "%rax");
+  } else {
+    auto width = cons->Type()->Width();
+    auto flt = cons->Type()->IsFloat();
+    auto load = GetInst("mov", width, flt);
+    auto des = GetDes(width, flt);
+    Emit(load, label, des);
+  }
+}
+
+
+// Use %ecx as temp register
+// TempVar is only used for condition expression of 'switch'
+// and struct copy
+void Generator::VisitTempVar(TempVar* tempVar) {
+  assert(tempVar->Type()->IsInteger());
+  Emit("movl", "%ecx", "%eax");
+}
+
+
+void Generator::VisitDeclaration(Declaration* decl) {
+  EmitLoc(decl->obj_);
+  auto obj = decl->obj_;
+
+  if (!obj->IsStatic()) {
+    // The object has no linkage and has
+    // no static storage (the object is on the stack).
+    // If it has no initializer,
+    // its value is left indeterminate.
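+    // e.g. for `struct { char c; int i; } s = {1, 2};` the loop below
+    // stores 1 at offset 0, zero-fills the three padding bytes, then
+    // stores 2 at offset 4, so holes between initializers are cleared.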
+ if (!obj->HasInit()) + return; + + int lastEnd = obj->Offset(); + for (const auto& init: decl->Inits()) { + ObjectAddr addr = ObjectAddr(obj->Offset() + init.offset_); + addr.bitFieldBegin_ = init.bitFieldBegin_; + addr.bitFieldWidth_ = init.bitFieldWidth_; + if (lastEnd != addr.offset_) + EmitZero(ObjectAddr(lastEnd), addr.offset_ - lastEnd); + VisitExpr(init.expr_); + if (init.type_->IsScalar()) { + EmitStore(addr, init.type_); + } else if (init.type_->ToStruct()) { + CopyStruct(addr, init.type_->Width()); + } else { + assert(false); + } + lastEnd = addr.offset_ + init.type_->Width(); + } + auto objEnd = obj->Offset() + obj->Type()->Width(); + if (lastEnd != objEnd) + EmitZero(ObjectAddr(lastEnd), objEnd - lastEnd); + return; + } + + if (obj->Linkage() == L_NONE) + staticDecls_.push_back(decl); + else + GenStaticDecl(decl); +} + + +void Generator::GenStaticDecl(Declaration* decl) { + auto obj = decl->obj_; + assert(obj->IsStatic()); + + const auto& label = obj->Repr(); + const auto width = obj->Type()->Width(); + const auto align = obj->Align(); + + // Omit the external without initilizer + if ((obj->Storage() & S_EXTERN) && !obj->HasInit()) + return; + + Emit(".data"); + auto glb = obj->Linkage() == L_EXTERNAL ? ".globl": ".local"; + Emit(glb, label); + + if (!obj->HasInit()) { + Emit(".comm", label + ", " + std::to_string(width) + + ", " + std::to_string(align)); + return; + } + + Emit(".align", std::to_string(align)); + Emit(".type", label, "@object"); + // Does not decide the size of obj + Emit(".size", label, std::to_string(width)); + EmitLabel(label); + + int offset = 0; + auto iter = decl->Inits().begin(); + for (; iter != decl->Inits().end();) { + auto staticInit = GetStaticInit(iter, + decl->Inits().end(), std::max(iter->offset_, offset)); + + if (staticInit.offset_ > offset) + Emit(".zero", std::to_string(staticInit.offset_ - offset)); + + switch (staticInit.width_) { + case 1: + Emit(".byte", std::to_string(static_cast(staticInit.val_))); + break; + case 2: + Emit(".value", std::to_string(static_cast(staticInit.val_))); + break; + case 4: + Emit(".long", std::to_string(static_cast(staticInit.val_))); + break; + case 8: { + std::string val; + if (staticInit.label_.size() == 0) { + val = std::to_string(staticInit.val_); + } else if (staticInit.val_ != 0) { + val = staticInit.label_ + "+" + std::to_string(staticInit.val_); + } else { + val = staticInit.label_; + } + Emit(".quad", val); + } break; + default: assert(false); + } + offset = staticInit.offset_ + staticInit.width_; + } + // Decides the size of object + if (width > offset) + Emit(".zero", std::to_string(width - offset)); +} + + +void Generator::VisitEmptyStmt(EmptyStmt* emptyStmt) { + assert(false); +} + + +void Generator::VisitIfStmt(IfStmt* ifStmt) { + VisitExpr(ifStmt->cond_); + + // Compare to 0 + auto elseLabel = LabelStmt::New(); + auto endLabel = LabelStmt::New(); + + GenCompZero(ifStmt->cond_->Type()); + + if (ifStmt->else_) { + Emit("je", elseLabel); + } else { + Emit("je", endLabel); + } + + VisitStmt(ifStmt->then_); + + if (ifStmt->else_) { + Emit("jmp", endLabel); + EmitLabel(elseLabel->Repr()); + VisitStmt(ifStmt->else_); + } + + EmitLabel(endLabel->Repr()); +} + + +void Generator::VisitJumpStmt(JumpStmt* jumpStmt) { + Emit("jmp", jumpStmt->label_); +} + + +void Generator::VisitLabelStmt(LabelStmt* labelStmt) { + EmitLabel(labelStmt->Repr()); +} + + +void Generator::VisitReturnStmt(ReturnStmt* returnStmt) { + auto expr = returnStmt->expr_; + if (expr) { // The return expr could be nil + Visit(expr); + if 
(expr->Type()->ToStruct()) { + // %rax now has the address of the struct/union + ObjectAddr addr = ObjectAddr(retAddrOffset_); + Emit("movq", addr, "%r11"); + addr = {"", "%r11", 0}; + CopyStruct(addr, expr->Type()->Width()); + Emit("movq", "%r11", "%rax"); + } + } + Emit("jmp", curFunc_->retLabel_); +} + + +class Comp { +public: + bool operator()(Object* lhs, Object* rhs) { + return lhs->Align() < rhs->Align(); + } +}; + + +void Generator::AllocObjects(Scope* scope, const FuncDef::ParamList& params) { + int offset = offset_; + + auto paramSet = std::set(params.begin(), params.end()); + std::priority_queue, Comp> heap; + for (auto iter = scope->begin(); iter != scope->end(); ++iter) { + auto obj = iter->second->ToObject(); + if (!obj || obj->IsStatic()) + continue; + if (paramSet.find(obj) != paramSet.end()) + continue; + heap.push(obj); + } + + while (!heap.empty()) { + auto obj = heap.top(); + heap.pop(); + + offset -= obj->Type()->Width(); + auto align = obj->Align(); + if (obj->Type()->ToArray()) { + // The alignment of an array is at least the aligment of a pointer + // (as it is always cast to a pointer) + align = std::min(align, 8); + } + offset = Type::MakeAlign(offset, align); + obj->SetOffset(offset); + } + + offset_ = offset; +} + + +void Generator::VisitCompoundStmt(CompoundStmt* compStmt) { + if (compStmt->scope_) { + AllocObjects(compStmt->scope_); + } + + for (auto stmt: compStmt->stmts_) { + Visit(stmt); + } +} + + +void Generator::GetParamRegOffsets(int& gpOffset, + int& fpOffset, + int& overflow, + FuncType* funcType) { + TypeList types; + for (auto param: funcType->Params()) + types.push_back(param->Type()); + auto locations = GetParamLocations(types, funcType->Derived()); + gpOffset = 0; + fpOffset = 48; + overflow = 16; + for (const auto& loc: locations.locs_) { + if (loc[1] == 'x') + fpOffset += 16; + else if (loc[1] == 'm') + overflow += 8; + else + gpOffset += 8; + } +} + + +void Generator::GenBuiltin(FuncCall* funcCall) { + struct va_list_imp { + unsigned int gp_offset; + unsigned int fp_offset; + void *overflow_arg_area; + void *reg_save_area; + }; + + auto ap = UnaryOp::New(Token::DEREF, funcCall->args_[0]); + auto addr = LValGenerator().GenExpr(ap); + auto type = funcCall->FuncType(); + + auto offset = offsetof(va_list_imp, reg_save_area); + addr.offset_ += offset; + const auto& saveAreaAddr = addr.Repr(); + addr.offset_ -= offset; + + offset = offsetof(va_list_imp, overflow_arg_area); + addr.offset_ += offset; + const auto& overflowAddr = addr.Repr(); + addr.offset_ -= offset; + + offset = offsetof(va_list_imp, gp_offset); + addr.offset_ += offset; + const auto& gpOffsetAddr = addr.Repr(); + addr.offset_ -= offset; + + offset = offsetof(va_list_imp, fp_offset); + addr.offset_ += offset; + const auto& fpOffsetAddr = addr.Repr(); + addr.offset_ -= offset; + + if (type == Parser::vaStartType_) { + Emit("leaq", "-176(%rbp)", "%rax"); + Emit("movq", "%rax", saveAreaAddr); + + int gpOffset, fpOffset, overflowOffset; + GetParamRegOffsets(gpOffset, fpOffset, + overflowOffset, curFunc_->FuncType()); + Emit("leaq", ObjectAddr(overflowOffset), "%rax"); + Emit("movq", "%rax", overflowAddr); + Emit("movl", gpOffset, "%eax"); + Emit("movl", "%eax", gpOffsetAddr); + Emit("movl", fpOffset, "%eax"); + Emit("movl", "%eax", fpOffsetAddr); + } else if (type == Parser::vaArgType_) { + static int cnt[2] = {0, 0}; + auto overflowLabel = ".L_va_arg_overflow" + std::to_string(++cnt[0]); + auto endLabel = ".L_va_arg_end" + std::to_string(++cnt[1]); + + auto argType = 
funcCall->args_[1]->Type()->ToPointer()->Derived(); + auto cls = Classify(argType.GetPtr()); + if (cls == ParamClass::INTEGER) { + Emit("movq", saveAreaAddr, "%rax"); + Emit("movq", "%rax", "%r11"); + Emit("movl", gpOffsetAddr, "%eax"); + Emit("cltq"); + Emit("cmpq", 48, "%rax"); + Emit("jae", overflowLabel); + Emit("addq", "%rax", "%r11"); + Emit("addq", 8, "%rax"); + Emit("movl", "%eax", gpOffsetAddr); + Emit("movq", "%r11", "%rax"); + Emit("jmp", endLabel); + } else if (cls == ParamClass::SSE) { + Emit("movq", saveAreaAddr, "%rax"); + Emit("movq", "%rax", "%r11"); + Emit("movl", fpOffsetAddr, "%eax"); + Emit("cltq"); + Emit("cmpq", 176, "%rax"); + Emit("jae", overflowLabel); + Emit("addq", "%rax", "%r11"); + Emit("addq", 16, "%rax"); + Emit("movl", "%eax", fpOffsetAddr); + Emit("movq", "%r11", "%rax"); + Emit("jmp", endLabel); + } else if (cls == ParamClass::MEMORY) { + } else { + Error("internal error"); + } + EmitLabel(overflowLabel); + Emit("movq", overflowAddr, "%rax"); + Emit("movq", "%rax", "%r11"); + // Arguments passed by memory is aligned by at least 8 bytes + Emit("addq", Type::MakeAlign(argType->Width(), 8), "%r11"); + Emit("movq", "%r11", overflowAddr); + EmitLabel(endLabel); + } else { + assert(false); + } +} + + +void Generator::VisitFuncCall(FuncCall* funcCall) { + EmitLoc(funcCall); + auto funcType = funcCall->FuncType(); + if (Parser::IsBuiltin(funcType)) + return GenBuiltin(funcCall); + + auto base = offset_; + // Alloc memory for return value if it is struct/union + int retStructOffset; + auto retType = funcCall->Type()->ToStruct(); + if (retType) { + retStructOffset = offset_; + retStructOffset -= retType->Width(); + retStructOffset = Type::MakeAlign(retStructOffset, retType->Align()); + // No!!! you can't suppose that the + // visition of arguments won't change the value of %rdi + //Emit("leaq %d(#rbp), #rdi", offset); + offset_ = retStructOffset; + } + + TypeList types; + for (auto arg: funcCall->args_) { + types.push_back(arg->Type()); + } + + const auto& locations = GetParamLocations(types, retType); + // Align stack frame by 16 bytes + const auto& locs = locations.locs_; + auto byMemCnt = locs.size() - locations.regCnt_ - locations.xregCnt_; + + offset_ = Type::MakeAlign(offset_ - byMemCnt * 8, 16) + byMemCnt * 8; + for (int i = locs.size() - 1; i >=0; --i) { + if (locs[i][1] == 'm') { + Visit(funcCall->args_[i]); + Push(funcCall->args_[i]->Type()); + } + } + + for (int i = locs.size() - 1; i >= 0; --i) { + if (locs[i][1] == 'm') + continue; + Visit(funcCall->args_[i]); + Push(funcCall->args_[i]->Type()); + } + + for (const auto& loc: locs) { + if (loc[1] != 'm') + Pop(loc); + } + + // If variadic, set %al to floating param number + if (funcType->Variadic()) { + Emit("movq", locations.xregCnt_, "%rax"); + } + if (retType) { + Emit("leaq", ObjectAddr(retStructOffset), "%rdi"); + } + + Emit("leaq", ObjectAddr(offset_), "%rsp"); + auto addr = LValGenerator().GenExpr(funcCall->Designator()); + if (addr.base_.size() == 0 && addr.offset_ == 0) { + Emit("call", addr.label_); + } else { + Emit("leaq", addr, "%r10"); + Emit("call", "*%r10"); + } + + // Reset stack frame + offset_ = base; +} + + +ParamLocations Generator::GetParamLocations(const TypeList& types, + bool retStruct) { + ParamLocations locations; + + locations.regCnt_ = retStruct; + locations.xregCnt_ = 0; + for (auto type: types) { + auto cls = Classify(type); + + const char* reg = nullptr; + if (cls == ParamClass::INTEGER) { + if (locations.regCnt_ < regs.size()) + reg = regs[locations.regCnt_++]; + } else 
if (cls == ParamClass::SSE) { + if (locations.xregCnt_ < xregs.size()) + reg = xregs[locations.xregCnt_++]; + } + locations.locs_.push_back(reg ? reg: "%mem"); + } + return locations; +} + + +void Generator::VisitFuncDef(FuncDef* funcDef) { + curFunc_ = funcDef; + + auto name = funcDef->Name(); + + Emit(".text"); + if (funcDef->Linkage() == L_INTERNAL) { + Emit(".local", name); + } else { + Emit(".globl", name); + } + Emit(".type", name, "@function"); + + EmitLabel(name); + Emit("pushq", "%rbp"); + Emit("movq", "%rsp", "%rbp"); + + offset_ = 0; + + auto& params = funcDef->FuncType()->Params(); + // Arrange space to store params passed by registers + bool retStruct = funcDef->FuncType()->Derived()->ToStruct(); + TypeList types; + for (auto param: params) + types.push_back(param->Type()); + + auto locations = GetParamLocations(types, retStruct); + const auto& locs = locations.locs_; + + if (funcDef->FuncType()->Variadic()) { + GenSaveArea(); // 'offset' is now the begin of save area + if (retStruct) { + retAddrOffset_ = offset_; + offset_ += 8; + } + int regOffset = offset_; + int xregOffset = offset_ + 48; + int byMemOffset = 16; + for (size_t i = 0; i < locs.size(); ++i) { + if (locs[i][1] == 'm') { + params[i]->SetOffset(byMemOffset); + + // TODO(wgtdkp): width of incomplete array ? + // What about the var args, var args offset always increment by 8 + //byMemOffset += 8; + byMemOffset += params[i]->Type()->Width(); + byMemOffset = Type::MakeAlign(byMemOffset, 8); + } else if (locs[i][1] == 'x') { + params[i]->SetOffset(xregOffset); + xregOffset += 16; + } else { + params[i]->SetOffset(regOffset); + regOffset += 8; + } + } + } else { + if (retStruct) { + retAddrOffset_ = Push("%rdi"); + } + int byMemOffset = 16; + for (size_t i = 0; i < locs.size(); ++i) { + if (locs[i][1] == 'm') { + params[i]->SetOffset(byMemOffset); + // TODO(wgtdkp): width of incomplete array ? 
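+        // e.g. for `long f(int a, double b, struct Big s)`: a (%rdi) and
+        // b (%xmm0) are spilled to the local frame below, while s already
+        // lives in the caller's frame, so memory-passed params start at
+        // 16(%rbp) (saved %rbp plus the return address).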
+ byMemOffset += params[i]->Type()->Width(); + byMemOffset = Type::MakeAlign(byMemOffset, 8); + continue; + } + params[i]->SetOffset(Push(locs[i])); + } + } + + AllocObjects(funcDef->Body()->Scope(), params); + + for (auto stmt: funcDef->body_->stmts_) { + Visit(stmt); + } + + EmitLabel(funcDef->retLabel_->Repr()); + Emit("leaveq"); + Emit("retq"); +} + + +void Generator::GenSaveArea() { + static const int begin = -176; + int offset = begin; + for (auto reg: regs) { + Emit("movq", reg, ObjectAddr(offset)); + offset += 8; + } + Emit("testb", "%al", "%al"); + auto label = LabelStmt::New(); + Emit("je", label); + for (auto xreg: xregs) { + Emit("movaps", xreg, ObjectAddr(offset)); + offset += 16; + } + assert(offset == 0); + EmitLabel(label->Repr()); + + offset_ = begin; +} + + +void Generator::VisitTranslationUnit(TranslationUnit* unit) { + for (auto extDecl: unit->ExtDecls()) { + Visit(extDecl); + + // Float and string literal + if (rodatas_.size()) + Emit(".section", ".rodata"); + for (auto rodata: rodatas_) { + if (rodata.align_ == 1) { // Literal + EmitLabel(rodata.label_); + Emit(".string", "\"" + rodata.sval_ + "\""); + } else if (rodata.align_ == 4) { + Emit(".align", "4"); + EmitLabel(rodata.label_); + Emit(".long", std::to_string(static_cast(rodata.ival_))); + } else { + Emit(".align", "8"); + EmitLabel(rodata.label_); + Emit(".quad", std::to_string(rodata.ival_)); + } + } + rodatas_.clear(); + + for (auto staticDecl: staticDecls_) { + GenStaticDecl(staticDecl); + } + staticDecls_.clear(); + } +} + + +void Generator::Gen() { + Emit(".file", "\"" + filename_in + "\""); + VisitTranslationUnit(parser_->Unit()); +} + + +void Generator::EmitLoc(Expr* expr) { + if (!debug) { + return; + } + + static int fileno = 0; + if (expr->tok_ == nullptr) { + return; + } + + const auto loc = &expr->tok_->loc_; + if (loc->filename_ != last_file) { + Emit(".file", std::to_string(++fileno) + " \"" + *loc->filename_ + "\""); + last_file = loc->filename_; + } + Emit(".loc", std::to_string(fileno) + " " + + std::to_string(loc->line_) + " 0"); + + std::string line; + for (const char* p = loc->lineBegin_; *p && *p != '\n'; ++p) + line.push_back(*p); + Emit("# " + line); +} + + +void Generator::EmitLoad(const std::string& addr, Type* type) { + assert(type->IsScalar()); + EmitLoad(addr, type->Width(), type->IsFloat()); +} + + +void Generator::EmitLoad(const std::string& addr, int width, bool flt) { + auto load = GetLoad(width, flt); + auto des = GetDes(width == 4 ? 
4: 8, flt); + Emit(load, addr, des); +} + + +void Generator::EmitStore(const ObjectAddr& addr, Type* type) { + if (addr.bitFieldWidth_ != 0) { + EmitStoreBitField(addr, type); + } else { + EmitStore(addr.Repr(), type); + } +} + + +void Generator::EmitStore(const std::string& addr, Type* type) { + EmitStore(addr, type->Width(), type->IsFloat()); +} + + +void Generator::EmitStore(const std::string& addr, int width, bool flt) { + auto store = GetInst("mov", width, flt); + auto des = GetDes(width, flt); + Emit(store, des, addr); +} + + +void Generator::EmitLabel(const std::string& label) { + fprintf(outFile_, "%s:\n", label.c_str()); +} + + +void Generator::EmitZero(ObjectAddr addr, int width) { + int units[] = {8, 4, 2, 1}; + Emit("xorq", "%rax", "%rax"); + for (auto unit: units) { + while (width >= unit) { + EmitStore(addr.Repr(), unit, false); + addr.offset_ += unit; + width -= unit; + } + } +} + + +void LValGenerator::VisitBinaryOp(BinaryOp* binary) { + EmitLoc(binary); + assert(binary->op_ == '.'); + + addr_ = LValGenerator().GenExpr(binary->lhs_); + const auto& name = binary->rhs_->Tok()->str_; + auto structType = binary->lhs_->Type()->ToStruct(); + auto member = structType->GetMember(name); + + addr_.offset_ += member->Offset(); + addr_.bitFieldBegin_ = member->bitFieldBegin_; + addr_.bitFieldWidth_ = member->bitFieldWidth_; +} + + +void LValGenerator::VisitUnaryOp(UnaryOp* unary) { + EmitLoc(unary); + assert(unary->op_ == Token::DEREF); + Generator().VisitExpr(unary->operand_); + Emit("movq", "%rax", "%r10"); + addr_ = {"", "%r10", 0}; +} + + +void LValGenerator::VisitObject(Object* obj) { + EmitLoc(obj); + if (!obj->IsStatic() && obj->Anonymous()) { + assert(obj->Decl()); + Generator().Visit(obj->Decl()); + obj->SetDecl(nullptr); + } + + if (obj->IsStatic()) { + addr_ = {obj->Repr(), "%rip", 0}; + } else { + addr_ = {"", "%rbp", obj->Offset()}; + } +} + + +// The identifier must be function +void LValGenerator::VisitIdentifier(Identifier* ident) { + assert(!ident->ToTypeName()); + EmitLoc(ident); + // Function address + addr_ = {ident->Name(), "", 0}; +} + + +void LValGenerator::VisitTempVar(TempVar* tempVar) { + std::string label; + switch (tempVar->Type()->Width()) { + case 1: label = "%cl"; break; + case 2: label = "%cx"; break; + case 4: label = "%ecx"; break; + case 8: label = "%rcx"; break; + default: assert(false); + } + addr_ = {label, "", 0}; +} + + +std::string ObjectAddr::Repr() const { + auto ret = base_.size() ? 
"(" + base_ + ")": ""; + if (label_.size() == 0) { + if (offset_ == 0) { + return ret; + } + return std::to_string(offset_) + ret; + } else { + if (offset_ == 0) { + return label_ + ret; + } + return label_ + "+" + std::to_string(offset_) + ret; + } +} + + +StaticInitializer Generator::GetStaticInit(InitList::iterator& iter, + InitList::iterator end, + int offset) { + auto init = iter++; + auto width = init->type_->Width(); + if (init->type_->IsInteger()) { + if (init->bitFieldWidth_ == 0) { + auto val = Evaluator().Eval(init->expr_); + return {init->offset_, width, val, ""}; + } + int totalBits = 0; + unsigned char val = 0; + while (init != end && init->offset_ <= offset && totalBits < 8) { + auto bitVal = Evaluator().Eval(init->expr_); + auto begin = init->bitFieldBegin_; + auto width = init->bitFieldWidth_; + auto valBegin = 0; + auto valWidth = 0; + auto mask = 0UL; + if (init->offset_ < offset) { + begin = 0; + width -= (8 - init->bitFieldBegin_); + if (offset - init->offset_ > 1) + width -= (offset - init->offset_ - 1) * 8; + valBegin = init->bitFieldWidth_ - width; + } + valWidth = std::min(static_cast(8 - begin), width); + mask = Object::BitFieldMask(valBegin, valWidth); + val |= ((bitVal & mask) >> valBegin) << begin; + totalBits = begin + valWidth; + if (width - valWidth <= 0) + ++init; + } + iter = init; + return {offset, 1, val, ""}; + } else if (init->type_->IsFloat()) { + auto val = Evaluator().Eval(init->expr_); + auto lval = *reinterpret_cast(&val); + return {init->offset_, width, lval, ""}; + } else if (init->type_->ToPointer()) { + auto addr = Evaluator().Eval(init->expr_); + return {init->offset_, width, addr.offset_, addr.label_}; + } else { // Struct initializer + Error(init->expr_, "initializer element is not constant"); + return StaticInitializer(); // Make compiler happy + } +} diff --git a/lib/lang/wgtcc/cpp.cc b/lib/lang/wgtcc/cpp.cc new file mode 100644 index 000000000..543bf3194 --- /dev/null +++ b/lib/lang/wgtcc/cpp.cc @@ -0,0 +1,886 @@ +#include "triton/lang/wgtcc/cpp.h" + +#include "triton/lang/wgtcc/evaluator.h" +#include "triton/lang/wgtcc/parser.h" + +#include +#include +#include +#include + + +extern std::string filename_in; +extern std::string filename_out; + +using DirectiveMap = std::unordered_map; + +static const DirectiveMap directiveMap { + {"if", Token::PP_IF}, + {"ifdef", Token::PP_IFDEF}, + {"ifndef", Token::PP_IFNDEF}, + {"elif", Token::PP_ELIF}, + {"else", Token::PP_ELSE}, + {"endif", Token::PP_ENDIF}, + {"include", Token::PP_INCLUDE}, + // Non-standard GNU extension + {"include_next", Token::PP_INCLUDE}, + {"define", Token::PP_DEFINE}, + {"undef", Token::PP_UNDEF}, + {"line", Token::PP_LINE}, + {"error", Token::PP_ERROR}, + {"pragma", Token::PP_PRAGMA} +}; + + +/* + * params: + * is: input token sequence + * os: output token sequence + */ +void Preprocessor::Expand(TokenSequence& os, TokenSequence is, bool inCond) { + Macro* macro = nullptr; + int direcitve; + while (!is.Empty()) { + UpdateFirstTokenLine(is); + auto tok = is.Peek(); + const auto& name = tok->str_; + + if ((direcitve = GetDirective(is)) != Token::INVALID) { + ParseDirective(os, is, direcitve); + } else if (!inCond && !NeedExpand()) { + // Discards the token + is.Next(); + } else if (inCond && name == "defined") { + is.Next(); + os.InsertBack(EvalDefOp(is)); + } else if (tok->hs_ && tok->hs_->find(name) != tok->hs_->end()) { + os.InsertBack(is.Next()); + } else if ((macro = FindMacro(name))) { + is.Next(); + + if (name == "__FILE__") { + HandleTheFileMacro(os, tok); + } else if 
(name == "__LINE__") { + HandleTheLineMacro(os, tok); + } else if (macro->ObjLike()) { + // Make a copy, as subst will change repSeq + auto repSeq = macro->RepSeq(tok->loc_.filename_, tok->loc_.line_); + + TokenList tokList; + TokenSequence repSeqSubsted(&tokList); + ParamMap paramMap; + // TODO(wgtdkp): hideset is not right + // Make a copy of hideset + // HS U {name} + auto hs = tok->hs_ ? *tok->hs_: HideSet(); + hs.insert(name); + Subst(repSeqSubsted, repSeq, tok->ws_, hs, paramMap); + is.InsertFront(repSeqSubsted); + } else if (is.Try('(')) { + ParamMap paramMap; + auto rpar = ParseActualParam(is, macro, paramMap); + auto repSeq = macro->RepSeq(tok->loc_.filename_, tok->loc_.line_); + TokenList tokList; + TokenSequence repSeqSubsted(&tokList); + + // (HS ^ HS') U {name} + // Use HS' U {name} directly + auto hs = rpar->hs_ ? *rpar->hs_: HideSet(); + hs.insert(name); + Subst(repSeqSubsted, repSeq, tok->ws_, hs, paramMap); + is.InsertFront(repSeqSubsted); + } else { + os.InsertBack(tok); + } + } else { + os.InsertBack(is.Next()); + } + } +} + + +static bool FindActualParam(TokenSequence& ap, + ParamMap& params, + const std::string& fp) { + auto res = params.find(fp); + if (res == params.end()) { + return false; + } + ap.Copy(res->second); + return true; +} + + +void Preprocessor::Subst(TokenSequence& os, + TokenSequence is, + bool leadingWS, + const HideSet& hs, + ParamMap& params) { + TokenSequence ap; + + while (!is.Empty()) { + if (is.Test('#') && FindActualParam(ap, params, is.Peek2()->str_)) { + is.Next(); is.Next(); + auto tok = Stringize(ap); + os.InsertBack(tok); + } else if (is.Test(Token::DSHARP) && + FindActualParam(ap, params, is.Peek2()->str_)) { + is.Next(); is.Next(); + if (!ap.Empty()) + Glue(os, ap); + } else if (is.Test(Token::DSHARP)) { + is.Next(); + auto tok = is.Next(); + Glue(os, tok); + } else if (is.Peek2()->tag_ == Token::DSHARP && + FindActualParam(ap, params, is.Peek()->str_)) { + is.Next(); + + if (ap.Empty()) { + is.Next(); + if (FindActualParam(ap, params, is.Peek()->str_)) { + is.Next(); + os.InsertBack(ap); + } + } else { + os.InsertBack(ap); + } + } else if (FindActualParam(ap, params, is.Peek()->str_)) { + auto tok = is.Next(); + const_cast(ap.Peek())->ws_ = tok->ws_; + Expand(os, ap); + } else { + os.InsertBack(is.Peek()); + is.Next(); + } + } + + os.FinalizeSubst(leadingWS, hs); +} + + +void Preprocessor::Glue(TokenSequence& os, const Token* tok) { + TokenList tokList {tok}; + TokenSequence is(&tokList); + Glue(os, is); +} + + +void Preprocessor::Glue(TokenSequence& os, TokenSequence is) { + auto lhs = os.Back(); + auto rhs = is.Peek(); + + auto str = new std::string(lhs->str_ + rhs->str_); + TokenSequence ts; + Scanner scanner(str, lhs->loc_); + scanner.Tokenize(ts); + + is.Next(); + + if (ts.Empty()) { + // TODO(wgtdkp): + // No new Token generated + // How to handle it??? 
+ } else { + os.PopBack(); + auto newTok = const_cast(ts.Next()); + newTok->ws_ = lhs->ws_; + newTok->hs_ = lhs->hs_; + os.InsertBack(newTok); + } + + if (!ts.Empty()) { + Error(lhs, "macro expansion failed: cannot concatenate"); + } + + os.InsertBack(is); +} + + +/* + * This is For the '#' operator in func-like macro + */ +const Token* Preprocessor::Stringize(TokenSequence is) { + std::string str = "\""; + while (!is.Empty()) { + auto tok = is.Next(); + // Have preceding white space + // and is not the first token of the sequence + str.append(tok->ws_ && str.size() > 1, ' '); + if (tok->tag_ == Token::LITERAL || tok->tag_ == Token::C_CONSTANT) { + for (auto c: tok->str_) { + if (c == '"' || c == '\\') + str.push_back('\\'); + str.push_back(c); + } + } else { + str += tok->str_; + } + } + str.push_back('\"'); + + auto ret = Token::New(*is.Peek()); + ret->tag_ = Token::LITERAL; + ret->str_ = str; + return ret; +} + + +void Preprocessor::Finalize(TokenSequence os) { + while (!os.Empty()) { + auto tok = os.Next(); + if (tok->tag_ == Token::INVALID) { + Error(tok, "stray token in program"); + } else if (tok->tag_ == Token::IDENTIFIER) { + auto tag = Token::KeyWordTag(tok->str_); + if (Token::IsKeyWord(tag)) { + const_cast(tok)->tag_ = tag; + } else { + const_cast(tok)->str_ = Scanner(tok).ScanIdentifier(); + } + } + if (fName_ && !tok->loc_.filename_) { + assert(false); + } + } +} + + +// TODO(wgtdkp): add predefined macros +void Preprocessor::Process(TokenSequence& os) { + TokenSequence is; + // Add source file + if(fName_) + IncludeFile(is, fName_); + else + IncludeSrc(is, fSrc_, nullptr); + // Expand + Expand(os, is); + Finalize(os); +} + + +const Token* Preprocessor::ParseActualParam(TokenSequence& is, + Macro* macro, + ParamMap& paramMap) { + const Token* ret; + if (macro->Params().size() == 0 && !macro->Variadic()) { + ret = is.Next(); + if (ret->tag_ != ')') + Error(ret, "too many arguments"); + return ret; + } + + auto fp = macro->Params().begin(); + TokenSequence ap; + + int cnt = 1; + while (cnt > 0) { + if (is.Empty()) + Error(is.Peek(), "premature end of input"); + else if (is.Test('(')) + ++cnt; + else if (is.Test(')')) + --cnt; + + if ((is.Test(',') && cnt == 1) || cnt == 0) { + + if (fp == macro->Params().end()) { + if (!macro->Variadic()) + Error(is.Peek(), "too many arguments"); + if (cnt == 0) + paramMap.insert(std::make_pair("__VA_ARGS__", ap)); + else + ap.InsertBack(is.Peek()); + } else { + paramMap.insert(std::make_pair(*fp, ap)); + ap = TokenSequence(); + ++fp; + } + } else { + ap.InsertBack(is.Peek()); + } + ret = is.Next(); + } + + if (fp != macro->Params().end()) + Error(is.Peek(), "too few params"); + return ret; +} + + +const Token* Preprocessor::EvalDefOp(TokenSequence& is) { + auto hasPar = is.Try('('); + auto macro = is.Expect(Token::IDENTIFIER); + auto cons = Token::New(*macro); + if (hasPar) is.Expect(')'); + cons->tag_ = Token::I_CONSTANT; + cons->str_ = FindMacro(macro->str_) ? 
"1": "0"; + return cons; +} + + +void Preprocessor::ReplaceIdent(TokenSequence& is) { + TokenSequence os; + while (!is.Empty()) { + auto tok = is.Next(); + if (tok->tag_ == Token::IDENTIFIER) { + auto cons = Token::New(*tok); + cons->tag_ = Token::I_CONSTANT; + cons->str_ = "0"; + os.InsertBack(cons); + } else { + os.InsertBack(tok); + } + } + is = os; +} + + +int Preprocessor::GetDirective(TokenSequence& is) { + if (!is.Test('#') || !is.IsBeginOfLine()) + return Token::INVALID; + + is.Next(); + if (is.IsBeginOfLine()) + return Token::PP_EMPTY; + + auto tag = is.Peek()->tag_; + if (tag == Token::IDENTIFIER || Token::IsKeyWord(tag)) { + auto str = is.Peek()->str_; + auto res = directiveMap.find(str); + if (res == directiveMap.end()) + return Token::PP_NONE; + return res->second; + } + return Token::PP_NONE; +} + + +void Preprocessor::ParseDirective(TokenSequence& os, + TokenSequence& is, + int directive) { + if (directive == Token::PP_EMPTY) + return; + auto ls = is.GetLine(); + switch(directive) { + case Token::PP_IF: + ParseIf(ls); break; + case Token::PP_IFDEF: + ParseIfdef(ls); break; + case Token::PP_IFNDEF: + ParseIfndef(ls); break; + case Token::PP_ELIF: + ParseElif(ls); break; + case Token::PP_ELSE: + ParseElse(ls); break; + case Token::PP_ENDIF: + ParseEndif(ls); break; + case Token::PP_INCLUDE: + if (NeedExpand()) + ParseInclude(is, ls); + break; + case Token::PP_DEFINE: + if (NeedExpand()) + ParseDef(ls); + break; + case Token::PP_UNDEF: + if (NeedExpand()) + ParseUndef(ls); + break; + case Token::PP_LINE: + if (NeedExpand()) + ParseLine(ls); + break; + case Token::PP_ERROR: + if (NeedExpand()) + ParseError(ls); + break; + case Token::PP_PRAGMA: + if (NeedExpand()) + ParsePragma(ls); + break; + case Token::PP_NONE: + break; + default: + assert(false); + } +} + + +void Preprocessor::ParsePragma(TokenSequence ls) { + // TODO(wgtdkp): + ls.Next(); +} + + +void Preprocessor::ParseError(TokenSequence ls) { + ls.Next(); + const auto& literal = Stringize(ls); + std::string msg; + Scanner(literal).ScanLiteral(msg); + Error(ls.Peek(), "%s", msg.c_str()); +} + + +void Preprocessor::ParseLine(TokenSequence ls) { + auto directive = ls.Next(); // Skip directive 'line' + TokenSequence ts; + Expand(ts, ls); + auto tok = ts.Expect(Token::I_CONSTANT); + + int line = 0; + size_t end = 0; + try { + line = stoi(tok->str_, &end, 10); + } catch (const std::out_of_range& oor) { + Error(tok, "line number out of range"); + } + if (line == 0 || end != tok->str_.size()) { + Error(tok, "illegal line number"); + } + + curLine_ = line; + lineLine_ = directive->loc_.line_; + if (ts.Empty()) + return; + tok = ts.Expect(Token::LITERAL); + + // Enusure "s-char-sequence" + if (tok->str_.front() != '"' || tok->str_.back() != '"') { + Error(tok, "expect s-char-sequence"); + } +} + + +void Preprocessor::ParseIf(TokenSequence ls) { + if (!NeedExpand()) { + ppCondStack_.push({Token::PP_IF, false, false}); + return; + } + + auto tok = ls.Next(); // Skip the directive + + if (ls.Empty()) { + Error(tok, "expect expression in 'if' directive"); + } + + TokenSequence ts; + Expand(ts, ls, true); + ReplaceIdent(ts); + + Parser parser(ts); + auto expr = parser.ParseExpr(); + if (!parser.ts().Empty()) { + Error(parser.ts().Peek(), "unexpected extra expression"); + } + bool cond; + if (expr->Type()->IsFloat()) { + cond = static_cast(Evaluator().Eval(expr)); + } else { + cond = static_cast(Evaluator().Eval(expr)); + } + ppCondStack_.push({Token::PP_IF, NeedExpand(), cond}); +} + + +void Preprocessor::ParseIfdef(TokenSequence ls) { 
+ if (!NeedExpand()) { + ppCondStack_.push({Token::PP_IFDEF, false, false}); + return; + } + + ls.Next(); + auto ident = ls.Expect(Token::IDENTIFIER); + if (!ls.Empty()) { + Error(ls.Peek(), "expect new line"); + } + + auto cond = FindMacro(ident->str_) != nullptr; + ppCondStack_.push({Token::PP_IFDEF, NeedExpand(), cond}); +} + + +void Preprocessor::ParseIfndef(TokenSequence ls) { + ParseIfdef(ls); + auto top = ppCondStack_.top(); + ppCondStack_.pop(); + top.tag_ = Token::PP_IFNDEF; + top.cond_ = !top.cond_; + + ppCondStack_.push(top); +} + + +void Preprocessor::ParseElif(TokenSequence ls) { + auto directive = ls.Next(); // Skip the directive + + if (ppCondStack_.empty()) + Error(directive, "unexpected 'elif' directive"); + auto top = ppCondStack_.top(); + if (top.tag_ == Token::PP_ELSE) + Error(directive, "unexpected 'elif' directive"); + + while (!ppCondStack_.empty()) { + top = ppCondStack_.top(); + if (top.tag_ == Token::PP_IF || + top.tag_ == Token::PP_IFDEF || + top.tag_ == Token::PP_IFNDEF || + top.cond_) { + break; + } + ppCondStack_.pop(); + } + if (ppCondStack_.empty()) + Error(directive, "unexpected 'elif' directive"); + auto enabled = top.enabled_; + if (!enabled) { + ppCondStack_.push({Token::PP_ELIF, false, false}); + return; + } + + if (ls.Empty()) { + Error(ls.Peek(), "expect expression in 'elif' directive"); + } + + TokenSequence ts; + Expand(ts, ls, true); + ReplaceIdent(ts); + + Parser parser(ts); + auto expr = parser.ParseExpr(); + if (!parser.ts().Empty()) { + Error(parser.ts().Peek(), "unexpected extra expression"); + } + bool cond; + if (expr->Type()->IsFloat()) { + std::cout << Evaluator().Eval(expr) << std::endl; + cond = static_cast(Evaluator().Eval(expr)); + } else { + cond = static_cast(Evaluator().Eval(expr)); + } + cond = cond && !top.cond_; + ppCondStack_.push({Token::PP_ELIF, true, cond}); +} + + +void Preprocessor::ParseElse(TokenSequence ls) { + auto directive = ls.Next(); + if (!ls.Empty()) + Error(ls.Peek(), "expect new line"); + + if (ppCondStack_.empty()) + Error(directive, "unexpected 'else' directive"); + auto top = ppCondStack_.top(); + if (top.tag_ == Token::PP_ELSE) + Error(directive, "unexpected 'else' directive"); + + while (!ppCondStack_.empty()) { + top = ppCondStack_.top(); + if (top.tag_ == Token::PP_IF || + top.tag_ == Token::PP_IFDEF || + top.tag_ == Token::PP_IFNDEF || + top.cond_) { + break; + } + ppCondStack_.pop(); + } + if (ppCondStack_.empty()) + Error(directive, "unexpected 'else' directive"); + + auto cond = !top.cond_; + auto enabled = top.enabled_; + ppCondStack_.push({Token::PP_ELSE, enabled, cond}); +} + + +void Preprocessor::ParseEndif(TokenSequence ls) { + auto directive = ls.Next(); + if (!ls.Empty()) + Error(ls.Peek(), "expect new line"); + + while ( !ppCondStack_.empty()) { + auto top = ppCondStack_.top(); + ppCondStack_.pop(); + + if (top.tag_ == Token::PP_IF + || top.tag_ == Token::PP_IFDEF + || top.tag_ == Token::PP_IFNDEF) { + return; + } + } + + if (ppCondStack_.empty()) + Error(directive, "unexpected 'endif' directive"); +} + + +// Have Read the '#' +void Preprocessor::ParseInclude(TokenSequence& is, TokenSequence ls) { + bool next = ls.Next()->str_ == "include_next"; // Skip 'include' + if (!ls.Test(Token::LITERAL) && !ls.Test('<')) { + TokenSequence ts; + Expand(ts, ls, true); + ls = ts; + } + + auto tok = ls.Next(); + if (tok->tag_ == Token::LITERAL) { + if (!ls.Empty()) { + Error(ls.Peek(), "expect new line"); + } + std::string filename; + Scanner(tok).ScanLiteral(filename); + auto fullPath = 
SearchFile(filename, false, next, *tok->loc_.filename_); + if (fullPath == nullptr) + Error(tok, "%s: No such file or directory", filename.c_str()); + + IncludeFile(is, fullPath); + } else if (tok->tag_ == '<') { + auto lhs = tok; + auto rhs = tok; + int cnt = 1; + while (!(rhs = ls.Next())->IsEOF()) { + if (rhs->tag_ == '<') + ++cnt; + else if (rhs->tag_ == '>') + --cnt; + if (cnt == 0) + break; + } + if (cnt != 0) + Error(rhs, "expect '>'"); + if (!ls.Empty()) + Error(ls.Peek(), "expect new line"); + + const auto& filename = Scanner::ScanHeadName(lhs, rhs); + auto fullPath = SearchFile(filename, true, next, *tok->loc_.filename_); + if (fullPath == nullptr) { + Error(tok, "%s: No such file or directory", filename.c_str()); + } + IncludeFile(is, fullPath); + } else { + Error(tok, "expect filename(string or in '<>')"); + } +} + + +void Preprocessor::ParseUndef(TokenSequence ls) { + ls.Next(); // Skip directive + + auto ident = ls.Expect(Token::IDENTIFIER); + if (!ls.Empty()) + Error(ls.Peek(), "expect new line"); + + RemoveMacro(ident->str_); +} + + +void Preprocessor::ParseDef(TokenSequence ls) { + ls.Next(); + auto ident = ls.Expect(Token::IDENTIFIER); + if (ident->str_ == "defined") { + Error(ident, "'defined' cannot be used as a macro name"); + } + auto tok = ls.Peek(); + if (tok->tag_ == '(' && !tok->ws_) { + // There is no white space between ident and '(' + // Hence, we are defining function-like macro + + // Parse Identifier list + ls.Next(); // Skip '(' + ParamList params; + auto variadic = ParseIdentList(params, ls); + const auto& macro = Macro(variadic, params, ls); + AddMacro(ident->str_, macro); + } else { + AddMacro(ident->str_, Macro(ls)); + } +} + + +bool Preprocessor::ParseIdentList(ParamList& params, TokenSequence& is) { + const Token* tok = is.Peek(); + while (!is.Empty()) { + tok = is.Next(); + if (tok->tag_ == ')') { + return false; + } else if (tok->tag_ == Token::ELLIPSIS) { + is.Expect(')'); + return true; + } else if (tok->tag_ != Token::IDENTIFIER) { + Error(tok, "expect identifier"); + } + + for (const auto& param: params) { + if (param == tok->str_) + Error(tok, "duplicated param"); + } + params.push_back(tok->str_); + + if (!is.Try(',')) { + is.Expect(')'); + return false; + } + } + + Error(tok, "unexpected end of line"); +} + +void Preprocessor::IncludeSrc(TokenSequence& is, + const std::string* text, + const std::string* filename) { + TokenSequence ts {is.tokList_, is.begin_, is.begin_}; + Scanner scanner(text, filename); + scanner.Tokenize(ts); + + // We done including header file + is.begin_ = ts.begin_; +} + +void Preprocessor::IncludeFile(TokenSequence& is, + const std::string* filename) { + IncludeSrc(is, ReadFile(*filename), filename); +} + + +static std::string GetDir(const std::string& path) { + auto pos = path.rfind('/'); + if (pos == std::string::npos) + return "./"; + return path.substr(0, pos + 1); +} + + +std::string* Preprocessor::SearchFile(const std::string& name, + const bool libHeader, + bool next, + const std::string& curPath) { + if (libHeader && !next) { + searchPaths_.push_back(GetDir(curPath)); + } else { + searchPaths_.push_front(GetDir(curPath)); + } + + auto iter = searchPaths_.begin(); + for (; iter != searchPaths_.end(); ++iter) { + auto dd = open(iter->c_str(), O_RDONLY); + if (dd == -1) // TODO(wgtdkp): or ensure it before preprocessing + continue; + auto fd = openat(dd, name.c_str(), O_RDONLY); + close(dd); + if (fd != -1) { + // Intentional, so that recursive include + // will result in running out of file descriptor + 
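+      // E.g. a header that includes itself: every expansion reopens the
+      // file here without closing it, so the process eventually exhausts
+      // its fd limit and openat() fails with EMFILE (reported below).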
//close(fd);
+      auto path = *iter + name;
+      if (next) {
+        if (path != curPath)
+          continue;
+        else
+          next = false;
+      } else {
+        if (path == curPath)
+          continue;
+        if (libHeader && !next)
+          searchPaths_.pop_back();
+        else
+          searchPaths_.pop_front();
+        return new std::string(path);
+      }
+    } else if (errno == EMFILE) {
+      Error("possibly recursive #include");
+    }
+  }
+  return nullptr;
+}
+
+
+void Preprocessor::AddMacro(const std::string& name,
+                            std::string* text,
+                            bool preDef) {
+  TokenSequence ts;
+  Scanner scanner(text);
+  scanner.Tokenize(ts);
+  Macro macro(ts, preDef);
+
+  AddMacro(name, macro);
+}
+
+
+static std::string* Date() {
+  time_t t = time(NULL);
+  struct tm* tm = localtime(&t);
+  char buf[14];
+  // __DATE__ expands to a string of the form "Mmm dd yyyy"
+  strftime(buf, sizeof buf, "\"%b %e %Y\"", tm);
+  return new std::string(buf);
+}
+
+
+void Preprocessor::Init() {
+  // Preinclude search paths
+  AddSearchPath("/usr/local/include/");
+  AddSearchPath("/usr/include/x86_64-linux-gnu/");
+  AddSearchPath("/usr/include/linux/");
+  AddSearchPath("/usr/include/");
+  AddSearchPath("/usr/local/wgtcc/include/");
+
+  // The __FILE__ and __LINE__ macros have an empty replacement list;
+  // they are handled separately
+  AddMacro("__FILE__", Macro(TokenSequence(), true));
+  AddMacro("__LINE__", Macro(TokenSequence(), true));
+
+  AddMacro("__DATE__", Date(), true);
+  AddMacro("__STDC__", new std::string("1"), true);
+  AddMacro("__STDC_HOSTED__", new std::string("0"), true);
+  AddMacro("__STDC_VERSION__", new std::string("201112L"), true);
+}
+
+
+void Preprocessor::HandleTheFileMacro(TokenSequence& os, const Token* macro) {
+  auto file = Token::New(*macro);
+  file->tag_ = Token::LITERAL;
+  file->str_ = "\"" + *macro->loc_.filename_ + "\"";
+  os.InsertBack(file);
+}
+
+
+void Preprocessor::HandleTheLineMacro(TokenSequence& os, const Token* macro) {
+  auto line = Token::New(*macro);
+  line->tag_ = Token::I_CONSTANT;
+  line->str_ = std::to_string(macro->loc_.line_);
+  os.InsertBack(line);
+}
+
+
+void Preprocessor::UpdateFirstTokenLine(TokenSequence ts) {
+  auto loc = ts.Peek()->loc_;
+  loc.line_ = curLine_ + loc.line_ - lineLine_ - 1;
+  ts.UpdateHeadLocation(loc);
+}
+
+
+TokenSequence Macro::RepSeq(const std::string* filename, unsigned line) {
+  // Update line
+  TokenList tl;
+  TokenSequence ret(&tl);
+  ret.Copy(repSeq_);
+  auto ts = ret;
+  while (!ts.Empty()) {
+    auto loc = ts.Peek()->loc_;
+    loc.filename_ = filename;
+    loc.line_ = line;
+    ts.UpdateHeadLocation(loc);
+    ts.Next();
+  }
+  return ret;
+}
+
+
+void Preprocessor::AddSearchPath(std::string path) {
+  if (path.back() != '/')
+    path += "/";
+  if (path[0] != '/')
+    path = "./" + path;
+  searchPaths_.push_front(path);
+}
diff --git a/lib/lang/wgtcc/encoding.cc b/lib/lang/wgtcc/encoding.cc
new file mode 100644
index 000000000..d5d1f99d1
--- /dev/null
+++ b/lib/lang/wgtcc/encoding.cc
@@ -0,0 +1,42 @@
+#include "triton/lang/wgtcc/encoding.h"
+
+#include <climits>
+#include <codecvt>
+#include <locale>
+#include <string>
+
+
+static void Append16LE(std::string& str, char16_t c) {
+  str.push_back(c & UCHAR_MAX);
+  str.push_back((c >> 8) & UCHAR_MAX);
+}
+
+
+static void Append32LE(std::string& str, char32_t c) {
+  Append16LE(str, c & USHRT_MAX);
+  Append16LE(str, (c >> 16) & USHRT_MAX);
+}
+
+
+void ConvertToUTF16(std::string& str) {
+  std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> utf8_ucs2_cvt;
+  auto str16 = utf8_ucs2_cvt.from_bytes(str);
+  str.resize(0);
+  for (auto c16: str16)
+    Append16LE(str, c16);
+}
+
+
+void ConvertToUTF32(std::string& str) {
+  std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> utf8_ucs4_cvt;
+  auto str32 = utf8_ucs4_cvt.from_bytes(str);
+  str.resize(0);
+  for (auto c32:
str32) + Append32LE(str, c32); +} + + +void AppendUCN(std::string& str, int c) { + std::wstring_convert, char32_t> utf8_ucs4_cvt; + str += utf8_ucs4_cvt.to_bytes(static_cast(c)); +} diff --git a/lib/lang/wgtcc/error.cc b/lib/lang/wgtcc/error.cc new file mode 100644 index 000000000..618a83181 --- /dev/null +++ b/lib/lang/wgtcc/error.cc @@ -0,0 +1,95 @@ +#include "triton/lang/wgtcc/error.h" + +#include "triton/lang/wgtcc/ast.h" +#include "triton/lang/wgtcc/token.h" + +#include +#include +#include +#include + + +#define ANSI_COLOR_RED "\x1b[31m" +#define ANSI_COLOR_GREEN "\x1b[32m" +#define ANSI_COLOR_YELLOW "\x1b[33m" +#define ANSI_COLOR_BLUE "\x1b[34m" +#define ANSI_COLOR_MAGENTA "\x1b[35m" +#define ANSI_COLOR_CYAN "\x1b[36m" +#define ANSI_COLOR_RESET "\x1b[0m" + + +extern std::string program; + + +void Error(const char* format, ...) { + fprintf(stderr, + "%s: " ANSI_COLOR_RED "error: " ANSI_COLOR_RESET, + program.c_str()); + + va_list args; + va_start(args, format); + vfprintf(stderr, format, args); + va_end(args); + + fprintf(stderr, "\n"); + + exit(-1); +} + + +[[noreturn]] +static void VError(const SourceLocation& loc, + const char* format, + va_list args) { + const char* filename = nullptr; + if(loc.filename_) + filename = loc.filename_->c_str(); + fprintf(stderr, + "%s:%d:%d: " ANSI_COLOR_RED "error: " ANSI_COLOR_RESET, + filename, + loc.line_, + loc.column_); + vfprintf(stderr, format, args); + fprintf(stderr, "\n "); + + bool sawNoSpace = false; + int nspaces = 0; + for (auto p = loc.lineBegin_; *p != '\n' && *p != 0; p++) { + if (!sawNoSpace && (*p == ' ' || *p == '\t')) { + ++nspaces; + } else { + sawNoSpace = true; + fputc(*p, stderr); + } + } + + fprintf(stderr, "\n "); + for (unsigned i = 1; i + nspaces < loc.column_; ++i) + fputc(' ', stderr); + fprintf(stderr, ANSI_COLOR_GREEN "^\n"); + exit(-1); +} + + +void Error(const SourceLocation& loc, const char* format, ...) { + va_list args; + va_start(args, format); + VError(loc, format, args); + va_end(args); +} + + +void Error(const Token* tok, const char* format, ...) { + va_list args; + va_start(args, format); + VError(tok->loc_, format, args); + va_end(args); +} + + +void Error(const Expr* expr, const char* format, ...) 
{
+  va_list args;
+  va_start(args, format);
+  VError(expr->Tok()->loc_, format, args);
+  va_end(args);
+}
diff --git a/lib/lang/wgtcc/evaluator.cc b/lib/lang/wgtcc/evaluator.cc
new file mode 100644
index 000000000..956fe21a6
--- /dev/null
+++ b/lib/lang/wgtcc/evaluator.cc
@@ -0,0 +1,210 @@
+#include "triton/lang/wgtcc/evaluator.h"
+
+#include "triton/lang/wgtcc/ast.h"
+#include "triton/lang/wgtcc/code_gen.h"
+#include "triton/lang/wgtcc/token.h"
+
+
+template<typename T>
+void Evaluator<T>::VisitBinaryOp(BinaryOp* binary) {
+#define L   Evaluator<T>().Eval(binary->lhs_)
+#define R   Evaluator<T>().Eval(binary->rhs_)
+#define LL  Evaluator<long>().Eval(binary->lhs_)
+#define LR  Evaluator<long>().Eval(binary->rhs_)
+
+  if (binary->Type()->ToPointer()) {
+    auto val = Evaluator<Addr>().Eval(binary);
+    if (val.label_.size()) {
+      Error(binary, "expect constant integer expression");
+    }
+    val_ = static_cast<T>(val.offset_);
+    return;
+  }
+
+  switch (binary->op_) {
+  case '+': val_ = L + R; break;
+  case '-': val_ = L - R; break;
+  case '*': val_ = L * R; break;
+  case '/': {
+    auto l = L, r = R;
+    if (r == 0)
+      Error(binary, "division by zero");
+    val_ = l / r;
+  } break;
+  case '%': {
+    auto l = LL, r = LR;
+    if (r == 0)
+      Error(binary, "division by zero");
+    val_ = l % r;
+  } break;
+  // Bitwise operators that do not accept float
+  case '|': val_ = LL | LR; break;
+  case '&': val_ = LL & LR; break;
+  case '^': val_ = LL ^ LR; break;
+  case Token::LEFT: val_ = LL << LR; break;
+  case Token::RIGHT: val_ = LL >> LR; break;
+
+  case '<': val_ = L < R; break;
+  case '>': val_ = L > R; break;
+  case Token::LOGICAL_AND: val_ = L && R; break;
+  case Token::LOGICAL_OR: val_ = L || R; break;
+  case Token::EQ: val_ = L == R; break;
+  case Token::NE: val_ = L != R; break;
+  case Token::LE: val_ = L <= R; break;
+  case Token::GE: val_ = L >= R; break;
+  case '=': case ',': val_ = R; break;
+  case '.': {
+    auto addr = Evaluator<Addr>().Eval(binary);
+    if (addr.label_.size())
+      Error(binary, "expect constant expression");
+    val_ = addr.offset_;
+  } break;
+  default: assert(false);
+  }
+
+#undef L
+#undef R
+#undef LL
+#undef LR
+}
+
+
+template<typename T>
+void Evaluator<T>::VisitUnaryOp(UnaryOp* unary) {
+#define VAL   Evaluator<T>().Eval(unary->operand_)
+#define LVAL  Evaluator<long>().Eval(unary->operand_)
+
+  switch (unary->op_) {
+  case Token::PLUS: val_ = VAL; break;
+  case Token::MINUS: val_ = -VAL; break;
+  case '~': val_ = ~LVAL; break;
+  case '!': val_ = !VAL; break;
+  case Token::CAST:
+    if (unary->Type()->IsInteger())
+      val_ = static_cast<long>(VAL);
+    else
+      val_ = VAL;
+    break;
+  case Token::ADDR: {
+    auto addr = Evaluator<Addr>().Eval(unary->operand_);
+    if (addr.label_.size())
+      Error(unary, "expect constant expression");
+    val_ = addr.offset_;
+  } break;
+  default: Error(unary, "expect constant expression");
+  }
+
+#undef LVAL
+#undef VAL
+}
+
+
+template<typename T>
+void Evaluator<T>::VisitConditionalOp(ConditionalOp* condOp) {
+  bool cond;
+  auto condType = condOp->cond_->Type();
+  if (condType->IsInteger()) {
+    auto val = Evaluator<long>().Eval(condOp->cond_);
+    cond = val != 0;
+  } else if (condType->IsFloat()) {
+    auto val = Evaluator<double>().Eval(condOp->cond_);
+    cond = val != 0.0;
+  } else if (condType->ToPointer()) {
+    auto val = Evaluator<Addr>().Eval(condOp->cond_);
+    cond = val.label_.size() || val.offset_;
+  } else {
+    assert(false);
+  }
+
+  if (cond) {
+    val_ = Evaluator<T>().Eval(condOp->exprTrue_);
+  } else {
+    val_ = Evaluator<T>().Eval(condOp->exprFalse_);
+  }
+}
+
+
+void Evaluator<Addr>::VisitBinaryOp(BinaryOp* binary) {
+#define LR  Evaluator<long>().Eval(binary->rhs_)
+#define R   Evaluator<Addr>().Eval(binary->rhs_)
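+
+  // Illustration (a sketch, not from the original source): for
+  //   static int arr[4];
+  //   static int* p = arr + 3;
+  // the lhs evaluates to Addr{label "arr", offset 0} and LR to 3, so the
+  // '+' case below computes Addr{label "arr", offset 3 * 4}, which the
+  // generator later emits as "arr+12".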
+  auto l = Evaluator<Addr>().Eval(binary->lhs_);
+
+  int width = 1;
+  auto pointerType = binary->Type()->ToPointer();
+  if (pointerType)
+    width = pointerType->Derived()->Width();
+
+  switch (binary->op_) {
+  case '+':
+    assert(pointerType);
+    addr_.label_ = l.label_;
+    addr_.offset_ = l.offset_ + LR * width;
+    break;
+  case '-':
+    assert(pointerType);
+    addr_.label_ = l.label_;
+    addr_.offset_ = l.offset_ - LR * width;
+    break;
+  case '.': {
+    addr_.label_ = l.label_;
+    auto type = binary->lhs_->Type()->ToStruct();
+    auto offset = type->GetMember(binary->rhs_->tok_->str_)->Offset();
+    addr_.offset_ = l.offset_ + offset;
+    break;
+  }
+  default: assert(false);
+  }
+#undef LR
+#undef R
+}
+
+
+void Evaluator<Addr>::VisitUnaryOp(UnaryOp* unary) {
+  auto addr = Evaluator<Addr>().Eval(unary->operand_);
+
+  switch (unary->op_) {
+  case Token::CAST:
+  case Token::ADDR:
+  case Token::DEREF:
+    addr_ = addr; break;
+  default: assert(false);
+  }
+}
+
+
+void Evaluator<Addr>::VisitConditionalOp(ConditionalOp* condOp) {
+  bool cond;
+  auto condType = condOp->cond_->Type();
+  if (condType->IsInteger()) {
+    auto val = Evaluator<long>().Eval(condOp->cond_);
+    cond = val != 0;
+  } else if (condType->IsFloat()) {
+    auto val = Evaluator<double>().Eval(condOp->cond_);
+    cond = val != 0.0;
+  } else if (condType->ToPointer()) {
+    auto val = Evaluator<Addr>().Eval(condOp->cond_);
+    cond = val.label_.size() || val.offset_;
+  } else {
+    assert(false);
+  }
+
+  if (cond) {
+    addr_ = Evaluator<Addr>().Eval(condOp->exprTrue_);
+  } else {
+    addr_ = Evaluator<Addr>().Eval(condOp->exprFalse_);
+  }
+}
+
+
+void Evaluator<Addr>::VisitConstant(Constant* cons) {
+  if (cons->Type()->IsInteger()) {
+    addr_ = {"", static_cast<int>(cons->IVal())};
+  } else if (cons->Type()->ToArray()) {
+    Generator().ConsLabel(cons); // Add the literal to rodatas_.
+    addr_.label_ = Generator::rodatas_.back().label_;
+    addr_.offset_ = 0;
+  } else {
+    assert(false);
+  }
+}
diff --git a/lib/lang/wgtcc/main.cc b/lib/lang/wgtcc/main.cc
new file mode 100644
index 000000000..72e2000ef
--- /dev/null
+++ b/lib/lang/wgtcc/main.cc
@@ -0,0 +1,253 @@
+#include "triton/lang/wgtcc/code_gen.h"
+#include "triton/lang/wgtcc/cpp.h"
+#include "triton/lang/wgtcc/error.h"
+#include "triton/lang/wgtcc/parser.h"
+#include "triton/lang/wgtcc/scanner.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <list>
+#include <string>
+
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+
+std::string program;
+std::string filename_in;
+std::string filename_out;
+bool debug = false;
+static bool only_preprocess = false;
+static bool only_compile = false;
+static bool specified_out_name = false;
+static std::list<std::string> filenames_in;
+static std::list<std::string> gcc_filenames_in;
+static std::list<std::string> gcc_args;
+static std::list<std::string> defines;
+static std::list<std::string> include_paths;
+
+
+static void Usage() {
+  printf("Usage: wgtcc [options] file...\n"
+         "Options: \n"
+         "  -h    Display this information\n"
+         "  -D    Define object like macro\n"
+         "  -I    Add search path\n"
+         "  -E    Preprocess only; do not compile, assemble or link\n"
+         "  -S    Compile only; do not assemble or link\n"
+         "  -o    specify output file\n");
+
+  exit(0);
+}
+
+
+static std::string GetExtension(const std::string& filename) {
+  return filename.substr(filename.size() >= 2 ?
filename.size() - 2 : 0); +} + + +static void ValidateFileName(const std::string& filename) { + auto ext = GetExtension(filename); + if (ext != ".c" && ext != ".s" && ext != ".o" && ext != ".a") + Error("bad file name format:'%s'", filename.c_str()); +} + + +static void DefineMacro(Preprocessor& cpp, const std::string& def) { + auto pos = def.find('='); + std::string macro; + std::string* replace; + if (pos == std::string::npos) { + macro = def; + replace = new std::string(); + } else { + macro = def.substr(0, pos); + replace = new std::string(def.substr(pos + 1)); + } + cpp.AddMacro(macro, replace); +} + + +static std::string GetName(const std::string& path) { + auto pos = path.rfind('/'); + if (pos == std::string::npos) + return path; + return path.substr(pos + 1); +} + +static int RunWgtcc() { + if (GetExtension(filename_in) != ".c") + return -3; + + Preprocessor cpp(&filename_in); + for (auto& def: defines) + DefineMacro(cpp, def); + for (auto& path: include_paths) + cpp.AddSearchPath(path); + + FILE* fp = stdout; + if (specified_out_name) { + fp = fopen(filename_out.c_str(), "w"); + } + TokenSequence ts; + cpp.Process(ts); + if (only_preprocess) { + ts.Print(fp); + return 0; + } + + if (!only_compile || !specified_out_name) { + filename_out = GetName(filename_in); + filename_out.back() = 's'; + } + fp = fopen(filename_out.c_str(), "w"); + + Parser parser(ts); + parser.Parse(); + Generator::SetInOut(&parser, fp); + Generator().Gen(); + fclose(fp); + return 0; +} + + +static int RunGcc() { + // Froce C11 + bool spec_std = false; + for (auto& arg: gcc_args) { + if (arg.substr(0, 4) == "-std") { + arg = "-std=c11"; + spec_std = true; + } + } + if (!spec_std) { + gcc_args.push_front("-std=c11"); + } + + std::string systemArg = "gcc"; + for (const auto& arg: gcc_args) { + systemArg += " " + arg; + } + auto ret = system(systemArg.c_str()); + return ret; +} + + +static void ParseInclude(int argc, char* argv[], int& i) { + if (argv[i][2]) { + include_paths.push_front(&argv[i][2]); + return; + } + + if (i == argc - 1) { + Error("missing argument to '%s'", argv[i]); + } + include_paths.push_front(argv[++i]); + gcc_args.push_back(argv[i]); +} + + +static void ParseDefine(int argc, char* argv[], int& i) { + if (argv[i][2]) { + defines.push_back(&argv[i][2]); + return; + } + + if (i == argc - 1) + Error("missing argument to '%s'", argv[i]); + defines.push_back(argv[++i]); + gcc_args.push_back(argv[i]); +} + + +static void ParseOut(int argc, char* argv[], int& i) { + if (i == argc - 1) + Error("missing argument to '%s'", argv[i]); + filename_out = argv[++i]; + gcc_args.push_back(argv[i]); +} + + +/* Use: + * wgtcc: compile + * gcc: assemble and link + * Allowing multi file may not be a good idea... 
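+ *
+ * A typical run (a sketch of the flow implemented below): given foo.c and
+ * bar.c, main() forks one child per input file, each child runs RunWgtcc()
+ * to compile its file to foo.s / bar.s, and the parent finally invokes
+ * RunGcc(), i.e. roughly "gcc -std=c11 foo.s bar.s", to assemble and link.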
+ */
+int main(int argc, char* argv[]) {
+  if (argc < 2)
+    Usage();
+
+  program = std::string(argv[0]);
+  for (auto i = 1; i < argc; ++i) {
+    if (argv[i][0] != '-') {
+      filename_in = std::string(argv[i]);
+      ValidateFileName(filename_in);
+      filenames_in.push_back(filename_in);
+      continue;
+    }
+
+    gcc_args.push_back(argv[i]);
+    switch (argv[i][1]) {
+    case 'h': Usage(); break;
+    case 'E': only_preprocess = true; break;
+    case 'S': only_compile = true; break;
+    case 'I': ParseInclude(argc, argv, i); break;
+    case 'D': ParseDefine(argc, argv, i); break;
+    case 'o':
+      specified_out_name = true;
+      ParseOut(argc, argv, i); break;
+    case 'g': gcc_args.pop_back(); debug = true; break;
+    default:;
+    }
+  }
+
+#ifdef DEBUG
+  RunWgtcc();
+#else
+  for (const auto& filename: filenames_in) {
+    filename_in = filename;
+    pid_t pid = fork();
+    if (pid < 0) {
+      Error("fork error");
+    } else if (pid == 0) {
+      // Do work in child process
+      return RunWgtcc();
+    }
+  }
+
+  for (size_t i = 0; i < filenames_in.size(); ++i) {
+    int stat;
+    wait(&stat);
+    // A child process terminated normally if:
+    // 1. it exited via `exit()`, that is, WIFEXITED(stat) is true, and
+    // 2. its status code is 0, that is, WEXITSTATUS(stat) == 0.
+    if (!WIFEXITED(stat) || WEXITSTATUS(stat))
+      return 0;
+  }
+#endif
+
+  if (only_preprocess || only_compile) {
+    if (specified_out_name && filenames_in.size() > 1)
+      Error("cannot specify an output filename with multiple input files");
+    return 0;
+  }
+
+  std::list<std::string> filenames_out;
+  for (auto& filename: filenames_in) {
+    if (GetExtension(filename) == ".c") {
+      gcc_args.push_back(GetName(filename));
+      gcc_args.back().back() = 's';
+    } else {
+      gcc_args.clear();
+      for (int i = 1; i < argc; ++i)
+        gcc_args.push_back(argv[i]);
+      break;
+    }
+  }
+  auto ret = RunGcc();
+  remove(filename_out.c_str());
+  return ret;
+}
diff --git a/lib/lang/wgtcc/parser.cc b/lib/lang/wgtcc/parser.cc
new file mode 100644
index 000000000..8ec16ee51
--- /dev/null
+++ b/lib/lang/wgtcc/parser.cc
@@ -0,0 +1,2688 @@
+#include "triton/lang/wgtcc/parser.h"
+
+#include "triton/lang/wgtcc/cpp.h"
+#include "triton/lang/wgtcc/encoding.h"
+#include "triton/lang/wgtcc/error.h"
+#include "triton/lang/wgtcc/evaluator.h"
+#include "triton/lang/wgtcc/scope.h"
+#include "triton/lang/wgtcc/type.h"
+
+#include <climits>
+#include <iostream>
+#include <set>
+#include <string>
+
+
+FuncType* Parser::vaStartType_ {nullptr};
+FuncType* Parser::vaArgType_ {nullptr};
+
+
+FuncDef* Parser::EnterFunc(Identifier* ident) {
+  curFunc_ = FuncDef::New(ident, LabelStmt::New());
+  return curFunc_;
+}
+
+
+void Parser::ExitFunc() {
+  // Resolve the jumps that are still pending; if a jump cannot be
+  // resolved -- i.e. its label is undefined -- report an error.
+  for (auto iter = unresolvedJumps_.begin();
+       iter != unresolvedJumps_.end(); ++iter) {
+    auto label = iter->first;
+    auto labelStmt = FindLabel(label->str_);
+    if (labelStmt == nullptr) {
+      Error(label, "label '%s' used but not defined",
+            label->str_.c_str());
+    }
+
+    iter->second->SetLabel(labelStmt);
+  }
+
+  unresolvedJumps_.clear();  // Clear the pending jumps
+  curLabels_.clear();        // Clear the label map
+
+  curFunc_ = nullptr;
+}
+
+
+void Parser::EnterBlock(FuncType* funcType) {
+  curScope_ = new Scope(curScope_, S_BLOCK);
+  if (funcType) {
+    // Merge elements in param scope into current block scope
+    for (auto param: funcType->Params())
+      curScope_->Insert(param);
+  }
+}
+
+
+void Parser::Parse() {
+  DefineBuiltins();
+  ParseTranslationUnit();
+}
+
+
+void Parser::ParseTranslationUnit() {
+  while (!ts_.Peek()->IsEOF()) {
+    if (ts_.Try(Token::STATIC_ASSERT)) {
+      ParseStaticAssert();
+      continue;
+    } else if (ts_.Try(';')) {
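+      // A stray ';' at file scope (e.g. "int x;;") is accepted and skipped.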
+ continue; + } + + int storageSpec, funcSpec, align; + auto declType = ParseDeclSpec(&storageSpec, &funcSpec, &align); + auto tokTypePair = ParseDeclarator(declType); + auto tok = tokTypePair.first; + auto type = tokTypePair.second; + + if (tok == nullptr) { + ts_.Expect(';'); + continue; + } + + auto ident = ProcessDeclarator(tok, type, storageSpec, funcSpec, align); + type = ident->Type(); + + if (tok && type->ToFunc() && ts_.Try('{')) { // Function definition + unit_->Add(ParseFuncDef(ident)); + } else { // Declaration + auto decl = ParseInitDeclarator(ident); + if (decl) unit_->Add(decl); + + while (ts_.Try(',')) { + auto ident = ParseDirectDeclarator(declType, storageSpec, + funcSpec, align); + decl = ParseInitDeclarator(ident); + if (decl) unit_->Add(decl); + } + // GNU extension: function/type/variable attributes + TryAttributeSpecList(); + ts_.Expect(';'); + } + } +} + + +FuncDef* Parser::ParseFuncDef(Identifier* ident) { + auto funcDef = EnterFunc(ident); + + if (funcDef->FuncType()->Complete()) { + Error(ident, "redefinition of '%s'", funcDef->Name().c_str()); + } + + // TODO(wgtdkp): param checking + auto funcType = ident->Type()->ToFunc(); + funcType->SetComplete(true); + for (auto param: funcType->Params()) { + if (param->Anonymous()) + Error(param, "param name omitted"); + } + funcDef->SetBody(ParseCompoundStmt(funcType)); + ExitFunc(); + + return funcDef; +} + + +Expr* Parser::ParseExpr() { + return ParseCommaExpr(); +} + + +Expr* Parser::ParseCommaExpr() { + auto lhs = ParseAssignExpr(); + auto tok = ts_.Peek(); + while (ts_.Try(',')) { + auto rhs = ParseAssignExpr(); + lhs = BinaryOp::New(tok, lhs, rhs); + + tok = ts_.Peek(); + } + return lhs; +} + + +Expr* Parser::ParsePrimaryExpr() { + if (ts_.Empty()) { + Error(ts_.Peek(), "premature end of input"); + } + + auto tok = ts_.Next(); + if (tok->tag_ == '(') { + auto expr = ParseExpr(); + ts_.Expect(')'); + return expr; + } + + if (tok->IsIdentifier()) { + auto ident = curScope_->Find(tok); + if (ident) return ident; + if (IsBuiltin(tok->str_)) return GetBuiltin(tok); + Error(tok, "undefined symbol '%s'", tok->str_.c_str()); + } else if (tok->IsConstant()) { + return ParseConstant(tok); + } else if (tok->IsLiteral()) { + return ConcatLiterals(tok); + } else if (tok->tag_ == Token::GENERIC) { + return ParseGeneric(); + } + + Error(tok, "'%s' unexpected", tok->str_.c_str()); + return nullptr; // Make compiler happy +} + + +static void ConvertLiteral(std::string& val, Encoding enc) { + switch (enc) { + case Encoding::NONE: + case Encoding::UTF8: break; + case Encoding::CHAR16: ConvertToUTF16(val); break; + case Encoding::CHAR32: + case Encoding::WCHAR: ConvertToUTF32(val); break; + } +} + + +Constant* Parser::ConcatLiterals(const Token* tok) { + auto val = new std::string; + auto enc = Scanner(tok).ScanLiteral(*val); + ConvertLiteral(*val, enc); + while (ts_.Test(Token::LITERAL)) { + auto nextTok = ts_.Next(); + std::string nextVal; + auto nextEnc = Scanner(nextTok).ScanLiteral(nextVal); + ConvertLiteral(nextVal, nextEnc); + if (enc == Encoding::NONE) { + ConvertLiteral(*val, nextEnc); + enc = nextEnc; + } + if (nextEnc != Encoding::NONE && nextEnc != enc) + Error(nextTok, "cannot concat lietrals with different encodings"); + *val += nextVal; + } + + int tag = T_CHAR; + switch (enc) { + case Encoding::NONE: + case Encoding::UTF8: + tag = T_CHAR; val->append(1, '\0'); break; + case Encoding::CHAR16: + tag = T_UNSIGNED | T_SHORT; val->append(2, '\0'); break; + case Encoding::CHAR32: + case Encoding::WCHAR: + tag = T_UNSIGNED | 
T_INT; val->append(4, '\0'); break; + } + + return Constant::New(tok, tag, val); +} + + +Encoding Parser::ParseLiteral(std::string& str, const Token* tok) { + return Scanner(tok).ScanLiteral(str); +} + + +Constant* Parser::ParseConstant(const Token* tok) { + assert(tok->IsConstant()); + + if (tok->tag_ == Token::I_CONSTANT) { + return ParseInteger(tok); + } else if (tok->tag_ == Token::C_CONSTANT) { + return ParseCharacter(tok); + } else { + return ParseFloat(tok); + } +} + + +Constant* Parser::ParseFloat(const Token* tok) { + const auto& str = tok->str_; + size_t end = 0; + double val = 0.0; + try { + val = stod(str, &end); + } catch (const std::out_of_range& oor) { + Error(tok, "float out of range"); + } + + int tag = T_DOUBLE; + if (str[end] == 'f' || str[end] == 'F') { + tag = T_FLOAT; + ++end; + } else if (str[end] == 'l' || str[end] == 'L') { + tag = T_LONG | T_DOUBLE; + ++end; + } + if (str[end] != 0) + Error(tok, "invalid suffix"); + + return Constant::New(tok, tag, val); +} + + +Constant* Parser::ParseCharacter(const Token* tok) { + int val; + auto enc = Scanner(tok).ScanCharacter(val); + + int tag; + switch (enc) { + case Encoding::NONE: + val = (char)val; + tag = T_INT; break; + case Encoding::CHAR16: + val = (char16_t)val; + tag = T_UNSIGNED | T_SHORT; break; + case Encoding::WCHAR: + case Encoding::CHAR32: tag = T_UNSIGNED | T_INT; break; + default: assert(false); + } + return Constant::New(tok, tag, static_cast(val)); +} + + +Constant* Parser::ParseInteger(const Token* tok) { + const auto& str = tok->str_; + size_t end = 0; + long val = 0; + try { + val = stoull(str, &end, 0); + } catch (const std::out_of_range& oor) { + Error(tok, "integer out of range"); + } + + int tag = 0; + for (; str[end]; ++end) { + if (str[end] == 'u' || str[end] == 'U') { + if (tag & T_UNSIGNED) + Error(tok, "invalid suffix"); + tag |= T_UNSIGNED; + } else { + if ((tag & T_LONG) || (tag & T_LLONG)) + Error(tok, "invalid suffix"); + if (str[end + 1] == 'l' || str[end + 1] =='L') { + tag |= T_LLONG; + ++end; + } else { + tag |= T_LONG; + } + } + } + + bool decimal = ('1' <= str[0] && str[0] <= '9'); + if (decimal) { + switch (tag) { + case 0: + tag |= !(val & ~(long)INT_MAX) ? T_INT: T_LONG; break; + case T_UNSIGNED: + tag |= !(val & ~(long)UINT_MAX) ? T_INT: T_LONG; break; + case T_LONG: break; + case T_UNSIGNED | T_LONG: break; + } + } else { + switch (tag) { + case 0: + tag |= !(val & ~(long)INT_MAX) ? T_INT + : !(val & ~(long)UINT_MAX) ? T_UNSIGNED + : !(val & ~(long)LONG_MAX) ? T_LONG + : T_UNSIGNED | T_LONG; break; + case T_UNSIGNED: + tag |= !(val & ~(long)UINT_MAX) ? T_INT: T_LONG; break; + case T_LONG: + tag |= !(val & ~(long)LONG_MAX) ? 
0: T_UNSIGNED; break; + case T_UNSIGNED | T_LONG: + break; + } + } + + return Constant::New(tok, tag, val); +} + + +Expr* Parser::ParseGeneric() { + ts_.Expect('('); + auto controlExpr = ParseAssignExpr(); + ts_.Expect(','); + Expr* selectedExpr = nullptr; + bool isDefault = false; + while (true) { + if (ts_.Try(Token::DEFAULT)) { + ts_.Expect(':'); + auto defaultExpr = ParseAssignExpr(); + if (!selectedExpr) { + selectedExpr = defaultExpr; + isDefault = true; + } + } else { + auto tok = ts_.Peek(); + auto type = ParseTypeName(); + ts_.Expect(':'); + auto expr = ParseAssignExpr(); + if (type->Compatible(*controlExpr->Type())) { + if (selectedExpr && !isDefault) { + Error(tok, "more than one generic association" + " are compatible with control expression"); + } + selectedExpr = expr; + isDefault = false; + } + } + if (!ts_.Try(',')) { + ts_.Expect(')'); + break; + } + } + + if (!selectedExpr) + Error(ts_.Peek(), "no compatible generic association"); + return selectedExpr; +} + + +QualType Parser::TryCompoundLiteral() { + auto mark = ts_.Mark(); + if (ts_.Try('(') && IsTypeName(ts_.Peek())) { + auto type = ParseTypeName(); + if (ts_.Try(')') && ts_.Test('{')) + return type; + } + ts_.ResetTo(mark); + return nullptr; +} + + +Expr* Parser::ParsePostfixExpr() { + if (ts_.Peek()->IsEOF()) { + Error(ts_.Peek(), "premature end of input"); + } + + auto type = TryCompoundLiteral(); + if (type) { + auto anony = ParseCompoundLiteral(type); + return ParsePostfixExprTail(anony); + } + + auto primExpr = ParsePrimaryExpr(); + return ParsePostfixExprTail(primExpr); +} + + +Object* Parser::ParseCompoundLiteral(QualType type) { + auto linkage = curScope_->Type() == S_FILE ? L_INTERNAL: L_NONE; + auto anony = Object::NewAnony(ts_.Peek(), type, 0, linkage); + auto decl = ParseInitDeclaratorSub(anony); + + // Just for generator to find the compound literal + if (curScope_->Type() == S_FILE) { + unit_->Add(decl); + } else { + curScope_->Insert(anony->Repr(), anony); + } + return anony; +} + + +// Return the constructed postfix expression +Expr* Parser::ParsePostfixExprTail(Expr* lhs) { + while (true) { + auto tok = ts_.Next(); + + switch (tok->tag_) { + case '[': lhs = ParseSubScripting(lhs); break; + case '(': lhs = ParseFuncCall(lhs); break; + case Token::PTR: lhs = UnaryOp::New(Token::DEREF, lhs); + // Fall through + case '.': lhs = ParseMemberRef(tok, '.', lhs); break; + case Token::INC: + case Token::DEC: lhs = ParsePostfixIncDec(tok, lhs); break; + default: ts_.PutBack(); return lhs; + } + } +} + + +Expr* Parser::ParseSubScripting(Expr* lhs) { + auto rhs = ParseExpr(); + auto tok = ts_.Peek(); + ts_.Expect(']'); + auto operand = BinaryOp::New(tok, '+', lhs, rhs); + return UnaryOp::New(Token::DEREF, operand); +} + + +BinaryOp* Parser::ParseMemberRef(const Token* tok, int op, Expr* lhs) { + auto memberName = ts_.Peek()->str_; + ts_.Expect(Token::IDENTIFIER); + + auto structUnionType = lhs->Type()->ToStruct(); + if (structUnionType == nullptr) { + Error(tok, "an struct/union expected"); + } + + auto rhs = structUnionType->GetMember(memberName); + if (rhs == nullptr) { + Error(tok, "'%s' is not a member of '%s'", + memberName.c_str(), "[obj]"); + } + + return BinaryOp::New(tok, op, lhs, rhs); +} + + +UnaryOp* Parser::ParsePostfixIncDec(const Token* tok, Expr* operand) { + auto op = tok->tag_ == Token::INC ? 
+ Token::POSTFIX_INC: Token::POSTFIX_DEC; + return UnaryOp::New(op, operand); +} + + +FuncCall* Parser::ParseFuncCall(Expr* designator) { + FuncCall::ArgList args; + while (!ts_.Try(')')) { + args.push_back(Expr::MayCast(ParseAssignExpr())); + if (!ts_.Test(')')) + ts_.Expect(','); + } + + return FuncCall::New(designator, args); +} + + +Expr* Parser::ParseUnaryExpr() { + auto tok = ts_.Next(); + switch (tok->tag_) { + case Token::ALIGNOF: return ParseAlignof(); + case Token::SIZEOF: return ParseSizeof(); + case Token::INC: return ParsePrefixIncDec(tok); + case Token::DEC: return ParsePrefixIncDec(tok); + case '&': return ParseUnaryOp(tok, Token::ADDR); + case '*': return ParseUnaryOp(tok, Token::DEREF); + case '+': return ParseUnaryOp(tok, Token::PLUS); + case '-': return ParseUnaryOp(tok, Token::MINUS); + case '~': return ParseUnaryOp(tok, '~'); + case '!': return ParseUnaryOp(tok, '!'); + default: + ts_.PutBack(); + return ParsePostfixExpr(); + } +} + + +Constant* Parser::ParseSizeof() { + QualType type(nullptr); + auto tok = ts_.Next(); + if (tok->tag_ == '(' && IsTypeName(ts_.Peek())) { + type = ParseTypeName(); + ts_.Expect(')'); + } else { + ts_.PutBack(); + auto expr = ParseUnaryExpr(); + type = expr->Type(); + } + + if (type->ToFunc() || type->ToVoid()) { + } else if (!type->Complete()) { + Error(tok, "sizeof(incomplete type)"); + } + long val = type->Width(); + return Constant::New(tok, T_UNSIGNED | T_LONG, val); +} + + +Constant* Parser::ParseAlignof() { + ts_.Expect('('); + auto tok = ts_.Peek(); + auto type = ParseTypeName(); + ts_.Expect(')'); + + long val = type->Align(); + return Constant::New(tok, T_UNSIGNED| T_LONG, val); +} + + +UnaryOp* Parser::ParsePrefixIncDec(const Token* tok) { + assert(tok->tag_ == Token::INC || tok->tag_ == Token::DEC); + + auto op = tok->tag_ == Token::INC ? 
+ Token::PREFIX_INC: Token::PREFIX_DEC; + auto operand = ParseUnaryExpr(); + return UnaryOp::New(op, operand); +} + + +UnaryOp* Parser::ParseUnaryOp(const Token* tok, int op) { + auto operand = ParseCastExpr(); + return UnaryOp::New(op, operand); +} + + +QualType Parser::ParseTypeName() { + auto type = ParseSpecQual(); + if (ts_.Test('*') || ts_.Test('(') || ts_.Test('[')) // abstract-declarator FIRST set + return ParseAbstractDeclarator(type); + return type; +} + + +Expr* Parser::ParseCastExpr() { + auto tok = ts_.Next(); + if (tok->tag_ == '(' && IsTypeName(ts_.Peek())) { + auto type = ParseTypeName(); + ts_.Expect(')'); + if (ts_.Test('{')) { + auto anony = ParseCompoundLiteral(type); + return ParsePostfixExprTail(anony); + } + auto operand = ParseCastExpr(); + return UnaryOp::New(Token::CAST, operand, type); + } + + ts_.PutBack(); + return ParseUnaryExpr(); +} + +Expr* Parser::ParseRangeExpr() { + auto lhs = ParseCastExpr(); + auto tok = ts_.Next(); + while (tok->tag_ == Token::ELLIPSIS){ + auto rhs = ParseCastExpr(); + lhs = BinaryOp::New(tok, lhs, rhs); + tok = ts_.Next(); + } + ts_.PutBack(); + return lhs; +} + +Expr* Parser::ParseMultiplicativeExpr() { + auto lhs = ParseRangeExpr(); + auto tok = ts_.Next(); + while (tok->tag_ == '*' || tok->tag_ == '/' || tok->tag_ == '%') { + auto rhs = ParseRangeExpr(); + lhs = BinaryOp::New(tok, lhs, rhs); + + tok = ts_.Next(); + } + + ts_.PutBack(); + return lhs; +} + + +Expr* Parser::ParseAdditiveExpr() { + auto lhs = ParseMultiplicativeExpr(); + auto tok = ts_.Next(); + while (tok->tag_ == '+' || tok->tag_ == '-') { + auto rhs = ParseMultiplicativeExpr(); + lhs = BinaryOp::New(tok, lhs, rhs); + + tok = ts_.Next(); + } + + ts_.PutBack(); + return lhs; +} + + +Expr* Parser::ParseShiftExpr() { + auto lhs = ParseAdditiveExpr(); + auto tok = ts_.Next(); + while (tok->tag_ == Token::LEFT || tok->tag_ == Token::RIGHT) { + auto rhs = ParseAdditiveExpr(); + lhs = BinaryOp::New(tok, lhs, rhs); + + tok = ts_.Next(); + } + + ts_.PutBack(); + return lhs; +} + + +Expr* Parser::ParseRelationalExpr() { + auto lhs = ParseShiftExpr(); + auto tok = ts_.Next(); + while (tok->tag_ == Token::LE || tok->tag_ == Token::GE + || tok->tag_ == '<' || tok->tag_ == '>') { + auto rhs = ParseShiftExpr(); + lhs = BinaryOp::New(tok, lhs, rhs); + + tok = ts_.Next(); + } + + ts_.PutBack(); + return lhs; +} + + +Expr* Parser::ParseEqualityExpr() { + auto lhs = ParseRelationalExpr(); + auto tok = ts_.Next(); + while (tok->tag_ == Token::EQ || tok->tag_ == Token::NE) { + auto rhs = ParseRelationalExpr(); + lhs = BinaryOp::New(tok, lhs, rhs); + + tok = ts_.Next(); + } + + ts_.PutBack(); + return lhs; +} + + +Expr* Parser::ParseBitiwiseAndExpr() { + auto lhs = ParseEqualityExpr(); + auto tok = ts_.Peek(); + while (ts_.Try('&')) { + auto rhs = ParseEqualityExpr(); + lhs = BinaryOp::New(tok, lhs, rhs); + + tok = ts_.Peek(); + } + + return lhs; +} + + +Expr* Parser::ParseBitwiseXorExpr() { + auto lhs = ParseBitiwiseAndExpr(); + auto tok = ts_.Peek(); + while (ts_.Try('^')) { + auto rhs = ParseBitiwiseAndExpr(); + lhs = BinaryOp::New(tok, lhs, rhs); + + tok = ts_.Peek(); + } + + return lhs; +} + + +Expr* Parser::ParseBitwiseOrExpr() { + auto lhs = ParseBitwiseXorExpr(); + auto tok = ts_.Peek(); + while (ts_.Try('|')) { + auto rhs = ParseBitwiseXorExpr(); + lhs = BinaryOp::New(tok, lhs, rhs); + + tok = ts_.Peek(); + } + + return lhs; +} + + +Expr* Parser::ParseLogicalAndExpr() { + auto lhs = ParseBitwiseOrExpr(); + auto tok = ts_.Peek(); + while (ts_.Try(Token::LOGICAL_AND)) { + auto rhs 
= ParseBitwiseOrExpr(); + lhs = BinaryOp::New(tok, lhs, rhs); + + tok = ts_.Peek(); + } + + return lhs; +} + + +Expr* Parser::ParseLogicalOrExpr() { + auto lhs = ParseLogicalAndExpr(); + auto tok = ts_.Peek(); + while (ts_.Try(Token::LOGICAL_OR)) { + auto rhs = ParseLogicalAndExpr(); + lhs = BinaryOp::New(tok, lhs, rhs); + + tok = ts_.Peek(); + } + + return lhs; +} + + +Expr* Parser::ParseConditionalExpr() { + auto cond = ParseLogicalOrExpr(); + auto tok = ts_.Peek(); + if (ts_.Try('?')) { + // Non-standard GNU extension + // a ?: b equals a ? a: c + auto exprTrue = ts_.Test(':') ? cond: ParseExpr(); + ts_.Expect(':'); + auto exprFalse = ParseConditionalExpr(); + + return ConditionalOp::New(tok, cond, exprTrue, exprFalse); + } + + return cond; +} + + +Expr* Parser::ParseAssignExpr() { + // Yes, I know the lhs should be unary expression, + // let it handled by type checking + Expr* lhs = ParseConditionalExpr(); + Expr* rhs; + + auto tok = ts_.Next(); + switch (tok->tag_) { + case Token::MUL_ASSIGN: + rhs = ParseAssignExpr(); + rhs = BinaryOp::New(tok, '*', lhs, rhs); + break; + + case Token::DIV_ASSIGN: + rhs = ParseAssignExpr(); + rhs = BinaryOp::New(tok, '/', lhs, rhs); + break; + + case Token::MOD_ASSIGN: + rhs = ParseAssignExpr(); + rhs = BinaryOp::New(tok, '%', lhs, rhs); + break; + + case Token::ADD_ASSIGN: + rhs = ParseAssignExpr(); + rhs = BinaryOp::New(tok, '+', lhs, rhs); + break; + + case Token::SUB_ASSIGN: + rhs = ParseAssignExpr(); + rhs = BinaryOp::New(tok, '-', lhs, rhs); + break; + + case Token::LEFT_ASSIGN: + rhs = ParseAssignExpr(); + rhs = BinaryOp::New(tok, Token::LEFT, lhs, rhs); + break; + + case Token::RIGHT_ASSIGN: + rhs = ParseAssignExpr(); + rhs = BinaryOp::New(tok, Token::RIGHT, lhs, rhs); + break; + + case Token::AND_ASSIGN: + rhs = ParseAssignExpr(); + rhs = BinaryOp::New(tok, '&', lhs, rhs); + break; + + case Token::XOR_ASSIGN: + rhs = ParseAssignExpr(); + rhs = BinaryOp::New(tok, '^', lhs, rhs); + break; + + case Token::OR_ASSIGN: + rhs = ParseAssignExpr(); + rhs = BinaryOp::New(tok, '|', lhs, rhs); + break; + + case '=': + rhs = ParseAssignExpr(); + break; + + default: + ts_.PutBack(); + return lhs; // Could be constant + } + + return BinaryOp::New(tok, '=', lhs, rhs); +} + + +void Parser::ParseStaticAssert() { + ts_.Expect('('); + auto condExpr = ParseAssignExpr(); + ts_.Expect(','); + auto msg = ConcatLiterals(ts_.Expect(Token::LITERAL)); + ts_.Expect(')'); + ts_.Expect(';'); + if (!Evaluator().Eval(condExpr)) { + Error(ts_.Peek(), "static assertion failed: %s\n", + msg->SVal()->c_str()); + } +} + + +// Return: list of declarations +CompoundStmt* Parser::ParseDecl() { + StmtList stmts; + if (ts_.Try(Token::STATIC_ASSERT)) { + ParseStaticAssert(); + } else { + int storageSpec, funcSpec, align; + auto type = ParseDeclSpec(&storageSpec, &funcSpec, &align); + if (!ts_.Test(';')) { + do { + auto ident = ParseDirectDeclarator(type, storageSpec, funcSpec, align); + auto init = ParseInitDeclarator(ident); + if (init) stmts.push_back(init); + } while (ts_.Try(',')); + } + ts_.Expect(';'); + } + + return CompoundStmt::New(stmts); +} + + +// For state machine +enum { + // Compatibility for these key words + COMP_SIGNED = T_SHORT | T_INT | T_LONG | T_LLONG, + COMP_UNSIGNED = T_SHORT | T_INT | T_LONG | T_LLONG, + COMP_CHAR = T_SIGNED | T_UNSIGNED, + COMP_SHORT = T_SIGNED | T_UNSIGNED | T_INT, + COMP_INT = T_SIGNED | T_UNSIGNED | T_LONG | T_SHORT | T_LLONG, + COMP_LONG = T_SIGNED | T_UNSIGNED | T_LONG | T_INT, + COMP_DOUBLE = T_LONG | T_COMPLEX, + COMP_COMPLEX = T_FLOAT 
| T_DOUBLE | T_LONG, + + COMP_THREAD = S_EXTERN | S_STATIC, +}; + + +static inline void TypeLL(int& typeSpec) { + if (typeSpec & T_LONG) { + typeSpec &= ~T_LONG; + typeSpec |= T_LLONG; + } else { + typeSpec |= T_LONG; + } +} + + +QualType Parser::ParseSpecQual() { + return ParseDeclSpec(nullptr, nullptr, nullptr); +} + + +static void EnsureAndSetStorageSpec(const Token* tok, int* storage, int spec) { + if (!storage) + Error(tok, "unexpected storage specifier"); + if (*storage != 0) + Error(tok, "duplicated storage specifier"); + *storage |= spec; +} + + +/* + * param: storage: null, only type specifier and qualifier accepted; + */ +QualType Parser::ParseDeclSpec(int* storageSpec, int* funcSpec, int* alignSpec) { +#define ERR_FUNC_SPEC ("unexpected function specifier") +#define ERR_STOR_SPEC ("unexpected storage specifier") +#define ERR_DECL_SPEC ("two or more data types in declaration specifiers") + + QualType type(nullptr); + int qualSpec = 0; + int typeSpec = 0; + + if (storageSpec) *storageSpec = 0; + if (funcSpec) *funcSpec = 0; + if (alignSpec) *alignSpec = 0; + + const Token* tok; + for (; ;) { + tok = ts_.Next(); + switch (tok->tag_) { + // Function specifier + case Token::INLINE: + if (!funcSpec) + Error(tok, ERR_FUNC_SPEC); + *funcSpec |= F_INLINE; + break; + + case Token::NORETURN: + if (!funcSpec) + Error(tok, ERR_FUNC_SPEC); + *funcSpec |= F_NORETURN; + break; + + // Alignment specifier + case Token::ALIGNAS: { + if (!alignSpec) + Error(tok, "unexpected alignment specifier"); + auto align = ParseAlignas(); + if (align) + *alignSpec = align; + break; + } + // Storage specifier + // TODO(wgtdkp): typedef needs more constraints + case Token::TYPEDEF: + EnsureAndSetStorageSpec(tok, storageSpec, S_TYPEDEF); + break; + + case Token::EXTERN: + EnsureAndSetStorageSpec(tok, storageSpec, S_EXTERN); + break; + + case Token::STATIC: + if (!storageSpec) + Error(tok, ERR_FUNC_SPEC); + if (*storageSpec & ~S_THREAD) + Error(tok, "duplicated storage specifier"); + *storageSpec |= S_STATIC; + break; + + case Token::THREAD: + if (!storageSpec) + Error(tok, ERR_FUNC_SPEC); + if (*storageSpec & ~COMP_THREAD) + Error(tok, "duplicated storage specifier"); + *storageSpec |= S_THREAD; + break; + + case Token::AUTO: + EnsureAndSetStorageSpec(tok, storageSpec, S_AUTO); + break; + + case Token::REGISTER: + EnsureAndSetStorageSpec(tok, storageSpec, S_REGISTER); + break; + + // Type qualifier + case Token::CONST: qualSpec |= Qualifier::CONST; break; + case Token::RESTRICT: qualSpec |= Qualifier::RESTRICT; break; + case Token::VOLATILE: qualSpec |= Qualifier::VOLATILE; break; + + // Type specifier + case Token::SIGNED: + if (typeSpec & ~COMP_SIGNED) + Error(tok, ERR_DECL_SPEC); + typeSpec |= T_SIGNED; + break; + + case Token::UNSIGNED: + if (typeSpec & ~COMP_UNSIGNED) + Error(tok, ERR_DECL_SPEC); + typeSpec |= T_UNSIGNED; + break; + + case Token::VOID: + if (typeSpec & ~0) + Error(tok, ERR_DECL_SPEC); + typeSpec |= T_VOID; + break; + + case Token::CHAR: + if (typeSpec & ~COMP_CHAR) + Error(tok, ERR_DECL_SPEC); + typeSpec |= T_CHAR; + break; + + case Token::SHORT: + if (typeSpec & ~COMP_SHORT) + Error(tok, ERR_DECL_SPEC); + typeSpec |= T_SHORT; + break; + + case Token::INT: + if (typeSpec & ~COMP_INT) + Error(tok, ERR_DECL_SPEC); + typeSpec |= T_INT; + break; + + case Token::LONG: + if (typeSpec & ~COMP_LONG) + Error(tok, ERR_DECL_SPEC); + TypeLL(typeSpec); + break; + + case Token::HALF: + if(typeSpec & ~T_COMPLEX) + Error(tok, ERR_DECL_SPEC); + typeSpec |= T_HALF; + break; + + case Token::FLOAT: + if 
(typeSpec & ~T_COMPLEX) + Error(tok, ERR_DECL_SPEC); + typeSpec |= T_FLOAT; + break; + + case Token::DOUBLE: + if (typeSpec & ~COMP_DOUBLE) + Error(tok, ERR_DECL_SPEC); + typeSpec |= T_DOUBLE; + break; + + case Token::BOOL: + if (typeSpec != 0) + Error(tok, ERR_DECL_SPEC); + typeSpec |= T_BOOL; + break; + + case Token::COMPLEX: + if (typeSpec & ~COMP_COMPLEX) + Error(tok, ERR_DECL_SPEC); + typeSpec |= T_COMPLEX; + break; + + case Token::STRUCT: + case Token::UNION: + if (typeSpec & ~0) + Error(tok, ERR_DECL_SPEC); + type = ParseStructUnionSpec(Token::STRUCT == tok->tag_); + typeSpec |= T_STRUCT_UNION; + break; + + case Token::ENUM: + if (typeSpec != 0) + Error(tok, ERR_DECL_SPEC); + type = ParseEnumSpec(); + typeSpec |= T_ENUM; + break; + + case Token::ATOMIC: + Error(tok, "atomic not supported"); + break; + + default: + if (typeSpec == 0 && IsTypeName(tok)) { + auto ident = curScope_->Find(tok); + type = ident->Type(); + // We may change the length of a array type by initializer, + // thus, make a copy of this type. + auto arrType = type->ToArray(); + if (arrType && !type->Complete()) + type = ArrayType::New(arrType->Len(), arrType->Derived()); + typeSpec |= T_TYPEDEF_NAME; + } else { + goto end_of_loop; + } + } + } + +end_of_loop: + ts_.PutBack(); + switch (typeSpec) { + case 0: + Error(tok, "expect type specifier"); + break; + + case T_VOID: + type = VoidType::New(); + break; + + case T_STRUCT_UNION: + case T_ENUM: + case T_TYPEDEF_NAME: + break; + + default: + type = ArithmType::New(typeSpec); + break; + } + // GNU extension: type attributes + //if (storageSpec && (*storageSpec & S_TYPEDEF)) + // TryAttributeSpecList(); + + return QualType(type.GetPtr(), qualSpec | type.Qual()); + +#undef ERR_FUNC_SPEC +#undef ERR_STOR_SPEC +#undef ERR_DECL_SPEC +} + + +int Parser::ParseAlignas() { + int align; + ts_.Expect('('); + auto tok = ts_.Peek(); + if (IsTypeName(ts_.Peek())) { + auto type = ParseTypeName(); + ts_.Expect(')'); + align = type->Align(); + } else { + auto expr = ParseExpr(); + align = Evaluator().Eval(expr); + ts_.Expect(')'); + } + if (align < 0 || ((align - 1) & align)) + Error(tok, "requested alignment is not a positive power of 2"); + return align; +} + + +Type* Parser::ParseEnumSpec() { + // GNU extension: type attributes + TryAttributeSpecList(); + + std::string tagName; + auto tok = ts_.Peek(); + if (ts_.Try(Token::IDENTIFIER)) { + tagName = tok->str_; + if (ts_.Try('{')) { + // 定义enum类型 + auto tagIdent = curScope_->FindTagInCurScope(tok); + if (!tagIdent) { + auto type = ArithmType::New(T_INT); + auto ident = Identifier::New(tok, type, L_NONE); + curScope_->InsertTag(ident); + return ParseEnumerator(type); // 处理反大括号: '}' + } + + if (!tagIdent->Type()->IsInteger()) // struct/union tag + Error(tok, "redefinition of enumeration tag '%s'", tagName.c_str()); + return ParseEnumerator(tagIdent->Type()->ToArithm()); + } else { + auto tagIdent = curScope_->FindTag(tok); + if (tagIdent) { + return tagIdent->Type(); + } + auto type = ArithmType::New(T_INT); + auto ident = Identifier::New(tok, type, L_NONE); + curScope_->InsertTag(ident); + return type; + } + } + + ts_.Expect('{'); + auto type = ArithmType::New(T_INT); + return ParseEnumerator(type); // 处理反大括号: '}' +} + + +Type* Parser::ParseEnumerator(ArithmType* type) { + assert(type && type->IsInteger()); + int val = 0; + do { + auto tok = ts_.Expect(Token::IDENTIFIER); + // GNU extension: enumerator attributes + TryAttributeSpecList(); + + const auto& enumName = tok->str_; + auto ident = curScope_->FindInCurScope(tok); + if 
(ident) {
+      Error(tok, "redefinition of enumerator '%s'", enumName.c_str());
+    }
+    if (ts_.Try('=')) {
+      auto expr = ParseAssignExpr();
+      val = Evaluator<long>().Eval(expr);
+    }
+    auto enumer = Enumerator::New(tok, val);
+    ++val;
+    curScope_->Insert(enumer);
+    ts_.Try(',');
+  } while (!ts_.Try('}'));
+
+  type->SetComplete(true);
+  return type;
+}
+
+
+/*
+ * There are four kinds of name spaces:
+ * 1. labels, e.g. 'goto end;' -- they have function scope;
+ * 2. tags of struct/union/enum;
+ * 3. members of struct/union;
+ * 4. all other ordinary identifiers.
+ */
+Type* Parser::ParseStructUnionSpec(bool isStruct) {
+  // GNU extension: type attributes
+  TryAttributeSpecList();
+
+  std::string tagName;
+  auto tok = ts_.Peek();
+  if (ts_.Try(Token::IDENTIFIER)) {
+    tagName = tok->str_;
+    if (ts_.Try('{')) {
+      // The '{' shows that we are now defining this struct/union type.
+      // We do not care whether an enclosing scope already defined this tag;
+      // if it did, the definition here simply shadows it.
+      auto tagIdent = curScope_->FindTagInCurScope(tok);
+      if (!tagIdent) {
+        // This is the first time the name is seen in the current scope, so
+        // this is its first definition; there is not even a forward
+        // declaration.
+        auto type = StructType::New(isStruct, tagName.size(), curScope_);
+        auto ident = Identifier::New(tok, type, L_NONE);
+        curScope_->InsertTag(ident);
+        return ParseStructUnionDecl(type); // Consumes the closing '}'
+      }
+
+
+      // The type was found in the current scope, but it may be only a
+      // declaration. Note that a declaration and its definition can only
+      // take effect in the same scope:
+      // 1. If the declaration is in an outer scope, it stays incomplete even
+      //    if an inner scope defines the complete type, because the compiler
+      //    never looks into inner scopes for the definition.
+      // 2. If the declaration is in an inner scope (define first, then
+      //    declare inside a nested scope), the incomplete declaration shadows
+      //    the complete definition, because symbol lookup always walks
+      //    outward and takes whatever it finds first, complete or not.
+      if (!tagIdent->Type()->Complete()) {
+        // We found a forward declaration of this tag; fill in its member
+        // table and finally mark the type as complete.
+        return ParseStructUnionDecl(tagIdent->Type()->ToStruct());
+      } else {
+        // The current scope already holds a complete definition, and we are
+        // defining a type with the same name, so report an error.
+        Error(tok, "redefinition of struct tag '%s'", tagName.c_str());
+      }
+    } else {
+      // No '{', so this is not a struct/union definition; we can only be in:
+      // 1. a declaration; or
+      // 2. a declaration that also defines a pointer (pointers may point to
+      //    incomplete types, so 'struct Foo* p;' is legal) or some other
+      //    valid type.
+      // When we consult the symbol table now:
+      // 1. we may find a complete definition of the name, or only an
+      //    incomplete declaration; either way we must use the type that the
+      //    name denotes;
+      // 2. if the name is not in the symbol table at all, this is its first
+      //    declaration: create an incomplete type and insert it.
+      auto tagIdent = curScope_->FindTag(tok);
+
+      // If the tag is already defined or declared, return that type directly
+      if (tagIdent) {
+        return tagIdent->Type();
+      }
+      // The tag is neither defined nor declared yet, so create its
+      // declaration (without a '{' this cannot be a definition)
+      auto type = StructType::New(isStruct, true, curScope_);
+
+      // There is a tag, so this is not an anonymous struct/union;
+      // insert the tag into the current scope
+      auto ident = Identifier::New(tok, type, L_NONE);
+      curScope_->InsertTag(ident);
+      return type;
+    }
+  }
+  // No identifier was seen, so a struct/union definition must follow;
+  // this is an anonymous struct/union.
+  ts_.Expect('{');
+
+  // If there is a tag, it has no forward declaration; without a tag there
+  // certainly is none. Either way, this begins the first definition of a
+  // complete struct/union type.
+  auto type = StructType::New(isStruct, tagName.size(), curScope_);
+  return ParseStructUnionDecl(type); // Consumes the closing '}'
+}
+
+
+StructType* Parser::ParseStructUnionDecl(StructType* type) {
+#define ADD_MEMBER() {                          \
+  auto member = Object::New(tok, memberType);   \
+  if (align > 0)                                \
+    member->SetAlign(align);                    \
+  type->AddMember(member);                      \
+}
+
+  // Since this is a definition, the input type must be incomplete;
+  // otherwise it would be a redefinition.
+  assert(type && !type->Complete());
+
+  auto scopeBackup = curScope_;
+  curScope_ = type->MemberMap(); // Internal symbol lookup relies on curScope_
+  while (!ts_.Try('}')) {
+    if (ts_.Empty()) {
+      Error(ts_.Peek(), "premature end of input");
+    }
+
+    if(ts_.Try(Token::STATIC_ASSERT)) {
+      ParseStaticAssert();
+      continue;
+    }
+
+    // Parse type specifiers/qualifiers; storage specifiers are not accepted
+    int align;
+    auto baseType = ParseDeclSpec(nullptr, nullptr, &align);
+    do {
+      auto tokTypePair = ParseDeclarator(baseType);
+      auto tok = tokTypePair.first;
+      auto memberType = tokTypePair.second;
+
+      if (ts_.Try(':')) {
+        ParseBitField(type, tok, memberType);
+        continue;
+      }
+
+      if (tok == nullptr) {
+        auto suType =
memberType->ToStruct();
+        if (suType && !suType->HasTag()) {
+          auto anony = Object::NewAnony(ts_.Peek(), suType);
+          type->MergeAnony(anony);
+          continue;
+        } else {
+          Error(ts_.Peek(), "declaration does not declare anything");
+        }
+      }
+
+      const auto& name = tok->str_;
+      if (type->GetMember(name)) {
+        Error(tok, "duplicate member '%s'", name.c_str());
+      } else if (!memberType->Complete()) {
+        // C11 6.7.2.1 [3]:
+        if (type->IsStruct() &&
+            // Struct has more than one named member
+            type->MemberMap()->size() > 0 &&
+            memberType->ToArray()) {
+          ts_.Expect(';'); ts_.Expect('}');
+          ADD_MEMBER();
+          goto finalize;
+        } else {
+          Error(tok, "field '%s' has incomplete type", name.c_str());
+        }
+      } else if (memberType->ToFunc()) {
+        Error(tok, "field '%s' declared as a function", name.c_str());
+      }
+
+      ADD_MEMBER();
+    } while (ts_.Try(','));
+    ts_.Expect(';');
+  }
+finalize:
+  // GNU extension: type attributes
+  TryAttributeSpecList();
+
+  // The struct/union definition is finished; mark it as a complete type
+  type->Finalize();
+  type->SetComplete(true);
+  // TODO(wgtdkp): we need to export tags defined inside struct
+  const auto& tags = curScope_->AllTagsInCurScope();
+  for (auto tag: tags) {
+    if (scopeBackup->FindTag(tag->Tok()))
+      Error(tag, "redefinition of tag '%s'\n", tag->Name().c_str());
+    scopeBackup->InsertTag(tag);
+  }
+  curScope_ = scopeBackup;
+
+  return type;
+}
+
+
+void Parser::ParseBitField(StructType* structType,
+                           const Token* tok,
+                           QualType type) {
+  if (!type->IsInteger()) {
+    Error(tok ? tok: ts_.Peek(), "expect integer type for bitfield");
+  }
+
+  auto expr = ParseAssignExpr();
+  auto width = Evaluator<long>().Eval(expr);
+  if (width < 0) {
+    Error(expr, "expect non-negative value");
+  } else if (width == 0 && tok) {
+    Error(tok, "no declarator expected for a bitfield with width 0");
+  } else if (width > type->Width() * 8) {
+    Error(expr, "width exceeds its type");
+  }
+
+  auto offset = structType->Offset() - type->Width();
+  // C11 6.7.5 [2]: an alignment attribute shall not be specified in the
+  // declaration of a bit-field, so it is OK to use type->Align() here
+  offset = Type::MakeAlign(std::max(offset, 0), type->Align());
+
+  int bitFieldOffset;
+  unsigned char begin;
+
+  if (!structType->IsStruct()) {
+    begin = 0;
+    bitFieldOffset = 0;
+  } else if (structType->Members().size() == 0) {
+    begin = 0;
+    bitFieldOffset = 0;
+  } else {
+    auto last = structType->Members().back();
+    auto totalBits = last->Offset() * 8;
+    if (last->BitFieldWidth()) {
+      totalBits += last->BitFieldEnd();
+    } else { // Is not a bit-field
+      totalBits += last->Type()->Width() * 8;
+    }
+
+    if (width == 0)
+      width = type->Width() * 8 - totalBits; // So a posterior bit-field would be packed
+    if (width == 0) // A bit-field with zero width is never added to the member list,
+      return;       // because we use the bit-field width to tell whether a member is a bit-field.
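+
+    // Illustrative sketch (not part of the original sources): assuming a
+    // 4-byte int, the packing rules below lay out a hypothetical
+    //   struct S { int a : 5; int b : 7; int c : 25; };
+    // as follows: 'a' gets bitFieldOffset 0, begin 0; 'b' continues the same
+    // unit (totalBits = 5, so begin = 5 % 8 = 5, bitFieldOffset = 5 / 8 = 0);
+    // 'c' no longer fits (12 + 25 > 32), so it starts a fresh unit at the
+    // next offset aligned to type->Width(), with begin = 0.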
+    if (width + totalBits <= type->Width() * 8) {
+      begin = totalBits % 8;
+      bitFieldOffset = totalBits / 8;
+    } else {
+      begin = 0;
+      bitFieldOffset = Type::MakeAlign(structType->Offset(), type->Width());
+    }
+  }
+
+  Object* bitField;
+  if (tok) {
+    bitField = Object::New(tok, type, 0, L_NONE, begin, width);
+  } else {
+    bitField = Object::NewAnony(ts_.Peek(), type, 0, L_NONE, begin, width);
+  }
+  structType->AddBitField(bitField, bitFieldOffset);
+}
+
+
+int Parser::ParseQual() {
+  int qualSpec = 0;
+  for (; ;) {
+    auto tok = ts_.Next();
+    switch (tok->tag_) {
+    case Token::CONST:    qualSpec |= Qualifier::CONST;    break;
+    case Token::RESTRICT: qualSpec |= Qualifier::RESTRICT; break;
+    case Token::VOLATILE: qualSpec |= Qualifier::VOLATILE; break;
+    case Token::ATOMIC:   Error(tok, "'_Atomic' is not supported"); break;
+    default: ts_.PutBack(); return qualSpec;
+    }
+  }
+}
+
+
+QualType Parser::ParsePointer(QualType typePointedTo) {
+  while (ts_.Try('*')) {
+    auto t = PointerType::New(typePointedTo);
+    typePointedTo = QualType(t, ParseQual());
+  }
+  return typePointedTo;
+}
+
+
+static QualType ModifyBase(QualType type, QualType base, QualType newBase) {
+  if (type == base)
+    return newBase;
+
+  auto ty = type->ToDerived();
+  ty->SetDerived(ModifyBase(ty->Derived(), base, newBase));
+
+  return ty;
+}
+
+
+/*
+ * Return: a pair of the token (which must be an identifier) and its type.
+ * If the token is nullptr, we are parsing an abstract declarator;
+ * otherwise, a direct declarator.
+ */
+TokenTypePair Parser::ParseDeclarator(QualType base) {
+  // May be pointer
+  auto pointerType = ParsePointer(base);
+
+  if (ts_.Try('(')) {
+    // The current pointerType is not the correct base type yet
+    auto tokenTypePair = ParseDeclarator(pointerType);
+    auto tok = tokenTypePair.first;
+    auto type = tokenTypePair.second;
+
+    ts_.Expect(')');
+
+    auto newBase = ParseArrayFuncDeclarator(tok, pointerType);
+
+    // Fix up the base type
+    auto retType = ModifyBase(type, pointerType, newBase);
+    return TokenTypePair(tokenTypePair.first, retType);
+  } else if (ts_.Peek()->IsIdentifier()) {
+    auto tok = ts_.Next();
+    // GNU extension: variable attributes
+    TryAttributeSpecList();
+    auto retType = ParseArrayFuncDeclarator(tok, pointerType);
+    return TokenTypePair(tok, retType);
+  } else {
+    errTok_ = ts_.Peek();
+    auto retType = ParseArrayFuncDeclarator(nullptr, pointerType);
+    return TokenTypePair(nullptr, retType);
+  }
+}
+
+
+Identifier* Parser::ProcessDeclarator(const Token* tok,
+                                      QualType type,
+                                      int storageSpec,
+                                      int funcSpec,
+                                      int align) {
+  assert(tok);
+
+  // Check whether this variable is already defined in the same scope.
+  // If the storage class is typedef, insert the type into the symbol table.
+  // Defining a variable of type void is illegal; only pointers to void are
+  // allowed. If funcSpec != 0, we must be defining a function; otherwise
+  // report an error.
+  const auto& name = tok->str_;
+  Identifier* ident;
+
+  if (storageSpec & S_TYPEDEF) {
+    // C11 6.7.5 [2]: alignment specifier
+    if (align > 0)
+      Error(tok, "alignment specified for typedef");
+
+    ident = curScope_->FindInCurScope(tok);
+    if (ident) { // There is a prior declaration in the same scope
+      // The same declaration: simply return the prior declaration
+      if (!type->Compatible(*ident->Type()))
+        Error(tok, "conflicting types for '%s'", name.c_str());
+
+      // TODO(wgtdkp): add previous declaration information
+      return ident;
+    }
+    ident = Identifier::New(tok, type, L_NONE);
+    curScope_->Insert(ident);
+    return ident;
+  }
+
+  if (type->ToVoid()) {
+    Error(tok, "variable or field '%s' declared void",
+        name.c_str());
+  }
+
+  if (type->ToFunc() && curScope_->Type() != S_FILE
+      && (storageSpec & S_STATIC)) {
+    Error(tok, "invalid storage class for function '%s'",
name.c_str()); + } + + Linkage linkage; + // Identifiers in function prototype have no linkage + if (curScope_->Type() == S_PROTO) { + linkage = L_NONE; + } else if (curScope_->Type() == S_FILE) { + linkage = L_EXTERNAL; // Default linkage for file scope identifiers + if (storageSpec & S_STATIC) + linkage = L_INTERNAL; + } else if (!(storageSpec & S_EXTERN)) { + linkage = L_NONE; // Default linkage for block scope identifiers + if (type->ToFunc()) + linkage = L_EXTERNAL; + } else { + linkage = L_EXTERNAL; + } + + ident = curScope_->FindInCurScope(tok); + if (ident) { // There is prio declaration in the same scope + if (!type->Compatible(*ident->Type())) { + Error(tok, "conflicting types for '%s'", name.c_str()); + } + + // The same scope prio declaration has no linkage, + // there is a redeclaration error + if (linkage == L_NONE) { + Error(tok, "redeclaration of '%s' with no linkage", + name.c_str()); + } else if (linkage == L_EXTERNAL) { + if (ident->Linkage() == L_NONE) { + Error(tok, "conflicting linkage for '%s'", name.c_str()); + } + } else { + if (ident->Linkage() != L_INTERNAL) { + Error(tok, "conflicting linkage for '%s'", name.c_str()); + } + } + // The same declaration, simply return the prio declaration + if (!ident->Type()->Complete()) + ident->Type()->SetComplete(type->Complete()); + // Prio declaration of a function may omit the param name + if (type->ToFunc()) + ident->Type()->ToFunc()->SetParams(type->ToFunc()->Params()); + else if (ident->ToObject() && !(storageSpec & S_EXTERN)) + ident->ToObject()->SetStorage(ident->ToObject()->Storage() & ~S_EXTERN); + return ident; + } else if (linkage == L_EXTERNAL) { + ident = curScope_->Find(tok); + if (ident) { + if (!type->Compatible(*ident->Type())) { + Error(tok, "conflicting types for '%s'", name.c_str()); + } + if (ident->Linkage() != L_NONE) { + linkage = ident->Linkage(); + } + // Don't return, override it + } else { + ident = externalSymbols_->FindInCurScope(tok); + if (ident) { + if (!type->Compatible(*ident->Type())) { + Error(tok, "conflicting types for '%s'", name.c_str()); + } + // TODO(wgtdkp): ??????? + // Don't return + // To stop later declaration with the same name in the same scope overriding this declaration + + // Useless here, just keep it + if (!ident->Type()->Complete()) + ident->Type()->SetComplete(type->Complete()); + //return ident; + } + } + } + + Identifier* ret; + // TODO(wgtdkp): Treat function as object ? 
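+  // Illustrative sketch (hypothetical translation unit) of the linkage
+  // rules resolved above:
+  //   static int s;      // file scope + static      -> L_INTERNAL
+  //   int g;             // file scope, no specifier -> L_EXTERNAL
+  //   void f(void) {
+  //     extern int g;    // block scope + extern     -> L_EXTERNAL (binds to g)
+  //     int local;       // block scope, no linkage  -> L_NONE
+  //   }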
+ if (type->ToFunc()) { + // C11 6.7.5 [2]: alignment specifier + if (align > 0) + Error(tok, "alignment specified for function"); + ret = Identifier::New(tok, type, linkage); + } else { + auto obj = Object::New(tok, type, storageSpec, linkage); + if (align > 0) + obj->SetAlign(align); + ret = obj; + } + curScope_->Insert(ret); + if (linkage == L_EXTERNAL && ident == nullptr) { + externalSymbols_->Insert(ret); + } + + return ret; +} + + +QualType Parser::ParseArrayFuncDeclarator(const Token* ident, QualType base) { + if (ts_.Try('[')) { + + if (ts_.Try('{')) { + if(!base->IsScalar()) { + Error(ts_.Peek(), "tiles must have scalar elements"); + } + auto shape = ParseTileShape(); + ts_.Expect('}'); + ts_.Expect(']'); + base = ParseArrayFuncDeclarator(ident, base); + if (!base->Complete()) { + // FIXME(wgtdkp): ident could be nullptr + Error(ident, "'%s' has incomplete element type", + ident->str_.c_str()); + } + return TileType::New(shape, base); + } + + if (nullptr != base->ToFunc()) { + Error(ts_.Peek(), "the element of array cannot be a function"); + } + + auto len = ParseArrayLength(); + ts_.Expect(']'); + + base = ParseArrayFuncDeclarator(ident, base); + if (!base->Complete()) { + // FIXME(wgtdkp): ident could be nullptr + Error(ident, "'%s' has incomplete element type", + ident->str_.c_str()); + } + return ArrayType::New(len, base); + } else if (ts_.Try('(')) { // Function declaration + if (base->ToFunc()) { + Error(ts_.Peek(), + "the return value of function cannot be function"); + } else if (nullptr != base->ToArray()) { + Error(ts_.Peek(), + "the return value of function cannot be array"); + } + + FuncType::ParamList params; + EnterProto(); + auto variadic = ParseParamList(params); + ExitProto(); + + ts_.Expect(')'); + base = ParseArrayFuncDeclarator(ident, base); + + return FuncType::New(base, 0, variadic, params); + } + + + return base; +} + + +/* + * Return: -1, length not specified + */ +int Parser::ParseArrayLength() { + auto hasStatic = ts_.Try(Token::STATIC); + auto qual = ParseQual(); + if (0 != qual) + hasStatic = ts_.Try(Token::STATIC); + + // 不支持变长数组 + if (!hasStatic && ts_.Test(']')) + return -1; + + auto expr = ParseAssignExpr(); + EnsureInteger(expr); + auto ret = Evaluator().Eval(expr); + if (ret < 0) { + Error(expr, "size of array is negative"); + } + return ret; +} + +TileType::ShapeInt Parser::ParseTileShape() { + TileType::ShapeInt ret; + size_t i = 0; + do { + Expr* expr = ParseConditionalExpr(); + EnsureInteger(expr); + int dim = Evaluator().Eval(expr); + if (dim < 0) + Error(expr, "shape %d of tile is negative", i); + ret.push_back(dim); + i++; + }while(ts_.Try(',')); + return ret; +} + +/* + * Return: true, variadic; + */ +bool Parser::ParseParamList(FuncType::ParamList& params) { + if (ts_.Test(')')) + return false; + auto param = ParseParamDecl(); + if (param->Type()->ToVoid()) + return false; + params.push_back(param); + + while (ts_.Try(',')) { + if (ts_.Try(Token::ELLIPSIS)) + return true; + param = ParseParamDecl(); + if (param->Type()->ToVoid()) + Error(param, "'void' must be the only parameter"); + params.push_back(param); + } + return false; +} + + +Object* Parser::ParseParamDecl() { + int storageSpec, funcSpec; + // C11 6.7.5 [2]: alignment specifier cannot be specified in params + auto type = ParseDeclSpec(&storageSpec, &funcSpec, nullptr); + auto tokTypePair = ParseDeclarator(type); + auto tok = tokTypePair.first; + type = Type::MayCast(tokTypePair.second, true); + if (!tok) { // Abstract declarator + return Object::NewAnony(ts_.Peek(), type, 0, 
Linkage::L_NONE); + } + + // Align set to non positive, stands for not specified + auto ident = ProcessDeclarator(tok, type, storageSpec, funcSpec, -1); + if (!ident->ToObject()) + Error(ident, "expect object in param list"); + + return ident->ToObject(); +} + + +QualType Parser::ParseAbstractDeclarator(QualType type) { + auto tokenTypePair = ParseDeclarator(type); + auto tok = tokenTypePair.first; + type = tokenTypePair.second; + if (tok) { // Not a abstract declarator! + Error(tok, "unexpected identifier '%s'", tok->str_.c_str()); + } + return type; +} + + +Identifier* Parser::ParseDirectDeclarator(QualType type, + int storageSpec, + int funcSpec, + int align) { + auto tokenTypePair = ParseDeclarator(type); + auto tok = tokenTypePair.first; + type = tokenTypePair.second; + if (tok == nullptr) { + Error(errTok_, "expect identifier or '('"); + } + + return ProcessDeclarator(tok, type, storageSpec, funcSpec, align); +} + + +Declaration* Parser::ParseInitDeclarator(Identifier* ident) { + auto obj = ident->ToObject(); + if (!obj) { // Do not record function Declaration + return nullptr; + } + + const auto& name = obj->Name(); + if (ts_.Try('=')) { + return ParseInitDeclaratorSub(obj); + } + + if (!obj->Type()->Complete()) { + if (obj->Linkage() == L_NONE) { + Error(obj, "storage size of '%s' isn’t known", name.c_str()); + } + // FIXME(wgtdkp): + // Discards the incomplete object declarations + // It causes linking failure of forward-declared objects with imcomplete type + return nullptr; + } + + if (!obj->Decl()) { + auto decl = Declaration::New(obj); + obj->SetDecl(decl); + return decl; + } + + return nullptr; +} + + +Declaration* Parser::ParseInitDeclaratorSub(Object* obj) { + const auto& name = obj->Name(); + if ((curScope_->Type() != S_FILE) && obj->Linkage() != L_NONE) { + Error(obj, "'%s' has both 'extern' and initializer", name.c_str()); + } + + if (!obj->Type()->Complete() && !obj->Type()->ToArray()) { + Error(obj, "variable '%s' has initializer but incomplete type", + name.c_str()); + } + + if (obj->HasInit()) { + Error(obj, "redefinition of variable '%s'", name.c_str()); + } + + // There could be more than one declaration for + // an object in the same scope. + // But it must has external or internal linkage. + // So, for external/internal objects, + // the initialization will always go to + // the first declaration. As the initialization + // is evaluated at compile time, + // the order doesn't matter. + // For objects with no linkage, there is + // always only one declaration. + // Once again, we need not to worry about + // the order of the initialization. 
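+  // Illustrative sketch (hypothetical input): with external or internal
+  // linkage there may be several declarations of one object in a scope, e.g.
+  //   int n;        // the first declaration creates the Declaration node
+  //   int n = 42;   // the initializer attaches to that same node
+  // which is why the branch below reuses obj->Decl() when it already exists.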
+ if (obj->Decl()) { + ParseInitializer(obj->Decl(), obj->Type(), 0, false, true); + return nullptr; + } else { + auto decl = Declaration::New(obj); + ParseInitializer(decl, obj->Type(), 0, false, true); + obj->SetDecl(decl); + return decl; + } +} + + +void Parser::ParseInitializer(Declaration* decl, + QualType type, + int offset, + bool designated, + bool forceBrace, + unsigned char bitFieldBegin, + unsigned char bitFieldWidth) { + if (designated && !ts_.Test('.') && !ts_.Test('[')) { + ts_.Expect('='); + } + + Expr* expr; + auto arrType = type->ToArray(); + auto structType = type->ToStruct(); + // A compound literal in initializer is reduced to a initializer directly + // It means that the compound literal will never be created + //auto literalType = TryCompoundLiteral(); + //if (literalType && !literalType->Compatible(*type)) + // Error("incompatible type of initializer"); + if (arrType) { + if (forceBrace && !ts_.Test('{') && !ts_.Test(Token::LITERAL)) { + ts_.Expect('{'); + } else if (!ParseLiteralInitializer(decl, arrType, offset)) { + ParseArrayInitializer(decl, arrType, offset, designated); + arrType->SetComplete(true); + } + return; + } else if (structType) { + if (!ts_.Test('.') && !ts_.Test('{')) { + auto mark = ts_.Mark(); + expr = ParseAssignExpr(); + if (structType->Compatible(*expr->Type())) { + decl->AddInit({structType, offset, expr}); + return; + } + ts_.ResetTo(mark); + if (forceBrace) + ts_.Expect('{'); + } + return ParseStructInitializer(decl, structType, offset, designated); + } + + // Scalar type + auto hasBrace = ts_.Try('{'); + expr = ParseAssignExpr(); + if (hasBrace) { + ts_.Try(','); + ts_.Expect('}'); + } + decl->AddInit({type.GetPtr(), offset, expr, bitFieldBegin, bitFieldWidth}); +} + + +bool Parser::ParseLiteralInitializer(Declaration* decl, + ArrayType* type, + int offset) { + if (!type->Derived()->IsInteger()) + return false; + + auto hasBrace = ts_.Try('{'); + if (!ts_.Test(Token::LITERAL)) { + if (hasBrace) ts_.PutBack(); + return false; + } + auto literal = ConcatLiterals(ts_.Next()); + auto tok = literal->Tok(); + + if (hasBrace) { + ts_.Try(','); + ts_.Expect('}'); + } + + if (!type->Complete()) { + type->SetLen(literal->Type()->ToArray()->Len()); + type->SetComplete(true); + } + + auto width = std::min(type->Width(), literal->Type()->Width()); + auto str = literal->SVal()->c_str(); + + for (; width >= 8; width -= 8) { + auto p = reinterpret_cast(str); + auto type = ArithmType::New(T_LONG); + auto val = Constant::New(tok, T_LONG, static_cast(*p)); + decl->AddInit({type, offset, val}); + offset += 8; + str += 8; + } + + for (; width >= 4; width -= 4) { + auto p = reinterpret_cast(str); + auto type = ArithmType::New(T_INT); + auto val = Constant::New(tok, T_INT, static_cast(*p)); + decl->AddInit({type, offset, val}); + offset += 4; + str += 4; + } + + for (; width >= 2; width -= 2) { + auto p = reinterpret_cast(str); + auto type = ArithmType::New(T_SHORT); + auto val = Constant::New(tok, T_SHORT, static_cast(*p)); + decl->AddInit({type, offset, val}); + offset += 2; + str += 2; + } + + for (; width >= 1; --width) { + auto p = str; + auto type = ArithmType::New(T_CHAR); + auto val = Constant::New(tok, T_CHAR, static_cast(*p)); + decl->AddInit({type, offset, val}); + offset++; + str++; + } + + return true; +} + + +void Parser::ParseArrayInitializer(Declaration* decl, + ArrayType* type, + int offset, + bool designated) { + assert(type); + + if (!type->Complete()) + type->SetLen(0); + + int idx = 0; + auto width = type->Derived()->Width(); + auto hasBrace = 
ts_.Try('{'); + while (true) { + if (ts_.Test('}')) { + if (hasBrace) + ts_.Next(); + return; + } + + if (!designated && !hasBrace && (ts_.Test('.') || ts_.Test('['))) { + ts_.PutBack(); // Put the read comma(',') back + return; + } else if ((designated = ts_.Try('['))) { + auto expr = ParseAssignExpr(); + EnsureInteger(expr); + idx = Evaluator().Eval(expr); + ts_.Expect(']'); + + if (idx < 0 || (type->Complete() && idx >= type->Len())) { + Error(ts_.Peek(), "excess elements in array initializer"); + } + } + + ParseInitializer(decl, type->Derived(), offset + idx * width, designated); + designated = false; + ++idx; + + if (type->Complete() && idx >= type->Len()) { + break; + } else if (!type->Complete()) { + type->SetLen(std::max(idx, type->Len())); + } + + // Needless comma at the end is legal + if (!ts_.Try(',')) { + if (hasBrace) + ts_.Expect('}'); + return; + } + } + + if (hasBrace) { + ts_.Try(','); + if (!ts_.Try('}')) { + Error(ts_.Peek(), "excess elements in array initializer"); + } + } +} + + +StructType::Iterator Parser::ParseStructDesignator(StructType* type, + const std::string& name) { + auto iter = type->Members().begin(); + for (; iter != type->Members().end(); ++iter) { + if ((*iter)->Anonymous()) { + auto anonyType = (*iter)->Type()->ToStruct(); + assert(anonyType); + if (anonyType->GetMember(name)) { + return iter; // ParseStructDesignator(anonyType); + } + } else if ((*iter)->Name() == name) { + return iter; + } + } + assert(false); + return iter; +} + + +void Parser::ParseStructInitializer(Declaration* decl, + StructType* type, + int offset, + bool designated) { + assert(type); + + auto hasBrace = ts_.Try('{'); + auto member = type->Members().begin(); + while (true) { + if (ts_.Test('}')) { + if (hasBrace) + ts_.Next(); + return; + } + + if (!designated && !hasBrace && (ts_.Test('.') || ts_.Test('['))) { + ts_.PutBack(); // Put the read comma(',') back + return; + } + + if ((designated = ts_.Try('.'))) { + auto tok = ts_.Expect(Token::IDENTIFIER); + const auto& name = tok->str_; + if (!type->GetMember(name)) { + Error(tok, "member '%s' not found", name.c_str()); + } + member = ParseStructDesignator(type, name); + } + if (member == type->Members().end()) + break; + + if ((*member)->Anonymous()) { + if (designated) { // Put back '.' and member name. 
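+      // E.g. for a hypothetical input '{ .m = 1 }' where 'm' lives inside an
+      // anonymous member: the '.' and the identifier were already consumed
+      // above, so un-read both and let the recursive ParseInitializer call
+      // parse the designator again against the anonymous member's own type.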
+        ts_.PutBack();
+        ts_.PutBack();
+      }
+      // Because offsets of members of an anonymous struct/union are based
+      // directly on the enclosing struct/union
+      ParseInitializer(decl, (*member)->Type(), offset, designated, false,
+                       (*member)->BitFieldBegin(), (*member)->BitFieldWidth());
+    } else {
+      ParseInitializer(decl, (*member)->Type(),
+                       offset + (*member)->Offset(), designated, false,
+                       (*member)->BitFieldBegin(), (*member)->BitFieldWidth());
+    }
+    designated = false;
+    ++member;
+
+    // Union: just initialize the first member
+    if (!type->IsStruct())
+      break;
+
+    if (!hasBrace && member == type->Members().end())
+      break;
+
+    // A needless comma at the end is allowed
+    if (!ts_.Try(',')) {
+      if (hasBrace)
+        ts_.Expect('}');
+      return;
+    }
+  }
+
+  if (hasBrace) {
+    ts_.Try(',');
+    if (!ts_.Try('}')) {
+      Error(ts_.Peek(), "excess members in struct initializer");
+    }
+  }
+}
+
+
+/*
+ * Statements
+ */
+
+Stmt* Parser::ParseStmt() {
+  auto tok = ts_.Next();
+  if (tok->IsEOF())
+    Error(tok, "premature end of input");
+
+  switch (tok->tag_) {
+  // GNU extension: statement attributes
+  case Token::ATTRIBUTE:
+    TryAttributeSpecList();
+    // Fall through
+  case ';':
+    return EmptyStmt::New();
+  case '{':
+    return ParseCompoundStmt();
+  case Token::IF:
+    return ParseIfStmt();
+  case Token::SWITCH:
+    return ParseSwitchStmt();
+  case Token::WHILE:
+    return ParseWhileStmt();
+  case Token::DO:
+    return ParseDoStmt();
+  case Token::FOR:
+    return ParseForStmt();
+  case Token::GOTO:
+    return ParseGotoStmt();
+  case Token::CONTINUE:
+    return ParseContinueStmt();
+  case Token::BREAK:
+    return ParseBreakStmt();
+  case Token::RETURN:
+    return ParseReturnStmt();
+  case Token::CASE:
+    return ParseCaseStmt();
+  case Token::DEFAULT:
+    return ParseDefaultStmt();
+  }
+
+  if (tok->IsIdentifier() && ts_.Try(':')) {
+    // GNU extension: label attributes
+    TryAttributeSpecList();
+    return ParseLabelStmt(tok);
+  }
+
+  ts_.PutBack();
+  auto expr = ParseExpr();
+  ts_.Expect(';');
+
+  return expr;
+}
+
+
+CompoundStmt* Parser::ParseCompoundStmt(FuncType* funcType) {
+  EnterBlock(funcType);
+
+  std::list<Stmt*> stmts;
+
+  while (!ts_.Try('}')) {
+    if (ts_.Peek()->IsEOF()) {
+      Error(ts_.Peek(), "premature end of input");
+    }
+
+    if (IsType(ts_.Peek())) {
+      stmts.push_back(ParseDecl());
+    } else {
+      stmts.push_back(ParseStmt());
+    }
+  }
+
+  auto scope = curScope_;
+  ExitBlock();
+
+  return CompoundStmt::New(stmts, scope);
+}
+
+
+IfStmt* Parser::ParseIfStmt() {
+  ts_.Expect('(');
+  auto tok = ts_.Peek();
+  auto cond = ParseExpr();
+  if (!cond->Type()->IsScalar()) {
+    Error(tok, "expect scalar");
+  }
+  ts_.Expect(')');
+
+  auto then = ParseStmt();
+  Stmt* els = nullptr;
+  if (ts_.Try(Token::ELSE))
+    els = ParseStmt();
+
+  return IfStmt::New(cond, then, els);
+}
+
+
+/*
+ * The for-loop construct:
+ *   for (declaration; expression1; expression2) statement
+ * expands to:
+ *   declaration
+ *   cond: if (expression1) then empty
+ *         else goto end
+ *         statement
+ *   step: expression2
+ *         goto cond
+ *   end:
+ */
+
+#define ENTER_LOOP_BODY(breakDest, continueDest)  \
+{                                                 \
+  LabelStmt* breakDestBackup = breakDest_;        \
+  LabelStmt* continueDestBackup = continueDest_;  \
+  breakDest_ = breakDest;                         \
+  continueDest_ = continueDest;
+
+#define EXIT_LOOP_BODY()              \
+  breakDest_ = breakDestBackup;       \
+  continueDest_ = continueDestBackup; \
+}
+
+CompoundStmt* Parser::ParseForStmt() {
+  EnterBlock();
+  ts_.Expect('(');
+
+  std::list<Stmt*> stmts;
+
+  if (IsType(ts_.Peek())) {
+    stmts.push_back(ParseDecl());
+  } else if (!ts_.Try(';')) {
+    stmts.push_back(ParseExpr());
+    ts_.Expect(';');
+  }
+
+  Expr* condExpr = nullptr;
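+
+  // Illustrative sketch (hypothetical input): 'for (int i = 0; i < n; ++i) s;'
+  // is lowered into the statement list built below, roughly:
+  //   int i = 0;
+  //   cond: if (i < n) {} else goto end;
+  //   s;
+  //   step: ++i;
+  //   goto cond;
+  //   end: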
+  if (!ts_.Try(';')) {
+    condExpr = ParseExpr();
+    ts_.Expect(';');
+  }
+
+  Expr* stepExpr = nullptr;
+  if (!ts_.Try(')')) {
+    stepExpr = ParseExpr();
+    ts_.Expect(')');
+  }
+
+  auto condLabel = LabelStmt::New();
+  auto stepLabel = LabelStmt::New();
+  auto endLabel = LabelStmt::New();
+  stmts.push_back(condLabel);
+  if (condExpr) {
+    auto gotoEndStmt = JumpStmt::New(endLabel);
+    auto ifStmt = IfStmt::New(condExpr, EmptyStmt::New(), gotoEndStmt);
+    stmts.push_back(ifStmt);
+  }
+
+  // We must give break and continue statements their target labels,
+  // or they would not know where to jump.
+  Stmt* bodyStmt;
+  ENTER_LOOP_BODY(endLabel, stepLabel);
+  bodyStmt = ParseStmt();
+  // Because for-loops nest, restore the break and continue targets here
+  EXIT_LOOP_BODY()
+
+  stmts.push_back(bodyStmt);
+  stmts.push_back(stepLabel);
+  if (stepExpr)
+    stmts.push_back(stepExpr);
+  else
+    stmts.push_back(EmptyStmt::New());
+  stmts.push_back(JumpStmt::New(condLabel));
+  stmts.push_back(endLabel);
+
+  auto scope = curScope_;
+  ExitBlock();
+
+  return CompoundStmt::New(stmts, scope);
+}
+
+
+/*
+ * The while-loop construct:
+ *   while (expression) statement
+ * expands to:
+ *   cond: if (expression) then empty
+ *         else goto end
+ *         statement
+ *         goto cond
+ *   end:
+ */
+CompoundStmt* Parser::ParseWhileStmt() {
+  std::list<Stmt*> stmts;
+  ts_.Expect('(');
+  auto tok = ts_.Peek();
+  auto condExpr = ParseExpr();
+  ts_.Expect(')');
+
+  if (!condExpr->Type()->IsScalar()) {
+    Error(tok, "scalar expression expected");
+  }
+
+  auto condLabel = LabelStmt::New();
+  auto endLabel = LabelStmt::New();
+  auto gotoEndStmt = JumpStmt::New(endLabel);
+  auto ifStmt = IfStmt::New(condExpr, EmptyStmt::New(), gotoEndStmt);
+  stmts.push_back(condLabel);
+  stmts.push_back(ifStmt);
+
+  Stmt* bodyStmt;
+  ENTER_LOOP_BODY(endLabel, condLabel)
+  bodyStmt = ParseStmt();
+  EXIT_LOOP_BODY()
+
+  stmts.push_back(bodyStmt);
+  stmts.push_back(JumpStmt::New(condLabel));
+  stmts.push_back(endLabel);
+
+  return CompoundStmt::New(stmts);
+}
+
+
+/*
+ * The do-while construct:
+ *   do statement while (expression)
+ * expands to:
+ *   begin: statement
+ *   cond:  if (expression) then goto begin
+ *          else goto end
+ *   end:
+ */
+CompoundStmt* Parser::ParseDoStmt() {
+  auto beginLabel = LabelStmt::New();
+  auto condLabel = LabelStmt::New();
+  auto endLabel = LabelStmt::New();
+
+  Stmt* bodyStmt;
+  ENTER_LOOP_BODY(endLabel, beginLabel)
+  bodyStmt = ParseStmt();
+  EXIT_LOOP_BODY()
+
+  ts_.Expect(Token::WHILE);
+  ts_.Expect('(');
+  auto condExpr = ParseExpr();
+  ts_.Expect(')');
+  ts_.Expect(';');
+
+  auto gotoBeginStmt = JumpStmt::New(beginLabel);
+  auto gotoEndStmt = JumpStmt::New(endLabel);
+  auto ifStmt = IfStmt::New(condExpr, gotoBeginStmt, gotoEndStmt);
+
+  std::list<Stmt*> stmts;
+  stmts.push_back(beginLabel);
+  stmts.push_back(bodyStmt);
+  stmts.push_back(condLabel);
+  stmts.push_back(ifStmt);
+  stmts.push_back(endLabel);
+
+  return CompoundStmt::New(stmts);
+}
+
+
+#undef ENTER_LOOP_BODY
+#undef EXIT_LOOP_BODY
+
+
+#define ENTER_SWITCH_BODY(breakDest, caseLabels)  \
+{                                                 \
+  CaseLabelList* caseLabelsBackup = caseLabels_;  \
+  LabelStmt* defaultLabelBackup = defaultLabel_;  \
+  LabelStmt* breakDestBackup = breakDest_;        \
+  breakDest_ = breakDest;                         \
+  caseLabels_ = &caseLabels;                      \
+  defaultLabel_ = nullptr;
+
+#define EXIT_SWITCH_BODY()            \
+  caseLabels_ = caseLabelsBackup;     \
+  breakDest_ = breakDestBackup;       \
+  defaultLabel_ = defaultLabelBackup; \
+}
+
+
+/*
+ * switch
+ *  jump stmt (skip case labels)
+ *  case labels
+ *  jump stmts
+ *  default jump stmt
+ */
+CompoundStmt* Parser::ParseSwitchStmt() {
+  std::list<Stmt*> stmts;
+  ts_.Expect('(');
+  auto tok = ts_.Peek();
+  auto expr = ParseExpr();
+
ts_.Expect(')'); + + if (!expr->Type()->IsInteger()) { + Error(tok, "switch quantity not an integer"); + } + + auto testLabel = LabelStmt::New(); + auto endLabel = LabelStmt::New(); + auto t = TempVar::New(expr->Type()); + auto assign = BinaryOp::New(tok, '=', t, expr); + stmts.push_back(assign); + stmts.push_back(JumpStmt::New(testLabel)); + + CaseLabelList caseLabels; + ENTER_SWITCH_BODY(endLabel, caseLabels); + + auto bodyStmt = ParseStmt(); // Fill caseLabels and defaultLabel + stmts.push_back(bodyStmt); + stmts.push_back(JumpStmt::New(endLabel)); + stmts.push_back(testLabel); + + for (auto iter = caseLabels.begin(); + iter != caseLabels.end(); ++iter) { + auto cond = BinaryOp::New(tok, Token::EQ, t, iter->first); + auto then = JumpStmt::New(iter->second); + auto ifStmt = IfStmt::New(cond, then, nullptr); + stmts.push_back(ifStmt); + } + if (defaultLabel_) + stmts.push_back(JumpStmt::New(defaultLabel_)); + EXIT_SWITCH_BODY(); + + stmts.push_back(endLabel); + + return CompoundStmt::New(stmts); +} + + +#undef ENTER_SWITCH_BODY +#undef EXIT_SWITCH_BODY + + +CompoundStmt* Parser::ParseCaseStmt() { + auto tok = ts_.Peek(); + + // Case ranges: Non-standard GNU extension + long begin, end; + begin = Evaluator().Eval(ParseAssignExpr()); + if (ts_.Try(Token::ELLIPSIS)) + end = Evaluator().Eval(ParseAssignExpr()); + else + end = begin; + ts_.Expect(':'); + + auto labelStmt = LabelStmt::New(); + for (auto val = begin; val <= end; ++val) { + if (val > INT_MAX) + Error(tok, "case range exceed range of int"); + auto cons = Constant::New(tok, T_INT, val); + caseLabels_->push_back(std::make_pair(cons, labelStmt)); + } + + std::list stmts; + stmts.push_back(labelStmt); + stmts.push_back(ParseStmt()); + + return CompoundStmt::New(stmts); +} + + +CompoundStmt* Parser::ParseDefaultStmt() { + auto tok = ts_.Peek(); + ts_.Expect(':'); + if (defaultLabel_) { // There is a 'default' stmt + Error(tok, "multiple default labels in one switch"); + } + auto labelStmt = LabelStmt::New(); + defaultLabel_ = labelStmt; + + std::list stmts; + stmts.push_back(labelStmt); + stmts.push_back(ParseStmt()); + + return CompoundStmt::New(stmts); +} + + +JumpStmt* Parser::ParseContinueStmt() { + auto tok = ts_.Peek(); + ts_.Expect(';'); + if (continueDest_ == nullptr) { + Error(tok, "'continue' is allowed only in loop"); + } + + return JumpStmt::New(continueDest_); +} + + +JumpStmt* Parser::ParseBreakStmt() { + auto tok = ts_.Peek(); + ts_.Expect(';'); + if (breakDest_ == nullptr) { + Error(tok, "'break' is allowed only in switch/loop"); + } + + return JumpStmt::New(breakDest_); +} + + +ReturnStmt* Parser::ParseReturnStmt() { + Expr* expr; + + if (ts_.Try(';')) { + expr = nullptr; + } else { + expr = ParseExpr(); + ts_.Expect(';'); + + auto retType = curFunc_->FuncType()->Derived(); + expr = Expr::MayCast(expr, retType); + } + + return ReturnStmt::New(expr); +} + + +JumpStmt* Parser::ParseGotoStmt() { + auto label = ts_.Peek(); + ts_.Expect(Token::IDENTIFIER); + ts_.Expect(';'); + + auto labelStmt = FindLabel(label->str_); + if (labelStmt) { + return JumpStmt::New(labelStmt); + } + + auto unresolvedJump = JumpStmt::New(nullptr); + unresolvedJumps_.push_back(std::make_pair(label, unresolvedJump)); + + return unresolvedJump; +} + + +CompoundStmt* Parser::ParseLabelStmt(const Token* label) { + const auto& labelStr = label->str_; + auto stmt = ParseStmt(); + if (nullptr != FindLabel(labelStr)) { + Error(label, "redefinition of label '%s'", labelStr.c_str()); + } + + auto labelStmt = LabelStmt::New(); + AddLabel(labelStr, 
labelStmt); + std::list stmts; + stmts.push_back(labelStmt); + stmts.push_back(stmt); + + return CompoundStmt::New(stmts); +} + + +bool Parser::IsBuiltin(const std::string& name) { + return name == "__builtin_va_arg" || + name == "__builtin_va_start"; +} + + +bool Parser::IsBuiltin(FuncType* type) { + assert(vaStartType_ && vaArgType_); + return type == vaStartType_ || type == vaArgType_; +} + + +// Builtin functions will be inlined +void Parser::DefineBuiltins() { + // FIXME: potential bug: using same object for params!!! + auto voidPtr = PointerType::New(VoidType::New()); + auto param = Object::New(nullptr, voidPtr); + FuncType::ParamList pl; + pl.push_back(param); + pl.push_back(param); + vaStartType_ = FuncType::New(VoidType::New(), F_INLINE, false, pl); + vaArgType_ = FuncType::New(voidPtr, F_INLINE, false, pl); +} + + +Identifier* Parser::GetBuiltin(const Token* tok) { + assert(vaStartType_ && vaArgType_); + static Identifier* vaStart = nullptr; + static Identifier* vaArg = nullptr; + const auto& name = tok->str_; + if (name == "__builtin_va_start") { + if (!vaStart) + vaStart = Identifier::New(tok, vaStartType_, Linkage::L_EXTERNAL); + return vaStart; + } else if (name == "__builtin_va_arg") { + if (!vaArg) + vaArg = Identifier::New(tok, vaArgType_, Linkage::L_EXTERNAL); + return vaArg; + } + assert(false); + return nullptr; +} + + +/* + * GNU extensions + */ + +// Attribute +void Parser::TryAttributeSpecList() { + while (ts_.Try(Token::ATTRIBUTE)) + ParseAttributeSpec(); +} + + +void Parser::ParseAttributeSpec() { + ts_.Expect('('); + ts_.Expect('('); + + while (!ts_.Try(')')) { + ParseAttribute(); + if (!ts_.Try(',')) { + ts_.Expect(')'); + break; + } + } + ts_.Expect(')'); +} + + +void Parser::ParseAttribute() { + if (!ts_.Test(Token::IDENTIFIER)) + return; + auto tok = ts_.Next(); + if (ts_.Try('(')) { + if (ts_.Try(')')) + return; + auto tok = ts_.Next(); + if (ts_.Test(',')) { + while (ts_.Try(',')) {} + } + ts_.Try(')'); + } +} diff --git a/lib/lang/wgtcc/scanner.cc b/lib/lang/wgtcc/scanner.cc new file mode 100644 index 000000000..0f0dbdfa0 --- /dev/null +++ b/lib/lang/wgtcc/scanner.cc @@ -0,0 +1,452 @@ +#include "triton/lang/wgtcc/scanner.h" + +#include +#include + + +void Scanner::Tokenize(TokenSequence& ts) { + while (true) { + auto tok = Scan(); + if (tok->tag_ == Token::END) { + if (ts.Empty() || (ts.Back()->tag_ != Token::NEW_LINE)) { + auto t = Token::New(*tok); + t->tag_ = Token::NEW_LINE; + t->str_ = "\n"; + ts.InsertBack(t); + } + break; + } else { + if (!ts.Empty() && ts.Back()->tag_ == Token::NEW_LINE) + tok->ws_ = true; + ts.InsertBack(tok); + } + } +} + + +std::string Scanner::ScanHeadName(const Token* lhs, const Token* rhs) { + std::string str; + const char* begin = lhs->loc_.Begin() + 1; + const char* end = rhs->loc_.Begin(); + for (; begin != end; ++begin) { + if (*begin == '\n' && str.back() == '\\') + str.pop_back(); + else + str.push_back(*begin); + } + return str; +} + + +Token* Scanner::Scan(bool ws) { + tok_.ws_ = ws; + SkipWhiteSpace(); + + Mark(); + + if (Test('\n')) { + auto ret = MakeNewLine(); + Next(); + return ret; + } + auto c = Next(); + switch (c) { + case '#': return MakeToken(Try('#') ? Token::DSHARP: c); + case ':': return MakeToken(Try('>') ? 
']': c); + case '(': case ')': case '[': case ']': + case '?': case ',': case '{': case '}': + case '~': case ';': case '@': + return MakeToken(c); + case '-': + if (Try('>')) return MakeToken(Token::PTR); + if (Try('-')) return MakeToken(Token::DEC); + if (Try('=')) return MakeToken(Token::SUB_ASSIGN); + return MakeToken(c); + case '+': + if (Try('+')) return MakeToken(Token::INC); + if (Try('=')) return MakeToken(Token::ADD_ASSIGN); + return MakeToken(c); + case '<': + if (Try('<')) return MakeToken(Try('=') ? Token::LEFT_ASSIGN: Token::LEFT); + if (Try('=')) return MakeToken(Token::LE); + if (Try(':')) return MakeToken('['); + if (Try('%')) return MakeToken('{'); + return MakeToken(c); + case '%': + if (Try('=')) return MakeToken(Token::MOD_ASSIGN); + if (Try('>')) return MakeToken('}'); + if (Try(':')) { + if (Try('%')) { + if (Try(':')) return MakeToken(Token::DSHARP); + PutBack(); + } + return MakeToken('#'); + } + return MakeToken(c); + case '>': + if (Try('>')) return MakeToken(Try('=') ? Token::RIGHT_ASSIGN: Token::RIGHT); + if (Try('=')) return MakeToken(Token::GE); + return MakeToken(c); + case '=': return MakeToken(Try('=') ? Token::EQ: c); + case '!': return MakeToken(Try('=') ? Token::NE: c); + case '&': + if (Try('&')) return MakeToken(Token::LOGICAL_AND); + if (Try('=')) return MakeToken(Token::AND_ASSIGN); + return MakeToken(c); + case '|': + if (Try('|')) return MakeToken(Token::LOGICAL_OR); + if (Try('=')) return MakeToken(Token::OR_ASSIGN); + return MakeToken(c); + case '*': return MakeToken(Try('=') ? Token::MUL_ASSIGN: c); + case '/': + if (Test('/') || Test('*')) { + SkipComment(); + return Scan(true); + } + return MakeToken(Try('=') ? Token::DIV_ASSIGN: c); + case '^': return MakeToken(Try('=') ? Token::XOR_ASSIGN: c); + case '.': + if (isdigit(Peek())) return SkipNumber(); + if (Try('.')) { + if (Try('.')) return MakeToken(Token::ELLIPSIS); + PutBack(); + return MakeToken('.'); + } + return MakeToken(c); + case '0' ... '9': return SkipNumber(); + case 'u': case 'U': case 'L': { + /*auto enc = */ScanEncoding(c); + if (Try('\'')) return SkipCharacter(); + if (Try('\"')) return SkipLiteral(); + return SkipIdentifier(); + } + case '\'': return SkipCharacter(); + case '\"': return SkipLiteral(); + case 'a' ... 't': case 'v' ... 'z': case 'A' ... 'K': + case 'M' ... 'T': case 'V' ... 'Z': case '_': case '$': + case 0x80 ... 
0xfd: + return SkipIdentifier(); + case '\\': + // Universal character name is allowed in identifier + if (Test('u') || Test('U')) + return SkipIdentifier(); + return MakeToken(Token::INVALID); + case '\0': return MakeToken(Token::END); + default: return MakeToken(Token::INVALID); + } +} + + +void Scanner::SkipWhiteSpace() { + while (isspace(Peek()) && Peek() != '\n') { + tok_.ws_ = true; + Next(); + } +} + + +void Scanner::SkipComment() { + if (Try('/')) { + // Line comment terminated an newline or eof + while (!Empty()) { + if (Peek() == '\n') + return; + Next(); + } + return; + } else if (Try('*')) { + while (!Empty()) { + auto c = Next(); + if (c == '*' && Peek() == '/') { + Next(); + return; + } + } + Error(loc_, "unterminated block comment"); + } + assert(false); +} + + +std::string Scanner::ScanIdentifier() { + std::string val; + while (!Empty()) { + auto c = Next(); + if (IsUCN(c)) { + c = ScanEscaped(); // Call ScanUCN() + AppendUCN(val, c); + } else { + val.push_back(c); + } + } + return val; +} + + +Token* Scanner::SkipIdentifier() { + PutBack(); + auto c = Next(); + while (isalnum(c) + || (0x80 <= c && c <= 0xfd) + || c == '_' + || c == '$' + || IsUCN(c)) { + if (IsUCN(c)) + c = ScanEscaped(); // Just read it + c = Next(); + } + PutBack(); + return MakeToken(Token::IDENTIFIER); +} + + +// Scan PP-Number +Token* Scanner::SkipNumber() { + PutBack(); + bool sawHexPrefix = false; + int tag = Token::I_CONSTANT; + auto c = Next(); + while (c == '.' || isdigit(c) || isalpha(c) || c == '_' || IsUCN(c)) { + if (c == 'e' || c =='E' || c == 'p' || c == 'P') { + if (!Try('-')) Try('+'); + if (!((c == 'e' || c == 'E') && sawHexPrefix)) + tag = Token::F_CONSTANT; + } else if (IsUCN(c)) { + ScanEscaped(); + } else if (c == '.') { + tag = Token::F_CONSTANT; + } else if (c == 'x' || c == 'X') { + sawHexPrefix = true; + } + c = Next(); + } + PutBack(); + return MakeToken(tag); +} + + +Encoding Scanner::ScanLiteral(std::string& val) { + auto enc = Test('\"') ? Encoding::NONE: ScanEncoding(Next()); + Next(); + val.resize(0); + while (!Test('\"')) { + auto c = Next(); + bool isucn = IsUCN(c); + if (c == '\\') + c = ScanEscaped(); + if (isucn) + AppendUCN(val, c); + else + val.push_back(c); + } + return enc; +} + + +Token* Scanner::SkipLiteral() { + auto c = Next(); + while (c != '\"' && c != '\n' && c != '\0') { + if (c == '\\') Next(); + c = Next(); + } + if (c != '\"') + Error(loc_, "unterminated string literal"); + return MakeToken(Token::LITERAL); +} + + +Encoding Scanner::ScanCharacter(int& val) { + auto enc = Test('\'') ? Encoding::NONE: ScanEncoding(Next()); + Next(); + val = 0; + while (!Test('\'')) { + auto c = Next(); + if (c == '\\') + c = ScanEscaped(); + if (enc == Encoding::NONE) + val = (val << 8) + c; + else + val = c; + } + return enc; +} + + +Token* Scanner::SkipCharacter() { + auto c = Next(); + while (c != '\'' && c != '\n' && c != '\0') { + if (c == '\\') Next(); + c = Next(); + } + if (c != '\'') + Error(loc_, "unterminated character constant"); + return MakeToken(Token::C_CONSTANT); +} + + +int Scanner::ScanEscaped() { + auto c = Next(); + switch (c) { + case '\\': case '\'': case '\"': case '\?': + return c; + case 'a': return '\a'; + case 'b': return '\b'; + case 'f': return '\f'; + case 'n': return '\n'; + case 'r': return '\r'; + case 't': return '\t'; + case 'v': return '\v'; + // Non-standard GCC extention + case 'e': return '\033'; + case 'x': return ScanHexEscaped(); + case '0' ... 
'7': return ScanOctEscaped(c); + case 'u': return ScanUCN(4); + case 'U': return ScanUCN(8); + default: Error(loc_, "unrecognized escape character '%c'", c); + } + return c; // Make compiler happy +} + + +int Scanner::ScanHexEscaped() { + int val = 0, c = Peek(); + if (!isxdigit(c)) + Error(loc_, "expect xdigit, but got '%c'", c); + while (isxdigit(c)) { + val = (val << 4) + XDigit(c); + Next(); + c = Peek(); + } + return val; +} + + +int Scanner::ScanOctEscaped(int c) { + int val = XDigit(c); + c = Peek(); + if (!IsOctal(c)) + return val; + val = (val << 3) + XDigit(c); + Next(); + + c = Peek(); + if (!IsOctal(c)) + return val; + val = (val << 3) + XDigit(c); + Next(); + return val; +} + + +int Scanner::ScanUCN(int len) { + assert(len == 4 || len == 8); + int val = 0; + for (auto i = 0; i < len; ++i) { + auto c = Next(); + if (!isxdigit(c)) + Error(loc_, "expect xdigit, but got '%c'", c); + val = (val << 4) + XDigit(c); + } + return val; +} + + +int Scanner::XDigit(int c) { + switch (c) { + case '0' ... '9': return c - '0'; + case 'a' ... 'z': return c - 'a' + 10; + case 'A' ... 'Z': return c - 'A' + 10; + default: assert(false); return c; + } +} + + +Encoding Scanner::ScanEncoding(int c) { + switch (c) { + case 'u': return Try('8') ? Encoding::UTF8: Encoding::CHAR16; + case 'U': return Encoding::CHAR32; + case 'L': return Encoding::WCHAR; + default: assert(false); return Encoding::NONE; + } +} + + +std::string* ReadFile(const std::string& filename) { + FILE* f = fopen(filename.c_str(), "r"); + if (!f) Error("%s: No such file or directory", filename.c_str()); + auto text = new std::string; + int c; + while (EOF != (c = fgetc(f))) + text->push_back(c); + fclose(f); + return text; +} + + +int Scanner::Next() { + int c = Peek(); + ++p_; + if (c == '\n') { + ++loc_.line_; + loc_.column_ = 1; + loc_.lineBegin_ = p_; + } else { + ++loc_.column_; + } + return c; +} + + +int Scanner::Peek() { + int c = (uint8_t)(*p_); + if (c == '\\' && p_[1] == '\n') { + p_ += 2; + ++loc_.line_; + loc_.column_ = 1; + loc_.lineBegin_ = p_; + return Peek(); + } + return c; +} + + +// There couldn't be more than one PutBack() that +// cross two line, so just leave lineBegin, because +// we never care about the pos of newline token +void Scanner::PutBack() { + int c = *--p_; + if (c == '\n' && p_[-1] == '\\') { + --loc_.line_; + --p_; + return PutBack(); + } else if (c == '\n') { + --loc_.line_; + } else { + --loc_.column_; + } +} + + +Token* Scanner::MakeToken(int tag) { + tok_.tag_ = tag; + auto& str = tok_.str_; + str.resize(0); + const char* p = tok_.loc_.lineBegin_ + tok_.loc_.column_ - 1; + for (; p < p_; ++p) { + if (p[0] == '\n' && p[-1] == '\\') + str.pop_back(); + else + str.push_back(p[0]); + } + return Token::New(tok_); +} + + +/* + * New line is special, it is generated before reading the character '\n' + */ +Token* Scanner::MakeNewLine() { + tok_.tag_ = '\n'; + tok_.str_ = std::string(p_, p_ + 1); + return Token::New(tok_); +} diff --git a/lib/lang/wgtcc/scope.cc b/lib/lang/wgtcc/scope.cc new file mode 100644 index 000000000..bc1c6827c --- /dev/null +++ b/lib/lang/wgtcc/scope.cc @@ -0,0 +1,111 @@ +#include "triton/lang/wgtcc/scope.h" + +#include "triton/lang/wgtcc/ast.h" + +#include +#include + + +Identifier* Scope::Find(const Token* tok) { + auto ret = Find(tok->str_); + if (ret) ret->SetTok(tok); + return ret; +} + + +Identifier* Scope::FindInCurScope(const Token* tok) { + auto ret = FindInCurScope(tok->str_); + if (ret) ret->SetTok(tok); + return ret; +} + + +Identifier* Scope::FindTag(const Token* 
tok) { + auto ret = FindTag(tok->str_); + if (ret) ret->SetTok(tok); + return ret; +} + + +Identifier* Scope::FindTagInCurScope(const Token* tok) { + auto ret = FindTagInCurScope(tok->str_); + if (ret) ret->SetTok(tok); + return ret; +} + + +void Scope::Insert(Identifier* ident) { + Insert(ident->Name(), ident); +} + + +void Scope::InsertTag(Identifier* ident) { + Insert(TagName(ident->Name()), ident); +} + + +Identifier* Scope::Find(const std::string& name) { + auto ident = identMap_.find(name); + if (ident != identMap_.end()) + return ident->second; + if (type_ == S_FILE || parent_ == nullptr) + return nullptr; + return parent_->Find(name); +} + + +Identifier* Scope::FindInCurScope(const std::string& name) { + auto ident = identMap_.find(name); + if (ident == identMap_.end()) + return nullptr; + return ident->second; +} + + +void Scope::Insert(const std::string& name, Identifier* ident) { + assert(FindInCurScope(name) == nullptr); + identMap_[name] = ident; +} + + +Identifier* Scope::FindTag(const std::string& name) { + auto tag = Find(TagName(name)); + if (tag) assert(tag->ToTypeName()); + return tag; +} + + +Identifier* Scope::FindTagInCurScope(const std::string& name) { + auto tag = FindInCurScope(TagName(name)); + assert(tag == nullptr || tag->ToTypeName()); + return tag; +} + + +Scope::TagList Scope::AllTagsInCurScope() const { + TagList tags; + for (auto& kv: identMap_) { + if (IsTagName(kv.first)) + tags.push_back(kv.second); + } + return tags; +} + + +void Scope::Print() { + std::cout << "scope: " << this << std::endl; + + auto iter = identMap_.begin(); + for (; iter != identMap_.end(); ++iter) { + auto name = iter->first; + auto ident = iter->second; + if (ident->ToTypeName()) { + std::cout << name << "\t[type:\t" + << ident->Type()->Str() << "]" << std::endl; + } else { + std::cout << name << "\t[object:\t" + << ident->Type()->Str() << "]" << std::endl; + } + } + std::cout << std::endl; +} diff --git a/lib/lang/wgtcc/token.cc b/lib/lang/wgtcc/token.cc new file mode 100644 index 000000000..62c9b41f6 --- /dev/null +++ b/lib/lang/wgtcc/token.cc @@ -0,0 +1,259 @@ +#include "triton/lang/wgtcc/token.h" + +#include "triton/lang/wgtcc/mem_pool.h" +#include "triton/lang/wgtcc/parser.h" + + +static MemPoolImp tokenPool; + +const std::unordered_map Token::kwTypeMap_ { + { "auto", Token::AUTO }, + { "break", Token::BREAK }, + { "case", Token::CASE }, + { "char", Token::CHAR }, + { "const", Token::CONST }, + { "continue", Token::CONTINUE }, + { "default", Token::DEFAULT }, + { "do", Token::DO }, + { "double", Token::DOUBLE }, + { "else", Token::ELSE }, + { "enum", Token::ENUM }, + { "extern", Token::EXTERN }, + { "float", Token::FLOAT }, + { "for", Token::FOR }, + { "goto", Token::GOTO }, + { "half", Token::HALF }, + { "if", Token::IF }, + { "inline", Token::INLINE }, + { "int", Token::INT }, + { "long", Token::LONG }, + { "signed", Token::SIGNED }, + { "unsigned", Token::UNSIGNED }, + { "register", Token::REGISTER }, + { "restrict", Token::RESTRICT }, + { "return", Token::RETURN }, + { "short", Token::SHORT }, + { "sizeof", Token::SIZEOF }, + { "static", Token::STATIC }, + { "struct", Token::STRUCT }, + { "switch", Token::SWITCH }, + { "typedef", Token::TYPEDEF }, + { "union", Token::UNION }, + { "void", Token::VOID }, + { "volatile", Token::VOLATILE }, + { "while", Token::WHILE }, + { "_Alignas", Token::ALIGNAS }, + { "_Alignof", Token::ALIGNOF }, + { "_Atomic", Token::ATOMIC }, + { "__attribute__", Token::ATTRIBUTE }, + { "_Bool", Token::BOOL }, + { "_Complex", Token::COMPLEX }, + { 
"_Generic", Token::GENERIC }, + { "_Imaginary", Token::IMAGINARY }, + { "_Noreturn", Token::NORETURN }, + { "_Static_assert", Token::STATIC_ASSERT }, + { "_Thread_local", Token::THREAD }, +}; + +const std::unordered_map Token::tagLexemeMap_ { + { '(', "(" }, + { ')', ")" }, + { '[', "[" }, + { ']', "]" }, + { ':', ":" }, + { ',', "," }, + { ';', ";" }, + { '+', "+" }, + { '-', "-" }, + { '*', "*" }, + { '/', "/" }, + { '|', "|" }, + { '&', "&" }, + { '<', "<" }, + { '>', ">" }, + { '=', "=" }, + { '.', "." }, + { '%', "%" }, + { '{', "{" }, + { '}', "}" }, + { '^', "^" }, + { '~', "~" }, + { '!', "!" }, + { '?', "?" }, + { '#', "#" }, + { '@', "@" }, + + { Token::DSHARP, "##" }, + { Token::PTR, "->" }, + { Token::INC, "++" }, + { Token::DEC, "--" }, + { Token::LEFT, "<<" }, + { Token::RIGHT, ">>" }, + { Token::LE, "<=" }, + { Token::GE, ">=" }, + { Token::EQ, "==" }, + { Token::NE, "!=" }, + { Token::LOGICAL_AND, "&&" }, + { Token::LOGICAL_OR, "||" }, + { Token::MUL_ASSIGN, "*=" }, + { Token::DIV_ASSIGN, "/=" }, + { Token::MOD_ASSIGN, "%=" }, + { Token::ADD_ASSIGN, "+=" }, + { Token::SUB_ASSIGN, "-=" }, + { Token::LEFT_ASSIGN, "<<=" }, + { Token::RIGHT_ASSIGN, ">>=" }, + { Token::AND_ASSIGN, "&=" }, + { Token::XOR_ASSIGN, "^=" }, + { Token::OR_ASSIGN, "|=" }, + { Token::ELLIPSIS, "..." }, + + { Token::AUTO, "auto" }, + { Token::BREAK, "break" }, + { Token::CASE, "case" }, + { Token::CHAR, "char" }, + { Token::CONST, "const" }, + { Token::CONTINUE, "continue" }, + { Token::DEFAULT, "default" }, + { Token::DO, "do" }, + { Token::DOUBLE, "double" }, + { Token::ELSE, "else" }, + { Token::ENUM, "enum" }, + { Token::EXTERN, "extern" }, + { Token::FLOAT, "float" }, + { Token::FOR, "for" }, + { Token::GOTO, "goto" }, + { Token::IF, "if" }, + { Token::INLINE, "inline" }, + { Token::INT, "int" }, + { Token::LONG, "long" }, + { Token::SIGNED, "signed" }, + { Token::UNSIGNED, "unsigned" }, + { Token::REGISTER, "register" }, + { Token::RESTRICT, "restrict" }, + { Token::RETURN, "return" }, + { Token::SHORT, "short" }, + { Token::SIZEOF, "sizeof" }, + { Token::STATIC, "static" }, + { Token::STRUCT, "struct" }, + { Token::SWITCH, "switch" }, + { Token::TYPEDEF, "typedef" }, + { Token::UNION, "union" }, + { Token::VOID, "void" }, + { Token::VOLATILE, "volatile" }, + { Token::WHILE, "while" }, + { Token::ALIGNAS, "_Alignas" }, + { Token::ALIGNOF, "_Alignof" }, + { Token::ATOMIC, "_Atomic" }, + { Token::ATTRIBUTE, "__attribute__" }, + { Token::BOOL, "_Bool" }, + { Token::COMPLEX, "_Complex" }, + { Token::GENERIC, "_Generic" }, + { Token::IMAGINARY, "_Imaginary" }, + { Token::NORETURN, "_Noreturn" }, + { Token::STATIC_ASSERT, "_Static_assert" }, + { Token::THREAD, "_Thread_local" }, + + { Token::END, "(eof)" }, + { Token::IDENTIFIER, "(identifier)" }, + { Token::CONSTANT, "(constant)" }, + { Token::LITERAL, "(string literal)" }, +}; + + +Token* Token::New(int tag) { + return new (tokenPool.Alloc()) Token(tag); +} + + +Token* Token::New(const Token& other) { + return new (tokenPool.Alloc()) Token(other); +} + + +Token* Token::New(int tag, + const SourceLocation& loc, + const std::string& str, + bool ws) { + return new (tokenPool.Alloc()) Token(tag, loc, str, ws); +} + + +TokenSequence TokenSequence::GetLine() { + auto begin = begin_; + while (begin_ != end_ && (*begin_)->tag_ != Token::NEW_LINE) + ++begin_; + auto end = begin_; + return {tokList_, begin, end}; +} + + +/* + * If this seq starts from the begin of a line. + * Called only after we have saw '#' in the token sequence. 
+ */ +bool TokenSequence::IsBeginOfLine() const { + if (begin_ == tokList_->begin()) + return true; + + auto pre = begin_; + --pre; + + // We do not insert a newline at the end of a source file. + // Thus if two token have different filename, the second is + // the begin of a line. + return ((*pre)->tag_ == Token::NEW_LINE || + (*pre)->loc_.filename_ != (*begin_)->loc_.filename_); +} + +const Token* TokenSequence::Peek() const { + static auto eof = Token::New(Token::END); + if (begin_ != end_ && (*begin_)->tag_ == Token::NEW_LINE) { + ++begin_; + return Peek(); + } else if (begin_ == end_) { + if (end_ != tokList_->begin()) + *eof = *Back(); + eof->tag_ = Token::END; + return eof; + } else if (parser_ && (*begin_)->tag_ == Token::IDENTIFIER && + (*begin_)->str_ == "__func__") { + auto filename = Token::New(*(*begin_)); + filename->tag_ = Token::LITERAL; + filename->str_ = "\"" + parser_->CurFunc()->Name() + "\""; + *begin_ = filename; + } + return *begin_; +} + + +const Token* TokenSequence::Expect(int expect) { + auto tok = Peek(); + if (!Try(expect)) { + Error(tok, "'%s' expected, but got '%s'", + Token::Lexeme(expect), tok->str_.c_str()); + } + return tok; +} + +void TokenSequence::Print(FILE* fp) const { + unsigned lastLine = 0; + auto ts = *this; + while (!ts.Empty()) { + auto tok = ts.Next(); + if (lastLine != tok->loc_.line_) { + fputs("\n", fp); + for (unsigned i = 0; i < tok->loc_.column_; ++i) + fputc(' ', fp); + } else if (tok->ws_) { + fputc(' ', fp); + } + fputs(tok->str_.c_str(), fp); + fflush(fp); + lastLine = tok->loc_.line_; + } + fputs("\n", fp); +} + +//void TokenSequence::Print(std::string *str) const { + +//} diff --git a/lib/lang/wgtcc/type.cc b/lib/lang/wgtcc/type.cc new file mode 100644 index 000000000..369e8ed05 --- /dev/null +++ b/lib/lang/wgtcc/type.cc @@ -0,0 +1,484 @@ +#include "triton/lang/wgtcc/type.h" + +#include "triton/lang/wgtcc/ast.h" +#include "triton/lang/wgtcc/scope.h" +#include "triton/lang/wgtcc/token.h" + +#include +#include +#include + + +static MemPoolImp voidTypePool; +static MemPoolImp arrayTypePool; +static MemPoolImp tileTypePool; +static MemPoolImp funcTypePool; +static MemPoolImp pointerTypePool; +static MemPoolImp structUnionTypePool; +static MemPoolImp arithmTypePool; + + +QualType Type::MayCast(QualType type, bool inProtoScope) { + auto funcType = type->ToFunc(); + auto arrayType = type->ToArray(); + if (funcType) { + return PointerType::New(funcType); + } else if (arrayType) { + auto ret = PointerType::New(arrayType->Derived()); + // C11 6.7.6.3 [7]: qualifiers are specified in '[]' + // As we do not support qualifiers in '[]', the qualifier whould be none + return QualType(ret, inProtoScope? 
0: Qualifier::CONST);
+  }
+  return type;
+}
+
+
+VoidType* VoidType::New() {
+  static auto ret = new (voidTypePool.Alloc()) VoidType(&voidTypePool);
+  return ret;
+}
+
+
+ArithmType* ArithmType::New(int typeSpec) {
+#define NEW_TYPE(tag) \
+  new (arithmTypePool.Alloc()) ArithmType(&arithmTypePool, tag);
+
+  static auto boolType = NEW_TYPE(T_BOOL);
+  static auto charType = NEW_TYPE(T_CHAR);
+  static auto ucharType = NEW_TYPE(T_UNSIGNED | T_CHAR);
+  static auto shortType = NEW_TYPE(T_SHORT);
+  static auto ushortType = NEW_TYPE(T_UNSIGNED | T_SHORT);
+  static auto intType = NEW_TYPE(T_INT);
+  static auto uintType = NEW_TYPE(T_UNSIGNED | T_INT);
+  static auto longType = NEW_TYPE(T_LONG);
+  static auto ulongType = NEW_TYPE(T_UNSIGNED | T_LONG);
+  static auto llongType = NEW_TYPE(T_LLONG);
+  static auto ullongType = NEW_TYPE(T_UNSIGNED | T_LLONG);
+  static auto halfType = NEW_TYPE(T_HALF);
+  static auto floatType = NEW_TYPE(T_FLOAT);
+  static auto doubleType = NEW_TYPE(T_DOUBLE);
+  static auto ldoubleType = NEW_TYPE(T_LONG | T_DOUBLE);
+
+  auto tag = ArithmType::Spec2Tag(typeSpec);
+  switch (tag) {
+  case T_BOOL: return boolType;
+  case T_CHAR: return charType;
+  case T_UNSIGNED | T_CHAR: return ucharType;
+  case T_SHORT: return shortType;
+  case T_UNSIGNED | T_SHORT: return ushortType;
+  case T_INT: return intType;
+  case T_UNSIGNED:
+  case T_UNSIGNED | T_INT: return uintType;
+  case T_LONG: return longType;
+  case T_UNSIGNED | T_LONG: return ulongType;
+  case T_LLONG: return llongType;
+  case T_UNSIGNED | T_LLONG: return ullongType;
+  case T_HALF: return halfType;
+  case T_FLOAT: return floatType;
+  case T_DOUBLE: return doubleType;
+  case T_LONG | T_DOUBLE: return ldoubleType;
+  default:
+    assert(tag & T_COMPLEX);
+    Error("complex not supported yet");
+  }
+  return nullptr; // Make compiler happy
+
+#undef NEW_TYPE
+}
+
+
+ArrayType* ArrayType::New(int len, QualType eleType) {
+  return new (arrayTypePool.Alloc())
+         ArrayType(&arrayTypePool, len, eleType);
+}
+
+
+ArrayType* ArrayType::New(Expr* expr, QualType eleType) {
+  return new (arrayTypePool.Alloc())
+         ArrayType(&arrayTypePool, expr, eleType);
+}
+
+TileType* TileType::New(const ShapeExpr &expr, QualType eleType) {
+  return new (tileTypePool.Alloc())
+         TileType(&tileTypePool, expr, eleType);
+}
+
+TileType* TileType::New(const ShapeInt &shape, QualType eleType) {
+  return new (tileTypePool.Alloc())
+         TileType(&tileTypePool, shape, eleType);
+}
+
+FuncType* FuncType::New(QualType derived,
+                        int funcSpec,
+                        bool variadic,
+                        const ParamList& params) {
+  return new (funcTypePool.Alloc())
+         FuncType(&funcTypePool, derived, funcSpec, variadic, params);
+}
+
+
+PointerType* PointerType::New(QualType derived) {
+  return new (pointerTypePool.Alloc())
+         PointerType(&pointerTypePool, derived);
+}
+
+
+StructType* StructType::New(bool isStruct,
+                            bool hasTag,
+                            Scope* parent) {
+  return new (structUnionTypePool.Alloc())
+         StructType(&structUnionTypePool, isStruct, hasTag, parent);
+}
+
+
+int ArithmType::Width() const {
+  switch (tag_) {
+  case T_BOOL: case T_CHAR: case T_UNSIGNED | T_CHAR:
+    return 1;
+  case T_SHORT: case T_UNSIGNED | T_SHORT:
+    return intWidth_ >> 1;
+  case T_INT: case T_UNSIGNED: case T_UNSIGNED | T_INT:
+    return intWidth_;
+  case T_LONG: case T_UNSIGNED | T_LONG:
+    return intWidth_ << 1;
+  case T_LLONG: case T_UNSIGNED | T_LLONG:
+    return intWidth_ << 1;
+  case T_FLOAT:
+    return intWidth_;
+  case T_DOUBLE:
+    return intWidth_ << 1;
+  case T_LONG | T_DOUBLE:
+    return intWidth_ << 1;
+  case T_FLOAT | T_COMPLEX:
+    return intWidth_ <<
1;
+  case T_DOUBLE | T_COMPLEX:
+    return intWidth_ << 2;
+  case T_LONG | T_DOUBLE | T_COMPLEX:
+    return intWidth_ << 2;
+  default:
+    assert(false);
+  }
+
+  return intWidth_; // Make compiler happy
+}
+
+
+int ArithmType::Rank() const {
+  switch (tag_) {
+  case T_BOOL: return 0;
+  case T_CHAR: case T_UNSIGNED | T_CHAR: return 1;
+  case T_SHORT: case T_UNSIGNED | T_SHORT: return 2;
+  case T_INT: case T_UNSIGNED: case T_UNSIGNED | T_INT: return 3;
+  case T_LONG: case T_UNSIGNED | T_LONG: return 4;
+  case T_LLONG: case T_UNSIGNED | T_LLONG: return 5;
+  case T_FLOAT: return 6;
+  case T_DOUBLE: return 7;
+  case T_LONG | T_DOUBLE: return 8;
+  default:
+    assert(tag_ & T_COMPLEX);
+    Error("complex not supported yet");
+  }
+  return 0;
+}
+
+
+ArithmType* ArithmType::MaxType(ArithmType* lhs,
+                                ArithmType* rhs) {
+  if (lhs->IsInteger())
+    lhs = ArithmType::IntegerPromote(lhs);
+  if (rhs->IsInteger())
+    rhs = ArithmType::IntegerPromote(rhs);
+  auto ret = lhs->Rank() > rhs->Rank() ? lhs: rhs;
+  if (lhs->Width() == rhs->Width() && (lhs->IsUnsigned() || rhs->IsUnsigned()))
+    return ArithmType::New(T_UNSIGNED | ret->Tag());
+  return ret;
+}
+
+
+/*
+ * Convert a type specifier to a type tag
+ */
+int ArithmType::Spec2Tag(int spec) {
+  if (spec == T_SIGNED) {
+    return T_INT;
+  }
+  spec &= ~T_SIGNED;
+  if ((spec & T_SHORT) || (spec & T_LONG)
+      || (spec & T_LLONG)) {
+    spec &= ~T_INT;
+  }
+  return spec;
+}
+
+
+std::string ArithmType::Str() const {
+  std::string width = ":" + std::to_string(Width());
+
+  switch (tag_) {
+  case T_BOOL:
+    return "bool" + width;
+
+  case T_CHAR:
+    return "char" + width;
+
+  case T_UNSIGNED | T_CHAR:
+    return "unsigned char" + width;
+
+  case T_SHORT:
+    return "short" + width;
+
+  case T_UNSIGNED | T_SHORT:
+    return "unsigned short" + width;
+
+  case T_INT:
+    return "int" + width;
+
+  case T_UNSIGNED:
+    return "unsigned int" + width;
+
+  case T_LONG:
+    return "long" + width;
+
+  case T_UNSIGNED | T_LONG:
+    return "unsigned long" + width;
+
+  case T_LLONG:
+    return "long long" + width;
+
+  case T_UNSIGNED | T_LLONG:
+    return "unsigned long long" + width;
+
+  case T_FLOAT:
+    return "float" + width;
+
+  case T_DOUBLE:
+    return "double" + width;
+
+  case T_LONG | T_DOUBLE:
+    return "long double" + width;
+
+  case T_FLOAT | T_COMPLEX:
+    return "float complex" + width;
+
+  case T_DOUBLE | T_COMPLEX:
+    return "double complex" + width;
+
+  case T_LONG | T_DOUBLE | T_COMPLEX:
+    return "long double complex" + width;
+
+  default:
+    assert(false);
+  }
+
+  return "error"; // Make compiler happy
+}
+
+
+bool PointerType::Compatible(const Type& other) const {
+  // C11 6.7.6.1 [2]: pointer compatibility
+  auto otherPointer = other.ToPointer();
+  return otherPointer && derived_->Compatible(*otherPointer->derived_);
+
+  // FIXME(wgtdkp): cannot loosen the compatibility constraints
+  //return other.IsInteger() ||
+  //       (otherPointer && derived_->Compatible(*otherPointer->derived_));
+}
+
+
+bool ArrayType::Compatible(const Type& other) const {
+  // C11 6.7.6.2 [6]: For two array types to be compatible,
+  // the element types must be compatible, and the lengths
+  // must be equal if both are specified.
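+  // For example, int[4] is compatible with int[4] and with int[] (only one
+  // length specified), but not with int[5] or float[4].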
+  auto otherArray = other.ToArray();
+  if (!otherArray) return false;
+  if (!derived_->Compatible(*otherArray->derived_)) return false;
+  // The lengths must be equal if both are specified
+  if (complete_ && otherArray->complete_)
+    return len_ == otherArray->len_;
+  return true;
+}
+
+bool TileType::Compatible(const Type& other) const {
+  // For two tile types to be compatible,
+  // the element types must be compatible, and the shapes
+  // must be equal if both are specified
+  auto otherTile = other.ToTile();
+  if(!otherTile) return false;
+  if (!derived_->Compatible(*otherTile->derived_)) return false;
+  // The shapes must be equal if both are specified
+  if(complete_ && otherTile->complete_)
+    return shape_ == otherTile->shape_;
+  return true;
+}
+
+
+
+bool FuncType::Compatible(const Type& other) const {
+  auto otherFunc = other.ToFunc();
+  // The other type is not a function type
+  if (!otherFunc) return false;
+  // TODO(wgtdkp): do we need to check the type of the return value when
+  // deciding the compatibility of two function types?
+  if (!derived_->Compatible(*otherFunc->derived_))
+    return false;
+  if (params_.size() != otherFunc->params_.size())
+    return false;
+
+  auto thisIter = params_.begin();
+  auto otherIter = otherFunc->params_.begin();
+  while (thisIter != params_.end()) {
+    if (!(*thisIter)->Type()->Compatible(*(*otherIter)->Type()))
+      return false;
+    ++thisIter;
+    ++otherIter;
+  }
+
+  return true;
+}
+
+
+std::string FuncType::Str() const {
+  auto str = derived_->Str() + "(";
+  auto iter = params_.begin();
+  for (; iter != params_.end(); ++iter) {
+    str += (*iter)->Type()->Str() + ", ";
+  }
+  if (variadic_)
+    str += "...";
+  else if (params_.size())
+    str.resize(str.size() - 2);
+
+  return str + ")";
+}
+
+
+StructType::StructType(MemPool* pool,
+                       bool isStruct,
+                       bool hasTag,
+                       Scope* parent)
+    : Type(pool, false),
+      isStruct_(isStruct),
+      hasTag_(hasTag),
+      memberMap_(new Scope(parent, S_BLOCK)),
+      offset_(0),
+      width_(0),
+      // If a struct type has no member, it gets an alignment of 1
+      align_(1),
+      bitFieldAlign_(1) {}
+
+
+Object* StructType::GetMember(const std::string& member) {
+  auto ident = memberMap_->FindInCurScope(member);
+  if (ident == nullptr)
+    return nullptr;
+  return ident->ToObject();
+}
+
+
+void StructType::CalcWidth() {
+  width_ = 0;
+  auto iter = memberMap_->identMap_.begin();
+  for (; iter != memberMap_->identMap_.end(); ++iter) {
+    width_ += iter->second->Type()->Width();
+  }
+}
+
+
+bool StructType::Compatible(const Type& other) const {
+  return this == &other; // Pointer comparison
+}
+
+
+// TODO(wgtdkp): more detailed representation
+std::string StructType::Str() const {
+  std::string str = isStruct_ ?
"struct": "union"; + return str + ":" + std::to_string(width_); +} + + +// Remove useless unnamed bitfield members as they are just for parsing +void StructType::Finalize() { + for (auto iter = members_.begin(); iter != members_.end();) { + if ((*iter)->BitFieldWidth() && (*iter)->Anonymous()) { + members_.erase(iter++); + } else { + ++iter; + } + } +} + + +void StructType::AddMember(Object* member) { + auto offset = MakeAlign(offset_, member->Align()); + member->SetOffset(offset); + + members_.push_back(member); + memberMap_->Insert(member->Name(), member); + + align_ = std::max(align_, member->Align()); + bitFieldAlign_ = std::max(bitFieldAlign_, align_); + + if (isStruct_) { + offset_ = offset + member->Type()->Width(); + width_ = MakeAlign(offset_, align_); + } else { + assert(offset_ == 0); + width_ = std::max(width_, member->Type()->Width()); + width_ = MakeAlign(width_, align_); + } +} + + +void StructType::AddBitField(Object* bitField, int offset) { + bitField->SetOffset(offset); + members_.push_back(bitField); + if (!bitField->Anonymous()) + memberMap_->Insert(bitField->Name(), bitField); + + auto bytes = MakeAlign(bitField->BitFieldEnd(), 8) / 8; + bitFieldAlign_ = std::max(bitFieldAlign_, bitField->Align()); + // Does not aligned, default is 1 + if (isStruct_) { + offset_ = offset + bytes; + width_ = MakeAlign(offset_, std::max(bitFieldAlign_, bitField->Align())); + } else { + assert(offset_ == 0); + width_ = std::max(width_, bitField->Type()->Width()); + } +} + + +// Move members of Anonymous struct/union to external struct/union +void StructType::MergeAnony(Object* anony) { + auto anonyType = anony->Type()->ToStruct(); + auto offset = MakeAlign(offset_, anony->Align()); + + // Members in map are never anonymous + for (auto& kv: *anonyType->memberMap_) { + auto& name = kv.first; + auto member = kv.second->ToObject(); + if (member == nullptr) { + continue; + } + // Every member of anonymous struct/union + // are offseted by external struct/union + member->SetOffset(offset + member->Offset()); + + if (GetMember(name)) { + Error(member, "duplicated member '%s'", name.c_str()); + } + // Simplify anony struct's member searching + memberMap_->Insert(name, member); + } + anony->SetOffset(offset); + members_.push_back(anony); + + align_ = std::max(align_, anony->Align()); + if (isStruct_) { + offset_ = offset + anonyType->Width(); + width_ = MakeAlign(offset_, align_); + } else { + assert(offset_ == 0); + width_ = std::max(width_, anonyType->Width()); + } +} diff --git a/lib/runtime/function.cpp b/lib/runtime/function.cpp index 1e7de730b..49bfa6249 100644 --- a/lib/runtime/function.cpp +++ b/lib/runtime/function.cpp @@ -6,6 +6,8 @@ #include "triton/codegen/selection/selection.h" #include "triton/runtime/function.h" #include "triton/lang/lang.h" +#include "triton/lang/wgtcc/cpp.h" +#include "triton/lang/wgtcc/parser.h" #include "triton/driver/device.h" #include "triton/driver/stream.h" #include "triton/driver/kernel.h" @@ -115,8 +117,30 @@ void function::caller::operator ()(driver::stream *stream, const std::array } void gen_register_kernel_builder(std::ostream &os, const std::string &name, - const std::string &classname, + const std::string &opname, const std::vector& args){ os << "REGISTER_KERNEL_BUILDER(Name(\"" + name + "\").Device(DEVICE_GPU)"; for(size_t i = 0; i < args.size(); i++){ @@ -144,7 +144,7 @@ void gen_register_kernel_builder(std::ostream &os, const std::string &name, if(!arg->get_type()->is_pointer_ty()) os << ".HostMemory(\"" + name + "\")"; } - os << ", " + classname 
<< ");\n"; + os << ", " + opname << ");\n"; } void gen_register_op(std::ostream &os, const std::string &name, @@ -181,10 +181,9 @@ std::string make_tensorflow_src(const std::string src, ir::function* fn = ir->get_function_list().front(); std::string name = fn->get_name(); name[0] = static_cast(std::toupper(name[0])); - std::string classname = name + "Op"; + std::string opname = name + "Op"; std::ostringstream oss; - oss << R"( #include "triton/driver/buffer.h" #include "triton/driver/backend.h" @@ -207,9 +206,9 @@ namespace drv = triton::driver; std::string src = R"TTKERNSRC( )" + src + ")TTKERNSRC\";" + R"( -class )" << classname << R"(: public OpKernel { +class )" << opname << R"(: public OpKernel { public: - explicit )" << classname << R"((OpKernelConstruction* context) + explicit )" << opname << R"((OpKernelConstruction* context) : OpKernel(context), fn_(src) { } void Compute(OpKernelContext* context){ @@ -246,7 +245,7 @@ private: // register kernel builder )"; -gen_register_kernel_builder(oss, name, classname, fn->args()); +gen_register_kernel_builder(oss, name, opname, fn->args()); oss << R"( // register op )"; From 61f25f90eb1af3e6be74b7ab84c77e5e108a3874 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 20 Aug 2019 16:22:43 -0700 Subject: [PATCH 308/494] basic parsing doesn't throw error --- examples/cpp/dot.cpp | 41 ++--- include/triton/lang/wgtcc/ast.h | 7 +- include/triton/lang/wgtcc/parser.h | 1 + include/triton/lang/wgtcc/token.h | 6 +- include/triton/lang/wgtcc/type.h | 16 +- lib/lang/wgtcc/ast.cc | 242 ++++++++++++++++++++++------- lib/lang/wgtcc/parser.cc | 43 ++++- lib/lang/wgtcc/token.cc | 2 + lib/lang/wgtcc/type.cc | 23 ++- 9 files changed, 287 insertions(+), 94 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 55c25b575..83a38be4e 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -55,8 +55,8 @@ std::string src(bool AT, bool BT, std::string a_ty, std::string b_ty, std::strin std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; std::string lda0 = "*lda", lda1 = ""; std::string ldb0 = "", ldb1 = "*ldb"; - std::string usea = AT ? "trans(a)" : "a"; - std::string useb = BT ? "trans(b)" : "b"; + std::string usea = AT ? "^a" : "a"; + std::string useb = BT ? "^b" : "b"; if(AT){ std::swap(AS0, AS1); std::swap(XAS0, XAS1); @@ -82,6 +82,11 @@ R"( #define TN 128 #define TK 32 +#define bool _Bool +#define true 1 +#define false 0 +#define __bool_true_false_are_defined 1 + extern int get_program_id(int); void matmul(restrict )" + a_ty + R"( * A __attribute__((readonly, aligned(16))), @@ -94,28 +99,28 @@ void matmul(restrict )" + a_ty + R"( * A __attribute__((readonly, aligned(16))), int ridx = get_program_id(0); int ridy = get_program_id(1); int rxa[{TM, TN}] = ridx * TM + 0 ... TM; - int ryb[TN] = ridy * TN + 0 ... TN; - int rka[TK] = 0 ... TK; - int rkb[TK] = 0 ... TK; - float xc[)" + XCS + R"(] = 0; - )" + a_ty + R"(* pa[)" + AS + "] = A + rka" + bca0 + lda0 + " + rxa" + bca1 + lda1 + R"(; - )" + b_ty + R"(* pb[)" + BS + "] = B + rkb" + bcb0 + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; - )" + a_ty + R"( a[)" + AS + R"(] = *pa; - )" + b_ty + R"( b[)" + BS + R"(] = *pb; + int ryb[{TN}] = ridy * TN + 0 ... TN; + int rka[{TK}] = 0 ... TK; + int rkb[{TK}] = 0 ... 
TK; + float xc[{)" + XCS + R"(}] = 0; + )" + a_ty + R"(* pa[{)" + AS + "}] = A + rka" + bca0 + lda0 + " + rxa" + bca1 + lda1 + R"(; + )" + b_ty + R"(* pb[{)" + BS + "}] = B + rkb" + bcb0 + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; + )" + a_ty + R"( a[{)" + AS + R"(}] = *pa; + )" + b_ty + R"( b[{)" + BS + R"(}] = *pb; for(int k = K; k > 0; k = k - TK){ - xc = dot()" + usea + ", " + useb + R"(, xc); + xc = )" + usea + " @ " + useb + R"( + xc; pa = pa + TK)" + lda0 + R"(; pb = pb + TK)" + ldb0 + R"(; a = *pa; b = *pb; } - int rxc[TM] = ridx * TM + (0 ... TM); - int ryc[TN] = ridy * TN + (0 ... TN); - )" + c_ty + R"(* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - )" + c_ty + R"( c[TM, TN] = xc; - bool checkc0[TM] = rxc < M; - bool checkc1[TN] = ryc < N; - bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; + int rxc[{TM}] = ridx * TM + (0 ... TM); + int ryc[{TN}] = ridy * TN + (0 ... TN); + )" + c_ty + R"(* pc[{TM, TN}] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; + )" + c_ty + R"( c[{TM, TN}] = xc; + bool checkc0[{TM}] = rxc < M; + bool checkc1[{TN}] = ryc < N; + bool checkc[{TM, TN}] = checkc0[:, newaxis] && checkc1[newaxis, :]; *pc = c; } )"; diff --git a/include/triton/lang/wgtcc/ast.h b/include/triton/lang/wgtcc/ast.h index 3cb3257f7..1181fb63a 100644 --- a/include/triton/lang/wgtcc/ast.h +++ b/include/triton/lang/wgtcc/ast.h @@ -278,6 +278,9 @@ public: static Expr* MayCast(Expr* expr); static Expr* MayCast(Expr* expr, QualType desType); + static ::Type* TryExtractScalarType(Expr* loc, Expr *operand); + static ::Type* ScalarOrLikeTile(Expr* operand, ::Type* ty); + virtual bool IsNullPointerConstant() const { return false; } bool IsConstQualified() const { return type_.IsConstQualified(); } bool IsRestrictQualified() const { return type_.IsRestrictQualified(); } @@ -332,6 +335,7 @@ public: void AdditiveOpTypeChecking(); void ShiftOpTypeChecking(); void RangeOpTypeChecking(); + void MatmulOpTypeChecking(); void RelationalOpTypeChecking(); void EqualityOpTypeChecking(); void BitwiseOpTypeChecking(); @@ -378,11 +382,12 @@ public: virtual ~UnaryOp() {} virtual void Accept(Visitor* v); virtual bool IsLVal(); - ArithmType* Convert(); + ::Type *Convert(); void TypeChecking(); void IncDecOpTypeChecking(); void AddrOpTypeChecking(); void DerefOpTypeChecking(); + void TransOpTypeChecking(); void UnaryArithmOpTypeChecking(); void CastOpTypeChecking(); diff --git a/include/triton/lang/wgtcc/parser.h b/include/triton/lang/wgtcc/parser.h index c1de92491..8c21af727 100644 --- a/include/triton/lang/wgtcc/parser.h +++ b/include/triton/lang/wgtcc/parser.h @@ -75,6 +75,7 @@ public: QualType ParseTypeName(); Expr* ParseCastExpr(); Expr* ParseRangeExpr(); + Expr* ParseMatmulExpr(); Expr* ParseMultiplicativeExpr(); Expr* ParseAdditiveExpr(); Expr* ParseShiftExpr(); diff --git a/include/triton/lang/wgtcc/token.h b/include/triton/lang/wgtcc/token.h index 391507f80..e982ec803 100644 --- a/include/triton/lang/wgtcc/token.h +++ b/include/triton/lang/wgtcc/token.h @@ -64,7 +64,7 @@ public: NOT = '!', COND = '?', SHARP = '#', - AT = '@', + MATMUL = '@', NEW_LINE = '\n', DSHARP = 128, // '##' @@ -126,6 +126,10 @@ public: NORETURN, // _Noreturn // FUNCTION SPECIFIER END + // TILE ARITHMETICS BEGIN + NEWAXIS, + // TILE ARITHMETICS END + ALIGNAS, // _Alignas // For syntactic convenience STATIC_ASSERT, // _Static_assert diff --git a/include/triton/lang/wgtcc/type.h b/include/triton/lang/wgtcc/type.h index 20c2fa898..b43b74339 100644 --- a/include/triton/lang/wgtcc/type.h +++ 
b/include/triton/lang/wgtcc/type.h
@@ -153,6 +153,10 @@ public:
   virtual bool IsBool() const { return false; }
   virtual bool IsVoidPointer() const { return false; }
   virtual bool IsUnsigned() const { return false; }
+  virtual bool IsTile() const { return ToTile() != nullptr; }
+
+  const Type* ScalarType() const;
+  Type* ScalarType();
 
   virtual VoidType* ToVoid() { return nullptr; }
   virtual const VoidType* ToVoid() const { return nullptr; }
@@ -327,16 +331,22 @@ public:
   static TileType* New(const ShapeInt& shape, QualType eleType);
   virtual ~TileType() { }
 
-  virtual TileType* toTile() { return this; }
-  virtual const TileType* toTile() const { return this; }
+  virtual TileType* ToTile() { return this; }
+  virtual const TileType* ToTile() const { return this; }
   virtual bool Compatible(const Type& other) const;
-  virtual int Width() const { return 0; }
+  virtual int Width() const { return Complete() ? derived_->Width()*NumEle() : 0; }
   virtual int Align() const { return derived_->Align(); }
   virtual std::string Str() const {
     return derived_->Str() + "[{}]:" + std::to_string(Width());
   }
   ShapeInt Shape() { return shape_; }
+  int NumEle() const {
+    int ret = 1;
+    for(int s: shape_)
+      ret *= s;
+    return ret;
+  }
 
 protected:
   TileType(MemPool* pool, const ShapeExpr& expr, QualType derived)
diff --git a/lib/lang/wgtcc/ast.cc b/lib/lang/wgtcc/ast.cc
index eb673584f..5646fbb4c 100644
--- a/lib/lang/wgtcc/ast.cc
+++ b/lib/lang/wgtcc/ast.cc
@@ -144,6 +144,26 @@ Expr* Expr::MayCast(Expr* expr, QualType desType) {
   return expr;
 }
 
+// Extract the operand's scalar type if possible
+// and emit an error otherwise
+::Type* Expr::TryExtractScalarType(Expr* loc, Expr *operand) {
+  auto scalType = operand->Type()->ScalarType();
+  if(!scalType)
+    Error(loc, "expect tile or scalar operand");
+  return scalType;
+}
+
+// If the operand is a tile, return a tile of the same shape with
+// the provided element type.
+// If the operand is a scalar, return the provided element type
+// directly.
+::Type* Expr::ScalarOrLikeTile(Expr* operand, ::Type* ty) {
+  assert(ty->IsScalar());
+  ::Type *retTy = ty;
+  if(TileType *T = operand->Type()->ToTile())
+    retTy = TileType::New(T->Shape(), retTy);
+  return retTy;
+}
 
 BinaryOp* BinaryOp::New(const Token* tok, Expr* lhs, Expr* rhs) {
   return New(tok, tok->tag_, lhs, rhs);
@@ -166,6 +186,7 @@ BinaryOp* BinaryOp::New(const Token* tok, int op, Expr* lhs, Expr* rhs) {
     case Token::LOGICAL_AND:
     case Token::LOGICAL_OR:
     case Token::ELLIPSIS:
+    case Token::MATMUL:
       break;
     default:
       assert(0);
@@ -180,18 +201,18 @@
 
 ArithmType* BinaryOp::Convert() {
-  // Both lhs and rhs are ensured to be have arithmetic type
-  auto lhsType = lhs_->Type()->ToArithm();
-  auto rhsType = rhs_->Type()->ToArithm();
+  // Both lhs and rhs are ensured to have an arithmetic scalar type
+  auto lhsType = lhs_->Type()->ScalarType()->ToArithm();
+  auto rhsType = rhs_->Type()->ScalarType()->ToArithm();
   assert(lhsType && rhsType);
-  auto type = ArithmType::MaxType(lhsType, rhsType);
-  if (lhsType != type) { // Pointer comparation is enough!
-    lhs_ = UnaryOp::New(Token::CAST, lhs_, type);
+  auto maxType = ArithmType::MaxType(lhsType, rhsType);
+  if (lhsType != maxType) { // Pointer comparison is enough!
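+    // ScalarOrLikeTile keeps tile shapes intact: e.g. if lhs_ is a char tile
+    // of shape {TM} and maxType is float, the cast target becomes a float
+    // tile of shape {TM} rather than a bare float scalar.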
+    lhs_ = UnaryOp::New(Token::CAST, lhs_, ScalarOrLikeTile(lhs_, maxType));
   }
-  if (rhsType != type) {
-    rhs_ = UnaryOp::New(Token::CAST, rhs_, type);
+  if (rhsType != maxType) {
+    rhs_ = UnaryOp::New(Token::CAST, rhs_, ScalarOrLikeTile(rhs_, maxType));
   }
-  return type;
+  return maxType;
 }
 
 void BinaryOp::Broadcast() {
@@ -225,6 +246,8 @@ void BinaryOp::Broadcast() {
       retShape[i] = rhsShape[i];
     else if(rhsShape[i] == 1)
       retShape[i] = lhsShape[i];
+    else if(lhsShape[i] == rhsShape[i])
+      retShape[i] = lhsShape[i];
     else
       Error(this, "cannot broadcast dimension %d "
                   "for operands of shape %d and %d",
@@ -232,8 +255,10 @@
   }
   auto eleType = lhsType->Derived();
   type_ = TileType::New(retShape, eleType);
-  lhs_ = UnaryOp::New(Token::CAST, lhs_, type_);
-  rhs_ = UnaryOp::New(Token::CAST, rhs_, type_);
+  if(retShape != lhsShape)
+    lhs_ = UnaryOp::New(Token::CAST, lhs_, type_);
+  if(retShape != rhsShape)
+    rhs_ = UnaryOp::New(Token::CAST, rhs_, type_);
   }
 }
@@ -303,6 +328,9 @@ void BinaryOp::TypeChecking() {
     case Token::ELLIPSIS:
       return RangeOpTypeChecking();
 
+    case Token::MATMUL:
+      return MatmulOpTypeChecking();
+
     default:
       assert(0);
   }
@@ -315,12 +343,15 @@ void BinaryOp::CommaOpTypeChecking() {
 }
 
 
 void BinaryOp::SubScriptingOpTypeChecking() {
-  auto lhsType = lhs_->Type()->ToPointer();
+  assert(false);
+  auto lhsType = lhs_->Type()->ToTile();
+
   if (!lhsType) {
-    Error(this, "an pointer expected");
+    Error(this, "operator [] can only be used on tiles");
   }
+
   if (!rhs_->Type()->IsInteger()) {
-    Error(this, "the operand of [] should be intger");
+    Error(this, "the operand of [] should be integer");
   }
 
   // The type of [] operator is the derived type
@@ -334,14 +365,20 @@ void BinaryOp::MemberRefOpTypeChecking() {
 }
 
 
 void BinaryOp::MultiOpTypeChecking() {
-  if (!lhs_->Type()->ToArithm() || !rhs_->Type()->ToArithm()) {
+  ::Type* lhsScalType = lhs_->Type()->ScalarType();
+  ::Type* rhsScalType = rhs_->Type()->ScalarType();
+  if(!lhsScalType || !rhsScalType) {
+    Error(this, "operands should have tile or scalar type");
+  }
+  if (!lhsScalType->ToArithm() || !rhsScalType->ToArithm()) {
     Error(this, "operands should have arithmetic type");
   }
   if ('%' == op_ &&
-      !(lhs_->Type()->IsInteger() && rhs_->Type()->IsInteger())) {
+      !(lhsScalType->IsInteger() && rhsScalType->IsInteger())) {
     Error(this, "operands of '%%' should be integers");
   }
   type_ = Convert();
+  Broadcast();
 }
 
 
@@ -351,40 +388,47 @@
  * 2. pointer can be used:
  *    1. lhs of MINUS operator, and rhs must be integer or pointer;
  *    2. lhs/rhs of ADD operator, and the other operand must be integer;
+ *    3. tiles can be used:
+ *      1. the scalar type of lhs/rhs satisfies the above requirements
+ *      2. lhs/rhs that have identical shape
+ *      3.
lhs/rhs that can be broadcast as per numpy-like semantics */ void BinaryOp::AdditiveOpTypeChecking() { - auto lhsType = lhs_->Type()->ToPointer(); - auto rhsType = rhs_->Type()->ToPointer(); - if (lhsType) { + ::Type* lhsScalType = TryExtractScalarType(this, lhs_); + ::Type* rhsScalType = TryExtractScalarType(this, rhs_); + auto lhsPtrType = lhsScalType->ToPointer(); + auto rhsPtrType = rhsScalType->ToPointer(); + if (lhsPtrType) { if (op_ == '-') { - if (rhsType) { - if (!lhsType->Compatible(*rhsType)) + if (rhsPtrType) { + if (!lhsPtrType->Compatible(*rhsPtrType)) Error(this, "invalid operands to binary -"); type_ = ArithmType::New(T_LONG); // ptrdiff_t - } else if (!rhs_->Type()->IsInteger()) { + } else if (!rhsScalType->IsInteger()) { Error(this, "invalid operands to binary -"); } else { - type_ = lhsType; + type_ = lhsPtrType; } - } else if (!rhs_->Type()->IsInteger()) { + } else if (!rhsScalType->IsInteger()) { Error(this, "invalid operands to binary +"); } else { - type_ = lhsType; + type_ = lhsPtrType; } - } else if (rhsType) { - if (op_ == '+' && !lhs_->Type()->IsInteger()) { + } else if (rhsPtrType) { + if (op_ == '+' && !lhsScalType->IsInteger()) { Error(this, "invalid operands to binary '+'"); - } else if (op_ == '-' && !lhsType) { + } else if (op_ == '-' && !lhsPtrType) { Error(this, "invalid operands to binary '-'"); } - type_ = op_ == '-' ? ArithmType::New(T_LONG): rhs_->Type(); + type_ = op_ == '-' ? ArithmType::New(T_LONG): rhsScalType; std::swap(lhs_, rhs_); // To simplify code gen } else { - if (!lhs_->Type()->ToArithm() || !rhs_->Type()->ToArithm()) { + if (!lhsScalType->ToArithm() || !rhsScalType->ToArithm()) { Error(this, "invalid operands to binary %s", tok_->str_.c_str()); } type_ = Convert(); } + Broadcast(); } void BinaryOp::RangeOpTypeChecking() { @@ -396,59 +440,95 @@ void BinaryOp::RangeOpTypeChecking() { rhs_ = Expr::MayCast(rhs_, ArithmType::IntegerPromote(rhsType)); long begin = Evaluator().Eval(lhs_); long end = Evaluator().Eval(rhs_); - int len = end - begin; + int len = static_cast(end - begin); if(len < 0) Error(this, "range cannot be negative"); type_ = TileType::New(TileType::ShapeInt{len}, lhs_->Type()); } +void BinaryOp::MatmulOpTypeChecking() { + auto lhsType = lhs_->Type()->ToTile(); + auto rhsType = rhs_->Type()->ToTile(); + if(!lhsType || !rhsType) + Error(this, "expect tile operands for matrix multiplication"); + auto lhsShape = lhsType->Shape(); + auto rhsShape = rhsType->Shape(); + size_t lhsRank = lhsShape.size(); + size_t rhsRank = rhsShape.size(); + if(lhsRank != 2 || rhsRank != 2) + Error(this, "matrix multiplication operands must have rank 2"); + if(lhsShape[1] != rhsShape[0]) + Error(this, "matrix multiplication operands have incompatible inner dimension" + " %d and %d", lhsShape[1], rhsShape[0]); + TileType::ShapeInt retShape = {lhsShape[0], rhsShape[1]}; + QualType retType = lhsType->Derived(); + if(retType != rhsType->Derived()) + Error(this, "matrix multiplication operands have incompatible data types"); + type_ = TileType::New(retShape, lhsType->Derived()); +} + void BinaryOp::ShiftOpTypeChecking() { - auto lhsType = lhs_->Type()->ToArithm(); - auto rhsType = rhs_->Type()->ToArithm(); + ::Type* lhsScalType = TryExtractScalarType(this, lhs_); + ::Type* rhsScalType = TryExtractScalarType(this, rhs_); + auto lhsType = lhsScalType->ToArithm(); + auto rhsType = rhsScalType->ToArithm(); if (!lhsType || !lhsType->IsInteger() || !rhsType || !rhsType->IsInteger()) Error(this, "expect integers for shift operator"); - lhs_ = 
Expr::MayCast(lhs_, ArithmType::IntegerPromote(lhsType)); - rhs_ = Expr::MayCast(rhs_, ArithmType::IntegerPromote(rhsType)); + lhs_ = Expr::MayCast(lhs_, ScalarOrLikeTile(lhs_, ArithmType::IntegerPromote(lhsType))); + rhs_ = Expr::MayCast(rhs_, ScalarOrLikeTile(rhs_, ArithmType::IntegerPromote(rhsType))); type_ = lhs_->Type(); + Broadcast(); } void BinaryOp::RelationalOpTypeChecking() { - if (lhs_->Type()->ToPointer() || rhs_->Type()->ToPointer()) { - EnsureCompatible(lhs_->Type(), rhs_->Type()); + ::Type* lhsScalType = TryExtractScalarType(this, lhs_); + ::Type* rhsScalType = TryExtractScalarType(this, rhs_); + if (lhsScalType->ToPointer() || rhsScalType->ToPointer()) { + EnsureCompatible(lhsScalType, rhsScalType); } else { - if (!lhs_->Type()->IsReal() || !rhs_->Type()->IsReal()) { + if (!lhsScalType->IsReal() || !rhsScalType->IsReal()) { Error(this, "expect real type of operands"); } Convert(); } type_ = ArithmType::New(T_INT); + Broadcast(); } void BinaryOp::EqualityOpTypeChecking() { - if (lhs_->Type()->ToPointer() || rhs_->Type()->ToPointer()) { - EnsureCompatibleOrVoidPointer(lhs_->Type(), rhs_->Type()); + ::Type* lhsScalType = TryExtractScalarType(this, lhs_); + ::Type* rhsScalType = TryExtractScalarType(this, rhs_); + if (lhsScalType->ToPointer() || rhsScalType->ToPointer()) { + EnsureCompatibleOrVoidPointer(lhsScalType, rhsScalType); } else { - if (!lhs_->Type()->ToArithm() || !rhs_->Type()->ToArithm()) + if (!lhsScalType->ToArithm() || !rhsScalType->ToArithm()) Error(this, "invalid operands to binary %s", tok_->str_.c_str()); Convert(); } type_ = ArithmType::New(T_INT); + Broadcast(); } void BinaryOp::BitwiseOpTypeChecking() { - if (!lhs_->Type()->IsInteger() || !rhs_->Type()->IsInteger()) + ::Type* lhsScalType = TryExtractScalarType(this, lhs_); + ::Type* rhsScalType = TryExtractScalarType(this, rhs_); + if (!lhsScalType->IsInteger() || !rhsScalType->IsInteger()) Error(this, "operands of '&' should be integer"); type_ = Convert(); + Broadcast(); } void BinaryOp::LogicalOpTypeChecking() { - if (!lhs_->Type()->IsScalar() || !rhs_->Type()->IsScalar()) + ::Type* lhsScalType = TryExtractScalarType(this, lhs_); + ::Type* rhsScalType = TryExtractScalarType(this, rhs_); + if (!lhsScalType->IsScalar() || !rhsScalType->IsScalar()) Error(this, "the operand should be arithmetic type or pointer"); type_ = ArithmType::New(T_INT); + Broadcast(); } @@ -459,12 +539,14 @@ void BinaryOp::AssignOpTypeChecking() { Error(lhs_, "lvalue expression expected"); } - if (!lhs_->Type()->ToArithm() || !rhs_->Type()->ToArithm()) { - EnsureCompatibleOrVoidPointer(lhs_->Type(), rhs_->Type()); + ::Type* lhsScalType = TryExtractScalarType(this, lhs_); + ::Type* rhsScalType = TryExtractScalarType(this, rhs_); + if (!lhsScalType->ToArithm() || !rhsScalType->ToArithm()) { + EnsureCompatibleOrVoidPointer(lhsScalType, rhsScalType); } // The other constraints are lefted to cast operator - rhs_ = Expr::MayCast(rhs_, lhs_->Type()); + rhs_ = Expr::MayCast(rhs_, ScalarOrLikeTile(rhs_, lhsScalType)); type_ = lhs_->Type(); } @@ -488,13 +570,16 @@ bool UnaryOp::IsLVal() { } -ArithmType* UnaryOp::Convert() { - auto arithmType = operand_->Type()->ToArithm(); +::Type* UnaryOp::Convert() { + auto scalType = operand_->Type()->ScalarType(); + assert(scalType); + auto arithmType = scalType->ToArithm(); assert(arithmType); if (arithmType->IsInteger()) arithmType = ArithmType::IntegerPromote(arithmType); - operand_ = Expr::MayCast(operand_, arithmType); - return arithmType; + ::Type* retType = ScalarOrLikeTile(operand_, arithmType); 
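+  // Integer promotion is applied element-wise on tiles: e.g. a short tile of
+  // shape {TK} promotes to an int tile of the same shape, since
+  // ScalarOrLikeTile only swaps the element type and preserves the shape.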
+ operand_ = Expr::MayCast(operand_, retType); + return retType; } @@ -521,20 +606,22 @@ void UnaryOp::TypeChecking() { case Token::CAST: return CastOpTypeChecking(); + case '^': + return TransOpTypeChecking(); + default: assert(false); } } - void UnaryOp::IncDecOpTypeChecking() { if (operand_->IsConstQualified()) { Error(this, "increment/decrement of const qualified expression"); } else if (!operand_->IsLVal()) { Error(this, "lvalue expression expected"); } - - if (!operand_->Type()->IsReal() && !operand_->Type()->ToPointer()) { + auto scalType = TryExtractScalarType(this, operand_); + if (!scalType->IsReal() && !scalType->ToPointer()) { Error(this, "expect operand of real type or pointer"); } type_ = operand_->Type(); @@ -545,43 +632,78 @@ void UnaryOp::AddrOpTypeChecking() { auto funcType = operand_->Type()->ToFunc(); if (funcType == nullptr && !operand_->IsLVal()) Error(this, "expression must be an lvalue or function designator"); + if(operand_->Type()->IsTile()) + Error(this, "cannot take the address of a tile"); type_ = PointerType::New(operand_->Type()); } void UnaryOp::DerefOpTypeChecking() { - auto pointerType = operand_->Type()->ToPointer(); + auto scalType = TryExtractScalarType(this, operand_); + auto pointerType = scalType->ToPointer(); if (!pointerType) Error(this, "pointer expected for deref operator '*'"); - type_ = pointerType->Derived(); + type_ = ScalarOrLikeTile(operand_, pointerType->Derived().GetPtr()); } +void UnaryOp::TransOpTypeChecking() { + auto tileType = operand_->Type()->ToTile(); + if(!tileType) + Error(this, "tile expected for transposition operator '^'"); + auto shape = tileType->Shape(); + std::rotate(shape.begin(), shape.begin() + 1, shape.end()); + type_ = TileType::New(shape, tileType->Derived()); +} + void UnaryOp::UnaryArithmOpTypeChecking() { + auto scalType = TryExtractScalarType(this, operand_); if (Token::PLUS == op_ || Token::MINUS == op_) { - if (!operand_->Type()->ToArithm()) + if (!scalType->ToArithm()) Error(this, "Arithmetic type expected"); Convert(); type_ = operand_->Type(); } else if ('~' == op_) { - if (!operand_->Type()->IsInteger()) + if (!scalType->IsInteger()) Error(this, "integer expected for operator '~'"); Convert(); type_ = operand_->Type(); - } else if (!operand_->Type()->IsScalar()) { + } else if (!scalType->IsScalar()) { Error(this, "arithmetic type or pointer expected for operator '!'"); } else { - type_ = ArithmType::New(T_INT); + type_ = ScalarOrLikeTile(operand_, ArithmType::New(T_INT)); } } void UnaryOp::CastOpTypeChecking() { auto operandType = Type::MayCast(operand_->Type()); - // The type_ has been initiated to dest type if (type_->ToVoid()) { // The expression becomes a void expression + } else if(type_->IsTile() || operandType->IsTile()) { + /* Broadcasting rules: + * 1. Tiles with 1 element can be converted to scalar + * 2. Scalar can be converted to tiles of any shapes + * 3. 
Tiles can be converted to another tile only if the
+   *    mismatching dimensions are unitary
+   */
+  if(type_->IsScalar() && operandType->ToTile()->NumEle() != 1)
+    Error(this, "tile with more than one element cannot be cast to scalar");
+  if(type_->IsTile() && operandType->IsTile()){
+    auto shape = type_->ToTile()->Shape();
+    auto operandShape = operandType->ToTile()->Shape();
+    if(operandShape.size() > shape.size())
+      Error(this, "cast cannot reduce operand rank");
+    while(operandShape.size() < shape.size())
+      operandShape.insert(operandShape.begin(), 1);
+    for(size_t i = 0; i < shape.size(); i++) {
+      if(shape[i] != 1 && operandShape[i] != 1 && shape[i] != operandShape[i])
+        Error(this, "cannot broadcast dimension %d "
+                    "for operands of shape %d and %d",
+                    i, shape[i], operandShape[i]);
+    }
+  }
  } else if (!type_->IsScalar() || !operandType->IsScalar()) {
    if (!type_->Compatible(*operandType))
      Error(this, "the cast type should be arithemetic type or pointer");
diff --git a/lib/lang/wgtcc/parser.cc b/lib/lang/wgtcc/parser.cc
index 8ec16ee51..cf1e582fc 100644
--- a/lib/lang/wgtcc/parser.cc
+++ b/lib/lang/wgtcc/parser.cc
@@ -442,11 +442,27 @@ Expr* Parser::ParsePostfixExprTail(Expr* lhs) {
 
 Expr* Parser::ParseSubScripting(Expr* lhs) {
-  auto rhs = ParseExpr();
-  auto tok = ts_.Peek();
+  auto lhsTile = lhs->Type()->ToTile();
+  if(lhsTile == nullptr)
+    Error(lhs, "tile expected");
+  TileType::ShapeInt lhsShape = lhsTile->Shape();
+  QualType lhsQual = lhsTile->Derived();
+  // create ret shape
+  TileType::ShapeInt shape;
+  size_t i = 0;
+  do {
+    auto tok = ts_.Next();
+    if(tok->tag_ == ':')
+      shape.push_back(lhsShape[i++]);
+    else if(tok->tag_ == Token::NEWAXIS)
+      shape.push_back(1);
+    else
+      Error(tok, "only ':' and newaxis are supported in subscripts");
+  }while(ts_.Try(','));
   ts_.Expect(']');
-  auto operand = BinaryOp::New(tok, '+', lhs, rhs);
-  return UnaryOp::New(Token::DEREF, operand);
+  // create ret tile
+  TileType *retType = TileType::New(shape, lhsQual);
+  return UnaryOp::New(Token::CAST, lhs, retType);
 }
 
 
@@ -501,6 +517,7 @@ Expr* Parser::ParseUnaryExpr() {
     case '-': return ParseUnaryOp(tok, Token::MINUS);
     case '~': return ParseUnaryOp(tok, '~');
     case '!': return ParseUnaryOp(tok, '!');
+    case '^': return ParseUnaryOp(tok, Token::XOR);
     default:
       ts_.PutBack();
       return ParsePostfixExpr();
@@ -584,7 +601,7 @@ Expr* Parser::ParseCastExpr() {
 
 Expr* Parser::ParseRangeExpr() {
   auto lhs = ParseCastExpr();
   auto tok = ts_.Next();
-  while (tok->tag_ == Token::ELLIPSIS){
+  while (tok->tag_ == Token::ELLIPSIS) {
     auto rhs = ParseCastExpr();
     lhs = BinaryOp::New(tok, lhs, rhs);
     tok = ts_.Next();
@@ -593,16 +610,26 @@
   return lhs;
 }
 
-Expr* Parser::ParseMultiplicativeExpr() {
+Expr* Parser::ParseMatmulExpr() {
   auto lhs = ParseRangeExpr();
   auto tok = ts_.Next();
-  while (tok->tag_ == '*' || tok->tag_ == '/' || tok->tag_ == '%') {
+  while (tok->tag_ == Token::MATMUL) {
     auto rhs = ParseRangeExpr();
     lhs = BinaryOp::New(tok, lhs, rhs);
     tok = ts_.Next();
   }
+  ts_.PutBack();
+  return lhs;
+}
+Expr* Parser::ParseMultiplicativeExpr() {
+  auto lhs = ParseMatmulExpr();
+  auto tok = ts_.Next();
+  while (tok->tag_ == '*' || tok->tag_ == '/' || tok->tag_ == '%') {
+    auto rhs = ParseMatmulExpr();
+    lhs = BinaryOp::New(tok, lhs, rhs);
+    tok = ts_.Next();
+  }
   ts_.PutBack();
   return lhs;
 }
diff --git a/lib/lang/wgtcc/token.cc b/lib/lang/wgtcc/token.cc
index 62c9b41f6..ba588588e 100644
--- a/lib/lang/wgtcc/token.cc
+++ b/lib/lang/wgtcc/token.cc
@@ -27,6 +27,7 @@ const std::unordered_map
Token::kwTypeMap_ { { "inline", Token::INLINE }, { "int", Token::INT }, { "long", Token::LONG }, + { "newaxis", Token::NEWAXIS }, { "signed", Token::SIGNED }, { "unsigned", Token::UNSIGNED }, { "register", Token::REGISTER }, @@ -126,6 +127,7 @@ const std::unordered_map Token::tagLexemeMap_ { { Token::INLINE, "inline" }, { Token::INT, "int" }, { Token::LONG, "long" }, + { Token::NEWAXIS, "newaxis" }, { Token::SIGNED, "signed" }, { Token::UNSIGNED, "unsigned" }, { Token::REGISTER, "register" }, diff --git a/lib/lang/wgtcc/type.cc b/lib/lang/wgtcc/type.cc index 369e8ed05..94f17b985 100644 --- a/lib/lang/wgtcc/type.cc +++ b/lib/lang/wgtcc/type.cc @@ -32,6 +32,18 @@ QualType Type::MayCast(QualType type, bool inProtoScope) { return type; } +const Type* Type::ScalarType() const { + if(IsScalar()) + return this; + if(const TileType* p = ToTile()) + return p->Derived().GetPtr(); + return nullptr; +} + +Type* Type::ScalarType() { + auto cthis = const_cast(this); + return const_cast(cthis->ScalarType()); +} VoidType* VoidType::New() { static auto ret = new (voidTypePool.Alloc()) VoidType(&voidTypePool); @@ -143,12 +155,16 @@ int ArithmType::Width() const { return intWidth_ << 1; case T_LLONG: case T_UNSIGNED | T_LLONG: return intWidth_ << 1; + case T_HALF: + return intWidth_ >> 1; case T_FLOAT: return intWidth_; case T_DOUBLE: return intWidth_ << 1; case T_LONG | T_DOUBLE: return intWidth_ << 1; + case T_HALF | T_COMPLEX: + return intWidth_; case T_FLOAT | T_COMPLEX: return intWidth_ << 1; case T_DOUBLE | T_COMPLEX: @@ -171,9 +187,10 @@ int ArithmType::Rank() const { case T_INT: case T_UNSIGNED: case T_UNSIGNED | T_INT: return 3; case T_LONG: case T_UNSIGNED | T_LONG: return 4; case T_LLONG: case T_UNSIGNED | T_LLONG: return 5; - case T_FLOAT: return 6; - case T_DOUBLE: return 7; - case T_LONG | T_DOUBLE: return 8; + case T_HALF: return 6; + case T_FLOAT: return 7; + case T_DOUBLE: return 8; + case T_LONG | T_DOUBLE: return 9; default: assert(tag_ & T_COMPLEX); Error("complex not supported yet"); From 5224bbbe061d44c92f18ab8c4dd75aa847ab1ffb Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 20 Aug 2019 18:05:37 -0700 Subject: [PATCH 309/494] preparing codegen --- include/triton/lang/wgtcc/t_code_gen.h | 122 +++++++++++++++++++++++++ lib/lang/wgtcc/ast.cc | 2 - lib/lang/wgtcc/evaluator.cc | 6 +- lib/lang/wgtcc/t_code_gen.cc | 8 ++ 4 files changed, 131 insertions(+), 7 deletions(-) create mode 100644 include/triton/lang/wgtcc/t_code_gen.h create mode 100644 lib/lang/wgtcc/t_code_gen.cc diff --git a/include/triton/lang/wgtcc/t_code_gen.h b/include/triton/lang/wgtcc/t_code_gen.h new file mode 100644 index 000000000..00abfabba --- /dev/null +++ b/include/triton/lang/wgtcc/t_code_gen.h @@ -0,0 +1,122 @@ +#ifndef _WGTCC_CODE_GEN_H_ +#define _WGTCC_CODE_GEN_H_ + +#include "ast.h" +#include "visitor.h" + +namespace triton{ +namespace ir{ + +class value; +class module; + +} +} + +using namespace triton; + +class Parser; +struct Addr; +template<> class Evaluator; +struct StaticInitializer; + +using TypeList = std::vector; +using LocationList = std::vector; +using StaticInitList = std::vector; + + +class Generator: public Visitor { + friend class Evaluator; +public: + Generator(Parser* parser, ir::module& mod) : parser_(parser), mod_(mod){} + + virtual void Visit(ASTNode* node) { node->Accept(this); } + void VisitExpr(Expr* expr) { expr->Accept(this); } + void VisitStmt(Stmt* stmt) { stmt->Accept(this); } + + // Expression + virtual void VisitBinaryOp(BinaryOp* binaryOp); + virtual void 
VisitUnaryOp(UnaryOp* unaryOp); + virtual void VisitConditionalOp(ConditionalOp* condOp); + virtual void VisitFuncCall(FuncCall* funcCall); + virtual void VisitObject(Object* obj); + virtual void VisitEnumerator(Enumerator* enumer); + virtual void VisitIdentifier(Identifier* ident); + virtual void VisitConstant(Constant* cons); + virtual void VisitTempVar(TempVar* tempVar); + + // Statement + virtual void VisitDeclaration(Declaration* init); + virtual void VisitEmptyStmt(EmptyStmt* emptyStmt); + virtual void VisitIfStmt(IfStmt* ifStmt); + virtual void VisitJumpStmt(JumpStmt* jumpStmt); + virtual void VisitReturnStmt(ReturnStmt* returnStmt); + virtual void VisitLabelStmt(LabelStmt* labelStmt); + virtual void VisitCompoundStmt(CompoundStmt* compoundStmt); + + virtual void VisitFuncDef(FuncDef* funcDef); + virtual void VisitTranslationUnit(TranslationUnit* unit); + + void Gen(); + +protected: + // Binary + void GenCommaOp(BinaryOp* comma); + void GenMemberRefOp(BinaryOp* binaryOp); + void GenAndOp(BinaryOp* binaryOp); + void GenOrOp(BinaryOp* binaryOp); + void GenAddOp(BinaryOp* binaryOp); + void GenSubOp(BinaryOp* binaryOp); + void GenAssignOp(BinaryOp* assign); + void GenCastOp(UnaryOp* cast); + void GenDerefOp(UnaryOp* deref); + void GenMinusOp(UnaryOp* minus); + void GenPointerArithm(BinaryOp* binary); + void GenDivOp(bool flt, bool sign, int width, int op); + void GenMulOp(int width, bool flt, bool sign); + void GenCompOp(int width, bool flt, const char* set); + void GenCompZero(Type* type); + + // Unary + void GenIncDec(Expr* operand, bool postfix, const std::string& inst); + StaticInitializer GetStaticInit(InitList::iterator& iter, + InitList::iterator end, int offset); + void GenStaticDecl(Declaration* decl); + void GenSaveArea(); + void GenBuiltin(FuncCall* funcCall); + + void AllocObjects(Scope* scope, + const FuncDef::ParamList& params=FuncDef::ParamList()); + +protected: + Parser* parser_; + ir::module& mod_; +}; + + +class LValGenerator: public Generator { +public: + LValGenerator(Parser* parser, ir::module& mod): Generator(parser, mod) {} + + // Expression + virtual void VisitBinaryOp(BinaryOp* binaryOp); + virtual void VisitUnaryOp(UnaryOp* unaryOp); + virtual void VisitObject(Object* obj); + virtual void VisitIdentifier(Identifier* ident); + + virtual void VisitConditionalOp(ConditionalOp* condOp) { assert(false); } + virtual void VisitFuncCall(FuncCall* funcCall) { assert(false); } + virtual void VisitEnumerator(Enumerator* enumer) { assert(false); } + virtual void VisitConstant(Constant* cons) { assert(false); } + virtual void VisitTempVar(TempVar* tempVar); + + ir::value* GenExpr(Expr* expr) { + expr->Accept(this); + return addr_; + } + +private: + ir::value* addr_; +}; + +#endif diff --git a/lib/lang/wgtcc/ast.cc b/lib/lang/wgtcc/ast.cc index 5646fbb4c..d194d4c0f 100644 --- a/lib/lang/wgtcc/ast.cc +++ b/lib/lang/wgtcc/ast.cc @@ -1,6 +1,4 @@ #include "triton/lang/wgtcc/ast.h" - -#include "triton/lang/wgtcc/code_gen.h" #include "triton/lang/wgtcc/error.h" #include "triton/lang/wgtcc/evaluator.h" #include "triton/lang/wgtcc/mem_pool.h" diff --git a/lib/lang/wgtcc/evaluator.cc b/lib/lang/wgtcc/evaluator.cc index 956fe21a6..02cb224f9 100644 --- a/lib/lang/wgtcc/evaluator.cc +++ b/lib/lang/wgtcc/evaluator.cc @@ -1,7 +1,5 @@ #include "triton/lang/wgtcc/evaluator.h" - #include "triton/lang/wgtcc/ast.h" -#include "triton/lang/wgtcc/code_gen.h" #include "triton/lang/wgtcc/token.h" @@ -201,9 +199,7 @@ void Evaluator::VisitConstant(Constant* cons) { if (cons->Type()->IsInteger()) { 
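     // An integer constant folds directly into the address expression; array-typed
     // constants (string literals) previously emitted an .LC label into rodatas_
     // via the old x86 Generator, which the Triton-IR backend no longer provides,
     // hence the assert(false) below.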
addr_ = {"", static_cast(cons->IVal())}; } else if (cons->Type()->ToArray()) { - Generator().ConsLabel(cons); // Add the literal to rodatas_. - addr_.label_ = Generator::rodatas_.back().label_; - addr_.offset_ = 0; + assert(false); } else { assert(false); } diff --git a/lib/lang/wgtcc/t_code_gen.cc b/lib/lang/wgtcc/t_code_gen.cc new file mode 100644 index 000000000..4d78944e8 --- /dev/null +++ b/lib/lang/wgtcc/t_code_gen.cc @@ -0,0 +1,8 @@ +#include "triton/lang/wgtcc/t_code_gen.h" +#include "triton/lang/wgtcc/evaluator.h" +#include "triton/lang/wgtcc/parser.h" +#include "triton/lang/wgtcc/token.h" + +void Generator::Gen() { + VisitTranslationUnit(parser_->Unit()); +} From a23225ad37eb6fc7012c201dd137b1286f4a8c63 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 21 Aug 2019 18:27:02 -0700 Subject: [PATCH 310/494] more progress --- include/triton/ir/builder.h | 1 + include/triton/ir/constant.h | 5 +- include/triton/ir/context_impl.h | 2 +- include/triton/ir/module.h | 1 + include/triton/ir/type.h | 3 +- include/triton/lang/wgtcc/ast.h | 11 +- include/triton/lang/wgtcc/code_gen.h | 285 +--- include/triton/lang/wgtcc/t_code_gen.h | 122 -- lib/ir/builder.cpp | 1 + lib/ir/constant.cpp | 28 +- lib/lang/wgtcc/code_gen.cc | 1887 +++++------------------- lib/lang/wgtcc/main.cc | 223 --- lib/lang/wgtcc/t_code_gen.cc | 8 - 13 files changed, 484 insertions(+), 2093 deletions(-) delete mode 100644 include/triton/lang/wgtcc/t_code_gen.h delete mode 100644 lib/lang/wgtcc/t_code_gen.cc diff --git a/include/triton/ir/builder.h b/include/triton/ir/builder.h index bbd015c7e..3140565cc 100644 --- a/include/triton/ir/builder.h +++ b/include/triton/ir/builder.h @@ -84,6 +84,7 @@ public: value *create_add(value *lhs, value *rhs, const std::string &name = "", bool has_nuw = false, bool has_nsw = false); value *create_sub(value *lhs, value *rhs, const std::string &name = "", bool has_nuw = false, bool has_nsw = false); value *create_shl(value *lhs, value *rhs, const std::string &name = "", bool has_nuw = false, bool has_nsw = false); + value *create_lshr(value *lhs, value *rhs, const std::string &name = "", bool has_nuw = false, bool has_nsw = false); value *create_ashr(value *lhs, value *rhs, const std::string &name = "", bool has_nuw = false, bool has_nsw = false); // GEP value *create_gep(value *ptr, const std::vector& idx_list, const std::string &name = ""); diff --git a/include/triton/ir/constant.h b/include/triton/ir/constant.h index dea139d1d..b33c9d427 100644 --- a/include/triton/ir/constant.h +++ b/include/triton/ir/constant.h @@ -102,13 +102,14 @@ private: /* constant fp */ class constant_fp: public constant{ - constant_fp(context &ctx, double value); + constant_fp(type *ty, double value); public: double get_value() { return value_; } static constant* get_negative_zero(type *ty); static constant* get_zero_value_for_negation(type *ty); - static constant *get(context &ctx, double v); + static constant* get(context &ctx, double v); + static constant* get(type *ty, double v); private: double value_; diff --git a/include/triton/ir/context_impl.h b/include/triton/ir/context_impl.h index cd41d20db..0ca515f45 100644 --- a/include/triton/ir/context_impl.h +++ b/include/triton/ir/context_impl.h @@ -32,7 +32,7 @@ public: // Int constants std::map, constant_int*> int_constants_; // Float constants - std::map fp_constants_; + std::map, constant_fp*> fp_constants_; // undef values std::map uv_constants_; // Metaparameters diff --git a/include/triton/ir/module.h b/include/triton/ir/module.h index 
238968e7b..f91269b38 100644 --- a/include/triton/ir/module.h +++ b/include/triton/ir/module.h @@ -34,6 +34,7 @@ class alloc_const; /* Module */ struct scope { std::map types; + std::map values; }; class module { diff --git a/include/triton/ir/type.h b/include/triton/ir/type.h index 6f1df7ec7..780d79e74 100644 --- a/include/triton/ir/type.h +++ b/include/triton/ir/type.h @@ -38,7 +38,8 @@ public: IntegerTyID, ///< 10: Arbitrary bit width integers FunctionTyID, ///< 11: Functions PointerTyID, ///< 12: Pointers - TileTyID, ///< 13: Tile + StructTyID, ///< 13: Struct + TileTyID, ///< 14: Tile }; public: diff --git a/include/triton/lang/wgtcc/ast.h b/include/triton/lang/wgtcc/ast.h index 1181fb63a..fc5dca330 100644 --- a/include/triton/lang/wgtcc/ast.h +++ b/include/triton/lang/wgtcc/ast.h @@ -40,6 +40,7 @@ class Enumerator; // Statements class Stmt; class IfStmt; +class ForStmt; class JumpStmt; class LabelStmt; class EmptyStmt; @@ -263,7 +264,7 @@ class Expr : public Stmt { template friend class Evaluator; friend class AddrEvaluator; friend class Generator; - friend class LValGenerator; + friend class LValAssigner; public: virtual ~Expr() {} @@ -308,7 +309,7 @@ class BinaryOp : public Expr { template friend class Evaluator; friend class AddrEvaluator; friend class Generator; - friend class LValGenerator; + friend class LValAssigner; friend class Declaration; public: @@ -375,7 +376,7 @@ class UnaryOp : public Expr { template friend class Evaluator; friend class AddrEvaluator; friend class Generator; - friend class LValGenerator; + friend class LValAssigner; public: static UnaryOp* New(int op, Expr* operand, QualType type=nullptr); @@ -538,7 +539,7 @@ class Identifier: public Expr { template friend class Evaluator; friend class AddrEvaluator; friend class Generator; - friend class LValGenerator; + friend class LValAssigner; public: static Identifier* New(const Token* tok, QualType type, Linkage linkage); @@ -596,7 +597,7 @@ class Object : public Identifier { template friend class Evaluator; friend class AddrEvaluator; friend class Generator; - friend class LValGenerator; + friend class LValAssigner; public: static Object* New(const Token* tok, diff --git a/include/triton/lang/wgtcc/code_gen.h b/include/triton/lang/wgtcc/code_gen.h index 31ed8fca9..ff9b0fab2 100644 --- a/include/triton/lang/wgtcc/code_gen.h +++ b/include/triton/lang/wgtcc/code_gen.h @@ -3,90 +3,50 @@ #include "ast.h" #include "visitor.h" +#include +namespace triton{ +namespace ir{ + +class value; +class module; +class type; +class context; +class builder; + +} +} + +using namespace triton; class Parser; struct Addr; -struct ROData; template<> class Evaluator; struct StaticInitializer; +class LValAssigner; using TypeList = std::vector; using LocationList = std::vector; -using RODataList = std::vector; using StaticInitList = std::vector; - -enum class ParamClass { - INTEGER, - SSE, - SSEUP, - X87, - X87_UP, - COMPLEX_X87, - NO_CLASS, - MEMORY -}; - -struct ParamLocations { - LocationList locs_; - size_t regCnt_; - size_t xregCnt_; -}; - -struct ROData { - ROData(long ival, int align): ival_(ival), align_(align) { - label_ = ".LC" + std::to_string(GenTag()); - } - - explicit ROData(const std::string& sval): sval_(sval), align_(1) { - label_ = ".LC" + std::to_string(GenTag()); - } - - ~ROData() {} - - std::string sval_; - long ival_; - int align_; - std::string label_; - -private: - static long GenTag() { - static long tag = 0; - return tag++; - } -}; - - -struct ObjectAddr { - explicit ObjectAddr(int offset) - : ObjectAddr("", 
"%rbp", offset) {} - - ObjectAddr(const std::string& label, const std::string& base, int offset) - : label_(label), base_(base), offset_(offset) {} - - std::string Repr() const; - - std::string label_; - std::string base_; - int offset_; - unsigned char bitFieldBegin_ {0}; - unsigned char bitFieldWidth_ {0}; -}; - - -struct StaticInitializer { - int offset_; - int width_; - long val_; - std::string label_; -}; - +// Error +inline void should_not_happen() { assert(false); } +inline void error_not_implemented() { assert(false); } class Generator: public Visitor { friend class Evaluator; + friend class LValAssigner; + +protected: + struct scope { + std::map types; + std::map values; + }; + + void set_ret(ir::value* value); + public: - Generator() {} + Generator(Parser* parser) : parser_(parser) {} virtual void Visit(ASTNode* node) { node->Accept(this); } void VisitExpr(Expr* expr) { expr->Accept(this); } @@ -115,160 +75,75 @@ public: virtual void VisitFuncDef(FuncDef* funcDef); virtual void VisitTranslationUnit(TranslationUnit* unit); - - static void SetInOut(Parser* parser, FILE* outFile) { - parser_ = parser; - outFile_ = outFile; - } - - void Gen(); + void Gen(ir::module *mod); protected: - // Binary - void GenCommaOp(BinaryOp* comma); - void GenMemberRefOp(BinaryOp* binaryOp); - void GenAndOp(BinaryOp* binaryOp); - void GenOrOp(BinaryOp* binaryOp); - void GenAddOp(BinaryOp* binaryOp); - void GenSubOp(BinaryOp* binaryOp); - void GenAssignOp(BinaryOp* assign); - void GenCastOp(UnaryOp* cast); - void GenDerefOp(UnaryOp* deref); - void GenMinusOp(UnaryOp* minus); - void GenPointerArithm(BinaryOp* binary); - void GenDivOp(bool flt, bool sign, int width, int op); - void GenMulOp(int width, bool flt, bool sign); - void GenCompOp(int width, bool flt, const char* set); - void GenCompZero(Type* type); + // Triton-IR values + ir::value* GenAssignOp(Expr* lvalue, ir::value* rhs); + ir::value* GenCastOp(ir::value* op, ir::type* type); - // Unary - void GenIncDec(Expr* operand, bool postfix, const std::string& inst); + // Triton-IR types + static ir::type* GenIRType(::Type* type, ir::context &ctx); + static ir::type* GenIRArithmType(ArithmType* type, ir::context& ctx); + static ir::type* GenIRArrayType(ArrayType* type, ir::context& ctx); + static ir::type* GenIRTileType(TileType* type, ir::context& ctx); + static ir::type* GenIRFuncType(FuncType* type, ir::context& ctx); + static ir::type* GenIRPointerType(PointerType* type, ir::context& ctx); + static ir::type* GenIRStructType(StructType* type, ir::context& ctx); + void AllocObjects(Scope* scope, const FuncDef::ParamList& params=FuncDef::ParamList()); - StaticInitializer GetStaticInit(InitList::iterator& iter, - InitList::iterator end, int offset); + // SSA + void pushScope(); + void popScope(); - void GenStaticDecl(Declaration* decl); +private: + Parser* parser_; + ir::value* ret_; + ir::builder* bld_; + ir::context* ctx_; + ir::module* mod_; - void GenSaveArea(); - void GenBuiltin(FuncCall* funcCall); - - void AllocObjects(Scope* scope, - const FuncDef::ParamList& params=FuncDef::ParamList()); - - void CopyStruct(ObjectAddr desAddr, int width); - - std::string ConsLabel(Constant* cons); - - ParamLocations GetParamLocations(const TypeList& types, bool retStruct); - void GetParamRegOffsets(int& gpOffset, int& fpOffset, - int& overflow, FuncType* funcType); - - void Emit(const std::string& str) { - fprintf(outFile_, "\t%s\n", str.c_str()); - } - - void Emit(const std::string& inst, - const std::string& src, - const std::string& des) { - Emit(inst + 
"\t" + src + ", " + des); - } - - void Emit(const std::string& inst, - int imm, - const std::string& reg) { - Emit(inst + "\t$" + std::to_string(imm) + ", " + reg); - } - - void Emit(const std::string& inst, - const std::string& des) { - Emit(inst + "\t" + des); - } - - void Emit(const std::string& inst, - const LabelStmt* label) { - Emit(inst + "\t" + label->Repr()); - } - - void Emit(const std::string& inst, - const ObjectAddr& src, - const ObjectAddr& des) { - Emit(inst, src.Repr(), des.Repr()); - } - - void Emit(const std::string& inst, - const std::string& src, - const ObjectAddr& des) { - Emit(inst, src, des.Repr()); - } - - void Emit(const std::string& inst, - const ObjectAddr& src, - const std::string& des) { - Emit(inst, src.Repr(), des); - } - - void EmitLabel(const std::string& label); - void EmitZero(ObjectAddr addr, int width); - void EmitLoad(const std::string& addr, Type* type); - void EmitLoad(const std::string& addr, int width, bool flt); - void EmitStore(const ObjectAddr& addr, Type* type); - void EmitStore(const std::string& addr, Type* type); - void EmitStore(const std::string& addr, int width, bool flt); - void EmitLoadBitField(const std::string& addr, Object* bitField); - void EmitStoreBitField(const ObjectAddr& addr, Type* type); - void EmitLoc(Expr* expr); - - int Push(Type* type); - int Push(const std::string& reg); - int Pop(const std::string& reg); - - void Spill(bool flt); - - void Restore(bool flt); - - void Save(bool flt); - - void Exchange(bool flt); - -protected: - static const std::string* last_file; - static Parser* parser_; - static FILE* outFile_; - static RODataList rodatas_; - static int offset_; - - // The address that store the register %rdi, - // when the return value is a struct/union - static int retAddrOffset_; - static FuncDef* curFunc_; - - static std::vector staticDecls_; +private: +// std::stack scopes_; + LValAssigner* assign_; }; -class LValGenerator: public Generator { +class LValAssigner: public Visitor { public: - LValGenerator() {} + LValAssigner(Generator* gen): gen_(gen) {} // Expression - virtual void VisitBinaryOp(BinaryOp* binaryOp); - virtual void VisitUnaryOp(UnaryOp* unaryOp); - virtual void VisitObject(Object* obj); - virtual void VisitIdentifier(Identifier* ident); + void VisitBinaryOp(BinaryOp* binaryOp); + void VisitUnaryOp(UnaryOp* unaryOp); + void VisitObject(Object* obj); + void VisitIdentifier(Identifier* ident); - virtual void VisitConditionalOp(ConditionalOp* condOp) { assert(false); } - virtual void VisitFuncCall(FuncCall* funcCall) { assert(false); } - virtual void VisitEnumerator(Enumerator* enumer) { assert(false); } - virtual void VisitConstant(Constant* cons) { assert(false); } - virtual void VisitTempVar(TempVar* tempVar); + void VisitConditionalOp(ConditionalOp*) { should_not_happen(); } + void VisitFuncCall(FuncCall*) { should_not_happen(); } + void VisitEnumerator(Enumerator*) { should_not_happen(); } + void VisitConstant(Constant*) { should_not_happen(); } + void VisitTempVar(TempVar*) { should_not_happen(); } + void VisitDeclaration(Declaration*) { should_not_happen(); } + void VisitEmptyStmt(EmptyStmt*) { should_not_happen(); } + void VisitIfStmt(IfStmt*) { should_not_happen(); } + void VisitJumpStmt(JumpStmt*) { should_not_happen(); } + void VisitReturnStmt(ReturnStmt*) { should_not_happen(); } + void VisitLabelStmt(LabelStmt*) { should_not_happen(); } + void VisitCompoundStmt(CompoundStmt*) { should_not_happen(); } + void VisitFuncDef(FuncDef*) { should_not_happen(); } + void 
VisitTranslationUnit(TranslationUnit*) { should_not_happen(); } - ObjectAddr GenExpr(Expr* expr) { + ir::value* GenExpr(Expr* expr, ir::value* rhs) { + rhs_ = rhs; expr->Accept(this); - return addr_; + return ret_; } private: - ObjectAddr addr_ {"", "", 0}; + ir::value* ret_; + ir::value* rhs_; + Generator* gen_; }; #endif diff --git a/include/triton/lang/wgtcc/t_code_gen.h b/include/triton/lang/wgtcc/t_code_gen.h deleted file mode 100644 index 00abfabba..000000000 --- a/include/triton/lang/wgtcc/t_code_gen.h +++ /dev/null @@ -1,122 +0,0 @@ -#ifndef _WGTCC_CODE_GEN_H_ -#define _WGTCC_CODE_GEN_H_ - -#include "ast.h" -#include "visitor.h" - -namespace triton{ -namespace ir{ - -class value; -class module; - -} -} - -using namespace triton; - -class Parser; -struct Addr; -template<> class Evaluator; -struct StaticInitializer; - -using TypeList = std::vector; -using LocationList = std::vector; -using StaticInitList = std::vector; - - -class Generator: public Visitor { - friend class Evaluator; -public: - Generator(Parser* parser, ir::module& mod) : parser_(parser), mod_(mod){} - - virtual void Visit(ASTNode* node) { node->Accept(this); } - void VisitExpr(Expr* expr) { expr->Accept(this); } - void VisitStmt(Stmt* stmt) { stmt->Accept(this); } - - // Expression - virtual void VisitBinaryOp(BinaryOp* binaryOp); - virtual void VisitUnaryOp(UnaryOp* unaryOp); - virtual void VisitConditionalOp(ConditionalOp* condOp); - virtual void VisitFuncCall(FuncCall* funcCall); - virtual void VisitObject(Object* obj); - virtual void VisitEnumerator(Enumerator* enumer); - virtual void VisitIdentifier(Identifier* ident); - virtual void VisitConstant(Constant* cons); - virtual void VisitTempVar(TempVar* tempVar); - - // Statement - virtual void VisitDeclaration(Declaration* init); - virtual void VisitEmptyStmt(EmptyStmt* emptyStmt); - virtual void VisitIfStmt(IfStmt* ifStmt); - virtual void VisitJumpStmt(JumpStmt* jumpStmt); - virtual void VisitReturnStmt(ReturnStmt* returnStmt); - virtual void VisitLabelStmt(LabelStmt* labelStmt); - virtual void VisitCompoundStmt(CompoundStmt* compoundStmt); - - virtual void VisitFuncDef(FuncDef* funcDef); - virtual void VisitTranslationUnit(TranslationUnit* unit); - - void Gen(); - -protected: - // Binary - void GenCommaOp(BinaryOp* comma); - void GenMemberRefOp(BinaryOp* binaryOp); - void GenAndOp(BinaryOp* binaryOp); - void GenOrOp(BinaryOp* binaryOp); - void GenAddOp(BinaryOp* binaryOp); - void GenSubOp(BinaryOp* binaryOp); - void GenAssignOp(BinaryOp* assign); - void GenCastOp(UnaryOp* cast); - void GenDerefOp(UnaryOp* deref); - void GenMinusOp(UnaryOp* minus); - void GenPointerArithm(BinaryOp* binary); - void GenDivOp(bool flt, bool sign, int width, int op); - void GenMulOp(int width, bool flt, bool sign); - void GenCompOp(int width, bool flt, const char* set); - void GenCompZero(Type* type); - - // Unary - void GenIncDec(Expr* operand, bool postfix, const std::string& inst); - StaticInitializer GetStaticInit(InitList::iterator& iter, - InitList::iterator end, int offset); - void GenStaticDecl(Declaration* decl); - void GenSaveArea(); - void GenBuiltin(FuncCall* funcCall); - - void AllocObjects(Scope* scope, - const FuncDef::ParamList& params=FuncDef::ParamList()); - -protected: - Parser* parser_; - ir::module& mod_; -}; - - -class LValGenerator: public Generator { -public: - LValGenerator(Parser* parser, ir::module& mod): Generator(parser, mod) {} - - // Expression - virtual void VisitBinaryOp(BinaryOp* binaryOp); - virtual void VisitUnaryOp(UnaryOp* unaryOp); - virtual 
void VisitObject(Object* obj); - virtual void VisitIdentifier(Identifier* ident); - - virtual void VisitConditionalOp(ConditionalOp* condOp) { assert(false); } - virtual void VisitFuncCall(FuncCall* funcCall) { assert(false); } - virtual void VisitEnumerator(Enumerator* enumer) { assert(false); } - virtual void VisitConstant(Constant* cons) { assert(false); } - virtual void VisitTempVar(TempVar* tempVar); - - ir::value* GenExpr(Expr* expr) { - expr->Accept(this); - return addr_; - } - -private: - ir::value* addr_; -}; - -#endif diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cpp index 9fe444dd1..b4ff3c5b2 100644 --- a/lib/ir/builder.cpp +++ b/lib/ir/builder.cpp @@ -185,6 +185,7 @@ DEFINE_NOWRAP_BINARY(add, binary_op_t::Add) DEFINE_NOWRAP_BINARY(sub, binary_op_t::Sub) DEFINE_NOWRAP_BINARY(shl, binary_op_t::Shl) DEFINE_NOWRAP_BINARY(ashr, binary_op_t::AShr) +DEFINE_NOWRAP_BINARY(lshr, binary_op_t::LShr) DEFINE_BINARY_INT(sdiv, binary_op_t::SDiv) DEFINE_BINARY_INT(udiv, binary_op_t::UDiv) DEFINE_BINARY_INT(srem, binary_op_t::SRem) diff --git a/lib/ir/constant.cpp b/lib/ir/constant.cpp index 5ace19a04..883df9967 100644 --- a/lib/ir/constant.cpp +++ b/lib/ir/constant.cpp @@ -16,17 +16,11 @@ constant *constant::get_null_value(type *ty) { case type::IntegerTyID: return constant_int::get(ty, 0); case type::HalfTyID: - return constant_fp::get(ctx, 0); + return constant_fp::get(type::get_half_ty(ctx), 0); case type::FloatTyID: - return constant_fp::get(ctx, 0); + return constant_fp::get(type::get_float_ty(ctx), 0); case type::DoubleTyID: - return constant_fp::get(ctx, 0); - case type::X86_FP80TyID: - return constant_fp::get(ctx, 0); - case type::FP128TyID: - return constant_fp::get(ctx, 0); - case type::PPC_FP128TyID: - return constant_fp::get(ctx, 0); + return constant_fp::get(type::get_double_ty(ctx), 0); default: throw std::runtime_error("Cannot create a null constant of that type!"); } @@ -38,7 +32,7 @@ constant *constant::get_all_ones_value(type *ty) { if(ty->is_integer_ty()) return constant_int::get(ty, 0xFFFFFFFF); if(ty->is_floating_point_ty()) - return constant_fp::get(ty->get_context(), 0xFFFFFFFF); + return constant_fp::get(ty, 0xFFFFFFFF); throw std::runtime_error("Cannot create all ones value for that type!"); } @@ -83,12 +77,12 @@ const constant_int* constant_range::get_last() const { // constant_fp // FIXME use something like APFloat -constant_fp::constant_fp(context &ctx, double value) - : constant(type::get_float_ty(ctx), 0), value_(value){ } +constant_fp::constant_fp(type *ty, double value) + : constant(ty, 0), value_(value){ } constant *constant_fp::get_negative_zero(type *ty){ double neg_zero = 0; - return get(ty->get_context(), neg_zero); + return get(ty, neg_zero); } constant *constant_fp::get_zero_value_for_negation(type *ty) { @@ -97,11 +91,11 @@ constant *constant_fp::get_zero_value_for_negation(type *ty) { return constant::get_null_value(ty); } -constant *constant_fp::get(context &ctx, double v){ - context_impl *impl = ctx.p_impl.get(); - constant_fp *&result = impl->fp_constants_[v]; +constant *constant_fp::get(type *ty, double v){ + context_impl *impl = ty->get_context().p_impl.get(); + constant_fp *&result = impl->fp_constants_[std::make_pair(ty, v)]; if(!result) - result = new constant_fp(ctx, v); + result = new constant_fp(ty, v); return result; } diff --git a/lib/lang/wgtcc/code_gen.cc b/lib/lang/wgtcc/code_gen.cc index ca92d6e84..e28ccdb53 100644 --- a/lib/lang/wgtcc/code_gen.cc +++ b/lib/lang/wgtcc/code_gen.cc @@ -1,1561 +1,430 @@ #include 
"triton/lang/wgtcc/code_gen.h" - #include "triton/lang/wgtcc/evaluator.h" #include "triton/lang/wgtcc/parser.h" #include "triton/lang/wgtcc/token.h" +#include "triton/ir/module.h" -#include -#include -#include +// Helpers +void Generator::set_ret(ir::value* value) { + ret_ = value; +} - -extern std::string filename_in; -extern std::string filename_out; -extern bool debug; - -const std::string* Generator::last_file = nullptr; -Parser* Generator::parser_ = nullptr; -FILE* Generator::outFile_ = nullptr; -RODataList Generator::rodatas_; -std::vector Generator::staticDecls_; -int Generator::offset_ = 0; -int Generator::retAddrOffset_ = 0; -FuncDef* Generator::curFunc_ = nullptr; - - -/* - * Register usage: - * xmm0: accumulator of floating datas; - * xmm8: temp register for param passing(xmm0) - * xmm9: source operand register; - * xmm10: tmp register for floating data swap; - * rax: accumulator; - * r12, r13: temp register for rdx and rcx - * r11: source operand register; - * r10: base register when LValGenerator eval the address. - * rcx: tempvar register, like the tempvar of 'switch' - * temp register for struct copy - */ - -static std::vector regs { - "%rdi", "%rsi", "%rdx", - "%rcx", "%r8", "%r9" -}; - -static std::vector xregs { - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7" -}; - - -static ParamClass Classify(Type* paramType, int offset=0) { - if (paramType->IsInteger() || paramType->ToPointer() - || paramType->ToArray()) { - return ParamClass::INTEGER; - } - - if (paramType->ToArithm()) { - auto type = paramType->ToArithm(); - if (type->Tag() == T_FLOAT || type->Tag() == T_DOUBLE) - return ParamClass::SSE; - if (type->Tag() == (T_LONG | T_DOUBLE)) { - // TODO(wgtdkp): - return ParamClass::SSE; - assert(false); - return ParamClass::X87; - } - - // TODO(wgtdkp): - assert(false); - // It is complex - if ((type->Tag() & T_LONG) && (type->Tag() & T_DOUBLE)) - return ParamClass::COMPLEX_X87; - } - auto type = paramType->ToStruct(); - assert(type); - return ParamClass::MEMORY; - // TODO(wgtdkp): Support agrregate type - assert(false); - /* - auto type = paramType->ToStruct(); - assert(type); - - if (type->Width() > 4 * 8) - return PC_MEMORY; - - std::vector classes; - int cnt = (type->Width() + 7) / 8; - for (int i = 0; i < cnt; ++i) { - auto types = FieldsIn8Bytes(type, i); - assert(types.size() > 0); - - auto fieldClass = (types.size() == 1) - ? PC_NO_CLASS: FieldClass(types, 0); - classes.push_back(fieldClass); - - } - - bool sawX87 = false; - for (int i = 0; i < classes.size(); ++i) { - if (classes[i] == PC_MEMORY) - return PC_MEMORY; - if (classes[i] == PC_X87_UP && sawX87) - return PC_MEMORY; - if (classes[i] == PC_X87) - sawX87 = true; - } - */ - return ParamClass::NO_CLASS; // Make compiler happy +inline bool is_terminator(ir::value* x) { + return x && dynamic_cast(x); } -std::string Generator::ConsLabel(Constant* cons) { - if (cons->Type()->IsInteger()) { - return "$" + std::to_string(cons->IVal()); - } else if (cons->Type()->IsFloat()) { - double valsd = cons->FVal(); - float valss = valsd; - // TODO(wgtdkp): Add rodata - auto width = cons->Type()->Width(); - long val = (width == 4)? 
*reinterpret_cast(&valss): - *reinterpret_cast(&valsd); - const ROData& rodata = ROData(val, width); - rodatas_.push_back(rodata); - return rodata.label_; - } else { // Literal - const ROData& rodata = ROData(cons->SValRepr()); - rodatas_.push_back(rodata); - return rodata.label_; // Return address - } -} +// Expression - -static const char* GetLoad(int width, bool flt=false) { - switch (width) { - case 1: return "movzbq"; - case 2: return "movzwq"; - case 4: return !flt ? "movl": "movss"; - case 8: return !flt ? "movq": "movsd"; - default: assert(false); return nullptr; - } -} - - -static std::string GetInst(const std::string& inst, int width, bool flt) { - if (flt) { - return inst + (width == 4 ? "ss": "sd"); - } else { - switch (width) { - case 1: return inst + "b"; - case 2: return inst + "w"; - case 4: return inst + "l"; - case 8: return inst + "q"; - default: assert(false); - } - return inst; // Make compiler happy - } -} - - -static std::string GetInst(const std::string& inst, Type* type) { - assert(type->IsScalar()); - return GetInst(inst, type->Width(), type->IsFloat()); -} - - -static std::string GetReg(int width) { - switch (width) { - case 1: return "%al"; - case 2: return "%ax"; - case 4: return "%eax"; - case 8: return "%rax"; - default: assert(false); return ""; - } -} - - -static std::string GetDes(int width, bool flt) { - if (flt) { - return "%xmm0"; - } - return GetReg(width); -} - - -static std::string GetSrc(int width, bool flt) { - if (flt) { - return "%xmm9"; - } - switch (width) { - case 1: return "%r11b"; - case 2: return "%r11w"; - case 4: return "%r11d"; - case 8: return "%r11"; - default: assert(false); return ""; - } -} - - -// The 'reg' always be 8 bytes -int Generator::Push(const std::string& reg) { - offset_ -= 8; - auto mov = reg[1] == 'x' ? "movsd": "movq"; - Emit(mov, reg, ObjectAddr(offset_)); - return offset_; -} - - -int Generator::Push(Type* type) { - if (type->IsFloat()) { - return Push("%xmm0"); - } else if (type->IsScalar()) { - return Push("%rax"); - } else { - offset_ -= type->Width(); - offset_ = Type::MakeAlign(offset_, 8); - CopyStruct({"", "%rbp", offset_}, type->Width()); - return offset_; - } -} - - -// The 'reg' must be 8 bytes -int Generator::Pop(const std::string& reg) { - auto mov = reg[1] == 'x' ? "movsd": "movq"; - Emit(mov, ObjectAddr(offset_), reg); - offset_ += 8; - return offset_; -} - - -void Generator::Spill(bool flt) { - Push(flt ? "%xmm0": "%rax"); -} - - -void Generator::Restore(bool flt) { - const auto& src = GetSrc(8, flt); - const auto& des = GetDes(8, flt); - const auto& inst = GetInst("mov", 8, flt); - Emit(inst, des, src); - Pop(des); -} - - -void Generator::Save(bool flt) { - if (flt) { - Emit("movsd", "%xmm0", "%xmm9"); - } else { - Emit("movq", "%rax", "%r11"); - } -} - - -/* - * Operator/Instruction mapping: - * + add - * - sub - * * mul - * / div - * % div - * << sal - * >> sar - * | or - * & and - * ^ xor - * = mov - * < cmp, setl, movzbq - * > cmp, setg, movzbq - * <= cmp, setle, movzbq - * >= cmp, setle, movzbq - * == cmp, sete, movzbq - * != cmp, setne, movzbq - * && GenAndOp - * || GenOrOp - * ] GenSubScriptingOp - * . 
GenMemberRefOp - */ void Generator::VisitBinaryOp(BinaryOp* binary) { - EmitLoc(binary); - auto op = binary->op_; + Visit(binary->rhs_); + ir::value* rhs = ret_; - if (op == '=') - return GenAssignOp(binary); - if (op == Token::LOGICAL_AND) - return GenAndOp(binary); - if (op == Token::LOGICAL_OR) - return GenOrOp(binary); - if (op == '.') - return GenMemberRefOp(binary); - if (op == ',') - return GenCommaOp(binary); - // Why lhs_->Type() ? - // Because, the type of pointer subtraction is arithmetic type - if (binary->lhs_->Type()->ToPointer() && - (op == '+' || op == '-')) { - return GenPointerArithm(binary); - } + if(binary->op_ == '=') + return set_ret(assign_->GenExpr(binary->lhs_, rhs)); - // Careful: for compare operator, the type of the expression - // is always integer, while the type of lhs and rhs could be float - // After convertion, lhs and rhs always has the same type + Visit(binary->lhs_); + ir::value* lhs = ret_; + // op info auto type = binary->lhs_->Type(); - auto width = type->Width(); auto flt = type->IsFloat(); auto sign = !type->IsUnsigned(); - - Visit(binary->lhs_); - Spill(flt); - Visit(binary->rhs_); - Restore(flt); - - const char* inst = nullptr; - - switch (op) { - case '*': return GenMulOp(width, flt, sign); - case '/': case '%': return GenDivOp(flt, sign, width, op); - case '<': - return GenCompOp(width, flt, (flt || !sign) ? "setb": "setl"); - case '>': - return GenCompOp(width, flt, (flt || !sign) ? "seta": "setg"); - case Token::LE: - return GenCompOp(width, flt, (flt || !sign) ? "setbe": "setle"); - case Token::GE: - return GenCompOp(width, flt, (flt || !sign) ? "setae": "setge"); - case Token::EQ: - return GenCompOp(width, flt, "sete"); - case Token::NE: - return GenCompOp(width, flt, "setne"); - - case '+': inst = "add"; break; - case '-': inst = "sub"; break; - case '|': inst = "or"; break; - case '&': inst = "and"; break; - case '^': inst = "xor"; break; - case Token::LEFT: case Token::RIGHT: - inst = op == Token::LEFT ? "sal": (sign ? 
"sar": "shr"); - Emit("movq %r11, %rcx"); - Emit(GetInst(inst, width, flt), "%cl", GetDes(width, flt)); - return; + // return + switch(binary->op_){ + case Token::LOGICAL_AND: return set_ret(bld_->create_and(lhs, rhs)); + case Token::LOGICAL_OR: return set_ret(bld_->create_or(lhs, rhs)); + case '|': return set_ret(bld_->create_or(lhs, rhs)); + case '&': return set_ret(bld_->create_and(lhs, rhs)); + case '^': return set_ret(bld_->create_xor(lhs, rhs)); + case Token::LEFT: return set_ret(bld_->create_shl(lhs, rhs)); + case Token::RIGHT: return set_ret(bld_->create_lshr(lhs, rhs)); + case '.': return error_not_implemented(); + case ',': return error_not_implemented(); + case '+': + if(binary->lhs_->Type()->ToPointer()) + return set_ret(bld_->create_gep(lhs, {rhs})); + else if(flt) + return set_ret(bld_->create_fadd(lhs, rhs)); + else + return set_ret(bld_->create_add(lhs, rhs)); + case '-': + if(binary->lhs_->Type()->ToPointer()) + return set_ret(bld_->create_gep(lhs, {bld_->create_neg(rhs)})); + else if(flt) + return set_ret(bld_->create_fsub(lhs, rhs)); + else + return set_ret(bld_->create_sub(lhs, rhs)); + case '*': + if(flt) + return set_ret(bld_->create_fmul(lhs, rhs)); + else + return set_ret(bld_->create_mul(lhs, rhs)); + case '/': + if(flt) + return set_ret(bld_->create_fdiv(lhs, rhs)); + else if(sign) + return set_ret(bld_->create_sdiv(lhs, rhs)); + else if(!sign) + return set_ret(bld_->create_udiv(lhs, rhs)); + else + return should_not_happen(); + case '%': + if(flt) + return set_ret(bld_->create_frem(lhs, rhs)); + else if(sign) + return set_ret(bld_->create_srem(lhs, rhs)); + else + return set_ret(bld_->create_urem(lhs, rhs)); + case '<': + if(flt) + return set_ret(bld_->create_fcmpOLT(lhs, rhs)); + else if(sign) + return set_ret(bld_->create_icmpSLT(lhs, rhs)); + else if(!sign) + return set_ret(bld_->create_icmpULT(lhs, rhs)); + else + return should_not_happen(); + case '>': + if(flt) + return set_ret(bld_->create_fcmpOGT(lhs, rhs)); + else if(sign) + return set_ret(bld_->create_icmpSGT(lhs, rhs)); + else if(!sign) + return set_ret(bld_->create_icmpUGT(lhs, rhs)); + else + return should_not_happen(); + case Token::LE: + if(flt) + return set_ret(bld_->create_fcmpOLE(lhs, rhs)); + else if(sign) + return set_ret(bld_->create_icmpSLE(lhs, rhs)); + else if(!sign) + return set_ret(bld_->create_icmpULE(lhs, rhs)); + else + return should_not_happen(); + case Token::GE: + if(flt) + return set_ret(bld_->create_fcmpOGE(lhs, rhs)); + else if(sign) + return set_ret(bld_->create_icmpSGE(lhs, rhs)); + else if(!sign) + return set_ret(bld_->create_icmpUGE(lhs, rhs)); + else + return should_not_happen(); + case Token::EQ: + if(flt) + return set_ret(bld_->create_fcmpOEQ(lhs, rhs)); + else + return set_ret(bld_->create_icmpEQ(lhs, rhs)); + case Token::NE: + if(flt) + return set_ret(bld_->create_fcmpONE(lhs, rhs)); + else + return set_ret(bld_->create_icmpEQ(lhs, rhs)); + default: + error_not_implemented(); } - Emit(GetInst(inst, width, flt), GetSrc(width, flt), GetDes(width, flt)); + error_not_implemented(); } - -void Generator::GenCommaOp(BinaryOp* comma) { - VisitExpr(comma->lhs_); - VisitExpr(comma->rhs_); -} - - -void Generator::GenMulOp(int width, bool flt, bool sign) { - auto inst = flt ? "mul": (sign ? 
"imul": "mul"); - - if (flt) { - Emit(GetInst(inst, width, flt), "%xmm9", "%xmm0"); - } else { - Emit(GetInst(inst, width, flt), GetSrc(width, flt)); - } -} - - -void Generator::GenCompZero(Type* type) { - auto width = type->Width(); - auto flt = type->IsFloat(); - - if (!flt) { - Emit("cmp", "$0", GetReg(width)); - } else { - Emit("pxor", "%xmm9", "%xmm9"); - auto cmp = width == 8 ? "ucomisd": "ucomiss"; - Emit(cmp, "%xmm9", "%xmm0"); - } -} - - -void Generator::GenAndOp(BinaryOp* andOp) { - VisitExpr(andOp->lhs_); - GenCompZero(andOp->lhs_->Type()); - - auto labelFalse = LabelStmt::New(); - Emit("je", labelFalse); - - VisitExpr(andOp->rhs_); - GenCompZero(andOp->rhs_->Type()); - - Emit("je", labelFalse); - - Emit("movq", "$1", "%rax"); - auto labelTrue = LabelStmt::New(); - Emit("jmp", labelTrue); - EmitLabel(labelFalse->Repr()); - Emit("xorq", "%rax", "%rax"); // Set %rax to 0 - EmitLabel(labelTrue->Repr()); -} - - -void Generator::GenOrOp(BinaryOp* orOp) { - VisitExpr(orOp->lhs_); - GenCompZero(orOp->lhs_->Type()); - - auto labelTrue = LabelStmt::New(); - Emit("jne", labelTrue); - - VisitExpr(orOp->rhs_); - GenCompZero(orOp->rhs_->Type()); - - Emit("jne", labelTrue); - - Emit("xorq", "%rax", "%rax"); // Set %rax to 0 - auto labelFalse = LabelStmt::New(); - Emit("jmp", labelFalse); - EmitLabel(labelTrue->Repr()); - Emit("movq", "$1", "%rax"); - EmitLabel(labelFalse->Repr()); -} - - -void Generator::GenMemberRefOp(BinaryOp* ref) { - // As the lhs will always be struct/union - auto addr = LValGenerator().GenExpr(ref->lhs_); - const auto& name = ref->rhs_->Tok()->str_; - auto structType = ref->lhs_->Type()->ToStruct(); - auto member = structType->GetMember(name); - - addr.offset_ += member->Offset(); - - if (!ref->Type()->IsScalar()) { - Emit("leaq", addr, "%rax"); - } else { - if (member->BitFieldWidth()) { - EmitLoadBitField(addr.Repr(), member); - } else { - EmitLoad(addr.Repr(), ref->Type()); - } - } -} - - -void Generator::EmitLoadBitField(const std::string& addr, Object* bitField) { - auto type = bitField->Type()->ToArithm(); - assert(type && type->IsInteger()); - - EmitLoad(addr, type); - Emit("andq", Object::BitFieldMask(bitField), "%rax"); - - auto shiftRight = (type->Tag() & T_UNSIGNED) ? "shrq": "sarq"; - auto left = 64 - bitField->bitFieldBegin_ - bitField->bitFieldWidth_; - auto right = 64 - bitField->bitFieldWidth_; - Emit("salq", left, "%rax"); - Emit(shiftRight, right, "%rax"); -} - - -// FIXME(wgtdkp): for combined assignment operator, if the rvalue expr -// has some side-effect, the rvalue will be evaluated twice! 
-void Generator::GenAssignOp(BinaryOp* assign) { - // The base register of addr is %r10, %rip, %rbp - auto addr = LValGenerator().GenExpr(assign->lhs_); - // Base register of static object maybe %rip - // Visit rhs_ may changes r10 - if (addr.base_ == "%r10") - Push(addr.base_); - VisitExpr(assign->rhs_); - if (addr.base_ == "%r10") - Pop(addr.base_); - - if (assign->Type()->IsScalar()) { - EmitStore(addr, assign->Type()); - } else { - // struct/union type - // The address of rhs is in %rax - CopyStruct(addr, assign->Type()->Width()); - } -} - - -void Generator::EmitStoreBitField(const ObjectAddr& addr, Type* type) { - auto arithmType = type->ToArithm(); - assert(arithmType && arithmType->IsInteger()); - - // The value to be stored is in %rax now - auto mask = Object::BitFieldMask(addr.bitFieldBegin_, addr.bitFieldWidth_); - - Emit("salq", addr.bitFieldBegin_, "%rax"); - Emit("andq", mask, "%rax"); - Emit("movq", "%rax", "%r11"); - EmitLoad(addr.Repr(), arithmType); - Emit("andq", ~mask, "%rax"); - Emit("orq", "%r11", "%rax"); - - EmitStore(addr.Repr(), type); -} - - -void Generator::CopyStruct(ObjectAddr desAddr, int width) { - int units[] = {8, 4, 2, 1}; - Emit("movq", "%rax", "%rcx"); - ObjectAddr srcAddr = {"", "%rcx", 0}; - for (auto unit: units) { - while (width >= unit) { - EmitLoad(srcAddr.Repr(), unit, false); - EmitStore(desAddr.Repr(), unit, false); - desAddr.offset_ += unit; - srcAddr.offset_ += unit; - width -= unit; - } - } -} - - -void Generator::GenCompOp(int width, bool flt, const char* set) { - std::string cmp; - if (flt) { - cmp = width == 8 ? "ucomisd": "ucomiss"; - } else { - cmp = GetInst("cmp", width, flt); - } - - Emit(cmp, GetSrc(width, flt), GetDes(width, flt)); - Emit(set, "%al"); - Emit("movzbq", "%al", "%rax"); -} - - -void Generator::GenDivOp(bool flt, bool sign, int width, int op) { - if (flt) { - auto inst = width == 4 ? "divss": "divsd"; - Emit(inst, "%xmm9", "%xmm0"); - return; - } - if (!sign) { - Emit("xor", "%rdx", "%rdx"); - Emit(GetInst("div", width, flt), GetSrc(width, flt)); - } else { - Emit(width == 4 ? "cltd": "cqto"); - Emit(GetInst("idiv", width, flt), GetSrc(width, flt)); - } - if (op == '%') - Emit("movq", "%rdx", "%rax"); -} - - -void Generator::GenPointerArithm(BinaryOp* binary) { - assert(binary->op_ == '+' || binary->op_ == '-'); - // For '+', we have swapped lhs_ and rhs_ to ensure that - // the pointer is at lhs. - Visit(binary->lhs_); - Spill(false); - Visit(binary->rhs_); - Restore(false); - - auto type = binary->lhs_->Type()->ToPointer()->Derived(); - auto width = type->Width(); - if (binary->op_ == '+') { - if (width > 1) - Emit("imulq", width, "%r11"); - Emit("addq", "%r11", "%rax"); - } else { - Emit("subq", "%r11", "%rax"); - if (width > 1) { - Emit("movq", width, "%r11"); - GenDivOp(false, true, 8, '/'); - } - } -} - - -// Only objects Allocated on stack -void Generator::VisitObject(Object* obj) { - EmitLoc(obj); - auto addr = LValGenerator().GenExpr(obj).Repr(); - - if (!obj->Type()->IsScalar()) { - // Return the address of the object in rax - Emit("leaq", addr, "%rax"); - } else { - EmitLoad(addr, obj->Type()); - } -} - - -void Generator::GenCastOp(UnaryOp* cast) { - auto desType = cast->Type(); - auto srcType = cast->operand_->Type(); - - if (srcType->IsFloat() && desType->IsFloat()) { - if (srcType->Width() == desType->Width()) - return; - auto inst = srcType->Width() == 4 ? 
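// ----------------------------------------------------------------------------
// For reference, a minimal sketch of the dispatch pattern in the new
// VisitBinaryOp above: '=' routes the already-computed rhs through the
// LValAssigner visitor, and every other operator picks one ir::builder call
// from the (float, signed) properties of the lhs type. For `a + b` on floats
// this boils down to:
//
//   Visit(binary->rhs_);   ir::value* rhs = ret_;
//   Visit(binary->lhs_);   ir::value* lhs = ret_;
//   set_ret(bld_->create_fadd(lhs, rhs));
//
// One case above looks off: the integer branch of Token::NE reuses
// create_icmpEQ. Assuming the builder exposes create_icmpNE alongside the
// icmpEQ/icmpS*/icmpU* variants used elsewhere in this hunk, the intended
// lowering would read:
//
//   case Token::NE:
//     if(flt)
//       return set_ret(bld_->create_fcmpONE(lhs, rhs));
//     else
//       return set_ret(bld_->create_icmpNE(lhs, rhs));
// ----------------------------------------------------------------------------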
"cvtss2sd": "cvtsd2ss"; - Emit(inst, "%xmm0", "%xmm0"); - } else if (srcType->IsFloat()) { - // Handle bool - if (desType->IsBool()) { - Emit("pxor", "%xmm9", "%xmm9"); - GenCompOp(srcType->Width(), true, "setne"); - } else { - auto inst = srcType->Width() == 4 ? "cvttss2si": "cvttsd2si"; - Emit(inst, "%xmm0", "%rax"); - } - } else if (desType->IsFloat()) { - auto inst = desType->Width() == 4 ? "cvtsi2ss": "cvtsi2sd"; - Emit(inst, "%rax", "%xmm0"); - } else if (srcType->ToPointer() - || srcType->ToFunc() - || srcType->ToArray()) { - // Handle bool - if (desType->IsBool()) { - Emit("testq", "%rax", "%rax"); - Emit("setne", "%al"); - } - } else { - assert(srcType->ToArithm()); - int width = srcType->Width(); - auto sign = !srcType->IsUnsigned(); - const char* inst; - switch (width) { - case 1: - inst = sign ? "movsbq": "movzbq"; - Emit(inst, GetReg(width), "%rax"); - break; - case 2: - inst = sign ? "movswq": "movzwq"; - Emit(inst, GetReg(width), "%rax"); - break; - case 4: inst = "movl"; - if (desType->Width() == 8) - Emit("cltq"); - break; - case 8: break; - } - // Handle bool - if (desType->IsBool()) { - Emit("testq", "%rax", "%rax"); - Emit("setne", "%al"); - } - } -} - - void Generator::VisitUnaryOp(UnaryOp* unary) { - EmitLoc(unary); + // recursion + Visit(unary->operand_); + ir::value* op = ret_; + ir::type* type = GenIRType(unary->operand_->Type(), *ctx_); + // return switch (unary->op_) { - case Token::PREFIX_INC: - return GenIncDec(unary->operand_, false, "add"); - case Token::PREFIX_DEC: - return GenIncDec(unary->operand_, false, "sub"); - case Token::POSTFIX_INC: - return GenIncDec(unary->operand_, true, "add"); - case Token::POSTFIX_DEC: - return GenIncDec(unary->operand_, true, "sub"); - case Token::ADDR: { - auto addr = LValGenerator().GenExpr(unary->operand_).Repr(); - Emit("leaq", addr, "%rax"); - } return; - case Token::DEREF: - return GenDerefOp(unary); - case Token::PLUS: - return VisitExpr(unary->operand_); - case Token::MINUS: - return GenMinusOp(unary); - case '~': - VisitExpr(unary->operand_); - return Emit("notq", "%rax"); - case '!': - VisitExpr(unary->operand_); - GenCompZero(unary->operand_->Type()); - Emit("sete", "%al"); - Emit("movzbl", "%al", "%eax"); // Type of !operator is int - return; - case Token::CAST: - Visit(unary->operand_); - GenCastOp(unary); - return; - default: assert(false); + case Token::PREFIX_INC: return error_not_implemented(); + case Token::PREFIX_DEC: return error_not_implemented(); + case Token::POSTFIX_INC: return error_not_implemented(); + case Token::POSTFIX_DEC: return error_not_implemented(); + case Token::ADDR: return error_not_implemented(); + case Token::DEREF: return error_not_implemented(); + case Token::PLUS: return error_not_implemented(); + case Token::MINUS: return error_not_implemented(); + case '~': return set_ret(bld_->create_neg(op)); + case '!': return set_ret(bld_->create_not(op)); + case Token::CAST: return set_ret(GenCastOp(op, type)); + default: assert(false); } + return error_not_implemented(); } - -void Generator::GenDerefOp(UnaryOp* deref) { - VisitExpr(deref->operand_); - if (deref->Type()->IsScalar()) { - ObjectAddr addr {"", "%rax", 0}; - EmitLoad(addr.Repr(), deref->Type()); - } else { - // Just let it go! 
- } -} - - -void Generator::GenMinusOp(UnaryOp* minus) { - auto width = minus->Type()->Width(); - auto flt = minus->Type()->IsFloat(); - - VisitExpr(minus->operand_); - - if (flt) { - Emit("pxor", "%xmm9", "%xmm9"); - Emit(GetInst("sub", width, flt), "%xmm0", "%xmm9"); - Emit(GetInst("mov", width, flt), "%xmm9", "%xmm0"); - } else { - Emit(GetInst("neg", width, flt), GetDes(width, flt)); - } -} - - -void Generator::GenIncDec(Expr* operand, - bool postfix, - const std::string& inst) { - auto width = operand->Type()->Width(); - auto flt = operand->Type()->IsFloat(); - - auto addr = LValGenerator().GenExpr(operand).Repr(); - EmitLoad(addr, operand->Type()); - if (postfix) Save(flt); - - Constant* cons; - auto pointerType = operand->Type()->ToPointer(); - if (pointerType) { - long width = pointerType->Derived()->Width(); - cons = Constant::New(operand->Tok(), T_LONG, width); - } else if (operand->Type()->IsInteger()) { - cons = Constant::New(operand->Tok(), T_LONG, 1L); - } else { - if (width == 4) - cons = Constant::New(operand->Tok(), T_FLOAT, 1.0f); - else - cons = Constant::New(operand->Tok(), T_DOUBLE, 1.0); - } - - Emit(GetInst(inst, operand->Type()), ConsLabel(cons), GetDes(width, flt)); - EmitStore(addr, operand->Type()); - if (postfix && flt) { - Emit("movsd", "%xmm9", "%xmm0"); - } else if (postfix) { - Emit("mov", "%r11", "%rax"); - } -} - - void Generator::VisitConditionalOp(ConditionalOp* condOp) { - EmitLoc(condOp); - auto ifStmt = IfStmt::New(condOp->cond_, - condOp->exprTrue_, condOp->exprFalse_); - VisitIfStmt(ifStmt); + return error_not_implemented(); } - -void Generator::VisitEnumerator(Enumerator* enumer) { - EmitLoc(enumer); - auto cons = Constant::New(enumer->Tok(), T_INT, (long)enumer->Val()); - Visit(cons); -} - - -// Ident must be function -void Generator::VisitIdentifier(Identifier* ident) { - EmitLoc(ident); - Emit("leaq", ident->Name(), "%rax"); -} - - -void Generator::VisitConstant(Constant* cons) { - EmitLoc(cons); - auto label = ConsLabel(cons); - - if (!cons->Type()->IsScalar()) { - Emit("leaq", label, "%rax"); - } else { - auto width = cons->Type()->Width(); - auto flt = cons->Type()->IsFloat(); - auto load = GetInst("mov", width, flt); - auto des = GetDes(width, flt); - Emit(load, label, des); - } -} - - -// Use %ecx as temp register -// TempVar is only used for condition expression of 'switch' -// and struct copy -void Generator::VisitTempVar(TempVar* tempVar) { - assert(tempVar->Type()->IsInteger()); - Emit("movl", "%ecx", "%eax"); -} - - -void Generator::VisitDeclaration(Declaration* decl) { - EmitLoc(decl->obj_); - auto obj = decl->obj_; - - if (!obj->IsStatic()) { - // The object has no linkage and has - // no static storage(the object is on stack). - // If it has no initialization, - // then it's value is random initialized. 
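// ----------------------------------------------------------------------------
// Note on the new VisitUnaryOp above: with LLVM-style builders, create_neg is
// arithmetic negation (0 - x) while create_not is bitwise complement (x ^ -1),
// so mapping '~' to create_neg and '!' to create_not conflates the two. A
// sketch of the conventional integer mapping, reusing only calls already
// present in this patch (create_not, create_icmpEQ, get_null_value):
//
//   case '~': return set_ret(bld_->create_not(op));   // bitwise complement
//   case '!': return set_ret(bld_->create_icmpEQ(
//                 op, ir::constant::get_null_value(op->get_type())));
// ----------------------------------------------------------------------------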
- if (!obj->HasInit()) - return; - - int lastEnd = obj->Offset(); - for (const auto& init: decl->Inits()) { - ObjectAddr addr = ObjectAddr(obj->Offset() + init.offset_); - addr.bitFieldBegin_ = init.bitFieldBegin_; - addr.bitFieldWidth_ = init.bitFieldWidth_; - if (lastEnd != addr.offset_) - EmitZero(ObjectAddr(lastEnd), addr.offset_ - lastEnd); - VisitExpr(init.expr_); - if (init.type_->IsScalar()) { - EmitStore(addr, init.type_); - } else if (init.type_->ToStruct()) { - CopyStruct(addr, init.type_->Width()); - } else { - assert(false); - } - lastEnd = addr.offset_ + init.type_->Width(); - } - auto objEnd = obj->Offset() + obj->Type()->Width(); - if (lastEnd != objEnd) - EmitZero(ObjectAddr(lastEnd), objEnd - lastEnd); - return; - } - - if (obj->Linkage() == L_NONE) - staticDecls_.push_back(decl); - else - GenStaticDecl(decl); -} - - -void Generator::GenStaticDecl(Declaration* decl) { - auto obj = decl->obj_; - assert(obj->IsStatic()); - - const auto& label = obj->Repr(); - const auto width = obj->Type()->Width(); - const auto align = obj->Align(); - - // Omit the external without initilizer - if ((obj->Storage() & S_EXTERN) && !obj->HasInit()) - return; - - Emit(".data"); - auto glb = obj->Linkage() == L_EXTERNAL ? ".globl": ".local"; - Emit(glb, label); - - if (!obj->HasInit()) { - Emit(".comm", label + ", " + std::to_string(width) + - ", " + std::to_string(align)); - return; - } - - Emit(".align", std::to_string(align)); - Emit(".type", label, "@object"); - // Does not decide the size of obj - Emit(".size", label, std::to_string(width)); - EmitLabel(label); - - int offset = 0; - auto iter = decl->Inits().begin(); - for (; iter != decl->Inits().end();) { - auto staticInit = GetStaticInit(iter, - decl->Inits().end(), std::max(iter->offset_, offset)); - - if (staticInit.offset_ > offset) - Emit(".zero", std::to_string(staticInit.offset_ - offset)); - - switch (staticInit.width_) { - case 1: - Emit(".byte", std::to_string(static_cast(staticInit.val_))); - break; - case 2: - Emit(".value", std::to_string(static_cast(staticInit.val_))); - break; - case 4: - Emit(".long", std::to_string(static_cast(staticInit.val_))); - break; - case 8: { - std::string val; - if (staticInit.label_.size() == 0) { - val = std::to_string(staticInit.val_); - } else if (staticInit.val_ != 0) { - val = staticInit.label_ + "+" + std::to_string(staticInit.val_); - } else { - val = staticInit.label_; - } - Emit(".quad", val); - } break; - default: assert(false); - } - offset = staticInit.offset_ + staticInit.width_; - } - // Decides the size of object - if (width > offset) - Emit(".zero", std::to_string(width - offset)); -} - - -void Generator::VisitEmptyStmt(EmptyStmt* emptyStmt) { - assert(false); -} - - -void Generator::VisitIfStmt(IfStmt* ifStmt) { - VisitExpr(ifStmt->cond_); - - // Compare to 0 - auto elseLabel = LabelStmt::New(); - auto endLabel = LabelStmt::New(); - - GenCompZero(ifStmt->cond_->Type()); - - if (ifStmt->else_) { - Emit("je", elseLabel); - } else { - Emit("je", endLabel); - } - - VisitStmt(ifStmt->then_); - - if (ifStmt->else_) { - Emit("jmp", endLabel); - EmitLabel(elseLabel->Repr()); - VisitStmt(ifStmt->else_); - } - - EmitLabel(endLabel->Repr()); -} - - -void Generator::VisitJumpStmt(JumpStmt* jumpStmt) { - Emit("jmp", jumpStmt->label_); -} - - -void Generator::VisitLabelStmt(LabelStmt* labelStmt) { - EmitLabel(labelStmt->Repr()); -} - - -void Generator::VisitReturnStmt(ReturnStmt* returnStmt) { - auto expr = returnStmt->expr_; - if (expr) { // The return expr could be nil - Visit(expr); - if 
(expr->Type()->ToStruct()) { - // %rax now has the address of the struct/union - ObjectAddr addr = ObjectAddr(retAddrOffset_); - Emit("movq", addr, "%r11"); - addr = {"", "%r11", 0}; - CopyStruct(addr, expr->Type()->Width()); - Emit("movq", "%r11", "%rax"); - } - } - Emit("jmp", curFunc_->retLabel_); -} - - -class Comp { -public: - bool operator()(Object* lhs, Object* rhs) { - return lhs->Align() < rhs->Align(); - } -}; - - -void Generator::AllocObjects(Scope* scope, const FuncDef::ParamList& params) { - int offset = offset_; - - auto paramSet = std::set(params.begin(), params.end()); - std::priority_queue, Comp> heap; - for (auto iter = scope->begin(); iter != scope->end(); ++iter) { - auto obj = iter->second->ToObject(); - if (!obj || obj->IsStatic()) - continue; - if (paramSet.find(obj) != paramSet.end()) - continue; - heap.push(obj); - } - - while (!heap.empty()) { - auto obj = heap.top(); - heap.pop(); - - offset -= obj->Type()->Width(); - auto align = obj->Align(); - if (obj->Type()->ToArray()) { - // The alignment of an array is at least the aligment of a pointer - // (as it is always cast to a pointer) - align = std::min(align, 8); - } - offset = Type::MakeAlign(offset, align); - obj->SetOffset(offset); - } - - offset_ = offset; -} - - -void Generator::VisitCompoundStmt(CompoundStmt* compStmt) { - if (compStmt->scope_) { - AllocObjects(compStmt->scope_); - } - - for (auto stmt: compStmt->stmts_) { - Visit(stmt); - } -} - - -void Generator::GetParamRegOffsets(int& gpOffset, - int& fpOffset, - int& overflow, - FuncType* funcType) { - TypeList types; - for (auto param: funcType->Params()) - types.push_back(param->Type()); - auto locations = GetParamLocations(types, funcType->Derived()); - gpOffset = 0; - fpOffset = 48; - overflow = 16; - for (const auto& loc: locations.locs_) { - if (loc[1] == 'x') - fpOffset += 16; - else if (loc[1] == 'm') - overflow += 8; - else - gpOffset += 8; - } -} - - -void Generator::GenBuiltin(FuncCall* funcCall) { - struct va_list_imp { - unsigned int gp_offset; - unsigned int fp_offset; - void *overflow_arg_area; - void *reg_save_area; - }; - - auto ap = UnaryOp::New(Token::DEREF, funcCall->args_[0]); - auto addr = LValGenerator().GenExpr(ap); - auto type = funcCall->FuncType(); - - auto offset = offsetof(va_list_imp, reg_save_area); - addr.offset_ += offset; - const auto& saveAreaAddr = addr.Repr(); - addr.offset_ -= offset; - - offset = offsetof(va_list_imp, overflow_arg_area); - addr.offset_ += offset; - const auto& overflowAddr = addr.Repr(); - addr.offset_ -= offset; - - offset = offsetof(va_list_imp, gp_offset); - addr.offset_ += offset; - const auto& gpOffsetAddr = addr.Repr(); - addr.offset_ -= offset; - - offset = offsetof(va_list_imp, fp_offset); - addr.offset_ += offset; - const auto& fpOffsetAddr = addr.Repr(); - addr.offset_ -= offset; - - if (type == Parser::vaStartType_) { - Emit("leaq", "-176(%rbp)", "%rax"); - Emit("movq", "%rax", saveAreaAddr); - - int gpOffset, fpOffset, overflowOffset; - GetParamRegOffsets(gpOffset, fpOffset, - overflowOffset, curFunc_->FuncType()); - Emit("leaq", ObjectAddr(overflowOffset), "%rax"); - Emit("movq", "%rax", overflowAddr); - Emit("movl", gpOffset, "%eax"); - Emit("movl", "%eax", gpOffsetAddr); - Emit("movl", fpOffset, "%eax"); - Emit("movl", "%eax", fpOffsetAddr); - } else if (type == Parser::vaArgType_) { - static int cnt[2] = {0, 0}; - auto overflowLabel = ".L_va_arg_overflow" + std::to_string(++cnt[0]); - auto endLabel = ".L_va_arg_end" + std::to_string(++cnt[1]); - - auto argType = 
funcCall->args_[1]->Type()->ToPointer()->Derived(); - auto cls = Classify(argType.GetPtr()); - if (cls == ParamClass::INTEGER) { - Emit("movq", saveAreaAddr, "%rax"); - Emit("movq", "%rax", "%r11"); - Emit("movl", gpOffsetAddr, "%eax"); - Emit("cltq"); - Emit("cmpq", 48, "%rax"); - Emit("jae", overflowLabel); - Emit("addq", "%rax", "%r11"); - Emit("addq", 8, "%rax"); - Emit("movl", "%eax", gpOffsetAddr); - Emit("movq", "%r11", "%rax"); - Emit("jmp", endLabel); - } else if (cls == ParamClass::SSE) { - Emit("movq", saveAreaAddr, "%rax"); - Emit("movq", "%rax", "%r11"); - Emit("movl", fpOffsetAddr, "%eax"); - Emit("cltq"); - Emit("cmpq", 176, "%rax"); - Emit("jae", overflowLabel); - Emit("addq", "%rax", "%r11"); - Emit("addq", 16, "%rax"); - Emit("movl", "%eax", fpOffsetAddr); - Emit("movq", "%r11", "%rax"); - Emit("jmp", endLabel); - } else if (cls == ParamClass::MEMORY) { - } else { - Error("internal error"); - } - EmitLabel(overflowLabel); - Emit("movq", overflowAddr, "%rax"); - Emit("movq", "%rax", "%r11"); - // Arguments passed by memory is aligned by at least 8 bytes - Emit("addq", Type::MakeAlign(argType->Width(), 8), "%r11"); - Emit("movq", "%r11", overflowAddr); - EmitLabel(endLabel); - } else { - assert(false); - } -} - - void Generator::VisitFuncCall(FuncCall* funcCall) { - EmitLoc(funcCall); - auto funcType = funcCall->FuncType(); - if (Parser::IsBuiltin(funcType)) - return GenBuiltin(funcCall); - - auto base = offset_; - // Alloc memory for return value if it is struct/union - int retStructOffset; - auto retType = funcCall->Type()->ToStruct(); - if (retType) { - retStructOffset = offset_; - retStructOffset -= retType->Width(); - retStructOffset = Type::MakeAlign(retStructOffset, retType->Align()); - // No!!! you can't suppose that the - // visition of arguments won't change the value of %rdi - //Emit("leaq %d(#rbp), #rdi", offset); - offset_ = retStructOffset; + std::string name = funcCall->Name(); + if(name == "get_program_id"){ + VisitExpr(funcCall->Args()->at(0)); + ir::value* ret = ret_; + if(auto axis = dynamic_cast(ret)) + return set_ret(bld_->create_get_program_id(axis->get_value())); } - - TypeList types; - for (auto arg: funcCall->args_) { - types.push_back(arg->Type()); - } - - const auto& locations = GetParamLocations(types, retType); - // Align stack frame by 16 bytes - const auto& locs = locations.locs_; - auto byMemCnt = locs.size() - locations.regCnt_ - locations.xregCnt_; - - offset_ = Type::MakeAlign(offset_ - byMemCnt * 8, 16) + byMemCnt * 8; - for (int i = locs.size() - 1; i >=0; --i) { - if (locs[i][1] == 'm') { - Visit(funcCall->args_[i]); - Push(funcCall->args_[i]->Type()); - } - } - - for (int i = locs.size() - 1; i >= 0; --i) { - if (locs[i][1] == 'm') - continue; - Visit(funcCall->args_[i]); - Push(funcCall->args_[i]->Type()); - } - - for (const auto& loc: locs) { - if (loc[1] != 'm') - Pop(loc); - } - - // If variadic, set %al to floating param number - if (funcType->Variadic()) { - Emit("movq", locations.xregCnt_, "%rax"); - } - if (retType) { - Emit("leaq", ObjectAddr(retStructOffset), "%rdi"); - } - - Emit("leaq", ObjectAddr(offset_), "%rsp"); - auto addr = LValGenerator().GenExpr(funcCall->Designator()); - if (addr.base_.size() == 0 && addr.offset_ == 0) { - Emit("call", addr.label_); - } else { - Emit("leaq", addr, "%r10"); - Emit("call", "*%r10"); - } - - // Reset stack frame - offset_ = base; + return error_not_implemented(); } - -ParamLocations Generator::GetParamLocations(const TypeList& types, - bool retStruct) { - ParamLocations locations; - - 
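// ----------------------------------------------------------------------------
// In the new VisitFuncCall above, builtins are matched by name rather than
// going through a generic call path. Spelled out with the full dynamic_cast
// type, the get_program_id lowering reads:
//
//   VisitExpr(funcCall->Args()->at(0));                  // evaluate the axis
//   if(auto axis = dynamic_cast<ir::constant_int*>(ret_))
//     return set_ret(bld_->create_get_program_id(axis->get_value()));
//   return error_not_implemented();  // non-constant axes are not supported,
//                                    // so the axis must be a literal here
// ----------------------------------------------------------------------------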
locations.regCnt_ = retStruct; - locations.xregCnt_ = 0; - for (auto type: types) { - auto cls = Classify(type); - - const char* reg = nullptr; - if (cls == ParamClass::INTEGER) { - if (locations.regCnt_ < regs.size()) - reg = regs[locations.regCnt_++]; - } else if (cls == ParamClass::SSE) { - if (locations.xregCnt_ < xregs.size()) - reg = xregs[locations.xregCnt_++]; - } - locations.locs_.push_back(reg ? reg: "%mem"); - } - return locations; +void Generator::VisitObject(Object* obj) { + return error_not_implemented(); } +void Generator::VisitEnumerator(Enumerator* enumer) { + return error_not_implemented(); +} + +void Generator::VisitIdentifier(Identifier* ident) { + return set_ret(mod_->get_value(ident->Name())); +} + +void Generator::VisitConstant(Constant* cons) { + Type* ctype = cons->Type(); + ir::type *type = GenIRType(cons->Type(), *ctx_); + if(ctype->IsInteger()) + return set_ret(ir::constant_int::get(type, cons->IVal())); + if(ctype->IsFloat() && ctype->IsReal()) + return set_ret(ir::constant_fp::get(type, cons->FVal())); + return error_not_implemented(); +} + +void Generator::VisitTempVar(TempVar* tempVar) { + return error_not_implemented(); +} + +// Statement +void Generator::VisitDeclaration(Declaration* decl) { + auto obj = decl->obj_; + // initialize to undef + ir::type* ty = GenIRType(obj->Type(), *ctx_); + ir::value* val = ir::undef_value::get(ty); + // compute initializers + std::vector inits; + for (const Initializer& init: decl->Inits()) { + VisitExpr(init.expr_); + inits.push_back(ret_); + } + // initialize declaration + ir::type::id_t id = ty->get_type_id(); + if(id == ir::type::StructTyID) + assert(false); + if(inits.size() > 1) + assert(false); + val = inits[0]; + assert(val->get_type() == ty); + // update scope symbols table + const std::string &name = obj->Name(); + if(!name.empty()){ + mod_->set_value(name, val); + mod_->get_scope().types[name] = ty; + } +} + +void Generator::VisitEmptyStmt(EmptyStmt*) { + return; +} + +void Generator::VisitIfStmt(IfStmt* ifStmt) { + ir::function *fn = bld_->get_insert_block()->get_parent(); + Stmt *then_ = ifStmt->then_; + Stmt *else_ = ifStmt->else_; + VisitExpr(ifStmt->cond_); + ir::value* cond = ret_; + ir::basic_block *then_bb = ir::basic_block::create(*ctx_, "then", fn); + ir::basic_block *else_bb = else_? 
ir::basic_block::create(*ctx_, "else", fn) : nullptr; + ir::basic_block *endif_bb = ir::basic_block::create(*ctx_, "endif", fn); + // seal blocks + mod_->seal_block(then_bb); + if(else_bb) + mod_->seal_block(else_bb); + // branches + if(else_) + bld_->create_cond_br(cond, then_bb, else_bb); + else + bld_->create_cond_br(cond, then_bb, endif_bb); + // then + bld_->set_insert_point(then_bb); + VisitStmt(then_); + if(!is_terminator(ret_)) + bld_->create_br(endif_bb); + // else + if(else_){ + bld_->set_insert_point(else_bb); + VisitStmt(else_); + if(!is_terminator(ret_)) + bld_->create_br(endif_bb); + } + // endif + mod_->seal_block(endif_bb); + bld_->set_insert_point(endif_bb); +} + +void Generator::VisitJumpStmt(JumpStmt* jumpStmt) { + return error_not_implemented(); +} + +void Generator::VisitReturnStmt(ReturnStmt* returnStmt) { + ir::value *ret; + if(returnStmt->expr_) + return error_not_implemented(); + else + ret = bld_->create_ret_void(); + return set_ret(ret); +} + +void Generator::VisitLabelStmt(LabelStmt* labelStmt) { + return error_not_implemented(); +} + +void Generator::VisitCompoundStmt(CompoundStmt* compoundStmt) { + if (compoundStmt->scope_){ + AllocObjects(compoundStmt->scope_); + pushScope(); + } + for (auto stmt: compoundStmt->stmts_) + Visit(stmt); + if(compoundStmt->scope_) + popScope(); +} void Generator::VisitFuncDef(FuncDef* funcDef) { - curFunc_ = funcDef; - - auto name = funcDef->Name(); - - Emit(".text"); - if (funcDef->Linkage() == L_INTERNAL) { - Emit(".local", name); - } else { - Emit(".globl", name); - } - Emit(".type", name, "@function"); - - EmitLabel(name); - Emit("pushq", "%rbp"); - Emit("movq", "%rsp", "%rbp"); - - offset_ = 0; - - auto& params = funcDef->FuncType()->Params(); - // Arrange space to store params passed by registers - bool retStruct = funcDef->FuncType()->Derived()->ToStruct(); - TypeList types; - for (auto param: params) - types.push_back(param->Type()); - - auto locations = GetParamLocations(types, retStruct); - const auto& locs = locations.locs_; - - if (funcDef->FuncType()->Variadic()) { - GenSaveArea(); // 'offset' is now the begin of save area - if (retStruct) { - retAddrOffset_ = offset_; - offset_ += 8; - } - int regOffset = offset_; - int xregOffset = offset_ + 48; - int byMemOffset = 16; - for (size_t i = 0; i < locs.size(); ++i) { - if (locs[i][1] == 'm') { - params[i]->SetOffset(byMemOffset); - - // TODO(wgtdkp): width of incomplete array ? - // What about the var args, var args offset always increment by 8 - //byMemOffset += 8; - byMemOffset += params[i]->Type()->Width(); - byMemOffset = Type::MakeAlign(byMemOffset, 8); - } else if (locs[i][1] == 'x') { - params[i]->SetOffset(xregOffset); - xregOffset += 16; - } else { - params[i]->SetOffset(regOffset); - regOffset += 8; - } - } - } else { - if (retStruct) { - retAddrOffset_ = Push("%rdi"); - } - int byMemOffset = 16; - for (size_t i = 0; i < locs.size(); ++i) { - if (locs[i][1] == 'm') { - params[i]->SetOffset(byMemOffset); - // TODO(wgtdkp): width of incomplete array ? 
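// ----------------------------------------------------------------------------
// Shape of the control flow emitted by the new VisitIfStmt above: each arm
// falls through to a shared endif block unless it already ends in a
// terminator, and mod_->seal_block() marks a block whose predecessor set is
// final so phi operands can be resolved during SSA construction. Roughly:
//
//   entry: cond = ... ; br cond, then, else   (or: br cond, then, endif)
//   then:  ...        ; br endif
//   else:  ...        ; br endif
//   endif: ...
//
// Relatedly, VisitDeclaration above indexes inits[0] unconditionally; a
// declaration with no initializer leaves inits empty, so keeping the undef
// default behind a guard seems safer:
//
//   if(!inits.empty())
//     val = inits[0];
// ----------------------------------------------------------------------------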
- byMemOffset += params[i]->Type()->Width(); - byMemOffset = Type::MakeAlign(byMemOffset, 8); - continue; - } - params[i]->SetOffset(Push(locs[i])); - } - } - - AllocObjects(funcDef->Body()->Scope(), params); - - for (auto stmt: funcDef->body_->stmts_) { - Visit(stmt); - } - - EmitLabel(funcDef->retLabel_->Repr()); - Emit("leaveq"); - Emit("retq"); + return error_not_implemented(); } - -void Generator::GenSaveArea() { - static const int begin = -176; - int offset = begin; - for (auto reg: regs) { - Emit("movq", reg, ObjectAddr(offset)); - offset += 8; - } - Emit("testb", "%al", "%al"); - auto label = LabelStmt::New(); - Emit("je", label); - for (auto xreg: xregs) { - Emit("movaps", xreg, ObjectAddr(offset)); - offset += 16; - } - assert(offset == 0); - EmitLabel(label->Repr()); - - offset_ = begin; -} - - void Generator::VisitTranslationUnit(TranslationUnit* unit) { - for (auto extDecl: unit->ExtDecls()) { + for (auto extDecl: unit->ExtDecls()) Visit(extDecl); - - // Float and string literal - if (rodatas_.size()) - Emit(".section", ".rodata"); - for (auto rodata: rodatas_) { - if (rodata.align_ == 1) { // Literal - EmitLabel(rodata.label_); - Emit(".string", "\"" + rodata.sval_ + "\""); - } else if (rodata.align_ == 4) { - Emit(".align", "4"); - EmitLabel(rodata.label_); - Emit(".long", std::to_string(static_cast(rodata.ival_))); - } else { - Emit(".align", "8"); - EmitLabel(rodata.label_); - Emit(".quad", std::to_string(rodata.ival_)); - } - } - rodatas_.clear(); - - for (auto staticDecl: staticDecls_) { - GenStaticDecl(staticDecl); - } - staticDecls_.clear(); - } } - -void Generator::Gen() { - Emit(".file", "\"" + filename_in + "\""); +void Generator::Gen(ir::module *mod) { + pushScope(); + mod_ = mod; + ctx_ = &mod_->get_context(); + bld_ = &mod_->get_builder(); + std::unique_ptr assign(new LValAssigner(this)); + assign_ = assign.get(); VisitTranslationUnit(parser_->Unit()); + assign_ = nullptr; } -void Generator::EmitLoc(Expr* expr) { - if (!debug) { - return; - } +// Triton-IR Values - static int fileno = 0; - if (expr->tok_ == nullptr) { - return; - } +ir::value* Generator::GenCastOp(ir::value* op, ir::type* type) { + //TODO + assert(false); + return nullptr; +} - const auto loc = &expr->tok_->loc_; - if (loc->filename_ != last_file) { - Emit(".file", std::to_string(++fileno) + " \"" + *loc->filename_ + "\""); - last_file = loc->filename_; - } - Emit(".loc", std::to_string(fileno) + " " + - std::to_string(loc->line_) + " 0"); +// Triton-IR Types +ir::type* Generator::GenIRType(::Type* type, ir::context& ctx) { + if(auto T = type->ToVoid()) + return ir::type::get_void_ty(ctx); + if(auto T = type->ToArithm()) + return GenIRArithmType(T, ctx); + if(auto T = type->ToArray()) + return GenIRArrayType(T, ctx); + if(auto T = type->ToTile()) + return GenIRTileType(T, ctx); + if(auto T = type->ToFunc()) + return GenIRFuncType(T, ctx); + if(auto T = type->ToPointer()) + return GenIRPointerType(T, ctx); + if(auto T = type->ToStruct()) + return GenIRStructType(T, ctx); + assert(false); + return nullptr; +} - std::string line; - for (const char* p = loc->lineBegin_; *p && *p != '\n'; ++p) - line.push_back(*p); - Emit("# " + line); +ir::type* Generator::GenIRArithmType(ArithmType* type, ir::context& ctx) { + int tag = type->Tag(); + if(tag & T_BOOL) + return ir::type::get_int1_ty(ctx); + if(tag & T_CHAR) + return ir::type::get_int8_ty(ctx); + if(tag & T_SHORT) + return ir::type::get_int16_ty(ctx); + if(tag & T_INT) + return ir::type::get_int32_ty(ctx); + if(tag & T_LONG) + return 
ir::type::get_int64_ty(ctx); + if(tag & T_HALF) + return ir::type::get_half_ty(ctx); + if(tag & T_FLOAT) + return ir::type::get_float_ty(ctx); + if(tag & T_DOUBLE) + return ir::type::get_double_ty(ctx); + assert(false); + return nullptr; +} + +ir::type* Generator::GenIRArrayType(ArrayType* type, ir::context& ctx) { + assert(false); + return nullptr; +} + +ir::type* Generator::GenIRTileType(TileType* type, ir::context& ctx) { + ir::type* ele_ty = GenIRType(type->Derived().GetPtr(), ctx); + auto _shape = type->Shape(); + ir::tile_type::tile_shapes_t shape; + ir::type* int32_ty = ir::type::get_int32_ty(ctx); + for(int s: _shape) + shape.push_back(ir::constant_int::get(int32_ty, s)); + return ir::tile_type::get(ele_ty, shape); +} + +ir::type* Generator::GenIRFuncType(FuncType* type, ir::context& ctx) { + ir::type* ret_ty = GenIRType(type->Derived().GetPtr(), ctx); + std::vector param_tys; + for(Object* obj: type->Params()) + param_tys.push_back(GenIRType(obj->Type(), ctx)); + return ir::function_type::get(ret_ty, param_tys); +} + +ir::type* Generator::GenIRPointerType(PointerType* type, ir::context& ctx) { + ir::type* ele_ty = GenIRType(type->Derived().GetPtr(), ctx); + unsigned addr_space = 0; + return ir::pointer_type::get(ele_ty, addr_space); +} + +ir::type* Generator::GenIRStructType(StructType* type, ir::context& ctx) { + assert(false); + return nullptr; +} + +void Generator::AllocObjects(Scope* scope, const FuncDef::ParamList& params) { + return error_not_implemented(); +} + +// SSA +void Generator::pushScope() { + mod_->add_new_scope(); +} + +void Generator::popScope() { + mod_->pop_scope(); +} + +// LValue Generator +void LValAssigner::VisitBinaryOp(BinaryOp* binary) { + error_not_implemented(); +} + +void LValAssigner::VisitUnaryOp(UnaryOp* unary) { + if(unary->op_ != Token::DEREF) + should_not_happen(); + gen_->VisitExpr(unary->operand_); + ir::value* addr = gen_->ret_; + ret_ = gen_->bld_->create_store(addr, rhs_); +} + +void LValAssigner::VisitObject(Object* obj) { + error_not_implemented(); +} + +void LValAssigner::VisitIdentifier(Identifier* ident) { + std::string name = ident->Name(); + gen_->mod_->set_value(name, rhs_); } -void Generator::EmitLoad(const std::string& addr, Type* type) { - assert(type->IsScalar()); - EmitLoad(addr, type->Width(), type->IsFloat()); -} - -void Generator::EmitLoad(const std::string& addr, int width, bool flt) { - auto load = GetLoad(width, flt); - auto des = GetDes(width == 4 ? 
4: 8, flt); - Emit(load, addr, des); -} - - -void Generator::EmitStore(const ObjectAddr& addr, Type* type) { - if (addr.bitFieldWidth_ != 0) { - EmitStoreBitField(addr, type); - } else { - EmitStore(addr.Repr(), type); - } -} - - -void Generator::EmitStore(const std::string& addr, Type* type) { - EmitStore(addr, type->Width(), type->IsFloat()); -} - - -void Generator::EmitStore(const std::string& addr, int width, bool flt) { - auto store = GetInst("mov", width, flt); - auto des = GetDes(width, flt); - Emit(store, des, addr); -} - - -void Generator::EmitLabel(const std::string& label) { - fprintf(outFile_, "%s:\n", label.c_str()); -} - - -void Generator::EmitZero(ObjectAddr addr, int width) { - int units[] = {8, 4, 2, 1}; - Emit("xorq", "%rax", "%rax"); - for (auto unit: units) { - while (width >= unit) { - EmitStore(addr.Repr(), unit, false); - addr.offset_ += unit; - width -= unit; - } - } -} - - -void LValGenerator::VisitBinaryOp(BinaryOp* binary) { - EmitLoc(binary); - assert(binary->op_ == '.'); - - addr_ = LValGenerator().GenExpr(binary->lhs_); - const auto& name = binary->rhs_->Tok()->str_; - auto structType = binary->lhs_->Type()->ToStruct(); - auto member = structType->GetMember(name); - - addr_.offset_ += member->Offset(); - addr_.bitFieldBegin_ = member->bitFieldBegin_; - addr_.bitFieldWidth_ = member->bitFieldWidth_; -} - - -void LValGenerator::VisitUnaryOp(UnaryOp* unary) { - EmitLoc(unary); - assert(unary->op_ == Token::DEREF); - Generator().VisitExpr(unary->operand_); - Emit("movq", "%rax", "%r10"); - addr_ = {"", "%r10", 0}; -} - - -void LValGenerator::VisitObject(Object* obj) { - EmitLoc(obj); - if (!obj->IsStatic() && obj->Anonymous()) { - assert(obj->Decl()); - Generator().Visit(obj->Decl()); - obj->SetDecl(nullptr); - } - - if (obj->IsStatic()) { - addr_ = {obj->Repr(), "%rip", 0}; - } else { - addr_ = {"", "%rbp", obj->Offset()}; - } -} - - -// The identifier must be function -void LValGenerator::VisitIdentifier(Identifier* ident) { - assert(!ident->ToTypeName()); - EmitLoc(ident); - // Function address - addr_ = {ident->Name(), "", 0}; -} - - -void LValGenerator::VisitTempVar(TempVar* tempVar) { - std::string label; - switch (tempVar->Type()->Width()) { - case 1: label = "%cl"; break; - case 2: label = "%cx"; break; - case 4: label = "%ecx"; break; - case 8: label = "%rcx"; break; - default: assert(false); - } - addr_ = {label, "", 0}; -} - - -std::string ObjectAddr::Repr() const { - auto ret = base_.size() ? 
"(" + base_ + ")": ""; - if (label_.size() == 0) { - if (offset_ == 0) { - return ret; - } - return std::to_string(offset_) + ret; - } else { - if (offset_ == 0) { - return label_ + ret; - } - return label_ + "+" + std::to_string(offset_) + ret; - } -} - - -StaticInitializer Generator::GetStaticInit(InitList::iterator& iter, - InitList::iterator end, - int offset) { - auto init = iter++; - auto width = init->type_->Width(); - if (init->type_->IsInteger()) { - if (init->bitFieldWidth_ == 0) { - auto val = Evaluator().Eval(init->expr_); - return {init->offset_, width, val, ""}; - } - int totalBits = 0; - unsigned char val = 0; - while (init != end && init->offset_ <= offset && totalBits < 8) { - auto bitVal = Evaluator().Eval(init->expr_); - auto begin = init->bitFieldBegin_; - auto width = init->bitFieldWidth_; - auto valBegin = 0; - auto valWidth = 0; - auto mask = 0UL; - if (init->offset_ < offset) { - begin = 0; - width -= (8 - init->bitFieldBegin_); - if (offset - init->offset_ > 1) - width -= (offset - init->offset_ - 1) * 8; - valBegin = init->bitFieldWidth_ - width; - } - valWidth = std::min(static_cast(8 - begin), width); - mask = Object::BitFieldMask(valBegin, valWidth); - val |= ((bitVal & mask) >> valBegin) << begin; - totalBits = begin + valWidth; - if (width - valWidth <= 0) - ++init; - } - iter = init; - return {offset, 1, val, ""}; - } else if (init->type_->IsFloat()) { - auto val = Evaluator().Eval(init->expr_); - auto lval = *reinterpret_cast(&val); - return {init->offset_, width, lval, ""}; - } else if (init->type_->ToPointer()) { - auto addr = Evaluator().Eval(init->expr_); - return {init->offset_, width, addr.offset_, addr.label_}; - } else { // Struct initializer - Error(init->expr_, "initializer element is not constant"); - return StaticInitializer(); // Make compiler happy - } -} diff --git a/lib/lang/wgtcc/main.cc b/lib/lang/wgtcc/main.cc index 72e2000ef..cc02588f6 100644 --- a/lib/lang/wgtcc/main.cc +++ b/lib/lang/wgtcc/main.cc @@ -28,226 +28,3 @@ static std::list gcc_filenames_in; static std::list gcc_args; static std::list defines; static std::list include_paths; - - -static void Usage() { - printf("Usage: wgtcc [options] file...\n" - "Options: \n" - " -h Display this information\n" - " -D Define object like macro\n" - " -I Add search path\n" - " -E Preprocess only; do not compile, assemble or link\n" - " -S Compile only; do not assemble or link\n" - " -o specify output file\n"); - - exit(0); -} - - -static std::string GetExtension(const std::string& filename) { - return filename.substr(filename.size() >= 2 ? 
filename.size() - 2 : 0);
-}
-
-
-static void ValidateFileName(const std::string& filename) {
-  auto ext = GetExtension(filename);
-  if (ext != ".c" && ext != ".s" && ext != ".o" && ext != ".a")
-    Error("bad file name format:'%s'", filename.c_str());
-}
-
-
-static void DefineMacro(Preprocessor& cpp, const std::string& def) {
-  auto pos = def.find('=');
-  std::string macro;
-  std::string* replace;
-  if (pos == std::string::npos) {
-    macro = def;
-    replace = new std::string();
-  } else {
-    macro = def.substr(0, pos);
-    replace = new std::string(def.substr(pos + 1));
-  }
-  cpp.AddMacro(macro, replace);
-}
-
-
-static std::string GetName(const std::string& path) {
-  auto pos = path.rfind('/');
-  if (pos == std::string::npos)
-    return path;
-  return path.substr(pos + 1);
-}
-
-static int RunWgtcc() {
-  if (GetExtension(filename_in) != ".c")
-    return -3;
-
-  Preprocessor cpp(&filename_in);
-  for (auto& def: defines)
-    DefineMacro(cpp, def);
-  for (auto& path: include_paths)
-    cpp.AddSearchPath(path);
-
-  FILE* fp = stdout;
-  if (specified_out_name) {
-    fp = fopen(filename_out.c_str(), "w");
-  }
-  TokenSequence ts;
-  cpp.Process(ts);
-  if (only_preprocess) {
-    ts.Print(fp);
-    return 0;
-  }
-
-  if (!only_compile || !specified_out_name) {
-    filename_out = GetName(filename_in);
-    filename_out.back() = 's';
-  }
-  fp = fopen(filename_out.c_str(), "w");
-
-  Parser parser(ts);
-  parser.Parse();
-  Generator::SetInOut(&parser, fp);
-  Generator().Gen();
-  fclose(fp);
-  return 0;
-}
-
-
-static int RunGcc() {
-  // Force C11
-  bool spec_std = false;
-  for (auto& arg: gcc_args) {
-    if (arg.substr(0, 4) == "-std") {
-      arg = "-std=c11";
-      spec_std = true;
-    }
-  }
-  if (!spec_std) {
-    gcc_args.push_front("-std=c11");
-  }
-
-  std::string systemArg = "gcc";
-  for (const auto& arg: gcc_args) {
-    systemArg += " " + arg;
-  }
-  auto ret = system(systemArg.c_str());
-  return ret;
-}
-
-
-static void ParseInclude(int argc, char* argv[], int& i) {
-  if (argv[i][2]) {
-    include_paths.push_front(&argv[i][2]);
-    return;
-  }
-
-  if (i == argc - 1) {
-    Error("missing argument to '%s'", argv[i]);
-  }
-  include_paths.push_front(argv[++i]);
-  gcc_args.push_back(argv[i]);
-}
-
-
-static void ParseDefine(int argc, char* argv[], int& i) {
-  if (argv[i][2]) {
-    defines.push_back(&argv[i][2]);
-    return;
-  }
-
-  if (i == argc - 1)
-    Error("missing argument to '%s'", argv[i]);
-  defines.push_back(argv[++i]);
-  gcc_args.push_back(argv[i]);
-}
-
-
-static void ParseOut(int argc, char* argv[], int& i) {
-  if (i == argc - 1)
-    Error("missing argument to '%s'", argv[i]);
-  filename_out = argv[++i];
-  gcc_args.push_back(argv[i]);
-}
-
-
-/* Use:
- *   wgtcc: compile
- *   gcc: assemble and link
- * Allowing multiple files may not be a good idea...
- */
-int main(int argc, char* argv[]) {
-  if (argc < 2)
-    Usage();
-
-  program = std::string(argv[0]);
-  for (auto i = 1; i < argc; ++i) {
-    if (argv[i][0] != '-') {
-      filename_in = std::string(argv[i]);
-      ValidateFileName(filename_in);
-      filenames_in.push_back(filename_in);
-      continue;
-    }
-
-    gcc_args.push_back(argv[i]);
-    switch (argv[i][1]) {
-    case 'h': Usage(); break;
-    case 'E': only_preprocess = true; break;
-    case 'S': only_compile = true; break;
-    case 'I': ParseInclude(argc, argv, i); break;
-    case 'D': ParseDefine(argc, argv, i); break;
-    case 'o':
-      specified_out_name = true;
-      ParseOut(argc, argv, i); break;
-    case 'g': gcc_args.pop_back(); debug = true; break;
-    default:;
-    }
-  }
-
-#ifdef DEBUG
-  RunWgtcc();
-#else
-  for (const auto& filename: filenames_in) {
-    filename_in = filename;
-    pid_t pid = fork();
-    if (pid < 0) {
-      Error("fork error");
-    } else if (pid == 0) {
-      // Do work in child process
-      return RunWgtcc();
-    }
-  }
-
-  for (size_t i = 0; i < filenames_in.size(); ++i) {
-    int stat;
-    wait(&stat);
-    // A child process terminates normally if:
-    // 1. it terminates with `exit()`, that is, WIFEXITED(stat) is true.
-    // 2. the status code is 0, that is, WEXITSTATUS(stat) == 0
-    if (!WIFEXITED(stat) || WEXITSTATUS(stat))
-      return 0;
-  }
-#endif
-
-  if (only_preprocess || only_compile) {
-    if (specified_out_name && filenames_in.size() > 1)
-      Error("cannot specify an output filename with multiple input files");
-    return 0;
-  }
-
-  std::list<std::string> filenames_out;
-  for (auto& filename: filenames_in) {
-    if (GetExtension(filename) == ".c") {
-      gcc_args.push_back(GetName(filename));
-      gcc_args.back().back() = 's';
-    } else {
-      gcc_args.clear();
-      for (int i = 1; i < argc; ++i)
-        gcc_args.push_back(argv[i]);
-      break;
-    }
-  }
-  auto ret = RunGcc();
-  remove(filename_out.c_str());
-  return ret;
-}
diff --git a/lib/lang/wgtcc/t_code_gen.cc b/lib/lang/wgtcc/t_code_gen.cc
deleted file mode 100644
index 4d78944e8..000000000
--- a/lib/lang/wgtcc/t_code_gen.cc
+++ /dev/null
@@ -1,8 +0,0 @@
-#include "triton/lang/wgtcc/t_code_gen.h"
-#include "triton/lang/wgtcc/evaluator.h"
-#include "triton/lang/wgtcc/parser.h"
-#include "triton/lang/wgtcc/token.h"
-
-void Generator::Gen() {
-  VisitTranslationUnit(parser_->Unit());
-}

From a6ec8072232dd42237e6959f809e7008283cafa8 Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Wed, 21 Aug 2019 21:53:41 -0700
Subject: [PATCH 311/494] more debugging

---
 include/triton/lang/wgtcc/ast.h | 19 ++++
 include/triton/lang/wgtcc/code_gen.h | 42 ++++----
 include/triton/lang/wgtcc/evaluator.h | 2 +
 include/triton/lang/wgtcc/parser.h | 2 +-
 include/triton/lang/wgtcc/visitor.h | 2 +
 lib/lang/wgtcc/ast.cc | 13 +++
 lib/lang/wgtcc/code_gen.cc | 135 ++++++++++++++++++++++++--
 lib/lang/wgtcc/parser.cc | 61 ++++--------
 lib/lang/wgtcc/type.cc | 10 +-
 lib/runtime/function.cpp | 4 +
 10 files changed, 213 insertions(+), 77 deletions(-)

diff --git a/include/triton/lang/wgtcc/ast.h b/include/triton/lang/wgtcc/ast.h
index fc5dca330..5e3096d96 100644
--- a/include/triton/lang/wgtcc/ast.h
+++ b/include/triton/lang/wgtcc/ast.h
@@ -138,6 +138,25 @@ private:
   Stmt* else_;
 };
 
+class ForStmt: public Stmt {
+  template<typename T> friend class Evaluator;
+  friend class AddrEvaluator;
+  friend class Generator;
+public:
+  static ForStmt* New(Stmt* body, Stmt* init = nullptr, Expr* cond = nullptr, Expr* step = nullptr);
+  virtual ~ForStmt() {}
+  virtual void Accept(Visitor* v);
+
+protected:
+  ForStmt(Stmt* body, Stmt* init = nullptr, Expr* cond = nullptr, Expr* step = nullptr)
+    : body_(body), init_(init), 
cond_(cond), step_(step) {}
+
+private:
+  Stmt* body_;
+  Stmt* init_;
+  Expr* cond_;
+  Expr* step_;
+};
 
 class JumpStmt : public Stmt {
   template<typename T> friend class Evaluator;
diff --git a/include/triton/lang/wgtcc/code_gen.h b/include/triton/lang/wgtcc/code_gen.h
index ff9b0fab2..863f91eed 100644
--- a/include/triton/lang/wgtcc/code_gen.h
+++ b/include/triton/lang/wgtcc/code_gen.h
@@ -31,7 +31,7 @@ using StaticInitList = std::vector<StaticInitializer>;
 
 // Error
 inline void should_not_happen() { assert(false); }
-inline void error_not_implemented() { assert(false); }
+inline void error_not_implemented() { throw std::runtime_error("not implemented"); }
 
 class Generator: public Visitor {
   friend class Evaluator;
@@ -48,32 +48,33 @@ protected:
 
 public:
   Generator(Parser* parser) : parser_(parser) {}
-  virtual void Visit(ASTNode* node) { node->Accept(this); }
+  void Visit(ASTNode* node) { node->Accept(this); }
   void VisitExpr(Expr* expr) { expr->Accept(this); }
   void VisitStmt(Stmt* stmt) { stmt->Accept(this); }
 
   // Expression
-  virtual void VisitBinaryOp(BinaryOp* binaryOp);
-  virtual void VisitUnaryOp(UnaryOp* unaryOp);
-  virtual void VisitConditionalOp(ConditionalOp* condOp);
-  virtual void VisitFuncCall(FuncCall* funcCall);
-  virtual void VisitObject(Object* obj);
-  virtual void VisitEnumerator(Enumerator* enumer);
-  virtual void VisitIdentifier(Identifier* ident);
-  virtual void VisitConstant(Constant* cons);
-  virtual void VisitTempVar(TempVar* tempVar);
+  void VisitBinaryOp(BinaryOp* binaryOp);
+  void VisitUnaryOp(UnaryOp* unaryOp);
+  void VisitConditionalOp(ConditionalOp* condOp);
+  void VisitFuncCall(FuncCall* funcCall);
+  void VisitObject(Object* obj);
+  void VisitEnumerator(Enumerator* enumer);
+  void VisitIdentifier(Identifier* ident);
+  void VisitConstant(Constant* cons);
+  void VisitTempVar(TempVar* tempVar);
 
   // Statement
-  virtual void VisitDeclaration(Declaration* init);
-  virtual void VisitEmptyStmt(EmptyStmt* emptyStmt);
-  virtual void VisitIfStmt(IfStmt* ifStmt);
-  virtual void VisitJumpStmt(JumpStmt* jumpStmt);
-  virtual void VisitReturnStmt(ReturnStmt* returnStmt);
-  virtual void VisitLabelStmt(LabelStmt* labelStmt);
-  virtual void VisitCompoundStmt(CompoundStmt* compoundStmt);
+  void VisitDeclaration(Declaration* init);
+  void VisitEmptyStmt(EmptyStmt* emptyStmt);
+  void VisitIfStmt(IfStmt* ifStmt);
+  void VisitForStmt(ForStmt* forStmt);
+  void VisitJumpStmt(JumpStmt* jumpStmt);
+  void VisitReturnStmt(ReturnStmt* returnStmt);
+  void VisitLabelStmt(LabelStmt* labelStmt);
+  void VisitCompoundStmt(CompoundStmt* compoundStmt);
 
-  virtual void VisitFuncDef(FuncDef* funcDef);
-  virtual void VisitTranslationUnit(TranslationUnit* unit);
+  void VisitFuncDef(FuncDef* funcDef);
+  void VisitTranslationUnit(TranslationUnit* unit);
 
   void Gen(ir::module *mod);
@@ -127,6 +128,7 @@ public:
   void VisitDeclaration(Declaration*) { should_not_happen(); }
   void VisitEmptyStmt(EmptyStmt*) { should_not_happen(); }
   void VisitIfStmt(IfStmt*) { should_not_happen(); }
+  void VisitForStmt(ForStmt*) { should_not_happen(); }
   void VisitJumpStmt(JumpStmt*) { should_not_happen(); }
   void VisitReturnStmt(ReturnStmt*) { should_not_happen(); }
   void VisitLabelStmt(LabelStmt*) { should_not_happen(); }
diff --git a/include/triton/lang/wgtcc/evaluator.h b/include/triton/lang/wgtcc/evaluator.h
index 620539169..6269e66d8 100644
--- a/include/triton/lang/wgtcc/evaluator.h
+++ b/include/triton/lang/wgtcc/evaluator.h
@@ -45,6 +45,7 @@ public:
   // We should probably assert here
   virtual void VisitDeclaration(Declaration* init) {}
   virtual void VisitIfStmt(IfStmt* ifStmt) {}
+  virtual void VisitForStmt(ForStmt* forStmt) {}
   virtual void VisitJumpStmt(JumpStmt* jumpStmt) {}
   virtual void VisitReturnStmt(ReturnStmt* returnStmt) {}
   virtual void VisitLabelStmt(LabelStmt* labelStmt) {}
@@ -100,6 +101,7 @@ public:
   // We should probably assert here
   virtual void VisitDeclaration(Declaration* init) {}
   virtual void VisitIfStmt(IfStmt* ifStmt) {}
+  virtual void VisitForStmt(ForStmt* forStmt) {}
   virtual void VisitJumpStmt(JumpStmt* jumpStmt) {}
   virtual void VisitReturnStmt(ReturnStmt* returnStmt) {}
   virtual void VisitLabelStmt(LabelStmt* labelStmt) {}
diff --git a/include/triton/lang/wgtcc/parser.h b/include/triton/lang/wgtcc/parser.h
index 8c21af727..92ed7c38e 100644
--- a/include/triton/lang/wgtcc/parser.h
+++ b/include/triton/lang/wgtcc/parser.h
@@ -146,7 +146,7 @@ public:
   CompoundStmt* ParseSwitchStmt();
   CompoundStmt* ParseWhileStmt();
   CompoundStmt* ParseDoStmt();
-  CompoundStmt* ParseForStmt();
+  ForStmt *ParseForStmt();
   JumpStmt* ParseGotoStmt();
   JumpStmt* ParseContinueStmt();
   JumpStmt* ParseBreakStmt();
diff --git a/include/triton/lang/wgtcc/visitor.h b/include/triton/lang/wgtcc/visitor.h
index d3c4131a7..e761e9c0f 100644
--- a/include/triton/lang/wgtcc/visitor.h
+++ b/include/triton/lang/wgtcc/visitor.h
@@ -14,6 +14,7 @@ class TempVar;
 
 class Declaration;
 class IfStmt;
+class ForStmt;
 class JumpStmt;
 class ReturnStmt;
 class LabelStmt;
@@ -38,6 +39,7 @@ public:
 
   virtual void VisitDeclaration(Declaration* init) = 0;
   virtual void VisitIfStmt(IfStmt* ifStmt) = 0;
+  virtual void VisitForStmt(ForStmt* forStmt) = 0;
   virtual void VisitJumpStmt(JumpStmt* jumpStmt) = 0;
   virtual void VisitReturnStmt(ReturnStmt* returnStmt) = 0;
   virtual void VisitLabelStmt(LabelStmt* labelStmt) = 0;
diff --git a/lib/lang/wgtcc/ast.cc b/lib/lang/wgtcc/ast.cc
index d194d4c0f..5cd958c80 100644
--- a/lib/lang/wgtcc/ast.cc
+++ b/lib/lang/wgtcc/ast.cc
@@ -18,6 +18,7 @@ static MemPoolImp<TempVar> tempVarPool;
 static MemPoolImp<UnaryOp> unaryOpPool;
 static MemPoolImp<EmptyStmt> emptyStmtPool;
 static MemPoolImp<IfStmt> ifStmtPool;
+static MemPoolImp<ForStmt> forStmtPool;
 static MemPoolImp<JumpStmt> jumpStmtPool;
 static MemPoolImp<ReturnStmt> returnStmtPool;
 static MemPoolImp<LabelStmt> labelStmtPool;
@@ -48,6 +49,10 @@ void IfStmt::Accept(Visitor* v) {
   v->VisitIfStmt(this);
 }
 
+void ForStmt::Accept(Visitor* v) {
+  v->VisitForStmt(this);
+}
+
 
 void JumpStmt::Accept(Visitor* v) {
   v->VisitJumpStmt(this);
@@ -396,6 +401,7 @@ void BinaryOp::AdditiveOpTypeChecking() {
   ::Type* rhsScalType = TryExtractScalarType(this, rhs_);
   auto lhsPtrType = lhsScalType->ToPointer();
   auto rhsPtrType = rhsScalType->ToPointer();
+  std::cout << "adding" << std::endl;
   if (lhsPtrType) {
     if (op_ == '-') {
       if (rhsPtrType) {
@@ -430,6 +436,7 @@
 }
 
 void BinaryOp::RangeOpTypeChecking() {
+  std::cout << "range" << std::endl;
   auto lhsType = lhs_->Type()->ToArithm();
   auto rhsType = rhs_->Type()->ToArithm();
   if(!lhsType || !lhsType->IsInteger() || !rhsType || !rhsType->IsInteger())
@@ -546,6 +553,7 @@ void BinaryOp::AssignOpTypeChecking() {
   // The other constraints are left to the cast operator
   rhs_ = Expr::MayCast(rhs_, ScalarOrLikeTile(rhs_, lhsScalType));
   type_ = lhs_->Type();
+  Broadcast();
 }
 
@@ -969,6 +977,7 @@ CompoundStmt* CompoundStmt::New(std::list<Stmt*>& stmts, ::Scope* scope) {
   return ret;
 }
 
+ForStmt* ForStmt::New(Stmt* body, Stmt* init, Expr* cond, Expr* step) {
+  auto ret = new (forStmtPool.Alloc()) ForStmt(body, init, cond, step);
+  ret->pool_ = &forStmtPool;
+  return ret;
+}
 
 JumpStmt* JumpStmt::New(LabelStmt* label) {
   auto ret = new (jumpStmtPool.Alloc()) 
JumpStmt(label); diff --git a/lib/lang/wgtcc/code_gen.cc b/lib/lang/wgtcc/code_gen.cc index e28ccdb53..1d3783a38 100644 --- a/lib/lang/wgtcc/code_gen.cc +++ b/lib/lang/wgtcc/code_gen.cc @@ -3,6 +3,7 @@ #include "triton/lang/wgtcc/parser.h" #include "triton/lang/wgtcc/token.h" #include "triton/ir/module.h" +#include "triton/ir/function.h" // Helpers void Generator::set_ret(ir::value* value) { @@ -25,10 +26,12 @@ void Generator::VisitBinaryOp(BinaryOp* binary) { Visit(binary->lhs_); ir::value* lhs = ret_; + // op info auto type = binary->lhs_->Type(); auto flt = type->IsFloat(); auto sign = !type->IsUnsigned(); + // return switch(binary->op_){ case Token::LOGICAL_AND: return set_ret(bld_->create_and(lhs, rhs)); @@ -40,6 +43,13 @@ void Generator::VisitBinaryOp(BinaryOp* binary) { case Token::RIGHT: return set_ret(bld_->create_lshr(lhs, rhs)); case '.': return error_not_implemented(); case ',': return error_not_implemented(); + case Token::ELLIPSIS: { + auto clhs = dynamic_cast(lhs); + auto crhs = dynamic_cast(rhs); + if(!clhs || !crhs) + should_not_happen(); + return set_ret(ir::constant_range::get(clhs, crhs)); + } case '+': if(binary->lhs_->Type()->ToPointer()) return set_ret(bld_->create_gep(lhs, {rhs})); @@ -210,6 +220,14 @@ void Generator::VisitDeclaration(Declaration* decl) { if(inits.size() > 1) assert(false); val = inits[0]; + std::cout << obj->Name() << " " << val->get_type()->get_type_id() << " " << ty->get_type_id() << std::endl; + if(val->get_type()->is_tile_ty() && ty->is_tile_ty()) { + for(auto s: val->get_type()->get_tile_shapes()) + std::cout << s->get_value() << std::endl; + std::cout << "---" << std::endl; + for(auto s: ty->get_tile_shapes()) + std::cout << s->get_value() << std::endl; + } assert(val->get_type() == ty); // update scope symbols table const std::string &name = obj->Name(); @@ -258,6 +276,38 @@ void Generator::VisitIfStmt(IfStmt* ifStmt) { bld_->set_insert_point(endif_bb); } +void Generator::VisitForStmt(ForStmt *forStmt) { + Stmt *init_ = forStmt->init_; + Expr *cond_ = forStmt->cond_; + Expr *step_ = forStmt->step_; + Stmt *body_ = forStmt->body_; + ir::basic_block *current_bb = bld_->get_insert_block(); + ir::function *fn = current_bb->get_parent(); + ir::basic_block *loop_bb = ir::basic_block::create(*ctx_, "loop", fn); + ir::basic_block *next_bb = ir::basic_block::create(*ctx_, "postloop", fn); + mod_->set_continue_fn([&](){ + if(step_) + VisitExpr(step_); + VisitExpr(cond_); + ir::value *cond = ret_; + return bld_->create_cond_br(cond, loop_bb, next_bb); + }); + VisitStmt(init_); + VisitExpr(cond_); + ir::value *cond = ret_; + bld_->create_cond_br(cond, loop_bb, next_bb); + bld_->set_insert_point(loop_bb); + VisitStmt(body_); + if(!is_terminator(ret_)) + mod_->get_continue_fn()(); + ir::basic_block *stop_bb = bld_->get_insert_block(); + mod_->seal_block(stop_bb); + mod_->seal_block(loop_bb); + mod_->seal_block(bld_->get_insert_block()); + mod_->seal_block(next_bb); + bld_->set_insert_point(next_bb); +} + void Generator::VisitJumpStmt(JumpStmt* jumpStmt) { return error_not_implemented(); } @@ -277,7 +327,7 @@ void Generator::VisitLabelStmt(LabelStmt* labelStmt) { void Generator::VisitCompoundStmt(CompoundStmt* compoundStmt) { if (compoundStmt->scope_){ - AllocObjects(compoundStmt->scope_); +// AllocObjects(compoundStmt->scope_); pushScope(); } for (auto stmt: compoundStmt->stmts_) @@ -287,32 +337,99 @@ void Generator::VisitCompoundStmt(CompoundStmt* compoundStmt) { } void Generator::VisitFuncDef(FuncDef* funcDef) { - return error_not_implemented(); + Stmt 
*body = funcDef->body_; + const std::string& name = funcDef->Name(); + FuncType* type = funcDef->FuncType(); + auto prototype = dynamic_cast(GenIRType(type, *ctx_)); + if(!prototype) + should_not_happen(); + ir::function *fn = mod_->get_or_insert_function(name, prototype); + std::vector args = fn->args(); + size_t i = 0; + for(Object* obj: type->Params()){ + std::string name = obj->Name(); + args[i]->set_name(name); + mod_->set_value(name, nullptr, args[i]); + mod_->get_scope().types[name] = args[i]->get_type(); + } + ir::basic_block *entry = ir::basic_block::create(mod_->get_context(), "entry", fn); + mod_->seal_block(entry); + mod_->get_builder().set_insert_point(entry); + VisitStmt(body); + if(!dynamic_cast(ret_)) + mod_->get_builder().create_ret_void(); } void Generator::VisitTranslationUnit(TranslationUnit* unit) { + pushScope(); for (auto extDecl: unit->ExtDecls()) Visit(extDecl); + popScope(); } void Generator::Gen(ir::module *mod) { - pushScope(); mod_ = mod; ctx_ = &mod_->get_context(); bld_ = &mod_->get_builder(); - std::unique_ptr assign(new LValAssigner(this)); - assign_ = assign.get(); + assign_ = new LValAssigner(this); VisitTranslationUnit(parser_->Unit()); + delete assign_; assign_ = nullptr; } // Triton-IR Values -ir::value* Generator::GenCastOp(ir::value* op, ir::type* type) { - //TODO - assert(false); - return nullptr; +ir::value* Generator::GenCastOp(ir::value* src, ir::type* dst_ty) { + if(dst_ty->is_tile_ty()) { + auto dst_shapes = dst_ty->get_tile_shapes(); + if(!src->get_type()->is_tile_ty()) + return bld_->create_splat(src, dst_shapes); + auto src_shapes = src->get_type()->get_tile_shapes(); + if(src_shapes.size() != dst_shapes.size()) + return bld_->create_reshape(src, dst_shapes); + else + return bld_->create_broadcast(src, dst_shapes); + } + ir::type *src_scalar_ty = src->get_type()->get_scalar_ty(); + ir::type *dst_scalar_ty = dst_ty->get_scalar_ty(); + bool src_signed = false; + bool dst_signed = false; + + if(src->get_type()->is_tile_ty()) + dst_ty = ir::tile_type::get_same_shapes(dst_scalar_ty, src->get_type()); + + if(src_scalar_ty == dst_scalar_ty) + return src; + + else if(src_scalar_ty->is_integer_ty() && src_signed && dst_scalar_ty->is_floating_point_ty()) + return bld_->create_si_to_fp(src, dst_ty); + + else if(src_scalar_ty->is_integer_ty() && !src_signed && dst_scalar_ty->is_floating_point_ty()) + return bld_->create_ui_to_fp(src, dst_ty); + + else if(src_scalar_ty->is_floating_point_ty() && dst_scalar_ty->is_integer_ty() && dst_signed) + return bld_->create_fp_to_si(src, dst_ty); + + else if(src_scalar_ty->is_floating_point_ty() && dst_scalar_ty->is_integer_ty() && !dst_signed) + return bld_->create_fp_to_ui(src, dst_ty); + + else if(src_scalar_ty->is_floating_point_ty() && dst_scalar_ty->is_floating_point_ty() && + src_scalar_ty->get_fp_mantissa_width() < dst_scalar_ty->get_fp_mantissa_width()) + return bld_->create_fp_ext(src, dst_ty); + + else if(src_scalar_ty->is_floating_point_ty() && dst_scalar_ty->is_floating_point_ty() && + src_scalar_ty->get_fp_mantissa_width() > dst_scalar_ty->get_fp_mantissa_width()) + return bld_->create_fp_trunc(src, dst_ty); + + else if(src_scalar_ty->is_integer_ty() && dst_scalar_ty->is_integer_ty() && + src_scalar_ty->get_integer_bitwidth()) + return bld_->create_int_cast(src, dst_ty, dst_signed); + + else{ + should_not_happen(); + return nullptr; + } } // Triton-IR Types diff --git a/lib/lang/wgtcc/parser.cc b/lib/lang/wgtcc/parser.cc index cf1e582fc..f1fb52228 100644 --- a/lib/lang/wgtcc/parser.cc +++ 
b/lib/lang/wgtcc/parser.cc
@@ -450,8 +450,9 @@ Expr* Parser::ParseSubScripting(Expr* lhs) {
   // create ret shape
   TileType::ShapeInt shape;
   size_t i = 0;
+  const Token* tok;
   do {
-    auto tok = ts_.Next();
+    tok = ts_.Next();
     if(tok->tag_ == ':')
       shape.push_back(lhsShape[i++]);
     else if(tok->tag_ == Token::NEWAXIS)
@@ -460,6 +461,8 @@
       Error(tok, "only ':' and newaxis are supported in subscripts");
   }while(ts_.Try(','));
   ts_.Expect(']');
+//  if(lhsShape.size() > i)
+//    Error(tok, "broadcasting not using all operand axes");
   // create ret tile
   TileType *retType = TileType::New(shape, lhsQual);
   return UnaryOp::New(Token::CAST, lhs, retType);
@@ -2298,61 +2301,33 @@ IfStmt* Parser::ParseIfStmt() {
   continueDest_ = continueDestBackup; \
 }
 
-CompoundStmt* Parser::ParseForStmt() {
+ForStmt* Parser::ParseForStmt() {
   EnterBlock();
   ts_.Expect('(');
-
-  std::list<Stmt*> stmts;
-
+  // init
+  Stmt* init = nullptr;
   if (IsType(ts_.Peek())) {
-    stmts.push_back(ParseDecl());
+    init = ParseDecl();
   } else if (!ts_.Try(';')) {
-    stmts.push_back(ParseExpr());
+    init = ParseExpr();
     ts_.Expect(';');
   }
-
-  Expr* condExpr = nullptr;
+  // cond
+  Expr* cond = nullptr;
   if (!ts_.Try(';')) {
-    condExpr = ParseExpr();
+    cond = ParseExpr();
     ts_.Expect(';');
   }
-
-  Expr* stepExpr = nullptr;
+  // step
+  Expr* step = nullptr;
   if (!ts_.Try(')')) {
-    stepExpr = ParseExpr();
+    step = ParseExpr();
     ts_.Expect(')');
   }
-
-  auto condLabel = LabelStmt::New();
-  auto stepLabel = LabelStmt::New();
-  auto endLabel = LabelStmt::New();
-  stmts.push_back(condLabel);
-  if (condExpr) {
-    auto gotoEndStmt = JumpStmt::New(endLabel);
-    auto ifStmt = IfStmt::New(condExpr, EmptyStmt::New(), gotoEndStmt);
-    stmts.push_back(ifStmt);
-  }
-
-  // We must give break and continue statements their target labels; otherwise they would not know where to jump
-  Stmt* bodyStmt;
-  ENTER_LOOP_BODY(endLabel, stepLabel);
-  bodyStmt = ParseStmt();
-  // Because for statements can nest, the break and continue target labels must be restored here
-  EXIT_LOOP_BODY()
-
-  stmts.push_back(bodyStmt);
-  stmts.push_back(stepLabel);
-  if (stepExpr)
-    stmts.push_back(stepExpr);
-  else
-    stmts.push_back(EmptyStmt::New());
-  stmts.push_back(JumpStmt::New(condLabel));
-  stmts.push_back(endLabel);
-
-  auto scope = curScope_;
+  // body
+  Stmt* body = ParseStmt();
   ExitBlock();
-
-  return CompoundStmt::New(stmts, scope);
+  return ForStmt::New(body, init, cond, step);
 }
diff --git a/lib/lang/wgtcc/type.cc b/lib/lang/wgtcc/type.cc
index 94f17b985..02c1c0d56 100644
--- a/lib/lang/wgtcc/type.cc
+++ b/lib/lang/wgtcc/type.cc
@@ -317,11 +317,13 @@ bool ArrayType::Compatible(const Type& other) const {
 
 bool TileType::Compatible(const Type& other) const {
   // For two tile type to be compatible,
-  // the element types must be compatible, and have same shape
-  // if both specified
+  // the element types must be compatible
+  // and they must have compatible shapes
   auto otherTile = other.ToTile();
-  if(!otherTile) return false;
-  if (!derived_->Compatible(*otherTile->derived_)) return false;
+  if(!otherTile)
+    return false;
+  if (!derived_->Compatible(*otherTile->derived_))
+    return false;
   // The shapes should be equal if both specified
   if(complete_ && otherTile->complete_)
     return shape_ == otherTile->shape_;
diff --git a/lib/runtime/function.cpp b/lib/runtime/function.cpp
index 49bfa6249..f111387f1 100644
--- a/lib/runtime/function.cpp
+++ b/lib/runtime/function.cpp
@@ -8,6 +8,7 @@
 #include "triton/lang/lang.h"
 #include "triton/lang/wgtcc/cpp.h"
 #include "triton/lang/wgtcc/parser.h"
+#include "triton/lang/wgtcc/code_gen.h"
 #include "triton/driver/device.h"
 #include "triton/driver/stream.h"
#include "triton/driver/kernel.h" @@ -133,6 +134,9 @@ triton::lang::translation_unit *function::make_ast(const char *csrc) { cpp.Process(ts); Parser parser(ts); parser.Parse(); + Generator gen(&parser); + ir::module out("", ctx_); + gen.Gen(&out); exit(EXIT_FAILURE); // if (only_preprocess) { From 87072203c13890e74ed181e906b3b8dafc72f0f3 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 22 Aug 2019 17:27:10 -0700 Subject: [PATCH 312/494] [codegen] triton-ir code generation does not crash --- examples/cpp/dot.cpp | 83 ++++++++++++++-------------- include/triton/lang/wgtcc/code_gen.h | 2 + lib/lang/wgtcc/ast.cc | 35 +++++------- lib/lang/wgtcc/code_gen.cc | 73 +++++++++++++----------- lib/lang/wgtcc/parser.cc | 5 +- lib/lang/wgtcc/type.cc | 2 +- lib/runtime/function.cpp | 1 + 7 files changed, 103 insertions(+), 98 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 83a38be4e..a0a699711 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -78,52 +78,53 @@ std::string src(bool AT, bool BT, std::string a_ty, std::string b_ty, std::strin std::string align_ldb_str = "multiple_of(" + std::to_string(align_ldb) + ")"; std::string res = R"( -#define TM 128 -#define TN 128 -#define TK 32 + #define TM 128 + #define TN 128 + #define TK 32 -#define bool _Bool -#define true 1 -#define false 0 -#define __bool_true_false_are_defined 1 + #define bool _Bool + #define true 1 + #define false 0 + #define __bool_true_false_are_defined 1 -extern int get_program_id(int); + extern int get_program_id(int); -void matmul(restrict )" + a_ty + R"( * A __attribute__((readonly, aligned(16))), - restrict )" + b_ty + R"( * B __attribute__((readonly, aligned(16))), - restrict )" + c_ty + R"( * C __attribute__((aligned(16))), - int M, int N, int K, - int lda __attribute__((multiple_of(8))), - int ldb __attribute__((multiple_of(8))), - int ldc) { - int ridx = get_program_id(0); - int ridy = get_program_id(1); - int rxa[{TM, TN}] = ridx * TM + 0 ... TM; - int ryb[{TN}] = ridy * TN + 0 ... TN; - int rka[{TK}] = 0 ... TK; - int rkb[{TK}] = 0 ... TK; - float xc[{)" + XCS + R"(}] = 0; - )" + a_ty + R"(* pa[{)" + AS + "}] = A + rka" + bca0 + lda0 + " + rxa" + bca1 + lda1 + R"(; - )" + b_ty + R"(* pb[{)" + BS + "}] = B + rkb" + bcb0 + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; - )" + a_ty + R"( a[{)" + AS + R"(}] = *pa; - )" + b_ty + R"( b[{)" + BS + R"(}] = *pb; - for(int k = K; k > 0; k = k - TK){ - xc = )" + usea + " @ " + useb + R"( + xc; - pa = pa + TK)" + lda0 + R"(; - pb = pb + TK)" + ldb0 + R"(; - a = *pa; - b = *pb; + void matmul(restrict )" + a_ty + R"( * A __attribute__((readonly, aligned(16))), + restrict )" + b_ty + R"( * B __attribute__((readonly, aligned(16))), + restrict )" + c_ty + R"( * C __attribute__((aligned(16))), + int M, int N, int K, + int lda __attribute__((multiple_of(8))), + int ldb __attribute__((multiple_of(8))), + int ldc) { + int ridx = get_program_id(0); + int ridy = get_program_id(1); + int rxa[{TM}] = ridx * TM + 0 ... TM; + int ryb[{TN}] = ridy * TN + 0 ... TN; + int rka[{TK}] = 0 ... TK; + int rkb[{TK}] = 0 ... 
TK; + float xc[{)" + XCS + R"(}] = 0; + )" + a_ty + R"(* pa[{)" + AS + "}] = A + rka" + bca0 + lda0 + " + rxa" + bca1 + lda1 + R"(; + )" + b_ty + R"(* pb[{)" + BS + "}] = B + rkb" + bcb0 + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; + )" + a_ty + R"( a[{)" + AS + R"(}] = *pa; + )" + b_ty + R"( b[{)" + BS + R"(}] = *pb; + for(int k = K; k > 0; k = k - TK){ + xc = )" + usea + " @ " + useb + R"( + xc; + pa = pa + TK)" + lda0 + R"(; + pb = pb + TK)" + ldb0 + R"(; + a = *pa; + b = *pb; + } + int rxc[{TM}] = ridx * TM + (0 ... TM); + int ryc[{TN}] = ridy * TN + (0 ... TN); + )" + c_ty + R"(* pc[{TM, TN}] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; + )" + c_ty + R"( c[{TM, TN}] = xc; + bool checkc0[{TM}] = rxc < M; + bool checkc1[{TN}] = ryc < N; + bool checkc[{TM, TN}] = checkc0[:, newaxis] && checkc1[newaxis, :]; + *pc = c; } - int rxc[{TM}] = ridx * TM + (0 ... TM); - int ryc[{TN}] = ridy * TN + (0 ... TN); - )" + c_ty + R"(* pc[{TM, TN}] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - )" + c_ty + R"( c[{TM, TN}] = xc; - bool checkc0[{TM}] = rxc < M; - bool checkc1[{TN}] = ryc < N; - bool checkc[{TM, TN}] = checkc0[:, newaxis] && checkc1[newaxis, :]; - *pc = c; -} )"; + return res; } diff --git a/include/triton/lang/wgtcc/code_gen.h b/include/triton/lang/wgtcc/code_gen.h index 863f91eed..caf4067db 100644 --- a/include/triton/lang/wgtcc/code_gen.h +++ b/include/triton/lang/wgtcc/code_gen.h @@ -81,6 +81,8 @@ public: protected: // Triton-IR values ir::value* GenAssignOp(Expr* lvalue, ir::value* rhs); + ir::value* GenBroadcastOp(ir::value* src, ir::type* dst_ty); + ir::value* GenNumcastOp(ir::value*src, ir::type* dst_ty); ir::value* GenCastOp(ir::value* op, ir::type* type); // Triton-IR types diff --git a/lib/lang/wgtcc/ast.cc b/lib/lang/wgtcc/ast.cc index 5cd958c80..0a7327fa3 100644 --- a/lib/lang/wgtcc/ast.cc +++ b/lib/lang/wgtcc/ast.cc @@ -221,15 +221,20 @@ ArithmType* BinaryOp::Convert() { void BinaryOp::Broadcast() { auto lhsType = lhs_->Type()->ToTile(); auto rhsType = rhs_->Type()->ToTile(); + auto eleType = type_->ScalarType(); + assert(eleType); if(!lhsType && !rhsType) return ; else if(lhsType && !rhsType){ - type_ = lhsType; - rhs_ = UnaryOp::New(Token::CAST, lhs_, type_); + type_ = TileType::New(lhsType->Shape(), eleType); + ::Type* rtype = TileType::New(lhsType->Shape(), rhs_->Type()->ScalarType()); + rhs_ = UnaryOp::New(Token::CAST, rhs_, rtype); } else if(!lhsType && rhsType){ - type_ = rhsType; - lhs_ = UnaryOp::New(Token::CAST, rhs_, type_); + type_ = TileType::New(rhsType->Shape(), eleType); + ::Type* ltype = TileType::New(rhsType->Shape(), lhs_->Type()->ScalarType()); + lhs_ = UnaryOp::New(Token::CAST, lhs_, ltype); + } else { auto lhsShape = lhsType->Shape(); @@ -256,12 +261,13 @@ void BinaryOp::Broadcast() { "for operands of shape %d and %d", i, lhsShape[i], rhsShape[i]); } - auto eleType = lhsType->Derived(); + ::Type* ltype = TileType::New(retShape, lhsType->ScalarType()); + ::Type* rtype = TileType::New(retShape, rhsType->ScalarType()); type_ = TileType::New(retShape, eleType); if(retShape != lhsShape) - lhs_ = UnaryOp::New(Token::CAST, lhs_, type_); + lhs_ = UnaryOp::New(Token::CAST, lhs_, ltype); if(retShape != rhsShape) - rhs_ = UnaryOp::New(Token::CAST, rhs_, type_); + rhs_ = UnaryOp::New(Token::CAST, rhs_, rtype); } } @@ -347,18 +353,6 @@ void BinaryOp::CommaOpTypeChecking() { void BinaryOp::SubScriptingOpTypeChecking() { assert(false); - auto lhsType = lhs_->Type()->ToTile(); - - if (!lhsType) { - Error(this, "operator [] can only be used on tiles"); - } - - if 
(!rhs_->Type()->IsInteger()) { - Error(this, "the operand of [] should be integer"); - } - - // The type of [] operator is the derived type - type_ = lhsType->Derived(); } @@ -401,7 +395,6 @@ void BinaryOp::AdditiveOpTypeChecking() { ::Type* rhsScalType = TryExtractScalarType(this, rhs_); auto lhsPtrType = lhsScalType->ToPointer(); auto rhsPtrType = rhsScalType->ToPointer(); - std::cout << "adding" << std::endl; if (lhsPtrType) { if (op_ == '-') { if (rhsPtrType) { @@ -436,7 +429,6 @@ void BinaryOp::AdditiveOpTypeChecking() { } void BinaryOp::RangeOpTypeChecking() { - std::cout << "range" << std::endl; auto lhsType = lhs_->Type()->ToArithm(); auto rhsType = rhs_->Type()->ToArithm(); if(!lhsType || !lhsType->IsInteger() || !rhsType || !rhsType->IsInteger()) @@ -850,7 +842,6 @@ Declaration* Declaration::New(Object* obj) { void Declaration::AddInit(Initializer init) { init.expr_ = Expr::MayCast(init.expr_, init.type_); - auto res = inits_.insert(init); if (!res.second) { inits_.erase(res.first); diff --git a/lib/lang/wgtcc/code_gen.cc b/lib/lang/wgtcc/code_gen.cc index 1d3783a38..a234e8a47 100644 --- a/lib/lang/wgtcc/code_gen.cc +++ b/lib/lang/wgtcc/code_gen.cc @@ -18,6 +18,7 @@ inline bool is_terminator(ir::value* x) { // Expression void Generator::VisitBinaryOp(BinaryOp* binary) { + Visit(binary->rhs_); ir::value* rhs = ret_; @@ -43,6 +44,17 @@ void Generator::VisitBinaryOp(BinaryOp* binary) { case Token::RIGHT: return set_ret(bld_->create_lshr(lhs, rhs)); case '.': return error_not_implemented(); case ',': return error_not_implemented(); + case '@' : { + ir::type* ret_ty = GenIRType(binary->Type(), *ctx_); + ir::type* ret_scal_ty = ret_ty->get_scalar_ty(); + ir::value* _0; + if(ret_scal_ty->is_float_ty()) + _0 = ir::constant_fp::get(ret_scal_ty, 0); + else + _0 = ir::constant_int::get(ret_scal_ty, 0); + _0 = bld_->create_splat(_0, ret_ty->get_tile_shapes()); + return set_ret(bld_->create_dot(lhs, rhs, _0)); + } case Token::ELLIPSIS: { auto clhs = dynamic_cast(lhs); auto crhs = dynamic_cast(rhs); @@ -51,8 +63,9 @@ void Generator::VisitBinaryOp(BinaryOp* binary) { return set_ret(ir::constant_range::get(clhs, crhs)); } case '+': - if(binary->lhs_->Type()->ToPointer()) + if(binary->lhs_->Type()->ScalarType()->ToPointer()){ return set_ret(bld_->create_gep(lhs, {rhs})); + } else if(flt) return set_ret(bld_->create_fadd(lhs, rhs)); else @@ -138,10 +151,11 @@ void Generator::VisitBinaryOp(BinaryOp* binary) { } void Generator::VisitUnaryOp(UnaryOp* unary) { + // recursion Visit(unary->operand_); ir::value* op = ret_; - ir::type* type = GenIRType(unary->operand_->Type(), *ctx_); + // return switch (unary->op_) { case Token::PREFIX_INC: return error_not_implemented(); @@ -149,13 +163,14 @@ void Generator::VisitUnaryOp(UnaryOp* unary) { case Token::POSTFIX_INC: return error_not_implemented(); case Token::POSTFIX_DEC: return error_not_implemented(); case Token::ADDR: return error_not_implemented(); - case Token::DEREF: return error_not_implemented(); + case Token::DEREF: return set_ret(bld_->create_load(op)); case Token::PLUS: return error_not_implemented(); case Token::MINUS: return error_not_implemented(); case '~': return set_ret(bld_->create_neg(op)); case '!': return set_ret(bld_->create_not(op)); - case Token::CAST: return set_ret(GenCastOp(op, type)); - default: assert(false); + case Token::CAST: return set_ret(GenCastOp(op, GenIRType(unary->Type(), *ctx_))); + case '^': return set_ret(bld_->create_trans(op)); + default: error_not_implemented(); } return error_not_implemented(); } @@ -176,7 
+191,7 @@ void Generator::VisitFuncCall(FuncCall* funcCall) { } void Generator::VisitObject(Object* obj) { - return error_not_implemented(); + return set_ret(mod_->get_value(obj->Name())); } void Generator::VisitEnumerator(Enumerator* enumer) { @@ -220,14 +235,6 @@ void Generator::VisitDeclaration(Declaration* decl) { if(inits.size() > 1) assert(false); val = inits[0]; - std::cout << obj->Name() << " " << val->get_type()->get_type_id() << " " << ty->get_type_id() << std::endl; - if(val->get_type()->is_tile_ty() && ty->is_tile_ty()) { - for(auto s: val->get_type()->get_tile_shapes()) - std::cout << s->get_value() << std::endl; - std::cout << "---" << std::endl; - for(auto s: ty->get_tile_shapes()) - std::cout << s->get_value() << std::endl; - } assert(val->get_type() == ty); // update scope symbols table const std::string &name = obj->Name(); @@ -351,6 +358,7 @@ void Generator::VisitFuncDef(FuncDef* funcDef) { args[i]->set_name(name); mod_->set_value(name, nullptr, args[i]); mod_->get_scope().types[name] = args[i]->get_type(); + i++; } ir::basic_block *entry = ir::basic_block::create(mod_->get_context(), "entry", fn); mod_->seal_block(entry); @@ -378,60 +386,58 @@ void Generator::Gen(ir::module *mod) { } -// Triton-IR Values -ir::value* Generator::GenCastOp(ir::value* src, ir::type* dst_ty) { +ir::value* Generator::GenBroadcastOp(ir::value* src, ir::type* dst_ty) { if(dst_ty->is_tile_ty()) { + ir::type *src_ty = src->get_type(); auto dst_shapes = dst_ty->get_tile_shapes(); - if(!src->get_type()->is_tile_ty()) + if(!src_ty->is_tile_ty()) return bld_->create_splat(src, dst_shapes); - auto src_shapes = src->get_type()->get_tile_shapes(); + auto src_shapes = src_ty->get_tile_shapes(); if(src_shapes.size() != dst_shapes.size()) return bld_->create_reshape(src, dst_shapes); else return bld_->create_broadcast(src, dst_shapes); } + return src; +} + +ir::value* Generator::GenNumcastOp(ir::value*src, ir::type* dst_ty) { ir::type *src_scalar_ty = src->get_type()->get_scalar_ty(); ir::type *dst_scalar_ty = dst_ty->get_scalar_ty(); - bool src_signed = false; - bool dst_signed = false; - if(src->get_type()->is_tile_ty()) dst_ty = ir::tile_type::get_same_shapes(dst_scalar_ty, src->get_type()); - + bool src_signed = false; + bool dst_signed = false; if(src_scalar_ty == dst_scalar_ty) return src; - else if(src_scalar_ty->is_integer_ty() && src_signed && dst_scalar_ty->is_floating_point_ty()) return bld_->create_si_to_fp(src, dst_ty); - else if(src_scalar_ty->is_integer_ty() && !src_signed && dst_scalar_ty->is_floating_point_ty()) return bld_->create_ui_to_fp(src, dst_ty); - else if(src_scalar_ty->is_floating_point_ty() && dst_scalar_ty->is_integer_ty() && dst_signed) return bld_->create_fp_to_si(src, dst_ty); - else if(src_scalar_ty->is_floating_point_ty() && dst_scalar_ty->is_integer_ty() && !dst_signed) return bld_->create_fp_to_ui(src, dst_ty); - else if(src_scalar_ty->is_floating_point_ty() && dst_scalar_ty->is_floating_point_ty() && src_scalar_ty->get_fp_mantissa_width() < dst_scalar_ty->get_fp_mantissa_width()) return bld_->create_fp_ext(src, dst_ty); - else if(src_scalar_ty->is_floating_point_ty() && dst_scalar_ty->is_floating_point_ty() && src_scalar_ty->get_fp_mantissa_width() > dst_scalar_ty->get_fp_mantissa_width()) return bld_->create_fp_trunc(src, dst_ty); - else if(src_scalar_ty->is_integer_ty() && dst_scalar_ty->is_integer_ty() && src_scalar_ty->get_integer_bitwidth()) return bld_->create_int_cast(src, dst_ty, dst_signed); - else{ should_not_happen(); return nullptr; } } +ir::value* 
Generator::GenCastOp(ir::value* src, ir::type* dst_ty) { + return GenNumcastOp(GenBroadcastOp(src, dst_ty), dst_ty); +} + // Triton-IR Types ir::type* Generator::GenIRType(::Type* type, ir::context& ctx) { if(auto T = type->ToVoid()) @@ -504,7 +510,7 @@ ir::type* Generator::GenIRPointerType(PointerType* type, ir::context& ctx) { } ir::type* Generator::GenIRStructType(StructType* type, ir::context& ctx) { - assert(false); + error_not_implemented(); return nullptr; } @@ -535,12 +541,15 @@ void LValAssigner::VisitUnaryOp(UnaryOp* unary) { } void LValAssigner::VisitObject(Object* obj) { - error_not_implemented(); + std::string name = obj->Name(); + gen_->mod_->set_value(name, rhs_); + ret_ = rhs_; } void LValAssigner::VisitIdentifier(Identifier* ident) { std::string name = ident->Name(); gen_->mod_->set_value(name, rhs_); + ret_ = rhs_; } diff --git a/lib/lang/wgtcc/parser.cc b/lib/lang/wgtcc/parser.cc index f1fb52228..5acfc9bf5 100644 --- a/lib/lang/wgtcc/parser.cc +++ b/lib/lang/wgtcc/parser.cc @@ -461,8 +461,8 @@ Expr* Parser::ParseSubScripting(Expr* lhs) { Error(tok, "only ':' and newaxis are supported in subscripts"); }while(ts_.Try(',')); ts_.Expect(']'); -// if(lhsShape.size() > i) -// Error(tok, "broadcasting not using all operand axes"); + if(lhsShape.size() > i) + Error(tok, "broadcasting not using all operand axes"); // create ret tile TileType *retType = TileType::New(shape, lhsQual); return UnaryOp::New(Token::CAST, lhs, retType); @@ -1919,6 +1919,7 @@ void Parser::ParseInitializer(Declaration* decl, ts_.Expect('='); } +// std::cout << "parsing initialized " << decl->Obj()->Name() << std::endl; Expr* expr; auto arrType = type->ToArray(); auto structType = type->ToStruct(); diff --git a/lib/lang/wgtcc/type.cc b/lib/lang/wgtcc/type.cc index 02c1c0d56..c83ddd37d 100644 --- a/lib/lang/wgtcc/type.cc +++ b/lib/lang/wgtcc/type.cc @@ -318,7 +318,7 @@ bool ArrayType::Compatible(const Type& other) const { bool TileType::Compatible(const Type& other) const { // For two tile type to be compatible, // the element types must be compatible - // and they must have compatible shapes + // and they must have the same shape auto otherTile = other.ToTile(); if(!otherTile) return false; diff --git a/lib/runtime/function.cpp b/lib/runtime/function.cpp index f111387f1..55e0d5fc1 100644 --- a/lib/runtime/function.cpp +++ b/lib/runtime/function.cpp @@ -120,6 +120,7 @@ void function::caller::operator ()(driver::stream *stream, const std::array Date: Thu, 22 Aug 2019 19:21:01 -0700 Subject: [PATCH 313/494] adding tunable parameters --- examples/cpp/dot.cpp | 82 +++++++++++++++--------------- include/triton/lang/wgtcc/parser.h | 4 +- include/triton/lang/wgtcc/type.h | 2 +- include/triton/runtime/function.h | 10 ++-- lib/driver/module.cpp | 8 +-- lib/lang/wgtcc/ast.cc | 5 +- lib/lang/wgtcc/code_gen.cc | 2 +- lib/lang/wgtcc/type.cc | 2 +- lib/runtime/function.cpp | 60 ++++++---------------- 9 files changed, 76 insertions(+), 99 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index a0a699711..fe9f5f21b 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -78,51 +78,51 @@ std::string src(bool AT, bool BT, std::string a_ty, std::string b_ty, std::strin std::string align_ldb_str = "multiple_of(" + std::to_string(align_ldb) + ")"; std::string res = R"( - #define TM 128 - #define TN 128 - #define TK 32 +#define bool _Bool +#define true 1 +#define false 0 +#define __bool_true_false_are_defined 1 +extern int get_program_id(int); - #define bool _Bool - #define true 1 - #define false 0 - 
#define __bool_true_false_are_defined 1 +#define TN 128 +#define TK 32 - extern int get_program_id(int); +static const int TM = 128; - void matmul(restrict )" + a_ty + R"( * A __attribute__((readonly, aligned(16))), - restrict )" + b_ty + R"( * B __attribute__((readonly, aligned(16))), - restrict )" + c_ty + R"( * C __attribute__((aligned(16))), - int M, int N, int K, - int lda __attribute__((multiple_of(8))), - int ldb __attribute__((multiple_of(8))), - int ldc) { - int ridx = get_program_id(0); - int ridy = get_program_id(1); - int rxa[{TM}] = ridx * TM + 0 ... TM; - int ryb[{TN}] = ridy * TN + 0 ... TN; - int rka[{TK}] = 0 ... TK; - int rkb[{TK}] = 0 ... TK; - float xc[{)" + XCS + R"(}] = 0; - )" + a_ty + R"(* pa[{)" + AS + "}] = A + rka" + bca0 + lda0 + " + rxa" + bca1 + lda1 + R"(; - )" + b_ty + R"(* pb[{)" + BS + "}] = B + rkb" + bcb0 + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; - )" + a_ty + R"( a[{)" + AS + R"(}] = *pa; - )" + b_ty + R"( b[{)" + BS + R"(}] = *pb; - for(int k = K; k > 0; k = k - TK){ - xc = )" + usea + " @ " + useb + R"( + xc; - pa = pa + TK)" + lda0 + R"(; - pb = pb + TK)" + ldb0 + R"(; - a = *pa; - b = *pb; - } - int rxc[{TM}] = ridx * TM + (0 ... TM); - int ryc[{TN}] = ridy * TN + (0 ... TN); - )" + c_ty + R"(* pc[{TM, TN}] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - )" + c_ty + R"( c[{TM, TN}] = xc; - bool checkc0[{TM}] = rxc < M; - bool checkc1[{TN}] = ryc < N; - bool checkc[{TM, TN}] = checkc0[:, newaxis] && checkc1[newaxis, :]; - *pc = c; +void matmul(restrict )" + a_ty + R"( * A __attribute__((readonly, aligned(16))), + restrict )" + b_ty + R"( * B __attribute__((readonly, aligned(16))), + restrict )" + c_ty + R"( * C __attribute__((aligned(16))), + int M, int N, int K, + int lda __attribute__((multiple_of(8))), + int ldb __attribute__((multiple_of(8))), + int ldc) { + int ridx = get_program_id(0); + int ridy = get_program_id(1); + int rxa[{TM}] = ridx * TM + 0 ... TM; + int ryb[{TN}] = ridy * TN + 0 ... TN; + int rka[{TK}] = 0 ... TK; + int rkb[{TK}] = 0 ... TK; + float xc[{)" + XCS + R"(}] = 0; + )" + a_ty + R"(* pa[{)" + AS + "}] = A + rka" + bca0 + lda0 + " + rxa" + bca1 + lda1 + R"(; + )" + b_ty + R"(* pb[{)" + BS + "}] = B + rkb" + bcb0 + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; + )" + a_ty + R"( a[{)" + AS + R"(}] = *pa; + )" + b_ty + R"( b[{)" + BS + R"(}] = *pb; + for(int k = K; k > 0; k = k - TK){ + xc = )" + usea + " @ " + useb + R"( + xc; + pa = pa + TK)" + lda0 + R"(; + pb = pb + TK)" + ldb0 + R"(; + a = *pa; + b = *pb; } + int rxc[{TM}] = ridx * TM + (0 ... TM); + int ryc[{TN}] = ridy * TN + (0 ... 
TN); + )" + c_ty + R"(* pc[{TM, TN}] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; + )" + c_ty + R"( c[{TM, TN}] = xc; + bool checkc0[{TM}] = rxc < M; + bool checkc1[{TN}] = ryc < N; + bool checkc[{TM, TN}] = checkc0[:, newaxis] && checkc1[newaxis, :]; + *pc = c; +} )"; return res; diff --git a/include/triton/lang/wgtcc/parser.h b/include/triton/lang/wgtcc/parser.h index 92ed7c38e..eedaeb5e6 100644 --- a/include/triton/lang/wgtcc/parser.h +++ b/include/triton/lang/wgtcc/parser.h @@ -25,7 +25,7 @@ class Parser { friend class Generator; public: - explicit Parser(const TokenSequence& ts) + explicit Parser(TokenSequence& ts) : unit_(TranslationUnit::New()), ts_(ts), externalSymbols_(new Scope(nullptr, S_BLOCK)), @@ -224,7 +224,7 @@ private: // The root of the AST TranslationUnit* unit_; - TokenSequence ts_; + TokenSequence& ts_; // It is not the real scope, // It contains all external symbols(resolved and not resolved) diff --git a/include/triton/lang/wgtcc/type.h b/include/triton/lang/wgtcc/type.h index b43b74339..1cb10777b 100644 --- a/include/triton/lang/wgtcc/type.h +++ b/include/triton/lang/wgtcc/type.h @@ -223,7 +223,7 @@ public: virtual bool IsInteger() const { return !IsFloat() && !IsComplex(); } virtual bool IsUnsigned() const { return tag_ & T_UNSIGNED; } virtual bool IsFloat() const { - return (tag_ & T_FLOAT) || (tag_ & T_DOUBLE); + return (tag_ & T_HALF) || (tag_ & T_FLOAT) || (tag_ & T_DOUBLE); } virtual bool IsBool() const { return tag_ & T_BOOL; } bool IsComplex() const { return tag_ & T_COMPLEX; } diff --git a/include/triton/runtime/function.h b/include/triton/runtime/function.h index 2880a4e54..0a91984e2 100644 --- a/include/triton/runtime/function.h +++ b/include/triton/runtime/function.h @@ -20,6 +20,7 @@ #include "triton/codegen/transform/shmem/barriers.h" #include "triton/codegen/transform/reassociate.h" #include "triton/codegen/transform/vectorize.h" +#include "triton/lang/wgtcc/parser.h" namespace llvm { class Module; @@ -87,9 +88,9 @@ private: typedef std::pair cache_val_t; private: - triton::lang::translation_unit *make_ast(const char *src); - std::unique_ptr make_ir(triton::lang::translation_unit *program); - options autotune(lang::translation_unit *ast, driver::stream *stream, const grid_fn_ty& grid, const std::vector &args); + triton::lang::translation_unit *make_ast(const std::string &src); + std::unique_ptr make_ir(Parser &parser); + options autotune(Parser &parser, driver::stream *stream, const grid_fn_ty& grid, const std::vector &args); std::unique_ptr make_bin(ir::module &function, driver::context *context, const options &opt); @@ -100,11 +101,12 @@ public: std::string make_tensorflow_src(const std::vector &outputs, const std::string ¯o); private: + TokenSequence ts_; + Parser parser_; // execution context ir::context ctx_; // program representations std::string src_; - lang::translation_unit *ast_; std::map cache_; }; diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 8e23959c0..7711b6d24 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -106,10 +106,10 @@ void module::compile_llvm_module(llvm::Module* module, const std::string& triple file_type_t ft) { init_llvm(); // debug -// llvm::legacy::PassManager pm; -// pm.add(llvm::createPrintModulePass(llvm::outs())); -// pm.add(llvm::createVerifierPass()); -// pm.run(*module); + llvm::legacy::PassManager pm; + pm.add(llvm::createPrintModulePass(llvm::outs())); + pm.add(llvm::createVerifierPass()); + pm.run(*module); // create machine module->setTargetTriple(triple); std::string error; 
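The hunks in this commit converge on a new front end for the runtime: function now owns the TokenSequence and the Parser (see the function.h and parser.h changes above), and Triton-IR is produced by the wgtcc Generator instead of the old translation_unit path. Pulled out of the surrounding hunks, the intended flow is roughly the sketch below; the lower() wrapper itself is hypothetical, but the Preprocessor, Parser, and Generator calls mirror the signatures patched in this series.

    // Sketch (assumed wiring): Triton-C source -> tokens -> AST -> Triton-IR.
    #include "triton/lang/wgtcc/cpp.h"
    #include "triton/lang/wgtcc/parser.h"
    #include "triton/lang/wgtcc/code_gen.h"
    #include "triton/ir/module.h"

    std::unique_ptr<ir::module> lower(const std::string& src, ir::context& ctx) {
      TokenSequence ts;
      Preprocessor cpp(&src, true);   // preprocess a string, not a file
      cpp.Process(ts);                // macro expansion -> token stream
      Parser parser(ts);              // Parser now keeps a reference to ts
      parser.Parse();                 // build the AST (TranslationUnit)
      std::unique_ptr<ir::module> mod(new ir::module("", ctx));
      Generator gen(&parser);         // the AST-to-Triton-IR visitor
      gen.Gen(mod.get());             // emit Triton-IR
      return mod;
    }
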
diff --git a/lib/lang/wgtcc/ast.cc
index 0a7327fa3..8cb029021 100644
--- a/lib/lang/wgtcc/ast.cc
+++ b/lib/lang/wgtcc/ast.cc
@@ -461,7 +461,10 @@ void BinaryOp::MatmulOpTypeChecking() {
   QualType retType = lhsType->Derived();
   if(retType != rhsType->Derived())
     Error(this, "matrix multiplication operands have incompatible data types");
-  type_ = TileType::New(retShape, lhsType->Derived());
+  ArithmType* ScalType = lhsType->ScalarType()->ToArithm();
+  if(ScalType->Tag() & T_HALF)
+    ScalType = ArithmType::New(T_FLOAT);
+  type_ = TileType::New(retShape, ScalType);
 }
 
 void BinaryOp::ShiftOpTypeChecking() {
diff --git a/lib/lang/wgtcc/code_gen.cc b/lib/lang/wgtcc/code_gen.cc
index a234e8a47..d7188b2b1 100644
--- a/lib/lang/wgtcc/code_gen.cc
+++ b/lib/lang/wgtcc/code_gen.cc
@@ -29,7 +29,7 @@ void Generator::VisitBinaryOp(BinaryOp* binary) {
   ir::value* lhs = ret_;
 
   // op info
-  auto type = binary->lhs_->Type();
+  auto type = binary->lhs_->Type()->ScalarType();
   auto flt = type->IsFloat();
   auto sign = !type->IsUnsigned();
 
diff --git a/lib/lang/wgtcc/type.cc b/lib/lang/wgtcc/type.cc
index c83ddd37d..25d5c56ce 100644
--- a/lib/lang/wgtcc/type.cc
+++ b/lib/lang/wgtcc/type.cc
@@ -318,7 +318,7 @@ bool ArrayType::Compatible(const Type& other) const {
 bool TileType::Compatible(const Type& other) const {
   // For two tile type to be compatible,
   // the element types must be compatible
-  // and they must have the same shape
+  // and they must have the same shape
   auto otherTile = other.ToTile();
   if(!otherTile)
     return false;
diff --git a/lib/runtime/function.cpp b/lib/runtime/function.cpp
index 55e0d5fc1..598ae146b 100644
--- a/lib/runtime/function.cpp
+++ b/lib/runtime/function.cpp
@@ -17,6 +17,7 @@
 #include "triton/ir/function.h"
 #include "triton/tools/bench.hpp"
 #include "llvm/IR/Module.h"
+#include "triton/ir/print.h"
 
 typedef struct yy_buffer_state * YY_BUFFER_STATE;
 
@@ -117,50 +118,17 @@ void function::caller::operator ()(driver::stream *stream, const std::array function::make_ir(triton::lang::translation_unit *program) {
+std::unique_ptr function::make_ir(Parser& parser) {
   // create Triton-IR from AST
   ir::module* module = new ir::module("", ctx_);
-  program->codegen(module);
+  Generator gen(&parser);
+  gen.Gen(module);
   return std::unique_ptr(module);
 }
 
-options function::autotune(lang::translation_unit *ast, driver::stream* stream, const grid_fn_ty& grid_fn, const std::vector& args) {
-  std::unique_ptr ir = make_ir(ast);
+options function::autotune(Parser& parser, driver::stream* stream, const grid_fn_ty& grid_fn, const std::vector& args) {
+  std::unique_ptr ir = make_ir(parser);
   // extract tunable values
   std::vector> values;
   for(auto it: ir->globals())
@@ -186,7 +154,7 @@ options function::autotune(lang::translation_unit *ast, driver::stream* stream,
   for(auto it: values)
     opt.params[it.first] = params[i++];
   // make binary
-  auto ir = make_ir(ast);
+  auto ir = make_ir(parser);
   auto bin = make_bin(*ir, stream->context(), opt);
   // benchmark
   ir::function *tmp = ir->get_function_list()[0];
@@ -242,6 +210,8 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c
   // generate llvm code
   llvm::LLVMContext ctx;
   std::unique_ptr llvm(new llvm::Module(module.get_name(), ctx));
+  ir::print(module, std::cout);
+  exit(EXIT_FAILURE);
   selection.run(module, *llvm);
   // return binary
   std::unique_ptr res(driver::module::create(context, llvm.get()));
 }
 
-function::function(const std::string &src): src_(src) {
- 
src -> ast - ast_ = make_ast(src_.c_str()); +function::function(const std::string &src): parser_(ts_), src_(src){ + Preprocessor cpp(&src_, true); + cpp.Process(ts_); + ts_.Print(); + parser_.Parse(); } void function::operator()(const std::vector& args, const grid_fn_ty& grid_fn, driver::stream *stream) { @@ -277,8 +249,8 @@ void function::operator()(const std::vector& args, const grid_fn_ty& grid_f } /* re-tune and re-compile */ - options opt = autotune(ast_, stream, grid_fn, args); - std::unique_ptr ir = make_ir(ast_); + options opt = autotune(parser_, stream, grid_fn, args); + std::unique_ptr ir = make_ir(parser_); std::unique_ptr bin = make_bin(*ir, stream->context(), opt); ir::function* fn = ir->get_function_list().front(); const caller& run = cache_.insert({key, cache_val_t{opt, caller(fn, std::move(bin), opt.num_warps*32)}}).first->second.second; From 64a6910644d788d7435fa296959dacac8f16855a Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 22 Aug 2019 21:02:38 -0700 Subject: [PATCH 314/494] [lang][parser] better support for attributes --- examples/cpp/dot.cpp | 8 +-- include/triton/lang/wgtcc/ast.h | 28 +++++++--- include/triton/lang/wgtcc/parser.h | 23 +++++--- lib/lang/wgtcc/ast.cc | 15 +++--- lib/lang/wgtcc/parser.cc | 85 ++++++++++++++++++------------ 5 files changed, 101 insertions(+), 58 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index fe9f5f21b..edd9fa1a5 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -82,12 +82,12 @@ R"( #define true 1 #define false 0 #define __bool_true_false_are_defined 1 + extern int get_program_id(int); -#define TN 128 -#define TK 32 - -static const int TM = 128; +static const int TM __attribute__((one_of(128))); +static const int TN __attribute__((one_of(128))); +static const int TK __attribute__((one_of(32))); void matmul(restrict )" + a_ty + R"( * A __attribute__((readonly, aligned(16))), restrict )" + b_ty + R"( * B __attribute__((readonly, aligned(16))), diff --git a/include/triton/lang/wgtcc/ast.h b/include/triton/lang/wgtcc/ast.h index 5e3096d96..d1c8f8690 100644 --- a/include/triton/lang/wgtcc/ast.h +++ b/include/triton/lang/wgtcc/ast.h @@ -54,6 +54,13 @@ class TranslationUnit; */ class ASTNode { +public: + struct Attr{ + std::string name; + std::vector vals; + }; + using AttrList = std::vector; + public: virtual ~ASTNode() {} virtual void Accept(Visitor* v) = 0; @@ -561,7 +568,7 @@ class Identifier: public Expr { friend class LValAssigner; public: - static Identifier* New(const Token* tok, QualType type, Linkage linkage); + static Identifier* New(const Token* tok, QualType type, Linkage linkage, const AttrList& attrList={}); virtual ~Identifier() {} virtual void Accept(Visitor* v); virtual bool IsLVal() { return false; } @@ -583,11 +590,12 @@ public: virtual void TypeChecking() {} protected: - Identifier(const Token* tok, QualType type, enum Linkage linkage) - : Expr(tok, type), linkage_(linkage) {} + Identifier(const Token* tok, QualType type, enum Linkage linkage, const AttrList& attrList={}) + : Expr(tok, type), linkage_(linkage), attrList_(attrList) {} // An identifier has property linkage enum Linkage linkage_; + AttrList attrList_; }; @@ -624,13 +632,15 @@ public: int storage=0, enum Linkage linkage=L_NONE, unsigned char bitFieldBegin=0, - unsigned char bitFieldWidth=0); + unsigned char bitFieldWidth=0, + const AttrList& attrList={}); static Object* NewAnony(const Token* tok, QualType type, int storage=0, enum Linkage linkage=L_NONE, unsigned char bitFieldBegin=0, - unsigned char 
bitFieldWidth=0); + unsigned char bitFieldWidth=0, + const AttrList& attrList={}); ~Object() {} virtual void Accept(Visitor* v); virtual Object* ToObject() { return this; } @@ -685,7 +695,8 @@ protected: int storage=0, enum Linkage linkage=L_NONE, unsigned char bitFieldBegin=0, - unsigned char bitFieldWidth=0) + unsigned char bitFieldWidth=0, + const AttrList& attrList={}) : Identifier(tok, type, linkage), storage_(storage), offset_(0), @@ -693,7 +704,8 @@ protected: decl_(nullptr), bitFieldBegin_(bitFieldBegin), bitFieldWidth_(bitFieldWidth), - anonymous_(false) {} + anonymous_(false), + attrList_(attrList){} private: int storage_; @@ -708,6 +720,8 @@ private: bool anonymous_; long id_ {0}; + + ASTNode::AttrList attrList_; }; diff --git a/include/triton/lang/wgtcc/parser.h b/include/triton/lang/wgtcc/parser.h index eedaeb5e6..bc43d9daf 100644 --- a/include/triton/lang/wgtcc/parser.h +++ b/include/triton/lang/wgtcc/parser.h @@ -14,7 +14,18 @@ class Preprocessor; -using TokenTypePair = std::pair; + +struct DeclInfo { + DeclInfo(const Token* _tok, + QualType _type, + ASTNode::AttrList _attrs = {}) + : tok(_tok), type(_type), attrs(_attrs) {} + + const Token* tok; + QualType type; + ASTNode::AttrList attrs; +}; + class Parser { using LiteralList = std::vector; @@ -103,7 +114,7 @@ public: Type* ParseEnumerator(ArithmType* type); int ParseQual(); QualType ParsePointer(QualType typePointedTo); - TokenTypePair ParseDeclarator(QualType type); + DeclInfo ParseDeclarator(QualType type); QualType ParseArrayFuncDeclarator(const Token* ident, QualType base); int ParseArrayLength(); TileType::ShapeInt ParseTileShape(); @@ -155,14 +166,14 @@ public: CompoundStmt* ParseCaseStmt(); CompoundStmt* ParseDefaultStmt(); Identifier* ProcessDeclarator(const Token* tok, - QualType type, + QualType type, const ASTNode::AttrList &attrs, int storageSpec, int funcSpec, int align); // GNU extensions - void TryAttributeSpecList(); - void ParseAttributeSpec(); - void ParseAttribute(); + ASTNode::AttrList TryAttributeSpecList(); + void ParseAttributeSpec(ASTNode::AttrList &attrList); + ASTNode::Attr ParseAttribute(); bool IsTypeName(const Token* tok) const{ if (tok->IsTypeSpecQual()) return true; diff --git a/lib/lang/wgtcc/ast.cc b/lib/lang/wgtcc/ast.cc index 8cb029021..47bc6d3a4 100644 --- a/lib/lang/wgtcc/ast.cc +++ b/lib/lang/wgtcc/ast.cc @@ -823,8 +823,9 @@ void FuncCall::TypeChecking() { Identifier* Identifier::New(const Token* tok, QualType type, - enum Linkage linkage) { - auto ret = new (identifierPool.Alloc()) Identifier(tok, type, linkage); + enum Linkage linkage, + const AttrList &attrList) { + auto ret = new (identifierPool.Alloc()) Identifier(tok, type, linkage, attrList); ret->pool_ = &identifierPool; return ret; } @@ -862,9 +863,10 @@ Object* Object::New(const Token* tok, int storage, enum Linkage linkage, unsigned char bitFieldBegin, - unsigned char bitFieldWidth) { + unsigned char bitFieldWidth, + const AttrList& attrList) { auto ret = new (objectPool.Alloc()) - Object(tok, type, storage, linkage, bitFieldBegin, bitFieldWidth); + Object(tok, type, storage, linkage, bitFieldBegin, bitFieldWidth, attrList); ret->pool_ = &objectPool; static long id = 0; @@ -879,9 +881,10 @@ Object* Object::NewAnony(const Token* tok, int storage, enum Linkage linkage, unsigned char bitFieldBegin, - unsigned char bitFieldWidth) { + unsigned char bitFieldWidth, + const AttrList& attrList) { auto ret = new (objectPool.Alloc()) - Object(tok, type, storage, linkage, bitFieldBegin, bitFieldWidth); + Object(tok, type, storage, 
linkage, bitFieldBegin, bitFieldWidth, attrList); ret->pool_ = &objectPool; ret->anonymous_ = true; diff --git a/lib/lang/wgtcc/parser.cc b/lib/lang/wgtcc/parser.cc index 5acfc9bf5..ee8a8a319 100644 --- a/lib/lang/wgtcc/parser.cc +++ b/lib/lang/wgtcc/parser.cc @@ -72,16 +72,18 @@ void Parser::ParseTranslationUnit() { int storageSpec, funcSpec, align; auto declType = ParseDeclSpec(&storageSpec, &funcSpec, &align); - auto tokTypePair = ParseDeclarator(declType); - auto tok = tokTypePair.first; - auto type = tokTypePair.second; + auto declInfo = ParseDeclarator(declType); + + auto tok = declInfo.tok; + auto type = declInfo.type; + auto attrs = declInfo.attrs; if (tok == nullptr) { ts_.Expect(';'); continue; } - auto ident = ProcessDeclarator(tok, type, storageSpec, funcSpec, align); + auto ident = ProcessDeclarator(tok, type, attrs, storageSpec, funcSpec, align); type = ident->Type(); if (tok && type->ToFunc() && ts_.Try('{')) { // Function definition @@ -1339,9 +1341,9 @@ StructType* Parser::ParseStructUnionDecl(StructType* type) { int align; auto baseType = ParseDeclSpec(nullptr, nullptr, &align); do { - auto tokTypePair = ParseDeclarator(baseType); - auto tok = tokTypePair.first; - auto memberType = tokTypePair.second; + auto declInfo = ParseDeclarator(baseType); + auto tok = declInfo.tok; + auto memberType = declInfo.type; if (ts_.Try(':')) { ParseBitField(type, tok, memberType); @@ -1505,15 +1507,15 @@ static QualType ModifyBase(QualType type, QualType base, QualType newBase) { * if token is nullptr, then we are parsing abstract declarator * else, parsing direct declarator. */ -TokenTypePair Parser::ParseDeclarator(QualType base) { +DeclInfo Parser::ParseDeclarator(QualType base) { // May be pointer auto pointerType = ParsePointer(base); if (ts_.Try('(')) { // 现在的 pointerType 并不是正确的 base type - auto tokenTypePair = ParseDeclarator(pointerType); - auto tok = tokenTypePair.first; - auto type = tokenTypePair.second; + auto declInfo = ParseDeclarator(pointerType); + auto tok = declInfo.tok; + auto type = declInfo.type; ts_.Expect(')'); @@ -1521,23 +1523,24 @@ TokenTypePair Parser::ParseDeclarator(QualType base) { // 修正 base type auto retType = ModifyBase(type, pointerType, newBase); - return TokenTypePair(tokenTypePair.first, retType); + return DeclInfo(declInfo.tok, retType); } else if (ts_.Peek()->IsIdentifier()) { auto tok = ts_.Next(); // GNU extension: variable attributes - TryAttributeSpecList(); + ASTNode::AttrList attrList = TryAttributeSpecList(); auto retType = ParseArrayFuncDeclarator(tok, pointerType); - return TokenTypePair(tok, retType); + return DeclInfo(tok, retType, attrList); } else { errTok_ = ts_.Peek(); auto retType = ParseArrayFuncDeclarator(nullptr, pointerType); - return TokenTypePair(nullptr, retType); + return DeclInfo(nullptr, retType); } } Identifier* Parser::ProcessDeclarator(const Token* tok, QualType type, + const ASTNode::AttrList& attrs, int storageSpec, int funcSpec, int align) { @@ -1564,6 +1567,11 @@ Identifier* Parser::ProcessDeclarator(const Token* tok, // TODO(wgtdkp): add previous declaration information return ident; } + + if(!attrs.empty()) { + Error(tok, "typedef attributes not allowed"); + } + ident = Identifier::New(tok, type, L_NONE); curScope_->Insert(ident); return ident; @@ -1658,9 +1666,9 @@ Identifier* Parser::ProcessDeclarator(const Token* tok, // C11 6.7.5 [2]: alignment specifier if (align > 0) Error(tok, "alignment specified for function"); - ret = Identifier::New(tok, type, linkage); + ret = Identifier::New(tok, type, linkage, attrs); } 
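[Editor's note on the ParseDeclarator hunk above: the two Chinese comments translate to "the current pointerType is not the correct base type" and "fix up the base type"; ModifyBase performs that repair once the declarator suffix after the ')' has been parsed. The standard-C case the fixup exists for, traced in comments:

    #include <type_traits>

    // Parsing `int (*p)[4]`:
    //   base           = int
    //   pointerType    = int*          (ParsePointer, before the '(')
    //   inner DeclInfo = {tok=p, type=int*}
    //   newBase        = int[4]        (suffix parsed after the ')')
    //   ModifyBase(int*, int*, int[4]) rewrites the base: p becomes a
    //   pointer to an array of four ints, not an array of pointers.
    int (*p)[4];
    static_assert(std::is_same<decltype(p), int (*)[4]>::value,
                  "pointer to int[4]");

    int main() {}
]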
else { - auto obj = Object::New(tok, type, storageSpec, linkage); + auto obj = Object::New(tok, type, storageSpec, linkage, 0, 0, attrs); if (align > 0) obj->SetAlign(align); ret = obj; @@ -1797,14 +1805,15 @@ Object* Parser::ParseParamDecl() { // C11 6.7.5 [2]: alignment specifier cannot be specified in params auto type = ParseDeclSpec(&storageSpec, &funcSpec, nullptr); auto tokTypePair = ParseDeclarator(type); - auto tok = tokTypePair.first; - type = Type::MayCast(tokTypePair.second, true); + auto tok = tokTypePair.tok; + type = Type::MayCast(tokTypePair.type, true); + auto attrs = tokTypePair.attrs; if (!tok) { // Abstract declarator return Object::NewAnony(ts_.Peek(), type, 0, Linkage::L_NONE); } // Align set to non positive, stands for not specified - auto ident = ProcessDeclarator(tok, type, storageSpec, funcSpec, -1); + auto ident = ProcessDeclarator(tok, type, attrs, storageSpec, funcSpec, -1); if (!ident->ToObject()) Error(ident, "expect object in param list"); @@ -1813,9 +1822,9 @@ Object* Parser::ParseParamDecl() { QualType Parser::ParseAbstractDeclarator(QualType type) { - auto tokenTypePair = ParseDeclarator(type); - auto tok = tokenTypePair.first; - type = tokenTypePair.second; + auto declInfo = ParseDeclarator(type); + auto tok = declInfo.tok; + type = declInfo.type; if (tok) { // Not a abstract declarator! Error(tok, "unexpected identifier '%s'", tok->str_.c_str()); } @@ -1827,14 +1836,15 @@ Identifier* Parser::ParseDirectDeclarator(QualType type, int storageSpec, int funcSpec, int align) { - auto tokenTypePair = ParseDeclarator(type); - auto tok = tokenTypePair.first; - type = tokenTypePair.second; + auto declInfo = ParseDeclarator(type); + auto tok = declInfo.tok; + type = declInfo.type; + auto attrs = declInfo.attrs; if (tok == nullptr) { Error(errTok_, "expect identifier or '('"); } - return ProcessDeclarator(tok, type, storageSpec, funcSpec, align); + return ProcessDeclarator(tok, type, attrs, storageSpec, funcSpec, align); } @@ -2654,18 +2664,20 @@ Identifier* Parser::GetBuiltin(const Token* tok) { */ // Attribute -void Parser::TryAttributeSpecList() { +ASTNode::AttrList Parser::TryAttributeSpecList() { + ASTNode::AttrList attrList; while (ts_.Try(Token::ATTRIBUTE)) - ParseAttributeSpec(); + ParseAttributeSpec(attrList); + return attrList; } -void Parser::ParseAttributeSpec() { +void Parser::ParseAttributeSpec(ASTNode::AttrList& attrList) { ts_.Expect('('); ts_.Expect('('); while (!ts_.Try(')')) { - ParseAttribute(); + attrList.push_back(ParseAttribute()); if (!ts_.Try(',')) { ts_.Expect(')'); break; @@ -2675,17 +2687,20 @@ void Parser::ParseAttributeSpec() { } -void Parser::ParseAttribute() { +ASTNode::Attr Parser::ParseAttribute() { + ASTNode::Attr ret; if (!ts_.Test(Token::IDENTIFIER)) - return; + return ret; auto tok = ts_.Next(); + ret.name = tok->str_; if (ts_.Try('(')) { if (ts_.Try(')')) - return; - auto tok = ts_.Next(); + return ret; + ret.vals.push_back(ParseExpr()); if (ts_.Test(',')) { while (ts_.Try(',')) {} } ts_.Try(')'); } + return ret; } From 8798d240dc016351abde9ce183ada020117d82da Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 23 Aug 2019 17:13:30 -0700 Subject: [PATCH 315/494] matmul test passes --- examples/cpp/dot.cpp | 14 ++-- include/triton/runtime/function.h | 45 +++++++---- lib/codegen/transform/peephole.cpp | 30 +++++++- lib/driver/module.cpp | 9 +-- lib/runtime/function.cpp | 115 +++++++++++++---------------- 5 files changed, 122 insertions(+), 91 deletions(-) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cpp index 
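[Editor's note, to make the grammar accepted by ParseAttributeSpec/ParseAttribute in the previous patch concrete: a double-parenthesized, comma-separated list of identifiers, each with an optional argument list. Below is a toy version over a pre-split token vector (well-formed input assumed, no error handling). Note that the real ParseAttribute currently records only the first argument expression and skips the rest, whereas this sketch keeps them all:

    #include <cstddef>
    #include <string>
    #include <vector>

    struct Attr { std::string name; std::vector<std::string> vals; };

    // tokens for `((one_of(128)))` -> {"(","(","one_of","(","128",")",")",")"}
    std::vector<Attr> parse_attr_spec(const std::vector<std::string>& t, std::size_t& i) {
      std::vector<Attr> out;
      i += 2;                                   // consume "(("
      while (t[i] != ")") {
        Attr a;
        a.name = t[i++];                        // attribute name
        if (t[i] == "(") {                      // optional argument list
          ++i;
          while (t[i] != ")") {
            a.vals.push_back(t[i++]);
            if (t[i] == ",") ++i;
          }
          ++i;                                  // consume ")"
        }
        out.push_back(a);
        if (t[i] == ",") ++i;                   // next attribute
      }
      i += 2;                                   // consume "))"
      return out;
    }

    int main() {
      std::vector<std::string> toks = {"(","(","one_of","(","128",")",")",")"};
      std::size_t i = 0;
      auto attrs = parse_attr_spec(toks, i);
      return attrs.size() == 1 && attrs[0].name == "one_of" ? 0 : 1;
    }
]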
edd9fa1a5..3122ead6f 100644 --- a/examples/cpp/dot.cpp +++ b/examples/cpp/dot.cpp @@ -82,13 +82,8 @@ R"( #define true 1 #define false 0 #define __bool_true_false_are_defined 1 - extern int get_program_id(int); -static const int TM __attribute__((one_of(128))); -static const int TN __attribute__((one_of(128))); -static const int TK __attribute__((one_of(32))); - void matmul(restrict )" + a_ty + R"( * A __attribute__((readonly, aligned(16))), restrict )" + b_ty + R"( * B __attribute__((readonly, aligned(16))), restrict )" + c_ty + R"( * C __attribute__((aligned(16))), @@ -162,10 +157,15 @@ perf_t do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int stream->write(dc, true, 0, hc); stream->synchronize(); // run - rt::function function(src(AT, BT, ty, ty, ty, 8, 8)); + rt::function::options_space_t opt; + opt.defines.push_back({"TM", {"128"}}); + opt.defines.push_back({"TN", {"128"}}); + opt.defines.push_back({"TK", {"32"}}); + opt.num_warps = {1, 2, 4, 8}; + rt::function function(src(AT, BT, ty, ty, ty, 8, 8), opt); auto ceil = [](size_t x, size_t y) { return (x + y - 1) / y; }; - auto grid = [&](const rt::params_t& x) { return rt::grid_t{ceil(M, x.at("TM")), ceil(N, x.at("TN")), 1}; }; + auto grid = [&](const rt::function::options_t& x) { return rt::grid_t{ceil(M, x.D("TM")), ceil(N, x.D("TN")), 1}; }; auto tflops = [&](double nanosec) { return 2.*M*N*K / nanosec * 1e-3; }; perf_t res; diff --git a/include/triton/runtime/function.h b/include/triton/runtime/function.h index 0a91984e2..63c91de9b 100644 --- a/include/triton/runtime/function.h +++ b/include/triton/runtime/function.h @@ -60,54 +60,69 @@ namespace runtime{ typedef std::array grid_t; typedef std::map params_t; -struct options { - size_t num_warps; - params_t params; -}; - +template T convert(const std::string& name); +template<> long convert(const std::string& name) { return std::stol(name); } +template<> int convert(const std::string& name) { return std::stoi(name); } class function { public: - typedef std::function grid_fn_ty; + struct options_space_t { + typedef std::pair> define_t; + std::vector defines; + std::vector num_warps; + }; + + struct options_t { + template + T D(const std::string& name) const { + return convert(defines.at(name)); + } + + std::map defines; + size_t num_warps; + }; + + typedef std::function grid_fn_ty; + private: class caller { public: - caller(ir::function *ir, std::shared_ptr program, size_t n_threads); + caller(ir::function *ir, std::shared_ptr program, const options_t& opt_); void operator()(driver::stream *stream, const std::array& grid, const std::vector& args) const; + const options_t opt() const { return opt_; } private: std::shared_ptr bin_; std::shared_ptr parent_; std::vector param_tys_; - size_t n_threads_; + options_t opt_; }; private: typedef std::pair> cache_key_t; - typedef std::pair cache_val_t; private: triton::lang::translation_unit *make_ast(const std::string &src); std::unique_ptr make_ir(Parser &parser); - options autotune(Parser &parser, driver::stream *stream, const grid_fn_ty& grid, const std::vector &args); - std::unique_ptr make_bin(ir::module &function, driver::context *context, const options &opt); + std::unique_ptr make_bin(ir::module &function, driver::context *context, const options_t &opt); + caller autotune(driver::stream *stream, const grid_fn_ty& grid, const std::vector &args); public: - function(const std::string& src); + function(const std::string& src, const options_space_t& opt = options_space_t()); void operator()(const std::vector& args, 
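[Editor's note: the runtime-side pattern the dot.cpp and function.h hunks above establish is that callers declare a tuning space up front and read the chosen values back inside the grid callback. A condensed usage sketch, not self-contained: it assumes the triton::runtime headers and that M/N are the problem sizes. One detail worth flagging: options_t::D is a function template backed by the convert<long>/convert<int> specializations, so the value type cannot be deduced from the string argument and must be spelled at the call site, e.g. D<int>:

    rt::function::options_space_t opt;
    opt.defines.push_back({"TM", {"128"}});      // candidate values per macro
    opt.defines.push_back({"TN", {"128"}});
    opt.defines.push_back({"TK", {"32"}});
    opt.num_warps = {1, 2, 4, 8};                // searched alongside the defines
    rt::function fn(src, opt);

    auto ceil = [](size_t x, size_t y) { return (x + y - 1) / y; };
    auto grid = [&](const rt::function::options_t& x) {
      // the autotuner hands the chosen configuration back here
      return rt::grid_t{ceil(M, x.D<int>("TM")), ceil(N, x.D<int>("TN")), 1};
    };
    fn(args, grid, stream);                      // args: the runtime argument vector
]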
const std::array& grid, driver::stream* stream); void operator()(const std::vector& args, const grid_fn_ty& grid, driver::stream *stream); std::string make_tensorflow_src(const std::vector &outputs, const std::string ¯o); private: - TokenSequence ts_; - Parser parser_; // execution context ir::context ctx_; // program representations std::string src_; - std::map cache_; + std::map cache_; + // options + options_space_t opt_space_; }; } diff --git a/lib/codegen/transform/peephole.cpp b/lib/codegen/transform/peephole.cpp index 73885c772..114eda0de 100644 --- a/lib/codegen/transform/peephole.cpp +++ b/lib/codegen/transform/peephole.cpp @@ -2,7 +2,7 @@ #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/codegen/transform/peephole.h" - +#include namespace triton { namespace codegen{ namespace transform{ @@ -145,6 +145,34 @@ bool peephole::rewrite_dot_fp32(ir::dot_inst *dot, ir::builder& builder, bool tr } bool peephole::rewrite_dot(ir::instruction *value, ir::builder& builder){ + // dot(a, b, 0) + c -> dot(a, b, c) + auto add = dynamic_cast(value); + if(add && add->get_op() == ir::binary_op_t::FAdd) { + ir::value *lhs = add->get_operand(0); + ir::value *rhs = add->get_operand(1); + ir::dot_inst *lhs_dot = dynamic_cast(lhs); + ir::dot_inst *rhs_dot = dynamic_cast(rhs); + if(!lhs_dot && !rhs_dot) + return false; + ir::dot_inst *dot = lhs_dot ? lhs_dot : rhs_dot; + ir::value *other = (dot == lhs) ? rhs : lhs; + ir::value *acc = dot->get_operand(2); + ir::splat_inst *splat = dynamic_cast(acc); + ir::constant_fp *_0 = nullptr; + if(splat) + _0 = dynamic_cast(splat->get_operand(0)); + if(!(_0 && _0->get_value() == 0.0)) + return false; + ir::value *a = dot->get_operand(0); + ir::value *b = dot->get_operand(1); + ir::value * new_dot = builder.insert(ir::dot_inst::create(a, b, other, + dot->is_a_trans(), dot->is_b_trans(), + dot->get_name())); + add->replace_all_uses_with(new_dot); + return true; + } + + // dot(a, b, c) auto dot = dynamic_cast(value); if(!dot) return false; diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index 7711b6d24..6195a1249 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -106,10 +106,10 @@ void module::compile_llvm_module(llvm::Module* module, const std::string& triple file_type_t ft) { init_llvm(); // debug - llvm::legacy::PassManager pm; - pm.add(llvm::createPrintModulePass(llvm::outs())); - pm.add(llvm::createVerifierPass()); - pm.run(*module); +// llvm::legacy::PassManager pm; +// pm.add(llvm::createPrintModulePass(llvm::outs())); +// pm.add(llvm::createVerifierPass()); +// pm.run(*module); // create machine module->setTargetTriple(triple); std::string error; @@ -221,7 +221,6 @@ ocl_module::ocl_module(driver::context * context, llvm::Module* src): module(con catch(...){ char log[2048]; dispatch::clGetProgramBuildInfo(*cl_, *context->device()->cl(), CL_PROGRAM_BUILD_LOG, 1024, log, NULL); - std::cout << log << std::endl; throw; } } diff --git a/lib/runtime/function.cpp b/lib/runtime/function.cpp index 598ae146b..36bbc2100 100644 --- a/lib/runtime/function.cpp +++ b/lib/runtime/function.cpp @@ -17,7 +17,6 @@ #include "triton/ir/function.h" #include "triton/tools/bench.hpp" #include "llvm/IR/Module.h" -#include "triton/ir/print.h" typedef struct yy_buffer_state * YY_BUFFER_STATE; @@ -91,8 +90,8 @@ arg_type convert(ir::type *ty) { throw std::runtime_error("unknown type"); } -function::caller::caller(ir::function *ir, std::shared_ptr parent, size_t n_threads) - : bin_(driver::kernel::create(&*parent, 
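[Editor's note on the peephole hunk above: since dot(a, b, acc) computes a@b + acc, an FAdd with one operand that is a dot whose accumulator is a splat of 0.0 can fold the other operand into the accumulator slot; the matcher checks both FAdd operands, so commutativity is covered. The scalar identity being exploited, as a runnable check (the guard requires the accumulator to be literally splat(0.0) precisely so that no real addend is lost):

    #include <cassert>

    int main() {
      double a = 3.0, b = 4.0, c = 5.0;
      double unfused = (a * b + 0.0) + c;  // dot(a, b, splat(0)) + c
      double fused   = a * b + c;          // dot(a, b, c) after the rewrite
      assert(fused == unfused);
    }
]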
ir->get_name().c_str())), parent_(parent), n_threads_(n_threads) { +function::caller::caller(ir::function *ir, std::shared_ptr parent, const options_t& opt) + : bin_(driver::kernel::create(&*parent, ir->get_name().c_str())), parent_(parent), opt_(opt) { // extract signature ir::function_type* ty = ir->get_fn_type(); for(size_t i = 0; i < ty->get_num_params(); i++) @@ -113,12 +112,10 @@ void function::caller::operator ()(driver::stream *stream, const std::arraysetArg(i, size_of(ty), arg_i.data()); } - stream->enqueue(&*bin_, grid, {n_threads_, 1, 1}); + stream->enqueue(&*bin_, grid, {opt_.num_warps * 32, 1, 1}); } - - std::unique_ptr function::make_ir(Parser& parser) { // create Triton-IR from AST ir::module* module = new ir::module("", ctx_); @@ -127,59 +124,59 @@ std::unique_ptr function::make_ir(Parser& parser) { return std::unique_ptr(module); } -options function::autotune(Parser& parser, driver::stream* stream, const grid_fn_ty& grid_fn, const std::vector& args) { - std::unique_ptr ir = make_ir(parser); - // extract tunable values - std::vector> values; - for(auto it: ir->globals()) - if(auto *mp = dynamic_cast(it.second)) - values.push_back({it.first, mp}); - // extract search space - std::vector> space; - space.push_back({1, 2, 4, 8}); // num warps - for(auto it: values) - space.push_back(it.second->get_space()); + +function::caller function::autotune(driver::stream* stream, const grid_fn_ty& grid_fn, + const std::vector& args) { + + // all tuning parameters are strings + std::vector num_warps; + for(size_t i: opt_space_.num_warps) + num_warps.push_back(std::to_string(i)); + std::vector> space; + space.push_back(num_warps); + for(const auto& i: opt_space_.defines) + space.push_back(i.second); + // exhaustive search - struct profile_t{ - double ts; - std::vector params; - }; - profile_t best = { INFINITY, {} }; - std::function)> benchmark = - [&](std::vector params) { - // options - options opt; + double best_ts = INFINITY; + std::unique_ptr ret; + + auto benchmark = [&](std::vector params) { + // extract options + options_t opt; unsigned i = 0; - opt.num_warps = params[i++]; - for(auto it: values) - opt.params[it.first] = params[i++]; - // make binary + opt.num_warps = std::stoi(params[i++]); + for(auto it: opt_space_.defines) + opt.defines[it.first] = params[i++]; + + // pre-process + TokenSequence tokens; + Preprocessor cpp(&src_, true); + for(auto it: opt_space_.defines) + cpp.AddMacro(it.first, &opt.defines.at(it.first)); + cpp.Process(tokens); + // parse + Parser parser(tokens); + parser.Parse(); + // triton-ir code-gen auto ir = make_ir(parser); + // binary code-gen auto bin = make_bin(*ir, stream->context(), opt); // benchmark ir::function *tmp = ir->get_function_list()[0]; - caller fn(tmp, std::move(bin), opt.num_warps * 32); - double ts = tools::bench([&]() { fn(stream, grid_fn(opt.params), args); }, stream); - if(ts < best.ts) - best = {ts, params}; + caller call(tmp, std::move(bin), opt); + double ts = tools::bench([&]() { call(stream, grid_fn(opt), args); }, stream); + // save best + if(ts < best_ts) + ret.reset(new caller(call)); }; - _parallel_loop_nest(space, benchmark, 1); - // populate options - unsigned current = 0; - options opt; - opt.num_warps = best.params[current++]; - for(auto it: values) - opt.params[it.first] = best.params[current++]; - return opt; + _parallel_loop_nest(space, benchmark, 1); + return *ret; } -std::unique_ptr function::make_bin(ir::module &module, driver::context *context, const options& opt) { +std::unique_ptr function::make_bin(ir::module 
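[Editor's note, two points on the autotune rewrite above. First, _parallel_loop_nest (defined elsewhere, not shown in this patch) enumerates the cartesian product of the option space; a self-contained recursive equivalent is sketched below. Second, as written the hunk never updates best_ts after initializing it to INFINITY, so `ts < best_ts` holds for every finite timing and `ret` ends up holding the last configuration benchmarked rather than the fastest; the sketch shows the keep-the-best bookkeeping with both assignments:

    #include <cmath>
    #include <cstddef>
    #include <functional>
    #include <string>
    #include <vector>

    using Config = std::vector<std::string>;

    // Recursive cartesian-product walk over space[0] x space[1] x ...
    void loop_nest(const std::vector<std::vector<std::string>>& space,
                   const std::function<void(const Config&)>& visit,
                   Config& cur, std::size_t depth = 0) {
      if (depth == space.size()) { visit(cur); return; }
      for (const std::string& v : space[depth]) {
        cur.push_back(v);
        loop_nest(space, visit, cur, depth + 1);
        cur.pop_back();
      }
    }

    int main() {
      std::vector<std::vector<std::string>> space = {{"1", "2", "4", "8"},  // num_warps
                                                     {"64", "128"}};        // e.g. TM
      double best_ts = INFINITY;
      Config best, cur;
      loop_nest(space, [&](const Config& cfg) {
        double ts = static_cast<double>(cfg.size());     // stand-in for tools::bench
        if (ts < best_ts) { best_ts = ts; best = cfg; }  // update *both*
      }, cur);
      return best.size() == 2 ? 0 : 1;
    }
]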
&module, driver::context *context, const options_t& opt) { std::unique_ptr target = context->device()->make_target(); - // update metaparameter values - for(auto x: opt.params) - if(auto* mp = dynamic_cast(module.globals().at(x.first))) - mp->set_value(x.second); // create passes codegen::analysis::grids tune(opt.num_warps); codegen::analysis::shmem::info shmem_info; @@ -210,8 +207,6 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c // generate llvm code llvm::LLVMContext ctx; std::unique_ptr llvm(new llvm::Module(module.get_name(), ctx)); - ir::print(module, std::cout); - exit(EXIT_FAILURE); selection.run(module, *llvm); // return binary std::unique_ptr res(driver::module::create(context, llvm.get())); @@ -219,11 +214,8 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c } -function::function(const std::string &src): parser_(ts_), src_(src){ - Preprocessor cpp(&src_, true); - cpp.Process(ts_); - ts_.Print(); - parser_.Parse(); +function::function(const std::string &src, const options_space_t& opt): src_(src), opt_space_(opt) { + } void function::operator()(const std::vector& args, const grid_fn_ty& grid_fn, driver::stream *stream) { @@ -244,21 +236,18 @@ void function::operator()(const std::vector& args, const grid_fn_ty& grid_f /* find existing configuration */ auto it = cache_.find(key); if(it != cache_.end()){ - it->second.second(stream, grid_fn(it->second.first.params), args); + it->second(stream, grid_fn(it->second.opt()), args); return; } /* re-tune and re-compile */ - options opt = autotune(parser_, stream, grid_fn, args); - std::unique_ptr ir = make_ir(parser_); - std::unique_ptr bin = make_bin(*ir, stream->context(), opt); - ir::function* fn = ir->get_function_list().front(); - const caller& run = cache_.insert({key, cache_val_t{opt, caller(fn, std::move(bin), opt.num_warps*32)}}).first->second.second; - run(stream, grid_fn(opt.params), args); + caller call = autotune(stream, grid_fn, args); + cache_.insert({key, call}); + } void function::operator()(const std::vector& args, const grid_t& grid, driver::stream *stream) { - return this->operator()(args, [&grid](const params_t&){ return grid; }, stream); + return this->operator()(args, [&grid](const options_t&){ return grid; }, stream); } } From f98b0b8e2ae9dabdd1b0a09aac28625fe884abde Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 23 Aug 2019 17:28:02 -0700 Subject: [PATCH 316/494] [general] deleted the old compiler frontend --- CMakeLists.txt | 16 +- include/triton/driver/helpers/CL/infos.hpp | 413 ------------------- include/triton/lang/{wgtcc => }/ast.h | 0 include/triton/lang/{wgtcc => }/code_gen.h | 0 include/triton/lang/{wgtcc => }/cpp.h | 0 include/triton/lang/declaration.h | 265 ------------ include/triton/lang/{wgtcc => }/encoding.h | 0 include/triton/lang/error.h | 23 +- include/triton/lang/{wgtcc => }/evaluator.h | 0 include/triton/lang/expression.h | 357 ---------------- include/triton/lang/lang.h | 13 - include/triton/lang/{wgtcc => }/mem_pool.h | 0 include/triton/lang/module.h | 30 -- include/triton/lang/node.h | 72 ---- include/triton/lang/ops.h | 54 --- include/triton/lang/{wgtcc => }/parser.h | 0 include/triton/lang/parser.y | 424 -------------------- include/triton/lang/{wgtcc => }/scanner.h | 0 include/triton/lang/scanner.l | 119 ------ include/triton/lang/{wgtcc => }/scope.h | 0 include/triton/lang/statement.h | 115 ------ include/triton/lang/{wgtcc => }/token.h | 0 include/triton/lang/{wgtcc => }/type.h | 0 include/triton/lang/{wgtcc => }/visitor.h | 0 
include/triton/lang/wgtcc/error.h | 15 - include/triton/runtime/function.h | 2 +- lib/driver/device.cpp | 7 +- lib/lang/{wgtcc => }/ast.cc | 12 +- lib/lang/{wgtcc => }/code_gen.cc | 8 +- lib/lang/{wgtcc => }/cpp.cc | 8 +- lib/lang/declaration.cpp | 241 ----------- lib/lang/{wgtcc => }/encoding.cc | 2 +- lib/lang/{wgtcc => }/error.cc | 6 +- lib/lang/error.cpp | 50 --- lib/lang/{wgtcc => }/evaluator.cc | 6 +- lib/lang/expression.cpp | 359 ----------------- lib/lang/module.cpp | 18 - lib/lang/node.cpp | 164 -------- lib/lang/{wgtcc => }/parser.cc | 14 +- lib/lang/{wgtcc => }/scanner.cc | 2 +- lib/lang/{wgtcc => }/scope.cc | 4 +- lib/lang/statement.cpp | 161 -------- lib/lang/{wgtcc => }/token.cc | 6 +- lib/lang/{wgtcc => }/type.cc | 8 +- lib/lang/wgtcc/main.cc | 30 -- lib/runtime/function.cpp | 19 +- 46 files changed, 59 insertions(+), 2984 deletions(-) delete mode 100644 include/triton/driver/helpers/CL/infos.hpp rename include/triton/lang/{wgtcc => }/ast.h (100%) rename include/triton/lang/{wgtcc => }/code_gen.h (100%) rename include/triton/lang/{wgtcc => }/cpp.h (100%) delete mode 100644 include/triton/lang/declaration.h rename include/triton/lang/{wgtcc => }/encoding.h (100%) rename include/triton/lang/{wgtcc => }/evaluator.h (100%) delete mode 100644 include/triton/lang/expression.h delete mode 100644 include/triton/lang/lang.h rename include/triton/lang/{wgtcc => }/mem_pool.h (100%) delete mode 100644 include/triton/lang/module.h delete mode 100644 include/triton/lang/node.h delete mode 100644 include/triton/lang/ops.h rename include/triton/lang/{wgtcc => }/parser.h (100%) delete mode 100644 include/triton/lang/parser.y rename include/triton/lang/{wgtcc => }/scanner.h (100%) delete mode 100644 include/triton/lang/scanner.l rename include/triton/lang/{wgtcc => }/scope.h (100%) delete mode 100644 include/triton/lang/statement.h rename include/triton/lang/{wgtcc => }/token.h (100%) rename include/triton/lang/{wgtcc => }/type.h (100%) rename include/triton/lang/{wgtcc => }/visitor.h (100%) delete mode 100644 include/triton/lang/wgtcc/error.h rename lib/lang/{wgtcc => }/ast.cc (99%) rename lib/lang/{wgtcc => }/code_gen.cc (99%) rename lib/lang/{wgtcc => }/cpp.cc (99%) delete mode 100644 lib/lang/declaration.cpp rename lib/lang/{wgtcc => }/encoding.cc (96%) rename lib/lang/{wgtcc => }/error.cc (94%) delete mode 100644 lib/lang/error.cpp rename lib/lang/{wgtcc => }/evaluator.cc (97%) delete mode 100644 lib/lang/expression.cpp delete mode 100644 lib/lang/module.cpp delete mode 100644 lib/lang/node.cpp rename lib/lang/{wgtcc => }/parser.cc (99%) rename lib/lang/{wgtcc => }/scanner.cc (99%) rename lib/lang/{wgtcc => }/scope.cc (96%) delete mode 100644 lib/lang/statement.cpp rename lib/lang/{wgtcc => }/token.cc (98%) rename lib/lang/{wgtcc => }/type.cc (98%) delete mode 100644 lib/lang/wgtcc/main.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 5faf6c8bb..b57c0859d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,18 +7,6 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") option(BUILD_EXAMPLES "Build C++ Triton examples" ON) option(BUILD_PYTHON_MODULE "Build Python Triton bindings" OFF) -# FLEX/YACC -find_package(BISON) -find_package(FLEX) -BISON_TARGET(Parser ${CMAKE_CURRENT_SOURCE_DIR}/include/triton/lang/parser.y ${CMAKE_CURRENT_SOURCE_DIR}/lib/lang/parser.cpp) -FLEX_TARGET(Lexer ${CMAKE_CURRENT_SOURCE_DIR}/include/triton/lang/scanner.l ${CMAKE_CURRENT_SOURCE_DIR}/lib/lang/scanner.cpp) -get_filename_component(BISON_Parser_INCLUDE_DIRECTORIES 
${BISON_Parser_OUTPUT_HEADER} DIRECTORY) -include_directories(${BISON_Parser_INCLUDE_DIRECTORIES}) - -#execute_process(COMMAND python -c "import tensorflow as tf; print(tf.__cxx11_abi_flag__ if \"__cxx11_abi_flag__\" in tf.__dict__ else 0)" -# OUTPUT_VARIABLE TF_ABI OUTPUT_STRIP_TRAILING_WHITESPACE) -#add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) - # LLVM find_package(LLVM REQUIRED CONFIG) include_directories(${LLVM_INCLUDE_DIRS}) @@ -32,7 +20,7 @@ if(NOT CMAKE_BUILD_TYPE) endif() # Gather headers for cmake-based IDEs -file( GLOB_RECURSE ALL_SRC *.cpp *.hpp *.h *.py *.y *.l CMakeLists*) +file( GLOB_RECURSE ALL_SRC *.cpp *.hpp *.h *.py CMakeLists*) add_custom_target( ALL SOURCES ${ALL_SRC} ) # Compiler flags @@ -63,7 +51,7 @@ endif() # Triton file(GLOB_RECURSE LIBTRITON_SRC lib/*.cpp lib/*.cc) -add_library(triton SHARED ${LIBTRITON_SRC} ${EIGHTCC_SRC} ${PYTHON_SRC} ${BISON_Parser_OUTPUTS} ${FLEX_Lexer_OUTPUTS}) +add_library(triton SHARED ${LIBTRITON_SRC} ${EIGHTCC_SRC} ${PYTHON_SRC}) target_link_libraries(triton LLVM) # Warning level diff --git a/include/triton/driver/helpers/CL/infos.hpp b/include/triton/driver/helpers/CL/infos.hpp deleted file mode 100644 index dcd80928c..000000000 --- a/include/triton/driver/helpers/CL/infos.hpp +++ /dev/null @@ -1,413 +0,0 @@ -#ifndef ISAAC_DRIVER_HELPERS_OCL_INFOS_HPP_ -#define ISAAC_DRIVER_HELPERS_OCL_INFOS_HPP_ - -/* ========================================================================= - Copyright (c) 2010-2012, Institute for Microelectronics, - Institute for Analysis and Scientific Computing, - TU Wien. - - ----------------- - ViennaCL - The Vienna Computing Library - ----------------- - - Project Head: Karl Rupp rupp@iue.tuwien.ac.at - - (A list of authors and contributors can be found in the PDF manual) - - License: MIT (X11), see file LICENSE in the base directory -============================================================================= */ - - - -#include "triton/driver/error.h" -#include -#include - -namespace triton -{ -namespace driver -{ -namespace ocl -{ - - /** @brief Implementation details for the OpenCL managment layer in ViennaCL */ -namespace detail{ - -/** @brief Helper class for obtaining informations from the OpenCL backend. Deprecated! 
*/ -template -struct info; - -/** \cond */ -template<> -struct info -{ - typedef cl_mem_info type; - - static void get(cl_mem handle, cl_mem_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret) - { - cl_int err = dispatch::clGetMemObjectInfo(handle,param_name,param_value_size,param_value,param_value_size_ret); - check(err); - } -}; - -template<> -struct info -{ - typedef cl_device_info type; - - static void get(cl_device_id handle, cl_device_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret) - { - cl_int err = dispatch::clGetDeviceInfo(handle,param_name,param_value_size,param_value,param_value_size_ret); - check(err); - } -}; - -template<> -struct info -{ - typedef cl_kernel_info type; - - static void get(cl_kernel handle, cl_kernel_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){ - cl_int err = dispatch::clGetKernelInfo(handle,param_name,param_value_size,param_value,param_value_size_ret); - check(err); - } - - static void get(cl_kernel handle, cl_device_id dev_id, cl_kernel_work_group_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){ - cl_int err = dispatch::clGetKernelWorkGroupInfo(handle, dev_id, param_name,param_value_size,param_value,param_value_size_ret); - check(err); - } -}; - -template<> -struct info -{ - typedef cl_context_info type; - - static void get(cl_context handle, cl_context_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){ - cl_int err = dispatch::clGetContextInfo(handle,param_name,param_value_size,param_value,param_value_size_ret); - check(err); - } -}; - -template<> -struct info -{ - typedef cl_program_info type; - - static void get(cl_program handle, cl_program_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){ - cl_int err = dispatch::clGetProgramInfo(handle,param_name,param_value_size,param_value,param_value_size_ret); - check(err); - } - - static void get(cl_program handle, cl_device_id device, cl_program_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){ - cl_int err = dispatch::clGetProgramBuildInfo(handle,device,param_name,param_value_size,param_value,param_value_size_ret); - check(err); - } -}; - - -template<> -struct info -{ - typedef cl_profiling_info type; - static void get(cl_event handle, cl_profiling_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){ - cl_int err = dispatch::clGetEventProfilingInfo(handle,param_name,param_value_size,param_value,param_value_size_ret); - check(err); - } -}; - -template<> -struct info -{ - typedef cl_command_queue_info type; - static void get(cl_command_queue handle, cl_profiling_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){ - cl_int err = dispatch::clGetCommandQueueInfo(handle,param_name,param_value_size,param_value,param_value_size_ret); - check(err); - } -}; - -template<> -struct info -{ - typedef cl_command_queue_info type; - static void get(cl_platform_id handle, cl_profiling_info param_name,size_t param_value_size,void *param_value,size_t *param_value_size_ret){ - cl_int err = dispatch::clGetPlatformInfo(handle,param_name,param_value_size,param_value,param_value_size_ret); - check(err); - } -}; - -//Info getter -//Some intelligence is needed for some types -template -struct get_info_impl{ - - template - RES_T operator()(MEM_T const & mem, INFO_T const & info){ - RES_T res; - 
detail::info::get(mem,info,sizeof(RES_T),&res,NULL); - return res; - } - - template - RES_T operator()(MEM_T const & mem, ARG_MEM_T const & arg_mem, INFO_T const & info){ - RES_T res; - detail::info::get(mem,arg_mem, info,sizeof(RES_T),&res,NULL); - return res; - } -}; - -template<> -struct get_info_impl{ - - template - std::string operator()(const MEM_T &mem, const INFO_T &info){ - char buff[1024]; - detail::info::get(mem,info,1024,buff,NULL); - return std::string(buff); - } - - template - std::string operator()(MEM_T const & mem, ARG_MEM_T const & arg_mem, INFO_T const & info){ - char buff[1024]; - detail::info::get(mem,arg_mem,info,1024,buff,NULL); - return std::string(buff); - } -}; - -template -struct get_info_impl > -{ - template - std::vector operator()(const MEM_T &mem, const INFO_T &info) - { - size_t vec_size; - detail::info::get(mem,info,0,NULL,&vec_size); - std::vector res(vec_size/sizeof(T)); - detail::info::get(mem,info,vec_size,res.data(),NULL); - return res; - } - - template - std::vector operator()(MEM_T const & mem, ARG_MEM_T const & arg_mem, INFO_T const & info) - { - size_t vec_size; - detail::info::get(mem,arg_mem,info,0,NULL,&vec_size); - std::vector res(vec_size/sizeof(T)); - detail::info::get(mem,arg_mem,info,vec_size,res.data(),NULL); - return res; - } -}; - -template::type param> -struct return_type; -/** \endcond */ - -/** \cond */ - #define SET_INFO_RETURN_TYPE(DATA_TYPE,NAME,RETURN_TYPE) template<> struct return_type { typedef RETURN_TYPE Result; } - -SET_INFO_RETURN_TYPE(cl_command_queue, CL_QUEUE_CONTEXT, cl_context); -SET_INFO_RETURN_TYPE(cl_command_queue, CL_QUEUE_DEVICE, cl_device_id); -SET_INFO_RETURN_TYPE(cl_command_queue, CL_QUEUE_REFERENCE_COUNT, cl_uint); -SET_INFO_RETURN_TYPE(cl_command_queue, CL_QUEUE_PROPERTIES, cl_command_queue_properties); - -SET_INFO_RETURN_TYPE(cl_context, CL_CONTEXT_DEVICES, std::vector); -SET_INFO_RETURN_TYPE(cl_context, CL_CONTEXT_NUM_DEVICES, cl_uint); -SET_INFO_RETURN_TYPE(cl_context, CL_CONTEXT_REFERENCE_COUNT, cl_uint); -SET_INFO_RETURN_TYPE(cl_context, CL_CONTEXT_PROPERTIES, cl_context_properties); - -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_ADDRESS_BITS, cl_uint); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_AVAILABLE, cl_bool); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_COMPILER_AVAILABLE, cl_bool); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, cl_uint); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_WAVEFRONT_WIDTH_AMD, cl_uint); - -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_ENDIAN_LITTLE, cl_bool); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_ERROR_CORRECTION_SUPPORT, cl_bool); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_EXECUTION_CAPABILITIES, cl_device_exec_capabilities); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_EXTENSIONS, std::string); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong); -//SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_IMAGE_SUPPORT, cl_bool); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_IMAGE2D_MAX_HEIGHT , size_t); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_IMAGE2D_MAX_WIDTH , size_t); 
-SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_IMAGE3D_MAX_DEPTH , size_t); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_IMAGE3D_MAX_HEIGHT , size_t); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_IMAGE3D_MAX_WIDTH , size_t); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_LOCAL_MEM_TYPE, cl_device_local_mem_type); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MAX_CLOCK_FREQUENCY , cl_uint); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MAX_COMPUTE_UNITS , cl_uint); //The minimum value is 1 -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MAX_CONSTANT_ARGS , cl_uint); //The minimum value is 8 -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE , cl_ulong); //The minimum value is 64 KB -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MAX_MEM_ALLOC_SIZE , cl_ulong); //The minimum value is max (1/4th of CL_DEVICE_GLOBAL_MEM_SIZE, 128*1024*1024) -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MAX_PARAMETER_SIZE , size_t); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MAX_READ_IMAGE_ARGS , cl_uint); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MAX_SAMPLERS , cl_uint); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE , size_t); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS , cl_uint); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MAX_WORK_ITEM_SIZES , std::vector); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MAX_WRITE_IMAGE_ARGS , cl_uint); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MEM_BASE_ADDR_ALIGN , cl_uint); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE , cl_uint); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_NAME , std::string); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_PLATFORM , cl_platform_id); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR , cl_uint); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT , cl_uint); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT , cl_uint); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT , cl_uint); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE , cl_uint); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_PROFILE , std::string); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_PROFILING_TIMER_RESOLUTION , size_t); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_QUEUE_PROPERTIES , cl_command_queue_properties); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_SINGLE_FP_CONFIG , cl_device_fp_config); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_TYPE , cl_device_type); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_VENDOR , std::string); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_VENDOR_ID , cl_uint); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DEVICE_VERSION , std::string); -SET_INFO_RETURN_TYPE(cl_device_id, CL_DRIVER_VERSION , std::string); - -SET_INFO_RETURN_TYPE(cl_event, CL_PROFILING_COMMAND_QUEUED, cl_ulong); -SET_INFO_RETURN_TYPE(cl_event, CL_PROFILING_COMMAND_SUBMIT, cl_ulong); -SET_INFO_RETURN_TYPE(cl_event, CL_PROFILING_COMMAND_START, cl_ulong); -SET_INFO_RETURN_TYPE(cl_event, CL_PROFILING_COMMAND_END, cl_ulong); - -SET_INFO_RETURN_TYPE(cl_kernel,CL_KERNEL_FUNCTION_NAME, std::string); -SET_INFO_RETURN_TYPE(cl_kernel,CL_KERNEL_NUM_ARGS, cl_uint); -SET_INFO_RETURN_TYPE(cl_kernel,CL_KERNEL_REFERENCE_COUNT, cl_uint); -SET_INFO_RETURN_TYPE(cl_kernel,CL_KERNEL_CONTEXT, cl_context); -SET_INFO_RETURN_TYPE(cl_kernel,CL_KERNEL_PROGRAM, cl_program); - - 
-SET_INFO_RETURN_TYPE(cl_kernel,CL_KERNEL_WORK_GROUP_SIZE, size_t); -SET_INFO_RETURN_TYPE(cl_kernel,CL_KERNEL_COMPILE_WORK_GROUP_SIZE, std::vector); -SET_INFO_RETURN_TYPE(cl_kernel,CL_KERNEL_LOCAL_MEM_SIZE, cl_ulong); -SET_INFO_RETURN_TYPE(cl_kernel,CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, size_t); - -SET_INFO_RETURN_TYPE(cl_mem,CL_MEM_TYPE, cl_mem_object_type); -SET_INFO_RETURN_TYPE(cl_mem,CL_MEM_FLAGS, cl_mem_flags); -SET_INFO_RETURN_TYPE(cl_mem,CL_MEM_SIZE, size_t); -SET_INFO_RETURN_TYPE(cl_mem,CL_MEM_HOST_PTR, void*); -SET_INFO_RETURN_TYPE(cl_mem,CL_MEM_MAP_COUNT, cl_uint); -SET_INFO_RETURN_TYPE(cl_mem,CL_MEM_REFERENCE_COUNT, cl_uint); -SET_INFO_RETURN_TYPE(cl_mem,CL_MEM_CONTEXT, cl_context); - -SET_INFO_RETURN_TYPE(cl_program,CL_PROGRAM_CONTEXT,cl_context); -SET_INFO_RETURN_TYPE(cl_program,CL_PROGRAM_DEVICES,std::vector); -SET_INFO_RETURN_TYPE(cl_program,CL_PROGRAM_NUM_DEVICES,cl_uint); -SET_INFO_RETURN_TYPE(cl_program,CL_PROGRAM_SOURCE,std::string); -SET_INFO_RETURN_TYPE(cl_program,CL_PROGRAM_BINARY_SIZES,std::vector); -SET_INFO_RETURN_TYPE(cl_program,CL_PROGRAM_BINARIES,std::vector); -//Build -SET_INFO_RETURN_TYPE(cl_program,CL_PROGRAM_BUILD_STATUS, cl_build_status); -SET_INFO_RETURN_TYPE(cl_program,CL_PROGRAM_BUILD_OPTIONS, std::string); -SET_INFO_RETURN_TYPE(cl_program,CL_PROGRAM_BUILD_LOG, std::string); - -SET_INFO_RETURN_TYPE(cl_platform_id,CL_PLATFORM_PROFILE, std::string); -SET_INFO_RETURN_TYPE(cl_platform_id,CL_PLATFORM_VERSION, std::string); -SET_INFO_RETURN_TYPE(cl_platform_id,CL_PLATFORM_NAME, std::string); -SET_INFO_RETURN_TYPE(cl_platform_id,CL_PLATFORM_VENDOR, std::string); -SET_INFO_RETURN_TYPE(cl_platform_id,CL_PLATFORM_EXTENSIONS, std::string); - -#undef SET_INFO_RETURN_TYPE - - /** \endcond */ -} - -template -typename detail::return_type::Result info(cl_device_id const & handle){ - typedef typename detail::return_type::Result res_t; - return detail::get_info_impl()(handle,param); -} - -template -typename detail::return_type::Result info(cl_mem const & handle){ - typedef typename detail::return_type::Result res_t; - return detail::get_info_impl()(handle,param); -} - -//Program - -template -typename detail::return_type::Result info(cl_program const & handle){ - typedef typename detail::return_type::Result res_t; - return detail::get_info_impl()(handle,param); -} - -template<> -inline typename detail::return_type::Result info(cl_program const & handle) -{ - std::vector res; - std::vector sizes = info(handle); - for(size_t s: sizes) - res.push_back(new unsigned char[s]); - dispatch::clGetProgramInfo(handle, CL_PROGRAM_BINARIES, sizeof(unsigned char**), (void*)res.data(), NULL); - return res; -} - -template -typename detail::return_type::Result info(cl_program const & phandle, cl_device_id const & dhandle){ - typedef typename detail::return_type::Result res_t; - return detail::get_info_impl()(phandle,dhandle,param); -} - -//Kernel -template -typename detail::return_type::Result info(cl_kernel const & handle){ - typedef typename detail::return_type::Result res_t; - return detail::get_info_impl()(handle,param); -} - -template -typename detail::return_type::Result info(cl_kernel const & khandle, cl_device_id const & dhandle){ - typedef typename detail::return_type::Result res_t; - return detail::get_info_impl()(khandle,dhandle,param); -} - -//Context -template -typename detail::return_type::Result info(cl_context const & handle){ - typedef typename detail::return_type::Result res_t; - return detail::get_info_impl()(handle,param); -} - -//Event -template -typename 
detail::return_type::Result info(cl_event const & handle){ - typedef typename detail::return_type::Result res_t; - return detail::get_info_impl()(handle,param); -} - -//Command queue -template -typename detail::return_type::Result info(cl_command_queue const & handle){ - typedef typename detail::return_type::Result res_t; - return detail::get_info_impl()(handle,param); -} - -//Plaftform -template -typename detail::return_type::Result info(cl_platform_id const & handle){ - typedef typename detail::return_type::Result res_t; - return detail::get_info_impl()(handle,param); -} - -template::type param> -typename detail::return_type::Result info(OCL_TYPE const & handle){ - return info(handle.get()); -} - - - -template::type param> -typename detail::return_type::Result info(OCL_TYPE const & handle, OCL_TYPE_ARG const & arg_handle){ - return info(handle.get(), arg_handle.get()); -} - -} -} -} -#endif // INFOS_HPP diff --git a/include/triton/lang/wgtcc/ast.h b/include/triton/lang/ast.h similarity index 100% rename from include/triton/lang/wgtcc/ast.h rename to include/triton/lang/ast.h diff --git a/include/triton/lang/wgtcc/code_gen.h b/include/triton/lang/code_gen.h similarity index 100% rename from include/triton/lang/wgtcc/code_gen.h rename to include/triton/lang/code_gen.h diff --git a/include/triton/lang/wgtcc/cpp.h b/include/triton/lang/cpp.h similarity index 100% rename from include/triton/lang/wgtcc/cpp.h rename to include/triton/lang/cpp.h diff --git a/include/triton/lang/declaration.h b/include/triton/lang/declaration.h deleted file mode 100644 index e406f00d8..000000000 --- a/include/triton/lang/declaration.h +++ /dev/null @@ -1,265 +0,0 @@ -#ifndef TRITON_INCLUDE_LANG_DECLARATION_H -#define TRITON_INCLUDE_LANG_DECLARATION_H - -#include "node.h" -#include - -namespace triton{ - - -namespace ir{ - class function; - class value; - class type; - class builder; - class module; -} - -namespace lang{ - -class expression; -class pointer; -class identifier; -class constant; -class compound_statement; -class initializer; -class declaration_specifier; - - -class declaration: public block_item{ -public: - declaration(node *spec, node *init) - : spec_((declaration_specifier*)spec), init_((list*)init) { } - - ir::value* codegen(ir::module * mod) const; - -public: - const declaration_specifier *spec_; - const list *init_; -}; - -// Types -class modifier: public node { -public: - virtual bool is_cst_space() const { return false; } - virtual bool is_tunable() const { return false; } - virtual bool is_cst() const { return false; } - virtual bool is_multiple_of() const { return false; } - virtual void add_attr(ir::function* fn, size_t pos) = 0; - virtual void add_metadata(ir::module* mod, std::string name) = 0; -}; - -class storage_specifier: public modifier { -public: - storage_specifier(STORAGE_SPEC_T value): value_(value) {} - STORAGE_SPEC_T value() const { return value_; } - bool is_cst_space() const { return value_ == CONSTANT_SPACE_T; } - bool is_tunable() const { return value_ == TUNABLE_T; } - bool is_cst() const { return value_ == CONST_T; } - void add_attr(ir::function* fn, size_t pos); - void add_metadata(ir::module* mod, std::string name); - -private: - const STORAGE_SPEC_T value_; -}; - -class alignment_specifier: public modifier { -public: - alignment_specifier(node* value): cst_((constant*)value) { } - void add_attr(ir::function* fn, size_t pos); - void add_metadata(ir::module* mod, std::string name); - -private: - constant* cst_; -}; - -class multiple_of_specifier: public modifier { 
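[Editor's note: the deleted infos.hpp above was built on one idiom worth recording before it disappears: map each OpenCL query enum to its result type at compile time (the SET_INFO_RETURN_TYPE table), so that info<CL_DEVICE_NAME>(dev) returns a std::string while info<CL_DEVICE_MAX_COMPUTE_UNITS>(dev) returns a cl_uint. A self-contained sketch of the pattern with a toy device in place of the OpenCL API:

    #include <string>

    enum Param { NAME, COMPUTE_UNITS };

    template <Param P> struct return_type;  // specialized per query below
    template <> struct return_type<NAME>          { using Result = std::string; };
    template <> struct return_type<COMPUTE_UNITS> { using Result = unsigned;    };

    struct Device { std::string name; unsigned compute_units; };

    template <Param P> typename return_type<P>::Result info(const Device&);
    template <> std::string info<NAME>(const Device& d) { return d.name; }
    template <> unsigned info<COMPUTE_UNITS>(const Device& d) { return d.compute_units; }

    int main() {
      Device d{"toy", 16};
      return info<NAME>(d) == "toy" && info<COMPUTE_UNITS>(d) == 16 ? 0 : 1;
    }
]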
-public: - multiple_of_specifier(node* value): cst_((constant*)value) {} - void add_attr(ir::function* fn, size_t pos); - void add_metadata(ir::module* mod, std::string name); - bool is_multiple_of() const { return true; } - -private: - constant* cst_; -}; - -// declaration specifier -class declaration_specifier: public node{ -public: - virtual ir::type* type(ir::module *mod) const = 0; - virtual std::vector modifiers() const = 0; -}; - -class typed_declaration_specifier: public declaration_specifier { -public: - typed_declaration_specifier(TYPE_T ty): ty_(ty){ } - ir::type* type(ir::module *mod) const; - std::vector modifiers() const; - -private: - const TYPE_T ty_; -}; - -// declaration modifier -class declaration_modifier: public declaration_specifier { -public: - declaration_modifier(node* mod, node *decl_spec) - : mod_((modifier*)mod), decl_spec_((declaration_specifier*)decl_spec) {} - ir::type* type(ir::module *mod) const; - std::vector modifiers() const; - -private: - modifier* mod_; - const declaration_specifier* decl_spec_; -}; - - -class declarator; -class parameter: public node { -public: - parameter(node *spec, node *decl) - : spec_((declaration_specifier*)spec), - decl_((declarator*)decl) { } - - ir::type* type(ir::module *mod) const; - std::vector modifiers() const; - const identifier* id() const; - -public: - const declaration_specifier *spec_; - const declarator *decl_; -}; - -/* Declarators */ -class declarator: public node{ -protected: - typedef std::vector storage_spec_vec_t; - typedef const storage_spec_vec_t& storage_spec_vec_const_ref_t; - -public: - virtual ir::type* type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const = 0; - -public: - declarator(node *lhs) - : lhs_((declarator*)lhs), ptr_(nullptr){ } - - ir::type* type(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const; - - const identifier* id() const { - return (const identifier*)lhs_; - } - - declarator *set_ptr(node *ptr){ - ptr_ = (pointer*)ptr; - return this; - } - - void set_addr_space(unsigned addr_space){ - addr_space_ = addr_space; - } - -protected: - declarator *lhs_; - pointer *ptr_; - unsigned addr_space_; -}; - -class identifier: public declarator { - ir::type* type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const; - -public: - identifier(char *&name): declarator(this), name_(name) { } - const std::string &name() const; - -private: - std::string name_; -}; - -class pointer: public declarator{ -private: - ir::type* type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const; - -public: - pointer(node *id): declarator(id) { } -}; - -class tile: public declarator{ -private: - ir::type* type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const; - -public: - tile(node *id, node *shapes) - : declarator(id), shapes_((list*)(shapes)) { } - -public: - const list* shapes_; -}; - -class function: public declarator{ -private: - ir::type* type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const; - -public: - function(node *id, node *args) - : declarator(id), args_((list*)args) { } - - void bind_parameters(ir::module *mod, ir::function *fn) const; - unsigned get_num_args() const { return args_->values().size(); } - parameter* get_arg(unsigned i) const { return args_->values().at(i); } - -public: - const list* args_; -}; - - -class initializer : public declarator{ -private: - ir::type* type_impl(ir::module * mod, ir::type *type, 
storage_spec_vec_const_ref_t storage) const; - -public: - initializer(node *decl, node *init) - : declarator((node*)((declarator*)decl)->id()), - decl_((declarator*)decl), expr_((expression*)init){ } - - void set_specifier(const declaration_specifier *spec); - ir::value* codegen(ir::module *) const; - -public: - const declaration_specifier *spec_; - declarator *decl_; - const expression *expr_; -}; - - -class type_name: public node{ -public: - type_name(node *spec, node * decl) - : spec_((declaration_specifier*)spec), decl_((declarator*)decl) { } - - ir::type *type(ir::module *mod) const; - -public: - const declaration_specifier *spec_; - const declarator *decl_; -}; - -/* Function definition */ -class function_definition: public node{ -public: - function_definition(node *spec, node *header, node *body) - : spec_((declaration_specifier*)spec), header_((function *)header), body_((compound_statement*)body) { } - - ir::value* codegen(ir::module * mod) const; - -public: - const declaration_specifier *spec_; - const function *header_; - const compound_statement *body_; -}; - -} - -} - -#endif diff --git a/include/triton/lang/wgtcc/encoding.h b/include/triton/lang/encoding.h similarity index 100% rename from include/triton/lang/wgtcc/encoding.h rename to include/triton/lang/encoding.h diff --git a/include/triton/lang/error.h b/include/triton/lang/error.h index 70e70a387..fdae7e060 100644 --- a/include/triton/lang/error.h +++ b/include/triton/lang/error.h @@ -1,20 +1,15 @@ -#ifndef TRITON_INCLUDE_LANG_ERROR_H -#define TRITON_INCLUDE_LANG_ERROR_H - -#include "parser.hpp" +#ifndef _WGTCC_ERROR_H_ +#define _WGTCC_ERROR_H_ -namespace triton{ -namespace lang{ +struct SourceLocation; +class Token; +class Expr; -void update_location(const char *t); -void print_error(const char *error); -char return_impl(char t, const char * yytext); -yytokentype return_impl(yytokentype t, const char * yytext); -void return_void(const char * yytext); - -} -} +[[noreturn]] void Error(const char* format, ...); +[[noreturn]] void Error(const SourceLocation& loc, const char* format, ...); +[[noreturn]] void Error(const Token* tok, const char* format, ...); +[[noreturn]] void Error(const Expr* expr, const char* format, ...); #endif diff --git a/include/triton/lang/wgtcc/evaluator.h b/include/triton/lang/evaluator.h similarity index 100% rename from include/triton/lang/wgtcc/evaluator.h rename to include/triton/lang/evaluator.h diff --git a/include/triton/lang/expression.h b/include/triton/lang/expression.h deleted file mode 100644 index a3574f15d..000000000 --- a/include/triton/lang/expression.h +++ /dev/null @@ -1,357 +0,0 @@ -#ifndef TDL_INCLUDE_LANG_EXPRESSION_H -#define TDL_INCLUDE_LANG_EXPRESSION_H - -#include "lang.h" -#include -#include - - -namespace triton{ - - -namespace ir{ - class function; - class value; - class type; - class builder; - class module; -} - -namespace lang{ - - -enum slice_enum_t{ - ALL, - NEWAXIS -}; - -class slice: public node{ -public: - slice(slice_enum_t type) - : type_(type){} - - slice_enum_t type() const{ - return type_; - } - -public: - const slice_enum_t type_; -}; - - -class named_expression; - -class expression: public node{ -public: - virtual ir::value* codegen(ir::module *) const = 0; - named_expression *lvalue() const { return lvalue_; } - -protected: - named_expression *lvalue_; -}; - -class postfix_expression: public expression{ - -}; - -class builtin_expression: public node{ - -}; - -class typed_declaration_specifier; -class alloc_const_expression: public builtin_expression{ 
-public: - alloc_const_expression(node *spec, node *size): spec_((typed_declaration_specifier*)spec), size_((constant*)size) { } - ir::value* codegen(ir::module *mod) const; - -private: - const typed_declaration_specifier* spec_; - const constant* size_; -}; - -class get_program_id_expression: public builtin_expression{ -public: - get_program_id_expression(node *axis): axis_((constant*)axis) { } - ir::value* codegen(ir::module *) const; - -private: - const constant* axis_; -}; - -class get_num_program_expression: public builtin_expression{ -public: - get_num_program_expression(node *axis): axis_((constant*)axis) { } - ir::value* codegen(ir::module *mod) const; - -private: - const constant* axis_; -}; - -class atomic_cas_expression: public builtin_expression{ -public: - atomic_cas_expression(node *ptr, node *cmp, node *val): ptr_(ptr), cmp_(cmp), val_(val) { } - ir::value* codegen(ir::module *) const; - -private: - const node *ptr_; - const node *cmp_; - const node *val_; -}; - -class atomic_exch_expression: public builtin_expression{ -public: - atomic_exch_expression(node *ptr, node *val): ptr_(ptr), val_(val) { } - ir::value* codegen(ir::module *) const; - -private: - const node *ptr_; - const node *val_; -}; - - -class atomic_add_expression: public builtin_expression{ -public: - atomic_add_expression(node *ptr, node *val): ptr_(ptr), val_(val) { } - ir::value* codegen(ir::module *) const; - -private: - const node *ptr_; - const node *val_; -}; - - -class matmul_expression: public builtin_expression{ -public: - matmul_expression(node* A, node *B, node *C): - A_((expression*)A), B_((expression*)B), C_((expression*)C) { } - ir::value* codegen(ir::module *) const; - -private: - const expression *A_; - const expression *B_; - const expression *C_; -}; - -class reshape_expression: public builtin_expression{ -public: - reshape_expression(node *arg, node *shapes): arg_(arg), shapes_((list*)shapes) { } - ir::value* codegen(ir::module *) const; - -private: - const node *arg_; - const list* shapes_; -}; - -class max_expression: public builtin_expression{ -public: - max_expression(node* x, node* y) - : x_((expression*)x), y_((expression*)y){ } - ir::value* codegen(ir::module *) const; - -private: - const expression *x_; - const expression *y_; -}; - -class min_expression: public builtin_expression{ -public: - min_expression(node* x, node* y) - : x_((expression*)x), y_((expression*)y){ } - ir::value* codegen(ir::module *mod) const; - -private: - const expression *x_; - const expression *y_; -}; - -class select_expression: public builtin_expression{ -public: - select_expression(node* pred, node* if_value, node* else_value) - : pred_((expression*)pred), if_value_((expression*)if_value), else_value_((expression*)else_value) { } - ir::value* codegen(ir::module *mod) const; - -private: - const expression *pred_; - const expression *if_value_; - const expression *else_value_; -}; - -class trans_expression: public builtin_expression{ -public: - trans_expression(node *arg, node *perm): arg_(arg), perm_((list*)perm) {} - ir::value* codegen(ir::module *mod) const; - -private: - node* arg_; - const list* perm_; -}; - -class sqrt_expression: public builtin_expression{ -public: - sqrt_expression(node *arg): arg_(arg) {} - ir::value* codegen(ir::module *) const; - -private: - node* arg_; -}; - -class reduce_expression: public builtin_expression{ -public: - reduce_expression(node *arg, node *axis): arg_(arg), axis_((constant*)axis) {} - ir::value* codegen(ir::module *mod) const; - -private: - node* arg_; - 
constant* axis_; -}; - -class indexing_expression: public postfix_expression{ -public: - indexing_expression(node *lhs, node *slices) - : lhs_((const expression*)lhs), slices_((const list*)slices) {} - - ir::value* codegen(ir::module *) const; - -private: - const expression* lhs_; - const list* slices_; -}; - - - -class named_expression: public expression { -public: - named_expression(node *id): id_((const identifier*)id) { lvalue_ = this; } - const identifier *id() const { return id_; } - ir::value* codegen(ir::module * mod) const; - -private: - const identifier *id_; -}; - -class binary_expression: public expression{ -private: - ir::value* llvm_op(ir::module *mod, ir::builder &bld, ir::value *lhs, ir::value *rhs, const std::string &name) const; - -public: - binary_expression(BIN_OP_T op, node *lhs, node *rhs) - : op_(op), lhs_((expression*)lhs), rhs_((expression*)rhs) { - } - ir::value* codegen(ir::module *) const; - -private: - const BIN_OP_T op_; - const expression *lhs_; - const expression *rhs_; -}; - - -class constant: public expression{ -public: - constant(int value): value_(value) { } - ir::value* codegen(ir::module *mod) const; - int value() const; - -private: - const int value_; -}; - -class constant_range: public expression { -public: - constant_range(node *first, node *last) - : first_((constant*)first), last_((constant*)last) { } - - ir::value* codegen(ir::module *mod) const; - -private: - constant *first_; - constant *last_; -}; - -class string_literal: public expression{ -public: - string_literal(char *&value): value_(value) { } - ir::value* codegen(ir::module *mod) const; - -public: - std::string value_; -}; - -class unary_expression: public expression{ -private: - ir::value *llvm_op(ir::builder &builder, ir::value *arg, const std::string &name) const; - -public: - unary_expression(UNARY_OP_T op, node *arg) - : op_(op), - arg_((expression*)arg) { - if(op == DEREF) - this->lvalue_ = arg_->lvalue(); - } - - UNARY_OP_T get_op() const { return op_; } - ir::value* codegen(ir::module *mod) const; - -private: - const UNARY_OP_T op_; - const expression *arg_; -}; - -class type_name; -class cast_expression: public expression{ -private: - ir::value *llvm_op(ir::builder &builder, ir::type *T, ir::value *arg, const std::string &name) const; - -public: - cast_expression(node *T, node *arg): - T_((type_name*)T), - arg_((expression*)arg) { } - - ir::value* codegen(ir::module *mod) const; - -public: - const type_name *T_; - const expression *arg_; -}; - -class conditional_expression: public expression{ -private: - ir::value *llvm_op(ir::builder &builder, - ir::value *cond, ir::value *true_value, ir::value *false_value, - const std::string &name) const; - -public: - conditional_expression(node *cond, node *true_value, node *false_value) - : cond_((expression*)cond), - true_value_((expression*)true_value), - false_value_((expression*)false_value) { } - - ir::value* codegen(ir::module *mod) const; - -public: - const expression *cond_; - const expression *true_value_; - const expression *false_value_; -}; - -class assignment_expression: public expression{ -public: - assignment_expression(node *lvalue, ASSIGN_OP_T op, node *rvalue) - : lvalue_((named_expression*)lvalue), op_(op), rvalue_((expression*)rvalue) { } - - ir::value* codegen(ir::module *mod) const; - const expression *lvalue() const { return lvalue_; } - const expression *rvalue() const { return rvalue_; } - -public: - const expression *lvalue_; - ASSIGN_OP_T op_; - const expression *rvalue_; -}; - - -} - -} - -#endif diff --git 
a/include/triton/lang/lang.h b/include/triton/lang/lang.h
deleted file mode 100644
index ba1d1a2d8..000000000
--- a/include/triton/lang/lang.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef TRITON_INCLUDE_LANG_LANG_H
-#define TRITON_INCLUDE_LANG_LANG_H
-
-#include "parser.hpp"
-#include "declaration.h"
-#include "error.h"
-#include "expression.h"
-#include "node.h"
-#include "ops.h"
-#include "module.h"
-#include "statement.h"
-
-#endif
diff --git a/include/triton/lang/wgtcc/mem_pool.h b/include/triton/lang/mem_pool.h
similarity index 100%
rename from include/triton/lang/wgtcc/mem_pool.h
rename to include/triton/lang/mem_pool.h
diff --git a/include/triton/lang/module.h b/include/triton/lang/module.h
deleted file mode 100644
index 7ac6c2960..000000000
--- a/include/triton/lang/module.h
+++ /dev/null
@@ -1,30 +0,0 @@
-#ifndef TRITON_INCLUDE_LANG_MODULE_H
-#define TRITON_INCLUDE_LANG_MODULE_H
-
-#include "node.h"
-
-namespace triton{
-namespace lang{
-
-/* Translation Unit */
-class translation_unit: public node{
-public:
-  translation_unit(node *item)
-    : decls_(item) { }
-
-  translation_unit *add(node *item) {
-    decls_.append(item);
-    return this;
-  }
-
-  ir::value* codegen(ir::module * mod) const;
-
-private:
-  list<node*> decls_;
-};
-
-}
-
-}
-
-#endif
diff --git a/include/triton/lang/node.h b/include/triton/lang/node.h
deleted file mode 100644
index c9bd0b011..000000000
--- a/include/triton/lang/node.h
+++ /dev/null
@@ -1,72 +0,0 @@
-#ifndef TRITON_INCLUDE_LANG_NODE_H
-#define TRITON_INCLUDE_LANG_NODE_H
-
-#include <vector>
-#include "ops.h"
-
-namespace triton{
-
-
-namespace ir{
-  class function;
-  class value;
-  class type;
-  class builder;
-  class module;
-}
-
-namespace lang{
-
-class expression;
-class pointer;
-class identifier;
-class constant;
-class compound_statement;
-class initializer;
-class modifier;
-class function;
-
-// Node
-class node {
-protected:
-  static ir::value* explicit_cast(ir::builder &builder, ir::value *src, ir::type *dst_ty);
-  static void implicit_broadcast(ir::module *mod, ir::type *dst_ty, ir::value *&src);
-  static void implicit_broadcast(ir::module *mod, ir::value *&lhs, ir::value *&rhs);
-  static void implicit_cast(ir::builder &builder, ir::value *&lhs, ir::value *&rhs,
-                            bool &is_float, bool &is_ptr, bool &is_int, bool &is_signed);
-public:
-  virtual ir::value* codegen(ir::module *) const { return nullptr; }
-};
-
-class block_item: public node{
-};
-
-template <class T>
-class list: public node {
-public:
-  list(const T& x): values_(1, x) {}
-
-  node* append(const T& x){
-    values_.push_back(x);
-    return this;
-  }
-
-  ir::value* codegen(ir::module * mod) const{
-    for(T x: values_){
-      x->codegen(mod);
-    }
-    return nullptr;
-  }
-
-  const std::vector<T> &values() const
-  { return values_; }
-
-private:
-  std::vector<T> values_;
-};
-
-}
-
-}
-
-#endif
diff --git a/include/triton/lang/ops.h b/include/triton/lang/ops.h
deleted file mode 100644
index 38fc200bf..000000000
--- a/include/triton/lang/ops.h
+++ /dev/null
@@ -1,54 +0,0 @@
-#ifndef TRITON_INCLUDE_LANG_OPS_H
-#define TRITON_INCLUDE_LANG_OPS_H
-
-namespace triton{
-namespace lang{
-
-enum ASSIGN_OP_T{
-  ASSIGN,
-  INPLACE_MUL, INPLACE_DIV, INPLACE_MOD,
-  INPLACE_ADD, INPLACE_SUB,
-  INPLACE_LSHIFT, INPLACE_RSHIFT,
-  INPLACE_AND, INPLACE_XOR,
-  INPLACE_OR
-};
-
-enum BIN_OP_T{
-  MUL, DIV, MOD,
-  ADD, SUB,
-  LEFT_SHIFT, RIGHT_SHIFT,
-  LT, GT,
-  LE, GE,
-  EQ, NE,
-  AND, XOR, OR,
-  LAND, LOR
-};
-
-enum UNARY_OP_T{
-  INC, DEC,
-  PLUS, MINUS,
-  ADDR, DEREF,
-  COMPL, NOT
-};
-
-enum TYPE_T{
-  VOID_T,
-  UINT1_T, UINT8_T, UINT16_T, UINT32_T,
UINT64_T, - INT1_T, INT8_T, INT16_T, INT32_T, INT64_T, - FLOAT16_T, FLOAT32_T, FLOAT64_T -}; - -enum STORAGE_SPEC_T{ - CONST_T, - TUNABLE_T, - KERNEL_T, - RESTRICT_T, - READONLY_T, - CONSTANT_SPACE_T, - WRITEONLY_T -}; - -} -} - -#endif diff --git a/include/triton/lang/wgtcc/parser.h b/include/triton/lang/parser.h similarity index 100% rename from include/triton/lang/wgtcc/parser.h rename to include/triton/lang/parser.h diff --git a/include/triton/lang/parser.y b/include/triton/lang/parser.y deleted file mode 100644 index e3c22c132..000000000 --- a/include/triton/lang/parser.y +++ /dev/null @@ -1,424 +0,0 @@ -%define parse.error verbose - -%{ -namespace triton{ -namespace lang{ -class node; -} -} -using namespace triton::lang; -#define YYSTYPE node* -#include "../include/triton/lang/lang.h" - -extern char* yytext; -void yyerror(const char *s); -int yylex(void); - -translation_unit *ast_root; - -/* wrap token in AST node */ -struct token: public node{ - token(ASSIGN_OP_T value): assign_op(value){ } - token(BIN_OP_T value): bin_op(value){ } - token(UNARY_OP_T value): unary_op(value){ } - token(TYPE_T value): type(value){ } - token(STORAGE_SPEC_T value): storage_spec(value){ } - - union { - ASSIGN_OP_T assign_op; - BIN_OP_T bin_op; - UNARY_OP_T unary_op; - TYPE_T type; - STORAGE_SPEC_T storage_spec; - }; -}; - -/* shortcut to append in list */ -template -node* append_ptr_list(node *result, node *in){ - return static_cast*>(result)->append((T*)in); -} - -/* shortcut to access token value */ -ASSIGN_OP_T get_assign_op(node *op) { return ((token*)op)->assign_op; } -UNARY_OP_T get_unary_op(node *op) { return ((token*)op)->unary_op; } -TYPE_T get_type_spec(node *op) { return ((token*)op)->type; } -STORAGE_SPEC_T get_storage_spec(node *op) { return ((token*)op)->storage_spec;} -%} - -%token IDENTIFIER CONSTANT STRING_LITERAL -%token TUNABLE KERNEL RESTRICT READONLY WRITEONLY CONST CONSTANT_SPACE ALIGN MULTIPLE_OF -%token PTR_OP INC_OP DEC_OP LEFT_OP RIGHT_OP LE_OP GE_OP EQ_OP NE_OP -%token AND_OP OR_OP MUL_ASSIGN DIV_ASSIGN MOD_ASSIGN ADD_ASSIGN -%token SUB_ASSIGN LEFT_ASSIGN RIGHT_ASSIGN AND_ASSIGN -%token XOR_ASSIGN OR_ASSIGN TYPE_NAME -%token VOID UINT1 UINT8 UINT16 UINT32 UINT64 INT1 INT8 INT16 INT32 INT64 FP16 FP32 FP64 -%token IF ELSE FOR CONTINUE WHILE -%token NEWAXIS ELLIPSIS AT -%token GET_NUM_PROGRAM GET_PROGRAM_ID DOT SQRT REDUCE_SUM TRANS MAX MIN SELECT ATOMIC_CAS ATOMIC_EXCH ATOMIC_ADD ALLOC_CONST RESHAPE - -%start translation_unit -%% - - -/* -------------------------- */ -/* Types */ -/* -------------------------- */ - -type_specifier - : VOID { $$ = new token(VOID_T); } - | UINT1 { $$ = new token(UINT1_T); } - | UINT8 { $$ = new token(UINT8_T); } - | UINT16 { $$ = new token(UINT16_T); } - | UINT32 { $$ = new token(UINT32_T); } - | UINT64 { $$ = new token(UINT64_T); } - | INT1 { $$ = new token(INT1_T);} - | INT8 { $$ = new token(INT8_T); } - | INT16 { $$ = new token(INT16_T); } - | INT32 { $$ = new token(INT32_T); } - | INT64 { $$ = new token(INT64_T); } - | FP16 { $$ = new token(FLOAT16_T); } - | FP32 { $$ = new token(FLOAT32_T); } - | FP64 { $$ = new token(FLOAT64_T); } - ; - -pointer - : '*' { $$ = new pointer(nullptr); } - | '*' pointer { $$ = new pointer($1); } - -abstract_declarator - : pointer { $$ = $1; } - | pointer direct_abstract_declarator { $$ = ((declarator*)$2)->set_ptr($1); } - | direct_abstract_declarator { $$ = $1; } - ; - -direct_abstract_declarator - : '[' constant_expression_list ']' { $$ = new tile(nullptr, $2); } - -type_name - : declaration_specifiers { $$ = 
new type_name($1, nullptr); } - | declaration_specifiers abstract_declarator { $$ = new type_name($1, $2); } - ; - -/* -------------------------- */ -/* Expressions */ -/* -------------------------- */ - -/* Constants */ -constant - : CONSTANT { $$ = new constant(atoi(yytext)); } - ; - -constant_list - : constant { $$ = new list((constant*)$1); } - | constant_list ',' constant { $$ = append_ptr_list($1, $3); } - ; - -identifier - : IDENTIFIER { $$ = new identifier(yytext); } - ; - -/* Built-in */ -builtin_expression - : GET_PROGRAM_ID '(' constant ')' { $$ = new get_program_id_expression($3); } - | GET_NUM_PROGRAM '(' constant ')' { $$ = new get_num_program_expression($3); } - | DOT '(' expression ',' expression ',' expression ')' { $$ = new matmul_expression($3, $5, $7); } - | SQRT '(' expression ')' { $$ = new sqrt_expression($3); } - | ALLOC_CONST type_specifier '[' constant ']' { $$ = new alloc_const_expression(new typed_declaration_specifier(get_type_spec($2)), $4); } - | TRANS '(' expression ',' constant_expression_list ')' { $$ = new trans_expression($3, $5); } - | TRANS '(' expression ')' { $$ = new trans_expression($3, nullptr); } - | REDUCE_SUM '(' expression ',' constant ')' { $$ = new reduce_expression($3, $5);} - | MAX '(' expression ',' expression ')' { $$ = new max_expression($3, $5); } - | MIN '(' expression ',' expression ')' { $$ = new min_expression($3, $5); } - | SELECT '(' expression ',' expression ',' expression ')' { $$ = new select_expression($3, $5, $7); } - | ATOMIC_CAS '(' expression ',' expression ',' expression ')' { $$ = new atomic_cas_expression($3, $5, $7); } - | ATOMIC_EXCH '(' expression ',' expression ')' { $$ = new atomic_exch_expression($3, $5); } - | ATOMIC_ADD '(' expression ',' expression ')' { $$ = new atomic_add_expression($3, $5); } - | RESHAPE '(' expression ',' constant_expression_list ')' { $$ = new reshape_expression($3, $5); } - ; - -/* Primary */ -primary_expression - : identifier { $$ = new named_expression($1); } - | constant { $$ = $1; } - | primary_expression ELLIPSIS primary_expression { $$ = new constant_range($1, $3); } - | builtin_expression { $$ = $1; } - | STRING_LITERAL { $$ = new string_literal(yytext); } - | '(' expression ')' { $$ = $2; } - ; - -/* Postfix */ -slice - : ':' { $$ = new slice(triton::lang::ALL); } - | NEWAXIS { $$ = new slice(triton::lang::NEWAXIS); } - -slice_list - : slice { $$ = new list((slice*)$1); } - | slice_list ',' slice { $$ = append_ptr_list($1, $3); } - -postfix_expression - : primary_expression { $$ = $1;} - | primary_expression '[' slice_list ']' { $$ = new indexing_expression($1, $3);} - ; - -/* Unary */ -unary_operator - : '&' { $$ = new token(ADDR); } - | '*' { $$ = new token(DEREF); } - | '+' { $$ = new token(PLUS); } - | '-' { $$ = new token(MINUS); } - | '~' { $$ = new token(COMPL); } - | '!' 
{ $$ = new token(NOT); } - ; - -unary_expression - : postfix_expression { $$ = $1; } - | INC_OP unary_expression { $$ = new unary_expression(INC, $2); } - | DEC_OP unary_expression { $$ = new unary_expression(DEC, $2); } - | unary_operator cast_expression { $$ = new unary_expression(get_unary_op($1), $2); } - ; - -cast_expression - : unary_expression { $$ = $1; } - | '(' type_name ')' cast_expression { $$ = new cast_expression($2, $4); } - ; - -multiplicative_expression - : cast_expression { $$ = $1; } - | multiplicative_expression '*' cast_expression { $$ = new binary_expression(MUL, $1, $3); } - | multiplicative_expression '/' cast_expression { $$ = new binary_expression(DIV, $1, $3); } - | multiplicative_expression '%' cast_expression { $$ = new binary_expression(MOD, $1, $3); } - ; - -additive_expression - : multiplicative_expression { $$ = $1; } - | additive_expression '+' multiplicative_expression { $$ = new binary_expression(ADD, $1, $3); } - | additive_expression '-' multiplicative_expression { $$ = new binary_expression(SUB, $1, $3); } - ; - -shift_expression - : additive_expression { $$ = $1; } - | shift_expression LEFT_OP additive_expression { $$ = new binary_expression(LEFT_SHIFT, $1, $3); } - | shift_expression RIGHT_OP additive_expression { $$ = new binary_expression(RIGHT_SHIFT, $1, $3); } - ; - -/* Comparison */ -relational_expression - : shift_expression { $$ = $1; } - | relational_expression '<' shift_expression { $$ = new binary_expression(LT, $1, $3); } - | relational_expression '>' shift_expression { $$ = new binary_expression(GT, $1, $3); } - | relational_expression LE_OP shift_expression { $$ = new binary_expression(LE, $1, $3); } - | relational_expression GE_OP shift_expression { $$ = new binary_expression(GE, $1, $3); } - ; - -equality_expression - : relational_expression { $$ = $1; } - | equality_expression EQ_OP relational_expression { $$ = new binary_expression(EQ, $1, $3); } - | equality_expression NE_OP relational_expression { $$ = new binary_expression(NE, $1, $3); } - ; - -/* Binary */ -and_expression - : equality_expression { $$ = $1; } - | and_expression '&' equality_expression { $$ = new binary_expression(AND, $1, $3); } - ; - -exclusive_or_expression - : and_expression { $$ = $1; } - | exclusive_or_expression '^' and_expression { $$ = new binary_expression(XOR, $1, $3); } - ; - -inclusive_or_expression - : exclusive_or_expression { $$ = $1; } - | inclusive_or_expression '|' exclusive_or_expression { $$ = new binary_expression(OR, $1, $3); } - ; - -/* Logical */ -logical_and_expression - : inclusive_or_expression { $$ = $1; } - | logical_and_expression AND_OP inclusive_or_expression { $$ = new binary_expression(LAND, $1, $3); } - ; - -logical_or_expression - : logical_and_expression { $$ = $1; } - | logical_or_expression OR_OP logical_and_expression { $$ = new binary_expression(LOR, $1, $3); } - ; - -/* Conditional */ -conditional_expression - : logical_or_expression { $$ = $1; } - | logical_or_expression '?' 
conditional_expression ':' conditional_expression { $$ = new conditional_expression($1, $3, $5); } - ; - -/* Assignment */ -assignment_operator - : '=' { $$ = new token(ASSIGN); } - | MUL_ASSIGN { $$ = new token(INPLACE_MUL); } - | DIV_ASSIGN { $$ = new token(INPLACE_DIV); } - | MOD_ASSIGN { $$ = new token(INPLACE_MOD); } - | ADD_ASSIGN { $$ = new token(INPLACE_ADD); } - | SUB_ASSIGN { $$ = new token(INPLACE_SUB); } - | LEFT_ASSIGN { $$ = new token(INPLACE_LSHIFT); } - | RIGHT_ASSIGN { $$ = new token(INPLACE_RSHIFT); } - | AND_ASSIGN { $$ = new token(INPLACE_AND); } - | XOR_ASSIGN { $$ = new token(INPLACE_XOR); } - | OR_ASSIGN { $$ = new token(INPLACE_OR); } - ; - -assignment_expression - : conditional_expression { $$ = $1; } - | unary_expression assignment_operator assignment_expression { $$ = new assignment_expression($1, get_assign_op($2), $3); } - ; - -/* Expression */ -expression - : assignment_expression { $$ = $1; } - ; - -constant_expression_list - : expression { $$ = new list((expression*)$1); } - | constant_expression_list ',' expression { $$ = append_ptr_list($1, $3); } - -/* Initialization */ -initialization_expression - : assignment_expression { $$ = $1; } - | '{' constant_list '}' { $$ = $2; } - ; - - -/* -------------------------- */ -/* Statements */ -/* -------------------------- */ - -statement - : compound_statement { $$ = $1; } - | expression_statement { $$ = $1; } - | selection_statement { $$ = $1; } - | iteration_statement { $$ = $1; } - | jump_statement { $$ = $1; } - ; - -compound_statement - : '{' '}' { $$ = new compound_statement(nullptr); } - | '{' block_item_list '}' { $$ = new compound_statement($2); } - -block_item_list - : block_item { $$ = new list((block_item*)$1); } - | block_item_list block_item { $$ = append_ptr_list($1, $2); } - -block_item - : declaration { $$ = $1; } - | statement { $$ = $1; } - -expression_statement - : ';' { $$ = new no_op(); } - | expression ';' { $$ = new expression_statement($1); } - | AT primary_expression expression ';' { $$ = new expression_statement($3, $2); } - ; - -selection_statement - : IF '(' expression ')' statement { $$ = new selection_statement($3, $5); } - | IF '(' expression ')' statement ELSE statement { $$ = new selection_statement($3, $5, $7); } - ; - -iteration_statement - : FOR '(' expression_statement expression_statement expression ')' statement { $$ = new iteration_statement($3, $4, $5, $7); } - | FOR '(' declaration expression_statement ')' statement { $$ = new iteration_statement($3, $4, nullptr, $6); } - | FOR '(' declaration expression_statement expression ')' statement { $$ = new iteration_statement($3, $4, $5, $7); } - | WHILE '(' expression ')' statement { $$ = new while_statement($3, $5); }; - -jump_statement - : CONTINUE ';' { $$ = new continue_statement(); } -; - -/* -------------------------- */ -/* Declarator */ -/* -------------------------- */ - - -direct_declarator - : identifier { $$ = $1; } - | identifier '[' constant_expression_list ']' { $$ = new tile($1, $3); } - | identifier '(' parameter_list ')' { $$ = new function($1, $3); } - | identifier '(' ')' { $$ = new function($1, nullptr); } - ; - - -parameter_list - : parameter_declaration { $$ = new list((parameter*)$1); } - | parameter_list ',' parameter_declaration { $$ = append_ptr_list($1, $3); } - ; - -parameter_declaration - : declaration_specifiers declarator { $$ = new parameter($1, $2); } - | declaration_specifiers abstract_declarator { $$ = new parameter($1, $2); } - ; - - -declaration_specifiers - : type_specifier { $$ = new 
typed_declaration_specifier(get_type_spec($1)); } - | storage_class_specifier declaration_specifiers { $$ = new declaration_modifier($1, $2); } - | alignment_class_specifier declaration_specifiers { $$ = new declaration_modifier($1, $2); } - | multiple_of_class_specifier declaration_specifiers { $$ = new declaration_modifier($1, $2); } - ; - -init_declarator_list - : init_declarator { $$ = new list((initializer*)$1); } - | init_declarator_list ',' init_declarator { $$ = append_ptr_list($1, $3); } - ; - -declaration - : declaration_specifiers ';' { $$ = new declaration($1, nullptr); } - | declaration_specifiers init_declarator_list ';' { $$ = new declaration($1, $2); } - ; - -declarator - : pointer direct_declarator { $$ = ((declarator*)$2)->set_ptr($1); } - | direct_declarator { $$ = $1; } - ; - -init_declarator - : declarator { $$ = new initializer($1, nullptr); } - | declarator '=' initialization_expression { $$ = new initializer($1, $3); } - ; - -storage_class_specifier - : CONST { $$ = new storage_specifier(CONST_T); } - | TUNABLE { $$ = new storage_specifier(TUNABLE_T); } - | KERNEL { $$ = new storage_specifier(KERNEL_T); } - | RESTRICT { $$ = new storage_specifier(RESTRICT_T); } - | READONLY { $$ = new storage_specifier(READONLY_T); } - | WRITEONLY { $$ = new storage_specifier(WRITEONLY_T); } - | CONSTANT_SPACE { $$ = new storage_specifier(CONSTANT_SPACE_T); } -; - -alignment_class_specifier - : ALIGN '(' constant ')' { $$ = new alignment_specifier($3); } - -multiple_of_class_specifier - : MULTIPLE_OF '(' constant ')' { $$ = new multiple_of_specifier($3); } - - -external_declaration - : function_definition { $$ = $1; } - | declaration { $$ = $1; } - ; - -function_definition - : declaration_specifiers declarator compound_statement { $$ = new function_definition($1, $2, $3); } - ; - -/* -------------------------- */ -/* Translation Unit */ -/* -------------------------- */ - -translation_unit - : external_declaration { ast_root = new translation_unit($1); $$ = ast_root; } - | translation_unit external_declaration { $$ = ((translation_unit*)($1))->add($2); } - ; - - -%% -void yyerror (const char *s){ - print_error(s); -} diff --git a/include/triton/lang/wgtcc/scanner.h b/include/triton/lang/scanner.h similarity index 100% rename from include/triton/lang/wgtcc/scanner.h rename to include/triton/lang/scanner.h diff --git a/include/triton/lang/scanner.l b/include/triton/lang/scanner.l deleted file mode 100644 index 6062a51ad..000000000 --- a/include/triton/lang/scanner.l +++ /dev/null @@ -1,119 +0,0 @@ -D [0-9] -L [a-zA-Z_] -H [a-fA-F0-9] -E [Ee][+-]?{D}+ -FS (f|F|l|L) -IS (u|U|l|L)* - -%{ -#include -#include "parser.hpp" -#include "../include/triton/lang/lang.h" -using triton::lang::return_impl; -using triton::lang::return_void; -%} - -%% -"__constant__" { return return_impl(CONSTANT_SPACE, yytext); } -"const" { return return_impl(CONST, yytext); } -"tunable" { return return_impl(TUNABLE, yytext); } -"kernel" { return return_impl(KERNEL, yytext); } -"restrict" { return return_impl(RESTRICT, yytext); } -"read_only" { return return_impl(READONLY, yytext); } -"write_only" { return return_impl(WRITEONLY, yytext); } -"align" { return return_impl(ALIGN, yytext); } -"multiple_of" { return return_impl(MULTIPLE_OF, yytext); } -"@" { return return_impl(AT, yytext); } -"newaxis" { return return_impl(NEWAXIS, yytext); } -"if" { return return_impl(IF, yytext); } -"else" { return return_impl(ELSE, yytext); } -"for" { return return_impl(FOR, yytext); } -"while" { return return_impl(WHILE, yytext); } 
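(For orientation, the grammar deleted above and the keyword rules in this scanner define the original Triton-C dialect. A minimal kernel that this lexer/parser pair would accept might look as follows; this is a hand-written illustration, not code from the patch, and the names add, TM, a, b, c, N are invented:

    tunable int TM = {16, 32, 64};

    kernel void add(float *a, float *b, float *c, int N){
      int rx[TM] = get_program_id(0) * TM + (0 ... TM);
      float va[TM] = *(a + rx);
      float vb[TM] = *(b + rx);
      @(rx < N) *(c + rx) = va + vb;
    }

Each keyword maps to one of the scanner rules in this file; the braced initializer exercises the tunable/metaparameter path, `0 ... TM` uses the ELLIPSIS range production, and the last line uses the predicated-statement production `AT primary_expression expression ';'`.)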
-"void" { return return_impl(VOID, yytext); } -"uchar" { return return_impl(UINT8, yytext); } -"ushort" { return return_impl(UINT16, yytext); } -"uint" { return return_impl(UINT32, yytext); } -"ulong" { return return_impl(UINT64, yytext); } -"bool" { return return_impl(INT1, yytext); } -"char" { return return_impl(INT8, yytext); } -"short" { return return_impl(INT16, yytext); } -"int" { return return_impl(INT32, yytext); } -"long" { return return_impl(INT64, yytext); } -"half" { return return_impl(FP16, yytext); } -"float" { return return_impl(FP32, yytext); } -"double" { return return_impl(FP64, yytext); } -"..." { return return_impl(ELLIPSIS, yytext); } -"get_program_id" { return return_impl(GET_PROGRAM_ID, yytext); } -"get_num_program" { return return_impl(GET_NUM_PROGRAM, yytext); } -"__atomic_cas" { return return_impl(ATOMIC_CAS, yytext); } -"__atomic_exch" { return return_impl(ATOMIC_EXCH, yytext); } -"__atomic_add" { return return_impl(ATOMIC_ADD, yytext); } -"__sum" { return return_impl(REDUCE_SUM, yytext); } -"__reshape" { return return_impl(RESHAPE, yytext); } -"sqrt" { return return_impl(SQRT, yytext); } -"dot" { return return_impl(DOT, yytext); } -"max" { return return_impl(MAX, yytext); } -"min" { return return_impl(MIN, yytext); } -"select" { return return_impl(SELECT, yytext); } -"trans" { return return_impl(TRANS, yytext); } -"continue" { return return_impl(CONTINUE, yytext); } -"alloc_const" { return return_impl(ALLOC_CONST, yytext); } -{L}({L}|{D})* { return return_impl(IDENTIFIER, yytext); } -0[xX]{H}+{IS}? { return return_impl(CONSTANT, yytext); } -0{D}+{IS}? { return return_impl(CONSTANT, yytext); } -{D}+{IS}? { return return_impl(CONSTANT, yytext); } -L?'(\\.|[^\\'])+' { return return_impl(CONSTANT, yytext); } -{D}+{E}{FS}? { return return_impl(CONSTANT, yytext); } -L?\"(\\.|[^\\"])*\" { return return_impl(STRING_LITERAL, yytext); } -">>=" { return return_impl(RIGHT_ASSIGN, yytext); } -"<<=" { return return_impl(LEFT_ASSIGN, yytext); } -"+=" { return return_impl(ADD_ASSIGN, yytext); } -"-=" { return return_impl(SUB_ASSIGN, yytext); } -"*=" { return return_impl(MUL_ASSIGN, yytext); } -"/=" { return return_impl(DIV_ASSIGN, yytext); } -"%=" { return return_impl(MOD_ASSIGN, yytext); } -"&=" { return return_impl(AND_ASSIGN, yytext); } -"^=" { return return_impl(XOR_ASSIGN, yytext); } -"|=" { return return_impl(OR_ASSIGN, yytext); } -">>" { return return_impl(RIGHT_OP, yytext); } -"<<" { return return_impl(LEFT_OP, yytext); } -"++" { return return_impl(INC_OP, yytext); } -"--" { return return_impl(DEC_OP, yytext); } -"->" { return return_impl(PTR_OP, yytext); } -"&&" { return return_impl(AND_OP, yytext); } -"||" { return return_impl(OR_OP, yytext); } -"<=" { return return_impl(LE_OP, yytext); } -">=" { return return_impl(GE_OP, yytext); } -"==" { return return_impl(EQ_OP, yytext); } -"!=" { return return_impl(NE_OP, yytext); } -";" { return return_impl(';', yytext); } -("{"|"<%") { return return_impl('{', yytext); } -("}"|"%>") { return return_impl('}', yytext); } -"," { return return_impl(',', yytext); } -":" { return return_impl(':', yytext); } -"=" { return return_impl('=', yytext); } -"(" { return return_impl('(', yytext); } -")" { return return_impl(')', yytext); } -("["|"<:") { return return_impl('[', yytext); } -("]"|":>") { return return_impl(']', yytext); } -"." { return return_impl('.', yytext); } -"&" { return return_impl('&', yytext); } -"!" 
{ return return_impl('!', yytext); }
-"~" { return return_impl('~', yytext); }
-"-" { return return_impl('-', yytext); }
-"+" { return return_impl('+', yytext); }
-"*" { return return_impl('*', yytext); }
-"/" { return return_impl('/', yytext); }
-"%" { return return_impl('%', yytext); }
-"<" { return return_impl('<', yytext); }
-">" { return return_impl('>', yytext); }
-"^" { return return_impl('^', yytext); }
-"|" { return return_impl('|', yytext); }
-"?" { return return_impl('?', yytext); }
-[ \t\v\n\f] { return_void(yytext);}
-. { /* ignore bad characters */ }
-
-%%
-
-int yywrap()
-{ return(1); }
diff --git a/include/triton/lang/wgtcc/scope.h b/include/triton/lang/scope.h
similarity index 100%
rename from include/triton/lang/wgtcc/scope.h
rename to include/triton/lang/scope.h
diff --git a/include/triton/lang/statement.h b/include/triton/lang/statement.h
deleted file mode 100644
index 42b4140dc..000000000
--- a/include/triton/lang/statement.h
+++ /dev/null
@@ -1,115 +0,0 @@
-#ifndef TRITON_INCLUDE_LANG_STATEMENT_H
-#define TRITON_INCLUDE_LANG_STATEMENT_H
-
-#include "expression.h"
-
-namespace triton{
-
-
-namespace ir{
-  class function;
-  class value;
-  class type;
-  class builder;
-  class module;
-}
-
-namespace lang{
-
-class declaration;
-
-class statement: public block_item{
-};
-
-// Expression
-class expression_statement: public statement{
-public:
-  expression_statement(node *expr, node *mask = nullptr)
-    : expr_((expression*)expr), pred_((expression*)mask){ }
-
-  ir::value* codegen(ir::module * mod) const;
-
-private:
-  expression *expr_;
-  expression *pred_;
-};
-
-// Compound
-class compound_statement: public statement{
-  typedef list<declaration*>* declarations_t;
-  typedef list<statement*>* statements_t;
-
-public:
-  compound_statement(node* items)
-    : items_((list<block_item*>*)items){}
-
-  ir::value* codegen(ir::module * mod) const;
-
-private:
-  list<block_item*>* items_;
-};
-
-// Selection
-class selection_statement: public statement{
-public:
-  selection_statement(node *cond, node *if_value, node *else_value = nullptr)
-    : cond_(cond), then_value_(if_value), else_value_(else_value) { }
-
-  ir::value* codegen(ir::module *mod) const;
-
-public:
-  const node *cond_;
-  const node *then_value_;
-  const node *else_value_;
-};
-
-// Iteration
-class iteration_statement: public statement{
-public:
-  iteration_statement(node *init, node *stop, node *exec, node *statements)
-    : init_(init), stop_(stop), exec_(exec), statements_(statements)
-  { }
-
-  ir::value* codegen(ir::module *mod) const;
-
-private:
-  const node *init_;
-  const node *stop_;
-  const node *exec_;
-  const node *statements_;
-};
-
-// While
-class while_statement: public statement{
-public:
-  while_statement(node *cond, node *statements)
-    : cond_(cond), statements_(statements)
-  { }
-
-  ir::value* codegen(ir::module *) const;
-
-private:
-  const node *cond_;
-  const node *statements_;
-};
-
-// Jump
-class jump_statement: public statement{
-public:
-  using statement::statement;
-};
-
-// Continue
-class continue_statement: public jump_statement{
-public:
-  ir::value* codegen(ir::module *mod) const;
-};
-
-// No op
-class no_op: public statement { };
-
-}
-
-}
-
-#endif
diff --git a/include/triton/lang/wgtcc/token.h b/include/triton/lang/token.h
similarity index 100%
rename from include/triton/lang/wgtcc/token.h
rename to include/triton/lang/token.h
diff --git a/include/triton/lang/wgtcc/type.h b/include/triton/lang/type.h
similarity index 100%
rename from include/triton/lang/wgtcc/type.h
rename to include/triton/lang/type.h
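(To connect the grammar with the statement classes just deleted: the `AT primary_expression expression ';'` action wraps an ordinary assignment in an expression_statement whose second argument becomes the predicate. Below is a hand-written sketch of the AST that `@(x < N) *(p + rx) = v;` would produce, built only from classes declared in this patch; the identifiers x, N, p, rx and v are hypothetical:

    // predicate (x < N) and masked store *(p + rx) = v, assembled by hand
    node *pred  = new binary_expression(LT, new named_expression(new identifier("x")),
                                            new named_expression(new identifier("N")));
    node *addr  = new binary_expression(ADD, new named_expression(new identifier("p")),
                                             new named_expression(new identifier("rx")));
    node *lhs   = new unary_expression(DEREF, addr);   // lvalue: *(p + rx)
    node *rhs   = new named_expression(new identifier("v"));
    node *store = new assignment_expression(lhs, ASSIGN, rhs);
    statement *masked = new expression_statement(store, pred);

The predicate is stored alongside the expression rather than lowered to a branch at parse time, presumably so that codegen can guard the store itself; the corresponding codegen lives in statement.cpp, which this patch deletes further below.)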
diff --git a/include/triton/lang/wgtcc/visitor.h b/include/triton/lang/visitor.h
similarity index 100%
rename from include/triton/lang/wgtcc/visitor.h
rename to include/triton/lang/visitor.h
diff --git a/include/triton/lang/wgtcc/error.h b/include/triton/lang/wgtcc/error.h
deleted file mode 100644
index fdae7e060..000000000
--- a/include/triton/lang/wgtcc/error.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef _WGTCC_ERROR_H_
-#define _WGTCC_ERROR_H_
-
-
-struct SourceLocation;
-class Token;
-class Expr;
-
-
-[[noreturn]] void Error(const char* format, ...);
-[[noreturn]] void Error(const SourceLocation& loc, const char* format, ...);
-[[noreturn]] void Error(const Token* tok, const char* format, ...);
-[[noreturn]] void Error(const Expr* expr, const char* format, ...);
-
-#endif
diff --git a/include/triton/runtime/function.h b/include/triton/runtime/function.h
index 63c91de9b..f30cdabfd 100644
--- a/include/triton/runtime/function.h
+++ b/include/triton/runtime/function.h
@@ -20,7 +20,7 @@
 #include "triton/codegen/transform/shmem/barriers.h"
 #include "triton/codegen/transform/reassociate.h"
 #include "triton/codegen/transform/vectorize.h"
-#include "triton/lang/wgtcc/parser.h"
+#include "triton/lang/parser.h"
 namespace llvm {
   class Module;
diff --git a/lib/driver/device.cpp b/lib/driver/device.cpp
index 41a9561eb..fceb2754e 100755
--- a/lib/driver/device.cpp
+++ b/lib/driver/device.cpp
@@ -25,7 +25,6 @@
 #include
 #include
 #include
-#include "triton/driver/helpers/CL/infos.hpp"
 #include "triton/driver/device.h"
 #include "triton/driver/context.h"
 #include "triton/codegen/selection/target.h"
@@ -51,11 +50,13 @@ std::unique_ptr<codegen::target> host_device::make_target() const {
 // maximum amount of shared memory per block
 size_t ocl_device::max_shared_memory() const {
-  return ocl::info<CL_DEVICE_LOCAL_MEM_SIZE>(*cl_);
+  throw std::runtime_error("not implemented");
+//  return ocl::info<CL_DEVICE_LOCAL_MEM_SIZE>(*cl_);
 }
 size_t ocl_device::max_threads_per_block() const {
-  return ocl::info<CL_DEVICE_MAX_WORK_ITEM_SIZES>(*cl_).at(0);
+  throw std::runtime_error("not implemented");
+//  return ocl::info<CL_DEVICE_MAX_WORK_ITEM_SIZES>(*cl_).at(0);
 }
 std::unique_ptr<codegen::target> ocl_device::make_target() const {
diff --git a/lib/lang/wgtcc/ast.cc b/lib/lang/ast.cc
similarity index 99%
rename from lib/lang/wgtcc/ast.cc
rename to lib/lang/ast.cc
index 47bc6d3a4..7d7e28471 100644
--- a/lib/lang/wgtcc/ast.cc
+++ b/lib/lang/ast.cc
@@ -1,9 +1,9 @@
-#include "triton/lang/wgtcc/ast.h"
-#include "triton/lang/wgtcc/error.h"
-#include "triton/lang/wgtcc/evaluator.h"
-#include "triton/lang/wgtcc/mem_pool.h"
-#include "triton/lang/wgtcc/parser.h"
-#include "triton/lang/wgtcc/token.h"
+#include "triton/lang/ast.h"
+#include "triton/lang/error.h"
+#include "triton/lang/evaluator.h"
+#include "triton/lang/mem_pool.h"
+#include "triton/lang/parser.h"
+#include "triton/lang/token.h"
 static MemPoolImp<BinaryOp> binaryOpPool;
diff --git a/lib/lang/wgtcc/code_gen.cc b/lib/lang/code_gen.cc
similarity index 99%
rename from lib/lang/wgtcc/code_gen.cc
rename to lib/lang/code_gen.cc
index d7188b2b1..cfdeee1f6 100644
--- a/lib/lang/wgtcc/code_gen.cc
+++ b/lib/lang/code_gen.cc
@@ -1,7 +1,7 @@
-#include "triton/lang/wgtcc/code_gen.h"
-#include "triton/lang/wgtcc/evaluator.h"
-#include "triton/lang/wgtcc/parser.h"
-#include "triton/lang/wgtcc/token.h"
+#include "triton/lang/code_gen.h"
+#include "triton/lang/evaluator.h"
+#include "triton/lang/parser.h"
+#include "triton/lang/token.h"
 #include "triton/ir/module.h"
 #include "triton/ir/function.h"
diff --git a/lib/lang/wgtcc/cpp.cc b/lib/lang/cpp.cc
similarity index 99%
rename from lib/lang/wgtcc/cpp.cc
rename to lib/lang/cpp.cc
index 543bf3194..308eba1e6 100644
--- a/lib/lang/wgtcc/cpp.cc
+++ b/lib/lang/cpp.cc
@@ -1,7 +1,7 @@
-#include "triton/lang/wgtcc/cpp.h"
+#include "triton/lang/cpp.h"
-#include "triton/lang/wgtcc/evaluator.h"
-#include "triton/lang/wgtcc/parser.h"
+#include "triton/lang/evaluator.h"
+#include "triton/lang/parser.h"
 #include
 #include
@@ -823,7 +823,7 @@ void Preprocessor::Init() {
   AddSearchPath("/usr/include/x86_64-linux-gnu/");
   AddSearchPath("/usr/include/linux/");
   AddSearchPath("/usr/include/");
-  AddSearchPath("/usr/local/wgtcc/include/");
+  AddSearchPath("/usr/local/include/");
   // The __FILE__ and __LINE__ macros are empty
   // They are handled separately
diff --git a/lib/lang/declaration.cpp b/lib/lang/declaration.cpp
deleted file mode 100644
index 3f706bee1..000000000
--- a/lib/lang/declaration.cpp
+++ /dev/null
@@ -1,241 +0,0 @@
-#include <algorithm>
-#include "triton/lang/statement.h"
-#include "triton/lang/declaration.h"
-#include "triton/ir/function.h"
-#include "triton/ir/module.h"
-#include "triton/ir/basic_block.h"
-#include "triton/ir/builder.h"
-#include "triton/ir/type.h"
-#include "triton/ir/metadata.h"
-
-
-namespace triton{
-
-namespace lang{
-
-/* Declaration specifier */
-ir::type* typed_declaration_specifier::type(ir::module *mod) const {
-  ir::context &ctx = mod->get_context();
-  switch (ty_) {
-  case VOID_T: return ir::type::get_void_ty(ctx);
-  case INT1_T: return ir::type::get_int1_ty(ctx);
-  case INT8_T: return ir::type::get_int8_ty(ctx);
-  case INT16_T: return ir::type::get_int16_ty(ctx);
-  case INT32_T: return ir::type::get_int32_ty(ctx);
-  case INT64_T: return ir::type::get_int64_ty(ctx);
-  case FLOAT16_T: return ir::type::get_half_ty(ctx);
-  case FLOAT32_T: return ir::type::get_float_ty(ctx);
-  case FLOAT64_T: return ir::type::get_double_ty(ctx);
-  default: throw std::runtime_error("unreachable");
-  }
-}
-
-std::vector<modifier*> typed_declaration_specifier::modifiers() const {
-  return {};
-}
-
-
-ir::type* declaration_modifier::type(ir::module *mod) const {
-  return decl_spec_->type(mod);
-}
-
-std::vector<modifier*> declaration_modifier::modifiers() const {
-  auto result = decl_spec_->modifiers();
-  result.push_back(mod_);
-  return result;
-}
-
-
-/* Parameter */
-ir::type* parameter::type(ir::module *mod) const {
-  return decl_->type(mod, spec_->type(mod), {});
-}
-
-std::vector<modifier*> parameter::modifiers() const {
-  return spec_->modifiers();
-}
-
-const identifier *parameter::id() const {
-  return decl_->id();
-}
-
-/* Declarators */
-ir::type* declarator::type(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const{
-  if(ptr_)
-    return type_impl(mod, ptr_->type(mod, type, storage), storage);
-  return type_impl(mod, type, storage);
-}
-
-// Identifier
-ir::type* identifier::type_impl(ir::module *, ir::type *type, storage_spec_vec_const_ref_t) const{
-  return type;
-}
-
-const std::string &identifier::name() const{
-  return name_;
-}
-
-// Tile
-ir::type* tile::type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t) const{
-  ir::type::tile_shapes_t shapes;
-  for(expression *expr: shapes_->values()){
-    ir::constant_int *shape = dynamic_cast<ir::constant_int*>(expr->codegen(mod));
-    if(shape == nullptr)
-      throw std::runtime_error("tile shapes must be constant expressions");
-    shapes.push_back(shape);
-  }
-  return ir::tile_type::get(type, shapes);
-}
-
-
-// Pointer
-ir::type* pointer::type_impl(ir::module*, ir::type *type, storage_spec_vec_const_ref_t storage) const{
-  auto is_cst = [](modifier* x){ return x->is_cst_space(); };
-  bool is_ptr_to_const = std::find_if(storage.begin(), storage.end(), is_cst) != storage.end();
-  return ir::pointer_type::get(type, is_ptr_to_const?4:1);
-}
-
-// Function
-void function::bind_parameters(ir::module *mod, ir::function *fn) const{
-  std::vector<ir::argument*> args = fn->args();
-  assert(args.size() == args_->values().size());
-  for(size_t i = 0; i < args.size(); i++){
-    parameter *param_i = args_->values().at(i);
-    const identifier *id_i = param_i->id();
-    if(id_i){
-      args[i]->set_name(id_i->name());
-      mod->set_value(id_i->name(), nullptr, args[i]);
-      mod->get_scope().types[id_i->name()] = args[i]->get_type();
-    }
-  }
-}
-
-ir::type* function::type_impl(ir::module* mod, ir::type *type, storage_spec_vec_const_ref_t) const{
-  std::vector<ir::type*> types;
-  for(parameter* param: args_->values())
-    types.push_back(param->type(mod));
-  return ir::function_type::get(type, types);
-}
-
-
-/* Declaration */
-ir::value* declaration::codegen(ir::module* mod) const{
-  for(initializer *init: init_->values())
-    init->set_specifier(spec_);
-  init_->codegen(mod);
-  return nullptr;
-}
-
-/* Initializer */
-ir::type* initializer::type_impl(ir::module *mod, ir::type *type, storage_spec_vec_const_ref_t storage) const{
-  return decl_->type(mod, type, storage);
-}
-
-void initializer::set_specifier(const declaration_specifier *spec) {
-  spec_ = spec;
-}
-
-ir::value* initializer::codegen(ir::module * mod) const{
-  std::vector<modifier*> modifiers = spec_->modifiers();
-  ir::type *ty = decl_->type(mod, spec_->type(mod), modifiers);
-  std::string name = decl_->id()->name();
-  ir::value *value = ir::undef_value::get(ty);
-  auto is_tunable = [](modifier* x){ return x->is_tunable(); };
-  if(std::find_if(modifiers.begin(), modifiers.end(), is_tunable) != modifiers.end()){
-    auto csts = dynamic_cast<list<constant*>*>((node*)expr_);
-    if(csts == nullptr)
-      throw std::runtime_error("must specify constant list for metaparameters");
-    std::vector<unsigned> values;
-    for(constant* cst: csts->values())
-      values.push_back(cst->value());
-    value = ir::metaparameter::create(mod->get_context(), ty, values);
-    mod->register_global(name, value);
-  }
-  else if(expr_){
-    value = expr_->codegen(mod);
-    value = explicit_cast(mod->get_builder(), value, ty->get_scalar_ty());
-    implicit_broadcast(mod, ty, value);
-  }
-  value->set_name(name);
-  // metadata
-  auto is_multiple_of = [](modifier* x){ return x->is_multiple_of(); };
-  auto it = std::find_if(modifiers.begin(), modifiers.end(), is_multiple_of);
-  if(it != modifiers.end())
-    (*it)->add_metadata(mod, name);
-  // register
-  mod->set_value(name, value);
-  mod->get_scope().types[name] = ty;
-  if(auto *x = dynamic_cast<ir::alloc_const*>(value))
-    mod->add_alloc(x);
-  // constants
-  auto is_cst = [](modifier* x){ return x->is_cst(); };
-  if(std::find_if(modifiers.begin(), modifiers.end(), is_cst) != modifiers.end())
-    mod->set_const(name);
-  return value;
-}
-
-/* Type name */
-ir::type *type_name::type(ir::module *mod) const{
-  return decl_->type(mod, spec_->type(mod), {});
-}
-
-/* Storage specifier */
-inline ir::attribute_kind_t get_ir_attr(STORAGE_SPEC_T spec){
-  switch(spec){
-  case RESTRICT_T: return ir::noalias;
-  case READONLY_T: return ir::readonly;
-  case WRITEONLY_T: return ir::writeonly;
-  default: throw std::runtime_error("cannot convert storage specifier to IR function attribute");
-  }
-}
-
-void storage_specifier::add_attr(ir::function* fn, size_t pos) {
-  fn->add_attr(pos, ir::attribute(get_ir_attr(value_)));
-}
-
-void storage_specifier::add_metadata(ir::module*, std::string) {
-  throw std::runtime_error("storage specifier is not a metadata");
-}
-
-/* Alignment specifier */
-void
alignment_specifier::add_attr(ir::function* fn, size_t pos) { - fn->add_attr(pos, ir::attribute(ir::aligned, cst_->value())); -} - -void alignment_specifier::add_metadata(ir::module *mod, std::string name) { - throw std::runtime_error("alignment specifier is not a metadata"); -} - -/* Multiple-Of specifier */ -void multiple_of_specifier::add_attr(ir::function* fn, size_t pos) { - fn->add_attr(pos, ir::attribute(ir::multiple_of, cst_->value())); -} - -void multiple_of_specifier::add_metadata(ir::module *mod, std::string name) { - mod->add_metadata(name, {ir::metadata::multiple_of, cst_->value()}); -} - - -/* Function definition */ -ir::value* function_definition::codegen(ir::module *mod) const{ - ir::function_type *prototype = (ir::function_type*)header_->type(mod, spec_->type(mod), spec_->modifiers()); - const std::string &name = header_->id()->name(); - ir::function *fn = mod->get_or_insert_function(name, prototype); - for(unsigned i = 0; i < header_->get_num_args(); i++){ - parameter *param = header_->get_arg(i); - std::vector modifiers = param->modifiers(); - for(modifier* m: modifiers) - m->add_attr(fn, 1 + i); - } - header_->bind_parameters(mod, fn); - ir::basic_block *entry = ir::basic_block::create(mod->get_context(), "entry", fn); - mod->seal_block(entry); - mod->get_builder().set_insert_point(entry); - body_->codegen(mod); - mod->get_builder().create_ret_void(); - return nullptr; -} - -} - -} diff --git a/lib/lang/wgtcc/encoding.cc b/lib/lang/encoding.cc similarity index 96% rename from lib/lang/wgtcc/encoding.cc rename to lib/lang/encoding.cc index d5d1f99d1..931e4fc30 100644 --- a/lib/lang/wgtcc/encoding.cc +++ b/lib/lang/encoding.cc @@ -1,4 +1,4 @@ -#include "triton/lang/wgtcc/encoding.h" +#include "triton/lang/encoding.h" #include #include diff --git a/lib/lang/wgtcc/error.cc b/lib/lang/error.cc similarity index 94% rename from lib/lang/wgtcc/error.cc rename to lib/lang/error.cc index 618a83181..baf944468 100644 --- a/lib/lang/wgtcc/error.cc +++ b/lib/lang/error.cc @@ -1,7 +1,7 @@ -#include "triton/lang/wgtcc/error.h" +#include "triton/lang/error.h" -#include "triton/lang/wgtcc/ast.h" -#include "triton/lang/wgtcc/token.h" +#include "triton/lang/ast.h" +#include "triton/lang/token.h" #include #include diff --git a/lib/lang/error.cpp b/lib/lang/error.cpp deleted file mode 100644 index 77076fba0..000000000 --- a/lib/lang/error.cpp +++ /dev/null @@ -1,50 +0,0 @@ -#include -#include "triton/lang/error.h" - - -namespace triton{ - -namespace lang{ - -static int current_line = 0; -static int current_column = 0; - -// begin token -void update_location(const char *text) { - for (int i = 0; text[i] != '\0'; i++){ - if (text[i] == '\n'){ - current_column = 0; - current_line++; - } - else if (text[i] == '\t') - current_column += 8 - (current_column % 8); - else - current_column++; - } -} - -void print_error(const char *cerror) { - std::string error(cerror); - auto it = error.find("syntax error,"); - error.replace(it, 13, ""); - std::cerr << "error at line " << current_line << " (column " << current_column << "): " << error << std::endl; - throw std::runtime_error("compilation failed"); -} - -char return_impl(char t, const char * yytext) { - update_location(yytext); - return t; -} - -yytokentype return_impl(yytokentype t, const char * yytext){ - update_location(yytext); - return t; -} - -void return_void(const char * yytext){ - update_location(yytext); -} - -} - -} diff --git a/lib/lang/wgtcc/evaluator.cc b/lib/lang/evaluator.cc similarity index 97% rename from lib/lang/wgtcc/evaluator.cc 
rename to lib/lang/evaluator.cc index 02cb224f9..0123f4239 100644 --- a/lib/lang/wgtcc/evaluator.cc +++ b/lib/lang/evaluator.cc @@ -1,6 +1,6 @@ -#include "triton/lang/wgtcc/evaluator.h" -#include "triton/lang/wgtcc/ast.h" -#include "triton/lang/wgtcc/token.h" +#include "triton/lang/evaluator.h" +#include "triton/lang/ast.h" +#include "triton/lang/token.h" template diff --git a/lib/lang/expression.cpp b/lib/lang/expression.cpp deleted file mode 100644 index 8d5288e8b..000000000 --- a/lib/lang/expression.cpp +++ /dev/null @@ -1,359 +0,0 @@ -#include "triton/lang/expression.h" -#include "triton/lang/declaration.h" -#include "triton/ir/constant.h" -#include "triton/ir/module.h" -#include "triton/ir/builder.h" -#include "triton/ir/type.h" - - -namespace triton{ - -namespace lang{ - - -/* Binary operator */ -ir::value *binary_expression::llvm_op(ir::module *mod, ir::builder &builder, ir::value *lhs, ir::value *rhs, const std::string &name) const -{ - bool is_float = false, is_ptr = false, is_int = false, is_signed = false; - implicit_cast(builder, lhs, rhs, is_float, is_ptr, is_int, is_signed); - implicit_broadcast(mod, lhs, rhs); - if(op_==MUL && is_float) - return builder.create_fmul(lhs, rhs, name); - if(op_==MUL && is_int) - return builder.create_mul(lhs, rhs, name); - if(op_==DIV && is_float) - return builder.create_fdiv(lhs, rhs, name); - if(op_==DIV && is_int && is_signed) - return builder.create_sdiv(lhs, rhs, name); - if(op_==DIV && is_int && !is_signed) - return builder.create_udiv(lhs, rhs, name); - if(op_==MOD && is_float) - return builder.create_frem(lhs, rhs, name); - if(op_==MOD && is_int && is_signed) - return builder.create_srem(lhs, rhs, name); - if(op_==MOD && is_int && !is_signed) - return builder.create_urem(lhs, rhs, name); - if(op_==ADD && is_float) - return builder.create_fadd(lhs, rhs, name); - if(op_==ADD && is_int) - return builder.create_add(lhs, rhs); - if(op_==ADD && is_ptr) - return builder.create_gep(lhs, {rhs}); - if(op_==SUB && is_float) - return builder.create_fsub(lhs, rhs, name); - if(op_==SUB && is_int) - return builder.create_sub(lhs, rhs, name); - if(op_==SUB && is_ptr) - return builder.create_gep(lhs, {builder.create_neg(rhs)}); - if(op_==LEFT_SHIFT) - return builder.create_shl(lhs, rhs, name); - if(op_==RIGHT_SHIFT) - return builder.create_ashr(lhs, rhs, name); - if(op_ == LT && is_float) - return builder.create_fcmpOLT(lhs, rhs, name); - if(op_ == LT && is_int && is_signed) - return builder.create_icmpSLT(lhs, rhs, name); - if(op_ == LT && is_int && !is_signed) - return builder.create_icmpULT(lhs, rhs, name); - if(op_ == GT && is_float) - return builder.create_fcmpOGT(lhs, rhs, name); - if(op_ == GT && is_int && is_signed) - return builder.create_icmpSGT(lhs, rhs, name); - if(op_ == GT && is_int && !is_signed) - return builder.create_icmpUGT(lhs, rhs, name); - if(op_ == LE && is_float) - return builder.create_fcmpOLE(lhs, rhs, name); - if(op_ == LE && is_int && is_signed) - return builder.create_icmpSLE(lhs, rhs, name); - if(op_ == LE && is_int && !is_signed) - return builder.create_icmpULE(lhs, rhs, name); - if(op_ == GE && is_float) - return builder.create_fcmpOGE(lhs, rhs, name); - if(op_ == GE && is_int && is_signed) - return builder.create_icmpSGE(lhs, rhs, name); - if(op_ == GE && is_int && !is_signed) - return builder.create_icmpUGE(lhs, rhs, name); - if(op_ == EQ && is_ptr) - return builder.create_icmpEQ(lhs, rhs, name); - if(op_ == EQ && is_float) - return builder.create_fcmpOEQ(lhs, rhs, name); - if(op_ == EQ && is_int) - return 
builder.create_icmpEQ(lhs, rhs, name); - if(op_ == NE && is_ptr) - return builder.create_icmpNE(lhs, rhs, name); - if(op_ == NE && is_float) - return builder.create_fcmpONE(lhs, rhs, name); - if(op_ == NE && is_int) - return builder.create_icmpNE(lhs, rhs, name); - if(op_ == AND) - return builder.create_and(lhs, rhs, name); - if(op_ == XOR) - return builder.create_xor(lhs, rhs, name); - if(op_ == OR) - return builder.create_or(lhs, rhs, name); - if(op_ == LAND) - return builder.create_and(lhs, rhs, name); - if(op_ == LOR) - return builder.create_or(lhs, rhs, name); - throw std::runtime_error("unreachable"); -} - -ir::value* binary_expression::codegen(ir::module *mod) const{ - ir::value *lhs = lhs_->codegen(mod); - ir::value *rhs = rhs_->codegen(mod); - ir::value *result = llvm_op(mod, mod->get_builder(), lhs, rhs, ""); - return result; -} - -/* Builtin expression */ - -// alloc constant -ir::value* alloc_const_expression::codegen(ir::module *mod) const { - ir::type *ty = spec_->type(mod); - ir::constant_int *size = (ir::constant_int*)size_->codegen(mod); - ir::alloc_const *res = new ir::alloc_const(ty, size); - return res; -} - -// get_program_id -ir::value* get_program_id_expression::codegen(ir::module *mod) const { - return mod->get_builder().create_get_program_id(axis_->value()); -} - -// get_num_program -ir::value* get_num_program_expression::codegen(ir::module *mod) const { - return mod->get_builder().create_get_num_program(axis_->value()); -} - -// atomic cas -ir::value* atomic_cas_expression::codegen(ir::module *mod) const { - ir::value *ptr = ptr_->codegen(mod); - ir::value *cmp = cmp_->codegen(mod); - ir::value *val = val_->codegen(mod); - return mod->get_builder().create_atomic_cas(ptr, cmp, val); -} - -// atomic exch -ir::value* atomic_exch_expression::codegen(ir::module *mod) const { - ir::value *ptr = ptr_->codegen(mod); - ir::value *val = val_->codegen(mod); - return mod->get_builder().create_atomic_exch(ptr, val); -} - -// atomic add -ir::value* atomic_add_expression::codegen(ir::module *mod) const { - ir::value *ptr = ptr_->codegen(mod); - ir::value *val = val_->codegen(mod); - return mod->get_builder().create_atomic_add(ptr, val); -} - -// matmul -ir::value* matmul_expression::codegen(ir::module *mod) const { - ir::value *A = A_->codegen(mod); - ir::value *B = B_->codegen(mod); - ir::value *C = C_->codegen(mod); -// unsigned M = A->get_type()->get_tile_shapes()[0]; -// unsigned N = B->get_type()->get_tile_shapes()[1]; -// ir::type *scalar_ty = A->get_type()->get_scalar_ty(); -// ir::type *tile_ty = ir::tile_type::get(scalar_ty, {M, N}); -// ir::value *tmp = ir::undef_value::get(tile_ty); -// implicit_broadcast(mod, tmp, C); - return mod->get_builder().create_dot(A, B, C); -} - -// reshape -ir::value* reshape_expression::codegen(ir::module *mod) const { - // arg - ir::value *arg = arg_->codegen(mod); - // shapes - ir::type::tile_shapes_t shapes; - for(expression *expr: shapes_->values()){ - ir::constant_int *shape = dynamic_cast(expr->codegen(mod)); - if(shape == nullptr) - throw std::runtime_error("tile shapes must be constant expressions"); - shapes.push_back(shape); - } - // return - return mod->get_builder().create_reshape(arg, shapes); -} - -// min -ir::value* min_expression::codegen(ir::module *mod) const { - ir::value* cmp = binary_expression(LT, (node*)x_, (node*)y_).codegen(mod); - ir::value* x = ((ir::cmp_inst*)cmp)->get_operand(0); - ir::value* y = ((ir::cmp_inst*)cmp)->get_operand(1); - return mod->get_builder().create_select(cmp, x, y); -} - -// max -ir::value* 
max_expression::codegen(ir::module *mod) const {
-  ir::value* cmp = binary_expression(GT, (node*)x_, (node*)y_).codegen(mod);
-  ir::value* x = ((ir::cmp_inst*)cmp)->get_operand(0);
-  ir::value* y = ((ir::cmp_inst*)cmp)->get_operand(1);
-  return mod->get_builder().create_select(cmp, x, y);
-}
-
-// select
-ir::value* select_expression::codegen(ir::module *mod) const {
-  ir::value* pred = pred_->codegen(mod);
-  ir::value* if_value = if_value_->codegen(mod);
-  ir::value* else_value = else_value_->codegen(mod);
-  return mod->get_builder().create_select(pred, if_value, else_value);
-}
-
-// trans
-ir::value* trans_expression::codegen(ir::module *mod) const {
-  // shapes
-  std::vector<ir::constant_int*> perm;
-  if(perm_) {
-    for(expression *expr: perm_->values()){
-      ir::constant_int *shape = dynamic_cast<ir::constant_int*>(expr->codegen(mod));
-      if(shape == nullptr)
-        throw std::runtime_error("tile shapes must be constant expressions");
-      perm.push_back(shape);
-    }
-  }
-  return mod->get_builder().create_trans(arg_->codegen(mod), perm);
-}
-
-// sqrt
-ir::value* sqrt_expression::codegen(ir::module *mod) const {
-  return mod->get_builder().create_sqrt(arg_->codegen(mod));
-}
-
-// reduce
-ir::value* reduce_expression::codegen(ir::module *mod) const {
-  return mod->get_builder().create_reduce(arg_->codegen(mod), axis_->value());
-}
-
-/* Postfix expression */
-ir::value* indexing_expression::codegen(ir::module *mod) const{
-  ir::value *in = lhs_->codegen(mod);
-  const std::vector<slice*> &slices = slices_->values();
-  auto in_shapes = in->get_type()->get_tile_shapes();
-  ir::type::tile_shapes_t::value_type one = ir::tile_type::make_one(mod->get_context());
-  ir::type::tile_shapes_t out_shapes(slices.size());
-  // create shapes
-  size_t current = 0;
-  for(size_t i = 0; i < out_shapes.size(); i++)
-    out_shapes[i] = (slices[i]->type()==NEWAXIS)?one:in_shapes[current++];
-  return mod->get_builder().create_reshape(in, out_shapes);
-}
-
-
-/* Unary operator */
-ir::value *unary_expression::llvm_op(ir::builder &builder, ir::value *arg, const std::string &name) const{
-  ir::type *atype = arg->get_type();
-  bool is_float = atype->is_floating_point_ty();
-  bool is_int = atype->is_integer_ty();
-  if(op_ == INC)
-    return builder.create_add(arg, builder.get_int32(1), name);
-  if(op_ == DEC)
-    return builder.create_sub(arg, builder.get_int32(1), name);
-  if(op_ == PLUS)
-    return arg;
-  if(op_ == MINUS && is_float)
-    return builder.create_fneg(arg, name);
-  if(op_ == MINUS && is_int)
-    return builder.create_neg(arg, name);
-  if(op_ == ADDR)
-    throw std::runtime_error("not supported");
-  if(op_ == DEREF)
-    return builder.create_load(arg, name);
-  if(op_ == COMPL)
-    throw std::runtime_error("not supported");
-  if(op_ == NOT)
-    return builder.create_not(arg, name);
-  throw std::runtime_error("unreachable");
-}
-
-ir::value* unary_expression::codegen(ir::module *mod) const{
-  ir::value *arg = arg_->codegen(mod);
-  ir::value *result = llvm_op(mod->get_builder(), arg, "");
-  return result;
-}
-
-/* Cast operator */
-ir::value *cast_expression::llvm_op(ir::builder &builder, ir::type *T, ir::value *arg, const std::string &name) const{
-  return nullptr;
-}
-
-ir::value* cast_expression::codegen(ir::module *mod) const{
-  ir::value *arg = arg_->codegen(mod);
-  ir::type *T = T_->type(mod);
-  return llvm_op(mod->get_builder(), T, arg, "");
-}
-
-/* Conditional expression */
-ir::value *conditional_expression::codegen(ir::module *mod) const {
-  ir::builder &builder = mod->get_builder();
-  ir::value *mask = cond_->codegen(mod);
-  ir::value *true_value = true_value_->codegen(mod);
-  ir::value *false_value = false_value_->codegen(mod);
-  bool is_float, is_ptr, is_int, is_signed;
-  implicit_cast(builder, true_value, false_value, is_float, is_ptr, is_int, is_signed);
-  implicit_broadcast(mod, mask, true_value);
-  implicit_broadcast(mod, mask, false_value);
-  if(ir::load_inst* load = dynamic_cast<ir::load_inst*>(true_value)){
-    load->erase_from_parent();
-    return builder.create_masked_load(load->get_pointer_operand(), mask, false_value);
-  }
-  if(ir::load_inst* load = dynamic_cast<ir::load_inst*>(false_value)){
-    load->erase_from_parent();
-    return builder.create_masked_load(load->get_pointer_operand(), mask, true_value);
-  }
-  throw std::runtime_error("not implemented");
-}
-
-/* Assignment expression */
-ir::value *assignment_expression::codegen(ir::module *mod) const{
-  ir::value *rvalue = rvalue_->codegen(mod);
-  if(auto *x = dynamic_cast<const named_expression*>(lvalue_)){
-    ir::type *ty = mod->get_scope().types.at(x->id()->name());
-    rvalue = explicit_cast(mod->get_builder(), rvalue, ty);
-    implicit_broadcast(mod, ty, rvalue);
-    mod->set_value(x->id()->name(), rvalue);
-  }
-  else if(auto* x = dynamic_cast<const unary_expression*>(lvalue_)){
-    assert(x->get_op()==DEREF);
-    assert(x->lvalue());
-    ir::value *ptr = x->lvalue()->codegen(mod);
-    rvalue = mod->get_builder().create_store(ptr, rvalue);
-  }
-  return rvalue;
-}
-
-
-/* String literal */
-ir::value* string_literal::codegen(ir::module *) const{
-  throw std::runtime_error("not supported");
-//  return ir::constant_data_array::get_string(mod->get_context(), value_);
-}
-
-/* Constant */
-ir::value* constant::codegen(ir::module *mod) const{
-  return mod->get_builder().get_int32(value_);
-}
-
-int constant::value() const{
-  return value_;
-}
-
-/* Constant range */
-ir::value* constant_range::codegen(ir::module *mod) const{
-  return ir::constant_range::get((ir::constant_int*)first_->codegen(mod),
-                                 (ir::constant_int*)last_->codegen(mod));
-}
-
-/* Named */
-ir::value* named_expression::codegen(ir::module *mod) const{
-  const std::string &name = id()->name();
-  const auto& declarations = mod->get_scope().types;
-  if(declarations.find(name) == declarations.end())
-    throw std::runtime_error("variable " + name + " not declared");
-  return mod->get_value(name);
-}
-
-}
-
-}
diff --git a/lib/lang/module.cpp b/lib/lang/module.cpp
deleted file mode 100644
index 3455ca98f..000000000
--- a/lib/lang/module.cpp
+++ /dev/null
@@ -1,18 +0,0 @@
-#include "triton/lang/module.h"
-#include "triton/ir/module.h"
-
-
-namespace triton{
-
-namespace lang{
-
-/* Translation unit */
-ir::value* translation_unit::codegen(ir::module *mod) const{
-  mod->add_new_scope();
-  decls_.codegen(mod);
-  return nullptr;
-}
-
-}
-
-}
diff --git a/lib/lang/node.cpp b/lib/lang/node.cpp
deleted file mode 100644
index dda7126bd..000000000
--- a/lib/lang/node.cpp
+++ /dev/null
@@ -1,164 +0,0 @@
-#include "triton/lang/node.h"
-#include "triton/ir/builder.h"
-#include "triton/ir/module.h"
-#include "triton/ir/constant.h"
-
-namespace triton{
-
-namespace lang{
-
-/* node */
-ir::value *node::explicit_cast(ir::builder &builder, ir::value *src, ir::type *dst_ty){
-  ir::type *src_scalar_ty = src->get_type()->get_scalar_ty();
-  ir::type *dst_scalar_ty = dst_ty->get_scalar_ty();
-  if(src->get_type()->is_tile_ty())
-    dst_ty = ir::tile_type::get_same_shapes(dst_scalar_ty, src->get_type());
-  bool src_signed = false;
-  bool dst_signed = false;
-  if(src_scalar_ty == dst_scalar_ty)
-    return src;
-  else if(src_scalar_ty->is_integer_ty() && src_signed && dst_scalar_ty->is_floating_point_ty())
-    return
builder.create_si_to_fp(src, dst_ty); - - else if(src_scalar_ty->is_integer_ty() && !src_signed && dst_scalar_ty->is_floating_point_ty()) - return builder.create_ui_to_fp(src, dst_ty); - - else if(src_scalar_ty->is_floating_point_ty() && dst_scalar_ty->is_integer_ty() && dst_signed) - return builder.create_fp_to_si(src, dst_ty); - - else if(src_scalar_ty->is_floating_point_ty() && dst_scalar_ty->is_integer_ty() && !dst_signed) - return builder.create_fp_to_ui(src, dst_ty); - - else if(src_scalar_ty->is_floating_point_ty() && dst_scalar_ty->is_floating_point_ty() && - src_scalar_ty->get_fp_mantissa_width() < dst_scalar_ty->get_fp_mantissa_width()) - return builder.create_fp_ext(src, dst_ty); - - else if(src_scalar_ty->is_floating_point_ty() && dst_scalar_ty->is_floating_point_ty() && - src_scalar_ty->get_fp_mantissa_width() > dst_scalar_ty->get_fp_mantissa_width()) - return builder.create_fp_trunc(src, dst_ty); - - else if(src_scalar_ty->is_integer_ty() && dst_scalar_ty->is_integer_ty() && - src_scalar_ty->get_integer_bitwidth()) - return builder.create_int_cast(src, dst_ty, dst_signed); - - else - throw std::runtime_error("unreachable"); -} - - -void node::implicit_cast(ir::builder &builder, ir::value *&lhs, ir::value *&rhs, - bool &is_float, bool &is_ptr, bool &is_int, bool &is_signed){ - // Input types - ir::type *left_ty = lhs->get_type()->get_scalar_ty(); - ir::type *right_ty = rhs->get_type()->get_scalar_ty(); - // One operand is pointer - if(left_ty->is_pointer_ty() || right_ty->is_pointer_ty()){ - is_ptr = true; - } - // One operand is double - else if(left_ty->is_double_ty() || right_ty->is_double_ty()){ - ir::value *&to_convert = left_ty->is_double_ty()?rhs:lhs; - to_convert = explicit_cast(builder, to_convert, builder.get_double_ty()); - is_float = true; - } - // One operand is float - else if(left_ty->is_float_ty() || right_ty->is_float_ty()){ - ir::value *&to_convert = left_ty->is_float_ty()?rhs:lhs; - to_convert = explicit_cast(builder, to_convert, builder.get_float_ty()); - is_float = true; - } - // One operand is half - else if(left_ty->is_half_ty() || right_ty->is_half_ty()){ - ir::value *&to_convert = left_ty->is_half_ty()?rhs:lhs; - to_convert = explicit_cast(builder, to_convert, builder.get_half_ty()); - is_float = true; - } - // Both operands are integers - else if(left_ty->is_integer_ty() && right_ty->is_integer_ty()){ - is_int = true; - is_signed = true; // always signed for now - if(left_ty->get_integer_bitwidth() != right_ty->get_integer_bitwidth()){ - ir::value *&to_convert = (left_ty->get_integer_bitwidth() > right_ty->get_integer_bitwidth())?rhs:lhs; - ir::type *dst_ty = (to_convert==lhs)?right_ty:left_ty; - to_convert = explicit_cast(builder, to_convert, dst_ty); - } - } - // Not reachable - else - throw std::runtime_error("unreachable"); -} - -void node::implicit_broadcast(ir::module *mod, ir::value *&lhs, ir::value *&rhs) { - ir::type *lhs_ty = lhs->get_type(); - ir::type *rhs_ty = rhs->get_type(); - ir::type *res_ty = nullptr; - if(!lhs_ty->is_tile_ty() && !rhs_ty->is_tile_ty()) - return; - else if(lhs_ty->is_tile_ty() && !rhs_ty->is_tile_ty()) - res_ty = lhs_ty; - else if(!lhs_ty->is_tile_ty() && rhs_ty->is_tile_ty()) - res_ty = rhs_ty; - else{ - auto lhs_shapes = lhs_ty->get_tile_shapes(); - auto rhs_shapes = rhs_ty->get_tile_shapes(); - size_t lhs_size = lhs_shapes.size(); - size_t rhs_size = rhs_shapes.size(); - size_t res_size = std::max(lhs_size, rhs_size); - ir::type::tile_shapes_t res_shapes(res_size); - ir::type::tile_shapes_t::value_type one = 
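
implicit_cast above resolves mixed operand types by rank: pointers are handled separately; otherwise double wins over float, float over half, and integers widen to the larger bitwidth. A toy rank model of that ladder (the enum and its ordering are our illustration, not Triton types):

#include <cstdio>

// Rank order follows the promotion ladder in implicit_cast above.
enum Ty { Int32, Half, Float, Double };

Ty common_type(Ty a, Ty b) { return a > b ? a : b; }

int main() {
  Ty t = common_type(Int32, Float);
  std::printf("%d\n", static_cast<int>(t));   // 2, i.e. Float
}
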
ir::tile_type::make_one(mod->get_context()); - for(size_t i = 0; i < res_size; i++){ - if(i >= res_size - lhs_size && i >= res_size - rhs_size) - res_shapes[i] = lhs_shapes[i]==one?rhs_shapes[i]:lhs_shapes[i]; - else if(i >= res_size - lhs_size) - res_shapes[i] = lhs_shapes[i]; - else if(i >= res_size - rhs_size) - res_shapes[i] = rhs_shapes[i]; - } - res_ty = ir::tile_type::get(lhs_ty->get_scalar_ty(), res_shapes); - } - implicit_broadcast(mod, res_ty, rhs); - implicit_broadcast(mod, res_ty, lhs); -} - -void node::implicit_broadcast(ir::module *mod, ir::type *ty, ir::value *&src){ - ir::builder &builder = mod->get_builder(); - ir::type *src_ty = src->get_type(); - ir::type::tile_shapes_t::value_type one = ir::tile_type::make_one(mod->get_context()); - // Both are scalar - if(!ty->is_tile_ty() && !src_ty->is_tile_ty()) - return; - // Broadcast scalar - if(ty->is_tile_ty() && !src_ty->is_tile_ty()){ - src = builder.create_splat(src, ty->get_tile_shapes()); - return; - } - // Downcast tile - if(!ty->is_tile_ty() && src_ty->is_tile_ty()){ - for(ir::constant *shape: src_ty->get_tile_shapes()) - if(shape != one) - throw std::runtime_error("cannot downcast"); - src = builder.create_downcast(src); - return; - } - // Both are arrays - auto dst_shapes = ty->get_tile_shapes(); - auto src_shapes = src_ty->get_tile_shapes(); - int dst_dim = dst_shapes.size(); - int src_dim = src_shapes.size(); - // Pad - int off = dst_dim - src_dim; - for(int i = 0; i < off; i++) - src_shapes.insert(src_shapes.begin(), one); - if(off > 0) - src = builder.create_reshape(src, src_shapes); - // Broadcast - for(int i = dst_dim - 1; i>= 0; i--) - if(dst_shapes[i] != src_shapes[i] && dst_shapes[i] != one && src_shapes[i] != one) - throw std::runtime_error("cannot broadcast"); - if(dst_shapes != src_shapes) - src = builder.create_broadcast(src, dst_shapes); -} - -} - -} diff --git a/lib/lang/wgtcc/parser.cc b/lib/lang/parser.cc similarity index 99% rename from lib/lang/wgtcc/parser.cc rename to lib/lang/parser.cc index ee8a8a319..35ed63e15 100644 --- a/lib/lang/wgtcc/parser.cc +++ b/lib/lang/parser.cc @@ -1,11 +1,11 @@ -#include "triton/lang/wgtcc/parser.h" +#include "triton/lang/parser.h" -#include "triton/lang/wgtcc/cpp.h" -#include "triton/lang/wgtcc/encoding.h" -#include "triton/lang/wgtcc/error.h" -#include "triton/lang/wgtcc/evaluator.h" -#include "triton/lang/wgtcc/scope.h" -#include "triton/lang/wgtcc/type.h" +#include "triton/lang/cpp.h" +#include "triton/lang/encoding.h" +#include "triton/lang/error.h" +#include "triton/lang/evaluator.h" +#include "triton/lang/scope.h" +#include "triton/lang/type.h" #include #include diff --git a/lib/lang/wgtcc/scanner.cc b/lib/lang/scanner.cc similarity index 99% rename from lib/lang/wgtcc/scanner.cc rename to lib/lang/scanner.cc index 0f0dbdfa0..9c394ecfd 100644 --- a/lib/lang/wgtcc/scanner.cc +++ b/lib/lang/scanner.cc @@ -1,4 +1,4 @@ -#include "triton/lang/wgtcc/scanner.h" +#include "triton/lang/scanner.h" #include #include diff --git a/lib/lang/wgtcc/scope.cc b/lib/lang/scope.cc similarity index 96% rename from lib/lang/wgtcc/scope.cc rename to lib/lang/scope.cc index bc1c6827c..9e487deba 100644 --- a/lib/lang/wgtcc/scope.cc +++ b/lib/lang/scope.cc @@ -1,6 +1,6 @@ -#include "triton/lang/wgtcc/scope.h" +#include "triton/lang/scope.h" -#include "triton/lang/wgtcc/ast.h" +#include "triton/lang/ast.h" #include #include diff --git a/lib/lang/statement.cpp b/lib/lang/statement.cpp deleted file mode 100644 index a768bf7b4..000000000 --- a/lib/lang/statement.cpp +++ /dev/null @@ -1,161 
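
implicit_broadcast above follows the usual right-aligned broadcasting rule: pad the shorter shape with ones on the left, then let an extent of 1 stretch to match the other side, and reject mismatched non-1 extents. A standalone sketch of just the shape computation, assuming unsigned extents (the helper is ours):

#include <algorithm>
#include <iostream>
#include <stdexcept>
#include <vector>

std::vector<unsigned> broadcast_shapes(std::vector<unsigned> a,
                                       std::vector<unsigned> b) {
  size_t n = std::max(a.size(), b.size());
  a.insert(a.begin(), n - a.size(), 1);   // pad on the left with 1s
  b.insert(b.begin(), n - b.size(), 1);
  std::vector<unsigned> out(n);
  for (size_t i = 0; i < n; ++i) {
    if (a[i] != b[i] && a[i] != 1 && b[i] != 1)
      throw std::runtime_error("cannot broadcast");
    out[i] = std::max(a[i], b[i]);        // the non-1 extent wins
  }
  return out;
}

int main() {
  for (unsigned s : broadcast_shapes({16, 1}, {8})) std::cout << s << " ";
  // prints: 16 8
}
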
+0,0 @@ -#include "triton/lang/expression.h" -#include "triton/lang/statement.h" -#include "triton/lang/declaration.h" -#include "triton/ir/constant.h" -#include "triton/ir/module.h" -#include "triton/ir/basic_block.h" -#include "triton/ir/builder.h" -#include "triton/ir/type.h" - -namespace triton{ - -namespace lang{ - -/* Helpers */ -inline bool is_terminator(ir::value* x) { - return x && dynamic_cast(x); -} - - -/* Statements */ -ir::value* compound_statement::codegen(ir::module* mod) const{ - mod->add_new_scope(); - if(items_) - items_->codegen(mod); - mod->pop_scope(); - return nullptr; -} - -/* Expression statement */ -ir::value* expression_statement::codegen(ir::module *mod) const{ - ir::builder &builder = mod->get_builder(); - // get name if applicable - std::string name = ""; - ir::value *current = nullptr; - if(assignment_expression *assignment = dynamic_cast(expr_)) - if(const named_expression* named = dynamic_cast(assignment->lvalue())){ - name = named->id()->name(); - current = mod->get_value(name); - } - // lower expression - ir::value *expr = expr_->codegen(mod); - // modify expression if predicated - if(pred_) { - ir::value *pred = pred_->codegen(mod); - if(!current) - current = ir::undef_value::get(expr->get_type()); - if(auto *x = dynamic_cast(expr)){ - x->erase_from_parent(); - expr = builder.create_masked_load(x->get_pointer_operand(), pred, current); - } - else if(auto *x = dynamic_cast(expr)){ - x->erase_from_parent(); - expr =builder.create_masked_store(x->get_pointer_operand(), x->get_value_operand(), pred); - } - else - expr = builder.create_select(pred, expr, current); - } - // update symbols table - if(!name.empty()) - mod->set_value(name, expr); - return expr; -} - -/* For statement */ -ir::value* iteration_statement::codegen(ir::module *mod) const{ - ir::builder &builder = mod->get_builder(); - ir::context &ctx = mod->get_context(); - ir::basic_block *current_bb = builder.get_insert_block(); - ir::function *fn = current_bb->get_parent(); - ir::basic_block *loop_bb = ir::basic_block::create(ctx, "loop", fn); - ir::basic_block *next_bb = ir::basic_block::create(ctx, "postloop", fn); - mod->set_continue_fn([&](){ - if(exec_) - exec_->codegen(mod); - ir::value *cond = explicit_cast(builder, stop_->codegen(mod), ir::type::get_int1_ty(ctx)); - return builder.create_cond_br(cond, loop_bb, next_bb); - }); - init_->codegen(mod); - ir::value *cond = explicit_cast(builder, stop_->codegen(mod), ir::type::get_int1_ty(ctx)); - builder.create_cond_br(cond, loop_bb, next_bb); -// builder.create_br(loop_bb); - builder.set_insert_point(loop_bb); - if(!is_terminator(statements_->codegen(mod))) - mod->get_continue_fn()(); - ir::basic_block *stop_bb = builder.get_insert_block(); - mod->seal_block(stop_bb); - mod->seal_block(loop_bb); - mod->seal_block(builder.get_insert_block()); - mod->seal_block(next_bb); - builder.set_insert_point(next_bb); - return nullptr; -} - -/* While statement */ -ir::value* while_statement::codegen(ir::module* mod) const{ - ir::builder &builder = mod->get_builder(); - ir::context &ctx = mod->get_context(); - ir::basic_block *current_bb = builder.get_insert_block(); - ir::function *fn = current_bb->get_parent(); - ir::basic_block *loop_bb = ir::basic_block::create(ctx, "loop", fn); - ir::basic_block *next_bb = ir::basic_block::create(ctx, "postloop", fn); - mod->set_continue_fn([&](){ - ir::value *cond = explicit_cast(builder, cond_->codegen(mod), ir::type::get_int1_ty(ctx)); - return builder.create_cond_br(cond, loop_bb, next_bb); - }); - ir::value *cond 
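
A predicated expression statement above lowers to a masked load, a masked store, or a select against the destination's previous value. A scalar model of that last fallback (the function is ours; `current` stands for the prior value, or undef when none exists):

#include <iostream>

// Under a predicate, the statement's result falls back to the previous
// value wherever the predicate is false: select(pred, expr, current).
int predicated_assign(bool pred, int expr, int current) {
  return pred ? expr : current;
}

int main() {
  int x = 5;
  x = predicated_assign(false, 42, x);
  std::cout << x << "\n";   // still 5: the predicate was false
}
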
= explicit_cast(builder, cond_->codegen(mod), ir::type::get_int1_ty(ctx)); - builder.create_cond_br(cond, loop_bb, next_bb); - builder.set_insert_point(loop_bb); - if(!is_terminator(statements_->codegen(mod))) - mod->get_continue_fn()(); - ir::basic_block *stop_bb = builder.get_insert_block(); - mod->seal_block(stop_bb); - mod->seal_block(loop_bb); - mod->seal_block(builder.get_insert_block()); - mod->seal_block(next_bb); - builder.set_insert_point(next_bb); - return nullptr; -} - -/* Selection statement */ -ir::value* selection_statement::codegen(ir::module* mod) const{ - ir::builder &builder = mod->get_builder(); - ir::context &ctx = mod->get_context(); - ir::function *fn = builder.get_insert_block()->get_parent(); - ir::value *cond = cond_->codegen(mod); - ir::basic_block *then_bb = ir::basic_block::create(ctx, "then", fn); - ir::basic_block *else_bb = else_value_?ir::basic_block::create(ctx, "else", fn):nullptr; - ir::basic_block *endif_bb = ir::basic_block::create(ctx, "endif", fn); - mod->seal_block(then_bb); - if(else_value_) - mod->seal_block(else_bb); - - // Branch - if(else_value_) - builder.create_cond_br(cond, then_bb, else_bb); - else - builder.create_cond_br(cond, then_bb, endif_bb); - // Then - builder.set_insert_point(then_bb); - if(!is_terminator(then_value_->codegen(mod))) - builder.create_br(endif_bb); - // Else - if(else_value_){ - builder.set_insert_point(else_bb); - if(!is_terminator(else_value_->codegen(mod))) - builder.create_br(endif_bb); - } - // Endif - mod->seal_block(endif_bb); - builder.set_insert_point(endif_bb); - return nullptr; -} - -/* Continue statement */ -ir::value* continue_statement::codegen(ir::module *mod) const{ - return mod->get_continue_fn()(); -} - -} - -} diff --git a/lib/lang/wgtcc/token.cc b/lib/lang/token.cc similarity index 98% rename from lib/lang/wgtcc/token.cc rename to lib/lang/token.cc index ba588588e..5445b2044 100644 --- a/lib/lang/wgtcc/token.cc +++ b/lib/lang/token.cc @@ -1,7 +1,7 @@ -#include "triton/lang/wgtcc/token.h" +#include "triton/lang/token.h" -#include "triton/lang/wgtcc/mem_pool.h" -#include "triton/lang/wgtcc/parser.h" +#include "triton/lang/mem_pool.h" +#include "triton/lang/parser.h" static MemPoolImp tokenPool; diff --git a/lib/lang/wgtcc/type.cc b/lib/lang/type.cc similarity index 98% rename from lib/lang/wgtcc/type.cc rename to lib/lang/type.cc index 25d5c56ce..a1564ad97 100644 --- a/lib/lang/wgtcc/type.cc +++ b/lib/lang/type.cc @@ -1,8 +1,8 @@ -#include "triton/lang/wgtcc/type.h" +#include "triton/lang/type.h" -#include "triton/lang/wgtcc/ast.h" -#include "triton/lang/wgtcc/scope.h" -#include "triton/lang/wgtcc/token.h" +#include "triton/lang/ast.h" +#include "triton/lang/scope.h" +#include "triton/lang/token.h" #include #include diff --git a/lib/lang/wgtcc/main.cc b/lib/lang/wgtcc/main.cc deleted file mode 100644 index cc02588f6..000000000 --- a/lib/lang/wgtcc/main.cc +++ /dev/null @@ -1,30 +0,0 @@ -#include "triton/lang/wgtcc/code_gen.h" -#include "triton/lang/wgtcc/cpp.h" -#include "triton/lang/wgtcc/error.h" -#include "triton/lang/wgtcc/parser.h" -#include "triton/lang/wgtcc/scanner.h" - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - - -std::string program; -std::string filename_in; -std::string filename_out; -bool debug = false; -static bool only_preprocess = false; -static bool only_compile = false; -static bool specified_out_name = false; -static std::list filenames_in; -static std::list gcc_filenames_in; -static std::list gcc_args; -static std::list defines; 
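
Both loop forms above rely on the continue-function protocol: the enclosing loop registers a closure that emits its own back-edge branch, and continue_statement::codegen simply invokes it. A minimal sketch of the pattern, assuming std::function storage (the struct below is ours, not the real ir::module):

#include <functional>
#include <iostream>

struct module_ctx {
  std::function<int()> continue_fn;
  void set_continue_fn(std::function<int()> f) { continue_fn = f; }
  std::function<int()> get_continue_fn() { return continue_fn; }
};

int main() {
  module_ctx mod;
  mod.set_continue_fn([] {
    std::cout << "emit cond_br back to the loop header\n";
    return 0;   // stands in for the created branch instruction
  });
  mod.get_continue_fn()();   // what continue_statement::codegen does
}
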
-static std::list include_paths;

diff --git a/lib/runtime/function.cpp b/lib/runtime/function.cpp
index 36bbc2100..11279b571 100644
--- a/lib/runtime/function.cpp
+++ b/lib/runtime/function.cpp
@@ -5,10 +5,9 @@
 #include
 #include "triton/codegen/selection/selection.h"
 #include "triton/runtime/function.h"
-#include "triton/lang/lang.h"
-#include "triton/lang/wgtcc/cpp.h"
-#include "triton/lang/wgtcc/parser.h"
-#include "triton/lang/wgtcc/code_gen.h"
+#include "triton/lang/cpp.h"
+#include "triton/lang/parser.h"
+#include "triton/lang/code_gen.h"
 #include "triton/driver/device.h"
 #include "triton/driver/stream.h"
 #include "triton/driver/kernel.h"
@@ -19,16 +18,11 @@
 #include "llvm/IR/Module.h"

-typedef struct yy_buffer_state * YY_BUFFER_STATE;
-extern int yyparse();
-extern YY_BUFFER_STATE yy_scan_string(const char * str);
-extern void yy_delete_buffer(YY_BUFFER_STATE buffer);
-extern triton::lang::translation_unit *ast_root;
+

 namespace triton{
 namespace runtime {
-
 // helpers
 void _parallel_loop_nest(std::vector const & ranges,
                          std::function const &)> const & f,
@@ -148,7 +142,6 @@ function::caller function::autotune(driver::stream* stream, const grid_fn_ty& gr
   opt.num_warps = std::stoi(params[i++]);
   for(auto it: opt_space_.defines)
     opt.defines[it.first] = params[i++];
-
   // pre-process
   TokenSequence tokens;
   Preprocessor cpp(&src_, true);
@@ -241,9 +234,7 @@ void function::operator()(const std::vector& args, const grid_fn_ty& grid_f
   }
   /* re-tune and re-compile */
-  caller call = autotune(stream, grid_fn, args);
-  cache_.insert({key, call});
-
+  cache_.insert({key, autotune(stream, grid_fn, args)});
 }

 void function::operator()(const std::vector& args, const grid_t& grid, driver::stream *stream) {

From c9371c7234c8d89d4fb818dad172781b3658f50e Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Fri, 23 Aug 2019 17:31:50 -0700
Subject: [PATCH 317/494] [general] error messages no longer depend on a program name

---
 lib/lang/error.cc | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/lib/lang/error.cc b/lib/lang/error.cc
index baf944468..ddda6b6ce 100644
--- a/lib/lang/error.cc
+++ b/lib/lang/error.cc
@@ -18,13 +18,9 @@
 #define ANSI_COLOR_RESET "\x1b[0m"

-extern std::string program;
-
-
 void Error(const char* format, ...)
 {
   fprintf(stderr,
-          "%s: " ANSI_COLOR_RED "error: " ANSI_COLOR_RESET,
-          program.c_str());
+          ANSI_COLOR_RED "error: " ANSI_COLOR_RESET);

   va_list args;
   va_start(args, format);

From a110a7e8cffd6c09d595c1a5b19b4ec8a993fa03 Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Fri, 23 Aug 2019 17:49:21 -0700
Subject: [PATCH 318/494] [ir] changed type of tile shapes from constant_int* to int

---
 include/triton/ir/type.h                  |  5 +--
 lib/codegen/analysis/alignment.cpp        |  8 ++---
 lib/codegen/analysis/shmem/allocation.cpp |  4 +--
 lib/codegen/analysis/shmem/info.cpp       |  2 +-
 lib/codegen/analysis/tune.cpp             | 24 ++++++---------
 lib/codegen/selection/selection.cpp       | 37 +++++++++++------------
 lib/codegen/transform/peephole.cpp        |  3 +-
 lib/ir/constant.cpp                       |  2 +-
 lib/ir/instructions.cpp                   |  3 +-
 lib/ir/type.cpp                           | 11 ++-----
 lib/lang/code_gen.cc                      |  3 +-
 11 files changed, 41 insertions(+), 61 deletions(-)

diff --git a/include/triton/ir/type.h b/include/triton/ir/type.h
index 780d79e74..786a196dc 100644
--- a/include/triton/ir/type.h
+++ b/include/triton/ir/type.h
@@ -14,7 +14,7 @@ class constant_int;
 /* Type */
 class type {
 public:
-  typedef std::vector tile_shapes_t;
+  typedef std::vector tile_shapes_t;

 protected:
   typedef std::vector contained_tys_vec_t;
@@ -152,9 +152,6 @@ public:
   static tile_type* get(type *ty, const tile_shapes_t &shapes);
   static tile_type* get_same_shapes(type *ty, type *ref);

-  // shortcut to get a 1 element in the shape
-  static tile_shapes_t::value_type make_one(context &ctx);
-
 private:
   tile_shapes_t shapes_;
 };

diff --git a/lib/codegen/analysis/alignment.cpp b/lib/codegen/analysis/alignment.cpp
index 6383ed850..69cf3479c 100644
--- a/lib/codegen/analysis/alignment.cpp
+++ b/lib/codegen/analysis/alignment.cpp
@@ -30,7 +30,7 @@ inline T add_to_cache(ir::value *i, T value, std::map &map) {

 bool alignment_info::is_first_axis_unit(ir::value *x){
   if(x->get_type()->is_tile_ty())
-    return x->get_type()->get_tile_shapes()[0]->get_value() == 1;
+    return x->get_type()->get_tile_shapes()[0] == 1;
   else
     return true;
 }
@@ -47,7 +47,7 @@ alignment_info::cst_info alignment_info::populate_is_constant(ir::value *v) {
     ir::value *op = x->get_operand(0);
     auto op_cst = populate_is_constant(op);
     if(is_first_axis_unit(op)){
-      unsigned num_cst = x->get_type()->get_tile_shapes()[0]->get_value();
+      unsigned num_cst = x->get_type()->get_tile_shapes()[0];
       return cache({num_cst, op_cst.value});
     }
   }
@@ -111,7 +111,7 @@ unsigned alignment_info::populate_max_contiguous(ir::value *v){
     return cache(1);
   auto shapes = v->get_type()->get_tile_shapes();
   if(dynamic_cast(v))
-    return cache(shapes[0]->get_value());
+    return cache(shapes[0]);
   if(auto *x = dynamic_cast(v)){
     ir::value *op = x->get_operand(0);
     if(op->get_type()->is_tile_ty()){
@@ -265,7 +265,7 @@ unsigned alignment_info::populate_starting_multiple(ir::value *v){
     auto shapes = v->get_type()->get_tile_shapes();
     unsigned result = 1;
     for(unsigned i = 0; i < shapes.size() - 1; i++)
-      result *= shapes[i]->get_value();
+      result *= shapes[i];
     return cache(result);
 }

diff --git a/lib/codegen/analysis/shmem/allocation.cpp b/lib/codegen/analysis/shmem/allocation.cpp
index 00e90d4a6..1061c0425 100644
--- a/lib/codegen/analysis/shmem/allocation.cpp
+++ b/lib/codegen/analysis/shmem/allocation.cpp
@@ -55,7 +55,7 @@ unsigned allocation::get_num_bytes(ir::value *x) {
     shapes.erase(shapes.begin() + axis);
     size_t num_elements = 1;
     for(auto x: shapes)
-      num_elements *= x->get_value();
+      num_elements *= x;
     size_t depth;
     if(params_->get_fragment(x, 0) == grids::HMMA_FRAGMENT_C)
       depth =
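
Collecting patch 317's error.cc change into a compilable form: Error() now prints a bare, colored "error:" prefix with no program name. Everything below is from the diff above except the red escape value and the trailing newline, which are our assumptions:

#include <cstdarg>
#include <cstdio>

#define ANSI_COLOR_RED   "\x1b[31m"   // assumed value; the reset is from the diff
#define ANSI_COLOR_RESET "\x1b[0m"

void Error(const char* format, ...) {
  fprintf(stderr, ANSI_COLOR_RED "error: " ANSI_COLOR_RESET);
  va_list args;
  va_start(args, format);
  vfprintf(stderr, format, args);
  va_end(args);
  fprintf(stderr, "\n");   // assumption: the real helper also ends the line
}

int main() { Error("expected '%s'", ";"); }
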
params_->get_param(op, "wpt.d" + std::to_string(axis))->get_value(); @@ -66,7 +66,7 @@ unsigned allocation::get_num_bytes(ir::value *x) { unsigned num_bytes = x->get_type()->get_primitive_size_in_bits() / 8; unsigned pad = is_ld_padded(x); if(pad > 0){ - unsigned ld = x->get_type()->get_tile_shapes()[0]->get_value(); + unsigned ld = x->get_type()->get_tile_shapes()[0]; num_bytes += pad * num_bytes / ld; } if(buffer_info_->is_double(x)) diff --git a/lib/codegen/analysis/shmem/info.cpp b/lib/codegen/analysis/shmem/info.cpp index 8f0dac32c..d16048d3b 100644 --- a/lib/codegen/analysis/shmem/info.cpp +++ b/lib/codegen/analysis/shmem/info.cpp @@ -79,7 +79,7 @@ void info::run(ir::module &mod) { for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i: block->get_inst_list()){ if(dynamic_cast(i)) - if(i->get_operand(1)->get_type()->get_tile_shapes()[1]->get_value() != 1){ + if(i->get_operand(1)->get_type()->get_tile_shapes()[1] != 1){ add_copy(i->get_operand(0), builder); add_copy(i->get_operand(1), builder); } diff --git a/lib/codegen/analysis/tune.cpp b/lib/codegen/analysis/tune.cpp index 9e6c499a2..5f150ee2c 100644 --- a/lib/codegen/analysis/tune.cpp +++ b/lib/codegen/analysis/tune.cpp @@ -52,7 +52,6 @@ void grids::init_c_phi(ir::instruction *v) { void grids::init_c_graph(ir::instruction *v) { // Reference shape - ir::type::tile_shapes_t::value_type one = ir::tile_type::make_one(v->get_parent()->get_context()); ir::type::tile_shapes_t shapes; if(auto *store = dynamic_cast(v)) shapes = store->get_pointer_operand()->get_type()->get_tile_shapes(); @@ -80,7 +79,7 @@ void grids::init_c_graph(ir::instruction *v) { unsigned current = 0; bool is_skewed = false; for(unsigned i = 0; i < shapes.size(); i ++){ - bool is_one = shapes[i] == one; + bool is_one = shapes[i] == 1; bool is_same = shapes[i] == op->get_type()->get_tile_shapes()[current]; if(is_one){ static_params_.insert({{v, i}, 1}); @@ -123,7 +122,7 @@ void grids::init_c_graph(ir::instruction *v) { for(unsigned i = 0; i < shapes.size(); i++) add_constraint({v, i}, {D, i}); for(unsigned i = 2; i < shapes.size(); i++){ - if(shapes[i] == one) + if(shapes[i] == 1) static_params_.insert({{v, i}, 1}); add_constraint({v, i}, {A, i}); add_constraint({v, i}, {B, i}); @@ -169,11 +168,6 @@ void grids::connected_components(node_t x, const std::vectorget_type(); - if(ty->is_tile_ty()){ - ir::type::tile_shapes_t::value_type shape = ty->get_tile_shapes().at(x.second); - if(auto mp = dynamic_cast(shape)) - params_[x.first].insert({"shape" + suffix, mp}); - } if(static_params_.find(x) != static_params_.end()){ for(ir::metaparameter *mp: mps) mp->set_value(static_params_.at(x)); @@ -245,8 +239,8 @@ void grids::run(ir::module &mod) { if(!i->get_type()->is_tile_ty()) continue; auto shapes = i->get_type()->get_tile_shapes(); - unsigned shape_0 = shapes[0]->get_value(); - unsigned shape_1 = shapes[1]->get_value(); + unsigned shape_0 = shapes[0]; + unsigned shape_1 = shapes[1]; unsigned size = i->get_type()->get_tile_num_elements(); /* HMMA parameters*/ if(fragments_.at({i, 0}) == HMMA_FRAGMENT_C){ @@ -293,14 +287,14 @@ void grids::run(ir::module &mod) { /* Scan-line */ else{ - unsigned shape = shapes[0]->get_value(); + unsigned shape = shapes[0]; unsigned current = num_threads; params_.at(i).at("nts.d0")->set_value(clamp(size / num_threads, 1, 8)); params_.at(i).at("mts.d0")->set_value(clamp(current, 1, shape / params_.at(i).at("nts.d0")->get_value())); current = current / params_.at(i).at("mts.d0")->get_value(); for(size_t d = 1; d < shapes.size(); d++){ 
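
The padding logic above grows a shared buffer proportionally when the leading dimension is padded. A sketch of just that arithmetic, with illustrative numbers (the helper is ours):

#include <iostream>

// Padding the leading dimension by `pad` elements grows the byte count
// by pad/ld of itself, as in allocation::get_num_bytes above.
unsigned padded_num_bytes(unsigned num_bytes, unsigned ld, unsigned pad) {
  if (pad > 0)
    num_bytes += pad * num_bytes / ld;
  return num_bytes;
}

int main() {
  // 32 four-byte elements per row, leading dimension 32, padded by 4
  std::cout << padded_num_bytes(32 * 4, 32, 4) << "\n";   // 144
}
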
std::string str_d = std::to_string(d); - shape = shapes[d]->get_value(); + shape = shapes[d]; params_.at(i).at("nts.d" + str_d)->set_value(1); params_.at(i).at("mts.d" + str_d)->set_value(clamp(current, 1, shape)); current = current / params_.at(i).at("mts.d" + str_d)->get_value(); @@ -324,8 +318,8 @@ void grids::create_grids(std::vector &grids, // get number of dimensions greater than 1 auto get_tile_gt1_dim = [&](ir::value *v){ unsigned result = 0; - for(ir::constant_int* shape: v->get_type()->get_tile_shapes()) { - result += (shape->get_value() > 1)?shape->get_value():0; + for(auto shape: v->get_type()->get_tile_shapes()) { + result += (shape > 1)? shape : 0; } return result; }; @@ -343,7 +337,7 @@ void grids::create_grids(std::vector &grids, // bind const auto& shapes = v->get_type()->get_tile_shapes(); for(size_t d = 0; d < shapes.size(); d++){ - if(shapes[d]->get_value() == 1) + if(shapes[d] == 1) continue; unsigned x = get_param_group(v, d); ir::value *&r = references[x]; diff --git a/lib/codegen/selection/selection.cpp b/lib/codegen/selection/selection.cpp index 99b18e568..5059a3130 100644 --- a/lib/codegen/selection/selection.cpp +++ b/lib/codegen/selection/selection.cpp @@ -594,7 +594,7 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id Value *thread_id = builder.CreateAdd(thread_id_in_warp[k], builder.CreateMul(warp_id[k], warp_size_k)); Value *scaled_thread_id = builder.CreateMul(thread_id, contiguous_k); unsigned per_block = contiguous[k] * warp_size[k] * n_warps[k]; - unsigned per_thread = contiguous[k] * shapes[k]->get_value() / per_block; + unsigned per_thread = contiguous[k] * shapes[k] / per_block; std::vector idx_list(per_thread); for(unsigned n = 0 ; n < per_thread; n++){ unsigned offset = n / contiguous[k] * per_block + n % contiguous[k]; @@ -631,9 +631,9 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id unsigned hmma_bts_1 = hmma_wts_1 * wpt_1; unsigned hmma_bts_2 = is_batched ? hmma_wts_2 * wpt_2 : 1; // number of repetition - unsigned num_rep_0 = shapes[0]->get_value() / hmma_bts_0; - unsigned num_rep_1 = shapes[1]->get_value() / hmma_bts_1; - unsigned num_rep_2 = is_batched ? shapes[2]->get_value() / hmma_bts_2 : 1; + unsigned num_rep_0 = shapes[0] / hmma_bts_0; + unsigned num_rep_1 = shapes[1] / hmma_bts_1; + unsigned num_rep_2 = is_batched ? shapes[2] / hmma_bts_2 : 1; // size of each pack (interleaving) pack_size_0_ = std::min(num_rep_0, 1); pack_size_1_ = std::min(num_rep_1, 1); @@ -715,8 +715,8 @@ void selection::create_grids(std::vector &grids, // get number of dimensions greater than 1 auto get_tile_gt1_dim = [&](ir::value *v){ unsigned result = 0; - for(ir::constant_int* shape: v->get_type()->get_tile_shapes()) { - result += (shape->get_value() > 1)?shape->get_value():0; + for(auto shape: v->get_type()->get_tile_shapes()) { + result += (shape > 1)? 
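
The scan-line branch above splits each dimension between values-per-thread (nts) and threads-per-dimension (mts), clamping both against the shape and the remaining thread budget. A standalone rerun of the d0 step with made-up sizes (clamp mirrors the helper used by grids::run; all numbers are illustrative):

#include <algorithm>
#include <cstdio>

unsigned clamp(unsigned x, unsigned lo, unsigned hi) {
  return std::min(std::max(x, lo), hi);
}

int main() {
  unsigned num_threads = 128, size = 4096, shape0 = 128;
  unsigned nts = clamp(size / num_threads, 1, 8);       // values per thread
  unsigned mts = clamp(num_threads, 1, shape0 / nts);   // threads along d0
  std::printf("nts.d0=%u mts.d0=%u\n", nts, mts);       // nts.d0=8 mts.d0=16
}
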
shape : 0; } return result; }; @@ -736,7 +736,7 @@ void selection::create_grids(std::vector &grids, if(buffer_info_->is_shared(v)) return; for(size_t d = 0; d < shapes.size(); d++){ - if(shapes[d]->get_value() == 1) + if(shapes[d] == 1) continue; unsigned x = params_->get_param_group(v, d); ir::value *&r = references[x]; @@ -771,10 +771,7 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, for(ir::value *op: user->ops()) create_tile(op, builder, references, seen, sh_mem_ptr); LLVMContext &ctx = builder.getContext(); - const auto& cshapes = v->get_type()->get_tile_shapes(); - std::vector shapes; - for(ir::constant_int* shape: cshapes) - shapes.push_back(shape->get_value()); + auto shapes = v->get_type()->get_tile_shapes(); unsigned pad = alloc_->is_ld_padded(v); if(pad > 0) shapes[0] += pad; @@ -819,10 +816,10 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, } // create distributed tile else { - const auto &cshapes = v->get_type()->get_tile_shapes(); - std::vector axes(cshapes.size()); - for(size_t d = 0; d < cshapes.size(); d++){ - if(cshapes[d]->get_value() > 1){ + const auto &shapes = v->get_type()->get_tile_shapes(); + std::vector axes(shapes.size()); + for(size_t d = 0; d < shapes.size(); d++){ + if(shapes[d] > 1){ unsigned x = params_->get_param_group(v, d); axes[d] = axes_.at(x); } @@ -1037,7 +1034,7 @@ void selection::lower_broadcast(ir::broadcast_inst *x, LLVMContext &ctx, Functio result->for_each([&](indices_t out_idx){ indices_t in_idx = out_idx; for(size_t k = 0; k < in_idx.size(); k++){ - if(in_shapes[k]->get_value() == 1) + if(in_shapes[k] == 1) in_idx[k] = builder.getInt32(0); } result->set_value(out_idx, in_tile->get_value(in_idx)); @@ -1140,7 +1137,7 @@ void selection::lower_hmma_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn unsigned wpt_1 = params_->get_param(dot, "wpt.d1")->get_value(); unsigned stride_rep_i = wpt_0 * wts_0; unsigned stride_rep_j = wpt_1 * wts_1; - unsigned num_rep_i = shapes[0]->get_value() / stride_rep_i; + unsigned num_rep_i = shapes[0] / stride_rep_i; unsigned ld_fc = num_rep_i * 2; @@ -1273,7 +1270,7 @@ void selection::lower_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn, IRB Function *f_mul_add = Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {c_ty}); auto A_shapes = A->get_type()->get_tile_shapes(); size_t red_axis = dot->is_a_trans() ? 
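
The HMMA bookkeeping above first sizes a block tile per dimension (within-warp tile times warps-per-tile) and then derives how many repetitions of that footprint cover the full shape. A sketch of that division, assuming the shape divides evenly (names are ours):

#include <cstdio>

// bts = block tile size in one dimension; the kernel repeats it
// shape / bts times, as in the num_rep_* computations above.
unsigned num_rep(unsigned shape, unsigned wts, unsigned wpt) {
  unsigned bts = wts * wpt;   // within-warp tile times warps per tile
  return shape / bts;         // assumes shape is a multiple of bts
}

int main() {
  std::printf("%u\n", num_rep(64, 8, 4));   // 2
}
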
0 : 1; - unsigned NK = A_shapes[red_axis]->get_value(); + unsigned NK = A_shapes[red_axis]; if(NK != 1) { shared_tile *TA = (shared_tile*)tmap_.at(A); @@ -1463,8 +1460,8 @@ inline llvm::Attribute llvm_attr(llvm::LLVMContext& ctx, ir::attribute attr) { ArrayType* selection::llvm_linearized_tile_type(ir::type *ty, LLVMContext &ctx) { unsigned size = 1; - for(ir::constant_int* shape: ty->get_tile_shapes()) - size *= shape->get_value(); + for(auto shape: ty->get_tile_shapes()) + size *= shape; return ArrayType::get(llvm_type(ty->get_scalar_ty(), ctx), size); } diff --git a/lib/codegen/transform/peephole.cpp b/lib/codegen/transform/peephole.cpp index 114eda0de..f6f5e3c00 100644 --- a/lib/codegen/transform/peephole.cpp +++ b/lib/codegen/transform/peephole.cpp @@ -196,10 +196,9 @@ bool peephole::rewrite_unit_red(ir::instruction *value, ir::builder& builder){ auto x = dynamic_cast(value); if(!x) return false; - ir::constant_int *one = ir::constant_int::get(ir::type::get_int32_ty(value->get_type()->get_context()), 1); ir::value *arg = x->get_operand(0); auto shapes = arg->get_type()->get_tile_shapes(); - if(shapes[x->get_axis()] == one){ + if(shapes[x->get_axis()] == 1){ builder.set_insert_point(x); ir::value* new_red = builder.create_reshape(arg, x->get_type()->get_tile_shapes()); x->replace_all_uses_with(new_red); diff --git a/lib/ir/constant.cpp b/lib/ir/constant.cpp index 883df9967..6df5a58cd 100644 --- a/lib/ir/constant.cpp +++ b/lib/ir/constant.cpp @@ -62,7 +62,7 @@ constant *constant_range::get(constant_int *first, constant_int *last) { assert(first->get_type()->is_integer_ty()); assert(first->get_type() == last->get_type()); assert(((constant_int*)first)->get_value() == 0); - type *ty = tile_type::get(first->get_type(), {last}); + type *ty = tile_type::get(first->get_type(), {(unsigned)last->get_value()}); return new constant_range(ty, first, last); } diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cpp index bd06668e6..3d911b967 100644 --- a/lib/ir/instructions.cpp +++ b/lib/ir/instructions.cpp @@ -486,8 +486,7 @@ std::string retile_inst::shape_suffix(ir::type* ty){ std::string res = "["; const auto& shapes = ty->get_tile_shapes(); for(unsigned i = 0; i < shapes.size(); i++){ - ir::constant_int *shape_i = ty->get_tile_shapes()[i]; - res += shape_i->repr(); + res += std::to_string(ty->get_tile_shapes()[i]); if(i < shapes.size() - 1) res += ", "; } diff --git a/lib/ir/type.cpp b/lib/ir/type.cpp index e07782ffd..aa3d9aa46 100644 --- a/lib/ir/type.cpp +++ b/lib/ir/type.cpp @@ -76,8 +76,8 @@ const type::tile_shapes_t &type::get_tile_shapes() const { unsigned type::get_tile_num_elements() const { const tile_shapes_t& shapes = get_tile_shapes(); unsigned result = 1; - for(ir::constant_int *x: shapes) - result *= x->get_value(); + for(auto shape: shapes) + result *= shape; return result; } @@ -173,7 +173,7 @@ bool tile_type::is_valid_elt_ty(type *ty) { unsigned tile_type::get_num_elements() const { unsigned res = 1; for(auto shape: shapes_) - res *= shape->get_value(); + res *= shape; return res; } @@ -198,11 +198,6 @@ tile_type* tile_type::get_same_shapes(type *ty, type *ref){ return get(ty, ref->get_tile_shapes()); } -type::tile_shapes_t::value_type tile_type::make_one(ir::context& ctx){ - return constant_int::get(type::get_int32_ty(ctx), 1); -} - - //===----------------------------------------------------------------------===// // function_type class //===----------------------------------------------------------------------===// diff --git a/lib/lang/code_gen.cc 
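
With shapes now plain integers, the element-count walks above (get_tile_num_elements, llvm_linearized_tile_type) reduce to a simple fold. An equivalent one-liner for reference (the helper is ours):

#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

// Product of all tile extents, matching the loops replaced above.
unsigned num_elements(const std::vector<unsigned>& shapes) {
  return std::accumulate(shapes.begin(), shapes.end(), 1u,
                         std::multiplies<unsigned>());
}

int main() {
  std::cout << num_elements({16, 16, 4}) << "\n";   // 1024
}
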
b/lib/lang/code_gen.cc index cfdeee1f6..17b24d80d 100644 --- a/lib/lang/code_gen.cc +++ b/lib/lang/code_gen.cc @@ -489,9 +489,8 @@ ir::type* Generator::GenIRTileType(TileType* type, ir::context& ctx) { ir::type* ele_ty = GenIRType(type->Derived().GetPtr(), ctx); auto _shape = type->Shape(); ir::tile_type::tile_shapes_t shape; - ir::type* int32_ty = ir::type::get_int32_ty(ctx); for(int s: _shape) - shape.push_back(ir::constant_int::get(int32_ty, s)); + shape.push_back(static_cast(s)); return ir::tile_type::get(ele_ty, shape); } From 606e7999489960f139473030f59f1aaeb80b56c4 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 23 Aug 2019 17:56:30 -0700 Subject: [PATCH 319/494] [LICENSING] updated license to incorporate credit for wgtcc --- LICENSE | 4 ++++ include/triton/runtime/function.h | 5 +---- include/triton/runtime/launch_info.h | 17 ----------------- 3 files changed, 5 insertions(+), 21 deletions(-) delete mode 100644 include/triton/runtime/launch_info.h diff --git a/LICENSE b/LICENSE index 4ea9858e9..464fb143d 100755 --- a/LICENSE +++ b/LICENSE @@ -20,3 +20,7 @@ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +// The compiler front-end is based on a modified version of WGTCC +// https://github.com/wgtdkp/wgtcc +// Copyright (c) 2016 wgtdkp \ No newline at end of file diff --git a/include/triton/runtime/function.h b/include/triton/runtime/function.h index f30cdabfd..3f7058a9f 100644 --- a/include/triton/runtime/function.h +++ b/include/triton/runtime/function.h @@ -116,13 +116,10 @@ public: std::string make_tensorflow_src(const std::vector &outputs, const std::string ¯o); private: - // execution context ir::context ctx_; - // program representations std::string src_; - std::map cache_; - // options options_space_t opt_space_; + std::map cache_; }; } diff --git a/include/triton/runtime/launch_info.h b/include/triton/runtime/launch_info.h deleted file mode 100644 index 995ed09f4..000000000 --- a/include/triton/runtime/launch_info.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef TRITON_INCLUDE_RUNTIME_LAUNCH_INFO_H -#define TRITON_INCLUDE_RUNTIME_LAUNCH_INFO_H - -#include - -namespace triton{ -namespace runtime{ - -struct launch_information{ - unsigned num_threads; - std::map globals; -}; - -} -} - -#endif From 6158d96ff776a568efa21efeeb4f4e61eaf21b4c Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 23 Aug 2019 18:08:05 -0700 Subject: [PATCH 320/494] [general] cleaned include guards and added #pragma once --- include/triton/driver/backend.h | 27 ++++----------------------- include/triton/driver/buffer.h | 26 +++----------------------- include/triton/driver/context.h | 26 +++----------------------- include/triton/driver/device.h | 26 +++----------------------- include/triton/driver/dispatch.h | 26 +++----------------------- include/triton/driver/error.h | 26 +++----------------------- include/triton/driver/event.h | 26 +++----------------------- include/triton/driver/handle.h | 26 +++----------------------- include/triton/driver/kernel.h | 26 +++----------------------- include/triton/driver/module.h | 26 +++----------------------- include/triton/driver/platform.h | 26 +++----------------------- include/triton/driver/stream.h | 26 +++----------------------- include/triton/ir/basic_block.h | 6 ++++-- include/triton/ir/builder.h | 6 ++++-- include/triton/ir/cfg.h | 6 ++++-- include/triton/ir/constant.h | 6 ++++-- include/triton/ir/context.h | 6 ++++-- include/triton/ir/context_impl.h | 6 ++++-- include/triton/ir/enums.h | 6 ++++-- include/triton/ir/function.h | 6 ++++-- 
include/triton/ir/instructions.h | 6 ++++-- include/triton/ir/metadata.h | 6 ++++-- include/triton/ir/module.h | 6 ++++-- include/triton/ir/print.h | 5 +++-- include/triton/ir/type.h | 6 ++++-- include/triton/ir/value.h | 6 ++++-- include/triton/lang/ast.h | 2 ++ include/triton/lang/code_gen.h | 2 ++ include/triton/lang/cpp.h | 2 ++ include/triton/lang/encoding.h | 2 ++ include/triton/lang/error.h | 2 ++ include/triton/lang/evaluator.h | 2 ++ include/triton/lang/mem_pool.h | 2 ++ include/triton/lang/parser.h | 2 ++ include/triton/lang/scanner.h | 2 ++ include/triton/lang/scope.h | 2 ++ include/triton/lang/token.h | 2 ++ include/triton/lang/type.h | 2 ++ include/triton/lang/visitor.h | 2 ++ include/triton/runtime/arg.h | 6 ++++-- include/triton/runtime/function.h | 7 +++++-- include/triton/tools/bench.hpp | 6 ++++-- include/triton/tools/thread_pool.h | 6 ++++-- 43 files changed, 135 insertions(+), 312 deletions(-) diff --git a/include/triton/driver/backend.h b/include/triton/driver/backend.h index a91fa7c7a..ac48a7461 100755 --- a/include/triton/driver/backend.h +++ b/include/triton/driver/backend.h @@ -1,27 +1,8 @@ -/* Copyright 2015-2017 Philippe Tillet -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files -* (the "Software"), to deal in the Software without restriction, -* including without limitation the rights to use, copy, modify, merge, -* publish, distribute, sublicense, and/or sell copies of the Software, -* and to permit persons to whom the Software is furnished to do so, -* subject to the following conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ +#pragma once + +#ifndef _TRITON_DRIVER_BACKEND_H_ +#define _TRITON_DRIVER_BACKEND_H_ -#ifndef TDL_INCLUDE_DRIVER_BACKEND_H -#define TDL_INCLUDE_DRIVER_BACKEND_H #include #include diff --git a/include/triton/driver/buffer.h b/include/triton/driver/buffer.h index a0502f789..282f98bfb 100755 --- a/include/triton/driver/buffer.h +++ b/include/triton/driver/buffer.h @@ -1,27 +1,7 @@ -/* Copyright 2015-2017 Philippe Tillet -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files -* (the "Software"), to deal in the Software without restriction, -* including without limitation the rights to use, copy, modify, merge, -* publish, distribute, sublicense, and/or sell copies of the Software, -* and to permit persons to whom the Software is furnished to do so, -* subject to the following conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ +#pragma once -#ifndef TDL_INCLUDE_DRIVER_BUFFER_H -#define TDL_INCLUDE_DRIVER_BUFFER_H +#ifndef _TRITON_DRIVER_BUFFER_H_ +#define _TRITON_DRIVER_BUFFER_H_ #include "triton/driver/handle.h" #include "triton/driver/context.h" diff --git a/include/triton/driver/context.h b/include/triton/driver/context.h index 7a31e85a1..9e368972d 100755 --- a/include/triton/driver/context.h +++ b/include/triton/driver/context.h @@ -1,27 +1,7 @@ -/* Copyright 2015-2017 Philippe Tillet -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files -* (the "Software"), to deal in the Software without restriction, -* including without limitation the rights to use, copy, modify, merge, -* publish, distribute, sublicense, and/or sell copies of the Software, -* and to permit persons to whom the Software is furnished to do so, -* subject to the following conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ +#pragma once -#ifndef TDL_INCLUDE_DRIVER_CONTEXT_H -#define TDL_INCLUDE_DRIVER_CONTEXT_H +#ifndef _TRITON_DRIVER_CONTEXT_H_ +#define _TRITON_DRIVER_CONTEXT_H_ #include "triton/driver/device.h" #include "triton/driver/handle.h" diff --git a/include/triton/driver/device.h b/include/triton/driver/device.h index f4a786a31..df119a272 100755 --- a/include/triton/driver/device.h +++ b/include/triton/driver/device.h @@ -1,27 +1,7 @@ -/* Copyright 2015-2017 Philippe Tillet -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files -* (the "Software"), to deal in the Software without restriction, -* including without limitation the rights to use, copy, modify, merge, -* publish, distribute, sublicense, and/or sell copies of the Software, -* and to permit persons to whom the Software is furnished to do so, -* subject to the following conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ +#pragma once -#ifndef TDL_INCLUDE_DRIVER_DEVICE_H -#define TDL_INCLUDE_DRIVER_DEVICE_H +#ifndef _TRITON_DRIVER_DEVICE_H_ +#define _TRITON_DRIVER_DEVICE_H_ #include "triton/driver/platform.h" #include "triton/driver/handle.h" diff --git a/include/triton/driver/dispatch.h b/include/triton/driver/dispatch.h index 9803a163e..7f6fdf7e0 100755 --- a/include/triton/driver/dispatch.h +++ b/include/triton/driver/dispatch.h @@ -1,27 +1,7 @@ -/* Copyright 2015-2017 Philippe Tillet -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files -* (the "Software"), to deal in the Software without restriction, -* including without limitation the rights to use, copy, modify, merge, -* publish, distribute, sublicense, and/or sell copies of the Software, -* and to permit persons to whom the Software is furnished to do so, -* subject to the following conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ +#pragma once -#ifndef TDL_INCLUDE_DRIVER_DISPATCHER_H -#define TDL_INCLUDE_DRIVER_DISPATCHER_H +#ifndef _TRITON_DRIVER_DISPATCH_H_ +#define _TRITON_DRIVER_DISPATCH_H_ #include #include diff --git a/include/triton/driver/error.h b/include/triton/driver/error.h index dd695e8c8..5091faf3e 100755 --- a/include/triton/driver/error.h +++ b/include/triton/driver/error.h @@ -1,27 +1,7 @@ -/* Copyright 2015-2017 Philippe Tillet -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files -* (the "Software"), to deal in the Software without restriction, -* including without limitation the rights to use, copy, modify, merge, -* publish, distribute, sublicense, and/or sell copies of the Software, -* and to permit persons to whom the Software is furnished to do so, -* subject to the following conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ +#pragma once -#ifndef TDL_INCLUDE_DRIVER_ERROR_H -#define TDL_INCLUDE_DRIVER_ERROR_H +#ifndef _TRITON_DRIVER_ERROR_H_ +#define _TRITON_DRIVER_ERROR_H_ #include #include "triton/driver/dispatch.h" diff --git a/include/triton/driver/event.h b/include/triton/driver/event.h index 633f03d7d..7310d001f 100755 --- a/include/triton/driver/event.h +++ b/include/triton/driver/event.h @@ -1,27 +1,7 @@ -/* Copyright 2015-2017 Philippe Tillet -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files -* (the "Software"), to deal in the Software without restriction, -* including without limitation the rights to use, copy, modify, merge, -* publish, distribute, sublicense, and/or sell copies of the Software, -* and to permit persons to whom the Software is furnished to do so, -* subject to the following conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ +#pragma once -#ifndef TDL_INCLUDE_DRIVER_EVENT_H -#define TDL_INCLUDE_DRIVER_EVENT_H +#ifndef _TRITON_DRIVER_EVENT_H_ +#define _TRITON_DRIVER_EVENT_H_ #include "triton/driver/handle.h" diff --git a/include/triton/driver/handle.h b/include/triton/driver/handle.h index d3b6f151c..eac14dca2 100755 --- a/include/triton/driver/handle.h +++ b/include/triton/driver/handle.h @@ -1,27 +1,7 @@ -/* Copyright 2015-2017 Philippe Tillet -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files -* (the "Software"), to deal in the Software without restriction, -* including without limitation the rights to use, copy, modify, merge, -* publish, distribute, sublicense, and/or sell copies of the Software, -* and to permit persons to whom the Software is furnished to do so, -* subject to the following conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ +#pragma once -#ifndef TDL_INCLUDE_DRIVER_HANDLE_H -#define TDL_INCLUDE_DRIVER_HANDLE_H +#ifndef _TRITON_DRIVER_HANDLE_H_ +#define _TRITON_DRIVER_HANDLE_H_ #include #include diff --git a/include/triton/driver/kernel.h b/include/triton/driver/kernel.h index fafbcb0bd..b45755ee7 100755 --- a/include/triton/driver/kernel.h +++ b/include/triton/driver/kernel.h @@ -1,27 +1,7 @@ -/* Copyright 2015-2017 Philippe Tillet -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files -* (the "Software"), to deal in the Software without restriction, -* including without limitation the rights to use, copy, modify, merge, -* publish, distribute, sublicense, and/or sell copies of the Software, -* and to permit persons to whom the Software is furnished to do so, -* subject to the following conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ +#pragma once -#ifndef TDL_INCLUDE_DRIVER_KERNEL_H -#define TDL_INCLUDE_DRIVER_KERNEL_H +#ifndef _TRITON_DRIVER_KERNEL_H_ +#define _TRITON_DRIVER_KERNEL_H_ #include "triton/driver/module.h" #include "triton/driver/handle.h" diff --git a/include/triton/driver/module.h b/include/triton/driver/module.h index 7fe2d6f06..59fe6dcc0 100755 --- a/include/triton/driver/module.h +++ b/include/triton/driver/module.h @@ -1,27 +1,7 @@ -/* Copyright 2015-2017 Philippe Tillet -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files -* (the "Software"), to deal in the Software without restriction, -* including without limitation the rights to use, copy, modify, merge, -* publish, distribute, sublicense, and/or sell copies of the Software, -* and to permit persons to whom the Software is furnished to do so, -* subject to the following conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ +#pragma once -#ifndef TDL_INCLUDE_DRIVER_MODULE_H -#define TDL_INCLUDE_DRIVER_MODULE_H +#ifndef _TRITON_DRIVER_MODULE_H_ +#define _TRITON_DRIVER_MODULE_H_ #include #include "triton/driver/handle.h" diff --git a/include/triton/driver/platform.h b/include/triton/driver/platform.h index 45b5399c5..ff4e83b9d 100755 --- a/include/triton/driver/platform.h +++ b/include/triton/driver/platform.h @@ -1,27 +1,7 @@ -/* Copyright 2015-2017 Philippe Tillet -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files -* (the "Software"), to deal in the Software without restriction, -* including without limitation the rights to use, copy, modify, merge, -* publish, distribute, sublicense, and/or sell copies of the Software, -* and to permit persons to whom the Software is furnished to do so, -* subject to the following conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ +#pragma once -#ifndef TDL_INCLUDE_DRIVER_PLATFORM_H -#define TDL_INCLUDE_DRIVER_PLATFORM_H +#ifndef _TRITON_DRIVER_PLATFORM_H_ +#define _TRITON_DRIVER_PLATFORM_H_ #include #include diff --git a/include/triton/driver/stream.h b/include/triton/driver/stream.h index 76d72af39..4b80b62af 100755 --- a/include/triton/driver/stream.h +++ b/include/triton/driver/stream.h @@ -1,27 +1,7 @@ -/* Copyright 2015-2017 Philippe Tillet -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files -* (the "Software"), to deal in the Software without restriction, -* including without limitation the rights to use, copy, modify, merge, -* publish, distribute, sublicense, and/or sell copies of the Software, -* and to permit persons to whom the Software is furnished to do so, -* subject to the following conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-*/ +#pragma once -#ifndef TDL_INCLUDE_DRIVER_STREAM_H -#define TDL_INCLUDE_DRIVER_STREAM_H +#ifndef _TRITON_DRIVER_STREAM_H_ +#define _TRITON_DRIVER_STREAM_H_ #include #include "triton/driver/context.h" diff --git a/include/triton/ir/basic_block.h b/include/triton/ir/basic_block.h index 09eb3ad64..4a60586f0 100644 --- a/include/triton/ir/basic_block.h +++ b/include/triton/ir/basic_block.h @@ -1,5 +1,7 @@ -#ifndef TDL_INCLUDE_IR_BASIC_BLOCK_H -#define TDL_INCLUDE_IR_BASIC_BLOCK_H +#pragma once + +#ifndef _TRITON_IR_BASIC_BLOCK_H_ +#define _TRITON_IR_BASIC_BLOCK_H_ #include #include diff --git a/include/triton/ir/builder.h b/include/triton/ir/builder.h index 3140565cc..0b6c859b1 100644 --- a/include/triton/ir/builder.h +++ b/include/triton/ir/builder.h @@ -1,5 +1,7 @@ -#ifndef TDL_INCLUDE_IR_BUILDER_H -#define TDL_INCLUDE_IR_BUILDER_H +#pragma once + +#ifndef _TRITON_IR_BUILDER_H_ +#define _TRITON_IR_BUILDER_H_ #include #include diff --git a/include/triton/ir/cfg.h b/include/triton/ir/cfg.h index 8a00a32ef..a61ff6dee 100644 --- a/include/triton/ir/cfg.h +++ b/include/triton/ir/cfg.h @@ -1,5 +1,7 @@ -#ifndef TDL_INCLUDE_IR_CFG_H -#define TDL_INCLUDE_IR_CFG_H +#pragma once + +#ifndef _TRITON_IR_CFG_H_ +#define _TRITON_IR_CFG_H_ #include diff --git a/include/triton/ir/constant.h b/include/triton/ir/constant.h index b33c9d427..6e177a47f 100644 --- a/include/triton/ir/constant.h +++ b/include/triton/ir/constant.h @@ -1,5 +1,7 @@ -#ifndef TDL_INCLUDE_IR_CONSTANT_H -#define TDL_INCLUDE_IR_CONSTANT_H +#pragma once + +#ifndef _TRITON_IR_CONSTANT_H_ +#define _TRITON_IR_CONSTANT_H_ #include "enums.h" #include "value.h" diff --git a/include/triton/ir/context.h b/include/triton/ir/context.h index 1433d741d..83627e869 100644 --- a/include/triton/ir/context.h +++ b/include/triton/ir/context.h @@ -1,5 +1,7 @@ -#ifndef TDL_INCLUDE_IR_CONTEXT_H -#define TDL_INCLUDE_IR_CONTEXT_H +#pragma once + +#ifndef _TRITON_IR_CONTEXT_H_ +#define _TRITON_IR_CONTEXT_H_ #include #include "triton/ir/type.h" diff --git a/include/triton/ir/context_impl.h b/include/triton/ir/context_impl.h index 0ca515f45..df26796c6 100644 --- a/include/triton/ir/context_impl.h +++ b/include/triton/ir/context_impl.h @@ -1,5 +1,7 @@ -#ifndef TDL_INCLUDE_IR_CONTEXT_IMPL_H -#define TDL_INCLUDE_IR_CONTEXT_IMPL_H +#pragma once + +#ifndef _TRITON_IR_CONTEXT_IMPL_H_ +#define _TRITON_IR_CONTEXT_IMPL_H_ #include #include "triton/ir/type.h" diff --git a/include/triton/ir/enums.h b/include/triton/ir/enums.h index 600c83ade..609fb2d46 100644 --- a/include/triton/ir/enums.h +++ b/include/triton/ir/enums.h @@ -1,5 +1,7 @@ -#ifndef TRITON_IR_ENUMS_H -#define TRITON_IR_ENUMS_H +#pragma once + +#ifndef _TRITON_IR_ENUMS_H_ +#define _TRITON_IR_ENUMS_H_ namespace triton{ namespace ir{ diff --git a/include/triton/ir/function.h b/include/triton/ir/function.h index 9cfc89931..4a7c308eb 100644 --- a/include/triton/ir/function.h +++ b/include/triton/ir/function.h @@ -1,5 +1,7 @@ -#ifndef TDL_INCLUDE_IR_FUNCTION_H -#define TDL_INCLUDE_IR_FUNCTION_H +#pragma once + +#ifndef _TRITON_IR_FUNCTION_H_ +#define _TRITON_IR_FUNCTION_H_ #include #include diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index 446dd871b..5c6af5362 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -1,5 +1,7 @@ -#ifndef TDL_INCLUDE_IR_INSTRUCTIONS_H -#define TDL_INCLUDE_IR_INSTRUCTIONS_H +#pragma once + +#ifndef _TRITON_IR_INSTRUCTIONS_H_ +#define _TRITON_IR_INSTRUCTIONS_H_ #include #include diff --git 
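
The header change repeated above keeps both mechanisms side by side: a classic include guard for strict portability, plus #pragma once so compilers that support it can skip reopening the file. The resulting skeleton, with a hypothetical header name standing in for the real ones:

#pragma once

#ifndef _TRITON_EXAMPLE_H_   // hypothetical guard, same pattern as above
#define _TRITON_EXAMPLE_H_

// ... declarations ...

#endif // _TRITON_EXAMPLE_H_
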
a/include/triton/ir/metadata.h b/include/triton/ir/metadata.h index 618e84cb2..e595da36c 100644 --- a/include/triton/ir/metadata.h +++ b/include/triton/ir/metadata.h @@ -1,5 +1,7 @@ -#ifndef TDL_INCLUDE_IR_METADATA_H -#define TDL_INCLUDE_IR_METADATA_H +#pragma once + +#ifndef _TRITON_IR_METADATA_H_ +#define _TRITON_IR_METADATA_H_ namespace triton{ namespace ir{ diff --git a/include/triton/ir/module.h b/include/triton/ir/module.h index f91269b38..d85140d39 100644 --- a/include/triton/ir/module.h +++ b/include/triton/ir/module.h @@ -1,5 +1,7 @@ -#ifndef TDL_INCLUDE_IR_MODULE_H -#define TDL_INCLUDE_IR_MODULE_H +#pragma once + +#ifndef _TRITON_IR_MODULE_H_ +#define _TRITON_IR_MODULE_H_ #include #include diff --git a/include/triton/ir/print.h b/include/triton/ir/print.h index c5a034ea3..471948d5f 100644 --- a/include/triton/ir/print.h +++ b/include/triton/ir/print.h @@ -1,6 +1,7 @@ -#ifndef TDL_INCLUDE_IR_PRINT_H -#define TDL_INCLUDE_IR_PRINT_H +#pragma once +#ifndef _TRITON_IR_PRINT_H_ +#define _TRITON_IR_PRINT_H_ #include "builder.h" diff --git a/include/triton/ir/type.h b/include/triton/ir/type.h index 786a196dc..60d2d9691 100644 --- a/include/triton/ir/type.h +++ b/include/triton/ir/type.h @@ -1,5 +1,7 @@ -#ifndef TDL_INCLUDE_IR_TYPE_H -#define TDL_INCLUDE_IR_TYPE_H +#pragma once + +#ifndef _TRITON_IR_TYPE_H_ +#define _TRITON_IR_TYPE_H_ #include diff --git a/include/triton/ir/value.h b/include/triton/ir/value.h index 284a0a3b3..0c2727a38 100644 --- a/include/triton/ir/value.h +++ b/include/triton/ir/value.h @@ -1,5 +1,7 @@ -#ifndef TDL_INCLUDE_IR_VALUE_H -#define TDL_INCLUDE_IR_VALUE_H +#pragma once + +#ifndef _TRITON_IR_VALUE_H_ +#define _TRITON_IR_VALUE_H_ #include #include diff --git a/include/triton/lang/ast.h b/include/triton/lang/ast.h index d1c8f8690..6e10219e4 100644 --- a/include/triton/lang/ast.h +++ b/include/triton/lang/ast.h @@ -1,3 +1,5 @@ +#pragma once + #ifndef _WGTCC_AST_H_ #define _WGTCC_AST_H_ diff --git a/include/triton/lang/code_gen.h b/include/triton/lang/code_gen.h index caf4067db..794423536 100644 --- a/include/triton/lang/code_gen.h +++ b/include/triton/lang/code_gen.h @@ -1,3 +1,5 @@ +#pragma once + #ifndef _WGTCC_CODE_GEN_H_ #define _WGTCC_CODE_GEN_H_ diff --git a/include/triton/lang/cpp.h b/include/triton/lang/cpp.h index 5f7a296c1..cfd839dd7 100644 --- a/include/triton/lang/cpp.h +++ b/include/triton/lang/cpp.h @@ -1,3 +1,5 @@ +#pragma once + #ifndef _WGTCC_CPP_H_ #define _WGTCC_CPP_H_ diff --git a/include/triton/lang/encoding.h b/include/triton/lang/encoding.h index 9d6c1e544..297b2b732 100644 --- a/include/triton/lang/encoding.h +++ b/include/triton/lang/encoding.h @@ -1,3 +1,5 @@ +#pragma once + #ifndef _WGTCC_ENCODING_H_ #define _WGTCC_ENCODING_H_ diff --git a/include/triton/lang/error.h b/include/triton/lang/error.h index fdae7e060..386ca3a3e 100644 --- a/include/triton/lang/error.h +++ b/include/triton/lang/error.h @@ -1,3 +1,5 @@ +#pragma once + #ifndef _WGTCC_ERROR_H_ #define _WGTCC_ERROR_H_ diff --git a/include/triton/lang/evaluator.h b/include/triton/lang/evaluator.h index 6269e66d8..589739b45 100644 --- a/include/triton/lang/evaluator.h +++ b/include/triton/lang/evaluator.h @@ -1,3 +1,5 @@ +#pragma once + #ifndef _WGTCC_EVALUATOR_H_ #define _WGTCC_EVALUATOR_H_ diff --git a/include/triton/lang/mem_pool.h b/include/triton/lang/mem_pool.h index 217237784..9b6ab53c1 100644 --- a/include/triton/lang/mem_pool.h +++ b/include/triton/lang/mem_pool.h @@ -1,3 +1,5 @@ +#pragma once + #ifndef _WGTCC_MEM_POOL_H_ #define _WGTCC_MEM_POOL_H_ diff --git 
a/include/triton/lang/parser.h b/include/triton/lang/parser.h index bc43d9daf..63e312026 100644 --- a/include/triton/lang/parser.h +++ b/include/triton/lang/parser.h @@ -1,3 +1,5 @@ +#pragma once + #ifndef _PARSER_H_ #define _PARSER_H_ diff --git a/include/triton/lang/scanner.h b/include/triton/lang/scanner.h index aee010638..57cdff9a0 100644 --- a/include/triton/lang/scanner.h +++ b/include/triton/lang/scanner.h @@ -1,3 +1,5 @@ +#pragma once + #ifndef _WGTCC_SCANNER_H_ #define _WGTCC_SCANNER_H_ diff --git a/include/triton/lang/scope.h b/include/triton/lang/scope.h index eea115bfb..b958d3ecd 100644 --- a/include/triton/lang/scope.h +++ b/include/triton/lang/scope.h @@ -1,3 +1,5 @@ +#pragma once + #ifndef _WGTCC_SCOPE_H_ #define _WGTCC_SCOPE_H_ diff --git a/include/triton/lang/token.h b/include/triton/lang/token.h index e982ec803..a920e082c 100644 --- a/include/triton/lang/token.h +++ b/include/triton/lang/token.h @@ -1,3 +1,5 @@ +#pragma once + #ifndef _WGTCC_TOKEN_H_ #define _WGTCC_TOKEN_H_ diff --git a/include/triton/lang/type.h b/include/triton/lang/type.h index 1cb10777b..08b8418f3 100644 --- a/include/triton/lang/type.h +++ b/include/triton/lang/type.h @@ -1,3 +1,5 @@ +#pragma once + #ifndef _WGTCC_TYPE_H_ #define _WGTCC_TYPE_H_ diff --git a/include/triton/lang/visitor.h b/include/triton/lang/visitor.h index e761e9c0f..16398f57b 100644 --- a/include/triton/lang/visitor.h +++ b/include/triton/lang/visitor.h @@ -1,3 +1,5 @@ +#pragma once + #ifndef _WGTCC_VISITOR_H_ #define _WGTCC_VISITOR_H_ diff --git a/include/triton/runtime/arg.h b/include/triton/runtime/arg.h index af55f4014..6e255f0e7 100644 --- a/include/triton/runtime/arg.h +++ b/include/triton/runtime/arg.h @@ -1,5 +1,7 @@ -#ifndef TDL_INCLUDE_ARG_H -#define TDL_INCLUDE_ARG_H +#pragma once + +#ifndef _TRITON_RUNTIME_ARG_H_ +#define _TRITON_RUNTIME_ARG_H_ #include #include diff --git a/include/triton/runtime/function.h b/include/triton/runtime/function.h index 3f7058a9f..788def32a 100644 --- a/include/triton/runtime/function.h +++ b/include/triton/runtime/function.h @@ -1,5 +1,8 @@ -#ifndef TDL_INCLUDE_FUNCTION_H -#define TDL_INCLUDE_FUNCTION_H +#pragma once + +#ifndef _TRITON_RUNTIME_FUNCTION_H_ +#define _TRITON_RUNTIME_FUNCTION_H_ + #include #include diff --git a/include/triton/tools/bench.hpp b/include/triton/tools/bench.hpp index b2492c31f..99a04125e 100644 --- a/include/triton/tools/bench.hpp +++ b/include/triton/tools/bench.hpp @@ -1,5 +1,7 @@ -#ifndef TRITON_TOOLS_BENCH_HPP -#define TRITON_TOOLS_BENCH_HPP +#pragma once + +#ifndef _TRITON_TOOLS_BENCH_H_ +#define _TRITON_TOOLS_BENCH_H_ #include #include diff --git a/include/triton/tools/thread_pool.h b/include/triton/tools/thread_pool.h index 0475bdb24..143ef30ab 100644 --- a/include/triton/tools/thread_pool.h +++ b/include/triton/tools/thread_pool.h @@ -1,5 +1,7 @@ -#ifndef THREAD_POOL_H -#define THREAD_POOL_H +#pragma once + +#ifndef _TRITON_TOOLS_THREAD_POOL_H_ +#define _TRITON_TOOLS_THREAD_POOL_H_ #include #include From 732156b942fb778653d46e93e389ed4e3275892a Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 23 Aug 2019 19:06:39 -0700 Subject: [PATCH 321/494] [general] rename *.cpp -> *.cc --- examples/cpp/CMakeLists.txt | 2 +- examples/cpp/{dot.cpp => dot.cc} | 0 include/triton/runtime/function.h | 3 +-- .../analysis/{alignment.cpp => alignment.cc} | 0 .../shmem/{allocation.cpp => allocation.cc} | 0 .../analysis/shmem/{info.cpp => info.cc} | 0 .../shmem/{liveness.cpp => liveness.cc} | 0 lib/codegen/analysis/{tune.cpp => tune.cc} | 0 .../selection/{selection.cpp 
=> selection.cc} | 0 lib/codegen/selection/{target.cpp => target.cc} | 0 lib/codegen/transform/{dce.cpp => dce.cc} | 0 .../transform/{peephole.cpp => peephole.cc} | 0 .../{reassociate.cpp => reassociate.cc} | 0 .../shmem/{barriers.cpp => barriers.cc} | 0 .../transform/{vectorize.cpp => vectorize.cc} | 0 lib/driver/{backend.cpp => backend.cc} | 0 lib/driver/{buffer.cpp => buffer.cc} | 0 lib/driver/{context.cpp => context.cc} | 0 lib/driver/{device.cpp => device.cc} | 0 lib/driver/{dispatch.cpp => dispatch.cc} | 0 lib/driver/{error.cpp => error.cc} | 0 lib/driver/{event.cpp => event.cc} | 0 lib/driver/{handle.cpp => handle.cc} | 0 lib/driver/{kernel.cpp => kernel.cc} | 0 lib/driver/{module.cpp => module.cc} | 17 +---------------- lib/driver/{platform.cpp => platform.cc} | 0 lib/driver/{stream.cpp => stream.cc} | 0 lib/ir/{basic_block.cpp => basic_block.cc} | 0 lib/ir/{builder.cpp => builder.cc} | 0 lib/ir/{cfg.cpp => cfg.cc} | 0 lib/ir/{constant.cpp => constant.cc} | 0 lib/ir/{context.cpp => context.cc} | 0 lib/ir/{function.cpp => function.cc} | 0 lib/ir/{instructions.cpp => instructions.cc} | 0 lib/ir/{metadata.cpp => metadata.cc} | 0 lib/ir/{module.cpp => module.cc} | 0 lib/ir/{print.cpp => print.cc} | 0 lib/ir/{type.cpp => type.cc} | 0 lib/ir/{value.cpp => value.cc} | 0 lib/runtime/{arg.cpp => arg.cc} | 0 lib/runtime/{function.cpp => function.cc} | 0 python/examples/lol.cc | 7 +++++++ python/src/{tensorflow.cpp => tensorflow.cc} | 0 .../{alloc_empty.cpp => alloc_empty.cc} | 0 44 files changed, 10 insertions(+), 19 deletions(-) rename examples/cpp/{dot.cpp => dot.cc} (100%) rename lib/codegen/analysis/{alignment.cpp => alignment.cc} (100%) rename lib/codegen/analysis/shmem/{allocation.cpp => allocation.cc} (100%) rename lib/codegen/analysis/shmem/{info.cpp => info.cc} (100%) rename lib/codegen/analysis/shmem/{liveness.cpp => liveness.cc} (100%) rename lib/codegen/analysis/{tune.cpp => tune.cc} (100%) rename lib/codegen/selection/{selection.cpp => selection.cc} (100%) rename lib/codegen/selection/{target.cpp => target.cc} (100%) rename lib/codegen/transform/{dce.cpp => dce.cc} (100%) rename lib/codegen/transform/{peephole.cpp => peephole.cc} (100%) rename lib/codegen/transform/{reassociate.cpp => reassociate.cc} (100%) rename lib/codegen/transform/shmem/{barriers.cpp => barriers.cc} (100%) rename lib/codegen/transform/{vectorize.cpp => vectorize.cc} (100%) rename lib/driver/{backend.cpp => backend.cc} (100%) rename lib/driver/{buffer.cpp => buffer.cc} (100%) rename lib/driver/{context.cpp => context.cc} (100%) rename lib/driver/{device.cpp => device.cc} (100%) rename lib/driver/{dispatch.cpp => dispatch.cc} (100%) rename lib/driver/{error.cpp => error.cc} (100%) rename lib/driver/{event.cpp => event.cc} (100%) rename lib/driver/{handle.cpp => handle.cc} (100%) rename lib/driver/{kernel.cpp => kernel.cc} (100%) rename lib/driver/{module.cpp => module.cc} (94%) rename lib/driver/{platform.cpp => platform.cc} (100%) rename lib/driver/{stream.cpp => stream.cc} (100%) rename lib/ir/{basic_block.cpp => basic_block.cc} (100%) rename lib/ir/{builder.cpp => builder.cc} (100%) rename lib/ir/{cfg.cpp => cfg.cc} (100%) rename lib/ir/{constant.cpp => constant.cc} (100%) rename lib/ir/{context.cpp => context.cc} (100%) rename lib/ir/{function.cpp => function.cc} (100%) rename lib/ir/{instructions.cpp => instructions.cc} (100%) rename lib/ir/{metadata.cpp => metadata.cc} (100%) rename lib/ir/{module.cpp => module.cc} (100%) rename lib/ir/{print.cpp => print.cc} (100%) rename lib/ir/{type.cpp => type.cc} 
(100%) rename lib/ir/{value.cpp => value.cc} (100%) rename lib/runtime/{arg.cpp => arg.cc} (100%) rename lib/runtime/{function.cpp => function.cc} (100%) create mode 100644 python/examples/lol.cc rename python/src/{tensorflow.cpp => tensorflow.cc} (100%) rename python/src/tensorflow/{alloc_empty.cpp => alloc_empty.cc} (100%) diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index f5f6a40b8..cea728c8e 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -1,5 +1,5 @@ foreach(PROG dot) - add_executable(${PROG} ${PROG}.cpp) + add_executable(${PROG} ${PROG}.cc) set_target_properties(${PROG} PROPERTIES OUTPUT_NAME ${PROG}) include_directories(/usr/local/cuda/include/) target_link_libraries(${PROG} triton cublas) diff --git a/examples/cpp/dot.cpp b/examples/cpp/dot.cc similarity index 100% rename from examples/cpp/dot.cpp rename to examples/cpp/dot.cc diff --git a/include/triton/runtime/function.h b/include/triton/runtime/function.h index 788def32a..c3f4d53ff 100644 --- a/include/triton/runtime/function.h +++ b/include/triton/runtime/function.h @@ -4,12 +4,10 @@ #define _TRITON_RUNTIME_FUNCTION_H_ -#include #include #include #include #include -#include "arg.h" // codegen #include "triton/codegen/selection/selection.h" #include "triton/codegen/selection/target.h" @@ -24,6 +22,7 @@ #include "triton/codegen/transform/reassociate.h" #include "triton/codegen/transform/vectorize.h" #include "triton/lang/parser.h" +#include "triton/runtime/arg.h" namespace llvm { class Module; diff --git a/lib/codegen/analysis/alignment.cpp b/lib/codegen/analysis/alignment.cc similarity index 100% rename from lib/codegen/analysis/alignment.cpp rename to lib/codegen/analysis/alignment.cc diff --git a/lib/codegen/analysis/shmem/allocation.cpp b/lib/codegen/analysis/shmem/allocation.cc similarity index 100% rename from lib/codegen/analysis/shmem/allocation.cpp rename to lib/codegen/analysis/shmem/allocation.cc diff --git a/lib/codegen/analysis/shmem/info.cpp b/lib/codegen/analysis/shmem/info.cc similarity index 100% rename from lib/codegen/analysis/shmem/info.cpp rename to lib/codegen/analysis/shmem/info.cc diff --git a/lib/codegen/analysis/shmem/liveness.cpp b/lib/codegen/analysis/shmem/liveness.cc similarity index 100% rename from lib/codegen/analysis/shmem/liveness.cpp rename to lib/codegen/analysis/shmem/liveness.cc diff --git a/lib/codegen/analysis/tune.cpp b/lib/codegen/analysis/tune.cc similarity index 100% rename from lib/codegen/analysis/tune.cpp rename to lib/codegen/analysis/tune.cc diff --git a/lib/codegen/selection/selection.cpp b/lib/codegen/selection/selection.cc similarity index 100% rename from lib/codegen/selection/selection.cpp rename to lib/codegen/selection/selection.cc diff --git a/lib/codegen/selection/target.cpp b/lib/codegen/selection/target.cc similarity index 100% rename from lib/codegen/selection/target.cpp rename to lib/codegen/selection/target.cc diff --git a/lib/codegen/transform/dce.cpp b/lib/codegen/transform/dce.cc similarity index 100% rename from lib/codegen/transform/dce.cpp rename to lib/codegen/transform/dce.cc diff --git a/lib/codegen/transform/peephole.cpp b/lib/codegen/transform/peephole.cc similarity index 100% rename from lib/codegen/transform/peephole.cpp rename to lib/codegen/transform/peephole.cc diff --git a/lib/codegen/transform/reassociate.cpp b/lib/codegen/transform/reassociate.cc similarity index 100% rename from lib/codegen/transform/reassociate.cpp rename to lib/codegen/transform/reassociate.cc diff --git 
a/lib/codegen/transform/shmem/barriers.cpp b/lib/codegen/transform/shmem/barriers.cc similarity index 100% rename from lib/codegen/transform/shmem/barriers.cpp rename to lib/codegen/transform/shmem/barriers.cc diff --git a/lib/codegen/transform/vectorize.cpp b/lib/codegen/transform/vectorize.cc similarity index 100% rename from lib/codegen/transform/vectorize.cpp rename to lib/codegen/transform/vectorize.cc diff --git a/lib/driver/backend.cpp b/lib/driver/backend.cc similarity index 100% rename from lib/driver/backend.cpp rename to lib/driver/backend.cc diff --git a/lib/driver/buffer.cpp b/lib/driver/buffer.cc similarity index 100% rename from lib/driver/buffer.cpp rename to lib/driver/buffer.cc diff --git a/lib/driver/context.cpp b/lib/driver/context.cc similarity index 100% rename from lib/driver/context.cpp rename to lib/driver/context.cc diff --git a/lib/driver/device.cpp b/lib/driver/device.cc similarity index 100% rename from lib/driver/device.cpp rename to lib/driver/device.cc diff --git a/lib/driver/dispatch.cpp b/lib/driver/dispatch.cc similarity index 100% rename from lib/driver/dispatch.cpp rename to lib/driver/dispatch.cc diff --git a/lib/driver/error.cpp b/lib/driver/error.cc similarity index 100% rename from lib/driver/error.cpp rename to lib/driver/error.cc diff --git a/lib/driver/event.cpp b/lib/driver/event.cc similarity index 100% rename from lib/driver/event.cpp rename to lib/driver/event.cc diff --git a/lib/driver/handle.cpp b/lib/driver/handle.cc similarity index 100% rename from lib/driver/handle.cpp rename to lib/driver/handle.cc diff --git a/lib/driver/kernel.cpp b/lib/driver/kernel.cc similarity index 100% rename from lib/driver/kernel.cpp rename to lib/driver/kernel.cc diff --git a/lib/driver/module.cpp b/lib/driver/module.cc similarity index 94% rename from lib/driver/module.cpp rename to lib/driver/module.cc index 6195a1249..3174eb031 100755 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cc @@ -19,38 +19,23 @@ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#include #include #include -#include #include "triton/driver/module.h" #include "triton/driver/context.h" #include "triton/driver/error.h" -#include "triton/tools/sys/getenv.hpp" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/IRPrintingPasses.h" -#include "llvm/Bitcode/BitcodeWriter.h" #include "llvm/IR/Verifier.h" #include "llvm/IR/Module.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/PassManager.h" #include "llvm/Support/SourceMgr.h" -#include "llvm/Linker/Linker.h" -#include "llvm/IRReader/IRReader.h" -#include "llvm/AsmParser/Parser.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/TargetSelect.h" -#include "llvm/Support/Host.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" -#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/LegacyPassManager.h" -#include "llvm/Transforms/Scalar/EarlyCSE.h" -#include "llvm/Analysis/LoopPass.h" #include "llvm/ExecutionEngine/ExecutionEngine.h" -#include "llvm/ExecutionEngine/OrcMCJITReplacement.h" -#include +#include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/Transforms/Utils/Cloning.h" namespace triton diff --git a/lib/driver/platform.cpp b/lib/driver/platform.cc similarity index 100% rename from lib/driver/platform.cpp rename to lib/driver/platform.cc diff --git a/lib/driver/stream.cpp b/lib/driver/stream.cc similarity index 100% rename from lib/driver/stream.cpp rename to lib/driver/stream.cc diff --git a/lib/ir/basic_block.cpp b/lib/ir/basic_block.cc similarity index 100% rename from lib/ir/basic_block.cpp rename to lib/ir/basic_block.cc diff --git a/lib/ir/builder.cpp b/lib/ir/builder.cc similarity index 100% rename from lib/ir/builder.cpp rename to lib/ir/builder.cc diff --git a/lib/ir/cfg.cpp b/lib/ir/cfg.cc similarity index 100% rename from lib/ir/cfg.cpp rename to lib/ir/cfg.cc diff --git a/lib/ir/constant.cpp b/lib/ir/constant.cc similarity index 100% rename from lib/ir/constant.cpp rename to lib/ir/constant.cc diff --git a/lib/ir/context.cpp b/lib/ir/context.cc similarity index 100% rename from lib/ir/context.cpp rename to lib/ir/context.cc diff --git a/lib/ir/function.cpp b/lib/ir/function.cc similarity index 100% rename from lib/ir/function.cpp rename to lib/ir/function.cc diff --git a/lib/ir/instructions.cpp b/lib/ir/instructions.cc similarity index 100% rename from lib/ir/instructions.cpp rename to lib/ir/instructions.cc diff --git a/lib/ir/metadata.cpp b/lib/ir/metadata.cc similarity index 100% rename from lib/ir/metadata.cpp rename to lib/ir/metadata.cc diff --git a/lib/ir/module.cpp b/lib/ir/module.cc similarity index 100% rename from lib/ir/module.cpp rename to lib/ir/module.cc diff --git a/lib/ir/print.cpp b/lib/ir/print.cc similarity index 100% rename from lib/ir/print.cpp rename to lib/ir/print.cc diff --git a/lib/ir/type.cpp b/lib/ir/type.cc similarity index 100% rename from lib/ir/type.cpp rename to lib/ir/type.cc diff --git a/lib/ir/value.cpp b/lib/ir/value.cc similarity index 100% rename from lib/ir/value.cpp rename to lib/ir/value.cc diff --git a/lib/runtime/arg.cpp b/lib/runtime/arg.cc similarity index 100% rename from lib/runtime/arg.cpp rename to lib/runtime/arg.cc diff --git a/lib/runtime/function.cpp b/lib/runtime/function.cc similarity index 100% rename from lib/runtime/function.cpp rename to lib/runtime/function.cc diff --git a/python/examples/lol.cc b/python/examples/lol.cc new file mode 100644 index 000000000..c847e46a5 --- /dev/null +++ b/python/examples/lol.cc @@ -0,0 +1,7 @@ +#include + 
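+// note: DTYPE is not defined anywhere in this file; it is presumably meant to be
+// injected at compile time (e.g. via a -DDTYPE="..." flag), so as written this
+// scratch example only builds when such a definition is supplied externally.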
+int main(){ + const char* TEST = "test\n"; + const char* LOL = "lol\n"; + printf("%s\n",DTYPE); +} diff --git a/python/src/tensorflow.cpp b/python/src/tensorflow.cc similarity index 100% rename from python/src/tensorflow.cpp rename to python/src/tensorflow.cc diff --git a/python/src/tensorflow/alloc_empty.cpp b/python/src/tensorflow/alloc_empty.cc similarity index 100% rename from python/src/tensorflow/alloc_empty.cpp rename to python/src/tensorflow/alloc_empty.cc From cb04ec0b3bd6ef10525234075606d8db9522118e Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 23 Aug 2019 19:22:38 -0700 Subject: [PATCH 322/494] some more cleaning --- CMakeLists.txt | 4 ++-- lib/driver/module.cc | 47 ++++++++++++++++++++++---------------------- lib/lang/code_gen.cc | 4 +--- 3 files changed, 27 insertions(+), 28 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b57c0859d..c3c57def7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,7 +11,7 @@ option(BUILD_PYTHON_MODULE "Build Python Triton bindings" OFF) find_package(LLVM REQUIRED CONFIG) include_directories(${LLVM_INCLUDE_DIRS}) add_definitions(${LLVM_DEFINITIONS}) -#llvm_map_components_to_libnames(llvm_libs all) +llvm_map_components_to_libnames(llvm_libs all) # Default build type if(NOT CMAKE_BUILD_TYPE) @@ -51,7 +51,7 @@ endif() # Triton file(GLOB_RECURSE LIBTRITON_SRC lib/*.cpp lib/*.cc) -add_library(triton SHARED ${LIBTRITON_SRC} ${EIGHTCC_SRC} ${PYTHON_SRC}) +add_library(triton SHARED ${LIBTRITON_SRC} ${PYTHON_SRC}) target_link_libraries(triton LLVM) # Warning level diff --git a/lib/driver/module.cc b/lib/driver/module.cc index 3174eb031..486d7d588 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -185,29 +185,30 @@ host_module::host_module(driver::context * context, llvm::Module* src): module(c /* ------------------------ */ ocl_module::ocl_module(driver::context * context, llvm::Module* src): module(context, cl_program(), true) { - init_llvm(); - llvm::SmallVector buffer; - module::compile_llvm_module(src, "amdgcn-amd-amdhsa-amdgizcl", "gfx902", "", buffer, "code-object-v3", Object); - std::ofstream output("/tmp/tmp.o", std::ios::binary); - std::copy(buffer.begin(), buffer.end(), std::ostreambuf_iterator(output)); - system("ld.lld-8 /tmp/tmp.o -shared -o /tmp/tmp.o"); - std::ifstream input("/tmp/tmp.o", std::ios::in | std::ios::binary ); - std::vector in_buffer(std::istreambuf_iterator(input), {}); - size_t sizes[] = {in_buffer.size()}; - const unsigned char* data[] = {(unsigned char*)in_buffer.data()}; - cl_int status; - cl_int err; - *cl_ = dispatch::clCreateProgramWithBinary(*context->cl(), 1, &*context->device()->cl(), sizes, data, &status, &err); - check(status); - check(err); - try{ - dispatch::clBuildProgram(*cl_, 1, &*context->device()->cl(), NULL, NULL, NULL); - } - catch(...){ - char log[2048]; - dispatch::clGetProgramBuildInfo(*cl_, *context->device()->cl(), CL_PROGRAM_BUILD_LOG, 1024, log, NULL); - throw; - } + throw std::runtime_error("not supported"); +// init_llvm(); +// llvm::SmallVector buffer; +// module::compile_llvm_module(src, "amdgcn-amd-amdhsa-amdgizcl", "gfx902", "", buffer, "code-object-v3", Object); +// std::ofstream output("/tmp/tmp.o", std::ios::binary); +// std::copy(buffer.begin(), buffer.end(), std::ostreambuf_iterator(output)); +// system("ld.lld-8 /tmp/tmp.o -shared -o /tmp/tmp.o"); +// std::ifstream input("/tmp/tmp.o", std::ios::in | std::ios::binary ); +// std::vector in_buffer(std::istreambuf_iterator(input), {}); +// size_t sizes[] = {in_buffer.size()}; +// const unsigned 
char* data[] = {(unsigned char*)in_buffer.data()}; +// cl_int status; +// cl_int err; +// *cl_ = dispatch::clCreateProgramWithBinary(*context->cl(), 1, &*context->device()->cl(), sizes, data, &status, &err); +// check(status); +// check(err); +// try{ +// dispatch::clBuildProgram(*cl_, 1, &*context->device()->cl(), NULL, NULL, NULL); +// } +// catch(...){ +// char log[2048]; +// dispatch::clGetProgramBuildInfo(*cl_, *context->device()->cl(), CL_PROGRAM_BUILD_LOG, 1024, log, NULL); +// throw; +// } } diff --git a/lib/lang/code_gen.cc b/lib/lang/code_gen.cc index 17b24d80d..83595d867 100644 --- a/lib/lang/code_gen.cc +++ b/lib/lang/code_gen.cc @@ -333,10 +333,8 @@ void Generator::VisitLabelStmt(LabelStmt* labelStmt) { } void Generator::VisitCompoundStmt(CompoundStmt* compoundStmt) { - if (compoundStmt->scope_){ -// AllocObjects(compoundStmt->scope_); + if (compoundStmt->scope_) pushScope(); - } for (auto stmt: compoundStmt->stmts_) Visit(stmt); if(compoundStmt->scope_) From 8c6bac49d1f9be18cad8a1d13c690b6294d38e72 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 23 Aug 2019 19:49:06 -0700 Subject: [PATCH 323/494] [lang][codegen] added basic attribute support --- CMakeLists.txt | 4 ---- include/triton/lang/ast.h | 4 ++-- include/triton/lang/code_gen.h | 6 +++++- lib/lang/code_gen.cc | 24 ++++++++++++++++++++++++ 4 files changed, 31 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c3c57def7..84e16ddf9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,10 +19,6 @@ if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "Release") endif() -# Gather headers for cmake-based IDEs -file( GLOB_RECURSE ALL_SRC *.cpp *.hpp *.h *.py CMakeLists*) -add_custom_target( ALL SOURCES ${ALL_SRC} ) - # Compiler flags include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${LLVM_CXXFLAGS} -std=c++11") diff --git a/include/triton/lang/ast.h b/include/triton/lang/ast.h index 6e10219e4..d293a0ae5 100644 --- a/include/triton/lang/ast.h +++ b/include/triton/lang/ast.h @@ -667,7 +667,7 @@ public: void SetOffset(int offset) { offset_ = offset; } Declaration* Decl() { return decl_; } void SetDecl(Declaration* decl) { decl_ = decl; } - + const AttrList& GetAttrList() const { return attrList_; } unsigned char BitFieldBegin() const { return bitFieldBegin_; } unsigned char BitFieldEnd() const { return bitFieldBegin_ + bitFieldWidth_; } unsigned char BitFieldWidth() const { return bitFieldWidth_; } @@ -723,7 +723,7 @@ private: bool anonymous_; long id_ {0}; - ASTNode::AttrList attrList_; + AttrList attrList_; }; diff --git a/include/triton/lang/code_gen.h b/include/triton/lang/code_gen.h index 794423536..69a1a7514 100644 --- a/include/triton/lang/code_gen.h +++ b/include/triton/lang/code_gen.h @@ -15,6 +15,7 @@ class module; class type; class context; class builder; +class attribute; } } @@ -32,7 +33,7 @@ using LocationList = std::vector; using StaticInitList = std::vector; // Error -inline void should_not_happen() { assert(false); } +inline void should_not_happen() { throw std::runtime_error("should not happen"); } inline void error_not_implemented() { throw std::runtime_error("not implemented"); } class Generator: public Visitor { @@ -81,6 +82,9 @@ public: void Gen(ir::module *mod); protected: + // Triton-IR attributes + ir::attribute GenIRAttr(ASTNode::Attr attr); + // Triton-IR values ir::value* GenAssignOp(Expr* lvalue, ir::value* rhs); ir::value* GenBroadcastOp(ir::value* src, ir::type* dst_ty); diff --git a/lib/lang/code_gen.cc 
b/lib/lang/code_gen.cc index 83595d867..f7dd021ed 100644 --- a/lib/lang/code_gen.cc +++ b/lib/lang/code_gen.cc @@ -354,6 +354,8 @@ void Generator::VisitFuncDef(FuncDef* funcDef) { for(Object* obj: type->Params()){ std::string name = obj->Name(); args[i]->set_name(name); + for(ASTNode::Attr attr: obj->GetAttrList()) + fn->add_attr(i, GenIRAttr(attr)); mod_->set_value(name, nullptr, args[i]); mod_->get_scope().types[name] = args[i]->get_type(); i++; @@ -436,6 +438,28 @@ ir::value* Generator::GenCastOp(ir::value* src, ir::type* dst_ty) { return GenNumcastOp(GenBroadcastOp(src, dst_ty), dst_ty); } +// Triton-IR Attr +ir::attribute Generator::GenIRAttr(ASTNode::Attr attr) { + if(attr.name == "multiple_of") { + VisitExpr(attr.vals[0]); + auto cst = dynamic_cast(ret_); + if(!cst) should_not_happen(); + return ir::attribute(ir::multiple_of, cst->get_value()); + } + if(attr.name == "aligned") { + VisitExpr(attr.vals[0]); + auto cst = dynamic_cast(ret_); + return ir::attribute(ir::aligned, cst->get_value()); + } + if(attr.name == "noalias") + return ir::attribute(ir::noalias); + if(attr.name == "readonly") + return ir::attribute(ir::readonly); + if(attr.name == "writeonly") + return ir::attribute(ir::writeonly); + should_not_happen(); +} + // Triton-IR Types ir::type* Generator::GenIRType(::Type* type, ir::context& ctx) { if(auto T = type->ToVoid()) From 44eb3891ae111fc8401d2a6717b0403cfc5a0c34 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 23 Aug 2019 20:29:12 -0700 Subject: [PATCH 324/494] [lang] added support for restrict; added macros for attributes --- examples/cpp/dot.cc | 17 ++++++++++++----- include/triton/lang/ast.h | 11 ++++++++++- lib/lang/code_gen.cc | 12 +++++++----- lib/lang/parser.cc | 19 +++++++++++++++++-- 4 files changed, 46 insertions(+), 13 deletions(-) diff --git a/examples/cpp/dot.cc b/examples/cpp/dot.cc index 3122ead6f..7ca9a1937 100644 --- a/examples/cpp/dot.cc +++ b/examples/cpp/dot.cc @@ -82,14 +82,21 @@ R"( #define true 1 #define false 0 #define __bool_true_false_are_defined 1 + +#define __readonly __attribute__((readonly)) +#define __writeonly __attribute__((writeonly)) +#define __noalias __attribute__((noalias)) +#define __aligned(A) __attribute__((aligned(A))) +#define __multipleof(A) __attribute__((multipleof(A))) + extern int get_program_id(int); -void matmul(restrict )" + a_ty + R"( * A __attribute__((readonly, aligned(16))), - restrict )" + b_ty + R"( * B __attribute__((readonly, aligned(16))), - restrict )" + c_ty + R"( * C __attribute__((aligned(16))), +void matmul()" + a_ty + R"( * A __noalias __readonly __aligned(16), + )" + b_ty + R"( * B __noalias __readonly __aligned(16), + )" + c_ty + R"( * C __noalias __readonly __aligned(16), int M, int N, int K, - int lda __attribute__((multiple_of(8))), - int ldb __attribute__((multiple_of(8))), + int lda __multipleof(8), + int ldb __multipleof(8), int ldc) { int ridx = get_program_id(0); int ridy = get_program_id(1); diff --git a/include/triton/lang/ast.h b/include/triton/lang/ast.h index d293a0ae5..710c67e4a 100644 --- a/include/triton/lang/ast.h +++ b/include/triton/lang/ast.h @@ -58,7 +58,16 @@ class TranslationUnit; class ASTNode { public: struct Attr{ - std::string name; + + enum KindT{ + MULTIPLEOF, + ALIGNED, + NOALIAS, + READONLY, + WRITEONLY + }; + + KindT kind; std::vector vals; }; using AttrList = std::vector; diff --git a/lib/lang/code_gen.cc b/lib/lang/code_gen.cc index f7dd021ed..451015d84 100644 --- a/lib/lang/code_gen.cc +++ b/lib/lang/code_gen.cc @@ -356,6 +356,8 @@ void 
Generator::VisitFuncDef(FuncDef* funcDef) { args[i]->set_name(name); for(ASTNode::Attr attr: obj->GetAttrList()) fn->add_attr(i, GenIRAttr(attr)); + if(obj->IsRestrictQualified()) + fn->add_attr(i, ir::attribute(ir::noalias)); mod_->set_value(name, nullptr, args[i]); mod_->get_scope().types[name] = args[i]->get_type(); i++; @@ -440,22 +442,22 @@ ir::value* Generator::GenCastOp(ir::value* src, ir::type* dst_ty) { // Triton-IR Attr ir::attribute Generator::GenIRAttr(ASTNode::Attr attr) { - if(attr.name == "multiple_of") { + if(attr.kind == ASTNode::Attr::MULTIPLEOF) { VisitExpr(attr.vals[0]); auto cst = dynamic_cast(ret_); if(!cst) should_not_happen(); return ir::attribute(ir::multiple_of, cst->get_value()); } - if(attr.name == "aligned") { + if(attr.kind == ASTNode::Attr::ALIGNED) { VisitExpr(attr.vals[0]); auto cst = dynamic_cast(ret_); return ir::attribute(ir::aligned, cst->get_value()); } - if(attr.name == "noalias") + if(attr.kind == ASTNode::Attr::NOALIAS) return ir::attribute(ir::noalias); - if(attr.name == "readonly") + if(attr.kind == ASTNode::Attr::READONLY) return ir::attribute(ir::readonly); - if(attr.name == "writeonly") + if(attr.kind == ASTNode::Attr::WRITEONLY) return ir::attribute(ir::writeonly); should_not_happen(); } diff --git a/lib/lang/parser.cc b/lib/lang/parser.cc index 35ed63e15..d8dbcf1c8 100644 --- a/lib/lang/parser.cc +++ b/lib/lang/parser.cc @@ -1806,7 +1806,8 @@ Object* Parser::ParseParamDecl() { auto type = ParseDeclSpec(&storageSpec, &funcSpec, nullptr); auto tokTypePair = ParseDeclarator(type); auto tok = tokTypePair.tok; - type = Type::MayCast(tokTypePair.type, true); + QualType fullType(tokTypePair.type.GetPtr(), type.Qual()); + type = Type::MayCast(fullType, true); auto attrs = tokTypePair.attrs; if (!tok) { // Abstract declarator return Object::NewAnony(ts_.Peek(), type, 0, Linkage::L_NONE); @@ -2692,7 +2693,21 @@ ASTNode::Attr Parser::ParseAttribute() { if (!ts_.Test(Token::IDENTIFIER)) return ret; auto tok = ts_.Next(); - ret.name = tok->str_; + std::string name = tok->str_; + // set kind + if(name == "aligned") + ret.kind = ASTNode::Attr::ALIGNED; + else if(name == "readonly") + ret.kind = ASTNode::Attr::READONLY; + else if(name == "writeonly") + ret.kind = ASTNode::Attr::WRITEONLY; + else if(name == "multipleof") + ret.kind = ASTNode::Attr::MULTIPLEOF; + else if(name == "noalias") + ret.kind = ASTNode::Attr::NOALIAS; + else + Error(tok, "unknown attribute kind"); + // set exprs if (ts_.Try('(')) { if (ts_.Try(')')) return ret; From 0b1c389894a640253b48f202770044f35c2bd038 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 23 Aug 2019 20:34:24 -0700 Subject: [PATCH 325/494] [lang] changed array declarations from [{}] to [] --- examples/cpp/dot.cc | 36 ++++++++++++++++++------------------ lib/lang/parser.cc | 32 ++++++-------------------------- 2 files changed, 24 insertions(+), 44 deletions(-) diff --git a/examples/cpp/dot.cc b/examples/cpp/dot.cc index 7ca9a1937..812670dd1 100644 --- a/examples/cpp/dot.cc +++ b/examples/cpp/dot.cc @@ -74,8 +74,8 @@ std::string src(bool AT, bool BT, std::string a_ty, std::string b_ty, std::strin std::string AS = AS0 + ", " + AS1; std::string BS = BS0 + ", " + BS1; std::string XCS = "TM, TN"; - std::string align_lda_str = "multiple_of(" + std::to_string(align_lda) + ")"; - std::string align_ldb_str = "multiple_of(" + std::to_string(align_ldb) + ")"; + std::string align_lda_str = "multipleof(" + std::to_string(align_lda) + ")"; + std::string align_ldb_str = "multipleof(" + std::to_string(align_ldb) + ")"; 
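  // nb: the attribute is now spelled "multipleof" (no underscore), matching what
  // ParseAttribute() accepts; ir::multiple_of is only the name of the Triton-IR
  // enum it lowers to.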
std::string res = R"( #define bool _Bool @@ -100,15 +100,15 @@ void matmul()" + a_ty + R"( * A __noalias __readonly __aligned(16), int ldc) { int ridx = get_program_id(0); int ridy = get_program_id(1); - int rxa[{TM}] = ridx * TM + 0 ... TM; - int ryb[{TN}] = ridy * TN + 0 ... TN; - int rka[{TK}] = 0 ... TK; - int rkb[{TK}] = 0 ... TK; - float xc[{)" + XCS + R"(}] = 0; - )" + a_ty + R"(* pa[{)" + AS + "}] = A + rka" + bca0 + lda0 + " + rxa" + bca1 + lda1 + R"(; - )" + b_ty + R"(* pb[{)" + BS + "}] = B + rkb" + bcb0 + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; - )" + a_ty + R"( a[{)" + AS + R"(}] = *pa; - )" + b_ty + R"( b[{)" + BS + R"(}] = *pb; + int rxa[TM] = ridx * TM + 0 ... TM; + int ryb[TN] = ridy * TN + 0 ... TN; + int rka[TK] = 0 ... TK; + int rkb[TK] = 0 ... TK; + float xc[)" + XCS + R"(] = 0; + )" + a_ty + R"(* pa[)" + AS + "] = A + rka" + bca0 + lda0 + " + rxa" + bca1 + lda1 + R"(; + )" + b_ty + R"(* pb[)" + BS + "] = B + rkb" + bcb0 + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; + )" + a_ty + R"( a[)" + AS + R"(] = *pa; + )" + b_ty + R"( b[)" + BS + R"(] = *pb; for(int k = K; k > 0; k = k - TK){ xc = )" + usea + " @ " + useb + R"( + xc; pa = pa + TK)" + lda0 + R"(; @@ -116,13 +116,13 @@ void matmul()" + a_ty + R"( * A __noalias __readonly __aligned(16), a = *pa; b = *pb; } - int rxc[{TM}] = ridx * TM + (0 ... TM); - int ryc[{TN}] = ridy * TN + (0 ... TN); - )" + c_ty + R"(* pc[{TM, TN}] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - )" + c_ty + R"( c[{TM, TN}] = xc; - bool checkc0[{TM}] = rxc < M; - bool checkc1[{TN}] = ryc < N; - bool checkc[{TM, TN}] = checkc0[:, newaxis] && checkc1[newaxis, :]; + int rxc[TM] = ridx * TM + (0 ... TM); + int ryc[TN] = ridy * TN + (0 ... TN); + )" + c_ty + R"(* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; + )" + c_ty + R"( c[TM, TN] = xc; + bool checkc0[TM] = rxc < M; + bool checkc1[TN] = ryc < N; + bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; *pc = c; } )"; diff --git a/lib/lang/parser.cc b/lib/lang/parser.cc index d8dbcf1c8..7a545992d 100644 --- a/lib/lang/parser.cc +++ b/lib/lang/parser.cc @@ -1684,37 +1684,17 @@ Identifier* Parser::ProcessDeclarator(const Token* tok, QualType Parser::ParseArrayFuncDeclarator(const Token* ident, QualType base) { if (ts_.Try('[')) { - - if (ts_.Try('{')) { - if(!base->IsScalar()) { - Error(ts_.Peek(), "tiles must have scalar elements"); - } - auto shape = ParseTileShape(); - ts_.Expect('}'); - ts_.Expect(']'); - base = ParseArrayFuncDeclarator(ident, base); - if (!base->Complete()) { - // FIXME(wgtdkp): ident could be nullptr - Error(ident, "'%s' has incomplete element type", - ident->str_.c_str()); - } - return TileType::New(shape, base); + if(!base->IsScalar()) { + Error(ts_.Peek(), "tiles must have scalar elements"); } - - if (nullptr != base->ToFunc()) { - Error(ts_.Peek(), "the element of array cannot be a function"); - } - - auto len = ParseArrayLength(); + auto shape = ParseTileShape(); ts_.Expect(']'); - base = ParseArrayFuncDeclarator(ident, base); if (!base->Complete()) { - // FIXME(wgtdkp): ident could be nullptr - Error(ident, "'%s' has incomplete element type", - ident->str_.c_str()); + Error(ident, "'%s' has incomplete element type", ident->str_.c_str()); } - return ArrayType::New(len, base); + return TileType::New(shape, base); + } else if (ts_.Try('(')) { // Function declaration if (base->ToFunc()) { Error(ts_.Peek(), From 96b4d5e411d248069a5582dbf3c573de08eb15c5 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 24 Aug 2019 13:06:20 -0700 Subject: [PATCH 326/494] 
[examples] multiple transposition schemes now supported --- examples/cpp/dot.cc | 118 ++++++++++++++---------------- lib/codegen/transform/peephole.cc | 7 +- lib/runtime/function.cc | 18 ++++- 3 files changed, 76 insertions(+), 67 deletions(-) diff --git a/examples/cpp/dot.cc b/examples/cpp/dot.cc index 812670dd1..acb00afc7 100644 --- a/examples/cpp/dot.cc +++ b/examples/cpp/dot.cc @@ -45,81 +45,67 @@ void cpu_ref(bool AT_, bool BT_, size_t M, size_t N, size_t K, -std::string src(bool AT, bool BT, std::string a_ty, std::string b_ty, std::string c_ty, int align_lda, int align_ldb) { - std::string ZS = "1"; - std::string AS0 = "TM", AS1 = "TK"; - std::string BS0 = "TK", BS1 = "TN"; - std::string XAS0 = "TM", XAS1 = "TK / " + ZS, XAS2 = ZS; - std::string XBS0 = "TK / " + ZS, XBS1 = ZS, XBS2 = "TN"; - std::string bca0 = "[newaxis, :]", bca1 = "[:, newaxis]"; - std::string bcb0 = "[:, newaxis]", bcb1 = "[newaxis, :]"; - std::string lda0 = "*lda", lda1 = ""; - std::string ldb0 = "", ldb1 = "*ldb"; - std::string usea = AT ? "^a" : "a"; - std::string useb = BT ? "^b" : "b"; - if(AT){ - std::swap(AS0, AS1); - std::swap(XAS0, XAS1); - std::swap(XAS1, XAS2); - std::swap(bca0, bca1); - std::swap(lda0, lda1); - } - if(BT){ - std::swap(BS0, BS1); - std::swap(XBS1, XBS2); - std::swap(XBS0, XBS1); - std::swap(bcb0, bcb1); - std::swap(ldb0, ldb1); - } - std::string AS = AS0 + ", " + AS1; - std::string BS = BS0 + ", " + BS1; - std::string XCS = "TM, TN"; - std::string align_lda_str = "multipleof(" + std::to_string(align_lda) + ")"; - std::string align_ldb_str = "multipleof(" + std::to_string(align_ldb) + ")"; - std::string res = +std::string src = R"( -#define bool _Bool -#define true 1 -#define false 0 -#define __bool_true_false_are_defined 1 +#ifdef AT +#define USEA ^a +#else +#define USEA a +#endif -#define __readonly __attribute__((readonly)) -#define __writeonly __attribute__((writeonly)) -#define __noalias __attribute__((noalias)) -#define __aligned(A) __attribute__((aligned(A))) -#define __multipleof(A) __attribute__((multipleof(A))) +#ifdef BT +#define USEB ^b +#else +#define USEB b +#endif -extern int get_program_id(int); - -void matmul()" + a_ty + R"( * A __noalias __readonly __aligned(16), - )" + b_ty + R"( * B __noalias __readonly __aligned(16), - )" + c_ty + R"( * C __noalias __readonly __aligned(16), - int M, int N, int K, - int lda __multipleof(8), - int ldb __multipleof(8), - int ldc) { +void dot(TYPE * A __noalias __readonly __aligned(16), + TYPE * B __noalias __readonly __aligned(16), + TYPE * C __noalias __readonly __aligned(16), + int M, int N, int K, + int lda __multipleof(8), + int ldb __multipleof(8), + int ldc) { int ridx = get_program_id(0); int ridy = get_program_id(1); int rxa[TM] = ridx * TM + 0 ... TM; int ryb[TN] = ridy * TN + 0 ... TN; int rka[TK] = 0 ... TK; int rkb[TK] = 0 ... 
TK; - float xc[)" + XCS + R"(] = 0; - )" + a_ty + R"(* pa[)" + AS + "] = A + rka" + bca0 + lda0 + " + rxa" + bca1 + lda1 + R"(; - )" + b_ty + R"(* pb[)" + BS + "] = B + rkb" + bcb0 + ldb0 + " + ryb" + bcb1 + ldb1 + R"(; - )" + a_ty + R"( a[)" + AS + R"(] = *pa; - )" + b_ty + R"( b[)" + BS + R"(] = *pb; + float xc[TM, TN] = 0; +#ifdef AT + TYPE* pa[TK, TM] = A + rka[:, newaxis] + rxa[newaxis, :]*lda; + TYPE a[TK, TM] = *pa; +#else + TYPE* pa[TM, TK] = A + rka[newaxis, :]*lda + rxa[:, newaxis]; + TYPE a[TM, TK] = *pa; +#endif +#ifdef BT + TYPE* pb[TN, TK] = B + rkb[newaxis, :]*ldb + ryb[:, newaxis]; + TYPE b[TN, TK] = *pb; +#else + TYPE* pb[TK, TN] = B + rkb[:, newaxis] + ryb[newaxis, :]*ldb; + TYPE b[TK, TN] = *pb; +#endif for(int k = K; k > 0; k = k - TK){ - xc = )" + usea + " @ " + useb + R"( + xc; - pa = pa + TK)" + lda0 + R"(; - pb = pb + TK)" + ldb0 + R"(; + xc = USEA @ USEB + xc; +#ifdef AT + pa = pa + TK; +#else + pa = pa + TK*lda; +#endif +#ifdef BT + pb = pb + TK*ldb; +#else + pb = pb + TK; +#endif a = *pa; b = *pb; } int rxc[TM] = ridx * TM + (0 ... TM); int ryc[TN] = ridy * TN + (0 ... TN); - )" + c_ty + R"(* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - )" + c_ty + R"( c[TM, TN] = xc; + TYPE* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; + TYPE c[TM, TN] = xc; bool checkc0[TM] = rxc < M; bool checkc1[TN] = ryc < N; bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; @@ -127,9 +113,6 @@ void matmul()" + a_ty + R"( * A __noalias __readonly __aligned(16), } )"; - return res; -} - struct perf_t { double triton; double cublas; @@ -165,11 +148,16 @@ perf_t do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int stream->synchronize(); // run rt::function::options_space_t opt; + opt.defines.push_back({"TYPE", {ty}}); + if(AT) + opt.defines.push_back({"AT", {""}}); + if(BT) + opt.defines.push_back({"BT", {""}}); opt.defines.push_back({"TM", {"128"}}); opt.defines.push_back({"TN", {"128"}}); opt.defines.push_back({"TK", {"32"}}); opt.num_warps = {1, 2, 4, 8}; - rt::function function(src(AT, BT, ty, ty, ty, 8, 8), opt); + rt::function function(src, opt); auto ceil = [](size_t x, size_t y) { return (x + y - 1) / y; }; auto grid = [&](const rt::function::options_t& x) { return rt::grid_t{ceil(M, x.D("TM")), ceil(N, x.D("TN")), 1}; }; @@ -220,7 +208,7 @@ int main() { // shapes to benchmark std::vector configs = { // {false, false, 8192, 512, 512}, - {false, true, 128, 128, 128} + {false, false, 128, 128, 128} // {false, true, 128, 128, 128}, // {false, false, 128, 128, 128}, // {true, false, 128, 128, 128}, diff --git a/lib/codegen/transform/peephole.cc b/lib/codegen/transform/peephole.cc index f6f5e3c00..853bed1b2 100644 --- a/lib/codegen/transform/peephole.cc +++ b/lib/codegen/transform/peephole.cc @@ -112,8 +112,12 @@ bool peephole::rewrite_dot_hmma(ir::dot_inst *dot, ir::builder& builder, bool tr trans_a = true; } } + if(!trans_a && !trans_b) + return false; + ir::instruction *dot_atbt = builder.insert(ir::dot_inst::create(AA, BB, D, trans_a, trans_b)); dot->replace_all_uses_with(dot_atbt); + return true; } @@ -186,8 +190,9 @@ bool peephole::rewrite_dot(ir::instruction *value, ir::builder& builder){ if(dot->is_a_trans() || dot->is_b_trans()) return false; // hmma - if(is_hmma(dot)) + if(is_hmma(dot)){ return rewrite_dot_hmma(dot, builder, trans_a, trans_b, A, B, D); + } else return rewrite_dot_fp32(dot, builder, trans_a, trans_b, A, B, D); } diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 11279b571..8a47c35d4 
100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -206,9 +206,25 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c return res; } +std::string preheader() { +return R"( + #define bool _Bool + #define true 1 + #define false 0 + #define __bool_true_false_are_defined 1 + + #define __readonly __attribute__((readonly)) + #define __writeonly __attribute__((writeonly)) + #define __noalias __attribute__((noalias)) + #define __aligned(A) __attribute__((aligned(A))) + #define __multipleof(A) __attribute__((multipleof(A))) + + extern int get_program_id(int); + )"; +} function::function(const std::string &src, const options_space_t& opt): src_(src), opt_space_(opt) { - + src_ = preheader() + src_; } void function::operator()(const std::vector& args, const grid_fn_ty& grid_fn, driver::stream *stream) { From 321d268a4a105dbf9070c42f1a7fbc15c8be3cf7 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 25 Aug 2019 21:26:09 -0700 Subject: [PATCH 327/494] more progress --- CMakeLists.txt | 6 +- include/triton/runtime/function.h | 13 ++- lib/codegen/analysis/tune.cc | 4 +- lib/runtime/function.cc | 38 ++++--- python/examples/dot.py | 88 ++++++++++++---- python/setup.py | 1 + python/src/tensorflow.cc | 162 +++++++++++++++++++----------- python/triton/ops.py | 66 +++++++++++- 8 files changed, 268 insertions(+), 110 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 84e16ddf9..15985cc87 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,20 +33,20 @@ endif() if(BUILD_PYTHON_MODULE) message(STATUS "Adding Python module") # PyBind11 wrapper source file - file(GLOB_RECURSE PYTHON_SRC python/src/tensorflow.cpp) + file(GLOB_RECURSE PYTHON_SRC python/src/tensorflow.cc) # update include directory include_directories(python/src/ ${PYTHON_INCLUDE_DIRS} ${TF_INCLUDE_DIRS}) # update link directories link_directories(${TF_LIB_DIRS}) # extra tensorflow ops (e.g., alloc_empty) - file(GLOB_RECURSE EXTRA_TF_OPS_SRC python/src/tensorflow/*.cpp) + file(GLOB_RECURSE EXTRA_TF_OPS_SRC python/src/tensorflow/*.cc) add_library(extra_tf_ops SHARED ${EXTRA_TF_OPS_SRC}) target_link_libraries(extra_tf_ops ${TF_LIBS}) endif() # Triton -file(GLOB_RECURSE LIBTRITON_SRC lib/*.cpp lib/*.cc) +file(GLOB_RECURSE LIBTRITON_SRC lib/*.cc) add_library(triton SHARED ${LIBTRITON_SRC} ${PYTHON_SRC}) target_link_libraries(triton LLVM) diff --git a/include/triton/runtime/function.h b/include/triton/runtime/function.h index c3f4d53ff..b0054c647 100644 --- a/include/triton/runtime/function.h +++ b/include/triton/runtime/function.h @@ -59,12 +59,11 @@ class metaparameter; namespace runtime{ -typedef std::array grid_t; +typedef std::vector grid_t; typedef std::map params_t; - -template T convert(const std::string& name); -template<> long convert(const std::string& name) { return std::stol(name); } -template<> int convert(const std::string& name) { return std::stoi(name); } +template inline T convert(const std::string& name); +template<> inline long convert(const std::string& name) { return std::stol(name); } +template<> inline int convert(const std::string& name) { return std::stoi(name); } class function { public: @@ -91,7 +90,7 @@ private: class caller { public: caller(ir::function *ir, std::shared_ptr program, const options_t& opt_); - void operator()(driver::stream *stream, const std::array& grid, const std::vector& args) const; + void operator()(driver::stream *stream, const grid_t& grid, const std::vector& args) const; const options_t opt() const { return opt_; } private: @@ -113,7 +112,7 @@ 
private: public: function(const std::string& src, const options_space_t& opt = options_space_t()); - void operator()(const std::vector& args, const std::array& grid, driver::stream* stream); + void operator()(const std::vector& args, const grid_t& grid, driver::stream* stream); void operator()(const std::vector& args, const grid_fn_ty& grid, driver::stream *stream); std::string make_tensorflow_src(const std::vector &outputs, const std::string ¯o); diff --git a/lib/codegen/analysis/tune.cc b/lib/codegen/analysis/tune.cc index 5f150ee2c..5ff536849 100644 --- a/lib/codegen/analysis/tune.cc +++ b/lib/codegen/analysis/tune.cc @@ -15,8 +15,8 @@ namespace triton{ namespace codegen{ namespace analysis{ -grids::grids(size_t num_warps): num_warps_(num_warps){ -} +grids::grids(size_t num_warps): num_warps_(num_warps) +{ } bool is_hmma(ir::value *v){ bool result = false; diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 8a47c35d4..750952bbb 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -93,7 +93,7 @@ function::caller::caller(ir::function *ir, std::shared_ptr paren } -void function::caller::operator ()(driver::stream *stream, const std::array& grid, const std::vector& args) const { +void function::caller::operator ()(driver::stream *stream, const grid_t& _grid, const std::vector& args) const { if(args.size() != param_tys_.size()) throw std::runtime_error("invalid number of arguments"); for(size_t i = 0; i < args.size(); i++){ @@ -106,6 +106,12 @@ void function::caller::operator ()(driver::stream *stream, const std::arraysetArg(i, size_of(ty), arg_i.data()); } + // sanity check + if(_grid.size() > 3) + throw std::runtime_error("grid size must be no greater than 3"); + std::array grid; + for(size_t i = 0; i < 3; i++) + grid[i] = (i < _grid.size()) ? 
_grid[i] : 1; stream->enqueue(&*bin_, grid, {opt_.num_warps * 32, 1, 1}); } @@ -207,20 +213,21 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c } std::string preheader() { -return R"( - #define bool _Bool - #define true 1 - #define false 0 - #define __bool_true_false_are_defined 1 +return +R"( +#define bool _Bool +#define true 1 +#define false 0 +#define __bool_true_false_are_defined 1 - #define __readonly __attribute__((readonly)) - #define __writeonly __attribute__((writeonly)) - #define __noalias __attribute__((noalias)) - #define __aligned(A) __attribute__((aligned(A))) - #define __multipleof(A) __attribute__((multipleof(A))) +#define __readonly __attribute__((readonly)) +#define __writeonly __attribute__((writeonly)) +#define __noalias __attribute__((noalias)) +#define __aligned(A) __attribute__((aligned(A))) +#define __multipleof(A) __attribute__((multipleof(A))) - extern int get_program_id(int); - )"; +extern int get_program_id(int); +)"; } function::function(const std::string &src, const options_space_t& opt): src_(src), opt_space_(opt) { @@ -228,9 +235,10 @@ function::function(const std::string &src, const options_space_t& opt): src_(sr } void function::operator()(const std::vector& args, const grid_fn_ty& grid_fn, driver::stream *stream) { - /* determine if should re-tune or not */ cache_key_t key; - // re-tune if device is difference + + /* figure out if the kernel should be re-tuned */ + // re-tune if device is different key.first = stream->context()->device(); // re-tune if any int argument is different for(size_t i = 0; i < args.size(); i++){ diff --git a/python/examples/dot.py b/python/examples/dot.py index 638d49c20..e807305e6 100644 --- a/python/examples/dot.py +++ b/python/examples/dot.py @@ -3,49 +3,89 @@ import tensorflow as tf import numpy as np src = """ -const tunable int TM = {128}; -const tunable int TN = {128}; -const tunable int TK = {32}; +#if AT == 1 +#define USEA ^a +#else +#define USEA a +#endif -void matmul(restrict read_only align(16) half *A, - restrict read_only align(16) half *B, - restrict read_only align(16) half *C, - int M, int N, int K, - multiple_of(8) int lda, multiple_of(8) int ldb, int ldc) -{ +#if BT == 1 +#define USEB ^b +#else +#define USEB b +#endif + +void dot(TYPE * A __noalias __readonly __aligned(16), + TYPE * B __noalias __readonly __aligned(16), + TYPE * C __noalias __readonly __aligned(16), + int M, int N, int K, + int lda __multipleof(8), + int ldb __multipleof(8), + int ldc) { int ridx = get_program_id(0); int ridy = get_program_id(1); - int rxa[TM] = ridx * TM + (0 ... TM); - int ryb[TN] = ridy * TN + (0 ... TN); + int rxa[TM] = ridx * TM + 0 ... TM; + int ryb[TN] = ridy * TN + 0 ... TN; int rka[TK] = 0 ... TK; int rkb[TK] = 0 ... 
TK; float xc[TM, TN] = 0; - half* pa[TM, TK] = A + rka[newaxis, :]*lda + rxa[:, newaxis]; - half* pb[TN, TK] = B + rkb[newaxis, :]*ldb + ryb[:, newaxis]; - half a[TM, TK] = *pa; - half b[TN, TK] = *pb; + + /* pointers for A */ +#if AT == 1 + TYPE* pa[TK, TM] = A + rka[:, newaxis] + rxa[newaxis, :]*lda; + TYPE a[TK, TM] = *pa; +#else + TYPE* pa[TM, TK] = A + rka[newaxis, :]*lda + rxa[:, newaxis]; + TYPE a[TM, TK] = *pa; +#endif + + /* pointers for B */ +#if BT == 1 + TYPE* pb[TN, TK] = B + rkb[newaxis, :]*ldb + ryb[:, newaxis]; + TYPE b[TN, TK] = *pb; +#else + TYPE* pb[TK, TN] = B + rkb[:, newaxis] + ryb[newaxis, :]*ldb; + TYPE b[TK, TN] = *pb; +#endif + + /* reduction loop */ for(int k = K; k > 0; k = k - TK){ - xc = dot(a, trans(b), xc); + xc = USEA @ USEB + xc; +#if AT == 1 + pa = pa + TK; +#else pa = pa + TK*lda; +#endif +#if BT == 1 pb = pb + TK*ldb; +#else + pb = pb + TK; +#endif a = *pa; b = *pb; } + + /* epilogue */ int rxc[TM] = ridx * TM + (0 ... TM); int ryc[TN] = ridy * TN + (0 ... TN); - half* pc[TM, TN] = C + ryc[newaxis, :] + rxc[:, newaxis]*ldc; - half c[TM, TN] = xc; + TYPE* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; + TYPE c[TM, TN] = xc; bool checkc0[TM] = rxc < M; bool checkc1[TN] = ryc < N; bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - @checkc *pc = c; + *pc = c; } """ +def cdiv(a, b): + return -(-a // b) + class dot: - def __init__(self): - self.matmul = triton.make_tensorflow_op(src, ['C'], ['(M + #TM - 1)/#TM', '(N + #TN - 1)/#TN']) + def __init__(self, trans_a = False, trans_b = True): + self.dot = triton.op(src, ['C']) + self.trans_a = trans_a + self.trans_b = trans_b def __call__(self, a, b): shape_a = tf.shape(a) @@ -57,9 +97,13 @@ class dot: ldb = K ldc = N c = triton.empty([M, N]) - return self.matmul.matmul(a, b, c, M, N, K, lda, ldb, ldc) + return self.dot(a, b, c, M, N, K, lda, ldb, ldc, + lambda opt: [cdiv(M, opt.D('TM')), cdiv(N, opt.D('TN')), 1], + AT = self.trans_a, BT = self.trans_b, TYPE = tf.float16, + TM = [128], TN = [128], TK = [32]) dot_tn = dot() + def run_dot(): M, N, K = 128, 128, 128 a = tf.placeholder(tf.float16, shape=[M, K]) diff --git a/python/setup.py b/python/setup.py index aeba8b5a6..ef5fa9865 100644 --- a/python/setup.py +++ b/python/setup.py @@ -35,6 +35,7 @@ class CMakeBuild(build_ext): self.build_extension(ext) def build_extension(self, ext): + self.debug = True extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) # python directors python_include_dirs = distutils.sysconfig.get_python_inc() diff --git a/python/src/tensorflow.cc b/python/src/tensorflow.cc index 098d338ad..489c545ac 100644 --- a/python/src/tensorflow.cc +++ b/python/src/tensorflow.cc @@ -1,11 +1,14 @@ -#include +#include #include +#include #include #include #include #include "triton/codegen/selection/selection.h" #include "triton/runtime/function.h" -#include "triton/lang/lang.h" +#include "triton/lang/code_gen.h" +#include "triton/lang/parser.h" +#include "triton/lang/cpp.h" #include "triton/driver/device.h" #include "triton/driver/stream.h" #include "triton/driver/kernel.h" @@ -14,14 +17,33 @@ #include "triton/ir/function.h" #include "triton/tools/bench.hpp" -typedef struct yy_buffer_state * YY_BUFFER_STATE; -extern int yyparse(); -extern YY_BUFFER_STATE yy_scan_string(const char * str); -extern void yy_delete_buffer(YY_BUFFER_STATE buffer); -extern triton::lang::translation_unit *ast_root; - using namespace triton; +namespace rt = triton::runtime; + + +/* TF triton op properties */ + +std::map id_grid_map; 
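+// these two registries map an integer kernel id (handed out by register_fn below,
+// and stored in the "id: int" attribute of the generated TensorFlow op) to the
+// launch-grid callback and the compiled triton::runtime::function, respectively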
+std::map id_fn_map; + +void register_grid(size_t id, + const rt::function::grid_fn_ty& grid_fn) { + id_grid_map[id] = grid_fn; +} + +size_t register_fn(const std::string& src, + const rt::function::options_space_t& opt) { + size_t id = id_grid_map.size(); + bool is_inserted = id_fn_map.insert({id, new rt::function(src, opt)}).second; + if(!is_inserted) + assert(false); + return id; +} + + +/* TF source-code generation */ + inline std::string to_tf_ty(ir::type *ty) { if(ty->is_integer_ty(1)) return "bool"; @@ -59,21 +81,6 @@ inline std::string ref_to_tf_ty(ir::type *ty) { return res; } -inline triton::lang::translation_unit *make_ast(const char *src) { - YY_BUFFER_STATE buffer = yy_scan_string(src); - yyparse(); - yy_delete_buffer(buffer); - triton::lang::translation_unit *program = ast_root; - return program; -} - -inline std::unique_ptr make_ir(ir::context& ctx, triton::lang::translation_unit *program) { - // create Triton-IR from AST - ir::module* module = new ir::module("", ctx); - program->codegen(module); - return std::unique_ptr(module); -} - void gen_extract_inputs(std::ostream &os, const std::vector& args) { for(unsigned i = 0; i < args.size(); i++){ @@ -102,24 +109,8 @@ void gen_make_handles(std::ostream &os, const std::vector& args) } } -void gen_make_spmd_grid(std::ostream &os, const std::vector& macros) { - std::regex regex("#([a-zA-Z]([a-zA-Z]|[0-9])*)"); - std::vector grids = macros; - for(size_t i = grids.size(); i < 3; i++) - grids.push_back("1"); - std::string grid = "rt::grid_t{"; - for(size_t i = 0; i < grids.size(); i++){ - if(i > 0) - grid += ", "; - grid += std::regex_replace(grids[i], regex, "x.at(\"$1\")"); - } - grid += "}"; - - os << " auto grid = [&](const rt::params_t& x) { return " << grid << "; };\n "; -} - void gen_make_launch_function(std::ostream &os, const std::vector& args) { - os << " fn_({"; + os << " (*id_fn_map.at(id_))({"; for(unsigned i = 0; i < args.size() ; i++){ ir::argument *arg = args[i]; std::string name = arg->get_name(); @@ -129,7 +120,7 @@ void gen_make_launch_function(std::ostream &os, const std::vector os << ", "; os << name; } - os << "}, grid, stream); \n"; + os << "}, id_grid_map.at(id_), stream); \n"; } void gen_register_kernel_builder(std::ostream &os, const std::string &name, @@ -168,20 +159,55 @@ void gen_register_op(std::ostream &os, const std::string &name, throw std::runtime_error("unknown output"); os << " .Output(\"out" << i << ": " << to_tf_scalar_ty(args[idx]->get_type()) << "\")\n"; } + os << " .Attr(\"id: int\")" << std::endl; os << ";\n"; } -std::string make_tensorflow_src(const std::string src, +inline std::string preheader() { +return +R"( +#define bool _Bool +#define true 1 +#define false 0 +#define __bool_true_false_are_defined 1 + +#define __readonly __attribute__((readonly)) +#define __writeonly __attribute__((writeonly)) +#define __noalias __attribute__((noalias)) +#define __aligned(A) __attribute__((aligned(A))) +#define __multipleof(A) __attribute__((multipleof(A))) + +extern int get_program_id(int); +)"; +} + +std::tuple make_tensorflow_src(std::string src, const std::vector& outputs, - const std::vector& macros) { - triton::lang::translation_unit *ast = make_ast(src.c_str()); - triton::ir::context context; - std::unique_ptr ir = make_ir(context, ast); + const runtime::function::options_space_t& opt) +{ + src = preheader() + src; + // pre-process + TokenSequence tokens; + Preprocessor cpp(&src, true); + for(auto it: opt.defines){ + cpp.AddMacro(it.first, &it.second[0]); + } + cpp.Process(tokens); + // parse + 
diff --git a/python/src/tensorflow.cc b/python/src/tensorflow.cc
index 098d338ad..489c545ac 100644
--- a/python/src/tensorflow.cc
+++ b/python/src/tensorflow.cc
@@ -1,11 +1,14 @@
-#include 
+#include 
 #include 
+#include 
 #include 
 #include 
 #include "triton/codegen/selection/selection.h"
 #include "triton/runtime/function.h"
-#include "triton/lang/lang.h"
+#include "triton/lang/code_gen.h"
+#include "triton/lang/parser.h"
+#include "triton/lang/cpp.h"
 #include "triton/driver/device.h"
 #include "triton/driver/stream.h"
 #include "triton/driver/kernel.h"
@@ -14,14 +17,33 @@
 #include "triton/ir/function.h"
 #include "triton/tools/bench.hpp"
 
-typedef struct yy_buffer_state * YY_BUFFER_STATE;
-extern int yyparse();
-extern YY_BUFFER_STATE yy_scan_string(const char * str);
-extern void yy_delete_buffer(YY_BUFFER_STATE buffer);
-extern triton::lang::translation_unit *ast_root;
-
 using namespace triton;
 
+namespace rt = triton::runtime;
+
+
+/* TF triton op properties */
+
+std::map<size_t, rt::function::grid_fn_ty> id_grid_map;
+std::map<size_t, rt::function*> id_fn_map;
+
+void register_grid(size_t id,
+                   const rt::function::grid_fn_ty& grid_fn) {
+  id_grid_map[id] = grid_fn;
+}
+
+size_t register_fn(const std::string& src,
+                   const rt::function::options_space_t& opt) {
+  size_t id = id_grid_map.size();
+  bool is_inserted = id_fn_map.insert({id, new rt::function(src, opt)}).second;
+  if(!is_inserted)
+    assert(false);
+  return id;
+}
+
+
+/* TF source-code generation */
+
 inline std::string to_tf_ty(ir::type *ty) {
   if(ty->is_integer_ty(1))
     return "bool";
@@ -59,21 +81,6 @@ inline std::string ref_to_tf_ty(ir::type *ty) {
   return res;
 }
 
-inline triton::lang::translation_unit *make_ast(const char *src) {
-  YY_BUFFER_STATE buffer = yy_scan_string(src);
-  yyparse();
-  yy_delete_buffer(buffer);
-  triton::lang::translation_unit *program = ast_root;
-  return program;
-}
-
-inline std::unique_ptr<ir::module> make_ir(ir::context& ctx, triton::lang::translation_unit *program) {
-  // create Triton-IR from AST
-  ir::module* module = new ir::module("", ctx);
-  program->codegen(module);
-  return std::unique_ptr<ir::module>(module);
-}
-
 void gen_extract_inputs(std::ostream &os, const std::vector<ir::argument*>& args) {
   for(unsigned i = 0; i < args.size(); i++){
@@ -102,24 +109,8 @@ void gen_make_handles(std::ostream &os, const std::vector<ir::argument*>& args)
   }
 }
 
-void gen_make_spmd_grid(std::ostream &os, const std::vector<std::string>& macros) {
-  std::regex regex("#([a-zA-Z]([a-zA-Z]|[0-9])*)");
-  std::vector<std::string> grids = macros;
-  for(size_t i = grids.size(); i < 3; i++)
-    grids.push_back("1");
-  std::string grid = "rt::grid_t{";
-  for(size_t i = 0; i < grids.size(); i++){
-    if(i > 0)
-      grid += ", ";
-    grid += std::regex_replace(grids[i], regex, "x.at(\"$1\")");
-  }
-  grid += "}";
-
-  os << "  auto grid = [&](const rt::params_t& x) { return " << grid << "; };\n  ";
-}
-
 void gen_make_launch_function(std::ostream &os, const std::vector<ir::argument*>& args) {
-  os << "  fn_({";
+  os << "  (*id_fn_map.at(id_))({";
   for(unsigned i = 0; i < args.size() ; i++){
     ir::argument *arg = args[i];
     std::string name = arg->get_name();
@@ -129,7 +120,7 @@ void gen_make_launch_function(std::ostream &os, const std::vector<ir::argument*>& args) {
       os << ", ";
     os << name;
   }
-  os << "}, grid, stream); \n";
+  os << "}, id_grid_map.at(id_), stream); \n";
 }
 
 void gen_register_kernel_builder(std::ostream &os, const std::string &name,
@@ -168,20 +159,55 @@
       throw std::runtime_error("unknown output");
     os << "  .Output(\"out" << i << ": " << to_tf_scalar_ty(args[idx]->get_type()) << "\")\n";
   }
+  os << "  .Attr(\"id: int\")" << std::endl;
   os << ";\n";
 }
 
-std::string make_tensorflow_src(const std::string src,
+inline std::string preheader() {
+return
+R"(
+#define bool _Bool
+#define true 1
+#define false 0
+#define __bool_true_false_are_defined 1
+
+#define __readonly      __attribute__((readonly))
+#define __writeonly     __attribute__((writeonly))
+#define __noalias       __attribute__((noalias))
+#define __aligned(A)    __attribute__((aligned(A)))
+#define __multipleof(A) __attribute__((multipleof(A)))
+
+extern int get_program_id(int);
+)";
+}
+
+std::tuple<std::string, std::string> make_tensorflow_src(std::string src,
                                 const std::vector<std::string>& outputs,
-                                const std::vector<std::string>& macros) {
-  triton::lang::translation_unit *ast = make_ast(src.c_str());
-  triton::ir::context context;
-  std::unique_ptr<ir::module> ir = make_ir(context, ast);
+                                const runtime::function::options_space_t& opt)
+{
+  src = preheader() + src;
+  // pre-process
+  TokenSequence tokens;
+  Preprocessor cpp(&src, true);
+  for(auto it: opt.defines){
+    cpp.AddMacro(it.first, &it.second[0]);
+  }
+  cpp.Process(tokens);
+  // parse
+  Parser parser(tokens);
+  parser.Parse();
+  // triton-ir code-gen
+  ir::context ctx;
+  auto ir = std::unique_ptr<ir::module>(new ir::module("", ctx));
+  Generator gen(&parser);
+  gen.Gen(&*ir);
   // function
   ir::function* fn = ir->get_function_list().front();
   std::string name = fn->get_name();
-  name[0] = static_cast<char>(std::toupper(name[0]));
-  std::string opname = name + "Op";
+  std::string cc_name = name;
+  cc_name[0] = static_cast<char>(std::toupper(cc_name[0]));
+  std::string opname = cc_name + "Op";
 
   std::ostringstream oss;
 
   oss << R"(
@@ -204,12 +230,16 @@
 using GPUDevice = Eigen::GpuDevice;
 namespace rt = triton::runtime;
 namespace drv = triton::driver;
 
-std::string src = R"TTKERNSRC( )" + src + ")TTKERNSRC\";" + R"(
+extern std::map<size_t, rt::function::grid_fn_ty> id_grid_map;
+extern std::map<size_t, rt::function*> id_fn_map;
+
 class )" << opname << R"(: public OpKernel {
  public:
   explicit )" << opname << R"((OpKernelConstruction* context)
-    : OpKernel(context), fn_(src) { }
+    : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("id", &id_));
+  }
 
   void Compute(OpKernelContext* context){
 
     // get device/stream
@@ -229,9 +259,7 @@ oss << R"(
 )";
 gen_make_handles(oss, fn->args());
 oss << R"(
-    // create spmd grid
 )";
-gen_make_spmd_grid(oss, macros);
 oss << R"(
     // launch function
 )";
@@ -240,22 +268,42 @@ oss << R"(
   }
 
 private:
-  rt::function fn_;
+  int id_;
 };
 
 // register kernel builder
 )";
-gen_register_kernel_builder(oss, name, opname, fn->args());
+gen_register_kernel_builder(oss, cc_name, opname, fn->args());
 oss << R"(
 
 // register op
 )";
-gen_register_op(oss, name, fn->args(), outputs);
+gen_register_op(oss, cc_name, fn->args(), outputs);
 
-  return oss.str();
+  return {oss.str(), name};
 }
 
+typedef triton::runtime::function::options_t options_t;
+typedef triton::runtime::function::options_space_t options_space_t;
 
 PYBIND11_MODULE(libtriton, m) {
     m.doc() = "Python bindings to the C++ Triton API";
-    m.def("make_tensorflow_src", &make_tensorflow_src, "Creates C++ source code for a custom Tensorflow op corresponding to the specified Triton kernel");
+
+    // framework binding source code generation
+    m.def("make_tensorflow_src", &make_tensorflow_src,
+          "Creates C++ source code for a custom Tensorflow op "
+          "corresponding to the specified Triton kernel");
+
+    // bindings for triton classes
+    pybind11::class_<options_t>(m, "options")
+        .def(pybind11::init<>())
+        .def("D", &options_t::D);
+
+    pybind11::class_<options_space_t>(m, "options_space")
+        .def(pybind11::init<>())
+        .def_readwrite("defines", &options_space_t::defines)
+        .def_readwrite("num_warps", &options_space_t::num_warps);
+
+    // hooks into triton constructs since frameworks may not use pybind11
+    m.def("register_grid", &register_grid);
+    m.def("register_fn", &register_fn);
 }
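This commit drops the flex/bison front-end (`yyparse`, `yy_scan_string`) in favor of an in-process preprocessor/parser/code-generator pipeline, and replaces the per-op `rt::function fn_` member with a process-wide registry: `register_fn` compiles and stores a function under an integer id, `register_grid` attaches the launch-grid callback under the same id, and the generated TensorFlow op carries that id as an attribute. Note that `register_fn` derives the next id from `id_grid_map.size()`, which quietly assumes every function's grid is registered before the next `register_fn` call. A self-contained pure-Python model of the two maps (the real ones are C++ globals):

```python
# Illustration only: Python stand-in for id_fn_map / id_grid_map above.
id_fn_map = {}
id_grid_map = {}

def register_fn(src, opt):
    fn_id = len(id_grid_map)         # mirrors `id_grid_map.size()` in the diff
    id_fn_map[fn_id] = (src, opt)    # the real map stores an rt::function*
    return fn_id

def register_grid(fn_id, grid_fn):
    id_grid_map[fn_id] = grid_fn

i = register_fn('kernel source', {'TM': ['128']})
register_grid(i, lambda opt: [1, 1, 1])
assert id_grid_map[i](None) == [1, 1, 1]
```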
diff --git a/python/triton/ops.py b/python/triton/ops.py
index a10739903..0099e1289 100644
--- a/python/triton/ops.py
+++ b/python/triton/ops.py
@@ -91,13 +91,71 @@ def build(src, path):
   setuptools.setup(**args)
   shutil.rmtree(tmp)
 
+def _cvt_to_def_str(obj):
+  if isinstance(obj, bool):
+    return str(int(obj))
+  if isinstance(obj, tf.DType):
+    return {tf.int8: 'char',
+            tf.int16: 'short',
+            tf.int32: 'int',
+            tf.int64: 'long',
+            tf.float16: 'half',
+            tf.float32: 'float',
+            tf.float64: 'double'}[obj]
+  return str(obj)
+
+class op:
+
+  def _make_tensorflow_op(self, src, outputs, options):
+    src, name = make_bindings(src, outputs, options)
+    cache_path = make_cache_path(src)
+    cpp, so = write_bindings(src, cache_path)
+    build(cpp, cache_path)
+    result = tf.load_op_library(so)
+    return result.__dict__[name]
+
+  def __init__(self, src, outputs):
+    self.fw_ops = dict()
+    self.src = src
+    self.outputs = outputs
+
+  def D(self, name):
+    pass
+
+  def __call__(self, *args, **kwargs):
+    # recompilation key (must be hashable and stable across calls)
+    key = tuple(sorted((k, str(v)) for k, v in kwargs.items()))
+    # create a new op when the defines are different
+    if key not in self.fw_ops:
+      # code generation options
+      defines = []
+      for k, v in kwargs.items():
+        try:
+          values = list(map(_cvt_to_def_str, v))
+        except TypeError:
+          values = [_cvt_to_def_str(v)]
+        defines.append((k, values))
+      opt = libtriton.options_space()
+      opt.defines = defines
+      opt.num_warps = [1, 2, 4, 8]
+      # register framework op
+      id = libtriton.register_fn(self.src, opt)
+      self.fw_ops[key] = (self._make_tensorflow_op(self.src, self.outputs, opt), id)
+    # retrieve framework op
+    op, id = self.fw_ops[key]
+    libtriton.register_grid(id, args[-1])
+    op_args = args[:-1]
+    return op(*op_args, id=id)
+
 
 def make_tensorflow_op(src, outputs, grids):
-  bindings = make_bindings(src, outputs, grids)
-  cache_path = make_cache_path(bindings)
-  cpp, so = write_bindings(bindings, cache_path)
+  src, name = make_bindings(src, outputs, grids)
+  cache_path = make_cache_path(src)
+  cpp, so = write_bindings(src, cache_path)
   build(cpp, cache_path)
   result = tf.load_op_library(so)
-  return result
+  return result.__dict__[name]
 
 def empty(shapes):
   return extra_ops.alloc_empty(tf.stack(shapes))
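`op.__call__` memoizes one generated TensorFlow op per distinct set of keyword arguments: every kwarg is expanded into a list of candidate preprocessor defines, so iterables enumerate an autotuning space while scalars become single candidates. A runnable sketch of that flattening, with the `tf.DType` branch elided (illustration, not library code):

```python
def _cvt_to_def_str(obj):
    # simplified copy of the helper above, minus the tf.DType mapping
    if isinstance(obj, bool):
        return str(int(obj))
    return str(obj)

def to_defines(**kwargs):
    defines = []
    for k, v in kwargs.items():
        try:                               # iterable -> list of candidates
            values = list(map(_cvt_to_def_str, v))
        except TypeError:                  # scalar -> single candidate
            values = [_cvt_to_def_str(v)]
        defines.append((k, values))
    return defines

print(to_defines(AT=False, BT=True, TM=[32, 64, 128]))
# [('AT', ['0']), ('BT', ['1']), ('TM', ['32', '64', '128'])]
```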
\n"; } void gen_register_kernel_builder(std::ostream &os, const std::string &name, @@ -230,7 +239,9 @@ using GPUDevice = Eigen::GpuDevice; namespace rt = triton::runtime; namespace drv = triton::driver; -extern std::map id_grid_map; +typedef std::vector tf_grid_t; +typedef std::function tf_grid_fn_ty; +extern std::map id_grid_map; extern std::map id_fn_map; From 4075949f80737b8d995725e89e9c8fc5e2d458bd Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 26 Aug 2019 16:53:49 -0700 Subject: [PATCH 329/494] [python] basic tensorflow wrapper working --- CMakeLists.txt | 2 +- examples/cpp/dot.cc | 6 +- lib/codegen/analysis/tune.cc | 9 +- lib/runtime/function.cc | 19 +- python/examples/dot.py | 12 +- python/src/pybind11/attr.h | 10 +- python/src/pybind11/cast.h | 189 +++------ python/src/pybind11/complex.h | 4 - python/src/pybind11/detail/class.h | 9 +- python/src/pybind11/detail/common.h | 90 ++--- python/src/pybind11/detail/descr.h | 199 ++++++--- python/src/pybind11/detail/init.h | 2 +- python/src/pybind11/detail/internals.h | 16 +- python/src/pybind11/detail/typeid.h | 2 - python/src/pybind11/eigen.h | 53 ++- python/src/pybind11/embed.h | 8 +- python/src/pybind11/functional.h | 21 +- python/src/pybind11/iostream.h | 21 +- python/src/pybind11/numpy.h | 109 +++-- python/src/pybind11/pybind11.h | 489 +++++++---------------- python/src/pybind11/pytypes.h | 151 +------ python/src/pybind11/stl.h | 36 +- python/src/pybind11/stl_bind.h | 33 +- python/src/tensorflow.cc | 38 +- python/src/tensorflow/register_scalar.cc | 37 ++ python/triton/ops.py | 105 ++++- 26 files changed, 702 insertions(+), 968 deletions(-) create mode 100644 python/src/tensorflow/register_scalar.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 0616d19f1..637718fa6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,7 +41,7 @@ if(BUILD_PYTHON_MODULE) # extra tensorflow ops (e.g., alloc_empty) file(GLOB_RECURSE EXTRA_TF_OPS_SRC python/src/tensorflow/*.cc) add_library(extra_tf_ops SHARED ${EXTRA_TF_OPS_SRC}) - target_link_libraries(extra_tf_ops ${TF_LIBS}) + target_link_libraries(extra_tf_ops triton ${TF_LIBS}) endif() diff --git a/examples/cpp/dot.cc b/examples/cpp/dot.cc index acb00afc7..6e40f79d2 100644 --- a/examples/cpp/dot.cc +++ b/examples/cpp/dot.cc @@ -153,8 +153,8 @@ perf_t do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int opt.defines.push_back({"AT", {""}}); if(BT) opt.defines.push_back({"BT", {""}}); - opt.defines.push_back({"TM", {"128"}}); - opt.defines.push_back({"TN", {"128"}}); + opt.defines.push_back({"TM", {"32"}}); + opt.defines.push_back({"TN", {"32"}}); opt.defines.push_back({"TK", {"32"}}); opt.num_warps = {1, 2, 4, 8}; rt::function function(src, opt); @@ -208,7 +208,7 @@ int main() { // shapes to benchmark std::vector configs = { // {false, false, 8192, 512, 512}, - {false, false, 128, 128, 128} + {false, true, 128, 128, 128} // {false, true, 128, 128, 128}, // {false, false, 128, 128, 128}, // {true, false, 128, 128, 128}, diff --git a/lib/codegen/analysis/tune.cc b/lib/codegen/analysis/tune.cc index 5ff536849..fdb3741cc 100644 --- a/lib/codegen/analysis/tune.cc +++ b/lib/codegen/analysis/tune.cc @@ -282,7 +282,10 @@ void grids::run(ir::module &mod) { std::string str_d = std::to_string(d); effective_num_warps *= params_.at(i).at("wpt.d" + str_d)->get_value(); } - assert(num_warps_ == effective_num_warps); + + if(num_warps_ != effective_num_warps) + throw std::runtime_error("cannot create a kernel with this amount of warps"); + } /* Scan-line */ @@ -305,7 +308,9 @@ void 
From 4075949f80737b8d995725e89e9c8fc5e2d458bd Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Mon, 26 Aug 2019 16:53:49 -0700
Subject: [PATCH 329/494] [python] basic tensorflow wrapper working

---
 CMakeLists.txt                           |   2 +-
 examples/cpp/dot.cc                      |   6 +-
 lib/codegen/analysis/tune.cc             |   9 +-
 lib/runtime/function.cc                  |  19 +-
 python/examples/dot.py                   |  12 +-
 python/src/pybind11/attr.h               |  10 +-
 python/src/pybind11/cast.h               | 189 +++------
 python/src/pybind11/complex.h            |   4 -
 python/src/pybind11/detail/class.h       |   9 +-
 python/src/pybind11/detail/common.h      |  90 ++---
 python/src/pybind11/detail/descr.h       | 199 ++++++---
 python/src/pybind11/detail/init.h        |   2 +-
 python/src/pybind11/detail/internals.h   |  16 +-
 python/src/pybind11/detail/typeid.h      |   2 -
 python/src/pybind11/eigen.h              |  53 ++-
 python/src/pybind11/embed.h              |   8 +-
 python/src/pybind11/functional.h         |  21 +-
 python/src/pybind11/iostream.h           |  21 +-
 python/src/pybind11/numpy.h              | 109 +++--
 python/src/pybind11/pybind11.h           | 489 +++++++----------------
 python/src/pybind11/pytypes.h            | 151 +------
 python/src/pybind11/stl.h                |  36 +-
 python/src/pybind11/stl_bind.h           |  33 +-
 python/src/tensorflow.cc                 |  38 +-
 python/src/tensorflow/register_scalar.cc |  37 ++
 python/triton/ops.py                     | 105 ++++-
 26 files changed, 702 insertions(+), 968 deletions(-)
 create mode 100644 python/src/tensorflow/register_scalar.cc

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0616d19f1..637718fa6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,7 +41,7 @@ if(BUILD_PYTHON_MODULE)
   # extra tensorflow ops (e.g., alloc_empty)
   file(GLOB_RECURSE EXTRA_TF_OPS_SRC python/src/tensorflow/*.cc)
   add_library(extra_tf_ops SHARED ${EXTRA_TF_OPS_SRC})
-  target_link_libraries(extra_tf_ops ${TF_LIBS})
+  target_link_libraries(extra_tf_ops triton ${TF_LIBS})
 endif()
 
 
diff --git a/examples/cpp/dot.cc b/examples/cpp/dot.cc
index acb00afc7..6e40f79d2 100644
--- a/examples/cpp/dot.cc
+++ b/examples/cpp/dot.cc
@@ -153,8 +153,8 @@ perf_t do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K) {
     opt.defines.push_back({"AT", {""}});
   if(BT)
     opt.defines.push_back({"BT", {""}});
-  opt.defines.push_back({"TM", {"128"}});
-  opt.defines.push_back({"TN", {"128"}});
+  opt.defines.push_back({"TM", {"32"}});
+  opt.defines.push_back({"TN", {"32"}});
   opt.defines.push_back({"TK", {"32"}});
   opt.num_warps = {1, 2, 4, 8};
   rt::function function(src, opt);
@@ -208,7 +208,7 @@ int main() {
   // shapes to benchmark
   std::vector configs = {
 //    {false, false, 8192, 512, 512},
-    {false, false, 128, 128, 128}
+    {false, true, 128, 128, 128}
 //    {false, true, 128, 128, 128},
 //    {false, false, 128, 128, 128},
 //    {true, false, 128, 128, 128},
diff --git a/lib/codegen/analysis/tune.cc b/lib/codegen/analysis/tune.cc
index 5ff536849..fdb3741cc 100644
--- a/lib/codegen/analysis/tune.cc
+++ b/lib/codegen/analysis/tune.cc
@@ -282,7 +282,10 @@ void grids::run(ir::module &mod) {
       std::string str_d = std::to_string(d);
       effective_num_warps *= params_.at(i).at("wpt.d" + str_d)->get_value();
     }
-    assert(num_warps_ == effective_num_warps);
+
+    if(num_warps_ != effective_num_warps)
+      throw std::runtime_error("cannot create a kernel with this amount of warps");
+
  }
 
   /* Scan-line */
@@ -305,7 +308,9 @@ void grids::run(ir::module &mod) {
       std::string str_d = std::to_string(d);
       effective_num_threads *= params_.at(i).at("mts.d" + str_d)->get_value();
     }
-    assert(num_threads == effective_num_threads);
+
+    if(num_threads != effective_num_threads)
+      throw std::runtime_error("cannot create a kernel with this amount of threads");
   }
 }
diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc
index 750952bbb..fdc9b6d15 100644
--- a/lib/runtime/function.cc
+++ b/lib/runtime/function.cc
@@ -160,7 +160,12 @@ function::caller function::autotune(driver::stream* stream, const grid_fn_ty& grid_fn,
   // triton-ir code-gen
   auto ir = make_ir(parser);
   // binary code-gen
-  auto bin = make_bin(*ir, stream->context(), opt);
+  std::unique_ptr<driver::module> bin;
+  try{
+    bin = make_bin(*ir, stream->context(), opt);
+  }catch(const std::runtime_error& e) {
+    return;
+  }
   // benchmark
   ir::function *tmp = ir->get_function_list()[0];
   caller call(tmp, std::move(bin), opt);
@@ -177,21 +182,21 @@ function::caller function::autotune(driver::stream* stream, const grid_fn_ty& grid_fn,
 std::unique_ptr<driver::module> function::make_bin(ir::module &module, driver::context *context, const options_t& opt) {
   std::unique_ptr<codegen::target> target = context->device()->make_target();
   // create passes
-  codegen::analysis::grids tune(opt.num_warps);
+  codegen::analysis::grids grids(opt.num_warps);
   codegen::analysis::shmem::info shmem_info;
   codegen::analysis::shmem::liveness shmem_liveness(&shmem_info);
-  codegen::analysis::shmem::allocation shmem_allocation(&shmem_liveness, &shmem_info, &tune);
+  codegen::analysis::shmem::allocation shmem_allocation(&shmem_liveness, &shmem_info, &grids);
   codegen::analysis::alignment_info alignment_info;
   codegen::transform::shmem_barriers shmem_barriers(&shmem_allocation, &shmem_info);
-  codegen::transform::vectorize vectorize(&tune);
+  codegen::transform::vectorize vectorize(&grids);
   codegen::transform::dce dce;
   codegen::transform::peephole peephole;
-  codegen::transform::reassociate reassociate(&tune);
-  codegen::selection selection(&shmem_allocation, &tune, &shmem_info, &alignment_info, target.get());
+  codegen::transform::reassociate reassociate(&grids);
+  codegen::selection selection(&shmem_allocation, &grids, &shmem_info, &alignment_info, target.get());
   // run passes
   peephole.run(module);
   dce.run(module);
-  tune.run(module);
+  grids.run(module);
   reassociate.run(module);
   peephole.run(module);
   if(target->is_gpu()){
diff --git a/python/examples/dot.py b/python/examples/dot.py
index e807305e6..351a6d3dc 100644
--- a/python/examples/dot.py
+++ b/python/examples/dot.py
@@ -22,6 +22,8 @@ void dot(TYPE * A __noalias __readonly __aligned(16),
          int lda __multipleof(8),
          int ldb __multipleof(8),
          int ldc) {
+
+  /* prologue */
   int ridx = get_program_id(0);
   int ridy = get_program_id(1);
   int rxa[TM] = ridx * TM + 0 ...
TM; @@ -88,8 +90,8 @@ class dot: self.trans_b = trans_b def __call__(self, a, b): - shape_a = tf.shape(a) - shape_b = tf.shape(b) + shape_a = triton.shape(a) + shape_b = triton.shape(b) M = shape_a[0] K = shape_a[1] N = shape_b[0] @@ -98,9 +100,9 @@ class dot: ldc = N c = triton.empty([M, N]) return self.dot(a, b, c, M, N, K, lda, ldb, ldc, - lambda opt: [cdiv(M, opt.D('TM')), cdiv(N, opt.D('TN')), 1], + lambda opt: [cdiv(M, opt.d('TM')), cdiv(N, opt.d('TN'))], AT = self.trans_a, BT = self.trans_b, TYPE = tf.float16, - TM = [128], TN = [128], TK = [32]) + TM = [32, 64, 128], TN = [32, 64, 128], TK = [32]) dot_tn = dot() @@ -119,7 +121,7 @@ def run_dot(): result = sess.run([c], feed_dict = {a: ha, b: hb})[0] # Test - hresult = np.dot(ha.T, hb) + hresult = np.dot(ha.T, hb).T dif = np.abs(result - hresult) np.savetxt('dif.dat', dif, '%2.4f') print(hresult) diff --git a/python/src/pybind11/attr.h b/python/src/pybind11/attr.h index 6962d6fc5..dce875a6b 100644 --- a/python/src/pybind11/attr.h +++ b/python/src/pybind11/attr.h @@ -200,8 +200,7 @@ struct function_record { /// Special data structure which (temporarily) holds metadata about a bound class struct type_record { PYBIND11_NOINLINE type_record() - : multiple_inheritance(false), dynamic_attr(false), buffer_protocol(false), - default_holder(true), module_local(false) { } + : multiple_inheritance(false), dynamic_attr(false), buffer_protocol(false), module_local(false) { } /// Handle to the parent scope handle scope; @@ -215,14 +214,11 @@ struct type_record { /// How large is the underlying C++ type? size_t type_size = 0; - /// What is the alignment of the underlying C++ type? - size_t type_align = 0; - /// How large is the type's holder? size_t holder_size = 0; /// The global operator new can be overridden with a class-specific variant - void *(*operator_new)(size_t) = nullptr; + void *(*operator_new)(size_t) = ::operator new; /// Function pointer to class_<..>::init_instance void (*init_instance)(instance *, const void *) = nullptr; @@ -282,7 +278,7 @@ struct type_record { } }; -inline function_call::function_call(const function_record &f, handle p) : +inline function_call::function_call(function_record &f, handle p) : func(f), parent(p) { args.reserve(f.nargs); args_convert.reserve(f.nargs); diff --git a/python/src/pybind11/cast.h b/python/src/pybind11/cast.h index 8d0fd5d90..214545083 100644 --- a/python/src/pybind11/cast.h +++ b/python/src/pybind11/cast.h @@ -17,7 +17,6 @@ #include #include #include -#include #if defined(PYBIND11_CPP17) # if defined(__has_include) @@ -204,10 +203,10 @@ PYBIND11_NOINLINE inline handle get_type_handle(const std::type_info &tp, bool t } struct value_and_holder { - instance *inst = nullptr; - size_t index = 0u; - const detail::type_info *type = nullptr; - void **vh = nullptr; + instance *inst; + size_t index; + const detail::type_info *type; + void **vh; // Main constructor for a found value/holder: value_and_holder(instance *i, const detail::type_info *type, size_t vpos, size_t index) : @@ -216,7 +215,7 @@ struct value_and_holder { {} // Default constructor (used to signal a value-and-holder not found by get_value_and_holder()) - value_and_holder() {} + value_and_holder() : inst{nullptr} {} // Used for past-the-end iterator value_and_holder(size_t index) : index{index} {} @@ -270,8 +269,8 @@ public: struct iterator { private: - instance *inst = nullptr; - const type_vec *types = nullptr; + instance *inst; + const type_vec *types; value_and_holder curr; friend struct values_and_holders; iterator(instance 
*inst, const type_vec *tinfo) @@ -571,17 +570,7 @@ public: // Lazy allocation for unallocated values: if (vptr == nullptr) { auto *type = v_h.type ? v_h.type : typeinfo; - if (type->operator_new) { - vptr = type->operator_new(type->type_size); - } else { - #if defined(PYBIND11_CPP17) - if (type->type_align > __STDCPP_DEFAULT_NEW_ALIGNMENT__) - vptr = ::operator new(type->type_size, - (std::align_val_t) type->type_align); - else - #endif - vptr = ::operator new(type->type_size); - } + vptr = type->operator_new(type->type_size); } value = vptr; } @@ -785,47 +774,11 @@ template struct is_copy_constructible, is_copy_constructible> {}; #endif -NAMESPACE_END(detail) - -// polymorphic_type_hook::get(src, tinfo) determines whether the object pointed -// to by `src` actually is an instance of some class derived from `itype`. -// If so, it sets `tinfo` to point to the std::type_info representing that derived -// type, and returns a pointer to the start of the most-derived object of that type -// (in which `src` is a subobject; this will be the same address as `src` in most -// single inheritance cases). If not, or if `src` is nullptr, it simply returns `src` -// and leaves `tinfo` at its default value of nullptr. -// -// The default polymorphic_type_hook just returns src. A specialization for polymorphic -// types determines the runtime type of the passed object and adjusts the this-pointer -// appropriately via dynamic_cast. This is what enables a C++ Animal* to appear -// to Python as a Dog (if Dog inherits from Animal, Animal is polymorphic, Dog is -// registered with pybind11, and this Animal is in fact a Dog). -// -// You may specialize polymorphic_type_hook yourself for types that want to appear -// polymorphic to Python but do not use C++ RTTI. (This is a not uncommon pattern -// in performance-sensitive applications, used most notably in LLVM.) -template -struct polymorphic_type_hook -{ - static const void *get(const itype *src, const std::type_info*&) { return src; } -}; -template -struct polymorphic_type_hook::value>> -{ - static const void *get(const itype *src, const std::type_info*& type) { - type = src ? &typeid(*src) : nullptr; - return dynamic_cast(src); - } -}; - -NAMESPACE_BEGIN(detail) - /// Generic type caster for objects stored on the heap template class type_caster_base : public type_caster_generic { using itype = intrinsic_t; - public: - static constexpr auto name = _(); + static PYBIND11_DESCR name() { return type_descr(_()); } type_caster_base() : type_caster_base(typeid(type)) { } explicit type_caster_base(const std::type_info &info) : type_caster_generic(info) { } @@ -840,28 +793,32 @@ public: return cast(&src, return_value_policy::move, parent); } - // Returns a (pointer, type_info) pair taking care of necessary type lookup for a - // polymorphic type (using RTTI by default, but can be overridden by specializing - // polymorphic_type_hook). If the instance isn't derived, returns the base version. + // Returns a (pointer, type_info) pair taking care of necessary RTTI type lookup for a + // polymorphic type. If the instance isn't derived, returns the non-RTTI base version. + template ::value, int> = 0> static std::pair src_and_type(const itype *src) { + const void *vsrc = src; auto &cast_type = typeid(itype); const std::type_info *instance_type = nullptr; - const void *vsrc = polymorphic_type_hook::get(src, instance_type); - if (instance_type && !same_type(cast_type, *instance_type)) { - // This is a base pointer to a derived type. 
If the derived type is registered - // with pybind11, we want to make the full derived object available. - // In the typical case where itype is polymorphic, we get the correct - // derived pointer (which may be != base pointer) by a dynamic_cast to - // most derived type. If itype is not polymorphic, we won't get here - // except via a user-provided specialization of polymorphic_type_hook, - // and the user has promised that no this-pointer adjustment is - // required in that case, so it's OK to use static_cast. - if (const auto *tpi = get_type_info(*instance_type)) - return {vsrc, tpi}; + if (vsrc) { + instance_type = &typeid(*src); + if (!same_type(cast_type, *instance_type)) { + // This is a base pointer to a derived type; if it is a pybind11-registered type, we + // can get the correct derived pointer (which may be != base pointer) by a + // dynamic_cast to most derived type: + if (auto *tpi = get_type_info(*instance_type)) + return {dynamic_cast(src), const_cast(tpi)}; + } } // Otherwise we have either a nullptr, an `itype` pointer, or an unknown derived pointer, so // don't do a cast - return type_caster_generic::src_and_type(src, cast_type, instance_type); + return type_caster_generic::src_and_type(vsrc, cast_type, instance_type); + } + + // Non-polymorphic type, so no dynamic casting; just call the generic version directly + template ::value, int> = 0> + static std::pair src_and_type(const itype *src) { + return type_caster_generic::src_and_type(src, typeid(itype)); } static handle cast(const itype *src, return_value_policy policy, handle parent) { @@ -878,7 +835,7 @@ public: nullptr, nullptr, holder); } - template using cast_op_type = detail::cast_op_type; + template using cast_op_type = cast_op_type; operator itype*() { return (type *) value; } operator itype&() { if (!value) throw reference_cast_error(); return *((itype *) value); } @@ -928,7 +885,7 @@ private: "std::reference_wrapper caster requires T to have a caster with an `T &` operator"); public: bool load(handle src, bool convert) { return subcaster.load(src, convert); } - static constexpr auto name = caster_t::name; + static PYBIND11_DESCR name() { return caster_t::name(); } static handle cast(const std::reference_wrapper &src, return_value_policy policy, handle parent) { // It is definitely wrong to take ownership of this pointer, so mask that rvp if (policy == return_value_policy::take_ownership || policy == return_value_policy::automatic) @@ -943,7 +900,7 @@ public: protected: \ type value; \ public: \ - static constexpr auto name = py_name; \ + static PYBIND11_DESCR name() { return type_descr(py_name); } \ template >::value, int> = 0> \ static handle cast(T_ *src, return_value_policy policy, handle parent) { \ if (!src) return none().release(); \ @@ -1020,34 +977,20 @@ public: return true; } - template - static typename std::enable_if::value, handle>::type - cast(U src, return_value_policy /* policy */, handle /* parent */) { - return PyFloat_FromDouble((double) src); - } - - template - static typename std::enable_if::value && std::is_signed::value && (sizeof(U) <= sizeof(long)), handle>::type - cast(U src, return_value_policy /* policy */, handle /* parent */) { - return PYBIND11_LONG_FROM_SIGNED((long) src); - } - - template - static typename std::enable_if::value && std::is_unsigned::value && (sizeof(U) <= sizeof(unsigned long)), handle>::type - cast(U src, return_value_policy /* policy */, handle /* parent */) { - return PYBIND11_LONG_FROM_UNSIGNED((unsigned long) src); - } - - template - static typename 
std::enable_if::value && std::is_signed::value && (sizeof(U) > sizeof(long)), handle>::type - cast(U src, return_value_policy /* policy */, handle /* parent */) { - return PyLong_FromLongLong((long long) src); - } - - template - static typename std::enable_if::value && std::is_unsigned::value && (sizeof(U) > sizeof(unsigned long)), handle>::type - cast(U src, return_value_policy /* policy */, handle /* parent */) { - return PyLong_FromUnsignedLongLong((unsigned long long) src); + static handle cast(T src, return_value_policy /* policy */, handle /* parent */) { + if (std::is_floating_point::value) { + return PyFloat_FromDouble((double) src); + } else if (sizeof(T) <= sizeof(long)) { + if (std::is_signed::value) + return PyLong_FromLong((long) src); + else + return PyLong_FromUnsignedLong((unsigned long) src); + } else { + if (std::is_signed::value) + return PyLong_FromLongLong((long long) src); + else + return PyLong_FromUnsignedLongLong((unsigned long long) src); + } } PYBIND11_TYPE_CASTER(T, _::value>("int", "float")); @@ -1106,7 +1049,7 @@ public: template using cast_op_type = void*&; operator void *&() { return value; } - static constexpr auto name = _("capsule"); + static PYBIND11_DESCR name() { return type_descr(_("capsule")); } private: void *value = nullptr; }; @@ -1349,7 +1292,7 @@ public: return one_char; } - static constexpr auto name = _(PYBIND11_STRING_NAME); + static PYBIND11_DESCR name() { return type_descr(_(PYBIND11_STRING_NAME)); } template using cast_op_type = pybind11::detail::cast_op_type<_T>; }; @@ -1374,7 +1317,9 @@ public: return cast_impl(std::forward(src), policy, parent, indices{}); } - static constexpr auto name = _("Tuple[") + concat(make_caster::name...) + _("]"); + static PYBIND11_DESCR name() { + return type_descr(_("Tuple[") + detail::concat(make_caster::name()...) + _("]")); + } template using cast_op_type = type; @@ -1519,7 +1464,7 @@ struct move_only_holder_caster { auto *ptr = holder_helper::get(src); return type_caster_base::cast_holder(ptr, std::addressof(src)); } - static constexpr auto name = type_caster_base::name; + static PYBIND11_DESCR name() { return type_caster_base::name(); } }; template @@ -1550,10 +1495,10 @@ template struct is_holder_type : template struct is_holder_type> : std::true_type {}; -template struct handle_type_name { static constexpr auto name = _(); }; -template <> struct handle_type_name { static constexpr auto name = _(PYBIND11_BYTES_NAME); }; -template <> struct handle_type_name { static constexpr auto name = _("*args"); }; -template <> struct handle_type_name { static constexpr auto name = _("**kwargs"); }; +template struct handle_type_name { static PYBIND11_DESCR name() { return _(); } }; +template <> struct handle_type_name { static PYBIND11_DESCR name() { return _(PYBIND11_BYTES_NAME); } }; +template <> struct handle_type_name { static PYBIND11_DESCR name() { return _("*args"); } }; +template <> struct handle_type_name { static PYBIND11_DESCR name() { return _("**kwargs"); } }; template struct pyobject_caster { @@ -1571,7 +1516,7 @@ struct pyobject_caster { static handle cast(const handle &src, return_value_policy /* policy */, handle /* parent */) { return src.inc_ref(); } - PYBIND11_TYPE_CASTER(type, handle_type_name::name); + PYBIND11_TYPE_CASTER(type, handle_type_name::name()); }; template @@ -1611,8 +1556,7 @@ template using move_never = none_of, move_if_unrefer // everything else returns a reference/pointer to a local variable. 
template using cast_is_temporary_value_reference = bool_constant< (std::is_reference::value || std::is_pointer::value) && - !std::is_base_of>::value && - !std::is_same, void>::value + !std::is_base_of>::value >; // When a value returned from a C++ function is being cast back to Python, we almost always want to @@ -1625,9 +1569,8 @@ template struct return_value_policy_ov template struct return_value_policy_override>::value, void>> { static return_value_policy policy(return_value_policy p) { - return !std::is_lvalue_reference::value && - !std::is_pointer::value - ? return_value_policy::move : p; + return !std::is_lvalue_reference::value && !std::is_pointer::value + ? return_value_policy::move : p; } }; @@ -1855,7 +1798,7 @@ struct function_record; /// Internal data associated with a single function call struct function_call { - function_call(const function_record &f, handle p); // Implementation in attr.h + function_call(function_record &f, handle p); // Implementation in attr.h /// The function data: const function_record &func; @@ -1897,7 +1840,7 @@ public: static constexpr bool has_kwargs = kwargs_pos < 0; static constexpr bool has_args = args_pos < 0; - static constexpr auto arg_names = concat(type_descr(make_caster::name)...); + static PYBIND11_DESCR arg_names() { return detail::concat(make_caster::name()...); } bool load_args(function_call &call) { return load_impl_sequence(call, indices{}); @@ -2116,13 +2059,9 @@ object object_api::call(Args &&...args) const { NAMESPACE_END(detail) -#define PYBIND11_MAKE_OPAQUE(...) \ +#define PYBIND11_MAKE_OPAQUE(Type) \ namespace pybind11 { namespace detail { \ - template<> class type_caster<__VA_ARGS__> : public type_caster_base<__VA_ARGS__> { }; \ + template<> class type_caster : public type_caster_base { }; \ }} -/// Lets you pass a type containing a `,` through a macro parameter without needing a separate -/// typedef, e.g.: `PYBIND11_OVERLOAD(PYBIND11_TYPE(ReturnType), PYBIND11_TYPE(Parent), f, arg)` -#define PYBIND11_TYPE(...) 
__VA_ARGS__ - NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/python/src/pybind11/complex.h b/python/src/pybind11/complex.h index 3f8963857..5dac27cc4 100644 --- a/python/src/pybind11/complex.h +++ b/python/src/pybind11/complex.h @@ -25,13 +25,9 @@ template struct format_descriptor, detail::enable_i static std::string format() { return std::string(value); } }; -#ifndef PYBIND11_CPP17 - template constexpr const char format_descriptor< std::complex, detail::enable_if_t::value>>::value[3]; -#endif - NAMESPACE_BEGIN(detail) template struct is_fmt_numeric, detail::enable_if_t::value>> { diff --git a/python/src/pybind11/detail/class.h b/python/src/pybind11/detail/class.h index b1916fcd0..ff06370fa 100644 --- a/python/src/pybind11/detail/class.h +++ b/python/src/pybind11/detail/class.h @@ -10,7 +10,6 @@ #pragma once #include "../attr.h" -#include "../options.h" NAMESPACE_BEGIN(PYBIND11_NAMESPACE) NAMESPACE_BEGIN(detail) @@ -290,9 +289,13 @@ extern "C" inline int pybind11_object_init(PyObject *self, PyObject *, PyObject inline void add_patient(PyObject *nurse, PyObject *patient) { auto &internals = get_internals(); auto instance = reinterpret_cast(nurse); + auto ¤t_patients = internals.patients[nurse]; instance->has_patients = true; + for (auto &p : current_patients) + if (p == patient) + return; Py_INCREF(patient); - internals.patients[nurse].push_back(patient); + current_patients.push_back(patient); } inline void clear_patients(PyObject *self) { @@ -469,7 +472,7 @@ extern "C" inline int pybind11_getbuffer(PyObject *obj, Py_buffer *view, int fla if (tinfo && tinfo->get_buffer) break; } - if (view == nullptr || !tinfo || !tinfo->get_buffer) { + if (view == nullptr || obj == nullptr || !tinfo || !tinfo->get_buffer) { if (view) view->obj = nullptr; PyErr_SetString(PyExc_BufferError, "pybind11_getbuffer(): Internal error"); diff --git a/python/src/pybind11/detail/common.h b/python/src/pybind11/detail/common.h index bec8ccf3b..892de0f8f 100644 --- a/python/src/pybind11/detail/common.h +++ b/python/src/pybind11/detail/common.h @@ -93,8 +93,8 @@ #endif #define PYBIND11_VERSION_MAJOR 2 -#define PYBIND11_VERSION_MINOR 3 -#define PYBIND11_VERSION_PATCH 0 +#define PYBIND11_VERSION_MINOR 2 +#define PYBIND11_VERSION_PATCH 4 /// Include Python header, disable linking to pythonX_d.lib on Windows in debug mode #if defined(_MSC_VER) @@ -159,8 +159,6 @@ #define PYBIND11_BYTES_SIZE PyBytes_Size #define PYBIND11_LONG_CHECK(o) PyLong_Check(o) #define PYBIND11_LONG_AS_LONGLONG(o) PyLong_AsLongLong(o) -#define PYBIND11_LONG_FROM_SIGNED(o) PyLong_FromSsize_t((ssize_t) o) -#define PYBIND11_LONG_FROM_UNSIGNED(o) PyLong_FromSize_t((size_t) o) #define PYBIND11_BYTES_NAME "bytes" #define PYBIND11_STRING_NAME "str" #define PYBIND11_SLICE_OBJECT PyObject @@ -183,8 +181,6 @@ #define PYBIND11_BYTES_SIZE PyString_Size #define PYBIND11_LONG_CHECK(o) (PyInt_Check(o) || PyLong_Check(o)) #define PYBIND11_LONG_AS_LONGLONG(o) (PyInt_Check(o) ? (long long) PyLong_AsLong(o) : PyLong_AsLongLong(o)) -#define PYBIND11_LONG_FROM_SIGNED(o) PyInt_FromSsize_t((ssize_t) o) // Returns long if needed. -#define PYBIND11_LONG_FROM_UNSIGNED(o) PyInt_FromSize_t((size_t) o) // Returns long if needed. 
#define PYBIND11_BYTES_NAME "str" #define PYBIND11_STRING_NAME "unicode" #define PYBIND11_SLICE_OBJECT PySliceObject @@ -212,31 +208,6 @@ extern "C" { #define PYBIND11_TOSTRING(x) PYBIND11_STRINGIFY(x) #define PYBIND11_CONCAT(first, second) first##second -#define PYBIND11_CHECK_PYTHON_VERSION \ - { \ - const char *compiled_ver = PYBIND11_TOSTRING(PY_MAJOR_VERSION) \ - "." PYBIND11_TOSTRING(PY_MINOR_VERSION); \ - const char *runtime_ver = Py_GetVersion(); \ - size_t len = std::strlen(compiled_ver); \ - if (std::strncmp(runtime_ver, compiled_ver, len) != 0 \ - || (runtime_ver[len] >= '0' && runtime_ver[len] <= '9')) { \ - PyErr_Format(PyExc_ImportError, \ - "Python version mismatch: module was compiled for Python %s, " \ - "but the interpreter version is incompatible: %s.", \ - compiled_ver, runtime_ver); \ - return nullptr; \ - } \ - } - -#define PYBIND11_CATCH_INIT_EXCEPTIONS \ - catch (pybind11::error_already_set &e) { \ - PyErr_SetString(PyExc_ImportError, e.what()); \ - return nullptr; \ - } catch (const std::exception &e) { \ - PyErr_SetString(PyExc_ImportError, e.what()); \ - return nullptr; \ - } \ - /** \rst ***Deprecated in favor of PYBIND11_MODULE*** @@ -256,10 +227,27 @@ extern "C" { PYBIND11_DEPRECATED("PYBIND11_PLUGIN is deprecated, use PYBIND11_MODULE") \ static PyObject *pybind11_init(); \ PYBIND11_PLUGIN_IMPL(name) { \ - PYBIND11_CHECK_PYTHON_VERSION \ + int major, minor; \ + if (sscanf(Py_GetVersion(), "%i.%i", &major, &minor) != 2) { \ + PyErr_SetString(PyExc_ImportError, "Can't parse Python version."); \ + return nullptr; \ + } else if (major != PY_MAJOR_VERSION || minor != PY_MINOR_VERSION) { \ + PyErr_Format(PyExc_ImportError, \ + "Python version mismatch: module was compiled for " \ + "version %i.%i, while the interpreter is running " \ + "version %i.%i.", PY_MAJOR_VERSION, PY_MINOR_VERSION, \ + major, minor); \ + return nullptr; \ + } \ try { \ return pybind11_init(); \ - } PYBIND11_CATCH_INIT_EXCEPTIONS \ + } catch (pybind11::error_already_set &e) { \ + PyErr_SetString(PyExc_ImportError, e.what()); \ + return nullptr; \ + } catch (const std::exception &e) { \ + PyErr_SetString(PyExc_ImportError, e.what()); \ + return nullptr; \ + } \ } \ PyObject *pybind11_init() @@ -283,12 +271,29 @@ extern "C" { #define PYBIND11_MODULE(name, variable) \ static void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &); \ PYBIND11_PLUGIN_IMPL(name) { \ - PYBIND11_CHECK_PYTHON_VERSION \ + int major, minor; \ + if (sscanf(Py_GetVersion(), "%i.%i", &major, &minor) != 2) { \ + PyErr_SetString(PyExc_ImportError, "Can't parse Python version."); \ + return nullptr; \ + } else if (major != PY_MAJOR_VERSION || minor != PY_MINOR_VERSION) { \ + PyErr_Format(PyExc_ImportError, \ + "Python version mismatch: module was compiled for " \ + "version %i.%i, while the interpreter is running " \ + "version %i.%i.", PY_MAJOR_VERSION, PY_MINOR_VERSION, \ + major, minor); \ + return nullptr; \ + } \ auto m = pybind11::module(PYBIND11_TOSTRING(name)); \ try { \ PYBIND11_CONCAT(pybind11_init_, name)(m); \ return m.ptr(); \ - } PYBIND11_CATCH_INIT_EXCEPTIONS \ + } catch (pybind11::error_already_set &e) { \ + PyErr_SetString(PyExc_ImportError, e.what()); \ + return nullptr; \ + } catch (const std::exception &e) { \ + PyErr_SetString(PyExc_ImportError, e.what()); \ + return nullptr; \ + } \ } \ void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &variable) @@ -386,7 +391,7 @@ struct instance { void *simple_value_holder[1 + instance_simple_holder_in_ptrs()]; nonsimple_values_and_holders nonsimple; 
}; - /// Weak references + /// Weak references (needed for keep alive): PyObject *weakrefs; /// If true, the pointer is owned which means we're free to manage it with a holder. bool owned : 1; @@ -403,10 +408,10 @@ struct instance { * (which is typically the size of two pointers), or when multiple inheritance is used on the * python side. Non-simple layout allocates the required amount of memory to have multiple * bound C++ classes as parents. Under this layout, `nonsimple.values_and_holders` is set to a - * pointer to allocated space of the required space to hold a sequence of value pointers and + * pointer to allocated space of the required space to hold a a sequence of value pointers and * holders followed `status`, a set of bit flags (1 byte each), i.e. * [val1*][holder1][val2*][holder2]...[bb...] where each [block] is rounded up to a multiple of - * `sizeof(void *)`. `nonsimple.status` is, for convenience, a pointer to the + * `sizeof(void *)`. `nonsimple.holder_constructed` is, for convenience, a pointer to the * beginning of the [bb...] block (but not independently allocated). * * Status bits indicate whether the associated holder is constructed (& @@ -579,11 +584,6 @@ template using deferred_t = typename deferred_type< template using is_strict_base_of = bool_constant< std::is_base_of::value && !std::is_same::value>; -/// Like is_base_of, but also requires that the base type is accessible (i.e. that a Derived pointer -/// can be converted to a Base pointer) -template using is_accessible_base_of = bool_constant< - std::is_base_of::value && std::is_convertible::value>; - template class Base> struct is_template_base_of_impl { template static std::true_type check(Base *); @@ -702,13 +702,9 @@ template struct format_descriptor constexpr const char format_descriptor< T, detail::enable_if_t::value>>::value[2]; -#endif - /// RAII wrapper that temporarily clears any Python error state struct error_scope { PyObject *type, *value, *trace; diff --git a/python/src/pybind11/detail/descr.h b/python/src/pybind11/detail/descr.h index 8d404e534..e3bf2ba97 100644 --- a/python/src/pybind11/detail/descr.h +++ b/python/src/pybind11/detail/descr.h @@ -1,5 +1,6 @@ /* - pybind11/detail/descr.h: Helper type for concatenating type signatures at compile time + pybind11/detail/descr.h: Helper type for concatenating type signatures + either at runtime (C++11) or compile time (C++14) Copyright (c) 2016 Wenzel Jakob @@ -14,87 +15,171 @@ NAMESPACE_BEGIN(PYBIND11_NAMESPACE) NAMESPACE_BEGIN(detail) -#if !defined(_MSC_VER) -# define PYBIND11_DESCR_CONSTEXPR static constexpr -#else -# define PYBIND11_DESCR_CONSTEXPR const -#endif +/* Concatenate type signatures at compile time using C++14 */ +#if defined(PYBIND11_CPP14) && !defined(_MSC_VER) +#define PYBIND11_CONSTEXPR_DESCR -/* Concatenate type signatures at compile time */ -template -struct descr { - char text[N + 1]; +template class descr { + template friend class descr; +public: + constexpr descr(char const (&text) [Size1+1], const std::type_info * const (&types)[Size2+1]) + : descr(text, types, + make_index_sequence(), + make_index_sequence()) { } - constexpr descr() : text{'\0'} { } - constexpr descr(char const (&s)[N+1]) : descr(s, make_index_sequence()) { } + constexpr const char *text() const { return m_text; } + constexpr const std::type_info * const * types() const { return m_types; } - template - constexpr descr(char const (&s)[N+1], index_sequence) : text{s[Is]..., '\0'} { } - - template - constexpr descr(char c, Chars... 
cs) : text{c, static_cast(cs)..., '\0'} { } - - static constexpr std::array types() { - return {{&typeid(Ts)..., nullptr}}; + template + constexpr descr operator+(const descr &other) const { + return concat(other, + make_index_sequence(), + make_index_sequence(), + make_index_sequence(), + make_index_sequence()); } + +protected: + template + constexpr descr( + char const (&text) [Size1+1], + const std::type_info * const (&types) [Size2+1], + index_sequence, index_sequence) + : m_text{text[Indices1]..., '\0'}, + m_types{types[Indices2]..., nullptr } {} + + template + constexpr descr + concat(const descr &other, + index_sequence, index_sequence, + index_sequence, index_sequence) const { + return descr( + { m_text[Indices1]..., other.m_text[OtherIndices1]..., '\0' }, + { m_types[Indices2]..., other.m_types[OtherIndices2]..., nullptr } + ); + } + +protected: + char m_text[Size1 + 1]; + const std::type_info * m_types[Size2 + 1]; }; -template -constexpr descr plus_impl(const descr &a, const descr &b, - index_sequence, index_sequence) { - return {a.text[Is1]..., b.text[Is2]...}; +template constexpr descr _(char const(&text)[Size]) { + return descr(text, { nullptr }); } -template -constexpr descr operator+(const descr &a, const descr &b) { - return plus_impl(a, b, make_index_sequence(), make_index_sequence()); -} - -template -constexpr descr _(char const(&text)[N]) { return descr(text); } -constexpr descr<0> _(char const(&)[1]) { return {}; } - template struct int_to_str : int_to_str { }; template struct int_to_str<0, Digits...> { - static constexpr auto digits = descr(('0' + Digits)...); + static constexpr auto digits = descr({ ('0' + Digits)..., '\0' }, { nullptr }); }; // Ternary description (like std::conditional) -template -constexpr enable_if_t> _(char const(&text1)[N1], char const(&)[N2]) { +template +constexpr enable_if_t> _(char const(&text1)[Size1], char const(&)[Size2]) { return _(text1); } -template -constexpr enable_if_t> _(char const(&)[N1], char const(&text2)[N2]) { +template +constexpr enable_if_t> _(char const(&)[Size1], char const(&text2)[Size2]) { return _(text2); } - -template -constexpr enable_if_t _(const T1 &d, const T2 &) { return d; } -template -constexpr enable_if_t _(const T1 &, const T2 &d) { return d; } +template +constexpr enable_if_t> _(descr d, descr) { return d; } +template +constexpr enable_if_t> _(descr, descr d) { return d; } template auto constexpr _() -> decltype(int_to_str::digits) { return int_to_str::digits; } -template constexpr descr<1, Type> _() { return {'%'}; } - -constexpr descr<0> concat() { return {}; } - -template -constexpr descr concat(const descr &descr) { return descr; } - -template -constexpr auto concat(const descr &d, const Args &...args) - -> decltype(std::declval>() + concat(args...)) { - return d + _(", ") + concat(args...); +template constexpr descr<1, 1> _() { + return descr<1, 1>({ '%', '\0' }, { &typeid(Type), nullptr }); } -template -constexpr descr type_descr(const descr &descr) { - return _("{") + descr + _("}"); +inline constexpr descr<0, 0> concat() { return _(""); } +template auto constexpr concat(descr descr) { return descr; } +template auto constexpr concat(descr descr, Args&&... 
args) { return descr + _(", ") + concat(args...); } +template auto constexpr type_descr(descr descr) { return _("{") + descr + _("}"); } + +#define PYBIND11_DESCR constexpr auto + +#else /* Simpler C++11 implementation based on run-time memory allocation and copying */ + +class descr { +public: + PYBIND11_NOINLINE descr(const char *text, const std::type_info * const * types) { + size_t nChars = len(text), nTypes = len(types); + m_text = new char[nChars]; + m_types = new const std::type_info *[nTypes]; + memcpy(m_text, text, nChars * sizeof(char)); + memcpy(m_types, types, nTypes * sizeof(const std::type_info *)); + } + + PYBIND11_NOINLINE descr operator+(descr &&d2) && { + descr r; + + size_t nChars1 = len(m_text), nTypes1 = len(m_types); + size_t nChars2 = len(d2.m_text), nTypes2 = len(d2.m_types); + + r.m_text = new char[nChars1 + nChars2 - 1]; + r.m_types = new const std::type_info *[nTypes1 + nTypes2 - 1]; + memcpy(r.m_text, m_text, (nChars1-1) * sizeof(char)); + memcpy(r.m_text + nChars1 - 1, d2.m_text, nChars2 * sizeof(char)); + memcpy(r.m_types, m_types, (nTypes1-1) * sizeof(std::type_info *)); + memcpy(r.m_types + nTypes1 - 1, d2.m_types, nTypes2 * sizeof(std::type_info *)); + + delete[] m_text; delete[] m_types; + delete[] d2.m_text; delete[] d2.m_types; + + return r; + } + + char *text() { return m_text; } + const std::type_info * * types() { return m_types; } + +protected: + PYBIND11_NOINLINE descr() { } + + template static size_t len(const T *ptr) { // return length including null termination + const T *it = ptr; + while (*it++ != (T) 0) + ; + return static_cast(it - ptr); + } + + const std::type_info **m_types = nullptr; + char *m_text = nullptr; +}; + +/* The 'PYBIND11_NOINLINE inline' combinations below are intentional to get the desired linkage while producing as little object code as possible */ + +PYBIND11_NOINLINE inline descr _(const char *text) { + const std::type_info *types[1] = { nullptr }; + return descr(text, types); } +template PYBIND11_NOINLINE enable_if_t _(const char *text1, const char *) { return _(text1); } +template PYBIND11_NOINLINE enable_if_t _(char const *, const char *text2) { return _(text2); } +template PYBIND11_NOINLINE enable_if_t _(descr d, descr) { return d; } +template PYBIND11_NOINLINE enable_if_t _(descr, descr d) { return d; } + +template PYBIND11_NOINLINE descr _() { + const std::type_info *types[2] = { &typeid(Type), nullptr }; + return descr("%", types); +} + +template PYBIND11_NOINLINE descr _() { + const std::type_info *types[1] = { nullptr }; + return descr(std::to_string(Size).c_str(), types); +} + +PYBIND11_NOINLINE inline descr concat() { return _(""); } +PYBIND11_NOINLINE inline descr concat(descr &&d) { return d; } +template PYBIND11_NOINLINE descr concat(descr &&d, Args&&... 
args) { return std::move(d) + _(", ") + concat(std::forward(args)...); } +PYBIND11_NOINLINE inline descr type_descr(descr&& d) { return _("{") + std::move(d) + _("}"); } + +#define PYBIND11_DESCR ::pybind11::detail::descr +#endif + NAMESPACE_END(detail) NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/python/src/pybind11/detail/init.h b/python/src/pybind11/detail/init.h index acfe00bdb..82f740760 100644 --- a/python/src/pybind11/detail/init.h +++ b/python/src/pybind11/detail/init.h @@ -24,7 +24,7 @@ public: template using cast_op_type = value_and_holder &; operator value_and_holder &() { return *value; } - static constexpr auto name = _(); + static PYBIND11_DESCR name() { return type_descr(_()); } private: value_and_holder *value = nullptr; diff --git a/python/src/pybind11/detail/internals.h b/python/src/pybind11/detail/internals.h index f1dd38764..78d4afed0 100644 --- a/python/src/pybind11/detail/internals.h +++ b/python/src/pybind11/detail/internals.h @@ -23,7 +23,7 @@ inline PyObject *make_object_base_type(PyTypeObject *metaclass); #if PY_VERSION_HEX >= 0x03070000 # define PYBIND11_TLS_KEY_INIT(var) Py_tss_t *var = nullptr # define PYBIND11_TLS_GET_VALUE(key) PyThread_tss_get((key)) -# define PYBIND11_TLS_REPLACE_VALUE(key, value) PyThread_tss_set((key), (value)) +# define PYBIND11_TLS_REPLACE_VALUE(key, value) PyThread_tss_set((key), (tstate)) # define PYBIND11_TLS_DELETE_VALUE(key) PyThread_tss_set((key), nullptr) #else // Usually an int but a long on Cygwin64 with Python 3.x @@ -116,7 +116,7 @@ struct internals { struct type_info { PyTypeObject *type; const std::type_info *cpptype; - size_t type_size, type_align, holder_size_in_ptrs; + size_t type_size, holder_size_in_ptrs; void *(*operator_new)(size_t); void (*init_instance)(instance *, const void *); void (*dealloc)(value_and_holder &v_h); @@ -138,13 +138,7 @@ struct type_info { }; /// Tracks the `internals` and `type_info` ABI version independent of the main library version -#define PYBIND11_INTERNALS_VERSION 3 - -#if defined(_DEBUG) -# define PYBIND11_BUILD_TYPE "_debug" -#else -# define PYBIND11_BUILD_TYPE "" -#endif +#define PYBIND11_INTERNALS_VERSION 2 #if defined(WITH_THREAD) # define PYBIND11_INTERNALS_KIND "" @@ -153,10 +147,10 @@ struct type_info { #endif #define PYBIND11_INTERNALS_ID "__pybind11_internals_v" \ - PYBIND11_TOSTRING(PYBIND11_INTERNALS_VERSION) PYBIND11_INTERNALS_KIND PYBIND11_BUILD_TYPE "__" + PYBIND11_TOSTRING(PYBIND11_INTERNALS_VERSION) PYBIND11_INTERNALS_KIND "__" #define PYBIND11_MODULE_LOCAL_ID "__pybind11_module_local_v" \ - PYBIND11_TOSTRING(PYBIND11_INTERNALS_VERSION) PYBIND11_INTERNALS_KIND PYBIND11_BUILD_TYPE "__" + PYBIND11_TOSTRING(PYBIND11_INTERNALS_VERSION) PYBIND11_INTERNALS_KIND "__" /// Each module locally stores a pointer to the `internals` data. The data /// itself is shared among modules with the same `PYBIND11_INTERNALS_ID`. 
diff --git a/python/src/pybind11/detail/typeid.h b/python/src/pybind11/detail/typeid.h index 9c8a4fc69..6f36aab75 100644 --- a/python/src/pybind11/detail/typeid.h +++ b/python/src/pybind11/detail/typeid.h @@ -16,8 +16,6 @@ #include #endif -#include "common.h" - NAMESPACE_BEGIN(PYBIND11_NAMESPACE) NAMESPACE_BEGIN(detail) /// Erase all occurrences of a substring diff --git a/python/src/pybind11/eigen.h b/python/src/pybind11/eigen.h index d963d9650..0899ec73f 100644 --- a/python/src/pybind11/eigen.h +++ b/python/src/pybind11/eigen.h @@ -17,11 +17,6 @@ # pragma GCC diagnostic push # pragma GCC diagnostic ignored "-Wconversion" # pragma GCC diagnostic ignored "-Wdeprecated-declarations" -# ifdef __clang__ -// Eigen generates a bunch of implicit-copy-constructor-is-deprecated warnings with -Wdeprecated -// under Clang, so disable that warning here: -# pragma GCC diagnostic ignored "-Wdeprecated" -# endif # if __GNUC__ >= 7 # pragma GCC diagnostic ignored "-Wint-in-bool-context" # endif @@ -186,26 +181,28 @@ template struct EigenProps { } } - static constexpr bool show_writeable = is_eigen_dense_map::value && is_eigen_mutable_map::value; - static constexpr bool show_order = is_eigen_dense_map::value; - static constexpr bool show_c_contiguous = show_order && requires_row_major; - static constexpr bool show_f_contiguous = !show_c_contiguous && show_order && requires_col_major; + static PYBIND11_DESCR descriptor() { + constexpr bool show_writeable = is_eigen_dense_map::value && is_eigen_mutable_map::value; + constexpr bool show_order = is_eigen_dense_map::value; + constexpr bool show_c_contiguous = show_order && requires_row_major; + constexpr bool show_f_contiguous = !show_c_contiguous && show_order && requires_col_major; - static constexpr auto descriptor = - _("numpy.ndarray[") + npy_format_descriptor::name + - _("[") + _(_<(size_t) rows>(), _("m")) + - _(", ") + _(_<(size_t) cols>(), _("n")) + - _("]") + - // For a reference type (e.g. Ref) we have other constraints that might need to be - // satisfied: writeable=True (for a mutable reference), and, depending on the map's stride - // options, possibly f_contiguous or c_contiguous. We include them in the descriptor output - // to provide some hint as to why a TypeError is occurring (otherwise it can be confusing to - // see that a function accepts a 'numpy.ndarray[float64[3,2]]' and an error message that you - // *gave* a numpy.ndarray of the right type and dimensions. - _(", flags.writeable", "") + - _(", flags.c_contiguous", "") + - _(", flags.f_contiguous", "") + - _("]"); + return type_descr(_("numpy.ndarray[") + npy_format_descriptor::name() + + _("[") + _(_<(size_t) rows>(), _("m")) + + _(", ") + _(_<(size_t) cols>(), _("n")) + + _("]") + + // For a reference type (e.g. Ref) we have other constraints that might need to be + // satisfied: writeable=True (for a mutable reference), and, depending on the map's stride + // options, possibly f_contiguous or c_contiguous. We include them in the descriptor output + // to provide some hint as to why a TypeError is occurring (otherwise it can be confusing to + // see that a function accepts a 'numpy.ndarray[float64[3,2]]' and an error message that you + // *gave* a numpy.ndarray of the right type and dimensions. + _(", flags.writeable", "") + + _(", flags.c_contiguous", "") + + _(", flags.f_contiguous", "") + + _("]") + ); + } }; // Casts an Eigen type to numpy array. 
If given a base, the numpy array references the src data, @@ -342,7 +339,7 @@ public: return cast_impl(src, policy, parent); } - static constexpr auto name = props::descriptor; + static PYBIND11_DESCR name() { return props::descriptor(); } operator Type*() { return &value; } operator Type&() { return value; } @@ -382,7 +379,7 @@ public: } } - static constexpr auto name = props::descriptor; + static PYBIND11_DESCR name() { return props::descriptor(); } // Explicitly delete these: support python -> C++ conversion on these (i.e. these can be return // types but not bound arguments). We still provide them (with an explicitly delete) so that @@ -527,7 +524,7 @@ public: } static handle cast(const Type *src, return_value_policy policy, handle parent) { return cast(*src, policy, parent); } - static constexpr auto name = props::descriptor; + static PYBIND11_DESCR name() { return props::descriptor(); } // Explicitly delete these: support python -> C++ conversion on these (i.e. these can be return // types but not bound arguments). We still provide them (with an explicitly delete) so that @@ -594,7 +591,7 @@ struct type_caster::value>> { } PYBIND11_TYPE_CASTER(Type, _<(Type::IsRowMajor) != 0>("scipy.sparse.csr_matrix[", "scipy.sparse.csc_matrix[") - + npy_format_descriptor::name + _("]")); + + npy_format_descriptor::name() + _("]")); }; NAMESPACE_END(detail) diff --git a/python/src/pybind11/embed.h b/python/src/pybind11/embed.h index 72655885e..9abc61c34 100644 --- a/python/src/pybind11/embed.h +++ b/python/src/pybind11/embed.h @@ -90,14 +90,8 @@ NAMESPACE_END(detail) Initialize the Python interpreter. No other pybind11 or CPython API functions can be called before this is done; with the exception of `PYBIND11_EMBEDDED_MODULE`. The optional parameter can be used to skip the registration of signal handlers (see the - `Python documentation`_ for details). Calling this function again after the interpreter + Python documentation for details). Calling this function again after the interpreter has already been initialized is a fatal error. - - If initializing the Python interpreter fails, then the program is terminated. (This - is controlled by the CPython runtime and is an exception to pybind11's normal behavior - of throwing exceptions on errors.) - - .. _Python documentation: https://docs.python.org/3/c-api/init.html#c.Py_InitializeEx \endrst */ inline void initialize_interpreter(bool init_signal_handlers = true) { if (Py_IsInitialized()) diff --git a/python/src/pybind11/functional.h b/python/src/pybind11/functional.h index 7a0988ab0..eda14ba58 100644 --- a/python/src/pybind11/functional.h +++ b/python/src/pybind11/functional.h @@ -54,20 +54,9 @@ public: } } - // ensure GIL is held during functor destruction - struct func_handle { - function f; - func_handle(function&& f_) : f(std::move(f_)) {} - func_handle(const func_handle&) = default; - ~func_handle() { - gil_scoped_acquire acq; - function kill_f(std::move(f)); - } - }; - - value = [hfunc = func_handle(std::move(func))](Args... args) -> Return { + value = [func](Args... args) -> Return { gil_scoped_acquire acq; - object retval(hfunc.f(std::forward(args)...)); + object retval(func(std::forward(args)...)); /* Visual studio 2015 parser issue: need parentheses around this expression */ return (retval.template cast()); }; @@ -86,8 +75,10 @@ public: return cpp_function(std::forward(f_), policy).release(); } - PYBIND11_TYPE_CASTER(type, _("Callable[[") + concat(make_caster::name...) 
+ _("], ") - + make_caster::name + _("]")); + PYBIND11_TYPE_CASTER(type, _("Callable[[") + + argument_loader::arg_names() + _("], ") + + make_caster::name() + + _("]")); }; NAMESPACE_END(detail) diff --git a/python/src/pybind11/iostream.h b/python/src/pybind11/iostream.h index 72baef8fd..3caf55639 100644 --- a/python/src/pybind11/iostream.h +++ b/python/src/pybind11/iostream.h @@ -25,8 +25,7 @@ class pythonbuf : public std::streambuf { private: using traits_type = std::streambuf::traits_type; - const size_t buf_size; - std::unique_ptr d_buffer; + char d_buffer[1024]; object pywrite; object pyflush; @@ -43,11 +42,8 @@ private: // This subtraction cannot be negative, so dropping the sign str line(pbase(), static_cast(pptr() - pbase())); - { - gil_scoped_acquire tmp; - pywrite(line); - pyflush(); - } + pywrite(line); + pyflush(); setp(pbase(), epptr()); } @@ -55,13 +51,10 @@ private: } public: - - pythonbuf(object pyostream, size_t buffer_size = 1024) - : buf_size(buffer_size), - d_buffer(new char[buf_size]), - pywrite(pyostream.attr("write")), + pythonbuf(object pyostream) + : pywrite(pyostream.attr("write")), pyflush(pyostream.attr("flush")) { - setp(d_buffer.get(), d_buffer.get() + buf_size - 1); + setp(d_buffer, d_buffer + sizeof(d_buffer) - 1); } /// Sync before destroy @@ -201,7 +194,7 @@ inline class_ add_ostream_redirect(module m, std::strin return class_(m, name.c_str(), module_local()) .def(init(), arg("stdout")=true, arg("stderr")=true) .def("__enter__", &detail::OstreamRedirect::enter) - .def("__exit__", [](detail::OstreamRedirect &self_, args) { self_.exit(); }); + .def("__exit__", [](detail::OstreamRedirect &self, args) { self.exit(); }); } NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/python/src/pybind11/numpy.h b/python/src/pybind11/numpy.h index b2a02e024..9df493499 100644 --- a/python/src/pybind11/numpy.h +++ b/python/src/pybind11/numpy.h @@ -18,9 +18,9 @@ #include #include #include +#include #include #include -#include #include #if defined(_MSC_VER) @@ -250,7 +250,7 @@ template struct array_info_scalar { typedef T type; static constexpr bool is_array = false; static constexpr bool is_empty = false; - static constexpr auto extents = _(""); + static PYBIND11_DESCR extents() { return _(""); } static void append_extents(list& /* shape */) { } }; // Computes underlying type and a comma-separated list of extents for array @@ -269,9 +269,15 @@ template struct array_info> { array_info::append_extents(shape); } - static constexpr auto extents = _::is_array>( - concat(_(), array_info::extents), _() - ); + template::is_array, int> = 0> + static PYBIND11_DESCR extents() { + return _(); + } + + template::is_array, int> = 0> + static PYBIND11_DESCR extents() { + return concat(_(), array_info::extents()); + } }; // For numpy we have special handling for arrays of characters, so we don't include // the size in the array extents. @@ -440,7 +446,7 @@ public: /// This is essentially the same as calling numpy.dtype(args) in Python. static dtype from_args(object args) { PyObject *ptr = nullptr; - if (!detail::npy_api::get().PyArray_DescrConverter_(args.ptr(), &ptr) || !ptr) + if (!detail::npy_api::get().PyArray_DescrConverter_(args.release().ptr(), &ptr) || !ptr) throw error_already_set(); return reinterpret_steal(ptr); } @@ -855,14 +861,14 @@ public: // Reference to element at a given index template const T& at(Ix... 
index) const { - if ((ssize_t) sizeof...(index) != ndim()) + if (sizeof...(index) != ndim()) fail_dim_check(sizeof...(index), "index dimension mismatch"); return *(static_cast(array::data()) + byte_offset(ssize_t(index)...) / itemsize()); } // Mutable reference to element at a given index template T& mutable_at(Ix... index) { - if ((ssize_t) sizeof...(index) != ndim()) + if (sizeof...(index) != ndim()) fail_dim_check(sizeof...(index), "index dimension mismatch"); return *(static_cast(array::mutable_data()) + byte_offset(ssize_t(index)...) / itemsize()); } @@ -942,8 +948,8 @@ template struct format_descriptor::is_array>> { static std::string format() { using namespace detail; - static constexpr auto extents = _("(") + array_info::extents + _(")"); - return extents.text + format_descriptor>::format(); + PYBIND11_DESCR extents = _("(") + array_info::extents() + _(")"); + return extents.text() + format_descriptor>::format(); } }; @@ -962,7 +968,7 @@ struct pyobject_caster> { static handle cast(const handle &src, return_value_policy /* policy */, handle /* parent */) { return src.inc_ref(); } - PYBIND11_TYPE_CASTER(type, handle_type_name::name); + PYBIND11_TYPE_CASTER(type, handle_type_name::name()); }; template @@ -972,34 +978,7 @@ struct compare_buffer_info::valu } }; -template -struct npy_format_descriptor_name; - -template -struct npy_format_descriptor_name::value>> { - static constexpr auto name = _::value>( - _("bool"), _::value>("int", "uint") + _() - ); -}; - -template -struct npy_format_descriptor_name::value>> { - static constexpr auto name = _::value || std::is_same::value>( - _("float") + _(), _("longdouble") - ); -}; - -template -struct npy_format_descriptor_name::value>> { - static constexpr auto name = _::value - || std::is_same::value>( - _("complex") + _(), _("longcomplex") - ); -}; - -template -struct npy_format_descriptor::value>> - : npy_format_descriptor_name { +template struct npy_format_descriptor::value>> { private: // NB: the order here must match the one in common.h constexpr static const int values[15] = { @@ -1018,10 +997,25 @@ public: return reinterpret_borrow(ptr); pybind11_fail("Unsupported buffer format!"); } + template ::value, int> = 0> + static PYBIND11_DESCR name() { + return _::value>(_("bool"), + _::value>("int", "uint") + _()); + } + template ::value, int> = 0> + static PYBIND11_DESCR name() { + return _::value || std::is_same::value>( + _("float") + _(), _("longdouble")); + } + template ::value, int> = 0> + static PYBIND11_DESCR name() { + return _::value || std::is_same::value>( + _("complex") + _(), _("longcomplex")); + } }; #define PYBIND11_DECL_CHAR_FMT \ - static constexpr auto name = _("S") + _(); \ + static PYBIND11_DESCR name() { return _("S") + _(); } \ static pybind11::dtype dtype() { return pybind11::dtype(std::string("S") + std::to_string(N)); } template struct npy_format_descriptor { PYBIND11_DECL_CHAR_FMT }; template struct npy_format_descriptor> { PYBIND11_DECL_CHAR_FMT }; @@ -1033,7 +1027,7 @@ private: public: static_assert(!array_info::is_empty, "Zero-sized arrays are not supported"); - static constexpr auto name = _("(") + array_info::extents + _(")") + base_descr::name; + static PYBIND11_DESCR name() { return _("(") + array_info::extents() + _(")") + base_descr::name(); } static pybind11::dtype dtype() { list shape; array_info::append_extents(shape); @@ -1045,7 +1039,7 @@ template struct npy_format_descriptor private: using base_descr = npy_format_descriptor::type>; public: - static constexpr auto name = base_descr::name; + static 
PYBIND11_DESCR name() { return base_descr::name(); } static pybind11::dtype dtype() { return base_descr::dtype(); } }; @@ -1058,7 +1052,7 @@ struct field_descriptor { }; inline PYBIND11_NOINLINE void register_structured_dtype( - any_container fields, + const std::initializer_list& fields, const std::type_info& tinfo, ssize_t itemsize, bool (*direct_converter)(PyObject *, void *&)) { @@ -1067,7 +1061,7 @@ inline PYBIND11_NOINLINE void register_structured_dtype( pybind11_fail("NumPy: dtype is already registered"); list names, formats, offsets; - for (auto field : *fields) { + for (auto field : fields) { if (!field.descr) pybind11_fail(std::string("NumPy: unsupported field dtype: `") + field.name + "` @ " + tinfo.name()); @@ -1084,7 +1078,7 @@ inline PYBIND11_NOINLINE void register_structured_dtype( // - https://github.com/numpy/numpy/pull/7798 // Because of this, we won't use numpy's logic to generate buffer format // strings and will just do it ourselves. - std::vector ordered_fields(std::move(fields)); + std::vector ordered_fields(fields); std::sort(ordered_fields.begin(), ordered_fields.end(), [](const field_descriptor &a, const field_descriptor &b) { return a.offset < b.offset; }); ssize_t offset = 0; @@ -1120,7 +1114,7 @@ inline PYBIND11_NOINLINE void register_structured_dtype( template struct npy_format_descriptor { static_assert(is_pod_struct::value, "Attempt to use a non-POD or unimplemented POD type as a numpy dtype"); - static constexpr auto name = make_caster::name; + static PYBIND11_DESCR name() { return make_caster::name(); } static pybind11::dtype dtype() { return reinterpret_borrow(dtype_ptr()); @@ -1131,8 +1125,8 @@ template struct npy_format_descriptor { return format_str; } - static void register_dtype(any_container fields) { - register_structured_dtype(std::move(fields), typeid(typename std::remove_cv::type), + static void register_dtype(const std::initializer_list& fields) { + register_structured_dtype(fields, typeid(typename std::remove_cv::type), sizeof(T), &direct_converter); } @@ -1205,8 +1199,7 @@ private: #define PYBIND11_NUMPY_DTYPE(Type, ...) \ ::pybind11::detail::npy_format_descriptor::register_dtype \ - (::std::vector<::pybind11::detail::field_descriptor> \ - {PYBIND11_MAP_LIST (PYBIND11_FIELD_DESCRIPTOR, Type, __VA_ARGS__)}) + ({PYBIND11_MAP_LIST (PYBIND11_FIELD_DESCRIPTOR, Type, __VA_ARGS__)}) #ifdef _MSC_VER #define PYBIND11_MAP2_LIST_NEXT1(test, next) \ @@ -1227,8 +1220,7 @@ private: #define PYBIND11_NUMPY_DTYPE_EX(Type, ...) \ ::pybind11::detail::npy_format_descriptor::register_dtype \ - (::std::vector<::pybind11::detail::field_descriptor> \ - {PYBIND11_MAP2_LIST (PYBIND11_FIELD_DESCRIPTOR_EX, Type, __VA_ARGS__)}) + ({PYBIND11_MAP2_LIST (PYBIND11_FIELD_DESCRIPTOR_EX, Type, __VA_ARGS__)}) #endif // __CLION_IDE__ @@ -1466,10 +1458,7 @@ public: private: remove_reference_t f; - // Internal compiler error in MSVC 19.16.27025.1 (Visual Studio 2017 15.9.4), when compiling with "/permissive-" flag - // when arg_call_types is manually inlined. - using arg_call_types = std::tuple::call_type...>; - template using param_n_t = typename std::tuple_element::type; + template using param_n_t = typename pack_element::call_type...>::type; // Runs a vectorized function given arguments tuple and three index sequences: // - Index is the full set of 0 ... 
(N-1) argument indices; @@ -1509,7 +1498,7 @@ private: if (trivial == broadcast_trivial::f_trivial) result = array_t(shape); else result = array_t(shape); - if (size == 0) return std::move(result); + if (size == 0) return result; /* Call the function */ if (trivial == broadcast_trivial::non_trivial) @@ -1517,7 +1506,7 @@ private: else apply_trivial(buffers, params, result.mutable_data(), size, i_seq, vi_seq, bi_seq); - return std::move(result); + return result; } template @@ -1570,7 +1559,9 @@ vectorize_extractor(const Func &f, Return (*) (Args ...)) { } template struct handle_type_name> { - static constexpr auto name = _("numpy.ndarray[") + npy_format_descriptor::name + _("]"); + static PYBIND11_DESCR name() { + return _("numpy.ndarray[") + npy_format_descriptor::name() + _("]"); + } }; NAMESPACE_END(detail) diff --git a/python/src/pybind11/pybind11.h b/python/src/pybind11/pybind11.h index f1d91c788..9094fc424 100644 --- a/python/src/pybind11/pybind11.h +++ b/python/src/pybind11/pybind11.h @@ -10,17 +10,7 @@ #pragma once -#if defined(__INTEL_COMPILER) -# pragma warning push -# pragma warning disable 68 // integer conversion resulted in a change of sign -# pragma warning disable 186 // pointless comparison of unsigned integer with zero -# pragma warning disable 878 // incompatible exception specifications -# pragma warning disable 1334 // the "template" keyword used for syntactic disambiguation may only be used within a template -# pragma warning disable 1682 // implicit conversion of a 64-bit integral type to a smaller integral type (potential portability problem) -# pragma warning disable 1786 // function "strdup" was declared deprecated -# pragma warning disable 1875 // offsetof applied to non-POD (Plain Old Data) types is nonstandard -# pragma warning disable 2196 // warning #2196: routine is both "inline" and "noinline" -#elif defined(_MSC_VER) +#if defined(_MSC_VER) # pragma warning(push) # pragma warning(disable: 4100) // warning C4100: Unreferenced formal parameter # pragma warning(disable: 4127) // warning C4127: Conditional expression is constant @@ -29,6 +19,15 @@ # pragma warning(disable: 4996) // warning C4996: The POSIX name for this item is deprecated. 
Instead, use the ISO C and C++ conformant name # pragma warning(disable: 4702) // warning C4702: unreachable code # pragma warning(disable: 4522) // warning C4522: multiple assignment operators specified +#elif defined(__INTEL_COMPILER) +# pragma warning(push) +# pragma warning(disable: 68) // integer conversion resulted in a change of sign +# pragma warning(disable: 186) // pointless comparison of unsigned integer with zero +# pragma warning(disable: 878) // incompatible exception specifications +# pragma warning(disable: 1334) // the "template" keyword used for syntactic disambiguation may only be used within a template +# pragma warning(disable: 1682) // implicit conversion of a 64-bit integral type to a smaller integral type (potential portability problem) +# pragma warning(disable: 1875) // offsetof applied to non-POD (Plain Old Data) types is nonstandard +# pragma warning(disable: 2196) // warning #2196: routine is both "inline" and "noinline" #elif defined(__GNUG__) && !defined(__clang__) # pragma GCC diagnostic push # pragma GCC diagnostic ignored "-Wunused-but-set-parameter" @@ -41,11 +40,6 @@ # endif #endif -#if defined(__GNUG__) && !defined(__clang__) - #include -#endif - - #include "attr.h" #include "options.h" #include "detail/class.h" @@ -57,7 +51,6 @@ NAMESPACE_BEGIN(PYBIND11_NAMESPACE) class cpp_function : public function { public: cpp_function() { } - cpp_function(std::nullptr_t) { } /// Construct a cpp_function from a vanilla function pointer template @@ -100,6 +93,7 @@ protected: template void initialize(Func &&f, Return (*)(Args...), const Extra&... extra) { using namespace detail; + struct capture { remove_reference_t f; }; /* Store the function including any extra state it might have (e.g. a lambda capture object) */ @@ -170,11 +164,10 @@ protected: process_attributes::init(extra..., rec); /* Generate a readable signature describing the function's arguments and return value types */ - static constexpr auto signature = _("(") + cast_in::arg_names + _(") -> ") + cast_out::name; - PYBIND11_DESCR_CONSTEXPR auto types = decltype(signature)::types(); + PYBIND11_DESCR signature = _("(") + cast_in::arg_names() + _(") -> ") + cast_out::name(); /* Register the function with Python from generic (non-templated) code */ - initialize_generic(rec, signature.text, types.data(), sizeof...(Args)); + initialize_generic(rec, signature.text(), signature.types(), sizeof...(Args)); if (cast_in::has_args) rec->has_args = true; if (cast_in::has_kwargs) rec->has_kwargs = true; @@ -224,30 +217,34 @@ protected: /* Generate a proper function signature */ std::string signature; - size_t type_index = 0, arg_index = 0; - for (auto *pc = text; *pc != '\0'; ++pc) { - const auto c = *pc; + size_t type_depth = 0, char_index = 0, type_index = 0, arg_index = 0; + while (true) { + char c = text[char_index++]; + if (c == '\0') + break; if (c == '{') { - // Write arg name for everything except *args and **kwargs. - if (*(pc + 1) == '*') - continue; - - if (arg_index < rec->args.size() && rec->args[arg_index].name) { - signature += rec->args[arg_index].name; - } else if (arg_index == 0 && rec->is_method) { - signature += "self"; - } else { - signature += "arg" + std::to_string(arg_index - (rec->is_method ? 1 : 0)); + // Write arg name for everything except *args, **kwargs and return type. 
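// [Editor's aside, not part of this patch] A minimal standalone sketch of the
// brace-depth expansion the surrounding hunk implements: in these descriptor
// strings '{' opens an argument slot and '}' closes it, and only depth-0
// braces emit an "arg: " prefix, so brackets nested inside a type annotation
// are left untouched. The grammar here is simplified (the '%' type-substitution
// marker handled later in the hunk is omitted) and all names are illustrative.
#include <iostream>
#include <string>
#include <vector>

static std::string expand(const std::string &text, const std::vector<std::string> &names) {
  std::string out;
  size_t depth = 0, arg = 0;
  for (char c : text) {
    if (c == '{') {
      if (depth++ == 0)  // only a top-level '{' starts a new argument
        out += (arg < names.size() ? names[arg] : "arg" + std::to_string(arg)) + ": ";
    } else if (c == '}') {
      if (--depth == 0) ++arg;  // a top-level '}' finishes the current argument
    } else {
      out += c;
    }
  }
  return out;
}

int main() {
  // Prints "(x: int, y: List[float]) -> None"
  std::cout << expand("({int}, {List[{float}]}) -> None", {"x", "y"}) << "\n";
}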
+ if (type_depth == 0 && text[char_index] != '*' && arg_index < args) { + if (!rec->args.empty() && rec->args[arg_index].name) { + signature += rec->args[arg_index].name; + } else if (arg_index == 0 && rec->is_method) { + signature += "self"; + } else { + signature += "arg" + std::to_string(arg_index - (rec->is_method ? 1 : 0)); + } + signature += ": "; } - signature += ": "; + ++type_depth; } else if (c == '}') { - // Write default value if available. - if (arg_index < rec->args.size() && rec->args[arg_index].descr) { - signature += " = "; - signature += rec->args[arg_index].descr; + --type_depth; + if (type_depth == 0) { + if (arg_index < rec->args.size() && rec->args[arg_index].descr) { + signature += "="; + signature += rec->args[arg_index].descr; + } + arg_index++; } - arg_index++; } else if (c == '%') { const std::type_info *t = types[type_index++]; if (!t) @@ -272,9 +269,14 @@ protected: signature += c; } } - if (arg_index != args || types[type_index] != nullptr) + if (type_depth != 0 || types[type_index] != nullptr) pybind11_fail("Internal error while parsing type signature (2)"); + #if !defined(PYBIND11_CONSTEXPR_DESCR) + delete[] types; + delete[] text; + #endif + #if PY_MAJOR_VERSION < 3 if (strcmp(rec->name, "__next__") == 0) { std::free(rec->name); @@ -426,8 +428,8 @@ protected: using namespace detail; /* Iterator over the list of potentially admissible overloads */ - const function_record *overloads = (function_record *) PyCapsule_GetPointer(self, nullptr), - *it = overloads; + function_record *overloads = (function_record *) PyCapsule_GetPointer(self, nullptr), + *it = overloads; /* Need to know how many arguments + keyword arguments there are to pick the right overload */ const size_t n_args_in = (size_t) PyTuple_GET_SIZE(args_in); @@ -483,7 +485,7 @@ protected: result other than PYBIND11_TRY_NEXT_OVERLOAD. */ - const function_record &func = *it; + function_record &func = *it; size_t pos_args = func.nargs; // Number of positional arguments that we need if (func.has_args) --pos_args; // (but don't count py::args if (func.has_kwargs) --pos_args; // or py::kwargs) @@ -515,7 +517,7 @@ protected: // 1. Copy any position arguments given. bool bad_arg = false; for (; args_copied < args_to_copy; ++args_copied) { - const argument_record *arg_rec = args_copied < func.args.size() ? &func.args[args_copied] : nullptr; + argument_record *arg_rec = args_copied < func.args.size() ? &func.args[args_copied] : nullptr; if (kwargs_in && arg_rec && arg_rec->name && PyDict_GetItemString(kwargs_in, arg_rec->name)) { bad_arg = true; break; @@ -656,22 +658,13 @@ protected: result = PYBIND11_TRY_NEXT_OVERLOAD; } - if (result.ptr() != PYBIND11_TRY_NEXT_OVERLOAD) { - // The error reporting logic below expects 'it' to be valid, as it would be - // if we'd encountered this failure in the first-pass loop. - if (!result) - it = &call.func; + if (result.ptr() != PYBIND11_TRY_NEXT_OVERLOAD) break; - } } } } catch (error_already_set &e) { e.restore(); return nullptr; -#if defined(__GNUG__) && !defined(__clang__) - } catch ( abi::__forced_unwind& ) { - throw; -#endif } catch (...) { /* When an exception is caught, give each registered exception translator a chance to translate it to a Python exception @@ -718,7 +711,7 @@ protected: " arguments. The following argument types are supported:\n"; int ctr = 0; - for (const function_record *it2 = overloads; it2 != nullptr; it2 = it2->next) { + for (function_record *it2 = overloads; it2 != nullptr; it2 = it2->next) { msg += " "+ std::to_string(++ctr) + ". 
"; bool wrote_sig = false; @@ -906,7 +899,6 @@ protected: tinfo->type = (PyTypeObject *) m_ptr; tinfo->cpptype = rec.type; tinfo->type_size = rec.type_size; - tinfo->type_align = rec.type_align; tinfo->operator_new = rec.operator_new; tinfo->holder_size_in_ptrs = size_in_ptrs(rec.holder_size); tinfo->init_instance = rec.init_instance; @@ -969,18 +961,18 @@ protected: tinfo->get_buffer_data = get_buffer_data; } - // rec_func must be set for either fget or fset. void def_property_static_impl(const char *name, handle fget, handle fset, - detail::function_record *rec_func) { - const auto is_static = rec_func && !(rec_func->is_method && rec_func->scope); - const auto has_doc = rec_func && rec_func->doc && pybind11::options::show_user_defined_docstrings(); + detail::function_record *rec_fget) { + const auto is_static = !(rec_fget->is_method && rec_fget->scope); + const auto has_doc = rec_fget->doc && pybind11::options::show_user_defined_docstrings(); + auto property = handle((PyObject *) (is_static ? get_internals().static_property_type : &PyProperty_Type)); attr(name) = property(fget.ptr() ? fget : none(), fset.ptr() ? fset : none(), /*deleter*/none(), - pybind11::str(has_doc ? rec_func->doc : "")); + pybind11::str(has_doc ? rec_fget->doc : "")); } }; @@ -998,21 +990,11 @@ template struct has_operator_delete_size::value, int> = 0> -void call_operator_delete(T *p, size_t, size_t) { T::operator delete(p); } +void call_operator_delete(T *p, size_t) { T::operator delete(p); } template ::value && has_operator_delete_size::value, int> = 0> -void call_operator_delete(T *p, size_t s, size_t) { T::operator delete(p, s); } +void call_operator_delete(T *p, size_t s) { T::operator delete(p, s); } -inline void call_operator_delete(void *p, size_t s, size_t a) { - (void)s; (void)a; -#if defined(PYBIND11_CPP17) - if (a > __STDCPP_DEFAULT_NEW_ALIGNMENT__) - ::operator delete(p, s, std::align_val_t(a)); - else - ::operator delete(p, s); -#else - ::operator delete(p); -#endif -} +inline void call_operator_delete(void *p, size_t) { ::operator delete(p); } NAMESPACE_END(detail) @@ -1022,18 +1004,10 @@ template auto method_adaptor(F &&f) -> decltype(std::forward(f)) { return std::forward(f); } template -auto method_adaptor(Return (Class::*pmf)(Args...)) -> Return (Derived::*)(Args...) { - static_assert(detail::is_accessible_base_of::value, - "Cannot bind an inaccessible base class method; use a lambda definition instead"); - return pmf; -} +auto method_adaptor(Return (Class::*pmf)(Args...)) -> Return (Derived::*)(Args...) { return pmf; } template -auto method_adaptor(Return (Class::*pmf)(Args...) const) -> Return (Derived::*)(Args...) const { - static_assert(detail::is_accessible_base_of::value, - "Cannot bind an inaccessible base class method; use a lambda definition instead"); - return pmf; -} +auto method_adaptor(Return (Class::*pmf)(Args...) const) -> Return (Derived::*)(Args...) const { return pmf; } template class class_ : public detail::generic_type { @@ -1075,11 +1049,10 @@ public: record.name = name; record.type = &typeid(type); record.type_size = sizeof(conditional_t); - record.type_align = alignof(conditional_t&); record.holder_size = sizeof(holder_type); record.init_instance = init_instance; record.dealloc = dealloc; - record.default_holder = detail::is_instantiation::value; + record.default_holder = std::is_same>::value; set_operator_new(&record); @@ -1121,7 +1094,7 @@ public: "def_static(...) 
called with a non-static member function pointer"); cpp_function cf(std::forward(f), name(name_), scope(*this), sibling(getattr(*this, name_, none())), extra...); - attr(cf.name()) = staticmethod(cf); + attr(cf.name()) = cf; return *this; } @@ -1185,7 +1158,7 @@ public: template class_ &def_readwrite(const char *name, D C::*pm, const Extra&... extra) { - static_assert(std::is_same::value || std::is_base_of::value, "def_readwrite() requires a class member (or base class member)"); + static_assert(std::is_base_of::value, "def_readwrite() requires a class member (or base class member)"); cpp_function fget([pm](const type &c) -> const D &{ return c.*pm; }, is_method(*this)), fset([pm](type &c, const D &value) { c.*pm = value; }, is_method(*this)); def_property(name, fget, fset, return_value_policy::reference_internal, extra...); @@ -1194,7 +1167,7 @@ public: template class_ &def_readonly(const char *name, const D C::*pm, const Extra& ...extra) { - static_assert(std::is_same::value || std::is_base_of::value, "def_readonly() requires a class member (or base class member)"); + static_assert(std::is_base_of::value, "def_readonly() requires a class member (or base class member)"); cpp_function fget([pm](const type &c) -> const D &{ return c.*pm; }, is_method(*this)); def_property_readonly(name, fget, return_value_policy::reference_internal, extra...); return *this; @@ -1225,7 +1198,7 @@ public: /// Uses cpp_function's return_value_policy by default template class_ &def_property_readonly(const char *name, const cpp_function &fget, const Extra& ...extra) { - return def_property(name, fget, nullptr, extra...); + return def_property(name, fget, cpp_function(), extra...); } /// Uses return_value_policy::reference by default @@ -1237,7 +1210,7 @@ public: /// Uses cpp_function's return_value_policy by default template class_ &def_property_readonly_static(const char *name, const cpp_function &fget, const Extra& ...extra) { - return def_property_static(name, fget, nullptr, extra...); + return def_property_static(name, fget, cpp_function(), extra...); } /// Uses return_value_policy::reference_internal by default @@ -1266,28 +1239,22 @@ public: /// Uses cpp_function's return_value_policy by default template class_ &def_property_static(const char *name, const cpp_function &fget, const cpp_function &fset, const Extra& ...extra) { - static_assert( 0 == detail::constexpr_sum(std::is_base_of::value...), - "Argument annotations are not allowed for properties"); auto rec_fget = get_function_record(fget), rec_fset = get_function_record(fset); - auto *rec_active = rec_fget; - if (rec_fget) { - char *doc_prev = rec_fget->doc; /* 'extra' field may include a property-specific documentation string */ - detail::process_attributes::init(extra..., rec_fget); - if (rec_fget->doc && rec_fget->doc != doc_prev) { - free(doc_prev); - rec_fget->doc = strdup(rec_fget->doc); - } + char *doc_prev = rec_fget->doc; /* 'extra' field may include a property-specific documentation string */ + detail::process_attributes::init(extra..., rec_fget); + if (rec_fget->doc && rec_fget->doc != doc_prev) { + free(doc_prev); + rec_fget->doc = strdup(rec_fget->doc); } if (rec_fset) { - char *doc_prev = rec_fset->doc; + doc_prev = rec_fset->doc; detail::process_attributes::init(extra..., rec_fset); if (rec_fset->doc && rec_fset->doc != doc_prev) { free(doc_prev); rec_fset->doc = strdup(rec_fset->doc); } - if (! 
rec_active) rec_active = rec_fset; } - def_property_static_impl(name, fget, fset, rec_active); + def_property_static_impl(name, fget, fset, rec_fget); return *this; } @@ -1353,10 +1320,7 @@ private: v_h.set_holder_constructed(false); } else { - detail::call_operator_delete(v_h.value_ptr(), - v_h.type->type_size, - v_h.type->type_align - ); + detail::call_operator_delete(v_h.value_ptr(), v_h.type->type_size); } v_h.value_ptr() = nullptr; } @@ -1392,190 +1356,93 @@ detail::initimpl::pickle_factory pickle(GetState &&g, SetSta return {std::forward(g), std::forward(s)}; } -NAMESPACE_BEGIN(detail) -struct enum_base { - enum_base(handle base, handle parent) : m_base(base), m_parent(parent) { } - - PYBIND11_NOINLINE void init(bool is_arithmetic, bool is_convertible) { - m_base.attr("__entries") = dict(); - auto property = handle((PyObject *) &PyProperty_Type); - auto static_property = handle((PyObject *) get_internals().static_property_type); - - m_base.attr("__repr__") = cpp_function( - [](handle arg) -> str { - handle type = arg.get_type(); - object type_name = type.attr("__name__"); - dict entries = type.attr("__entries"); - for (const auto &kv : entries) { - object other = kv.second[int_(0)]; - if (other.equal(arg)) - return pybind11::str("{}.{}").format(type_name, kv.first); - } - return pybind11::str("{}.???").format(type_name); - }, is_method(m_base) - ); - - m_base.attr("name") = property(cpp_function( - [](handle arg) -> str { - dict entries = arg.get_type().attr("__entries"); - for (const auto &kv : entries) { - if (handle(kv.second[int_(0)]).equal(arg)) - return pybind11::str(kv.first); - } - return "???"; - }, is_method(m_base) - )); - - m_base.attr("__doc__") = static_property(cpp_function( - [](handle arg) -> std::string { - std::string docstring; - dict entries = arg.attr("__entries"); - if (((PyTypeObject *) arg.ptr())->tp_doc) - docstring += std::string(((PyTypeObject *) arg.ptr())->tp_doc) + "\n\n"; - docstring += "Members:"; - for (const auto &kv : entries) { - auto key = std::string(pybind11::str(kv.first)); - auto comment = kv.second[int_(1)]; - docstring += "\n\n " + key; - if (!comment.is_none()) - docstring += " : " + (std::string) pybind11::str(comment); - } - return docstring; - } - ), none(), none(), ""); - - m_base.attr("__members__") = static_property(cpp_function( - [](handle arg) -> dict { - dict entries = arg.attr("__entries"), m; - for (const auto &kv : entries) - m[kv.first] = kv.second[int_(0)]; - return m; - }), none(), none(), "" - ); - - #define PYBIND11_ENUM_OP_STRICT(op, expr, strict_behavior) \ - m_base.attr(op) = cpp_function( \ - [](object a, object b) { \ - if (!a.get_type().is(b.get_type())) \ - strict_behavior; \ - return expr; \ - }, \ - is_method(m_base)) - - #define PYBIND11_ENUM_OP_CONV(op, expr) \ - m_base.attr(op) = cpp_function( \ - [](object a_, object b_) { \ - int_ a(a_), b(b_); \ - return expr; \ - }, \ - is_method(m_base)) - - if (is_convertible) { - PYBIND11_ENUM_OP_CONV("__eq__", !b.is_none() && a.equal(b)); - PYBIND11_ENUM_OP_CONV("__ne__", b.is_none() || !a.equal(b)); - - if (is_arithmetic) { - PYBIND11_ENUM_OP_CONV("__lt__", a < b); - PYBIND11_ENUM_OP_CONV("__gt__", a > b); - PYBIND11_ENUM_OP_CONV("__le__", a <= b); - PYBIND11_ENUM_OP_CONV("__ge__", a >= b); - PYBIND11_ENUM_OP_CONV("__and__", a & b); - PYBIND11_ENUM_OP_CONV("__rand__", a & b); - PYBIND11_ENUM_OP_CONV("__or__", a | b); - PYBIND11_ENUM_OP_CONV("__ror__", a | b); - PYBIND11_ENUM_OP_CONV("__xor__", a ^ b); - PYBIND11_ENUM_OP_CONV("__rxor__", a ^ b); - } - } else { - 
PYBIND11_ENUM_OP_STRICT("__eq__", int_(a).equal(int_(b)), return false); - PYBIND11_ENUM_OP_STRICT("__ne__", !int_(a).equal(int_(b)), return true); - - if (is_arithmetic) { - #define PYBIND11_THROW throw type_error("Expected an enumeration of matching type!"); - PYBIND11_ENUM_OP_STRICT("__lt__", int_(a) < int_(b), PYBIND11_THROW); - PYBIND11_ENUM_OP_STRICT("__gt__", int_(a) > int_(b), PYBIND11_THROW); - PYBIND11_ENUM_OP_STRICT("__le__", int_(a) <= int_(b), PYBIND11_THROW); - PYBIND11_ENUM_OP_STRICT("__ge__", int_(a) >= int_(b), PYBIND11_THROW); - #undef PYBIND11_THROW - } - } - - #undef PYBIND11_ENUM_OP_CONV - #undef PYBIND11_ENUM_OP_STRICT - - object getstate = cpp_function( - [](object arg) { return int_(arg); }, is_method(m_base)); - - m_base.attr("__getstate__") = getstate; - m_base.attr("__hash__") = getstate; - } - - PYBIND11_NOINLINE void value(char const* name_, object value, const char *doc = nullptr) { - dict entries = m_base.attr("__entries"); - str name(name_); - if (entries.contains(name)) { - std::string type_name = (std::string) str(m_base.attr("__name__")); - throw value_error(type_name + ": element \"" + std::string(name_) + "\" already exists!"); - } - - entries[name] = std::make_pair(value, doc); - m_base.attr(name) = value; - } - - PYBIND11_NOINLINE void export_values() { - dict entries = m_base.attr("__entries"); - for (const auto &kv : entries) - m_parent.attr(kv.first) = kv.second[int_(0)]; - } - - handle m_base; - handle m_parent; -}; - -NAMESPACE_END(detail) - /// Binds C++ enumerations and enumeration classes to Python template class enum_ : public class_ { public: - using Base = class_; - using Base::def; - using Base::attr; - using Base::def_property_readonly; - using Base::def_property_readonly_static; + using class_::def; + using class_::def_property_readonly_static; using Scalar = typename std::underlying_type::type; template enum_(const handle &scope, const char *name, const Extra&... 
extra) - : class_(scope, name, extra...), m_base(*this, scope) { - constexpr bool is_arithmetic = detail::any_of...>::value; - constexpr bool is_convertible = std::is_convertible::value; - m_base.init(is_arithmetic, is_convertible); + : class_(scope, name, extra...), m_entries(), m_parent(scope) { + constexpr bool is_arithmetic = detail::any_of...>::value; + + auto m_entries_ptr = m_entries.inc_ref().ptr(); + def("__repr__", [name, m_entries_ptr](Type value) -> pybind11::str { + for (const auto &kv : reinterpret_borrow(m_entries_ptr)) { + if (pybind11::cast(kv.second) == value) + return pybind11::str("{}.{}").format(name, kv.first); + } + return pybind11::str("{}.???").format(name); + }); + def_property_readonly_static("__members__", [m_entries_ptr](object /* self */) { + dict m; + for (const auto &kv : reinterpret_borrow(m_entries_ptr)) + m[kv.first] = kv.second; + return m; + }, return_value_policy::copy); def(init([](Scalar i) { return static_cast(i); })); def("__int__", [](Type value) { return (Scalar) value; }); #if PY_MAJOR_VERSION < 3 def("__long__", [](Type value) { return (Scalar) value; }); #endif - cpp_function setstate( - [](Type &value, Scalar arg) { value = static_cast(arg); }, - is_method(*this)); - attr("__setstate__") = setstate; + def("__eq__", [](const Type &value, Type *value2) { return value2 && value == *value2; }); + def("__ne__", [](const Type &value, Type *value2) { return !value2 || value != *value2; }); + if (is_arithmetic) { + def("__lt__", [](const Type &value, Type *value2) { return value2 && value < *value2; }); + def("__gt__", [](const Type &value, Type *value2) { return value2 && value > *value2; }); + def("__le__", [](const Type &value, Type *value2) { return value2 && value <= *value2; }); + def("__ge__", [](const Type &value, Type *value2) { return value2 && value >= *value2; }); + } + if (std::is_convertible::value) { + // Don't provide comparison with the underlying type if the enum isn't convertible, + // i.e. if Type is a scoped enum, mirroring the C++ behaviour. (NB: we explicitly + // convert Type to Scalar below anyway because this needs to compile). 
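// [Editor's aside, not part of this patch] The std::is_convertible guard used
// above mirrors core-language behaviour: unscoped enums convert implicitly to
// their underlying type, while scoped enums (enum class) do not, so a scoped
// enum bound this way gets no comparison or bitwise operators against Scalar.
// A minimal sketch, assuming C++11; the enum names are illustrative.
#include <type_traits>

enum Unscoped { kA };
enum class Scoped { kB };

static_assert(std::is_convertible<Unscoped, std::underlying_type<Unscoped>::type>::value,
              "an unscoped enum converts implicitly to its underlying type");
static_assert(!std::is_convertible<Scoped, std::underlying_type<Scoped>::type>::value,
              "a scoped enum requires an explicit cast");

int main() { return 0; }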
+ def("__eq__", [](const Type &value, Scalar value2) { return (Scalar) value == value2; }); + def("__ne__", [](const Type &value, Scalar value2) { return (Scalar) value != value2; }); + if (is_arithmetic) { + def("__lt__", [](const Type &value, Scalar value2) { return (Scalar) value < value2; }); + def("__gt__", [](const Type &value, Scalar value2) { return (Scalar) value > value2; }); + def("__le__", [](const Type &value, Scalar value2) { return (Scalar) value <= value2; }); + def("__ge__", [](const Type &value, Scalar value2) { return (Scalar) value >= value2; }); + def("__invert__", [](const Type &value) { return ~((Scalar) value); }); + def("__and__", [](const Type &value, Scalar value2) { return (Scalar) value & value2; }); + def("__or__", [](const Type &value, Scalar value2) { return (Scalar) value | value2; }); + def("__xor__", [](const Type &value, Scalar value2) { return (Scalar) value ^ value2; }); + def("__rand__", [](const Type &value, Scalar value2) { return (Scalar) value & value2; }); + def("__ror__", [](const Type &value, Scalar value2) { return (Scalar) value | value2; }); + def("__rxor__", [](const Type &value, Scalar value2) { return (Scalar) value ^ value2; }); + def("__and__", [](const Type &value, const Type &value2) { return (Scalar) value & (Scalar) value2; }); + def("__or__", [](const Type &value, const Type &value2) { return (Scalar) value | (Scalar) value2; }); + def("__xor__", [](const Type &value, const Type &value2) { return (Scalar) value ^ (Scalar) value2; }); + } + } + def("__hash__", [](const Type &value) { return (Scalar) value; }); + // Pickling and unpickling -- needed for use with the 'multiprocessing' module + def(pickle([](const Type &value) { return pybind11::make_tuple((Scalar) value); }, + [](tuple t) { return static_cast(t[0].cast()); })); } /// Export enumeration entries into the parent scope enum_& export_values() { - m_base.export_values(); + for (const auto &kv : m_entries) + m_parent.attr(kv.first) = kv.second; return *this; } /// Add an enumeration entry - enum_& value(char const* name, Type value, const char *doc = nullptr) { - m_base.value(name, pybind11::cast(value, return_value_policy::copy), doc); + enum_& value(char const* name, Type value) { + auto v = pybind11::cast(value, return_value_policy::copy); + this->attr(name) = v; + m_entries[pybind11::str(name)] = v; return *this; } private: - detail::enum_base m_base; + dict m_entries; + handle m_parent; }; NAMESPACE_BEGIN(detail) @@ -1882,15 +1749,6 @@ public: auto const &internals = detail::get_internals(); tstate = (PyThreadState *) PYBIND11_TLS_GET_VALUE(internals.tstate); - if (!tstate) { - /* Check if the GIL was acquired using the PyGILState_* API instead (e.g. if - calling from a Python thread). Since we use a different key, this ensures - we don't create a new thread state and deadlock in PyEval_AcquireThread - below. Note we don't save this state with internals.tstate, since we don't - create it we would fail to clear it (its reference count should be > 0). 
*/ - tstate = PyGILState_GetThisThreadState(); - } - if (!tstate) { tstate = PyThreadState_New(internals.istate); #if !defined(NDEBUG) @@ -1998,12 +1856,12 @@ class gil_scoped_release { }; #endif error_already_set::~error_already_set() { - if (m_type) { + if (type) { error_scope scope; gil_scoped_acquire gil; - m_type.release().dec_ref(); - m_value.release().dec_ref(); - m_trace.release().dec_ref(); + type.release().dec_ref(); + value.release().dec_ref(); + trace.release().dec_ref(); } } @@ -2064,14 +1922,6 @@ inline function get_type_overload(const void *this_ptr, const detail::type_info return overload; } -/** \rst - Try to retrieve a python method by the provided name from the instance pointed to by the this_ptr. - - :this_ptr: The pointer to the object the overload should be retrieved for. This should be the first - non-trampoline class encountered in the inheritance chain. - :name: The name of the overloaded Python method to retrieve. - :return: The Python method by this name from the object or an empty function wrapper. - \endrst */ template function get_overload(const T *this_ptr, const char *name) { auto tinfo = detail::get_type_info(typeid(T)); return tinfo ? get_type_overload(this_ptr, tinfo, name) : function(); @@ -2090,73 +1940,26 @@ template function get_overload(const T *this_ptr, const char *name) { } \ } -/** \rst - Macro to populate the virtual method in the trampoline class. This macro tries to look up a method named 'fn' - from the Python side, deals with the :ref:`gil` and necessary argument conversions to call this method and return - the appropriate type. See :ref:`overriding_virtuals` for more information. This macro should be used when the method - name in C is not the same as the method name in Python. For example with `__str__`. - - .. code-block:: cpp - - std::string toString() override { - PYBIND11_OVERLOAD_NAME( - std::string, // Return type (ret_type) - Animal, // Parent class (cname) - toString, // Name of function in C++ (name) - "__str__", // Name of method in Python (fn) - ); - } -\endrst */ #define PYBIND11_OVERLOAD_NAME(ret_type, cname, name, fn, ...) \ - PYBIND11_OVERLOAD_INT(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), name, __VA_ARGS__) \ + PYBIND11_OVERLOAD_INT(ret_type, cname, name, __VA_ARGS__) \ return cname::fn(__VA_ARGS__) -/** \rst - Macro for pure virtual functions, this function is identical to :c:macro:`PYBIND11_OVERLOAD_NAME`, except that it - throws if no overload can be found. -\endrst */ #define PYBIND11_OVERLOAD_PURE_NAME(ret_type, cname, name, fn, ...) \ - PYBIND11_OVERLOAD_INT(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), name, __VA_ARGS__) \ - pybind11::pybind11_fail("Tried to call pure virtual function \"" PYBIND11_STRINGIFY(cname) "::" name "\""); + PYBIND11_OVERLOAD_INT(ret_type, cname, name, __VA_ARGS__) \ + pybind11::pybind11_fail("Tried to call pure virtual function \"" #cname "::" name "\""); -/** \rst - Macro to populate the virtual method in the trampoline class. This macro tries to look up the method - from the Python side, deals with the :ref:`gil` and necessary argument conversions to call this method and return - the appropriate type. This macro should be used if the method name in C and in Python are identical. - See :ref:`overriding_virtuals` for more information. - - .. 
code-block:: cpp - - class PyAnimal : public Animal { - public: - // Inherit the constructors - using Animal::Animal; - - // Trampoline (need one for each virtual function) - std::string go(int n_times) override { - PYBIND11_OVERLOAD_PURE( - std::string, // Return type (ret_type) - Animal, // Parent class (cname) - go, // Name of function in C++ (must match Python name) (fn) - n_times // Argument(s) (...) - ); - } - }; -\endrst */ #define PYBIND11_OVERLOAD(ret_type, cname, fn, ...) \ - PYBIND11_OVERLOAD_NAME(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), #fn, fn, __VA_ARGS__) + PYBIND11_OVERLOAD_NAME(ret_type, cname, #fn, fn, __VA_ARGS__) -/** \rst - Macro for pure virtual functions, this function is identical to :c:macro:`PYBIND11_OVERLOAD`, except that it throws - if no overload can be found. -\endrst */ #define PYBIND11_OVERLOAD_PURE(ret_type, cname, fn, ...) \ - PYBIND11_OVERLOAD_PURE_NAME(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), #fn, fn, __VA_ARGS__) + PYBIND11_OVERLOAD_PURE_NAME(ret_type, cname, #fn, fn, __VA_ARGS__) NAMESPACE_END(PYBIND11_NAMESPACE) -#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) +#if defined(_MSC_VER) # pragma warning(pop) +#elif defined(__INTEL_COMPILER) +/* Leave ignored warnings on */ #elif defined(__GNUG__) && !defined(__clang__) # pragma GCC diagnostic pop #endif diff --git a/python/src/pybind11/pytypes.h b/python/src/pybind11/pytypes.h index 2d573dfad..d7fa17775 100644 --- a/python/src/pybind11/pytypes.h +++ b/python/src/pybind11/pytypes.h @@ -114,35 +114,6 @@ public: bool is(object_api const& other) const { return derived().ptr() == other.derived().ptr(); } /// Equivalent to ``obj is None`` in Python. bool is_none() const { return derived().ptr() == Py_None; } - /// Equivalent to obj == other in Python - bool equal(object_api const &other) const { return rich_compare(other, Py_EQ); } - bool not_equal(object_api const &other) const { return rich_compare(other, Py_NE); } - bool operator<(object_api const &other) const { return rich_compare(other, Py_LT); } - bool operator<=(object_api const &other) const { return rich_compare(other, Py_LE); } - bool operator>(object_api const &other) const { return rich_compare(other, Py_GT); } - bool operator>=(object_api const &other) const { return rich_compare(other, Py_GE); } - - object operator-() const; - object operator~() const; - object operator+(object_api const &other) const; - object operator+=(object_api const &other) const; - object operator-(object_api const &other) const; - object operator-=(object_api const &other) const; - object operator*(object_api const &other) const; - object operator*=(object_api const &other) const; - object operator/(object_api const &other) const; - object operator/=(object_api const &other) const; - object operator|(object_api const &other) const; - object operator|=(object_api const &other) const; - object operator&(object_api const &other) const; - object operator&=(object_api const &other) const; - object operator^(object_api const &other) const; - object operator^=(object_api const &other) const; - object operator<<(object_api const &other) const; - object operator<<=(object_api const &other) const; - object operator>>(object_api const &other) const; - object operator>>=(object_api const &other) const; - PYBIND11_DEPRECATED("Use py::str(obj) instead") pybind11::str str() const; @@ -153,9 +124,6 @@ public: int ref_count() const { return static_cast(Py_REFCNT(derived().ptr())); } /// Return a handle to the Python type object underlying the instance handle get_type() 
const; - -private: - bool rich_compare(object_api const &other, int value) const; }; NAMESPACE_END(detail) @@ -324,18 +292,15 @@ public: /// Constructs a new exception from the current Python error indicator, if any. The current /// Python error indicator will be cleared. error_already_set() : std::runtime_error(detail::error_string()) { - PyErr_Fetch(&m_type.ptr(), &m_value.ptr(), &m_trace.ptr()); + PyErr_Fetch(&type.ptr(), &value.ptr(), &trace.ptr()); } - error_already_set(const error_already_set &) = default; - error_already_set(error_already_set &&) = default; - inline ~error_already_set(); /// Give the currently-held error back to Python, if any. If there is currently a Python error /// already set it is cleared first. After this call, the current object no longer stores the /// error variables (but the `.what()` string is still available). - void restore() { PyErr_Restore(m_type.release().ptr(), m_value.release().ptr(), m_trace.release().ptr()); } + void restore() { PyErr_Restore(type.release().ptr(), value.release().ptr(), trace.release().ptr()); } // Does nothing; provided for backwards compatibility. PYBIND11_DEPRECATED("Use of error_already_set.clear() is deprecated") @@ -344,14 +309,10 @@ public: /// Check if the currently trapped error type matches the given Python exception class (or a /// subclass thereof). May also be passed a tuple to search for any exception class matches in /// the given tuple. - bool matches(handle exc) const { return PyErr_GivenExceptionMatches(m_type.ptr(), exc.ptr()); } - - const object& type() const { return m_type; } - const object& value() const { return m_value; } - const object& trace() const { return m_trace; } + bool matches(handle ex) const { return PyErr_GivenExceptionMatches(ex.ptr(), type.ptr()); } private: - object m_type, m_value, m_trace; + object type, value, trace; }; /** \defgroup python_builtins _ @@ -392,14 +353,6 @@ inline bool hasattr(handle obj, const char *name) { return PyObject_HasAttrString(obj.ptr(), name) == 1; } -inline void delattr(handle obj, handle name) { - if (PyObject_DelAttr(obj.ptr(), name.ptr()) != 0) { throw error_already_set(); } -} - -inline void delattr(handle obj, const char *name) { - if (PyObject_DelAttrString(obj.ptr(), name) != 0) { throw error_already_set(); } -} - inline object getattr(handle obj, handle name) { PyObject *result = PyObject_GetAttr(obj.ptr(), name.ptr()); if (!result) { throw error_already_set(); } @@ -471,6 +424,7 @@ object object_or_cast(T &&o); // Match a PyObject*, which we want to convert directly to handle via its converting constructor inline handle object_or_cast(PyObject *ptr) { return ptr; } + template class accessor : public object_api> { using key_type = typename Policy::key_type; @@ -708,7 +662,7 @@ protected: private: handle obj; - PyObject *key = nullptr, *value = nullptr; + PyObject *key, *value; ssize_t pos = -1; }; NAMESPACE_END(iterator_policies) @@ -736,14 +690,9 @@ inline bool PyIterable_Check(PyObject *obj) { } inline bool PyNone_Check(PyObject *o) { return o == Py_None; } -#if PY_MAJOR_VERSION >= 3 -inline bool PyEllipsis_Check(PyObject *o) { return o == Py_Ellipsis; } -#endif inline bool PyUnicode_Check_Permissive(PyObject *o) { return PyUnicode_Check(o) || PYBIND11_BYTES_CHECK(o); } -inline bool PyStaticMethod_Check(PyObject *o) { return o->ob_type == &PyStaticMethod_Type; } - class kwargs_proxy : public handle { public: explicit kwargs_proxy(handle h) : handle(h) { } @@ -1015,14 +964,6 @@ public: none() : object(Py_None, borrowed_t{}) { } }; -#if 
PY_MAJOR_VERSION >= 3 -class ellipsis : public object { -public: - PYBIND11_OBJECT(ellipsis, object, detail::PyEllipsis_Check) - ellipsis() : object(Py_Ellipsis, borrowed_t{}) { } -}; -#endif - class bool_ : public object { public: PYBIND11_OBJECT_CVT(bool_, object, PyBool_Check, raw_bool) @@ -1133,13 +1074,6 @@ public: (ssize_t *) stop, (ssize_t *) step, (ssize_t *) slicelength) == 0; } - bool compute(ssize_t length, ssize_t *start, ssize_t *stop, ssize_t *step, - ssize_t *slicelength) const { - return PySlice_GetIndicesEx((PYBIND11_SLICE_OBJECT *) m_ptr, - length, start, - stop, step, - slicelength) == 0; - } }; class capsule : public object { @@ -1203,7 +1137,6 @@ public: } size_t size() const { return (size_t) PyTuple_Size(m_ptr); } detail::tuple_accessor operator[](size_t index) const { return {*this, index}; } - detail::item_accessor operator[](handle h) const { return object::operator[](h); } detail::tuple_iterator begin() const { return {*this, 0}; } detail::tuple_iterator end() const { return {*this, PyTuple_GET_SIZE(m_ptr)}; } }; @@ -1241,7 +1174,6 @@ public: PYBIND11_OBJECT_DEFAULT(sequence, object, PySequence_Check) size_t size() const { return (size_t) PySequence_Size(m_ptr); } detail::sequence_accessor operator[](size_t index) const { return {*this, index}; } - detail::item_accessor operator[](handle h) const { return object::operator[](h); } detail::sequence_iterator begin() const { return {*this, 0}; } detail::sequence_iterator end() const { return {*this, PySequence_Size(m_ptr)}; } }; @@ -1254,7 +1186,6 @@ public: } size_t size() const { return (size_t) PyList_Size(m_ptr); } detail::list_accessor operator[](size_t index) const { return {*this, index}; } - detail::item_accessor operator[](handle h) const { return object::operator[](h); } detail::list_iterator begin() const { return {*this, 0}; } detail::list_iterator end() const { return {*this, PyList_GET_SIZE(m_ptr)}; } template void append(T &&val) const { @@ -1290,11 +1221,6 @@ public: bool is_cpp_function() const { return (bool) cpp_function(); } }; -class staticmethod : public object { -public: - PYBIND11_OBJECT_CVT(staticmethod, object, detail::PyStaticMethod_Check, PyStaticMethod_New) -}; - class buffer : public object { public: PYBIND11_OBJECT_DEFAULT(buffer, object, PyObject_CheckBuffer) @@ -1353,21 +1279,6 @@ inline size_t len(handle h) { return (size_t) result; } -inline size_t len_hint(handle h) { -#if PY_VERSION_HEX >= 0x03040000 - ssize_t result = PyObject_LengthHint(h.ptr(), 0); -#else - ssize_t result = PyObject_Length(h.ptr()); -#endif - if (result < 0) { - // Sometimes a length can't be determined at all (eg generators) - // In which case simply return 0 - PyErr_Clear(); - return 0; - } - return (size_t) result; -} - inline str repr(handle h) { PyObject *str_value = PyObject_Repr(h.ptr()); if (!str_value) throw error_already_set(); @@ -1417,55 +1328,5 @@ str_attr_accessor object_api::doc() const { return attr("__doc__"); } template handle object_api::get_type() const { return (PyObject *) Py_TYPE(derived().ptr()); } -template -bool object_api::rich_compare(object_api const &other, int value) const { - int rv = PyObject_RichCompareBool(derived().ptr(), other.derived().ptr(), value); - if (rv == -1) - throw error_already_set(); - return rv == 1; -} - -#define PYBIND11_MATH_OPERATOR_UNARY(op, fn) \ - template object object_api::op() const { \ - object result = reinterpret_steal(fn(derived().ptr())); \ - if (!result.ptr()) \ - throw error_already_set(); \ - return result; \ - } - -#define 
PYBIND11_MATH_OPERATOR_BINARY(op, fn) \ - template \ - object object_api::op(object_api const &other) const { \ - object result = reinterpret_steal( \ - fn(derived().ptr(), other.derived().ptr())); \ - if (!result.ptr()) \ - throw error_already_set(); \ - return result; \ - } - -PYBIND11_MATH_OPERATOR_UNARY (operator~, PyNumber_Invert) -PYBIND11_MATH_OPERATOR_UNARY (operator-, PyNumber_Negative) -PYBIND11_MATH_OPERATOR_BINARY(operator+, PyNumber_Add) -PYBIND11_MATH_OPERATOR_BINARY(operator+=, PyNumber_InPlaceAdd) -PYBIND11_MATH_OPERATOR_BINARY(operator-, PyNumber_Subtract) -PYBIND11_MATH_OPERATOR_BINARY(operator-=, PyNumber_InPlaceSubtract) -PYBIND11_MATH_OPERATOR_BINARY(operator*, PyNumber_Multiply) -PYBIND11_MATH_OPERATOR_BINARY(operator*=, PyNumber_InPlaceMultiply) -PYBIND11_MATH_OPERATOR_BINARY(operator/, PyNumber_TrueDivide) -PYBIND11_MATH_OPERATOR_BINARY(operator/=, PyNumber_InPlaceTrueDivide) -PYBIND11_MATH_OPERATOR_BINARY(operator|, PyNumber_Or) -PYBIND11_MATH_OPERATOR_BINARY(operator|=, PyNumber_InPlaceOr) -PYBIND11_MATH_OPERATOR_BINARY(operator&, PyNumber_And) -PYBIND11_MATH_OPERATOR_BINARY(operator&=, PyNumber_InPlaceAnd) -PYBIND11_MATH_OPERATOR_BINARY(operator^, PyNumber_Xor) -PYBIND11_MATH_OPERATOR_BINARY(operator^=, PyNumber_InPlaceXor) -PYBIND11_MATH_OPERATOR_BINARY(operator<<, PyNumber_Lshift) -PYBIND11_MATH_OPERATOR_BINARY(operator<<=, PyNumber_InPlaceLshift) -PYBIND11_MATH_OPERATOR_BINARY(operator>>, PyNumber_Rshift) -PYBIND11_MATH_OPERATOR_BINARY(operator>>=, PyNumber_InPlaceRshift) - -#undef PYBIND11_MATH_OPERATOR_UNARY -#undef PYBIND11_MATH_OPERATOR_BINARY - NAMESPACE_END(detail) NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/python/src/pybind11/stl.h b/python/src/pybind11/stl.h index 32f8d294a..1a4bbf0db 100644 --- a/python/src/pybind11/stl.h +++ b/python/src/pybind11/stl.h @@ -16,7 +16,6 @@ #include #include #include -#include #include #if defined(_MSC_VER) @@ -84,8 +83,7 @@ template struct set_caster { template static handle cast(T &&src, return_value_policy policy, handle parent) { - if (!std::is_lvalue_reference::value) - policy = return_value_policy_override::policy(policy); + policy = return_value_policy_override::policy(policy); pybind11::set s; for (auto &&value : src) { auto value_ = reinterpret_steal(key_conv::cast(forward_like(value), policy, parent)); @@ -95,7 +93,7 @@ template struct set_caster { return s.release(); } - PYBIND11_TYPE_CASTER(type, _("Set[") + key_conv::name + _("]")); + PYBIND11_TYPE_CASTER(type, _("Set[") + key_conv::name() + _("]")); }; template struct map_caster { @@ -121,12 +119,8 @@ template struct map_caster { template static handle cast(T &&src, return_value_policy policy, handle parent) { dict d; - return_value_policy policy_key = policy; - return_value_policy policy_value = policy; - if (!std::is_lvalue_reference::value) { - policy_key = return_value_policy_override::policy(policy_key); - policy_value = return_value_policy_override::policy(policy_value); - } + return_value_policy policy_key = return_value_policy_override::policy(policy); + return_value_policy policy_value = return_value_policy_override::policy(policy); for (auto &&kv : src) { auto key = reinterpret_steal(key_conv::cast(forward_like(kv.first), policy_key, parent)); auto value = reinterpret_steal(value_conv::cast(forward_like(kv.second), policy_value, parent)); @@ -137,14 +131,14 @@ template struct map_caster { return d.release(); } - PYBIND11_TYPE_CASTER(Type, _("Dict[") + key_conv::name + _(", ") + value_conv::name + _("]")); + PYBIND11_TYPE_CASTER(Type, 
_("Dict[") + key_conv::name() + _(", ") + value_conv::name() + _("]")); }; template struct list_caster { using value_conv = make_caster; bool load(handle src, bool convert) { - if (!isinstance(src) || isinstance(src)) + if (!isinstance(src)) return false; auto s = reinterpret_borrow(src); value.clear(); @@ -167,8 +161,7 @@ private: public: template static handle cast(T &&src, return_value_policy policy, handle parent) { - if (!std::is_lvalue_reference::value) - policy = return_value_policy_override::policy(policy); + policy = return_value_policy_override::policy(policy); list l(src.size()); size_t index = 0; for (auto &&value : src) { @@ -180,15 +173,12 @@ public: return l.release(); } - PYBIND11_TYPE_CASTER(Type, _("List[") + value_conv::name + _("]")); + PYBIND11_TYPE_CASTER(Type, _("List[") + value_conv::name() + _("]")); }; template struct type_caster> : list_caster, Type> { }; -template struct type_caster> - : list_caster, Type> { }; - template struct type_caster> : list_caster, Type> { }; @@ -209,9 +199,9 @@ private: public: bool load(handle src, bool convert) { - if (!isinstance(src)) + if (!isinstance(src)) return false; - auto l = reinterpret_borrow(src); + auto l = reinterpret_borrow(src); if (!require_size(l.size())) return false; size_t ctr = 0; @@ -237,7 +227,7 @@ public: return l.release(); } - PYBIND11_TYPE_CASTER(ArrayType, _("List[") + value_conv::name + _(_(""), _("[") + _() + _("]")) + _("]")); + PYBIND11_TYPE_CASTER(ArrayType, _("List[") + value_conv::name() + _(_(""), _("[") + _() + _("]")) + _("]")); }; template struct type_caster> @@ -284,7 +274,7 @@ template struct optional_caster { return true; } - PYBIND11_TYPE_CASTER(T, _("Optional[") + value_conv::name + _("]")); + PYBIND11_TYPE_CASTER(T, _("Optional[") + value_conv::name() + _("]")); }; #if PYBIND11_HAS_OPTIONAL @@ -364,7 +354,7 @@ struct variant_caster> { } using Type = V; - PYBIND11_TYPE_CASTER(Type, _("Union[") + detail::concat(make_caster::name...) + _("]")); + PYBIND11_TYPE_CASTER(Type, _("Union[") + detail::concat(make_caster::name()...) + _("]")); }; #if PYBIND11_HAS_VARIANT diff --git a/python/src/pybind11/stl_bind.h b/python/src/pybind11/stl_bind.h index 1f8725260..38dd68f69 100644 --- a/python/src/pybind11/stl_bind.h +++ b/python/src/pybind11/stl_bind.h @@ -122,7 +122,7 @@ void vector_modifiers(enable_if_t(new Vector()); - v->reserve(len_hint(it)); + v->reserve(len(it)); for (handle h : it) v->push_back(h.cast()); return v.release(); @@ -136,28 +136,6 @@ void vector_modifiers(enable_if_t()); - } - } catch (const cast_error &) { - v.erase(v.begin() + static_cast(old_size), v.end()); - try { - v.shrink_to_fit(); - } catch (const std::exception &) { - // Do nothing - } - throw; - } - }, - arg("L"), - "Extend the list by appending all the items in the given list" - ); - cl.def("insert", [](Vector &v, SizeType i, const T &x) { if (i > v.size()) @@ -601,15 +579,6 @@ class_ bind_map(handle scope, const std::string &name, Args&&. 
return_value_policy::reference_internal // ref + keepalive ); - cl.def("__contains__", - [](Map &m, const KeyType &k) -> bool { - auto it = m.find(k); - if (it == m.end()) - return false; - return true; - } - ); - // Assignment provided only if the type is copyable detail::map_assignment(cl); diff --git a/python/src/tensorflow.cc b/python/src/tensorflow.cc index d843fecab..018e4f74b 100644 --- a/python/src/tensorflow.cc +++ b/python/src/tensorflow.cc @@ -4,7 +4,6 @@ #include #include #include -#include "tensorflow/core/framework/tensor.h" #include "triton/codegen/selection/selection.h" #include "triton/runtime/function.h" #include "triton/lang/code_gen.h" @@ -22,15 +21,15 @@ using namespace triton; namespace rt = triton::runtime; -typedef std::vector tf_grid_t; -typedef std::function tf_grid_fn_ty; /* TF triton op properties */ -std::map id_grid_map; + +std::map id_grid_map; std::map id_fn_map; +std::map i64scalar_map; void register_grid(size_t id, - const tf_grid_fn_ty& grid_fn) { + const rt::function::grid_fn_ty& grid_fn) { id_grid_map[id] = grid_fn; } @@ -43,6 +42,17 @@ size_t register_fn(const std::string& src, return id; } +size_t make_scalar_id() { + return i64scalar_map.size(); +} + +bool has_scalar(size_t id) { + return i64scalar_map.find(id) != i64scalar_map.end(); +} + +int64_t retrieve_scalar(size_t id) { + return i64scalar_map.at(id); +} /* TF source-code generation */ @@ -112,13 +122,6 @@ void gen_make_handles(std::ostream &os, const std::vector& args) } void gen_make_launch_function(std::ostream &os, const std::vector& args) { - os << " rt::function::grid_fn_ty grid_fn = [&](const rt::function::options_t& opt) {" << std::endl; - os << " auto tmp = id_grid_map.at(id_)(opt);" << std::endl; - os << " rt::grid_t result;" << std::endl; - os << " for(auto& x: tmp) { result.push_back(x.scalar()()); }" << std::endl; - os << " return result; }; " << std::endl; - - os << " (*id_fn_map.at(id_))({"; for(unsigned i = 0; i < args.size() ; i++){ ir::argument *arg = args[i]; @@ -129,7 +132,7 @@ void gen_make_launch_function(std::ostream &os, const std::vector os << ", "; os << name; } - os << "}, grid_fn, stream); \n"; + os << "}, id_grid_map.at(id_), stream); \n"; } void gen_register_kernel_builder(std::ostream &os, const std::string &name, @@ -239,9 +242,7 @@ using GPUDevice = Eigen::GpuDevice; namespace rt = triton::runtime; namespace drv = triton::driver; -typedef std::vector tf_grid_t; -typedef std::function tf_grid_fn_ty; -extern std::map id_grid_map; +extern std::map id_grid_map; extern std::map id_fn_map; @@ -307,7 +308,8 @@ PYBIND11_MODULE(libtriton, m) { // bindings for triton classes pybind11::class_(m, "options") .def(pybind11::init<>()) - .def("D", &options_t::D); + .def("d", &options_t::D) + .def_readonly("num_warps", &options_t::num_warps); pybind11::class_(m, "options_space") .def(pybind11::init<>()) @@ -317,4 +319,6 @@ PYBIND11_MODULE(libtriton, m) { // hooks into triton constructs since frameworks may not use pybind11 m.def("register_grid", ®ister_grid); m.def("register_fn", ®ister_fn); + m.def("make_scalar_id", &make_scalar_id); + m.def("retrieve_scalar", &retrieve_scalar); } diff --git a/python/src/tensorflow/register_scalar.cc b/python/src/tensorflow/register_scalar.cc new file mode 100644 index 000000000..95eb3631f --- /dev/null +++ b/python/src/tensorflow/register_scalar.cc @@ -0,0 +1,37 @@ +#include +#include "tensorflow/core/framework/op_kernel.h" + +using namespace tensorflow; + +extern std::map i64scalar_map; + +class RegisterScalarOp : public OpKernel { +public: + 
+  explicit RegisterScalarOp(OpKernelConstruction* context)
+    : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("id", &id_));
+  }
+
+  // Reads the scalar int32 input and caches its value in the global
+  // i64scalar_map under this kernel's "id" attribute, so that host-side
+  // code can later retrieve it without a device round-trip.
+  void Compute(OpKernelContext* context) override {
+    // fetch input
+    const Tensor& x = context->input(0);
+    const int32* x_data = (const int32*)x.tensor_data().data();
+    const int32 x_rank = x.dims();
+    OP_REQUIRES(context, x_rank == 0, errors::InvalidArgument("Input must be a scalar"));
+    i64scalar_map[id_] = *x_data;
+    context->set_output(0, x);
+  }
+
+private:
+  int id_;
+};
+
+
+REGISTER_KERNEL_BUILDER(Name("RegisterScalar")
+                        .HostMemory("x")
+                        .Device(DEVICE_CPU), RegisterScalarOp);
+REGISTER_OP("RegisterScalar")
+  .Input("x: int32")
+  .Output("y: int32")
+  .Attr("id: int")
+;
diff --git a/python/triton/ops.py b/python/triton/ops.py
index 0099e1289..c38b1266f 100644
--- a/python/triton/ops.py
+++ b/python/triton/ops.py
@@ -104,9 +104,77 @@ def _cvt_to_def_str(obj):
                 tf.float64: 'double'}[obj]
   return str(obj)
 
-class op:
-
-  def _make_tensorflow_op(self, src, outputs, options):
+# Wraps a TF tensor registered through the RegisterScalar op above. While a
+# grid callback is running the underlying value is known on the host, so
+# arithmetic is done on the cached value instead of building TF graph nodes.
+class scalar:
+
+  def __init__(self, x):
+    self.id = libtriton.make_scalar_id()
+    self.handle = extra_ops.register_scalar(x, id=self.id)
+    self.assume_initialized = False
+
+  def set_assume_initialized(self):
+    self.assume_initialized = True
+
+  def unset_assume_initialized(self):
+    self.assume_initialized = False
+
+  def get_value(self):
+    if self.assume_initialized:
+      return libtriton.retrieve_scalar(self.id)
+    else:
+      return self.handle
+
+  def __add__(self, other):
+    return self.get_value() + other
+
+  def __radd__(self, other):
+    return other + self.get_value()
+
+  def __sub__(self, other):
+    return self.get_value() - other
+
+  def __rsub__(self, other):
+    return other - self.get_value()
+
+  def __mul__(self, other):
+    return self.get_value() * other
+
+  def __rmul__(self, other):
+    return other * self.get_value()
+
+  def __floordiv__(self, other):
+    return self.get_value() // other
+
+  def __rfloordiv__(self, other):
+    return other // self.get_value()
+
+  def __div__(self, other):
+    return self.get_value() / other
+
+  def __rdiv__(self, other):
+    return other / self.get_value()
+
+  def __truediv__(self, other):
+    return self.get_value().__truediv__(other)
+
+  def __rtruediv__(self, other):
+    return other.__truediv__(self.get_value())
+
+  def __neg__(self):
+    return -self.get_value()
+
+class lazy_shape:
+
+  def __init__(self, shape):
+    self.shape = shape
+
+  def __getitem__(self, key):
+    return scalar(self.shape[key])
+
+def shape(A) :
+  return lazy_shape(tf.shape(A))
+
+def _make_tensorflow_op(src, outputs, options):
   src, name = make_bindings(src, outputs, options)
   cache_path = make_cache_path(src)
   cpp, so = write_bindings(src, cache_path)
   build(cpp, cache_path)
@@ -114,15 +182,18 @@ class op:
   result = tf.load_op_library(so)
   return result.__dict__[name]
 
+
+class op:
+
   def __init__(self, src, outputs):
     self.fw_ops = dict()
     self.src = src
     self.outputs = outputs
     pass
 
-  def D(self, name):
-    pass
-
+  def __del__(self):
+    libtriton.unregister_grid(self.id)
+
   def __call__(self, *args, **kwargs):
     # recompilation key
     key = zip(kwargs.keys(), kwargs.values())
     # create a new op when non-iterable defines are different
     if key not in self.fw_ops:
       # code generation options
       defines = []
@@ -141,11 +212,23 @@ class op:
       opt.num_warps = [1, 2, 4, 8]
       # register framework op
       id = libtriton.register_fn(self.src, opt)
-      self.fw_ops[key] = (self._make_tensorflow_op(self.src, self.outputs, opt), id)
+      self.fw_ops[key] = (_make_tensorflow_op(self.src, self.outputs, opt), id)
     # retrieve framework op
-    op, id = self.fw_ops[key]
-    libtriton.register_grid(id, args[-1])
-    op_args = args[:-1]
+    op, id = self.fw_ops[key]
+    # create grid function
+    scalars = [x for x in args[:-1] if isinstance(x, scalar)]
+    def grid(opt):
+      for x in scalars:
+        x.set_assume_initialized()
+      result = args[-1](opt)
+      for x in scalars:
+        x.unset_assume_initialized()
+      return result
+    # register grid function
+    self.grid = grid
+    libtriton.register_grid(id, self.grid)
+    # create operands
+    op_args = [x.handle if isinstance(x, scalar) else x for x in args[:-1]]
     return op(*op_args, id=id)
 
@@ -158,4 +241,6 @@ def make_tensorflow_op(src, outputs, grids):
   return result.__dict__[name]
 
 def empty(shapes):
-  return extra_ops.alloc_empty(tf.stack(shapes))
+  args = [x.handle if isinstance(x, scalar) else x for x in shapes]
+  args = tf.stack(args)
+  return extra_ops.alloc_empty(args)
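Taken together, the pieces in this patch give the following call pattern. This is a minimal sketch, not part of the patch itself: it is modeled on python/examples/dot.py from a later patch in this series, `src` is assumed to hold Triton-C source whose output tile is named 'C', and M, N, K, lda, ldb, ldc are assumed to be plain Python ints here (lazy scalars from triton.shape work the same way):

    import tensorflow as tf
    import triton

    def cdiv(a, b):
        return (a + b - 1) // b

    matmul = triton.op(src, ['C'])          # framework op, compiled on demand
    a = tf.placeholder(tf.float16, shape=[M, K])
    b = tf.placeholder(tf.float16, shape=[N, K])
    c = triton.empty([M, N])
    # the trailing positional argument is the grid callback; keyword arguments
    # become preprocessor defines, and list values span the autotuning space
    y = matmul(a, b, c, M, N, K, lda, ldb, ldc,
               lambda opt: [cdiv(M, opt.d('TM')), cdiv(N, opt.d('TN'))],
               TYPE = tf.float16, TM = [32, 64, 128], TN = [32, 64, 128], TK = [32])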
From 9ece3eccc665a93e1279a85b3e261d9d97433ff4 Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Mon, 26 Aug 2019 17:21:09 -0700
Subject: [PATCH 330/494] some cleaning

---
 python/src/tensorflow.cc | 13 +++++---
 python/triton/ops.py     | 71 +++++++++++++++++++---------------------
 2 files changed, 43 insertions(+), 41 deletions(-)

diff --git a/python/src/tensorflow.cc b/python/src/tensorflow.cc
index 018e4f74b..893446c31 100644
--- a/python/src/tensorflow.cc
+++ b/python/src/tensorflow.cc
@@ -33,13 +33,16 @@ void register_grid(size_t id,
   id_grid_map[id] = grid_fn;
 }
 
-size_t register_fn(const std::string& src,
+void register_fn(size_t id,
+                 const std::string& src,
                  const rt::function::options_space_t& opt) {
-  size_t id = id_grid_map.size();
   bool is_inserted = id_fn_map.insert({id, new rt::function(src, opt)}).second;
   if(!is_inserted)
     assert(false);
-  return id;
+}
+
+size_t make_op_id() {
+  return id_fn_map.size();
 }
 
 size_t make_scalar_id() {
@@ -319,6 +322,8 @@ PYBIND11_MODULE(libtriton, m) {
   // hooks into triton constructs since frameworks may not use pybind11
   m.def("register_grid", &register_grid);
   m.def("register_fn", &register_fn);
+  m.def("make_op_id", &make_op_id);
   m.def("make_scalar_id", &make_scalar_id);
-  m.def("retrieve_scalar", &retrieve_scalar);
+  m.def("retrieve_scalar", &retrieve_scalar)
+  ;
 }
diff --git a/python/triton/ops.py b/python/triton/ops.py
index c38b1266f..89166067a 100644
--- a/python/triton/ops.py
+++ b/python/triton/ops.py
@@ -175,29 +175,37 @@ def shape(A) :
   return lazy_shape(tf.shape(A))
 
 def _make_tensorflow_op(src, outputs, options):
-    src, name = make_bindings(src, outputs, options)
-    cache_path = make_cache_path(src)
-    cpp, so = write_bindings(src, cache_path)
-    build(cpp, cache_path)
-    result = tf.load_op_library(so)
-    return result.__dict__[name]
+  src, name = make_bindings(src, outputs, options)
+  cache_path = make_cache_path(src)
+  cpp, so = write_bindings(src, cache_path)
+  build(cpp, cache_path)
+  result = tf.load_op_library(so)
+  return result.__dict__[name]
 
+# Wraps the user-provided grid callback so that lazy scalars are read from
+# the host-side cache while the callback runs.
+def _make_grid(args) :
+  scalars = [x for x in args[:-1] if isinstance(x, scalar)]
+  def grid(opt):
+    for x in scalars:
+      x.set_assume_initialized()
+    result = args[-1](opt)
+    for x in scalars:
+      x.unset_assume_initialized()
+    return result
+  return grid
 
 class op:
 
   def __init__(self, src, outputs):
+    self.fw_id = dict()
     self.fw_ops = dict()
+    self.fw_grids = dict()
     self.src = src
     self.outputs = outputs
     pass
 
-  def __del__(self):
-    libtriton.unregister_grid(self.id)
-
   def __call__(self, *args, **kwargs):
-    # recompilation key
+    # create a new op when defines are different
     key = zip(kwargs.keys(), kwargs.values())
-    # create a new op when non-iterable defines are different
     if key not in self.fw_ops:
       # code generation options
       defines = []
@@ -210,35 +218,24 @@ class op:
       opt = libtriton.options_space()
       opt.defines = defines
       opt.num_warps = [1, 2, 4, 8]
-      # register framework op
-      id = libtriton.register_fn(self.src, opt)
-      self.fw_ops[key] = (_make_tensorflow_op(self.src, self.outputs, opt), id)
+      # create unique id for this op
+      op_id = libtriton.make_op_id()
+      self.fw_id[key] = op_id
+      # register function
+      libtriton.register_fn(op_id, self.src, opt)
+      self.fw_ops[key] = _make_tensorflow_op(self.src, self.outputs, opt)
+
     # retrieve framework op
-    op, id = self.fw_ops[key]
-    # create grid function
-    scalars = [x for x in args[:-1] if isinstance(x, scalar)]
-    def grid(opt):
-      for x in scalars:
-        x.set_assume_initialized()
-      result = args[-1](opt)
-      for x in scalars:
-        x.unset_assume_initialized()
-      return result
-    # register grid function
-    self.grid = grid
-    libtriton.register_grid(id, self.grid)
+    op_id = self.fw_id[key]
+    op = self.fw_ops[key]
+    # register grid
+    grid = _make_grid(args)
+    libtriton.register_grid(op_id, grid)
+    self.fw_grids[key] = grid
     # create operands
     op_args = [x.handle if isinstance(x, scalar) else x for x in args[:-1]]
-    return op(*op_args, id=id)
-
-
-def make_tensorflow_op(src, outputs, grids):
-  src, name = make_bindings(src, outputs, grids)
-  cache_path = make_cache_path(src)
-  cpp, so = write_bindings(src, cache_path)
-  build(cpp, cache_path)
-  result = tf.load_op_library(so)
-  return result.__dict__[name]
+    # call framework op
+    return op(*op_args, id=op_id)
 
 def empty(shapes):
   args = [x.handle if isinstance(x, scalar) else x for x in shapes]
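The net effect of this cleaning is a small id-based protocol between python/triton/ops.py and libtriton. The following sketch only restates the calls made in op.__call__ above, using the names bound in PYBIND11_MODULE:

    op_id = libtriton.make_op_id()            # unique id for this (src, options) pair
    libtriton.register_fn(op_id, src, opt)    # C++ side owns the rt::function
    libtriton.register_grid(op_id, grid)      # grid callback, looked up by id
    # the generated TensorFlow kernel receives id=op_id as an op attribute and
    # fetches both the function and the grid from the global maps at launch time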
From 7cb73f66e25b78e60d4cfd01dc32ce5a65659b5f Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Mon, 26 Aug 2019 19:25:58 -0700
Subject: [PATCH 331/494] testing some register gradient

---
 lib/driver/handle.cc     | 10 ++++++++--
 python/examples/dot.py   | 22 ++++++++++++++++------
 python/src/tensorflow.cc | 19 +++++++++----------
 python/triton/ops.py     | 15 ++++++++++++++-
 4 files changed, 47 insertions(+), 19 deletions(-)

diff --git a/lib/driver/handle.cc b/lib/driver/handle.cc
index 20ae0f90d..a0013f347 100755
--- a/lib/driver/handle.cc
+++ b/lib/driver/handle.cc
@@ -21,6 +21,7 @@
  */

 #include "triton/driver/handle.h"
+#include "triton/driver/error.h"

 namespace triton
@@ -68,8 +69,13 @@ handle::handle(): has_ownership_(false){
 }

 template
 handle::~handle(){
-  if(has_ownership_ && h_ && h_.unique())
-    _delete(*h_);
+  try{
+    if(has_ownership_ && h_ && h_.unique())
+      _delete(*h_);
+  }catch(const exception::cuda::deinitialized&){
+    // order of destruction for global variables
+    // is not guaranteed
+  }
 }

 template class handle;
diff --git a/python/examples/dot.py b/python/examples/dot.py
index 351a6d3dc..779f59408 100644
--- a/python/examples/dot.py
+++ b/python/examples/dot.py
@@ -84,7 +84,7 @@ def cdiv(a, b):

 class dot:

-  def __init__(self, trans_a = False, trans_b = True):
+  def __init__(self, trans_a = False, trans_b = False):
     self.dot = triton.op(src, ['C'])
     self.trans_a = trans_a
     self.trans_b = trans_b
@@ -102,26 +102,36 @@ class dot:
     return self.dot(a, b, c, M, N, K, lda, ldb, ldc,
                     lambda opt: [cdiv(M, opt.d('TM')), cdiv(N, opt.d('TN'))],
                     AT = self.trans_a, BT = self.trans_b, TYPE = tf.float16,
-                    TM = [32, 64, 128], TN = [32, 64, 128], TK = [32])
+                    TM = [128], TN = [ 128], TK = [32])

-dot_tn = dot()
+dot_nt = dot(False, True)
+dot_nn = dot(False, False)
+dot_tn = dot(True, False)
+dot_tt = dot(True, True)
+
+@triton.register_gradient(dot)
+def _dot_grad(op, dy):
+  a = op.inputs[0]
+  b = op.inputs[1]
+  return [dot_tn(dy, b), dot_nt(a, dy), None, None, None, None, None, None, None]

 def run_dot():
   M, N, K = 128, 128, 128
   a = tf.placeholder(tf.float16, shape=[M, K])
   b = tf.placeholder(tf.float16, shape=[N, K])
   # c = tf.matmul(a, b, transpose_a=True)
-  c = dot_tn(a, b)
+  c = dot_nn(a, b)
+  grads = tf.gradients(c, [a])
   # Reference
   ha = np.random.rand(M, K).astype(np.float16)
   hb = np.random.rand(N, K).astype(np.float16)
   # Run
   sess = tf.InteractiveSession()
   sess.run(tf.global_variables_initializer())
-  result = sess.run([c], feed_dict = {a: ha,
+  result = sess.run([grads], feed_dict = {a: ha,
                                       b: hb})[0]
   # Test
-  hresult = np.dot(ha.T, hb).T
+  hresult = np.dot(ha.T, hb.T).T
   dif = np.abs(result - hresult)
   np.savetxt('dif.dat', dif, '%2.4f')
   print(hresult)
diff --git a/python/src/tensorflow.cc b/python/src/tensorflow.cc
index 893446c31..e71f6a77a 100644
--- a/python/src/tensorflow.cc
+++ b/python/src/tensorflow.cc
@@ -24,21 +24,19 @@ namespace rt = triton::runtime;

 /* TF triton op properties */

-std::map<size_t, rt::function::grid_fn_ty> id_grid_map;
-std::map<size_t, rt::function*> id_fn_map;
+std::map<size_t, std::shared_ptr<rt::function::grid_fn_ty>> id_grid_map;
+std::map<size_t, std::shared_ptr<rt::function>> id_fn_map;
 std::map<size_t, int64_t> i64scalar_map;

 void register_grid(size_t id,
                    const rt::function::grid_fn_ty& grid_fn) {
-  id_grid_map[id] = grid_fn;
+  id_grid_map[id].reset(new rt::function::grid_fn_ty(grid_fn));
 }

 void register_fn(size_t id,
                 const std::string& src,
                 const rt::function::options_space_t& opt) {
-  bool is_inserted = id_fn_map.insert({id, new rt::function(src, opt)}).second;
-  if(!is_inserted)
-    assert(false);
+  id_fn_map[id].reset(new rt::function(src, opt));
 }

 size_t make_op_id() {
@@ -135,7 +133,7 @@ void gen_make_launch_function(std::ostream &os, const std::vector<ir::argument*>& args) {
       os << ", ";
     os << name;
   }
-  os << "}, id_grid_map.at(id_), stream); \n";
+  os << "}, *id_grid_map.at(id_), stream); \n";
 }

@@ -214,7 +212,7 @@ std::tuple
-  auto ir = std::unique_ptr<ir::module>(new ir::module("", ctx));
+  auto ir = std::shared_ptr<ir::module>(new ir::module("", ctx));
   Generator gen(&parser);
   gen.Gen(&*ir);

@@ -245,8 +243,8 @@ using GPUDevice = Eigen::GpuDevice;
 namespace rt = triton::runtime;
 namespace drv = triton::driver;

-extern std::map<size_t, rt::function::grid_fn_ty> id_grid_map;
-extern std::map<size_t, rt::function*> id_fn_map;
+extern std::map<size_t, std::shared_ptr<rt::function::grid_fn_ty>> id_grid_map;
+extern std::map<size_t, std::shared_ptr<rt::function>> id_fn_map;

 class )" << opname << R"(: public OpKernel {

@@ -294,6 +292,7 @@
 oss << R"(
 )";
   gen_register_op(oss, cc_name, fn->args(), outputs);
+  return {oss.str(), name};
 }
diff --git a/python/triton/ops.py b/python/triton/ops.py
index 89166067a..9b72aad0a 100644
--- a/python/triton/ops.py
+++ b/python/triton/ops.py
@@ -13,6 +13,8 @@ import setuptools
 import libtriton
 # frameworks
 import tensorflow as tf
+from tensorflow.python.framework import ops
+

 extra_ops = tf.load_op_library('/home/philippe/development/triton/python/build/lib.linux-x86_64-3.6/libextra_tf_ops.so')

@@ -230,13 +232,24 @@ class op:
     op = self.fw_ops[key]
     # register grid
     grid = _make_grid(args)
-    libtriton.register_grid(op_id, grid)
     self.fw_grids[key] = grid
+    libtriton.register_grid(op_id, self.fw_grids[key])
     # create operands
     op_args = [x.handle if isinstance(x, scalar) else x for x in args[:-1]]
     # call framework op
     return op(*op_args, id=op_id)

+
+class register_gradient:
+
+  def __init__(self, op):
+    self.op = op
+
+  def __call__(self, f):
+    name = 'Dot'
+    ops.RegisterGradient(name)(f)
+
+
 def empty(shapes):
   args = [x.handle if isinstance(x, scalar) else x for x in shapes]
   args = tf.stack(args)
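The register_gradient helper above is a thin wrapper over TensorFlow's standard gradient registry; the hardcoded name 'Dot' stands in for the generated op's name. Written directly against the TF API, the registration performed in python/examples/dot.py amounts to the following sketch (dot_tn and dot_nt as defined in that file):

    from tensorflow.python.framework import ops

    @ops.RegisterGradient('Dot')
    def _dot_grad(op, dy):
        a, b = op.inputs[0], op.inputs[1]
        # one entry per op input: gradients for a and b, then None for the
        # output and the shape/stride arguments
        return [dot_tn(dy, b), dot_nt(a, dy)] + [None] * 7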
From b4ae06a7142231e63aa059da1c4d41d0997353d3 Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Mon, 26 Aug 2019 20:38:39 -0700
Subject: [PATCH 332/494] tracking down performance regression

---
 examples/cpp/dot.cc               | 28 
++++++++++++++-------------- lib/codegen/analysis/alignment.cc | 3 ++- lib/driver/module.cc | 2 +- lib/runtime/function.cc | 3 +++ 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/examples/cpp/dot.cc b/examples/cpp/dot.cc index 6e40f79d2..409b77217 100644 --- a/examples/cpp/dot.cc +++ b/examples/cpp/dot.cc @@ -153,10 +153,10 @@ perf_t do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int opt.defines.push_back({"AT", {""}}); if(BT) opt.defines.push_back({"BT", {""}}); - opt.defines.push_back({"TM", {"32"}}); - opt.defines.push_back({"TN", {"32"}}); + opt.defines.push_back({"TM", {"128"}}); + opt.defines.push_back({"TN", {"128"}}); opt.defines.push_back({"TK", {"32"}}); - opt.num_warps = {1, 2, 4, 8}; + opt.num_warps = {4}; rt::function function(src, opt); auto ceil = [](size_t x, size_t y) { return (x + y - 1) / y; }; @@ -169,16 +169,16 @@ perf_t do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int // test stream->synchronize(); - stream->read(dc, true, 0, hc); - std::vector rc(hc.size()); - cpu_ref(AT, BT, M, N, K, rc, ha, hb); - for(size_t i = 0; i < M*N; i++) - if(std::isinf(hc[i]) || std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-2){ - std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; - exit(EXIT_FAILURE); - } - std::cout << hc[0] << " " << std::endl; - std::cout << "Pass!" << std::endl; +// stream->read(dc, true, 0, hc); +// std::vector rc(hc.size()); +// cpu_ref(AT, BT, M, N, K, rc, ha, hb); +// for(size_t i = 0; i < M*N; i++) +// if(std::isinf(hc[i]) || std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-2){ +// std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; +// exit(EXIT_FAILURE); +// } +// std::cout << hc[0] << " " << std::endl; +// std::cout << "Pass!" 
<< std::endl; // clean-up delete dc; @@ -208,7 +208,7 @@ int main() { // shapes to benchmark std::vector configs = { // {false, false, 8192, 512, 512}, - {false, true, 128, 128, 128} + {false, true, 8192, 8192, 8192} // {false, true, 128, 128, 128}, // {false, false, 128, 128, 128}, // {true, false, 128, 128, 128}, diff --git a/lib/codegen/analysis/alignment.cc b/lib/codegen/analysis/alignment.cc index 69cf3479c..276422d10 100644 --- a/lib/codegen/analysis/alignment.cc +++ b/lib/codegen/analysis/alignment.cc @@ -4,6 +4,7 @@ #include "triton/ir/basic_block.h" #include "triton/ir/instructions.h" #include "triton/ir/type.h" +#include namespace triton { namespace codegen{ @@ -304,7 +305,7 @@ void alignment_info::run(ir::module &mod) { for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i: block->get_inst_list()){ populate_max_contiguous(i); -// std::cout << i->get_name() << " " << is_constant_.at(i).num_cst << " " << starting_multiple_.at(i) << " " << max_contiguous_.at(i) << std::endl; + std::cout << i->get_name() << " " << is_constant_.at(i).num_cst << " " << starting_multiple_.at(i) << " " << max_contiguous_.at(i) << std::endl; } } diff --git a/lib/driver/module.cc b/lib/driver/module.cc index 486d7d588..2ed0160f7 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -240,7 +240,7 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ -// std::cout << source << std::endl; + std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index fdc9b6d15..f2f0a42bb 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -14,6 +14,7 @@ #include "triton/driver/module.h" #include "triton/ir/module.h" #include "triton/ir/function.h" +#include "triton/ir/print.h" #include "triton/tools/bench.hpp" #include "llvm/IR/Module.h" @@ -205,6 +206,8 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c shmem_allocation.run(); shmem_barriers.run(module); } + dce.run(module); + ir::print(module, std::cout); alignment_info.run(module); vectorize.run(module); dce.run(module); From 37cbcfabd05dca3cbd5fe4a2c7141a6c0a3c3884 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 26 Aug 2019 22:48:15 -0700 Subject: [PATCH 333/494] [examples] back to 96 TFLOPS on V100 --- examples/cpp/cuda.h | 4 +- examples/cpp/dot.cc | 20 +++-- .../triton/codegen/transform/reassociate.h | 3 +- include/triton/lang/ast.h | 4 +- include/triton/lang/parser.h | 1 + include/triton/lang/token.h | 1 + include/triton/tools/bench.hpp | 2 +- lib/codegen/analysis/alignment.cc | 4 +- lib/codegen/transform/reassociate.cc | 22 +++++- lib/driver/module.cc | 2 +- lib/lang/ast.cc | 77 ++++++++++++------- lib/lang/code_gen.cc | 29 ++++++- lib/lang/parser.cc | 15 +++- lib/lang/token.cc | 1 - lib/runtime/function.cc | 9 ++- 15 files changed, 140 insertions(+), 54 deletions(-) diff --git a/examples/cpp/cuda.h b/examples/cpp/cuda.h index 5f03870f5..fef17dc55 100644 --- a/examples/cpp/cuda.h +++ b/examples/cpp/cuda.h @@ -49,7 +49,7 @@ inline size_t size_of(DType dtype){ } } -std::vector gather_all_algos() { +inline std::vector 
gather_all_algos() {
   std::vector<cublasGemmAlgo_t> result;
   // non-tensor ops
   for(int i = -1; i < 24; i++)
     result.push_back((cublasGemmAlgo_t)i);
   // tensor ops
   for(int i = 99; i < 116; i++)
     result.push_back((cublasGemmAlgo_t)i);
   return result;
 }
@@ -124,7 +124,7 @@ inline cublasStatus_t cublasGemmEx(cublasHandle_t handle, cudaDataType cudt, cub

 /* Get cuBLAS handle */
-cublasHandle_t cublasGetHandle(triton::driver::stream* stream) {
+inline cublasHandle_t cublasGetHandle(triton::driver::stream* stream) {
   static std::map<CUstream, cublasHandle_t> cache;
   CUstream key = *stream->cu();

diff --git a/examples/cpp/dot.cc b/examples/cpp/dot.cc
index 409b77217..7d5a44324 100644
--- a/examples/cpp/dot.cc
+++ b/examples/cpp/dot.cc
@@ -75,17 +75,21 @@ void dot(TYPE * A __noalias __readonly __aligned(16),
   float xc[TM, TN] = 0;
 #ifdef AT
   TYPE* pa[TK, TM] = A + rka[:, newaxis] + rxa[newaxis, :]*lda;
-  TYPE a[TK, TM] = *pa;
+  bool checka[TK, TM] = rka[:, newaxis] < K;
+  TYPE a[TK, TM] = checka ? *pa : 0;
 #else
   TYPE* pa[TM, TK] = A + rka[newaxis, :]*lda + rxa[:, newaxis];
-  TYPE a[TM, TK] = *pa;
+  bool checka[TM, TK] = rka[newaxis, :] < K;
+  TYPE a[TM, TK] = checka ? *pa : 0;
 #endif
 #ifdef BT
   TYPE* pb[TN, TK] = B + rkb[newaxis, :]*ldb + ryb[:, newaxis];
-  TYPE b[TN, TK] = *pb;
+  bool checkb[TN, TK] = rkb[newaxis, :] < K;
+  TYPE b[TN, TK] = checkb ? *pb : 0;
 #else
   TYPE* pb[TK, TN] = B + rkb[:, newaxis] + ryb[newaxis, :]*ldb;
-  TYPE b[TK, TN] = *pb;
+  bool checkb[TK, TN] = rkb[:, newaxis] < K;
+  TYPE b[TK, TN] = checkb ? *pb : 0;
 #endif
   for(int k = K; k > 0; k = k - TK){
     xc = USEA @ USEB + xc;
@@ -99,8 +103,10 @@ void dot(TYPE * A __noalias __readonly __aligned(16),
 #else
     pb = pb + TK;
 #endif
-    a = *pa;
-    b = *pb;
+    checka = k > TK;
+    checkb = k > TK;
+    a = checka ? *pa : 0;
+    b = checkb ? *pb : 0;
   }
   int rxc[TM] = ridx * TM + (0 ... TM);
   int ryc[TN] = ridy * TN + (0 ... TN);
@@ -109,7 +115,7 @@ void dot(TYPE * A __noalias __readonly __aligned(16),
   bool checkc0[TM] = rxc < M;
   bool checkc1[TN] = ryc < N;
   bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];
-  *pc = c;
+  // predicated store: only lanes where checkc is set write back to pc
+  *?(checkc) pc = c;
 }
 )";

diff --git a/include/triton/codegen/transform/reassociate.h b/include/triton/codegen/transform/reassociate.h
index f7b843846..075446e6f 100644
--- a/include/triton/codegen/transform/reassociate.h
+++ b/include/triton/codegen/transform/reassociate.h
@@ -37,11 +37,12 @@ private:
   ir::value *reassociate_ptr(ir::getelementptr_inst* pz, ir::builder &builder, std::map &offsets);

 public:
-  reassociate(analysis::grids *params);
+  reassociate(analysis::alignment_info* align, analysis::grids *params);
   void run(ir::module& module);

 private:
   analysis::grids* params_;
+  analysis::alignment_info* align_;
 };

 }
diff --git a/include/triton/lang/ast.h b/include/triton/lang/ast.h
index 710c67e4a..8bf96a96b 100644
--- a/include/triton/lang/ast.h
+++ b/include/triton/lang/ast.h
@@ -360,11 +360,12 @@ public:
     switch (op_) {
     case '.': return !Type()->ToArray() && lhs_->IsLVal();
     case ']': return !Type()->ToArray();
+    case Token::MASKED_DEREF: return true;
     default: return false;
     }
   }
   ArithmType* Convert();
-  void Broadcast();
+  static void Broadcast(Expr* loc, Expr*& lhs, Expr*& rhs, QualType &type);

   virtual void TypeChecking();
   void SubScriptingOpTypeChecking();
@@ -374,6 +375,7 @@ public:
   void ShiftOpTypeChecking();
   void RangeOpTypeChecking();
   void MatmulOpTypeChecking();
+  void MaskedDerefOpTypeChecking();
   void RelationalOpTypeChecking();
   void EqualityOpTypeChecking();
   void BitwiseOpTypeChecking();
diff --git a/include/triton/lang/parser.h b/include/triton/lang/parser.h
index 63e312026..05edbb159 100644
--- a/include/triton/lang/parser.h
+++ b/include/triton/lang/parser.h
@@ -84,6 +84,7 @@ public:
   Constant* 
ParseAlignof(); UnaryOp* ParsePrefixIncDec(const Token* tok); UnaryOp* ParseUnaryOp(const Token* tok, int op); + Expr* ParseDerefOp(const Token* tok); QualType ParseTypeName(); Expr* ParseCastExpr(); diff --git a/include/triton/lang/token.h b/include/triton/lang/token.h index a920e082c..2f6b57cfc 100644 --- a/include/triton/lang/token.h +++ b/include/triton/lang/token.h @@ -94,6 +94,7 @@ public: OR_ASSIGN, ELLIPSIS, + MASKED_DEREF, // Punctuators end // KEYWORD BEGIN diff --git a/include/triton/tools/bench.hpp b/include/triton/tools/bench.hpp index 99a04125e..56016638b 100644 --- a/include/triton/tools/bench.hpp +++ b/include/triton/tools/bench.hpp @@ -41,7 +41,7 @@ inline double bench(std::function const & op, driver::stream * stream) while(total_time*1e-9 < 1e-3){ float norm = 1; // normalize clock if possible to reduce noise in auto-tuning -// if(auto cu_device = dynamic_cast(device)) +// if(auto cu_device = dynamic_cast(stream->context()->device())) // norm = (float)cu_device->current_sm_clock()/cu_device->max_sm_clock(); tmr.start(); op(); diff --git a/lib/codegen/analysis/alignment.cc b/lib/codegen/analysis/alignment.cc index 276422d10..98d4a110f 100644 --- a/lib/codegen/analysis/alignment.cc +++ b/lib/codegen/analysis/alignment.cc @@ -111,8 +111,9 @@ unsigned alignment_info::populate_max_contiguous(ir::value *v){ if(!v->get_type()->is_tile_ty()) return cache(1); auto shapes = v->get_type()->get_tile_shapes(); - if(dynamic_cast(v)) + if(dynamic_cast(v)){ return cache(shapes[0]); + } if(auto *x = dynamic_cast(v)){ ir::value *op = x->get_operand(0); if(op->get_type()->is_tile_ty()){ @@ -305,7 +306,6 @@ void alignment_info::run(ir::module &mod) { for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i: block->get_inst_list()){ populate_max_contiguous(i); - std::cout << i->get_name() << " " << is_constant_.at(i).num_cst << " " << starting_multiple_.at(i) << " " << max_contiguous_.at(i) << std::endl; } } diff --git a/lib/codegen/transform/reassociate.cc b/lib/codegen/transform/reassociate.cc index c5e76f18a..532c8e186 100644 --- a/lib/codegen/transform/reassociate.cc +++ b/lib/codegen/transform/reassociate.cc @@ -93,6 +93,9 @@ ir::value *reassociate::reassociate_idx(ir::value *old_value, params_->copy(new_value, old_value); params_->copy(new_lhs, old_value); params_->copy(new_rhs, old_value); + align_->copy(new_value, old_value); + align_->copy(new_lhs, old_value); + align_->copy(new_rhs, old_value); } } } @@ -130,6 +133,9 @@ ir::value *reassociate::reassociate_idx(ir::value *old_value, params_->copy(new_value, old_value); params_->copy(((ir::instruction*)new_value)->get_operand(0), old_value); params_->copy(((ir::instruction*)new_value)->get_operand(1), old_value); + align_->copy(new_value, old_value); + align_->copy(((ir::instruction*)new_value)->get_operand(0), old_value); + align_->copy(((ir::instruction*)new_value)->get_operand(1), old_value); } } @@ -155,8 +161,8 @@ ir::value *reassociate::reassociate_idx(ir::value *old_value, return new_value; } -reassociate::reassociate(analysis::grids* params) - : params_(params) +reassociate::reassociate(analysis::alignment_info *align, analysis::grids* params) + : params_(params), align_(align) { } @@ -185,6 +191,9 @@ void reassociate::run(ir::module &mod) { params_->copy(dyn_range, old_range); params_->copy(static_range, old_range); params_->copy(new_range, old_range); + align_->copy(dyn_range, old_range); + align_->copy(static_range, old_range); + align_->copy(new_range, old_range); } } @@ -217,6 +226,8 @@ void 
reassociate::run(ir::module &mod) { ir::value *sta_ptr = builder.create_gep(dyn_ptr, {sta}); params_->copy(dyn_ptr, pz); params_->copy(sta_ptr, pz); + align_->copy(dyn_ptr, pz); + align_->copy(sta_ptr, pz); pz->replace_all_uses_with(sta_ptr); infos[sta_ptr].dyn_ptr = dyn_ptr; infos[sta_ptr].sta_ptr = (ir::getelementptr_inst*)sta_ptr; @@ -233,6 +244,8 @@ void reassociate::run(ir::module &mod) { ir::value *pz_sta = builder.create_gep(pz_dyn, {cst}, pz->get_name()); params_->copy(pz_dyn, pz); params_->copy(pz_sta, pz); + align_->copy(pz_dyn, pz); + align_->copy(pz_sta, pz); pz->replace_all_uses_with(pz_sta); infos[pz_sta].dyn_ptr = pz_dyn; infos[pz_sta].sta_ptr = (ir::getelementptr_inst*)pz_sta; @@ -283,6 +296,11 @@ void reassociate::run(ir::module &mod) { params_->copy(neg_off, off); params_->copy(phi_dyn, phi); params_->copy(phi_sta, phi); + align_->copy(pz_dyn, pz); + align_->copy(((ir::instruction*)neg_off)->get_operand(0), off); + align_->copy(neg_off, off); + align_->copy(phi_dyn, phi); + align_->copy(phi_sta, phi); infos[phi_sta].dyn_ptr = phi_dyn; infos[phi_sta].sta_ptr = (ir::getelementptr_inst*)phi_sta; replaced.insert(phi); diff --git a/lib/driver/module.cc b/lib/driver/module.cc index 2ed0160f7..486d7d588 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -240,7 +240,7 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ - std::cout << source << std::endl; +// std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/lib/lang/ast.cc b/lib/lang/ast.cc index 7d7e28471..e1f008c36 100644 --- a/lib/lang/ast.cc +++ b/lib/lang/ast.cc @@ -190,6 +190,7 @@ BinaryOp* BinaryOp::New(const Token* tok, int op, Expr* lhs, Expr* rhs) { case Token::LOGICAL_OR: case Token::ELLIPSIS: case Token::MATMUL: + case Token::MASKED_DEREF: break; default: assert(0); @@ -218,22 +219,22 @@ ArithmType* BinaryOp::Convert() { return maxType; } -void BinaryOp::Broadcast() { - auto lhsType = lhs_->Type()->ToTile(); - auto rhsType = rhs_->Type()->ToTile(); - auto eleType = type_->ScalarType(); +void BinaryOp::Broadcast(Expr* loc, Expr *&lhs, Expr *&rhs, QualType& type) { + auto lhsType = lhs->Type()->ToTile(); + auto rhsType = rhs->Type()->ToTile(); + auto eleType = type->ScalarType(); assert(eleType); if(!lhsType && !rhsType) return ; else if(lhsType && !rhsType){ - type_ = TileType::New(lhsType->Shape(), eleType); - ::Type* rtype = TileType::New(lhsType->Shape(), rhs_->Type()->ScalarType()); - rhs_ = UnaryOp::New(Token::CAST, rhs_, rtype); + type = TileType::New(lhsType->Shape(), eleType); + ::Type* rtype = TileType::New(lhsType->Shape(), rhs->Type()->ScalarType()); + rhs = UnaryOp::New(Token::CAST, rhs, rtype); } else if(!lhsType && rhsType){ - type_ = TileType::New(rhsType->Shape(), eleType); - ::Type* ltype = TileType::New(rhsType->Shape(), lhs_->Type()->ScalarType()); - lhs_ = UnaryOp::New(Token::CAST, lhs_, ltype); + type = TileType::New(rhsType->Shape(), eleType); + ::Type* ltype = TileType::New(rhsType->Shape(), lhs->Type()->ScalarType()); + lhs = UnaryOp::New(Token::CAST, lhs, ltype); } else { @@ -257,17 +258,17 @@ void BinaryOp::Broadcast() { else if(lhsShape[i] == 
rhsShape[i]) retShape[i] = lhsShape[i]; else - Error(this, "cannot broadcast dimension %d " + Error(loc, "cannot broadcast dimension %d " "for operands of shape %d and %d", i, lhsShape[i], rhsShape[i]); } ::Type* ltype = TileType::New(retShape, lhsType->ScalarType()); ::Type* rtype = TileType::New(retShape, rhsType->ScalarType()); - type_ = TileType::New(retShape, eleType); + type = TileType::New(retShape, eleType); if(retShape != lhsShape) - lhs_ = UnaryOp::New(Token::CAST, lhs_, ltype); + lhs = UnaryOp::New(Token::CAST, lhs, ltype); if(retShape != rhsShape) - rhs_ = UnaryOp::New(Token::CAST, rhs_, rtype); + rhs = UnaryOp::New(Token::CAST, rhs, rtype); } } @@ -340,6 +341,9 @@ void BinaryOp::TypeChecking() { case Token::MATMUL: return MatmulOpTypeChecking(); + case Token::MASKED_DEREF: + return MaskedDerefOpTypeChecking(); + default: assert(0); } @@ -375,7 +379,7 @@ void BinaryOp::MultiOpTypeChecking() { Error(this, "operands of '%%' should be integers"); } type_ = Convert(); - Broadcast(); + Broadcast(this, lhs_, rhs_, type_); } @@ -425,7 +429,7 @@ void BinaryOp::AdditiveOpTypeChecking() { } type_ = Convert(); } - Broadcast(); + Broadcast(this, lhs_, rhs_, type_); } void BinaryOp::RangeOpTypeChecking() { @@ -443,6 +447,19 @@ void BinaryOp::RangeOpTypeChecking() { type_ = TileType::New(TileType::ShapeInt{len}, lhs_->Type()); } +void BinaryOp::MaskedDerefOpTypeChecking() { + ::Type* lhsScalType = TryExtractScalarType(this, lhs_); + ::Type* rhsScalType = TryExtractScalarType(this, rhs_); + auto lhsType = lhsScalType->ToArithm(); + auto rhsType = rhsScalType->ToPointer(); + if (!rhsType) + Error(this, "pointer expected for deref pointer in operator '*?'"); + if (!lhsType || (lhsType && !lhsType->IsBool())) + Error(this, "bool expected for deref mask in operator '*?'"); + type_ = ScalarOrLikeTile(rhs_, rhsType->Derived().GetPtr()); + Broadcast(this, lhs_, rhs_, type_); +} + void BinaryOp::MatmulOpTypeChecking() { auto lhsType = lhs_->Type()->ToTile(); auto rhsType = rhs_->Type()->ToTile(); @@ -477,7 +494,7 @@ void BinaryOp::ShiftOpTypeChecking() { lhs_ = Expr::MayCast(lhs_, ScalarOrLikeTile(lhs_, ArithmType::IntegerPromote(lhsType))); rhs_ = Expr::MayCast(rhs_, ScalarOrLikeTile(rhs_, ArithmType::IntegerPromote(rhsType))); type_ = lhs_->Type(); - Broadcast(); + Broadcast(this, lhs_, rhs_, type_); } @@ -493,7 +510,7 @@ void BinaryOp::RelationalOpTypeChecking() { Convert(); } type_ = ArithmType::New(T_INT); - Broadcast(); + Broadcast(this, lhs_, rhs_, type_); } @@ -508,7 +525,7 @@ void BinaryOp::EqualityOpTypeChecking() { Convert(); } type_ = ArithmType::New(T_INT); - Broadcast(); + Broadcast(this, lhs_, rhs_, type_); } @@ -518,7 +535,7 @@ void BinaryOp::BitwiseOpTypeChecking() { if (!lhsScalType->IsInteger() || !rhsScalType->IsInteger()) Error(this, "operands of '&' should be integer"); type_ = Convert(); - Broadcast(); + Broadcast(this, lhs_, rhs_, type_); } @@ -528,7 +545,7 @@ void BinaryOp::LogicalOpTypeChecking() { if (!lhsScalType->IsScalar() || !rhsScalType->IsScalar()) Error(this, "the operand should be arithmetic type or pointer"); type_ = ArithmType::New(T_INT); - Broadcast(); + Broadcast(this, lhs_, rhs_, type_); } @@ -548,10 +565,9 @@ void BinaryOp::AssignOpTypeChecking() { // The other constraints are lefted to cast operator rhs_ = Expr::MayCast(rhs_, ScalarOrLikeTile(rhs_, lhsScalType)); type_ = lhs_->Type(); - Broadcast(); + Broadcast(this, lhs_, rhs_, type_); } - /* * Unary Operators */ @@ -734,8 +750,8 @@ ConditionalOp* ConditionalOp::New(const Token* tok, ArithmType* 
ConditionalOp::Convert() { - auto lhsType = exprTrue_->Type()->ToArithm(); - auto rhsType = exprFalse_->Type()->ToArithm(); + auto lhsType = exprTrue_->Type()->ScalarType()->ToArithm(); + auto rhsType = exprFalse_->Type()->ScalarType()->ToArithm(); assert(lhsType && rhsType); auto type = ArithmType::MaxType(lhsType, rhsType); if (lhsType != type) { // Pointer comparation is enough! @@ -750,18 +766,21 @@ ArithmType* ConditionalOp::Convert() { void ConditionalOp::TypeChecking() { - if (!cond_->Type()->IsScalar()) { - Error(cond_->Tok(), "scalar is required"); + auto condScalarType = TryExtractScalarType(this, cond_); + + if (!condScalarType) { + Error(cond_->Tok(), "condition must be tile or scalar"); } - auto lhsType = exprTrue_->Type(); - auto rhsType = exprFalse_->Type(); + auto lhsType = TryExtractScalarType(this, exprTrue_); + auto rhsType = TryExtractScalarType(this, exprFalse_); if (lhsType->ToArithm() && rhsType->ToArithm()) { type_ = Convert(); } else { EnsureCompatibleOrVoidPointer(lhsType, rhsType); type_ = lhsType; } + BinaryOp::Broadcast(this, exprFalse_, exprTrue_, type_); } diff --git a/lib/lang/code_gen.cc b/lib/lang/code_gen.cc index 451015d84..c2c691cb5 100644 --- a/lib/lang/code_gen.cc +++ b/lib/lang/code_gen.cc @@ -55,6 +55,10 @@ void Generator::VisitBinaryOp(BinaryOp* binary) { _0 = bld_->create_splat(_0, ret_ty->get_tile_shapes()); return set_ret(bld_->create_dot(lhs, rhs, _0)); } + case Token::MASKED_DEREF: { + ir::type* ret_ty = GenIRType(binary->Type(), *ctx_); + return set_ret(bld_->create_masked_load(rhs, lhs, ir::undef_value::get(ret_ty))); + } case Token::ELLIPSIS: { auto clhs = dynamic_cast(lhs); auto crhs = dynamic_cast(rhs); @@ -176,6 +180,21 @@ void Generator::VisitUnaryOp(UnaryOp* unary) { } void Generator::VisitConditionalOp(ConditionalOp* condOp) { +// auto &instructions = bld_->get_insert_block()->get_inst_list(); + VisitExpr(condOp->cond_); + ir::value* cond = ret_; + VisitExpr(condOp->exprTrue_); + ir::value* true_val = ret_; + VisitExpr(condOp->exprFalse_); + ir::value* false_val = ret_; + if(ir::load_inst* ld = dynamic_cast(true_val)) { + ir::value* new_ld = bld_->create_masked_load(ld->get_pointer_operand(), + cond, + false_val); + ld->replace_all_uses_with(new_ld); + ld->erase_from_parent(); + return set_ret(new_ld); + } return error_not_implemented(); } @@ -528,7 +547,7 @@ ir::type* Generator::GenIRFuncType(FuncType* type, ir::context& ctx) { ir::type* Generator::GenIRPointerType(PointerType* type, ir::context& ctx) { ir::type* ele_ty = GenIRType(type->Derived().GetPtr(), ctx); - unsigned addr_space = 0; + unsigned addr_space = 1; return ir::pointer_type::get(ele_ty, addr_space); } @@ -552,7 +571,13 @@ void Generator::popScope() { // LValue Generator void LValAssigner::VisitBinaryOp(BinaryOp* binary) { - error_not_implemented(); + if(binary->op_ != Token::MASKED_DEREF) + error_not_implemented(); + gen_->VisitExpr(binary->lhs_); + ir::value* mask = gen_->ret_; + gen_->VisitExpr(binary->rhs_); + ir::value* addr = gen_->ret_; + ret_ = gen_->bld_->create_masked_store(addr, rhs_, mask); } void LValAssigner::VisitUnaryOp(UnaryOp* unary) { diff --git a/lib/lang/parser.cc b/lib/lang/parser.cc index 7a545992d..8bd5634a9 100644 --- a/lib/lang/parser.cc +++ b/lib/lang/parser.cc @@ -517,7 +517,7 @@ Expr* Parser::ParseUnaryExpr() { case Token::INC: return ParsePrefixIncDec(tok); case Token::DEC: return ParsePrefixIncDec(tok); case '&': return ParseUnaryOp(tok, Token::ADDR); - case '*': return ParseUnaryOp(tok, Token::DEREF); + case '*': return 
ParseDerefOp(tok); case '+': return ParseUnaryOp(tok, Token::PLUS); case '-': return ParseUnaryOp(tok, Token::MINUS); case '~': return ParseUnaryOp(tok, '~'); @@ -577,6 +577,19 @@ UnaryOp* Parser::ParseUnaryOp(const Token* tok, int op) { return UnaryOp::New(op, operand); } +Expr* Parser::ParseDerefOp(const Token* tok) { + Expr* pred = nullptr; + if(ts_.Try('?')){ + ts_.Expect('('); + pred = ParseCastExpr(); + ts_.Expect(')'); + } + Expr* addr = ParseCastExpr(); + if(pred) + return BinaryOp::New(tok, Token::MASKED_DEREF, pred, addr); + else + return UnaryOp::New(Token::DEREF, addr); +} QualType Parser::ParseTypeName() { auto type = ParseSpecQual(); diff --git a/lib/lang/token.cc b/lib/lang/token.cc index 5445b2044..d8f0c0301 100644 --- a/lib/lang/token.cc +++ b/lib/lang/token.cc @@ -107,7 +107,6 @@ const std::unordered_map Token::tagLexemeMap_ { { Token::XOR_ASSIGN, "^=" }, { Token::OR_ASSIGN, "|=" }, { Token::ELLIPSIS, "..." }, - { Token::AUTO, "auto" }, { Token::BREAK, "break" }, { Token::CASE, "case" }, diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index f2f0a42bb..204d05b89 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -172,8 +172,10 @@ function::caller function::autotune(driver::stream* stream, const grid_fn_ty& gr caller call(tmp, std::move(bin), opt); double ts = tools::bench([&]() { call(stream, grid_fn(opt), args); }, stream); // save best - if(ts < best_ts) + if(ts < best_ts) { + best_ts = ts; ret.reset(new caller(call)); + } }; _parallel_loop_nest(space, benchmark, 1); return *ret; @@ -192,12 +194,13 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::transform::vectorize vectorize(&grids); codegen::transform::dce dce; codegen::transform::peephole peephole; - codegen::transform::reassociate reassociate(&grids); + codegen::transform::reassociate reassociate(&alignment_info, &grids); codegen::selection selection(&shmem_allocation, &grids, &shmem_info, &alignment_info, target.get()); // run passes peephole.run(module); dce.run(module); grids.run(module); + alignment_info.run(module); reassociate.run(module); peephole.run(module); if(target->is_gpu()){ @@ -207,8 +210,6 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c shmem_barriers.run(module); } dce.run(module); - ir::print(module, std::cout); - alignment_info.run(module); vectorize.run(module); dce.run(module); // generate llvm code From 59281f579454de74d73d45c4bd84044c9d15c35d Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 27 Aug 2019 20:33:38 -0700 Subject: [PATCH 334/494] [structure] better directory structure for tests --- CMakeLists.txt | 10 +- cmake/FindTensorFlow.cmake | 20 --- cmake/FindTorch.cmake | 14 -- examples/CMakeLists.txt | 1 - examples/cpp/CMakeLists.txt | 6 - examples/cpp/cuda.h | 160 -------------------- include/triton/driver/dispatch.h | 2 +- tests/CMakeLists.txt | 3 + tests/bench/CMakeLists.txt | 6 + tests/bench/dot.cc | 98 ++++++++++++ tests/common/cuda/cublas.h | 221 ++++++++++++++++++++++++++++ tests/common/cuda/forward.h | 105 +++++++++++++ tests/common/src/dot.h | 77 ++++++++++ tests/unit/CMakeLists.txt | 6 + {examples/cpp => tests/unit}/dot.cc | 103 +++---------- 15 files changed, 539 insertions(+), 293 deletions(-) delete mode 100644 cmake/FindTensorFlow.cmake delete mode 100644 cmake/FindTorch.cmake delete mode 100644 examples/CMakeLists.txt delete mode 100644 examples/cpp/CMakeLists.txt delete mode 100644 examples/cpp/cuda.h create mode 100644 tests/CMakeLists.txt create mode 100644 
tests/bench/CMakeLists.txt create mode 100644 tests/bench/dot.cc create mode 100644 tests/common/cuda/cublas.h create mode 100644 tests/common/cuda/forward.h create mode 100644 tests/common/src/dot.h create mode 100644 tests/unit/CMakeLists.txt rename {examples/cpp => tests/unit}/dot.cc (67%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 637718fa6..9e05aca5d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,7 @@ include(CTest) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") # Options -option(BUILD_EXAMPLES "Build C++ Triton examples" ON) +option(BUILD_TESTS "Build C++ Triton tests" ON) option(BUILD_PYTHON_MODULE "Build Python Triton bindings" OFF) # LLVM @@ -23,10 +23,10 @@ endif() include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${LLVM_CXXFLAGS} -std=c++11") -# Examples -if(BUILD_EXAMPLES) - message(STATUS "Adding C++ examples") - add_subdirectory(examples) +# Tests +if(BUILD_TESTS) + message(STATUS "Adding C++ tests") + add_subdirectory(tests) endif() # Python module diff --git a/cmake/FindTensorFlow.cmake b/cmake/FindTensorFlow.cmake deleted file mode 100644 index 405febbeb..000000000 --- a/cmake/FindTensorFlow.cmake +++ /dev/null @@ -1,20 +0,0 @@ -include(FindPackageHandleStandardArgs) -unset(TENSORFLOW_FOUND) - -execute_process(COMMAND python -c "from os.path import dirname; import tensorflow as tf; print(dirname(dirname(tf.sysconfig.get_include())))" - OUTPUT_VARIABLE TF_INC OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET) -execute_process(COMMAND python -c "import tensorflow as tf; print(tf.sysconfig.get_lib())" - OUTPUT_VARIABLE TF_LIB OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET) -execute_process(COMMAND python -c "import tensorflow as tf; print(tf.__cxx11_abi_flag__ if \"__cxx11_abi_flag__\" in tf.__dict__ else 0)" - OUTPUT_VARIABLE TF_ABI OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET) - -find_package_handle_standard_args(TensorFlow DEFAULT_MSG TF_INC TF_LIB) - -# set external variables for usage in CMakeLists.txt -if(TensorFlow_FOUND) - set(TensorFlow_LIBRARIES ${TF_LIB}) - set(TensorFlow_INCLUDE_DIRS ${TF_INC}) - set(TensorFlow_ABI ${TF_ABI}) -endif() - -mark_as_advanced(TF_INC TF_LIB TF_ABI) diff --git a/cmake/FindTorch.cmake b/cmake/FindTorch.cmake deleted file mode 100644 index 79a814d03..000000000 --- a/cmake/FindTorch.cmake +++ /dev/null @@ -1,14 +0,0 @@ -include(FindPackageHandleStandardArgs) -execute_process(COMMAND python -c "import torch; import os; print(os.path.dirname(torch.__file__))" - OUTPUT_VARIABLE TORCH_INSTALL_PREFIX OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_QUIET) - -find_package_handle_standard_args(TORCH DEFAULT_MSG TORCH_INSTALL_PREFIX) -if(TORCH_INSTALL_PREFIX) - set(TORCH_INCLUDE_DIRS ${TORCH_INSTALL_PREFIX}/lib/include/ - ${TORCH_INSTALL_PREFIX}/lib/include/torch/csrc/api/include - ${TORCH_INSTALL_PREFIX}/include/ - ${TORCH_INSTALL_PREFIX}/include/torch/csrc/api/include/) - set(TORCH_LIBRARY_DIRS ${TORCH_INSTALL_PREFIX}/lib/) -endif() - -mark_as_advanced(TORCH_INCLUDE_DIRS TORCH_LIBRARY_DIRS) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt deleted file mode 100644 index 2322a85f7..000000000 --- a/examples/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_subdirectory(cpp) diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt deleted file mode 100644 index cea728c8e..000000000 --- a/examples/cpp/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -foreach(PROG dot) - add_executable(${PROG} ${PROG}.cc) - set_target_properties(${PROG} PROPERTIES OUTPUT_NAME 
${PROG}) - include_directories(/usr/local/cuda/include/) - target_link_libraries(${PROG} triton cublas) -endforeach(PROG) diff --git a/examples/cpp/cuda.h b/examples/cpp/cuda.h deleted file mode 100644 index fef17dc55..000000000 --- a/examples/cpp/cuda.h +++ /dev/null @@ -1,160 +0,0 @@ -/* Copyright 2015-2017 Philippe Tillet -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files -* (the "Software"), to deal in the Software without restriction, -* including without limitation the rights to use, copy, modify, merge, -* publish, distribute, sublicense, and/or sell copies of the Software, -* and to permit persons to whom the Software is furnished to do so, -* subject to the following conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ - -#include -#include -#include -#include "cublas_v2.h" -#include "triton/driver/buffer.h" -#include "triton/driver/stream.h" -#include "triton/driver/context.h" -#include "triton/tools/bench.hpp" - -enum cublasStrategy_t{ - CUBLAS_PREFER_FASTEST, - CUBLAS_HEURISTICS -}; - -enum DType{ - HALF_TYPE, - FLOAT_TYPE, - DOUBLE_TYPE, -}; - -inline size_t size_of(DType dtype){ - switch (dtype) { - case HALF_TYPE: return 2; - case FLOAT_TYPE: return 4; - case DOUBLE_TYPE: return 8; - default: throw; - } -} - -inline std::vector gather_all_algos() { - std::vector result; - // non-tensor ops - for(int i = -1; i < 24; i++) - result.push_back((cublasGemmAlgo_t)i); - // tensor ops - for(int i = 99; i < 116; i++) - result.push_back((cublasGemmAlgo_t)i); - return result; -} - -static const std::vector algorithms = gather_all_algos(); - -static const std::map cu_dtype = { - {HALF_TYPE, CUDA_R_16F}, - {FLOAT_TYPE, CUDA_R_32F}, - {DOUBLE_TYPE, CUDA_R_64F} -}; - -static const std::map cu_op = { - {false, CUBLAS_OP_N}, - {true, CUBLAS_OP_T} -}; - -inline cublasGemmAlgo_t cublasGemmFastest( - triton::driver::stream* stream, - cublasHandle_t handle, cudaDataType cudt, - cublasOperation_t AT, cublasOperation_t BT, - int32_t M, int32_t N, int32_t K, - void* alpha, CUdeviceptr A, int32_t lda, CUdeviceptr B, int32_t ldb, - void* beta, CUdeviceptr C, int32_t ldc) { - - // cache to avoid re-benchmarking - typedef std::tuple key_t; - static std::map cache; - key_t key(cudt, AT, BT, M, N, K); - // benchmark algorithms if necessary - if(cache.find(key) == cache.end()){ - std::vector times; - for(cublasGemmAlgo_t a: algorithms) { - cublasStatus_t status; - double nanosec = triton::tools::bench([&](){ status = cublasGemmEx(handle, AT, BT, - M, N, K, - alpha, (const void*)A, cudt, lda, - (const void*)B, cudt, ldb, - beta, (void*)C, cudt, ldc, cudt, - a); }, stream); - if(status != CUBLAS_STATUS_SUCCESS) - nanosec = INFINITY; - } - size_t argmin = std::min_element(times.begin(), times.end()) - times.begin(); - assert(times[argmin] != INFINITY); - cache.insert({key, algorithms[argmin]}); - } - - // return best algorithm - return cache.at(key); 
-} - -/* Wrapper for cublasGemmEx */ -inline cublasStatus_t cublasGemmEx(cublasHandle_t handle, cudaDataType cudt, cublasOperation_t AT, cublasOperation_t BT, int32_t M, int32_t N, int32_t K, - void* alpha, CUdeviceptr A, int32_t lda, CUdeviceptr B, int32_t ldb, - void* beta, CUdeviceptr C, int32_t ldc, cublasGemmAlgo_t algo) -{ - cublasStatus_t status = cublasGemmEx(handle, AT, BT, M, N, K, alpha, (const void*)A, cudt, lda, (const void*)B, cudt, ldb, beta, (void*)C, cudt, ldc, cudt, algo); - if(status != CUBLAS_STATUS_SUCCESS){ - std::cout << status; - exit(EXIT_FAILURE); - } -} - - -/* Get cuBLAS handle */ -inline cublasHandle_t cublasGetHandle(triton::driver::stream* stream) { - static std::map cache; - CUstream key = *stream->cu(); - - // create handle if necessary - if(cache.find(key) == cache.end()) { - cublasHandle_t handle; - if(cublasCreate_v2(&handle) != CUBLAS_STATUS_SUCCESS) - throw std::runtime_error("Error: could not create cuBLAS handle"); - cublasSetStream_v2(handle, key); - cache.insert({key, handle}); - } - - // return handle for the stream - return cache.at(key); -} - -/* Simplified API for default GEMM */ -inline void cublasGemm(DType dtype, triton::driver::stream* stream, bool AT, bool BT, - int32_t M, int32_t N, int32_t K, - void* alpha, triton::driver::buffer* A, int32_t lda, - triton::driver::buffer* B, int32_t ldb, - void* beta, triton::driver::buffer* C, int32_t ldc, - cublasGemmAlgo_t* fastest = NULL, cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT) { - triton::driver::cu_context::context_switcher scope(*stream->context()); - static cublasHandle_t handle = cublasGetHandle(stream); - if(dtype == HALF_TYPE) - cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH); - cublasStatus_t status; - if(fastest) - *fastest = cublasGemmFastest(stream, handle, cu_dtype.at(dtype), cu_op.at(AT), cu_op.at(BT), M, N, K, alpha, *A->cu(), lda, *B->cu(), ldb, beta, *C->cu(), ldc); - else - status = cublasGemmEx(handle, cu_dtype.at(dtype), cu_op.at(AT), cu_op.at(BT), M, N, K, alpha, *A->cu(), lda, *B->cu(), ldb, beta, *C->cu(), ldc, algo); -} diff --git a/include/triton/driver/dispatch.h b/include/triton/driver/dispatch.h index 7f6fdf7e0..ed717a7fb 100755 --- a/include/triton/driver/dispatch.h +++ b/include/triton/driver/dispatch.h @@ -34,7 +34,7 @@ void check(cl_int err); class dispatch { -private: +protected: template struct return_type; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt new file mode 100644 index 000000000..8c80ee070 --- /dev/null +++ b/tests/CMakeLists.txt @@ -0,0 +1,3 @@ +include_directories("${CMAKE_CURRENT_SOURCE_DIR}/common") +add_subdirectory(bench) +add_subdirectory(unit) diff --git a/tests/bench/CMakeLists.txt b/tests/bench/CMakeLists.txt new file mode 100644 index 000000000..1f3cc3341 --- /dev/null +++ b/tests/bench/CMakeLists.txt @@ -0,0 +1,6 @@ +foreach(PROG dot) + set(TARGET bench_${PROG}) + add_executable(${TARGET} ${PROG}.cc) + set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME ${TARGET}) + target_link_libraries(${TARGET} triton dl) +endforeach(PROG) diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc new file mode 100644 index 000000000..63e5e877d --- /dev/null +++ b/tests/bench/dot.cc @@ -0,0 +1,98 @@ +#include +#include +#include +#include "triton/driver/backend.h" +#include "triton/driver/stream.h" +#include "triton/tools/bench.hpp" +#include "triton/external/half.hpp" +#include "triton/runtime/function.h" +#include "src/dot.h" +#include "cuda/cublas.h" + + +struct perf_t { + double triton; + double cublas; +}; + +namespace drv = triton::driver; 
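+// Benchmark outline: autotune the Triton GEMM defined in src/dot.h over the
+// TM/TN/num_warps space declared below and, when libcublas.so can be
+// dlopen'ed (see cuda/cublas.h), time the fastest cuBLAS algorithm on the
+// same shapes; both results are reported in TFLOPS (2*M*N*K / time).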
+namespace rt = triton::runtime; + +inline size_t ceil(size_t x, size_t y) { + return (x + y - 1) / y; +}; + + +std::vector do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K){ + typedef half_float::half NumericT; + std::string ty = "half"; + size_t dt_nbytes = sizeof(NumericT); + drv::context* context = stream->context(); + // leading dimensions + int32_t lda = AT ? K : M; + int32_t ldb = BT ? N : K; + int32_t ldc = M; + // create inputs + auto dc = std::unique_ptr(drv::buffer::create(context, M*N*dt_nbytes)); + auto da = std::unique_ptr(drv::buffer::create(context, M*K*dt_nbytes)); + auto db = std::unique_ptr(drv::buffer::create(context, K*N*dt_nbytes)); + // create options + rt::function::options_space_t opt; + opt.defines.push_back({"TYPE", {ty}}); + if(AT) + opt.defines.push_back({"AT", {""}}); + if(BT) + opt.defines.push_back({"BT", {""}}); + opt.defines.push_back({"TM", {"16", "32", "64", "128"}}); + opt.defines.push_back({"TN", {"16", "32", "64", "128"}}); + opt.defines.push_back({"TK", {"32"}}); + opt.num_warps = {1, 2, 4, 8}; + // create grid + auto grid = [&](const rt::function::options_t& x) { + return rt::grid_t{ceil(M, x.D("TM")), + ceil(N, x.D("TN"))}; + }; + // create function + rt::function function(src::dot, opt); + // benchmark available libraries + std::vector result; + auto tflops = [&](double nanosec) { return 2.*M*N*K / nanosec * 1e-3; }; + // cublas + if(cublas::cublasinit()){ + NumericT alpha(static_cast(1)); + NumericT beta(static_cast(0)); + cublasGemmAlgo_t fastest; + cublasGemm(CUDA_R_16F, stream, AT, BT, M, N, K, &alpha, &*da, lda, &*db, ldb, &beta, &*dc, ldc, &fastest); + double cublas_ms = triton::tools::bench([&]() { cublasGemm(CUDA_R_16F, stream, AT, BT, M, N, K, + &alpha, &*da, lda, &*db, ldb, &beta, &*dc, ldc, nullptr, fastest); }, stream); + result.push_back(tflops(cublas_ms)); + } + // triton + double triton_ms = triton::tools::bench([&]() { function({&*da, &*db, &*dc, M, N, K, lda, ldb, ldc}, grid, stream);}, stream); + result.push_back(tflops(triton_ms)); + // done + return result; +} + +int main() { + // initialize default compute device + auto context = triton::driver::backend::contexts::get_default(); + triton::driver::stream* stream = triton::driver::stream::create(context); + // shapes to benchmark + typedef std::tuple config_t; + std::vector configs = { + config_t{false, true, 512, 512, 512}, + config_t{false, true, 2048, 2048, 2048}, + config_t{false, true, 8192, 8192, 8192} + }; + // does the work + bool AT, BT; + int32_t M, N, K; + for(const auto& c: configs){ + std::tie(AT, BT, M, N, K) = c; + std::cout << "// " << AT << " " << BT << " " << M << " " << N << " " << K << std::flush; + for(auto perf: do_bench(stream, AT, BT, M, N, K)) + std::cout << ", " << perf << std::flush; + std::cout << std::endl; + } +} diff --git a/tests/common/cuda/cublas.h b/tests/common/cuda/cublas.h new file mode 100644 index 000000000..db1f2a360 --- /dev/null +++ b/tests/common/cuda/cublas.h @@ -0,0 +1,221 @@ +/* Copyright 2019 Philippe Tillet +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files +* (the "Software"), to deal in the Software without restriction, +* including without limitation the rights to use, copy, modify, merge, +* publish, distribute, sublicense, and/or sell copies of the Software, +* and to permit persons to whom the Software is furnished to do so, +* subject to the following conditions: +* +* The above copyright notice and this 
permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +#include +#include +#include +#include "forward.h" +#include "triton/driver/buffer.h" +#include "triton/driver/stream.h" +#include "triton/driver/context.h" +#include "triton/driver/error.h" +#include "triton/tools/bench.hpp" + + +class cublas { +private: + template + struct return_type; + + template + struct return_type + { typedef R type; }; + + typedef bool (*f_init_t)(); + + template + static typename return_type::type f_impl(void*& lib_h, FunPtrT, void*& cache, const char * name, Args... args) + { + initializer(); + if(cache == nullptr){ + cache = dlsym(lib_h, name); + if(cache == 0) + throw std::runtime_error("dlsym unable to load function"); + } + FunPtrT fptr; + *reinterpret_cast(&fptr) = cache; + typename return_type::type res = (*fptr)(args...); + triton::driver::check(res); + return res; + } + +public: + static bool cublasinit(); + static cublasStatus_t cublasSetMathMode(cublasHandle_t h, cublasMath_t m); + static cublasStatus_t cublasCreate_v2(cublasHandle_t* h); + static cublasStatus_t cublasGetStream_v2(cublasHandle_t h, cudaStream_t *streamId); + static cublasStatus_t cublasSetStream_v2(cublasHandle_t h, cudaStream_t streamId); + static cublasStatus_t cublasGemmEx(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + const void *alpha, const void *A, cudaDataType Atype, int lda, + const void *B, cudaDataType Btype, int ldb, const void *beta, + void *C, cudaDataType Ctype, int ldc, + cudaDataType computeType, cublasGemmAlgo_t algo); + +private: + static void* so_; + static void* cublasGetStream_v2_; + static void* cublasSetStream_v2_; + static void* cublasCreate_v2_; + static void* cublasGemmEx_; + static void* cublasSetMathMode_; +}; + +void* cublas::so_; +void* cublas::cublasGetStream_v2_; +void* cublas::cublasSetStream_v2_; +void* cublas::cublasCreate_v2_; +void* cublas::cublasGemmEx_; +void* cublas::cublasSetMathMode_; + + +bool cublas::cublasinit() { + if(so_==nullptr) + so_ = dlopen("libcublas.so", RTLD_LAZY); + return so_ != nullptr; +} + +cublasStatus_t cublas::cublasGetStream_v2(cublasHandle_t h, cudaStream_t *a) +{ return f_impl(so_, cublasGetStream_v2, cublasGetStream_v2_, "cublasGetStream_v2", h, a); } +cublasStatus_t cublas::cublasSetStream_v2(cublasHandle_t h, cudaStream_t a) +{ return f_impl(so_, cublasSetStream_v2, cublasSetStream_v2_, "cublasSetStream_v2", h, a); } +cublasStatus_t cublas::cublasGemmEx(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, + const void *alpha, const void *A, cudaDataType Atype, int lda, + const void *B, cudaDataType Btype, int ldb, const void *beta, + void *C, cudaDataType Ctype, int ldc, cudaDataType computeType, cublasGemmAlgo_t algo) { + return f_impl(so_, cublasGemmEx, cublasGemmEx_, "cublasGemmEx", handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo); +} +cublasStatus_t 
cublas::cublasCreate_v2(cublasHandle_t *h) {
+  return f_impl<cublasinit>(so_, cublasCreate_v2, cublasCreate_v2_, "cublasCreate_v2", h);
+}
+cublasStatus_t cublas::cublasSetMathMode(cublasHandle_t h, cublasMath_t m) {
+  return f_impl<cublasinit>(so_, cublasSetMathMode, cublasSetMathMode_, "cublasSetMathMode", h, m);
+}
+
+
+
+inline cublasGemmAlgo_t cublasGemmFastest(
+    triton::driver::stream* stream,
+    cublasHandle_t handle, cudaDataType cudt,
+    cublasOperation_t AT, cublasOperation_t BT,
+    int32_t M, int32_t N, int32_t K,
+    void* alpha, CUdeviceptr A, int32_t lda, CUdeviceptr B, int32_t ldb,
+    void* beta, CUdeviceptr C, int32_t ldc) {
+
+  // initialize list of cublas algorithms
+  static std::vector<cublasGemmAlgo_t> algorithms;
+  if(algorithms.empty()) {
+    // non-tensor ops
+    for(int i = -1; i < 24; i++)
+      algorithms.push_back((cublasGemmAlgo_t)i);
+    // tensor ops
+    for(int i = 99; i < 116; i++)
+      algorithms.push_back((cublasGemmAlgo_t)i);
+  }
+
+  // cache to avoid re-benchmarking
+  typedef std::tuple<cudaDataType, cublasOperation_t, cublasOperation_t, int32_t, int32_t, int32_t> key_t;
+  static std::map<key_t, cublasGemmAlgo_t> cache;
+  key_t key(cudt, AT, BT, M, N, K);
+  // benchmark algorithms if necessary
+  if(cache.find(key) == cache.end()){
+    std::vector<double> times;
+    for(cublasGemmAlgo_t a: algorithms) {
+      cublasStatus_t status;
+      double nanosec = triton::tools::bench([&](){ status = cublas::cublasGemmEx(handle, AT, BT,
+                                                                                 M, N, K,
+                                                                                 alpha, (const void*)A, cudt, lda,
+                                                                                 (const void*)B, cudt, ldb,
+                                                                                 beta, (void*)C, cudt, ldc, cudt,
+                                                                                 a); }, stream);
+      if(status != CUBLAS_STATUS_SUCCESS)
+        nanosec = INFINITY;
+      // record the timing for this algorithm
+      times.push_back(nanosec);
+    }
+    size_t argmin = std::min_element(times.begin(), times.end()) - times.begin();
+    assert(times[argmin] != INFINITY);
+    cache.insert({key, algorithms[argmin]});
+  }
+
+  // return best algorithm
+  return cache.at(key);
+}
+
+
+
+
+/* Get cuBLAS handle */
+inline cublasHandle_t cublasGetHandle(triton::driver::stream* stream) {
+  static std::map<CUstream, cublasHandle_t> cache;
+  CUstream key = *stream->cu();
+
+  // create handle if necessary
+  if(cache.find(key) == cache.end()) {
+    cublasHandle_t handle;
+    if(cublas::cublasCreate_v2(&handle) != CUBLAS_STATUS_SUCCESS)
+      throw std::runtime_error("Error: could not create cuBLAS handle");
+    cublas::cublasSetStream_v2(handle, key);
+    cache.insert({key, handle});
+  }
+
+  // return handle for the stream
+  return cache.at(key);
+}
+
+
+
+/* Simplified API for default GEMM */
+inline void cublasGemm(cublasDataType_t dtype,
+                       triton::driver::stream* stream,
+                       bool AT, bool BT,
+                       int32_t M, int32_t N, int32_t K,
+                       void* alpha, triton::driver::buffer* A, int32_t lda,
+                       triton::driver::buffer* B, int32_t ldb,
+                       void* beta, triton::driver::buffer* C, int32_t ldc,
+                       cublasGemmAlgo_t* fastest = NULL, cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT) {
+
+  // switch triton context
+  triton::driver::cu_context::context_switcher scope(*stream->context());
+  // get the handle cached for this particular stream
+  cublasHandle_t handle = cublasGetHandle(stream);
+  // set math mode
+  if(dtype == CUDA_R_16F)
+    cublas::cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
+  // cuda types
+  static const std::map<bool, cublasOperation_t> cu_op = {
+    {false, CUBLAS_OP_N},
+    {true, CUBLAS_OP_T}
+  };
+  cublasOperation_t opa = cu_op.at(AT);
+  cublasOperation_t opb = cu_op.at(BT);
+  // benchmark fastest
+  if(fastest)
+    *fastest = cublasGemmFastest(stream, handle, dtype, opa, opb, M, N, K, alpha, *A->cu(), lda, *B->cu(), ldb, beta, *C->cu(), ldc);
+  else {
+    // execute supplied algo
+    cublasStatus_t status = cublas::cublasGemmEx(handle, opa, opb, M, N, K,
+                                                 alpha, (const void*)*A->cu(), dtype, lda,
+                                                 (const void*)*B->cu(), dtype, ldb,
+                                                 beta, (void*)*C->cu(), dtype, ldc, dtype, algo);
+  }
+}
diff --git
a/tests/common/cuda/forward.h b/tests/common/cuda/forward.h
new file mode 100644
index 000000000..1c12c4247
--- /dev/null
+++ b/tests/common/cuda/forward.h
@@ -0,0 +1,105 @@
+#ifndef _COMMON_CUDA_FORWARDS_H_
+#define _COMMON_CUDA_FORWARDS_H_
+
+struct cublasContext;
+typedef struct cublasContext *cublasHandle_t;
+struct CUstream_st;
+typedef struct CUstream_st *cudaStream_t;
+
+/* CUBLAS status type returns */
+typedef enum{
+  CUBLAS_STATUS_SUCCESS          =0,
+  CUBLAS_STATUS_NOT_INITIALIZED  =1,
+  CUBLAS_STATUS_ALLOC_FAILED     =3,
+  CUBLAS_STATUS_INVALID_VALUE    =7,
+  CUBLAS_STATUS_ARCH_MISMATCH    =8,
+  CUBLAS_STATUS_MAPPING_ERROR    =11,
+  CUBLAS_STATUS_EXECUTION_FAILED =13,
+  CUBLAS_STATUS_INTERNAL_ERROR   =14,
+  CUBLAS_STATUS_NOT_SUPPORTED    =15,
+  CUBLAS_STATUS_LICENSE_ERROR    =16
+} cublasStatus_t;
+
+/* For different GEMM algorithms */
+typedef enum {
+  CUBLAS_GEMM_DFALT             = -1,
+  CUBLAS_GEMM_DEFAULT           = -1,
+  CUBLAS_GEMM_ALGO0             = 0,
+  CUBLAS_GEMM_ALGO1             = 1,
+  CUBLAS_GEMM_ALGO2             = 2,
+  CUBLAS_GEMM_ALGO3             = 3,
+  CUBLAS_GEMM_ALGO4             = 4,
+  CUBLAS_GEMM_ALGO5             = 5,
+  CUBLAS_GEMM_ALGO6             = 6,
+  CUBLAS_GEMM_ALGO7             = 7,
+  CUBLAS_GEMM_ALGO8             = 8,
+  CUBLAS_GEMM_ALGO9             = 9,
+  CUBLAS_GEMM_ALGO10            = 10,
+  CUBLAS_GEMM_ALGO11            = 11,
+  CUBLAS_GEMM_ALGO12            = 12,
+  CUBLAS_GEMM_ALGO13            = 13,
+  CUBLAS_GEMM_ALGO14            = 14,
+  CUBLAS_GEMM_ALGO15            = 15,
+  CUBLAS_GEMM_ALGO16            = 16,
+  CUBLAS_GEMM_ALGO17            = 17,
+  CUBLAS_GEMM_ALGO18            = 18, //sliced 32x32
+  CUBLAS_GEMM_ALGO19            = 19, //sliced 64x32
+  CUBLAS_GEMM_ALGO20            = 20, //sliced 128x32
+  CUBLAS_GEMM_ALGO21            = 21, //sliced 32x32  -splitK
+  CUBLAS_GEMM_ALGO22            = 22, //sliced 64x32  -splitK
+  CUBLAS_GEMM_ALGO23            = 23, //sliced 128x32 -splitK
+  CUBLAS_GEMM_DEFAULT_TENSOR_OP = 99,
+  CUBLAS_GEMM_DFALT_TENSOR_OP   = 99,
+  CUBLAS_GEMM_ALGO0_TENSOR_OP   = 100,
+  CUBLAS_GEMM_ALGO1_TENSOR_OP   = 101,
+  CUBLAS_GEMM_ALGO2_TENSOR_OP   = 102,
+  CUBLAS_GEMM_ALGO3_TENSOR_OP   = 103,
+  CUBLAS_GEMM_ALGO4_TENSOR_OP   = 104,
+  CUBLAS_GEMM_ALGO5_TENSOR_OP   = 105,
+  CUBLAS_GEMM_ALGO6_TENSOR_OP   = 106,
+  CUBLAS_GEMM_ALGO7_TENSOR_OP   = 107,
+  CUBLAS_GEMM_ALGO8_TENSOR_OP   = 108,
+  CUBLAS_GEMM_ALGO9_TENSOR_OP   = 109,
+  CUBLAS_GEMM_ALGO10_TENSOR_OP  = 110,
+  CUBLAS_GEMM_ALGO11_TENSOR_OP  = 111,
+  CUBLAS_GEMM_ALGO12_TENSOR_OP  = 112,
+  CUBLAS_GEMM_ALGO13_TENSOR_OP  = 113,
+  CUBLAS_GEMM_ALGO14_TENSOR_OP  = 114,
+  CUBLAS_GEMM_ALGO15_TENSOR_OP  = 115
+} cublasGemmAlgo_t;
+
+typedef enum cudaDataType_t
+{
+  CUDA_R_16F= 2,  /* real as a half */
+  CUDA_C_16F= 6,  /* complex as a pair of half numbers */
+  CUDA_R_32F= 0,  /* real as a float */
+  CUDA_C_32F= 4,  /* complex as a pair of float numbers */
+  CUDA_R_64F= 1,  /* real as a double */
+  CUDA_C_64F= 5,  /* complex as a pair of double numbers */
+  CUDA_R_8I = 3,  /* real as a signed char */
+  CUDA_C_8I = 7,  /* complex as a pair of signed char numbers */
+  CUDA_R_8U = 8,  /* real as an unsigned char */
+  CUDA_C_8U = 9,  /* complex as a pair of unsigned char numbers */
+  CUDA_R_32I= 10, /* real as a signed int */
+  CUDA_C_32I= 11, /* complex as a pair of signed int numbers */
+  CUDA_R_32U= 12, /* real as an unsigned int */
+  CUDA_C_32U= 13  /* complex as a pair of unsigned int numbers */
+} cudaDataType;
+
+typedef cudaDataType cublasDataType_t;
+
+typedef enum {
+  CUBLAS_OP_N=0,
+  CUBLAS_OP_T=1,
+  CUBLAS_OP_C=2,
+  CUBLAS_OP_HERMITAN=2, /* synonym of CUBLAS_OP_C */
+  CUBLAS_OP_CONJG=3     /* conjugate */
+} cublasOperation_t;
+
+/* Enum for default math mode/tensor operation */
+typedef enum {
+  CUBLAS_DEFAULT_MATH = 0,
+  CUBLAS_TENSOR_OP_MATH = 1
+} cublasMath_t;
+
+#endif
\ No newline at end of file
diff --git a/tests/common/src/dot.h
b/tests/common/src/dot.h new file mode 100644 index 000000000..00814c0f0 --- /dev/null +++ b/tests/common/src/dot.h @@ -0,0 +1,77 @@ +namespace src { + + const char *dot = +R"( +#ifdef AT +#define USEA ^a +#else +#define USEA a +#endif + +#ifdef BT +#define USEB ^b +#else +#define USEB b +#endif + +void dot(TYPE * A __noalias __readonly __aligned(16), + TYPE * B __noalias __readonly __aligned(16), + TYPE * C __noalias __readonly __aligned(16), + int M, int N, int K, + int lda __multipleof(8), + int ldb __multipleof(8), + int ldc) { + int ridx = get_program_id(0); + int ridy = get_program_id(1); + int rxa[TM] = ridx * TM + 0 ... TM; + int ryb[TN] = ridy * TN + 0 ... TN; + int rka[TK] = 0 ... TK; + int rkb[TK] = 0 ... TK; + float xc[TM, TN] = 0; +#ifdef AT + TYPE* pa[TK, TM] = A + rka[:, newaxis] + rxa[newaxis, :]*lda; + bool checka[TK, TM] = rka[:, newaxis] < K; + TYPE a[TK, TM] = checka ? *pa : 0; +#else + TYPE* pa[TM, TK] = A + rka[newaxis, :]*lda + rxa[:, newaxis]; + bool checka[TM, TK] = rka[newaxis, :] < K; + TYPE a[TM, TK] = checka ? *pa : 0; +#endif +#ifdef BT + TYPE* pb[TN, TK] = B + rkb[newaxis, :]*ldb + ryb[:, newaxis]; + bool checkb[TN, TK] = rkb[newaxis, :] < K; + TYPE b[TN, TK] = checkb ? *pb : 0; +#else + TYPE* pb[TK, TN] = B + rkb[:, newaxis] + ryb[newaxis, :]*ldb; + bool checkb[TK, TN] = rkb[:, newaxis] < K; + TYPE b[TK, TN] = checkb ? *pb : 0; +#endif + for(int k = K; k > 0; k = k - TK){ + xc = USEA @ USEB + xc; +#ifdef AT + pa = pa + TK; +#else + pa = pa + TK*lda; +#endif +#ifdef BT + pb = pb + TK*ldb; +#else + pb = pb + TK; +#endif + checka = k > TK; + checkb = k > TK; + a = checka ? *pa : 0; + b = checkb ? *pb : 0; + } + int rxc[TM] = ridx * TM + (0 ... TM); + int ryc[TN] = ridy * TN + (0 ... TN); + TYPE* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; + TYPE c[TM, TN] = xc; + bool checkc0[TM] = rxc < M; + bool checkc1[TN] = ryc < N; + bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; + *?(checkc) pc = c; +} +)"; + +} \ No newline at end of file diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt new file mode 100644 index 000000000..f3cdae9a1 --- /dev/null +++ b/tests/unit/CMakeLists.txt @@ -0,0 +1,6 @@ +foreach(PROG dot) + set(TARGET test_${PROG}) + add_executable(${TARGET} ${PROG}.cc) + set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME ${TARGET}) + target_link_libraries(${TARGET} triton dl) +endforeach(PROG) diff --git a/examples/cpp/dot.cc b/tests/unit/dot.cc similarity index 67% rename from examples/cpp/dot.cc rename to tests/unit/dot.cc index 7d5a44324..3ddc8953e 100644 --- a/examples/cpp/dot.cc +++ b/tests/unit/dot.cc @@ -6,7 +6,8 @@ #include "triton/tools/bench.hpp" #include "triton/external/half.hpp" #include "triton/runtime/function.h" -#include "cuda.h" +#include "src/dot.h" +#include "cuda/cublas.h" template void diff(const std::vector& x, const std::vector& y){ @@ -44,81 +45,6 @@ void cpu_ref(bool AT_, bool BT_, size_t M, size_t N, size_t K, } - -std::string src = -R"( -#ifdef AT -#define USEA ^a -#else -#define USEA a -#endif - -#ifdef BT -#define USEB ^b -#else -#define USEB b -#endif - -void dot(TYPE * A __noalias __readonly __aligned(16), - TYPE * B __noalias __readonly __aligned(16), - TYPE * C __noalias __readonly __aligned(16), - int M, int N, int K, - int lda __multipleof(8), - int ldb __multipleof(8), - int ldc) { - int ridx = get_program_id(0); - int ridy = get_program_id(1); - int rxa[TM] = ridx * TM + 0 ... TM; - int ryb[TN] = ridy * TN + 0 ... TN; - int rka[TK] = 0 ... TK; - int rkb[TK] = 0 ... 
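/* [editor's note] the binary "..." operator in Triton-C materializes a 1-D tile
   of consecutive integers, e.g. 0 ... TK is the tile {0, 1, ..., TK-1}; adding a
   scalar such as ridx * TM shifts every element, which is how each program
   instance addresses its own block of rows and columns. */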
TK;
-  float xc[TM, TN] = 0;
-#ifdef AT
-  TYPE* pa[TK, TM] = A + rka[:, newaxis] + rxa[newaxis, :]*lda;
-  bool checka[TK, TM] = rka[:, newaxis] < K;
-  TYPE a[TK, TM] = checka ? *pa : 0;
-#else
-  TYPE* pa[TM, TK] = A + rka[newaxis, :]*lda + rxa[:, newaxis];
-  bool checka[TM, TK] = rka[newaxis, :] < K;
-  TYPE a[TM, TK] = checka ? *pa : 0;
-#endif
-#ifdef BT
-  TYPE* pb[TN, TK] = B + rkb[newaxis, :]*ldb + ryb[:, newaxis];
-  bool checkb[TN, TK] = rkb[newaxis, :] < K;
-  TYPE b[TN, TK] = checkb ? *pb : 0;
-#else
-  TYPE* pb[TK, TN] = B + rkb[:, newaxis] + ryb[newaxis, :]*ldb;
-  bool checkb[TK, TN] = rkb[:, newaxis] < K;
-  TYPE b[TK, TN] = checkb ? *pb : 0;
-#endif
-  for(int k = K; k > 0; k = k - TK){
-    xc = USEA @ USEB + xc;
-#ifdef AT
-    pa = pa + TK;
-#else
-    pa = pa + TK*lda;
-#endif
-#ifdef BT
-    pb = pb + TK*ldb;
-#else
-    pb = pb + TK;
-#endif
-    checka = k > TK;
-    checkb = k > TK;
-    a = checka ? *pa : 0;
-    b = checkb ? *pb : 0;
-  }
-  int rxc[TM] = ridx * TM + (0 ... TM);
-  int ryc[TN] = ridy * TN + (0 ... TN);
-  TYPE* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis];
-  TYPE c[TM, TN] = xc;
-  bool checkc0[TM] = rxc < M;
-  bool checkc1[TN] = ryc < N;
-  bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];
-  *?(checkc) pc = c;
-}
-)";
-
 struct perf_t {
   double triton;
   double cublas;
@@ -128,7 +54,7 @@ namespace drv = triton::driver;
 namespace rt = triton::runtime;
 
 perf_t do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K){
-  typedef half NumericT;
+  typedef half_float::half NumericT;
   std::string ty = "half";
   size_t dt_nbytes = sizeof(NumericT);
   drv::context* context = stream->context();
@@ -140,9 +66,9 @@ perf_t do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int
   int32_t ldc = M;
   srand(0);
   for(size_t i = 0; i < ha.size(); i++)
-    ha[i] = static_cast<NumericT>((double)rand()/RAND_MAX);
+    ha[i] = static_cast<NumericT>((float)rand()/RAND_MAX);
   for(size_t i = 0; i < hb.size(); i++)
-    hb[i] = static_cast<NumericT>((double)rand()/RAND_MAX);
+    hb[i] = static_cast<NumericT>((float)rand()/RAND_MAX);
   for(size_t i = 0; i < hc.size(); i++)
     hc[i] = static_cast<NumericT>((double)0);
   drv::buffer* dc = drv::buffer::create(context, hc.size()*dt_nbytes);
@@ -159,11 +85,11 @@ perf_t do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int
     opt.defines.push_back({"AT", {""}});
   if(BT)
     opt.defines.push_back({"BT", {""}});
-  opt.defines.push_back({"TM", {"128"}});
-  opt.defines.push_back({"TN", {"128"}});
+  opt.defines.push_back({"TM", {"16", "32", "64", "128"}});
+  opt.defines.push_back({"TN", {"16", "32", "64", "128"}});
   opt.defines.push_back({"TK", {"32"}});
-  opt.num_warps = {4};
-  rt::function function(src, opt);
+  opt.num_warps = {1, 2, 4, 8};
+  rt::function function(src::dot, opt);
 
   auto ceil = [](size_t x, size_t y) { return (x + y - 1) / y; };
   auto grid = [&](const rt::function::options_t& x) { return rt::grid_t{ceil(M, x.D("TM")), ceil(N, x.D("TN")), 1}; };
@@ -171,10 +97,15 @@ perf_t do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int
   auto tflops = [&](double nanosec) { return 2.*M*N*K / nanosec * 1e-3; };
   perf_t res;
   res.triton = tflops(triton::tools::bench([&]() { function({da, db, dc, M, N, K, lda, ldb, ldc}, grid, stream);}, stream));
-  res.cublas = 0;
+  NumericT alpha(static_cast<double>(1));
+  NumericT beta(static_cast<double>(0));
+  cublasGemmAlgo_t fastest;
+  cublasGemm(CUDA_R_16F, stream, AT, BT, M, N, K, &alpha, da, lda, db, ldb, &beta, dc, ldc, &fastest);
+  res.cublas = tflops(triton::tools::bench([&]() { cublasGemm(CUDA_R_16F, stream, AT, BT, M, N, K,
+                                                              &alpha, 
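// [editor's note] two-phase benchmarking: the cublasGemm call above passes
// &fastest so the wrapper times every cublasGemmAlgo_t once and caches the
// winner; the call being timed here then replays that algorithm (nullptr for
// the out-parameter, fastest as the fixed algo), keeping the search cost out
// of the measurement.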
da, lda, db, ldb, &beta, dc, ldc, nullptr, fastest); }, + stream)); // test - stream->synchronize(); // stream->read(dc, true, 0, hc); // std::vector rc(hc.size()); // cpu_ref(AT, BT, M, N, K, rc, ha, hb); @@ -214,7 +145,7 @@ int main() { // shapes to benchmark std::vector configs = { // {false, false, 8192, 512, 512}, - {false, true, 8192, 8192, 8192} + {false, true, 128, 128, 128} // {false, true, 128, 128, 128}, // {false, false, 128, 128, 128}, // {true, false, 128, 128, 128}, From d45748253985e33166bea80b50b65fb2b24b32af Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 28 Aug 2019 17:50:45 -0700 Subject: [PATCH 335/494] [codegen] fixed issue in double buffering pointer update --- lib/codegen/selection/selection.cc | 3 +- lib/driver/handle.cc | 2 +- lib/driver/module.cc | 4 +- lib/runtime/function.cc | 5 + python/setup.py | 2 +- python/src/tensorflow.cc | 3 +- tests/bench/dot.cc | 46 ++++++--- tests/common/src/dot.h | 24 ++--- tests/unit/dot.cc | 160 ++++++++++++++--------------- 9 files changed, 135 insertions(+), 114 deletions(-) diff --git a/lib/codegen/selection/selection.cc b/lib/codegen/selection/selection.cc index 5059a3130..b04209f61 100644 --- a/lib/codegen/selection/selection.cc +++ b/lib/codegen/selection/selection.cc @@ -1577,7 +1577,8 @@ void selection::run(ir::module &src, Module &dst) { offset->addIncoming(next_offset, llvm_inc_block); } else { - offset->addIncoming(dst_builder.getInt32(alloc_->get_num_bytes(phi)/(2*4)), llvm_inc_block); + unsigned num_bytes = phi->get_type()->get_scalar_ty()->get_primitive_size_in_bits() / 8; + offset->addIncoming(dst_builder.getInt32(alloc_->get_num_bytes(phi)/(2*num_bytes)), llvm_inc_block); } ptr->addIncoming(inc_shared->get_pointer(), llvm_inc_block); } diff --git a/lib/driver/handle.cc b/lib/driver/handle.cc index a0013f347..8899eb30e 100755 --- a/lib/driver/handle.cc +++ b/lib/driver/handle.cc @@ -72,7 +72,7 @@ handle::~handle(){ try{ if(has_ownership_ && h_ && h_.unique()) _delete(*h_); - }catch(const exception::cuda::deinitialized&){ + }catch(const exception::cuda::base&){ // order of destruction for global variables // is not guaranteed } diff --git a/lib/driver/module.cc b/lib/driver/module.cc index 486d7d588..96a7c0f08 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -26,6 +26,7 @@ #include "triton/driver/error.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Verifier.h" +#include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/Module.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" @@ -240,7 +241,6 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ -// std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; @@ -250,8 +250,10 @@ cu_module::cu_module(driver::context * context, std::string const & source) : mo try{ dispatch::cuModuleLoadDataEx(&*cu_, source_.data(), 2, opt, optval); }catch(exception::cuda::base const &){ +#ifdef TRITON_LOG_PTX_ERROR std::cerr << "Compilation Failed! 
Log: " << std::endl; std::cerr << errbuf << std::endl; +#endif throw; } } diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 204d05b89..5c93eb452 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -12,6 +12,7 @@ #include "triton/driver/stream.h" #include "triton/driver/kernel.h" #include "triton/driver/module.h" +#include "triton/driver/error.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/print.h" @@ -166,6 +167,8 @@ function::caller function::autotune(driver::stream* stream, const grid_fn_ty& gr bin = make_bin(*ir, stream->context(), opt); }catch(const std::runtime_error& e) { return; + }catch(const driver::exception::cuda::invalid_ptx& e) { + return; } // benchmark ir::function *tmp = ir->get_function_list()[0]; @@ -178,6 +181,8 @@ function::caller function::autotune(driver::stream* stream, const grid_fn_ty& gr } }; _parallel_loop_nest(space, benchmark, 1); + if(!ret) + throw std::runtime_error("could not find valid option in provided space"); return *ret; } diff --git a/python/setup.py b/python/setup.py index ef5fa9865..b9285f84f 100644 --- a/python/setup.py +++ b/python/setup.py @@ -47,7 +47,7 @@ class CMakeBuild(build_ext): tf_libs = 'tensorflow_framework' cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir, - '-DBUILD_EXAMPLES=OFF', + '-DBUILD_TESTS=OFF', '-DBUILD_PYTHON_MODULE=ON', '-DPYTHON_INCLUDE_DIRS=' + python_include_dirs, '-DTF_INCLUDE_DIRS=' + tf_include_dirs, diff --git a/python/src/tensorflow.cc b/python/src/tensorflow.cc index e71f6a77a..fde2d84ec 100644 --- a/python/src/tensorflow.cc +++ b/python/src/tensorflow.cc @@ -160,7 +160,8 @@ void gen_register_op(std::ostream &os, const std::string &name, std::string name = arg->get_name(); auto tolower = [](char c) { return std::tolower(c);}; std::transform(name.begin(), name.end(), name.begin(), tolower); - os << " .Input(\"" << name << ": " << to_tf_scalar_ty(arg->get_type()) << "\")\n"; + os << " .Attr(\"T" << i << " : {bool, int8, int16, int32, int64, float16, float32, float64}\")" << std::endl; + os << " .Input(\"" << name << ": T" << i << "\")\n"; } for(size_t i = 0; i < outputs.size(); i++){ std::string name = outputs[i]; diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index 63e5e877d..4176d2377 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -10,11 +10,6 @@ #include "cuda/cublas.h" -struct perf_t { - double triton; - double cublas; -}; - namespace drv = triton::driver; namespace rt = triton::runtime; @@ -22,6 +17,14 @@ inline size_t ceil(size_t x, size_t y) { return (x + y - 1) / y; }; +inline rt::function::grid_fn_ty grid(size_t M, size_t N) { + return [M, N](const rt::function::options_t& x) { + return rt::grid_t{ceil(M, x.D("TM")), + ceil(N, x.D("TN"))}; + }; +} + + std::vector do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K){ typedef half_float::half NumericT; @@ -33,9 +36,9 @@ std::vector do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, i int32_t ldb = BT ? 
N : K; int32_t ldc = M; // create inputs - auto dc = std::unique_ptr(drv::buffer::create(context, M*N*dt_nbytes)); auto da = std::unique_ptr(drv::buffer::create(context, M*K*dt_nbytes)); auto db = std::unique_ptr(drv::buffer::create(context, K*N*dt_nbytes)); + auto dc = std::unique_ptr(drv::buffer::create(context, M*N*dt_nbytes)); // create options rt::function::options_space_t opt; opt.defines.push_back({"TYPE", {ty}}); @@ -47,11 +50,6 @@ std::vector do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, i opt.defines.push_back({"TN", {"16", "32", "64", "128"}}); opt.defines.push_back({"TK", {"32"}}); opt.num_warps = {1, 2, 4, 8}; - // create grid - auto grid = [&](const rt::function::options_t& x) { - return rt::grid_t{ceil(M, x.D("TM")), - ceil(N, x.D("TN"))}; - }; // create function rt::function function(src::dot, opt); // benchmark available libraries @@ -68,7 +66,7 @@ std::vector do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, i result.push_back(tflops(cublas_ms)); } // triton - double triton_ms = triton::tools::bench([&]() { function({&*da, &*db, &*dc, M, N, K, lda, ldb, ldc}, grid, stream);}, stream); + double triton_ms = triton::tools::bench([&]() { function({&*da, &*db, &*dc, M, N, K, lda, ldb, ldc}, grid(M, N), stream);}, stream); result.push_back(tflops(triton_ms)); // done return result; @@ -80,11 +78,25 @@ int main() { triton::driver::stream* stream = triton::driver::stream::create(context); // shapes to benchmark typedef std::tuple config_t; - std::vector configs = { - config_t{false, true, 512, 512, 512}, - config_t{false, true, 2048, 2048, 2048}, - config_t{false, true, 8192, 8192, 8192} - }; + std::vector configs; + for(auto x: std::vector>{{false, false}, + {false, true}, + {true, false}}){ + std::vector tmp = { + config_t{x[0], x[1], 8192, 8192, 8192} +// config_t{x[0], x[1], 16, 2048, 2048}, +// config_t{x[0], x[1], 32, 2048, 2048}, +// config_t{x[0], x[1], 64, 2048, 2048}, +// config_t{x[0], x[1], 128, 2048, 2048}, +// config_t{x[0], x[1], 7000, 2048, 2048}, +// config_t{x[0], x[1], 16, 4096, 4096}, +// config_t{x[0], x[1], 32, 4096, 4096}, +// config_t{x[0], x[1], 64, 4096, 4096}, +// config_t{x[0], x[1], 128, 4096, 4096}, +// config_t{x[0], x[1], 7000, 4096, 4096}, + }; + configs.insert(configs.end(), tmp.begin(), tmp.end()); + } // does the work bool AT, BT; int32_t M, N, K; diff --git a/tests/common/src/dot.h b/tests/common/src/dot.h index 00814c0f0..9df0643a6 100644 --- a/tests/common/src/dot.h +++ b/tests/common/src/dot.h @@ -30,21 +30,21 @@ void dot(TYPE * A __noalias __readonly __aligned(16), float xc[TM, TN] = 0; #ifdef AT TYPE* pa[TK, TM] = A + rka[:, newaxis] + rxa[newaxis, :]*lda; - bool checka[TK, TM] = rka[:, newaxis] < K; - TYPE a[TK, TM] = checka ? *pa : 0; + bool checka[TK, TM] = rka[:, newaxis] < TK; + TYPE a[TK, TM] = *pa; #else TYPE* pa[TM, TK] = A + rka[newaxis, :]*lda + rxa[:, newaxis]; - bool checka[TM, TK] = rka[newaxis, :] < K; - TYPE a[TM, TK] = checka ? *pa : 0; + bool checka[TM, TK] = rka[newaxis, :] < TK; + TYPE a[TM, TK] = *pa; #endif #ifdef BT TYPE* pb[TN, TK] = B + rkb[newaxis, :]*ldb + ryb[:, newaxis]; - bool checkb[TN, TK] = rkb[newaxis, :] < K; - TYPE b[TN, TK] = checkb ? *pb : 0; + bool checkb[TN, TK] = rkb[newaxis, :] < TK; + TYPE b[TN, TK] = *pb; #else TYPE* pb[TK, TN] = B + rkb[:, newaxis] + ryb[newaxis, :]*ldb; - bool checkb[TK, TN] = rkb[:, newaxis] < K; - TYPE b[TK, TN] = checkb ? 
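/* [editor's note] checka/checkb are the bounds masks for the K-loop: lanes whose
   index falls outside the matrix read 0 instead of out-of-range memory, and a
   zero operand leaves the accumulator xc unchanged. This commit drops the masked
   form in favor of plain loads; patch 336 further down restores it. */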
*pb : 0; + bool checkb[TK, TN] = rkb[:, newaxis] < TK; + TYPE b[TK, TN] = *pb; #endif for(int k = K; k > 0; k = k - TK){ xc = USEA @ USEB + xc; @@ -60,8 +60,8 @@ void dot(TYPE * A __noalias __readonly __aligned(16), #endif checka = k > TK; checkb = k > TK; - a = checka ? *pa : 0; - b = checkb ? *pb : 0; + a = *pa; + b = *pb; } int rxc[TM] = ridx * TM + (0 ... TM); int ryc[TN] = ridy * TN + (0 ... TN); @@ -70,8 +70,8 @@ void dot(TYPE * A __noalias __readonly __aligned(16), bool checkc0[TM] = rxc < M; bool checkc1[TN] = ryc < N; bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - *?(checkc) pc = c; + *pc = c; } )"; -} \ No newline at end of file +} diff --git a/tests/unit/dot.cc b/tests/unit/dot.cc index 3ddc8953e..298b79a44 100644 --- a/tests/unit/dot.cc +++ b/tests/unit/dot.cc @@ -9,6 +9,9 @@ #include "src/dot.h" #include "cuda/cublas.h" +namespace drv = triton::driver; +namespace rt = triton::runtime; + template void diff(const std::vector& x, const std::vector& y){ for(size_t i = 0; i < x.size(); i++) @@ -44,16 +47,44 @@ void cpu_ref(bool AT_, bool BT_, size_t M, size_t N, size_t K, cpu_ref(c, a, b, M, N, K); } - -struct perf_t { - double triton; - double cublas; +inline size_t ceil(size_t x, size_t y) { + return (x + y - 1) / y; }; -namespace drv = triton::driver; -namespace rt = triton::runtime; +inline rt::function::grid_fn_ty grid(size_t M, size_t N) { + return [M, N](const rt::function::options_t& x) { + return rt::grid_t{ceil(M, x.D("TM")), + ceil(N, x.D("TN"))}; + }; +} -perf_t do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K){ +namespace aux{ +template struct seq{}; + +template +struct gen_seq : gen_seq{}; + +template +struct gen_seq<0, Is...> : seq{}; + +template +void print_tuple(std::basic_ostream& os, Tuple const& t, seq){ + using swallow = int[]; + (void)swallow{0, (void(os << (Is == 0? 
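// [editor's note] C++11 tuple-printing idiom: gen_seq builds the compile-time
// index pack 0..N-1 (std::index_sequence only arrives in C++14), and the dummy
// "swallow" array exists purely to force in-order expansion of the pack, so each
// element of the tuple is streamed with a ", " separator after the first.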
"" : ", ") << std::get(t)), 0)...}; +} +} // aux:: + +template +auto operator<<(std::basic_ostream& os, std::tuple const& t) + -> std::basic_ostream& +{ + os << "("; + aux::print_tuple(os, t, aux::gen_seq()); + return os << ")"; +} + + +bool do_test(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K, int32_t TM, int32_t TN, int32_t TK, size_t nwarp){ typedef half_float::half NumericT; std::string ty = "half"; size_t dt_nbytes = sizeof(NumericT); @@ -71,12 +102,12 @@ perf_t do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int hb[i] = static_cast((float)rand()/RAND_MAX); for(size_t i = 0; i < hc.size(); i++) hc[i] = static_cast((double)0); - drv::buffer* dc = drv::buffer::create(context, hc.size()*dt_nbytes); - drv::buffer* da = drv::buffer::create(context, ha.size()*dt_nbytes); - drv::buffer* db = drv::buffer::create(context, hb.size()*dt_nbytes); - stream->write(da, true, 0, ha); - stream->write(db, true, 0, hb); - stream->write(dc, true, 0, hc); + auto dc = std::shared_ptr(drv::buffer::create(context, hc.size()*dt_nbytes)); + auto da = std::shared_ptr(drv::buffer::create(context, ha.size()*dt_nbytes)); + auto db = std::shared_ptr(drv::buffer::create(context, hb.size()*dt_nbytes)); + stream->write(&*da, true, 0, ha); + stream->write(&*db, true, 0, hb); + stream->write(&*dc, true, 0, hc); stream->synchronize(); // run rt::function::options_space_t opt; @@ -85,81 +116,50 @@ perf_t do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int opt.defines.push_back({"AT", {""}}); if(BT) opt.defines.push_back({"BT", {""}}); - opt.defines.push_back({"TM", {"16", "32", "64", "128"}}); - opt.defines.push_back({"TN", {"16", "32", "64", "128"}}); - opt.defines.push_back({"TK", {"32"}}); - opt.num_warps = {1, 2, 4, 8}; + opt.defines.push_back({"TM", {std::to_string(TM)}}); + opt.defines.push_back({"TN", {std::to_string(TN)}}); + opt.defines.push_back({"TK", {std::to_string(TK)}}); + opt.num_warps = {nwarp}; rt::function function(src::dot, opt); - - auto ceil = [](size_t x, size_t y) { return (x + y - 1) / y; }; - auto grid = [&](const rt::function::options_t& x) { return rt::grid_t{ceil(M, x.D("TM")), ceil(N, x.D("TN")), 1}; }; - - auto tflops = [&](double nanosec) { return 2.*M*N*K / nanosec * 1e-3; }; - perf_t res; - res.triton = tflops(triton::tools::bench([&]() { function({da, db, dc, M, N, K, lda, ldb, ldc}, grid, stream);}, stream)); - NumericT alpha(static_cast(1)); - NumericT beta(static_cast(0)); - cublasGemmAlgo_t fastest; - cublasGemm(CUDA_R_16F, stream, AT, BT, M, N, K, &alpha, da, lda, db, ldb, &beta, dc, ldc, &fastest); - res.cublas = tflops(triton::tools::bench([&]() { cublasGemm(CUDA_R_16F, stream, AT, BT, M, N, K, - &alpha, da, lda, db, ldb, &beta, dc, ldc, nullptr, fastest); }, - stream)); - + try { + function({&*da, &*db, &*dc, M, N, K, lda, ldb, ldc}, grid(M, N), stream); + } catch (const std::runtime_error& e) { + return true; + } // test -// stream->read(dc, true, 0, hc); -// std::vector rc(hc.size()); -// cpu_ref(AT, BT, M, N, K, rc, ha, hb); -// for(size_t i = 0; i < M*N; i++) -// if(std::isinf(hc[i]) || std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-2){ -// std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; -// exit(EXIT_FAILURE); -// } -// std::cout << hc[0] << " " << std::endl; -// std::cout << "Pass!" 
<< std::endl; - - // clean-up - delete dc; - delete da; - delete db; - return res; + stream->read(&*dc, true, 0, hc); + std::vector rc(hc.size()); + cpu_ref(AT, BT, M, N, K, rc, ha, hb); + for(size_t i = 0; i < M*N; i++) + if(std::isinf(hc[i]) || std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-2) + return false; + return true; } int main() { - struct config_t{ - bool AT; - bool BT; - int32_t M; - int32_t N; - int32_t K; - - std::string repr() { - std::ostringstream oss; - oss << AT << " " << BT << " " << M << " " << N << " " << K; - return oss.str(); - } - - perf_t perf(triton::driver::stream *stream){ - return do_bench(stream, AT, BT, M, N, K); - } - }; - // shapes to benchmark - std::vector configs = { -// {false, false, 8192, 512, 512}, - {false, true, 128, 128, 128} -// {false, true, 128, 128, 128}, -// {false, false, 128, 128, 128}, -// {true, false, 128, 128, 128}, -// {true, true, 128, 128, 128} -// {false, true, 32768, 256, 512} -// {true, false, 8192, 512, 512}, -// {true, true, 8192, 512, 512} - }; // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); triton::driver::stream* stream = triton::driver::stream::create(context); + // shapes to benchmark + typedef std::tuple config_t; + std::vector configs; + for(bool AT: std::array{false, true}) + for(bool BT: std::array{false, true}) + for(int TM: std::vector{16, 128}) + for(int TN: std::vector{16, 128}) + for(int TK: std::vector{16, 32}) + for(int nwarps: std::vector{1, 2, 4, 8}){ + configs.push_back(config_t{AT, BT, 128, 128, 128, TM, TN, TK, nwarps}); + } // does the work - for(config_t c: configs){ - perf_t perf = c.perf(stream); - std::cout << "// " << c.repr() << ", " << perf.triton << ", " << perf.cublas << std::endl; + bool AT, BT; + int M, N, K, TM, TN, TK, nwarp; + for(const auto& c: configs){ + std::tie(AT, BT, M, N, K, TM, TN, TK, nwarp) = c; + std::cout << "Testing " << c << " ... " << std::flush; + if(do_test(stream, AT, BT, M, N, K, TM, TN, TK, (size_t)nwarp)) + std::cout << " Pass! " << std::endl; + else + std::cout << " Fail! 
" << std::endl; } } From e3c953e79f31eec3c7bc228c9bed7eae8d7adc44 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 28 Aug 2019 18:06:36 -0700 Subject: [PATCH 336/494] [test] added more re-usable code in common/util.h --- tests/bench/dot.cc | 6 ++--- tests/common/src/dot.h | 14 +++++----- tests/common/util.h | 61 ++++++++++++++++++++++++++++++++++++++++++ tests/unit/dot.cc | 45 +++---------------------------- 4 files changed, 75 insertions(+), 51 deletions(-) create mode 100644 tests/common/util.h diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index 4176d2377..469f47e43 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -46,10 +46,10 @@ std::vector do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, i opt.defines.push_back({"AT", {""}}); if(BT) opt.defines.push_back({"BT", {""}}); - opt.defines.push_back({"TM", {"16", "32", "64", "128"}}); - opt.defines.push_back({"TN", {"16", "32", "64", "128"}}); + opt.defines.push_back({"TM", {"64", "128"}}); + opt.defines.push_back({"TN", {"128"}}); opt.defines.push_back({"TK", {"32"}}); - opt.num_warps = {1, 2, 4, 8}; + opt.num_warps = {4}; // create function rt::function function(src::dot, opt); // benchmark available libraries diff --git a/tests/common/src/dot.h b/tests/common/src/dot.h index 9df0643a6..993a1d260 100644 --- a/tests/common/src/dot.h +++ b/tests/common/src/dot.h @@ -31,20 +31,20 @@ void dot(TYPE * A __noalias __readonly __aligned(16), #ifdef AT TYPE* pa[TK, TM] = A + rka[:, newaxis] + rxa[newaxis, :]*lda; bool checka[TK, TM] = rka[:, newaxis] < TK; - TYPE a[TK, TM] = *pa; + TYPE a[TK, TM] = checka ? *pa : 0; #else TYPE* pa[TM, TK] = A + rka[newaxis, :]*lda + rxa[:, newaxis]; bool checka[TM, TK] = rka[newaxis, :] < TK; - TYPE a[TM, TK] = *pa; + TYPE a[TM, TK] = checka ? *pa : 0; #endif #ifdef BT TYPE* pb[TN, TK] = B + rkb[newaxis, :]*ldb + ryb[:, newaxis]; bool checkb[TN, TK] = rkb[newaxis, :] < TK; - TYPE b[TN, TK] = *pb; + TYPE b[TN, TK] = checkb ? *pb : 0; #else TYPE* pb[TK, TN] = B + rkb[:, newaxis] + ryb[newaxis, :]*ldb; bool checkb[TK, TN] = rkb[:, newaxis] < TK; - TYPE b[TK, TN] = *pb; + TYPE b[TK, TN] = checkb ? *pb : 0; #endif for(int k = K; k > 0; k = k - TK){ xc = USEA @ USEB + xc; @@ -60,8 +60,8 @@ void dot(TYPE * A __noalias __readonly __aligned(16), #endif checka = k > TK; checkb = k > TK; - a = *pa; - b = *pb; + a = checka ? *pa : 0; + b = checkb ? *pb : 0; } int rxc[TM] = ridx * TM + (0 ... TM); int ryc[TN] = ridy * TN + (0 ... TN); @@ -70,7 +70,7 @@ void dot(TYPE * A __noalias __readonly __aligned(16), bool checkc0[TM] = rxc < M; bool checkc1[TN] = ryc < N; bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - *pc = c; + *?(checkc) pc = c; } )"; diff --git a/tests/common/util.h b/tests/common/util.h new file mode 100644 index 000000000..a60050af7 --- /dev/null +++ b/tests/common/util.h @@ -0,0 +1,61 @@ +#pragma once + +#ifndef _TRITON_TESTS_UTIL_H +#define _TRITON_TESTS_UTIL_H + +#include "triton/runtime/function.h" + +namespace rt = triton::runtime; + +inline size_t ceil(size_t x, size_t y) { + return (x + y - 1) / y; +}; + +inline rt::function::grid_fn_ty grid(size_t M, size_t N) { + return [M, N](const rt::function::options_t& x) { + return rt::grid_t{ceil(M, x.D("TM")), + ceil(N, x.D("TN"))}; + }; +} + +namespace aux{ +template struct seq{}; + +template +struct gen_seq : gen_seq{}; + +template +struct gen_seq<0, Is...> : seq{}; + +template +void print_tuple(std::basic_ostream& os, Tuple const& t, seq){ + using swallow = int[]; + (void)swallow{0, (void(os << (Is == 0? 
"" : ", ") << std::setfill(' ') << std::setw(3) << std::get(t)), 0)...}; +} +} // aux:: + +template +auto operator<<(std::basic_ostream& os, std::tuple const& t) + -> std::basic_ostream& +{ + os << "("; + aux::print_tuple(os, t, aux::gen_seq()); + return os << ")"; +} + + +namespace testing { + + template + bool diff(const std::vector& hc, const std::vector& rc) { + if(hc.size() != rc.size()) + return false; + for(size_t i = 0; i < hc.size(); i++) + if(std::isinf(hc[i]) || std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-2) + return false; + return true; + } + +} + +#endif \ No newline at end of file diff --git a/tests/unit/dot.cc b/tests/unit/dot.cc index 298b79a44..3c9ec96d8 100644 --- a/tests/unit/dot.cc +++ b/tests/unit/dot.cc @@ -1,4 +1,5 @@ -#include +#include +#include #include #include #include "triton/driver/backend.h" @@ -8,6 +9,7 @@ #include "triton/runtime/function.h" #include "src/dot.h" #include "cuda/cublas.h" +#include "util.h" namespace drv = triton::driver; namespace rt = triton::runtime; @@ -47,42 +49,6 @@ void cpu_ref(bool AT_, bool BT_, size_t M, size_t N, size_t K, cpu_ref(c, a, b, M, N, K); } -inline size_t ceil(size_t x, size_t y) { - return (x + y - 1) / y; -}; - -inline rt::function::grid_fn_ty grid(size_t M, size_t N) { - return [M, N](const rt::function::options_t& x) { - return rt::grid_t{ceil(M, x.D("TM")), - ceil(N, x.D("TN"))}; - }; -} - -namespace aux{ -template struct seq{}; - -template -struct gen_seq : gen_seq{}; - -template -struct gen_seq<0, Is...> : seq{}; - -template -void print_tuple(std::basic_ostream& os, Tuple const& t, seq){ - using swallow = int[]; - (void)swallow{0, (void(os << (Is == 0? "" : ", ") << std::get(t)), 0)...}; -} -} // aux:: - -template -auto operator<<(std::basic_ostream& os, std::tuple const& t) - -> std::basic_ostream& -{ - os << "("; - aux::print_tuple(os, t, aux::gen_seq()); - return os << ")"; -} - bool do_test(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K, int32_t TM, int32_t TN, int32_t TK, size_t nwarp){ typedef half_float::half NumericT; @@ -130,10 +96,7 @@ bool do_test(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_ stream->read(&*dc, true, 0, hc); std::vector rc(hc.size()); cpu_ref(AT, BT, M, N, K, rc, ha, hb); - for(size_t i = 0; i < M*N; i++) - if(std::isinf(hc[i]) || std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-2) - return false; - return true; + return testing::diff(hc, rc); } int main() { From 141a8237996117b4be5a7137c512f9d0ddc9c8b4 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 29 Aug 2019 17:06:59 -0700 Subject: [PATCH 337/494] [python] refactoring in anticipation of pytorch support --- python/examples/dot.py | 66 +++++---- python/examples/lol.cc | 7 - python/src/tensorflow.cc | 164 +++++++++++++++++++-- python/triton/ops.py | 304 +++++++++++++++++++++++++-------------- 4 files changed, 385 insertions(+), 156 deletions(-) delete mode 100644 python/examples/lol.cc diff --git a/python/examples/dot.py b/python/examples/dot.py index 779f59408..f41c702b3 100644 --- a/python/examples/dot.py +++ b/python/examples/dot.py @@ -34,19 +34,19 @@ void dot(TYPE * A __noalias __readonly __aligned(16), /* pointers for A */ #if AT == 1 - TYPE* pa[TK, TM] = A + rka[:, newaxis] + rxa[newaxis, :]*lda; + TYPE* pa[TK, TM] = A + rka[:, newaxis]*lda + rxa[newaxis, :]; TYPE a[TK, TM] = *pa; #else - TYPE* pa[TM, TK] = A + rka[newaxis, :]*lda + rxa[:, newaxis]; + TYPE* pa[TM, TK] = A + rka[newaxis, :] + rxa[:, newaxis]*lda; TYPE 
a[TM, TK] = *pa; #endif /* pointers for B */ #if BT == 1 - TYPE* pb[TN, TK] = B + rkb[newaxis, :]*ldb + ryb[:, newaxis]; + TYPE* pb[TN, TK] = B + rkb[newaxis, :] + ryb[:, newaxis]*ldb; TYPE b[TN, TK] = *pb; #else - TYPE* pb[TK, TN] = B + rkb[:, newaxis] + ryb[newaxis, :]*ldb; + TYPE* pb[TK, TN] = B + rkb[:, newaxis]*ldb + ryb[newaxis, :]; TYPE b[TK, TN] = *pb; #endif @@ -54,14 +54,14 @@ void dot(TYPE * A __noalias __readonly __aligned(16), for(int k = K; k > 0; k = k - TK){ xc = USEA @ USEB + xc; #if AT == 1 - pa = pa + TK; -#else pa = pa + TK*lda; +#else + pa = pa + TK; #endif #if BT == 1 - pb = pb + TK*ldb; -#else pb = pb + TK; +#else + pb = pb + TK*ldb; #endif a = *pa; b = *pb; @@ -70,19 +70,19 @@ void dot(TYPE * A __noalias __readonly __aligned(16), /* epilogue */ int rxc[TM] = ridx * TM + (0 ... TM); int ryc[TN] = ridy * TN + (0 ... TN); - TYPE* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; + TYPE* pc[TM, TN] = C + ryc[newaxis, :] + rxc[:, newaxis] * ldc; TYPE c[TM, TN] = xc; bool checkc0[TM] = rxc < M; bool checkc1[TN] = ryc < N; bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - *pc = c; + *?(checkc) pc = c; } """ def cdiv(a, b): return -(-a // b) -class dot: +class dot_op: def __init__(self, trans_a = False, trans_b = False): self.dot = triton.op(src, ['C']) @@ -93,10 +93,18 @@ class dot: shape_a = triton.shape(a) shape_b = triton.shape(b) M = shape_a[0] - K = shape_a[1] - N = shape_b[0] - lda = M - ldb = K + Ka = shape_a[1] + Kb = shape_b[0] + N = shape_b[1] + # transpose shapes + if self.trans_a: + M, Ka = Ka, M + if self.trans_b: + Kb, N = N, Kb + K = Ka + # contiguous dimensions + lda = Ka + ldb = N ldc = N c = triton.empty([M, N]) return self.dot(a, b, c, M, N, K, lda, ldb, ldc, @@ -104,34 +112,34 @@ class dot: AT = self.trans_a, BT = self.trans_b, TYPE = tf.float16, TM = [128], TN = [ 128], TK = [32]) -dot_nt = dot(False, True) -dot_nn = dot(False, False) -dot_tn = dot(True, False) -dot_tt = dot(True, True) +dot_nt = dot_op(False, True) +dot_nn = dot_op(False, False) +dot_tn = dot_op(True, False) +dot_tt = dot_op(True, True) -@triton.register_gradient(dot) -def _dot_grad(op, dy): - a = op.inputs[0] - b = op.inputs[1] - return [dot_tn(dy, b), dot_nt(a, dy), None, None, None, None, None, None, None] +# @triton.register_gradient(dot_op) +# def _dot_grad(op, dy): +# a = op.inputs[0] +# b = op.inputs[1] +# return [dot_tn(dy, b), dot_nt(a, dy), None, None, None, None, None, None, None] def run_dot(): M, N, K = 128, 128, 128 a = tf.placeholder(tf.float16, shape=[M, K]) b = tf.placeholder(tf.float16, shape=[N, K]) # c = tf.matmul(a, b, transpose_a=True) - c = dot_nn(a, b) - grads = tf.gradients(c, [a]) + c = dot_nt(a, b) + # grads = tf.gradients(c, [a]) # Reference ha = np.random.rand(M, K).astype(np.float16) - hb = np.random.rand(N, K).astype(np.float16) + hb = np.random.rand(K, N).astype(np.float16) # Run sess = tf.InteractiveSession() sess.run(tf.global_variables_initializer()) - result = sess.run([grads], feed_dict = {a: ha, + result = sess.run([c], feed_dict = {a: ha, b: hb})[0] # Test - hresult = np.dot(ha.T, hb.T).T + hresult = np.dot(ha, hb.T) dif = np.abs(result - hresult) np.savetxt('dif.dat', dif, '%2.4f') print(hresult) diff --git a/python/examples/lol.cc b/python/examples/lol.cc deleted file mode 100644 index c847e46a5..000000000 --- a/python/examples/lol.cc +++ /dev/null @@ -1,7 +0,0 @@ -#include - -int main(){ - const char* TEST = "test\n"; - const char* LOL = "lol\n"; - printf("%s\n",DTYPE); -} diff --git a/python/src/tensorflow.cc 
b/python/src/tensorflow.cc index fde2d84ec..1932402e0 100644 --- a/python/src/tensorflow.cc +++ b/python/src/tensorflow.cc @@ -136,7 +136,7 @@ void gen_make_launch_function(std::ostream &os, const std::vector os << "}, *id_grid_map.at(id_), stream); \n"; } -void gen_register_kernel_builder(std::ostream &os, const std::string &name, +void gen_tf_register_kernel_builder(std::ostream &os, const std::string &name, const std::string &opname, const std::vector& args){ os << "REGISTER_KERNEL_BUILDER(Name(\"" + name + "\").Device(DEVICE_GPU)"; @@ -151,7 +151,7 @@ void gen_register_kernel_builder(std::ostream &os, const std::string &name, os << ", " + opname << ");\n"; } -void gen_register_op(std::ostream &os, const std::string &name, +void gen_tf_register_op(std::ostream &os, const std::string &name, const std::vector& args, const std::vector& outputs){ os << "REGISTER_OP(\"" << name << "\")\n"; @@ -195,15 +195,12 @@ extern int get_program_id(int); )"; } -std::tuple make_tensorflow_src(std::string src, - const std::vector& outputs, - const runtime::function::options_space_t& opt) -{ - src = preheader() + src; +void make_module(const std::string& src, ir::module* ir, + const runtime::function::options_space_t& opt) { + std::string copy = preheader() + src; // pre-process TokenSequence tokens; - Preprocessor cpp(&src, true); + Preprocessor cpp(©, true); for(auto it: opt.defines){ cpp.AddMacro(it.first, &it.second[0]); } @@ -211,11 +208,19 @@ std::tuple make_tensorflow_src(const std::string& src, + const std::vector& outputs, + const runtime::function::options_space_t& opt) +{ // triton-ir code-gen ir::context ctx; auto ir = std::shared_ptr(new ir::module("", ctx)); - Generator gen(&parser); - gen.Gen(&*ir); + make_module(src, &*ir, opt); // function ir::function* fn = ir->get_function_list().front(); std::string name = fn->get_name(); @@ -287,16 +292,145 @@ private: // register kernel builder )"; -gen_register_kernel_builder(oss, cc_name, opname, fn->args()); +gen_tf_register_kernel_builder(oss, cc_name, opname, fn->args()); oss << R"( // register op )"; -gen_register_op(oss, cc_name, fn->args(), outputs); - +gen_tf_register_op(oss, cc_name, fn->args(), outputs); return {oss.str(), name}; } + +inline std::string to_torch_ty(ir::type *ty) { + if(ty->is_integer_ty(1)) + return "bool"; + if(ty->is_integer_ty(8)) + return "int8"; + if(ty->is_integer_ty(16)) + return "int16"; + if(ty->is_integer_ty(32)) + return "int32"; + if(ty->is_integer_ty(64)) + return "int64"; + if(ty->is_half_ty()) + return "float16"; + if(ty->is_float_ty()) + return "float32"; + if(ty->is_double_ty()) + return "float64"; + if(ty->is_pointer_ty()) + return "Tensor"; + throw std::runtime_error("unknown type"); +} + + + +void gen_torch_signature(std::ostringstream& oss, + ir::function* fn, + const std::vector& outputs, + const std::string& name) { + const auto& args = fn->args(); + std::vector out_types; + for(const std::string& out: outputs) { + auto it = std::find_if(args.begin(), args.end(), + [&](ir::argument* arg) { return arg->get_name() == out; }); + if(it == args.end()) + throw std::runtime_error("unknown argument"); + out_types.push_back((*it)->get_type()); + } + + oss << "std::tuple<"; + for(size_t i = 0; i < out_types.size(); i++){ + if(i > 0) + oss << ", "; + oss << to_torch_ty(out_types[i]); + } + oss << "> "; + oss << name << "("; + oss << "int64 id" << std::endl; + for(size_t i = 0; i < args.size(); i++) { + ir::argument* arg = args[i]; + if(i > 0) + oss << ", "; + oss << to_torch_ty(arg->get_type()) << " " << 
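// [editor's note] the generated signature mirrors the Triton kernel's arguments,
// prefixed with an extra int64 "id"; gen_torch_make_launch_function uses that id
// to fetch the compiled kernel from id_fn_map and its launch grid from
// id_grid_map, which lets one registered Torch op dispatch to many compiled
// variants.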
arg->get_name(); + } + oss << ")"; +} + +void gen_torch_init_driver(std::ostringstream &oss) { + oss << " // Wrap CUDA handles" << std::endl; + oss << " c10::DeviceIndex device = torcha.storage().device().index();" << std::endl; + oss << " // Get stream" << std::endl; + oss << " CUstream custream = (CUstream)at::cuda::getCurrentCUDAStream(device).stream();" << std::endl; + oss << " triton::driver::cu_stream stream(custream, false);" << std::endl; + oss << " triton::driver::context* ctx = stream.context();" << std::endl; +} + +void gen_torch_make_handles(std::ostream &os, + const std::vector& args) { + for(unsigned i = 0; i < args.size(); i++){ + ir::argument *arg = args[i]; + if(!arg->get_type()->is_pointer_ty()) + continue; + const std::string& name = arg->get_name(); + os << " drv::cu_buffer cu_" + name + "(ctx, " + name + ".storage().size(), (CUdeviceptr)" + name + ".storage.data(), false);\n "; + } +} + +void gen_torch_make_launch_function(std::ostream &os, const std::vector& args) { + os << " (*id_fn_map.at(id))({"; + for(unsigned i = 0; i < args.size() ; i++){ + ir::argument *arg = args[i]; + std::string name = arg->get_name(); + if(arg->get_type()->is_pointer_ty()) + name = "&cu_" + name; + if(i > 0) + os << ", "; + os << name; + } + os << "}, *id_grid_map.at(id), stream); \n"; +} + + +std::tuple make_pytorch_src(const std::string& src, + const std::vector& outputs, + const runtime::function::options_space_t& opt) { + // triton-ir code-gen + ir::context ctx; + auto ir = std::shared_ptr(new ir::module("", ctx)); + make_module(src, &*ir, opt); + // function + ir::function* fn = ir->get_function_list().front(); + std::string name = fn->get_name(); + // generate framework code + std::ostringstream oss; + oss << R"( +#include "triton/driver/buffer.h" +#include "triton/driver/backend.h" +#include "triton/driver/stream.h" +#include "triton/runtime/function.h" + +namespace rt = triton::runtime; +namespace drv = triton::driver; + +extern std::map> id_grid_map; +extern std::map> id_fn_map; + +)"; + + gen_torch_signature(oss, fn, outputs, name); + oss << " {" << std::endl; + gen_torch_init_driver(oss); + gen_torch_make_handles(oss, fn->args()); + gen_torch_make_launch_function(oss, fn->args()); + oss << std::endl << "}"; + + oss << "static auto registry = torch::jit::RegisterOperators(\"triton::" << name << "\", &" << name << ");" << std::endl; +} + + typedef triton::runtime::function::options_t options_t; typedef triton::runtime::function::options_space_t options_space_t; @@ -307,6 +441,8 @@ PYBIND11_MODULE(libtriton, m) { m.def("make_tensorflow_src", &make_tensorflow_src, "Creates C++ source code for a custom Tensorflow op " "corresponding to the specified Triton kernel"); + m.def("make_pytorch_src", &make_pytorch_src, + "Creates C++ source code for a custom PyTorch op "); // bindings for triton classes pybind11::class_(m, "options") diff --git a/python/triton/ops.py b/python/triton/ops.py index 9b72aad0a..4e2225f3d 100644 --- a/python/triton/ops.py +++ b/python/triton/ops.py @@ -11,18 +11,60 @@ import setuptools.command.build_ext import setuptools # triton import libtriton -# frameworks -import tensorflow as tf -from tensorflow.python.framework import ops -extra_ops = tf.load_op_library('/home/philippe/development/triton/python/build/lib.linux-x86_64-3.6/libextra_tf_ops.so') +torch_id = 'torch' +tensorflow_id = 'tensorflow' + +torch = None +tensorflow = None +tf_extra_ops = None + +def _import_torch(): + global torch + if torch is None: + import torch + +def _import_tensorflow(): + global 
tensorflow + if tensorflow is None: + import tensorflow + +def _import_tf_extra_ops(): + global tf_extra_ops + if tf_extra_ops is None: + path = os.path.dirname(libtriton.__file__) + path = os.path.join(path, 'libextra_tf_ops.so') + _import_tensorflow() + tf_extra_ops = tensorflow.load_op_library(path) -def make_bindings(src, out, grid): - return libtriton.make_tensorflow_src(src, out, grid) +def _find_framework(default = None): + is_tf_imported = 'tensorflow' in sys.modules + is_torch_imported = 'torch' in sys.modules + if default: + if default not in [tensorflow_id, torch_id]: + raise ValueError('unsupported framework') + else: + return default + elif is_tf_imported and not is_torch_imported: + return tensorflow_id + elif is_torch_imported and not is_tf_imported: + return torch_id + else: + raise ValueError('cannot determine imported framework, ' + 'please provide framework argument') -def make_cache_path(src): + +def _make_framework_src(src, out, grid, framework): + if framework == tensorflow_id: + return libtriton.make_tensorflow_src(src, out, grid) + elif framework == torch_id: + return libtriton.make_torch_src(src, out, grid) + else: + assert False + +def _make_cache_path(src): md5 = hashlib.sha1(src.encode()) hexhash = md5.hexdigest() home = os.path.expanduser('~') @@ -32,10 +74,10 @@ def make_cache_path(src): os.makedirs(cachepath) return cachepath -def write_bindings(src, root): - cpp = os.path.join(root, 'tensorflow.cpp') +def _write_bindings(src, root, framework): + cpp = os.path.join(root, '{framework}.cpp'.format(framework=framework)) suffix = sysconfig.get_config_var('EXT_SUFFIX') - so = os.path.join(root, 'tensorflow{suffix}'.format(suffix=suffix)) + so = os.path.join(root, '{framework}{suffix}'.format(framework=framework, suffix=suffix)) recompile = False # recompile if .so does not exist if not os.path.exists(cpp) or not os.path.exists(so): @@ -50,18 +92,32 @@ def write_bindings(src, root): # return path of cpp file return (cpp, so) -def build(src, path): +def _build(src, path, framework): # include directories triton_include_dirs = ['/home/philippe/development/triton/include'] - tensorflow_include_dirs = [tf.sysconfig.get_include()] - cuda_include_dirs = ['/usr/local/cuda-10.1/targets/x86_64-linux/include/'] - include_dirs = triton_include_dirs + tensorflow_include_dirs + cuda_include_dirs + include_dirs = triton_include_dirs # library directories triton_library_dirs = [os.path.realpath(os.path.join(libtriton.__file__, os.path.pardir))] - tensorflow_library_dirs = [tf.sysconfig.get_lib()] - library_dirs = triton_library_dirs + tensorflow_library_dirs + library_dirs = triton_library_dirs # libraries - libraries = ['tensorflow_framework', 'triton'] + libraries = ['triton'] + # add framework + if framework == tensorflow_id: + _import_tensorflow() + library_dirs += [tensorflow.sysconfig.get_lib()] + include_dirs += [tensorflow.sysconfig.get_lib()] + libraries += ['tensorflow_framework'] + elif framework == torch_id: + _import_torch() + prefix = os.path.dirname(torch.__file__) + library_dirs += [os.path.join(prefix, 'lib')] + include_dirs += [os.path.join(prefix, 'lib', 'include'), + os.path.join(prefix, 'lib', 'include', 'torch', 'csrc', 'api', 'include'), + os.path.join(prefix, 'include'), + os.path.join(prefix, 'include', 'torch', 'csrc', 'api', 'include')] + libraries += ['torch'] + else: + assert False # extra arguments extra_compile_args = [] extra_link_args = [] @@ -93,25 +149,138 @@ def build(src, path): setuptools.setup(**args) shutil.rmtree(tmp) -def 
_cvt_to_def_str(obj): +def _cvt_to_def_str(obj, framework): + # bool if isinstance(obj, bool): return str(int(obj)) - if isinstance(obj, tf.DType): - return {tf.int8: 'char', - tf.int16: 'short', - tf.int32: 'int', - tf.int64: 'long', - tf.float16: 'half', - tf.float32: 'float', - tf.float64: 'double'}[obj] + # tensorflow type + if framework == tensorflow_id: + _import_tensorflow() + if isinstance(obj, tensorflow.DType): + return {tensorflow.int8: 'char', + tensorflow.int16: 'short', + tensorflow.int32: 'int', + tensorflow.int64: 'long', + tensorflow.float16: 'half', + tensorflow.float32: 'float', + tensorflow.float64: 'double'}[obj] + # torch type + elif framework == torch_id: + _import_torch() + if isinstance(obj, torch.dtype): + return {torch.int8: 'char', + torch.int16: 'short', + torch.int32: 'int', + torch.int64: 'long', + torch.float16: 'half', + torch.float32: 'float', + torch.float64: 'double'}[obj] + else: + assert False + # default return str(obj) +def _make_framework_op(src, outputs, options, framework): + src, name = _make_framework_src(src, outputs, options, framework) + cache_path = _make_cache_path(src) + cpp, so = _write_bindings(src, cache_path, framework) + _build(cpp, cache_path, framework) + if framework == tensorflow_id: + _import_tensorflow() + return tensorflow.load_op_library(so).__dict__[name] + elif framework == torch_id: + _import_torch() + torch.ops.load_library(so) + return torch.ops.triton.__dict__[name] + else: + assert False + +def _make_grid(args) : + scalars = [x for x in args[:-1] if isinstance(x, scalar)] + def grid(opt): + for x in scalars: + x.set_assume_initialized() + result = args[-1](opt) + for x in scalars: + x.unset_assume_initialized() + return result + return grid + +class op: + + def __init__(self, src, outputs, framework = None): + self.fw_id = dict() + self.fw_ops = dict() + self.fw_grids = dict() + self.src = src + self.outputs = outputs + self.framework = _find_framework(None) + + def __call__(self, *args, **kwargs): + # create a new op when defines are different + key = zip(kwargs.keys(), kwargs.values()) + if key not in self.fw_ops: + # code generation options + defines = [] + for k, v in kwargs.items(): + cvt = lambda x: _cvt_to_def_str(x, self.framework) + try: + values = list(map(cvt, v)) + except TypeError: + values = [cvt(v)] + defines.append((k, values)) + opt = libtriton.options_space() + opt.defines = defines + opt.num_warps = [1, 2, 4, 8] + # create unique id for this op + op_id = libtriton.make_op_id() + self.fw_id[key] = op_id + # register function + libtriton.register_fn(op_id, self.src, opt) + self.fw_ops[key] = _make_framework_op(self.src, self.outputs, opt, self.framework) + + # retrieve framework op + op_id = self.fw_id[key] + op = self.fw_ops[key] + # register grid + grid = _make_grid(args) + self.fw_grids[key] = grid + libtriton.register_grid(op_id, self.fw_grids[key]) + # create operands + op_args = [x.handle if isinstance(x, scalar) else x for x in args[:-1]] + # call framework op + return op(*op_args, id=op_id) + + +# class register_gradient: + +# def __init__(self, op): +# self.op = op + +# def __call__(self, f): +# name = 'Dot' +# ops.RegisterGradient(name)(f) + + +def empty(shapes, framework = None): + framework = _find_framework(framework) + if framework == tensorflow_id: + _import_tensorflow() + _import_tf_extra_ops + args = [x.handle if isinstance(x, scalar) else x for x in shapes] + args = tensorflow.stack(args) + return tf_extra_ops.alloc_empty(args) + elif framework == torch_id: + _import_torch() + return 
torch.empty(*shapes) + class scalar: def __init__(self, x): + _import_tf_extra_ops() self.id = libtriton.make_scalar_id() - self.handle = extra_ops.register_scalar(x, id=self.id) + self.handle = tf_extra_ops.register_scalar(x, id=self.id) self.assume_initialized = False def set_assume_initialized(self): @@ -174,83 +343,6 @@ class lazy_shape: return scalar(self.shape[key]) def shape(A) : - return lazy_shape(tf.shape(A)) + _import_tensorflow() + return lazy_shape(tensorflow.shape(A)) -def _make_tensorflow_op(src, outputs, options): - src, name = make_bindings(src, outputs, options) - cache_path = make_cache_path(src) - cpp, so = write_bindings(src, cache_path) - build(cpp, cache_path) - result = tf.load_op_library(so) - return result.__dict__[name] - -def _make_grid(args) : - scalars = [x for x in args[:-1] if isinstance(x, scalar)] - def grid(opt): - for x in scalars: - x.set_assume_initialized() - result = args[-1](opt) - for x in scalars: - x.unset_assume_initialized() - return result - return grid - -class op: - - def __init__(self, src, outputs): - self.fw_id = dict() - self.fw_ops = dict() - self.fw_grids = dict() - self.src = src - self.outputs = outputs - pass - - def __call__(self, *args, **kwargs): - # create a new op when defines are different - key = zip(kwargs.keys(), kwargs.values()) - if key not in self.fw_ops: - # code generation options - defines = [] - for k, v in kwargs.items(): - try: - values = list(map(_cvt_to_def_str, v)) - except TypeError: - values = [_cvt_to_def_str(v)] - defines.append((k, values)) - opt = libtriton.options_space() - opt.defines = defines - opt.num_warps = [1, 2, 4, 8] - # create unique id for this op - op_id = libtriton.make_op_id() - self.fw_id[key] = op_id - # register function - libtriton.register_fn(op_id, self.src, opt) - self.fw_ops[key] = _make_tensorflow_op(self.src, self.outputs, opt) - - # retrieve framework op - op_id = self.fw_id[key] - op = self.fw_ops[key] - # register grid - grid = _make_grid(args) - self.fw_grids[key] = grid - libtriton.register_grid(op_id, self.fw_grids[key]) - # create operands - op_args = [x.handle if isinstance(x, scalar) else x for x in args[:-1]] - # call framework op - return op(*op_args, id=op_id) - - -class register_gradient: - - def __init__(self, op): - self.op = op - - def __call__(self, f): - name = 'Dot' - ops.RegisterGradient(name)(f) - - -def empty(shapes): - args = [x.handle if isinstance(x, scalar) else x for x in shapes] - args = tf.stack(args) - return extra_ops.alloc_empty(args) From 7e0af2118c503f5372549ab79bdaf79519fe6bf3 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 29 Aug 2019 21:34:23 -0700 Subject: [PATCH 338/494] [codegen] worked around bug seemingly from nvptx/ptxas by simplifying multiplications by 1: - Generated LLVM-IR looked correct - Illegal addressing disappeared when running cuda-memcheck - Illegal addressing disappeared when using nvptx-short-pointer --- include/triton/codegen/transform/peephole.h | 1 + lib/codegen/analysis/tune.cc | 2 +- lib/codegen/selection/selection.cc | 28 +++---- lib/codegen/transform/peephole.cc | 29 ++++++- lib/driver/module.cc | 31 ++++---- lib/ir/print.cc | 9 ++- lib/runtime/function.cc | 1 + python/examples/dot.py | 84 ++++++++++----------- python/triton/ops.py | 5 +- 9 files changed, 108 insertions(+), 82 deletions(-) diff --git a/include/triton/codegen/transform/peephole.h b/include/triton/codegen/transform/peephole.h index 691f8d0bd..9382b968d 100644 --- a/include/triton/codegen/transform/peephole.h +++ 
b/include/triton/codegen/transform/peephole.h @@ -23,6 +23,7 @@ private: bool rewrite_dot_fp32(ir::dot_inst *dot, ir::builder& builder, bool trans_a, bool trans_b, ir::value *A, ir::value *B, ir::value *D); bool rewrite_dot_hmma(ir::dot_inst *dot, ir::builder& builder, bool trans_a, bool trans_b, ir::value *A, ir::value *B, ir::value *D); bool rewrite_dot(ir::instruction *value, ir::builder& builder); + bool rewrite_mult(ir::instruction *value, ir::builder& builder); bool rewrite_unit_red(ir::instruction *value, ir::builder& builder); bool rewrite_gep_ptr_min_off_plus_off(ir::instruction *value, ir::builder& builder); diff --git a/lib/codegen/analysis/tune.cc b/lib/codegen/analysis/tune.cc index fdb3741cc..7c40788ce 100644 --- a/lib/codegen/analysis/tune.cc +++ b/lib/codegen/analysis/tune.cc @@ -95,7 +95,7 @@ void grids::init_c_graph(ir::instruction *v) { } // Splat else if(dynamic_cast(v)){ - + return; } // Trans else if(auto *x = dynamic_cast(v)){ diff --git a/lib/codegen/selection/selection.cc b/lib/codegen/selection/selection.cc index b04209f61..61aa73853 100644 --- a/lib/codegen/selection/selection.cc +++ b/lib/codegen/selection/selection.cc @@ -469,21 +469,21 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::function(inst)){ - Value *ptr = value(ii->get_operand(0)); - Value *val = value(ii->get_operand(1)); - Value *atom_f_add = nullptr; - if(val->getType()->isFloatTy()) - atom_f_add = Intrinsic::getDeclaration(builder.GetInsertBlock()->getModule(), Intrinsic::nvvm_atomic_load_add_f32, {ptr->getType()}); - else if(val->getType()->isHalfTy()){ - Type *fp16 = Type::getHalfTy(ctx); +// Value *ptr = value(ii->get_operand(0)); +// Value *val = value(ii->get_operand(1)); +// Value *atom_f_add = nullptr; +// if(val->getType()->isFloatTy()) +// atom_f_add = Intrinsic::getDeclaration(builder.GetInsertBlock()->getModule(), Intrinsic::nvvm_atomic_load_add_f32, {ptr->getType()}); +// else if(val->getType()->isHalfTy()){ +// Type *fp16 = Type::getHalfTy(ctx); - FunctionType *atom_ty = FunctionType::get(fp16, {fp16->getPointerTo(), fp16}, false); - atom_f_add = InlineAsm::get(atom_ty, " atom.relaxed.global.gpu.add.noftz.f16 $0, [$1], $2;", "=h,l,h", true); - } - if(atom_f_add == nullptr) - throw std::runtime_error("unsupported atomic add"); - Value *res = builder.CreateCall(atom_f_add, {ptr, val}); - return (Instruction*)res; +// FunctionType *atom_ty = FunctionType::get(fp16, {fp16->getPointerTo(), fp16}, false); +// atom_f_add = InlineAsm::get(atom_ty, " atom.relaxed.global.gpu.add.noftz.f16 $0, [$1], $2;", "=h,l,h", true); +// } +// if(atom_f_add == nullptr) + throw std::runtime_error("unsupported"); +// Value *res = builder.CreateCall(atom_f_add, {ptr, val}); +// return (Instruction*)res; } if(ir::sqrt_inst* ii = dynamic_cast(inst)){ Value *val = value(ii->get_operand(0)); diff --git a/lib/codegen/transform/peephole.cc b/lib/codegen/transform/peephole.cc index 853bed1b2..cfe1c8721 100644 --- a/lib/codegen/transform/peephole.cc +++ b/lib/codegen/transform/peephole.cc @@ -169,6 +169,7 @@ bool peephole::rewrite_dot(ir::instruction *value, ir::builder& builder){ return false; ir::value *a = dot->get_operand(0); ir::value *b = dot->get_operand(1); + builder.set_insert_point(add); ir::value * new_dot = builder.insert(ir::dot_inst::create(a, b, other, dot->is_a_trans(), dot->is_b_trans(), dot->get_name())); @@ -212,6 +213,30 @@ bool peephole::rewrite_unit_red(ir::instruction *value, ir::builder& builder){ return false; } +bool peephole::rewrite_mult(ir::instruction *value, ir::builder& 
builder) { + auto binop = dynamic_cast(value); + if(binop && binop->get_op() == ir::binary_op_t::Mul) { + ir::value *lhs = binop->get_operand(0); + ir::value *rhs = binop->get_operand(1); + ir::constant_int *_1_lhs = nullptr; + if(ir::splat_inst *splat = dynamic_cast(lhs)) + _1_lhs = dynamic_cast(splat->get_operand(0)); + ir::constant_int *_1_rhs = nullptr; + if(ir::splat_inst *splat = dynamic_cast(rhs)) + _1_rhs = dynamic_cast(splat->get_operand(0)); + if(_1_lhs){ + binop->replace_all_uses_with(rhs); + return true; + } + else if(_1_rhs){ + binop->replace_all_uses_with(lhs); + return true; + } + } + return false; +} + + bool peephole::rewrite_gep_ptr_min_off_plus_off(ir::instruction *value, ir::builder& builder) { auto x = dynamic_cast(value); if(!x) @@ -250,8 +275,9 @@ void peephole::run(ir::module &mod) { if(seen.find(i) != seen.end()) continue; bool was_modified = rewrite_dot(i, builder); - if(was_modified) + if(was_modified){ seen.insert(i); + } } }while(seen.size() != n_seen); @@ -265,6 +291,7 @@ void peephole::run(ir::module &mod) { if(seen.find(i) != seen.end()) continue; bool was_modified = false; + was_modified = was_modified || rewrite_mult(i, builder); was_modified = was_modified || rewrite_trans_phi(i, builder); was_modified = was_modified || rewrite_unit_red(i, builder); was_modified = was_modified || rewrite_gep_ptr_min_off_plus_off(i, builder); diff --git a/lib/driver/module.cc b/lib/driver/module.cc index 96a7c0f08..5a9bfc86f 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -218,29 +218,24 @@ ocl_module::ocl_module(driver::context * context, llvm::Module* src): module(con /* ------------------------ */ std::string cu_module::compile_llvm_module(llvm::Module* module) { - // set data layout - std::string layout = "e"; - bool is_64bit = true; - bool use_short_pointers = true; - if (!is_64bit) - layout += "-p:32:32"; - else if (use_short_pointers) - layout += "-p3:32:32-p4:32:32-p5:32:32"; - layout += "-i64:64-i128:128-v16:16-v32:32-n16:32:64"; - // create - llvm::SmallVector buffer; - module::compile_llvm_module(module, "nvptx64-nvidia-cuda", "sm_70", layout, buffer, "", Assembly); - std::string result(buffer.begin(), buffer.end()); - size_t start_replace = result.find(".version"); - size_t end_replace = result.find('\n', start_replace); - assert(start_replace != std::string::npos); - result.replace(start_replace, end_replace - start_replace, ".version 6.4"); - return result; + // options + auto options = llvm::cl::getRegisteredOptions(); + static_cast*>(options["nvptx-short-ptr"])->setValue(true); + // create + llvm::SmallVector buffer; + module::compile_llvm_module(module, "nvptx64-nvidia-cuda", "sm_70", "", buffer, "", Assembly); + std::string result(buffer.begin(), buffer.end()); + size_t start_replace = result.find(".version"); + size_t end_replace = result.find('\n', start_replace); + assert(start_replace != std::string::npos); + result.replace(start_replace, end_replace - start_replace, ".version 6.4"); + return result; } cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ +// std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/lib/ir/print.cc b/lib/ir/print.cc index cf5e706e4..9b66305b8 100644 --- 
a/lib/ir/print.cc +++ b/lib/ir/print.cc @@ -49,8 +49,13 @@ void print(module &mod, std::ostream& os) { size_t num_ops = inst->get_num_operands(); if(num_ops > 0) os << " ";; - for(unsigned i = 0; i < num_ops; i++) - os << get_name(ops[i], cnt++) << (i < num_ops - 1?", ":""); + for(unsigned i = 0; i < num_ops; i++){ + if(auto *x = dynamic_cast(ops[i])) + os << x->get_value(); + else + os << get_name(ops[i], cnt++); + os << (i < num_ops - 1?", ":""); + } os << ";" << std::endl; } os << std::endl; diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 5c93eb452..9b2072974 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -217,6 +217,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c dce.run(module); vectorize.run(module); dce.run(module); +// ir::print(module, std::cout); // generate llvm code llvm::LLVMContext ctx; std::unique_ptr llvm(new llvm::Module(module.get_name(), ctx)); diff --git a/python/examples/dot.py b/python/examples/dot.py index f41c702b3..1eb7867af 100644 --- a/python/examples/dot.py +++ b/python/examples/dot.py @@ -1,23 +1,43 @@ -import triton import tensorflow as tf +import triton import numpy as np src = """ #if AT == 1 #define USEA ^a +#define STRIDE_AK lda +#define STRIDE_AM 1 +#define BROADCAST_AK :, newaxis +#define BROADCAST_AM newaxis, : +#define SHAPE_A TK, TM #else #define USEA a +#define STRIDE_AK 1 +#define STRIDE_AM lda +#define BROADCAST_AK newaxis, : +#define BROADCAST_AM :, newaxis +#define SHAPE_A TM, TK #endif #if BT == 1 #define USEB ^b +#define STRIDE_BK 1 +#define STRIDE_BN ldb +#define BROADCAST_BK newaxis, : +#define BROADCAST_BN :, newaxis +#define SHAPE_B TN, TK #else #define USEB b +#define STRIDE_BK ldb +#define STRIDE_BN 1 +#define BROADCAST_BK :, newaxis +#define BROADCAST_BN newaxis, : +#define SHAPE_B TK, TN #endif -void dot(TYPE * A __noalias __readonly __aligned(16), - TYPE * B __noalias __readonly __aligned(16), - TYPE * C __noalias __readonly __aligned(16), +void dot(TYPE * A, + TYPE * B, + TYPE * C, int M, int N, int K, int lda __multipleof(8), int ldb __multipleof(8), @@ -31,42 +51,20 @@ void dot(TYPE * A __noalias __readonly __aligned(16), int rka[TK] = 0 ... TK; int rkb[TK] = 0 ... TK; float xc[TM, TN] = 0; - - /* pointers for A */ -#if AT == 1 - TYPE* pa[TK, TM] = A + rka[:, newaxis]*lda + rxa[newaxis, :]; - TYPE a[TK, TM] = *pa; -#else - TYPE* pa[TM, TK] = A + rka[newaxis, :] + rxa[:, newaxis]*lda; - TYPE a[TM, TK] = *pa; -#endif - - /* pointers for B */ -#if BT == 1 - TYPE* pb[TN, TK] = B + rkb[newaxis, :] + ryb[:, newaxis]*ldb; - TYPE b[TN, TK] = *pb; -#else - TYPE* pb[TK, TN] = B + rkb[:, newaxis]*ldb + ryb[newaxis, :]; - TYPE b[TK, TN] = *pb; -#endif - + /* pointers for operands */ + TYPE* pa[SHAPE_A] = A + rka[BROADCAST_AK] * STRIDE_AK + rxa[BROADCAST_AM] * STRIDE_AM; + TYPE* pb[SHAPE_B] = B + rkb[BROADCAST_BK] * STRIDE_BK + ryb[BROADCAST_BN] * STRIDE_BN; + /* prefetches operands */ + TYPE a[SHAPE_A] = *pa; + TYPE b[SHAPE_B] = *pb; /* reduction loop */ for(int k = K; k > 0; k = k - TK){ xc = USEA @ USEB + xc; -#if AT == 1 - pa = pa + TK*lda; -#else - pa = pa + TK; -#endif -#if BT == 1 - pb = pb + TK; -#else - pb = pb + TK*ldb; -#endif + pa = pa + TK * STRIDE_AK; + pb = pb + TK * STRIDE_BK; a = *pa; b = *pb; } - /* epilogue */ int rxc[TM] = ridx * TM + (0 ... TM); int ryc[TN] = ridy * TN + (0 ... 
TN); @@ -75,7 +73,7 @@ void dot(TYPE * A __noalias __readonly __aligned(16), bool checkc0[TM] = rxc < M; bool checkc1[TN] = ryc < N; bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - *?(checkc) pc = c; + *pc = c; } """ @@ -112,10 +110,12 @@ class dot_op: AT = self.trans_a, BT = self.trans_b, TYPE = tf.float16, TM = [128], TN = [ 128], TK = [32]) -dot_nt = dot_op(False, True) -dot_nn = dot_op(False, False) -dot_tn = dot_op(True, False) -dot_tt = dot_op(True, True) + +def dot(a, b, trans_a = False, trans_b = False): + if (trans_a, trans_b) not in dot.ops: + dot.ops[trans_a, trans_b] = dot_op(trans_a, trans_b) + return dot.ops[trans_a, trans_b](a, b) +dot.ops = dict() # @triton.register_gradient(dot_op) # def _dot_grad(op, dy): @@ -127,9 +127,7 @@ def run_dot(): M, N, K = 128, 128, 128 a = tf.placeholder(tf.float16, shape=[M, K]) b = tf.placeholder(tf.float16, shape=[N, K]) - # c = tf.matmul(a, b, transpose_a=True) - c = dot_nt(a, b) - # grads = tf.gradients(c, [a]) + c = dot(a, b, trans_a = False, trans_b = True) # Reference ha = np.random.rand(M, K).astype(np.float16) hb = np.random.rand(K, N).astype(np.float16) @@ -142,8 +140,6 @@ def run_dot(): hresult = np.dot(ha, hb.T) dif = np.abs(result - hresult) np.savetxt('dif.dat', dif, '%2.4f') - print(hresult) - print(result) print("dif: %f" % np.max(dif)) run_dot() \ No newline at end of file diff --git a/python/triton/ops.py b/python/triton/ops.py index 4e2225f3d..f0b1ed86b 100644 --- a/python/triton/ops.py +++ b/python/triton/ops.py @@ -105,7 +105,8 @@ def _build(src, path, framework): if framework == tensorflow_id: _import_tensorflow() library_dirs += [tensorflow.sysconfig.get_lib()] - include_dirs += [tensorflow.sysconfig.get_lib()] + include_dirs += [tensorflow.sysconfig.get_include()] + include_dirs += ['/usr/local/cuda/include/'] libraries += ['tensorflow_framework'] elif framework == torch_id: _import_torch() @@ -215,7 +216,7 @@ class op: self.fw_grids = dict() self.src = src self.outputs = outputs - self.framework = _find_framework(None) + self.framework = _find_framework(framework) def __call__(self, *args, **kwargs): # create a new op when defines are different From 5db3a7adfebcb220e510600a2c6fa0d9261e447f Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 30 Aug 2019 17:05:03 -0700 Subject: [PATCH 339/494] [python][examples] some more cleaning of dot product example --- lib/runtime/function.cc | 2 +- python/examples/dot.py | 32 +++++++++++++------------------- 2 files changed, 14 insertions(+), 20 deletions(-) diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 9b2072974..54d6af4c1 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -217,7 +217,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c dce.run(module); vectorize.run(module); dce.run(module); -// ir::print(module, std::cout); + ir::print(module, std::cout); // generate llvm code llvm::LLVMContext ctx; std::unique_ptr llvm(new llvm::Module(module.get_name(), ctx)); diff --git a/python/examples/dot.py b/python/examples/dot.py index 1eb7867af..ffb93fd33 100644 --- a/python/examples/dot.py +++ b/python/examples/dot.py @@ -35,45 +35,39 @@ src = """ #define SHAPE_B TK, TN #endif -void dot(TYPE * A, - TYPE * B, - TYPE * C, +void dot(TYPE * A, TYPE * B, TYPE * C, int M, int N, int K, int lda __multipleof(8), int ldb __multipleof(8), int ldc) { - - /* prologue */ + // prologue int ridx = get_program_id(0); int ridy = get_program_id(1); int rxa[TM] = ridx * TM + 0 ... TM; int ryb[TN] = ridy * TN + 0 ... 
TN; int rka[TK] = 0 ... TK; int rkb[TK] = 0 ... TK; - float xc[TM, TN] = 0; - /* pointers for operands */ + float c[TM, TN] = 0; + // pointers to operands TYPE* pa[SHAPE_A] = A + rka[BROADCAST_AK] * STRIDE_AK + rxa[BROADCAST_AM] * STRIDE_AM; TYPE* pb[SHAPE_B] = B + rkb[BROADCAST_BK] * STRIDE_BK + ryb[BROADCAST_BN] * STRIDE_BN; - /* prefetches operands */ + // prefetches operands TYPE a[SHAPE_A] = *pa; TYPE b[SHAPE_B] = *pb; - /* reduction loop */ - for(int k = K; k > 0; k = k - TK){ - xc = USEA @ USEB + xc; + // reduction loop + for(int k = K; k > 0; k-= TK){ + c += USEA @ USEB; pa = pa + TK * STRIDE_AK; pb = pb + TK * STRIDE_BK; a = *pa; b = *pb; } - /* epilogue */ - int rxc[TM] = ridx * TM + (0 ... TM); - int ryc[TN] = ridy * TN + (0 ... TN); + // epilogue + int rxc[TM] = ridx * TM + 0 ... TM; + int ryc[TN] = ridy * TN + 0 ... TN; TYPE* pc[TM, TN] = C + ryc[newaxis, :] + rxc[:, newaxis] * ldc; - TYPE c[TM, TN] = xc; - bool checkc0[TM] = rxc < M; - bool checkc1[TN] = ryc < N; - bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - *pc = c; + bool checkc[TM, TN] = (rxc < M)[:, newaxis] && (ryc < N)[newaxis, :]; + *?(checkc) pc = c; } """ From 2d4ddab4d09032161875d89f5de022b8de304239 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 30 Aug 2019 18:02:33 -0700 Subject: [PATCH 340/494] [ir][print] improved pretty-printing of constants and instructions --- include/triton/ir/constant.h | 33 +++++++++------------------ include/triton/ir/context_impl.h | 3 --- include/triton/ir/instructions.h | 10 ++++----- include/triton/ir/type.h | 38 ++++++++++++++++++++++++++++++++ lib/ir/builder.cc | 18 ++++----------- lib/ir/constant.cc | 35 ----------------------------- lib/ir/instructions.cc | 12 ---------- lib/ir/print.cc | 7 +++--- 8 files changed, 61 insertions(+), 95 deletions(-) diff --git a/include/triton/ir/constant.h b/include/triton/ir/constant.h index 6e177a47f..23be73256 100644 --- a/include/triton/ir/constant.h +++ b/include/triton/ir/constant.h @@ -21,6 +21,7 @@ protected: public: static constant* get_all_ones_value(type *ty); static constant* get_null_value(type *ty); + virtual std::string repr() const = 0; }; /* Undef value */ @@ -30,6 +31,7 @@ private: public: static undef_value* get(type* ty); + std::string repr() const { return "undef"; } }; @@ -40,8 +42,8 @@ protected: public: virtual uint64_t get_value() const { return value_; } - virtual std::string repr() const { return std::to_string(get_value()); } static constant_int *get(type *ty, uint64_t value); + std::string repr() const { return std::to_string(value_); } protected: uint64_t value_; @@ -66,28 +68,6 @@ private: bool has_value_; }; -class constant_expression: public constant_int { - typedef binary_op_t op_t; - -private: - constant_expression(op_t op, constant_int* lhs, constant_int* rhs); - -public: - uint64_t get_value() const; - // Wraps - void set_has_no_unsigned_wrap(bool b = true) { has_no_unsigned_wrap_ = b; } - void set_has_no_signed_wrap(bool b = true) { has_no_signed_wrap_ = b; } - // Factory - static constant_expression *create(op_t op, constant_int* lhs, constant_int* rhs); - -private: - op_t op_; - constant_int* lhs_; - constant_int* rhs_; - bool has_no_unsigned_wrap_; - bool has_no_signed_wrap_; -}; - /* constant range */ class constant_range: public constant{ constant_range(type *ty, constant_int* first, constant_int* last); @@ -96,6 +76,7 @@ public: static constant *get(constant_int *first, constant_int *last); const constant_int* get_first() const; const constant_int* get_last() const; + 
std::string repr() const { return first_->repr() + " ... " + last_->repr(); } private: constant_int* first_; @@ -112,6 +93,7 @@ public: static constant* get_zero_value_for_negation(type *ty); static constant* get(context &ctx, double v); static constant* get(type *ty, double v); + std::string repr() const { return std::to_string(value_); } private: double value_; @@ -128,6 +110,7 @@ public: global_value(type *ty, unsigned num_ops, linkage_types_t linkage, const std::string &name, unsigned addr_space); + std::string repr() const { return get_name(); } private: linkage_types_t linkage_; @@ -139,6 +122,8 @@ public: global_object(type *ty, unsigned num_ops, linkage_types_t linkage, const std::string &name, unsigned addr_space = 0); + std::string repr() const { return get_name(); } + }; /* global variable */ @@ -146,6 +131,8 @@ class alloc_const: public global_object { public: alloc_const(type *ty, constant_int *size, const std::string &name = ""); + std::string repr() const { return get_name(); } + }; } diff --git a/include/triton/ir/context_impl.h b/include/triton/ir/context_impl.h index df26796c6..5995de0d4 100644 --- a/include/triton/ir/context_impl.h +++ b/include/triton/ir/context_impl.h @@ -11,7 +11,6 @@ namespace ir{ class context; class constant; -class constant_expression; class constant_int; class constant_fp; class undef_value; @@ -39,8 +38,6 @@ public: std::map uv_constants_; // Metaparameters std::vector mp_constants_; - // Expr constants - std::map, constant_expression*> expr_constants_; }; } diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index 5c6af5362..f0a345c81 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -309,7 +309,7 @@ public: // ternary class ternary_inst: public instruction { private: - std::string repr_impl() const { return "ternary"; } + std::string repr_impl() const { return "cond"; } ternary_inst(value *cond, value *true_value, value *false_value, const std::string &name, instruction *next); @@ -438,7 +438,6 @@ public: class retile_inst: public unary_inst { protected: retile_inst(value *arg, const type::tile_shapes_t &shapes, const std::string &name, instruction *next); - static std::string shape_suffix(ir::type* ty); }; // reshape @@ -446,7 +445,7 @@ protected: class reshape_inst: public retile_inst { private: using retile_inst::retile_inst; - std::string repr_impl() const { return "reshape" + shape_suffix(get_type()); } + std::string repr_impl() const { return "reshape"; } public: static instruction* create(value *arg, const type::tile_shapes_t &shape_suffix, @@ -458,7 +457,7 @@ public: class splat_inst: public retile_inst { private: using retile_inst::retile_inst; - std::string repr_impl() const { return "splat" + shape_suffix(get_type()); } + std::string repr_impl() const { return "splat"; } public: static instruction* create(value *arg, const type::tile_shapes_t &shape_suffix, @@ -470,7 +469,7 @@ public: class broadcast_inst: public retile_inst { private: using retile_inst::retile_inst; - std::string repr_impl() const { return "broadcast" + shape_suffix(get_type()); } + std::string repr_impl() const { return "broadcast"; } public: static instruction* create(value *arg, const type::tile_shapes_t &shape_suffix, @@ -688,6 +687,7 @@ private: public: static nv_static_program_idx *get(constant_range* range); constant_range* get_range() const; + std::string repr() const { return get_name(); } private: constant_range *range_; diff --git a/include/triton/ir/type.h b/include/triton/ir/type.h 
index 60d2d9691..aee2ecc42 100644 --- a/include/triton/ir/type.h +++ b/include/triton/ir/type.h @@ -3,7 +3,9 @@ #ifndef _TRITON_IR_TYPE_H_ #define _TRITON_IR_TYPE_H_ +#include #include +#include namespace triton{ namespace ir{ @@ -102,6 +104,42 @@ public: static integer_type *get_int64_ty(context &ctx); static integer_type *get_int128_ty(context &ctx); + // repr + std::string tile_repr() const { + std::string res = get_tile_element_ty()->repr(); + auto shapes = get_tile_shapes(); + res += "<"; + for(size_t i = 0; i < shapes.size(); i++){ + if(i > 0) + res += ", "; + res += std::to_string(shapes[i]); + } + res+= ">"; + return res; + } + + std::string repr() const { + switch(id_) { + case VoidTyID: return "void"; + case HalfTyID: return "f16"; + case FloatTyID: return "f32"; + case DoubleTyID: return "f64"; + case X86_FP80TyID: return "f80"; + case FP128TyID: return "f128"; + case PPC_FP128TyID: return "ppcf128"; + case LabelTyID: return "label"; + case MetadataTyID: return "md"; + case TokenTyID: return "tok"; + case IntegerTyID: return "i" + std::to_string(get_integer_bitwidth()); + case FunctionTyID: return "fn"; + case PointerTyID: return get_pointer_element_ty()->repr() + "*"; + case StructTyID: return "struct"; + case TileTyID: return tile_repr(); + default: break; + } + assert(false); + return ""; + }; private: context &ctx_; diff --git a/lib/ir/builder.cc b/lib/ir/builder.cc index b4ff3c5b2..458365a60 100644 --- a/lib/ir/builder.cc +++ b/lib/ir/builder.cc @@ -148,20 +148,10 @@ DEFINE_UNARY_FLOAT(fneg) value* builder::create_insert_nuwnswb_binop(binary_op_t op, value *lhs, value *rhs, const std::string &name, bool has_nuw, bool has_nsw) { - auto *clhs = dynamic_cast(lhs); - auto *crhs = dynamic_cast(rhs); - if(clhs && crhs){ - constant_expression* result = constant_expression::create(op, clhs, crhs); - if (has_nuw) result->set_has_no_unsigned_wrap(); - if (has_nsw) result->set_has_no_signed_wrap(); - return result; - } - else { - binary_operator* result = insert(binary_operator::create(op, lhs, rhs), name); - if (has_nuw) result->set_has_no_unsigned_wrap(); - if (has_nsw) result->set_has_no_signed_wrap(); - return result; - } + binary_operator* result = insert(binary_operator::create(op, lhs, rhs), name); + if (has_nuw) result->set_has_no_unsigned_wrap(); + if (has_nsw) result->set_has_no_signed_wrap(); + return result; } #define DEFINE_NOWRAP_BINARY(SUFFIX, OPCODE)\ diff --git a/lib/ir/constant.cc b/lib/ir/constant.cc index 6df5a58cd..9ff8d6e72 100644 --- a/lib/ir/constant.cc +++ b/lib/ir/constant.cc @@ -120,41 +120,6 @@ metaparameter* metaparameter::create(context &ctx, type *ty, const std::vectorget_type(), 0), - op_(op), lhs_(lhs), rhs_(rhs) { } - - -constant_expression *constant_expression::create(op_t op, constant_int* lhs, constant_int* rhs) { - context_impl *impl = lhs->get_type()->get_context().p_impl.get(); - constant_expression *& result = impl->expr_constants_[std::make_tuple((int)op, lhs, rhs)]; - if(!result) - result = new constant_expression(op, lhs, rhs); - return result; -} - -uint64_t constant_expression::get_value() const { - uint64_t lhs = lhs_->get_value(); - uint64_t rhs = rhs_->get_value(); - switch(op_) { - case op_t::Add : return lhs + rhs; - case op_t::Sub : return lhs - rhs; - case op_t::Mul : return lhs * rhs; - case op_t::UDiv : return lhs / rhs; - case op_t::SDiv : return lhs / rhs; - case op_t::URem : return lhs % rhs; - case op_t::SRem : return lhs % rhs; - case op_t::Shl : return lhs << rhs; - case op_t::LShr : return lhs >> rhs; - case op_t::AShr : 
return lhs >> rhs; - case op_t::And : return lhs && rhs; - case op_t::Or : return lhs || rhs; - case op_t::Xor : return lhs ^ rhs; - default: throw std::runtime_error("unsupported constexpr binary operator"); - } -} - // undef value undef_value::undef_value(type *ty) diff --git a/lib/ir/instructions.cc b/lib/ir/instructions.cc index 3d911b967..e7e5de1f2 100644 --- a/lib/ir/instructions.cc +++ b/lib/ir/instructions.cc @@ -482,18 +482,6 @@ masked_store_inst* masked_store_inst::create(value *ptr, value *val, value *mask // retile_inst classes //===----------------------------------------------------------------------===// -std::string retile_inst::shape_suffix(ir::type* ty){ - std::string res = "["; - const auto& shapes = ty->get_tile_shapes(); - for(unsigned i = 0; i < shapes.size(); i++){ - res += std::to_string(ty->get_tile_shapes()[i]); - if(i < shapes.size() - 1) - res += ", "; - } - res += "]"; - return res; -} - retile_inst::retile_inst(value *arg, const type::tile_shapes_t &shapes, const std::string &name, instruction *next) : unary_inst(tile_type::get(arg->get_type()->get_scalar_ty(), shapes), arg, name, next) { } diff --git a/lib/ir/print.cc b/lib/ir/print.cc index 9b66305b8..31cc15d9a 100644 --- a/lib/ir/print.cc +++ b/lib/ir/print.cc @@ -44,14 +44,15 @@ void print(module &mod, std::ostream& os) { else os << " = "; } - os << inst->repr(); + ir::type* type = inst->get_type(); + os << inst->repr() << " " << type->repr(); ir::instruction::ops_t ops = inst->ops(); size_t num_ops = inst->get_num_operands(); if(num_ops > 0) os << " ";; for(unsigned i = 0; i < num_ops; i++){ - if(auto *x = dynamic_cast(ops[i])) - os << x->get_value(); + if(auto *x = dynamic_cast(ops[i])) + os << x->repr(); else os << get_name(ops[i], cnt++); os << (i < num_ops - 1?", ":""); From 90d80c3b2e49efb5d8513b360bfbcf7f6689ef1d Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 1 Sep 2019 16:30:53 -0400 Subject: [PATCH 341/494] [codegen][selection] bugfix in scanline dot lowering --- lib/codegen/analysis/tune.cc | 1 - lib/codegen/selection/selection.cc | 2 +- lib/runtime/function.cc | 2 +- tests/bench/dot.cc | 30 +++++------ tests/common/src/dot.h | 87 +++++++++++++++--------------- 5 files changed, 59 insertions(+), 63 deletions(-) diff --git a/lib/codegen/analysis/tune.cc b/lib/codegen/analysis/tune.cc index 7c40788ce..275011a7b 100644 --- a/lib/codegen/analysis/tune.cc +++ b/lib/codegen/analysis/tune.cc @@ -308,7 +308,6 @@ void grids::run(ir::module &mod) { std::string str_d = std::to_string(d); effective_num_threads *= params_.at(i).at("mts.d" + str_d)->get_value(); } - if(num_threads != effective_num_threads) throw std::runtime_error("cannot create a kernel with this amount of warps"); } diff --git a/lib/codegen/selection/selection.cc b/lib/codegen/selection/selection.cc index 61aa73853..a44a4c926 100644 --- a/lib/codegen/selection/selection.cc +++ b/lib/codegen/selection/selection.cc @@ -1209,7 +1209,7 @@ void selection::lower_scanline_dot(ir::dot_inst *dot, LLVMContext &ctx, Function TA->set_vector_size(TC->axis(0).contiguous); TB->set_vector_size(TC->axis(1).contiguous); TC->for_each([&](indices_t idx){ - Value *res = TC->get_value(idx); + Value *res = TD->get_value(idx); for(unsigned K = 0; K < NK; ++K){ // input indices indices_t a_idx = {idx[0], builder.getInt32(K)}; diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 54d6af4c1..9b2072974 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -217,7 +217,7 @@ std::unique_ptr function::make_bin(ir::module &module, 
driver::c dce.run(module); vectorize.run(module); dce.run(module); - ir::print(module, std::cout); +// ir::print(module, std::cout); // generate llvm code llvm::LLVMContext ctx; std::unique_ptr llvm(new llvm::Module(module.get_name(), ctx)); diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index 469f47e43..cb678ff99 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -27,8 +27,8 @@ inline rt::function::grid_fn_ty grid(size_t M, size_t N) { std::vector do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K){ - typedef half_float::half NumericT; - std::string ty = "half"; + typedef float NumericT; + std::string ty = "float"; size_t dt_nbytes = sizeof(NumericT); drv::context* context = stream->context(); // leading dimensions @@ -46,25 +46,25 @@ std::vector do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, i opt.defines.push_back({"AT", {""}}); if(BT) opt.defines.push_back({"BT", {""}}); - opt.defines.push_back({"TM", {"64", "128"}}); - opt.defines.push_back({"TN", {"128"}}); - opt.defines.push_back({"TK", {"32"}}); + opt.defines.push_back({"TM", {"64"}}); + opt.defines.push_back({"TN", {"64"}}); + opt.defines.push_back({"TK", {"8"}}); opt.num_warps = {4}; // create function rt::function function(src::dot, opt); // benchmark available libraries std::vector result; auto tflops = [&](double nanosec) { return 2.*M*N*K / nanosec * 1e-3; }; - // cublas - if(cublas::cublasinit()){ - NumericT alpha(static_cast(1)); - NumericT beta(static_cast(0)); - cublasGemmAlgo_t fastest; - cublasGemm(CUDA_R_16F, stream, AT, BT, M, N, K, &alpha, &*da, lda, &*db, ldb, &beta, &*dc, ldc, &fastest); - double cublas_ms = triton::tools::bench([&]() { cublasGemm(CUDA_R_16F, stream, AT, BT, M, N, K, - &alpha, &*da, lda, &*db, ldb, &beta, &*dc, ldc, nullptr, fastest); }, stream); - result.push_back(tflops(cublas_ms)); - } +// // cublas +// if(cublas::cublasinit()){ +// NumericT alpha(static_cast(1)); +// NumericT beta(static_cast(0)); +// cublasGemmAlgo_t fastest; +// cublasGemm(CUDA_R_16F, stream, AT, BT, M, N, K, &alpha, &*da, lda, &*db, ldb, &beta, &*dc, ldc, &fastest); +// double cublas_ms = triton::tools::bench([&]() { cublasGemm(CUDA_R_16F, stream, AT, BT, M, N, K, +// &alpha, &*da, lda, &*db, ldb, &beta, &*dc, ldc, nullptr, fastest); }, stream); +// result.push_back(tflops(cublas_ms)); +// } // triton double triton_ms = triton::tools::bench([&]() { function({&*da, &*db, &*dc, M, N, K, lda, ldb, ldc}, grid(M, N), stream);}, stream); result.push_back(tflops(triton_ms)); diff --git a/tests/common/src/dot.h b/tests/common/src/dot.h index 993a1d260..3e636e18a 100644 --- a/tests/common/src/dot.h +++ b/tests/common/src/dot.h @@ -4,74 +4,71 @@ namespace src { R"( #ifdef AT #define USEA ^a +#define STRIDE_AK lda +#define STRIDE_AM 1 +#define BROADCAST_AK :, newaxis +#define BROADCAST_AM newaxis, : +#define SHAPE_A TK, TM #else #define USEA a +#define STRIDE_AK 1 +#define STRIDE_AM lda +#define BROADCAST_AK newaxis, : +#define BROADCAST_AM :, newaxis +#define SHAPE_A TM, TK #endif #ifdef BT #define USEB ^b +#define STRIDE_BK 1 +#define STRIDE_BN ldb +#define BROADCAST_BK newaxis, : +#define BROADCAST_BN :, newaxis +#define SHAPE_B TN, TK #else #define USEB b +#define STRIDE_BK ldb +#define STRIDE_BN 1 +#define BROADCAST_BK :, newaxis +#define BROADCAST_BN newaxis, : +#define SHAPE_B TK, TN #endif -void dot(TYPE * A __noalias __readonly __aligned(16), - TYPE * B __noalias __readonly __aligned(16), - TYPE * C __noalias __readonly __aligned(16), +void dot(TYPE * A, TYPE * B, TYPE * 
C, int M, int N, int K, int lda __multipleof(8), int ldb __multipleof(8), int ldc) { + // prologue int ridx = get_program_id(0); int ridy = get_program_id(1); int rxa[TM] = ridx * TM + 0 ... TM; int ryb[TN] = ridy * TN + 0 ... TN; int rka[TK] = 0 ... TK; int rkb[TK] = 0 ... TK; - float xc[TM, TN] = 0; -#ifdef AT - TYPE* pa[TK, TM] = A + rka[:, newaxis] + rxa[newaxis, :]*lda; - bool checka[TK, TM] = rka[:, newaxis] < TK; - TYPE a[TK, TM] = checka ? *pa : 0; -#else - TYPE* pa[TM, TK] = A + rka[newaxis, :]*lda + rxa[:, newaxis]; - bool checka[TM, TK] = rka[newaxis, :] < TK; - TYPE a[TM, TK] = checka ? *pa : 0; -#endif -#ifdef BT - TYPE* pb[TN, TK] = B + rkb[newaxis, :]*ldb + ryb[:, newaxis]; - bool checkb[TN, TK] = rkb[newaxis, :] < TK; - TYPE b[TN, TK] = checkb ? *pb : 0; -#else - TYPE* pb[TK, TN] = B + rkb[:, newaxis] + ryb[newaxis, :]*ldb; - bool checkb[TK, TN] = rkb[:, newaxis] < TK; - TYPE b[TK, TN] = checkb ? *pb : 0; -#endif - for(int k = K; k > 0; k = k - TK){ - xc = USEA @ USEB + xc; -#ifdef AT - pa = pa + TK; -#else - pa = pa + TK*lda; -#endif -#ifdef BT - pb = pb + TK*ldb; -#else - pb = pb + TK; -#endif - checka = k > TK; - checkb = k > TK; - a = checka ? *pa : 0; - b = checkb ? *pb : 0; + float c[TM, TN] = 0; + // pointers to operands + TYPE* pa[SHAPE_A] = A + rka[BROADCAST_AK] * STRIDE_AK + rxa[BROADCAST_AM] * STRIDE_AM; + TYPE* pb[SHAPE_B] = B + rkb[BROADCAST_BK] * STRIDE_BK + ryb[BROADCAST_BN] * STRIDE_BN; + // prefetches operands + TYPE a[SHAPE_A] = *pa; + TYPE b[SHAPE_B] = *pb; + // reduction loop + for(int k = K; k > 0; k-= TK){ + c += USEA @ USEB; + pa = pa + TK * STRIDE_AK; + pb = pb + TK * STRIDE_BK; + a = *pa; + b = *pb; } - int rxc[TM] = ridx * TM + (0 ... TM); - int ryc[TN] = ridy * TN + (0 ... TN); - TYPE* pc[TM, TN] = C + ryc[newaxis, :]*ldc + rxc[:, newaxis]; - TYPE c[TM, TN] = xc; - bool checkc0[TM] = rxc < M; - bool checkc1[TN] = ryc < N; - bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; + // epilogue + int rxc[TM] = ridx * TM + 0 ... TM; + int ryc[TN] = ridy * TN + 0 ... 
TN; + TYPE* pc[TM, TN] = C + ryc[newaxis, :] + rxc[:, newaxis] * ldc; + bool checkc[TM, TN] = (rxc < M)[:, newaxis] && (ryc < N)[newaxis, :]; *?(checkc) pc = c; } + )"; } From a842d337c561e8a0cbd2433311bffe1a7eeea8c1 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 2 Sep 2019 23:00:49 -0400 Subject: [PATCH 342/494] [general] various cleaning and bugfix: * added copy1d and copy2d benchmark * fixed issue in reassociation pass --- .../codegen/analysis/{alignment.h => align.h} | 2 +- .../codegen/analysis/{tune.h => grid.h} | 0 .../codegen/analysis/{shmem => }/liveness.h | 8 ++-- .../{shmem/allocation.h => memalloc.h} | 11 ++--- .../analysis/{shmem/info.h => meminfo.h} | 4 +- .../codegen/{selection => }/selection.h | 20 ++++------ .../triton/codegen/{selection => }/target.h | 4 ++ .../transform/{shmem/barriers.h => membar.h} | 14 +++---- .../triton/codegen/transform/reassociate.h | 6 +-- include/triton/ir/function.h | 13 ++++++ include/triton/ir/instructions.h | 2 +- include/triton/runtime/function.h | 16 ++++---- include/triton/tools/bench.hpp | 4 +- .../analysis/{alignment.cc => align.cc} | 40 +++++++++++++------ lib/codegen/analysis/{tune.cc => grid.cc} | 4 +- lib/codegen/analysis/{shmem => }/liveness.cc | 6 +-- .../{shmem/allocation.cc => memalloc.cc} | 16 ++++---- .../analysis/{shmem/info.cc => meminfo.cc} | 16 ++++---- lib/codegen/{selection => }/selection.cc | 32 ++++++++------- lib/codegen/{selection => }/target.cc | 2 +- .../{shmem/barriers.cc => membar.cc} | 28 ++++++------- lib/codegen/transform/reassociate.cc | 30 ++++++++++++-- lib/codegen/transform/vectorize.cc | 2 +- lib/driver/device.cc | 2 +- lib/driver/module.cc | 8 ++-- lib/ir/module.cc | 1 + lib/ir/print.cc | 12 ++++++ lib/lang/code_gen.cc | 7 +++- lib/runtime/function.cc | 27 ++++++++----- tests/bench/CMakeLists.txt | 2 +- tests/bench/dot.cc | 38 ++++++++---------- tests/common/cuda/forward.h | 12 +++--- tests/common/src/dot.h | 30 +++++++------- tests/common/util.h | 23 +++++++++-- tests/unit/CMakeLists.txt | 2 +- tests/unit/dot.cc | 12 +++--- 36 files changed, 265 insertions(+), 191 deletions(-) rename include/triton/codegen/analysis/{alignment.h => align.h} (97%) rename include/triton/codegen/analysis/{tune.h => grid.h} (100%) rename include/triton/codegen/analysis/{shmem => }/liveness.h (93%) rename include/triton/codegen/analysis/{shmem/allocation.h => memalloc.h} (86%) rename include/triton/codegen/analysis/{shmem/info.h => meminfo.h} (95%) rename include/triton/codegen/{selection => }/selection.h (94%) rename include/triton/codegen/{selection => }/target.h (94%) rename include/triton/codegen/transform/{shmem/barriers.h => membar.h} (79%) rename lib/codegen/analysis/{alignment.cc => align.cc} (89%) rename lib/codegen/analysis/{tune.cc => grid.cc} (99%) rename lib/codegen/analysis/{shmem => }/liveness.cc (89%) rename lib/codegen/analysis/{shmem/allocation.cc => memalloc.cc} (93%) rename lib/codegen/analysis/{shmem/info.cc => meminfo.cc} (91%) rename lib/codegen/{selection => }/selection.cc (99%) rename lib/codegen/{selection => }/target.cc (99%) rename lib/codegen/transform/{shmem/barriers.cc => membar.cc} (81%) diff --git a/include/triton/codegen/analysis/alignment.h b/include/triton/codegen/analysis/align.h similarity index 97% rename from include/triton/codegen/analysis/alignment.h rename to include/triton/codegen/analysis/align.h index 6ef3c0f55..6812314b7 100644 --- a/include/triton/codegen/analysis/alignment.h +++ b/include/triton/codegen/analysis/align.h @@ -13,7 +13,7 @@ namespace ir { namespace 
codegen{ namespace analysis{ -class alignment_info { +class align { struct cst_info { unsigned num_cst; unsigned value; diff --git a/include/triton/codegen/analysis/tune.h b/include/triton/codegen/analysis/grid.h similarity index 100% rename from include/triton/codegen/analysis/tune.h rename to include/triton/codegen/analysis/grid.h diff --git a/include/triton/codegen/analysis/shmem/liveness.h b/include/triton/codegen/analysis/liveness.h similarity index 93% rename from include/triton/codegen/analysis/shmem/liveness.h rename to include/triton/codegen/analysis/liveness.h index bec0303c0..4aa0c6dae 100644 --- a/include/triton/codegen/analysis/shmem/liveness.h +++ b/include/triton/codegen/analysis/liveness.h @@ -13,11 +13,10 @@ namespace ir{ namespace codegen{ namespace analysis{ -namespace shmem{ typedef unsigned slot_index; -class info; +class meminfo; struct segment { slot_index start; @@ -45,7 +44,7 @@ public: public: // constructor - liveness(info *info): info_(info){ } + liveness(meminfo *info): info_(info){ } // accessors const intervals_map_t& intervals() const { return intervals_; } @@ -55,7 +54,7 @@ public: void run(ir::module &mod); private: - info *info_; + meminfo *info_; has_storage_map_t has_dedicated_storage_; indices_map_t indices_; intervals_map_t intervals_; @@ -64,7 +63,6 @@ private: } } } -} #endif diff --git a/include/triton/codegen/analysis/shmem/allocation.h b/include/triton/codegen/analysis/memalloc.h similarity index 86% rename from include/triton/codegen/analysis/shmem/allocation.h rename to include/triton/codegen/analysis/memalloc.h index 243d78352..0e5b2adc9 100644 --- a/include/triton/codegen/analysis/shmem/allocation.h +++ b/include/triton/codegen/analysis/memalloc.h @@ -17,14 +17,12 @@ namespace analysis{ class grids; -namespace shmem{ - class liveness; -class info; +class meminfo; -class allocation { +class memalloc { public: - allocation(liveness *live, info *buffer_info, grids *params) + memalloc(liveness *live, meminfo *buffer_info, grids *params) : liveness_(live), buffer_info_(buffer_info), params_(params){ } // utilities @@ -44,13 +42,12 @@ private: size_t allocated_size_; // dependences liveness *liveness_; - info *buffer_info_; + meminfo *buffer_info_; grids *params_; }; } } } -} #endif diff --git a/include/triton/codegen/analysis/shmem/info.h b/include/triton/codegen/analysis/meminfo.h similarity index 95% rename from include/triton/codegen/analysis/shmem/info.h rename to include/triton/codegen/analysis/meminfo.h index 689516cb2..1b896056f 100644 --- a/include/triton/codegen/analysis/shmem/info.h +++ b/include/triton/codegen/analysis/meminfo.h @@ -15,9 +15,8 @@ namespace ir { namespace codegen{ namespace analysis{ -namespace shmem{ -class info { +class meminfo { public: void run(ir::module &mod); // queries @@ -38,6 +37,5 @@ private: } } } -} #endif diff --git a/include/triton/codegen/selection/selection.h b/include/triton/codegen/selection.h similarity index 94% rename from include/triton/codegen/selection/selection.h rename to include/triton/codegen/selection.h index 2610fefc3..0a5d84825 100644 --- a/include/triton/codegen/selection/selection.h +++ b/include/triton/codegen/selection.h @@ -5,7 +5,7 @@ #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/type.h" -#include "triton/codegen/analysis/shmem/info.h" +#include "triton/codegen/analysis/meminfo.h" namespace llvm{ @@ -45,14 +45,10 @@ namespace codegen{ namespace analysis{ class grids; -class alignment_info; +class align; +class memalloc; +class meminfo; -namespace 
shmem{ - -class allocation; -class info; - -} } class target; @@ -196,7 +192,7 @@ private: public: - selection(analysis::shmem::allocation *alloc, analysis::grids *params, analysis::shmem::info *buffer_info, analysis::alignment_info *alignment, target *tgt) + selection(analysis::memalloc *alloc, analysis::grids *params, analysis::meminfo *buffer_info, analysis::align *alignment, target *tgt) : alloc_(alloc), params_(params), buffer_info_(buffer_info), alignment_(alignment), tgt_(tgt){ } void run(ir::module &src, Module &dst); @@ -204,10 +200,10 @@ public: private: vmap_t vmap_; tmap_t tmap_; - analysis::shmem::allocation *alloc_; + analysis::memalloc *alloc_; analysis::grids *params_; - analysis::shmem::info *buffer_info_; - analysis::alignment_info *alignment_; + analysis::meminfo *buffer_info_; + analysis::align *alignment_; target *tgt_; std::map axes_; Value *sh_mem_ptr_; diff --git a/include/triton/codegen/selection/target.h b/include/triton/codegen/target.h similarity index 94% rename from include/triton/codegen/selection/target.h rename to include/triton/codegen/target.h index f5f8e9a7c..dc379bd0c 100644 --- a/include/triton/codegen/selection/target.h +++ b/include/triton/codegen/target.h @@ -46,6 +46,7 @@ public: virtual Value* get_local_id(Module *module, Builder& builder, unsigned ax) = 0; virtual Value* get_block_id(Module *module, Builder& builder, unsigned ax) = 0; virtual Value* get_num_blocks(Module *module, Builder& builder, unsigned ax) = 0; + virtual unsigned guaranteed_alignment() = 0; bool is_gpu() const; private: @@ -62,6 +63,7 @@ public: Value* get_local_id(Module *module, Builder& builder, unsigned ax); Value* get_block_id(Module *module, Builder& builder, unsigned ax); Value* get_num_blocks(Module *module, Builder& builder, unsigned ax); + unsigned guaranteed_alignment() { return 16; } }; class nvidia_cu_target: public target { @@ -74,6 +76,7 @@ public: Value* get_local_id(Module *module, Builder& builder, unsigned ax); Value* get_block_id(Module *module, Builder& builder, unsigned ax); Value* get_num_blocks(Module *module, Builder& builder, unsigned ax); + unsigned guaranteed_alignment() { return 16; } }; class cpu_target: public target { @@ -86,6 +89,7 @@ public: Value* get_local_id(Module *module, Builder& builder, unsigned ax); Value* get_block_id(Module *module, Builder& builder, unsigned ax); Value* get_num_blocks(Module *module, Builder& builder, unsigned ax); + unsigned guaranteed_alignment() { return 1; } }; } diff --git a/include/triton/codegen/transform/shmem/barriers.h b/include/triton/codegen/transform/membar.h similarity index 79% rename from include/triton/codegen/transform/shmem/barriers.h rename to include/triton/codegen/transform/membar.h index 6352fd060..8991ac57d 100644 --- a/include/triton/codegen/transform/shmem/barriers.h +++ b/include/triton/codegen/transform/membar.h @@ -14,17 +14,15 @@ namespace ir { namespace codegen{ namespace analysis{ -namespace shmem{ -class allocation; -class info; +class memalloc; +class meminfo; -} } namespace transform{ -class shmem_barriers { +class membar { private: typedef std::pair interval_t; typedef std::vector interval_vec_t; @@ -40,12 +38,12 @@ private: std::pair transfer(ir::basic_block *block, const interval_vec_t &written_to, const interval_vec_t &read_from, std::set &insert_loc); public: - shmem_barriers(analysis::shmem::allocation *alloc, analysis::shmem::info *buffer_info): alloc_(alloc), buffer_info_(buffer_info) {} + membar(analysis::memalloc *alloc, analysis::meminfo *buffer_info): alloc_(alloc), 
buffer_info_(buffer_info) {} void run(ir::module &mod); private: - analysis::shmem::allocation *alloc_; - analysis::shmem::info *buffer_info_; + analysis::memalloc *alloc_; + analysis::meminfo *buffer_info_; }; diff --git a/include/triton/codegen/transform/reassociate.h b/include/triton/codegen/transform/reassociate.h index 075446e6f..318884755 100644 --- a/include/triton/codegen/transform/reassociate.h +++ b/include/triton/codegen/transform/reassociate.h @@ -20,7 +20,7 @@ namespace codegen{ namespace analysis{ class grids; -class alignment_info; +class align; } namespace transform{ @@ -37,12 +37,12 @@ private: ir::value *reassociate_ptr(ir::getelementptr_inst* pz, ir::builder &builder, std::map &offsets); public: - reassociate(analysis::alignment_info* align, analysis::grids *params); + reassociate(analysis::align* align, analysis::grids *params); void run(ir::module& module); private: analysis::grids* params_; - analysis::alignment_info* align_; + analysis::align* align_; }; } diff --git a/include/triton/ir/function.h b/include/triton/ir/function.h index 4a7c308eb..74af3abe2 100644 --- a/include/triton/ir/function.h +++ b/include/triton/ir/function.h @@ -61,6 +61,19 @@ public: return kind_ != multiple_of; } + std::string repr() const { + switch(kind_){ + case readonly: return ".readonly"; + case writeonly: return ".writeonly"; + case noalias: return ".noalias"; + case aligned: return ".aligned(" + std::to_string(value_) + ")"; + case multiple_of: return ".readonly"; + default: break; + } + assert(false); + return ""; + } + private: attribute_kind_t kind_; unsigned value_; diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index f0a345c81..a4fbc3710 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -687,7 +687,7 @@ private: public: static nv_static_program_idx *get(constant_range* range); constant_range* get_range() const; - std::string repr() const { return get_name(); } + std::string repr() const { return "nv_static_program_idx"; } private: constant_range *range_; diff --git a/include/triton/runtime/function.h b/include/triton/runtime/function.h index b0054c647..96ec35ef7 100644 --- a/include/triton/runtime/function.h +++ b/include/triton/runtime/function.h @@ -9,16 +9,16 @@ #include #include // codegen -#include "triton/codegen/selection/selection.h" -#include "triton/codegen/selection/target.h" -#include "triton/codegen/analysis/tune.h" -#include "triton/codegen/analysis/shmem/allocation.h" -#include "triton/codegen/analysis/shmem/liveness.h" -#include "triton/codegen/analysis/shmem/info.h" -#include "triton/codegen/analysis/alignment.h" +#include "triton/codegen/selection.h" +#include "triton/codegen/target.h" +#include "triton/codegen/analysis/grid.h" +#include "triton/codegen/analysis/memalloc.h" +#include "triton/codegen/analysis/liveness.h" +#include "triton/codegen/analysis/meminfo.h" +#include "triton/codegen/analysis/align.h" #include "triton/codegen/transform/dce.h" #include "triton/codegen/transform/peephole.h" -#include "triton/codegen/transform/shmem/barriers.h" +#include "triton/codegen/transform/membar.h" #include "triton/codegen/transform/reassociate.h" #include "triton/codegen/transform/vectorize.h" #include "triton/lang/parser.h" diff --git a/include/triton/tools/bench.hpp b/include/triton/tools/bench.hpp index 56016638b..554b3bcc3 100644 --- a/include/triton/tools/bench.hpp +++ b/include/triton/tools/bench.hpp @@ -41,8 +41,8 @@ inline double bench(std::function const & op, driver::stream * stream) 
while(total_time*1e-9 < 1e-3){ float norm = 1; // normalize clock if possible to reduce noise in auto-tuning -// if(auto cu_device = dynamic_cast(stream->context()->device())) -// norm = (float)cu_device->current_sm_clock()/cu_device->max_sm_clock(); + if(auto cu_device = dynamic_cast(stream->context()->device())) + norm = (float)cu_device->current_sm_clock()/cu_device->max_sm_clock(); tmr.start(); op(); stream->synchronize(); diff --git a/lib/codegen/analysis/alignment.cc b/lib/codegen/analysis/align.cc similarity index 89% rename from lib/codegen/analysis/alignment.cc rename to lib/codegen/analysis/align.cc index 98d4a110f..85500aefb 100644 --- a/lib/codegen/analysis/alignment.cc +++ b/lib/codegen/analysis/align.cc @@ -1,4 +1,4 @@ -#include "triton/codegen/analysis/alignment.h" +#include "triton/codegen/analysis/align.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" @@ -29,14 +29,14 @@ inline T add_to_cache(ir::value *i, T value, std::map &map) { } -bool alignment_info::is_first_axis_unit(ir::value *x){ +bool align::is_first_axis_unit(ir::value *x){ if(x->get_type()->is_tile_ty()) return x->get_type()->get_tile_shapes()[0] == 1; else return true; } -alignment_info::cst_info alignment_info::populate_is_constant(ir::value *v) { +align::cst_info align::populate_is_constant(ir::value *v) { if(is_constant_.find(v) != is_constant_.end()) return is_constant_.at(v); // helper for the cache @@ -102,7 +102,7 @@ alignment_info::cst_info alignment_info::populate_is_constant(ir::value *v) { return cache({1, 0}); } -unsigned alignment_info::populate_max_contiguous(ir::value *v){ +unsigned align::populate_max_contiguous(ir::value *v){ if(max_contiguous_.find(v) != max_contiguous_.end()) return max_contiguous_.at(v); // helper for the cache @@ -181,7 +181,7 @@ unsigned alignment_info::populate_max_contiguous(ir::value *v){ return cache(1); } -unsigned alignment_info::populate_starting_multiple(ir::value *v){ +unsigned align::populate_starting_multiple(ir::value *v){ if(starting_multiple_.find(v) != starting_multiple_.end()) return starting_multiple_.at(v); auto cache = [this,v](unsigned value){ @@ -240,7 +240,19 @@ unsigned alignment_info::populate_starting_multiple(ir::value *v){ int rhs = populate_starting_multiple(x->get_operand(1)); return cache(gcd(lhs, rhs)); } - if(auto *x = dynamic_cast(v)){ + if(auto *x = dynamic_cast(v)){ + int op = populate_starting_multiple(x->get_operand(0)); + return cache(op); + } + if(auto *x = dynamic_cast(v)){ + int op = populate_starting_multiple(x->get_operand(0)); + auto shapes = x->get_type()->get_tile_shapes(); + if(shapes[0] == 1) + return cache(1); + else + return cache(op); + } + if(auto *x = dynamic_cast(v)){ int op = populate_starting_multiple(x->get_operand(0)); return cache(op); } @@ -271,22 +283,22 @@ unsigned alignment_info::populate_starting_multiple(ir::value *v){ return cache(result); } -unsigned alignment_info::get_starting_multiple(ir::value* v) const { +unsigned align::get_starting_multiple(ir::value* v) const { return starting_multiple_.at(v); } -unsigned alignment_info::get_max_contiguous(ir::value* v) const { +unsigned align::get_max_contiguous(ir::value* v) const { return max_contiguous_.at(v); } -void alignment_info::copy(ir::value *dst, ir::value *src) { +void align::copy(ir::value *dst, ir::value *src) { starting_multiple_[dst] = starting_multiple_[src]; max_contiguous_[dst] = max_contiguous_[src]; is_constant_[dst] = is_constant_[src]; } ///TODO: This doesn't seem to work in DOT-NN, DOT-TT, 
DOT-TN -void alignment_info::run(ir::module &mod) { +void align::run(ir::module &mod) { // populate constant for(ir::function *fn: mod.get_function_list()) for(ir::basic_block *block: fn->blocks()) @@ -304,9 +316,13 @@ void alignment_info::run(ir::module &mod) { // populate maximum contiguous for(ir::function *fn: mod.get_function_list()) for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *i: block->get_inst_list()){ + for(ir::instruction *i: block->get_inst_list()) populate_max_contiguous(i); - } + +// for(ir::function *fn: mod.get_function_list()) +// for(ir::basic_block *block: fn->blocks()) +// for(ir::instruction *i: block->get_inst_list()) +// std::cout << i->get_name() << " " << max_contiguous_.at(i) << " " << is_constant_.at(i).num_cst << " " << starting_multiple_.at(i) << std::endl; } diff --git a/lib/codegen/analysis/tune.cc b/lib/codegen/analysis/grid.cc similarity index 99% rename from lib/codegen/analysis/tune.cc rename to lib/codegen/analysis/grid.cc index 275011a7b..f90ab8822 100644 --- a/lib/codegen/analysis/tune.cc +++ b/lib/codegen/analysis/grid.cc @@ -1,6 +1,6 @@ #include #include -#include "triton/codegen/analysis/tune.h" +#include "triton/codegen/analysis/grid.h" #include "triton/ir/instructions.h" #include "triton/ir/type.h" #include "triton/ir/module.h" @@ -292,7 +292,7 @@ void grids::run(ir::module &mod) { else{ unsigned shape = shapes[0]; unsigned current = num_threads; - params_.at(i).at("nts.d0")->set_value(clamp(size / num_threads, 1, 8)); + params_.at(i).at("nts.d0")->set_value(clamp(size / num_threads, 1, 4)); params_.at(i).at("mts.d0")->set_value(clamp(current, 1, shape / params_.at(i).at("nts.d0")->get_value())); current = current / params_.at(i).at("mts.d0")->get_value(); for(size_t d = 1; d < shapes.size(); d++){ diff --git a/lib/codegen/analysis/shmem/liveness.cc b/lib/codegen/analysis/liveness.cc similarity index 89% rename from lib/codegen/analysis/shmem/liveness.cc rename to lib/codegen/analysis/liveness.cc index 617a764ed..8801235b5 100644 --- a/lib/codegen/analysis/shmem/liveness.cc +++ b/lib/codegen/analysis/liveness.cc @@ -1,5 +1,5 @@ -#include "triton/codegen/analysis/shmem/liveness.h" -#include "triton/codegen/analysis/shmem/info.h" +#include "triton/codegen/analysis/liveness.h" +#include "triton/codegen/analysis/meminfo.h" #include "triton/ir/basic_block.h" #include "triton/ir/function.h" #include "triton/ir/module.h" @@ -9,7 +9,6 @@ namespace triton{ namespace codegen{ namespace analysis{ -namespace shmem{ // Entry point void liveness::run(ir::module &mod) { @@ -41,4 +40,3 @@ void liveness::run(ir::module &mod) { } } } -} diff --git a/lib/codegen/analysis/shmem/allocation.cc b/lib/codegen/analysis/memalloc.cc similarity index 93% rename from lib/codegen/analysis/shmem/allocation.cc rename to lib/codegen/analysis/memalloc.cc index 1061c0425..5f8a4d70b 100644 --- a/lib/codegen/analysis/shmem/allocation.cc +++ b/lib/codegen/analysis/memalloc.cc @@ -1,8 +1,8 @@ #include -#include "triton/codegen/analysis/shmem/allocation.h" -#include "triton/codegen/analysis/shmem/liveness.h" -#include "triton/codegen/analysis/shmem/info.h" -#include "triton/codegen/analysis/tune.h" +#include "triton/codegen/analysis/memalloc.h" +#include "triton/codegen/analysis/liveness.h" +#include "triton/codegen/analysis/meminfo.h" +#include "triton/codegen/analysis/grid.h" #include "triton/ir/basic_block.h" #include "triton/ir/type.h" #include "triton/ir/value.h" @@ -12,9 +12,8 @@ namespace triton{ namespace codegen{ namespace analysis{ -namespace shmem{ 
-unsigned allocation::is_ld_padded(ir::value *x) { +unsigned memalloc::is_ld_padded(ir::value *x) { if(auto *trans = dynamic_cast(x)){ if(trans->get_perm()[0]->get_value() != 0) return 4; @@ -46,7 +45,7 @@ unsigned allocation::is_ld_padded(ir::value *x) { return 0; } -unsigned allocation::get_num_bytes(ir::value *x) { +unsigned memalloc::get_num_bytes(ir::value *x) { if(auto *red = dynamic_cast(x)){ unsigned num_bytes = x->get_type()->get_scalar_ty()->get_primitive_size_in_bits() / 8; size_t axis = red->get_axis(); @@ -74,7 +73,7 @@ unsigned allocation::get_num_bytes(ir::value *x) { return num_bytes; } -void allocation::run(){ +void memalloc::run(){ using std::max; using std::min; typedef std::multimap triples_map_type; @@ -178,4 +177,3 @@ void allocation::run(){ } } } -} diff --git a/lib/codegen/analysis/shmem/info.cc b/lib/codegen/analysis/meminfo.cc similarity index 91% rename from lib/codegen/analysis/shmem/info.cc rename to lib/codegen/analysis/meminfo.cc index d16048d3b..d0b075603 100644 --- a/lib/codegen/analysis/shmem/info.cc +++ b/lib/codegen/analysis/meminfo.cc @@ -1,5 +1,5 @@ #include -#include "triton/codegen/analysis/shmem/info.h" +#include "triton/codegen/analysis/meminfo.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" @@ -10,10 +10,9 @@ namespace triton { namespace codegen{ namespace analysis{ -namespace shmem{ // run pass on module -bool info::is_loop_latch(ir::phi_node *phi, ir::instruction *terminator){ +bool meminfo::is_loop_latch(ir::phi_node *phi, ir::instruction *terminator){ if(phi->get_parent() != terminator->get_parent()) return false; if(auto *br = dynamic_cast(terminator)) @@ -25,7 +24,7 @@ bool info::is_loop_latch(ir::phi_node *phi, ir::instruction *terminator){ throw std::runtime_error("unreachable"); } -void info::replace(ir::value* before, ir::value *after) { +void meminfo::replace(ir::value* before, ir::value *after) { shared_.erase(before); shared_.insert(after); if(refs_.find(before) != refs_.end()){ @@ -72,7 +71,7 @@ void add_copy(ir::value *x, ir::builder &builder) { } } -void info::run(ir::module &mod) { +void meminfo::run(ir::module &mod) { // Add shared copies for(ir::function *fn: mod.get_function_list()){ ir::builder builder(mod.get_context()); @@ -122,15 +121,15 @@ void info::run(ir::module &mod) { } // query double-buffered status -bool info::is_double(ir::value *x) +bool meminfo::is_double(ir::value *x) { return double_.find(x) != double_.end(); } // query shared status -bool info::is_shared(ir::value *x) +bool meminfo::is_shared(ir::value *x) { return shared_.find(x) != shared_.end(); } // get reference if any -ir::value *info::get_reference(ir::value *x) +ir::value *meminfo::get_reference(ir::value *x) { return refs_[x]; } @@ -138,4 +137,3 @@ ir::value *info::get_reference(ir::value *x) } } } -} diff --git a/lib/codegen/selection/selection.cc b/lib/codegen/selection.cc similarity index 99% rename from lib/codegen/selection/selection.cc rename to lib/codegen/selection.cc index a44a4c926..ff246f4f5 100644 --- a/lib/codegen/selection/selection.cc +++ b/lib/codegen/selection.cc @@ -1,8 +1,8 @@ -#include "triton/codegen/selection/selection.h" -#include "triton/codegen/analysis/tune.h" -#include "triton/codegen/analysis/shmem/allocation.h" -#include "triton/codegen/selection/target.h" -#include "triton/codegen/analysis/alignment.h" +#include "triton/codegen/selection.h" +#include "triton/codegen/target.h" +#include "triton/codegen/analysis/grid.h" +#include "triton/codegen/analysis/memalloc.h" 
+#include "triton/codegen/analysis/align.h" #include "triton/ir/context.h" #include "triton/ir/module.h" #include "triton/ir/function.h" @@ -1304,10 +1304,7 @@ void selection::lower_masked_load(ir::masked_load_inst *x, LLVMContext &ctx, Fun unsigned id = linear / vector_size; if(linear % vector_size == 0) { Value *ptr = pointers->get_value(idx); -// ConstantInt *cst = nullptr; -// if(GetElementPtrInst *gep = dyn_cast(ptr)) -// if(gep->getNumIndices() == 1) -// cst = dyn_cast(gep->idx_begin()); + ptr = builder.CreateBitCast(ptr, PointerType::get(VectorType::get(result->get_ty(), vector_size), ptr->getType()->getPointerAddressSpace())); @@ -1326,23 +1323,28 @@ void selection::lower_masked_load(ir::masked_load_inst *x, LLVMContext &ctx, Fun ((PHINode*)current_result)->addIncoming(result_then, mask_then_bb); Value *result_false = false_values->get_value(idx); if(result_then->getType()->isVectorTy()) - result_false = builder.CreateVectorSplat(vector_size, result_false); + result_false = builder.CreateVectorSplat(vector_size, llvm::UndefValue::get(result_false->getType())); ((PHINode*)current_result)->addIncoming(result_false, current_bb); } else current_result = result_then; +// ConstantInt *cst = nullptr; +// if(GetElementPtrInst *gep = dyn_cast(ptr)) +// if(gep->getNumIndices() == 1) +// cst = dyn_cast(gep->idx_begin()); +// llvm::Value* mask = masks->get_value(idx); // std::string offset = ""; // if(cst) // offset = " + " + std::to_string(cst->getValue().getSExtValue()*2*vector_size); // Type *fp16x2_ty = VectorType::get(builder.getHalfTy(), 2); // Type *fp16x2_pack4_ty = StructType::get(ctx, {fp16x2_ty, fp16x2_ty, fp16x2_ty, fp16x2_ty}); // FunctionType *ty = FunctionType::get(fp16x2_pack4_ty, {mask->getType(), ptr->getType()}, false); -// std::string asm_str = "@$0 ld.global.nc.v4.b32 {$1, $2, $3, $4}, [$5" + offset + "];"; -// if(false_value) +// std::string asm_str = "@$0 ld.global.nc.b32 {$1, $2, $3, $4}, [$5" + offset + "];"; +// if(false_values) // asm_str += "\n\t@!$0 mov.v4.b32 {$1, $2, $3, $4}, {0, 0, 0, 0};"; // InlineAsm *iasm = InlineAsm::get(ty, asm_str, "b,=r,=r,=r,=r,l", true); -// Value *result = builder.CreateCall(iasm, {mask, ptr}); +// Value *current_result = builder.CreateCall(iasm, {mask, ptr}); packets[id] = current_result; } @@ -1499,9 +1501,11 @@ void selection::run(ir::module &src, Module &dst) { for(auto attr_pair: fn->attrs()){ unsigned id = attr_pair.first; for(ir::attribute attr: attr_pair.second) - if(attr.is_llvm_attr()) + if(attr.is_llvm_attr()){ dst_fn->addAttribute(id, llvm_attr(dst_ctx, attr)); + } } + tgt_->set_kernel(dst_builder, dst_ctx, &dst, dst_fn); // set metadata Metadata *md_args[] = { diff --git a/lib/codegen/selection/target.cc b/lib/codegen/target.cc similarity index 99% rename from lib/codegen/selection/target.cc rename to lib/codegen/target.cc index 3a5e35aa1..4116bcca7 100644 --- a/lib/codegen/selection/target.cc +++ b/lib/codegen/target.cc @@ -1,4 +1,4 @@ -#include "triton/codegen/selection/target.h" +#include "triton/codegen/target.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Function.h" #include "llvm/IR/Intrinsics.h" diff --git a/lib/codegen/transform/shmem/barriers.cc b/lib/codegen/transform/membar.cc similarity index 81% rename from lib/codegen/transform/shmem/barriers.cc rename to lib/codegen/transform/membar.cc index 6b66ab148..007263543 100644 --- a/lib/codegen/transform/shmem/barriers.cc +++ b/lib/codegen/transform/membar.cc @@ -2,9 +2,9 @@ #include #include -#include "triton/codegen/transform/shmem/barriers.h" -#include 
"triton/codegen/analysis/shmem/allocation.h" -#include "triton/codegen/analysis/shmem/info.h" +#include "triton/codegen/transform/membar.h" +#include "triton/codegen/analysis/memalloc.h" +#include "triton/codegen/analysis/meminfo.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" @@ -16,7 +16,7 @@ namespace triton { namespace codegen{ namespace transform{ -bool shmem_barriers::intersect(const interval_vec_t &X, interval_t x) { +bool membar::intersect(const interval_vec_t &X, interval_t x) { return std::any_of(X.begin(), X.end(), [&](const interval_t &y){ bool left_intersect = y.first <= x.first && x.first < y.second; bool right_intersect = y.first <= x.second && x.second < y.second; @@ -24,13 +24,13 @@ bool shmem_barriers::intersect(const interval_vec_t &X, interval_t x) { }); } -bool shmem_barriers::intersect(const interval_vec_t &X, const interval_vec_t &Y) { +bool membar::intersect(const interval_vec_t &X, const interval_vec_t &Y) { return std::any_of(Y.begin(), Y.end(), [&](const interval_t &y){ return intersect(X, y); }); } -void shmem_barriers::add_reference(ir::value *v, interval_vec_t &res){ +void membar::add_reference(ir::value *v, interval_vec_t &res){ if(buffer_info_->is_shared(v) && !dynamic_cast(v)){ unsigned offset = alloc_->get_offset(v); unsigned num_bytes = alloc_->get_num_bytes(v); @@ -38,17 +38,17 @@ void shmem_barriers::add_reference(ir::value *v, interval_vec_t &res){ } } -void shmem_barriers::get_read_intervals(ir::instruction *i, interval_vec_t &res){ +void membar::get_read_intervals(ir::instruction *i, interval_vec_t &res){ for(ir::value *op: i->ops()) add_reference(op, res); } -void shmem_barriers::get_written_intervals(ir::instruction *i, interval_vec_t &res){ +void membar::get_written_intervals(ir::instruction *i, interval_vec_t &res){ if(!dynamic_cast(i)) add_reference(i, res); } -void shmem_barriers::insert_barrier(ir::instruction *instr, ir::builder &builder) { +void membar::insert_barrier(ir::instruction *instr, ir::builder &builder) { if(auto *phi = dynamic_cast(instr)) { std::set incoming; for(unsigned n = 0; n < phi->get_num_incoming(); n++){ @@ -67,16 +67,16 @@ void shmem_barriers::insert_barrier(ir::instruction *instr, ir::builder &builder } } -shmem_barriers::interval_vec_t shmem_barriers::join(const std::vector& intervals) { - shmem_barriers::interval_vec_t result; +membar::interval_vec_t membar::join(const std::vector& intervals) { + membar::interval_vec_t result; for(auto x: intervals) for(interval_t i: x) result.push_back(i); return result; } -std::pair shmem_barriers::transfer(ir::basic_block *block, +std::pair membar::transfer(ir::basic_block *block, const interval_vec_t &written_to, const interval_vec_t &read_from, std::set& insert_loc) { @@ -104,7 +104,7 @@ std::pair rpo = ir::cfg::reverse_post_order(fn); diff --git a/lib/codegen/transform/reassociate.cc b/lib/codegen/transform/reassociate.cc index 532c8e186..b0f4a2e73 100644 --- a/lib/codegen/transform/reassociate.cc +++ b/lib/codegen/transform/reassociate.cc @@ -1,7 +1,8 @@ #include +#include #include "triton/codegen/transform/reassociate.h" -#include "triton/codegen/analysis/alignment.h" -#include "triton/codegen/analysis/tune.h" +#include "triton/codegen/analysis/align.h" +#include "triton/codegen/analysis/grid.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" @@ -161,7 +162,7 @@ ir::value *reassociate::reassociate_idx(ir::value *old_value, return new_value; } 
-reassociate::reassociate(analysis::alignment_info *align, analysis::grids* params) +reassociate::reassociate(analysis::align *align, analysis::grids* params) : params_(params), align_(align) { } @@ -209,6 +210,29 @@ void reassociate::run(ir::module &mod) { for(ir::basic_block *block: rpo){ // iterate through instruction for(ir::instruction *i: block->get_inst_list()){ + // retiling + if(ir::retile_inst *rt = dynamic_cast(i)) { + ir::value* op = rt->get_operand(0); + if(infos.find(op) != infos.end()){ + builder.set_insert_point(rt); + ir::getelementptr_inst* sta = infos.at(op).sta_ptr; + ir::value* dyn = infos.at(op).dyn_ptr; + ir::value* cst = *sta->idx_begin(); + if(dynamic_cast(rt)) { + auto shapes = rt->get_type()->get_tile_shapes(); + ir::value* ndyn = builder.create_broadcast(dyn, shapes); + ir::value* broadcast = builder.create_broadcast(cst, shapes); + ir::getelementptr_inst* nsta = (ir::getelementptr_inst*)builder.create_gep(ndyn, {broadcast}); + params_->copy(ndyn, rt); + params_->copy(nsta, rt); + params_->copy(broadcast, rt); + align_->copy(ndyn, rt); + align_->copy(nsta, rt); + align_->copy(broadcast, rt); + infos[rt] = cst_info{ndyn, nsta}; + } + } + } // getelementptr instruction if(ir::getelementptr_inst *pz = dynamic_cast(i)){ if(replaced.find(pz) != replaced.end()) diff --git a/lib/codegen/transform/vectorize.cc b/lib/codegen/transform/vectorize.cc index dbf7ee7f1..16309ffc5 100644 --- a/lib/codegen/transform/vectorize.cc +++ b/lib/codegen/transform/vectorize.cc @@ -1,5 +1,5 @@ #include "triton/codegen/transform/vectorize.h" -#include "triton/codegen/analysis/tune.h" +#include "triton/codegen/analysis/grid.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" diff --git a/lib/driver/device.cc b/lib/driver/device.cc index fceb2754e..3f82e2f33 100755 --- a/lib/driver/device.cc +++ b/lib/driver/device.cc @@ -27,7 +27,7 @@ #include #include "triton/driver/device.h" #include "triton/driver/context.h" -#include "triton/codegen/selection/target.h" +#include "triton/codegen/target.h" namespace triton { diff --git a/lib/driver/module.cc b/lib/driver/module.cc index 5a9bfc86f..f41fdc0e5 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -223,12 +223,12 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) { static_cast*>(options["nvptx-short-ptr"])->setValue(true); // create llvm::SmallVector buffer; - module::compile_llvm_module(module, "nvptx64-nvidia-cuda", "sm_70", "", buffer, "", Assembly); + module::compile_llvm_module(module, "nvptx64-nvidia-cuda", "sm_60", "", buffer, "", Assembly); std::string result(buffer.begin(), buffer.end()); size_t start_replace = result.find(".version"); size_t end_replace = result.find('\n', start_replace); assert(start_replace != std::string::npos); - result.replace(start_replace, end_replace - start_replace, ".version 6.4"); + result.replace(start_replace, end_replace - start_replace, ".version 6.0"); return result; } @@ -245,10 +245,10 @@ cu_module::cu_module(driver::context * context, std::string const & source) : mo try{ dispatch::cuModuleLoadDataEx(&*cu_, source_.data(), 2, opt, optval); }catch(exception::cuda::base const &){ -#ifdef TRITON_LOG_PTX_ERROR +//#ifdef TRITON_LOG_PTX_ERROR std::cerr << "Compilation Failed! 
Log: " << std::endl; std::cerr << errbuf << std::endl; -#endif +//#endif throw; } } diff --git a/lib/ir/module.cc b/lib/ir/module.cc index 3d995558e..98f171252 100644 --- a/lib/ir/module.cc +++ b/lib/ir/module.cc @@ -29,6 +29,7 @@ void module::set_value(const std::string& name, ir::basic_block *block, ir::valu if(it != metadatas_.end()){ x->set_metadata(it->second.first, it->second.second); } + value->set_name(name); } void module::set_value(const std::string& name, ir::value *value){ diff --git a/lib/ir/print.cc b/lib/ir/print.cc index 31cc15d9a..124091262 100644 --- a/lib/ir/print.cc +++ b/lib/ir/print.cc @@ -22,6 +22,18 @@ std::string get_name(ir::value *v, unsigned i) { void print(module &mod, std::ostream& os) { unsigned cnt = 0; for(ir::function *fn: mod.get_function_list()){ + os << "def " << fn->get_fn_type()->get_return_ty()->repr() << " " << fn->get_name() << "(" ; + for(ir::argument* arg: fn->args()) { + if(arg->get_arg_no() > 0) + os << ", "; + os << arg->get_type()->repr() << " " << arg->get_name(); + auto attrs = fn->get_attributes(arg); + if(attrs.size() > 0) + os << " "; + for(ir::attribute attr: attrs) + os << attr.repr() << " "; + } + os << ")" << std::endl; os << "{" << std::endl; for(ir::basic_block *block: fn->blocks()){ auto const &predecessors = block->get_predecessors(); diff --git a/lib/lang/code_gen.cc b/lib/lang/code_gen.cc index c2c691cb5..228bd69dd 100644 --- a/lib/lang/code_gen.cc +++ b/lib/lang/code_gen.cc @@ -373,8 +373,11 @@ void Generator::VisitFuncDef(FuncDef* funcDef) { for(Object* obj: type->Params()){ std::string name = obj->Name(); args[i]->set_name(name); - for(ASTNode::Attr attr: obj->GetAttrList()) - fn->add_attr(i, GenIRAttr(attr)); + if(obj->Type()->ToPointer()) + fn->add_attr(i + 1, ir::attribute(ir::aligned, 16)); + for(ASTNode::Attr attr: obj->GetAttrList()){ + fn->add_attr(i + 1, GenIRAttr(attr)); + } if(obj->IsRestrictQualified()) fn->add_attr(i, ir::attribute(ir::noalias)); mod_->set_value(name, nullptr, args[i]); diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 9b2072974..703918ba5 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -3,7 +3,7 @@ #include #include #include -#include "triton/codegen/selection/selection.h" +#include "triton/codegen/selection.h" #include "triton/runtime/function.h" #include "triton/lang/cpp.h" #include "triton/lang/parser.h" @@ -167,8 +167,6 @@ function::caller function::autotune(driver::stream* stream, const grid_fn_ty& gr bin = make_bin(*ir, stream->context(), opt); }catch(const std::runtime_error& e) { return; - }catch(const driver::exception::cuda::invalid_ptx& e) { - return; } // benchmark ir::function *tmp = ir->get_function_list()[0]; @@ -191,23 +189,31 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c std::unique_ptr target = context->device()->make_target(); // create passes codegen::analysis::grids grids(opt.num_warps); - codegen::analysis::shmem::info shmem_info; - codegen::analysis::shmem::liveness shmem_liveness(&shmem_info); - codegen::analysis::shmem::allocation shmem_allocation(&shmem_liveness, &shmem_info, &grids); - codegen::analysis::alignment_info alignment_info; - codegen::transform::shmem_barriers shmem_barriers(&shmem_allocation, &shmem_info); + codegen::analysis::meminfo shmem_info; + codegen::analysis::liveness shmem_liveness(&shmem_info); + codegen::analysis::memalloc shmem_allocation(&shmem_liveness, &shmem_info, &grids); + codegen::analysis::align alignment_info; + codegen::transform::membar shmem_barriers(&shmem_allocation, 
&shmem_info); codegen::transform::vectorize vectorize(&grids); codegen::transform::dce dce; codegen::transform::peephole peephole; codegen::transform::reassociate reassociate(&alignment_info, &grids); codegen::selection selection(&shmem_allocation, &grids, &shmem_info, &alignment_info, target.get()); + + // run passes peephole.run(module); dce.run(module); - grids.run(module); alignment_info.run(module); + grids.run(module); +// ir::print(module, std::cout); + reassociate.run(module); + dce.run(module); +// ir::print(module, std::cout); + peephole.run(module); + if(target->is_gpu()){ shmem_info.run(module); shmem_liveness.run(module); @@ -217,7 +223,8 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c dce.run(module); vectorize.run(module); dce.run(module); -// ir::print(module, std::cout); + + // generate llvm code llvm::LLVMContext ctx; std::unique_ptr llvm(new llvm::Module(module.get_name(), ctx)); diff --git a/tests/bench/CMakeLists.txt b/tests/bench/CMakeLists.txt index 1f3cc3341..598dadeea 100644 --- a/tests/bench/CMakeLists.txt +++ b/tests/bench/CMakeLists.txt @@ -1,4 +1,4 @@ -foreach(PROG dot) +foreach(PROG dot copy1d copy2d) set(TARGET bench_${PROG}) add_executable(${TARGET} ${PROG}.cc) set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME ${TARGET}) diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index cb678ff99..3fecb8e58 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -17,7 +17,7 @@ inline size_t ceil(size_t x, size_t y) { return (x + y - 1) / y; }; -inline rt::function::grid_fn_ty grid(size_t M, size_t N) { +inline rt::function::grid_fn_ty grid2d(size_t M, size_t N) { return [M, N](const rt::function::options_t& x) { return rt::grid_t{ceil(M, x.D("TM")), ceil(N, x.D("TN"))}; @@ -42,11 +42,9 @@ std::vector do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, i // create options rt::function::options_space_t opt; opt.defines.push_back({"TYPE", {ty}}); - if(AT) - opt.defines.push_back({"AT", {""}}); - if(BT) - opt.defines.push_back({"BT", {""}}); - opt.defines.push_back({"TM", {"64"}}); + opt.defines.push_back({"AT", {AT?"1":"0"}}); + opt.defines.push_back({"BT", {BT?"1":"0"}}); + opt.defines.push_back({"TM", {"128"}}); opt.defines.push_back({"TN", {"64"}}); opt.defines.push_back({"TK", {"8"}}); opt.num_warps = {4}; @@ -55,18 +53,18 @@ std::vector do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, i // benchmark available libraries std::vector result; auto tflops = [&](double nanosec) { return 2.*M*N*K / nanosec * 1e-3; }; -// // cublas -// if(cublas::cublasinit()){ -// NumericT alpha(static_cast(1)); -// NumericT beta(static_cast(0)); -// cublasGemmAlgo_t fastest; -// cublasGemm(CUDA_R_16F, stream, AT, BT, M, N, K, &alpha, &*da, lda, &*db, ldb, &beta, &*dc, ldc, &fastest); -// double cublas_ms = triton::tools::bench([&]() { cublasGemm(CUDA_R_16F, stream, AT, BT, M, N, K, -// &alpha, &*da, lda, &*db, ldb, &beta, &*dc, ldc, nullptr, fastest); }, stream); -// result.push_back(tflops(cublas_ms)); -// } + // cublas + if(cublas::cublasinit()){ + NumericT alpha(static_cast(1)); + NumericT beta(static_cast(0)); + cublasGemmAlgo_t fastest = CUBLAS_GEMM_ALGO5; +// cublasGemm(CUDA_R_32F, stream, AT, BT, M, N, K, &alpha, &*da, lda, &*db, ldb, &beta, &*dc, ldc, &fastest); + double cublas_ms = triton::tools::bench([&]() { cublasGemm(CUDA_R_32F, stream, AT, BT, M, N, K, + &alpha, &*da, lda, &*db, ldb, &beta, &*dc, ldc, nullptr, fastest); }, stream); + result.push_back(tflops(cublas_ms)); + } // triton - double triton_ms = 
triton::tools::bench([&]() { function({&*da, &*db, &*dc, M, N, K, lda, ldb, ldc}, grid(M, N), stream);}, stream); + double triton_ms = triton::tools::bench([&]() { function({&*da, &*db, &*dc, M, N, K, lda, ldb, ldc}, grid2d(M, N), stream);}, stream); result.push_back(tflops(triton_ms)); // done return result; @@ -79,11 +77,9 @@ int main() { // shapes to benchmark typedef std::tuple config_t; std::vector configs; - for(auto x: std::vector>{{false, false}, - {false, true}, - {true, false}}){ + for(auto x: std::vector>{{false, true}}){ std::vector tmp = { - config_t{x[0], x[1], 8192, 8192, 8192} + config_t{x[0], x[1], 2048, 2048, 2048} // config_t{x[0], x[1], 16, 2048, 2048}, // config_t{x[0], x[1], 32, 2048, 2048}, // config_t{x[0], x[1], 64, 2048, 2048}, diff --git a/tests/common/cuda/forward.h b/tests/common/cuda/forward.h index 1c12c4247..bd32adec6 100644 --- a/tests/common/cuda/forward.h +++ b/tests/common/cuda/forward.h @@ -24,11 +24,11 @@ typedef enum{ typedef enum { CUBLAS_GEMM_DFALT = -1, CUBLAS_GEMM_DEFAULT = -1, - CUBLAS_GEMM_ALGO0 = 0, - CUBLAS_GEMM_ALGO1 = 1, - CUBLAS_GEMM_ALGO2 = 2, - CUBLAS_GEMM_ALGO3 = 3, - CUBLAS_GEMM_ALGO4 = 4, + CUBLAS_GEMM_ALGO0 = 0, // maxwell_sgemm_32x128_nt + CUBLAS_GEMM_ALGO1 = 1, // maxwell_sgemm_64x64_nt + CUBLAS_GEMM_ALGO2 = 2, // maxwell_sgemm_128x32_nt + CUBLAS_GEMM_ALGO3 = 3, // maxwell_sgemm_128x64_nt + CUBLAS_GEMM_ALGO4 = 4, // maxwell_sgemm_128x128_nt CUBLAS_GEMM_ALGO5 = 5, CUBLAS_GEMM_ALGO6 = 6, CUBLAS_GEMM_ALGO7 = 7, @@ -102,4 +102,4 @@ typedef enum { CUBLAS_TENSOR_OP_MATH = 1 } cublasMath_t; -#endif \ No newline at end of file +#endif diff --git a/tests/common/src/dot.h b/tests/common/src/dot.h index 3e636e18a..c9b3454d7 100644 --- a/tests/common/src/dot.h +++ b/tests/common/src/dot.h @@ -2,33 +2,33 @@ namespace src { const char *dot = R"( -#ifdef AT +#if AT == 1 #define USEA ^a -#define STRIDE_AK lda -#define STRIDE_AM 1 +#define STRIDE_AK 1 +#define STRIDE_AM lda #define BROADCAST_AK :, newaxis #define BROADCAST_AM newaxis, : #define SHAPE_A TK, TM #else #define USEA a -#define STRIDE_AK 1 -#define STRIDE_AM lda +#define STRIDE_AK lda +#define STRIDE_AM 1 #define BROADCAST_AK newaxis, : #define BROADCAST_AM :, newaxis #define SHAPE_A TM, TK #endif -#ifdef BT +#if BT == 1 #define USEB ^b -#define STRIDE_BK 1 -#define STRIDE_BN ldb +#define STRIDE_BK ldb +#define STRIDE_BN 1 #define BROADCAST_BK newaxis, : #define BROADCAST_BN :, newaxis #define SHAPE_B TN, TK #else #define USEB b -#define STRIDE_BK ldb -#define STRIDE_BN 1 +#define STRIDE_BK 1 +#define STRIDE_BN ldb #define BROADCAST_BK :, newaxis #define BROADCAST_BN newaxis, : #define SHAPE_B TK, TN @@ -58,17 +58,15 @@ void dot(TYPE * A, TYPE * B, TYPE * C, c += USEA @ USEB; pa = pa + TK * STRIDE_AK; pb = pb + TK * STRIDE_BK; - a = *pa; - b = *pb; + a = ((bool[SHAPE_A])(k > TK)) ? *pa : 0; + b = ((bool[SHAPE_B])(k > TK)) ? *pb : 0; } // epilogue int rxc[TM] = ridx * TM + 0 ... TM; int ryc[TN] = ridy * TN + 0 ... 
TN; - TYPE* pc[TM, TN] = C + ryc[newaxis, :] + rxc[:, newaxis] * ldc; - bool checkc[TM, TN] = (rxc < M)[:, newaxis] && (ryc < N)[newaxis, :]; - *?(checkc) pc = c; + TYPE* pc[TM, TN] = C + ryc[newaxis, :] * ldc + rxc[:, newaxis]; + *pc = c; } - )"; } diff --git a/tests/common/util.h b/tests/common/util.h index a60050af7..d8ffef090 100644 --- a/tests/common/util.h +++ b/tests/common/util.h @@ -3,21 +3,35 @@ #ifndef _TRITON_TESTS_UTIL_H #define _TRITON_TESTS_UTIL_H +#include #include "triton/runtime/function.h" +namespace drv = triton::driver; namespace rt = triton::runtime; inline size_t ceil(size_t x, size_t y) { return (x + y - 1) / y; }; -inline rt::function::grid_fn_ty grid(size_t M, size_t N) { +inline rt::function::grid_fn_ty grid1d(size_t N) { + return [N](const rt::function::options_t& x) { + return rt::grid_t{ceil(N, x.D("TN"))}; + }; +} + +inline rt::function::grid_fn_ty grid2d(size_t M, size_t N) { return [M, N](const rt::function::options_t& x) { return rt::grid_t{ceil(M, x.D("TM")), ceil(N, x.D("TN"))}; }; } +enum order_t { + ROWMAJOR, + COLMAJOR +}; + + namespace aux{ template struct seq{}; @@ -51,11 +65,14 @@ namespace testing { if(hc.size() != rc.size()) return false; for(size_t i = 0; i < hc.size(); i++) - if(std::isinf(hc[i]) || std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-2) + if(std::isinf(hc[i]) || std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-2){ + std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; + return false; + } return true; } } -#endif \ No newline at end of file +#endif diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index f3cdae9a1..78fbc79d1 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -1,5 +1,5 @@ foreach(PROG dot) - set(TARGET test_${PROG}) + set(TARGET unit_${PROG}) add_executable(${TARGET} ${PROG}.cc) set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME ${TARGET}) target_link_libraries(${TARGET} triton dl) diff --git a/tests/unit/dot.cc b/tests/unit/dot.cc index 3c9ec96d8..69b8cf2d7 100644 --- a/tests/unit/dot.cc +++ b/tests/unit/dot.cc @@ -51,8 +51,8 @@ void cpu_ref(bool AT_, bool BT_, size_t M, size_t N, size_t K, bool do_test(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K, int32_t TM, int32_t TN, int32_t TK, size_t nwarp){ - typedef half_float::half NumericT; - std::string ty = "half"; + typedef float NumericT; + std::string ty = "float"; size_t dt_nbytes = sizeof(NumericT); drv::context* context = stream->context(); std::vector hc(M*N); @@ -78,17 +78,15 @@ bool do_test(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_ // run rt::function::options_space_t opt; opt.defines.push_back({"TYPE", {ty}}); - if(AT) - opt.defines.push_back({"AT", {""}}); - if(BT) - opt.defines.push_back({"BT", {""}}); + opt.defines.push_back({"AT", {AT?"1":"0"}}); + opt.defines.push_back({"BT", {BT?"1":"0"}}); opt.defines.push_back({"TM", {std::to_string(TM)}}); opt.defines.push_back({"TN", {std::to_string(TN)}}); opt.defines.push_back({"TK", {std::to_string(TK)}}); opt.num_warps = {nwarp}; rt::function function(src::dot, opt); try { - function({&*da, &*db, &*dc, M, N, K, lda, ldb, ldc}, grid(M, N), stream); + function({&*da, &*db, &*dc, M, N, K, lda, ldb, ldc}, grid2d(M, N), stream); } catch (const std::runtime_error& e) { return true; } From 97fdb5b6be730794b363866813a2803d89f9e6c4 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 3 Sep 2019 12:44:35 -0400 Subject: [PATCH 343/494] [tests] added missing files --- 
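[reviewer note, below the fold] The src/dot.h hunk in the patch above replaces the unconditional prefetch "a = *pa; b = *pb;" with predicated loads. Each trip consumes the tiles fetched on the previous one, so the loads issued once k <= TK would read past the end of A and B; the (k > TK) guard zero-fills that final prefetch instead. A scalar sketch of the same software-pipelining pattern (pa, pb, K, TK and the STRIDE macros are taken from the kernel; tile semantics elided):

    float c = 0, a = *pa, b = *pb;   /* prologue: fetch the first tiles   */
    for (int k = K; k > 0; k -= TK) {
      c += a * b;                    /* consume tiles from the last trip  */
      pa += TK * STRIDE_AK;          /* advance the operand pointers      */
      pb += TK * STRIDE_BK;
      a = (k > TK) ? *pa : 0;        /* prefetch; zero past the last tile */
      b = (k > TK) ? *pb : 0;
    }

Note that the same hunk also drops the checkc predicate on the epilogue store, so the kernel as committed assumes M and N are multiples of the tile shape.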
include/triton/codegen/analysis/align.h | 9 ++-- include/triton/driver/module.h | 2 +- lib/codegen/analysis/align.cc | 4 +- lib/driver/module.cc | 15 ++++-- tests/bench/copy1d.cc | 54 ++++++++++++++++++++ tests/bench/copy2d.cc | 65 +++++++++++++++++++++++++ tests/common/src/copy.h | 46 +++++++++++++++++ 7 files changed, 183 insertions(+), 12 deletions(-) create mode 100644 tests/bench/copy1d.cc create mode 100644 tests/bench/copy2d.cc create mode 100644 tests/common/src/copy.h diff --git a/include/triton/codegen/analysis/align.h b/include/triton/codegen/analysis/align.h index 6812314b7..610580cdb 100644 --- a/include/triton/codegen/analysis/align.h +++ b/include/triton/codegen/analysis/align.h @@ -2,6 +2,7 @@ #define TDL_INCLUDE_CODEGEN_ALIGNMENT_INFO_PASS_H #include +#include namespace triton { @@ -24,7 +25,7 @@ private: bool is_first_axis_unit(ir::value *v); // populate maps - cst_info populate_is_constant(ir::value *v); + std::vector populate_is_constant(ir::value *v); unsigned populate_max_contiguous(ir::value *v); unsigned populate_starting_multiple(ir::value *v); @@ -35,9 +36,9 @@ public: void copy(ir::value *dst, ir::value *src); private: - std::map is_constant_; - std::map max_contiguous_; - std::map starting_multiple_; + std::map> is_constant_; + std::map> max_contiguous_; + std::map> starting_multiple_; }; diff --git a/include/triton/driver/module.h b/include/triton/driver/module.h index 59fe6dcc0..18dea2453 100755 --- a/include/triton/driver/module.h +++ b/include/triton/driver/module.h @@ -65,7 +65,7 @@ public: // CUDA class cu_module: public module { - std::string compile_llvm_module(llvm::Module* module); + std::string compile_llvm_module(llvm::Module* module, driver::device* device); public: cu_module(driver::context* context, llvm::Module *module); diff --git a/lib/codegen/analysis/align.cc b/lib/codegen/analysis/align.cc index 85500aefb..864fb8ba5 100644 --- a/lib/codegen/analysis/align.cc +++ b/lib/codegen/analysis/align.cc @@ -36,11 +36,11 @@ bool align::is_first_axis_unit(ir::value *x){ return true; } -align::cst_info align::populate_is_constant(ir::value *v) { +std::vector align::populate_is_constant(ir::value *v) { if(is_constant_.find(v) != is_constant_.end()) return is_constant_.at(v); // helper for the cache - auto cache = [this,v](cst_info value){ + auto cache = [this,v](const std::vector& value){ return add_to_cache(v, value, is_constant_); } ; // populate diff --git a/lib/driver/module.cc b/lib/driver/module.cc index f41fdc0e5..34462e8ab 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -105,7 +105,7 @@ void module::compile_llvm_module(llvm::Module* module, const std::string& triple opt.UnsafeFPMath = false; opt.NoInfsFPMath = false; opt.NoNaNsFPMath = true; - llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, "-ptx60", opt, + llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, features, opt, llvm::Reloc::PIC_, llvm::None, llvm::CodeGenOpt::Aggressive); // set data layout if(layout.empty()) @@ -217,22 +217,27 @@ ocl_module::ocl_module(driver::context * context, llvm::Module* src): module(con // CUDA // /* ------------------------ */ -std::string cu_module::compile_llvm_module(llvm::Module* module) { +std::string cu_module::compile_llvm_module(llvm::Module* module, driver::device* device) { // options auto options = llvm::cl::getRegisteredOptions(); +// for(auto& opt: options) +// std::cout << opt.getKey().str() << std::endl; 
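// ---- reviewer note (annotation, not part of the patch) ---------------------
// The statement below flips one of LLVM's globally registered cl::opt flags
// in-process rather than via a command line. Angle-bracket template arguments
// were stripped from this dump; the full expression is presumably:
//
//   static_cast<llvm::cl::opt<bool>*>(options["nvptx-short-ptr"])->setValue(true);
//
// where options comes from llvm::cl::getRegisteredOptions() as above, and
// "nvptx-short-ptr" must name an option registered by the linked-in NVPTX
// backend.
// ----------------------------------------------------------------------------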
static_cast*>(options["nvptx-short-ptr"])->setValue(true); + // compute capability + auto cc = ((driver::cu_device*)device)->compute_capability(); + std::string sm = "sm_" + std::to_string(cc.first) + std::to_string(cc.second); // create llvm::SmallVector buffer; - module::compile_llvm_module(module, "nvptx64-nvidia-cuda", "sm_60", "", buffer, "", Assembly); + module::compile_llvm_module(module, "nvptx64-nvidia-cuda", sm, "", buffer, "ptx63", Assembly); std::string result(buffer.begin(), buffer.end()); size_t start_replace = result.find(".version"); size_t end_replace = result.find('\n', start_replace); assert(start_replace != std::string::npos); - result.replace(start_replace, end_replace - start_replace, ".version 6.0"); + result.replace(start_replace, end_replace - start_replace, ".version 6.4"); return result; } -cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { } +cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module, context->device())) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ // std::cout << source << std::endl; diff --git a/tests/bench/copy1d.cc b/tests/bench/copy1d.cc new file mode 100644 index 000000000..2e2fe20d2 --- /dev/null +++ b/tests/bench/copy1d.cc @@ -0,0 +1,54 @@ +#include +#include +#include +#include "triton/driver/backend.h" +#include "triton/driver/stream.h" +#include "triton/tools/bench.hpp" +#include "triton/external/half.hpp" +#include "triton/runtime/function.h" +#include "src/copy.h" +#include "util.h" +#include "cuda/cublas.h" + + +std::vector do_bench(drv::stream* stream, int32_t N){ + typedef float NumericT; + std::string ty = "float"; + size_t dt_nbytes = sizeof(NumericT); + drv::context* context = stream->context(); + // create inputs + auto dx = std::unique_ptr(drv::buffer::create(context, N*dt_nbytes)); + auto dy = std::unique_ptr(drv::buffer::create(context, N*dt_nbytes)); + // create options + rt::function::options_space_t opt; + opt.defines.push_back({"TYPE", {ty}}); + opt.defines.push_back({"TN", {"512"}}); + opt.num_warps = {4}; + // create function + rt::function function(src::copy1d, opt); + // benchmark available libraries + std::vector result; + auto gbps = [&](double ns) { return 2*N*dt_nbytes / (ns * 1e-9) * 1e-9; }; + // triton + double triton_ns = triton::tools::bench([&]() { function({&*dx, &*dy, N}, grid1d(N), stream);}, stream); + result.push_back(gbps(triton_ns)); + // done + return result; +} + +int main() { + // initialize default compute device + auto context = triton::driver::backend::contexts::get_default(); + triton::driver::stream* stream = triton::driver::stream::create(context); + // shapes to benchmark + typedef std::tuple config_t; + std::vector configs = { 1024*1024*16 }; + int N; + for(const auto& c: configs){ + std::tie(N) = c; + std::cout << "// " << c << std::flush; + for(auto perf: do_bench(stream, N)) + std::cout << ", " << perf << std::flush; + std::cout << std::endl; + } +} diff --git a/tests/bench/copy2d.cc b/tests/bench/copy2d.cc new file mode 100644 index 000000000..c512cf58d --- /dev/null +++ b/tests/bench/copy2d.cc @@ -0,0 +1,65 @@ +#include +#include +#include +#include "triton/driver/backend.h" +#include "triton/driver/stream.h" +#include "triton/tools/bench.hpp" +#include "triton/external/half.hpp" +#include "triton/runtime/function.h" +#include "src/copy.h" +#include "util.h" 
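// ---- reviewer sketch (annotation, not part of the patch) -------------------
// The module.cc hunk above post-processes LLVM's PTX output: it locates the
// ".version" directive and rewrites it so the stamped PTX ISA version matches
// what the target driver accepts (the diff selects the ptx63 feature and then
// stamps ".version 6.4"). Standalone illustration of the string surgery:
#include <cassert>
#include <string>
static void force_ptx_version(std::string &ptx, const std::string &ver) {
  size_t b = ptx.find(".version");
  assert(b != std::string::npos);
  size_t e = ptx.find('\n', b);
  ptx.replace(b, e - b, ".version " + ver); // e.g. ver = "6.4"
}
// ----------------------------------------------------------------------------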
+#include "cuda/cublas.h" + + +std::vector do_bench(drv::stream* stream, int32_t M, int32_t N, order_t order){ + typedef float NumericT; + std::string ty = "float"; + size_t dt_nbytes = sizeof(NumericT); + drv::context* context = stream->context(); + int32_t ld = order == ROWMAJOR ? N : M; + // create inputs + auto dx = std::unique_ptr(drv::buffer::create(context, M*N*dt_nbytes)); + auto dy = std::unique_ptr(drv::buffer::create(context, M*N*dt_nbytes)); + // create options + rt::function::options_space_t opt; + opt.defines.push_back({"TYPE", {ty}}); + opt.defines.push_back({"ORDER", {order==ROWMAJOR?"ROWMAJOR":"COLMAJOR"}}); + opt.defines.push_back({"TM", {"32"}}); + opt.defines.push_back({"TN", {"32"}}); + opt.num_warps = {4}; + // create function + rt::function function(src::copy2d, opt); + // benchmark available libraries + std::vector result; + auto gbps = [&](double ns) { return 2*M*N*dt_nbytes / (ns * 1e-9) * 1e-9; }; + // triton + double triton_ns = triton::tools::bench([&]() { function({&*dx, &*dy, M, N, ld, ld}, grid2d(M, N), stream);}, stream); + result.push_back(gbps(triton_ns)); + // done + return result; +} + +int main() { + // initialize default compute device + auto context = triton::driver::backend::contexts::get_default(); + triton::driver::stream* stream = triton::driver::stream::create(context); + // shapes to benchmark + typedef std::tuple config_t; + std::vector configs; + for(auto x: std::vector{COLMAJOR}){ + std::vector tmp = { + config_t{1024, 1024, x} + }; + configs.insert(configs.end(), tmp.begin(), tmp.end()); + } + // does the work + int32_t M, N; + order_t ord; + for(const auto& c: configs){ + std::tie(M, N, ord) = c; + std::cout << "// " << M << ", " << N << ", " << ord << std::flush; + for(auto perf: do_bench(stream, M, N, ord)) + std::cout << ", " << perf << std::flush; + std::cout << std::endl; + } +} diff --git a/tests/common/src/copy.h b/tests/common/src/copy.h new file mode 100644 index 000000000..58651a84f --- /dev/null +++ b/tests/common/src/copy.h @@ -0,0 +1,46 @@ +namespace src { + + const char *copy1d = +R"( +void copy1d(TYPE * X __noalias __readonly __aligned(16), + TYPE * Y __noalias __readonly __aligned(16), + int N) { + int ridm = get_program_id(0); + int rm[TN] = ridm * TN + 0 ... TN; + TYPE* px[TN] = X + rm; + TYPE* py[TN] = Y + rm; + *py = *px; +} +)"; + + + const char *copy2d = +R"( +#if ORDER == ROWMAJOR +#define STRIDE_XM ldx +#define STRIDE_XN 1 +#define STRIDE_YM ldy +#define STRIDE_YN 1 +#else +#define STRIDE_XM 1 +#define STRIDE_XN ldx +#define STRIDE_YM 1 +#define STRIDE_YN ldy +#endif + +void copy2d(TYPE * X __noalias __readonly __aligned(16), + TYPE * Y __noalias __writeonly __aligned(16), + int M, int N, + int ldx __multipleof(8), + int ldy __multipleof(8)) { + int ridm = get_program_id(0); + int ridn = get_program_id(1); + int rm[TM] = ridm * TM + 0 ... TM; + int rn[TN] = ridn * TN + 0 ... 
TN; + TYPE* px[TM, TN] = X + rm[:, newaxis] + rn[newaxis, :] * ldx; + TYPE* py[TM, TN] = Y + rm[:, newaxis] + rn[newaxis, :] * ldy; + *py = *px; +} +)"; + +} From 5e03f0a065fee609c55c341040abce290299ef10 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 3 Sep 2019 15:28:07 -0400 Subject: [PATCH 344/494] [codegen][align] reverted some changes --- include/triton/codegen/analysis/align.h | 9 ++++----- lib/codegen/analysis/align.cc | 4 ++-- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/include/triton/codegen/analysis/align.h b/include/triton/codegen/analysis/align.h index 610580cdb..6812314b7 100644 --- a/include/triton/codegen/analysis/align.h +++ b/include/triton/codegen/analysis/align.h @@ -2,7 +2,6 @@ #define TDL_INCLUDE_CODEGEN_ALIGNMENT_INFO_PASS_H #include -#include namespace triton { @@ -25,7 +24,7 @@ private: bool is_first_axis_unit(ir::value *v); // populate maps - std::vector populate_is_constant(ir::value *v); + cst_info populate_is_constant(ir::value *v); unsigned populate_max_contiguous(ir::value *v); unsigned populate_starting_multiple(ir::value *v); @@ -36,9 +35,9 @@ public: void copy(ir::value *dst, ir::value *src); private: - std::map> is_constant_; - std::map> max_contiguous_; - std::map> starting_multiple_; + std::map is_constant_; + std::map max_contiguous_; + std::map starting_multiple_; }; diff --git a/lib/codegen/analysis/align.cc b/lib/codegen/analysis/align.cc index 864fb8ba5..85500aefb 100644 --- a/lib/codegen/analysis/align.cc +++ b/lib/codegen/analysis/align.cc @@ -36,11 +36,11 @@ bool align::is_first_axis_unit(ir::value *x){ return true; } -std::vector align::populate_is_constant(ir::value *v) { +align::cst_info align::populate_is_constant(ir::value *v) { if(is_constant_.find(v) != is_constant_.end()) return is_constant_.at(v); // helper for the cache - auto cache = [this,v](const std::vector& value){ + auto cache = [this,v](cst_info value){ return add_to_cache(v, value, is_constant_); } ; // populate From 2ccc9150118679c1fc165850a43076d0ca78b17f Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 3 Sep 2019 20:44:27 -0400 Subject: [PATCH 345/494] [python][examples] added template for blocksparse --- CMakeLists.txt | 2 + lib/driver/module.cc | 4 +- python/examples/blocksparse.py | 158 +++++++++++++++++++++++++++++++++ python/examples/dot.py | 70 +++++++-------- python/setup.py | 4 +- python/src/tensorflow.cc | 2 +- python/triton/ops.py | 19 ++-- tests/bench/copy1d.cc | 6 +- tests/bench/dot.cc | 16 ++-- 9 files changed, 225 insertions(+), 56 deletions(-) create mode 100644 python/examples/blocksparse.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 9e05aca5d..d857a96ea 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,6 +42,8 @@ if(BUILD_PYTHON_MODULE) file(GLOB_RECURSE EXTRA_TF_OPS_SRC python/src/tensorflow/*.cc) add_library(extra_tf_ops SHARED ${EXTRA_TF_OPS_SRC}) target_link_libraries(extra_tf_ops triton ${TF_LIBS}) + target_compile_definitions(extra_tf_ops PRIVATE "-D_GLIBCXX_USE_CXX11_ABI=${TF_ABI}") + endif() diff --git a/lib/driver/module.cc b/lib/driver/module.cc index 34462e8ab..497fc332c 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -250,10 +250,10 @@ cu_module::cu_module(driver::context * context, std::string const & source) : mo try{ dispatch::cuModuleLoadDataEx(&*cu_, source_.data(), 2, opt, optval); }catch(exception::cuda::base const &){ -//#ifdef TRITON_LOG_PTX_ERROR +#ifdef TRITON_LOG_PTX_ERROR std::cerr << "Compilation Failed! 
Log: " << std::endl; std::cerr << errbuf << std::endl; -//#endif +#endif throw; } } diff --git a/python/examples/blocksparse.py b/python/examples/blocksparse.py new file mode 100644 index 000000000..27b7d1e9b --- /dev/null +++ b/python/examples/blocksparse.py @@ -0,0 +1,158 @@ +import tensorflow as tf +import triton +import numpy as np + +src = ''' + #if AT == 1 + #define USE_A ^a + #define STRIDE_AK lda + #define STRIDE_AM 1 + #define BROADCAST_AK :, newaxis + #define BROADCAST_AM newaxis, : + #define SHAPE_A TK, TM + #else + #define USE_A a + #define STRIDE_AK 1 + #define STRIDE_AM lda + #define BROADCAST_AK newaxis, : + #define BROADCAST_AM :, newaxis + #define SHAPE_A TM, TK + #endif + + #if BT == 1 + #define USE_B ^b + #define STRIDE_BK 1 + #define STRIDE_BM ldb + #define BROADCAST_BN newaxis, : + #define BROADCAST_BK :, newaxis + #define SHAPE_B TN, TK + #else + #define USE_B b + #define STRIDE_BK ldb + #define STRIDE_BM 1 + #define BROADCAST_BN :, newaxis + #define BROADCAST_BK newaxis, : + #define SHAPE_B TK, TN + #endif + + void dot (TYPE* A __readonly __noalias __align(16), + TYPE* B __readonly __noalias __align(16), + TYPE* C __writeonly __noalias __align(16), + int lda, int ldb, int ldc, + int N, int* lut, + int* locks, int nlocks) { + int ridx = get_program_id(0); + float c[TM, TN] = 0; + int rka[TK] = 0 ... TK; + int rkb[TK] = 0 ... TK; + // load LUT header + int *header = lut + get_program_id(1) * 4; + int offset = *(header + 0); + int K = *(header + 1); + int column = *(header + 2); + int lockid = *(header + 3); + int *plut = lut + offset * 2; + int offx = ridx; + int offy = 0; + // compute x, y offsets + int rxa[TM] = offx * TM + (0 ... TM); + int ryb[TN] = offy * TN + (0 ... TN); + // bounds checking + bool checka[SHAPE_A] = (rxa < N)[:, newaxis]; + bool checkb[SHAPE_B] = 1; + // base offset + int offa[SHAPE_A] = rxa[BROADCAST_AM] * STRIDE_AM + rka[BROADCAST_AK] * STRIDE_AK; + int offb[SHAPE_B] = ryb[BROADCAST_BN] * STRIDE_BN + rkb[BROADCAST_BK] * STRIDE_BK; + for(int k = K; k > 0; k -= 1) { + // fetch block indices + int ak = *(plut + 0); + int bk = *(plut + 1); + lut += 2; + // compute pointers to blocks + TYPE* pa[SHAPE_A] = A + offa + ak * TK * lda; + TYPE* pb[SHAPE_B] = B + offb + bk * TK * TN; + // load blocks + TYPE a[SHAPE_A] = checka ? *pa : 0; + TYPE b[SHAPE_B] = *pb; + // multiply blocks + c += USE_A @ USE_B; + } + int rxc[TM] = ridx * TM + (0 ... TM); + int ryc[TN] = column * TN + (0 ... TN); + TYPE* pc[TM, TN] = C + rxc[:, newaxis] + ryc[newaxis, :]*ldc; + bool checkc[TM, TN] = (rxc < N)[:, newaxis]; + if(lockid == 0) { + *?(checkc) pc = c; + } + else { + int *plock = locks + ridx*nlocks + lockid - 1; + int *pcount = plock + get_num_program(0)*nlocks; + while(__atomic_cas(plock, 0, 1)); + int count = *pcount; + if(count == 0) + *?(checkc) pc = c; + else + *?(checkc) pc = c + *pc; + __atomic_exch(pcount, 1); + __atomic_exch(plock, 0); + } + } +''' + + +# std::string dot::triton_c_src_dw() const { +# bool AT = (op_ == WGRAD); +# bool BT = (op_ == FPROP); +# std::string usea = AT ? "trans(a)" : "a"; +# std::string useb = BT ? "trans(b)" : "b"; +# std::string sizea = AT ? "TK, TM" : "TM, TK"; +# std::string sizeb = BT ? "TN, TK" : "TK, TN"; +# std::string bca0 = AT ? "newaxis, :" : ":, newaxis"; +# std::string bca1 = AT ? ":, newaxis" : "newaxis, :"; +# std::string bcb0 = BT ? ":, newaxis" : "newaxis, :"; +# std::string bcb1 = BT ? "newaxis, :" : ":, newaxis"; +# std::string lda0 = AT ? "*lda" : ""; +# std::string lda1 = AT ? 
"" : "*lda"; +# std::string ldb0 = BT ? "" : "*ldb"; +# std::string ldb1 = BT ? "*ldb" : "" ; +# std::string result = +# R"( +# const tunable int TM = {)" + std::to_string(BS_) + R"(}; +# const tunable int TN = {)" + std::to_string(BS_) + R"(}; +# const tunable int TK = {32}; +# void bsdot(restrict read_only align(16) )" + ab_ty_ + R"( *A, +# restrict read_only align(16) )" + ab_ty_ + R"( *B, +# )" + c_ty_ + R"(* C, +# int lda, int ldb, int ldc, +# int N, int* lut, +# int* locks, int nlocks) { +# int ridx = get_range_id(0); +# float acc[TM, TN] = 0; +# int rka[TK] = 0 ... TK; +# int rkb[TK] = 0 ... TK; +# int *header = lut + ridx * 2; +# int offx = *(header + 0); +# int offy = *(header + 1); +# int rxa[TM] = offx*TM + (0 ... TM); +# int ryb[TN] = offy*TN + (0 ... TN); +# bool checka[TK, TM] = (rka < N)[:, newaxis]; +# bool checkb[TK, TN] = (rkb < N)[:, newaxis]; +# int offa[)" + sizea + "] = rxa[" + bca0 + "]" + lda0 + " + rka[" + bca1 + "]" + lda1 + R"(; +# int offb[)" + sizeb + "] = ryb[" + bcb0 + "]" + ldb0 + " + rkb[" + bcb1 + "]" + ldb1 + R"(; +# )" + ab_ty_ + " * pa[" + sizea + R"(] = A + offa; +# )" + ab_ty_ + " * pb[" + sizeb + R"(] = B + offb; +# )" + ab_ty_ + " a[" + sizea + R"(] = checka ? *pa : 0; +# )" + ab_ty_ + " b[" + sizeb + R"(] = checkb ? *pb : 0; +# for(int k = N; k > 0; k = k - TK) { +# acc = dot()" + usea + ", " + useb + R"(, acc); +# pa = pa + TK)" + lda1 + R"(; +# pb = pb + TK)" + ldb1 + R"(; +# a = checka ? *pa : 0; +# b = checkb ? *pb : 0; +# } +# int rxc[TM] = (0 ... TM); +# int ryc[TN] = (0 ... TN); +# )" + c_ty_ + R"( c[TM, TN] = acc; +# )" + c_ty_ + R"(* pc[TM, TN] = C + rxc[:, newaxis]*TM + ryc[newaxis, :] + ridx*TM*TN; +# *pc = c; +# })"; \ No newline at end of file diff --git a/python/examples/dot.py b/python/examples/dot.py index ffb93fd33..4ea6fcf04 100644 --- a/python/examples/dot.py +++ b/python/examples/dot.py @@ -3,15 +3,16 @@ import triton import numpy as np src = """ +// Templates for accessing A #if AT == 1 -#define USEA ^a +#define USE_A ^a #define STRIDE_AK lda #define STRIDE_AM 1 #define BROADCAST_AK :, newaxis #define BROADCAST_AM newaxis, : #define SHAPE_A TK, TM #else -#define USEA a +#define USE_A a #define STRIDE_AK 1 #define STRIDE_AM lda #define BROADCAST_AK newaxis, : @@ -19,15 +20,16 @@ src = """ #define SHAPE_A TM, TK #endif +// Templates for accessing B #if BT == 1 -#define USEB ^b +#define USE_B ^b #define STRIDE_BK 1 #define STRIDE_BN ldb #define BROADCAST_BK newaxis, : #define BROADCAST_BN :, newaxis #define SHAPE_B TN, TK #else -#define USEB b +#define USE_B b #define STRIDE_BK ldb #define STRIDE_BN 1 #define BROADCAST_BK :, newaxis @@ -56,7 +58,7 @@ void dot(TYPE * A, TYPE * B, TYPE * C, TYPE b[SHAPE_B] = *pb; // reduction loop for(int k = K; k > 0; k-= TK){ - c += USEA @ USEB; + c += USE_A @ USE_B; pa = pa + TK * STRIDE_AK; pb = pb + TK * STRIDE_BK; a = *pa; @@ -71,57 +73,54 @@ void dot(TYPE * A, TYPE * B, TYPE * C, } """ -def cdiv(a, b): - return -(-a // b) - class dot_op: - def __init__(self, trans_a = False, trans_b = False): + def __init__(self, transpose_a = False, transpose_b = False): self.dot = triton.op(src, ['C']) - self.trans_a = trans_a - self.trans_b = trans_b + self.transpose_a = transpose_a + self.transpose_b = transpose_b def __call__(self, a, b): + # extract shapes shape_a = triton.shape(a) shape_b = triton.shape(b) - M = shape_a[0] - Ka = shape_a[1] - Kb = shape_b[0] - N = shape_b[1] + M, Ka = shape_a[0], shape_a[1] + Kb, N = shape_b[0], shape_b[1] # transpose shapes - if self.trans_a: + if self.transpose_a: M, 
Ka = Ka, M - if self.trans_b: + if self.transpose_b: Kb, N = N, Kb - K = Ka # contiguous dimensions - lda = Ka - ldb = N + lda = M if self.transpose_a else Ka + ldb = Kb if self.transpose_b else N ldc = N + # allocate output c = triton.empty([M, N]) - return self.dot(a, b, c, M, N, K, lda, ldb, ldc, - lambda opt: [cdiv(M, opt.d('TM')), cdiv(N, opt.d('TN'))], - AT = self.trans_a, BT = self.trans_b, TYPE = tf.float16, - TM = [128], TN = [ 128], TK = [32]) + # compute + return self.dot(a, b, c, M, N, Ka, lda, ldb, ldc, + lambda opt: [triton.cdiv(M, opt.d('TM')), triton.cdiv(N, opt.d('TN'))], + AT = self.transpose_a, BT = self.transpose_b, TYPE = tf.float16, + TM = [128], TN = [128], TK = [32]) -def dot(a, b, trans_a = False, trans_b = False): - if (trans_a, trans_b) not in dot.ops: - dot.ops[trans_a, trans_b] = dot_op(trans_a, trans_b) - return dot.ops[trans_a, trans_b](a, b) +def dot(a, b, transpose_a = False, transpose_b = False): + if (transpose_a, transpose_b) not in dot.ops: + dot.ops[transpose_a, transpose_b] = dot_op(transpose_a, transpose_b) + return dot.ops[transpose_a, transpose_b](a, b) dot.ops = dict() -# @triton.register_gradient(dot_op) -# def _dot_grad(op, dy): -# a = op.inputs[0] -# b = op.inputs[1] -# return [dot_tn(dy, b), dot_nt(a, dy), None, None, None, None, None, None, None] +@tf.RegisterGradient("Dot") +def _dot_grad(op, dy): + a = op.inputs[0] + b = op.inputs[1] + return [dot_tn(dy, b), dot_nt(a, dy), None, None, None, None, None, None, None] def run_dot(): M, N, K = 128, 128, 128 a = tf.placeholder(tf.float16, shape=[M, K]) b = tf.placeholder(tf.float16, shape=[N, K]) - c = dot(a, b, trans_a = False, trans_b = True) + c = dot(a, b, transpose_a = False, transpose_b = False) # Reference ha = np.random.rand(M, K).astype(np.float16) hb = np.random.rand(K, N).astype(np.float16) @@ -131,7 +130,8 @@ def run_dot(): result = sess.run([c], feed_dict = {a: ha, b: hb})[0] # Test - hresult = np.dot(ha, hb.T) + print(result) + hresult = np.dot(ha, hb) dif = np.abs(result - hresult) np.savetxt('dif.dat', dif, '%2.4f') print("dif: %f" % np.max(dif)) diff --git a/python/setup.py b/python/setup.py index b9285f84f..1cfe0a881 100644 --- a/python/setup.py +++ b/python/setup.py @@ -44,6 +44,7 @@ class CMakeBuild(build_ext): import tensorflow as tf tf_include_dirs = tf.sysconfig.get_include() tf_lib_dirs = tf.sysconfig.get_lib() + tf_abi = tf.__cxx11_abi_flag__ if "__cxx11_abi_flag__" in tf.__dict__ else 0 tf_libs = 'tensorflow_framework' cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir, @@ -52,7 +53,8 @@ class CMakeBuild(build_ext): '-DPYTHON_INCLUDE_DIRS=' + python_include_dirs, '-DTF_INCLUDE_DIRS=' + tf_include_dirs, '-DTF_LIB_DIRS=' + tf_lib_dirs, - '-DTF_LIBS=' + tf_libs] + '-DTF_LIBS=' + tf_libs, + '-DTF_ABI=' + str(tf_abi)] cfg = 'Debug' if self.debug else 'Release' build_args = ['--config', cfg] diff --git a/python/src/tensorflow.cc b/python/src/tensorflow.cc index 1932402e0..b01d5231c 100644 --- a/python/src/tensorflow.cc +++ b/python/src/tensorflow.cc @@ -4,7 +4,7 @@ #include #include #include -#include "triton/codegen/selection/selection.h" +#include "triton/codegen/selection.h" #include "triton/runtime/function.h" #include "triton/lang/code_gen.h" #include "triton/lang/parser.h" diff --git a/python/triton/ops.py b/python/triton/ops.py index f0b1ed86b..b4c4a7a54 100644 --- a/python/triton/ops.py +++ b/python/triton/ops.py @@ -102,12 +102,15 @@ def _build(src, path, framework): # libraries libraries = ['triton'] # add framework + extra_compile_args = [] if framework == 
tensorflow_id: _import_tensorflow() library_dirs += [tensorflow.sysconfig.get_lib()] include_dirs += [tensorflow.sysconfig.get_include()] include_dirs += ['/usr/local/cuda/include/'] libraries += ['tensorflow_framework'] + ABI = tensorflow.__cxx11_abi_flag__ if "__cxx11_abi_flag__" in tensorflow.__dict__ else 0 + extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI={ABI}'.format(ABI=ABI)] elif framework == torch_id: _import_torch() prefix = os.path.dirname(torch.__file__) @@ -120,7 +123,6 @@ def _build(src, path, framework): else: assert False # extra arguments - extra_compile_args = [] extra_link_args = [] # dependences depends = [os.path.realpath(libtriton.__file__)] @@ -254,14 +256,14 @@ class op: return op(*op_args, id=op_id) -# class register_gradient: +class register_gradient: -# def __init__(self, op): -# self.op = op + def __init__(self, op): + self.op = op -# def __call__(self, f): -# name = 'Dot' -# ops.RegisterGradient(name)(f) + def __call__(self, f): + name = 'Dot' + ops.RegisterGradient(name)(f) def empty(shapes, framework = None): @@ -276,6 +278,9 @@ def empty(shapes, framework = None): _import_torch() return torch.empty(*shapes) +def cdiv(a, b): + return -(-a // b) + class scalar: def __init__(self, x): diff --git a/tests/bench/copy1d.cc b/tests/bench/copy1d.cc index 2e2fe20d2..51afbacd6 100644 --- a/tests/bench/copy1d.cc +++ b/tests/bench/copy1d.cc @@ -22,8 +22,8 @@ std::vector do_bench(drv::stream* stream, int32_t N){ // create options rt::function::options_space_t opt; opt.defines.push_back({"TYPE", {ty}}); - opt.defines.push_back({"TN", {"512"}}); - opt.num_warps = {4}; + opt.defines.push_back({"TN", {"128"}}); + opt.num_warps = {1, 2, 4, 8}; // create function rt::function function(src::copy1d, opt); // benchmark available libraries @@ -42,7 +42,7 @@ int main() { triton::driver::stream* stream = triton::driver::stream::create(context); // shapes to benchmark typedef std::tuple config_t; - std::vector configs = { 1024*1024*16 }; + std::vector configs = { 1024*1024*32 }; int N; for(const auto& c: configs){ std::tie(N) = c; diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index 3fecb8e58..fc2243bfc 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -29,6 +29,7 @@ inline rt::function::grid_fn_ty grid2d(size_t M, size_t N) { std::vector do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K){ typedef float NumericT; std::string ty = "float"; + cublasDataType_t cuty = CUDA_R_32F; size_t dt_nbytes = sizeof(NumericT); drv::context* context = stream->context(); // leading dimensions @@ -44,10 +45,10 @@ std::vector do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, i opt.defines.push_back({"TYPE", {ty}}); opt.defines.push_back({"AT", {AT?"1":"0"}}); opt.defines.push_back({"BT", {BT?"1":"0"}}); - opt.defines.push_back({"TM", {"128"}}); - opt.defines.push_back({"TN", {"64"}}); + opt.defines.push_back({"TM", {"64", "128"}}); + opt.defines.push_back({"TN", {"64", "128"}}); opt.defines.push_back({"TK", {"8"}}); - opt.num_warps = {4}; + opt.num_warps = {2, 4, 8}; // create function rt::function function(src::dot, opt); // benchmark available libraries @@ -57,10 +58,11 @@ std::vector do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, i if(cublas::cublasinit()){ NumericT alpha(static_cast(1)); NumericT beta(static_cast(0)); - cublasGemmAlgo_t fastest = CUBLAS_GEMM_ALGO5; -// cublasGemm(CUDA_R_32F, stream, AT, BT, M, N, K, &alpha, &*da, lda, &*db, ldb, &beta, &*dc, ldc, &fastest); - double cublas_ms = triton::tools::bench([&]() { 
cublasGemm(CUDA_R_32F, stream, AT, BT, M, N, K, - &alpha, &*da, lda, &*db, ldb, &beta, &*dc, ldc, nullptr, fastest); }, stream); + cublasGemmAlgo_t fastest; + cublasGemm(cuty, stream, AT, BT, M, N, K, &alpha, &*da, lda, &*db, ldb, &beta, &*dc, ldc, &fastest); + double cublas_ms = triton::tools::bench([&]() { cublasGemm(cuty, stream, AT, BT, M, N, K, + &alpha, &*da, lda, &*db, ldb, &beta, &*dc, + ldc, nullptr, fastest); }, stream); result.push_back(tflops(cublas_ms)); } // triton From b747959a57dbff303af195221196986bb63043cf Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 4 Sep 2019 01:54:43 -0400 Subject: [PATCH 346/494] trying to work around tensorflow limitations --- CMakeLists.txt | 2 +- lib/driver/module.cc | 4 ++-- python/examples/blocksparse.py | 7 +++---- python/examples/dot.py | 23 ++++++++++++++--------- python/setup.py | 8 +++----- python/src/tensorflow.cc | 6 +----- python/src/tensorflow/alloc_empty.cc | 3 ++- python/triton/ops.py | 26 ++++++++++++++++++++++---- 8 files changed, 48 insertions(+), 31 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d857a96ea..201f14c5a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,7 +50,7 @@ endif() # Triton file(GLOB_RECURSE LIBTRITON_SRC lib/*.cc) add_library(triton SHARED ${LIBTRITON_SRC} ${PYTHON_SRC}) -target_link_libraries(triton LLVM ${TF_LIBS}) +target_link_libraries(triton LLVM) # Warning level #if(MSVC) diff --git a/lib/driver/module.cc b/lib/driver/module.cc index 497fc332c..34462e8ab 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -250,10 +250,10 @@ cu_module::cu_module(driver::context * context, std::string const & source) : mo try{ dispatch::cuModuleLoadDataEx(&*cu_, source_.data(), 2, opt, optval); }catch(exception::cuda::base const &){ -#ifdef TRITON_LOG_PTX_ERROR +//#ifdef TRITON_LOG_PTX_ERROR std::cerr << "Compilation Failed! 
Log: " << std::endl; std::cerr << errbuf << std::endl; -#endif +//#endif throw; } } diff --git a/python/examples/blocksparse.py b/python/examples/blocksparse.py index 27b7d1e9b..7d15fc4f4 100644 --- a/python/examples/blocksparse.py +++ b/python/examples/blocksparse.py @@ -87,19 +87,18 @@ src = ''' else { int *plock = locks + ridx*nlocks + lockid - 1; int *pcount = plock + get_num_program(0)*nlocks; - while(__atomic_cas(plock, 0, 1)); + while(atomic_cas(plock, 0, 1)); int count = *pcount; if(count == 0) *?(checkc) pc = c; else *?(checkc) pc = c + *pc; - __atomic_exch(pcount, 1); - __atomic_exch(plock, 0); + atomic_exch(pcount, 1); + atomic_exch(plock, 0); } } ''' - # std::string dot::triton_c_src_dw() const { # bool AT = (op_ == WGRAD); # bool BT = (op_ == FPROP); diff --git a/python/examples/dot.py b/python/examples/dot.py index 4ea6fcf04..dcb4c3540 100644 --- a/python/examples/dot.py +++ b/python/examples/dot.py @@ -81,6 +81,7 @@ class dot_op: self.transpose_b = transpose_b def __call__(self, a, b): + dtype = a.dtype # extract shapes shape_a = triton.shape(a) shape_b = triton.shape(b) @@ -96,13 +97,12 @@ class dot_op: ldb = Kb if self.transpose_b else N ldc = N # allocate output - c = triton.empty([M, N]) + c = triton.empty([M, N], dtype = dtype) # compute return self.dot(a, b, c, M, N, Ka, lda, ldb, ldc, lambda opt: [triton.cdiv(M, opt.d('TM')), triton.cdiv(N, opt.d('TN'))], - AT = self.transpose_a, BT = self.transpose_b, TYPE = tf.float16, - TM = [128], TN = [128], TK = [32]) - + AT = self.transpose_a, BT = self.transpose_b, TYPE = dtype, + TM = [128], TN = [128], TK = [8]) def dot(a, b, transpose_a = False, transpose_b = False): if (transpose_a, transpose_b) not in dot.ops: @@ -114,20 +114,25 @@ dot.ops = dict() def _dot_grad(op, dy): a = op.inputs[0] b = op.inputs[1] + print(op.triton) return [dot_tn(dy, b), dot_nt(a, dy), None, None, None, None, None, None, None] def run_dot(): M, N, K = 128, 128, 128 - a = tf.placeholder(tf.float16, shape=[M, K]) - b = tf.placeholder(tf.float16, shape=[N, K]) + a = tf.placeholder(tf.float32, shape=[M, K]) + b = tf.placeholder(tf.float32, shape=[N, K]) c = dot(a, b, transpose_a = False, transpose_b = False) + print("LULZ") + da, db = tf.gradients(c, [a, b]) + print(da, db) + exit # Reference - ha = np.random.rand(M, K).astype(np.float16) - hb = np.random.rand(K, N).astype(np.float16) + ha = np.random.rand(M, K).astype(np.float32) + hb = np.random.rand(K, N).astype(np.float32) # Run sess = tf.InteractiveSession() sess.run(tf.global_variables_initializer()) - result = sess.run([c], feed_dict = {a: ha, + result = sess.run([da], feed_dict = {a: ha, b: hb})[0] # Test print(result) diff --git a/python/setup.py b/python/setup.py index 1cfe0a881..8a7c9b372 100644 --- a/python/setup.py +++ b/python/setup.py @@ -42,17 +42,15 @@ class CMakeBuild(build_ext): python_lib_dirs = distutils.sysconfig.get_config_var('LIBDIR') # tensorflow directories import tensorflow as tf - tf_include_dirs = tf.sysconfig.get_include() - tf_lib_dirs = tf.sysconfig.get_lib() tf_abi = tf.__cxx11_abi_flag__ if "__cxx11_abi_flag__" in tf.__dict__ else 0 - tf_libs = 'tensorflow_framework' - + tf_include_dirs = tf.sysconfig.get_include() + tf_libs = tf.sysconfig.get_link_flags()[1].replace('-l', '') cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir, '-DBUILD_TESTS=OFF', '-DBUILD_PYTHON_MODULE=ON', '-DPYTHON_INCLUDE_DIRS=' + python_include_dirs, '-DTF_INCLUDE_DIRS=' + tf_include_dirs, - '-DTF_LIB_DIRS=' + tf_lib_dirs, + '-DTF_LIB_DIRS=' + tf.sysconfig.get_lib(), '-DTF_LIBS=' + 
diff --git a/python/src/tensorflow.cc b/python/src/tensorflow.cc
index b01d5231c..cac300bd1 100644
--- a/python/src/tensorflow.cc
+++ b/python/src/tensorflow.cc
@@ -171,7 +171,7 @@ void gen_tf_register_op(std::ostream &os, const std::string &name,
       break;
     if(idx == args.size())
       throw std::runtime_error("unknown output");
-    os << "  .Output(\"out" << i << ": " << to_tf_scalar_ty(args[idx]->get_type()) << "\")\n";
+    os << "  .Output(\"out" << i << ": T" << idx << "\")\n";
   }
   os << "  .Attr(\"id: int\")" << std::endl;
   os << ";\n";
@@ -239,10 +239,6 @@ std::tuple
Date: Wed, 4 Sep 2019 03:12:23 -0400
Subject: [PATCH 347/494] [python] more generic gradient registration

---
 python/examples/dot.py   | 23 ++++++++++++-----------
 python/src/tensorflow.cc | 21 ++++++++++++++++++-
 python/triton/ops.py     | 47 +++++++++++++++++++++++++---------------
 3 files changed, 61 insertions(+), 30 deletions(-)

diff --git a/python/examples/dot.py b/python/examples/dot.py
index dcb4c3540..63b985747 100644
--- a/python/examples/dot.py
+++ b/python/examples/dot.py
@@ -73,14 +73,14 @@ void dot(TYPE * A, TYPE * B, TYPE * C,
 }
 """
 
-class dot_op:
+class dot_op(triton.op2):
 
   def __init__(self, transpose_a = False, transpose_b = False):
     self.dot = triton.op(src, ['C'])
     self.transpose_a = transpose_a
     self.transpose_b = transpose_b
 
-  def __call__(self, a, b):
+  def forward(self, a, b):
     dtype = a.dtype
     # extract shapes
     shape_a = triton.shape(a)
@@ -104,28 +104,27 @@ class dot_op:
                     AT = self.transpose_a, BT = self.transpose_b, TYPE = dtype,
                     TM = [128], TN = [128], TK = [8])
 
+  def backward(self, op, dy):
+    a = op.inputs[0]
+    b = op.inputs[1]
+    da = dot_op(self.transpose_a, self.transpose_b).forward(dy, b)
+    db = dot_op(self.transpose_a, self.transpose_b).forward(a, dy)
+    return [da, db, None, None, None, None, None, None, None]
+
+
 def dot(a, b, transpose_a = False, transpose_b = False):
   if (transpose_a, transpose_b) not in dot.ops:
     dot.ops[transpose_a, transpose_b] = dot_op(transpose_a, transpose_b)
   return dot.ops[transpose_a, transpose_b](a, b)
 dot.ops = dict()
 
-@tf.RegisterGradient("Dot")
-def _dot_grad(op, dy):
-  a = op.inputs[0]
-  b = op.inputs[1]
-  print(op.triton)
-  return [dot_tn(dy, b), dot_nt(a, dy), None, None, None, None, None, None, None]
 
 def run_dot():
   M, N, K = 128, 128, 128
   a = tf.placeholder(tf.float32, shape=[M, K])
   b = tf.placeholder(tf.float32, shape=[N, K])
   c = dot(a, b, transpose_a = False, transpose_b = False)
-  print("LULZ")
-  da, db = tf.gradients(c, [a, b])
-  print(da, db)
-  exit
+  da = tf.gradients(c, [a])
   # Reference
   ha = np.random.rand(M, K).astype(np.float32)
   hb = np.random.rand(K, N).astype(np.float32)
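Patch 347 removes the hand-written @tf.RegisterGradient("Dot") hook because TensorFlow accepts only one gradient function per registered op *type*, while Triton needs per-instance backward logic. The generic scheme the series converges on registers a single gradient for the generated op type and dispatches through the op's integer 'id' attribute. A sketch of that mechanism (TF 1.x API; the op-type name 'TritonDot' and the context table are illustrative, not the actual generated names):

import tensorflow as tf

_contexts = {}                          # op id -> state saved by forward()

@tf.RegisterGradient('TritonDot')       # one registration per op *type*
def _triton_grad(op, dy):
    ctx = _contexts[op.get_attr('id')]  # per-instance dispatch on the id attr
    return ctx.backward(dy)

tf.RegisterGradient and op.get_attr are the real TF 1.x entry points used here; everything else is a stand-in.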
diff --git a/python/src/tensorflow.cc b/python/src/tensorflow.cc
index cac300bd1..95ac51620 100644
--- a/python/src/tensorflow.cc
+++ b/python/src/tensorflow.cc
@@ -33,12 +33,28 @@ void register_grid(size_t id,
   id_grid_map[id].reset(new rt::function::grid_fn_ty(grid_fn));
 }
 
+void delete_grid(size_t id) {
+  id_grid_map.erase(id);
+  std::cout << "deleted " << id_grid_map.size() << std::endl;
+}
+
 void register_fn(size_t id, const std::string& src, const rt::function::options_space_t& opt) {
   id_fn_map[id].reset(new rt::function(src, opt));
 }
 
+void delete_fn(size_t id) {
+  id_fn_map.erase(id);
+  std::cout << "deleted " << id_fn_map.size() << std::endl;
+}
+
+void cleanup() {
+  id_grid_map.clear();
+  id_fn_map.clear();
+  i64scalar_map.clear();
+}
+
 size_t make_op_id() {
   return id_fn_map.size();
 }
@@ -453,9 +469,12 @@ PYBIND11_MODULE(libtriton, m) {
   // hooks into triton constructs since frameworks may not use pybind11
   m.def("register_grid", &register_grid);
+  m.def("delete_grid", &delete_grid);
   m.def("register_fn", &register_fn);
+  m.def("delete_fn", &delete_fn);
   m.def("make_op_id", &make_op_id);
   m.def("make_scalar_id", &make_scalar_id);
-  m.def("retrieve_scalar", &retrieve_scalar)
+  m.def("retrieve_scalar", &retrieve_scalar);
+  m.def("cleanup", &cleanup);
   ;
 }
diff --git a/python/triton/ops.py b/python/triton/ops.py
index e98cc2b9e..f2984de30 100644
--- a/python/triton/ops.py
+++ b/python/triton/ops.py
@@ -13,6 +13,13 @@
 import setuptools
 import libtriton
 
+# clean-up libtriton resources
+import atexit
+@atexit.register
+def cleanup():
+  libtriton.cleanup()
+
+
 torch_id = 'torch'
 tensorflow_id = 'tensorflow'
 
@@ -20,6 +27,9 @@
 torch = None
 tensorflow = None
 tf_extra_ops = None
+
+
+
 def _import_torch():
   global torch
   if torch is None:
@@ -211,19 +221,25 @@ def _make_grid(args) :
   return grid
 
 
+class op2:
+
+  def __init__(self):
+    pass
+
+  def __call__(self, *args, **kwargs):
+    result = self.forward(*args, **kwargs)
+    # backprop is defined
+    if(callable(getattr(self, 'backward', None))):
+      _import_tensorflow()
+      @tensorflow.RegisterGradient('Dot')
+      def gradient(op, dy):
+        return self.backward(op, dy)
+    return result
+
+
+
 class op:
 
-  class _definitions_descriptor:
-    def __init__(self):
-      self.values = dict()
-
-    def __set__(self, instance, value):
-      self.values[value[0]] = value[1]
-
-    def __get__(self, instance, owner):
-      return self.values
-
-
   def __init__(self, src, outputs, framework = None):
     self.fw_id = dict()
     self.fw_ops = dict()
@@ -233,9 +249,8 @@ class op:
     self.framework = _find_framework(framework)
     if self.framework == tensorflow_id:
       _import_tensorflow()
-      tensorflow.Operation.triton = property(op._definitions_descriptor)
-
 
+
   def __call__(self, *args, **kwargs):
     # create a new op when defines are different
    key = zip(kwargs.keys(), kwargs.values())
@@ -251,7 +266,7 @@ class op:
       defines.append((k, values))
     opt = libtriton.options_space()
     opt.defines = defines
-    opt.num_warps = [1, 2, 4, 8]
+    opt.num_warps = [4]
     # create unique id for this op
     op_id = libtriton.make_op_id()
     self.fw_id[key] = op_id
@@ -269,9 +284,7 @@ class op:
     # create operands
     op_args = [x.handle if isinstance(x, scalar) else x for x in args[:-1]]
     # call framework op
-    tensor = op(*op_args, id=op_id)
-    tensor.op.triton = ('lol', 1)
-    return tensor
+    return op(*op_args, id=op_id)
 
 
 class register_gradient:

From f6e9c24fe8e316c4e97a30b277d138f514a9d820 Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Wed, 4 Sep 2019 12:47:59 -0400
Subject: [PATCH 348/494] [python] more progress towards tensorflow/pytorch
 unification

---
 lib/runtime/function.cc |   3 +-
 python/examples/dot.py  | 133 ++++++++++++++++++++--------------------
 python/triton/ops.py    | 100 +++++++++++++++++++-----------
 3 files changed, 132 insertions(+), 104 deletions(-)

diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc
index 703918ba5..838975086 100644
--- a/lib/runtime/function.cc
+++ b/lib/runtime/function.cc
@@ -148,8 +148,9 @@ function::caller function::autotune(driver::stream* stream, const grid_fn_ty& gr
     options_t opt;
     unsigned i = 0;
     opt.num_warps = std::stoi(params[i++]);
-    for(auto it: opt_space_.defines)
+    for(auto it: opt_space_.defines){
       opt.defines[it.first] = params[i++];
+    }
     // pre-process
     TokenSequence tokens;
     Preprocessor cpp(&src_, true);
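The delete_grid/delete_fn/cleanup bindings above give the Python side explicit control over the native registries, and ops.py pairs them with an atexit hook so compiled kernels are torn down before interpreter shutdown. A minimal sketch of the pattern (the registry names mirror the diff, but the bodies are Python stand-ins for the C++ maps):

import atexit

_id_fn_map = {}     # op id -> compiled kernel
_id_grid_map = {}   # op id -> launch-grid callback

def make_op_id():
    return len(_id_fn_map)

def register_fn(op_id, fn):
    _id_fn_map[op_id] = fn

def delete_fn(op_id):
    _id_fn_map.pop(op_id, None)

@atexit.register
def cleanup():
    # mirrors libtriton.cleanup(): drop every registered handle in one pass
    _id_fn_map.clear()
    _id_grid_map.clear()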
diff --git a/python/examples/dot.py b/python/examples/dot.py
index 63b985747..da3cb9831 100644
--- a/python/examples/dot.py
+++ b/python/examples/dot.py
@@ -2,41 +2,10 @@ import tensorflow as tf
 import triton
 import numpy as np
 
-src = """
-// Templates for accessing A
-#if AT == 1
-#define USE_A ^a
-#define STRIDE_AK lda
-#define STRIDE_AM 1
-#define BROADCAST_AK :, newaxis
-#define BROADCAST_AM newaxis, :
-#define SHAPE_A TK, TM
-#else
-#define USE_A a
-#define STRIDE_AK 1
-#define STRIDE_AM lda
-#define BROADCAST_AK newaxis, :
-#define BROADCAST_AM :, newaxis
-#define SHAPE_A TM, TK
-#endif
-// Templates for accessing B
-#if BT == 1
-#define USE_B ^b
-#define STRIDE_BK 1
-#define STRIDE_BN ldb
-#define BROADCAST_BK newaxis, :
-#define BROADCAST_BN :, newaxis
-#define SHAPE_B TN, TK
-#else
-#define USE_B b
-#define STRIDE_BK ldb
-#define STRIDE_BN 1
-#define BROADCAST_BK :, newaxis
-#define BROADCAST_BN newaxis, :
-#define SHAPE_B TK, TN
-#endif
+class dot(triton.function):
+
+  src = """
 void dot(TYPE * A, TYPE * B, TYPE * C,
          int M, int N, int K,
          int lda __multipleof(8),
         int ldb __multipleof(8),
          int ldc) {
@@ -73,71 +42,99 @@ void dot(TYPE * A, TYPE * B, TYPE * C,
 }
 """
 
-class dot_op(triton.op2):
+  op = triton.op(src, ['C'])
 
-  def __init__(self, transpose_a = False, transpose_b = False):
-    self.dot = triton.op(src, ['C'])
-    self.transpose_a = transpose_a
-    self.transpose_b = transpose_b
-
-  def forward(self, a, b):
-    dtype = a.dtype
+  @staticmethod
+  def _call(a, b, transpose_a, transpose_b):
     # extract shapes
     shape_a = triton.shape(a)
     shape_b = triton.shape(b)
     M, Ka = shape_a[0], shape_a[1]
     Kb, N = shape_b[0], shape_b[1]
     # transpose shapes
-    if self.transpose_a:
+    if transpose_a:
       M, Ka = Ka, M
-    if self.transpose_b:
+    if transpose_b:
       Kb, N = N, Kb
     # contiguous dimensions
-    lda = M if self.transpose_a else Ka
-    ldb = Kb if self.transpose_b else N
+    lda = M if transpose_a else Ka
+    ldb = Kb if transpose_b else N
     ldc = N
+    # data-type
+    dtype = a.dtype
     # allocate output
     c = triton.empty([M, N], dtype = dtype)
     # compute
-    return self.dot(a, b, c, M, N, Ka, lda, ldb, ldc,
-                    lambda opt: [triton.cdiv(M, opt.d('TM')), triton.cdiv(N, opt.d('TN'))],
-                    AT = self.transpose_a, BT = self.transpose_b, TYPE = dtype,
-                    TM = [128], TN = [128], TK = [8])
+    grid = lambda opt: [triton.cdiv(M, opt.d('TM')), triton.cdiv(N, opt.d('TN'))]
+    # macros -- not necessary but makes kernel source-code simpler
+    macros = {# handle A transposition
+              'USE_A'       : '^a'         if transpose_a else 'a',
+              'STRIDE_AK'   : 'lda'        if transpose_a else '1',
+              'STRIDE_AM'   : '1'          if transpose_a else 'lda',
+              'BROADCAST_AK': ':, newaxis' if transpose_a else 'newaxis, :',
+              'BROADCAST_AM': 'newaxis, :' if transpose_a else ':, newaxis',
+              'SHAPE_A'     : 'TK, TM'     if transpose_a else 'TM, TK',
+              # handle B transposition
+              'USE_B'       : '^b'         if transpose_b else 'b',
+              'STRIDE_BK'   : '1'          if transpose_b else 'ldb',
+              'STRIDE_BN'   : 'ldb'        if transpose_b else '1',
+              'BROADCAST_BK': 'newaxis, :' if transpose_b else ':, newaxis',
+              'BROADCAST_BN': ':, newaxis' if transpose_b else 'newaxis, :',
+              'SHAPE_B'     : 'TN, TK'     if transpose_b else 'TK, TN'}
+    return dot.op(a, b, c, M, N, Ka, lda, ldb, ldc, grid,
+                  AT = transpose_a, BT = transpose_b, TYPE = dtype,
+                  TM = [64, 128], TN = [64, 128], TK = [8], **macros)
 
-  def backward(self, op, dy):
-    a = op.inputs[0]
-    b = op.inputs[1]
-    da = dot_op(self.transpose_a, self.transpose_b).forward(dy, b)
-    db = dot_op(self.transpose_a, self.transpose_b).forward(a, dy)
+  @staticmethod
+  def forward(ctx, a, b, transpose_a = False, transpose_b = False):
+    ctx.save_for_backward(a, b, transpose_a, transpose_b)
+    return dot._call(a, b, transpose_a, transpose_b)
+
+  @staticmethod
+  def backward(ctx, dy):
+    a, b, t_a, t_b = ctx.saved_tensors
+    if not t_a and not t_b:
+      da = dot._call(dy, b, False, True)
+      db = dot._call(a, dy, True, False)
+    elif not t_a and t_b:
+      da = dot._call(dy, b, False, False)
+      db = dot._call(dy, a, True, False)
+    elif t_a and not t_b:
+      da = dot._call(b, dy, False, True)
+      db = dot._call(a, dy, False, False)
+    elif t_a and t_b:
+      da = dot._call(b, dy, True, True)
+      db = dot._call(dy, a, True, True)
+    else:
+      assert False
     return [da, db, None, None, None, None, None, None, None]
 
-def dot(a, b, transpose_a = False, transpose_b = False):
-  if (transpose_a, transpose_b) not in dot.ops:
-    dot.ops[transpose_a, transpose_b] = dot_op(transpose_a, transpose_b)
-  return dot.ops[transpose_a, transpose_b](a, b)
-dot.ops = dict()
 
 def run_dot():
   M, N, K = 128, 128, 128
   a = tf.placeholder(tf.float32, shape=[M, K])
   b = tf.placeholder(tf.float32, shape=[N, K])
-  c = dot(a, b, transpose_a = False, transpose_b = False)
-  da = tf.gradients(c, [a])
+  _dot = dot.apply
+  tr_c = _dot(a, b, transpose_a = False, transpose_b = True)
+  tr_d = _dot(tr_c, b, transpose_a = True, transpose_b = False)
+  tf_c = tf.matmul(a, b, transpose_a = False, transpose_b = True)
+  tf_d = tf.matmul(tf_c, b, transpose_a = True, transpose_b = False)
+  # Gradient
+  tr_da = tf.gradients(tr_d, [a])
+  tf_da = tf.gradients(tf_d, [a])
   # Reference
   ha = np.random.rand(M, K).astype(np.float32)
   hb = np.random.rand(K, N).astype(np.float32)
   # Run
   sess = tf.InteractiveSession()
   sess.run(tf.global_variables_initializer())
-  result = sess.run([da], feed_dict = {a: ha,
-                                       b: hb})[0]
+  result = sess.run([tr_da, tf_da], feed_dict = {a: ha,
+                                                 b: hb})
   # Test
-  print(result)
-  hresult = np.dot(ha, hb)
-  dif = np.abs(result - hresult)
-  np.savetxt('dif.dat', dif, '%2.4f')
+  print(result[0][0])
+  print(result[1][0])
+  dif = np.abs(result[0][0] - result[1][0])
   print("dif: %f" % np.max(dif))
 
 run_dot()
\ No newline at end of file
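The four-way dispatch in backward() above follows from the chain rule for C = op_a(A) @ op_b(B): each transpose flag flips which operand order and which transposes reproduce dA and dB. A quick numeric sanity check for one branch, written as a NumPy stand-in: with C = A @ B.T (transpose_a=False, transpose_b=True) the gradients are dA = dY @ B and dB = dY.T @ A, exactly the (False, False) and (True, False) calls in the diff.

import numpy as np

rng = np.random.default_rng(0)
A, B = rng.standard_normal((4, 3)), rng.standard_normal((5, 3))
dY = rng.standard_normal((4, 5))   # upstream gradient of C = A @ B.T

dA = dY @ B                        # dot(dy, b, False, False)
dB = dY.T @ A                      # dot(dy, a, True, False)

# finite-difference check on one entry of A
eps = 1e-6
E = np.zeros_like(A); E[1, 2] = eps
num = ((A + E) @ B.T - (A - E) @ B.T) / (2 * eps)
assert np.isclose((num * dY).sum(), dA[1, 2], atol=1e-4)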
diff --git a/python/triton/ops.py b/python/triton/ops.py
index f2984de30..633ceafa7 100644
--- a/python/triton/ops.py
+++ b/python/triton/ops.py
@@ -25,6 +25,7 @@ tensorflow_id = 'tensorflow'
 
 torch = None
 tensorflow = None
+_gradient_registry = None
 tf_extra_ops = None
 
@@ -39,6 +40,9 @@ def _import_tensorflow():
   global tensorflow
   if tensorflow is None:
     import tensorflow
+  global _gradient_registry
+  if _gradient_registry is None:
+    from tensorflow.python.framework.ops import _gradient_registry
 
 def _import_tf_extra_ops():
   global tf_extra_ops
@@ -221,47 +225,85 @@ def _make_grid(args) :
   return grid
 
 
-class op2:
-
-  def __init__(self):
+class OpContext(object):
+
+  def save_for_backward(self, *tensors):
+    self.to_save = tensors
+
+  def mark_dirty(self, *args):
+    self.dirty_tensors = args
+
+  @property
+  def saved_tensors(self):
+    return self.to_save
+
+
+class function_meta(type):
+
+  def __init__(cls, name, bases, attrs):
+    cls.contexts = dict()
+    cls.registered = False
+    return super(function_meta, cls).__init__(name, bases, attrs)
+
+class function(metaclass = function_meta):
+
+  def __init__(self, framework = None):
+    self.framework = _find_framework(framework)
     pass
 
-  def __call__(self, *args, **kwargs):
-    result = self.forward(*args, **kwargs)
-    # backprop is defined
-    if(callable(getattr(self, 'backward', None))):
-      _import_tensorflow()
-      @tensorflow.RegisterGradient('Dot')
-      def gradient(op, dy):
-        return self.backward(op, dy)
-    return result
+  @staticmethod
+  def forward(ctx, *args, **kwargs):
+    raise NotImplementedError
 
+  @staticmethod
+  def backward(ctx, grad_output):
+    raise NotImplementedError
+
+  @classmethod
+  def apply(cls, *args, **kwargs):
+    # call
forward + ctx = OpContext() + result = cls.forward(ctx, *args, **kwargs) + id = result.op.get_attr('id') + cls.contexts[id] = ctx + # register backward + _import_tensorflow() + from tensorflow.python.framework.ops import _gradient_registry + name = result.op.op_def.name + if not cls.registered: + @tensorflow.RegisterGradient(name) + def gradient(op, dy): + id = op.get_attr('id') + return cls.backward(cls.contexts[id], dy) + cls.registered = True + # return result tensor + return result + class op: def __init__(self, src, outputs, framework = None): self.fw_id = dict() - self.fw_ops = dict() self.fw_grids = dict() + self.fw_op = None self.src = src self.outputs = outputs self.framework = _find_framework(framework) - if self.framework == tensorflow_id: - _import_tensorflow() def __call__(self, *args, **kwargs): # create a new op when defines are different - key = zip(kwargs.keys(), kwargs.values()) - if key not in self.fw_ops: + key = '-'.join(['{key}-{val}'.format(key=key, val=val) for key, val in kwargs.items()]) + if key not in self.fw_id.keys(): # code generation options defines = [] for k, v in kwargs.items(): cvt = lambda x: _cvt_to_def_str(x, self.framework) - try: + if(isinstance(v, list)): values = list(map(cvt, v)) - except TypeError: + else: values = [cvt(v)] defines.append((k, values)) opt = libtriton.options_space() @@ -272,30 +314,18 @@ class op: self.fw_id[key] = op_id # register function libtriton.register_fn(op_id, self.src, opt) - self.fw_ops[key] = _make_framework_op(self.src, self.outputs, opt, self.framework) + if self.fw_op is None: + self.fw_op = _make_framework_op(self.src, self.outputs, opt, self.framework) # retrieve framework op op_id = self.fw_id[key] - op = self.fw_ops[key] # register grid - grid = _make_grid(args) - self.fw_grids[key] = grid - libtriton.register_grid(op_id, self.fw_grids[key]) + libtriton.register_grid(op_id, _make_grid(args)) # create operands op_args = [x.handle if isinstance(x, scalar) else x for x in args[:-1]] # call framework op - return op(*op_args, id=op_id) - - -class register_gradient: - - def __init__(self, op): - self.op = op - - def __call__(self, f): - name = 'Dot' - ops.RegisterGradient(name)(f) - + return self.fw_op(*op_args, id=op_id) + def empty(shapes, dtype, framework = None): framework = _find_framework(framework) From 945b5d0de92dafcf38564d53631b810f0e48a3f4 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 4 Sep 2019 21:55:47 -0400 Subject: [PATCH 349/494] [python] modularized triton package --- python/examples/dot.py | 113 +-------- python/setup.py | 3 +- python/triton/__init__.py | 13 +- python/triton/frameworks.py | 46 ++++ python/triton/function.py | 54 +++++ python/triton/kernel.py | 215 +++++++++++++++++ python/triton/ops.py | 415 -------------------------------- python/triton/ops/__init__.py | 1 + python/triton/ops/dot.py | 107 ++++++++ python/triton/tools/build.py | 0 python/triton/tools/checksum.py | 0 python/triton/utils.py | 88 +++++++ 12 files changed, 527 insertions(+), 528 deletions(-) create mode 100644 python/triton/frameworks.py create mode 100644 python/triton/function.py create mode 100644 python/triton/kernel.py delete mode 100644 python/triton/ops.py create mode 100644 python/triton/ops/__init__.py create mode 100644 python/triton/ops/dot.py delete mode 100644 python/triton/tools/build.py delete mode 100644 python/triton/tools/checksum.py create mode 100644 python/triton/utils.py diff --git a/python/examples/dot.py b/python/examples/dot.py index da3cb9831..84ae9b6f3 100644 --- 
a/python/examples/dot.py +++ b/python/examples/dot.py @@ -1,121 +1,12 @@ +import numpy as np import tensorflow as tf import triton -import numpy as np - - -class dot(triton.function): - - src = """ -void dot(TYPE * A, TYPE * B, TYPE * C, - int M, int N, int K, - int lda __multipleof(8), - int ldb __multipleof(8), - int ldc) { - // prologue - int ridx = get_program_id(0); - int ridy = get_program_id(1); - int rxa[TM] = ridx * TM + 0 ... TM; - int ryb[TN] = ridy * TN + 0 ... TN; - int rka[TK] = 0 ... TK; - int rkb[TK] = 0 ... TK; - float c[TM, TN] = 0; - // pointers to operands - TYPE* pa[SHAPE_A] = A + rka[BROADCAST_AK] * STRIDE_AK + rxa[BROADCAST_AM] * STRIDE_AM; - TYPE* pb[SHAPE_B] = B + rkb[BROADCAST_BK] * STRIDE_BK + ryb[BROADCAST_BN] * STRIDE_BN; - // prefetches operands - TYPE a[SHAPE_A] = *pa; - TYPE b[SHAPE_B] = *pb; - // reduction loop - for(int k = K; k > 0; k-= TK){ - c += USE_A @ USE_B; - pa = pa + TK * STRIDE_AK; - pb = pb + TK * STRIDE_BK; - a = *pa; - b = *pb; - } - // epilogue - int rxc[TM] = ridx * TM + 0 ... TM; - int ryc[TN] = ridy * TN + 0 ... TN; - TYPE* pc[TM, TN] = C + ryc[newaxis, :] + rxc[:, newaxis] * ldc; - bool checkc[TM, TN] = (rxc < M)[:, newaxis] && (ryc < N)[newaxis, :]; - *?(checkc) pc = c; -} -""" - - op = triton.op(src, ['C']) - - @staticmethod - def _call(a, b, transpose_a, transpose_b): - # extract shapes - shape_a = triton.shape(a) - shape_b = triton.shape(b) - M, Ka = shape_a[0], shape_a[1] - Kb, N = shape_b[0], shape_b[1] - # transpose shapes - if transpose_a: - M, Ka = Ka, M - if transpose_b: - Kb, N = N, Kb - # contiguous dimensions - lda = M if transpose_a else Ka - ldb = Kb if transpose_b else N - ldc = N - # data-type - dtype = a.dtype - # allocate output - c = triton.empty([M, N], dtype = dtype) - # compute - grid = lambda opt: [triton.cdiv(M, opt.d('TM')), triton.cdiv(N, opt.d('TN'))] - # macros -- not necessary but makes kernel source-code simpler - macros = {# handle A transposition - 'USE_A' : '^a' if transpose_a else 'a', - 'STRIDE_AK' : 'lda' if transpose_a else '1', - 'STRIDE_AM' : '1' if transpose_a else 'lda', - 'BROADCAST_AK': ':, newaxis' if transpose_a else 'newaxis, :', - 'BROADCAST_AM': 'newaxis, :' if transpose_a else ':, newaxis', - 'SHAPE_A' : 'TK, TM' if transpose_a else 'TM, TK', - # handle B transposition - 'USE_B' : '^b' if transpose_b else 'b', - 'STRIDE_BK' : '1' if transpose_b else 'ldb', - 'STRIDE_BN' : 'ldb' if transpose_b else '1', - 'BROADCAST_BK': 'newaxis, :' if transpose_b else ':, newaxis', - 'BROADCAST_BN': ':, newaxis' if transpose_b else 'newaxis, :', - 'SHAPE_B' : 'TN, TK' if transpose_b else 'TK, TN'} - return dot.op(a, b, c, M, N, Ka, lda, ldb, ldc, grid, - AT = transpose_a, BT = transpose_b, TYPE = dtype, - TM = [64, 128], TN = [64, 128], TK = [8], **macros) - - @staticmethod - def forward(ctx, a, b, transpose_a = False, transpose_b = False): - ctx.save_for_backward(a, b, transpose_a, transpose_b) - return dot._call(a, b, transpose_a, transpose_b) - - @staticmethod - def backward(ctx, dy): - a, b, t_a, t_b = ctx.saved_tensors - if not t_a and not t_b: - da = dot._call(dy, b, False, True) - db = dot._call(a, dy, True, False) - elif not t_a and t_b: - da = dot._call(dy, b, False, False) - db = dot._call(dy, a, True, False) - elif t_a and not t_b: - da = dot._call(b, dy, False, True) - db = dot._call(a, dy, False, False) - elif t_a and t_b: - da = dot._call(b, dy, True, True) - db = dot._call(dy, a, True, True) - else: - assert False - return [da, db, None, None, None, None, None, None, None] - - def 
run_dot(): M, N, K = 128, 128, 128 a = tf.placeholder(tf.float32, shape=[M, K]) b = tf.placeholder(tf.float32, shape=[N, K]) - _dot = dot.apply + _dot = triton.ops.dot.apply tr_c = _dot(a, b, transpose_a = False, transpose_b = True) tr_d = _dot(tr_c, b, transpose_a = True, transpose_b = False) tf_c = tf.matmul(a, b, transpose_a = False, transpose_b = True) diff --git a/python/setup.py b/python/setup.py index 8a7c9b372..a70aa6c51 100644 --- a/python/setup.py +++ b/python/setup.py @@ -82,7 +82,8 @@ setup( author_email='ptillet@g.harvard.edu', description='A language and compiler for custom Deep Learning operations', long_description='', - packages=['triton'], + packages=['triton', + 'triton/ops'], ext_modules=[CMakeExtension('triton')], cmdclass=dict(build_ext=CMakeBuild), zip_safe=False, diff --git a/python/triton/__init__.py b/python/triton/__init__.py index 18dff0a49..aa05eefe1 100644 --- a/python/triton/__init__.py +++ b/python/triton/__init__.py @@ -1 +1,12 @@ -from .ops import * \ No newline at end of file +from .kernel import * +from .function import * +from .utils import * +import triton.ops + + +# clean-up libtriton resources +import atexit +import libtriton +@atexit.register +def cleanup(): + libtriton.cleanup() \ No newline at end of file diff --git a/python/triton/frameworks.py b/python/triton/frameworks.py new file mode 100644 index 000000000..60c0728f1 --- /dev/null +++ b/python/triton/frameworks.py @@ -0,0 +1,46 @@ +import sys +import os +import libtriton + +torch_id = 'torch' +tensorflow_id = 'tensorflow' + +torch = None +tensorflow = None +tf_extra_ops = None + + +def _import_torch(): + global torch + if torch is None: + import torch + +def _import_tensorflow(): + global tensorflow + if tensorflow is None: + import tensorflow + +def _import_tf_extra_ops(): + global tf_extra_ops + if tf_extra_ops is None: + path = os.path.dirname(libtriton.__file__) + path = os.path.join(path, 'libextra_tf_ops.so') + _import_tensorflow() + tf_extra_ops = tensorflow.load_op_library(path) + + +def _find_framework(default = None): + is_tf_imported = 'tensorflow' in sys.modules + is_torch_imported = 'torch' in sys.modules + if default: + if default not in [tensorflow_id, torch_id]: + raise ValueError('unsupported framework') + else: + return default + elif is_tf_imported and not is_torch_imported: + return tensorflow_id + elif is_torch_imported and not is_tf_imported: + return torch_id + else: + raise ValueError('cannot determine imported framework, ' + 'please provide framework argument') \ No newline at end of file diff --git a/python/triton/function.py b/python/triton/function.py new file mode 100644 index 000000000..8669dbc92 --- /dev/null +++ b/python/triton/function.py @@ -0,0 +1,54 @@ +import triton.frameworks as fw + +class OpContext(object): + + def save_for_backward(self, *tensors): + self.to_save = tensors + + def mark_dirty(self, *args): + self.dirty_tensors = args + + @property + def saved_tensors(self): + return self.to_save + + +class function_meta(type): + + def __init__(cls, name, bases, attrs): + cls.contexts = dict() + cls.registered = False + return super(function_meta, cls).__init__(name, bases, attrs) + +class function(metaclass = function_meta): + + def __init__(self, framework = None): + self.framework = _find_framework(framework) + pass + + @staticmethod + def forward(ctx, *args, **kwargs): + raise NotImplementedError + + @staticmethod + def backward(ctx, grad_output): + raise NotImplementedError + + @classmethod + def apply(cls, *args, **kwargs): + # call forward + ctx = 
OpContext() + result = cls.forward(ctx, *args, **kwargs) + id = result.op.get_attr('id') + cls.contexts[id] = ctx + # register backward + fw._import_tensorflow() + name = result.op.op_def.name + if not cls.registered: + @fw.tensorflow.RegisterGradient(name) + def gradient(op, dy): + id = op.get_attr('id') + return cls.backward(cls.contexts[id], dy) + cls.registered = True + # return result tensor + return result \ No newline at end of file diff --git a/python/triton/kernel.py b/python/triton/kernel.py new file mode 100644 index 000000000..b3d2be50a --- /dev/null +++ b/python/triton/kernel.py @@ -0,0 +1,215 @@ +# import for cache +import os +import tempfile +import shutil +import hashlib +import sysconfig +import sys +# import for just-in-time compilation +import distutils +import setuptools.command.build_ext +import setuptools +# triton +import triton.frameworks as fw +import triton.utils +import libtriton + +def _make_framework_src(src, out, grid, framework): + if framework == fw.tensorflow_id: + return libtriton.make_tensorflow_src(src, out, grid) + elif framework == fw.torch_id: + return libtriton.make_torch_src(src, out, grid) + else: + assert False + +def _make_cache_path(src): + md5 = hashlib.sha1(src.encode()) + hexhash = md5.hexdigest() + home = os.path.expanduser('~') + cacheroot = os.path.join(home, '.triton', 'cache') + cachepath = os.path.join(cacheroot, str(hexhash)) + if not os.path.exists(cachepath): + os.makedirs(cachepath) + return cachepath + +def _write_bindings(src, root, framework): + cpp = os.path.join(root, '{framework}.cpp'.format(framework=framework)) + suffix = sysconfig.get_config_var('EXT_SUFFIX') + so = os.path.join(root, '{framework}{suffix}'.format(framework=framework, suffix=suffix)) + recompile = False + # recompile if .so does not exist + if not os.path.exists(cpp) or not os.path.exists(so): + recompile = True + # recompile if cpp was modified after .so + elif max(cpp, so, key=os.path.getctime) == cpp: + recompile = True + # write cpp file + if recompile: + with open(cpp, 'w+') as handle: + handle.writelines(src) + # return path of cpp file + return (cpp, so) + +def _build(src, path, framework): + # include directories + triton_include_dirs = ['/home/philippe/development/triton/include'] + include_dirs = triton_include_dirs + # library directories + triton_library_dirs = [os.path.realpath(os.path.join(libtriton.__file__, os.path.pardir))] + library_dirs = triton_library_dirs + # libraries + libraries = ['triton'] + # add framework + extra_compile_args = [] + if framework == fw.tensorflow_id: + library_dirs += [fw.tensorflow.sysconfig.get_lib()] + include_dirs += [fw.tensorflow.sysconfig.get_include()] + include_dirs += ['/usr/local/cuda/include/'] + libraries += [fw.tensorflow.sysconfig.get_link_flags()[1].replace('-l', '')] + ABI = fw.tensorflow.__cxx11_abi_flag__ if "__cxx11_abi_flag__" in fw.tensorflow.__dict__ else 0 + extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI={ABI}'.format(ABI=ABI)] + elif framework == fw.torch_id: + prefix = os.path.dirname(torch.__file__) + library_dirs += [os.path.join(prefix, 'lib')] + include_dirs += [os.path.join(prefix, 'lib', 'include'), + os.path.join(prefix, 'lib', 'include', 'torch', 'csrc', 'api', 'include'), + os.path.join(prefix, 'include'), + os.path.join(prefix, 'include', 'torch', 'csrc', 'api', 'include')] + libraries += ['torch'] + else: + assert False + # extra arguments + extra_link_args = [] + # dependences + depends = [os.path.realpath(libtriton.__file__)] + # create extension module + ext = 
setuptools.Extension( + name = 'tensorflow', + language = 'c++', + sources = [src], + include_dirs = include_dirs, + extra_compile_args = extra_compile_args, + extra_link_args = extra_link_args, + library_dirs = library_dirs, + libraries = libraries, + depends = depends + ) + # build extension module + args = ['build_ext'] + tmp = tempfile.mkdtemp() + args.append('--build-temp=' + tmp) + args.append('--build-lib=' + path) + args.append('-q') + args = dict( + name = 'tensorflow', + ext_modules = [ext], + script_args = args, + ) + setuptools.setup(**args) + shutil.rmtree(tmp) + +def _cvt_to_def_str(obj, framework): + # bool + if isinstance(obj, bool): + return str(int(obj)) + # tensorflow type + if framework == fw.tensorflow_id: + if isinstance(obj, fw.tensorflow.DType): + return {fw.tensorflow.int8: 'char', + fw.tensorflow.int16: 'short', + fw.tensorflow.int32: 'int', + fw.tensorflow.int64: 'long', + fw.tensorflow.float16: 'half', + fw.tensorflow.float32: 'float', + fw.tensorflow.float64: 'double'}[obj] + # torch type + elif framework == fw.torch_id: + if isinstance(obj, torch.dtype): + return {torch.int8: 'char', + torch.int16: 'short', + torch.int32: 'int', + torch.int64: 'long', + torch.float16: 'half', + torch.float32: 'float', + torch.float64: 'double'}[obj] + else: + assert False + # default + return str(obj) + + +def _make_framework_op(src, outputs, options, framework): + src, name = _make_framework_src(src, outputs, options, framework) + cache_path = _make_cache_path(src) + cpp, so = _write_bindings(src, cache_path, framework) + _build(cpp, cache_path, framework) + if framework == fw.tensorflow_id: + return fw.tensorflow.load_op_library(so).__dict__[name] + elif framework == fw.torch_id: + torch.ops.load_library(so) + return torch.ops.triton.__dict__[name] + else: + assert False + +def _make_grid(args) : + scalars = [x for x in args[:-1] if isinstance(x, triton.utils.scalar)] + def grid(opt): + for x in scalars: + x.set_assume_initialized() + result = args[-1](opt) + for x in scalars: + x.unset_assume_initialized() + return result + return grid + + +class kernel: + + def __init__(self, src, outputs, framework = None): + self.fw_id = dict() + self.fw_grids = dict() + self.fw_op = None + self.src = src + self.outputs = outputs + self.framework = fw._find_framework(framework) + if self.framework == fw.tensorflow_id: + fw._import_tensorflow() + fw._import_tf_extra_ops() + elif self.framework == fw.torch_id: + fw._import_torch() + else: + assert False + + + def __call__(self, *args, **kwargs): + # create a new framework op when defines are different + key = '-'.join(['{key}-{val}'.format(key=key, val=val) for key, val in kwargs.items()]) + if key not in self.fw_id.keys(): + # code generation options + defines = [] + for k, v in kwargs.items(): + cvt = lambda x: _cvt_to_def_str(x, self.framework) + if(isinstance(v, list)): + values = list(map(cvt, v)) + else: + values = [cvt(v)] + defines.append((k, values)) + opt = libtriton.options_space() + opt.defines = defines + opt.num_warps = [4] + # create unique id for this op + op_id = libtriton.make_op_id() + self.fw_id[key] = op_id + # register function + libtriton.register_fn(op_id, self.src, opt) + if self.fw_op is None: + self.fw_op = _make_framework_op(self.src, self.outputs, opt, self.framework) + + # retrieve framework op + op_id = self.fw_id[key] + # register grid + libtriton.register_grid(op_id, _make_grid(args)) + # create operands + op_args = [x.handle if isinstance(x, triton.utils.scalar) else x for x in args[:-1]] + # call 
framework function + return self.fw_op(*op_args, id=op_id) \ No newline at end of file diff --git a/python/triton/ops.py b/python/triton/ops.py deleted file mode 100644 index 633ceafa7..000000000 --- a/python/triton/ops.py +++ /dev/null @@ -1,415 +0,0 @@ -# import for cache -import os -import tempfile -import shutil -import hashlib -import sysconfig -import sys -# import for just-in-time compilation -import distutils -import setuptools.command.build_ext -import setuptools -# triton -import libtriton - - -# clean-up libtriton resources -import atexit -@atexit.register -def cleanup(): - libtriton.cleanup() - - -torch_id = 'torch' -tensorflow_id = 'tensorflow' - -torch = None -tensorflow = None -_gradient_registry = None -tf_extra_ops = None - - - - -def _import_torch(): - global torch - if torch is None: - import torch - -def _import_tensorflow(): - global tensorflow - if tensorflow is None: - import tensorflow - global _gradient_registry - if _gradient_registry is None: - from tensorflow.python.framework.ops import _gradient_registry - -def _import_tf_extra_ops(): - global tf_extra_ops - if tf_extra_ops is None: - path = os.path.dirname(libtriton.__file__) - path = os.path.join(path, 'libextra_tf_ops.so') - _import_tensorflow() - tf_extra_ops = tensorflow.load_op_library(path) - - -def _find_framework(default = None): - is_tf_imported = 'tensorflow' in sys.modules - is_torch_imported = 'torch' in sys.modules - if default: - if default not in [tensorflow_id, torch_id]: - raise ValueError('unsupported framework') - else: - return default - elif is_tf_imported and not is_torch_imported: - return tensorflow_id - elif is_torch_imported and not is_tf_imported: - return torch_id - else: - raise ValueError('cannot determine imported framework, ' - 'please provide framework argument') - - -def _make_framework_src(src, out, grid, framework): - if framework == tensorflow_id: - return libtriton.make_tensorflow_src(src, out, grid) - elif framework == torch_id: - return libtriton.make_torch_src(src, out, grid) - else: - assert False - -def _make_cache_path(src): - md5 = hashlib.sha1(src.encode()) - hexhash = md5.hexdigest() - home = os.path.expanduser('~') - cacheroot = os.path.join(home, '.triton', 'cache') - cachepath = os.path.join(cacheroot, str(hexhash)) - if not os.path.exists(cachepath): - os.makedirs(cachepath) - return cachepath - -def _write_bindings(src, root, framework): - cpp = os.path.join(root, '{framework}.cpp'.format(framework=framework)) - suffix = sysconfig.get_config_var('EXT_SUFFIX') - so = os.path.join(root, '{framework}{suffix}'.format(framework=framework, suffix=suffix)) - recompile = False - # recompile if .so does not exist - if not os.path.exists(cpp) or not os.path.exists(so): - recompile = True - # recompile if cpp was modified after .so - elif max(cpp, so, key=os.path.getctime) == cpp: - recompile = True - # write cpp file - if recompile: - with open(cpp, 'w+') as handle: - handle.writelines(src) - # return path of cpp file - return (cpp, so) - -def _build(src, path, framework): - # include directories - triton_include_dirs = ['/home/philippe/development/triton/include'] - include_dirs = triton_include_dirs - # library directories - triton_library_dirs = [os.path.realpath(os.path.join(libtriton.__file__, os.path.pardir))] - library_dirs = triton_library_dirs - # libraries - libraries = ['triton'] - # add framework - extra_compile_args = [] - if framework == tensorflow_id: - _import_tensorflow() - library_dirs += [tensorflow.sysconfig.get_lib()] - include_dirs += 
[tensorflow.sysconfig.get_include()] - include_dirs += ['/usr/local/cuda/include/'] - libraries += [tensorflow.sysconfig.get_link_flags()[1].replace('-l', '')] - ABI = tensorflow.__cxx11_abi_flag__ if "__cxx11_abi_flag__" in tensorflow.__dict__ else 0 - extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI={ABI}'.format(ABI=ABI)] - elif framework == torch_id: - _import_torch() - prefix = os.path.dirname(torch.__file__) - library_dirs += [os.path.join(prefix, 'lib')] - include_dirs += [os.path.join(prefix, 'lib', 'include'), - os.path.join(prefix, 'lib', 'include', 'torch', 'csrc', 'api', 'include'), - os.path.join(prefix, 'include'), - os.path.join(prefix, 'include', 'torch', 'csrc', 'api', 'include')] - libraries += ['torch'] - else: - assert False - # extra arguments - extra_link_args = [] - # dependences - depends = [os.path.realpath(libtriton.__file__)] - # create extension module - ext = setuptools.Extension( - name = 'tensorflow', - language = 'c++', - sources = [src], - include_dirs = include_dirs, - extra_compile_args = extra_compile_args, - extra_link_args = extra_link_args, - library_dirs = library_dirs, - libraries = libraries, - depends = depends - ) - # build extension module - args = ['build_ext'] - tmp = tempfile.mkdtemp() - args.append('--build-temp=' + tmp) - args.append('--build-lib=' + path) - args.append('-q') - args = dict( - name = 'tensorflow', - ext_modules = [ext], - script_args = args, - ) - setuptools.setup(**args) - shutil.rmtree(tmp) - -def _cvt_to_def_str(obj, framework): - # bool - if isinstance(obj, bool): - return str(int(obj)) - # tensorflow type - if framework == tensorflow_id: - _import_tensorflow() - if isinstance(obj, tensorflow.DType): - return {tensorflow.int8: 'char', - tensorflow.int16: 'short', - tensorflow.int32: 'int', - tensorflow.int64: 'long', - tensorflow.float16: 'half', - tensorflow.float32: 'float', - tensorflow.float64: 'double'}[obj] - # torch type - elif framework == torch_id: - _import_torch() - if isinstance(obj, torch.dtype): - return {torch.int8: 'char', - torch.int16: 'short', - torch.int32: 'int', - torch.int64: 'long', - torch.float16: 'half', - torch.float32: 'float', - torch.float64: 'double'}[obj] - else: - assert False - # default - return str(obj) - - -def _make_framework_op(src, outputs, options, framework): - src, name = _make_framework_src(src, outputs, options, framework) - cache_path = _make_cache_path(src) - cpp, so = _write_bindings(src, cache_path, framework) - _build(cpp, cache_path, framework) - if framework == tensorflow_id: - _import_tensorflow() - return tensorflow.load_op_library(so).__dict__[name] - elif framework == torch_id: - _import_torch() - torch.ops.load_library(so) - return torch.ops.triton.__dict__[name] - else: - assert False - -def _make_grid(args) : - scalars = [x for x in args[:-1] if isinstance(x, scalar)] - def grid(opt): - for x in scalars: - x.set_assume_initialized() - result = args[-1](opt) - for x in scalars: - x.unset_assume_initialized() - return result - return grid - - - -class OpContext(object): - - def save_for_backward(self, *tensors): - self.to_save = tensors - - def mark_dirty(self, *args): - self.dirty_tensors = args - - @property - def saved_tensors(self): - return self.to_save - - -class function_meta(type): - - def __init__(cls, name, bases, attrs): - cls.contexts = dict() - cls.registered = False - return super(function_meta, cls).__init__(name, bases, attrs) - -class function(metaclass = function_meta): - - def __init__(self, framework = None): - self.framework = 
_find_framework(framework) - pass - - @staticmethod - def forward(ctx, *args, **kwargs): - raise NotImplementedError - - @staticmethod - def backward(ctx, grad_output): - raise NotImplementedError - - @classmethod - def apply(cls, *args, **kwargs): - # call forward - ctx = OpContext() - result = cls.forward(ctx, *args, **kwargs) - id = result.op.get_attr('id') - cls.contexts[id] = ctx - # register backward - _import_tensorflow() - from tensorflow.python.framework.ops import _gradient_registry - name = result.op.op_def.name - if not cls.registered: - @tensorflow.RegisterGradient(name) - def gradient(op, dy): - id = op.get_attr('id') - return cls.backward(cls.contexts[id], dy) - cls.registered = True - # return result tensor - return result - - - -class op: - - def __init__(self, src, outputs, framework = None): - self.fw_id = dict() - self.fw_grids = dict() - self.fw_op = None - self.src = src - self.outputs = outputs - self.framework = _find_framework(framework) - - - def __call__(self, *args, **kwargs): - # create a new op when defines are different - key = '-'.join(['{key}-{val}'.format(key=key, val=val) for key, val in kwargs.items()]) - if key not in self.fw_id.keys(): - # code generation options - defines = [] - for k, v in kwargs.items(): - cvt = lambda x: _cvt_to_def_str(x, self.framework) - if(isinstance(v, list)): - values = list(map(cvt, v)) - else: - values = [cvt(v)] - defines.append((k, values)) - opt = libtriton.options_space() - opt.defines = defines - opt.num_warps = [4] - # create unique id for this op - op_id = libtriton.make_op_id() - self.fw_id[key] = op_id - # register function - libtriton.register_fn(op_id, self.src, opt) - if self.fw_op is None: - self.fw_op = _make_framework_op(self.src, self.outputs, opt, self.framework) - - # retrieve framework op - op_id = self.fw_id[key] - # register grid - libtriton.register_grid(op_id, _make_grid(args)) - # create operands - op_args = [x.handle if isinstance(x, scalar) else x for x in args[:-1]] - # call framework op - return self.fw_op(*op_args, id=op_id) - - -def empty(shapes, dtype, framework = None): - framework = _find_framework(framework) - if framework == tensorflow_id: - _import_tensorflow() - _import_tf_extra_ops - args = [x.handle if isinstance(x, scalar) else x for x in shapes] - args = tensorflow.stack(args) - return tf_extra_ops.alloc_empty(args, T = dtype) - elif framework == torch_id: - _import_torch() - return torch.empty(*shapes) - -def cdiv(a, b): - return -(-a // b) - -class scalar: - - def __init__(self, x): - _import_tf_extra_ops() - self.id = libtriton.make_scalar_id() - self.handle = tf_extra_ops.register_scalar(x, id=self.id) - self.assume_initialized = False - - def set_assume_initialized(self): - self.assume_initialized = True - - def unset_assume_initialized(self): - self.assume_initialized = False - - def get_value(self): - if self.assume_initialized: - return libtriton.retrieve_scalar(self.id) - else: - return self.handle - - def __add__(self, other): - return self.get_value() + other - - def __radd__(self, other): - return other + self.get_value() - - def __sub__(self, other): - return self.get_value() - other - - def __rsub(self, other): - return other - self.get_value() - - def __mul__(self, other): - return self.get_value() * other - - def __rmul(self, other): - return other * self.get_value() - - def __floordiv__(self, other): - return self.get_value() // other - - def __rfloordiv__(self, other): - return other // self.get_value() - - def __div__(self, other): - return self.get_value() / other 
- - def __rdiv__(self, other): - return other / self.get_value() - - def __truediv__(self, other): - self.get_value().__truediv__(other) - - def __rtruediv__(self, other): - other.__truediv__(self.get_value()) - - def __neg__(self): - return -self.get_value() - -class lazy_shape: - - def __init__(self, shape): - self.shape = shape - - def __getitem__(self, key): - return scalar(self.shape[key]) - -def shape(A) : - _import_tensorflow() - return lazy_shape(tensorflow.shape(A)) - diff --git a/python/triton/ops/__init__.py b/python/triton/ops/__init__.py new file mode 100644 index 000000000..f995b88f1 --- /dev/null +++ b/python/triton/ops/__init__.py @@ -0,0 +1 @@ +from .dot import dot diff --git a/python/triton/ops/dot.py b/python/triton/ops/dot.py new file mode 100644 index 000000000..f799be983 --- /dev/null +++ b/python/triton/ops/dot.py @@ -0,0 +1,107 @@ +import triton + +class dot(triton.function): + + src = """ +void dot(TYPE * A, TYPE * B, TYPE * C, + int M, int N, int K, + int lda __multipleof(8), + int ldb __multipleof(8), + int ldc) { + // prologue + int ridx = get_program_id(0); + int ridy = get_program_id(1); + int rxa[TM] = ridx * TM + 0 ... TM; + int ryb[TN] = ridy * TN + 0 ... TN; + int rka[TK] = 0 ... TK; + int rkb[TK] = 0 ... TK; + float c[TM, TN] = 0; + // pointers to operands + TYPE* pa[SHAPE_A] = A + rka[BROADCAST_AK] * STRIDE_AK + rxa[BROADCAST_AM] * STRIDE_AM; + TYPE* pb[SHAPE_B] = B + rkb[BROADCAST_BK] * STRIDE_BK + ryb[BROADCAST_BN] * STRIDE_BN; + // prefetches operands + TYPE a[SHAPE_A] = *pa; + TYPE b[SHAPE_B] = *pb; + // reduction loop + for(int k = K; k > 0; k-= TK){ + c += USE_A @ USE_B; + pa = pa + TK * STRIDE_AK; + pb = pb + TK * STRIDE_BK; + a = *pa; + b = *pb; + } + // epilogue + int rxc[TM] = ridx * TM + 0 ... TM; + int ryc[TN] = ridy * TN + 0 ... 
TN; + TYPE* pc[TM, TN] = C + ryc[newaxis, :] + rxc[:, newaxis] * ldc; + bool checkc[TM, TN] = (rxc < M)[:, newaxis] && (ryc < N)[newaxis, :]; + *?(checkc) pc = c; +} +""" + + kernel = triton.kernel(src, ['C']) + + @staticmethod + def _call(a, b, transpose_a, transpose_b): + # extract shapes + shape_a = triton.shape(a) + shape_b = triton.shape(b) + M, Ka = shape_a[0], shape_a[1] + Kb, N = shape_b[0], shape_b[1] + # transpose shapes + if transpose_a: + M, Ka = Ka, M + if transpose_b: + Kb, N = N, Kb + # contiguous dimensions + lda = M if transpose_a else Ka + ldb = Kb if transpose_b else N + ldc = N + # data-type + dtype = a.dtype + # allocate output + c = triton.empty([M, N], dtype = dtype) + # compute + grid = lambda opt: [triton.cdiv(M, opt.d('TM')), triton.cdiv(N, opt.d('TN'))] + # macros -- not necessary but makes kernel source-code simpler + macros = {# handle A transposition + 'USE_A' : '^a' if transpose_a else 'a', + 'STRIDE_AK' : 'lda' if transpose_a else '1', + 'STRIDE_AM' : '1' if transpose_a else 'lda', + 'BROADCAST_AK': ':, newaxis' if transpose_a else 'newaxis, :', + 'BROADCAST_AM': 'newaxis, :' if transpose_a else ':, newaxis', + 'SHAPE_A' : 'TK, TM' if transpose_a else 'TM, TK', + # handle B transposition + 'USE_B' : '^b' if transpose_b else 'b', + 'STRIDE_BK' : '1' if transpose_b else 'ldb', + 'STRIDE_BN' : 'ldb' if transpose_b else '1', + 'BROADCAST_BK': 'newaxis, :' if transpose_b else ':, newaxis', + 'BROADCAST_BN': ':, newaxis' if transpose_b else 'newaxis, :', + 'SHAPE_B' : 'TN, TK' if transpose_b else 'TK, TN'} + return dot.kernel(a, b, c, M, N, Ka, lda, ldb, ldc, grid, + AT = transpose_a, BT = transpose_b, TYPE = dtype, + TM = [64, 128], TN = [64, 128], TK = [8], **macros) + + @staticmethod + def forward(ctx, a, b, transpose_a = False, transpose_b = False): + ctx.save_for_backward(a, b, transpose_a, transpose_b) + return dot._call(a, b, transpose_a, transpose_b) + + @staticmethod + def backward(ctx, dy): + a, b, t_a, t_b = ctx.saved_tensors + if not t_a and not t_b: + da = dot._call(dy, b, False, True) + db = dot._call(a, dy, True, False) + elif not t_a and t_b: + da = dot._call(dy, b, False, False) + db = dot._call(dy, a, True, False) + elif t_a and not t_b: + da = dot._call(b, dy, False, True) + db = dot._call(a, dy, False, False) + elif t_a and t_b: + da = dot._call(b, dy, True, True) + db = dot._call(dy, a, True, True) + else: + assert False + return [da, db, None, None, None, None, None, None, None] \ No newline at end of file diff --git a/python/triton/tools/build.py b/python/triton/tools/build.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/python/triton/tools/checksum.py b/python/triton/tools/checksum.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/python/triton/utils.py b/python/triton/utils.py new file mode 100644 index 000000000..98380bf37 --- /dev/null +++ b/python/triton/utils.py @@ -0,0 +1,88 @@ +import triton.frameworks as fw +import libtriton + +def cdiv(a, b): + return -(-a // b) + +def empty(shapes, dtype, framework = None): + framework = fw._find_framework(framework) + if framework == fw.tensorflow_id: + args = [x.handle if isinstance(x, scalar) else x for x in shapes] + args = fw.tensorflow.stack(args) + return fw.tf_extra_ops.alloc_empty(args, T = dtype) + elif framework == fw.torch_id: + _import_torch() + return fw.torch.empty(*shapes) + +class lazy_shape: + + def __init__(self, shape): + self.shape = shape + + def __getitem__(self, key): + return scalar(self.shape[key]) + +def shape(A) : + 
fw._import_tensorflow()
+  return lazy_shape(fw.tensorflow.shape(A))
+
+
+class scalar:
+
+  def __init__(self, x):
+    self.id = libtriton.make_scalar_id()
+    self.handle = fw.tf_extra_ops.register_scalar(x, id=self.id)
+    self.assume_initialized = False
+
+  def set_assume_initialized(self):
+    self.assume_initialized = True
+
+  def unset_assume_initialized(self):
+    self.assume_initialized = False
+
+  def get_value(self):
+    if self.assume_initialized:
+      return libtriton.retrieve_scalar(self.id)
+    else:
+      return self.handle
+
+  def __add__(self, other):
+    return self.get_value() + other
+
+  def __radd__(self, other):
+    return other + self.get_value()
+
+  def __sub__(self, other):
+    return self.get_value() - other
+
+  def __rsub(self, other):
+    return other - self.get_value()
+
+  def __mul__(self, other):
+    return self.get_value() * other
+
+  def __rmul(self, other):
+    return other * self.get_value()
+
+  def __floordiv__(self, other):
+    return self.get_value() // other
+
+  def __rfloordiv__(self, other):
+    return other // self.get_value()
+
+  def __div__(self, other):
+    return self.get_value() / other
+
+  def __rdiv__(self, other):
+    return other / self.get_value()
+
+  def __truediv__(self, other):
+    self.get_value().__truediv__(other)
+
+  def __rtruediv__(self, other):
+    other.__truediv__(self.get_value())
+
+  def __neg__(self):
+    return -self.get_value()
+
+

From ed0f7060052c03513cc2677b2e6c7cfb1fc0305d Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Thu, 5 Sep 2019 00:19:42 -0400
Subject: [PATCH 350/494] [python] fixed various issues in pytorch support

---
 CMakeLists.txt              |  22 ++++----
 python/examples/dot.py      |  20 +++++--
 python/setup.py             |  22 +++++---
 python/src/tensorflow.cc    | 110 ++++++++++++++++++++++++++----------
 python/triton/frameworks.py |   7 +++
 python/triton/kernel.py     |  49 ++++++++++------
 python/triton/ops/dot.py    |  26 +++++----
 python/triton/utils.py      |  18 ++++--
 8 files changed, 182 insertions(+), 92 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 201f14c5a..20add646f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -34,16 +34,18 @@ if(BUILD_PYTHON_MODULE)
   message(STATUS "Adding Python module")
   # PyBind11 wrapper source file
   file(GLOB_RECURSE PYTHON_SRC python/src/tensorflow.cc)
-  # update include directory
-  include_directories(python/src/ ${PYTHON_INCLUDE_DIRS} ${TF_INCLUDE_DIRS})
-  # update link directories
-  link_directories(${TF_LIB_DIRS})
-  # extra tensorflow ops (e.g., alloc_empty)
-  file(GLOB_RECURSE EXTRA_TF_OPS_SRC python/src/tensorflow/*.cc)
-  add_library(extra_tf_ops SHARED ${EXTRA_TF_OPS_SRC})
-  target_link_libraries(extra_tf_ops triton ${TF_LIBS})
-  target_compile_definitions(extra_tf_ops PRIVATE "-D_GLIBCXX_USE_CXX11_ABI=${TF_ABI}")
-
+  if(TF_LIBS)
+    # extra tensorflow ops (e.g., alloc_empty)
+    # update directories
+    link_directories(${TF_LIB_DIRS})
+    include_directories(python/src/ ${PYTHON_INCLUDE_DIRS} ${TF_INCLUDE_DIRS})
+    # get sources
+    file(GLOB_RECURSE EXTRA_TF_OPS_SRC python/src/tensorflow/*.cc)
+    add_library(extra_tf_ops SHARED ${EXTRA_TF_OPS_SRC})
+    # create target
+    target_link_libraries(extra_tf_ops triton ${TF_LIBS})
+    target_compile_definitions(extra_tf_ops PRIVATE "-D_GLIBCXX_USE_CXX11_ABI=${TF_ABI}")
+  endif()
 endif()
diff --git a/python/examples/dot.py b/python/examples/dot.py
index 84ae9b6f3..ce8e45c34 100644
--- a/python/examples/dot.py
+++ b/python/examples/dot.py
@@ -1,14 +1,13 @@
 import numpy as np
-import tensorflow as tf
 import triton
 
-def run_dot():
+def run_tf():
+  import tensorflow as tf
   M, N, K = 128, 128, 128
   a =
tf.placeholder(tf.float32, shape=[M, K]) b = tf.placeholder(tf.float32, shape=[N, K]) - _dot = triton.ops.dot.apply - tr_c = _dot(a, b, transpose_a = False, transpose_b = True) - tr_d = _dot(tr_c, b, transpose_a = True, transpose_b = False) + tr_c = triton.ops.dot(a, b, transpose_a = False, transpose_b = True) + tr_d = triton.ops.dot(tr_c, b, transpose_a = True, transpose_b = False) tf_c = tf.matmul(a, b, transpose_a = False, transpose_b = True) tf_d = tf.matmul(tf_c, b, transpose_a = True, transpose_b = False) # Gradient @@ -28,4 +27,13 @@ def run_dot(): dif = np.abs(result[0][0] - result[1][0]) print("dif: %f" % np.max(dif)) -run_dot() \ No newline at end of file +def run_torch(): + import torch as th + M, N, K = 128, 128, 128 + a = th.randn(M, K).cuda() + b = th.randn(K, N).cuda() + th_c = th.matmul(a, b) + tr_c = triton.ops.dot(a, b) + print(c) + +run_torch() \ No newline at end of file diff --git a/python/setup.py b/python/setup.py index a70aa6c51..49317af9f 100644 --- a/python/setup.py +++ b/python/setup.py @@ -41,18 +41,22 @@ class CMakeBuild(build_ext): python_include_dirs = distutils.sysconfig.get_python_inc() python_lib_dirs = distutils.sysconfig.get_config_var('LIBDIR') # tensorflow directories - import tensorflow as tf - tf_abi = tf.__cxx11_abi_flag__ if "__cxx11_abi_flag__" in tf.__dict__ else 0 - tf_include_dirs = tf.sysconfig.get_include() - tf_libs = tf.sysconfig.get_link_flags()[1].replace('-l', '') cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir, '-DBUILD_TESTS=OFF', '-DBUILD_PYTHON_MODULE=ON', - '-DPYTHON_INCLUDE_DIRS=' + python_include_dirs, - '-DTF_INCLUDE_DIRS=' + tf_include_dirs, - '-DTF_LIB_DIRS=' + tf.sysconfig.get_lib(), - '-DTF_LIBS=' + tf_libs, - '-DTF_ABI=' + str(tf_abi)] + '-DPYTHON_INCLUDE_DIRS=' + python_include_dirs] + # tensorflow compatibility + try: + import tensorflow as tf + tf_abi = tf.__cxx11_abi_flag__ if "__cxx11_abi_flag__" in tf.__dict__ else 0 + tf_include_dirs = tf.sysconfig.get_include() + tf_libs = tf.sysconfig.get_link_flags()[1].replace('-l', '') + cmake_args += ['-DTF_INCLUDE_DIRS=' + tf_include_dirs, + '-DTF_LIB_DIRS=' + tf.sysconfig.get_lib(), + '-DTF_LIBS=' + tf_libs, + '-DTF_ABI=' + str(tf_abi)] + except ModuleNotFoundError: + pass cfg = 'Debug' if self.debug else 'Release' build_args = ['--config', cfg] diff --git a/python/src/tensorflow.cc b/python/src/tensorflow.cc index 95ac51620..2450f35ef 100644 --- a/python/src/tensorflow.cc +++ b/python/src/tensorflow.cc @@ -315,16 +315,8 @@ gen_tf_register_op(oss, cc_name, fn->args(), outputs); inline std::string to_torch_ty(ir::type *ty) { - if(ty->is_integer_ty(1)) - return "bool"; - if(ty->is_integer_ty(8)) - return "int8"; - if(ty->is_integer_ty(16)) - return "int16"; - if(ty->is_integer_ty(32)) - return "int32"; - if(ty->is_integer_ty(64)) - return "int64"; + if(ty->is_integer_ty()) + return "int64_t"; if(ty->is_half_ty()) return "float16"; if(ty->is_float_ty()) @@ -332,7 +324,29 @@ inline std::string to_torch_ty(ir::type *ty) { if(ty->is_double_ty()) return "float64"; if(ty->is_pointer_ty()) - return "Tensor"; + return "torch::Tensor"; + throw std::runtime_error("unknown type"); +} + +inline std::string to_c_ty(ir::type *ty) { + if(ty->is_integer_ty(1)) + return "bool"; + if(ty->is_integer_ty(8)) + return "int8_t"; + if(ty->is_integer_ty(16)) + return "int16_t"; + if(ty->is_integer_ty(32)) + return "int32_t"; + if(ty->is_integer_ty(64)) + return "int64_t"; + if(ty->is_half_ty()) + return "float16"; + if(ty->is_float_ty()) + return "float32"; + if(ty->is_double_ty()) + return 
"float64"; + if(ty->is_pointer_ty()) + return "drv::cu_buffer"; throw std::runtime_error("unknown type"); } @@ -352,15 +366,22 @@ void gen_torch_signature(std::ostringstream& oss, out_types.push_back((*it)->get_type()); } - oss << "std::tuple<"; - for(size_t i = 0; i < out_types.size(); i++){ - if(i > 0) - oss << ", "; - oss << to_torch_ty(out_types[i]); + std::string ret_ty; + if(out_types.empty()) + ret_ty = "void"; + else{ + ir::type* ty = out_types[0]; + ret_ty = to_torch_ty(ty); + if(out_types.size() > 1){ + for(size_t i = 1; i < out_types.size(); i++) + if(out_types[i] != ty) + throw std::runtime_error("outputs of different types not supported by pytorch"); + ret_ty = "std::vector<" + ret_ty + ">"; + } } - oss << "> "; - oss << name << "("; - oss << "int64 id" << std::endl; + + oss << ret_ty << " " << name << "("; + oss << "int64_t id, "; for(size_t i = 0; i < args.size(); i++) { ir::argument* arg = args[i]; if(i > 0) @@ -370,9 +391,16 @@ void gen_torch_signature(std::ostringstream& oss, oss << ")"; } -void gen_torch_init_driver(std::ostringstream &oss) { +void gen_torch_init_driver(std::ostringstream &oss, + const std::vector&args) { + ir::argument* tensor = nullptr; + for(ir::argument* arg: args) + if(arg->get_type()->is_pointer_ty()){ + tensor = arg; + break; + } oss << " // Wrap CUDA handles" << std::endl; - oss << " c10::DeviceIndex device = torcha.storage().device().index();" << std::endl; + oss << " c10::DeviceIndex device = " << tensor->get_name() << ".storage().device().index();" << std::endl; oss << " // Get stream" << std::endl; oss << " CUstream custream = (CUstream)at::cuda::getCurrentCUDAStream(device).stream();" << std::endl; oss << " triton::driver::cu_stream stream(custream, false);" << std::endl; @@ -383,10 +411,12 @@ void gen_torch_make_handles(std::ostream &os, const std::vector& args) { for(unsigned i = 0; i < args.size(); i++){ ir::argument *arg = args[i]; - if(!arg->get_type()->is_pointer_ty()) - continue; const std::string& name = arg->get_name(); - os << " drv::cu_buffer cu_" + name + "(ctx, " + name + ".storage().size(), (CUdeviceptr)" + name + ".storage.data(), false);\n "; + ir::type* ty = arg->get_type(); + if(!ty->is_pointer_ty()) + os << " " << to_c_ty(ty) << " arg_" << name << " = " << name << ";" << std::endl; + else + os << " drv::cu_buffer arg_" + name + "(ctx, " + name + ".storage().size(), (CUdeviceptr)" + name + ".storage().data(), false);" << std::endl; } } @@ -394,19 +424,28 @@ void gen_torch_make_launch_function(std::ostream &os, const std::vectorget_name(); + std::string name = "arg_" + arg->get_name(); if(arg->get_type()->is_pointer_ty()) - name = "&cu_" + name; + name = "&" + name; if(i > 0) os << ", "; os << name; } - os << "}, *id_grid_map.at(id), stream); \n"; + os << "}, *id_grid_map.at(id), &stream);\n"; } +void gen_torch_ret(std::ostream &os, const std::vector& outputs) { + os << " return {"; + for(size_t i = 0; i < outputs.size(); i++){ + if(i > 0) + os << ", "; + os << outputs[i]; + } + os << "};" << std::endl; +} std::tuple make_pytorch_src(const std::string& src, + std::string> make_torch_src(const std::string& src, const std::vector& outputs, const runtime::function::options_space_t& opt) { // triton-ir code-gen @@ -423,6 +462,10 @@ std::tuple> id_fn_map; gen_torch_signature(oss, fn, outputs, name); oss << " {" << std::endl; - gen_torch_init_driver(oss); + gen_torch_init_driver(oss, fn->args()); gen_torch_make_handles(oss, fn->args()); gen_torch_make_launch_function(oss, fn->args()); - oss << std::endl << "}"; + 
gen_torch_ret(oss, outputs); + oss << "}" << std::endl; + oss << std::endl; + oss << std::endl; oss << "static auto registry = torch::jit::RegisterOperators(\"triton::" << name << "\", &" << name << ");" << std::endl; + + return {oss.str(), name}; } @@ -453,7 +501,7 @@ PYBIND11_MODULE(libtriton, m) { m.def("make_tensorflow_src", &make_tensorflow_src, "Creates C++ source code for a custom Tensorflow op " "corresponding to the specified Triton kernel"); - m.def("make_pytorch_src", &make_pytorch_src, + m.def("make_torch_src", &make_torch_src, "Creates C++ source code for a custom PyTorch op "); // bindings for triton classes diff --git a/python/triton/frameworks.py b/python/triton/frameworks.py index 60c0728f1..4d10697ad 100644 --- a/python/triton/frameworks.py +++ b/python/triton/frameworks.py @@ -9,6 +9,13 @@ torch = None tensorflow = None tf_extra_ops = None +def to_str(framework): + if framework == tensorflow_id: + return 'tensorflow' + elif framework == torch_id: + return 'torch' + else: + assert False def _import_torch(): global torch diff --git a/python/triton/kernel.py b/python/triton/kernel.py index b3d2be50a..2a7f2c929 100644 --- a/python/triton/kernel.py +++ b/python/triton/kernel.py @@ -66,16 +66,19 @@ def _build(src, path, framework): include_dirs += [fw.tensorflow.sysconfig.get_include()] include_dirs += ['/usr/local/cuda/include/'] libraries += [fw.tensorflow.sysconfig.get_link_flags()[1].replace('-l', '')] - ABI = fw.tensorflow.__cxx11_abi_flag__ if "__cxx11_abi_flag__" in fw.tensorflow.__dict__ else 0 - extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI={ABI}'.format(ABI=ABI)] + abi = fw.tensorflow.__cxx11_abi_flag__ if "__cxx11_abi_flag__" in fw.tensorflow.__dict__ else 0 + extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI={abi}'.format(abi=abi)] elif framework == fw.torch_id: - prefix = os.path.dirname(torch.__file__) + prefix = os.path.dirname(fw.torch.__file__) library_dirs += [os.path.join(prefix, 'lib')] - include_dirs += [os.path.join(prefix, 'lib', 'include'), + include_dirs += ['/usr/local/cuda/include/', + os.path.join(prefix, 'lib', 'include'), os.path.join(prefix, 'lib', 'include', 'torch', 'csrc', 'api', 'include'), os.path.join(prefix, 'include'), os.path.join(prefix, 'include', 'torch', 'csrc', 'api', 'include')] libraries += ['torch'] + abi = fw.torch._C._GLIBCXX_USE_CXX11_ABI + extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI={abi}'.format(abi=abi)] else: assert False # extra arguments @@ -84,7 +87,7 @@ def _build(src, path, framework): depends = [os.path.realpath(libtriton.__file__)] # create extension module ext = setuptools.Extension( - name = 'tensorflow', + name = fw.to_str(framework), language = 'c++', sources = [src], include_dirs = include_dirs, @@ -124,14 +127,14 @@ def _cvt_to_def_str(obj, framework): fw.tensorflow.float64: 'double'}[obj] # torch type elif framework == fw.torch_id: - if isinstance(obj, torch.dtype): - return {torch.int8: 'char', - torch.int16: 'short', - torch.int32: 'int', - torch.int64: 'long', - torch.float16: 'half', - torch.float32: 'float', - torch.float64: 'double'}[obj] + if isinstance(obj, fw.torch.dtype): + return {fw.torch.int8: 'char', + fw.torch.int16: 'short', + fw.torch.int32: 'int', + fw.torch.int64: 'long', + fw.torch.float16: 'half', + fw.torch.float32: 'float', + fw.torch.float64: 'double'}[obj] else: assert False # default @@ -146,8 +149,8 @@ def _make_framework_op(src, outputs, options, framework): if framework == fw.tensorflow_id: return fw.tensorflow.load_op_library(so).__dict__[name] elif framework == 
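
The torch branch of _build above pins the extension to the libstdc++ ABI that the installed torch was compiled with, exactly as the TensorFlow branch already did. The two lines that matter, as a standalone check (requires torch; the int() cast is added here so the -D value renders as 0/1):

import torch

abi = torch._C._GLIBCXX_USE_CXX11_ABI
extra_compile_args = ['-D_GLIBCXX_USE_CXX11_ABI={abi}'.format(abi=int(abi))]
print(extra_compile_args)   # e.g. ['-D_GLIBCXX_USE_CXX11_ABI=1']
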
fw.torch_id: - torch.ops.load_library(so) - return torch.ops.triton.__dict__[name] + fw.torch.ops.load_library(so) + return getattr(fw.torch.ops.triton, name) else: assert False @@ -171,7 +174,12 @@ class kernel: self.fw_op = None self.src = src self.outputs = outputs - self.framework = fw._find_framework(framework) + self.framework = framework + + def _init_framework(self): + if self.framework is not None: + return + self.framework = fw._find_framework(self.framework) if self.framework == fw.tensorflow_id: fw._import_tensorflow() fw._import_tf_extra_ops() @@ -180,8 +188,8 @@ class kernel: else: assert False - def __call__(self, *args, **kwargs): + self._init_framework() # create a new framework op when defines are different key = '-'.join(['{key}-{val}'.format(key=key, val=val) for key, val in kwargs.items()]) if key not in self.fw_id.keys(): @@ -212,4 +220,9 @@ class kernel: # create operands op_args = [x.handle if isinstance(x, triton.utils.scalar) else x for x in args[:-1]] # call framework function - return self.fw_op(*op_args, id=op_id) \ No newline at end of file + if self.framework == fw.tensorflow_id: + return self.fw_op(*op_args, id=op_id) + elif self.framework == fw.torch_id: + return self.fw_op(op_id, *op_args) + else: + assert False \ No newline at end of file diff --git a/python/triton/ops/dot.py b/python/triton/ops/dot.py index f799be983..36bde11fe 100644 --- a/python/triton/ops/dot.py +++ b/python/triton/ops/dot.py @@ -1,6 +1,6 @@ import triton -class dot(triton.function): +class _dot(triton.function): src = """ void dot(TYPE * A, TYPE * B, TYPE * C, @@ -78,30 +78,32 @@ void dot(TYPE * A, TYPE * B, TYPE * C, 'BROADCAST_BK': 'newaxis, :' if transpose_b else ':, newaxis', 'BROADCAST_BN': ':, newaxis' if transpose_b else 'newaxis, :', 'SHAPE_B' : 'TN, TK' if transpose_b else 'TK, TN'} - return dot.kernel(a, b, c, M, N, Ka, lda, ldb, ldc, grid, + return _dot.kernel(a, b, c, M, N, Ka, lda, ldb, ldc, grid, AT = transpose_a, BT = transpose_b, TYPE = dtype, TM = [64, 128], TN = [64, 128], TK = [8], **macros) @staticmethod def forward(ctx, a, b, transpose_a = False, transpose_b = False): ctx.save_for_backward(a, b, transpose_a, transpose_b) - return dot._call(a, b, transpose_a, transpose_b) + return _dot._call(a, b, transpose_a, transpose_b) @staticmethod def backward(ctx, dy): a, b, t_a, t_b = ctx.saved_tensors if not t_a and not t_b: - da = dot._call(dy, b, False, True) - db = dot._call(a, dy, True, False) + da = _dot._call(dy, b, False, True) + db = _dot._call(a, dy, True, False) elif not t_a and t_b: - da = dot._call(dy, b, False, False) - db = dot._call(dy, a, True, False) + da = _dot._call(dy, b, False, False) + db = _dot._call(dy, a, True, False) elif t_a and not t_b: - da = dot._call(b, dy, False, True) - db = dot._call(a, dy, False, False) + da = _dot._call(b, dy, False, True) + db = _dot._call(a, dy, False, False) elif t_a and t_b: - da = dot._call(b, dy, True, True) - db = dot._call(dy, a, True, True) + da = _dot._call(b, dy, True, True) + db = _dot._call(dy, a, True, True) else: assert False - return [da, db, None, None, None, None, None, None, None] \ No newline at end of file + return [da, db, None, None, None, None, None, None, None] + +dot = _dot.apply \ No newline at end of file diff --git a/python/triton/utils.py b/python/triton/utils.py index 98380bf37..422f1117b 100644 --- a/python/triton/utils.py +++ b/python/triton/utils.py @@ -7,12 +7,13 @@ def cdiv(a, b): def empty(shapes, dtype, framework = None): framework = fw._find_framework(framework) if framework == 
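
The transpose case analysis in _dot.backward above is easy to get wrong, so here is a numpy spot-check of the (transpose_a=False, transpose_b=True) branch, where the rules reduce to da = dy @ b and db = dy.T @ a for C = a @ b.T (a sketch, independent of Triton):

import numpy as np

rng = np.random.default_rng(0)
a = rng.standard_normal((4, 3))
b = rng.standard_normal((5, 3))
g = rng.standard_normal((4, 5))          # upstream gradient dy
da, db = g @ b, g.T @ a                  # the rules from backward()

def loss(a, b):
    return np.sum((a @ b.T) * g)

eps = 1e-6
e = np.zeros_like(a); e[1, 2] = eps      # probe one entry of da
assert abs((loss(a + e, b) - loss(a, b)) / eps - da[1, 2]) < 1e-4
e = np.zeros_like(b); e[3, 0] = eps      # and one entry of db
assert abs((loss(a, b + e) - loss(a, b)) / eps - db[3, 0]) < 1e-4
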
fw.tensorflow_id: + fw._import_tensorflow() args = [x.handle if isinstance(x, scalar) else x for x in shapes] args = fw.tensorflow.stack(args) return fw.tf_extra_ops.alloc_empty(args, T = dtype) elif framework == fw.torch_id: - _import_torch() - return fw.torch.empty(*shapes) + fw._import_torch() + return fw.torch.empty(*shapes).cuda() class lazy_shape: @@ -22,15 +23,20 @@ class lazy_shape: def __getitem__(self, key): return scalar(self.shape[key]) -def shape(A) : - fw._import_tensorflow() - return lazy_shape(fw.tensorflow.shape(A)) +def shape(A, framework = None) : + framework = fw._find_framework(framework) + if framework == fw.tensorflow_id: + fw._import_tensorflow() + return lazy_shape(fw.tensorflow.shape(A)) + else: + return A.shape class scalar: - def __init__(self, x): + def __init__(self, x, framework = None): self.id = libtriton.make_scalar_id() + fw._import_tf_extra_ops() self.handle = fw.tf_extra_ops.register_scalar(x, id=self.id) self.assume_initialized = False From 65133cdf33c79390648079dd49b58c7f97809bbe Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 5 Sep 2019 01:32:21 -0400 Subject: [PATCH 351/494] [python] basic support for pytorch seems to be working --- python/examples/dot.py | 1 - python/triton/function.py | 40 +++++++++++++++++++++++++-------------- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/python/examples/dot.py b/python/examples/dot.py index ce8e45c34..56788d422 100644 --- a/python/examples/dot.py +++ b/python/examples/dot.py @@ -34,6 +34,5 @@ def run_torch(): b = th.randn(K, N).cuda() th_c = th.matmul(a, b) tr_c = triton.ops.dot(a, b) - print(c) run_torch() \ No newline at end of file diff --git a/python/triton/function.py b/python/triton/function.py index 8669dbc92..c51061652 100644 --- a/python/triton/function.py +++ b/python/triton/function.py @@ -4,45 +4,49 @@ class OpContext(object): def save_for_backward(self, *tensors): self.to_save = tensors - - def mark_dirty(self, *args): - self.dirty_tensors = args @property def saved_tensors(self): return self.to_save - class function_meta(type): def __init__(cls, name, bases, attrs): cls.contexts = dict() cls.registered = False + cls.framework = None return super(function_meta, cls).__init__(name, bases, attrs) class function(metaclass = function_meta): - - def __init__(self, framework = None): - self.framework = _find_framework(framework) - pass @staticmethod def forward(ctx, *args, **kwargs): - raise NotImplementedError + raise NotImplementedError @staticmethod def backward(ctx, grad_output): - raise NotImplementedError + raise NotImplementedError @classmethod - def apply(cls, *args, **kwargs): - # call forward + def apply_torch(cls, *args, **kwargs): + fw._import_torch() + class TorchFunction(fw.torch.autograd.Function): + @staticmethod + def forward(ctx, *targs, **tkwargs): + return cls.forward(ctx, *targs, **tkwargs) + @staticmethod + def backward(ctx, grad_output): + return cls.backward(ctx, grad_output) + return TorchFunction.apply(*args, **kwargs) + + @classmethod + def apply_tensorflow(cls, *args, **kwargs): + fw._import_tensorflow() ctx = OpContext() result = cls.forward(ctx, *args, **kwargs) id = result.op.get_attr('id') cls.contexts[id] = ctx # register backward - fw._import_tensorflow() name = result.op.op_def.name if not cls.registered: @fw.tensorflow.RegisterGradient(name) @@ -51,4 +55,12 @@ class function(metaclass = function_meta): return cls.backward(cls.contexts[id], dy) cls.registered = True # return result tensor - return result \ No newline at end of file + return 
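
apply_torch in the hunk above manufactures a torch.autograd.Function on the fly around the class's forward/backward pair. The same pattern in isolation, with a hypothetical square op standing in for a triton.function subclass:

import torch

class square:                        # stand-in for a triton.function subclass
    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return x * x
    @staticmethod
    def backward(ctx, dy):
        x, = ctx.saved_tensors
        return 2. * x * dy

def apply_torch(cls, *args, **kwargs):
    class TorchFunction(torch.autograd.Function):
        @staticmethod
        def forward(ctx, *targs, **tkwargs):
            return cls.forward(ctx, *targs, **tkwargs)
        @staticmethod
        def backward(ctx, grad_output):
            return cls.backward(ctx, grad_output)
    return TorchFunction.apply(*args, **kwargs)

x = torch.randn(3, requires_grad=True)
apply_torch(square, x).sum().backward()
assert torch.allclose(x.grad, 2 * x.detach())
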
result + + @classmethod + def apply(cls, *args, **kwargs): + cls.framework = fw._find_framework(cls.framework) + if cls.framework == fw.tensorflow_id: + return cls.apply_tensorflow(*args, **kwargs) + else: + return cls.apply_torch(*args, **kwargs) From 44896ee777e224841fd9a5134da6a064a6a4f342 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 5 Sep 2019 02:16:27 -0400 Subject: [PATCH 352/494] [pytorch] clean-up of dynamic framework load --- python/examples/dot.py | 1 + python/triton/frameworks.py | 30 +++-------------- python/triton/function.py | 8 ++--- python/triton/kernel.py | 66 ++++++++++++++++++++----------------- python/triton/utils.py | 18 +++++----- 5 files changed, 53 insertions(+), 70 deletions(-) diff --git a/python/examples/dot.py b/python/examples/dot.py index 56788d422..f60397bb7 100644 --- a/python/examples/dot.py +++ b/python/examples/dot.py @@ -34,5 +34,6 @@ def run_torch(): b = th.randn(K, N).cuda() th_c = th.matmul(a, b) tr_c = triton.ops.dot(a, b) + print(tr_c) run_torch() \ No newline at end of file diff --git a/python/triton/frameworks.py b/python/triton/frameworks.py index 4d10697ad..e3524c7ac 100644 --- a/python/triton/frameworks.py +++ b/python/triton/frameworks.py @@ -2,21 +2,10 @@ import sys import os import libtriton -torch_id = 'torch' -tensorflow_id = 'tensorflow' - torch = None tensorflow = None tf_extra_ops = None -def to_str(framework): - if framework == tensorflow_id: - return 'tensorflow' - elif framework == torch_id: - return 'torch' - else: - assert False - def _import_torch(): global torch if torch is None: @@ -35,19 +24,8 @@ def _import_tf_extra_ops(): _import_tensorflow() tf_extra_ops = tensorflow.load_op_library(path) +def has_tensorflow(): + return 'tensorflow' in sys.modules -def _find_framework(default = None): - is_tf_imported = 'tensorflow' in sys.modules - is_torch_imported = 'torch' in sys.modules - if default: - if default not in [tensorflow_id, torch_id]: - raise ValueError('unsupported framework') - else: - return default - elif is_tf_imported and not is_torch_imported: - return tensorflow_id - elif is_torch_imported and not is_tf_imported: - return torch_id - else: - raise ValueError('cannot determine imported framework, ' - 'please provide framework argument') \ No newline at end of file +def has_torch(): + return 'torch' in sys.modules \ No newline at end of file diff --git a/python/triton/function.py b/python/triton/function.py index c51061652..53fc5dfb3 100644 --- a/python/triton/function.py +++ b/python/triton/function.py @@ -14,7 +14,6 @@ class function_meta(type): def __init__(cls, name, bases, attrs): cls.contexts = dict() cls.registered = False - cls.framework = None return super(function_meta, cls).__init__(name, bases, attrs) class function(metaclass = function_meta): @@ -59,8 +58,9 @@ class function(metaclass = function_meta): @classmethod def apply(cls, *args, **kwargs): - cls.framework = fw._find_framework(cls.framework) - if cls.framework == fw.tensorflow_id: + if fw.has_tensorflow(): return cls.apply_tensorflow(*args, **kwargs) - else: + elif fw.has_torch(): return cls.apply_torch(*args, **kwargs) + else: + assert False diff --git a/python/triton/kernel.py b/python/triton/kernel.py index 2a7f2c929..554f0db1d 100644 --- a/python/triton/kernel.py +++ b/python/triton/kernel.py @@ -14,10 +14,10 @@ import triton.frameworks as fw import triton.utils import libtriton -def _make_framework_src(src, out, grid, framework): - if framework == fw.tensorflow_id: +def _make_framework_src(src, out, grid): + if fw.has_tensorflow(): 
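
Patch 352 replaces the explicit framework argument with autodetection: the active framework is simply whichever package the user has already imported. The whole mechanism is a sys.modules lookup (sketch mirroring frameworks.py):

import sys

def has_tensorflow():
    return 'tensorflow' in sys.modules

def has_torch():
    return 'torch' in sys.modules

import torch                     # the user's own import is the selector
if has_tensorflow():
    backend = 'tensorflow'
elif has_torch():
    backend = 'torch'
else:
    raise RuntimeError('import tensorflow or torch before using triton')
assert backend == 'torch'
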
return libtriton.make_tensorflow_src(src, out, grid) - elif framework == fw.torch_id: + elif fw.has_torch: return libtriton.make_torch_src(src, out, grid) else: assert False @@ -32,10 +32,16 @@ def _make_cache_path(src): os.makedirs(cachepath) return cachepath -def _write_bindings(src, root, framework): - cpp = os.path.join(root, '{framework}.cpp'.format(framework=framework)) +def _write_bindings(src, root): + if fw.has_tensorflow(): + name = 'tensorflow' + elif fw.has_torch(): + name = 'torch' + else: + assert False + cpp = os.path.join(root, '{name}.cpp'.format(name=name)) suffix = sysconfig.get_config_var('EXT_SUFFIX') - so = os.path.join(root, '{framework}{suffix}'.format(framework=framework, suffix=suffix)) + so = os.path.join(root, '{name}{suffix}'.format(name=name, suffix=suffix)) recompile = False # recompile if .so does not exist if not os.path.exists(cpp) or not os.path.exists(so): @@ -50,7 +56,7 @@ def _write_bindings(src, root, framework): # return path of cpp file return (cpp, so) -def _build(src, path, framework): +def _build(src, path): # include directories triton_include_dirs = ['/home/philippe/development/triton/include'] include_dirs = triton_include_dirs @@ -61,14 +67,15 @@ def _build(src, path, framework): libraries = ['triton'] # add framework extra_compile_args = [] - if framework == fw.tensorflow_id: + if fw.has_tensorflow(): library_dirs += [fw.tensorflow.sysconfig.get_lib()] include_dirs += [fw.tensorflow.sysconfig.get_include()] include_dirs += ['/usr/local/cuda/include/'] libraries += [fw.tensorflow.sysconfig.get_link_flags()[1].replace('-l', '')] abi = fw.tensorflow.__cxx11_abi_flag__ if "__cxx11_abi_flag__" in fw.tensorflow.__dict__ else 0 extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI={abi}'.format(abi=abi)] - elif framework == fw.torch_id: + name = 'tensorflow' + elif fw.has_torch(): prefix = os.path.dirname(fw.torch.__file__) library_dirs += [os.path.join(prefix, 'lib')] include_dirs += ['/usr/local/cuda/include/', @@ -79,6 +86,7 @@ def _build(src, path, framework): libraries += ['torch'] abi = fw.torch._C._GLIBCXX_USE_CXX11_ABI extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI={abi}'.format(abi=abi)] + name = 'torch' else: assert False # extra arguments @@ -87,7 +95,7 @@ def _build(src, path, framework): depends = [os.path.realpath(libtriton.__file__)] # create extension module ext = setuptools.Extension( - name = fw.to_str(framework), + name = name, language = 'c++', sources = [src], include_dirs = include_dirs, @@ -104,19 +112,19 @@ def _build(src, path, framework): args.append('--build-lib=' + path) args.append('-q') args = dict( - name = 'tensorflow', + name = name, ext_modules = [ext], script_args = args, ) setuptools.setup(**args) shutil.rmtree(tmp) -def _cvt_to_def_str(obj, framework): +def _cvt_to_def_str(obj): # bool if isinstance(obj, bool): return str(int(obj)) # tensorflow type - if framework == fw.tensorflow_id: + if fw.has_tensorflow(): if isinstance(obj, fw.tensorflow.DType): return {fw.tensorflow.int8: 'char', fw.tensorflow.int16: 'short', @@ -126,7 +134,7 @@ def _cvt_to_def_str(obj, framework): fw.tensorflow.float32: 'float', fw.tensorflow.float64: 'double'}[obj] # torch type - elif framework == fw.torch_id: + elif fw.has_torch(): if isinstance(obj, fw.torch.dtype): return {fw.torch.int8: 'char', fw.torch.int16: 'short', @@ -141,14 +149,14 @@ def _cvt_to_def_str(obj, framework): return str(obj) -def _make_framework_op(src, outputs, options, framework): - src, name = _make_framework_src(src, outputs, options, framework) +def 
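
_cvt_to_def_str above translates runtime dtypes into the C type names that end up as -D defines in the kernel source. The torch branch, stripped of the framework indirection (requires torch):

import torch

def cvt_to_def_str(obj):
    if isinstance(obj, bool):
        return str(int(obj))
    if isinstance(obj, torch.dtype):
        return {torch.int8: 'char',    torch.int16: 'short',
                torch.int32: 'int',    torch.int64: 'long',
                torch.float16: 'half', torch.float32: 'float',
                torch.float64: 'double'}[obj]
    return str(obj)

assert cvt_to_def_str(torch.float32) == 'float'
assert cvt_to_def_str(True) == '1'
assert cvt_to_def_str(128) == '128'
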
_make_framework_op(src, outputs, options): + src, name = _make_framework_src(src, outputs, options) cache_path = _make_cache_path(src) - cpp, so = _write_bindings(src, cache_path, framework) - _build(cpp, cache_path, framework) - if framework == fw.tensorflow_id: + cpp, so = _write_bindings(src, cache_path) + _build(cpp, cache_path) + if fw.has_tensorflow(): return fw.tensorflow.load_op_library(so).__dict__[name] - elif framework == fw.torch_id: + elif fw.has_torch(): fw.torch.ops.load_library(so) return getattr(fw.torch.ops.triton, name) else: @@ -168,22 +176,18 @@ def _make_grid(args) : class kernel: - def __init__(self, src, outputs, framework = None): + def __init__(self, src, outputs): self.fw_id = dict() self.fw_grids = dict() self.fw_op = None self.src = src self.outputs = outputs - self.framework = framework def _init_framework(self): - if self.framework is not None: - return - self.framework = fw._find_framework(self.framework) - if self.framework == fw.tensorflow_id: + if fw.has_tensorflow(): fw._import_tensorflow() fw._import_tf_extra_ops() - elif self.framework == fw.torch_id: + elif fw.has_torch(): fw._import_torch() else: assert False @@ -196,7 +200,7 @@ class kernel: # code generation options defines = [] for k, v in kwargs.items(): - cvt = lambda x: _cvt_to_def_str(x, self.framework) + cvt = lambda x: _cvt_to_def_str(x) if(isinstance(v, list)): values = list(map(cvt, v)) else: @@ -211,7 +215,7 @@ class kernel: # register function libtriton.register_fn(op_id, self.src, opt) if self.fw_op is None: - self.fw_op = _make_framework_op(self.src, self.outputs, opt, self.framework) + self.fw_op = _make_framework_op(self.src, self.outputs, opt) # retrieve framework op op_id = self.fw_id[key] @@ -220,9 +224,9 @@ class kernel: # create operands op_args = [x.handle if isinstance(x, triton.utils.scalar) else x for x in args[:-1]] # call framework function - if self.framework == fw.tensorflow_id: + if fw.has_tensorflow(): return self.fw_op(*op_args, id=op_id) - elif self.framework == fw.torch_id: + elif fw.has_torch(): return self.fw_op(op_id, *op_args) else: assert False \ No newline at end of file diff --git a/python/triton/utils.py b/python/triton/utils.py index 422f1117b..3ef8be7b9 100644 --- a/python/triton/utils.py +++ b/python/triton/utils.py @@ -4,14 +4,13 @@ import libtriton def cdiv(a, b): return -(-a // b) -def empty(shapes, dtype, framework = None): - framework = fw._find_framework(framework) - if framework == fw.tensorflow_id: +def empty(shapes, dtype): + if fw.has_tensorflow(): fw._import_tensorflow() args = [x.handle if isinstance(x, scalar) else x for x in shapes] args = fw.tensorflow.stack(args) return fw.tf_extra_ops.alloc_empty(args, T = dtype) - elif framework == fw.torch_id: + elif fw.has_torch(): fw._import_torch() return fw.torch.empty(*shapes).cuda() @@ -23,18 +22,19 @@ class lazy_shape: def __getitem__(self, key): return scalar(self.shape[key]) -def shape(A, framework = None) : - framework = fw._find_framework(framework) - if framework == fw.tensorflow_id: +def shape(A) : + if fw.has_tensorflow(): fw._import_tensorflow() return lazy_shape(fw.tensorflow.shape(A)) - else: + elif fw.has_torch(): return A.shape + else: + assert False class scalar: - def __init__(self, x, framework = None): + def __init__(self, x): self.id = libtriton.make_scalar_id() fw._import_tf_extra_ops() self.handle = fw.tf_extra_ops.register_scalar(x, id=self.id) From b2629da1fea213496f3bfe01ec3641121c911d01 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 5 Sep 2019 02:21:07 -0400 
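
kernel.__call__ above memoizes on two levels: the framework op is compiled once per kernel, and one op id is registered per distinct set of compile-time defines. A sketch of that caching with the libtriton calls stubbed out (names simplified):

class kernel:
    def __init__(self, src):
        self.src = src
        self.fw_id = {}          # define-key -> op id
        self.fw_op = None        # compiled framework op, built once

    def __call__(self, **kwargs):
        key = '-'.join('{}-{}'.format(k, v) for k, v in kwargs.items())
        if key not in self.fw_id:
            self.fw_id[key] = len(self.fw_id)   # stand-in for allocating a fresh op id
            # real code: libtriton.register_fn(op_id, self.src, opt)
            if self.fw_op is None:
                self.fw_op = object()           # stand-in for _make_framework_op(...)
        return self.fw_id[key]

k = kernel('void dot(TYPE *A, TYPE *B, TYPE *C) { }')
assert k(TYPE='float', TM=64) == k(TYPE='float', TM=64)   # cache hit
assert k(TYPE='half', TM=64) != k(TYPE='float', TM=64)    # new variant
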
Subject: [PATCH 353/494] [python] more cleaning of frameworks logic --- python/triton/frameworks.py | 11 +++++++++-- python/triton/function.py | 2 -- python/triton/kernel.py | 10 ---------- python/triton/utils.py | 4 ---- 4 files changed, 9 insertions(+), 18 deletions(-) diff --git a/python/triton/frameworks.py b/python/triton/frameworks.py index e3524c7ac..fcab5dcbf 100644 --- a/python/triton/frameworks.py +++ b/python/triton/frameworks.py @@ -25,7 +25,14 @@ def _import_tf_extra_ops(): tf_extra_ops = tensorflow.load_op_library(path) def has_tensorflow(): - return 'tensorflow' in sys.modules + result = 'tensorflow' in sys.modules + if result: + _import_tensorflow() + _import_tf_extra_ops() + return result def has_torch(): - return 'torch' in sys.modules \ No newline at end of file + result = 'torch' in sys.modules + if result: + _import_torch() + return result \ No newline at end of file diff --git a/python/triton/function.py b/python/triton/function.py index 53fc5dfb3..125cad668 100644 --- a/python/triton/function.py +++ b/python/triton/function.py @@ -28,7 +28,6 @@ class function(metaclass = function_meta): @classmethod def apply_torch(cls, *args, **kwargs): - fw._import_torch() class TorchFunction(fw.torch.autograd.Function): @staticmethod def forward(ctx, *targs, **tkwargs): @@ -40,7 +39,6 @@ class function(metaclass = function_meta): @classmethod def apply_tensorflow(cls, *args, **kwargs): - fw._import_tensorflow() ctx = OpContext() result = cls.forward(ctx, *args, **kwargs) id = result.op.get_attr('id') diff --git a/python/triton/kernel.py b/python/triton/kernel.py index 554f0db1d..355bc3675 100644 --- a/python/triton/kernel.py +++ b/python/triton/kernel.py @@ -183,17 +183,7 @@ class kernel: self.src = src self.outputs = outputs - def _init_framework(self): - if fw.has_tensorflow(): - fw._import_tensorflow() - fw._import_tf_extra_ops() - elif fw.has_torch(): - fw._import_torch() - else: - assert False - def __call__(self, *args, **kwargs): - self._init_framework() # create a new framework op when defines are different key = '-'.join(['{key}-{val}'.format(key=key, val=val) for key, val in kwargs.items()]) if key not in self.fw_id.keys(): diff --git a/python/triton/utils.py b/python/triton/utils.py index 3ef8be7b9..6c5df7b09 100644 --- a/python/triton/utils.py +++ b/python/triton/utils.py @@ -6,12 +6,10 @@ def cdiv(a, b): def empty(shapes, dtype): if fw.has_tensorflow(): - fw._import_tensorflow() args = [x.handle if isinstance(x, scalar) else x for x in shapes] args = fw.tensorflow.stack(args) return fw.tf_extra_ops.alloc_empty(args, T = dtype) elif fw.has_torch(): - fw._import_torch() return fw.torch.empty(*shapes).cuda() class lazy_shape: @@ -24,7 +22,6 @@ class lazy_shape: def shape(A) : if fw.has_tensorflow(): - fw._import_tensorflow() return lazy_shape(fw.tensorflow.shape(A)) elif fw.has_torch(): return A.shape @@ -36,7 +33,6 @@ class scalar: def __init__(self, x): self.id = libtriton.make_scalar_id() - fw._import_tf_extra_ops() self.handle = fw.tf_extra_ops.register_scalar(x, id=self.id) self.assume_initialized = False From 58544d05235a5e0d8f73638545e9a2c499b528ad Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 5 Sep 2019 09:39:58 -0400 Subject: [PATCH 354/494] [python] renamed src/tensorflow.cc -> src/bindings.cc --- CMakeLists.txt | 5 +++-- python/setup.py | 3 +-- python/src/{tensorflow.cc => bindings.cc} | 0 3 files changed, 4 insertions(+), 4 deletions(-) rename python/src/{tensorflow.cc => bindings.cc} (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 
20add646f..78f2967ad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,12 +33,13 @@ endif() if(BUILD_PYTHON_MODULE) message(STATUS "Adding Python module") # PyBind11 wrapper source file - file(GLOB_RECURSE PYTHON_SRC python/src/tensorflow.cc) + file(GLOB_RECURSE PYTHON_SRC python/src/bindings.cc) + include_directories(python/src/ ${PYTHON_INCLUDE_DIRS}) if(TF_LIBS) # extra tensorflow ops (e.g., alloc_empty) # update directories link_directories(${TF_LIB_DIRS}) - include_directories(python/src/ ${PYTHON_INCLUDE_DIRS} ${TF_INCLUDE_DIRS}) + include_directories(${TF_INCLUDE_DIRS}) # get sources file(GLOB_RECURSE EXTRA_TF_OPS_SRC python/src/tensorflow/*.cc) add_library(extra_tf_ops SHARED ${EXTRA_TF_OPS_SRC}) diff --git a/python/setup.py b/python/setup.py index 49317af9f..2ae0dba63 100644 --- a/python/setup.py +++ b/python/setup.py @@ -37,10 +37,9 @@ class CMakeBuild(build_ext): def build_extension(self, ext): self.debug = True extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) - # python directors + # python directories python_include_dirs = distutils.sysconfig.get_python_inc() python_lib_dirs = distutils.sysconfig.get_config_var('LIBDIR') - # tensorflow directories cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir, '-DBUILD_TESTS=OFF', '-DBUILD_PYTHON_MODULE=ON', diff --git a/python/src/tensorflow.cc b/python/src/bindings.cc similarity index 100% rename from python/src/tensorflow.cc rename to python/src/bindings.cc From 2d6c8311e8e9d7ff25ec06e59cc578866fd317d5 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 5 Sep 2019 12:30:51 -0400 Subject: [PATCH 355/494] [python] upgraded pybind11 ; forcing torch tensors to be contiguous() --- lib/runtime/function.cc | 2 + python/examples/dot.py | 17 +- python/src/bindings.cc | 14 +- python/src/pybind11/attr.h | 10 +- python/src/pybind11/cast.h | 189 ++++++---- python/src/pybind11/complex.h | 4 + python/src/pybind11/detail/class.h | 9 +- python/src/pybind11/detail/common.h | 90 ++--- python/src/pybind11/detail/descr.h | 199 +++------- python/src/pybind11/detail/init.h | 2 +- python/src/pybind11/detail/internals.h | 16 +- python/src/pybind11/detail/typeid.h | 2 + python/src/pybind11/eigen.h | 53 +-- python/src/pybind11/embed.h | 8 +- python/src/pybind11/functional.h | 21 +- python/src/pybind11/iostream.h | 21 +- python/src/pybind11/numpy.h | 109 +++--- python/src/pybind11/pybind11.h | 487 +++++++++++++++++-------- python/src/pybind11/pytypes.h | 151 +++++++- python/src/pybind11/stl.h | 36 +- python/src/pybind11/stl_bind.h | 33 +- python/triton/kernel.py | 3 +- python/triton/ops/dot.py | 15 +- 23 files changed, 960 insertions(+), 531 deletions(-) diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 838975086..75108c268 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -202,9 +202,11 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::selection selection(&shmem_allocation, &grids, &shmem_info, &alignment_info, target.get()); + // run passes peephole.run(module); dce.run(module); +// ir::print(module, std::cout); alignment_info.run(module); grids.run(module); // ir::print(module, std::cout); diff --git a/python/examples/dot.py b/python/examples/dot.py index f60397bb7..5c2dce459 100644 --- a/python/examples/dot.py +++ b/python/examples/dot.py @@ -29,11 +29,22 @@ def run_tf(): def run_torch(): import torch as th + th.manual_seed(0) M, N, K = 128, 128, 128 a = th.randn(M, K).cuda() b = th.randn(K, N).cuda() - th_c = th.matmul(a, b) - tr_c = 
triton.ops.dot(a, b) - print(tr_c) + b.requires_grad_(True) + #th_c = th.matmul(a, th.t(b)) + #th_d = th.matmul(th.t(th_c), b) + tr_c = triton.ops.dot(a, b, False, True) + #tr_d = triton.ops.dot(tr_c, b, True, False) + y = th.sum(tr_c) + #print('backprop', y) + y.backward() + #print('backward done') + print(b.grad) + #th_d.backward() + #print(a.grad) + run_torch() \ No newline at end of file diff --git a/python/src/bindings.cc b/python/src/bindings.cc index 2450f35ef..4ef860347 100644 --- a/python/src/bindings.cc +++ b/python/src/bindings.cc @@ -35,7 +35,6 @@ void register_grid(size_t id, void delete_grid(size_t id) { id_grid_map.erase(id); - std::cout << "deleted " << id_grid_map.size() << std::endl; } void register_fn(size_t id, @@ -46,7 +45,6 @@ void register_fn(size_t id, void delete_fn(size_t id) { id_fn_map.erase(id); - std::cout << "deleted " << id_fn_map.size() << std::endl; } void cleanup() { @@ -415,8 +413,10 @@ void gen_torch_make_handles(std::ostream &os, ir::type* ty = arg->get_type(); if(!ty->is_pointer_ty()) os << " " << to_c_ty(ty) << " arg_" << name << " = " << name << ";" << std::endl; - else + else{ + os << " CHECK_INPUT(" << name << ");" << std::endl; os << " drv::cu_buffer arg_" + name + "(ctx, " + name + ".storage().size(), (CUdeviceptr)" + name + ".storage().data(), false);" << std::endl; + } } } @@ -435,6 +435,10 @@ void gen_torch_make_launch_function(std::ostream &os, const std::vector& outputs) { + if(outputs.size() == 1){ + os << " return " << outputs[0] << ";" << std::endl; + return; + } os << " return {"; for(size_t i = 0; i < outputs.size(); i++){ if(i > 0) @@ -467,6 +471,10 @@ std::tuple::init_instance void (*init_instance)(instance *, const void *) = nullptr; @@ -278,7 +282,7 @@ struct type_record { } }; -inline function_call::function_call(function_record &f, handle p) : +inline function_call::function_call(const function_record &f, handle p) : func(f), parent(p) { args.reserve(f.nargs); args_convert.reserve(f.nargs); diff --git a/python/src/pybind11/cast.h b/python/src/pybind11/cast.h index 214545083..8d0fd5d90 100644 --- a/python/src/pybind11/cast.h +++ b/python/src/pybind11/cast.h @@ -17,6 +17,7 @@ #include #include #include +#include #if defined(PYBIND11_CPP17) # if defined(__has_include) @@ -203,10 +204,10 @@ PYBIND11_NOINLINE inline handle get_type_handle(const std::type_info &tp, bool t } struct value_and_holder { - instance *inst; - size_t index; - const detail::type_info *type; - void **vh; + instance *inst = nullptr; + size_t index = 0u; + const detail::type_info *type = nullptr; + void **vh = nullptr; // Main constructor for a found value/holder: value_and_holder(instance *i, const detail::type_info *type, size_t vpos, size_t index) : @@ -215,7 +216,7 @@ struct value_and_holder { {} // Default constructor (used to signal a value-and-holder not found by get_value_and_holder()) - value_and_holder() : inst{nullptr} {} + value_and_holder() {} // Used for past-the-end iterator value_and_holder(size_t index) : index{index} {} @@ -269,8 +270,8 @@ public: struct iterator { private: - instance *inst; - const type_vec *types; + instance *inst = nullptr; + const type_vec *types = nullptr; value_and_holder curr; friend struct values_and_holders; iterator(instance *inst, const type_vec *tinfo) @@ -570,7 +571,17 @@ public: // Lazy allocation for unallocated values: if (vptr == nullptr) { auto *type = v_h.type ? 
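
The reworked example in dot.py above now drives a backward pass through the Triton op and prints b.grad. The expected gradient can be derived with plain torch on the CPU (a reference sketch; b is written with shape (N, K) so the transpose is explicit, and in the example M = N = K so both layouts coincide):

import torch as th

th.manual_seed(0)
M, N, K = 128, 128, 128
a = th.randn(M, K)
b = th.randn(N, K, requires_grad=True)
y = th.sum(a @ b.t())            # what sum(triton.ops.dot(a, b, False, True)) computes
y.backward()
# d sum(a @ b^T) / d b = ones(M, N)^T @ a
assert th.allclose(b.grad, th.ones(M, N).t() @ a)
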
v_h.type : typeinfo; - vptr = type->operator_new(type->type_size); + if (type->operator_new) { + vptr = type->operator_new(type->type_size); + } else { + #if defined(PYBIND11_CPP17) + if (type->type_align > __STDCPP_DEFAULT_NEW_ALIGNMENT__) + vptr = ::operator new(type->type_size, + (std::align_val_t) type->type_align); + else + #endif + vptr = ::operator new(type->type_size); + } } value = vptr; } @@ -774,11 +785,47 @@ template struct is_copy_constructible, is_copy_constructible> {}; #endif +NAMESPACE_END(detail) + +// polymorphic_type_hook::get(src, tinfo) determines whether the object pointed +// to by `src` actually is an instance of some class derived from `itype`. +// If so, it sets `tinfo` to point to the std::type_info representing that derived +// type, and returns a pointer to the start of the most-derived object of that type +// (in which `src` is a subobject; this will be the same address as `src` in most +// single inheritance cases). If not, or if `src` is nullptr, it simply returns `src` +// and leaves `tinfo` at its default value of nullptr. +// +// The default polymorphic_type_hook just returns src. A specialization for polymorphic +// types determines the runtime type of the passed object and adjusts the this-pointer +// appropriately via dynamic_cast. This is what enables a C++ Animal* to appear +// to Python as a Dog (if Dog inherits from Animal, Animal is polymorphic, Dog is +// registered with pybind11, and this Animal is in fact a Dog). +// +// You may specialize polymorphic_type_hook yourself for types that want to appear +// polymorphic to Python but do not use C++ RTTI. (This is a not uncommon pattern +// in performance-sensitive applications, used most notably in LLVM.) +template +struct polymorphic_type_hook +{ + static const void *get(const itype *src, const std::type_info*&) { return src; } +}; +template +struct polymorphic_type_hook::value>> +{ + static const void *get(const itype *src, const std::type_info*& type) { + type = src ? &typeid(*src) : nullptr; + return dynamic_cast(src); + } +}; + +NAMESPACE_BEGIN(detail) + /// Generic type caster for objects stored on the heap template class type_caster_base : public type_caster_generic { using itype = intrinsic_t; + public: - static PYBIND11_DESCR name() { return type_descr(_()); } + static constexpr auto name = _(); type_caster_base() : type_caster_base(typeid(type)) { } explicit type_caster_base(const std::type_info &info) : type_caster_generic(info) { } @@ -793,32 +840,28 @@ public: return cast(&src, return_value_policy::move, parent); } - // Returns a (pointer, type_info) pair taking care of necessary RTTI type lookup for a - // polymorphic type. If the instance isn't derived, returns the non-RTTI base version. - template ::value, int> = 0> + // Returns a (pointer, type_info) pair taking care of necessary type lookup for a + // polymorphic type (using RTTI by default, but can be overridden by specializing + // polymorphic_type_hook). If the instance isn't derived, returns the base version. 
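
The comment above promises that a C++ factory declared to return Animal* surfaces in Python as a Dog when the object really is one. As pure-Python intuition for that behavior (no pybind11 involved; Animal/Dog are the comment's own example):

class Animal:
    def speak(self): return '...'

class Dog(Animal):
    def speak(self): return 'woof'

def make_animal() -> Animal:     # declared to return the base type,
    return Dog()                 # like a C++ factory returning Animal*

pet = make_animal()
assert type(pet) is Dog          # the caller still sees the derived type
assert pet.speak() == 'woof'
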
static std::pair src_and_type(const itype *src) { - const void *vsrc = src; auto &cast_type = typeid(itype); const std::type_info *instance_type = nullptr; - if (vsrc) { - instance_type = &typeid(*src); - if (!same_type(cast_type, *instance_type)) { - // This is a base pointer to a derived type; if it is a pybind11-registered type, we - // can get the correct derived pointer (which may be != base pointer) by a - // dynamic_cast to most derived type: - if (auto *tpi = get_type_info(*instance_type)) - return {dynamic_cast(src), const_cast(tpi)}; - } + const void *vsrc = polymorphic_type_hook::get(src, instance_type); + if (instance_type && !same_type(cast_type, *instance_type)) { + // This is a base pointer to a derived type. If the derived type is registered + // with pybind11, we want to make the full derived object available. + // In the typical case where itype is polymorphic, we get the correct + // derived pointer (which may be != base pointer) by a dynamic_cast to + // most derived type. If itype is not polymorphic, we won't get here + // except via a user-provided specialization of polymorphic_type_hook, + // and the user has promised that no this-pointer adjustment is + // required in that case, so it's OK to use static_cast. + if (const auto *tpi = get_type_info(*instance_type)) + return {vsrc, tpi}; } // Otherwise we have either a nullptr, an `itype` pointer, or an unknown derived pointer, so // don't do a cast - return type_caster_generic::src_and_type(vsrc, cast_type, instance_type); - } - - // Non-polymorphic type, so no dynamic casting; just call the generic version directly - template ::value, int> = 0> - static std::pair src_and_type(const itype *src) { - return type_caster_generic::src_and_type(src, typeid(itype)); + return type_caster_generic::src_and_type(src, cast_type, instance_type); } static handle cast(const itype *src, return_value_policy policy, handle parent) { @@ -835,7 +878,7 @@ public: nullptr, nullptr, holder); } - template using cast_op_type = cast_op_type; + template using cast_op_type = detail::cast_op_type; operator itype*() { return (type *) value; } operator itype&() { if (!value) throw reference_cast_error(); return *((itype *) value); } @@ -885,7 +928,7 @@ private: "std::reference_wrapper caster requires T to have a caster with an `T &` operator"); public: bool load(handle src, bool convert) { return subcaster.load(src, convert); } - static PYBIND11_DESCR name() { return caster_t::name(); } + static constexpr auto name = caster_t::name; static handle cast(const std::reference_wrapper &src, return_value_policy policy, handle parent) { // It is definitely wrong to take ownership of this pointer, so mask that rvp if (policy == return_value_policy::take_ownership || policy == return_value_policy::automatic) @@ -900,7 +943,7 @@ public: protected: \ type value; \ public: \ - static PYBIND11_DESCR name() { return type_descr(py_name); } \ + static constexpr auto name = py_name; \ template >::value, int> = 0> \ static handle cast(T_ *src, return_value_policy policy, handle parent) { \ if (!src) return none().release(); \ @@ -977,20 +1020,34 @@ public: return true; } - static handle cast(T src, return_value_policy /* policy */, handle /* parent */) { - if (std::is_floating_point::value) { - return PyFloat_FromDouble((double) src); - } else if (sizeof(T) <= sizeof(long)) { - if (std::is_signed::value) - return PyLong_FromLong((long) src); - else - return PyLong_FromUnsignedLong((unsigned long) src); - } else { - if (std::is_signed::value) - return 
PyLong_FromLongLong((long long) src); - else - return PyLong_FromUnsignedLongLong((unsigned long long) src); - } + template + static typename std::enable_if::value, handle>::type + cast(U src, return_value_policy /* policy */, handle /* parent */) { + return PyFloat_FromDouble((double) src); + } + + template + static typename std::enable_if::value && std::is_signed::value && (sizeof(U) <= sizeof(long)), handle>::type + cast(U src, return_value_policy /* policy */, handle /* parent */) { + return PYBIND11_LONG_FROM_SIGNED((long) src); + } + + template + static typename std::enable_if::value && std::is_unsigned::value && (sizeof(U) <= sizeof(unsigned long)), handle>::type + cast(U src, return_value_policy /* policy */, handle /* parent */) { + return PYBIND11_LONG_FROM_UNSIGNED((unsigned long) src); + } + + template + static typename std::enable_if::value && std::is_signed::value && (sizeof(U) > sizeof(long)), handle>::type + cast(U src, return_value_policy /* policy */, handle /* parent */) { + return PyLong_FromLongLong((long long) src); + } + + template + static typename std::enable_if::value && std::is_unsigned::value && (sizeof(U) > sizeof(unsigned long)), handle>::type + cast(U src, return_value_policy /* policy */, handle /* parent */) { + return PyLong_FromUnsignedLongLong((unsigned long long) src); } PYBIND11_TYPE_CASTER(T, _::value>("int", "float")); @@ -1049,7 +1106,7 @@ public: template using cast_op_type = void*&; operator void *&() { return value; } - static PYBIND11_DESCR name() { return type_descr(_("capsule")); } + static constexpr auto name = _("capsule"); private: void *value = nullptr; }; @@ -1292,7 +1349,7 @@ public: return one_char; } - static PYBIND11_DESCR name() { return type_descr(_(PYBIND11_STRING_NAME)); } + static constexpr auto name = _(PYBIND11_STRING_NAME); template using cast_op_type = pybind11::detail::cast_op_type<_T>; }; @@ -1317,9 +1374,7 @@ public: return cast_impl(std::forward(src), policy, parent, indices{}); } - static PYBIND11_DESCR name() { - return type_descr(_("Tuple[") + detail::concat(make_caster::name()...) + _("]")); - } + static constexpr auto name = _("Tuple[") + concat(make_caster::name...) 
+ _("]"); template using cast_op_type = type; @@ -1464,7 +1519,7 @@ struct move_only_holder_caster { auto *ptr = holder_helper::get(src); return type_caster_base::cast_holder(ptr, std::addressof(src)); } - static PYBIND11_DESCR name() { return type_caster_base::name(); } + static constexpr auto name = type_caster_base::name; }; template @@ -1495,10 +1550,10 @@ template struct is_holder_type : template struct is_holder_type> : std::true_type {}; -template struct handle_type_name { static PYBIND11_DESCR name() { return _(); } }; -template <> struct handle_type_name { static PYBIND11_DESCR name() { return _(PYBIND11_BYTES_NAME); } }; -template <> struct handle_type_name { static PYBIND11_DESCR name() { return _("*args"); } }; -template <> struct handle_type_name { static PYBIND11_DESCR name() { return _("**kwargs"); } }; +template struct handle_type_name { static constexpr auto name = _(); }; +template <> struct handle_type_name { static constexpr auto name = _(PYBIND11_BYTES_NAME); }; +template <> struct handle_type_name { static constexpr auto name = _("*args"); }; +template <> struct handle_type_name { static constexpr auto name = _("**kwargs"); }; template struct pyobject_caster { @@ -1516,7 +1571,7 @@ struct pyobject_caster { static handle cast(const handle &src, return_value_policy /* policy */, handle /* parent */) { return src.inc_ref(); } - PYBIND11_TYPE_CASTER(type, handle_type_name::name()); + PYBIND11_TYPE_CASTER(type, handle_type_name::name); }; template @@ -1556,7 +1611,8 @@ template using move_never = none_of, move_if_unrefer // everything else returns a reference/pointer to a local variable. template using cast_is_temporary_value_reference = bool_constant< (std::is_reference::value || std::is_pointer::value) && - !std::is_base_of>::value + !std::is_base_of>::value && + !std::is_same, void>::value >; // When a value returned from a C++ function is being cast back to Python, we almost always want to @@ -1569,8 +1625,9 @@ template struct return_value_policy_ov template struct return_value_policy_override>::value, void>> { static return_value_policy policy(return_value_policy p) { - return !std::is_lvalue_reference::value && !std::is_pointer::value - ? return_value_policy::move : p; + return !std::is_lvalue_reference::value && + !std::is_pointer::value + ? return_value_policy::move : p; } }; @@ -1798,7 +1855,7 @@ struct function_record; /// Internal data associated with a single function call struct function_call { - function_call(function_record &f, handle p); // Implementation in attr.h + function_call(const function_record &f, handle p); // Implementation in attr.h /// The function data: const function_record &func; @@ -1840,7 +1897,7 @@ public: static constexpr bool has_kwargs = kwargs_pos < 0; static constexpr bool has_args = args_pos < 0; - static PYBIND11_DESCR arg_names() { return detail::concat(make_caster::name()...); } + static constexpr auto arg_names = concat(type_descr(make_caster::name)...); bool load_args(function_call &call) { return load_impl_sequence(call, indices{}); @@ -2059,9 +2116,13 @@ object object_api::call(Args &&...args) const { NAMESPACE_END(detail) -#define PYBIND11_MAKE_OPAQUE(Type) \ +#define PYBIND11_MAKE_OPAQUE(...) 
\ namespace pybind11 { namespace detail { \ - template<> class type_caster : public type_caster_base { }; \ + template<> class type_caster<__VA_ARGS__> : public type_caster_base<__VA_ARGS__> { }; \ }} +/// Lets you pass a type containing a `,` through a macro parameter without needing a separate +/// typedef, e.g.: `PYBIND11_OVERLOAD(PYBIND11_TYPE(ReturnType), PYBIND11_TYPE(Parent), f, arg)` +#define PYBIND11_TYPE(...) __VA_ARGS__ + NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/python/src/pybind11/complex.h b/python/src/pybind11/complex.h index 5dac27cc4..3f8963857 100644 --- a/python/src/pybind11/complex.h +++ b/python/src/pybind11/complex.h @@ -25,9 +25,13 @@ template struct format_descriptor, detail::enable_i static std::string format() { return std::string(value); } }; +#ifndef PYBIND11_CPP17 + template constexpr const char format_descriptor< std::complex, detail::enable_if_t::value>>::value[3]; +#endif + NAMESPACE_BEGIN(detail) template struct is_fmt_numeric, detail::enable_if_t::value>> { diff --git a/python/src/pybind11/detail/class.h b/python/src/pybind11/detail/class.h index ff06370fa..b1916fcd0 100644 --- a/python/src/pybind11/detail/class.h +++ b/python/src/pybind11/detail/class.h @@ -10,6 +10,7 @@ #pragma once #include "../attr.h" +#include "../options.h" NAMESPACE_BEGIN(PYBIND11_NAMESPACE) NAMESPACE_BEGIN(detail) @@ -289,13 +290,9 @@ extern "C" inline int pybind11_object_init(PyObject *self, PyObject *, PyObject inline void add_patient(PyObject *nurse, PyObject *patient) { auto &internals = get_internals(); auto instance = reinterpret_cast(nurse); - auto ¤t_patients = internals.patients[nurse]; instance->has_patients = true; - for (auto &p : current_patients) - if (p == patient) - return; Py_INCREF(patient); - current_patients.push_back(patient); + internals.patients[nurse].push_back(patient); } inline void clear_patients(PyObject *self) { @@ -472,7 +469,7 @@ extern "C" inline int pybind11_getbuffer(PyObject *obj, Py_buffer *view, int fla if (tinfo && tinfo->get_buffer) break; } - if (view == nullptr || obj == nullptr || !tinfo || !tinfo->get_buffer) { + if (view == nullptr || !tinfo || !tinfo->get_buffer) { if (view) view->obj = nullptr; PyErr_SetString(PyExc_BufferError, "pybind11_getbuffer(): Internal error"); diff --git a/python/src/pybind11/detail/common.h b/python/src/pybind11/detail/common.h index 892de0f8f..bec8ccf3b 100644 --- a/python/src/pybind11/detail/common.h +++ b/python/src/pybind11/detail/common.h @@ -93,8 +93,8 @@ #endif #define PYBIND11_VERSION_MAJOR 2 -#define PYBIND11_VERSION_MINOR 2 -#define PYBIND11_VERSION_PATCH 4 +#define PYBIND11_VERSION_MINOR 3 +#define PYBIND11_VERSION_PATCH 0 /// Include Python header, disable linking to pythonX_d.lib on Windows in debug mode #if defined(_MSC_VER) @@ -159,6 +159,8 @@ #define PYBIND11_BYTES_SIZE PyBytes_Size #define PYBIND11_LONG_CHECK(o) PyLong_Check(o) #define PYBIND11_LONG_AS_LONGLONG(o) PyLong_AsLongLong(o) +#define PYBIND11_LONG_FROM_SIGNED(o) PyLong_FromSsize_t((ssize_t) o) +#define PYBIND11_LONG_FROM_UNSIGNED(o) PyLong_FromSize_t((size_t) o) #define PYBIND11_BYTES_NAME "bytes" #define PYBIND11_STRING_NAME "str" #define PYBIND11_SLICE_OBJECT PyObject @@ -181,6 +183,8 @@ #define PYBIND11_BYTES_SIZE PyString_Size #define PYBIND11_LONG_CHECK(o) (PyInt_Check(o) || PyLong_Check(o)) #define PYBIND11_LONG_AS_LONGLONG(o) (PyInt_Check(o) ? (long long) PyLong_AsLong(o) : PyLong_AsLongLong(o)) +#define PYBIND11_LONG_FROM_SIGNED(o) PyInt_FromSsize_t((ssize_t) o) // Returns long if needed. 
+#define PYBIND11_LONG_FROM_UNSIGNED(o) PyInt_FromSize_t((size_t) o) // Returns long if needed. #define PYBIND11_BYTES_NAME "str" #define PYBIND11_STRING_NAME "unicode" #define PYBIND11_SLICE_OBJECT PySliceObject @@ -208,6 +212,31 @@ extern "C" { #define PYBIND11_TOSTRING(x) PYBIND11_STRINGIFY(x) #define PYBIND11_CONCAT(first, second) first##second +#define PYBIND11_CHECK_PYTHON_VERSION \ + { \ + const char *compiled_ver = PYBIND11_TOSTRING(PY_MAJOR_VERSION) \ + "." PYBIND11_TOSTRING(PY_MINOR_VERSION); \ + const char *runtime_ver = Py_GetVersion(); \ + size_t len = std::strlen(compiled_ver); \ + if (std::strncmp(runtime_ver, compiled_ver, len) != 0 \ + || (runtime_ver[len] >= '0' && runtime_ver[len] <= '9')) { \ + PyErr_Format(PyExc_ImportError, \ + "Python version mismatch: module was compiled for Python %s, " \ + "but the interpreter version is incompatible: %s.", \ + compiled_ver, runtime_ver); \ + return nullptr; \ + } \ + } + +#define PYBIND11_CATCH_INIT_EXCEPTIONS \ + catch (pybind11::error_already_set &e) { \ + PyErr_SetString(PyExc_ImportError, e.what()); \ + return nullptr; \ + } catch (const std::exception &e) { \ + PyErr_SetString(PyExc_ImportError, e.what()); \ + return nullptr; \ + } \ + /** \rst ***Deprecated in favor of PYBIND11_MODULE*** @@ -227,27 +256,10 @@ extern "C" { PYBIND11_DEPRECATED("PYBIND11_PLUGIN is deprecated, use PYBIND11_MODULE") \ static PyObject *pybind11_init(); \ PYBIND11_PLUGIN_IMPL(name) { \ - int major, minor; \ - if (sscanf(Py_GetVersion(), "%i.%i", &major, &minor) != 2) { \ - PyErr_SetString(PyExc_ImportError, "Can't parse Python version."); \ - return nullptr; \ - } else if (major != PY_MAJOR_VERSION || minor != PY_MINOR_VERSION) { \ - PyErr_Format(PyExc_ImportError, \ - "Python version mismatch: module was compiled for " \ - "version %i.%i, while the interpreter is running " \ - "version %i.%i.", PY_MAJOR_VERSION, PY_MINOR_VERSION, \ - major, minor); \ - return nullptr; \ - } \ + PYBIND11_CHECK_PYTHON_VERSION \ try { \ return pybind11_init(); \ - } catch (pybind11::error_already_set &e) { \ - PyErr_SetString(PyExc_ImportError, e.what()); \ - return nullptr; \ - } catch (const std::exception &e) { \ - PyErr_SetString(PyExc_ImportError, e.what()); \ - return nullptr; \ - } \ + } PYBIND11_CATCH_INIT_EXCEPTIONS \ } \ PyObject *pybind11_init() @@ -271,29 +283,12 @@ extern "C" { #define PYBIND11_MODULE(name, variable) \ static void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &); \ PYBIND11_PLUGIN_IMPL(name) { \ - int major, minor; \ - if (sscanf(Py_GetVersion(), "%i.%i", &major, &minor) != 2) { \ - PyErr_SetString(PyExc_ImportError, "Can't parse Python version."); \ - return nullptr; \ - } else if (major != PY_MAJOR_VERSION || minor != PY_MINOR_VERSION) { \ - PyErr_Format(PyExc_ImportError, \ - "Python version mismatch: module was compiled for " \ - "version %i.%i, while the interpreter is running " \ - "version %i.%i.", PY_MAJOR_VERSION, PY_MINOR_VERSION, \ - major, minor); \ - return nullptr; \ - } \ + PYBIND11_CHECK_PYTHON_VERSION \ auto m = pybind11::module(PYBIND11_TOSTRING(name)); \ try { \ PYBIND11_CONCAT(pybind11_init_, name)(m); \ return m.ptr(); \ - } catch (pybind11::error_already_set &e) { \ - PyErr_SetString(PyExc_ImportError, e.what()); \ - return nullptr; \ - } catch (const std::exception &e) { \ - PyErr_SetString(PyExc_ImportError, e.what()); \ - return nullptr; \ - } \ + } PYBIND11_CATCH_INIT_EXCEPTIONS \ } \ void PYBIND11_CONCAT(pybind11_init_, name)(pybind11::module &variable) @@ -391,7 +386,7 @@ struct instance { void 
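
PYBIND11_CHECK_PYTHON_VERSION above compares the compiled-for "major.minor" prefix against the running interpreter, with a trailing-digit guard so that "3.1" is not accepted as a prefix of "3.10". The same predicate re-rendered in Python (a model of the macro, not the macro itself):

def version_compatible(compiled_ver, runtime_ver):
    # mirror of the strncmp + trailing-digit guard in the macro
    n = len(compiled_ver)
    return (runtime_ver[:n] == compiled_ver
            and not ('0' <= runtime_ver[n:n + 1] <= '9'))

assert version_compatible('3.7', '3.7.4 (default)')
assert not version_compatible('3.1', '3.10.0')
assert not version_compatible('3.7', '2.7.16')
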
*simple_value_holder[1 + instance_simple_holder_in_ptrs()]; nonsimple_values_and_holders nonsimple; }; - /// Weak references (needed for keep alive): + /// Weak references PyObject *weakrefs; /// If true, the pointer is owned which means we're free to manage it with a holder. bool owned : 1; @@ -408,10 +403,10 @@ struct instance { * (which is typically the size of two pointers), or when multiple inheritance is used on the * python side. Non-simple layout allocates the required amount of memory to have multiple * bound C++ classes as parents. Under this layout, `nonsimple.values_and_holders` is set to a - * pointer to allocated space of the required space to hold a a sequence of value pointers and + * pointer to allocated space of the required space to hold a sequence of value pointers and * holders followed `status`, a set of bit flags (1 byte each), i.e. * [val1*][holder1][val2*][holder2]...[bb...] where each [block] is rounded up to a multiple of - * `sizeof(void *)`. `nonsimple.holder_constructed` is, for convenience, a pointer to the + * `sizeof(void *)`. `nonsimple.status` is, for convenience, a pointer to the * beginning of the [bb...] block (but not independently allocated). * * Status bits indicate whether the associated holder is constructed (& @@ -584,6 +579,11 @@ template using deferred_t = typename deferred_type< template using is_strict_base_of = bool_constant< std::is_base_of::value && !std::is_same::value>; +/// Like is_base_of, but also requires that the base type is accessible (i.e. that a Derived pointer +/// can be converted to a Base pointer) +template using is_accessible_base_of = bool_constant< + std::is_base_of::value && std::is_convertible::value>; + template class Base> struct is_template_base_of_impl { template static std::true_type check(Base *); @@ -702,9 +702,13 @@ template struct format_descriptor constexpr const char format_descriptor< T, detail::enable_if_t::value>>::value[2]; +#endif + /// RAII wrapper that temporarily clears any Python error state struct error_scope { PyObject *type, *value, *trace; diff --git a/python/src/pybind11/detail/descr.h b/python/src/pybind11/detail/descr.h index e3bf2ba97..8d404e534 100644 --- a/python/src/pybind11/detail/descr.h +++ b/python/src/pybind11/detail/descr.h @@ -1,6 +1,5 @@ /* - pybind11/detail/descr.h: Helper type for concatenating type signatures - either at runtime (C++11) or compile time (C++14) + pybind11/detail/descr.h: Helper type for concatenating type signatures at compile time Copyright (c) 2016 Wenzel Jakob @@ -15,171 +14,87 @@ NAMESPACE_BEGIN(PYBIND11_NAMESPACE) NAMESPACE_BEGIN(detail) -/* Concatenate type signatures at compile time using C++14 */ -#if defined(PYBIND11_CPP14) && !defined(_MSC_VER) -#define PYBIND11_CONSTEXPR_DESCR +#if !defined(_MSC_VER) +# define PYBIND11_DESCR_CONSTEXPR static constexpr +#else +# define PYBIND11_DESCR_CONSTEXPR const +#endif -template class descr { - template friend class descr; -public: - constexpr descr(char const (&text) [Size1+1], const std::type_info * const (&types)[Size2+1]) - : descr(text, types, - make_index_sequence(), - make_index_sequence()) { } +/* Concatenate type signatures at compile time */ +template +struct descr { + char text[N + 1]; - constexpr const char *text() const { return m_text; } - constexpr const std::type_info * const * types() const { return m_types; } + constexpr descr() : text{'\0'} { } + constexpr descr(char const (&s)[N+1]) : descr(s, make_index_sequence()) { } - template - constexpr descr operator+(const descr &other) const { - return 
concat(other, - make_index_sequence(), - make_index_sequence(), - make_index_sequence(), - make_index_sequence()); + template + constexpr descr(char const (&s)[N+1], index_sequence) : text{s[Is]..., '\0'} { } + + template + constexpr descr(char c, Chars... cs) : text{c, static_cast(cs)..., '\0'} { } + + static constexpr std::array types() { + return {{&typeid(Ts)..., nullptr}}; } - -protected: - template - constexpr descr( - char const (&text) [Size1+1], - const std::type_info * const (&types) [Size2+1], - index_sequence, index_sequence) - : m_text{text[Indices1]..., '\0'}, - m_types{types[Indices2]..., nullptr } {} - - template - constexpr descr - concat(const descr &other, - index_sequence, index_sequence, - index_sequence, index_sequence) const { - return descr( - { m_text[Indices1]..., other.m_text[OtherIndices1]..., '\0' }, - { m_types[Indices2]..., other.m_types[OtherIndices2]..., nullptr } - ); - } - -protected: - char m_text[Size1 + 1]; - const std::type_info * m_types[Size2 + 1]; }; -template constexpr descr _(char const(&text)[Size]) { - return descr(text, { nullptr }); +template +constexpr descr plus_impl(const descr &a, const descr &b, + index_sequence, index_sequence) { + return {a.text[Is1]..., b.text[Is2]...}; } +template +constexpr descr operator+(const descr &a, const descr &b) { + return plus_impl(a, b, make_index_sequence(), make_index_sequence()); +} + +template +constexpr descr _(char const(&text)[N]) { return descr(text); } +constexpr descr<0> _(char const(&)[1]) { return {}; } + template struct int_to_str : int_to_str { }; template struct int_to_str<0, Digits...> { - static constexpr auto digits = descr({ ('0' + Digits)..., '\0' }, { nullptr }); + static constexpr auto digits = descr(('0' + Digits)...); }; // Ternary description (like std::conditional) -template -constexpr enable_if_t> _(char const(&text1)[Size1], char const(&)[Size2]) { +template +constexpr enable_if_t> _(char const(&text1)[N1], char const(&)[N2]) { return _(text1); } -template -constexpr enable_if_t> _(char const(&)[Size1], char const(&text2)[Size2]) { +template +constexpr enable_if_t> _(char const(&)[N1], char const(&text2)[N2]) { return _(text2); } -template -constexpr enable_if_t> _(descr d, descr) { return d; } -template -constexpr enable_if_t> _(descr, descr d) { return d; } + +template +constexpr enable_if_t _(const T1 &d, const T2 &) { return d; } +template +constexpr enable_if_t _(const T1 &, const T2 &d) { return d; } template auto constexpr _() -> decltype(int_to_str::digits) { return int_to_str::digits; } -template constexpr descr<1, 1> _() { - return descr<1, 1>({ '%', '\0' }, { &typeid(Type), nullptr }); +template constexpr descr<1, Type> _() { return {'%'}; } + +constexpr descr<0> concat() { return {}; } + +template +constexpr descr concat(const descr &descr) { return descr; } + +template +constexpr auto concat(const descr &d, const Args &...args) + -> decltype(std::declval>() + concat(args...)) { + return d + _(", ") + concat(args...); } -inline constexpr descr<0, 0> concat() { return _(""); } -template auto constexpr concat(descr descr) { return descr; } -template auto constexpr concat(descr descr, Args&&... 
args) { return descr + _(", ") + concat(args...); } -template auto constexpr type_descr(descr descr) { return _("{") + descr + _("}"); } - -#define PYBIND11_DESCR constexpr auto - -#else /* Simpler C++11 implementation based on run-time memory allocation and copying */ - -class descr { -public: - PYBIND11_NOINLINE descr(const char *text, const std::type_info * const * types) { - size_t nChars = len(text), nTypes = len(types); - m_text = new char[nChars]; - m_types = new const std::type_info *[nTypes]; - memcpy(m_text, text, nChars * sizeof(char)); - memcpy(m_types, types, nTypes * sizeof(const std::type_info *)); - } - - PYBIND11_NOINLINE descr operator+(descr &&d2) && { - descr r; - - size_t nChars1 = len(m_text), nTypes1 = len(m_types); - size_t nChars2 = len(d2.m_text), nTypes2 = len(d2.m_types); - - r.m_text = new char[nChars1 + nChars2 - 1]; - r.m_types = new const std::type_info *[nTypes1 + nTypes2 - 1]; - memcpy(r.m_text, m_text, (nChars1-1) * sizeof(char)); - memcpy(r.m_text + nChars1 - 1, d2.m_text, nChars2 * sizeof(char)); - memcpy(r.m_types, m_types, (nTypes1-1) * sizeof(std::type_info *)); - memcpy(r.m_types + nTypes1 - 1, d2.m_types, nTypes2 * sizeof(std::type_info *)); - - delete[] m_text; delete[] m_types; - delete[] d2.m_text; delete[] d2.m_types; - - return r; - } - - char *text() { return m_text; } - const std::type_info * * types() { return m_types; } - -protected: - PYBIND11_NOINLINE descr() { } - - template static size_t len(const T *ptr) { // return length including null termination - const T *it = ptr; - while (*it++ != (T) 0) - ; - return static_cast(it - ptr); - } - - const std::type_info **m_types = nullptr; - char *m_text = nullptr; -}; - -/* The 'PYBIND11_NOINLINE inline' combinations below are intentional to get the desired linkage while producing as little object code as possible */ - -PYBIND11_NOINLINE inline descr _(const char *text) { - const std::type_info *types[1] = { nullptr }; - return descr(text, types); +template +constexpr descr type_descr(const descr &descr) { + return _("{") + descr + _("}"); } -template PYBIND11_NOINLINE enable_if_t _(const char *text1, const char *) { return _(text1); } -template PYBIND11_NOINLINE enable_if_t _(char const *, const char *text2) { return _(text2); } -template PYBIND11_NOINLINE enable_if_t _(descr d, descr) { return d; } -template PYBIND11_NOINLINE enable_if_t _(descr, descr d) { return d; } - -template PYBIND11_NOINLINE descr _() { - const std::type_info *types[2] = { &typeid(Type), nullptr }; - return descr("%", types); -} - -template PYBIND11_NOINLINE descr _() { - const std::type_info *types[1] = { nullptr }; - return descr(std::to_string(Size).c_str(), types); -} - -PYBIND11_NOINLINE inline descr concat() { return _(""); } -PYBIND11_NOINLINE inline descr concat(descr &&d) { return d; } -template PYBIND11_NOINLINE descr concat(descr &&d, Args&&... 
args) { return std::move(d) + _(", ") + concat(std::forward(args)...); } -PYBIND11_NOINLINE inline descr type_descr(descr&& d) { return _("{") + std::move(d) + _("}"); } - -#define PYBIND11_DESCR ::pybind11::detail::descr -#endif - NAMESPACE_END(detail) NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/python/src/pybind11/detail/init.h b/python/src/pybind11/detail/init.h index 82f740760..acfe00bdb 100644 --- a/python/src/pybind11/detail/init.h +++ b/python/src/pybind11/detail/init.h @@ -24,7 +24,7 @@ public: template using cast_op_type = value_and_holder &; operator value_and_holder &() { return *value; } - static PYBIND11_DESCR name() { return type_descr(_()); } + static constexpr auto name = _(); private: value_and_holder *value = nullptr; diff --git a/python/src/pybind11/detail/internals.h b/python/src/pybind11/detail/internals.h index 78d4afed0..f1dd38764 100644 --- a/python/src/pybind11/detail/internals.h +++ b/python/src/pybind11/detail/internals.h @@ -23,7 +23,7 @@ inline PyObject *make_object_base_type(PyTypeObject *metaclass); #if PY_VERSION_HEX >= 0x03070000 # define PYBIND11_TLS_KEY_INIT(var) Py_tss_t *var = nullptr # define PYBIND11_TLS_GET_VALUE(key) PyThread_tss_get((key)) -# define PYBIND11_TLS_REPLACE_VALUE(key, value) PyThread_tss_set((key), (tstate)) +# define PYBIND11_TLS_REPLACE_VALUE(key, value) PyThread_tss_set((key), (value)) # define PYBIND11_TLS_DELETE_VALUE(key) PyThread_tss_set((key), nullptr) #else // Usually an int but a long on Cygwin64 with Python 3.x @@ -116,7 +116,7 @@ struct internals { struct type_info { PyTypeObject *type; const std::type_info *cpptype; - size_t type_size, holder_size_in_ptrs; + size_t type_size, type_align, holder_size_in_ptrs; void *(*operator_new)(size_t); void (*init_instance)(instance *, const void *); void (*dealloc)(value_and_holder &v_h); @@ -138,7 +138,13 @@ struct type_info { }; /// Tracks the `internals` and `type_info` ABI version independent of the main library version -#define PYBIND11_INTERNALS_VERSION 2 +#define PYBIND11_INTERNALS_VERSION 3 + +#if defined(_DEBUG) +# define PYBIND11_BUILD_TYPE "_debug" +#else +# define PYBIND11_BUILD_TYPE "" +#endif #if defined(WITH_THREAD) # define PYBIND11_INTERNALS_KIND "" @@ -147,10 +153,10 @@ struct type_info { #endif #define PYBIND11_INTERNALS_ID "__pybind11_internals_v" \ - PYBIND11_TOSTRING(PYBIND11_INTERNALS_VERSION) PYBIND11_INTERNALS_KIND "__" + PYBIND11_TOSTRING(PYBIND11_INTERNALS_VERSION) PYBIND11_INTERNALS_KIND PYBIND11_BUILD_TYPE "__" #define PYBIND11_MODULE_LOCAL_ID "__pybind11_module_local_v" \ - PYBIND11_TOSTRING(PYBIND11_INTERNALS_VERSION) PYBIND11_INTERNALS_KIND "__" + PYBIND11_TOSTRING(PYBIND11_INTERNALS_VERSION) PYBIND11_INTERNALS_KIND PYBIND11_BUILD_TYPE "__" /// Each module locally stores a pointer to the `internals` data. The data /// itself is shared among modules with the same `PYBIND11_INTERNALS_ID`. 
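For illustration, the reworked descr above can be reduced to a minimal standalone sketch: a flat character array concatenated element-wise through index_sequence packs. This sketch assumes C++14 and drops the std::type_info pack that the real class also carries.

    #include <cstdio>
    #include <utility>

    template <std::size_t N> struct descr {
        char text[N + 1];
        // Build from an explicit character pack (used when concatenating).
        template <typename... Chars>
        constexpr descr(char c, Chars... cs) : text{c, static_cast<char>(cs)..., '\0'} {}
        // Build from a string literal by expanding its characters.
        template <std::size_t... Is>
        constexpr descr(const char (&s)[N + 1], std::index_sequence<Is...>) : text{s[Is]..., '\0'} {}
        constexpr descr(const char (&s)[N + 1]) : descr(s, std::make_index_sequence<N>()) {}
    };

    // Concatenation splices both character packs into a fresh descr<N1 + N2>.
    template <std::size_t N1, std::size_t N2, std::size_t... Is1, std::size_t... Is2>
    constexpr descr<N1 + N2> plus_impl(const descr<N1> &a, const descr<N2> &b,
                                       std::index_sequence<Is1...>, std::index_sequence<Is2...>) {
        return {a.text[Is1]..., b.text[Is2]...};
    }

    template <std::size_t N1, std::size_t N2>
    constexpr descr<N1 + N2> operator+(const descr<N1> &a, const descr<N2> &b) {
        return plus_impl(a, b, std::make_index_sequence<N1>(), std::make_index_sequence<N2>());
    }

    int main() {
        constexpr auto d = descr<4>("Dict") + descr<1>("[") + descr<1>("]");
        std::puts(d.text); // prints: Dict[]
    }

The whole signature string thus exists as a constexpr array at compile time, which is what lets the call sites in this patch replace PYBIND11_DESCR name() functions with static constexpr auto members.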
diff --git a/python/src/pybind11/detail/typeid.h b/python/src/pybind11/detail/typeid.h index 6f36aab75..9c8a4fc69 100644 --- a/python/src/pybind11/detail/typeid.h +++ b/python/src/pybind11/detail/typeid.h @@ -16,6 +16,8 @@ #include #endif +#include "common.h" + NAMESPACE_BEGIN(PYBIND11_NAMESPACE) NAMESPACE_BEGIN(detail) /// Erase all occurrences of a substring diff --git a/python/src/pybind11/eigen.h b/python/src/pybind11/eigen.h index 0899ec73f..d963d9650 100644 --- a/python/src/pybind11/eigen.h +++ b/python/src/pybind11/eigen.h @@ -17,6 +17,11 @@ # pragma GCC diagnostic push # pragma GCC diagnostic ignored "-Wconversion" # pragma GCC diagnostic ignored "-Wdeprecated-declarations" +# ifdef __clang__ +// Eigen generates a bunch of implicit-copy-constructor-is-deprecated warnings with -Wdeprecated +// under Clang, so disable that warning here: +# pragma GCC diagnostic ignored "-Wdeprecated" +# endif # if __GNUC__ >= 7 # pragma GCC diagnostic ignored "-Wint-in-bool-context" # endif @@ -181,28 +186,26 @@ template struct EigenProps { } } - static PYBIND11_DESCR descriptor() { - constexpr bool show_writeable = is_eigen_dense_map::value && is_eigen_mutable_map::value; - constexpr bool show_order = is_eigen_dense_map::value; - constexpr bool show_c_contiguous = show_order && requires_row_major; - constexpr bool show_f_contiguous = !show_c_contiguous && show_order && requires_col_major; + static constexpr bool show_writeable = is_eigen_dense_map::value && is_eigen_mutable_map::value; + static constexpr bool show_order = is_eigen_dense_map::value; + static constexpr bool show_c_contiguous = show_order && requires_row_major; + static constexpr bool show_f_contiguous = !show_c_contiguous && show_order && requires_col_major; - return type_descr(_("numpy.ndarray[") + npy_format_descriptor::name() + - _("[") + _(_<(size_t) rows>(), _("m")) + - _(", ") + _(_<(size_t) cols>(), _("n")) + - _("]") + - // For a reference type (e.g. Ref) we have other constraints that might need to be - // satisfied: writeable=True (for a mutable reference), and, depending on the map's stride - // options, possibly f_contiguous or c_contiguous. We include them in the descriptor output - // to provide some hint as to why a TypeError is occurring (otherwise it can be confusing to - // see that a function accepts a 'numpy.ndarray[float64[3,2]]' and an error message that you - // *gave* a numpy.ndarray of the right type and dimensions. - _(", flags.writeable", "") + - _(", flags.c_contiguous", "") + - _(", flags.f_contiguous", "") + - _("]") - ); - } + static constexpr auto descriptor = + _("numpy.ndarray[") + npy_format_descriptor::name + + _("[") + _(_<(size_t) rows>(), _("m")) + + _(", ") + _(_<(size_t) cols>(), _("n")) + + _("]") + + // For a reference type (e.g. Ref) we have other constraints that might need to be + // satisfied: writeable=True (for a mutable reference), and, depending on the map's stride + // options, possibly f_contiguous or c_contiguous. We include them in the descriptor output + // to provide some hint as to why a TypeError is occurring (otherwise it can be confusing to + // see that a function accepts a 'numpy.ndarray[float64[3,2]]' and an error message that you + // *gave* a numpy.ndarray of the right type and dimensions. + _(", flags.writeable", "") + + _(", flags.c_contiguous", "") + + _(", flags.f_contiguous", "") + + _("]"); }; // Casts an Eigen type to numpy array. 
If given a base, the numpy array references the src data, @@ -339,7 +342,7 @@ public: return cast_impl(src, policy, parent); } - static PYBIND11_DESCR name() { return props::descriptor(); } + static constexpr auto name = props::descriptor; operator Type*() { return &value; } operator Type&() { return value; } @@ -379,7 +382,7 @@ public: } } - static PYBIND11_DESCR name() { return props::descriptor(); } + static constexpr auto name = props::descriptor; // Explicitly delete these: support python -> C++ conversion on these (i.e. these can be return // types but not bound arguments). We still provide them (with an explicitly delete) so that @@ -524,7 +527,7 @@ public: } static handle cast(const Type *src, return_value_policy policy, handle parent) { return cast(*src, policy, parent); } - static PYBIND11_DESCR name() { return props::descriptor(); } + static constexpr auto name = props::descriptor; // Explicitly delete these: support python -> C++ conversion on these (i.e. these can be return // types but not bound arguments). We still provide them (with an explicitly delete) so that @@ -591,7 +594,7 @@ struct type_caster::value>> { } PYBIND11_TYPE_CASTER(Type, _<(Type::IsRowMajor) != 0>("scipy.sparse.csr_matrix[", "scipy.sparse.csc_matrix[") - + npy_format_descriptor::name() + _("]")); + + npy_format_descriptor::name + _("]")); }; NAMESPACE_END(detail) diff --git a/python/src/pybind11/embed.h b/python/src/pybind11/embed.h index 9abc61c34..72655885e 100644 --- a/python/src/pybind11/embed.h +++ b/python/src/pybind11/embed.h @@ -90,8 +90,14 @@ NAMESPACE_END(detail) Initialize the Python interpreter. No other pybind11 or CPython API functions can be called before this is done; with the exception of `PYBIND11_EMBEDDED_MODULE`. The optional parameter can be used to skip the registration of signal handlers (see the - Python documentation for details). Calling this function again after the interpreter + `Python documentation`_ for details). Calling this function again after the interpreter has already been initialized is a fatal error. + + If initializing the Python interpreter fails, then the program is terminated. (This + is controlled by the CPython runtime and is an exception to pybind11's normal behavior + of throwing exceptions on errors.) + + .. _Python documentation: https://docs.python.org/3/c-api/init.html#c.Py_InitializeEx \endrst */ inline void initialize_interpreter(bool init_signal_handlers = true) { if (Py_IsInitialized()) diff --git a/python/src/pybind11/functional.h b/python/src/pybind11/functional.h index eda14ba58..7a0988ab0 100644 --- a/python/src/pybind11/functional.h +++ b/python/src/pybind11/functional.h @@ -54,9 +54,20 @@ public: } } - value = [func](Args... args) -> Return { + // ensure GIL is held during functor destruction + struct func_handle { + function f; + func_handle(function&& f_) : f(std::move(f_)) {} + func_handle(const func_handle&) = default; + ~func_handle() { + gil_scoped_acquire acq; + function kill_f(std::move(f)); + } + }; + + value = [hfunc = func_handle(std::move(func))](Args... 
args) -> Return { gil_scoped_acquire acq; - object retval(func(std::forward(args)...)); + object retval(hfunc.f(std::forward(args)...)); /* Visual studio 2015 parser issue: need parentheses around this expression */ return (retval.template cast()); }; @@ -75,10 +86,8 @@ public: return cpp_function(std::forward(f_), policy).release(); } - PYBIND11_TYPE_CASTER(type, _("Callable[[") + - argument_loader::arg_names() + _("], ") + - make_caster::name() + - _("]")); + PYBIND11_TYPE_CASTER(type, _("Callable[[") + concat(make_caster::name...) + _("], ") + + make_caster::name + _("]")); }; NAMESPACE_END(detail) diff --git a/python/src/pybind11/iostream.h b/python/src/pybind11/iostream.h index 3caf55639..72baef8fd 100644 --- a/python/src/pybind11/iostream.h +++ b/python/src/pybind11/iostream.h @@ -25,7 +25,8 @@ class pythonbuf : public std::streambuf { private: using traits_type = std::streambuf::traits_type; - char d_buffer[1024]; + const size_t buf_size; + std::unique_ptr d_buffer; object pywrite; object pyflush; @@ -42,8 +43,11 @@ private: // This subtraction cannot be negative, so dropping the sign str line(pbase(), static_cast(pptr() - pbase())); - pywrite(line); - pyflush(); + { + gil_scoped_acquire tmp; + pywrite(line); + pyflush(); + } setp(pbase(), epptr()); } @@ -51,10 +55,13 @@ private: } public: - pythonbuf(object pyostream) - : pywrite(pyostream.attr("write")), + + pythonbuf(object pyostream, size_t buffer_size = 1024) + : buf_size(buffer_size), + d_buffer(new char[buf_size]), + pywrite(pyostream.attr("write")), pyflush(pyostream.attr("flush")) { - setp(d_buffer, d_buffer + sizeof(d_buffer) - 1); + setp(d_buffer.get(), d_buffer.get() + buf_size - 1); } /// Sync before destroy @@ -194,7 +201,7 @@ inline class_ add_ostream_redirect(module m, std::strin return class_(m, name.c_str(), module_local()) .def(init(), arg("stdout")=true, arg("stderr")=true) .def("__enter__", &detail::OstreamRedirect::enter) - .def("__exit__", [](detail::OstreamRedirect &self, args) { self.exit(); }); + .def("__exit__", [](detail::OstreamRedirect &self_, args) { self_.exit(); }); } NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/python/src/pybind11/numpy.h b/python/src/pybind11/numpy.h index 9df493499..b2a02e024 100644 --- a/python/src/pybind11/numpy.h +++ b/python/src/pybind11/numpy.h @@ -18,9 +18,9 @@ #include #include #include -#include #include #include +#include #include #if defined(_MSC_VER) @@ -250,7 +250,7 @@ template struct array_info_scalar { typedef T type; static constexpr bool is_array = false; static constexpr bool is_empty = false; - static PYBIND11_DESCR extents() { return _(""); } + static constexpr auto extents = _(""); static void append_extents(list& /* shape */) { } }; // Computes underlying type and a comma-separated list of extents for array @@ -269,15 +269,9 @@ template struct array_info> { array_info::append_extents(shape); } - template::is_array, int> = 0> - static PYBIND11_DESCR extents() { - return _(); - } - - template::is_array, int> = 0> - static PYBIND11_DESCR extents() { - return concat(_(), array_info::extents()); - } + static constexpr auto extents = _::is_array>( + concat(_(), array_info::extents), _() + ); }; // For numpy we have special handling for arrays of characters, so we don't include // the size in the array extents. @@ -446,7 +440,7 @@ public: /// This is essentially the same as calling numpy.dtype(args) in Python. 
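For illustration, the from_args hunk continuing below also changes args.release().ptr() to args.ptr(): PyArray_DescrConverter_ only borrows its argument, so giving up ownership here appears to have leaked a reference. A hedged usage sketch, run under an embedded interpreter (the int32 check is illustrative):

    #include <pybind11/embed.h>
    #include <pybind11/numpy.h>
    namespace py = pybind11;

    int main() {
        py::scoped_interpreter guard;
        // Equivalent to numpy.dtype("int32") on the Python side.
        py::dtype dt = py::dtype::from_args(py::str("int32"));
        return dt.itemsize() == 4 ? 0 : 1;
    }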
static dtype from_args(object args) { PyObject *ptr = nullptr; - if (!detail::npy_api::get().PyArray_DescrConverter_(args.release().ptr(), &ptr) || !ptr) + if (!detail::npy_api::get().PyArray_DescrConverter_(args.ptr(), &ptr) || !ptr) throw error_already_set(); return reinterpret_steal(ptr); } @@ -861,14 +855,14 @@ public: // Reference to element at a given index template const T& at(Ix... index) const { - if (sizeof...(index) != ndim()) + if ((ssize_t) sizeof...(index) != ndim()) fail_dim_check(sizeof...(index), "index dimension mismatch"); return *(static_cast(array::data()) + byte_offset(ssize_t(index)...) / itemsize()); } // Mutable reference to element at a given index template T& mutable_at(Ix... index) { - if (sizeof...(index) != ndim()) + if ((ssize_t) sizeof...(index) != ndim()) fail_dim_check(sizeof...(index), "index dimension mismatch"); return *(static_cast(array::mutable_data()) + byte_offset(ssize_t(index)...) / itemsize()); } @@ -948,8 +942,8 @@ template struct format_descriptor::is_array>> { static std::string format() { using namespace detail; - PYBIND11_DESCR extents = _("(") + array_info::extents() + _(")"); - return extents.text() + format_descriptor>::format(); + static constexpr auto extents = _("(") + array_info::extents + _(")"); + return extents.text + format_descriptor>::format(); } }; @@ -968,7 +962,7 @@ struct pyobject_caster> { static handle cast(const handle &src, return_value_policy /* policy */, handle /* parent */) { return src.inc_ref(); } - PYBIND11_TYPE_CASTER(type, handle_type_name::name()); + PYBIND11_TYPE_CASTER(type, handle_type_name::name); }; template @@ -978,7 +972,34 @@ struct compare_buffer_info::valu } }; -template struct npy_format_descriptor::value>> { +template +struct npy_format_descriptor_name; + +template +struct npy_format_descriptor_name::value>> { + static constexpr auto name = _::value>( + _("bool"), _::value>("int", "uint") + _() + ); +}; + +template +struct npy_format_descriptor_name::value>> { + static constexpr auto name = _::value || std::is_same::value>( + _("float") + _(), _("longdouble") + ); +}; + +template +struct npy_format_descriptor_name::value>> { + static constexpr auto name = _::value + || std::is_same::value>( + _("complex") + _(), _("longcomplex") + ); +}; + +template +struct npy_format_descriptor::value>> + : npy_format_descriptor_name { private: // NB: the order here must match the one in common.h constexpr static const int values[15] = { @@ -997,25 +1018,10 @@ public: return reinterpret_borrow(ptr); pybind11_fail("Unsupported buffer format!"); } - template ::value, int> = 0> - static PYBIND11_DESCR name() { - return _::value>(_("bool"), - _::value>("int", "uint") + _()); - } - template ::value, int> = 0> - static PYBIND11_DESCR name() { - return _::value || std::is_same::value>( - _("float") + _(), _("longdouble")); - } - template ::value, int> = 0> - static PYBIND11_DESCR name() { - return _::value || std::is_same::value>( - _("complex") + _(), _("longcomplex")); - } }; #define PYBIND11_DECL_CHAR_FMT \ - static PYBIND11_DESCR name() { return _("S") + _(); } \ + static constexpr auto name = _("S") + _(); \ static pybind11::dtype dtype() { return pybind11::dtype(std::string("S") + std::to_string(N)); } template struct npy_format_descriptor { PYBIND11_DECL_CHAR_FMT }; template struct npy_format_descriptor> { PYBIND11_DECL_CHAR_FMT }; @@ -1027,7 +1033,7 @@ private: public: static_assert(!array_info::is_empty, "Zero-sized arrays are not supported"); - static PYBIND11_DESCR name() { return _("(") + 
array_info::extents() + _(")") + base_descr::name(); } + static constexpr auto name = _("(") + array_info::extents + _(")") + base_descr::name; static pybind11::dtype dtype() { list shape; array_info::append_extents(shape); @@ -1039,7 +1045,7 @@ template struct npy_format_descriptor private: using base_descr = npy_format_descriptor::type>; public: - static PYBIND11_DESCR name() { return base_descr::name(); } + static constexpr auto name = base_descr::name; static pybind11::dtype dtype() { return base_descr::dtype(); } }; @@ -1052,7 +1058,7 @@ struct field_descriptor { }; inline PYBIND11_NOINLINE void register_structured_dtype( - const std::initializer_list& fields, + any_container fields, const std::type_info& tinfo, ssize_t itemsize, bool (*direct_converter)(PyObject *, void *&)) { @@ -1061,7 +1067,7 @@ inline PYBIND11_NOINLINE void register_structured_dtype( pybind11_fail("NumPy: dtype is already registered"); list names, formats, offsets; - for (auto field : fields) { + for (auto field : *fields) { if (!field.descr) pybind11_fail(std::string("NumPy: unsupported field dtype: `") + field.name + "` @ " + tinfo.name()); @@ -1078,7 +1084,7 @@ inline PYBIND11_NOINLINE void register_structured_dtype( // - https://github.com/numpy/numpy/pull/7798 // Because of this, we won't use numpy's logic to generate buffer format // strings and will just do it ourselves. - std::vector ordered_fields(fields); + std::vector ordered_fields(std::move(fields)); std::sort(ordered_fields.begin(), ordered_fields.end(), [](const field_descriptor &a, const field_descriptor &b) { return a.offset < b.offset; }); ssize_t offset = 0; @@ -1114,7 +1120,7 @@ inline PYBIND11_NOINLINE void register_structured_dtype( template struct npy_format_descriptor { static_assert(is_pod_struct::value, "Attempt to use a non-POD or unimplemented POD type as a numpy dtype"); - static PYBIND11_DESCR name() { return make_caster::name(); } + static constexpr auto name = make_caster::name; static pybind11::dtype dtype() { return reinterpret_borrow(dtype_ptr()); @@ -1125,8 +1131,8 @@ template struct npy_format_descriptor { return format_str; } - static void register_dtype(const std::initializer_list& fields) { - register_structured_dtype(fields, typeid(typename std::remove_cv::type), + static void register_dtype(any_container fields) { + register_structured_dtype(std::move(fields), typeid(typename std::remove_cv::type), sizeof(T), &direct_converter); } @@ -1199,7 +1205,8 @@ private: #define PYBIND11_NUMPY_DTYPE(Type, ...) \ ::pybind11::detail::npy_format_descriptor::register_dtype \ - ({PYBIND11_MAP_LIST (PYBIND11_FIELD_DESCRIPTOR, Type, __VA_ARGS__)}) + (::std::vector<::pybind11::detail::field_descriptor> \ + {PYBIND11_MAP_LIST (PYBIND11_FIELD_DESCRIPTOR, Type, __VA_ARGS__)}) #ifdef _MSC_VER #define PYBIND11_MAP2_LIST_NEXT1(test, next) \ @@ -1220,7 +1227,8 @@ private: #define PYBIND11_NUMPY_DTYPE_EX(Type, ...) \ ::pybind11::detail::npy_format_descriptor::register_dtype \ - ({PYBIND11_MAP2_LIST (PYBIND11_FIELD_DESCRIPTOR_EX, Type, __VA_ARGS__)}) + (::std::vector<::pybind11::detail::field_descriptor> \ + {PYBIND11_MAP2_LIST (PYBIND11_FIELD_DESCRIPTOR_EX, Type, __VA_ARGS__)}) #endif // __CLION_IDE__ @@ -1458,7 +1466,10 @@ public: private: remove_reference_t f; - template using param_n_t = typename pack_element::call_type...>::type; + // Internal compiler error in MSVC 19.16.27025.1 (Visual Studio 2017 15.9.4), when compiling with "/permissive-" flag + // when arg_call_types is manually inlined. 
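For illustration, with register_structured_dtype now taking any_container, the PYBIND11_NUMPY_DTYPE macros above expand to an explicit std::vector<field_descriptor>. A usage sketch (the Vec2 struct and module name are assumptions):

    #include <pybind11/numpy.h>
    namespace py = pybind11;

    struct Vec2 { float x, y; };

    PYBIND11_MODULE(example, m) {
        // Expands to npy_format_descriptor<Vec2>::register_dtype(
        //     std::vector<field_descriptor>{ ... }) after this change.
        PYBIND11_NUMPY_DTYPE(Vec2, x, y);
        m.def("zeros", [] { return py::array_t<Vec2>(4); });
    }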
+ using arg_call_types = std::tuple::call_type...>; + template using param_n_t = typename std::tuple_element::type; // Runs a vectorized function given arguments tuple and three index sequences: // - Index is the full set of 0 ... (N-1) argument indices; @@ -1498,7 +1509,7 @@ private: if (trivial == broadcast_trivial::f_trivial) result = array_t(shape); else result = array_t(shape); - if (size == 0) return result; + if (size == 0) return std::move(result); /* Call the function */ if (trivial == broadcast_trivial::non_trivial) @@ -1506,7 +1517,7 @@ private: else apply_trivial(buffers, params, result.mutable_data(), size, i_seq, vi_seq, bi_seq); - return result; + return std::move(result); } template @@ -1559,9 +1570,7 @@ vectorize_extractor(const Func &f, Return (*) (Args ...)) { } template struct handle_type_name> { - static PYBIND11_DESCR name() { - return _("numpy.ndarray[") + npy_format_descriptor::name() + _("]"); - } + static constexpr auto name = _("numpy.ndarray[") + npy_format_descriptor::name + _("]"); }; NAMESPACE_END(detail) diff --git a/python/src/pybind11/pybind11.h b/python/src/pybind11/pybind11.h index 9094fc424..f1d91c788 100644 --- a/python/src/pybind11/pybind11.h +++ b/python/src/pybind11/pybind11.h @@ -10,7 +10,17 @@ #pragma once -#if defined(_MSC_VER) +#if defined(__INTEL_COMPILER) +# pragma warning push +# pragma warning disable 68 // integer conversion resulted in a change of sign +# pragma warning disable 186 // pointless comparison of unsigned integer with zero +# pragma warning disable 878 // incompatible exception specifications +# pragma warning disable 1334 // the "template" keyword used for syntactic disambiguation may only be used within a template +# pragma warning disable 1682 // implicit conversion of a 64-bit integral type to a smaller integral type (potential portability problem) +# pragma warning disable 1786 // function "strdup" was declared deprecated +# pragma warning disable 1875 // offsetof applied to non-POD (Plain Old Data) types is nonstandard +# pragma warning disable 2196 // warning #2196: routine is both "inline" and "noinline" +#elif defined(_MSC_VER) # pragma warning(push) # pragma warning(disable: 4100) // warning C4100: Unreferenced formal parameter # pragma warning(disable: 4127) // warning C4127: Conditional expression is constant @@ -19,15 +29,6 @@ # pragma warning(disable: 4996) // warning C4996: The POSIX name for this item is deprecated. 
Instead, use the ISO C and C++ conformant name # pragma warning(disable: 4702) // warning C4702: unreachable code # pragma warning(disable: 4522) // warning C4522: multiple assignment operators specified -#elif defined(__INTEL_COMPILER) -# pragma warning(push) -# pragma warning(disable: 68) // integer conversion resulted in a change of sign -# pragma warning(disable: 186) // pointless comparison of unsigned integer with zero -# pragma warning(disable: 878) // incompatible exception specifications -# pragma warning(disable: 1334) // the "template" keyword used for syntactic disambiguation may only be used within a template -# pragma warning(disable: 1682) // implicit conversion of a 64-bit integral type to a smaller integral type (potential portability problem) -# pragma warning(disable: 1875) // offsetof applied to non-POD (Plain Old Data) types is nonstandard -# pragma warning(disable: 2196) // warning #2196: routine is both "inline" and "noinline" #elif defined(__GNUG__) && !defined(__clang__) # pragma GCC diagnostic push # pragma GCC diagnostic ignored "-Wunused-but-set-parameter" @@ -40,6 +41,11 @@ # endif #endif +#if defined(__GNUG__) && !defined(__clang__) + #include +#endif + + #include "attr.h" #include "options.h" #include "detail/class.h" @@ -51,6 +57,7 @@ NAMESPACE_BEGIN(PYBIND11_NAMESPACE) class cpp_function : public function { public: cpp_function() { } + cpp_function(std::nullptr_t) { } /// Construct a cpp_function from a vanilla function pointer template @@ -93,7 +100,6 @@ protected: template void initialize(Func &&f, Return (*)(Args...), const Extra&... extra) { using namespace detail; - struct capture { remove_reference_t f; }; /* Store the function including any extra state it might have (e.g. a lambda capture object) */ @@ -164,10 +170,11 @@ protected: process_attributes::init(extra..., rec); /* Generate a readable signature describing the function's arguments and return value types */ - PYBIND11_DESCR signature = _("(") + cast_in::arg_names() + _(") -> ") + cast_out::name(); + static constexpr auto signature = _("(") + cast_in::arg_names + _(") -> ") + cast_out::name; + PYBIND11_DESCR_CONSTEXPR auto types = decltype(signature)::types(); /* Register the function with Python from generic (non-templated) code */ - initialize_generic(rec, signature.text(), signature.types(), sizeof...(Args)); + initialize_generic(rec, signature.text, types.data(), sizeof...(Args)); if (cast_in::has_args) rec->has_args = true; if (cast_in::has_kwargs) rec->has_kwargs = true; @@ -217,34 +224,30 @@ protected: /* Generate a proper function signature */ std::string signature; - size_t type_depth = 0, char_index = 0, type_index = 0, arg_index = 0; - while (true) { - char c = text[char_index++]; - if (c == '\0') - break; + size_t type_index = 0, arg_index = 0; + for (auto *pc = text; *pc != '\0'; ++pc) { + const auto c = *pc; if (c == '{') { - // Write arg name for everything except *args, **kwargs and return type. - if (type_depth == 0 && text[char_index] != '*' && arg_index < args) { - if (!rec->args.empty() && rec->args[arg_index].name) { - signature += rec->args[arg_index].name; - } else if (arg_index == 0 && rec->is_method) { - signature += "self"; - } else { - signature += "arg" + std::to_string(arg_index - (rec->is_method ? 1 : 0)); - } - signature += ": "; + // Write arg name for everything except *args and **kwargs. 
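For illustration, the rewritten loop below walks the '{' and '}' markers emitted by the constexpr signature and splices in argument names and defaults; note the default-value separator becomes " = " rather than "=". Given a binding such as the following (names are illustrative),

    #include <pybind11/pybind11.h>
    namespace py = pybind11;

    PYBIND11_MODULE(example, m) {
        m.def("add", [](int a, int b) { return a + b; },
              py::arg("a"), py::arg("b") = 1);
    }

the generated docstring signature reads roughly: add(a: int, b: int = 1) -> int.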
+ if (*(pc + 1) == '*') + continue; + + if (arg_index < rec->args.size() && rec->args[arg_index].name) { + signature += rec->args[arg_index].name; + } else if (arg_index == 0 && rec->is_method) { + signature += "self"; + } else { + signature += "arg" + std::to_string(arg_index - (rec->is_method ? 1 : 0)); } - ++type_depth; + signature += ": "; } else if (c == '}') { - --type_depth; - if (type_depth == 0) { - if (arg_index < rec->args.size() && rec->args[arg_index].descr) { - signature += "="; - signature += rec->args[arg_index].descr; - } - arg_index++; + // Write default value if available. + if (arg_index < rec->args.size() && rec->args[arg_index].descr) { + signature += " = "; + signature += rec->args[arg_index].descr; } + arg_index++; } else if (c == '%') { const std::type_info *t = types[type_index++]; if (!t) @@ -269,14 +272,9 @@ protected: signature += c; } } - if (type_depth != 0 || types[type_index] != nullptr) + if (arg_index != args || types[type_index] != nullptr) pybind11_fail("Internal error while parsing type signature (2)"); - #if !defined(PYBIND11_CONSTEXPR_DESCR) - delete[] types; - delete[] text; - #endif - #if PY_MAJOR_VERSION < 3 if (strcmp(rec->name, "__next__") == 0) { std::free(rec->name); @@ -428,8 +426,8 @@ protected: using namespace detail; /* Iterator over the list of potentially admissible overloads */ - function_record *overloads = (function_record *) PyCapsule_GetPointer(self, nullptr), - *it = overloads; + const function_record *overloads = (function_record *) PyCapsule_GetPointer(self, nullptr), + *it = overloads; /* Need to know how many arguments + keyword arguments there are to pick the right overload */ const size_t n_args_in = (size_t) PyTuple_GET_SIZE(args_in); @@ -485,7 +483,7 @@ protected: result other than PYBIND11_TRY_NEXT_OVERLOAD. */ - function_record &func = *it; + const function_record &func = *it; size_t pos_args = func.nargs; // Number of positional arguments that we need if (func.has_args) --pos_args; // (but don't count py::args if (func.has_kwargs) --pos_args; // or py::kwargs) @@ -517,7 +515,7 @@ protected: // 1. Copy any position arguments given. bool bad_arg = false; for (; args_copied < args_to_copy; ++args_copied) { - argument_record *arg_rec = args_copied < func.args.size() ? &func.args[args_copied] : nullptr; + const argument_record *arg_rec = args_copied < func.args.size() ? &func.args[args_copied] : nullptr; if (kwargs_in && arg_rec && arg_rec->name && PyDict_GetItemString(kwargs_in, arg_rec->name)) { bad_arg = true; break; @@ -658,13 +656,22 @@ protected: result = PYBIND11_TRY_NEXT_OVERLOAD; } - if (result.ptr() != PYBIND11_TRY_NEXT_OVERLOAD) + if (result.ptr() != PYBIND11_TRY_NEXT_OVERLOAD) { + // The error reporting logic below expects 'it' to be valid, as it would be + // if we'd encountered this failure in the first-pass loop. + if (!result) + it = &call.func; break; + } } } } catch (error_already_set &e) { e.restore(); return nullptr; +#if defined(__GNUG__) && !defined(__clang__) + } catch ( abi::__forced_unwind& ) { + throw; +#endif } catch (...) { /* When an exception is caught, give each registered exception translator a chance to translate it to a Python exception @@ -711,7 +718,7 @@ protected: " arguments. The following argument types are supported:\n"; int ctr = 0; - for (function_record *it2 = overloads; it2 != nullptr; it2 = it2->next) { + for (const function_record *it2 = overloads; it2 != nullptr; it2 = it2->next) { msg += " "+ std::to_string(++ctr) + ". 
"; bool wrote_sig = false; @@ -899,6 +906,7 @@ protected: tinfo->type = (PyTypeObject *) m_ptr; tinfo->cpptype = rec.type; tinfo->type_size = rec.type_size; + tinfo->type_align = rec.type_align; tinfo->operator_new = rec.operator_new; tinfo->holder_size_in_ptrs = size_in_ptrs(rec.holder_size); tinfo->init_instance = rec.init_instance; @@ -961,18 +969,18 @@ protected: tinfo->get_buffer_data = get_buffer_data; } + // rec_func must be set for either fget or fset. void def_property_static_impl(const char *name, handle fget, handle fset, - detail::function_record *rec_fget) { - const auto is_static = !(rec_fget->is_method && rec_fget->scope); - const auto has_doc = rec_fget->doc && pybind11::options::show_user_defined_docstrings(); - + detail::function_record *rec_func) { + const auto is_static = rec_func && !(rec_func->is_method && rec_func->scope); + const auto has_doc = rec_func && rec_func->doc && pybind11::options::show_user_defined_docstrings(); auto property = handle((PyObject *) (is_static ? get_internals().static_property_type : &PyProperty_Type)); attr(name) = property(fget.ptr() ? fget : none(), fset.ptr() ? fset : none(), /*deleter*/none(), - pybind11::str(has_doc ? rec_fget->doc : "")); + pybind11::str(has_doc ? rec_func->doc : "")); } }; @@ -990,11 +998,21 @@ template struct has_operator_delete_size::value, int> = 0> -void call_operator_delete(T *p, size_t) { T::operator delete(p); } +void call_operator_delete(T *p, size_t, size_t) { T::operator delete(p); } template ::value && has_operator_delete_size::value, int> = 0> -void call_operator_delete(T *p, size_t s) { T::operator delete(p, s); } +void call_operator_delete(T *p, size_t s, size_t) { T::operator delete(p, s); } -inline void call_operator_delete(void *p, size_t) { ::operator delete(p); } +inline void call_operator_delete(void *p, size_t s, size_t a) { + (void)s; (void)a; +#if defined(PYBIND11_CPP17) + if (a > __STDCPP_DEFAULT_NEW_ALIGNMENT__) + ::operator delete(p, s, std::align_val_t(a)); + else + ::operator delete(p, s); +#else + ::operator delete(p); +#endif +} NAMESPACE_END(detail) @@ -1004,10 +1022,18 @@ template auto method_adaptor(F &&f) -> decltype(std::forward(f)) { return std::forward(f); } template -auto method_adaptor(Return (Class::*pmf)(Args...)) -> Return (Derived::*)(Args...) { return pmf; } +auto method_adaptor(Return (Class::*pmf)(Args...)) -> Return (Derived::*)(Args...) { + static_assert(detail::is_accessible_base_of::value, + "Cannot bind an inaccessible base class method; use a lambda definition instead"); + return pmf; +} template -auto method_adaptor(Return (Class::*pmf)(Args...) const) -> Return (Derived::*)(Args...) const { return pmf; } +auto method_adaptor(Return (Class::*pmf)(Args...) const) -> Return (Derived::*)(Args...) const { + static_assert(detail::is_accessible_base_of::value, + "Cannot bind an inaccessible base class method; use a lambda definition instead"); + return pmf; +} template class class_ : public detail::generic_type { @@ -1049,10 +1075,11 @@ public: record.name = name; record.type = &typeid(type); record.type_size = sizeof(conditional_t); + record.type_align = alignof(conditional_t&); record.holder_size = sizeof(holder_type); record.init_instance = init_instance; record.dealloc = dealloc; - record.default_holder = std::is_same>::value; + record.default_holder = detail::is_instantiation::value; set_operator_new(&record); @@ -1094,7 +1121,7 @@ public: "def_static(...) 
called with a non-static member function pointer"); cpp_function cf(std::forward(f), name(name_), scope(*this), sibling(getattr(*this, name_, none())), extra...); - attr(cf.name()) = cf; + attr(cf.name()) = staticmethod(cf); return *this; } @@ -1158,7 +1185,7 @@ public: template class_ &def_readwrite(const char *name, D C::*pm, const Extra&... extra) { - static_assert(std::is_base_of::value, "def_readwrite() requires a class member (or base class member)"); + static_assert(std::is_same::value || std::is_base_of::value, "def_readwrite() requires a class member (or base class member)"); cpp_function fget([pm](const type &c) -> const D &{ return c.*pm; }, is_method(*this)), fset([pm](type &c, const D &value) { c.*pm = value; }, is_method(*this)); def_property(name, fget, fset, return_value_policy::reference_internal, extra...); @@ -1167,7 +1194,7 @@ public: template class_ &def_readonly(const char *name, const D C::*pm, const Extra& ...extra) { - static_assert(std::is_base_of::value, "def_readonly() requires a class member (or base class member)"); + static_assert(std::is_same::value || std::is_base_of::value, "def_readonly() requires a class member (or base class member)"); cpp_function fget([pm](const type &c) -> const D &{ return c.*pm; }, is_method(*this)); def_property_readonly(name, fget, return_value_policy::reference_internal, extra...); return *this; @@ -1198,7 +1225,7 @@ public: /// Uses cpp_function's return_value_policy by default template class_ &def_property_readonly(const char *name, const cpp_function &fget, const Extra& ...extra) { - return def_property(name, fget, cpp_function(), extra...); + return def_property(name, fget, nullptr, extra...); } /// Uses return_value_policy::reference by default @@ -1210,7 +1237,7 @@ public: /// Uses cpp_function's return_value_policy by default template class_ &def_property_readonly_static(const char *name, const cpp_function &fget, const Extra& ...extra) { - return def_property_static(name, fget, cpp_function(), extra...); + return def_property_static(name, fget, nullptr, extra...); } /// Uses return_value_policy::reference_internal by default @@ -1239,22 +1266,28 @@ public: /// Uses cpp_function's return_value_policy by default template class_ &def_property_static(const char *name, const cpp_function &fget, const cpp_function &fset, const Extra& ...extra) { + static_assert( 0 == detail::constexpr_sum(std::is_base_of::value...), + "Argument annotations are not allowed for properties"); auto rec_fget = get_function_record(fget), rec_fset = get_function_record(fset); - char *doc_prev = rec_fget->doc; /* 'extra' field may include a property-specific documentation string */ - detail::process_attributes::init(extra..., rec_fget); - if (rec_fget->doc && rec_fget->doc != doc_prev) { - free(doc_prev); - rec_fget->doc = strdup(rec_fget->doc); + auto *rec_active = rec_fget; + if (rec_fget) { + char *doc_prev = rec_fget->doc; /* 'extra' field may include a property-specific documentation string */ + detail::process_attributes::init(extra..., rec_fget); + if (rec_fget->doc && rec_fget->doc != doc_prev) { + free(doc_prev); + rec_fget->doc = strdup(rec_fget->doc); + } } if (rec_fset) { - doc_prev = rec_fset->doc; + char *doc_prev = rec_fset->doc; detail::process_attributes::init(extra..., rec_fset); if (rec_fset->doc && rec_fset->doc != doc_prev) { free(doc_prev); rec_fset->doc = strdup(rec_fset->doc); } + if (! 
rec_active) rec_active = rec_fset; } - def_property_static_impl(name, fget, fset, rec_fget); + def_property_static_impl(name, fget, fset, rec_active); return *this; } @@ -1320,7 +1353,10 @@ private: v_h.set_holder_constructed(false); } else { - detail::call_operator_delete(v_h.value_ptr(), v_h.type->type_size); + detail::call_operator_delete(v_h.value_ptr(), + v_h.type->type_size, + v_h.type->type_align + ); } v_h.value_ptr() = nullptr; } @@ -1356,93 +1392,190 @@ detail::initimpl::pickle_factory pickle(GetState &&g, SetSta return {std::forward(g), std::forward(s)}; } +NAMESPACE_BEGIN(detail) +struct enum_base { + enum_base(handle base, handle parent) : m_base(base), m_parent(parent) { } + + PYBIND11_NOINLINE void init(bool is_arithmetic, bool is_convertible) { + m_base.attr("__entries") = dict(); + auto property = handle((PyObject *) &PyProperty_Type); + auto static_property = handle((PyObject *) get_internals().static_property_type); + + m_base.attr("__repr__") = cpp_function( + [](handle arg) -> str { + handle type = arg.get_type(); + object type_name = type.attr("__name__"); + dict entries = type.attr("__entries"); + for (const auto &kv : entries) { + object other = kv.second[int_(0)]; + if (other.equal(arg)) + return pybind11::str("{}.{}").format(type_name, kv.first); + } + return pybind11::str("{}.???").format(type_name); + }, is_method(m_base) + ); + + m_base.attr("name") = property(cpp_function( + [](handle arg) -> str { + dict entries = arg.get_type().attr("__entries"); + for (const auto &kv : entries) { + if (handle(kv.second[int_(0)]).equal(arg)) + return pybind11::str(kv.first); + } + return "???"; + }, is_method(m_base) + )); + + m_base.attr("__doc__") = static_property(cpp_function( + [](handle arg) -> std::string { + std::string docstring; + dict entries = arg.attr("__entries"); + if (((PyTypeObject *) arg.ptr())->tp_doc) + docstring += std::string(((PyTypeObject *) arg.ptr())->tp_doc) + "\n\n"; + docstring += "Members:"; + for (const auto &kv : entries) { + auto key = std::string(pybind11::str(kv.first)); + auto comment = kv.second[int_(1)]; + docstring += "\n\n " + key; + if (!comment.is_none()) + docstring += " : " + (std::string) pybind11::str(comment); + } + return docstring; + } + ), none(), none(), ""); + + m_base.attr("__members__") = static_property(cpp_function( + [](handle arg) -> dict { + dict entries = arg.attr("__entries"), m; + for (const auto &kv : entries) + m[kv.first] = kv.second[int_(0)]; + return m; + }), none(), none(), "" + ); + + #define PYBIND11_ENUM_OP_STRICT(op, expr, strict_behavior) \ + m_base.attr(op) = cpp_function( \ + [](object a, object b) { \ + if (!a.get_type().is(b.get_type())) \ + strict_behavior; \ + return expr; \ + }, \ + is_method(m_base)) + + #define PYBIND11_ENUM_OP_CONV(op, expr) \ + m_base.attr(op) = cpp_function( \ + [](object a_, object b_) { \ + int_ a(a_), b(b_); \ + return expr; \ + }, \ + is_method(m_base)) + + if (is_convertible) { + PYBIND11_ENUM_OP_CONV("__eq__", !b.is_none() && a.equal(b)); + PYBIND11_ENUM_OP_CONV("__ne__", b.is_none() || !a.equal(b)); + + if (is_arithmetic) { + PYBIND11_ENUM_OP_CONV("__lt__", a < b); + PYBIND11_ENUM_OP_CONV("__gt__", a > b); + PYBIND11_ENUM_OP_CONV("__le__", a <= b); + PYBIND11_ENUM_OP_CONV("__ge__", a >= b); + PYBIND11_ENUM_OP_CONV("__and__", a & b); + PYBIND11_ENUM_OP_CONV("__rand__", a & b); + PYBIND11_ENUM_OP_CONV("__or__", a | b); + PYBIND11_ENUM_OP_CONV("__ror__", a | b); + PYBIND11_ENUM_OP_CONV("__xor__", a ^ b); + PYBIND11_ENUM_OP_CONV("__rxor__", a ^ b); + } + } else { + 
PYBIND11_ENUM_OP_STRICT("__eq__", int_(a).equal(int_(b)), return false); + PYBIND11_ENUM_OP_STRICT("__ne__", !int_(a).equal(int_(b)), return true); + + if (is_arithmetic) { + #define PYBIND11_THROW throw type_error("Expected an enumeration of matching type!"); + PYBIND11_ENUM_OP_STRICT("__lt__", int_(a) < int_(b), PYBIND11_THROW); + PYBIND11_ENUM_OP_STRICT("__gt__", int_(a) > int_(b), PYBIND11_THROW); + PYBIND11_ENUM_OP_STRICT("__le__", int_(a) <= int_(b), PYBIND11_THROW); + PYBIND11_ENUM_OP_STRICT("__ge__", int_(a) >= int_(b), PYBIND11_THROW); + #undef PYBIND11_THROW + } + } + + #undef PYBIND11_ENUM_OP_CONV + #undef PYBIND11_ENUM_OP_STRICT + + object getstate = cpp_function( + [](object arg) { return int_(arg); }, is_method(m_base)); + + m_base.attr("__getstate__") = getstate; + m_base.attr("__hash__") = getstate; + } + + PYBIND11_NOINLINE void value(char const* name_, object value, const char *doc = nullptr) { + dict entries = m_base.attr("__entries"); + str name(name_); + if (entries.contains(name)) { + std::string type_name = (std::string) str(m_base.attr("__name__")); + throw value_error(type_name + ": element \"" + std::string(name_) + "\" already exists!"); + } + + entries[name] = std::make_pair(value, doc); + m_base.attr(name) = value; + } + + PYBIND11_NOINLINE void export_values() { + dict entries = m_base.attr("__entries"); + for (const auto &kv : entries) + m_parent.attr(kv.first) = kv.second[int_(0)]; + } + + handle m_base; + handle m_parent; +}; + +NAMESPACE_END(detail) + /// Binds C++ enumerations and enumeration classes to Python template class enum_ : public class_ { public: - using class_::def; - using class_::def_property_readonly_static; + using Base = class_; + using Base::def; + using Base::attr; + using Base::def_property_readonly; + using Base::def_property_readonly_static; using Scalar = typename std::underlying_type::type; template enum_(const handle &scope, const char *name, const Extra&... 
extra) - : class_(scope, name, extra...), m_entries(), m_parent(scope) { - + : class_(scope, name, extra...), m_base(*this, scope) { constexpr bool is_arithmetic = detail::any_of...>::value; + constexpr bool is_convertible = std::is_convertible::value; + m_base.init(is_arithmetic, is_convertible); - auto m_entries_ptr = m_entries.inc_ref().ptr(); - def("__repr__", [name, m_entries_ptr](Type value) -> pybind11::str { - for (const auto &kv : reinterpret_borrow(m_entries_ptr)) { - if (pybind11::cast(kv.second) == value) - return pybind11::str("{}.{}").format(name, kv.first); - } - return pybind11::str("{}.???").format(name); - }); - def_property_readonly_static("__members__", [m_entries_ptr](object /* self */) { - dict m; - for (const auto &kv : reinterpret_borrow(m_entries_ptr)) - m[kv.first] = kv.second; - return m; - }, return_value_policy::copy); def(init([](Scalar i) { return static_cast(i); })); def("__int__", [](Type value) { return (Scalar) value; }); #if PY_MAJOR_VERSION < 3 def("__long__", [](Type value) { return (Scalar) value; }); #endif - def("__eq__", [](const Type &value, Type *value2) { return value2 && value == *value2; }); - def("__ne__", [](const Type &value, Type *value2) { return !value2 || value != *value2; }); - if (is_arithmetic) { - def("__lt__", [](const Type &value, Type *value2) { return value2 && value < *value2; }); - def("__gt__", [](const Type &value, Type *value2) { return value2 && value > *value2; }); - def("__le__", [](const Type &value, Type *value2) { return value2 && value <= *value2; }); - def("__ge__", [](const Type &value, Type *value2) { return value2 && value >= *value2; }); - } - if (std::is_convertible::value) { - // Don't provide comparison with the underlying type if the enum isn't convertible, - // i.e. if Type is a scoped enum, mirroring the C++ behaviour. (NB: we explicitly - // convert Type to Scalar below anyway because this needs to compile). 
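For illustration, the per-instance entry dict removed in this hunk is replaced by detail::enum_base, which attaches __entries, __repr__, name, __doc__ and __members__ to the enum type itself, and value() gains an optional per-entry docstring. A usage sketch (the Pet enum and module name are assumptions):

    #include <pybind11/pybind11.h>
    namespace py = pybind11;

    enum class Pet { Cat, Dog };

    PYBIND11_MODULE(example, m) {
        py::enum_<Pet>(m, "Pet")
            .value("Cat", Pet::Cat, "A quiet pet") // per-entry docstring is new here
            .value("Dog", Pet::Dog)
            .export_values(); // re-exports Cat and Dog into the module scope
    }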
- def("__eq__", [](const Type &value, Scalar value2) { return (Scalar) value == value2; }); - def("__ne__", [](const Type &value, Scalar value2) { return (Scalar) value != value2; }); - if (is_arithmetic) { - def("__lt__", [](const Type &value, Scalar value2) { return (Scalar) value < value2; }); - def("__gt__", [](const Type &value, Scalar value2) { return (Scalar) value > value2; }); - def("__le__", [](const Type &value, Scalar value2) { return (Scalar) value <= value2; }); - def("__ge__", [](const Type &value, Scalar value2) { return (Scalar) value >= value2; }); - def("__invert__", [](const Type &value) { return ~((Scalar) value); }); - def("__and__", [](const Type &value, Scalar value2) { return (Scalar) value & value2; }); - def("__or__", [](const Type &value, Scalar value2) { return (Scalar) value | value2; }); - def("__xor__", [](const Type &value, Scalar value2) { return (Scalar) value ^ value2; }); - def("__rand__", [](const Type &value, Scalar value2) { return (Scalar) value & value2; }); - def("__ror__", [](const Type &value, Scalar value2) { return (Scalar) value | value2; }); - def("__rxor__", [](const Type &value, Scalar value2) { return (Scalar) value ^ value2; }); - def("__and__", [](const Type &value, const Type &value2) { return (Scalar) value & (Scalar) value2; }); - def("__or__", [](const Type &value, const Type &value2) { return (Scalar) value | (Scalar) value2; }); - def("__xor__", [](const Type &value, const Type &value2) { return (Scalar) value ^ (Scalar) value2; }); - } - } - def("__hash__", [](const Type &value) { return (Scalar) value; }); - // Pickling and unpickling -- needed for use with the 'multiprocessing' module - def(pickle([](const Type &value) { return pybind11::make_tuple((Scalar) value); }, - [](tuple t) { return static_cast(t[0].cast()); })); + cpp_function setstate( + [](Type &value, Scalar arg) { value = static_cast(arg); }, + is_method(*this)); + attr("__setstate__") = setstate; } /// Export enumeration entries into the parent scope enum_& export_values() { - for (const auto &kv : m_entries) - m_parent.attr(kv.first) = kv.second; + m_base.export_values(); return *this; } /// Add an enumeration entry - enum_& value(char const* name, Type value) { - auto v = pybind11::cast(value, return_value_policy::copy); - this->attr(name) = v; - m_entries[pybind11::str(name)] = v; + enum_& value(char const* name, Type value, const char *doc = nullptr) { + m_base.value(name, pybind11::cast(value, return_value_policy::copy), doc); return *this; } private: - dict m_entries; - handle m_parent; + detail::enum_base m_base; }; NAMESPACE_BEGIN(detail) @@ -1749,6 +1882,15 @@ public: auto const &internals = detail::get_internals(); tstate = (PyThreadState *) PYBIND11_TLS_GET_VALUE(internals.tstate); + if (!tstate) { + /* Check if the GIL was acquired using the PyGILState_* API instead (e.g. if + calling from a Python thread). Since we use a different key, this ensures + we don't create a new thread state and deadlock in PyEval_AcquireThread + below. Note we don't save this state with internals.tstate, since we don't + create it we would fail to clear it (its reference count should be > 0). 
*/ + tstate = PyGILState_GetThisThreadState(); + } + if (!tstate) { tstate = PyThreadState_New(internals.istate); #if !defined(NDEBUG) @@ -1856,12 +1998,12 @@ class gil_scoped_release { }; #endif error_already_set::~error_already_set() { - if (type) { + if (m_type) { error_scope scope; gil_scoped_acquire gil; - type.release().dec_ref(); - value.release().dec_ref(); - trace.release().dec_ref(); + m_type.release().dec_ref(); + m_value.release().dec_ref(); + m_trace.release().dec_ref(); } } @@ -1922,6 +2064,14 @@ inline function get_type_overload(const void *this_ptr, const detail::type_info return overload; } +/** \rst + Try to retrieve a python method by the provided name from the instance pointed to by the this_ptr. + + :this_ptr: The pointer to the object the overload should be retrieved for. This should be the first + non-trampoline class encountered in the inheritance chain. + :name: The name of the overloaded Python method to retrieve. + :return: The Python method by this name from the object or an empty function wrapper. + \endrst */ template function get_overload(const T *this_ptr, const char *name) { auto tinfo = detail::get_type_info(typeid(T)); return tinfo ? get_type_overload(this_ptr, tinfo, name) : function(); @@ -1940,26 +2090,73 @@ template function get_overload(const T *this_ptr, const char *name) { } \ } +/** \rst + Macro to populate the virtual method in the trampoline class. This macro tries to look up a method named 'fn' + from the Python side, deals with the :ref:`gil` and necessary argument conversions to call this method and return + the appropriate type. See :ref:`overriding_virtuals` for more information. This macro should be used when the method + name in C is not the same as the method name in Python. For example with `__str__`. + + .. code-block:: cpp + + std::string toString() override { + PYBIND11_OVERLOAD_NAME( + std::string, // Return type (ret_type) + Animal, // Parent class (cname) + toString, // Name of function in C++ (name) + "__str__", // Name of method in Python (fn) + ); + } +\endrst */ #define PYBIND11_OVERLOAD_NAME(ret_type, cname, name, fn, ...) \ - PYBIND11_OVERLOAD_INT(ret_type, cname, name, __VA_ARGS__) \ + PYBIND11_OVERLOAD_INT(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), name, __VA_ARGS__) \ return cname::fn(__VA_ARGS__) +/** \rst + Macro for pure virtual functions, this function is identical to :c:macro:`PYBIND11_OVERLOAD_NAME`, except that it + throws if no overload can be found. +\endrst */ #define PYBIND11_OVERLOAD_PURE_NAME(ret_type, cname, name, fn, ...) \ - PYBIND11_OVERLOAD_INT(ret_type, cname, name, __VA_ARGS__) \ - pybind11::pybind11_fail("Tried to call pure virtual function \"" #cname "::" name "\""); + PYBIND11_OVERLOAD_INT(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), name, __VA_ARGS__) \ + pybind11::pybind11_fail("Tried to call pure virtual function \"" PYBIND11_STRINGIFY(cname) "::" name "\""); +/** \rst + Macro to populate the virtual method in the trampoline class. This macro tries to look up the method + from the Python side, deals with the :ref:`gil` and necessary argument conversions to call this method and return + the appropriate type. This macro should be used if the method name in C and in Python are identical. + See :ref:`overriding_virtuals` for more information. + + .. 
code-block:: cpp + + class PyAnimal : public Animal { + public: + // Inherit the constructors + using Animal::Animal; + + // Trampoline (need one for each virtual function) + std::string go(int n_times) override { + PYBIND11_OVERLOAD_PURE( + std::string, // Return type (ret_type) + Animal, // Parent class (cname) + go, // Name of function in C++ (must match Python name) (fn) + n_times // Argument(s) (...) + ); + } + }; +\endrst */ #define PYBIND11_OVERLOAD(ret_type, cname, fn, ...) \ - PYBIND11_OVERLOAD_NAME(ret_type, cname, #fn, fn, __VA_ARGS__) + PYBIND11_OVERLOAD_NAME(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), #fn, fn, __VA_ARGS__) +/** \rst + Macro for pure virtual functions, this function is identical to :c:macro:`PYBIND11_OVERLOAD`, except that it throws + if no overload can be found. +\endrst */ #define PYBIND11_OVERLOAD_PURE(ret_type, cname, fn, ...) \ - PYBIND11_OVERLOAD_PURE_NAME(ret_type, cname, #fn, fn, __VA_ARGS__) + PYBIND11_OVERLOAD_PURE_NAME(PYBIND11_TYPE(ret_type), PYBIND11_TYPE(cname), #fn, fn, __VA_ARGS__) NAMESPACE_END(PYBIND11_NAMESPACE) -#if defined(_MSC_VER) +#if defined(_MSC_VER) && !defined(__INTEL_COMPILER) # pragma warning(pop) -#elif defined(__INTEL_COMPILER) -/* Leave ignored warnings on */ #elif defined(__GNUG__) && !defined(__clang__) # pragma GCC diagnostic pop #endif diff --git a/python/src/pybind11/pytypes.h b/python/src/pybind11/pytypes.h index d7fa17775..2d573dfad 100644 --- a/python/src/pybind11/pytypes.h +++ b/python/src/pybind11/pytypes.h @@ -114,6 +114,35 @@ public: bool is(object_api const& other) const { return derived().ptr() == other.derived().ptr(); } /// Equivalent to ``obj is None`` in Python. bool is_none() const { return derived().ptr() == Py_None; } + /// Equivalent to obj == other in Python + bool equal(object_api const &other) const { return rich_compare(other, Py_EQ); } + bool not_equal(object_api const &other) const { return rich_compare(other, Py_NE); } + bool operator<(object_api const &other) const { return rich_compare(other, Py_LT); } + bool operator<=(object_api const &other) const { return rich_compare(other, Py_LE); } + bool operator>(object_api const &other) const { return rich_compare(other, Py_GT); } + bool operator>=(object_api const &other) const { return rich_compare(other, Py_GE); } + + object operator-() const; + object operator~() const; + object operator+(object_api const &other) const; + object operator+=(object_api const &other) const; + object operator-(object_api const &other) const; + object operator-=(object_api const &other) const; + object operator*(object_api const &other) const; + object operator*=(object_api const &other) const; + object operator/(object_api const &other) const; + object operator/=(object_api const &other) const; + object operator|(object_api const &other) const; + object operator|=(object_api const &other) const; + object operator&(object_api const &other) const; + object operator&=(object_api const &other) const; + object operator^(object_api const &other) const; + object operator^=(object_api const &other) const; + object operator<<(object_api const &other) const; + object operator<<=(object_api const &other) const; + object operator>>(object_api const &other) const; + object operator>>=(object_api const &other) const; + PYBIND11_DEPRECATED("Use py::str(obj) instead") pybind11::str str() const; @@ -124,6 +153,9 @@ public: int ref_count() const { return static_cast(Py_REFCNT(derived().ptr())); } /// Return a handle to the Python type object underlying the instance handle get_type() 
const; + +private: + bool rich_compare(object_api const &other, int value) const; }; NAMESPACE_END(detail) @@ -292,15 +324,18 @@ public: /// Constructs a new exception from the current Python error indicator, if any. The current /// Python error indicator will be cleared. error_already_set() : std::runtime_error(detail::error_string()) { - PyErr_Fetch(&type.ptr(), &value.ptr(), &trace.ptr()); + PyErr_Fetch(&m_type.ptr(), &m_value.ptr(), &m_trace.ptr()); } + error_already_set(const error_already_set &) = default; + error_already_set(error_already_set &&) = default; + inline ~error_already_set(); /// Give the currently-held error back to Python, if any. If there is currently a Python error /// already set it is cleared first. After this call, the current object no longer stores the /// error variables (but the `.what()` string is still available). - void restore() { PyErr_Restore(type.release().ptr(), value.release().ptr(), trace.release().ptr()); } + void restore() { PyErr_Restore(m_type.release().ptr(), m_value.release().ptr(), m_trace.release().ptr()); } // Does nothing; provided for backwards compatibility. PYBIND11_DEPRECATED("Use of error_already_set.clear() is deprecated") @@ -309,10 +344,14 @@ public: /// Check if the currently trapped error type matches the given Python exception class (or a /// subclass thereof). May also be passed a tuple to search for any exception class matches in /// the given tuple. - bool matches(handle ex) const { return PyErr_GivenExceptionMatches(ex.ptr(), type.ptr()); } + bool matches(handle exc) const { return PyErr_GivenExceptionMatches(m_type.ptr(), exc.ptr()); } + + const object& type() const { return m_type; } + const object& value() const { return m_value; } + const object& trace() const { return m_trace; } private: - object type, value, trace; + object m_type, m_value, m_trace; }; /** \defgroup python_builtins _ @@ -353,6 +392,14 @@ inline bool hasattr(handle obj, const char *name) { return PyObject_HasAttrString(obj.ptr(), name) == 1; } +inline void delattr(handle obj, handle name) { + if (PyObject_DelAttr(obj.ptr(), name.ptr()) != 0) { throw error_already_set(); } +} + +inline void delattr(handle obj, const char *name) { + if (PyObject_DelAttrString(obj.ptr(), name) != 0) { throw error_already_set(); } +} + inline object getattr(handle obj, handle name) { PyObject *result = PyObject_GetAttr(obj.ptr(), name.ptr()); if (!result) { throw error_already_set(); } @@ -424,7 +471,6 @@ object object_or_cast(T &&o); // Match a PyObject*, which we want to convert directly to handle via its converting constructor inline handle object_or_cast(PyObject *ptr) { return ptr; } - template class accessor : public object_api> { using key_type = typename Policy::key_type; @@ -662,7 +708,7 @@ protected: private: handle obj; - PyObject *key, *value; + PyObject *key = nullptr, *value = nullptr; ssize_t pos = -1; }; NAMESPACE_END(iterator_policies) @@ -690,9 +736,14 @@ inline bool PyIterable_Check(PyObject *obj) { } inline bool PyNone_Check(PyObject *o) { return o == Py_None; } +#if PY_MAJOR_VERSION >= 3 +inline bool PyEllipsis_Check(PyObject *o) { return o == Py_Ellipsis; } +#endif inline bool PyUnicode_Check_Permissive(PyObject *o) { return PyUnicode_Check(o) || PYBIND11_BYTES_CHECK(o); } +inline bool PyStaticMethod_Check(PyObject *o) { return o->ob_type == &PyStaticMethod_Type; } + class kwargs_proxy : public handle { public: explicit kwargs_proxy(handle h) : handle(h) { } @@ -964,6 +1015,14 @@ public: none() : object(Py_None, borrowed_t{}) { } }; +#if 
PY_MAJOR_VERSION >= 3 +class ellipsis : public object { +public: + PYBIND11_OBJECT(ellipsis, object, detail::PyEllipsis_Check) + ellipsis() : object(Py_Ellipsis, borrowed_t{}) { } +}; +#endif + class bool_ : public object { public: PYBIND11_OBJECT_CVT(bool_, object, PyBool_Check, raw_bool) @@ -1074,6 +1133,13 @@ public: (ssize_t *) stop, (ssize_t *) step, (ssize_t *) slicelength) == 0; } + bool compute(ssize_t length, ssize_t *start, ssize_t *stop, ssize_t *step, + ssize_t *slicelength) const { + return PySlice_GetIndicesEx((PYBIND11_SLICE_OBJECT *) m_ptr, + length, start, + stop, step, + slicelength) == 0; + } }; class capsule : public object { @@ -1137,6 +1203,7 @@ public: } size_t size() const { return (size_t) PyTuple_Size(m_ptr); } detail::tuple_accessor operator[](size_t index) const { return {*this, index}; } + detail::item_accessor operator[](handle h) const { return object::operator[](h); } detail::tuple_iterator begin() const { return {*this, 0}; } detail::tuple_iterator end() const { return {*this, PyTuple_GET_SIZE(m_ptr)}; } }; @@ -1174,6 +1241,7 @@ public: PYBIND11_OBJECT_DEFAULT(sequence, object, PySequence_Check) size_t size() const { return (size_t) PySequence_Size(m_ptr); } detail::sequence_accessor operator[](size_t index) const { return {*this, index}; } + detail::item_accessor operator[](handle h) const { return object::operator[](h); } detail::sequence_iterator begin() const { return {*this, 0}; } detail::sequence_iterator end() const { return {*this, PySequence_Size(m_ptr)}; } }; @@ -1186,6 +1254,7 @@ public: } size_t size() const { return (size_t) PyList_Size(m_ptr); } detail::list_accessor operator[](size_t index) const { return {*this, index}; } + detail::item_accessor operator[](handle h) const { return object::operator[](h); } detail::list_iterator begin() const { return {*this, 0}; } detail::list_iterator end() const { return {*this, PyList_GET_SIZE(m_ptr)}; } template void append(T &&val) const { @@ -1221,6 +1290,11 @@ public: bool is_cpp_function() const { return (bool) cpp_function(); } }; +class staticmethod : public object { +public: + PYBIND11_OBJECT_CVT(staticmethod, object, detail::PyStaticMethod_Check, PyStaticMethod_New) +}; + class buffer : public object { public: PYBIND11_OBJECT_DEFAULT(buffer, object, PyObject_CheckBuffer) @@ -1279,6 +1353,21 @@ inline size_t len(handle h) { return (size_t) result; } +inline size_t len_hint(handle h) { +#if PY_VERSION_HEX >= 0x03040000 + ssize_t result = PyObject_LengthHint(h.ptr(), 0); +#else + ssize_t result = PyObject_Length(h.ptr()); +#endif + if (result < 0) { + // Sometimes a length can't be determined at all (eg generators) + // In which case simply return 0 + PyErr_Clear(); + return 0; + } + return (size_t) result; +} + inline str repr(handle h) { PyObject *str_value = PyObject_Repr(h.ptr()); if (!str_value) throw error_already_set(); @@ -1328,5 +1417,55 @@ str_attr_accessor object_api::doc() const { return attr("__doc__"); } template handle object_api::get_type() const { return (PyObject *) Py_TYPE(derived().ptr()); } +template +bool object_api::rich_compare(object_api const &other, int value) const { + int rv = PyObject_RichCompareBool(derived().ptr(), other.derived().ptr(), value); + if (rv == -1) + throw error_already_set(); + return rv == 1; +} + +#define PYBIND11_MATH_OPERATOR_UNARY(op, fn) \ + template object object_api::op() const { \ + object result = reinterpret_steal(fn(derived().ptr())); \ + if (!result.ptr()) \ + throw error_already_set(); \ + return result; \ + } + +#define 
PYBIND11_MATH_OPERATOR_BINARY(op, fn) \ + template \ + object object_api::op(object_api const &other) const { \ + object result = reinterpret_steal( \ + fn(derived().ptr(), other.derived().ptr())); \ + if (!result.ptr()) \ + throw error_already_set(); \ + return result; \ + } + +PYBIND11_MATH_OPERATOR_UNARY (operator~, PyNumber_Invert) +PYBIND11_MATH_OPERATOR_UNARY (operator-, PyNumber_Negative) +PYBIND11_MATH_OPERATOR_BINARY(operator+, PyNumber_Add) +PYBIND11_MATH_OPERATOR_BINARY(operator+=, PyNumber_InPlaceAdd) +PYBIND11_MATH_OPERATOR_BINARY(operator-, PyNumber_Subtract) +PYBIND11_MATH_OPERATOR_BINARY(operator-=, PyNumber_InPlaceSubtract) +PYBIND11_MATH_OPERATOR_BINARY(operator*, PyNumber_Multiply) +PYBIND11_MATH_OPERATOR_BINARY(operator*=, PyNumber_InPlaceMultiply) +PYBIND11_MATH_OPERATOR_BINARY(operator/, PyNumber_TrueDivide) +PYBIND11_MATH_OPERATOR_BINARY(operator/=, PyNumber_InPlaceTrueDivide) +PYBIND11_MATH_OPERATOR_BINARY(operator|, PyNumber_Or) +PYBIND11_MATH_OPERATOR_BINARY(operator|=, PyNumber_InPlaceOr) +PYBIND11_MATH_OPERATOR_BINARY(operator&, PyNumber_And) +PYBIND11_MATH_OPERATOR_BINARY(operator&=, PyNumber_InPlaceAnd) +PYBIND11_MATH_OPERATOR_BINARY(operator^, PyNumber_Xor) +PYBIND11_MATH_OPERATOR_BINARY(operator^=, PyNumber_InPlaceXor) +PYBIND11_MATH_OPERATOR_BINARY(operator<<, PyNumber_Lshift) +PYBIND11_MATH_OPERATOR_BINARY(operator<<=, PyNumber_InPlaceLshift) +PYBIND11_MATH_OPERATOR_BINARY(operator>>, PyNumber_Rshift) +PYBIND11_MATH_OPERATOR_BINARY(operator>>=, PyNumber_InPlaceRshift) + +#undef PYBIND11_MATH_OPERATOR_UNARY +#undef PYBIND11_MATH_OPERATOR_BINARY + NAMESPACE_END(detail) NAMESPACE_END(PYBIND11_NAMESPACE) diff --git a/python/src/pybind11/stl.h b/python/src/pybind11/stl.h index 1a4bbf0db..32f8d294a 100644 --- a/python/src/pybind11/stl.h +++ b/python/src/pybind11/stl.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #if defined(_MSC_VER) @@ -83,7 +84,8 @@ template struct set_caster { template static handle cast(T &&src, return_value_policy policy, handle parent) { - policy = return_value_policy_override::policy(policy); + if (!std::is_lvalue_reference::value) + policy = return_value_policy_override::policy(policy); pybind11::set s; for (auto &&value : src) { auto value_ = reinterpret_steal(key_conv::cast(forward_like(value), policy, parent)); @@ -93,7 +95,7 @@ template struct set_caster { return s.release(); } - PYBIND11_TYPE_CASTER(type, _("Set[") + key_conv::name() + _("]")); + PYBIND11_TYPE_CASTER(type, _("Set[") + key_conv::name + _("]")); }; template struct map_caster { @@ -119,8 +121,12 @@ template struct map_caster { template static handle cast(T &&src, return_value_policy policy, handle parent) { dict d; - return_value_policy policy_key = return_value_policy_override::policy(policy); - return_value_policy policy_value = return_value_policy_override::policy(policy); + return_value_policy policy_key = policy; + return_value_policy policy_value = policy; + if (!std::is_lvalue_reference::value) { + policy_key = return_value_policy_override::policy(policy_key); + policy_value = return_value_policy_override::policy(policy_value); + } for (auto &&kv : src) { auto key = reinterpret_steal(key_conv::cast(forward_like(kv.first), policy_key, parent)); auto value = reinterpret_steal(value_conv::cast(forward_like(kv.second), policy_value, parent)); @@ -131,14 +137,14 @@ template struct map_caster { return d.release(); } - PYBIND11_TYPE_CASTER(Type, _("Dict[") + key_conv::name() + _(", ") + value_conv::name() + _("]")); + PYBIND11_TYPE_CASTER(Type, 
_("Dict[") + key_conv::name + _(", ") + value_conv::name + _("]")); }; template struct list_caster { using value_conv = make_caster; bool load(handle src, bool convert) { - if (!isinstance(src)) + if (!isinstance(src) || isinstance(src)) return false; auto s = reinterpret_borrow(src); value.clear(); @@ -161,7 +167,8 @@ private: public: template static handle cast(T &&src, return_value_policy policy, handle parent) { - policy = return_value_policy_override::policy(policy); + if (!std::is_lvalue_reference::value) + policy = return_value_policy_override::policy(policy); list l(src.size()); size_t index = 0; for (auto &&value : src) { @@ -173,12 +180,15 @@ public: return l.release(); } - PYBIND11_TYPE_CASTER(Type, _("List[") + value_conv::name() + _("]")); + PYBIND11_TYPE_CASTER(Type, _("List[") + value_conv::name + _("]")); }; template struct type_caster> : list_caster, Type> { }; +template struct type_caster> + : list_caster, Type> { }; + template struct type_caster> : list_caster, Type> { }; @@ -199,9 +209,9 @@ private: public: bool load(handle src, bool convert) { - if (!isinstance(src)) + if (!isinstance(src)) return false; - auto l = reinterpret_borrow(src); + auto l = reinterpret_borrow(src); if (!require_size(l.size())) return false; size_t ctr = 0; @@ -227,7 +237,7 @@ public: return l.release(); } - PYBIND11_TYPE_CASTER(ArrayType, _("List[") + value_conv::name() + _(_(""), _("[") + _() + _("]")) + _("]")); + PYBIND11_TYPE_CASTER(ArrayType, _("List[") + value_conv::name + _(_(""), _("[") + _() + _("]")) + _("]")); }; template struct type_caster> @@ -274,7 +284,7 @@ template struct optional_caster { return true; } - PYBIND11_TYPE_CASTER(T, _("Optional[") + value_conv::name() + _("]")); + PYBIND11_TYPE_CASTER(T, _("Optional[") + value_conv::name + _("]")); }; #if PYBIND11_HAS_OPTIONAL @@ -354,7 +364,7 @@ struct variant_caster> { } using Type = V; - PYBIND11_TYPE_CASTER(Type, _("Union[") + detail::concat(make_caster::name()...) + _("]")); + PYBIND11_TYPE_CASTER(Type, _("Union[") + detail::concat(make_caster::name...) + _("]")); }; #if PYBIND11_HAS_VARIANT diff --git a/python/src/pybind11/stl_bind.h b/python/src/pybind11/stl_bind.h index 38dd68f69..1f8725260 100644 --- a/python/src/pybind11/stl_bind.h +++ b/python/src/pybind11/stl_bind.h @@ -122,7 +122,7 @@ void vector_modifiers(enable_if_t(new Vector()); - v->reserve(len(it)); + v->reserve(len_hint(it)); for (handle h : it) v->push_back(h.cast()); return v.release(); @@ -136,6 +136,28 @@ void vector_modifiers(enable_if_t()); + } + } catch (const cast_error &) { + v.erase(v.begin() + static_cast(old_size), v.end()); + try { + v.shrink_to_fit(); + } catch (const std::exception &) { + // Do nothing + } + throw; + } + }, + arg("L"), + "Extend the list by appending all the items in the given list" + ); + cl.def("insert", [](Vector &v, SizeType i, const T &x) { if (i > v.size()) @@ -579,6 +601,15 @@ class_ bind_map(handle scope, const std::string &name, Args&&. 
return_value_policy::reference_internal // ref + keepalive ); + cl.def("__contains__", + [](Map &m, const KeyType &k) -> bool { + auto it = m.find(k); + if (it == m.end()) + return false; + return true; + } + ); + // Assignment provided only if the type is copyable detail::map_assignment(cl); diff --git a/python/triton/kernel.py b/python/triton/kernel.py index 355bc3675..93292d8c2 100644 --- a/python/triton/kernel.py +++ b/python/triton/kernel.py @@ -217,6 +217,7 @@ class kernel: if fw.has_tensorflow(): return self.fw_op(*op_args, id=op_id) elif fw.has_torch(): - return self.fw_op(op_id, *op_args) + args = [x.contiguous() if isinstance(x, fw.torch.Tensor) else x for x in op_args] + return self.fw_op(op_id, *args) else: assert False \ No newline at end of file diff --git a/python/triton/ops/dot.py b/python/triton/ops/dot.py index 36bde11fe..b37f2e32a 100644 --- a/python/triton/ops/dot.py +++ b/python/triton/ops/dot.py @@ -20,8 +20,8 @@ void dot(TYPE * A, TYPE * B, TYPE * C, TYPE* pa[SHAPE_A] = A + rka[BROADCAST_AK] * STRIDE_AK + rxa[BROADCAST_AM] * STRIDE_AM; TYPE* pb[SHAPE_B] = B + rkb[BROADCAST_BK] * STRIDE_BK + ryb[BROADCAST_BN] * STRIDE_BN; // prefetches operands - TYPE a[SHAPE_A] = *pa; - TYPE b[SHAPE_B] = *pb; + TYPE a[SHAPE_A] = (*pa); + TYPE b[SHAPE_B] = (*pb); // reduction loop for(int k = K; k > 0; k-= TK){ c += USE_A @ USE_B; @@ -80,16 +80,19 @@ void dot(TYPE * A, TYPE * B, TYPE * C, 'SHAPE_B' : 'TN, TK' if transpose_b else 'TK, TN'} return _dot.kernel(a, b, c, M, N, Ka, lda, ldb, ldc, grid, AT = transpose_a, BT = transpose_b, TYPE = dtype, - TM = [64, 128], TN = [64, 128], TK = [8], **macros) + TM = [128], TN = [128], TK = [8], **macros) @staticmethod def forward(ctx, a, b, transpose_a = False, transpose_b = False): - ctx.save_for_backward(a, b, transpose_a, transpose_b) + ctx.save_for_backward(a, b) + ctx.t_a = transpose_a + ctx.t_b = transpose_b return _dot._call(a, b, transpose_a, transpose_b) @staticmethod def backward(ctx, dy): - a, b, t_a, t_b = ctx.saved_tensors + a, b = ctx.saved_tensors + t_a, t_b = ctx.t_a, ctx.t_b if not t_a and not t_b: da = _dot._call(dy, b, False, True) db = _dot._call(a, dy, True, False) @@ -104,6 +107,6 @@ void dot(TYPE * A, TYPE * B, TYPE * C, db = _dot._call(dy, a, True, True) else: assert False - return [da, db, None, None, None, None, None, None, None] + return da, db, None, None, None, None, None, None, None dot = _dot.apply \ No newline at end of file From 9ab2880fba760cbdd71373773471813f58dd98a6 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 5 Sep 2019 12:54:35 -0400 Subject: [PATCH 356/494] [python][examples] cleaned up dot example --- python/examples/dot.py | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/python/examples/dot.py b/python/examples/dot.py index 5c2dce459..49ed5b298 100644 --- a/python/examples/dot.py +++ b/python/examples/dot.py @@ -28,23 +28,33 @@ def run_tf(): print("dif: %f" % np.max(dif)) def run_torch(): - import torch as th - th.manual_seed(0) + import torch + torch.manual_seed(0) M, N, K = 128, 128, 128 - a = th.randn(M, K).cuda() - b = th.randn(K, N).cuda() + a = torch.randn(M, K).cuda() + b = torch.randn(K, N).cuda() + a.requires_grad_(True) b.requires_grad_(True) - #th_c = th.matmul(a, th.t(b)) - #th_d = th.matmul(th.t(th_c), b) - tr_c = triton.ops.dot(a, b, False, True) - #tr_d = triton.ops.dot(tr_c, b, True, False) - y = th.sum(tr_c) - #print('backprop', y) - y.backward() - #print('backward done') - print(b.grad) - #th_d.backward() - 
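The stl_bind.h hunks above also make bound containers more Pythonic: bound vectors gain extend(), which pre-reserves storage via len_hint and rolls the vector back if an element fails to cast, and bound maps gain __contains__, so Python's `in` operator is answered directly by Map::find rather than raising through item access. A minimal usage sketch, assuming a hypothetical module named demo:

    #include <map>
    #include <string>
    #include <vector>
    #include <pybind11/pybind11.h>
    #include <pybind11/stl_bind.h>
    namespace py = pybind11;

    PYBIND11_MAKE_OPAQUE(std::vector<int>);
    PYBIND11_MAKE_OPAQUE(std::map<std::string, int>);

    PYBIND11_MODULE(demo, m) {
        // Python: v = VectorInt(); v.extend([1, 2, 3])
        py::bind_vector<std::vector<int>>(m, "VectorInt");
        // Python: "k" in MapStringInt() now uses the new __contains__
        py::bind_map<std::map<std::string, int>>(m, "MapStringInt");
    }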
#print(a.grad) + torch_c = torch.matmul(a, torch.t(b)) + torch_d = torch.matmul(torch.t(torch_c), b) + torch_y = torch.mean(torch_d) + triton_c = triton.ops.dot(a, b, False, True) + triton_d = triton.ops.dot(triton_c, b, True, False) + triton_y = torch.mean(triton_d) + + # torch gradient + torch_y.backward() + torch_da = a.grad.clone() + torch_db = b.grad.clone() + # triton gradient + a.grad.zero_() + b.grad.zero_() + triton_y.backward() + triton_da = a.grad.clone() + triton_db = b.grad.clone() + + print('Diff DA:', (torch_da - triton_da).max()) + print('Diff DB:', (torch_db - triton_db).max()) run_torch() \ No newline at end of file From 7bfbb8961221511c9c42269348cf9786176c9f9b Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 5 Sep 2019 15:37:00 -0400 Subject: [PATCH 357/494] [python] now packaging include and libtriton in triton._C submodule --- python/setup.py | 22 ++++++++++++++++------ python/triton/__init__.py | 2 +- python/triton/frameworks.py | 2 +- python/triton/kernel.py | 2 +- python/triton/utils.py | 2 +- 5 files changed, 20 insertions(+), 10 deletions(-) diff --git a/python/setup.py b/python/setup.py index 2ae0dba63..50d2a775a 100644 --- a/python/setup.py +++ b/python/setup.py @@ -5,16 +5,17 @@ import sysconfig import platform import subprocess import distutils - +import glob from distutils.version import LooseVersion from setuptools import setup, Extension, find_packages from setuptools.command.build_ext import build_ext from setuptools.command.test import test as TestCommand class CMakeExtension(Extension): - def __init__(self, name, sourcedir=''): + def __init__(self, name, path, sourcedir=''): Extension.__init__(self, name, sources=[]) self.sourcedir = os.path.abspath(sourcedir) + self.path = path class CMakeBuild(build_ext): @@ -36,7 +37,7 @@ class CMakeBuild(build_ext): def build_extension(self, ext): self.debug = True - extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) + extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.path))) # python directories python_include_dirs = distutils.sysconfig.get_python_inc() python_lib_dirs = distutils.sysconfig.get_config_var('LIBDIR') @@ -78,6 +79,15 @@ class CMakeBuild(build_ext): subprocess.check_call(['cmake', sourcedir] + cmake_args, cwd=self.build_temp, env=env) subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp) + +directories = [x[0] for x in os.walk(os.path.join(os.path.pardir, 'include'))] +data = [] +for d in directories: + files = glob.glob(os.path.join(d, '*.h'), recursive=False) + dest = os.path.relpath(d, os.path.pardir) + dest = os.path.join('triton', '_C', dest) + data += [(dest, files)] + setup( name='triton', version='0.1', @@ -85,9 +95,9 @@ setup( author_email='ptillet@g.harvard.edu', description='A language and compiler for custom Deep Learning operations', long_description='', - packages=['triton', - 'triton/ops'], - ext_modules=[CMakeExtension('triton')], + packages=['triton', 'triton/ops'], + data_files=data, + ext_modules=[CMakeExtension('triton', 'triton/_C/')], cmdclass=dict(build_ext=CMakeBuild), zip_safe=False, ) diff --git a/python/triton/__init__.py b/python/triton/__init__.py index aa05eefe1..cb4097e72 100644 --- a/python/triton/__init__.py +++ b/python/triton/__init__.py @@ -6,7 +6,7 @@ import triton.ops # clean-up libtriton resources import atexit -import libtriton +import triton._C.libtriton as libtriton @atexit.register def cleanup(): libtriton.cleanup() \ No newline at end of file diff --git a/python/triton/frameworks.py 
b/python/triton/frameworks.py index fcab5dcbf..993389a82 100644 --- a/python/triton/frameworks.py +++ b/python/triton/frameworks.py @@ -1,6 +1,6 @@ import sys import os -import libtriton +import triton._C.libtriton as libtriton torch = None tensorflow = None diff --git a/python/triton/kernel.py b/python/triton/kernel.py index 93292d8c2..cf2f0567e 100644 --- a/python/triton/kernel.py +++ b/python/triton/kernel.py @@ -12,7 +12,7 @@ import setuptools # triton import triton.frameworks as fw import triton.utils -import libtriton +import triton._C.libtriton as libtriton def _make_framework_src(src, out, grid): if fw.has_tensorflow(): diff --git a/python/triton/utils.py b/python/triton/utils.py index 6c5df7b09..127d67364 100644 --- a/python/triton/utils.py +++ b/python/triton/utils.py @@ -1,5 +1,5 @@ import triton.frameworks as fw -import libtriton +import triton._C.libtriton as libtriton def cdiv(a, b): return -(-a // b) From 945593e84731a6c0be625c6cfe0484a742ea6ce7 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 5 Sep 2019 15:42:43 -0400 Subject: [PATCH 358/494] [python] using generic path for triton include directories --- python/triton/kernel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/triton/kernel.py b/python/triton/kernel.py index cf2f0567e..d70e7c895 100644 --- a/python/triton/kernel.py +++ b/python/triton/kernel.py @@ -58,7 +58,7 @@ def _write_bindings(src, root): def _build(src, path): # include directories - triton_include_dirs = ['/home/philippe/development/triton/include'] + triton_include_dirs = [os.path.realpath(os.path.join(libtriton.__file__, 'include'))] include_dirs = triton_include_dirs # library directories triton_library_dirs = [os.path.realpath(os.path.join(libtriton.__file__, os.path.pardir))] From 0a6329ea7d9f368dd349309546b6d3bdb7439bcd Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 5 Sep 2019 16:01:56 -0400 Subject: [PATCH 359/494] [python] more robust way to add triton includes to python package --- python/setup.py | 8 +++----- python/triton/_C/include | 1 + python/triton/kernel.py | 6 ++++-- 3 files changed, 8 insertions(+), 7 deletions(-) create mode 120000 python/triton/_C/include diff --git a/python/setup.py b/python/setup.py index 50d2a775a..c90e18434 100644 --- a/python/setup.py +++ b/python/setup.py @@ -84,9 +84,7 @@ directories = [x[0] for x in os.walk(os.path.join(os.path.pardir, 'include'))] data = [] for d in directories: files = glob.glob(os.path.join(d, '*.h'), recursive=False) - dest = os.path.relpath(d, os.path.pardir) - dest = os.path.join('triton', '_C', dest) - data += [(dest, files)] + data += [os.path.relpath(f, os.path.pardir) for f in files] setup( name='triton', @@ -95,8 +93,8 @@ setup( author_email='ptillet@g.harvard.edu', description='A language and compiler for custom Deep Learning operations', long_description='', - packages=['triton', 'triton/ops'], - data_files=data, + packages=['triton', 'triton/_C', 'triton/ops'], + package_data={'': data}, ext_modules=[CMakeExtension('triton', 'triton/_C/')], cmdclass=dict(build_ext=CMakeBuild), zip_safe=False, diff --git a/python/triton/_C/include b/python/triton/_C/include new file mode 120000 index 000000000..b85a40983 --- /dev/null +++ b/python/triton/_C/include @@ -0,0 +1 @@ +../../../include/ \ No newline at end of file diff --git a/python/triton/kernel.py b/python/triton/kernel.py index d70e7c895..50ade154e 100644 --- a/python/triton/kernel.py +++ b/python/triton/kernel.py @@ -57,11 +57,13 @@ def _write_bindings(src, root): return (cpp, so) def 
_build(src, path): + ccdir = os.path.join(libtriton.__file__, os.path.pardir) + ccdir = os.path.realpath(ccdir) # include directories - triton_include_dirs = [os.path.realpath(os.path.join(libtriton.__file__, 'include'))] + triton_include_dirs = [os.path.join(ccdir, 'include')] include_dirs = triton_include_dirs # library directories - triton_library_dirs = [os.path.realpath(os.path.join(libtriton.__file__, os.path.pardir))] + triton_library_dirs = [ccdir] library_dirs = triton_library_dirs # libraries libraries = ['triton'] From 18848cbb71e620805fac0dfb741de0951c30f6c4 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 5 Sep 2019 17:24:57 -0400 Subject: [PATCH 360/494] [driver] now passing std::unique_ptr<> instead of cloning LLVM module when compiling it --- CMakeLists.txt | 16 ++++------------ include/triton/driver/backend.h | 1 - include/triton/driver/module.h | 12 ++++++------ lib/driver/backend.cc | 8 -------- lib/driver/module.cc | 31 ++++++++++++++++--------------- lib/runtime/function.cc | 10 ++-------- 6 files changed, 28 insertions(+), 50 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 78f2967ad..717bbe144 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,10 +8,9 @@ option(BUILD_TESTS "Build C++ Triton tests" ON) option(BUILD_PYTHON_MODULE "Build Python Triton bindings" OFF) # LLVM -find_package(LLVM REQUIRED CONFIG) +find_package(LLVM REQUIRED) include_directories(${LLVM_INCLUDE_DIRS}) add_definitions(${LLVM_DEFINITIONS}) -llvm_map_components_to_libnames(llvm_libs all) # Default build type if(NOT CMAKE_BUILD_TYPE) @@ -21,7 +20,7 @@ endif() # Compiler flags include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${LLVM_CXXFLAGS} -std=c++11") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") # Tests if(BUILD_TESTS) @@ -53,13 +52,6 @@ endif() # Triton file(GLOB_RECURSE LIBTRITON_SRC lib/*.cc) add_library(triton SHARED ${LIBTRITON_SRC} ${PYTHON_SRC}) -target_link_libraries(triton LLVM) - -# Warning level -#if(MSVC) -# target_compile_options(triton PRIVATE /W4) -#else() -# target_compile_options(triton PRIVATE -Wno-unused-parameter -Wall -Wextra -pedantic) -#endif() - +link_directories(${LLVM_LIBRARY_DIRS}) +target_link_libraries(triton ${LLVM_LIBRARIES}) diff --git a/include/triton/driver/backend.h b/include/triton/driver/backend.h index ac48a7461..c7a0f5aac 100755 --- a/include/triton/driver/backend.h +++ b/include/triton/driver/backend.h @@ -66,7 +66,6 @@ struct backend public: static void release(); - static driver::module* get(driver::stream* stream, std::string const & name, llvm::Module *src); private: static std::map, driver::module*> cache_; diff --git a/include/triton/driver/module.h b/include/triton/driver/module.h index 18dea2453..4f277d99b 100755 --- a/include/triton/driver/module.h +++ b/include/triton/driver/module.h @@ -38,9 +38,9 @@ public: module(driver::context* ctx, CUmodule mod, bool has_ownership); module(driver::context* ctx, cl_program mod, bool has_ownership); module(driver::context* ctx, host_module_t mod, bool has_ownership); - static module* create(driver::context* ctx, llvm::Module *src); + static module* create(driver::context* ctx, std::unique_ptr src); driver::context* context() const; - void compile_llvm_module(llvm::Module* module, const std::string& triple, + void compile_llvm_module(std::unique_ptr module, const std::string& triple, const std::string &proc, std::string layout, llvm::SmallVectorImpl &buffer, const std::string &features, @@ -53,22 +53,22 @@ protected: // CPU 
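The header changes above (and the backend.cc/module.cc changes below) switch every module-construction path from raw llvm::Module* to std::unique_ptr<llvm::Module>: the driver now takes ownership of the LLVM module and consumes it, instead of cloning it before handing it to the JIT. A minimal sketch of the sink-parameter idiom being adopted here, assuming only LLVM headers:

    #include <memory>
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Module.h"

    // A "sink" parameter: the callee takes ownership and may destroy or
    // consume the module (e.g. hand it to an ExecutionEngine).
    void compile(std::unique_ptr<llvm::Module> mod);

    void example(llvm::LLVMContext &ctx) {
        std::unique_ptr<llvm::Module> mod(new llvm::Module("kernel", ctx));
        compile(std::move(mod));  // ownership transferred; no CloneModule needed
        // `mod` is now null; any further use here would be a bug.
    }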
class host_module: public module{ public: - host_module(driver::context* context, llvm::Module *module); + host_module(driver::context* context, std::unique_ptr module); }; // OpenCL class ocl_module: public module{ public: - ocl_module(driver::context* context, llvm::Module *module); + ocl_module(driver::context* context, std::unique_ptr module); }; // CUDA class cu_module: public module { - std::string compile_llvm_module(llvm::Module* module, driver::device* device); + std::string compile_llvm_module(std::unique_ptr module, driver::device* device); public: - cu_module(driver::context* context, llvm::Module *module); + cu_module(driver::context* context, std::unique_ptr module); cu_module(driver::context* context, const std::string& source); cu_buffer* symbol(const char * name) const; diff --git a/lib/driver/backend.cc b/lib/driver/backend.cc index 3be4daa20..2c64936ef 100755 --- a/lib/driver/backend.cc +++ b/lib/driver/backend.cc @@ -103,14 +103,6 @@ void backend::modules::release(){ cache_.clear(); } -driver::module* backend::modules::get(driver::stream* stream, std::string const & name, llvm::Module* src){ - std::tuple key(stream, name); - if(cache_.find(key)==cache_.end()){ - return &*cache_.insert({key, driver::module::create(stream->context(), src)}).first->second; - } - return &*cache_.at(key); -} - std::map, driver::module*> backend::modules::cache_; /*-----------------------------------*/ diff --git a/lib/driver/module.cc b/lib/driver/module.cc index 34462e8ab..66c775ac6 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -76,16 +76,16 @@ driver::context* module::context() const { return ctx_; } -module* module::create(driver::context* ctx, llvm::Module *src) { +module* module::create(driver::context* ctx, std::unique_ptr src) { switch(ctx->backend()){ - case CUDA: return new cu_module(ctx, src); - case OpenCL: return new ocl_module(ctx, src); - case Host: return new host_module(ctx, src); + case CUDA: return new cu_module(ctx, std::move(src)); + case OpenCL: return new ocl_module(ctx, std::move(src)); + case Host: return new host_module(ctx, std::move(src)); default: throw std::runtime_error("unknown backend"); } } -void module::compile_llvm_module(llvm::Module* module, const std::string& triple, +void module::compile_llvm_module(std::unique_ptr module, const std::string& triple, const std::string &proc, std::string layout, llvm::SmallVectorImpl &buffer, const std::string& features, @@ -133,7 +133,7 @@ void module::compile_llvm_module(llvm::Module* module, const std::string& triple // Host // /* ------------------------ */ -host_module::host_module(driver::context * context, llvm::Module* src): module(context, host_module_t(), true) { +host_module::host_module(driver::context * context, std::unique_ptr src): module(context, host_module_t(), true) { init_llvm(); // host info // std::string triple = llvm::sys::getDefaultTargetTriple(); @@ -147,7 +147,7 @@ host_module::host_module(driver::context * context, llvm::Module* src): module(c llvm::Type *args_ty = llvm::Type::getInt8PtrTy(ctx)->getPointerTo(); llvm::Type *int32_ty = llvm::Type::getInt32Ty(ctx); llvm::FunctionType *main_ty = llvm::FunctionType::get(void_ty, {args_ty, int32_ty, int32_ty, int32_ty}, false); - llvm::Function* main = llvm::Function::Create(main_ty, llvm::Function::ExternalLinkage, "main", src); + llvm::Function* main = llvm::Function::Create(main_ty, llvm::Function::ExternalLinkage, "main", &*src); llvm::Function* fn = src->getFunction("matmul"); llvm::FunctionType *fn_ty = fn->getFunctionType(); 
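    // A rough C++ picture of the trampoline that the surrounding calls
    // construct in LLVM IR: the generated "main(char** args, int, int, int)"
    // entry point loads each kernel argument out of the opaque args array,
    // casts it to the matching parameter type, and invokes the real kernel.
    // With hypothetical parameter types, it behaves like:
    //
    //   void main_trampoline(char** args, int gx, int gy, int gz) {
    //       float* A = *reinterpret_cast<float**>(args[0]);
    //       int    N = *reinterpret_cast<int*>(args[1]);
    //       matmul(A, N);  // the function obtained via src->getFunction("matmul")
    //   }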
std::vector fn_args(fn_ty->getNumParams()); @@ -169,10 +169,9 @@ host_module::host_module(driver::context * context, llvm::Module* src): module(c // create execution engine - auto cloned = llvm::CloneModule(*src); - for(llvm::Function& fn: cloned->functions()) + for(llvm::Function& fn: src->functions()) hst_->functions[fn.getName()] = &fn; - llvm::EngineBuilder builder(std::move(cloned)); + llvm::EngineBuilder builder(std::move(src)); builder.setErrorStr(&hst_->error); builder.setMCJITMemoryManager(llvm::make_unique()); builder.setOptLevel(llvm::CodeGenOpt::Aggressive); @@ -185,7 +184,7 @@ host_module::host_module(driver::context * context, llvm::Module* src): module(c // OpenCL // /* ------------------------ */ -ocl_module::ocl_module(driver::context * context, llvm::Module* src): module(context, cl_program(), true) { +ocl_module::ocl_module(driver::context * context, std::unique_ptr src): module(context, cl_program(), true) { throw std::runtime_error("not supported"); // init_llvm(); // llvm::SmallVector buffer; @@ -217,18 +216,20 @@ ocl_module::ocl_module(driver::context * context, llvm::Module* src): module(con // CUDA // /* ------------------------ */ -std::string cu_module::compile_llvm_module(llvm::Module* module, driver::device* device) { +std::string cu_module::compile_llvm_module(std::unique_ptr module, driver::device* device) { // options auto options = llvm::cl::getRegisteredOptions(); // for(auto& opt: options) // std::cout << opt.getKey().str() << std::endl; - static_cast*>(options["nvptx-short-ptr"])->setValue(true); + auto* short_ptr = static_cast*>(options["nvptx-short-ptr"]); + assert(short_ptr); + short_ptr->setValue(true); // compute capability auto cc = ((driver::cu_device*)device)->compute_capability(); std::string sm = "sm_" + std::to_string(cc.first) + std::to_string(cc.second); // create llvm::SmallVector buffer; - module::compile_llvm_module(module, "nvptx64-nvidia-cuda", sm, "", buffer, "ptx63", Assembly); + module::compile_llvm_module(std::move(module), "nvptx64-nvidia-cuda", sm, "", buffer, "ptx63", Assembly); std::string result(buffer.begin(), buffer.end()); size_t start_replace = result.find(".version"); size_t end_replace = result.find('\n', start_replace); @@ -237,7 +238,7 @@ std::string cu_module::compile_llvm_module(llvm::Module* module, driver::device* return result; } -cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module, context->device())) { } +cu_module::cu_module(driver::context * context, std::unique_ptr ll_module): cu_module(context, compile_llvm_module(std::move(ll_module), context->device())) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ // std::cout << source << std::endl; diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 75108c268..5454b327f 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -188,6 +188,7 @@ function::caller function::autotune(driver::stream* stream, const grid_fn_ty& gr std::unique_ptr function::make_bin(ir::module &module, driver::context *context, const options_t& opt) { std::unique_ptr target = context->device()->make_target(); + // create passes codegen::analysis::grids grids(opt.num_warps); codegen::analysis::meminfo shmem_info; @@ -201,20 +202,13 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::transform::reassociate reassociate(&alignment_info, &grids); codegen::selection 
selection(&shmem_allocation, &grids, &shmem_info, &alignment_info, target.get()); - - // run passes peephole.run(module); dce.run(module); -// ir::print(module, std::cout); alignment_info.run(module); grids.run(module); -// ir::print(module, std::cout); - reassociate.run(module); dce.run(module); -// ir::print(module, std::cout); - peephole.run(module); if(target->is_gpu()){ @@ -233,7 +227,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c std::unique_ptr llvm(new llvm::Module(module.get_name(), ctx)); selection.run(module, *llvm); // return binary - std::unique_ptr res(driver::module::create(context, llvm.get())); + std::unique_ptr res(driver::module::create(context, std::move(llvm))); return res; } From 0405509190d4ee3ee7141b56cfec61749a6d6c5a Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 5 Sep 2019 17:47:53 -0400 Subject: [PATCH 361/494] [python] setup.py now finds LLVM version if available --- python/setup.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/python/setup.py b/python/setup.py index c90e18434..ea1568b2f 100644 --- a/python/setup.py +++ b/python/setup.py @@ -10,6 +10,23 @@ from distutils.version import LooseVersion from setuptools import setup, Extension, find_packages from setuptools.command.build_ext import build_ext from setuptools.command.test import test as TestCommand +import distutils.spawn + + +def find_llvm(): + versions = ['9.0', '9', '90', '8.0', '8', '80'] + supported = ['llvm-config-{v}'.format(v=v) for v in versions] + paths = [distutils.spawn.find_executable(cfg) for cfg in supported] + paths = [p for p in paths if p is not None] + if paths: + return paths[0] + config = distutils.spawn.find_executable('llvm-config') + instructions = 'Please install llvm-{8, 9, 10}-dev' + if config is None: + raise RuntimeError('Could not find llvm-config. ' + instructions) + version = os.popen('{config} --version'.format(config=config)).read() + raise RuntimeError('Version {v} not supported. '.format(v=version) + instructions) + class CMakeExtension(Extension): def __init__(self, name, path, sourcedir=''): @@ -44,7 +61,8 @@ class CMakeBuild(build_ext): cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir, '-DBUILD_TESTS=OFF', '-DBUILD_PYTHON_MODULE=ON', - '-DPYTHON_INCLUDE_DIRS=' + python_include_dirs] + '-DPYTHON_INCLUDE_DIRS=' + python_include_dirs, + '-DLLVM_CONFIG=' + find_llvm()] # tensorflow compatibility try: import tensorflow as tf @@ -80,6 +98,8 @@ class CMakeBuild(build_ext): subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp) +find_llvm() + directories = [x[0] for x in os.walk(os.path.join(os.path.pardir, 'include'))] data = [] for d in directories: From 3fd61c1a02542b94e66617a2421c0f0201fc2f7d Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 5 Sep 2019 17:48:29 -0400 Subject: [PATCH 362/494] [cmake] better FindLLVM --- cmake/FindLLVM.cmake | 166 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 cmake/FindLLVM.cmake diff --git a/cmake/FindLLVM.cmake b/cmake/FindLLVM.cmake new file mode 100644 index 000000000..30ebcbd89 --- /dev/null +++ b/cmake/FindLLVM.cmake @@ -0,0 +1,166 @@ +# - Find LLVM headers and libraries. +# This module locates LLVM and adapts the llvm-config output for use with +# CMake. +# +# A given list of COMPONENTS is passed to llvm-config. 
+# +# The following variables are defined: +# LLVM_FOUND - true if LLVM was found +# LLVM_CXXFLAGS - C++ compiler flags for files that include LLVM headers. +# LLVM_HOST_TARGET - Target triple used to configure LLVM. +# LLVM_INCLUDE_DIRS - Directory containing LLVM include files. +# LLVM_LDFLAGS - Linker flags to add when linking against LLVM +# (includes -LLLVM_LIBRARY_DIRS). +# LLVM_LIBRARIES - Full paths to the library files to link against. +# LLVM_LIBRARY_DIRS - Directory containing LLVM libraries. +# LLVM_ROOT_DIR - The root directory of the LLVM installation. +# llvm-config is searched for in ${LLVM_ROOT_DIR}/bin. +# LLVM_VERSION_MAJOR - Major version of LLVM. +# LLVM_VERSION_MINOR - Minor version of LLVM. +# LLVM_VERSION_STRING - Full LLVM version string (e.g. 6.0.0svn). +# LLVM_VERSION_BASE_STRING - Base LLVM version string without git/svn suffix (e.g. 6.0.0). +# +# Note: The variable names were chosen in conformance with the offical CMake +# guidelines, see ${CMAKE_ROOT}/Modules/readme.txt. + +# Try suffixed versions to pick up the newest LLVM install available on Debian +# derivatives. +# We also want an user-specified LLVM_ROOT_DIR to take precedence over the +# system default locations such as /usr/local/bin. Executing find_program() +# multiples times is the approach recommended in the docs. +set(llvm_config_names llvm-config-9 llvm-config-9.0 llvm-config90 + llvm-config-8 llvm-config-8.0 llvm-config80 + llvm-config) +find_program(LLVM_CONFIG + NAMES ${llvm_config_names} + PATHS ${LLVM_ROOT_DIR}/bin NO_DEFAULT_PATH + DOC "Path to llvm-config tool.") +find_program(LLVM_CONFIG NAMES ${llvm_config_names}) + +# Prints a warning/failure message depending on the required/quiet flags. Copied +# from FindPackageHandleStandardArgs.cmake because it doesn't seem to be exposed. +macro(_LLVM_FAIL _msg) + if(LLVM_FIND_REQUIRED) + message(FATAL_ERROR "${_msg}") + else() + if(NOT LLVM_FIND_QUIETLY) + message(STATUS "${_msg}") + endif() + endif() +endmacro() + + +if(NOT LLVM_CONFIG) + if(NOT LLVM_FIND_QUIETLY) + message(WARNING "Could not find llvm-config (LLVM >= ${LLVM_FIND_VERSION}). 
Try manually setting LLVM_CONFIG to the llvm-config executable of the installation to use.") + endif() +else() + macro(llvm_set var flag) + if(LLVM_FIND_QUIETLY) + set(_quiet_arg ERROR_QUIET) + endif() + set(result_code) + execute_process( + COMMAND ${LLVM_CONFIG} --${flag} + RESULT_VARIABLE result_code + OUTPUT_VARIABLE LLVM_${var} + OUTPUT_STRIP_TRAILING_WHITESPACE + ${_quiet_arg} + ) + if(result_code) + _LLVM_FAIL("Failed to execute llvm-config ('${LLVM_CONFIG}', result code: '${result_code})'") + else() + if(${ARGV2}) + file(TO_CMAKE_PATH "${LLVM_${var}}" LLVM_${var}) + endif() + endif() + endmacro() + macro(llvm_set_libs var flag) + if(LLVM_FIND_QUIETLY) + set(_quiet_arg ERROR_QUIET) + endif() + set(result_code) + execute_process( + COMMAND ${LLVM_CONFIG} --${flag} ${LLVM_FIND_COMPONENTS} + RESULT_VARIABLE result_code + OUTPUT_VARIABLE tmplibs + OUTPUT_STRIP_TRAILING_WHITESPACE + ${_quiet_arg} + ) + if(result_code) + _LLVM_FAIL("Failed to execute llvm-config ('${LLVM_CONFIG}', result code: '${result_code})'") + else() + file(TO_CMAKE_PATH "${tmplibs}" tmplibs) + string(REGEX MATCHALL "${pattern}[^ ]+" LLVM_${var} ${tmplibs}) + endif() + endmacro() + + llvm_set(VERSION_STRING version) + llvm_set(CXXFLAGS cxxflags) + llvm_set(HOST_TARGET host-target) + llvm_set(INCLUDE_DIRS includedir true) + llvm_set(ROOT_DIR prefix true) + llvm_set(ENABLE_ASSERTIONS assertion-mode) + + # The LLVM version string _may_ contain a git/svn suffix, so cut that off + string(SUBSTRING "${LLVM_VERSION_STRING}" 0 5 LLVM_VERSION_BASE_STRING) + + # Versions below 4.0 do not support components debuginfomsf and demangle + if(${LLVM_VERSION_STRING} MATCHES "^3\\..*") + list(REMOVE_ITEM LLVM_FIND_COMPONENTS "debuginfomsf" index) + list(REMOVE_ITEM LLVM_FIND_COMPONENTS "demangle" index) + endif() + # Versions below 8.0 not supported + if(${LLVM_VERSION_STRING} MATCHES "^[3-7]\\..*") + message(FATAL_ERROR "LLVM version below 8.0 not supported") + endif() + + llvm_set(LDFLAGS ldflags) + # In LLVM 3.5+, the system library dependencies (e.g. "-lz") are accessed + # using the separate "--system-libs" flag. + llvm_set(SYSTEM_LIBS system-libs) + string(REPLACE "\n" " " LLVM_LDFLAGS "${LLVM_LDFLAGS} ${LLVM_SYSTEM_LIBS}") + llvm_set(LIBRARY_DIRS libdir true) + llvm_set_libs(LIBRARIES libs) + # LLVM bug: llvm-config --libs tablegen returns -lLLVM-3.8.0 + # but code for it is not in shared library + if("${LLVM_FIND_COMPONENTS}" MATCHES "tablegen") + if (NOT "${LLVM_LIBRARIES}" MATCHES "LLVMTableGen") + set(LLVM_LIBRARIES "${LLVM_LIBRARIES};-lLLVMTableGen") + endif() + endif() + + # Versions below 4.0 do not support llvm-config --cmakedir + if(${LLVM_VERSION_STRING} MATCHES "^3\\..*") + set(LLVM_CMAKEDIR ${LLVM_LIBRARY_DIRS}/cmake/llvm) + else() + llvm_set(CMAKEDIR cmakedir) + endif() + + llvm_set(TARGETS_TO_BUILD targets-built) + string(REGEX MATCHALL "${pattern}[^ ]+" LLVM_TARGETS_TO_BUILD ${LLVM_TARGETS_TO_BUILD}) +endif() + +# Remove some clang-specific flags for gcc. +if(CMAKE_COMPILER_IS_GNUCXX) + string(REPLACE "-Wcovered-switch-default " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS}) + string(REPLACE "-Wstring-conversion " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS}) + string(REPLACE "-fcolor-diagnostics " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS}) + string(REPLACE "-Werror=unguarded-availability-new " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS}) +endif() + +# Remove gcc-specific flags for clang. 
+if(${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") + string(REPLACE "-Wno-maybe-uninitialized " "" LLVM_CXXFLAGS ${LLVM_CXXFLAGS}) +endif() + +string(REGEX REPLACE "([0-9]+).*" "\\1" LLVM_VERSION_MAJOR "${LLVM_VERSION_STRING}" ) +string(REGEX REPLACE "[0-9]+\\.([0-9]+).*[A-Za-z]*" "\\1" LLVM_VERSION_MINOR "${LLVM_VERSION_STRING}" ) + + +# Use the default CMake facilities for handling QUIET/REQUIRED. +include(FindPackageHandleStandardArgs) + +find_package_handle_standard_args(LLVM + REQUIRED_VARS LLVM_ROOT_DIR LLVM_HOST_TARGET + VERSION_VAR LLVM_VERSION_STRING) From 1f8fd525b5faf240a69162422b5697962c1eabce Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 5 Sep 2019 20:28:00 -0400 Subject: [PATCH 363/494] [python] fixed warnings for pybind11 and pytorch --- lib/runtime/function.cc | 2 -- python/src/bindings.cc | 2 +- python/src/pybind11/functional.h | 24 +++++++++++++++++++----- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 5454b327f..7c2f42c3c 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -220,8 +220,6 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c dce.run(module); vectorize.run(module); dce.run(module); - - // generate llvm code llvm::LLVMContext ctx; std::unique_ptr llvm(new llvm::Module(module.get_name(), ctx)); diff --git a/python/src/bindings.cc b/python/src/bindings.cc index 4ef860347..a09a0a7cb 100644 --- a/python/src/bindings.cc +++ b/python/src/bindings.cc @@ -493,7 +493,7 @@ extern std::map> id_fn_map; oss << std::endl; oss << std::endl; - oss << "static auto registry = torch::jit::RegisterOperators(\"triton::" << name << "\", &" << name << ");" << std::endl; + oss << "static auto registry = torch::RegisterOperators(\"triton::" << name << "\", &" << name << ");" << std::endl; return {oss.str(), name}; } diff --git a/python/src/pybind11/functional.h b/python/src/pybind11/functional.h index 7a0988ab0..00457e965 100644 --- a/python/src/pybind11/functional.h +++ b/python/src/pybind11/functional.h @@ -65,12 +65,26 @@ public: } }; - value = [hfunc = func_handle(std::move(func))](Args... args) -> Return { - gil_scoped_acquire acq; - object retval(hfunc.f(std::forward(args)...)); - /* Visual studio 2015 parser issue: need parentheses around this expression */ - return (retval.template cast()); + // value = [hfunc = func_handle(std::move(func))](Args... args) -> Return { + // gil_scoped_acquire acq; + // object retval(hfunc.f(std::forward(args)...)); + // /* Visual studio 2015 parser issue: need parentheses around this expression */ + // return (retval.template cast()); + // }; + + struct func_wrapper { + func_handle hfunc; + func_wrapper(func_handle&& hf): hfunc(std::move(hf)) {} + Return operator()(Args... 
args) const { + gil_scoped_acquire acq; + object retval(hfunc.f(std::forward(args)...)); + /* Visual studio 2015 parser issue: need parentheses around this expression */ + return (retval.template cast()); + } }; + + value = func_wrapper(func_handle(std::move(func))); + return true; } From b79bcbaee827f8edbe31203fbb29ff932da1cf39 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 5 Sep 2019 21:03:09 -0400 Subject: [PATCH 364/494] [auto-tuning] now not compiling kernels that use too much shared memory --- lib/runtime/function.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 7c2f42c3c..114626dce 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -169,6 +169,9 @@ function::caller function::autotune(driver::stream* stream, const grid_fn_ty& gr }catch(const std::runtime_error& e) { return; } + // kernel uses too much resources + if(!bin) + return; // benchmark ir::function *tmp = ir->get_function_list()[0]; caller call(tmp, std::move(bin), opt); @@ -201,7 +204,6 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::transform::peephole peephole; codegen::transform::reassociate reassociate(&alignment_info, &grids); codegen::selection selection(&shmem_allocation, &grids, &shmem_info, &alignment_info, target.get()); - // run passes peephole.run(module); dce.run(module); @@ -210,11 +212,12 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c reassociate.run(module); dce.run(module); peephole.run(module); - if(target->is_gpu()){ shmem_info.run(module); shmem_liveness.run(module); shmem_allocation.run(); + if(shmem_allocation.get_allocated_size() > context->device()->max_shared_memory()) + return std::unique_ptr(); shmem_barriers.run(module); } dce.run(module); @@ -226,6 +229,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c selection.run(module, *llvm); // return binary std::unique_ptr res(driver::module::create(context, std::move(llvm))); + // done return res; } From 96bdae25d56a167f28895e32546666526a6d7187 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 5 Sep 2019 21:35:23 -0400 Subject: [PATCH 365/494] [python][example] now executing tensorflow and/or pytorch example automatically --- python/examples/dot.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/python/examples/dot.py b/python/examples/dot.py index 49ed5b298..eaa9c2d68 100644 --- a/python/examples/dot.py +++ b/python/examples/dot.py @@ -2,7 +2,6 @@ import numpy as np import triton def run_tf(): - import tensorflow as tf M, N, K = 128, 128, 128 a = tf.placeholder(tf.float32, shape=[M, K]) b = tf.placeholder(tf.float32, shape=[N, K]) @@ -28,7 +27,6 @@ def run_tf(): print("dif: %f" % np.max(dif)) def run_torch(): - import torch torch.manual_seed(0) M, N, K = 128, 128, 128 a = torch.randn(M, K).cuda() @@ -56,5 +54,14 @@ def run_torch(): print('Diff DA:', (torch_da - triton_da).max()) print('Diff DB:', (torch_db - triton_db).max()) +try: + import tensorflow as tf + run_tf() +except ModuleNotFoundError: + pass -run_torch() \ No newline at end of file +try: + import torch + run_torch() +except ModuleNotFoundError: + pass From 0ff81badac20c0692c1ec4affe1a157fe0828f6e Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 6 Sep 2019 13:26:51 -0400 Subject: [PATCH 366/494] [driver] added TRITON_LIBCUDA environment variable to specify libcuda path if not in LD_LIBRARY_PATH --- include/triton/tools/sys/getenv.hpp | 6 +++--- 
lib/driver/dispatch.cc | 10 ++++++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/include/triton/tools/sys/getenv.hpp b/include/triton/tools/sys/getenv.hpp index 6e45ad5f2..0319d8868 100755 --- a/include/triton/tools/sys/getenv.hpp +++ b/include/triton/tools/sys/getenv.hpp @@ -38,11 +38,11 @@ namespace tools std::size_t sz = 0; _dupenv_s(&cache_path, &sz, name); #else - const char * cache_path = std::getenv(name); + const char * cstr = std::getenv(name); #endif - if(!cache_path) + if(!cstr) return ""; - std::string result(cache_path); + std::string result(cstr); #ifdef _MSC_VER free(cache_path); #endif diff --git a/lib/driver/dispatch.cc b/lib/driver/dispatch.cc index 9b5fc5242..fd6ca7bcb 100755 --- a/lib/driver/dispatch.cc +++ b/lib/driver/dispatch.cc @@ -22,6 +22,7 @@ #include "triton/driver/dispatch.h" #include "triton/driver/context.h" +#include "triton/tools/sys/getenv.hpp" namespace triton { @@ -108,8 +109,13 @@ bool dispatch::clinit() } bool dispatch::cuinit(){ - if(cuda_==nullptr) - cuda_ = dlopen("libcuda.so", RTLD_LAZY); + if(cuda_==nullptr){ + std::string libcuda = tools::getenv("TRITON_LIBCUDA"); + if(libcuda.empty()) + cuda_ = dlopen("libcuda.so", RTLD_LAZY); + else + cuda_ = dlopen(libcuda.c_str(), RTLD_LAZY); + } if(cuda_ == nullptr) return false; CUresult (*fptr)(unsigned int); From 32234c2612e27b96580e82aa9426913973280fae Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 8 Sep 2019 17:35:24 -0400 Subject: [PATCH 367/494] ugh --- include/triton/codegen/analysis/align.h | 46 +- include/triton/ir/instructions.h | 107 ++-- lib/codegen/analysis/align.cc | 636 +++++++++++++++--------- lib/codegen/analysis/grid.cc | 19 +- lib/codegen/selection.cc | 9 +- lib/driver/module.cc | 2 +- lib/ir/instructions.cc | 5 - lib/ir/print.cc | 10 +- lib/runtime/function.cc | 6 + tests/common/src/copy.h | 2 +- 10 files changed, 541 insertions(+), 301 deletions(-) diff --git a/include/triton/codegen/analysis/align.h b/include/triton/codegen/analysis/align.h index 6812314b7..9b1adb40f 100644 --- a/include/triton/codegen/analysis/align.h +++ b/include/triton/codegen/analysis/align.h @@ -2,12 +2,19 @@ #define TDL_INCLUDE_CODEGEN_ALIGNMENT_INFO_PASS_H #include +#include namespace triton { namespace ir { class value; class module; + class phi_node; + class splat_inst; + class reshape_inst; + class broadcast_inst; + class binary_operator; + class getelementptr_inst; } namespace codegen{ @@ -22,22 +29,47 @@ class align { private: // helpers bool is_first_axis_unit(ir::value *v); + std::vector get_shapes(ir::value *v); - // populate maps - cst_info populate_is_constant(ir::value *v); - unsigned populate_max_contiguous(ir::value *v); - unsigned populate_starting_multiple(ir::value *v); + // populate is_constant + std::vector populate_is_constant_phi(ir::phi_node* x); + std::vector populate_is_constant_splat(ir::splat_inst* x); + std::vector populate_is_constant_reshape(ir::reshape_inst* x); + std::vector populate_is_constant_broadcast(ir::broadcast_inst* x); + std::vector populate_is_constant_binop(ir::binary_operator* x); + std::vector populate_is_constant_gep(ir::getelementptr_inst* x); + std::vector populate_is_constant_default(ir::value* v); + std::vector populate_is_constant(ir::value *v); + // populate max_contiguous + std::vector populate_max_contiguous_phi(ir::phi_node* x); + std::vector populate_max_contiguous_splat(ir::splat_inst* x); + std::vector populate_max_contiguous_reshape(ir::reshape_inst* x); + std::vector populate_max_contiguous_broadcast(ir::broadcast_inst* 
x); + std::vector populate_max_contiguous_binop(ir::binary_operator* x); + std::vector populate_max_contiguous_gep(ir::getelementptr_inst* x); + std::vector populate_max_contiguous_default(ir::value* v); + std::vector populate_max_contiguous(ir::value *v); + // populate starting_multiple + std::vector populate_starting_multiple_phi(ir::phi_node* x); + std::vector populate_starting_multiple_splat(ir::splat_inst* x); + std::vector populate_starting_multiple_reshape(ir::reshape_inst* x); + std::vector populate_starting_multiple_broadcast(ir::broadcast_inst* x); + std::vector populate_starting_multiple_binop(ir::binary_operator* x); + std::vector populate_starting_multiple_gep(ir::getelementptr_inst* x); + std::vector populate_starting_multiple_default(ir::value* v); + std::vector populate_starting_multiple(ir::value *v); public: void run(ir::module &mod); unsigned get_starting_multiple(ir::value* v) const; unsigned get_max_contiguous(ir::value* v) const; + std::vector get_max_contiguous_vec(ir::value* v) const; void copy(ir::value *dst, ir::value *src); private: - std::map is_constant_; - std::map max_contiguous_; - std::map starting_multiple_; + std::map> is_constant_; + std::map> max_contiguous_; + std::map> starting_multiple_; }; diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index a4fbc3710..86df129d2 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -11,6 +11,10 @@ #include "triton/ir/type.h" #include "triton/ir/metadata.h" +#define _TRITON_DEFINE_CLONE(name) \ + ir::instruction* clone_impl() const { return new name(*this); } + + namespace triton{ namespace ir{ @@ -25,10 +29,15 @@ class context; //===----------------------------------------------------------------------===// class result_reference; + + class instruction: public user{ public: virtual std::string repr_impl() const = 0; +private: + virtual ir::instruction* clone_impl() const = 0; + protected: // constructors instruction(type *ty, unsigned num_ops, unsigned num_results = 1, const std::string &name = "", instruction *next = nullptr); @@ -43,19 +52,27 @@ public: bool has_tile_result_or_op(); // repr std::string repr() const { return repr_impl(); } - // results - unsigned get_num_results() const { return results_.size(); } - value* get_result(unsigned i) { return results_.at(i); } // metadata void set_metadata(ir::metadata::kind_t kind, unsigned value) { metadatas_[kind] = value;} unsigned get_metadata(ir::metadata::kind_t kind) { return metadatas_[kind];} + // cloning + ir::instruction* clone() { + ir::instruction* res = clone_impl(); +// for(auto it = op_begin(); it != op_end(); it++){ +// (*it)->add_use(res); +// } + res->set_name("testcloned"); + res->parent_ = nullptr; + return res; + } + private: basic_block *parent_; - std::vector results_; std::map metadatas_; }; + // result reference class result_reference: public value { public: @@ -72,7 +89,7 @@ private: // phi_node classes //===----------------------------------------------------------------------===// -class phi_node: public instruction{ +class phi_node: public instruction { private: phi_node(type *ty, unsigned num_reserved, const std::string &name, instruction *next); std::string repr_impl() const { return "phi"; } @@ -91,6 +108,8 @@ public: // Factory methods static phi_node* create(type *ty, unsigned num_reserved, const std::string &name = "", instruction *next = nullptr); + _TRITON_DEFINE_CLONE(phi_node) + private: unsigned num_reserved_; std::vector blocks_; @@ -99,7 +118,7 @@ private: 
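The cloning support added above is the classic virtual-clone idiom: a public non-virtual clone() detaches the copy from its parent block and delegates the actual copying to a private virtual clone_impl() that each subclass supplies through _TRITON_DEFINE_CLONE. A standalone sketch of the pattern with simplified stand-in types:

    #define DEFINE_CLONE(name) \
        instruction* clone_impl() const override { return new name(*this); }

    class instruction {
    public:
        virtual ~instruction() = default;
        instruction* clone() {
            instruction* res = clone_impl();  // subclass copy-constructs itself
            res->parent_ = nullptr;           // the copy belongs to no block yet
            return res;
        }
    private:
        virtual instruction* clone_impl() const = 0;
        void* parent_ = nullptr;              // stand-in for basic_block*
    };

    class phi_node : public instruction {
        DEFINE_CLONE(phi_node)                // what the macro expands to
    };

The copy is deliberately left parentless (and, at this commit, still carries the debug name "testcloned"): it only becomes live once re-inserted into a basic block.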
//===----------------------------------------------------------------------===// // binary_operator classes //===----------------------------------------------------------------------===// -class binary_operator: public instruction{ +class binary_operator: public instruction { public: typedef binary_op_t op_t; @@ -138,6 +157,8 @@ public: static binary_operator *create_neg(value *arg, const std::string &name = "", instruction *next = nullptr); static binary_operator *create_not(value *arg, const std::string &name = "", instruction *next = nullptr); + _TRITON_DEFINE_CLONE(binary_operator) + public: binary_op_t op_; bool has_no_unsigned_wrap_; @@ -168,20 +189,22 @@ private: cmp_pred_t pred_; }; -class icmp_inst: public cmp_inst{ +class icmp_inst: public cmp_inst { using cmp_inst::cmp_inst; public: static icmp_inst* create(cmp_pred_t pred, value *lhs, value *rhs, const std::string &name = "", instruction *next = nullptr); + _TRITON_DEFINE_CLONE(icmp_inst) }; -class fcmp_inst: public cmp_inst{ +class fcmp_inst: public cmp_inst { using cmp_inst::cmp_inst; public: static fcmp_inst* create(cmp_pred_t pred, value *lhs, value *rhs, const std::string &name = "", instruction *next = nullptr); + _TRITON_DEFINE_CLONE(fcmp_inst) }; //===----------------------------------------------------------------------===// @@ -224,7 +247,8 @@ private: }; #define TRITON_IR_DECLARE_CAST_INST_SIMPL(name, op) \ -class name : public cast_inst{ \ +class name : public cast_inst { \ + _TRITON_DEFINE_CLONE(name); \ friend class cast_inst; \ name(type *ty, value *v, const std::string &name, instruction *next) \ : cast_inst(ty, v, name, next, op){ } \ @@ -253,7 +277,7 @@ class terminator_inst: public instruction{ }; // return instruction -class return_inst: public terminator_inst{ +class return_inst: public terminator_inst { private: std::string repr_impl() const { return "ret"; } return_inst(context &ctx, value *ret_val, instruction *next); @@ -267,6 +291,8 @@ public: // factory methods static return_inst* create(context &ctx, value *ret_val = nullptr, instruction *next = nullptr); + + _TRITON_DEFINE_CLONE(return_inst) }; // base branch instruction @@ -294,6 +320,7 @@ public: basic_block *get_true_dest() { return (basic_block*)get_operand(0); } basic_block *get_false_dest() { return (basic_block*)get_operand(1); } value *get_cond() { return get_operand(2); } + _TRITON_DEFINE_CLONE(cond_branch_inst) }; // unconditional branch @@ -304,28 +331,15 @@ private: public: basic_block *get_dest() { return (basic_block*)get_operand(0); } + _TRITON_DEFINE_CLONE(uncond_branch_inst) }; -// ternary -class ternary_inst: public instruction { -private: - std::string repr_impl() const { return "cond"; } - ternary_inst(value *cond, value *true_value, value *false_value, - const std::string &name, instruction *next); - -public: - value *get_cond() { return get_operand(0); } - value *get_true_value() { return get_operand(1); } - value *get_false_value() { return get_operand(2); } - static ternary_inst* create(value *cond, value *true_value, value *false_value, - const std::string &name = "", instruction *next = nullptr); -}; //===----------------------------------------------------------------------===// // getelementptr_inst classes //===----------------------------------------------------------------------===// -class getelementptr_inst: public instruction{ +class getelementptr_inst: public instruction { private: std::string repr_impl() const { return "getelementptr"; } getelementptr_inst(type *pointee_ty, value *ptr, const std::vector &idx, 
const std::string &name, instruction *next); @@ -345,6 +359,7 @@ public: // factory methods static getelementptr_inst* create(value *ptr, const std::vector &idx, const std::string &name = "", instruction *next = nullptr); + _TRITON_DEFINE_CLONE(getelementptr_inst) private: type *source_elt_ty; @@ -358,12 +373,16 @@ private: class io_inst: public instruction { protected: io_inst(type *ty, unsigned num_ops, unsigned num_results = 1, const std::string &name = "", instruction *next = nullptr); + public: + // accessors + value *get_pointer_operand() { return get_operand(0); } + // value *get_mask() const; // value *get_false_value() const; }; -class load_inst: public io_inst{ +class load_inst: public io_inst { protected: load_inst(value *ptr, unsigned num_extra_ops, const std::string &name, instruction *next); @@ -372,15 +391,15 @@ private: static type *get_pointee_type(type *ty); public: - // accessors - value *get_pointer_operand() { return get_operand(0); } + // factory method static load_inst* create(value *ptr, const std::string &name = "", instruction *next = nullptr); + _TRITON_DEFINE_CLONE(load_inst) }; -class masked_load_inst: public load_inst{ +class masked_load_inst: public load_inst { private: std::string repr_impl() const { return "masked_load"; } masked_load_inst(value *ptr, value *mask, value *false_value, @@ -394,6 +413,7 @@ public: static masked_load_inst* create(value *ptr, value *mask, value *false_value, const std::string &name = "", instruction *next = nullptr); + _TRITON_DEFINE_CLONE(masked_load_inst) }; class store_inst: public io_inst{ @@ -406,12 +426,12 @@ private: public: // accessors - value *get_pointer_operand() { return get_operand(0); } value *get_value_operand() { return get_operand(1); } // factory method static store_inst* create(value* ptr, value *v, const std::string &name = "", instruction *next = nullptr); + _TRITON_DEFINE_CLONE(store_inst) }; class masked_store_inst: public store_inst{ @@ -427,6 +447,7 @@ public: static masked_store_inst* create(value *ptr, value *v, value *mask, const std::string &name = "", instruction *next = nullptr); + _TRITON_DEFINE_CLONE(masked_store_inst) }; //===----------------------------------------------------------------------===// @@ -450,6 +471,7 @@ private: public: static instruction* create(value *arg, const type::tile_shapes_t &shape_suffix, const std::string &name = "", instruction *next = nullptr); + _TRITON_DEFINE_CLONE(reshape_inst) }; // splat @@ -462,6 +484,7 @@ private: public: static instruction* create(value *arg, const type::tile_shapes_t &shape_suffix, const std::string &name = "", instruction *next = nullptr); + _TRITON_DEFINE_CLONE(splat_inst) }; // broadcast @@ -474,6 +497,7 @@ private: public: static instruction* create(value *arg, const type::tile_shapes_t &shape_suffix, const std::string &name = "", instruction *next = nullptr); + _TRITON_DEFINE_CLONE(broadcast_inst) }; @@ -486,6 +510,7 @@ private: public: static instruction* create(value *arg, const std::string &name = "", instruction *next = nullptr); + _TRITON_DEFINE_CLONE(downcast_inst) }; //===----------------------------------------------------------------------===// @@ -505,6 +530,7 @@ private: public: static instruction* create(context &ctx, unsigned axis, const std::string &name = "", instruction *next = nullptr); unsigned get_axis() const { return axis_; } + _TRITON_DEFINE_CLONE(get_program_id_inst) private: unsigned axis_; @@ -518,6 +544,7 @@ private: public: static instruction* create(context &ctx, unsigned axis, const std::string &name = "", 
instruction *next = nullptr); unsigned get_axis() const { return axis_; } + _TRITON_DEFINE_CLONE(get_num_program_inst) private: unsigned axis_; @@ -527,6 +554,7 @@ class atomic_cas_inst: public builtin_inst { private: atomic_cas_inst(value *ptr, value *cmp, value *val, const std::string &name, instruction *next); std::string repr_impl() const { return "atomic_cas"; } + _TRITON_DEFINE_CLONE(atomic_cas_inst) public: static instruction* create(value *ptr, value *cmp, value *val, const std::string &name = "", instruction *next = nullptr); @@ -536,6 +564,7 @@ class atomic_exch_inst: public builtin_inst { private: atomic_exch_inst(value *ptr, value *val, const std::string &name = "", instruction *next = nullptr); std::string repr_impl() const { return "atomic_exch"; } + _TRITON_DEFINE_CLONE(atomic_exch_inst) public: static instruction* create(value *ptr, value *val, const std::string &name = "", instruction *next = nullptr); @@ -545,6 +574,7 @@ class atomic_add_inst: public builtin_inst { private: atomic_add_inst(value *ptr, value *val, const std::string &name = "", instruction *next = nullptr); std::string repr_impl() const { return "atomic_add"; } + _TRITON_DEFINE_CLONE(atomic_add_inst) public: static instruction* create(value *ptr, value *val, const std::string &name = "", instruction *next = nullptr); @@ -566,6 +596,7 @@ public: static instruction* create_tt(value *A, value *B, value *C, const std::string &name = "", instruction *next = nullptr); bool is_a_trans() { return AT_ == Trans; } bool is_b_trans() { return BT_ == Trans; } + _TRITON_DEFINE_CLONE(dot_inst) private: TransT AT_; @@ -586,17 +617,12 @@ public: private: trans_inst(value *arg, const std::vector& perm, const std::string& name, instruction* next); - std::string repr_impl() const { - std::string res = "trans<"; - //for(ir::constant_int *x: perm_) - // res += x->repr() + ","; - res[res.size()-1] = '>'; - return res; - } + std::string repr_impl() const { return "trans"; } public: static instruction* create(value *arg, const std::vector& perm = {}, const std::string &name = "", instruction *next = nullptr); const std::vector get_perm() const; + _TRITON_DEFINE_CLONE(trans_inst) private: std::vector perm_; @@ -608,6 +634,7 @@ private: std::string repr_impl() const { return "sqrt"; } public: static instruction* create(value *arg, const std::string &name = "", instruction *next = nullptr); + _TRITON_DEFINE_CLONE(sqrt_inst) }; class reduce_inst: public builtin_inst { @@ -617,6 +644,7 @@ private: private: reduce_inst(value* arg, unsigned axis, const std::string& name, instruction* next); std::string repr_impl() const { return "reduce"; } + _TRITON_DEFINE_CLONE(reduce_inst) public: static instruction* create(value *arg, unsigned axis, const std::string &name = "", instruction *next = nullptr); @@ -630,6 +658,7 @@ class select_inst: public builtin_inst { private: select_inst(value *pred, value *if_value, value *else_value, const std::string& name, instruction* next); std::string repr_impl() const { return "select"; } + _TRITON_DEFINE_CLONE(select_inst) public: static instruction* create(value *pred, value *if_value, value *else_value, const std::string &name = "", instruction *next = nullptr); @@ -647,12 +676,14 @@ private: public: static copy_to_shared_inst* create(value *arg, const std::string &name = "", instruction *next = nullptr); + _TRITON_DEFINE_CLONE(copy_to_shared_inst) }; class barrier_inst: public instruction{ private: barrier_inst(context &ctx, const std::string &name, instruction *next); std::string repr_impl() const { 
return "barrier"; } + _TRITON_DEFINE_CLONE(barrier_inst) public: static barrier_inst* create(context &ctx, const std::string &name = "", @@ -663,6 +694,7 @@ class vectorize_inst: public unary_inst{ private: using unary_inst::unary_inst; std::string repr_impl() const { return "vectorize"; } + _TRITON_DEFINE_CLONE(vectorize_inst) public: static vectorize_inst* create(value *arg, const std::string &name = "", instruction *next = nullptr); @@ -675,6 +707,7 @@ class nv_dynamic_program_idx_inst: public instruction { private: nv_dynamic_program_idx_inst(type *ty, const std::string &name, instruction *next); std::string repr_impl() const { return "nv_dynamic_program_idx"; } + _TRITON_DEFINE_CLONE(nv_dynamic_program_idx_inst) public: static nv_dynamic_program_idx_inst* create(type *ty, const std::string &name = "", instruction *next = nullptr); diff --git a/lib/codegen/analysis/align.cc b/lib/codegen/analysis/align.cc index 85500aefb..119ece8fa 100644 --- a/lib/codegen/analysis/align.cc +++ b/lib/codegen/analysis/align.cc @@ -5,6 +5,8 @@ #include "triton/ir/instructions.h" #include "triton/ir/type.h" #include +#include +#include namespace triton { namespace codegen{ @@ -36,258 +38,448 @@ bool align::is_first_axis_unit(ir::value *x){ return true; } -align::cst_info align::populate_is_constant(ir::value *v) { +/* + * is constant + */ + +std::vector align::get_shapes(ir::value *v) { + ir::type *ty = v->get_type(); + if(ty->is_tile_ty()) + return ty->get_tile_shapes(); + else + return {1}; +} + +std::vector align::populate_is_constant_phi(ir::phi_node* x) { + auto shapes = get_shapes(x); + std::vector result(shapes.size(), cst_info{1, 0}); + for(unsigned n = 0; n < x->get_num_incoming(); n++){ + ir::value* inc = x->get_incoming_value(n); + auto it = is_constant_.find(inc); + if(it != is_constant_.end()) + result = it->second; + } + return add_to_cache(x, result, is_constant_); + // recurse + for(unsigned n = 0; n < x->get_num_incoming(); n++){ + ir::value* inc = x->get_incoming_value(n); + auto cst = populate_is_constant(inc); + for(size_t d = 0; d < cst.size(); d++) + result[d].num_cst = std::min(result[d].num_cst, cst[d].num_cst); + } + return add_to_cache(x, result, is_constant_); +} + +std::vector align::populate_is_constant_splat(ir::splat_inst* x) { + auto shapes = get_shapes(x); + std::vector result; + ir::value* op = x->get_operand(0); + auto op_cst = populate_is_constant(op); + for(auto d: shapes) + result.push_back(cst_info{d, op_cst[0].value}); + return add_to_cache(x, result, is_constant_); +} + +std::vector align::populate_is_constant_reshape(ir::reshape_inst* x) { + auto x_shapes = get_shapes(x); + std::vector result; + ir::value *op = x->get_operand(0); + auto op_shapes = op->get_type()->get_tile_shapes(); + auto op_cst = populate_is_constant(op); + unsigned current = 0; + bool is_skewed = false; + for(size_t d = 0; d < x_shapes.size(); d ++){ + cst_info ax ; + if(x_shapes[d] == 1) + ax = {1, op_cst[current].value}; + else if(!is_skewed + && x_shapes[d] == op_shapes[current]) + ax = {x_shapes[d], op_cst[current++].value}; + else { + is_skewed = true; + ax = {x_shapes[d], 0}; + } + result.push_back(ax); + } + return add_to_cache(x, result, is_constant_); +} + +std::vector align::populate_is_constant_broadcast(ir::broadcast_inst* x) { + auto x_shapes = get_shapes(x); + std::vector result; + ir::value *op = x->get_operand(0); + auto op_shapes = op->get_type()->get_tile_shapes(); + auto op_cst = populate_is_constant(op); + for(size_t d = 0; d < x_shapes.size(); d++) + if(op_shapes[d] == 1) + 
result.push_back(cst_info{x_shapes[d], op_cst[d].value}); + else + result.push_back(op_cst[d]); + return add_to_cache(x, result, is_constant_); +} + +std::vector align::populate_is_constant_binop(ir::binary_operator* x) { + auto x_shapes = get_shapes(x); + std::vector result; + ir::value* lhs_op = x->get_operand(0); + ir::value* rhs_op = x->get_operand(1); + auto lhs = populate_is_constant(lhs_op); + auto rhs = populate_is_constant(rhs_op); + auto max_contiguous = populate_max_contiguous(lhs_op); + for(size_t d = 0; d < x_shapes.size(); d++) { + cst_info ax; + if(lhs[d].num_cst==0 && rhs[d].value && x->is_int_div()){ + // todo might not be entirely true + unsigned num_constants = gcd(max_contiguous[d], rhs[d].value); + ax = {num_constants, 0}; + } + else + ax = {std::min(lhs[d].num_cst, rhs[d].num_cst), 0}; + result.push_back(ax); + } + return add_to_cache(x, result, is_constant_); +} + +std::vector align::populate_is_constant_gep(ir::getelementptr_inst* x) { + auto x_shapes = get_shapes(x); + ir::value* lhs_op = x->get_operand(0); + ir::value* rhs_op = x->get_operand(1); + auto lhs = populate_is_constant(lhs_op); + auto rhs = populate_is_constant(rhs_op); + std::vector result; + for(size_t d = 0; d < x_shapes.size(); d++) + result.push_back({std::min(lhs[d].num_cst, rhs[d].num_cst), 0}); + return add_to_cache(x, result, is_constant_); +} + +std::vector align::populate_is_constant_default(ir::value *v) { + auto shapes = get_shapes(v); + std::vector result(shapes.size(), {1, 0}); + return add_to_cache(v, result, is_constant_); +} + +std::vector align::populate_is_constant(ir::value *v) { if(is_constant_.find(v) != is_constant_.end()) return is_constant_.at(v); - // helper for the cache - auto cache = [this,v](cst_info value){ - return add_to_cache(v, value, is_constant_); } - ; - // populate - if(auto *x = dynamic_cast(v)){ - ir::value *op = x->get_operand(0); - auto op_cst = populate_is_constant(op); - if(is_first_axis_unit(op)){ - unsigned num_cst = x->get_type()->get_tile_shapes()[0]; - return cache({num_cst, op_cst.value}); - } - } if(auto *x = dynamic_cast(v)) - return cache({true, (unsigned)x->get_value()}); - if(auto *x = dynamic_cast(v)){ - ir::value* lhs_op = x->get_operand(0); - ir::value* rhs_op = x->get_operand(1); - cst_info lhs = populate_is_constant(lhs_op); - cst_info rhs = populate_is_constant(rhs_op); - if(lhs.num_cst==0 && rhs.value && x->is_int_div()){ - unsigned max_contiguous = populate_max_contiguous(lhs_op); - // todo might not be entirely true - unsigned num_constants = gcd(max_contiguous, rhs.value); - return cache({num_constants, 0}); - } - return cache({std::min(lhs.num_cst, rhs.num_cst), 0}); - } - if(auto *x = dynamic_cast(v)){ - ir::value* lhs_op = x->get_operand(0); - ir::value* rhs_op = x->get_operand(1); - cst_info lhs = populate_is_constant(lhs_op); - cst_info rhs = populate_is_constant(rhs_op); - return cache({std::min(lhs.num_cst, rhs.num_cst), 0}); - } -// if(auto *x = dynamic_cast(v)){ -// cst_info value_true = populate_is_constant(x->get_value_true()); -// cst_info value_false = populate_is_constant(x->get_value_false()); -// return cache({std::min(value_true.num_cst, value_false.num_cst), 0}); -// } - if(v->get_type()->is_tile_ty()) - return cache({0, 0}); - if(auto *x = dynamic_cast(v)){ - // put a conservative initial value in phi node to avoid infinite recursion - unsigned result = 1; - for(unsigned n = 0; n < x->get_num_incoming(); n++){ - ir::value* inc = x->get_incoming_value(n); - if(is_constant_.find(inc) != is_constant_.end()) - result = 
is_constant_.at(inc).num_cst; - } - cache({result, 0}); - // recurse - for(unsigned n = 0; n < x->get_num_incoming(); n++){ - ir::value* inc = x->get_incoming_value(n); - result = std::min(result, populate_is_constant(inc).num_cst); - } - return cache({result, 0}); - } - // scalars are always constant in the contiguous dimension - // but value is not known at compile-time - return cache({1, 0}); + return add_to_cache(v, {cst_info{true, (unsigned)x->get_value()}}, is_constant_); + if(auto *x = dynamic_cast(v)) + return populate_is_constant_phi(x); + if(auto *x = dynamic_cast(v)) + return populate_is_constant_splat(x); + if(auto *x = dynamic_cast(v)) + return populate_is_constant_reshape(x); + if(auto *x = dynamic_cast(v)) + return populate_is_constant_broadcast(x); + if(auto *x = dynamic_cast(v)) + return populate_is_constant_binop(x); + if(auto *x = dynamic_cast(v)) + return populate_is_constant_gep(x); + return populate_is_constant_default(v); } -unsigned align::populate_max_contiguous(ir::value *v){ - if(max_contiguous_.find(v) != max_contiguous_.end()) - return max_contiguous_.at(v); - // helper for the cache - auto cache = [this,v](unsigned value){ return add_to_cache(v, value, max_contiguous_); }; - // populate - if(!v->get_type()->is_tile_ty()) - return cache(1); - auto shapes = v->get_type()->get_tile_shapes(); - if(dynamic_cast(v)){ - return cache(shapes[0]); + +/* + * max contiguous + */ + +std::vector align::populate_max_contiguous_phi(ir::phi_node* x) { + auto shapes = get_shapes(x); + std::vector result(shapes.size(), 1); + for(unsigned n = 0; n < x->get_num_incoming(); n++){ + ir::value* inc = x->get_incoming_value(n); + auto it = max_contiguous_.find(inc); + if(it != max_contiguous_.end()) + result = it->second; } - if(auto *x = dynamic_cast(v)){ - ir::value *op = x->get_operand(0); - if(op->get_type()->is_tile_ty()){ - auto op_shapes = op->get_type()->get_tile_shapes(); - if(op_shapes[0] == shapes[0]) - return cache(populate_max_contiguous(op)); + add_to_cache(x, result, max_contiguous_); + // recurse + for(unsigned n = 0; n < x->get_num_incoming(); n++){ + ir::value* inc = x->get_incoming_value(n); + auto contiguous = populate_max_contiguous(inc); + for(size_t d = 0; d < result.size(); d++) + result[d] = std::min(result[d], contiguous[d]); + } + return add_to_cache(x, result, max_contiguous_); + +} + +std::vector align::populate_max_contiguous_splat(ir::splat_inst* x) { + auto x_shapes = get_shapes(x); + std::vector result; + for(size_t d = 0; d < x_shapes.size(); d++) + result.push_back({1}); + return add_to_cache(x, result, max_contiguous_); +} + +std::vector align::populate_max_contiguous_reshape(ir::reshape_inst* x) { + auto shapes = get_shapes(x); + std::vector result; + ir::value *op = x->get_operand(0); + auto op_shapes = op->get_type()->get_tile_shapes(); + auto op_mc = populate_max_contiguous(op); + unsigned current = 0; + bool is_skewed = false; + for(size_t d = 0; d < shapes.size(); d ++){ + if(shapes[d] == 1) + result.push_back(1); + else if(!is_skewed + && shapes[d] == op_shapes[current]) + result.push_back(op_mc[current++]); + else { + is_skewed = true; + result.push_back(1); } - return cache(1); } - if(auto *x = dynamic_cast(v)){ - ir::value* lhs = x->get_operand(0); - ir::value* rhs = x->get_operand(1); - unsigned lhs_max_contiguous = populate_max_contiguous(lhs); - unsigned rhs_max_contiguous = populate_max_contiguous(rhs); - cst_info lhs_cst_info = populate_is_constant(lhs); - cst_info rhs_cst_info = populate_is_constant(rhs); - if(x->is_int_rem() && 
rhs_cst_info.value > 0) - return cache(std::min(lhs_max_contiguous, rhs_cst_info.value)); + return add_to_cache(x, result, max_contiguous_); +} + +std::vector align::populate_max_contiguous_broadcast(ir::broadcast_inst* x) { + auto shapes = get_shapes(x); + std::vector result; + ir::value *op = x->get_operand(0); + auto op_shapes = op->get_type()->get_tile_shapes(); + auto op_mc = populate_max_contiguous(op); + for(size_t d = 0; d < shapes.size(); d++) + if(op_shapes[d] == 1) + result.push_back(1); + else + result.push_back(op_mc[d]); + return add_to_cache(x, result, max_contiguous_); +} + +std::vector align::populate_max_contiguous_binop(ir::binary_operator* x) { + auto shapes = get_shapes(x); + ir::value* lhs = x->get_operand(0); + ir::value* rhs = x->get_operand(1); + auto lhs_max_contiguous = populate_max_contiguous(lhs); + auto rhs_max_contiguous = populate_max_contiguous(rhs); + auto lhs_cst_info = populate_is_constant(lhs); + auto rhs_cst_info = populate_is_constant(rhs); + std::vector result; + for(size_t d = 0; d < shapes.size(); d++){ + unsigned value = 1; + if(x->is_int_rem() && rhs_cst_info[d].value > 0) + value = std::min(lhs_max_contiguous[d], rhs_cst_info[d].value); if(x->is_int_mult()){ - if(rhs_cst_info.value == 1) - return cache(lhs_max_contiguous); - if(lhs_cst_info.value == 1) - return cache(rhs_max_contiguous); + unsigned lvalue = 1, rvalue = 1; + if(rhs_cst_info[d].value == 1) + lvalue = lhs_max_contiguous[d]; + if(lhs_cst_info[d].value == 1) + rvalue = rhs_max_contiguous[d]; + value = std::max(lvalue, rvalue); } if(x->is_int_add_sub()){ - if(lhs_cst_info.num_cst) - return cache(gcd(rhs_max_contiguous, lhs_cst_info.num_cst)); - if(rhs_cst_info.num_cst) - return cache(gcd(lhs_max_contiguous, rhs_cst_info.num_cst)); + unsigned lvalue = 1, rvalue = 1; + if(lhs_cst_info[d].num_cst) + lvalue = gcd(rhs_max_contiguous[d], lhs_cst_info[d].num_cst); + if(rhs_cst_info[d].num_cst) + rvalue = gcd(lhs_max_contiguous[d], rhs_cst_info[d].num_cst); + value = std::max(lvalue, rvalue); } + result.push_back(value); } -// if(auto *x = dynamic_cast(v)){ -// int value_true = populate_max_contiguous(x->get_value_true()); -// int value_false = populate_max_contiguous(x->get_value_false()); -// return cache(std::min(value_true, value_false)); -// } - if(auto *x = dynamic_cast(v)){ - ir::value* lhs = x->get_operand(0); - ir::value* rhs = x->get_operand(1); - unsigned lhs_max_contiguous = populate_max_contiguous(lhs); - unsigned rhs_max_contiguous = populate_max_contiguous(rhs); - auto lhs_cst_info = populate_is_constant(lhs); - auto rhs_cst_info = populate_is_constant(rhs); - if(lhs_cst_info.num_cst) - return cache(rhs_max_contiguous); - if(rhs_cst_info.num_cst) - return cache(lhs_max_contiguous); - } - if(auto *x = dynamic_cast(v)){ - // put a conservative initial value in phi node to avoid infinite recursion - unsigned result = 1; - for(unsigned n = 0; n < x->get_num_incoming(); n++){ - ir::value* inc = x->get_incoming_value(n); - if(max_contiguous_.find(inc) != max_contiguous_.end()) - result = max_contiguous_.at(inc); - } - cache(result); - // recurse - for(unsigned n = 0; n < x->get_num_incoming(); n++){ - ir::value* inc = x->get_incoming_value(n); - result = std::min(result, populate_max_contiguous(inc)); - } - return cache(result); - } - return cache(1); + return add_to_cache(x, result, max_contiguous_); } -unsigned align::populate_starting_multiple(ir::value *v){ - if(starting_multiple_.find(v) != starting_multiple_.end()) - return starting_multiple_.at(v); - auto cache = 
[this,v](unsigned value){ - return add_to_cache(v, value, starting_multiple_); - }; - // has metadata +std::vector align::populate_max_contiguous_gep(ir::getelementptr_inst* x) { + auto shapes = get_shapes(x); + ir::value* lhs = x->get_operand(0); + ir::value* rhs = x->get_operand(1); + auto lhs_max_contiguous = populate_max_contiguous(lhs); + auto rhs_max_contiguous = populate_max_contiguous(rhs); + auto lhs_cst_info = populate_is_constant(lhs); + auto rhs_cst_info = populate_is_constant(rhs); + std::vector result(shapes.size(), 1); + for(size_t d = 0; d < shapes.size(); d++){ + unsigned lvalue = 1, rvalue = 1; + if(lhs_cst_info[d].num_cst) + lvalue = rhs_max_contiguous[d]; + if(rhs_cst_info[d].num_cst) + rvalue = lhs_max_contiguous[d]; + result[d] = std::max(lvalue, rvalue); + } + return add_to_cache(x, result, max_contiguous_); +} + +std::vector align::populate_max_contiguous_default(ir::value* v) { + if(!v->get_type()->is_tile_ty()) + return add_to_cache(v, {1}, max_contiguous_); + auto shapes = v->get_type()->get_tile_shapes(); + if(dynamic_cast(v)) + return add_to_cache(v, {shapes[0]}, max_contiguous_); + return add_to_cache(v, std::vector(shapes.size(), 1), max_contiguous_); +} + +std::vector align::populate_max_contiguous(ir::value *v){ + if(max_contiguous_.find(v) != max_contiguous_.end()) + return max_contiguous_.at(v); + if(auto *x = dynamic_cast(v)) + return populate_max_contiguous_splat(x); + if(auto *x = dynamic_cast(v)) + return populate_max_contiguous_reshape(x); + if(auto *x = dynamic_cast(v)) + return populate_max_contiguous_broadcast(x); + if(auto *x = dynamic_cast(v)) + return populate_max_contiguous_binop(x); + if(auto *x = dynamic_cast(v)) + return populate_max_contiguous_gep(x); + if(auto *x = dynamic_cast(v)) + return populate_max_contiguous_phi(x); + return populate_max_contiguous_default(v); +} + + +/* + * starting multiple + */ + +std::vector align::populate_starting_multiple_splat(ir::splat_inst* x){ + auto shapes = get_shapes(x); + auto op = populate_starting_multiple(x->get_operand(0)); + std::vector result(shapes.size(), op[0]); + return add_to_cache(x, result, starting_multiple_); +} + +std::vector align::populate_starting_multiple_reshape(ir::reshape_inst* x){ + auto op = populate_starting_multiple(x->get_operand(0)); + auto op_shapes = get_shapes(x->get_operand(0)); + auto shapes = get_shapes(x); + std::vector result(shapes.size(), 1); + unsigned current = 0; + bool is_skewed = false; + for(size_t d = 0; d < shapes.size(); d ++){ + if(shapes[d] == 1) + result[d] = 1; + else if(!is_skewed + && shapes[d] == op_shapes[current]) + result[d] = op[current++]; + else { + is_skewed = true; + result[d] = 1; + } + } + return add_to_cache(x, result, starting_multiple_); +} + +std::vector align::populate_starting_multiple_broadcast(ir::broadcast_inst* x){ + auto result = populate_starting_multiple(x->get_operand(0)); + return add_to_cache(x, result, starting_multiple_); +} + +std::vector align::populate_starting_multiple_binop(ir::binary_operator* x){ + auto lhs = populate_starting_multiple(x->get_operand(0)); + auto rhs = populate_starting_multiple(x->get_operand(1)); + std::vector result(lhs.size(), 1); + for(size_t d = 0; d < lhs.size(); d++){ + if(x->is_int_mult()) + result[d] = lhs[d] * rhs[d]; + if(x->is_int_add_sub()) + result[d] = gcd(lhs[d], rhs[d]); + if(x->is_int_div()) + result[d] = std::max(lhs[d] / rhs[d], 1); + if(x->is_int_rem() && rhs[d] > 1) + result[d] = gcd(lhs[d], rhs[d]); + if(x->is_shl()) + result[d] = lhs[d] << rhs[d]; + if(x->is_shr()) + 
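// A hedged, made-up illustration of the rules implemented in this
// function: if every element of lhs is known to be a multiple of 8 and
// every element of rhs a multiple of 12, then lhs + rhs is guaranteed to
// be a multiple of gcd(8, 12) = 4, and lhs * rhs a multiple of 8 * 12 = 96.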
result[d] = std::max(lhs[d] >> rhs[d], 1); + } + return add_to_cache(x, result, starting_multiple_); +} + +std::vector align::populate_starting_multiple_gep(ir::getelementptr_inst* x){ + auto lhs = populate_starting_multiple(x->get_operand(0)); + auto rhs = populate_starting_multiple(x->get_operand(1)); + std::vector result(lhs.size(), 1); + for(size_t d = 0; d < lhs.size(); d++) + result[d] = gcd(lhs[d], rhs[d]); + return add_to_cache(x, result, starting_multiple_); +} + +std::vector align::populate_starting_multiple_phi(ir::phi_node* x){ + auto shape = get_shapes(x); + std::vector result(shape.size(), 1); + for(unsigned n = 0; n < x->get_num_incoming(); n++){ + ir::value* inc = x->get_incoming_value(n); + if(starting_multiple_.find(inc) != starting_multiple_.end()) + result = starting_multiple_.at(inc); + } + add_to_cache(x, result, starting_multiple_); + // recurse + for(unsigned n = 0; n < x->get_num_incoming(); n++){ + ir::value* inc = x->get_incoming_value(n); + auto sm = populate_starting_multiple(inc); + for(size_t d = 0; d < result.size(); d++) + result[d] = gcd(result[d], sm[d]); + } + return add_to_cache(x, result, starting_multiple_); +} + + +std::vector align::populate_starting_multiple_default(ir::value* v) { + ir::type* ty = v->get_type(); + if(ty->is_tile_ty()) { + return add_to_cache(v, ty->get_tile_shapes(), starting_multiple_); + } if(auto *x = dynamic_cast(v)){ unsigned multiple_of = x->get_metadata(ir::metadata::multiple_of); if(multiple_of > 0) - return cache(multiple_of); + return add_to_cache(x, {multiple_of}, starting_multiple_); } - // arguments if(auto *x = dynamic_cast(v)){ std::set attributes = x->get_parent()->get_attributes(x); for(auto attr: attributes){ - if(attr.get_kind() == ir::multiple_of) - return cache(attr.get_value()); + if(attr.get_kind() == ir::multiple_of){ + return add_to_cache(x, {attr.get_value()}, starting_multiple_); + } if(attr.get_kind() == ir::aligned){ ir::type* ty = x->get_type()->get_pointer_element_ty(); int nbits = ty->get_primitive_size_in_bits(); int nbytes = nbits / 8; - return cache(attr.get_value() / nbytes); + return add_to_cache(x, {attr.get_value() / nbytes}, starting_multiple_); } } } - if(auto *x = dynamic_cast(v)){ - int lhs = populate_starting_multiple(x->get_operand(0)); - int rhs = populate_starting_multiple(x->get_operand(1)); - if(x->is_int_mult()) - return cache(lhs * rhs); - if(x->is_int_add_sub()) - return cache(gcd(lhs, rhs)); - if(x->is_int_div()) - return cache(std::max(lhs / rhs, 1)); - if(x->is_int_rem() && rhs > 1) - return cache(gcd(lhs, rhs)); - if(x->is_shl()) - return cache(lhs << rhs); - if(x->is_shr()) - return cache(std::max(lhs >> rhs, 1)); - } - if(auto *x = dynamic_cast(v)){ - return cache(x->get_value()); - } - if(auto *x = dynamic_cast(v)){ - return cache(x->get_first()->get_value()); - } - if(dynamic_cast(v)){ - return cache(128); - } - if(auto *x = dynamic_cast(v)){ - return cache(x->get_range()->get_first()->get_value()); - } - if(auto *x = dynamic_cast(v)){ - int lhs = populate_starting_multiple(x->get_operand(0)); - int rhs = populate_starting_multiple(x->get_operand(1)); - return cache(gcd(lhs, rhs)); - } - if(auto *x = dynamic_cast(v)){ - int op = populate_starting_multiple(x->get_operand(0)); - return cache(op); - } - if(auto *x = dynamic_cast(v)){ - int op = populate_starting_multiple(x->get_operand(0)); - auto shapes = x->get_type()->get_tile_shapes(); - if(shapes[0] == 1) - return cache(1); - else - return cache(op); - } - if(auto *x = dynamic_cast(v)){ - int op = 
populate_starting_multiple(x->get_operand(0)); - return cache(op); - } - if(auto *x = dynamic_cast(v)){ - // put a conservative initial value in phi node to avoid infinite recursion - unsigned result = 1; - for(unsigned n = 0; n < x->get_num_incoming(); n++){ - ir::value* inc = x->get_incoming_value(n); - if(starting_multiple_.find(inc) != starting_multiple_.end()) - result = starting_multiple_.at(inc); - } - cache(result); - // recurse - for(unsigned n = 0; n < x->get_num_incoming(); n++){ - ir::value* inc = x->get_incoming_value(n); - result = gcd(result, populate_starting_multiple(inc)); - } - return cache(result); - } - // scalars - if(!v->get_type()->is_tile_ty()) - return cache(1); - // tiles - auto shapes = v->get_type()->get_tile_shapes(); - unsigned result = 1; - for(unsigned i = 0; i < shapes.size() - 1; i++) - result *= shapes[i]; - return cache(result); + return add_to_cache(v, {1}, starting_multiple_); +} + + +std::vector align::populate_starting_multiple(ir::value *v){ + if(starting_multiple_.find(v) != starting_multiple_.end()) + return starting_multiple_.at(v); + if(auto *x = dynamic_cast(v)) + return populate_starting_multiple_binop(x); + if(auto *x = dynamic_cast(v)) + return add_to_cache(x, {(unsigned)x->get_value()}, starting_multiple_); + if(auto *x = dynamic_cast(v)) + return add_to_cache(x, {(unsigned)x->get_first()->get_value()}, starting_multiple_); + if(auto *x = dynamic_cast(v)) + return add_to_cache(x, {128}, starting_multiple_); + if(auto *x = dynamic_cast(v)) + return add_to_cache(x, {(unsigned)x->get_range()->get_first()->get_value()}, starting_multiple_); + if(auto *x = dynamic_cast(v)) + return populate_starting_multiple_gep(x); + if(auto *x = dynamic_cast(v)) + return populate_starting_multiple_splat(x); + if(auto *x = dynamic_cast(v)) + return populate_starting_multiple_reshape(x); + if(auto *x = dynamic_cast(v)) + return populate_starting_multiple_broadcast(x); + if(auto *x = dynamic_cast(v)) + return populate_starting_multiple_phi(x); + return populate_starting_multiple_default(v); } unsigned align::get_starting_multiple(ir::value* v) const { - return starting_multiple_.at(v); + return starting_multiple_.at(v)[0]; } unsigned align::get_max_contiguous(ir::value* v) const { + return max_contiguous_.at(v)[0]; +} + +std::vector align::get_max_contiguous_vec(ir::value* v) const { return max_contiguous_.at(v); } @@ -297,7 +489,7 @@ void align::copy(ir::value *dst, ir::value *src) { is_constant_[dst] = is_constant_[src]; } -///TODO: This doesn't seem to work in DOT-NN, DOT-TT, DOT-TN + void align::run(ir::module &mod) { // populate constant for(ir::function *fn: mod.get_function_list()) @@ -316,13 +508,9 @@ void align::run(ir::module &mod) { // populate maximum contiguous for(ir::function *fn: mod.get_function_list()) for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *i: block->get_inst_list()) + for(ir::instruction *i: block->get_inst_list()){ populate_max_contiguous(i); - -// for(ir::function *fn: mod.get_function_list()) -// for(ir::basic_block *block: fn->blocks()) -// for(ir::instruction *i: block->get_inst_list()) -// std::cout << i->get_name() << " " << max_contiguous_.at(i) << " " << is_constant_.at(i).num_cst << " " << starting_multiple_.at(i) << std::endl; + } } diff --git a/lib/codegen/analysis/grid.cc b/lib/codegen/analysis/grid.cc index f90ab8822..bbd2940ec 100644 --- a/lib/codegen/analysis/grid.cc +++ b/lib/codegen/analysis/grid.cc @@ -76,16 +76,16 @@ void grids::init_c_graph(ir::instruction *v) { // Reshape if(dynamic_cast(v)) { 
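// For intuition (hypothetical shapes): reshaping a [16, 16] tile into
// [16, 1, 16] ties output axes 0 and 2 to operand axes 0 and 1 and pins
// the size-1 axis to a fixed parameter, whereas a reshape such as
// [16, 16] -> [4, 64] matches no operand axis, so the loop below marks it
// "skewed" and gives every output axis its own parameters.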
ir::value *op = v->get_operand(0); + auto op_shapes = op->get_type()->get_tile_shapes(); unsigned current = 0; bool is_skewed = false; for(unsigned i = 0; i < shapes.size(); i ++){ - bool is_one = shapes[i] == 1; - bool is_same = shapes[i] == op->get_type()->get_tile_shapes()[current]; - if(is_one){ + if(shapes[i] == 1){ static_params_.insert({{v, i}, 1}); add_constraint({v, i}, {v, i}); } - else if(!is_skewed && is_same) + else if(!is_skewed && + shapes[i] == op_shapes[current]) add_constraint({v, i}, {op, current++}); else{ is_skewed = true; @@ -130,13 +130,10 @@ void grids::init_c_graph(ir::instruction *v) { } // Element-wise else if(dynamic_cast(v)) { - for(unsigned k = 0; k < v->get_num_results(); k++){ - ir::value *result = v->get_result(k); - for(unsigned i = 0; i < shapes.size(); i ++){ - std::vector ops = v->ops(); - for(ir::value* op: ops) - add_constraint({result, i}, {op, i}); - } + for(unsigned i = 0; i < shapes.size(); i ++){ + std::vector ops = v->ops(); + for(ir::value* op: ops) + add_constraint({v, i}, {op, i}); } } } diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index ff246f4f5..a2a48300b 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -864,11 +864,7 @@ void selection::init_grids(ir::function *fn, IRBuilder<> &builder, Value *sh_mem std::map references; create_grids(grids, references, fn); for(ir::value* i: grids){ - if(auto *instr = dynamic_cast(i)) - for(unsigned r = 0; r < instr->get_num_results(); r++) - init_axes(instr->get_result(r), builder, u_thread_warp_id, u_warp_id); - else - init_axes(i, builder, u_thread_warp_id, u_warp_id); + init_axes(i, builder, u_thread_warp_id, u_warp_id); } // create tile std::set seen; @@ -876,8 +872,7 @@ void selection::init_grids(ir::function *fn, IRBuilder<> &builder, Value *sh_mem for(ir::instruction *i: block->get_inst_list()){ if(!i->get_type()->is_tile_ty()) continue; - for(unsigned r = 0; r < i->get_num_results(); r++) - create_tile(i->get_result(r), builder, references, seen, sh_mem_ptr); + create_tile(i, builder, references, seen, sh_mem_ptr); } } diff --git a/lib/driver/module.cc b/lib/driver/module.cc index 66c775ac6..85877f911 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -241,7 +241,7 @@ std::string cu_module::compile_llvm_module(std::unique_ptr module, cu_module::cu_module(driver::context * context, std::unique_ptr ll_module): cu_module(context, compile_llvm_module(std::move(ll_module), context->device())) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ -// std::cout << source << std::endl; + std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/lib/ir/instructions.cc b/lib/ir/instructions.cc index e7e5de1f2..343a59fbf 100644 --- a/lib/ir/instructions.cc +++ b/lib/ir/instructions.cc @@ -20,11 +20,6 @@ instruction::instruction(type *ty, unsigned num_ops, unsigned num_results, const auto it = std::find(block->begin(), block->end(), next); block->get_inst_list().insert(it, next); } - if(num_results == 1) - results_.push_back(this); - else - for(unsigned i = 0; i < num_results; i++) - results_.push_back(new result_reference(this, i)); } void instruction::erase_from_parent() { diff --git a/lib/ir/print.cc b/lib/ir/print.cc index 124091262..af2c68a2e 100644 --- a/lib/ir/print.cc +++ b/lib/ir/print.cc @@ -48,14 +48,8 @@ void 
print(module &mod, std::ostream& os) { os << std::endl; for(ir::instruction *inst: block->get_inst_list()){ os << " "; - unsigned num_results = inst->get_num_results(); - for(unsigned i = 0; i < num_results; i++){ - os << get_name(inst->get_result(i), cnt++); - if(i < num_results - 1) - os << ", "; - else - os << " = "; - } + os << get_name(inst, cnt++); + os << " = "; ir::type* type = inst->get_type(); os << inst->repr() << " " << type->repr(); ir::instruction::ops_t ops = inst->ops(); diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 114626dce..e7850e1c8 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -5,6 +5,7 @@ #include #include "triton/codegen/selection.h" #include "triton/runtime/function.h" +#include "triton/codegen/transform/reorder.h" #include "triton/lang/cpp.h" #include "triton/lang/parser.h" #include "triton/lang/code_gen.h" @@ -198,6 +199,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::analysis::liveness shmem_liveness(&shmem_info); codegen::analysis::memalloc shmem_allocation(&shmem_liveness, &shmem_info, &grids); codegen::analysis::align alignment_info; + codegen::transform::reorder reorder(&alignment_info); codegen::transform::membar shmem_barriers(&shmem_allocation, &shmem_info); codegen::transform::vectorize vectorize(&grids); codegen::transform::dce dce; @@ -208,6 +210,10 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c peephole.run(module); dce.run(module); alignment_info.run(module); + ir::print(module, std::cout); +// reorder.run(module); + dce.run(module); + ir::print(module, std::cout); grids.run(module); reassociate.run(module); dce.run(module); diff --git a/tests/common/src/copy.h b/tests/common/src/copy.h index 58651a84f..2a7dc0627 100644 --- a/tests/common/src/copy.h +++ b/tests/common/src/copy.h @@ -38,7 +38,7 @@ void copy2d(TYPE * X __noalias __readonly __aligned(16), int rm[TM] = ridm * TM + 0 ... TM; int rn[TN] = ridn * TN + 0 ... 
TN; TYPE* px[TM, TN] = X + rm[:, newaxis] + rn[newaxis, :] * ldx; - TYPE* py[TM, TN] = Y + rm[:, newaxis] + rn[newaxis, :] * ldy; + TYPE* py[TM, TN] = Y + rm[:, newaxis] * ldy + rn[newaxis, :]; *py = *px; } )"; From 3d78810d5efd43004143c48c24a4b63cf4055fe1 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 8 Sep 2019 21:29:40 -0400 Subject: [PATCH 368/494] more progress --- lib/codegen/analysis/grid.cc | 5 ++++- lib/codegen/selection.cc | 1 + lib/ir/instructions.cc | 2 ++ lib/runtime/function.cc | 7 +++++-- 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/lib/codegen/analysis/grid.cc b/lib/codegen/analysis/grid.cc index bbd2940ec..29d5c3657 100644 --- a/lib/codegen/analysis/grid.cc +++ b/lib/codegen/analysis/grid.cc @@ -157,7 +157,7 @@ grids::fragment_t grids::get_fragmentation_type(node_t x, graph_t &graph){ } void grids::connected_components(node_t x, const std::vector mps, const std::vector prefixes, std::set &nodes, graph_t &graph, unsigned group_id) { -// std::cout << "connected component: " << x.first->get_name() << " " << x.second << std::endl; + std::cout << "connected component: " << x.first->get_name() << " " << x.second << std::endl; groups_[x.first].insert({x.second, group_id}); if(nodes.find(x) != nodes.end()){ nodes.erase(x); @@ -229,10 +229,13 @@ void grids::run(ir::module &mod) { create_grids(grids_, references, fn); } + unsigned num_threads = get_num_threads(); auto clamp = [&](unsigned x, unsigned lo, unsigned hi) { return std::min(std::max(x, lo), hi); }; for(ir::value *i: grids_){ + std::cout << "grid: " << i->get_name() << std::endl; + if(!i->get_type()->is_tile_ty()) continue; auto shapes = i->get_type()->get_tile_shapes(); diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index a2a48300b..88664bb64 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -1436,6 +1436,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & } void selection::lower_instruction(ir::instruction *src, IRBuilder<> &builder) { + std::cout << src->get_name() << std::endl; if(src->has_tile_result_or_op()) { lower_tile_instruction(src, builder); } diff --git a/lib/ir/instructions.cc b/lib/ir/instructions.cc index 343a59fbf..ac595079d 100644 --- a/lib/ir/instructions.cc +++ b/lib/ir/instructions.cc @@ -731,6 +731,7 @@ barrier_inst* barrier_inst::create(context &ctx, const std::string &name, instru return new barrier_inst(ctx, name, next); } + // nv_dynamic_program_idx nv_dynamic_program_idx_inst::nv_dynamic_program_idx_inst(type *ty, const std::string &name, instruction *next) : instruction(ty, 0, 1, name, next) { } @@ -754,5 +755,6 @@ nv_static_program_idx* nv_static_program_idx::get(constant_range* range) { } + } } diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index e7850e1c8..3ddcc856f 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -199,7 +199,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::analysis::liveness shmem_liveness(&shmem_info); codegen::analysis::memalloc shmem_allocation(&shmem_liveness, &shmem_info, &grids); codegen::analysis::align alignment_info; - codegen::transform::reorder reorder(&alignment_info); + codegen::transform::reorder reorder(&alignment_info, &shmem_info); codegen::transform::membar shmem_barriers(&shmem_allocation, &shmem_info); codegen::transform::vectorize vectorize(&grids); codegen::transform::dce dce; @@ -211,7 +211,9 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c dce.run(module); 
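// Ordering note (inferred from the constructor arguments above): the
// reorder pass consumes the results of alignment_info and shmem_info, so
// both analyses must have run on the module before reorder.run() is
// called a few lines below.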
alignment_info.run(module); ir::print(module, std::cout); -// reorder.run(module); + if(target->is_gpu()) + shmem_info.run(module); + reorder.run(module); dce.run(module); ir::print(module, std::cout); grids.run(module); @@ -229,6 +231,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c dce.run(module); vectorize.run(module); dce.run(module); + ir::print(module, std::cout); // generate llvm code llvm::LLVMContext ctx; std::unique_ptr llvm(new llvm::Module(module.get_name(), ctx)); From 3daef1726d11e2253f406631c1ec1cf9ea27f3f8 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 8 Sep 2019 21:36:54 -0400 Subject: [PATCH 369/494] more progress --- include/triton/codegen/analysis/grid.h | 11 +++++++++-- lib/codegen/analysis/grid.cc | 12 +++++++----- lib/runtime/function.cc | 4 ++-- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/include/triton/codegen/analysis/grid.h b/include/triton/codegen/analysis/grid.h index 26331c786..84fd2168c 100644 --- a/include/triton/codegen/analysis/grid.h +++ b/include/triton/codegen/analysis/grid.h @@ -17,6 +17,11 @@ namespace ir{ } namespace codegen{ + +namespace transform{ +class reorder; +} + namespace analysis{ class grids { @@ -36,12 +41,12 @@ private: fragment_t get_fragmentation_type(node_t x, graph_t &graph); void connected_components(node_t x, const std::vector mps, const std::vector prefixes, std::set &nodes, graph_t &graph, unsigned group_id); void create_grids(std::vector &grids, - std::map &references, + std::map >, triton::ir::value *> &references, ir::function *fn); public: - grids(size_t num_warps); + grids(size_t num_warps, transform::reorder* reorder); ir::metaparameter* get_param(ir::value *value, const std::string &key) { return params_[value][key]; } unsigned get_param_group(ir::value *value, unsigned ax); fragment_t get_fragment(ir::value *value, unsigned ax) { return fragments_.at({value, ax}); } @@ -60,6 +65,8 @@ private: std::vector grids_; std::map> groups_; size_t num_warps_; + transform::reorder* reorder_; + }; diff --git a/lib/codegen/analysis/grid.cc b/lib/codegen/analysis/grid.cc index 29d5c3657..d7b773aaf 100644 --- a/lib/codegen/analysis/grid.cc +++ b/lib/codegen/analysis/grid.cc @@ -1,5 +1,6 @@ #include #include +#include "triton/codegen/transform/reorder.h" #include "triton/codegen/analysis/grid.h" #include "triton/ir/instructions.h" #include "triton/ir/type.h" @@ -15,7 +16,7 @@ namespace triton{ namespace codegen{ namespace analysis{ -grids::grids(size_t num_warps): num_warps_(num_warps) +grids::grids(size_t num_warps, transform::reorder *reorder): num_warps_(num_warps), reorder_(reorder) { } bool is_hmma(ir::value *v){ @@ -157,7 +158,6 @@ grids::fragment_t grids::get_fragmentation_type(node_t x, graph_t &graph){ } void grids::connected_components(node_t x, const std::vector mps, const std::vector prefixes, std::set &nodes, graph_t &graph, unsigned group_id) { - std::cout << "connected component: " << x.first->get_name() << " " << x.second << std::endl; groups_[x.first].insert({x.second, group_id}); if(nodes.find(x) != nodes.end()){ nodes.erase(x); @@ -225,7 +225,7 @@ void grids::run(ir::module &mod) { } for(ir::function *fn: mod.get_function_list()){ - std::map references; + std::map>, ir::value*> references; create_grids(grids_, references, fn); } @@ -317,7 +317,8 @@ void grids::run(ir::module &mod) { void grids::create_grids(std::vector &grids, - std::map &references, + std::map>, ir::value*> &references, ir::function *fn) { // get number of dimensions greater than 1 auto 
get_tile_gt1_dim = [&](ir::value *v){ @@ -331,6 +332,7 @@ void grids::create_grids(std::vector &grids, std::set seen; std::function bind_references = [&](ir::value *v) { + auto order = reorder_->get_order(v); // skip if(!v->get_type()->is_tile_ty() || !seen.insert(v).second) return; @@ -344,7 +346,7 @@ void grids::create_grids(std::vector &grids, if(shapes[d] == 1) continue; unsigned x = get_param_group(v, d); - ir::value *&r = references[x]; + ir::value *&r = references[{x, order}]; if(!r || get_tile_gt1_dim(v) > get_tile_gt1_dim(r)) r = v; } diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 3ddcc856f..f2aa8e4db 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -194,12 +194,12 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c std::unique_ptr target = context->device()->make_target(); // create passes - codegen::analysis::grids grids(opt.num_warps); codegen::analysis::meminfo shmem_info; codegen::analysis::liveness shmem_liveness(&shmem_info); - codegen::analysis::memalloc shmem_allocation(&shmem_liveness, &shmem_info, &grids); codegen::analysis::align alignment_info; codegen::transform::reorder reorder(&alignment_info, &shmem_info); + codegen::analysis::grids grids(opt.num_warps, &reorder); + codegen::analysis::memalloc shmem_allocation(&shmem_liveness, &shmem_info, &grids); codegen::transform::membar shmem_barriers(&shmem_allocation, &shmem_info); codegen::transform::vectorize vectorize(&grids); codegen::transform::dce dce; From 0cbbcce5c0e8cd356704da71cc7c876e15813e36 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 8 Sep 2019 21:38:08 -0400 Subject: [PATCH 370/494] added missing file --- include/triton/codegen/transform/reorder.h | 39 +++++++++ lib/codegen/transform/reorder.cc | 96 ++++++++++++++++++++++ 2 files changed, 135 insertions(+) create mode 100644 include/triton/codegen/transform/reorder.h create mode 100644 lib/codegen/transform/reorder.cc diff --git a/include/triton/codegen/transform/reorder.h b/include/triton/codegen/transform/reorder.h new file mode 100644 index 000000000..19bffab03 --- /dev/null +++ b/include/triton/codegen/transform/reorder.h @@ -0,0 +1,39 @@ +#ifndef TDL_INCLUDE_CODEGEN_OPTIMIZE_REORDER_H +#define TDL_INCLUDE_CODEGEN_OPTIMIZE_REORDER_H + +#include +#include + +namespace triton { + +namespace ir { + class module; + class value; +} + +namespace codegen{ + +namespace analysis{ + class align; + class meminfo; +} + +namespace transform{ + +class reorder { +public: + reorder(analysis::align* algin, analysis::meminfo* mem); + std::vector get_order(ir::value* v); + void run(ir::module &mod); + +private: + analysis::align* align_; + analysis::meminfo* mem_; + std::map> order_; +}; + +} +} +} + +#endif diff --git a/lib/codegen/transform/reorder.cc b/lib/codegen/transform/reorder.cc new file mode 100644 index 000000000..c5bc31d59 --- /dev/null +++ b/lib/codegen/transform/reorder.cc @@ -0,0 +1,96 @@ +#include +#include +#include +#include "triton/ir/function.h" +#include "triton/ir/cfg.h" +#include "triton/ir/basic_block.h" +#include "triton/ir/instructions.h" +#include "triton/ir/module.h" +#include "triton/codegen/analysis/meminfo.h" +#include "triton/codegen/analysis/align.h" +#include "triton/codegen/transform/reorder.h" + +namespace triton { +namespace codegen{ +namespace transform{ + +reorder::reorder(analysis::align* align, analysis::meminfo *mem) + : align_(align), mem_(mem) { } + +std::vector reorder::get_order(ir::value* v) { + std::cout << v->get_name() << std::endl; + return 
order_.at(v); +} + +void reorder::run(ir::module &mod) { + + std::set io; + + // initialize work-list + for(ir::function *fn: mod.get_function_list()) + for(ir::basic_block *block: ir::cfg::reverse_post_order(fn)) + for(ir::instruction *i: block->get_inst_list()){ + if(auto *x = dynamic_cast(i)) { + ir::type* ptr_ty = x->get_pointer_operand()->get_type(); + if(ptr_ty->is_tile_ty()) + io.insert(x); + std::vector order(ptr_ty->get_tile_shapes().size()); + std::iota(order.begin(), order.end(), 0); + order_[i] = order; + } + } + + ir::builder &builder = mod.get_builder(); + for(ir::io_inst *i: io) { + ir::value *ptr = i->get_pointer_operand(); + auto max_contiguous = align_->get_max_contiguous_vec(ptr); + std::vector order(max_contiguous.size()); + std::iota(order.begin(), order.end(), 0); + std::sort(order.begin(), order.end(), [&](unsigned a, unsigned b) { return max_contiguous[a] > max_contiguous[b]; } ); + std::list work_list; + if(order != order_[i]){ + work_list.push_back(i); + } + // rematerialize recursively + while(!work_list.empty()) { + ir::instruction* current = work_list.back(); + order_[current] = order; + work_list.pop_back(); + for(ir::value *op: current->ops()) { + ir::instruction* i_op = dynamic_cast(op); + if(!i_op) + continue; + ir::type *ty = i_op->get_type(); + if(!ty->is_tile_ty()) + continue; + auto& inst_list = i_op->get_parent()->get_inst_list(); + auto it = std::find(inst_list.begin(), inst_list.end(), i_op); + it++; + builder.set_insert_point(it); + // found a load; write to shared memory and stop recursion + ir::instruction *n_op = nullptr; + if(mem_->is_shared(i_op)){ + continue; + } + if(auto* ld = dynamic_cast(i_op)) { + n_op = ir::copy_to_shared_inst::create(ld); + } + // not a load; rematerialize and recurse + else { + n_op = i_op->clone(); + work_list.push_back(n_op); + } + n_op = builder.insert(n_op); + order_[n_op] = order; + align_->copy(n_op, i_op); + current->replace_uses_of_with(i_op, n_op); + } + } + + } +} + + +} +} +} From 4a69af08e77c5fe59f62a38a6ea24b7d86de7630 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 9 Sep 2019 02:29:18 -0400 Subject: [PATCH 371/494] [documentation] added README.md and first part of the Triton-C tutorial --- README.md | 36 ++++++++++++ docs/pytriton.md | 0 docs/triton-c.md | 149 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 185 insertions(+) create mode 100644 README.md create mode 100644 docs/pytriton.md create mode 100644 docs/triton-c.md diff --git a/README.md b/README.md new file mode 100644 index 000000000..e864a1e18 --- /dev/null +++ b/README.md @@ -0,0 +1,36 @@ +# Triton + +This is the development repository of Triton, a language and compiler for writing highly efficient custom Deep-Learning primitives. + +The formal foundations of this project are described in the following MAPL2019 publication: [Triton: An Intermediate Language and Compiler for Tiled Neural Network Computations](http://www.eecs.harvard.edu/~htk/publication/2019-mapl-tillet-kung-cox.pdf). Please cite us if you use our work! + + +The main features of Triton at the moment are: + +- Triton-C: An imperative, single-threaded language for writing highly efficient compute-kernels at a relatively high abstraction level using numpy-like extensions of the C language. + +- PyTriton: A Python API for writing custom operations for Triton-C compute-kernels. PyTriton automatically generates and just-in-time Tensorflow and PyTorch bindings. 
+
+- Triton-JIT: An optimizing just-in-time compiler for Triton-C, which generates GPU code on par with state-of-the-art CUDA-C (e.g., [CUTLASS](https://github.com/NVIDIA/cutlass)) and PTX (e.g., [ISAAC](https://github.com/ptillet/isaac)). This includes transparent support for mixed-precision and Tensor Cores.
+
+
+
+## Installation
+
+Triton is a fairly self-contained package and uses its own parser (forked from [wgtcc](https://github.com/wgtdkp/wgtcc)) and LLVM code-generator. However, at the moment it still relies on LLVM-8.0+ for PTX code generation.
+
+```
+sudo apt-get install llvm-8-dev
+git clone https://github.com/ptillet/triton.git;
+cd triton/python/;
+python setup.py develop;
+cd examples;
+python dot.py
+```
+
+## Tutorials
+
+- [Write your own custom kernel using Triton-C](https://github.com/ptillet/triton/blob/master/docs/triton-c.md)
+- [Write your own custom Deep Learning op using PyTriton](https://github.com/ptillet/triton/blob/master/docs/pytriton.md)
+
+
diff --git a/docs/pytriton.md b/docs/pytriton.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/docs/triton-c.md b/docs/triton-c.md
new file mode 100644
index 000000000..e9f7b9d75
--- /dev/null
+++ b/docs/triton-c.md
@@ -0,0 +1,149 @@
+# The Triton-C Programming Language
+
+## Table of Contents
+1. [Motivations](#motivations)
+2. [Vector Addition](#vector-addition)
+    1. [Differences over CUDA](#differences-with-cuda)
+    2. [Advantages over CUDA](#advantages-over-cuda)
+        1. [Vectorization](#vectorization)
+        2. [Parameterization](#parameterization)
+        3. [Auto-Tuning](#auto-tuning)
+3. [Matrix Transposition](#matrix-transposition)
+4. [Matrix Multiplication](#matrix-multiplication)
+
+
+## Motivations
+
+The semantics of arrays in C/C++ is similar to that of pointers. In other words, there is no way to manipulate statically shaped multi-dimensional arrays (beyond initialization) as a whole without resorting to third-party libraries, as shown below:
+
+```c
+float x[16][8] = {3.14};
+float y[16][8] = {5.17};
+// z = x + y
+float z[16][8];
+for(int i = 0; i < 16; i++)
+  for(int j = 0; j < 8; j++)
+    z[i][j] = x[i][j] + y[i][j];
+```
+
+As mentioned above, this issue can be mitigated through the use of third-party libraries:
+
+```c
+matrix<float, 16, 8> x = {3.14};
+matrix<float, 16, 8> y = {5.17};
+matrix<float, 16, 8> z = x + y;
+```
+
+Here, we have a simple one-liner that will tell your C++ compiler to generate the above nested loop and check that the shapes match. This is better, but there are still some important issues with this approach:
+
+- The syntax could be better.
+
+- The compiler will now see a bunch of nested loops. Don't get me wrong, compilers have gotten really good at optimizing these (especially using polyhedral compilation), but they're still not at the point where they can automatically distribute them between CUDA threads and achieve performance on par with expert-tuned code (see the sketch below).
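To make the second point concrete, here is a rough, hypothetical sketch of such a third-party `matrix` class (real libraries like Eigen additionally use expression templates to fuse loops and avoid temporaries, but the upshot is the same: a plain loop nest that the C++ compiler must then optimize on its own):

```c++
#include <cstddef>

// Hypothetical fixed-shape matrix with an overloaded operator+.
template<typename T, size_t M, size_t N>
struct matrix {
  T data[M][N];
  matrix() = default;
  explicit matrix(T v) {
    for (size_t i = 0; i < M; i++)
      for (size_t j = 0; j < N; j++)
        data[i][j] = v;
  }
};

// The "one-liner" z = x + y expands to exactly this loop nest.
template<typename T, size_t M, size_t N>
matrix<T, M, N> operator+(const matrix<T, M, N>& a, const matrix<T, M, N>& b) {
  matrix<T, M, N> c;
  for (size_t i = 0; i < M; i++)
    for (size_t j = 0; j < N; j++)
      c.data[i][j] = a.data[i][j] + b.data[i][j];
  return c;
}

int main() {
  matrix<float, 16, 8> x(3.14f), y(5.17f);
  matrix<float, 16, 8> z = x + y;  // shape mismatches fail to compile
  (void)z;
}
```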
+
+Triton-C addresses these issues by (a) adding syntax and semantics for numerical array operations to the C language; and (b) relying on an LLVM-like IR -- Triton-IR -- which supports array operations natively. The set of optimizations done by Triton-JIT on Triton-IR is beyond the scope of this tutorial, but you can learn more about it [here](http://www.eecs.harvard.edu/~htk/publication/2019-mapl-tillet-kung-cox.pdf).
+
+The above code then becomes the following Triton-C:
+```c
+float x[16, 8] = 3.14;
+float y[16, 8] = 5.17;
+// float z[8, 8] = x + y; // doesn't compile -- incompatible shapes!
+float z[16, 8] = x + y;
+```
+
+Of course, we can do much more than additions: matrix-multiplication, transposition, numpy-style broadcasting ... all of these array operations are built into Triton-C.
+
+_Note: You might be thinking that this is exactly what [MLIR](https://github.com/tensorflow/mlir) was made for... and you're right! You can think of Triton-IR as a dialect for MLIR, and Triton-C as a frontend for it. If you're interested in making this a thing, let me know._
+
+## Vector Addition
+
+### Differences with CUDA
+
+Let's look at a really simple example to get started. Vector addition in its most trivial Triton-C implementation is written as follows:
+
+```c
+// launched on a grid of (N / 32) programs of 1 thread each
+__global__ void add(int N, float *a, float *b, float* c) {
+  int id = get_program_id(0);
+  int off[32] = id * 32 + (0 ... 32);
+  *(c + off) = *(a + off) + *(b + off);
+}
+```
+For reference, here is an equivalent CUDA kernel (the resulting PTX code will be identical):
+
+```c
+// launched on a grid of (N / 32) programs of 32 threads each
+__global__ void add(int N, float *a, float *b, float *c) {
+  int off = blockIdx.x * 32 + threadIdx.x;
+  c[off] = a[off] + b[off];
+}
+```
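To make the tile semantics concrete, here is a plain C++ model of what a single Triton program instance computes. This is not how Triton actually executes the kernel -- the compiler maps the tile onto threads -- just a scalar emulation of its semantics:

```c++
#include <cstdio>
#include <vector>

// Scalar emulation of ONE Triton program instance of the kernel above:
// `off` is a whole tile of 32 indices, and the loads/stores are tile-wide.
void add_program(int id, const float* a, const float* b, float* c) {
  int off[32];
  for (int k = 0; k < 32; k++) off[k] = id * 32 + k;  // off = id*32 + (0 ... 32)
  for (int k = 0; k < 32; k++) c[off[k]] = a[off[k]] + b[off[k]];
}

int main() {
  const int N = 128;
  std::vector<float> a(N, 1.f), b(N, 2.f), c(N, 0.f);
  for (int id = 0; id < N / 32; id++)  // the "grid" of programs
    add_program(id, a.data(), b.data(), c.data());
  std::printf("%f\n", c[0]);  // 3.0
}
```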
In general, 128-bit memory transactions are favored, leading to the following kernel:
+```c
+// launched on a grid of (N / 128) programs of 32 threads each
+__global__ void add(int N, float4 *a, float4 *b, float4 *c) {
+  int off = blockIdx.x * 32 + threadIdx.x;
+  c[off] = a[off] + b[off];
+}
+```
+Or, for half-precision inputs:
+```c
+// launched on a grid of (N / 256) programs of 32 threads each
+__global__ void add(int N, half8 *a, half8 *b, half8 *c) {
+  int off = blockIdx.x * 32 + threadIdx.x;
+  c[off] = a[off] + b[off];
+}
+```
+
+Now this is a bit annoying, because as a programmer you have to keep track of not only the ideal vector size for each data-type (which might change in future GPU architectures), but also of how many elements are processed in each thread-block -- and adjust the grid size of the kernel accordingly!
+
+In Triton-C, this is not a problem: the compiler automatically figures out when vectorization should or should not be used, without any change to the source-code being necessary.
+
+#### Parameterization
+
+It turns out that the Triton compiler would refuse to vectorize our code, because our array of 32 pointers would then have to be distributed over 8 threads, which is less than a warp. Fortunately, this problem can be easily solved using preprocessor directives:
+```c
+// launched on a grid of (N / SIZE) programs of 1 thread each
+__global__ void add(int N, TYPE* a, TYPE* b, TYPE* c) {
+  int id = get_program_id(0);
+  int off[SIZE] = id * SIZE + (0 ... SIZE);
+  *(c + off) = *(a + off) + *(b + off);
+}
+// Not vectorized when compiled with -DSIZE=32 -DTYPE=float
+// 4-Vectorized when compiled with -DSIZE=128 -DTYPE=float
+// 8-Vectorized when compiled with -DSIZE=256 -DTYPE=half
+```
+Now, `TYPE` and `SIZE` are preprocessor macros which can be specified at compile-time, thereby giving the Triton compiler enough information to vectorize when beneficial without requiring any additional code modification.
+
+
+#### Auto-Tuning
+
+So now we have a parameter, `SIZE`, whose optimal value depends not only on the data-type being used but also on the size `N` of the input vectors. Fortunately, the Triton preprocessor also accepts a list of possible definitions for macros, in which case an auto-tuning procedure will be launched every time new input sizes are encountered.
+
+In other words, compiling the above kernel with the option `-DSIZE=[32, 64, 128, 256] -DTYPE=float`
+will result in the parameter `SIZE` being automatically tuned every time a new value of `N` is encountered.
+
+_Note: Tuning our reference CUDA kernel would be much more cumbersome, as template metaprogramming would be required to ensure that the proper vector types are used._

From 433b08b39bf833a7dd4e0000188daf0eb1276d52 Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Mon, 9 Sep 2019 02:38:23 -0400
Subject: [PATCH 372/494] [documentation] added [coming soon...] for tutorials in progress

---
 README.md | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index e864a1e18..c65aa8680 100644
--- a/README.md
+++ b/README.md
@@ -6,12 +6,11 @@ The formal foundations of this project are described in the following MAPL2019 p

 The main features of Triton at the moment are:

+- **PyTriton**: A Python API for writing custom operations for Triton-C compute-kernels. PyTriton automatically generates and just-in-time compiles Tensorflow and PyTorch bindings.
+- **Triton-C**: An imperative, single-threaded language for writing highly efficient compute-kernels at a relatively high abstraction level using numpy-like extensions of the C language. +- **Triton-IR**: An intermediate-representation for optimizing multi-dimensional array operations in linear algebra programs +- **Triton-JIT**: An optimizing just-in-time compiler for Triton-C, which generates GPU code on par with state-of-the-art CUDA-C (e.g., [CUTLASS](https://github.com/NVIDIA/cutlass)) and PTX (e.g., [ISAAC](https://github.com/ptillet/isaac)). This includes transparent support for mixed-precision and Tensor Cores. -- Triton-C: An imperative, single-threaded language for writing highly efficient compute-kernels at a relatively high abstraction level using numpy-like extensions of the C language. - -- PyTriton: A Python API for writing custom operations for Triton-C compute-kernels. PyTriton automatically generates and just-in-time Tensorflow and PyTorch bindings. - -- Triton-JIT: An optimizing just-in-time compiler for Triton-C, which generates GPU code on par with state-of-the-art CUDA-C (e.g., [CUTLASS](https://github.com/NVIDIA/cutlass)) and PTX (e.g., [ISAAC](https://github.com/ptillet/isaac)). This includes transparent support for mixed-precision and Tensor Cores. @@ -30,7 +29,8 @@ python dot.py ## Tutorials -- [Write your own custom kernel using Triton-C](https://github.com/ptillet/triton/blob/master/docs/triton-c.md) -- [Write your own custom Deep Learning op using PyTriton](https://github.com/ptillet/triton/blob/master/docs/pytriton.md) - +- The PyTriton API (coming soon...) +- [The Triton-C language](https://github.com/ptillet/triton/blob/master/docs/triton-c.md) +- The Triton-IR representation (coming soon...) +- The Triton-JIT compiler (coming soon...) From b953051eee6898f0eda178bedddf1382a2b3ab84 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 9 Sep 2019 04:09:23 -0400 Subject: [PATCH 373/494] [documentation] improved wording of triton-c tutorial --- docs/triton-c.md | 92 +++++++++++++++++++++++++++++------------------- 1 file changed, 56 insertions(+), 36 deletions(-) diff --git a/docs/triton-c.md b/docs/triton-c.md index e9f7b9d75..04fd7bde2 100644 --- a/docs/triton-c.md +++ b/docs/triton-c.md @@ -1,9 +1,11 @@ # The Triton-C Programming Language -## Table of Contents +## Table of Contents 1. [Motivations](#motivations) + 1. [Issues of C/C++ for Linear Algebra](#issues-c-c++) + 2. [Design Philosophy of Triton-C](#design-philosophy) 2. [Vector Addition](#vector-addition) - 1. [Differences over CUDA](#differences-with-cuda) + 1. [Differences with CUDA](#differences-with-cuda) 2. [Advantages over CUDA](#advantages-over-cuda) 1. [Vectorization](#vectorization) 2. [Parameterization](#parameterization) @@ -12,9 +14,11 @@ 4. [Matrix Multiplication](#matrix-multiplication) -## Motivations +## Motivations -The semantics of arrays in C/C++ is similar to that of pointers. In other way, there is no way to manipulate statically shaped multi-dimensional arrays (beyond initialization) as a whole without resorting to third-party libraries, as shown below: +## Issues of C/C++ for Linear Algebra + +In C and C++, arrays and pointers have similar semantics. 
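+As a quick illustration of this equivalence (plain standard C, nothing Triton-specific):
+```c
+float a[16][8];
+float (*p)[8] = a; // the array name decays to a pointer to its first row
+a[2][3] = 1.0f;    // afterwards, p[2][3] and a[2][3] refer to the same element
+```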
Indeed, there is no way to manipulate statically shaped multi-dimensional arrays (beyond initialization) as a whole without resorting to third-party libraries:
 
 ```c
 float x[16][8] = {3.14};
@@ -26,7 +30,7 @@ for(int i = 0; i < 16; i ++)
     z[i][j] = x[i][j] + y[i][j];
 ```
 
-As mentioned above, this issue can be mitigated through the use of third-party libraries:
+This issue can be somewhat mitigated using template metaprogramming in C++:
 
 ```c
 matrix<float, 16, 8> x = {3.14};
@@ -34,31 +38,41 @@ matrix<float, 16, 8> y = {5.17};
 matrix<float, 16, 8> z = x + y;
 ```
 
-Here, we have a simple one-liner that will tell your C++ compiler to generate the above nested loop and check that the shapes match. This is better, but there are still some important issues with this approach:
+This is better, but there are still some important issues with this approach:
 
-- The syntax could be better.
+- The syntax could be better, especially when it comes to broadcasting and reshaping.
 
-- The compiler will now see a bunch of nested loops. Don't get me wrong, compilers have gotten really good at optimizing these (especially using polyhedral compilation), but they're still not at the point where they can automatically distribute them between CUDA threads and achieve performance on par with expert-tuned code.
+- Data-flow information for array operations does not propagate beyond the program's AST, thereby making it difficult for compilers to optimize moderately complicated array programs (e.g., matrix multiplication). This can be worked around using heavy metaprogramming techniques (see [CUTLASS](https://github.com/NVIDIA/cutlass)), but even then programmers still have to allocate and synchronize shared memory manually and endure prohibitively long compilation procedures not easily amenable to auto-tuning.
 
-Triton-C addresses these issues by (a) adding syntax and semantics for numerical array operations to the C language; and (b) relying on an LLVM-like IR -- Triton-IR -- which supports array operations natively. The set of optimizations done by Triton-JIT on Triton-IR is beyond the scope of this tutorial, but you can learn more about it [there](http://www.eecs.harvard.edu/~htk/publication/2019-mapl-tillet-kung-cox.pdf).
+For these reasons, most Deep-Learning frameworks still rely heavily on highly optimized subroutines (e.g., BLAS), which makes the development of novel custom primitives time-consuming for experts and challenging for others. This is where Triton comes into play.
 
-The above code then becomes the following Triton-C:
+## Design Philosophy of Triton-C
+
+The purpose of Triton is to bring native support for efficient numerical multi-dimensional array operations into a standard procedural language. We achieve this through:
+
+* **Triton-C**: Syntactic and semantic extensions to the C language. In particular, native support for reshaping, broadcasting, matrix-multiplication, transposition, etc. This is the object of this tutorial.
+
+* **Triton-IR**: An LLVM-like IR for array operations, as well as various optimization passes (automatic memory coalescing, automatic vectorization, shared memory allocation/synchronization, tensor core instruction selection, etc.). Although our system generates Triton-IR programs from Triton-C source-code, this is beyond the scope of this tutorial. More information can be found [here](http://www.eecs.harvard.edu/~htk/publication/2019-mapl-tillet-kung-cox.pdf).
+ +Anyway, the Triton-C code corresponding to the above matrix addition operation can be written and extended as follows: ```c float x[16, 8] = 3.14; float y[16, 8] = 5.17; -// float z[8, 8] = x + y // doesn't compile -- incompatible shapes! +// float z[8, 8] = x + y; // doesn't compile -- incompatible shapes! float z[16, 8] = x + y; +float u[16] = z[:, +]; // sum along the second axis +float v[16, 32] = u[:, newaxis]; // broadcasting along the second axis ``` -Of course, we can do much more than additions: matrix-multiplication, transposition, numpy-style broadcasting ... all of these array operations are built into Triton-C. +Of course, we can do much more than additions, reduction and broadcasting. The purpose of this tutorial is to walk you through all the features of Triton-C, and eventually show you how it can be used to build auto-tuned matrix-multiplication kernels on par with state-of-the-art CUDA-C implementation in less than an afternoon. _Note: You might be thinking that this is exactly what [MLIR](https://github.com/tensorflow/mlir) was made for... and you're right! You can think of Triton-IR as a dialect for MLIR, and Triton-C as a frontend for it. If you're interested in making this a thing, let me know._ -## Vector Addition +## Vector Addition -### Differences with CUDA +### Differences with CUDA -Let's look at a really simple example to get started. Vector addition in its most trivial Triton-C implementation is written as follows: +Let's start it off by looking at a simple example. Vector addition, in its most trivial Triton-C implementation, can be written as follows: ```c // launched on a grid of (N / 32) programs of 1 thread each @@ -68,7 +82,7 @@ __global__ void add(int N, float *a, float *b, float* c) { *(c + off) = *(a + off) + *(b + off) } ``` -For reference, here is an equivalent CUDA kernel (the resulting PTX code will be identical): +For reference, here is an equivalent CUDA kernel (nvcc will generate the same PTX code as triton-jit on the above code): ```c // launched on a grid of (N / 32) programs of 32 threads each @@ -78,30 +92,30 @@ __global__ void add(int N, float *a, float *b, float *c) { } ``` -There are two main differences between the Triton-C kernel and the CUDA-C kernel on this simple example: +As you can see, there are three main differences between our Triton-C kernel and the equivalent CUDA-C: -- **The programming model is different**. -While Triton-C and CUDA-C both use a Single-Program, Multiple-Data (SPMD) programming model, each Triton-C kernel is single-threaded (and automatically parallelized). Therefore, `get_program_id({0, 1, 2})` is equivalent to `blockIdx.{x, y, z}` and there is no such thing as `blockDim` and `threadIdx`. +- **The programming model is different**. +While Triton-C and CUDA-C both use a Single-Program, Multiple-Data (SPMD) programming model, each Triton-C kernel is single-threaded. + Therefore, `get_program_id({0, 1, 2})` is equivalent to `blockIdx.{x, y, z}`, but there is no such thing as `blockDim` and `threadIdx`. -- **The semantics of arrays is different** -In the above Triton-C kernel, `off` is an array of 32 consecutive integers: `int off[32] = {id * 32 + 0, id * 32 + 1, ..., id * 32 + 31}`. - - As a result, the statement: `c + off` implicitly broadcast `c` and creates an array of 32 pointers. 
This could also be done explicitly as follows:
 ```
 float* c_broadcast[32] = c;
 float* c_ptr[32] = c_broadcast + off; // c_ptr = c + off
 ```
 
 - **The semantics of the subscript operator is different**.
-In C/CUDA-C, subscripting can be used to offset and dereference a pointer, but in Triton-C it can only be used to index and broadcast an array (think NumPy).
+In C/CUDA-C, subscripting can be used to offset and dereference a pointer, but in Triton-C it can only be used to index and broadcast an array (think NumPy).
 
-### Advantages over CUDA
+### Advantages over CUDA
 
-The above example does not exactly show any practical benefits for Triton, but its advantages over CUDA should become more and more obvious as this tutorial progresses. In this subsection, we show how Triton can be used to optimize vector additions by automatically taking care of load/store vectorization and auto-tuning.
+At this point, the advantages of Triton-C over CUDA may not be obvious. But they should become clearer and clearer as this tutorial progresses. First and foremost, the purpose of this subsection is to show how Triton can be used to optimize vector additions by automatically taking care of load/store vectorization, code parameterization and auto-tuning -- all of which require nontrivial implementation efforts in CUDA.
 
-#### Vectorization
+#### Vectorization
 
-On some hardware architectures, vectorizing I/O operations can lead to better memory utilization and, in turn, noticeable performance gains. In general, 128-bit memory transactions are favored, leading to the following kernel:
+On some hardware architectures, vectorizing load/store operations can lead to better memory utilization and, in turn, noticeable performance gains. Since a warp consists of 32 threads, a scalar `float` load moves 32 x 4 = 128 bytes per load instruction, whereas a `float4` load moves 32 x 16 = 512 bytes -- four times fewer instructions for the same amount of data. In general, 128-bit memory transactions are favored, leading to the following CUDA kernel:
 ```c
 // launched on a grid of (N / 128) programs of 32 threads each
 __global__ void add(int N, float4 *a, float4 *b, float4 *c) {
@@ -118,13 +132,13 @@ __global__ void add(int N, half8 *a, half8 *b, half8 *c) {
 }
 ```
 
-Now this is a bit annoying, because as a programmer you have to keep track of not only the ideal vector size for each data-type (which might change in future GPU architectures), but also of how many elements are processed in each thread-block -- and adjust the grid size of the kernel accordingly!
+Now this is a bit annoying, because as a programmer you have to keep track of not only the ideal vector size for each data-type (which might change in future GPU architectures), but also of how many elements are processed in each thread-block -- and adjust the grid size of the kernel accordingly! Not to mention that you may want to tune the thread-block size as well.
 
-In Triton-C, this is not a problem: the compiler automatically figures out when vectorization should or should not be used, without any change to the source-code being necessary.
+In Triton-C, this is not a problem, as the compiler will automatically figure out when and where vectorization should be used, without any change to the source-code being necessary.
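+To make the contrast concrete, below is a sketch of the host-side bookkeeping that the CUDA variants above would require. This launcher is hypothetical -- `launch_add` and its constants are made up for illustration -- but its arithmetic matches the `float4` kernel above:
+```c
+// Hypothetical host-side launcher for the float4 variant above.
+// The vector width, block size and grid size must be kept in sync by hand.
+void launch_add(int N, float *a, float *b, float *c) {
+  const int threads = 32;              // threads per block
+  const int vec = 4;                   // floats per 128-bit transaction
+  const int per_block = threads * vec; // elements processed per block
+  add<<<N / per_block, threads>>>(N, (float4*)a, (float4*)b, (float4*)c);
+}
+```
+Changing the data-type to `half` would require editing both `vec` and every cast -- exactly the sort of bookkeeping that Triton-C eliminates.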
-#### Parameterization
+#### Parameterization
 
-It turns out that the Triton compiler would refuse to vectorize our code, because our array of 32 pointers would then have to be distributed over 8 threads, which is less than a warp. Fortunately, this problem can be easily solved using preprocessor directives:
+Specifically, the Triton compiler would refuse to 4-way vectorize our above compute kernel, because this would require the array `int off[32]` to be distributed over 8 threads, which is less than a warp. Fortunately, this problem can be easily solved using preprocessor directives to _parameterize_ our kernel:
 ```c
 // launched on a grid of (N / SIZE) programs of 1 thread each
 __global__ void add(int N, TYPE* a, TYPE* b, TYPE* c) {
@@ -139,11 +153,17 @@ __global__ void add(int N, TYPE* a, TYPE* b, TYPE* c) {
 Now, `TYPE` and `SIZE` are preprocessor macros which can be specified at compile-time, thereby giving the Triton compiler enough information to vectorize when beneficial without requiring any additional code modification.
 
-#### Auto-Tuning
+#### Auto-Tuning
 
-So now we have a parameter, `SIZE`, whose optimal value depends not only on the data-type being used but also on the size `N` of the input vectors. Fortunately, the Triton preprocessor also accepts a list of possible definitions for macros, in which case an auto-tuning procedure will be launched every time new input sizes are encountered.
-
-In other words, compiling the above kernel with the option `-DSIZE=[32, 64, 128, 256] -DTYPE=float`
+As it turns out, different input vector lengths `N` may require different values of `SIZE` to perform optimally. Fortunately, the Triton preprocessor also accepts lists of possible definitions for macros, in which case an auto-tuning procedure will be launched every time new input sizes are encountered. For example, compiling the above kernel with the option `-DSIZE=[32, 64, 128, 256] -DTYPE=float`
 will result in the parameter `SIZE` being automatically tuned every time a new value of `N` is encountered.
 
 _Note: Tuning our reference CUDA kernel would be much more cumbersome, as template metaprogramming would be required to ensure that the proper vector types are used._
+
+
+## Matrix Transposition
+
+
+## Matrix Multiplication
+
+## Next Steps

From 7d3fb6c390f595c289a7729f7379c74ab31caccd Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Mon, 9 Sep 2019 19:02:57 -0400
Subject: [PATCH 374/494] [documentation] updated triton-c tutorial

---
 docs/triton-c.md | 233 ++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 229 insertions(+), 4 deletions(-)

diff --git a/docs/triton-c.md b/docs/triton-c.md
index 04fd7bde2..bb453a227 100644
--- a/docs/triton-c.md
+++ b/docs/triton-c.md
@@ -11,9 +11,15 @@
         2. [Parameterization](#parameterization)
         3. [Auto-Tuning](#auto-tuning)
 3. [Matrix Transposition](#matrix-transposition)
+    1. [Compute Kernel](#trans-compute-kernel)
+    2. [Conditional Dereferencing](#conditional-dereferencing)
 4. [Matrix Multiplication](#matrix-multiplication)
-
-
+    1. [Compute Kernel](#matmul-compute-kernel)
+    2. [Optimizations](#optimizations)
+        1. [Pre-Fetching](#pre-fetching)
+        2. [Rematerialization](#rematerialization)
+    3. 
[Fused Transpositions](#fused-trans) + ## Motivations ## Issues of C/C++ for Linear Algebra @@ -163,7 +169,226 @@ _Note: Tuning our reference CUDA kernel would be much more cumbersome, as templa ## Matrix Transposition +Transpositions are (relatively) hard to efficiently write in CUDA because a naive implementation would lead to _uncoalesced_ memory operations when writing back the transposed matrix to DRAM. Therefore, optimized CUDA implementations require the explicit use of shared memory, as shown [here](https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/). + +### Compute Kernel + +In Triton, however, kernels are single-threaded and the compiler automatically detects if and when data should be temporarily stashed to shared memory. Therefore, an optimal Triton kernel for this operation would look like: + +```c +// launched on a grid of (M / TM) x (N / TN) programs of 1 thread each +__global__ void transpose(TYPE * X, TYPE * Y, int M, int N, int ldx, int ldy) { +// extract program ID + int pidm = get_program_id(0); //(1) + int pidn = get_program_id(1); //(2) + // create 1D range along the two matrix's axes + int rm[TM] = pidm * TM + 0 ... TM; //(3) + int rn[TN] = pidn * TN + 0 ... TN; //(4) + // create 2D array of pointers + TYPE* px[TM, TN] = X + rm[:, newaxis] + rn[newaxis, :] * ldx; //(5) + TYPE* py[TN, TM] = Y + rm[newaxis, :] * ldy + rn[:, newaxis]; //(6) + // write back using the transposition operator '^' + *py = ^(*px); //(7) +} +``` + +This kernel loads a `TM x TN` tile from the input matrix `X`, transposes it and write the resulting `TN x TM` tile to the output matrix `Y`. As a result, transposition of the full input matrix is achieved by launching a grid of `(M / TM) x (N / TN)` programs decomposed as follows: + +- Statements (1) and (2) extract the location of the program in the grid. For example, the program producing the output tile `Y[TN:2TN-1, 2TN:3TN-1]` will hold the values: +``` +pidm = 2 +pidn = 1 +``` + +- Statements (3) and (4) construct the ranges of indices to read from the first and second axis of X: +``` +rm = [pidm*TM + 0, pidm*TM + 1, ..., pidm*TM + (TM - 1)] +rn = [pidn*TN + 0, pidn*TN + 1, ..., pidn*TN + (TN - 1)] +``` + +- Statements (5) constructs the following array of pointers `px` using numpy-style broadcasting semantics: +``` +│ X + (pidm*TM + 0) + (pidn*TN + 0)*ldx, ..., ..., X + (pidm*TM + 0) + (pidn*TN + TN - 1)*ldx) │ +│ ⋮ ⋮ │ +│ ⋮ ⋮ │ +│ X + (pidm*TM + TM - 1) + (pidn*TN + 0)*ldx, ..., ..., X + (pidm*TM + TM - 1) + (pidn*TN + TN - 1)*ldx) │ +``` +- Statement (6) constructs the following array of pointers `py` using numpy-style broadcasting semantics: +``` +│ Y + (pidn*TN + 0) + (pidm*TM + 0)*ldy, ..., ..., Y + (pidn*TN + 0) + (pidm*TM + TM - 1)*ldy) │ +│ ⋮ ⋮ │ +│ ⋮ ⋮ │ +│ Y + (pidn*TN + TN - 1) + (pidn*TN + 0)*ldy, ..., ..., Y + (pidn*TN + TN - 1) + (pidm*TM + TM - 1)*ldy) │ +``` +- Statement (7) element-wise dereferences the above array of pointers `*px`, transposes it using the unary transposition operator `^`, and writes it back at the location specified by `py`. + +### Conditional Dereferencing + +You might have noticed that the above code will fail when `M` and `N` are not multiples of `TM` and `TN` respectively. Fortunately, the above kernel can be slightly modified to handle thie situation, as shown below: +``` +// launched on a grid of ((M + TM - 1) / TM) x ((N + TN - 1) / TN) programs +__global__ void transpose(TYPE * X, TYPE * Y, int M, int N, int ldx, int ldy) { + // ... 
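+  // (the "..." above stands for the same prologue as the unmasked kernel:
+  //  program ids pidm/pidn, index ranges rm/rn, and pointer arrays px/py)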
+  // create bounds-checking mask
+  bool checkx[TM, TN] = (rm[:, newaxis] < M) && (rn[newaxis, :] < N); //(7a)
+  bool checky[TN, TM] = (rm[newaxis, :] < M) && (rn[:, newaxis] < N); //(7b)
+  // conditional write-back using the conditional dereferencing operator '*?()'
+  *?(checky)py = ^(*?(checkx)px); //(7)
+}
+```
+
+Here, statement (7a) creates an array of booleans `checkx[TM, TN]` such that `checkx(i, j) = True` if and only if `px(i, j)` should be dereferenced. Statement (7b) does the same for `py`. Then, both `px` and `py` can be conditionally dereferenced using Triton-C's conditional dereferencing operator `*?(predicate) pointer`.
+
+
+## Matrix Multiplication
+
+The purpose of this section is to present a Triton-C implementation of matrix multiplication that achieves performance competitive with the best existing hand-written CUDA-C kernels (see [CUTLASS](https://github.com/NVIDIA/cutlass)). We will also see how pre-processor macros can be leveraged to fuse transposition operations as well as to provide support for auto-tuning and FP16 Tensor Cores.
+
+_Note: Bounds-checking is omitted for the sake of clarity. This feature can be easily added into our kernel, but may result in a slight performance hit because LLVM and PTXAS have issues dealing with conditionals and predicates inside loops._
+
+### Compute Kernel
+
+Matrix multiplications of the form `C = A x B` can be implemented in Triton-C fairly concisely, as shown below:
+
+```c
+// launched on a grid of (M / TM) x (N / TN) programs of 1 thread each
+__global__ void dot(TYPE * A, TYPE * B, TYPE * C, int M, int N, int K,
+        int lda __multipleof(8), int ldb __multipleof(8), int ldc __multipleof(8)) {
+  // prologue
+  int pm = get_program_id(0); //(1)
+  int pn = get_program_id(1); //(2)
+  int rm[TM] = pm * TM + 0 ... TM; //(3)
+  int rn[TN] = pn * TN + 0 ... TN; //(4)
+  int rk[TK] = 0 ... TK; //(5)
+  // initialize accumulator
+  float c[TM, TN] = 0; //(6)
+  // pointers to operands
+  TYPE* pa[TM, TK] = A + rk[newaxis, :] * 1 + rm[:, newaxis] * lda; //(7)
+  TYPE* pb[TK, TN] = B + rk[:, newaxis] * ldb + rn[newaxis, :] * 1; //(8)
+  // reduction loop
+  for(int k = K; k > 0; k-= TK){
+    // fetch operands
+    TYPE a[TM, TK] = *pa; //(9)
+    TYPE b[TK, TN] = *pb; //(10)
+    // matrix-multiply accumulate
+    c += a @ b; //(11)
+    // increment pointers
+    pa = pa + TK * 1; //(12)
+    pb = pb + TK * ldb; //(13)
+  }
+  // epilogue
+  TYPE* pc[TM, TN] = C + rn[newaxis, :] + rm[:, newaxis] * ldc; //(14)
+  *pc = c; //(15)
+}
+```
+Here, each kernel instance produces a `TM x TN` tile of the output matrix `C` as follows:
+
+- Statements (1) - (2) fetch the id of the current program instance.
+- Statements (3) - (4) construct ranges of indices to process for the vertical and horizontal axes of the output matrix `C`
+- Statement (5) constructs a range of indices along the reduction axis: `rk = [0, 1, ..., TK - 1]`
+- Statement (6) initializes a `TM x TN` array of accumulators to hold the result of `A[rm, :] x B[:, rn]`
+- Statements (7) - (8) initialize arrays of pointers `pa` and `pb` to the operands `A` and `B` using logic similar to that of the above transposition kernel
+- Statements (9) - (10) load tiles of operands by dereferencing `pa` and `pb`
+- Statement (11) updates the accumulator array using Triton-C's matrix multiplication operator '@'
+- Statements (12) - (13) update `pa` and `pb`
+- Statement (14) creates an array of pointers `pc` to the result matrix `C`
+- Statement (15) writes back the accumulator to `C`
+
+Internally, the Triton compiler will perform quite a few optimizations that will ensure good performance for this kernel:
+
+- Automatic coalescing of load/store operations
+- Automatic vectorization of load/store operations
+- Stashing `a` and `b` to shared memory
+- Automatic allocation of shared memory
+- Automatic synchronization of shared memory
+- Automatic padding of shared memory to avoid bank conflicts
+- Automatic usage of tensor cores when TYPE = half and TK % 4 = 0
+
+### Optimizations
+
+Nonetheless, there are two important optimizations that the Triton compiler does not do at the moment, yet which are critical to achieve peak performance: pre-fetching and rematerialization. In this subsection we describe how these optimizations can be done manually by modifying the above source-code.
+
+#### Pre-Fetching
+
+The purpose of pre-fetching is to overlap the update of the accumulator `c` with the memory loads for the next tiles that will need to be multiplied. This can be done by modifying the above reduction loop as follows:
+
+```c
+// pre-fetch operands
+TYPE a[TM, TK] = *pa; //(9)
+TYPE b[TK, TN] = *pb; //(10)
+for(int k = K; k > 0; k-= TK){
+  c += a @ b;
+  pa = pa + TK * 1;
+  pb = pb + TK * ldb;
+  // don't prefetch last iteration
+  bool check = k > TK;
+  // pre-fetch operands
+  a = check ? *pa : 0;
+  b = check ? *pb : 0;
+}
+```
+
+Note that the Triton-C compiler will now also be able to use double-buffering techniques to make sure that the array `a` can be used and updated at the same time without any memory hazard.
+
+#### Rematerialization
+
+[Rematerialization](https://en.wikipedia.org/wiki/Rematerialization) is a compiler optimization which consists in recomputing some values instead of storing and reloading them from (register) memory, so as to decrease register pressure in the compute kernel. Although LLVM does this automatically to some extent, it fails to find good heuristics for the above kernel -- thereby requiring some source code modification to achieve optimal performance. Fortunately, only `rm` and `rn` need to be rematerialized, leading to the following epilogue:
+
+```c
+// epilogue
+int rcm[TM] = pm * TM + 0 ... TM;
+int rcn[TN] = pn * TN + 0 ... TN;
+TYPE* pc[TM, TN] = C + rcn[newaxis, :] + rcm[:, newaxis] * ldc;
+*pc = c;
+```
+
+### Fused Transpositions
+
+It is common for optimized matrix-multiplication implementations (e.g., BLAS) to provide variants in which one or both operands are transposed. This is also what is done in the [PyTriton](https://github.com/ptillet/triton/blob/master/python/triton/ops/dot.py) implementation of matrix-multiplication.
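+For reference, this is how the standard C BLAS interface exposes the same four variants through run-time flags. The sketch below uses the `cblas` API purely for comparison; it is not Triton code:
+```c
+#include <cblas.h>
+
+// C = A x B^T in single precision; the CblasTrans flag selects the
+// transposed variant of B at run-time.
+void sgemm_nt(int M, int N, int K, const float *A, const float *B, float *C) {
+  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
+              M, N, K,
+              1.0f, A, K,  // A is M x K, row-major: lda = K
+                    B, K,  // B is N x K, row-major: ldb = K
+              0.0f, C, N); // C is M x N, row-major: ldc = N
+}
+```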
Fortunately, this can be done by using pre-processor macros for tile shapes and broadcasting directives, leading to the following kernel:
+
+```c
+void dot(TYPE * A, TYPE * B, TYPE * C,
+         int M, int N, int K,
+         int lda __multipleof(8), int ldb __multipleof(8), int ldc __multipleof(8)) {
+  // prologue
+  int pm = get_program_id(0);
+  int pn = get_program_id(1);
+  int rm[TM] = pm * TM + 0 ... TM;
+  int rn[TN] = pn * TN + 0 ... TN;
+  int rk[TK] = 0 ... TK;
+  float c[TM, TN] = 0;
+  // pointers to operands
+  TYPE* pa[SHAPE_A] = A + rk[BROADCAST_AK] * STRIDE_AK + rm[BROADCAST_AM] * STRIDE_AM;
+  TYPE* pb[SHAPE_B] = B + rk[BROADCAST_BK] * STRIDE_BK + rn[BROADCAST_BN] * STRIDE_BN;
+  // prefetch operands
+  TYPE a[SHAPE_A] = (*pa);
+  TYPE b[SHAPE_B] = (*pb);
+  // reduction loop
+  for(int k = K; k > 0; k-= TK){
+    c += USE_A @ USE_B;
+    pa = pa + TK * STRIDE_AK;
+    pb = pb + TK * STRIDE_BK;
+    a = *pa;
+    b = *pb;
+  }
+  // epilogue
+  int rcm[TM] = pm * TM + 0 ... TM;
+  int rcn[TN] = pn * TN + 0 ... TN;
+  TYPE* pc[TM, TN] = C + rcn[newaxis, :] + rcm[:, newaxis] * ldc;
+  *pc = c;
+}
+```
+
+All matrix-multiplication variants can then be retrieved using the following compilation options:
+```c
+// A is not transposed
+-DUSE_A=a -DSTRIDE_AK=1 -DSTRIDE_AM=lda -DBROADCAST_AK=newaxis,: -DBROADCAST_AM=:,newaxis -DSHAPE_A=TM,TK
+// A is transposed
+-DUSE_A=^a -DSTRIDE_AK=lda -DSTRIDE_AM=1 -DBROADCAST_AK=:,newaxis -DBROADCAST_AM=newaxis,: -DSHAPE_A=TK,TM
+// B is not transposed
+-DUSE_B=b -DSTRIDE_BK=ldb -DSTRIDE_BN=1 -DBROADCAST_BK=:,newaxis -DBROADCAST_BN=newaxis,: -DSHAPE_B=TK,TN
+// B is transposed
+-DUSE_B=^b -DSTRIDE_BK=1 -DSTRIDE_BN=ldb -DBROADCAST_BK=newaxis,: -DBROADCAST_BN=:,newaxis -DSHAPE_B=TN,TK
+```
-
-## Matrix Multiplication
-
-## Next Steps

From e1019cff3a2b37d833cf19a7653e2d7224f04224 Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Tue, 10 Sep 2019 00:35:02 -0400
Subject: [PATCH 375/494] [documentation] updated pytriton tutorial

---
 docs/pytriton.md | 196 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 196 insertions(+)

diff --git a/docs/pytriton.md b/docs/pytriton.md
index e69de29bb..2fb49a4f1 100644
--- a/docs/pytriton.md
+++ b/docs/pytriton.md
@@ -0,0 +1,196 @@
+#The PyTriton API
+
+
+## Table of Contents
+
+## Motivations
+
+In this tutorial we assume some basic knowledge of Triton-C, so check out the corresponding [tutorial](https://github.com/ptillet/triton/blob/master/docs/triton-c.md) if you have not already!
+
+The purpose of PyTriton is to provide an API for integrating Triton-C kernels into PyTorch and Tensorflow. The good thing about PyTriton is that it is framework agnostic, in the sense that any custom op written using this API will be transparently compatible with both Tensorflow and PyTorch without any additional effort required.
Consider for example the following piece of code: + +```python +import numpy as np +import triton + +def run_tf(): + M, N, K = 128, 128, 128 + a = tf.placeholder(tf.float32, shape=[M, K]) + b = tf.placeholder(tf.float32, shape=[N, K]) + c = triton.ops.dot(a, b, transpose_a = False, transpose_b = True) + da, db = tf.gradients(c, [a, b]) + # Run + ha = np.random.rand(M, K).astype(np.float32) + hb = np.random.rand(K, N).astype(np.float32) + sess = tf.InteractiveSession() + sess.run(tf.global_variables_initializer()) + result = sess.run([da], feed_dict = {a: ha, b: hb}) + +def run_torch(): + M, N, K = 128, 128, 128 + a = torch.randn(M, K).cuda() + b = torch.randn(K, N).cuda() + a.requires_grad_(True) + b.requires_grad_(True) + c = triton.ops.dot(a, b, False, True) + c.backward() + da = a.grad.clone() + db = b.grad.clone() + +## Run on tensorflow +# import tensorflow as tf +# run_tf() + +## Run on pytorch +# import torch +# run_torch() +``` + +Here, the triton module detects which frameworks are imported when executiong a `triton.op` for the first time, and generates the appropriate framework bindings code accordingly. Specifically, when a Triton custom op is executed for the first time, the following chain of events takes place: +- The imported frameworks are detected +- The C++ code for a Tensorflow or PyTorch generic custom operation -- with the same signature as the provided Triton-C kernel -- is generated, compiled and cached +- The Tensorflow or PyTorch op is dynamically loaded using the generated .so file, and a framework-agnostic wrapper is returned +- The wrapper is called and a tf.tensor or a torch.tensor is returned. In the case of Tensorflow, the gradient is also registered at this point if applicable + + +## Writing your own custom operation + +In this section we will reimplement the above `dot` function, whose full source-code can be found [here](https://github.com/ptillet/triton/blob/master/python/triton/ops/dot.py). + + +The first thing to do to create a custom op is to declare a class which inherits from `triton.function`. +```python +import triton + +class _dot(triton.function): + + src = """ +__global__ void dot(TYPE * A, TYPE * B, TYPE * C, + int M, int N, int K, + int lda __multipleof(8), int ldb __multipleof(8), int ldc __multipleof(8)) { + // prologue + int pm = get_program_id(0); + int pn = get_program_id(1); + int rm[TM] = pm * TM + 0 ... TM; + int rn[TN] = pn * TN + 0 ... TN; + int rk[TK] = 0 ... TK; + float c[TM, TN] = 0; + // pointers to operands + TYPE* pa[SHAPE_A] = A + rk[BROADCAST_AK] * STRIDE_AK + rm[BROADCAST_AM] * STRIDE_AM; + TYPE* pb[SHAPE_B] = B + rk[BROADCAST_BK] * STRIDE_BK + rn[BROADCAST_BN] * STRIDE_BN; + // prefetches operands + TYPE a[SHAPE_A] = (*pa); + TYPE b[SHAPE_B] = (*pb); + // reduction loop + for(int k = K; k > 0; k-= TK){ + c += USE_A @ USE_B; + pa = pa + TK * STRIDE_AK; + pb = pb + TK * STRIDE_BK; + a = *pa; + b = *pb; + } + // epilogue + int rcm[TM] = pm * TM + 0 ... TM; + int rcn[TN] = pn * TN + 0 ... TN; + TYPE* pc[TM, TN] = C + rcn[newaxis, :] + rcm[:, newaxis] * ldc; + *pc = c; +} + +} +""" + + kernel = triton.kernel(src, ['C']) +``` + +Here, `src` is the exact Triton-C source-code generated at the end of the aforementioned [tutorial](https://github.com/ptillet/triton/blob/master/docs/triton-c.md) , and `kernel = triton.kernel(src, ['C'])` creates a triton kernel from this source code which returns the tensor whose data points to `C`. 
At this point, `kernel` is a callable object which takes the same signature as the `dot` function in our source code, except that pointers are treated as tensors: `[tensor, tensor, tensor, int, int, int, int, int, int]`. + +However, in practice only A, B and C are provided by the user, and all the other `int` arguments are deduced from them, hence we create a helper function that extracts shapes from the `A`, `B` and `C` tensor and calls ouer `kernel`: + +```python + @staticmethod + def _call(a, b, transpose_a, transpose_b): + # extract shapes + shape_a = triton.shape(a) + shape_b = triton.shape(b) + M, Ka = shape_a[0], shape_a[1] + Kb, N = shape_b[0], shape_b[1] + # transpose shapes + if transpose_a: + M, Ka = Ka, M + if transpose_b: + Kb, N = N, Kb + # contiguous dimensions + lda = M if transpose_a else Ka + ldb = Kb if transpose_b else N + ldc = N + # data-type + dtype = a.dtype + # allocate output + c = triton.empty([M, N], dtype = dtype) + # compute + grid = lambda opt: [triton.cdiv(M, opt.d('TM')), triton.cdiv(N, opt.d('TN'))] + # macros -- not necessary but makes kernel source-code simpler + macros = {# handle A transposition + 'USE_A' : '^a' if transpose_a else 'a', + 'STRIDE_AK' : 'lda' if transpose_a else '1', + 'STRIDE_AM' : '1' if transpose_a else 'lda', + 'BROADCAST_AK': ':, newaxis' if transpose_a else 'newaxis, :', + 'BROADCAST_AM': 'newaxis, :' if transpose_a else ':, newaxis', + 'SHAPE_A' : 'TK, TM' if transpose_a else 'TM, TK', + # handle B transposition + 'USE_B' : '^b' if transpose_b else 'b', + 'STRIDE_BK' : '1' if transpose_b else 'ldb', + 'STRIDE_BN' : 'ldb' if transpose_b else '1', + 'BROADCAST_BK': 'newaxis, :' if transpose_b else ':, newaxis', + 'BROADCAST_BN': ':, newaxis' if transpose_b else 'newaxis, :', + 'SHAPE_B' : 'TN, TK' if transpose_b else 'TK, TN'} + return _dot.kernel(a, b, c, M, N, Ka, lda, ldb, ldc, grid, + AT = transpose_a, BT = transpose_b, TYPE = dtype, + TM = [32, 64, 128], TN = [32, 64, 128], TK = [8], **macros) + +``` + +There are a few things to note here: + +- `triton.shape` provides a framework-agnostic way to retrieve the shape of a tensor +- `triton.empty` creates an empty tensor of the specified dimensions +- `grid` corresponds to the grid with which our Triton kernel will be launched. Because in our case this grid depends on parametric tile variables, it is supplied as a function of compilation options `opt`, whose compile-time definition can be retrieved using `opt.d(name)`. Here, `opt.d('TM')` and `opt.d('TN')` retrieve the first and second tile dimension our kernel was compiled with. We also provide a helper `triton.cdiv` for ceil divisions. +- `macros` provides a list of preprocessor definitions to compile the kernel with. Alternatively, these can also be supplied as named argument to the `_dot.kernel`. We recall that lists can be supplied to the preprocessor, in which case an auto-tuning procedure will be triggered. Here, the value of `TM` and `TN` are both tuned between 32, 64 and 128. + +PyTriton binds to Tensorflow's and PyTorch's automatic differentiation framework using a single, common API inspired by PyTorch. 
It consists of two static methods `forward` and `backward` that take a context as their first input: + +``` + @staticmethod + def forward(ctx, a, b, transpose_a = False, transpose_b = False): + ctx.save_for_backward(a, b) + ctx.t_a = transpose_a + ctx.t_b = transpose_b + return _dot._call(a, b, transpose_a, transpose_b) + + @staticmethod + def backward(ctx, dy): + a, b = ctx.saved_tensors + t_a, t_b = ctx.t_a, ctx.t_b + if not t_a and not t_b: + da = _dot._call(dy, b, False, True) + db = _dot._call(a, dy, True, False) + elif not t_a and t_b: + da = _dot._call(dy, b, False, False) + db = _dot._call(dy, a, True, False) + elif t_a and not t_b: + da = _dot._call(b, dy, False, True) + db = _dot._call(a, dy, False, False) + elif t_a and t_b: + da = _dot._call(b, dy, True, True) + db = _dot._call(dy, a, True, True) + else: + assert False + return da, db, None, None, None, None, None, None, None +``` + +Still like for PyTorch, a callable operation can be created using the `apply` method of our `triton.function` class. We wrap it as a module variable for convenience: + +```python +dot = _dot.apply +``` +And that's it! Our custom op is now created and ready to be used with both PyTorch and Tensorflow. \ No newline at end of file From c622619bcb26fce564a646a5112e0754ca739ba0 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 10 Sep 2019 00:37:51 -0400 Subject: [PATCH 376/494] more progress --- lib/codegen/analysis/grid.cc | 3 ++- lib/codegen/transform/reorder.cc | 22 ++++++++++++++++------ lib/runtime/function.cc | 2 +- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/lib/codegen/analysis/grid.cc b/lib/codegen/analysis/grid.cc index d7b773aaf..89b3d90a8 100644 --- a/lib/codegen/analysis/grid.cc +++ b/lib/codegen/analysis/grid.cc @@ -332,10 +332,10 @@ void grids::create_grids(std::vector &grids, std::set seen; std::function bind_references = [&](ir::value *v) { - auto order = reorder_->get_order(v); // skip if(!v->get_type()->is_tile_ty() || !seen.insert(v).second) return; + auto order = reorder_->get_order(v); // recurse if(auto *user = dynamic_cast(v)) for(ir::value *op: user->ops()) @@ -360,6 +360,7 @@ void grids::create_grids(std::vector &grids, for(auto &ref: references) if(std::find(grids.begin(), grids.end(), ref.second) == grids.end()) grids.push_back(ref.second); + std::cout << grids.size() << std::endl; } diff --git a/lib/codegen/transform/reorder.cc b/lib/codegen/transform/reorder.cc index c5bc31d59..875faaab1 100644 --- a/lib/codegen/transform/reorder.cc +++ b/lib/codegen/transform/reorder.cc @@ -18,7 +18,6 @@ reorder::reorder(analysis::align* align, analysis::meminfo *mem) : align_(align), mem_(mem) { } std::vector reorder::get_order(ir::value* v) { - std::cout << v->get_name() << std::endl; return order_.at(v); } @@ -26,6 +25,20 @@ void reorder::run(ir::module &mod) { std::set io; + std::function set_order = [&](ir::value *v) -> void { + if(order_.find(v) != order_.end()) + return; + if(ir::user* u = dynamic_cast(v)) + for(ir::value* op: u->ops()) + set_order(op); + ir::type* ty = v->get_type(); + if(!ty->is_tile_ty()) + return; + std::vector order(ty->get_tile_shapes().size()); + std::iota(order.begin(), order.end(), 0); + order_[v] = order; + }; + // initialize work-list for(ir::function *fn: mod.get_function_list()) for(ir::basic_block *block: ir::cfg::reverse_post_order(fn)) @@ -34,10 +47,8 @@ void reorder::run(ir::module &mod) { ir::type* ptr_ty = x->get_pointer_operand()->get_type(); if(ptr_ty->is_tile_ty()) io.insert(x); - std::vector 
order(ptr_ty->get_tile_shapes().size()); - std::iota(order.begin(), order.end(), 0); - order_[i] = order; } + set_order(i); } ir::builder &builder = mod.get_builder(); @@ -48,9 +59,8 @@ void reorder::run(ir::module &mod) { std::iota(order.begin(), order.end(), 0); std::sort(order.begin(), order.end(), [&](unsigned a, unsigned b) { return max_contiguous[a] > max_contiguous[b]; } ); std::list work_list; - if(order != order_[i]){ + if(order != order_[i]) work_list.push_back(i); - } // rematerialize recursively while(!work_list.empty()) { ir::instruction* current = work_list.back(); diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index f2aa8e4db..28f3895a3 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -210,9 +210,9 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c peephole.run(module); dce.run(module); alignment_info.run(module); - ir::print(module, std::cout); if(target->is_gpu()) shmem_info.run(module); + ir::print(module, std::cout); reorder.run(module); dce.run(module); ir::print(module, std::cout); From 898b116f303e5e0b1097994d308fec76ee9b0106 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 10 Sep 2019 01:32:31 -0400 Subject: [PATCH 377/494] [documentation] added pytriton tutorial --- README.md | 2 +- docs/pytriton.md | 68 +++++++++++++++++++++++++++++++++++++----------- 2 files changed, 54 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index c65aa8680..41030146a 100644 --- a/README.md +++ b/README.md @@ -29,8 +29,8 @@ python dot.py ## Tutorials -- The PyTriton API (coming soon...) - [The Triton-C language](https://github.com/ptillet/triton/blob/master/docs/triton-c.md) +- [The PyTriton API](https://github.com/ptillet/triton/blob/master/docs/pytriton.md) - The Triton-IR representation (coming soon...) - The Triton-JIT compiler (coming soon...) diff --git a/docs/pytriton.md b/docs/pytriton.md index 2fb49a4f1..0c9a35265 100644 --- a/docs/pytriton.md +++ b/docs/pytriton.md @@ -1,13 +1,24 @@ #The PyTriton API - ## Table of Contents +This tutorial is the continuation of the [Triton-C tutorial](https://github.com/ptillet/triton/blob/master/docs/triton-c.md), so check it out if you have not already! + +1. [Motivations](#motivations) +2. [Triton Functions](#pytriton-function) + 1. [Creation of Triton Kernels](#creation-triton-kernels) + 2. [Usage of Triton Kernels](#usage-triton-kernels) +3. [Integration with Automatic Differentiation](#autodiff) + 1. [Basics](#autodiff:basics) + 2. [Convenience](#autodiff:convenience) + + ## Motivations -In this tutorial we assume some basic knowledge of Triton-C, so check out the corresponding [tutorial](https://github.com/ptillet/triton/blob/master/docs/triton-c.md) if you have not already! -The purpose of PyTriton is to provide an API for integrating Triton-C kernels into PyTorch and Tensorflow. The good thing about PyTriton is that it is framework agnostic, in the sense that any custom op written using this API will be transparently compatible with both Tensorflow and PyTorch without any additional effort required. Consider for example the following piece of code: +The purpose of PyTriton is to provide an API for easily executing Triton-C kernels from PyTorch and Tensorflow. One of the main advantages of PyTriton is that it is framework agnostic: any custom op written using this API will be transparently compatible with both Tensorflow and PyTorch without any additional effort required, as will be shown in this tutorial. 
+ +Consider for example the following piece of code: ```python import numpy as np @@ -46,25 +57,34 @@ def run_torch(): # run_torch() ``` -Here, the triton module detects which frameworks are imported when executiong a `triton.op` for the first time, and generates the appropriate framework bindings code accordingly. Specifically, when a Triton custom op is executed for the first time, the following chain of events takes place: -- The imported frameworks are detected -- The C++ code for a Tensorflow or PyTorch generic custom operation -- with the same signature as the provided Triton-C kernel -- is generated, compiled and cached -- The Tensorflow or PyTorch op is dynamically loaded using the generated .so file, and a framework-agnostic wrapper is returned -- The wrapper is called and a tf.tensor or a torch.tensor is returned. In the case of Tensorflow, the gradient is also registered at this point if applicable +PyTriton works by detecting which frameworks are imported and automatically generating and just-in-time compiling C++ binding code for them. Specifically, the following chain of events is triggered when a Triton operation is executed: + +1. The imported frameworks are detected +2. C++ binding code for Tensorflow or PyTorch is generated, compiled and cached. +3. The corresponding custom-op is automatically loaded from the generated .so file, and a framework-agnostic wrapper is created. +4. The wrapper is called and a tf.tensor or a torch.tensor is returned. In the case of Tensorflow, the gradient is also registered at this point if applicable -## Writing your own custom operation +The remainder of this tutorial will show you how to re-implement the above `triton.ops.dot` operation from scratch. -In this section we will reimplement the above `dot` function, whose full source-code can be found [here](https://github.com/ptillet/triton/blob/master/python/triton/ops/dot.py). +## PyTriton Functions +The PyTriton API provides a `triton.function` class which automatically handles the interaction with automatic differentiation in whichever framework was detected. Therefore, every differentiable custom operation written with PyTriton should inherit from this class -The first thing to do to create a custom op is to declare a class which inherits from `triton.function`. ```python import triton class _dot(triton.function): - src = """ +``` + +### Creation of Triton Kernels + + +PyTriton also provides a `triton.kernel` class which automatically takes care of interaction with the Triton-JIT as well as the generation and compilation of C++ framework bindings code. For our dot operation we create a kernel from the Triton-C code derived at the end of the [previous tutorial](https://github.com/ptillet/triton/blob/master/docs/triton-c.md) + +``` +src = """ __global__ void dot(TYPE * A, TYPE * B, TYPE * C, int M, int N, int K, int lda __multipleof(8), int ldb __multipleof(8), int ldc __multipleof(8)) { @@ -102,9 +122,16 @@ __global__ void dot(TYPE * A, TYPE * B, TYPE * C, kernel = triton.kernel(src, ['C']) ``` -Here, `src` is the exact Triton-C source-code generated at the end of the aforementioned [tutorial](https://github.com/ptillet/triton/blob/master/docs/triton-c.md) , and `kernel = triton.kernel(src, ['C'])` creates a triton kernel from this source code which returns the tensor whose data points to `C`. 
At this point, `kernel` is a callable object which takes the same signature as the `dot` function in our source code, except that pointers are treated as tensors: `[tensor, tensor, tensor, int, int, int, int, int, int]`. +Note that the second argument to `triton.kernel` constructors indicates which of the operands our kernel function should return. Here, we only return `C`. -However, in practice only A, B and C are provided by the user, and all the other `int` arguments are deduced from them, hence we create a helper function that extracts shapes from the `A`, `B` and `C` tensor and calls ouer `kernel`: +At this point, `kernel` is a callable object which takes the same signature as the `dot` function in our source code, except that pointers are treated as tensors: +``` +[tensor, tensor, tensor, int, int, int, int, int, int] +``` + +### Usage of Triton Kernels + +However, in practice only A, B are provided by the user, and all the other `int` arguments should be derived from these operands only. Hence, we create a helper function that extracts shapes from the `A` and `B` tensors, and then returns the results of a call to `kernel`: ```python @staticmethod @@ -150,13 +177,22 @@ However, in practice only A, B and C are provided by the user, and all the other ``` -There are a few things to note here: +While this code should be mostly self-explanatory, there are a few of noteworthy things worth pointing out - `triton.shape` provides a framework-agnostic way to retrieve the shape of a tensor + - `triton.empty` creates an empty tensor of the specified dimensions + - `grid` corresponds to the grid with which our Triton kernel will be launched. Because in our case this grid depends on parametric tile variables, it is supplied as a function of compilation options `opt`, whose compile-time definition can be retrieved using `opt.d(name)`. Here, `opt.d('TM')` and `opt.d('TN')` retrieve the first and second tile dimension our kernel was compiled with. We also provide a helper `triton.cdiv` for ceil divisions. + - `macros` provides a list of preprocessor definitions to compile the kernel with. Alternatively, these can also be supplied as named argument to the `_dot.kernel`. We recall that lists can be supplied to the preprocessor, in which case an auto-tuning procedure will be triggered. Here, the value of `TM` and `TN` are both tuned between 32, 64 and 128. +## Compatibility with Automatic Differentiation + +At this point, our custom operation only takes two tensor arguments and transposition information, which is good. However, it is still not compatible with PyTorch's or TensorFlow's automatic differentiation engine, and a small amount of additional effort is needed. + +### Basics + PyTriton binds to Tensorflow's and PyTorch's automatic differentiation framework using a single, common API inspired by PyTorch. It consists of two static methods `forward` and `backward` that take a context as their first input: ``` @@ -188,6 +224,8 @@ PyTriton binds to Tensorflow's and PyTorch's automatic differentiation framework return da, db, None, None, None, None, None, None, None ``` +### Convenience + Still like for PyTorch, a callable operation can be created using the `apply` method of our `triton.function` class. 
We wrap it as a module variable for convenience: ```python From 3c88a206c398cadf2dcbc4ac0867e7bf4bc56b7d Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 10 Sep 2019 01:33:30 -0400 Subject: [PATCH 378/494] [documentation] fixed formating issue in pytriton tutorial --- docs/pytriton.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/pytriton.md b/docs/pytriton.md index 0c9a35265..f7c5c5540 100644 --- a/docs/pytriton.md +++ b/docs/pytriton.md @@ -1,4 +1,5 @@ -#The PyTriton API +# The PyTriton API + ## Table of Contents From 060498cad1eb5006f46a9c7539f4d1f805c126c7 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 10 Sep 2019 01:36:11 -0400 Subject: [PATCH 379/494] [documentation] fixed broken references in PyTriton tutorial --- docs/pytriton.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/pytriton.md b/docs/pytriton.md index f7c5c5540..a877dadbd 100644 --- a/docs/pytriton.md +++ b/docs/pytriton.md @@ -10,8 +10,8 @@ This tutorial is the continuation of the [Triton-C tutorial](https://github.com/ 1. [Creation of Triton Kernels](#creation-triton-kernels) 2. [Usage of Triton Kernels](#usage-triton-kernels) 3. [Integration with Automatic Differentiation](#autodiff) - 1. [Basics](#autodiff:basics) - 2. [Convenience](#autodiff:convenience) + 1. [Basics](#autodiff-basics) + 2. [Convenience](#autodiff-convenience) ## Motivations @@ -192,7 +192,7 @@ While this code should be mostly self-explanatory, there are a few of noteworthy At this point, our custom operation only takes two tensor arguments and transposition information, which is good. However, it is still not compatible with PyTorch's or TensorFlow's automatic differentiation engine, and a small amount of additional effort is needed. -### Basics +### Basics PyTriton binds to Tensorflow's and PyTorch's automatic differentiation framework using a single, common API inspired by PyTorch. It consists of two static methods `forward` and `backward` that take a context as their first input: @@ -225,7 +225,7 @@ PyTriton binds to Tensorflow's and PyTorch's automatic differentiation framework return da, db, None, None, None, None, None, None, None ``` -### Convenience +### Convenience Still like for PyTorch, a callable operation can be created using the `apply` method of our `triton.function` class. 
We wrap it as a module variable for convenience: From ef1feefe7fe3445da889cc9b8b828b85aa21b632 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 10 Sep 2019 02:01:09 -0400 Subject: [PATCH 380/494] [lang] added __global__ storage specifier --- include/triton/lang/token.h | 4 ++-- include/triton/lang/type.h | 2 +- lib/lang/parser.cc | 8 ++++---- lib/lang/token.cc | 4 ++-- tests/common/src/dot.h | 10 +++++----- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/triton/lang/token.h b/include/triton/lang/token.h index 2f6b57cfc..1690ba246 100644 --- a/include/triton/lang/token.h +++ b/include/triton/lang/token.h @@ -142,7 +142,7 @@ public: STATIC, THREAD, // _Thread_local AUTO, - REGISTER, + GLOBAL, // STORAGE CLASS SPECIFIER END BREAK, @@ -236,7 +236,7 @@ public: bool IsIdentifier() const { return IDENTIFIER == tag_; } bool IsEOF() const { return tag_ == Token::END; } bool IsTypeSpecQual() const { return CONST <= tag_ && tag_ <= ENUM; } - bool IsDecl() const { return CONST <= tag_ && tag_ <= REGISTER; } + bool IsDecl() const { return CONST <= tag_ && tag_ <= GLOBAL; } static const char* Lexeme(int tag) { auto iter = tagLexemeMap_.find(tag); if (iter == tagLexemeMap_.end()) diff --git a/include/triton/lang/type.h b/include/triton/lang/type.h index 08b8418f3..0985ba5e1 100644 --- a/include/triton/lang/type.h +++ b/include/triton/lang/type.h @@ -40,7 +40,7 @@ enum { S_STATIC = 0x04, S_THREAD = 0x08, S_AUTO = 0x10, - S_REGISTER = 0x20, + S_GLOBAL = 0x20, // Type specifier T_SIGNED = 0x40, diff --git a/lib/lang/parser.cc b/lib/lang/parser.cc index 8bd5634a9..fed1422fc 100644 --- a/lib/lang/parser.cc +++ b/lib/lang/parser.cc @@ -1000,6 +1000,10 @@ QualType Parser::ParseDeclSpec(int* storageSpec, int* funcSpec, int* alignSpec) EnsureAndSetStorageSpec(tok, storageSpec, S_EXTERN); break; + case Token::GLOBAL: + EnsureAndSetStorageSpec(tok, storageSpec, S_GLOBAL); + break; + case Token::STATIC: if (!storageSpec) Error(tok, ERR_FUNC_SPEC); @@ -1020,10 +1024,6 @@ QualType Parser::ParseDeclSpec(int* storageSpec, int* funcSpec, int* alignSpec) EnsureAndSetStorageSpec(tok, storageSpec, S_AUTO); break; - case Token::REGISTER: - EnsureAndSetStorageSpec(tok, storageSpec, S_REGISTER); - break; - // Type qualifier case Token::CONST: qualSpec |= Qualifier::CONST; break; case Token::RESTRICT: qualSpec |= Qualifier::RESTRICT; break; diff --git a/lib/lang/token.cc b/lib/lang/token.cc index d8f0c0301..b9f3c8467 100644 --- a/lib/lang/token.cc +++ b/lib/lang/token.cc @@ -7,6 +7,7 @@ static MemPoolImp tokenPool; const std::unordered_map Token::kwTypeMap_ { + { "__global__", Token::GLOBAL }, { "auto", Token::AUTO }, { "break", Token::BREAK }, { "case", Token::CASE }, @@ -30,7 +31,6 @@ const std::unordered_map Token::kwTypeMap_ { { "newaxis", Token::NEWAXIS }, { "signed", Token::SIGNED }, { "unsigned", Token::UNSIGNED }, - { "register", Token::REGISTER }, { "restrict", Token::RESTRICT }, { "return", Token::RETURN }, { "short", Token::SHORT }, @@ -121,6 +121,7 @@ const std::unordered_map Token::tagLexemeMap_ { { Token::EXTERN, "extern" }, { Token::FLOAT, "float" }, { Token::FOR, "for" }, + { Token::GLOBAL, "global" }, { Token::GOTO, "goto" }, { Token::IF, "if" }, { Token::INLINE, "inline" }, @@ -129,7 +130,6 @@ const std::unordered_map Token::tagLexemeMap_ { { Token::NEWAXIS, "newaxis" }, { Token::SIGNED, "signed" }, { Token::UNSIGNED, "unsigned" }, - { Token::REGISTER, "register" }, { Token::RESTRICT, "restrict" }, { Token::RETURN, "return" }, { Token::SHORT, "short" }, diff --git 
a/tests/common/src/dot.h b/tests/common/src/dot.h index c9b3454d7..1b38f3a21 100644 --- a/tests/common/src/dot.h +++ b/tests/common/src/dot.h @@ -34,11 +34,11 @@ R"( #define SHAPE_B TK, TN #endif -void dot(TYPE * A, TYPE * B, TYPE * C, - int M, int N, int K, - int lda __multipleof(8), - int ldb __multipleof(8), - int ldc) { +__global__ void dot(TYPE * A, TYPE * B, TYPE * C, + int M, int N, int K, + int lda __multipleof(8), + int ldb __multipleof(8), + int ldc) { // prologue int ridx = get_program_id(0); int ridy = get_program_id(1); From ab33e84337e0e2de5be99760b54a8356decdab84 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 10 Sep 2019 03:01:41 -0400 Subject: [PATCH 381/494] [documentation] improved wording in triton-c tutorial --- docs/triton-c.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/triton-c.md b/docs/triton-c.md index bb453a227..758f5ca10 100644 --- a/docs/triton-c.md +++ b/docs/triton-c.md @@ -169,11 +169,11 @@ _Note: Tuning our reference CUDA kernel would be much more cumbersome, as templa ## Matrix Transposition -Transpositions are (relatively) hard to efficiently write in CUDA because a naive implementation would lead to _uncoalesced_ memory operations when writing back the transposed matrix to DRAM. Therefore, optimized CUDA implementations require the explicit use of shared memory, as shown [here](https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/). +Transpositions are (relatively) hard to efficiently write in CUDA as naive implementations typically suffer from _uncoalesced_ memory operations when writing back the transposed matrix to DRAM. Of course, this can be fixed by using shared memory as shown [here](https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/), but this comes at the cost of simplicity and -- more importantly -- interferes with auto-tuning. ### Compute Kernel -In Triton, however, kernels are single-threaded and the compiler automatically detects if and when data should be temporarily stashed to shared memory. Therefore, an optimal Triton kernel for this operation would look like: +In Triton, however, kernels are single-threaded and the compiler automatically detects if and when data should be temporarily stashed to shared memory in order to enable shared memory stores/loads. Therefore, an optimal Triton kernel for this operation would look like: ```c // launched on a grid of (M / TM) x (N / TN) programs of 1 thread each @@ -192,20 +192,22 @@ __global__ void transpose(TYPE * X, TYPE * Y, int M, int N, int ldx, int ldy) { } ``` -This kernel loads a `TM x TN` tile from the input matrix `X`, transposes it and write the resulting `TN x TM` tile to the output matrix `Y`. As a result, transposition of the full input matrix is achieved by launching a grid of `(M / TM) x (N / TN)` programs decomposed as follows: +At a high level, this kernel loads a `TM x TN` tile from the input matrix `X`, transposes it and writes the resulting `TN x TM` tile to the output matrix `Y`. As a result, transposition of the full input matrix is achieved by launching a grid of `(M / TM) x (N / TN)` programs decomposed as follows: -- Statements (1) and (2) extract the location of the program in the grid. For example, the program producing the output tile `Y[TN:2TN-1, 2TN:3TN-1]` will hold the values: +- Statements (1) and (2) extract the coordinates the program in the above 2D launch grid. 
For example, the program producing the output tile `Y[TN:2TN-1, 2TN:3TN-1]` holds the values: ``` pidm = 2 pidn = 1 ``` -- Statements (3) and (4) construct the ranges of indices to read from the first and second axis of X: +- Statements (3) and (4) construct the ranges of indices: ``` rm = [pidm*TM + 0, pidm*TM + 1, ..., pidm*TM + (TM - 1)] rn = [pidn*TN + 0, pidn*TN + 1, ..., pidn*TN + (TN - 1)] ``` +which will be used in statements (5) and (6) to construct tiles of pointers + - Statement (5) constructs the following array of pointers `px` using numpy-style broadcasting semantics: ``` │ X + (pidm*TM + 0) + (pidn*TN + 0)*ldx, ..., ..., X + (pidm*TM + 0) + (pidn*TN + TN - 1)*ldx) │ From d3491e01a996aee33d47d19ebd9f472481e7c75b Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 10 Sep 2019 12:27:28 -0400 Subject: [PATCH 382/494] [documentation][triton-c] making it clearer that it is not (yet) an MLIR dialect --- docs/triton-c.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/triton-c.md b/docs/triton-c.md index 758f5ca10..f766bda3b 100644 --- a/docs/triton-c.md +++ b/docs/triton-c.md @@ -72,7 +72,7 @@ float v[16, 32] = u[:, newaxis]; // broadcasting along the second axis Of course, we can do much more than additions, reduction and broadcasting. The purpose of this tutorial is to walk you through all the features of Triton-C, and eventually show you how it can be used to build auto-tuned matrix-multiplication kernels on par with state-of-the-art CUDA-C implementation in less than an afternoon. -_Note: You might be thinking that this is exactly what [MLIR](https://github.com/tensorflow/mlir) was made for... and you're right! You can think of Triton-IR as a dialect for MLIR, and Triton-C as a frontend for it. If you're interested in making this a thing, let me know._ +_Note: You might be thinking that this is exactly what [MLIR](https://github.com/tensorflow/mlir) was made for... and you're right! You can think of Triton-IR conceptually as a dialect for MLIR, and Triton-C as a frontend for it -- although not yet implemented as such. If you're interested in making this a thing, let me know._ ## Vector Addition From 59c667801c7e4944d8ed516e71df25aec50119b7 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 10 Sep 2019 12:28:42 -0400 Subject: [PATCH 383/494] [documentation][triton-c] fixed syntax highlighting on conditional transposition example --- docs/triton-c.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/triton-c.md b/docs/triton-c.md index f766bda3b..55e9ebddb 100644 --- a/docs/triton-c.md +++ b/docs/triton-c.md @@ -227,7 +227,7 @@ which will be used in statements (5) and (6) to construct tiles of pointers ### Conditional Dereferencing You might have noticed that the above code will fail when `M` and `N` are not multiples of `TM` and `TN` respectively. Fortunately, the above kernel can be slightly modified to handle this situation, as shown below: -``` +```c // launched on a grid of ((M + TM - 1) / TM) x ((N + TN - 1) / TN) programs __global__ void transpose(TYPE * X, TYPE * Y, int M, int N, int ldx, int ldy) { // ...
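Since the hunk above elides the kernel body, here is a minimal sketch of the bounds-checked version it introduces (reconstructed from the `checkx`/`checky` discussion later in this series; variable names and exact predicates are assumptions, not the verbatim source):

```c
// Triton-C -- sketch of the predicated transpose body (a reconstruction)
__global__ void transpose(TYPE * X, TYPE * Y, int M, int N, int ldx, int ldy) {
  // ... same prologue as before: pidm, pidn, rm, rn, px, py ...
  bool checkx[TM, TN] = (rm[:, newaxis] < M) && (rn[newaxis, :] < N); // (7a)
  bool checky[TN, TM] = (rn[:, newaxis] < N) && (rm[newaxis, :] < M); // (7b)
  // only in-bounds elements are read and written back
  *?(checky)py = ^(*?(checkx)px);                                     // (7)
}
```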
From 7f21a63ae11c1f3bf469cace010cb066ff95e84e Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 10 Sep 2019 13:26:12 -0400 Subject: [PATCH 384/494] [documentation][triton-c] clearer motivations; now starting each snippet with the language it's written in --- docs/triton-c.md | 112 +++++++++++++++++++++++++++++------------------ 1 file changed, 70 insertions(+), 42 deletions(-) diff --git a/docs/triton-c.md b/docs/triton-c.md index 55e9ebddb..0e798c1c4 100644 --- a/docs/triton-c.md +++ b/docs/triton-c.md @@ -2,8 +2,6 @@ ## Table of Contents 1. [Motivations](#motivations) - 1. [Issues of C/C++ for Linear Algebra](#issues-c-c++) - 2. [Design Philosophy of Triton-C](#design-philosophy) 2. [Vector Addition](#vector-addition) 1. [Differences with CUDA](#differences-with-cuda) 2. [Advantages over CUDA](#advantages-over-cuda) @@ -18,50 +16,47 @@ 2. [Optimizations](#optimizations) 1. [Pre-Fetching](#pre-fetching) 1. [Rematerialization](#rematerialization) - 3. [Fused Transpositions](#fused-trans) + 3. [Fused Transpositions and Auto-Tuning](#fused-trans-autotuning) ## Motivations -## Issues of C/C++ for Linear Algebra - -In C and C++, arrays and pointers have similar semantics. Indeed, there is no way to manipulate statically shaped multi-dimensional arrays (beyond initialization) as a whole without resorting to third-party libraries: +In C and C++, arrays and pointers have similar semantics. Indeed, there is no native way to manipulate statically shaped multi-dimensional arrays (beyond initialization) as a whole: ```c +// C99 float x[16][8] = {3.14}; float y[16][8] = {5.17}; // z = x + y float z[16][8]; -for(int i = 0; i < 16; i ++) +#pragma unroll +for(int i = 0; i < 16; i++) + #pragma unroll for(int j = 0; j < 8; j++) z[i][j] = x[i][j] + y[i][j]; ``` -This issue can be somewhat mitigated using templates metaprogramming in C++: +While it does not seem like a big deal at first sight, there are two issues with this: + +- **Ergonomics**: Of course, it is possible to simplify the above code using functions in C +``` +float z[16][8]; +add(z, x, y, 16, 8); +``` +but this would be semantically different as the loops can no longer be unrolled due to their bounds being now dynamic arguments of the add function. This can be mitigated using templates metaprogramming (and operator overloads) in C++: ```c +// C++ +template +class matrix; + matrix x = {3.14}; matrix y = {5.17}; matrix z = x + y; ``` -This is better, but there are still some important issues with this approach: - -- The syntax could be better, especially when it comes to broadcasting and reshaping. - -- Data-flow information for array operations does not propagate beyond the program's AST, thereby making it difficult for compilers to optimize moderately complicated array programs (i.e., Matrix-Multiplication). This can be worked around using heavy metaprogramming techniques (see [CUTLASS](https://github.com/NVIDIA/cutlass)), but even then programmers still have to allocate and synchronize shared memory manually and endure prohibitively long compilation procedures not easily amenable to auto-tuning. - -For these reasons, most Deep-Learning frameworks still rely heavily on highly optimized subroutines (e.g., BLAS), which makes the development of novel custom primitives time-consuming for experts and challenging for others. This is where Triton comes into play. 
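To make the unrolling argument above concrete, here is what the `add` helper mentioned in the ergonomics discussion might look like in plain C99 (a hypothetical sketch; the signature and parameter order are assumptions):

```c
// C99 -- hypothetical add() helper with runtime bounds; because m and n
// are now ordinary arguments, the loops can no longer be statically unrolled
void add(int m, int n, float z[m][n],
         const float x[m][n], const float y[m][n]) {
  for (int i = 0; i < m; i++)
    for (int j = 0; j < n; j++)
      z[i][j] = x[i][j] + y[i][j];
}
```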
- -## Design Philosophy of Triton-C - -The purpose of Triton is to bring native support for efficient numerical multi-dimensional array operations into a standard procedural languages. We achieve this through: - -* **Triton-C**: Syntactic and semantical extensions to the C language. In particular, native support for reshaping, broadcasting, matrix-multiplication, transposition, etc. This is the object of this tutorial. - -* **Triton-IR**: An LLVM-like IR for array operations, as well as various (automatic memory coalescing, automatic vectorization, shared memory allocation/synchronization, tensor core instruction selection, etc.). Although our system generates Triton-IR programs from Triton-C source-code, this is beyond the scope of this tutorial. More information can be found [here](http://www.eecs.harvard.edu/~htk/publication/2019-mapl-tillet-kung-cox.pdf). - -Anyway, the Triton-C code corresponding to the above matrix addition operation can be written and extended as follows: +While this is better and now equivalent to our initial code snippet, the syntax is not quite as ergonomically satisfying as what native syntactic support could provide: ```c +// Triton-C float x[16, 8] = 3.14; float y[16, 8] = 5.17; // float z[8, 8] = x + y; // doesn't compile -- incompatible shapes! @@ -69,10 +64,30 @@ float z[16, 8] = x + y; float u[16] = z[:, +]; // sum along the second axis float v[16, 32] = u[:, newaxis]; // broadcasting along the second axis ``` +which is valid _Triton-C_. + +_Note: Triton-C is single-threaded._ + +- **Portability**: One other issue with our initial C99 program is that it is not portable. While it will run well on a single CPU thread, the operation `z = x + y` would perform poorly on a GPU Streaming Processor as it would execute on a single thread only. For this readon, it would have to be rewritten for GPUs as follows: +``` +// CUDA +// Launch on a block of 16 x 8 threads +float x = 3.14; +float y = 5.17; +float z = x + y +``` +In Triton-C, the same code can be used on many different platforms (only CPUs and GPUs are supported at the moment). + +- **Performance**: Another issue with our initial C99 code snippet is its performance. Although the loops are unrolled, the program does not carry the data-flow information of array operations. This issue gets more and more problematic as programs get increasingly complex, eventually culminating in matrix multiplication being remarkably hard to optimize. + + This can be worked around using heavy metaprogramming techniques (see [CUTLASS](https://github.com/NVIDIA/cutlass)), but even then programmers still have to allocate and synchronize shared memory manually and endure prohibitively long compilation procedures not easily amenable to auto-tuning. For these reasons, most Deep-Learning frameworks still rely heavily on highly optimized subroutines (e.g., BLAS), which makes the development of novel custom primitives time-consuming for experts and challenging for others. + + Triton addresses this issue by relying on **Triton-IR**, an LLVM-like IR for array operations, and **Triton-JIT**, an optimizing compiler for Triton-IR. These two systems are, however, beyond the scope of this tutorial. More information can be found [here](http://www.eecs.harvard.edu/~htk/publication/2019-mapl-tillet-kung-cox.pdf). + + +_Note: You might be thinking that this is exactly what [MLIR](https://github.com/tensorflow/mlir) was made for... and you're right! You can think of Triton-IR as a dialect for MLIR, and Triton-C as a frontend for it. 
If you're interested in making this a thing, let me know._ -Of course, we can do much more than additions, reduction and broadcasting. The purpose of this tutorial is to walk you through all the features of Triton-C, and eventually show you how it can be used to build auto-tuned matrix-multiplication kernels on par with state-of-the-art CUDA-C implementation in less than an afternoon. -_Note: You might be thinking that this is exactly what [MLIR](https://github.com/tensorflow/mlir) was made for... and you're right! You can think of Triton-IR conceptually as a dialect for MLIR, and Triton-C as a frontend for it -- although not yet implemented as such. If you're interested in making this a thing, let me know._ ## Vector Addition @@ -81,16 +96,18 @@ _Note: You might be thinking that this is exactly what [MLIR](https://github.com Let's start it off by looking at a simple example. Vector addition, in its most trivial Triton-C implementation, can be written as follows: ```c +// Triton-C // launched on a grid of (N / 32) programs of 1 thread each __global__ void add(int N, float *a, float *b, float* c) { int id = get_program_id(0); int off[32] = id * 32 + (0 ... 32) - *(c + off) = *(a + off) + *(b + off) + *(c + off) = *(a + off) + *(b + off); } ``` -For reference, here is an equivalent CUDA kernel (nvcc will generate the same PTX code as triton-jit on the above code): +For reference, here is an equivalent CUDA kernel (NVCC will generate the same PTX code as Triton-JIT on the above code): ```c +// CUDA // launched on a grid of (N / 32) programs of 32 threads each __global__ void add(int N, float *a, float *b, float *c) { int off = blockIdx.x * 32 + threadIdx.x; @@ -98,10 +115,10 @@ __global__ void add(int N, float *a, float *b, float *c) { } ``` -As you can see, there are three main differences between our Triton-C kernel and the equivalent CUDA-C: +As you can see, there are three main differences between our Triton-C kernel and the equivalent CUDA: - **The programming model is different**. -While Triton-C and CUDA-C both use a Single-Program, Multiple-Data (SPMD) programming model, each Triton-C kernel is single-threaded. +While Triton-C and CUDA both use a Single-Program, Multiple-Data (SPMD) programming model, each Triton-C kernel is single-threaded. Therefore, `get_program_id({0, 1, 2})` is equivalent to `blockIdx.{x, y, z}`, but there is no such thing as `blockDim` and `threadIdx`. - **The semantics of arrays is different** @@ -113,7 +130,7 @@ float* c_ptr[32] = c_broadcast + off; // c_ptr = c + off ``` - **The semantics of the subscript operator is different**. -n C/CUDA-C, subscripting can be used to offset and dereference a pointer, but in Triton-C it can only be used to index and broadcast an array (think NumPy). +n C/CUDA, subscripting can be used to offset and dereference a pointer, but in Triton-C it can only be used to index and broadcast an array (think NumPy). ### Advantages over CUDA @@ -123,6 +140,7 @@ At this point, the advantages of Triton-C over CUDA may not be obvious. But they On some hardware architectures, vectorizing load/store operations can lead to better memory utilization and, in turn, noticeable performance gains. 
In general, 128-bit memory transactions are favored, leading to the following CUDA kernel: ```c +// CUDA // launched on a grid of (N / 128) programs of 32 threads each __global__ void add(int N, float4 *a, float4 *b, float4 *c) { int off = blockIdx.x * 32 + threadIdx.x; @@ -131,6 +149,7 @@ __global__ void add(int N, float4 *a, float4 *b, float4 *c) { ``` Or, for half-precision inputs: ```c +// CUDA // launched on a grid of (N / 256) programs of 32 threads each __global__ void add(int N, half8 *a, half8 *b, half8 *c) { int off = blockIdx.x * 32 + threadIdx.x; @@ -146,11 +165,12 @@ In Triton-C, this is not a problem as the compiler will figure out automatically Specifically, the Triton compiler would refuse to 4-way vectorize our above compute kernel because it would require the array `int off[32]` to be distributed over 8 threads, which is less than a warp. Fortunately, it turns out that this problem can be easily solved using preprocessor directrives to _parameterize_ our kernel: ```c +// Triton-C // launched on a grid of (N / SIZE) programs of 1 thread each __global__ void add(int N, TYPE* a, TYPE* b, TYPE* c) { int id = get_program_id(0); - int off[SIZE] = id * SIZE + (0 ... SIZE) - *(c + off) = *(a + off) + *(b + off) + int off[SIZE] = id * SIZE + (0 ... SIZE); + *(c + off) = *(a + off) + *(b + off); } // Not vectorized when compiled with -DSIZE=32 -DTYPE=float // 4-Vectorized when compiled with -DSIZE=128 -DTYPE=float @@ -169,7 +189,7 @@ _Note: Tuning our reference CUDA kernel would be much more cumbersome, as templa ## Matrix Transposition -Transpositions are (relatively) hard to efficiently write in CUDA as naive implementations typically suffer from _uncoalesced_ memory operations when writing back the transposed matrix to DRAM. Of course, this can be fixed by using shared memory as shown [here](https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/), but this comes at the cost of simplicity and -- more importantly -- interferes with auto-tuning. +Transpositions are (relatively) hard to efficiently write in CUDA because naive implementations typically suffer from _uncoalesced_ memory operations when writing back the transposed matrix to DRAM. Of course, this can be fixed by using shared memory as shown [here](https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/), but this comes at the cost of simplicity and -- more importantly -- interferes with auto-tuning. ### Compute Kernel @@ -192,7 +212,7 @@ __global__ void transpose(TYPE * X, TYPE * Y, int M, int N, int ldx, int ldy) { } ``` -At a high level, this kernel loads a `TM x TN` tile from the input matrix `X`, transposes it and writes the resulting `TN x TM` tile to the output matrix `Y`. As a result, transposition of the full input matrix is achieved by launching a grid of `(M / TM) x (N / TN)` programs decomposed as follows: +At a high level, this kernel loads a `TM x TN` tile from the input matrix `X`, transposes it and writes the resulting `TN x TM` tile to the output matrix `Y`. Eventually, transposition of the full input matrix is achieved by launching a grid of `(M / TM) x (N / TN)` programs decomposed as follows: - Statements (1) and (2) extract the coordinates the program in the above 2D launch grid. 
For example, the program producing the output tile `Y[TN:2TN-1, 2TN:3TN-1]` holds the values: ``` @@ -239,21 +259,22 @@ __global__ void transpose(TYPE * X, TYPE * Y, int M, int N, int ldx, int ldy) { } ``` -Here, statement (7a) creates an array of booleans `checkx[TM, TN]` such that `checkx(i, j) = True` if and only if `px(i, j)` should be dereferenced. Statement (7b) does the same for `py`. Then, both `px` and `py` can be conditionally dereferenced using Triton-C's conditional dereferencing operator `*?(predicate) pointer`. +Here, statement (7a) creates an array of booleans `checkx[TM, TN]` such that `checkx(i, j) = True` if and only if `px(i, j)` should be dereferenced. Statement (7b) does the same for `py`. Both `px` and `py` are then conditionally dereferenced using Triton-C's conditional dereferencing operator `*?(predicate) pointer`. ## Matrix Multiplication -The purpose of this section is to present a Triton-C implementation of matrix multiplication that achieves performance competitive with the best existing hand-written CUDA-C kernels (see [CUTLASS](https://github.com/NVIDIA/cutlass)). We will also see how pre-processor macros can be leveraged to fuse transposition operations as well as to provide support for auto-tuning and FP16 Tensor Cores. +The purpose of this section is to present a Triton-C implementation of matrix multiplication that achieves performance competitive with the best existing hand-written CUDA kernels (see [CUTLASS](https://github.com/NVIDIA/cutlass)). We will also see how pre-processor macros can be leveraged to fuse transposition operations as well as to provide support for auto-tuning and FP16 Tensor Cores. -_Note: Bounds-checking is omitted for the sake of clarity. This feature can be easily added into our kernel, but may result in a slight performance hit because LLVM and PTXAS have issues dealing with conditionals and predicates inside loops._ +_Note: Bounds-checking is omitted throughout for the sake of clarity. This feature can be easily added into our kernel, but may result in a slight performance hit because LLVM and PTXAS have issues dealing with conditionals and predicates inside loops._ ### Compute Kernel Matrix multiplications of the form `C = A x B` can be implemented in Triton-C fairly concisely, as shown below: ```c +// Triton-C -// launched on a grid of (M / TM) x (N / TN) programs of 1 thread each +// launched on a grid of (M / TM) x (N / TN) programs __global__ void dot(TYPE * A, TYPE * B, TYPE * C, int M, int N, int K, int lda __multipleof(8), int ldb __multipleof(8), int ldc __multipleof(8)) { // prologue @@ -308,7 +329,7 @@ Internally, the Triton compiler will perform quite a few optimizations that will ### Optimizations -Nonetheless, there are two important optimizations that the Triton compiler does not do at the moment, yet are critical to achieving peak performance: pre-fetching and rematerialization. In this subsection we describe how these optimizations can be done manually by modifying the above source-code. +Nonetheless, there are two important optimizations that the Triton compiler does not do automatically at the moment, yet are critical to achieving peak performance: pre-fetching and rematerialization. In this subsection we describe how these optimizations can be done manually by modifying the above source-code.
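Since the diff hunks above elide the body of the `dot` kernel, the sketch below reconstructs what a complete, unoptimized version might look like before the manual optimizations discussed next are applied (the pointer layout, strides and variable names are assumptions, not the verbatim source):

```c
// Triton-C -- sketch of a full, unoptimized dot kernel (a reconstruction);
// assumes row-major A (M x K, leading dim lda), B (K x N, ldb), C (M x N, ldc)
__global__ void dot(TYPE * A, TYPE * B, TYPE * C,
                    int M, int N, int K,
                    int lda __multipleof(8), int ldb __multipleof(8), int ldc __multipleof(8)) {
  // prologue: coordinates of this program's output tile
  int ridx = get_program_id(0);
  int ridy = get_program_id(1);
  int rm[TM] = ridx * TM + 0 ... TM;
  int rn[TN] = ridy * TN + 0 ... TN;
  int rk[TK] = 0 ... TK;
  // accumulator tile
  float c[TM, TN] = 0;
  // pointers to the first TM x TK tile of A and TK x TN tile of B
  TYPE* pa[TM, TK] = A + rm[:, newaxis] * lda + rk[newaxis, :];
  TYPE* pb[TK, TN] = B + rk[:, newaxis] * ldb + rn[newaxis, :];
  // reduction loop over the K dimension
  for(int k = K; k > 0; k -= TK){
    TYPE a[TM, TK] = *pa;
    TYPE b[TK, TN] = *pb;
    c += a @ b;       // tile-level matrix product
    pa += TK;         // advance both operands along K
    pb += TK * ldb;
  }
  // epilogue: write back the output tile
  TYPE* pc[TM, TN] = C + rm[:, newaxis] * ldc + rn[newaxis, :];
  *pc = c;
}
```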
#### Pre-Fetching @@ -344,11 +365,13 @@ TYPE* pc[TM, TN] = C + rcn[newaxis, :] + rcm[:, newaxis] * ldc; *pc = c; ``` -### Fused Transpositions +### Fused Transpositions and Auto-Tuning It is common for optimized matrix-multiplication implementations (e.g., BLAS) to provide variants in which one or both operands are transposed. This is also what is done in the [PyTriton](https://github.com/ptillet/triton/blob/master/python/triton/ops/dot.py) implementation of matrix-multiplication. Fortunately, this can be done by using pre-processors macros for tile shapes and broadcasting directives, leading to the following kernel: ```c +// Triton-C +// launched on a grid of (M / TM) x (N / TN) programs void dot(TYPE * A, TYPE * B, TYPE * C, int M, int N, int K, int lda __multipleof(8), int ldb __multipleof(8), int ldc __multipleof(8)) { @@ -393,4 +416,9 @@ All matrix multiplications variants can then be retrieved using the following co -DUSE_B=^b -DSTRIDE_BK=1-DSTRIDE_BN=ldb -DBROADCAST_BK=newaxis,: -DBROADCAST_BN=:,newaxis -DSHAPE_B=TN,TK ``` +Auto-tuning can also be handled using pre-processor macros: +```c +// Auto-tuning TM and TN in {32, 64, 128}; TK in {8, 16} +-DTM=[32, 64, 128] -DTN=[32, 64, 128] -DTK=[8, 16] +``` From 8111d56ee97e2a8acfc8ad2c87366fffdc200e91 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 10 Sep 2019 13:36:34 -0400 Subject: [PATCH 385/494] [documentation][triton-c] improved wording on Triton-C being single-threaded --- docs/triton-c.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/triton-c.md b/docs/triton-c.md index 0e798c1c4..a49fd4526 100644 --- a/docs/triton-c.md +++ b/docs/triton-c.md @@ -66,7 +66,6 @@ float v[16, 32] = u[:, newaxis]; // broadcasting along the second axis ``` which is valid _Triton-C_. -_Note: Triton-C is single-threaded._ - **Portability**: One other issue with our initial C99 program is that it is not portable. While it will run well on a single CPU thread, the operation `z = x + y` would perform poorly on a GPU Streaming Processor as it would execute on a single thread only. For this readon, it would have to be rewritten for GPUs as follows: ``` @@ -76,7 +75,7 @@ float x = 3.14; float y = 5.17; float z = x + y ``` -In Triton-C, the same code can be used on many different platforms (only CPUs and GPUs are supported at the moment). +In Triton-C, the same code can be used on many different platforms (only CPUs and GPUs are supported at the moment). Furthermore, Triton-C is single-threaded, hence easier to write than CUDA. - **Performance**: Another issue with our initial C99 code snippet is its performance. Although the loops are unrolled, the program does not carry the data-flow information of array operations. This issue gets more and more problematic as programs get increasingly complex, eventually culminating in matrix multiplication being remarkably hard to optimize. From df2455f4b8d45dfa2e8c120efff285bedeae983e Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 10 Sep 2019 13:43:03 -0400 Subject: [PATCH 386/494] [documentation][triton-c] grammar --- docs/triton-c.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/triton-c.md b/docs/triton-c.md index a49fd4526..312d06e7d 100644 --- a/docs/triton-c.md +++ b/docs/triton-c.md @@ -67,7 +67,7 @@ float v[16, 32] = u[:, newaxis]; // broadcasting along the second axis which is valid _Triton-C_. -- **Portability**: One other issue with our initial C99 program is that it is not portable. 
While it will run well on a single CPU thread, the operation `z = x + y` would perform poorly on a GPU Streaming Processor as it would execute on a single thread only. For this readon, it would have to be rewritten for GPUs as follows: +- **Portability**: One other issue with our initial C program is that it is not portable. While it will run well on a single CPU thread, the operation `z = x + y` would underutilize a GPU Streaming Processor as it would execute on a single thread only. For this reason, it would have to be rewritten in CUDA as follows: ``` // CUDA // Launch on a block of 16 x 8 threads @@ -75,16 +75,16 @@ float x = 3.14; float y = 5.17; float z = x + y ``` -In Triton-C, the same code can be used on many different platforms (only CPUs and GPUs are supported at the moment). Furthermore, Triton-C is single-threaded, hence easier to write than CUDA. +In Triton-C, the same code can be used across many different platforms (only CPUs and GPUs are supported at the moment). Furthermore, Triton-C is single-threaded, hence easier to write than CUDA. -- **Performance**: Another issue with our initial C99 code snippet is its performance. Although the loops are unrolled, the program does not carry the data-flow information of array operations. This issue gets more and more problematic as programs get increasingly complex, eventually culminating in matrix multiplication being remarkably hard to optimize. +- **Performance**: Another issue with our initial C code snippet is its performance. Although the loops are unrolled, the program does not carry any data-flow information pertaining to array operations. This issue gets more and more problematic as programs get increasingly complex, eventually culminating in matrix multiplication being remarkably hard to optimize. - This can be worked around using heavy metaprogramming techniques (see [CUTLASS](https://github.com/NVIDIA/cutlass)), but even then programmers still have to allocate and synchronize shared memory manually and endure prohibitively long compilation procedures not easily amenable to auto-tuning. For these reasons, most Deep-Learning frameworks still rely heavily on highly optimized subroutines (e.g., BLAS), which makes the development of novel custom primitives time-consuming for experts and challenging for others. + This can be worked around using heavy metaprogramming techniques (see [CUTLASS](https://github.com/NVIDIA/cutlass)), but even then programmers still have to allocate and synchronize shared memory manually and endure prohibitively long compilation procedures not easily amenable to auto-tuning. For these reasons, most Deep-Learning frameworks still rely heavily on highly optimized subroutines (e.g., BLAS), which makes the development of novel custom primitives time-consuming for experts and almost impossible for others. Triton addresses this issue by relying on **Triton-IR**, an LLVM-like IR for array operations, and **Triton-JIT**, an optimizing compiler for Triton-IR. These two systems are, however, beyond the scope of this tutorial. More information can be found [here](http://www.eecs.harvard.edu/~htk/publication/2019-mapl-tillet-kung-cox.pdf). -_Note: You might be thinking that this is exactly what [MLIR](https://github.com/tensorflow/mlir) was made for... and you're right! You can think of Triton-IR as a dialect for MLIR, and Triton-C as a frontend for it. 
If you're interested in making this a thing, let me know._ From 41acac6ba1814e45f69ae0524e13491866fff9e5 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 10 Sep 2019 14:16:52 -0400 Subject: [PATCH 387/494] [documentation] added description of the __multipleof attribute --- docs/triton-c.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/docs/triton-c.md b/docs/triton-c.md index 312d06e7d..6222169b5 100644 --- a/docs/triton-c.md +++ b/docs/triton-c.md @@ -10,7 +10,8 @@ 3. [Auto-Tuning](#auto-tuning) 3. [Matrix Transposition](#matrix-transposition) 1. [Compute Kernel](#trans-compute-kernel) - 2. [Conditional Dereferencing](#conditional-dereferencing) + 2. [The __multipleof Attribute](#trans-multipleof) + 3. [Conditional Dereferencing](#conditional-dereferencing) 4. [Matrix Multiplication](#matrix-multiplication) 1. [Compute Kernel](#matmul-compute-kernel) 2. [Optimizations](#optimizations) @@ -243,6 +244,18 @@ which will be used in statements (5) and (6) to construct tiles of pointers ``` - Statement (7) element-wise dereferences the above array of pointers `*px`, transposes it using the unary transposition operator `^`, and writes it back at the location specified by `py`. +### The __multipleof Attribute + +The memory loads and stores in our transposition kernel are not vectorizable by default, since `X + ldx` (and `Y + ldy`) may be misaligned when `ldx` (and `ldy`) are not multiples of, e.g., 4. This is unfortunate because tensor dimensions can be easily made into nice powers of two in Deep Learning, due to batch sizes and layer widths being flexible. + +For this reason, Triton provides a __multipleof(N) attribute for variables that are guaranteed to always be a multiple of N. In the case of matrix transposition, vector loads can be enabled by modifying the function's signature as follows: + +```c +__global__ void transpose(TYPE * X, TYPE * Y, int M, int N, int ldx __multipleof(8), int ldy __multipleof(8)) { +// ... +} +``` + +### Conditional Dereferencing You might have noticed that the above code will fail when `M` and `N` are not multiples of `TM` and `TN` respectively.
Fortunately, the above kernel can be slightly modified to handle this situation, as shown below: From 2781cdcf93acad6a582606e323bcf27a6d6eebc7 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 10 Sep 2019 15:54:16 -0400 Subject: [PATCH 388/494] [lang] added templates for reductions --- include/triton/lang/token.h | 3 ++ include/triton/runtime/function.h | 2 +- lib/lang/parser.cc | 25 ++++++++++--- tests/common/src/reduce.h | 27 ++++++++++++++ tests/common/util.h | 7 ++++ tests/unit/CMakeLists.txt | 2 +- tests/unit/dot.cc | 11 ++---- tests/unit/reduce.cc | 62 +++++++++++++++++++++++++++++++ 8 files changed, 124 insertions(+), 15 deletions(-) create mode 100644 tests/common/src/reduce.h create mode 100644 tests/unit/reduce.cc diff --git a/include/triton/lang/token.h b/include/triton/lang/token.h index 1690ba246..5724c50e3 100644 --- a/include/triton/lang/token.h +++ b/include/triton/lang/token.h @@ -180,6 +180,9 @@ public: PLUS, MINUS, CAST, + REDUCE_ADD, + REDUCE_MAX, + REDUCE_MIN, // For preprocessor PP_IF, diff --git a/include/triton/runtime/function.h b/include/triton/runtime/function.h index 96ec35ef7..42ecd69f9 100644 --- a/include/triton/runtime/function.h +++ b/include/triton/runtime/function.h @@ -70,7 +70,7 @@ public: struct options_space_t { typedef std::pair<std::string, std::vector<std::string>> define_t; std::vector<define_t> defines; - std::vector<size_t> num_warps; + std::vector<int> num_warps; }; struct options_t { diff --git a/lib/lang/parser.cc b/lib/lang/parser.cc index fed1422fc..6c669208f 100644 --- a/lib/lang/parser.cc +++ b/lib/lang/parser.cc @@ -453,14 +453,27 @@ Expr* Parser::ParseSubScripting(Expr* lhs) { TileType::ShapeInt shape; size_t i = 0; const Token* tok; + std::vector<std::pair<size_t, int>> redList; do { tok = ts_.Next(); - if(tok->tag_ == ':') - shape.push_back(lhsShape[i++]); - else if(tok->tag_ == Token::NEWAXIS) - shape.push_back(1); - else - Error(tok, "only ':' and newaxis are supported in subscripts"); + switch(tok->tag_) { + case ':': + shape.push_back(lhsShape[i++]); + break; + + case Token::NEWAXIS: + shape.push_back(1); + break; + +// case Token::ADD: +// case Token::SUB: +// redList.push_back({i, tok->tag_}); +// break; + + default: + Error(tok, "Unexpected subscript symbol encountered at dimension %d", i); + break; + } }while(ts_.Try(',')); ts_.Expect(']'); if(lhsShape.size() > i) diff --git a/tests/common/src/reduce.h b/tests/common/src/reduce.h new file mode 100644 index 000000000..a9788f340 --- /dev/null +++ b/tests/common/src/reduce.h @@ -0,0 +1,27 @@ +namespace src { + + const char *reduce1d = +R"( +void reduce1d(TYPE * X __noalias __readonly __aligned(16), + TYPE * Y __noalias __readonly __aligned(16), + int N) { +} +)"; + + + const char *reduce2d = +R"( +void reduce2d(TYPE * X __noalias __readonly __aligned(16), + TYPE * Y __noalias __writeonly __aligned(16), + int M, int N, int ldx) { + int ridm = get_program_id(0); + int ridn = get_program_id(1); + int rm[TM] = ridm * TM + 0 ... TM; + int rn[TN] = ridn * TN + 0 ...
TN; + TYPE* px[TM, TN] = X + rm[:, newaxis] + rn[newaxis, :] * ldx; + TYPE* py[TM, TN] = Y + rm[:, newaxis]; + *py = (*px)[:, +]; +} +)"; + +} diff --git a/tests/common/util.h b/tests/common/util.h index d8ffef090..e5cfef7b8 100644 --- a/tests/common/util.h +++ b/tests/common/util.h @@ -31,6 +31,13 @@ enum order_t { COLMAJOR }; +template +void init_rand(std::vector& x) { + for(size_t i = 0; i < x.size(); i++) + x[i] = static_cast((double)rand()/RAND_MAX); +} + + namespace aux{ template struct seq{}; diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 78fbc79d1..3efbdd71f 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -1,4 +1,4 @@ -foreach(PROG dot) +foreach(PROG dot reduce) set(TARGET unit_${PROG}) add_executable(${TARGET} ${PROG}.cc) set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME ${TARGET}) diff --git a/tests/unit/dot.cc b/tests/unit/dot.cc index 69b8cf2d7..b08eb13ba 100644 --- a/tests/unit/dot.cc +++ b/tests/unit/dot.cc @@ -50,7 +50,7 @@ void cpu_ref(bool AT_, bool BT_, size_t M, size_t N, size_t K, } -bool do_test(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K, int32_t TM, int32_t TN, int32_t TK, size_t nwarp){ +bool do_test(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K, int32_t TM, int32_t TN, int32_t TK, int nwarp){ typedef float NumericT; std::string ty = "float"; size_t dt_nbytes = sizeof(NumericT); @@ -62,12 +62,9 @@ bool do_test(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_ int32_t ldb = BT ? N : K; int32_t ldc = M; srand(0); - for(size_t i = 0; i < ha.size(); i++) - ha[i] = static_cast((float)rand()/RAND_MAX); - for(size_t i = 0; i < hb.size(); i++) - hb[i] = static_cast((float)rand()/RAND_MAX); - for(size_t i = 0; i < hc.size(); i++) - hc[i] = static_cast((double)0); + init_rand(ha); + init_rand(hb); + init_rand(hc); auto dc = std::shared_ptr(drv::buffer::create(context, hc.size()*dt_nbytes)); auto da = std::shared_ptr(drv::buffer::create(context, ha.size()*dt_nbytes)); auto db = std::shared_ptr(drv::buffer::create(context, hb.size()*dt_nbytes)); diff --git a/tests/unit/reduce.cc b/tests/unit/reduce.cc new file mode 100644 index 000000000..59b574c4d --- /dev/null +++ b/tests/unit/reduce.cc @@ -0,0 +1,62 @@ +#include +#include +#include +#include +#include "triton/driver/backend.h" +#include "triton/driver/stream.h" +#include "triton/tools/bench.hpp" +#include "triton/external/half.hpp" +#include "triton/runtime/function.h" +#include "src/reduce.h" +#include "cuda/cublas.h" +#include "util.h" + +namespace drv = triton::driver; +namespace rt = triton::runtime; + + +bool do_test(drv::stream* stream, int M, int N, std::string op, int nwarp){ + typedef float NumericT; + std::string ty = "float"; + size_t dt_nbytes = sizeof(NumericT); + drv::context* context = stream->context(); + std::vector hy(M); + std::vector hx(M*N); + srand(0); + init_rand(hy); + init_rand(hx); + auto dy = std::shared_ptr(drv::buffer::create(context, hy.size()*dt_nbytes)); + auto dx = std::shared_ptr(drv::buffer::create(context, hx.size()*dt_nbytes)); + stream->write(&*dy, true, 0, hy); + stream->write(&*dx, true, 0, hx); + rt::function::options_space_t opt; + opt.defines.push_back({"TYPE", {ty}}); + opt.defines.push_back({"TM", {std::to_string(M)}}); + opt.defines.push_back({"TN", {std::to_string(N)}}); + opt.num_warps = {nwarp}; + rt::function function(src::reduce2d, opt); + function({&*dy, &*dx, M, N, M}, grid2d(M, N), stream); + stream->synchronize(); +} + +int main() { + // 
initialize default compute device + auto context = triton::driver::backend::contexts::get_default(); + triton::driver::stream* stream = triton::driver::stream::create(context); + // shapes to benchmark + typedef std::tuple config_t; + std::vector configs = { + config_t{32, 32, "+"} + }; + // does the work + int M, N; + std::string op; + for(const auto& c: configs){ + std::tie(M, N, op) = c; + std::cout << "Testing " << c << " ... " << std::flush; + if(do_test(stream, M, N, op, 1)) + std::cout << " Pass! " << std::endl; + else + std::cout << " Fail! " << std::endl; + } +} From d7be0edb15f85a9b4784a9368f288358a5a79cbb Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 10 Sep 2019 21:17:22 -0400 Subject: [PATCH 389/494] [documentation] swapped the order of pytriton and triton-c tutorial in README.md --- README.md | 2 +- docs/pytriton.md | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 41030146a..8a5fe3e98 100644 --- a/README.md +++ b/README.md @@ -29,8 +29,8 @@ python dot.py ## Tutorials -- [The Triton-C language](https://github.com/ptillet/triton/blob/master/docs/triton-c.md) - [The PyTriton API](https://github.com/ptillet/triton/blob/master/docs/pytriton.md) +- [The Triton-C language](https://github.com/ptillet/triton/blob/master/docs/triton-c.md) - The Triton-IR representation (coming soon...) - The Triton-JIT compiler (coming soon...) diff --git a/docs/pytriton.md b/docs/pytriton.md index a877dadbd..d2b2f7983 100644 --- a/docs/pytriton.md +++ b/docs/pytriton.md @@ -3,8 +3,6 @@ ## Table of Contents -This tutorial is the continuation of the [Triton-C tutorial](https://github.com/ptillet/triton/blob/master/docs/triton-c.md), so check it out if you have not already! - 1. [Motivations](#motivations) 2. [Triton Functions](#pytriton-function) 1. [Creation of Triton Kernels](#creation-triton-kernels) @@ -232,4 +230,4 @@ Still like for PyTorch, a callable operation can be created using the `apply` me ```python dot = _dot.apply ``` -And that's it! Our custom op is now created and ready to be used with both PyTorch and Tensorflow. \ No newline at end of file +And that's it! Our custom op is now created and ready to be used with both PyTorch and Tensorflow. 
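A note on the `reduce2d` kernel added in patch 388 above: its tile expression `*py = (*px)[:, +]` sums each `TM x TN` input tile along its second axis. The intended whole-matrix semantics can be written in plain C as follows (a sketch for clarity, using the same `X[m + n*ldx]` addressing as the kernel; it is not part of the patch):

```c
// C -- reference semantics of reduce2d (a sketch, not part of the patch)
void reduce2d_ref(const float *x, float *y, int M, int N, int ldx) {
  for (int m = 0; m < M; m++) {
    float acc = 0.0f;
    for (int n = 0; n < N; n++)
      acc += x[m + n * ldx]; // mirrors px = X + rm[:, newaxis] + rn[newaxis, :] * ldx
    y[m] = acc;              // mirrors py = Y + rm[:, newaxis]
  }
}
```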
From 0c41bade077e8b860a177c51295a961b8bb5c206 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 10 Sep 2019 23:25:47 -0400 Subject: [PATCH 390/494] [codegen] basic recoalescing working --- include/triton/codegen/analysis/grid.h | 3 +- include/triton/codegen/selection.h | 16 +++++---- include/triton/ir/constant.h | 16 +-------- include/triton/ir/instructions.h | 37 +++++++++++++++------ lib/codegen/analysis/align.cc | 8 ++--- lib/codegen/analysis/grid.cc | 45 ++++++++++++++------------ lib/codegen/selection.cc | 34 +++++++++---------- lib/codegen/transform/reassociate.cc | 10 +++--- lib/ir/constant.cc | 23 ------------- lib/ir/instructions.cc | 37 ++++++++++++++++----- lib/lang/code_gen.cc | 2 +- lib/runtime/function.cc | 4 +-- tests/bench/copy2d.cc | 2 +- 13 files changed, 122 insertions(+), 115 deletions(-) diff --git a/include/triton/codegen/analysis/grid.h b/include/triton/codegen/analysis/grid.h index 84fd2168c..c361db260 100644 --- a/include/triton/codegen/analysis/grid.h +++ b/include/triton/codegen/analysis/grid.h @@ -41,7 +41,7 @@ private: fragment_t get_fragmentation_type(node_t x, graph_t &graph); void connected_components(node_t x, const std::vector mps, const std::vector prefixes, std::set &nodes, graph_t &graph, unsigned group_id); void create_grids(std::vector &grids, - std::map >, triton::ir::value *> &references, + std::map &references, ir::function *fn); @@ -53,6 +53,7 @@ public: void copy(ir::value *dst, ir::value *src); void run(ir::module &mod); unsigned get_num_threads(); + const std::vector get_grids() const { return grids_; } private: std::vector pool_; diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index 0a5d84825..842f544aa 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -43,13 +43,16 @@ namespace triton{ namespace codegen{ namespace analysis{ - class grids; class align; class memalloc; class meminfo; - } + +namespace transform{ +class reorder; +} + class target; typedef std::vector indices_t; @@ -154,7 +157,7 @@ private: void create_grids(std::vector &grids, std::map &references, ir::function *fn); - void create_tile(ir::value *v, Builder &builder, const std::map &references, std::set &seen, Value *sh_mem_ptr); + void create_tile(ir::value *v, Builder &builder, std::set &seen, Value *sh_mem_ptr); void init_axes(ir::value *i, Builder &builder, Value *u_thread_id, Value *u_warp_id); void init_grids(ir::function *fn, Builder &builder, Value *sh_mem_ptr); @@ -165,7 +168,7 @@ private: void lower_store(ir::store_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); void lower_downcast(ir::downcast_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); void lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); - void lower_dynamic_program_idx(ir::nv_dynamic_program_idx_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); + void lower_dynamic_program_idx(ir::make_range_dyn *x, LLVMContext &ctx, Function *fn, Builder &builder); void lower_reshape(ir::reshape_inst* x, LLVMContext &ctx, Function *fn, Builder &builder); void lower_splat(ir::splat_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); void lower_broadcast(ir::broadcast_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); @@ -192,8 +195,8 @@ private: public: - selection(analysis::memalloc *alloc, analysis::grids *params, analysis::meminfo *buffer_info, analysis::align *alignment, target *tgt) - : alloc_(alloc), params_(params), buffer_info_(buffer_info), 
alignment_(alignment), tgt_(tgt){ } + selection(analysis::memalloc *alloc, analysis::grids *params, analysis::meminfo *buffer_info, analysis::align *alignment, transform::reorder* reorder, target *tgt) + : alloc_(alloc), params_(params), buffer_info_(buffer_info), alignment_(alignment), reorder_(reorder), tgt_(tgt){ } void run(ir::module &src, Module &dst); @@ -204,6 +207,7 @@ private: analysis::grids *params_; analysis::meminfo *buffer_info_; analysis::align *alignment_; + transform::reorder *reorder_; target *tgt_; std::map axes_; Value *sh_mem_ptr_; diff --git a/include/triton/ir/constant.h b/include/triton/ir/constant.h index 23be73256..0127acae6 100644 --- a/include/triton/ir/constant.h +++ b/include/triton/ir/constant.h @@ -68,21 +68,6 @@ private: bool has_value_; }; -/* constant range */ -class constant_range: public constant{ - constant_range(type *ty, constant_int* first, constant_int* last); - -public: - static constant *get(constant_int *first, constant_int *last); - const constant_int* get_first() const; - const constant_int* get_last() const; - std::string repr() const { return first_->repr() + " ... " + last_->repr(); } - -private: - constant_int* first_; - constant_int* last_; -}; - /* constant fp */ class constant_fp: public constant{ constant_fp(type *ty, double value); @@ -99,6 +84,7 @@ private: double value_; }; + /* global value */ class global_value: public constant { public: diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index 86df129d2..dd85fd3a0 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -20,7 +20,7 @@ namespace ir{ class constant_int; class constant; -class constant_range; +class make_range; class basic_block; class context; @@ -703,27 +703,44 @@ public: // On NVIDIA, implementation is such that // constant_range = nv_dynamic_program_idx + nv_static_program_idx // so as to enable re-association on nv_static_program_idx which is constant -class nv_dynamic_program_idx_inst: public instruction { +class make_range_dyn: public instruction { private: - nv_dynamic_program_idx_inst(type *ty, const std::string &name, instruction *next); + make_range_dyn(type *ty, const std::string &name, instruction *next); std::string repr_impl() const { return "nv_dynamic_program_idx"; } - _TRITON_DEFINE_CLONE(nv_dynamic_program_idx_inst) + _TRITON_DEFINE_CLONE(make_range_dyn) public: - static nv_dynamic_program_idx_inst* create(type *ty, const std::string &name = "", instruction *next = nullptr); + static make_range_dyn* create(type *ty, const std::string &name = "", instruction *next = nullptr); }; -class nv_static_program_idx: public constant { +class make_range_sta: public constant { private: - nv_static_program_idx(constant_range *range); + make_range_sta(make_range *range); public: - static nv_static_program_idx *get(constant_range* range); - constant_range* get_range() const; + static make_range_sta *get(make_range* range); + make_range* get_range() const; std::string repr() const { return "nv_static_program_idx"; } private: - constant_range *range_; + make_range *range_; +}; + + +/* constant range */ +class make_range: public instruction{ + make_range(type *ty, constant_int* first, constant_int* last); + std::string repr_impl() const { return "make_range[" + first_->repr() + " : " + last_->repr() + "]"; } + _TRITON_DEFINE_CLONE(make_range) + +public: + static make_range *create(constant_int *first, constant_int *last); + const constant_int* get_first() const; + const constant_int* get_last() const; + 
+private: + constant_int* first_; + constant_int* last_; }; diff --git a/lib/codegen/analysis/align.cc b/lib/codegen/analysis/align.cc index 119ece8fa..3c2348587 100644 --- a/lib/codegen/analysis/align.cc +++ b/lib/codegen/analysis/align.cc @@ -306,7 +306,7 @@ std::vector align::populate_max_contiguous_default(ir::value* v) { if(!v->get_type()->is_tile_ty()) return add_to_cache(v, {1}, max_contiguous_); auto shapes = v->get_type()->get_tile_shapes(); - if(dynamic_cast(v)) + if(dynamic_cast(v)) return add_to_cache(v, {shapes[0]}, max_contiguous_); return add_to_cache(v, std::vector(shapes.size(), 1), max_contiguous_); } @@ -452,11 +452,11 @@ std::vector align::populate_starting_multiple(ir::value *v){ return populate_starting_multiple_binop(x); if(auto *x = dynamic_cast(v)) return add_to_cache(x, {(unsigned)x->get_value()}, starting_multiple_); - if(auto *x = dynamic_cast(v)) + if(auto *x = dynamic_cast(v)) return add_to_cache(x, {(unsigned)x->get_first()->get_value()}, starting_multiple_); - if(auto *x = dynamic_cast(v)) + if(auto *x = dynamic_cast(v)) return add_to_cache(x, {128}, starting_multiple_); - if(auto *x = dynamic_cast(v)) + if(auto *x = dynamic_cast(v)) return add_to_cache(x, {(unsigned)x->get_range()->get_first()->get_value()}, starting_multiple_); if(auto *x = dynamic_cast(v)) return populate_starting_multiple_gep(x); diff --git a/lib/codegen/analysis/grid.cc b/lib/codegen/analysis/grid.cc index 89b3d90a8..a33c4c25d 100644 --- a/lib/codegen/analysis/grid.cc +++ b/lib/codegen/analysis/grid.cc @@ -60,6 +60,8 @@ void grids::init_c_graph(ir::instruction *v) { shapes = atom->get_operand(0)->get_type()->get_tile_shapes(); else if(dynamic_cast(v)) return; + else if(dynamic_cast(v)) + return; else if(auto *reduce = dynamic_cast(v)) { unsigned axis = reduce->get_axis(); ir::value *arg = reduce->get_operand(0); @@ -169,8 +171,9 @@ void grids::connected_components(node_t x, const std::vectorset_value(static_params_.at(x)); } - for(const node_t &y: graph[x]) + for(const node_t &y: graph[x]){ connected_components(y, mps, prefixes, nodes, graph, group_id); + } } } @@ -225,7 +228,7 @@ void grids::run(ir::module &mod) { } for(ir::function *fn: mod.get_function_list()){ - std::map>, ir::value*> references; + std::map references; create_grids(grids_, references, fn); } @@ -234,17 +237,17 @@ void grids::run(ir::module &mod) { auto clamp = [&](unsigned x, unsigned lo, unsigned hi) { return std::min(std::max(x, lo), hi); }; for(ir::value *i: grids_){ - std::cout << "grid: " << i->get_name() << std::endl; - if(!i->get_type()->is_tile_ty()) continue; + auto order = reorder_->get_order(i); auto shapes = i->get_type()->get_tile_shapes(); - unsigned shape_0 = shapes[0]; - unsigned shape_1 = shapes[1]; unsigned size = i->get_type()->get_tile_num_elements(); /* HMMA parameters*/ if(fragments_.at({i, 0}) == HMMA_FRAGMENT_C){ + unsigned shape_0 = shapes[order[0]]; + unsigned shape_1 = shapes[order[1]]; + /* fragments per warp */ // try to make things as square as possible to maximize data re-use std::vector fpw = {1, 1, 1}; @@ -290,17 +293,22 @@ void grids::run(ir::module &mod) { /* Scan-line */ else{ - unsigned shape = shapes[0]; + unsigned ld = order[0]; + std::string s_ld = std::to_string(ld); unsigned current = num_threads; - params_.at(i).at("nts.d0")->set_value(clamp(size / num_threads, 1, 4)); - params_.at(i).at("mts.d0")->set_value(clamp(current, 1, shape / params_.at(i).at("nts.d0")->get_value())); - current = current / params_.at(i).at("mts.d0")->get_value(); + std::string nts = "nts.d" + s_ld; 
+ std::string mts = "mts.d" + s_ld; + params_.at(i).at(nts)->set_value(clamp(size / num_threads, 1, 1)); + params_.at(i).at(mts)->set_value(clamp(current, 1, shapes[ld] / params_.at(i).at(nts)->get_value())); + current = current / params_.at(i).at(mts)->get_value(); for(size_t d = 1; d < shapes.size(); d++){ - std::string str_d = std::to_string(d); - shape = shapes[d]; - params_.at(i).at("nts.d" + str_d)->set_value(1); - params_.at(i).at("mts.d" + str_d)->set_value(clamp(current, 1, shape)); - current = current / params_.at(i).at("mts.d" + str_d)->get_value(); + ld = order[d]; + s_ld = std::to_string(ld); + nts = "nts.d" + s_ld; + mts = "mts.d" + s_ld; + params_.at(i).at(nts)->set_value(1); + params_.at(i).at(mts)->set_value(clamp(current, 1, shapes[ld])); + current = current / params_.at(i).at(mts)->get_value(); } /* sanity check */ unsigned effective_num_threads = 1; @@ -317,8 +325,7 @@ void grids::run(ir::module &mod) { void grids::create_grids(std::vector &grids, - std::map>, ir::value*> &references, + std::map &references, ir::function *fn) { // get number of dimensions greater than 1 auto get_tile_gt1_dim = [&](ir::value *v){ @@ -335,7 +342,6 @@ void grids::create_grids(std::vector &grids, // skip if(!v->get_type()->is_tile_ty() || !seen.insert(v).second) return; - auto order = reorder_->get_order(v); // recurse if(auto *user = dynamic_cast(v)) for(ir::value *op: user->ops()) @@ -346,7 +352,7 @@ void grids::create_grids(std::vector &grids, if(shapes[d] == 1) continue; unsigned x = get_param_group(v, d); - ir::value *&r = references[{x, order}]; + ir::value *&r = references[x]; if(!r || get_tile_gt1_dim(v) > get_tile_gt1_dim(r)) r = v; } @@ -360,7 +366,6 @@ void grids::create_grids(std::vector &grids, for(auto &ref: references) if(std::find(grids.begin(), grids.end(), ref.second) == grids.end()) grids.push_back(ref.second); - std::cout << grids.size() << std::endl; } diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index 88664bb64..271de7640 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -3,6 +3,7 @@ #include "triton/codegen/analysis/grid.h" #include "triton/codegen/analysis/memalloc.h" #include "triton/codegen/analysis/align.h" +#include "triton/codegen/transform/reorder.h" #include "triton/ir/context.h" #include "triton/ir/module.h" #include "triton/ir/function.h" @@ -538,16 +539,16 @@ Value* selection::llvm_value(ir::value *v, IRBuilder<> &builder) { } // Grid construction -std::vector delinearize(Value *trailing, std::vector &shapes, IRBuilder<> &builder){ +std::vector delinearize(Value *trailing, const std::vector& order, std::vector &shapes, IRBuilder<> &builder){ size_t dim = shapes.size(); std::vector result(dim); for(unsigned k = 0; k < dim - 1; k++){ - Constant *dim_k = builder.getInt32(shapes[k]); + Constant *dim_k = builder.getInt32(shapes[order[k]]); Value *rem = builder.CreateURem(trailing, dim_k); trailing = builder.CreateUDiv(trailing, dim_k); - result[k] = rem; + result[order[k]] = rem; } - result[dim - 1] = trailing; + result[order[dim - 1]] = trailing; return result; } @@ -571,6 +572,7 @@ inline void to_warps(const std::vector &bs, std::vector &nw, } void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { + auto order = reorder_->get_order(v); const auto& shapes = v->get_type()->get_tile_shapes(); size_t dim = shapes.size(); if(params_->get_fragment(v, 0) == analysis::grids::STRIDED_SCAN){ @@ -584,8 +586,8 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value 
*u_thread_id block_size[i] = params_->get_param(v, "mts.d" + str_i)->get_value(); } to_warps(block_size, n_warps, warp_size); - std::vector thread_id_in_warp = delinearize(u_thread_id, warp_size, builder); - std::vector warp_id = delinearize(u_warp_id, n_warps, builder); + std::vector thread_id_in_warp = delinearize(u_thread_id, order, warp_size, builder); + std::vector warp_id = delinearize(u_warp_id, order, n_warps, builder); // Create axes for(unsigned k = 0; k < dim; k++) { std::string str_k = std::to_string(k); @@ -763,13 +765,12 @@ bool static inline has_phi_user(ir::value *v) { return false; } void selection::create_tile(ir::value *v, IRBuilder<> &builder, - const std::map& references, std::set &seen, Value *sh_mem_ptr) { if(!v->get_type()->is_tile_ty() || !seen.insert(v).second) return; if(auto *user = dynamic_cast(v)) for(ir::value *op: user->ops()) - create_tile(op, builder, references, seen, sh_mem_ptr); + create_tile(op, builder, seen, sh_mem_ptr); LLVMContext &ctx = builder.getContext(); auto shapes = v->get_type()->get_tile_shapes(); unsigned pad = alloc_->is_ld_padded(v); @@ -832,13 +833,13 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, distributed_tile *T = new distributed_tile(ty, shapes, axes, builder, vectorize); bool is_inserted = tmap_.insert({v, T}).second; // constant range - if(is_inserted && dynamic_cast(v)){ + if(is_inserted && dynamic_cast(v)){ T->for_each([&](indices_t idx){ assert(idx.size() == 1); T->set_value(idx, idx[0]); }); } - if(is_inserted && dynamic_cast(v)){ + if(is_inserted && dynamic_cast(v)){ T->for_each([&](indices_t idx){ assert(idx.size() == 1); BinaryOperator *bin_add = dyn_cast(idx[0]); @@ -860,19 +861,15 @@ void selection::init_grids(ir::function *fn, IRBuilder<> &builder, Value *sh_mem Value *u_thread_warp_id = builder.CreateURem(u_thread_id, warp_size); Value *u_warp_id = builder.CreateUDiv(u_thread_id, warp_size); // create grid - std::vector grids; - std::map references; - create_grids(grids, references, fn); - for(ir::value* i: grids){ + for(ir::value* i: params_->get_grids()) init_axes(i, builder, u_thread_warp_id, u_warp_id); - } // create tile std::set seen; for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i: block->get_inst_list()){ if(!i->get_type()->is_tile_ty()) continue; - create_tile(i, builder, references, seen, sh_mem_ptr); + create_tile(i, builder, seen, sh_mem_ptr); } } @@ -992,7 +989,7 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, } } -void selection::lower_dynamic_program_idx(ir::nv_dynamic_program_idx_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { +void selection::lower_dynamic_program_idx(ir::make_range_dyn *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { distributed_tile* result = (distributed_tile*)tmap_.at(x); result->for_each([&](indices_t idx){ assert(idx.size() == 1); @@ -1411,7 +1408,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & lower_downcast(x, ctx, fn, builder); else if(auto *x = dynamic_cast(ins)) lower_reduce(x, ctx, fn, builder); - else if(auto *x = dynamic_cast(ins)) + else if(auto *x = dynamic_cast(ins)) lower_dynamic_program_idx(x, ctx, fn, builder); else if(auto *x = dynamic_cast(ins)) lower_reshape(x, ctx, fn, builder); @@ -1436,7 +1433,6 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & } void selection::lower_instruction(ir::instruction *src, IRBuilder<> &builder) { - std::cout << src->get_name() << std::endl; 
if(src->has_tile_result_or_op()) { lower_tile_instruction(src, builder); } diff --git a/lib/codegen/transform/reassociate.cc b/lib/codegen/transform/reassociate.cc index b0f4a2e73..ae42a6566 100644 --- a/lib/codegen/transform/reassociate.cc +++ b/lib/codegen/transform/reassociate.cc @@ -173,20 +173,20 @@ void reassociate::run(ir::module &mod) { // constant_range -> nv_dynamic_program_idx + nv_static_program_idx for(ir::function *fn: mod.get_function_list()){ - std::vector ranges; + std::vector ranges; std::vector rpo = ir::cfg::reverse_post_order(fn); for(ir::basic_block *block: rpo){ // iterate through instruction for(ir::instruction *i: block->get_inst_list()) for(ir::value* op: i->ops()) - if(auto *range = dynamic_cast(op)) + if(auto *range = dynamic_cast(op)) ranges.push_back(range); } builder.set_insert_point(rpo.front()->get_first_non_phi()); - for(ir::constant_range* old_range: ranges){ - ir::value* dyn_range = builder.insert(ir::nv_dynamic_program_idx_inst::create(old_range->get_type())); - ir::value* static_range = ir::nv_static_program_idx::get(old_range); + for(ir::make_range* old_range: ranges){ + ir::value* dyn_range = builder.insert(ir::make_range_dyn::create(old_range->get_type())); + ir::value* static_range = ir::make_range_sta::get(old_range); ir::value* new_range = builder.create_add(dyn_range, static_range); old_range->replace_all_uses_with(new_range); params_->copy(dyn_range, old_range); diff --git a/lib/ir/constant.cc b/lib/ir/constant.cc index 9ff8d6e72..0eff5261e 100644 --- a/lib/ir/constant.cc +++ b/lib/ir/constant.cc @@ -50,29 +50,6 @@ constant_int *constant_int::get(type *ty, uint64_t value) { return cst; } -// constant_range -// FIXME use something like APInt - -//"[" + std::to_string(first->get_value()) + " ... " + std::to_string(ty->get_tile_shapes()[0]->get_value()) + "]" - -constant_range::constant_range(type *ty, constant_int *first, constant_int *last) - : constant(ty, 0), first_(first), last_(last){ } - -constant *constant_range::get(constant_int *first, constant_int *last) { - assert(first->get_type()->is_integer_ty()); - assert(first->get_type() == last->get_type()); - assert(((constant_int*)first)->get_value() == 0); - type *ty = tile_type::get(first->get_type(), {(unsigned)last->get_value()}); - return new constant_range(ty, first, last); -} - -const constant_int* constant_range::get_first() const { - return first_; -} - -const constant_int* constant_range::get_last() const { - return last_; -} // constant_fp // FIXME use something like APFloat diff --git a/lib/ir/instructions.cc b/lib/ir/instructions.cc index ac595079d..acecc08b5 100644 --- a/lib/ir/instructions.cc +++ b/lib/ir/instructions.cc @@ -733,28 +733,49 @@ barrier_inst* barrier_inst::create(context &ctx, const std::string &name, instru // nv_dynamic_program_idx -nv_dynamic_program_idx_inst::nv_dynamic_program_idx_inst(type *ty, const std::string &name, instruction *next) +make_range_dyn::make_range_dyn(type *ty, const std::string &name, instruction *next) : instruction(ty, 0, 1, name, next) { } -nv_dynamic_program_idx_inst* nv_dynamic_program_idx_inst::create(type *ty, const std::string &name, instruction *next) { - return new nv_dynamic_program_idx_inst(ty, name, next); +make_range_dyn* make_range_dyn::create(type *ty, const std::string &name, instruction *next) { + return new make_range_dyn(ty, name, next); } // nv_static_program_idx -nv_static_program_idx::nv_static_program_idx(constant_range *range) +make_range_sta::make_range_sta(make_range *range) : constant(range->get_type(), 0), 
range_(range) { } -constant_range* nv_static_program_idx::get_range() const +make_range* make_range_sta::get_range() const { return range_; } -nv_static_program_idx* nv_static_program_idx::get(constant_range* range) { - static std::map cache; +make_range_sta* make_range_sta::get(make_range* range) { + static std::map cache; if(cache.find(range) == cache.end()) - cache.insert({range, new nv_static_program_idx(range)}); + cache.insert({range, new make_range_sta(range)}); return cache.at(range); } +// make_range +make_range::make_range(type *ty, constant_int *first, constant_int *last) + : instruction(ty, 0), first_(first), last_(last){ } + +make_range *make_range::create(constant_int *first, constant_int *last) { + assert(first->get_type()->is_integer_ty()); + assert(first->get_type() == last->get_type()); + assert(((constant_int*)first)->get_value() == 0); + type *ty = tile_type::get(first->get_type(), {(unsigned)last->get_value()}); + return new make_range(ty, first, last); +} + +const constant_int* make_range::get_first() const { + return first_; +} + +const constant_int* make_range::get_last() const { + return last_; +} + + } } diff --git a/lib/lang/code_gen.cc b/lib/lang/code_gen.cc index 228bd69dd..4e7e14f82 100644 --- a/lib/lang/code_gen.cc +++ b/lib/lang/code_gen.cc @@ -64,7 +64,7 @@ void Generator::VisitBinaryOp(BinaryOp* binary) { auto crhs = dynamic_cast(rhs); if(!clhs || !crhs) should_not_happen(); - return set_ret(ir::constant_range::get(clhs, crhs)); + return set_ret(bld_->insert(ir::make_range::create(clhs, crhs))); } case '+': if(binary->lhs_->Type()->ScalarType()->ToPointer()){ diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 28f3895a3..05c39a451 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -205,14 +205,14 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::transform::dce dce; codegen::transform::peephole peephole; codegen::transform::reassociate reassociate(&alignment_info, &grids); - codegen::selection selection(&shmem_allocation, &grids, &shmem_info, &alignment_info, target.get()); + codegen::selection selection(&shmem_allocation, &grids, &shmem_info, &alignment_info, &reorder, target.get()); // run passes peephole.run(module); dce.run(module); alignment_info.run(module); if(target->is_gpu()) shmem_info.run(module); - ir::print(module, std::cout); +// ir::print(module, std::cout); reorder.run(module); dce.run(module); ir::print(module, std::cout); diff --git a/tests/bench/copy2d.cc b/tests/bench/copy2d.cc index c512cf58d..69e877767 100644 --- a/tests/bench/copy2d.cc +++ b/tests/bench/copy2d.cc @@ -48,7 +48,7 @@ int main() { std::vector configs; for(auto x: std::vector{COLMAJOR}){ std::vector tmp = { - config_t{1024, 1024, x} + config_t{2048, 2048, x} }; configs.insert(configs.end(), tmp.begin(), tmp.end()); } From 04a0fbd8e3dc39e6d4216ceba0fea596476bc709 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 11 Sep 2019 17:35:56 -0400 Subject: [PATCH 391/494] [tests] basic test for reduction in python passes --- include/triton/lang/ast.h | 10 +++++++--- include/triton/lang/token.h | 4 +--- lib/codegen/selection.cc | 4 +++- lib/lang/ast.cc | 31 +++++++++++++++++++++++++++++-- lib/lang/code_gen.cc | 36 +++++++++++++++++++++++++++++++++--- lib/lang/parser.cc | 30 +++++++++++++++++++++++------- lib/runtime/function.cc | 1 + tests/common/src/reduce.h | 2 +- tests/common/util.h | 6 ++++++ tests/unit/reduce.cc | 18 ++++++++++++++++-- 10 files changed, 120 insertions(+), 22 deletions(-) diff --git 
a/include/triton/lang/ast.h b/include/triton/lang/ast.h index 8bf96a96b..43cfc485f 100644 --- a/include/triton/lang/ast.h +++ b/include/triton/lang/ast.h @@ -418,22 +418,25 @@ class UnaryOp : public Expr { friend class LValAssigner; public: - static UnaryOp* New(int op, Expr* operand, QualType type=nullptr); + static UnaryOp* New(int op, Expr* operand, QualType type=nullptr, int info=0); virtual ~UnaryOp() {} virtual void Accept(Visitor* v); virtual bool IsLVal(); ::Type *Convert(); + static int encodeRed(int ax, int tag); + static void decodeRed(int info, int& ax, int& tag); void TypeChecking(); void IncDecOpTypeChecking(); void AddrOpTypeChecking(); void DerefOpTypeChecking(); + void ReduceOpTypeChecking(); void TransOpTypeChecking(); void UnaryArithmOpTypeChecking(); void CastOpTypeChecking(); protected: - UnaryOp(int op, Expr* operand, QualType type=nullptr) - : Expr(operand->Tok(), type), op_(op) { + UnaryOp(int op, Expr* operand, QualType type=nullptr, int info=0) + : Expr(operand->Tok(), type), op_(op), info_(info) { operand_ = operand; if (op_ != Token::CAST && op_ != Token::ADDR) { operand_ = MayCast(operand); @@ -441,6 +444,7 @@ protected: } int op_; + int info_; Expr* operand_; }; diff --git a/include/triton/lang/token.h b/include/triton/lang/token.h index 5724c50e3..602113f93 100644 --- a/include/triton/lang/token.h +++ b/include/triton/lang/token.h @@ -180,9 +180,7 @@ public: PLUS, MINUS, CAST, - REDUCE_ADD, - REDUCE_MAX, - REDUCE_MIN, + REDUCE, // For preprocessor PP_IF, diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index ff246f4f5..02611444f 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -962,7 +962,9 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, tgt_->add_barrier(module, builder); builder.CreateStore(result, write_ptr); // build result - unsigned depth = params_->get_param(op, "wpt.d" + std::to_string(axis))->get_value(); + unsigned shape_ax = op->get_type()->get_tile_shapes()[axis]; + unsigned per_thread = op_tile->axis(axis).values.size(); + unsigned depth = shape_ax / per_thread; for(unsigned i = depth/2; i > 0; i >>= 1){ // current indices indices_t current(write_idx.size(), builder.getInt32(0)); diff --git a/lib/lang/ast.cc b/lib/lang/ast.cc index e1f008c36..b0a50adc3 100644 --- a/lib/lang/ast.cc +++ b/lib/lang/ast.cc @@ -448,6 +448,8 @@ void BinaryOp::RangeOpTypeChecking() { } void BinaryOp::MaskedDerefOpTypeChecking() { +// auto lhsTileType = lhs_->Type()->ToTile(); +// auto rhsTileType = rhs_->Type()->ToTile(); ::Type* lhsScalType = TryExtractScalarType(this, lhs_); ::Type* rhsScalType = TryExtractScalarType(this, rhs_); auto lhsType = lhsScalType->ToArithm(); @@ -572,8 +574,8 @@ void BinaryOp::AssignOpTypeChecking() { * Unary Operators */ -UnaryOp* UnaryOp::New(int op, Expr* operand, QualType type) { - auto ret = new (unaryOpPool.Alloc()) UnaryOp(op, operand, type); +UnaryOp* UnaryOp::New(int op, Expr* operand, QualType type, int info) { + auto ret = new (unaryOpPool.Alloc()) UnaryOp(op, operand, type, info); ret->pool_ = &unaryOpPool; ret->TypeChecking(); @@ -581,6 +583,18 @@ UnaryOp* UnaryOp::New(int op, Expr* operand, QualType type) { } +int UnaryOp::encodeRed(int ax, int tag) { + int result = 0; + result |= ax; + result |= tag << 16; + return result; +} + +void UnaryOp::decodeRed(int info, int& ax, int& tag) { + ax = info & 0x0000FFFF; + tag = (info & 0xFFFF0000) >> 16; +} + bool UnaryOp::IsLVal() { // Only deref('*') could be lvalue; return op_ == Token::DEREF; @@ -626,6 +640,9 
@@ void UnaryOp::TypeChecking() { case '^': return TransOpTypeChecking(); + case Token::REDUCE: + return ReduceOpTypeChecking(); + default: assert(false); } @@ -663,6 +680,16 @@ void UnaryOp::DerefOpTypeChecking() { type_ = ScalarOrLikeTile(operand_, pointerType->Derived().GetPtr()); } +void UnaryOp::ReduceOpTypeChecking() { + int ax, tag; + decodeRed(info_, ax, tag); + auto tileType = operand_->Type()->ToTile(); + if(!tileType) + Error(this, "array expected for reduction operation"); + auto shape = tileType->Shape(); + shape.erase(shape.begin() + ax); + type_ = TileType::New(shape, tileType->Derived()); +} void UnaryOp::TransOpTypeChecking() { auto tileType = operand_->Type()->ToTile(); diff --git a/lib/lang/code_gen.cc b/lib/lang/code_gen.cc index 228bd69dd..56acb1c03 100644 --- a/lib/lang/code_gen.cc +++ b/lib/lang/code_gen.cc @@ -174,6 +174,11 @@ void Generator::VisitUnaryOp(UnaryOp* unary) { case '!': return set_ret(bld_->create_not(op)); case Token::CAST: return set_ret(GenCastOp(op, GenIRType(unary->Type(), *ctx_))); case '^': return set_ret(bld_->create_trans(op)); + case Token::REDUCE: { + int ax, tag; + UnaryOp::decodeRed(unary->info_, ax, tag); + return set_ret(bld_->create_reduce(op, ax)); + } default: error_not_implemented(); } return error_not_implemented(); @@ -412,16 +417,41 @@ void Generator::Gen(ir::module *mod) { ir::value* Generator::GenBroadcastOp(ir::value* src, ir::type* dst_ty) { + if(src->get_type() == dst_ty) + return src; if(dst_ty->is_tile_ty()) { ir::type *src_ty = src->get_type(); auto dst_shapes = dst_ty->get_tile_shapes(); if(!src_ty->is_tile_ty()) return bld_->create_splat(src, dst_shapes); auto src_shapes = src_ty->get_tile_shapes(); - if(src_shapes.size() != dst_shapes.size()) - return bld_->create_reshape(src, dst_shapes); - else + if(src_shapes.size() != dst_shapes.size()){ + unsigned src_numel = 1; + for(unsigned s: src_shapes) + src_numel *= s; + unsigned dst_numel = 1; + for(unsigned s: dst_shapes) + dst_numel *= s; + if(src_numel == dst_numel) + return bld_->create_reshape(src, dst_shapes); + else { + auto padded_shapes = src_shapes; + while(padded_shapes.size() != dst_shapes.size()) + padded_shapes.insert(padded_shapes.begin(), 1); + // check that broadcast is legal + for(size_t d = 0; d < padded_shapes.size(); d++){ + if(dst_shapes[d] != padded_shapes[d] && + padded_shapes[d] != 1) + should_not_happen(); + } + // pad and broadcast + ir::value *padded = bld_->create_reshape(src, padded_shapes); + return bld_->create_broadcast(padded, dst_shapes); + } + } + else{ return bld_->create_broadcast(src, dst_shapes); + } } return src; } diff --git a/lib/lang/parser.cc b/lib/lang/parser.cc index 6c669208f..f69337ced 100644 --- a/lib/lang/parser.cc +++ b/lib/lang/parser.cc @@ -453,7 +453,7 @@ Expr* Parser::ParseSubScripting(Expr* lhs) { TileType::ShapeInt shape; size_t i = 0; const Token* tok; - std::vector> redList; + std::vector> redInfo; do { tok = ts_.Next(); switch(tok->tag_) { @@ -465,10 +465,13 @@ Expr* Parser::ParseSubScripting(Expr* lhs) { shape.push_back(1); break; -// case Token::ADD: -// case Token::SUB: -// redList.push_back({i, tok->tag_}); -// break; + case Token::ADD: + case Token::SUB:{ + int info = UnaryOp::encodeRed(i, tok->tag_); + redInfo.push_back({i, info}); + shape.push_back(lhsShape[i++]); + break; + } default: Error(tok, "Unexpected subscript symbol encountered at dimension %d", i); @@ -479,8 +482,21 @@ Expr* Parser::ParseSubScripting(Expr* lhs) { if(lhsShape.size() > i) Error(tok, "broadcasting not using all operand axes"); // 
create ret tile - TileType *retType = TileType::New(shape, lhsQual); - return UnaryOp::New(Token::CAST, lhs, retType); + Expr* res = lhs; + for(auto r: redInfo){ + shape.erase(shape.begin() + r.first); + Type *retType; + if(shape.empty()) + retType = lhsQual.GetPtr(); + else + retType = TileType::New(shape, lhsQual); + res = UnaryOp::New(Token::REDUCE, res, retType, r.second); + } + if(!shape.empty()){ + TileType *retType = TileType::New(shape, lhsQual); + res = UnaryOp::New(Token::CAST, res, retType); + } + return res; } diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 114626dce..ae21128f6 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -204,6 +204,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::transform::peephole peephole; codegen::transform::reassociate reassociate(&alignment_info, &grids); codegen::selection selection(&shmem_allocation, &grids, &shmem_info, &alignment_info, target.get()); +// ir::print(module, std::cout); // run passes peephole.run(module); dce.run(module); diff --git a/tests/common/src/reduce.h b/tests/common/src/reduce.h index a9788f340..1f2be7461 100644 --- a/tests/common/src/reduce.h +++ b/tests/common/src/reduce.h @@ -19,7 +19,7 @@ void reduce2d(TYPE * X __noalias __readonly __aligned(16), int rm[TM] = ridm * TM + 0 ... TM; int rn[TN] = ridn * TN + 0 ... TN; TYPE* px[TM, TN] = X + rm[:, newaxis] + rn[newaxis, :] * ldx; - TYPE* py[TM, TN] = Y + rm[:, newaxis]; + TYPE* py[TM] = Y + rm; *py = (*px)[:, +]; } )"; diff --git a/tests/common/util.h b/tests/common/util.h index e5cfef7b8..874b33e84 100644 --- a/tests/common/util.h +++ b/tests/common/util.h @@ -37,6 +37,12 @@ void init_rand(std::vector& x) { x[i] = static_cast((double)rand()/RAND_MAX); } +template +void init_zeros(std::vector& x) { + for(size_t i = 0; i < x.size(); i++) + x[i] = 0; +} + namespace aux{ diff --git a/tests/unit/reduce.cc b/tests/unit/reduce.cc index 59b574c4d..c513d9cb5 100644 --- a/tests/unit/reduce.cc +++ b/tests/unit/reduce.cc @@ -15,15 +15,26 @@ namespace drv = triton::driver; namespace rt = triton::runtime; +template +void cpu_ref(std::vector &y, const std::vector &x, int M, int N) { + for(int m = 0; m < M; m++){ + T acc = 0; + for(int n = 0; n < N; n++) + acc = acc + x[m + n*M]; + y[m] = acc; + } +} + bool do_test(drv::stream* stream, int M, int N, std::string op, int nwarp){ typedef float NumericT; std::string ty = "float"; size_t dt_nbytes = sizeof(NumericT); drv::context* context = stream->context(); std::vector hy(M); + std::vector ry(M); std::vector hx(M*N); srand(0); - init_rand(hy); + init_zeros(hy); init_rand(hx); auto dy = std::shared_ptr(drv::buffer::create(context, hy.size()*dt_nbytes)); auto dx = std::shared_ptr(drv::buffer::create(context, hx.size()*dt_nbytes)); @@ -35,8 +46,11 @@ bool do_test(drv::stream* stream, int M, int N, std::string op, int nwarp){ opt.defines.push_back({"TN", {std::to_string(N)}}); opt.num_warps = {nwarp}; rt::function function(src::reduce2d, opt); - function({&*dy, &*dx, M, N, M}, grid2d(M, N), stream); + function({&*dx, &*dy, M, N, M}, grid2d(M, N), stream); stream->synchronize(); + stream->read(&*dy, true, 0, hy); + cpu_ref(ry, hx, M, N); + return testing::diff(hy, ry); } int main() { From 178094b5f7d5d0a0be6b4f030f6a5fe43400c102 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 11 Sep 2019 20:47:17 -0400 Subject: [PATCH 392/494] [codegen] exposed a bug in reductions --- include/triton/ir/instructions.h | 2 +- lib/codegen/analysis/grid.cc | 23 ++++++------- 
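// A short self-contained check of the encodeRed/decodeRed packing introduced
// above: the reduction axis lives in the low 16 bits of `info` and the token
// tag in the high 16 bits. kAddTag below is a made-up stand-in for the
// numeric value of Token::ADD; any tag that fits in 16 bits round-trips the
// same way.
#include <cassert>

int encode_red(int ax, int tag) { return ax | (tag << 16); }

void decode_red(int info, int& ax, int& tag) {
  ax  = info & 0x0000FFFF;
  tag = (info & 0xFFFF0000) >> 16;
}

int main() {
  const int kAddTag = 0x85;            // hypothetical tag value
  int info = encode_red(1, kAddTag);   // reduce over axis 1 with '+'
  int ax, tag;
  decode_red(info, ax, tag);
  assert(ax == 1 && tag == kAddTag);
  return 0;
}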
lib/codegen/selection.cc | 23 +++++-------- lib/runtime/function.cc | 8 ++--- tests/common/src/reduce.h | 4 +-- tests/common/util.h | 2 +- tests/unit/reduce.cc | 57 +++++++++++++++++++++++++++----- 7 files changed, 74 insertions(+), 45 deletions(-) diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index a4fbc3710..5a07f79b2 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -616,7 +616,7 @@ private: private: reduce_inst(value* arg, unsigned axis, const std::string& name, instruction* next); - std::string repr_impl() const { return "reduce"; } + std::string repr_impl() const { return "red<" + std::to_string(axis_) + ">"; } public: static instruction* create(value *arg, unsigned axis, const std::string &name = "", instruction *next = nullptr); diff --git a/lib/codegen/analysis/grid.cc b/lib/codegen/analysis/grid.cc index f90ab8822..4ce4116e3 100644 --- a/lib/codegen/analysis/grid.cc +++ b/lib/codegen/analysis/grid.cc @@ -59,16 +59,16 @@ void grids::init_c_graph(ir::instruction *v) { shapes = atom->get_operand(0)->get_type()->get_tile_shapes(); else if(dynamic_cast(v)) return; - else if(auto *reduce = dynamic_cast(v)) { - unsigned axis = reduce->get_axis(); - ir::value *arg = reduce->get_operand(0); - auto in_shapes = arg->get_type()->get_tile_shapes(); - unsigned current = 0; - for(unsigned i = 0; i < in_shapes.size(); i++){ - if(i == axis) - continue; - add_constraint({reduce, current++}, {arg, i}); - } + else if(dynamic_cast(v)) { +// unsigned axis = reduce->get_axis(); +// ir::value *arg = reduce->get_operand(0); +// auto in_shapes = arg->get_type()->get_tile_shapes(); +// unsigned current = 0; +// for(unsigned i = 0; i < in_shapes.size(); i++){ +// if(i == axis) +// continue; +// add_constraint({reduce, current++}, {arg, i}); +// } return; } else @@ -244,7 +244,6 @@ void grids::run(ir::module &mod) { unsigned size = i->get_type()->get_tile_num_elements(); /* HMMA parameters*/ if(fragments_.at({i, 0}) == HMMA_FRAGMENT_C){ - /* fragments per warp */ // try to make things as square as possible to maximize data re-use std::vector fpw = {1, 1, 1}; @@ -285,7 +284,6 @@ void grids::run(ir::module &mod) { if(num_warps_ != effective_num_warps) throw std::runtime_error("cannot create a kernel with this amount of warps"); - } /* Scan-line */ @@ -307,6 +305,7 @@ void grids::run(ir::module &mod) { for(size_t d = 0; d < shapes.size(); d++){ std::string str_d = std::to_string(d); effective_num_threads *= params_.at(i).at("mts.d" + str_d)->get_value(); + std::cout << shapes[d] << " " << params_.at(i).at("mts.d" + str_d)->get_value() << " " << params_.at(i).at("nts.d" + str_d)->get_value() << std::endl; } if(num_threads != effective_num_threads) throw std::runtime_error("cannot create a kernel with this amount of warps"); diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index 02611444f..e1f08bbad 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -982,21 +982,16 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, // write back builder.CreateStore(result, write_ptr); } - - // result is on the first lane of shared memory - indices_t final = write_idx; - final[axis] = builder.getInt32(0); - Value *read_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), final); - Value *read_ptr = builder.CreateGEP(base_ptr, read_offset); - tgt_->add_barrier(module, builder); - result = builder.CreateLoad(read_ptr); - if(tmap_.find(ins) == tmap_.end()) - vmap_[ins] = result; 
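// A minimal CPU analogue of the halving loop that lower_reduce emits: `depth`
// partial results sit in a buffer (shared memory on the GPU), and each step
// folds the upper half onto the lower half until slot 0 holds the total.
// std::vector stands in for the shared-memory tile, the sequential inner loop
// for the lanes that run in parallel between barriers, and '+' for the
// operation-dependent accumulator.
#include <cassert>
#include <vector>

float tree_reduce(std::vector<float> buf) {
  unsigned depth = buf.size();               // assumed to be a power of two
  for(unsigned i = depth / 2; i > 0; i >>= 1)
    for(unsigned lane = 0; lane < i; lane++) // is_active: lane < i
      buf[lane] += buf[lane + i];
  return buf[0];
}

int main() {
  assert(tree_reduce({1, 2, 3, 4, 5, 6, 7, 8}) == 36);
  return 0;
}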
- else{ - distributed_tile *ti = (distributed_tile*)tmap_[ins]; - ti->set_value(x.first, result); - } } + tgt_->add_barrier(module, builder); + + distributed_tile* x_tile = (distributed_tile*)tmap_.at(x); + x_tile->for_each([&](indices_t idx) { +// Value *read_offset = shared_tile::shared_offset(builder, x_tile->get_shapes(), idx); +// Value *read_ptr = builder.CreateGEP(base_ptr, read_offset); +// x_tile->set_value(idx, builder.CreateLoad(read_ptr)); + x_tile->set_value(idx, ConstantFP::get(builder.getFloatTy(), 0)); + }); } void selection::lower_dynamic_program_idx(ir::nv_dynamic_program_idx_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index ae21128f6..79ee61a51 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -164,11 +164,7 @@ function::caller function::autotune(driver::stream* stream, const grid_fn_ty& gr auto ir = make_ir(parser); // binary code-gen std::unique_ptr bin; - try{ - bin = make_bin(*ir, stream->context(), opt); - }catch(const std::runtime_error& e) { - return; - } + bin = make_bin(*ir, stream->context(), opt); // kernel uses too much resources if(!bin) return; @@ -204,7 +200,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::transform::peephole peephole; codegen::transform::reassociate reassociate(&alignment_info, &grids); codegen::selection selection(&shmem_allocation, &grids, &shmem_info, &alignment_info, target.get()); -// ir::print(module, std::cout); + ir::print(module, std::cout); // run passes peephole.run(module); dce.run(module); diff --git a/tests/common/src/reduce.h b/tests/common/src/reduce.h index 1f2be7461..02cc3fbe7 100644 --- a/tests/common/src/reduce.h +++ b/tests/common/src/reduce.h @@ -19,8 +19,8 @@ void reduce2d(TYPE * X __noalias __readonly __aligned(16), int rm[TM] = ridm * TM + 0 ... TM; int rn[TN] = ridn * TN + 0 ... 
TN; TYPE* px[TM, TN] = X + rm[:, newaxis] + rn[newaxis, :] * ldx; - TYPE* py[TM] = Y + rm; - *py = (*px)[:, +]; + TYPE* py[TY] = Y + rm; + *py = (*px)[RED]; } )"; diff --git a/tests/common/util.h b/tests/common/util.h index 874b33e84..800e2c5ae 100644 --- a/tests/common/util.h +++ b/tests/common/util.h @@ -78,7 +78,7 @@ namespace testing { if(hc.size() != rc.size()) return false; for(size_t i = 0; i < hc.size(); i++) - if(std::isinf(hc[i]) || std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-2){ + if(std::isinf(hc[i]) || std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; return false; diff --git a/tests/unit/reduce.cc b/tests/unit/reduce.cc index c513d9cb5..6545b5d62 100644 --- a/tests/unit/reduce.cc +++ b/tests/unit/reduce.cc @@ -14,17 +14,54 @@ namespace drv = triton::driver; namespace rt = triton::runtime; - -template -void cpu_ref(std::vector &y, const std::vector &x, int M, int N) { - for(int m = 0; m < M; m++){ - T acc = 0; - for(int n = 0; n < N; n++) - acc = acc + x[m + n*M]; - y[m] = acc; +void _loop_nest(std::vector const & ranges, + std::function const &)> const & f){ + int D = ranges.size(); + std::vector values(D, 0); + // Start with innermost loop + int i = D - 1; + while(true){ + // Execute function + f(values); + while(values[i]++ == ranges[i] - 1){ + if(i == 0) + return; + values[i--] = 0; + } + i = D - 1; } } +int offset(const std::vector& idx, const std::vector& shapes) { + int result = idx[0]; + for(int i = 1; i < idx.size(); i++) + result += idx[i]*shapes[i-1]; + return result; +} + +template +void reduce_nd(std::vector &y, const std::vector &x, size_t axis, const std::vector& shapes) { + assert(axis <= shapes.size() - 1); + // remove shape at index axis to get outer dimensions + std::vector outer = shapes; + outer.erase(outer.begin() + axis); + // retrieve shape at index axis to get inner dimension + int inner = shapes[axis]; + // iterate over outer dimensions + _loop_nest(outer, [&](const std::vector& y_idx) { + T acc = 0; + auto x_idx = y_idx; + x_idx.insert(x_idx.begin() + axis, 0); + // accumulate over inner dimensions + for(int z = 0; z < inner; z++){ + x_idx[axis] = z; + acc = acc + x[offset(x_idx, shapes)]; + } + y[offset(y_idx, outer)] = acc; + }); +} + + bool do_test(drv::stream* stream, int M, int N, std::string op, int nwarp){ typedef float NumericT; std::string ty = "float"; @@ -44,12 +81,14 @@ bool do_test(drv::stream* stream, int M, int N, std::string op, int nwarp){ opt.defines.push_back({"TYPE", {ty}}); opt.defines.push_back({"TM", {std::to_string(M)}}); opt.defines.push_back({"TN", {std::to_string(N)}}); + opt.defines.push_back({"TY", {std::to_string(M)}}); + opt.defines.push_back({"RED", {"+, :"}}); opt.num_warps = {nwarp}; rt::function function(src::reduce2d, opt); function({&*dx, &*dy, M, N, M}, grid2d(M, N), stream); stream->synchronize(); stream->read(&*dy, true, 0, hy); - cpu_ref(ry, hx, M, N); + reduce_nd(ry, hx, 0, {M, N}); return testing::diff(hy, ry); } From c4c93943dfd4c66d7aa09b38bf377a6ef16d0639 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 12 Sep 2019 00:32:10 -0400 Subject: [PATCH 393/494] [codegen] fixed bug in reduction --- lib/codegen/selection.cc | 32 ++++++++++++++++++-------------- tests/unit/reduce.cc | 3 +++ 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index e1f08bbad..243eb2bb2 100644 --- a/lib/codegen/selection.cc +++ 
b/lib/codegen/selection.cc @@ -923,7 +923,6 @@ void selection::lower_downcast(ir::downcast_inst *x, LLVMContext &ctx, Function } void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { - ir::instruction *ins = (ir::instruction*)x; Module *module = fn->getParent(); std::map partial; ir::value *op = x->get_operand(0); @@ -933,7 +932,7 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, // reduce within thread op_tile->for_each([&](indices_t idx) { indices_t pidx = idx; - pidx.erase(pidx.begin() + axis); + pidx[axis] = builder.getInt32(0); Value *current = op_tile->get_value(idx); // current partial result is not initialized -- create if(partial.find(pidx) == partial.end()) @@ -943,6 +942,15 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, partial[pidx] = builder.CreateFAdd(partial[pidx], current); }); + // depth + unsigned shape_ax = op->get_type()->get_tile_shapes()[axis]; + unsigned per_thread = op_tile->axis(axis).values.size(); + unsigned depth = shape_ax / per_thread; + + // shapes + auto shared_shapes = op_tile->get_shapes(); + shared_shapes[axis] = depth; + // reduce within blocks unsigned addr_space = sh_mem_ptr_->getType()->getPointerAddressSpace(); Type *res_ty = builder.getFloatTy(); @@ -952,25 +960,20 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, Value *lane = axes_.at(params_->get_param_group(op, axis)).thread_id; Value *&result = x.second; indices_t write_idx = x.first; - write_idx.insert(write_idx.begin() + axis, lane); - + write_idx[axis] = lane; // shared memory write pointer - Value *write_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), write_idx); + Value *write_offset = shared_tile::shared_offset(builder, shared_shapes, write_idx); Value *write_ptr = builder.CreateGEP(base_ptr, write_offset); - // initialize shared memory tgt_->add_barrier(module, builder); builder.CreateStore(result, write_ptr); // build result - unsigned shape_ax = op->get_type()->get_tile_shapes()[axis]; - unsigned per_thread = op_tile->axis(axis).values.size(); - unsigned depth = shape_ax / per_thread; for(unsigned i = depth/2; i > 0; i >>= 1){ // current indices indices_t current(write_idx.size(), builder.getInt32(0)); current[axis] = builder.getInt32(i); // shared memory offset - Value *read_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), current); + Value *read_offset = shared_tile::shared_offset(builder, shared_shapes, current); Value *is_active = builder.CreateICmpULT(lane, builder.getInt32(i)); read_offset = builder.CreateSelect(is_active, read_offset, builder.getInt32(0)); // shared memory read pointer @@ -987,10 +990,11 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, distributed_tile* x_tile = (distributed_tile*)tmap_.at(x); x_tile->for_each([&](indices_t idx) { -// Value *read_offset = shared_tile::shared_offset(builder, x_tile->get_shapes(), idx); -// Value *read_ptr = builder.CreateGEP(base_ptr, read_offset); -// x_tile->set_value(idx, builder.CreateLoad(read_ptr)); - x_tile->set_value(idx, ConstantFP::get(builder.getFloatTy(), 0)); + indices_t red_idx = idx; + red_idx.insert(red_idx.begin() + axis, builder.getInt32(0)); + Value *read_offset = shared_tile::shared_offset(builder, shared_shapes, red_idx); + Value *read_ptr = builder.CreateGEP(base_ptr, read_offset); + x_tile->set_value(idx, builder.CreateLoad(read_ptr)); }); } diff --git a/tests/unit/reduce.cc 
b/tests/unit/reduce.cc index 6545b5d62..2317b76d2 100644 --- a/tests/unit/reduce.cc +++ b/tests/unit/reduce.cc @@ -73,6 +73,9 @@ bool do_test(drv::stream* stream, int M, int N, std::string op, int nwarp){ srand(0); init_zeros(hy); init_rand(hx); + for(int i = 0; i < M; i++) + for(int j = 0; j < N; j++) + hx[i + j*M] = i+j; auto dy = std::shared_ptr(drv::buffer::create(context, hy.size()*dt_nbytes)); auto dx = std::shared_ptr(drv::buffer::create(context, hx.size()*dt_nbytes)); stream->write(&*dy, true, 0, hy); From f4beb713abb78c40c60f1e39d204ca0a0a28f560 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 12 Sep 2019 16:11:57 -0400 Subject: [PATCH 394/494] [test] added support for max, min reduction and made it easy to add more --- include/triton/ir/builder.h | 2 +- include/triton/ir/instructions.h | 17 ++++++--- include/triton/lang/token.h | 2 ++ lib/codegen/analysis/grid.cc | 10 ------ lib/codegen/selection.cc | 37 ++++++++++++++------ lib/ir/builder.cc | 4 +-- lib/ir/instructions.cc | 24 +++++++++++-- lib/lang/code_gen.cc | 32 ++++++++++++----- lib/lang/parser.cc | 4 ++- lib/lang/token.cc | 2 ++ lib/runtime/function.cc | 3 +- tests/common/src/reduce.h | 2 +- tests/common/util.h | 45 ++++++++++++++++++++++++ tests/unit/reduce.cc | 59 +++++++++++++++++++------------- 14 files changed, 178 insertions(+), 65 deletions(-) diff --git a/include/triton/ir/builder.h b/include/triton/ir/builder.h index 0b6c859b1..5af20edbe 100644 --- a/include/triton/ir/builder.h +++ b/include/triton/ir/builder.h @@ -136,7 +136,7 @@ public: value *create_dot(value *A, value *B, value *C, const std::string &name = ""); value *create_trans(value *A, const std::vector &perm = {}, const std::string &name = ""); value *create_sqrt(value *A, const std::string &name = ""); - value *create_reduce(value *A, unsigned axis, const std::string &name = ""); + value *create_reduce(value *A, reduce_inst::op_t op, unsigned axis, const std::string &name = ""); value *create_select(value *pred, value *if_value, value *else_value, const std::string &name = ""); // Intrinsics value *create_copy_to_shared(value *arg, const std::string &name = ""); diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index 5a07f79b2..d961790ab 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -611,19 +611,28 @@ public: }; class reduce_inst: public builtin_inst { -private: - static type* get_res_type(value *arg, unsigned axis); +public: + enum op_t{ + ADD, SUB, MAX, MIN, + FADD, FSUB, FMAX, FMIN + }; private: - reduce_inst(value* arg, unsigned axis, const std::string& name, instruction* next); + static type* get_res_type(value *arg, unsigned axis); + static std::string to_str(op_t op); + +private: + reduce_inst(value* arg, op_t op, unsigned axis, const std::string& name, instruction* next); std::string repr_impl() const { return "red<" + std::to_string(axis_) + ">"; } public: - static instruction* create(value *arg, unsigned axis, const std::string &name = "", instruction *next = nullptr); + static instruction* create(value *arg, op_t op, unsigned axis, const std::string &name = "", instruction *next = nullptr); unsigned get_axis() const { return axis_; } + op_t get_op() const { return op_; } private: unsigned axis_; + op_t op_; }; class select_inst: public builtin_inst { diff --git a/include/triton/lang/token.h b/include/triton/lang/token.h index 602113f93..f11d08fc8 100644 --- a/include/triton/lang/token.h +++ b/include/triton/lang/token.h @@ -131,6 +131,8 @@ public: // TILE 
ARITHMETICS BEGIN NEWAXIS, + MAX, + MIN, // TILE ARITHMETICS END ALIGNAS, // _Alignas diff --git a/lib/codegen/analysis/grid.cc b/lib/codegen/analysis/grid.cc index 4ce4116e3..da8516daa 100644 --- a/lib/codegen/analysis/grid.cc +++ b/lib/codegen/analysis/grid.cc @@ -60,15 +60,6 @@ void grids::init_c_graph(ir::instruction *v) { else if(dynamic_cast(v)) return; else if(dynamic_cast(v)) { -// unsigned axis = reduce->get_axis(); -// ir::value *arg = reduce->get_operand(0); -// auto in_shapes = arg->get_type()->get_tile_shapes(); -// unsigned current = 0; -// for(unsigned i = 0; i < in_shapes.size(); i++){ -// if(i == axis) -// continue; -// add_constraint({reduce, current++}, {arg, i}); -// } return; } else @@ -305,7 +296,6 @@ void grids::run(ir::module &mod) { for(size_t d = 0; d < shapes.size(); d++){ std::string str_d = std::to_string(d); effective_num_threads *= params_.at(i).at("mts.d" + str_d)->get_value(); - std::cout << shapes[d] << " " << params_.at(i).at("mts.d" + str_d)->get_value() << " " << params_.at(i).at("nts.d" + str_d)->get_value() << std::endl; } if(num_threads != effective_num_threads) throw std::runtime_error("cannot create a kernel with this amount of warps"); diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index 243eb2bb2..0b1568354 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -925,30 +925,47 @@ void selection::lower_downcast(ir::downcast_inst *x, LLVMContext &ctx, Function void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { Module *module = fn->getParent(); std::map partial; - ir::value *op = x->get_operand(0); - distributed_tile* op_tile = (distributed_tile*)tmap_.at(op); + ir::value *arg = x->get_operand(0); + distributed_tile* arg_tile = (distributed_tile*)tmap_.at(arg); + ir::reduce_inst::op_t op = x->get_op(); + auto accumulate = [&](Value* x, Value *y) -> Value* { + switch(op) { + case ir::reduce_inst::ADD: return builder.CreateAdd(x, y); + case ir::reduce_inst::SUB: return builder.CreateSub(x, y); + case ir::reduce_inst::MAX: return builder.CreateMaximum(x, y); + case ir::reduce_inst::MIN: return builder.CreateMinimum(x, y); + case ir::reduce_inst::FADD: return builder.CreateFAdd(x, y); + case ir::reduce_inst::FSUB: return builder.CreateFSub(x, y); + case ir::reduce_inst::FMAX: return builder.CreateSelect(builder.CreateFCmpOGT(x, y), x, y); + case ir::reduce_inst::FMIN: return builder.CreateSelect(builder.CreateFCmpOLT(x, y), x, y); + default: break; + } + assert(false); + return nullptr; + }; + unsigned axis = x->get_axis(); // reduce within thread - op_tile->for_each([&](indices_t idx) { + arg_tile->for_each([&](indices_t idx) { indices_t pidx = idx; pidx[axis] = builder.getInt32(0); - Value *current = op_tile->get_value(idx); + Value *current = arg_tile->get_value(idx); // current partial result is not initialized -- create if(partial.find(pidx) == partial.end()) partial[pidx] = current; // current partial result is initialized -- accumulate else - partial[pidx] = builder.CreateFAdd(partial[pidx], current); + partial[pidx] = accumulate(partial[pidx], current); }); // depth - unsigned shape_ax = op->get_type()->get_tile_shapes()[axis]; - unsigned per_thread = op_tile->axis(axis).values.size(); + unsigned shape_ax = arg->get_type()->get_tile_shapes()[axis]; + unsigned per_thread = arg_tile->axis(axis).values.size(); unsigned depth = shape_ax / per_thread; // shapes - auto shared_shapes = op_tile->get_shapes(); + auto shared_shapes = arg_tile->get_shapes(); 
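// A small host-side sketch of the per-thread partial accumulation above:
// values are grouped under their index with the reduced axis zeroed out
// (pidx[axis] = 0), so each group collapses into one partial result per
// remaining coordinate. std::map stands in for the distributed tile's
// indices_t -> Value* mapping, and '+' for the op-dependent accumulate lambda.
#include <cassert>
#include <map>
#include <vector>

using indices_t = std::vector<int>;

std::map<indices_t, float> partial_reduce(const std::map<indices_t, float>& tile,
                                          size_t axis) {
  std::map<indices_t, float> partial;
  for(const auto& kv: tile) {
    indices_t pidx = kv.first;
    pidx[axis] = 0;                // all values along `axis` share one slot
    auto it = partial.find(pidx);
    if(it == partial.end())
      partial[pidx] = kv.second;   // first value initializes the partial
    else
      it->second += kv.second;     // later values accumulate
  }
  return partial;
}

int main() {
  // 2x2 tile reduced over axis 1: row 0 -> 1+2, row 1 -> 3+4.
  std::map<indices_t, float> tile = {
    {{0, 0}, 1}, {{0, 1}, 2}, {{1, 0}, 3}, {{1, 1}, 4}};
  auto p = partial_reduce(tile, 1);
  assert(p.at({0, 0}) == 3 && p.at({1, 0}) == 7);
  return 0;
}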
shared_shapes[axis] = depth; // reduce within blocks @@ -957,7 +974,7 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, Value *base_ptr = builder.CreateBitCast(sh_mem_ptr_, PointerType::get(res_ty, addr_space)); for(auto& x: partial) { // current element being computed - Value *lane = axes_.at(params_->get_param_group(op, axis)).thread_id; + Value *lane = axes_.at(params_->get_param_group(arg, axis)).thread_id; Value *&result = x.second; indices_t write_idx = x.first; write_idx[axis] = lane; @@ -981,7 +998,7 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, tgt_->add_barrier(module, builder); Value *next = builder.CreateLoad(read_ptr); // accumulate - result = builder.CreateFAdd(result, next); + result = accumulate(result, next); // write back builder.CreateStore(result, write_ptr); } diff --git a/lib/ir/builder.cc b/lib/ir/builder.cc index 458365a60..8f42e263c 100644 --- a/lib/ir/builder.cc +++ b/lib/ir/builder.cc @@ -323,8 +323,8 @@ value *builder::create_sqrt(value *A, const std::string &name) { return insert(sqrt_inst::create(A, name)); } -value *builder::create_reduce(value *A, unsigned axis, const std::string &name) { - return insert(reduce_inst::create(A, axis, name)); +value *builder::create_reduce(value *A, reduce_inst::op_t op, unsigned axis, const std::string &name) { + return insert(reduce_inst::create(A, op, axis, name)); } value *builder::create_select(value *pred, value *if_value, value *else_value, const std::string &name){ diff --git a/lib/ir/instructions.cc b/lib/ir/instructions.cc index e7e5de1f2..9df26dc1a 100644 --- a/lib/ir/instructions.cc +++ b/lib/ir/instructions.cc @@ -615,6 +615,23 @@ instruction* sqrt_inst::create(value *arg, const std::string &name, instruction //===----------------------------------------------------------------------===// // reduce instructions //===----------------------------------------------------------------------===// + +std::string reduce_inst::to_str(op_t op) { + switch (op) { + case ADD: return "+"; + case SUB: return "-"; + case MAX: return "imax"; + case MIN: return "imin"; + case FADD: return "+"; + case FSUB: return "-"; + case FMAX: return "fmax"; + case FMIN: return "fmin"; + default: break; + } + assert(false); + return ""; +} + type* reduce_inst::get_res_type(value *arg, unsigned axis) { ir::tile_type::tile_shapes_t shapes = arg->get_type()->get_tile_shapes(); shapes.erase(shapes.begin() + axis); @@ -625,14 +642,15 @@ type* reduce_inst::get_res_type(value *arg, unsigned axis) { return tile_type::get(scalar_ty, shapes); } -reduce_inst::reduce_inst(value *arg, unsigned axis, const std::string &name, instruction *next) +reduce_inst::reduce_inst(value *arg, op_t op, unsigned axis, const std::string &name, instruction *next) : builtin_inst(get_res_type(arg, axis), 1, 1, name, next), + op_(op), axis_(axis){ set_operand(0, arg); } -instruction* reduce_inst::create(value *arg, unsigned axis, const std::string &name, instruction *next) { - return new reduce_inst(arg, axis, name, next); +instruction* reduce_inst::create(value *arg, op_t op, unsigned axis, const std::string &name, instruction *next) { + return new reduce_inst(arg, op, axis, name, next); } diff --git a/lib/lang/code_gen.cc b/lib/lang/code_gen.cc index 56acb1c03..8384dd710 100644 --- a/lib/lang/code_gen.cc +++ b/lib/lang/code_gen.cc @@ -154,12 +154,24 @@ void Generator::VisitBinaryOp(BinaryOp* binary) { error_not_implemented(); } +ir::reduce_inst::op_t reduce_op(int tag, bool is_float) { + using 
ir::reduce_inst; + switch(tag){ + case Token::ADD: return is_float ? reduce_inst::FADD : reduce_inst::ADD; + case Token::SUB: return is_float ? reduce_inst::FSUB : reduce_inst::SUB; + case Token::MAX: return is_float ? reduce_inst::FMAX : reduce_inst::MAX; + case Token::MIN: return is_float ? reduce_inst::FMIN : reduce_inst::MIN; + default: break; + } + should_not_happen(); + return reduce_inst::op_t(); +} void Generator::VisitUnaryOp(UnaryOp* unary) { - // recursion Visit(unary->operand_); - ir::value* op = ret_; - + ir::value* arg = ret_; + ir::type *arg_ty = arg->get_type(); + ir::type *arg_scal_ty = arg_ty->get_scalar_ty(); // return switch (unary->op_) { case Token::PREFIX_INC: return error_not_implemented(); @@ -167,17 +179,19 @@ void Generator::VisitUnaryOp(UnaryOp* unary) { case Token::POSTFIX_INC: return error_not_implemented(); case Token::POSTFIX_DEC: return error_not_implemented(); case Token::ADDR: return error_not_implemented(); - case Token::DEREF: return set_ret(bld_->create_load(op)); + case Token::DEREF: return set_ret(bld_->create_load(arg)); case Token::PLUS: return error_not_implemented(); case Token::MINUS: return error_not_implemented(); - case '~': return set_ret(bld_->create_neg(op)); - case '!': return set_ret(bld_->create_not(op)); - case Token::CAST: return set_ret(GenCastOp(op, GenIRType(unary->Type(), *ctx_))); - case '^': return set_ret(bld_->create_trans(op)); + case '~': return set_ret(bld_->create_neg(arg)); + case '!': return set_ret(bld_->create_not(arg)); + case Token::CAST: return set_ret(GenCastOp(arg, GenIRType(unary->Type(), *ctx_))); + case '^': return set_ret(bld_->create_trans(arg)); case Token::REDUCE: { int ax, tag; UnaryOp::decodeRed(unary->info_, ax, tag); - return set_ret(bld_->create_reduce(op, ax)); + bool is_float = arg_scal_ty->is_floating_point_ty(); + ir::reduce_inst::op_t op = reduce_op(tag, is_float); + return set_ret(bld_->create_reduce(arg, op, ax)); } default: error_not_implemented(); } diff --git a/lib/lang/parser.cc b/lib/lang/parser.cc index f69337ced..a30258c3d 100644 --- a/lib/lang/parser.cc +++ b/lib/lang/parser.cc @@ -466,7 +466,9 @@ Expr* Parser::ParseSubScripting(Expr* lhs) { break; case Token::ADD: - case Token::SUB:{ + case Token::SUB: + case Token::MAX: + case Token::MIN:{ int info = UnaryOp::encodeRed(i, tok->tag_); redInfo.push_back({i, info}); shape.push_back(lhsShape[i++]); diff --git a/lib/lang/token.cc b/lib/lang/token.cc index b9f3c8467..8b61aa098 100644 --- a/lib/lang/token.cc +++ b/lib/lang/token.cc @@ -54,6 +54,8 @@ const std::unordered_map Token::kwTypeMap_ { { "_Noreturn", Token::NORETURN }, { "_Static_assert", Token::STATIC_ASSERT }, { "_Thread_local", Token::THREAD }, + { "max", Token::MAX }, + { "min", Token::MIN }, }; const std::unordered_map Token::tagLexemeMap_ { diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 79ee61a51..ea84eac00 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -157,6 +157,7 @@ function::caller function::autotune(driver::stream* stream, const grid_fn_ty& gr for(auto it: opt_space_.defines) cpp.AddMacro(it.first, &opt.defines.at(it.first)); cpp.Process(tokens); +// tokens.Print(stdout); // parse Parser parser(tokens); parser.Parse(); @@ -200,7 +201,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::transform::peephole peephole; codegen::transform::reassociate reassociate(&alignment_info, &grids); codegen::selection selection(&shmem_allocation, &grids, &shmem_info, &alignment_info, target.get()); - ir::print(module, 
std::cout); +// ir::print(module, std::cout); // run passes peephole.run(module); dce.run(module); diff --git a/tests/common/src/reduce.h b/tests/common/src/reduce.h index 02cc3fbe7..3a77e960e 100644 --- a/tests/common/src/reduce.h +++ b/tests/common/src/reduce.h @@ -19,7 +19,7 @@ void reduce2d(TYPE * X __noalias __readonly __aligned(16), int rm[TM] = ridm * TM + 0 ... TM; int rn[TN] = ridn * TN + 0 ... TN; TYPE* px[TM, TN] = X + rm[:, newaxis] + rn[newaxis, :] * ldx; - TYPE* py[TY] = Y + rm; + TYPE* py[TY] = Y + RY; *py = (*px)[RED]; } )"; diff --git a/tests/common/util.h b/tests/common/util.h index 800e2c5ae..6de7f340f 100644 --- a/tests/common/util.h +++ b/tests/common/util.h @@ -43,6 +43,34 @@ void init_zeros(std::vector& x) { x[i] = 0; } +enum reduce_op_t { + ADD, + MAX, + MIN +}; + +std::string to_str(reduce_op_t op) { + switch (op) { + case ADD: return "+"; + case MAX: return "max"; + case MIN: return "min"; + default: break; + } + assert(false); + return ""; +} + +template +std::function get_accumulator(reduce_op_t op) { + switch (op) { + case ADD: return [](T x, T y) { return x + y; }; + case MAX: return [](T x, T y) { return std::max(x, y); }; + case MIN: return [](T x, T y) { return std::min(x, y); }; + default: break; + } + assert(false); + return std::function(); +} namespace aux{ @@ -70,6 +98,23 @@ auto operator<<(std::basic_ostream& os, std::tuple const& t) return os << ")"; } +template +std::basic_ostream& operator<<(std::basic_ostream& os, const std::vector& vec) { + os << "{"; + for(size_t i = 0; i < vec.size(); i++){ + if(i > 0) + os << ", "; + os << vec[i]; + } + os << "}"; + return os; +} + +template +std::basic_ostream& operator<<(std::basic_ostream& os, reduce_op_t op) { + return os << to_str(op); +} + namespace testing { diff --git a/tests/unit/reduce.cc b/tests/unit/reduce.cc index 2317b76d2..5951f3e50 100644 --- a/tests/unit/reduce.cc +++ b/tests/unit/reduce.cc @@ -2,6 +2,7 @@ #include #include #include +#include #include "triton/driver/backend.h" #include "triton/driver/stream.h" #include "triton/tools/bench.hpp" @@ -40,58 +41,66 @@ int offset(const std::vector& idx, const std::vector& shapes) { } template -void reduce_nd(std::vector &y, const std::vector &x, size_t axis, const std::vector& shapes) { +void reduce_nd(std::vector &y, const std::vector &x, reduce_op_t op, size_t axis, const std::vector& shapes) { assert(axis <= shapes.size() - 1); // remove shape at index axis to get outer dimensions std::vector outer = shapes; outer.erase(outer.begin() + axis); // retrieve shape at index axis to get inner dimension int inner = shapes[axis]; + // accumualtion function + auto acc = get_accumulator(op); // iterate over outer dimensions _loop_nest(outer, [&](const std::vector& y_idx) { - T acc = 0; + T ret = 0; auto x_idx = y_idx; x_idx.insert(x_idx.begin() + axis, 0); // accumulate over inner dimensions for(int z = 0; z < inner; z++){ x_idx[axis] = z; - acc = acc + x[offset(x_idx, shapes)]; + ret = acc(ret, x[offset(x_idx, shapes)]); } - y[offset(y_idx, outer)] = acc; + y[offset(y_idx, outer)] = ret; }); } -bool do_test(drv::stream* stream, int M, int N, std::string op, int nwarp){ +bool do_test(drv::stream* stream, std::vector shape, int axis, reduce_op_t op, int nwarp){ typedef float NumericT; std::string ty = "float"; size_t dt_nbytes = sizeof(NumericT); drv::context* context = stream->context(); - std::vector hy(M); - std::vector ry(M); - std::vector hx(M*N); + size_t axy = (axis == 0) ? 1 : 0; + std::string RY = (axis == 0) ? 
"rn" : "rm"; + std::vector hy(shape[axy]); + std::vector ry(shape[axy]); + std::vector hx(shape[0]*shape[1]); srand(0); init_zeros(hy); init_rand(hx); - for(int i = 0; i < M; i++) - for(int j = 0; j < N; j++) - hx[i + j*M] = i+j; auto dy = std::shared_ptr(drv::buffer::create(context, hy.size()*dt_nbytes)); auto dx = std::shared_ptr(drv::buffer::create(context, hx.size()*dt_nbytes)); stream->write(&*dy, true, 0, hy); stream->write(&*dx, true, 0, hx); rt::function::options_space_t opt; opt.defines.push_back({"TYPE", {ty}}); - opt.defines.push_back({"TM", {std::to_string(M)}}); - opt.defines.push_back({"TN", {std::to_string(N)}}); - opt.defines.push_back({"TY", {std::to_string(M)}}); - opt.defines.push_back({"RED", {"+, :"}}); + opt.defines.push_back({"TM", {std::to_string(shape[0])}}); + opt.defines.push_back({"TN", {std::to_string(shape[1])}}); + opt.defines.push_back({"TY", {std::to_string(shape[axy])}}); + opt.defines.push_back({"RY", {RY}}); + std::string RED = ""; + for(int n = 0; n < 2; n++){ + if(n > 0) + RED += ", "; + RED += (n==axis) ? to_str(op) : ":"; + } + opt.defines.push_back({"RED", {RED}}); opt.num_warps = {nwarp}; rt::function function(src::reduce2d, opt); - function({&*dx, &*dy, M, N, M}, grid2d(M, N), stream); + function({&*dx, &*dy, shape[0], shape[1], shape[0]}, grid2d(shape[0], shape[1]), stream); stream->synchronize(); stream->read(&*dy, true, 0, hy); - reduce_nd(ry, hx, 0, {M, N}); + reduce_nd(ry, hx, op, axis, shape); return testing::diff(hy, ry); } @@ -100,17 +109,21 @@ int main() { auto context = triton::driver::backend::contexts::get_default(); triton::driver::stream* stream = triton::driver::stream::create(context); // shapes to benchmark - typedef std::tuple config_t; + typedef std::tuple, int, reduce_op_t> config_t; std::vector configs = { - config_t{32, 32, "+"} + config_t{{32, 32}, 0, MAX}, + config_t{{32, 32}, 1, ADD}, + config_t{{32, 64}, 0, ADD}, + config_t{{64, 32}, 1, ADD} }; // does the work - int M, N; - std::string op; + int axis; + std::vector shape; + reduce_op_t op; for(const auto& c: configs){ - std::tie(M, N, op) = c; + std::tie(shape, axis, op) = c; std::cout << "Testing " << c << " ... " << std::flush; - if(do_test(stream, M, N, op, 1)) + if(do_test(stream, shape, axis, op, 1)) std::cout << " Pass! " << std::endl; else std::cout << " Fail! 
" << std::endl; From 7f2bc5bb6624d313557901f67510bb9fc9e163bf Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 12 Sep 2019 16:20:29 -0400 Subject: [PATCH 395/494] [testing] re-arranged util.h --- tests/common/util.h | 91 ++++++++++++++++++++++++++++++++++---------- tests/unit/dot.cc | 2 +- tests/unit/reduce.cc | 27 +------------ 3 files changed, 73 insertions(+), 47 deletions(-) diff --git a/tests/common/util.h b/tests/common/util.h index 6de7f340f..0d06b47a8 100644 --- a/tests/common/util.h +++ b/tests/common/util.h @@ -9,6 +9,10 @@ namespace drv = triton::driver; namespace rt = triton::runtime; +/* ------------------------ + * Launch Grid + * ------------------------ */ + inline size_t ceil(size_t x, size_t y) { return (x + y - 1) / y; }; @@ -26,10 +30,10 @@ inline rt::function::grid_fn_ty grid2d(size_t M, size_t N) { }; } -enum order_t { - ROWMAJOR, - COLMAJOR -}; + +/* ------------------------ + * Tensor Initialization + * ------------------------ */ template void init_rand(std::vector& x) { @@ -43,6 +47,49 @@ void init_zeros(std::vector& x) { x[i] = 0; } +/* ------------------------ + * Loop Nests + * ------------------------ */ + +void _loop_nest(std::vector const & ranges, + std::function const &)> const & f){ + int D = ranges.size(); + std::vector values(D, 0); + // Start with innermost loop + int i = D - 1; + while(true){ + // Execute function + f(values); + while(values[i]++ == ranges[i] - 1){ + if(i == 0) + return; + values[i--] = 0; + } + i = D - 1; + } +} + +/* ----------------------- + * TENSOR INDEXING + * ----------------------- */ + +enum order_t { + ROWMAJOR, + COLMAJOR +}; + + +int offset(const std::vector& idx, const std::vector& shapes) { + int result = idx[0]; + for(int i = 1; i < idx.size(); i++) + result += idx[i]*shapes[i-1]; + return result; +} + +/* ----------------------- + * REDUCTION HELPERS + * ----------------------- */ + enum reduce_op_t { ADD, MAX, @@ -73,6 +120,26 @@ std::function get_accumulator(reduce_op_t op) { } +/* ----------------------- + * TENSOR COMPARISON + * ----------------------- */ + +template +bool diff(const std::vector& hc, const std::vector& rc) { +if(hc.size() != rc.size()) + return false; +for(size_t i = 0; i < hc.size(); i++) + if(std::isinf(hc[i]) || std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ + std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; + return false; + } +return true; +} + +/* ----------------------- + * PRETTY PRINTING + * ----------------------- */ + namespace aux{ template struct seq{}; @@ -116,21 +183,5 @@ std::basic_ostream& operator<<(std::basic_ostream& os, reduce_op } -namespace testing { - - template - bool diff(const std::vector& hc, const std::vector& rc) { - if(hc.size() != rc.size()) - return false; - for(size_t i = 0; i < hc.size(); i++) - if(std::isinf(hc[i]) || std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ - std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; - - return false; - } - return true; - } - -} #endif diff --git a/tests/unit/dot.cc b/tests/unit/dot.cc index b08eb13ba..e1b0a8bb5 100644 --- a/tests/unit/dot.cc +++ b/tests/unit/dot.cc @@ -91,7 +91,7 @@ bool do_test(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_ stream->read(&*dc, true, 0, hc); std::vector rc(hc.size()); cpu_ref(AT, BT, M, N, K, rc, ha, hb); - return testing::diff(hc, rc); + return diff(hc, rc); } int main() { diff --git a/tests/unit/reduce.cc b/tests/unit/reduce.cc index 5951f3e50..63b870fe5 100644 --- 
a/tests/unit/reduce.cc +++ b/tests/unit/reduce.cc @@ -15,31 +15,6 @@ namespace drv = triton::driver; namespace rt = triton::runtime; -void _loop_nest(std::vector const & ranges, - std::function const &)> const & f){ - int D = ranges.size(); - std::vector values(D, 0); - // Start with innermost loop - int i = D - 1; - while(true){ - // Execute function - f(values); - while(values[i]++ == ranges[i] - 1){ - if(i == 0) - return; - values[i--] = 0; - } - i = D - 1; - } -} - -int offset(const std::vector& idx, const std::vector& shapes) { - int result = idx[0]; - for(int i = 1; i < idx.size(); i++) - result += idx[i]*shapes[i-1]; - return result; -} - template void reduce_nd(std::vector &y, const std::vector &x, reduce_op_t op, size_t axis, const std::vector& shapes) { assert(axis <= shapes.size() - 1); @@ -101,7 +76,7 @@ bool do_test(drv::stream* stream, std::vector shape, int axis, reduce_op_t stream->synchronize(); stream->read(&*dy, true, 0, hy); reduce_nd(ry, hx, op, axis, shape); - return testing::diff(hy, ry); + return diff(hy, ry); } int main() { From 11ff27d6384d594248300a63db459b321a320e37 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 12 Sep 2019 22:44:07 -0400 Subject: [PATCH 396/494] [codegen][coalesce] some bugfix for phi-nodes --- include/triton/codegen/analysis/grid.h | 6 +- include/triton/codegen/selection.h | 6 +- .../transform/{reorder.h => coalesce.h} | 4 +- lib/codegen/analysis/grid.cc | 6 +- lib/codegen/selection.cc | 2 +- lib/codegen/transform/coalesce.cc | 110 ++++++++++++++++++ lib/codegen/transform/reorder.cc | 106 ----------------- lib/driver/module.cc | 1 - lib/runtime/function.cc | 8 +- 9 files changed, 126 insertions(+), 123 deletions(-) rename include/triton/codegen/transform/{reorder.h => coalesce.h} (87%) create mode 100644 lib/codegen/transform/coalesce.cc delete mode 100644 lib/codegen/transform/reorder.cc diff --git a/include/triton/codegen/analysis/grid.h b/include/triton/codegen/analysis/grid.h index c361db260..1eb352b00 100644 --- a/include/triton/codegen/analysis/grid.h +++ b/include/triton/codegen/analysis/grid.h @@ -19,7 +19,7 @@ namespace ir{ namespace codegen{ namespace transform{ -class reorder; +class coalesce; } namespace analysis{ @@ -46,7 +46,7 @@ private: public: - grids(size_t num_warps, transform::reorder* reorder); + grids(size_t num_warps, transform::coalesce* reorder); ir::metaparameter* get_param(ir::value *value, const std::string &key) { return params_[value][key]; } unsigned get_param_group(ir::value *value, unsigned ax); fragment_t get_fragment(ir::value *value, unsigned ax) { return fragments_.at({value, ax}); } @@ -66,7 +66,7 @@ private: std::vector grids_; std::map> groups_; size_t num_warps_; - transform::reorder* reorder_; + transform::coalesce* reorder_; }; diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index 842f544aa..3efe0a256 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -50,7 +50,7 @@ class meminfo; } namespace transform{ -class reorder; +class coalesce; } class target; @@ -195,7 +195,7 @@ private: public: - selection(analysis::memalloc *alloc, analysis::grids *params, analysis::meminfo *buffer_info, analysis::align *alignment, transform::reorder* reorder, target *tgt) + selection(analysis::memalloc *alloc, analysis::grids *params, analysis::meminfo *buffer_info, analysis::align *alignment, transform::coalesce* reorder, target *tgt) : alloc_(alloc), params_(params), buffer_info_(buffer_info), alignment_(alignment), 
reorder_(reorder), tgt_(tgt){ }
 
   void run(ir::module &src, Module &dst);
 
@@ -207,7 +207,7 @@ private:
   analysis::grids *params_;
   analysis::meminfo *buffer_info_;
   analysis::align *alignment_;
-  transform::reorder *reorder_;
+  transform::coalesce *reorder_;
   target *tgt_;
   std::map<unsigned, distributed_axis> axes_;
   Value *sh_mem_ptr_;
diff --git a/include/triton/codegen/transform/reorder.h b/include/triton/codegen/transform/coalesce.h
similarity index 87%
rename from include/triton/codegen/transform/reorder.h
rename to include/triton/codegen/transform/coalesce.h
index 19bffab03..e78010703 100644
--- a/include/triton/codegen/transform/reorder.h
+++ b/include/triton/codegen/transform/coalesce.h
@@ -20,9 +20,9 @@ namespace analysis{
 
 namespace transform{
 
-class reorder {
+class coalesce {
 public:
-  reorder(analysis::align* algin, analysis::meminfo* mem);
+  coalesce(analysis::align* align, analysis::meminfo* mem);
   std::vector<unsigned> get_order(ir::value* v);
   void run(ir::module &mod);
 
diff --git a/lib/codegen/analysis/grid.cc b/lib/codegen/analysis/grid.cc
index a33c4c25d..43a3eb1d9 100644
--- a/lib/codegen/analysis/grid.cc
+++ b/lib/codegen/analysis/grid.cc
@@ -1,6 +1,6 @@
 #include
 #include
-#include "triton/codegen/transform/reorder.h"
+#include "triton/codegen/transform/coalesce.h"
 #include "triton/codegen/analysis/grid.h"
 #include "triton/ir/instructions.h"
 #include "triton/ir/type.h"
@@ -16,7 +16,7 @@ namespace triton{
 namespace codegen{
 namespace analysis{
 
-grids::grids(size_t num_warps, transform::reorder *reorder): num_warps_(num_warps), reorder_(reorder)
+grids::grids(size_t num_warps, transform::coalesce *reorder): num_warps_(num_warps), reorder_(reorder)
 { }
 
 bool is_hmma(ir::value *v){
@@ -298,7 +298,7 @@ void grids::run(ir::module &mod) {
     unsigned current = num_threads;
     std::string nts = "nts.d" + s_ld;
     std::string mts = "mts.d" + s_ld;
-    params_.at(i).at(nts)->set_value(clamp(size / num_threads, 1, 1));
+    params_.at(i).at(nts)->set_value(clamp(size / num_threads, 1, 8));
     params_.at(i).at(mts)->set_value(clamp(current, 1, shapes[ld] / params_.at(i).at(nts)->get_value()));
     current = current / params_.at(i).at(mts)->get_value();
     for(size_t d = 1; d < shapes.size(); d++){
diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc
index 271de7640..f452cc384 100644
--- a/lib/codegen/selection.cc
+++ b/lib/codegen/selection.cc
@@ -3,7 +3,7 @@
 #include "triton/codegen/analysis/grid.h"
 #include "triton/codegen/analysis/memalloc.h"
 #include "triton/codegen/analysis/align.h"
-#include "triton/codegen/transform/reorder.h"
+#include "triton/codegen/transform/coalesce.h"
 #include "triton/ir/context.h"
 #include "triton/ir/module.h"
 #include "triton/ir/function.h"
diff --git a/lib/codegen/transform/coalesce.cc b/lib/codegen/transform/coalesce.cc
new file mode 100644
index 000000000..29a87129c
--- /dev/null
+++ b/lib/codegen/transform/coalesce.cc
@@ -0,0 +1,110 @@
+#include
+#include
+#include
+#include "triton/ir/function.h"
+#include "triton/ir/cfg.h"
+#include "triton/ir/basic_block.h"
+#include "triton/ir/instructions.h"
+#include "triton/ir/module.h"
+#include "triton/codegen/analysis/meminfo.h"
+#include "triton/codegen/analysis/align.h"
+#include "triton/codegen/transform/coalesce.h"
+
+namespace triton {
+namespace codegen{
+namespace transform{
+
+coalesce::coalesce(analysis::align* align, analysis::meminfo *mem)
+  : align_(align), mem_(mem) { }
+
+std::vector<unsigned> coalesce::get_order(ir::value* v) {
+  return order_.at(v);
+}
+
+void coalesce::run(ir::module &mod) {
+
+  std::set<ir::io_inst*> io;
+
+  std::function<void(ir::value*)> set_order = [&](ir::value
*v) -> void { + if(order_.find(v) != order_.end()) + return; + order_[v] = {}; + if(ir::user* u = dynamic_cast(v)) + for(ir::value* op: u->ops()) + set_order(op); + ir::type* ty = v->get_type(); + if(!ty->is_tile_ty()) + return; + std::vector order(ty->get_tile_shapes().size()); + std::iota(order.begin(), order.end(), 0); + order_[v] = order; + }; + + // initialize work-list + for(ir::function *fn: mod.get_function_list()) + for(ir::basic_block *block: ir::cfg::reverse_post_order(fn)) + for(ir::instruction *i: block->get_inst_list()){ + if(auto *x = dynamic_cast(i)) { + ir::type* ptr_ty = x->get_pointer_operand()->get_type(); + if(ptr_ty->is_tile_ty()) + io.insert(x); + } + set_order(i); + } + +// ir::builder &builder = mod.get_builder(); +// std::set seen; +// for(ir::io_inst *i: io) { +// ir::value *ptr = i->get_pointer_operand(); +// auto max_contiguous = align_->get_max_contiguous_vec(ptr); +// std::vector order(max_contiguous.size()); +// std::iota(order.begin(), order.end(), 0); +// std::sort(order.begin(), order.end(), [&](unsigned a, unsigned b) { return max_contiguous[a] > max_contiguous[b]; } ); +// std::list work_list; +// if(order != order_[i]) +// work_list.push_back(i); +// // rematerialize recursively +// while(!work_list.empty()) { +// ir::instruction* current = work_list.back(); +// order_[current] = order; +// work_list.pop_back(); +// for(ir::value *op: current->ops()) { +// ir::instruction* i_op = dynamic_cast(op); +// if(!seen.insert(op).second) +// continue; +// if(!i_op) +// continue; +// ir::type *ty = i_op->get_type(); +// if(!ty->is_tile_ty()) +// continue; +// auto& inst_list = i_op->get_parent()->get_inst_list(); +// auto it = std::find(inst_list.begin(), inst_list.end(), i_op); +// it++; +// builder.set_insert_point(it); +// // found a load; write to shared memory and stop recursion +// ir::instruction *n_op = nullptr; +// if(mem_->is_shared(i_op)){ +// continue; +// } +// if(auto* ld = dynamic_cast(i_op)) { +// n_op = ir::copy_to_shared_inst::create(ld); +// } +// // not a load; rematerialize and recurse +// else { +// n_op = i_op->clone(); +// work_list.push_back(n_op); +// } +// n_op = builder.insert(n_op); +// order_[n_op] = order; +// align_->copy(n_op, i_op); +// current->replace_uses_of_with(i_op, n_op); +// } +// } + +// } +} + + +} +} +} diff --git a/lib/codegen/transform/reorder.cc b/lib/codegen/transform/reorder.cc deleted file mode 100644 index 875faaab1..000000000 --- a/lib/codegen/transform/reorder.cc +++ /dev/null @@ -1,106 +0,0 @@ -#include -#include -#include -#include "triton/ir/function.h" -#include "triton/ir/cfg.h" -#include "triton/ir/basic_block.h" -#include "triton/ir/instructions.h" -#include "triton/ir/module.h" -#include "triton/codegen/analysis/meminfo.h" -#include "triton/codegen/analysis/align.h" -#include "triton/codegen/transform/reorder.h" - -namespace triton { -namespace codegen{ -namespace transform{ - -reorder::reorder(analysis::align* align, analysis::meminfo *mem) - : align_(align), mem_(mem) { } - -std::vector reorder::get_order(ir::value* v) { - return order_.at(v); -} - -void reorder::run(ir::module &mod) { - - std::set io; - - std::function set_order = [&](ir::value *v) -> void { - if(order_.find(v) != order_.end()) - return; - if(ir::user* u = dynamic_cast(v)) - for(ir::value* op: u->ops()) - set_order(op); - ir::type* ty = v->get_type(); - if(!ty->is_tile_ty()) - return; - std::vector order(ty->get_tile_shapes().size()); - std::iota(order.begin(), order.end(), 0); - order_[v] = order; - }; - - // initialize work-list 
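// (annotation, not part of the deleted file: the worklist below drives the
// rematerialization pass — every io instruction whose contiguity-sorted axis
// order disagrees with the default order is queued, its tile-typed operands
// are re-cloned with the new order, and the recursion stops at loads, which
// are copied to shared memory instead. The same algorithm lives on in
// coalesce.cc above.)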
- for(ir::function *fn: mod.get_function_list()) - for(ir::basic_block *block: ir::cfg::reverse_post_order(fn)) - for(ir::instruction *i: block->get_inst_list()){ - if(auto *x = dynamic_cast(i)) { - ir::type* ptr_ty = x->get_pointer_operand()->get_type(); - if(ptr_ty->is_tile_ty()) - io.insert(x); - } - set_order(i); - } - - ir::builder &builder = mod.get_builder(); - for(ir::io_inst *i: io) { - ir::value *ptr = i->get_pointer_operand(); - auto max_contiguous = align_->get_max_contiguous_vec(ptr); - std::vector order(max_contiguous.size()); - std::iota(order.begin(), order.end(), 0); - std::sort(order.begin(), order.end(), [&](unsigned a, unsigned b) { return max_contiguous[a] > max_contiguous[b]; } ); - std::list work_list; - if(order != order_[i]) - work_list.push_back(i); - // rematerialize recursively - while(!work_list.empty()) { - ir::instruction* current = work_list.back(); - order_[current] = order; - work_list.pop_back(); - for(ir::value *op: current->ops()) { - ir::instruction* i_op = dynamic_cast(op); - if(!i_op) - continue; - ir::type *ty = i_op->get_type(); - if(!ty->is_tile_ty()) - continue; - auto& inst_list = i_op->get_parent()->get_inst_list(); - auto it = std::find(inst_list.begin(), inst_list.end(), i_op); - it++; - builder.set_insert_point(it); - // found a load; write to shared memory and stop recursion - ir::instruction *n_op = nullptr; - if(mem_->is_shared(i_op)){ - continue; - } - if(auto* ld = dynamic_cast(i_op)) { - n_op = ir::copy_to_shared_inst::create(ld); - } - // not a load; rematerialize and recurse - else { - n_op = i_op->clone(); - work_list.push_back(n_op); - } - n_op = builder.insert(n_op); - order_[n_op] = order; - align_->copy(n_op, i_op); - current->replace_uses_of_with(i_op, n_op); - } - } - - } -} - - -} -} -} diff --git a/lib/driver/module.cc b/lib/driver/module.cc index 85877f911..0bf85c84f 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -241,7 +241,6 @@ std::string cu_module::compile_llvm_module(std::unique_ptr module, cu_module::cu_module(driver::context * context, std::unique_ptr ll_module): cu_module(context, compile_llvm_module(std::move(ll_module), context->device())) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ - std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 05c39a451..ead9a9ab4 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -5,7 +5,7 @@ #include #include "triton/codegen/selection.h" #include "triton/runtime/function.h" -#include "triton/codegen/transform/reorder.h" +#include "triton/codegen/transform/coalesce.h" #include "triton/lang/cpp.h" #include "triton/lang/parser.h" #include "triton/lang/code_gen.h" @@ -197,7 +197,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::analysis::meminfo shmem_info; codegen::analysis::liveness shmem_liveness(&shmem_info); codegen::analysis::align alignment_info; - codegen::transform::reorder reorder(&alignment_info, &shmem_info); + codegen::transform::coalesce reorder(&alignment_info, &shmem_info); codegen::analysis::grids grids(opt.num_warps, &reorder); codegen::analysis::memalloc shmem_allocation(&shmem_liveness, &shmem_info, &grids); codegen::transform::membar shmem_barriers(&shmem_allocation, &shmem_info); @@ -215,7 +215,7 @@ 
std::unique_ptr function::make_bin(ir::module &module, driver::c // ir::print(module, std::cout); reorder.run(module); dce.run(module); - ir::print(module, std::cout); +// ir::print(module, std::cout); grids.run(module); reassociate.run(module); dce.run(module); @@ -231,7 +231,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c dce.run(module); vectorize.run(module); dce.run(module); - ir::print(module, std::cout); +// ir::print(module, std::cout); // generate llvm code llvm::LLVMContext ctx; std::unique_ptr llvm(new llvm::Module(module.get_name(), ctx)); From 0dc7313e3b61912de50a3750a704e3062adc5368 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 12 Sep 2019 22:46:03 -0400 Subject: [PATCH 397/494] fixup --- tests/unit/dot.cc | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/tests/unit/dot.cc b/tests/unit/dot.cc index e1b0a8bb5..e493da4d2 100644 --- a/tests/unit/dot.cc +++ b/tests/unit/dot.cc @@ -14,16 +14,6 @@ namespace drv = triton::driver; namespace rt = triton::runtime; -template -void diff(const std::vector& x, const std::vector& y){ - for(size_t i = 0; i < x.size(); i++) - if(std::isnan(x[i]) || std::abs(x[i] - y[i])/std::max(x[i], y[i]) > 1e-4){ - std::cout << i << " " << x[i] << " " << y[i] << std::endl; - exit(EXIT_FAILURE); - } - std::cout << "Pass!" << std::endl; -} - template static void cpu_ref(std::vector &c, const std::vector &a, const std::vector &b, size_t M, size_t N, size_t K){ From 3fa3b90f164fa5ba5a356e79e1ef0b005cebeef2 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 12 Sep 2019 23:02:51 -0400 Subject: [PATCH 398/494] test --- lib/runtime/function.cc | 2 +- tests/bench/dot.cc | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index ead9a9ab4..715e20500 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -231,7 +231,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c dce.run(module); vectorize.run(module); dce.run(module); -// ir::print(module, std::cout); + ir::print(module, std::cout); // generate llvm code llvm::LLVMContext ctx; std::unique_ptr llvm(new llvm::Module(module.get_name(), ctx)); diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index fc2243bfc..646614afa 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -45,10 +45,10 @@ std::vector do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, i opt.defines.push_back({"TYPE", {ty}}); opt.defines.push_back({"AT", {AT?"1":"0"}}); opt.defines.push_back({"BT", {BT?"1":"0"}}); - opt.defines.push_back({"TM", {"64", "128"}}); - opt.defines.push_back({"TN", {"64", "128"}}); + opt.defines.push_back({"TM", {"128"}}); + opt.defines.push_back({"TN", {"128"}}); opt.defines.push_back({"TK", {"8"}}); - opt.num_warps = {2, 4, 8}; + opt.num_warps = {4}; // create function rt::function function(src::dot, opt); // benchmark available libraries From 579a662e60422d3704f2555777721b7f9cf3dd0c Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 13 Sep 2019 14:17:21 -0400 Subject: [PATCH 399/494] [codegen][coalesce] more bugfixes --- include/triton/codegen/analysis/meminfo.h | 2 +- lib/codegen/analysis/grid.cc | 2 +- lib/codegen/analysis/meminfo.cc | 10 ++ lib/codegen/selection.cc | 54 +--------- lib/codegen/transform/coalesce.cc | 115 +++++++++++----------- lib/codegen/transform/dce.cc | 4 +- lib/driver/module.cc | 1 + lib/runtime/function.cc | 2 - tests/bench/dot.cc | 24 ++--- tests/common/src/dot.h | 2 +- 10 files changed, 92 insertions(+), 124 
deletions(-) diff --git a/include/triton/codegen/analysis/meminfo.h b/include/triton/codegen/analysis/meminfo.h index 1b896056f..f4ad290a6 100644 --- a/include/triton/codegen/analysis/meminfo.h +++ b/include/triton/codegen/analysis/meminfo.h @@ -26,7 +26,7 @@ public: bool is_loop_latch(ir::phi_node *phi, ir::instruction *terminator); ir::value *get_reference(ir::value *x); void replace(ir::value* before, ir::value *after); - + void copy(ir::value* y, ir::value *x); private: std::set shared_; diff --git a/lib/codegen/analysis/grid.cc b/lib/codegen/analysis/grid.cc index 43a3eb1d9..6a5169d13 100644 --- a/lib/codegen/analysis/grid.cc +++ b/lib/codegen/analysis/grid.cc @@ -298,7 +298,7 @@ void grids::run(ir::module &mod) { unsigned current = num_threads; std::string nts = "nts.d" + s_ld; std::string mts = "mts.d" + s_ld; - params_.at(i).at(nts)->set_value(clamp(size / num_threads, 1, 8)); + params_.at(i).at(nts)->set_value(clamp(size / num_threads, 1, 4)); params_.at(i).at(mts)->set_value(clamp(current, 1, shapes[ld] / params_.at(i).at(nts)->get_value())); current = current / params_.at(i).at(mts)->get_value(); for(size_t d = 1; d < shapes.size(); d++){ diff --git a/lib/codegen/analysis/meminfo.cc b/lib/codegen/analysis/meminfo.cc index d0b075603..2c66a7bc5 100644 --- a/lib/codegen/analysis/meminfo.cc +++ b/lib/codegen/analysis/meminfo.cc @@ -34,6 +34,16 @@ void meminfo::replace(ir::value* before, ir::value *after) { } } +void meminfo::copy(ir::value* y, ir::value *x) { + if(shared_.find(x) != shared_.end()) + shared_.insert(y); + if(refs_.find(x) != refs_.end()) + refs_[y] = refs_[x]; + if(double_.find(x) != double_.end()) + double_.insert(y); +} + + inline bool get_is_shared(ir::value* v) { if(dynamic_cast(v)) return true; diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index f452cc384..87a2adf58 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -556,15 +556,15 @@ inline int32_t ceil(int32_t num, int32_t div){ return (num + div - 1)/div; } -inline void to_warps(const std::vector &bs, std::vector &nw, std::vector &ws){ +inline void to_warps(const std::vector &bs, const std::vector& order, std::vector &nw, std::vector &ws){ static const size_t warp_size = 32; size_t nthreads = 1, nwarps = 1; nw.resize(bs.size()); ws.resize(bs.size()); for(size_t i = 0; i < bs.size(); ++i){ nthreads *= bs[i]; - nw[i] = ceil(nthreads, nwarps*warp_size); - nwarps *= nw[i]; + nw[order[i]] = ceil(nthreads, nwarps*warp_size); + nwarps *= nw[order[i]]; } for(size_t i = 0; i < bs.size(); ++i){ ws[i] = bs[i] / nw[i]; @@ -585,7 +585,7 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id contiguous[i] = params_->get_param(v, "nts.d" + str_i)->get_value(); block_size[i] = params_->get_param(v, "mts.d" + str_i)->get_value(); } - to_warps(block_size, n_warps, warp_size); + to_warps(block_size, order, n_warps, warp_size); std::vector thread_id_in_warp = delinearize(u_thread_id, order, warp_size, builder); std::vector warp_id = delinearize(u_warp_id, order, n_warps, builder); // Create axes @@ -711,52 +711,6 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id } } -void selection::create_grids(std::vector &grids, - std::map &references, - ir::function *fn) { - // get number of dimensions greater than 1 - auto get_tile_gt1_dim = [&](ir::value *v){ - unsigned result = 0; - for(auto shape: v->get_type()->get_tile_shapes()) { - result += (shape > 1)? 
shape : 0; - } - return result; - }; - // bind references - std::set seen; - std::function bind_references = [&](ir::value *v) - { - // skip - if(!v->get_type()->is_tile_ty() || !seen.insert(v).second) - return; - // recurse - if(auto *user = dynamic_cast(v)) - for(ir::value *op: user->ops()) - bind_references(op); - // bind - const auto& shapes = v->get_type()->get_tile_shapes(); - if(buffer_info_->is_shared(v)) - return; - for(size_t d = 0; d < shapes.size(); d++){ - if(shapes[d] == 1) - continue; - unsigned x = params_->get_param_group(v, d); - ir::value *&r = references[x]; - if(!r || get_tile_gt1_dim(v) > get_tile_gt1_dim(r)) - r = v; - } - }; - - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *i: block->get_inst_list()) - bind_references(i); - - // create grid - for(auto &ref: references) - if(std::find(grids.begin(), grids.end(), ref.second) == grids.end()) - grids.push_back(ref.second); -} - bool static inline has_phi_user(ir::value *v) { for(ir::user *usr: v->get_users()){ if(dynamic_cast(usr)) diff --git a/lib/codegen/transform/coalesce.cc b/lib/codegen/transform/coalesce.cc index 29a87129c..b9fbbb534 100644 --- a/lib/codegen/transform/coalesce.cc +++ b/lib/codegen/transform/coalesce.cc @@ -28,16 +28,17 @@ void coalesce::run(ir::module &mod) { std::function set_order = [&](ir::value *v) -> void { if(order_.find(v) != order_.end()) return; - order_[v] = {}; + ir::type *tile_ty = v->get_type(); + if(auto *x = dynamic_cast(v)) + tile_ty = x->get_operand(0)->get_type(); + if(!tile_ty->is_tile_ty()) + return; + std::vector order(tile_ty->get_tile_shapes().size()); + std::iota(order.begin(), order.end(), 0); + order_[v] = order; if(ir::user* u = dynamic_cast(v)) for(ir::value* op: u->ops()) set_order(op); - ir::type* ty = v->get_type(); - if(!ty->is_tile_ty()) - return; - std::vector order(ty->get_tile_shapes().size()); - std::iota(order.begin(), order.end(), 0); - order_[v] = order; }; // initialize work-list @@ -52,56 +53,58 @@ void coalesce::run(ir::module &mod) { set_order(i); } -// ir::builder &builder = mod.get_builder(); -// std::set seen; -// for(ir::io_inst *i: io) { -// ir::value *ptr = i->get_pointer_operand(); -// auto max_contiguous = align_->get_max_contiguous_vec(ptr); -// std::vector order(max_contiguous.size()); -// std::iota(order.begin(), order.end(), 0); -// std::sort(order.begin(), order.end(), [&](unsigned a, unsigned b) { return max_contiguous[a] > max_contiguous[b]; } ); -// std::list work_list; -// if(order != order_[i]) -// work_list.push_back(i); -// // rematerialize recursively -// while(!work_list.empty()) { -// ir::instruction* current = work_list.back(); -// order_[current] = order; -// work_list.pop_back(); -// for(ir::value *op: current->ops()) { -// ir::instruction* i_op = dynamic_cast(op); -// if(!seen.insert(op).second) -// continue; -// if(!i_op) -// continue; -// ir::type *ty = i_op->get_type(); -// if(!ty->is_tile_ty()) -// continue; -// auto& inst_list = i_op->get_parent()->get_inst_list(); -// auto it = std::find(inst_list.begin(), inst_list.end(), i_op); -// it++; -// builder.set_insert_point(it); -// // found a load; write to shared memory and stop recursion -// ir::instruction *n_op = nullptr; -// if(mem_->is_shared(i_op)){ -// continue; -// } -// if(auto* ld = dynamic_cast(i_op)) { -// n_op = ir::copy_to_shared_inst::create(ld); -// } -// // not a load; rematerialize and recurse -// else { -// n_op = i_op->clone(); -// work_list.push_back(n_op); -// } -// n_op = builder.insert(n_op); -// order_[n_op] = order; -// 
align_->copy(n_op, i_op);
-//      current->replace_uses_of_with(i_op, n_op);
-//    }
-//  }
+  ir::builder &builder = mod.get_builder();
+  std::map<ir::instruction*, ir::instruction*> replaced;
+  for(ir::io_inst *i: io) {
+    ir::value *ptr = i->get_pointer_operand();
+    auto max_contiguous = align_->get_max_contiguous_vec(ptr);
+    std::vector<unsigned> order(max_contiguous.size());
+    std::iota(order.begin(), order.end(), 0);
+    std::sort(order.begin(), order.end(), [&](unsigned a, unsigned b) { return max_contiguous[a] > max_contiguous[b]; } );
+    std::list<ir::instruction*> work_list;
+    if(order != order_[i])
+      work_list.push_back(i);
+    // rematerialize recursively
+    while(!work_list.empty()) {
+      ir::instruction* current = work_list.back();
+      order_[current] = order;
+      work_list.pop_back();
+      for(ir::value *op: current->ops()) {
+        ir::instruction* i_op = dynamic_cast<ir::instruction*>(op);
+        if(replaced.find(i_op) != replaced.end()){
+          current->replace_uses_of_with(i_op, replaced.at(i_op));
+          continue;
+        }
+        if(!i_op)
+          continue;
+        ir::type *ty = i_op->get_type();
+        if(!ty->is_tile_ty())
+          continue;
+        auto& inst_list = i_op->get_parent()->get_inst_list();
+        auto it = std::find(inst_list.begin(), inst_list.end(), i_op);
+        it++;
+        builder.set_insert_point(it);
+        // found a load; write to shared memory and stop recursion
+        ir::instruction *n_op = nullptr;
+        if(mem_->is_shared(i_op))
+          continue;
+        if(auto* ld = dynamic_cast<ir::load_inst*>(i_op))
+          n_op = ir::copy_to_shared_inst::create(ld);
+        // not a load; rematerialize and recurse
+        else {
+          n_op = i_op->clone();
+          work_list.push_back(n_op);
+        }
+        n_op = builder.insert(n_op);
+        replaced.insert({i_op, n_op});
+        order_[n_op] = order;
+        align_->copy(n_op, i_op);
+//        mem_->copy(n_op, i_op);
+        current->replace_uses_of_with(i_op, n_op);
+      }
+    }
 
-// }
+  }
 }
diff --git a/lib/codegen/transform/dce.cc b/lib/codegen/transform/dce.cc
index 404eaa521..a1b5880c5 100644
--- a/lib/codegen/transform/dce.cc
+++ b/lib/codegen/transform/dce.cc
@@ -3,6 +3,7 @@
 #include "triton/ir/module.h"
 #include "triton/ir/cfg.h"
 #include "triton/codegen/transform/dce.h"
+#include
 
 namespace triton {
 namespace codegen{
@@ -35,9 +36,10 @@ void dce::run(ir::module &mod) {
     work_list.pop_back();
     // mark instruction operands
     for(ir::value* op: current->ops()) {
-      if(auto *i = dynamic_cast<ir::instruction*>(op))
+      if(auto *i = dynamic_cast<ir::instruction*>(op)){
         if(marked.insert(i).second)
           work_list.push_back(i);
+      }
     }
     // TODO: mark last instruction of current's reverse-dominance frontier
   }
diff --git a/lib/driver/module.cc b/lib/driver/module.cc
index 0bf85c84f..1dcf4d738 100755
--- a/lib/driver/module.cc
+++ b/lib/driver/module.cc
@@ -241,6 +241,7 @@ std::string cu_module::compile_llvm_module(std::unique_ptr<llvm::Module> module,
 cu_module::cu_module(driver::context * context, std::unique_ptr<llvm::Module> ll_module): cu_module(context, compile_llvm_module(std::move(ll_module), context->device())) { }
 
 cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){
+//  std::cout << source_ << std::endl;
   cu_context::context_switcher ctx_switch(*context);
   // JIT compile source-code
   CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER};
diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc
index 715e20500..6554ab4d6 100644
--- a/lib/runtime/function.cc
+++ b/lib/runtime/function.cc
@@ -212,10 +212,8 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c
   alignment_info.run(module);
   if(target->is_gpu())
     shmem_info.run(module);
-//  ir::print(module, std::cout);
   reorder.run(module);
   dce.run(module);
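// (annotation, not part of the hunk: ordering matters in this pipeline —
// the coalesce/reorder pass rewrites tile layouts and rematerializes clones,
// dce sweeps the dead originals, and only then does grids derive tiling
// parameters from the resulting layouts.)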
grids.run(module); reassociate.run(module); dce.run(module); diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index 646614afa..9a0cd9ca7 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -48,23 +48,23 @@ std::vector do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, i opt.defines.push_back({"TM", {"128"}}); opt.defines.push_back({"TN", {"128"}}); opt.defines.push_back({"TK", {"8"}}); - opt.num_warps = {4}; + opt.num_warps = {8}; // create function rt::function function(src::dot, opt); // benchmark available libraries std::vector result; auto tflops = [&](double nanosec) { return 2.*M*N*K / nanosec * 1e-3; }; - // cublas - if(cublas::cublasinit()){ - NumericT alpha(static_cast(1)); - NumericT beta(static_cast(0)); - cublasGemmAlgo_t fastest; - cublasGemm(cuty, stream, AT, BT, M, N, K, &alpha, &*da, lda, &*db, ldb, &beta, &*dc, ldc, &fastest); - double cublas_ms = triton::tools::bench([&]() { cublasGemm(cuty, stream, AT, BT, M, N, K, - &alpha, &*da, lda, &*db, ldb, &beta, &*dc, - ldc, nullptr, fastest); }, stream); - result.push_back(tflops(cublas_ms)); - } +// // cublas +// if(cublas::cublasinit()){ +// NumericT alpha(static_cast(1)); +// NumericT beta(static_cast(0)); +// cublasGemmAlgo_t fastest; +// cublasGemm(cuty, stream, AT, BT, M, N, K, &alpha, &*da, lda, &*db, ldb, &beta, &*dc, ldc, &fastest); +// double cublas_ms = triton::tools::bench([&]() { cublasGemm(cuty, stream, AT, BT, M, N, K, +// &alpha, &*da, lda, &*db, ldb, &beta, &*dc, +// ldc, nullptr, fastest); }, stream); +// result.push_back(tflops(cublas_ms)); +// } // triton double triton_ms = triton::tools::bench([&]() { function({&*da, &*db, &*dc, M, N, K, lda, ldb, ldc}, grid2d(M, N), stream);}, stream); result.push_back(tflops(triton_ms)); diff --git a/tests/common/src/dot.h b/tests/common/src/dot.h index c9b3454d7..7511eda9a 100644 --- a/tests/common/src/dot.h +++ b/tests/common/src/dot.h @@ -64,7 +64,7 @@ void dot(TYPE * A, TYPE * B, TYPE * C, // epilogue int rxc[TM] = ridx * TM + 0 ... TM; int ryc[TN] = ridy * TN + 0 ... 
TN; - TYPE* pc[TM, TN] = C + ryc[newaxis, :] * ldc + rxc[:, newaxis]; + TYPE* pc[TM, TN] = C + ryc[newaxis, :] + rxc[:, newaxis] * ldc; *pc = c; } )"; From eae02b99e5a582fc152db21248ff7e5a7b863dc4 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 13 Sep 2019 19:16:04 -0400 Subject: [PATCH 400/494] [codegen][coalesce] fixed stale users in cloned instructions --- include/triton/codegen/analysis/grid.h | 2 +- lib/codegen/analysis/grid.cc | 5 +++++ lib/codegen/selection.cc | 3 ++- lib/codegen/transform/coalesce.cc | 24 ++++++++++++++---------- lib/ir/value.cc | 2 +- 5 files changed, 23 insertions(+), 13 deletions(-) diff --git a/include/triton/codegen/analysis/grid.h b/include/triton/codegen/analysis/grid.h index 1eb352b00..25c5d24a4 100644 --- a/include/triton/codegen/analysis/grid.h +++ b/include/triton/codegen/analysis/grid.h @@ -49,7 +49,7 @@ public: grids(size_t num_warps, transform::coalesce* reorder); ir::metaparameter* get_param(ir::value *value, const std::string &key) { return params_[value][key]; } unsigned get_param_group(ir::value *value, unsigned ax); - fragment_t get_fragment(ir::value *value, unsigned ax) { return fragments_.at({value, ax}); } + fragment_t get_fragment(ir::value *value, unsigned ax); void copy(ir::value *dst, ir::value *src); void run(ir::module &mod); unsigned get_num_threads(); diff --git a/lib/codegen/analysis/grid.cc b/lib/codegen/analysis/grid.cc index 6a5169d13..aa87c9480 100644 --- a/lib/codegen/analysis/grid.cc +++ b/lib/codegen/analysis/grid.cc @@ -182,6 +182,11 @@ unsigned grids::get_param_group(ir::value *value, unsigned ax) { return result; } +grids::fragment_t grids::get_fragment(ir::value *value, unsigned ax) { + return fragments_.at({value, ax}); +} + + //TODO: This shouldn't exist! void grids::copy(ir::value *dst, ir::value *src) { params_[dst] = params_[src]; diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index 87a2adf58..5b137c148 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -723,8 +723,9 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, if(!v->get_type()->is_tile_ty() || !seen.insert(v).second) return; if(auto *user = dynamic_cast(v)) - for(ir::value *op: user->ops()) + for(ir::value *op: user->ops()){ create_tile(op, builder, seen, sh_mem_ptr); + } LLVMContext &ctx = builder.getContext(); auto shapes = v->get_type()->get_tile_shapes(); unsigned pad = alloc_->is_ld_padded(v); diff --git a/lib/codegen/transform/coalesce.cc b/lib/codegen/transform/coalesce.cc index b9fbbb534..0ba534531 100644 --- a/lib/codegen/transform/coalesce.cc +++ b/lib/codegen/transform/coalesce.cc @@ -61,18 +61,20 @@ void coalesce::run(ir::module &mod) { std::vector order(max_contiguous.size()); std::iota(order.begin(), order.end(), 0); std::sort(order.begin(), order.end(), [&](unsigned a, unsigned b) { return max_contiguous[a] > max_contiguous[b]; } ); - std::list work_list; + std::list> work_list; if(order != order_[i]) - work_list.push_back(i); + work_list.push_back({i, nullptr}); // rematerialize recursively while(!work_list.empty()) { - ir::instruction* current = work_list.back(); - order_[current] = order; + auto pair = work_list.back(); + ir::instruction* cloned = pair.first; + ir::instruction* original = pair.second; + order_[cloned] = order; work_list.pop_back(); - for(ir::value *op: current->ops()) { + for(ir::value *op: cloned->ops()) { ir::instruction* i_op = dynamic_cast(op); if(replaced.find(i_op) != replaced.end()){ - current->replace_uses_of_with(i_op, replaced.at(i_op)); + 
cloned->replace_uses_of_with(i_op, replaced.at(i_op)); continue; } if(!i_op) @@ -90,17 +92,19 @@ void coalesce::run(ir::module &mod) { continue; if(auto* ld = dynamic_cast(i_op)) n_op = ir::copy_to_shared_inst::create(ld); - // not a load; rematerialize and recurse + // not a load; rematerialize and add to worklist else { n_op = i_op->clone(); - work_list.push_back(n_op); + work_list.push_back({n_op, i_op}); } n_op = builder.insert(n_op); replaced.insert({i_op, n_op}); order_[n_op] = order; align_->copy(n_op, i_op); -// mem_->copy(n_op, i_op); - current->replace_uses_of_with(i_op, n_op); + mem_->copy(n_op, i_op); + if(original) + n_op->erase_use(original); + cloned->replace_uses_of_with(i_op, n_op); } } diff --git a/lib/ir/value.cc b/lib/ir/value.cc index 3ab64b97a..5dfb0460c 100644 --- a/lib/ir/value.cc +++ b/lib/ir/value.cc @@ -66,7 +66,7 @@ void user::replace_uses_of_with(value *before, value *after) { if(ops_[i] == before) ops_[i] = after; after->add_use(this); - erase_use(this); + before->erase_use(this); } } From 8ae779206f473d723cd1615143450884cd6a96bb Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 14 Sep 2019 02:36:11 -0400 Subject: [PATCH 401/494] more fixes --- lib/codegen/analysis/liveness.cc | 1 + lib/codegen/analysis/meminfo.cc | 4 ++++ lib/codegen/transform/coalesce.cc | 4 +++- lib/runtime/function.cc | 2 +- tests/bench/dot.cc | 6 +++--- 5 files changed, 12 insertions(+), 5 deletions(-) diff --git a/lib/codegen/analysis/liveness.cc b/lib/codegen/analysis/liveness.cc index 8801235b5..088691263 100644 --- a/lib/codegen/analysis/liveness.cc +++ b/lib/codegen/analysis/liveness.cc @@ -1,3 +1,4 @@ +#include #include "triton/codegen/analysis/liveness.h" #include "triton/codegen/analysis/meminfo.h" #include "triton/ir/basic_block.h" diff --git a/lib/codegen/analysis/meminfo.cc b/lib/codegen/analysis/meminfo.cc index 2c66a7bc5..314c272c0 100644 --- a/lib/codegen/analysis/meminfo.cc +++ b/lib/codegen/analysis/meminfo.cc @@ -82,6 +82,10 @@ void add_copy(ir::value *x, ir::builder &builder) { } void meminfo::run(ir::module &mod) { +// shared_.clear(); +// refs_.clear(); +// double_.clear(); + // Add shared copies for(ir::function *fn: mod.get_function_list()){ ir::builder builder(mod.get_context()); diff --git a/lib/codegen/transform/coalesce.cc b/lib/codegen/transform/coalesce.cc index 0ba534531..8c880d638 100644 --- a/lib/codegen/transform/coalesce.cc +++ b/lib/codegen/transform/coalesce.cc @@ -88,8 +88,10 @@ void coalesce::run(ir::module &mod) { builder.set_insert_point(it); // found a load; write to shared memory and stop recursion ir::instruction *n_op = nullptr; - if(mem_->is_shared(i_op)) + if(mem_->is_shared(i_op)){ + i_op->add_use(cloned); continue; + } if(auto* ld = dynamic_cast(i_op)) n_op = ir::copy_to_shared_inst::create(ld); // not a load; rematerialize and add to worklist diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 6554ab4d6..3dd7c1507 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -229,7 +229,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c dce.run(module); vectorize.run(module); dce.run(module); - ir::print(module, std::cout); +// ir::print(module, std::cout); // generate llvm code llvm::LLVMContext ctx; std::unique_ptr llvm(new llvm::Module(module.get_name(), ctx)); diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index 9a0cd9ca7..7f2366ecc 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -45,10 +45,10 @@ std::vector do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, i 
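// (annotation, not part of the hunk: the lines below restore the autotuning
// space — TM/TN in {64, 128}, num_warps in {2, 4, 8} — that patches 398/399
// had pinned to a single configuration while debugging the coalesce pass.)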
opt.defines.push_back({"TYPE", {ty}}); opt.defines.push_back({"AT", {AT?"1":"0"}}); opt.defines.push_back({"BT", {BT?"1":"0"}}); - opt.defines.push_back({"TM", {"128"}}); - opt.defines.push_back({"TN", {"128"}}); + opt.defines.push_back({"TM", {"64", "128"}}); + opt.defines.push_back({"TN", {"64", "128"}}); opt.defines.push_back({"TK", {"8"}}); - opt.num_warps = {8}; + opt.num_warps = {2, 4, 8}; // create function rt::function function(src::dot, opt); // benchmark available libraries From 66e32b3074e3857b68a223104a8f350e952b2e4e Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 14 Sep 2019 13:05:53 -0400 Subject: [PATCH 402/494] [codegen][grid] some cleaning --- include/triton/codegen/analysis/grid.h | 32 +++++++--- lib/codegen/analysis/grid.cc | 88 +++++++++++++------------- lib/codegen/analysis/memalloc.cc | 4 +- lib/codegen/selection.cc | 27 ++++---- lib/codegen/transform/vectorize.cc | 2 +- 5 files changed, 81 insertions(+), 72 deletions(-) diff --git a/include/triton/codegen/analysis/grid.h b/include/triton/codegen/analysis/grid.h index 25c5d24a4..467ba9fff 100644 --- a/include/triton/codegen/analysis/grid.h +++ b/include/triton/codegen/analysis/grid.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace triton{ @@ -27,6 +28,8 @@ namespace analysis{ class grids { typedef std::pair node_t; typedef std::map > graph_t; + typedef std::shared_ptr param_ptr_t; + typedef std::map> param_map_t; public: enum fragment_t{ @@ -39,7 +42,7 @@ private: void init_c_phi(ir::instruction *i); void init_c_graph(ir::instruction *v); fragment_t get_fragmentation_type(node_t x, graph_t &graph); - void connected_components(node_t x, const std::vector mps, const std::vector prefixes, std::set &nodes, graph_t &graph, unsigned group_id); + void connected_components(node_t x, const std::vector& params, const std::vector& maps, std::set &nodes, graph_t &graph, unsigned group_id); void create_grids(std::vector &grids, std::map &references, ir::function *fn); @@ -47,27 +50,36 @@ private: public: grids(size_t num_warps, transform::coalesce* reorder); - ir::metaparameter* get_param(ir::value *value, const std::string &key) { return params_[value][key]; } unsigned get_param_group(ir::value *value, unsigned ax); fragment_t get_fragment(ir::value *value, unsigned ax); void copy(ir::value *dst, ir::value *src); void run(ir::module &mod); unsigned get_num_threads(); const std::vector get_grids() const { return grids_; } + int get_mts(ir::value *value, unsigned ax); + int get_nts(ir::value *value, unsigned ax); + int get_fpw(ir::value *value, unsigned ax); + int get_wpt(ir::value *value, unsigned ax); private: - std::vector pool_; + + transform::coalesce* reorder_; + // number of warps + size_t num_warps_; + // grids + std::vector grids_; + // grid parameters + param_map_t fpw_; + param_map_t wpt_; + param_map_t mts_; + param_map_t nts_; + // constraints graph graph_t dependencies_; std::set nodes_; + // fragments std::map fragments_; - std::map static_params_; - std::map> params_; - std::map global_range_sizes_; - std::vector grids_; + // parameter groups std::map> groups_; - size_t num_warps_; - transform::coalesce* reorder_; - }; diff --git a/lib/codegen/analysis/grid.cc b/lib/codegen/analysis/grid.cc index aa87c9480..2d9b494c4 100644 --- a/lib/codegen/analysis/grid.cc +++ b/lib/codegen/analysis/grid.cc @@ -84,7 +84,6 @@ void grids::init_c_graph(ir::instruction *v) { bool is_skewed = false; for(unsigned i = 0; i < shapes.size(); i ++){ if(shapes[i] == 1){ - static_params_.insert({{v, i}, 1}); 
add_constraint({v, i}, {v, i}); } else if(!is_skewed && @@ -125,8 +124,6 @@ void grids::init_c_graph(ir::instruction *v) { for(unsigned i = 0; i < shapes.size(); i++) add_constraint({v, i}, {D, i}); for(unsigned i = 2; i < shapes.size(); i++){ - if(shapes[i] == 1) - static_params_.insert({{v, i}, 1}); add_constraint({v, i}, {A, i}); add_constraint({v, i}, {B, i}); } @@ -159,21 +156,15 @@ grids::fragment_t grids::get_fragmentation_type(node_t x, graph_t &graph){ return STRIDED_SCAN; } -void grids::connected_components(node_t x, const std::vector mps, const std::vector prefixes, std::set &nodes, graph_t &graph, unsigned group_id) { +void grids::connected_components(node_t x, const std::vector& ptr_vec, const std::vector& maps, std::set &nodes, graph_t &graph, unsigned group_id) +{ groups_[x.first].insert({x.second, group_id}); if(nodes.find(x) != nodes.end()){ nodes.erase(x); - std::string suffix = ".d" + std::to_string(x.second); - for(unsigned i = 0; i < mps.size(); i++) - params_[x.first].insert({prefixes[i] + suffix, mps[i]}); - ir::type *ty = x.first->get_type(); - if(static_params_.find(x) != static_params_.end()){ - for(ir::metaparameter *mp: mps) - mp->set_value(static_params_.at(x)); - } - for(const node_t &y: graph[x]){ - connected_components(y, mps, prefixes, nodes, graph, group_id); - } + for(unsigned i = 0; i < ptr_vec.size(); i++) + (*maps[i])[x.first][x.second] = ptr_vec[i]; + for(const node_t &y: graph[x]) + connected_components(y, ptr_vec, maps, nodes, graph, group_id); } } @@ -189,7 +180,10 @@ grids::fragment_t grids::get_fragment(ir::value *value, unsigned ax) { //TODO: This shouldn't exist! void grids::copy(ir::value *dst, ir::value *src) { - params_[dst] = params_[src]; + mts_[dst] = mts_[src]; + nts_[dst] = nts_[src]; + fpw_[dst] = fpw_[src]; + wpt_[dst] = wpt_[src]; groups_[dst] = groups_[src]; fragments_[{dst, 0}] = fragments_[{src, 0}]; } @@ -217,17 +211,16 @@ void grids::run(ir::module &mod) { for(auto x: nodes_) fragments_[x] = get_fragmentation_type(x, dependencies_); while(!nodes_.empty()) { - ir::type *ty = mod.get_builder().get_int32_ty(); node_t node = *nodes_.begin(); if(fragments_[node] == STRIDED_SCAN) { - ir::metaparameter *nts = ir::metaparameter::create(ctx, ty, 1, 1); - ir::metaparameter *mts = ir::metaparameter::create(ctx, ty, 1, 1); - connected_components(node, {nts, mts}, {"nts", "mts"}, nodes_, dependencies_, group_id++); + param_ptr_t nts(new int(-1)); + param_ptr_t mts(new int(-1)); + connected_components(node, {nts, mts}, {&nts_, &mts_}, nodes_, dependencies_, group_id++); } else { - ir::metaparameter *fpw = ir::metaparameter::create(ctx, ty, 1, 1); - ir::metaparameter *wpt = ir::metaparameter::create(ctx, ty, 1, 1); - connected_components(node, {fpw, wpt}, {"fpw", "wpt"}, nodes_, dependencies_, group_id++); + param_ptr_t fpw(new int(-1)); + param_ptr_t wpt(new int(-1)); + connected_components(node, {fpw, wpt}, {&fpw_, &wpt_}, nodes_, dependencies_, group_id++); } } } @@ -267,7 +260,7 @@ void grids::run(ir::module &mod) { }while(fpw_nm1 != fpw); // store parameters for(unsigned d = 0; d < shapes.size(); d++) - params_.at(i).at("fpw.d" + std::to_string(d))->set_value(fpw[d]); + *fpw_[i][d] = fpw[d]; /* warps per tile */ // try to make things as square as possible to maximize data re-use @@ -282,14 +275,12 @@ void grids::run(ir::module &mod) { }while(wpt_nm1 != wpt); // store parameters for(unsigned d = 0; d < shapes.size(); d++) - params_.at(i).at("wpt.d" + std::to_string(d))->set_value(wpt[d]); + *wpt_[i][d] = wpt[d]; /* sanity check */ unsigned 
effective_num_warps = 1; - for(size_t d = 0; d < shapes.size(); d++){ - std::string str_d = std::to_string(d); - effective_num_warps *= params_.at(i).at("wpt.d" + str_d)->get_value(); - } + for(size_t d = 0; d < shapes.size(); d++) + effective_num_warps *= *wpt_[i][d]; if(num_warps_ != effective_num_warps) throw std::runtime_error("cannot create a kernel with this amount of warps"); @@ -299,28 +290,20 @@ void grids::run(ir::module &mod) { /* Scan-line */ else{ unsigned ld = order[0]; - std::string s_ld = std::to_string(ld); unsigned current = num_threads; - std::string nts = "nts.d" + s_ld; - std::string mts = "mts.d" + s_ld; - params_.at(i).at(nts)->set_value(clamp(size / num_threads, 1, 4)); - params_.at(i).at(mts)->set_value(clamp(current, 1, shapes[ld] / params_.at(i).at(nts)->get_value())); - current = current / params_.at(i).at(mts)->get_value(); + *nts_[i][ld] = clamp(size / num_threads, 1, 4); + *mts_[i][ld] = clamp(current, 1, shapes[ld] / *nts_[i][ld]); + current = current / *mts_[i][ld]; for(size_t d = 1; d < shapes.size(); d++){ ld = order[d]; - s_ld = std::to_string(ld); - nts = "nts.d" + s_ld; - mts = "mts.d" + s_ld; - params_.at(i).at(nts)->set_value(1); - params_.at(i).at(mts)->set_value(clamp(current, 1, shapes[ld])); - current = current / params_.at(i).at(mts)->get_value(); + *nts_[i][ld] = 1; + *mts_[i][ld] = clamp(current, 1, shapes[ld]); + current = current / *mts_[i][ld]; } /* sanity check */ unsigned effective_num_threads = 1; - for(size_t d = 0; d < shapes.size(); d++){ - std::string str_d = std::to_string(d); - effective_num_threads *= params_.at(i).at("mts.d" + str_d)->get_value(); - } + for(size_t d = 0; d < shapes.size(); d++) + effective_num_threads *= *mts_[i][d]; if(num_threads != effective_num_threads) throw std::runtime_error("cannot create a kernel with this amount of warps"); } @@ -378,6 +361,21 @@ unsigned grids::get_num_threads() { return num_warps_*32; } +int grids::get_mts(ir::value *value, unsigned ax) { + return *mts_.at(value).at(ax); +} + +int grids::get_nts(ir::value *value, unsigned ax) { + return *nts_.at(value).at(ax); +} + +int grids::get_fpw(ir::value *value, unsigned ax) { + return *fpw_.at(value).at(ax); +} + +int grids::get_wpt(ir::value *value, unsigned ax) { + return *wpt_.at(value).at(ax); +} } } diff --git a/lib/codegen/analysis/memalloc.cc b/lib/codegen/analysis/memalloc.cc index 5f8a4d70b..be81b68e2 100644 --- a/lib/codegen/analysis/memalloc.cc +++ b/lib/codegen/analysis/memalloc.cc @@ -57,9 +57,9 @@ unsigned memalloc::get_num_bytes(ir::value *x) { num_elements *= x; size_t depth; if(params_->get_fragment(x, 0) == grids::HMMA_FRAGMENT_C) - depth = params_->get_param(op, "wpt.d" + std::to_string(axis))->get_value(); + depth = params_->get_wpt(op, axis); else - depth = params_->get_param(op, "mts.d" + std::to_string(axis))->get_value(); + depth = params_->get_mts(op, axis); return num_elements * num_bytes * depth; } unsigned num_bytes = x->get_type()->get_primitive_size_in_bits() / 8; diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index 5b137c148..8b6588386 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -581,9 +581,8 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id std::vector warp_size(dim); std::vector n_warps(dim); for(unsigned i = 0; i < shapes.size(); i++){ - std::string str_i = std::to_string(i); - contiguous[i] = params_->get_param(v, "nts.d" + str_i)->get_value(); - block_size[i] = params_->get_param(v, "mts.d" + str_i)->get_value(); + contiguous[i] 
= params_->get_nts(v, i); + block_size[i] = params_->get_mts(v, i); } to_warps(block_size, order, n_warps, warp_size); std::vector thread_id_in_warp = delinearize(u_thread_id, order, warp_size, builder); @@ -617,13 +616,13 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id Value *_16 = builder.getInt32(16); // fragments per warp - unsigned fpw_0 = params_->get_param(v, "fpw.d0")->get_value(); - unsigned fpw_1 = params_->get_param(v, "fpw.d1")->get_value(); - unsigned fpw_2 = is_batched ? params_->get_param(v, "fpw.d2")->get_value() : 1; + unsigned fpw_0 = params_->get_fpw(v, 0); + unsigned fpw_1 = params_->get_fpw(v, 1); + unsigned fpw_2 = is_batched ? params_->get_fpw(v, 2) : 1; // warps per tile - unsigned wpt_0 = params_->get_param(v, "wpt.d0")->get_value(); - unsigned wpt_1 = params_->get_param(v, "wpt.d1")->get_value(); - unsigned wpt_2 = is_batched ? params_->get_param(v, "wpt.d2")->get_value() : 1; + unsigned wpt_0 = params_->get_wpt(v, 0); + unsigned wpt_1 = params_->get_wpt(v, 1); + unsigned wpt_2 = is_batched ? params_->get_wpt(v, 2) : 1; // hmma warp tile size unsigned hmma_wts_0 = fpw_0 * 8; unsigned hmma_wts_1 = fpw_1 * 8; @@ -909,7 +908,7 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, tgt_->add_barrier(module, builder); builder.CreateStore(result, write_ptr); // build result - unsigned depth = params_->get_param(op, "wpt.d" + std::to_string(axis))->get_value(); + unsigned depth = params_->get_wpt(op, axis); for(unsigned i = depth/2; i > 0; i >>= 1){ // current indices indices_t current(write_idx.size(), builder.getInt32(0)); @@ -1076,12 +1075,12 @@ void selection::lower_hmma_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn "{$10, $11}, " "{$0, $1, $2, $3, $4, $5, $6, $7};", "=f,=f,=f,=f,=f,=f,=f,=f,r,r,r,r,0,1,2,3,4,5,6,7", false); - unsigned fpw_0 = params_->get_param(dot, "fpw.d0")->get_value(); - unsigned fpw_1 = params_->get_param(dot, "fpw.d1")->get_value(); + unsigned fpw_0 = params_->get_fpw(dot, 0); + unsigned fpw_1 = params_->get_fpw(dot, 1); unsigned wts_0 = fpw_0 * 8; unsigned wts_1 = fpw_1 * 8; - unsigned wpt_0 = params_->get_param(dot, "wpt.d0")->get_value(); - unsigned wpt_1 = params_->get_param(dot, "wpt.d1")->get_value(); + unsigned wpt_0 = params_->get_wpt(dot, 0); + unsigned wpt_1 = params_->get_wpt(dot, 1); unsigned stride_rep_i = wpt_0 * wts_0; unsigned stride_rep_j = wpt_1 * wts_1; unsigned num_rep_i = shapes[0] / stride_rep_i; diff --git a/lib/codegen/transform/vectorize.cc b/lib/codegen/transform/vectorize.cc index 16309ffc5..e7e329c02 100644 --- a/lib/codegen/transform/vectorize.cc +++ b/lib/codegen/transform/vectorize.cc @@ -27,7 +27,7 @@ void vectorize::run(ir::module &mod) { } if(dynamic_cast(i)){ ir::value *x = i->get_operand(0); - if(params_->get_param(x, "nts.d0")->get_value() == 1) + if(params_->get_nts(x, 0) == 1) continue; builder.set_insert_point(i); ir::instruction *rx = (ir::instruction*)builder.create_vectorize(x); From 0d8f59dcec44b95adf3cc5bbc22fe4352d40cb91 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 14 Sep 2019 16:04:06 -0400 Subject: [PATCH 403/494] [codegen][selection] some cleaning --- include/triton/codegen/analysis/grid.h | 11 +- include/triton/codegen/selection.h | 9 +- lib/codegen/analysis/grid.cc | 31 +- lib/codegen/analysis/memalloc.cc | 4 +- lib/codegen/selection.cc | 450 +++++++++++++------------ lib/codegen/transform/vectorize.cc | 2 +- lib/runtime/function.cc | 2 +- 7 files changed, 255 insertions(+), 254 deletions(-) diff --git 
a/include/triton/codegen/analysis/grid.h b/include/triton/codegen/analysis/grid.h index 467ba9fff..50a8c578a 100644 --- a/include/triton/codegen/analysis/grid.h +++ b/include/triton/codegen/analysis/grid.h @@ -50,16 +50,15 @@ private: public: grids(size_t num_warps, transform::coalesce* reorder); - unsigned get_param_group(ir::value *value, unsigned ax); fragment_t get_fragment(ir::value *value, unsigned ax); void copy(ir::value *dst, ir::value *src); void run(ir::module &mod); - unsigned get_num_threads(); + unsigned get_param_group(ir::value *value, unsigned ax); const std::vector get_grids() const { return grids_; } - int get_mts(ir::value *value, unsigned ax); - int get_nts(ir::value *value, unsigned ax); - int get_fpw(ir::value *value, unsigned ax); - int get_wpt(ir::value *value, unsigned ax); + int mts(ir::value *value, unsigned ax); + int nts(ir::value *value, unsigned ax); + int fpw(ir::value *value, unsigned ax); + int wpt(ir::value *value, unsigned ax); private: diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index 3efe0a256..b4d2e3344 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -157,7 +157,11 @@ private: void create_grids(std::vector &grids, std::map &references, ir::function *fn); + void create_shared_tile(ir::value *v, Builder &builder, Value *sh_mem_ptr); + void create_distributed_tile(ir::value *v, Builder &builder); void create_tile(ir::value *v, Builder &builder, std::set &seen, Value *sh_mem_ptr); + void init_strided_scan_axes(ir::value *i, Builder &builder, Value *u_thread_id, Value *u_warp_id); + void init_hmma_axes(ir::value *i, Builder &builder, Value *u_thread_id, Value *u_warp_id); void init_axes(ir::value *i, Builder &builder, Value *u_thread_id, Value *u_warp_id); void init_grids(ir::function *fn, Builder &builder, Value *sh_mem_ptr); @@ -195,8 +199,8 @@ private: public: - selection(analysis::memalloc *alloc, analysis::grids *params, analysis::meminfo *buffer_info, analysis::align *alignment, transform::coalesce* reorder, target *tgt) - : alloc_(alloc), params_(params), buffer_info_(buffer_info), alignment_(alignment), reorder_(reorder), tgt_(tgt){ } + selection(analysis::memalloc *alloc, analysis::grids *params, analysis::meminfo *buffer_info, analysis::align *alignment, transform::coalesce* reorder, target *tgt, unsigned num_warps) + : alloc_(alloc), params_(params), buffer_info_(buffer_info), alignment_(alignment), reorder_(reorder), tgt_(tgt), num_warps_(num_warps){ } void run(ir::module &src, Module &dst); @@ -215,6 +219,7 @@ private: Value *offset_b_j_, *offset_b_k_; unsigned num_packs_0_, num_packs_1_; unsigned pack_size_0_, pack_size_1_; + unsigned num_warps_; }; } diff --git a/lib/codegen/analysis/grid.cc b/lib/codegen/analysis/grid.cc index 2d9b494c4..cf5a718cd 100644 --- a/lib/codegen/analysis/grid.cc +++ b/lib/codegen/analysis/grid.cc @@ -156,8 +156,8 @@ grids::fragment_t grids::get_fragmentation_type(node_t x, graph_t &graph){ return STRIDED_SCAN; } -void grids::connected_components(node_t x, const std::vector& ptr_vec, const std::vector& maps, std::set &nodes, graph_t &graph, unsigned group_id) -{ +void grids::connected_components(node_t x, const std::vector& ptr_vec, const std::vector& maps, + std::set &nodes, graph_t &graph, unsigned group_id) { groups_[x.first].insert({x.second, group_id}); if(nodes.find(x) != nodes.end()){ nodes.erase(x); @@ -190,22 +190,18 @@ void grids::copy(ir::value *dst, ir::value *src) { void grids::run(ir::module &mod) { - ir::context 
&ctx = mod.get_context(); - // Create metaparameters + // Create tiling parameters for(ir::function *fn: mod.get_function_list()){ - // Build constraints graph for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i : block->get_inst_list()) if(i->has_tile_result_or_op()) init_c_graph(i); - // Build phi constraints for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i : block->get_inst_list()) if(i->has_tile_result_or_op()) init_c_phi(i); - // Layout parameters unsigned group_id = 0; for(auto x: nodes_) @@ -231,7 +227,7 @@ void grids::run(ir::module &mod) { } - unsigned num_threads = get_num_threads(); + unsigned num_threads = num_warps_*32; auto clamp = [&](unsigned x, unsigned lo, unsigned hi) { return std::min(std::max(x, lo), hi); }; for(ir::value *i: grids_){ @@ -242,10 +238,8 @@ void grids::run(ir::module &mod) { unsigned size = i->get_type()->get_tile_num_elements(); /* HMMA parameters*/ if(fragments_.at({i, 0}) == HMMA_FRAGMENT_C){ - unsigned shape_0 = shapes[order[0]]; unsigned shape_1 = shapes[order[1]]; - /* fragments per warp */ // try to make things as square as possible to maximize data re-use std::vector fpw = {1, 1, 1}; @@ -261,7 +255,6 @@ void grids::run(ir::module &mod) { // store parameters for(unsigned d = 0; d < shapes.size(); d++) *fpw_[i][d] = fpw[d]; - /* warps per tile */ // try to make things as square as possible to maximize data re-use std::vector wpt = {1, 1, 1}; @@ -276,15 +269,12 @@ void grids::run(ir::module &mod) { // store parameters for(unsigned d = 0; d < shapes.size(); d++) *wpt_[i][d] = wpt[d]; - /* sanity check */ unsigned effective_num_warps = 1; for(size_t d = 0; d < shapes.size(); d++) effective_num_warps *= *wpt_[i][d]; - if(num_warps_ != effective_num_warps) throw std::runtime_error("cannot create a kernel with this amount of warps"); - } /* Scan-line */ @@ -356,24 +346,19 @@ void grids::create_grids(std::vector &grids, grids.push_back(ref.second); } - -unsigned grids::get_num_threads() { - return num_warps_*32; -} - -int grids::get_mts(ir::value *value, unsigned ax) { +int grids::mts(ir::value *value, unsigned ax) { return *mts_.at(value).at(ax); } -int grids::get_nts(ir::value *value, unsigned ax) { +int grids::nts(ir::value *value, unsigned ax) { return *nts_.at(value).at(ax); } -int grids::get_fpw(ir::value *value, unsigned ax) { +int grids::fpw(ir::value *value, unsigned ax) { return *fpw_.at(value).at(ax); } -int grids::get_wpt(ir::value *value, unsigned ax) { +int grids::wpt(ir::value *value, unsigned ax) { return *wpt_.at(value).at(ax); } diff --git a/lib/codegen/analysis/memalloc.cc b/lib/codegen/analysis/memalloc.cc index be81b68e2..631b8f663 100644 --- a/lib/codegen/analysis/memalloc.cc +++ b/lib/codegen/analysis/memalloc.cc @@ -57,9 +57,9 @@ unsigned memalloc::get_num_bytes(ir::value *x) { num_elements *= x; size_t depth; if(params_->get_fragment(x, 0) == grids::HMMA_FRAGMENT_C) - depth = params_->get_wpt(op, axis); + depth = params_->wpt(op, axis); else - depth = params_->get_mts(op, axis); + depth = params_->mts(op, axis); return num_elements * num_bytes * depth; } unsigned num_bytes = x->get_type()->get_primitive_size_in_bits() / 8; diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index 8b6588386..4cff99890 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -571,145 +571,154 @@ inline void to_warps(const std::vector &bs, const std::vector &builder, Value *u_thread_id, Value *u_warp_id) { +void selection::init_strided_scan_axes(ir::value *v, IRBuilder<> &builder, Value 
*u_thread_id, Value *u_warp_id) { auto order = reorder_->get_order(v); const auto& shapes = v->get_type()->get_tile_shapes(); size_t dim = shapes.size(); - if(params_->get_fragment(v, 0) == analysis::grids::STRIDED_SCAN){ - std::vector contiguous(dim); - std::vector block_size(dim); - std::vector warp_size(dim); - std::vector n_warps(dim); - for(unsigned i = 0; i < shapes.size(); i++){ - contiguous[i] = params_->get_nts(v, i); - block_size[i] = params_->get_mts(v, i); - } - to_warps(block_size, order, n_warps, warp_size); - std::vector thread_id_in_warp = delinearize(u_thread_id, order, warp_size, builder); - std::vector warp_id = delinearize(u_warp_id, order, n_warps, builder); - // Create axes - for(unsigned k = 0; k < dim; k++) { - std::string str_k = std::to_string(k); - Value *warp_size_k = builder.getInt32(warp_size[k]); - Value *contiguous_k = builder.getInt32(contiguous[k]); - Value *thread_id = builder.CreateAdd(thread_id_in_warp[k], builder.CreateMul(warp_id[k], warp_size_k)); - Value *scaled_thread_id = builder.CreateMul(thread_id, contiguous_k); - unsigned per_block = contiguous[k] * warp_size[k] * n_warps[k]; - unsigned per_thread = contiguous[k] * shapes[k] / per_block; - std::vector idx_list(per_thread); - for(unsigned n = 0 ; n < per_thread; n++){ - unsigned offset = n / contiguous[k] * per_block + n % contiguous[k]; - idx_list[n] = builder.CreateAdd(scaled_thread_id, builder.getInt32(offset), "idx_" + str_k + "_" + std::to_string(n)); - } - axes_[params_->get_param_group(v, k)] = distributed_axis{contiguous[k], idx_list, thread_id}; - } + std::vector contiguous(dim); + std::vector block_size(dim); + std::vector warp_size(dim); + std::vector n_warps(dim); + for(unsigned i = 0; i < shapes.size(); i++){ + contiguous[i] = params_->nts(v, i); + block_size[i] = params_->mts(v, i); } - else { - if(shapes.size() > 3) - throw std::runtime_error("unsupported"); - bool is_batched = shapes.size() >= 3; - - Value *_1 = builder.getInt32(1); - Value *_2 = builder.getInt32(2); - Value *_3 = builder.getInt32(3); - Value *_4 = builder.getInt32(4); - Value *_16 = builder.getInt32(16); - - // fragments per warp - unsigned fpw_0 = params_->get_fpw(v, 0); - unsigned fpw_1 = params_->get_fpw(v, 1); - unsigned fpw_2 = is_batched ? params_->get_fpw(v, 2) : 1; - // warps per tile - unsigned wpt_0 = params_->get_wpt(v, 0); - unsigned wpt_1 = params_->get_wpt(v, 1); - unsigned wpt_2 = is_batched ? params_->get_wpt(v, 2) : 1; - // hmma warp tile size - unsigned hmma_wts_0 = fpw_0 * 8; - unsigned hmma_wts_1 = fpw_1 * 8; - unsigned hmma_wts_2 = is_batched ? fpw_2 : 1; - // hmma block tile size - unsigned hmma_bts_0 = hmma_wts_0 * wpt_0; - unsigned hmma_bts_1 = hmma_wts_1 * wpt_1; - unsigned hmma_bts_2 = is_batched ? hmma_wts_2 * wpt_2 : 1; - // number of repetition - unsigned num_rep_0 = shapes[0] / hmma_bts_0; - unsigned num_rep_1 = shapes[1] / hmma_bts_1; - unsigned num_rep_2 = is_batched ? 
shapes[2] / hmma_bts_2 : 1; - // size of each pack (interleaving) - pack_size_0_ = std::min(num_rep_0, 1); - pack_size_1_ = std::min(num_rep_1, 1); - // number of packs (interleaving) - num_packs_0_ = num_rep_0 / pack_size_0_; - num_packs_1_ = num_rep_1 / pack_size_1_; - - /* intra warp offset */ - // offset of quad in pair - Value *in_pair_off_a = builder.CreateMul(builder.CreateUDiv(builder.CreateAnd(u_thread_id, _16), builder.getInt32(4)), - builder.getInt32(fpw_0 * pack_size_0_)); - Value *in_pair_off_b = builder.CreateMul(builder.CreateUDiv(builder.CreateAnd(u_thread_id, _16), builder.getInt32(4)), - builder.getInt32(fpw_1 * pack_size_1_)); - - // Quad pair id - Value *pair_a_id = builder.CreateUDiv(builder.CreateURem(u_thread_id, _16), _4); - Value *pair_b_id = builder.CreateUDiv(builder.CreateURem(u_thread_id, _16), _4); - pair_a_id = builder.CreateURem(pair_a_id, builder.getInt32(fpw_0)); - pair_b_id = builder.CreateUDiv(pair_b_id, builder.getInt32(fpw_0)); - pair_b_id = builder.CreateURem(pair_b_id, builder.getInt32(fpw_1)); - // Quad pair offset - Value *pair_a_off = builder.CreateMul(pair_a_id, builder.getInt32(4 * pack_size_0_)); - Value *pair_b_off = builder.CreateMul(pair_b_id, builder.getInt32(4 * pack_size_1_)); - - /* inter warp offset */ - Value *warp_id_0 = builder.CreateURem(u_warp_id, builder.getInt32(wpt_0)); - Value *warp_id_12 = builder.CreateUDiv(u_warp_id, builder.getInt32(wpt_0)); - Value *warp_id_1 = builder.CreateURem(warp_id_12, builder.getInt32(wpt_1)); - Value *warp_id_2 = builder.CreateUDiv(warp_id_12, builder.getInt32(wpt_1)); - Value *warp_offset_i = builder.CreateMul(warp_id_0, builder.getInt32(hmma_wts_0 * pack_size_0_)); - Value *warp_offset_j = builder.CreateMul(warp_id_1, builder.getInt32(hmma_wts_1 * pack_size_1_)); - - /* offsets */ - // a offset - offset_a_i_ = builder.CreateAdd(warp_offset_i, builder.CreateAdd(pair_a_off, in_pair_off_a)); - offset_a_k_ = builder.CreateAnd(u_thread_id, _3); - // b offsets - offset_b_j_ = builder.CreateAdd(warp_offset_j, builder.CreateAdd(pair_b_off, in_pair_off_b)); - offset_b_k_ = builder.CreateAnd(u_thread_id, _3); - - // c offsets - Value *offset_c_i = builder.CreateAdd(builder.CreateAnd(u_thread_id, _1), offset_a_i_); - Value *offset_c_j = builder.CreateAdd(builder.CreateAnd(u_thread_id, _2), - builder.CreateAdd(warp_offset_j, pair_b_off)); - - /* indices */ - // i indices - std::vector idx_i; - for(unsigned pack = 0; pack < num_packs_0_; pack++) - for(unsigned ii = 0; ii < pack_size_0_; ii++) - for(unsigned i = 0; i < 2; i++){ - idx_i.push_back(builder.CreateAdd(offset_c_i, builder.getInt32(pack*hmma_bts_0*pack_size_0_ + ii*4 + i*2))); + to_warps(block_size, order, n_warps, warp_size); + std::vector thread_id_in_warp = delinearize(u_thread_id, order, warp_size, builder); + std::vector warp_id = delinearize(u_warp_id, order, n_warps, builder); + // Create axes + for(unsigned k = 0; k < dim; k++) { + std::string str_k = std::to_string(k); + Value *warp_size_k = builder.getInt32(warp_size[k]); + Value *contiguous_k = builder.getInt32(contiguous[k]); + Value *thread_id = builder.CreateAdd(thread_id_in_warp[k], builder.CreateMul(warp_id[k], warp_size_k)); + Value *scaled_thread_id = builder.CreateMul(thread_id, contiguous_k); + unsigned per_block = contiguous[k] * warp_size[k] * n_warps[k]; + unsigned per_thread = contiguous[k] * shapes[k] / per_block; + std::vector idx_list(per_thread); + for(unsigned n = 0 ; n < per_thread; n++){ + unsigned offset = n / contiguous[k] * per_block + n % contiguous[k]; + 
idx_list[n] = builder.CreateAdd(scaled_thread_id, builder.getInt32(offset), "idx_" + str_k + "_" + std::to_string(n)); } - // j indices - std::vector idx_j; - for(unsigned pack = 0; pack < num_packs_1_; pack++) - for(unsigned jj = 0; jj < pack_size_1_; jj++) - for(unsigned j = 0; j < 2; j++){ - idx_j.push_back(builder.CreateAdd(offset_c_j, builder.getInt32(pack*hmma_bts_1*pack_size_1_ + jj*4 + j*4*fpw_1*pack_size_1_))); - idx_j.push_back(builder.CreateAdd(offset_c_j, builder.getInt32(pack*hmma_bts_1*pack_size_1_ + jj*4 + j*4*fpw_1*pack_size_1_ + 1))); - } - // z indices - std::vector idx_z; - for(unsigned pack = 0; pack < num_rep_2; pack++) - idx_z.push_back(builder.CreateAdd(warp_id_2, builder.getInt32(pack*hmma_bts_2))); - - - /* axes */ - axes_[params_->get_param_group(v, 0)] = distributed_axis{1, idx_i, warp_id_0}; - axes_[params_->get_param_group(v, 1)] = distributed_axis{1, idx_j, warp_id_1}; - if(is_batched) - axes_[params_->get_param_group(v, 2)] = distributed_axis{1, idx_z, warp_id_2}; + axes_[params_->get_param_group(v, k)] = distributed_axis{contiguous[k], idx_list, thread_id}; } } +void selection::init_hmma_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { +// auto order = reorder_->get_order(v); + const auto& shapes = v->get_type()->get_tile_shapes(); + if(shapes.size() > 3) + throw std::runtime_error("unsupported"); + bool is_batched = shapes.size() >= 3; + + Value *_1 = builder.getInt32(1); + Value *_2 = builder.getInt32(2); + Value *_3 = builder.getInt32(3); + Value *_4 = builder.getInt32(4); + Value *_16 = builder.getInt32(16); + + // fragments per warp + unsigned fpw_0 = params_->fpw(v, 0); + unsigned fpw_1 = params_->fpw(v, 1); + unsigned fpw_2 = is_batched ? params_->fpw(v, 2) : 1; + // warps per tile + unsigned wpt_0 = params_->wpt(v, 0); + unsigned wpt_1 = params_->wpt(v, 1); + unsigned wpt_2 = is_batched ? params_->wpt(v, 2) : 1; + // hmma warp tile size + unsigned hmma_wts_0 = fpw_0 * 8; + unsigned hmma_wts_1 = fpw_1 * 8; + unsigned hmma_wts_2 = is_batched ? fpw_2 : 1; + // hmma block tile size + unsigned hmma_bts_0 = hmma_wts_0 * wpt_0; + unsigned hmma_bts_1 = hmma_wts_1 * wpt_1; + unsigned hmma_bts_2 = is_batched ? hmma_wts_2 * wpt_2 : 1; + // number of repetition + unsigned num_rep_0 = shapes[0] / hmma_bts_0; + unsigned num_rep_1 = shapes[1] / hmma_bts_1; + unsigned num_rep_2 = is_batched ? 
shapes[2] / hmma_bts_2 : 1; + // size of each pack (interleaving) + pack_size_0_ = std::min(num_rep_0, 1); + pack_size_1_ = std::min(num_rep_1, 1); + // number of packs (interleaving) + num_packs_0_ = num_rep_0 / pack_size_0_; + num_packs_1_ = num_rep_1 / pack_size_1_; + + /* intra warp offset */ + // offset of quad in pair + Value *in_pair_off_a = builder.CreateMul(builder.CreateUDiv(builder.CreateAnd(u_thread_id, _16), builder.getInt32(4)), + builder.getInt32(fpw_0 * pack_size_0_)); + Value *in_pair_off_b = builder.CreateMul(builder.CreateUDiv(builder.CreateAnd(u_thread_id, _16), builder.getInt32(4)), + builder.getInt32(fpw_1 * pack_size_1_)); + + // Quad pair id + Value *pair_a_id = builder.CreateUDiv(builder.CreateURem(u_thread_id, _16), _4); + Value *pair_b_id = builder.CreateUDiv(builder.CreateURem(u_thread_id, _16), _4); + pair_a_id = builder.CreateURem(pair_a_id, builder.getInt32(fpw_0)); + pair_b_id = builder.CreateUDiv(pair_b_id, builder.getInt32(fpw_0)); + pair_b_id = builder.CreateURem(pair_b_id, builder.getInt32(fpw_1)); + // Quad pair offset + Value *pair_a_off = builder.CreateMul(pair_a_id, builder.getInt32(4 * pack_size_0_)); + Value *pair_b_off = builder.CreateMul(pair_b_id, builder.getInt32(4 * pack_size_1_)); + + /* inter warp offset */ + Value *warp_id_0 = builder.CreateURem(u_warp_id, builder.getInt32(wpt_0)); + Value *warp_id_12 = builder.CreateUDiv(u_warp_id, builder.getInt32(wpt_0)); + Value *warp_id_1 = builder.CreateURem(warp_id_12, builder.getInt32(wpt_1)); + Value *warp_id_2 = builder.CreateUDiv(warp_id_12, builder.getInt32(wpt_1)); + Value *warp_offset_i = builder.CreateMul(warp_id_0, builder.getInt32(hmma_wts_0 * pack_size_0_)); + Value *warp_offset_j = builder.CreateMul(warp_id_1, builder.getInt32(hmma_wts_1 * pack_size_1_)); + + /* offsets */ + // a offset + offset_a_i_ = builder.CreateAdd(warp_offset_i, builder.CreateAdd(pair_a_off, in_pair_off_a)); + offset_a_k_ = builder.CreateAnd(u_thread_id, _3); + // b offsets + offset_b_j_ = builder.CreateAdd(warp_offset_j, builder.CreateAdd(pair_b_off, in_pair_off_b)); + offset_b_k_ = builder.CreateAnd(u_thread_id, _3); + + // c offsets + Value *offset_c_i = builder.CreateAdd(builder.CreateAnd(u_thread_id, _1), offset_a_i_); + Value *offset_c_j = builder.CreateAdd(builder.CreateAnd(u_thread_id, _2), + builder.CreateAdd(warp_offset_j, pair_b_off)); + + /* indices */ + // i indices + std::vector idx_i; + for(unsigned pack = 0; pack < num_packs_0_; pack++) + for(unsigned ii = 0; ii < pack_size_0_; ii++) + for(unsigned i = 0; i < 2; i++){ + idx_i.push_back(builder.CreateAdd(offset_c_i, builder.getInt32(pack*hmma_bts_0*pack_size_0_ + ii*4 + i*2))); + } + // j indices + std::vector idx_j; + for(unsigned pack = 0; pack < num_packs_1_; pack++) + for(unsigned jj = 0; jj < pack_size_1_; jj++) + for(unsigned j = 0; j < 2; j++){ + idx_j.push_back(builder.CreateAdd(offset_c_j, builder.getInt32(pack*hmma_bts_1*pack_size_1_ + jj*4 + j*4*fpw_1*pack_size_1_))); + idx_j.push_back(builder.CreateAdd(offset_c_j, builder.getInt32(pack*hmma_bts_1*pack_size_1_ + jj*4 + j*4*fpw_1*pack_size_1_ + 1))); + } + // z indices + std::vector idx_z; + for(unsigned pack = 0; pack < num_rep_2; pack++) + idx_z.push_back(builder.CreateAdd(warp_id_2, builder.getInt32(pack*hmma_bts_2))); + + + /* axes */ + axes_[params_->get_param_group(v, 0)] = distributed_axis{1, idx_i, warp_id_0}; + axes_[params_->get_param_group(v, 1)] = distributed_axis{1, idx_j, warp_id_1}; + if(is_batched) + axes_[params_->get_param_group(v, 2)] = distributed_axis{1, idx_z, 
warp_id_2}; +} + + +void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { + if(params_->get_fragment(v, 0) == analysis::grids::STRIDED_SCAN) + init_strided_scan_axes(v, builder, u_thread_id, u_warp_id); + else + init_hmma_axes(v, builder, u_thread_id, u_warp_id); +} + bool static inline has_phi_user(ir::value *v) { for(ir::user *usr: v->get_users()){ if(dynamic_cast(usr)) @@ -717,94 +726,97 @@ bool static inline has_phi_user(ir::value *v) { } return false; } + +void selection::create_shared_tile(ir::value *v, IRBuilder<> &builder, Value *sh_mem_ptr) { + auto shapes = v->get_type()->get_tile_shapes(); + unsigned pad = alloc_->is_ld_padded(v); + if(pad > 0) + shapes[0] += pad; + Type* ty = llvm_type(v->get_type()->get_scalar_ty(), builder.getContext()); + // shared copy + PointerType *ptr_ty = ty->getPointerTo(sh_mem_ptr->getType()->getPointerAddressSpace()); + // phi-node (double-buffering) + if(auto *phi = dynamic_cast(v)) { + BasicBlock *parent = (BasicBlock*)vmap_[phi->get_parent()]; + unsigned id_pre = 0, id_loop = 1; + if(phi->get_incoming_block(0) == phi->get_parent()) + std::swap(id_pre, id_loop); + if(parent->empty()) + builder.SetInsertPoint(parent); + else + builder.SetInsertPoint(&*parent->getFirstInsertionPt()); + PHINode *ptr = builder.CreatePHI(ptr_ty, 2); + PHINode *offset = builder.CreatePHI(builder.getInt32Ty(), 2); + // next pointer + Value *pre_ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(alloc_->get_offset(phi))); + pre_ptr = builder.CreateBitCast(pre_ptr, ptr->getType()); + Value *next_ptr = builder.CreateGEP(ptr, offset, "next_ptr"); + tmap_.insert({phi, new shared_tile(ty, shapes, ptr, builder, offset)}); + for(unsigned i = 0; i < phi->get_num_incoming(); i++) { + ir::basic_block* inc_block = phi->get_incoming_block(i); + ir::value* inc_value = phi->get_incoming_value(i); + ir::instruction* terminator = inc_block->get_inst_list().back(); + bool is_loop_latch = buffer_info_->is_loop_latch(phi, terminator); + tmap_.insert({inc_value, new shared_tile(ty, shapes, is_loop_latch?next_ptr:pre_ptr, builder)}); + } + } + else { + if(!has_phi_user(v)){ + size_t offset = alloc_->get_offset(v); + Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset)); + ptr = builder.CreateBitCast(ptr, ptr_ty); + tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)}); + } + } +} + +void selection::create_distributed_tile(ir::value *v, IRBuilder<> &builder) { + Type* ty = llvm_type(v->get_type()->get_scalar_ty(), builder.getContext()); + const auto &shapes = v->get_type()->get_tile_shapes(); + std::vector axes(shapes.size()); + for(size_t d = 0; d < shapes.size(); d++){ + if(shapes[d] > 1){ + unsigned x = params_->get_param_group(v, d); + axes[d] = axes_.at(x); + } + else{ + axes[d].contiguous = 1; + axes[d].values = {builder.getInt32(0)}; + } + } + bool vectorize = dynamic_cast(v); + distributed_tile *T = new distributed_tile(ty, shapes, axes, builder, vectorize); + bool is_inserted = tmap_.insert({v, T}).second; + // constant range + if(is_inserted && dynamic_cast(v)){ + T->for_each([&](indices_t idx){ + assert(idx.size() == 1); + T->set_value(idx, idx[0]); + }); + } + if(is_inserted && dynamic_cast(v)){ + T->for_each([&](indices_t idx){ + assert(idx.size() == 1); + BinaryOperator *bin_add = dyn_cast(idx[0]); + assert(bin_add); + Value *res = bin_add->getOperand(1); + assert(isa(res)); + T->set_value(idx, res); + }); + } +} + void selection::create_tile(ir::value *v, IRBuilder<> &builder, std::set &seen, Value 
*sh_mem_ptr) { if(!v->get_type()->is_tile_ty() || !seen.insert(v).second) return; if(auto *user = dynamic_cast(v)) - for(ir::value *op: user->ops()){ + for(ir::value *op: user->ops()) create_tile(op, builder, seen, sh_mem_ptr); - } - LLVMContext &ctx = builder.getContext(); - auto shapes = v->get_type()->get_tile_shapes(); - unsigned pad = alloc_->is_ld_padded(v); - if(pad > 0) - shapes[0] += pad; - Type* ty = llvm_type(v->get_type()->get_scalar_ty(), ctx); - // create shared tile - if(buffer_info_->is_shared(v) && !dynamic_cast(v)){ - // shared copy - PointerType *ptr_ty = ty->getPointerTo(sh_mem_ptr->getType()->getPointerAddressSpace()); - // phi-node (double-buffering) - if(auto *phi = dynamic_cast(v)) { - BasicBlock *parent = (BasicBlock*)vmap_[phi->get_parent()]; - unsigned id_pre = 0, id_loop = 1; - if(phi->get_incoming_block(0) == phi->get_parent()) - std::swap(id_pre, id_loop); - if(parent->empty()) - builder.SetInsertPoint(parent); - else - builder.SetInsertPoint(&*parent->getFirstInsertionPt()); - PHINode *ptr = builder.CreatePHI(ptr_ty, 2); - PHINode *offset = builder.CreatePHI(builder.getInt32Ty(), 2); - // next pointer - Value *pre_ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(alloc_->get_offset(phi))); - pre_ptr = builder.CreateBitCast(pre_ptr, ptr->getType()); - Value *next_ptr = builder.CreateGEP(ptr, offset, "next_ptr"); - tmap_.insert({phi, new shared_tile(ty, shapes, ptr, builder, offset)}); - for(unsigned i = 0; i < phi->get_num_incoming(); i++) { - ir::basic_block* inc_block = phi->get_incoming_block(i); - ir::value* inc_value = phi->get_incoming_value(i); - ir::instruction* terminator = inc_block->get_inst_list().back(); - bool is_loop_latch = buffer_info_->is_loop_latch(phi, terminator); - tmap_.insert({inc_value, new shared_tile(ty, shapes, is_loop_latch?next_ptr:pre_ptr, builder)}); - } - } - else { - if(!has_phi_user(v)){ - size_t offset = alloc_->get_offset(v); - Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset)); - ptr = builder.CreateBitCast(ptr, ptr_ty); - tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)}); - } - } - } - // create distributed tile - else { - const auto &shapes = v->get_type()->get_tile_shapes(); - std::vector axes(shapes.size()); - for(size_t d = 0; d < shapes.size(); d++){ - if(shapes[d] > 1){ - unsigned x = params_->get_param_group(v, d); - axes[d] = axes_.at(x); - } - else{ - axes[d].contiguous = 1; - axes[d].values = {builder.getInt32(0)}; - } - } - bool vectorize = dynamic_cast(v); - distributed_tile *T = new distributed_tile(ty, shapes, axes, builder, vectorize); - bool is_inserted = tmap_.insert({v, T}).second; - // constant range - if(is_inserted && dynamic_cast(v)){ - T->for_each([&](indices_t idx){ - assert(idx.size() == 1); - T->set_value(idx, idx[0]); - }); - } - if(is_inserted && dynamic_cast(v)){ - T->for_each([&](indices_t idx){ - assert(idx.size() == 1); - BinaryOperator *bin_add = dyn_cast(idx[0]); - assert(bin_add); - Value *res = bin_add->getOperand(1); - assert(isa(res)); - T->set_value(idx, res); - }); - } - - } + if(buffer_info_->is_shared(v) && !dynamic_cast(v)) + create_shared_tile(v, builder, sh_mem_ptr); + else + create_distributed_tile(v, builder); } void selection::init_grids(ir::function *fn, IRBuilder<> &builder, Value *sh_mem_ptr){ @@ -908,7 +920,7 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, tgt_->add_barrier(module, builder); builder.CreateStore(result, write_ptr); // build result - unsigned depth = params_->get_wpt(op, axis); + unsigned 
depth = params_->wpt(op, axis); for(unsigned i = depth/2; i > 0; i >>= 1){ // current indices indices_t current(write_idx.size(), builder.getInt32(0)); @@ -1075,12 +1087,12 @@ void selection::lower_hmma_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn "{$10, $11}, " "{$0, $1, $2, $3, $4, $5, $6, $7};", "=f,=f,=f,=f,=f,=f,=f,=f,r,r,r,r,0,1,2,3,4,5,6,7", false); - unsigned fpw_0 = params_->get_fpw(dot, 0); - unsigned fpw_1 = params_->get_fpw(dot, 1); + unsigned fpw_0 = params_->fpw(dot, 0); + unsigned fpw_1 = params_->fpw(dot, 1); unsigned wts_0 = fpw_0 * 8; unsigned wts_1 = fpw_1 * 8; - unsigned wpt_0 = params_->get_wpt(dot, 0); - unsigned wpt_1 = params_->get_wpt(dot, 1); + unsigned wpt_0 = params_->wpt(dot, 0); + unsigned wpt_1 = params_->wpt(dot, 1); unsigned stride_rep_i = wpt_0 * wts_0; unsigned stride_rep_j = wpt_1 * wts_1; unsigned num_rep_i = shapes[0] / stride_rep_i; @@ -1457,7 +1469,7 @@ void selection::run(ir::module &src, Module &dst) { Metadata *md_args[] = { ValueAsMetadata::get(dst_fn), MDString::get(dst_ctx, "maxntidx"), - ValueAsMetadata::get(dst_builder.getInt32(params_->get_num_threads())) + ValueAsMetadata::get(dst_builder.getInt32(num_warps_*32)) }; dst.getOrInsertNamedMetadata("nvvm.annotations")->addOperand(MDNode::get(dst_ctx, md_args)); diff --git a/lib/codegen/transform/vectorize.cc b/lib/codegen/transform/vectorize.cc index e7e329c02..4d1b88541 100644 --- a/lib/codegen/transform/vectorize.cc +++ b/lib/codegen/transform/vectorize.cc @@ -27,7 +27,7 @@ void vectorize::run(ir::module &mod) { } if(dynamic_cast(i)){ ir::value *x = i->get_operand(0); - if(params_->get_nts(x, 0) == 1) + if(params_->nts(x, 0) == 1) continue; builder.set_insert_point(i); ir::instruction *rx = (ir::instruction*)builder.create_vectorize(x); diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 3dd7c1507..016d5c879 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -205,7 +205,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::transform::dce dce; codegen::transform::peephole peephole; codegen::transform::reassociate reassociate(&alignment_info, &grids); - codegen::selection selection(&shmem_allocation, &grids, &shmem_info, &alignment_info, &reorder, target.get()); + codegen::selection selection(&shmem_allocation, &grids, &shmem_info, &alignment_info, &reorder, target.get(), opt.num_warps); // run passes peephole.run(module); dce.run(module); From 495163e0e8bef39d59beb9f5b0c0055021c74224 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 14 Sep 2019 16:53:13 -0400 Subject: [PATCH 404/494] some more cleaning --- include/triton/codegen/analysis/align.h | 11 +--- include/triton/codegen/analysis/grid.h | 12 ++-- include/triton/codegen/analysis/liveness.h | 2 - include/triton/codegen/analysis/memalloc.h | 9 +-- lib/codegen/analysis/align.cc | 29 +++------- lib/codegen/analysis/grid.cc | 10 ++-- lib/codegen/analysis/memalloc.cc | 16 +++--- lib/codegen/selection.cc | 64 ++++++++++++---------- lib/codegen/transform/coalesce.cc | 3 +- lib/codegen/transform/membar.cc | 4 +- lib/codegen/transform/reassociate.cc | 21 ------- lib/runtime/function.cc | 12 ++-- 12 files changed, 78 insertions(+), 115 deletions(-) diff --git a/include/triton/codegen/analysis/align.h b/include/triton/codegen/analysis/align.h index 9b1adb40f..bbc5fe440 100644 --- a/include/triton/codegen/analysis/align.h +++ b/include/triton/codegen/analysis/align.h @@ -21,16 +21,13 @@ namespace codegen{ namespace analysis{ class align { +private: struct cst_info { 
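     // num_cst: length of the contiguous run of elements proven constant; value: the constant itself (when known)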
unsigned num_cst; unsigned value; }; - -private: // helpers - bool is_first_axis_unit(ir::value *v); std::vector get_shapes(ir::value *v); - // populate is_constant std::vector populate_is_constant_phi(ir::phi_node* x); std::vector populate_is_constant_splat(ir::splat_inst* x); @@ -61,10 +58,8 @@ private: public: void run(ir::module &mod); - unsigned get_starting_multiple(ir::value* v) const; - unsigned get_max_contiguous(ir::value* v) const; - std::vector get_max_contiguous_vec(ir::value* v) const; - void copy(ir::value *dst, ir::value *src); + unsigned get(ir::value* v, unsigned ax) const; + std::vector contiguous(ir::value* v) const; private: std::map> is_constant_; diff --git a/include/triton/codegen/analysis/grid.h b/include/triton/codegen/analysis/grid.h index 50a8c578a..465011b83 100644 --- a/include/triton/codegen/analysis/grid.h +++ b/include/triton/codegen/analysis/grid.h @@ -49,20 +49,20 @@ private: public: - grids(size_t num_warps, transform::coalesce* reorder); - fragment_t get_fragment(ir::value *value, unsigned ax); - void copy(ir::value *dst, ir::value *src); + grids(size_t num_warps, transform::coalesce* coalesce); void run(ir::module &mod); - unsigned get_param_group(ir::value *value, unsigned ax); - const std::vector get_grids() const { return grids_; } + const std::vector get() const { return grids_; } + fragment_t fragment_of(ir::value *value, unsigned ax); + unsigned group_of(ir::value *value, unsigned ax); int mts(ir::value *value, unsigned ax); int nts(ir::value *value, unsigned ax); int fpw(ir::value *value, unsigned ax); int wpt(ir::value *value, unsigned ax); + void copy(ir::value *dst, ir::value *src); private: - transform::coalesce* reorder_; + transform::coalesce* coalesce_; // number of warps size_t num_warps_; // grids diff --git a/include/triton/codegen/analysis/liveness.h b/include/triton/codegen/analysis/liveness.h index 4aa0c6dae..4b863ff55 100644 --- a/include/triton/codegen/analysis/liveness.h +++ b/include/triton/codegen/analysis/liveness.h @@ -45,11 +45,9 @@ public: public: // constructor liveness(meminfo *info): info_(info){ } - // accessors const intervals_map_t& intervals() const { return intervals_; } segment get_interval(ir::value* v) const { return intervals_.at(v); } - // run void run(ir::module &mod); diff --git a/include/triton/codegen/analysis/memalloc.h b/include/triton/codegen/analysis/memalloc.h index 0e5b2adc9..91cf89123 100644 --- a/include/triton/codegen/analysis/memalloc.h +++ b/include/triton/codegen/analysis/memalloc.h @@ -24,15 +24,12 @@ class memalloc { public: memalloc(liveness *live, meminfo *buffer_info, grids *params) : liveness_(live), buffer_info_(buffer_info), params_(params){ } - // utilities - unsigned get_num_bytes(ir::value *x); + unsigned num_bytes(ir::value *x); unsigned is_ld_padded(ir::value* x); - // accessors - unsigned get_offset(ir::value *x) const { return offsets_.at(x); } - unsigned get_allocated_size() const { return allocated_size_; } - + unsigned offset(ir::value *x) const { return offsets_.at(x); } + unsigned allocated_size() const { return allocated_size_; } // run void run(); diff --git a/lib/codegen/analysis/align.cc b/lib/codegen/analysis/align.cc index 3c2348587..8c2ecf847 100644 --- a/lib/codegen/analysis/align.cc +++ b/lib/codegen/analysis/align.cc @@ -30,14 +30,6 @@ inline T add_to_cache(ir::value *i, T value, std::map &map) { return map[i] = value; } - -bool align::is_first_axis_unit(ir::value *x){ - if(x->get_type()->is_tile_ty()) - return x->get_type()->get_tile_shapes()[0] == 1; - else - 
return true; -} - /* * is constant */ @@ -471,26 +463,19 @@ std::vector align::populate_starting_multiple(ir::value *v){ return populate_starting_multiple_default(v); } -unsigned align::get_starting_multiple(ir::value* v) const { - return starting_multiple_.at(v)[0]; + +unsigned align::get(ir::value *v, unsigned ax) const { + unsigned starting_multiple = starting_multiple_.at(v)[ax]; + unsigned max_contiguous = max_contiguous_.at(v)[ax]; + return std::min(starting_multiple, max_contiguous); } -unsigned align::get_max_contiguous(ir::value* v) const { - return max_contiguous_.at(v)[0]; -} - -std::vector align::get_max_contiguous_vec(ir::value* v) const { +std::vector align::contiguous(ir::value* v) const { return max_contiguous_.at(v); } -void align::copy(ir::value *dst, ir::value *src) { - starting_multiple_[dst] = starting_multiple_[src]; - max_contiguous_[dst] = max_contiguous_[src]; - is_constant_[dst] = is_constant_[src]; -} - - void align::run(ir::module &mod) { + // populate constant for(ir::function *fn: mod.get_function_list()) for(ir::basic_block *block: fn->blocks()) diff --git a/lib/codegen/analysis/grid.cc b/lib/codegen/analysis/grid.cc index cf5a718cd..1e6de0de4 100644 --- a/lib/codegen/analysis/grid.cc +++ b/lib/codegen/analysis/grid.cc @@ -16,7 +16,7 @@ namespace triton{ namespace codegen{ namespace analysis{ -grids::grids(size_t num_warps, transform::coalesce *reorder): num_warps_(num_warps), reorder_(reorder) +grids::grids(size_t num_warps, transform::coalesce *reorder): num_warps_(num_warps), coalesce_(reorder) { } bool is_hmma(ir::value *v){ @@ -168,12 +168,12 @@ void grids::connected_components(node_t x, const std::vector& ptr_v } } -unsigned grids::get_param_group(ir::value *value, unsigned ax) { +unsigned grids::group_of(ir::value *value, unsigned ax) { unsigned result = groups_.at(value).at(ax); return result; } -grids::fragment_t grids::get_fragment(ir::value *value, unsigned ax) { +grids::fragment_t grids::fragment_of(ir::value *value, unsigned ax) { return fragments_.at({value, ax}); } @@ -233,7 +233,7 @@ void grids::run(ir::module &mod) { for(ir::value *i: grids_){ if(!i->get_type()->is_tile_ty()) continue; - auto order = reorder_->get_order(i); + auto order = coalesce_->get_order(i); auto shapes = i->get_type()->get_tile_shapes(); unsigned size = i->get_type()->get_tile_num_elements(); /* HMMA parameters*/ @@ -329,7 +329,7 @@ void grids::create_grids(std::vector &grids, for(size_t d = 0; d < shapes.size(); d++){ if(shapes[d] == 1) continue; - unsigned x = get_param_group(v, d); + unsigned x = group_of(v, d); ir::value *&r = references[x]; if(!r || get_tile_gt1_dim(v) > get_tile_gt1_dim(r)) r = v; diff --git a/lib/codegen/analysis/memalloc.cc b/lib/codegen/analysis/memalloc.cc index 631b8f663..866f9c7a4 100644 --- a/lib/codegen/analysis/memalloc.cc +++ b/lib/codegen/analysis/memalloc.cc @@ -20,7 +20,7 @@ unsigned memalloc::is_ld_padded(ir::value *x) { } for(ir::user* user: x->get_users()) if(auto dot = dynamic_cast(user)){ - bool is_hmma = params_->get_fragment(user, 0) == grids::HMMA_FRAGMENT_C; + bool is_hmma = params_->fragment_of(user, 0) == grids::HMMA_FRAGMENT_C; bool is_op_0 = x == dot->get_operand(0); bool is_op_1 = x == dot->get_operand(1); if(is_hmma && is_op_0){ @@ -45,7 +45,7 @@ unsigned memalloc::is_ld_padded(ir::value *x) { return 0; } -unsigned memalloc::get_num_bytes(ir::value *x) { +unsigned memalloc::num_bytes(ir::value *x) { if(auto *red = dynamic_cast(x)){ unsigned num_bytes = x->get_type()->get_scalar_ty()->get_primitive_size_in_bits() / 8; 
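     // a reduction stages one partial result per output element in shared memory; this scalar
     // size is then multiplied by the element count and by the thread depth along the reduced axis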
size_t axis = red->get_axis(); @@ -56,7 +56,7 @@ unsigned memalloc::get_num_bytes(ir::value *x) { for(auto x: shapes) num_elements *= x; size_t depth; - if(params_->get_fragment(x, 0) == grids::HMMA_FRAGMENT_C) + if(params_->fragment_of(x, 0) == grids::HMMA_FRAGMENT_C) depth = params_->wpt(op, axis); else depth = params_->mts(op, axis); @@ -102,7 +102,7 @@ void memalloc::run(){ return res; }); if(j_it != J.end()){ - unsigned size = get_num_bytes(*j_it); + unsigned size = num_bytes(*j_it); segment xj = liveness_->get_interval(*j_it); starts[*j_it] = w; H.insert({w + size, segment{max(xh.start, xj.start), min(xh.end, xj.end)}}); @@ -123,8 +123,8 @@ void memalloc::run(){ if(x == y) continue; unsigned X0 = starts[x], Y0 = starts[y]; - unsigned NX = get_num_bytes(x); - unsigned NY = get_num_bytes(y); + unsigned NX = num_bytes(x); + unsigned NY = num_bytes(y); segment XS = {X0, X0 + NX}; segment YS = {Y0, Y0 + NY}; if(liveness_->get_interval(x).intersect(liveness_->get_interval(y)) @@ -156,7 +156,7 @@ void memalloc::run(){ for(ir::value *x: V){ unsigned Adj = 0; for(ir::value *y: interferences[x]) - Adj = std::max(Adj, starts[y] + get_num_bytes(y)); + Adj = std::max(Adj, starts[y] + num_bytes(y)); offsets_[x] = starts[x] + colors[x] * Adj; if(buffer_info_->is_double(x)){ ir::phi_node *phi = (ir::phi_node*)x; @@ -170,7 +170,7 @@ void memalloc::run(){ // Save maximum size of induced memory space allocated_size_ = 0; for(auto &x: offsets_){ - allocated_size_ = std::max(allocated_size_, x.second + get_num_bytes(x.first)); + allocated_size_ = std::max(allocated_size_, x.second + num_bytes(x.first)); } } diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index 4cff99890..e78228070 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -430,7 +430,7 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::functiongetParent()); BasicBlock *tid_0_done_bb = BasicBlock::Create(ctx, "tid_0_done", current->getParent()); - Value *ptr = builder.CreateGEP(sh_mem_ptr_, builder.getInt32(alloc_->get_offset(ii))); + Value *ptr = builder.CreateGEP(sh_mem_ptr_, builder.getInt32(alloc_->offset(ii))); ptr = builder.CreateBitCast(ptr, PointerType::get(builder.getInt32Ty(), ptr->getType()->getPointerAddressSpace())); tgt_->add_memfence(module, builder); tgt_->add_barrier(module, builder); @@ -538,6 +538,10 @@ Value* selection::llvm_value(ir::value *v, IRBuilder<> &builder) { throw std::runtime_error("unknown conversion from ir::value to Value"); } +/* ------------------- + * ---- Init Axes ---- + * ------------------- */ + // Grid construction std::vector delinearize(Value *trailing, const std::vector& order, std::vector &shapes, IRBuilder<> &builder){ size_t dim = shapes.size(); @@ -600,7 +604,7 @@ void selection::init_strided_scan_axes(ir::value *v, IRBuilder<> &builder, Value unsigned offset = n / contiguous[k] * per_block + n % contiguous[k]; idx_list[n] = builder.CreateAdd(scaled_thread_id, builder.getInt32(offset), "idx_" + str_k + "_" + std::to_string(n)); } - axes_[params_->get_param_group(v, k)] = distributed_axis{contiguous[k], idx_list, thread_id}; + axes_[params_->group_of(v, k)] = distributed_axis{contiguous[k], idx_list, thread_id}; } } @@ -705,27 +709,23 @@ void selection::init_hmma_axes(ir::value *v, IRBuilder<> &builder, Value *u_thre /* axes */ - axes_[params_->get_param_group(v, 0)] = distributed_axis{1, idx_i, warp_id_0}; - axes_[params_->get_param_group(v, 1)] = distributed_axis{1, idx_j, warp_id_1}; + axes_[params_->group_of(v, 0)] = distributed_axis{1, idx_i, 
warp_id_0}; + axes_[params_->group_of(v, 1)] = distributed_axis{1, idx_j, warp_id_1}; if(is_batched) - axes_[params_->get_param_group(v, 2)] = distributed_axis{1, idx_z, warp_id_2}; + axes_[params_->group_of(v, 2)] = distributed_axis{1, idx_z, warp_id_2}; } void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { - if(params_->get_fragment(v, 0) == analysis::grids::STRIDED_SCAN) + if(params_->fragment_of(v, 0) == analysis::grids::STRIDED_SCAN) init_strided_scan_axes(v, builder, u_thread_id, u_warp_id); else init_hmma_axes(v, builder, u_thread_id, u_warp_id); } -bool static inline has_phi_user(ir::value *v) { - for(ir::user *usr: v->get_users()){ - if(dynamic_cast(usr)) - return true; - } - return false; -} +/* ------------------- + * ---- Init Tiles ---- + * ------------------- */ void selection::create_shared_tile(ir::value *v, IRBuilder<> &builder, Value *sh_mem_ptr) { auto shapes = v->get_type()->get_tile_shapes(); @@ -748,7 +748,7 @@ void selection::create_shared_tile(ir::value *v, IRBuilder<> &builder, Value *sh PHINode *ptr = builder.CreatePHI(ptr_ty, 2); PHINode *offset = builder.CreatePHI(builder.getInt32Ty(), 2); // next pointer - Value *pre_ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(alloc_->get_offset(phi))); + Value *pre_ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(alloc_->offset(phi))); pre_ptr = builder.CreateBitCast(pre_ptr, ptr->getType()); Value *next_ptr = builder.CreateGEP(ptr, offset, "next_ptr"); tmap_.insert({phi, new shared_tile(ty, shapes, ptr, builder, offset)}); @@ -761,8 +761,12 @@ void selection::create_shared_tile(ir::value *v, IRBuilder<> &builder, Value *sh } } else { - if(!has_phi_user(v)){ - size_t offset = alloc_->get_offset(v); + bool has_phi_user = false; + for(ir::user *usr: v->get_users()) + if(dynamic_cast(usr)) + has_phi_user = true; + if(has_phi_user){ + size_t offset = alloc_->offset(v); Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset)); ptr = builder.CreateBitCast(ptr, ptr_ty); tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)}); @@ -776,7 +780,7 @@ void selection::create_distributed_tile(ir::value *v, IRBuilder<> &builder) { std::vector axes(shapes.size()); for(size_t d = 0; d < shapes.size(); d++){ if(shapes[d] > 1){ - unsigned x = params_->get_param_group(v, d); + unsigned x = params_->group_of(v, d); axes[d] = axes_.at(x); } else{ @@ -827,7 +831,7 @@ void selection::init_grids(ir::function *fn, IRBuilder<> &builder, Value *sh_mem Value *u_thread_warp_id = builder.CreateURem(u_thread_id, warp_size); Value *u_warp_id = builder.CreateUDiv(u_thread_id, warp_size); // create grid - for(ir::value* i: params_->get_grids()) + for(ir::value* i: params_->get()) init_axes(i, builder, u_thread_warp_id, u_warp_id); // create tile std::set seen; @@ -839,6 +843,10 @@ void selection::init_grids(ir::function *fn, IRBuilder<> &builder, Value *sh_mem } } +/* ---------------------------- + * ---- Lower Instructions ---- + * ---------------------------- */ + void selection::lower_masked_store(ir::masked_store_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { distributed_tile* ptrs = (distributed_tile*)tmap_.at(x->get_pointer_operand()); distributed_tile* scalars = (distributed_tile*)tmap_.at(x->get_value_operand()); @@ -907,7 +915,7 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, Value *base_ptr = builder.CreateBitCast(sh_mem_ptr_, PointerType::get(res_ty, addr_space)); for(auto& x: partial) { // current element being 
computed - Value *lane = axes_.at(params_->get_param_group(op, axis)).thread_id; + Value *lane = axes_.at(params_->group_of(op, axis)).thread_id; Value *&result = x.second; indices_t write_idx = x.first; write_idx.insert(write_idx.begin() + axis, lane); @@ -1233,7 +1241,7 @@ void selection::lower_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn, IRB if(NK != 1) { shared_tile *TA = (shared_tile*)tmap_.at(A); shared_tile *TB = (shared_tile*)tmap_.at(B); - if(params_->get_fragment(dot, 0) == analysis::grids::STRIDED_SCAN) + if(params_->fragment_of(dot, 0) == analysis::grids::STRIDED_SCAN) lower_scanline_dot(dot, ctx, fn, builder, TC, TA, TB, TD, NK, c_ty, f_mul_add); else lower_hmma_dot(dot, ctx, fn, builder, TC, TA, TB, TD, NK); @@ -1249,9 +1257,7 @@ void selection::lower_masked_load(ir::masked_load_inst *x, LLVMContext &ctx, Fun // find vector size distributed_tile* result = (distributed_tile*)tmap_.at(x); ir::value *ptr = x->get_pointer_operand(); - unsigned starting_multiple = alignment_->get_starting_multiple(ptr); - unsigned max_contiguous = alignment_->get_max_contiguous(ptr); - unsigned alignment = std::min(starting_multiple, max_contiguous); + unsigned alignment = alignment_->get(ptr, 0); unsigned vector_size = std::min(result->axis(0).contiguous, alignment); distributed_tile *pointers = (distributed_tile*)tmap_.at(ptr); distributed_tile *masks = (distributed_tile*)tmap_.at(x->get_mask_operand()); @@ -1322,9 +1328,7 @@ void selection::lower_load(ir::load_inst *x, LLVMContext &ctx, Function *fn, IRB distributed_tile* result = (distributed_tile*)tmap_.at(x); // find vector size ir::value *ptr = x->get_pointer_operand(); - unsigned starting_multiple = alignment_->get_starting_multiple(ptr); - unsigned max_contiguous = alignment_->get_max_contiguous(ptr); - unsigned alignment = std::min(starting_multiple, max_contiguous); + unsigned alignment = alignment_->get(ptr, 0); unsigned vector_size = std::min(result->axis(0).contiguous, alignment); distributed_tile *pointers = (distributed_tile*)tmap_.at(ptr); // vector loads @@ -1408,6 +1412,10 @@ void selection::lower_instruction(ir::instruction *src, IRBuilder<> &builder) { } } +/* ---------------------------- + * ---- Generate LLVM code ---- + * ---------------------------- */ + inline llvm::Attribute llvm_attr(llvm::LLVMContext& ctx, ir::attribute attr) { switch(attr.get_kind()){ case ir::noalias: return llvm::Attribute::get(ctx, llvm::Attribute::NoAlias); @@ -1487,7 +1495,7 @@ void selection::run(ir::module &src, Module &dst) { // allocate shared memory Value *sh_mem_ptr = nullptr; if(tgt_->is_gpu()) - if(unsigned alloc_size = alloc_->get_allocated_size()){ + if(unsigned alloc_size = alloc_->allocated_size()){ Type *int_8_ty = Type::getInt8Ty(dst_ctx); ArrayType *array_ty = ArrayType::get(int_8_ty, alloc_size); Type *ptr_ty = PointerType::get(int_8_ty, 3); @@ -1540,7 +1548,7 @@ void selection::run(ir::module &src, Module &dst) { } else { unsigned num_bytes = phi->get_type()->get_scalar_ty()->get_primitive_size_in_bits() / 8; - offset->addIncoming(dst_builder.getInt32(alloc_->get_num_bytes(phi)/(2*num_bytes)), llvm_inc_block); + offset->addIncoming(dst_builder.getInt32(alloc_->num_bytes(phi)/(2*num_bytes)), llvm_inc_block); } ptr->addIncoming(inc_shared->get_pointer(), llvm_inc_block); } diff --git a/lib/codegen/transform/coalesce.cc b/lib/codegen/transform/coalesce.cc index 8c880d638..12cd3f671 100644 --- a/lib/codegen/transform/coalesce.cc +++ b/lib/codegen/transform/coalesce.cc @@ -57,7 +57,7 @@ void coalesce::run(ir::module &mod) { 
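   // rank the tile axes of each pointer by decreasing contiguity: the most contiguous axis
   // becomes the fastest-varying one, so adjacent threads issue coalesced memory accesses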
std::map replaced; for(ir::io_inst *i: io) { ir::value *ptr = i->get_pointer_operand(); - auto max_contiguous = align_->get_max_contiguous_vec(ptr); + auto max_contiguous = align_->contiguous(ptr); std::vector order(max_contiguous.size()); std::iota(order.begin(), order.end(), 0); std::sort(order.begin(), order.end(), [&](unsigned a, unsigned b) { return max_contiguous[a] > max_contiguous[b]; } ); @@ -102,7 +102,6 @@ void coalesce::run(ir::module &mod) { n_op = builder.insert(n_op); replaced.insert({i_op, n_op}); order_[n_op] = order; - align_->copy(n_op, i_op); mem_->copy(n_op, i_op); if(original) n_op->erase_use(original); diff --git a/lib/codegen/transform/membar.cc b/lib/codegen/transform/membar.cc index 007263543..fc6891ea8 100644 --- a/lib/codegen/transform/membar.cc +++ b/lib/codegen/transform/membar.cc @@ -32,8 +32,8 @@ bool membar::intersect(const interval_vec_t &X, const interval_vec_t &Y) { void membar::add_reference(ir::value *v, interval_vec_t &res){ if(buffer_info_->is_shared(v) && !dynamic_cast(v)){ - unsigned offset = alloc_->get_offset(v); - unsigned num_bytes = alloc_->get_num_bytes(v); + unsigned offset = alloc_->offset(v); + unsigned num_bytes = alloc_->num_bytes(v); res.push_back(interval_t(offset, offset + num_bytes)); } } diff --git a/lib/codegen/transform/reassociate.cc b/lib/codegen/transform/reassociate.cc index ae42a6566..f059aba88 100644 --- a/lib/codegen/transform/reassociate.cc +++ b/lib/codegen/transform/reassociate.cc @@ -94,9 +94,6 @@ ir::value *reassociate::reassociate_idx(ir::value *old_value, params_->copy(new_value, old_value); params_->copy(new_lhs, old_value); params_->copy(new_rhs, old_value); - align_->copy(new_value, old_value); - align_->copy(new_lhs, old_value); - align_->copy(new_rhs, old_value); } } } @@ -134,9 +131,6 @@ ir::value *reassociate::reassociate_idx(ir::value *old_value, params_->copy(new_value, old_value); params_->copy(((ir::instruction*)new_value)->get_operand(0), old_value); params_->copy(((ir::instruction*)new_value)->get_operand(1), old_value); - align_->copy(new_value, old_value); - align_->copy(((ir::instruction*)new_value)->get_operand(0), old_value); - align_->copy(((ir::instruction*)new_value)->get_operand(1), old_value); } } @@ -192,9 +186,6 @@ void reassociate::run(ir::module &mod) { params_->copy(dyn_range, old_range); params_->copy(static_range, old_range); params_->copy(new_range, old_range); - align_->copy(dyn_range, old_range); - align_->copy(static_range, old_range); - align_->copy(new_range, old_range); } } @@ -226,9 +217,6 @@ void reassociate::run(ir::module &mod) { params_->copy(ndyn, rt); params_->copy(nsta, rt); params_->copy(broadcast, rt); - align_->copy(ndyn, rt); - align_->copy(nsta, rt); - align_->copy(broadcast, rt); infos[rt] = cst_info{ndyn, nsta}; } } @@ -250,8 +238,6 @@ void reassociate::run(ir::module &mod) { ir::value *sta_ptr = builder.create_gep(dyn_ptr, {sta}); params_->copy(dyn_ptr, pz); params_->copy(sta_ptr, pz); - align_->copy(dyn_ptr, pz); - align_->copy(sta_ptr, pz); pz->replace_all_uses_with(sta_ptr); infos[sta_ptr].dyn_ptr = dyn_ptr; infos[sta_ptr].sta_ptr = (ir::getelementptr_inst*)sta_ptr; @@ -268,8 +254,6 @@ void reassociate::run(ir::module &mod) { ir::value *pz_sta = builder.create_gep(pz_dyn, {cst}, pz->get_name()); params_->copy(pz_dyn, pz); params_->copy(pz_sta, pz); - align_->copy(pz_dyn, pz); - align_->copy(pz_sta, pz); pz->replace_all_uses_with(pz_sta); infos[pz_sta].dyn_ptr = pz_dyn; infos[pz_sta].sta_ptr = (ir::getelementptr_inst*)pz_sta; @@ -320,11 +304,6 @@ void 
reassociate::run(ir::module &mod) { params_->copy(neg_off, off); params_->copy(phi_dyn, phi); params_->copy(phi_sta, phi); - align_->copy(pz_dyn, pz); - align_->copy(((ir::instruction*)neg_off)->get_operand(0), off); - align_->copy(neg_off, off); - align_->copy(phi_dyn, phi); - align_->copy(phi_sta, phi); infos[phi_sta].dyn_ptr = phi_dyn; infos[phi_sta].sta_ptr = (ir::getelementptr_inst*)phi_sta; replaced.insert(phi); diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 016d5c879..9c3f99869 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -197,24 +197,25 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::analysis::meminfo shmem_info; codegen::analysis::liveness shmem_liveness(&shmem_info); codegen::analysis::align alignment_info; - codegen::transform::coalesce reorder(&alignment_info, &shmem_info); - codegen::analysis::grids grids(opt.num_warps, &reorder); + codegen::transform::coalesce coalesce(&alignment_info, &shmem_info); + codegen::analysis::grids grids(opt.num_warps, &coalesce); codegen::analysis::memalloc shmem_allocation(&shmem_liveness, &shmem_info, &grids); codegen::transform::membar shmem_barriers(&shmem_allocation, &shmem_info); codegen::transform::vectorize vectorize(&grids); codegen::transform::dce dce; codegen::transform::peephole peephole; codegen::transform::reassociate reassociate(&alignment_info, &grids); - codegen::selection selection(&shmem_allocation, &grids, &shmem_info, &alignment_info, &reorder, target.get(), opt.num_warps); + codegen::selection selection(&shmem_allocation, &grids, &shmem_info, &alignment_info, &coalesce, target.get(), opt.num_warps); // run passes peephole.run(module); dce.run(module); alignment_info.run(module); if(target->is_gpu()) shmem_info.run(module); - reorder.run(module); + coalesce.run(module); dce.run(module); grids.run(module); + alignment_info.run(module); reassociate.run(module); dce.run(module); peephole.run(module); @@ -222,13 +223,14 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c shmem_info.run(module); shmem_liveness.run(module); shmem_allocation.run(); - if(shmem_allocation.get_allocated_size() > context->device()->max_shared_memory()) + if(shmem_allocation.allocated_size() > context->device()->max_shared_memory()) return std::unique_ptr(); shmem_barriers.run(module); } dce.run(module); vectorize.run(module); dce.run(module); + alignment_info.run(module); // ir::print(module, std::cout); // generate llvm code llvm::LLVMContext ctx; From 031f4dfe9677f64039f1efdc43f2802dd52a52c4 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 14 Sep 2019 19:13:54 -0400 Subject: [PATCH 405/494] no performance regression --- tests/common/src/dot.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/common/src/dot.h b/tests/common/src/dot.h index 7511eda9a..8521cd0a6 100644 --- a/tests/common/src/dot.h +++ b/tests/common/src/dot.h @@ -64,7 +64,7 @@ void dot(TYPE * A, TYPE * B, TYPE * C, // epilogue int rxc[TM] = ridx * TM + 0 ... TM; int ryc[TN] = ridy * TN + 0 ... 
TN; - TYPE* pc[TM, TN] = C + ryc[newaxis, :] + rxc[:, newaxis] * ldc; + TYPE* pc[TM, TN] = C + rxc[:, newaxis] * ldc + ryc[newaxis, :]; *pc = c; } )"; From 8d37a55a21a520754cb8a837a8dc4c3cea6a39b0 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 15 Sep 2019 21:14:14 -0400 Subject: [PATCH 406/494] [codegen][analysis] cleaned-up tiling formalism --- include/triton/codegen/analysis/axes.h | 49 +++ include/triton/codegen/analysis/grid.h | 89 ----- include/triton/codegen/analysis/layout.h | 57 +++ include/triton/codegen/analysis/memalloc.h | 8 +- include/triton/codegen/analysis/tiles.h | 68 ++++ include/triton/codegen/selection.h | 16 +- .../triton/codegen/transform/reassociate.h | 5 +- include/triton/codegen/transform/vectorize.h | 6 +- include/triton/ir/type.h | 1 + include/triton/runtime/function.h | 4 +- lib/codegen/analysis/axes.cc | 166 ++++++++ lib/codegen/analysis/grid.cc | 367 ------------------ lib/codegen/analysis/layout.cc | 96 +++++ lib/codegen/analysis/memalloc.cc | 10 +- lib/codegen/analysis/tiles.cc | 176 +++++++++ lib/codegen/selection.cc | 59 +-- lib/codegen/transform/reassociate.cc | 30 +- lib/codegen/transform/vectorize.cc | 4 +- lib/ir/type.cc | 4 + lib/runtime/function.cc | 50 ++- tests/bench/dot.cc | 6 +- 21 files changed, 710 insertions(+), 561 deletions(-) create mode 100644 include/triton/codegen/analysis/axes.h delete mode 100644 include/triton/codegen/analysis/grid.h create mode 100644 include/triton/codegen/analysis/layout.h create mode 100644 include/triton/codegen/analysis/tiles.h create mode 100644 lib/codegen/analysis/axes.cc delete mode 100644 lib/codegen/analysis/grid.cc create mode 100644 lib/codegen/analysis/layout.cc create mode 100644 lib/codegen/analysis/tiles.cc diff --git a/include/triton/codegen/analysis/axes.h b/include/triton/codegen/analysis/axes.h new file mode 100644 index 000000000..f625c4193 --- /dev/null +++ b/include/triton/codegen/analysis/axes.h @@ -0,0 +1,49 @@ +#ifndef _TRITON_CODEGEN_ANALYSIS_AXES_H_ +#define _TRITON_CODEGEN_ANALYSIS_AXES_H_ + +#include +#include +#include +#include + +namespace triton{ + +namespace ir{ + class value; + class module; + class instruction; +} + +namespace codegen{ +namespace analysis{ + +class axes { + typedef std::pair node_t; + typedef std::map > graph_t; + +private: + void add_constraint(node_t x, node_t y); + void init_c_phi(ir::instruction *i); + void init_c_graph(ir::instruction *v); + void connected_components(node_t x, std::set &nodes, graph_t &graph, unsigned group_id); + +public: + axes(); + void run(ir::module &mod); + unsigned get(ir::value *value, unsigned ax); + bool has(ir::value *value, unsigned ax); + +private: + // constraints graph + graph_t dependencies_; + std::set nodes_; + // parameter groups + std::map> groups_; +}; + +} +} + +} + +#endif diff --git a/include/triton/codegen/analysis/grid.h b/include/triton/codegen/analysis/grid.h deleted file mode 100644 index 465011b83..000000000 --- a/include/triton/codegen/analysis/grid.h +++ /dev/null @@ -1,89 +0,0 @@ -#ifndef TDL_INCLUDE_IR_CODEGEN_TUNE_H -#define TDL_INCLUDE_IR_CODEGEN_TUNE_H - -#include -#include -#include -#include - -namespace triton{ - -namespace ir{ - class value; - class module; - class instruction; - class function; - class metaparameter; - class constant_int; -} - -namespace codegen{ - -namespace transform{ -class coalesce; -} - -namespace analysis{ - -class grids { - typedef std::pair node_t; - typedef std::map > graph_t; - typedef std::shared_ptr param_ptr_t; - typedef std::map> param_map_t; - -public: - enum 
fragment_t{ - STRIDED_SCAN, - HMMA_FRAGMENT_C - }; - -private: - void add_constraint(node_t x, node_t y); - void init_c_phi(ir::instruction *i); - void init_c_graph(ir::instruction *v); - fragment_t get_fragmentation_type(node_t x, graph_t &graph); - void connected_components(node_t x, const std::vector& params, const std::vector& maps, std::set &nodes, graph_t &graph, unsigned group_id); - void create_grids(std::vector &grids, - std::map &references, - ir::function *fn); - - -public: - grids(size_t num_warps, transform::coalesce* coalesce); - void run(ir::module &mod); - const std::vector get() const { return grids_; } - fragment_t fragment_of(ir::value *value, unsigned ax); - unsigned group_of(ir::value *value, unsigned ax); - int mts(ir::value *value, unsigned ax); - int nts(ir::value *value, unsigned ax); - int fpw(ir::value *value, unsigned ax); - int wpt(ir::value *value, unsigned ax); - void copy(ir::value *dst, ir::value *src); - -private: - - transform::coalesce* coalesce_; - // number of warps - size_t num_warps_; - // grids - std::vector grids_; - // grid parameters - param_map_t fpw_; - param_map_t wpt_; - param_map_t mts_; - param_map_t nts_; - // constraints graph - graph_t dependencies_; - std::set nodes_; - // fragments - std::map fragments_; - // parameter groups - std::map> groups_; -}; - - -} -} -} - -#endif diff --git a/include/triton/codegen/analysis/layout.h b/include/triton/codegen/analysis/layout.h new file mode 100644 index 000000000..3bc1f2f6a --- /dev/null +++ b/include/triton/codegen/analysis/layout.h @@ -0,0 +1,57 @@ +#ifndef _TRITON_CODEGEN_ANALYSIS_GRID_H_ +#define _TRITON_CODEGEN_ANALYSIS_GRID_H_ + +#include +#include +#include +#include + +namespace triton{ + +namespace ir{ + class value; + class module; + class instruction; +} + +namespace codegen{ +namespace analysis{ + +class axes; + +class layout { + typedef ir::value* node_t; + typedef std::map > graph_t; + +private: + // connected components + void connected_components(node_t x, std::set &nodes, graph_t &graph, unsigned id); + // list the axes of the given value + std::set axes_of(ir::value *value); + +public: + // constructor + layout(analysis::axes *axes); + // run the passes + void run(ir::module &mod); + // get the layout ID of the given value + unsigned id(ir::value *value) const; + // get the values associates with the given ID + const std::vector& values(unsigned id) const; + // get number of groups + size_t get_num_groups() const; + +private: + analysis::axes* axes_; + graph_t dependencies_; + std::set nodes_; + std::map groups_; + std::map> values_; +}; + +} +} + +} + +#endif diff --git a/include/triton/codegen/analysis/memalloc.h b/include/triton/codegen/analysis/memalloc.h index 91cf89123..f50d00b22 100644 --- a/include/triton/codegen/analysis/memalloc.h +++ b/include/triton/codegen/analysis/memalloc.h @@ -15,15 +15,15 @@ namespace ir{ namespace codegen{ namespace analysis{ -class grids; +class tiles; class liveness; class meminfo; class memalloc { public: - memalloc(liveness *live, meminfo *buffer_info, grids *params) - : liveness_(live), buffer_info_(buffer_info), params_(params){ } + memalloc(liveness *live, meminfo *buffer_info, tiles *params) + : liveness_(live), buffer_info_(buffer_info), tiles_(params){ } // utilities unsigned num_bytes(ir::value *x); unsigned is_ld_padded(ir::value* x); @@ -40,7 +40,7 @@ private: // dependences liveness *liveness_; meminfo *buffer_info_; - grids *params_; + tiles *tiles_; }; } diff --git a/include/triton/codegen/analysis/tiles.h 
b/include/triton/codegen/analysis/tiles.h new file mode 100644 index 000000000..a9387cb5c --- /dev/null +++ b/include/triton/codegen/analysis/tiles.h @@ -0,0 +1,68 @@ +#ifndef _TRITON_CODEGEN_ANALYSIS_TILES_H_ +#define _TRITON_CODEGEN_ANALYSIS_TILES_H_ + +#include +#include +#include +#include + +namespace triton{ + +namespace ir{ + class value; + class module; + class instruction; + class function; + class metaparameter; + class constant_int; +} + +namespace codegen{ + +namespace transform{ +class coalesce; +} + +namespace analysis{ + +class axes; +class layout; + +class tiles { + typedef std::map> param_map_t; +private: + void init_hmma_tile(ir::value *i); + void init_scanline_tile(ir::value *i); + +public: + tiles(size_t num_warps, transform::coalesce* coalesce, analysis::axes* axes, analysis::layout* layout); + void run(ir::module &mod); + bool hmma(ir::value *value); + int mts(ir::value *value, unsigned ax); + int nts(ir::value *value, unsigned ax); + int fpw(ir::value *value, unsigned ax); + int wpt(ir::value *value, unsigned ax); + const std::map& largest(); + +private: + // dependencies + analysis::layout* layout_; + analysis::axes* axes_; + transform::coalesce* coalesce_; + // number of warps + size_t num_warps_; + // tile properties + std::map hmma_; + std::map largest_; + std::map fpw_; + std::map wpt_; + std::map mts_; + std::map nts_; +}; + + +} +} +} + +#endif diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index b4d2e3344..ba92843a4 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -43,10 +43,12 @@ namespace triton{ namespace codegen{ namespace analysis{ -class grids; +class tiles; class align; class memalloc; class meminfo; +class axes; +class layout; } namespace transform{ @@ -199,8 +201,12 @@ private: public: - selection(analysis::memalloc *alloc, analysis::grids *params, analysis::meminfo *buffer_info, analysis::align *alignment, transform::coalesce* reorder, target *tgt, unsigned num_warps) - : alloc_(alloc), params_(params), buffer_info_(buffer_info), alignment_(alignment), reorder_(reorder), tgt_(tgt), num_warps_(num_warps){ } + selection(analysis::memalloc *alloc, analysis::tiles *tiles, analysis::meminfo *buffer_info, + analysis::align *alignment, analysis::axes *axes, analysis::layout *layouts, + transform::coalesce* reorder, target *tgt, unsigned num_warps) + : alloc_(alloc), tiles_(tiles), buffer_info_(buffer_info), + alignment_(alignment), a_axes_(axes), layouts_(layouts), + reorder_(reorder), tgt_(tgt), num_warps_(num_warps){ } void run(ir::module &src, Module &dst); @@ -208,7 +214,9 @@ private: vmap_t vmap_; tmap_t tmap_; analysis::memalloc *alloc_; - analysis::grids *params_; + analysis::tiles *tiles_; + analysis::axes *a_axes_; + analysis::layout *layouts_; analysis::meminfo *buffer_info_; analysis::align *alignment_; transform::coalesce *reorder_; diff --git a/include/triton/codegen/transform/reassociate.h b/include/triton/codegen/transform/reassociate.h index 318884755..d7e33c9a2 100644 --- a/include/triton/codegen/transform/reassociate.h +++ b/include/triton/codegen/transform/reassociate.h @@ -19,7 +19,7 @@ class getelementptr_inst; namespace codegen{ namespace analysis{ -class grids; +class tiles; class align; } @@ -37,11 +37,10 @@ private: ir::value *reassociate_ptr(ir::getelementptr_inst* pz, ir::builder &builder, std::map &offsets); public: - reassociate(analysis::align* align, analysis::grids *params); + reassociate(analysis::align* align); void run(ir::module& module); 
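   // splits pointer arithmetic into a dynamic base plus a static offset,
   // e.g. p + (x + 4) becomes (p + x) + 4, keeping constant offsets visible to later passes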
private: - analysis::grids* params_; analysis::align* align_; }; diff --git a/include/triton/codegen/transform/vectorize.h b/include/triton/codegen/transform/vectorize.h index bf08eb46f..0a6571b61 100644 --- a/include/triton/codegen/transform/vectorize.h +++ b/include/triton/codegen/transform/vectorize.h @@ -10,18 +10,18 @@ namespace ir { namespace codegen{ namespace analysis{ - class grids; + class tiles; } namespace transform{ class vectorize { public: - vectorize(analysis::grids *params): params_(params){} + vectorize(analysis::tiles *params): params_(params){} void run(ir::module &mod); private: - analysis::grids *params_; + analysis::tiles *params_; }; } diff --git a/include/triton/ir/type.h b/include/triton/ir/type.h index aee2ecc42..4b67d9e94 100644 --- a/include/triton/ir/type.h +++ b/include/triton/ir/type.h @@ -63,6 +63,7 @@ public: unsigned get_primitive_size_in_bits() const; type *get_scalar_ty() const; const tile_shapes_t& get_tile_shapes() const; + const size_t get_tile_rank() const; unsigned get_tile_num_elements() const; type *get_tile_element_ty() const; unsigned get_pointer_address_space() const; diff --git a/include/triton/runtime/function.h b/include/triton/runtime/function.h index 96ec35ef7..9d04cad78 100644 --- a/include/triton/runtime/function.h +++ b/include/triton/runtime/function.h @@ -11,7 +11,7 @@ // codegen #include "triton/codegen/selection.h" #include "triton/codegen/target.h" -#include "triton/codegen/analysis/grid.h" +#include "triton/codegen/analysis/tiles.h" #include "triton/codegen/analysis/memalloc.h" #include "triton/codegen/analysis/liveness.h" #include "triton/codegen/analysis/meminfo.h" @@ -45,7 +45,7 @@ class translation_unit; namespace codegen{ namespace analysis{ -class grids; +class tiles; } } diff --git a/lib/codegen/analysis/axes.cc b/lib/codegen/analysis/axes.cc new file mode 100644 index 000000000..99fc59234 --- /dev/null +++ b/lib/codegen/analysis/axes.cc @@ -0,0 +1,166 @@ +#include "triton/codegen/analysis/axes.h" +#include "triton/ir/instructions.h" +#include "triton/ir/type.h" +#include "triton/ir/module.h" +#include "triton/ir/function.h" +#include "triton/ir/context_impl.h" +#include "triton/ir/constant.h" +#include "triton/driver/device.h" + + + +namespace triton{ +namespace codegen{ +namespace analysis{ + +axes::axes() {} + +void axes::add_constraint(node_t x, node_t y) { + size_t shape_x = 1; + size_t shape_y = 1; + if(x.first->get_type()->is_tile_ty()) + shape_x = x.first->get_type()->get_tile_shapes()[x.second]; + if(y.first->get_type()->is_tile_ty()) + shape_y = y.first->get_type()->get_tile_shapes()[y.second]; + if(shape_x == 1 && shape_y == 1) + return; + dependencies_[x].insert(y); + dependencies_[y].insert(x); + nodes_.insert(x); + nodes_.insert(y); +} + +void axes::init_c_graph(ir::instruction *v) { + // Reference shape + ir::type::tile_shapes_t shapes; + if(auto *store = dynamic_cast(v)) + shapes = store->get_pointer_operand()->get_type()->get_tile_shapes(); + else if(auto *atom = dynamic_cast(v)) + shapes = atom->get_operand(0)->get_type()->get_tile_shapes(); + else if(dynamic_cast(v)) + return; + else if(dynamic_cast(v)) + return; + else if(auto *reduce = dynamic_cast(v)) { + unsigned axis = reduce->get_axis(); + ir::value *arg = reduce->get_operand(0); + auto in_shapes = arg->get_type()->get_tile_shapes(); + unsigned current = 0; + for(unsigned i = 0; i < in_shapes.size(); i++){ + if(i == axis) + continue; + add_constraint({reduce, current++}, {arg, i}); + } + return; + } + else + shapes = 
v->get_type()->get_tile_shapes(); + // Reshape + if(dynamic_cast(v)) { + ir::value *op = v->get_operand(0); + auto op_shapes = op->get_type()->get_tile_shapes(); + unsigned current = 0; + bool is_skewed = false; + for(unsigned i = 0; i < shapes.size(); i ++){ + if(shapes[i] == 1){ + add_constraint({v, i}, {v, i}); + } + else if(!is_skewed && + shapes[i] == op_shapes[current]) + add_constraint({v, i}, {op, current++}); + else{ + is_skewed = true; + add_constraint({v, i}, {v, i}); + } + } + } + // Splat + else if(dynamic_cast(v)){ + return; + } + // Trans + else if(auto *x = dynamic_cast(v)){ + ir::value *op = v->get_operand(0); + auto perm = x->get_perm(); + for(unsigned i = 0; i < perm.size(); i++) + add_constraint({v, perm[i]->get_value()}, {op, i}); + } + // Broadcast + else if(dynamic_cast(v)){ + ir::value *op = v->get_operand(0); + ir::type *op_ty = op->get_type(); + const auto& op_shapes = op_ty->get_tile_shapes(); + for(unsigned i = 0; i < shapes.size(); i ++){ + if(op_shapes[i] == shapes[i] && v != op) + add_constraint({v, i}, {op, i}); + } + } + // Matrix multiplication + else if(dynamic_cast(v)){ + ir::value *A = v->get_operand(0); + ir::value *B = v->get_operand(1); + ir::value *D = v->get_operand(2); + for(unsigned i = 0; i < shapes.size(); i++) + add_constraint({v, i}, {D, i}); + for(unsigned i = 2; i < shapes.size(); i++){ + add_constraint({v, i}, {A, i}); + add_constraint({v, i}, {B, i}); + } + } + // Element-wise + else if(dynamic_cast(v)) { + for(unsigned i = 0; i < shapes.size(); i ++){ + std::vector ops = v->ops(); + for(ir::value* op: ops) + add_constraint({v, i}, {op, i}); + } + } +} + +void axes::connected_components(node_t x, std::set &nodes, graph_t &graph, unsigned group_id) { + groups_[x.first].insert({x.second, group_id}); + if(nodes.find(x) != nodes.end()){ + nodes.erase(x); + for(const node_t &y: graph[x]) + connected_components(y, nodes, graph, group_id); + } +} + +unsigned axes::get(ir::value *value, unsigned ax) { + unsigned result = groups_.at(value).at(ax); + return result; +} + +bool axes::has(ir::value *value, unsigned ax) { + auto it = groups_.find(value); + if(it == groups_.end()) + return false; + auto iit = it->second.find(ax); + if(iit == it->second.end()) + return false; + return true; +} + + +void axes::run(ir::module &mod) { + nodes_.clear(); + dependencies_.clear(); + groups_.clear(); + // Create graph + for(ir::function *fn: mod.get_function_list()){ + // Build constraints graph + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *i : block->get_inst_list()) + if(i->has_tile_result_or_op()) + init_c_graph(i); + } + // Axes + unsigned group_id = 0; + while(!nodes_.empty()) + connected_components(*nodes_.begin(), nodes_, dependencies_, group_id++); +} + +} +} + +} diff --git a/lib/codegen/analysis/grid.cc b/lib/codegen/analysis/grid.cc deleted file mode 100644 index 1e6de0de4..000000000 --- a/lib/codegen/analysis/grid.cc +++ /dev/null @@ -1,367 +0,0 @@ -#include -#include -#include "triton/codegen/transform/coalesce.h" -#include "triton/codegen/analysis/grid.h" -#include "triton/ir/instructions.h" -#include "triton/ir/type.h" -#include "triton/ir/module.h" -#include "triton/ir/function.h" -#include "triton/ir/context_impl.h" -#include "triton/ir/constant.h" -#include "triton/driver/device.h" - - - -namespace triton{ -namespace codegen{ -namespace analysis{ - -grids::grids(size_t num_warps, transform::coalesce *reorder): num_warps_(num_warps), coalesce_(reorder) -{ } - -bool is_hmma(ir::value *v){ - bool result = false; - if(auto *x = 
dynamic_cast(v)){ - ir::value *a = x->get_operand(0); - ir::type *a_ty = a->get_type(); - ir::value *b = x->get_operand(1); - ir::type *b_ty = b->get_type(); - // inputs have to be FP16 - result = a_ty->get_scalar_ty()->is_half_ty() && b_ty->get_scalar_ty()->is_half_ty(); - // reduction has to be multiple of 4: TODO - } - return result; -} - -void grids::add_constraint(node_t x, node_t y) { - dependencies_[x].insert(y); - dependencies_[y].insert(x); - nodes_.insert(x); - nodes_.insert(y); -} - -void grids::init_c_phi(ir::instruction *v) { - // Phi Nodes: all the incoming value share the result layout - if(auto *phi = dynamic_cast(v)) - for(ir::value *op: phi->ops()) - for(unsigned k = 0; k < phi->get_type()->get_tile_shapes().size(); k++) - if(dependencies_.find({op, k}) != dependencies_.end() - || dependencies_.find({phi, k}) != dependencies_.end()){ - add_constraint({phi, k}, {op, k}); - } -} - -void grids::init_c_graph(ir::instruction *v) { - // Reference shape - ir::type::tile_shapes_t shapes; - if(auto *store = dynamic_cast(v)) - shapes = store->get_pointer_operand()->get_type()->get_tile_shapes(); - else if(auto *atom = dynamic_cast(v)) - shapes = atom->get_operand(0)->get_type()->get_tile_shapes(); - else if(dynamic_cast(v)) - return; - else if(dynamic_cast(v)) - return; - else if(auto *reduce = dynamic_cast(v)) { - unsigned axis = reduce->get_axis(); - ir::value *arg = reduce->get_operand(0); - auto in_shapes = arg->get_type()->get_tile_shapes(); - unsigned current = 0; - for(unsigned i = 0; i < in_shapes.size(); i++){ - if(i == axis) - continue; - add_constraint({reduce, current++}, {arg, i}); - } - return; - } - else - shapes = v->get_type()->get_tile_shapes(); - // Reshape - if(dynamic_cast(v)) { - ir::value *op = v->get_operand(0); - auto op_shapes = op->get_type()->get_tile_shapes(); - unsigned current = 0; - bool is_skewed = false; - for(unsigned i = 0; i < shapes.size(); i ++){ - if(shapes[i] == 1){ - add_constraint({v, i}, {v, i}); - } - else if(!is_skewed && - shapes[i] == op_shapes[current]) - add_constraint({v, i}, {op, current++}); - else{ - is_skewed = true; - add_constraint({v, i}, {v, i}); - } - } - } - // Splat - else if(dynamic_cast(v)){ - return; - } - // Trans - else if(auto *x = dynamic_cast(v)){ - ir::value *op = v->get_operand(0); - auto perm = x->get_perm(); - for(unsigned i = 0; i < perm.size(); i++) - add_constraint({v, perm[i]->get_value()}, {op, i}); - } - // Broadcast - else if(dynamic_cast(v)){ - ir::value *op = v->get_operand(0); - ir::type *op_ty = op->get_type(); - const auto& op_shapes = op_ty->get_tile_shapes(); - for(unsigned i = 0; i < shapes.size(); i ++){ - if(op_shapes[i] == shapes[i] && v != op) - add_constraint({v, i}, {op, i}); - } - } - // Matrix multiplication - else if(dynamic_cast(v)){ - ir::value *A = v->get_operand(0); - ir::value *B = v->get_operand(1); - ir::value *D = v->get_operand(2); - for(unsigned i = 0; i < shapes.size(); i++) - add_constraint({v, i}, {D, i}); - for(unsigned i = 2; i < shapes.size(); i++){ - add_constraint({v, i}, {A, i}); - add_constraint({v, i}, {B, i}); - } - } - // Element-wise - else if(dynamic_cast(v)) { - for(unsigned i = 0; i < shapes.size(); i ++){ - std::vector ops = v->ops(); - for(ir::value* op: ops) - add_constraint({v, i}, {op, i}); - } - } -} - -grids::fragment_t grids::get_fragmentation_type(node_t x, graph_t &graph){ - std::list work; - std::set seen; - work.push_back(x); - while(!work.empty()){ - node_t current = work.back(); - if(is_hmma(current.first)) - return HMMA_FRAGMENT_C; - 
work.pop_back(); - seen.insert(current); - for(node_t y: graph[current]){ - if(seen.find(y) == seen.end()) - work.push_back(y); - } - } - return STRIDED_SCAN; -} - -void grids::connected_components(node_t x, const std::vector& ptr_vec, const std::vector& maps, - std::set &nodes, graph_t &graph, unsigned group_id) { - groups_[x.first].insert({x.second, group_id}); - if(nodes.find(x) != nodes.end()){ - nodes.erase(x); - for(unsigned i = 0; i < ptr_vec.size(); i++) - (*maps[i])[x.first][x.second] = ptr_vec[i]; - for(const node_t &y: graph[x]) - connected_components(y, ptr_vec, maps, nodes, graph, group_id); - } -} - -unsigned grids::group_of(ir::value *value, unsigned ax) { - unsigned result = groups_.at(value).at(ax); - return result; -} - -grids::fragment_t grids::fragment_of(ir::value *value, unsigned ax) { - return fragments_.at({value, ax}); -} - - -//TODO: This shouldn't exist! -void grids::copy(ir::value *dst, ir::value *src) { - mts_[dst] = mts_[src]; - nts_[dst] = nts_[src]; - fpw_[dst] = fpw_[src]; - wpt_[dst] = wpt_[src]; - groups_[dst] = groups_[src]; - fragments_[{dst, 0}] = fragments_[{src, 0}]; -} - - -void grids::run(ir::module &mod) { - // Create tiling parameters - for(ir::function *fn: mod.get_function_list()){ - // Build constraints graph - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *i : block->get_inst_list()) - if(i->has_tile_result_or_op()) - init_c_graph(i); - // Build phi constraints - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *i : block->get_inst_list()) - if(i->has_tile_result_or_op()) - init_c_phi(i); - // Layout parameters - unsigned group_id = 0; - for(auto x: nodes_) - fragments_[x] = get_fragmentation_type(x, dependencies_); - while(!nodes_.empty()) { - node_t node = *nodes_.begin(); - if(fragments_[node] == STRIDED_SCAN) { - param_ptr_t nts(new int(-1)); - param_ptr_t mts(new int(-1)); - connected_components(node, {nts, mts}, {&nts_, &mts_}, nodes_, dependencies_, group_id++); - } - else { - param_ptr_t fpw(new int(-1)); - param_ptr_t wpt(new int(-1)); - connected_components(node, {fpw, wpt}, {&fpw_, &wpt_}, nodes_, dependencies_, group_id++); - } - } - } - - for(ir::function *fn: mod.get_function_list()){ - std::map references; - create_grids(grids_, references, fn); - } - - - unsigned num_threads = num_warps_*32; - auto clamp = [&](unsigned x, unsigned lo, unsigned hi) { return std::min(std::max(x, lo), hi); }; - - for(ir::value *i: grids_){ - if(!i->get_type()->is_tile_ty()) - continue; - auto order = coalesce_->get_order(i); - auto shapes = i->get_type()->get_tile_shapes(); - unsigned size = i->get_type()->get_tile_num_elements(); - /* HMMA parameters*/ - if(fragments_.at({i, 0}) == HMMA_FRAGMENT_C){ - unsigned shape_0 = shapes[order[0]]; - unsigned shape_1 = shapes[order[1]]; - /* fragments per warp */ - // try to make things as square as possible to maximize data re-use - std::vector fpw = {1, 1, 1}; - std::vector fpw_nm1; - unsigned num_fragments = std::min((shape_0/8)*(shape_1/8), 4); - do { - fpw_nm1 = fpw; - if(fpw[0]*fpw[1] < num_fragments) - fpw[0] = clamp(fpw[0]*2, 1, shape_0 / 8); - if(fpw[0]*fpw[1] < num_fragments) - fpw[1] = clamp(fpw[1]*2, 1, shape_1 / 8); - }while(fpw_nm1 != fpw); - // store parameters - for(unsigned d = 0; d < shapes.size(); d++) - *fpw_[i][d] = fpw[d]; - /* warps per tile */ - // try to make things as square as possible to maximize data re-use - std::vector wpt = {1, 1, 1}; - std::vector wpt_nm1; - do{ - wpt_nm1 = wpt; - if(wpt[0] * wpt[1] * wpt[2] < num_warps_) - wpt[0] = 
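/* The growth loop being defined here, in isolation: fragments-per-warp and
   warps-per-tile are both chosen by doubling one dimension at a time,
   keeping the product as square as possible, until the target is reached
   or every dimension hits its cap (one HMMA fragment covers 8 elements per
   dimension, hence the /8 in the caps). `grow_square` is a hypothetical
   name for the shared pattern:

     #include <algorithm>
     #include <vector>

     unsigned clamp(unsigned x, unsigned lo, unsigned hi) {
       return std::min(std::max(x, lo), hi);
     }

     std::vector<unsigned> grow_square(unsigned target,
                                       unsigned cap0, unsigned cap1) {
       std::vector<unsigned> r = {1, 1};
       std::vector<unsigned> prev;
       do {
         prev = r;
         if(r[0] * r[1] < target) r[0] = clamp(r[0] * 2, 1, cap0);
         if(r[0] * r[1] < target) r[1] = clamp(r[1] * 2, 1, cap1);
       } while(prev != r);
       return r;
     }

     // e.g. a 128x128 tile, num_warps = 4, fpw = {1, 1}:
     // grow_square(4, 16, 16) -> {2, 2}, i.e. a square 2x2 grid of warps.
*/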
clamp(wpt[0]*2, 1, shape_0 / (fpw[0]*8)); - if(wpt[0] * wpt[1] * wpt[2] < num_warps_) - wpt[1] = clamp(wpt[1]*2, 1, shape_1 / (fpw[1]*8)); - }while(wpt_nm1 != wpt); - // store parameters - for(unsigned d = 0; d < shapes.size(); d++) - *wpt_[i][d] = wpt[d]; - /* sanity check */ - unsigned effective_num_warps = 1; - for(size_t d = 0; d < shapes.size(); d++) - effective_num_warps *= *wpt_[i][d]; - if(num_warps_ != effective_num_warps) - throw std::runtime_error("cannot create a kernel with this amount of warps"); - } - - /* Scan-line */ - else{ - unsigned ld = order[0]; - unsigned current = num_threads; - *nts_[i][ld] = clamp(size / num_threads, 1, 4); - *mts_[i][ld] = clamp(current, 1, shapes[ld] / *nts_[i][ld]); - current = current / *mts_[i][ld]; - for(size_t d = 1; d < shapes.size(); d++){ - ld = order[d]; - *nts_[i][ld] = 1; - *mts_[i][ld] = clamp(current, 1, shapes[ld]); - current = current / *mts_[i][ld]; - } - /* sanity check */ - unsigned effective_num_threads = 1; - for(size_t d = 0; d < shapes.size(); d++) - effective_num_threads *= *mts_[i][d]; - if(num_threads != effective_num_threads) - throw std::runtime_error("cannot create a kernel with this amount of warps"); - } - } - -} - - -void grids::create_grids(std::vector &grids, - std::map &references, - ir::function *fn) { - // get number of dimensions greater than 1 - auto get_tile_gt1_dim = [&](ir::value *v){ - unsigned result = 0; - for(auto shape: v->get_type()->get_tile_shapes()) { - result += (shape > 1)? shape : 0; - } - return result; - }; - // bind references - std::set seen; - std::function bind_references = [&](ir::value *v) - { - // skip - if(!v->get_type()->is_tile_ty() || !seen.insert(v).second) - return; - // recurse - if(auto *user = dynamic_cast(v)) - for(ir::value *op: user->ops()) - bind_references(op); - // bind - const auto& shapes = v->get_type()->get_tile_shapes(); - for(size_t d = 0; d < shapes.size(); d++){ - if(shapes[d] == 1) - continue; - unsigned x = group_of(v, d); - ir::value *&r = references[x]; - if(!r || get_tile_gt1_dim(v) > get_tile_gt1_dim(r)) - r = v; - } - }; - - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *i: block->get_inst_list()) - bind_references(i); - - // create grid - for(auto &ref: references) - if(std::find(grids.begin(), grids.end(), ref.second) == grids.end()) - grids.push_back(ref.second); -} - -int grids::mts(ir::value *value, unsigned ax) { - return *mts_.at(value).at(ax); -} - -int grids::nts(ir::value *value, unsigned ax) { - return *nts_.at(value).at(ax); -} - -int grids::fpw(ir::value *value, unsigned ax) { - return *fpw_.at(value).at(ax); -} - -int grids::wpt(ir::value *value, unsigned ax) { - return *wpt_.at(value).at(ax); -} - -} -} -} diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc new file mode 100644 index 000000000..a6eade0b2 --- /dev/null +++ b/lib/codegen/analysis/layout.cc @@ -0,0 +1,96 @@ +#include +#include +#include "triton/codegen/analysis/axes.h" +#include "triton/codegen/analysis/layout.h" +#include "triton/ir/function.h" +#include "triton/ir/module.h" + +namespace triton{ +namespace codegen{ +namespace analysis{ + + +// axes +std::set layout::axes_of(ir::value *value) { + auto ty = value->get_type(); + // rank of value + size_t rank = 0; + if(ty->is_tile_ty()) + rank = ty->get_tile_rank(); + // create result + std::set result; + for(size_t d = 0; d < rank; d++){ + if(axes_->has(value, d)) + result.insert(axes_->get(value, d)); + } + return result; +} + +// connected components +void 
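/* The grouping rule implemented in layout::run below: two tile values land
   in the same layout group iff they share at least one distributed axis
   id, which the pass tests with a set intersection over the axis sets
   returned by axes_of. `share_layout` is a hypothetical name for that
   test:

     #include <algorithm>
     #include <iterator>
     #include <set>

     bool share_layout(const std::set<unsigned> &a,
                       const std::set<unsigned> &b) {
       std::set<unsigned> common;
       std::set_intersection(a.begin(), a.end(), b.begin(), b.end(),
                             std::inserter(common, common.begin()));
       return !common.empty();
     }
*/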
layout::connected_components(node_t x, std::set &nodes, graph_t &graph, unsigned group_id) { + groups_[x] = group_id; + values_[group_id].push_back(x); + if(nodes.find(x) != nodes.end()){ + nodes.erase(x); + for(const node_t &y: graph[x]) + connected_components(y, nodes, graph, group_id); + } +} + +// constructor +layout::layout(analysis::axes *axes) + : axes_(axes) { } + +// get group id +unsigned layout::id(ir::value *value) const +{ return groups_.at(value); } + +// get values +const std::vector& layout::values(unsigned id) const +{ return values_.at(id); } + +// get number of groups +size_t layout::get_num_groups() const +{ return values_.size(); } + +// run +void layout::run(ir::module &mod) { + nodes_.clear(); + dependencies_.clear(); + groups_.clear(); + values_.clear(); + // Create graph + for(ir::function *fn: mod.get_function_list()) + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *i : block->get_inst_list()) { + // skip scalars + if(!i->get_type()->is_tile_ty()) + continue; + // add an edge between i and the operands that share an axis + std::set i_axes = axes_of(i); + nodes_.insert(i); + for(ir::value* op: i->ops()){ + if(!op->get_type()->is_tile_ty()) + continue; + nodes_.insert(op); + std::set op_axes = axes_of(op); + std::set common; + std::set_intersection(i_axes.begin(), i_axes.end(), + op_axes.begin(), op_axes.end(), + std::inserter(common, common.begin())); + if(!common.empty() || !op->get_type()->is_tile_ty()){ + dependencies_[i].insert(op); + dependencies_[op].insert(i); + } + } + } + // Grids + unsigned group_id = 0; + while(!nodes_.empty()){ + connected_components(*nodes_.begin(), nodes_, dependencies_, group_id++); + } +} + +} +} +} diff --git a/lib/codegen/analysis/memalloc.cc b/lib/codegen/analysis/memalloc.cc index 866f9c7a4..7f80824e3 100644 --- a/lib/codegen/analysis/memalloc.cc +++ b/lib/codegen/analysis/memalloc.cc @@ -2,7 +2,7 @@ #include "triton/codegen/analysis/memalloc.h" #include "triton/codegen/analysis/liveness.h" #include "triton/codegen/analysis/meminfo.h" -#include "triton/codegen/analysis/grid.h" +#include "triton/codegen/analysis/tiles.h" #include "triton/ir/basic_block.h" #include "triton/ir/type.h" #include "triton/ir/value.h" @@ -20,7 +20,7 @@ unsigned memalloc::is_ld_padded(ir::value *x) { } for(ir::user* user: x->get_users()) if(auto dot = dynamic_cast(user)){ - bool is_hmma = params_->fragment_of(user, 0) == grids::HMMA_FRAGMENT_C; + bool is_hmma = tiles_->hmma(user); bool is_op_0 = x == dot->get_operand(0); bool is_op_1 = x == dot->get_operand(1); if(is_hmma && is_op_0){ @@ -56,10 +56,10 @@ unsigned memalloc::num_bytes(ir::value *x) { for(auto x: shapes) num_elements *= x; size_t depth; - if(params_->fragment_of(x, 0) == grids::HMMA_FRAGMENT_C) - depth = params_->wpt(op, axis); + if(tiles_->hmma(x)) + depth = tiles_->wpt(op, axis); else - depth = params_->mts(op, axis); + depth = tiles_->mts(op, axis); return num_elements * num_bytes * depth; } unsigned num_bytes = x->get_type()->get_primitive_size_in_bits() / 8; diff --git a/lib/codegen/analysis/tiles.cc b/lib/codegen/analysis/tiles.cc new file mode 100644 index 000000000..7b8505ff7 --- /dev/null +++ b/lib/codegen/analysis/tiles.cc @@ -0,0 +1,176 @@ +#include +#include +#include "triton/codegen/analysis/axes.h" +#include "triton/codegen/analysis/tiles.h" +#include "triton/codegen/analysis/layout.h" +#include "triton/codegen/transform/coalesce.h" +#include "triton/ir/instructions.h" +#include "triton/ir/type.h" +#include "triton/ir/module.h" +#include 
"triton/ir/function.h" +#include "triton/ir/context_impl.h" +#include "triton/ir/constant.h" +#include "triton/driver/device.h" + + + +namespace triton{ +namespace codegen{ +namespace analysis{ + +tiles::tiles(size_t num_warps, transform::coalesce *reorder, analysis::axes *axes, analysis::layout *layout): + num_warps_(num_warps), coalesce_(reorder), axes_(axes), layout_(layout) +{ } + +bool is_hmma(ir::value *v){ + bool result = false; + if(auto *x = dynamic_cast(v)){ + ir::value *a = x->get_operand(0); + ir::type *a_ty = a->get_type(); + ir::value *b = x->get_operand(1); + ir::type *b_ty = b->get_type(); + result = a_ty->get_scalar_ty()->is_half_ty() && + b_ty->get_scalar_ty()->is_half_ty(); + } + return result; +} + + + +bool tiles::hmma(ir::value *value) { + return hmma_.at(layout_->id(value)); +} + +int tiles::mts(ir::value *value, unsigned ax) { + return mts_.at(axes_->get(value, ax)); +} + +int tiles::nts(ir::value *value, unsigned ax) { + return nts_.at(axes_->get(value, ax)); +} + +int tiles::fpw(ir::value *value, unsigned ax) { + return fpw_.at(axes_->get(value, ax)); +} + +int tiles::wpt(ir::value *value, unsigned ax) { + return wpt_.at(axes_->get(value, ax)); +} + +const std::map& tiles::largest() { + return largest_; +} + + +unsigned clamp(unsigned x, unsigned lo, unsigned hi) { + return std::min(std::max(x, lo), hi); +} + + +void tiles::init_hmma_tile(ir::value *i) { + auto order = coalesce_->get_order(i); + auto shapes = i->get_type()->get_tile_shapes(); + unsigned shape_0 = shapes[order[0]]; + unsigned shape_1 = shapes[order[1]]; + /* fragments per warp */ + // try to make things as square as possible to maximize data re-use + std::vector fpw = {1, 1, 1}; + std::vector fpw_nm1; + unsigned num_fragments = std::min((shape_0/8)*(shape_1/8), 4); + do { + fpw_nm1 = fpw; + if(fpw[0]*fpw[1] < num_fragments) + fpw[0] = clamp(fpw[0]*2, 1, shape_0 / 8); + if(fpw[0]*fpw[1] < num_fragments) + fpw[1] = clamp(fpw[1]*2, 1, shape_1 / 8); + }while(fpw_nm1 != fpw); + // store parameters + for(unsigned d = 0; d < shapes.size(); d++) + fpw_[axes_->get(i, d)] = fpw[d]; + /* warps per tile */ + // try to make things as square as possible to maximize data re-use + std::vector wpt = {1, 1, 1}; + std::vector wpt_nm1; + do{ + wpt_nm1 = wpt; + if(wpt[0] * wpt[1] * wpt[2] < num_warps_) + wpt[0] = clamp(wpt[0]*2, 1, shape_0 / (fpw[0]*8)); + if(wpt[0] * wpt[1] * wpt[2] < num_warps_) + wpt[1] = clamp(wpt[1]*2, 1, shape_1 / (fpw[1]*8)); + }while(wpt_nm1 != wpt); + // store parameters + for(unsigned d = 0; d < shapes.size(); d++) + wpt_[axes_->get(i, d)] = wpt[d]; + /* sanity check */ + unsigned effective_num_warps = 1; + for(size_t d = 0; d < shapes.size(); d++) + effective_num_warps *= wpt_[axes_->get(i, d)]; + if(num_warps_ != effective_num_warps) + throw std::runtime_error("cannot create a kernel with this amount of warps"); +} + +void tiles::init_scanline_tile(ir::value *i) { + auto order = coalesce_->get_order(i); + auto shapes = i->get_type()->get_tile_shapes(); + unsigned size = i->get_type()->get_tile_num_elements(); + unsigned ld = order[0]; + unsigned num_threads = num_warps_*32; + unsigned current = num_threads; + nts_[axes_->get(i, ld)] = clamp(size / num_threads, 1, 4); + mts_[axes_->get(i, ld)] = clamp(current, 1, shapes[ld] / nts_[axes_->get(i, ld)]); + current = current / mts_[axes_->get(i, ld)]; + for(size_t d = 1; d < shapes.size(); d++){ + ld = order[d]; + nts_[axes_->get(i, ld)] = 1; + mts_[axes_->get(i, ld)] = clamp(current, 1, shapes[ld]); + current = current / mts_[axes_->get(i, 
ld)]; + } + /* sanity check */ + unsigned effective_num_threads = 1; + for(size_t d = 0; d < shapes.size(); d++) + effective_num_threads *= mts_[axes_->get(i, d)]; + if(num_threads != effective_num_threads) + throw std::runtime_error("cannot create a kernel with this amount of warps"); +} + +void tiles::run(ir::module &) { + hmma_.clear(); + largest_.clear(); + size_t num_groups = layout_->get_num_groups(); + // find out which groups require hmma layout + for(size_t i = 0; i < num_groups; i++) { + const auto& values = layout_->values(i); + hmma_[i] = std::any_of(values.begin(), values.end(), &is_hmma); + } + // find out which value is the largest in each group +// std::vector axes; + for(size_t i = 0; i < num_groups; i++) { + const auto& values = layout_->values(i); + auto rank = [](ir::value* v) { + ir::type *ty = v->get_type(); + size_t ret = 0; + if(ty->is_tile_ty()) + for(int s: ty->get_tile_shapes()) + ret += s > 1; + return ret; + }; + auto cmp = [&rank](ir::value* x, ir::value *y) { return rank(x) < rank(y); }; + largest_[i] = *std::max_element(values.begin(), values.end(), cmp); + } + + // tiling parameters + for(auto x: largest_){ + ir::value *i = x.second; + if(!i->get_type()->is_tile_ty()) + continue; + /* HMMA parameters*/ + if(hmma_[x.first]) + init_hmma_tile(i); + else + init_scanline_tile(i); + } +} + +} +} +} diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index e78228070..62b68e78a 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -1,6 +1,8 @@ #include "triton/codegen/selection.h" #include "triton/codegen/target.h" -#include "triton/codegen/analysis/grid.h" +#include "triton/codegen/analysis/layout.h" +#include "triton/codegen/analysis/axes.h" +#include "triton/codegen/analysis/tiles.h" #include "triton/codegen/analysis/memalloc.h" #include "triton/codegen/analysis/align.h" #include "triton/codegen/transform/coalesce.h" @@ -584,8 +586,8 @@ void selection::init_strided_scan_axes(ir::value *v, IRBuilder<> &builder, Value std::vector warp_size(dim); std::vector n_warps(dim); for(unsigned i = 0; i < shapes.size(); i++){ - contiguous[i] = params_->nts(v, i); - block_size[i] = params_->mts(v, i); + contiguous[i] = tiles_->nts(v, i); + block_size[i] = tiles_->mts(v, i); } to_warps(block_size, order, n_warps, warp_size); std::vector thread_id_in_warp = delinearize(u_thread_id, order, warp_size, builder); @@ -604,7 +606,7 @@ void selection::init_strided_scan_axes(ir::value *v, IRBuilder<> &builder, Value unsigned offset = n / contiguous[k] * per_block + n % contiguous[k]; idx_list[n] = builder.CreateAdd(scaled_thread_id, builder.getInt32(offset), "idx_" + str_k + "_" + std::to_string(n)); } - axes_[params_->group_of(v, k)] = distributed_axis{contiguous[k], idx_list, thread_id}; + axes_[a_axes_->get(v, k)] = distributed_axis{contiguous[k], idx_list, thread_id}; } } @@ -622,13 +624,13 @@ void selection::init_hmma_axes(ir::value *v, IRBuilder<> &builder, Value *u_thre Value *_16 = builder.getInt32(16); // fragments per warp - unsigned fpw_0 = params_->fpw(v, 0); - unsigned fpw_1 = params_->fpw(v, 1); - unsigned fpw_2 = is_batched ? params_->fpw(v, 2) : 1; + unsigned fpw_0 = tiles_->fpw(v, 0); + unsigned fpw_1 = tiles_->fpw(v, 1); + unsigned fpw_2 = is_batched ? tiles_->fpw(v, 2) : 1; // warps per tile - unsigned wpt_0 = params_->wpt(v, 0); - unsigned wpt_1 = params_->wpt(v, 1); - unsigned wpt_2 = is_batched ? params_->wpt(v, 2) : 1; + unsigned wpt_0 = tiles_->wpt(v, 0); + unsigned wpt_1 = tiles_->wpt(v, 1); + unsigned wpt_2 = is_batched ? 
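/* The geometry assumed by init_hmma_axes below, which appears to target
   Volta-style mma.sync (see the inline asm in lower_hmma_dot): one
   fragment spans 8 elements per dimension, a warp tile spans fpw*8, and
   the full tile is covered by repetitions spaced wpt*fpw*8 apart.
   `hmma_geometry` is a hypothetical helper struct:

     struct hmma_geometry {
       unsigned warp_tile_0, warp_tile_1;  // hmma_wts_0 and hmma_wts_1 below
       unsigned stride_0, stride_1;        // distance between repetitions
     };

     hmma_geometry make_geometry(unsigned fpw_0, unsigned fpw_1,
                                 unsigned wpt_0, unsigned wpt_1) {
       hmma_geometry g;
       g.warp_tile_0 = fpw_0 * 8;
       g.warp_tile_1 = fpw_1 * 8;
       g.stride_0 = wpt_0 * g.warp_tile_0;
       g.stride_1 = wpt_1 * g.warp_tile_1;
       return g;
     }

     // e.g. fpw = {2, 2}, wpt = {2, 2}: each warp owns a 16x16 sub-tile and
     // repetitions are spaced 32 apart in both dimensions.
*/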
tiles_->wpt(v, 2) : 1; // hmma warp tile size unsigned hmma_wts_0 = fpw_0 * 8; unsigned hmma_wts_1 = fpw_1 * 8; @@ -709,18 +711,18 @@ void selection::init_hmma_axes(ir::value *v, IRBuilder<> &builder, Value *u_thre /* axes */ - axes_[params_->group_of(v, 0)] = distributed_axis{1, idx_i, warp_id_0}; - axes_[params_->group_of(v, 1)] = distributed_axis{1, idx_j, warp_id_1}; + axes_[a_axes_->get(v, 0)] = distributed_axis{1, idx_i, warp_id_0}; + axes_[a_axes_->get(v, 1)] = distributed_axis{1, idx_j, warp_id_1}; if(is_batched) - axes_[params_->group_of(v, 2)] = distributed_axis{1, idx_z, warp_id_2}; + axes_[a_axes_->get(v, 2)] = distributed_axis{1, idx_z, warp_id_2}; } void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { - if(params_->fragment_of(v, 0) == analysis::grids::STRIDED_SCAN) - init_strided_scan_axes(v, builder, u_thread_id, u_warp_id); - else + if(tiles_->hmma(v)) init_hmma_axes(v, builder, u_thread_id, u_warp_id); + else + init_strided_scan_axes(v, builder, u_thread_id, u_warp_id); } /* ------------------- @@ -780,7 +782,7 @@ void selection::create_distributed_tile(ir::value *v, IRBuilder<> &builder) { std::vector axes(shapes.size()); for(size_t d = 0; d < shapes.size(); d++){ if(shapes[d] > 1){ - unsigned x = params_->group_of(v, d); + unsigned x = a_axes_->get(v, d); axes[d] = axes_.at(x); } else{ @@ -831,8 +833,8 @@ void selection::init_grids(ir::function *fn, IRBuilder<> &builder, Value *sh_mem Value *u_thread_warp_id = builder.CreateURem(u_thread_id, warp_size); Value *u_warp_id = builder.CreateUDiv(u_thread_id, warp_size); // create grid - for(ir::value* i: params_->get()) - init_axes(i, builder, u_thread_warp_id, u_warp_id); + for(auto x: tiles_->largest()) + init_axes(x.second, builder, u_thread_warp_id, u_warp_id); // create tile std::set seen; for(ir::basic_block *block: fn->blocks()) @@ -915,7 +917,7 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, Value *base_ptr = builder.CreateBitCast(sh_mem_ptr_, PointerType::get(res_ty, addr_space)); for(auto& x: partial) { // current element being computed - Value *lane = axes_.at(params_->group_of(op, axis)).thread_id; + Value *lane = axes_.at(a_axes_->get(op, axis)).thread_id; Value *&result = x.second; indices_t write_idx = x.first; write_idx.insert(write_idx.begin() + axis, lane); @@ -928,7 +930,7 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, tgt_->add_barrier(module, builder); builder.CreateStore(result, write_ptr); // build result - unsigned depth = params_->wpt(op, axis); + unsigned depth = tiles_->wpt(op, axis); for(unsigned i = depth/2; i > 0; i >>= 1){ // current indices indices_t current(write_idx.size(), builder.getInt32(0)); @@ -1095,12 +1097,12 @@ void selection::lower_hmma_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn "{$10, $11}, " "{$0, $1, $2, $3, $4, $5, $6, $7};", "=f,=f,=f,=f,=f,=f,=f,=f,r,r,r,r,0,1,2,3,4,5,6,7", false); - unsigned fpw_0 = params_->fpw(dot, 0); - unsigned fpw_1 = params_->fpw(dot, 1); + unsigned fpw_0 = tiles_->fpw(dot, 0); + unsigned fpw_1 = tiles_->fpw(dot, 1); unsigned wts_0 = fpw_0 * 8; unsigned wts_1 = fpw_1 * 8; - unsigned wpt_0 = params_->wpt(dot, 0); - unsigned wpt_1 = params_->wpt(dot, 1); + unsigned wpt_0 = tiles_->wpt(dot, 0); + unsigned wpt_1 = tiles_->wpt(dot, 1); unsigned stride_rep_i = wpt_0 * wts_0; unsigned stride_rep_j = wpt_1 * wts_1; unsigned num_rep_i = shapes[0] / stride_rep_i; @@ -1241,10 +1243,11 @@ void selection::lower_dot(ir::dot_inst *dot, 
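/* Shape of the shared-memory reduction in lower_reduce above: each output
   element has `depth` partial results along the reduced axis (depth is the
   wpt of that axis, assumed here to be a power of two, as the doubling
   heuristic produces), and threads repeatedly fold the upper half onto the
   lower half until one value remains. A scalar model of the same loop,
   with + standing in for the reduction operator:

     #include <vector>

     float tree_reduce(std::vector<float> &partials) {
       for(unsigned i = partials.size() / 2; i > 0; i >>= 1)
         for(unsigned j = 0; j < i; j++)
           partials[j] += partials[j + i];  // a barrier sits here on the GPU
       return partials[0];
     }
*/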
LLVMContext &ctx, Function *fn, IRB if(NK != 1) { shared_tile *TA = (shared_tile*)tmap_.at(A); shared_tile *TB = (shared_tile*)tmap_.at(B); - if(params_->fragment_of(dot, 0) == analysis::grids::STRIDED_SCAN) - lower_scanline_dot(dot, ctx, fn, builder, TC, TA, TB, TD, NK, c_ty, f_mul_add); - else + if(tiles_->hmma(dot)) lower_hmma_dot(dot, ctx, fn, builder, TC, TA, TB, TD, NK); + else + lower_scanline_dot(dot, ctx, fn, builder, TC, TA, TB, TD, NK, c_ty, f_mul_add); + } else { distributed_tile *TA = (distributed_tile*)tmap_.at(A); diff --git a/lib/codegen/transform/reassociate.cc b/lib/codegen/transform/reassociate.cc index f059aba88..8ca89cda2 100644 --- a/lib/codegen/transform/reassociate.cc +++ b/lib/codegen/transform/reassociate.cc @@ -2,7 +2,6 @@ #include #include "triton/codegen/transform/reassociate.h" #include "triton/codegen/analysis/align.h" -#include "triton/codegen/analysis/grid.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" @@ -90,11 +89,6 @@ ir::value *reassociate::reassociate_idx(ir::value *old_value, new_rhs = builder.create_splat(old_rhs, shapes); new_value = builder.create_add(new_lhs, new_rhs, op->get_name()); } - if(new_value != old_value){ - params_->copy(new_value, old_value); - params_->copy(new_lhs, old_value); - params_->copy(new_rhs, old_value); - } } } @@ -127,11 +121,6 @@ ir::value *reassociate::reassociate_idx(ir::value *old_value, if(is_cst(rrhs)) new_value = builder.create_add(rrhs, builder.create_add(lrhs, lhs), name, cst); } - if(new_value != old_value){ - params_->copy(new_value, old_value); - params_->copy(((ir::instruction*)new_value)->get_operand(0), old_value); - params_->copy(((ir::instruction*)new_value)->get_operand(1), old_value); - } } // extract constant and non-constant @@ -156,8 +145,7 @@ ir::value *reassociate::reassociate_idx(ir::value *old_value, return new_value; } -reassociate::reassociate(analysis::align *align, analysis::grids* params) - : params_(params), align_(align) +reassociate::reassociate(analysis::align *align): align_(align) { } @@ -183,9 +171,6 @@ void reassociate::run(ir::module &mod) { ir::value* static_range = ir::make_range_sta::get(old_range); ir::value* new_range = builder.create_add(dyn_range, static_range); old_range->replace_all_uses_with(new_range); - params_->copy(dyn_range, old_range); - params_->copy(static_range, old_range); - params_->copy(new_range, old_range); } } @@ -214,9 +199,6 @@ void reassociate::run(ir::module &mod) { ir::value* ndyn = builder.create_broadcast(dyn, shapes); ir::value* broadcast = builder.create_broadcast(cst, shapes); ir::getelementptr_inst* nsta = (ir::getelementptr_inst*)builder.create_gep(ndyn, {broadcast}); - params_->copy(ndyn, rt); - params_->copy(nsta, rt); - params_->copy(broadcast, rt); infos[rt] = cst_info{ndyn, nsta}; } } @@ -236,8 +218,6 @@ void reassociate::run(ir::module &mod) { builder.set_insert_point(pz); ir::value *dyn_ptr = builder.create_gep(py, {dyn}); ir::value *sta_ptr = builder.create_gep(dyn_ptr, {sta}); - params_->copy(dyn_ptr, pz); - params_->copy(sta_ptr, pz); pz->replace_all_uses_with(sta_ptr); infos[sta_ptr].dyn_ptr = dyn_ptr; infos[sta_ptr].sta_ptr = (ir::getelementptr_inst*)sta_ptr; @@ -252,8 +232,6 @@ void reassociate::run(ir::module &mod) { ir::value *off = *pz->idx_begin(); ir::value *pz_dyn = builder.create_gep(dyn, {off}); ir::value *pz_sta = builder.create_gep(pz_dyn, {cst}, pz->get_name()); - params_->copy(pz_dyn, pz); - params_->copy(pz_sta, pz); pz->replace_all_uses_with(pz_sta); 
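/* What the rewrites in this pass do, in scalar form: a pointer increment
   p = base + (dyn + cst) is reassociated so the statically known part is
   applied last, where the align pass can see it and the coalescer and
   vectorizer can exploit it; the phi case below does the same across loop
   back-edges. With the new pipeline the pass no longer mirrors tiling
   parameters onto the values it creates, which is why the params_->copy()
   calls are deleted in these hunks. A toy model with ints standing in for
   pointers:

     #include <utility>

     // returns {p_dyn, p_sta}; `cst` is the part align can prove constant
     std::pair<int, int> split(int base, int dyn, int cst) {
       int p_dyn = base + dyn;   // varies across iterations
       int p_sta = p_dyn + cst;  // static offset applied last
       return {p_dyn, p_sta};
     }
*/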
infos[pz_sta].dyn_ptr = pz_dyn; infos[pz_sta].sta_ptr = (ir::getelementptr_inst*)pz_sta; @@ -298,12 +276,6 @@ void reassociate::run(ir::module &mod) { ir::value *neg_off = builder.create_neg(off); ir::value *pz_dyn = builder.create_gep(pz, {neg_off}); phi_dyn->add_incoming(pz_dyn, phi->get_incoming_block(idx_z)); - // copy parameters - params_->copy(pz_dyn, pz); - params_->copy(((ir::instruction*)neg_off)->get_operand(0), off); - params_->copy(neg_off, off); - params_->copy(phi_dyn, phi); - params_->copy(phi_sta, phi); infos[phi_sta].dyn_ptr = phi_dyn; infos[phi_sta].sta_ptr = (ir::getelementptr_inst*)phi_sta; replaced.insert(phi); diff --git a/lib/codegen/transform/vectorize.cc b/lib/codegen/transform/vectorize.cc index 4d1b88541..ef120f903 100644 --- a/lib/codegen/transform/vectorize.cc +++ b/lib/codegen/transform/vectorize.cc @@ -1,5 +1,5 @@ #include "triton/codegen/transform/vectorize.h" -#include "triton/codegen/analysis/grid.h" +#include "triton/codegen/analysis/tiles.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" @@ -23,7 +23,6 @@ void vectorize::run(ir::module &mod) { ir::instruction *rx = (ir::instruction*)builder.create_vectorize(x); x->replace_all_uses_with(rx); rx->set_operand(0, x); - params_->copy(rx, x); } if(dynamic_cast(i)){ ir::value *x = i->get_operand(0); @@ -33,7 +32,6 @@ void vectorize::run(ir::module &mod) { ir::instruction *rx = (ir::instruction*)builder.create_vectorize(x); x->replace_all_uses_with(rx); rx->set_operand(0, x); - params_->copy(rx, x); } } } diff --git a/lib/ir/type.cc b/lib/ir/type.cc index aa3d9aa46..198553b52 100644 --- a/lib/ir/type.cc +++ b/lib/ir/type.cc @@ -73,6 +73,10 @@ const type::tile_shapes_t &type::get_tile_shapes() const { return ((tile_type*)this)->get_shapes(); } +const size_t type::get_tile_rank() const { + return get_tile_shapes().size(); +} + unsigned type::get_tile_num_elements() const { const tile_shapes_t& shapes = get_tile_shapes(); unsigned result = 1; diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 9c3f99869..d29e9d7ce 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -3,6 +3,9 @@ #include #include #include +#include "triton/codegen/analysis/axes.h" +#include "triton/codegen/analysis/layout.h" +#include "triton/codegen/analysis/tiles.h" #include "triton/codegen/selection.h" #include "triton/runtime/function.h" #include "triton/codegen/transform/coalesce.h" @@ -192,49 +195,54 @@ function::caller function::autotune(driver::stream* stream, const grid_fn_ty& gr std::unique_ptr function::make_bin(ir::module &module, driver::context *context, const options_t& opt) { std::unique_ptr target = context->device()->make_target(); - + // generate llvm code + llvm::LLVMContext ctx; + std::unique_ptr llvm(new llvm::Module(module.get_name(), ctx)); // create passes codegen::analysis::meminfo shmem_info; - codegen::analysis::liveness shmem_liveness(&shmem_info); codegen::analysis::align alignment_info; + codegen::analysis::liveness shmem_liveness(&shmem_info); codegen::transform::coalesce coalesce(&alignment_info, &shmem_info); - codegen::analysis::grids grids(opt.num_warps, &coalesce); - codegen::analysis::memalloc shmem_allocation(&shmem_liveness, &shmem_info, &grids); + codegen::analysis::axes axes; + codegen::analysis::layout layouts(&axes); + codegen::analysis::tiles tiles(opt.num_warps, &coalesce, &axes, &layouts); + codegen::analysis::memalloc shmem_allocation(&shmem_liveness, &shmem_info, &tiles); codegen::transform::membar 
shmem_barriers(&shmem_allocation, &shmem_info); - codegen::transform::vectorize vectorize(&grids); + codegen::transform::vectorize vectorize(&tiles); codegen::transform::dce dce; codegen::transform::peephole peephole; - codegen::transform::reassociate reassociate(&alignment_info, &grids); - codegen::selection selection(&shmem_allocation, &grids, &shmem_info, &alignment_info, &coalesce, target.get(), opt.num_warps); + codegen::transform::reassociate reassociate(&alignment_info); + codegen::selection selection(&shmem_allocation, &tiles, &shmem_info, &alignment_info, &axes, &layouts, &coalesce, target.get(), opt.num_warps); // run passes peephole.run(module); dce.run(module); alignment_info.run(module); - if(target->is_gpu()) - shmem_info.run(module); + shmem_info.run(module); coalesce.run(module); dce.run(module); - grids.run(module); + axes.run(module); + layouts.run(module); + tiles.run(module); alignment_info.run(module); reassociate.run(module); dce.run(module); peephole.run(module); - if(target->is_gpu()){ - shmem_info.run(module); - shmem_liveness.run(module); - shmem_allocation.run(); - if(shmem_allocation.allocated_size() > context->device()->max_shared_memory()) - return std::unique_ptr(); - shmem_barriers.run(module); - } + shmem_info.run(module); + shmem_liveness.run(module); + shmem_allocation.run(); + if(shmem_allocation.allocated_size() > context->device()->max_shared_memory()) + return std::unique_ptr(); + shmem_barriers.run(module); dce.run(module); vectorize.run(module); dce.run(module); alignment_info.run(module); + coalesce.run(module); + dce.run(module); // ir::print(module, std::cout); - // generate llvm code - llvm::LLVMContext ctx; - std::unique_ptr llvm(new llvm::Module(module.get_name(), ctx)); + axes.run(module); + layouts.run(module); + tiles.run(module); selection.run(module, *llvm); // return binary std::unique_ptr res(driver::module::create(context, std::move(llvm))); diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index 7f2366ecc..9a59f3ea7 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -45,10 +45,10 @@ std::vector do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, i opt.defines.push_back({"TYPE", {ty}}); opt.defines.push_back({"AT", {AT?"1":"0"}}); opt.defines.push_back({"BT", {BT?"1":"0"}}); - opt.defines.push_back({"TM", {"64", "128"}}); - opt.defines.push_back({"TN", {"64", "128"}}); + opt.defines.push_back({"TM", {"128"}}); + opt.defines.push_back({"TN", {"128"}}); opt.defines.push_back({"TK", {"8"}}); - opt.num_warps = {2, 4, 8}; + opt.num_warps = {4}; // create function rt::function function(src::dot, opt); // benchmark available libraries From e184bad9a1a750bc90b562d2af58235123bd0802 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 16 Sep 2019 13:28:23 -0400 Subject: [PATCH 407/494] [auto-coalesce] more bugfixes --- include/triton/codegen/analysis/tiles.h | 13 ++-- include/triton/codegen/transform/coalesce.h | 12 ++- lib/codegen/analysis/align.cc | 4 + lib/codegen/analysis/tiles.cc | 71 ++++++++++++----- lib/codegen/selection.cc | 6 +- lib/codegen/transform/coalesce.cc | 86 ++++++++++----------- lib/runtime/function.cc | 23 +++--- tests/bench/dot.cc | 6 +- 8 files changed, 128 insertions(+), 93 deletions(-) diff --git a/include/triton/codegen/analysis/tiles.h b/include/triton/codegen/analysis/tiles.h index a9387cb5c..93d3a9774 100644 --- a/include/triton/codegen/analysis/tiles.h +++ b/include/triton/codegen/analysis/tiles.h @@ -19,14 +19,11 @@ namespace ir{ namespace codegen{ -namespace transform{ -class coalesce; -} - 
namespace analysis{ class axes; class layout; +class align; class tiles { typedef std::map> param_map_t; @@ -35,25 +32,27 @@ private: void init_scanline_tile(ir::value *i); public: - tiles(size_t num_warps, transform::coalesce* coalesce, analysis::axes* axes, analysis::layout* layout); + tiles(size_t num_warps, analysis::align* align, analysis::axes* axes, analysis::layout* layout); void run(ir::module &mod); bool hmma(ir::value *value); int mts(ir::value *value, unsigned ax); int nts(ir::value *value, unsigned ax); int fpw(ir::value *value, unsigned ax); int wpt(ir::value *value, unsigned ax); + std::vector order(ir::value *v); const std::map& largest(); private: // dependencies + analysis::align* align_; analysis::layout* layout_; analysis::axes* axes_; - transform::coalesce* coalesce_; // number of warps size_t num_warps_; // tile properties - std::map hmma_; std::map largest_; + std::map> order_; + std::map hmma_; std::map fpw_; std::map wpt_; std::map mts_; diff --git a/include/triton/codegen/transform/coalesce.h b/include/triton/codegen/transform/coalesce.h index e78010703..9f0576af0 100644 --- a/include/triton/codegen/transform/coalesce.h +++ b/include/triton/codegen/transform/coalesce.h @@ -2,6 +2,7 @@ #define TDL_INCLUDE_CODEGEN_OPTIMIZE_REORDER_H #include +#include #include namespace triton { @@ -9,27 +10,32 @@ namespace triton { namespace ir { class module; class value; + class io_inst; } namespace codegen{ namespace analysis{ class align; + class layout; class meminfo; } namespace transform{ class coalesce { +private: + void extract_io_use(ir::value *v, std::set& result); + void extract_ld(ir::io_inst *i, std::map > &result); + public: - coalesce(analysis::align* algin, analysis::meminfo* mem); - std::vector get_order(ir::value* v); + coalesce(analysis::align* align, triton::codegen::analysis::layout *layouts, analysis::meminfo* mem); void run(ir::module &mod); private: analysis::align* align_; + analysis::layout* layout_; analysis::meminfo* mem_; - std::map> order_; }; } diff --git a/lib/codegen/analysis/align.cc b/lib/codegen/analysis/align.cc index 8c2ecf847..fda2f6e32 100644 --- a/lib/codegen/analysis/align.cc +++ b/lib/codegen/analysis/align.cc @@ -155,6 +155,8 @@ std::vector align::populate_is_constant(ir::value *v) { return is_constant_.at(v); if(auto *x = dynamic_cast(v)) return add_to_cache(v, {cst_info{true, (unsigned)x->get_value()}}, is_constant_); + if(dynamic_cast(v)) + return add_to_cache(v, {cst_info{true, 0}}, is_constant_); if(auto *x = dynamic_cast(v)) return populate_is_constant_phi(x); if(auto *x = dynamic_cast(v)) @@ -300,6 +302,8 @@ std::vector align::populate_max_contiguous_default(ir::value* v) { auto shapes = v->get_type()->get_tile_shapes(); if(dynamic_cast(v)) return add_to_cache(v, {shapes[0]}, max_contiguous_); + if(dynamic_cast(v)) + return add_to_cache(v, {shapes[0]}, max_contiguous_); return add_to_cache(v, std::vector(shapes.size(), 1), max_contiguous_); } diff --git a/lib/codegen/analysis/tiles.cc b/lib/codegen/analysis/tiles.cc index 7b8505ff7..d1b26a6f9 100644 --- a/lib/codegen/analysis/tiles.cc +++ b/lib/codegen/analysis/tiles.cc @@ -1,9 +1,10 @@ #include #include +#include +#include "triton/codegen/analysis/align.h" #include "triton/codegen/analysis/axes.h" #include "triton/codegen/analysis/tiles.h" #include "triton/codegen/analysis/layout.h" -#include "triton/codegen/transform/coalesce.h" #include "triton/ir/instructions.h" #include "triton/ir/type.h" #include "triton/ir/module.h" @@ -18,8 +19,8 @@ namespace triton{ namespace codegen{ 
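/* With coalesce decoupled from tiles, the preferred dimension order is now
   derived directly from the align analysis: tiles::run below takes the io
   instruction with the highest-rank pointer in each layout group and sorts
   the dimensions by its max-contiguity estimate, most contiguous first.
   A standalone version of the sort, where `make_order` is a hypothetical
   name:

     #include <algorithm>
     #include <numeric>
     #include <vector>

     std::vector<unsigned> make_order(const std::vector<unsigned> &max_contiguous) {
       std::vector<unsigned> order(max_contiguous.size());
       std::iota(order.begin(), order.end(), 0);
       std::stable_sort(order.begin(), order.end(),
                        [&](unsigned a, unsigned b) {
         return max_contiguous[a] > max_contiguous[b];
       });
       return order;
     }

     // e.g. max_contiguous = {1, 128} (contiguous along dim 1) gives
     // order = {1, 0}: dim 1 becomes the leading dimension.
*/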
namespace analysis{ -tiles::tiles(size_t num_warps, transform::coalesce *reorder, analysis::axes *axes, analysis::layout *layout): - num_warps_(num_warps), coalesce_(reorder), axes_(axes), layout_(layout) +tiles::tiles(size_t num_warps, analysis::align *align, analysis::axes *axes, analysis::layout *layout): + num_warps_(num_warps), align_(align), axes_(axes), layout_(layout) { } bool is_hmma(ir::value *v){ @@ -57,6 +58,11 @@ int tiles::wpt(ir::value *value, unsigned ax) { return wpt_.at(axes_->get(value, ax)); } +std::vector tiles::order(ir::value *v) { + auto ret = order_[layout_->id(v)]; + return ret; +} + const std::map& tiles::largest() { return largest_; } @@ -68,10 +74,10 @@ unsigned clamp(unsigned x, unsigned lo, unsigned hi) { void tiles::init_hmma_tile(ir::value *i) { - auto order = coalesce_->get_order(i); + auto ord = order(i); auto shapes = i->get_type()->get_tile_shapes(); - unsigned shape_0 = shapes[order[0]]; - unsigned shape_1 = shapes[order[1]]; + unsigned shape_0 = shapes[ord[0]]; + unsigned shape_1 = shapes[ord[1]]; /* fragments per warp */ // try to make things as square as possible to maximize data re-use std::vector fpw = {1, 1, 1}; @@ -110,17 +116,17 @@ void tiles::init_hmma_tile(ir::value *i) { } void tiles::init_scanline_tile(ir::value *i) { - auto order = coalesce_->get_order(i); + auto ord = order(i); auto shapes = i->get_type()->get_tile_shapes(); unsigned size = i->get_type()->get_tile_num_elements(); - unsigned ld = order[0]; + unsigned ld = ord[0]; unsigned num_threads = num_warps_*32; unsigned current = num_threads; nts_[axes_->get(i, ld)] = clamp(size / num_threads, 1, 4); mts_[axes_->get(i, ld)] = clamp(current, 1, shapes[ld] / nts_[axes_->get(i, ld)]); current = current / mts_[axes_->get(i, ld)]; for(size_t d = 1; d < shapes.size(); d++){ - ld = order[d]; + ld = ord[d]; nts_[axes_->get(i, ld)] = 1; mts_[axes_->get(i, ld)] = clamp(current, 1, shapes[ld]); current = current / mts_[axes_->get(i, ld)]; @@ -133,31 +139,58 @@ void tiles::init_scanline_tile(ir::value *i) { throw std::runtime_error("cannot create a kernel with this amount of warps"); } +void extract_io_use(ir::value *v, std::set& result) { + for(ir::user* u: v->get_users()){ + auto i = dynamic_cast(u); + if(i && i->get_pointer_operand() == v) + result.insert(i); + } +} + + void tiles::run(ir::module &) { hmma_.clear(); largest_.clear(); size_t num_groups = layout_->get_num_groups(); + // helpers + auto rank = [](ir::value* v) { + ir::type *ty = v->get_type(); + size_t ret = 0; + if(ty->is_tile_ty()) + for(int s: ty->get_tile_shapes()) + ret += s > 1; + return ret; + }; // find out which groups require hmma layout for(size_t i = 0; i < num_groups; i++) { const auto& values = layout_->values(i); hmma_[i] = std::any_of(values.begin(), values.end(), &is_hmma); } // find out which value is the largest in each group -// std::vector axes; for(size_t i = 0; i < num_groups; i++) { const auto& values = layout_->values(i); - auto rank = [](ir::value* v) { - ir::type *ty = v->get_type(); - size_t ret = 0; - if(ty->is_tile_ty()) - for(int s: ty->get_tile_shapes()) - ret += s > 1; - return ret; - }; auto cmp = [&rank](ir::value* x, ir::value *y) { return rank(x) < rank(y); }; largest_[i] = *std::max_element(values.begin(), values.end(), cmp); } - + // find out the order of a group + for(size_t i = 0; i < num_groups; i++){ + std::set io; + for(ir::value* v: layout_->values(i)) + extract_io_use(v, io); + auto cmp = [&rank](ir::io_inst* x, ir::io_inst *y) { + return rank(x->get_pointer_operand()) < 
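/* The rank() helper defined just above counts the dimensions whose extent
   exceeds 1; each layout group is represented by its highest-rank value
   (e.g. a 128x64 tile, rank 2, wins over a 128x1 slice, rank 1), and that
   representative alone drives the group's tiling parameters and its
   io-based dimension order. Reduced to plain data:

     #include <vector>

     size_t rank(const std::vector<int> &shapes) {
       size_t r = 0;
       for(int s: shapes)
         r += (s > 1);
       return r;
     }
*/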
rank(y->get_pointer_operand()); + }; + auto it = std::max_element(io.begin(), io.end(), cmp); + std::vector order(rank(largest_[i])); + std::iota(order.begin(), order.end(), 0); + if(it != io.end()) { + auto max_contiguous = align_->contiguous((*it)->get_pointer_operand()); + std::sort(order.begin(), order.end(), [&](unsigned a, unsigned b) { + return max_contiguous[a] > max_contiguous[b]; } + ); + } + order_[i] = order; + } // tiling parameters for(auto x: largest_){ ir::value *i = x.second; diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index 62b68e78a..ca93bc917 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -545,7 +545,7 @@ Value* selection::llvm_value(ir::value *v, IRBuilder<> &builder) { * ------------------- */ // Grid construction -std::vector delinearize(Value *trailing, const std::vector& order, std::vector &shapes, IRBuilder<> &builder){ +std::vector delinearize(Value *trailing, const std::vector& order, std::vector &shapes, IRBuilder<> &builder){ size_t dim = shapes.size(); std::vector result(dim); for(unsigned k = 0; k < dim - 1; k++){ @@ -562,7 +562,7 @@ inline int32_t ceil(int32_t num, int32_t div){ return (num + div - 1)/div; } -inline void to_warps(const std::vector &bs, const std::vector& order, std::vector &nw, std::vector &ws){ +inline void to_warps(const std::vector &bs, const std::vector& order, std::vector &nw, std::vector &ws){ static const size_t warp_size = 32; size_t nthreads = 1, nwarps = 1; nw.resize(bs.size()); @@ -578,7 +578,7 @@ inline void to_warps(const std::vector &bs, const std::vector &builder, Value *u_thread_id, Value *u_warp_id) { - auto order = reorder_->get_order(v); + auto order = tiles_->order(v); const auto& shapes = v->get_type()->get_tile_shapes(); size_t dim = shapes.size(); std::vector contiguous(dim); diff --git a/lib/codegen/transform/coalesce.cc b/lib/codegen/transform/coalesce.cc index 12cd3f671..825b6adf6 100644 --- a/lib/codegen/transform/coalesce.cc +++ b/lib/codegen/transform/coalesce.cc @@ -6,6 +6,7 @@ #include "triton/ir/basic_block.h" #include "triton/ir/instructions.h" #include "triton/ir/module.h" +#include "triton/codegen/analysis/layout.h" #include "triton/codegen/analysis/meminfo.h" #include "triton/codegen/analysis/align.h" #include "triton/codegen/transform/coalesce.h" @@ -14,62 +15,59 @@ namespace triton { namespace codegen{ namespace transform{ -coalesce::coalesce(analysis::align* align, analysis::meminfo *mem) - : align_(align), mem_(mem) { } +coalesce::coalesce(analysis::align* align, analysis::layout *layouts, analysis::meminfo *mem) + : align_(align), layout_(layouts), mem_(mem) { } -std::vector coalesce::get_order(ir::value* v) { - return order_.at(v); +// Find all values that are used as pointer operands in LD/ST +void coalesce::extract_io_use(ir::value *v, std::set& result) { + for(ir::user* u: v->get_users()){ + auto i = dynamic_cast(u); + if(i && i->get_pointer_operand() == v) + result.insert(i); + } +} + +void coalesce::extract_ld(ir::io_inst* i, std::map>& result) { + ir::value *ptr = i->get_pointer_operand(); + auto contiguous = align_->contiguous(ptr); + auto it = std::max_element(contiguous.begin(), contiguous.end()); + int axis = std::distance(contiguous.begin(), it); + result[axis].push_back(i); } void coalesce::run(ir::module &mod) { - - std::set io; - - std::function set_order = [&](ir::value *v) -> void { - if(order_.find(v) != order_.end()) - return; - ir::type *tile_ty = v->get_type(); - if(auto *x = dynamic_cast(v)) - tile_ty = x->get_operand(0)->get_type(); 
- if(!tile_ty->is_tile_ty()) - return; - std::vector order(tile_ty->get_tile_shapes().size()); - std::iota(order.begin(), order.end(), 0); - order_[v] = order; - if(ir::user* u = dynamic_cast(v)) - for(ir::value* op: u->ops()) - set_order(op); - }; - - // initialize work-list - for(ir::function *fn: mod.get_function_list()) - for(ir::basic_block *block: ir::cfg::reverse_post_order(fn)) - for(ir::instruction *i: block->get_inst_list()){ - if(auto *x = dynamic_cast(i)) { - ir::type* ptr_ty = x->get_pointer_operand()->get_type(); - if(ptr_ty->is_tile_ty()) - io.insert(x); - } - set_order(i); + // find values to rematerialize + size_t num_groups = layout_->get_num_groups(); + std::vector remat; + for(size_t id = 0; id < num_groups; id++) { + const auto& values = layout_->values(id); + // extract pointers used in ld/st operations + std::set io; + for(ir::value *v: values) + extract_io_use(v, io); + // extract leading axes + std::map> axes; + for(ir::io_inst *i: io) + extract_ld(i, axes); + // update list of values to rematerialize + if(axes.empty()) + continue; + for(auto it = ++axes.rbegin(); it != axes.rend(); it++) + remat.insert(remat.begin(), + it->second.begin(), it->second.end()); } + // rematerialize values ir::builder &builder = mod.get_builder(); - std::map replaced; - for(ir::io_inst *i: io) { - ir::value *ptr = i->get_pointer_operand(); - auto max_contiguous = align_->contiguous(ptr); - std::vector order(max_contiguous.size()); - std::iota(order.begin(), order.end(), 0); - std::sort(order.begin(), order.end(), [&](unsigned a, unsigned b) { return max_contiguous[a] > max_contiguous[b]; } ); + for(ir::io_inst *r: remat) { std::list> work_list; - if(order != order_[i]) - work_list.push_back({i, nullptr}); + std::map replaced; + work_list.push_back({r, nullptr}); // rematerialize recursively while(!work_list.empty()) { auto pair = work_list.back(); ir::instruction* cloned = pair.first; ir::instruction* original = pair.second; - order_[cloned] = order; work_list.pop_back(); for(ir::value *op: cloned->ops()) { ir::instruction* i_op = dynamic_cast(op); @@ -101,14 +99,12 @@ void coalesce::run(ir::module &mod) { } n_op = builder.insert(n_op); replaced.insert({i_op, n_op}); - order_[n_op] = order; mem_->copy(n_op, i_op); if(original) n_op->erase_use(original); cloned->replace_uses_of_with(i_op, n_op); } } - } } diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index d29e9d7ce..5e40b4419 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -200,30 +200,30 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c std::unique_ptr llvm(new llvm::Module(module.get_name(), ctx)); // create passes codegen::analysis::meminfo shmem_info; - codegen::analysis::align alignment_info; + codegen::analysis::align align; codegen::analysis::liveness shmem_liveness(&shmem_info); - codegen::transform::coalesce coalesce(&alignment_info, &shmem_info); codegen::analysis::axes axes; codegen::analysis::layout layouts(&axes); - codegen::analysis::tiles tiles(opt.num_warps, &coalesce, &axes, &layouts); + codegen::transform::coalesce coalesce(&align, &layouts, &shmem_info); + codegen::analysis::tiles tiles(opt.num_warps, &align, &axes, &layouts); codegen::analysis::memalloc shmem_allocation(&shmem_liveness, &shmem_info, &tiles); codegen::transform::membar shmem_barriers(&shmem_allocation, &shmem_info); codegen::transform::vectorize vectorize(&tiles); codegen::transform::dce dce; codegen::transform::peephole peephole; - codegen::transform::reassociate reassociate(&alignment_info); 
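/* The rewritten coalesce pass above no longer tracks per-value orders: it
   files each load/store under the axis along which its pointer is most
   contiguous (extract_ld) and rematerializes the address computations of
   the io instructions whose leading axis disagrees with the one kept for
   the group, so each copy can later receive its own layout. The
   axis-selection rule in isolation, with `leading_axis` as a hypothetical
   name:

     #include <algorithm>
     #include <iterator>
     #include <vector>

     int leading_axis(const std::vector<unsigned> &contiguous) {
       auto it = std::max_element(contiguous.begin(), contiguous.end());
       return std::distance(contiguous.begin(), it);
     }
*/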
- codegen::selection selection(&shmem_allocation, &tiles, &shmem_info, &alignment_info, &axes, &layouts, &coalesce, target.get(), opt.num_warps); + codegen::transform::reassociate reassociate(&align); + codegen::selection selection(&shmem_allocation, &tiles, &shmem_info, &align, &axes, &layouts, &coalesce, target.get(), opt.num_warps); // run passes peephole.run(module); dce.run(module); - alignment_info.run(module); + align.run(module); shmem_info.run(module); - coalesce.run(module); - dce.run(module); axes.run(module); layouts.run(module); + coalesce.run(module); + align.run(module); + dce.run(module); tiles.run(module); - alignment_info.run(module); reassociate.run(module); dce.run(module); peephole.run(module); @@ -236,12 +236,9 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c dce.run(module); vectorize.run(module); dce.run(module); - alignment_info.run(module); - coalesce.run(module); - dce.run(module); -// ir::print(module, std::cout); axes.run(module); layouts.run(module); + align.run(module); tiles.run(module); selection.run(module, *llvm); // return binary diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index 9a59f3ea7..7f2366ecc 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -45,10 +45,10 @@ std::vector do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, i opt.defines.push_back({"TYPE", {ty}}); opt.defines.push_back({"AT", {AT?"1":"0"}}); opt.defines.push_back({"BT", {BT?"1":"0"}}); - opt.defines.push_back({"TM", {"128"}}); - opt.defines.push_back({"TN", {"128"}}); + opt.defines.push_back({"TM", {"64", "128"}}); + opt.defines.push_back({"TN", {"64", "128"}}); opt.defines.push_back({"TK", {"8"}}); - opt.num_warps = {4}; + opt.num_warps = {2, 4, 8}; // create function rt::function function(src::dot, opt); // benchmark available libraries From e01e623333619e8e00f6ce3db7c870114e71e85e Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 16 Sep 2019 20:34:08 -0400 Subject: [PATCH 408/494] [codegen][auto-coalesce] more debugging --- include/triton/codegen/selection.h | 3 +- include/triton/codegen/transform/coalesce.h | 3 + lib/codegen/analysis/axes.cc | 1 + lib/codegen/analysis/tiles.cc | 2 + lib/codegen/selection.cc | 37 +++++---- lib/codegen/transform/coalesce.cc | 84 ++++++++++----------- lib/driver/module.cc | 8 +- lib/ir/print.cc | 7 +- lib/runtime/function.cc | 1 + tests/bench/copy2d.cc | 2 +- tests/common/src/copy.h | 2 +- 11 files changed, 81 insertions(+), 69 deletions(-) diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index ba92843a4..961aea725 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -119,7 +119,7 @@ private: Type *make_vector_ty(Type *ty, size_t vector_size); public: - distributed_tile(Type *ty, const shapes_t& shapes, const axes_t &axes, Builder &builder, bool vectorize); + distributed_tile(Type *ty, const shapes_t& shapes, const std::vector& order, const axes_t &axes, Builder &builder, bool vectorize); void set_value(indices_t idx, Value *v); Value* get_value(indices_t idx); unsigned get_linear_index(indices_t idx); @@ -129,6 +129,7 @@ public: private: axes_t axes_; + std::vector order_; indices_map_t indices_; values_map_t values_; ordered_indices_vec_t ordered_indices_; diff --git a/include/triton/codegen/transform/coalesce.h b/include/triton/codegen/transform/coalesce.h index 9f0576af0..3d418fdb5 100644 --- a/include/triton/codegen/transform/coalesce.h +++ b/include/triton/codegen/transform/coalesce.h @@ -11,6 +11,8 @@ namespace ir { 
class module; class value; class io_inst; + class instruction; + class builder; } namespace codegen{ @@ -27,6 +29,7 @@ class coalesce { private: void extract_io_use(ir::value *v, std::set& result); void extract_ld(ir::io_inst *i, std::map > &result); + ir::value* rematerialize(ir::value *v, ir::builder& builder, std::map& seen); public: coalesce(analysis::align* align, triton::codegen::analysis::layout *layouts, analysis::meminfo* mem); diff --git a/lib/codegen/analysis/axes.cc b/lib/codegen/analysis/axes.cc index 99fc59234..3949a03db 100644 --- a/lib/codegen/analysis/axes.cc +++ b/lib/codegen/analysis/axes.cc @@ -158,6 +158,7 @@ void axes::run(ir::module &mod) { unsigned group_id = 0; while(!nodes_.empty()) connected_components(*nodes_.begin(), nodes_, dependencies_, group_id++); + std::cout << "Number of axes: " << group_id << std::endl; } } diff --git a/lib/codegen/analysis/tiles.cc b/lib/codegen/analysis/tiles.cc index d1b26a6f9..3ee256550 100644 --- a/lib/codegen/analysis/tiles.cc +++ b/lib/codegen/analysis/tiles.cc @@ -190,6 +190,8 @@ void tiles::run(ir::module &) { ); } order_[i] = order; + std::cout << "order: " << order[0] << " " << order[1] << std::endl; + } // tiling parameters for(auto x: largest_){ diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index ca93bc917..79c8214c7 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -1,4 +1,5 @@ -#include "triton/codegen/selection.h" +#include +#include "triton/codegen/selection.h" #include "triton/codegen/target.h" #include "triton/codegen/analysis/layout.h" #include "triton/codegen/analysis/axes.h" @@ -28,6 +29,14 @@ using namespace llvm; /* Distributed Tile */ void distributed_tile::init_indices() { std::vector id(axes_.size(), 0); + // create iteration order + std::vector order(id.size()); + std::iota(order.begin(), order.end(), 0); + auto cmp = [&](int x, int y) { + return axes_[x].contiguous > axes_[y].contiguous; + }; + std::sort(order.begin(), order.end(), cmp); + // build size_t k = 0; while(true) { indices_t current; @@ -37,12 +46,12 @@ void distributed_tile::init_indices() { indices_[current] = sz; values_[current] = nullptr; ordered_indices_.push_back(current); - id[0]++; - while(id[k] == axes_[k].values.size()){ + id[order[0]]++; + while(id[order[k]] == axes_[order[k]].values.size()){ if(k == id.size() - 1) return; - id[k++] = 0; - id[k]++; + id[order[k++]] = 0; + id[order[k]]++; } k = 0; } @@ -54,8 +63,8 @@ llvm::Type *distributed_tile::make_vector_ty(llvm::Type *ty, size_t vector_size) return VectorType::get(ty, vector_size); } -distributed_tile::distributed_tile(Type *ty, const shapes_t &shapes, const axes_t &axes, llvm::IRBuilder<> &builder, bool vectorize) - : tile(make_vector_ty(ty, vectorize?axes[0].contiguous:1), shapes), axes_(axes), builder_(builder) { +distributed_tile::distributed_tile(Type *ty, const shapes_t &shapes, const std::vector& order, const axes_t &axes, llvm::IRBuilder<> &builder, bool vectorize) + : tile(make_vector_ty(ty, vectorize?axes[0].contiguous:1), shapes), axes_(axes), order_(order), builder_(builder) { vector_size_ = vectorize?ty_->getVectorNumElements():1; init_indices(); } @@ -767,7 +776,7 @@ void selection::create_shared_tile(ir::value *v, IRBuilder<> &builder, Value *sh for(ir::user *usr: v->get_users()) if(dynamic_cast(usr)) has_phi_user = true; - if(has_phi_user){ + if(!has_phi_user){ size_t offset = alloc_->offset(v); Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset)); ptr = builder.CreateBitCast(ptr, ptr_ty); @@ -791,7 +800,7 @@ 
void selection::create_distributed_tile(ir::value *v, IRBuilder<> &builder) { } } bool vectorize = dynamic_cast<ir::vectorize_inst*>(v); - distributed_tile *T = new distributed_tile(ty, shapes, axes, builder, vectorize); + distributed_tile *T = new distributed_tile(ty, shapes, tiles_->order(v), axes, builder, vectorize); bool is_inserted = tmap_.insert({v, T}).second; // constant range if(is_inserted && dynamic_cast<ir::constant_range*>(v)){ @@ -1260,8 +1269,9 @@ void selection::lower_masked_load(ir::masked_load_inst *x, LLVMContext &ctx, Fun // find vector size distributed_tile* result = (distributed_tile*)tmap_.at(x); ir::value *ptr = x->get_pointer_operand(); - unsigned alignment = alignment_->get(ptr, 0); - unsigned vector_size = std::min(result->axis(0).contiguous, alignment); + size_t ld = tiles_->order(ptr)[0]; + unsigned alignment = alignment_->get(ptr, ld); + unsigned vector_size = std::min(result->axis(ld).contiguous, alignment); distributed_tile *pointers = (distributed_tile*)tmap_.at(ptr); distributed_tile *masks = (distributed_tile*)tmap_.at(x->get_mask_operand()); distributed_tile *false_values = (distributed_tile*)tmap_.at(x->get_false_value_operand()); @@ -1331,8 +1341,9 @@ void selection::lower_load(ir::load_inst *x, LLVMContext &ctx, Function *fn, IRB distributed_tile* result = (distributed_tile*)tmap_.at(x); // find vector size ir::value *ptr = x->get_pointer_operand(); - unsigned alignment = alignment_->get(ptr, 0); - unsigned vector_size = std::min(result->axis(0).contiguous, alignment); + size_t ld = tiles_->order(ptr)[0]; + unsigned alignment = alignment_->get(ptr, ld); + unsigned vector_size = std::min(result->axis(ld).contiguous, alignment); distributed_tile *pointers = (distributed_tile*)tmap_.at(ptr); // vector loads std::map<unsigned, Value*> packets; diff --git a/lib/codegen/transform/coalesce.cc b/lib/codegen/transform/coalesce.cc index 825b6adf6..0e435e663 100644 --- a/lib/codegen/transform/coalesce.cc +++ b/lib/codegen/transform/coalesce.cc @@ -35,6 +35,31 @@ void coalesce::extract_ld(ir::io_inst* i, std::map<int, std::vector<ir::io_inst*> > &result) { +ir::value* coalesce::rematerialize(ir::value *x, ir::builder &builder, + std::map<ir::value*, ir::value*>& seen) { + if(seen.find(x) != seen.end()) + return seen.at(x); + auto i = dynamic_cast<ir::instruction*>(x); + // not an instruction -- forward value + if(!i) + return x; + // already in shared memory -- forward value + if(dynamic_cast<ir::copy_to_shared_inst*>(x)){ + return x; + } + // set insert point + auto& inst_list = i->get_parent()->get_inst_list(); + auto pos = ++std::find(inst_list.begin(), inst_list.end(), i); + builder.set_insert_point(pos); + // default -- recursive clone + ir::instruction *cloned = builder.insert(i->clone()); + seen[i] = cloned; + // rematerialize operands + for(ir::value *op: cloned->ops()) + cloned->replace_uses_of_with(op, rematerialize(op, builder, seen)); + return cloned; +} + void coalesce::run(ir::module &mod) { // find values to rematerialize size_t num_groups = layout_->get_num_groups(); @@ -56,54 +81,21 @@ void coalesce::run(ir::module &mod) { remat.insert(remat.begin(), it->second.begin(), it->second.end()); } - // rematerialize values - ir::builder &builder = mod.get_builder(); for(ir::io_inst *r: remat) { - std::list<std::pair<ir::instruction*, ir::instruction*>> work_list; - std::map<ir::instruction*, ir::instruction*> replaced; - work_list.push_back({r, nullptr}); - // rematerialize recursively - while(!work_list.empty()) { - auto pair = work_list.back(); - ir::instruction* cloned = pair.first; - ir::instruction* original = pair.second; - work_list.pop_back(); - for(ir::value *op: cloned->ops()) { - ir::instruction* i_op = dynamic_cast<ir::instruction*>(op); - if(replaced.find(i_op) != replaced.end()){ - cloned->replace_uses_of_with(i_op, replaced.at(i_op)); - continue; - } - if(!i_op) - continue; - ir::type
*ty = i_op->get_type(); - if(!ty->is_tile_ty()) - continue; - auto& inst_list = i_op->get_parent()->get_inst_list(); - auto it = std::find(inst_list.begin(), inst_list.end(), i_op); - it++; - builder.set_insert_point(it); - // found a load; write to shared memory and stop recursion - ir::instruction *n_op = nullptr; - if(mem_->is_shared(i_op)){ - i_op->add_use(cloned); - continue; - } - if(auto* ld = dynamic_cast<ir::load_inst*>(i_op)) - n_op = ir::copy_to_shared_inst::create(ld); - // not a load; rematerialize and add to worklist - else { - n_op = i_op->clone(); - work_list.push_back({n_op, i_op}); - } - n_op = builder.insert(n_op); - replaced.insert({i_op, n_op}); - mem_->copy(n_op, i_op); - if(original) - n_op->erase_use(original); - cloned->replace_uses_of_with(i_op, n_op); - } + ir::builder& builder = mod.get_builder(); + // rematerialize operands + std::map<ir::value*, ir::value*> seen; + for(ir::value *op: r->ops()) + rematerialize(op, mod.get_builder(), seen); + // copy to shared if load + auto& inst_list = r->get_parent()->get_inst_list(); + auto pos = ++std::find(inst_list.begin(), inst_list.end(), r); + builder.set_insert_point(pos); + if(dynamic_cast<ir::load_inst*>(r)){ + ir::instruction *cts = builder.insert(ir::copy_to_shared_inst::create(r)); + r->replace_all_uses_with(cts); + cts->replace_uses_of_with(cts, r); } } } diff --git a/lib/driver/module.cc b/lib/driver/module.cc index 1dcf4d738..d541b4d6c 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -92,10 +92,10 @@ void module::compile_llvm_module(std::unique_ptr<llvm::Module> module, const std file_type_t ft) { init_llvm(); // debug -// llvm::legacy::PassManager pm; -// pm.add(llvm::createPrintModulePass(llvm::outs())); + llvm::legacy::PassManager pm; + pm.add(llvm::createPrintModulePass(llvm::outs())); // pm.add(llvm::createVerifierPass()); -// pm.run(*module); + pm.run(*module); // create machine module->setTargetTriple(triple); std::string error; @@ -241,7 +241,7 @@ std::string cu_module::compile_llvm_module(std::unique_ptr<llvm::Module> module, cu_module::cu_module(driver::context * context, std::unique_ptr<llvm::Module> ll_module): cu_module(context, compile_llvm_module(std::move(ll_module), context->device())) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ -// std::cout << source_ << std::endl; + std::cout << source_ << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/lib/ir/print.cc b/lib/ir/print.cc index af2c68a2e..f88ba1f6f 100644 --- a/lib/ir/print.cc +++ b/lib/ir/print.cc @@ -48,8 +48,10 @@ void print(module &mod, std::ostream& os) { os << std::endl; for(ir::instruction *inst: block->get_inst_list()){ os << " "; - os << get_name(inst, cnt++); - os << " = "; + if(!inst->get_type()->is_void_ty()){ + os << get_name(inst, cnt++); + os << " = "; + } ir::type* type = inst->get_type(); os << inst->repr() << " " << type->repr(); ir::instruction::ops_t ops = inst->ops(); @@ -65,7 +67,6 @@ void print(module &mod, std::ostream& os) { } os << ";" << std::endl; } - os << std::endl; } os << "}" << std::endl; } diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 5e40b4419..7908f8ec7 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -221,6 +221,7 @@ std::unique_ptr<driver::module> function::make_bin(ir::module &module, driver::c axes.run(module); layouts.run(module); coalesce.run(module); +// ir::print(module, std::cout); align.run(module); dce.run(module);
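The worklist-based rematerialization deleted above is replaced by the memoized recursion in coalesce::rematerialize: each operand chain is cloned once, and the seen map both deduplicates shared operands and terminates the traversal. A minimal self-contained sketch of the same pattern, with a hypothetical node type standing in for Triton's ir::instruction (illustrative only, not the real API):

    #include <map>
    #include <vector>

    // Toy DAG node standing in for ir::instruction (hypothetical type).
    struct node {
      std::vector<node*> ops;
      node* clone() const { return new node(*this); }
    };

    // Memoized recursive clone, mirroring coalesce::rematerialize:
    // register the clone *before* recursing so that shared operands are
    // rewritten to a single copy instead of being duplicated.
    node* remat(node* x, std::map<node*, node*>& seen) {
      auto it = seen.find(x);
      if(it != seen.end())
        return it->second;        // already rematerialized -- reuse clone
      node* cloned = x->clone();
      seen[x] = cloned;           // memoize first
      for(node*& op : cloned->ops)
        op = remat(op, seen);     // rewrite operands to their clones
      return cloned;
    }
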
tiles.run(module); diff --git a/tests/bench/copy2d.cc b/tests/bench/copy2d.cc index 69e877767..c3433b2e2 100644 --- a/tests/bench/copy2d.cc +++ b/tests/bench/copy2d.cc @@ -48,7 +48,7 @@ int main() { std::vector<config_t> configs; for(auto x: std::vector<order_t>{COLMAJOR}){ std::vector<config_t> tmp = { - config_t{2048, 2048, x} + config_t{4096, 4096, x} }; configs.insert(configs.end(), tmp.begin(), tmp.end()); } diff --git a/tests/common/src/copy.h b/tests/common/src/copy.h index 2a7dc0627..58651a84f 100644 --- a/tests/common/src/copy.h +++ b/tests/common/src/copy.h @@ -38,7 +38,7 @@ void copy2d(TYPE * X __noalias __readonly __aligned(16), int rm[TM] = ridm * TM + 0 ... TM; int rn[TN] = ridn * TN + 0 ... TN; TYPE* px[TM, TN] = X + rm[:, newaxis] + rn[newaxis, :] * ldx; - TYPE* py[TM, TN] = Y + rm[:, newaxis] * ldy + rn[newaxis, :]; + TYPE* py[TM, TN] = Y + rm[:, newaxis] + rn[newaxis, :] * ldy; *py = *px; } )"; From 307c1128d54f44a9e9f07c2ce3e7fe414a431bb6 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 17 Sep 2019 15:21:10 -0400 Subject: [PATCH 409/494] [codegen] removed vectorization pass (now part of selection) --- include/triton/codegen/selection.h | 1 - lib/codegen/selection.cc | 45 ++++++++++++++++++------------ lib/codegen/transform/coalesce.cc | 2 +- lib/runtime/function.cc | 2 -- tests/common/src/copy.h | 4 +-- 5 files changed, 30 insertions(+), 24 deletions(-) diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index 961aea725..74a617af9 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -179,7 +179,6 @@ private: void lower_reshape(ir::reshape_inst* x, LLVMContext &ctx, Function *fn, Builder &builder); void lower_splat(ir::splat_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); void lower_broadcast(ir::broadcast_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); - void lower_vectorize(ir::vectorize_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); void lower_copy_to_shared(ir::copy_to_shared_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); void lower_trans(ir::trans_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); // matrix multiply diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index 79c8214c7..762fd90db 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -799,8 +799,7 @@ void selection::create_distributed_tile(ir::value *v, IRBuilder<> &builder) { axes[d].values = {builder.getInt32(0)}; } } - bool vectorize = dynamic_cast<ir::vectorize_inst*>(v); - distributed_tile *T = new distributed_tile(ty, shapes, tiles_->order(v), axes, builder, vectorize); + distributed_tile *T = new distributed_tile(ty, shapes, tiles_->order(v), axes, builder, false); bool is_inserted = tmap_.insert({v, T}).second; // constant range if(is_inserted && dynamic_cast<ir::constant_range*>(v)){ @@ -890,8 +889,25 @@ void selection::lower_masked_store(ir::masked_store_inst *x, LLVMContext &ctx, F void selection::lower_store(ir::store_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { distributed_tile* ptrs = (distributed_tile*)tmap_.at(x->get_pointer_operand()); tile *scalars = tmap_.at(x->get_value_operand()); +// size_t ld = tiles_->order(x->get_pointer_operand())[0]; +// unsigned vector_size = 2; +// // vectorize pointers +// std::map<unsigned, Value*> ptr_packets; +// ptrs->for_each([&](indices_t idx){ +// unsigned linear = ptrs->get_linear_index(idx); +// unsigned id = linear / vector_size; +// if(linear % vector_size == 0) { +// Value *ptr = ptrs->get_value(idx); +// ptr = builder.CreateBitCast(ptr,
PointerType::get(VectorType::get(ptr->getType()->getPointerElementType(), vector_size), +// ptr->getType()->getPointerAddressSpace())); +// ptr_packets[id] = ptr; +// } +// }); +// ((shared_tile*)(scalars))->set_vector_size(vector_size); +// ((shared_tile*)(scalars))->set_return_mode(true); + // extract result element ptrs->for_each([&](indices_t idx){ - builder.CreateStore(scalars->get_value(idx), ptrs->get_value(idx)); + builder.CreateStore(scalars->get_value(idx), ptrs->get_value(idx)); }); } @@ -1018,10 +1034,13 @@ void selection::lower_broadcast(ir::broadcast_inst *x, LLVMContext &ctx, Functio }); } -void selection::lower_vectorize(ir::vectorize_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { - distributed_tile* result = (distributed_tile*)tmap_.at(x); - distributed_tile* in = (distributed_tile*)tmap_.at(x->get_operand(0)); - unsigned vector_size = result->axis(0).contiguous; +void selection::lower_copy_to_shared(ir::copy_to_shared_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { + shared_tile* result = (shared_tile*)tmap_.at(x); + ir::value *arg = x->get_operand(0); + distributed_tile* in = (distributed_tile*)tmap_.at(arg); + size_t ld = tiles_->order(arg)[0]; + unsigned vector_size = in->axis(ld).contiguous; + std::map<unsigned, Value*> packets; in->for_each([&](indices_t idx){ unsigned linear = in->get_linear_index(idx); @@ -1031,7 +1050,7 @@ void selection::lower_vectorize(ir::vectorize_inst *x, LLVMContext &ctx, Functio packets[id] = UndefValue::get(VectorType::get(in_value->getType(), vector_size)); packets[id] = builder.CreateInsertElement(packets.at(id), in_value, linear % vector_size); }); - result->for_each([&](indices_t idx){ + in->for_each([&](indices_t idx){ unsigned linear = in->get_linear_index(idx); unsigned id = linear / vector_size; if(linear % vector_size == 0) @@ -1039,14 +1058,6 @@ }); } -void selection::lower_copy_to_shared(ir::copy_to_shared_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { - shared_tile* result = (shared_tile*)tmap_.at(x); - distributed_tile* in = (distributed_tile*)tmap_.at(x->get_operand(0)); - in->for_each([&](indices_t idx){ - result->set_value(idx, in->get_value(idx)); - }); -} - void selection::lower_trans(ir::trans_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { shared_tile* result = (shared_tile*)tmap_.at(x); distributed_tile* in = (distributed_tile*)tmap_.at(x->get_operand(0)); @@ -1400,8 +1411,6 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & lower_splat(x, ctx, fn, builder); else if(auto *x = dynamic_cast<ir::broadcast_inst*>(ins)) lower_broadcast(x, ctx, fn, builder); - else if(auto *x = dynamic_cast<ir::vectorize_inst*>(ins)) - lower_vectorize(x, ctx, fn, builder); else if(auto *x = dynamic_cast<ir::copy_to_shared_inst*>(ins)) lower_copy_to_shared(x, ctx, fn, builder); else if(auto* x = dynamic_cast<ir::trans_inst*>(ins)) diff --git a/lib/codegen/transform/coalesce.cc b/lib/codegen/transform/coalesce.cc index 0e435e663..b0d1a3521 100644 --- a/lib/codegen/transform/coalesce.cc +++ b/lib/codegen/transform/coalesce.cc @@ -87,7 +87,7 @@ void coalesce::run(ir::module &mod) { // rematerialize operands std::map<ir::value*, ir::value*> seen; for(ir::value *op: r->ops()) - rematerialize(op, mod.get_builder(), seen); + r->replace_uses_of_with(op, rematerialize(op, mod.get_builder(), seen)); // copy to shared if load auto& inst_list = r->get_parent()->get_inst_list(); auto pos = ++std::find(inst_list.begin(), inst_list.end(), r); diff --git a/lib/runtime/function.cc
b/lib/runtime/function.cc index 7908f8ec7..7db1e1af1 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -208,7 +208,6 @@ std::unique_ptr<driver::module> function::make_bin(ir::module &module, driver::c codegen::analysis::tiles tiles(opt.num_warps, &align, &axes, &layouts); codegen::analysis::memalloc shmem_allocation(&shmem_liveness, &shmem_info, &tiles); codegen::transform::membar shmem_barriers(&shmem_allocation, &shmem_info); - codegen::transform::vectorize vectorize(&tiles); codegen::transform::dce dce; codegen::transform::peephole peephole; codegen::transform::reassociate reassociate(&align); @@ -235,7 +234,6 @@ std::unique_ptr<driver::module> function::make_bin(ir::module &module, driver::c return std::unique_ptr<driver::module>(); shmem_barriers.run(module); dce.run(module); - vectorize.run(module); dce.run(module); axes.run(module); layouts.run(module); diff --git a/tests/common/src/copy.h b/tests/common/src/copy.h index 58651a84f..b1d571b51 100644 --- a/tests/common/src/copy.h +++ b/tests/common/src/copy.h @@ -37,8 +37,8 @@ void copy2d(TYPE * X __noalias __readonly __aligned(16), int ridn = get_program_id(1); int rm[TM] = ridm * TM + 0 ... TM; int rn[TN] = ridn * TN + 0 ... TN; - TYPE* px[TM, TN] = X + rm[:, newaxis] + rn[newaxis, :] * ldx; - TYPE* py[TM, TN] = Y + rm[:, newaxis] + rn[newaxis, :] * ldy; + TYPE* px[TM, TN] = X + rm[:, newaxis] * ldx + rn[newaxis, :] ; + TYPE* py[TM, TN] = Y + rm[:, newaxis] + rn[newaxis, :] * ldy; *py = *px; } )"; From 1fd9be27ee94376d4f03f22210372b683ff22799 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 17 Sep 2019 17:40:03 -0400 Subject: [PATCH 410/494] [tests][bench] now benchmarking all variants of copy --- include/triton/codegen/analysis/layout.h | 2 ++ lib/codegen/analysis/axes.cc | 1 - lib/codegen/analysis/layout.cc | 45 ++++++++++++++---------- lib/codegen/analysis/tiles.cc | 2 -- lib/codegen/selection.cc | 12 +++++-- lib/codegen/transform/coalesce.cc | 8 +++++ lib/driver/module.cc | 7 ++-- lib/runtime/function.cc | 2 +- tests/bench/copy2d.cc | 33 ++++++++--------- tests/common/src/copy.h | 21 +++-------- 10 files changed, 70 insertions(+), 63 deletions(-) diff --git a/include/triton/codegen/analysis/layout.h b/include/triton/codegen/analysis/layout.h index 3bc1f2f6a..7bc14b08f 100644 --- a/include/triton/codegen/analysis/layout.h +++ b/include/triton/codegen/analysis/layout.h @@ -24,6 +24,8 @@ class layout { typedef std::map<node_t, std::set<node_t>> graph_t; private: + // create edge + void connect(ir::value *x, ir::value *y); // connected components void connected_components(node_t x, std::set<node_t> &nodes, graph_t &graph, unsigned id); // list the axes of the given value diff --git a/lib/codegen/analysis/axes.cc b/lib/codegen/analysis/axes.cc index 3949a03db..99fc59234 100644 --- a/lib/codegen/analysis/axes.cc +++ b/lib/codegen/analysis/axes.cc @@ -158,7 +158,6 @@ void axes::run(ir::module &mod) { unsigned group_id = 0; while(!nodes_.empty()) connected_components(*nodes_.begin(), nodes_, dependencies_, group_id++); - std::cout << "Number of axes: " << group_id << std::endl; } } diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc index a6eade0b2..0f376b4fc 100644 --- a/lib/codegen/analysis/layout.cc +++ b/lib/codegen/analysis/layout.cc @@ -53,6 +53,27 @@ const std::vector<ir::value*>& layout::values(unsigned id) const size_t layout::get_num_groups() const { return values_.size(); } +void layout::connect(ir::value *x, ir::value *y) { + if(x == y) + return; + if(!x->get_type()->is_tile_ty()) + return; + if(!y->get_type()->is_tile_ty()) + return; + std::set<unsigned> x_axes =
axes_of(x); + std::set<unsigned> y_axes = axes_of(y); + std::set<unsigned> common; + std::set_intersection(x_axes.begin(), x_axes.end(), + y_axes.begin(), y_axes.end(), + std::inserter(common, common.begin())); + if(!common.empty()){ + nodes_.insert(x); + nodes_.insert(y); + dependencies_[x].insert(y); + dependencies_[y].insert(x); + } +} + // run void layout::run(ir::module &mod) { nodes_.clear(); @@ -63,26 +84,12 @@ void layout::run(ir::module &mod) { for(ir::function *fn: mod.get_function_list()) for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i : block->get_inst_list()) { - // skip scalars - if(!i->get_type()->is_tile_ty()) - continue; - // add an edge between i and the operands that share an axis - std::set<unsigned> i_axes = axes_of(i); - nodes_.insert(i); - for(ir::value* op: i->ops()){ - if(!op->get_type()->is_tile_ty()) - continue; - nodes_.insert(op); - std::set<unsigned> op_axes = axes_of(op); - std::set<unsigned> common; - std::set_intersection(i_axes.begin(), i_axes.end(), - op_axes.begin(), op_axes.end(), - std::inserter(common, common.begin())); - if(!common.empty() || !op->get_type()->is_tile_ty()){ - dependencies_[i].insert(op); - dependencies_[op].insert(i); + for(ir::value* opx: i->ops()) + for(ir::value* opy: i->ops()){ + connect(i, opx); + connect(opx, opy); } - } + } // Grids unsigned group_id = 0; diff --git a/lib/codegen/analysis/tiles.cc b/lib/codegen/analysis/tiles.cc index 3ee256550..d1b26a6f9 100644 --- a/lib/codegen/analysis/tiles.cc +++ b/lib/codegen/analysis/tiles.cc @@ -190,8 +190,6 @@ void tiles::run(ir::module &) { ); } order_[i] = order; - std::cout << "order: " << order[0] << " " << order[1] << std::endl; - } // tiling parameters for(auto x: largest_){ diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index 762fd90db..c6592a59c 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -1035,11 +1035,17 @@ void selection::lower_broadcast(ir::broadcast_inst *x, LLVMContext &ctx, Functio } void selection::lower_copy_to_shared(ir::copy_to_shared_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { - shared_tile* result = (shared_tile*)tmap_.at(x); + unsigned vector_size = 1; + auto x_order = tiles_->order(x); ir::value *arg = x->get_operand(0); + auto arg_order = tiles_->order(arg); + // tiles + shared_tile* result = (shared_tile*)tmap_.at(x); distributed_tile* in = (distributed_tile*)tmap_.at(arg); - size_t ld = tiles_->order(arg)[0]; - unsigned vector_size = in->axis(ld).contiguous; + if(x_order == arg_order){ + size_t ld = arg_order[0]; + vector_size = std::min(tiles_->nts(x, ld),tiles_->nts(arg, ld)); + } std::map<unsigned, Value*> packets; in->for_each([&](indices_t idx){ diff --git a/lib/codegen/transform/coalesce.cc b/lib/codegen/transform/coalesce.cc index b0d1a3521..873f7a9f5 100644 --- a/lib/codegen/transform/coalesce.cc +++ b/lib/codegen/transform/coalesce.cc @@ -51,6 +51,11 @@ ir::value* coalesce::rematerialize(ir::value *x, ir::builder &builder, auto& inst_list = i->get_parent()->get_inst_list(); auto pos = ++std::find(inst_list.begin(), inst_list.end(), i); builder.set_insert_point(pos); + if(dynamic_cast<ir::load_inst*>(x)){ + ir::value *ret = builder.insert(ir::copy_to_shared_inst::create(x)); +// x->replace_all_uses_with(ret); + return ret; + } // default -- recursive clone ir::instruction *cloned = builder.insert(i->clone()); seen[i] = cloned; // rematerialize operands for(ir::value *op: cloned->ops()) cloned->replace_uses_of_with(op, rematerialize(op, builder, seen)); return cloned; } @@ -97,6 +102,9 @@ void coalesce::run(ir::module &mod) { r->replace_all_uses_with(cts); cts->replace_uses_of_with(cts, r); } + else{ + + } } } diff --git a/lib/driver/module.cc b/lib/driver/module.cc index d541b4d6c..0bf85c84f
100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -92,10 +92,10 @@ void module::compile_llvm_module(std::unique_ptr<llvm::Module> module, const std file_type_t ft) { init_llvm(); // debug - llvm::legacy::PassManager pm; - pm.add(llvm::createPrintModulePass(llvm::outs())); +// llvm::legacy::PassManager pm; +// pm.add(llvm::createPrintModulePass(llvm::outs())); // pm.add(llvm::createVerifierPass()); - pm.run(*module); +// pm.run(*module); // create machine module->setTargetTriple(triple); std::string error; @@ -241,7 +241,7 @@ std::string cu_module::compile_llvm_module(std::unique_ptr<llvm::Module> module, cu_module::cu_module(driver::context * context, std::unique_ptr<llvm::Module> ll_module): cu_module(context, compile_llvm_module(std::move(ll_module), context->device())) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ - std::cout << source_ << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 7db1e1af1..04977966d 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -220,7 +220,7 @@ std::unique_ptr<driver::module> function::make_bin(ir::module &module, driver::c axes.run(module); layouts.run(module); coalesce.run(module); -// ir::print(module, std::cout); + dce.run(module); align.run(module); dce.run(module); tiles.run(module); diff --git a/tests/bench/copy2d.cc b/tests/bench/copy2d.cc index c3433b2e2..6ee7f5496 100644 --- a/tests/bench/copy2d.cc +++ b/tests/bench/copy2d.cc @@ -11,19 +11,21 @@ #include "cuda/cublas.h" -std::vector<double> do_bench(drv::stream* stream, int32_t M, int32_t N, order_t order){ +std::vector<double> do_bench(drv::stream* stream, int32_t M, int32_t N, order_t order_x, order_t order_y){ typedef float NumericT; std::string ty = "float"; size_t dt_nbytes = sizeof(NumericT); drv::context* context = stream->context(); - int32_t ld = order == ROWMAJOR ?
N : M; // create inputs auto dx = std::unique_ptr<drv::buffer>(drv::buffer::create(context, M*N*dt_nbytes)); auto dy = std::unique_ptr<drv::buffer>(drv::buffer::create(context, M*N*dt_nbytes)); // create options rt::function::options_space_t opt; opt.defines.push_back({"TYPE", {ty}}); - opt.defines.push_back({"ORDER", {order==ROWMAJOR?"ROWMAJOR":"COLMAJOR"}}); + opt.defines.push_back({"STRIDE_XM", {(order_x == ROWMAJOR)?"M":"1"}}); + opt.defines.push_back({"STRIDE_XN", {(order_x == ROWMAJOR)?"1":"N"}}); + opt.defines.push_back({"STRIDE_YM", {(order_y == ROWMAJOR)?"M":"1"}}); + opt.defines.push_back({"STRIDE_YN", {(order_y == ROWMAJOR)?"1":"N"}}); opt.defines.push_back({"TM", {"32"}}); opt.defines.push_back({"TN", {"32"}}); opt.num_warps = {4}; @@ -33,7 +35,7 @@ std::vector<double> do_bench(drv::stream* stream, int32_t M, int32_t N, order_t std::vector<double> result; auto gbps = [&](double ns) { return 2*M*N*dt_nbytes / (ns * 1e-9) * 1e-9; }; // triton - double triton_ns = triton::tools::bench([&]() { function({&*dx, &*dy, M, N, ld, ld}, grid2d(M, N), stream);}, stream); + double triton_ns = triton::tools::bench([&]() { function({&*dx, &*dy, M, N}, grid2d(M, N), stream);}, stream); result.push_back(gbps(triton_ns)); // done return result; @@ -44,21 +46,20 @@ int main() { auto context = triton::driver::backend::contexts::get_default(); triton::driver::stream* stream = triton::driver::stream::create(context); // shapes to benchmark - typedef std::tuple<int, int, order_t> config_t; - std::vector<config_t> configs; - for(auto x: std::vector<order_t>{COLMAJOR}){ - std::vector<config_t> tmp = { - config_t{4096, 4096, x} - }; - configs.insert(configs.end(), tmp.begin(), tmp.end()); - } + typedef std::tuple<int, int, order_t, order_t> config_t; + std::vector<config_t> configs = { + {4096, 4096, ROWMAJOR, ROWMAJOR}, + {4096, 4096, COLMAJOR, ROWMAJOR}, + {4096, 4096, ROWMAJOR, COLMAJOR}, + {4096, 4096, COLMAJOR, COLMAJOR}, }; // does the work int32_t M, N; - order_t ord; + order_t ord_x, ord_y; for(const auto& c: configs){ - std::tie(M, N, ord) = c; - std::cout << "// " << M << ", " << N << ", " << ord << std::flush; - for(auto perf: do_bench(stream, M, N, ord)) + std::tie(M, N, ord_x, ord_y) = c; + std::cout << "// " << M << ", " << N << ", " << ord_x << ", " << ord_y << std::flush; + for(auto perf: do_bench(stream, M, N, ord_x, ord_y)) std::cout << ", " << perf << std::flush; std::cout << std::endl; } diff --git a/tests/common/src/copy.h b/tests/common/src/copy.h index b1d571b51..8b0f5d9dc 100644 --- a/tests/common/src/copy.h +++ b/tests/common/src/copy.h @@ -16,29 +16,16 @@ void copy1d(TYPE * X __noalias __readonly __aligned(16), const char *copy2d = R"( -#if ORDER == ROWMAJOR -#define STRIDE_XM ldx -#define STRIDE_XN 1 -#define STRIDE_YM ldy -#define STRIDE_YN 1 -#else -#define STRIDE_XM 1 -#define STRIDE_XN ldx -#define STRIDE_YM 1 -#define STRIDE_YN ldy -#endif - void copy2d(TYPE * X __noalias __readonly __aligned(16), TYPE * Y __noalias __writeonly __aligned(16), - int M, int N, - int ldx __multipleof(8), - int ldy __multipleof(8)) { + int M __multipleof(8), + int N __multipleof(8)) { int ridm = get_program_id(0); int ridn = get_program_id(1); int rm[TM] = ridm * TM + 0 ... TM; int rn[TN] = ridn * TN + 0 ...
TN; - TYPE* px[TM, TN] = X + rm[:, newaxis] * ldx + rn[newaxis, :] ; - TYPE* py[TM, TN] = Y + rm[:, newaxis] + rn[newaxis, :] * ldy; + TYPE* px[TM, TN] = X + rm[:, newaxis] * STRIDE_XM + rn[newaxis, :] * STRIDE_XN; + TYPE* py[TM, TN] = Y + rm[:, newaxis] * STRIDE_YM + rn[newaxis, :] * STRIDE_YN; *py = *px; } )"; From e35be1ddcf514c0514518816e7cc310328689b86 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 19 Sep 2019 16:25:36 -0400 Subject: [PATCH 411/494] [ir][instruction] added identifier for each instruction --- include/triton/codegen/analysis/align.h | 2 + include/triton/codegen/analysis/axes.h | 17 +- include/triton/codegen/analysis/layout.h | 3 +- include/triton/codegen/transform/vectorize.h | 31 --- include/triton/ir/builder.h | 1 - include/triton/ir/enums.h | 61 ++++++ include/triton/ir/instructions.h | 143 +++++++------ include/triton/ir/{cfg.h => utils.h} | 7 + include/triton/runtime/function.h | 1 - lib/codegen/analysis/align.cc | 54 +++-- lib/codegen/analysis/axes.cc | 201 ++++++++++--------- lib/codegen/analysis/layout.cc | 28 +-- lib/codegen/analysis/meminfo.cc | 6 +- lib/codegen/analysis/tiles.cc | 28 +-- lib/codegen/selection.cc | 12 +- lib/codegen/target.cc | 36 +--- lib/codegen/transform/coalesce.cc | 2 +- lib/codegen/transform/dce.cc | 2 +- lib/codegen/transform/membar.cc | 2 +- lib/codegen/transform/reassociate.cc | 2 +- lib/codegen/transform/vectorize.cc | 41 ---- lib/driver/module.cc | 1 + lib/ir/builder.cc | 8 +- lib/ir/cfg.cc | 31 --- lib/ir/instructions.cc | 130 ++++++------ lib/ir/utils.cc | 54 +++++ 26 files changed, 460 insertions(+), 444 deletions(-) delete mode 100644 include/triton/codegen/transform/vectorize.h rename include/triton/ir/{cfg.h => utils.h} (50%) delete mode 100644 lib/codegen/transform/vectorize.cc delete mode 100644 lib/ir/cfg.cc create mode 100644 lib/ir/utils.cc diff --git a/include/triton/codegen/analysis/align.h b/include/triton/codegen/analysis/align.h index bbc5fe440..647db3984 100644 --- a/include/triton/codegen/analysis/align.h +++ b/include/triton/codegen/analysis/align.h @@ -55,6 +55,8 @@ private: std::vector<unsigned> populate_starting_multiple_gep(ir::getelementptr_inst* x); std::vector<unsigned> populate_starting_multiple_default(ir::value* v); std::vector<unsigned> populate_starting_multiple(ir::value *v); + // populate all maps + void populate(ir::value *v); public: void run(ir::module &mod); diff --git a/include/triton/codegen/analysis/axes.h b/include/triton/codegen/analysis/axes.h index f625c4193..d22fa5fa8 100644 --- a/include/triton/codegen/analysis/axes.h +++ b/include/triton/codegen/analysis/axes.h @@ -23,15 +23,24 @@ class axes { private: void add_constraint(node_t x, node_t y); - void init_c_phi(ir::instruction *i); - void init_c_graph(ir::instruction *v); + // update graph + void update_graph_store(ir::instruction *i); + void update_graph_reduce(ir::instruction *i); + void update_graph_reshape(ir::instruction *i); + void update_graph_splat(ir::instruction *i); + void update_graph_trans(ir::instruction *i); + void update_graph_broadcast(ir::instruction *i); + void update_graph_dot(ir::instruction *i); + void update_graph_elementwise(ir::instruction *i); + void update_graph(ir::instruction *i); + // connected components void connected_components(node_t x, std::set<node_t> &nodes, graph_t &graph, unsigned group_id); public: axes(); void run(ir::module &mod); - unsigned get(ir::value *value, unsigned ax); - bool has(ir::value *value, unsigned ax); + unsigned get_id(ir::value *value, unsigned ax); + bool has_id(ir::value *value, unsigned ax); private:
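The axes pass introduced here treats every (value, axis) pair as a node; the update_graph_* methods add edges between pairs that must map to the same distributed axis, and connected_components then assigns one id per equivalence class. A minimal sketch of that labeling step, with a plain int standing in for Triton's (ir::value*, unsigned) node type (names here are illustrative, not the real API):

    #include <map>
    #include <set>

    using node_t = int;   // stand-in for a (ir::value*, axis) pair
    using graph_t = std::map<node_t, std::set<node_t>>;

    // Depth-first labeling: every node reachable from x gets group `id`,
    // mirroring axes::connected_components.
    void label(node_t x, std::set<node_t>& nodes, graph_t& g,
               std::map<node_t, unsigned>& groups, unsigned id) {
      if(!nodes.count(x))
        return;               // already labeled
      nodes.erase(x);
      groups[x] = id;
      for(node_t y : g[x])
        label(y, nodes, g, groups, id);
    }
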
// constraints graph diff --git a/include/triton/codegen/analysis/layout.h b/include/triton/codegen/analysis/layout.h index 7bc14b08f..2e7fbd830 100644 --- a/include/triton/codegen/analysis/layout.h +++ b/include/triton/codegen/analysis/layout.h @@ -24,8 +24,9 @@ class layout { typedef std::map<node_t, std::set<node_t>> graph_t; private: - // create edge + // graph creation void connect(ir::value *x, ir::value *y); + void make_graph(ir::instruction *i); // connected components void connected_components(node_t x, std::set<node_t> &nodes, graph_t &graph, unsigned id); // list the axes of the given value diff --git a/include/triton/codegen/transform/vectorize.h b/include/triton/codegen/transform/vectorize.h deleted file mode 100644 index 0a6571b61..000000000 --- a/include/triton/codegen/transform/vectorize.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef TDL_INCLUDE_CODEGEN_VECTORIZE_H -#define TDL_INCLUDE_CODEGEN_VECTORIZE_H - -namespace triton { - -namespace ir { - class module; -} - -namespace codegen{ - -namespace analysis{ - class tiles; -} - -namespace transform{ - -class vectorize { -public: - vectorize(analysis::tiles *params): params_(params){} - void run(ir::module &mod); - -private: - analysis::tiles *params_; -}; - -} -} -} - -#endif diff --git a/include/triton/ir/builder.h b/include/triton/ir/builder.h index 0b6c859b1..d5707265a 100644 --- a/include/triton/ir/builder.h +++ b/include/triton/ir/builder.h @@ -140,7 +140,6 @@ public: value *create_select(value *pred, value *if_value, value *else_value, const std::string &name = ""); // Intrinsics value *create_copy_to_shared(value *arg, const std::string &name = ""); - value *create_vectorize(value *arg, const std::string &name = ""); value *create_barrier(const std::string &name = ""); private: diff --git a/include/triton/ir/enums.h b/include/triton/ir/enums.h index 609fb2d46..19cf82086 100644 --- a/include/triton/ir/enums.h +++ b/include/triton/ir/enums.h @@ -77,6 +77,67 @@ enum cmp_pred_t { LAST_ICMP_PREDICATE }; +enum value_id_t: unsigned { + /* ------------ * INSTRUCTIONS * ------------ */ + INST_BEGIN, + // phi + INST_PHI, + // arithmetic + INST_BINOP, + INST_GETELEMENTPTR, + INST_SELECT, + INST_SQRT, + // cmp + INST_ICMP, + INST_FCMP, + // cast + INST_CAST_TRUNC, + INST_CAST_ZEXT, + INST_CAST_SEXT, + INST_CAST_FP_TRUNC, + INST_CAST_FP_EXT, + INST_CAST_UI_TO_FP, + INST_CAST_SI_TO_FP, + INST_CAST_FP_TO_UI, + INST_CAST_FP_TO_SI, + INST_CAST_PTR_TO_INT, + INST_CAST_INT_TO_PTR, + INST_CAST_BIT_CAST, + INST_CAST_ADDR_SPACE_CAST, + // terminators + INST_RETURN, + INST_COND_BRANCH, + INST_UNCOND_BRANCH, + // io + INST_UNMASKED_LOAD, + INST_MASKED_LOAD, + INST_UNMASKED_STORE, + INST_MASKED_STORE, + // retile + INST_RESHAPE, + INST_SPLAT, + INST_BROADCAST, + INST_DOWNCAST, + // builtin + INST_GET_PROGRAM_ID, + INST_GET_NUM_PROGRAMS, + // atomics + INST_ATOMIC_CAS, + INST_ATOMIC_EXCH, + INST_ATOMIC_ADD, + // array arithmetic + INST_TRANS, + INST_REDUCE, + INST_DOT, + // intrinsics + INST_COPY_TO_SHARED, + INST_BARRIER, + INST_MAKE_RANGE_DYN, + INST_MAKE_RANGE_STA, + INST_MAKE_RANGE }; diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index dd85fd3a0..bafc1c2c3 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -40,7 +40,8 @@ private: protected: // constructors - instruction(type *ty, unsigned num_ops, unsigned num_results = 1, const std::string &name = "", instruction *next = nullptr); + instruction(type *ty, value_id_t ity, unsigned num_ops, + const std::string &name = "", instruction *next = nullptr);
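The payoff of threading value_id_t through every instruction constructor shows up later in this same patch: passes can switch on get_id() instead of chaining dynamic_casts, as axes::update_graph does below. A minimal sketch of the dispatch pattern with toy types (not the real class hierarchy):

    #include <cstdio>

    enum value_id_t : unsigned { INST_RESHAPE, INST_TRANS, INST_DOT };

    struct instruction {
      value_id_t id;
      value_id_t get_id() const { return id; }
    };

    // One cast-free dispatch per instruction, as in axes::update_graph.
    void update_graph(instruction *i) {
      switch(i->get_id()) {
        case INST_RESHAPE: std::puts("reshape rule"); break;
        case INST_TRANS:   std::puts("transpose rule"); break;
        default:           std::puts("elementwise rule"); break;
      }
    }
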
public: // parent @@ -59,32 +60,21 @@ public: // cloning ir::instruction* clone() { ir::instruction* res = clone_impl(); -// for(auto it = op_begin(); it != op_end(); it++){ -// (*it)->add_use(res); -// } - res->set_name("testcloned"); + for(auto it = op_begin(); it != op_end(); it++) + (*it)->add_use(res); res->parent_ = nullptr; return res; } + // instruction id + value_id_t get_id() const { return id_; } private: basic_block *parent_; std::map<ir::metadata::kind_t, unsigned> metadatas_; + value_id_t id_; }; -// result reference -class result_reference: public value { -public: - result_reference(instruction *ref, unsigned arg_id, const std::string &name = ""); - instruction *get_ref(); - unsigned get_arg_id(); - -private: - instruction *ref_; - unsigned arg_id_; -}; //===----------------------------------------------------------------------===// // phi_node classes //===----------------------------------------------------------------------===// @@ -173,11 +163,13 @@ public: class cmp_inst: public instruction{ public: typedef cmp_pred_t pred_t; private: std::string repr_impl() const; protected: - cmp_inst(type *ty, cmp_pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next); + cmp_inst(type *ty, value_id_t id, cmp_pred_t pred, + value *lhs, value *rhs, const std::string &name, instruction *next); static bool is_fp_predicate(cmp_pred_t pred); static bool is_int_predicate(cmp_pred_t pred); static type* make_cmp_result_type(type *ty); @@ -190,7 +182,8 @@ private: }; class icmp_inst: public cmp_inst { - using cmp_inst::cmp_inst; + icmp_inst(type *ty, cmp_pred_t pred, + value *lhs, value *rhs, const std::string &name, instruction *next); public: static icmp_inst* create(cmp_pred_t pred, value *lhs, value *rhs, @@ -199,7 +192,8 @@ public: }; class fcmp_inst: public cmp_inst { - using cmp_inst::cmp_inst; + fcmp_inst(type *ty, cmp_pred_t pred, + value *lhs, value *rhs, const std::string &name, instruction *next); public: static fcmp_inst* create(cmp_pred_t pred, value *lhs, value *rhs, @@ -213,7 +207,7 @@ public: class unary_inst: public instruction { protected: - unary_inst(type *Ty, value *v, const std::string &name, instruction *next); + unary_inst(type *ty, value_id_t id, value *v, const std::string &name, instruction *next); }; @@ -226,8 +220,8 @@ private: std::string repr_impl() const; protected: - cast_inst(type *ty, value *v, const std::string &name, instruction *next, cast_op_t op) - : unary_inst(ty, v, name, next), op_(op) { } + cast_inst(type *ty, value_id_t id, value *v, const std::string &name, instruction *next, cast_op_t op) + : unary_inst(ty, id, v, name, next), op_(op) { } private: static bool is_valid(cast_op_t op, value *arg, type *ty); @@ -246,27 +240,27 @@ private: cast_op_t op_; }; -#define TRITON_IR_DECLARE_CAST_INST_SIMPL(name, op) \ +#define TRITON_IR_DECLARE_CAST_INST_SIMPL(name, id, op) \ class name : public cast_inst { \ _TRITON_DEFINE_CLONE(name); \ friend class cast_inst; \ name(type *ty, value *v, const std::string &name, instruction *next) \ - : cast_inst(ty, v, name, next, op){ } \ + : cast_inst(ty, id, v, name, next, op){ } \ }; -TRITON_IR_DECLARE_CAST_INST_SIMPL(trunc_inst, cast_op_t::Trunc) -TRITON_IR_DECLARE_CAST_INST_SIMPL(z_ext_inst, cast_op_t::ZExt) -TRITON_IR_DECLARE_CAST_INST_SIMPL(s_ext_inst, cast_op_t::SExt) -TRITON_IR_DECLARE_CAST_INST_SIMPL(fp_trunc_inst, cast_op_t::FPTrunc) -TRITON_IR_DECLARE_CAST_INST_SIMPL(fp_ext_inst, cast_op_t::FPExt) -TRITON_IR_DECLARE_CAST_INST_SIMPL(ui_to_fp_inst, cast_op_t::UIToFP)
-TRITON_IR_DECLARE_CAST_INST_SIMPL(si_to_fp_inst, cast_op_t::SIToFP) -TRITON_IR_DECLARE_CAST_INST_SIMPL(fp_to_ui_inst, cast_op_t::FPToUI) -TRITON_IR_DECLARE_CAST_INST_SIMPL(fp_to_si_inst, cast_op_t::FPToSI) -TRITON_IR_DECLARE_CAST_INST_SIMPL(ptr_to_int_inst, cast_op_t::PtrToInt) -TRITON_IR_DECLARE_CAST_INST_SIMPL(int_to_ptr_inst, cast_op_t::IntToPtr) -TRITON_IR_DECLARE_CAST_INST_SIMPL(bit_cast_inst, cast_op_t::BitCast) -TRITON_IR_DECLARE_CAST_INST_SIMPL(addr_space_cast_inst, cast_op_t::AddrSpaceCast) +TRITON_IR_DECLARE_CAST_INST_SIMPL(trunc_inst, INST_CAST_TRUNC, cast_op_t::Trunc) +TRITON_IR_DECLARE_CAST_INST_SIMPL(z_ext_inst, INST_CAST_ZEXT, cast_op_t::ZExt) +TRITON_IR_DECLARE_CAST_INST_SIMPL(s_ext_inst, INST_CAST_SEXT, cast_op_t::SExt) +TRITON_IR_DECLARE_CAST_INST_SIMPL(fp_trunc_inst, INST_CAST_FP_TRUNC, cast_op_t::FPTrunc) +TRITON_IR_DECLARE_CAST_INST_SIMPL(fp_ext_inst, INST_CAST_FP_EXT, cast_op_t::FPExt) +TRITON_IR_DECLARE_CAST_INST_SIMPL(ui_to_fp_inst, INST_CAST_UI_TO_FP, cast_op_t::UIToFP) +TRITON_IR_DECLARE_CAST_INST_SIMPL(si_to_fp_inst, INST_CAST_SI_TO_FP, cast_op_t::SIToFP) +TRITON_IR_DECLARE_CAST_INST_SIMPL(fp_to_ui_inst, INST_CAST_FP_TO_UI, cast_op_t::FPToUI) +TRITON_IR_DECLARE_CAST_INST_SIMPL(fp_to_si_inst, INST_CAST_FP_TO_SI, cast_op_t::FPToSI) +TRITON_IR_DECLARE_CAST_INST_SIMPL(ptr_to_int_inst, INST_CAST_PTR_TO_INT, cast_op_t::PtrToInt) +TRITON_IR_DECLARE_CAST_INST_SIMPL(int_to_ptr_inst, INST_CAST_INT_TO_PTR, cast_op_t::IntToPtr) +TRITON_IR_DECLARE_CAST_INST_SIMPL(bit_cast_inst, INST_CAST_BIT_CAST, cast_op_t::BitCast) +TRITON_IR_DECLARE_CAST_INST_SIMPL(addr_space_cast_inst, INST_CAST_ADDR_SPACE_CAST, cast_op_t::AddrSpaceCast) //===----------------------------------------------------------------------===// // terminator_inst classes @@ -372,33 +366,38 @@ private: class io_inst: public instruction { protected: - io_inst(type *ty, unsigned num_ops, unsigned num_results = 1, const std::string &name = "", instruction *next = nullptr); + io_inst(type *ty, value_id_t id, unsigned num_ops, + const std::string &name = "", instruction *next = nullptr); public: // accessors value *get_pointer_operand() { return get_operand(0); } - -// value *get_mask() const; -// value *get_false_value() const; }; +// load class load_inst: public io_inst { protected: - load_inst(value *ptr, unsigned num_extra_ops, const std::string &name, instruction *next); + load_inst(value *ptr, value_id_t id, unsigned num_ops, + const std::string &name = "", instruction *next = nullptr); private: - std::string repr_impl() const { return "load"; } static type *get_pointee_type(type *ty); - -public: - - // factory method - static load_inst* create(value *ptr, - const std::string &name = "", - instruction *next = nullptr); - _TRITON_DEFINE_CLONE(load_inst) }; +// unmasked load +class unmasked_load_inst: public load_inst { +private: + std::string repr_impl() const { return "unmasked_load"; } + unmasked_load_inst(value *ptr, const std::string &name, instruction *next); + +public: + static unmasked_load_inst* create(value *ptr, + const std::string &name = "", + instruction *next = nullptr); + _TRITON_DEFINE_CLONE(unmasked_load_inst) +}; + +// masked load class masked_load_inst: public load_inst { private: std::string repr_impl() const { return "masked_load"; } @@ -416,22 +415,28 @@ public: _TRITON_DEFINE_CLONE(masked_load_inst) }; -class store_inst: public io_inst{ +// store +class store_inst: public io_inst { protected: - store_inst(value *ptr, value *v, unsigned num_extra_ops, - const std::string &name, instruction 
*next); - -private: - std::string repr_impl() const { return "store"; } + store_inst(value *ptr, value_id_t id, unsigned num_ops, + const std::string &name = "", instruction *next = nullptr); public: - // accessors value *get_value_operand() { return get_operand(1); } +}; + +// unmasked_store +class unmasked_store_inst: public store_inst{ +private: + std::string repr_impl() const { return "unmasked_store"; } + unmasked_store_inst(value *ptr, value *v, const std::string &name, instruction *next); + +public: // factory method - static store_inst* create(value* ptr, value *v, - const std::string &name = "", - instruction *next = nullptr); - _TRITON_DEFINE_CLONE(store_inst) + static unmasked_store_inst* create(value* ptr, value *v, + const std::string &name = "", + instruction *next = nullptr); + _TRITON_DEFINE_CLONE(unmasked_store_inst) }; class masked_store_inst: public store_inst{ @@ -458,7 +463,7 @@ public: class retile_inst: public unary_inst { protected: - retile_inst(value *arg, const type::tile_shapes_t &shapes, const std::string &name, instruction *next); + retile_inst(value *arg, value_id_t id, const type::tile_shapes_t &shapes, const std::string &name, instruction *next); }; // reshape @@ -690,16 +695,6 @@ public: instruction *next = nullptr); }; -class vectorize_inst: public unary_inst{ -private: - using unary_inst::unary_inst; - std::string repr_impl() const { return "vectorize"; } - _TRITON_DEFINE_CLONE(vectorize_inst) - -public: - static vectorize_inst* create(value *arg, const std::string &name = "", instruction *next = nullptr); -}; - // On NVIDIA, implementation is such that // constant_range = nv_dynamic_program_idx + nv_static_program_idx // so as to enable re-association on nv_static_program_idx which is constant diff --git a/include/triton/ir/cfg.h b/include/triton/ir/utils.h similarity index 50% rename from include/triton/ir/cfg.h rename to include/triton/ir/utils.h index a61ff6dee..3b9e2f5f3 100644 --- a/include/triton/ir/cfg.h +++ b/include/triton/ir/utils.h @@ -4,18 +4,25 @@ #define _TRITON_IR_CFG_H_ #include <vector> +#include <functional> namespace triton{ namespace ir{ +class module; class function; class basic_block; +class instruction; +class value; class cfg { public: static std::vector<basic_block*> reverse_post_order(function* fn); }; +void for_each_instruction(ir::module& mod, const std::function<void(ir::instruction*)> &fn); +void for_each_value(ir::module& mod, const std::function<void(ir::value*)> &fn); } } diff --git a/include/triton/runtime/function.h b/include/triton/runtime/function.h index 9d04cad78..0eaa9a33d 100644 --- a/include/triton/runtime/function.h +++ b/include/triton/runtime/function.h @@ -20,7 +20,6 @@ #include "triton/codegen/transform/peephole.h" #include "triton/codegen/transform/membar.h" #include "triton/codegen/transform/reassociate.h" -#include "triton/codegen/transform/vectorize.h" #include "triton/lang/parser.h" #include "triton/runtime/arg.h" diff --git a/lib/codegen/analysis/align.cc b/lib/codegen/analysis/align.cc index fda2f6e32..f84e8d692 100644 --- a/lib/codegen/analysis/align.cc +++ b/lib/codegen/analysis/align.cc @@ -1,4 +1,5 @@ #include "triton/codegen/analysis/align.h" +#include "triton/ir/utils.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" @@ -14,15 +15,19 @@ namespace analysis{ inline int gcd(int a, int b) { - if (a == 0) - return b; - if (b == 0) - return a; - if (a == b) - return a; - if (a > b) - return gcd(a-b, b); - return gcd(a, b-a); + if (a == 0) + return b; + if (b == 0) + return a; + if (a == b) + return a; + if (a > b) + return
gcd(a - b, b); + return gcd(a, b - a); +} + +inline int lcm(int a, int b) { + return (a * b) / gcd(a, b); } template @@ -64,8 +69,8 @@ std::vector<align::cst_info> align::populate_is_constant_phi(ir::phi_node* x) { std::vector<align::cst_info> align::populate_is_constant_splat(ir::splat_inst* x) { auto shapes = get_shapes(x); - std::vector<cst_info> result; ir::value* op = x->get_operand(0); + std::vector<cst_info> result; auto op_cst = populate_is_constant(op); for(auto d: shapes) result.push_back(cst_info{d, op_cst[0].value}); @@ -478,28 +483,15 @@ std::vector<unsigned> align::contiguous(ir::value* v) const { return max_contiguous_.at(v); } + +void align::populate(ir::value *v) { + populate_is_constant(v); + populate_starting_multiple(v); + populate_max_contiguous(v); +} + void align::run(ir::module &mod) { - - // populate constant - for(ir::function *fn: mod.get_function_list()) - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *i: block->get_inst_list()){ - populate_is_constant(i); - } - - // populate starting multiple - for(ir::function *fn: mod.get_function_list()) - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *i: block->get_inst_list()){ - populate_starting_multiple(i); - } - - // populate maximum contiguous - for(ir::function *fn: mod.get_function_list()) - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *i: block->get_inst_list()){ - populate_max_contiguous(i); - } + ir::for_each_value(mod, [this](ir::value* v) { populate(v); } ); } diff --git a/lib/codegen/analysis/axes.cc b/lib/codegen/analysis/axes.cc index 99fc59234..790c8a36b 100644 --- a/lib/codegen/analysis/axes.cc +++ b/lib/codegen/analysis/axes.cc @@ -1,5 +1,6 @@ #include "triton/codegen/analysis/axes.h" #include "triton/ir/instructions.h" +#include "triton/ir/utils.h" #include "triton/ir/type.h" #include "triton/ir/module.h" #include "triton/ir/function.h" @@ -30,91 +31,113 @@ void axes::add_constraint(node_t x, node_t y) { nodes_.insert(y); } -void axes::init_c_graph(ir::instruction *v) { - // Reference shape - ir::type::tile_shapes_t shapes; - if(auto *store = dynamic_cast<ir::store_inst*>(v)) - shapes = store->get_pointer_operand()->get_type()->get_tile_shapes(); - else if(auto *atom = dynamic_cast(v)) - shapes = atom->get_operand(0)->get_type()->get_tile_shapes(); - else if(dynamic_cast(v)) +void axes::update_graph_reduce(ir::instruction *i) { + auto* red = static_cast<ir::reduce_inst*>(i); + unsigned axis = red->get_axis(); + ir::value *arg = red->get_operand(0); + auto in_shapes = arg->get_type()->get_tile_shapes(); + unsigned current = 0; + for(unsigned d = 0; d < in_shapes.size(); d++){ + if(d == axis) + continue; + add_constraint({i, current++}, {arg, d}); + } +} + +void axes::update_graph_reshape(ir::instruction *i) { + auto* reshape = static_cast<ir::reshape_inst*>(i); + // operands + ir::value *op = reshape->get_operand(0); + // shapes + auto op_shapes = op->get_type()->get_tile_shapes(); + auto res_shapes = reshape->get_type()->get_tile_shapes(); + // construct edges + unsigned current = 0; + bool is_skewed = false; + for(unsigned d = 0; d < res_shapes.size(); d ++){ + bool same_shape = res_shapes[d] == op_shapes[current]; + // either add edge between axis or just add a node in the graph + if(!is_skewed && same_shape) + add_constraint({i, d}, {op, current++}); + else + add_constraint({i, d}, {i, d}); + // reshaping is skewed + if(res_shapes[d] > 1 && !same_shape) + is_skewed = true; + } +} + +void axes::update_graph_splat(ir::instruction *) { + // argument is scalar so don't make any edge + return; +} + +void axes::update_graph_trans(ir::instruction *i) { + auto *trans =
static_cast<ir::trans_inst*>(i); + ir::value *op = trans->get_operand(0); + auto perm = trans->get_perm(); + // add edge between axis perm[d] and axis d + for(unsigned d = 0; d < perm.size(); d++) + add_constraint({i, perm[d]->get_value()}, {op, d}); +} + +void axes::update_graph_broadcast(ir::instruction *i) { + auto *broadcast = static_cast<ir::broadcast_inst*>(i); + auto shapes = broadcast->get_type()->get_tile_shapes(); + ir::value *op = broadcast->get_operand(0); + ir::type *op_ty = op->get_type(); + const auto& op_shapes = op_ty->get_tile_shapes(); + // add edge between non-broadcast axes + for(unsigned d = 0; d < shapes.size(); d ++) + if(op_shapes[d] == shapes[d]) + add_constraint({i, d}, {op, d}); +} + +void axes::update_graph_dot(ir::instruction *i) { + auto *dot = static_cast<ir::dot_inst*>(i); + auto shapes = dot->get_type()->get_tile_shapes(); + ir::value *A = dot->get_operand(0); + ir::value *B = dot->get_operand(1); + ir::value *D = dot->get_operand(2); + // add edges between result and accumulator + for(unsigned d = 0; d < shapes.size(); d++) + add_constraint({dot, d}, {D, d}); + // add edge for batch dimension + for(unsigned d = 2; d < shapes.size(); d++){ + add_constraint({dot, d}, {A, d}); + add_constraint({dot, d}, {B, d}); + } +} + +void axes::update_graph_elementwise(ir::instruction *i) { + if(i->get_num_operands() == 0) return; - else if(dynamic_cast(v)) - return; - else if(auto *reduce = dynamic_cast<ir::reduce_inst*>(v)) { - unsigned axis = reduce->get_axis(); - ir::value *arg = reduce->get_operand(0); - auto in_shapes = arg->get_type()->get_tile_shapes(); - unsigned current = 0; - for(unsigned i = 0; i < in_shapes.size(); i++){ - if(i == axis) - continue; - add_constraint({reduce, current++}, {arg, i}); - } + ir::value *op = i->get_operand(0); + if(!op->get_type()->is_tile_ty()) return; + auto rank = op->get_type()->get_tile_rank(); + for(unsigned d = 0; d < rank; d++) + for(ir::value* opx: i->ops()) + for(ir::value* opy: i->ops()){ + if(!i->get_type()->is_void_ty()) + add_constraint({i, d}, {opx, d}); + add_constraint({opx, d}, {opy, d}); } - else - shapes = v->get_type()->get_tile_shapes(); - // Reshape - if(dynamic_cast<ir::reshape_inst*>(v)) { - ir::value *op = v->get_operand(0); - auto op_shapes = op->get_type()->get_tile_shapes(); - unsigned current = 0; - bool is_skewed = false; - for(unsigned i = 0; i < shapes.size(); i ++){ - if(shapes[i] == 1){ - add_constraint({v, i}, {v, i}); - } - else if(!is_skewed && - shapes[i] == op_shapes[current]) - add_constraint({v, i}, {op, current++}); - else{ - is_skewed = true; - add_constraint({v, i}, {v, i}); - } - } - } - // Splat - else if(dynamic_cast<ir::splat_inst*>(v)){ - return; - } - // Trans - else if(auto *x = dynamic_cast<ir::trans_inst*>(v)){ - ir::value *op = v->get_operand(0); - auto perm = x->get_perm(); - for(unsigned i = 0; i < perm.size(); i++) - add_constraint({v, perm[i]->get_value()}, {op, i}); - } - // Broadcast - else if(dynamic_cast<ir::broadcast_inst*>(v)){ - ir::value *op = v->get_operand(0); - ir::type *op_ty = op->get_type(); - const auto& op_shapes = op_ty->get_tile_shapes(); - for(unsigned i = 0; i < shapes.size(); i ++){ - if(op_shapes[i] == shapes[i] && v != op) - add_constraint({v, i}, {op, i}); - } - } - // Matrix multiplication - else if(dynamic_cast<ir::dot_inst*>(v)){ - ir::value *A = v->get_operand(0); - ir::value *B = v->get_operand(1); - ir::value *D = v->get_operand(2); - for(unsigned i = 0; i < shapes.size(); i++) - add_constraint({v, i}, {D, i}); - for(unsigned i = 2; i < shapes.size(); i++){ - add_constraint({v, i}, {A, i}); - add_constraint({v, i}, {B, i}); - } - } - // Element-wise - else if(dynamic_cast(v)) { - for(unsigned i = 0;
i < shapes.size(); i ++){ - std::vector<ir::value*> ops = v->ops(); - for(ir::value* op: ops) - add_constraint({v, i}, {op, i}); - } +} + + +void axes::update_graph(ir::instruction *i) { + switch (i->get_id()) { + case ir::INST_REDUCE: return update_graph_reduce(i); + case ir::INST_RESHAPE: return update_graph_reshape(i); + case ir::INST_SPLAT: return update_graph_splat(i); + case ir::INST_TRANS: return update_graph_trans(i); + case ir::INST_BROADCAST: return update_graph_broadcast(i); + case ir::INST_DOT: return update_graph_dot(i); + default: return update_graph_elementwise(i); } + return; } void axes::connected_components(node_t x, std::set<node_t> &nodes, graph_t &graph, unsigned group_id) { @@ -126,12 +149,12 @@ void axes::connected_components(node_t x, std::set<node_t> &nodes, graph_t &grap } } -unsigned axes::get(ir::value *value, unsigned ax) { +unsigned axes::get_id(ir::value *value, unsigned ax) { unsigned result = groups_.at(value).at(ax); return result; } -bool axes::has(ir::value *value, unsigned ax) { +bool axes::has_id(ir::value *value, unsigned ax) { auto it = groups_.find(value); if(it == groups_.end()) return false; @@ -146,15 +169,9 @@ void axes::run(ir::module &mod) { nodes_.clear(); dependencies_.clear(); groups_.clear(); - // Create graph - for(ir::function *fn: mod.get_function_list()){ - // Build constraints graph - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *i : block->get_inst_list()) - if(i->has_tile_result_or_op()) - init_c_graph(i); - } - // Axes + // make graph + ir::for_each_instruction(mod, [this](ir::instruction *x) { update_graph(x); }); + // connected components unsigned group_id = 0; while(!nodes_.empty()) connected_components(*nodes_.begin(), nodes_, dependencies_, group_id++); diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc index 0f376b4fc..77b25e0bb 100644 --- a/lib/codegen/analysis/layout.cc +++ b/lib/codegen/analysis/layout.cc @@ -4,6 +4,7 @@ #include "triton/codegen/analysis/layout.h" #include "triton/ir/function.h" #include "triton/ir/module.h" +#include "triton/ir/utils.h" namespace triton{ namespace codegen{ @@ -20,8 +21,8 @@ std::set<unsigned> layout::axes_of(ir::value *value) { // create result std::set<unsigned> result; for(size_t d = 0; d < rank; d++){ - if(axes_->has(value, d)) - result.insert(axes_->get(value, d)); + if(axes_->has_id(value, d)) + result.insert(axes_->get_id(value, d)); } return result; } @@ -74,24 +75,23 @@ void layout::connect(ir::value *x, ir::value *y) { } } +void layout::make_graph(ir::instruction *i) { + for(ir::value* opx: i->ops()) + for(ir::value* opy: i->ops()){ + connect(i, opx); + connect(opx, opy); + } +} + // run void layout::run(ir::module &mod) { nodes_.clear(); dependencies_.clear(); groups_.clear(); values_.clear(); - // Create graph - for(ir::function *fn: mod.get_function_list()) - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *i : block->get_inst_list()) { - for(ir::value* opx: i->ops()) - for(ir::value* opy: i->ops()){ - connect(i, opx); - connect(opx, opy); - } - - } - // Grids + // make graph + ir::for_each_instruction(mod, [this](ir::instruction* i) { make_graph(i); }); + // connected components unsigned group_id = 0; while(!nodes_.empty()){ connected_components(*nodes_.begin(), nodes_, dependencies_, group_id++); } diff --git a/lib/codegen/analysis/meminfo.cc b/lib/codegen/analysis/meminfo.cc index 314c272c0..be55d6ac7 100644 --- a/lib/codegen/analysis/meminfo.cc +++ b/lib/codegen/analysis/meminfo.cc @@ -82,9 +82,9 @@ void add_copy(ir::value *x, ir::builder &builder) { } void
meminfo::run(ir::module &mod) { -// shared_.clear(); -// refs_.clear(); -// double_.clear(); + shared_.clear(); + refs_.clear(); + double_.clear(); // Add shared copies for(ir::function *fn: mod.get_function_list()){ diff --git a/lib/codegen/analysis/tiles.cc b/lib/codegen/analysis/tiles.cc index d1b26a6f9..7d4d81376 100644 --- a/lib/codegen/analysis/tiles.cc +++ b/lib/codegen/analysis/tiles.cc @@ -43,19 +43,19 @@ bool tiles::hmma(ir::value *value) { } int tiles::mts(ir::value *value, unsigned ax) { - return mts_.at(axes_->get(value, ax)); + return mts_.at(axes_->get_id(value, ax)); } int tiles::nts(ir::value *value, unsigned ax) { - return nts_.at(axes_->get(value, ax)); + return nts_.at(axes_->get_id(value, ax)); } int tiles::fpw(ir::value *value, unsigned ax) { - return fpw_.at(axes_->get(value, ax)); + return fpw_.at(axes_->get_id(value, ax)); } int tiles::wpt(ir::value *value, unsigned ax) { - return wpt_.at(axes_->get(value, ax)); + return wpt_.at(axes_->get_id(value, ax)); } std::vector<unsigned> tiles::order(ir::value *v) { @@ -92,7 +92,7 @@ void tiles::init_hmma_tile(ir::value *i) { }while(fpw_nm1 != fpw); // store parameters for(unsigned d = 0; d < shapes.size(); d++) - fpw_[axes_->get(i, d)] = fpw[d]; + fpw_[axes_->get_id(i, d)] = fpw[d]; /* warps per tile */ // try to make things as square as possible to maximize data re-use std::vector<unsigned> wpt = {1, 1, 1}; @@ -106,11 +106,11 @@ void tiles::init_hmma_tile(ir::value *i) { }while(wpt_nm1 != wpt); // store parameters for(unsigned d = 0; d < shapes.size(); d++) - wpt_[axes_->get(i, d)] = wpt[d]; + wpt_[axes_->get_id(i, d)] = wpt[d]; /* sanity check */ unsigned effective_num_warps = 1; for(size_t d = 0; d < shapes.size(); d++) - effective_num_warps *= wpt_[axes_->get(i, d)]; + effective_num_warps *= wpt_[axes_->get_id(i, d)]; if(num_warps_ != effective_num_warps) throw std::runtime_error("cannot create a kernel with this amount of warps"); } @@ -122,19 +122,19 @@ void tiles::init_scanline_tile(ir::value *i) { unsigned ld = ord[0]; unsigned num_threads = num_warps_*32; unsigned current = num_threads; - nts_[axes_->get(i, ld)] = clamp(size / num_threads, 1, 4); - mts_[axes_->get(i, ld)] = clamp(current, 1, shapes[ld] / nts_[axes_->get(i, ld)]); - current = current / mts_[axes_->get(i, ld)]; + nts_[axes_->get_id(i, ld)] = clamp(size / num_threads, 1, 4); + mts_[axes_->get_id(i, ld)] = clamp(current, 1, shapes[ld] / nts_[axes_->get_id(i, ld)]); + current = current / mts_[axes_->get_id(i, ld)]; for(size_t d = 1; d < shapes.size(); d++){ ld = ord[d]; - nts_[axes_->get(i, ld)] = 1; - mts_[axes_->get(i, ld)] = clamp(current, 1, shapes[ld]); - current = current / mts_[axes_->get(i, ld)]; + nts_[axes_->get_id(i, ld)] = 1; + mts_[axes_->get_id(i, ld)] = clamp(current, 1, shapes[ld]); + current = current / mts_[axes_->get_id(i, ld)]; } /* sanity check */ unsigned effective_num_threads = 1; for(size_t d = 0; d < shapes.size(); d++) - effective_num_threads *= mts_[axes_->get(i, d)]; + effective_num_threads *= mts_[axes_->get_id(i, d)]; if(num_threads != effective_num_threads) throw std::runtime_error("cannot create a kernel with this amount of warps"); } diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index c6592a59c..d89a4e1c5 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -615,7 +615,7 @@ void selection::init_strided_scan_axes(ir::value *v, IRBuilder<> &builder, Value unsigned offset = n / contiguous[k] * per_block + n % contiguous[k]; idx_list[n] = builder.CreateAdd(scaled_thread_id, builder.getInt32(offset), "idx_"
+ str_k + "_" + std::to_string(n)); } - axes_[a_axes_->get(v, k)] = distributed_axis{contiguous[k], idx_list, thread_id}; + axes_[a_axes_->get_id(v, k)] = distributed_axis{contiguous[k], idx_list, thread_id}; } } @@ -720,10 +720,10 @@ void selection::init_hmma_axes(ir::value *v, IRBuilder<> &builder, Value *u_thre /* axes */ - axes_[a_axes_->get(v, 0)] = distributed_axis{1, idx_i, warp_id_0}; - axes_[a_axes_->get(v, 1)] = distributed_axis{1, idx_j, warp_id_1}; + axes_[a_axes_->get_id(v, 0)] = distributed_axis{1, idx_i, warp_id_0}; + axes_[a_axes_->get_id(v, 1)] = distributed_axis{1, idx_j, warp_id_1}; if(is_batched) - axes_[a_axes_->get(v, 2)] = distributed_axis{1, idx_z, warp_id_2}; + axes_[a_axes_->get_id(v, 2)] = distributed_axis{1, idx_z, warp_id_2}; } @@ -791,7 +791,7 @@ void selection::create_distributed_tile(ir::value *v, IRBuilder<> &builder) { std::vector axes(shapes.size()); for(size_t d = 0; d < shapes.size(); d++){ if(shapes[d] > 1){ - unsigned x = a_axes_->get(v, d); + unsigned x = a_axes_->get_id(v, d); axes[d] = axes_.at(x); } else{ @@ -942,7 +942,7 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, Value *base_ptr = builder.CreateBitCast(sh_mem_ptr_, PointerType::get(res_ty, addr_space)); for(auto& x: partial) { // current element being computed - Value *lane = axes_.at(a_axes_->get(op, axis)).thread_id; + Value *lane = axes_.at(a_axes_->get_id(op, axis)).thread_id; Value *&result = x.second; indices_t write_idx = x.first; write_idx.insert(write_idx.begin() + axis, lane); diff --git a/lib/codegen/target.cc b/lib/codegen/target.cc index 4116bcca7..f63b4b899 100644 --- a/lib/codegen/target.cc +++ b/lib/codegen/target.cc @@ -103,39 +103,9 @@ Value* nvidia_cu_target::get_block_id(Module *module, IRBuilder<>& builder, unsi Intrinsic::nvvm_read_ptx_sreg_ctaid_y, Intrinsic::nvvm_read_ptx_sreg_ctaid_z }; -// bool z_order = true; -// if(z_order && ax < 2){ -// static std::array n_cta_ids = { -// Intrinsic::nvvm_read_ptx_sreg_nctaid_x, -// Intrinsic::nvvm_read_ptx_sreg_nctaid_y, -// Intrinsic::nvvm_read_ptx_sreg_nctaid_z -// }; -// Value* cta_id_0 = builder.CreateIntrinsic(cta_ids[0], {}, {}); -// Value* cta_id_1 = builder.CreateIntrinsic(cta_ids[1], {}, {}); -// Value* n_cta_id_0 = builder.CreateIntrinsic(n_cta_ids[0], {}, {}); -// Value* n_cta_id_1 = builder.CreateIntrinsic(n_cta_ids[1], {}, {}); -// // global block ID -// Value* bid = builder.CreateAdd(cta_id_0, builder.CreateMul(cta_id_1, n_cta_id_0)); -// // helper for minimum -// auto Min = [&](Value *x, Value *y){ -// return builder.CreateSelect(builder.CreateICmpSGE(x, y), y, x); -// }; -// // super-tile size -// Value* sts = Min(builder.getInt32(16), n_cta_id_1); -// // number of CTAs per super-block -// Value *nscta = builder.CreateMul(n_cta_id_0, sts); -// Value *bid0 = builder.CreateURem(builder.CreateUDiv(bid, sts), n_cta_id_0); -// Value *bid1 = builder.CreateAdd(builder.CreateMul(builder.CreateUDiv(bid, nscta), sts),builder.CreateURem(bid, sts)); -// if(ax == 0) -// return bid0; -// else -// return bid1; -// } -// else{ - Value* get_cta_id = Intrinsic::getDeclaration(module, cta_ids[ax]); - Value* cta_id = builder.CreateCall(get_cta_id, {}); - return cta_id; -// } + Value* get_cta_id = Intrinsic::getDeclaration(module, cta_ids[ax]); + Value* cta_id = builder.CreateCall(get_cta_id, {}); + return cta_id; } Value* nvidia_cu_target::get_local_id(Module *module, IRBuilder<>& builder, unsigned ax) { diff --git a/lib/codegen/transform/coalesce.cc b/lib/codegen/transform/coalesce.cc index 
873f7a9f5..455f2fb5d 100644 --- a/lib/codegen/transform/coalesce.cc +++ b/lib/codegen/transform/coalesce.cc @@ -2,7 +2,7 @@ #include #include #include "triton/ir/function.h" -#include "triton/ir/cfg.h" +#include "triton/ir/utils.h" #include "triton/ir/basic_block.h" #include "triton/ir/instructions.h" #include "triton/ir/module.h" diff --git a/lib/codegen/transform/dce.cc b/lib/codegen/transform/dce.cc index a1b5880c5..18406b4ab 100644 --- a/lib/codegen/transform/dce.cc +++ b/lib/codegen/transform/dce.cc @@ -1,7 +1,7 @@ #include "triton/ir/function.h" #include "triton/ir/basic_block.h" #include "triton/ir/module.h" -#include "triton/ir/cfg.h" +#include "triton/ir/utils.h" #include "triton/codegen/transform/dce.h" #include diff --git a/lib/codegen/transform/membar.cc b/lib/codegen/transform/membar.cc index fc6891ea8..b8b029d9a 100644 --- a/lib/codegen/transform/membar.cc +++ b/lib/codegen/transform/membar.cc @@ -9,7 +9,7 @@ #include "triton/ir/function.h" #include "triton/ir/basic_block.h" #include "triton/ir/instructions.h" -#include "triton/ir/cfg.h" +#include "triton/ir/utils.h" namespace triton { diff --git a/lib/codegen/transform/reassociate.cc b/lib/codegen/transform/reassociate.cc index 8ca89cda2..38e8c79ed 100644 --- a/lib/codegen/transform/reassociate.cc +++ b/lib/codegen/transform/reassociate.cc @@ -6,7 +6,7 @@ #include "triton/ir/function.h" #include "triton/ir/basic_block.h" #include "triton/ir/instructions.h" -#include "triton/ir/cfg.h" +#include "triton/ir/utils.h" namespace triton { namespace codegen{ diff --git a/lib/codegen/transform/vectorize.cc b/lib/codegen/transform/vectorize.cc deleted file mode 100644 index ef120f903..000000000 --- a/lib/codegen/transform/vectorize.cc +++ /dev/null @@ -1,41 +0,0 @@ -#include "triton/codegen/transform/vectorize.h" -#include "triton/codegen/analysis/tiles.h" -#include "triton/ir/module.h" -#include "triton/ir/function.h" -#include "triton/ir/basic_block.h" -#include "triton/ir/instructions.h" - -namespace triton { - -namespace codegen{ -namespace transform{ - -void vectorize::run(ir::module &mod) { - ir::builder &builder = mod.get_builder(); - for(ir::function *fn: mod.get_function_list()) - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *i: block->get_inst_list()){ - if(auto *trans = dynamic_cast(i)){ - ir::value *x = i->get_operand(0); - if(trans->get_perm()[0]->get_value() != 0) - continue; - builder.set_insert_point(i); - ir::instruction *rx = (ir::instruction*)builder.create_vectorize(x); - x->replace_all_uses_with(rx); - rx->set_operand(0, x); - } - if(dynamic_cast(i)){ - ir::value *x = i->get_operand(0); - if(params_->nts(x, 0) == 1) - continue; - builder.set_insert_point(i); - ir::instruction *rx = (ir::instruction*)builder.create_vectorize(x); - x->replace_all_uses_with(rx); - rx->set_operand(0, x); - } - } -} - -} -} -} diff --git a/lib/driver/module.cc b/lib/driver/module.cc index 0bf85c84f..66c775ac6 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -241,6 +241,7 @@ std::string cu_module::compile_llvm_module(std::unique_ptr module, cu_module::cu_module(driver::context * context, std::unique_ptr ll_module): cu_module(context, compile_llvm_module(std::move(ll_module), context->device())) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ +// std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, 
CU_JIT_ERROR_LOG_BUFFER}; diff --git a/lib/ir/builder.cc b/lib/ir/builder.cc index 458365a60..00450b547 100644 --- a/lib/ir/builder.cc +++ b/lib/ir/builder.cc @@ -252,11 +252,11 @@ DEFINE_FCMP_INSTR(ONE, cmp_pred_t::FCMP_ONE) //===----------------------------------------------------------------------===// value *builder::create_load(value *ptr, const std::string &name){ - return insert(load_inst::create(ptr, name)); + return insert(unmasked_load_inst::create(ptr, name)); } value *builder::create_store(value *ptr, value *val, const std::string &name){ - return insert(store_inst::create(ptr, val, name)); + return insert(unmasked_store_inst::create(ptr, val, name)); } value *builder::create_masked_load(value *ptr, value *mask, value *false_value, const std::string &name){ @@ -340,10 +340,6 @@ value *builder::create_copy_to_shared(value *arg, const std::string &name) { return insert(copy_to_shared_inst::create(arg, name)); } -value *builder::create_vectorize(value *arg, const std::string &name) { - return insert(vectorize_inst::create(arg, name)); -} - value *builder::create_barrier(const std::string &name) { return insert(barrier_inst::create(ctx_, name)); } diff --git a/lib/ir/cfg.cc b/lib/ir/cfg.cc deleted file mode 100644 index 5b19849d4..000000000 --- a/lib/ir/cfg.cc +++ /dev/null @@ -1,31 +0,0 @@ -#include -#include "triton/ir/cfg.h" -#include "triton/ir/basic_block.h" -#include "triton/ir/function.h" - -namespace triton{ -namespace ir{ - -std::vector cfg::reverse_post_order(function* fn) { - std::stack stack; - std::set visited; - std::vector result; - // initialize stack - for(ir::basic_block* block: fn->blocks()) - if(block->get_predecessors().empty()) - stack.push(block); - // DFS - while(!stack.empty()) { - basic_block* current = stack.top(); - stack.pop(); - result.push_back(current); - visited.insert(current); - for(basic_block* succ: current->get_successors()) - if(visited.find(succ) == visited.end()) - stack.push(succ); - } - return std::move(result); -} - -} -} diff --git a/lib/ir/instructions.cc b/lib/ir/instructions.cc index acecc08b5..e89367536 100644 --- a/lib/ir/instructions.cc +++ b/lib/ir/instructions.cc @@ -12,8 +12,9 @@ namespace ir{ // instruction classes //===----------------------------------------------------------------------===// -instruction::instruction(type *ty, unsigned num_ops, unsigned num_results, const std::string &name, instruction *next) - : user(ty, num_ops, name) { +instruction::instruction(type *ty, value_id_t ity, unsigned num_ops, + const std::string &name, instruction *next) + : user(ty, num_ops, name), id_(ity) { if(next){ basic_block *block = next->get_parent(); assert(block && "Next instruction is not in a basic block!"); @@ -35,17 +36,12 @@ bool instruction::has_tile_result_or_op() { return result; } - -// result reference -result_reference::result_reference(instruction *ref, unsigned arg_id, const std::string &name) - : value(ref->get_type(), name), arg_id_(arg_id){ } - //===----------------------------------------------------------------------===// // phi_node classes //===----------------------------------------------------------------------===// phi_node::phi_node(type *ty, unsigned num_reserved, std::string const &name, instruction *next) - : instruction(ty, 0, 1, name, next) { + : instruction(ty, INST_PHI, 0, name, next) { blocks_.reserve(num_reserved); } @@ -131,7 +127,7 @@ bool binary_operator::is_int_add_sub() const { binary_operator::binary_operator(binary_op_t op, value *lhs, value *rhs, type *ty, const std::string &name, 
instruction *next) - : instruction(ty, 2, 1, name, next), op_(op){ + : instruction(ty, INST_BINOP, 2, name, next), op_(op){ set_operand(0, lhs); set_operand(1, rhs); } @@ -164,6 +160,8 @@ binary_operator *binary_operator::create_not(value *arg, const std::string &name // cmp_inst classes //===----------------------------------------------------------------------===// + + // cmp_inst std::string cmp_inst::repr_impl() const { switch (pred_) { @@ -197,8 +195,8 @@ std::string cmp_inst::repr_impl() const { } } -cmp_inst::cmp_inst(type *ty, cmp_pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next) - : instruction(ty, 2, 1, name, next), pred_(pred) { +cmp_inst::cmp_inst(type *ty, value_id_t id, cmp_pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next) + : instruction(ty, id, 2, name, next), pred_(pred) { set_operand(0, lhs); set_operand(1, rhs); } @@ -219,7 +217,12 @@ bool cmp_inst::is_int_predicate(cmp_pred_t pred) { return pred >= FIRST_ICMP_PREDICATE && pred <= LAST_ICMP_PREDICATE; } + // icmp_inst +icmp_inst::icmp_inst(type *ty, cmp_pred_t pred, + value *lhs, value *rhs, const std::string &name, instruction *next) + : cmp_inst(ty, INST_ICMP, pred, lhs, rhs, name, next){ } + icmp_inst* icmp_inst::create(cmp_pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next){ assert(is_int_predicate(pred)); type *res_ty = make_cmp_result_type(lhs->get_type()); @@ -227,6 +230,10 @@ icmp_inst* icmp_inst::create(cmp_pred_t pred, value *lhs, value *rhs, const std: } // fcmp_inst +fcmp_inst::fcmp_inst(type *ty, cmp_pred_t pred, + value *lhs, value *rhs, const std::string &name, instruction *next) + : cmp_inst(ty, INST_FCMP, pred, lhs, rhs, name, next){ } + fcmp_inst* fcmp_inst::create(cmp_pred_t pred, value *lhs, value *rhs, const std::string &name, instruction *next){ assert(is_fp_predicate(pred)); type *res_ty = make_cmp_result_type(lhs->get_type()); @@ -237,8 +244,8 @@ fcmp_inst* fcmp_inst::create(cmp_pred_t pred, value *lhs, value *rhs, const std: // unary_inst classes //===----------------------------------------------------------------------===// -unary_inst::unary_inst(type *ty, value *v, const std::string &name, instruction *next) - : instruction(ty, 1, 1, name, next) { +unary_inst::unary_inst(type *ty, value_id_t id, value *v, const std::string &name, instruction *next) + : instruction(ty, id, 1, name, next) { set_operand(0, v); } @@ -309,7 +316,7 @@ cast_inst *cast_inst::create_integer_cast(value *arg, type *ty, bool is_signed, // return_inst return_inst::return_inst(context &ctx, value *ret_val, instruction *next) - : terminator_inst(type::get_void_ty(ctx), ret_val!=nullptr, 0, "", next){ + : terminator_inst(type::get_void_ty(ctx), INST_RETURN, ret_val!=nullptr, "", next){ if(ret_val) set_operand(0, ret_val); } @@ -332,13 +339,13 @@ branch_inst* branch_inst::create(value *cond, basic_block *if_dst, basic_block * // uncond_branch_inst uncond_branch_inst::uncond_branch_inst(basic_block *dst, instruction *next) - : branch_inst(type::get_void_ty(dst->get_context()), 1, 0, "", next){ + : branch_inst(type::get_void_ty(dst->get_context()), INST_UNCOND_BRANCH, 1, "", next){ set_operand(0, dst); } // cond_branch_inst cond_branch_inst::cond_branch_inst(basic_block *if_dst, basic_block *else_dst, value *cond, instruction *next) - : branch_inst(type::get_void_ty(if_dst->get_context()), 3, 0, "", next){ + : branch_inst(type::get_void_ty(if_dst->get_context()), INST_COND_BRANCH, 3, "", next){ assert(cond->get_type()->is_integer_ty(1) && 
"May only branch on boolean predicates!"); set_operand(0, if_dst); set_operand(1, else_dst); @@ -351,7 +358,7 @@ cond_branch_inst::cond_branch_inst(basic_block *if_dst, basic_block *else_dst, v //===----------------------------------------------------------------------===// getelementptr_inst::getelementptr_inst(type *pointee_ty, value *ptr, const std::vector &idx, const std::string &name, instruction *next) - : instruction(get_return_type(pointee_ty, ptr, idx), 1 + idx.size(), 1, name, next), + : instruction(get_return_type(pointee_ty, ptr, idx), INST_GETELEMENTPTR, 1 + idx.size(), name, next), source_elt_ty(pointee_ty), res_elt_ty(get_indexed_type(pointee_ty, idx)){ // sanity check @@ -414,8 +421,13 @@ getelementptr_inst *getelementptr_inst::create(value *ptr, const std::vectorget_type()), id, num_ops, name, next) { } // load @@ -427,19 +439,21 @@ type *load_inst::get_pointee_type(type *ty) { return pointee_ty; } -load_inst::load_inst(value *ptr, unsigned num_extra_ops, const std::string &name, instruction *next) - : io_inst(get_pointee_type(ptr->get_type()), 1 + num_extra_ops, 1, name, next) { +// unmasked_load +unmasked_load_inst::unmasked_load_inst(value *ptr, const std::string &name, instruction *next) + : load_inst(ptr, INST_UNMASKED_LOAD, 1, name, next) { set_operand(0, ptr); } -load_inst* load_inst::create(value *ptr, const std::string &name, instruction *next) { - return new load_inst(ptr, 0, name, next); +unmasked_load_inst* unmasked_load_inst::create(value *ptr, const std::string &name, instruction *next) { + return new unmasked_load_inst(ptr, name, next); } // masked load masked_load_inst::masked_load_inst(value *ptr, value *mask, value *false_value, const std::string &name, instruction *next) - : load_inst(ptr, 2, name, next) { + : load_inst(ptr, INST_MASKED_LOAD, 3, name, next) { + set_operand(0, ptr); set_operand(1, mask); set_operand(2, false_value); } @@ -450,23 +464,29 @@ masked_load_inst* masked_load_inst::create(value *ptr, value *mask, value *false } -// store -store_inst::store_inst(value *ptr, value *val, unsigned num_extra_ops, - const std::string &name, instruction *next) - : io_inst(type::get_void_ty(ptr->get_type()->get_context()), 2 + num_extra_ops, 1, name, next) { +store_inst::store_inst(value *ptr, value_id_t id, unsigned num_ops, const std::string &name, instruction *next) + : io_inst(type::get_void_ty(ptr->get_type()->get_context()), id, num_ops, name, next) +{ } + +// unmasked_store +unmasked_store_inst::unmasked_store_inst(value *ptr, value *val, + const std::string &name, instruction *next) + : store_inst(ptr, INST_UNMASKED_STORE, 2, name, next) { set_operand(0, ptr); set_operand(1, val); } -store_inst* store_inst::create(value *ptr, value *val, - const std::string &name, instruction *next) { - return new store_inst(ptr, val, 0, name, next); +unmasked_store_inst* unmasked_store_inst::create(value *ptr, value *val, + const std::string &name, instruction *next) { + return new unmasked_store_inst(ptr, val, name, next); } // masked store masked_store_inst::masked_store_inst(value *ptr, value *val, value *mask, const std::string &name, instruction *next) - : store_inst(ptr, val, 1, name, next) { + : store_inst(ptr, INST_MASKED_STORE, 3, name, next) { + set_operand(0, ptr); + set_operand(1, val); set_operand(2, mask); } @@ -477,15 +497,16 @@ masked_store_inst* masked_store_inst::create(value *ptr, value *val, value *mask // retile_inst classes //===----------------------------------------------------------------------===// -retile_inst::retile_inst(value 
*arg, const type::tile_shapes_t &shapes, +retile_inst::retile_inst(value *arg, value_id_t id, const type::tile_shapes_t &shapes, const std::string &name, instruction *next) - : unary_inst(tile_type::get(arg->get_type()->get_scalar_ty(), shapes), arg, name, next) { } + : unary_inst(tile_type::get(arg->get_type()->get_scalar_ty(), shapes), id, arg, name, next) { } + // reshape instruction* reshape_inst::create(value *arg, const type::tile_shapes_t &shapes, const std::string &name, instruction *next) { - return new reshape_inst(arg, shapes, name, next); + return new reshape_inst(arg, INST_RESHAPE, shapes, name, next); } @@ -493,20 +514,20 @@ instruction* reshape_inst::create(value *arg, const type::tile_shapes_t &shapes, instruction* splat_inst::create(value *arg, const type::tile_shapes_t &shapes, const std::string &name, instruction *next) { - return new splat_inst(arg, shapes, name, next); + return new splat_inst(arg, INST_SPLAT, shapes, name, next); } // broadcast instruction* broadcast_inst::create(value *arg, const type::tile_shapes_t &shapes, const std::string &name, instruction *next) { - return new broadcast_inst(arg, shapes, name, next); + return new broadcast_inst(arg, INST_BROADCAST, shapes, name, next); } // downcast instruction* downcast_inst::create(value *arg, const std::string &name, instruction *next) { - return new downcast_inst(arg->get_type()->get_scalar_ty(), arg, name, next); + return new downcast_inst(arg->get_type()->get_scalar_ty(), INST_DOWNCAST, arg, name, next); } //===----------------------------------------------------------------------===// @@ -515,7 +536,7 @@ instruction* downcast_inst::create(value *arg, const std::string &name, instruct dot_inst::dot_inst(value *A, value *B, value *C, TransT AT, TransT BT, const std::string &name, instruction *next) - : builtin_inst(C->get_type(), 3, 1, name, next), AT_(AT), BT_(BT) { + : builtin_inst(C->get_type(), INST_DOT, 3, name, next), AT_(AT), BT_(BT) { set_operand(0, A); set_operand(1, B); set_operand(2, C); @@ -578,7 +599,7 @@ std::vector trans_inst::init_perm(ir::type* ty, const std::vector } trans_inst::trans_inst(value *arg, const std::vector& perm, const std::string &name, instruction *next) - : builtin_inst(get_res_ty(arg->get_type(), perm), 1, 1, name, next) { + : builtin_inst(get_res_ty(arg->get_type(), perm), INST_TRANS, 1, name, next) { // sanity check perm_ = init_perm(arg->get_type(), perm); //auto size = arg->get_type()->get_tile_shapes().size(); @@ -599,7 +620,7 @@ const std::vector trans_inst::get_perm() const { //===----------------------------------------------------------------------===// sqrt_inst::sqrt_inst(value *arg, const std::string &name, instruction *next) - : builtin_inst(arg->get_type(), 1, 1, name, next){ + : builtin_inst(arg->get_type(), INST_SQRT, 1, name, next){ set_operand(0, arg); } @@ -621,7 +642,7 @@ type* reduce_inst::get_res_type(value *arg, unsigned axis) { } reduce_inst::reduce_inst(value *arg, unsigned axis, const std::string &name, instruction *next) - : builtin_inst(get_res_type(arg, axis), 1, 1, name, next), + : builtin_inst(get_res_type(arg, axis), INST_REDUCE, 1, name, next), axis_(axis){ set_operand(0, arg); } @@ -636,7 +657,7 @@ instruction* reduce_inst::create(value *arg, unsigned axis, const std::string &n //===----------------------------------------------------------------------===// select_inst::select_inst(value *pred, value *if_value, value *else_value, const std::string &name, instruction *next) - : builtin_inst(if_value->get_type(), 3, 1, name, next){ + : 
builtin_inst(if_value->get_type(), INST_SELECT, 3, name, next){ set_operand(0, pred); set_operand(1, if_value); set_operand(2, else_value); @@ -652,7 +673,7 @@ instruction* select_inst::create(value *pred, value *if_value, value *else_value // get_program_id get_program_id_inst::get_program_id_inst(type *ty, unsigned axis, const std::string &name, instruction *next) - : builtin_inst(ty, 0, 1, name, next), axis_(axis){ + : builtin_inst(ty, INST_GET_PROGRAM_ID, 0, name, next), axis_(axis){ } @@ -662,7 +683,7 @@ instruction* get_program_id_inst::create(context &ctx, unsigned axis, const std: // get_num_program get_num_program_inst::get_num_program_inst(type *ty, unsigned axis, const std::string &name, instruction *next) - : builtin_inst(ty, 0, 1, name, next), axis_(axis){ + : builtin_inst(ty, INST_GET_NUM_PROGRAMS, 0, name, next), axis_(axis){ } @@ -674,7 +695,7 @@ instruction* get_num_program_inst::create(context &ctx, unsigned axis, const std // atomic cas atomic_cas_inst::atomic_cas_inst(value *ptr, value *cmp, value *val, const std::string &name, instruction *next) - : builtin_inst(ptr->get_type()->get_pointer_element_ty(), 3, 1, name, next) { + : builtin_inst(ptr->get_type()->get_pointer_element_ty(), INST_ATOMIC_CAS, 3, name, next) { set_operand(0, ptr); set_operand(1, cmp); set_operand(2, val); @@ -687,7 +708,7 @@ instruction* atomic_cas_inst::create(value *ptr, value *cmp, value *val, const s // atomic exch atomic_exch_inst::atomic_exch_inst(value *ptr, value *val, const std::string &name, instruction *next) - : builtin_inst(ptr->get_type()->get_pointer_element_ty(), 2, 1, name, next) { + : builtin_inst(ptr->get_type()->get_pointer_element_ty(), INST_ATOMIC_EXCH, 2, name, next) { set_operand(0, ptr); set_operand(1, val); } @@ -699,7 +720,7 @@ instruction* atomic_exch_inst::create(value *ptr, value *val, const std::string // atomic add atomic_add_inst::atomic_add_inst(value *ptr, value *val, const std::string &name, instruction *next) - : builtin_inst(ptr->get_type()->get_pointer_element_ty(), 2, 1, name, next) { + : builtin_inst(ptr->get_type()->get_pointer_element_ty(), INST_ATOMIC_ADD, 2, name, next) { set_operand(0, ptr); set_operand(1, val); } @@ -714,18 +735,13 @@ instruction* atomic_add_inst::create(value *ptr, value *val, const std::string & // copy to shared copy_to_shared_inst* copy_to_shared_inst::create(value *arg, const std::string &name, instruction *next) { - return new copy_to_shared_inst(arg->get_type(), arg, name, next); -} - -// vectorize -vectorize_inst* vectorize_inst::create(value *arg, const std::string &name, instruction *next) { - return new vectorize_inst(arg->get_type(), arg, name, next); + return new copy_to_shared_inst(arg->get_type(), INST_COPY_TO_SHARED, arg, name, next); } // barrier barrier_inst::barrier_inst(context &ctx, const std::string &name, instruction *next) - : instruction(type::get_void_ty(ctx), 0, 0, name, next) { } + : instruction(type::get_void_ty(ctx), INST_BARRIER, 0, name, next) { } barrier_inst* barrier_inst::create(context &ctx, const std::string &name, instruction *next) { return new barrier_inst(ctx, name, next); @@ -734,7 +750,7 @@ barrier_inst* barrier_inst::create(context &ctx, const std::string &name, instru // nv_dynamic_program_idx make_range_dyn::make_range_dyn(type *ty, const std::string &name, instruction *next) - : instruction(ty, 0, 1, name, next) { } + : instruction(ty, INST_MAKE_RANGE_DYN, 0, name, next) { } make_range_dyn* make_range_dyn::create(type *ty, const std::string &name, instruction *next) { return new 
make_range_dyn(ty, name, next); @@ -757,7 +773,7 @@ make_range_sta* make_range_sta::get(make_range* range) { // make_range make_range::make_range(type *ty, constant_int *first, constant_int *last) - : instruction(ty, 0), first_(first), last_(last){ } + : instruction(ty, INST_MAKE_RANGE, 0), first_(first), last_(last){ } make_range *make_range::create(constant_int *first, constant_int *last) { assert(first->get_type()->is_integer_ty()); diff --git a/lib/ir/utils.cc b/lib/ir/utils.cc new file mode 100644 index 000000000..7baf5df14 --- /dev/null +++ b/lib/ir/utils.cc @@ -0,0 +1,54 @@ +#include +#include +#include "triton/ir/utils.h" +#include "triton/ir/basic_block.h" +#include "triton/ir/function.h" +#include "triton/ir/module.h" + +namespace triton{ +namespace ir{ + +std::vector cfg::reverse_post_order(function* fn) { + std::stack stack; + std::set visited; + std::vector result; + // initialize stack + for(ir::basic_block* block: fn->blocks()) + if(block->get_predecessors().empty()) + stack.push(block); + // DFS + while(!stack.empty()) { + basic_block* current = stack.top(); + stack.pop(); + result.push_back(current); + visited.insert(current); + for(basic_block* succ: current->get_successors()) + if(visited.find(succ) == visited.end()) + stack.push(succ); + } + return std::move(result); +} + +void for_each_instruction(module &mod, const std::function &do_work) { + for(ir::function *fn: mod.get_function_list()) + for(ir::basic_block *block: cfg::reverse_post_order(fn)) + for(ir::instruction *i: block->get_inst_list()) + do_work(i); +} + +void for_each_value(module &mod, const std::function &do_work) { + std::set seen; + for(ir::function *fn: mod.get_function_list()) + for(ir::basic_block *block: cfg::reverse_post_order(fn)) + for(ir::instruction *i: block->get_inst_list()){ + for(ir::value *op: i->ops()){ + if(seen.insert(op).second) + do_work(op); + } + if(seen.insert(i).second) + do_work(i); + } +} + +} +} From 43d88154bda4e49de807f8e1ba4cf8aebf93581d Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 20 Sep 2019 16:01:12 -0400 Subject: [PATCH 412/494] [codegen] cleaning-up / formalizing shared-memory passes --- .../analysis/{memalloc.h => allocation.h} | 8 +- include/triton/codegen/analysis/axes.h | 1 - include/triton/codegen/analysis/liveness.h | 6 +- include/triton/codegen/instructions.h | 78 ++++++++++++++++++ include/triton/codegen/pass.h | 30 +++++++ include/triton/codegen/selection.h | 12 +-- include/triton/codegen/transform/coalesce.h | 6 +- .../{analysis/meminfo.h => transform/cts.h} | 4 +- include/triton/codegen/transform/membar.h | 10 +-- include/triton/ir/builder.h | 1 + include/triton/runtime/function.h | 4 +- lib/codegen/analysis/align.cc | 1 - .../analysis/{memalloc.cc => allocation.cc} | 19 +++-- lib/codegen/analysis/axes.cc | 20 ++--- lib/codegen/analysis/layout.cc | 5 +- lib/codegen/analysis/liveness.cc | 8 +- lib/codegen/instructions.cc | 0 lib/codegen/pass.cc | 0 lib/codegen/selection.cc | 2 +- lib/codegen/transform/coalesce.cc | 7 +- .../{analysis/meminfo.cc => transform/cts.cc} | 80 ++++++++----------- lib/codegen/transform/dce.cc | 22 +++-- lib/codegen/transform/membar.cc | 4 +- lib/codegen/transform/reassociate.cc | 6 +- lib/ir/builder.cc | 7 ++ lib/runtime/function.cc | 5 +- 26 files changed, 229 insertions(+), 117 deletions(-) rename include/triton/codegen/analysis/{memalloc.h => allocation.h} (87%) create mode 100644 include/triton/codegen/instructions.h create mode 100644 include/triton/codegen/pass.h rename include/triton/codegen/{analysis/meminfo.h => 
transform/cts.h} (85%) rename lib/codegen/analysis/{memalloc.cc => allocation.cc} (92%) create mode 100644 lib/codegen/instructions.cc create mode 100644 lib/codegen/pass.cc rename lib/codegen/{analysis/meminfo.cc => transform/cts.cc} (64%) diff --git a/include/triton/codegen/analysis/memalloc.h b/include/triton/codegen/analysis/allocation.h similarity index 87% rename from include/triton/codegen/analysis/memalloc.h rename to include/triton/codegen/analysis/allocation.h index f50d00b22..a43e93031 100644 --- a/include/triton/codegen/analysis/memalloc.h +++ b/include/triton/codegen/analysis/allocation.h @@ -18,11 +18,11 @@ namespace analysis{ class tiles; class liveness; -class meminfo; +class cts; -class memalloc { +class allocation { public: - memalloc(liveness *live, meminfo *buffer_info, tiles *params) + allocation(liveness *live, cts *buffer_info, tiles *params) : liveness_(live), buffer_info_(buffer_info), tiles_(params){ } // utilities unsigned num_bytes(ir::value *x); @@ -39,7 +39,7 @@ private: size_t allocated_size_; // dependences liveness *liveness_; - meminfo *buffer_info_; + cts *buffer_info_; tiles *tiles_; }; diff --git a/include/triton/codegen/analysis/axes.h b/include/triton/codegen/analysis/axes.h index d22fa5fa8..625d414c6 100644 --- a/include/triton/codegen/analysis/axes.h +++ b/include/triton/codegen/analysis/axes.h @@ -27,7 +27,6 @@ private: void update_graph_store(ir::instruction *i); void update_graph_reduce(ir::instruction *i); void update_graph_reshape(ir::instruction *i); - void update_graph_splat(ir::instruction *i); void update_graph_trans(ir::instruction *i); void update_graph_broadcast(ir::instruction *i); void update_graph_dot(ir::instruction *i); diff --git a/include/triton/codegen/analysis/liveness.h b/include/triton/codegen/analysis/liveness.h index 4b863ff55..df951161c 100644 --- a/include/triton/codegen/analysis/liveness.h +++ b/include/triton/codegen/analysis/liveness.h @@ -16,7 +16,7 @@ namespace analysis{ typedef unsigned slot_index; -class meminfo; +class cts; struct segment { slot_index start; @@ -44,7 +44,7 @@ public: public: // constructor - liveness(meminfo *info): info_(info){ } + liveness(cts *info): info_(info){ } // accessors const intervals_map_t& intervals() const { return intervals_; } segment get_interval(ir::value* v) const { return intervals_.at(v); } @@ -52,7 +52,7 @@ public: void run(ir::module &mod); private: - meminfo *info_; + cts *info_; has_storage_map_t has_dedicated_storage_; indices_map_t indices_; intervals_map_t intervals_; diff --git a/include/triton/codegen/instructions.h b/include/triton/codegen/instructions.h new file mode 100644 index 000000000..cecd716e0 --- /dev/null +++ b/include/triton/codegen/instructions.h @@ -0,0 +1,78 @@ +#ifndef _TRITON_CODEGEN_INSTRUCTIONS_H_ +#define _TRITON_CODEGEN_INSTRUCTIONS_H_ + +#include "triton/ir/enums.h" +#include +#include + +namespace triton{ +namespace codegen{ + + +enum storage_info_t { + NONE, + ANY, + SHARED, + DISTRIBUTED, + REPLICATED +}; + +typedef std::pair> inst_storage_info_t; +static const std::map storage_info = { + // scalars + { ir::INST_GET_PROGRAM_ID, {REPLICATED, {}}}, + { ir::INST_GET_NUM_PROGRAMS, {REPLICATED, {}}}, + // scalar/array + { ir::INST_PHI, {ANY, {ANY, ANY}}}, + { ir::INST_BINOP, {DISTRIBUTED, {DISTRIBUTED, DISTRIBUTED}}}, + { ir::INST_GETELEMENTPTR, {DISTRIBUTED, {DISTRIBUTED, DISTRIBUTED}}}, + { ir::INST_SELECT, {DISTRIBUTED, {DISTRIBUTED, DISTRIBUTED, DISTRIBUTED}}}, + { ir::INST_SQRT, {DISTRIBUTED, {DISTRIBUTED}}}, + // cmp + { ir::INST_ICMP, 
{DISTRIBUTED, {DISTRIBUTED, DISTRIBUTED}}}, + { ir::INST_FCMP, {DISTRIBUTED, {DISTRIBUTED, DISTRIBUTED}}}, + // cast + { ir::INST_CAST_TRUNC, {DISTRIBUTED, {DISTRIBUTED}}}, + { ir::INST_CAST_ZEXT, {DISTRIBUTED, {DISTRIBUTED}}}, + { ir::INST_CAST_SEXT, {DISTRIBUTED, {DISTRIBUTED}}}, + { ir::INST_CAST_FP_TRUNC, {DISTRIBUTED, {DISTRIBUTED}}}, + { ir::INST_CAST_FP_EXT, {DISTRIBUTED, {DISTRIBUTED}}}, + { ir::INST_CAST_UI_TO_FP, {DISTRIBUTED, {DISTRIBUTED}}}, + { ir::INST_CAST_SI_TO_FP, {DISTRIBUTED, {DISTRIBUTED}}}, + { ir::INST_CAST_FP_TO_UI, {DISTRIBUTED, {DISTRIBUTED}}}, + { ir::INST_CAST_FP_TO_SI, {DISTRIBUTED, {DISTRIBUTED}}}, + { ir::INST_CAST_PTR_TO_INT, {DISTRIBUTED, {DISTRIBUTED}}}, + { ir::INST_CAST_INT_TO_PTR, {DISTRIBUTED, {DISTRIBUTED}}}, + { ir::INST_CAST_BIT_CAST, {DISTRIBUTED, {DISTRIBUTED}}}, + { ir::INST_CAST_ADDR_SPACE_CAST, {DISTRIBUTED, {DISTRIBUTED}}}, + // io + { ir::INST_UNMASKED_LOAD, {DISTRIBUTED, {DISTRIBUTED}}}, + { ir::INST_MASKED_LOAD, {DISTRIBUTED, {DISTRIBUTED, DISTRIBUTED}}}, + { ir::INST_UNMASKED_STORE, {NONE , {DISTRIBUTED, DISTRIBUTED}}}, + { ir::INST_MASKED_STORE, {NONE , {DISTRIBUTED, DISTRIBUTED, DISTRIBUTED}}}, + // retile + { ir::INST_RESHAPE, {DISTRIBUTED, {DISTRIBUTED}}}, + { ir::INST_SPLAT, {DISTRIBUTED, {REPLICATED}}}, + { ir::INST_BROADCAST, {DISTRIBUTED, {REPLICATED}}}, + { ir::INST_DOWNCAST, {DISTRIBUTED, {REPLICATED}}}, + // array arithmetic + { ir::INST_TRANS, {SHARED, {DISTRIBUTED}}}, // TODO: not necessarily + { ir::INST_REDUCE, {SHARED, {DISTRIBUTED}}}, + { ir::INST_DOT, {DISTRIBUTED, {SHARED, SHARED, DISTRIBUTED}}}, + // terminator + { ir::INST_RETURN, {NONE, {}}}, + { ir::INST_UNCOND_BRANCH, {NONE, {}}}, + { ir::INST_COND_BRANCH, {NONE, {REPLICATED}}}, + + // intrinsics + { ir::INST_COPY_TO_SHARED, {SHARED, {DISTRIBUTED}}}, + { ir::INST_BARRIER, {NONE, {}}}, + { ir::INST_MAKE_RANGE_DYN, {DISTRIBUTED, {}}}, + { ir::INST_MAKE_RANGE_STA, {DISTRIBUTED, {}}}, + { ir::INST_MAKE_RANGE, {DISTRIBUTED, {}}} +}; + +} +} + +#endif diff --git a/include/triton/codegen/pass.h b/include/triton/codegen/pass.h new file mode 100644 index 000000000..129c02bc6 --- /dev/null +++ b/include/triton/codegen/pass.h @@ -0,0 +1,30 @@ +#ifndef _TRITON_CODEGEN_PASS_H_ +#define _TRITON_CODEGEN_PASS_H_ + +#include + +namespace triton{ + +namespace ir{ + class module; +} + +namespace codegen{ + +class pass { +public: + virtual void run(ir::module& m); +}; + + +class pass_manager { +public: + void add(pass* p); + void run(ir::module& m); + +private: + std::list passes; +}; + +} +} diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index 74a617af9..21bd83ee1 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -5,7 +5,7 @@ #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/type.h" -#include "triton/codegen/analysis/meminfo.h" +#include "triton/codegen/transform/cts.h" namespace llvm{ @@ -45,8 +45,8 @@ namespace codegen{ namespace analysis{ class tiles; class align; -class memalloc; -class meminfo; +class allocation; +class cts; class axes; class layout; } @@ -201,7 +201,7 @@ private: public: - selection(analysis::memalloc *alloc, analysis::tiles *tiles, analysis::meminfo *buffer_info, + selection(analysis::allocation *alloc, analysis::tiles *tiles, analysis::cts *buffer_info, analysis::align *alignment, analysis::axes *axes, analysis::layout *layouts, transform::coalesce* reorder, target *tgt, unsigned num_warps) : alloc_(alloc), tiles_(tiles), buffer_info_(buffer_info), @@ 
-213,11 +213,11 @@ public: private: vmap_t vmap_; tmap_t tmap_; - analysis::memalloc *alloc_; + analysis::allocation *alloc_; analysis::tiles *tiles_; analysis::axes *a_axes_; analysis::layout *layouts_; - analysis::meminfo *buffer_info_; + analysis::cts *buffer_info_; analysis::align *alignment_; transform::coalesce *reorder_; target *tgt_; diff --git a/include/triton/codegen/transform/coalesce.h b/include/triton/codegen/transform/coalesce.h index 3d418fdb5..680f1ccb2 100644 --- a/include/triton/codegen/transform/coalesce.h +++ b/include/triton/codegen/transform/coalesce.h @@ -20,7 +20,7 @@ namespace codegen{ namespace analysis{ class align; class layout; - class meminfo; + class cts; } namespace transform{ @@ -32,13 +32,13 @@ private: ir::value* rematerialize(ir::value *v, ir::builder& builder, std::map& seen); public: - coalesce(analysis::align* align, triton::codegen::analysis::layout *layouts, analysis::meminfo* mem); + coalesce(analysis::align* align, triton::codegen::analysis::layout *layouts, analysis::cts* mem); void run(ir::module &mod); private: analysis::align* align_; analysis::layout* layout_; - analysis::meminfo* mem_; + analysis::cts* mem_; }; } diff --git a/include/triton/codegen/analysis/meminfo.h b/include/triton/codegen/transform/cts.h similarity index 85% rename from include/triton/codegen/analysis/meminfo.h rename to include/triton/codegen/transform/cts.h index f4ad290a6..7b7237f7e 100644 --- a/include/triton/codegen/analysis/meminfo.h +++ b/include/triton/codegen/transform/cts.h @@ -16,7 +16,7 @@ namespace ir { namespace codegen{ namespace analysis{ -class meminfo { +class cts { public: void run(ir::module &mod); // queries @@ -25,8 +25,6 @@ public: bool is_shared(ir::value *x); bool is_loop_latch(ir::phi_node *phi, ir::instruction *terminator); ir::value *get_reference(ir::value *x); - void replace(ir::value* before, ir::value *after); - void copy(ir::value* y, ir::value *x); private: std::set shared_; diff --git a/include/triton/codegen/transform/membar.h b/include/triton/codegen/transform/membar.h index 8991ac57d..b4aebc2ce 100644 --- a/include/triton/codegen/transform/membar.h +++ b/include/triton/codegen/transform/membar.h @@ -15,8 +15,8 @@ namespace codegen{ namespace analysis{ -class memalloc; -class meminfo; +class allocation; +class cts; } @@ -38,12 +38,12 @@ private: std::pair transfer(ir::basic_block *block, const interval_vec_t &written_to, const interval_vec_t &read_from, std::set &insert_loc); public: - membar(analysis::memalloc *alloc, analysis::meminfo *buffer_info): alloc_(alloc), buffer_info_(buffer_info) {} + membar(analysis::allocation *alloc, analysis::cts *buffer_info): alloc_(alloc), buffer_info_(buffer_info) {} void run(ir::module &mod); private: - analysis::memalloc *alloc_; - analysis::meminfo *buffer_info_; + analysis::allocation *alloc_; + analysis::cts *buffer_info_; }; diff --git a/include/triton/ir/builder.h b/include/triton/ir/builder.h index d5707265a..5cf107be3 100644 --- a/include/triton/ir/builder.h +++ b/include/triton/ir/builder.h @@ -30,6 +30,7 @@ public: // Setters void set_insert_point(iterator instr); void set_insert_point(instruction* i); + void set_insert_point_after(instruction* i); void set_insert_point(basic_block* block); basic_block* get_insert_block() { return block_; } iterator get_insert_point() { return insert_point_;} diff --git a/include/triton/runtime/function.h b/include/triton/runtime/function.h index 0eaa9a33d..88de3825c 100644 --- a/include/triton/runtime/function.h +++ b/include/triton/runtime/function.h @@ 
-12,14 +12,14 @@ #include "triton/codegen/selection.h" #include "triton/codegen/target.h" #include "triton/codegen/analysis/tiles.h" -#include "triton/codegen/analysis/memalloc.h" +#include "triton/codegen/analysis/allocation.h" #include "triton/codegen/analysis/liveness.h" -#include "triton/codegen/analysis/meminfo.h" #include "triton/codegen/analysis/align.h" #include "triton/codegen/transform/dce.h" #include "triton/codegen/transform/peephole.h" #include "triton/codegen/transform/membar.h" #include "triton/codegen/transform/reassociate.h" +#include "triton/codegen/transform/cts.h" #include "triton/lang/parser.h" #include "triton/runtime/arg.h" diff --git a/lib/codegen/analysis/align.cc b/lib/codegen/analysis/align.cc index f84e8d692..ef57e7a4f 100644 --- a/lib/codegen/analysis/align.cc +++ b/lib/codegen/analysis/align.cc @@ -445,7 +445,6 @@ std::vector align::populate_starting_multiple_default(ir::value* v) { return add_to_cache(v, {1}, starting_multiple_); } - std::vector align::populate_starting_multiple(ir::value *v){ if(starting_multiple_.find(v) != starting_multiple_.end()) return starting_multiple_.at(v); diff --git a/lib/codegen/analysis/memalloc.cc b/lib/codegen/analysis/allocation.cc similarity index 92% rename from lib/codegen/analysis/memalloc.cc rename to lib/codegen/analysis/allocation.cc index 7f80824e3..b05b55a4d 100644 --- a/lib/codegen/analysis/memalloc.cc +++ b/lib/codegen/analysis/allocation.cc @@ -1,7 +1,7 @@ #include -#include "triton/codegen/analysis/memalloc.h" +#include "triton/codegen/analysis/allocation.h" #include "triton/codegen/analysis/liveness.h" -#include "triton/codegen/analysis/meminfo.h" +#include "triton/codegen/transform/cts.h" #include "triton/codegen/analysis/tiles.h" #include "triton/ir/basic_block.h" #include "triton/ir/type.h" @@ -13,7 +13,7 @@ namespace triton{ namespace codegen{ namespace analysis{ -unsigned memalloc::is_ld_padded(ir::value *x) { +unsigned allocation::is_ld_padded(ir::value *x) { if(auto *trans = dynamic_cast(x)){ if(trans->get_perm()[0]->get_value() != 0) return 4; @@ -45,7 +45,7 @@ unsigned memalloc::is_ld_padded(ir::value *x) { return 0; } -unsigned memalloc::num_bytes(ir::value *x) { +unsigned allocation::num_bytes(ir::value *x) { if(auto *red = dynamic_cast(x)){ unsigned num_bytes = x->get_type()->get_scalar_ty()->get_primitive_size_in_bits() / 8; size_t axis = red->get_axis(); @@ -73,15 +73,15 @@ unsigned memalloc::num_bytes(ir::value *x) { return num_bytes; } -void memalloc::run(){ + +void allocation::run() { using std::max; using std::min; typedef std::multimap triples_map_type; std::vector I; - for(auto x: liveness_->intervals()){ + for(auto x: liveness_->intervals()) I.push_back(x.first); - } std::vector J = I; triples_map_type H; @@ -137,7 +137,7 @@ void memalloc::run(){ for(ir::value *X: V) colors[X] = (X==V[0])?0:-1; - // First-fit coloring + // First-fit graph coloring std::vector available(V.size()); for(ir::value *x: V){ // Non-neighboring colors are available @@ -158,6 +158,7 @@ void memalloc::run(){ for(ir::value *y: interferences[x]) Adj = std::max(Adj, starts[y] + num_bytes(y)); offsets_[x] = starts[x] + colors[x] * Adj; +// std::cout << x->get_name() << " " << offsets_[x] << " " << num_bytes(x) << std::endl; if(buffer_info_->is_double(x)){ ir::phi_node *phi = (ir::phi_node*)x; for(unsigned i = 0; i < phi->get_num_incoming(); i++){ @@ -167,6 +168,8 @@ void memalloc::run(){ } } +// exit(EXIT_FAILURE); + // Save maximum size of induced memory space allocated_size_ = 0; for(auto &x: offsets_){ diff --git 
a/lib/codegen/analysis/axes.cc b/lib/codegen/analysis/axes.cc index 790c8a36b..2c152f439 100644 --- a/lib/codegen/analysis/axes.cc +++ b/lib/codegen/analysis/axes.cc @@ -68,11 +68,6 @@ void axes::update_graph_reshape(ir::instruction *i) { } } -void axes::update_graph_splat(ir::instruction *) { - // argument is scalar so don't make any edge - return; -} - void axes::update_graph_trans(ir::instruction *i) { auto *trans = static_cast(i); ir::value *op = trans->get_operand(0); @@ -129,13 +124,14 @@ void axes::update_graph_elementwise(ir::instruction *i) { void axes::update_graph(ir::instruction *i) { switch (i->get_id()) { - case ir::INST_REDUCE: return update_graph_reduce(i); - case ir::INST_RESHAPE: return update_graph_reshape(i); - case ir::INST_SPLAT: return update_graph_splat(i); - case ir::INST_TRANS: return update_graph_trans(i); - case ir::INST_BROADCAST: return update_graph_broadcast(i); - case ir::INST_DOT: return update_graph_dot(i); - default: return update_graph_elementwise(i); + case ir::INST_REDUCE: return update_graph_reduce(i); + case ir::INST_RESHAPE: return update_graph_reshape(i); + case ir::INST_SPLAT: return; + case ir::INST_TRANS: return update_graph_trans(i); + case ir::INST_BROADCAST: return update_graph_broadcast(i); + case ir::INST_DOT: return update_graph_dot(i); + case ir::INST_COPY_TO_SHARED: return; + default: return update_graph_elementwise(i); } return; } diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc index 77b25e0bb..40c8449ea 100644 --- a/lib/codegen/analysis/layout.cc +++ b/lib/codegen/analysis/layout.cc @@ -20,10 +20,9 @@ std::set layout::axes_of(ir::value *value) { rank = ty->get_tile_rank(); // create result std::set result; - for(size_t d = 0; d < rank; d++){ + for(size_t d = 0; d < rank; d++) if(axes_->has_id(value, d)) result.insert(axes_->get_id(value, d)); - } return result; } @@ -54,6 +53,7 @@ const std::vector& layout::values(unsigned id) const size_t layout::get_num_groups() const { return values_.size(); } +// connect two values void layout::connect(ir::value *x, ir::value *y) { if(x == y) return; @@ -75,6 +75,7 @@ void layout::connect(ir::value *x, ir::value *y) { } } +// make graph void layout::make_graph(ir::instruction *i) { for(ir::value* opx: i->ops()) for(ir::value* opy: i->ops()){ diff --git a/lib/codegen/analysis/liveness.cc b/lib/codegen/analysis/liveness.cc index 088691263..05d29032b 100644 --- a/lib/codegen/analysis/liveness.cc +++ b/lib/codegen/analysis/liveness.cc @@ -1,6 +1,7 @@ #include +#include "triton/codegen/instructions.h" #include "triton/codegen/analysis/liveness.h" -#include "triton/codegen/analysis/meminfo.h" +#include "triton/codegen/transform/cts.h" #include "triton/ir/basic_block.h" #include "triton/ir/function.h" #include "triton/ir/module.h" @@ -25,6 +26,11 @@ void liveness::run(ir::module &mod) { // Creates live intervals for(auto i: indices_){ ir::value *v = i.first; +// ir::instruction* instr = dynamic_cast(v); +// if(!instr) +// continue; +// if(storage_info.at(instr->get_id()).first != SHARED) +// continue; if(!info_->is_shared(v) || info_->get_reference(v)) continue; unsigned start = i.second; diff --git a/lib/codegen/instructions.cc b/lib/codegen/instructions.cc new file mode 100644 index 000000000..e69de29bb diff --git a/lib/codegen/pass.cc b/lib/codegen/pass.cc new file mode 100644 index 000000000..e69de29bb diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index d89a4e1c5..169283e7f 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -4,7 +4,7 
@@ #include "triton/codegen/analysis/layout.h" #include "triton/codegen/analysis/axes.h" #include "triton/codegen/analysis/tiles.h" -#include "triton/codegen/analysis/memalloc.h" +#include "triton/codegen/analysis/allocation.h" #include "triton/codegen/analysis/align.h" #include "triton/codegen/transform/coalesce.h" #include "triton/ir/context.h" diff --git a/lib/codegen/transform/coalesce.cc b/lib/codegen/transform/coalesce.cc index 455f2fb5d..117bd35df 100644 --- a/lib/codegen/transform/coalesce.cc +++ b/lib/codegen/transform/coalesce.cc @@ -7,7 +7,7 @@ #include "triton/ir/instructions.h" #include "triton/ir/module.h" #include "triton/codegen/analysis/layout.h" -#include "triton/codegen/analysis/meminfo.h" +#include "triton/codegen/transform/cts.h" #include "triton/codegen/analysis/align.h" #include "triton/codegen/transform/coalesce.h" @@ -15,7 +15,7 @@ namespace triton { namespace codegen{ namespace transform{ -coalesce::coalesce(analysis::align* align, analysis::layout *layouts, analysis::meminfo *mem) +coalesce::coalesce(analysis::align* align, analysis::layout *layouts, analysis::cts *mem) : align_(align), layout_(layouts), mem_(mem) { } // Find all values that are used as pointer operands in LD/ST @@ -102,9 +102,6 @@ void coalesce::run(ir::module &mod) { r->replace_all_uses_with(cts); cts->replace_uses_of_with(cts, r); } - else{ - - } } } diff --git a/lib/codegen/analysis/meminfo.cc b/lib/codegen/transform/cts.cc similarity index 64% rename from lib/codegen/analysis/meminfo.cc rename to lib/codegen/transform/cts.cc index be55d6ac7..5a7e16a2d 100644 --- a/lib/codegen/analysis/meminfo.cc +++ b/lib/codegen/transform/cts.cc @@ -1,5 +1,7 @@ #include -#include "triton/codegen/analysis/meminfo.h" +#include +#include "triton/codegen/transform/cts.h" +#include "triton/codegen/instructions.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" @@ -12,7 +14,7 @@ namespace codegen{ namespace analysis{ // run pass on module -bool meminfo::is_loop_latch(ir::phi_node *phi, ir::instruction *terminator){ +bool cts::is_loop_latch(ir::phi_node *phi, ir::instruction *terminator){ if(phi->get_parent() != terminator->get_parent()) return false; if(auto *br = dynamic_cast(terminator)) @@ -24,24 +26,6 @@ bool meminfo::is_loop_latch(ir::phi_node *phi, ir::instruction *terminator){ throw std::runtime_error("unreachable"); } -void meminfo::replace(ir::value* before, ir::value *after) { - shared_.erase(before); - shared_.insert(after); - if(refs_.find(before) != refs_.end()){ - ir::value* v = refs_.at(before); - refs_.erase(before); - refs_.insert({after, v}); - } -} - -void meminfo::copy(ir::value* y, ir::value *x) { - if(shared_.find(x) != shared_.end()) - shared_.insert(y); - if(refs_.find(x) != refs_.end()) - refs_[y] = refs_[x]; - if(double_.find(x) != double_.end()) - double_.insert(y); -} inline bool get_is_shared(ir::value* v) { @@ -62,40 +46,46 @@ inline bool get_is_shared(ir::value* v) { return false; } -void add_copy(ir::value *x, ir::builder &builder) { - if(auto phi = dynamic_cast(x)){ +void add_copy(ir::instruction *parent, ir::value *x, ir::builder &builder) { + auto *i = dynamic_cast(x); + // not an instruction + if(!i) { + builder.set_insert_point(parent); + ir::value *cts = builder.create_copy_to_shared(x); + parent->replace_uses_of_with(x, cts); + return; + } + // phi node + if(auto* phi = dynamic_cast(x)) { for(unsigned i = 0; i < phi->get_num_incoming(); ++i) - add_copy(phi->get_incoming_value(i), builder); - } - else { - if(get_is_shared(x)) - 
return; - if(auto *i = dynamic_cast(x)){ - ir::basic_block* block = i->get_parent(); - auto it = std::find(block->begin(), block->end(), i); - builder.set_insert_point(++it); - } - ir::instruction *rx = (ir::instruction*)builder.create_copy_to_shared(x); - x->replace_all_uses_with(rx); - rx->set_operand(0, x); + add_copy(phi, phi->get_incoming_value(i), builder); + return; } + ir::value_id_t id = i->get_id(); + // already in shared memory + if(storage_info.at(id).first == SHARED) + return; + // copy + builder.set_insert_point_after(i); + ir::value *cts = builder.create_copy_to_shared(x); + parent->replace_uses_of_with(x, cts); } -void meminfo::run(ir::module &mod) { +void cts::run(ir::module &mod) { shared_.clear(); refs_.clear(); double_.clear(); // Add shared copies + ir::builder &builder = mod.get_builder(); for(ir::function *fn: mod.get_function_list()){ - ir::builder builder(mod.get_context()); for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i: block->get_inst_list()){ - if(dynamic_cast(i)) - if(i->get_operand(1)->get_type()->get_tile_shapes()[1] != 1){ - add_copy(i->get_operand(0), builder); - add_copy(i->get_operand(1), builder); - } + auto storage = storage_info.at(i->get_id()); + // copy to shared operands when necessary + for(size_t k = 0; k < storage.second.size(); k++) + if(storage.second[k] == SHARED) + add_copy(i, i->get_operand(k), builder); } } @@ -135,15 +125,15 @@ void meminfo::run(ir::module &mod) { } // query double-buffered status -bool meminfo::is_double(ir::value *x) +bool cts::is_double(ir::value *x) { return double_.find(x) != double_.end(); } // query shared status -bool meminfo::is_shared(ir::value *x) +bool cts::is_shared(ir::value *x) { return shared_.find(x) != shared_.end(); } // get reference if any -ir::value *meminfo::get_reference(ir::value *x) +ir::value *cts::get_reference(ir::value *x) { return refs_[x]; } diff --git a/lib/codegen/transform/dce.cc b/lib/codegen/transform/dce.cc index 18406b4ab..4497f2fde 100644 --- a/lib/codegen/transform/dce.cc +++ b/lib/codegen/transform/dce.cc @@ -20,12 +20,22 @@ void dce::run(ir::module &mod) { // iterate through blocks for(ir::basic_block *block: rpo) for(ir::instruction *i: block->get_inst_list()){ - if(dynamic_cast(i) || dynamic_cast(i) - || dynamic_cast(i) || dynamic_cast(i) - || dynamic_cast(i) || dynamic_cast(i) || dynamic_cast(i) - || dynamic_cast(i)){ - work_list.push_back(i); - marked.insert(i); + switch(i->get_id()){ + case ir::INST_RETURN: + case ir::INST_UNCOND_BRANCH: + case ir::INST_COND_BRANCH: + case ir::INST_UNMASKED_STORE: + case ir::INST_MASKED_STORE: + case ir::INST_ATOMIC_ADD: + case ir::INST_ATOMIC_CAS: + case ir::INST_ATOMIC_EXCH: + case ir::INST_BARRIER: { + work_list.push_back(i); + marked.insert(i); + break; + } + default: + break; } } } diff --git a/lib/codegen/transform/membar.cc b/lib/codegen/transform/membar.cc index b8b029d9a..e77e9c71a 100644 --- a/lib/codegen/transform/membar.cc +++ b/lib/codegen/transform/membar.cc @@ -3,8 +3,8 @@ #include #include "triton/codegen/transform/membar.h" -#include "triton/codegen/analysis/memalloc.h" -#include "triton/codegen/analysis/meminfo.h" +#include "triton/codegen/analysis/allocation.h" +#include "triton/codegen/transform/cts.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" diff --git a/lib/codegen/transform/reassociate.cc b/lib/codegen/transform/reassociate.cc index 38e8c79ed..c2b9d2d4b 100644 --- a/lib/codegen/transform/reassociate.cc +++ b/lib/codegen/transform/reassociate.cc 
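
The cts and dce rewrites above are the heart of patch 412: the `value_id_t`/`INST_*` ids threaded through every `ir::instruction` constructor let dce switch on `i->get_id()` instead of chaining `dynamic_cast`s, and cts now drives copy-to-shared insertion purely from the static `storage_info` table — for each instruction, any operand slot the table marks `SHARED` gets a `copy_to_shared` inserted right after the operand's definition (via the new `set_insert_point_after`). A minimal standalone sketch of that table-driven pattern follows; the `_S`-suffixed enum names are illustrative stand-ins, not the real `triton::codegen` definitions, though the two table rows mirror the `INST_BINOP` and `INST_DOT` entries from instructions.h above:

    // Compilable toy mirroring the storage_info dispatch in patch 412.
    #include <cstddef>
    #include <cstdio>
    #include <map>
    #include <utility>
    #include <vector>

    enum storage_t { NONE_S, ANY_S, SHARED_S, DISTRIBUTED_S, REPLICATED_S };
    enum value_id_t { INST_BINOP, INST_DOT };

    // (result storage, per-operand storage), as in inst_storage_info_t
    typedef std::pair<storage_t, std::vector<storage_t> > inst_storage_info_t;
    static const std::map<value_id_t, inst_storage_info_t> storage_info = {
      { INST_BINOP, { DISTRIBUTED_S, { DISTRIBUTED_S, DISTRIBUTED_S } } },
      { INST_DOT,   { DISTRIBUTED_S, { SHARED_S, SHARED_S, DISTRIBUTED_S } } },
    };

    int main() {
      // cts::run walks every instruction; each operand slot the table flags
      // as SHARED gets a copy_to_shared inserted after the operand's definition.
      const inst_storage_info_t &info = storage_info.at(INST_DOT);
      for (std::size_t k = 0; k < info.second.size(); ++k)
        if (info.second[k] == SHARED_S)
          std::printf("operand %zu of dot is copied to shared memory\n", k);
      return 0;
    }
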
@@ -122,7 +122,6 @@ ir::value *reassociate::reassociate_idx(ir::value *old_value, new_value = builder.create_add(rrhs, builder.create_add(lrhs, lhs), name, cst); } } - // extract constant and non-constant if(ir::instruction *bin_add = is_bin_add(new_value)){ ir::value *new_lhs = bin_add->get_operand(0); @@ -136,12 +135,9 @@ ir::value *reassociate::reassociate_idx(ir::value *old_value, noncst = new_lhs; } } - // clean-up if some re-ordering happened - if(old_value != new_value){ + if(old_value != new_value) old_value->replace_all_uses_with(new_value); - } - return new_value; } diff --git a/lib/ir/builder.cc b/lib/ir/builder.cc index 00450b547..db2080a4d 100644 --- a/lib/ir/builder.cc +++ b/lib/ir/builder.cc @@ -27,6 +27,13 @@ void builder::set_insert_point(instruction* i){ } +void builder::set_insert_point_after(instruction* i){ + block_ = i->get_parent(); + auto it = std::find(block_->begin(), block_->end(), i); + set_insert_point(++it); +} + + void builder::set_insert_point(basic_block *block){ block_ = block; insert_point_ = block->end(); diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 04977966d..501d62f54 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -199,14 +199,14 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c llvm::LLVMContext ctx; std::unique_ptr llvm(new llvm::Module(module.get_name(), ctx)); // create passes - codegen::analysis::meminfo shmem_info; + codegen::analysis::cts shmem_info; codegen::analysis::align align; codegen::analysis::liveness shmem_liveness(&shmem_info); codegen::analysis::axes axes; codegen::analysis::layout layouts(&axes); codegen::transform::coalesce coalesce(&align, &layouts, &shmem_info); codegen::analysis::tiles tiles(opt.num_warps, &align, &axes, &layouts); - codegen::analysis::memalloc shmem_allocation(&shmem_liveness, &shmem_info, &tiles); + codegen::analysis::allocation shmem_allocation(&shmem_liveness, &shmem_info, &tiles); codegen::transform::membar shmem_barriers(&shmem_allocation, &shmem_info); codegen::transform::dce dce; codegen::transform::peephole peephole; @@ -229,6 +229,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c peephole.run(module); shmem_info.run(module); shmem_liveness.run(module); + ir::print(module, std::cout); shmem_allocation.run(); if(shmem_allocation.allocated_size() > context->device()->max_shared_memory()) return std::unique_ptr(); From 001973630e712f125c9554b559bfdde7f4e517d6 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 21 Sep 2019 22:21:40 -0400 Subject: [PATCH 413/494] [codegen] cleaned up shared memory and double-buffering logic --- include/triton/codegen/analysis/allocation.h | 8 +- include/triton/codegen/analysis/liveness.h | 13 ++- include/triton/codegen/selection.h | 11 +-- include/triton/codegen/transform/coalesce.h | 3 +- include/triton/codegen/transform/cts.h | 11 --- include/triton/codegen/transform/membar.h | 9 ++- lib/codegen/analysis/allocation.cc | 23 +++--- lib/codegen/analysis/liveness.cc | 71 +++++++++++++++-- lib/codegen/selection.cc | 65 ++++++++------- lib/codegen/transform/coalesce.cc | 4 +- lib/codegen/transform/cts.cc | 83 -------------------- lib/codegen/transform/membar.cc | 31 ++++++-- lib/driver/module.cc | 2 +- lib/runtime/function.cc | 20 ++--- tests/bench/dot.cc | 6 +- 15 files changed, 173 insertions(+), 187 deletions(-) diff --git a/include/triton/codegen/analysis/allocation.h b/include/triton/codegen/analysis/allocation.h index a43e93031..3dfede223 100644 --- 
a/include/triton/codegen/analysis/allocation.h +++ b/include/triton/codegen/analysis/allocation.h @@ -10,6 +10,7 @@ namespace triton{ namespace ir{ class value; class function; + class module; } namespace codegen{ @@ -22,8 +23,8 @@ class cts; class allocation { public: - allocation(liveness *live, cts *buffer_info, tiles *params) - : liveness_(live), buffer_info_(buffer_info), tiles_(params){ } + allocation(liveness *live, tiles *params) + : liveness_(live), tiles_(params){ } // utilities unsigned num_bytes(ir::value *x); unsigned is_ld_padded(ir::value* x); @@ -31,7 +32,7 @@ public: unsigned offset(ir::value *x) const { return offsets_.at(x); } unsigned allocated_size() const { return allocated_size_; } // run - void run(); + void run(ir::module& mod); private: std::map offsets_; @@ -39,7 +40,6 @@ private: size_t allocated_size_; // dependences liveness *liveness_; - cts *buffer_info_; tiles *tiles_; }; diff --git a/include/triton/codegen/analysis/liveness.h b/include/triton/codegen/analysis/liveness.h index df951161c..52ea33cca 100644 --- a/include/triton/codegen/analysis/liveness.h +++ b/include/triton/codegen/analysis/liveness.h @@ -7,6 +7,7 @@ namespace triton{ namespace ir{ class value; + class phi_node; class function; class module; } @@ -31,6 +32,11 @@ struct segment { } }; +struct double_buffer_info_t { + ir::value* latch; + ir::phi_node* phi; +}; + class liveness { private: typedef std::map indices_map_t; @@ -43,19 +49,20 @@ public: using const_iterator = intervals_map_t::const_iterator; public: - // constructor - liveness(cts *info): info_(info){ } // accessors const intervals_map_t& intervals() const { return intervals_; } segment get_interval(ir::value* v) const { return intervals_.at(v); } + // double-buffering + bool has_double(ir::value *x) const { return double_.find(x) != double_.end(); } + double_buffer_info_t get_double(ir::value *x) const { return double_.at(x); } // run void run(ir::module &mod); private: - cts *info_; has_storage_map_t has_dedicated_storage_; indices_map_t indices_; intervals_map_t intervals_; + std::map double_; }; } diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index 21bd83ee1..b68746c76 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -43,6 +43,7 @@ namespace triton{ namespace codegen{ namespace analysis{ +class liveness; class tiles; class align; class allocation; @@ -201,10 +202,10 @@ private: public: - selection(analysis::allocation *alloc, analysis::tiles *tiles, analysis::cts *buffer_info, - analysis::align *alignment, analysis::axes *axes, analysis::layout *layouts, - transform::coalesce* reorder, target *tgt, unsigned num_warps) - : alloc_(alloc), tiles_(tiles), buffer_info_(buffer_info), + selection(analysis::liveness* liveness, analysis::allocation *alloc, analysis::tiles *tiles, + analysis::align *alignment, analysis::axes *axes, + analysis::layout *layouts, transform::coalesce* reorder, target *tgt, unsigned num_warps) + : liveness_(liveness), alloc_(alloc), tiles_(tiles), alignment_(alignment), a_axes_(axes), layouts_(layouts), reorder_(reorder), tgt_(tgt), num_warps_(num_warps){ } @@ -213,11 +214,11 @@ public: private: vmap_t vmap_; tmap_t tmap_; + analysis::liveness *liveness_; analysis::allocation *alloc_; analysis::tiles *tiles_; analysis::axes *a_axes_; analysis::layout *layouts_; - analysis::cts *buffer_info_; analysis::align *alignment_; transform::coalesce *reorder_; target *tgt_; diff --git a/include/triton/codegen/transform/coalesce.h 
b/include/triton/codegen/transform/coalesce.h index 680f1ccb2..e0ea0ea97 100644 --- a/include/triton/codegen/transform/coalesce.h +++ b/include/triton/codegen/transform/coalesce.h @@ -32,13 +32,12 @@ private: ir::value* rematerialize(ir::value *v, ir::builder& builder, std::map& seen); public: - coalesce(analysis::align* align, triton::codegen::analysis::layout *layouts, analysis::cts* mem); + coalesce(analysis::align* align, triton::codegen::analysis::layout *layouts); void run(ir::module &mod); private: analysis::align* align_; analysis::layout* layout_; - analysis::cts* mem_; }; } diff --git a/include/triton/codegen/transform/cts.h b/include/triton/codegen/transform/cts.h index 7b7237f7e..e670a4223 100644 --- a/include/triton/codegen/transform/cts.h +++ b/include/triton/codegen/transform/cts.h @@ -19,17 +19,6 @@ namespace analysis{ class cts { public: void run(ir::module &mod); - // queries - bool is_double(ir::value *x); - void add_shared(ir::value *v); - bool is_shared(ir::value *x); - bool is_loop_latch(ir::phi_node *phi, ir::instruction *terminator); - ir::value *get_reference(ir::value *x); - -private: - std::set shared_; - std::set double_; - std::map refs_; }; } diff --git a/include/triton/codegen/transform/membar.h b/include/triton/codegen/transform/membar.h index b4aebc2ce..a737d0e49 100644 --- a/include/triton/codegen/transform/membar.h +++ b/include/triton/codegen/transform/membar.h @@ -16,6 +16,7 @@ namespace codegen{ namespace analysis{ class allocation; +class liveness; class cts; } @@ -35,15 +36,17 @@ private: void add_reference(ir::value *v, interval_vec_t &res); void get_read_intervals(ir::instruction *i, interval_vec_t &res); void get_written_intervals(ir::instruction *i, interval_vec_t &res); - std::pair transfer(ir::basic_block *block, const interval_vec_t &written_to, const interval_vec_t &read_from, std::set &insert_loc); + std::pair transfer(ir::basic_block *block, const interval_vec_t &written_to, const interval_vec_t &read_from, + std::set &insert_loc, std::set &safe_war); public: - membar(analysis::allocation *alloc, analysis::cts *buffer_info): alloc_(alloc), buffer_info_(buffer_info) {} + membar(analysis::liveness *liveness, analysis::allocation *alloc): + liveness_(liveness), alloc_(alloc) {} void run(ir::module &mod); private: + analysis::liveness *liveness_; analysis::allocation *alloc_; - analysis::cts *buffer_info_; }; diff --git a/lib/codegen/analysis/allocation.cc b/lib/codegen/analysis/allocation.cc index b05b55a4d..0ad884fbc 100644 --- a/lib/codegen/analysis/allocation.cc +++ b/lib/codegen/analysis/allocation.cc @@ -1,4 +1,5 @@ #include +#include #include "triton/codegen/analysis/allocation.h" #include "triton/codegen/analysis/liveness.h" #include "triton/codegen/transform/cts.h" @@ -8,6 +9,7 @@ #include "triton/ir/value.h" #include "triton/ir/function.h" #include "triton/ir/instructions.h" +#include "triton/ir/utils.h" namespace triton{ namespace codegen{ @@ -68,13 +70,12 @@ unsigned allocation::num_bytes(ir::value *x) { unsigned ld = x->get_type()->get_tile_shapes()[0]; num_bytes += pad * num_bytes / ld; } - if(buffer_info_->is_double(x)) + if(liveness_->has_double(x)) num_bytes *= 2; return num_bytes; } - -void allocation::run() { +void allocation::run(ir::module &mod) { using std::max; using std::min; typedef std::multimap triples_map_type; @@ -85,7 +86,7 @@ void allocation::run() { std::vector J = I; triples_map_type H; - H.insert({0, segment{0, 1024}}); + H.insert({0, segment{0, INT_MAX}}); std::vector V; std::map starts; @@ -115,7 +116,6 @@ 
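
// For reference, the scheme implemented by allocation::run() below: each
// shared buffer has a live interval [start, end) in instruction order, two
// buffers interfere when their intervals overlap, and offsets are assigned
// by first-fit coloring of that interference graph (the diff's own comment
// calls it "First-fit graph coloring"). A minimal standalone sketch -- the
// names and the greedy placement loop are illustrative, not the pass itself:
#include <map>
#include <vector>

struct segment {
  unsigned start, end;   // live range, mirroring liveness.h
  bool intersect(const segment& s) const { return start < s.end && s.start < end; }
};

std::map<size_t, unsigned> first_fit(const std::vector<segment>& live,
                                     const std::vector<unsigned>& size) {
  std::map<size_t, unsigned> offset;
  for(size_t i = 0; i < live.size(); i++){
    unsigned off = 0;
    for(bool again = true; again; ){   // bump past every interfering placement
      again = false;
      for(size_t j = 0; j < i; j++)
        if(live[i].intersect(live[j]) &&
           off < offset[j] + size[j] && offset[j] < off + size[i]){
          off = offset[j] + size[j];
          again = true;
        }
    }
    offset[i] = off;
  }
  return offset;
}
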
void allocation::run() { } } - // Build interference graph std::map> interferences; for(ir::value *x: V) @@ -137,6 +137,7 @@ void allocation::run() { for(ir::value *X: V) colors[X] = (X==V[0])?0:-1; + // First-fit graph coloring std::vector available(V.size()); for(ir::value *x: V){ @@ -158,18 +159,12 @@ void allocation::run() { for(ir::value *y: interferences[x]) Adj = std::max(Adj, starts[y] + num_bytes(y)); offsets_[x] = starts[x] + colors[x] * Adj; -// std::cout << x->get_name() << " " << offsets_[x] << " " << num_bytes(x) << std::endl; - if(buffer_info_->is_double(x)){ - ir::phi_node *phi = (ir::phi_node*)x; - for(unsigned i = 0; i < phi->get_num_incoming(); i++){ - ir::value *inc_val = phi->get_incoming_value(i); - offsets_[inc_val] = offsets_[phi]; - } + if(liveness_->has_double(x)){ + auto info = liveness_->get_double(x); + offsets_[info.latch] = offsets_[x] + num_bytes(x) / 2; } } -// exit(EXIT_FAILURE); - // Save maximum size of induced memory space allocated_size_ = 0; for(auto &x: offsets_){ diff --git a/lib/codegen/analysis/liveness.cc b/lib/codegen/analysis/liveness.cc index 05d29032b..f6df78b72 100644 --- a/lib/codegen/analysis/liveness.cc +++ b/lib/codegen/analysis/liveness.cc @@ -7,13 +7,59 @@ #include "triton/ir/module.h" #include "triton/ir/instructions.h" #include "triton/ir/value.h" +#include "triton/ir/utils.h" namespace triton{ namespace codegen{ namespace analysis{ +inline bool is_loop_latch(ir::phi_node *phi, ir::instruction *terminator){ + if(phi->get_parent() != terminator->get_parent()) + return false; + if(auto *br = dynamic_cast(terminator)) + return br->get_true_dest() == phi->get_parent() + || br->get_false_dest() == phi->get_parent(); + else if(dynamic_cast(terminator)) + return false; + else + throw std::runtime_error("unreachable"); +} + +inline void extract_double_bufferable(ir::instruction *i, std::map& result) { + auto* phi = dynamic_cast(i); + if(!phi || phi->get_num_incoming() != 2) + return; + ir::basic_block *block_0 = phi->get_incoming_block(0); + ir::basic_block *block_1 = phi->get_incoming_block(1); + ir::instruction *terminator_0 = block_0->get_inst_list().back(); + ir::instruction *terminator_1 = block_1->get_inst_list().back(); + bool is_latch_0 = is_loop_latch(phi, terminator_0); + bool is_latch_1 = is_loop_latch(phi, terminator_1); + ir::value *value_0 = phi->get_incoming_value(0); + ir::value *value_1 = phi->get_incoming_value(1); + ir::instruction *i_0 = dynamic_cast(value_0); + ir::instruction *i_1 = dynamic_cast(value_1); + if(!i_0 || !i_1 || storage_info.at(i_0->get_id()).first != SHARED || storage_info.at(i_1->get_id()).first != SHARED) + return; + if(is_latch_1) + result[value_0] = double_buffer_info_t{value_1, phi}; + if(is_latch_0) + result[value_1] = double_buffer_info_t{value_0, phi}; +} + + // Entry point void liveness::run(ir::module &mod) { + double_.clear(); + indices_.clear(); + intervals_.clear(); + + // set of pair of values that can be double-buffered + ir::for_each_instruction(mod, [this](ir::instruction* i) { + extract_double_bufferable(i, this->double_); + }); + + for(ir::function *fn: mod.get_function_list()){ // Assigns index to each instruction slot_index index = 0; @@ -26,12 +72,10 @@ void liveness::run(ir::module &mod) { // Creates live intervals for(auto i: indices_){ ir::value *v = i.first; -// ir::instruction* instr = dynamic_cast(v); -// if(!instr) -// continue; -// if(storage_info.at(instr->get_id()).first != SHARED) -// continue; - if(!info_->is_shared(v) || info_->get_reference(v)) + ir::instruction* instr 
= dynamic_cast(v); + if(!instr) + continue; + if(storage_info.at(instr->get_id()).first != SHARED) continue; unsigned start = i.second; unsigned end = start; @@ -41,6 +85,21 @@ void liveness::run(ir::module &mod) { } intervals_[v] = segment{start, end}; } + // Double-Buffering + // Arrays are live throughout the end of the loop + auto it = intervals_.begin(); + while(it != intervals_.end()) { + ir::value *x = it->first; + auto dit = double_.find(x); + if(dit != double_.end()) { + ir::value *y = dit->second.latch; + unsigned start = intervals_[x].start; + unsigned end = intervals_[y].end; + intervals_[x] = segment{start, end}; + intervals_.erase(y); + } + it++; + } } } diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index 169283e7f..3313ef0b4 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -1,12 +1,14 @@ #include #include "triton/codegen/selection.h" #include "triton/codegen/target.h" +#include "triton/codegen/analysis/liveness.h" #include "triton/codegen/analysis/layout.h" #include "triton/codegen/analysis/axes.h" #include "triton/codegen/analysis/tiles.h" #include "triton/codegen/analysis/allocation.h" #include "triton/codegen/analysis/align.h" #include "triton/codegen/transform/coalesce.h" +#include "triton/codegen/instructions.h" #include "triton/ir/context.h" #include "triton/ir/module.h" #include "triton/ir/function.h" @@ -746,42 +748,31 @@ void selection::create_shared_tile(ir::value *v, IRBuilder<> &builder, Value *sh Type* ty = llvm_type(v->get_type()->get_scalar_ty(), builder.getContext()); // shared copy PointerType *ptr_ty = ty->getPointerTo(sh_mem_ptr->getType()->getPointerAddressSpace()); - // phi-node (double-buffering) - if(auto *phi = dynamic_cast(v)) { + // double-buffered + if(liveness_->has_double(v)) { + auto info = liveness_->get_double(v); + ir::phi_node *phi = info.phi; BasicBlock *parent = (BasicBlock*)vmap_[phi->get_parent()]; - unsigned id_pre = 0, id_loop = 1; - if(phi->get_incoming_block(0) == phi->get_parent()) - std::swap(id_pre, id_loop); if(parent->empty()) builder.SetInsertPoint(parent); else builder.SetInsertPoint(&*parent->getFirstInsertionPt()); + // create double-buffered pointer PHINode *ptr = builder.CreatePHI(ptr_ty, 2); PHINode *offset = builder.CreatePHI(builder.getInt32Ty(), 2); // next pointer - Value *pre_ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(alloc_->offset(phi))); + Value *pre_ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(alloc_->offset(v))); pre_ptr = builder.CreateBitCast(pre_ptr, ptr->getType()); Value *next_ptr = builder.CreateGEP(ptr, offset, "next_ptr"); tmap_.insert({phi, new shared_tile(ty, shapes, ptr, builder, offset)}); - for(unsigned i = 0; i < phi->get_num_incoming(); i++) { - ir::basic_block* inc_block = phi->get_incoming_block(i); - ir::value* inc_value = phi->get_incoming_value(i); - ir::instruction* terminator = inc_block->get_inst_list().back(); - bool is_loop_latch = buffer_info_->is_loop_latch(phi, terminator); - tmap_.insert({inc_value, new shared_tile(ty, shapes, is_loop_latch?next_ptr:pre_ptr, builder)}); - } + tmap_.insert({v, new shared_tile(ty, shapes, pre_ptr, builder)}); + tmap_.insert({info.latch, new shared_tile(ty, shapes, next_ptr, builder)}); } else { - bool has_phi_user = false; - for(ir::user *usr: v->get_users()) - if(dynamic_cast(usr)) - has_phi_user = true; - if(!has_phi_user){ - size_t offset = alloc_->offset(v); - Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset)); - ptr = builder.CreateBitCast(ptr, ptr_ty); - 
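
// For intuition, a scalar model of the double-buffering that liveness and
// selection implement above: the non-latch (pre-loop) copy owns the first
// half of the buffer, the loop-latch copy the second half, and a negated
// offset PHI makes the "next" pointer ping-pong between the halves. Names
// and element-sized offsets below are illustrative only:
void ping_pong(float* smem, int half, int n_iter) {
  float* base   = smem;   // pointer PHI: pre-loop half on entry
  int    offset = half;   // offset PHI: +half on entry
  for(int it = 0; it < n_iter; ++it) {
    float* next = base + offset;  // tile being filled this iteration
    // ... write tile it+1 into `next` while reading tile it from `base` ...
    offset = -offset;             // latch incoming value: flip direction
    base   = next;                // latch incoming value: swap buffers
  }
}
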
tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)}); - } + size_t offset = alloc_->offset(v); + Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset)); + ptr = builder.CreateBitCast(ptr, ptr_ty); + tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)}); } } @@ -827,8 +818,9 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, if(auto *user = dynamic_cast(v)) for(ir::value *op: user->ops()) create_tile(op, builder, seen, sh_mem_ptr); - if(buffer_info_->is_shared(v) && !dynamic_cast(v)) - create_shared_tile(v, builder, sh_mem_ptr); + auto *i = dynamic_cast(v); + if(i && storage_info.at(i->get_id()).first == SHARED && !dynamic_cast(v)) + create_shared_tile(i, builder, sh_mem_ptr); else create_distributed_tile(v, builder); } @@ -1427,7 +1419,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & lower_masked_load(x, ctx, fn, builder); else if(auto *x = dynamic_cast(ins)) lower_load(x, ctx, fn, builder); - else if(!buffer_info_->is_shared(ins)) + else if(!dynamic_cast(tmap_.at(ins))) lower_elementwise(ins, ctx, fn, builder); } @@ -1556,21 +1548,19 @@ void selection::run(ir::module &src, Module &dst) { } } - // add phi operands for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *inst: block->get_inst_list()) - if(auto *phi = dynamic_cast(inst)){ - if(buffer_info_->is_double(phi)) { + for(ir::instruction *inst: block->get_inst_list()) { + if(liveness_->has_double(inst)) { + auto info = liveness_->get_double(inst); + ir::phi_node *phi = info.phi; PHINode *ptr = (PHINode*)((shared_tile*)tmap_.at(phi))->get_pointer(); PHINode *offset = (PHINode*)((shared_tile*)tmap_.at(phi))->get_offset(); for(unsigned n = 0; n < phi->get_num_incoming(); n++){ ir::basic_block* inc_block = phi->get_incoming_block(n); ir::value* inc_val = phi->get_incoming_value(n); - ir::instruction* terminator = inc_block->get_inst_list().back(); BasicBlock *llvm_inc_block = last_block.at(inc_block); shared_tile *inc_shared = (shared_tile*)tmap_.at(inc_val); - bool is_loop_latch = buffer_info_->is_loop_latch(phi, terminator); - if(is_loop_latch){ + if(inc_val == info.latch){ dst_builder.SetInsertPoint(llvm_inc_block->getTerminator()); Value *next_offset = dst_builder.CreateNeg(offset); offset->addIncoming(next_offset, llvm_inc_block); @@ -1582,7 +1572,14 @@ void selection::run(ir::module &src, Module &dst) { ptr->addIncoming(inc_shared->get_pointer(), llvm_inc_block); } } - else { + } + + // add phi operands + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *inst: block->get_inst_list()) + if(auto *phi = dynamic_cast(inst)){ + if(tmap_.find(phi) == tmap_.end() || + !dynamic_cast(tmap_.at(phi))) { for(unsigned n = 0; n < phi->get_num_incoming(); n++){ ir::value *inc_val = phi->get_incoming_value(n); ir::basic_block *inc_block = phi->get_incoming_block(n); diff --git a/lib/codegen/transform/coalesce.cc b/lib/codegen/transform/coalesce.cc index 117bd35df..d349e5b11 100644 --- a/lib/codegen/transform/coalesce.cc +++ b/lib/codegen/transform/coalesce.cc @@ -15,8 +15,8 @@ namespace triton { namespace codegen{ namespace transform{ -coalesce::coalesce(analysis::align* align, analysis::layout *layouts, analysis::cts *mem) - : align_(align), layout_(layouts), mem_(mem) { } +coalesce::coalesce(analysis::align* align, analysis::layout *layouts) + : align_(align), layout_(layouts) { } // Find all values that are used as pointer operands in LD/ST void coalesce::extract_io_use(ir::value *v, std::set& result) { diff --git 
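
// Background for the slimmed-down cts pass below: its remaining job is the
// rewrite performed by add_copy() -- for each operand that storage_info says
// must reside in shared memory, materialize an explicit copy in front of the
// consumer. Schematically (illustrative IR spelling, not the pass's exact one):
//
//   %a = load %pa               %a  = load %pa
//   %c = dot %a, %b, %c0  =>    %as = copy_to_shared %a
//                               %c  = dot %as, %b, %c0
//
// With the copies explicit, liveness/allocation/membar can reason about
// shared buffers uniformly instead of querying a side set of flags.
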
a/lib/codegen/transform/cts.cc b/lib/codegen/transform/cts.cc index 5a7e16a2d..c4660a2a4 100644 --- a/lib/codegen/transform/cts.cc +++ b/lib/codegen/transform/cts.cc @@ -14,38 +14,6 @@ namespace codegen{ namespace analysis{ // run pass on module -bool cts::is_loop_latch(ir::phi_node *phi, ir::instruction *terminator){ - if(phi->get_parent() != terminator->get_parent()) - return false; - if(auto *br = dynamic_cast(terminator)) - return br->get_true_dest() == phi->get_parent() - || br->get_false_dest() == phi->get_parent(); - else if(dynamic_cast(terminator)) - return false; - else - throw std::runtime_error("unreachable"); -} - - - -inline bool get_is_shared(ir::value* v) { - if(dynamic_cast(v)) - return true; - if(dynamic_cast(v)) - return true; - if(dynamic_cast(v)) - return true; - if(dynamic_cast(v)) - return true; - if(auto *x = dynamic_cast(v)){ - bool res = true; - for(unsigned inc = 0; inc < x->get_num_incoming(); inc++) - res = res && get_is_shared(x->get_incoming_value(inc)); - return res; - } - return false; -} - void add_copy(ir::instruction *parent, ir::value *x, ir::builder &builder) { auto *i = dynamic_cast(x); // not an instruction @@ -72,10 +40,6 @@ void add_copy(ir::instruction *parent, ir::value *x, ir::builder &builder) { } void cts::run(ir::module &mod) { - shared_.clear(); - refs_.clear(); - double_.clear(); - // Add shared copies ir::builder &builder = mod.get_builder(); for(ir::function *fn: mod.get_function_list()){ @@ -88,55 +52,8 @@ void cts::run(ir::module &mod) { add_copy(i, i->get_operand(k), builder); } } - - // Find which buffers are shared - for(ir::function *fn: mod.get_function_list()) - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *i: block->get_inst_list()) - if(get_is_shared(i)) - shared_.insert(i); - - // double-buffering - for(ir::function *fn: mod.get_function_list()) - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *i: block->get_inst_list()) { - if(!i->get_type()->is_tile_ty()) - continue; - // handle phi - if(auto *phi = dynamic_cast(i)) - if(is_shared(phi)){ - // determine if the value is in shared memory - bool is_double = false; - for(unsigned n = 0; n < phi->get_num_incoming(); n++){ - ir::basic_block *inc_block = phi->get_incoming_block(n); - ir::instruction *terminator = inc_block->get_inst_list().back(); - is_double = is_double || is_loop_latch(phi, terminator); - } - // add to double-buffered - if(is_double) - double_.insert(phi); - // set references of input - for(unsigned n = 0; n < phi->get_num_incoming(); n++){ - ir::value *inc_val = phi->get_incoming_value(n); - refs_[inc_val] = phi; - } - } - } } -// query double-buffered status -bool cts::is_double(ir::value *x) -{ return double_.find(x) != double_.end(); } - -// query shared status -bool cts::is_shared(ir::value *x) -{ return shared_.find(x) != shared_.end(); } - -// get reference if any -ir::value *cts::get_reference(ir::value *x) -{ return refs_[x]; } - - } } diff --git a/lib/codegen/transform/membar.cc b/lib/codegen/transform/membar.cc index e77e9c71a..6ec14bc09 100644 --- a/lib/codegen/transform/membar.cc +++ b/lib/codegen/transform/membar.cc @@ -2,8 +2,10 @@ #include #include -#include "triton/codegen/transform/membar.h" +#include "triton/codegen/analysis/liveness.h" #include "triton/codegen/analysis/allocation.h" +#include "triton/codegen/instructions.h" +#include "triton/codegen/transform/membar.h" #include "triton/codegen/transform/cts.h" #include "triton/ir/module.h" #include "triton/ir/function.h" @@ -31,7 +33,10 @@ bool 
membar::intersect(const interval_vec_t &X, const interval_vec_t &Y) { } void membar::add_reference(ir::value *v, interval_vec_t &res){ - if(buffer_info_->is_shared(v) && !dynamic_cast(v)){ + auto *i = dynamic_cast(v); + if(!i) + return; + if(storage_info.at(i->get_id()).first == SHARED){ unsigned offset = alloc_->offset(v); unsigned num_bytes = alloc_->num_bytes(v); res.push_back(interval_t(offset, offset + num_bytes)); @@ -79,10 +84,12 @@ std::pair membar::transfer(ir::basic_block *block, const interval_vec_t &written_to, const interval_vec_t &read_from, - std::set& insert_loc) { + std::set& insert_loc, + std::set& safe_war) { ir::basic_block::inst_list_t instructions = block->get_inst_list(); interval_vec_t new_written_to = written_to; interval_vec_t new_read_from = read_from; + for(ir::instruction *i: instructions){ interval_vec_t read, written; get_read_intervals(i, read); @@ -90,9 +97,9 @@ std::pairis_shared(i) && - buffer_info_->is_double(buffer_info_->get_reference(i))) + if(safe_war.find(i) != safe_war.end()) write_after_read = false; + // record hazards if(read_after_write || write_after_read) { insert_loc.insert(i); new_written_to.clear(); @@ -106,6 +113,18 @@ std::pair safe_war; + ir::for_each_instruction(mod, [&](ir::instruction* i){ + if(liveness_->has_double(i)){ + auto info = liveness_->get_double(i); + safe_war.insert(i); + safe_war.insert(info.latch); + } + }); + for(ir::function *fn: mod.get_function_list()){ std::vector rpo = ir::cfg::reverse_post_order(fn); std::map written_to; @@ -125,7 +144,7 @@ void membar::run(ir::module &mod) { for(ir::basic_block* pred: block->get_predecessors()) pred_read_from.push_back(read_from[pred]); // apply transfer function - auto result = transfer(block, join(pred_written_to), join(pred_read_from), insert_locs); + auto result = transfer(block, join(pred_written_to), join(pred_read_from), insert_locs, safe_war); written_to[block] = result.first; read_from[block] = result.second; } diff --git a/lib/driver/module.cc b/lib/driver/module.cc index 66c775ac6..85877f911 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -241,7 +241,7 @@ std::string cu_module::compile_llvm_module(std::unique_ptr module, cu_module::cu_module(driver::context * context, std::unique_ptr ll_module): cu_module(context, compile_llvm_module(std::move(ll_module), context->device())) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ -// std::cout << source << std::endl; + std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 501d62f54..f01693a95 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -199,24 +199,24 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c llvm::LLVMContext ctx; std::unique_ptr llvm(new llvm::Module(module.get_name(), ctx)); // create passes - codegen::analysis::cts shmem_info; + codegen::analysis::cts cts; codegen::analysis::align align; - codegen::analysis::liveness shmem_liveness(&shmem_info); + codegen::analysis::liveness shmem_liveness; codegen::analysis::axes axes; codegen::analysis::layout layouts(&axes); - codegen::transform::coalesce coalesce(&align, &layouts, &shmem_info); + codegen::transform::coalesce coalesce(&align, &layouts); codegen::analysis::tiles tiles(opt.num_warps, &align, &axes, &layouts); - 
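
// Context for the membar changes here: the pass walks blocks in reverse
// post-order carrying two sets of shared-memory byte intervals -- what has
// been written and what has been read since the last barrier. An instruction
// forces a barrier on a read-after-write or write-after-read overlap;
// double-buffered values are collected into `safe_war` beforehand, since
// their reads and writes land in different halves of the buffer. A minimal
// sketch of the overlap test (interval_t mirrors membar's offset ranges):
#include <utility>
#include <vector>
using interval_t = std::pair<unsigned, unsigned>;   // [begin, end) in bytes

bool overlaps(const std::vector<interval_t>& X, const std::vector<interval_t>& Y){
  for(const interval_t& x: X)
    for(const interval_t& y: Y)
      if(x.first < y.second && y.first < x.second)  // standard interval test
        return true;
  return false;
}
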
codegen::analysis::allocation shmem_allocation(&shmem_liveness, &shmem_info, &tiles); - codegen::transform::membar shmem_barriers(&shmem_allocation, &shmem_info); + codegen::analysis::allocation shmem_allocation(&shmem_liveness, &tiles); + codegen::transform::membar shmem_barriers(&shmem_liveness, &shmem_allocation); codegen::transform::dce dce; codegen::transform::peephole peephole; codegen::transform::reassociate reassociate(&align); - codegen::selection selection(&shmem_allocation, &tiles, &shmem_info, &align, &axes, &layouts, &coalesce, target.get(), opt.num_warps); + codegen::selection selection(&shmem_liveness, &shmem_allocation, &tiles, &align, &axes, &layouts, &coalesce, target.get(), opt.num_warps); // run passes peephole.run(module); dce.run(module); align.run(module); - shmem_info.run(module); + cts.run(module); axes.run(module); layouts.run(module); coalesce.run(module); @@ -227,10 +227,10 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c reassociate.run(module); dce.run(module); peephole.run(module); - shmem_info.run(module); + dce.run(module); + cts.run(module); shmem_liveness.run(module); - ir::print(module, std::cout); - shmem_allocation.run(); + shmem_allocation.run(module); if(shmem_allocation.allocated_size() > context->device()->max_shared_memory()) return std::unique_ptr(); shmem_barriers.run(module); diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index 7f2366ecc..9a0cd9ca7 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -45,10 +45,10 @@ std::vector do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, i opt.defines.push_back({"TYPE", {ty}}); opt.defines.push_back({"AT", {AT?"1":"0"}}); opt.defines.push_back({"BT", {BT?"1":"0"}}); - opt.defines.push_back({"TM", {"64", "128"}}); - opt.defines.push_back({"TN", {"64", "128"}}); + opt.defines.push_back({"TM", {"128"}}); + opt.defines.push_back({"TN", {"128"}}); opt.defines.push_back({"TK", {"8"}}); - opt.num_warps = {2, 4, 8}; + opt.num_warps = {8}; // create function rt::function function(src::dot, opt); // benchmark available libraries From 856e7baa0462d930decfd04dd969c931f2392eaa Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 23 Sep 2019 12:07:24 -0400 Subject: [PATCH 414/494] [test] added tests for copy --- include/triton/codegen/selection.h | 4 +- include/triton/codegen/transform/cts.h | 2 +- lib/codegen/selection.cc | 169 +++++++++++-------------- lib/codegen/transform/cts.cc | 3 +- lib/runtime/function.cc | 2 +- tests/bench/copy2d.cc | 70 +++------- tests/common/copy.h | 142 +++++++++++++++++++++ tests/common/src/copy.h | 61 +++++++-- tests/common/util.h | 44 ++++++- tests/unit/CMakeLists.txt | 2 +- tests/unit/copy1d.cc | 30 +++++ tests/unit/copy2d.cc | 46 +++++++ tests/unit/copy3d.cc | 38 ++++++ tests/unit/dot.cc | 6 +- 14 files changed, 449 insertions(+), 170 deletions(-) create mode 100644 tests/common/copy.h create mode 100644 tests/unit/copy1d.cc create mode 100644 tests/unit/copy2d.cc create mode 100644 tests/unit/copy3d.cc diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index b68746c76..df34f2987 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -156,6 +156,8 @@ private: Constant* llvm_constant(ir::constant *cst, LLVMContext &ctx); Value* llvm_alloc_const(ir::alloc_const *v, Module *module, Builder &builder); ArrayType* llvm_linearized_tile_type(ir::type *ty, LLVMContext &ctx); + Function* llvm_fn(ir::function *fn, Builder& builder, Module &dst); + Value* alloc_shared(Builder 
&builder, Module& dst); // grid construction void create_grids(std::vector &grids, @@ -167,7 +169,7 @@ private: void init_strided_scan_axes(ir::value *i, Builder &builder, Value *u_thread_id, Value *u_warp_id); void init_hmma_axes(ir::value *i, Builder &builder, Value *u_thread_id, Value *u_warp_id); void init_axes(ir::value *i, Builder &builder, Value *u_thread_id, Value *u_warp_id); - void init_grids(ir::function *fn, Builder &builder, Value *sh_mem_ptr); + void init_layouts(ir::function *fn, Builder &builder, Value *sh_mem_ptr); // lower scalar instruction void lower_instruction(ir::instruction *src, Builder &builder); diff --git a/include/triton/codegen/transform/cts.h b/include/triton/codegen/transform/cts.h index e670a4223..b4289305b 100644 --- a/include/triton/codegen/transform/cts.h +++ b/include/triton/codegen/transform/cts.h @@ -14,7 +14,7 @@ namespace ir { } namespace codegen{ -namespace analysis{ +namespace transform{ class cts { public: diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index 3313ef0b4..7d72daefa 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -573,51 +573,31 @@ inline int32_t ceil(int32_t num, int32_t div){ return (num + div - 1)/div; } -inline void to_warps(const std::vector &bs, const std::vector& order, std::vector &nw, std::vector &ws){ - static const size_t warp_size = 32; - size_t nthreads = 1, nwarps = 1; - nw.resize(bs.size()); - ws.resize(bs.size()); - for(size_t i = 0; i < bs.size(); ++i){ - nthreads *= bs[i]; - nw[order[i]] = ceil(nthreads, nwarps*warp_size); - nwarps *= nw[order[i]]; - } - for(size_t i = 0; i < bs.size(); ++i){ - ws[i] = bs[i] / nw[i]; - } -} - void selection::init_strided_scan_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { auto order = tiles_->order(v); const auto& shapes = v->get_type()->get_tile_shapes(); size_t dim = shapes.size(); std::vector contiguous(dim); std::vector block_size(dim); - std::vector warp_size(dim); - std::vector n_warps(dim); for(unsigned i = 0; i < shapes.size(); i++){ contiguous[i] = tiles_->nts(v, i); block_size[i] = tiles_->mts(v, i); } - to_warps(block_size, order, n_warps, warp_size); - std::vector thread_id_in_warp = delinearize(u_thread_id, order, warp_size, builder); - std::vector warp_id = delinearize(u_warp_id, order, n_warps, builder); + Value* full_thread_id = builder.CreateAdd(builder.CreateMul(u_warp_id, builder.getInt32(32)), u_thread_id); + std::vector thread_id = delinearize(full_thread_id, order, block_size, builder); // Create axes for(unsigned k = 0; k < dim; k++) { std::string str_k = std::to_string(k); - Value *warp_size_k = builder.getInt32(warp_size[k]); Value *contiguous_k = builder.getInt32(contiguous[k]); - Value *thread_id = builder.CreateAdd(thread_id_in_warp[k], builder.CreateMul(warp_id[k], warp_size_k)); - Value *scaled_thread_id = builder.CreateMul(thread_id, contiguous_k); - unsigned per_block = contiguous[k] * warp_size[k] * n_warps[k]; + Value *scaled_thread_id = builder.CreateMul(thread_id[k], contiguous_k); + unsigned per_block = contiguous[k] * block_size[k]; unsigned per_thread = contiguous[k] * shapes[k] / per_block; std::vector idx_list(per_thread); for(unsigned n = 0 ; n < per_thread; n++){ unsigned offset = n / contiguous[k] * per_block + n % contiguous[k]; idx_list[n] = builder.CreateAdd(scaled_thread_id, builder.getInt32(offset), "idx_" + str_k + "_" + std::to_string(n)); } - axes_[a_axes_->get_id(v, k)] = distributed_axis{contiguous[k], idx_list, thread_id}; + axes_[a_axes_->get_id(v, k)] 
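
// How the rewritten strided-scan axis construction works: rather than
// deriving a per-dimension warp/lane split via to_warps(), the flat id
// (u_warp_id*32 + u_thread_id) is delinearized across the per-dimension
// thread counts, fastest-varying dimension first according to `order`.
// A sketch of that delinearization, as a plain scalar model of what the
// IR builder emits (kept commented since it interrupts the diff):
//
//   std::vector<int> delinearize(int flat, const std::vector<int>& dims,
//                                const std::vector<int>& order) {
//     std::vector<int> coord(dims.size());
//     for(int d: order){ coord[d] = flat % dims[d]; flat /= dims[d]; }
//     return coord;
//   }
//
//   e.g. flat = 21, dims = {8, 4}, order = {0, 1}  ->  coord = {5, 2}
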
= distributed_axis{contiguous[k], idx_list, thread_id[k]};
   }
 }

@@ -825,7 +805,7 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder,
     create_distributed_tile(v, builder);
 }

-void selection::init_grids(ir::function *fn, IRBuilder<> &builder, Value *sh_mem_ptr){
+void selection::init_layouts(ir::function *fn, IRBuilder<> &builder, Value *sh_mem_ptr){
   // fetch linear ID
   Module *mod = builder.GetInsertBlock()->getParent()->getParent();
   Value *warp_size = builder.getInt32(32);
@@ -1454,84 +1434,83 @@ ArrayType* selection::llvm_linearized_tile_type(ir::type *ty, LLVMContext &ctx)
   return ArrayType::get(llvm_type(ty->get_scalar_ty(), ctx), size);
 }

+Function* selection::llvm_fn(ir::function *fn, IRBuilder<>& builder, Module& dst) {
+  LLVMContext &ctx = builder.getContext();
+  FunctionType *fn_ty = (FunctionType*)llvm_type(fn->get_fn_type(), ctx);
+  FunctionType *dst_fn_ty = fn_ty;
+  if(!tgt_->is_gpu()){
+    Type *dst_fn_ret_ty = fn_ty->getReturnType();
+    std::vector<Type*> dst_fn_args_ty;
+    for(unsigned i = 0; i < fn_ty->getNumParams(); i++)
+      dst_fn_args_ty.push_back(fn_ty->getParamType(i));
+    dst_fn_args_ty.push_back(builder.getInt32Ty());
+    dst_fn_args_ty.push_back(builder.getInt32Ty());
+    dst_fn_args_ty.push_back(builder.getInt32Ty());
+    dst_fn_ty = FunctionType::get(dst_fn_ret_ty, dst_fn_args_ty, false);
+  }
+  Function *ret = Function::Create(dst_fn_ty, Function::ExternalLinkage, fn->get_name(), &dst);
+  // set attributes
+  for(auto attr_pair: fn->attrs()){
+    unsigned id = attr_pair.first;
+    for(ir::attribute attr: attr_pair.second)
+      if(attr.is_llvm_attr())
+        ret->addAttribute(id, llvm_attr(ctx, attr));
+  }
+  // set metadata
+  tgt_->set_kernel(builder, ctx, &dst, ret);
+  Metadata *md_args[] = {
+    ValueAsMetadata::get(ret),
+    MDString::get(ctx, "maxntidx"),
+    ValueAsMetadata::get(builder.getInt32(num_warps_*32))
+  };
+  dst.getOrInsertNamedMetadata("nvvm.annotations")->addOperand(MDNode::get(ctx, md_args));
+  // map parameters
+  for(unsigned i = 0; i < fn->args().size(); i++)
+    vmap_[fn->args()[i]] = &*(ret->arg_begin() + i);
+  // create blocks
+  for(ir::basic_block *block: fn->blocks()) {
+    BasicBlock *dst_block = BasicBlock::Create(ctx, block->get_name(), ret);
+    vmap_[block] = dst_block;
+  }
+  builder.SetInsertPoint((BasicBlock*)vmap_[fn->blocks()[0]]);
+  return ret;
+}
+
+Value* selection::alloc_shared(IRBuilder<> &builder, Module& dst) {
+  Value *ret = nullptr;
+  LLVMContext &ctx = builder.getContext();
+  if(tgt_->is_gpu())
+  if(unsigned alloc_size = alloc_->allocated_size()){
+    Type *int_8_ty = Type::getInt8Ty(ctx);
+    ArrayType *array_ty = ArrayType::get(int_8_ty, alloc_size);
+    Type *ptr_ty = PointerType::get(int_8_ty, 3);
+    GlobalVariable *sh_mem_array =
+      new GlobalVariable(dst, array_ty, false, GlobalVariable::ExternalLinkage,
+                         nullptr, "__shared_ptr", nullptr, GlobalVariable::NotThreadLocal, 3);
+    ret = builder.CreateBitCast(sh_mem_array, ptr_ty);
+  }
+  return ret;
+}
+
 void selection::run(ir::module &src, Module &dst) {
   vmap_.clear();
+  tmap_.clear();
+
   LLVMContext &dst_ctx = dst.getContext();
   IRBuilder<> dst_builder(dst_ctx);
-  for(ir::alloc_const *x: src.allocs()) {
+  // constant memory
+  for(ir::alloc_const *x: src.allocs())
     vmap_[x] = llvm_alloc_const(x, &dst, dst_builder);
-  }
   // iterate over functions
   for(ir::function *fn: src.get_function_list()) {
-    // create LLVM function
-    FunctionType *fn_ty = (FunctionType*)llvm_type(fn->get_fn_type(), dst_ctx);
-    FunctionType *dst_fn_ty = fn_ty;
-    if(!tgt_->is_gpu()){
-      Type *dst_fn_ret_ty = fn_ty->getReturnType();
-      std::vector<Type*> dst_fn_args_ty;
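
// Why alloc_shared() materializes a global: on the GPU path the whole pool
// of shared memory computed by the allocation pass becomes one external i8
// array in LLVM address space 3 (shared memory in the NVPTX backend), and
// every tile is a byte offset into it. A self-contained sketch of the same
// LLVM API pattern -- the pool name below is illustrative; the pass itself
// uses "__shared_ptr":
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
using namespace llvm;

Value* declare_smem_pool(Module& m, IRBuilder<>& b, unsigned nbytes) {
  LLVMContext& ctx = m.getContext();
  ArrayType* arr_ty = ArrayType::get(Type::getInt8Ty(ctx), nbytes);
  auto* pool = new GlobalVariable(m, arr_ty, /*isConstant=*/false,
                                  GlobalVariable::ExternalLinkage,
                                  /*init=*/nullptr, "__smem_pool", nullptr,
                                  GlobalVariable::NotThreadLocal, /*AddrSpace=*/3);
  return b.CreateBitCast(pool, Type::getInt8Ty(ctx)->getPointerTo(3));
}
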
- for(unsigned i = 0; i < fn_ty->getNumParams(); i++) - dst_fn_args_ty.push_back(fn_ty->getParamType(i)); - dst_fn_args_ty.push_back(dst_builder.getInt32Ty()); - dst_fn_args_ty.push_back(dst_builder.getInt32Ty()); - dst_fn_args_ty.push_back(dst_builder.getInt32Ty()); - dst_fn_ty = FunctionType::get(dst_fn_ret_ty, dst_fn_args_ty, false); - } - - // grid indices - fn->get_fn_type()->get_return_ty(); - Function *dst_fn = Function::Create(dst_fn_ty, Function::ExternalLinkage, fn->get_name(), &dst); - - // set attributes - for(auto attr_pair: fn->attrs()){ - unsigned id = attr_pair.first; - for(ir::attribute attr: attr_pair.second) - if(attr.is_llvm_attr()){ - dst_fn->addAttribute(id, llvm_attr(dst_ctx, attr)); - } - } - - tgt_->set_kernel(dst_builder, dst_ctx, &dst, dst_fn); - // set metadata - Metadata *md_args[] = { - ValueAsMetadata::get(dst_fn), - MDString::get(dst_ctx, "maxntidx"), - ValueAsMetadata::get(dst_builder.getInt32(num_warps_*32)) - }; - dst.getOrInsertNamedMetadata("nvvm.annotations")->addOperand(MDNode::get(dst_ctx, md_args)); - - - // map parameters - for(unsigned i = 0; i < fn->args().size(); i++) - vmap_[fn->args()[i]] = &*(dst_fn->arg_begin() + i); - // create blocks - for(ir::basic_block *block: fn->blocks()) { - BasicBlock *dst_block = BasicBlock::Create(dst_ctx, block->get_name(), dst_fn); - vmap_[block] = dst_block; - } - dst_builder.SetInsertPoint((BasicBlock*)vmap_[fn->blocks()[0]]); - + llvm_fn(fn, dst_builder, dst); // allocate shared memory - Value *sh_mem_ptr = nullptr; - if(tgt_->is_gpu()) - if(unsigned alloc_size = alloc_->allocated_size()){ - Type *int_8_ty = Type::getInt8Ty(dst_ctx); - ArrayType *array_ty = ArrayType::get(int_8_ty, alloc_size); - Type *ptr_ty = PointerType::get(int_8_ty, 3); - GlobalVariable *sh_mem_array = - new GlobalVariable(dst, array_ty, false, GlobalVariable::ExternalLinkage, - nullptr, "__shared_ptr", nullptr, GlobalVariable::NotThreadLocal, 3); - sh_mem_ptr = dst_builder.CreateBitCast(sh_mem_array, ptr_ty); - } - sh_mem_ptr_ = sh_mem_ptr; - - // create grids - init_grids(fn, dst_builder, sh_mem_ptr); - - - // iterate through block + sh_mem_ptr_ = alloc_shared(dst_builder, dst); + // initialize layouts + init_layouts(fn, dst_builder, sh_mem_ptr_); + // generate LLVM-IR code std::map last_block; for(ir::basic_block *block: fn->blocks()) { BasicBlock *parent = (BasicBlock*)vmap_[block]; @@ -1547,7 +1526,7 @@ void selection::run(ir::module &src, Module &dst) { last_block[block] = dst_builder.GetInsertBlock(); } } - + // finalize double-buffering for(ir::basic_block *block: fn->blocks()) for(ir::instruction *inst: block->get_inst_list()) { if(liveness_->has_double(inst)) { @@ -1574,7 +1553,7 @@ void selection::run(ir::module &src, Module &dst) { } } - // add phi operands + // finalize phi for(ir::basic_block *block: fn->blocks()) for(ir::instruction *inst: block->get_inst_list()) if(auto *phi = dynamic_cast(inst)){ diff --git a/lib/codegen/transform/cts.cc b/lib/codegen/transform/cts.cc index c4660a2a4..1f90e7e5e 100644 --- a/lib/codegen/transform/cts.cc +++ b/lib/codegen/transform/cts.cc @@ -9,9 +9,8 @@ #include "triton/ir/type.h" namespace triton { - namespace codegen{ -namespace analysis{ +namespace transform{ // run pass on module void add_copy(ir::instruction *parent, ir::value *x, ir::builder &builder) { diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index f01693a95..a0275074f 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -199,7 +199,7 @@ std::unique_ptr function::make_bin(ir::module &module, 
driver::c llvm::LLVMContext ctx; std::unique_ptr llvm(new llvm::Module(module.get_name(), ctx)); // create passes - codegen::analysis::cts cts; + codegen::transform::cts cts; codegen::analysis::align align; codegen::analysis::liveness shmem_liveness; codegen::analysis::axes axes; diff --git a/tests/bench/copy2d.cc b/tests/bench/copy2d.cc index 6ee7f5496..f1252797e 100644 --- a/tests/bench/copy2d.cc +++ b/tests/bench/copy2d.cc @@ -1,65 +1,35 @@ -#include -#include -#include +#include +#include +#include "copy.h" #include "triton/driver/backend.h" -#include "triton/driver/stream.h" -#include "triton/tools/bench.hpp" -#include "triton/external/half.hpp" -#include "triton/runtime/function.h" -#include "src/copy.h" -#include "util.h" -#include "cuda/cublas.h" -std::vector do_bench(drv::stream* stream, int32_t M, int32_t N, order_t order_x, order_t order_y){ - typedef float NumericT; - std::string ty = "float"; - size_t dt_nbytes = sizeof(NumericT); - drv::context* context = stream->context(); - // create inputs - auto dx = std::unique_ptr(drv::buffer::create(context, M*N*dt_nbytes)); - auto dy = std::unique_ptr(drv::buffer::create(context, M*N*dt_nbytes)); - // create options - rt::function::options_space_t opt; - opt.defines.push_back({"TYPE", {ty}}); - opt.defines.push_back({"STRIDE_XM", {(order_x == ROWMAJOR)?"M":"1"}}); - opt.defines.push_back({"STRIDE_XN", {(order_x == ROWMAJOR)?"1":"N"}}); - opt.defines.push_back({"STRIDE_YM", {(order_y == ROWMAJOR)?"M":"1"}}); - opt.defines.push_back({"STRIDE_YN", {(order_y == ROWMAJOR)?"1":"N"}}); - opt.defines.push_back({"TM", {"32"}}); - opt.defines.push_back({"TN", {"32"}}); - opt.num_warps = {4}; - // create function - rt::function function(src::copy2d, opt); - // benchmark available libraries - std::vector result; - auto gbps = [&](double ns) { return 2*M*N*dt_nbytes / (ns * 1e-9) * 1e-9; }; - // triton - double triton_ns = triton::tools::bench([&]() { function({&*dx, &*dy, M, N}, grid2d(M, N), stream);}, stream); - result.push_back(gbps(triton_ns)); - // done - return result; -} - int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); triton::driver::stream* stream = triton::driver::stream::create(context); // shapes to benchmark - typedef std::tuple config_t; + typedef std::tuple, std::vector, std::vector> config_t; std::vector configs = { - {4096, 4096, ROWMAJOR, ROWMAJOR}, - {4096, 4096, COLMAJOR, ROWMAJOR}, - {4096, 4096, ROWMAJOR, COLMAJOR}, - {4096, 4096, COLMAJOR, COLMAJOR}, + {{4096*4096}, {0}, {0}}, + {{4096, 4096}, {0, 1}, {1, 0}}, + {{4096, 4096}, {0, 1}, {1, 0}}, + {{4096, 4096}, {1, 0}, {0, 1}}, + {{4096, 4096}, {0, 1}, {0, 1}}, + {{256, 256, 256}, {0, 1, 2}, {0, 1, 2}}, + {{256, 256, 256}, {0, 1, 2}, {0, 2, 1}}, + {{256, 256, 256}, {1, 0, 2}, {1, 2, 0}}, + {{256, 256, 256}, {1, 2, 0}, {1, 0, 2}}, + {{256, 256, 256}, {2, 0, 1}, {0, 1, 2}}, + {{256, 256, 256}, {2, 1, 0}, {0, 2, 1}} }; // does the work - int32_t M, N; - order_t ord_x, ord_y; + std::vector shape; + std::vector ord_x, ord_y; for(const auto& c: configs){ - std::tie(M, N, ord_x, ord_y) = c; - std::cout << "// " << M << ", " << N << ", " << ord_x << ", " << ord_y << std::flush; - for(auto perf: do_bench(stream, M, N, ord_x, ord_y)) + std::tie(shape, ord_x, ord_y) = c; + std::cout << "// " << c << std::flush; + for(auto perf: bench_copy_nd(stream, shape, ord_x, ord_y)) std::cout << ", " << perf << std::flush; std::cout << std::endl; } diff --git a/tests/common/copy.h b/tests/common/copy.h new file mode 100644 index 
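
// The new nd-copy harness (tests/common/copy.h, below) describes physical
// layout as a permutation `order`, with order[0] the fastest-varying
// dimension; strides then follow by a running product over dimensions in
// that order -- exactly the rule cc_copy_nd applies. A standalone sketch
// with a worked example:
#include <vector>

std::vector<int> strides_from_order(const std::vector<int>& shape,
                                    const std::vector<int>& order) {
  std::vector<int> stride(shape.size());
  int run = 1;
  for(int d: order){      // order[0] is contiguous => stride 1
    stride[d] = run;
    run *= shape[d];
  }
  return stride;
}
// shape = {4096, 4096}, order = {1, 0}  ->  stride = {4096, 1}
// (dimension 1 contiguous; mismatched x/y orders are what exercise coalescing)
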
000000000..811e5d7a4 --- /dev/null +++ b/tests/common/copy.h @@ -0,0 +1,142 @@ +#include "src/copy.h" +#include "triton/driver/stream.h" +#include "triton/runtime/function.h" +#include "triton/tools/bench.hpp" +#include "util.h" + +int32_t off(const std::vector& idx, const std::vector& strides) { + int32_t res = 0; + for(size_t d = 0; d < idx.size(); d++) + res += idx[d] * strides[d]; + return res; +} + +enum run_mode_t { + BENCH, + TEST +}; + +template +void cc_copy_nd(const std::vector& x, std::vector& y, + const std::vector& shape, + const std::vector& x_order, const std::vector& y_order) { + size_t rank = shape.size(); + // strides for x + std::vector x_strides(shape.size()); + for(size_t d = 0; d < rank; d++) + x_strides[x_order[d]] = (d == 0) ? 1 : (x_strides[x_order[d-1]] * shape[x_order[d-1]]); + // strides for y + std::vector y_strides(shape.size()); + for(size_t d = 0; d < rank; d++) + y_strides[y_order[d]] = (d == 0) ? 1 : (y_strides[y_order[d-1]] * shape[y_order[d-1]]); + // copy 1d + if(rank == 1) + for(int32_t i = 0; i < shape[0]; i++) + y[off({i}, y_strides)] = x[off({i}, x_strides)]; + // copy 2d + if(rank == 2) + for(int32_t i = 0; i < shape[0]; i++) + for(int32_t j = 0; j < shape[1]; j++) + y[off({i, j}, y_strides)] = x[off({i, j}, x_strides)]; + // copy 3d + if(rank == 3) + for(int32_t i = 0; i < shape[0]; i++) + for(int32_t j = 0; j < shape[1]; j++) + for(int32_t k = 0; k < shape[2]; k++) + y[off({i, j, k}, y_strides)] = x[off({i, j, k}, x_strides)]; +} + +void triton_copy_nd(drv::stream* stream, const std::vector& shape, + const std::vector& x_order, const std::vector& y_order, + std::vector> TS, + run_mode_t mode, std::vector& bench, bool &test) { + typedef float NumericT; + std::string ty = "float"; + size_t dtsize = sizeof(NumericT); + drv::context* context = stream->context(); + + // rank + size_t rank = shape.size(); + // size + size_t size = 1; + for(int32_t d: shape) + size *= d; + std::vector shapename = {"S0", "S1", "S2"}; + // strides for x + std::vector x_strides = {"1"}; + for(size_t d = 0; d < rank - 1; d++) + x_strides.push_back(x_strides[d] + " * " + shapename[x_order[d]]); + // strides for y + std::vector y_strides = {"1"}; + for(size_t d = 0; d < rank - 1; d++) + y_strides.push_back(y_strides[d] + " * " + shapename[y_order[d]]); + + // create inputs + auto dx = std::unique_ptr(drv::buffer::create(context, size*dtsize)); + auto dy = std::unique_ptr(drv::buffer::create(context, size*dtsize)); + // create options + rt::function::options_space_t opt; + + + // macros + opt.defines.push_back({"TYPE", {ty}}); + for(size_t d = 0; d < rank; d++) + opt.defines.push_back({"STRIDE_XS" + std::to_string(x_order[d]), {x_strides[d]}}); + for(size_t d = 0; d < rank; d++) + opt.defines.push_back({"STRIDE_YS" + std::to_string(y_order[d]), {y_strides[d]}}); + if(TS.empty()) + TS = tile_nd(rank); + for(size_t d = 0; d < rank; d++) + opt.defines.push_back({"TS" + std::to_string(d), TS[d]}); + opt.num_warps = {4}; + + // kernel + rt::function function(src::copy_nd[rank - 1], opt); + std::vector args = {&*dx, &*dy}; + for(int32_t d: shape) + args.push_back(d); + std::vector ts = {"TS0", "TS1", "TS2"}; + auto grid = grid_nd(shape, ts); + + // metrics + if(mode == BENCH){ + auto gbps = [&](double ns) { return 2 * size * dtsize / (ns * 1e-9) * 1e-9; }; + double triton_ns = triton::tools::bench([&]() { function(args, grid, stream);}, stream); + bench.push_back(gbps(triton_ns)); + } + + // test triton + if(mode == TEST){ + std::vector hx(size); + std::vector hy(size); + 
std::vector ry(size); + for(size_t i = 0; i < hx.size(); i++) + hx[i] = static_cast((float)rand()/RAND_MAX); + stream->write(&*dx, true, 0, hx); + function(args, grid, stream); + stream->synchronize(); + stream->read(&*dy, true, 0, hy); + cc_copy_nd(hx, ry, shape, x_order, y_order); + test = testing::diff(hy, ry); + } +} + +std::vector bench_copy_nd(drv::stream* stream, const std::vector& shape, + const std::vector& x_order, const std::vector& y_order) { + std::vector bench; + bool test; + triton_copy_nd(stream, shape, x_order, y_order, {}, BENCH, bench, test); + return bench; +} + +bool test_copy_nd(drv::stream* stream, const std::vector& shape, + const std::vector& TS, + const std::vector& x_order, const std::vector& y_order) { + std::vector bench; + bool test; + std::vector> TSS; + for(int32_t d: TS) + TSS.push_back({std::to_string(d)}); + triton_copy_nd(stream, shape, x_order, y_order, TSS, TEST, bench, test); + return test; +} diff --git a/tests/common/src/copy.h b/tests/common/src/copy.h index 8b0f5d9dc..c6263d4bb 100644 --- a/tests/common/src/copy.h +++ b/tests/common/src/copy.h @@ -1,33 +1,66 @@ +#ifndef _TRITON_TEST_SRC_COPY_H_ +#define _TRITON_TEST_SRC_COPY_H_ + namespace src { const char *copy1d = R"( void copy1d(TYPE * X __noalias __readonly __aligned(16), TYPE * Y __noalias __readonly __aligned(16), - int N) { - int ridm = get_program_id(0); - int rm[TN] = ridm * TN + 0 ... TN; - TYPE* px[TN] = X + rm; - TYPE* py[TN] = Y + rm; + int S0) { + int pid0 = get_program_id(0); + int rs0[TS0] = pid0 * TS0 + 0 ... TS0; + TYPE* px[TS0] = X + rs0; + TYPE* py[TS0] = Y + rs0; *py = *px; } )"; - const char *copy2d = R"( void copy2d(TYPE * X __noalias __readonly __aligned(16), TYPE * Y __noalias __writeonly __aligned(16), - int M __multipleof(8), - int N __multipleof(8)) { - int ridm = get_program_id(0); - int ridn = get_program_id(1); - int rm[TM] = ridm * TM + 0 ... TM; - int rn[TN] = ridn * TN + 0 ... TN; - TYPE* px[TM, TN] = X + rm[:, newaxis] * STRIDE_XM + rn[newaxis, :] * STRIDE_XN; - TYPE* py[TM, TN] = Y + rm[:, newaxis] * STRIDE_YM + rn[newaxis, :] * STRIDE_YN; + int S0 __multipleof(8), + int S1 __multipleof(8)) { + int pid0 = get_program_id(0); + int pid1 = get_program_id(1); + int rs0[TS0] = pid0 * TS0 + 0 ... TS0; + int rs1[TS1] = pid1 * TS1 + 0 ... TS1; + TYPE* px[TS0, TS1] = X + rs0[:, newaxis] * STRIDE_XS0 + rs1[newaxis, :] * STRIDE_XS1; + TYPE* py[TS0, TS1] = Y + rs0[:, newaxis] * STRIDE_YS0 + rs1[newaxis, :] * STRIDE_YS1; *py = *px; } )"; + const char *copy3d = +R"( +void copy3d(TYPE * X __noalias __readonly __aligned(16), + TYPE * Y __noalias __writeonly __aligned(16), + int S0 __multipleof(8), + int S1 __multipleof(8), + int S2 __multipleof(8)) { + // program id + int pid0 = get_program_id(0); + int pid1 = get_program_id(1); + int pid2 = get_program_id(2); + // ranges + int rs0[TS0] = pid0 * TS0 + 0 ... TS0; + int rs1[TS1] = pid1 * TS1 + 0 ... TS1; + int rs2[TS2] = pid2 * TS2 + 0 ... 
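
// A note on how these kernels get specialized: every TS*/STRIDE_* token is a
// preprocessor macro supplied through rt::function::options_space_t, and each
// macro may list several candidate values; the runtime compiles the
// cross-product of all candidates (times each num_warps choice) and tunes
// across the resulting variants. A host-side sketch (values are examples):
//
//   rt::function::options_space_t opt;
//   opt.defines.push_back({"TYPE", {"float"}});
//   opt.defines.push_back({"TS0", {"128", "256", "512"}});  // 3 candidates
//   opt.num_warps = {2, 4};                                 // x2 -> 6 variants
//   rt::function kernel(src::copy_nd[0], opt);              // 1d copy source
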
TS2; + // X pointers + TYPE* px[TS0, TS1, TS2] = X + rs0[:, newaxis, newaxis] * STRIDE_XS0 + + rs1[newaxis, :, newaxis] * STRIDE_XS1 + + rs2[newaxis, newaxis, :] * STRIDE_XS2; + // Y pointers + TYPE* py[TS0, TS1, TS2] = Y + rs0[:, newaxis, newaxis] * STRIDE_YS0 + + rs1[newaxis, :, newaxis] * STRIDE_YS1 + + rs2[newaxis, newaxis, :] * STRIDE_YS2; + *py = *px; } +)"; + + const char* copy_nd[] = {copy1d, copy2d, copy3d}; + +} + +#endif diff --git a/tests/common/util.h b/tests/common/util.h index d8ffef090..34e530610 100644 --- a/tests/common/util.h +++ b/tests/common/util.h @@ -4,6 +4,7 @@ #define _TRITON_TESTS_UTIL_H #include +#include #include "triton/runtime/function.h" namespace drv = triton::driver; @@ -26,6 +27,30 @@ inline rt::function::grid_fn_ty grid2d(size_t M, size_t N) { }; } +inline rt::function::grid_fn_ty grid_nd(const std::vector &shape, + const std::vector& ts) { + return [&shape, &ts](const rt::function::options_t& x) { + rt::grid_t ret; + for(size_t d = 0; d < shape.size(); d++) + ret.push_back(ceil(shape[d], x.D(ts[d]))); + return ret; + }; +} + +inline std::vector> tile_nd(size_t rank) { + assert(rank <= 3); + if(rank == 1) + return {{"128", "256", "512", "1024"}}; + if(rank == 2) + return {{"16", "32", "64"}, + {"16", "32", "64"}}; + if(rank == 3) + return {{"4", "16", "32"}, + {"4", "16", "32"}, + {"4", "16", "32"}}; + return {}; +} + enum order_t { ROWMAJOR, COLMAJOR @@ -44,17 +69,30 @@ struct gen_seq<0, Is...> : seq{}; template void print_tuple(std::basic_ostream& os, Tuple const& t, seq){ using swallow = int[]; - (void)swallow{0, (void(os << (Is == 0? "" : ", ") << std::setfill(' ') << std::setw(3) << std::get(t)), 0)...}; + (void)swallow{0, (void(os << (Is == 0? "" : ", ") << std::get(t)), 0)...}; } } // aux:: + template auto operator<<(std::basic_ostream& os, std::tuple const& t) -> std::basic_ostream& { - os << "("; aux::print_tuple(os, t, aux::gen_seq()); - return os << ")"; + return os; +} + +template +auto operator<<(std::basic_ostream& os, std::vector const& t) + -> std::basic_ostream& +{ + os << "{"; + for(size_t i = 0; i < t.size(); i++) { + if(i > 0) + os << ", "; + os << t[i]; + } + return os << "}"; } diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 78fbc79d1..fc5f9ab9e 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -1,4 +1,4 @@ -foreach(PROG dot) +foreach(PROG dot copy1d copy2d copy3d) set(TARGET unit_${PROG}) add_executable(${TARGET} ${PROG}.cc) set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME ${TARGET}) diff --git a/tests/unit/copy1d.cc b/tests/unit/copy1d.cc new file mode 100644 index 000000000..ad867bae4 --- /dev/null +++ b/tests/unit/copy1d.cc @@ -0,0 +1,30 @@ +#include +#include +#include "copy.h" +#include "triton/driver/backend.h" + + +int main() { + // initialize default compute device + auto context = triton::driver::backend::contexts::get_default(); + triton::driver::stream* stream = triton::driver::stream::create(context); + // shapes to benchmark + typedef std::tuple, std::vector, std::vector, std::vector> config_t; + std::vector configs = { +// {{65536}, {32}, {0}, {0}}, + {{65536}, {128}, {0}, {0}}, + {{65536}, {512}, {0}, {0}}, + {{65536}, {1024}, {0}, {0}}, + }; + // does the work + std::vector shape, tile; + std::vector ord_x, ord_y; + bool result = true; + for(const auto& c: configs){ + std::tie(shape, tile, ord_x, ord_y) = c; + bool pass = test_copy_nd(stream, shape, tile, ord_x, ord_y); + result = result && pass; + std::cout << "// " << c << ", " << pass << std::endl; + } + 
return result; +} diff --git a/tests/unit/copy2d.cc b/tests/unit/copy2d.cc new file mode 100644 index 000000000..f4c63e6be --- /dev/null +++ b/tests/unit/copy2d.cc @@ -0,0 +1,46 @@ +#include +#include +#include "copy.h" +#include "triton/driver/backend.h" + + +int main() { + // initialize default compute device + auto context = triton::driver::backend::contexts::get_default(); + triton::driver::stream* stream = triton::driver::stream::create(context); + // shapes to benchmark + typedef std::tuple, std::vector, std::vector, std::vector> config_t; + std::vector configs = { + {{256, 256}, {16, 16}, {0, 1}, {0, 1}}, + {{256, 256}, {16, 64}, {0, 1}, {0, 1}}, + {{256, 256}, {64, 16}, {0, 1}, {0, 1}}, + {{256, 256}, {64, 64}, {0, 1}, {0, 1}}, + + {{256, 256}, {16, 16}, {0, 1}, {1, 0}}, + {{256, 256}, {16, 64}, {0, 1}, {1, 0}}, + {{256, 256}, {64, 16}, {0, 1}, {1, 0}}, + {{256, 256}, {64, 64}, {0, 1}, {1, 0}}, + + {{256, 256}, {16, 16}, {1, 0}, {0, 1}}, + {{256, 256}, {16, 64}, {1, 0}, {0, 1}}, + {{256, 256}, {64, 16}, {1, 0}, {0, 1}}, + {{256, 256}, {64, 64}, {1, 0}, {0, 1}}, + + {{256, 256}, {64, 64}, {1, 0}, {1, 0}}, + {{256, 256}, {16, 64}, {1, 0}, {1, 0}}, + {{256, 256}, {64, 16}, {1, 0}, {1, 0}}, + {{256, 256}, {64, 64}, {1, 0}, {1, 0}}, + }; + // does the work + std::vector shape, tile; + std::vector ord_x, ord_y; + bool result = true; + for(const auto& c: configs){ + std::tie(shape, tile, ord_x, ord_y) = c; + bool pass = test_copy_nd(stream, shape, tile, ord_x, ord_y); + result = result && pass; + std::cout << "// " << c << ", " << pass << std::endl; + } + return result; +} + diff --git a/tests/unit/copy3d.cc b/tests/unit/copy3d.cc new file mode 100644 index 000000000..758944d98 --- /dev/null +++ b/tests/unit/copy3d.cc @@ -0,0 +1,38 @@ +#include +#include +#include "copy.h" +#include "triton/driver/backend.h" + + +int main() { + // initialize default compute device + auto context = triton::driver::backend::contexts::get_default(); + triton::driver::stream* stream = triton::driver::stream::create(context); + // shapes to benchmark + typedef std::tuple, std::vector, std::vector, std::vector> config_t; + std::vector configs; + std::vector x_idx = {0, 1, 2}; + do { + std::vector y_idx = {0, 1, 2}; + do { + configs.push_back(config_t{{64, 64, 32}, {16, 4, 8}, x_idx, y_idx}); + configs.push_back(config_t{{64, 64, 32}, {8, 16, 2}, x_idx, y_idx}); + configs.push_back(config_t{{64, 64, 32}, {32, 2, 2}, x_idx, y_idx}); + configs.push_back(config_t{{64, 64, 32}, {16, 64, 4}, x_idx, y_idx}); + + } while(std::next_permutation(y_idx.begin(), y_idx.end())); + } while(std::next_permutation(x_idx.begin(), x_idx.end())); + // testing + std::vector shape, tile; + std::vector ord_x, ord_y; + bool result = true; + for(const auto& c: configs){ + std::tie(shape, tile, ord_x, ord_y) = c; + bool pass = test_copy_nd(stream, shape, tile, ord_x, ord_y); + result = result && pass; + std::cout << "// " << c << ", " << pass << std::endl; + } + return result; +} + + diff --git a/tests/unit/dot.cc b/tests/unit/dot.cc index 69b8cf2d7..d1b9487e0 100644 --- a/tests/unit/dot.cc +++ b/tests/unit/dot.cc @@ -32,7 +32,7 @@ static void cpu_ref(std::vector &c, const std::vector &a, const std::vecto float acc = 0; for(size_t k = 0; k < K; k++) acc = acc + (AT ? a[k + m*K] : a[m + k*M]) * (BT ? b[n + k*N] : b[k + n*K]); - c[m + n*M] = static_cast(acc); + c[m*N + n] = static_cast(acc); } } @@ -120,7 +120,9 @@ int main() { std::cout << "Testing " << c << " ... 
" << std::flush; if(do_test(stream, AT, BT, M, N, K, TM, TN, TK, (size_t)nwarp)) std::cout << " Pass! " << std::endl; - else + else{ std::cout << " Fail! " << std::endl; + exit(EXIT_FAILURE); + } } } From b95ac15d48d0eb194ae6674d5470822e812660a5 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 23 Sep 2019 13:56:46 -0400 Subject: [PATCH 415/494] [codegen] [selection] fixed synchronization issue with double-buffering --- lib/codegen/selection.cc | 2 +- lib/driver/module.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index 7d72daefa..7c9d34b05 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -1546,7 +1546,7 @@ void selection::run(ir::module &src, Module &dst) { } else { unsigned num_bytes = phi->get_type()->get_scalar_ty()->get_primitive_size_in_bits() / 8; - offset->addIncoming(dst_builder.getInt32(alloc_->num_bytes(phi)/(2*num_bytes)), llvm_inc_block); + offset->addIncoming(dst_builder.getInt32(alloc_->num_bytes(phi)/(num_bytes)), llvm_inc_block); } ptr->addIncoming(inc_shared->get_pointer(), llvm_inc_block); } diff --git a/lib/driver/module.cc b/lib/driver/module.cc index 85877f911..66c775ac6 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -241,7 +241,7 @@ std::string cu_module::compile_llvm_module(std::unique_ptr module, cu_module::cu_module(driver::context * context, std::unique_ptr ll_module): cu_module(context, compile_llvm_module(std::move(ll_module), context->device())) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ - std::cout << source << std::endl; +// std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; From f0013f8bf16f836ab484f1f828889c47623e315a Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 23 Sep 2019 17:54:42 -0400 Subject: [PATCH 416/494] [codegen] [allocation] fixed issues in HMMA --- include/triton/codegen/analysis/tiles.h | 13 +++++- lib/codegen/analysis/allocation.cc | 26 ++++-------- lib/codegen/analysis/tiles.cc | 54 +++++++++++++++++++++++-- lib/codegen/selection.cc | 5 +-- lib/codegen/transform/peephole.cc | 8 ++-- lib/driver/module.cc | 1 - lib/runtime/function.cc | 18 ++++----- tests/bench/dot.cc | 15 ++++--- tests/common/src/dot.h | 2 +- tests/unit/dot.cc | 2 +- 10 files changed, 94 insertions(+), 50 deletions(-) diff --git a/include/triton/codegen/analysis/tiles.h b/include/triton/codegen/analysis/tiles.h index 93d3a9774..87705d132 100644 --- a/include/triton/codegen/analysis/tiles.h +++ b/include/triton/codegen/analysis/tiles.h @@ -25,6 +25,15 @@ class axes; class layout; class align; +enum layout_t { + SCANLINE, + HMMA_C, + HMMA_A_COL, + HMMA_A_ROW, + HMMA_B_COL, + HMMA_B_ROW +}; + class tiles { typedef std::map> param_map_t; private: @@ -34,7 +43,7 @@ private: public: tiles(size_t num_warps, analysis::align* align, analysis::axes* axes, analysis::layout* layout); void run(ir::module &mod); - bool hmma(ir::value *value); + layout_t hmma(ir::value *value); int mts(ir::value *value, unsigned ax); int nts(ir::value *value, unsigned ax); int fpw(ir::value *value, unsigned ax); @@ -52,7 +61,7 @@ private: // tile properties std::map largest_; std::map> order_; - std::map hmma_; + std::map hmma_; std::map fpw_; std::map wpt_; std::map mts_; diff --git a/lib/codegen/analysis/allocation.cc 
b/lib/codegen/analysis/allocation.cc
index 0ad884fbc..98813b4c0 100644
--- a/lib/codegen/analysis/allocation.cc
+++ b/lib/codegen/analysis/allocation.cc
@@ -20,24 +20,14 @@ unsigned allocation::is_ld_padded(ir::value *x) {
     if(trans->get_perm()[0]->get_value() != 0)
       return 4;
   }
-  for(ir::user* user: x->get_users())
-    if(auto dot = dynamic_cast<ir::dot_inst*>(user)){
-      bool is_hmma = tiles_->hmma(user);
-      bool is_op_0 = x == dot->get_operand(0);
-      bool is_op_1 = x == dot->get_operand(1);
-      if(is_hmma && is_op_0){
-        if(dot->is_a_trans())
-          return 8;
-        else
-          return 16;
-      }
-      if(is_hmma && is_op_1){
-        if(!dot->is_b_trans())
-          return 8;
-        else
-          return 16;
-      }
-    }
+  if(tiles_->hmma(x) == HMMA_A_ROW)
+    return 8;
+  if(tiles_->hmma(x) == HMMA_A_COL)
+    return 16;
+  if(tiles_->hmma(x) == HMMA_B_COL)
+    return 8;
+  if(tiles_->hmma(x) == HMMA_B_ROW)
+    return 16;
   if(auto* phi = dynamic_cast<ir::phi_node*>(x)) {
     unsigned result = 0;
     for(unsigned i = 0; i < phi->get_num_incoming(); i++)
diff --git a/lib/codegen/analysis/tiles.cc b/lib/codegen/analysis/tiles.cc
index 7d4d81376..e48795cec 100644
--- a/lib/codegen/analysis/tiles.cc
+++ b/lib/codegen/analysis/tiles.cc
@@ -23,7 +23,7 @@ tiles::tiles(size_t num_warps, analysis::align *align, analysis::axes *axes, ana
   num_warps_(num_warps), align_(align), axes_(axes), layout_(layout)
 { }

-bool is_hmma(ir::value *v){
+bool is_hmma_c(ir::value *v){
   bool result = false;
   if(auto *x = dynamic_cast<ir::dot_inst*>(v)){
     ir::value *a = x->get_operand(0);
@@ -36,9 +36,44 @@ bool is_hmma_c(ir::value *v){
   return result;
 }

+bool is_hmma_a_col(ir::value* v) {
+  for(ir::user *u: v->get_users())
+    if(is_hmma_c(u)){
+      ir::dot_inst* dot = (ir::dot_inst*)u;
+      if((v == dot->get_operand(0)) && !dot->is_a_trans())
+        return true;
+    }
+  return false;
+}
+
+bool is_hmma_a_row(ir::value* v) {
+  for(ir::user *u: v->get_users())
+    if(is_hmma_c(u)){
+      ir::dot_inst* dot = (ir::dot_inst*)u;
+      if((v == dot->get_operand(0)) && dot->is_a_trans())
+        return true;
+    }
+  return false;
+}
+
+bool is_hmma_b_col(ir::value* v) {
+  for(ir::user *u: v->get_users())
+    if(is_hmma_c(u)){
+      ir::dot_inst* dot = (ir::dot_inst*)u;
+      if((v == dot->get_operand(1)) && !dot->is_b_trans())
+        return true;
+    }
+  return false;
+}
+
+bool is_hmma_b_row(ir::value* v) {
+  for(ir::user *u: v->get_users())
+    if(is_hmma_c(u)){
+      ir::dot_inst* dot = (ir::dot_inst*)u;
+      if((v == dot->get_operand(1)) && dot->is_b_trans())
+        return true;
+    }
+  return false;
+}

-bool tiles::hmma(ir::value *value) {
+layout_t tiles::hmma(ir::value *value) {
   return hmma_.at(layout_->id(value));
 }
@@ -164,7 +199,18 @@ void tiles::run(ir::module &) {
   // find out which groups require hmma layout
   for(size_t i = 0; i < num_groups; i++) {
     const auto& values = layout_->values(i);
-    hmma_[i] = std::any_of(values.begin(), values.end(), &is_hmma);
+    bool hmma_c = std::any_of(values.begin(), values.end(), &is_hmma_c);
+    bool hmma_a_col = std::any_of(values.begin(), values.end(), &is_hmma_a_col);
+    bool hmma_a_row = std::any_of(values.begin(), values.end(), &is_hmma_a_row);
+    bool hmma_b_col = std::any_of(values.begin(), values.end(), &is_hmma_b_col);
+    bool hmma_b_row = std::any_of(values.begin(), values.end(), &is_hmma_b_row);
+    if(hmma_c)          hmma_[i] = HMMA_C;
+    else if(hmma_a_col) hmma_[i] = HMMA_A_COL;
+    else if(hmma_a_row) hmma_[i] = HMMA_A_ROW;
+    else if(hmma_b_col) hmma_[i] = HMMA_B_COL;
+    else if(hmma_b_row) hmma_[i] = HMMA_B_ROW;
+    else                hmma_[i] = SCANLINE;
+  }
   // find out which value is the largest in each group
   for(size_t i = 0; i < num_groups; i++) {
@@ -197,7 +243,7 @@ void tiles::run(ir::module &) {
     if(!i->get_type()->is_tile_ty())
       continue;
     /* HMMA
parameters*/ - if(hmma_[x.first]) + if(hmma_[x.first] == HMMA_C) init_hmma_tile(i); else init_scanline_tile(i); diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index 7c9d34b05..1bc356723 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -710,7 +710,7 @@ void selection::init_hmma_axes(ir::value *v, IRBuilder<> &builder, Value *u_thre void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { - if(tiles_->hmma(v)) + if(tiles_->hmma(v) == analysis::HMMA_C) init_hmma_axes(v, builder, u_thread_id, u_warp_id); else init_strided_scan_axes(v, builder, u_thread_id, u_warp_id); @@ -1241,11 +1241,10 @@ void selection::lower_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn, IRB if(NK != 1) { shared_tile *TA = (shared_tile*)tmap_.at(A); shared_tile *TB = (shared_tile*)tmap_.at(B); - if(tiles_->hmma(dot)) + if(tiles_->hmma(dot) == analysis::HMMA_C) lower_hmma_dot(dot, ctx, fn, builder, TC, TA, TB, TD, NK); else lower_scanline_dot(dot, ctx, fn, builder, TC, TA, TB, TD, NK, c_ty, f_mul_add); - } else { distributed_tile *TA = (distributed_tile*)tmap_.at(A); diff --git a/lib/codegen/transform/peephole.cc b/lib/codegen/transform/peephole.cc index cfe1c8721..d490e7bcc 100644 --- a/lib/codegen/transform/peephole.cc +++ b/lib/codegen/transform/peephole.cc @@ -104,12 +104,12 @@ bool peephole::rewrite_dot_hmma(ir::dot_inst *dot, ir::builder& builder, bool tr BB = ((ir::trans_inst*)B)->get_operand(0); } else{ - if(auto *T = dynamic_cast(A)){ + if(auto *T = dynamic_cast(B)){ std::vector perm(T->get_perm()); std::swap(perm[0], perm[1]); - AA = builder.create_trans(T->get_operand(0), perm); - T->replace_all_uses_with(AA); - trans_a = true; + BB = builder.create_trans(T->get_operand(0), perm); + T->replace_all_uses_with(BB); + trans_b = true; } } if(!trans_a && !trans_b) diff --git a/lib/driver/module.cc b/lib/driver/module.cc index 66c775ac6..0bf85c84f 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -241,7 +241,6 @@ std::string cu_module::compile_llvm_module(std::unique_ptr module, cu_module::cu_module(driver::context * context, std::unique_ptr ll_module): cu_module(context, compile_llvm_module(std::move(ll_module), context->device())) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ -// std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index a0275074f..ec0fcd990 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -201,17 +201,17 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c // create passes codegen::transform::cts cts; codegen::analysis::align align; - codegen::analysis::liveness shmem_liveness; + codegen::analysis::liveness liveness; codegen::analysis::axes axes; codegen::analysis::layout layouts(&axes); codegen::transform::coalesce coalesce(&align, &layouts); codegen::analysis::tiles tiles(opt.num_warps, &align, &axes, &layouts); - codegen::analysis::allocation shmem_allocation(&shmem_liveness, &tiles); - codegen::transform::membar shmem_barriers(&shmem_liveness, &shmem_allocation); + codegen::analysis::allocation allocation(&liveness, &tiles); + codegen::transform::membar barriers(&liveness, &allocation); codegen::transform::dce dce; codegen::transform::peephole peephole; 
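// Aside: the pipeline assembled here interleaves analyses (align, axes,
// layouts, tiles, liveness, allocation) with transforms (peephole, dce, cts,
// reassociate, membar), and analyses are re-run after IR-mutating transforms
// so their cached results stay in sync with the rewritten module. A minimal
// self-contained sketch of that ordering constraint follows; all names below
// are hypothetical stand-ins, not Triton's actual API.
#include <functional>
#include <iostream>
#include <string>
#include <vector>

struct ir_module { int revision = 0; };  // stands in for triton::ir::module

struct pass {
  std::string name;
  bool mutates_ir;  // transforms invalidate previously computed analyses
  std::function<void(ir_module&)> run;
};

int main() {
  ir_module mod;
  auto analysis  = [](ir_module&)   { /* read-only: compute and cache facts */ };
  auto transform = [](ir_module& m) { ++m.revision; /* rewrites instructions */ };
  // "align" must run again after "dce": the rewrite may have replaced the
  // instructions whose cached analysis results were computed earlier.
  std::vector<pass> pipeline = {{"align", false, analysis},
                                {"dce", true, transform},
                                {"align", false, analysis}};
  for (pass& p : pipeline) {
    std::cout << "run " << p.name << " (IR revision " << mod.revision << ")\n";
    p.run(mod);
  }
  return 0;
}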
codegen::transform::reassociate reassociate(&align); - codegen::selection selection(&shmem_liveness, &shmem_allocation, &tiles, &align, &axes, &layouts, &coalesce, target.get(), opt.num_warps); + codegen::selection selection(&liveness, &allocation, &tiles, &align, &axes, &layouts, &coalesce, target.get(), opt.num_warps); // run passes peephole.run(module); dce.run(module); @@ -226,14 +226,12 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c tiles.run(module); reassociate.run(module); dce.run(module); - peephole.run(module); - dce.run(module); cts.run(module); - shmem_liveness.run(module); - shmem_allocation.run(module); - if(shmem_allocation.allocated_size() > context->device()->max_shared_memory()) + liveness.run(module); + allocation.run(module); + if(allocation.allocated_size() > context->device()->max_shared_memory()) return std::unique_ptr(); - shmem_barriers.run(module); + barriers.run(module); dce.run(module); dce.run(module); axes.run(module); diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index 9a0cd9ca7..19dd95cd9 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -27,9 +27,9 @@ inline rt::function::grid_fn_ty grid2d(size_t M, size_t N) { std::vector do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K){ - typedef float NumericT; - std::string ty = "float"; - cublasDataType_t cuty = CUDA_R_32F; + typedef half_float::half NumericT; + std::string ty = "half"; + cublasDataType_t cuty = CUDA_R_16F; size_t dt_nbytes = sizeof(NumericT); drv::context* context = stream->context(); // leading dimensions @@ -47,8 +47,8 @@ std::vector do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, i opt.defines.push_back({"BT", {BT?"1":"0"}}); opt.defines.push_back({"TM", {"128"}}); opt.defines.push_back({"TN", {"128"}}); - opt.defines.push_back({"TK", {"8"}}); - opt.num_warps = {8}; + opt.defines.push_back({"TK", {"16"}}); + opt.num_warps = {2, 4, 8}; // create function rt::function function(src::dot, opt); // benchmark available libraries @@ -79,7 +79,10 @@ int main() { // shapes to benchmark typedef std::tuple config_t; std::vector configs; - for(auto x: std::vector>{{false, true}}){ + for(auto x: std::vector>{{false, false}, + {false, true}, + {true, false}, + {true, true}}){ std::vector tmp = { config_t{x[0], x[1], 2048, 2048, 2048} // config_t{x[0], x[1], 16, 2048, 2048}, diff --git a/tests/common/src/dot.h b/tests/common/src/dot.h index 8521cd0a6..2168b23b6 100644 --- a/tests/common/src/dot.h +++ b/tests/common/src/dot.h @@ -64,7 +64,7 @@ void dot(TYPE * A, TYPE * B, TYPE * C, // epilogue int rxc[TM] = ridx * TM + 0 ... TM; int ryc[TN] = ridy * TN + 0 ... TN; - TYPE* pc[TM, TN] = C + rxc[:, newaxis] * ldc + ryc[newaxis, :]; + TYPE* pc[TM, TN] = C + rxc[:, newaxis] + ryc[newaxis, :] * ldc; *pc = c; } )"; diff --git a/tests/unit/dot.cc b/tests/unit/dot.cc index d1b9487e0..c3a7c8b00 100644 --- a/tests/unit/dot.cc +++ b/tests/unit/dot.cc @@ -32,7 +32,7 @@ static void cpu_ref(std::vector &c, const std::vector &a, const std::vecto float acc = 0; for(size_t k = 0; k < K; k++) acc = acc + (AT ? a[k + m*K] : a[m + k*M]) * (BT ? 
b[n + k*N] : b[k + n*K]); - c[m*N + n] = static_cast(acc); + c[m + n*M] = static_cast(acc); } } From c24d55db23383b4506adeb5ffc3db37e0c3d6c70 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 23 Sep 2019 20:38:27 -0400 Subject: [PATCH 417/494] [codegen] more work on hmma coalescing --- include/triton/tools/bench.hpp | 2 +- lib/codegen/analysis/align.cc | 3 +++ lib/driver/module.cc | 1 + lib/runtime/function.cc | 4 ++++ tests/bench/dot.cc | 9 +++------ 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/include/triton/tools/bench.hpp b/include/triton/tools/bench.hpp index 554b3bcc3..9d07bf227 100644 --- a/include/triton/tools/bench.hpp +++ b/include/triton/tools/bench.hpp @@ -38,7 +38,7 @@ inline double bench(std::function const & op, driver::stream * stream) double total_time = 0; op(); stream->synchronize(); - while(total_time*1e-9 < 1e-3){ + while(total_time*1e-9 < 1e-1){ float norm = 1; // normalize clock if possible to reduce noise in auto-tuning if(auto cu_device = dynamic_cast(stream->context()->device())) diff --git a/lib/codegen/analysis/align.cc b/lib/codegen/analysis/align.cc index ef57e7a4f..e31799b59 100644 --- a/lib/codegen/analysis/align.cc +++ b/lib/codegen/analysis/align.cc @@ -487,6 +487,9 @@ void align::populate(ir::value *v) { populate_is_constant(v); populate_starting_multiple(v); populate_max_contiguous(v); +// std::cout << v->get_name() << std::endl; +// if(max_contiguous_[v].size() == 2) +// std::cout << max_contiguous_[v][0] << " " << max_contiguous_[v][1] << std::endl; } void align::run(ir::module &mod) { diff --git a/lib/driver/module.cc b/lib/driver/module.cc index 0bf85c84f..66c775ac6 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -241,6 +241,7 @@ std::string cu_module::compile_llvm_module(std::unique_ptr module, cu_module::cu_module(driver::context * context, std::unique_ptr ll_module): cu_module(context, compile_llvm_module(std::move(ll_module), context->device())) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ +// std::cout << source << std::endl; cu_context::context_switcher ctx_switch(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index ec0fcd990..86c0a3f8f 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -161,6 +161,7 @@ function::caller function::autotune(driver::stream* stream, const grid_fn_ty& gr for(auto it: opt_space_.defines) cpp.AddMacro(it.first, &opt.defines.at(it.first)); cpp.Process(tokens); +// tokens.Print(stdout); // parse Parser parser(tokens); parser.Parse(); @@ -215,16 +216,19 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c // run passes peephole.run(module); dce.run(module); +// ir::print(module, std::cout); align.run(module); cts.run(module); axes.run(module); layouts.run(module); coalesce.run(module); +// ir::print(module, std::cout); dce.run(module); align.run(module); dce.run(module); tiles.run(module); reassociate.run(module); + peephole.run(module); dce.run(module); cts.run(module); liveness.run(module); diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index 19dd95cd9..4f6c989e9 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -48,7 +48,7 @@ std::vector do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, i opt.defines.push_back({"TM", {"128"}}); opt.defines.push_back({"TN", {"128"}}); opt.defines.push_back({"TK", 
{"16"}}); - opt.num_warps = {2, 4, 8}; + opt.num_warps = {4}; // create function rt::function function(src::dot, opt); // benchmark available libraries @@ -79,12 +79,9 @@ int main() { // shapes to benchmark typedef std::tuple config_t; std::vector configs; - for(auto x: std::vector>{{false, false}, - {false, true}, - {true, false}, - {true, true}}){ + for(auto x: std::vector>{{false, false}}){ std::vector tmp = { - config_t{x[0], x[1], 2048, 2048, 2048} + config_t{x[0], x[1], 4096, 4096, 4096} // config_t{x[0], x[1], 16, 2048, 2048}, // config_t{x[0], x[1], 32, 2048, 2048}, // config_t{x[0], x[1], 64, 2048, 2048}, From a3bf3a1804a5bc28b5d62e6df517ca7606b7571e Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 24 Sep 2019 19:35:46 -0400 Subject: [PATCH 418/494] [codegen] more hmma row-major handling --- include/triton/codegen/selection.h | 7 ++-- lib/codegen/analysis/allocation.cc | 2 +- lib/codegen/analysis/tiles.cc | 15 +++++++++ lib/codegen/selection.cc | 54 ++++++++++++++++++------------ lib/driver/module.cc | 2 +- lib/runtime/function.cc | 3 +- tests/bench/dot.cc | 5 ++- tests/common/src/copy.h | 2 +- 8 files changed, 60 insertions(+), 30 deletions(-) diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index df34f2987..bc236ff22 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -89,7 +89,7 @@ private: public: - shared_tile(Type* ty, const shapes_t &shapes, Value* ptr, Builder &builder, Value* offset = nullptr); + shared_tile(Type* ty, const shapes_t &shapes, const std::vector &order, Value* ptr, Builder &builder, Value* offset = nullptr); void set_vector_size(unsigned vector_size); void set_return_mode(bool return_vector); void set_value(indices_t, Value *); @@ -97,7 +97,8 @@ public: Value* get_value(indices_t idx); Value* get_pointer() { return ptr_; } Value* get_offset() { return offset_; } - static Value* shared_offset(Builder& builder, const shapes_t& shapes, indices_t idx); + const std::vector& get_order() { return order_; } + static Value* shared_offset(Builder& builder, const shapes_t& shapes, const std::vector& order, indices_t idx); private: Value *ptr_; @@ -106,6 +107,7 @@ private: Value *offset_; std::map ptr_cache_; unsigned vector_size_; + std::vector order_; }; // Distribtued tile @@ -123,6 +125,7 @@ public: distributed_tile(Type *ty, const shapes_t& shapes, const std::vector& order, const axes_t &axes, Builder &builder, bool vectorize); void set_value(indices_t idx, Value *v); Value* get_value(indices_t idx); + const std::vector& get_order() { return order_; } unsigned get_linear_index(indices_t idx); indices_t get_ordered_indices(unsigned id); void for_each(std::function fn); diff --git a/lib/codegen/analysis/allocation.cc b/lib/codegen/analysis/allocation.cc index 98813b4c0..8ff77eb25 100644 --- a/lib/codegen/analysis/allocation.cc +++ b/lib/codegen/analysis/allocation.cc @@ -57,7 +57,7 @@ unsigned allocation::num_bytes(ir::value *x) { unsigned num_bytes = x->get_type()->get_primitive_size_in_bits() / 8; unsigned pad = is_ld_padded(x); if(pad > 0){ - unsigned ld = x->get_type()->get_tile_shapes()[0]; + unsigned ld = x->get_type()->get_tile_shapes()[tiles_->order(x)[0]]; num_bytes += pad * num_bytes / ld; } if(liveness_->has_double(x)) diff --git a/lib/codegen/analysis/tiles.cc b/lib/codegen/analysis/tiles.cc index e48795cec..7f19df276 100644 --- a/lib/codegen/analysis/tiles.cc +++ b/lib/codegen/analysis/tiles.cc @@ -218,6 +218,7 @@ void tiles::run(ir::module &) { auto cmp = 
[&rank](ir::value* x, ir::value *y) { return rank(x) < rank(y); }; largest_[i] = *std::max_element(values.begin(), values.end(), cmp); } + // find out the order of a group for(size_t i = 0; i < num_groups; i++){ std::set io; @@ -237,6 +238,20 @@ void tiles::run(ir::module &) { } order_[i] = order; } + for(size_t i = 0; i < num_groups; i++){ + bool is_hmma_op = hmma_[i] == HMMA_A_COL || hmma_[i] == HMMA_A_ROW || + hmma_[i] == HMMA_B_COL || hmma_[i] == HMMA_B_ROW; + if(!is_hmma_op) + continue; + // extract copies to shared memory + std::vector cts; + for(ir::value* v: layout_->values(i)) + if(auto *x = dynamic_cast(v)) + cts.push_back(x); + if(cts.empty()) + continue; + order_[i] = order(cts[0]->get_operand(0)); + } // tiling parameters for(auto x: largest_){ ir::value *i = x.second; diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index 1bc356723..a20fbaa60 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -146,26 +146,26 @@ void shared_tile::extract_constant(const indices_t &arg_idx, indices_t &non_cst_ } -Value* shared_tile::shared_offset(llvm::IRBuilder<> &builder, const shapes_t& shapes, indices_t idx) { +Value* shared_tile::shared_offset(llvm::IRBuilder<> &builder, const shapes_t& shapes, const std::vector& order, indices_t idx) { Value *result = builder.getInt32(0); - result = builder.CreateAdd(result, idx[0]); - Value *ld = builder.getInt32(shapes[0]); + result = builder.CreateAdd(result, idx[order[0]]); + Value *ld = builder.getInt32(shapes[order[0]]); for(size_t i = 1; i < idx.size(); i++) { - result = builder.CreateAdd(result, builder.CreateMul(idx[i], ld)); + result = builder.CreateAdd(result, builder.CreateMul(idx[order[i]], ld)); if(i < idx.size() - 1){ - ld = builder.CreateMul(ld, builder.getInt32(shapes[i])); + ld = builder.CreateMul(ld, builder.getInt32(shapes[order[i]])); } } return result; } -shared_tile::shared_tile(Type *ty, const shapes_t &shapes, Value *ptr, llvm::IRBuilder<> &builder, Value *offset): - tile(ty, shapes), ptr_(ptr), builder_(builder), offset_(offset), vector_size_(1){ +shared_tile::shared_tile(Type *ty, const shapes_t &shapes, const std::vector& order, Value *ptr, llvm::IRBuilder<> &builder, Value *offset): + tile(ty, shapes), order_(order), ptr_(ptr), builder_(builder), offset_(offset), vector_size_(1){ return_vector_ = false; } void shared_tile::set_value(indices_t idx, Value *value) { - Value *ptr = builder_.CreateGEP(ptr_, shared_offset(builder_, shapes_, idx)); + Value *ptr = builder_.CreateGEP(ptr_, shared_offset(builder_, shapes_, order_, idx)); unsigned addr_space = ptr->getType()->getPointerAddressSpace(); ptr = builder_.CreateBitCast(ptr, value->getType()->getPointerTo(addr_space)); builder_.CreateStore(value, ptr); @@ -196,7 +196,7 @@ Value* shared_tile::get_value(indices_t idx) { // if(isa(non_cst_idx.front())){ // builder_.SetInsertPoint((Instruction*)non_cst_idx.front()); // } - base_ptr = builder_.CreateGEP(ptr_, shared_offset(builder_, shapes_, non_cst_idx)); + base_ptr = builder_.CreateGEP(ptr_, shared_offset(builder_, shapes_, order_, non_cst_idx)); if(vector_size_ > 1){ Type *vec_ty = VectorType::get(ty, vector_size); Type *vec_ptr_ty = PointerType::get(vec_ty, base_ptr->getType()->getPointerAddressSpace()); @@ -204,7 +204,7 @@ Value* shared_tile::get_value(indices_t idx) { } // builder_.SetInsertPoint(store); } - Value *offset = shared_offset(builder_, shapes_, cst_idx); + Value *offset = shared_offset(builder_, shapes_, order_, cst_idx); Value *div = offset; if(vector_size_ > 1) div = 
builder_.CreateUDiv(offset, builder_.getInt32(vector_size_)); @@ -721,10 +721,13 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id * ------------------- */ void selection::create_shared_tile(ir::value *v, IRBuilder<> &builder, Value *sh_mem_ptr) { + if(tmap_.find(v) != tmap_.end()) + return; + auto order = tiles_->order(v); auto shapes = v->get_type()->get_tile_shapes(); unsigned pad = alloc_->is_ld_padded(v); if(pad > 0) - shapes[0] += pad; + shapes[order[0]] += pad; Type* ty = llvm_type(v->get_type()->get_scalar_ty(), builder.getContext()); // shared copy PointerType *ptr_ty = ty->getPointerTo(sh_mem_ptr->getType()->getPointerAddressSpace()); @@ -744,15 +747,15 @@ void selection::create_shared_tile(ir::value *v, IRBuilder<> &builder, Value *sh Value *pre_ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(alloc_->offset(v))); pre_ptr = builder.CreateBitCast(pre_ptr, ptr->getType()); Value *next_ptr = builder.CreateGEP(ptr, offset, "next_ptr"); - tmap_.insert({phi, new shared_tile(ty, shapes, ptr, builder, offset)}); - tmap_.insert({v, new shared_tile(ty, shapes, pre_ptr, builder)}); - tmap_.insert({info.latch, new shared_tile(ty, shapes, next_ptr, builder)}); + tmap_.insert({phi, new shared_tile(ty, shapes, order, ptr, builder, offset)}); + tmap_.insert({v, new shared_tile(ty, shapes, order, pre_ptr, builder)}); + tmap_.insert({info.latch, new shared_tile(ty, shapes, order, next_ptr, builder)}); } else { size_t offset = alloc_->offset(v); Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset)); ptr = builder.CreateBitCast(ptr, ptr_ty); - tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)}); + tmap_.insert({v, new shared_tile(ty, shapes, order, ptr, builder)}); } } @@ -920,7 +923,7 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, write_idx.insert(write_idx.begin() + axis, lane); // shared memory write pointer - Value *write_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), write_idx); + Value *write_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), op_tile->get_order(), write_idx); Value *write_ptr = builder.CreateGEP(base_ptr, write_offset); // initialize shared memory @@ -933,7 +936,7 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, indices_t current(write_idx.size(), builder.getInt32(0)); current[axis] = builder.getInt32(i); // shared memory offset - Value *read_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), current); + Value *read_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), op_tile->get_order(), current); Value *is_active = builder.CreateICmpULT(lane, builder.getInt32(i)); read_offset = builder.CreateSelect(is_active, read_offset, builder.getInt32(0)); // shared memory read pointer @@ -949,7 +952,7 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, // result is on the first lane of shared memory indices_t final = write_idx; final[axis] = builder.getInt32(0); - Value *read_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), final); + Value *read_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), op_tile->get_order(), final); Value *read_ptr = builder.CreateGEP(base_ptr, read_offset); tgt_->add_barrier(module, builder); result = builder.CreateLoad(read_ptr); @@ -1077,17 +1080,24 @@ void selection::lower_hmma_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn Value *offset_b_k = offset_b_k_; Value* u_thread_id = 
tgt_->get_local_id(builder.GetInsertBlock()->getModule(), builder, 0); - if(dot->is_a_trans()){ + + auto ord_a = tiles_->order(dot->get_operand(0)); + auto ord_b = tiles_->order(dot->get_operand(1)); + + bool is_a_row = dot->is_a_trans() ^ ord_a[ord_a.size() - 2] == 1; + bool is_b_row = dot->is_b_trans() ^ ord_b[ord_b.size() - 2] == 1; + + if(is_a_row){ offset_a_i = builder.CreateAdd(offset_a_i, builder.CreateURem(u_thread_id, builder.getInt32(4))); offset_a_k = builder.getInt32(0); } - if(!dot->is_b_trans()){ + if(!is_b_row){ offset_b_j = builder.CreateAdd(offset_b_j, builder.CreateURem(u_thread_id, builder.getInt32(4))); offset_b_k = builder.getInt32(0); } - std::string op_a = dot->is_a_trans() ? "row" : "col"; - std::string op_b = dot->is_b_trans() ? "row" : "col"; + std::string op_a = is_a_row ? "row" : "col"; + std::string op_b = is_b_row ? "row" : "col"; InlineAsm *mma_fn = InlineAsm::get(mma_ty, " mma.sync.aligned.m8n8k4." + op_a + "." + op_b + ".f32.f16.f16.f32 " "{$0, $1, $2, $3, $4, $5, $6, $7}, " diff --git a/lib/driver/module.cc b/lib/driver/module.cc index 66c775ac6..f29c830f4 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -242,7 +242,7 @@ cu_module::cu_module(driver::context * context, std::unique_ptr ll cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ // std::cout << source << std::endl; - cu_context::context_switcher ctx_switch(*context); + cu_context::context_switcher ctx(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; unsigned int errbufsize = 8096; diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 86c0a3f8f..d1b342e45 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -222,13 +222,11 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c axes.run(module); layouts.run(module); coalesce.run(module); -// ir::print(module, std::cout); dce.run(module); align.run(module); dce.run(module); tiles.run(module); reassociate.run(module); - peephole.run(module); dce.run(module); cts.run(module); liveness.run(module); @@ -242,6 +240,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c layouts.run(module); align.run(module); tiles.run(module); +// ir::print(module, std::cout); selection.run(module, *llvm); // return binary std::unique_ptr res(driver::module::create(context, std::move(llvm))); diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index 4f6c989e9..74043d8e5 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -79,7 +79,10 @@ int main() { // shapes to benchmark typedef std::tuple config_t; std::vector configs; - for(auto x: std::vector>{{false, false}}){ + for(auto x: std::vector>{{false, false}, + {false, true}, + {true, false}, + {true, true}}){ std::vector tmp = { config_t{x[0], x[1], 4096, 4096, 4096} // config_t{x[0], x[1], 16, 2048, 2048}, diff --git a/tests/common/src/copy.h b/tests/common/src/copy.h index c6263d4bb..f45f7a5cd 100644 --- a/tests/common/src/copy.h +++ b/tests/common/src/copy.h @@ -59,7 +59,7 @@ void copy3d(TYPE * X __noalias __readonly __aligned(16), } )"; - const char* copy_nd[] = {copy1d, copy2d, copy3d}; + const char* copy_nd[] = {copy1d, copy2d, copy3d}; } From 69800a0318d4b28278d244936e3c9464ebdf6ab8 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 24 Sep 2019 20:36:55 -0400 Subject: [PATCH 419/494] [tests] [dot] now testing row-major --- include/triton/tools/bench.hpp | 2 +- 
lib/codegen/analysis/allocation.cc | 10 +++-- tests/unit/dot.cc | 72 ++++++++++++++++++++++-------- 3 files changed, 61 insertions(+), 23 deletions(-) diff --git a/include/triton/tools/bench.hpp b/include/triton/tools/bench.hpp index 9d07bf227..554b3bcc3 100644 --- a/include/triton/tools/bench.hpp +++ b/include/triton/tools/bench.hpp @@ -38,7 +38,7 @@ inline double bench(std::function const & op, driver::stream * stream) double total_time = 0; op(); stream->synchronize(); - while(total_time*1e-9 < 1e-1){ + while(total_time*1e-9 < 1e-3){ float norm = 1; // normalize clock if possible to reduce noise in auto-tuning if(auto cu_device = dynamic_cast(stream->context()->device())) diff --git a/lib/codegen/analysis/allocation.cc b/lib/codegen/analysis/allocation.cc index 8ff77eb25..fc2a5ce22 100644 --- a/lib/codegen/analysis/allocation.cc +++ b/lib/codegen/analysis/allocation.cc @@ -20,14 +20,16 @@ unsigned allocation::is_ld_padded(ir::value *x) { if(trans->get_perm()[0]->get_value() != 0) return 4; } + auto order = tiles_->order(x); + bool is_col_major = order[0] == 0; if(tiles_->hmma(x) == HMMA_A_ROW) - return 8; + return is_col_major ? 16 : 8; if(tiles_->hmma(x) == HMMA_A_COL) - return 16; + return is_col_major ? 8 : 16; if(tiles_->hmma(x) == HMMA_B_COL) - return 8; + return is_col_major ? 16 : 8; if(tiles_->hmma(x) == HMMA_B_ROW) - return 16; + return is_col_major ? 8 : 16; if(auto* phi = dynamic_cast(x)) { unsigned result = 0; for(unsigned i = 0; i < phi->get_num_incoming(); i++) diff --git a/tests/unit/dot.cc b/tests/unit/dot.cc index c3a7c8b00..bb75df10e 100644 --- a/tests/unit/dot.cc +++ b/tests/unit/dot.cc @@ -31,7 +31,7 @@ static void cpu_ref(std::vector &c, const std::vector &a, const std::vecto for(size_t n = 0; n < N; n++){ float acc = 0; for(size_t k = 0; k < K; k++) - acc = acc + (AT ? a[k + m*K] : a[m + k*M]) * (BT ? b[n + k*N] : b[k + n*K]); + acc = acc + (AT ? a[k*M + m] : a[m*K + k]) * (BT ? b[n*K + k] : b[k*N + n]); c[m + n*M] = static_cast(acc); } } @@ -49,25 +49,47 @@ void cpu_ref(bool AT_, bool BT_, size_t M, size_t N, size_t K, cpu_ref(c, a, b, M, N, K); } +template +struct to_string; -bool do_test(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K, int32_t TM, int32_t TN, int32_t TK, size_t nwarp){ - typedef float NumericT; - std::string ty = "float"; - size_t dt_nbytes = sizeof(NumericT); +template<> struct to_string{ + static constexpr const char* value = "half"; +}; + +template<> struct to_string{ + static constexpr const char* value = "float"; +}; + +template<> struct to_string{ + static constexpr const char* value = "double"; +}; + +enum dtype_t { + FLOAT, + HALF, + DOUBLE +}; + +template +bool do_test(drv::stream* stream, bool AT, bool BT, + int32_t M, int32_t N, int32_t K, + int32_t TM, int32_t TN, int32_t TK, size_t nwarp){ + std::string ty = to_string::value; + size_t dt_nbytes = sizeof(T); drv::context* context = stream->context(); - std::vector hc(M*N); - std::vector ha(M*K); - std::vector hb(K*N); + std::vector hc(M*N); + std::vector ha(M*K); + std::vector hb(K*N); int32_t lda = AT ? K : M; int32_t ldb = BT ? 
N : K; int32_t ldc = M; srand(0); for(size_t i = 0; i < ha.size(); i++) - ha[i] = static_cast((float)rand()/RAND_MAX); + ha[i] = static_cast((float)rand()/RAND_MAX); for(size_t i = 0; i < hb.size(); i++) - hb[i] = static_cast((float)rand()/RAND_MAX); + hb[i] = static_cast((float)rand()/RAND_MAX); for(size_t i = 0; i < hc.size(); i++) - hc[i] = static_cast((double)0); + hc[i] = static_cast((double)0); auto dc = std::shared_ptr(drv::buffer::create(context, hc.size()*dt_nbytes)); auto da = std::shared_ptr(drv::buffer::create(context, ha.size()*dt_nbytes)); auto db = std::shared_ptr(drv::buffer::create(context, hb.size()*dt_nbytes)); @@ -92,33 +114,47 @@ bool do_test(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_ } // test stream->read(&*dc, true, 0, hc); - std::vector rc(hc.size()); + std::vector rc(hc.size()); cpu_ref(AT, BT, M, N, K, rc, ha, hb); return testing::diff(hc, rc); } +bool do_test(triton::driver::stream *stream, + dtype_t dtype, bool AT, bool BT, + int32_t M, int32_t N, int32_t K, + int32_t TM, int32_t TN, int32_t TK, size_t nwarp) { + switch(dtype){ + case HALF: return do_test(stream, AT, BT, M, N, K, TM, TN, TK, nwarp); + case FLOAT: return do_test(stream, AT, BT, M, N, K, TM, TN, TK, nwarp); + case DOUBLE: return do_test(stream, AT, BT, M, N, K, TM, TN, TK, nwarp); + default: break; + } + return false; +} + int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); triton::driver::stream* stream = triton::driver::stream::create(context); // shapes to benchmark - typedef std::tuple config_t; + typedef std::tuple config_t; std::vector configs; for(bool AT: std::array{false, true}) for(bool BT: std::array{false, true}) - for(int TM: std::vector{16, 128}) - for(int TN: std::vector{16, 128}) + for(int TM: std::vector{32, 64}) + for(int TN: std::vector{32, 64}) for(int TK: std::vector{16, 32}) for(int nwarps: std::vector{1, 2, 4, 8}){ - configs.push_back(config_t{AT, BT, 128, 128, 128, TM, TN, TK, nwarps}); + configs.push_back(config_t{HALF, AT, BT, 128, 128, 128, TM, TN, TK, nwarps}); } // does the work + dtype_t dtype; bool AT, BT; int M, N, K, TM, TN, TK, nwarp; for(const auto& c: configs){ - std::tie(AT, BT, M, N, K, TM, TN, TK, nwarp) = c; + std::tie(dtype, AT, BT, M, N, K, TM, TN, TK, nwarp) = c; std::cout << "Testing " << c << " ... " << std::flush; - if(do_test(stream, AT, BT, M, N, K, TM, TN, TK, (size_t)nwarp)) + if(do_test(stream, dtype, AT, BT, M, N, K, TM, TN, TK, (size_t)nwarp)) std::cout << " Pass! " << std::endl; else{ std::cout << " Fail! 
" << std::endl; From 575dd06be3e7e2d9ea9af1c9f0fa1924fccb4780 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 26 Sep 2019 14:01:28 -0400 Subject: [PATCH 420/494] [codegen] more progress towards unified dot implementation --- include/triton/codegen/analysis/allocation.h | 6 +- include/triton/codegen/analysis/liveness.h | 48 +++++- include/triton/codegen/instructions.h | 2 +- lib/codegen/analysis/allocation.cc | 104 ++++-------- lib/codegen/analysis/liveness.cc | 160 ++++++++++++++----- lib/codegen/selection.cc | 20 ++- lib/codegen/transform/membar.cc | 16 +- lib/codegen/transform/peephole.cc | 84 ---------- lib/runtime/function.cc | 4 +- tests/bench/dot.cc | 6 +- tests/common/src/dot.h | 16 +- tests/unit/dot.cc | 4 +- 12 files changed, 227 insertions(+), 243 deletions(-) diff --git a/include/triton/codegen/analysis/allocation.h b/include/triton/codegen/analysis/allocation.h index 3dfede223..b23f11964 100644 --- a/include/triton/codegen/analysis/allocation.h +++ b/include/triton/codegen/analysis/allocation.h @@ -4,6 +4,7 @@ #include #include #include +#include "triton/codegen/analysis/liveness.h" namespace triton{ @@ -25,10 +26,8 @@ class allocation { public: allocation(liveness *live, tiles *params) : liveness_(live), tiles_(params){ } - // utilities - unsigned num_bytes(ir::value *x); - unsigned is_ld_padded(ir::value* x); // accessors + bool has_offset(ir::value *x) const { return offsets_.find(x) != offsets_.end(); } unsigned offset(ir::value *x) const { return offsets_.at(x); } unsigned allocated_size() const { return allocated_size_; } // run @@ -36,7 +35,6 @@ public: private: std::map offsets_; - std::map num_bytes_; size_t allocated_size_; // dependences liveness *liveness_; diff --git a/include/triton/codegen/analysis/liveness.h b/include/triton/codegen/analysis/liveness.h index 52ea33cca..0f8aea7b4 100644 --- a/include/triton/codegen/analysis/liveness.h +++ b/include/triton/codegen/analysis/liveness.h @@ -2,6 +2,8 @@ #define TDL_INCLUDE_IR_CODEGEN_LIVENESS_H #include +#include +#include namespace triton{ @@ -10,6 +12,7 @@ namespace ir{ class phi_node; class function; class module; + class instruction; } namespace codegen{ @@ -17,7 +20,7 @@ namespace analysis{ typedef unsigned slot_index; -class cts; +class tiles; struct segment { slot_index start; @@ -37,21 +40,47 @@ struct double_buffer_info_t { ir::phi_node* phi; }; +struct buffer_t { + unsigned id; + size_t size; + bool operator<(buffer_t other) const { return id < other.id; } +}; + class liveness { private: typedef std::map indices_map_t; - typedef std::map intervals_map_t; + typedef std::map intervals_map_t; typedef std::map has_storage_map_t; + typedef ir::value* node_t; + typedef std::map > graph_t; public: // Intervals iterators using iterator = intervals_map_t::iterator; using const_iterator = intervals_map_t::const_iterator; + + + +private: + void connected_components(node_t x, std::set &nodes, graph_t &graph, unsigned group_id); + void extract_double_bufferable(ir::instruction *i); + void extract_buffers(ir::instruction *i); + void get_parents(ir::instruction *i, std::vector& res); + void make_graph(ir::instruction *i); + + public: + liveness(tiles *t): tiles_(t){ } + // buffer size + unsigned is_ld_padded(ir::value *x); + unsigned num_bytes(ir::value *x); // accessors - const intervals_map_t& intervals() const { return intervals_; } - segment get_interval(ir::value* v) const { return intervals_.at(v); } + const intervals_map_t& intervals() const { return intervals_; } + segment get_interval(buffer_t v) const { 
return intervals_.at(v); } + // buffers + buffer_t get_buffer(ir::value *v) const { return groups_.at(v); } + std::vector get_values(buffer_t x) const { return values_.at(x); } // double-buffering bool has_double(ir::value *x) const { return double_.find(x) != double_.end(); } double_buffer_info_t get_double(ir::value *x) const { return double_.at(x); } @@ -59,10 +88,19 @@ public: void run(ir::module &mod); private: + // analysis + tiles *tiles_; + // stuff has_storage_map_t has_dedicated_storage_; - indices_map_t indices_; + indices_map_t indices; intervals_map_t intervals_; std::map double_; + std::map> parents_; + // graph + std::set nodes_; + graph_t graph_; + std::map groups_; + std::map> values_; }; } diff --git a/include/triton/codegen/instructions.h b/include/triton/codegen/instructions.h index cecd716e0..e3ad9344d 100644 --- a/include/triton/codegen/instructions.h +++ b/include/triton/codegen/instructions.h @@ -56,7 +56,7 @@ static const std::map storage_info = { { ir::INST_BROADCAST, {DISTRIBUTED, {REPLICATED}}}, { ir::INST_DOWNCAST, {DISTRIBUTED, {REPLICATED}}}, // array arithmetic - { ir::INST_TRANS, {SHARED, {DISTRIBUTED}}}, // TODO: not necessarily + { ir::INST_TRANS, {SHARED, {SHARED}}}, { ir::INST_REDUCE, {SHARED, {DISTRIBUTED}}}, { ir::INST_DOT, {DISTRIBUTED, {SHARED, SHARED, DISTRIBUTED}}}, // terminator diff --git a/lib/codegen/analysis/allocation.cc b/lib/codegen/analysis/allocation.cc index fc2a5ce22..21087e680 100644 --- a/lib/codegen/analysis/allocation.cc +++ b/lib/codegen/analysis/allocation.cc @@ -15,79 +15,28 @@ namespace triton{ namespace codegen{ namespace analysis{ -unsigned allocation::is_ld_padded(ir::value *x) { - if(auto *trans = dynamic_cast(x)){ - if(trans->get_perm()[0]->get_value() != 0) - return 4; - } - auto order = tiles_->order(x); - bool is_col_major = order[0] == 0; - if(tiles_->hmma(x) == HMMA_A_ROW) - return is_col_major ? 16 : 8; - if(tiles_->hmma(x) == HMMA_A_COL) - return is_col_major ? 8 : 16; - if(tiles_->hmma(x) == HMMA_B_COL) - return is_col_major ? 16 : 8; - if(tiles_->hmma(x) == HMMA_B_ROW) - return is_col_major ? 
8 : 16; - if(auto* phi = dynamic_cast(x)) { - unsigned result = 0; - for(unsigned i = 0; i < phi->get_num_incoming(); i++) - result = std::max(result, is_ld_padded(phi->get_incoming_value(i))); - return result; - } - return 0; -} - -unsigned allocation::num_bytes(ir::value *x) { - if(auto *red = dynamic_cast(x)){ - unsigned num_bytes = x->get_type()->get_scalar_ty()->get_primitive_size_in_bits() / 8; - size_t axis = red->get_axis(); - ir::value *op = red->get_operand(0); - auto shapes = op->get_type()->get_tile_shapes(); - shapes.erase(shapes.begin() + axis); - size_t num_elements = 1; - for(auto x: shapes) - num_elements *= x; - size_t depth; - if(tiles_->hmma(x)) - depth = tiles_->wpt(op, axis); - else - depth = tiles_->mts(op, axis); - return num_elements * num_bytes * depth; - } - unsigned num_bytes = x->get_type()->get_primitive_size_in_bits() / 8; - unsigned pad = is_ld_padded(x); - if(pad > 0){ - unsigned ld = x->get_type()->get_tile_shapes()[tiles_->order(x)[0]]; - num_bytes += pad * num_bytes / ld; - } - if(liveness_->has_double(x)) - num_bytes *= 2; - return num_bytes; -} void allocation::run(ir::module &mod) { using std::max; using std::min; typedef std::multimap triples_map_type; - std::vector I; + std::vector I; for(auto x: liveness_->intervals()) I.push_back(x.first); - std::vector J = I; + std::vector J = I; triples_map_type H; H.insert({0, segment{0, INT_MAX}}); - std::vector V; - std::map starts; + std::vector V; + std::map starts; while(!J.empty()){ auto h_it = H.begin(); unsigned w = h_it->first; segment xh = h_it->second; H.erase(h_it); - auto j_it = std::find_if(J.begin(), J.end(), [&](ir::value *JJ){ + auto j_it = std::find_if(J.begin(), J.end(), [&](buffer_t JJ){ segment xj = liveness_->get_interval(JJ); bool res = xj.intersect(xh); for(auto val: H) @@ -95,7 +44,7 @@ void allocation::run(ir::module &mod) { return res; }); if(j_it != J.end()){ - unsigned size = num_bytes(*j_it); + unsigned size = j_it->size; segment xj = liveness_->get_interval(*j_it); starts[*j_it] = w; H.insert({w + size, segment{max(xh.start, xj.start), min(xh.end, xj.end)}}); @@ -109,14 +58,14 @@ void allocation::run(ir::module &mod) { } // Build interference graph - std::map> interferences; - for(ir::value *x: V) - for(ir::value *y: V){ - if(x == y) + std::map> interferences; + for(buffer_t x: V) + for(buffer_t y: V){ + if(x.id == y.id) continue; unsigned X0 = starts[x], Y0 = starts[y]; - unsigned NX = num_bytes(x); - unsigned NY = num_bytes(y); + unsigned NX = x.size; + unsigned NY = y.size; segment XS = {X0, X0 + NX}; segment YS = {Y0, Y0 + NY}; if(liveness_->get_interval(x).intersect(liveness_->get_interval(y)) @@ -125,17 +74,17 @@ void allocation::run(ir::module &mod) { } // Initialize colors - std::map colors; - for(ir::value *X: V) - colors[X] = (X==V[0])?0:-1; + std::map colors; + for(buffer_t X: V) + colors[X] = (X.id==V[0].id)?0:-1; // First-fit graph coloring std::vector available(V.size()); - for(ir::value *x: V){ + for(buffer_t x: V){ // Non-neighboring colors are available std::fill(available.begin(), available.end(), true); - for(ir::value *Y: interferences[x]){ + for(buffer_t Y: interferences[x]){ int color = colors[Y]; if(color >= 0) available[color] = false; @@ -146,21 +95,24 @@ void allocation::run(ir::module &mod) { } // Finalize allocation - for(ir::value *x: V){ + for(buffer_t x: V){ unsigned Adj = 0; - for(ir::value *y: interferences[x]) - Adj = std::max(Adj, starts[y] + num_bytes(y)); - offsets_[x] = starts[x] + colors[x] * Adj; - if(liveness_->has_double(x)){ - auto info = 
liveness_->get_double(x); - offsets_[info.latch] = offsets_[x] + num_bytes(x) / 2; + for(buffer_t y: interferences[x]) + Adj = std::max(Adj, starts[y] + y.size); + // create offsets + for(ir::value *v: liveness_->get_values(x)){ + offsets_[v] = starts[x] + colors[x] * Adj; + if(liveness_->has_double(v)){ + auto info = liveness_->get_double(v); + offsets_[info.latch] = offsets_[v] + x.size / 2; + } } } // Save maximum size of induced memory space allocated_size_ = 0; for(auto &x: offsets_){ - allocated_size_ = std::max(allocated_size_, x.second + num_bytes(x.first)); + allocated_size_ = std::max(allocated_size_, x.second + liveness_->get_buffer(x.first).size); } } diff --git a/lib/codegen/analysis/liveness.cc b/lib/codegen/analysis/liveness.cc index f6df78b72..13b456cae 100644 --- a/lib/codegen/analysis/liveness.cc +++ b/lib/codegen/analysis/liveness.cc @@ -1,6 +1,9 @@ #include +#include +#include #include "triton/codegen/instructions.h" #include "triton/codegen/analysis/liveness.h" +#include "triton/codegen/analysis/tiles.h" #include "triton/codegen/transform/cts.h" #include "triton/ir/basic_block.h" #include "triton/ir/function.h" @@ -25,7 +28,7 @@ inline bool is_loop_latch(ir::phi_node *phi, ir::instruction *terminator){ throw std::runtime_error("unreachable"); } -inline void extract_double_bufferable(ir::instruction *i, std::map& result) { +void liveness::extract_double_bufferable(ir::instruction *i) { auto* phi = dynamic_cast(i); if(!phi || phi->get_num_incoming() != 2) return; @@ -42,65 +45,142 @@ inline void extract_double_bufferable(ir::instruction *i, std::mapget_id()).first != SHARED || storage_info.at(i_1->get_id()).first != SHARED) return; if(is_latch_1) - result[value_0] = double_buffer_info_t{value_1, phi}; + double_[value_0] = double_buffer_info_t{value_1, phi}; if(is_latch_0) - result[value_1] = double_buffer_info_t{value_0, phi}; + double_[value_1] = double_buffer_info_t{value_0, phi}; } +void liveness::make_graph(ir::instruction *i) { + if(has_double(i)){ + ir::value *latch = double_[i].latch; + nodes_.insert(i); + nodes_.insert(latch); + graph_[i].insert(latch); + graph_[latch].insert(i); + } + if(i->get_id() == ir::INST_TRANS){ + nodes_.insert(i); + nodes_.insert(i->get_operand(0)); + graph_[i].insert(i->get_operand(0)); + graph_[i->get_operand(0)].insert(i); + } +} + +// connected components +void liveness::connected_components(node_t x, std::set &nodes, graph_t &graph, unsigned group_id) { + buffer_t buffer{group_id, num_bytes(x)}; + groups_[x] = buffer; + values_[buffer].push_back(x); + if(nodes.find(x) != nodes.end()){ + nodes.erase(x); + for(const node_t &y: graph[x]) + connected_components(y, nodes, graph, group_id); + } +} + +unsigned liveness::is_ld_padded(ir::value *x) { + if(auto *trans = dynamic_cast(x)){ + if(trans->get_perm()[0]->get_value() != 0) + return 4; + } + auto order = tiles_->order(x); + bool is_col_major = order[0] == 0; + if(tiles_->hmma(x) == HMMA_A_ROW) + return is_col_major ? 16 : 16; + if(tiles_->hmma(x) == HMMA_A_COL) + return is_col_major ? 8 : 8; + if(tiles_->hmma(x) == HMMA_B_COL) + return is_col_major ? 16 : 16; + if(tiles_->hmma(x) == HMMA_B_ROW) + return is_col_major ? 
8 : 8; + if(auto* phi = dynamic_cast(x)) { + unsigned result = 0; + for(unsigned i = 0; i < phi->get_num_incoming(); i++) + result = std::max(result, is_ld_padded(phi->get_incoming_value(i))); + return result; + } + return 0; +} + +unsigned liveness::num_bytes(ir::value *x) { + if(auto *red = dynamic_cast(x)){ + unsigned num_bytes = x->get_type()->get_scalar_ty()->get_primitive_size_in_bits() / 8; + size_t axis = red->get_axis(); + ir::value *op = red->get_operand(0); + auto shapes = op->get_type()->get_tile_shapes(); + shapes.erase(shapes.begin() + axis); + size_t num_elements = 1; + for(auto x: shapes) + num_elements *= x; + size_t depth; + if(tiles_->hmma(x)) + depth = tiles_->wpt(op, axis); + else + depth = tiles_->mts(op, axis); + return num_elements * num_bytes * depth; + } + unsigned num_bytes = x->get_type()->get_primitive_size_in_bits() / 8; + unsigned pad = is_ld_padded(x); + if(pad > 0){ + unsigned ld = x->get_type()->get_tile_shapes()[tiles_->order(x)[0]]; + num_bytes += pad * num_bytes / ld; + } + if(has_double(x)) + num_bytes *= 2; + return num_bytes; +} // Entry point void liveness::run(ir::module &mod) { double_.clear(); - indices_.clear(); + indices.clear(); intervals_.clear(); + parents_.clear(); - // set of pair of values that can be double-buffered + // Create set of pair of values that can be double-buffered ir::for_each_instruction(mod, [this](ir::instruction* i) { - extract_double_bufferable(i, this->double_); + this->extract_double_bufferable(i); }); + // Create buffer dependency graph + ir::for_each_instruction(mod, [this](ir::instruction* i) { + this->make_graph(i); + }); + // connected components + unsigned group_id = 0; + while(!nodes_.empty()){ + connected_components(*nodes_.begin(), nodes_, graph_, group_id++); + } + + // Assigns index to each instruction for(ir::function *fn: mod.get_function_list()){ - // Assigns index to each instruction slot_index index = 0; for(ir::basic_block *block: fn->blocks()) for(ir::instruction *instr: block->get_inst_list()){ index += 1; - indices_.insert({instr, index}); - } - // Liveness analysis - // Creates live intervals - for(auto i: indices_){ - ir::value *v = i.first; - ir::instruction* instr = dynamic_cast(v); - if(!instr) - continue; - if(storage_info.at(instr->get_id()).first != SHARED) - continue; - unsigned start = i.second; - unsigned end = start; - for(ir::value *u: v->get_users()){ - start = std::min(start, indices_.at(u)); - end = std::max(end, indices_.at(u)); - } - intervals_[v] = segment{start, end}; - } - // Double-Buffering - // Arrays are live throughout the end of the loop - auto it = intervals_.begin(); - while(it != intervals_.end()) { - ir::value *x = it->first; - auto dit = double_.find(x); - if(dit != double_.end()) { - ir::value *y = dit->second.latch; - unsigned start = intervals_[x].start; - unsigned end = intervals_[y].end; - intervals_[x] = segment{start, end}; - intervals_.erase(y); - } - it++; + indices.insert({instr, index}); } } + + for(auto x: values_) { + // users + std::set values; + for(ir::value *v: x.second){ + values.insert(v); + for(ir::user *u: v->get_users()) + values.insert(u); + } + // compute intervals + unsigned start = INT32_MAX; + unsigned end = 0; + for(ir::value *u: values){ + start = std::min(start, indices.at(u)); + end = std::max(end, indices.at(u)); + } + intervals_[x.first] = segment{start, end}; + } + } } diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index a20fbaa60..60facdae4 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ 
-725,7 +725,7 @@ void selection::create_shared_tile(ir::value *v, IRBuilder<> &builder, Value *sh return; auto order = tiles_->order(v); auto shapes = v->get_type()->get_tile_shapes(); - unsigned pad = alloc_->is_ld_padded(v); + unsigned pad = liveness_->is_ld_padded(v); if(pad > 0) shapes[order[0]] += pad; Type* ty = llvm_type(v->get_type()->get_scalar_ty(), builder.getContext()); @@ -1040,15 +1040,13 @@ void selection::lower_copy_to_shared(ir::copy_to_shared_inst *x, LLVMContext &ct } void selection::lower_trans(ir::trans_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { - shared_tile* result = (shared_tile*)tmap_.at(x); - distributed_tile* in = (distributed_tile*)tmap_.at(x->get_operand(0)); - auto perm = x->get_perm(); - in->for_each([&](indices_t idx){ - indices_t out_idx(idx.size()); - for(size_t i = 0; i < idx.size(); i++) - out_idx[i] = idx[perm[i]->get_value()]; - result->set_value(out_idx, in->get_value(idx)); - }); + shared_tile* in = (shared_tile*)tmap_.at(x->get_operand(0)); + auto in_order = in->get_order(); + std::vector order; + for(auto p: x->get_perm()) + order.push_back(in_order[p->get_value()]); + shared_tile* out = new shared_tile(in->get_ty(), in->get_shapes(), order, in->get_pointer(), builder, in->get_offset()); + tmap_[x] = out; } void selection::lower_hmma_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn, IRBuilder<> &builder, @@ -1555,7 +1553,7 @@ void selection::run(ir::module &src, Module &dst) { } else { unsigned num_bytes = phi->get_type()->get_scalar_ty()->get_primitive_size_in_bits() / 8; - offset->addIncoming(dst_builder.getInt32(alloc_->num_bytes(phi)/(num_bytes)), llvm_inc_block); + offset->addIncoming(dst_builder.getInt32(liveness_->num_bytes(phi)/(num_bytes)), llvm_inc_block); } ptr->addIncoming(inc_shared->get_pointer(), llvm_inc_block); } diff --git a/lib/codegen/transform/membar.cc b/lib/codegen/transform/membar.cc index 6ec14bc09..aee19110f 100644 --- a/lib/codegen/transform/membar.cc +++ b/lib/codegen/transform/membar.cc @@ -36,9 +36,9 @@ void membar::add_reference(ir::value *v, interval_vec_t &res){ auto *i = dynamic_cast(v); if(!i) return; - if(storage_info.at(i->get_id()).first == SHARED){ + if(alloc_->has_offset(v)){ unsigned offset = alloc_->offset(v); - unsigned num_bytes = alloc_->num_bytes(v); + unsigned num_bytes = liveness_->num_bytes(v); res.push_back(interval_t(offset, offset + num_bytes)); } } @@ -97,8 +97,10 @@ std::pairget_double(i); safe_war.insert(i); safe_war.insert(info.latch); + auto *trans = dynamic_cast(info.latch); + if(trans) + safe_war.insert(trans->get_operand(0)); } + if(i->get_id() == ir::INST_TRANS) + safe_war.insert(i); }); for(ir::function *fn: mod.get_function_list()){ @@ -152,9 +159,8 @@ void membar::run(ir::module &mod) { done = (n_inserted_im1 == n_inserted_i); n_inserted_im1 = n_inserted_i; }while(!done); - for(ir::instruction* i: insert_locs){ + for(ir::instruction* i: insert_locs) insert_barrier(i, builder); - } } } diff --git a/lib/codegen/transform/peephole.cc b/lib/codegen/transform/peephole.cc index d490e7bcc..ca67ecf5a 100644 --- a/lib/codegen/transform/peephole.cc +++ b/lib/codegen/transform/peephole.cc @@ -84,70 +84,6 @@ bool peephole::rewrite_trans_phi(ir::instruction* value, ir::builder& builder) { return true; } -bool peephole::rewrite_dot_hmma(ir::dot_inst *dot, ir::builder& builder, bool trans_a, bool trans_b, - ir::value *A, ir::value *B, ir::value *D){ - ir::value *AA = A; - ir::value *BB = B; - if(trans_a){ - AA = ((ir::trans_inst*)A)->get_operand(0); - } - else{ - if(auto *T 
= dynamic_cast(A)){ - std::vector perm(T->get_perm()); - std::swap(perm[0], perm[1]); - AA = builder.create_trans(T->get_operand(0), perm); - T->replace_all_uses_with(AA); - trans_a = true; - } - } - if(trans_b){ - BB = ((ir::trans_inst*)B)->get_operand(0); - } - else{ - if(auto *T = dynamic_cast(B)){ - std::vector perm(T->get_perm()); - std::swap(perm[0], perm[1]); - BB = builder.create_trans(T->get_operand(0), perm); - T->replace_all_uses_with(BB); - trans_b = true; - } - } - if(!trans_a && !trans_b) - return false; - - ir::instruction *dot_atbt = builder.insert(ir::dot_inst::create(AA, BB, D, trans_a, trans_b)); - dot->replace_all_uses_with(dot_atbt); - - return true; -} - -bool peephole::rewrite_dot_fp32(ir::dot_inst *dot, ir::builder& builder, bool trans_a, bool trans_b, - ir::value *A, ir::value *B, ir::value *D){ - // dot(op(a), trans(b)) - if(trans_b){ - ir::value* BB = ((ir::trans_inst*)B)->get_operand(0); - ir::instruction *NT = builder.insert(ir::dot_inst::create_nt(A, BB, D)); - dot->replace_all_uses_with(NT); - return true; - } - // dot(op(a), b) - if(!trans_b){ - // create permutations - size_t size = B->get_type()->get_tile_shapes().size(); - std::vector perm(size); - ir::type *int32_ty = ir::type::get_int32_ty(B->get_type()->get_context()); - for(size_t i = 0; i < size; i++) - perm[i] = ir::constant_int::get(int32_ty, i); - std::swap(perm[0], perm[1]); - // replace NN -> NT (trans) - ir::value* BB = builder.create_trans(B, perm); - ir::instruction *NT = builder.insert(ir::dot_inst::create_nt(A, BB, D)); - dot->replace_all_uses_with(NT); - return true; - } - return false; -} - bool peephole::rewrite_dot(ir::instruction *value, ir::builder& builder){ // dot(a, b, 0) + c -> dot(a, b, c) auto add = dynamic_cast(value); @@ -176,26 +112,6 @@ bool peephole::rewrite_dot(ir::instruction *value, ir::builder& builder){ add->replace_all_uses_with(new_dot); return true; } - - // dot(a, b, c) - auto dot = dynamic_cast(value); - if(!dot) - return false; - builder.set_insert_point(value); - ir::value *A = dot->get_operand(0); - ir::value *B = dot->get_operand(1); - ir::value *D = dot->get_operand(2); - bool trans_a = is_trans(A); - bool trans_b = is_trans(B); - // only consider dot-nn - if(dot->is_a_trans() || dot->is_b_trans()) - return false; - // hmma - if(is_hmma(dot)){ - return rewrite_dot_hmma(dot, builder, trans_a, trans_b, A, B, D); - } - else - return rewrite_dot_fp32(dot, builder, trans_a, trans_b, A, B, D); } bool peephole::rewrite_unit_red(ir::instruction *value, ir::builder& builder){ diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index d1b342e45..f628f9171 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -202,11 +202,11 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c // create passes codegen::transform::cts cts; codegen::analysis::align align; - codegen::analysis::liveness liveness; codegen::analysis::axes axes; codegen::analysis::layout layouts(&axes); codegen::transform::coalesce coalesce(&align, &layouts); codegen::analysis::tiles tiles(opt.num_warps, &align, &axes, &layouts); + codegen::analysis::liveness liveness(&tiles); codegen::analysis::allocation allocation(&liveness, &tiles); codegen::transform::membar barriers(&liveness, &allocation); codegen::transform::dce dce; @@ -235,12 +235,10 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c return std::unique_ptr(); barriers.run(module); dce.run(module); - dce.run(module); axes.run(module); layouts.run(module); align.run(module); tiles.run(module); 
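// Aside: the liveness rework in these patches groups values that must alias
// one shared-memory buffer (a double-buffered phi and its latch, a trans and
// its operand) by running connected components over an undirected graph, then
// sizes a single buffer_t per component. A standalone sketch of that grouping,
// with ints standing in for ir::value* -- hypothetical names, not the pass
// itself.
#include <iostream>
#include <map>
#include <set>
#include <vector>

using node_t  = int;
using graph_t = std::map<node_t, std::set<node_t>>;

void connected_components(node_t x, std::set<node_t>& unvisited,
                          const graph_t& graph, int group_id,
                          std::map<node_t, int>& group_of) {
  group_of[x] = group_id;
  if (unvisited.erase(x))  // first visit: recurse into unvisited neighbors
    if (auto it = graph.find(x); it != graph.end())
      for (node_t y : it->second)
        if (unvisited.count(y))
          connected_components(y, unvisited, graph, group_id, group_of);
}

int main() {
  // 0 <-> 1: double-buffered pair; 2 <-> 3: trans and operand; 4: standalone.
  graph_t g = {{0, {1}}, {1, {0}}, {2, {3}}, {3, {2}}};
  std::set<node_t> unvisited = {0, 1, 2, 3, 4};
  std::map<node_t, int> group_of;
  int next_group = 0;
  while (!unvisited.empty())
    connected_components(*unvisited.begin(), unvisited, g, next_group++, group_of);
  for (auto [v, grp] : group_of)
    std::cout << "value " << v << " -> buffer " << grp << "\n";
  return 0;
}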
-// ir::print(module, std::cout); selection.run(module, *llvm); // return binary std::unique_ptr res(driver::module::create(context, std::move(llvm))); diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index 74043d8e5..14384bbe8 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -79,10 +79,8 @@ int main() { // shapes to benchmark typedef std::tuple config_t; std::vector configs; - for(auto x: std::vector>{{false, false}, - {false, true}, - {true, false}, - {true, true}}){ + for(auto x: std::vector>{{false, true}, + {true, false}, {true, true}}){ std::vector tmp = { config_t{x[0], x[1], 4096, 4096, 4096} // config_t{x[0], x[1], 16, 2048, 2048}, diff --git a/tests/common/src/dot.h b/tests/common/src/dot.h index 2168b23b6..ff80ad6ae 100644 --- a/tests/common/src/dot.h +++ b/tests/common/src/dot.h @@ -4,15 +4,15 @@ namespace src { R"( #if AT == 1 #define USEA ^a -#define STRIDE_AK 1 -#define STRIDE_AM lda +#define STRIDE_AK lda +#define STRIDE_AM 1 #define BROADCAST_AK :, newaxis #define BROADCAST_AM newaxis, : #define SHAPE_A TK, TM #else #define USEA a -#define STRIDE_AK lda -#define STRIDE_AM 1 +#define STRIDE_AK 1 +#define STRIDE_AM lda #define BROADCAST_AK newaxis, : #define BROADCAST_AM :, newaxis #define SHAPE_A TM, TK @@ -20,15 +20,15 @@ R"( #if BT == 1 #define USEB ^b -#define STRIDE_BK ldb -#define STRIDE_BN 1 +#define STRIDE_BK 1 +#define STRIDE_BN ldb #define BROADCAST_BK newaxis, : #define BROADCAST_BN :, newaxis #define SHAPE_B TN, TK #else #define USEB b -#define STRIDE_BK 1 -#define STRIDE_BN ldb +#define STRIDE_BK ldb +#define STRIDE_BN 1 #define BROADCAST_BK :, newaxis #define BROADCAST_BN newaxis, : #define SHAPE_B TK, TN diff --git a/tests/unit/dot.cc b/tests/unit/dot.cc index bb75df10e..b440a1c07 100644 --- a/tests/unit/dot.cc +++ b/tests/unit/dot.cc @@ -139,8 +139,8 @@ int main() { // shapes to benchmark typedef std::tuple config_t; std::vector configs; - for(bool AT: std::array{false, true}) - for(bool BT: std::array{false, true}) + for(bool AT: std::array{false}) + for(bool BT: std::array{false}) for(int TM: std::vector{32, 64}) for(int TN: std::vector{32, 64}) for(int TK: std::vector{16, 32}) From ed1b2bc563030897da9aa272b6fe087202e8fccf Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 27 Sep 2019 22:15:30 -0400 Subject: [PATCH 421/494] more work on padding --- include/triton/codegen/analysis/liveness.h | 20 +++-- include/triton/codegen/selection.h | 6 +- include/triton/ir/builder.h | 2 +- include/triton/ir/instructions.h | 20 ++--- lib/codegen/analysis/align.cc | 3 - lib/codegen/analysis/allocation.cc | 47 +++++------ lib/codegen/analysis/axes.cc | 2 +- lib/codegen/analysis/liveness.cc | 98 ++++++++++++++++------ lib/codegen/analysis/tiles.cc | 16 ++-- lib/codegen/selection.cc | 70 +++++++--------- lib/codegen/transform/membar.cc | 4 +- lib/codegen/transform/peephole.cc | 35 +------- lib/driver/module.cc | 2 +- lib/ir/builder.cc | 2 +- lib/ir/instructions.cc | 21 +++-- lib/runtime/function.cc | 1 + tests/bench/dot.cc | 17 ++-- tests/common/src/dot.h | 6 +- tests/unit/dot.cc | 10 +-- 19 files changed, 191 insertions(+), 191 deletions(-) diff --git a/include/triton/codegen/analysis/liveness.h b/include/triton/codegen/analysis/liveness.h index 0f8aea7b4..3aef03a8d 100644 --- a/include/triton/codegen/analysis/liveness.h +++ b/include/triton/codegen/analysis/liveness.h @@ -49,7 +49,7 @@ struct buffer_t { class liveness { private: typedef std::map indices_map_t; - typedef std::map intervals_map_t; + typedef std::map intervals_map_t; typedef std::map 
has_storage_map_t; typedef ir::value* node_t; typedef std::map > graph_t; @@ -63,24 +63,26 @@ public: private: - void connected_components(node_t x, std::set &nodes, graph_t &graph, unsigned group_id); + void connected_components(node_t x, std::set &nodes, graph_t &graph, buffer_t *buffer); void extract_double_bufferable(ir::instruction *i); void extract_buffers(ir::instruction *i); void get_parents(ir::instruction *i, std::vector& res); void make_graph(ir::instruction *i); + bool do_pad(ir::value *x); public: liveness(tiles *t): tiles_(t){ } + // padding + unsigned get_pad(ir::value *v) const { return pad_.at(v); } // buffer size - unsigned is_ld_padded(ir::value *x); unsigned num_bytes(ir::value *x); // accessors const intervals_map_t& intervals() const { return intervals_; } - segment get_interval(buffer_t v) const { return intervals_.at(v); } + segment get_interval(buffer_t* v) const { return intervals_.at(v); } // buffers - buffer_t get_buffer(ir::value *v) const { return groups_.at(v); } - std::vector get_values(buffer_t x) const { return values_.at(x); } + buffer_t* get_buffer(ir::value *v) const { return groups_.at(v); } + std::vector get_values(buffer_t* x) const { return values_.at(x); } // double-buffering bool has_double(ir::value *x) const { return double_.find(x) != double_.end(); } double_buffer_info_t get_double(ir::value *x) const { return double_.at(x); } @@ -95,12 +97,14 @@ private: indices_map_t indices; intervals_map_t intervals_; std::map double_; + std::map pad_; std::map> parents_; // graph std::set nodes_; graph_t graph_; - std::map groups_; - std::map> values_; + std::vector buffers_; + std::map groups_; + std::map> values_; }; } diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index bc236ff22..29241f1c3 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -89,7 +89,7 @@ private: public: - shared_tile(Type* ty, const shapes_t &shapes, const std::vector &order, Value* ptr, Builder &builder, Value* offset = nullptr); + shared_tile(Type* ty, const shapes_t &shapes, const std::vector &order, Value* ptr, Builder &builder, Value* offset = nullptr, const std::vector& perm = {}); void set_vector_size(unsigned vector_size); void set_return_mode(bool return_vector); void set_value(indices_t, Value *); @@ -97,8 +97,9 @@ public: Value* get_value(indices_t idx); Value* get_pointer() { return ptr_; } Value* get_offset() { return offset_; } + const std::vector& get_perm() { return perm_; } const std::vector& get_order() { return order_; } - static Value* shared_offset(Builder& builder, const shapes_t& shapes, const std::vector& order, indices_t idx); + static Value* shared_offset(Builder& builder, const shapes_t& shapes, const std::vector& perm, const std::vector& order, indices_t idx); private: Value *ptr_; @@ -108,6 +109,7 @@ private: std::map ptr_cache_; unsigned vector_size_; std::vector order_; + std::vector perm_; }; // Distribtued tile diff --git a/include/triton/ir/builder.h b/include/triton/ir/builder.h index 5cf107be3..e254f6d38 100644 --- a/include/triton/ir/builder.h +++ b/include/triton/ir/builder.h @@ -135,7 +135,7 @@ public: value *create_atomic_exch(value *ptr, value *val, const std::string &name = ""); value *create_atomic_add(value *ptr, value *val, const std::string &name = ""); value *create_dot(value *A, value *B, value *C, const std::string &name = ""); - value *create_trans(value *A, const std::vector &perm = {}, const std::string &name = ""); + value *create_trans(value *A, const 
std::vector &perm = {}, const std::string &name = ""); value *create_sqrt(value *A, const std::string &name = ""); value *create_reduce(value *A, unsigned axis, const std::string &name = ""); value *create_select(value *pred, value *if_value, value *else_value, const std::string &name = ""); diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index bafc1c2c3..9298ccbe0 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -591,7 +591,7 @@ public: private: dot_inst(value *A, value *B, value *C, TransT AT, TransT BT, const std::string &name, instruction *next); - std::string repr_impl() const { return std::string("dot.") + ((AT_==NoTrans)?"n":"t") + ((BT_==NoTrans)?"n":"t"); } + std::string repr_impl() const { return "dot"; } public: static instruction *create(value *A, value *B, value *C, bool AT, bool BT, const std::string &name = "", instruction *next = nullptr); @@ -599,13 +599,7 @@ public: static instruction* create_nt(value *A, value *B, value *C, const std::string &name = "", instruction *next = nullptr); static instruction* create_tn(value *A, value *B, value *C, const std::string &name = "", instruction *next = nullptr); static instruction* create_tt(value *A, value *B, value *C, const std::string &name = "", instruction *next = nullptr); - bool is_a_trans() { return AT_ == Trans; } - bool is_b_trans() { return BT_ == Trans; } _TRITON_DEFINE_CLONE(dot_inst) - -private: - TransT AT_; - TransT BT_; }; //class outer_inst: public builtin_inst { @@ -617,20 +611,20 @@ private: class trans_inst: public builtin_inst { public: - ir::type* get_res_ty(ir::type* in, std::vector perm); - std::vector init_perm(ir::type* ty, const std::vector& perm); + ir::type* get_res_ty(ir::type* in, std::vector perm); + std::vector init_perm(ir::type* ty, const std::vector& perm); private: - trans_inst(value *arg, const std::vector& perm, const std::string& name, instruction* next); + trans_inst(value *arg, const std::vector& perm, const std::string& name, instruction* next); std::string repr_impl() const { return "trans"; } public: - static instruction* create(value *arg, const std::vector& perm = {}, const std::string &name = "", instruction *next = nullptr); - const std::vector get_perm() const; + static instruction* create(value *arg, const std::vector &perm = {}, const std::string &name = "", instruction *next = nullptr); + const std::vector get_perm() const; _TRITON_DEFINE_CLONE(trans_inst) private: - std::vector perm_; + std::vector perm_; }; class sqrt_inst: public builtin_inst { diff --git a/lib/codegen/analysis/align.cc b/lib/codegen/analysis/align.cc index e31799b59..ef57e7a4f 100644 --- a/lib/codegen/analysis/align.cc +++ b/lib/codegen/analysis/align.cc @@ -487,9 +487,6 @@ void align::populate(ir::value *v) { populate_is_constant(v); populate_starting_multiple(v); populate_max_contiguous(v); -// std::cout << v->get_name() << std::endl; -// if(max_contiguous_[v].size() == 2) -// std::cout << max_contiguous_[v][0] << " " << max_contiguous_[v][1] << std::endl; } void align::run(ir::module &mod) { diff --git a/lib/codegen/analysis/allocation.cc b/lib/codegen/analysis/allocation.cc index 21087e680..91ca0868f 100644 --- a/lib/codegen/analysis/allocation.cc +++ b/lib/codegen/analysis/allocation.cc @@ -21,22 +21,22 @@ void allocation::run(ir::module &mod) { using std::min; typedef std::multimap triples_map_type; - std::vector I; + std::vector I; for(auto x: liveness_->intervals()) I.push_back(x.first); - std::vector J = I; + std::vector J = I; 
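// --- annotation (not part of the patch) ------------------------------------
// allocation::run below packs shared-memory buffers in two steps: a linear
// scan that tentatively places each buffer whose liveness interval fits the
// current free segment, then first-fit coloring of an interference graph
// built from buffers whose offset ranges and live ranges both overlap.
// A minimal standalone sketch of the first-fit step; `first_fit`, `interf`
// and `color` are hypothetical names, not code from this patch, and `color`
// is assumed to be pre-sized to interf.size():
//
//   #include <algorithm>
//   #include <vector>
//   // interf[v] lists the neighbors of buffer v; returns the color count.
//   inline int first_fit(const std::vector<std::vector<int>>& interf,
//                        std::vector<int>& color) {
//     std::fill(color.begin(), color.end(), -1);
//     for (size_t v = 0; v < interf.size(); ++v) {
//       std::vector<bool> avail(interf.size(), true);
//       for (int u : interf[v])              // colors taken by neighbors
//         if (color[u] >= 0) avail[color[u]] = false;
//       color[v] = int(std::find(avail.begin(), avail.end(), true)
//                      - avail.begin());     // smallest free color
//     }
//     return interf.empty() ? 0
//            : *std::max_element(color.begin(), color.end()) + 1;
//   }
// ----------------------------------------------------------------------------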
triples_map_type H; H.insert({0, segment{0, INT_MAX}}); - std::vector V; - std::map starts; + std::vector V; + std::map starts; while(!J.empty()){ auto h_it = H.begin(); unsigned w = h_it->first; segment xh = h_it->second; H.erase(h_it); - auto j_it = std::find_if(J.begin(), J.end(), [&](buffer_t JJ){ + auto j_it = std::find_if(J.begin(), J.end(), [&](buffer_t* JJ){ segment xj = liveness_->get_interval(JJ); bool res = xj.intersect(xh); for(auto val: H) @@ -44,7 +44,7 @@ void allocation::run(ir::module &mod) { return res; }); if(j_it != J.end()){ - unsigned size = j_it->size; + unsigned size = (*j_it)->size; segment xj = liveness_->get_interval(*j_it); starts[*j_it] = w; H.insert({w + size, segment{max(xh.start, xj.start), min(xh.end, xj.end)}}); @@ -58,14 +58,14 @@ void allocation::run(ir::module &mod) { } // Build interference graph - std::map> interferences; - for(buffer_t x: V) - for(buffer_t y: V){ - if(x.id == y.id) + std::map> interferences; + for(buffer_t* x: V) + for(buffer_t* y: V){ + if(x->id == y->id) continue; unsigned X0 = starts[x], Y0 = starts[y]; - unsigned NX = x.size; - unsigned NY = y.size; + unsigned NX = x->size; + unsigned NY = y->size; segment XS = {X0, X0 + NX}; segment YS = {Y0, Y0 + NY}; if(liveness_->get_interval(x).intersect(liveness_->get_interval(y)) @@ -74,17 +74,17 @@ void allocation::run(ir::module &mod) { } // Initialize colors - std::map colors; - for(buffer_t X: V) - colors[X] = (X.id==V[0].id)?0:-1; + std::map colors; + for(buffer_t* X: V) + colors[X] = (X->id==V[0]->id)?0:-1; // First-fit graph coloring std::vector available(V.size()); - for(buffer_t x: V){ + for(buffer_t* x: V){ // Non-neighboring colors are available std::fill(available.begin(), available.end(), true); - for(buffer_t Y: interferences[x]){ + for(buffer_t* Y: interferences[x]){ int color = colors[Y]; if(color >= 0) available[color] = false; @@ -95,25 +95,24 @@ void allocation::run(ir::module &mod) { } // Finalize allocation - for(buffer_t x: V){ + for(buffer_t* x: V){ unsigned Adj = 0; - for(buffer_t y: interferences[x]) - Adj = std::max(Adj, starts[y] + y.size); + for(buffer_t* y: interferences[x]) + Adj = std::max(Adj, starts[y] + y->size); // create offsets for(ir::value *v: liveness_->get_values(x)){ offsets_[v] = starts[x] + colors[x] * Adj; if(liveness_->has_double(v)){ auto info = liveness_->get_double(v); - offsets_[info.latch] = offsets_[v] + x.size / 2; + offsets_[info.latch] = offsets_[v] + x->size / 2; } } } // Save maximum size of induced memory space allocated_size_ = 0; - for(auto &x: offsets_){ - allocated_size_ = std::max(allocated_size_, x.second + liveness_->get_buffer(x.first).size); - } + for(buffer_t* x: V) + allocated_size_ = std::max(allocated_size_, starts[x] + x->size); } } diff --git a/lib/codegen/analysis/axes.cc b/lib/codegen/analysis/axes.cc index 2c152f439..16614b8a7 100644 --- a/lib/codegen/analysis/axes.cc +++ b/lib/codegen/analysis/axes.cc @@ -74,7 +74,7 @@ void axes::update_graph_trans(ir::instruction *i) { auto perm = trans->get_perm(); // add edge between axis perm[d] and axis d for(unsigned d = 0; d < perm.size(); d++) - add_constraint({i, perm[d]->get_value()}, {op, d}); + add_constraint({i, perm[d]}, {op, d}); } void axes::update_graph_broadcast(ir::instruction *i) { diff --git a/lib/codegen/analysis/liveness.cc b/lib/codegen/analysis/liveness.cc index 13b456cae..ace03a07a 100644 --- a/lib/codegen/analysis/liveness.cc +++ b/lib/codegen/analysis/liveness.cc @@ -58,6 +58,18 @@ void liveness::make_graph(ir::instruction *i) { graph_[i].insert(latch); 
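// --- annotation (not part of the patch) ------------------------------------
// The graph built in make_graph links values that must alias in shared
// memory (double-buffered latches and, with this patch, phi nodes with their
// incoming values and trans instructions with their operand); each connected
// component then becomes a single buffer_t sized to its largest member.
// The do_pad() routine introduced below grows a tile's leading dimension by
// a few elements so consecutive rows no longer all map to the same shared-
// memory bank. A sketch of the resulting footprint, assuming 4-byte elements
// and the arithmetic used in num_bytes(); `padded_bytes` is an illustrative
// helper, not code from this patch:
//
//   // bytes of an ld x cols fp32 tile whose leading dimension gains `pad`
//   inline unsigned padded_bytes(unsigned ld, unsigned cols, unsigned pad) {
//     unsigned bytes = ld * cols * 4;    // unpadded footprint
//     return bytes + pad * bytes / ld;   // == (ld + pad) * cols * 4
//   }
//   // e.g. a 32x32 fp32 tile with pad = 4 occupies 36*32*4 = 4608 bytes,
//   // and a row stride of 36 words spreads column accesses across banks.
// ----------------------------------------------------------------------------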
graph_[latch].insert(i); } + if(i->get_id() == ir::INST_PHI){ + ir::phi_node* phi = (ir::phi_node*)i; + for(ir::value* op: phi->ops()){ + auto* iop = dynamic_cast(op); + if(!iop || storage_info.at(iop->get_id()).first != SHARED) + continue; + nodes_.insert(phi); + nodes_.insert(op); + graph_[phi].insert(op); + graph_[op].insert(phi); + } + } if(i->get_id() == ir::INST_TRANS){ nodes_.insert(i); nodes_.insert(i->get_operand(0)); @@ -67,39 +79,63 @@ void liveness::make_graph(ir::instruction *i) { } // connected components -void liveness::connected_components(node_t x, std::set &nodes, graph_t &graph, unsigned group_id) { - buffer_t buffer{group_id, num_bytes(x)}; +void liveness::connected_components(node_t x, std::set &nodes, graph_t &graph, buffer_t* buffer) { groups_[x] = buffer; values_[buffer].push_back(x); if(nodes.find(x) != nodes.end()){ nodes.erase(x); for(const node_t &y: graph[x]) - connected_components(y, nodes, graph, group_id); + connected_components(y, nodes, graph, buffer); } } -unsigned liveness::is_ld_padded(ir::value *x) { - if(auto *trans = dynamic_cast(x)){ - if(trans->get_perm()[0]->get_value() != 0) - return 4; +bool liveness::do_pad(ir::value *x) { + // alignment for matrix product + if(auto* dot = dynamic_cast(x)) { + auto order = tiles_->order(x); + // a + ir::value *a = dot->get_operand(0);\ + size_t previous_a = pad_[a]; + bool a_trans = dynamic_cast(a); + bool a_row = order[0] == 1; + if(tiles_->hmma(x) == HMMA_A_ROW) + pad_[a] = 16; + else if(tiles_->hmma(x) == HMMA_A_COL) + pad_[a] = 8; + else if(a_trans ^ a_row) + pad_[a] = 4; + else + pad_[a] = 0; + // b + ir::value *b = dot->get_operand(1); + size_t previous_b = pad_[b]; + bool b_trans = dynamic_cast(a); + bool b_col = order[0] == 0; + if(tiles_->hmma(x) == HMMA_B_COL) + pad_[b] = 16; + if(tiles_->hmma(x) == HMMA_B_ROW) + pad_[b] = 8; + if(b_trans ^ b_col) + pad_[b] = 4; + else + pad_[b] = 0; + return previous_a != pad_[a] || previous_b != pad_[b]; } - auto order = tiles_->order(x); - bool is_col_major = order[0] == 0; - if(tiles_->hmma(x) == HMMA_A_ROW) - return is_col_major ? 16 : 16; - if(tiles_->hmma(x) == HMMA_A_COL) - return is_col_major ? 8 : 8; - if(tiles_->hmma(x) == HMMA_B_COL) - return is_col_major ? 16 : 16; - if(tiles_->hmma(x) == HMMA_B_ROW) - return is_col_major ? 
8 : 8; + // padding for phi-nodes if(auto* phi = dynamic_cast(x)) { - unsigned result = 0; - for(unsigned i = 0; i < phi->get_num_incoming(); i++) - result = std::max(result, is_ld_padded(phi->get_incoming_value(i))); - return result; + bool has_changed = false; + for(unsigned i = 0; i < phi->get_num_incoming(); i++){ + ir::value* op = phi->get_operand(i); + size_t previous = pad_[op]; + pad_[op] = std::max(pad_[op], pad_[phi]); + has_changed |= previous != pad_[op]; + } + return has_changed; } - return 0; + // default -- no pading + size_t previous = pad_[x]; + pad_[x] = std::max(previous, 0); + return pad_[x] != previous; } unsigned liveness::num_bytes(ir::value *x) { @@ -120,7 +156,8 @@ unsigned liveness::num_bytes(ir::value *x) { return num_elements * num_bytes * depth; } unsigned num_bytes = x->get_type()->get_primitive_size_in_bits() / 8; - unsigned pad = is_ld_padded(x); + unsigned pad = pad_.at(x); + std::cout << x->get_name() << " " << pad << std::endl; if(pad > 0){ unsigned ld = x->get_type()->get_tile_shapes()[tiles_->order(x)[0]]; num_bytes += pad * num_bytes / ld; @@ -134,6 +171,7 @@ unsigned liveness::num_bytes(ir::value *x) { void liveness::run(ir::module &mod) { double_.clear(); indices.clear(); + pad_.clear(); intervals_.clear(); parents_.clear(); @@ -142,6 +180,15 @@ void liveness::run(ir::module &mod) { this->extract_double_bufferable(i); }); + // Padding information + bool has_changed; + do{ + has_changed = false; + ir::for_each_value(mod, [this, &has_changed](ir::value* v){ + has_changed |= this->do_pad(v); + }); + }while(has_changed); + // Create buffer dependency graph ir::for_each_instruction(mod, [this](ir::instruction* i) { this->make_graph(i); @@ -150,7 +197,10 @@ void liveness::run(ir::module &mod) { // connected components unsigned group_id = 0; while(!nodes_.empty()){ - connected_components(*nodes_.begin(), nodes_, graph_, group_id++); + buffer_t* buffer = new buffer_t{group_id++}; + connected_components(*nodes_.begin(), nodes_, graph_, buffer); + for(ir::value *v: values_.at(buffer)) + buffer->size = std::max(buffer->size, num_bytes(v)); } // Assigns index to each instruction diff --git a/lib/codegen/analysis/tiles.cc b/lib/codegen/analysis/tiles.cc index 7f19df276..0bd317f8f 100644 --- a/lib/codegen/analysis/tiles.cc +++ b/lib/codegen/analysis/tiles.cc @@ -40,7 +40,7 @@ bool is_hmma_a_col(ir::value* v) { for(ir::user *u: v->get_users()) if(is_hmma_c(u)){ ir::dot_inst* dot = (ir::dot_inst*)u; - if((v == dot->get_operand(0)) && !dot->is_a_trans()) + if((v == dot->get_operand(0))) return true; } } @@ -49,7 +49,7 @@ bool is_hmma_a_row(ir::value* v) { for(ir::user *u: v->get_users()) if(is_hmma_c(u)){ ir::dot_inst* dot = (ir::dot_inst*)u; - if((v == dot->get_operand(0)) && dot->is_a_trans()) + if((v == dot->get_operand(0))) return true; } } @@ -58,7 +58,7 @@ bool is_hmma_b_col(ir::value* v) { for(ir::user *u: v->get_users()) if(is_hmma_c(u)){ ir::dot_inst* dot = (ir::dot_inst*)u; - if((v == dot->get_operand(1)) && !dot->is_b_trans()) + if((v == dot->get_operand(1))) return true; } } @@ -67,7 +67,7 @@ bool is_hmma_b_row(ir::value* v) { for(ir::user *u: v->get_users()) if(is_hmma_c(u)){ ir::dot_inst* dot = (ir::dot_inst*)u; - if((v == dot->get_operand(1)) && dot->is_b_trans()) + if((v == dot->get_operand(1))) return true; } } @@ -170,6 +170,7 @@ void tiles::init_scanline_tile(ir::value *i) { unsigned effective_num_threads = 1; for(size_t d = 0; d < shapes.size(); d++) effective_num_threads *= mts_[axes_->get_id(i, d)]; +// std::cout << num_threads << " " << 
effective_num_threads << std::endl; if(num_threads != effective_num_threads) throw std::runtime_error("cannot create a kernel with this amount of warps"); } @@ -219,7 +220,7 @@ void tiles::run(ir::module &) { largest_[i] = *std::max_element(values.begin(), values.end(), cmp); } - // find out the order of a group + // find out the layout ordering of a group for(size_t i = 0; i < num_groups; i++){ std::set io; for(ir::value* v: layout_->values(i)) @@ -239,11 +240,6 @@ void tiles::run(ir::module &) { order_[i] = order; } for(size_t i = 0; i < num_groups; i++){ - bool is_hmma_op = hmma_[i] == HMMA_A_COL || hmma_[i] == HMMA_A_ROW || - hmma_[i] == HMMA_B_COL || hmma_[i] == HMMA_B_ROW; - if(!is_hmma_op) - continue; - // extract copies to shared memory std::vector cts; for(ir::value* v: layout_->values(i)) if(auto *x = dynamic_cast(v)) diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index 60facdae4..c355f9d2f 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -146,26 +146,30 @@ void shared_tile::extract_constant(const indices_t &arg_idx, indices_t &non_cst_ } -Value* shared_tile::shared_offset(llvm::IRBuilder<> &builder, const shapes_t& shapes, const std::vector& order, indices_t idx) { +Value* shared_tile::shared_offset(llvm::IRBuilder<> &builder, const shapes_t& shapes, const std::vector& perm, const std::vector& order, indices_t idx) { + // strides + std::vector strides(order.size()); + strides[order[0]] = builder.getInt32(1); + for(size_t i = 1; i < idx.size(); i++) + strides[order[i]] = builder.CreateMul(strides[order[i-1]], builder.getInt32(shapes[order[i-1]])); + // result Value *result = builder.getInt32(0); - result = builder.CreateAdd(result, idx[order[0]]); - Value *ld = builder.getInt32(shapes[order[0]]); - for(size_t i = 1; i < idx.size(); i++) { - result = builder.CreateAdd(result, builder.CreateMul(idx[order[i]], ld)); - if(i < idx.size() - 1){ - ld = builder.CreateMul(ld, builder.getInt32(shapes[order[i]])); - } - } + for(size_t i = 0; i < strides.size(); i++) + result = builder.CreateAdd(result, builder.CreateMul(idx[perm[i]], strides[i])); return result; } -shared_tile::shared_tile(Type *ty, const shapes_t &shapes, const std::vector& order, Value *ptr, llvm::IRBuilder<> &builder, Value *offset): - tile(ty, shapes), order_(order), ptr_(ptr), builder_(builder), offset_(offset), vector_size_(1){ +shared_tile::shared_tile(Type *ty, const shapes_t &shapes, const std::vector& order, Value *ptr, llvm::IRBuilder<> &builder, Value *offset, const std::vector& perm): + tile(ty, shapes), order_(order), ptr_(ptr), builder_(builder), offset_(offset), vector_size_(1), perm_(perm){ return_vector_ = false; + if(perm_.empty()){ + perm_.resize(shapes.size()); + std::iota(perm_.begin(), perm_.end(), 0); + } } void shared_tile::set_value(indices_t idx, Value *value) { - Value *ptr = builder_.CreateGEP(ptr_, shared_offset(builder_, shapes_, order_, idx)); + Value *ptr = builder_.CreateGEP(ptr_, shared_offset(builder_, shapes_, perm_, order_, idx)); unsigned addr_space = ptr->getType()->getPointerAddressSpace(); ptr = builder_.CreateBitCast(ptr, value->getType()->getPointerTo(addr_space)); builder_.CreateStore(value, ptr); @@ -196,7 +200,7 @@ Value* shared_tile::get_value(indices_t idx) { // if(isa(non_cst_idx.front())){ // builder_.SetInsertPoint((Instruction*)non_cst_idx.front()); // } - base_ptr = builder_.CreateGEP(ptr_, shared_offset(builder_, shapes_, order_, non_cst_idx)); + base_ptr = builder_.CreateGEP(ptr_, shared_offset(builder_, shapes_, perm_, order_, 
non_cst_idx)); if(vector_size_ > 1){ Type *vec_ty = VectorType::get(ty, vector_size); Type *vec_ptr_ty = PointerType::get(vec_ty, base_ptr->getType()->getPointerAddressSpace()); @@ -204,7 +208,7 @@ Value* shared_tile::get_value(indices_t idx) { } // builder_.SetInsertPoint(store); } - Value *offset = shared_offset(builder_, shapes_, order_, cst_idx); + Value *offset = shared_offset(builder_, shapes_, perm_, order_, cst_idx); Value *div = offset; if(vector_size_ > 1) div = builder_.CreateUDiv(offset, builder_.getInt32(vector_size_)); @@ -725,7 +729,7 @@ void selection::create_shared_tile(ir::value *v, IRBuilder<> &builder, Value *sh return; auto order = tiles_->order(v); auto shapes = v->get_type()->get_tile_shapes(); - unsigned pad = liveness_->is_ld_padded(v); + unsigned pad = liveness_->get_pad(v); if(pad > 0) shapes[order[0]] += pad; Type* ty = llvm_type(v->get_type()->get_scalar_ty(), builder.getContext()); @@ -923,7 +927,7 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, write_idx.insert(write_idx.begin() + axis, lane); // shared memory write pointer - Value *write_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), op_tile->get_order(), write_idx); + Value *write_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), {0, 1}, op_tile->get_order(), write_idx); Value *write_ptr = builder.CreateGEP(base_ptr, write_offset); // initialize shared memory @@ -936,7 +940,7 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, indices_t current(write_idx.size(), builder.getInt32(0)); current[axis] = builder.getInt32(i); // shared memory offset - Value *read_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), op_tile->get_order(), current); + Value *read_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), {0, 1}, op_tile->get_order(), current); Value *is_active = builder.CreateICmpULT(lane, builder.getInt32(i)); read_offset = builder.CreateSelect(is_active, read_offset, builder.getInt32(0)); // shared memory read pointer @@ -952,7 +956,7 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, // result is on the first lane of shared memory indices_t final = write_idx; final[axis] = builder.getInt32(0); - Value *read_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), op_tile->get_order(), final); + Value *read_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), {0, 1}, op_tile->get_order(), final); Value *read_ptr = builder.CreateGEP(base_ptr, read_offset); tgt_->add_barrier(module, builder); result = builder.CreateLoad(read_ptr); @@ -1041,11 +1045,7 @@ void selection::lower_copy_to_shared(ir::copy_to_shared_inst *x, LLVMContext &ct void selection::lower_trans(ir::trans_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { shared_tile* in = (shared_tile*)tmap_.at(x->get_operand(0)); - auto in_order = in->get_order(); - std::vector order; - for(auto p: x->get_perm()) - order.push_back(in_order[p->get_value()]); - shared_tile* out = new shared_tile(in->get_ty(), in->get_shapes(), order, in->get_pointer(), builder, in->get_offset()); + shared_tile* out = new shared_tile(in->get_ty(), in->get_shapes(), in->get_order(), in->get_pointer(), builder, in->get_offset(), x->get_perm()); tmap_[x] = out; } @@ -1082,8 +1082,8 @@ void selection::lower_hmma_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn auto ord_a = tiles_->order(dot->get_operand(0)); auto ord_b = tiles_->order(dot->get_operand(1)); - bool 
is_a_row = dot->is_a_trans() ^ ord_a[ord_a.size() - 2] == 1; - bool is_b_row = dot->is_b_trans() ^ ord_b[ord_b.size() - 2] == 1; + bool is_a_row = ord_a[ord_a.size() - 2] == 1; + bool is_b_row = ord_b[ord_b.size() - 2] == 1; if(is_a_row){ offset_a_i = builder.CreateAdd(offset_a_i, builder.CreateURem(u_thread_id, builder.getInt32(4))); @@ -1125,10 +1125,6 @@ void selection::lower_hmma_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn Value *current_offset_b_i = builder.CreateAdd(offset_b_j, builder.getInt32(pack_j*stride_rep_j*pack_size_1_)); indices_t idx_a = {current_offset_a_i, builder.CreateAdd(offset_a_k, _K)}; indices_t idx_b = {current_offset_b_i, builder.CreateAdd(offset_b_k, _K)}; - if(dot->is_a_trans()) - std::swap(idx_a[0], idx_a[1]); - if(!dot->is_b_trans()) - std::swap(idx_b[0], idx_b[1]); idx_a.insert(idx_a.end(), x.first.begin(), x.first.end()); idx_b.insert(idx_b.end(), x.first.begin(), x.first.end()); Value *ha = TA->get_value(idx_a); @@ -1188,10 +1184,6 @@ void selection::lower_scanline_dot(ir::dot_inst *dot, LLVMContext &ctx, Function // input indices indices_t a_idx = {idx[0], builder.getInt32(K)}; indices_t b_idx = {builder.getInt32(K), idx[1]}; - if(dot->is_a_trans()) - std::swap(a_idx[0], a_idx[1]); - if(dot->is_b_trans()) - std::swap(b_idx[0], b_idx[1]); // add batching dimension for(size_t i = 2; i < idx.size(); i++){ a_idx.insert(a_idx.end(), idx[i]); @@ -1217,10 +1209,8 @@ void selection::lower_outer_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *f Value *res = TD->get_value(idx); indices_t a_idx = {idx[0], builder.getInt32(0)}; indices_t b_idx = {builder.getInt32(0), idx[1]}; - if(dot->is_a_trans()) - std::swap(a_idx[0], a_idx[1]); - if(dot->is_b_trans()) - std::swap(b_idx[0], b_idx[1]); + std::swap(a_idx[0], a_idx[1]); + std::swap(b_idx[0], b_idx[1]); Value *a = TA->get_value(a_idx); Value *b = TB->get_value(b_idx); if(a->getType() != c_ty) @@ -1243,7 +1233,7 @@ void selection::lower_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn, IRB Type *c_ty = llvm_type(D->get_type()->get_scalar_ty(), ctx); Function *f_mul_add = Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {c_ty}); auto A_shapes = A->get_type()->get_tile_shapes(); - size_t red_axis = dot->is_a_trans() ? 
0 : 1; + size_t red_axis = 1; unsigned NK = A_shapes[red_axis]; if(NK != 1) { @@ -1552,8 +1542,8 @@ void selection::run(ir::module &src, Module &dst) { offset->addIncoming(next_offset, llvm_inc_block); } else { - unsigned num_bytes = phi->get_type()->get_scalar_ty()->get_primitive_size_in_bits() / 8; - offset->addIncoming(dst_builder.getInt32(liveness_->num_bytes(phi)/(num_bytes)), llvm_inc_block); + unsigned num_bytes = inst->get_type()->get_scalar_ty()->get_primitive_size_in_bits() / 8; + offset->addIncoming(dst_builder.getInt32(liveness_->get_buffer(inst)->size / (2*num_bytes)), llvm_inc_block); } ptr->addIncoming(inc_shared->get_pointer(), llvm_inc_block); } diff --git a/lib/codegen/transform/membar.cc b/lib/codegen/transform/membar.cc index aee19110f..ee5821da4 100644 --- a/lib/codegen/transform/membar.cc +++ b/lib/codegen/transform/membar.cc @@ -38,8 +38,8 @@ void membar::add_reference(ir::value *v, interval_vec_t &res){ return; if(alloc_->has_offset(v)){ unsigned offset = alloc_->offset(v); - unsigned num_bytes = liveness_->num_bytes(v); - res.push_back(interval_t(offset, offset + num_bytes)); + unsigned size = liveness_->get_buffer(v)->size; + res.push_back(interval_t(offset, offset + size)); } } diff --git a/lib/codegen/transform/peephole.cc b/lib/codegen/transform/peephole.cc index ca67ecf5a..73b8ff27f 100644 --- a/lib/codegen/transform/peephole.cc +++ b/lib/codegen/transform/peephole.cc @@ -8,37 +8,8 @@ namespace codegen{ namespace transform{ -inline bool is_trans(ir::value *v){ - auto *x = dynamic_cast(v); - if(!x) - return false; - std::vector perm = x->get_perm(); - std::vector ref; - ir::type *int32_ty = ir::type::get_int32_ty(v->get_type()->get_context()); - for(size_t i = 0; i < perm.size(); i++) - ref.push_back(ir::constant_int::get(int32_ty, i)); - std::swap(ref[0], ref[1]); - // true is perm == ref - return std::equal(perm.begin(), perm.end(), ref.begin()); -} - -inline bool is_hmma(ir::value *v){ - bool result = false; - if(auto *x = dynamic_cast(v)){ - ir::value *a = x->get_operand(0); - ir::type *a_ty = a->get_type(); - ir::value *b = x->get_operand(1); - ir::type *b_ty = b->get_type(); - // inputs have to be FP16 - result = a_ty->get_scalar_ty()->is_half_ty() && b_ty->get_scalar_ty()->is_half_ty(); -// reduction has to be multiple of 4 -// result = result && ((a_ty->get_tile_shapes()[1]->get_value() % 4) == 0); - } - return result; -} - ir::value* rewrite_trans_phi_impl(ir::value *value, ir::builder &builder, - const std::vector& perm) { + const std::vector& perm) { if(auto phi = dynamic_cast(value)) { // transpose operands std::vector incs; @@ -106,9 +77,7 @@ bool peephole::rewrite_dot(ir::instruction *value, ir::builder& builder){ ir::value *a = dot->get_operand(0); ir::value *b = dot->get_operand(1); builder.set_insert_point(add); - ir::value * new_dot = builder.insert(ir::dot_inst::create(a, b, other, - dot->is_a_trans(), dot->is_b_trans(), - dot->get_name())); + ir::value * new_dot = builder.insert(ir::dot_inst::create_nn(a, b, other, dot->get_name())); add->replace_all_uses_with(new_dot); return true; } diff --git a/lib/driver/module.cc b/lib/driver/module.cc index f29c830f4..30881d087 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -241,7 +241,7 @@ std::string cu_module::compile_llvm_module(std::unique_ptr module, cu_module::cu_module(driver::context * context, std::unique_ptr ll_module): cu_module(context, compile_llvm_module(std::move(ll_module), context->device())) { } cu_module::cu_module(driver::context * context, std::string const & 
source) : module(context, CUmodule(), true), source_(source){ -// std::cout << source << std::endl; + std::cout << source << std::endl; cu_context::context_switcher ctx(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/lib/ir/builder.cc b/lib/ir/builder.cc index db2080a4d..caf22348f 100644 --- a/lib/ir/builder.cc +++ b/lib/ir/builder.cc @@ -322,7 +322,7 @@ value *builder::create_dot(value *A, value *B, value *C, const std::string &name return insert(dot_inst::create_nn(A, B, C, name)); } -value *builder::create_trans(value *A, const std::vector& perm, const std::string &name) { +value *builder::create_trans(value *A, const std::vector& perm, const std::string &name) { return insert(trans_inst::create(A, perm, name)); } diff --git a/lib/ir/instructions.cc b/lib/ir/instructions.cc index e89367536..4fdfa797d 100644 --- a/lib/ir/instructions.cc +++ b/lib/ir/instructions.cc @@ -536,7 +536,7 @@ instruction* downcast_inst::create(value *arg, const std::string &name, instruct dot_inst::dot_inst(value *A, value *B, value *C, TransT AT, TransT BT, const std::string &name, instruction *next) - : builtin_inst(C->get_type(), INST_DOT, 3, name, next), AT_(AT), BT_(BT) { + : builtin_inst(C->get_type(), INST_DOT, 3, name, next) { set_operand(0, A); set_operand(1, B); set_operand(2, C); @@ -574,31 +574,30 @@ instruction *dot_inst::create_tt(value *A, value *B, value *C, // trans instructions //===----------------------------------------------------------------------===// -ir::type* trans_inst::get_res_ty(ir::type* ty, std::vector perm) { +ir::type* trans_inst::get_res_ty(ir::type* ty, std::vector perm) { // get argument shapes ir::tile_type::tile_shapes_t arg_shapes = ty->get_tile_shapes(); // permutate argument shapes perm = init_perm(ty, perm); ir::tile_type::tile_shapes_t res_shapes = arg_shapes; for(size_t i = 0; i < perm.size(); i++) - res_shapes[i] = arg_shapes[perm[i]->get_value()]; + res_shapes[i] = arg_shapes[perm[i]]; // construct type return tile_type::get(ty->get_scalar_ty(), res_shapes); } -std::vector trans_inst::init_perm(ir::type* ty, const std::vector& perm) { +std::vector trans_inst::init_perm(ir::type* ty, const std::vector& perm) { if(!perm.empty()) return perm; auto size = ty->get_tile_shapes().size(); - ir::type* int32_ty = type::get_int32_ty(ty->get_context()); - std::vector result; - result.push_back(ir::constant_int::get(int32_ty, size - 1)); + std::vector result; + result.push_back(size - 1); for(size_t i = 0; i < size - 1; i++) - result.push_back(ir::constant_int::get(int32_ty, i)); + result.push_back(i); return result; } -trans_inst::trans_inst(value *arg, const std::vector& perm, const std::string &name, instruction *next) +trans_inst::trans_inst(value *arg, const std::vector &perm, const std::string &name, instruction *next) : builtin_inst(get_res_ty(arg->get_type(), perm), INST_TRANS, 1, name, next) { // sanity check perm_ = init_perm(arg->get_type(), perm); @@ -607,11 +606,11 @@ trans_inst::trans_inst(value *arg, const std::vector& perm, const set_operand(0, arg); } -instruction* trans_inst::create(value *arg, const std::vector &perm, const std::string &name, instruction *next) { +instruction* trans_inst::create(value *arg, const std::vector &perm, const std::string &name, instruction *next) { return new trans_inst(arg, perm, name, next); } -const std::vector trans_inst::get_perm() const { +const std::vector trans_inst::get_perm() const { return perm_; } diff --git a/lib/runtime/function.cc 
b/lib/runtime/function.cc index f628f9171..e9f5f8921 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -229,6 +229,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c reassociate.run(module); dce.run(module); cts.run(module); +// ir::print(module, std::cout); liveness.run(module); allocation.run(module); if(allocation.allocated_size() > context->device()->max_shared_memory()) diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index 14384bbe8..45541e247 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -27,9 +27,9 @@ inline rt::function::grid_fn_ty grid2d(size_t M, size_t N) { std::vector do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K){ - typedef half_float::half NumericT; - std::string ty = "half"; - cublasDataType_t cuty = CUDA_R_16F; + typedef float NumericT; + std::string ty = "float"; + cublasDataType_t cuty = CUDA_R_32F; size_t dt_nbytes = sizeof(NumericT); drv::context* context = stream->context(); // leading dimensions @@ -45,9 +45,9 @@ std::vector do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, i opt.defines.push_back({"TYPE", {ty}}); opt.defines.push_back({"AT", {AT?"1":"0"}}); opt.defines.push_back({"BT", {BT?"1":"0"}}); - opt.defines.push_back({"TM", {"128"}}); - opt.defines.push_back({"TN", {"128"}}); - opt.defines.push_back({"TK", {"16"}}); + opt.defines.push_back({"TM", {"64"}}); + opt.defines.push_back({"TN", {"64"}}); + opt.defines.push_back({"TK", {"8"}}); opt.num_warps = {4}; // create function rt::function function(src::dot, opt); @@ -79,10 +79,9 @@ int main() { // shapes to benchmark typedef std::tuple config_t; std::vector configs; - for(auto x: std::vector>{{false, true}, - {true, false}, {true, true}}){ + for(auto x: std::vector>{{false, false}}){ std::vector tmp = { - config_t{x[0], x[1], 4096, 4096, 4096} + config_t{x[0], x[1], 2048, 2048, 2048} // config_t{x[0], x[1], 16, 2048, 2048}, // config_t{x[0], x[1], 32, 2048, 2048}, // config_t{x[0], x[1], 64, 2048, 2048}, diff --git a/tests/common/src/dot.h b/tests/common/src/dot.h index ff80ad6ae..2cc3fa290 100644 --- a/tests/common/src/dot.h +++ b/tests/common/src/dot.h @@ -54,12 +54,12 @@ void dot(TYPE * A, TYPE * B, TYPE * C, TYPE a[SHAPE_A] = *pa; TYPE b[SHAPE_B] = *pb; // reduction loop - for(int k = K; k > 0; k-= TK){ + for(int k = K; k > TK; k-= TK){ c += USEA @ USEB; pa = pa + TK * STRIDE_AK; pb = pb + TK * STRIDE_BK; - a = ((bool[SHAPE_A])(k > TK)) ? *pa : 0; - b = ((bool[SHAPE_B])(k > TK)) ? *pb : 0; + a = *pa; + b = *pb; } // epilogue int rxc[TM] = ridx * TM + 0 ... 
TM; diff --git a/tests/unit/dot.cc b/tests/unit/dot.cc index b440a1c07..9b3ee06f0 100644 --- a/tests/unit/dot.cc +++ b/tests/unit/dot.cc @@ -139,13 +139,13 @@ int main() { // shapes to benchmark typedef std::tuple config_t; std::vector configs; - for(bool AT: std::array{false}) - for(bool BT: std::array{false}) for(int TM: std::vector{32, 64}) for(int TN: std::vector{32, 64}) - for(int TK: std::vector{16, 32}) - for(int nwarps: std::vector{1, 2, 4, 8}){ - configs.push_back(config_t{HALF, AT, BT, 128, 128, 128, TM, TN, TK, nwarps}); + for(int TK: std::vector{8}) + for(int nwarps: std::vector{1, 2, 4, 8}) + for(bool AT: std::array{false, true}) + for(bool BT: std::array{false, true}){ + configs.push_back(config_t{FLOAT, AT, BT, 128, 128, 128, TM, TN, TK, nwarps}); } // does the work dtype_t dtype; From 86a3e5d89750146724cbae1705e16322d6aa84ec Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 1 Oct 2019 16:57:59 -0400 Subject: [PATCH 422/494] [codegen] now matrix-multiplication is bank-conflict free for all layouts --- include/triton/codegen/analysis/tiles.h | 1 + lib/codegen/analysis/liveness.cc | 69 ++++++++++++++----------- lib/codegen/analysis/tiles.cc | 32 +++++++++--- lib/driver/module.cc | 1 - tests/bench/dot.cc | 7 +-- tests/common/src/dot.h | 2 +- 6 files changed, 71 insertions(+), 41 deletions(-) diff --git a/include/triton/codegen/analysis/tiles.h b/include/triton/codegen/analysis/tiles.h index 87705d132..6fe964738 100644 --- a/include/triton/codegen/analysis/tiles.h +++ b/include/triton/codegen/analysis/tiles.h @@ -39,6 +39,7 @@ class tiles { private: void init_hmma_tile(ir::value *i); void init_scanline_tile(ir::value *i); + bool is_trans(ir::value *i); public: tiles(size_t num_warps, analysis::align* align, analysis::axes* axes, analysis::layout* layout); diff --git a/lib/codegen/analysis/liveness.cc b/lib/codegen/analysis/liveness.cc index ace03a07a..b1f75f03c 100644 --- a/lib/codegen/analysis/liveness.cc +++ b/lib/codegen/analysis/liveness.cc @@ -91,36 +91,46 @@ void liveness::connected_components(node_t x, std::set &nodes, graph_t & bool liveness::do_pad(ir::value *x) { // alignment for matrix product - if(auto* dot = dynamic_cast(x)) { - auto order = tiles_->order(x); - // a - ir::value *a = dot->get_operand(0);\ - size_t previous_a = pad_[a]; - bool a_trans = dynamic_cast(a); - bool a_row = order[0] == 1; - if(tiles_->hmma(x) == HMMA_A_ROW) - pad_[a] = 16; - else if(tiles_->hmma(x) == HMMA_A_COL) - pad_[a] = 8; - else if(a_trans ^ a_row) - pad_[a] = 4; - else - pad_[a] = 0; - // b - ir::value *b = dot->get_operand(1); - size_t previous_b = pad_[b]; - bool b_trans = dynamic_cast(a); - bool b_col = order[0] == 0; - if(tiles_->hmma(x) == HMMA_B_COL) - pad_[b] = 16; - if(tiles_->hmma(x) == HMMA_B_ROW) - pad_[b] = 8; - if(b_trans ^ b_col) - pad_[b] = 4; - else - pad_[b] = 0; - return previous_a != pad_[a] || previous_b != pad_[b]; +// if(auto* dot = dynamic_cast(x)) { +// auto order = tiles_->order(x); +// // a +// ir::value *a = dot->get_operand(0);\ +// size_t previous_a = pad_[a]; +// bool a_trans = dynamic_cast(a); +// bool a_row = order[0] == 0; +// if(tiles_->hmma(x) == HMMA_A_ROW) +// pad_[a] = 16; +// else if(tiles_->hmma(x) == HMMA_A_COL) +// pad_[a] = 8; +// else if(a_trans ^ a_row) +// pad_[a] = 4; +// else +// pad_[a] = 0; +// // b +// ir::value *b = dot->get_operand(1); +// size_t previous_b = pad_[b]; +// bool b_trans = dynamic_cast(b); +// bool b_col = order[0] == 0; +// if(tiles_->hmma(x) == HMMA_B_COL) +// pad_[b] = 16; +// if(tiles_->hmma(x) == 
HMMA_B_ROW) +// pad_[b] = 8; +// if(b_trans ^ b_col) +// pad_[b] = 4; +// else +// pad_[b] = 0; +// return previous_a != pad_[a] || previous_b != pad_[b]; +// } + if(auto* cts = dynamic_cast(x)) { + auto cts_order = tiles_->order(cts); + ir::value *arg = cts->get_operand(0); + auto arg_order = tiles_->order(arg); + if(cts_order != arg_order) + pad_[cts] = 4; } +// if(auto* tr = dynamic_cast(x)) { +// pad_[tr] = 4; +// } // padding for phi-nodes if(auto* phi = dynamic_cast(x)) { bool has_changed = false; @@ -157,7 +167,6 @@ unsigned liveness::num_bytes(ir::value *x) { } unsigned num_bytes = x->get_type()->get_primitive_size_in_bits() / 8; unsigned pad = pad_.at(x); - std::cout << x->get_name() << " " << pad << std::endl; if(pad > 0){ unsigned ld = x->get_type()->get_tile_shapes()[tiles_->order(x)[0]]; num_bytes += pad * num_bytes / ld; diff --git a/lib/codegen/analysis/tiles.cc b/lib/codegen/analysis/tiles.cc index 0bd317f8f..13d3fbd13 100644 --- a/lib/codegen/analysis/tiles.cc +++ b/lib/codegen/analysis/tiles.cc @@ -184,6 +184,20 @@ void extract_io_use(ir::value *v, std::set& result) { } +bool tiles::is_trans(ir::value *v) { + if(dynamic_cast(v)) { + return true; + } + if(auto *phi = dynamic_cast(v)) { + bool result = true; + for(ir::value *op: phi->ops()) + result = result && is_trans(op); + return result; + } + return false; +} + + void tiles::run(ir::module &) { hmma_.clear(); largest_.clear(); @@ -220,6 +234,7 @@ void tiles::run(ir::module &) { largest_[i] = *std::max_element(values.begin(), values.end(), cmp); } + // find out the layout ordering of a group for(size_t i = 0; i < num_groups; i++){ std::set io; @@ -240,13 +255,18 @@ void tiles::run(ir::module &) { order_[i] = order; } for(size_t i = 0; i < num_groups; i++){ - std::vector cts; + std::vector dots; for(ir::value* v: layout_->values(i)) - if(auto *x = dynamic_cast(v)) - cts.push_back(x); - if(cts.empty()) - continue; - order_[i] = order(cts[0]->get_operand(0)); + if(auto *x = dynamic_cast(v)) + dots.push_back(x); + for(ir::dot_inst* dot: dots){ + ir::value* a = dot->get_operand(0); + ir::value* b = dot->get_operand(1); + std::vector col = {0, 1}; + std::vector row = {1, 0}; + order_[layout_->id(a)] = is_trans(a) ? row : col; + order_[layout_->id(b)] = is_trans(b) ? 
col : row; + } } // tiling parameters for(auto x: largest_){ diff --git a/lib/driver/module.cc b/lib/driver/module.cc index 30881d087..e300a75f2 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -241,7 +241,6 @@ std::string cu_module::compile_llvm_module(std::unique_ptr module, cu_module::cu_module(driver::context * context, std::unique_ptr ll_module): cu_module(context, compile_llvm_module(std::move(ll_module), context->device())) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ - std::cout << source << std::endl; cu_context::context_switcher ctx(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index 45541e247..a5a6f559e 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -45,8 +45,8 @@ std::vector do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, i opt.defines.push_back({"TYPE", {ty}}); opt.defines.push_back({"AT", {AT?"1":"0"}}); opt.defines.push_back({"BT", {BT?"1":"0"}}); - opt.defines.push_back({"TM", {"64"}}); - opt.defines.push_back({"TN", {"64"}}); + opt.defines.push_back({"TM", {"128"}}); + opt.defines.push_back({"TN", {"128"}}); opt.defines.push_back({"TK", {"8"}}); opt.num_warps = {4}; // create function @@ -79,7 +79,8 @@ int main() { // shapes to benchmark typedef std::tuple config_t; std::vector configs; - for(auto x: std::vector>{{false, false}}){ + for(auto x: std::vector>{{false, false}, {false, true}, + {true, false}, {true, true}}){ std::vector tmp = { config_t{x[0], x[1], 2048, 2048, 2048} // config_t{x[0], x[1], 16, 2048, 2048}, diff --git a/tests/common/src/dot.h b/tests/common/src/dot.h index 2cc3fa290..c3c64d6b1 100644 --- a/tests/common/src/dot.h +++ b/tests/common/src/dot.h @@ -54,7 +54,7 @@ void dot(TYPE * A, TYPE * B, TYPE * C, TYPE a[SHAPE_A] = *pa; TYPE b[SHAPE_B] = *pb; // reduction loop - for(int k = K; k > TK; k-= TK){ + for(int k = K; k > 0; k-= TK){ c += USEA @ USEB; pa = pa + TK * STRIDE_AK; pb = pb + TK * STRIDE_BK; From adbc56d10a7071292f4ccb4395b96f95376ec53e Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 2 Oct 2019 14:26:09 -0400 Subject: [PATCH 423/494] [tests] delete redundant code in dot benchmark and unit tests --- tests/bench/dot.cc | 80 ++--------------- tests/common/dot.h | 191 +++++++++++++++++++++++++++++++++++++++++ tests/common/src/dot.h | 32 ------- tests/unit/dot.cc | 139 ++---------------------------- 4 files changed, 201 insertions(+), 241 deletions(-) create mode 100644 tests/common/dot.h diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index a5a6f559e..168e239e6 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -1,76 +1,6 @@ -#include -#include -#include -#include "triton/driver/backend.h" +#include "triton/driver/backend.h" #include "triton/driver/stream.h" -#include "triton/tools/bench.hpp" -#include "triton/external/half.hpp" -#include "triton/runtime/function.h" -#include "src/dot.h" -#include "cuda/cublas.h" - - -namespace drv = triton::driver; -namespace rt = triton::runtime; - -inline size_t ceil(size_t x, size_t y) { - return (x + y - 1) / y; -}; - -inline rt::function::grid_fn_ty grid2d(size_t M, size_t N) { - return [M, N](const rt::function::options_t& x) { - return rt::grid_t{ceil(M, x.D("TM")), - ceil(N, x.D("TN"))}; - }; -} - - - -std::vector do_bench(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K){ - typedef float NumericT; - 
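// --- annotation (not part of the patch) ------------------------------------
// This commit folds the standalone benchmark and unit-test drivers into
// tests/common/dot.h, which builds the same kernel in either BENCH or TEST
// mode. The AT/BT transpose variants move from #if blocks inside the
// Triton-C source to preprocessor defines supplied by the host, i.e. for
// operand A:
//
//   AT == 1:  USEA = ^a   SHAPE_A = TK, TM   STRIDE_AK = lda   STRIDE_AM = 1
//   AT == 0:  USEA =  a   SHAPE_A = TM, TK   STRIDE_AK = 1     STRIDE_AM = lda
//
// A condensed sketch of that selection, with a simplified signature
// (`a_defines` and `def_t` are illustrative names only):
//
//   #include <string>
//   #include <utility>
//   #include <vector>
//   using def_t = std::pair<std::string, std::string>;
//   inline std::vector<def_t> a_defines(bool AT) {
//     return {{"USEA",         AT ? "^a"         : "a"},
//             {"SHAPE_A",      AT ? "TK, TM"     : "TM, TK"},
//             {"BROADCAST_AK", AT ? ":, newaxis" : "newaxis, :"},
//             {"BROADCAST_AM", AT ? "newaxis, :" : ":, newaxis"},
//             {"STRIDE_AK",    AT ? "lda"        : "1"},
//             {"STRIDE_AM",    AT ? "1"          : "lda"}};
//   }
// ----------------------------------------------------------------------------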
std::string ty = "float"; - cublasDataType_t cuty = CUDA_R_32F; - size_t dt_nbytes = sizeof(NumericT); - drv::context* context = stream->context(); - // leading dimensions - int32_t lda = AT ? K : M; - int32_t ldb = BT ? N : K; - int32_t ldc = M; - // create inputs - auto da = std::unique_ptr(drv::buffer::create(context, M*K*dt_nbytes)); - auto db = std::unique_ptr(drv::buffer::create(context, K*N*dt_nbytes)); - auto dc = std::unique_ptr(drv::buffer::create(context, M*N*dt_nbytes)); - // create options - rt::function::options_space_t opt; - opt.defines.push_back({"TYPE", {ty}}); - opt.defines.push_back({"AT", {AT?"1":"0"}}); - opt.defines.push_back({"BT", {BT?"1":"0"}}); - opt.defines.push_back({"TM", {"128"}}); - opt.defines.push_back({"TN", {"128"}}); - opt.defines.push_back({"TK", {"8"}}); - opt.num_warps = {4}; - // create function - rt::function function(src::dot, opt); - // benchmark available libraries - std::vector result; - auto tflops = [&](double nanosec) { return 2.*M*N*K / nanosec * 1e-3; }; -// // cublas -// if(cublas::cublasinit()){ -// NumericT alpha(static_cast(1)); -// NumericT beta(static_cast(0)); -// cublasGemmAlgo_t fastest; -// cublasGemm(cuty, stream, AT, BT, M, N, K, &alpha, &*da, lda, &*db, ldb, &beta, &*dc, ldc, &fastest); -// double cublas_ms = triton::tools::bench([&]() { cublasGemm(cuty, stream, AT, BT, M, N, K, -// &alpha, &*da, lda, &*db, ldb, &beta, &*dc, -// ldc, nullptr, fastest); }, stream); -// result.push_back(tflops(cublas_ms)); -// } - // triton - double triton_ms = triton::tools::bench([&]() { function({&*da, &*db, &*dc, M, N, K, lda, ldb, ldc}, grid2d(M, N), stream);}, stream); - result.push_back(tflops(triton_ms)); - // done - return result; -} +#include "dot.h" int main() { // initialize default compute device @@ -82,7 +12,7 @@ int main() { for(auto x: std::vector>{{false, false}, {false, true}, {true, false}, {true, true}}){ std::vector tmp = { - config_t{x[0], x[1], 2048, 2048, 2048} + config_t{x[0], x[1], 2048, 2048, 2048}, // config_t{x[0], x[1], 16, 2048, 2048}, // config_t{x[0], x[1], 32, 2048, 2048}, // config_t{x[0], x[1], 64, 2048, 2048}, @@ -92,7 +22,7 @@ int main() { // config_t{x[0], x[1], 32, 4096, 4096}, // config_t{x[0], x[1], 64, 4096, 4096}, // config_t{x[0], x[1], 128, 4096, 4096}, -// config_t{x[0], x[1], 7000, 4096, 4096}, +// config_t{x[0], x[1], 7000, 4096, 4096} }; configs.insert(configs.end(), tmp.begin(), tmp.end()); } @@ -102,7 +32,7 @@ int main() { for(const auto& c: configs){ std::tie(AT, BT, M, N, K) = c; std::cout << "// " << AT << " " << BT << " " << M << " " << N << " " << K << std::flush; - for(auto perf: do_bench(stream, AT, BT, M, N, K)) + for(auto perf: bench_dot(stream, FLOAT, AT, BT, M, N, K)) std::cout << ", " << perf << std::flush; std::cout << std::endl; } diff --git a/tests/common/dot.h b/tests/common/dot.h new file mode 100644 index 000000000..f96ce17f2 --- /dev/null +++ b/tests/common/dot.h @@ -0,0 +1,191 @@ +#include +#include +#include +#include +#include "triton/driver/backend.h" +#include "triton/driver/stream.h" +#include "triton/tools/bench.hpp" +#include "triton/external/half.hpp" +#include "triton/runtime/function.h" +#include "src/dot.h" +#include "cuda/cublas.h" +#include "util.h" + + +template +static void cc_dot(std::vector &c, const std::vector &a, const std::vector &b, + size_t M, size_t N, size_t K){ + for(size_t m = 0; m < M; m++) + for(size_t n = 0; n < N; n++){ + float acc = 0; + for(size_t k = 0; k < K; k++) + acc = acc + (AT ? a[k*M + m] : a[m*K + k]) * (BT ? 
b[n*K + k] : b[k*N + n]); + c[m + n*M] = static_cast(acc); + } +} + +template +void cc_dot(bool AT_, bool BT_, size_t M, size_t N, size_t K, + std::vector &c, const std::vector &a, const std::vector &b) { + if(AT_ && BT_) + cc_dot(c, a, b, M, N, K); + else if(AT_ && !BT_) + cc_dot(c, a, b, M, N, K); + else if(!AT_ && BT_) + cc_dot(c, a, b, M, N, K); + else + cc_dot(c, a, b, M, N, K); +} + +enum run_mode_t { + BENCH, + TEST +}; + +enum dtype_t { + FLOAT, + HALF, + DOUBLE +}; + +template +struct to_string; + +template<> struct to_string{ + static constexpr const char* value = "half"; +}; + +template<> struct to_string{ + static constexpr const char* value = "float"; +}; + +template<> struct to_string{ + static constexpr const char* value = "double"; +}; + +template +bool triton_dot(drv::stream* stream, bool AT, bool BT, + int32_t M, int32_t N, int32_t K, + int32_t TM, int32_t TN, int32_t TK, size_t nwarp, + run_mode_t mode, std::vector& bench, bool &test){ + std::string ty = to_string::value; + size_t dt_nbytes = sizeof(T); + drv::context* context = stream->context(); + int32_t lda = AT ? K : M; + int32_t ldb = BT ? N : K; + int32_t ldc = M; + + // inputs + auto dc = std::shared_ptr(drv::buffer::create(context, M*N*dt_nbytes)); + auto da = std::shared_ptr(drv::buffer::create(context, M*K*dt_nbytes)); + auto db = std::shared_ptr(drv::buffer::create(context, K*N*dt_nbytes)); + + // macros + rt::function::options_space_t opt; + // B access patterns + opt.defines.push_back({"USEB", {BT? "^b" : "b" }}); + opt.defines.push_back({"BROADCAST_BK", {BT? "newaxis, :" : ":, newaxis" }}); + opt.defines.push_back({"BROADCAST_BN", {BT? ":, newaxis" : "newaxis, :" }}); + opt.defines.push_back({"SHAPE_B", {BT? "TN, TK" : "TK, TN" }}); + opt.defines.push_back({"STRIDE_BK", {BT? "1" : "ldb" }}); + opt.defines.push_back({"STRIDE_BN", {BT? "ldb" : "1" }}); + // A access patterns + opt.defines.push_back({"USEA", {AT? "^a" : "a" }}); + opt.defines.push_back({"BROADCAST_AK", {AT? ":, newaxis" : "newaxis, :" }}); + opt.defines.push_back({"BROADCAST_AM", {AT? "newaxis, :" : ":, newaxis" }}); + opt.defines.push_back({"SHAPE_A", {AT? "TK, TM" : "TM, TK" }}); + opt.defines.push_back({"STRIDE_AK", {AT? "lda" : "1" }}); + opt.defines.push_back({"STRIDE_AM", {AT? 
"1" : "lda" }}); + // data-type + opt.defines.push_back({"TYPE", {ty}}); + // tile sizes + if(mode == TEST) { + opt.defines.push_back({"TM", {std::to_string(TM)}}); + opt.defines.push_back({"TN", {std::to_string(TN)}}); + opt.defines.push_back({"TK", {std::to_string(TK)}}); + opt.num_warps = {nwarp}; + } + if(mode == BENCH) { + opt.defines.push_back({"TM", {"128"}}); + opt.defines.push_back({"TN", {"128"}}); + opt.defines.push_back({"TK", {"8"}}); + opt.num_warps = {4}; + } + + // kernels + rt::function function(src::dot, opt); + std::vector args = {&*da, &*db, &*dc, M, N, K, lda, ldb, ldc}; + auto grid = grid2d(M, N); + + // metrics + if(mode == BENCH){ + auto tflops = [&](double nanosec) { return 2.*M*N*K / nanosec * 1e-3; }; + double triton_ns = triton::tools::bench([&]() { function(args, grid, stream);}, stream); + bench.push_back(tflops(triton_ns)); + + // // cublas + // if(cublas::cublasinit()){ + // NumericT alpha(static_cast(1)); + // NumericT beta(static_cast(0)); + // cublasGemmAlgo_t fastest; + // cublasGemm(cuty, stream, AT, BT, M, N, K, &alpha, &*da, lda, &*db, ldb, &beta, &*dc, ldc, &fastest); + // double cublas_ms = triton::tools::bench([&]() { cublasGemm(cuty, stream, AT, BT, M, N, K, + // &alpha, &*da, lda, &*db, ldb, &beta, &*dc, + // ldc, nullptr, fastest); }, stream); + // result.push_back(tflops(cublas_ms)); + // } + } + + // test triton + if(mode == TEST){ + srand(0); + // initialize buffers + std::vector hc(M*N); + std::vector ha(M*K); + std::vector hb(K*N); + for(size_t i = 0; i < ha.size(); i++) + ha[i] = static_cast((float)rand()/RAND_MAX); + for(size_t i = 0; i < hb.size(); i++) + hb[i] = static_cast((float)rand()/RAND_MAX); + // copy buffer + stream->write(&*da, true, 0, ha); + stream->write(&*db, true, 0, hb); + // run kernel + function(args, grid, stream); + // write back + stream->synchronize(); + // compare with CPU + stream->read(&*dc, true, 0, hc); + std::vector rc(hc.size()); + cc_dot(AT, BT, M, N, K, rc, ha, hb); + test = testing::diff(hc, rc); + } +} + +std::vector bench_dot(drv::stream* stream, + dtype_t dtype, bool AT, bool BT, + int32_t M, int32_t N, int32_t K) { + std::vector bench; + bool test; + switch(dtype){ + case HALF: triton_dot(stream, AT, BT, M, N, K, 0, 0, 0, 0, BENCH, bench, test); break; + case FLOAT: triton_dot(stream, AT, BT, M, N, K, 0, 0, 0, 0, BENCH, bench, test); break; + case DOUBLE: triton_dot(stream, AT, BT, M, N, K, 0, 0, 0, 0, BENCH, bench, test); break; + default: break; + } + return bench; +} +bool test_dot(drv::stream* stream, + dtype_t dtype, bool AT, bool BT, + int32_t M, int32_t N, int32_t K, + int32_t TM, int32_t TN, int32_t TK, size_t nwarp) { + std::vector bench; + bool test = false; + switch(dtype){ + case HALF: triton_dot(stream, AT, BT, M, N, K, TM, TN, TK, nwarp, TEST, bench, test); break; + case FLOAT: triton_dot(stream, AT, BT, M, N, K, TM, TN, TK, nwarp, TEST, bench, test); break; + case DOUBLE: triton_dot(stream, AT, BT, M, N, K, TM, TN, TK, nwarp, TEST, bench, test); break; + default: break; + } + return test; +} diff --git a/tests/common/src/dot.h b/tests/common/src/dot.h index c3c64d6b1..c360edbfe 100644 --- a/tests/common/src/dot.h +++ b/tests/common/src/dot.h @@ -2,38 +2,6 @@ namespace src { const char *dot = R"( -#if AT == 1 -#define USEA ^a -#define STRIDE_AK lda -#define STRIDE_AM 1 -#define BROADCAST_AK :, newaxis -#define BROADCAST_AM newaxis, : -#define SHAPE_A TK, TM -#else -#define USEA a -#define STRIDE_AK 1 -#define STRIDE_AM lda -#define BROADCAST_AK newaxis, : -#define BROADCAST_AM :, newaxis 
-#define SHAPE_A TM, TK -#endif - -#if BT == 1 -#define USEB ^b -#define STRIDE_BK 1 -#define STRIDE_BN ldb -#define BROADCAST_BK newaxis, : -#define BROADCAST_BN :, newaxis -#define SHAPE_B TN, TK -#else -#define USEB b -#define STRIDE_BK ldb -#define STRIDE_BN 1 -#define BROADCAST_BK :, newaxis -#define BROADCAST_BN newaxis, : -#define SHAPE_B TK, TN -#endif - void dot(TYPE * A, TYPE * B, TYPE * C, int M, int N, int K, int lda __multipleof(8), diff --git a/tests/unit/dot.cc b/tests/unit/dot.cc index 9b3ee06f0..53fbc990d 100644 --- a/tests/unit/dot.cc +++ b/tests/unit/dot.cc @@ -1,142 +1,13 @@ -#include -#include -#include -#include -#include "triton/driver/backend.h" +#include "triton/driver/backend.h" #include "triton/driver/stream.h" -#include "triton/tools/bench.hpp" -#include "triton/external/half.hpp" -#include "triton/runtime/function.h" -#include "src/dot.h" -#include "cuda/cublas.h" +#include "dot.h" #include "util.h" -namespace drv = triton::driver; -namespace rt = triton::runtime; - -template -void diff(const std::vector& x, const std::vector& y){ - for(size_t i = 0; i < x.size(); i++) - if(std::isnan(x[i]) || std::abs(x[i] - y[i])/std::max(x[i], y[i]) > 1e-4){ - std::cout << i << " " << x[i] << " " << y[i] << std::endl; - exit(EXIT_FAILURE); - } - std::cout << "Pass!" << std::endl; -} - -template -static void cpu_ref(std::vector &c, const std::vector &a, const std::vector &b, - size_t M, size_t N, size_t K){ - for(size_t m = 0; m < M; m++) - for(size_t n = 0; n < N; n++){ - float acc = 0; - for(size_t k = 0; k < K; k++) - acc = acc + (AT ? a[k*M + m] : a[m*K + k]) * (BT ? b[n*K + k] : b[k*N + n]); - c[m + n*M] = static_cast(acc); - } -} - -template -void cpu_ref(bool AT_, bool BT_, size_t M, size_t N, size_t K, - std::vector &c, const std::vector &a, const std::vector &b) { - if(AT_ && BT_) - cpu_ref(c, a, b, M, N, K); - else if(AT_ && !BT_) - cpu_ref(c, a, b, M, N, K); - else if(!AT_ && BT_) - cpu_ref(c, a, b, M, N, K); - else - cpu_ref(c, a, b, M, N, K); -} - -template -struct to_string; - -template<> struct to_string{ - static constexpr const char* value = "half"; -}; - -template<> struct to_string{ - static constexpr const char* value = "float"; -}; - -template<> struct to_string{ - static constexpr const char* value = "double"; -}; - -enum dtype_t { - FLOAT, - HALF, - DOUBLE -}; - -template -bool do_test(drv::stream* stream, bool AT, bool BT, - int32_t M, int32_t N, int32_t K, - int32_t TM, int32_t TN, int32_t TK, size_t nwarp){ - std::string ty = to_string::value; - size_t dt_nbytes = sizeof(T); - drv::context* context = stream->context(); - std::vector hc(M*N); - std::vector ha(M*K); - std::vector hb(K*N); - int32_t lda = AT ? K : M; - int32_t ldb = BT ? 
N : K; - int32_t ldc = M; - srand(0); - for(size_t i = 0; i < ha.size(); i++) - ha[i] = static_cast((float)rand()/RAND_MAX); - for(size_t i = 0; i < hb.size(); i++) - hb[i] = static_cast((float)rand()/RAND_MAX); - for(size_t i = 0; i < hc.size(); i++) - hc[i] = static_cast((double)0); - auto dc = std::shared_ptr(drv::buffer::create(context, hc.size()*dt_nbytes)); - auto da = std::shared_ptr(drv::buffer::create(context, ha.size()*dt_nbytes)); - auto db = std::shared_ptr(drv::buffer::create(context, hb.size()*dt_nbytes)); - stream->write(&*da, true, 0, ha); - stream->write(&*db, true, 0, hb); - stream->write(&*dc, true, 0, hc); - stream->synchronize(); - // run - rt::function::options_space_t opt; - opt.defines.push_back({"TYPE", {ty}}); - opt.defines.push_back({"AT", {AT?"1":"0"}}); - opt.defines.push_back({"BT", {BT?"1":"0"}}); - opt.defines.push_back({"TM", {std::to_string(TM)}}); - opt.defines.push_back({"TN", {std::to_string(TN)}}); - opt.defines.push_back({"TK", {std::to_string(TK)}}); - opt.num_warps = {nwarp}; - rt::function function(src::dot, opt); - try { - function({&*da, &*db, &*dc, M, N, K, lda, ldb, ldc}, grid2d(M, N), stream); - } catch (const std::runtime_error& e) { - return true; - } - // test - stream->read(&*dc, true, 0, hc); - std::vector rc(hc.size()); - cpu_ref(AT, BT, M, N, K, rc, ha, hb); - return testing::diff(hc, rc); -} - -bool do_test(triton::driver::stream *stream, - dtype_t dtype, bool AT, bool BT, - int32_t M, int32_t N, int32_t K, - int32_t TM, int32_t TN, int32_t TK, size_t nwarp) { - switch(dtype){ - case HALF: return do_test(stream, AT, BT, M, N, K, TM, TN, TK, nwarp); - case FLOAT: return do_test(stream, AT, BT, M, N, K, TM, TN, TK, nwarp); - case DOUBLE: return do_test(stream, AT, BT, M, N, K, TM, TN, TK, nwarp); - default: break; - } - return false; -} - int main() { // initialize default compute device auto context = triton::driver::backend::contexts::get_default(); triton::driver::stream* stream = triton::driver::stream::create(context); - // shapes to benchmark + // shapes to test typedef std::tuple config_t; std::vector configs; for(int TM: std::vector{32, 64}) @@ -147,14 +18,14 @@ int main() { for(bool BT: std::array{false, true}){ configs.push_back(config_t{FLOAT, AT, BT, 128, 128, 128, TM, TN, TK, nwarps}); } - // does the work + // test dtype_t dtype; bool AT, BT; int M, N, K, TM, TN, TK, nwarp; for(const auto& c: configs){ std::tie(dtype, AT, BT, M, N, K, TM, TN, TK, nwarp) = c; std::cout << "Testing " << c << " ... " << std::flush; - if(do_test(stream, dtype, AT, BT, M, N, K, TM, TN, TK, (size_t)nwarp)) + if(test_dot(stream, dtype, AT, BT, M, N, K, TM, TN, TK, (size_t)nwarp)) std::cout << " Pass! " << std::endl; else{ std::cout << " Fail! 
" << std::endl; From 1bf0c8adeb80587624a80cf1c1c226d2ceaa686b Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 2 Oct 2019 15:08:32 -0400 Subject: [PATCH 424/494] [test] re-added bounds checking in dot test --- tests/common/dot.h | 6 +++--- tests/common/src/dot.h | 6 ++++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/common/dot.h b/tests/common/dot.h index f96ce17f2..599784570 100644 --- a/tests/common/dot.h +++ b/tests/common/dot.h @@ -106,10 +106,10 @@ bool triton_dot(drv::stream* stream, bool AT, bool BT, opt.num_warps = {nwarp}; } if(mode == BENCH) { - opt.defines.push_back({"TM", {"128"}}); - opt.defines.push_back({"TN", {"128"}}); + opt.defines.push_back({"TM", {"64", "128"}}); + opt.defines.push_back({"TN", {"64", "128"}}); opt.defines.push_back({"TK", {"8"}}); - opt.num_warps = {4}; + opt.num_warps = {2, 4, 8}; } // kernels diff --git a/tests/common/src/dot.h b/tests/common/src/dot.h index c360edbfe..dc71d86bb 100644 --- a/tests/common/src/dot.h +++ b/tests/common/src/dot.h @@ -26,8 +26,10 @@ void dot(TYPE * A, TYPE * B, TYPE * C, c += USEA @ USEB; pa = pa + TK * STRIDE_AK; pb = pb + TK * STRIDE_BK; - a = *pa; - b = *pb; + bool checka[SHAPE_A] = k > TK; + bool checkb[SHAPE_B] = k > TK; + a = checka ? *pa : 0; + b = checkb ? *pb : 0; } // epilogue int rxc[TM] = ridx * TM + 0 ... TM; From a1e0512703508eb3c11762e2eca7abca8896fdf7 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 3 Oct 2019 14:11:50 -0400 Subject: [PATCH 425/494] [codegen] more progress --- lib/codegen/analysis/liveness.cc | 58 +++++++++++++------------------- lib/codegen/analysis/tiles.cc | 28 +++++++-------- lib/codegen/selection.cc | 22 ++++++++++-- lib/driver/module.cc | 1 + tests/bench/dot.cc | 2 +- tests/common/dot.h | 8 ++--- 6 files changed, 63 insertions(+), 56 deletions(-) diff --git a/lib/codegen/analysis/liveness.cc b/lib/codegen/analysis/liveness.cc index b1f75f03c..e23bb96da 100644 --- a/lib/codegen/analysis/liveness.cc +++ b/lib/codegen/analysis/liveness.cc @@ -91,46 +91,36 @@ void liveness::connected_components(node_t x, std::set &nodes, graph_t & bool liveness::do_pad(ir::value *x) { // alignment for matrix product -// if(auto* dot = dynamic_cast(x)) { -// auto order = tiles_->order(x); -// // a -// ir::value *a = dot->get_operand(0);\ -// size_t previous_a = pad_[a]; -// bool a_trans = dynamic_cast(a); -// bool a_row = order[0] == 0; -// if(tiles_->hmma(x) == HMMA_A_ROW) -// pad_[a] = 16; -// else if(tiles_->hmma(x) == HMMA_A_COL) -// pad_[a] = 8; -// else if(a_trans ^ a_row) -// pad_[a] = 4; -// else -// pad_[a] = 0; -// // b -// ir::value *b = dot->get_operand(1); -// size_t previous_b = pad_[b]; -// bool b_trans = dynamic_cast(b); -// bool b_col = order[0] == 0; -// if(tiles_->hmma(x) == HMMA_B_COL) -// pad_[b] = 16; -// if(tiles_->hmma(x) == HMMA_B_ROW) -// pad_[b] = 8; -// if(b_trans ^ b_col) -// pad_[b] = 4; -// else -// pad_[b] = 0; -// return previous_a != pad_[a] || previous_b != pad_[b]; -// } + if(auto* dot = dynamic_cast(x)) { + // a + ir::value *a = dot->get_operand(0);\ + size_t previous_a = pad_[a]; + if(tiles_->hmma(a) == HMMA_A_ROW) + pad_[a] = 16; + else if(tiles_->hmma(a) == HMMA_A_COL) + pad_[a] = 8; + else + pad_[a] = 0; + // b + ir::value *b = dot->get_operand(1); + size_t previous_b = pad_[b]; + if(tiles_->hmma(b) == HMMA_B_COL) + pad_[b] = 16; + if(tiles_->hmma(b) == HMMA_B_ROW) + pad_[b] = 8; + else + pad_[b] = 0; + return previous_a != pad_[a] || previous_b != pad_[b]; + } if(auto* cts = dynamic_cast(x)) { auto cts_order = 
tiles_->order(cts); ir::value *arg = cts->get_operand(0); auto arg_order = tiles_->order(arg); + size_t previous = pad_[cts]; if(cts_order != arg_order) pad_[cts] = 4; + return pad_[cts] != previous; } -// if(auto* tr = dynamic_cast(x)) { -// pad_[tr] = 4; -// } // padding for phi-nodes if(auto* phi = dynamic_cast(x)) { bool has_changed = false; @@ -142,7 +132,7 @@ bool liveness::do_pad(ir::value *x) { } return has_changed; } - // default -- no pading + // default -- no padding size_t previous = pad_[x]; pad_[x] = std::max(previous, 0); return pad_[x] != previous; diff --git a/lib/codegen/analysis/tiles.cc b/lib/codegen/analysis/tiles.cc index 13d3fbd13..77da5c03a 100644 --- a/lib/codegen/analysis/tiles.cc +++ b/lib/codegen/analysis/tiles.cc @@ -254,20 +254,20 @@ void tiles::run(ir::module &) { } order_[i] = order; } - for(size_t i = 0; i < num_groups; i++){ - std::vector dots; - for(ir::value* v: layout_->values(i)) - if(auto *x = dynamic_cast(v)) - dots.push_back(x); - for(ir::dot_inst* dot: dots){ - ir::value* a = dot->get_operand(0); - ir::value* b = dot->get_operand(1); - std::vector col = {0, 1}; - std::vector row = {1, 0}; - order_[layout_->id(a)] = is_trans(a) ? row : col; - order_[layout_->id(b)] = is_trans(b) ? col : row; - } - } +// for(size_t i = 0; i < num_groups; i++){ +// std::vector dots; +// for(ir::value* v: layout_->values(i)) +// if(auto *x = dynamic_cast(v)) +// dots.push_back(x); +// for(ir::dot_inst* dot: dots){ +// ir::value* a = dot->get_operand(0); +// ir::value* b = dot->get_operand(1); +// std::vector col = {0, 1}; +// std::vector row = {1, 0}; +// order_[layout_->id(a)] = is_trans(a) ? row : col; +// order_[layout_->id(b)] = is_trans(b) ? col : row; +// } +// } // tiling parameters for(auto x: largest_){ ir::value *i = x.second; diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index c355f9d2f..0b13b8982 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -1049,6 +1049,19 @@ void selection::lower_trans(ir::trans_inst *x, LLVMContext &ctx, Function *fn, I tmap_[x] = out; } +bool is_trans(ir::value *v) { + if(dynamic_cast(v)) { + return true; + } + if(auto *phi = dynamic_cast(v)) { + bool result = true; + for(ir::value *op: phi->ops()) + result = result && is_trans(op); + return result; + } + return false; +} + void selection::lower_hmma_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn, IRBuilder<> &builder, distributed_tile *TC, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK) { @@ -1082,8 +1095,11 @@ void selection::lower_hmma_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn auto ord_a = tiles_->order(dot->get_operand(0)); auto ord_b = tiles_->order(dot->get_operand(1)); - bool is_a_row = ord_a[ord_a.size() - 2] == 1; - bool is_b_row = ord_b[ord_b.size() - 2] == 1; + bool is_a_trans = is_trans(dot->get_operand(0)); + bool is_b_trans = is_trans(dot->get_operand(1)); + bool is_a_row = is_a_trans ^ (ord_a[ord_a.size() - 2] == 1); + bool is_b_row = is_b_trans ^ (ord_b[ord_b.size() - 2] == 1); + if(is_a_row){ offset_a_i = builder.CreateAdd(offset_a_i, builder.CreateURem(u_thread_id, builder.getInt32(4))); @@ -1124,7 +1140,7 @@ void selection::lower_hmma_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn Value *current_offset_a_i = builder.CreateAdd(offset_a_i, builder.getInt32(pack_i*stride_rep_i*pack_size_0_)); Value *current_offset_b_i = builder.CreateAdd(offset_b_j, builder.getInt32(pack_j*stride_rep_j*pack_size_1_)); indices_t idx_a = {current_offset_a_i, builder.CreateAdd(offset_a_k, _K)}; - 
indices_t idx_b = {current_offset_b_i, builder.CreateAdd(offset_b_k, _K)}; + indices_t idx_b = {builder.CreateAdd(offset_b_k, _K), current_offset_b_i}; idx_a.insert(idx_a.end(), x.first.begin(), x.first.end()); idx_b.insert(idx_b.end(), x.first.begin(), x.first.end()); Value *ha = TA->get_value(idx_a); diff --git a/lib/driver/module.cc b/lib/driver/module.cc index e300a75f2..f29c830f4 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -241,6 +241,7 @@ std::string cu_module::compile_llvm_module(std::unique_ptr module, cu_module::cu_module(driver::context * context, std::unique_ptr ll_module): cu_module(context, compile_llvm_module(std::move(ll_module), context->device())) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ +// std::cout << source << std::endl; cu_context::context_switcher ctx(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index 168e239e6..a276de4b1 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -32,7 +32,7 @@ int main() { for(const auto& c: configs){ std::tie(AT, BT, M, N, K) = c; std::cout << "// " << AT << " " << BT << " " << M << " " << N << " " << K << std::flush; - for(auto perf: bench_dot(stream, FLOAT, AT, BT, M, N, K)) + for(auto perf: bench_dot(stream, HALF, AT, BT, M, N, K)) std::cout << ", " << perf << std::flush; std::cout << std::endl; } diff --git a/tests/common/dot.h b/tests/common/dot.h index 599784570..bb27763b0 100644 --- a/tests/common/dot.h +++ b/tests/common/dot.h @@ -106,10 +106,10 @@ bool triton_dot(drv::stream* stream, bool AT, bool BT, opt.num_warps = {nwarp}; } if(mode == BENCH) { - opt.defines.push_back({"TM", {"64", "128"}}); - opt.defines.push_back({"TN", {"64", "128"}}); - opt.defines.push_back({"TK", {"8"}}); - opt.num_warps = {2, 4, 8}; + opt.defines.push_back({"TM", {"128"}}); + opt.defines.push_back({"TN", {"128"}}); + opt.defines.push_back({"TK", {"16"}}); + opt.num_warps = {4}; } // kernels From 1783d45bef8878f96a88c4893ebc43d0fda6a306 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 4 Oct 2019 16:07:31 -0400 Subject: [PATCH 426/494] [codegen] better handling of row/column-major --- include/triton/codegen/analysis/tiles.h | 6 +-- lib/codegen/analysis/liveness.cc | 51 ++++++++++++++++--------- lib/codegen/analysis/tiles.cc | 49 +++++++++++++----------- lib/runtime/function.cc | 2 + tests/bench/dot.cc | 32 ++++++++-------- tests/common/dot.h | 47 +++++++++++++---------- tests/unit/dot.cc | 2 +- 7 files changed, 107 insertions(+), 82 deletions(-) diff --git a/include/triton/codegen/analysis/tiles.h b/include/triton/codegen/analysis/tiles.h index 6fe964738..ca1eb0e90 100644 --- a/include/triton/codegen/analysis/tiles.h +++ b/include/triton/codegen/analysis/tiles.h @@ -27,11 +27,7 @@ class align; enum layout_t { SCANLINE, - HMMA_C, - HMMA_A_COL, - HMMA_A_ROW, - HMMA_B_COL, - HMMA_B_ROW + HMMA_C }; class tiles { diff --git a/lib/codegen/analysis/liveness.cc b/lib/codegen/analysis/liveness.cc index e23bb96da..297f31d92 100644 --- a/lib/codegen/analysis/liveness.cc +++ b/lib/codegen/analysis/liveness.cc @@ -89,28 +89,43 @@ void liveness::connected_components(node_t x, std::set &nodes, graph_t & } } +bool is_trans(ir::value *v) { + if(dynamic_cast(v)) { + return true; + } + if(auto *phi = dynamic_cast(v)) { + bool result = true; + for(ir::value *op: phi->ops()) + result = result && is_trans(op); + return 
result; + } + return false; +} + + bool liveness::do_pad(ir::value *x) { // alignment for matrix product if(auto* dot = dynamic_cast(x)) { // a - ir::value *a = dot->get_operand(0);\ - size_t previous_a = pad_[a]; - if(tiles_->hmma(a) == HMMA_A_ROW) - pad_[a] = 16; - else if(tiles_->hmma(a) == HMMA_A_COL) - pad_[a] = 8; - else - pad_[a] = 0; - // b + ir::value *a = dot->get_operand(0); ir::value *b = dot->get_operand(1); - size_t previous_b = pad_[b]; - if(tiles_->hmma(b) == HMMA_B_COL) - pad_[b] = 16; - if(tiles_->hmma(b) == HMMA_B_ROW) - pad_[b] = 8; - else - pad_[b] = 0; - return previous_a != pad_[a] || previous_b != pad_[b]; + size_t a_previous = pad_[a]; + size_t b_previous = pad_[b]; + auto a_order = tiles_->order(a); + auto b_order = tiles_->order(b); + bool a_row = is_trans(a) ^ (a_order[0] == 1); + bool b_row = is_trans(b) ^ (b_order[0] == 1); + auto a_shapes = a->get_type()->get_tile_shapes(); + auto b_shapes = b->get_type()->get_tile_shapes(); + pad_[a] = std::max(pad_[a], (24 - a_shapes[a_row ? 0 : 1]) % 32); + pad_[b] = std::max(pad_[b], (24 - b_shapes[b_row ? 1 : 0]) % 32); + return a_previous != pad_[a] || b_previous != pad_[b]; + } + if(auto* trans = dynamic_cast(x)) { + ir::value *op = trans->get_operand(0); + size_t previous = pad_[op]; + pad_[op] = std::max(pad_[op], pad_[x]); + return previous != pad_[op]; } if(auto* cts = dynamic_cast(x)) { auto cts_order = tiles_->order(cts); @@ -118,7 +133,7 @@ bool liveness::do_pad(ir::value *x) { auto arg_order = tiles_->order(arg); size_t previous = pad_[cts]; if(cts_order != arg_order) - pad_[cts] = 4; + pad_[cts] = std::max(pad_[cts], 4); return pad_[cts] != previous; } // padding for phi-nodes diff --git a/lib/codegen/analysis/tiles.cc b/lib/codegen/analysis/tiles.cc index 77da5c03a..3d414f723 100644 --- a/lib/codegen/analysis/tiles.cc +++ b/lib/codegen/analysis/tiles.cc @@ -215,15 +215,7 @@ void tiles::run(ir::module &) { for(size_t i = 0; i < num_groups; i++) { const auto& values = layout_->values(i); bool hmma_c = std::any_of(values.begin(), values.end(), &is_hmma_c); - bool hmma_a_col = std::any_of(values.begin(), values.end(), &is_hmma_a_col); - bool hmma_a_row = std::any_of(values.begin(), values.end(), &is_hmma_a_row); - bool hmma_b_col = std::any_of(values.begin(), values.end(), &is_hmma_b_col); - bool hmma_b_row = std::any_of(values.begin(), values.end(), &is_hmma_b_row); if(hmma_c) hmma_[i] = HMMA_C; - else if(hmma_a_col) hmma_[i] = HMMA_A_COL; - else if(hmma_a_row) hmma_[i] = HMMA_A_ROW; - else if(hmma_b_col) hmma_[i] = HMMA_B_COL; - else if(hmma_b_row) hmma_[i] = HMMA_B_ROW; else hmma_[i] = SCANLINE; } @@ -254,20 +246,33 @@ void tiles::run(ir::module &) { } order_[i] = order; } -// for(size_t i = 0; i < num_groups; i++){ -// std::vector dots; -// for(ir::value* v: layout_->values(i)) -// if(auto *x = dynamic_cast(v)) -// dots.push_back(x); -// for(ir::dot_inst* dot: dots){ -// ir::value* a = dot->get_operand(0); -// ir::value* b = dot->get_operand(1); -// std::vector col = {0, 1}; -// std::vector row = {1, 0}; -// order_[layout_->id(a)] = is_trans(a) ? row : col; -// order_[layout_->id(b)] = is_trans(b) ? 
col : row; -// } -// } + // matrix multiplication optimizations + for(size_t i = 0; i < num_groups; i++){ + std::vector dots; + for(ir::value* v: layout_->values(i)) + if(auto *x = dynamic_cast(v)) + dots.push_back(x); + for(ir::dot_inst* dot: dots){ + ir::value* a = dot->get_operand(0); + ir::value* b = dot->get_operand(1); + if(hmma_.at(layout_->id(dot)) == HMMA_C){ + auto a_val = layout_->values(layout_->id(a)); + auto b_val = layout_->values(layout_->id(b)); + for(ir::value *v: a_val) + if(auto *cts = dynamic_cast(v)) + order_[layout_->id(a)] = order_[layout_->id(cts->get_operand(0))]; + for(ir::value *v: b_val) + if(auto *cts = dynamic_cast(v)) + order_[layout_->id(b)] = order_[layout_->id(cts->get_operand(0))]; + } + else{ + std::vector col = {0, 1}; + std::vector row = {1, 0}; + order_[layout_->id(a)] = is_trans(a) ? row : col; + order_[layout_->id(b)] = is_trans(b) ? col : row; + } + } + } // tiling parameters for(auto x: largest_){ ir::value *i = x.second; diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index e9f5f8921..19c55a0a1 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -239,7 +239,9 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c axes.run(module); layouts.run(module); align.run(module); +// ir::print(module, std::cout); tiles.run(module); +// ir::print(module, std::cout); selection.run(module, *llvm); // return binary std::unique_ptr res(driver::module::create(context, std::move(llvm))); diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index a276de4b1..927f0044b 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -7,32 +7,34 @@ int main() { auto context = triton::driver::backend::contexts::get_default(); triton::driver::stream* stream = triton::driver::stream::create(context); // shapes to benchmark - typedef std::tuple config_t; + typedef std::tuple, bool, bool, int, int, int> config_t; std::vector configs; + for(auto ord: std::vector>{{0, 1}, {1, 0}}) for(auto x: std::vector>{{false, false}, {false, true}, {true, false}, {true, true}}){ std::vector tmp = { - config_t{x[0], x[1], 2048, 2048, 2048}, -// config_t{x[0], x[1], 16, 2048, 2048}, -// config_t{x[0], x[1], 32, 2048, 2048}, -// config_t{x[0], x[1], 64, 2048, 2048}, -// config_t{x[0], x[1], 128, 2048, 2048}, -// config_t{x[0], x[1], 7000, 2048, 2048}, -// config_t{x[0], x[1], 16, 4096, 4096}, -// config_t{x[0], x[1], 32, 4096, 4096}, -// config_t{x[0], x[1], 64, 4096, 4096}, -// config_t{x[0], x[1], 128, 4096, 4096}, -// config_t{x[0], x[1], 7000, 4096, 4096} + config_t{ord, x[0], x[1], 2048, 2048, 2048}, +// config_t{ord, x[0], x[1], 16, 2048, 2048}, +// config_t{ord, x[0], x[1], 32, 2048, 2048}, +// config_t{ord, x[0], x[1], 64, 2048, 2048}, +// config_t{ord, x[0], x[1], 128, 2048, 2048}, +// config_t{ord, x[0], x[1], 7000, 2048, 2048}, +// config_t{ord, x[0], x[1], 16, 4096, 4096}, +// config_t{ord, x[0], x[1], 32, 4096, 4096}, +// config_t{ord, x[0], x[1], 64, 4096, 4096}, +// config_t{ord, x[0], x[1], 128, 4096, 4096}, +// config_t{ord, x[0], x[1], 7000, 4096, 4096} }; configs.insert(configs.end(), tmp.begin(), tmp.end()); } // does the work + std::vector ord; bool AT, BT; int32_t M, N, K; for(const auto& c: configs){ - std::tie(AT, BT, M, N, K) = c; - std::cout << "// " << AT << " " << BT << " " << M << " " << N << " " << K << std::flush; - for(auto perf: bench_dot(stream, HALF, AT, BT, M, N, K)) + std::tie(ord, AT, BT, M, N, K) = c; + std::cout << "// " << c << std::flush; + for(auto perf: bench_dot(stream, HALF, AT, BT, M, N, K, ord, ord)) 
std::cout << ", " << perf << std::flush; std::cout << std::endl; } diff --git a/tests/common/dot.h b/tests/common/dot.h index bb27763b0..00d605f5d 100644 --- a/tests/common/dot.h +++ b/tests/common/dot.h @@ -19,7 +19,7 @@ static void cc_dot(std::vector &c, const std::vector &a, const std::vector for(size_t n = 0; n < N; n++){ float acc = 0; for(size_t k = 0; k < K; k++) - acc = acc + (AT ? a[k*M + m] : a[m*K + k]) * (BT ? b[n*K + k] : b[k*N + n]); + acc = acc + (!AT ? a[k*M + m] : a[m*K + k]) * (!BT ? b[n*K + k] : b[k*N + n]); c[m + n*M] = static_cast(acc); } } @@ -67,6 +67,7 @@ template bool triton_dot(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K, int32_t TM, int32_t TN, int32_t TK, size_t nwarp, + const std::vector& a_order, const std::vector& b_order, run_mode_t mode, std::vector& bench, bool &test){ std::string ty = to_string::value; size_t dt_nbytes = sizeof(T); @@ -74,6 +75,8 @@ bool triton_dot(drv::stream* stream, bool AT, bool BT, int32_t lda = AT ? K : M; int32_t ldb = BT ? N : K; int32_t ldc = M; + std::vector sa = { "1", "lda" }; + std::vector sb = { "1", "ldb" }; // inputs auto dc = std::shared_ptr(drv::buffer::create(context, M*N*dt_nbytes)); @@ -82,20 +85,20 @@ bool triton_dot(drv::stream* stream, bool AT, bool BT, // macros rt::function::options_space_t opt; - // B access patterns - opt.defines.push_back({"USEB", {BT? "^b" : "b" }}); - opt.defines.push_back({"BROADCAST_BK", {BT? "newaxis, :" : ":, newaxis" }}); - opt.defines.push_back({"BROADCAST_BN", {BT? ":, newaxis" : "newaxis, :" }}); - opt.defines.push_back({"SHAPE_B", {BT? "TN, TK" : "TK, TN" }}); - opt.defines.push_back({"STRIDE_BK", {BT? "1" : "ldb" }}); - opt.defines.push_back({"STRIDE_BN", {BT? "ldb" : "1" }}); // A access patterns - opt.defines.push_back({"USEA", {AT? "^a" : "a" }}); - opt.defines.push_back({"BROADCAST_AK", {AT? ":, newaxis" : "newaxis, :" }}); - opt.defines.push_back({"BROADCAST_AM", {AT? "newaxis, :" : ":, newaxis" }}); - opt.defines.push_back({"SHAPE_A", {AT? "TK, TM" : "TM, TK" }}); - opt.defines.push_back({"STRIDE_AK", {AT? "lda" : "1" }}); - opt.defines.push_back({"STRIDE_AM", {AT? "1" : "lda" }}); + opt.defines.push_back({"USEA", {AT? "^a" : "a" }}); + opt.defines.push_back({"BROADCAST_AK", {AT? ":, newaxis" : "newaxis, :" }}); + opt.defines.push_back({"BROADCAST_AM", {AT? "newaxis, :" : ":, newaxis" }}); + opt.defines.push_back({"SHAPE_A", {AT? "TK, TM" : "TM, TK" }}); + opt.defines.push_back({"STRIDE_AK", {AT? sa[a_order[0]] : sa[a_order[1]] }}); + opt.defines.push_back({"STRIDE_AM", {AT? sa[a_order[1]] : sa[a_order[0]] }}); + // B access patterns + opt.defines.push_back({"USEB", {BT? "^b" : "b" }}); + opt.defines.push_back({"BROADCAST_BK", {BT? "newaxis, :" : ":, newaxis" }}); + opt.defines.push_back({"BROADCAST_BN", {BT? ":, newaxis" : "newaxis, :" }}); + opt.defines.push_back({"SHAPE_B", {BT? "TN, TK" : "TK, TN" }}); + opt.defines.push_back({"STRIDE_BK", {BT? sb[b_order[1]] : sb[b_order[0]] }}); + opt.defines.push_back({"STRIDE_BN", {BT? 
sb[b_order[0]] : sb[b_order[1]] }}); // data-type opt.defines.push_back({"TYPE", {ty}}); // tile sizes @@ -164,13 +167,14 @@ bool triton_dot(drv::stream* stream, bool AT, bool BT, std::vector bench_dot(drv::stream* stream, dtype_t dtype, bool AT, bool BT, - int32_t M, int32_t N, int32_t K) { + int32_t M, int32_t N, int32_t K, + const std::vector& a_order, const std::vector& b_order) { std::vector bench; bool test; switch(dtype){ - case HALF: triton_dot(stream, AT, BT, M, N, K, 0, 0, 0, 0, BENCH, bench, test); break; - case FLOAT: triton_dot(stream, AT, BT, M, N, K, 0, 0, 0, 0, BENCH, bench, test); break; - case DOUBLE: triton_dot(stream, AT, BT, M, N, K, 0, 0, 0, 0, BENCH, bench, test); break; + case HALF: triton_dot(stream, AT, BT, M, N, K, 0, 0, 0, 0, a_order, b_order, BENCH, bench, test); break; + case FLOAT: triton_dot(stream, AT, BT, M, N, K, 0, 0, 0, 0, a_order, b_order, BENCH, bench, test); break; + case DOUBLE: triton_dot(stream, AT, BT, M, N, K, 0, 0, 0, 0, a_order, b_order, BENCH, bench, test); break; default: break; } return bench; @@ -178,13 +182,14 @@ std::vector bench_dot(drv::stream* stream, bool test_dot(drv::stream* stream, dtype_t dtype, bool AT, bool BT, int32_t M, int32_t N, int32_t K, + const std::vector& a_order, const std::vector& b_order, int32_t TM, int32_t TN, int32_t TK, size_t nwarp) { std::vector bench; bool test = false; switch(dtype){ - case HALF: triton_dot(stream, AT, BT, M, N, K, TM, TN, TK, nwarp, TEST, bench, test); break; - case FLOAT: triton_dot(stream, AT, BT, M, N, K, TM, TN, TK, nwarp, TEST, bench, test); break; - case DOUBLE: triton_dot(stream, AT, BT, M, N, K, TM, TN, TK, nwarp, TEST, bench, test); break; + case HALF: triton_dot(stream, AT, BT, M, N, K, TM, TN, TK, nwarp, a_order, b_order, TEST, bench, test); break; + case FLOAT: triton_dot(stream, AT, BT, M, N, K, TM, TN, TK, nwarp, a_order, b_order, TEST, bench, test); break; + case DOUBLE: triton_dot(stream, AT, BT, M, N, K, TM, TN, TK, nwarp, a_order, b_order, TEST, bench, test); break; default: break; } return test; diff --git a/tests/unit/dot.cc b/tests/unit/dot.cc index 53fbc990d..59b556858 100644 --- a/tests/unit/dot.cc +++ b/tests/unit/dot.cc @@ -25,7 +25,7 @@ int main() { for(const auto& c: configs){ std::tie(dtype, AT, BT, M, N, K, TM, TN, TK, nwarp) = c; std::cout << "Testing " << c << " ... " << std::flush; - if(test_dot(stream, dtype, AT, BT, M, N, K, TM, TN, TK, (size_t)nwarp)) + if(test_dot(stream, dtype, AT, BT, M, N, K, {0, 1}, {0, 1}, TM, TN, TK, (size_t)nwarp)) std::cout << " Pass! " << std::endl; else{ std::cout << " Fail! 
" << std::endl; From 650c43ca07e056922b4493452da77636fc21b91b Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 7 Oct 2019 18:06:54 -0400 Subject: [PATCH 427/494] [codegen] more cleaning --- include/triton/codegen/analysis/axes.h | 15 ++--- include/triton/codegen/analysis/layout.h | 22 +++---- include/triton/codegen/analysis/liveness.h | 11 ++-- include/triton/codegen/selection.h | 5 +- include/triton/tools/bench.hpp | 2 +- lib/codegen/analysis/align.cc | 4 +- lib/codegen/analysis/axes.cc | 74 ++++++---------------- lib/codegen/analysis/layout.cc | 45 ++++--------- lib/codegen/analysis/liveness.cc | 52 ++++++--------- lib/codegen/analysis/tiles.cc | 30 +++++---- lib/codegen/transform/coalesce.cc | 4 +- lib/codegen/transform/reassociate.cc | 5 +- lib/runtime/function.cc | 20 +++--- tests/bench/dot.cc | 2 +- tests/common/dot.h | 8 +-- tests/unit/dot.cc | 3 +- 16 files changed, 111 insertions(+), 191 deletions(-) diff --git a/include/triton/codegen/analysis/axes.h b/include/triton/codegen/analysis/axes.h index 625d414c6..453015ab8 100644 --- a/include/triton/codegen/analysis/axes.h +++ b/include/triton/codegen/analysis/axes.h @@ -5,6 +5,7 @@ #include #include #include +#include "triton/tools/graph.h" namespace triton{ @@ -19,10 +20,8 @@ namespace analysis{ class axes { typedef std::pair node_t; - typedef std::map > graph_t; private: - void add_constraint(node_t x, node_t y); // update graph void update_graph_store(ir::instruction *i); void update_graph_reduce(ir::instruction *i); @@ -32,21 +31,15 @@ private: void update_graph_dot(ir::instruction *i); void update_graph_elementwise(ir::instruction *i); void update_graph(ir::instruction *i); - // connected components - void connected_components(node_t x, std::set &nodes, graph_t &graph, unsigned group_id); public: axes(); void run(ir::module &mod); - unsigned get_id(ir::value *value, unsigned ax); - bool has_id(ir::value *value, unsigned ax); + unsigned get_id(ir::value *value, unsigned dim); private: - // constraints graph - graph_t dependencies_; - std::set nodes_; - // parameter groups - std::map> groups_; + tools::graph graph_; + std::map axes_; }; } diff --git a/include/triton/codegen/analysis/layout.h b/include/triton/codegen/analysis/layout.h index 2e7fbd830..93fd54437 100644 --- a/include/triton/codegen/analysis/layout.h +++ b/include/triton/codegen/analysis/layout.h @@ -5,6 +5,7 @@ #include #include #include +#include "triton/tools/graph.h" namespace triton{ @@ -27,29 +28,24 @@ private: // graph creation void connect(ir::value *x, ir::value *y); void make_graph(ir::instruction *i); - // connected components - void connected_components(node_t x, std::set &nodes, graph_t &graph, unsigned id); // list the axes of the given value std::set axes_of(ir::value *value); public: // constructor layout(analysis::axes *axes); - // run the passes + // accessors + unsigned layout_of(ir::value *value) const; + const std::vector& values_of(unsigned id) const; + size_t num_layouts() const; + // execution void run(ir::module &mod); - // get the layout ID of the given value - unsigned id(ir::value *value) const; - // get the values associates with the given ID - const std::vector& values(unsigned id) const; - // get number of groups - size_t get_num_groups() const; private: analysis::axes* axes_; - graph_t dependencies_; - std::set nodes_; - std::map groups_; - std::map> values_; + tools::graph graph_; + std::map groups_; + std::map> values_; }; } diff --git a/include/triton/codegen/analysis/liveness.h b/include/triton/codegen/analysis/liveness.h 
index 3aef03a8d..f082e1cfa 100644 --- a/include/triton/codegen/analysis/liveness.h +++ b/include/triton/codegen/analysis/liveness.h @@ -4,6 +4,7 @@ #include #include #include +#include "triton/tools/graph.h" namespace triton{ @@ -41,7 +42,7 @@ struct double_buffer_info_t { }; struct buffer_t { - unsigned id; + size_t id; size_t size; bool operator<(buffer_t other) const { return id < other.id; } }; @@ -63,7 +64,6 @@ public: private: - void connected_components(node_t x, std::set &nodes, graph_t &graph, buffer_t *buffer); void extract_double_bufferable(ir::instruction *i); void extract_buffers(ir::instruction *i); void get_parents(ir::instruction *i, std::vector& res); @@ -98,11 +98,8 @@ private: intervals_map_t intervals_; std::map double_; std::map pad_; - std::map> parents_; - // graph - std::set nodes_; - graph_t graph_; - std::vector buffers_; + // buffers + tools::graph graph_; std::map groups_; std::map> values_; }; diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index 29241f1c3..b505a6a29 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -211,10 +211,10 @@ private: public: selection(analysis::liveness* liveness, analysis::allocation *alloc, analysis::tiles *tiles, analysis::align *alignment, analysis::axes *axes, - analysis::layout *layouts, transform::coalesce* reorder, target *tgt, unsigned num_warps) + analysis::layout *layouts, target *tgt, unsigned num_warps) : liveness_(liveness), alloc_(alloc), tiles_(tiles), alignment_(alignment), a_axes_(axes), layouts_(layouts), - reorder_(reorder), tgt_(tgt), num_warps_(num_warps){ } + tgt_(tgt), num_warps_(num_warps){ } void run(ir::module &src, Module &dst); @@ -227,7 +227,6 @@ private: analysis::axes *a_axes_; analysis::layout *layouts_; analysis::align *alignment_; - transform::coalesce *reorder_; target *tgt_; std::map axes_; Value *sh_mem_ptr_; diff --git a/include/triton/tools/bench.hpp b/include/triton/tools/bench.hpp index 554b3bcc3..48a4ab972 100644 --- a/include/triton/tools/bench.hpp +++ b/include/triton/tools/bench.hpp @@ -38,7 +38,7 @@ inline double bench(std::function const & op, driver::stream * stream) double total_time = 0; op(); stream->synchronize(); - while(total_time*1e-9 < 1e-3){ + while(total_time*1e-9 < 1e-2){ float norm = 1; // normalize clock if possible to reduce noise in auto-tuning if(auto cu_device = dynamic_cast(stream->context()->device())) diff --git a/lib/codegen/analysis/align.cc b/lib/codegen/analysis/align.cc index ef57e7a4f..28ff4024d 100644 --- a/lib/codegen/analysis/align.cc +++ b/lib/codegen/analysis/align.cc @@ -270,9 +270,9 @@ std::vector align::populate_max_contiguous_binop(ir::binary_operator* } if(x->is_int_add_sub()){ unsigned lvalue = 1, rvalue = 1; - if(lhs_cst_info[d].num_cst) + if(lhs_cst_info[d].num_cst > 0) lvalue = gcd(rhs_max_contiguous[d], lhs_cst_info[d].num_cst); - if(rhs_cst_info[d].num_cst) + if(rhs_cst_info[d].num_cst > 0) rvalue = gcd(lhs_max_contiguous[d], rhs_cst_info[d].num_cst); value = std::max(lvalue, rvalue); } diff --git a/lib/codegen/analysis/axes.cc b/lib/codegen/analysis/axes.cc index 16614b8a7..9913cda3f 100644 --- a/lib/codegen/analysis/axes.cc +++ b/lib/codegen/analysis/axes.cc @@ -16,22 +16,6 @@ namespace analysis{ axes::axes() {} -void axes::add_constraint(node_t x, node_t y) { - size_t shape_x = 1; - size_t shape_y = 1; - if(x.first->get_type()->is_tile_ty()) - shape_x = x.first->get_type()->get_tile_shapes()[x.second]; - if(y.first->get_type()->is_tile_ty()) - shape_y = 
y.first->get_type()->get_tile_shapes()[y.second]; - if(shape_x == 1 && shape_y == 1) - return; - dependencies_[x].insert(y); - dependencies_[y].insert(x); - nodes_.insert(x); - nodes_.insert(y); -} - - void axes::update_graph_reduce(ir::instruction *i) { auto* red = static_cast(i); unsigned axis = red->get_axis(); @@ -41,7 +25,7 @@ void axes::update_graph_reduce(ir::instruction *i) { for(unsigned d = 0; d < in_shapes.size(); d++){ if(d == axis) continue; - add_constraint({i, current++}, {arg, d}); + graph_.add_edge({i, current++}, {arg, d}); } } @@ -59,9 +43,9 @@ void axes::update_graph_reshape(ir::instruction *i) { bool same_shape = res_shapes[d] == op_shapes[current]; // either add edge between axis or just add a node in the graph if(!is_skewed && same_shape) - add_constraint({i, d}, {op, current++}); + graph_.add_edge({i, d}, {op, current++}); else - add_constraint({i, d}, {i, d}); + graph_.add_edge({i, d}, {i, d}); // reshaping is skewed if(res_shapes[d] > 1 && !same_shape) is_skewed = true; @@ -74,7 +58,7 @@ void axes::update_graph_trans(ir::instruction *i) { auto perm = trans->get_perm(); // add edge between axis perm[d] and axis d for(unsigned d = 0; d < perm.size(); d++) - add_constraint({i, perm[d]}, {op, d}); + graph_.add_edge({i, perm[d]}, {op, d}); } void axes::update_graph_broadcast(ir::instruction *i) { @@ -86,7 +70,7 @@ void axes::update_graph_broadcast(ir::instruction *i) { // add edge between non-broadcast axes for(unsigned d = 0; d < shapes.size(); d ++) if(op_shapes[d] == shapes[d]) - add_constraint({i, d}, {op, d}); + graph_.add_edge({i, d}, {op, d}); } void axes::update_graph_dot(ir::instruction *i) { @@ -97,11 +81,11 @@ void axes::update_graph_dot(ir::instruction *i) { ir::value *D = dot->get_operand(2); // add edges between result and accumulator for(unsigned d = 0; d < shapes.size(); d++) - add_constraint({dot, d}, {D, d}); + graph_.add_edge({dot, d}, {D, d}); // add edge for batch dimension for(unsigned d = 2; d < shapes.size(); d++){ - add_constraint({dot, d}, {A, d}); - add_constraint({dot, d}, {B, d}); + graph_.add_edge({dot, d}, {A, d}); + graph_.add_edge({dot, d}, {B, d}); } } @@ -116,8 +100,8 @@ void axes::update_graph_elementwise(ir::instruction *i) { for(ir::value* opx: i->ops()) for(ir::value* opy: i->ops()){ if(!i->get_type()->is_void_ty()) - add_constraint({i, d}, {opx, d}); - add_constraint({opx, d}, {opy, d}); + graph_.add_edge({i, d}, {opx, d}); + graph_.add_edge({opx, d}, {opy, d}); } } @@ -136,41 +120,19 @@ void axes::update_graph(ir::instruction *i) { return; } -void axes::connected_components(node_t x, std::set &nodes, graph_t &graph, unsigned group_id) { - groups_[x.first].insert({x.second, group_id}); - if(nodes.find(x) != nodes.end()){ - nodes.erase(x); - for(const node_t &y: graph[x]) - connected_components(y, nodes, graph, group_id); - } -} -unsigned axes::get_id(ir::value *value, unsigned ax) { - unsigned result = groups_.at(value).at(ax); - return result; +unsigned axes::get_id(ir::value *value, unsigned dim) { + return axes_.at({value, dim}); } -bool axes::has_id(ir::value *value, unsigned ax) { - auto it = groups_.find(value); - if(it == groups_.end()) - return false; - auto iit = it->second.find(ax); - if(iit == it->second.end()) - return false; - return true; -} - - void axes::run(ir::module &mod) { - nodes_.clear(); - dependencies_.clear(); - groups_.clear(); // make graph - ir::for_each_instruction(mod, [this](ir::instruction *x) { update_graph(x); }); - // connected components - unsigned group_id = 0; - while(!nodes_.empty()) - 
connected_components(*nodes_.begin(), nodes_, dependencies_, group_id++); + graph_.clear(); + ir::for_each_instruction(mod, [this](ir::instruction *x) { + update_graph(x); + }); + // find connected components + graph_.connected_components(nullptr, &axes_); } } diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc index 40c8449ea..8270255a7 100644 --- a/lib/codegen/analysis/layout.cc +++ b/lib/codegen/analysis/layout.cc @@ -21,36 +21,24 @@ std::set layout::axes_of(ir::value *value) { // create result std::set result; for(size_t d = 0; d < rank; d++) - if(axes_->has_id(value, d)) - result.insert(axes_->get_id(value, d)); + result.insert(axes_->get_id(value, d)); return result; } -// connected components -void layout::connected_components(node_t x, std::set &nodes, graph_t &graph, unsigned group_id) { - groups_[x] = group_id; - values_[group_id].push_back(x); - if(nodes.find(x) != nodes.end()){ - nodes.erase(x); - for(const node_t &y: graph[x]) - connected_components(y, nodes, graph, group_id); - } -} - // constructor layout::layout(analysis::axes *axes) : axes_(axes) { } // get group id -unsigned layout::id(ir::value *value) const +unsigned layout::layout_of(ir::value *value) const { return groups_.at(value); } // get values -const std::vector& layout::values(unsigned id) const +const std::vector& layout::values_of(unsigned id) const { return values_.at(id); } // get number of groups -size_t layout::get_num_groups() const +size_t layout::num_layouts() const { return values_.size(); } // connect two values @@ -67,12 +55,8 @@ void layout::connect(ir::value *x, ir::value *y) { std::set_intersection(x_axes.begin(), x_axes.end(), y_axes.begin(), y_axes.end(), std::inserter(common, common.begin())); - if(!common.empty()){ - nodes_.insert(x); - nodes_.insert(y); - dependencies_[x].insert(y); - dependencies_[y].insert(x); - } + if(!common.empty()) + graph_.add_edge(x, y); } // make graph @@ -84,19 +68,16 @@ void layout::make_graph(ir::instruction *i) { } } -// run void layout::run(ir::module &mod) { - nodes_.clear(); - dependencies_.clear(); - groups_.clear(); - values_.clear(); // make graph - ir::for_each_instruction(mod, [this](ir::instruction* i) { make_graph(i); }); + graph_.clear(); + ir::for_each_instruction(mod, [this](ir::instruction* i) { + make_graph(i); + }); // connected components - unsigned group_id = 0; - while(!nodes_.empty()){ - connected_components(*nodes_.begin(), nodes_, dependencies_, group_id++); - } + values_.clear(); + groups_.clear(); + graph_.connected_components(&values_, &groups_); } } diff --git a/lib/codegen/analysis/liveness.cc b/lib/codegen/analysis/liveness.cc index 297f31d92..4ca9bf96f 100644 --- a/lib/codegen/analysis/liveness.cc +++ b/lib/codegen/analysis/liveness.cc @@ -53,42 +53,20 @@ void liveness::extract_double_bufferable(ir::instruction *i) { void liveness::make_graph(ir::instruction *i) { if(has_double(i)){ ir::value *latch = double_[i].latch; - nodes_.insert(i); - nodes_.insert(latch); - graph_[i].insert(latch); - graph_[latch].insert(i); + graph_.add_edge(i, latch); } - if(i->get_id() == ir::INST_PHI){ - ir::phi_node* phi = (ir::phi_node*)i; - for(ir::value* op: phi->ops()){ + if(storage_info.at(i->get_id()).first == SHARED){ + graph_.add_edge(i, i); + for(ir::value* op: i->ops()){ auto* iop = dynamic_cast(op); if(!iop || storage_info.at(iop->get_id()).first != SHARED) continue; - nodes_.insert(phi); - nodes_.insert(op); - graph_[phi].insert(op); - graph_[op].insert(phi); + graph_.add_edge(i, op); } } - if(i->get_id() == 
ir::INST_TRANS){ - nodes_.insert(i); - nodes_.insert(i->get_operand(0)); - graph_[i].insert(i->get_operand(0)); - graph_[i->get_operand(0)].insert(i); - } } // connected components -void liveness::connected_components(node_t x, std::set &nodes, graph_t &graph, buffer_t* buffer) { - groups_[x] = buffer; - values_[buffer].push_back(x); - if(nodes.find(x) != nodes.end()){ - nodes.erase(x); - for(const node_t &y: graph[x]) - connected_components(y, nodes, graph, buffer); - } -} - bool is_trans(ir::value *v) { if(dynamic_cast(v)) { return true; @@ -121,12 +99,14 @@ bool liveness::do_pad(ir::value *x) { pad_[b] = std::max(pad_[b], (24 - b_shapes[b_row ? 1 : 0]) % 32); return a_previous != pad_[a] || b_previous != pad_[b]; } + // padding for trans if(auto* trans = dynamic_cast(x)) { ir::value *op = trans->get_operand(0); size_t previous = pad_[op]; pad_[op] = std::max(pad_[op], pad_[x]); return previous != pad_[op]; } + // padding for copy to shared if(auto* cts = dynamic_cast(x)) { auto cts_order = tiles_->order(cts); ir::value *arg = cts->get_operand(0); @@ -187,7 +167,7 @@ void liveness::run(ir::module &mod) { indices.clear(); pad_.clear(); intervals_.clear(); - parents_.clear(); + graph_.clear(); // Create set of pair of values that can be double-buffered ir::for_each_instruction(mod, [this](ir::instruction* i) { @@ -209,12 +189,16 @@ void liveness::run(ir::module &mod) { }); // connected components - unsigned group_id = 0; - while(!nodes_.empty()){ - buffer_t* buffer = new buffer_t{group_id++}; - connected_components(*nodes_.begin(), nodes_, graph_, buffer); - for(ir::value *v: values_.at(buffer)) + tools::graph::cmap_t cmap; + tools::graph::nmap_t nmap; + graph_.connected_components(&cmap, &nmap); + for(auto x: cmap) { + buffer_t* buffer = new buffer_t{x.first}; + values_[buffer] = x.second; + for(ir::value *v: x.second){ buffer->size = std::max(buffer->size, num_bytes(v)); + groups_[v] = buffer; + } } // Assigns index to each instruction @@ -245,6 +229,8 @@ void liveness::run(ir::module &mod) { intervals_[x.first] = segment{start, end}; } + + } } diff --git a/lib/codegen/analysis/tiles.cc b/lib/codegen/analysis/tiles.cc index 3d414f723..11757cb87 100644 --- a/lib/codegen/analysis/tiles.cc +++ b/lib/codegen/analysis/tiles.cc @@ -74,7 +74,7 @@ bool is_hmma_b_row(ir::value* v) { layout_t tiles::hmma(ir::value *value) { - return hmma_.at(layout_->id(value)); + return hmma_.at(layout_->layout_of(value)); } int tiles::mts(ir::value *value, unsigned ax) { @@ -94,7 +94,7 @@ int tiles::wpt(ir::value *value, unsigned ax) { } std::vector tiles::order(ir::value *v) { - auto ret = order_[layout_->id(v)]; + auto ret = order_[layout_->layout_of(v)]; return ret; } @@ -201,7 +201,9 @@ bool tiles::is_trans(ir::value *v) { void tiles::run(ir::module &) { hmma_.clear(); largest_.clear(); - size_t num_groups = layout_->get_num_groups(); + order_.clear(); + + size_t num_groups = layout_->num_layouts(); // helpers auto rank = [](ir::value* v) { ir::type *ty = v->get_type(); @@ -213,7 +215,7 @@ void tiles::run(ir::module &) { }; // find out which groups require hmma layout for(size_t i = 0; i < num_groups; i++) { - const auto& values = layout_->values(i); + const auto& values = layout_->values_of(i); bool hmma_c = std::any_of(values.begin(), values.end(), &is_hmma_c); if(hmma_c) hmma_[i] = HMMA_C; else hmma_[i] = SCANLINE; @@ -221,7 +223,7 @@ void tiles::run(ir::module &) { } // find out which value is the largest in each group for(size_t i = 0; i < num_groups; i++) { - const auto& values = layout_->values(i); + 
const auto& values = layout_->values_of(i); auto cmp = [&rank](ir::value* x, ir::value *y) { return rank(x) < rank(y); }; largest_[i] = *std::max_element(values.begin(), values.end(), cmp); } @@ -230,7 +232,7 @@ void tiles::run(ir::module &) { // find out the layout ordering of a group for(size_t i = 0; i < num_groups; i++){ std::set io; - for(ir::value* v: layout_->values(i)) + for(ir::value* v: layout_->values_of(i)) extract_io_use(v, io); auto cmp = [&rank](ir::io_inst* x, ir::io_inst *y) { return rank(x->get_pointer_operand()) < rank(y->get_pointer_operand()); @@ -249,27 +251,27 @@ void tiles::run(ir::module &) { // matrix multiplication optimizations for(size_t i = 0; i < num_groups; i++){ std::vector dots; - for(ir::value* v: layout_->values(i)) + for(ir::value* v: layout_->values_of(i)) if(auto *x = dynamic_cast(v)) dots.push_back(x); for(ir::dot_inst* dot: dots){ ir::value* a = dot->get_operand(0); ir::value* b = dot->get_operand(1); - if(hmma_.at(layout_->id(dot)) == HMMA_C){ - auto a_val = layout_->values(layout_->id(a)); - auto b_val = layout_->values(layout_->id(b)); + if(hmma_.at(layout_->layout_of(dot)) == HMMA_C){ + auto a_val = layout_->values_of(layout_->layout_of(a)); + auto b_val = layout_->values_of(layout_->layout_of(b)); for(ir::value *v: a_val) if(auto *cts = dynamic_cast(v)) - order_[layout_->id(a)] = order_[layout_->id(cts->get_operand(0))]; + order_[layout_->layout_of(a)] = order_[layout_->layout_of(cts->get_operand(0))]; for(ir::value *v: b_val) if(auto *cts = dynamic_cast(v)) - order_[layout_->id(b)] = order_[layout_->id(cts->get_operand(0))]; + order_[layout_->layout_of(b)] = order_[layout_->layout_of(cts->get_operand(0))]; } else{ std::vector col = {0, 1}; std::vector row = {1, 0}; - order_[layout_->id(a)] = is_trans(a) ? row : col; - order_[layout_->id(b)] = is_trans(b) ? col : row; + order_[layout_->layout_of(a)] = is_trans(a) ? row : col; + order_[layout_->layout_of(b)] = is_trans(b) ? 
col : row; } } } diff --git a/lib/codegen/transform/coalesce.cc b/lib/codegen/transform/coalesce.cc index d349e5b11..c5d356d31 100644 --- a/lib/codegen/transform/coalesce.cc +++ b/lib/codegen/transform/coalesce.cc @@ -67,10 +67,10 @@ ir::value* coalesce::rematerialize(ir::value *x, ir::builder &builder, void coalesce::run(ir::module &mod) { // find values to rematerialize - size_t num_groups = layout_->get_num_groups(); + size_t num_groups = layout_->num_layouts(); std::vector remat; for(size_t id = 0; id < num_groups; id++) { - const auto& values = layout_->values(id); + const auto& values = layout_->values_of(id); // extract pointers used in ld/st operations std::set io; for(ir::value *v: values) diff --git a/lib/codegen/transform/reassociate.cc b/lib/codegen/transform/reassociate.cc index c2b9d2d4b..7f9427aa3 100644 --- a/lib/codegen/transform/reassociate.cc +++ b/lib/codegen/transform/reassociate.cc @@ -269,7 +269,10 @@ void reassociate::run(ir::module &mod) { it++; builder.set_insert_point(*it); } - ir::value *neg_off = builder.create_neg(off); + ir::value *_0 = builder.get_int32(0); + if(off->get_type()->is_tile_ty()) + _0 = builder.create_splat(_0, off->get_type()->get_tile_shapes()); + ir::value *neg_off = builder.create_sub(_0, off); ir::value *pz_dyn = builder.create_gep(pz, {neg_off}); phi_dyn->add_incoming(pz_dyn, phi->get_incoming_block(idx_z)); infos[phi_sta].dyn_ptr = phi_dyn; diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 19c55a0a1..6af64c105 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -200,11 +200,9 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c llvm::LLVMContext ctx; std::unique_ptr llvm(new llvm::Module(module.get_name(), ctx)); // create passes - codegen::transform::cts cts; codegen::analysis::align align; codegen::analysis::axes axes; codegen::analysis::layout layouts(&axes); - codegen::transform::coalesce coalesce(&align, &layouts); codegen::analysis::tiles tiles(opt.num_warps, &align, &axes, &layouts); codegen::analysis::liveness liveness(&tiles); codegen::analysis::allocation allocation(&liveness, &tiles); @@ -212,11 +210,12 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::transform::dce dce; codegen::transform::peephole peephole; codegen::transform::reassociate reassociate(&align); - codegen::selection selection(&liveness, &allocation, &tiles, &align, &axes, &layouts, &coalesce, target.get(), opt.num_warps); + codegen::transform::coalesce coalesce(&align, &layouts); + codegen::transform::cts cts; + codegen::selection selection(&liveness, &allocation, &tiles, &align, &axes, &layouts, target.get(), opt.num_warps); // run passes peephole.run(module); dce.run(module); -// ir::print(module, std::cout); align.run(module); cts.run(module); axes.run(module); @@ -225,11 +224,15 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c dce.run(module); align.run(module); dce.run(module); - tiles.run(module); reassociate.run(module); +// ir::print(module, std::cout); +// exit(EXIT_FAILURE); dce.run(module); cts.run(module); -// ir::print(module, std::cout); + align.run(module); + axes.run(module); + layouts.run(module); + tiles.run(module); liveness.run(module); allocation.run(module); if(allocation.allocated_size() > context->device()->max_shared_memory()) @@ -238,10 +241,9 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c dce.run(module); axes.run(module); layouts.run(module); - align.run(module); -// ir::print(module, std::cout); - tiles.run(module); 
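// ---- [editorial aside: standalone sketch, not part of this patch] ---------
// The reordering above follows one invariant: transforms that mutate the IR
// (cts, coalesce, reassociate, dce) invalidate the analyses (axes, layouts,
// tiles), so each analysis is recomputed before its next consumer. A minimal
// driver enforcing this, assuming a hypothetical pass interface:
#include <functional>
#include <vector>

struct pass_t {
  std::function<void()> run;
  bool mutates_ir;  // transforms invalidate previously computed analyses
};

inline void run_pipeline(std::vector<pass_t> &pipeline,
                         std::vector<pass_t> &analyses) {
  for(pass_t &p: pipeline) {
    p.run();
    if(p.mutates_ir)            // IR changed: refresh every analysis
      for(pass_t &a: analyses)
        a.run();
  }
}
// ---- [end editorial aside] -------------------------------------------------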
// ir::print(module, std::cout); + align.run(module); + tiles.run(module); selection.run(module, *llvm); // return binary std::unique_ptr res(driver::module::create(context, std::move(llvm))); diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index 927f0044b..c87e1c938 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -34,7 +34,7 @@ int main() { for(const auto& c: configs){ std::tie(ord, AT, BT, M, N, K) = c; std::cout << "// " << c << std::flush; - for(auto perf: bench_dot(stream, HALF, AT, BT, M, N, K, ord, ord)) + for(auto perf: bench_dot(stream, FLOAT, AT, BT, M, N, K, ord, ord)) std::cout << ", " << perf << std::flush; std::cout << std::endl; } diff --git a/tests/common/dot.h b/tests/common/dot.h index 00d605f5d..e87470edb 100644 --- a/tests/common/dot.h +++ b/tests/common/dot.h @@ -109,10 +109,10 @@ bool triton_dot(drv::stream* stream, bool AT, bool BT, opt.num_warps = {nwarp}; } if(mode == BENCH) { - opt.defines.push_back({"TM", {"128"}}); - opt.defines.push_back({"TN", {"128"}}); - opt.defines.push_back({"TK", {"16"}}); - opt.num_warps = {4}; + opt.defines.push_back({"TM", {"64", "128"}}); + opt.defines.push_back({"TN", {"64", "128"}}); + opt.defines.push_back({"TK", {"8"}}); + opt.num_warps = {2, 4, 8}; } // kernels diff --git a/tests/unit/dot.cc b/tests/unit/dot.cc index 59b556858..af7b509e0 100644 --- a/tests/unit/dot.cc +++ b/tests/unit/dot.cc @@ -13,7 +13,7 @@ int main() { for(int TM: std::vector{32, 64}) for(int TN: std::vector{32, 64}) for(int TK: std::vector{8}) - for(int nwarps: std::vector{1, 2, 4, 8}) + for(int nwarps: std::vector{1, 4}) for(bool AT: std::array{false, true}) for(bool BT: std::array{false, true}){ configs.push_back(config_t{FLOAT, AT, BT, 128, 128, 128, TM, TN, TK, nwarps}); @@ -29,7 +29,6 @@ int main() { std::cout << " Pass! " << std::endl; else{ std::cout << " Fail! 
" << std::endl; - exit(EXIT_FAILURE); } } } From 254ed52958f5e7b2df8f3a98fdfd00830eb325ec Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 8 Oct 2019 11:26:22 -0400 Subject: [PATCH 428/494] [codegen] more cleaning --- include/triton/codegen/analysis/axes.h | 4 +++- include/triton/codegen/analysis/layout.h | 2 -- lib/codegen/analysis/axes.cc | 9 +++++++- lib/codegen/analysis/layout.cc | 26 +++++----------------- lib/codegen/analysis/tiles.cc | 28 ++++++++++++------------ lib/codegen/selection.cc | 12 +++++----- 6 files changed, 37 insertions(+), 44 deletions(-) diff --git a/include/triton/codegen/analysis/axes.h b/include/triton/codegen/analysis/axes.h index 453015ab8..701abe04d 100644 --- a/include/triton/codegen/analysis/axes.h +++ b/include/triton/codegen/analysis/axes.h @@ -35,7 +35,9 @@ private: public: axes(); void run(ir::module &mod); - unsigned get_id(ir::value *value, unsigned dim); + // accessors + int get(ir::value *value, unsigned dim); + std::vector get(ir::value *value); private: tools::graph graph_; diff --git a/include/triton/codegen/analysis/layout.h b/include/triton/codegen/analysis/layout.h index 93fd54437..f462211e8 100644 --- a/include/triton/codegen/analysis/layout.h +++ b/include/triton/codegen/analysis/layout.h @@ -28,8 +28,6 @@ private: // graph creation void connect(ir::value *x, ir::value *y); void make_graph(ir::instruction *i); - // list the axes of the given value - std::set axes_of(ir::value *value); public: // constructor diff --git a/lib/codegen/analysis/axes.cc b/lib/codegen/analysis/axes.cc index 9913cda3f..dec2a4e88 100644 --- a/lib/codegen/analysis/axes.cc +++ b/lib/codegen/analysis/axes.cc @@ -121,10 +121,17 @@ void axes::update_graph(ir::instruction *i) { } -unsigned axes::get_id(ir::value *value, unsigned dim) { +int axes::get(ir::value *value, unsigned dim) { return axes_.at({value, dim}); } +std::vector axes::get(ir::value *value) { + std::vector result; + for(size_t d = 0; d < value->get_type()->get_tile_rank(); d++) + result.push_back(this->get(value, d)); + return result; +} + void axes::run(ir::module &mod) { // make graph graph_.clear(); diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc index 8270255a7..2397df489 100644 --- a/lib/codegen/analysis/layout.cc +++ b/lib/codegen/analysis/layout.cc @@ -11,20 +11,6 @@ namespace codegen{ namespace analysis{ -// axes -std::set layout::axes_of(ir::value *value) { - auto ty = value->get_type(); - // rank of value - size_t rank = 0; - if(ty->is_tile_ty()) - rank = ty->get_tile_rank(); - // create result - std::set result; - for(size_t d = 0; d < rank; d++) - result.insert(axes_->get_id(value, d)); - return result; -} - // constructor layout::layout(analysis::axes *axes) : axes_(axes) { } @@ -49,11 +35,13 @@ void layout::connect(ir::value *x, ir::value *y) { return; if(!y->get_type()->is_tile_ty()) return; - std::set x_axes = axes_of(x); - std::set y_axes = axes_of(y); + std::vector x_axes = axes_->get(x); + std::vector y_axes = axes_->get(y); + std::set sx_axes(x_axes.begin(), x_axes.end()); + std::set sy_axes(y_axes.begin(), y_axes.end()); std::set common; - std::set_intersection(x_axes.begin(), x_axes.end(), - y_axes.begin(), y_axes.end(), + std::set_intersection(sx_axes.begin(), sx_axes.end(), + sy_axes.begin(), sy_axes.end(), std::inserter(common, common.begin())); if(!common.empty()) graph_.add_edge(x, y); @@ -75,8 +63,6 @@ void layout::run(ir::module &mod) { make_graph(i); }); // connected components - values_.clear(); - groups_.clear(); 
graph_.connected_components(&values_, &groups_); } diff --git a/lib/codegen/analysis/tiles.cc b/lib/codegen/analysis/tiles.cc index 11757cb87..070bcd8c2 100644 --- a/lib/codegen/analysis/tiles.cc +++ b/lib/codegen/analysis/tiles.cc @@ -78,19 +78,19 @@ layout_t tiles::hmma(ir::value *value) { } int tiles::mts(ir::value *value, unsigned ax) { - return mts_.at(axes_->get_id(value, ax)); + return mts_.at(axes_->get(value, ax)); } int tiles::nts(ir::value *value, unsigned ax) { - return nts_.at(axes_->get_id(value, ax)); + return nts_.at(axes_->get(value, ax)); } int tiles::fpw(ir::value *value, unsigned ax) { - return fpw_.at(axes_->get_id(value, ax)); + return fpw_.at(axes_->get(value, ax)); } int tiles::wpt(ir::value *value, unsigned ax) { - return wpt_.at(axes_->get_id(value, ax)); + return wpt_.at(axes_->get(value, ax)); } std::vector tiles::order(ir::value *v) { @@ -127,7 +127,7 @@ void tiles::init_hmma_tile(ir::value *i) { }while(fpw_nm1 != fpw); // store parameters for(unsigned d = 0; d < shapes.size(); d++) - fpw_[axes_->get_id(i, d)] = fpw[d]; + fpw_[axes_->get(i, d)] = fpw[d]; /* warps per tile */ // try to make things as square as possible to maximize data re-use std::vector wpt = {1, 1, 1}; @@ -141,11 +141,11 @@ void tiles::init_hmma_tile(ir::value *i) { }while(wpt_nm1 != wpt); // store parameters for(unsigned d = 0; d < shapes.size(); d++) - wpt_[axes_->get_id(i, d)] = wpt[d]; + wpt_[axes_->get(i, d)] = wpt[d]; /* sanity check */ unsigned effective_num_warps = 1; for(size_t d = 0; d < shapes.size(); d++) - effective_num_warps *= wpt_[axes_->get_id(i, d)]; + effective_num_warps *= wpt_[axes_->get(i, d)]; if(num_warps_ != effective_num_warps) throw std::runtime_error("cannot create a kernel with this amount of warps"); } @@ -157,19 +157,19 @@ void tiles::init_scanline_tile(ir::value *i) { unsigned ld = ord[0]; unsigned num_threads = num_warps_*32; unsigned current = num_threads; - nts_[axes_->get_id(i, ld)] = clamp(size / num_threads, 1, 4); - mts_[axes_->get_id(i, ld)] = clamp(current, 1, shapes[ld] / nts_[axes_->get_id(i, ld)]); - current = current / mts_[axes_->get_id(i, ld)]; + nts_[axes_->get(i, ld)] = clamp(size / num_threads, 1, 4); + mts_[axes_->get(i, ld)] = clamp(current, 1, shapes[ld] / nts_[axes_->get(i, ld)]); + current = current / mts_[axes_->get(i, ld)]; for(size_t d = 1; d < shapes.size(); d++){ ld = ord[d]; - nts_[axes_->get_id(i, ld)] = 1; - mts_[axes_->get_id(i, ld)] = clamp(current, 1, shapes[ld]); - current = current / mts_[axes_->get_id(i, ld)]; + nts_[axes_->get(i, ld)] = 1; + mts_[axes_->get(i, ld)] = clamp(current, 1, shapes[ld]); + current = current / mts_[axes_->get(i, ld)]; } /* sanity check */ unsigned effective_num_threads = 1; for(size_t d = 0; d < shapes.size(); d++) - effective_num_threads *= mts_[axes_->get_id(i, d)]; + effective_num_threads *= mts_[axes_->get(i, d)]; // std::cout << num_threads << " " << effective_num_threads << std::endl; if(num_threads != effective_num_threads) throw std::runtime_error("cannot create a kernel with this amount of warps"); diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index 0b13b8982..b5692844f 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -601,7 +601,7 @@ void selection::init_strided_scan_axes(ir::value *v, IRBuilder<> &builder, Value unsigned offset = n / contiguous[k] * per_block + n % contiguous[k]; idx_list[n] = builder.CreateAdd(scaled_thread_id, builder.getInt32(offset), "idx_" + str_k + "_" + std::to_string(n)); } - axes_[a_axes_->get_id(v, k)] = 
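// ---- [editorial aside: illustrative sketch, not part of this patch] --------
// The offset formula above distributes an axis in "strided scan" fashion:
// assuming scaled_thread_id = t * nts and per_block = mts * nts (as the
// surrounding code suggests), thread t owns the elements
//   t*nts + n%nts + (n/nts)*mts*nts,   n = 0 .. size/mts - 1.
// For mts = 4 threads, nts = 2 contiguous elements, axis size 16:
//   thread 0 -> {0, 1,  8,  9}    thread 1 -> {2, 3, 10, 11}
//   thread 2 -> {4, 5, 12, 13}    thread 3 -> {6, 7, 14, 15}
// ---- [end editorial aside] --------------------------------------------------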
distributed_axis{contiguous[k], idx_list, thread_id[k]};
+ axes_[a_axes_->get(v, k)] = distributed_axis{contiguous[k], idx_list, thread_id[k]};
 }
}
@@ -706,10 +706,10 @@ void selection::init_hmma_axes(ir::value *v, IRBuilder<> &builder, Value *u_thre
 /* axes */
- axes_[a_axes_->get_id(v, 0)] = distributed_axis{1, idx_i, warp_id_0};
- axes_[a_axes_->get_id(v, 1)] = distributed_axis{1, idx_j, warp_id_1};
+ axes_[a_axes_->get(v, 0)] = distributed_axis{1, idx_i, warp_id_0};
+ axes_[a_axes_->get(v, 1)] = distributed_axis{1, idx_j, warp_id_1};
 if(is_batched)
- axes_[a_axes_->get_id(v, 2)] = distributed_axis{1, idx_z, warp_id_2};
+ axes_[a_axes_->get(v, 2)] = distributed_axis{1, idx_z, warp_id_2};
 }
@@ -769,7 +769,7 @@ void selection::create_distributed_tile(ir::value *v, IRBuilder<> &builder) {
 std::vector<distributed_axis> axes(shapes.size());
 for(size_t d = 0; d < shapes.size(); d++){
 if(shapes[d] > 1){
- unsigned x = a_axes_->get_id(v, d);
+ unsigned x = a_axes_->get(v, d);
 axes[d] = axes_.at(x);
 }
 else{
@@ -921,7 +921,7 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn,
 Value *base_ptr = builder.CreateBitCast(sh_mem_ptr_, PointerType::get(res_ty, addr_space));
 for(auto& x: partial) {
 // current element being computed
- Value *lane = axes_.at(a_axes_->get_id(op, axis)).thread_id;
+ Value *lane = axes_.at(a_axes_->get(op, axis)).thread_id;
 Value *&result = x.second;
 indices_t write_idx = x.first;
 write_idx.insert(write_idx.begin() + axis, lane);

From 10ab94d1c5a7ee66418152567ec38b5a098d0f0b Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Tue, 8 Oct 2019 17:10:34 -0400
Subject: [PATCH 429/494] [codegen] added missing file

---
 include/triton/tools/graph.h  | 67 +++++++++++++++++++++++++++++++++++
 lib/codegen/analysis/tiles.cc | 11 +++---
 2 files changed, 72 insertions(+), 6 deletions(-)
 create mode 100644 include/triton/tools/graph.h

diff --git a/include/triton/tools/graph.h b/include/triton/tools/graph.h
new file mode 100644
index 000000000..b53e754cd
--- /dev/null
+++ b/include/triton/tools/graph.h
@@ -0,0 +1,67 @@
+#pragma once
+
+#ifndef _TRITON_TOOLS_THREAD_GRAPH_H_
+#define _TRITON_TOOLS_THREAD_GRAPH_H_
+
+#include <map>
+#include <set>
+#include <vector>
+
+namespace triton {
+namespace tools{
+
+template<class node_t>
+class graph {
+  typedef std::map<node_t, std::set<node_t>> edges_t;
+
+public:
+  typedef std::map<int, std::vector<node_t>> cmap_t;
+  typedef std::map<node_t, int> nmap_t;
+
+private:
+  void connected_components_impl(node_t x, std::set<node_t> &nodes,
+                                 nmap_t* nmap, cmap_t* cmap, int id) const {
+    if(nmap)
+      (*nmap)[x] = id;
+    if(cmap)
+      (*cmap)[id].push_back(x);
+    if(nodes.find(x) != nodes.end()) {
+      nodes.erase(x);
+      for(const node_t &y: edges_.at(x))
+        connected_components_impl(y, nodes, nmap, cmap, id);
+    }
+  }
+
+public:
+  void connected_components(cmap_t *cmap, nmap_t *nmap) const {
+    if(cmap)
+      cmap->clear();
+    if(nmap)
+      nmap->clear();
+    std::set<node_t> nodes = nodes_;
+    unsigned id = 0;
+    while(!nodes.empty())
+      connected_components_impl(*nodes.begin(), nodes, nmap, cmap, id++);
+  }
+
+  void add_edge(node_t x, node_t y) {
+    nodes_.insert(x);
+    nodes_.insert(y);
+    edges_[x].insert(y);
+    edges_[y].insert(x);
+  }
+
+  void clear() {
+    nodes_.clear();
+    edges_.clear();
+  }
+
+private:
+  std::set<node_t> nodes_;
+  edges_t edges_;
+};
+
+}
+}
+
+#endif
diff --git a/lib/codegen/analysis/tiles.cc b/lib/codegen/analysis/tiles.cc
index 070bcd8c2..dcec28a6b 100644
--- a/lib/codegen/analysis/tiles.cc
+++ b/lib/codegen/analysis/tiles.cc
@@ -206,21 +206,20 @@ void tiles::run(ir::module &) {
 size_t num_groups = layout_->num_layouts();
 // helpers
 auto rank = [](ir::value* v) {
-
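// ---- [editorial aside: usage sketch for the tools::graph class added above,
// assuming the interface exactly as shown; not part of this patch] -----------
//   triton::tools::graph<int> g;
//   g.add_edge(0, 1);
//   g.add_edge(1, 2);
//   g.add_edge(3, 3);                        // self-edge: its own component
//   triton::tools::graph<int>::cmap_t cmap;  // component id -> nodes
//   triton::tools::graph<int>::nmap_t nmap;  // node -> component id
//   g.connected_components(&cmap, &nmap);
//   // now nmap[0] == nmap[1] == nmap[2], and nmap[3] is a different id
// ---- [end editorial aside] ---------------------------------------------------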
ir::type *ty = v->get_type(); - size_t ret = 0; - if(ty->is_tile_ty()) - for(int s: ty->get_tile_shapes()) - ret += s > 1; + int ret = 0; + for(int s: v->get_type()->get_tile_shapes()) + ret += s > 1; return ret; }; + // find out which groups require hmma layout for(size_t i = 0; i < num_groups; i++) { const auto& values = layout_->values_of(i); bool hmma_c = std::any_of(values.begin(), values.end(), &is_hmma_c); if(hmma_c) hmma_[i] = HMMA_C; else hmma_[i] = SCANLINE; - } + // find out which value is the largest in each group for(size_t i = 0; i < num_groups; i++) { const auto& values = layout_->values_of(i); From 9bc6df4fd15b5f77c6c25b489997719669c06c77 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 9 Oct 2019 15:05:44 -0400 Subject: [PATCH 430/494] [codegen] more cleaning --- include/triton/codegen/analysis/layout.h | 21 ++- include/triton/codegen/analysis/liveness.h | 4 +- include/triton/codegen/analysis/tiles.h | 16 +- include/triton/codegen/selection.h | 7 +- lib/codegen/analysis/layout.cc | 127 ++++++++++++- lib/codegen/analysis/liveness.cc | 13 +- lib/codegen/analysis/tiles.cc | 204 +++------------------ lib/codegen/selection.cc | 79 ++++---- lib/codegen/transform/coalesce.cc | 3 +- lib/runtime/function.cc | 4 +- 10 files changed, 226 insertions(+), 252 deletions(-) diff --git a/include/triton/codegen/analysis/layout.h b/include/triton/codegen/analysis/layout.h index f462211e8..a9d2d1a77 100644 --- a/include/triton/codegen/analysis/layout.h +++ b/include/triton/codegen/analysis/layout.h @@ -19,6 +19,20 @@ namespace codegen{ namespace analysis{ class axes; +class align; + +enum layout_type_t { + HMMA_884, + SCANLINE +}; + +struct layout_t { + layout_type_t type; + ir::value *i; + std::vector axes; + std::vector shapes; + std::vector order; +}; class layout { typedef ir::value* node_t; @@ -31,19 +45,24 @@ private: public: // constructor - layout(analysis::axes *axes); + layout(analysis::axes *axes, analysis::align *align); // accessors unsigned layout_of(ir::value *value) const; const std::vector& values_of(unsigned id) const; size_t num_layouts() const; + layout_t get(ir::value *v) const; + const std::map& get_all() const; + // execution void run(ir::module &mod); private: analysis::axes* axes_; + analysis::align* align_; tools::graph graph_; std::map groups_; std::map> values_; + std::map layouts_; }; } diff --git a/include/triton/codegen/analysis/liveness.h b/include/triton/codegen/analysis/liveness.h index f082e1cfa..57fc90b81 100644 --- a/include/triton/codegen/analysis/liveness.h +++ b/include/triton/codegen/analysis/liveness.h @@ -22,6 +22,7 @@ namespace analysis{ typedef unsigned slot_index; class tiles; +class layout; struct segment { slot_index start; @@ -72,7 +73,7 @@ private: public: - liveness(tiles *t): tiles_(t){ } + liveness(tiles *t, layout *l): tiles_(t), layouts_(l){ } // padding unsigned get_pad(ir::value *v) const { return pad_.at(v); } // buffer size @@ -92,6 +93,7 @@ public: private: // analysis tiles *tiles_; + layout *layouts_; // stuff has_storage_map_t has_dedicated_storage_; indices_map_t indices; diff --git a/include/triton/codegen/analysis/tiles.h b/include/triton/codegen/analysis/tiles.h index ca1eb0e90..fdc03cee1 100644 --- a/include/triton/codegen/analysis/tiles.h +++ b/include/triton/codegen/analysis/tiles.h @@ -5,6 +5,7 @@ #include #include #include +#include "triton/codegen/analysis/layout.h" namespace triton{ @@ -25,28 +26,22 @@ class axes; class layout; class align; -enum layout_t { - SCANLINE, - HMMA_C -}; class tiles { typedef 
std::map> param_map_t; private: - void init_hmma_tile(ir::value *i); - void init_scanline_tile(ir::value *i); + void init_hmma_tile(const layout_t& layout); + void init_scanline_tile(const layout_t& layout); bool is_trans(ir::value *i); public: tiles(size_t num_warps, analysis::align* align, analysis::axes* axes, analysis::layout* layout); void run(ir::module &mod); - layout_t hmma(ir::value *value); int mts(ir::value *value, unsigned ax); int nts(ir::value *value, unsigned ax); int fpw(ir::value *value, unsigned ax); int wpt(ir::value *value, unsigned ax); - std::vector order(ir::value *v); - const std::map& largest(); + private: // dependencies @@ -56,9 +51,6 @@ private: // number of warps size_t num_warps_; // tile properties - std::map largest_; - std::map> order_; - std::map hmma_; std::map fpw_; std::map wpt_; std::map mts_; diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index b505a6a29..bb03d3521 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -5,6 +5,7 @@ #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/type.h" +#include "triton/codegen/analysis/layout.h" #include "triton/codegen/transform/cts.h" @@ -171,9 +172,9 @@ private: void create_shared_tile(ir::value *v, Builder &builder, Value *sh_mem_ptr); void create_distributed_tile(ir::value *v, Builder &builder); void create_tile(ir::value *v, Builder &builder, std::set &seen, Value *sh_mem_ptr); - void init_strided_scan_axes(ir::value *i, Builder &builder, Value *u_thread_id, Value *u_warp_id); - void init_hmma_axes(ir::value *i, Builder &builder, Value *u_thread_id, Value *u_warp_id); - void init_axes(ir::value *i, Builder &builder, Value *u_thread_id, Value *u_warp_id); + void init_strided_scan_axes(const analysis::layout_t& layout, Builder &builder, Value *u_thread_id, Value *u_warp_id); + void init_hmma_axes(const analysis::layout_t& layout, Builder &builder, Value *u_thread_id, Value *u_warp_id); + void init_axes(const analysis::layout_t& layout, Builder &builder, Value *u_thread_id, Value *u_warp_id); void init_layouts(ir::function *fn, Builder &builder, Value *sh_mem_ptr); // lower scalar instruction diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc index 2397df489..2a446f3ef 100644 --- a/lib/codegen/analysis/layout.cc +++ b/lib/codegen/analysis/layout.cc @@ -1,6 +1,8 @@ #include #include +#include #include "triton/codegen/analysis/axes.h" +#include "triton/codegen/analysis/align.h" #include "triton/codegen/analysis/layout.h" #include "triton/ir/function.h" #include "triton/ir/module.h" @@ -12,8 +14,8 @@ namespace analysis{ // constructor -layout::layout(analysis::axes *axes) - : axes_(axes) { } +layout::layout(analysis::axes *axes, analysis::align *align) + : axes_(axes), align_(align) { } // get group id unsigned layout::layout_of(ir::value *value) const @@ -56,6 +58,51 @@ void layout::make_graph(ir::instruction *i) { } } +// hmma +bool is_hmma_c(ir::value *v){ + bool result = false; + if(auto *x = dynamic_cast(v)){ + ir::value *a = x->get_operand(0); + ir::type *a_ty = a->get_type(); + ir::value *b = x->get_operand(1); + ir::type *b_ty = b->get_type(); + result = a_ty->get_scalar_ty()->is_half_ty() && + b_ty->get_scalar_ty()->is_half_ty(); + } + return result; +} + +layout_t layout::get(ir::value *v) const { + return layouts_.at(groups_.at(v)); +} + +const std::map& layout::get_all() const { + return layouts_; +} + +void extract_io_use(ir::value *v, std::set& result) { + for(ir::user* 
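// ---------------------------------------------------------------------------
// Illustrative sketch: is_hmma_c above marks a value as needing the
// tensor-core (HMMA_884) layout iff it is a dot instruction whose two
// operands are both half precision; the dynamic_cast target was stripped
// from the listing (presumably ir::dot_inst*). A distilled version over
// hypothetical stand-in types:
// ---------------------------------------------------------------------------
#include <algorithm>
#include <vector>

enum layout_type_t { HMMA_884, SCANLINE };

struct value { bool is_dot, a_is_half, b_is_half; };  // stand-in for ir::value

static bool is_hmma_c(const value* v) {
  return v->is_dot && v->a_is_half && v->b_is_half;
}

layout_type_t classify(const std::vector<const value*>& group) {
  // one layout kind per connected component: HMMA if any member qualifies
  bool hmma = std::any_of(group.begin(), group.end(), is_hmma_c);
  return hmma ? HMMA_884 : SCANLINE;
}
// ---------------------------------------------------------------------------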
u: v->get_users()){ + auto i = dynamic_cast(u); + if(i && i->get_pointer_operand() == v) + result.insert(i); + } +} + + +inline bool is_trans(ir::value *v) { + if(dynamic_cast(v)) { + return true; + } + if(auto *phi = dynamic_cast(v)) { + bool result = true; + for(ir::value *op: phi->ops()) + result = result && is_trans(op); + return result; + } + return false; +} + + void layout::run(ir::module &mod) { // make graph graph_.clear(); @@ -64,6 +111,82 @@ void layout::run(ir::module &mod) { }); // connected components graph_.connected_components(&values_, &groups_); + // create layouts + for(const auto& x: values_) { + bool hmma_c = std::any_of(x.second.begin(), x.second.end(), &is_hmma_c); + layouts_[x.first].type = hmma_c ? HMMA_884 : SCANLINE; + + } + + + /* ---- TO CLEAN ---- */ + + size_t num_groups = num_layouts(); + // helpers + auto rank = [this](ir::value* v) { + int ret = 0; + for(int s: v->get_type()->get_tile_shapes()) + ret += s > 1; + return ret; + }; + + // find out which value is the largest in each group + for(const auto& x: values_) { + auto cmp = [&rank](ir::value* x, ir::value *y) { return rank(x) < rank(y); }; + ir::value *largest = *std::max_element(x.second.begin(), x.second.end(), cmp); + layouts_[x.first].axes = axes_->get(largest); + layouts_[x.first].i = largest; + layouts_[x.first].shapes = largest->get_type()->get_tile_shapes(); + } + + + // find out the layout ordering of a group + for(size_t i = 0; i < num_groups; i++){ + std::set io; + for(ir::value* v: values_of(i)) + extract_io_use(v, io); + auto cmp = [&rank](ir::io_inst* x, ir::io_inst *y) { + return rank(x->get_pointer_operand()) < rank(y->get_pointer_operand()); + }; + auto it = std::max_element(io.begin(), io.end(), cmp); + std::vector order(layouts_[i].axes.size()); + std::iota(order.begin(), order.end(), 0); + if(it != io.end()) { + auto max_contiguous = align_->contiguous((*it)->get_pointer_operand()); + std::sort(order.begin(), order.end(), [&](unsigned a, unsigned b) { + return max_contiguous[a] > max_contiguous[b]; } + ); + } + layouts_[i].order = order; + } + // matrix multiplication optimizations + for(size_t i = 0; i < num_groups; i++){ + std::vector dots; + for(ir::value* v: values_of(i)) + if(auto *x = dynamic_cast(v)) + dots.push_back(x); + for(ir::dot_inst* dot: dots){ + ir::value* a = dot->get_operand(0); + ir::value* b = dot->get_operand(1); + if(get(dot).type == HMMA_884){ + auto a_val = values_of(layout_of(a)); + auto b_val = values_of(layout_of(b)); + for(ir::value *v: a_val) + if(auto *cts = dynamic_cast(v)) + layouts_[layout_of(a)].order = layouts_[layout_of(cts->get_operand(0))].order; + for(ir::value *v: b_val) + if(auto *cts = dynamic_cast(v)) + layouts_[layout_of(b)].order = layouts_[layout_of(cts->get_operand(0))].order; + } + else{ + std::vector col = {0, 1}; + std::vector row = {1, 0}; + layouts_[layout_of(a)].order = is_trans(a) ? row : col; + layouts_[layout_of(b)].order = is_trans(b) ? 
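// ---------------------------------------------------------------------------
// Illustrative sketch: the ordering code above sorts a layout's axes by the
// maximum contiguity of the pointers used for loads/stores in the group, so
// the most contiguous axis becomes order[0] (the fastest-moving axis). The
// core argsort, isolated as a hypothetical helper:
// ---------------------------------------------------------------------------
#include <algorithm>
#include <numeric>
#include <vector>

std::vector<unsigned> order_from_contiguity(const std::vector<unsigned>& contig) {
  std::vector<unsigned> order(contig.size());
  std::iota(order.begin(), order.end(), 0);           // start from {0, 1, ...}
  std::sort(order.begin(), order.end(),
            [&](unsigned a, unsigned b) { return contig[a] > contig[b]; });
  return order;
}
// order_from_contiguity({1, 8}) == {1, 0}: axis 1 varies fastest in memory.
// ---------------------------------------------------------------------------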
col : row; + } + } + } + } } diff --git a/lib/codegen/analysis/liveness.cc b/lib/codegen/analysis/liveness.cc index 4ca9bf96f..35c801e8f 100644 --- a/lib/codegen/analysis/liveness.cc +++ b/lib/codegen/analysis/liveness.cc @@ -4,6 +4,7 @@ #include "triton/codegen/instructions.h" #include "triton/codegen/analysis/liveness.h" #include "triton/codegen/analysis/tiles.h" +#include "triton/codegen/analysis/layout.h" #include "triton/codegen/transform/cts.h" #include "triton/ir/basic_block.h" #include "triton/ir/function.h" @@ -89,8 +90,8 @@ bool liveness::do_pad(ir::value *x) { ir::value *b = dot->get_operand(1); size_t a_previous = pad_[a]; size_t b_previous = pad_[b]; - auto a_order = tiles_->order(a); - auto b_order = tiles_->order(b); + auto a_order = layouts_->get(a).order; + auto b_order = layouts_->get(b).order; bool a_row = is_trans(a) ^ (a_order[0] == 1); bool b_row = is_trans(b) ^ (b_order[0] == 1); auto a_shapes = a->get_type()->get_tile_shapes(); @@ -108,9 +109,9 @@ bool liveness::do_pad(ir::value *x) { } // padding for copy to shared if(auto* cts = dynamic_cast(x)) { - auto cts_order = tiles_->order(cts); + auto cts_order = layouts_->get(cts).order; ir::value *arg = cts->get_operand(0); - auto arg_order = tiles_->order(arg); + auto arg_order = layouts_->get(arg).order; size_t previous = pad_[cts]; if(cts_order != arg_order) pad_[cts] = std::max(pad_[cts], 4); @@ -144,7 +145,7 @@ unsigned liveness::num_bytes(ir::value *x) { for(auto x: shapes) num_elements *= x; size_t depth; - if(tiles_->hmma(x)) + if(layouts_->get(x).type == HMMA_884) depth = tiles_->wpt(op, axis); else depth = tiles_->mts(op, axis); @@ -153,7 +154,7 @@ unsigned liveness::num_bytes(ir::value *x) { unsigned num_bytes = x->get_type()->get_primitive_size_in_bits() / 8; unsigned pad = pad_.at(x); if(pad > 0){ - unsigned ld = x->get_type()->get_tile_shapes()[tiles_->order(x)[0]]; + unsigned ld = x->get_type()->get_tile_shapes()[layouts_->get(x).order[0]]; num_bytes += pad * num_bytes / ld; } if(has_double(x)) diff --git a/lib/codegen/analysis/tiles.cc b/lib/codegen/analysis/tiles.cc index dcec28a6b..6a16544e6 100644 --- a/lib/codegen/analysis/tiles.cc +++ b/lib/codegen/analysis/tiles.cc @@ -23,59 +23,7 @@ tiles::tiles(size_t num_warps, analysis::align *align, analysis::axes *axes, ana num_warps_(num_warps), align_(align), axes_(axes), layout_(layout) { } -bool is_hmma_c(ir::value *v){ - bool result = false; - if(auto *x = dynamic_cast(v)){ - ir::value *a = x->get_operand(0); - ir::type *a_ty = a->get_type(); - ir::value *b = x->get_operand(1); - ir::type *b_ty = b->get_type(); - result = a_ty->get_scalar_ty()->is_half_ty() && - b_ty->get_scalar_ty()->is_half_ty(); - } - return result; -} -bool is_hmma_a_col(ir::value* v) { - for(ir::user *u: v->get_users()) - if(is_hmma_c(u)){ - ir::dot_inst* dot = (ir::dot_inst*)u; - if((v == dot->get_operand(0))) - return true; - } -} - -bool is_hmma_a_row(ir::value* v) { - for(ir::user *u: v->get_users()) - if(is_hmma_c(u)){ - ir::dot_inst* dot = (ir::dot_inst*)u; - if((v == dot->get_operand(0))) - return true; - } -} - -bool is_hmma_b_col(ir::value* v) { - for(ir::user *u: v->get_users()) - if(is_hmma_c(u)){ - ir::dot_inst* dot = (ir::dot_inst*)u; - if((v == dot->get_operand(1))) - return true; - } -} - -bool is_hmma_b_row(ir::value* v) { - for(ir::user *u: v->get_users()) - if(is_hmma_c(u)){ - ir::dot_inst* dot = (ir::dot_inst*)u; - if((v == dot->get_operand(1))) - return true; - } -} - - -layout_t tiles::hmma(ir::value *value) { - return hmma_.at(layout_->layout_of(value)); -} int 
tiles::mts(ir::value *value, unsigned ax) { return mts_.at(axes_->get(value, ax)); @@ -93,24 +41,15 @@ int tiles::wpt(ir::value *value, unsigned ax) { return wpt_.at(axes_->get(value, ax)); } -std::vector tiles::order(ir::value *v) { - auto ret = order_[layout_->layout_of(v)]; - return ret; -} - -const std::map& tiles::largest() { - return largest_; -} - unsigned clamp(unsigned x, unsigned lo, unsigned hi) { return std::min(std::max(x, lo), hi); } -void tiles::init_hmma_tile(ir::value *i) { - auto ord = order(i); - auto shapes = i->get_type()->get_tile_shapes(); +void tiles::init_hmma_tile(const layout_t& layout) { + auto ord = layout.order; + auto shapes = layout.i->get_type()->get_tile_shapes(); unsigned shape_0 = shapes[ord[0]]; unsigned shape_1 = shapes[ord[1]]; /* fragments per warp */ @@ -127,7 +66,7 @@ void tiles::init_hmma_tile(ir::value *i) { }while(fpw_nm1 != fpw); // store parameters for(unsigned d = 0; d < shapes.size(); d++) - fpw_[axes_->get(i, d)] = fpw[d]; + fpw_[layout.axes[d]] = fpw[d]; /* warps per tile */ // try to make things as square as possible to maximize data re-use std::vector wpt = {1, 1, 1}; @@ -141,149 +80,48 @@ void tiles::init_hmma_tile(ir::value *i) { }while(wpt_nm1 != wpt); // store parameters for(unsigned d = 0; d < shapes.size(); d++) - wpt_[axes_->get(i, d)] = wpt[d]; + wpt_[layout.axes[d]] = wpt[d]; /* sanity check */ unsigned effective_num_warps = 1; for(size_t d = 0; d < shapes.size(); d++) - effective_num_warps *= wpt_[axes_->get(i, d)]; + effective_num_warps *= wpt_[layout.axes[d]]; if(num_warps_ != effective_num_warps) throw std::runtime_error("cannot create a kernel with this amount of warps"); } -void tiles::init_scanline_tile(ir::value *i) { - auto ord = order(i); - auto shapes = i->get_type()->get_tile_shapes(); - unsigned size = i->get_type()->get_tile_num_elements(); +void tiles::init_scanline_tile(const layout_t& layout) { + auto ord = layout.order; + auto shapes = layout.shapes; + unsigned size = std::accumulate(shapes.begin(), shapes.end(), 1, std::multiplies()); unsigned ld = ord[0]; unsigned num_threads = num_warps_*32; unsigned current = num_threads; - nts_[axes_->get(i, ld)] = clamp(size / num_threads, 1, 4); - mts_[axes_->get(i, ld)] = clamp(current, 1, shapes[ld] / nts_[axes_->get(i, ld)]); - current = current / mts_[axes_->get(i, ld)]; + nts_[layout.axes[ld]] = clamp(size / num_threads, 1, 4); + mts_[layout.axes[ld]] = clamp(current, 1, shapes[ld] / nts_[layout.axes[ld]]); + current = current / mts_[layout.axes[ld]]; for(size_t d = 1; d < shapes.size(); d++){ ld = ord[d]; - nts_[axes_->get(i, ld)] = 1; - mts_[axes_->get(i, ld)] = clamp(current, 1, shapes[ld]); - current = current / mts_[axes_->get(i, ld)]; + nts_[layout.axes[ld]] = 1; + mts_[layout.axes[ld]] = clamp(current, 1, shapes[ld]); + current = current / mts_[layout.axes[ld]]; } /* sanity check */ unsigned effective_num_threads = 1; for(size_t d = 0; d < shapes.size(); d++) - effective_num_threads *= mts_[axes_->get(i, d)]; + effective_num_threads *= mts_[layout.axes[d]]; // std::cout << num_threads << " " << effective_num_threads << std::endl; if(num_threads != effective_num_threads) throw std::runtime_error("cannot create a kernel with this amount of warps"); } -void extract_io_use(ir::value *v, std::set& result) { - for(ir::user* u: v->get_users()){ - auto i = dynamic_cast(u); - if(i && i->get_pointer_operand() == v) - result.insert(i); - } -} - - -bool tiles::is_trans(ir::value *v) { - if(dynamic_cast(v)) { - return true; - } - if(auto *phi = dynamic_cast(v)) { - bool 
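// ---------------------------------------------------------------------------
// Illustrative sketch: init_scanline_tile above spreads num_warps*32 threads
// over the tile. Along the fastest axis each thread first takes up to 4
// contiguous elements (nts), then the thread budget (mts) is spent axis by
// axis in layout order. A runnable trace with concrete numbers:
// ---------------------------------------------------------------------------
#include <algorithm>
#include <cstdio>
#include <vector>

static unsigned clamp(unsigned x, unsigned lo, unsigned hi) {
  return std::min(std::max(x, lo), hi);
}

int main() {
  std::vector<unsigned> shapes = {128, 64};
  std::vector<unsigned> order  = {0, 1};        // axis 0 is fastest
  unsigned num_threads = 4 * 32;                // 4 warps
  unsigned size = shapes[0] * shapes[1];
  std::vector<unsigned> nts(2), mts(2);
  unsigned i = order[0];
  nts[i] = clamp(size / num_threads, 1, 4);            // -> 4 elements/thread
  mts[i] = clamp(num_threads, 1, shapes[i] / nts[i]);  // -> 32 threads
  num_threads /= mts[i];
  i = order[1];
  nts[i] = 1;
  mts[i] = clamp(num_threads, 1, shapes[i]);    // -> remaining 4 threads
  std::printf("mts = {%u, %u}, nts = {%u, %u}\n", mts[0], mts[1], nts[0], nts[1]);
  // prints: mts = {32, 4}, nts = {4, 1}; 32*4 = 128 threads, as required.
}
// ---------------------------------------------------------------------------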
result = true; - for(ir::value *op: phi->ops()) - result = result && is_trans(op); - return result; - } - return false; -} - - void tiles::run(ir::module &) { - hmma_.clear(); - largest_.clear(); - order_.clear(); - - size_t num_groups = layout_->num_layouts(); - // helpers - auto rank = [](ir::value* v) { - int ret = 0; - for(int s: v->get_type()->get_tile_shapes()) - ret += s > 1; - return ret; - }; - - // find out which groups require hmma layout - for(size_t i = 0; i < num_groups; i++) { - const auto& values = layout_->values_of(i); - bool hmma_c = std::any_of(values.begin(), values.end(), &is_hmma_c); - if(hmma_c) hmma_[i] = HMMA_C; - else hmma_[i] = SCANLINE; - } - - // find out which value is the largest in each group - for(size_t i = 0; i < num_groups; i++) { - const auto& values = layout_->values_of(i); - auto cmp = [&rank](ir::value* x, ir::value *y) { return rank(x) < rank(y); }; - largest_[i] = *std::max_element(values.begin(), values.end(), cmp); - } - - - // find out the layout ordering of a group - for(size_t i = 0; i < num_groups; i++){ - std::set io; - for(ir::value* v: layout_->values_of(i)) - extract_io_use(v, io); - auto cmp = [&rank](ir::io_inst* x, ir::io_inst *y) { - return rank(x->get_pointer_operand()) < rank(y->get_pointer_operand()); - }; - auto it = std::max_element(io.begin(), io.end(), cmp); - std::vector order(rank(largest_[i])); - std::iota(order.begin(), order.end(), 0); - if(it != io.end()) { - auto max_contiguous = align_->contiguous((*it)->get_pointer_operand()); - std::sort(order.begin(), order.end(), [&](unsigned a, unsigned b) { - return max_contiguous[a] > max_contiguous[b]; } - ); - } - order_[i] = order; - } - // matrix multiplication optimizations - for(size_t i = 0; i < num_groups; i++){ - std::vector dots; - for(ir::value* v: layout_->values_of(i)) - if(auto *x = dynamic_cast(v)) - dots.push_back(x); - for(ir::dot_inst* dot: dots){ - ir::value* a = dot->get_operand(0); - ir::value* b = dot->get_operand(1); - if(hmma_.at(layout_->layout_of(dot)) == HMMA_C){ - auto a_val = layout_->values_of(layout_->layout_of(a)); - auto b_val = layout_->values_of(layout_->layout_of(b)); - for(ir::value *v: a_val) - if(auto *cts = dynamic_cast(v)) - order_[layout_->layout_of(a)] = order_[layout_->layout_of(cts->get_operand(0))]; - for(ir::value *v: b_val) - if(auto *cts = dynamic_cast(v)) - order_[layout_->layout_of(b)] = order_[layout_->layout_of(cts->get_operand(0))]; - } - else{ - std::vector col = {0, 1}; - std::vector row = {1, 0}; - order_[layout_->layout_of(a)] = is_trans(a) ? row : col; - order_[layout_->layout_of(b)] = is_trans(b) ? 
col : row; - } - } - } // tiling parameters - for(auto x: largest_){ - ir::value *i = x.second; - if(!i->get_type()->is_tile_ty()) - continue; + for(auto x: layout_->get_all()){ /* HMMA parameters*/ - if(hmma_[x.first] == HMMA_C) - init_hmma_tile(i); + if(x.second.type == HMMA_884) + init_hmma_tile(x.second); else - init_scanline_tile(i); + init_scanline_tile(x.second); } } diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index b5692844f..1505bcbc6 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -577,37 +577,36 @@ inline int32_t ceil(int32_t num, int32_t div){ return (num + div - 1)/div; } -void selection::init_strided_scan_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { - auto order = tiles_->order(v); - const auto& shapes = v->get_type()->get_tile_shapes(); +void selection::init_strided_scan_axes(const analysis::layout_t& layout, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { + auto order = layout.order; + const auto& shapes = layout.shapes; size_t dim = shapes.size(); - std::vector contiguous(dim); - std::vector block_size(dim); + std::vector nts(dim); + std::vector mts(dim); for(unsigned i = 0; i < shapes.size(); i++){ - contiguous[i] = tiles_->nts(v, i); - block_size[i] = tiles_->mts(v, i); + nts[i] = tiles_->nts(layout.i, i); + mts[i] = tiles_->mts(layout.i, i); } Value* full_thread_id = builder.CreateAdd(builder.CreateMul(u_warp_id, builder.getInt32(32)), u_thread_id); - std::vector thread_id = delinearize(full_thread_id, order, block_size, builder); + std::vector thread_id = delinearize(full_thread_id, order, mts, builder); // Create axes for(unsigned k = 0; k < dim; k++) { std::string str_k = std::to_string(k); - Value *contiguous_k = builder.getInt32(contiguous[k]); + Value *contiguous_k = builder.getInt32(nts[k]); Value *scaled_thread_id = builder.CreateMul(thread_id[k], contiguous_k); - unsigned per_block = contiguous[k] * block_size[k]; - unsigned per_thread = contiguous[k] * shapes[k] / per_block; + unsigned per_block = nts[k] * mts[k]; + unsigned per_thread = nts[k] * shapes[k] / per_block; std::vector idx_list(per_thread); for(unsigned n = 0 ; n < per_thread; n++){ - unsigned offset = n / contiguous[k] * per_block + n % contiguous[k]; + unsigned offset = n / nts[k] * per_block + n % nts[k]; idx_list[n] = builder.CreateAdd(scaled_thread_id, builder.getInt32(offset), "idx_" + str_k + "_" + std::to_string(n)); } - axes_[a_axes_->get(v, k)] = distributed_axis{contiguous[k], idx_list, thread_id[k]}; + axes_[layout.axes[k]] = distributed_axis{nts[k], idx_list, thread_id[k]}; } } -void selection::init_hmma_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { -// auto order = reorder_->get_order(v); - const auto& shapes = v->get_type()->get_tile_shapes(); +void selection::init_hmma_axes(const analysis::layout_t& layout, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { + const auto& shapes = layout.shapes; if(shapes.size() > 3) throw std::runtime_error("unsupported"); bool is_batched = shapes.size() >= 3; @@ -619,13 +618,13 @@ void selection::init_hmma_axes(ir::value *v, IRBuilder<> &builder, Value *u_thre Value *_16 = builder.getInt32(16); // fragments per warp - unsigned fpw_0 = tiles_->fpw(v, 0); - unsigned fpw_1 = tiles_->fpw(v, 1); - unsigned fpw_2 = is_batched ? tiles_->fpw(v, 2) : 1; + unsigned fpw_0 = tiles_->fpw(layout.i, 0); + unsigned fpw_1 = tiles_->fpw(layout.i, 1); + unsigned fpw_2 = is_batched ? 
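// ---------------------------------------------------------------------------
// Illustrative sketch: init_strided_scan_axes above materializes, for each
// axis, the element indices a given thread owns: nts contiguous elements
// every nts*mts positions, offset by thread_id*nts. Isolated as a
// hypothetical helper:
// ---------------------------------------------------------------------------
#include <vector>

std::vector<unsigned> axis_indices(unsigned thread_id, unsigned shape,
                                   unsigned nts, unsigned mts) {
  unsigned per_block  = nts * mts;              // elements covered per stride
  unsigned per_thread = nts * shape / per_block;
  std::vector<unsigned> idx(per_thread);
  for (unsigned n = 0; n < per_thread; n++)
    idx[n] = thread_id * nts + (n / nts) * per_block + n % nts;
  return idx;
}
// shape 64, nts = 2, mts = 8: thread 0 owns {0,1, 16,17, 32,33, 48,49}.
// ---------------------------------------------------------------------------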
tiles_->fpw(layout.i, 2) : 1; // warps per tile - unsigned wpt_0 = tiles_->wpt(v, 0); - unsigned wpt_1 = tiles_->wpt(v, 1); - unsigned wpt_2 = is_batched ? tiles_->wpt(v, 2) : 1; + unsigned wpt_0 = tiles_->wpt(layout.i, 0); + unsigned wpt_1 = tiles_->wpt(layout.i, 1); + unsigned wpt_2 = is_batched ? tiles_->wpt(layout.i, 2) : 1; // hmma warp tile size unsigned hmma_wts_0 = fpw_0 * 8; unsigned hmma_wts_1 = fpw_1 * 8; @@ -706,18 +705,18 @@ void selection::init_hmma_axes(ir::value *v, IRBuilder<> &builder, Value *u_thre /* axes */ - axes_[a_axes_->get(v, 0)] = distributed_axis{1, idx_i, warp_id_0}; - axes_[a_axes_->get(v, 1)] = distributed_axis{1, idx_j, warp_id_1}; + axes_[layout.axes[0]] = distributed_axis{1, idx_i, warp_id_0}; + axes_[layout.axes[1]] = distributed_axis{1, idx_j, warp_id_1}; if(is_batched) - axes_[a_axes_->get(v, 2)] = distributed_axis{1, idx_z, warp_id_2}; + axes_[layout.axes[2]] = distributed_axis{1, idx_z, warp_id_2}; } -void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { - if(tiles_->hmma(v) == analysis::HMMA_C) - init_hmma_axes(v, builder, u_thread_id, u_warp_id); +void selection::init_axes(const analysis::layout_t& layout, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { + if(layout.type == analysis::HMMA_884) + init_hmma_axes(layout, builder, u_thread_id, u_warp_id); else - init_strided_scan_axes(v, builder, u_thread_id, u_warp_id); + init_strided_scan_axes(layout, builder, u_thread_id, u_warp_id); } /* ------------------- @@ -727,7 +726,7 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id void selection::create_shared_tile(ir::value *v, IRBuilder<> &builder, Value *sh_mem_ptr) { if(tmap_.find(v) != tmap_.end()) return; - auto order = tiles_->order(v); + auto order = layouts_->get(v).order; auto shapes = v->get_type()->get_tile_shapes(); unsigned pad = liveness_->get_pad(v); if(pad > 0) @@ -777,7 +776,7 @@ void selection::create_distributed_tile(ir::value *v, IRBuilder<> &builder) { axes[d].values = {builder.getInt32(0)}; } } - distributed_tile *T = new distributed_tile(ty, shapes, tiles_->order(v), axes, builder, false); + distributed_tile *T = new distributed_tile(ty, shapes, layouts_->get(v).order, axes, builder, false); bool is_inserted = tmap_.insert({v, T}).second; // constant range if(is_inserted && dynamic_cast(v)){ @@ -820,7 +819,7 @@ void selection::init_layouts(ir::function *fn, IRBuilder<> &builder, Value *sh_m Value *u_thread_warp_id = builder.CreateURem(u_thread_id, warp_size); Value *u_warp_id = builder.CreateUDiv(u_thread_id, warp_size); // create grid - for(auto x: tiles_->largest()) + for(auto x: layouts_->get_all()) init_axes(x.second, builder, u_thread_warp_id, u_warp_id); // create tile std::set seen; @@ -868,7 +867,7 @@ void selection::lower_masked_store(ir::masked_store_inst *x, LLVMContext &ctx, F void selection::lower_store(ir::store_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { distributed_tile* ptrs = (distributed_tile*)tmap_.at(x->get_pointer_operand()); tile *scalars = tmap_.at(x->get_value_operand()); -// size_t ld = tiles_->order(x->get_pointer_operand())[0]; +// size_t ld = layouts_->order(x->get_pointer_operand())[0]; // unsigned vector_size = 2; // // vectorize pointers // std::map ptr_packets; @@ -1015,9 +1014,9 @@ void selection::lower_broadcast(ir::broadcast_inst *x, LLVMContext &ctx, Functio void selection::lower_copy_to_shared(ir::copy_to_shared_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { 
unsigned vector_size = 1; - auto x_order = tiles_->order(x); + auto x_order = layouts_->get(x).order; ir::value *arg = x->get_operand(0); - auto arg_order = tiles_->order(arg); + auto arg_order = layouts_->get(arg).order; // tiles shared_tile* result = (shared_tile*)tmap_.at(x); distributed_tile* in = (distributed_tile*)tmap_.at(arg); @@ -1092,8 +1091,8 @@ void selection::lower_hmma_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn Value* u_thread_id = tgt_->get_local_id(builder.GetInsertBlock()->getModule(), builder, 0); - auto ord_a = tiles_->order(dot->get_operand(0)); - auto ord_b = tiles_->order(dot->get_operand(1)); + auto ord_a = layouts_->get(dot->get_operand(0)).order; + auto ord_b = layouts_->get(dot->get_operand(1)).order; bool is_a_trans = is_trans(dot->get_operand(0)); bool is_b_trans = is_trans(dot->get_operand(1)); @@ -1255,7 +1254,7 @@ void selection::lower_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn, IRB if(NK != 1) { shared_tile *TA = (shared_tile*)tmap_.at(A); shared_tile *TB = (shared_tile*)tmap_.at(B); - if(tiles_->hmma(dot) == analysis::HMMA_C) + if(layouts_->get(dot).type == analysis::HMMA_884) lower_hmma_dot(dot, ctx, fn, builder, TC, TA, TB, TD, NK); else lower_scanline_dot(dot, ctx, fn, builder, TC, TA, TB, TD, NK, c_ty, f_mul_add); @@ -1271,7 +1270,7 @@ void selection::lower_masked_load(ir::masked_load_inst *x, LLVMContext &ctx, Fun // find vector size distributed_tile* result = (distributed_tile*)tmap_.at(x); ir::value *ptr = x->get_pointer_operand(); - size_t ld = tiles_->order(ptr)[0]; + size_t ld = layouts_->get(ptr).order[0]; unsigned alignment = alignment_->get(ptr, ld); unsigned vector_size = std::min(result->axis(ld).contiguous, alignment); distributed_tile *pointers = (distributed_tile*)tmap_.at(ptr); @@ -1343,7 +1342,7 @@ void selection::lower_load(ir::load_inst *x, LLVMContext &ctx, Function *fn, IRB distributed_tile* result = (distributed_tile*)tmap_.at(x); // find vector size ir::value *ptr = x->get_pointer_operand(); - size_t ld = tiles_->order(ptr)[0]; + size_t ld = layouts_->get(ptr).order[0]; unsigned alignment = alignment_->get(ptr, ld); unsigned vector_size = std::min(result->axis(ld).contiguous, alignment); distributed_tile *pointers = (distributed_tile*)tmap_.at(ptr); diff --git a/lib/codegen/transform/coalesce.cc b/lib/codegen/transform/coalesce.cc index c5d356d31..8db94ed45 100644 --- a/lib/codegen/transform/coalesce.cc +++ b/lib/codegen/transform/coalesce.cc @@ -83,8 +83,7 @@ void coalesce::run(ir::module &mod) { if(axes.empty()) continue; for(auto it = ++axes.rbegin(); it != axes.rend(); it++) - remat.insert(remat.begin(), - it->second.begin(), it->second.end()); + remat.insert(remat.begin(), it->second.begin(), it->second.end()); } // rematerialize values for(ir::io_inst *r: remat) { diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 6af64c105..806f003d1 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -202,9 +202,9 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c // create passes codegen::analysis::align align; codegen::analysis::axes axes; - codegen::analysis::layout layouts(&axes); + codegen::analysis::layout layouts(&axes, &align); codegen::analysis::tiles tiles(opt.num_warps, &align, &axes, &layouts); - codegen::analysis::liveness liveness(&tiles); + codegen::analysis::liveness liveness(&tiles, &layouts); codegen::analysis::allocation allocation(&liveness, &tiles); codegen::transform::membar barriers(&liveness, &allocation); codegen::transform::dce dce; From 
a3f76b6eb1515f20eb3a51abe7b53b450e359208 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 9 Oct 2019 18:17:48 -0400 Subject: [PATCH 431/494] [codegen] more cleaning --- include/triton/codegen/analysis/allocation.h | 5 +- include/triton/codegen/analysis/layout.h | 15 ++- include/triton/codegen/analysis/liveness.h | 3 +- include/triton/codegen/analysis/tiles.h | 65 ---------- include/triton/codegen/selection.h | 5 +- include/triton/runtime/function.h | 9 -- include/triton/tools/bench.hpp | 2 +- lib/codegen/analysis/allocation.cc | 1 - lib/codegen/analysis/layout.cc | 116 ++++++++++++++--- lib/codegen/analysis/liveness.cc | 5 +- lib/codegen/analysis/tiles.cc | 130 ------------------- lib/codegen/selection.cc | 29 ++--- lib/runtime/function.cc | 27 ++-- 13 files changed, 142 insertions(+), 270 deletions(-) delete mode 100644 include/triton/codegen/analysis/tiles.h delete mode 100644 lib/codegen/analysis/tiles.cc diff --git a/include/triton/codegen/analysis/allocation.h b/include/triton/codegen/analysis/allocation.h index b23f11964..858152150 100644 --- a/include/triton/codegen/analysis/allocation.h +++ b/include/triton/codegen/analysis/allocation.h @@ -24,8 +24,8 @@ class cts; class allocation { public: - allocation(liveness *live, tiles *params) - : liveness_(live), tiles_(params){ } + allocation(liveness *live) + : liveness_(live) { } // accessors bool has_offset(ir::value *x) const { return offsets_.find(x) != offsets_.end(); } unsigned offset(ir::value *x) const { return offsets_.at(x); } @@ -38,7 +38,6 @@ private: size_t allocated_size_; // dependences liveness *liveness_; - tiles *tiles_; }; } diff --git a/include/triton/codegen/analysis/layout.h b/include/triton/codegen/analysis/layout.h index a9d2d1a77..bec751659 100644 --- a/include/triton/codegen/analysis/layout.h +++ b/include/triton/codegen/analysis/layout.h @@ -28,10 +28,13 @@ enum layout_type_t { struct layout_t { layout_type_t type; - ir::value *i; std::vector axes; std::vector shapes; std::vector order; + std::map mts; + std::map nts; + std::map fpw; + std::map wpt; }; class layout { @@ -43,15 +46,18 @@ private: void connect(ir::value *x, ir::value *y); void make_graph(ir::instruction *i); + void init_hmma_tile(layout_t& layout); + void init_scanline_tile(layout_t &layout); + public: // constructor - layout(analysis::axes *axes, analysis::align *align); + layout(analysis::axes *axes, analysis::align *align, size_t num_warps); // accessors unsigned layout_of(ir::value *value) const; const std::vector& values_of(unsigned id) const; size_t num_layouts() const; - layout_t get(ir::value *v) const; - const std::map& get_all() const; + const layout_t& get(ir::value *v) const; + std::map &get_all(); // execution void run(ir::module &mod); @@ -59,6 +65,7 @@ public: private: analysis::axes* axes_; analysis::align* align_; + size_t num_warps_; tools::graph graph_; std::map groups_; std::map> values_; diff --git a/include/triton/codegen/analysis/liveness.h b/include/triton/codegen/analysis/liveness.h index 57fc90b81..b23463f06 100644 --- a/include/triton/codegen/analysis/liveness.h +++ b/include/triton/codegen/analysis/liveness.h @@ -73,7 +73,7 @@ private: public: - liveness(tiles *t, layout *l): tiles_(t), layouts_(l){ } + liveness(layout *l): layouts_(l){ } // padding unsigned get_pad(ir::value *v) const { return pad_.at(v); } // buffer size @@ -92,7 +92,6 @@ public: private: // analysis - tiles *tiles_; layout *layouts_; // stuff has_storage_map_t has_dedicated_storage_; diff --git a/include/triton/codegen/analysis/tiles.h 
b/include/triton/codegen/analysis/tiles.h deleted file mode 100644 index fdc03cee1..000000000 --- a/include/triton/codegen/analysis/tiles.h +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef _TRITON_CODEGEN_ANALYSIS_TILES_H_ -#define _TRITON_CODEGEN_ANALYSIS_TILES_H_ - -#include -#include -#include -#include -#include "triton/codegen/analysis/layout.h" - -namespace triton{ - -namespace ir{ - class value; - class module; - class instruction; - class function; - class metaparameter; - class constant_int; -} - -namespace codegen{ - -namespace analysis{ - -class axes; -class layout; -class align; - - -class tiles { - typedef std::map> param_map_t; -private: - void init_hmma_tile(const layout_t& layout); - void init_scanline_tile(const layout_t& layout); - bool is_trans(ir::value *i); - -public: - tiles(size_t num_warps, analysis::align* align, analysis::axes* axes, analysis::layout* layout); - void run(ir::module &mod); - int mts(ir::value *value, unsigned ax); - int nts(ir::value *value, unsigned ax); - int fpw(ir::value *value, unsigned ax); - int wpt(ir::value *value, unsigned ax); - - -private: - // dependencies - analysis::align* align_; - analysis::layout* layout_; - analysis::axes* axes_; - // number of warps - size_t num_warps_; - // tile properties - std::map fpw_; - std::map wpt_; - std::map mts_; - std::map nts_; -}; - - -} -} -} - -#endif diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index bb03d3521..b20bc6d51 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -210,10 +210,10 @@ private: public: - selection(analysis::liveness* liveness, analysis::allocation *alloc, analysis::tiles *tiles, + selection(analysis::liveness* liveness, analysis::allocation *alloc, analysis::align *alignment, analysis::axes *axes, analysis::layout *layouts, target *tgt, unsigned num_warps) - : liveness_(liveness), alloc_(alloc), tiles_(tiles), + : liveness_(liveness), alloc_(alloc), alignment_(alignment), a_axes_(axes), layouts_(layouts), tgt_(tgt), num_warps_(num_warps){ } @@ -224,7 +224,6 @@ private: tmap_t tmap_; analysis::liveness *liveness_; analysis::allocation *alloc_; - analysis::tiles *tiles_; analysis::axes *a_axes_; analysis::layout *layouts_; analysis::align *alignment_; diff --git a/include/triton/runtime/function.h b/include/triton/runtime/function.h index 88de3825c..c12f9c6ca 100644 --- a/include/triton/runtime/function.h +++ b/include/triton/runtime/function.h @@ -11,15 +11,6 @@ // codegen #include "triton/codegen/selection.h" #include "triton/codegen/target.h" -#include "triton/codegen/analysis/tiles.h" -#include "triton/codegen/analysis/allocation.h" -#include "triton/codegen/analysis/liveness.h" -#include "triton/codegen/analysis/align.h" -#include "triton/codegen/transform/dce.h" -#include "triton/codegen/transform/peephole.h" -#include "triton/codegen/transform/membar.h" -#include "triton/codegen/transform/reassociate.h" -#include "triton/codegen/transform/cts.h" #include "triton/lang/parser.h" #include "triton/runtime/arg.h" diff --git a/include/triton/tools/bench.hpp b/include/triton/tools/bench.hpp index 48a4ab972..554b3bcc3 100644 --- a/include/triton/tools/bench.hpp +++ b/include/triton/tools/bench.hpp @@ -38,7 +38,7 @@ inline double bench(std::function const & op, driver::stream * stream) double total_time = 0; op(); stream->synchronize(); - while(total_time*1e-9 < 1e-2){ + while(total_time*1e-9 < 1e-3){ float norm = 1; // normalize clock if possible to reduce noise in auto-tuning if(auto cu_device = 
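// ---------------------------------------------------------------------------
// Illustrative sketch: the bench.hpp hunk here lowers the sampling budget
// from ~10 ms to ~1 ms of accumulated runtime. The real loop synchronizes a
// driver stream and normalizes by the GPU clock when available; a host-only
// analogue of the loop shape (an assumed simplification, not the actual
// implementation):
// ---------------------------------------------------------------------------
#include <chrono>
#include <cstddef>
#include <functional>
#include <ratio>

double bench(const std::function<void()>& op) {
  using clock = std::chrono::high_resolution_clock;
  op();                                          // warm-up call
  double total_ns = 0;
  std::size_t iters = 0;
  while (total_ns * 1e-9 < 1e-3) {               // stop after ~1 ms of samples
    auto t0 = clock::now();
    op();
    total_ns += std::chrono::duration<double, std::nano>(clock::now() - t0).count();
    iters++;
  }
  return total_ns / iters;                       // mean time per call, in ns
}
// ---------------------------------------------------------------------------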
dynamic_cast(stream->context()->device())) diff --git a/lib/codegen/analysis/allocation.cc b/lib/codegen/analysis/allocation.cc index 91ca0868f..0fde814f3 100644 --- a/lib/codegen/analysis/allocation.cc +++ b/lib/codegen/analysis/allocation.cc @@ -3,7 +3,6 @@ #include "triton/codegen/analysis/allocation.h" #include "triton/codegen/analysis/liveness.h" #include "triton/codegen/transform/cts.h" -#include "triton/codegen/analysis/tiles.h" #include "triton/ir/basic_block.h" #include "triton/ir/type.h" #include "triton/ir/value.h" diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc index 2a446f3ef..4b52f9e3b 100644 --- a/lib/codegen/analysis/layout.cc +++ b/lib/codegen/analysis/layout.cc @@ -14,8 +14,8 @@ namespace analysis{ // constructor -layout::layout(analysis::axes *axes, analysis::align *align) - : axes_(axes), align_(align) { } +layout::layout(analysis::axes *axes, analysis::align *align, size_t num_warps) + : axes_(axes), align_(align), num_warps_(num_warps) { } // get group id unsigned layout::layout_of(ir::value *value) const @@ -72,19 +72,19 @@ bool is_hmma_c(ir::value *v){ return result; } -layout_t layout::get(ir::value *v) const { +const layout_t &layout::get(ir::value *v) const { return layouts_.at(groups_.at(v)); } -const std::map& layout::get_all() const { +std::map& layout::get_all() { return layouts_; } -void extract_io_use(ir::value *v, std::set& result) { +void extract_io_use(ir::value *v, std::set& result) { for(ir::user* u: v->get_users()){ auto i = dynamic_cast(u); if(i && i->get_pointer_operand() == v) - result.insert(i); + result.insert(v); } } @@ -102,6 +102,75 @@ inline bool is_trans(ir::value *v) { return false; } +inline unsigned clamp(unsigned x, unsigned lo, unsigned hi) { + return std::min(std::max(x, lo), hi); +} + +void layout::init_hmma_tile(layout_t& layout) { + auto ord = layout.order; + auto shapes = layout.shapes; + unsigned shape_0 = shapes[ord[0]]; + unsigned shape_1 = shapes[ord[1]]; + /* fragments per warp */ + // try to make things as square as possible to maximize data re-use + std::vector fpw = {1, 1, 1}; + std::vector fpw_nm1; + unsigned num_fragments = std::min((shape_0/8)*(shape_1/8), 4); + do { + fpw_nm1 = fpw; + if(fpw[0]*fpw[1] < num_fragments) + fpw[0] = clamp(fpw[0]*2, 1, shape_0 / 8); + if(fpw[0]*fpw[1] < num_fragments) + fpw[1] = clamp(fpw[1]*2, 1, shape_1 / 8); + }while(fpw_nm1 != fpw); + // store parameters + for(unsigned d = 0; d < shapes.size(); d++) + layout.fpw[d] = fpw[d]; + /* warps per tile */ + // try to make things as square as possible to maximize data re-use + std::vector wpt = {1, 1, 1}; + std::vector wpt_nm1; + do{ + wpt_nm1 = wpt; + if(wpt[0] * wpt[1] * wpt[2] < num_warps_) + wpt[0] = clamp(wpt[0]*2, 1, shape_0 / (fpw[0]*8)); + if(wpt[0] * wpt[1] * wpt[2] < num_warps_) + wpt[1] = clamp(wpt[1]*2, 1, shape_1 / (fpw[1]*8)); + }while(wpt_nm1 != wpt); + // store parameters + for(unsigned d = 0; d < shapes.size(); d++) + layout.wpt[d] = wpt[d]; + /* sanity check */ + unsigned effective_num_warps = 1; + for(size_t d = 0; d < shapes.size(); d++) + effective_num_warps *= layout.wpt[d]; + if(num_warps_ != effective_num_warps) + throw std::runtime_error("cannot create a kernel with this amount of warps"); +} + +void layout::init_scanline_tile(layout_t& layout) { + auto ord = layout.order; + auto shapes = layout.shapes; + unsigned size = std::accumulate(shapes.begin(), shapes.end(), 1, std::multiplies()); + unsigned ld = ord[0]; + unsigned num_threads = num_warps_*32; + unsigned current = num_threads; + 
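// ---------------------------------------------------------------------------
// Illustrative sketch: init_hmma_tile above grows fpw (fragments per warp)
// and wpt (warps per tile) from {1, 1} by alternately doubling each
// dimension, clamped to what the tile shape allows, until the budget is
// met; this keeps the split as square as possible to maximize data re-use.
// The search, isolated with hypothetical names:
// ---------------------------------------------------------------------------
#include <algorithm>
#include <vector>

std::vector<unsigned> square_split(unsigned budget, unsigned max0, unsigned max1) {
  auto clamp = [](unsigned x, unsigned lo, unsigned hi) {
    return std::min(std::max(x, lo), hi);
  };
  std::vector<unsigned> v = {1, 1}, prev;
  do {
    prev = v;
    if (v[0] * v[1] < budget) v[0] = clamp(v[0] * 2, 1, max0);
    if (v[0] * v[1] < budget) v[1] = clamp(v[1] * 2, 1, max1);
  } while (prev != v);  // fixed point: budget met or shapes exhausted
  return v;
}
// square_split(4, 8, 8) == {2, 2}; square_split(8, 2, 8) == {2, 4}.
// ---------------------------------------------------------------------------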
layout.nts[ld] = clamp(size / num_threads, 1, 4); + layout.mts[ld] = clamp(current, 1, shapes[ld] / layout.nts[ld]); + current = current / layout.mts[ld]; + for(size_t d = 1; d < shapes.size(); d++){ + ld = ord[d]; + layout.nts[ld] = 1; + layout.mts[ld] = clamp(current, 1, shapes[ld]); + current = current / layout.mts[ld]; + } + /* sanity check */ + unsigned effective_num_threads = 1; + for(size_t d = 0; d < shapes.size(); d++) + effective_num_threads *= layout.mts[d]; + if(num_threads != effective_num_threads) + throw std::runtime_error("cannot create a kernel with this amount of warps"); +} void layout::run(ir::module &mod) { // make graph @@ -114,8 +183,8 @@ void layout::run(ir::module &mod) { // create layouts for(const auto& x: values_) { bool hmma_c = std::any_of(x.second.begin(), x.second.end(), &is_hmma_c); + // type layouts_[x.first].type = hmma_c ? HMMA_884 : SCANLINE; - } @@ -130,35 +199,32 @@ void layout::run(ir::module &mod) { return ret; }; - // find out which value is the largest in each group + // find out axes for each layout for(const auto& x: values_) { auto cmp = [&rank](ir::value* x, ir::value *y) { return rank(x) < rank(y); }; ir::value *largest = *std::max_element(x.second.begin(), x.second.end(), cmp); layouts_[x.first].axes = axes_->get(largest); - layouts_[x.first].i = largest; layouts_[x.first].shapes = largest->get_type()->get_tile_shapes(); } // find out the layout ordering of a group - for(size_t i = 0; i < num_groups; i++){ - std::set io; - for(ir::value* v: values_of(i)) - extract_io_use(v, io); - auto cmp = [&rank](ir::io_inst* x, ir::io_inst *y) { - return rank(x->get_pointer_operand()) < rank(y->get_pointer_operand()); - }; - auto it = std::max_element(io.begin(), io.end(), cmp); - std::vector order(layouts_[i].axes.size()); + for(const auto& x: values_) { + std::set ptr; + for(ir::value* v: x.second) + extract_io_use(v, ptr); + size_t rank = layouts_[x.first].axes.size(); + std::vector order(rank); std::iota(order.begin(), order.end(), 0); - if(it != io.end()) { - auto max_contiguous = align_->contiguous((*it)->get_pointer_operand()); + for(ir::value *v: ptr){ + auto max_contiguous = align_->contiguous(v); std::sort(order.begin(), order.end(), [&](unsigned a, unsigned b) { return max_contiguous[a] > max_contiguous[b]; } ); } - layouts_[i].order = order; + layouts_[x.first].order = order; } + // matrix multiplication optimizations for(size_t i = 0; i < num_groups; i++){ std::vector dots; @@ -187,6 +253,14 @@ void layout::run(ir::module &mod) { } } + // tiling parameters + for(auto& x: layouts_){ + /* HMMA parameters*/ + if(x.second.type == HMMA_884) + init_hmma_tile(x.second); + else + init_scanline_tile(x.second); + } } } diff --git a/lib/codegen/analysis/liveness.cc b/lib/codegen/analysis/liveness.cc index 35c801e8f..d85271553 100644 --- a/lib/codegen/analysis/liveness.cc +++ b/lib/codegen/analysis/liveness.cc @@ -3,7 +3,6 @@ #include #include "triton/codegen/instructions.h" #include "triton/codegen/analysis/liveness.h" -#include "triton/codegen/analysis/tiles.h" #include "triton/codegen/analysis/layout.h" #include "triton/codegen/transform/cts.h" #include "triton/ir/basic_block.h" @@ -146,9 +145,9 @@ unsigned liveness::num_bytes(ir::value *x) { num_elements *= x; size_t depth; if(layouts_->get(x).type == HMMA_884) - depth = tiles_->wpt(op, axis); + depth = layouts_->get(op).wpt.at(axis); else - depth = tiles_->mts(op, axis); + depth = layouts_->get(op).mts.at(axis); return num_elements * num_bytes * depth; } unsigned num_bytes = 
x->get_type()->get_primitive_size_in_bits() / 8; diff --git a/lib/codegen/analysis/tiles.cc b/lib/codegen/analysis/tiles.cc deleted file mode 100644 index 6a16544e6..000000000 --- a/lib/codegen/analysis/tiles.cc +++ /dev/null @@ -1,130 +0,0 @@ -#include -#include -#include -#include "triton/codegen/analysis/align.h" -#include "triton/codegen/analysis/axes.h" -#include "triton/codegen/analysis/tiles.h" -#include "triton/codegen/analysis/layout.h" -#include "triton/ir/instructions.h" -#include "triton/ir/type.h" -#include "triton/ir/module.h" -#include "triton/ir/function.h" -#include "triton/ir/context_impl.h" -#include "triton/ir/constant.h" -#include "triton/driver/device.h" - - - -namespace triton{ -namespace codegen{ -namespace analysis{ - -tiles::tiles(size_t num_warps, analysis::align *align, analysis::axes *axes, analysis::layout *layout): - num_warps_(num_warps), align_(align), axes_(axes), layout_(layout) -{ } - - - -int tiles::mts(ir::value *value, unsigned ax) { - return mts_.at(axes_->get(value, ax)); -} - -int tiles::nts(ir::value *value, unsigned ax) { - return nts_.at(axes_->get(value, ax)); -} - -int tiles::fpw(ir::value *value, unsigned ax) { - return fpw_.at(axes_->get(value, ax)); -} - -int tiles::wpt(ir::value *value, unsigned ax) { - return wpt_.at(axes_->get(value, ax)); -} - - -unsigned clamp(unsigned x, unsigned lo, unsigned hi) { - return std::min(std::max(x, lo), hi); -} - - -void tiles::init_hmma_tile(const layout_t& layout) { - auto ord = layout.order; - auto shapes = layout.i->get_type()->get_tile_shapes(); - unsigned shape_0 = shapes[ord[0]]; - unsigned shape_1 = shapes[ord[1]]; - /* fragments per warp */ - // try to make things as square as possible to maximize data re-use - std::vector fpw = {1, 1, 1}; - std::vector fpw_nm1; - unsigned num_fragments = std::min((shape_0/8)*(shape_1/8), 4); - do { - fpw_nm1 = fpw; - if(fpw[0]*fpw[1] < num_fragments) - fpw[0] = clamp(fpw[0]*2, 1, shape_0 / 8); - if(fpw[0]*fpw[1] < num_fragments) - fpw[1] = clamp(fpw[1]*2, 1, shape_1 / 8); - }while(fpw_nm1 != fpw); - // store parameters - for(unsigned d = 0; d < shapes.size(); d++) - fpw_[layout.axes[d]] = fpw[d]; - /* warps per tile */ - // try to make things as square as possible to maximize data re-use - std::vector wpt = {1, 1, 1}; - std::vector wpt_nm1; - do{ - wpt_nm1 = wpt; - if(wpt[0] * wpt[1] * wpt[2] < num_warps_) - wpt[0] = clamp(wpt[0]*2, 1, shape_0 / (fpw[0]*8)); - if(wpt[0] * wpt[1] * wpt[2] < num_warps_) - wpt[1] = clamp(wpt[1]*2, 1, shape_1 / (fpw[1]*8)); - }while(wpt_nm1 != wpt); - // store parameters - for(unsigned d = 0; d < shapes.size(); d++) - wpt_[layout.axes[d]] = wpt[d]; - /* sanity check */ - unsigned effective_num_warps = 1; - for(size_t d = 0; d < shapes.size(); d++) - effective_num_warps *= wpt_[layout.axes[d]]; - if(num_warps_ != effective_num_warps) - throw std::runtime_error("cannot create a kernel with this amount of warps"); -} - -void tiles::init_scanline_tile(const layout_t& layout) { - auto ord = layout.order; - auto shapes = layout.shapes; - unsigned size = std::accumulate(shapes.begin(), shapes.end(), 1, std::multiplies()); - unsigned ld = ord[0]; - unsigned num_threads = num_warps_*32; - unsigned current = num_threads; - nts_[layout.axes[ld]] = clamp(size / num_threads, 1, 4); - mts_[layout.axes[ld]] = clamp(current, 1, shapes[ld] / nts_[layout.axes[ld]]); - current = current / mts_[layout.axes[ld]]; - for(size_t d = 1; d < shapes.size(); d++){ - ld = ord[d]; - nts_[layout.axes[ld]] = 1; - mts_[layout.axes[ld]] = clamp(current, 1, 
shapes[ld]); - current = current / mts_[layout.axes[ld]]; - } - /* sanity check */ - unsigned effective_num_threads = 1; - for(size_t d = 0; d < shapes.size(); d++) - effective_num_threads *= mts_[layout.axes[d]]; -// std::cout << num_threads << " " << effective_num_threads << std::endl; - if(num_threads != effective_num_threads) - throw std::runtime_error("cannot create a kernel with this amount of warps"); -} - -void tiles::run(ir::module &) { - // tiling parameters - for(auto x: layout_->get_all()){ - /* HMMA parameters*/ - if(x.second.type == HMMA_884) - init_hmma_tile(x.second); - else - init_scanline_tile(x.second); - } -} - -} -} -} diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index 1505bcbc6..ee5b55f08 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -4,7 +4,6 @@ #include "triton/codegen/analysis/liveness.h" #include "triton/codegen/analysis/layout.h" #include "triton/codegen/analysis/axes.h" -#include "triton/codegen/analysis/tiles.h" #include "triton/codegen/analysis/allocation.h" #include "triton/codegen/analysis/align.h" #include "triton/codegen/transform/coalesce.h" @@ -584,8 +583,8 @@ void selection::init_strided_scan_axes(const analysis::layout_t& layout, IRBuild std::vector nts(dim); std::vector mts(dim); for(unsigned i = 0; i < shapes.size(); i++){ - nts[i] = tiles_->nts(layout.i, i); - mts[i] = tiles_->mts(layout.i, i); + nts[i] = layout.nts.at(i); + mts[i] = layout.mts.at(i); } Value* full_thread_id = builder.CreateAdd(builder.CreateMul(u_warp_id, builder.getInt32(32)), u_thread_id); std::vector thread_id = delinearize(full_thread_id, order, mts, builder); @@ -618,13 +617,13 @@ void selection::init_hmma_axes(const analysis::layout_t& layout, IRBuilder<> &bu Value *_16 = builder.getInt32(16); // fragments per warp - unsigned fpw_0 = tiles_->fpw(layout.i, 0); - unsigned fpw_1 = tiles_->fpw(layout.i, 1); - unsigned fpw_2 = is_batched ? tiles_->fpw(layout.i, 2) : 1; + unsigned fpw_0 = layout.fpw.at(0); + unsigned fpw_1 = layout.fpw.at(1); + unsigned fpw_2 = is_batched ? layout.fpw.at(2) : 1; // warps per tile - unsigned wpt_0 = tiles_->wpt(layout.i, 0); - unsigned wpt_1 = tiles_->wpt(layout.i, 1); - unsigned wpt_2 = is_batched ? tiles_->wpt(layout.i, 2) : 1; + unsigned wpt_0 = layout.wpt.at(0); + unsigned wpt_1 = layout.wpt.at(1); + unsigned wpt_2 = is_batched ? 
layout.wpt.at(2) : 1; // hmma warp tile size unsigned hmma_wts_0 = fpw_0 * 8; unsigned hmma_wts_1 = fpw_1 * 8; @@ -933,7 +932,7 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, tgt_->add_barrier(module, builder); builder.CreateStore(result, write_ptr); // build result - unsigned depth = tiles_->wpt(op, axis); + unsigned depth = layouts_->get(op).wpt.at(axis); for(unsigned i = depth/2; i > 0; i >>= 1){ // current indices indices_t current(write_idx.size(), builder.getInt32(0)); @@ -1022,7 +1021,7 @@ void selection::lower_copy_to_shared(ir::copy_to_shared_inst *x, LLVMContext &ct distributed_tile* in = (distributed_tile*)tmap_.at(arg); if(x_order == arg_order){ size_t ld = arg_order[0]; - vector_size = std::min(tiles_->nts(x, ld),tiles_->nts(arg, ld)); + vector_size = std::min(layouts_->get(x).nts.at(ld), layouts_->get(arg).nts.at(ld)); } std::map packets; @@ -1118,12 +1117,12 @@ void selection::lower_hmma_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn "{$10, $11}, " "{$0, $1, $2, $3, $4, $5, $6, $7};", "=f,=f,=f,=f,=f,=f,=f,=f,r,r,r,r,0,1,2,3,4,5,6,7", false); - unsigned fpw_0 = tiles_->fpw(dot, 0); - unsigned fpw_1 = tiles_->fpw(dot, 1); + unsigned fpw_0 = layouts_->get(dot).fpw.at(0); + unsigned fpw_1 = layouts_->get(dot).fpw.at(1); unsigned wts_0 = fpw_0 * 8; unsigned wts_1 = fpw_1 * 8; - unsigned wpt_0 = tiles_->wpt(dot, 0); - unsigned wpt_1 = tiles_->wpt(dot, 1); + unsigned wpt_0 = layouts_->get(dot).wpt.at(0); + unsigned wpt_1 = layouts_->get(dot).wpt.at(1); unsigned stride_rep_i = wpt_0 * wts_0; unsigned stride_rep_j = wpt_1 * wts_1; unsigned num_rep_i = shapes[0] / stride_rep_i; diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 806f003d1..9578a3acb 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -4,11 +4,17 @@ #include #include #include "triton/codegen/analysis/axes.h" -#include "triton/codegen/analysis/layout.h" -#include "triton/codegen/analysis/tiles.h" +#include "triton/codegen/analysis/allocation.h" +#include "triton/codegen/analysis/liveness.h" +#include "triton/codegen/analysis/align.h" +#include "triton/codegen/transform/coalesce.h" +#include "triton/codegen/transform/dce.h" +#include "triton/codegen/transform/peephole.h" +#include "triton/codegen/transform/membar.h" +#include "triton/codegen/transform/reassociate.h" +#include "triton/codegen/transform/cts.h" #include "triton/codegen/selection.h" #include "triton/runtime/function.h" -#include "triton/codegen/transform/coalesce.h" #include "triton/lang/cpp.h" #include "triton/lang/parser.h" #include "triton/lang/code_gen.h" @@ -202,17 +208,16 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c // create passes codegen::analysis::align align; codegen::analysis::axes axes; - codegen::analysis::layout layouts(&axes, &align); - codegen::analysis::tiles tiles(opt.num_warps, &align, &axes, &layouts); - codegen::analysis::liveness liveness(&tiles, &layouts); - codegen::analysis::allocation allocation(&liveness, &tiles); + codegen::analysis::layout layouts(&axes, &align, opt.num_warps); + codegen::analysis::liveness liveness(&layouts); + codegen::analysis::allocation allocation(&liveness); codegen::transform::membar barriers(&liveness, &allocation); codegen::transform::dce dce; codegen::transform::peephole peephole; codegen::transform::reassociate reassociate(&align); codegen::transform::coalesce coalesce(&align, &layouts); codegen::transform::cts cts; - codegen::selection selection(&liveness, &allocation, &tiles, &align, &axes, 
&layouts, target.get(), opt.num_warps); + codegen::selection selection(&liveness, &allocation, &align, &axes, &layouts, target.get(), opt.num_warps); // run passes peephole.run(module); dce.run(module); @@ -226,24 +231,20 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c dce.run(module); reassociate.run(module); // ir::print(module, std::cout); -// exit(EXIT_FAILURE); dce.run(module); cts.run(module); align.run(module); axes.run(module); layouts.run(module); - tiles.run(module); liveness.run(module); allocation.run(module); if(allocation.allocated_size() > context->device()->max_shared_memory()) return std::unique_ptr(); barriers.run(module); dce.run(module); + align.run(module); axes.run(module); layouts.run(module); -// ir::print(module, std::cout); - align.run(module); - tiles.run(module); selection.run(module, *llvm); // return binary std::unique_ptr res(driver::module::create(context, std::move(llvm))); From 4efd0a3c6b9b04f6b1a476b582aef0d70ebbee64 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 10 Oct 2019 15:52:03 -0400 Subject: [PATCH 432/494] [codegen] more cleaning --- include/triton/codegen/analysis/layout.h | 36 ++++- include/triton/codegen/analysis/liveness.h | 4 +- include/triton/codegen/selection.h | 2 +- include/triton/ir/type.h | 1 + lib/codegen/analysis/layout.cc | 171 ++++++++++----------- lib/codegen/analysis/liveness.cc | 26 +--- lib/codegen/selection.cc | 43 +++--- lib/codegen/transform/coalesce.cc | 1 - lib/ir/type.cc | 8 + 9 files changed, 148 insertions(+), 144 deletions(-) diff --git a/include/triton/codegen/analysis/layout.h b/include/triton/codegen/analysis/layout.h index bec751659..629a5fc02 100644 --- a/include/triton/codegen/analysis/layout.h +++ b/include/triton/codegen/analysis/layout.h @@ -27,14 +27,35 @@ enum layout_type_t { }; struct layout_t { + layout_t(layout_type_t _type, + const std::vector& _axes, + const std::vector &_shapes, + const std::vector &values, + analysis::align* align); layout_type_t type; std::vector axes; std::vector shapes; std::vector order; - std::map mts; - std::map nts; - std::map fpw; - std::map wpt; + std::vector mts; + std::vector nts; + std::vector fpw; + std::vector wpt; +}; + +struct layout_hmma_884_t: public layout_t { + layout_hmma_884_t(size_t num_warps, + const std::vector& _axes, + const std::vector& _shapes, + const std::vector &values, + analysis::align* align); +}; + +struct layout_scanline_t: public layout_t { + layout_scanline_t(size_t num_warps, + const std::vector& _axes, + const std::vector& _shapes, + const std::vector &values, + analysis::align* align); }; class layout { @@ -52,12 +73,13 @@ private: public: // constructor layout(analysis::axes *axes, analysis::align *align, size_t num_warps); + // accessors unsigned layout_of(ir::value *value) const; const std::vector& values_of(unsigned id) const; size_t num_layouts() const; - const layout_t& get(ir::value *v) const; - std::map &get_all(); + const layout_t* get(ir::value *v) const; + std::map &get_all(); // execution void run(ir::module &mod); @@ -69,7 +91,7 @@ private: tools::graph graph_; std::map groups_; std::map> values_; - std::map layouts_; + std::map layouts_; }; } diff --git a/include/triton/codegen/analysis/liveness.h b/include/triton/codegen/analysis/liveness.h index b23463f06..05ec3a1df 100644 --- a/include/triton/codegen/analysis/liveness.h +++ b/include/triton/codegen/analysis/liveness.h @@ -45,7 +45,9 @@ struct double_buffer_info_t { struct buffer_t { size_t id; size_t size; - bool operator<(buffer_t other) const { 
return id < other.id; } + bool operator<(buffer_t other) const { + return id < other.id; + } }; class liveness { diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index b20bc6d51..dce6a6278 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -62,7 +62,7 @@ class target; typedef std::vector indices_t; struct distributed_axis { - size_t contiguous; + int contiguous; std::vector values; Value* thread_id; }; diff --git a/include/triton/ir/type.h b/include/triton/ir/type.h index 4b67d9e94..dedc8ea8c 100644 --- a/include/triton/ir/type.h +++ b/include/triton/ir/type.h @@ -64,6 +64,7 @@ public: type *get_scalar_ty() const; const tile_shapes_t& get_tile_shapes() const; const size_t get_tile_rank() const; + const size_t get_tile_ranks1() const; unsigned get_tile_num_elements() const; type *get_tile_element_ty() const; unsigned get_pointer_address_space() const; diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc index 4b52f9e3b..8b9dae808 100644 --- a/lib/codegen/analysis/layout.cc +++ b/lib/codegen/analysis/layout.cc @@ -58,6 +58,7 @@ void layout::make_graph(ir::instruction *i) { } } + // hmma bool is_hmma_c(ir::value *v){ bool result = false; @@ -72,11 +73,11 @@ bool is_hmma_c(ir::value *v){ return result; } -const layout_t &layout::get(ir::value *v) const { +const layout_t* layout::get(ir::value *v) const { return layouts_.at(groups_.at(v)); } -std::map& layout::get_all() { +std::map& layout::get_all() { return layouts_; } @@ -102,19 +103,45 @@ inline bool is_trans(ir::value *v) { return false; } + + +layout_t::layout_t(layout_type_t _type, + const std::vector &_axes, + const std::vector &_shapes, + const std::vector &values, + analysis::align* align): type(_type), axes(_axes), shapes(_shapes) { + // io pointer + std::set ptr; + for(ir::value* v: values) + extract_io_use(v, ptr); + size_t rank = axes.size(); + std::vector order(rank); + std::iota(order.begin(), order.end(), 0); + for(ir::value *v: ptr){ + auto max_contiguous = align->contiguous(v); + std::sort(order.begin(), order.end(), [&](unsigned a, unsigned b) { + return max_contiguous[a] > max_contiguous[b]; + }); + } + this->order = order; +} + inline unsigned clamp(unsigned x, unsigned lo, unsigned hi) { return std::min(std::max(x, lo), hi); } -void layout::init_hmma_tile(layout_t& layout) { - auto ord = layout.order; - auto shapes = layout.shapes; - unsigned shape_0 = shapes[ord[0]]; - unsigned shape_1 = shapes[ord[1]]; +layout_hmma_884_t::layout_hmma_884_t(size_t num_warps, + const std::vector& _axes, + const std::vector& _shapes, + const std::vector &values, + analysis::align* align): layout_t(HMMA_884, _axes, _shapes, values, align) { + + unsigned shape_0 = shapes[order[0]]; + unsigned shape_1 = shapes[order[1]]; /* fragments per warp */ // try to make things as square as possible to maximize data re-use - std::vector fpw = {1, 1, 1}; - std::vector fpw_nm1; + fpw = {1, 1, 1}; + std::vector fpw_nm1; unsigned num_fragments = std::min((shape_0/8)*(shape_1/8), 4); do { fpw_nm1 = fpw; @@ -123,144 +150,108 @@ void layout::init_hmma_tile(layout_t& layout) { if(fpw[0]*fpw[1] < num_fragments) fpw[1] = clamp(fpw[1]*2, 1, shape_1 / 8); }while(fpw_nm1 != fpw); - // store parameters - for(unsigned d = 0; d < shapes.size(); d++) - layout.fpw[d] = fpw[d]; /* warps per tile */ // try to make things as square as possible to maximize data re-use - std::vector wpt = {1, 1, 1}; - std::vector wpt_nm1; + wpt = {1, 1, 1}; + std::vector wpt_nm1; do{ wpt_nm1 = 
wpt; - if(wpt[0] * wpt[1] * wpt[2] < num_warps_) + if(wpt[0] * wpt[1] * wpt[2] < num_warps) wpt[0] = clamp(wpt[0]*2, 1, shape_0 / (fpw[0]*8)); - if(wpt[0] * wpt[1] * wpt[2] < num_warps_) + if(wpt[0] * wpt[1] * wpt[2] < num_warps) wpt[1] = clamp(wpt[1]*2, 1, shape_1 / (fpw[1]*8)); }while(wpt_nm1 != wpt); - // store parameters - for(unsigned d = 0; d < shapes.size(); d++) - layout.wpt[d] = wpt[d]; /* sanity check */ unsigned effective_num_warps = 1; for(size_t d = 0; d < shapes.size(); d++) - effective_num_warps *= layout.wpt[d]; - if(num_warps_ != effective_num_warps) + effective_num_warps *= wpt[d]; + if(num_warps != effective_num_warps) throw std::runtime_error("cannot create a kernel with this amount of warps"); } -void layout::init_scanline_tile(layout_t& layout) { - auto ord = layout.order; - auto shapes = layout.shapes; +layout_scanline_t::layout_scanline_t(size_t num_warps, + const std::vector& _axes, + const std::vector& _shapes, + const std::vector &values, + analysis::align* align): layout_t(SCANLINE, _axes, _shapes, values, align){ unsigned size = std::accumulate(shapes.begin(), shapes.end(), 1, std::multiplies()); - unsigned ld = ord[0]; - unsigned num_threads = num_warps_*32; - unsigned current = num_threads; - layout.nts[ld] = clamp(size / num_threads, 1, 4); - layout.mts[ld] = clamp(current, 1, shapes[ld] / layout.nts[ld]); - current = current / layout.mts[ld]; + unsigned num_threads = num_warps * 32; + nts.resize(shapes.size()); + mts.resize(shapes.size()); + unsigned i = order[0]; + nts[i] = clamp(size / num_threads, 1, 4); + mts[i] = clamp(num_threads, 1, shapes[i] / nts[i]); + num_threads = num_threads / mts[i]; for(size_t d = 1; d < shapes.size(); d++){ - ld = ord[d]; - layout.nts[ld] = 1; - layout.mts[ld] = clamp(current, 1, shapes[ld]); - current = current / layout.mts[ld]; + i = order[d]; + nts[i] = 1; + mts[i] = clamp(num_threads, 1, shapes[i]); + num_threads = num_threads / mts[i]; } /* sanity check */ unsigned effective_num_threads = 1; for(size_t d = 0; d < shapes.size(); d++) - effective_num_threads *= layout.mts[d]; - if(num_threads != effective_num_threads) + effective_num_threads *= mts[d]; + if(num_warps * 32 != effective_num_threads) throw std::runtime_error("cannot create a kernel with this amount of warps"); } + void layout::run(ir::module &mod) { // make graph graph_.clear(); ir::for_each_instruction(mod, [this](ir::instruction* i) { make_graph(i); }); + // connected components graph_.connected_components(&values_, &groups_); + // create layouts for(const auto& x: values_) { bool hmma_c = std::any_of(x.second.begin(), x.second.end(), &is_hmma_c); - // type - layouts_[x.first].type = hmma_c ? 
HMMA_884 : SCANLINE; - } - - - /* ---- TO CLEAN ---- */ - - size_t num_groups = num_layouts(); - // helpers - auto rank = [this](ir::value* v) { - int ret = 0; - for(int s: v->get_type()->get_tile_shapes()) - ret += s > 1; - return ret; - }; - - // find out axes for each layout - for(const auto& x: values_) { - auto cmp = [&rank](ir::value* x, ir::value *y) { return rank(x) < rank(y); }; + auto cmp = [](ir::value* x, ir::value *y) { + return x->get_type()->get_tile_ranks1() < + y->get_type()->get_tile_ranks1(); + }; ir::value *largest = *std::max_element(x.second.begin(), x.second.end(), cmp); - layouts_[x.first].axes = axes_->get(largest); - layouts_[x.first].shapes = largest->get_type()->get_tile_shapes(); + const auto& axes = axes_->get(largest); + const auto& shapes = largest->get_type()->get_tile_shapes(); + // type + if(hmma_c) + layouts_[x.first] = new layout_hmma_884_t(num_warps_, axes, shapes, x.second, align_); + else + layouts_[x.first] = new layout_scanline_t(num_warps_, axes, shapes, x.second, align_); } - // find out the layout ordering of a group - for(const auto& x: values_) { - std::set ptr; - for(ir::value* v: x.second) - extract_io_use(v, ptr); - size_t rank = layouts_[x.first].axes.size(); - std::vector order(rank); - std::iota(order.begin(), order.end(), 0); - for(ir::value *v: ptr){ - auto max_contiguous = align_->contiguous(v); - std::sort(order.begin(), order.end(), [&](unsigned a, unsigned b) { - return max_contiguous[a] > max_contiguous[b]; } - ); - } - layouts_[x.first].order = order; - } - // matrix multiplication optimizations - for(size_t i = 0; i < num_groups; i++){ + for(const auto& x: values_) { std::vector dots; - for(ir::value* v: values_of(i)) + for(ir::value* v: x.second) if(auto *x = dynamic_cast(v)) dots.push_back(x); for(ir::dot_inst* dot: dots){ ir::value* a = dot->get_operand(0); ir::value* b = dot->get_operand(1); - if(get(dot).type == HMMA_884){ + if(get(dot)->type == HMMA_884){ auto a_val = values_of(layout_of(a)); auto b_val = values_of(layout_of(b)); for(ir::value *v: a_val) if(auto *cts = dynamic_cast(v)) - layouts_[layout_of(a)].order = layouts_[layout_of(cts->get_operand(0))].order; + layouts_[layout_of(a)]->order = layouts_[layout_of(cts->get_operand(0))]->order; for(ir::value *v: b_val) if(auto *cts = dynamic_cast(v)) - layouts_[layout_of(b)].order = layouts_[layout_of(cts->get_operand(0))].order; + layouts_[layout_of(b)]->order = layouts_[layout_of(cts->get_operand(0))]->order; } else{ std::vector col = {0, 1}; std::vector row = {1, 0}; - layouts_[layout_of(a)].order = is_trans(a) ? row : col; - layouts_[layout_of(b)].order = is_trans(b) ? col : row; + layouts_[layout_of(a)]->order = is_trans(a) ? row : col; + layouts_[layout_of(b)]->order = is_trans(b) ? 
col : row; } } } - - // tiling parameters - for(auto& x: layouts_){ - /* HMMA parameters*/ - if(x.second.type == HMMA_884) - init_hmma_tile(x.second); - else - init_scanline_tile(x.second); - } } } diff --git a/lib/codegen/analysis/liveness.cc b/lib/codegen/analysis/liveness.cc index d85271553..00581e281 100644 --- a/lib/codegen/analysis/liveness.cc +++ b/lib/codegen/analysis/liveness.cc @@ -89,8 +89,8 @@ bool liveness::do_pad(ir::value *x) { ir::value *b = dot->get_operand(1); size_t a_previous = pad_[a]; size_t b_previous = pad_[b]; - auto a_order = layouts_->get(a).order; - auto b_order = layouts_->get(b).order; + auto a_order = layouts_->get(a)->order; + auto b_order = layouts_->get(b)->order; bool a_row = is_trans(a) ^ (a_order[0] == 1); bool b_row = is_trans(b) ^ (b_order[0] == 1); auto a_shapes = a->get_type()->get_tile_shapes(); @@ -108,9 +108,9 @@ bool liveness::do_pad(ir::value *x) { } // padding for copy to shared if(auto* cts = dynamic_cast(x)) { - auto cts_order = layouts_->get(cts).order; + auto cts_order = layouts_->get(cts)->order; ir::value *arg = cts->get_operand(0); - auto arg_order = layouts_->get(arg).order; + auto arg_order = layouts_->get(arg)->order; size_t previous = pad_[cts]; if(cts_order != arg_order) pad_[cts] = std::max(pad_[cts], 4); @@ -134,26 +134,10 @@ bool liveness::do_pad(ir::value *x) { } unsigned liveness::num_bytes(ir::value *x) { - if(auto *red = dynamic_cast(x)){ - unsigned num_bytes = x->get_type()->get_scalar_ty()->get_primitive_size_in_bits() / 8; - size_t axis = red->get_axis(); - ir::value *op = red->get_operand(0); - auto shapes = op->get_type()->get_tile_shapes(); - shapes.erase(shapes.begin() + axis); - size_t num_elements = 1; - for(auto x: shapes) - num_elements *= x; - size_t depth; - if(layouts_->get(x).type == HMMA_884) - depth = layouts_->get(op).wpt.at(axis); - else - depth = layouts_->get(op).mts.at(axis); - return num_elements * num_bytes * depth; - } unsigned num_bytes = x->get_type()->get_primitive_size_in_bits() / 8; unsigned pad = pad_.at(x); if(pad > 0){ - unsigned ld = x->get_type()->get_tile_shapes()[layouts_->get(x).order[0]]; + unsigned ld = x->get_type()->get_tile_shapes()[layouts_->get(x)->order[0]]; num_bytes += pad * num_bytes / ld; } if(has_double(x)) diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index ee5b55f08..4b25106aa 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -559,7 +559,7 @@ Value* selection::llvm_value(ir::value *v, IRBuilder<> &builder) { * ------------------- */ // Grid construction -std::vector delinearize(Value *trailing, const std::vector& order, std::vector &shapes, IRBuilder<> &builder){ +std::vector delinearize(Value *trailing, const std::vector& order, std::vector &shapes, IRBuilder<> &builder){ size_t dim = shapes.size(); std::vector result(dim); for(unsigned k = 0; k < dim - 1; k++){ @@ -580,12 +580,8 @@ void selection::init_strided_scan_axes(const analysis::layout_t& layout, IRBuild auto order = layout.order; const auto& shapes = layout.shapes; size_t dim = shapes.size(); - std::vector nts(dim); - std::vector mts(dim); - for(unsigned i = 0; i < shapes.size(); i++){ - nts[i] = layout.nts.at(i); - mts[i] = layout.mts.at(i); - } + std::vector nts = layout.nts; + std::vector mts = layout.mts; Value* full_thread_id = builder.CreateAdd(builder.CreateMul(u_warp_id, builder.getInt32(32)), u_thread_id); std::vector thread_id = delinearize(full_thread_id, order, mts, builder); // Create axes @@ -608,6 +604,7 @@ void selection::init_hmma_axes(const 
analysis::layout_t& layout, IRBuilder<> &bu const auto& shapes = layout.shapes; if(shapes.size() > 3) throw std::runtime_error("unsupported"); + bool is_batched = shapes.size() >= 3; Value *_1 = builder.getInt32(1); @@ -725,7 +722,7 @@ void selection::init_axes(const analysis::layout_t& layout, IRBuilder<> &builder void selection::create_shared_tile(ir::value *v, IRBuilder<> &builder, Value *sh_mem_ptr) { if(tmap_.find(v) != tmap_.end()) return; - auto order = layouts_->get(v).order; + auto order = layouts_->get(v)->order; auto shapes = v->get_type()->get_tile_shapes(); unsigned pad = liveness_->get_pad(v); if(pad > 0) @@ -775,7 +772,7 @@ void selection::create_distributed_tile(ir::value *v, IRBuilder<> &builder) { axes[d].values = {builder.getInt32(0)}; } } - distributed_tile *T = new distributed_tile(ty, shapes, layouts_->get(v).order, axes, builder, false); + distributed_tile *T = new distributed_tile(ty, shapes, layouts_->get(v)->order, axes, builder, false); bool is_inserted = tmap_.insert({v, T}).second; // constant range if(is_inserted && dynamic_cast(v)){ @@ -819,7 +816,7 @@ void selection::init_layouts(ir::function *fn, IRBuilder<> &builder, Value *sh_m Value *u_warp_id = builder.CreateUDiv(u_thread_id, warp_size); // create grid for(auto x: layouts_->get_all()) - init_axes(x.second, builder, u_thread_warp_id, u_warp_id); + init_axes(*x.second, builder, u_thread_warp_id, u_warp_id); // create tile std::set seen; for(ir::basic_block *block: fn->blocks()) @@ -932,7 +929,7 @@ void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, tgt_->add_barrier(module, builder); builder.CreateStore(result, write_ptr); // build result - unsigned depth = layouts_->get(op).wpt.at(axis); + unsigned depth = layouts_->get(op)->wpt.at(axis); for(unsigned i = depth/2; i > 0; i >>= 1){ // current indices indices_t current(write_idx.size(), builder.getInt32(0)); @@ -1013,15 +1010,15 @@ void selection::lower_broadcast(ir::broadcast_inst *x, LLVMContext &ctx, Functio void selection::lower_copy_to_shared(ir::copy_to_shared_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { unsigned vector_size = 1; - auto x_order = layouts_->get(x).order; + auto x_order = layouts_->get(x)->order; ir::value *arg = x->get_operand(0); - auto arg_order = layouts_->get(arg).order; + auto arg_order = layouts_->get(arg)->order; // tiles shared_tile* result = (shared_tile*)tmap_.at(x); distributed_tile* in = (distributed_tile*)tmap_.at(arg); if(x_order == arg_order){ size_t ld = arg_order[0]; - vector_size = std::min(layouts_->get(x).nts.at(ld), layouts_->get(arg).nts.at(ld)); + vector_size = std::min(layouts_->get(x)->nts.at(ld), layouts_->get(arg)->nts.at(ld)); } std::map packets; @@ -1090,8 +1087,8 @@ void selection::lower_hmma_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn Value* u_thread_id = tgt_->get_local_id(builder.GetInsertBlock()->getModule(), builder, 0); - auto ord_a = layouts_->get(dot->get_operand(0)).order; - auto ord_b = layouts_->get(dot->get_operand(1)).order; + auto ord_a = layouts_->get(dot->get_operand(0))->order; + auto ord_b = layouts_->get(dot->get_operand(1))->order; bool is_a_trans = is_trans(dot->get_operand(0)); bool is_b_trans = is_trans(dot->get_operand(1)); @@ -1117,12 +1114,12 @@ void selection::lower_hmma_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn "{$10, $11}, " "{$0, $1, $2, $3, $4, $5, $6, $7};", "=f,=f,=f,=f,=f,=f,=f,=f,r,r,r,r,0,1,2,3,4,5,6,7", false); - unsigned fpw_0 = layouts_->get(dot).fpw.at(0); - unsigned fpw_1 = 
layouts_->get(dot).fpw.at(1); + unsigned fpw_0 = layouts_->get(dot)->fpw.at(0); + unsigned fpw_1 = layouts_->get(dot)->fpw.at(1); unsigned wts_0 = fpw_0 * 8; unsigned wts_1 = fpw_1 * 8; - unsigned wpt_0 = layouts_->get(dot).wpt.at(0); - unsigned wpt_1 = layouts_->get(dot).wpt.at(1); + unsigned wpt_0 = layouts_->get(dot)->wpt.at(0); + unsigned wpt_1 = layouts_->get(dot)->wpt.at(1); unsigned stride_rep_i = wpt_0 * wts_0; unsigned stride_rep_j = wpt_1 * wts_1; unsigned num_rep_i = shapes[0] / stride_rep_i; @@ -1253,7 +1250,7 @@ void selection::lower_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn, IRB if(NK != 1) { shared_tile *TA = (shared_tile*)tmap_.at(A); shared_tile *TB = (shared_tile*)tmap_.at(B); - if(layouts_->get(dot).type == analysis::HMMA_884) + if(layouts_->get(dot)->type == analysis::HMMA_884) lower_hmma_dot(dot, ctx, fn, builder, TC, TA, TB, TD, NK); else lower_scanline_dot(dot, ctx, fn, builder, TC, TA, TB, TD, NK, c_ty, f_mul_add); @@ -1269,7 +1266,7 @@ void selection::lower_masked_load(ir::masked_load_inst *x, LLVMContext &ctx, Fun // find vector size distributed_tile* result = (distributed_tile*)tmap_.at(x); ir::value *ptr = x->get_pointer_operand(); - size_t ld = layouts_->get(ptr).order[0]; + size_t ld = layouts_->get(ptr)->order[0]; unsigned alignment = alignment_->get(ptr, ld); unsigned vector_size = std::min(result->axis(ld).contiguous, alignment); distributed_tile *pointers = (distributed_tile*)tmap_.at(ptr); @@ -1341,7 +1338,7 @@ void selection::lower_load(ir::load_inst *x, LLVMContext &ctx, Function *fn, IRB distributed_tile* result = (distributed_tile*)tmap_.at(x); // find vector size ir::value *ptr = x->get_pointer_operand(); - size_t ld = layouts_->get(ptr).order[0]; + size_t ld = layouts_->get(ptr)->order[0]; unsigned alignment = alignment_->get(ptr, ld); unsigned vector_size = std::min(result->axis(ld).contiguous, alignment); distributed_tile *pointers = (distributed_tile*)tmap_.at(ptr); diff --git a/lib/codegen/transform/coalesce.cc b/lib/codegen/transform/coalesce.cc index 8db94ed45..078aeb112 100644 --- a/lib/codegen/transform/coalesce.cc +++ b/lib/codegen/transform/coalesce.cc @@ -53,7 +53,6 @@ ir::value* coalesce::rematerialize(ir::value *x, ir::builder &builder, builder.set_insert_point(pos); if(dynamic_cast(x)){ ir::value *ret = builder.insert(ir::copy_to_shared_inst::create(x)); -// x->replace_all_uses_with(ret); return ret; } // default -- recursive clone diff --git a/lib/ir/type.cc b/lib/ir/type.cc index 198553b52..8300a32c4 100644 --- a/lib/ir/type.cc +++ b/lib/ir/type.cc @@ -77,6 +77,14 @@ const size_t type::get_tile_rank() const { return get_tile_shapes().size(); } +const size_t type::get_tile_ranks1() const { + int ret = 0; + for(int s: get_tile_shapes()) + ret += s > 1; + return ret; +} + + unsigned type::get_tile_num_elements() const { const tile_shapes_t& shapes = get_tile_shapes(); unsigned result = 1; From 323c90e4318d00d1d89454b4a83da078cbca65d0 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 11 Oct 2019 19:05:54 -0400 Subject: [PATCH 433/494] ugh --- include/triton/codegen/analysis/axes.h | 1 + include/triton/codegen/analysis/layout.h | 25 +++- include/triton/codegen/analysis/liveness.h | 19 +-- include/triton/codegen/instructions.h | 1 + include/triton/codegen/selection.h | 1 + include/triton/codegen/transform/membar.h | 6 +- include/triton/ir/builder.h | 1 + include/triton/ir/enums.h | 1 + include/triton/ir/instructions.h | 11 ++ lib/codegen/analysis/allocation.cc | 33 ++--- lib/codegen/analysis/axes.cc | 22 ++-- 
lib/codegen/analysis/layout.cc | 133 +++++++++++++-------- lib/codegen/analysis/liveness.cc | 55 +++------ lib/codegen/selection.cc | 33 +++-- lib/codegen/transform/cts.cc | 39 ++++-- lib/codegen/transform/membar.cc | 3 +- lib/driver/module.cc | 2 +- lib/ir/builder.cc | 4 + lib/ir/instructions.cc | 8 ++ lib/runtime/function.cc | 5 +- 20 files changed, 237 insertions(+), 166 deletions(-) diff --git a/include/triton/codegen/analysis/axes.h b/include/triton/codegen/analysis/axes.h index 701abe04d..dc39b07cb 100644 --- a/include/triton/codegen/analysis/axes.h +++ b/include/triton/codegen/analysis/axes.h @@ -30,6 +30,7 @@ private: void update_graph_broadcast(ir::instruction *i); void update_graph_dot(ir::instruction *i); void update_graph_elementwise(ir::instruction *i); + void update_graph_no_edge(ir::instruction *i); void update_graph(ir::instruction *i); public: diff --git a/include/triton/codegen/analysis/layout.h b/include/triton/codegen/analysis/layout.h index 629a5fc02..ba474d96c 100644 --- a/include/triton/codegen/analysis/layout.h +++ b/include/triton/codegen/analysis/layout.h @@ -23,19 +23,24 @@ class align; enum layout_type_t { HMMA_884, - SCANLINE + SCANLINE, + SHARED }; struct layout_t { layout_t(layout_type_t _type, const std::vector& _axes, const std::vector &_shapes, - const std::vector &values, + const std::vector &_values, + size_t _id, analysis::align* align); layout_type_t type; std::vector axes; std::vector shapes; + std::vector values; std::vector order; + size_t id; + size_t size; std::vector mts; std::vector nts; std::vector fpw; @@ -46,7 +51,8 @@ struct layout_hmma_884_t: public layout_t { layout_hmma_884_t(size_t num_warps, const std::vector& _axes, const std::vector& _shapes, - const std::vector &values, + const std::vector &_values, + size_t _id, analysis::align* align); }; @@ -55,9 +61,20 @@ struct layout_scanline_t: public layout_t { const std::vector& _axes, const std::vector& _shapes, const std::vector &values, + size_t _id, analysis::align* align); }; +struct layout_shared_t: public layout_t { + layout_shared_t(const layout_t *arg, + const std::vector& _axes, + const std::vector& _shapes, + const std::vector &values, + size_t _id, + analysis::align* align); +}; + + class layout { typedef ir::value* node_t; typedef std::map > graph_t; @@ -70,6 +87,8 @@ private: void init_hmma_tile(layout_t& layout); void init_scanline_tile(layout_t &layout); + void create(size_t id, const std::vector& values); + public: // constructor layout(analysis::axes *axes, analysis::align *align, size_t num_warps); diff --git a/include/triton/codegen/analysis/liveness.h b/include/triton/codegen/analysis/liveness.h index 05ec3a1df..9b012b5d8 100644 --- a/include/triton/codegen/analysis/liveness.h +++ b/include/triton/codegen/analysis/liveness.h @@ -23,6 +23,7 @@ typedef unsigned slot_index; class tiles; class layout; +class layout_t; struct segment { slot_index start; @@ -42,18 +43,11 @@ struct double_buffer_info_t { ir::phi_node* phi; }; -struct buffer_t { - size_t id; - size_t size; - bool operator<(buffer_t other) const { - return id < other.id; - } -}; class liveness { private: typedef std::map indices_map_t; - typedef std::map intervals_map_t; + typedef std::map intervals_map_t; typedef std::map has_storage_map_t; typedef ir::value* node_t; typedef std::map > graph_t; @@ -82,10 +76,7 @@ public: unsigned num_bytes(ir::value *x); // accessors const intervals_map_t& intervals() const { return intervals_; } - segment get_interval(buffer_t* v) const { return intervals_.at(v); } - // buffers 
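The layout.h hunk above replaces loose per-group parameter maps with a small tagged hierarchy plus a factory (layout::create). A standalone mirror of that design, with triton types stubbed out and most fields elided to comments, follows; it is a sketch of the shape of the interface, not the real header:

#include <cstddef>
#include <vector>

namespace sketch {

enum layout_type_t { HMMA_884, SCANLINE, SHARED };
struct value;                          // stand-in for ir::value

struct layout_t {
  layout_type_t type;                  // tag the passes switch on
  std::vector<value*> values;          // members of this layout group
  std::vector<int> order;              // most-contiguous dimension first
  std::size_t size = 0;                // bytes; meaningful for SHARED
};
struct layout_hmma_884_t : layout_t { /* fpw, wpt */ };
struct layout_scanline_t : layout_t { /* mts, nts */ };
struct layout_shared_t   : layout_t { /* double_buffer, pad, ty */ };

// layout::create picks the subclass from one connected component's values:
// an HMMA dot output forces HMMA_884, a copy_to_shared forces SHARED,
// everything else falls back to SCANLINE.
layout_t* create(bool has_hmma_c, bool has_copy_to_shared) {
  if (has_hmma_c)          return new layout_hmma_884_t();
  if (has_copy_to_shared)  return new layout_shared_t();
  return new layout_scanline_t();
}

} // namespace sketch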
- buffer_t* get_buffer(ir::value *v) const { return groups_.at(v); } - std::vector get_values(buffer_t* x) const { return values_.at(x); } + segment get_interval(layout_t* v) const { return intervals_.at(v); } // double-buffering bool has_double(ir::value *x) const { return double_.find(x) != double_.end(); } double_buffer_info_t get_double(ir::value *x) const { return double_.at(x); } @@ -101,10 +92,6 @@ private: intervals_map_t intervals_; std::map double_; std::map pad_; - // buffers - tools::graph graph_; - std::map groups_; - std::map> values_; }; } diff --git a/include/triton/codegen/instructions.h b/include/triton/codegen/instructions.h index e3ad9344d..2e5d6148f 100644 --- a/include/triton/codegen/instructions.h +++ b/include/triton/codegen/instructions.h @@ -66,6 +66,7 @@ static const std::map storage_info = { // intrinsics { ir::INST_COPY_TO_SHARED, {SHARED, {DISTRIBUTED}}}, + { ir::INST_COPY_FROM_SHARED, {DISTRIBUTED, {SHARED}}}, { ir::INST_BARRIER, {NONE, {}}}, { ir::INST_MAKE_RANGE_DYN, {DISTRIBUTED, {}}}, { ir::INST_MAKE_RANGE_STA, {DISTRIBUTED, {}}}, diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index dce6a6278..1c367916b 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -189,6 +189,7 @@ private: void lower_splat(ir::splat_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); void lower_broadcast(ir::broadcast_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); void lower_copy_to_shared(ir::copy_to_shared_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); + void lower_copy_from_shared(ir::copy_from_shared_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); void lower_trans(ir::trans_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); // matrix multiply void lower_hmma_dot(ir::dot_inst *x, LLVMContext &ctx, Function *fn, Builder &builder, diff --git a/include/triton/codegen/transform/membar.h b/include/triton/codegen/transform/membar.h index a737d0e49..820992da7 100644 --- a/include/triton/codegen/transform/membar.h +++ b/include/triton/codegen/transform/membar.h @@ -17,6 +17,7 @@ namespace analysis{ class allocation; class liveness; +class layout; class cts; } @@ -40,12 +41,13 @@ private: std::set &insert_loc, std::set &safe_war); public: - membar(analysis::liveness *liveness, analysis::allocation *alloc): - liveness_(liveness), alloc_(alloc) {} + membar(analysis::liveness *liveness, analysis::layout *layouts, analysis::allocation *alloc): + liveness_(liveness), layouts_(layouts), alloc_(alloc) {} void run(ir::module &mod); private: analysis::liveness *liveness_; + analysis::layout *layouts_; analysis::allocation *alloc_; }; diff --git a/include/triton/ir/builder.h b/include/triton/ir/builder.h index e254f6d38..f92e825e4 100644 --- a/include/triton/ir/builder.h +++ b/include/triton/ir/builder.h @@ -141,6 +141,7 @@ public: value *create_select(value *pred, value *if_value, value *else_value, const std::string &name = ""); // Intrinsics value *create_copy_to_shared(value *arg, const std::string &name = ""); + value *create_copy_from_shared(value *arg, const std::string &name = ""); value *create_barrier(const std::string &name = ""); private: diff --git a/include/triton/ir/enums.h b/include/triton/ir/enums.h index 19cf82086..94c74c085 100644 --- a/include/triton/ir/enums.h +++ b/include/triton/ir/enums.h @@ -133,6 +133,7 @@ enum value_id_t: unsigned { INST_DOT, // intrinsics INST_COPY_TO_SHARED, + INST_COPY_FROM_SHARED, INST_BARRIER, INST_MAKE_RANGE_DYN, 
INST_MAKE_RANGE_STA, diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index 9298ccbe0..f59cf95be 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -678,6 +678,17 @@ public: _TRITON_DEFINE_CLONE(copy_to_shared_inst) }; +class copy_from_shared_inst: public unary_inst{ +private: + using unary_inst::unary_inst; + std::string repr_impl() const { return "copy_from_shared"; } + +public: + static copy_from_shared_inst* create(value *arg, const std::string &name = "", + instruction *next = nullptr); + _TRITON_DEFINE_CLONE(copy_from_shared_inst) +}; + class barrier_inst: public instruction{ private: barrier_inst(context &ctx, const std::string &name, instruction *next); diff --git a/lib/codegen/analysis/allocation.cc b/lib/codegen/analysis/allocation.cc index 0fde814f3..8980aa2b7 100644 --- a/lib/codegen/analysis/allocation.cc +++ b/lib/codegen/analysis/allocation.cc @@ -1,5 +1,6 @@ #include #include +#include "triton/codegen/analysis/layout.h" #include "triton/codegen/analysis/allocation.h" #include "triton/codegen/analysis/liveness.h" #include "triton/codegen/transform/cts.h" @@ -20,22 +21,22 @@ void allocation::run(ir::module &mod) { using std::min; typedef std::multimap triples_map_type; - std::vector I; + std::vector I; for(auto x: liveness_->intervals()) I.push_back(x.first); - std::vector J = I; + std::vector J = I; triples_map_type H; H.insert({0, segment{0, INT_MAX}}); - std::vector V; - std::map starts; + std::vector V; + std::map starts; while(!J.empty()){ auto h_it = H.begin(); unsigned w = h_it->first; segment xh = h_it->second; H.erase(h_it); - auto j_it = std::find_if(J.begin(), J.end(), [&](buffer_t* JJ){ + auto j_it = std::find_if(J.begin(), J.end(), [&](layout_t* JJ){ segment xj = liveness_->get_interval(JJ); bool res = xj.intersect(xh); for(auto val: H) @@ -57,9 +58,9 @@ void allocation::run(ir::module &mod) { } // Build interference graph - std::map> interferences; - for(buffer_t* x: V) - for(buffer_t* y: V){ + std::map> interferences; + for(layout_t* x: V) + for(layout_t* y: V){ if(x->id == y->id) continue; unsigned X0 = starts[x], Y0 = starts[y]; @@ -73,17 +74,17 @@ void allocation::run(ir::module &mod) { } // Initialize colors - std::map colors; - for(buffer_t* X: V) + std::map colors; + for(layout_t* X: V) colors[X] = (X->id==V[0]->id)?0:-1; // First-fit graph coloring std::vector available(V.size()); - for(buffer_t* x: V){ + for(layout_t* x: V){ // Non-neighboring colors are available std::fill(available.begin(), available.end(), true); - for(buffer_t* Y: interferences[x]){ + for(layout_t* Y: interferences[x]){ int color = colors[Y]; if(color >= 0) available[color] = false; @@ -94,12 +95,12 @@ void allocation::run(ir::module &mod) { } // Finalize allocation - for(buffer_t* x: V){ + for(layout_t* x: V){ unsigned Adj = 0; - for(buffer_t* y: interferences[x]) + for(layout_t* y: interferences[x]) Adj = std::max(Adj, starts[y] + y->size); // create offsets - for(ir::value *v: liveness_->get_values(x)){ + for(ir::value *v: x->values){ offsets_[v] = starts[x] + colors[x] * Adj; if(liveness_->has_double(v)){ auto info = liveness_->get_double(v); @@ -110,7 +111,7 @@ void allocation::run(ir::module &mod) { // Save maximum size of induced memory space allocated_size_ = 0; - for(buffer_t* x: V) + for(layout_t* x: V) allocated_size_ = std::max(allocated_size_, starts[x] + x->size); } diff --git a/lib/codegen/analysis/axes.cc b/lib/codegen/analysis/axes.cc index dec2a4e88..c446558a8 100644 --- 
a/lib/codegen/analysis/axes.cc
+++ b/lib/codegen/analysis/axes.cc
@@ -105,17 +105,23 @@ void axes::update_graph_elementwise(ir::instruction *i) {
   }
 }
 
+void axes::update_graph_no_edge(ir::instruction *i) {
+  auto rank = i->get_type()->get_tile_rank();
+  for(unsigned d = 0; d < rank; d++)
+    graph_.add_edge({i, d}, {i, d});
+}
 
 void axes::update_graph(ir::instruction *i) {
   switch (i->get_id()) {
-    case ir::INST_REDUCE: return update_graph_reduce(i);
-    case ir::INST_RESHAPE: return update_graph_reshape(i);
-    case ir::INST_SPLAT: return;
-    case ir::INST_TRANS: return update_graph_trans(i);
-    case ir::INST_BROADCAST: return update_graph_broadcast(i);
-    case ir::INST_DOT: return update_graph_dot(i);
-    case ir::INST_COPY_TO_SHARED: return;
-    default: return update_graph_elementwise(i);
+    case ir::INST_REDUCE: return update_graph_reduce(i);
+    case ir::INST_RESHAPE: return update_graph_reshape(i);
+    case ir::INST_SPLAT: return update_graph_no_edge(i);
+    case ir::INST_TRANS: return update_graph_trans(i);
+    case ir::INST_BROADCAST: return update_graph_broadcast(i);
+    case ir::INST_DOT: return update_graph_dot(i);
+    case ir::INST_COPY_TO_SHARED: return update_graph_no_edge(i);
+    case ir::INST_COPY_FROM_SHARED: return update_graph_no_edge(i);
+    default: return update_graph_elementwise(i);
   }
   return;
 }
diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc
index 8b9dae808..314b6d590 100644
--- a/lib/codegen/analysis/layout.cc
+++ b/lib/codegen/analysis/layout.cc
@@ -45,6 +45,8 @@ void layout::connect(ir::value *x, ir::value *y) {
   std::set_intersection(sx_axes.begin(), sx_axes.end(),
                         sy_axes.begin(), sy_axes.end(),
                         std::inserter(common, common.begin()));
+  graph_.add_edge(x, x);
+  graph_.add_edge(y, y);
   if(!common.empty())
     graph_.add_edge(x, y);
 }
@@ -89,6 +91,23 @@ void extract_io_use(ir::value *v, std::set<ir::value*>& result) {
   }
 }
 
+void extract_dot_use(ir::value *v, ir::value*& result, size_t n) {
+  for(ir::user* u: v->get_users()){
+    auto i = dynamic_cast<ir::dot_inst*>(u);
+    if(i && i->get_operand(n) == v)
+      result = v;
+  }
+}
+
+void extract_hmma_dot_use(ir::value *v, ir::value*& result, size_t n) {
+  for(ir::user* u: v->get_users()){
+    auto i = dynamic_cast<ir::dot_inst*>(u);
+    if(i && is_hmma_c(i) && i->get_operand(n) == v)
+      result = v;
+  }
+}
+
+
 inline bool is_trans(ir::value *v) {
   if(dynamic_cast<ir::trans_inst*>(v)) {
@@ -108,14 +127,14 @@ layout_t::layout_t(layout_type_t _type,
                    const std::vector<int>& _axes,
                    const std::vector<unsigned>& _shapes,
-                   const std::vector<ir::value*>& values,
-                   analysis::align* align): type(_type), axes(_axes), shapes(_shapes) {
+                   const std::vector<ir::value*>& _values,
+                   size_t _id,
+                   analysis::align* align): type(_type), axes(_axes), shapes(_shapes), values(_values), id(_id) {
   // io pointer
   std::set<ir::value*> ptr;
   for(ir::value* v: values)
     extract_io_use(v, ptr);
-  size_t rank = axes.size();
-  std::vector<int> order(rank);
+  order.resize(axes.size());
   std::iota(order.begin(), order.end(), 0);
   for(ir::value *v: ptr){
     auto max_contiguous = align->contiguous(v);
@@ -123,7 +142,6 @@ layout_t::layout_t(layout_type_t _type,
       return max_contiguous[a] > max_contiguous[b];
     });
   }
-  this->order = order;
 }
 
 inline unsigned clamp(unsigned x, unsigned lo, unsigned hi) {
@@ -133,8 +151,8 @@ inline unsigned clamp(unsigned x, unsigned lo, unsigned hi) {
 layout_hmma_884_t::layout_hmma_884_t(size_t num_warps,
                                      const std::vector<int>& _axes,
                                      const std::vector<unsigned>& _shapes,
-                                     const std::vector<ir::value*>& values,
-                                     analysis::align* align): layout_t(HMMA_884, _axes, _shapes, values, align) {
+                                     const std::vector<ir::value*>& values, size_t _id,
+                                     analysis::align* align): 
layout_t(HMMA_884, _axes, _shapes, values, _id, align) { unsigned shape_0 = shapes[order[0]]; unsigned shape_1 = shapes[order[1]]; @@ -173,7 +191,8 @@ layout_scanline_t::layout_scanline_t(size_t num_warps, const std::vector& _axes, const std::vector& _shapes, const std::vector &values, - analysis::align* align): layout_t(SCANLINE, _axes, _shapes, values, align){ + size_t _id, + analysis::align* align): layout_t(SCANLINE, _axes, _shapes, values, _id, align){ unsigned size = std::accumulate(shapes.begin(), shapes.end(), 1, std::multiplies()); unsigned num_threads = num_warps * 32; nts.resize(shapes.size()); @@ -196,6 +215,58 @@ layout_scanline_t::layout_scanline_t(size_t num_warps, throw std::runtime_error("cannot create a kernel with this amount of warps"); } +layout_shared_t::layout_shared_t(const layout_t *arg, + const std::vector& _axes, + const std::vector& _shapes, + const std::vector &values, + size_t _id, + analysis::align* align): layout_t(SHARED, _axes, _shapes, values, _id, align) { + + if(arg->type == SCANLINE) + order = arg->order; + + ir::value* dot_a = nullptr; + ir::value* dot_b = nullptr; + ir::value* hmma_dot_a = nullptr; + ir::value* hmma_dot_b = nullptr; + for(ir::value* v: values){ + extract_dot_use(v, dot_a, 0); + extract_dot_use(v, dot_b, 1); + extract_hmma_dot_use(v, hmma_dot_a, 0); + extract_hmma_dot_use(v, hmma_dot_b, 1); + } + std::vector col = {0, 1}; + std::vector row = {1, 0}; + if(dot_a && !hmma_dot_a) + order = is_trans(dot_a) ? row : col; + if(dot_b && !hmma_dot_b) + order = is_trans(dot_b) ? col : row; +} + +void layout::create(size_t id, const std::vector& values) { + auto it_hmma_c = std::find_if(values.begin(), values.end(), &is_hmma_c); + auto cmp = [](ir::value* x, ir::value *y) { + return x->get_type()->get_tile_ranks1() < + y->get_type()->get_tile_ranks1(); + }; + ir::value *largest = *std::max_element(values.begin(), values.end(), cmp); + const auto& axes = axes_->get(largest); + const auto& shapes = largest->get_type()->get_tile_shapes(); + auto it_cts = std::find_if(values.begin(), values.end(), [](ir::value* v) { + return dynamic_cast(v); + }); + // type + if(it_hmma_c != values.end()) + layouts_[id] = new layout_hmma_884_t(num_warps_, axes, shapes, values, id, align_); + else if(it_cts != values.end()){ + ir::copy_to_shared_inst *cts = (ir::copy_to_shared_inst*)*it_cts; + ir::value *arg = cts->get_operand(0); + create(groups_.at(arg), values_.at(groups_.at(arg))); + layouts_[id] = new layout_shared_t(get(arg), axes, shapes, values, id, align_); + } + else + layouts_[id] = new layout_scanline_t(num_warps_, axes, shapes, values, id, align_); +} void layout::run(ir::module &mod) { // make graph @@ -208,50 +279,8 @@ void layout::run(ir::module &mod) { graph_.connected_components(&values_, &groups_); // create layouts - for(const auto& x: values_) { - bool hmma_c = std::any_of(x.second.begin(), x.second.end(), &is_hmma_c); - auto cmp = [](ir::value* x, ir::value *y) { - return x->get_type()->get_tile_ranks1() < - y->get_type()->get_tile_ranks1(); - }; - ir::value *largest = *std::max_element(x.second.begin(), x.second.end(), cmp); - const auto& axes = axes_->get(largest); - const auto& shapes = largest->get_type()->get_tile_shapes(); - // type - if(hmma_c) - layouts_[x.first] = new layout_hmma_884_t(num_warps_, axes, shapes, x.second, align_); - else - layouts_[x.first] = new layout_scanline_t(num_warps_, axes, shapes, x.second, align_); - } - - - // matrix multiplication optimizations - for(const auto& x: values_) { - std::vector dots; - 
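The scanline constructor above fixes, for each dimension, how many contiguous elements one thread keeps (nts) and how many threads span the dimension (mts), consuming the thread budget from the most-contiguous dimension outward. A standalone rerun of that arithmetic with example sizes (4 warps and a 64x128 tile whose dimension 1 is contiguous; everything else mirrors the code above):

#include <algorithm>
#include <cassert>
#include <numeric>
#include <vector>

unsigned clamp(unsigned x, unsigned lo, unsigned hi) {
  return std::min(std::max(x, lo), hi);
}

int main() {
  unsigned num_warps = 4;                              // example value
  std::vector<unsigned> shapes = {64, 128};            // example tile
  std::vector<unsigned> order  = {1, 0};               // dim 1 is contiguous
  unsigned size = std::accumulate(shapes.begin(), shapes.end(), 1u,
                                  std::multiplies<unsigned>());
  unsigned num_threads = num_warps * 32;               // 128 threads
  std::vector<unsigned> nts(2), mts(2);
  unsigned i = order[0];
  nts[i] = clamp(size / num_threads, 1, 4);            // -> 4 elems per thread
  mts[i] = clamp(num_threads, 1, shapes[i] / nts[i]);  // -> 32 threads on dim 1
  num_threads /= mts[i];
  i = order[1];
  nts[i] = 1;
  mts[i] = clamp(num_threads, 1, shapes[i]);           // -> 4 threads on dim 0
  assert(mts[0] * mts[1] == num_warps * 32);           // sanity check, as above
  return 0;
}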
for(ir::value* v: x.second) - if(auto *x = dynamic_cast(v)) - dots.push_back(x); - for(ir::dot_inst* dot: dots){ - ir::value* a = dot->get_operand(0); - ir::value* b = dot->get_operand(1); - if(get(dot)->type == HMMA_884){ - auto a_val = values_of(layout_of(a)); - auto b_val = values_of(layout_of(b)); - for(ir::value *v: a_val) - if(auto *cts = dynamic_cast(v)) - layouts_[layout_of(a)]->order = layouts_[layout_of(cts->get_operand(0))]->order; - for(ir::value *v: b_val) - if(auto *cts = dynamic_cast(v)) - layouts_[layout_of(b)]->order = layouts_[layout_of(cts->get_operand(0))]->order; - } - else{ - std::vector col = {0, 1}; - std::vector row = {1, 0}; - layouts_[layout_of(a)]->order = is_trans(a) ? row : col; - layouts_[layout_of(b)]->order = is_trans(b) ? col : row; - } - } - } + for(const auto& x: values_) + create(x.first, x.second); } } diff --git a/lib/codegen/analysis/liveness.cc b/lib/codegen/analysis/liveness.cc index 00581e281..2953bcc8e 100644 --- a/lib/codegen/analysis/liveness.cc +++ b/lib/codegen/analysis/liveness.cc @@ -42,7 +42,7 @@ void liveness::extract_double_bufferable(ir::instruction *i) { ir::value *value_1 = phi->get_incoming_value(1); ir::instruction *i_0 = dynamic_cast(value_0); ir::instruction *i_1 = dynamic_cast(value_1); - if(!i_0 || !i_1 || storage_info.at(i_0->get_id()).first != SHARED || storage_info.at(i_1->get_id()).first != SHARED) + if(!i_0 || !i_1 || storage_info.at(i_0->get_id()).first != codegen::SHARED || storage_info.at(i_1->get_id()).first != codegen::SHARED) return; if(is_latch_1) double_[value_0] = double_buffer_info_t{value_1, phi}; @@ -50,21 +50,6 @@ void liveness::extract_double_bufferable(ir::instruction *i) { double_[value_1] = double_buffer_info_t{value_0, phi}; } -void liveness::make_graph(ir::instruction *i) { - if(has_double(i)){ - ir::value *latch = double_[i].latch; - graph_.add_edge(i, latch); - } - if(storage_info.at(i->get_id()).first == SHARED){ - graph_.add_edge(i, i); - for(ir::value* op: i->ops()){ - auto* iop = dynamic_cast(op); - if(!iop || storage_info.at(iop->get_id()).first != SHARED) - continue; - graph_.add_edge(i, op); - } - } -} // connected components bool is_trans(ir::value *v) { @@ -151,7 +136,6 @@ void liveness::run(ir::module &mod) { indices.clear(); pad_.clear(); intervals_.clear(); - graph_.clear(); // Create set of pair of values that can be double-buffered ir::for_each_instruction(mod, [this](ir::instruction* i) { @@ -167,22 +151,14 @@ void liveness::run(ir::module &mod) { }); }while(has_changed); - // Create buffer dependency graph - ir::for_each_instruction(mod, [this](ir::instruction* i) { - this->make_graph(i); - }); // connected components - tools::graph::cmap_t cmap; - tools::graph::nmap_t nmap; - graph_.connected_components(&cmap, &nmap); - for(auto x: cmap) { - buffer_t* buffer = new buffer_t{x.first}; - values_[buffer] = x.second; - for(ir::value *v: x.second){ - buffer->size = std::max(buffer->size, num_bytes(v)); - groups_[v] = buffer; - } + for(auto &x: layouts_->get_all()) { + layout_t* layout = x.second; + if(layout->type != SHARED) + continue; + for(ir::value *v: layout->values) + layout->size = std::max(layout->size, num_bytes(v)); } // Assigns index to each instruction @@ -195,22 +171,25 @@ void liveness::run(ir::module &mod) { } } - for(auto x: values_) { + for(auto &x: layouts_->get_all()) { + layout_t* layout = x.second; + if(layout->type != SHARED) + continue; // users - std::set values; - for(ir::value *v: x.second){ - values.insert(v); + std::set users; + for(ir::value *v: layout->values){ + 
users.insert(v); for(ir::user *u: v->get_users()) - values.insert(u); + users.insert(u); } // compute intervals unsigned start = INT32_MAX; unsigned end = 0; - for(ir::value *u: values){ + for(ir::value *u: users){ start = std::min(start, indices.at(u)); end = std::max(end, indices.at(u)); } - intervals_[x.first] = segment{start, end}; + intervals_[layout] = segment{start, end}; } diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index 4b25106aa..e039f8ec7 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -486,21 +486,7 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::function(inst)){ -// Value *ptr = value(ii->get_operand(0)); -// Value *val = value(ii->get_operand(1)); -// Value *atom_f_add = nullptr; -// if(val->getType()->isFloatTy()) -// atom_f_add = Intrinsic::getDeclaration(builder.GetInsertBlock()->getModule(), Intrinsic::nvvm_atomic_load_add_f32, {ptr->getType()}); -// else if(val->getType()->isHalfTy()){ -// Type *fp16 = Type::getHalfTy(ctx); - -// FunctionType *atom_ty = FunctionType::get(fp16, {fp16->getPointerTo(), fp16}, false); -// atom_f_add = InlineAsm::get(atom_ty, " atom.relaxed.global.gpu.add.noftz.f16 $0, [$1], $2;", "=h,l,h", true); -// } -// if(atom_f_add == nullptr) throw std::runtime_error("unsupported"); -// Value *res = builder.CreateCall(atom_f_add, {ptr, val}); -// return (Instruction*)res; } if(ir::sqrt_inst* ii = dynamic_cast(inst)){ Value *val = value(ii->get_operand(0)); @@ -711,7 +697,7 @@ void selection::init_hmma_axes(const analysis::layout_t& layout, IRBuilder<> &bu void selection::init_axes(const analysis::layout_t& layout, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { if(layout.type == analysis::HMMA_884) init_hmma_axes(layout, builder, u_thread_id, u_warp_id); - else + else if(layout.type == analysis::SCANLINE) init_strided_scan_axes(layout, builder, u_thread_id, u_warp_id); } @@ -801,7 +787,7 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, for(ir::value *op: user->ops()) create_tile(op, builder, seen, sh_mem_ptr); auto *i = dynamic_cast(v); - if(i && storage_info.at(i->get_id()).first == SHARED && !dynamic_cast(v)) + if(i && layouts_->get(i)->type == analysis::SHARED && !dynamic_cast(v)) create_shared_tile(i, builder, sh_mem_ptr); else create_distributed_tile(v, builder); @@ -1018,7 +1004,7 @@ void selection::lower_copy_to_shared(ir::copy_to_shared_inst *x, LLVMContext &ct distributed_tile* in = (distributed_tile*)tmap_.at(arg); if(x_order == arg_order){ size_t ld = arg_order[0]; - vector_size = std::min(layouts_->get(x)->nts.at(ld), layouts_->get(arg)->nts.at(ld)); + vector_size = layouts_->get(arg)->nts.at(ld); } std::map packets; @@ -1038,6 +1024,15 @@ void selection::lower_copy_to_shared(ir::copy_to_shared_inst *x, LLVMContext &ct }); } +void selection::lower_copy_from_shared(ir::copy_from_shared_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { + distributed_tile* result = (distributed_tile*)tmap_.at(x); + shared_tile* arg = (shared_tile*)tmap_.at(x->get_operand(0)); + + result->for_each([&](indices_t idx){ + result->set_value(idx, arg->get_value(idx)); + }); +} + void selection::lower_trans(ir::trans_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { shared_tile* in = (shared_tile*)tmap_.at(x->get_operand(0)); shared_tile* out = new shared_tile(in->get_ty(), in->get_shapes(), in->get_order(), in->get_pointer(), builder, in->get_offset(), x->get_perm()); @@ -1399,6 +1394,8 @@ void selection::lower_tile_instruction(ir::instruction 
*ins, llvm::IRBuilder<> & lower_broadcast(x, ctx, fn, builder); else if(auto *x = dynamic_cast(ins)) lower_copy_to_shared(x, ctx, fn, builder); + else if(auto *x = dynamic_cast(ins)) + lower_copy_from_shared(x, ctx, fn, builder); else if(auto* x = dynamic_cast(ins)) lower_trans(x, ctx, fn, builder); else if(auto x = dynamic_cast(ins)) @@ -1554,7 +1551,7 @@ void selection::run(ir::module &src, Module &dst) { } else { unsigned num_bytes = inst->get_type()->get_scalar_ty()->get_primitive_size_in_bits() / 8; - offset->addIncoming(dst_builder.getInt32(liveness_->get_buffer(inst)->size / (2*num_bytes)), llvm_inc_block); + offset->addIncoming(dst_builder.getInt32(layouts_->get(inst)->size / (2*num_bytes)), llvm_inc_block); } ptr->addIncoming(inc_shared->get_pointer(), llvm_inc_block); } diff --git a/lib/codegen/transform/cts.cc b/lib/codegen/transform/cts.cc index 1f90e7e5e..b939c160c 100644 --- a/lib/codegen/transform/cts.cc +++ b/lib/codegen/transform/cts.cc @@ -12,30 +12,45 @@ namespace triton { namespace codegen{ namespace transform{ +inline bool is_shared(ir::value *v) { + auto *i = dynamic_cast(v); + if(!i) + return false; + return storage_info.at(i->get_id()).first == codegen::SHARED; +} + // run pass on module -void add_copy(ir::instruction *parent, ir::value *x, ir::builder &builder) { +void add_copy(ir::instruction *parent, ir::value *x, ir::builder &builder, bool to_shared) { auto *i = dynamic_cast(x); // not an instruction if(!i) { builder.set_insert_point(parent); - ir::value *cts = builder.create_copy_to_shared(x); - parent->replace_uses_of_with(x, cts); + ir::value *copy; + if(to_shared) + copy = builder.create_copy_to_shared(x); + else + copy = builder.create_copy_from_shared(x); + parent->replace_uses_of_with(x, copy); return; } // phi node if(auto* phi = dynamic_cast(x)) { for(unsigned i = 0; i < phi->get_num_incoming(); ++i) - add_copy(phi, phi->get_incoming_value(i), builder); + add_copy(phi, phi->get_incoming_value(i), builder, to_shared); return; } ir::value_id_t id = i->get_id(); // already in shared memory - if(storage_info.at(id).first == SHARED) + if(to_shared && storage_info.at(id).first == SHARED) return; // copy builder.set_insert_point_after(i); - ir::value *cts = builder.create_copy_to_shared(x); - parent->replace_uses_of_with(x, cts); + ir::value *copy; + if(to_shared) + copy = builder.create_copy_to_shared(x); + else + copy = builder.create_copy_from_shared(x); + parent->replace_uses_of_with(x, copy); } void cts::run(ir::module &mod) { @@ -45,10 +60,16 @@ void cts::run(ir::module &mod) { for(ir::basic_block *block: fn->blocks()) for(ir::instruction *i: block->get_inst_list()){ auto storage = storage_info.at(i->get_id()); - // copy to shared operands when necessary + // copy to shared operands for(size_t k = 0; k < storage.second.size(); k++) if(storage.second[k] == SHARED) - add_copy(i, i->get_operand(k), builder); + add_copy(i, i->get_operand(k), builder, true); + // copy from shared operands + for(size_t k = 0; k < storage.second.size(); k++) + if(storage.second[k] == DISTRIBUTED && + is_shared(i->get_operand(k))){ + add_copy(i, i->get_operand(k), builder, false); + } } } } diff --git a/lib/codegen/transform/membar.cc b/lib/codegen/transform/membar.cc index ee5821da4..d0bd890e8 100644 --- a/lib/codegen/transform/membar.cc +++ b/lib/codegen/transform/membar.cc @@ -3,6 +3,7 @@ #include #include "triton/codegen/analysis/liveness.h" +#include "triton/codegen/analysis/layout.h" #include "triton/codegen/analysis/allocation.h" #include "triton/codegen/instructions.h" 
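The rewritten cts pass above makes every crossing of the shared/distributed boundary explicit: an operand that its consumer needs in SHARED storage gets a copy_to_shared, and a SHARED value feeding a DISTRIBUTED slot gets a copy_from_shared. A standalone mirror of that decision rule (storage reduced to the three classes used above; triton's instruction types are stubbed out):

enum storage_t { NONE, SHARED, DISTRIBUTED };

// produced: storage class of the operand's defining instruction
// required: storage class the consuming slot expects (see storage_info)
const char* copy_needed(storage_t produced, storage_t required) {
  if (required == SHARED && produced != SHARED)
    return "insert copy_to_shared";
  if (required == DISTRIBUTED && produced == SHARED)
    return "insert copy_from_shared";
  return "no copy";
}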
#include "triton/codegen/transform/membar.h" @@ -38,7 +39,7 @@ void membar::add_reference(ir::value *v, interval_vec_t &res){ return; if(alloc_->has_offset(v)){ unsigned offset = alloc_->offset(v); - unsigned size = liveness_->get_buffer(v)->size; + unsigned size = layouts_->get(v)->size; res.push_back(interval_t(offset, offset + size)); } } diff --git a/lib/driver/module.cc b/lib/driver/module.cc index f29c830f4..30881d087 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -241,7 +241,7 @@ std::string cu_module::compile_llvm_module(std::unique_ptr module, cu_module::cu_module(driver::context * context, std::unique_ptr ll_module): cu_module(context, compile_llvm_module(std::move(ll_module), context->device())) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ -// std::cout << source << std::endl; + std::cout << source << std::endl; cu_context::context_switcher ctx(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/lib/ir/builder.cc b/lib/ir/builder.cc index caf22348f..0a5a8f200 100644 --- a/lib/ir/builder.cc +++ b/lib/ir/builder.cc @@ -347,6 +347,10 @@ value *builder::create_copy_to_shared(value *arg, const std::string &name) { return insert(copy_to_shared_inst::create(arg, name)); } +value *builder::create_copy_from_shared(value *arg, const std::string &name) { + return insert(copy_from_shared_inst::create(arg, name)); +} + value *builder::create_barrier(const std::string &name) { return insert(barrier_inst::create(ctx_, name)); } diff --git a/lib/ir/instructions.cc b/lib/ir/instructions.cc index 4fdfa797d..568e951a0 100644 --- a/lib/ir/instructions.cc +++ b/lib/ir/instructions.cc @@ -731,12 +731,20 @@ instruction* atomic_add_inst::create(value *ptr, value *val, const std::string & //===----------------------------------------------------------------------===// // intrinsic instructions //===----------------------------------------------------------------------===// + // copy to shared copy_to_shared_inst* copy_to_shared_inst::create(value *arg, const std::string &name, instruction *next) { return new copy_to_shared_inst(arg->get_type(), INST_COPY_TO_SHARED, arg, name, next); } +// copy from shared +copy_from_shared_inst* copy_from_shared_inst::create(value *arg, const std::string &name, + instruction *next) { + return new copy_from_shared_inst(arg->get_type(), INST_COPY_FROM_SHARED, arg, name, next); +} + + // barrier barrier_inst::barrier_inst(context &ctx, const std::string &name, instruction *next) diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 9578a3acb..f8ca7fff6 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -211,7 +211,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::analysis::layout layouts(&axes, &align, opt.num_warps); codegen::analysis::liveness liveness(&layouts); codegen::analysis::allocation allocation(&liveness); - codegen::transform::membar barriers(&liveness, &allocation); + codegen::transform::membar barriers(&liveness, &layouts, &allocation); codegen::transform::dce dce; codegen::transform::peephole peephole; codegen::transform::reassociate reassociate(&align); @@ -230,11 +230,11 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c align.run(module); dce.run(module); reassociate.run(module); -// ir::print(module, std::cout); dce.run(module); cts.run(module); align.run(module); axes.run(module); +// 
ir::print(module, std::cout); layouts.run(module); liveness.run(module); allocation.run(module); @@ -245,6 +245,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c align.run(module); axes.run(module); layouts.run(module); +// ir::print(module, std::cout); selection.run(module, *llvm); // return binary std::unique_ptr res(driver::module::create(context, std::move(llvm))); From ee3803b57725e6908769bed9bfefb89ac4dcf565 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 11 Oct 2019 19:29:24 -0400 Subject: [PATCH 434/494] more cleaning --- lib/codegen/analysis/layout.cc | 1 + lib/codegen/analysis/liveness.cc | 2 +- lib/driver/module.cc | 1 - lib/runtime/function.cc | 1 + 4 files changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc index 314b6d590..634b857a6 100644 --- a/lib/codegen/analysis/layout.cc +++ b/lib/codegen/analysis/layout.cc @@ -222,6 +222,7 @@ layout_shared_t::layout_shared_t(const layout_t *arg, size_t _id, analysis::align* align): layout_t(SHARED, _axes, _shapes, values, _id, align) { + size = 0; if(arg->type == SCANLINE) order = arg->order; diff --git a/lib/codegen/analysis/liveness.cc b/lib/codegen/analysis/liveness.cc index 2953bcc8e..98af7e039 100644 --- a/lib/codegen/analysis/liveness.cc +++ b/lib/codegen/analysis/liveness.cc @@ -154,7 +154,7 @@ void liveness::run(ir::module &mod) { // connected components for(auto &x: layouts_->get_all()) { - layout_t* layout = x.second; + layout_t*& layout = x.second; if(layout->type != SHARED) continue; for(ir::value *v: layout->values) diff --git a/lib/driver/module.cc b/lib/driver/module.cc index 30881d087..e300a75f2 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -241,7 +241,6 @@ std::string cu_module::compile_llvm_module(std::unique_ptr module, cu_module::cu_module(driver::context * context, std::unique_ptr ll_module): cu_module(context, compile_llvm_module(std::move(ll_module), context->device())) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ - std::cout << source << std::endl; cu_context::context_switcher ctx(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index f8ca7fff6..dba693475 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -245,6 +245,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c align.run(module); axes.run(module); layouts.run(module); + liveness.run(module); // ir::print(module, std::cout); selection.run(module, *llvm); // return binary From 7d77f34db0d746c0d0b2f978184e6ccf2ebb5469 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 11 Oct 2019 23:40:27 -0400 Subject: [PATCH 435/494] [codegen] more cleaning --- include/triton/codegen/analysis/layout.h | 13 ++ include/triton/codegen/analysis/liveness.h | 36 +----- lib/codegen/analysis/allocation.cc | 8 +- lib/codegen/analysis/layout.cc | 80 +++++++++++- lib/codegen/analysis/liveness.cc | 138 --------------------- lib/codegen/selection.cc | 17 ++- lib/codegen/transform/membar.cc | 17 ++- 7 files changed, 109 insertions(+), 200 deletions(-) diff --git a/include/triton/codegen/analysis/layout.h b/include/triton/codegen/analysis/layout.h index ba474d96c..01b65e8d2 100644 --- a/include/triton/codegen/analysis/layout.h +++ b/include/triton/codegen/analysis/layout.h @@ -11,8 +11,10 @@ namespace triton{ 
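Patch 435 below moves the double-buffering bookkeeping out of liveness and into layout_shared_t and allocation.cc: a double-buffered shared buffer reserves twice its size, and the loop-latch value is placed half a buffer after the first value so reads and writes ping-pong between the two halves. A standalone rerun of that sizing and offset rule, with made-up sizes (the 32x32 fp16 tile and pad of 8 are hypothetical):

#include <cassert>
#include <cstddef>

int main() {
  // Hypothetical shared tile: 32x32 fp16, pad 8 on the leading dimension.
  std::size_t elem   = 2;                        // fp16 bytes
  std::size_t shape0 = 32 + 8, shape1 = 32;      // padded leading dimension
  std::size_t size   = elem * shape0 * shape1;   // one copy: 2560 bytes
  size *= 2;                                     // double_buffer => two halves
  std::size_t first_off = 0;                     // wherever the allocator put it
  std::size_t latch_off = first_off + size / 2;  // latch lives in the other half
  assert(latch_off == 2560);
  return 0;
}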
namespace ir{ class value; + class type; class module; class instruction; + class phi_node; } namespace codegen{ @@ -27,6 +29,13 @@ enum layout_type_t { SHARED }; +struct double_buffer_info_t { + ir::value* first; + ir::value* latch; + ir::phi_node* phi; +}; + + struct layout_t { layout_t(layout_type_t _type, const std::vector& _axes, @@ -41,6 +50,9 @@ struct layout_t { std::vector order; size_t id; size_t size; + std::shared_ptr double_buffer; + ir::type *ty; + size_t pad; std::vector mts; std::vector nts; std::vector fpw; @@ -70,6 +82,7 @@ struct layout_shared_t: public layout_t { const std::vector& _axes, const std::vector& _shapes, const std::vector &values, + ir::type *ty, size_t _id, analysis::align* align); }; diff --git a/include/triton/codegen/analysis/liveness.h b/include/triton/codegen/analysis/liveness.h index 9b012b5d8..6e5c456b9 100644 --- a/include/triton/codegen/analysis/liveness.h +++ b/include/triton/codegen/analysis/liveness.h @@ -4,6 +4,7 @@ #include #include #include +#include "triton/codegen/analysis/layout.h" #include "triton/tools/graph.h" namespace triton{ @@ -38,60 +39,25 @@ struct segment { } }; -struct double_buffer_info_t { - ir::value* latch; - ir::phi_node* phi; -}; - class liveness { private: typedef std::map indices_map_t; typedef std::map intervals_map_t; - typedef std::map has_storage_map_t; - typedef ir::value* node_t; - typedef std::map > graph_t; - -public: - // Intervals iterators - using iterator = intervals_map_t::iterator; - using const_iterator = intervals_map_t::const_iterator; - - - - -private: - void extract_double_bufferable(ir::instruction *i); - void extract_buffers(ir::instruction *i); - void get_parents(ir::instruction *i, std::vector& res); - void make_graph(ir::instruction *i); - bool do_pad(ir::value *x); - public: liveness(layout *l): layouts_(l){ } - // padding - unsigned get_pad(ir::value *v) const { return pad_.at(v); } - // buffer size - unsigned num_bytes(ir::value *x); // accessors const intervals_map_t& intervals() const { return intervals_; } segment get_interval(layout_t* v) const { return intervals_.at(v); } - // double-buffering - bool has_double(ir::value *x) const { return double_.find(x) != double_.end(); } - double_buffer_info_t get_double(ir::value *x) const { return double_.at(x); } // run void run(ir::module &mod); private: // analysis layout *layouts_; - // stuff - has_storage_map_t has_dedicated_storage_; indices_map_t indices; intervals_map_t intervals_; - std::map double_; - std::map pad_; }; } diff --git a/lib/codegen/analysis/allocation.cc b/lib/codegen/analysis/allocation.cc index 8980aa2b7..df562ff3b 100644 --- a/lib/codegen/analysis/allocation.cc +++ b/lib/codegen/analysis/allocation.cc @@ -102,10 +102,10 @@ void allocation::run(ir::module &mod) { // create offsets for(ir::value *v: x->values){ offsets_[v] = starts[x] + colors[x] * Adj; - if(liveness_->has_double(v)){ - auto info = liveness_->get_double(v); - offsets_[info.latch] = offsets_[v] + x->size / 2; - } + } + if(x->double_buffer){ + auto info = *x->double_buffer; + offsets_[info.latch] = offsets_[info.first] + x->size / 2; } } diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc index 634b857a6..e07ed6d34 100644 --- a/lib/codegen/analysis/layout.cc +++ b/lib/codegen/analysis/layout.cc @@ -4,6 +4,7 @@ #include "triton/codegen/analysis/axes.h" #include "triton/codegen/analysis/align.h" #include "triton/codegen/analysis/layout.h" +#include "triton/codegen/instructions.h" #include "triton/ir/function.h" #include 
"triton/ir/module.h" #include "triton/ir/utils.h" @@ -187,6 +188,20 @@ layout_hmma_884_t::layout_hmma_884_t(size_t num_warps, throw std::runtime_error("cannot create a kernel with this amount of warps"); } +inline bool is_loop_latch(ir::phi_node *phi, ir::instruction *terminator){ + if(phi->get_parent() != terminator->get_parent()) + return false; + if(auto *br = dynamic_cast(terminator)) + return br->get_true_dest() == phi->get_parent() + || br->get_false_dest() == phi->get_parent(); + else if(dynamic_cast(terminator)) + return false; + else + throw std::runtime_error("unreachable"); +} + + + layout_scanline_t::layout_scanline_t(size_t num_warps, const std::vector& _axes, const std::vector& _shapes, @@ -215,17 +230,49 @@ layout_scanline_t::layout_scanline_t(size_t num_warps, throw std::runtime_error("cannot create a kernel with this amount of warps"); } +void extract_double_bufferable(ir::value *v, std::shared_ptr& res) { + auto* phi = dynamic_cast(v); + if(!phi || phi->get_num_incoming() != 2) + return; + ir::basic_block *block_0 = phi->get_incoming_block(0); + ir::basic_block *block_1 = phi->get_incoming_block(1); + ir::instruction *terminator_0 = block_0->get_inst_list().back(); + ir::instruction *terminator_1 = block_1->get_inst_list().back(); + bool is_latch_0 = is_loop_latch(phi, terminator_0); + bool is_latch_1 = is_loop_latch(phi, terminator_1); + ir::value *value_0 = phi->get_incoming_value(0); + ir::value *value_1 = phi->get_incoming_value(1); + ir::instruction *i_0 = dynamic_cast(value_0); + ir::instruction *i_1 = dynamic_cast(value_1); + if(!i_0 || !i_1 || + storage_info.at(i_0->get_id()).first != codegen::SHARED || + storage_info.at(i_1->get_id()).first != codegen::SHARED) + return; + if(is_latch_1) + res.reset(new double_buffer_info_t{value_0, value_1, phi}); + if(is_latch_0) + res.reset(new double_buffer_info_t{value_1, value_0, phi}); +} + + layout_shared_t::layout_shared_t(const layout_t *arg, const std::vector& _axes, const std::vector& _shapes, const std::vector &values, + ir::type *ty, size_t _id, analysis::align* align): layout_t(SHARED, _axes, _shapes, values, _id, align) { + this->ty = ty; size = 0; + + // double-buffering + for(ir::value *v: values) + extract_double_bufferable(v, double_buffer); + + // order if(arg->type == SCANLINE) order = arg->order; - ir::value* dot_a = nullptr; ir::value* dot_b = nullptr; ir::value* hmma_dot_a = nullptr; @@ -238,10 +285,35 @@ layout_shared_t::layout_shared_t(const layout_t *arg, } std::vector col = {0, 1}; std::vector row = {1, 0}; - if(dot_a && !hmma_dot_a) + bool is_nonhmma_dot_a = dot_a && !hmma_dot_a; + bool is_nonhmma_dot_b = dot_b && !hmma_dot_b; + if(is_nonhmma_dot_a) order = is_trans(dot_a) ? row : col; - if(dot_b && !hmma_dot_b) + if(is_nonhmma_dot_b) order = is_trans(dot_b) ? col : row; + + // padding + pad = 0; + if(hmma_dot_a){ + bool row = is_trans(hmma_dot_a) ^ order[0] == 1; + pad = 24 - shapes[row ? 0: 1] % 32; + } + else if(hmma_dot_b){ + bool row = is_trans(hmma_dot_b) ^ order[0] == 1; + pad = 24 - shapes[row ? 
1 : 0] % 32; + } + else if(order != arg->order) { + pad = 16; + } + + // size + auto shape = this->shapes; + shape[order[0]] += pad; + size = ty->get_primitive_size_in_bits() / 8; + for(auto s: shape) + size *= s; + if(double_buffer) + size *= 2; } void layout::create(size_t id, const std::vector& values) { @@ -263,7 +335,7 @@ void layout::create(size_t id, const std::vector& values) { ir::copy_to_shared_inst *cts = (ir::copy_to_shared_inst*)*it_cts; ir::value *arg = cts->get_operand(0); create(groups_.at(arg), values_.at(groups_.at(arg))); - layouts_[id] = new layout_shared_t(get(arg), axes, shapes, values, id, align_); + layouts_[id] = new layout_shared_t(get(arg), axes, shapes, values, largest->get_type()->get_scalar_ty(), id, align_); } else layouts_[id] = new layout_scanline_t(num_warps_, axes, shapes, values, id, align_); diff --git a/lib/codegen/analysis/liveness.cc b/lib/codegen/analysis/liveness.cc index 98af7e039..c2dcc64cc 100644 --- a/lib/codegen/analysis/liveness.cc +++ b/lib/codegen/analysis/liveness.cc @@ -16,150 +16,12 @@ namespace triton{ namespace codegen{ namespace analysis{ -inline bool is_loop_latch(ir::phi_node *phi, ir::instruction *terminator){ - if(phi->get_parent() != terminator->get_parent()) - return false; - if(auto *br = dynamic_cast(terminator)) - return br->get_true_dest() == phi->get_parent() - || br->get_false_dest() == phi->get_parent(); - else if(dynamic_cast(terminator)) - return false; - else - throw std::runtime_error("unreachable"); -} - -void liveness::extract_double_bufferable(ir::instruction *i) { - auto* phi = dynamic_cast(i); - if(!phi || phi->get_num_incoming() != 2) - return; - ir::basic_block *block_0 = phi->get_incoming_block(0); - ir::basic_block *block_1 = phi->get_incoming_block(1); - ir::instruction *terminator_0 = block_0->get_inst_list().back(); - ir::instruction *terminator_1 = block_1->get_inst_list().back(); - bool is_latch_0 = is_loop_latch(phi, terminator_0); - bool is_latch_1 = is_loop_latch(phi, terminator_1); - ir::value *value_0 = phi->get_incoming_value(0); - ir::value *value_1 = phi->get_incoming_value(1); - ir::instruction *i_0 = dynamic_cast(value_0); - ir::instruction *i_1 = dynamic_cast(value_1); - if(!i_0 || !i_1 || storage_info.at(i_0->get_id()).first != codegen::SHARED || storage_info.at(i_1->get_id()).first != codegen::SHARED) - return; - if(is_latch_1) - double_[value_0] = double_buffer_info_t{value_1, phi}; - if(is_latch_0) - double_[value_1] = double_buffer_info_t{value_0, phi}; -} - - -// connected components -bool is_trans(ir::value *v) { - if(dynamic_cast(v)) { - return true; - } - if(auto *phi = dynamic_cast(v)) { - bool result = true; - for(ir::value *op: phi->ops()) - result = result && is_trans(op); - return result; - } - return false; -} - - -bool liveness::do_pad(ir::value *x) { - // alignment for matrix product - if(auto* dot = dynamic_cast(x)) { - // a - ir::value *a = dot->get_operand(0); - ir::value *b = dot->get_operand(1); - size_t a_previous = pad_[a]; - size_t b_previous = pad_[b]; - auto a_order = layouts_->get(a)->order; - auto b_order = layouts_->get(b)->order; - bool a_row = is_trans(a) ^ (a_order[0] == 1); - bool b_row = is_trans(b) ^ (b_order[0] == 1); - auto a_shapes = a->get_type()->get_tile_shapes(); - auto b_shapes = b->get_type()->get_tile_shapes(); - pad_[a] = std::max(pad_[a], (24 - a_shapes[a_row ? 0 : 1]) % 32); - pad_[b] = std::max(pad_[b], (24 - b_shapes[b_row ? 
1 : 0]) % 32); - return a_previous != pad_[a] || b_previous != pad_[b]; - } - // padding for trans - if(auto* trans = dynamic_cast(x)) { - ir::value *op = trans->get_operand(0); - size_t previous = pad_[op]; - pad_[op] = std::max(pad_[op], pad_[x]); - return previous != pad_[op]; - } - // padding for copy to shared - if(auto* cts = dynamic_cast(x)) { - auto cts_order = layouts_->get(cts)->order; - ir::value *arg = cts->get_operand(0); - auto arg_order = layouts_->get(arg)->order; - size_t previous = pad_[cts]; - if(cts_order != arg_order) - pad_[cts] = std::max(pad_[cts], 4); - return pad_[cts] != previous; - } - // padding for phi-nodes - if(auto* phi = dynamic_cast(x)) { - bool has_changed = false; - for(unsigned i = 0; i < phi->get_num_incoming(); i++){ - ir::value* op = phi->get_operand(i); - size_t previous = pad_[op]; - pad_[op] = std::max(pad_[op], pad_[phi]); - has_changed |= previous != pad_[op]; - } - return has_changed; - } - // default -- no padding - size_t previous = pad_[x]; - pad_[x] = std::max(previous, 0); - return pad_[x] != previous; -} - -unsigned liveness::num_bytes(ir::value *x) { - unsigned num_bytes = x->get_type()->get_primitive_size_in_bits() / 8; - unsigned pad = pad_.at(x); - if(pad > 0){ - unsigned ld = x->get_type()->get_tile_shapes()[layouts_->get(x)->order[0]]; - num_bytes += pad * num_bytes / ld; - } - if(has_double(x)) - num_bytes *= 2; - return num_bytes; -} // Entry point void liveness::run(ir::module &mod) { - double_.clear(); indices.clear(); - pad_.clear(); intervals_.clear(); - // Create set of pair of values that can be double-buffered - ir::for_each_instruction(mod, [this](ir::instruction* i) { - this->extract_double_bufferable(i); - }); - - // Padding information - bool has_changed; - do{ - has_changed = false; - ir::for_each_value(mod, [this, &has_changed](ir::value* v){ - has_changed |= this->do_pad(v); - }); - }while(has_changed); - - - // connected components - for(auto &x: layouts_->get_all()) { - layout_t*& layout = x.second; - if(layout->type != SHARED) - continue; - for(ir::value *v: layout->values) - layout->size = std::max(layout->size, num_bytes(v)); - } // Assigns index to each instruction for(ir::function *fn: mod.get_function_list()){ diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index e039f8ec7..bb926949e 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -710,15 +710,15 @@ void selection::create_shared_tile(ir::value *v, IRBuilder<> &builder, Value *sh return; auto order = layouts_->get(v)->order; auto shapes = v->get_type()->get_tile_shapes(); - unsigned pad = liveness_->get_pad(v); + unsigned pad = layouts_->get(v)->pad; if(pad > 0) shapes[order[0]] += pad; Type* ty = llvm_type(v->get_type()->get_scalar_ty(), builder.getContext()); // shared copy PointerType *ptr_ty = ty->getPointerTo(sh_mem_ptr->getType()->getPointerAddressSpace()); // double-buffered - if(liveness_->has_double(v)) { - auto info = liveness_->get_double(v); + if(layouts_->get(v)->double_buffer) { + auto info = *layouts_->get(v)->double_buffer; ir::phi_node *phi = info.phi; BasicBlock *parent = (BasicBlock*)vmap_[phi->get_parent()]; if(parent->empty()) @@ -1532,10 +1532,9 @@ void selection::run(ir::module &src, Module &dst) { } } // finalize double-buffering - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *inst: block->get_inst_list()) { - if(liveness_->has_double(inst)) { - auto info = liveness_->get_double(inst); + for(const auto& x: layouts_->get_all()) { + if(x.second->double_buffer) { + auto info = 
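// A sketch of the double-buffering scheme this loop finalizes, given the
// double_buffer_info_t recorded by the layout pass (`first` = the copy made
// before the loop, `latch` = the copy made on the back-edge, `phi` = their
// merge point):
//
//   shared base + 0        : half A   (offsets_[info.first])
//   shared base + size/2   : half B   (offsets_[info.latch])
//
// layout_shared_t doubled `size` for exactly this reason, and allocation.cc
// placed `latch` half a buffer after `first`; the code below patches the
// phi's pointer and offset so each iteration reads one half while the next
// tile is written into the other.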
*x.second->double_buffer; ir::phi_node *phi = info.phi; PHINode *ptr = (PHINode*)((shared_tile*)tmap_.at(phi))->get_pointer(); PHINode *offset = (PHINode*)((shared_tile*)tmap_.at(phi))->get_offset(); @@ -1550,8 +1549,8 @@ void selection::run(ir::module &src, Module &dst) { offset->addIncoming(next_offset, llvm_inc_block); } else { - unsigned num_bytes = inst->get_type()->get_scalar_ty()->get_primitive_size_in_bits() / 8; - offset->addIncoming(dst_builder.getInt32(layouts_->get(inst)->size / (2*num_bytes)), llvm_inc_block); + unsigned num_bytes = x.second->ty->get_primitive_size_in_bits() / 8; + offset->addIncoming(dst_builder.getInt32(x.second->size / (2*num_bytes)), llvm_inc_block); } ptr->addIncoming(inc_shared->get_pointer(), llvm_inc_block); } diff --git a/lib/codegen/transform/membar.cc b/lib/codegen/transform/membar.cc index d0bd890e8..9a8ad7fd2 100644 --- a/lib/codegen/transform/membar.cc +++ b/lib/codegen/transform/membar.cc @@ -120,18 +120,15 @@ void membar::run(ir::module &mod) { // shared-memory copies. These can be read from and written to // without needing synchronization std::set safe_war; - ir::for_each_instruction(mod, [&](ir::instruction* i){ - if(liveness_->has_double(i)){ - auto info = liveness_->get_double(i); - safe_war.insert(i); + for(const auto& x: layouts_->get_all()){ + if(x.second->double_buffer){ + auto info = *x.second->double_buffer; + safe_war.insert(info.first); safe_war.insert(info.latch); - auto *trans = dynamic_cast(info.latch); - if(trans) - safe_war.insert(trans->get_operand(0)); } - if(i->get_id() == ir::INST_TRANS) - safe_war.insert(i); - }); + } + + for(ir::function *fn: mod.get_function_list()){ std::vector rpo = ir::cfg::reverse_post_order(fn); From 6beef4be1f7dd7dd8e6c2b508cf657757fb0fa36 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 12 Oct 2019 01:25:08 -0400 Subject: [PATCH 436/494] more cleaning --- include/triton/codegen/analysis/liveness.h | 7 +++--- lib/codegen/analysis/allocation.cc | 8 +++---- lib/codegen/analysis/layout.cc | 27 ++++++++++++---------- lib/codegen/analysis/liveness.cc | 5 ++-- 4 files changed, 24 insertions(+), 23 deletions(-) diff --git a/include/triton/codegen/analysis/liveness.h b/include/triton/codegen/analysis/liveness.h index 6e5c456b9..4d5fa3e91 100644 --- a/include/triton/codegen/analysis/liveness.h +++ b/include/triton/codegen/analysis/liveness.h @@ -42,21 +42,20 @@ struct segment { class liveness { private: - typedef std::map indices_map_t; typedef std::map intervals_map_t; public: + // constructor liveness(layout *l): layouts_(l){ } // accessors - const intervals_map_t& intervals() const { return intervals_; } - segment get_interval(layout_t* v) const { return intervals_.at(v); } + const intervals_map_t& get() const { return intervals_; } + segment get(layout_t* v) const { return intervals_.at(v); } // run void run(ir::module &mod); private: // analysis layout *layouts_; - indices_map_t indices; intervals_map_t intervals_; }; diff --git a/lib/codegen/analysis/allocation.cc b/lib/codegen/analysis/allocation.cc index df562ff3b..2474acded 100644 --- a/lib/codegen/analysis/allocation.cc +++ b/lib/codegen/analysis/allocation.cc @@ -22,7 +22,7 @@ void allocation::run(ir::module &mod) { typedef std::multimap triples_map_type; std::vector I; - for(auto x: liveness_->intervals()) + for(auto x: liveness_->get()) I.push_back(x.first); std::vector J = I; @@ -37,7 +37,7 @@ void allocation::run(ir::module &mod) { segment xh = h_it->second; H.erase(h_it); auto j_it = std::find_if(J.begin(), J.end(), [&](layout_t* 
JJ){ - segment xj = liveness_->get_interval(JJ); + segment xj = liveness_->get(JJ); bool res = xj.intersect(xh); for(auto val: H) res = res && !val.second.intersect(xj); @@ -45,7 +45,7 @@ void allocation::run(ir::module &mod) { }); if(j_it != J.end()){ unsigned size = (*j_it)->size; - segment xj = liveness_->get_interval(*j_it); + segment xj = liveness_->get(*j_it); starts[*j_it] = w; H.insert({w + size, segment{max(xh.start, xj.start), min(xh.end, xj.end)}}); if(xh.start < xj.start) @@ -68,7 +68,7 @@ void allocation::run(ir::module &mod) { unsigned NY = y->size; segment XS = {X0, X0 + NX}; segment YS = {Y0, Y0 + NY}; - if(liveness_->get_interval(x).intersect(liveness_->get_interval(y)) + if(liveness_->get(x).intersect(liveness_->get(y)) && XS.intersect(YS)) interferences[x].insert(y); } diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc index e07ed6d34..cfd3b3c47 100644 --- a/lib/codegen/analysis/layout.cc +++ b/lib/codegen/analysis/layout.cc @@ -188,17 +188,6 @@ layout_hmma_884_t::layout_hmma_884_t(size_t num_warps, throw std::runtime_error("cannot create a kernel with this amount of warps"); } -inline bool is_loop_latch(ir::phi_node *phi, ir::instruction *terminator){ - if(phi->get_parent() != terminator->get_parent()) - return false; - if(auto *br = dynamic_cast(terminator)) - return br->get_true_dest() == phi->get_parent() - || br->get_false_dest() == phi->get_parent(); - else if(dynamic_cast(terminator)) - return false; - else - throw std::runtime_error("unreachable"); -} @@ -230,6 +219,19 @@ layout_scanline_t::layout_scanline_t(size_t num_warps, throw std::runtime_error("cannot create a kernel with this amount of warps"); } +inline bool is_loop_latch(ir::phi_node *phi, ir::instruction *terminator){ + if(phi->get_parent() != terminator->get_parent()) + return false; + if(auto *br = dynamic_cast(terminator)) + return br->get_true_dest() == phi->get_parent() + || br->get_false_dest() == phi->get_parent(); + else if(dynamic_cast(terminator)) + return false; + else + throw std::runtime_error("unreachable"); +} + + void extract_double_bufferable(ir::value *v, std::shared_ptr& res) { auto* phi = dynamic_cast(v); if(!phi || phi->get_num_incoming() != 2) @@ -303,7 +305,7 @@ layout_shared_t::layout_shared_t(const layout_t *arg, pad = 24 - shapes[row ? 
1 : 0] % 32; } else if(order != arg->order) { - pad = 16; + pad = 4; } // size @@ -316,6 +318,7 @@ layout_shared_t::layout_shared_t(const layout_t *arg, size *= 2; } +// layout factory method void layout::create(size_t id, const std::vector& values) { auto it_hmma_c = std::find_if(values.begin(), values.end(), &is_hmma_c); auto cmp = [](ir::value* x, ir::value *y) { diff --git a/lib/codegen/analysis/liveness.cc b/lib/codegen/analysis/liveness.cc index c2dcc64cc..707cbaa23 100644 --- a/lib/codegen/analysis/liveness.cc +++ b/lib/codegen/analysis/liveness.cc @@ -17,13 +17,11 @@ namespace codegen{ namespace analysis{ -// Entry point void liveness::run(ir::module &mod) { - indices.clear(); intervals_.clear(); - // Assigns index to each instruction + std::map indices; for(ir::function *fn: mod.get_function_list()){ slot_index index = 0; for(ir::basic_block *block: fn->blocks()) @@ -33,6 +31,7 @@ void liveness::run(ir::module &mod) { } } + // create live intervals for(auto &x: layouts_->get_all()) { layout_t* layout = x.second; if(layout->type != SHARED) From cb12fc1a87a546d2ef36407a63251a31050a4dbe Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 13 Oct 2019 00:25:06 -0400 Subject: [PATCH 437/494] [codegen] adding visitor --- include/triton/codegen/selection.h | 80 +++- include/triton/ir/instructions.h | 39 +- include/triton/ir/visitor.h | 116 +++++ lib/codegen/selection.cc | 732 +++++++++++++++++++++++++++-- 4 files changed, 932 insertions(+), 35 deletions(-) create mode 100644 include/triton/ir/visitor.h diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index 1c367916b..5507d5b8b 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -5,6 +5,7 @@ #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/type.h" +#include "triton/ir/visitor.h" #include "triton/codegen/analysis/layout.h" #include "triton/codegen/transform/cts.h" @@ -24,6 +25,7 @@ namespace llvm{ class Function; } + // typedefs namespace triton{ namespace codegen{ @@ -145,6 +147,82 @@ private: }; +class generator: public ir::visitor { +private: + Type *type(ir::type *ty); + +private: + void visit_hmma_dot(ir::dot_inst*, distributed_tile *TC, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK); + void visit_scanline_dot(ir::dot_inst*, distributed_tile *TC, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK, Type *c_ty, Function *f_mul_add); + void visit_outer_dot(ir::dot_inst*, distributed_tile *TC, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK, + Type *c_ty, Function *f_mul_add); + + void for_each(ir::value *x, const std::function& fn); + void get_value(ir::value *x, const indices_t& idx); + void set_value(ir::value *x, const indices_t& idx, Value* v); + +public: + void visit_phi_node(ir::phi_node*); + void visit_binary_operator(ir::binary_operator*); + void visit_getelementptr_inst(ir::getelementptr_inst*); + + void visit_icmp_inst(ir::icmp_inst*); + void visit_fcmp_inst(ir::fcmp_inst*); + void visit_cast_inst(ir::cast_inst*); + + void visit_return_inst(ir::return_inst*); + void visit_cond_branch_inst(ir::cond_branch_inst*); + void visit_uncond_branch_inst(ir::uncond_branch_inst*); + + + void visit_unmasked_load_inst(ir::unmasked_load_inst*); + void visit_masked_load_inst(ir::masked_load_inst*); + void visit_unmasked_store_inst(ir::unmasked_store_inst*); + void visit_masked_store_inst(ir::masked_store_inst*); + + void visit_retile_inst(ir::retile_inst*); + void 
visit_reshape_inst(ir::reshape_inst*); + void visit_splat_inst(ir::splat_inst*); + void visit_broadcast_inst(ir::broadcast_inst*); + void visit_downcast_inst(ir::downcast_inst*); + + void visit_get_program_id_inst(ir::get_program_id_inst*); + void visit_get_num_program_inst(ir::get_num_program_inst*); + void visit_atomic_cas_inst(ir::atomic_cas_inst*); + void visit_atomic_exch_inst(ir::atomic_exch_inst*); + void visit_atomic_add_inst(ir::atomic_add_inst*); + void visit_dot_inst(ir::dot_inst*); + void visit_trans_inst(ir::trans_inst*); + void visit_sqrt_inst(ir::sqrt_inst*); + void visit_reduce_inst(ir::reduce_inst*); + void visit_select_inst(ir::select_inst*); + + void visit_copy_to_shared_inst(ir::copy_to_shared_inst*); + void visit_copy_from_shared_inst(ir::copy_from_shared_inst*); + void visit_barrier_inst(ir::barrier_inst*); + void visit_make_range_dyn(ir::make_range_dyn*); + void visit_make_range_sta(ir::make_range_sta*); + void visit_make_range(ir::make_range*); + +private: + LLVMContext *ctx_; + Function *fn_; + Builder *builder_; + + std::map vmap_; + std::map tmap_; + target *tgt_; + analysis::layout *layouts_; + analysis::align *alignment_; + analysis::allocation *alloc_; + Value *sh_mem_ptr_; + Value *offset_a_i_, *offset_a_k_; + Value *offset_b_j_, *offset_b_k_; + unsigned num_packs_0_, num_packs_1_; + unsigned pack_size_0_, pack_size_1_; + unsigned num_warps_; +}; + // Selection pass class selection{ typedef std::map vmap_t; @@ -178,7 +256,7 @@ private: void init_layouts(ir::function *fn, Builder &builder, Value *sh_mem_ptr); // lower scalar instruction - void lower_instruction(ir::instruction *src, Builder &builder); + void lower_value(ir::value *src, Builder &builder, std::set& seen); // lower tile instruction void lower_masked_store(ir::masked_store_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); void lower_store(ir::store_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index f59cf95be..6b999f9bb 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -10,10 +10,13 @@ #include "triton/ir/value.h" #include "triton/ir/type.h" #include "triton/ir/metadata.h" +#include "triton/ir/visitor.h" #define _TRITON_DEFINE_CLONE(name) \ ir::instruction* clone_impl() const { return new name(*this); } +#define _TRITON_DEFINE_ACCEPT(name) \ + void accept(visitor* v) { v->visit_ ## name (this); } namespace triton{ namespace ir{ @@ -23,6 +26,7 @@ class constant; class make_range; class basic_block; class context; +class visitor; //===----------------------------------------------------------------------===// // instruction classes @@ -99,6 +103,7 @@ public: static phi_node* create(type *ty, unsigned num_reserved, const std::string &name = "", instruction *next = nullptr); _TRITON_DEFINE_CLONE(phi_node) + _TRITON_DEFINE_ACCEPT(phi_node) private: unsigned num_reserved_; @@ -148,6 +153,7 @@ public: static binary_operator *create_not(value *arg, const std::string &name = "", instruction *next = nullptr); _TRITON_DEFINE_CLONE(binary_operator) + _TRITON_DEFINE_ACCEPT(binary_operator) public: binary_op_t op_; @@ -189,6 +195,7 @@ public: static icmp_inst* create(cmp_pred_t pred, value *lhs, value *rhs, const std::string &name = "", instruction *next = nullptr); _TRITON_DEFINE_CLONE(icmp_inst) + _TRITON_DEFINE_ACCEPT(icmp_inst) }; class fcmp_inst: public cmp_inst { @@ -199,6 +206,7 @@ public: static fcmp_inst* create(cmp_pred_t pred, value *lhs, value *rhs, const 
std::string &name = "", instruction *next = nullptr); _TRITON_DEFINE_CLONE(fcmp_inst) + _TRITON_DEFINE_ACCEPT(fcmp_inst) }; //===----------------------------------------------------------------------===// @@ -236,13 +244,15 @@ public: static cast_inst *create_integer_cast(value *arg, type *ty, bool is_signed, const std::string &name = "", instruction *next = nullptr); + _TRITON_DEFINE_ACCEPT(cast_inst) + private: cast_op_t op_; }; #define TRITON_IR_DECLARE_CAST_INST_SIMPL(name, id, op) \ class name : public cast_inst { \ - _TRITON_DEFINE_CLONE(name); \ + _TRITON_DEFINE_CLONE(name) \ friend class cast_inst; \ name(type *ty, value *v, const std::string &name, instruction *next) \ : cast_inst(ty, id, v, name, next, op){ } \ @@ -287,6 +297,7 @@ public: static return_inst* create(context &ctx, value *ret_val = nullptr, instruction *next = nullptr); _TRITON_DEFINE_CLONE(return_inst) + _TRITON_DEFINE_ACCEPT(return_inst) }; // base branch instruction @@ -315,6 +326,7 @@ public: basic_block *get_false_dest() { return (basic_block*)get_operand(1); } value *get_cond() { return get_operand(2); } _TRITON_DEFINE_CLONE(cond_branch_inst) + _TRITON_DEFINE_ACCEPT(cond_branch_inst) }; // unconditional branch @@ -326,6 +338,7 @@ private: public: basic_block *get_dest() { return (basic_block*)get_operand(0); } _TRITON_DEFINE_CLONE(uncond_branch_inst) + _TRITON_DEFINE_ACCEPT(uncond_branch_inst) }; @@ -354,6 +367,7 @@ public: static getelementptr_inst* create(value *ptr, const std::vector &idx, const std::string &name = "", instruction *next = nullptr); _TRITON_DEFINE_CLONE(getelementptr_inst) + _TRITON_DEFINE_ACCEPT(getelementptr_inst) private: type *source_elt_ty; @@ -395,6 +409,7 @@ public: const std::string &name = "", instruction *next = nullptr); _TRITON_DEFINE_CLONE(unmasked_load_inst) + _TRITON_DEFINE_ACCEPT(unmasked_load_inst) }; // masked load @@ -413,6 +428,7 @@ public: const std::string &name = "", instruction *next = nullptr); _TRITON_DEFINE_CLONE(masked_load_inst) + _TRITON_DEFINE_ACCEPT(masked_load_inst) }; // store @@ -437,6 +453,7 @@ public: const std::string &name = "", instruction *next = nullptr); _TRITON_DEFINE_CLONE(unmasked_store_inst) + _TRITON_DEFINE_ACCEPT(unmasked_store_inst) }; class masked_store_inst: public store_inst{ @@ -453,6 +470,7 @@ public: const std::string &name = "", instruction *next = nullptr); _TRITON_DEFINE_CLONE(masked_store_inst) + _TRITON_DEFINE_ACCEPT(masked_store_inst) }; //===----------------------------------------------------------------------===// @@ -477,6 +495,7 @@ public: static instruction* create(value *arg, const type::tile_shapes_t &shape_suffix, const std::string &name = "", instruction *next = nullptr); _TRITON_DEFINE_CLONE(reshape_inst) + _TRITON_DEFINE_ACCEPT(reshape_inst) }; // splat @@ -490,6 +509,7 @@ public: static instruction* create(value *arg, const type::tile_shapes_t &shape_suffix, const std::string &name = "", instruction *next = nullptr); _TRITON_DEFINE_CLONE(splat_inst) + _TRITON_DEFINE_ACCEPT(splat_inst) }; // broadcast @@ -503,6 +523,7 @@ public: static instruction* create(value *arg, const type::tile_shapes_t &shape_suffix, const std::string &name = "", instruction *next = nullptr); _TRITON_DEFINE_CLONE(broadcast_inst) + _TRITON_DEFINE_ACCEPT(broadcast_inst) }; @@ -516,6 +537,7 @@ private: public: static instruction* create(value *arg, const std::string &name = "", instruction *next = nullptr); _TRITON_DEFINE_CLONE(downcast_inst) + _TRITON_DEFINE_ACCEPT(downcast_inst) }; 
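// A minimal sketch of the double dispatch that _TRITON_DEFINE_ACCEPT wires
// up. The macro stamps `void accept(visitor* v) { v->visit_phi_node(this); }`
// (and so on, per class) onto each concrete instruction, so a pass can
// recover dynamic types without chains of dyn_casts. The `count_phis` name
// below is hypothetical, for illustration only; a real subclass would also
// have to override the remaining pure-virtual visit_* methods:
//
//   struct count_phis: public ir::visitor {
//     unsigned n = 0;
//     void visit_phi_node(ir::phi_node*) { n++; }
//     // ... overrides for every other visit_* ...
//   };
//
//   count_phis pass;
//   for(ir::instruction *i: block->get_inst_list())
//     i->accept(&pass);  // dispatches on the dynamic type of *i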
//===----------------------------------------------------------------------===// @@ -536,6 +558,7 @@ public: static instruction* create(context &ctx, unsigned axis, const std::string &name = "", instruction *next = nullptr); unsigned get_axis() const { return axis_; } _TRITON_DEFINE_CLONE(get_program_id_inst) + _TRITON_DEFINE_ACCEPT(get_program_id_inst) private: unsigned axis_; @@ -550,6 +573,7 @@ public: static instruction* create(context &ctx, unsigned axis, const std::string &name = "", instruction *next = nullptr); unsigned get_axis() const { return axis_; } _TRITON_DEFINE_CLONE(get_num_program_inst) + _TRITON_DEFINE_ACCEPT(get_num_program_inst) private: unsigned axis_; @@ -560,6 +584,7 @@ private: atomic_cas_inst(value *ptr, value *cmp, value *val, const std::string &name, instruction *next); std::string repr_impl() const { return "atomic_cas"; } _TRITON_DEFINE_CLONE(atomic_cas_inst) + _TRITON_DEFINE_ACCEPT(atomic_cas_inst) public: static instruction* create(value *ptr, value *cmp, value *val, const std::string &name = "", instruction *next = nullptr); @@ -570,6 +595,7 @@ private: atomic_exch_inst(value *ptr, value *val, const std::string &name = "", instruction *next = nullptr); std::string repr_impl() const { return "atomic_exch"; } _TRITON_DEFINE_CLONE(atomic_exch_inst) + _TRITON_DEFINE_ACCEPT(atomic_exch_inst) public: static instruction* create(value *ptr, value *val, const std::string &name = "", instruction *next = nullptr); @@ -580,6 +606,7 @@ private: atomic_add_inst(value *ptr, value *val, const std::string &name = "", instruction *next = nullptr); std::string repr_impl() const { return "atomic_add"; } _TRITON_DEFINE_CLONE(atomic_add_inst) + _TRITON_DEFINE_ACCEPT(atomic_add_inst) public: static instruction* create(value *ptr, value *val, const std::string &name = "", instruction *next = nullptr); @@ -600,6 +627,7 @@ public: static instruction* create_tn(value *A, value *B, value *C, const std::string &name = "", instruction *next = nullptr); static instruction* create_tt(value *A, value *B, value *C, const std::string &name = "", instruction *next = nullptr); _TRITON_DEFINE_CLONE(dot_inst) + _TRITON_DEFINE_ACCEPT(dot_inst) }; //class outer_inst: public builtin_inst { @@ -622,6 +650,7 @@ public: static instruction* create(value *arg, const std::vector &perm = {}, const std::string &name = "", instruction *next = nullptr); const std::vector get_perm() const; _TRITON_DEFINE_CLONE(trans_inst) + _TRITON_DEFINE_ACCEPT(trans_inst) private: std::vector perm_; @@ -634,6 +663,7 @@ private: public: static instruction* create(value *arg, const std::string &name = "", instruction *next = nullptr); _TRITON_DEFINE_CLONE(sqrt_inst) + _TRITON_DEFINE_ACCEPT(sqrt_inst) }; class reduce_inst: public builtin_inst { @@ -644,6 +674,7 @@ private: reduce_inst(value* arg, unsigned axis, const std::string& name, instruction* next); std::string repr_impl() const { return "reduce"; } _TRITON_DEFINE_CLONE(reduce_inst) + _TRITON_DEFINE_ACCEPT(reduce_inst) public: static instruction* create(value *arg, unsigned axis, const std::string &name = "", instruction *next = nullptr); @@ -658,6 +689,7 @@ private: select_inst(value *pred, value *if_value, value *else_value, const std::string& name, instruction* next); std::string repr_impl() const { return "select"; } _TRITON_DEFINE_CLONE(select_inst) + _TRITON_DEFINE_ACCEPT(select_inst) public: static instruction* create(value *pred, value *if_value, value *else_value, const std::string &name = "", instruction *next = nullptr); @@ -676,6 +708,7 @@ public: static 
copy_to_shared_inst* create(value *arg, const std::string &name = "", instruction *next = nullptr); _TRITON_DEFINE_CLONE(copy_to_shared_inst) + _TRITON_DEFINE_ACCEPT(copy_to_shared_inst) }; class copy_from_shared_inst: public unary_inst{ @@ -687,6 +720,7 @@ public: static copy_from_shared_inst* create(value *arg, const std::string &name = "", instruction *next = nullptr); _TRITON_DEFINE_CLONE(copy_from_shared_inst) + _TRITON_DEFINE_ACCEPT(copy_from_shared_inst) }; class barrier_inst: public instruction{ @@ -694,6 +728,7 @@ private: barrier_inst(context &ctx, const std::string &name, instruction *next); std::string repr_impl() const { return "barrier"; } _TRITON_DEFINE_CLONE(barrier_inst) + _TRITON_DEFINE_ACCEPT(barrier_inst) public: static barrier_inst* create(context &ctx, const std::string &name = "", @@ -708,6 +743,7 @@ private: make_range_dyn(type *ty, const std::string &name, instruction *next); std::string repr_impl() const { return "nv_dynamic_program_idx"; } _TRITON_DEFINE_CLONE(make_range_dyn) + _TRITON_DEFINE_ACCEPT(make_range_dyn) public: static make_range_dyn* create(type *ty, const std::string &name = "", instruction *next = nullptr); @@ -732,6 +768,7 @@ class make_range: public instruction{ make_range(type *ty, constant_int* first, constant_int* last); std::string repr_impl() const { return "make_range[" + first_->repr() + " : " + last_->repr() + "]"; } _TRITON_DEFINE_CLONE(make_range) + _TRITON_DEFINE_ACCEPT(make_range) public: static make_range *create(constant_int *first, constant_int *last); diff --git a/include/triton/ir/visitor.h b/include/triton/ir/visitor.h new file mode 100644 index 000000000..645c2cbe7 --- /dev/null +++ b/include/triton/ir/visitor.h @@ -0,0 +1,116 @@ +#pragma once + +#ifndef _TRITON_IR_VISITOR_H_ +#define _TRITON_IR_VISITOR_H_ + +namespace triton{ +namespace ir{ + + +class phi_node; +class binary_operator; +class getelementptr_inst; + +class icmp_inst; +class fcmp_inst; +class trunc_inst; +class z_ext_inst; +class s_ext_inst; +class fp_trunc_inst; +class fp_ext_inst; +class ui_to_fp_inst; +class si_to_fp_inst; +class fp_to_ui_inst; +class fp_to_si_inst; +class ptr_to_int_inst; +class int_to_ptr_inst; +class bit_cast_inst; +class addr_space_cast_inst; + +class return_inst; +class cond_branch_inst; +class uncond_branch_inst; + + +class unmasked_load_inst; +class masked_load_inst; +class unmasked_store_inst; +class masked_store_inst; + +class retile_inst; +class reshape_inst; +class splat_inst; +class broadcast_inst; +class downcast_inst; + +class get_program_id_inst; +class get_num_program_inst; +class atomic_cas_inst; +class atomic_exch_inst; +class atomic_add_inst; +class dot_inst; +class trans_inst; +class sqrt_inst; +class reduce_inst; +class select_inst; + +class copy_to_shared_inst; +class copy_from_shared_inst; +class barrier_inst; +class make_range_dyn; +class make_range_sta; +class make_range; + + + +class visitor { +public: + virtual ~visitor() {} + + virtual void visit_phi_node(phi_node*) = 0; + virtual void visit_binary_operator(binary_operator*) = 0; + virtual void visit_getelementptr_inst(getelementptr_inst*) = 0; + + virtual void visit_icmp_inst(icmp_inst*) = 0; + virtual void visit_fcmp_inst(fcmp_inst*) = 0; + virtual void visit_cast_inst(trunc_inst*) = 0; + + virtual void visit_return_inst(return_inst*) = 0; + virtual void visit_cond_branch_inst(cond_branch_inst*) = 0; + virtual void visit_uncond_branch_inst(uncond_branch_inst*) = 0; + + + virtual void visit_unmasked_load_inst(unmasked_load_inst*) = 0; + virtual void 
visit_masked_load_inst(masked_load_inst*) = 0; + virtual void visit_unmasked_store_inst(unmasked_store_inst*) = 0; + virtual void visit_masked_store_inst(masked_store_inst*) = 0; + + virtual void visit_retile_inst(retile_inst*) = 0; + virtual void visit_reshape_inst(reshape_inst*) = 0; + virtual void visit_splat_inst(splat_inst*) = 0; + virtual void visit_broadcast_inst(broadcast_inst*) = 0; + virtual void visit_downcast_inst(downcast_inst*) = 0; + + virtual void visit_get_program_id_inst(get_program_id_inst*) = 0; + virtual void visit_get_num_program_inst(get_num_program_inst*) = 0; + virtual void visit_atomic_cas_inst(atomic_cas_inst*) = 0; + virtual void visit_atomic_exch_inst(atomic_exch_inst*) = 0; + virtual void visit_atomic_add_inst(atomic_add_inst*) = 0; + virtual void visit_dot_inst(dot_inst*) = 0; + virtual void visit_trans_inst(trans_inst*) = 0; + virtual void visit_sqrt_inst(sqrt_inst*) = 0; + virtual void visit_reduce_inst(reduce_inst*) = 0; + virtual void visit_select_inst(select_inst*) = 0; + + virtual void visit_copy_to_shared_inst(copy_to_shared_inst*) = 0; + virtual void visit_copy_from_shared_inst(copy_from_shared_inst*) = 0; + virtual void visit_barrier_inst(barrier_inst*) = 0; + virtual void visit_make_range_dyn(make_range_dyn*) = 0; + virtual void visit_make_range_sta(make_range_sta*) = 0; + virtual void visit_make_range(make_range*) = 0; +}; + +} +} + +#endif diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index bb926949e..a4e2067a5 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -759,24 +759,7 @@ void selection::create_distributed_tile(ir::value *v, IRBuilder<> &builder) { } } distributed_tile *T = new distributed_tile(ty, shapes, layouts_->get(v)->order, axes, builder, false); - bool is_inserted = tmap_.insert({v, T}).second; - // constant range - if(is_inserted && dynamic_cast(v)){ - T->for_each([&](indices_t idx){ - assert(idx.size() == 1); - T->set_value(idx, idx[0]); - }); - } - if(is_inserted && dynamic_cast(v)){ - T->for_each([&](indices_t idx){ - assert(idx.size() == 1); - BinaryOperator *bin_add = dyn_cast(idx[0]); - assert(bin_add); - Value *res = bin_add->getOperand(1); - assert(isa(res)); - T->set_value(idx, res); - }); - } + tmap_.insert({v, T}); } void selection::create_tile(ir::value *v, IRBuilder<> &builder, @@ -1408,14 +1391,56 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & lower_elementwise(ins, ctx, fn, builder); } -void selection::lower_instruction(ir::instruction *src, IRBuilder<> &builder) { - if(src->has_tile_result_or_op()) { - lower_tile_instruction(src, builder); +void selection::lower_value(ir::value *src, IRBuilder<> &builder, std::set& seen) { + if(!seen.insert(src).second) + return; + + auto *inst = dynamic_cast(src); + if(inst && !dynamic_cast(src)) + for(ir::value *op: inst->ops()) + lower_value(op, builder, seen); + + BasicBlock *current = builder.GetInsertBlock(); + auto *phi = dynamic_cast(src); + bool phi_inserted = phi && !current->empty(); + if(phi_inserted && current->getFirstNonPHI()) + builder.SetInsertPoint(&*current->getFirstNonPHI()); + + + if(dynamic_cast(src)){ + distributed_tile *T = (distributed_tile *)tmap_.at(src); + T->for_each([&](indices_t idx){ + assert(idx.size() == 1); + T->set_value(idx, idx[0]); + }); } - else { - Instruction *i = (Instruction*)llvm_value(src, builder); + else if(dynamic_cast(src)){ + distributed_tile *T = (distributed_tile *)tmap_.at(src); + T->for_each([&](indices_t idx){ + assert(idx.size() == 1); + 
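// The per-lane indices built for a 1-D axis have the shape
//   idx[0] = add(thread_dependent_base, constant_lane_offset)
// (hence the assert above and the dyn_cast below; this is an inference from
// the surrounding asserts, not spelled out elsewhere). For a statically
// known range the interesting part is the constant operand of that add, so
// it is peeled off and stored as the tile value; the dynamic variant keeps
// operand 0, the thread-dependent part, instead (see visit_make_range_dyn
// further down).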
BinaryOperator *bin_add = dyn_cast(idx[0]); + assert(bin_add); + Value *res = bin_add->getOperand(1); + assert(isa(res)); + T->set_value(idx, res); + }); + } + else if(inst && inst->has_tile_result_or_op()) { + lower_tile_instruction(inst, builder); + } + else if(inst){ + Instruction *i = (Instruction*)llvm_value(inst, builder); vmap_[src] = i; } + + if(phi_inserted && current->getFirstNonPHI()) + builder.SetInsertPoint(current); + +// if(dynamic_cast(src)) +// for(ir::value *op: inst->ops()) +// lower_value(op, builder, seen); + + } /* ---------------------------- @@ -1508,29 +1533,29 @@ void selection::run(ir::module &src, Module &dst) { vmap_[x] = llvm_alloc_const(x, &dst, dst_builder); // iterate over functions + std::set seen; + for(ir::function *fn: src.get_function_list()) { + // create LLVM function llvm_fn(fn, dst_builder, dst); + // allocate shared memory sh_mem_ptr_ = alloc_shared(dst_builder, dst); + // initialize layouts init_layouts(fn, dst_builder, sh_mem_ptr_); + // generate LLVM-IR code std::map last_block; for(ir::basic_block *block: fn->blocks()) { BasicBlock *parent = (BasicBlock*)vmap_[block]; dst_builder.SetInsertPoint(parent); - for(ir::instruction *i: block->get_inst_list()){ - BasicBlock *current = dst_builder.GetInsertBlock(); - bool phi_inserted = (dynamic_cast(i)) && !current->empty(); - if(phi_inserted && current->getFirstNonPHI()) - dst_builder.SetInsertPoint(&*current->getFirstNonPHI()); - lower_instruction(i, dst_builder); - if(phi_inserted && current->getFirstNonPHI()) - dst_builder.SetInsertPoint(current); - last_block[block] = dst_builder.GetInsertBlock(); - } + for(ir::instruction *i: block->get_inst_list()) + lower_value(i, dst_builder, seen); + last_block[block] = dst_builder.GetInsertBlock(); } + // finalize double-buffering for(const auto& x: layouts_->get_all()) { if(x.second->double_buffer) { @@ -1588,5 +1613,646 @@ void selection::run(ir::module &src, Module &dst) { } +/* ----------------------------------------------------- + * + * + * + * + * + * + * + * + * + * + * ------------------------------------------------------ */ + + + +void generator::visit_phi_node(ir::phi_node* phi) { + Type *ty = type(phi->get_type()->get_scalar_ty()); + unsigned num_ops = phi->get_num_operands(); + for_each(phi, [&](indices_t idx){ + set_value(phi, idx, builder_->Insert(PHINode::Create(ty, num_ops))); + }); +} + +void generator::visit_binary_operator(ir::binary_operator*binop) { + for_each(binop, [&](indices_t idx){ + Value *lhs = get_value(binop->get_operand(0), idx); + Value *rhs = get_value(binop->get_operand(1), idx); + Value *ret = builder_->Insert(BinaryOperator::Create(llvm_op(binop->get_op()), lhs, rhs)); + set_value(binop, idx, ret); + }); +} + +void generator::visit_getelementptr_inst(ir::getelementptr_inst* gep) { + for_each(gep, [&](indices_t idx){ + Value *ptr = get_value(gep->get_operand(0), idx); + std::vector idx_vals; + std::transform(gep->idx_begin(), gep->idx_end(), std::back_inserter(idx_vals), + [&](ir::value* x){ return get_value(x, idx);}); + Type *source_ty = type(gep->get_source_elt_ty()->get_scalar_ty()); + Value *ret = builder_->Insert(GetElementPtrInst::CreateInBounds(source_ty, ptr, idx_vals)); + set_value(gep, idx, ret); + }); +} + +void generator::visit_icmp_inst(ir::icmp_inst* icmp) { + for_each(icmp, [&](indices_t idx){ + ir::cmp_pred_t pred = icmp->get_pred(); + Value *lhs = get_value(icmp->get_operand(0), idx); + Value *rhs = get_value(icmp->get_operand(1), idx); + Value *ret = builder_->Insert(CmpInst::Create(Instruction::ICmp, 
llvm_pred(pred), lhs, rhs)); + set_value(icmp, idx, ret); + }); +} + +void generator::visit_fcmp_inst(ir::fcmp_inst* fcmp) { + for_each(fcmp, [&](indices_t idx){ + ir::cmp_pred_t pred = fcmp->get_pred(); + Value *lhs = get_value(fcmp->get_operand(0), idx); + Value *rhs = get_value(fcmp->get_operand(1), idx); + Value *ret = builder_->Insert(FCmpInst::Create(Instruction::FCmp, llvm_pred(pred), lhs, rhs)); + set_value(fcmp, idx, ret); + }); +} + +void generator::visit_cast_inst(ir::cast_inst* cast) { + for_each(cast, [&](indices_t idx){ + Value *arg = get_value(cast->get_operand(0), idx); + Type *dst_ty = type(cast->get_type()->get_scalar_ty()); + Value *ret = builder_->Insert(CastInst::Create(llvm_op(cast->get_op()), arg, dst_ty)); + set_value(cast, idx, ret); + }); +} + +void generator::visit_return_inst(ir::return_inst* rr) { + ir::value *ret_val = rr->get_return_value(); + builder_->Insert(ReturnInst::Create(*ctx_, ret_val ? ret_val : nullptr)); +} + +void generator::visit_cond_branch_inst(ir::cond_branch_inst* br) { + BasicBlock *true_dest = vmap_.at(br->get_true_dest()); + BasicBlock *false_dest = vmap_.at(br->get_false_dest()); + Value *cond = vmap_.at(br->get_cond()); + builder_->Insert(BranchInst::Create(true_dest, false_dest, cond)); +} + +void generator::visit_uncond_branch_inst(ir::uncond_branch_inst* br) { + BasicBlock *dest = vmap_.at(br->get_dest()); + builder_->Insert(BranchInst::Create(dest)); +} + + +void generator::visit_unmasked_load_inst(ir::unmasked_load_inst* x) { + distributed_tile* result = (distributed_tile*)tmap_.at(x); + // find vector size + ir::value *ptr = x->get_pointer_operand(); + size_t ld = layouts_->get(ptr)->order[0]; + unsigned alignment = alignment_->get(ptr, ld); + unsigned vector_size = std::min(result->axis(ld).contiguous, alignment); + distributed_tile *pointers = (distributed_tile*)tmap_.at(ptr); + // vector loads + std::map packets; + result->for_each([&](indices_t idx){ + unsigned linear = result->get_linear_index(idx); + unsigned id = linear / vector_size; + if(linear % vector_size == 0) { + Value *ptr = pointers->get_value(idx); + ptr = builder_->CreateBitCast(ptr, PointerType::get(VectorType::get(result->get_ty(), vector_size), + ptr->getType()->getPointerAddressSpace())); + packets[id] = builder_->CreateLoad(ptr); + } + }); + // extract result element + result->for_each([&](indices_t idx){ + unsigned linear = result->get_linear_index(idx); + unsigned id = linear / vector_size; + result->set_value(idx, builder_->CreateExtractElement(packets.at(id), linear % vector_size)); + }); +} + +void generator::visit_masked_load_inst(ir::masked_load_inst* x) { + // find vector size + distributed_tile* result = (distributed_tile*)tmap_.at(x); + ir::value *ptr = x->get_pointer_operand(); + size_t ld = layouts_->get(ptr)->order[0]; + unsigned alignment = alignment_->get(ptr, ld); + unsigned vector_size = std::min(result->axis(ld).contiguous, alignment); + distributed_tile *pointers = (distributed_tile*)tmap_.at(ptr); + distributed_tile *masks = (distributed_tile*)tmap_.at(x->get_mask_operand()); + distributed_tile *false_values = (distributed_tile*)tmap_.at(x->get_false_value_operand()); + std::map packets; + result->for_each([&](indices_t idx){ + unsigned linear = result->get_linear_index(idx); + unsigned id = linear / vector_size; + if(linear % vector_size == 0) { + Value *ptr = pointers->get_value(idx); + + + ptr = builder_->CreateBitCast(ptr, PointerType::get(VectorType::get(result->get_ty(), vector_size), + ptr->getType()->getPointerAddressSpace())); + 
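// Vectorization sketch: vector_size was chosen above as
//   min(contiguous elements per thread along the leading dim, known alignment)
// so each group of vector_size lanes shares one wide load. Bitcasting the
// scalar pointer to <vector_size x ty>* means that, for example, four
// contiguous fp32 elements behind a 16-byte-aligned pointer become a single
// 128-bit load, cached in `packets` and sliced apart by the extract loop at
// the end of the visitor.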
Value *mask = masks->get_value(idx); + BasicBlock *current_bb = builder_->GetInsertBlock(); + const Function *parent = builder_->GetInsertBlock()->getParent(); + BasicBlock *mask_then_bb = BasicBlock::Create(*ctx_, "mask_then", parent); + BasicBlock *mask_done_bb = BasicBlock::Create(*ctx_, "mask_done", parent); + builder_->CreateCondBr(mask, mask_then_bb, mask_done_bb); + builder_->SetInsertPoint(mask_then_bb); + Value *result_then = builder_->CreateLoad(ptr); + builder_->CreateBr(mask_done_bb); + builder_->SetInsertPoint(mask_done_bb); + Value *current_result = nullptr; + if(false_values){ + current_result = builder_->CreatePHI(result_then->getType(), 2); + ((PHINode*)current_result)->addIncoming(result_then, mask_then_bb); + Value *result_false = false_values->get_value(idx); + if(result_then->getType()->isVectorTy()) + result_false = builder_->CreateVectorSplat(vector_size, llvm::UndefValue::get(result_false->getType())); + ((PHINode*)current_result)->addIncoming(result_false, current_bb); + } + else + current_result = result_then; + +// ConstantInt *cst = nullptr; +// if(GetElementPtrInst *gep = dyn_cast(ptr)) +// if(gep->getNumIndices() == 1) +// cst = dyn_cast(gep->idx_begin()); +// llvm::Value* mask = masks->get_value(idx); +// std::string offset = ""; +// if(cst) +// offset = " + " + std::to_string(cst->getValue().getSExtValue()*2*vector_size); +// Type *fp16x2_ty = VectorType::get(builder_->getHalfTy(), 2); +// Type *fp16x2_pack4_ty = StructType::get(ctx, {fp16x2_ty, fp16x2_ty, fp16x2_ty, fp16x2_ty}); +// FunctionType *ty = FunctionType::get(fp16x2_pack4_ty, {mask->getType(), ptr->getType()}, false); +// std::string asm_str = "@$0 ld.global.nc.b32 {$1, $2, $3, $4}, [$5" + offset + "];"; +// if(false_values) +// asm_str += "\n\t@!$0 mov.v4.b32 {$1, $2, $3, $4}, {0, 0, 0, 0};"; +// InlineAsm *iasm = InlineAsm::get(ty, asm_str, "b,=r,=r,=r,=r,l", true); +// Value *current_result = builder_->CreateCall(iasm, {mask, ptr}); + + packets[id] = current_result; + } + }); + // extract result element + result->for_each([&](indices_t idx){ + unsigned linear = result->get_linear_index(idx); + unsigned id = linear / vector_size; +// Value *tmp = builder_->CreateExtractValue(packets.at(id), {(linear % vector_size) / 2}); +// Value *res = builder_->CreateExtractElement(tmp, (linear % vector_size) % 2); +// result->set_value(idx, res); + result->set_value(idx, builder_->CreateExtractElement(packets.at(id), linear % vector_size)); + }); +} + +void generator::visit_unmasked_store_inst(ir::unmasked_store_inst* st) { + for_each(st->get_pointer_operand(), [&](indices_t idx){ + Value *ptr = get_value(st->get_pointer_operand(), idx); + Value *val = get_value(st->get_value_operand(), idx); + builder_->CreateStore(val, ptr); + }); +} + +void generator::visit_masked_store_inst(ir::masked_store_inst* st) { + distributed_tile* ptrs = (distributed_tile*)tmap_.at(st->get_pointer_operand()); + distributed_tile* scalars = (distributed_tile*)tmap_.at(st->get_value_operand()); + ir::value *mask = st->get_mask_operand(); + distributed_tile* preds = (distributed_tile*)tmap_.at(mask); + ptrs->for_each([&](indices_t idx){ + Value *scalar = scalars->get_value(idx); + Value *ptr = ptrs->get_value(idx); + Value *pred = preds->get_value(idx); + const Function *parent = builder_->GetInsertBlock()->getParent(); + BasicBlock *mask_then_bb = BasicBlock::Create(*ctx_, "mask_then", parent); + BasicBlock *mask_done_bb = BasicBlock::Create(*ctx_, "mask_done", parent); + builder_->CreateCondBr(pred, mask_then_bb, mask_done_bb); + 
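// Per-packet control flow emitted here, roughly (block names follow the
// BasicBlock::Create calls above):
//
//   current_bb:
//     br i1 %mask, label %mask_then, label %mask_done
//   mask_then:
//     %loaded = load <vector_size x ty>, <vector_size x ty>* %ptr
//     br label %mask_done
//   mask_done:
//     %res = phi [ %loaded, %mask_then ], [ %masked_off, %current_bb ]
//
// so memory is touched only when the mask is set; %masked_off is derived
// from the false-value operand when one is given, otherwise the then-value
// is used unconditionally.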
builder_->SetInsertPoint(mask_then_bb); + builder_->CreateStore(scalar, ptr); + builder_->CreateBr(mask_done_bb); + builder_->SetInsertPoint(mask_done_bb); +// std::string offset = ""; +// if(GetElementPtrInst *gep = dyn_cast(ptr)) +// if(gep->getNumIndices() == 1) +// if(ConstantInt *cst = dyn_cast(gep->idx_begin())){ +// offset = " + " + std::to_string(cst->getValue().getSExtValue()*4); +// } +// FunctionType *ty = FunctionType::get(Type::getVoidTy(ctx), {pred->getType(), ptr->getType(), scalar->getType()}, false); +// std::string asm_str = "@$0 st.global.b32 [$1" + offset + "], $2;"; +// InlineAsm *iasm = InlineAsm::get(ty, asm_str, "b,l,f", true); +// builder.CreateCall(iasm, {pred, ptr, scalar}); + }); +} + + +void generator::visit_reshape_inst(ir::reshape_inst* reshape) { + distributed_tile* result = (distributed_tile*)tmap_.at(reshape); + ir::value* in = reshape->get_operand(0); + distributed_tile *in_tile = (distributed_tile*)tmap_.at(in); + for_each(reshape, [&](indices_t out_idx){ + unsigned pos = result->get_linear_index(out_idx); + indices_t in_idx = in_tile->get_ordered_indices(pos); + result->set_value(out_idx, in_tile->get_value(in_idx)); + }); +} + +void generator::visit_splat_inst(ir::splat_inst* splat) { + Value *in = get_value(splat->get_operand(0), {}); + for_each(splat, [&](indices_t idx){ + set_value(splat, idx, in); + }); +} + +void generator::visit_broadcast_inst(ir::broadcast_inst* bcast) { + distributed_tile* result = (distributed_tile*)tmap_.at(bcast); + ir::value* in = bcast->get_operand(0); + const auto& in_shapes = in->get_type()->get_tile_shapes(); + distributed_tile *in_tile = (distributed_tile*)tmap_.at(in); + result->for_each([&](indices_t out_idx){ + indices_t in_idx = out_idx; + for(size_t k = 0; k < in_idx.size(); k++){ + if(in_shapes[k] == 1) + in_idx[k] = builder_->getInt32(0); + } + result->set_value(out_idx, in_tile->get_value(in_idx)); + }); +} + +void generator::visit_downcast_inst(ir::downcast_inst* x) { + vmap_[x] = tmap_[x->get_operand(0)]->get_value({builder_->getInt32(0)}); +} + +void generator::visit_get_program_id_inst(ir::get_program_id_inst* pid) { + Module &module = builder_->GetInsertBlock()->getModule(); + Value *ret = tgt_->get_block_id(module, *builder_, pid->get_axis()); + vmap_[pid] = ret; +} + +void generator::visit_get_num_program_inst(ir::get_num_program_inst* np) { + Module &module = builder_->GetInsertBlock()->getModule(); + Value *ret = tgt_->get_num_blocks(module, *builder_, np->get_axis()); + vmap_[np] = ret; +} + +void generator::visit_atomic_cas_inst(ir::atomic_cas_inst* cas) { + BasicBlock *current = builder_->GetInsertBlock(); + Module *module = current->getModule(); + Value *tid = tgt_->get_local_id(module, *builder_, 0); + Value *pred = builder_->CreateICmpEQ(tid, builder_->getInt32(0)); + BasicBlock *tid_0_bb = BasicBlock::Create(*ctx_, "tid_0", current->getParent()); + BasicBlock *tid_0_done_bb = BasicBlock::Create(*ctx_, "tid_0_done", current->getParent()); + Value *ptr = builder_->CreateGEP(sh_mem_ptr_, builder_->getInt32(alloc_->offset(cas))); + ptr = builder_->CreateBitCast(ptr, PointerType::get(builder_->getInt32Ty(), ptr->getType()->getPointerAddressSpace())); + tgt_->add_memfence(module, *builder_); + tgt_->add_barrier(module, *builder_); + builder_->CreateCondBr(pred, tid_0_bb, tid_0_done_bb); + builder_->SetInsertPoint(tid_0_bb); + Value *cas_ptr = vmap_.at(cas->get_operand(0)); + Value *cas_cmp = vmap_.at(cas->get_operand(1)); + Value *cas_val = vmap_.at(cas->get_operand(2)); + Value *old = 
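// Atomic-CAS lowering in brief: every thread reaches the fence/barrier pair,
// but only tid 0 takes the tid_0 block and issues the cmpxchg; the old value
// it extracts is parked in the shared-memory scratch slot reserved by the
// allocator (alloc_->offset(cas)), and after the second fence/barrier pair
// every thread re-loads it from there, broadcasting the result block-wide.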
builder_->CreateAtomicCmpXchg(cas_ptr, cas_cmp, cas_val, AtomicOrdering::Monotonic, AtomicOrdering::Monotonic); + old = builder_->CreateExtractValue(old, {0}); + builder_->CreateStore(old, ptr); + builder_->CreateBr(tid_0_done_bb); + builder_->SetInsertPoint(tid_0_done_bb); + tgt_->add_memfence(module, *builder_); + tgt_->add_barrier(module, *builder_); + Value *res = builder_->CreateLoad(ptr); + return (Instruction*)res; +} + +void generator::visit_atomic_exch_inst(ir::atomic_exch_inst* xchg) { + BasicBlock *current = builder_->GetInsertBlock(); + Module *module = current->getModule(); + Value *rmw_ptr = vmap_.at(xchg->get_operand(0)); + Value *rmw_val = vmap_.at(xchg->get_operand(1)); + Value *tid = tgt_->get_local_id(module, *builder_, 0); + Value *pred = builder_->CreateICmpEQ(tid, builder_->getInt32(0)); + BasicBlock *tid_0_bb = BasicBlock::Create(*ctx_, "tid_0", current->getParent()); + BasicBlock *tid_0_done_bb = BasicBlock::Create(*ctx_, "tid_0_done", current->getParent()); + tgt_->add_memfence(module, *builder_); + tgt_->add_barrier(module, *builder_); + builder_->CreateCondBr(pred, tid_0_bb, tid_0_done_bb); + builder_->SetInsertPoint(tid_0_bb); + Value *res = builder_->CreateAtomicRMW(AtomicRMWInst::Xchg, rmw_ptr, rmw_val, AtomicOrdering::Monotonic, SyncScope::System); + builder_->CreateBr(tid_0_done_bb); + builder_->SetInsertPoint(tid_0_done_bb); + tgt_->add_memfence(module, *builder_); + tgt_->add_barrier(module, *builder_); + return (Instruction*)res; +} + +void generator::visit_atomic_add_inst(ir::atomic_add_inst*) { + throw std::runtime_error("unsupported"); +} + +void generator::visit_hmma_dot(ir::dot_inst* dot, distributed_tile *TC, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK) { + const auto& shapes = dot->get_type()->get_tile_shapes(); + + TA->set_vector_size(4*pack_size_0_); + TB->set_vector_size(4*pack_size_1_); + TA->set_return_mode(true); + TB->set_return_mode(true); + + std::map, std::vector> fcs; + + TC->for_each([&](indices_t idx){ + std::vector key(idx.size() - 2); + std::copy(idx.begin() + 2, idx.end(), key.begin()); + fcs[key].push_back(TD->get_value(idx)); + }); + + Type *fp32_ty = builder_->getFloatTy(); + Type *fp16x2_ty = VectorType::get(builder_->getHalfTy(), 2); + Type *fp32_pack8_ty = StructType::get(ctx, {fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}); + FunctionType *mma_ty = FunctionType::get(fp32_pack8_ty, {fp16x2_ty, fp16x2_ty, fp16x2_ty, fp16x2_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}, false); + + Value *offset_a_i = offset_a_i_; + Value *offset_a_k = offset_a_k_; + Value *offset_b_j = offset_b_j_; + Value *offset_b_k = offset_b_k_; + + Value* u_thread_id = tgt_->get_local_id(builder_->GetInsertBlock()->getModule(), *builder_, 0); + + auto ord_a = layouts_->get(dot->get_operand(0))->order; + auto ord_b = layouts_->get(dot->get_operand(1))->order; + + bool is_a_trans = is_trans(dot->get_operand(0)); + bool is_b_trans = is_trans(dot->get_operand(1)); + bool is_a_row = is_a_trans ^ (ord_a[ord_a.size() - 2] == 1); + bool is_b_row = is_b_trans ^ (ord_b[ord_b.size() - 2] == 1); + + + if(is_a_row){ + offset_a_i = builder_->CreateAdd(offset_a_i, builder_->CreateURem(u_thread_id, builder_->getInt32(4))); + offset_a_k = builder_->getInt32(0); + } + if(!is_b_row){ + offset_b_j = builder_->CreateAdd(offset_b_j, builder_->CreateURem(u_thread_id, builder_->getInt32(4))); + offset_b_k = builder_->getInt32(0); + } + + std::string op_a = is_a_row ? 
"row" : "col"; + std::string op_b = is_b_row ? "row" : "col"; + + InlineAsm *mma_fn = InlineAsm::get(mma_ty, " mma.sync.aligned.m8n8k4." + op_a + "." + op_b + ".f32.f16.f16.f32 " + "{$0, $1, $2, $3, $4, $5, $6, $7}, " + "{$8, $9}, " + "{$10, $11}, " + "{$0, $1, $2, $3, $4, $5, $6, $7};", "=f,=f,=f,=f,=f,=f,=f,=f,r,r,r,r,0,1,2,3,4,5,6,7", false); + + unsigned fpw_0 = layouts_->get(dot)->fpw.at(0); + unsigned fpw_1 = layouts_->get(dot)->fpw.at(1); + unsigned wts_0 = fpw_0 * 8; + unsigned wts_1 = fpw_1 * 8; + unsigned wpt_0 = layouts_->get(dot)->wpt.at(0); + unsigned wpt_1 = layouts_->get(dot)->wpt.at(1); + unsigned stride_rep_i = wpt_0 * wts_0; + unsigned stride_rep_j = wpt_1 * wts_1; + unsigned num_rep_i = shapes[0] / stride_rep_i; + unsigned ld_fc = num_rep_i * 2; + + + for(auto& x: fcs){ + std::vector& fc = x.second; + for(unsigned pack_i = 0; pack_i < num_packs_0_; pack_i++) + for(unsigned pack_j = 0; pack_j < num_packs_1_; pack_j++){ + for(unsigned K = 0; K < NK; K += 4){ + Value *_K = builder_->getInt32(K); + Value *current_offset_a_i = builder_->CreateAdd(offset_a_i, builder_->getInt32(pack_i*stride_rep_i*pack_size_0_)); + Value *current_offset_b_i = builder_->CreateAdd(offset_b_j, builder_->getInt32(pack_j*stride_rep_j*pack_size_1_)); + indices_t idx_a = {current_offset_a_i, builder_->CreateAdd(offset_a_k, _K)}; + indices_t idx_b = {builder_->CreateAdd(offset_b_k, _K), current_offset_b_i}; + idx_a.insert(idx_a.end(), x.first.begin(), x.first.end()); + idx_b.insert(idx_b.end(), x.first.begin(), x.first.end()); + Value *ha = TA->get_value(idx_a); + Value *hb = TB->get_value(idx_b); + for(unsigned ii = 0; ii < pack_size_0_; ii++) + for(unsigned jj = 0; jj < pack_size_1_; jj++){ + Value *ha0 = builder_->CreateBitCast(builder_->CreateExtractElement(ha, builder_->getInt32(ii*pack_size_0_ + 0)), fp16x2_ty); + Value *ha1 = builder_->CreateBitCast(builder_->CreateExtractElement(ha, builder_->getInt32(ii*pack_size_0_ + 1)), fp16x2_ty); + Value *hb0 = builder_->CreateBitCast(builder_->CreateExtractElement(hb, builder_->getInt32(jj*pack_size_0_ + 0)), fp16x2_ty); + Value *hb1 = builder_->CreateBitCast(builder_->CreateExtractElement(hb, builder_->getInt32(jj*pack_size_0_ + 1)), fp16x2_ty); + std::vector idx = { + (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 0)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 1)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 0)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 1)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 2)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 3)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 2)*ld_fc, + (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 3)*ld_fc + }; + Value *nc = builder_->CreateCall(mma_fn, {ha0, ha1, hb0, hb1, fc[idx[0]], fc[idx[1]], fc[idx[2]], fc[idx[3]], fc[idx[4]], fc[idx[5]], fc[idx[6]], fc[idx[7]]}); + fc[idx[0]] = builder_->CreateExtractValue(nc, {0}); + fc[idx[1]] = builder_->CreateExtractValue(nc, {1}); + fc[idx[2]] = builder_->CreateExtractValue(nc, {2}); + fc[idx[3]] = builder_->CreateExtractValue(nc, {3}); + fc[idx[4]] = builder_->CreateExtractValue(nc, {4}); + fc[idx[5]] = builder_->CreateExtractValue(nc, {5}); + fc[idx[6]] = builder_->CreateExtractValue(nc, {6}); + fc[idx[7]] = builder_->CreateExtractValue(nc, {7}); + } + } + } + } + + // write back + unsigned i = 0; + 
TC->for_each([&](indices_t idx){ + std::vector key(idx.size() - 2); + std::copy(idx.begin() + 2, idx.end(), key.begin()); + if(i >= fcs.at(key).size()) + i = 0; + TC->set_value(idx, fcs.at(key)[i++]); + }); + + TA->set_return_mode(false); + TB->set_return_mode(false); + +} +void generator::visit_scanline_dot(ir::dot_inst* dot, distributed_tile *TC, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK, + Type *c_ty, Function *f_mul_add) { + TA->set_vector_size(TC->axis(0).contiguous); + TB->set_vector_size(TC->axis(1).contiguous); + TC->for_each([&](indices_t idx){ + Value *res = TD->get_value(idx); + for(unsigned K = 0; K < NK; ++K){ + // input indices + indices_t a_idx = {idx[0], builder_->getInt32(K)}; + indices_t b_idx = {builder_->getInt32(K), idx[1]}; + // add batching dimension + for(size_t i = 2; i < idx.size(); i++){ + a_idx.insert(a_idx.end(), idx[i]); + b_idx.insert(b_idx.end(), idx[i]); + } + // load value + Value *a = TA->get_value(a_idx); + Value *b = TB->get_value(b_idx); + if(a->getType() != c_ty) + a = builder_->CreateFPCast(a, c_ty); + if(b->getType() != c_ty) + b = builder_->CreateFPCast(b, c_ty); + res = builder_->CreateCall(f_mul_add, {a, b, res}); + } + TC->set_value(idx, res); + }); +} + +void generator::visit_outer_dot(ir::dot_inst*, distributed_tile *TC, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK, + Type *c_ty, Function *f_mul_add) { + TC->for_each([&](indices_t idx){ + Value *res = TD->get_value(idx); + indices_t a_idx = {idx[0], builder_->getInt32(0)}; + indices_t b_idx = {builder_->getInt32(0), idx[1]}; + std::swap(a_idx[0], a_idx[1]); + std::swap(b_idx[0], b_idx[1]); + Value *a = TA->get_value(a_idx); + Value *b = TB->get_value(b_idx); + if(a->getType() != c_ty) + a = builder_->CreateFPCast(a, c_ty); + if(b->getType() != c_ty) + b = builder_->CreateFPCast(b, c_ty); + res = builder_->CreateCall(f_mul_add, {a, b, res}); + TC->set_value(idx, res); + }); +} + +void generator::visit_dot_inst(ir::dot_inst* dot) { + Function *fn = builder_->GetInsertBlock()->getParent(); + + distributed_tile* TC = (distributed_tile*)tmap_.at(dot); + Module *module = fn->getParent(); + ir::value *A = dot->get_operand(0); + ir::value *B = dot->get_operand(1); + ir::value *D = dot->get_operand(2); + + distributed_tile *TD = (distributed_tile*)tmap_.at(D); + Type *c_ty = type(D->get_type()->get_scalar_ty(), *ctx_); + Function *f_mul_add = Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {c_ty}); + auto A_shapes = A->get_type()->get_tile_shapes(); + size_t red_axis = 1; + unsigned NK = A_shapes[red_axis]; + + if(NK != 1) { + shared_tile *TA = (shared_tile*)tmap_.at(A); + shared_tile *TB = (shared_tile*)tmap_.at(B); + if(layouts_->get(dot)->type == analysis::HMMA_884) + visit_hmma_dot(dot, TC, TA, TB, TD, NK); + else + visit_scanline_dot(dot, TC, TA, TB, TD, NK, c_ty, f_mul_add); + } + else { + distributed_tile *TA = (distributed_tile*)tmap_.at(A); + distributed_tile *TB = (distributed_tile*)tmap_.at(B); + visit_outer_dot(dot, TC, TA, TB, TD, NK, c_ty, f_mul_add); + } +} + +void generator::visit_trans_inst(ir::trans_inst* trans) { + shared_tile* in = (shared_tile*)tmap_.at(trans->get_operand(0)); + shared_tile* out = new shared_tile(in->get_ty(), in->get_shapes(), in->get_order(), in->get_pointer(), *builder_, in->get_offset(), trans->get_perm()); + tmap_[trans] = out; +} + +void generator::visit_sqrt_inst(ir::sqrt_inst* sqrt) { + for_each(sqrt, [&](indices_t idx){ + Value *val = get_value(sqrt->get_operand(0), idx); + Module* module = 
+
+void generator::visit_sqrt_inst(ir::sqrt_inst* sqrt) {
+  for_each(sqrt, [&](indices_t idx){
+    Value *val = get_value(sqrt->get_operand(0), idx);
+    Module* module = builder_->GetInsertBlock()->getModule();
+    Value *sqrt = Intrinsic::getDeclaration(module, Intrinsic::sqrt, {val->getType()});
+    Value *ret = builder_->CreateCall(sqrt, {val});
+    set_value(sqrt, idx, ret);
+  });
+}
+
+void generator::visit_reduce_inst(ir::reduce_inst*) {
+  throw std::runtime_error("not implemented");
+}
+
+void generator::visit_select_inst(ir::select_inst* select) {
+  for_each(select, [&](indices_t idx){
+    Value *pred = get_value(select->get_operand(0), idx);
+    Value *if_value = get_value(select->get_operand(1), idx);
+    Value *else_value = get_value(select->get_operand(2), idx);
+    Value *ret = builder_->Insert(SelectInst::Create(pred, if_value, else_value));
+    set_value(select, idx, ret);
+  });
+
+}
+
+void generator::visit_copy_to_shared_inst(ir::copy_to_shared_inst* cts) {
+  unsigned vector_size = 1;
+  auto x_order = layouts_->get(cts)->order;
+  ir::value *arg = cts->get_operand(0);
+  auto arg_order = layouts_->get(arg)->order;
+  // tiles
+  shared_tile* result = (shared_tile*)tmap_.at(cts);
+  distributed_tile* in = (distributed_tile*)tmap_.at(arg);
+  if(x_order == arg_order){
+    size_t ld = arg_order[0];
+    vector_size = layouts_->get(arg)->nts.at(ld);
+  }
+
+  std::map<unsigned, Value*> packets;
+  in->for_each([&](indices_t idx){
+    unsigned linear = in->get_linear_index(idx);
+    unsigned id = linear / vector_size;
+    Value *in_value = in->get_value(idx);
+    if(linear % vector_size == 0)
+      packets[id] = UndefValue::get(VectorType::get(in_value->getType(), vector_size));
+    packets[id] = builder_->CreateInsertElement(packets.at(id), in_value, linear % vector_size);
+  });
+  in->for_each([&](indices_t idx){
+    unsigned linear = in->get_linear_index(idx);
+    unsigned id = linear / vector_size;
+    if(linear % vector_size == 0)
+      result->set_value(idx, packets[id]);
+  });
+}
+
+void generator::visit_copy_from_shared_inst(ir::copy_from_shared_inst* cfs) {
+  distributed_tile* result = (distributed_tile*)tmap_.at(cfs);
+  shared_tile* arg = (shared_tile*)tmap_.at(cfs->get_operand(0));
+  result->for_each([&](indices_t idx){
+    result->set_value(idx, arg->get_value(idx));
+  });
+}
+
+void generator::visit_barrier_inst(ir::barrier_inst*) {
+  Module *module = builder_->GetInsertBlock()->getModule();
+  tgt_->add_barrier(module, *builder_);
+}
+
+void generator::visit_make_range_dyn(ir::make_range_dyn* x) {
+  distributed_tile* result = (distributed_tile*)tmap_.at(x);
+  result->for_each([&](indices_t idx){
+    assert(idx.size() == 1);
+    BinaryOperator *bin_add = dyn_cast<BinaryOperator>(idx[0]);
+    assert(bin_add);
+    Value *res = bin_add->getOperand(0);
+    result->set_value(idx, res);
+  });
+}
+
+void generator::visit_make_range_sta(ir::make_range_sta* x) {
+  distributed_tile *T = (distributed_tile *)tmap_.at(x);
+  T->for_each([&](indices_t idx){
+    assert(idx.size() == 1);
+    BinaryOperator *bin_add = dyn_cast<BinaryOperator>(idx[0]);
+    assert(bin_add);
+    Value *res = bin_add->getOperand(1);
+    assert(isa<Constant>(res));
+    T->set_value(idx, res);
+  });
+}
+
+void generator::visit_make_range(ir::make_range* x) {
+  distributed_tile *T = (distributed_tile *)tmap_.at(x);
+  T->for_each([&](indices_t idx){
+    assert(idx.size() == 1);
+    T->set_value(idx, idx[0]);
+  });
+}
+
+
 }
 }

From e787ce0cab6f561a2be05ec468e0b6e3d461f056 Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Sun, 13 Oct 2019 02:26:30 -0400
Subject: [PATCH 438/494] [codegen] more cleaning

---
 include/triton/codegen/selection.h |  70 +--
 include/triton/ir/instructions.h   |   2 +
 include/triton/ir/visitor.h        |   6 +-
 lib/codegen/selection.cc           | 872 +++--------------------------
 lib/driver/module.cc               |   1 +
 5 files changed, 125 insertions(+), 826 deletions(-)
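[Editor's note] This patch replaces selection's dynamic_cast-based lower_*
dispatch with the ir::visitor interface: ir::instruction gains a pure virtual
accept(visitor*), and the generator implements one visit_* method per
instruction kind, so lowering proceeds by double dispatch. The standalone
sketch below illustrates the shape being adopted; the names dot_inst and
sqrt_inst mirror the real classes, everything else is simplified for
illustration and is not part of the patch.

  #include <iostream>

  // forward declarations so instruction subclasses can be visited
  struct dot_inst;
  struct sqrt_inst;

  struct visitor {
    virtual void visit_dot_inst(dot_inst*) = 0;
    virtual void visit_sqrt_inst(sqrt_inst*) = 0;
    virtual ~visitor() = default;
  };

  struct instruction {
    virtual void accept(visitor* v) = 0;   // the hook added by this patch
    virtual ~instruction() = default;
  };

  struct dot_inst: instruction {
    void accept(visitor* v) override { v->visit_dot_inst(this); }
  };
  struct sqrt_inst: instruction {
    void accept(visitor* v) override { v->visit_sqrt_inst(this); }
  };

  // plays the role of codegen::generator
  struct generator: visitor {
    void visit_dot_inst(dot_inst*) override   { std::cout << "lower dot\n"; }
    void visit_sqrt_inst(sqrt_inst*) override { std::cout << "lower sqrt\n"; }
  };

  int main() {
    dot_inst d;
    sqrt_inst s;
    generator gen;
    instruction* prog[] = { &d, &s };
    for (instruction* i : prog)
      i->accept(&gen);   // double dispatch replaces the dynamic_cast chain
  }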
diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h
index 5507d5b8b..279c7475e 100644
--- a/include/triton/codegen/selection.h
+++ b/include/triton/codegen/selection.h
@@ -148,20 +148,41 @@ private:
 class generator: public ir::visitor {
-private:
-  Type *type(ir::type *ty);
-
 private:
   void visit_hmma_dot(ir::dot_inst*, distributed_tile *TC, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK);
   void visit_scanline_dot(ir::dot_inst*, distributed_tile *TC, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK, Type *c_ty, Function *f_mul_add);
-  void visit_outer_dot(ir::dot_inst*, distributed_tile *TC, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK,
+  void visit_outer_dot(ir::dot_inst*, distributed_tile *TC, distributed_tile *TA, distributed_tile *TB, distributed_tile *TD, unsigned NK,
                        Type *c_ty, Function *f_mul_add);
+  Type *type(ir::type *ty);
   void for_each(ir::value *x, const std::function<void(indices_t)>& fn);
-  void get_value(ir::value *x, const indices_t& idx);
+  Value* get_value(ir::value *x, const indices_t& idx);
   void set_value(ir::value *x, const indices_t& idx, Value* v);
 
 public:
+
+  generator(LLVMContext *ctx,
+            Function *fn,
+            Builder *builder,
+            std::map<ir::value *, Value *>& vmap,
+            std::map<ir::value *, tile *>& tmap,
+            target *tgt,
+            analysis::layout *layouts,
+            analysis::align *alignment,
+            analysis::allocation *alloc,
+            Value *sh_mem_ptr,
+            Value *offset_a_i, Value *offset_a_k,
+            Value *offset_b_j, Value *offset_b_k,
+            unsigned num_packs_0, unsigned num_packs_1,
+            unsigned pack_size_0, unsigned pack_size_1,
+            unsigned num_warps)
+    : ctx_(ctx), fn_(fn), builder_(builder), vmap_(vmap), tmap_(tmap), tgt_(tgt),
+      layouts_(layouts), alignment_(alignment), alloc_(alloc), sh_mem_ptr_(sh_mem_ptr),
+      offset_a_i_(offset_a_i), offset_a_k_(offset_a_k), offset_b_j_(offset_b_j), offset_b_k_(offset_b_k),
+      num_packs_0_(num_packs_0), num_packs_1_(num_packs_1), pack_size_0_(pack_size_0), pack_size_1_(pack_size_1),
+      num_warps_(num_warps) { }
+
+
   void visit_phi_node(ir::phi_node*);
   void visit_binary_operator(ir::binary_operator*);
   void visit_getelementptr_inst(ir::getelementptr_inst*);
@@ -180,7 +201,6 @@ public:
   void visit_unmasked_store_inst(ir::unmasked_store_inst*);
   void visit_masked_store_inst(ir::masked_store_inst*);
 
-  void visit_retile_inst(ir::retile_inst*);
   void visit_reshape_inst(ir::reshape_inst*);
   void visit_splat_inst(ir::splat_inst*);
   void visit_broadcast_inst(ir::broadcast_inst*);
@@ -209,8 +229,8 @@ private:
   Function *fn_;
   Builder *builder_;
-  std::map<ir::value *, Value *> vmap_;
-  std::map<ir::value *, tile *> tmap_;
+  std::map<ir::value *, Value *>& vmap_;
+  std::map<ir::value *, tile *>& tmap_;
   target *tgt_;
   analysis::layout *layouts_;
   analysis::align *alignment_;
@@ -235,8 +255,6 @@ private:
   // LLVM conversions
   Type* llvm_type(ir::type *ty, LLVMContext &ctx);
-  Value* llvm_value(ir::value *v, Builder &builder);
-  Instruction* llvm_inst(ir::instruction *inst, std::function<Value*(ir::value*)> value, Builder &builder);
   Constant* llvm_constant(ir::constant *cst, LLVMContext &ctx);
   Value* llvm_alloc_const(ir::alloc_const *v, Module *module, Builder &builder);
   ArrayType* llvm_linearized_tile_type(ir::type *ty, LLVMContext &ctx);
@@ -256,37 +274,7 @@ private:
   void init_layouts(ir::function *fn, Builder &builder, Value *sh_mem_ptr);
 
   // lower scalar instruction
-  void lower_value(ir::value *src, Builder &builder, std::set<ir::value*>& seen);
-  // lower tile instruction
-  void lower_masked_store(ir::masked_store_inst *x, LLVMContext &ctx, Function *fn, Builder &builder);
-  void lower_store(ir::store_inst *x, LLVMContext
&ctx, Function *fn, Builder &builder); - void lower_downcast(ir::downcast_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); - void lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); - void lower_dynamic_program_idx(ir::make_range_dyn *x, LLVMContext &ctx, Function *fn, Builder &builder); - void lower_reshape(ir::reshape_inst* x, LLVMContext &ctx, Function *fn, Builder &builder); - void lower_splat(ir::splat_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); - void lower_broadcast(ir::broadcast_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); - void lower_copy_to_shared(ir::copy_to_shared_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); - void lower_copy_from_shared(ir::copy_from_shared_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); - void lower_trans(ir::trans_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); - // matrix multiply - void lower_hmma_dot(ir::dot_inst *x, LLVMContext &ctx, Function *fn, Builder &builder, - distributed_tile *TC, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK); - void lower_scanline_dot(ir::dot_inst *x, LLVMContext &ctx, Function *fn, Builder &builder, - distributed_tile *TC, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK, - Type *c_ty, Function *f_mul_add); - void lower_outer_dot(ir::dot_inst *x, LLVMContext &ctx, Function *fn, Builder &builder, - distributed_tile *TC, distributed_tile *TA, distributed_tile *TB, distributed_tile *TD, - Type *c_ty, Function *f_mul_add); - void lower_dot(ir::dot_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); - // load - void lower_masked_load(ir::masked_load_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); - void lower_load(ir::load_inst *x, LLVMContext &ctx, Function *fn, Builder &builder); - // element-wise - void lower_elementwise(ir::instruction *x, LLVMContext &ctx, Function *fn, Builder &builder); - void lower_tile_instruction(ir::instruction *src, Builder &builder); - - + void lower_value(ir::value *src, Builder &builder, generator* gen, std::set& seen); public: selection(analysis::liveness* liveness, analysis::allocation *alloc, diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index 6b999f9bb..bbc75c63c 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -71,6 +71,8 @@ public: } // instruction id value_id_t get_id() const { return id_; } + // visit + virtual void accept(visitor *v) = 0; private: basic_block *parent_; diff --git a/include/triton/ir/visitor.h b/include/triton/ir/visitor.h index 645c2cbe7..ffe8d734c 100644 --- a/include/triton/ir/visitor.h +++ b/include/triton/ir/visitor.h @@ -3,9 +3,11 @@ #ifndef _TRITON_IR_VISITOR_H_ #define _TRITON_IR_VISITOR_H_ + namespace triton{ namespace ir{ +class instruction; class phi_node; class binary_operator; @@ -13,6 +15,7 @@ class getelementptr_inst; class icmp_inst; class fcmp_inst; +class cast_inst; class trunc_inst; class z_ext_inst; class s_ext_inst; @@ -73,7 +76,7 @@ public: virtual void visit_icmp_inst(icmp_inst*) = 0; virtual void visit_fcmp_inst(fcmp_inst*) = 0; - virtual void visit_cast_inst(trunc_inst*) = 0; + virtual void visit_cast_inst(cast_inst*) = 0; virtual void visit_return_inst(return_inst*) = 0; virtual void visit_cond_branch_inst(cond_branch_inst*) = 0; @@ -85,7 +88,6 @@ public: virtual void visit_unmasked_store_inst(unmasked_store_inst*) = 0; virtual void visit_masked_store_inst(masked_store_inst*) = 0; - virtual void 
visit_retile_inst(retile_inst*) = 0; virtual void visit_reshape_inst(reshape_inst*) = 0; virtual void visit_splat_inst(splat_inst*) = 0; virtual void visit_broadcast_inst(broadcast_inst*) = 0; diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index a4e2067a5..9d95e0a41 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -354,150 +354,6 @@ Constant *selection::llvm_constant(ir::constant *cst, LLVMContext &ctx) { throw std::runtime_error("unknown conversion from ir::constant to Constant"); } -/* convert ir::instruction to llvm::Instruction */ -Instruction *selection::llvm_inst(ir::instruction *inst, std::function value, IRBuilder<> &builder) { - LLVMContext & ctx = builder.getContext(); - auto block = [&](ir::basic_block *x) { return (BasicBlock*)vmap_.at(x); }; - auto type = [&](ir::type *x) { return llvm_type(x, ctx); }; - if(auto* ii = dynamic_cast(inst)){ - BasicBlock *true_dest = block(ii->get_true_dest()); - BasicBlock *false_dest = block(ii->get_false_dest()); - Value *cond = value(ii->get_cond()); - return builder.Insert(BranchInst::Create(true_dest, false_dest, cond)); - } - if(auto* ii = dynamic_cast(inst)){ - BasicBlock *dest = block(ii->get_dest()); - return builder.Insert(BranchInst::Create(dest)); - } - if(dynamic_cast(inst)){ - Module *module = builder.GetInsertBlock()->getModule(); - return tgt_->add_barrier(module, builder); - } - if(auto* ii = dynamic_cast(inst)){ - Type *ty = type(ii->get_type()->get_scalar_ty()); - unsigned num_ops = ii->get_num_operands(); - return builder.Insert(PHINode::Create(ty, num_ops)); - } - if(auto* ii = dynamic_cast(inst)){ - ir::value *ret_val = ii->get_return_value(); - return builder.Insert(ReturnInst::Create(ctx, ret_val?value(ret_val):nullptr)); - } - if(auto* ii = dynamic_cast(inst)){ - Value *lhs = value(ii->get_operand(0)); - Value *rhs = value(ii->get_operand(1)); - return builder.Insert(BinaryOperator::Create(llvm_op(ii->get_op()), lhs, rhs)); - } - if(auto* ii = dynamic_cast(inst)){ - ir::cmp_pred_t pred = ii->get_pred(); - Value *lhs = value(ii->get_operand(0)); - Value *rhs = value(ii->get_operand(1)); - return builder.Insert(CmpInst::Create(Instruction::ICmp, llvm_pred(pred), lhs, rhs)); - } - if(auto* ii = dynamic_cast(inst)){ - ir::cmp_pred_t pred = ii->get_pred(); - Value *lhs = value(ii->get_operand(0)); - Value *rhs = value(ii->get_operand(1)); - return builder.Insert(FCmpInst::Create(Instruction::FCmp, llvm_pred(pred), lhs, rhs)); - } - if(auto* ii = dynamic_cast(inst)){ - Value *arg = value(ii->get_operand(0)); - Type *dst_ty = type(ii->get_type()->get_scalar_ty()); - return builder.Insert(CastInst::Create(llvm_op(ii->get_op()), arg, dst_ty)); - } - if(auto* ii = dynamic_cast(inst)){ - // get pointer - Value *ptr = value(ii->get_operand(0)); - // reassociate first index - std::vector idx_vals; - std::transform(ii->idx_begin(), ii->idx_end(), std::back_inserter(idx_vals), - [&value](ir::value* x){ return value(x);}); - Type *source_ty = type(ii->get_source_elt_ty()->get_scalar_ty()); - return builder.Insert(GetElementPtrInst::CreateInBounds(source_ty, ptr, idx_vals)); - } - if(ir::load_inst* ii = dynamic_cast(inst)){ - Value *ptr = value(ii->get_pointer_operand()); - LoadInst *result = new LoadInst(ptr); - return builder.Insert(result); - } - if(ir::store_inst* ii = dynamic_cast(inst)){ - Value *val = value(ii->get_value_operand()); - Value *ptr = value(ii->get_pointer_operand()); - builder.CreateStore(val, ptr); - return nullptr; - } - if(ir::select_inst* ii = dynamic_cast(inst)){ - Value 
*pred = value(ii->get_operand(0)); - Value *if_value = value(ii->get_operand(1)); - Value *else_value = value(ii->get_operand(2)); - return builder.Insert(SelectInst::Create(pred, if_value, else_value)); - } - if(ir::get_program_id_inst* ii = dynamic_cast(inst)){ - Value *result = tgt_->get_block_id(builder.GetInsertBlock()->getModule(), builder, ii->get_axis()); - return (Instruction*)result; - } - if(ir::get_num_program_inst* ii = dynamic_cast(inst)){ - Value *result = tgt_->get_num_blocks(builder.GetInsertBlock()->getModule(), builder, ii->get_axis()); - return (Instruction*)result; - } - if(ir::atomic_cas_inst* ii = dynamic_cast(inst)){ - BasicBlock *current = builder.GetInsertBlock(); - Module *module = current->getModule(); - Value *tid = tgt_->get_local_id(module, builder, 0); - Value *pred = builder.CreateICmpEQ(tid, builder.getInt32(0)); - BasicBlock *tid_0_bb = BasicBlock::Create(ctx, "tid_0", current->getParent()); - BasicBlock *tid_0_done_bb = BasicBlock::Create(ctx, "tid_0_done", current->getParent()); - Value *ptr = builder.CreateGEP(sh_mem_ptr_, builder.getInt32(alloc_->offset(ii))); - ptr = builder.CreateBitCast(ptr, PointerType::get(builder.getInt32Ty(), ptr->getType()->getPointerAddressSpace())); - tgt_->add_memfence(module, builder); - tgt_->add_barrier(module, builder); - builder.CreateCondBr(pred, tid_0_bb, tid_0_done_bb); - builder.SetInsertPoint(tid_0_bb); - Value *cas_ptr = value(ii->get_operand(0)); - Value *cas_cmp = value(ii->get_operand(1)); - Value *cas_val = value(ii->get_operand(2)); - Value *old = builder.CreateAtomicCmpXchg(cas_ptr, cas_cmp, cas_val, AtomicOrdering::Monotonic, AtomicOrdering::Monotonic); - old = builder.CreateExtractValue(old, {0}); - builder.CreateStore(old, ptr); - builder.CreateBr(tid_0_done_bb); - builder.SetInsertPoint(tid_0_done_bb); - tgt_->add_memfence(module, builder); - tgt_->add_barrier(module, builder); - Value *res = builder.CreateLoad(ptr); - return (Instruction*)res; - } - if(ir::atomic_exch_inst* ii = dynamic_cast(inst)){ - BasicBlock *current = builder.GetInsertBlock(); - Module *module = current->getModule(); - Value *rmw_ptr = value(ii->get_operand(0)); - Value *rmw_val = value(ii->get_operand(1)); - Value *tid = tgt_->get_local_id(module, builder, 0); - Value *pred = builder.CreateICmpEQ(tid, builder.getInt32(0)); - BasicBlock *tid_0_bb = BasicBlock::Create(ctx, "tid_0", current->getParent()); - BasicBlock *tid_0_done_bb = BasicBlock::Create(ctx, "tid_0_done", current->getParent()); - tgt_->add_memfence(module, builder); - tgt_->add_barrier(module, builder); - builder.CreateCondBr(pred, tid_0_bb, tid_0_done_bb); - builder.SetInsertPoint(tid_0_bb); - Value *res = builder.CreateAtomicRMW(AtomicRMWInst::Xchg, rmw_ptr, rmw_val, AtomicOrdering::Monotonic, SyncScope::System); - builder.CreateBr(tid_0_done_bb); - builder.SetInsertPoint(tid_0_done_bb); - tgt_->add_memfence(module, builder); - tgt_->add_barrier(module, builder); - return (Instruction*)res; - } - if(ir::atomic_add_inst* ii = dynamic_cast(inst)){ - throw std::runtime_error("unsupported"); - } - if(ir::sqrt_inst* ii = dynamic_cast(inst)){ - Value *val = value(ii->get_operand(0)); - Value *sqrt = Intrinsic::getDeclaration(builder.GetInsertBlock()->getModule(), Intrinsic::sqrt, {val->getType()}); - Value *res = builder.CreateCall(sqrt, {val}); - return (Instruction*)res; - } - // unknown instruction - throw std::runtime_error("unknown conversion from ir::instruction to Instruction"); -} - /* convert ir::alloc_const to llvm::GlobalVariable */ Value* 
selection::llvm_alloc_const(ir::alloc_const *v, Module *module, IRBuilder<> &builder) { unsigned size = ((ir::constant_int*)v->get_operand(0))->get_value(); @@ -508,37 +364,6 @@ Value* selection::llvm_alloc_const(ir::alloc_const *v, Module *module, IRBuilder return builder.CreateBitCast(array, element_ty->getPointerTo(4)); } -/* convert ir::value to llvm::Value */ -Value* selection::llvm_value(ir::value *v, IRBuilder<> &builder) { - assert(!v->get_type()->is_tile_ty()); - LLVMContext &ctx = builder.getContext(); - if(vmap_.find(v) != vmap_.end()) - return vmap_.at(v); - // create operands - if(auto *cc = dynamic_cast(v)) - return llvm_constant(cc, ctx); - // alloc const - if(auto *cc = dynamic_cast(v)){ - BasicBlock *block = builder.GetInsertBlock(); - Module *module = block->getModule(); - unsigned size = ((ir::constant_int*)cc->get_operand(0))->get_value(); - Type *element_ty = llvm_type(cc->get_type()->get_pointer_element_ty(), ctx); - Type *array_ty = llvm::ArrayType::get(element_ty, size); - if(vmap_.find(v) == vmap_.end()){ - Value *array = new llvm::GlobalVariable(*module, array_ty, false, llvm::GlobalVariable::ExternalLinkage, - nullptr, cc->get_name(), nullptr, llvm::GlobalVariable::NotThreadLocal, 4); - vmap_[v] = builder.CreateBitCast(array, array->getType()->getArrayElementType()->getPointerTo(4)); - } - return vmap_.at(v); - } - // instruction - if(auto *ii = dynamic_cast(v)){ - auto value = [&](ir::value *x) { return llvm_value(x, builder); }; - return llvm_inst(ii, value, builder); - } - // unknown value - throw std::runtime_error("unknown conversion from ir::value to Value"); -} /* ------------------- * ---- Init Axes ---- @@ -796,231 +621,7 @@ void selection::init_layouts(ir::function *fn, IRBuilder<> &builder, Value *sh_m } } -/* ---------------------------- - * ---- Lower Instructions ---- - * ---------------------------- */ -void selection::lower_masked_store(ir::masked_store_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { - distributed_tile* ptrs = (distributed_tile*)tmap_.at(x->get_pointer_operand()); - distributed_tile* scalars = (distributed_tile*)tmap_.at(x->get_value_operand()); - ir::value *mask = x->get_mask_operand(); - distributed_tile* preds = (distributed_tile*)tmap_.at(mask); - ptrs->for_each([&](indices_t idx){ - Value *scalar = scalars->get_value(idx); - Value *ptr = ptrs->get_value(idx); - Value *pred = preds->get_value(idx); - BasicBlock *mask_then_bb = BasicBlock::Create(ctx, "mask_then", fn); - BasicBlock *mask_done_bb = BasicBlock::Create(ctx, "mask_done", fn); - builder.CreateCondBr(pred, mask_then_bb, mask_done_bb); - builder.SetInsertPoint(mask_then_bb); - builder.CreateStore(scalar, ptr); - builder.CreateBr(mask_done_bb); - builder.SetInsertPoint(mask_done_bb); -// std::string offset = ""; -// if(GetElementPtrInst *gep = dyn_cast(ptr)) -// if(gep->getNumIndices() == 1) -// if(ConstantInt *cst = dyn_cast(gep->idx_begin())){ -// offset = " + " + std::to_string(cst->getValue().getSExtValue()*4); -// } -// FunctionType *ty = FunctionType::get(Type::getVoidTy(ctx), {pred->getType(), ptr->getType(), scalar->getType()}, false); -// std::string asm_str = "@$0 st.global.b32 [$1" + offset + "], $2;"; -// InlineAsm *iasm = InlineAsm::get(ty, asm_str, "b,l,f", true); -// builder.CreateCall(iasm, {pred, ptr, scalar}); - }); -} - -void selection::lower_store(ir::store_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { - distributed_tile* ptrs = (distributed_tile*)tmap_.at(x->get_pointer_operand()); - tile *scalars = 
tmap_.at(x->get_value_operand()); -// size_t ld = layouts_->order(x->get_pointer_operand())[0]; -// unsigned vector_size = 2; -// // vectorize pointers -// std::map ptr_packets; -// ptrs->for_each([&](indices_t idx){ -// unsigned linear = ptrs->get_linear_index(idx); -// unsigned id = linear / vector_size; -// if(linear % vector_size == 0) { -// Value *ptr = ptrs->get_value(idx); -// ptr = builder.CreateBitCast(ptr, PointerType::get(VectorType::get(ptr->getType()->getPointerElementType(), vector_size), -// ptr->getType()->getPointerAddressSpace())); -// ptr_packets[id] = ptr; -// } -// }); -// ((shared_tile*)(scalars))->set_vector_size(vector_size); -// ((shared_tile*)(scalars))->set_return_mode(true); - // extract result element - ptrs->for_each([&](indices_t idx){ - builder.CreateStore(scalars->get_value(idx), ptrs->get_value(idx)); - }); -} - -void selection::lower_downcast(ir::downcast_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { - vmap_[x] = tmap_[x->get_operand(0)]->get_value({builder.getInt32(0)}); -} - -void selection::lower_reduce(ir::reduce_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { - ir::instruction *ins = (ir::instruction*)x; - Module *module = fn->getParent(); - std::map partial; - ir::value *op = x->get_operand(0); - distributed_tile* op_tile = (distributed_tile*)tmap_.at(op); - unsigned axis = x->get_axis(); - - // reduce within thread - op_tile->for_each([&](indices_t idx) { - indices_t pidx = idx; - pidx.erase(pidx.begin() + axis); - Value *current = op_tile->get_value(idx); - // current partial result is not initialized -- create - if(partial.find(pidx) == partial.end()) - partial[pidx] = current; - // current partial result is initialized -- accumulate - else - partial[pidx] = builder.CreateFAdd(partial[pidx], current); - }); - - // reduce within blocks - unsigned addr_space = sh_mem_ptr_->getType()->getPointerAddressSpace(); - Type *res_ty = builder.getFloatTy(); - Value *base_ptr = builder.CreateBitCast(sh_mem_ptr_, PointerType::get(res_ty, addr_space)); - for(auto& x: partial) { - // current element being computed - Value *lane = axes_.at(a_axes_->get(op, axis)).thread_id; - Value *&result = x.second; - indices_t write_idx = x.first; - write_idx.insert(write_idx.begin() + axis, lane); - - // shared memory write pointer - Value *write_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), {0, 1}, op_tile->get_order(), write_idx); - Value *write_ptr = builder.CreateGEP(base_ptr, write_offset); - - // initialize shared memory - tgt_->add_barrier(module, builder); - builder.CreateStore(result, write_ptr); - // build result - unsigned depth = layouts_->get(op)->wpt.at(axis); - for(unsigned i = depth/2; i > 0; i >>= 1){ - // current indices - indices_t current(write_idx.size(), builder.getInt32(0)); - current[axis] = builder.getInt32(i); - // shared memory offset - Value *read_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), {0, 1}, op_tile->get_order(), current); - Value *is_active = builder.CreateICmpULT(lane, builder.getInt32(i)); - read_offset = builder.CreateSelect(is_active, read_offset, builder.getInt32(0)); - // shared memory read pointer - Value *read_ptr = builder.CreateGEP(write_ptr, read_offset); - tgt_->add_barrier(module, builder); - Value *next = builder.CreateLoad(read_ptr); - // accumulate - result = builder.CreateFAdd(result, next); - // write back - builder.CreateStore(result, write_ptr); - } - - // result is on the first lane of shared memory - indices_t final = write_idx; - 
final[axis] = builder.getInt32(0); - Value *read_offset = shared_tile::shared_offset(builder, op_tile->get_shapes(), {0, 1}, op_tile->get_order(), final); - Value *read_ptr = builder.CreateGEP(base_ptr, read_offset); - tgt_->add_barrier(module, builder); - result = builder.CreateLoad(read_ptr); - if(tmap_.find(ins) == tmap_.end()) - vmap_[ins] = result; - else{ - distributed_tile *ti = (distributed_tile*)tmap_[ins]; - ti->set_value(x.first, result); - } - } -} - -void selection::lower_dynamic_program_idx(ir::make_range_dyn *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { - distributed_tile* result = (distributed_tile*)tmap_.at(x); - result->for_each([&](indices_t idx){ - assert(idx.size() == 1); - BinaryOperator *bin_add = dyn_cast(idx[0]); - assert(bin_add); - Value *res = bin_add->getOperand(0); - result->set_value(idx, res); - }); -} - -void selection::lower_reshape(ir::reshape_inst* x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { - distributed_tile* result = (distributed_tile*)tmap_.at(x); - ir::value* in = x->get_operand(0); - distributed_tile *in_tile = (distributed_tile*)tmap_.at(in); - result->for_each([&](indices_t out_idx){ - unsigned pos = result->get_linear_index(out_idx); - indices_t in_idx = in_tile->get_ordered_indices(pos); - result->set_value(out_idx, in_tile->get_value(in_idx)); - }); -} - -void selection::lower_splat(ir::splat_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { - distributed_tile* result = (distributed_tile*)tmap_.at(x); - result->for_each([&](indices_t idx) { - result->set_value(idx, llvm_value(x->get_operand(0), builder)); - }); -} - -void selection::lower_broadcast(ir::broadcast_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { - distributed_tile* result = (distributed_tile*)tmap_.at(x); - ir::value* in = x->get_operand(0); - const auto& in_shapes = in->get_type()->get_tile_shapes(); - distributed_tile *in_tile = (distributed_tile*)tmap_.at(in); - result->for_each([&](indices_t out_idx){ - indices_t in_idx = out_idx; - for(size_t k = 0; k < in_idx.size(); k++){ - if(in_shapes[k] == 1) - in_idx[k] = builder.getInt32(0); - } - result->set_value(out_idx, in_tile->get_value(in_idx)); - }); -} - -void selection::lower_copy_to_shared(ir::copy_to_shared_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { - unsigned vector_size = 1; - auto x_order = layouts_->get(x)->order; - ir::value *arg = x->get_operand(0); - auto arg_order = layouts_->get(arg)->order; - // tiles - shared_tile* result = (shared_tile*)tmap_.at(x); - distributed_tile* in = (distributed_tile*)tmap_.at(arg); - if(x_order == arg_order){ - size_t ld = arg_order[0]; - vector_size = layouts_->get(arg)->nts.at(ld); - } - - std::map packets; - in->for_each([&](indices_t idx){ - unsigned linear = in->get_linear_index(idx); - unsigned id = linear / vector_size; - Value *in_value = in->get_value(idx); - if(linear % vector_size == 0) - packets[id] = UndefValue::get(VectorType::get(in_value->getType(), vector_size)); - packets[id] = builder.CreateInsertElement(packets.at(id), in_value, linear % vector_size); - }); - in->for_each([&](indices_t idx){ - unsigned linear = in->get_linear_index(idx); - unsigned id = linear / vector_size; - if(linear % vector_size == 0) - result->set_value(idx, packets[id]); - }); -} - -void selection::lower_copy_from_shared(ir::copy_from_shared_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { - distributed_tile* result = (distributed_tile*)tmap_.at(x); - shared_tile* arg = 
(shared_tile*)tmap_.at(x->get_operand(0)); - - result->for_each([&](indices_t idx){ - result->set_value(idx, arg->get_value(idx)); - }); -} - -void selection::lower_trans(ir::trans_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { - shared_tile* in = (shared_tile*)tmap_.at(x->get_operand(0)); - shared_tile* out = new shared_tile(in->get_ty(), in->get_shapes(), in->get_order(), in->get_pointer(), builder, in->get_offset(), x->get_perm()); - tmap_[x] = out; -} bool is_trans(ir::value *v) { if(dynamic_cast(v)) { @@ -1035,370 +636,15 @@ bool is_trans(ir::value *v) { return false; } -void selection::lower_hmma_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn, IRBuilder<> &builder, - distributed_tile *TC, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK) { - const auto& shapes = dot->get_type()->get_tile_shapes(); - - TA->set_vector_size(4*pack_size_0_); - TB->set_vector_size(4*pack_size_1_); - TA->set_return_mode(true); - TB->set_return_mode(true); - - std::map, std::vector> fcs; - - TC->for_each([&](indices_t idx){ - std::vector key(idx.size() - 2); - std::copy(idx.begin() + 2, idx.end(), key.begin()); - fcs[key].push_back(TD->get_value(idx)); - }); - - Type *fp32_ty = builder.getFloatTy(); - Type *fp16x2_ty = VectorType::get(builder.getHalfTy(), 2); - Type *fp32_pack8_ty = StructType::get(ctx, {fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}); - FunctionType *mma_ty = FunctionType::get(fp32_pack8_ty, {fp16x2_ty, fp16x2_ty, fp16x2_ty, fp16x2_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}, false); - - Value *offset_a_i = offset_a_i_; - Value *offset_a_k = offset_a_k_; - Value *offset_b_j = offset_b_j_; - Value *offset_b_k = offset_b_k_; - - Value* u_thread_id = tgt_->get_local_id(builder.GetInsertBlock()->getModule(), builder, 0); - - auto ord_a = layouts_->get(dot->get_operand(0))->order; - auto ord_b = layouts_->get(dot->get_operand(1))->order; - - bool is_a_trans = is_trans(dot->get_operand(0)); - bool is_b_trans = is_trans(dot->get_operand(1)); - bool is_a_row = is_a_trans ^ (ord_a[ord_a.size() - 2] == 1); - bool is_b_row = is_b_trans ^ (ord_b[ord_b.size() - 2] == 1); - - - if(is_a_row){ - offset_a_i = builder.CreateAdd(offset_a_i, builder.CreateURem(u_thread_id, builder.getInt32(4))); - offset_a_k = builder.getInt32(0); - } - if(!is_b_row){ - offset_b_j = builder.CreateAdd(offset_b_j, builder.CreateURem(u_thread_id, builder.getInt32(4))); - offset_b_k = builder.getInt32(0); - } - - std::string op_a = is_a_row ? "row" : "col"; - std::string op_b = is_b_row ? "row" : "col"; - - InlineAsm *mma_fn = InlineAsm::get(mma_ty, " mma.sync.aligned.m8n8k4." + op_a + "." 
+ op_b + ".f32.f16.f16.f32 " - "{$0, $1, $2, $3, $4, $5, $6, $7}, " - "{$8, $9}, " - "{$10, $11}, " - "{$0, $1, $2, $3, $4, $5, $6, $7};", "=f,=f,=f,=f,=f,=f,=f,=f,r,r,r,r,0,1,2,3,4,5,6,7", false); - - unsigned fpw_0 = layouts_->get(dot)->fpw.at(0); - unsigned fpw_1 = layouts_->get(dot)->fpw.at(1); - unsigned wts_0 = fpw_0 * 8; - unsigned wts_1 = fpw_1 * 8; - unsigned wpt_0 = layouts_->get(dot)->wpt.at(0); - unsigned wpt_1 = layouts_->get(dot)->wpt.at(1); - unsigned stride_rep_i = wpt_0 * wts_0; - unsigned stride_rep_j = wpt_1 * wts_1; - unsigned num_rep_i = shapes[0] / stride_rep_i; - unsigned ld_fc = num_rep_i * 2; - - - for(auto& x: fcs){ - std::vector& fc = x.second; - for(unsigned pack_i = 0; pack_i < num_packs_0_; pack_i++) - for(unsigned pack_j = 0; pack_j < num_packs_1_; pack_j++){ - for(unsigned K = 0; K < NK; K += 4){ - Value *_K = builder.getInt32(K); - Value *current_offset_a_i = builder.CreateAdd(offset_a_i, builder.getInt32(pack_i*stride_rep_i*pack_size_0_)); - Value *current_offset_b_i = builder.CreateAdd(offset_b_j, builder.getInt32(pack_j*stride_rep_j*pack_size_1_)); - indices_t idx_a = {current_offset_a_i, builder.CreateAdd(offset_a_k, _K)}; - indices_t idx_b = {builder.CreateAdd(offset_b_k, _K), current_offset_b_i}; - idx_a.insert(idx_a.end(), x.first.begin(), x.first.end()); - idx_b.insert(idx_b.end(), x.first.begin(), x.first.end()); - Value *ha = TA->get_value(idx_a); - Value *hb = TB->get_value(idx_b); - for(unsigned ii = 0; ii < pack_size_0_; ii++) - for(unsigned jj = 0; jj < pack_size_1_; jj++){ - Value *ha0 = builder.CreateBitCast(builder.CreateExtractElement(ha, builder.getInt32(ii*pack_size_0_ + 0)), fp16x2_ty); - Value *ha1 = builder.CreateBitCast(builder.CreateExtractElement(ha, builder.getInt32(ii*pack_size_0_ + 1)), fp16x2_ty); - Value *hb0 = builder.CreateBitCast(builder.CreateExtractElement(hb, builder.getInt32(jj*pack_size_0_ + 0)), fp16x2_ty); - Value *hb1 = builder.CreateBitCast(builder.CreateExtractElement(hb, builder.getInt32(jj*pack_size_0_ + 1)), fp16x2_ty); - std::vector idx = { - (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 0)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 1)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 0)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 1)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 2)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 3)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 2)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 3)*ld_fc - }; - Value *nc = builder.CreateCall(mma_fn, {ha0, ha1, hb0, hb1, fc[idx[0]], fc[idx[1]], fc[idx[2]], fc[idx[3]], fc[idx[4]], fc[idx[5]], fc[idx[6]], fc[idx[7]]}); - fc[idx[0]] = builder.CreateExtractValue(nc, {0}); - fc[idx[1]] = builder.CreateExtractValue(nc, {1}); - fc[idx[2]] = builder.CreateExtractValue(nc, {2}); - fc[idx[3]] = builder.CreateExtractValue(nc, {3}); - fc[idx[4]] = builder.CreateExtractValue(nc, {4}); - fc[idx[5]] = builder.CreateExtractValue(nc, {5}); - fc[idx[6]] = builder.CreateExtractValue(nc, {6}); - fc[idx[7]] = builder.CreateExtractValue(nc, {7}); - } - } - } - } - - // write back - unsigned i = 0; - TC->for_each([&](indices_t idx){ - std::vector key(idx.size() - 2); - std::copy(idx.begin() + 2, idx.end(), key.begin()); - if(i >= fcs.at(key).size()) - i = 0; - TC->set_value(idx, fcs.at(key)[i++]); - }); - - 
TA->set_return_mode(false); - TB->set_return_mode(false); -} - -void selection::lower_scanline_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn, IRBuilder<> &builder, - distributed_tile *TC, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK, - Type *c_ty, Function *f_mul_add) { - TA->set_vector_size(TC->axis(0).contiguous); - TB->set_vector_size(TC->axis(1).contiguous); - TC->for_each([&](indices_t idx){ - Value *res = TD->get_value(idx); - for(unsigned K = 0; K < NK; ++K){ - // input indices - indices_t a_idx = {idx[0], builder.getInt32(K)}; - indices_t b_idx = {builder.getInt32(K), idx[1]}; - // add batching dimension - for(size_t i = 2; i < idx.size(); i++){ - a_idx.insert(a_idx.end(), idx[i]); - b_idx.insert(b_idx.end(), idx[i]); - } - // load value - Value *a = TA->get_value(a_idx); - Value *b = TB->get_value(b_idx); - if(a->getType() != c_ty) - a = builder.CreateFPCast(a, c_ty); - if(b->getType() != c_ty) - b = builder.CreateFPCast(b, c_ty); - res = builder.CreateCall(f_mul_add, {a, b, res}); - } - TC->set_value(idx, res); - }); -} - -void selection::lower_outer_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn, IRBuilder<> &builder, - distributed_tile *TC, distributed_tile *TA, distributed_tile *TB, distributed_tile *TD, - Type *c_ty, Function *f_mul_add) { - TC->for_each([&](indices_t idx){ - Value *res = TD->get_value(idx); - indices_t a_idx = {idx[0], builder.getInt32(0)}; - indices_t b_idx = {builder.getInt32(0), idx[1]}; - std::swap(a_idx[0], a_idx[1]); - std::swap(b_idx[0], b_idx[1]); - Value *a = TA->get_value(a_idx); - Value *b = TB->get_value(b_idx); - if(a->getType() != c_ty) - a = builder.CreateFPCast(a, c_ty); - if(b->getType() != c_ty) - b = builder.CreateFPCast(b, c_ty); - res = builder.CreateCall(f_mul_add, {a, b, res}); - TC->set_value(idx, res); - }); -} - -void selection::lower_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { - distributed_tile* TC = (distributed_tile*)tmap_.at(dot); - Module *module = fn->getParent(); - ir::value *A = dot->get_operand(0); - ir::value *B = dot->get_operand(1); - ir::value *D = dot->get_operand(2); - - distributed_tile *TD = (distributed_tile*)tmap_.at(D); - Type *c_ty = llvm_type(D->get_type()->get_scalar_ty(), ctx); - Function *f_mul_add = Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {c_ty}); - auto A_shapes = A->get_type()->get_tile_shapes(); - size_t red_axis = 1; - unsigned NK = A_shapes[red_axis]; - - if(NK != 1) { - shared_tile *TA = (shared_tile*)tmap_.at(A); - shared_tile *TB = (shared_tile*)tmap_.at(B); - if(layouts_->get(dot)->type == analysis::HMMA_884) - lower_hmma_dot(dot, ctx, fn, builder, TC, TA, TB, TD, NK); - else - lower_scanline_dot(dot, ctx, fn, builder, TC, TA, TB, TD, NK, c_ty, f_mul_add); - } - else { - distributed_tile *TA = (distributed_tile*)tmap_.at(A); - distributed_tile *TB = (distributed_tile*)tmap_.at(B); - lower_outer_dot(dot, ctx, fn, builder, TC, TA, TB, TD, c_ty, f_mul_add); - } -} - -void selection::lower_masked_load(ir::masked_load_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { - // find vector size - distributed_tile* result = (distributed_tile*)tmap_.at(x); - ir::value *ptr = x->get_pointer_operand(); - size_t ld = layouts_->get(ptr)->order[0]; - unsigned alignment = alignment_->get(ptr, ld); - unsigned vector_size = std::min(result->axis(ld).contiguous, alignment); - distributed_tile *pointers = (distributed_tile*)tmap_.at(ptr); - distributed_tile *masks = (distributed_tile*)tmap_.at(x->get_mask_operand()); - 
distributed_tile *false_values = (distributed_tile*)tmap_.at(x->get_false_value_operand()); - std::map packets; - result->for_each([&](indices_t idx){ - unsigned linear = result->get_linear_index(idx); - unsigned id = linear / vector_size; - if(linear % vector_size == 0) { - Value *ptr = pointers->get_value(idx); - - - ptr = builder.CreateBitCast(ptr, PointerType::get(VectorType::get(result->get_ty(), vector_size), - ptr->getType()->getPointerAddressSpace())); - Value *mask = masks->get_value(idx); - BasicBlock *current_bb = builder.GetInsertBlock(); - BasicBlock *mask_then_bb = BasicBlock::Create(ctx, "mask_then", fn); - BasicBlock *mask_done_bb = BasicBlock::Create(ctx, "mask_done", fn); - builder.CreateCondBr(mask, mask_then_bb, mask_done_bb); - builder.SetInsertPoint(mask_then_bb); - Value *result_then = builder.CreateLoad(ptr); - builder.CreateBr(mask_done_bb); - builder.SetInsertPoint(mask_done_bb); - Value *current_result = nullptr; - if(false_values){ - current_result = builder.CreatePHI(result_then->getType(), 2); - ((PHINode*)current_result)->addIncoming(result_then, mask_then_bb); - Value *result_false = false_values->get_value(idx); - if(result_then->getType()->isVectorTy()) - result_false = builder.CreateVectorSplat(vector_size, llvm::UndefValue::get(result_false->getType())); - ((PHINode*)current_result)->addIncoming(result_false, current_bb); - } - else - current_result = result_then; - -// ConstantInt *cst = nullptr; -// if(GetElementPtrInst *gep = dyn_cast(ptr)) -// if(gep->getNumIndices() == 1) -// cst = dyn_cast(gep->idx_begin()); -// llvm::Value* mask = masks->get_value(idx); -// std::string offset = ""; -// if(cst) -// offset = " + " + std::to_string(cst->getValue().getSExtValue()*2*vector_size); -// Type *fp16x2_ty = VectorType::get(builder.getHalfTy(), 2); -// Type *fp16x2_pack4_ty = StructType::get(ctx, {fp16x2_ty, fp16x2_ty, fp16x2_ty, fp16x2_ty}); -// FunctionType *ty = FunctionType::get(fp16x2_pack4_ty, {mask->getType(), ptr->getType()}, false); -// std::string asm_str = "@$0 ld.global.nc.b32 {$1, $2, $3, $4}, [$5" + offset + "];"; -// if(false_values) -// asm_str += "\n\t@!$0 mov.v4.b32 {$1, $2, $3, $4}, {0, 0, 0, 0};"; -// InlineAsm *iasm = InlineAsm::get(ty, asm_str, "b,=r,=r,=r,=r,l", true); -// Value *current_result = builder.CreateCall(iasm, {mask, ptr}); - - packets[id] = current_result; - } - }); - // extract result element - result->for_each([&](indices_t idx){ - unsigned linear = result->get_linear_index(idx); - unsigned id = linear / vector_size; -// Value *tmp = builder.CreateExtractValue(packets.at(id), {(linear % vector_size) / 2}); -// Value *res = builder.CreateExtractElement(tmp, (linear % vector_size) % 2); -// result->set_value(idx, res); - result->set_value(idx, builder.CreateExtractElement(packets.at(id), linear % vector_size)); - }); -} - -void selection::lower_load(ir::load_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { - distributed_tile* result = (distributed_tile*)tmap_.at(x); - // find vector size - ir::value *ptr = x->get_pointer_operand(); - size_t ld = layouts_->get(ptr)->order[0]; - unsigned alignment = alignment_->get(ptr, ld); - unsigned vector_size = std::min(result->axis(ld).contiguous, alignment); - distributed_tile *pointers = (distributed_tile*)tmap_.at(ptr); - // vector loads - std::map packets; - result->for_each([&](indices_t idx){ - unsigned linear = result->get_linear_index(idx); - unsigned id = linear / vector_size; - if(linear % vector_size == 0) { - Value *ptr = pointers->get_value(idx); - ptr = 
builder.CreateBitCast(ptr, PointerType::get(VectorType::get(result->get_ty(), vector_size), - ptr->getType()->getPointerAddressSpace())); - packets[id] = builder.CreateLoad(ptr); - } - }); - // extract result element - result->for_each([&](indices_t idx){ - unsigned linear = result->get_linear_index(idx); - unsigned id = linear / vector_size; - result->set_value(idx, builder.CreateExtractElement(packets.at(id), linear % vector_size)); - }); -} - -void selection::lower_elementwise(ir::instruction *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) { - distributed_tile* result = (distributed_tile*)tmap_.at(x); - result->for_each([&](indices_t idx){ - auto value = [&](ir::value *v) { - if(auto *cst = dynamic_cast(v)) - return (Value*)llvm_constant(cst, ctx); - else if(v->get_type()->is_tile_ty()) - return tmap_.at(v)->get_value(idx); - else - return llvm_value(v, builder); - }; - result->set_value(idx, llvm_inst(x, value, builder)); - }); -} - -void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> &builder) { - BasicBlock *block = builder.GetInsertBlock(); - LLVMContext &ctx = builder.getContext(); - Function *fn = block->getParent(); - if(auto *x = dynamic_cast(ins)) - lower_masked_store(x, ctx, fn, builder); - else if(auto *x = dynamic_cast(ins)) - lower_store(x, ctx, fn, builder); - else if(auto *x = dynamic_cast(ins)) - lower_downcast(x, ctx, fn, builder); - else if(auto *x = dynamic_cast(ins)) - lower_reduce(x, ctx, fn, builder); - else if(auto *x = dynamic_cast(ins)) - lower_dynamic_program_idx(x, ctx, fn, builder); - else if(auto *x = dynamic_cast(ins)) - lower_reshape(x, ctx, fn, builder); - else if(auto *x = dynamic_cast(ins)) - lower_splat(x, ctx, fn, builder); - else if(auto *x = dynamic_cast(ins)) - lower_broadcast(x, ctx, fn, builder); - else if(auto *x = dynamic_cast(ins)) - lower_copy_to_shared(x, ctx, fn, builder); - else if(auto *x = dynamic_cast(ins)) - lower_copy_from_shared(x, ctx, fn, builder); - else if(auto* x = dynamic_cast(ins)) - lower_trans(x, ctx, fn, builder); - else if(auto x = dynamic_cast(ins)) - lower_dot(x, ctx, fn, builder); - else if(auto *x = dynamic_cast(ins)) - lower_masked_load(x, ctx, fn, builder); - else if(auto *x = dynamic_cast(ins)) - lower_load(x, ctx, fn, builder); - else if(!dynamic_cast(tmap_.at(ins))) - lower_elementwise(ins, ctx, fn, builder); -} - -void selection::lower_value(ir::value *src, IRBuilder<> &builder, std::set& seen) { +void selection::lower_value(ir::value *src, IRBuilder<> &builder, generator* gen, std::set& seen) { if(!seen.insert(src).second) return; auto *inst = dynamic_cast(src); if(inst && !dynamic_cast(src)) for(ir::value *op: inst->ops()) - lower_value(op, builder, seen); + lower_value(op, builder, gen, seen); BasicBlock *current = builder.GetInsertBlock(); auto *phi = dynamic_cast(src); @@ -1425,12 +671,11 @@ void selection::lower_value(ir::value *src, IRBuilder<> &builder, std::setset_value(idx, res); }); } - else if(inst && inst->has_tile_result_or_op()) { - lower_tile_instruction(inst, builder); + else if(auto *cst = dynamic_cast(src)){ + vmap_[cst] = llvm_constant(cst, builder.getContext()); } else if(inst){ - Instruction *i = (Instruction*)llvm_value(inst, builder); - vmap_[src] = i; + inst->accept(gen); } if(phi_inserted && current->getFirstNonPHI()) @@ -1538,7 +783,7 @@ void selection::run(ir::module &src, Module &dst) { for(ir::function *fn: src.get_function_list()) { // create LLVM function - llvm_fn(fn, dst_builder, dst); + Function *ffn = llvm_fn(fn, dst_builder, dst); // 
allocate shared memory sh_mem_ptr_ = alloc_shared(dst_builder, dst); @@ -1546,13 +791,16 @@ void selection::run(ir::module &src, Module &dst) { // initialize layouts init_layouts(fn, dst_builder, sh_mem_ptr_); + generator gen(&dst_ctx, ffn, &dst_builder, vmap_, tmap_, tgt_, layouts_, alignment_, alloc_, sh_mem_ptr_, + offset_a_i_, offset_a_k_, offset_b_j_, offset_b_k_, num_packs_0_, num_packs_1_, pack_size_0_, pack_size_1_, num_warps_ ); + // generate LLVM-IR code std::map last_block; for(ir::basic_block *block: fn->blocks()) { BasicBlock *parent = (BasicBlock*)vmap_[block]; dst_builder.SetInsertPoint(parent); for(ir::instruction *i: block->get_inst_list()) - lower_value(i, dst_builder, seen); + lower_value(i, dst_builder, &gen, seen); last_block[block] = dst_builder.GetInsertBlock(); } @@ -1602,8 +850,8 @@ void selection::run(ir::module &src, Module &dst) { }); } else { - PHINode *llvm_phi = (PHINode*)llvm_value(phi, dst_builder); - Value *llvm_inc_val = llvm_value(inc_val, dst_builder); + PHINode *llvm_phi = (PHINode*)vmap_.at(phi); + Value *llvm_inc_val = vmap_.at(inc_val); llvm_phi->addIncoming(llvm_inc_val, llvm_inc_block); } } @@ -1688,18 +936,18 @@ void generator::visit_cast_inst(ir::cast_inst* cast) { void generator::visit_return_inst(ir::return_inst* rr) { ir::value *ret_val = rr->get_return_value(); - builder_->Insert(ReturnInst::Create(*ctx_, ret_val ? ret_val : nullptr)); + builder_->Insert(ReturnInst::Create(*ctx_, ret_val ? vmap_.at(ret_val) : nullptr)); } void generator::visit_cond_branch_inst(ir::cond_branch_inst* br) { - BasicBlock *true_dest = vmap_.at(br->get_true_dest()); - BasicBlock *false_dest = vmap_.at(br->get_false_dest()); + BasicBlock *true_dest = (BasicBlock*)vmap_.at(br->get_true_dest()); + BasicBlock *false_dest = (BasicBlock*)vmap_.at(br->get_false_dest()); Value *cond = vmap_.at(br->get_cond()); builder_->Insert(BranchInst::Create(true_dest, false_dest, cond)); } void generator::visit_uncond_branch_inst(ir::uncond_branch_inst* br) { - BasicBlock *dest = vmap_.at(br->get_dest()); + BasicBlock *dest = (BasicBlock*)vmap_.at(br->get_dest()); builder_->Insert(BranchInst::Create(dest)); } @@ -1754,7 +1002,7 @@ void generator::visit_masked_load_inst(ir::masked_load_inst* x) { ptr->getType()->getPointerAddressSpace())); Value *mask = masks->get_value(idx); BasicBlock *current_bb = builder_->GetInsertBlock(); - const Function *parent = builder_->GetInsertBlock()->getParent(); + Function *parent = builder_->GetInsertBlock()->getParent(); BasicBlock *mask_then_bb = BasicBlock::Create(*ctx_, "mask_then", parent); BasicBlock *mask_done_bb = BasicBlock::Create(*ctx_, "mask_done", parent); builder_->CreateCondBr(mask, mask_then_bb, mask_done_bb); @@ -1822,7 +1070,7 @@ void generator::visit_masked_store_inst(ir::masked_store_inst* st) { Value *scalar = scalars->get_value(idx); Value *ptr = ptrs->get_value(idx); Value *pred = preds->get_value(idx); - const Function *parent = builder_->GetInsertBlock()->getParent(); + Function *parent = builder_->GetInsertBlock()->getParent(); BasicBlock *mask_then_bb = BasicBlock::Create(*ctx_, "mask_then", parent); BasicBlock *mask_done_bb = BasicBlock::Create(*ctx_, "mask_done", parent); builder_->CreateCondBr(pred, mask_then_bb, mask_done_bb); @@ -1882,13 +1130,13 @@ void generator::visit_downcast_inst(ir::downcast_inst* x) { } void generator::visit_get_program_id_inst(ir::get_program_id_inst* pid) { - Module &module = builder_->GetInsertBlock()->getModule(); + Module *module = builder_->GetInsertBlock()->getModule(); Value *ret = 
tgt_->get_block_id(module, *builder_, pid->get_axis()); vmap_[pid] = ret; } void generator::visit_get_num_program_inst(ir::get_num_program_inst* np) { - Module &module = builder_->GetInsertBlock()->getModule(); + Module *module = builder_->GetInsertBlock()->getModule(); Value *ret = tgt_->get_num_blocks(module, *builder_, np->get_axis()); vmap_[np] = ret; } @@ -1916,8 +1164,7 @@ void generator::visit_atomic_cas_inst(ir::atomic_cas_inst* cas) { builder_->SetInsertPoint(tid_0_done_bb); tgt_->add_memfence(module, *builder_); tgt_->add_barrier(module, *builder_); - Value *res = builder_->CreateLoad(ptr); - return (Instruction*)res; + vmap_[cas] = builder_->CreateLoad(ptr); } void generator::visit_atomic_exch_inst(ir::atomic_exch_inst* xchg) { @@ -1933,12 +1180,11 @@ void generator::visit_atomic_exch_inst(ir::atomic_exch_inst* xchg) { tgt_->add_barrier(module, *builder_); builder_->CreateCondBr(pred, tid_0_bb, tid_0_done_bb); builder_->SetInsertPoint(tid_0_bb); - Value *res = builder_->CreateAtomicRMW(AtomicRMWInst::Xchg, rmw_ptr, rmw_val, AtomicOrdering::Monotonic, SyncScope::System); + vmap_[xchg] = builder_->CreateAtomicRMW(AtomicRMWInst::Xchg, rmw_ptr, rmw_val, AtomicOrdering::Monotonic, SyncScope::System); builder_->CreateBr(tid_0_done_bb); builder_->SetInsertPoint(tid_0_done_bb); tgt_->add_memfence(module, *builder_); tgt_->add_barrier(module, *builder_); - return (Instruction*)res; } void generator::visit_atomic_add_inst(ir::atomic_add_inst*) { @@ -1963,7 +1209,7 @@ void generator::visit_hmma_dot(ir::dot_inst* dot, distributed_tile *TC, shared_t Type *fp32_ty = builder_->getFloatTy(); Type *fp16x2_ty = VectorType::get(builder_->getHalfTy(), 2); - Type *fp32_pack8_ty = StructType::get(ctx, {fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}); + Type *fp32_pack8_ty = StructType::get(*ctx_, {fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}); FunctionType *mma_ty = FunctionType::get(fp32_pack8_ty, {fp16x2_ty, fp16x2_ty, fp16x2_ty, fp16x2_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}, false); Value *offset_a_i = offset_a_i_; @@ -2098,7 +1344,7 @@ void generator::visit_scanline_dot(ir::dot_inst* dot, distributed_tile *TC, shar }); } -void generator::visit_outer_dot(ir::dot_inst*, distributed_tile *TC, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK, +void generator::visit_outer_dot(ir::dot_inst*, distributed_tile *TC, distributed_tile *TA, distributed_tile *TB, distributed_tile *TD, unsigned NK, Type *c_ty, Function *f_mul_add) { TC->for_each([&](indices_t idx){ Value *res = TD->get_value(idx); @@ -2127,7 +1373,7 @@ void generator::visit_dot_inst(ir::dot_inst* dot) { ir::value *D = dot->get_operand(2); distributed_tile *TD = (distributed_tile*)tmap_.at(D); - Type *c_ty = type(D->get_type()->get_scalar_ty(), *ctx_); + Type *c_ty = type(D->get_type()->get_scalar_ty()); Function *f_mul_add = Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {c_ty}); auto A_shapes = A->get_type()->get_tile_shapes(); size_t red_axis = 1; @@ -2154,13 +1400,13 @@ void generator::visit_trans_inst(ir::trans_inst* trans) { tmap_[trans] = out; } -void generator::visit_sqrt_inst(ir::sqrt_inst* sqrt) { - for_each(sqrt, [&](indices_t idx){ - Value *val = get_value(sqrt->get_operand(0), idx); +void generator::visit_sqrt_inst(ir::sqrt_inst* sqt) { + for_each(sqt, [&](indices_t idx){ + Value *val = get_value(sqt->get_operand(0), idx); Module* module = builder_->GetInsertBlock()->getModule(); Value *sqrt = 
Intrinsic::getDeclaration(module, Intrinsic::sqrt, {val->getType()}); Value *ret = builder_->CreateCall(sqrt, {val}); - set_value(sqrt, idx, ret); + set_value(sqt, idx, ret); }); } @@ -2253,6 +1499,66 @@ void generator::visit_make_range(ir::make_range* x) { }); } +Type *generator::type(ir::type *ty) { + // function + if(auto* tt = dynamic_cast(ty)){ + Type *return_ty = type(tt->get_return_ty()); + std::vector param_tys; + std::transform(tt->params_begin(), tt->params_end(), std::back_inserter(param_tys), + [this](ir::type* t){ return type(t);}); + return FunctionType::get(return_ty, param_tys, false); + } + // pointer + if(ty->is_pointer_ty()){ + Type *elt_ty = type(ty->get_pointer_element_ty()); + unsigned addr_space = ty->get_pointer_address_space(); + return PointerType::get(elt_ty, addr_space); + } + // integer + if(ty->is_integer_ty()){ + unsigned bitwidth = ty->get_integer_bitwidth(); + return IntegerType::get(*ctx_, bitwidth); + } + // primitive types + switch(ty->get_type_id()){ + case ir::type::VoidTyID: return Type::getVoidTy(*ctx_); + case ir::type::HalfTyID: return Type::getHalfTy(*ctx_); + case ir::type::FloatTyID: return Type::getFloatTy(*ctx_); + case ir::type::DoubleTyID: return Type::getDoubleTy(*ctx_); + case ir::type::X86_FP80TyID: return Type::getX86_FP80Ty(*ctx_); + case ir::type::PPC_FP128TyID: return Type::getPPC_FP128Ty(*ctx_); + case ir::type::LabelTyID: return Type::getLabelTy(*ctx_); + case ir::type::MetadataTyID: return Type::getMetadataTy(*ctx_); + case ir::type::TokenTyID: return Type::getTokenTy(*ctx_); + default: break; + } + // unknown type + throw std::runtime_error("unknown conversion from ir::type to Type"); +} + +void generator::for_each(ir::value *x, const std::function& fn) { + if(!x->get_type()->is_tile_ty()) + return fn({}); + else { + if(auto *dt = dynamic_cast(tmap_.at(x))) + dt->for_each(fn); + } +} + +Value* generator::get_value(ir::value *x, const indices_t& idx) { + if(x->get_type()->is_tile_ty()) + return tmap_.at(x)->get_value(idx); + return vmap_.at(x); +} + +void generator::set_value(ir::value *x, const indices_t& idx, Value* v) { + if(x->get_type()->is_tile_ty()) + tmap_.at(x)->set_value(idx, v); + else + vmap_[x] = v; +} + + } } diff --git a/lib/driver/module.cc b/lib/driver/module.cc index e300a75f2..f29c830f4 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -241,6 +241,7 @@ std::string cu_module::compile_llvm_module(std::unique_ptr module, cu_module::cu_module(driver::context * context, std::unique_ptr ll_module): cu_module(context, compile_llvm_module(std::move(ll_module), context->device())) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ +// std::cout << source << std::endl; cu_context::context_switcher ctx(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; From ee387ff567586f65ff2731dedd9dd93e9fc89e51 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 13 Oct 2019 14:43:17 -0400 Subject: [PATCH 439/494] more cleaning --- include/triton/codegen/analysis/layout.h | 19 + include/triton/codegen/selection.h | 35 +- include/triton/ir/constant.h | 30 +- include/triton/ir/context_impl.h | 3 - include/triton/ir/function.h | 3 + include/triton/ir/instructions.h | 3 +- include/triton/ir/value.h | 4 + include/triton/ir/visitor.h | 26 +- include/triton/runtime/function.h | 1 - lib/codegen/selection.cc | 432 +++++++++++------------ lib/ir/constant.cc | 21 -- 11 files 
changed, 277 insertions(+), 300 deletions(-) diff --git a/include/triton/codegen/analysis/layout.h b/include/triton/codegen/analysis/layout.h index 01b65e8d2..096c45ea3 100644 --- a/include/triton/codegen/analysis/layout.h +++ b/include/triton/codegen/analysis/layout.h @@ -35,6 +35,18 @@ struct double_buffer_info_t { ir::phi_node* phi; }; +class layout_visitor; +class layout_hmma_884_t; +class layout_scanline_t; +class layout_shared_t; + + +class layout_visitor { +public: + virtual void visit_layout_hmma_884(layout_hmma_884_t*) = 0; + virtual void visit_layout_scanline(layout_scanline_t*) = 0; + virtual void visit_layout_shared(layout_shared_t*) = 0; +}; struct layout_t { layout_t(layout_type_t _type, @@ -43,6 +55,9 @@ struct layout_t { const std::vector &_values, size_t _id, analysis::align* align); + + virtual void accept(layout_visitor* vst) = 0; + layout_type_t type; std::vector axes; std::vector shapes; @@ -66,6 +81,7 @@ struct layout_hmma_884_t: public layout_t { const std::vector &_values, size_t _id, analysis::align* align); + void accept(layout_visitor* vst) { vst->visit_layout_hmma_884(this); } }; struct layout_scanline_t: public layout_t { @@ -75,6 +91,7 @@ struct layout_scanline_t: public layout_t { const std::vector &values, size_t _id, analysis::align* align); + void accept(layout_visitor* vst) { vst->visit_layout_scanline(this); } }; struct layout_shared_t: public layout_t { @@ -85,9 +102,11 @@ struct layout_shared_t: public layout_t { ir::type *ty, size_t _id, analysis::align* align); + void accept(layout_visitor* vst) { vst->visit_layout_shared(this); } }; + class layout { typedef ir::value* node_t; typedef std::map > graph_t; diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index 279c7475e..8d42d9dfe 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -147,7 +147,7 @@ private: }; -class generator: public ir::visitor { +class generator: public ir::visitor, public analysis::layout_visitor { private: void visit_hmma_dot(ir::dot_inst*, distributed_tile *TC, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK); void visit_scanline_dot(ir::dot_inst*, distributed_tile *TC, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK, Type *c_ty, Function *f_mul_add); @@ -163,7 +163,9 @@ public: generator(LLVMContext *ctx, Function *fn, + Module *dst, Builder *builder, + std::map& axes, std::map& vmap, std::map& tmap, target *tgt, @@ -176,7 +178,7 @@ public: unsigned num_packs_0, unsigned num_packs_1, unsigned pack_size_0, unsigned pack_size_1, unsigned num_warps) - : ctx_(ctx), fn_(fn), builder_(builder), vmap_(vmap), tmap_(tmap), tgt_(tgt), + : ctx_(ctx), fn_(fn), mod_(dst), builder_(builder), axes_(axes), vmap_(vmap), tmap_(tmap), tgt_(tgt), layouts_(layouts), alignment_(alignment), alloc_(alloc), sh_mem_ptr_(sh_mem_ptr), offset_a_i_(offset_a_i), offset_a_k_(offset_a_k), offset_b_j_(offset_b_j), offset_b_k_(offset_b_k), num_packs_0_(num_packs_0), num_packs_1_(num_packs_1), pack_size_0_(pack_size_0), pack_size_1_(pack_size_1), @@ -221,14 +223,27 @@ public: void visit_copy_from_shared_inst(ir::copy_from_shared_inst*); void visit_barrier_inst(ir::barrier_inst*); void visit_make_range_dyn(ir::make_range_dyn*); - void visit_make_range_sta(ir::make_range_sta*); void visit_make_range(ir::make_range*); + void visit_make_range_sta(ir::make_range_sta*); + void visit_undef_value(ir::undef_value*); + void visit_constant_int(ir::constant_int*); + void visit_constant_fp(ir::constant_fp*); + 
void visit_alloc_const(ir::alloc_const*); + + void visit_function(ir::function*); + + void visit_layout_hmma_884(analysis::layout_hmma_884_t*); + void visit_layout_scanline(analysis::layout_scanline_t*); + void visit_layout_shared(analysis::layout_shared_t*); + private: LLVMContext *ctx_; Function *fn_; Builder *builder_; + Module *mod_; + std::map& axes_; std::map& vmap_; std::map& tmap_; target *tgt_; @@ -249,29 +264,15 @@ class selection{ typedef std::map tmap_t; private: - // utils - Type *make_vector_ty(Type *ty, size_t vector_size); - std::vector extract_shapes(ir::value *v); - // LLVM conversions Type* llvm_type(ir::type *ty, LLVMContext &ctx); - Constant* llvm_constant(ir::constant *cst, LLVMContext &ctx); Value* llvm_alloc_const(ir::alloc_const *v, Module *module, Builder &builder); - ArrayType* llvm_linearized_tile_type(ir::type *ty, LLVMContext &ctx); Function* llvm_fn(ir::function *fn, Builder& builder, Module &dst); Value* alloc_shared(Builder &builder, Module& dst); // grid construction - void create_grids(std::vector &grids, - std::map &references, - ir::function *fn); void create_shared_tile(ir::value *v, Builder &builder, Value *sh_mem_ptr); void create_distributed_tile(ir::value *v, Builder &builder); - void create_tile(ir::value *v, Builder &builder, std::set &seen, Value *sh_mem_ptr); - void init_strided_scan_axes(const analysis::layout_t& layout, Builder &builder, Value *u_thread_id, Value *u_warp_id); - void init_hmma_axes(const analysis::layout_t& layout, Builder &builder, Value *u_thread_id, Value *u_warp_id); - void init_axes(const analysis::layout_t& layout, Builder &builder, Value *u_thread_id, Value *u_warp_id); - void init_layouts(ir::function *fn, Builder &builder, Value *sh_mem_ptr); // lower scalar instruction void lower_value(ir::value *src, Builder &builder, generator* gen, std::set& seen); diff --git a/include/triton/ir/constant.h b/include/triton/ir/constant.h index 0127acae6..671d5e5f0 100644 --- a/include/triton/ir/constant.h +++ b/include/triton/ir/constant.h @@ -6,6 +6,7 @@ #include "enums.h" #include "value.h" #include +#include "visitor.h" namespace triton{ namespace ir{ @@ -32,6 +33,7 @@ private: public: static undef_value* get(type* ty); std::string repr() const { return "undef"; } + void accept(visitor* vst) { vst->visit_undef_value(this); } }; @@ -44,31 +46,13 @@ public: virtual uint64_t get_value() const { return value_; } static constant_int *get(type *ty, uint64_t value); std::string repr() const { return std::to_string(value_); } + void accept(visitor* vst) { vst->visit_constant_int(this); } protected: uint64_t value_; }; -/* Metaparameter (int) */ -class metaparameter: public constant_int { -private: - metaparameter(type *ty, const std::vector& space); - -public: - static metaparameter *create(context &ctx, type *ty, unsigned lo, unsigned hi); - static metaparameter *create(context &ctx, type *ty, const std::vector& space); - void set_value(uint64_t value) { has_value_ = true; value_ = value; } - bool has_value() { return has_value_; } - const std::vector& get_space() { return space_; } - void set_space(const std::vector &space) { space_ = space; } - uint64_t get_value() const { assert(has_value_); return value_; } - std::string repr() const { return has_value_? std::to_string(value_) : "?" 
;} -private: - std::vector space_; - bool has_value_; -}; - -/* constant fp */ +/* Constant fp */ class constant_fp: public constant{ constant_fp(type *ty, double value); @@ -79,13 +63,14 @@ public: static constant* get(context &ctx, double v); static constant* get(type *ty, double v); std::string repr() const { return std::to_string(value_); } + void accept(visitor* vst) { vst->visit_constant_fp(this); } private: double value_; }; -/* global value */ +/* Global Value */ class global_value: public constant { public: enum linkage_types_t { @@ -109,7 +94,6 @@ public: linkage_types_t linkage, const std::string &name, unsigned addr_space = 0); std::string repr() const { return get_name(); } - }; /* global variable */ @@ -118,6 +102,8 @@ public: alloc_const(type *ty, constant_int *size, const std::string &name = ""); std::string repr() const { return get_name(); } + void accept(visitor* vst) { vst->visit_alloc_const(this); } + }; diff --git a/include/triton/ir/context_impl.h b/include/triton/ir/context_impl.h index 5995de0d4..a016d1add 100644 --- a/include/triton/ir/context_impl.h +++ b/include/triton/ir/context_impl.h @@ -14,7 +14,6 @@ class constant; class constant_int; class constant_fp; class undef_value; -class metaparameter; /* Context impl */ class context_impl { @@ -36,8 +35,6 @@ public: std::map, constant_fp*> fp_constants_; // undef values std::map uv_constants_; - // Metaparameters - std::vector mp_constants_; }; } diff --git a/include/triton/ir/function.h b/include/triton/ir/function.h index 74af3abe2..8cf11275a 100644 --- a/include/triton/ir/function.h +++ b/include/triton/ir/function.h @@ -112,6 +112,9 @@ public: const attr_map_t &attrs() { return attrs_; } std::set get_attributes(argument* arg) { return attrs_[arg->get_arg_no() + 1]; } + // visitor + void accept(visitor *v) { v->visit_function(this); } + private: module *parent_; bool init_; diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index bbc75c63c..4409d1ccb 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -71,8 +71,6 @@ public: } // instruction id value_id_t get_id() const { return id_; } - // visit - virtual void accept(visitor *v) = 0; private: basic_block *parent_; @@ -759,6 +757,7 @@ public: static make_range_sta *get(make_range* range); make_range* get_range() const; std::string repr() const { return "nv_static_program_idx"; } + _TRITON_DEFINE_ACCEPT(make_range_sta) private: make_range *range_; diff --git a/include/triton/ir/value.h b/include/triton/ir/value.h index 0c2727a38..bf4a4aa9c 100644 --- a/include/triton/ir/value.h +++ b/include/triton/ir/value.h @@ -13,6 +13,7 @@ namespace ir{ class type; class use; class user; +class visitor; //===----------------------------------------------------------------------===// // value class @@ -74,6 +75,9 @@ public: void replace_all_uses_with(value *target); void replace_uses_of_with(value *before, value *after); + // Visitor + virtual void accept(visitor *v) = 0; + private: ops_t ops_; unsigned num_ops_; diff --git a/include/triton/ir/visitor.h b/include/triton/ir/visitor.h index ffe8d734c..e2310e94a 100644 --- a/include/triton/ir/visitor.h +++ b/include/triton/ir/visitor.h @@ -61,10 +61,25 @@ class copy_to_shared_inst; class copy_from_shared_inst; class barrier_inst; class make_range_dyn; -class make_range_sta; class make_range; +class make_range_sta; +class undef_value; +class constant_int; +class constant_fp; +class global_value; +class global_object; +class alloc_const; +class constant_fp; +class 
function;
 
 class visitor {
 public:
@@ -108,8 +123,15 @@ public:
   virtual void visit_copy_from_shared_inst(copy_from_shared_inst*) = 0;
   virtual void visit_barrier_inst(barrier_inst*) = 0;
   virtual void visit_make_range_dyn(make_range_dyn*) = 0;
-  virtual void visit_make_range_sta(make_range_sta*) = 0;
   virtual void visit_make_range(make_range*) = 0;
+
+  virtual void visit_function(function*) = 0;
+
+  virtual void visit_make_range_sta(make_range_sta*) = 0;
+  virtual void visit_undef_value(undef_value*) = 0;
+  virtual void visit_constant_int(constant_int*) = 0;
+  virtual void visit_constant_fp(constant_fp*) = 0;
+  virtual void visit_alloc_const(alloc_const*) = 0;
 };
 
 }
diff --git a/include/triton/runtime/function.h b/include/triton/runtime/function.h
index c12f9c6ca..e312cfded 100644
--- a/include/triton/runtime/function.h
+++ b/include/triton/runtime/function.h
@@ -43,7 +43,6 @@ namespace ir {
 class module;
 class function;
 class context;
-class metaparameter;
 }
 
 namespace runtime{
diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc
index 9d95e0a41..9a2f1f569 100644
--- a/lib/codegen/selection.cc
+++ b/lib/codegen/selection.cc
@@ -343,16 +343,6 @@ Type *selection::llvm_type(ir::type *ty, LLVMContext &ctx) {
   throw std::runtime_error("unknown conversion from ir::type to Type");
 }
 
-/* convert ir::constant to Constant */
-Constant *selection::llvm_constant(ir::constant *cst, LLVMContext &ctx) {
-  Type *dst_ty = llvm_type(cst->get_type()->get_scalar_ty(), ctx);
-  if(auto* cc = dynamic_cast<ir::constant_int*>(cst))
-    return ConstantInt::get(dst_ty, cc->get_value());
-  if(auto* cc = dynamic_cast<ir::constant_fp*>(cst))
-    return ConstantFP::get(dst_ty, cc->get_value());
-  // unknown constant
-  throw std::runtime_error("unknown conversion from ir::constant to Constant");
-}
 
 /* convert ir::alloc_const to llvm::GlobalVariable */
 Value* selection::llvm_alloc_const(ir::alloc_const *v, Module *module, IRBuilder<> &builder) {
@@ -387,145 +377,6 @@ inline int32_t ceil(int32_t num, int32_t div){
   return (num + div - 1)/div;
 }
 
-void selection::init_strided_scan_axes(const analysis::layout_t& layout, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) {
-  auto order = layout.order;
-  const auto& shapes = layout.shapes;
-  size_t dim = shapes.size();
-  std::vector<unsigned> nts = layout.nts;
-  std::vector<unsigned> mts = layout.mts;
-  Value* full_thread_id = builder.CreateAdd(builder.CreateMul(u_warp_id, builder.getInt32(32)), u_thread_id);
-  std::vector<Value*> thread_id = delinearize(full_thread_id, order, mts, builder);
-  // Create axes
-  for(unsigned k = 0; k < dim; k++) {
-    std::string str_k = std::to_string(k);
-    Value *contiguous_k = builder.getInt32(nts[k]);
-    Value *scaled_thread_id = builder.CreateMul(thread_id[k], contiguous_k);
-    unsigned per_block = nts[k] * mts[k];
-    unsigned per_thread = nts[k] * shapes[k] / per_block;
-    std::vector<Value*> idx_list(per_thread);
-    for(unsigned n = 0 ; n < per_thread; n++){
-      unsigned offset = n / nts[k] * per_block + n % nts[k];
-      idx_list[n] = builder.CreateAdd(scaled_thread_id, builder.getInt32(offset), "idx_" + str_k + "_" + std::to_string(n));
-    }
-    axes_[layout.axes[k]] = distributed_axis{nts[k], idx_list, thread_id[k]};
-  }
-}
-
-void selection::init_hmma_axes(const analysis::layout_t& layout, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) {
-  const auto& shapes = layout.shapes;
-  if(shapes.size() > 3)
-    throw std::runtime_error("unsupported");
-
-  bool is_batched = shapes.size()
>= 3; - - Value *_1 = builder.getInt32(1); - Value *_2 = builder.getInt32(2); - Value *_3 = builder.getInt32(3); - Value *_4 = builder.getInt32(4); - Value *_16 = builder.getInt32(16); - - // fragments per warp - unsigned fpw_0 = layout.fpw.at(0); - unsigned fpw_1 = layout.fpw.at(1); - unsigned fpw_2 = is_batched ? layout.fpw.at(2) : 1; - // warps per tile - unsigned wpt_0 = layout.wpt.at(0); - unsigned wpt_1 = layout.wpt.at(1); - unsigned wpt_2 = is_batched ? layout.wpt.at(2) : 1; - // hmma warp tile size - unsigned hmma_wts_0 = fpw_0 * 8; - unsigned hmma_wts_1 = fpw_1 * 8; - unsigned hmma_wts_2 = is_batched ? fpw_2 : 1; - // hmma block tile size - unsigned hmma_bts_0 = hmma_wts_0 * wpt_0; - unsigned hmma_bts_1 = hmma_wts_1 * wpt_1; - unsigned hmma_bts_2 = is_batched ? hmma_wts_2 * wpt_2 : 1; - // number of repetition - unsigned num_rep_0 = shapes[0] / hmma_bts_0; - unsigned num_rep_1 = shapes[1] / hmma_bts_1; - unsigned num_rep_2 = is_batched ? shapes[2] / hmma_bts_2 : 1; - // size of each pack (interleaving) - pack_size_0_ = std::min(num_rep_0, 1); - pack_size_1_ = std::min(num_rep_1, 1); - // number of packs (interleaving) - num_packs_0_ = num_rep_0 / pack_size_0_; - num_packs_1_ = num_rep_1 / pack_size_1_; - - /* intra warp offset */ - // offset of quad in pair - Value *in_pair_off_a = builder.CreateMul(builder.CreateUDiv(builder.CreateAnd(u_thread_id, _16), builder.getInt32(4)), - builder.getInt32(fpw_0 * pack_size_0_)); - Value *in_pair_off_b = builder.CreateMul(builder.CreateUDiv(builder.CreateAnd(u_thread_id, _16), builder.getInt32(4)), - builder.getInt32(fpw_1 * pack_size_1_)); - - // Quad pair id - Value *pair_a_id = builder.CreateUDiv(builder.CreateURem(u_thread_id, _16), _4); - Value *pair_b_id = builder.CreateUDiv(builder.CreateURem(u_thread_id, _16), _4); - pair_a_id = builder.CreateURem(pair_a_id, builder.getInt32(fpw_0)); - pair_b_id = builder.CreateUDiv(pair_b_id, builder.getInt32(fpw_0)); - pair_b_id = builder.CreateURem(pair_b_id, builder.getInt32(fpw_1)); - // Quad pair offset - Value *pair_a_off = builder.CreateMul(pair_a_id, builder.getInt32(4 * pack_size_0_)); - Value *pair_b_off = builder.CreateMul(pair_b_id, builder.getInt32(4 * pack_size_1_)); - - /* inter warp offset */ - Value *warp_id_0 = builder.CreateURem(u_warp_id, builder.getInt32(wpt_0)); - Value *warp_id_12 = builder.CreateUDiv(u_warp_id, builder.getInt32(wpt_0)); - Value *warp_id_1 = builder.CreateURem(warp_id_12, builder.getInt32(wpt_1)); - Value *warp_id_2 = builder.CreateUDiv(warp_id_12, builder.getInt32(wpt_1)); - Value *warp_offset_i = builder.CreateMul(warp_id_0, builder.getInt32(hmma_wts_0 * pack_size_0_)); - Value *warp_offset_j = builder.CreateMul(warp_id_1, builder.getInt32(hmma_wts_1 * pack_size_1_)); - - /* offsets */ - // a offset - offset_a_i_ = builder.CreateAdd(warp_offset_i, builder.CreateAdd(pair_a_off, in_pair_off_a)); - offset_a_k_ = builder.CreateAnd(u_thread_id, _3); - // b offsets - offset_b_j_ = builder.CreateAdd(warp_offset_j, builder.CreateAdd(pair_b_off, in_pair_off_b)); - offset_b_k_ = builder.CreateAnd(u_thread_id, _3); - - // c offsets - Value *offset_c_i = builder.CreateAdd(builder.CreateAnd(u_thread_id, _1), offset_a_i_); - Value *offset_c_j = builder.CreateAdd(builder.CreateAnd(u_thread_id, _2), - builder.CreateAdd(warp_offset_j, pair_b_off)); - - /* indices */ - // i indices - std::vector idx_i; - for(unsigned pack = 0; pack < num_packs_0_; pack++) - for(unsigned ii = 0; ii < pack_size_0_; ii++) - for(unsigned i = 0; i < 2; i++){ - 
idx_i.push_back(builder.CreateAdd(offset_c_i, builder.getInt32(pack*hmma_bts_0*pack_size_0_ + ii*4 + i*2))); - } - // j indices - std::vector idx_j; - for(unsigned pack = 0; pack < num_packs_1_; pack++) - for(unsigned jj = 0; jj < pack_size_1_; jj++) - for(unsigned j = 0; j < 2; j++){ - idx_j.push_back(builder.CreateAdd(offset_c_j, builder.getInt32(pack*hmma_bts_1*pack_size_1_ + jj*4 + j*4*fpw_1*pack_size_1_))); - idx_j.push_back(builder.CreateAdd(offset_c_j, builder.getInt32(pack*hmma_bts_1*pack_size_1_ + jj*4 + j*4*fpw_1*pack_size_1_ + 1))); - } - // z indices - std::vector idx_z; - for(unsigned pack = 0; pack < num_rep_2; pack++) - idx_z.push_back(builder.CreateAdd(warp_id_2, builder.getInt32(pack*hmma_bts_2))); - - - /* axes */ - axes_[layout.axes[0]] = distributed_axis{1, idx_i, warp_id_0}; - axes_[layout.axes[1]] = distributed_axis{1, idx_j, warp_id_1}; - if(is_batched) - axes_[layout.axes[2]] = distributed_axis{1, idx_z, warp_id_2}; -} - - -void selection::init_axes(const analysis::layout_t& layout, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { - if(layout.type == analysis::HMMA_884) - init_hmma_axes(layout, builder, u_thread_id, u_warp_id); - else if(layout.type == analysis::SCANLINE) - init_strided_scan_axes(layout, builder, u_thread_id, u_warp_id); -} - /* ------------------- * ---- Init Tiles ---- * ------------------- */ @@ -549,7 +400,7 @@ void selection::create_shared_tile(ir::value *v, IRBuilder<> &builder, Value *sh if(parent->empty()) builder.SetInsertPoint(parent); else - builder.SetInsertPoint(&*parent->getFirstInsertionPt()); + builder.SetInsertPoint(&*parent->getFirstNonPHI()); // create double-buffered pointer PHINode *ptr = builder.CreatePHI(ptr_ty, 2); PHINode *offset = builder.CreatePHI(builder.getInt32Ty(), 2); @@ -587,41 +438,6 @@ void selection::create_distributed_tile(ir::value *v, IRBuilder<> &builder) { tmap_.insert({v, T}); } -void selection::create_tile(ir::value *v, IRBuilder<> &builder, - std::set &seen, Value *sh_mem_ptr) { - if(!v->get_type()->is_tile_ty() || !seen.insert(v).second) - return; - if(auto *user = dynamic_cast(v)) - for(ir::value *op: user->ops()) - create_tile(op, builder, seen, sh_mem_ptr); - auto *i = dynamic_cast(v); - if(i && layouts_->get(i)->type == analysis::SHARED && !dynamic_cast(v)) - create_shared_tile(i, builder, sh_mem_ptr); - else - create_distributed_tile(v, builder); -} - -void selection::init_layouts(ir::function *fn, IRBuilder<> &builder, Value *sh_mem_ptr){ - // fetch linear ID - Module *mod = builder.GetInsertBlock()->getParent()->getParent(); - Value *warp_size = builder.getInt32(32); - Value* u_thread_id = tgt_->get_local_id(mod, builder, 0); - Value *u_thread_warp_id = builder.CreateURem(u_thread_id, warp_size); - Value *u_warp_id = builder.CreateUDiv(u_thread_id, warp_size); - // create grid - for(auto x: layouts_->get_all()) - init_axes(*x.second, builder, u_thread_warp_id, u_warp_id); - // create tile - std::set seen; - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *i: block->get_inst_list()){ - if(!i->get_type()->is_tile_ty()) - continue; - create_tile(i, builder, seen, sh_mem_ptr); - } -} - - bool is_trans(ir::value *v) { if(dynamic_cast(v)) { @@ -641,51 +457,34 @@ void selection::lower_value(ir::value *src, IRBuilder<> &builder, generator* gen if(!seen.insert(src).second) return; + BasicBlock *current = builder.GetInsertBlock(); + if(src->get_type()->is_tile_ty()){ + builder.SetInsertPoint(&*builder.GetInsertBlock()->getParent()->begin()); + auto *i = dynamic_cast(src); + if(i 
&& layouts_->get(i)->type == analysis::SHARED && !dynamic_cast(src)){ + create_shared_tile(i, builder, sh_mem_ptr_); + } + else + create_distributed_tile(src, builder); + } + builder.SetInsertPoint(current); + + auto *inst = dynamic_cast(src); if(inst && !dynamic_cast(src)) for(ir::value *op: inst->ops()) lower_value(op, builder, gen, seen); - BasicBlock *current = builder.GetInsertBlock(); + builder.SetInsertPoint(current); auto *phi = dynamic_cast(src); - bool phi_inserted = phi && !current->empty(); - if(phi_inserted && current->getFirstNonPHI()) + if(phi && !current->empty() && current->getFirstNonPHI()) builder.SetInsertPoint(&*current->getFirstNonPHI()); + if(auto *usr = dynamic_cast(src)) + usr->accept(gen); - if(dynamic_cast(src)){ - distributed_tile *T = (distributed_tile *)tmap_.at(src); - T->for_each([&](indices_t idx){ - assert(idx.size() == 1); - T->set_value(idx, idx[0]); - }); - } - else if(dynamic_cast(src)){ - distributed_tile *T = (distributed_tile *)tmap_.at(src); - T->for_each([&](indices_t idx){ - assert(idx.size() == 1); - BinaryOperator *bin_add = dyn_cast(idx[0]); - assert(bin_add); - Value *res = bin_add->getOperand(1); - assert(isa(res)); - T->set_value(idx, res); - }); - } - else if(auto *cst = dynamic_cast(src)){ - vmap_[cst] = llvm_constant(cst, builder.getContext()); - } - else if(inst){ - inst->accept(gen); - } - - if(phi_inserted && current->getFirstNonPHI()) + if(phi && !current->empty() && current->getFirstNonPHI()) builder.SetInsertPoint(current); - -// if(dynamic_cast(src)) -// for(ir::value *op: inst->ops()) -// lower_value(op, builder, seen); - - } /* ---------------------------- @@ -702,12 +501,6 @@ inline llvm::Attribute llvm_attr(llvm::LLVMContext& ctx, ir::attribute attr) { } } -ArrayType* selection::llvm_linearized_tile_type(ir::type *ty, LLVMContext &ctx) { - unsigned size = 1; - for(auto shape: ty->get_tile_shapes()) - size *= shape; - return ArrayType::get(llvm_type(ty->get_scalar_ty(), ctx), size); -} Function* selection::llvm_fn(ir::function *fn, IRBuilder<>& builder, Module& dst) { LLVMContext &ctx = builder.getContext(); @@ -777,6 +570,9 @@ void selection::run(ir::module &src, Module &dst) { for(ir::alloc_const *x: src.allocs()) vmap_[x] = llvm_alloc_const(x, &dst, dst_builder); + // allocate shared memory + sh_mem_ptr_ = alloc_shared(dst_builder, dst); + // iterate over functions std::set seen; @@ -785,14 +581,13 @@ void selection::run(ir::module &src, Module &dst) { // create LLVM function Function *ffn = llvm_fn(fn, dst_builder, dst); - // allocate shared memory - sh_mem_ptr_ = alloc_shared(dst_builder, dst); + // create tile + generator gen(&dst_ctx, ffn, &dst, &dst_builder, axes_, vmap_, tmap_, tgt_, layouts_, alignment_, alloc_, sh_mem_ptr_, + offset_a_i_, offset_a_k_, offset_b_j_, offset_b_k_, num_packs_0_, num_packs_1_, pack_size_0_, pack_size_1_, num_warps_ ); // initialize layouts - init_layouts(fn, dst_builder, sh_mem_ptr_); - - generator gen(&dst_ctx, ffn, &dst_builder, vmap_, tmap_, tgt_, layouts_, alignment_, alloc_, sh_mem_ptr_, - offset_a_i_, offset_a_k_, offset_b_j_, offset_b_k_, num_packs_0_, num_packs_1_, pack_size_0_, pack_size_1_, num_warps_ ); + for(auto x: layouts_->get_all()) + x.second->accept(&gen); // generate LLVM-IR code std::map last_block; @@ -1536,6 +1331,179 @@ Type *generator::type(ir::type *ty) { throw std::runtime_error("unknown conversion from ir::type to Type"); } +void generator::visit_undef_value(ir::undef_value *ud) { + vmap_[ud] = llvm::UndefValue::get(type(ud->get_type())); +} + +void 
generator::visit_constant_int(ir::constant_int *cst){ + Type *ty = type(cst->get_type()->get_scalar_ty()); + vmap_[cst] = ConstantInt::get(ty, cst->get_value()); +} + +void generator::visit_constant_fp(ir::constant_fp *cst){ + Type *ty = type(cst->get_type()->get_scalar_ty()); + vmap_[cst] = ConstantFP::get(ty, cst->get_value()); +} + +void generator::visit_alloc_const(ir::alloc_const *alloc) { + unsigned size = ((ir::constant_int*)alloc->get_operand(0))->get_value(); + Type *element_ty = type(alloc->get_type()->get_pointer_element_ty()); + Type *array_ty = llvm::ArrayType::get(element_ty, size); + Value *array = new llvm::GlobalVariable(*mod_, array_ty, false, llvm::GlobalVariable::ExternalLinkage, + nullptr, alloc->get_name(), nullptr, llvm::GlobalVariable::NotThreadLocal, 4); + vmap_[alloc] = builder_->CreateBitCast(array, element_ty->getPointerTo(4)); +} + + +void generator::visit_function(ir::function*) { + +} + +void generator::visit_layout_hmma_884(analysis::layout_hmma_884_t* layout) { + Value *warp_size = builder_->getInt32(32); + Value* u_thread_id_0 = tgt_->get_local_id(mod_, *builder_, 0); + Value *u_thread_id = builder_->CreateURem(u_thread_id_0, warp_size); + Value *u_warp_id = builder_->CreateUDiv(u_thread_id_0, warp_size); + + const auto& shapes = layout->shapes; + if(shapes.size() > 3) + throw std::runtime_error("unsupported"); + + bool is_batched = shapes.size() >= 3; + + Value *_1 = builder_->getInt32(1); + Value *_2 = builder_->getInt32(2); + Value *_3 = builder_->getInt32(3); + Value *_4 = builder_->getInt32(4); + Value *_16 = builder_->getInt32(16); + + // fragments per warp + unsigned fpw_0 = layout->fpw.at(0); + unsigned fpw_1 = layout->fpw.at(1); + unsigned fpw_2 = is_batched ? layout->fpw.at(2) : 1; + // warps per tile + unsigned wpt_0 = layout->wpt.at(0); + unsigned wpt_1 = layout->wpt.at(1); + unsigned wpt_2 = is_batched ? layout->wpt.at(2) : 1; + // hmma warp tile size + unsigned hmma_wts_0 = fpw_0 * 8; + unsigned hmma_wts_1 = fpw_1 * 8; + unsigned hmma_wts_2 = is_batched ? fpw_2 : 1; + // hmma block tile size + unsigned hmma_bts_0 = hmma_wts_0 * wpt_0; + unsigned hmma_bts_1 = hmma_wts_1 * wpt_1; + unsigned hmma_bts_2 = is_batched ? hmma_wts_2 * wpt_2 : 1; + // number of repetition + unsigned num_rep_0 = shapes[0] / hmma_bts_0; + unsigned num_rep_1 = shapes[1] / hmma_bts_1; + unsigned num_rep_2 = is_batched ? 
shapes[2] / hmma_bts_2 : 1; + // size of each pack (interleaving) + pack_size_0_ = std::min(num_rep_0, 1); + pack_size_1_ = std::min(num_rep_1, 1); + // number of packs (interleaving) + num_packs_0_ = num_rep_0 / pack_size_0_; + num_packs_1_ = num_rep_1 / pack_size_1_; + + /* intra warp offset */ + // offset of quad in pair + Value *in_pair_off_a = builder_->CreateMul(builder_->CreateUDiv(builder_->CreateAnd(u_thread_id, _16), builder_->getInt32(4)), + builder_->getInt32(fpw_0 * pack_size_0_)); + Value *in_pair_off_b = builder_->CreateMul(builder_->CreateUDiv(builder_->CreateAnd(u_thread_id, _16), builder_->getInt32(4)), + builder_->getInt32(fpw_1 * pack_size_1_)); + + // Quad pair id + Value *pair_a_id = builder_->CreateUDiv(builder_->CreateURem(u_thread_id, _16), _4); + Value *pair_b_id = builder_->CreateUDiv(builder_->CreateURem(u_thread_id, _16), _4); + pair_a_id = builder_->CreateURem(pair_a_id, builder_->getInt32(fpw_0)); + pair_b_id = builder_->CreateUDiv(pair_b_id, builder_->getInt32(fpw_0)); + pair_b_id = builder_->CreateURem(pair_b_id, builder_->getInt32(fpw_1)); + // Quad pair offset + Value *pair_a_off = builder_->CreateMul(pair_a_id, builder_->getInt32(4 * pack_size_0_)); + Value *pair_b_off = builder_->CreateMul(pair_b_id, builder_->getInt32(4 * pack_size_1_)); + + /* inter warp offset */ + Value *warp_id_0 = builder_->CreateURem(u_warp_id, builder_->getInt32(wpt_0)); + Value *warp_id_12 = builder_->CreateUDiv(u_warp_id, builder_->getInt32(wpt_0)); + Value *warp_id_1 = builder_->CreateURem(warp_id_12, builder_->getInt32(wpt_1)); + Value *warp_id_2 = builder_->CreateUDiv(warp_id_12, builder_->getInt32(wpt_1)); + Value *warp_offset_i = builder_->CreateMul(warp_id_0, builder_->getInt32(hmma_wts_0 * pack_size_0_)); + Value *warp_offset_j = builder_->CreateMul(warp_id_1, builder_->getInt32(hmma_wts_1 * pack_size_1_)); + + /* offsets */ + // a offset + offset_a_i_ = builder_->CreateAdd(warp_offset_i, builder_->CreateAdd(pair_a_off, in_pair_off_a)); + offset_a_k_ = builder_->CreateAnd(u_thread_id, _3); + // b offsets + offset_b_j_ = builder_->CreateAdd(warp_offset_j, builder_->CreateAdd(pair_b_off, in_pair_off_b)); + offset_b_k_ = builder_->CreateAnd(u_thread_id, _3); + + // c offsets + Value *offset_c_i = builder_->CreateAdd(builder_->CreateAnd(u_thread_id, _1), offset_a_i_); + Value *offset_c_j = builder_->CreateAdd(builder_->CreateAnd(u_thread_id, _2), + builder_->CreateAdd(warp_offset_j, pair_b_off)); + + /* indices */ + // i indices + std::vector idx_i; + for(unsigned pack = 0; pack < num_packs_0_; pack++) + for(unsigned ii = 0; ii < pack_size_0_; ii++) + for(unsigned i = 0; i < 2; i++){ + idx_i.push_back(builder_->CreateAdd(offset_c_i, builder_->getInt32(pack*hmma_bts_0*pack_size_0_ + ii*4 + i*2))); + } + // j indices + std::vector idx_j; + for(unsigned pack = 0; pack < num_packs_1_; pack++) + for(unsigned jj = 0; jj < pack_size_1_; jj++) + for(unsigned j = 0; j < 2; j++){ + idx_j.push_back(builder_->CreateAdd(offset_c_j, builder_->getInt32(pack*hmma_bts_1*pack_size_1_ + jj*4 + j*4*fpw_1*pack_size_1_))); + idx_j.push_back(builder_->CreateAdd(offset_c_j, builder_->getInt32(pack*hmma_bts_1*pack_size_1_ + jj*4 + j*4*fpw_1*pack_size_1_ + 1))); + } + // z indices + std::vector idx_z; + for(unsigned pack = 0; pack < num_rep_2; pack++) + idx_z.push_back(builder_->CreateAdd(warp_id_2, builder_->getInt32(pack*hmma_bts_2))); + + + /* axes */ + axes_[layout->axes[0]] = distributed_axis{1, idx_i, warp_id_0}; + axes_[layout->axes[1]] = distributed_axis{1, idx_j, warp_id_1}; + 
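+  // A worked example of the fpw/wpt arithmetic above, with assumed values
+  // (illustrative only, not taken from this patch): fpw = {2, 2},
+  // wpt = {4, 2} and tile shapes = {128, 64} give
+  //   hmma_wts = {2*8, 2*8}      = {16, 16}  (per-warp tile)
+  //   hmma_bts = {16*4, 16*2}    = {64, 32}  (per-block tile)
+  //   num_rep  = {128/64, 64/32} = {2, 2}    (repetitions per warp)
+  // and since pack_size = std::min(num_rep, 1) pins packing to 1,
+  // num_packs simply equals num_rep here.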
if(is_batched) + axes_[layout->axes[2]] = distributed_axis{1, idx_z, warp_id_2}; +} + +void generator::visit_layout_scanline(analysis::layout_scanline_t* layout) { + Value *warp_size = builder_->getInt32(32); + Value* u_thread_id_0 = tgt_->get_local_id(mod_, *builder_, 0); + Value *u_thread_id = builder_->CreateURem(u_thread_id_0, warp_size); + Value *u_warp_id = builder_->CreateUDiv(u_thread_id_0, warp_size); + + auto order = layout->order; + const auto& shapes = layout->shapes; + size_t dim = shapes.size(); + std::vector nts = layout->nts; + std::vector mts = layout->mts; + Value* full_thread_id = builder_->CreateAdd(builder_->CreateMul(u_warp_id, builder_->getInt32(32)), u_thread_id); + std::vector thread_id = delinearize(full_thread_id, order, mts, *builder_); + // Create axes + for(unsigned k = 0; k < dim; k++) { + std::string str_k = std::to_string(k); + Value *contiguous_k = builder_->getInt32(nts[k]); + Value *scaled_thread_id = builder_->CreateMul(thread_id[k], contiguous_k); + unsigned per_block = nts[k] * mts[k]; + unsigned per_thread = nts[k] * shapes[k] / per_block; + std::vector idx_list(per_thread); + for(unsigned n = 0 ; n < per_thread; n++){ + unsigned offset = n / nts[k] * per_block + n % nts[k]; + idx_list[n] = builder_->CreateAdd(scaled_thread_id, builder_->getInt32(offset), "idx_" + str_k + "_" + std::to_string(n)); + } + axes_[layout->axes[k]] = distributed_axis{nts[k], idx_list, thread_id[k]}; + } +} + +void generator::visit_layout_shared(analysis::layout_shared_t*) { + +} + void generator::for_each(ir::value *x, const std::function& fn) { if(!x->get_type()->is_tile_ty()) return fn({}); diff --git a/lib/ir/constant.cc b/lib/ir/constant.cc index 0eff5261e..8a3f1a343 100644 --- a/lib/ir/constant.cc +++ b/lib/ir/constant.cc @@ -76,27 +76,6 @@ constant *constant_fp::get(type *ty, double v){ return result; } -// metaparameter -metaparameter::metaparameter(type *ty, const std::vector &space) - : constant_int(ty, 0), space_(space), has_value_(false){ } - -metaparameter* metaparameter::create(context &ctx, type *ty, unsigned lo, unsigned hi) { - context_impl *impl = ctx.p_impl.get(); - std::vector space; - for(unsigned i = lo; i <= hi; i *= 2) - space.push_back(i); - metaparameter *result = new metaparameter(ty, space); - impl->mp_constants_.push_back(result); - return result; -} - -metaparameter* metaparameter::create(context &ctx, type *ty, const std::vector &space) { - context_impl *impl = ctx.p_impl.get(); - metaparameter *result = new metaparameter(ty, space); - impl->mp_constants_.push_back(result); - return result; -} - // undef value undef_value::undef_value(type *ty) From 0a2a4d9fdd645df8bafbdde1c8c0f65cf02849d1 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 13 Oct 2019 19:59:33 -0400 Subject: [PATCH 440/494] more cleaning --- include/triton/codegen/selection.h | 24 ++++- lib/codegen/selection.cc | 147 ++++++++++++----------------- 2 files changed, 81 insertions(+), 90 deletions(-) diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index 8d42d9dfe..1e895dc80 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -146,6 +146,21 @@ private: Builder &builder_; }; +class machine_layout_t { + +}; + +class machine_layout_shared_t: public machine_layout_t { + +}; + +class machine_layout_hmma_884_t: public machine_layout_t { + +}; + +class machine_layout_scanline_t: public machine_layout_t { + +}; class generator: public ir::visitor, public analysis::layout_visitor { private: @@ -160,9 +175,7 
@@ private: void set_value(ir::value *x, const indices_t& idx, Value* v); public: - generator(LLVMContext *ctx, - Function *fn, Module *dst, Builder *builder, std::map& axes, @@ -178,7 +191,7 @@ public: unsigned num_packs_0, unsigned num_packs_1, unsigned pack_size_0, unsigned pack_size_1, unsigned num_warps) - : ctx_(ctx), fn_(fn), mod_(dst), builder_(builder), axes_(axes), vmap_(vmap), tmap_(tmap), tgt_(tgt), + : ctx_(ctx), mod_(dst), builder_(builder), axes_(axes), vmap_(vmap), tmap_(tmap), tgt_(tgt), layouts_(layouts), alignment_(alignment), alloc_(alloc), sh_mem_ptr_(sh_mem_ptr), offset_a_i_(offset_a_i), offset_a_k_(offset_a_k), offset_b_j_(offset_b_j), offset_b_k_(offset_b_k), num_packs_0_(num_packs_0), num_packs_1_(num_packs_1), pack_size_0_(pack_size_0), pack_size_1_(pack_size_1), @@ -243,6 +256,7 @@ private: Builder *builder_; Module *mod_; + std::map machine_layouts_; std::map& axes_; std::map& vmap_; std::map& tmap_; @@ -258,6 +272,8 @@ private: unsigned num_warps_; }; + + // Selection pass class selection{ typedef std::map vmap_t; @@ -266,8 +282,6 @@ class selection{ private: // LLVM conversions Type* llvm_type(ir::type *ty, LLVMContext &ctx); - Value* llvm_alloc_const(ir::alloc_const *v, Module *module, Builder &builder); - Function* llvm_fn(ir::function *fn, Builder& builder, Module &dst); Value* alloc_shared(Builder &builder, Module& dst); // grid construction diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index 9a2f1f569..44f274217 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -344,16 +344,6 @@ Type *selection::llvm_type(ir::type *ty, LLVMContext &ctx) { } -/* convert ir::alloc_const to llvm::GlobalVariable */ -Value* selection::llvm_alloc_const(ir::alloc_const *v, Module *module, IRBuilder<> &builder) { - unsigned size = ((ir::constant_int*)v->get_operand(0))->get_value(); - Type *element_ty = llvm_type(v->get_type()->get_pointer_element_ty(), module->getContext()); - Type *array_ty = llvm::ArrayType::get(element_ty, size); - Value *array = new llvm::GlobalVariable(*module, array_ty, false, llvm::GlobalVariable::ExternalLinkage, - nullptr, v->get_name(), nullptr, llvm::GlobalVariable::NotThreadLocal, 4); - return builder.CreateBitCast(array, element_ty->getPointerTo(4)); -} - /* ------------------- * ---- Init Axes ---- @@ -384,17 +374,17 @@ inline int32_t ceil(int32_t num, int32_t div){ void selection::create_shared_tile(ir::value *v, IRBuilder<> &builder, Value *sh_mem_ptr) { if(tmap_.find(v) != tmap_.end()) return; - auto order = layouts_->get(v)->order; - auto shapes = v->get_type()->get_tile_shapes(); - unsigned pad = layouts_->get(v)->pad; - if(pad > 0) - shapes[order[0]] += pad; + analysis::layout_shared_t *layout = (analysis::layout_shared_t*)layouts_->get(v); + auto order = layout->order; + auto shapes = layout->shapes; + shapes[order[0]] += layout->pad; + Type* ty = llvm_type(v->get_type()->get_scalar_ty(), builder.getContext()); // shared copy PointerType *ptr_ty = ty->getPointerTo(sh_mem_ptr->getType()->getPointerAddressSpace()); // double-buffered - if(layouts_->get(v)->double_buffer) { - auto info = *layouts_->get(v)->double_buffer; + if(layout->double_buffer) { + auto info = *layout->double_buffer; ir::phi_node *phi = info.phi; BasicBlock *parent = (BasicBlock*)vmap_[phi->get_parent()]; if(parent->empty()) @@ -461,9 +451,8 @@ void selection::lower_value(ir::value *src, IRBuilder<> &builder, generator* gen if(src->get_type()->is_tile_ty()){ builder.SetInsertPoint(&*builder.GetInsertBlock()->getParent()->begin()); 
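   // The insertion point was moved to the function entry above so that tile
   // objects (and the index arithmetic they emit) are created where they
   // dominate every later use; it is restored to `current` once the tile
   // has been created.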
auto *i = dynamic_cast(src); - if(i && layouts_->get(i)->type == analysis::SHARED && !dynamic_cast(src)){ + if(i && layouts_->get(i)->type == analysis::SHARED) create_shared_tile(i, builder, sh_mem_ptr_); - } else create_distributed_tile(src, builder); } @@ -502,47 +491,6 @@ inline llvm::Attribute llvm_attr(llvm::LLVMContext& ctx, ir::attribute attr) { } -Function* selection::llvm_fn(ir::function *fn, IRBuilder<>& builder, Module& dst) { - LLVMContext &ctx = builder.getContext(); - FunctionType *fn_ty = (FunctionType*)llvm_type(fn->get_fn_type(), ctx); - FunctionType *dst_fn_ty = fn_ty; - if(!tgt_->is_gpu()){ - Type *dst_fn_ret_ty = fn_ty->getReturnType(); - std::vector dst_fn_args_ty; - for(unsigned i = 0; i < fn_ty->getNumParams(); i++) - dst_fn_args_ty.push_back(fn_ty->getParamType(i)); - dst_fn_args_ty.push_back(builder.getInt32Ty()); - dst_fn_args_ty.push_back(builder.getInt32Ty()); - dst_fn_args_ty.push_back(builder.getInt32Ty()); - dst_fn_ty = FunctionType::get(dst_fn_ret_ty, dst_fn_args_ty, false); - } - Function *ret = Function::Create(dst_fn_ty, Function::ExternalLinkage, fn->get_name(), &dst); - // set attributes - for(auto attr_pair: fn->attrs()){ - unsigned id = attr_pair.first; - for(ir::attribute attr: attr_pair.second) - if(attr.is_llvm_attr()) - ret->addAttribute(id, llvm_attr(ctx, attr)); - } - // set metadata - tgt_->set_kernel(builder, ctx, &dst, ret); - Metadata *md_args[] = { - ValueAsMetadata::get(ret), - MDString::get(ctx, "maxntidx"), - ValueAsMetadata::get(builder.getInt32(num_warps_*32)) - }; - dst.getOrInsertNamedMetadata("nvvm.annotations")->addOperand(MDNode::get(ctx, md_args)); - // map parameters - for(unsigned i = 0; i < fn->args().size(); i++) - vmap_[fn->args()[i]] = &*(ret->arg_begin() + i); - // create blocks - for(ir::basic_block *block: fn->blocks()) { - BasicBlock *dst_block = BasicBlock::Create(ctx, block->get_name(), ret); - vmap_[block] = dst_block; - } - builder.SetInsertPoint((BasicBlock*)vmap_[fn->blocks()[0]]); -} - Value* selection::alloc_shared(IRBuilder<> &builder, Module& dst) { Value *ret = nullptr; LLVMContext &ctx = builder.getContext(); @@ -566,24 +514,22 @@ void selection::run(ir::module &src, Module &dst) { LLVMContext &dst_ctx = dst.getContext(); IRBuilder<> dst_builder(dst_ctx); - // constant memory - for(ir::alloc_const *x: src.allocs()) - vmap_[x] = llvm_alloc_const(x, &dst, dst_builder); - // allocate shared memory sh_mem_ptr_ = alloc_shared(dst_builder, dst); // iterate over functions std::set seen; + // create tile + generator gen(&dst_ctx, &dst, &dst_builder, axes_, vmap_, tmap_, tgt_, layouts_, alignment_, alloc_, sh_mem_ptr_, + offset_a_i_, offset_a_k_, offset_b_j_, offset_b_k_, num_packs_0_, num_packs_1_, pack_size_0_, pack_size_1_, num_warps_ ); + + for(ir::alloc_const *x: src.allocs()) + x->accept(&gen); + for(ir::function *fn: src.get_function_list()) { - // create LLVM function - Function *ffn = llvm_fn(fn, dst_builder, dst); - - // create tile - generator gen(&dst_ctx, ffn, &dst, &dst_builder, axes_, vmap_, tmap_, tgt_, layouts_, alignment_, alloc_, sh_mem_ptr_, - offset_a_i_, offset_a_k_, offset_b_j_, offset_b_k_, num_packs_0_, num_packs_1_, pack_size_0_, pack_size_1_, num_warps_ ); + fn->accept(&gen); // initialize layouts for(auto x: layouts_->get_all()) @@ -656,18 +602,6 @@ void selection::run(ir::module &src, Module &dst) { } -/* ----------------------------------------------------- - * - * - * - * - * - * - * - * - * - * - * ------------------------------------------------------ */ @@ -1355,8 +1289,46 @@ void 
generator::visit_alloc_const(ir::alloc_const *alloc) { } -void generator::visit_function(ir::function*) { - +void generator::visit_function(ir::function* fn) { + LLVMContext &ctx = builder_->getContext(); + FunctionType *fn_ty = (FunctionType*)type(fn->get_fn_type()); + FunctionType *dst_fn_ty = fn_ty; + if(!tgt_->is_gpu()){ + Type *dst_fn_ret_ty = fn_ty->getReturnType(); + std::vector dst_fn_args_ty; + for(unsigned i = 0; i < fn_ty->getNumParams(); i++) + dst_fn_args_ty.push_back(fn_ty->getParamType(i)); + dst_fn_args_ty.push_back(builder_->getInt32Ty()); + dst_fn_args_ty.push_back(builder_->getInt32Ty()); + dst_fn_args_ty.push_back(builder_->getInt32Ty()); + dst_fn_ty = FunctionType::get(dst_fn_ret_ty, dst_fn_args_ty, false); + } + Function *ret = Function::Create(dst_fn_ty, Function::ExternalLinkage, fn->get_name(), mod_); + // set attributes + for(auto attr_pair: fn->attrs()){ + unsigned id = attr_pair.first; + for(ir::attribute attr: attr_pair.second) + if(attr.is_llvm_attr()) + ret->addAttribute(id, llvm_attr(ctx, attr)); + } + // set metadata + tgt_->set_kernel(*builder_, ctx, mod_, ret); + Metadata *md_args[] = { + ValueAsMetadata::get(ret), + MDString::get(ctx, "maxntidx"), + ValueAsMetadata::get(builder_->getInt32(num_warps_*32)) + }; + mod_->getOrInsertNamedMetadata("nvvm.annotations")->addOperand(MDNode::get(ctx, md_args)); + // map parameters + for(unsigned i = 0; i < fn->args().size(); i++) + vmap_[fn->args()[i]] = &*(ret->arg_begin() + i); + // create blocks + for(ir::basic_block *block: fn->blocks()) { + BasicBlock *dst_block = BasicBlock::Create(ctx, block->get_name(), ret); + vmap_[block] = dst_block; + } + builder_->SetInsertPoint((BasicBlock*)vmap_[fn->blocks()[0]]); + fn_ = ret; } void generator::visit_layout_hmma_884(analysis::layout_hmma_884_t* layout) { @@ -1469,6 +1441,8 @@ void generator::visit_layout_hmma_884(analysis::layout_hmma_884_t* layout) { axes_[layout->axes[1]] = distributed_axis{1, idx_j, warp_id_1}; if(is_batched) axes_[layout->axes[2]] = distributed_axis{1, idx_z, warp_id_2}; + + machine_layouts_[layout] = new machine_layout_hmma_884_t(); } void generator::visit_layout_scanline(analysis::layout_scanline_t* layout) { @@ -1498,10 +1472,13 @@ void generator::visit_layout_scanline(analysis::layout_scanline_t* layout) { } axes_[layout->axes[k]] = distributed_axis{nts[k], idx_list, thread_id[k]}; } + + machine_layouts_[layout] = new machine_layout_scanline_t(); } -void generator::visit_layout_shared(analysis::layout_shared_t*) { +void generator::visit_layout_shared(analysis::layout_shared_t* layout) { + machine_layouts_[layout] = new machine_layout_shared_t(); } void generator::for_each(ir::value *x, const std::function& fn) { From 6f5f511a337c303d6e307a77b99c427f9cbe1a65 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 14 Oct 2019 11:36:54 -0400 Subject: [PATCH 441/494] [doc][pytriton] now showing full requirements of triton.function --- docs/pytriton.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/pytriton.md b/docs/pytriton.md index d2b2f7983..0e04e6246 100644 --- a/docs/pytriton.md +++ b/docs/pytriton.md @@ -73,8 +73,18 @@ The PyTriton API provides a `triton.function` class which automatically handles ```python import triton +# Entry point class _dot(triton.function): + @staticmethod + # Forward Pass + def forward(ctx, *args): + #... + + @staticmethod + # Backward Pass + def backward(ctx, dy): + #... 
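+
+# Hypothetical usage sketch (assuming a PyTorch-style autograd binding;
+# the exact entry point is not specified in this document):
+#
+#   y = _dot.apply(x, w)   # dispatches to forward()
+#   y.sum().backward()     # dispatches to backward() for the gradient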
``` ### Creation of Triton Kernels From 3d5ab4bc0d603b8acc88c184363b3e2c8de56ea7 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 15 Oct 2019 12:29:58 -0400 Subject: [PATCH 442/494] [codegen] [selection] created machine layouts --- include/triton/codegen/selection.h | 29 ++++++++++++++++++++++++++-- lib/codegen/selection.cc | 31 +++++++++++++++++++++++++----- lib/runtime/function.cc | 1 + tests/bench/copy2d.cc | 20 +++++++++---------- tests/common/src/copy.h | 2 +- tests/common/util.h | 4 ++-- 6 files changed, 67 insertions(+), 20 deletions(-) diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index 1e895dc80..06ec94222 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -155,11 +155,36 @@ class machine_layout_shared_t: public machine_layout_t { }; class machine_layout_hmma_884_t: public machine_layout_t { - +public: + machine_layout_hmma_884_t(Module *mod, Builder *builder, + target *tgt, std::map& axes, + Value *&offset_a_i, Value *&offset_a_k, Value *&offset_b_j, Value *&offset_b_k, + unsigned &pack_size_0, unsigned &pack_size_1, + unsigned &num_packs_0, unsigned &num_packs_1, + analysis::layout_hmma_884_t* layout); + Module *mod_; + Builder *builder_; + target *tgt_; + std::map& axes_; + Value *&offset_a_i_, *&offset_a_k_; + Value *&offset_b_j_, *&offset_b_k_; + unsigned &pack_size_0_; + unsigned& pack_size_1_; + unsigned &num_packs_0_; + unsigned& num_packs_1_; + analysis::layout_hmma_884_t* layout_; }; class machine_layout_scanline_t: public machine_layout_t { - +public: + machine_layout_scanline_t(Module *mod, Builder *builder, + target *tgt, std::map& axes, + analysis::layout_scanline_t* layout); + Module *mod_; + Builder *builder_; + target *tgt_; + std::map& axes_; + analysis::layout_scanline_t* layout_; }; class generator: public ir::visitor, public analysis::layout_visitor { diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index 44f274217..9321d49fa 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -1331,7 +1331,16 @@ void generator::visit_function(ir::function* fn) { fn_ = ret; } -void generator::visit_layout_hmma_884(analysis::layout_hmma_884_t* layout) { +machine_layout_hmma_884_t::machine_layout_hmma_884_t(Module *mod, Builder *builder, + target *tgt, std::map& axes, + Value *&offset_a_i, Value *&offset_a_k, Value *&offset_b_j, Value *&offset_b_k, + unsigned &pack_size_0, unsigned &pack_size_1, + unsigned &num_packs_0, unsigned &num_packs_1, + analysis::layout_hmma_884_t* layout) + : mod_(mod), builder_(builder), tgt_(tgt), axes_(axes), + offset_a_i_(offset_a_i), offset_a_k_(offset_a_k), offset_b_j_(offset_b_j), offset_b_k_(offset_b_k), + pack_size_0_(pack_size_0), pack_size_1_(pack_size_1), num_packs_0_(num_packs_0), num_packs_1_(num_packs_1), + layout_(layout) { Value *warp_size = builder_->getInt32(32); Value* u_thread_id_0 = tgt_->get_local_id(mod_, *builder_, 0); Value *u_thread_id = builder_->CreateURem(u_thread_id_0, warp_size); @@ -1441,11 +1450,14 @@ void generator::visit_layout_hmma_884(analysis::layout_hmma_884_t* layout) { axes_[layout->axes[1]] = distributed_axis{1, idx_j, warp_id_1}; if(is_batched) axes_[layout->axes[2]] = distributed_axis{1, idx_z, warp_id_2}; - - machine_layouts_[layout] = new machine_layout_hmma_884_t(); } -void generator::visit_layout_scanline(analysis::layout_scanline_t* layout) { + +machine_layout_scanline_t::machine_layout_scanline_t(Module *mod, Builder *builder, + target *tgt, std::map &axes, + 
analysis::layout_scanline_t* layout) + : mod_(mod), builder_(builder), tgt_(tgt), axes_(axes), layout_(layout) +{ Value *warp_size = builder_->getInt32(32); Value* u_thread_id_0 = tgt_->get_local_id(mod_, *builder_, 0); Value *u_thread_id = builder_->CreateURem(u_thread_id_0, warp_size); @@ -1472,8 +1484,17 @@ void generator::visit_layout_scanline(analysis::layout_scanline_t* layout) { } axes_[layout->axes[k]] = distributed_axis{nts[k], idx_list, thread_id[k]}; } +} - machine_layouts_[layout] = new machine_layout_scanline_t(); +void generator::visit_layout_hmma_884(analysis::layout_hmma_884_t* layout) { + machine_layouts_[layout] = new machine_layout_hmma_884_t(mod_, builder_, tgt_, axes_, offset_a_i_, offset_a_k_, offset_b_j_, offset_b_k_, + pack_size_0_, pack_size_1_, + num_packs_0_, num_packs_1_, + layout); +} + +void generator::visit_layout_scanline(analysis::layout_scanline_t* layout) { + machine_layouts_[layout] = new machine_layout_scanline_t(mod_, builder_, tgt_, axes_, layout); } void generator::visit_layout_shared(analysis::layout_shared_t* layout) { diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index dba693475..37b14145f 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -219,6 +219,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::transform::cts cts; codegen::selection selection(&liveness, &allocation, &align, &axes, &layouts, target.get(), opt.num_warps); // run passes +// ir::print(module, std::cout); peephole.run(module); dce.run(module); align.run(module); diff --git a/tests/bench/copy2d.cc b/tests/bench/copy2d.cc index f1252797e..22006aae7 100644 --- a/tests/bench/copy2d.cc +++ b/tests/bench/copy2d.cc @@ -11,17 +11,17 @@ int main() { // shapes to benchmark typedef std::tuple, std::vector, std::vector> config_t; std::vector configs = { - {{4096*4096}, {0}, {0}}, +// {{4096*4096}, {0}, {0}}, {{4096, 4096}, {0, 1}, {1, 0}}, - {{4096, 4096}, {0, 1}, {1, 0}}, - {{4096, 4096}, {1, 0}, {0, 1}}, - {{4096, 4096}, {0, 1}, {0, 1}}, - {{256, 256, 256}, {0, 1, 2}, {0, 1, 2}}, - {{256, 256, 256}, {0, 1, 2}, {0, 2, 1}}, - {{256, 256, 256}, {1, 0, 2}, {1, 2, 0}}, - {{256, 256, 256}, {1, 2, 0}, {1, 0, 2}}, - {{256, 256, 256}, {2, 0, 1}, {0, 1, 2}}, - {{256, 256, 256}, {2, 1, 0}, {0, 2, 1}} +// {{4096, 4096}, {0, 1}, {1, 0}}, +// {{4096, 4096}, {1, 0}, {0, 1}}, +// {{4096, 4096}, {0, 1}, {0, 1}}, +// {{256, 256, 256}, {0, 1, 2}, {0, 1, 2}}, +// {{256, 256, 256}, {0, 1, 2}, {0, 2, 1}}, +// {{256, 256, 256}, {1, 0, 2}, {1, 2, 0}}, +// {{256, 256, 256}, {1, 2, 0}, {1, 0, 2}}, +// {{256, 256, 256}, {2, 0, 1}, {0, 1, 2}}, +// {{256, 256, 256}, {2, 1, 0}, {0, 2, 1}} }; // does the work std::vector shape; diff --git a/tests/common/src/copy.h b/tests/common/src/copy.h index f45f7a5cd..97264eeb5 100644 --- a/tests/common/src/copy.h +++ b/tests/common/src/copy.h @@ -28,7 +28,7 @@ void copy2d(TYPE * X __noalias __readonly __aligned(16), int rs1[TS1] = pid1 * TS1 + 0 ... 
TS1; TYPE* px[TS0, TS1] = X + rs0[:, newaxis] * STRIDE_XS0 + rs1[newaxis, :] * STRIDE_XS1; TYPE* py[TS0, TS1] = Y + rs0[:, newaxis] * STRIDE_YS0 + rs1[newaxis, :] * STRIDE_YS1; - *py = *px; + *py = ^(*px); } )"; diff --git a/tests/common/util.h b/tests/common/util.h index 34e530610..42310d847 100644 --- a/tests/common/util.h +++ b/tests/common/util.h @@ -42,8 +42,8 @@ inline std::vector> tile_nd(size_t rank) { if(rank == 1) return {{"128", "256", "512", "1024"}}; if(rank == 2) - return {{"16", "32", "64"}, - {"16", "32", "64"}}; + return {{"64"}, + {"64"}}; if(rank == 3) return {{"4", "16", "32"}, {"4", "16", "32"}, From 1b5b76b629b5855ba3a34f195e078e57237ffca0 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 15 Oct 2019 16:12:08 -0400 Subject: [PATCH 443/494] [codegen] [selection] machine layouts now create machine tiles --- include/triton/codegen/analysis/layout.h | 3 + include/triton/codegen/selection.h | 51 +++-- lib/codegen/analysis/layout.cc | 19 +- lib/codegen/selection.cc | 236 ++++++++++++----------- 4 files changed, 169 insertions(+), 140 deletions(-) diff --git a/include/triton/codegen/analysis/layout.h b/include/triton/codegen/analysis/layout.h index 096c45ea3..923e13411 100644 --- a/include/triton/codegen/analysis/layout.h +++ b/include/triton/codegen/analysis/layout.h @@ -53,6 +53,7 @@ struct layout_t { const std::vector& _axes, const std::vector &_shapes, const std::vector &_values, + ir::type *_ty, size_t _id, analysis::align* align); @@ -79,6 +80,7 @@ struct layout_hmma_884_t: public layout_t { const std::vector& _axes, const std::vector& _shapes, const std::vector &_values, + ir::type *_ty, size_t _id, analysis::align* align); void accept(layout_visitor* vst) { vst->visit_layout_hmma_884(this); } @@ -89,6 +91,7 @@ struct layout_scanline_t: public layout_t { const std::vector& _axes, const std::vector& _shapes, const std::vector &values, + ir::type *_ty, size_t _id, analysis::align* align); void accept(layout_visitor* vst) { vst->visit_layout_scanline(this); } diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index 06ec94222..82466ab93 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -147,44 +147,54 @@ private: }; class machine_layout_t { - + virtual tile* create(ir::value *v) = 0; }; class machine_layout_shared_t: public machine_layout_t { - +public: + shared_tile* create(ir::value *v); }; -class machine_layout_hmma_884_t: public machine_layout_t { +class machine_layout_distributed_t: public machine_layout_t { +public: + machine_layout_distributed_t(Module *mod, Builder *builder, target *tgt, Type *ty, + analysis::axes *a_axes, std::map& axes, + analysis::layout_t* layout); + + distributed_tile* create(ir::value *v); + Module *mod_; + Builder *builder_; + target *tgt_; + Type *ty_; + analysis::axes *a_axes_; + std::map& axes_; + analysis::layout_t* layout_; +}; + + +class machine_layout_hmma_884_t: public machine_layout_distributed_t { public: machine_layout_hmma_884_t(Module *mod, Builder *builder, - target *tgt, std::map& axes, + target *tgt, Type *ty, + analysis::axes *a_axes, std::map& axes, Value *&offset_a_i, Value *&offset_a_k, Value *&offset_b_j, Value *&offset_b_k, unsigned &pack_size_0, unsigned &pack_size_1, unsigned &num_packs_0, unsigned &num_packs_1, analysis::layout_hmma_884_t* layout); - Module *mod_; - Builder *builder_; - target *tgt_; - std::map& axes_; Value *&offset_a_i_, *&offset_a_k_; Value *&offset_b_j_, *&offset_b_k_; unsigned &pack_size_0_; unsigned& 
pack_size_1_; unsigned &num_packs_0_; unsigned& num_packs_1_; - analysis::layout_hmma_884_t* layout_; }; -class machine_layout_scanline_t: public machine_layout_t { +class machine_layout_scanline_t: public machine_layout_distributed_t { public: machine_layout_scanline_t(Module *mod, Builder *builder, - target *tgt, std::map& axes, + target *tgt, Type *ty, + analysis::axes *a_axes, std::map& axes, analysis::layout_scanline_t* layout); - Module *mod_; - Builder *builder_; - target *tgt_; - std::map& axes_; - analysis::layout_scanline_t* layout_; }; class generator: public ir::visitor, public analysis::layout_visitor { @@ -194,7 +204,6 @@ private: void visit_outer_dot(ir::dot_inst*, distributed_tile *TC, distributed_tile *TA, distributed_tile *TB, distributed_tile *TD, unsigned NK, Type *c_ty, Function *f_mul_add); - Type *type(ir::type *ty); void for_each(ir::value *x, const std::function& fn); Value* get_value(ir::value *x, const indices_t& idx); void set_value(ir::value *x, const indices_t& idx, Value* v); @@ -203,6 +212,7 @@ public: generator(LLVMContext *ctx, Module *dst, Builder *builder, + analysis::axes *a_axes, std::map& axes, std::map& vmap, std::map& tmap, @@ -216,12 +226,13 @@ public: unsigned num_packs_0, unsigned num_packs_1, unsigned pack_size_0, unsigned pack_size_1, unsigned num_warps) - : ctx_(ctx), mod_(dst), builder_(builder), axes_(axes), vmap_(vmap), tmap_(tmap), tgt_(tgt), + : ctx_(ctx), mod_(dst), builder_(builder), a_axes_(a_axes), axes_(axes), vmap_(vmap), tmap_(tmap), tgt_(tgt), layouts_(layouts), alignment_(alignment), alloc_(alloc), sh_mem_ptr_(sh_mem_ptr), offset_a_i_(offset_a_i), offset_a_k_(offset_a_k), offset_b_j_(offset_b_j), offset_b_k_(offset_b_k), num_packs_0_(num_packs_0), num_packs_1_(num_packs_1), pack_size_0_(pack_size_0), pack_size_1_(pack_size_1), num_warps_(num_warps) { } + machine_layout_t *get_machine_layout(const analysis::layout_t *layout) { return machine_layouts_.at(layout); } void visit_phi_node(ir::phi_node*); void visit_binary_operator(ir::binary_operator*); @@ -281,7 +292,8 @@ private: Builder *builder_; Module *mod_; - std::map machine_layouts_; + std::map machine_layouts_; + analysis::axes *a_axes_; std::map& axes_; std::map& vmap_; std::map& tmap_; @@ -311,7 +323,6 @@ private: // grid construction void create_shared_tile(ir::value *v, Builder &builder, Value *sh_mem_ptr); - void create_distributed_tile(ir::value *v, Builder &builder); // lower scalar instruction void lower_value(ir::value *src, Builder &builder, generator* gen, std::set& seen); diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc index cfd3b3c47..f435efef8 100644 --- a/lib/codegen/analysis/layout.cc +++ b/lib/codegen/analysis/layout.cc @@ -128,9 +128,9 @@ inline bool is_trans(ir::value *v) { layout_t::layout_t(layout_type_t _type, const std::vector &_axes, const std::vector &_shapes, - const std::vector &_values, + const std::vector &_values, ir::type *_ty, size_t _id, - analysis::align* align): type(_type), axes(_axes), shapes(_shapes), values(_values), id(_id) { + analysis::align* align): type(_type), axes(_axes), shapes(_shapes), values(_values), id(_id), ty(_ty) { // io pointer std::set ptr; for(ir::value* v: values) @@ -152,8 +152,8 @@ inline unsigned clamp(unsigned x, unsigned lo, unsigned hi) { layout_hmma_884_t::layout_hmma_884_t(size_t num_warps, const std::vector& _axes, const std::vector& _shapes, - const std::vector &values, size_t _id, - analysis::align* align): layout_t(HMMA_884, _axes, _shapes, values, _id, align) { + const 
std::vector &values, ir::type *_ty, size_t _id, + analysis::align* align): layout_t(HMMA_884, _axes, _shapes, values, _ty, _id, align) { unsigned shape_0 = shapes[order[0]]; unsigned shape_1 = shapes[order[1]]; @@ -194,9 +194,9 @@ layout_hmma_884_t::layout_hmma_884_t(size_t num_warps, layout_scanline_t::layout_scanline_t(size_t num_warps, const std::vector& _axes, const std::vector& _shapes, - const std::vector &values, + const std::vector &values, ir::type *_ty, size_t _id, - analysis::align* align): layout_t(SCANLINE, _axes, _shapes, values, _id, align){ + analysis::align* align): layout_t(SCANLINE, _axes, _shapes, values, _ty, _id, align){ unsigned size = std::accumulate(shapes.begin(), shapes.end(), 1, std::multiplies()); unsigned num_threads = num_warps * 32; nts.resize(shapes.size()); @@ -263,9 +263,8 @@ layout_shared_t::layout_shared_t(const layout_t *arg, const std::vector &values, ir::type *ty, size_t _id, - analysis::align* align): layout_t(SHARED, _axes, _shapes, values, _id, align) { + analysis::align* align): layout_t(SHARED, _axes, _shapes, values, ty, _id, align) { - this->ty = ty; size = 0; // double-buffering @@ -333,7 +332,7 @@ void layout::create(size_t id, const std::vector& values) { }); // type if(it_hmma_c != values.end()) - layouts_[id] = new layout_hmma_884_t(num_warps_, axes, shapes, values, id, align_); + layouts_[id] = new layout_hmma_884_t(num_warps_, axes, shapes, values, largest->get_type()->get_scalar_ty(), id, align_); else if(it_cts != values.end()){ ir::copy_to_shared_inst *cts = (ir::copy_to_shared_inst*)*it_cts; ir::value *arg = cts->get_operand(0); @@ -341,7 +340,7 @@ void layout::create(size_t id, const std::vector& values) { layouts_[id] = new layout_shared_t(get(arg), axes, shapes, values, largest->get_type()->get_scalar_ty(), id, align_); } else - layouts_[id] = new layout_scanline_t(num_warps_, axes, shapes, values, id, align_); + layouts_[id] = new layout_scanline_t(num_warps_, axes, shapes, values, largest->get_type()->get_scalar_ty(), id, align_); } void layout::run(ir::module &mod) { diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index 9321d49fa..be97ac2e4 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -343,6 +343,42 @@ Type *selection::llvm_type(ir::type *ty, LLVMContext &ctx) { throw std::runtime_error("unknown conversion from ir::type to Type"); } +Type *type(ir::type *ty, LLVMContext &ctx) { + // function + if(auto* tt = dynamic_cast(ty)){ + Type *return_ty = type(tt->get_return_ty(), ctx); + std::vector param_tys; + std::transform(tt->params_begin(), tt->params_end(), std::back_inserter(param_tys), + [&ctx](ir::type* t){ return type(t, ctx);}); + return FunctionType::get(return_ty, param_tys, false); + } + // pointer + if(ty->is_pointer_ty()){ + Type *elt_ty = type(ty->get_pointer_element_ty(), ctx); + unsigned addr_space = ty->get_pointer_address_space(); + return PointerType::get(elt_ty, addr_space); + } + // integer + if(ty->is_integer_ty()){ + unsigned bitwidth = ty->get_integer_bitwidth(); + return IntegerType::get(ctx, bitwidth); + } + // primitive types + switch(ty->get_type_id()){ + case ir::type::VoidTyID: return Type::getVoidTy(ctx); + case ir::type::HalfTyID: return Type::getHalfTy(ctx); + case ir::type::FloatTyID: return Type::getFloatTy(ctx); + case ir::type::DoubleTyID: return Type::getDoubleTy(ctx); + case ir::type::X86_FP80TyID: return Type::getX86_FP80Ty(ctx); + case ir::type::PPC_FP128TyID: return Type::getPPC_FP128Ty(ctx); + case ir::type::LabelTyID: return 
Type::getLabelTy(ctx); + case ir::type::MetadataTyID: return Type::getMetadataTy(ctx); + case ir::type::TokenTyID: return Type::getTokenTy(ctx); + default: break; + } + // unknown type + throw std::runtime_error("unknown conversion from ir::type to Type"); +} /* ------------------- @@ -410,24 +446,6 @@ void selection::create_shared_tile(ir::value *v, IRBuilder<> &builder, Value *sh } } -void selection::create_distributed_tile(ir::value *v, IRBuilder<> &builder) { - Type* ty = llvm_type(v->get_type()->get_scalar_ty(), builder.getContext()); - const auto &shapes = v->get_type()->get_tile_shapes(); - std::vector axes(shapes.size()); - for(size_t d = 0; d < shapes.size(); d++){ - if(shapes[d] > 1){ - unsigned x = a_axes_->get(v, d); - axes[d] = axes_.at(x); - } - else{ - axes[d].contiguous = 1; - axes[d].values = {builder.getInt32(0)}; - } - } - distributed_tile *T = new distributed_tile(ty, shapes, layouts_->get(v)->order, axes, builder, false); - tmap_.insert({v, T}); -} - bool is_trans(ir::value *v) { if(dynamic_cast(v)) { @@ -454,7 +472,7 @@ void selection::lower_value(ir::value *src, IRBuilder<> &builder, generator* gen if(i && layouts_->get(i)->type == analysis::SHARED) create_shared_tile(i, builder, sh_mem_ptr_); else - create_distributed_tile(src, builder); + tmap_[src] = ((machine_layout_distributed_t*)gen->get_machine_layout(layouts_->get(src)))->create(src); } builder.SetInsertPoint(current); @@ -521,7 +539,7 @@ void selection::run(ir::module &src, Module &dst) { std::set seen; // create tile - generator gen(&dst_ctx, &dst, &dst_builder, axes_, vmap_, tmap_, tgt_, layouts_, alignment_, alloc_, sh_mem_ptr_, + generator gen(&dst_ctx, &dst, &dst_builder, a_axes_, axes_, vmap_, tmap_, tgt_, layouts_, alignment_, alloc_, sh_mem_ptr_, offset_a_i_, offset_a_k_, offset_b_j_, offset_b_k_, num_packs_0_, num_packs_1_, pack_size_0_, pack_size_1_, num_warps_ ); for(ir::alloc_const *x: src.allocs()) @@ -606,7 +624,7 @@ void selection::run(ir::module &src, Module &dst) { void generator::visit_phi_node(ir::phi_node* phi) { - Type *ty = type(phi->get_type()->get_scalar_ty()); + Type *ty = type(phi->get_type()->get_scalar_ty(), *ctx_); unsigned num_ops = phi->get_num_operands(); for_each(phi, [&](indices_t idx){ set_value(phi, idx, builder_->Insert(PHINode::Create(ty, num_ops))); @@ -628,7 +646,7 @@ void generator::visit_getelementptr_inst(ir::getelementptr_inst* gep) { std::vector idx_vals; std::transform(gep->idx_begin(), gep->idx_end(), std::back_inserter(idx_vals), [&](ir::value* x){ return get_value(x, idx);}); - Type *source_ty = type(gep->get_source_elt_ty()->get_scalar_ty()); + Type *source_ty = type(gep->get_source_elt_ty()->get_scalar_ty(), *ctx_); Value *ret = builder_->Insert(GetElementPtrInst::CreateInBounds(source_ty, ptr, idx_vals)); set_value(gep, idx, ret); }); @@ -657,7 +675,7 @@ void generator::visit_fcmp_inst(ir::fcmp_inst* fcmp) { void generator::visit_cast_inst(ir::cast_inst* cast) { for_each(cast, [&](indices_t idx){ Value *arg = get_value(cast->get_operand(0), idx); - Type *dst_ty = type(cast->get_type()->get_scalar_ty()); + Type *dst_ty = type(cast->get_type()->get_scalar_ty(), *ctx_); Value *ret = builder_->Insert(CastInst::Create(llvm_op(cast->get_op()), arg, dst_ty)); set_value(cast, idx, ret); }); @@ -1102,7 +1120,7 @@ void generator::visit_dot_inst(ir::dot_inst* dot) { ir::value *D = dot->get_operand(2); distributed_tile *TD = (distributed_tile*)tmap_.at(D); - Type *c_ty = type(D->get_type()->get_scalar_ty()); + Type *c_ty = type(D->get_type()->get_scalar_ty(), *ctx_); 
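// The ir::type -> llvm::Type conversion used in this hunk is now the free
// function type(ir::type*, LLVMContext&) defined above, rather than a
// selection method, so machine layouts can perform the mapping without a
// selection instance. A minimal usage sketch, assuming some ir::value *v
// and an IRBuilder<> &b are in scope (illustrative, not part of the patch):
//
//   Type  *elt  = type(v->get_type()->get_scalar_ty(), b.getContext());
//   Value *zero = Constant::getNullValue(elt);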
Function *f_mul_add = Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {c_ty}); auto A_shapes = A->get_type()->get_tile_shapes(); size_t red_axis = 1; @@ -1228,60 +1246,25 @@ void generator::visit_make_range(ir::make_range* x) { }); } -Type *generator::type(ir::type *ty) { - // function - if(auto* tt = dynamic_cast(ty)){ - Type *return_ty = type(tt->get_return_ty()); - std::vector param_tys; - std::transform(tt->params_begin(), tt->params_end(), std::back_inserter(param_tys), - [this](ir::type* t){ return type(t);}); - return FunctionType::get(return_ty, param_tys, false); - } - // pointer - if(ty->is_pointer_ty()){ - Type *elt_ty = type(ty->get_pointer_element_ty()); - unsigned addr_space = ty->get_pointer_address_space(); - return PointerType::get(elt_ty, addr_space); - } - // integer - if(ty->is_integer_ty()){ - unsigned bitwidth = ty->get_integer_bitwidth(); - return IntegerType::get(*ctx_, bitwidth); - } - // primitive types - switch(ty->get_type_id()){ - case ir::type::VoidTyID: return Type::getVoidTy(*ctx_); - case ir::type::HalfTyID: return Type::getHalfTy(*ctx_); - case ir::type::FloatTyID: return Type::getFloatTy(*ctx_); - case ir::type::DoubleTyID: return Type::getDoubleTy(*ctx_); - case ir::type::X86_FP80TyID: return Type::getX86_FP80Ty(*ctx_); - case ir::type::PPC_FP128TyID: return Type::getPPC_FP128Ty(*ctx_); - case ir::type::LabelTyID: return Type::getLabelTy(*ctx_); - case ir::type::MetadataTyID: return Type::getMetadataTy(*ctx_); - case ir::type::TokenTyID: return Type::getTokenTy(*ctx_); - default: break; - } - // unknown type - throw std::runtime_error("unknown conversion from ir::type to Type"); -} + void generator::visit_undef_value(ir::undef_value *ud) { - vmap_[ud] = llvm::UndefValue::get(type(ud->get_type())); + vmap_[ud] = llvm::UndefValue::get(type(ud->get_type(), *ctx_)); } void generator::visit_constant_int(ir::constant_int *cst){ - Type *ty = type(cst->get_type()->get_scalar_ty()); + Type *ty = type(cst->get_type()->get_scalar_ty(), *ctx_); vmap_[cst] = ConstantInt::get(ty, cst->get_value()); } void generator::visit_constant_fp(ir::constant_fp *cst){ - Type *ty = type(cst->get_type()->get_scalar_ty()); + Type *ty = type(cst->get_type()->get_scalar_ty(), *ctx_); vmap_[cst] = ConstantFP::get(ty, cst->get_value()); } void generator::visit_alloc_const(ir::alloc_const *alloc) { unsigned size = ((ir::constant_int*)alloc->get_operand(0))->get_value(); - Type *element_ty = type(alloc->get_type()->get_pointer_element_ty()); + Type *element_ty = type(alloc->get_type()->get_pointer_element_ty(), *ctx_); Type *array_ty = llvm::ArrayType::get(element_ty, size); Value *array = new llvm::GlobalVariable(*mod_, array_ty, false, llvm::GlobalVariable::ExternalLinkage, nullptr, alloc->get_name(), nullptr, llvm::GlobalVariable::NotThreadLocal, 4); @@ -1291,7 +1274,7 @@ void generator::visit_alloc_const(ir::alloc_const *alloc) { void generator::visit_function(ir::function* fn) { LLVMContext &ctx = builder_->getContext(); - FunctionType *fn_ty = (FunctionType*)type(fn->get_fn_type()); + FunctionType *fn_ty = (FunctionType*)type(fn->get_fn_type(), *ctx_); FunctionType *dst_fn_ty = fn_ty; if(!tgt_->is_gpu()){ Type *dst_fn_ret_ty = fn_ty->getReturnType(); @@ -1331,16 +1314,86 @@ void generator::visit_function(ir::function* fn) { fn_ = ret; } +void generator::visit_layout_hmma_884(analysis::layout_hmma_884_t* layout) { + machine_layouts_[layout] = new machine_layout_hmma_884_t(mod_, builder_, tgt_, type(layout->ty->get_scalar_ty(), *ctx_), a_axes_, axes_, + offset_a_i_, offset_a_k_, 
offset_b_j_, offset_b_k_, + pack_size_0_, pack_size_1_, + num_packs_0_, num_packs_1_, + layout); +} + +void generator::visit_layout_scanline(analysis::layout_scanline_t* layout) { + machine_layouts_[layout] = new machine_layout_scanline_t(mod_, builder_, tgt_, type(layout->ty->get_scalar_ty(), *ctx_), a_axes_, axes_, layout); +} + +void generator::visit_layout_shared(analysis::layout_shared_t* layout) { + + machine_layouts_[layout] = new machine_layout_shared_t(); +} + +void generator::for_each(ir::value *x, const std::function& fn) { + if(!x->get_type()->is_tile_ty()) + return fn({}); + else { + if(auto *dt = dynamic_cast(tmap_.at(x))) + dt->for_each(fn); + } +} + +Value* generator::get_value(ir::value *x, const indices_t& idx) { + if(x->get_type()->is_tile_ty()) + return tmap_.at(x)->get_value(idx); + return vmap_.at(x); +} + +void generator::set_value(ir::value *x, const indices_t& idx, Value* v) { + if(x->get_type()->is_tile_ty()) + tmap_.at(x)->set_value(idx, v); + else + vmap_[x] = v; +} + + + +shared_tile* machine_layout_shared_t::create(ir::value *v) { + +} + +machine_layout_distributed_t::machine_layout_distributed_t(Module *mod, Builder *builder, target *tgt, Type *ty, + analysis::axes *a_axes, std::map& axes, + analysis::layout_t *layout) + : mod_(mod), builder_(builder), tgt_(tgt), ty_(ty), a_axes_(a_axes), axes_(axes), layout_(layout) { + +} + +distributed_tile* machine_layout_distributed_t::create(ir::value *v) { + Type *ty = type(v->get_type()->get_scalar_ty(), builder_->getContext()); + const auto &shapes = v->get_type()->get_tile_shapes(); + std::vector axes(shapes.size()); + for(size_t d = 0; d < shapes.size(); d++){ + if(shapes[d] > 1){ + unsigned x = a_axes_->get(v, d); + axes[d] = axes_.at(x); + } + else{ + axes[d].contiguous = 1; + axes[d].values = {builder_->getInt32(0)}; + } + } + return new distributed_tile(ty, shapes, layout_->order, axes, *builder_, false); +} + machine_layout_hmma_884_t::machine_layout_hmma_884_t(Module *mod, Builder *builder, - target *tgt, std::map& axes, + target *tgt, Type *ty, analysis::axes *a_axes, + std::map& axes, Value *&offset_a_i, Value *&offset_a_k, Value *&offset_b_j, Value *&offset_b_k, unsigned &pack_size_0, unsigned &pack_size_1, unsigned &num_packs_0, unsigned &num_packs_1, analysis::layout_hmma_884_t* layout) - : mod_(mod), builder_(builder), tgt_(tgt), axes_(axes), + : machine_layout_distributed_t(mod, builder, tgt, ty, a_axes, axes, layout), offset_a_i_(offset_a_i), offset_a_k_(offset_a_k), offset_b_j_(offset_b_j), offset_b_k_(offset_b_k), - pack_size_0_(pack_size_0), pack_size_1_(pack_size_1), num_packs_0_(num_packs_0), num_packs_1_(num_packs_1), - layout_(layout) { + pack_size_0_(pack_size_0), pack_size_1_(pack_size_1), num_packs_0_(num_packs_0), num_packs_1_(num_packs_1) { + Value *warp_size = builder_->getInt32(32); Value* u_thread_id_0 = tgt_->get_local_id(mod_, *builder_, 0); Value *u_thread_id = builder_->CreateURem(u_thread_id_0, warp_size); @@ -1454,10 +1507,11 @@ machine_layout_hmma_884_t::machine_layout_hmma_884_t(Module *mod, Builder *build machine_layout_scanline_t::machine_layout_scanline_t(Module *mod, Builder *builder, - target *tgt, std::map &axes, + target *tgt, Type *ty, + analysis::axes *a_axes, std::map &axes, analysis::layout_scanline_t* layout) - : mod_(mod), builder_(builder), tgt_(tgt), axes_(axes), layout_(layout) -{ + : machine_layout_distributed_t(mod, builder, tgt, ty, a_axes, axes, layout) { + Value *warp_size = builder_->getInt32(32); Value* u_thread_id_0 = tgt_->get_local_id(mod_, *builder_, 
0); Value *u_thread_id = builder_->CreateURem(u_thread_id_0, warp_size); @@ -1486,44 +1540,6 @@ machine_layout_scanline_t::machine_layout_scanline_t(Module *mod, Builder *build } } -void generator::visit_layout_hmma_884(analysis::layout_hmma_884_t* layout) { - machine_layouts_[layout] = new machine_layout_hmma_884_t(mod_, builder_, tgt_, axes_, offset_a_i_, offset_a_k_, offset_b_j_, offset_b_k_, - pack_size_0_, pack_size_1_, - num_packs_0_, num_packs_1_, - layout); -} - -void generator::visit_layout_scanline(analysis::layout_scanline_t* layout) { - machine_layouts_[layout] = new machine_layout_scanline_t(mod_, builder_, tgt_, axes_, layout); -} - -void generator::visit_layout_shared(analysis::layout_shared_t* layout) { - - machine_layouts_[layout] = new machine_layout_shared_t(); -} - -void generator::for_each(ir::value *x, const std::function& fn) { - if(!x->get_type()->is_tile_ty()) - return fn({}); - else { - if(auto *dt = dynamic_cast(tmap_.at(x))) - dt->for_each(fn); - } -} - -Value* generator::get_value(ir::value *x, const indices_t& idx) { - if(x->get_type()->is_tile_ty()) - return tmap_.at(x)->get_value(idx); - return vmap_.at(x); -} - -void generator::set_value(ir::value *x, const indices_t& idx, Value* v) { - if(x->get_type()->is_tile_ty()) - tmap_.at(x)->set_value(idx, v); - else - vmap_[x] = v; -} - } From 4bfe998cc8951411ea12d37361c91043b2acf10c Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 15 Oct 2019 21:53:23 -0400 Subject: [PATCH 444/494] [codegen] [selection] everything is now implemented with visitor --- include/triton/codegen/analysis/allocation.h | 6 +- include/triton/codegen/selection.h | 93 ++++++- lib/codegen/analysis/allocation.cc | 10 +- lib/codegen/selection.cc | 277 +++++++++---------- lib/codegen/transform/membar.cc | 6 +- lib/runtime/function.cc | 5 - 6 files changed, 232 insertions(+), 165 deletions(-) diff --git a/include/triton/codegen/analysis/allocation.h b/include/triton/codegen/analysis/allocation.h index 858152150..49f378886 100644 --- a/include/triton/codegen/analysis/allocation.h +++ b/include/triton/codegen/analysis/allocation.h @@ -27,14 +27,14 @@ public: allocation(liveness *live) : liveness_(live) { } // accessors - bool has_offset(ir::value *x) const { return offsets_.find(x) != offsets_.end(); } - unsigned offset(ir::value *x) const { return offsets_.at(x); } + bool has_offset(const layout_t *x) const { return offsets_.find(x) != offsets_.end(); } + unsigned offset(const layout_t *x) const { return offsets_.at(x); } unsigned allocated_size() const { return allocated_size_; } // run void run(ir::module& mod); private: - std::map offsets_; + std::map offsets_; size_t allocated_size_; // dependences liveness *liveness_; diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index 82466ab93..ec00cdd20 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -147,12 +147,32 @@ private: }; class machine_layout_t { +public: virtual tile* create(ir::value *v) = 0; }; class machine_layout_shared_t: public machine_layout_t { public: - shared_tile* create(ir::value *v); + machine_layout_shared_t(Module *mod, Builder *builder, target *tgt, analysis::allocation* alloc, Value *&sh_mem_ptr, analysis::layout_t* layout, + std::map& vmap, + std::map& tmap); + + tile* create(ir::value *v); + + Module *mod_; + Builder *builder_; + target *tgt_; + analysis::allocation* alloc_; + Value *&sh_mem_ptr_; + analysis::layout_t* layout_; + std::map& vmap_; + std::map& tmap_; + + Value *offset_; + 
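// Hedged sketch of the double-buffering scheme that the members around
// this note implement: ptr_ is a PHI alternating between the two halves of
// the shared allocation, pre_ptr_ seeds it with the first half, next_ptr_
// (= ptr_ + offset_) selects the other half on the back-edge, and the
// finalizer flips the sign of offset_ every iteration. In plain C++, with
// illustrative names and half = layout->size / (2 * bytes_per_element):
//
//   float *cur = base;             // ptr_ (PHI)
//   int    off = half;             // offset_ (PHI)
//   for(int i = 0; i < iters; i++){
//     /* compute on cur, prefetch into cur + off */
//     cur += off;                  // next_ptr_ feeds the PHI
//     off  = -off;                 // CreateNeg on the back-edge
//   }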
Value *ptr_; + Value *pre_ptr_; + Value *next_ptr_; + }; class machine_layout_distributed_t: public machine_layout_t { @@ -161,7 +181,7 @@ public: analysis::axes *a_axes, std::map& axes, analysis::layout_t* layout); - distributed_tile* create(ir::value *v); + tile* create(ir::value *v); Module *mod_; Builder *builder_; target *tgt_; @@ -309,7 +329,72 @@ private: unsigned num_warps_; }; +class finalizer: public ir::visitor, public analysis::layout_visitor { +private: + void for_each(ir::value *x, const std::function& fn); + Value* get_value(ir::value *x, const indices_t& idx); + void set_value(ir::value *x, const indices_t& idx, Value* v); +public: + finalizer(Builder *builder, std::map& vmap, std::map& tmap); + + void visit_phi_node(ir::phi_node*); + void visit_binary_operator(ir::binary_operator*) { } + void visit_getelementptr_inst(ir::getelementptr_inst*) { } + + void visit_icmp_inst(ir::icmp_inst*) { } + void visit_fcmp_inst(ir::fcmp_inst*) { } + void visit_cast_inst(ir::cast_inst*) { } + + void visit_return_inst(ir::return_inst*) { } + void visit_cond_branch_inst(ir::cond_branch_inst*) { } + void visit_uncond_branch_inst(ir::uncond_branch_inst*) { } + + + void visit_unmasked_load_inst(ir::unmasked_load_inst*) { } + void visit_masked_load_inst(ir::masked_load_inst*) { } + void visit_unmasked_store_inst(ir::unmasked_store_inst*) { } + void visit_masked_store_inst(ir::masked_store_inst*) { } + + void visit_reshape_inst(ir::reshape_inst*) { } + void visit_splat_inst(ir::splat_inst*) { } + void visit_broadcast_inst(ir::broadcast_inst*) { } + void visit_downcast_inst(ir::downcast_inst*) { } + + void visit_get_program_id_inst(ir::get_program_id_inst*) { } + void visit_get_num_program_inst(ir::get_num_program_inst*) { } + void visit_atomic_cas_inst(ir::atomic_cas_inst*) { } + void visit_atomic_exch_inst(ir::atomic_exch_inst*) { } + void visit_atomic_add_inst(ir::atomic_add_inst*) { } + void visit_dot_inst(ir::dot_inst*) { } + void visit_trans_inst(ir::trans_inst*) { } + void visit_sqrt_inst(ir::sqrt_inst*) { } + void visit_reduce_inst(ir::reduce_inst*) { } + void visit_select_inst(ir::select_inst*) { } + + void visit_copy_to_shared_inst(ir::copy_to_shared_inst*) { } + void visit_copy_from_shared_inst(ir::copy_from_shared_inst*) { } + void visit_barrier_inst(ir::barrier_inst*) { } + void visit_make_range_dyn(ir::make_range_dyn*) { } + void visit_make_range(ir::make_range*) { } + + void visit_make_range_sta(ir::make_range_sta*) { } + void visit_undef_value(ir::undef_value*) { } + void visit_constant_int(ir::constant_int*) { } + void visit_constant_fp(ir::constant_fp*) { } + void visit_alloc_const(ir::alloc_const*) { } + + void visit_function(ir::function*) { } + + void visit_layout_hmma_884(analysis::layout_hmma_884_t*) { } + void visit_layout_scanline(analysis::layout_scanline_t*) { } + void visit_layout_shared(analysis::layout_shared_t*); + +private: + Builder *builder_; + std::map& vmap_; + std::map& tmap_; +}; // Selection pass class selection{ @@ -318,12 +403,8 @@ class selection{ private: // LLVM conversions - Type* llvm_type(ir::type *ty, LLVMContext &ctx); Value* alloc_shared(Builder &builder, Module& dst); - // grid construction - void create_shared_tile(ir::value *v, Builder &builder, Value *sh_mem_ptr); - // lower scalar instruction void lower_value(ir::value *src, Builder &builder, generator* gen, std::set& seen); diff --git a/lib/codegen/analysis/allocation.cc b/lib/codegen/analysis/allocation.cc index 2474acded..3ea0a758d 100644 --- a/lib/codegen/analysis/allocation.cc +++ 
b/lib/codegen/analysis/allocation.cc @@ -99,16 +99,10 @@ void allocation::run(ir::module &mod) { unsigned Adj = 0; for(layout_t* y: interferences[x]) Adj = std::max(Adj, starts[y] + y->size); - // create offsets - for(ir::value *v: x->values){ - offsets_[v] = starts[x] + colors[x] * Adj; - } - if(x->double_buffer){ - auto info = *x->double_buffer; - offsets_[info.latch] = offsets_[info.first] + x->size / 2; - } + offsets_[x] = starts[x] + colors[x] * Adj; } + // Save maximum size of induced memory space allocated_size_ = 0; for(layout_t* x: V) diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index be97ac2e4..96a4632f5 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -305,43 +305,6 @@ llvm::CmpInst::Predicate llvm_pred(ir::cmp_pred_t pred) { throw std::runtime_error("unknown operator"); } -/* convert ir::type to Type */ -Type *selection::llvm_type(ir::type *ty, LLVMContext &ctx) { - // function - if(auto* tt = dynamic_cast(ty)){ - Type *return_ty = llvm_type(tt->get_return_ty(), ctx); - std::vector param_tys; - std::transform(tt->params_begin(), tt->params_end(), std::back_inserter(param_tys), - [this,&ctx](ir::type* t){ return llvm_type(t, ctx);}); - return FunctionType::get(return_ty, param_tys, false); - } - // pointer - if(ty->is_pointer_ty()){ - Type *elt_ty = llvm_type(ty->get_pointer_element_ty(), ctx); - unsigned addr_space = ty->get_pointer_address_space(); - return PointerType::get(elt_ty, addr_space); - } - // integer - if(ty->is_integer_ty()){ - unsigned bitwidth = ty->get_integer_bitwidth(); - return IntegerType::get(ctx, bitwidth); - } - // primitive types - switch(ty->get_type_id()){ - case ir::type::VoidTyID: return Type::getVoidTy(ctx); - case ir::type::HalfTyID: return Type::getHalfTy(ctx); - case ir::type::FloatTyID: return Type::getFloatTy(ctx); - case ir::type::DoubleTyID: return Type::getDoubleTy(ctx); - case ir::type::X86_FP80TyID: return Type::getX86_FP80Ty(ctx); - case ir::type::PPC_FP128TyID: return Type::getPPC_FP128Ty(ctx); - case ir::type::LabelTyID: return Type::getLabelTy(ctx); - case ir::type::MetadataTyID: return Type::getMetadataTy(ctx); - case ir::type::TokenTyID: return Type::getTokenTy(ctx); - default: break; - } - // unknown type - throw std::runtime_error("unknown conversion from ir::type to Type"); -} Type *type(ir::type *ty, LLVMContext &ctx) { // function @@ -407,45 +370,6 @@ inline int32_t ceil(int32_t num, int32_t div){ * ---- Init Tiles ---- * ------------------- */ -void selection::create_shared_tile(ir::value *v, IRBuilder<> &builder, Value *sh_mem_ptr) { - if(tmap_.find(v) != tmap_.end()) - return; - analysis::layout_shared_t *layout = (analysis::layout_shared_t*)layouts_->get(v); - auto order = layout->order; - auto shapes = layout->shapes; - shapes[order[0]] += layout->pad; - - Type* ty = llvm_type(v->get_type()->get_scalar_ty(), builder.getContext()); - // shared copy - PointerType *ptr_ty = ty->getPointerTo(sh_mem_ptr->getType()->getPointerAddressSpace()); - // double-buffered - if(layout->double_buffer) { - auto info = *layout->double_buffer; - ir::phi_node *phi = info.phi; - BasicBlock *parent = (BasicBlock*)vmap_[phi->get_parent()]; - if(parent->empty()) - builder.SetInsertPoint(parent); - else - builder.SetInsertPoint(&*parent->getFirstNonPHI()); - // create double-buffered pointer - PHINode *ptr = builder.CreatePHI(ptr_ty, 2); - PHINode *offset = builder.CreatePHI(builder.getInt32Ty(), 2); - // next pointer - Value *pre_ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(alloc_->offset(v))); - 
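// Note on the allocation.cc hunk above: offsets are now recorded once per
// layout group instead of once per ir::value, so both halves of a double
// buffer are derived from the group's base offset rather than stored
// separately. Hedged sketch of the resulting lookup, with illustrative
// names (the same pattern appears in the membar.cc and atomic_cas changes
// later in this patch):
//
//   unsigned base = alloc->offset(layouts->get(v));   // keyed by layout_t*
//   Value   *ptr  = builder.CreateGEP(sh_mem_ptr, builder.getInt32(base));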
pre_ptr = builder.CreateBitCast(pre_ptr, ptr->getType()); - Value *next_ptr = builder.CreateGEP(ptr, offset, "next_ptr"); - tmap_.insert({phi, new shared_tile(ty, shapes, order, ptr, builder, offset)}); - tmap_.insert({v, new shared_tile(ty, shapes, order, pre_ptr, builder)}); - tmap_.insert({info.latch, new shared_tile(ty, shapes, order, next_ptr, builder)}); - } - else { - size_t offset = alloc_->offset(v); - Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset)); - ptr = builder.CreateBitCast(ptr, ptr_ty); - tmap_.insert({v, new shared_tile(ty, shapes, order, ptr, builder)}); - } -} - bool is_trans(ir::value *v) { if(dynamic_cast(v)) { @@ -465,18 +389,11 @@ void selection::lower_value(ir::value *src, IRBuilder<> &builder, generator* gen if(!seen.insert(src).second) return; + if(src->get_type()->is_tile_ty()) + tmap_[src] = gen->get_machine_layout(layouts_->get(src))->create(src); + + BasicBlock *current = builder.GetInsertBlock(); - if(src->get_type()->is_tile_ty()){ - builder.SetInsertPoint(&*builder.GetInsertBlock()->getParent()->begin()); - auto *i = dynamic_cast(src); - if(i && layouts_->get(i)->type == analysis::SHARED) - create_shared_tile(i, builder, sh_mem_ptr_); - else - tmap_[src] = ((machine_layout_distributed_t*)gen->get_machine_layout(layouts_->get(src)))->create(src); - } - builder.SetInsertPoint(current); - - auto *inst = dynamic_cast(src); if(inst && !dynamic_cast(src)) for(ir::value *op: inst->ops()) @@ -541,6 +458,7 @@ void selection::run(ir::module &src, Module &dst) { // create tile generator gen(&dst_ctx, &dst, &dst_builder, a_axes_, axes_, vmap_, tmap_, tgt_, layouts_, alignment_, alloc_, sh_mem_ptr_, offset_a_i_, offset_a_k_, offset_b_j_, offset_b_k_, num_packs_0_, num_packs_1_, pack_size_0_, pack_size_1_, num_warps_ ); + finalizer fin(&dst_builder, vmap_, tmap_); for(ir::alloc_const *x: src.allocs()) x->accept(&gen); @@ -554,69 +472,25 @@ void selection::run(ir::module &src, Module &dst) { x.second->accept(&gen); // generate LLVM-IR code - std::map last_block; for(ir::basic_block *block: fn->blocks()) { BasicBlock *parent = (BasicBlock*)vmap_[block]; dst_builder.SetInsertPoint(parent); for(ir::instruction *i: block->get_inst_list()) lower_value(i, dst_builder, &gen, seen); - last_block[block] = dst_builder.GetInsertBlock(); + vmap_[block] = dst_builder.GetInsertBlock(); } // finalize double-buffering - for(const auto& x: layouts_->get_all()) { - if(x.second->double_buffer) { - auto info = *x.second->double_buffer; - ir::phi_node *phi = info.phi; - PHINode *ptr = (PHINode*)((shared_tile*)tmap_.at(phi))->get_pointer(); - PHINode *offset = (PHINode*)((shared_tile*)tmap_.at(phi))->get_offset(); - for(unsigned n = 0; n < phi->get_num_incoming(); n++){ - ir::basic_block* inc_block = phi->get_incoming_block(n); - ir::value* inc_val = phi->get_incoming_value(n); - BasicBlock *llvm_inc_block = last_block.at(inc_block); - shared_tile *inc_shared = (shared_tile*)tmap_.at(inc_val); - if(inc_val == info.latch){ - dst_builder.SetInsertPoint(llvm_inc_block->getTerminator()); - Value *next_offset = dst_builder.CreateNeg(offset); - offset->addIncoming(next_offset, llvm_inc_block); - } - else { - unsigned num_bytes = x.second->ty->get_primitive_size_in_bits() / 8; - offset->addIncoming(dst_builder.getInt32(x.second->size / (2*num_bytes)), llvm_inc_block); - } - ptr->addIncoming(inc_shared->get_pointer(), llvm_inc_block); - } - } - } + for(const auto& x: layouts_->get_all()) + x.second->accept(&fin); // finalize phi for(ir::basic_block *block: fn->blocks()) 
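// Phi finalization is deferred until every block has been lowered: the
// generator creates empty PHINodes up front, and the finalizer then adds
// one incoming (value, block) pair per predecessor, resolving each source
// block to the last LLVM block emitted for it. Hedged sketch of
// finalizer::visit_phi_node (defined further down in this patch):
//
//   for(unsigned n = 0; n < phi->get_num_incoming(); n++)
//     llvm_phi->addIncoming(get_value(phi->get_incoming_value(n), idx),
//                           vmap_.at(phi->get_incoming_block(n)));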
for(ir::instruction *inst: block->get_inst_list()) - if(auto *phi = dynamic_cast(inst)){ - if(tmap_.find(phi) == tmap_.end() || - !dynamic_cast(tmap_.at(phi))) { - for(unsigned n = 0; n < phi->get_num_incoming(); n++){ - ir::value *inc_val = phi->get_incoming_value(n); - ir::basic_block *inc_block = phi->get_incoming_block(n); - BasicBlock *llvm_inc_block = last_block.at(inc_block); - if(phi->get_type()->is_tile_ty()) { - distributed_tile *phi_tile = (distributed_tile*)tmap_.at(phi); - distributed_tile *inc_tile = (distributed_tile*)tmap_.at(inc_val); - phi_tile->for_each([&](indices_t idx){ - PHINode *llvm_phi = (PHINode*)phi_tile->get_value(idx); - Value *llvm_inc_val = inc_tile->get_value(idx); - llvm_phi->addIncoming(llvm_inc_val, llvm_inc_block); - }); - } - else { - PHINode *llvm_phi = (PHINode*)vmap_.at(phi); - Value *llvm_inc_val = vmap_.at(inc_val); - llvm_phi->addIncoming(llvm_inc_val, llvm_inc_block); - } - } - } - } + inst->accept(&fin); + } + } @@ -895,7 +769,7 @@ void generator::visit_atomic_cas_inst(ir::atomic_cas_inst* cas) { Value *pred = builder_->CreateICmpEQ(tid, builder_->getInt32(0)); BasicBlock *tid_0_bb = BasicBlock::Create(*ctx_, "tid_0", current->getParent()); BasicBlock *tid_0_done_bb = BasicBlock::Create(*ctx_, "tid_0_done", current->getParent()); - Value *ptr = builder_->CreateGEP(sh_mem_ptr_, builder_->getInt32(alloc_->offset(cas))); + Value *ptr = builder_->CreateGEP(sh_mem_ptr_, builder_->getInt32(alloc_->offset(layouts_->get(cas)))); ptr = builder_->CreateBitCast(ptr, PointerType::get(builder_->getInt32Ty(), ptr->getType()->getPointerAddressSpace())); tgt_->add_memfence(module, *builder_); tgt_->add_barrier(module, *builder_); @@ -1328,7 +1202,7 @@ void generator::visit_layout_scanline(analysis::layout_scanline_t* layout) { void generator::visit_layout_shared(analysis::layout_shared_t* layout) { - machine_layouts_[layout] = new machine_layout_shared_t(); + machine_layouts_[layout] = new machine_layout_shared_t(mod_, builder_, tgt_, alloc_, sh_mem_ptr_, layout, vmap_, tmap_); } void generator::for_each(ir::value *x, const std::function& fn) { @@ -1355,8 +1229,61 @@ void generator::set_value(ir::value *x, const indices_t& idx, Value* v) { -shared_tile* machine_layout_shared_t::create(ir::value *v) { +machine_layout_shared_t::machine_layout_shared_t(Module *mod, Builder *builder, target *tgt, analysis::allocation* alloc, + Value *&sh_mem_ptr, analysis::layout_t *layout, + std::map& vmap, + std::map& tmap) + : mod_(mod), builder_(builder), tgt_(tgt), alloc_(alloc), sh_mem_ptr_(sh_mem_ptr), layout_(layout), vmap_(vmap), tmap_(tmap) { + auto order = layout_->order; + auto shapes = layout_->shapes; + shapes[order[0]] += layout_->pad; + + Type* ty = type(layout_->ty, builder_->getContext()); + + PointerType *ptr_ty = ty->getPointerTo(sh_mem_ptr_->getType()->getPointerAddressSpace()); + // double-buffered + if(layout_->double_buffer) { + BasicBlock *current = builder_->GetInsertBlock(); + auto info = *layout_->double_buffer; + ir::phi_node *phi = info.phi; + BasicBlock *parent = (BasicBlock*)vmap_.at(phi->get_parent()); + if(parent->empty()) + builder_->SetInsertPoint(parent); + else + builder_->SetInsertPoint(&*parent->getFirstNonPHI()); + // create pointers + ptr_ = builder_->CreatePHI(ptr_ty, 2); + pre_ptr_ = builder_->CreateGEP(sh_mem_ptr_, builder_->getInt32(alloc_->offset(layout_))); + pre_ptr_ = builder_->CreateBitCast(pre_ptr_, ptr_->getType()); + offset_ = builder_->CreatePHI(builder_->getInt32Ty(), 2); + next_ptr_ = builder_->CreateGEP(ptr_, offset_, 
"next_ptr"); + builder_->SetInsertPoint(current); + } + else{ + size_t offset = alloc_->offset(layout_); + ptr_ = builder_->CreateGEP(sh_mem_ptr_, builder_->getInt32(offset)); + ptr_ = builder_->CreateBitCast(ptr_, ptr_ty); + } +} + + +tile* machine_layout_shared_t::create(ir::value *v) { + auto order = layout_->order; + auto shapes = layout_->shapes; + shapes[order[0]] += layout_->pad; + Type* ty = type(layout_->ty, builder_->getContext()); + // double-buffered + if(layout_->double_buffer) { + if(v == layout_->double_buffer->phi) + return new shared_tile(ty, shapes, order, ptr_, *builder_, offset_); + if(v == layout_->double_buffer->latch) + return new shared_tile(ty, shapes, order, next_ptr_, *builder_); + return new shared_tile(ty, shapes, order, pre_ptr_, *builder_); + } + else { + return new shared_tile(ty, shapes, order, ptr_, *builder_); + } } machine_layout_distributed_t::machine_layout_distributed_t(Module *mod, Builder *builder, target *tgt, Type *ty, @@ -1366,7 +1293,7 @@ machine_layout_distributed_t::machine_layout_distributed_t(Module *mod, Builder } -distributed_tile* machine_layout_distributed_t::create(ir::value *v) { +tile *machine_layout_distributed_t::create(ir::value *v) { Type *ty = type(v->get_type()->get_scalar_ty(), builder_->getContext()); const auto &shapes = v->get_type()->get_tile_shapes(); std::vector axes(shapes.size()); @@ -1540,6 +1467,74 @@ machine_layout_scanline_t::machine_layout_scanline_t(Module *mod, Builder *build } } +finalizer::finalizer(Builder *builder, std::map& vmap, std::map& tmap) + : builder_(builder), vmap_(vmap), tmap_(tmap) { + +} + +void finalizer::for_each(ir::value *x, const std::function& fn) { + if(!x->get_type()->is_tile_ty()) + return fn({}); + else { + if(auto *dt = dynamic_cast(tmap_.at(x))) + dt->for_each(fn); + } +} + +Value* finalizer::get_value(ir::value *x, const indices_t& idx) { + if(x->get_type()->is_tile_ty()) + return tmap_.at(x)->get_value(idx); + return vmap_.at(x); +} + +void finalizer::set_value(ir::value *x, const indices_t& idx, Value* v) { + if(x->get_type()->is_tile_ty()) + tmap_.at(x)->set_value(idx, v); + else + vmap_[x] = v; +} + +void finalizer::visit_phi_node(ir::phi_node* phi) { + auto it = tmap_.find(phi); + if(it != tmap_.end() && dynamic_cast(it->second)) + return; + for(unsigned n = 0; n < phi->get_num_incoming(); n++){ + ir::basic_block *inc_block = phi->get_incoming_block(n); + BasicBlock *llvm_inc_block = (BasicBlock*)vmap_.at(inc_block); + for_each(phi, [&](indices_t idx){ + PHINode *llvm_phi = (PHINode*)get_value(phi, idx); + Value *llvm_inc_val = get_value(phi->get_incoming_value(n), idx); + llvm_phi->addIncoming(llvm_inc_val, llvm_inc_block); + }); + } +} + + +void finalizer::visit_layout_shared(analysis::layout_shared_t* layout) { + if(layout->double_buffer) { + auto info = *layout->double_buffer; + ir::phi_node *phi = info.phi; + PHINode *ptr = (PHINode*)((shared_tile*)tmap_.at(phi))->get_pointer(); + PHINode *offset = (PHINode*)((shared_tile*)tmap_.at(phi))->get_offset(); + for(unsigned n = 0; n < phi->get_num_incoming(); n++){ + ir::basic_block* inc_block = phi->get_incoming_block(n); + ir::value* inc_val = phi->get_incoming_value(n); + BasicBlock *llvm_inc_block = (BasicBlock*)vmap_.at(inc_block); + shared_tile *inc_shared = (shared_tile*)tmap_.at(inc_val); + if(inc_val == info.latch){ + builder_->SetInsertPoint(llvm_inc_block->getTerminator()); + Value *next_offset = builder_->CreateNeg(offset); + offset->addIncoming(next_offset, llvm_inc_block); + } + else { + unsigned num_bytes = 
layout->ty->get_primitive_size_in_bits() / 8; + offset->addIncoming(builder_->getInt32(layout->size / (2*num_bytes)), llvm_inc_block); + } + ptr->addIncoming(inc_shared->get_pointer(), llvm_inc_block); + } + } +} + } diff --git a/lib/codegen/transform/membar.cc b/lib/codegen/transform/membar.cc index 9a8ad7fd2..8c2f3d909 100644 --- a/lib/codegen/transform/membar.cc +++ b/lib/codegen/transform/membar.cc @@ -37,8 +37,10 @@ void membar::add_reference(ir::value *v, interval_vec_t &res){ auto *i = dynamic_cast(v); if(!i) return; - if(alloc_->has_offset(v)){ - unsigned offset = alloc_->offset(v); + if(!i->get_type()->is_tile_ty()) + return; + if(alloc_->has_offset(layouts_->get(v))){ + unsigned offset = alloc_->offset(layouts_->get(v)); unsigned size = layouts_->get(v)->size; res.push_back(interval_t(offset, offset + size)); } diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 37b14145f..b83ea8442 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -242,11 +242,6 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c if(allocation.allocated_size() > context->device()->max_shared_memory()) return std::unique_ptr(); barriers.run(module); - dce.run(module); - align.run(module); - axes.run(module); - layouts.run(module); - liveness.run(module); // ir::print(module, std::cout); selection.run(module, *llvm); // return binary From ae246218252a893bf62bc2c3e37f9ee6b9527a27 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 17 Oct 2019 00:36:46 -0400 Subject: [PATCH 445/494] more cleaning --- include/triton/codegen/analysis/layout.h | 2 + include/triton/codegen/selection.h | 118 ++----- include/triton/ir/basic_block.h | 4 + include/triton/ir/function.h | 2 + include/triton/ir/value.h | 4 +- include/triton/ir/visitor.h | 10 + lib/codegen/analysis/layout.cc | 5 + lib/codegen/selection.cc | 372 +++++++++-------------- lib/ir/function.cc | 4 + lib/ir/value.cc | 6 + 10 files changed, 205 insertions(+), 322 deletions(-) diff --git a/include/triton/codegen/analysis/layout.h b/include/triton/codegen/analysis/layout.h index 923e13411..70260542a 100644 --- a/include/triton/codegen/analysis/layout.h +++ b/include/triton/codegen/analysis/layout.h @@ -36,6 +36,7 @@ struct double_buffer_info_t { }; class layout_visitor; +class layout_t; class layout_hmma_884_t; class layout_scanline_t; class layout_shared_t; @@ -43,6 +44,7 @@ class layout_shared_t; class layout_visitor { public: + virtual void visit_layout(layout_t *); virtual void visit_layout_hmma_884(layout_hmma_884_t*) = 0; virtual void visit_layout_scanline(layout_scanline_t*) = 0; virtual void visit_layout_shared(layout_shared_t*) = 0; diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index ec00cdd20..dfdf48ca1 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -197,16 +197,13 @@ public: machine_layout_hmma_884_t(Module *mod, Builder *builder, target *tgt, Type *ty, analysis::axes *a_axes, std::map& axes, - Value *&offset_a_i, Value *&offset_a_k, Value *&offset_b_j, Value *&offset_b_k, - unsigned &pack_size_0, unsigned &pack_size_1, - unsigned &num_packs_0, unsigned &num_packs_1, analysis::layout_hmma_884_t* layout); - Value *&offset_a_i_, *&offset_a_k_; - Value *&offset_b_j_, *&offset_b_k_; - unsigned &pack_size_0_; - unsigned& pack_size_1_; - unsigned &num_packs_0_; - unsigned& num_packs_1_; + Value *offset_a_i_, *offset_a_k_; + Value *offset_b_j_, *offset_b_k_; + unsigned pack_size_0_; + unsigned pack_size_1_; + unsigned 
num_packs_0_; + unsigned num_packs_1_; }; class machine_layout_scanline_t: public machine_layout_distributed_t { @@ -219,15 +216,18 @@ public: class generator: public ir::visitor, public analysis::layout_visitor { private: - void visit_hmma_dot(ir::dot_inst*, distributed_tile *TC, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK); - void visit_scanline_dot(ir::dot_inst*, distributed_tile *TC, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK, Type *c_ty, Function *f_mul_add); - void visit_outer_dot(ir::dot_inst*, distributed_tile *TC, distributed_tile *TA, distributed_tile *TB, distributed_tile *TD, unsigned NK, - Type *c_ty, Function *f_mul_add); - void for_each(ir::value *x, const std::function& fn); Value* get_value(ir::value *x, const indices_t& idx); void set_value(ir::value *x, const indices_t& idx, Value* v); + void visit_hmma_dot(ir::dot_inst*, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK); + void visit_scanline_dot(ir::dot_inst*, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK, Type *c_ty, Function *f_mul_add); + void visit_outer_dot(ir::dot_inst*, distributed_tile *TA, distributed_tile *TB, distributed_tile *TD, unsigned NK, + Type *c_ty, Function *f_mul_add); + + void finalize_function(ir::function*); + void finalize_phi_node(ir::phi_node*); + public: generator(LLVMContext *ctx, Module *dst, @@ -241,18 +241,12 @@ public: analysis::align *alignment, analysis::allocation *alloc, Value *sh_mem_ptr, - Value *offset_a_i, Value *offset_a_k, - Value *offset_b_j, Value *offset_b_k, - unsigned num_packs_0, unsigned num_packs_1, - unsigned pack_size_0, unsigned pack_size_1, unsigned num_warps) : ctx_(ctx), mod_(dst), builder_(builder), a_axes_(a_axes), axes_(axes), vmap_(vmap), tmap_(tmap), tgt_(tgt), layouts_(layouts), alignment_(alignment), alloc_(alloc), sh_mem_ptr_(sh_mem_ptr), - offset_a_i_(offset_a_i), offset_a_k_(offset_a_k), offset_b_j_(offset_b_j), offset_b_k_(offset_b_k), - num_packs_0_(num_packs_0), num_packs_1_(num_packs_1), pack_size_0_(pack_size_0), pack_size_1_(pack_size_1), num_warps_(num_warps) { } - machine_layout_t *get_machine_layout(const analysis::layout_t *layout) { return machine_layouts_.at(layout); } + void visit_value(ir::value* v); void visit_phi_node(ir::phi_node*); void visit_binary_operator(ir::binary_operator*); @@ -301,6 +295,8 @@ public: void visit_alloc_const(ir::alloc_const*); void visit_function(ir::function*); + void visit_basic_block(ir::basic_block*); + void visit_argument(ir::argument*); void visit_layout_hmma_884(analysis::layout_hmma_884_t*); void visit_layout_scanline(analysis::layout_scanline_t*); @@ -308,7 +304,6 @@ public: private: LLVMContext *ctx_; - Function *fn_; Builder *builder_; Module *mod_; @@ -322,78 +317,9 @@ private: analysis::align *alignment_; analysis::allocation *alloc_; Value *sh_mem_ptr_; - Value *offset_a_i_, *offset_a_k_; - Value *offset_b_j_, *offset_b_k_; - unsigned num_packs_0_, num_packs_1_; - unsigned pack_size_0_, pack_size_1_; unsigned num_warps_; -}; -class finalizer: public ir::visitor, public analysis::layout_visitor { -private: - void for_each(ir::value *x, const std::function& fn); - Value* get_value(ir::value *x, const indices_t& idx); - void set_value(ir::value *x, const indices_t& idx, Value* v); - -public: - finalizer(Builder *builder, std::map& vmap, std::map& tmap); - - void visit_phi_node(ir::phi_node*); - void visit_binary_operator(ir::binary_operator*) { } - void visit_getelementptr_inst(ir::getelementptr_inst*) { } - - 
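// Patch 445 deletes the standalone finalizer whose stubs surround this
// note: generator becomes the single visitor, walks values itself through
// visit_value(), and runs finalize_function()/finalize_phi_node() at the
// end of visit_function(). The refactor leans on the accept/visit double
// dispatch now extended to blocks and arguments, e.g. in basic_block.h:
//
//   void accept(visitor *v) { v->visit_basic_block(this); }
//
// so one visitor interface can traverse functions, blocks, values and
// layouts uniformly.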
void visit_icmp_inst(ir::icmp_inst*) { } - void visit_fcmp_inst(ir::fcmp_inst*) { } - void visit_cast_inst(ir::cast_inst*) { } - - void visit_return_inst(ir::return_inst*) { } - void visit_cond_branch_inst(ir::cond_branch_inst*) { } - void visit_uncond_branch_inst(ir::uncond_branch_inst*) { } - - - void visit_unmasked_load_inst(ir::unmasked_load_inst*) { } - void visit_masked_load_inst(ir::masked_load_inst*) { } - void visit_unmasked_store_inst(ir::unmasked_store_inst*) { } - void visit_masked_store_inst(ir::masked_store_inst*) { } - - void visit_reshape_inst(ir::reshape_inst*) { } - void visit_splat_inst(ir::splat_inst*) { } - void visit_broadcast_inst(ir::broadcast_inst*) { } - void visit_downcast_inst(ir::downcast_inst*) { } - - void visit_get_program_id_inst(ir::get_program_id_inst*) { } - void visit_get_num_program_inst(ir::get_num_program_inst*) { } - void visit_atomic_cas_inst(ir::atomic_cas_inst*) { } - void visit_atomic_exch_inst(ir::atomic_exch_inst*) { } - void visit_atomic_add_inst(ir::atomic_add_inst*) { } - void visit_dot_inst(ir::dot_inst*) { } - void visit_trans_inst(ir::trans_inst*) { } - void visit_sqrt_inst(ir::sqrt_inst*) { } - void visit_reduce_inst(ir::reduce_inst*) { } - void visit_select_inst(ir::select_inst*) { } - - void visit_copy_to_shared_inst(ir::copy_to_shared_inst*) { } - void visit_copy_from_shared_inst(ir::copy_from_shared_inst*) { } - void visit_barrier_inst(ir::barrier_inst*) { } - void visit_make_range_dyn(ir::make_range_dyn*) { } - void visit_make_range(ir::make_range*) { } - - void visit_make_range_sta(ir::make_range_sta*) { } - void visit_undef_value(ir::undef_value*) { } - void visit_constant_int(ir::constant_int*) { } - void visit_constant_fp(ir::constant_fp*) { } - void visit_alloc_const(ir::alloc_const*) { } - - void visit_function(ir::function*) { } - - void visit_layout_hmma_884(analysis::layout_hmma_884_t*) { } - void visit_layout_scanline(analysis::layout_scanline_t*) { } - void visit_layout_shared(analysis::layout_shared_t*); - -private: - Builder *builder_; - std::map& vmap_; - std::map& tmap_; + std::set seen_; }; // Selection pass @@ -405,9 +331,6 @@ private: // LLVM conversions Value* alloc_shared(Builder &builder, Module& dst); - // lower scalar instruction - void lower_value(ir::value *src, Builder &builder, generator* gen, std::set& seen); - public: selection(analysis::liveness* liveness, analysis::allocation *alloc, analysis::align *alignment, analysis::axes *axes, @@ -428,11 +351,6 @@ private: analysis::align *alignment_; target *tgt_; std::map axes_; - Value *sh_mem_ptr_; - Value *offset_a_i_, *offset_a_k_; - Value *offset_b_j_, *offset_b_k_; - unsigned num_packs_0_, num_packs_1_; - unsigned pack_size_0_, pack_size_1_; unsigned num_warps_; }; diff --git a/include/triton/ir/basic_block.h b/include/triton/ir/basic_block.h index 4a60586f0..3d274815a 100644 --- a/include/triton/ir/basic_block.h +++ b/include/triton/ir/basic_block.h @@ -6,6 +6,7 @@ #include #include #include "value.h" +#include "visitor.h" namespace triton{ namespace ir{ @@ -66,6 +67,9 @@ public: // factory functions static basic_block* create(context &ctx, const std::string &name, function *parent); + // visitor + void accept(visitor *v) { v->visit_basic_block(this); } + private: context &ctx_; std::string name_; diff --git a/include/triton/ir/function.h b/include/triton/ir/function.h index 8cf11275a..d3ebe199b 100644 --- a/include/triton/ir/function.h +++ b/include/triton/ir/function.h @@ -26,6 +26,8 @@ public: function* get_parent() const; unsigned get_arg_no() 
const; + void accept(visitor *v); + private: function *parent_; unsigned arg_no_; diff --git a/include/triton/ir/value.h b/include/triton/ir/value.h index bf4a4aa9c..e192a54ef 100644 --- a/include/triton/ir/value.h +++ b/include/triton/ir/value.h @@ -33,6 +33,8 @@ public: void set_name(const std::string &name); const std::string &get_name() const { return name_; } type* get_type() const { return ty_; } + // visitor + virtual void accept(visitor *v) = 0; private: std::string name_; @@ -75,8 +77,6 @@ public: void replace_all_uses_with(value *target); void replace_uses_of_with(value *before, value *after); - // Visitor - virtual void accept(visitor *v) = 0; private: ops_t ops_; diff --git a/include/triton/ir/visitor.h b/include/triton/ir/visitor.h index e2310e94a..62e63e6c4 100644 --- a/include/triton/ir/visitor.h +++ b/include/triton/ir/visitor.h @@ -7,6 +7,8 @@ namespace triton{ namespace ir{ +class value; + class instruction; class phi_node; @@ -81,10 +83,18 @@ class alloc_const; class function; +class basic_block; + +class argument; + class visitor { public: virtual ~visitor() {} + virtual void visit_value(ir::value*); + + virtual void visit_basic_block(basic_block*) = 0; + virtual void visit_argument(argument*) = 0; virtual void visit_phi_node(phi_node*) = 0; virtual void visit_binary_operator(binary_operator*) = 0; virtual void visit_getelementptr_inst(getelementptr_inst*) = 0; diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc index f435efef8..dc43f8ea6 100644 --- a/lib/codegen/analysis/layout.cc +++ b/lib/codegen/analysis/layout.cc @@ -124,6 +124,10 @@ inline bool is_trans(ir::value *v) { } +void layout_visitor::visit_layout(layout_t *layout) { + layout->accept(this); +} + layout_t::layout_t(layout_type_t _type, const std::vector &_axes, @@ -145,6 +149,7 @@ layout_t::layout_t(layout_type_t _type, } } + inline unsigned clamp(unsigned x, unsigned lo, unsigned hi) { return std::min(std::max(x, lo), hi); } diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index 96a4632f5..61e4d9bdd 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -385,31 +385,6 @@ bool is_trans(ir::value *v) { } -void selection::lower_value(ir::value *src, IRBuilder<> &builder, generator* gen, std::set& seen) { - if(!seen.insert(src).second) - return; - - if(src->get_type()->is_tile_ty()) - tmap_[src] = gen->get_machine_layout(layouts_->get(src))->create(src); - - - BasicBlock *current = builder.GetInsertBlock(); - auto *inst = dynamic_cast(src); - if(inst && !dynamic_cast(src)) - for(ir::value *op: inst->ops()) - lower_value(op, builder, gen, seen); - - builder.SetInsertPoint(current); - auto *phi = dynamic_cast(src); - if(phi && !current->empty() && current->getFirstNonPHI()) - builder.SetInsertPoint(&*current->getFirstNonPHI()); - - if(auto *usr = dynamic_cast(src)) - usr->accept(gen); - - if(phi && !current->empty() && current->getFirstNonPHI()) - builder.SetInsertPoint(current); -} /* ---------------------------- * ---- Generate LLVM code ---- @@ -445,57 +420,44 @@ Value* selection::alloc_shared(IRBuilder<> &builder, Module& dst) { void selection::run(ir::module &src, Module &dst) { vmap_.clear(); tmap_.clear(); - - LLVMContext &dst_ctx = dst.getContext(); - IRBuilder<> dst_builder(dst_ctx); - + LLVMContext &ctx = dst.getContext(); + IRBuilder<> builder(ctx); // allocate shared memory - sh_mem_ptr_ = alloc_shared(dst_builder, dst); - - // iterate over functions - std::set seen; - - // create tile - generator gen(&dst_ctx, &dst, &dst_builder, a_axes_, 
axes_, vmap_, tmap_, tgt_, layouts_, alignment_, alloc_, sh_mem_ptr_, - offset_a_i_, offset_a_k_, offset_b_j_, offset_b_k_, num_packs_0_, num_packs_1_, pack_size_0_, pack_size_1_, num_warps_ ); - finalizer fin(&dst_builder, vmap_, tmap_); - + Value *sh_mem_ptr = alloc_shared(builder, dst); + // visit + generator visitor(&ctx, &dst, &builder, a_axes_, axes_, vmap_, tmap_, tgt_, layouts_, alignment_, alloc_, sh_mem_ptr, num_warps_ ); for(ir::alloc_const *x: src.allocs()) - x->accept(&gen); - - for(ir::function *fn: src.get_function_list()) { - - fn->accept(&gen); - - // initialize layouts - for(auto x: layouts_->get_all()) - x.second->accept(&gen); - - // generate LLVM-IR code - for(ir::basic_block *block: fn->blocks()) { - BasicBlock *parent = (BasicBlock*)vmap_[block]; - dst_builder.SetInsertPoint(parent); - for(ir::instruction *i: block->get_inst_list()) - lower_value(i, dst_builder, &gen, seen); - vmap_[block] = dst_builder.GetInsertBlock(); - } - - // finalize double-buffering - for(const auto& x: layouts_->get_all()) - x.second->accept(&fin); - - // finalize phi - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *inst: block->get_inst_list()) - inst->accept(&fin); - - } - + visitor.visit_value(x); + for(ir::function *fn: src.get_function_list()) + visitor.visit_value(fn); } - +void generator::visit_value(ir::value* v) { + if(!seen_.insert(v).second) + return; + // create machine tile + if(v->get_type()->is_tile_ty()) + tmap_[v] = machine_layouts_.at(layouts_->get(v))->create(v); + // visit operands + BasicBlock *current = builder_->GetInsertBlock(); + auto *inst = dynamic_cast(v); + if(inst && !dynamic_cast(v)) + for(ir::value *op: inst->ops()) + visit_value(op); + // change insert point for phi node + builder_->SetInsertPoint(current); + auto *phi = dynamic_cast(v); + if(phi && !current->empty() && current->getFirstNonPHI()) + builder_->SetInsertPoint(&*current->getFirstNonPHI()); + // visit user + if(auto *usr = dynamic_cast(v)) + usr->accept(this); + // revert insert point + if(phi && !current->empty() && current->getFirstNonPHI()) + builder_->SetInsertPoint(current); +} void generator::visit_phi_node(ir::phi_node* phi) { Type *ty = type(phi->get_type()->get_scalar_ty(), *ctx_); @@ -574,19 +536,19 @@ void generator::visit_uncond_branch_inst(ir::uncond_branch_inst* br) { void generator::visit_unmasked_load_inst(ir::unmasked_load_inst* x) { - distributed_tile* result = (distributed_tile*)tmap_.at(x); // find vector size ir::value *ptr = x->get_pointer_operand(); size_t ld = layouts_->get(ptr)->order[0]; unsigned alignment = alignment_->get(ptr, ld); - unsigned vector_size = std::min(result->axis(ld).contiguous, alignment); - distributed_tile *pointers = (distributed_tile*)tmap_.at(ptr); + unsigned vector_size = std::min(axes_.at(a_axes_->get(x, ld)).contiguous, alignment); // vector loads std::map packets; - result->for_each([&](indices_t idx){ + for_each(x, [&](indices_t idx){ + distributed_tile* result = (distributed_tile*)tmap_.at(x); unsigned linear = result->get_linear_index(idx); unsigned id = linear / vector_size; if(linear % vector_size == 0) { + distributed_tile *pointers = (distributed_tile*)tmap_.at(ptr); Value *ptr = pointers->get_value(idx); ptr = builder_->CreateBitCast(ptr, PointerType::get(VectorType::get(result->get_ty(), vector_size), ptr->getType()->getPointerAddressSpace())); @@ -594,25 +556,26 @@ void generator::visit_unmasked_load_inst(ir::unmasked_load_inst* x) { } }); // extract result element - result->for_each([&](indices_t idx){ + for_each(x, 
[&](indices_t idx){ + distributed_tile* result = (distributed_tile*)tmap_.at(x); unsigned linear = result->get_linear_index(idx); unsigned id = linear / vector_size; - result->set_value(idx, builder_->CreateExtractElement(packets.at(id), linear % vector_size)); + set_value(x, idx, builder_->CreateExtractElement(packets.at(id), linear % vector_size)); }); } void generator::visit_masked_load_inst(ir::masked_load_inst* x) { // find vector size - distributed_tile* result = (distributed_tile*)tmap_.at(x); ir::value *ptr = x->get_pointer_operand(); size_t ld = layouts_->get(ptr)->order[0]; unsigned alignment = alignment_->get(ptr, ld); - unsigned vector_size = std::min(result->axis(ld).contiguous, alignment); + unsigned vector_size = std::min(axes_.at(a_axes_->get(x, ld)).contiguous, alignment); distributed_tile *pointers = (distributed_tile*)tmap_.at(ptr); distributed_tile *masks = (distributed_tile*)tmap_.at(x->get_mask_operand()); distributed_tile *false_values = (distributed_tile*)tmap_.at(x->get_false_value_operand()); std::map packets; - result->for_each([&](indices_t idx){ + for_each(x, [&](indices_t idx){ + distributed_tile* result = (distributed_tile*)tmap_.at(x); unsigned linear = result->get_linear_index(idx); unsigned id = linear / vector_size; if(linear % vector_size == 0) { @@ -664,7 +627,8 @@ void generator::visit_masked_load_inst(ir::masked_load_inst* x) { } }); // extract result element - result->for_each([&](indices_t idx){ + for_each(x, [&](indices_t idx){ + distributed_tile* result = (distributed_tile*)tmap_.at(x); unsigned linear = result->get_linear_index(idx); unsigned id = linear / vector_size; // Value *tmp = builder_->CreateExtractValue(packets.at(id), {(linear % vector_size) / 2}); @@ -714,13 +678,13 @@ void generator::visit_masked_store_inst(ir::masked_store_inst* st) { void generator::visit_reshape_inst(ir::reshape_inst* reshape) { - distributed_tile* result = (distributed_tile*)tmap_.at(reshape); - ir::value* in = reshape->get_operand(0); - distributed_tile *in_tile = (distributed_tile*)tmap_.at(in); for_each(reshape, [&](indices_t out_idx){ + distributed_tile* result = (distributed_tile*)tmap_.at(reshape); unsigned pos = result->get_linear_index(out_idx); + ir::value* in = reshape->get_operand(0); + distributed_tile *in_tile = (distributed_tile*)tmap_.at(in); indices_t in_idx = in_tile->get_ordered_indices(pos); - result->set_value(out_idx, in_tile->get_value(in_idx)); + set_value(reshape, out_idx, get_value(in, in_idx)); }); } @@ -732,17 +696,16 @@ void generator::visit_splat_inst(ir::splat_inst* splat) { } void generator::visit_broadcast_inst(ir::broadcast_inst* bcast) { - distributed_tile* result = (distributed_tile*)tmap_.at(bcast); ir::value* in = bcast->get_operand(0); const auto& in_shapes = in->get_type()->get_tile_shapes(); distributed_tile *in_tile = (distributed_tile*)tmap_.at(in); - result->for_each([&](indices_t out_idx){ + for_each(bcast, [&](indices_t out_idx){ indices_t in_idx = out_idx; for(size_t k = 0; k < in_idx.size(); k++){ if(in_shapes[k] == 1) in_idx[k] = builder_->getInt32(0); } - result->set_value(out_idx, in_tile->get_value(in_idx)); + set_value(bcast, out_idx, in_tile->get_value(in_idx)); }); } @@ -812,17 +775,17 @@ void generator::visit_atomic_add_inst(ir::atomic_add_inst*) { throw std::runtime_error("unsupported"); } -void generator::visit_hmma_dot(ir::dot_inst* dot, distributed_tile *TC, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK) { +void generator::visit_hmma_dot(ir::dot_inst* dot, shared_tile *TA, 
shared_tile *TB, distributed_tile *TD, unsigned NK) { const auto& shapes = dot->get_type()->get_tile_shapes(); - - TA->set_vector_size(4*pack_size_0_); - TB->set_vector_size(4*pack_size_1_); + machine_layout_hmma_884_t* hmma = (machine_layout_hmma_884_t*)machine_layouts_.at(layouts_->get(dot)); + TA->set_vector_size(4*hmma->pack_size_0_); + TB->set_vector_size(4*hmma->pack_size_1_); TA->set_return_mode(true); TB->set_return_mode(true); std::map, std::vector> fcs; - TC->for_each([&](indices_t idx){ + for_each(dot, [&](indices_t idx){ std::vector key(idx.size() - 2); std::copy(idx.begin() + 2, idx.end(), key.begin()); fcs[key].push_back(TD->get_value(idx)); @@ -833,10 +796,6 @@ void generator::visit_hmma_dot(ir::dot_inst* dot, distributed_tile *TC, shared_t Type *fp32_pack8_ty = StructType::get(*ctx_, {fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}); FunctionType *mma_ty = FunctionType::get(fp32_pack8_ty, {fp16x2_ty, fp16x2_ty, fp16x2_ty, fp16x2_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty, fp32_ty}, false); - Value *offset_a_i = offset_a_i_; - Value *offset_a_k = offset_a_k_; - Value *offset_b_j = offset_b_j_; - Value *offset_b_k = offset_b_k_; Value* u_thread_id = tgt_->get_local_id(builder_->GetInsertBlock()->getModule(), *builder_, 0); @@ -849,10 +808,15 @@ void generator::visit_hmma_dot(ir::dot_inst* dot, distributed_tile *TC, shared_t bool is_b_row = is_b_trans ^ (ord_b[ord_b.size() - 2] == 1); + Value *offset_a_i = hmma->offset_a_i_; + Value *offset_a_k = hmma->offset_a_k_; if(is_a_row){ offset_a_i = builder_->CreateAdd(offset_a_i, builder_->CreateURem(u_thread_id, builder_->getInt32(4))); offset_a_k = builder_->getInt32(0); } + + Value *offset_b_j = hmma->offset_b_j_; + Value *offset_b_k = hmma->offset_b_k_; if(!is_b_row){ offset_b_j = builder_->CreateAdd(offset_b_j, builder_->CreateURem(u_thread_id, builder_->getInt32(4))); offset_b_k = builder_->getInt32(0); @@ -881,33 +845,33 @@ void generator::visit_hmma_dot(ir::dot_inst* dot, distributed_tile *TC, shared_t for(auto& x: fcs){ std::vector& fc = x.second; - for(unsigned pack_i = 0; pack_i < num_packs_0_; pack_i++) - for(unsigned pack_j = 0; pack_j < num_packs_1_; pack_j++){ + for(unsigned pack_i = 0; pack_i < hmma->num_packs_0_; pack_i++) + for(unsigned pack_j = 0; pack_j < hmma->num_packs_1_; pack_j++){ for(unsigned K = 0; K < NK; K += 4){ Value *_K = builder_->getInt32(K); - Value *current_offset_a_i = builder_->CreateAdd(offset_a_i, builder_->getInt32(pack_i*stride_rep_i*pack_size_0_)); - Value *current_offset_b_i = builder_->CreateAdd(offset_b_j, builder_->getInt32(pack_j*stride_rep_j*pack_size_1_)); + Value *current_offset_a_i = builder_->CreateAdd(offset_a_i, builder_->getInt32(pack_i*stride_rep_i*hmma->pack_size_0_)); + Value *current_offset_b_i = builder_->CreateAdd(offset_b_j, builder_->getInt32(pack_j*stride_rep_j*hmma->pack_size_1_)); indices_t idx_a = {current_offset_a_i, builder_->CreateAdd(offset_a_k, _K)}; indices_t idx_b = {builder_->CreateAdd(offset_b_k, _K), current_offset_b_i}; idx_a.insert(idx_a.end(), x.first.begin(), x.first.end()); idx_b.insert(idx_b.end(), x.first.begin(), x.first.end()); Value *ha = TA->get_value(idx_a); Value *hb = TB->get_value(idx_b); - for(unsigned ii = 0; ii < pack_size_0_; ii++) - for(unsigned jj = 0; jj < pack_size_1_; jj++){ - Value *ha0 = builder_->CreateBitCast(builder_->CreateExtractElement(ha, builder_->getInt32(ii*pack_size_0_ + 0)), fp16x2_ty); - Value *ha1 = builder_->CreateBitCast(builder_->CreateExtractElement(ha, 
builder_->getInt32(ii*pack_size_0_ + 1)), fp16x2_ty); - Value *hb0 = builder_->CreateBitCast(builder_->CreateExtractElement(hb, builder_->getInt32(jj*pack_size_0_ + 0)), fp16x2_ty); - Value *hb1 = builder_->CreateBitCast(builder_->CreateExtractElement(hb, builder_->getInt32(jj*pack_size_0_ + 1)), fp16x2_ty); + for(unsigned ii = 0; ii < hmma->pack_size_0_; ii++) + for(unsigned jj = 0; jj < hmma->pack_size_1_; jj++){ + Value *ha0 = builder_->CreateBitCast(builder_->CreateExtractElement(ha, builder_->getInt32(ii*hmma->pack_size_0_ + 0)), fp16x2_ty); + Value *ha1 = builder_->CreateBitCast(builder_->CreateExtractElement(ha, builder_->getInt32(ii*hmma->pack_size_0_ + 1)), fp16x2_ty); + Value *hb0 = builder_->CreateBitCast(builder_->CreateExtractElement(hb, builder_->getInt32(jj*hmma->pack_size_0_ + 0)), fp16x2_ty); + Value *hb1 = builder_->CreateBitCast(builder_->CreateExtractElement(hb, builder_->getInt32(jj*hmma->pack_size_0_ + 1)), fp16x2_ty); std::vector idx = { - (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 0)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 1)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 0)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 1)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 2)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 0) + (pack_j*4*pack_size_1_ + jj*4 + 3)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 2)*ld_fc, - (pack_i*2*pack_size_0_ + ii*2 + 1) + (pack_j*4*pack_size_1_ + jj*4 + 3)*ld_fc + (pack_i*2*hmma->pack_size_0_ + ii*2 + 0) + (pack_j*4*hmma->pack_size_1_ + jj*4 + 0)*ld_fc, + (pack_i*2*hmma->pack_size_0_ + ii*2 + 0) + (pack_j*4*hmma->pack_size_1_ + jj*4 + 1)*ld_fc, + (pack_i*2*hmma->pack_size_0_ + ii*2 + 1) + (pack_j*4*hmma->pack_size_1_ + jj*4 + 0)*ld_fc, + (pack_i*2*hmma->pack_size_0_ + ii*2 + 1) + (pack_j*4*hmma->pack_size_1_ + jj*4 + 1)*ld_fc, + (pack_i*2*hmma->pack_size_0_ + ii*2 + 0) + (pack_j*4*hmma->pack_size_1_ + jj*4 + 2)*ld_fc, + (pack_i*2*hmma->pack_size_0_ + ii*2 + 0) + (pack_j*4*hmma->pack_size_1_ + jj*4 + 3)*ld_fc, + (pack_i*2*hmma->pack_size_0_ + ii*2 + 1) + (pack_j*4*hmma->pack_size_1_ + jj*4 + 2)*ld_fc, + (pack_i*2*hmma->pack_size_0_ + ii*2 + 1) + (pack_j*4*hmma->pack_size_1_ + jj*4 + 3)*ld_fc }; Value *nc = builder_->CreateCall(mma_fn, {ha0, ha1, hb0, hb1, fc[idx[0]], fc[idx[1]], fc[idx[2]], fc[idx[3]], fc[idx[4]], fc[idx[5]], fc[idx[6]], fc[idx[7]]}); fc[idx[0]] = builder_->CreateExtractValue(nc, {0}); @@ -925,23 +889,23 @@ void generator::visit_hmma_dot(ir::dot_inst* dot, distributed_tile *TC, shared_t // write back unsigned i = 0; - TC->for_each([&](indices_t idx){ + for_each(dot, [&](indices_t idx){ std::vector key(idx.size() - 2); std::copy(idx.begin() + 2, idx.end(), key.begin()); if(i >= fcs.at(key).size()) i = 0; - TC->set_value(idx, fcs.at(key)[i++]); + set_value(dot, idx, fcs.at(key)[i++]); }); TA->set_return_mode(false); TB->set_return_mode(false); } -void generator::visit_scanline_dot(ir::dot_inst* dot, distributed_tile *TC, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK, +void generator::visit_scanline_dot(ir::dot_inst* dot, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK, Type *c_ty, Function *f_mul_add) { - TA->set_vector_size(TC->axis(0).contiguous); - TB->set_vector_size(TC->axis(1).contiguous); - TC->for_each([&](indices_t idx){ + TA->set_vector_size(axes_.at(a_axes_->get(dot, 0)).contiguous); + 
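// Hedged sketch of what visit_scanline_dot emits per output element, in
// scalar form (the generated IR calls llvm.fmuladd through f_mul_add; i, j
// and the arrays are illustrative):
//
//   for(unsigned K = 0; K < NK; K++)
//     c[i][j] = fma(a[i][K], b[K][j], c[i][j]);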
TB->set_vector_size(axes_.at(a_axes_->get(dot, 1)).contiguous); + for_each(dot, [&](indices_t idx){ Value *res = TD->get_value(idx); for(unsigned K = 0; K < NK; ++K){ // input indices @@ -961,13 +925,13 @@ void generator::visit_scanline_dot(ir::dot_inst* dot, distributed_tile *TC, shar b = builder_->CreateFPCast(b, c_ty); res = builder_->CreateCall(f_mul_add, {a, b, res}); } - TC->set_value(idx, res); + set_value(dot, idx, res); }); } -void generator::visit_outer_dot(ir::dot_inst*, distributed_tile *TC, distributed_tile *TA, distributed_tile *TB, distributed_tile *TD, unsigned NK, +void generator::visit_outer_dot(ir::dot_inst* dot, distributed_tile *TA, distributed_tile *TB, distributed_tile *TD, unsigned NK, Type *c_ty, Function *f_mul_add) { - TC->for_each([&](indices_t idx){ + for_each(dot, [&](indices_t idx){ Value *res = TD->get_value(idx); indices_t a_idx = {idx[0], builder_->getInt32(0)}; indices_t b_idx = {builder_->getInt32(0), idx[1]}; @@ -980,14 +944,13 @@ void generator::visit_outer_dot(ir::dot_inst*, distributed_tile *TC, distributed if(b->getType() != c_ty) b = builder_->CreateFPCast(b, c_ty); res = builder_->CreateCall(f_mul_add, {a, b, res}); - TC->set_value(idx, res); + set_value(dot, idx, res); }); } void generator::visit_dot_inst(ir::dot_inst* dot) { Function *fn = builder_->GetInsertBlock()->getParent(); - distributed_tile* TC = (distributed_tile*)tmap_.at(dot); Module *module = fn->getParent(); ir::value *A = dot->get_operand(0); ir::value *B = dot->get_operand(1); @@ -1004,14 +967,14 @@ void generator::visit_dot_inst(ir::dot_inst* dot) { shared_tile *TA = (shared_tile*)tmap_.at(A); shared_tile *TB = (shared_tile*)tmap_.at(B); if(layouts_->get(dot)->type == analysis::HMMA_884) - visit_hmma_dot(dot, TC, TA, TB, TD, NK); + visit_hmma_dot(dot, TA, TB, TD, NK); else - visit_scanline_dot(dot, TC, TA, TB, TD, NK, c_ty, f_mul_add); + visit_scanline_dot(dot, TA, TB, TD, NK, c_ty, f_mul_add); } else { distributed_tile *TA = (distributed_tile*)tmap_.at(A); distributed_tile *TB = (distributed_tile*)tmap_.at(B); - visit_outer_dot(dot, TC, TA, TB, TD, NK, c_ty, f_mul_add); + visit_outer_dot(dot, TA, TB, TD, NK, c_ty, f_mul_add); } } @@ -1052,15 +1015,14 @@ void generator::visit_copy_to_shared_inst(ir::copy_to_shared_inst* cts) { ir::value *arg = cts->get_operand(0); auto arg_order = layouts_->get(arg)->order; // tiles - shared_tile* result = (shared_tile*)tmap_.at(cts); - distributed_tile* in = (distributed_tile*)tmap_.at(arg); if(x_order == arg_order){ size_t ld = arg_order[0]; vector_size = layouts_->get(arg)->nts.at(ld); } std::map packets; - in->for_each([&](indices_t idx){ + for_each(arg, [&](indices_t idx){ + distributed_tile* in = (distributed_tile*)tmap_.at(arg); unsigned linear = in->get_linear_index(idx); unsigned id = linear / vector_size; Value *in_value = in->get_value(idx); @@ -1068,19 +1030,19 @@ void generator::visit_copy_to_shared_inst(ir::copy_to_shared_inst* cts) { packets[id] = UndefValue::get(VectorType::get(in_value->getType(), vector_size)); packets[id] = builder_->CreateInsertElement(packets.at(id), in_value, linear % vector_size); }); - in->for_each([&](indices_t idx){ + + for_each(arg, [&](indices_t idx){ + distributed_tile* in = (distributed_tile*)tmap_.at(arg); + shared_tile* result = (shared_tile*)tmap_.at(cts); unsigned linear = in->get_linear_index(idx); unsigned id = linear / vector_size; if(linear % vector_size == 0) result->set_value(idx, packets[id]); }); } - void generator::visit_copy_from_shared_inst(ir::copy_from_shared_inst* cfs) { - 
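/* For reference, the packet scheme in visit_copy_to_shared_inst above
   vectorizes the store by grouping scalars on their linear index:
   id = linear / vector_size picks the packet and linear % vector_size the
   lane, so with vector_size = 4 the linear indices 0..7 fill two packets,
   and each packed vector is written to shared memory exactly once, when
   the lane is 0. */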
distributed_tile* result = (distributed_tile*)tmap_.at(cfs); - shared_tile* arg = (shared_tile*)tmap_.at(cfs->get_operand(0)); - result->for_each([&](indices_t idx){ - result->set_value(idx, arg->get_value(idx)); + for_each(cfs, [&](indices_t idx){ + set_value(cfs, idx, get_value(cfs->get_operand(0), idx)); }); } @@ -1090,33 +1052,30 @@ void generator::visit_barrier_inst(ir::barrier_inst*) { } void generator::visit_make_range_dyn(ir::make_range_dyn* x) { - distributed_tile* result = (distributed_tile*)tmap_.at(x); - result->for_each([&](indices_t idx){ + for_each(x, [&](indices_t idx){ assert(idx.size() == 1); BinaryOperator *bin_add = dyn_cast(idx[0]); assert(bin_add); Value *res = bin_add->getOperand(0); - result->set_value(idx, res); + set_value(x, idx, res); }); } void generator::visit_make_range_sta(ir::make_range_sta* x) { - distributed_tile *T = (distributed_tile *)tmap_.at(x); - T->for_each([&](indices_t idx){ + for_each(x, [&](indices_t idx){ assert(idx.size() == 1); BinaryOperator *bin_add = dyn_cast(idx[0]); assert(bin_add); Value *res = bin_add->getOperand(1); assert(isa(res)); - T->set_value(idx, res); + set_value(x, idx, res); }); } void generator::visit_make_range(ir::make_range* x) { - distributed_tile *T = (distributed_tile *)tmap_.at(x); - T->for_each([&](indices_t idx){ + for_each(x, [&](indices_t idx){ assert(idx.size() == 1); - T->set_value(idx, idx[0]); + set_value(x, idx, idx[0]); }); } @@ -1149,18 +1108,17 @@ void generator::visit_alloc_const(ir::alloc_const *alloc) { void generator::visit_function(ir::function* fn) { LLVMContext &ctx = builder_->getContext(); FunctionType *fn_ty = (FunctionType*)type(fn->get_fn_type(), *ctx_); - FunctionType *dst_fn_ty = fn_ty; if(!tgt_->is_gpu()){ - Type *dst_fn_ret_ty = fn_ty->getReturnType(); - std::vector dst_fn_args_ty; + Type *fn_ret_ty = fn_ty->getReturnType(); + std::vector fn_args_ty; for(unsigned i = 0; i < fn_ty->getNumParams(); i++) - dst_fn_args_ty.push_back(fn_ty->getParamType(i)); - dst_fn_args_ty.push_back(builder_->getInt32Ty()); - dst_fn_args_ty.push_back(builder_->getInt32Ty()); - dst_fn_args_ty.push_back(builder_->getInt32Ty()); - dst_fn_ty = FunctionType::get(dst_fn_ret_ty, dst_fn_args_ty, false); + fn_args_ty.push_back(fn_ty->getParamType(i)); + fn_args_ty.push_back(builder_->getInt32Ty()); + fn_args_ty.push_back(builder_->getInt32Ty()); + fn_args_ty.push_back(builder_->getInt32Ty()); + fn_ty = FunctionType::get(fn_ret_ty, fn_args_ty, false); } - Function *ret = Function::Create(dst_fn_ty, Function::ExternalLinkage, fn->get_name(), mod_); + Function *ret = Function::Create(fn_ty, Function::ExternalLinkage, fn->get_name(), mod_); // set attributes for(auto attr_pair: fn->attrs()){ unsigned id = attr_pair.first; @@ -1176,7 +1134,7 @@ void generator::visit_function(ir::function* fn) { ValueAsMetadata::get(builder_->getInt32(num_warps_*32)) }; mod_->getOrInsertNamedMetadata("nvvm.annotations")->addOperand(MDNode::get(ctx, md_args)); - // map parameters + // set arguments for(unsigned i = 0; i < fn->args().size(); i++) vmap_[fn->args()[i]] = &*(ret->arg_begin() + i); // create blocks @@ -1185,15 +1143,22 @@ void generator::visit_function(ir::function* fn) { vmap_[block] = dst_block; } builder_->SetInsertPoint((BasicBlock*)vmap_[fn->blocks()[0]]); - fn_ = ret; + // initialize layouts + for(auto x: layouts_->get_all()) + visit_layout(x.second); + // generate LLVM-IR code + for(ir::basic_block *block: fn->blocks()) + visit_basic_block(block); + // finalize + finalize_function(fn); } + + + + void 
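/* visit_function above is the per-function driver: it declares the LLVM
   function and its metadata, creates one LLVM basic block per Triton-IR
   block, materializes every machine layout, lowers each block, and then
   runs finalize_function to patch up phis and double buffering. The
   layout visitors below are the "materialize layouts" step of that
   sequence. */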
generator::visit_layout_hmma_884(analysis::layout_hmma_884_t* layout) { - machine_layouts_[layout] = new machine_layout_hmma_884_t(mod_, builder_, tgt_, type(layout->ty->get_scalar_ty(), *ctx_), a_axes_, axes_, - offset_a_i_, offset_a_k_, offset_b_j_, offset_b_k_, - pack_size_0_, pack_size_1_, - num_packs_0_, num_packs_1_, - layout); + machine_layouts_[layout] = new machine_layout_hmma_884_t(mod_, builder_, tgt_, type(layout->ty->get_scalar_ty(), *ctx_), a_axes_, axes_, layout); } void generator::visit_layout_scanline(analysis::layout_scanline_t* layout) { @@ -1205,10 +1170,24 @@ void generator::visit_layout_shared(analysis::layout_shared_t* layout) { machine_layouts_[layout] = new machine_layout_shared_t(mod_, builder_, tgt_, alloc_, sh_mem_ptr_, layout, vmap_, tmap_); } +void generator::visit_basic_block(ir::basic_block * block) { + BasicBlock *parent = (BasicBlock*)vmap_[block]; + builder_->SetInsertPoint(parent); + for(ir::instruction *i: block->get_inst_list()) + visit_value(i); + vmap_[block] = builder_->GetInsertBlock(); +} + +void generator::visit_argument(ir::argument* arg) { + +} + void generator::for_each(ir::value *x, const std::function& fn) { if(!x->get_type()->is_tile_ty()) return fn({}); else { +// if(tmap_.find(x) == tmap_.end()) +// tmap_[x] = machine_layouts_.at(layouts_->get(x))->create(x); if(auto *dt = dynamic_cast(tmap_.at(x))) dt->for_each(fn); } @@ -1313,13 +1292,8 @@ tile *machine_layout_distributed_t::create(ir::value *v) { machine_layout_hmma_884_t::machine_layout_hmma_884_t(Module *mod, Builder *builder, target *tgt, Type *ty, analysis::axes *a_axes, std::map& axes, - Value *&offset_a_i, Value *&offset_a_k, Value *&offset_b_j, Value *&offset_b_k, - unsigned &pack_size_0, unsigned &pack_size_1, - unsigned &num_packs_0, unsigned &num_packs_1, analysis::layout_hmma_884_t* layout) - : machine_layout_distributed_t(mod, builder, tgt, ty, a_axes, axes, layout), - offset_a_i_(offset_a_i), offset_a_k_(offset_a_k), offset_b_j_(offset_b_j), offset_b_k_(offset_b_k), - pack_size_0_(pack_size_0), pack_size_1_(pack_size_1), num_packs_0_(num_packs_0), num_packs_1_(num_packs_1) { + : machine_layout_distributed_t(mod, builder, tgt, ty, a_axes, axes, layout) { Value *warp_size = builder_->getInt32(32); Value* u_thread_id_0 = tgt_->get_local_id(mod_, *builder_, 0); @@ -1467,34 +1441,18 @@ machine_layout_scanline_t::machine_layout_scanline_t(Module *mod, Builder *build } } -finalizer::finalizer(Builder *builder, std::map& vmap, std::map& tmap) - : builder_(builder), vmap_(vmap), tmap_(tmap) { - +void generator::finalize_function(ir::function* fn) { + // finalize double-buffering + for(const auto& x: layouts_->get_all()) + visit_layout(x.second); + // finalize phi + for(ir::basic_block *block: fn->blocks()) + for(ir::instruction *inst: block->get_inst_list()) + if(auto *phi = dynamic_cast(inst)) + finalize_phi_node(phi); } -void finalizer::for_each(ir::value *x, const std::function& fn) { - if(!x->get_type()->is_tile_ty()) - return fn({}); - else { - if(auto *dt = dynamic_cast(tmap_.at(x))) - dt->for_each(fn); - } -} - -Value* finalizer::get_value(ir::value *x, const indices_t& idx) { - if(x->get_type()->is_tile_ty()) - return tmap_.at(x)->get_value(idx); - return vmap_.at(x); -} - -void finalizer::set_value(ir::value *x, const indices_t& idx, Value* v) { - if(x->get_type()->is_tile_ty()) - tmap_.at(x)->set_value(idx, v); - else - vmap_[x] = v; -} - -void finalizer::visit_phi_node(ir::phi_node* phi) { +void generator::finalize_phi_node(ir::phi_node* phi) { auto it = tmap_.find(phi); 
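/* Phi finalization runs only after the whole function has been visited,
   once every incoming value has an LLVM counterpart. For shared layouts
   with double buffering (handled separately), the offset phi receives the
   constant size / (2 * num_bytes) on the non-latch edge and its own
   negation on the latch edge, so it alternates sign every iteration and
   the tile pointer ping-pongs between the two halves of the shared-memory
   buffer. */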
if(it != tmap_.end() && dynamic_cast(it->second)) return; @@ -1510,32 +1468,6 @@ void finalizer::visit_phi_node(ir::phi_node* phi) { } -void finalizer::visit_layout_shared(analysis::layout_shared_t* layout) { - if(layout->double_buffer) { - auto info = *layout->double_buffer; - ir::phi_node *phi = info.phi; - PHINode *ptr = (PHINode*)((shared_tile*)tmap_.at(phi))->get_pointer(); - PHINode *offset = (PHINode*)((shared_tile*)tmap_.at(phi))->get_offset(); - for(unsigned n = 0; n < phi->get_num_incoming(); n++){ - ir::basic_block* inc_block = phi->get_incoming_block(n); - ir::value* inc_val = phi->get_incoming_value(n); - BasicBlock *llvm_inc_block = (BasicBlock*)vmap_.at(inc_block); - shared_tile *inc_shared = (shared_tile*)tmap_.at(inc_val); - if(inc_val == info.latch){ - builder_->SetInsertPoint(llvm_inc_block->getTerminator()); - Value *next_offset = builder_->CreateNeg(offset); - offset->addIncoming(next_offset, llvm_inc_block); - } - else { - unsigned num_bytes = layout->ty->get_primitive_size_in_bits() / 8; - offset->addIncoming(builder_->getInt32(layout->size / (2*num_bytes)), llvm_inc_block); - } - ptr->addIncoming(inc_shared->get_pointer(), llvm_inc_block); - } - } -} - - } } diff --git a/lib/ir/function.cc b/lib/ir/function.cc index c15440e9d..84d52df72 100644 --- a/lib/ir/function.cc +++ b/lib/ir/function.cc @@ -25,6 +25,10 @@ unsigned argument::get_arg_no() const { return arg_no_; } +void argument::accept(visitor *v) { + v->visit_argument(this); +} + /* function */ function::function(function_type *ty, linkage_types_t linkage, diff --git a/lib/ir/value.cc b/lib/ir/value.cc index 5dfb0460c..a43aaa05e 100644 --- a/lib/ir/value.cc +++ b/lib/ir/value.cc @@ -32,6 +32,10 @@ void value::replace_all_uses_with(value *target){ throw std::runtime_error("not implemented"); } +void visitor::visit_value(ir::value* v) { + v->accept(this); +} + //===----------------------------------------------------------------------===// // user class @@ -69,5 +73,7 @@ void user::replace_uses_of_with(value *before, value *after) { before->erase_use(this); } + + } } From a15717726754a383fa89783e7604adad8b606fd9 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 17 Oct 2019 00:51:26 -0400 Subject: [PATCH 446/494] [codegen] [selection] more cleaning --- include/triton/codegen/selection.h | 2 ++ lib/codegen/selection.cc | 46 ++++++++++++++++++++++++------ 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index dfdf48ca1..02deedff6 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -225,6 +225,7 @@ private: void visit_outer_dot(ir::dot_inst*, distributed_tile *TA, distributed_tile *TB, distributed_tile *TD, unsigned NK, Type *c_ty, Function *f_mul_add); + void finalize_shared_layout(analysis::layout_shared_t*); void finalize_function(ir::function*); void finalize_phi_node(ir::phi_node*); @@ -322,6 +323,7 @@ private: std::set seen_; }; + // Selection pass class selection{ typedef std::map vmap_t; diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index 61e4d9bdd..dfa12ba17 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -420,16 +420,20 @@ Value* selection::alloc_shared(IRBuilder<> &builder, Module& dst) { void selection::run(ir::module &src, Module &dst) { vmap_.clear(); tmap_.clear(); + LLVMContext &ctx = dst.getContext(); IRBuilder<> builder(ctx); + // allocate shared memory Value *sh_mem_ptr = alloc_shared(builder, dst); - // visit - generator 
visitor(&ctx, &dst, &builder, a_axes_, axes_, vmap_, tmap_, tgt_, layouts_, alignment_, alloc_, sh_mem_ptr, num_warps_ ); + + // create tile + generator gen(&ctx, &dst, &builder, a_axes_, axes_, vmap_, tmap_, tgt_, layouts_, alignment_, alloc_, sh_mem_ptr, num_warps_ ); + for(ir::alloc_const *x: src.allocs()) - visitor.visit_value(x); + gen.visit_value(x); for(ir::function *fn: src.get_function_list()) - visitor.visit_value(fn); + gen.visit_value(fn); } @@ -1441,10 +1445,36 @@ machine_layout_scanline_t::machine_layout_scanline_t(Module *mod, Builder *build } } -void generator::finalize_function(ir::function* fn) { +void generator::finalize_shared_layout(analysis::layout_shared_t *shared) { + if(shared->double_buffer) { + auto info = *shared->double_buffer; + ir::phi_node *phi = info.phi; + PHINode *ptr = (PHINode*)((shared_tile*)tmap_.at(phi))->get_pointer(); + PHINode *offset = (PHINode*)((shared_tile*)tmap_.at(phi))->get_offset(); + for(unsigned n = 0; n < phi->get_num_incoming(); n++){ + ir::basic_block* inc_block = phi->get_incoming_block(n); + ir::value* inc_val = phi->get_incoming_value(n); + BasicBlock *llvm_inc_block = (BasicBlock*)vmap_.at(inc_block); + shared_tile *inc_shared = (shared_tile*)tmap_.at(inc_val); + if(inc_val == info.latch){ + builder_->SetInsertPoint(llvm_inc_block->getTerminator()); + Value *next_offset = builder_->CreateNeg(offset); + offset->addIncoming(next_offset, llvm_inc_block); + } + else { + unsigned num_bytes = shared->ty->get_primitive_size_in_bits() / 8; + offset->addIncoming(builder_->getInt32(shared->size / (2*num_bytes)), llvm_inc_block); + } + ptr->addIncoming(inc_shared->get_pointer(), llvm_inc_block); + } + } +} + +void generator::finalize_function(ir::function *fn) { // finalize double-buffering for(const auto& x: layouts_->get_all()) - visit_layout(x.second); + if(auto *shared = dynamic_cast(x.second)) + finalize_shared_layout(shared); // finalize phi for(ir::basic_block *block: fn->blocks()) for(ir::instruction *inst: block->get_inst_list()) @@ -1452,7 +1482,7 @@ void generator::finalize_function(ir::function* fn) { finalize_phi_node(phi); } -void generator::finalize_phi_node(ir::phi_node* phi) { +void generator::finalize_phi_node(ir::phi_node *phi) { auto it = tmap_.find(phi); if(it != tmap_.end() && dynamic_cast(it->second)) return; @@ -1467,7 +1497,5 @@ void generator::finalize_phi_node(ir::phi_node* phi) { } } - - } } From a0182f41dd5be05231bc9621a27978306ee3469d Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 17 Oct 2019 08:17:23 -0400 Subject: [PATCH 447/494] more cleaning --- include/triton/codegen/selection.h | 42 ++++----------- lib/codegen/selection.cc | 84 ++++++++++++++++-------------- lib/runtime/function.cc | 2 +- 3 files changed, 56 insertions(+), 72 deletions(-) diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index 02deedff6..f93b53886 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -178,7 +178,7 @@ public: class machine_layout_distributed_t: public machine_layout_t { public: machine_layout_distributed_t(Module *mod, Builder *builder, target *tgt, Type *ty, - analysis::axes *a_axes, std::map& axes, + std::map& axes, analysis::layout_t* layout); tile* create(ir::value *v); @@ -186,7 +186,6 @@ public: Builder *builder_; target *tgt_; Type *ty_; - analysis::axes *a_axes_; std::map& axes_; analysis::layout_t* layout_; }; @@ -196,7 +195,7 @@ class machine_layout_hmma_884_t: public machine_layout_distributed_t { public: 
machine_layout_hmma_884_t(Module *mod, Builder *builder, target *tgt, Type *ty, - analysis::axes *a_axes, std::map& axes, + std::map& axes, analysis::layout_hmma_884_t* layout); Value *offset_a_i_, *offset_a_k_; Value *offset_b_j_, *offset_b_k_; @@ -210,7 +209,7 @@ class machine_layout_scanline_t: public machine_layout_distributed_t { public: machine_layout_scanline_t(Module *mod, Builder *builder, target *tgt, Type *ty, - analysis::axes *a_axes, std::map& axes, + std::map& axes, analysis::layout_scanline_t* layout); }; @@ -230,22 +229,12 @@ private: void finalize_phi_node(ir::phi_node*); public: - generator(LLVMContext *ctx, - Module *dst, - Builder *builder, - analysis::axes *a_axes, - std::map& axes, - std::map& vmap, - std::map& tmap, + generator(Module *dst, target *tgt, analysis::layout *layouts, analysis::align *alignment, analysis::allocation *alloc, - Value *sh_mem_ptr, - unsigned num_warps) - : ctx_(ctx), mod_(dst), builder_(builder), a_axes_(a_axes), axes_(axes), vmap_(vmap), tmap_(tmap), tgt_(tgt), - layouts_(layouts), alignment_(alignment), alloc_(alloc), sh_mem_ptr_(sh_mem_ptr), - num_warps_(num_warps) { } + unsigned num_warps); void visit_value(ir::value* v); @@ -305,14 +294,13 @@ public: private: LLVMContext *ctx_; - Builder *builder_; + std::unique_ptr builder_; Module *mod_; std::map machine_layouts_; - analysis::axes *a_axes_; - std::map& axes_; - std::map& vmap_; - std::map& tmap_; + std::map axes_; + std::map vmap_; + std::map tmap_; target *tgt_; analysis::layout *layouts_; analysis::align *alignment_; @@ -329,30 +317,22 @@ class selection{ typedef std::map vmap_t; typedef std::map tmap_t; -private: - // LLVM conversions - Value* alloc_shared(Builder &builder, Module& dst); - public: selection(analysis::liveness* liveness, analysis::allocation *alloc, - analysis::align *alignment, analysis::axes *axes, + analysis::align *alignment, analysis::layout *layouts, target *tgt, unsigned num_warps) : liveness_(liveness), alloc_(alloc), - alignment_(alignment), a_axes_(axes), layouts_(layouts), + alignment_(alignment), layouts_(layouts), tgt_(tgt), num_warps_(num_warps){ } void run(ir::module &src, Module &dst); private: - vmap_t vmap_; - tmap_t tmap_; analysis::liveness *liveness_; analysis::allocation *alloc_; - analysis::axes *a_axes_; analysis::layout *layouts_; analysis::align *alignment_; target *tgt_; - std::map axes_; unsigned num_warps_; }; diff --git a/lib/codegen/selection.cc b/lib/codegen/selection.cc index dfa12ba17..a4daa2d50 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection.cc @@ -401,34 +401,9 @@ inline llvm::Attribute llvm_attr(llvm::LLVMContext& ctx, ir::attribute attr) { } -Value* selection::alloc_shared(IRBuilder<> &builder, Module& dst) { - Value *ret = nullptr; - LLVMContext &ctx = builder.getContext(); - if(tgt_->is_gpu()) - if(unsigned alloc_size = alloc_->allocated_size()){ - Type *int_8_ty = Type::getInt8Ty(ctx); - ArrayType *array_ty = ArrayType::get(int_8_ty, alloc_size); - Type *ptr_ty = PointerType::get(int_8_ty, 3); - GlobalVariable *sh_mem_array = - new GlobalVariable(dst, array_ty, false, GlobalVariable::ExternalLinkage, - nullptr, "__shared_ptr", nullptr, GlobalVariable::NotThreadLocal, 3); - ret = builder.CreateBitCast(sh_mem_array, ptr_ty); - } - return ret; -} - void selection::run(ir::module &src, Module &dst) { - vmap_.clear(); - tmap_.clear(); - - LLVMContext &ctx = dst.getContext(); - IRBuilder<> builder(ctx); - - // allocate shared memory - Value *sh_mem_ptr = alloc_shared(builder, dst); - // create tile - generator 
gen(&ctx, &dst, &builder, a_axes_, axes_, vmap_, tmap_, tgt_, layouts_, alignment_, alloc_, sh_mem_ptr, num_warps_ ); + generator gen(&dst, tgt_, layouts_, alignment_, alloc_, num_warps_ ); for(ir::alloc_const *x: src.allocs()) gen.visit_value(x); @@ -438,6 +413,32 @@ void selection::run(ir::module &src, Module &dst) { +generator::generator(Module *dst, + target *tgt, + analysis::layout *layouts, + analysis::align *alignment, + analysis::allocation *alloc, + unsigned num_warps) + : ctx_(&dst->getContext()), mod_(dst), + builder_(new Builder(dst->getContext())), + tgt_(tgt), + layouts_(layouts), alignment_(alignment), alloc_(alloc), + num_warps_(num_warps) { + + if(tgt_->is_gpu()) + if(unsigned alloc_size = alloc_->allocated_size()){ + Type *int_8_ty = Type::getInt8Ty(*ctx_); + ArrayType *array_ty = ArrayType::get(int_8_ty, alloc_size); + Type *ptr_ty = PointerType::get(int_8_ty, 3); + GlobalVariable *sh_mem_array = + new GlobalVariable(*dst, array_ty, false, GlobalVariable::ExternalLinkage, + nullptr, "__shared_ptr", nullptr, GlobalVariable::NotThreadLocal, 3); + sh_mem_ptr_ = builder_->CreateBitCast(sh_mem_array, ptr_ty); + } + +} + + void generator::visit_value(ir::value* v) { if(!seen_.insert(v).second) return; @@ -544,11 +545,12 @@ void generator::visit_unmasked_load_inst(ir::unmasked_load_inst* x) { ir::value *ptr = x->get_pointer_operand(); size_t ld = layouts_->get(ptr)->order[0]; unsigned alignment = alignment_->get(ptr, ld); - unsigned vector_size = std::min(axes_.at(a_axes_->get(x, ld)).contiguous, alignment); // vector loads std::map packets; for_each(x, [&](indices_t idx){ distributed_tile* result = (distributed_tile*)tmap_.at(x); + unsigned vector_size = std::min(result->axis(ld).contiguous, alignment); + unsigned linear = result->get_linear_index(idx); unsigned id = linear / vector_size; if(linear % vector_size == 0) { @@ -562,6 +564,7 @@ void generator::visit_unmasked_load_inst(ir::unmasked_load_inst* x) { // extract result element for_each(x, [&](indices_t idx){ distributed_tile* result = (distributed_tile*)tmap_.at(x); + unsigned vector_size = std::min(result->axis(ld).contiguous, alignment); unsigned linear = result->get_linear_index(idx); unsigned id = linear / vector_size; set_value(x, idx, builder_->CreateExtractElement(packets.at(id), linear % vector_size)); @@ -573,13 +576,13 @@ void generator::visit_masked_load_inst(ir::masked_load_inst* x) { ir::value *ptr = x->get_pointer_operand(); size_t ld = layouts_->get(ptr)->order[0]; unsigned alignment = alignment_->get(ptr, ld); - unsigned vector_size = std::min(axes_.at(a_axes_->get(x, ld)).contiguous, alignment); distributed_tile *pointers = (distributed_tile*)tmap_.at(ptr); distributed_tile *masks = (distributed_tile*)tmap_.at(x->get_mask_operand()); distributed_tile *false_values = (distributed_tile*)tmap_.at(x->get_false_value_operand()); std::map packets; for_each(x, [&](indices_t idx){ distributed_tile* result = (distributed_tile*)tmap_.at(x); + unsigned vector_size = std::min(result->axis(ld).contiguous, alignment); unsigned linear = result->get_linear_index(idx); unsigned id = linear / vector_size; if(linear % vector_size == 0) { @@ -633,6 +636,7 @@ void generator::visit_masked_load_inst(ir::masked_load_inst* x) { // extract result element for_each(x, [&](indices_t idx){ distributed_tile* result = (distributed_tile*)tmap_.at(x); + unsigned vector_size = std::min(result->axis(ld).contiguous, alignment); unsigned linear = result->get_linear_index(idx); unsigned id = linear / vector_size; // Value *tmp = 
builder_->CreateExtractValue(packets.at(id), {(linear % vector_size) / 2}); @@ -907,8 +911,8 @@ void generator::visit_hmma_dot(ir::dot_inst* dot, shared_tile *TA, shared_tile * } void generator::visit_scanline_dot(ir::dot_inst* dot, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK, Type *c_ty, Function *f_mul_add) { - TA->set_vector_size(axes_.at(a_axes_->get(dot, 0)).contiguous); - TB->set_vector_size(axes_.at(a_axes_->get(dot, 1)).contiguous); + TA->set_vector_size(TD->axis(0).contiguous); + TB->set_vector_size(TD->axis(1).contiguous); for_each(dot, [&](indices_t idx){ Value *res = TD->get_value(idx); for(unsigned K = 0; K < NK; ++K){ @@ -1162,16 +1166,16 @@ void generator::visit_function(ir::function* fn) { void generator::visit_layout_hmma_884(analysis::layout_hmma_884_t* layout) { - machine_layouts_[layout] = new machine_layout_hmma_884_t(mod_, builder_, tgt_, type(layout->ty->get_scalar_ty(), *ctx_), a_axes_, axes_, layout); + machine_layouts_[layout] = new machine_layout_hmma_884_t(mod_, &*builder_, tgt_, type(layout->ty->get_scalar_ty(), *ctx_), axes_, layout); } void generator::visit_layout_scanline(analysis::layout_scanline_t* layout) { - machine_layouts_[layout] = new machine_layout_scanline_t(mod_, builder_, tgt_, type(layout->ty->get_scalar_ty(), *ctx_), a_axes_, axes_, layout); + machine_layouts_[layout] = new machine_layout_scanline_t(mod_, &*builder_, tgt_, type(layout->ty->get_scalar_ty(), *ctx_), axes_, layout); } void generator::visit_layout_shared(analysis::layout_shared_t* layout) { - machine_layouts_[layout] = new machine_layout_shared_t(mod_, builder_, tgt_, alloc_, sh_mem_ptr_, layout, vmap_, tmap_); + machine_layouts_[layout] = new machine_layout_shared_t(mod_, &*builder_, tgt_, alloc_, sh_mem_ptr_, layout, vmap_, tmap_); } void generator::visit_basic_block(ir::basic_block * block) { @@ -1270,9 +1274,9 @@ tile* machine_layout_shared_t::create(ir::value *v) { } machine_layout_distributed_t::machine_layout_distributed_t(Module *mod, Builder *builder, target *tgt, Type *ty, - analysis::axes *a_axes, std::map& axes, + std::map& axes, analysis::layout_t *layout) - : mod_(mod), builder_(builder), tgt_(tgt), ty_(ty), a_axes_(a_axes), axes_(axes), layout_(layout) { + : mod_(mod), builder_(builder), tgt_(tgt), ty_(ty), axes_(axes), layout_(layout) { } @@ -1282,7 +1286,7 @@ tile *machine_layout_distributed_t::create(ir::value *v) { std::vector axes(shapes.size()); for(size_t d = 0; d < shapes.size(); d++){ if(shapes[d] > 1){ - unsigned x = a_axes_->get(v, d); + unsigned x = layout_->axes[d]; axes[d] = axes_.at(x); } else{ @@ -1294,10 +1298,10 @@ tile *machine_layout_distributed_t::create(ir::value *v) { } machine_layout_hmma_884_t::machine_layout_hmma_884_t(Module *mod, Builder *builder, - target *tgt, Type *ty, analysis::axes *a_axes, + target *tgt, Type *ty, std::map& axes, analysis::layout_hmma_884_t* layout) - : machine_layout_distributed_t(mod, builder, tgt, ty, a_axes, axes, layout) { + : machine_layout_distributed_t(mod, builder, tgt, ty, axes, layout) { Value *warp_size = builder_->getInt32(32); Value* u_thread_id_0 = tgt_->get_local_id(mod_, *builder_, 0); @@ -1413,9 +1417,9 @@ machine_layout_hmma_884_t::machine_layout_hmma_884_t(Module *mod, Builder *build machine_layout_scanline_t::machine_layout_scanline_t(Module *mod, Builder *builder, target *tgt, Type *ty, - analysis::axes *a_axes, std::map &axes, + std::map &axes, analysis::layout_scanline_t* layout) - : machine_layout_distributed_t(mod, builder, tgt, ty, a_axes, axes, layout) { + : 
machine_layout_distributed_t(mod, builder, tgt, ty, axes, layout) { Value *warp_size = builder_->getInt32(32); Value* u_thread_id_0 = tgt_->get_local_id(mod_, *builder_, 0); diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index b83ea8442..68fe7fe01 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -217,7 +217,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::transform::reassociate reassociate(&align); codegen::transform::coalesce coalesce(&align, &layouts); codegen::transform::cts cts; - codegen::selection selection(&liveness, &allocation, &align, &axes, &layouts, target.get(), opt.num_warps); + codegen::selection selection(&liveness, &allocation, &align, &layouts, target.get(), opt.num_warps); // run passes // ir::print(module, std::cout); peephole.run(module); From f4f70db234b85d14226e3980f02cc0800446ad76 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 17 Oct 2019 12:31:26 -0400 Subject: [PATCH 448/494] [codegen] [selection] re-arranged file structure --- include/triton/codegen/selection.h | 14 +- include/triton/codegen/selection/generator.h | 169 ++++++ .../triton/codegen/selection/machine_layout.h | 138 +++++ .../triton/codegen/selection/machine_value.h | 153 ++++++ include/triton/codegen/selection/selection.h | 70 +++ .../{selection.cc => selection/generator.cc} | 509 +----------------- lib/codegen/selection/machine_layout.cc | 308 +++++++++++ lib/codegen/selection/machine_value.cc | 206 +++++++ lib/codegen/selection/selection.cc | 20 + lib/runtime/function.cc | 2 +- 10 files changed, 1091 insertions(+), 498 deletions(-) create mode 100644 include/triton/codegen/selection/generator.h create mode 100644 include/triton/codegen/selection/machine_layout.h create mode 100644 include/triton/codegen/selection/machine_value.h create mode 100644 include/triton/codegen/selection/selection.h rename lib/codegen/{selection.cc => selection/generator.cc} (69%) create mode 100644 lib/codegen/selection/machine_layout.cc create mode 100644 lib/codegen/selection/machine_value.cc create mode 100644 lib/codegen/selection/selection.cc diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h index f93b53886..da6399573 100644 --- a/include/triton/codegen/selection.h +++ b/include/triton/codegen/selection.h @@ -178,7 +178,7 @@ public: class machine_layout_distributed_t: public machine_layout_t { public: machine_layout_distributed_t(Module *mod, Builder *builder, target *tgt, Type *ty, - std::map& axes, + analysis::axes *a_axes, std::map& axes, analysis::layout_t* layout); tile* create(ir::value *v); @@ -186,6 +186,7 @@ public: Builder *builder_; target *tgt_; Type *ty_; + analysis::axes *a_axes_; std::map& axes_; analysis::layout_t* layout_; }; @@ -195,7 +196,7 @@ class machine_layout_hmma_884_t: public machine_layout_distributed_t { public: machine_layout_hmma_884_t(Module *mod, Builder *builder, target *tgt, Type *ty, - std::map& axes, + analysis::axes *a_axes, std::map& axes, analysis::layout_hmma_884_t* layout); Value *offset_a_i_, *offset_a_k_; Value *offset_b_j_, *offset_b_k_; @@ -209,7 +210,7 @@ class machine_layout_scanline_t: public machine_layout_distributed_t { public: machine_layout_scanline_t(Module *mod, Builder *builder, target *tgt, Type *ty, - std::map& axes, + analysis::axes *a_axes, std::map& axes, analysis::layout_scanline_t* layout); }; @@ -230,6 +231,7 @@ private: public: generator(Module *dst, + analysis::axes *a_axes, target *tgt, analysis::layout *layouts, analysis::align *alignment, 
@@ -298,6 +300,7 @@ private: Module *mod_; std::map machine_layouts_; + analysis::axes *a_axes_; std::map axes_; std::map vmap_; std::map tmap_; @@ -319,10 +322,10 @@ class selection{ public: selection(analysis::liveness* liveness, analysis::allocation *alloc, - analysis::align *alignment, + analysis::align *alignment, analysis::axes *axes, analysis::layout *layouts, target *tgt, unsigned num_warps) : liveness_(liveness), alloc_(alloc), - alignment_(alignment), layouts_(layouts), + alignment_(alignment), a_axes_(axes), layouts_(layouts), tgt_(tgt), num_warps_(num_warps){ } void run(ir::module &src, Module &dst); @@ -330,6 +333,7 @@ public: private: analysis::liveness *liveness_; analysis::allocation *alloc_; + analysis::axes *a_axes_; analysis::layout *layouts_; analysis::align *alignment_; target *tgt_; diff --git a/include/triton/codegen/selection/generator.h b/include/triton/codegen/selection/generator.h new file mode 100644 index 000000000..76ec88b90 --- /dev/null +++ b/include/triton/codegen/selection/generator.h @@ -0,0 +1,169 @@ +#pragma once + +#ifndef _TRITON_SELECTION_GENERATOR_H_ +#define _TRITON_SELECTION_GENERATOR_H_ + +#include "triton/ir/visitor.h" +#include "triton/codegen/analysis/layout.h" +#include "triton/codegen/selection/machine_value.h" +#include + +// forward +namespace llvm{ + class Type; + class Value; + class Instruction; + class Constant; + class LLVMContext; + class Module; + class ConstantFolder; + class IRBuilderDefaultInserter; + template + class IRBuilder; + class ArrayType; + class Function; +} + +namespace triton{ +namespace codegen{ + +// forward +namespace analysis{ +class liveness; +class tiles; +class align; +class allocation; +class cts; +class axes; +class layout; +} +// typedef +typedef llvm::IRBuilder Builder; +typedef llvm::LLVMContext LLVMContext; +typedef llvm::Type Type; +typedef llvm::Value Value; +typedef llvm::Module Module; +typedef llvm::Instruction Instruction; +typedef llvm::Constant Constant; +typedef llvm::ArrayType ArrayType; +typedef llvm::Function Function; +typedef std::vector indices_t; +// forward +class machine_layout_t; +class tile; +class shared_tile; +class distributed_tile; +class target; + +} +} + +namespace triton{ +namespace codegen{ + + +class generator: public ir::visitor, public analysis::layout_visitor { +private: + void for_each(ir::value *x, const std::function& fn); + Value* get_value(ir::value *x, const indices_t& idx); + void set_value(ir::value *x, const indices_t& idx, Value* v); + + void visit_hmma_dot(ir::dot_inst*, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK); + void visit_scanline_dot(ir::dot_inst*, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK, Type *c_ty, Function *f_mul_add); + void visit_outer_dot(ir::dot_inst*, distributed_tile *TA, distributed_tile *TB, distributed_tile *TD, unsigned NK, + Type *c_ty, Function *f_mul_add); + + void finalize_shared_layout(analysis::layout_shared_t*); + void finalize_function(ir::function*); + void finalize_phi_node(ir::phi_node*); + +public: + generator(Module *dst, + analysis::axes *a_axes, + target *tgt, + analysis::layout *layouts, + analysis::align *alignment, + analysis::allocation *alloc, + unsigned num_warps); + + void visit_value(ir::value* v); + + void visit_phi_node(ir::phi_node*); + void visit_binary_operator(ir::binary_operator*); + void visit_getelementptr_inst(ir::getelementptr_inst*); + + void visit_icmp_inst(ir::icmp_inst*); + void visit_fcmp_inst(ir::fcmp_inst*); + void visit_cast_inst(ir::cast_inst*); + + 
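/* Dispatch note: generator implements ir::visitor, so lowering an
   instruction is double dispatch. visit_value(v) calls v->accept(this)
   (see lib/ir/value.cc), which lands in the matching visit_* overload
   declared here. */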
void visit_return_inst(ir::return_inst*); + void visit_cond_branch_inst(ir::cond_branch_inst*); + void visit_uncond_branch_inst(ir::uncond_branch_inst*); + + + void visit_unmasked_load_inst(ir::unmasked_load_inst*); + void visit_masked_load_inst(ir::masked_load_inst*); + void visit_unmasked_store_inst(ir::unmasked_store_inst*); + void visit_masked_store_inst(ir::masked_store_inst*); + + void visit_reshape_inst(ir::reshape_inst*); + void visit_splat_inst(ir::splat_inst*); + void visit_broadcast_inst(ir::broadcast_inst*); + void visit_downcast_inst(ir::downcast_inst*); + + void visit_get_program_id_inst(ir::get_program_id_inst*); + void visit_get_num_program_inst(ir::get_num_program_inst*); + void visit_atomic_cas_inst(ir::atomic_cas_inst*); + void visit_atomic_exch_inst(ir::atomic_exch_inst*); + void visit_atomic_add_inst(ir::atomic_add_inst*); + void visit_dot_inst(ir::dot_inst*); + void visit_trans_inst(ir::trans_inst*); + void visit_sqrt_inst(ir::sqrt_inst*); + void visit_reduce_inst(ir::reduce_inst*); + void visit_select_inst(ir::select_inst*); + + void visit_copy_to_shared_inst(ir::copy_to_shared_inst*); + void visit_copy_from_shared_inst(ir::copy_from_shared_inst*); + void visit_barrier_inst(ir::barrier_inst*); + void visit_make_range_dyn(ir::make_range_dyn*); + void visit_make_range(ir::make_range*); + + void visit_make_range_sta(ir::make_range_sta*); + void visit_undef_value(ir::undef_value*); + void visit_constant_int(ir::constant_int*); + void visit_constant_fp(ir::constant_fp*); + void visit_alloc_const(ir::alloc_const*); + + void visit_function(ir::function*); + void visit_basic_block(ir::basic_block*); + void visit_argument(ir::argument*); + + void visit_layout_hmma_884(analysis::layout_hmma_884_t*); + void visit_layout_scanline(analysis::layout_scanline_t*); + void visit_layout_shared(analysis::layout_shared_t*); + +private: + LLVMContext *ctx_; + Builder* builder_; + Module *mod_; + + std::map machine_layouts_; + analysis::axes *a_axes_; + std::map axes_; + std::map vmap_; + std::map tmap_; + target *tgt_; + analysis::layout *layouts_; + analysis::align *alignment_; + analysis::allocation *alloc_; + Value *sh_mem_ptr_; + unsigned num_warps_; + + std::set seen_; +}; + +} +} + +#endif diff --git a/include/triton/codegen/selection/machine_layout.h b/include/triton/codegen/selection/machine_layout.h new file mode 100644 index 000000000..a3b453995 --- /dev/null +++ b/include/triton/codegen/selection/machine_layout.h @@ -0,0 +1,138 @@ +#pragma once + +#ifndef _TRITON_SELECTION_MACHINE_LAYOUT_H_ +#define _TRITON_SELECTION_MACHINE_LAYOUT_H_ + +#include +#include "triton/codegen/analysis/layout.h" + +namespace llvm{ + class Type; + class Value; + class Instruction; + class Constant; + class LLVMContext; + class Module; + class ConstantFolder; + class IRBuilderDefaultInserter; + template + class IRBuilder; + class ArrayType; + class Function; +} + +namespace triton{ + +namespace ir{ +class value; +} + +namespace codegen{ + +namespace analysis{ +class liveness; +class tiles; +class align; +class allocation; +class cts; +class axes; +class layout; +} + +typedef llvm::IRBuilder Builder; +typedef llvm::LLVMContext LLVMContext; +typedef llvm::Type Type; +typedef llvm::Value Value; +typedef llvm::Module Module; +typedef llvm::Instruction Instruction; +typedef llvm::Constant Constant; +typedef llvm::ArrayType ArrayType; +typedef llvm::Function Function; + +class distributed_axis; +class machine_layout_t; +class tile; +class shared_tile; +class distributed_tile; +class target; + +} +} + 
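/* machine_layout_t below is a small per-layout factory: the generator
   builds one machine layout per analysis::layout_t up front and then asks
   it for a concrete tile for each ir::value it lowers. A minimal usage
   sketch, using the generator's own members:

     machine_layout_t *ml = machine_layouts_.at(layouts_->get(v));
     tile *t = ml->create(v);  // shared_tile or distributed_tile
*/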
+namespace triton{
+namespace codegen{
+
+
+class machine_layout_t {
+public:
+  virtual tile* create(ir::value *v) = 0;
+};
+
+class machine_layout_shared_t: public machine_layout_t {
+public:
+  machine_layout_shared_t(Module *mod, Builder *builder, target *tgt, analysis::allocation* alloc, Value *&sh_mem_ptr, analysis::layout_t* layout,
+                          std::map<ir::value *, Value *>& vmap,
+                          std::map<ir::value *, tile *>& tmap);
+
+  tile* create(ir::value *v);
+
+  Module *mod_;
+  Builder *builder_;
+  target *tgt_;
+  analysis::allocation* alloc_;
+  Value *&sh_mem_ptr_;
+  analysis::layout_t* layout_;
+  std::map<ir::value *, Value *>& vmap_;
+  std::map<ir::value *, tile *>& tmap_;
+
+  Value *offset_;
+  Value *ptr_;
+  Value *pre_ptr_;
+  Value *next_ptr_;
+
+};
+
+class machine_layout_distributed_t: public machine_layout_t {
+public:
+  machine_layout_distributed_t(Module *mod, Builder *builder, target *tgt, Type *ty,
+                               analysis::axes *a_axes, std::map<unsigned, distributed_axis>& axes,
+                               analysis::layout_t* layout);
+
+  tile* create(ir::value *v);
+  Module *mod_;
+  Builder *builder_;
+  target *tgt_;
+  Type *ty_;
+  analysis::axes *a_axes_;
+  std::map<unsigned, distributed_axis>& axes_;
+  analysis::layout_t* layout_;
+};
+
+
+class machine_layout_hmma_884_t: public machine_layout_distributed_t {
+public:
+  machine_layout_hmma_884_t(Module *mod, Builder *builder,
+                            target *tgt, Type *ty,
+                            analysis::axes *a_axes, std::map<unsigned, distributed_axis>& axes,
+                            analysis::layout_hmma_884_t* layout);
+  Value *offset_a_i_, *offset_a_k_;
+  Value *offset_b_j_, *offset_b_k_;
+  unsigned pack_size_0_;
+  unsigned pack_size_1_;
+  unsigned num_packs_0_;
+  unsigned num_packs_1_;
+};
+
+class machine_layout_scanline_t: public machine_layout_distributed_t {
+public:
+  machine_layout_scanline_t(Module *mod, Builder *builder,
+                            target *tgt, Type *ty,
+                            analysis::axes *a_axes, std::map<unsigned, distributed_axis>& axes,
+                            analysis::layout_scanline_t* layout);
+};
+
+}
+}
+
+#endif
diff --git a/include/triton/codegen/selection/machine_value.h b/include/triton/codegen/selection/machine_value.h
new file mode 100644
index 000000000..508881fd3
--- /dev/null
+++ b/include/triton/codegen/selection/machine_value.h
@@ -0,0 +1,153 @@
+#pragma once
+
+#ifndef _TRITON_SELECTION_MACHINE_VALUE_H_
+#define _TRITON_SELECTION_MACHINE_VALUE_H_
+
+#include <vector>
+#include <map>
+#include <functional>
+
+namespace llvm{
+  class Type;
+  class Value;
+  class Instruction;
+  class Constant;
+  class LLVMContext;
+  class Module;
+  class ConstantFolder;
+  class IRBuilderDefaultInserter;
+  template <typename T, typename Inserter>
+  class IRBuilder;
+  class ArrayType;
+  class Function;
+}
+
+namespace triton{
+namespace codegen{
+  typedef llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter> Builder;
+  typedef llvm::LLVMContext LLVMContext;
+  typedef llvm::Type Type;
+  typedef llvm::Value Value;
+  typedef llvm::Module Module;
+  typedef llvm::Instruction Instruction;
+  typedef llvm::Constant Constant;
+  typedef llvm::ArrayType ArrayType;
+  typedef llvm::Function Function;
+}
+}
+
+namespace triton{
+namespace codegen{
+
+namespace analysis{
+class liveness;
+class tiles;
+class align;
+class allocation;
+class cts;
+class axes;
+class layout;
+}
+
+class distributed_axis;
+class machine_layout_t;
+class tile;
+class shared_tile;
+class distributed_tile;
+class target;
+typedef std::vector<Value*> indices_t;
+
+}
+}
+
+namespace triton{
+namespace codegen{
+
+struct distributed_axis {
+  int contiguous;
+  std::vector<Value*> values;
+  Value* thread_id;
+};
+
+class tile {
+protected:
+  typedef std::vector<unsigned> shapes_t;
+
+public:
+  tile(Type *ty, const shapes_t &shapes): ty_(ty), shapes_(shapes){ }
+  virtual void set_value(indices_t idx, Value *v) = 0;
+  virtual Value* get_value(indices_t idx) = 0;
+  Type *get_ty() const { return ty_; }
+  shapes_t get_shapes() const { return shapes_; }
+
+protected:
+  Type *ty_;
+  shapes_t shapes_;
+};
+
+class shared_tile: public tile {
+private:
+  void extract_constant(Value *arg, Value *&non_cst, Value *&cst);
+  void extract_constant(const indices_t &arg_idx, indices_t &non_cst_idx, indices_t &cst_idx);
+
+
+public:
+  shared_tile(Type* ty, const shapes_t &shapes, const std::vector<int> &order, Value* ptr, Builder &builder, Value* offset = nullptr, const std::vector<int>& perm = {});
+  void set_vector_size(unsigned vector_size);
+  void set_return_mode(bool return_vector);
+  void set_value(indices_t, Value *);
+  Value* get_ptr_to(indices_t idx);
+  Value* get_value(indices_t idx);
+  Value* get_pointer() { return ptr_; }
+  Value* get_offset() { return offset_; }
+  const std::vector<int>& get_perm() { return perm_; }
+  const std::vector<int>& get_order() { return order_; }
+  static Value* shared_offset(Builder& builder, const shapes_t& shapes, const std::vector<int>& perm, const std::vector<int>& order, indices_t idx);
+
+private:
+  Value *ptr_;
+  bool return_vector_;
+  Builder &builder_;
+  Value *offset_;
+  std::map<indices_t, Value*> ptr_cache_;
+  unsigned vector_size_;
+  std::vector<int> order_;
+  std::vector<int> perm_;
+};
+
+// Distributed tile
+class distributed_tile: public tile{
+  typedef std::vector<distributed_axis> axes_t;
+  typedef std::vector<indices_t> ordered_indices_vec_t;
+  typedef std::map<indices_t, unsigned> indices_map_t;
+  typedef std::map<indices_t, Value*> values_map_t;
+
+private:
+  void init_indices();
+  Type *make_vector_ty(Type *ty, size_t vector_size);
+
+public:
+  distributed_tile(Type *ty, const shapes_t& shapes, const std::vector<int>& order, const axes_t &axes, Builder &builder, bool vectorize);
+  void set_value(indices_t idx, Value *v);
+  Value* get_value(indices_t idx);
+  const std::vector<int>& get_order() { return order_; }
+  unsigned get_linear_index(indices_t idx);
+  indices_t get_ordered_indices(unsigned id);
+  void for_each(std::function<void(indices_t)> fn);
+  const distributed_axis &axis(unsigned dim) { return axes_.at(dim); }
+
+private:
+  axes_t axes_;
+  std::vector<int> order_;
+  indices_map_t indices_;
+  values_map_t values_;
+  ordered_indices_vec_t ordered_indices_;
+  size_t vector_size_;
+  Builder &builder_;
+};
+
+}
+}
+
+#endif
diff --git a/include/triton/codegen/selection/selection.h b/include/triton/codegen/selection/selection.h
new file mode 100644
index 000000000..a2b88247f
--- /dev/null
+++ b/include/triton/codegen/selection/selection.h
@@ -0,0 +1,70 @@
+#pragma once
+
+#ifndef _TRITON_SELECTION_SELECTION_H_
+#define _TRITON_SELECTION_SELECTION_H_
+
+#include <map>
+
+namespace llvm{
+  class Module;
+  class Value;
+}
+
+
+namespace triton{
+
+namespace ir{
+class value;
+class module;
+}
+
+namespace codegen{
+// typedef
+typedef llvm::Module Module;
+typedef llvm::Value Value;
+// forward
+namespace analysis{
+class liveness;
+class align;
+class allocation;
+class axes;
+class layout;
+}
+class target;
+class tile;
+
+}
+}
+
+namespace triton{
+namespace codegen{
+
+// Selection pass
+class selection{
+  typedef std::map<ir::value *, Value *> vmap_t;
+  typedef std::map<ir::value *, tile *> tmap_t;
+
+public:
+  selection(analysis::liveness* liveness, analysis::allocation *alloc,
+            analysis::align *alignment, analysis::axes *axes,
+            analysis::layout *layouts, target *tgt, unsigned num_warps)
+    : liveness_(liveness), alloc_(alloc),
+      alignment_(alignment), a_axes_(axes), layouts_(layouts),
+      tgt_(tgt), num_warps_(num_warps){ }
+
+  void run(ir::module &src, Module &dst);
+
+private:
+  analysis::liveness *liveness_;
+  analysis::allocation *alloc_;
+  analysis::axes *a_axes_;
+  analysis::layout *layouts_;
+  analysis::align *alignment_;
+  target *tgt_;
+  unsigned num_warps_;
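/* Usage sketch: the runtime (see lib/runtime/function.cc) constructs this
   pass from the analysis results and calls run(src, dst); run() simply
   instantiates a generator over the destination module and visits every
   constant allocation and every function of the Triton-IR module. */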
+}; + +} +} + +#endif diff --git a/lib/codegen/selection.cc b/lib/codegen/selection/generator.cc similarity index 69% rename from lib/codegen/selection.cc rename to lib/codegen/selection/generator.cc index a4daa2d50..831719b73 100644 --- a/lib/codegen/selection.cc +++ b/lib/codegen/selection/generator.cc @@ -1,9 +1,8 @@ -#include -#include "triton/codegen/selection.h" +#include +#include "triton/codegen/selection/generator.h" +#include "triton/codegen/selection/machine_layout.h" +#include "triton/codegen/selection/machine_value.h" #include "triton/codegen/target.h" -#include "triton/codegen/analysis/liveness.h" -#include "triton/codegen/analysis/layout.h" -#include "triton/codegen/analysis/axes.h" #include "triton/codegen/analysis/allocation.h" #include "triton/codegen/analysis/align.h" #include "triton/codegen/transform/coalesce.h" @@ -12,12 +11,8 @@ #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/type.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Module.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/Transforms/Scalar/EarlyCSE.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/InlineAsm.h" @@ -27,198 +22,6 @@ namespace codegen{ using namespace llvm; -/* Distributed Tile */ -void distributed_tile::init_indices() { - std::vector id(axes_.size(), 0); - // create iteration order - std::vector order(id.size()); - std::iota(order.begin(), order.end(), 0); - auto cmp = [&](int x, int y) { - return axes_[x].contiguous > axes_[y].contiguous; - }; - std::sort(order.begin(), order.end(), cmp); - // build - size_t k = 0; - while(true) { - indices_t current; - for(size_t d = 0; d < id.size(); d++) - current.push_back(axes_[d].values[id[d]]); - size_t sz = indices_.size(); - indices_[current] = sz; - values_[current] = nullptr; - ordered_indices_.push_back(current); - id[order[0]]++; - while(id[order[k]] == axes_[order[k]].values.size()){ - if(k == id.size() - 1) - return; - id[order[k++]] = 0; - id[order[k]]++; - } - k = 0; - } -} - -llvm::Type *distributed_tile::make_vector_ty(llvm::Type *ty, size_t vector_size) { - if(vector_size == 1) - return ty; - return VectorType::get(ty, vector_size); -} - -distributed_tile::distributed_tile(Type *ty, const shapes_t &shapes, const std::vector& order, const axes_t &axes, llvm::IRBuilder<> &builder, bool vectorize) - : tile(make_vector_ty(ty, vectorize?axes[0].contiguous:1), shapes), axes_(axes), order_(order), builder_(builder) { - vector_size_ = vectorize?ty_->getVectorNumElements():1; - init_indices(); -} - -void distributed_tile::set_value(indices_t idx, Value *x) { - assert(x->getType() == ty_ && "cannot set a value of different type"); - Value *&result = values_[idx]; - assert(!result && "value cannot be set twice"); - result = x; -} - -Value* distributed_tile::get_value(indices_t idx) { - Value *result = values_.at(idx); - assert(result && "value has not been set"); - return result; -} - -unsigned distributed_tile::get_linear_index(indices_t idx) { - return indices_[idx]; -} - -indices_t distributed_tile::get_ordered_indices(unsigned id) { - return ordered_indices_.at(id); -} - - -void distributed_tile::for_each(std::function fn) { - for(unsigned i = 0; i < ordered_indices_.size(); i++){ - if(i % vector_size_ == 0) - fn(ordered_indices_[i]); - } -} - -/* Shared Tile */ -void shared_tile::extract_constant(Value *arg, Value *&non_cst, Value *&cst) { - BinaryOperator *bin_op = dyn_cast(arg); - 
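/* extract_constant splits an index expression of the form base + cst into
   its non-constant and constant parts so that get_value can cache one GEP
   per distinct base and fold the constant into the vector offset: for
   idx = add(tid, 5) it yields non_cst = tid and cst = 5, and every index
   sharing the base tid reuses the pointer cached in ptr_cache_. */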
Constant *_0 = ConstantInt::get(Type::getInt32Ty(arg->getContext()), 0); - if(dyn_cast(arg)){ - cst = arg; - non_cst = _0; - return; - } - if(!bin_op || bin_op->getOpcode() != llvm::BinaryOperator::Add){ - non_cst = arg; - cst = _0; - return; - } - Constant *cst_lhs = dyn_cast(bin_op->getOperand(0)); - Constant *cst_rhs = dyn_cast(bin_op->getOperand(1)); - if(cst_lhs && cst_rhs){ - cst = arg; - non_cst = _0; - } - else if(cst_lhs){ - cst = cst_lhs; - non_cst = bin_op->getOperand(1); - } - else if(cst_rhs){ - cst = cst_rhs; - non_cst = bin_op->getOperand(0); - } - else{ - non_cst = arg; - cst = _0; - } -} - -void shared_tile::extract_constant(const indices_t &arg_idx, indices_t &non_cst_idx, indices_t &cst_idx) { - non_cst_idx.clear(); - cst_idx.clear(); - for(Value *idx: arg_idx){ - Value *non_cst, *cst; - extract_constant(idx, non_cst, cst); - non_cst_idx.push_back(non_cst); - cst_idx.push_back(cst); - } -} - - -Value* shared_tile::shared_offset(llvm::IRBuilder<> &builder, const shapes_t& shapes, const std::vector& perm, const std::vector& order, indices_t idx) { - // strides - std::vector strides(order.size()); - strides[order[0]] = builder.getInt32(1); - for(size_t i = 1; i < idx.size(); i++) - strides[order[i]] = builder.CreateMul(strides[order[i-1]], builder.getInt32(shapes[order[i-1]])); - // result - Value *result = builder.getInt32(0); - for(size_t i = 0; i < strides.size(); i++) - result = builder.CreateAdd(result, builder.CreateMul(idx[perm[i]], strides[i])); - return result; -} - -shared_tile::shared_tile(Type *ty, const shapes_t &shapes, const std::vector& order, Value *ptr, llvm::IRBuilder<> &builder, Value *offset, const std::vector& perm): - tile(ty, shapes), order_(order), ptr_(ptr), builder_(builder), offset_(offset), vector_size_(1), perm_(perm){ - return_vector_ = false; - if(perm_.empty()){ - perm_.resize(shapes.size()); - std::iota(perm_.begin(), perm_.end(), 0); - } -} - -void shared_tile::set_value(indices_t idx, Value *value) { - Value *ptr = builder_.CreateGEP(ptr_, shared_offset(builder_, shapes_, perm_, order_, idx)); - unsigned addr_space = ptr->getType()->getPointerAddressSpace(); - ptr = builder_.CreateBitCast(ptr, value->getType()->getPointerTo(addr_space)); - builder_.CreateStore(value, ptr); -} - -void shared_tile::set_vector_size(unsigned vector_size) { - vector_size_ = vector_size; -} - -void shared_tile::set_return_mode(bool return_vector){ - return_vector_ = return_vector; -} - - -Value* shared_tile::get_value(indices_t idx) { - indices_t non_cst_idx, cst_idx; - extract_constant(idx, non_cst_idx, cst_idx); - Value *&base_ptr = ptr_cache_[non_cst_idx]; - unsigned vector_size = vector_size_; - Type *ty = ty_; - if(ty->isHalfTy() && (vector_size % 2 == 0)){ - ty = IntegerType::get(ty->getContext(), 32); - vector_size = vector_size / 2; - } - if(base_ptr == nullptr){ -// BasicBlock* store = builder_.GetInsertBlock(); -// if(!non_cst_idx.empty()) -// if(isa(non_cst_idx.front())){ -// builder_.SetInsertPoint((Instruction*)non_cst_idx.front()); -// } - base_ptr = builder_.CreateGEP(ptr_, shared_offset(builder_, shapes_, perm_, order_, non_cst_idx)); - if(vector_size_ > 1){ - Type *vec_ty = VectorType::get(ty, vector_size); - Type *vec_ptr_ty = PointerType::get(vec_ty, base_ptr->getType()->getPointerAddressSpace()); - base_ptr = builder_.CreateBitCast(base_ptr, vec_ptr_ty); - } -// builder_.SetInsertPoint(store); - } - Value *offset = shared_offset(builder_, shapes_, perm_, order_, cst_idx); - Value *div = offset; - if(vector_size_ > 1) - div = 
builder_.CreateUDiv(offset, builder_.getInt32(vector_size_)); - Value *ptr = builder_.CreateGEP(base_ptr, div); - Value *result = builder_.CreateLoad(ptr); - if(return_vector_ == false && vector_size_ > 1) { - Value *rem = builder_.CreateURem(offset, builder_.getInt32(vector_size_)); - result = builder_.CreateExtractElement(result, rem); - } - return result; -} llvm::Instruction::BinaryOps llvm_op(ir::binary_op_t op) { using llop = llvm::Instruction::BinaryOps; @@ -306,7 +109,7 @@ llvm::CmpInst::Predicate llvm_pred(ir::cmp_pred_t pred) { } -Type *type(ir::type *ty, LLVMContext &ctx) { +inline Type *type(ir::type *ty, LLVMContext &ctx) { // function if(auto* tt = dynamic_cast(ty)){ Type *return_ty = type(tt->get_return_ty(), ctx); @@ -344,34 +147,17 @@ Type *type(ir::type *ty, LLVMContext &ctx) { } -/* ------------------- - * ---- Init Axes ---- - * ------------------- */ - -// Grid construction -std::vector delinearize(Value *trailing, const std::vector& order, std::vector &shapes, IRBuilder<> &builder){ - size_t dim = shapes.size(); - std::vector result(dim); - for(unsigned k = 0; k < dim - 1; k++){ - Constant *dim_k = builder.getInt32(shapes[order[k]]); - Value *rem = builder.CreateURem(trailing, dim_k); - trailing = builder.CreateUDiv(trailing, dim_k); - result[order[k]] = rem; +inline llvm::Attribute llvm_attr(llvm::LLVMContext& ctx, ir::attribute attr) { + switch(attr.get_kind()){ + case ir::noalias: return llvm::Attribute::get(ctx, llvm::Attribute::NoAlias); + case ir::readonly: return llvm::Attribute::get(ctx, llvm::Attribute::ReadOnly); + case ir::writeonly: return llvm::Attribute::get(ctx, llvm::Attribute::WriteOnly); + case ir::aligned: return llvm::Attribute::get(ctx, llvm::Attribute::Alignment, attr.get_value()); + default: throw std::runtime_error("cannot convert ir::attribute_t to llvm::Attribute"); } - result[order[dim - 1]] = trailing; - return result; } -inline int32_t ceil(int32_t num, int32_t div){ - return (num + div - 1)/div; -} - -/* ------------------- - * ---- Init Tiles ---- - * ------------------- */ - - -bool is_trans(ir::value *v) { +inline bool is_trans(ir::value *v) { if(dynamic_cast(v)) { return true; } @@ -386,34 +172,9 @@ bool is_trans(ir::value *v) { -/* ---------------------------- - * ---- Generate LLVM code ---- - * ---------------------------- */ - -inline llvm::Attribute llvm_attr(llvm::LLVMContext& ctx, ir::attribute attr) { - switch(attr.get_kind()){ - case ir::noalias: return llvm::Attribute::get(ctx, llvm::Attribute::NoAlias); - case ir::readonly: return llvm::Attribute::get(ctx, llvm::Attribute::ReadOnly); - case ir::writeonly: return llvm::Attribute::get(ctx, llvm::Attribute::WriteOnly); - case ir::aligned: return llvm::Attribute::get(ctx, llvm::Attribute::Alignment, attr.get_value()); - default: throw std::runtime_error("cannot convert ir::attribute_t to llvm::Attribute"); - } -} - - -void selection::run(ir::module &src, Module &dst) { - // create tile - generator gen(&dst, tgt_, layouts_, alignment_, alloc_, num_warps_ ); - - for(ir::alloc_const *x: src.allocs()) - gen.visit_value(x); - for(ir::function *fn: src.get_function_list()) - gen.visit_value(fn); -} - - generator::generator(Module *dst, + analysis::axes *a_axes, target *tgt, analysis::layout *layouts, analysis::align *alignment, @@ -421,7 +182,7 @@ generator::generator(Module *dst, unsigned num_warps) : ctx_(&dst->getContext()), mod_(dst), builder_(new Builder(dst->getContext())), - tgt_(tgt), + a_axes_(a_axes), tgt_(tgt), layouts_(layouts), alignment_(alignment), alloc_(alloc), 
num_warps_(num_warps) { @@ -1163,14 +924,12 @@ void generator::visit_function(ir::function* fn) { - - void generator::visit_layout_hmma_884(analysis::layout_hmma_884_t* layout) { - machine_layouts_[layout] = new machine_layout_hmma_884_t(mod_, &*builder_, tgt_, type(layout->ty->get_scalar_ty(), *ctx_), axes_, layout); + machine_layouts_[layout] = new machine_layout_hmma_884_t(mod_, &*builder_, tgt_, type(layout->ty->get_scalar_ty(), *ctx_), a_axes_, axes_, layout); } void generator::visit_layout_scanline(analysis::layout_scanline_t* layout) { - machine_layouts_[layout] = new machine_layout_scanline_t(mod_, &*builder_, tgt_, type(layout->ty->get_scalar_ty(), *ctx_), axes_, layout); + machine_layouts_[layout] = new machine_layout_scanline_t(mod_, &*builder_, tgt_, type(layout->ty->get_scalar_ty(), *ctx_), a_axes_, axes_, layout); } void generator::visit_layout_shared(analysis::layout_shared_t* layout) { @@ -1215,240 +974,6 @@ void generator::set_value(ir::value *x, const indices_t& idx, Value* v) { } - -machine_layout_shared_t::machine_layout_shared_t(Module *mod, Builder *builder, target *tgt, analysis::allocation* alloc, - Value *&sh_mem_ptr, analysis::layout_t *layout, - std::map& vmap, - std::map& tmap) - : mod_(mod), builder_(builder), tgt_(tgt), alloc_(alloc), sh_mem_ptr_(sh_mem_ptr), layout_(layout), vmap_(vmap), tmap_(tmap) { - - auto order = layout_->order; - auto shapes = layout_->shapes; - shapes[order[0]] += layout_->pad; - - Type* ty = type(layout_->ty, builder_->getContext()); - - PointerType *ptr_ty = ty->getPointerTo(sh_mem_ptr_->getType()->getPointerAddressSpace()); - // double-buffered - if(layout_->double_buffer) { - BasicBlock *current = builder_->GetInsertBlock(); - auto info = *layout_->double_buffer; - ir::phi_node *phi = info.phi; - BasicBlock *parent = (BasicBlock*)vmap_.at(phi->get_parent()); - if(parent->empty()) - builder_->SetInsertPoint(parent); - else - builder_->SetInsertPoint(&*parent->getFirstNonPHI()); - // create pointers - ptr_ = builder_->CreatePHI(ptr_ty, 2); - pre_ptr_ = builder_->CreateGEP(sh_mem_ptr_, builder_->getInt32(alloc_->offset(layout_))); - pre_ptr_ = builder_->CreateBitCast(pre_ptr_, ptr_->getType()); - offset_ = builder_->CreatePHI(builder_->getInt32Ty(), 2); - next_ptr_ = builder_->CreateGEP(ptr_, offset_, "next_ptr"); - builder_->SetInsertPoint(current); - } - else{ - size_t offset = alloc_->offset(layout_); - ptr_ = builder_->CreateGEP(sh_mem_ptr_, builder_->getInt32(offset)); - ptr_ = builder_->CreateBitCast(ptr_, ptr_ty); - } -} - - -tile* machine_layout_shared_t::create(ir::value *v) { - auto order = layout_->order; - auto shapes = layout_->shapes; - shapes[order[0]] += layout_->pad; - Type* ty = type(layout_->ty, builder_->getContext()); - // double-buffered - if(layout_->double_buffer) { - if(v == layout_->double_buffer->phi) - return new shared_tile(ty, shapes, order, ptr_, *builder_, offset_); - if(v == layout_->double_buffer->latch) - return new shared_tile(ty, shapes, order, next_ptr_, *builder_); - return new shared_tile(ty, shapes, order, pre_ptr_, *builder_); - } - else { - return new shared_tile(ty, shapes, order, ptr_, *builder_); - } -} - -machine_layout_distributed_t::machine_layout_distributed_t(Module *mod, Builder *builder, target *tgt, Type *ty, - std::map& axes, - analysis::layout_t *layout) - : mod_(mod), builder_(builder), tgt_(tgt), ty_(ty), axes_(axes), layout_(layout) { - -} - -tile *machine_layout_distributed_t::create(ir::value *v) { - Type *ty = type(v->get_type()->get_scalar_ty(), builder_->getContext()); - 
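/* create() binds one distributed_axis per tile dimension: a dimension of
   extent > 1 takes the axis computed for layout->axes[d], while a size-1
   dimension gets a trivial axis pinned to the constant 0, so e.g. a
   128x1 tile is only really indexed along its first dimension. */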
const auto &shapes = v->get_type()->get_tile_shapes(); - std::vector axes(shapes.size()); - for(size_t d = 0; d < shapes.size(); d++){ - if(shapes[d] > 1){ - unsigned x = layout_->axes[d]; - axes[d] = axes_.at(x); - } - else{ - axes[d].contiguous = 1; - axes[d].values = {builder_->getInt32(0)}; - } - } - return new distributed_tile(ty, shapes, layout_->order, axes, *builder_, false); -} - -machine_layout_hmma_884_t::machine_layout_hmma_884_t(Module *mod, Builder *builder, - target *tgt, Type *ty, - std::map& axes, - analysis::layout_hmma_884_t* layout) - : machine_layout_distributed_t(mod, builder, tgt, ty, axes, layout) { - - Value *warp_size = builder_->getInt32(32); - Value* u_thread_id_0 = tgt_->get_local_id(mod_, *builder_, 0); - Value *u_thread_id = builder_->CreateURem(u_thread_id_0, warp_size); - Value *u_warp_id = builder_->CreateUDiv(u_thread_id_0, warp_size); - - const auto& shapes = layout->shapes; - if(shapes.size() > 3) - throw std::runtime_error("unsupported"); - - bool is_batched = shapes.size() >= 3; - - Value *_1 = builder_->getInt32(1); - Value *_2 = builder_->getInt32(2); - Value *_3 = builder_->getInt32(3); - Value *_4 = builder_->getInt32(4); - Value *_16 = builder_->getInt32(16); - - // fragments per warp - unsigned fpw_0 = layout->fpw.at(0); - unsigned fpw_1 = layout->fpw.at(1); - unsigned fpw_2 = is_batched ? layout->fpw.at(2) : 1; - // warps per tile - unsigned wpt_0 = layout->wpt.at(0); - unsigned wpt_1 = layout->wpt.at(1); - unsigned wpt_2 = is_batched ? layout->wpt.at(2) : 1; - // hmma warp tile size - unsigned hmma_wts_0 = fpw_0 * 8; - unsigned hmma_wts_1 = fpw_1 * 8; - unsigned hmma_wts_2 = is_batched ? fpw_2 : 1; - // hmma block tile size - unsigned hmma_bts_0 = hmma_wts_0 * wpt_0; - unsigned hmma_bts_1 = hmma_wts_1 * wpt_1; - unsigned hmma_bts_2 = is_batched ? hmma_wts_2 * wpt_2 : 1; - // number of repetition - unsigned num_rep_0 = shapes[0] / hmma_bts_0; - unsigned num_rep_1 = shapes[1] / hmma_bts_1; - unsigned num_rep_2 = is_batched ? 
shapes[2] / hmma_bts_2 : 1; - // size of each pack (interleaving) - pack_size_0_ = std::min(num_rep_0, 1); - pack_size_1_ = std::min(num_rep_1, 1); - // number of packs (interleaving) - num_packs_0_ = num_rep_0 / pack_size_0_; - num_packs_1_ = num_rep_1 / pack_size_1_; - - /* intra warp offset */ - // offset of quad in pair - Value *in_pair_off_a = builder_->CreateMul(builder_->CreateUDiv(builder_->CreateAnd(u_thread_id, _16), builder_->getInt32(4)), - builder_->getInt32(fpw_0 * pack_size_0_)); - Value *in_pair_off_b = builder_->CreateMul(builder_->CreateUDiv(builder_->CreateAnd(u_thread_id, _16), builder_->getInt32(4)), - builder_->getInt32(fpw_1 * pack_size_1_)); - - // Quad pair id - Value *pair_a_id = builder_->CreateUDiv(builder_->CreateURem(u_thread_id, _16), _4); - Value *pair_b_id = builder_->CreateUDiv(builder_->CreateURem(u_thread_id, _16), _4); - pair_a_id = builder_->CreateURem(pair_a_id, builder_->getInt32(fpw_0)); - pair_b_id = builder_->CreateUDiv(pair_b_id, builder_->getInt32(fpw_0)); - pair_b_id = builder_->CreateURem(pair_b_id, builder_->getInt32(fpw_1)); - // Quad pair offset - Value *pair_a_off = builder_->CreateMul(pair_a_id, builder_->getInt32(4 * pack_size_0_)); - Value *pair_b_off = builder_->CreateMul(pair_b_id, builder_->getInt32(4 * pack_size_1_)); - - /* inter warp offset */ - Value *warp_id_0 = builder_->CreateURem(u_warp_id, builder_->getInt32(wpt_0)); - Value *warp_id_12 = builder_->CreateUDiv(u_warp_id, builder_->getInt32(wpt_0)); - Value *warp_id_1 = builder_->CreateURem(warp_id_12, builder_->getInt32(wpt_1)); - Value *warp_id_2 = builder_->CreateUDiv(warp_id_12, builder_->getInt32(wpt_1)); - Value *warp_offset_i = builder_->CreateMul(warp_id_0, builder_->getInt32(hmma_wts_0 * pack_size_0_)); - Value *warp_offset_j = builder_->CreateMul(warp_id_1, builder_->getInt32(hmma_wts_1 * pack_size_1_)); - - /* offsets */ - // a offset - offset_a_i_ = builder_->CreateAdd(warp_offset_i, builder_->CreateAdd(pair_a_off, in_pair_off_a)); - offset_a_k_ = builder_->CreateAnd(u_thread_id, _3); - // b offsets - offset_b_j_ = builder_->CreateAdd(warp_offset_j, builder_->CreateAdd(pair_b_off, in_pair_off_b)); - offset_b_k_ = builder_->CreateAnd(u_thread_id, _3); - - // c offsets - Value *offset_c_i = builder_->CreateAdd(builder_->CreateAnd(u_thread_id, _1), offset_a_i_); - Value *offset_c_j = builder_->CreateAdd(builder_->CreateAnd(u_thread_id, _2), - builder_->CreateAdd(warp_offset_j, pair_b_off)); - - /* indices */ - // i indices - std::vector idx_i; - for(unsigned pack = 0; pack < num_packs_0_; pack++) - for(unsigned ii = 0; ii < pack_size_0_; ii++) - for(unsigned i = 0; i < 2; i++){ - idx_i.push_back(builder_->CreateAdd(offset_c_i, builder_->getInt32(pack*hmma_bts_0*pack_size_0_ + ii*4 + i*2))); - } - // j indices - std::vector idx_j; - for(unsigned pack = 0; pack < num_packs_1_; pack++) - for(unsigned jj = 0; jj < pack_size_1_; jj++) - for(unsigned j = 0; j < 2; j++){ - idx_j.push_back(builder_->CreateAdd(offset_c_j, builder_->getInt32(pack*hmma_bts_1*pack_size_1_ + jj*4 + j*4*fpw_1*pack_size_1_))); - idx_j.push_back(builder_->CreateAdd(offset_c_j, builder_->getInt32(pack*hmma_bts_1*pack_size_1_ + jj*4 + j*4*fpw_1*pack_size_1_ + 1))); - } - // z indices - std::vector idx_z; - for(unsigned pack = 0; pack < num_rep_2; pack++) - idx_z.push_back(builder_->CreateAdd(warp_id_2, builder_->getInt32(pack*hmma_bts_2))); - - - /* axes */ - axes_[layout->axes[0]] = distributed_axis{1, idx_i, warp_id_0}; - axes_[layout->axes[1]] = distributed_axis{1, idx_j, warp_id_1}; - 
if(is_batched) - axes_[layout->axes[2]] = distributed_axis{1, idx_z, warp_id_2}; -} - - -machine_layout_scanline_t::machine_layout_scanline_t(Module *mod, Builder *builder, - target *tgt, Type *ty, - std::map &axes, - analysis::layout_scanline_t* layout) - : machine_layout_distributed_t(mod, builder, tgt, ty, axes, layout) { - - Value *warp_size = builder_->getInt32(32); - Value* u_thread_id_0 = tgt_->get_local_id(mod_, *builder_, 0); - Value *u_thread_id = builder_->CreateURem(u_thread_id_0, warp_size); - Value *u_warp_id = builder_->CreateUDiv(u_thread_id_0, warp_size); - - auto order = layout->order; - const auto& shapes = layout->shapes; - size_t dim = shapes.size(); - std::vector nts = layout->nts; - std::vector mts = layout->mts; - Value* full_thread_id = builder_->CreateAdd(builder_->CreateMul(u_warp_id, builder_->getInt32(32)), u_thread_id); - std::vector thread_id = delinearize(full_thread_id, order, mts, *builder_); - // Create axes - for(unsigned k = 0; k < dim; k++) { - std::string str_k = std::to_string(k); - Value *contiguous_k = builder_->getInt32(nts[k]); - Value *scaled_thread_id = builder_->CreateMul(thread_id[k], contiguous_k); - unsigned per_block = nts[k] * mts[k]; - unsigned per_thread = nts[k] * shapes[k] / per_block; - std::vector idx_list(per_thread); - for(unsigned n = 0 ; n < per_thread; n++){ - unsigned offset = n / nts[k] * per_block + n % nts[k]; - idx_list[n] = builder_->CreateAdd(scaled_thread_id, builder_->getInt32(offset), "idx_" + str_k + "_" + std::to_string(n)); - } - axes_[layout->axes[k]] = distributed_axis{nts[k], idx_list, thread_id[k]}; - } -} - void generator::finalize_shared_layout(analysis::layout_shared_t *shared) { if(shared->double_buffer) { auto info = *shared->double_buffer; diff --git a/lib/codegen/selection/machine_layout.cc b/lib/codegen/selection/machine_layout.cc new file mode 100644 index 000000000..1e6f0d5da --- /dev/null +++ b/lib/codegen/selection/machine_layout.cc @@ -0,0 +1,308 @@ +#include +#include "triton/codegen/selection/machine_layout.h" +#include "triton/codegen/selection/machine_value.h" +#include "triton/codegen/analysis/allocation.h" +#include "triton/codegen/analysis/axes.h" +#include "triton/codegen/target.h" +#include "triton/ir/instructions.h" +#include "triton/ir/type.h" +#include "llvm/IR/IRBuilder.h" + +namespace triton{ +namespace codegen{ + +using namespace llvm; + +inline Type *type(ir::type *ty, LLVMContext &ctx) { + // function + if(auto* tt = dynamic_cast(ty)){ + Type *return_ty = type(tt->get_return_ty(), ctx); + std::vector param_tys; + std::transform(tt->params_begin(), tt->params_end(), std::back_inserter(param_tys), + [&ctx](ir::type* t){ return type(t, ctx);}); + return FunctionType::get(return_ty, param_tys, false); + } + // pointer + if(ty->is_pointer_ty()){ + Type *elt_ty = type(ty->get_pointer_element_ty(), ctx); + unsigned addr_space = ty->get_pointer_address_space(); + return PointerType::get(elt_ty, addr_space); + } + // integer + if(ty->is_integer_ty()){ + unsigned bitwidth = ty->get_integer_bitwidth(); + return IntegerType::get(ctx, bitwidth); + } + // primitive types + switch(ty->get_type_id()){ + case ir::type::VoidTyID: return Type::getVoidTy(ctx); + case ir::type::HalfTyID: return Type::getHalfTy(ctx); + case ir::type::FloatTyID: return Type::getFloatTy(ctx); + case ir::type::DoubleTyID: return Type::getDoubleTy(ctx); + case ir::type::X86_FP80TyID: return Type::getX86_FP80Ty(ctx); + case ir::type::PPC_FP128TyID: return Type::getPPC_FP128Ty(ctx); + case ir::type::LabelTyID: return 
Type::getLabelTy(ctx); + case ir::type::MetadataTyID: return Type::getMetadataTy(ctx); + case ir::type::TokenTyID: return Type::getTokenTy(ctx); + default: break; + } + // unknown type + throw std::runtime_error("unknown conversion from ir::type to Type"); +} + +// Grid construction +inline std::vector delinearize(Value *trailing, const std::vector& order, std::vector &shapes, IRBuilder<> &builder){ + size_t dim = shapes.size(); + std::vector result(dim); + for(unsigned k = 0; k < dim - 1; k++){ + Constant *dim_k = builder.getInt32(shapes[order[k]]); + Value *rem = builder.CreateURem(trailing, dim_k); + trailing = builder.CreateUDiv(trailing, dim_k); + result[order[k]] = rem; + } + result[order[dim - 1]] = trailing; + return result; +} + +inline int32_t ceil(int32_t num, int32_t div){ + return (num + div - 1)/div; +} + + + +machine_layout_shared_t::machine_layout_shared_t(Module *mod, Builder *builder, target *tgt, analysis::allocation* alloc, + Value *&sh_mem_ptr, analysis::layout_t *layout, + std::map& vmap, + std::map& tmap) + : mod_(mod), builder_(builder), tgt_(tgt), alloc_(alloc), sh_mem_ptr_(sh_mem_ptr), layout_(layout), vmap_(vmap), tmap_(tmap) { + + auto order = layout_->order; + auto shapes = layout_->shapes; + shapes[order[0]] += layout_->pad; + + Type* ty = type(layout_->ty, builder_->getContext()); + + PointerType *ptr_ty = ty->getPointerTo(sh_mem_ptr_->getType()->getPointerAddressSpace()); + // double-buffered + if(layout_->double_buffer) { + BasicBlock *current = builder_->GetInsertBlock(); + auto info = *layout_->double_buffer; + ir::phi_node *phi = info.phi; + BasicBlock *parent = (BasicBlock*)vmap_.at((ir::value*)(phi->get_parent())); + if(parent->empty()) + builder_->SetInsertPoint(parent); + else + builder_->SetInsertPoint(&*parent->getFirstNonPHI()); + // create pointers + ptr_ = builder_->CreatePHI(ptr_ty, 2); + pre_ptr_ = builder_->CreateGEP(sh_mem_ptr_, builder_->getInt32(alloc_->offset(layout_))); + pre_ptr_ = builder_->CreateBitCast(pre_ptr_, ptr_->getType()); + offset_ = builder_->CreatePHI(builder_->getInt32Ty(), 2); + next_ptr_ = builder_->CreateGEP(ptr_, offset_, "next_ptr"); + builder_->SetInsertPoint(current); + } + else{ + size_t offset = alloc_->offset(layout_); + ptr_ = builder_->CreateGEP(sh_mem_ptr_, builder_->getInt32(offset)); + ptr_ = builder_->CreateBitCast(ptr_, ptr_ty); + } +} + + +tile* machine_layout_shared_t::create(ir::value *v) { + auto order = layout_->order; + auto shapes = layout_->shapes; + shapes[order[0]] += layout_->pad; + Type* ty = type(layout_->ty, builder_->getContext()); + // double-buffered + if(layout_->double_buffer) { + if(v == layout_->double_buffer->phi) + return new shared_tile(ty, shapes, order, ptr_, *builder_, offset_); + if(v == layout_->double_buffer->latch) + return new shared_tile(ty, shapes, order, next_ptr_, *builder_); + return new shared_tile(ty, shapes, order, pre_ptr_, *builder_); + } + else { + return new shared_tile(ty, shapes, order, ptr_, *builder_); + } +} + +machine_layout_distributed_t::machine_layout_distributed_t(Module *mod, Builder *builder, target *tgt, Type *ty, + analysis::axes *a_axes, std::map& axes, + analysis::layout_t *layout) + : mod_(mod), builder_(builder), tgt_(tgt), ty_(ty), a_axes_(a_axes), axes_(axes), layout_(layout) { + +} + +tile *machine_layout_distributed_t::create(ir::value *v) { + Type *ty = type(v->get_type()->get_scalar_ty(), builder_->getContext()); + const auto &shapes = v->get_type()->get_tile_shapes(); + std::vector axes(shapes.size()); + for(size_t d = 0; d < 
shapes.size(); d++){ + if(shapes[d] > 1){ + unsigned x = a_axes_->get(v, d); + axes[d] = axes_.at(x); + } + else{ + axes[d].contiguous = 1; + axes[d].values = {builder_->getInt32(0)}; + } + } + return new distributed_tile(ty, shapes, layout_->order, axes, *builder_, false); +} + +machine_layout_hmma_884_t::machine_layout_hmma_884_t(Module *mod, Builder *builder, + target *tgt, Type *ty, analysis::axes *a_axes, + std::map& axes, + analysis::layout_hmma_884_t* layout) + : machine_layout_distributed_t(mod, builder, tgt, ty, a_axes, axes, layout) { + + Value *warp_size = builder_->getInt32(32); + Value* u_thread_id_0 = tgt_->get_local_id(mod_, *builder_, 0); + Value *u_thread_id = builder_->CreateURem(u_thread_id_0, warp_size); + Value *u_warp_id = builder_->CreateUDiv(u_thread_id_0, warp_size); + + const auto& shapes = layout->shapes; + if(shapes.size() > 3) + throw std::runtime_error("unsupported"); + + bool is_batched = shapes.size() >= 3; + + Value *_1 = builder_->getInt32(1); + Value *_2 = builder_->getInt32(2); + Value *_3 = builder_->getInt32(3); + Value *_4 = builder_->getInt32(4); + Value *_16 = builder_->getInt32(16); + + // fragments per warp + unsigned fpw_0 = layout->fpw.at(0); + unsigned fpw_1 = layout->fpw.at(1); + unsigned fpw_2 = is_batched ? layout->fpw.at(2) : 1; + // warps per tile + unsigned wpt_0 = layout->wpt.at(0); + unsigned wpt_1 = layout->wpt.at(1); + unsigned wpt_2 = is_batched ? layout->wpt.at(2) : 1; + // hmma warp tile size + unsigned hmma_wts_0 = fpw_0 * 8; + unsigned hmma_wts_1 = fpw_1 * 8; + unsigned hmma_wts_2 = is_batched ? fpw_2 : 1; + // hmma block tile size + unsigned hmma_bts_0 = hmma_wts_0 * wpt_0; + unsigned hmma_bts_1 = hmma_wts_1 * wpt_1; + unsigned hmma_bts_2 = is_batched ? hmma_wts_2 * wpt_2 : 1; + // number of repetition + unsigned num_rep_0 = shapes[0] / hmma_bts_0; + unsigned num_rep_1 = shapes[1] / hmma_bts_1; + unsigned num_rep_2 = is_batched ? 
shapes[2] / hmma_bts_2 : 1; + // size of each pack (interleaving) + pack_size_0_ = std::min(num_rep_0, 1); + pack_size_1_ = std::min(num_rep_1, 1); + // number of packs (interleaving) + num_packs_0_ = num_rep_0 / pack_size_0_; + num_packs_1_ = num_rep_1 / pack_size_1_; + + /* intra warp offset */ + // offset of quad in pair + Value *in_pair_off_a = builder_->CreateMul(builder_->CreateUDiv(builder_->CreateAnd(u_thread_id, _16), builder_->getInt32(4)), + builder_->getInt32(fpw_0 * pack_size_0_)); + Value *in_pair_off_b = builder_->CreateMul(builder_->CreateUDiv(builder_->CreateAnd(u_thread_id, _16), builder_->getInt32(4)), + builder_->getInt32(fpw_1 * pack_size_1_)); + + // Quad pair id + Value *pair_a_id = builder_->CreateUDiv(builder_->CreateURem(u_thread_id, _16), _4); + Value *pair_b_id = builder_->CreateUDiv(builder_->CreateURem(u_thread_id, _16), _4); + pair_a_id = builder_->CreateURem(pair_a_id, builder_->getInt32(fpw_0)); + pair_b_id = builder_->CreateUDiv(pair_b_id, builder_->getInt32(fpw_0)); + pair_b_id = builder_->CreateURem(pair_b_id, builder_->getInt32(fpw_1)); + // Quad pair offset + Value *pair_a_off = builder_->CreateMul(pair_a_id, builder_->getInt32(4 * pack_size_0_)); + Value *pair_b_off = builder_->CreateMul(pair_b_id, builder_->getInt32(4 * pack_size_1_)); + + /* inter warp offset */ + Value *warp_id_0 = builder_->CreateURem(u_warp_id, builder_->getInt32(wpt_0)); + Value *warp_id_12 = builder_->CreateUDiv(u_warp_id, builder_->getInt32(wpt_0)); + Value *warp_id_1 = builder_->CreateURem(warp_id_12, builder_->getInt32(wpt_1)); + Value *warp_id_2 = builder_->CreateUDiv(warp_id_12, builder_->getInt32(wpt_1)); + Value *warp_offset_i = builder_->CreateMul(warp_id_0, builder_->getInt32(hmma_wts_0 * pack_size_0_)); + Value *warp_offset_j = builder_->CreateMul(warp_id_1, builder_->getInt32(hmma_wts_1 * pack_size_1_)); + + /* offsets */ + // a offset + offset_a_i_ = builder_->CreateAdd(warp_offset_i, builder_->CreateAdd(pair_a_off, in_pair_off_a)); + offset_a_k_ = builder_->CreateAnd(u_thread_id, _3); + // b offsets + offset_b_j_ = builder_->CreateAdd(warp_offset_j, builder_->CreateAdd(pair_b_off, in_pair_off_b)); + offset_b_k_ = builder_->CreateAnd(u_thread_id, _3); + + // c offsets + Value *offset_c_i = builder_->CreateAdd(builder_->CreateAnd(u_thread_id, _1), offset_a_i_); + Value *offset_c_j = builder_->CreateAdd(builder_->CreateAnd(u_thread_id, _2), + builder_->CreateAdd(warp_offset_j, pair_b_off)); + + /* indices */ + // i indices + std::vector idx_i; + for(unsigned pack = 0; pack < num_packs_0_; pack++) + for(unsigned ii = 0; ii < pack_size_0_; ii++) + for(unsigned i = 0; i < 2; i++){ + idx_i.push_back(builder_->CreateAdd(offset_c_i, builder_->getInt32(pack*hmma_bts_0*pack_size_0_ + ii*4 + i*2))); + } + // j indices + std::vector idx_j; + for(unsigned pack = 0; pack < num_packs_1_; pack++) + for(unsigned jj = 0; jj < pack_size_1_; jj++) + for(unsigned j = 0; j < 2; j++){ + idx_j.push_back(builder_->CreateAdd(offset_c_j, builder_->getInt32(pack*hmma_bts_1*pack_size_1_ + jj*4 + j*4*fpw_1*pack_size_1_))); + idx_j.push_back(builder_->CreateAdd(offset_c_j, builder_->getInt32(pack*hmma_bts_1*pack_size_1_ + jj*4 + j*4*fpw_1*pack_size_1_ + 1))); + } + // z indices + std::vector idx_z; + for(unsigned pack = 0; pack < num_rep_2; pack++) + idx_z.push_back(builder_->CreateAdd(warp_id_2, builder_->getInt32(pack*hmma_bts_2))); + + + /* axes */ + axes_[layout->axes[0]] = distributed_axis{1, idx_i, warp_id_0}; + axes_[layout->axes[1]] = distributed_axis{1, idx_j, warp_id_1}; + 
if(is_batched) + axes_[layout->axes[2]] = distributed_axis{1, idx_z, warp_id_2}; +} + + +machine_layout_scanline_t::machine_layout_scanline_t(Module *mod, Builder *builder, + target *tgt, Type *ty, + analysis::axes *a_axes, std::map &axes, + analysis::layout_scanline_t* layout) + : machine_layout_distributed_t(mod, builder, tgt, ty, a_axes, axes, layout) { + + Value *warp_size = builder_->getInt32(32); + Value* u_thread_id_0 = tgt_->get_local_id(mod_, *builder_, 0); + Value *u_thread_id = builder_->CreateURem(u_thread_id_0, warp_size); + Value *u_warp_id = builder_->CreateUDiv(u_thread_id_0, warp_size); + + auto order = layout->order; + const auto& shapes = layout->shapes; + size_t dim = shapes.size(); + std::vector nts = layout->nts; + std::vector mts = layout->mts; + Value* full_thread_id = builder_->CreateAdd(builder_->CreateMul(u_warp_id, builder_->getInt32(32)), u_thread_id); + std::vector thread_id = delinearize(full_thread_id, order, mts, *builder_); + // Create axes + for(unsigned k = 0; k < dim; k++) { + std::string str_k = std::to_string(k); + Value *contiguous_k = builder_->getInt32(nts[k]); + Value *scaled_thread_id = builder_->CreateMul(thread_id[k], contiguous_k); + unsigned per_block = nts[k] * mts[k]; + unsigned per_thread = nts[k] * shapes[k] / per_block; + std::vector idx_list(per_thread); + for(unsigned n = 0 ; n < per_thread; n++){ + unsigned offset = n / nts[k] * per_block + n % nts[k]; + idx_list[n] = builder_->CreateAdd(scaled_thread_id, builder_->getInt32(offset), "idx_" + str_k + "_" + std::to_string(n)); + } + axes_[layout->axes[k]] = distributed_axis{nts[k], idx_list, thread_id[k]}; + } +} + + +} +} diff --git a/lib/codegen/selection/machine_value.cc b/lib/codegen/selection/machine_value.cc new file mode 100644 index 000000000..bd4237043 --- /dev/null +++ b/lib/codegen/selection/machine_value.cc @@ -0,0 +1,206 @@ +#include +#include "llvm/IR/IRBuilder.h" +#include "triton/codegen/selection/machine_value.h" + +namespace triton{ +namespace codegen{ + +using namespace llvm; + +/* Distributed Tile */ +void distributed_tile::init_indices() { + std::vector id(axes_.size(), 0); + // create iteration order + std::vector order(id.size()); + std::iota(order.begin(), order.end(), 0); + auto cmp = [&](int x, int y) { + return axes_[x].contiguous > axes_[y].contiguous; + }; + std::sort(order.begin(), order.end(), cmp); + // build + size_t k = 0; + while(true) { + indices_t current; + for(size_t d = 0; d < id.size(); d++) + current.push_back(axes_[d].values[id[d]]); + size_t sz = indices_.size(); + indices_[current] = sz; + values_[current] = nullptr; + ordered_indices_.push_back(current); + id[order[0]]++; + while(id[order[k]] == axes_[order[k]].values.size()){ + if(k == id.size() - 1) + return; + id[order[k++]] = 0; + id[order[k]]++; + } + k = 0; + } +} + +llvm::Type *distributed_tile::make_vector_ty(llvm::Type *ty, size_t vector_size) { + if(vector_size == 1) + return ty; + return VectorType::get(ty, vector_size); +} + +distributed_tile::distributed_tile(Type *ty, const shapes_t &shapes, const std::vector& order, const axes_t &axes, llvm::IRBuilder<> &builder, bool vectorize) + : tile(make_vector_ty(ty, vectorize?axes[0].contiguous:1), shapes), axes_(axes), order_(order), builder_(builder) { + vector_size_ = vectorize?ty_->getVectorNumElements():1; + init_indices(); +} + +void distributed_tile::set_value(indices_t idx, Value *x) { + assert(x->getType() == ty_ && "cannot set a value of different type"); + Value *&result = values_[idx]; + assert(!result && "value cannot be 
set twice"); + result = x; +} + +Value* distributed_tile::get_value(indices_t idx) { + Value *result = values_.at(idx); + assert(result && "value has not been set"); + return result; +} + +unsigned distributed_tile::get_linear_index(indices_t idx) { + return indices_[idx]; +} + +indices_t distributed_tile::get_ordered_indices(unsigned id) { + return ordered_indices_.at(id); +} + + +void distributed_tile::for_each(std::function fn) { + for(unsigned i = 0; i < ordered_indices_.size(); i++){ + if(i % vector_size_ == 0) + fn(ordered_indices_[i]); + } +} + +/* Shared Tile */ +void shared_tile::extract_constant(Value *arg, Value *&non_cst, Value *&cst) { + BinaryOperator *bin_op = dyn_cast(arg); + Constant *_0 = ConstantInt::get(Type::getInt32Ty(arg->getContext()), 0); + if(dyn_cast(arg)){ + cst = arg; + non_cst = _0; + return; + } + if(!bin_op || bin_op->getOpcode() != llvm::BinaryOperator::Add){ + non_cst = arg; + cst = _0; + return; + } + Constant *cst_lhs = dyn_cast(bin_op->getOperand(0)); + Constant *cst_rhs = dyn_cast(bin_op->getOperand(1)); + if(cst_lhs && cst_rhs){ + cst = arg; + non_cst = _0; + } + else if(cst_lhs){ + cst = cst_lhs; + non_cst = bin_op->getOperand(1); + } + else if(cst_rhs){ + cst = cst_rhs; + non_cst = bin_op->getOperand(0); + } + else{ + non_cst = arg; + cst = _0; + } +} + +void shared_tile::extract_constant(const indices_t &arg_idx, indices_t &non_cst_idx, indices_t &cst_idx) { + non_cst_idx.clear(); + cst_idx.clear(); + for(Value *idx: arg_idx){ + Value *non_cst, *cst; + extract_constant(idx, non_cst, cst); + non_cst_idx.push_back(non_cst); + cst_idx.push_back(cst); + } +} + + +Value* shared_tile::shared_offset(llvm::IRBuilder<> &builder, const shapes_t& shapes, const std::vector& perm, const std::vector& order, indices_t idx) { + // strides + std::vector strides(order.size()); + strides[order[0]] = builder.getInt32(1); + for(size_t i = 1; i < idx.size(); i++) + strides[order[i]] = builder.CreateMul(strides[order[i-1]], builder.getInt32(shapes[order[i-1]])); + // result + Value *result = builder.getInt32(0); + for(size_t i = 0; i < strides.size(); i++) + result = builder.CreateAdd(result, builder.CreateMul(idx[perm[i]], strides[i])); + return result; +} + +shared_tile::shared_tile(Type *ty, const shapes_t &shapes, const std::vector& order, Value *ptr, llvm::IRBuilder<> &builder, Value *offset, const std::vector& perm): + tile(ty, shapes), order_(order), ptr_(ptr), builder_(builder), offset_(offset), vector_size_(1), perm_(perm){ + return_vector_ = false; + if(perm_.empty()){ + perm_.resize(shapes.size()); + std::iota(perm_.begin(), perm_.end(), 0); + } +} + +void shared_tile::set_value(indices_t idx, Value *value) { + Value *ptr = builder_.CreateGEP(ptr_, shared_offset(builder_, shapes_, perm_, order_, idx)); + unsigned addr_space = ptr->getType()->getPointerAddressSpace(); + ptr = builder_.CreateBitCast(ptr, value->getType()->getPointerTo(addr_space)); + builder_.CreateStore(value, ptr); +} + +void shared_tile::set_vector_size(unsigned vector_size) { + vector_size_ = vector_size; +} + +void shared_tile::set_return_mode(bool return_vector){ + return_vector_ = return_vector; +} + + +Value* shared_tile::get_value(indices_t idx) { + indices_t non_cst_idx, cst_idx; + extract_constant(idx, non_cst_idx, cst_idx); + Value *&base_ptr = ptr_cache_[non_cst_idx]; + unsigned vector_size = vector_size_; + Type *ty = ty_; + if(ty->isHalfTy() && (vector_size % 2 == 0)){ + ty = IntegerType::get(ty->getContext(), 32); + vector_size = vector_size / 2; + } + if(base_ptr == nullptr){ 
+// BasicBlock* store = builder_.GetInsertBlock(); +// if(!non_cst_idx.empty()) +// if(isa(non_cst_idx.front())){ +// builder_.SetInsertPoint((Instruction*)non_cst_idx.front()); +// } + base_ptr = builder_.CreateGEP(ptr_, shared_offset(builder_, shapes_, perm_, order_, non_cst_idx)); + if(vector_size_ > 1){ + Type *vec_ty = VectorType::get(ty, vector_size); + Type *vec_ptr_ty = PointerType::get(vec_ty, base_ptr->getType()->getPointerAddressSpace()); + base_ptr = builder_.CreateBitCast(base_ptr, vec_ptr_ty); + } +// builder_.SetInsertPoint(store); + } + Value *offset = shared_offset(builder_, shapes_, perm_, order_, cst_idx); + Value *div = offset; + if(vector_size_ > 1) + div = builder_.CreateUDiv(offset, builder_.getInt32(vector_size_)); + Value *ptr = builder_.CreateGEP(base_ptr, div); + Value *result = builder_.CreateLoad(ptr); + if(return_vector_ == false && vector_size_ > 1) { + Value *rem = builder_.CreateURem(offset, builder_.getInt32(vector_size_)); + result = builder_.CreateExtractElement(result, rem); + } + return result; +} + + + +} +} diff --git a/lib/codegen/selection/selection.cc b/lib/codegen/selection/selection.cc new file mode 100644 index 000000000..49fa1b714 --- /dev/null +++ b/lib/codegen/selection/selection.cc @@ -0,0 +1,20 @@ +#include +#include "triton/codegen/selection/selection.h" +#include "triton/codegen/selection/generator.h" +#include "triton/ir/module.h" + +namespace triton{ +namespace codegen{ + +using namespace llvm; + +void selection::run(ir::module &src, Module &dst) { + generator gen(&dst, a_axes_, tgt_, layouts_, alignment_, alloc_, num_warps_ ); + for(ir::alloc_const *x: src.allocs()) + gen.visit_alloc_const(x); + for(ir::function *fn: src.get_function_list()) + gen.visit_function(fn); +} + +} +} diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 68fe7fe01..b83ea8442 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -217,7 +217,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::transform::reassociate reassociate(&align); codegen::transform::coalesce coalesce(&align, &layouts); codegen::transform::cts cts; - codegen::selection selection(&liveness, &allocation, &align, &layouts, target.get(), opt.num_warps); + codegen::selection selection(&liveness, &allocation, &align, &axes, &layouts, target.get(), opt.num_warps); // run passes // ir::print(module, std::cout); peephole.run(module); From be25e954f62d510e5e298964bd74088cdb8b0099 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 17 Oct 2019 12:55:37 -0400 Subject: [PATCH 449/494] [codegen] [selection] merged selection into generator visit --- include/triton/codegen/selection.h | 346 ------------------- include/triton/codegen/selection/generator.h | 7 +- include/triton/codegen/selection/selection.h | 70 ---- include/triton/runtime/function.h | 2 +- lib/codegen/selection/generator.cc | 85 +++-- lib/codegen/selection/machine_layout.cc | 15 +- lib/codegen/selection/selection.cc | 20 -- lib/runtime/function.cc | 6 +- 8 files changed, 63 insertions(+), 488 deletions(-) delete mode 100644 include/triton/codegen/selection.h delete mode 100644 include/triton/codegen/selection/selection.h delete mode 100644 lib/codegen/selection/selection.cc diff --git a/include/triton/codegen/selection.h b/include/triton/codegen/selection.h deleted file mode 100644 index da6399573..000000000 --- a/include/triton/codegen/selection.h +++ /dev/null @@ -1,346 +0,0 @@ -#ifndef TDL_INCLUDE_CODEGEN_SELECTION_H -#define TDL_INCLUDE_CODEGEN_SELECTION_H - -#include 
"triton/ir/context.h" -#include "triton/ir/module.h" -#include "triton/ir/function.h" -#include "triton/ir/type.h" -#include "triton/ir/visitor.h" -#include "triton/codegen/analysis/layout.h" -#include "triton/codegen/transform/cts.h" - - -namespace llvm{ - class Type; - class Value; - class Instruction; - class Constant; - class LLVMContext; - class Module; - class ConstantFolder; - class IRBuilderDefaultInserter; - template - class IRBuilder; - class ArrayType; - class Function; -} - - -// typedefs -namespace triton{ -namespace codegen{ - typedef llvm::IRBuilder Builder; - typedef llvm::LLVMContext LLVMContext; - typedef llvm::Type Type; - typedef llvm::Value Value; - typedef llvm::Module Module; - typedef llvm::Instruction Instruction; - typedef llvm::Constant Constant; - typedef llvm::ArrayType ArrayType; - typedef llvm::Function Function; -} -} - -namespace triton{ -namespace codegen{ - -namespace analysis{ -class liveness; -class tiles; -class align; -class allocation; -class cts; -class axes; -class layout; -} - -namespace transform{ -class coalesce; -} - -class target; - -typedef std::vector indices_t; - -struct distributed_axis { - int contiguous; - std::vector values; - Value* thread_id; -}; - -class tile { -protected: - typedef std::vector shapes_t; - -public: - tile(Type *ty, const shapes_t &shapes): ty_(ty), shapes_(shapes){ } - virtual void set_value(indices_t idx, Value *v) = 0; - virtual Value* get_value(indices_t idx) = 0; - Type *get_ty() const { return ty_; } - shapes_t get_shapes() const { return shapes_; } - -protected: - Type *ty_; - shapes_t shapes_; -}; - -class shared_tile: public tile { -private: - void extract_constant(Value *arg, Value *&non_cst, Value *&cst); - void extract_constant(const indices_t &arg_idx, indices_t &non_cst_idx, indices_t &cst_idx); - - -public: - shared_tile(Type* ty, const shapes_t &shapes, const std::vector &order, Value* ptr, Builder &builder, Value* offset = nullptr, const std::vector& perm = {}); - void set_vector_size(unsigned vector_size); - void set_return_mode(bool return_vector); - void set_value(indices_t, Value *); - Value* get_ptr_to(indices_t idx); - Value* get_value(indices_t idx); - Value* get_pointer() { return ptr_; } - Value* get_offset() { return offset_; } - const std::vector& get_perm() { return perm_; } - const std::vector& get_order() { return order_; } - static Value* shared_offset(Builder& builder, const shapes_t& shapes, const std::vector& perm, const std::vector& order, indices_t idx); - -private: - Value *ptr_; - bool return_vector_; - Builder &builder_; - Value *offset_; - std::map ptr_cache_; - unsigned vector_size_; - std::vector order_; - std::vector perm_; -}; - -// Distribtued tile -class distributed_tile: public tile{ - typedef std::vector axes_t; - typedef std::vector ordered_indices_vec_t; - typedef std::map indices_map_t; - typedef std::map values_map_t; - -private: - void init_indices(); - Type *make_vector_ty(Type *ty, size_t vector_size); - -public: - distributed_tile(Type *ty, const shapes_t& shapes, const std::vector& order, const axes_t &axes, Builder &builder, bool vectorize); - void set_value(indices_t idx, Value *v); - Value* get_value(indices_t idx); - const std::vector& get_order() { return order_; } - unsigned get_linear_index(indices_t idx); - indices_t get_ordered_indices(unsigned id); - void for_each(std::function fn); - const distributed_axis &axis(unsigned dim) { return axes_.at(dim); } - -private: - axes_t axes_; - std::vector order_; - indices_map_t indices_; - values_map_t values_; 
- ordered_indices_vec_t ordered_indices_; - size_t vector_size_; - Builder &builder_; -}; - -class machine_layout_t { -public: - virtual tile* create(ir::value *v) = 0; -}; - -class machine_layout_shared_t: public machine_layout_t { -public: - machine_layout_shared_t(Module *mod, Builder *builder, target *tgt, analysis::allocation* alloc, Value *&sh_mem_ptr, analysis::layout_t* layout, - std::map& vmap, - std::map& tmap); - - tile* create(ir::value *v); - - Module *mod_; - Builder *builder_; - target *tgt_; - analysis::allocation* alloc_; - Value *&sh_mem_ptr_; - analysis::layout_t* layout_; - std::map& vmap_; - std::map& tmap_; - - Value *offset_; - Value *ptr_; - Value *pre_ptr_; - Value *next_ptr_; - -}; - -class machine_layout_distributed_t: public machine_layout_t { -public: - machine_layout_distributed_t(Module *mod, Builder *builder, target *tgt, Type *ty, - analysis::axes *a_axes, std::map& axes, - analysis::layout_t* layout); - - tile* create(ir::value *v); - Module *mod_; - Builder *builder_; - target *tgt_; - Type *ty_; - analysis::axes *a_axes_; - std::map& axes_; - analysis::layout_t* layout_; -}; - - -class machine_layout_hmma_884_t: public machine_layout_distributed_t { -public: - machine_layout_hmma_884_t(Module *mod, Builder *builder, - target *tgt, Type *ty, - analysis::axes *a_axes, std::map& axes, - analysis::layout_hmma_884_t* layout); - Value *offset_a_i_, *offset_a_k_; - Value *offset_b_j_, *offset_b_k_; - unsigned pack_size_0_; - unsigned pack_size_1_; - unsigned num_packs_0_; - unsigned num_packs_1_; -}; - -class machine_layout_scanline_t: public machine_layout_distributed_t { -public: - machine_layout_scanline_t(Module *mod, Builder *builder, - target *tgt, Type *ty, - analysis::axes *a_axes, std::map& axes, - analysis::layout_scanline_t* layout); -}; - -class generator: public ir::visitor, public analysis::layout_visitor { -private: - void for_each(ir::value *x, const std::function& fn); - Value* get_value(ir::value *x, const indices_t& idx); - void set_value(ir::value *x, const indices_t& idx, Value* v); - - void visit_hmma_dot(ir::dot_inst*, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK); - void visit_scanline_dot(ir::dot_inst*, shared_tile *TA, shared_tile *TB, distributed_tile *TD, unsigned NK, Type *c_ty, Function *f_mul_add); - void visit_outer_dot(ir::dot_inst*, distributed_tile *TA, distributed_tile *TB, distributed_tile *TD, unsigned NK, - Type *c_ty, Function *f_mul_add); - - void finalize_shared_layout(analysis::layout_shared_t*); - void finalize_function(ir::function*); - void finalize_phi_node(ir::phi_node*); - -public: - generator(Module *dst, - analysis::axes *a_axes, - target *tgt, - analysis::layout *layouts, - analysis::align *alignment, - analysis::allocation *alloc, - unsigned num_warps); - - void visit_value(ir::value* v); - - void visit_phi_node(ir::phi_node*); - void visit_binary_operator(ir::binary_operator*); - void visit_getelementptr_inst(ir::getelementptr_inst*); - - void visit_icmp_inst(ir::icmp_inst*); - void visit_fcmp_inst(ir::fcmp_inst*); - void visit_cast_inst(ir::cast_inst*); - - void visit_return_inst(ir::return_inst*); - void visit_cond_branch_inst(ir::cond_branch_inst*); - void visit_uncond_branch_inst(ir::uncond_branch_inst*); - - - void visit_unmasked_load_inst(ir::unmasked_load_inst*); - void visit_masked_load_inst(ir::masked_load_inst*); - void visit_unmasked_store_inst(ir::unmasked_store_inst*); - void visit_masked_store_inst(ir::masked_store_inst*); - - void visit_reshape_inst(ir::reshape_inst*); - 
void visit_splat_inst(ir::splat_inst*); - void visit_broadcast_inst(ir::broadcast_inst*); - void visit_downcast_inst(ir::downcast_inst*); - - void visit_get_program_id_inst(ir::get_program_id_inst*); - void visit_get_num_program_inst(ir::get_num_program_inst*); - void visit_atomic_cas_inst(ir::atomic_cas_inst*); - void visit_atomic_exch_inst(ir::atomic_exch_inst*); - void visit_atomic_add_inst(ir::atomic_add_inst*); - void visit_dot_inst(ir::dot_inst*); - void visit_trans_inst(ir::trans_inst*); - void visit_sqrt_inst(ir::sqrt_inst*); - void visit_reduce_inst(ir::reduce_inst*); - void visit_select_inst(ir::select_inst*); - - void visit_copy_to_shared_inst(ir::copy_to_shared_inst*); - void visit_copy_from_shared_inst(ir::copy_from_shared_inst*); - void visit_barrier_inst(ir::barrier_inst*); - void visit_make_range_dyn(ir::make_range_dyn*); - void visit_make_range(ir::make_range*); - - void visit_make_range_sta(ir::make_range_sta*); - void visit_undef_value(ir::undef_value*); - void visit_constant_int(ir::constant_int*); - void visit_constant_fp(ir::constant_fp*); - void visit_alloc_const(ir::alloc_const*); - - void visit_function(ir::function*); - void visit_basic_block(ir::basic_block*); - void visit_argument(ir::argument*); - - void visit_layout_hmma_884(analysis::layout_hmma_884_t*); - void visit_layout_scanline(analysis::layout_scanline_t*); - void visit_layout_shared(analysis::layout_shared_t*); - -private: - LLVMContext *ctx_; - std::unique_ptr builder_; - Module *mod_; - - std::map machine_layouts_; - analysis::axes *a_axes_; - std::map axes_; - std::map vmap_; - std::map tmap_; - target *tgt_; - analysis::layout *layouts_; - analysis::align *alignment_; - analysis::allocation *alloc_; - Value *sh_mem_ptr_; - unsigned num_warps_; - - std::set seen_; -}; - - -// Selection pass -class selection{ - typedef std::map vmap_t; - typedef std::map tmap_t; - -public: - selection(analysis::liveness* liveness, analysis::allocation *alloc, - analysis::align *alignment, analysis::axes *axes, - analysis::layout *layouts, target *tgt, unsigned num_warps) - : liveness_(liveness), alloc_(alloc), - alignment_(alignment), a_axes_(axes), layouts_(layouts), - tgt_(tgt), num_warps_(num_warps){ } - - void run(ir::module &src, Module &dst); - -private: - analysis::liveness *liveness_; - analysis::allocation *alloc_; - analysis::axes *a_axes_; - analysis::layout *layouts_; - analysis::align *alignment_; - target *tgt_; - unsigned num_warps_; -}; - -} -} - -#endif diff --git a/include/triton/codegen/selection/generator.h b/include/triton/codegen/selection/generator.h index 76ec88b90..3e6c0bacb 100644 --- a/include/triton/codegen/selection/generator.h +++ b/include/triton/codegen/selection/generator.h @@ -79,12 +79,11 @@ private: void finalize_phi_node(ir::phi_node*); public: - generator(Module *dst, - analysis::axes *a_axes, - target *tgt, + generator(analysis::axes *a_axes, analysis::layout *layouts, analysis::align *alignment, analysis::allocation *alloc, + target *tgt, unsigned num_warps); void visit_value(ir::value* v); @@ -143,6 +142,8 @@ public: void visit_layout_scanline(analysis::layout_scanline_t*); void visit_layout_shared(analysis::layout_shared_t*); + void visit(ir::module &, llvm::Module &); + private: LLVMContext *ctx_; Builder* builder_; diff --git a/include/triton/codegen/selection/selection.h b/include/triton/codegen/selection/selection.h deleted file mode 100644 index a2b88247f..000000000 --- a/include/triton/codegen/selection/selection.h +++ /dev/null @@ -1,70 +0,0 @@ -#pragma once - -#ifndef 
_TRITON_SELECTION_SELECTION_H_ -#define _TRITON_SELECTION_SELECTION_H_ - -#include - -namespace llvm{ - class Module; - class Value; -} - - -namespace triton{ - -namespace ir{ -class value; -class module; -} - -namespace codegen{ -// typedef -typedef llvm::Module Module; -typedef llvm::Value Value; -// forward -namespace analysis{ -class liveness; -class align; -class allocation; -class axes; -class layout; -} -class target; -class tile; - -} -} - -namespace triton{ -namespace codegen{ - -// Selection pass -class selection{ - typedef std::map vmap_t; - typedef std::map tmap_t; - -public: - selection(analysis::liveness* liveness, analysis::allocation *alloc, - analysis::align *alignment, analysis::axes *axes, - analysis::layout *layouts, target *tgt, unsigned num_warps) - : liveness_(liveness), alloc_(alloc), - alignment_(alignment), a_axes_(axes), layouts_(layouts), - tgt_(tgt), num_warps_(num_warps){ } - - void run(ir::module &src, Module &dst); - -private: - analysis::liveness *liveness_; - analysis::allocation *alloc_; - analysis::axes *a_axes_; - analysis::layout *layouts_; - analysis::align *alignment_; - target *tgt_; - unsigned num_warps_; -}; - -} -} - -#endif diff --git a/include/triton/runtime/function.h b/include/triton/runtime/function.h index e312cfded..fa06544f8 100644 --- a/include/triton/runtime/function.h +++ b/include/triton/runtime/function.h @@ -9,7 +9,7 @@ #include #include // codegen -#include "triton/codegen/selection.h" +#include "triton/ir/context.h" #include "triton/codegen/target.h" #include "triton/lang/parser.h" #include "triton/runtime/arg.h" diff --git a/lib/codegen/selection/generator.cc b/lib/codegen/selection/generator.cc index 831719b73..022c51ed7 100644 --- a/lib/codegen/selection/generator.cc +++ b/lib/codegen/selection/generator.cc @@ -109,18 +109,18 @@ llvm::CmpInst::Predicate llvm_pred(ir::cmp_pred_t pred) { } -inline Type *type(ir::type *ty, LLVMContext &ctx) { +inline Type *llvm_type(ir::type *ty, LLVMContext &ctx) { // function if(auto* tt = dynamic_cast(ty)){ - Type *return_ty = type(tt->get_return_ty(), ctx); + Type *return_ty = llvm_type(tt->get_return_ty(), ctx); std::vector param_tys; std::transform(tt->params_begin(), tt->params_end(), std::back_inserter(param_tys), - [&ctx](ir::type* t){ return type(t, ctx);}); + [&ctx](ir::type* t){ return llvm_type(t, ctx);}); return FunctionType::get(return_ty, param_tys, false); } // pointer if(ty->is_pointer_ty()){ - Type *elt_ty = type(ty->get_pointer_element_ty(), ctx); + Type *elt_ty = llvm_type(ty->get_pointer_element_ty(), ctx); unsigned addr_space = ty->get_pointer_address_space(); return PointerType::get(elt_ty, addr_space); } @@ -173,29 +173,14 @@ inline bool is_trans(ir::value *v) { -generator::generator(Module *dst, - analysis::axes *a_axes, - target *tgt, - analysis::layout *layouts, - analysis::align *alignment, - analysis::allocation *alloc, - unsigned num_warps) - : ctx_(&dst->getContext()), mod_(dst), - builder_(new Builder(dst->getContext())), - a_axes_(a_axes), tgt_(tgt), - layouts_(layouts), alignment_(alignment), alloc_(alloc), - num_warps_(num_warps) { - - if(tgt_->is_gpu()) - if(unsigned alloc_size = alloc_->allocated_size()){ - Type *int_8_ty = Type::getInt8Ty(*ctx_); - ArrayType *array_ty = ArrayType::get(int_8_ty, alloc_size); - Type *ptr_ty = PointerType::get(int_8_ty, 3); - GlobalVariable *sh_mem_array = - new GlobalVariable(*dst, array_ty, false, GlobalVariable::ExternalLinkage, - nullptr, "__shared_ptr", nullptr, GlobalVariable::NotThreadLocal, 3); - sh_mem_ptr_ = 
builder_->CreateBitCast(sh_mem_array, ptr_ty); - } +generator::generator(analysis::axes *a_axes, + analysis::layout *layouts, + analysis::align *alignment, + analysis::allocation *alloc, + target *tgt, + unsigned num_warps) + : a_axes_(a_axes), layouts_(layouts), alignment_(alignment), alloc_(alloc), + tgt_(tgt), num_warps_(num_warps) { } @@ -226,7 +211,7 @@ void generator::visit_value(ir::value* v) { } void generator::visit_phi_node(ir::phi_node* phi) { - Type *ty = type(phi->get_type()->get_scalar_ty(), *ctx_); + Type *ty = llvm_type(phi->get_type()->get_scalar_ty(), *ctx_); unsigned num_ops = phi->get_num_operands(); for_each(phi, [&](indices_t idx){ set_value(phi, idx, builder_->Insert(PHINode::Create(ty, num_ops))); @@ -248,7 +233,7 @@ void generator::visit_getelementptr_inst(ir::getelementptr_inst* gep) { std::vector idx_vals; std::transform(gep->idx_begin(), gep->idx_end(), std::back_inserter(idx_vals), [&](ir::value* x){ return get_value(x, idx);}); - Type *source_ty = type(gep->get_source_elt_ty()->get_scalar_ty(), *ctx_); + Type *source_ty = llvm_type(gep->get_source_elt_ty()->get_scalar_ty(), *ctx_); Value *ret = builder_->Insert(GetElementPtrInst::CreateInBounds(source_ty, ptr, idx_vals)); set_value(gep, idx, ret); }); @@ -277,7 +262,7 @@ void generator::visit_fcmp_inst(ir::fcmp_inst* fcmp) { void generator::visit_cast_inst(ir::cast_inst* cast) { for_each(cast, [&](indices_t idx){ Value *arg = get_value(cast->get_operand(0), idx); - Type *dst_ty = type(cast->get_type()->get_scalar_ty(), *ctx_); + Type *dst_ty = llvm_type(cast->get_type()->get_scalar_ty(), *ctx_); Value *ret = builder_->Insert(CastInst::Create(llvm_op(cast->get_op()), arg, dst_ty)); set_value(cast, idx, ret); }); @@ -726,7 +711,7 @@ void generator::visit_dot_inst(ir::dot_inst* dot) { ir::value *D = dot->get_operand(2); distributed_tile *TD = (distributed_tile*)tmap_.at(D); - Type *c_ty = type(D->get_type()->get_scalar_ty(), *ctx_); + Type *c_ty = llvm_type(D->get_type()->get_scalar_ty(), *ctx_); Function *f_mul_add = Intrinsic::getDeclaration(module, Intrinsic::fmuladd, {c_ty}); auto A_shapes = A->get_type()->get_tile_shapes(); size_t red_axis = 1; @@ -851,22 +836,22 @@ void generator::visit_make_range(ir::make_range* x) { void generator::visit_undef_value(ir::undef_value *ud) { - vmap_[ud] = llvm::UndefValue::get(type(ud->get_type(), *ctx_)); + vmap_[ud] = llvm::UndefValue::get(llvm_type(ud->get_type(), *ctx_)); } void generator::visit_constant_int(ir::constant_int *cst){ - Type *ty = type(cst->get_type()->get_scalar_ty(), *ctx_); + Type *ty = llvm_type(cst->get_type()->get_scalar_ty(), *ctx_); vmap_[cst] = ConstantInt::get(ty, cst->get_value()); } void generator::visit_constant_fp(ir::constant_fp *cst){ - Type *ty = type(cst->get_type()->get_scalar_ty(), *ctx_); + Type *ty = llvm_type(cst->get_type()->get_scalar_ty(), *ctx_); vmap_[cst] = ConstantFP::get(ty, cst->get_value()); } void generator::visit_alloc_const(ir::alloc_const *alloc) { unsigned size = ((ir::constant_int*)alloc->get_operand(0))->get_value(); - Type *element_ty = type(alloc->get_type()->get_pointer_element_ty(), *ctx_); + Type *element_ty = llvm_type(alloc->get_type()->get_pointer_element_ty(), *ctx_); Type *array_ty = llvm::ArrayType::get(element_ty, size); Value *array = new llvm::GlobalVariable(*mod_, array_ty, false, llvm::GlobalVariable::ExternalLinkage, nullptr, alloc->get_name(), nullptr, llvm::GlobalVariable::NotThreadLocal, 4); @@ -876,7 +861,7 @@ void generator::visit_alloc_const(ir::alloc_const *alloc) { void 
generator::visit_function(ir::function* fn) { LLVMContext &ctx = builder_->getContext(); - FunctionType *fn_ty = (FunctionType*)type(fn->get_fn_type(), *ctx_); + FunctionType *fn_ty = (FunctionType*)llvm_type(fn->get_fn_type(), *ctx_); if(!tgt_->is_gpu()){ Type *fn_ret_ty = fn_ty->getReturnType(); std::vector fn_args_ty; @@ -925,11 +910,11 @@ void generator::visit_function(ir::function* fn) { void generator::visit_layout_hmma_884(analysis::layout_hmma_884_t* layout) { - machine_layouts_[layout] = new machine_layout_hmma_884_t(mod_, &*builder_, tgt_, type(layout->ty->get_scalar_ty(), *ctx_), a_axes_, axes_, layout); + machine_layouts_[layout] = new machine_layout_hmma_884_t(mod_, &*builder_, tgt_, llvm_type(layout->ty->get_scalar_ty(), *ctx_), a_axes_, axes_, layout); } void generator::visit_layout_scanline(analysis::layout_scanline_t* layout) { - machine_layouts_[layout] = new machine_layout_scanline_t(mod_, &*builder_, tgt_, type(layout->ty->get_scalar_ty(), *ctx_), a_axes_, axes_, layout); + machine_layouts_[layout] = new machine_layout_scanline_t(mod_, &*builder_, tgt_, llvm_type(layout->ty->get_scalar_ty(), *ctx_), a_axes_, axes_, layout); } void generator::visit_layout_shared(analysis::layout_shared_t* layout) { @@ -1026,5 +1011,29 @@ void generator::finalize_phi_node(ir::phi_node *phi) { } } +void generator::visit(ir::module &src, llvm::Module &dst) { + mod_ = &dst; + ctx_ = &dst.getContext(); + builder_ = new Builder(*ctx_); + // allocate shared memory + if(tgt_->is_gpu()) + if(unsigned alloc_size = alloc_->allocated_size()){ + Type *int_8_ty = Type::getInt8Ty(*ctx_); + ArrayType *array_ty = ArrayType::get(int_8_ty, alloc_size); + Type *ptr_ty = PointerType::get(int_8_ty, 3); + GlobalVariable *sh_mem_array = + new GlobalVariable(*mod_, array_ty, false, GlobalVariable::ExternalLinkage, + nullptr, "__shared_ptr", nullptr, GlobalVariable::NotThreadLocal, 3); + sh_mem_ptr_ = builder_->CreateBitCast(sh_mem_array, ptr_ty); + } + // allocate constant memory + for(ir::alloc_const *x: src.allocs()) + visit_alloc_const(x); + // visit functions + for(ir::function *fn: src.get_function_list()) + visit_function(fn); +} + + } } diff --git a/lib/codegen/selection/machine_layout.cc b/lib/codegen/selection/machine_layout.cc index 1e6f0d5da..ac242c815 100644 --- a/lib/codegen/selection/machine_layout.cc +++ b/lib/codegen/selection/machine_layout.cc @@ -1,6 +1,7 @@ #include #include "triton/codegen/selection/machine_layout.h" #include "triton/codegen/selection/machine_value.h" +#include "triton/codegen/selection/generator.h" #include "triton/codegen/analysis/allocation.h" #include "triton/codegen/analysis/axes.h" #include "triton/codegen/target.h" @@ -13,18 +14,18 @@ namespace codegen{ using namespace llvm; -inline Type *type(ir::type *ty, LLVMContext &ctx) { +inline Type *llvm_type(ir::type *ty, LLVMContext &ctx) { // function if(auto* tt = dynamic_cast(ty)){ - Type *return_ty = type(tt->get_return_ty(), ctx); + Type *return_ty = llvm_type(tt->get_return_ty(), ctx); std::vector param_tys; std::transform(tt->params_begin(), tt->params_end(), std::back_inserter(param_tys), - [&ctx](ir::type* t){ return type(t, ctx);}); + [&ctx](ir::type* t){ return llvm_type(t, ctx);}); return FunctionType::get(return_ty, param_tys, false); } // pointer if(ty->is_pointer_ty()){ - Type *elt_ty = type(ty->get_pointer_element_ty(), ctx); + Type *elt_ty = llvm_type(ty->get_pointer_element_ty(), ctx); unsigned addr_space = ty->get_pointer_address_space(); return PointerType::get(elt_ty, addr_space); } @@ -80,7 +81,7 @@ 
machine_layout_shared_t::machine_layout_shared_t(Module *mod, Builder *builder, auto shapes = layout_->shapes; shapes[order[0]] += layout_->pad; - Type* ty = type(layout_->ty, builder_->getContext()); + Type* ty = llvm_type(layout_->ty, builder_->getContext()); PointerType *ptr_ty = ty->getPointerTo(sh_mem_ptr_->getType()->getPointerAddressSpace()); // double-buffered @@ -113,7 +114,7 @@ tile* machine_layout_shared_t::create(ir::value *v) { auto order = layout_->order; auto shapes = layout_->shapes; shapes[order[0]] += layout_->pad; - Type* ty = type(layout_->ty, builder_->getContext()); + Type* ty = llvm_type(layout_->ty, builder_->getContext()); // double-buffered if(layout_->double_buffer) { if(v == layout_->double_buffer->phi) @@ -135,7 +136,7 @@ machine_layout_distributed_t::machine_layout_distributed_t(Module *mod, Builder } tile *machine_layout_distributed_t::create(ir::value *v) { - Type *ty = type(v->get_type()->get_scalar_ty(), builder_->getContext()); + Type *ty = llvm_type(v->get_type()->get_scalar_ty(), builder_->getContext()); const auto &shapes = v->get_type()->get_tile_shapes(); std::vector axes(shapes.size()); for(size_t d = 0; d < shapes.size(); d++){ diff --git a/lib/codegen/selection/selection.cc b/lib/codegen/selection/selection.cc deleted file mode 100644 index 49fa1b714..000000000 --- a/lib/codegen/selection/selection.cc +++ /dev/null @@ -1,20 +0,0 @@ -#include -#include "triton/codegen/selection/selection.h" -#include "triton/codegen/selection/generator.h" -#include "triton/ir/module.h" - -namespace triton{ -namespace codegen{ - -using namespace llvm; - -void selection::run(ir::module &src, Module &dst) { - generator gen(&dst, a_axes_, tgt_, layouts_, alignment_, alloc_, num_warps_ ); - for(ir::alloc_const *x: src.allocs()) - gen.visit_alloc_const(x); - for(ir::function *fn: src.get_function_list()) - gen.visit_function(fn); -} - -} -} diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index b83ea8442..115b739d9 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -13,7 +13,7 @@ #include "triton/codegen/transform/membar.h" #include "triton/codegen/transform/reassociate.h" #include "triton/codegen/transform/cts.h" -#include "triton/codegen/selection.h" +#include "triton/codegen/selection/generator.h" #include "triton/runtime/function.h" #include "triton/lang/cpp.h" #include "triton/lang/parser.h" @@ -217,7 +217,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::transform::reassociate reassociate(&align); codegen::transform::coalesce coalesce(&align, &layouts); codegen::transform::cts cts; - codegen::selection selection(&liveness, &allocation, &align, &axes, &layouts, target.get(), opt.num_warps); + codegen::generator isel(&axes, &layouts, &align, &allocation, target.get(), opt.num_warps); // run passes // ir::print(module, std::cout); peephole.run(module); @@ -243,7 +243,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c return std::unique_ptr(); barriers.run(module); // ir::print(module, std::cout); - selection.run(module, *llvm); + isel.visit(module, *llvm); // return binary std::unique_ptr res(driver::module::create(context, std::move(llvm))); // done From cf4fbfefeee0b603350cf53341f280685cc5f19c Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 17 Oct 2019 13:12:37 -0400 Subject: [PATCH 450/494] [codegen] [selection] no longer using llvm::IRBuilder<>::Insert() --- lib/codegen/selection/generator.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff 
--git a/lib/codegen/selection/generator.cc b/lib/codegen/selection/generator.cc index 022c51ed7..8f9e37c16 100644 --- a/lib/codegen/selection/generator.cc +++ b/lib/codegen/selection/generator.cc @@ -214,7 +214,7 @@ void generator::visit_phi_node(ir::phi_node* phi) { Type *ty = llvm_type(phi->get_type()->get_scalar_ty(), *ctx_); unsigned num_ops = phi->get_num_operands(); for_each(phi, [&](indices_t idx){ - set_value(phi, idx, builder_->Insert(PHINode::Create(ty, num_ops))); + set_value(phi, idx, builder_->CreatePHI(ty, num_ops)); }); } @@ -222,7 +222,7 @@ void generator::visit_binary_operator(ir::binary_operator*binop) { for_each(binop, [&](indices_t idx){ Value *lhs = get_value(binop->get_operand(0), idx); Value *rhs = get_value(binop->get_operand(1), idx); - Value *ret = builder_->Insert(BinaryOperator::Create(llvm_op(binop->get_op()), lhs, rhs)); + Value *ret = builder_->CreateBinOp(llvm_op(binop->get_op()), lhs, rhs); set_value(binop, idx, ret); }); } @@ -234,7 +234,7 @@ void generator::visit_getelementptr_inst(ir::getelementptr_inst* gep) { std::transform(gep->idx_begin(), gep->idx_end(), std::back_inserter(idx_vals), [&](ir::value* x){ return get_value(x, idx);}); Type *source_ty = llvm_type(gep->get_source_elt_ty()->get_scalar_ty(), *ctx_); - Value *ret = builder_->Insert(GetElementPtrInst::CreateInBounds(source_ty, ptr, idx_vals)); + Value *ret = builder_->CreateGEP(source_ty, ptr, idx_vals); set_value(gep, idx, ret); }); } @@ -244,7 +244,7 @@ void generator::visit_icmp_inst(ir::icmp_inst* icmp) { ir::cmp_pred_t pred = icmp->get_pred(); Value *lhs = get_value(icmp->get_operand(0), idx); Value *rhs = get_value(icmp->get_operand(1), idx); - Value *ret = builder_->Insert(CmpInst::Create(Instruction::ICmp, llvm_pred(pred), lhs, rhs)); + Value *ret = builder_->CreateICmp(llvm_pred(pred), lhs, rhs); set_value(icmp, idx, ret); }); } @@ -254,7 +254,7 @@ void generator::visit_fcmp_inst(ir::fcmp_inst* fcmp) { ir::cmp_pred_t pred = fcmp->get_pred(); Value *lhs = get_value(fcmp->get_operand(0), idx); Value *rhs = get_value(fcmp->get_operand(1), idx); - Value *ret = builder_->Insert(FCmpInst::Create(Instruction::FCmp, llvm_pred(pred), lhs, rhs)); + Value *ret = builder_->CreateFCmp(llvm_pred(pred), lhs, rhs); set_value(fcmp, idx, ret); }); } @@ -263,26 +263,26 @@ void generator::visit_cast_inst(ir::cast_inst* cast) { for_each(cast, [&](indices_t idx){ Value *arg = get_value(cast->get_operand(0), idx); Type *dst_ty = llvm_type(cast->get_type()->get_scalar_ty(), *ctx_); - Value *ret = builder_->Insert(CastInst::Create(llvm_op(cast->get_op()), arg, dst_ty)); + Value *ret = builder_->CreateCast(llvm_op(cast->get_op()), arg, dst_ty); set_value(cast, idx, ret); }); } void generator::visit_return_inst(ir::return_inst* rr) { ir::value *ret_val = rr->get_return_value(); - builder_->Insert(ReturnInst::Create(*ctx_, ret_val ? vmap_.at(ret_val) : nullptr)); + builder_->CreateRet(ret_val ? 
vmap_.at(ret_val) : nullptr);
 }
 
 void generator::visit_cond_branch_inst(ir::cond_branch_inst* br) {
   BasicBlock *true_dest  = (BasicBlock*)vmap_.at(br->get_true_dest());
   BasicBlock *false_dest = (BasicBlock*)vmap_.at(br->get_false_dest());
   Value *cond = vmap_.at(br->get_cond());
-  builder_->Insert(BranchInst::Create(true_dest, false_dest, cond));
+  builder_->CreateCondBr(cond, true_dest, false_dest);
 }
 
 void generator::visit_uncond_branch_inst(ir::uncond_branch_inst* br) {
   BasicBlock *dest = (BasicBlock*)vmap_.at(br->get_dest());
-  builder_->Insert(BranchInst::Create(dest));
+  builder_->CreateBr(dest);
 }
 
 
@@ -757,7 +757,7 @@ void generator::visit_select_inst(ir::select_inst* select) {
     Value *pred = get_value(select->get_operand(0), idx);
     Value *if_value = get_value(select->get_operand(1), idx);
     Value *else_value = get_value(select->get_operand(2), idx);
-    Value *ret = builder_->Insert(SelectInst::Create(pred, if_value, else_value));
+    Value *ret = builder_->CreateSelect(pred, if_value, else_value);
     set_value(select, idx, ret);
   });
 
From b43454c9b7c71e39097260767afc6df794c92563 Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Thu, 17 Oct 2019 22:38:41 -0400
Subject: [PATCH 451/494] [codegen] [membar] views do not write to shared memory

---
 lib/codegen/analysis/layout.cc          | 5 ++++-
 lib/codegen/selection/machine_layout.cc | 5 -----
 lib/codegen/transform/membar.cc         | 9 +++++----
 lib/driver/module.cc                    | 1 +
 lib/runtime/function.cc                 | 1 +
 tests/unit/dot.cc                       | 2 +-
 6 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc
index dc43f8ea6..83ee2086d 100644
--- a/lib/codegen/analysis/layout.cc
+++ b/lib/codegen/analysis/layout.cc
@@ -291,6 +291,7 @@ layout_shared_t::layout_shared_t(const layout_t *arg,
   }
   std::vector col = {0, 1};
   std::vector row = {1, 0};
+  order = col;
   bool is_nonhmma_dot_a = dot_a && !hmma_dot_a;
   bool is_nonhmma_dot_b = dot_b && !hmma_dot_b;
   if(is_nonhmma_dot_a)
@@ -329,7 +330,9 @@ void layout::create(size_t id, const std::vector& values) {
     return x->get_type()->get_tile_ranks1() <
            y->get_type()->get_tile_ranks1();
   };
-  ir::value *largest = *std::max_element(values.begin(), values.end(), cmp);
+  std::vector lvalue = values;
+  std::remove_if(lvalue.begin(), lvalue.end(), [&](ir::value* v) { return dynamic_cast(v); });
+  ir::value *largest = *std::max_element(lvalue.begin(), lvalue.end(), cmp);
   const auto& axes = axes_->get(largest);
   const auto& shapes = largest->get_type()->get_tile_shapes();
   auto it_cts = std::find_if(values.begin(), values.end(), [](ir::value* v) {
diff --git a/lib/codegen/selection/machine_layout.cc b/lib/codegen/selection/machine_layout.cc
index ac242c815..cf5d9bf33 100644
--- a/lib/codegen/selection/machine_layout.cc
+++ b/lib/codegen/selection/machine_layout.cc
@@ -77,12 +77,7 @@ machine_layout_shared_t::machine_layout_shared_t(Module *mod, Builder *builder,
                                                  std::map& tmap)
   : mod_(mod), builder_(builder), tgt_(tgt), alloc_(alloc), sh_mem_ptr_(sh_mem_ptr), layout_(layout), vmap_(vmap), tmap_(tmap) {
 
-  auto order = layout_->order;
-  auto shapes = layout_->shapes;
-  shapes[order[0]] += layout_->pad;
-
   Type* ty = llvm_type(layout_->ty, builder_->getContext());
-
   PointerType *ptr_ty = ty->getPointerTo(sh_mem_ptr_->getType()->getPointerAddressSpace());
   // double-buffered
   if(layout_->double_buffer) {
diff --git a/lib/codegen/transform/membar.cc b/lib/codegen/transform/membar.cc
index 8c2f3d909..9dd793294 100644
--- a/lib/codegen/transform/membar.cc
+++ b/lib/codegen/transform/membar.cc
@@ -52,7 +52,7 @@ 
void membar::get_read_intervals(ir::instruction *i, interval_vec_t &res){ } void membar::get_written_intervals(ir::instruction *i, interval_vec_t &res){ - if(!dynamic_cast(i)) + if(!dynamic_cast(i) && !dynamic_cast(i)) add_reference(i, res); } @@ -99,7 +99,7 @@ std::pairget_all()){ if(x.second->double_buffer){ auto info = *x.second->double_buffer; - safe_war.insert(info.first); - safe_war.insert(info.latch); + for(ir::value *v: x.second->values) + if(v != info.phi) + safe_war.insert(v); } } diff --git a/lib/driver/module.cc b/lib/driver/module.cc index f29c830f4..4fdc8ee90 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -242,6 +242,7 @@ cu_module::cu_module(driver::context * context, std::unique_ptr ll cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ // std::cout << source << std::endl; +// exit(EXIT_FAILURE); cu_context::context_switcher ctx(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 115b739d9..444a113db 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -243,6 +243,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c return std::unique_ptr(); barriers.run(module); // ir::print(module, std::cout); +// exit(EXIT_FAILURE); isel.visit(module, *llvm); // return binary std::unique_ptr res(driver::module::create(context, std::move(llvm))); diff --git a/tests/unit/dot.cc b/tests/unit/dot.cc index af7b509e0..38851b54d 100644 --- a/tests/unit/dot.cc +++ b/tests/unit/dot.cc @@ -13,7 +13,7 @@ int main() { for(int TM: std::vector{32, 64}) for(int TN: std::vector{32, 64}) for(int TK: std::vector{8}) - for(int nwarps: std::vector{1, 4}) + for(int nwarps: std::vector{8}) for(bool AT: std::array{false, true}) for(bool BT: std::array{false, true}){ configs.push_back(config_t{FLOAT, AT, BT, 128, 128, 128, TM, TN, TK, nwarps}); From cfde3dd7665af2059e0c80df8fd887b548c86c40 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 18 Oct 2019 13:42:15 -0400 Subject: [PATCH 452/494] [codegen] [layout] fixed padding issue for row-major HMMA --- lib/codegen/analysis/layout.cc | 12 ++++++------ lib/codegen/selection/machine_layout.cc | 1 - tests/bench/dot.cc | 2 +- tests/common/dot.h | 2 +- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc index 83ee2086d..3d2296aae 100644 --- a/lib/codegen/analysis/layout.cc +++ b/lib/codegen/analysis/layout.cc @@ -279,6 +279,8 @@ layout_shared_t::layout_shared_t(const layout_t *arg, // order if(arg->type == SCANLINE) order = arg->order; + else + order = arg->order; ir::value* dot_a = nullptr; ir::value* dot_b = nullptr; ir::value* hmma_dot_a = nullptr; @@ -291,7 +293,6 @@ layout_shared_t::layout_shared_t(const layout_t *arg, } std::vector col = {0, 1}; std::vector row = {1, 0}; - order = col; bool is_nonhmma_dot_a = dot_a && !hmma_dot_a; bool is_nonhmma_dot_b = dot_b && !hmma_dot_b; if(is_nonhmma_dot_a) @@ -303,21 +304,20 @@ layout_shared_t::layout_shared_t(const layout_t *arg, pad = 0; if(hmma_dot_a){ bool row = is_trans(hmma_dot_a) ^ order[0] == 1; - pad = 24 - shapes[row ? 0: 1] % 32; + pad = 24 - shapes[row ? order[0] : order[1]] % 32; } else if(hmma_dot_b){ bool row = is_trans(hmma_dot_b) ^ order[0] == 1; - pad = 24 - shapes[row ? 1 : 0] % 32; + pad = 24 - shapes[row ? 
order[1] : order[0]] % 32; } else if(order != arg->order) { pad = 4; } + shapes[order[0]] += pad; // size - auto shape = this->shapes; - shape[order[0]] += pad; size = ty->get_primitive_size_in_bits() / 8; - for(auto s: shape) + for(auto s: shapes) size *= s; if(double_buffer) size *= 2; diff --git a/lib/codegen/selection/machine_layout.cc b/lib/codegen/selection/machine_layout.cc index cf5d9bf33..1c026bfc8 100644 --- a/lib/codegen/selection/machine_layout.cc +++ b/lib/codegen/selection/machine_layout.cc @@ -108,7 +108,6 @@ machine_layout_shared_t::machine_layout_shared_t(Module *mod, Builder *builder, tile* machine_layout_shared_t::create(ir::value *v) { auto order = layout_->order; auto shapes = layout_->shapes; - shapes[order[0]] += layout_->pad; Type* ty = llvm_type(layout_->ty, builder_->getContext()); // double-buffered if(layout_->double_buffer) { diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index c87e1c938..927f0044b 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -34,7 +34,7 @@ int main() { for(const auto& c: configs){ std::tie(ord, AT, BT, M, N, K) = c; std::cout << "// " << c << std::flush; - for(auto perf: bench_dot(stream, FLOAT, AT, BT, M, N, K, ord, ord)) + for(auto perf: bench_dot(stream, HALF, AT, BT, M, N, K, ord, ord)) std::cout << ", " << perf << std::flush; std::cout << std::endl; } diff --git a/tests/common/dot.h b/tests/common/dot.h index e87470edb..d9374de7a 100644 --- a/tests/common/dot.h +++ b/tests/common/dot.h @@ -111,7 +111,7 @@ bool triton_dot(drv::stream* stream, bool AT, bool BT, if(mode == BENCH) { opt.defines.push_back({"TM", {"64", "128"}}); opt.defines.push_back({"TN", {"64", "128"}}); - opt.defines.push_back({"TK", {"8"}}); + opt.defines.push_back({"TK", {"8", "16"}}); opt.num_warps = {2, 4, 8}; } From 50efd9c82f70d2e4af9f9a1857e74d477e1472a8 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 18 Oct 2019 14:54:26 -0400 Subject: [PATCH 453/494] [codegen] [liveness] bugfix in live range computation --- lib/codegen/analysis/liveness.cc | 6 +++--- lib/runtime/function.cc | 3 +-- tests/bench/copy2d.cc | 20 ++++++++++---------- tests/bench/dot.cc | 2 +- tests/unit/dot.cc | 4 ++-- 5 files changed, 17 insertions(+), 18 deletions(-) diff --git a/lib/codegen/analysis/liveness.cc b/lib/codegen/analysis/liveness.cc index 707cbaa23..2d4162ff4 100644 --- a/lib/codegen/analysis/liveness.cc +++ b/lib/codegen/analysis/liveness.cc @@ -37,16 +37,16 @@ void liveness::run(ir::module &mod) { if(layout->type != SHARED) continue; // users - std::set users; + std::set users; for(ir::value *v: layout->values){ - users.insert(v); for(ir::user *u: v->get_users()) users.insert(u); } // compute intervals unsigned start = INT32_MAX; unsigned end = 0; - for(ir::value *u: users){ + for(ir::user *u: users) + if(indices.find(u) != indices.end()){ start = std::min(start, indices.at(u)); end = std::max(end, indices.at(u)); } diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 444a113db..02cceea63 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -231,11 +231,10 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c align.run(module); dce.run(module); reassociate.run(module); - dce.run(module); cts.run(module); + dce.run(module); align.run(module); axes.run(module); -// ir::print(module, std::cout); layouts.run(module); liveness.run(module); allocation.run(module); diff --git a/tests/bench/copy2d.cc b/tests/bench/copy2d.cc index 22006aae7..f1252797e 100644 --- a/tests/bench/copy2d.cc +++ b/tests/bench/copy2d.cc @@ 
-11,17 +11,17 @@ int main() { // shapes to benchmark typedef std::tuple, std::vector, std::vector> config_t; std::vector configs = { -// {{4096*4096}, {0}, {0}}, + {{4096*4096}, {0}, {0}}, {{4096, 4096}, {0, 1}, {1, 0}}, -// {{4096, 4096}, {0, 1}, {1, 0}}, -// {{4096, 4096}, {1, 0}, {0, 1}}, -// {{4096, 4096}, {0, 1}, {0, 1}}, -// {{256, 256, 256}, {0, 1, 2}, {0, 1, 2}}, -// {{256, 256, 256}, {0, 1, 2}, {0, 2, 1}}, -// {{256, 256, 256}, {1, 0, 2}, {1, 2, 0}}, -// {{256, 256, 256}, {1, 2, 0}, {1, 0, 2}}, -// {{256, 256, 256}, {2, 0, 1}, {0, 1, 2}}, -// {{256, 256, 256}, {2, 1, 0}, {0, 2, 1}} + {{4096, 4096}, {0, 1}, {1, 0}}, + {{4096, 4096}, {1, 0}, {0, 1}}, + {{4096, 4096}, {0, 1}, {0, 1}}, + {{256, 256, 256}, {0, 1, 2}, {0, 1, 2}}, + {{256, 256, 256}, {0, 1, 2}, {0, 2, 1}}, + {{256, 256, 256}, {1, 0, 2}, {1, 2, 0}}, + {{256, 256, 256}, {1, 2, 0}, {1, 0, 2}}, + {{256, 256, 256}, {2, 0, 1}, {0, 1, 2}}, + {{256, 256, 256}, {2, 1, 0}, {0, 2, 1}} }; // does the work std::vector shape; diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index 927f0044b..9857e9865 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -13,7 +13,7 @@ int main() { for(auto x: std::vector>{{false, false}, {false, true}, {true, false}, {true, true}}){ std::vector tmp = { - config_t{ord, x[0], x[1], 2048, 2048, 2048}, + config_t{ord, x[0], x[1], 4096, 4096, 4096}, // config_t{ord, x[0], x[1], 16, 2048, 2048}, // config_t{ord, x[0], x[1], 32, 2048, 2048}, // config_t{ord, x[0], x[1], 64, 2048, 2048}, diff --git a/tests/unit/dot.cc b/tests/unit/dot.cc index 38851b54d..02777fa4b 100644 --- a/tests/unit/dot.cc +++ b/tests/unit/dot.cc @@ -12,8 +12,8 @@ int main() { std::vector configs; for(int TM: std::vector{32, 64}) for(int TN: std::vector{32, 64}) - for(int TK: std::vector{8}) - for(int nwarps: std::vector{8}) + for(int TK: std::vector{16}) + for(int nwarps: std::vector{4}) for(bool AT: std::array{false, true}) for(bool BT: std::array{false, true}){ configs.push_back(config_t{FLOAT, AT, BT, 128, 128, 128, TM, TN, TK, nwarps}); From a76efd326d61c4cc70c38fa4c02a0b071550ab06 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 19 Oct 2019 14:47:16 -0400 Subject: [PATCH 454/494] [selection] [codegen] added reduction --- include/triton/codegen/analysis/layout.h | 15 ++ include/triton/codegen/instructions.h | 8 +- lib/codegen/analysis/layout.cc | 20 +++ lib/codegen/selection/generator.cc | 169 ++++++++++++----------- tests/bench/dot.cc | 4 +- tests/common/dot.h | 2 +- 6 files changed, 129 insertions(+), 89 deletions(-) diff --git a/include/triton/codegen/analysis/layout.h b/include/triton/codegen/analysis/layout.h index 70260542a..e0eee3a38 100644 --- a/include/triton/codegen/analysis/layout.h +++ b/include/triton/codegen/analysis/layout.h @@ -126,6 +126,18 @@ private: void create(size_t id, const std::vector& values); +// size_t shared_tmp_req(ir::instruction* i) { +// switch(i->get_id()) { +// case ir::INST_REDUCE: { +// ir::reduce_inst *red = (ir::reduce_inst*)i; +// ir::type *ty = red->get_type(); + + +// } +// default: return 0; +// } +// } + public: // constructor layout(analysis::axes *axes, analysis::align *align, size_t num_warps); @@ -134,8 +146,10 @@ public: unsigned layout_of(ir::value *value) const; const std::vector& values_of(unsigned id) const; size_t num_layouts() const; + const layout_t* get(size_t id) const; const layout_t* get(ir::value *v) const; std::map &get_all(); + size_t tmp(ir::instruction* i); // execution void run(ir::module &mod); @@ -148,6 +162,7 @@ private: std::map groups_; 
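// The tmp_ map added below pairs with the tmp() accessor declared above:
// when layout::run() encounters a reduce instruction (see the layout.cc hunk
// further down), it registers a fresh shared scratch layout for the partial
// results and records its id here. A hedged sketch of the intended lookup at
// code-generation time -- usage inferred from this header, not quoted from
// the generator:
//
//   ir::reduce_inst* red = ...;               // reduction being lowered
//   const layout_t* scratch = get(tmp(red));  // shared tile for partials
//
// so the generator can materialize the scratch tile without recomputing its
// shape from the reduced operand.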
std::map> values_; std::map layouts_; + std::map tmp_; }; } diff --git a/include/triton/codegen/instructions.h b/include/triton/codegen/instructions.h index 2e5d6148f..c42abee4a 100644 --- a/include/triton/codegen/instructions.h +++ b/include/triton/codegen/instructions.h @@ -6,8 +6,12 @@ #include namespace triton{ -namespace codegen{ +namespace ir{ +class instruction; +} + +namespace codegen{ enum storage_info_t { NONE, @@ -63,7 +67,6 @@ static const std::map storage_info = { { ir::INST_RETURN, {NONE, {}}}, { ir::INST_UNCOND_BRANCH, {NONE, {}}}, { ir::INST_COND_BRANCH, {NONE, {REPLICATED}}}, - // intrinsics { ir::INST_COPY_TO_SHARED, {SHARED, {DISTRIBUTED}}}, { ir::INST_COPY_FROM_SHARED, {DISTRIBUTED, {SHARED}}}, @@ -73,6 +76,7 @@ static const std::map storage_info = { { ir::INST_MAKE_RANGE, {DISTRIBUTED, {}}} }; + } } diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc index 3d2296aae..1066a5cae 100644 --- a/lib/codegen/analysis/layout.cc +++ b/lib/codegen/analysis/layout.cc @@ -76,6 +76,10 @@ bool is_hmma_c(ir::value *v){ return result; } +const layout_t* layout::get(size_t id) const { + return layouts_.at(id); +} + const layout_t* layout::get(ir::value *v) const { return layouts_.at(groups_.at(v)); } @@ -84,6 +88,10 @@ std::map& layout::get_all() { return layouts_; } +size_t layout::tmp(ir::instruction* i) { + return tmp_.at(i); +} + void extract_io_use(ir::value *v, std::set& result) { for(ir::user* u: v->get_users()){ auto i = dynamic_cast(u); @@ -323,6 +331,7 @@ layout_shared_t::layout_shared_t(const layout_t *arg, size *= 2; } + // layout factory method void layout::create(size_t id, const std::vector& values) { auto it_hmma_c = std::find_if(values.begin(), values.end(), &is_hmma_c); @@ -364,6 +373,17 @@ void layout::run(ir::module &mod) { // create layouts for(const auto& x: values_) create(x.first, x.second); + + // create temporaries + size_t id = values_.size(); + ir::for_each_instruction(mod, [this, &id](ir::instruction* i) { + if(auto *red = dynamic_cast(i)) { + id++; + ir::value *arg = red->get_operand(0); + layouts_[id] = new layout_shared_t(get(arg), axes_->get(arg), arg->get_type()->get_tile_shapes(), {red}, red->get_type()->get_scalar_ty(), id, align_); + tmp_[red] = id; + } + }); } } diff --git a/lib/codegen/selection/generator.cc b/lib/codegen/selection/generator.cc index 41398dc76..8fbdbeded 100644 --- a/lib/codegen/selection/generator.cc +++ b/lib/codegen/selection/generator.cc @@ -750,96 +750,97 @@ void generator::visit_sqrt_inst(ir::sqrt_inst* sqt) { } void generator::visit_reduce_inst(ir::reduce_inst* x) { - throw std::runtime_error("not implemented"); -// std::map partial; -// ir::value *arg = x->get_operand(0); -// distributed_tile* arg_tile = (distributed_tile*)tmap_.at(arg); -// ir::reduce_inst::op_t op = x->get_op(); -// auto accumulate = [&](Value* x, Value *y) -> Value* { -// switch(op) { -// case ir::reduce_inst::ADD: return builder_->CreateAdd(x, y); -// case ir::reduce_inst::SUB: return builder_->CreateSub(x, y); -// case ir::reduce_inst::MAX: return builder_->CreateMaximum(x, y); -// case ir::reduce_inst::MIN: return builder_->CreateMinimum(x, y); -// case ir::reduce_inst::FADD: return builder_->CreateFAdd(x, y); -// case ir::reduce_inst::FSUB: return builder_->CreateFSub(x, y); -// case ir::reduce_inst::FMAX: return builder_->CreateSelect(builder_->CreateFCmpOGT(x, y), x, y); -// case ir::reduce_inst::FMIN: return builder_->CreateSelect(builder_->CreateFCmpOLT(x, y), x, y); -// default: break; -// } -// assert(false); -// 
return nullptr; -// }; + std::map partial; + ir::value *arg = x->get_operand(0); + distributed_tile* arg_tile = (distributed_tile*)tmap_.at(arg); + ir::reduce_inst::op_t op = x->get_op(); + auto accumulate = [&](Value* x, Value *y) -> Value* { + switch(op) { + case ir::reduce_inst::ADD: return builder_->CreateAdd(x, y); + case ir::reduce_inst::SUB: return builder_->CreateSub(x, y); + case ir::reduce_inst::MAX: return builder_->CreateMaximum(x, y); + case ir::reduce_inst::MIN: return builder_->CreateMinimum(x, y); + case ir::reduce_inst::FADD: return builder_->CreateFAdd(x, y); + case ir::reduce_inst::FSUB: return builder_->CreateFSub(x, y); + case ir::reduce_inst::FMAX: return builder_->CreateSelect(builder_->CreateFCmpOGT(x, y), x, y); + case ir::reduce_inst::FMIN: return builder_->CreateSelect(builder_->CreateFCmpOLT(x, y), x, y); + default: break; + } + assert(false); + return nullptr; + }; -// unsigned axis = x->get_axis(); + // reduce within thread + unsigned axis = x->get_axis(); + arg_tile->for_each([&](indices_t idx) { + indices_t pidx = idx; + pidx[axis] = builder_->getInt32(0); + Value *current = arg_tile->get_value(idx); + // current partial result is not initialized -- create + if(partial.find(pidx) == partial.end()) + partial[pidx] = current; + // current partial result is initialized -- accumulate + else + partial[pidx] = accumulate(partial[pidx], current); + }); -// // reduce within thread -// arg_tile->for_each([&](indices_t idx) { -// indices_t pidx = idx; -// pidx[axis] = builder_->getInt32(0); -// Value *current = arg_tile->get_value(idx); -// // current partial result is not initialized -- create -// if(partial.find(pidx) == partial.end()) -// partial[pidx] = current; -// // current partial result is initialized -- accumulate -// else -// partial[pidx] = accumulate(partial[pidx], current); -// }); + // depth + unsigned shape_ax = arg->get_type()->get_tile_shapes()[axis]; + unsigned per_thread = arg_tile->axis(axis).values.size(); + unsigned depth = shape_ax / per_thread; -// // depth -// unsigned shape_ax = arg->get_type()->get_tile_shapes()[axis]; -// unsigned per_thread = arg_tile->axis(axis).values.size(); -// unsigned depth = shape_ax / per_thread; + // shapes + auto shared_shapes = arg_tile->get_shapes(); + shared_shapes[axis] = depth; -// // shapes -// auto shared_shapes = arg_tile->get_shapes(); -// shared_shapes[axis] = depth; + // reduce within blocks + machine_layout_t *slayout = machine_layouts_.at(layouts_->get(layouts_->tmp(x))); + shared_tile *stile = (shared_tile*)slayout->create(x); -// // reduce within blocks -// unsigned addr_space = sh_mem_ptr_->getType()->getPointerAddressSpace(); -// Type *res_ty = builder_->getFloatTy(); -// Value *base_ptr = builder_->CreateBitCast(sh_mem_ptr_, PointerType::get(res_ty, addr_space)); -// for(auto& x: partial) { -// // current element being computed -// Value *lane = axes_.at(a_axes_->get(arg, axis)).thread_id; -// Value *&result = x.second; -// indices_t write_idx = x.first; -// write_idx[axis] = lane; -// // shared memory write pointer -// Value *write_offset = shared_tile::shared_offset(*builder_, shared_shapes, write_idx); -// Value *write_ptr = builder_->CreateGEP(base_ptr, write_offset); -// // initialize shared memory -// tgt_->add_barrier(*mod_, *builder_); -// builder_->CreateStore(result, write_ptr); -// // build result -// for(unsigned i = depth/2; i > 0; i >>= 1){ -// // current indices -// indices_t current(write_idx.size(), builder_->getInt32(0)); -// current[axis] = builder_->getInt32(i); -// // 
shared memory offset -// Value *read_offset = shared_tile::shared_offset(*builder_, shared_shapes, current); -// Value *is_active = builder_->CreateICmpULT(lane, builder_->getInt32(i)); -// read_offset = builder_->CreateSelect(is_active, read_offset, builder_->getInt32(0)); -// // shared memory read pointer -// Value *read_ptr = builder_->CreateGEP(write_ptr, read_offset); -// tgt_->add_barrier(*mod_, *builder_); -// Value *next = builder_->CreateLoad(read_ptr); -// // accumulate -// result = accumulate(result, next); -// // write back -// builder_->CreateStore(result, write_ptr); -// } -// } -// tgt_->add_barrier(*mod_, *builder_); + unsigned addr_space = sh_mem_ptr_->getType()->getPointerAddressSpace(); + Type *res_ty = builder_->getFloatTy(); + Value *base_ptr = builder_->CreateBitCast(sh_mem_ptr_, PointerType::get(res_ty, addr_space)); + for(auto& x: partial) { + // current element being computed + Value *lane = axes_.at(a_axes_->get(arg, axis)).thread_id; + Value *&result = x.second; + indices_t write_idx = x.first; + write_idx[axis] = lane; + // shared memory write pointer + Value *write_offset = shared_tile::shared_offset(*builder_, stile->get_shapes(), stile->get_perm(), stile->get_order(), write_idx); + Value *write_ptr = builder_->CreateGEP(base_ptr, write_offset); + // initialize shared memory + tgt_->add_barrier(mod_, *builder_); + builder_->CreateStore(result, write_ptr); + // build result + for(unsigned i = depth/2; i > 0; i >>= 1){ + // current indices + indices_t current(write_idx.size(), builder_->getInt32(0)); + current[axis] = builder_->getInt32(i); + // shared memory offset + Value *read_offset = shared_tile::shared_offset(*builder_, stile->get_shapes(), stile->get_perm(), stile->get_order(), current); + Value *is_active = builder_->CreateICmpULT(lane, builder_->getInt32(i)); + read_offset = builder_->CreateSelect(is_active, read_offset, builder_->getInt32(0)); + // shared memory read pointer + Value *read_ptr = builder_->CreateGEP(write_ptr, read_offset); + tgt_->add_barrier(mod_, *builder_); + Value *next = builder_->CreateLoad(read_ptr); + // accumulate + result = accumulate(result, next); + // write back + builder_->CreateStore(result, write_ptr); + } + } + tgt_->add_barrier(mod_, *builder_); -// distributed_tile* x_tile = (distributed_tile*)tmap_.at(x); -// x_tile->for_each([&](indices_t idx) { -// indices_t red_idx = idx; -// red_idx.insert(red_idx.begin() + axis, builder_->getInt32(0)); -// Value *read_offset = shared_tile::shared_offset(*builder_, shared_shapes, red_idx); -// Value *read_ptr = builder_->CreateGEP(base_ptr, read_offset); -// x_tile->set_value(idx, builder_->CreateLoad(read_ptr)); -// }); + distributed_tile* x_tile = (distributed_tile*)tmap_.at(x); + x_tile->for_each([&](indices_t idx) { + indices_t red_idx = idx; + red_idx.insert(red_idx.begin() + axis, builder_->getInt32(0)); + Value *read_offset = shared_tile::shared_offset(*builder_, stile->get_shapes(), stile->get_perm(), stile->get_order(), red_idx); + Value *read_ptr = builder_->CreateGEP(base_ptr, read_offset); + x_tile->set_value(idx, builder_->CreateLoad(read_ptr)); + }); } void generator::visit_select_inst(ir::select_inst* select) { diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index 9857e9865..c87e1c938 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -13,7 +13,7 @@ int main() { for(auto x: std::vector>{{false, false}, {false, true}, {true, false}, {true, true}}){ std::vector tmp = { - config_t{ord, x[0], x[1], 4096, 4096, 4096}, + config_t{ord, x[0], x[1], 2048, 
2048, 2048}, // config_t{ord, x[0], x[1], 16, 2048, 2048}, // config_t{ord, x[0], x[1], 32, 2048, 2048}, // config_t{ord, x[0], x[1], 64, 2048, 2048}, @@ -34,7 +34,7 @@ int main() { for(const auto& c: configs){ std::tie(ord, AT, BT, M, N, K) = c; std::cout << "// " << c << std::flush; - for(auto perf: bench_dot(stream, HALF, AT, BT, M, N, K, ord, ord)) + for(auto perf: bench_dot(stream, FLOAT, AT, BT, M, N, K, ord, ord)) std::cout << ", " << perf << std::flush; std::cout << std::endl; } diff --git a/tests/common/dot.h b/tests/common/dot.h index ba6447162..ddbb1c77a 100644 --- a/tests/common/dot.h +++ b/tests/common/dot.h @@ -111,7 +111,7 @@ bool triton_dot(drv::stream* stream, bool AT, bool BT, if(mode == BENCH) { opt.defines.push_back({"TM", {"64", "128"}}); opt.defines.push_back({"TN", {"64", "128"}}); - opt.defines.push_back({"TK", {"8", "16"}}); + opt.defines.push_back({"TK", {"8"}}); opt.num_warps = {2, 4, 8}; } From 23db500edff7af81cc287ecf0d0ff5853e300dda Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 19 Oct 2019 16:53:48 -0400 Subject: [PATCH 455/494] [tests] [common] added reduce.h to common headers --- lib/codegen/analysis/layout.cc | 11 ++- lib/codegen/selection/generator.cc | 12 +-- tests/common/reduce.h | 148 +++++++++++++++++++++++++++++ tests/common/src/reduce.h | 6 +- tests/unit/reduce.cc | 68 +------------ 5 files changed, 164 insertions(+), 81 deletions(-) create mode 100644 tests/common/reduce.h diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc index 1066a5cae..eda55e451 100644 --- a/lib/codegen/analysis/layout.cc +++ b/lib/codegen/analysis/layout.cc @@ -380,7 +380,16 @@ void layout::run(ir::module &mod) { if(auto *red = dynamic_cast(i)) { id++; ir::value *arg = red->get_operand(0); - layouts_[id] = new layout_shared_t(get(arg), axes_->get(arg), arg->get_type()->get_tile_shapes(), {red}, red->get_type()->get_scalar_ty(), id, align_); + unsigned axis = red->get_axis(); + // shape + auto shapes = arg->get_type()->get_tile_shapes(); + unsigned shape_ax = shapes[axis]; + const layout_t *layout = get(arg); + unsigned per_thread = layout->nts[axis]; + unsigned depth = shape_ax / per_thread; + shapes[axis] = depth; + // create layout + layouts_[id] = new layout_shared_t(layout, axes_->get(arg), shapes, {red}, red->get_type()->get_scalar_ty(), id, align_); tmp_[red] = id; } }); diff --git a/lib/codegen/selection/generator.cc b/lib/codegen/selection/generator.cc index 8fbdbeded..d9d1e1cec 100644 --- a/lib/codegen/selection/generator.cc +++ b/lib/codegen/selection/generator.cc @@ -784,18 +784,10 @@ void generator::visit_reduce_inst(ir::reduce_inst* x) { partial[pidx] = accumulate(partial[pidx], current); }); - // depth - unsigned shape_ax = arg->get_type()->get_tile_shapes()[axis]; - unsigned per_thread = arg_tile->axis(axis).values.size(); - unsigned depth = shape_ax / per_thread; - - // shapes - auto shared_shapes = arg_tile->get_shapes(); - shared_shapes[axis] = depth; - // reduce within blocks machine_layout_t *slayout = machine_layouts_.at(layouts_->get(layouts_->tmp(x))); shared_tile *stile = (shared_tile*)slayout->create(x); + unsigned depth = stile->get_shapes()[axis]; unsigned addr_space = sh_mem_ptr_->getType()->getPointerAddressSpace(); Type *res_ty = builder_->getFloatTy(); @@ -832,7 +824,7 @@ void generator::visit_reduce_inst(ir::reduce_inst* x) { } } tgt_->add_barrier(mod_, *builder_); - + // write back distributed_tile* x_tile = (distributed_tile*)tmap_.at(x); x_tile->for_each([&](indices_t idx) { indices_t red_idx = idx; diff 
--git a/tests/common/reduce.h b/tests/common/reduce.h
new file mode 100644
index 000000000..86e066638
--- /dev/null
+++ b/tests/common/reduce.h
@@ -0,0 +1,148 @@
+#include
+#include
+#include
+#include
+#include "triton/driver/backend.h"
+#include "triton/driver/stream.h"
+#include "triton/tools/bench.hpp"
+#include "triton/external/half.hpp"
+#include "triton/runtime/function.h"
+#include "src/reduce.h"
+#include "util.h"
+
+namespace drv = triton::driver;
+namespace rt = triton::runtime;
+
+template
+void cc_reduce_nd(std::vector &y, const std::vector &x, reduce_op_t op, size_t axis, const std::vector& shapes) {
+  assert(axis <= shapes.size() - 1);
+  // remove shape at index axis to get outer dimensions
+  std::vector outer = shapes;
+  outer.erase(outer.begin() + axis);
+  // retrieve shape at index axis to get inner dimension
+  int inner = shapes[axis];
+  // accumulation function
+  auto acc = get_accumulator(op);
+  // iterate over outer dimensions
+  _loop_nest(outer, [&](const std::vector& y_idx) {
+    T ret = 0;
+    auto x_idx = y_idx;
+    x_idx.insert(x_idx.begin() + axis, 0);
+    // accumulate over inner dimensions
+    for(int z = 0; z < inner; z++){
+      x_idx[axis] = z;
+      ret = acc(ret, x[offset(x_idx, shapes)]);
+    }
+    y[offset(y_idx, outer)] = ret;
+  });
+}
+
+enum run_mode_t {
+  BENCH,
+  TEST
+};
+
+void triton_reduce_nd(drv::stream* stream, const std::vector& shape,
+                      int axis, reduce_op_t op,
+                      const std::vector& x_order, const std::vector& y_order,
+                      std::vector> TS,
+                      run_mode_t mode, std::vector& bench, bool &test) {
+  typedef float NumericT;
+  std::string ty = "float";
+  size_t dtsize = sizeof(NumericT);
+  drv::context* context = stream->context();
+  size_t axy = (axis == 0) ? 1 : 0;
+
+  // rank
+  size_t rank = shape.size();
+  // size
+  size_t size = 1;
+  for(int32_t d: shape)
+    size *= d;
+  std::vector shapename = {"S0", "S1", "S2"};
+  // strides for x
+  std::vector x_strides = {"1"};
+  for(size_t d = 0; d < rank - 1; d++)
+    x_strides.push_back(x_strides[d] + " * " + shapename[x_order[d]]);
+  // strides for y
+  std::vector y_strides = {"1"};
+  for(size_t d = 0; d < rank - 1; d++)
+    y_strides.push_back(y_strides[d] + " * " + shapename[y_order[d]]);
+
+  // create inputs
+  auto dx = std::unique_ptr(drv::buffer::create(context, size*dtsize));
+  auto dy = std::unique_ptr(drv::buffer::create(context, size*dtsize));
+  // create options
+  rt::function::options_space_t opt;
+
+  // type
+  opt.defines.push_back({"TYPE", {ty}});
+  // x strides
+  for(size_t d = 0; d < rank; d++)
+    opt.defines.push_back({"STRIDE_XS" + std::to_string(x_order[d]), {x_strides[d]}});
+  // y strides
+  for(size_t d = 0; d < rank; d++)
+    opt.defines.push_back({"STRIDE_YS" + std::to_string(y_order[d]), {y_strides[d]}});
+  if(TS.empty())
+    TS = tile_nd(rank);
+  // tile size
+  for(size_t d = 0; d < rank; d++)
+    opt.defines.push_back({"TS" + std::to_string(d), TS[d]});
+  // non-reduced axis
+  std::string RY = (axis == 0) ? "rn" : "rm";
+  opt.defines.push_back({"TY", {std::to_string(shape[axy])}});
+  opt.defines.push_back({"RY", {RY}});
+  // reduction broadcasting
+  std::string RED = "";
+  for(int n = 0; n < 2; n++){
+    if(n > 0)
+      RED += ", ";
+    RED += (n==axis) ? 
to_str(op) : ":"; + } + opt.defines.push_back({"RED", {RED}}); + + opt.num_warps = {4}; + + // kernel + rt::function function(src::reduce2d, opt); + + // grid + std::vector args = {&*dx, &*dy}; + for(int32_t d: shape) + args.push_back(d); + args.push_back(shape[0]); + std::vector ts = {"TS0", "TS1", "TS2"}; + auto grid = grid_nd(shape, ts); + + // metrics + if(mode == BENCH){ + auto gbps = [&](double ns) { return 2 * size * dtsize / (ns * 1e-9) * 1e-9; }; + double triton_ns = triton::tools::bench([&]() { function(args, grid, stream);}, stream); + bench.push_back(gbps(triton_ns)); + } + + // test triton + if(mode == TEST){ + std::vector hy(shape[axy]); + std::vector ry(shape[axy]); + std::vector hx(shape[0]*shape[1]); + init_zeros(hy); + init_rand(hx); + stream->write(&*dx, true, 0, hx); + function(args, grid, stream); + stream->synchronize(); + stream->read(&*dy, true, 0, hy); + cc_reduce_nd(ry, hx, op, axis, shape); + test = testing::diff(hy, ry); + } +} + +bool do_test(drv::stream* stream, std::vector shape, int axis, reduce_op_t op, int nwarp){ + std::vector bench; + bool test; + std::vector> TSS; + for(int32_t d: shape) + TSS.push_back({std::to_string(d)}); + triton_reduce_nd(stream, shape, axis, op, {0, 1}, {0, 1}, TSS, TEST, bench, test); + return test; +} diff --git a/tests/common/src/reduce.h b/tests/common/src/reduce.h index 3a77e960e..cc44ca5fc 100644 --- a/tests/common/src/reduce.h +++ b/tests/common/src/reduce.h @@ -16,9 +16,9 @@ void reduce2d(TYPE * X __noalias __readonly __aligned(16), int M, int N, int ldx) { int ridm = get_program_id(0); int ridn = get_program_id(1); - int rm[TM] = ridm * TM + 0 ... TM; - int rn[TN] = ridn * TN + 0 ... TN; - TYPE* px[TM, TN] = X + rm[:, newaxis] + rn[newaxis, :] * ldx; + int rm[TS0] = ridm * TS0 + 0 ... TS0; + int rn[TS1] = ridn * TS1 + 0 ... TS1; + TYPE* px[TS0, TS1] = X + rm[:, newaxis] + rn[newaxis, :] * ldx; TYPE* py[TY] = Y + RY; *py = (*px)[RED]; } diff --git a/tests/unit/reduce.cc b/tests/unit/reduce.cc index 3c3754133..0ee43cbc5 100644 --- a/tests/unit/reduce.cc +++ b/tests/unit/reduce.cc @@ -8,76 +8,10 @@ #include "triton/tools/bench.hpp" #include "triton/external/half.hpp" #include "triton/runtime/function.h" -#include "src/reduce.h" #include "cuda/cublas.h" +#include "reduce.h" #include "util.h" -namespace drv = triton::driver; -namespace rt = triton::runtime; - -template -void reduce_nd(std::vector &y, const std::vector &x, reduce_op_t op, size_t axis, const std::vector& shapes) { - assert(axis <= shapes.size() - 1); - // remove shape at index axis to get outer dimensions - std::vector outer = shapes; - outer.erase(outer.begin() + axis); - // retrieve shape at index axis to get inner dimension - int inner = shapes[axis]; - // accumualtion function - auto acc = get_accumulator(op); - // iterate over outer dimensions - _loop_nest(outer, [&](const std::vector& y_idx) { - T ret = 0; - auto x_idx = y_idx; - x_idx.insert(x_idx.begin() + axis, 0); - // accumulate over inner dimensions - for(int z = 0; z < inner; z++){ - x_idx[axis] = z; - ret = acc(ret, x[offset(x_idx, shapes)]); - } - y[offset(y_idx, outer)] = ret; - }); -} - - -bool do_test(drv::stream* stream, std::vector shape, int axis, reduce_op_t op, int nwarp){ - typedef float NumericT; - std::string ty = "float"; - size_t dt_nbytes = sizeof(NumericT); - drv::context* context = stream->context(); - size_t axy = (axis == 0) ? 1 : 0; - std::string RY = (axis == 0) ? 
"rn" : "rm"; - std::vector hy(shape[axy]); - std::vector ry(shape[axy]); - std::vector hx(shape[0]*shape[1]); - srand(0); - init_zeros(hy); - init_rand(hx); - auto dy = std::shared_ptr(drv::buffer::create(context, hy.size()*dt_nbytes)); - auto dx = std::shared_ptr(drv::buffer::create(context, hx.size()*dt_nbytes)); - stream->write(&*dy, true, 0, hy); - stream->write(&*dx, true, 0, hx); - rt::function::options_space_t opt; - opt.defines.push_back({"TYPE", {ty}}); - opt.defines.push_back({"TM", {std::to_string(shape[0])}}); - opt.defines.push_back({"TN", {std::to_string(shape[1])}}); - opt.defines.push_back({"TY", {std::to_string(shape[axy])}}); - opt.defines.push_back({"RY", {RY}}); - std::string RED = ""; - for(int n = 0; n < 2; n++){ - if(n > 0) - RED += ", "; - RED += (n==axis) ? to_str(op) : ":"; - } - opt.defines.push_back({"RED", {RED}}); - opt.num_warps = {nwarp}; - rt::function function(src::reduce2d, opt); - function({&*dx, &*dy, shape[0], shape[1], shape[0]}, grid2d(shape[0], shape[1]), stream); - stream->synchronize(); - stream->read(&*dy, true, 0, hy); - reduce_nd(ry, hx, op, axis, shape); - return testing::diff(hy, ry); -} int main() { // initialize default compute device From abe3fbb480efb25662d501d220cbab4eebde9995 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 20 Oct 2019 01:01:53 -0400 Subject: [PATCH 456/494] [test] [reduce] added test for 1D reduction --- lib/codegen/analysis/axes.cc | 2 + lib/codegen/selection/generator.cc | 5 +- lib/ir/instructions.cc | 6 +-- lib/lang/ast.cc | 7 ++- lib/lang/code_gen.cc | 3 ++ tests/common/reduce.h | 85 ++++++++++++++++-------------- tests/common/src/reduce.h | 25 ++++++--- tests/unit/reduce.cc | 1 + 8 files changed, 79 insertions(+), 55 deletions(-) diff --git a/lib/codegen/analysis/axes.cc b/lib/codegen/analysis/axes.cc index c446558a8..6271e224b 100644 --- a/lib/codegen/analysis/axes.cc +++ b/lib/codegen/analysis/axes.cc @@ -106,6 +106,8 @@ void axes::update_graph_elementwise(ir::instruction *i) { } void axes::update_graph_no_edge(ir::instruction *i) { + if(!i->get_type()->is_tile_ty()) + return; auto rank = i->get_type()->get_tile_rank(); for(unsigned d = 0; d < rank; d++) graph_.add_edge({i, d}, {i, d}); diff --git a/lib/codegen/selection/generator.cc b/lib/codegen/selection/generator.cc index d9d1e1cec..d7b9bc6a3 100644 --- a/lib/codegen/selection/generator.cc +++ b/lib/codegen/selection/generator.cc @@ -825,13 +825,12 @@ void generator::visit_reduce_inst(ir::reduce_inst* x) { } tgt_->add_barrier(mod_, *builder_); // write back - distributed_tile* x_tile = (distributed_tile*)tmap_.at(x); - x_tile->for_each([&](indices_t idx) { + for_each(x, [&](indices_t idx) { indices_t red_idx = idx; red_idx.insert(red_idx.begin() + axis, builder_->getInt32(0)); Value *read_offset = shared_tile::shared_offset(*builder_, stile->get_shapes(), stile->get_perm(), stile->get_order(), red_idx); Value *read_ptr = builder_->CreateGEP(base_ptr, read_offset); - x_tile->set_value(idx, builder_->CreateLoad(read_ptr)); + set_value(x, idx, builder_->CreateLoad(read_ptr)); }); } diff --git a/lib/ir/instructions.cc b/lib/ir/instructions.cc index 01f961402..0be815a51 100644 --- a/lib/ir/instructions.cc +++ b/lib/ir/instructions.cc @@ -651,10 +651,10 @@ type* reduce_inst::get_res_type(value *arg, unsigned axis) { ir::tile_type::tile_shapes_t shapes = arg->get_type()->get_tile_shapes(); shapes.erase(shapes.begin() + axis); type *scalar_ty = arg->get_type()->get_scalar_ty(); - if(shapes.size() == 0) + if(shapes.empty()) +// shapes.push_back(1); return 
scalar_ty; - else - return tile_type::get(scalar_ty, shapes); + return tile_type::get(scalar_ty, shapes); } reduce_inst::reduce_inst(value *arg, op_t op, unsigned axis, const std::string &name, instruction *next) diff --git a/lib/lang/ast.cc b/lib/lang/ast.cc index b0a50adc3..bf0c7e964 100644 --- a/lib/lang/ast.cc +++ b/lib/lang/ast.cc @@ -567,7 +567,7 @@ void BinaryOp::AssignOpTypeChecking() { // The other constraints are lefted to cast operator rhs_ = Expr::MayCast(rhs_, ScalarOrLikeTile(rhs_, lhsScalType)); type_ = lhs_->Type(); - Broadcast(this, lhs_, rhs_, type_); + rhs_ = UnaryOp::New(Token::CAST, rhs_, type_); } /* @@ -688,7 +688,10 @@ void UnaryOp::ReduceOpTypeChecking() { Error(this, "array expected for reduction operation"); auto shape = tileType->Shape(); shape.erase(shape.begin() + ax); - type_ = TileType::New(shape, tileType->Derived()); + if(shape.empty()) + type_ = tileType->Derived(); + else + type_ = TileType::New(shape, tileType->Derived()); } void UnaryOp::TransOpTypeChecking() { diff --git a/lib/lang/code_gen.cc b/lib/lang/code_gen.cc index 4bf9d4a2c..aee604b4a 100644 --- a/lib/lang/code_gen.cc +++ b/lib/lang/code_gen.cc @@ -467,6 +467,9 @@ ir::value* Generator::GenBroadcastOp(ir::value* src, ir::type* dst_ty) { return bld_->create_broadcast(src, dst_shapes); } } + else if(src->get_type()->is_tile_ty() && src->get_type()->get_tile_num_elements() == 1){ + return bld_->create_downcast(src); + } return src; } diff --git a/tests/common/reduce.h b/tests/common/reduce.h index 86e066638..ba4e6e470 100644 --- a/tests/common/reduce.h +++ b/tests/common/reduce.h @@ -19,6 +19,8 @@ void cc_reduce_nd(std::vector &y, const std::vector &x, reduce_op_t op, si // remove shape at index axis to get outer dimensions std::vector outer = shapes; outer.erase(outer.begin() + axis); + if(outer.empty()) + outer.push_back(1); // retrieve shape at index axis to get inner dimension int inner = shapes[axis]; // accumualtion function @@ -42,7 +44,7 @@ enum run_mode_t { TEST }; -void triton_reduce_nd(drv::stream* stream, const std::vector& shape, +void triton_reduce_nd(drv::stream* stream, const std::vector& shape_x, int axis, reduce_op_t op, const std::vector& x_order, const std::vector& y_order, std::vector> TS, @@ -53,86 +55,91 @@ void triton_reduce_nd(drv::stream* stream, const std::vector& shape, drv::context* context = stream->context(); size_t axy = (axis == 0) ? 
1 : 0; + // shape + std::vector shape_y = shape_x; + shape_y.erase(shape_y.begin() + axis); + // rank - size_t rank = shape.size(); + int rank_x = shape_x.size(); + int rank_y = shape_y.size(); + // size - size_t size = 1; - for(int32_t d: shape) - size *= d; - std::vector shapename = {"S0", "S1", "S2"}; + size_t size_x = 1; + for(int32_t d: shape_x) + size_x *= d; + size_t size_y = 1; + for(int32_t d: shape_y) + size_y *= d; + // strides for x + std::vector x_shapename = {"S0", "S1", "S2"}; std::vector x_strides = {"1"}; - for(size_t d = 0; d < rank - 1; d++) - x_strides.push_back(x_strides[d] + " * " + shapename[x_order[d]]); + for(int d = 0; d < rank_x - 1; d++) + x_strides.push_back(x_strides[d] + " * " + x_shapename[x_order[d]]); + // strides for y + std::vector y_shapename = x_shapename; + y_shapename.erase(y_shapename.begin() + axis); std::vector y_strides = {"1"}; - for(size_t d = 0; d < rank - 1; d++) - y_strides.push_back(y_strides[d] + " * " + shapename[y_order[d]]); + for(int d = 0; d < rank_y - 1; d++) + y_strides.push_back(y_strides[d] + " * " + y_shapename[y_order[d]]); - // create inputs - auto dx = std::unique_ptr(drv::buffer::create(context, size*dtsize)); - auto dy = std::unique_ptr(drv::buffer::create(context, size*dtsize)); - // create options + // options rt::function::options_space_t opt; - - // type opt.defines.push_back({"TYPE", {ty}}); - // x strides - for(size_t d = 0; d < rank; d++) + for(int d = 0; d < rank_x; d++) opt.defines.push_back({"STRIDE_XS" + std::to_string(x_order[d]), {x_strides[d]}}); - // y strides - for(size_t d = 0; d < rank; d++) + for(int d = 0; d < rank_y; d++) opt.defines.push_back({"STRIDE_YS" + std::to_string(y_order[d]), {y_strides[d]}}); if(TS.empty()) - TS = tile_nd(rank); - // tile size - for(size_t d = 0; d < rank; d++) + TS = tile_nd(rank_x); + for(int d = 0; d < rank_x; d++) opt.defines.push_back({"TS" + std::to_string(d), TS[d]}); - // non-reduced axis - std::string RY = (axis == 0) ? "rn" : "rm"; - opt.defines.push_back({"TY", {std::to_string(shape[axy])}}); + std::string RY = (axis == 0) ? "rs1" : "rs0"; + opt.defines.push_back({"TY", {std::to_string(shape_x[axy])}}); opt.defines.push_back({"RY", {RY}}); - // reduction broadcasting std::string RED = ""; - for(int n = 0; n < 2; n++){ + for(int n = 0; n < rank_x; n++){ if(n > 0) RED += ", "; RED += (n==axis) ? 
to_str(op) : ":"; } opt.defines.push_back({"RED", {RED}}); - - opt.num_warps = {4}; + opt.num_warps = {1}; // kernel - rt::function function(src::reduce2d, opt); + rt::function function(src::reduce_nd[rank_x - 1], opt); + + // input buffers + auto dx = std::unique_ptr(drv::buffer::create(context, size_x*dtsize)); + auto dy = std::unique_ptr(drv::buffer::create(context, size_y*dtsize)); // grid std::vector args = {&*dx, &*dy}; - for(int32_t d: shape) + for(int32_t d: shape_x) args.push_back(d); - args.push_back(shape[0]); std::vector ts = {"TS0", "TS1", "TS2"}; - auto grid = grid_nd(shape, ts); + auto grid = grid_nd(shape_x, ts); // metrics if(mode == BENCH){ - auto gbps = [&](double ns) { return 2 * size * dtsize / (ns * 1e-9) * 1e-9; }; + auto gbps = [&](double ns) { return 2 * size_x * dtsize / (ns * 1e-9) * 1e-9; }; double triton_ns = triton::tools::bench([&]() { function(args, grid, stream);}, stream); bench.push_back(gbps(triton_ns)); } // test triton if(mode == TEST){ - std::vector hy(shape[axy]); - std::vector ry(shape[axy]); - std::vector hx(shape[0]*shape[1]); + std::vector hy(size_y); + std::vector ry(size_y); + std::vector hx(size_x); init_zeros(hy); init_rand(hx); stream->write(&*dx, true, 0, hx); function(args, grid, stream); stream->synchronize(); stream->read(&*dy, true, 0, hy); - cc_reduce_nd(ry, hx, op, axis, shape); + cc_reduce_nd(ry, hx, op, axis, shape_x); test = testing::diff(hy, ry); } } diff --git a/tests/common/src/reduce.h b/tests/common/src/reduce.h index cc44ca5fc..158e68567 100644 --- a/tests/common/src/reduce.h +++ b/tests/common/src/reduce.h @@ -4,7 +4,11 @@ namespace src { R"( void reduce1d(TYPE * X __noalias __readonly __aligned(16), TYPE * Y __noalias __readonly __aligned(16), - int N) { + int S0) { + int pid0 = get_program_id(0); + int rs0[TS0] = pid0 * TS0 + 0 ... TS0; + TYPE* px[TS0] = X + rs0; + *Y = (*px)[RED]; } )"; @@ -13,15 +17,20 @@ void reduce1d(TYPE * X __noalias __readonly __aligned(16), R"( void reduce2d(TYPE * X __noalias __readonly __aligned(16), TYPE * Y __noalias __writeonly __aligned(16), - int M, int N, int ldx) { - int ridm = get_program_id(0); - int ridn = get_program_id(1); - int rm[TS0] = ridm * TS0 + 0 ... TS0; - int rn[TS1] = ridn * TS1 + 0 ... TS1; - TYPE* px[TS0, TS1] = X + rm[:, newaxis] + rn[newaxis, :] * ldx; - TYPE* py[TY] = Y + RY; + int S0, int S1) { + int pid0 = get_program_id(0); + int pid1 = get_program_id(1); + int rs0[TS0] = pid0 * TS0 + 0 ... TS0; + int rs1[TS1] = pid1 * TS1 + 0 ... 
TS1; + TYPE* px[TS0, TS1] = X + rs0[:, newaxis] * STRIDE_XS0 + + rs1[newaxis, :] * STRIDE_XS1; + TYPE* py[TY] = Y + RY * STRIDE_YS0; *py = (*px)[RED]; } )"; + + const char* reduce_nd[] = {reduce1d, reduce2d}; + + } diff --git a/tests/unit/reduce.cc b/tests/unit/reduce.cc index 0ee43cbc5..5a311686c 100644 --- a/tests/unit/reduce.cc +++ b/tests/unit/reduce.cc @@ -20,6 +20,7 @@ int main() { // shapes to benchmark typedef std::tuple, int, reduce_op_t> config_t; std::vector configs = { + config_t{{32}, 0, ADD}, config_t{{32, 32}, 0, MAX}, config_t{{32, 32}, 1, ADD}, config_t{{32, 64}, 0, ADD}, From 96cba9036af1ab045dc41218732d1a111c4c4c14 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 20 Oct 2019 17:48:19 -0400 Subject: [PATCH 457/494] [tests] [unit] added 1D and 3D reduction test --- lib/driver/module.cc | 1 - lib/runtime/function.cc | 6 ++- tests/bench/CMakeLists.txt | 2 +- tests/bench/{copy2d.cc => copy.cc} | 0 tests/common/reduce.h | 18 ++++++--- tests/common/src/copy.h | 2 +- tests/common/src/reduce.h | 26 ++++++++++++- tests/common/util.h | 9 +++-- tests/unit/CMakeLists.txt | 2 +- tests/unit/copy.cc | 60 ++++++++++++++++++++++++++++++ tests/unit/copy1d.cc | 30 --------------- tests/unit/copy2d.cc | 46 ----------------------- tests/unit/copy3d.cc | 38 ------------------- tests/unit/reduce.cc | 1 + 14 files changed, 112 insertions(+), 129 deletions(-) rename tests/bench/{copy2d.cc => copy.cc} (100%) create mode 100644 tests/unit/copy.cc delete mode 100644 tests/unit/copy1d.cc delete mode 100644 tests/unit/copy2d.cc delete mode 100644 tests/unit/copy3d.cc diff --git a/lib/driver/module.cc b/lib/driver/module.cc index 4fdc8ee90..722b8f6de 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -241,7 +241,6 @@ std::string cu_module::compile_llvm_module(std::unique_ptr module, cu_module::cu_module(driver::context * context, std::unique_ptr ll_module): cu_module(context, compile_llvm_module(std::move(ll_module), context->device())) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ -// std::cout << source << std::endl; // exit(EXIT_FAILURE); cu_context::context_switcher ctx(*context); // JIT compile source-code diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 2e34ed13c..ccdadab51 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -175,7 +175,11 @@ function::caller function::autotune(driver::stream* stream, const grid_fn_ty& gr auto ir = make_ir(parser); // binary code-gen std::unique_ptr bin; - bin = make_bin(*ir, stream->context(), opt); + try{ + bin = make_bin(*ir, stream->context(), opt); + }catch(...){ + return; + } // kernel uses too much resources if(!bin) return; diff --git a/tests/bench/CMakeLists.txt b/tests/bench/CMakeLists.txt index 598dadeea..d9978ca3f 100644 --- a/tests/bench/CMakeLists.txt +++ b/tests/bench/CMakeLists.txt @@ -1,4 +1,4 @@ -foreach(PROG dot copy1d copy2d) +foreach(PROG dot copy) set(TARGET bench_${PROG}) add_executable(${TARGET} ${PROG}.cc) set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME ${TARGET}) diff --git a/tests/bench/copy2d.cc b/tests/bench/copy.cc similarity index 100% rename from tests/bench/copy2d.cc rename to tests/bench/copy.cc diff --git a/tests/common/reduce.h b/tests/common/reduce.h index ba4e6e470..0f5d63612 100644 --- a/tests/common/reduce.h +++ b/tests/common/reduce.h @@ -53,7 +53,8 @@ void triton_reduce_nd(drv::stream* stream, const std::vector& shape_x, std::string ty = "float"; size_t dtsize = 
sizeof(NumericT); drv::context* context = stream->context(); - size_t axy = (axis == 0) ? 1 : 0; + + // shape std::vector shape_y = shape_x; @@ -95,9 +96,16 @@ void triton_reduce_nd(drv::stream* stream, const std::vector& shape_x, TS = tile_nd(rank_x); for(int d = 0; d < rank_x; d++) opt.defines.push_back({"TS" + std::to_string(d), TS[d]}); - std::string RY = (axis == 0) ? "rs1" : "rs0"; - opt.defines.push_back({"TY", {std::to_string(shape_x[axy])}}); - opt.defines.push_back({"RY", {RY}}); + + std::vector axy; + for(int d = 0; d < rank_x; d++) + if(d != axis) + axy.push_back(d); + for(int d = 0; d < rank_y; d++) + opt.defines.push_back({"TY" + std::to_string(d), {std::to_string(shape_x[axy[d]])}}); + for(int d = 0; d < rank_y; d++) + opt.defines.push_back({"RY" + std::to_string(d), {"rs" + std::to_string(axy[d])}}); + std::string RED = ""; for(int n = 0; n < rank_x; n++){ if(n > 0) @@ -150,6 +158,6 @@ bool do_test(drv::stream* stream, std::vector shape, int axis, reduce_op_t std::vector> TSS; for(int32_t d: shape) TSS.push_back({std::to_string(d)}); - triton_reduce_nd(stream, shape, axis, op, {0, 1}, {0, 1}, TSS, TEST, bench, test); + triton_reduce_nd(stream, shape, axis, op, {0, 1, 2}, {0, 1, 2}, TSS, TEST, bench, test); return test; } diff --git a/tests/common/src/copy.h b/tests/common/src/copy.h index 97264eeb5..f45f7a5cd 100644 --- a/tests/common/src/copy.h +++ b/tests/common/src/copy.h @@ -28,7 +28,7 @@ void copy2d(TYPE * X __noalias __readonly __aligned(16), int rs1[TS1] = pid1 * TS1 + 0 ... TS1; TYPE* px[TS0, TS1] = X + rs0[:, newaxis] * STRIDE_XS0 + rs1[newaxis, :] * STRIDE_XS1; TYPE* py[TS0, TS1] = Y + rs0[:, newaxis] * STRIDE_YS0 + rs1[newaxis, :] * STRIDE_YS1; - *py = ^(*px); + *py = *px; } )"; diff --git a/tests/common/src/reduce.h b/tests/common/src/reduce.h index 158e68567..6dc1640e0 100644 --- a/tests/common/src/reduce.h +++ b/tests/common/src/reduce.h @@ -24,13 +24,35 @@ void reduce2d(TYPE * X __noalias __readonly __aligned(16), int rs1[TS1] = pid1 * TS1 + 0 ... TS1; TYPE* px[TS0, TS1] = X + rs0[:, newaxis] * STRIDE_XS0 + rs1[newaxis, :] * STRIDE_XS1; - TYPE* py[TY] = Y + RY * STRIDE_YS0; + TYPE* py[TY0] = Y + RY0 * STRIDE_YS0; *py = (*px)[RED]; } )"; + const char *reduce3d = +R"( +void reduce2d(TYPE * X __noalias __readonly __aligned(16), + TYPE * Y __noalias __writeonly __aligned(16), + int S0, int S1, int S2) { + int pid0 = get_program_id(0); + int pid1 = get_program_id(1); + int pid2 = get_program_id(2); + int rs0[TS0] = pid0 * TS0 + 0 ... TS0; + int rs1[TS1] = pid1 * TS1 + 0 ... TS1; + int rs2[TS2] = pid2 * TS2 + 0 ... 
TS2; + // input pointers + TYPE* px[TS0, TS1, TS2] = X + rs0[:, newaxis, newaxis] * STRIDE_XS0 + + rs1[newaxis, :, newaxis] * STRIDE_XS1 + + rs2[newaxis, newaxis, :] * STRIDE_XS2; + // output pointers + TYPE* py[TY0, TY1] = Y + RY0[:, newaxis] * STRIDE_YS0 + + RY1[newaxis, :] * STRIDE_YS1; + // write-back + *py = (*px)[RED]; +} +)"; - const char* reduce_nd[] = {reduce1d, reduce2d}; + const char* reduce_nd[] = {reduce1d, reduce2d, reduce3d}; } diff --git a/tests/common/util.h b/tests/common/util.h index ede8a5dbb..0a7788195 100644 --- a/tests/common/util.h +++ b/tests/common/util.h @@ -62,7 +62,7 @@ inline std::vector> tile_nd(size_t rank) { template void init_rand(std::vector& x) { for(size_t i = 0; i < x.size(); i++) - x[i] = static_cast((double)rand()/RAND_MAX); + x[i] = i; } template @@ -107,8 +107,11 @@ enum order_t { int offset(const std::vector& idx, const std::vector& shapes) { int result = idx[0]; - for(int i = 1; i < idx.size(); i++) - result += idx[i]*shapes[i-1]; + int ld = 1; + for(int i = 1; i < idx.size(); i++){ + ld *= shapes[i - 1]; + result += idx[i]*ld; + } return result; } diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 89b4b4455..6f397badb 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -1,4 +1,4 @@ -foreach(PROG dot copy1d copy2d copy3d reduce) +foreach(PROG dot copy reduce) set(TARGET unit_${PROG}) add_executable(${TARGET} ${PROG}.cc) set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME ${TARGET}) diff --git a/tests/unit/copy.cc b/tests/unit/copy.cc new file mode 100644 index 000000000..ec586066c --- /dev/null +++ b/tests/unit/copy.cc @@ -0,0 +1,60 @@ +#include +#include +#include "copy.h" +#include "triton/driver/backend.h" + + + +int main() { + // initialize default compute device + auto context = triton::driver::backend::contexts::get_default(); + triton::driver::stream* stream = triton::driver::stream::create(context); + // shapes to benchmark + typedef std::tuple, std::vector, std::vector, std::vector> config_t; + std::vector configs; + // 1D +// configs.push_back({{65536}, {32}, {0}, {0}}); + configs.push_back({{65536}, {128}, {0}, {0}}); + configs.push_back({{65536}, {512}, {0}, {0}}); + configs.push_back({{65536}, {1024}, {0}, {0}}); + // 2D + configs.push_back({{256, 256}, {16, 16}, {0, 1}, {0, 1}}); + configs.push_back({{256, 256}, {16, 64}, {0, 1}, {0, 1}}); + configs.push_back({{256, 256}, {64, 16}, {0, 1}, {0, 1}}); + configs.push_back({{256, 256}, {64, 64}, {0, 1}, {0, 1}}); + configs.push_back({{256, 256}, {16, 16}, {0, 1}, {1, 0}}); + configs.push_back({{256, 256}, {16, 64}, {0, 1}, {1, 0}}); + configs.push_back({{256, 256}, {64, 16}, {0, 1}, {1, 0}}); + configs.push_back({{256, 256}, {64, 64}, {0, 1}, {1, 0}}); + configs.push_back({{256, 256}, {16, 16}, {1, 0}, {0, 1}}); + configs.push_back({{256, 256}, {16, 64}, {1, 0}, {0, 1}}); + configs.push_back({{256, 256}, {64, 16}, {1, 0}, {0, 1}}); + configs.push_back({{256, 256}, {64, 64}, {1, 0}, {0, 1}}); + configs.push_back({{256, 256}, {64, 64}, {1, 0}, {1, 0}}); + configs.push_back({{256, 256}, {16, 64}, {1, 0}, {1, 0}}); + configs.push_back({{256, 256}, {64, 16}, {1, 0}, {1, 0}}); + configs.push_back({{256, 256}, {64, 64}, {1, 0}, {1, 0}}); + // 3D + std::vector> xx_idx = {{0, 1, 2}, {2, 1, 0}, {1, 0, 2}}; + std::vector> yy_idx = {{0, 1, 2}, {2, 1, 0}, {1, 0, 2}}; + for(const auto& x_idx: xx_idx) + for(const auto& y_idx: yy_idx){ + configs.push_back({{64, 64, 32}, {16, 4, 8}, x_idx, y_idx}); + configs.push_back({{64, 64, 32}, {8, 16, 2}, x_idx, 
y_idx}); + configs.push_back({{64, 64, 32}, {32, 2, 2}, x_idx, y_idx}); + configs.push_back({{64, 64, 32}, {16, 64, 4}, x_idx, y_idx}); + } + // testing + std::vector shape, tile; + std::vector ord_x, ord_y; + bool result = true; + for(const auto& c: configs){ + std::tie(shape, tile, ord_x, ord_y) = c; + bool pass = test_copy_nd(stream, shape, tile, ord_x, ord_y); + result = result && pass; + std::cout << "// " << c << ", " << pass << std::endl; + } + return result; +} + + diff --git a/tests/unit/copy1d.cc b/tests/unit/copy1d.cc deleted file mode 100644 index ad867bae4..000000000 --- a/tests/unit/copy1d.cc +++ /dev/null @@ -1,30 +0,0 @@ -#include -#include -#include "copy.h" -#include "triton/driver/backend.h" - - -int main() { - // initialize default compute device - auto context = triton::driver::backend::contexts::get_default(); - triton::driver::stream* stream = triton::driver::stream::create(context); - // shapes to benchmark - typedef std::tuple, std::vector, std::vector, std::vector> config_t; - std::vector configs = { -// {{65536}, {32}, {0}, {0}}, - {{65536}, {128}, {0}, {0}}, - {{65536}, {512}, {0}, {0}}, - {{65536}, {1024}, {0}, {0}}, - }; - // does the work - std::vector shape, tile; - std::vector ord_x, ord_y; - bool result = true; - for(const auto& c: configs){ - std::tie(shape, tile, ord_x, ord_y) = c; - bool pass = test_copy_nd(stream, shape, tile, ord_x, ord_y); - result = result && pass; - std::cout << "// " << c << ", " << pass << std::endl; - } - return result; -} diff --git a/tests/unit/copy2d.cc b/tests/unit/copy2d.cc deleted file mode 100644 index f4c63e6be..000000000 --- a/tests/unit/copy2d.cc +++ /dev/null @@ -1,46 +0,0 @@ -#include -#include -#include "copy.h" -#include "triton/driver/backend.h" - - -int main() { - // initialize default compute device - auto context = triton::driver::backend::contexts::get_default(); - triton::driver::stream* stream = triton::driver::stream::create(context); - // shapes to benchmark - typedef std::tuple, std::vector, std::vector, std::vector> config_t; - std::vector configs = { - {{256, 256}, {16, 16}, {0, 1}, {0, 1}}, - {{256, 256}, {16, 64}, {0, 1}, {0, 1}}, - {{256, 256}, {64, 16}, {0, 1}, {0, 1}}, - {{256, 256}, {64, 64}, {0, 1}, {0, 1}}, - - {{256, 256}, {16, 16}, {0, 1}, {1, 0}}, - {{256, 256}, {16, 64}, {0, 1}, {1, 0}}, - {{256, 256}, {64, 16}, {0, 1}, {1, 0}}, - {{256, 256}, {64, 64}, {0, 1}, {1, 0}}, - - {{256, 256}, {16, 16}, {1, 0}, {0, 1}}, - {{256, 256}, {16, 64}, {1, 0}, {0, 1}}, - {{256, 256}, {64, 16}, {1, 0}, {0, 1}}, - {{256, 256}, {64, 64}, {1, 0}, {0, 1}}, - - {{256, 256}, {64, 64}, {1, 0}, {1, 0}}, - {{256, 256}, {16, 64}, {1, 0}, {1, 0}}, - {{256, 256}, {64, 16}, {1, 0}, {1, 0}}, - {{256, 256}, {64, 64}, {1, 0}, {1, 0}}, - }; - // does the work - std::vector shape, tile; - std::vector ord_x, ord_y; - bool result = true; - for(const auto& c: configs){ - std::tie(shape, tile, ord_x, ord_y) = c; - bool pass = test_copy_nd(stream, shape, tile, ord_x, ord_y); - result = result && pass; - std::cout << "// " << c << ", " << pass << std::endl; - } - return result; -} - diff --git a/tests/unit/copy3d.cc b/tests/unit/copy3d.cc deleted file mode 100644 index 758944d98..000000000 --- a/tests/unit/copy3d.cc +++ /dev/null @@ -1,38 +0,0 @@ -#include -#include -#include "copy.h" -#include "triton/driver/backend.h" - - -int main() { - // initialize default compute device - auto context = triton::driver::backend::contexts::get_default(); - triton::driver::stream* stream = triton::driver::stream::create(context); - // shapes to 
benchmark - typedef std::tuple, std::vector, std::vector, std::vector> config_t; - std::vector configs; - std::vector x_idx = {0, 1, 2}; - do { - std::vector y_idx = {0, 1, 2}; - do { - configs.push_back(config_t{{64, 64, 32}, {16, 4, 8}, x_idx, y_idx}); - configs.push_back(config_t{{64, 64, 32}, {8, 16, 2}, x_idx, y_idx}); - configs.push_back(config_t{{64, 64, 32}, {32, 2, 2}, x_idx, y_idx}); - configs.push_back(config_t{{64, 64, 32}, {16, 64, 4}, x_idx, y_idx}); - - } while(std::next_permutation(y_idx.begin(), y_idx.end())); - } while(std::next_permutation(x_idx.begin(), x_idx.end())); - // testing - std::vector shape, tile; - std::vector ord_x, ord_y; - bool result = true; - for(const auto& c: configs){ - std::tie(shape, tile, ord_x, ord_y) = c; - bool pass = test_copy_nd(stream, shape, tile, ord_x, ord_y); - result = result && pass; - std::cout << "// " << c << ", " << pass << std::endl; - } - return result; -} - - diff --git a/tests/unit/reduce.cc b/tests/unit/reduce.cc index 5a311686c..96f2d89f9 100644 --- a/tests/unit/reduce.cc +++ b/tests/unit/reduce.cc @@ -20,6 +20,7 @@ int main() { // shapes to benchmark typedef std::tuple, int, reduce_op_t> config_t; std::vector configs = { + config_t{{8, 8, 4}, 2, ADD}, config_t{{32}, 0, ADD}, config_t{{32, 32}, 0, MAX}, config_t{{32, 32}, 1, ADD}, From de6fdd56255664fa9f9d164ce470043dc8df4251 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 20 Oct 2019 19:29:48 -0400 Subject: [PATCH 458/494] [general] removed useless files and includes --- include/triton/codegen/analysis/axes.h | 6 ++--- .../triton/codegen/transform/reassociate.h | 4 --- lib/codegen/analysis/align.cc | 3 --- lib/codegen/analysis/allocation.cc | 6 ----- lib/codegen/analysis/axes.cc | 7 +---- lib/codegen/analysis/layout.cc | 1 - lib/codegen/analysis/liveness.cc | 7 ----- lib/codegen/instructions.cc | 0 lib/codegen/pass.cc | 0 lib/codegen/transform/coalesce.cc | 9 ++----- lib/codegen/transform/cts.cc | 3 --- lib/codegen/transform/dce.cc | 3 +-- lib/codegen/transform/membar.cc | 3 --- lib/codegen/transform/peephole.cc | 2 +- lib/codegen/transform/reassociate.cc | 27 ------------------- lib/runtime/function.cc | 2 +- tests/common/src/reduce.h | 4 +-- 17 files changed, 10 insertions(+), 77 deletions(-) delete mode 100644 lib/codegen/instructions.cc delete mode 100644 lib/codegen/pass.cc diff --git a/include/triton/codegen/analysis/axes.h b/include/triton/codegen/analysis/axes.h index dc39b07cb..dc50223fb 100644 --- a/include/triton/codegen/analysis/axes.h +++ b/include/triton/codegen/analysis/axes.h @@ -1,11 +1,9 @@ #ifndef _TRITON_CODEGEN_ANALYSIS_AXES_H_ #define _TRITON_CODEGEN_ANALYSIS_AXES_H_ -#include -#include -#include -#include #include "triton/tools/graph.h" +#include +#include namespace triton{ diff --git a/include/triton/codegen/transform/reassociate.h b/include/triton/codegen/transform/reassociate.h index d7e33c9a2..708de3c73 100644 --- a/include/triton/codegen/transform/reassociate.h +++ b/include/triton/codegen/transform/reassociate.h @@ -37,11 +37,7 @@ private: ir::value *reassociate_ptr(ir::getelementptr_inst* pz, ir::builder &builder, std::map &offsets); public: - reassociate(analysis::align* align); void run(ir::module& module); - -private: - analysis::align* align_; }; } diff --git a/lib/codegen/analysis/align.cc b/lib/codegen/analysis/align.cc index 28ff4024d..ec692d6f6 100644 --- a/lib/codegen/analysis/align.cc +++ b/lib/codegen/analysis/align.cc @@ -5,9 +5,6 @@ #include "triton/ir/basic_block.h" #include "triton/ir/instructions.h" #include 
"triton/ir/type.h" -#include -#include -#include namespace triton { namespace codegen{ diff --git a/lib/codegen/analysis/allocation.cc b/lib/codegen/analysis/allocation.cc index 3ea0a758d..5cce0d049 100644 --- a/lib/codegen/analysis/allocation.cc +++ b/lib/codegen/analysis/allocation.cc @@ -3,12 +3,6 @@ #include "triton/codegen/analysis/layout.h" #include "triton/codegen/analysis/allocation.h" #include "triton/codegen/analysis/liveness.h" -#include "triton/codegen/transform/cts.h" -#include "triton/ir/basic_block.h" -#include "triton/ir/type.h" -#include "triton/ir/value.h" -#include "triton/ir/function.h" -#include "triton/ir/instructions.h" #include "triton/ir/utils.h" namespace triton{ diff --git a/lib/codegen/analysis/axes.cc b/lib/codegen/analysis/axes.cc index 6271e224b..d60e0c320 100644 --- a/lib/codegen/analysis/axes.cc +++ b/lib/codegen/analysis/axes.cc @@ -1,12 +1,7 @@ #include "triton/codegen/analysis/axes.h" -#include "triton/ir/instructions.h" #include "triton/ir/utils.h" +#include "triton/ir/instructions.h" #include "triton/ir/type.h" -#include "triton/ir/module.h" -#include "triton/ir/function.h" -#include "triton/ir/context_impl.h" -#include "triton/ir/constant.h" -#include "triton/driver/device.h" diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc index eda55e451..eaa1a18a5 100644 --- a/lib/codegen/analysis/layout.cc +++ b/lib/codegen/analysis/layout.cc @@ -1,5 +1,4 @@ #include -#include #include #include "triton/codegen/analysis/axes.h" #include "triton/codegen/analysis/align.h" diff --git a/lib/codegen/analysis/liveness.cc b/lib/codegen/analysis/liveness.cc index 2d4162ff4..cd7221443 100644 --- a/lib/codegen/analysis/liveness.cc +++ b/lib/codegen/analysis/liveness.cc @@ -1,15 +1,8 @@ -#include #include -#include -#include "triton/codegen/instructions.h" #include "triton/codegen/analysis/liveness.h" #include "triton/codegen/analysis/layout.h" -#include "triton/codegen/transform/cts.h" -#include "triton/ir/basic_block.h" #include "triton/ir/function.h" #include "triton/ir/module.h" -#include "triton/ir/instructions.h" -#include "triton/ir/value.h" #include "triton/ir/utils.h" namespace triton{ diff --git a/lib/codegen/instructions.cc b/lib/codegen/instructions.cc deleted file mode 100644 index e69de29bb..000000000 diff --git a/lib/codegen/pass.cc b/lib/codegen/pass.cc deleted file mode 100644 index e69de29bb..000000000 diff --git a/lib/codegen/transform/coalesce.cc b/lib/codegen/transform/coalesce.cc index 078aeb112..764e2138a 100644 --- a/lib/codegen/transform/coalesce.cc +++ b/lib/codegen/transform/coalesce.cc @@ -1,15 +1,10 @@ -#include #include -#include -#include "triton/ir/function.h" #include "triton/ir/utils.h" -#include "triton/ir/basic_block.h" #include "triton/ir/instructions.h" #include "triton/ir/module.h" -#include "triton/codegen/analysis/layout.h" -#include "triton/codegen/transform/cts.h" -#include "triton/codegen/analysis/align.h" #include "triton/codegen/transform/coalesce.h" +#include "triton/codegen/analysis/align.h" +#include "triton/codegen/analysis/layout.h" namespace triton { namespace codegen{ diff --git a/lib/codegen/transform/cts.cc b/lib/codegen/transform/cts.cc index b939c160c..47a1e13a8 100644 --- a/lib/codegen/transform/cts.cc +++ b/lib/codegen/transform/cts.cc @@ -1,12 +1,9 @@ -#include -#include #include "triton/codegen/transform/cts.h" #include "triton/codegen/instructions.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" #include "triton/ir/instructions.h" 
-#include "triton/ir/type.h" namespace triton { namespace codegen{ diff --git a/lib/codegen/transform/dce.cc b/lib/codegen/transform/dce.cc index 4497f2fde..907b1621f 100644 --- a/lib/codegen/transform/dce.cc +++ b/lib/codegen/transform/dce.cc @@ -1,9 +1,8 @@ +#include "triton/codegen/transform/dce.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" #include "triton/ir/module.h" #include "triton/ir/utils.h" -#include "triton/codegen/transform/dce.h" -#include namespace triton { namespace codegen{ diff --git a/lib/codegen/transform/membar.cc b/lib/codegen/transform/membar.cc index 9dd793294..44316d504 100644 --- a/lib/codegen/transform/membar.cc +++ b/lib/codegen/transform/membar.cc @@ -1,13 +1,10 @@ #include #include #include - -#include "triton/codegen/analysis/liveness.h" #include "triton/codegen/analysis/layout.h" #include "triton/codegen/analysis/allocation.h" #include "triton/codegen/instructions.h" #include "triton/codegen/transform/membar.h" -#include "triton/codegen/transform/cts.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" diff --git a/lib/codegen/transform/peephole.cc b/lib/codegen/transform/peephole.cc index 73b8ff27f..98b70e4ab 100644 --- a/lib/codegen/transform/peephole.cc +++ b/lib/codegen/transform/peephole.cc @@ -2,7 +2,7 @@ #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/codegen/transform/peephole.h" -#include + namespace triton { namespace codegen{ namespace transform{ diff --git a/lib/codegen/transform/reassociate.cc b/lib/codegen/transform/reassociate.cc index 7f9427aa3..20241e70e 100644 --- a/lib/codegen/transform/reassociate.cc +++ b/lib/codegen/transform/reassociate.cc @@ -1,7 +1,5 @@ #include -#include #include "triton/codegen/transform/reassociate.h" -#include "triton/codegen/analysis/align.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" @@ -12,27 +10,6 @@ namespace triton { namespace codegen{ namespace transform{ -//inline Constant *get_gep_cst_offset(GetElementPtrInst *gep){ -// std::vector idx_vals; -// std::transform(gep->idx_begin(), gep->idx_end(), -// std::back_inserter(idx_vals), -// [](Value* x){ return x;}); -// if(idx_vals.size() > 1) -// return nullptr; -// Value *idx = idx_vals[0]; -// if(isa(idx)) -// return idx; -// if(Instruction *BinOp = is_bin_add(idx)){ -// Value *LHS = BinOp->getOperand(0); -// Value *RHS = BinOp->getOperand(1); -// if(Constant* Res = dyn_cast(LHS)) -// return Res; -// if(Constant* Res = dyn_cast(RHS)) -// return Res; -// } -// return nullptr; -//} - inline ir::instruction* reassociate::is_bin_add(ir::value *x) { ir::binary_operator *bin_op = dynamic_cast(x); @@ -141,10 +118,6 @@ ir::value *reassociate::reassociate_idx(ir::value *old_value, return new_value; } -reassociate::reassociate(analysis::align *align): align_(align) -{ } - - /* run */ void reassociate::run(ir::module &mod) { ir::builder &builder = mod.get_builder(); diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index ccdadab51..ca1dc8fb9 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -214,7 +214,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::transform::membar barriers(&liveness, &layouts, &allocation); codegen::transform::dce dce; codegen::transform::peephole peephole; - codegen::transform::reassociate reassociate(&align); + codegen::transform::reassociate reassociate; codegen::transform::coalesce coalesce(&align, &layouts); 
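// Note: the constructor change in this hunk follows the convention used
// throughout make_bin: transform passes hold non-owning pointers to the
// analysis passes they actually read, so once reassociate stopped consuming
// alignment information its constructor argument could be dropped outright.
// A minimal sketch of that convention (illustrative types only, not the real
// Triton classes):
//
//   struct align_analysis { /* alignment facts, filled in by an earlier run() */ };
//
//   class coalesce_like_pass {
//     align_analysis* align_; // non-owning; owned and run by the caller first
//   public:
//     coalesce_like_pass(align_analysis* a): align_(a) {}
//     void run(/* ir::module& */) { /* read align_ facts, rewrite the IR */ }
//   };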
codegen::transform::cts cts; codegen::generator isel(&axes, &layouts, &align, &allocation, target.get(), opt.num_warps); diff --git a/tests/common/src/reduce.h b/tests/common/src/reduce.h index 6dc1640e0..508ce896b 100644 --- a/tests/common/src/reduce.h +++ b/tests/common/src/reduce.h @@ -16,8 +16,8 @@ void reduce1d(TYPE * X __noalias __readonly __aligned(16), const char *reduce2d = R"( void reduce2d(TYPE * X __noalias __readonly __aligned(16), - TYPE * Y __noalias __writeonly __aligned(16), - int S0, int S1) { + TYPE * Y __noalias __writeonly __aligned(16), + int S0, int S1) { int pid0 = get_program_id(0); int pid1 = get_program_id(1); int rs0[TS0] = pid0 * TS0 + 0 ... TS0; From e827d4f4679e7d8428d41d39a0e559bcb2a607c7 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 20 Oct 2019 20:37:37 -0400 Subject: [PATCH 459/494] [python] [bindings] removed obsolete #include --- python/src/bindings.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/python/src/bindings.cc b/python/src/bindings.cc index a09a0a7cb..969f74df4 100644 --- a/python/src/bindings.cc +++ b/python/src/bindings.cc @@ -4,7 +4,6 @@ #include #include #include -#include "triton/codegen/selection.h" #include "triton/runtime/function.h" #include "triton/lang/code_gen.h" #include "triton/lang/parser.h" From b81734553bb7b250cfd069eb7bbc903252d9d7db Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 21 Oct 2019 15:41:50 -0400 Subject: [PATCH 460/494] [lang] added support for batched matrix multiplication --- lib/codegen/analysis/allocation.cc | 1 - lib/codegen/analysis/axes.cc | 5 ----- lib/codegen/analysis/layout.cc | 4 ++++ lib/codegen/analysis/liveness.cc | 10 ++++++---- lib/ir/module.cc | 1 + lib/lang/ast.cc | 12 ++++++++++-- lib/runtime/function.cc | 2 +- 7 files changed, 22 insertions(+), 13 deletions(-) diff --git a/lib/codegen/analysis/allocation.cc b/lib/codegen/analysis/allocation.cc index 5cce0d049..b92b5bd44 100644 --- a/lib/codegen/analysis/allocation.cc +++ b/lib/codegen/analysis/allocation.cc @@ -96,7 +96,6 @@ void allocation::run(ir::module &mod) { offsets_[x] = starts[x] + colors[x] * Adj; } - // Save maximum size of induced memory space allocated_size_ = 0; for(layout_t* x: V) diff --git a/lib/codegen/analysis/axes.cc b/lib/codegen/analysis/axes.cc index d60e0c320..0e67877b9 100644 --- a/lib/codegen/analysis/axes.cc +++ b/lib/codegen/analysis/axes.cc @@ -77,11 +77,6 @@ void axes::update_graph_dot(ir::instruction *i) { // add edges between result and accumulator for(unsigned d = 0; d < shapes.size(); d++) graph_.add_edge({dot, d}, {D, d}); - // add edge for batch dimension - for(unsigned d = 2; d < shapes.size(); d++){ - graph_.add_edge({dot, d}, {A, d}); - graph_.add_edge({dot, d}, {B, d}); - } } void axes::update_graph_elementwise(ir::instruction *i) { diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc index eaa1a18a5..6f717d77c 100644 --- a/lib/codegen/analysis/layout.cc +++ b/lib/codegen/analysis/layout.cc @@ -300,6 +300,10 @@ layout_shared_t::layout_shared_t(const layout_t *arg, } std::vector col = {0, 1}; std::vector row = {1, 0}; + for(size_t s = 2; s < shapes.size(); s++){ + col.push_back(s); + row.push_back(s); + } bool is_nonhmma_dot_a = dot_a && !hmma_dot_a; bool is_nonhmma_dot_b = dot_b && !hmma_dot_b; if(is_nonhmma_dot_a) diff --git a/lib/codegen/analysis/liveness.cc b/lib/codegen/analysis/liveness.cc index cd7221443..382f8ef6c 100644 --- a/lib/codegen/analysis/liveness.cc +++ b/lib/codegen/analysis/liveness.cc @@ -1,4 +1,5 @@ #include +#include #include 
"triton/codegen/analysis/liveness.h" #include "triton/codegen/analysis/layout.h" #include "triton/ir/function.h" @@ -37,12 +38,13 @@ void liveness::run(ir::module &mod) { } // compute intervals unsigned start = INT32_MAX; + for(ir::value *v: layout->values) + if(indices.find(v) != indices.end()) + start = std::min(start, indices.at(v)); unsigned end = 0; for(ir::user *u: users) - if(indices.find(u) != indices.end()){ - start = std::min(start, indices.at(u)); - end = std::max(end, indices.at(u)); - } + if(indices.find(u) != indices.end()) + end = std::max(end, indices.at(u)); intervals_[layout] = segment{start, end}; } diff --git a/lib/ir/module.cc b/lib/ir/module.cc index 98f171252..4a4655fb6 100644 --- a/lib/ir/module.cc +++ b/lib/ir/module.cc @@ -1,4 +1,5 @@ #include +#include #include "triton/ir/basic_block.h" #include "triton/ir/module.h" #include "triton/ir/type.h" diff --git a/lib/lang/ast.cc b/lib/lang/ast.cc index bf0c7e964..b6cd99633 100644 --- a/lib/lang/ast.cc +++ b/lib/lang/ast.cc @@ -471,12 +471,20 @@ void BinaryOp::MatmulOpTypeChecking() { auto rhsShape = rhsType->Shape(); size_t lhsRank = lhsShape.size(); size_t rhsRank = rhsShape.size(); - if(lhsRank != 2 || rhsRank != 2) - Error(this, "matrix multiplication operands must have rank 2"); + if(lhsRank != rhsRank) + Error(this, "matrix multiplication operands have incompatible rank" + "%d and %d", lhsRank, rhsRank); + for(int d = 2; d < lhsRank; d++) + if(lhsShape[d] != rhsShape[d]) + Error(this, "matrix multiplication operands have incompatible batch dimension" + "%d and %d for axis %d", lhsShape[d], rhsShape[d], d); if(lhsShape[1] != rhsShape[0]) Error(this, "matrix multiplication operands have incompatible inner dimension" " %d and %d", lhsShape[1], rhsShape[0]); + // ret shape TileType::ShapeInt retShape = {lhsShape[0], rhsShape[1]}; + for(int d = 2; d < lhsRank; d++) + retShape.push_back(lhsShape[d]); QualType retType = lhsType->Derived(); if(retType != rhsType->Derived()) Error(this, "matrix multiplication operands have incompatible data types"); diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index ca1dc8fb9..ffb8610b3 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -219,7 +219,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::transform::cts cts; codegen::generator isel(&axes, &layouts, &align, &allocation, target.get(), opt.num_warps); // run passes -// ir::print(module, std::cout); +// ir::print(module, std::cout); peephole.run(module); dce.run(module); align.run(module); From 4b0c43bb7b1ee5a36797c8bf3f41d531d0b7a1bf Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 21 Oct 2019 17:13:12 -0400 Subject: [PATCH 461/494] [python][example] added test for einsum --- python/examples/einsum.py | 113 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 python/examples/einsum.py diff --git a/python/examples/einsum.py b/python/examples/einsum.py new file mode 100644 index 000000000..6d535b911 --- /dev/null +++ b/python/examples/einsum.py @@ -0,0 +1,113 @@ +import numpy as np +import torch +import triton + +class _dot(triton.function): + + src = """ +__global__ void dot(TYPE * A, TYPE * B, TYPE * C, + int sb, int sh, int sa, int sk, int sn) { + // program id + int pidx = get_program_id(0); + int pidy = get_program_id(1); + int pidz = get_program_id(2); + // ranges + int rxa[TM] = pidx * TM + 0 ... TM; + int ryb[TN] = pidy * TN + 0 ... TN; + int rza[TZ] = pidz * TZ + 0 ... TZ; + int rzb[TZ] = pidz * TZ + 0 ... 
TZ; + int rka[TK] = 0 ... TK; + int rkb[TK] = 0 ... TK; + // accumulator + float c[TM, TN, TZ] = 0; + // pointers to A + TYPE* pa[TM, TK, TZ] = A + rka[newaxis, :, newaxis] * 1 // reduction + + rxa[:, newaxis, newaxis] * sk * sa * sh // outer + + rza[newaxis, newaxis, :] * sk; // batch + // pointers to B + TYPE* pb[TK, TN, TZ] = B + rkb[:, newaxis, newaxis] * 1 // reduction + + ryb[newaxis, :, newaxis] * sk // outer + + rzb[newaxis, newaxis, :] * sk * sn; // batch + // reduction loop + for(int k = sk; k > 0; k -= TK){ + TYPE a[TM, TK, TZ] = *pa; + TYPE b[TK, TN, TZ] = *pb; + c += a @ b; + pa += TK; + pb += TK; + } + // epilogue + int rxc[TM] = pidx * TM + 0 ... TM; + int ryc[TN] = pidy * TN + 0 ... TN; + int rzc[TZ] = pidz * TZ + 0 ... TZ; + TYPE* pc[TM, TN, TZ] = C + rxc[:, newaxis, newaxis] * sn * sa * sh // outer[0] + + ryc[newaxis, :, newaxis] * 1 // outer[1] + + rzc[newaxis, newaxis, :] * sn; + *pc = c; +} +""" + + kernel = triton.kernel(src, ['C']) + + @staticmethod + def _call(a, b, transpose_a, transpose_b): + # extract shapes + shape_a = triton.shape(a) + shape_b = triton.shape(b) + B, H, A, K = shape_a[0], shape_a[1], shape_a[2], shape_a[3] + H, A, N, K = shape_b[0], shape_b[1], shape_b[2], shape_b[3] + # allocate output + dtype = a.dtype + c = triton.empty([B, H, A, N], dtype = dtype) + # SPMD grid + grid = lambda opt: [triton.cdiv(B, opt.d('TM')), + triton.cdiv(N, opt.d('TN')), + triton.cdiv(H*A, opt.d('TZ'))] + # launch kernel + return _dot.kernel(a, b, c, B, H, A, K, N, grid, + AT = transpose_a, BT = transpose_b, TYPE = dtype, + TM = [32], TN = [32], TK = [8], TZ = [8]) + + @staticmethod + def forward(ctx, a, b, transpose_a = False, transpose_b = False): + ctx.save_for_backward(a, b) + ctx.t_a = transpose_a + ctx.t_b = transpose_b + return _dot._call(a, b, transpose_a, transpose_b) + + +dot = _dot.apply + + +batch_dim = 16 +ctx_dim = 32 +head_dim = 8 +state_dim = 32 +key_dim = 32 +n_keys = 32 +bs = batch_dim * ctx_dim + +# shapes +x_shape = (bs, state_dim) +qw_shape = (state_dim, head_dim * key_dim) +kw_shape = (head_dim, 2, n_keys, key_dim // 2) + +x = np.random.uniform(-1.0, 1.0, x_shape).astype(np.float32) # layer input +qw = np.random.uniform(-1.0, 1.0, qw_shape).astype(np.float32) # query weights +kw = np.random.uniform(-1.0, 1.0, kw_shape).astype(np.float32) # key weights +# (bs, head_dim * key_dim) = (bs, state_dim) * (state_dim, head_dim * key_dim) +# (bs, head_dim, 2, key_dim//2) <== (bs, head_dim * key_dim) +q = np.dot(x, qw).reshape(bs, head_dim, 2, key_dim//2) # normal matmul + +# (bs, head_dim, 2, n_keys) = (bs, head_dim, 2, key_dim//2) * (head_dim, 2, n_keys, key_dim//2) +# outer: bs, n_keys +# inner: key_dim//2 +# batch: head_dim, 2 (key_axis) +qk = np.einsum("bhak,hank->bhan", q, kw) + +tq = torch.from_numpy(q).contiguous().cuda() +tkw = torch.from_numpy(kw).contiguous().cuda() +tqk = dot(tq, tkw) +diff = qk - tqk.cpu().numpy() +print(np.max(diff)) + From 099918b3c028cbde71e17bf5caf546859264b4d6 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 21 Oct 2019 18:58:02 -0400 Subject: [PATCH 462/494] [python] [ops] added skeleton for einsum op --- python/examples/einsum.py | 2 +- python/triton/ops/__init__.py | 1 + python/triton/ops/einsum.py | 126 ++++++++++++++++++++++++++++++++++ 3 files changed, 128 insertions(+), 1 deletion(-) create mode 100644 python/triton/ops/einsum.py diff --git a/python/examples/einsum.py b/python/examples/einsum.py index 6d535b911..708277f2d 100644 --- a/python/examples/einsum.py +++ b/python/examples/einsum.py @@ -107,7 +107,7 
@@ qk = np.einsum("bhak,hank->bhan", q, kw) tq = torch.from_numpy(q).contiguous().cuda() tkw = torch.from_numpy(kw).contiguous().cuda() -tqk = dot(tq, tkw) +tqk = triton.ops.einsum("bhak,hank->bhan", tq, tkw) diff = qk - tqk.cpu().numpy() print(np.max(diff)) diff --git a/python/triton/ops/__init__.py b/python/triton/ops/__init__.py index f995b88f1..3edb82406 100644 --- a/python/triton/ops/__init__.py +++ b/python/triton/ops/__init__.py @@ -1 +1,2 @@ from .dot import dot +from .einsum import einsum diff --git a/python/triton/ops/einsum.py b/python/triton/ops/einsum.py new file mode 100644 index 000000000..e33da2451 --- /dev/null +++ b/python/triton/ops/einsum.py @@ -0,0 +1,126 @@ +import triton + +class _einsum(triton.function): + + src = """ + void einsum(TYPE * A, TYPE * B, TYPE * C, + int dim_M, int dim_N, int dim_K, + int std_A0, int std_B0, int std_C0, + int std_A1, int std_B1, int std_C1) { + int pid0 = get_program_id(0); + int pid1 = get_program_id(1); + } + """ + + kernel = triton.kernel(src, ['C']) + + @staticmethod + def _append_dim(dim_data, dim_type, idx, label, dim, stride): + if dim_type in dim_data: + data = dim_data[dim_type] + if idx != data["idx"] + 1: + raise ValueError("aggregate inner, outer and batch dims must be adjacent to each other.") + data["dim"] *= dim + data["lab"] = label + data["lab"] + else: + dim_data[dim_type] = dict(idx=idx, lab=label, dim=dim, std=stride) + return dim_type + + @staticmethod + def _parse_abc(labels_a, labels_b, labels_c, shape_a, is_a=False): + + if len(labels_a) != len(shape_a): + raise ValueError(f"einsum notation dims do not match shape: {labels_a} {shape_a}") + + trans = False + stride = 1 + std1 = None + data = dict() + for idx, (lab, dim) in enumerate(reversed(list(zip(labels_a, shape_a)))): + #print(idx, lab, dim) + if dim is None: + raise ValueError("einsum doesn't currently work on shapes with placeholder dims.") + if idx == 0 and dim % 8 != 0: + raise ValueError("contiguous dim must be a multiple of 8") + + if lab in labels_c: + # batch dim + if lab in labels_b: + _einsum._append_dim(data, "B", idx, lab, dim, stride) + if idx == 0: + raise ValueError(f"batch dim cannot be contiguous dim: {lab} {labels_a} {shape_a}") + # outer dim + else: + std1 = _einsum._append_dim(data, "O", idx, lab, dim, stride) + if idx == 0: + trans = is_a + # inner dim + elif lab in labels_b: + std1 = _einsum._append_dim(data, "I", idx, lab, dim, stride) + if idx == 0: + trans = not is_a + else: + raise ValueError(f"einsum def for output: {lab} ({labels_a}), not present in either other def") + + stride *= dim + + if "B" not in data: + data["B"] = dict(dim=1, std=1) + + # batch, outer, inner, std0, std1, trans + return data["B"]["dim"], data["O"]["dim"], data["I"]["dim"], data["B"]["std"], data[std1]["std"], trans + + @staticmethod + def _parse_einsum(labels_a, labels_b, labels_c, shape_a, shape_b): + + dims_a = dict(zip(labels_a, shape_a)) + dims_b = dict(zip(labels_b, shape_b)) + shape_c = list() + for lab in labels_c: + if lab in dims_a: + shape_c.append(dims_a[lab]) + elif lab in dims_b: + shape_c.append(dims_b[lab]) + else: + raise ValueError(f"einsum def for output: {lab} ({labels_c}), not present in either input def ({labels_a}, {labels_b})") + + BA, M, KA, std_a0, std_a1, ta = _einsum._parse_abc(labels_a, labels_b, labels_c, shape_a, True) + BB, N, KB, std_b0, std_b1, tb = _einsum._parse_abc(labels_b, labels_a, labels_c, shape_b, False) + BC, _, _, std_c0, std_c1, _ = _einsum._parse_abc(labels_c, labels_b, labels_a, shape_c) + + if not (BA == BB == 
BC): + raise ValueError("mismatched batch dims") + if KA != KB: + raise ValueError("mismatched reduction dims") + + return shape_c, (BA, M, N, KA), (std_a0, std_b0, std_c0), (std_a1, std_b1, std_c1), ta, tb + + @staticmethod + def call(a, b, trans_a, trans_b, shape_c, bmnk, + std0, std1, einsum_a, einsum_b, einsum_c): + dtype = a.dtype + c = triton.empty(shape_c, dtype) + grid = lambda opt: (1, 1, 1) + return _einsum.kernel(a, b, c, + bmnk[1], bmnk[2], bmnk[3], + std0[0], std0[1], std0[2], + std1[0], std1[1], std1[2], + grid, + TYPE=['float']) + + + @staticmethod + def forward(ctx, subscripts, a, b): + if type(subscripts) is str: + einsum_a, einsum_bc = subscripts.split(",") + einsum_b, einsum_c = einsum_bc.split("->") + else: + einsum_a, einsum_b, einsum_c = subscripts + + shape_c, bmnk, std0, std1, ta, tb = _einsum._parse_einsum( + einsum_a, einsum_b, einsum_c, + a.shape, b.shape + ) + return _einsum.call(a, b, ta, tb, shape_c, bmnk, std0, std1, einsum_a, einsum_b, einsum_c) + +einsum = _einsum.apply \ No newline at end of file From 943bf41b5caffc655470b3e1da2f26d9b1c6f222 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 21 Oct 2019 23:37:39 -0400 Subject: [PATCH 463/494] [python] [op] added Triton NT einsum --- lib/runtime/function.cc | 2 +- python/examples/einsum.py | 81 ++----------------------------------- python/triton/ops/einsum.py | 42 ++++++++++++++++++- 3 files changed, 44 insertions(+), 81 deletions(-) diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index ffb8610b3..e017982f8 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -177,7 +177,7 @@ function::caller function::autotune(driver::stream* stream, const grid_fn_ty& gr std::unique_ptr bin; try{ bin = make_bin(*ir, stream->context(), opt); - }catch(...){ + }catch(const std::runtime_error& e){ return; } // kernel uses too much resources diff --git a/python/examples/einsum.py b/python/examples/einsum.py index 708277f2d..5585cc9b6 100644 --- a/python/examples/einsum.py +++ b/python/examples/einsum.py @@ -2,83 +2,6 @@ import numpy as np import torch import triton -class _dot(triton.function): - - src = """ -__global__ void dot(TYPE * A, TYPE * B, TYPE * C, - int sb, int sh, int sa, int sk, int sn) { - // program id - int pidx = get_program_id(0); - int pidy = get_program_id(1); - int pidz = get_program_id(2); - // ranges - int rxa[TM] = pidx * TM + 0 ... TM; - int ryb[TN] = pidy * TN + 0 ... TN; - int rza[TZ] = pidz * TZ + 0 ... TZ; - int rzb[TZ] = pidz * TZ + 0 ... TZ; - int rka[TK] = 0 ... TK; - int rkb[TK] = 0 ... TK; - // accumulator - float c[TM, TN, TZ] = 0; - // pointers to A - TYPE* pa[TM, TK, TZ] = A + rka[newaxis, :, newaxis] * 1 // reduction - + rxa[:, newaxis, newaxis] * sk * sa * sh // outer - + rza[newaxis, newaxis, :] * sk; // batch - // pointers to B - TYPE* pb[TK, TN, TZ] = B + rkb[:, newaxis, newaxis] * 1 // reduction - + ryb[newaxis, :, newaxis] * sk // outer - + rzb[newaxis, newaxis, :] * sk * sn; // batch - // reduction loop - for(int k = sk; k > 0; k -= TK){ - TYPE a[TM, TK, TZ] = *pa; - TYPE b[TK, TN, TZ] = *pb; - c += a @ b; - pa += TK; - pb += TK; - } - // epilogue - int rxc[TM] = pidx * TM + 0 ... TM; - int ryc[TN] = pidy * TN + 0 ... TN; - int rzc[TZ] = pidz * TZ + 0 ... 
TZ; - TYPE* pc[TM, TN, TZ] = C + rxc[:, newaxis, newaxis] * sn * sa * sh // outer[0] - + ryc[newaxis, :, newaxis] * 1 // outer[1] - + rzc[newaxis, newaxis, :] * sn; - *pc = c; -} -""" - - kernel = triton.kernel(src, ['C']) - - @staticmethod - def _call(a, b, transpose_a, transpose_b): - # extract shapes - shape_a = triton.shape(a) - shape_b = triton.shape(b) - B, H, A, K = shape_a[0], shape_a[1], shape_a[2], shape_a[3] - H, A, N, K = shape_b[0], shape_b[1], shape_b[2], shape_b[3] - # allocate output - dtype = a.dtype - c = triton.empty([B, H, A, N], dtype = dtype) - # SPMD grid - grid = lambda opt: [triton.cdiv(B, opt.d('TM')), - triton.cdiv(N, opt.d('TN')), - triton.cdiv(H*A, opt.d('TZ'))] - # launch kernel - return _dot.kernel(a, b, c, B, H, A, K, N, grid, - AT = transpose_a, BT = transpose_b, TYPE = dtype, - TM = [32], TN = [32], TK = [8], TZ = [8]) - - @staticmethod - def forward(ctx, a, b, transpose_a = False, transpose_b = False): - ctx.save_for_backward(a, b) - ctx.t_a = transpose_a - ctx.t_b = transpose_b - return _dot._call(a, b, transpose_a, transpose_b) - - -dot = _dot.apply - - batch_dim = 16 ctx_dim = 32 head_dim = 8 @@ -92,6 +15,7 @@ x_shape = (bs, state_dim) qw_shape = (state_dim, head_dim * key_dim) kw_shape = (head_dim, 2, n_keys, key_dim // 2) +np.random.seed(0) x = np.random.uniform(-1.0, 1.0, x_shape).astype(np.float32) # layer input qw = np.random.uniform(-1.0, 1.0, qw_shape).astype(np.float32) # query weights kw = np.random.uniform(-1.0, 1.0, kw_shape).astype(np.float32) # key weights @@ -108,6 +32,7 @@ qk = np.einsum("bhak,hank->bhan", q, kw) tq = torch.from_numpy(q).contiguous().cuda() tkw = torch.from_numpy(kw).contiguous().cuda() tqk = triton.ops.einsum("bhak,hank->bhan", tq, tkw) -diff = qk - tqk.cpu().numpy() +diff = np.abs(qk - tqk.cpu().numpy()) print(np.max(diff)) +print(np.min(diff)) diff --git a/python/triton/ops/einsum.py b/python/triton/ops/einsum.py index e33da2451..5be9911dd 100644 --- a/python/triton/ops/einsum.py +++ b/python/triton/ops/einsum.py @@ -7,8 +7,43 @@ class _einsum(triton.function): int dim_M, int dim_N, int dim_K, int std_A0, int std_B0, int std_C0, int std_A1, int std_B1, int std_C1) { + // program id int pid0 = get_program_id(0); int pid1 = get_program_id(1); + int pid2 = get_program_id(2); + // range + int rma[TM] = pid0 * TM + 0 ... TM; + int rnb[TN] = pid1 * TN + 0 ... TN; + int rka[TK] = 0 ... TK; + int rkb[TK] = 0 ... TK; + int rba[TB] = pid2 * TB + 0 ... TB; + int rbb[TB] = pid2 * TB + 0 ... TB; + // accumulator + TYPE c[TM, TN, TB] = 0; + // pointers to a + TYPE *pa[TM, TK, TB] = A + rka[newaxis, :, newaxis] * 1 + + rma[:, newaxis, newaxis] * std_A1 + + rba[newaxis, newaxis, :] * std_A0; + // pointers to b + TYPE *pb[TK, TN, TB] = B + rkb[:, newaxis, newaxis] * 1 + + rnb[newaxis, :, newaxis] * std_B1 + + rbb[newaxis, newaxis, :] * std_B0; + // accumulation + for(int k = dim_K; k > 0; k -= TK) { + TYPE a[TM, TK, TB] = *pa; + TYPE b[TK, TN, TB] = *pb; + c += a @ b; + pa += TK; + pb += TK; + } + // write-back + int rmc[TM] = pid0 * TM + 0 ... TM; + int rnc[TN] = pid1 * TN + 0 ... TN; + int rbc[TB] = pid2 * TB + 0 ... 
TB; + TYPE *pc[TM, TN, TB] = C + rmc[:, newaxis, newaxis] * std_C1 + + rnc[newaxis, :, newaxis] * 1 + + rbc[newaxis, newaxis, :] * std_C0; + *pc = c; } """ @@ -100,13 +135,16 @@ class _einsum(triton.function): std0, std1, einsum_a, einsum_b, einsum_c): dtype = a.dtype c = triton.empty(shape_c, dtype) - grid = lambda opt: (1, 1, 1) + grid = lambda opt: [triton.cdiv(bmnk[1], opt.d('TM')), + triton.cdiv(bmnk[2], opt.d('TN')), + triton.cdiv(bmnk[0], opt.d('TB'))] + #print(std0, std1) return _einsum.kernel(a, b, c, bmnk[1], bmnk[2], bmnk[3], std0[0], std0[1], std0[2], std1[0], std1[1], std1[2], grid, - TYPE=['float']) + TYPE='float', TM=32, TN=32, TK=8, TB=8) @staticmethod From 0770ccf53770858245ff1b57192bd955083da656 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 24 Oct 2019 23:32:51 -0400 Subject: [PATCH 464/494] [codegen] [selection] disassociation prototype --- .../triton/codegen/transform/disassociate.h | 22 +++++ lib/codegen/transform/disassociate.cc | 83 +++++++++++++++++++ lib/runtime/function.cc | 5 +- tests/common/src/dot.h | 15 ++-- 4 files changed, 115 insertions(+), 10 deletions(-) create mode 100644 include/triton/codegen/transform/disassociate.h create mode 100644 lib/codegen/transform/disassociate.cc diff --git a/include/triton/codegen/transform/disassociate.h b/include/triton/codegen/transform/disassociate.h new file mode 100644 index 000000000..f2363f3fe --- /dev/null +++ b/include/triton/codegen/transform/disassociate.h @@ -0,0 +1,22 @@ +#ifndef _TRITON_SELECTION_TRANSFORM_DISASSOCIATE_H_ +#define _TRITON_SELECTION_TRANSFORM_DISASSOCIATE_H_ + + +namespace triton { +namespace ir { + class module; +} + +namespace codegen{ +namespace transform{ + +class disassociate { +public: + void run(ir::module &mod); +}; + +} +} +} + +#endif diff --git a/lib/codegen/transform/disassociate.cc b/lib/codegen/transform/disassociate.cc new file mode 100644 index 000000000..70134b186 --- /dev/null +++ b/lib/codegen/transform/disassociate.cc @@ -0,0 +1,83 @@ +#include "triton/codegen/transform/disassociate.h" +#include "triton/ir/utils.h" +#include "triton/ir/instructions.h" +#include "triton/ir/builder.h" +#include "triton/ir/module.h" +#include + +namespace triton { +namespace codegen{ +namespace transform{ + +void extract_retile_chain(ir::user *root, + const std::vector& current, + std::vector>& result, + std::set& seen) { + if(!seen.insert(root).second) + return; + if(dynamic_cast(root) || dynamic_cast(root)){ + std::vector next = current; + next.push_back(root); + result.push_back(next); + return; + } + for(ir::value *op: root->ops()){ + ir::user *u = dynamic_cast(op); + if(!u) + continue; + std::vector next = current; + next.push_back(u); + extract_retile_chain(u, next, result, seen); + } +} + +void disassociate::run(ir::module &mod) { + ir::builder &bld = mod.get_builder(); + + std::map>> clone_info; + ir::for_each_instruction(mod, [&](ir::instruction *i){ + if(dynamic_cast(i)){ + std::vector> chains; + std::set seen; + if(!dynamic_cast(i->get_operand(0))) + return; + extract_retile_chain(i, {}, chains, seen); + if(chains.size()) + clone_info[i] = chains; + } + }); + + + for(auto x: clone_info){ + for(auto chain: x.second){ + for(int i = 0; i < chain.size(); i++) { + ir::instruction *y = (ir::instruction*)chain[i]; + ir::instruction *cloned = y->clone(); + bld.set_insert_point(y); + bld.insert(cloned); + if(i > 0) + chain[i-1]->replace_uses_of_with(y, cloned); + else + x.first->replace_uses_of_with(y, cloned); + } + + +// ir::instruction *y = (ir::instruction*)parent; +// 
for(ir::user *u: chain){ +// ir::instruction *cloned = y->clone(); +// bld.set_insert_point(y); +// bld.insert(cloned); +// std::cout << typeid(*u).name() << std::endl; +// u->replace_uses_of_with(y, cloned); +// y = (ir::instruction*)u; +// } + } + } + + +} + + +} +} +} diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index e017982f8..f9eab4ee4 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -13,6 +13,7 @@ #include "triton/codegen/transform/membar.h" #include "triton/codegen/transform/reassociate.h" #include "triton/codegen/transform/cts.h" +#include "triton/codegen/transform/disassociate.h" #include "triton/codegen/selection/generator.h" #include "triton/runtime/function.h" #include "triton/lang/cpp.h" @@ -208,6 +209,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c // create passes codegen::analysis::align align; codegen::analysis::axes axes; + codegen::transform::disassociate disassociate; codegen::analysis::layout layouts(&axes, &align, opt.num_warps); codegen::analysis::liveness liveness(&layouts); codegen::analysis::allocation allocation(&liveness); @@ -219,7 +221,8 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::transform::cts cts; codegen::generator isel(&axes, &layouts, &align, &allocation, target.get(), opt.num_warps); // run passes -// ir::print(module, std::cout); + disassociate.run(module); + dce.run(module); peephole.run(module); dce.run(module); align.run(module); diff --git a/tests/common/src/dot.h b/tests/common/src/dot.h index dc71d86bb..05ed68a7b 100644 --- a/tests/common/src/dot.h +++ b/tests/common/src/dot.h @@ -10,14 +10,13 @@ void dot(TYPE * A, TYPE * B, TYPE * C, // prologue int ridx = get_program_id(0); int ridy = get_program_id(1); - int rxa[TM] = ridx * TM + 0 ... TM; - int ryb[TN] = ridy * TN + 0 ... TN; - int rka[TK] = 0 ... TK; - int rkb[TK] = 0 ... TK; + int rm[TM] = ridx * TM + 0 ... TM; + int rn[TN] = ridy * TN + 0 ... TN; + int rk[TK] = 0 ... TK; float c[TM, TN] = 0; // pointers to operands - TYPE* pa[SHAPE_A] = A + rka[BROADCAST_AK] * STRIDE_AK + rxa[BROADCAST_AM] * STRIDE_AM; - TYPE* pb[SHAPE_B] = B + rkb[BROADCAST_BK] * STRIDE_BK + ryb[BROADCAST_BN] * STRIDE_BN; + TYPE* pa[SHAPE_A] = A + rk[BROADCAST_AK] * STRIDE_AK + rm[BROADCAST_AM] * STRIDE_AM; + TYPE* pb[SHAPE_B] = B + rk[BROADCAST_BK] * STRIDE_BK + rn[BROADCAST_BN] * STRIDE_BN; // prefetches operands TYPE a[SHAPE_A] = *pa; TYPE b[SHAPE_B] = *pb; @@ -32,9 +31,7 @@ void dot(TYPE * A, TYPE * B, TYPE * C, b = checkb ? *pb : 0; } // epilogue - int rxc[TM] = ridx * TM + 0 ... TM; - int ryc[TN] = ridy * TN + 0 ... 
TN; - TYPE* pc[TM, TN] = C + rxc[:, newaxis] + ryc[newaxis, :] * ldc; + TYPE* pc[TM, TN] = C + rm[:, newaxis] + rn[newaxis, :] * ldc; *pc = c; } )"; From b615af2e7e9cd9489457deb567fba9db31b927ed Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 25 Oct 2019 14:22:10 -0400 Subject: [PATCH 465/494] [codegen] [generator] fixed issue when tile size is 1 along one or more dimensions --- lib/codegen/selection/generator.cc | 27 ++++++++++++++++++--------- python/triton/ops/einsum.py | 2 +- tests/common/src/dot.h | 6 ++---- 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/lib/codegen/selection/generator.cc b/lib/codegen/selection/generator.cc index d7b9bc6a3..04bc29412 100644 --- a/lib/codegen/selection/generator.cc +++ b/lib/codegen/selection/generator.cc @@ -890,21 +890,30 @@ void generator::visit_barrier_inst(ir::barrier_inst*) { void generator::visit_make_range_dyn(ir::make_range_dyn* x) { for_each(x, [&](indices_t idx){ assert(idx.size() == 1); - BinaryOperator *bin_add = dyn_cast(idx[0]); - assert(bin_add); - Value *res = bin_add->getOperand(0); - set_value(x, idx, res); + if(idx[0] == builder_->getInt32(0)) + set_value(x, idx, idx[0]); + else{ + BinaryOperator *bin_add = dyn_cast(idx[0]); + assert(bin_add); + Value *res = bin_add->getOperand(0); + set_value(x, idx, res); + } }); } void generator::visit_make_range_sta(ir::make_range_sta* x) { for_each(x, [&](indices_t idx){ assert(idx.size() == 1); - BinaryOperator *bin_add = dyn_cast(idx[0]); - assert(bin_add); - Value *res = bin_add->getOperand(1); - assert(isa(res)); - set_value(x, idx, res); + if(idx[0] == builder_->getInt32(0)){ + set_value(x, idx, idx[0]); + } + else{ + BinaryOperator *bin_add = dyn_cast(idx[0]); + assert(bin_add); + Value *res = bin_add->getOperand(1); + assert(isa(res)); + set_value(x, idx, res); + } }); } diff --git a/python/triton/ops/einsum.py b/python/triton/ops/einsum.py index 5be9911dd..571c8f1ba 100644 --- a/python/triton/ops/einsum.py +++ b/python/triton/ops/einsum.py @@ -144,7 +144,7 @@ class _einsum(triton.function): std0[0], std0[1], std0[2], std1[0], std1[1], std1[2], grid, - TYPE='float', TM=32, TN=32, TK=8, TB=8) + TYPE='float', TM=32, TN=32, TK=8, TB=1) @staticmethod diff --git a/tests/common/src/dot.h b/tests/common/src/dot.h index 05ed68a7b..d27d8ba9a 100644 --- a/tests/common/src/dot.h +++ b/tests/common/src/dot.h @@ -25,10 +25,8 @@ void dot(TYPE * A, TYPE * B, TYPE * C, c += USEA @ USEB; pa = pa + TK * STRIDE_AK; pb = pb + TK * STRIDE_BK; - bool checka[SHAPE_A] = k > TK; - bool checkb[SHAPE_B] = k > TK; - a = checka ? *pa : 0; - b = checkb ? *pb : 0; + a = ((bool[SHAPE_A]) k > TK) ? *pa : 0; + b = ((bool[SHAPE_B]) k > TK) ? *pb : 0; } // epilogue TYPE* pc[TM, TN] = C + rm[:, newaxis] + rn[newaxis, :] * ldc; From 8bd87fa19d0bdcfccc0bc5b94e13a50be248d5e8 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 25 Oct 2019 17:00:53 -0400 Subject: [PATCH 466/494] [TEST][DOT] There seems to be a bug in casting tiles before ternary. 
Reverting for now --- tests/bench/dot.cc | 2 +- tests/common/dot.h | 4 ++-- tests/common/src/dot.h | 6 ++++-- tests/unit/dot.cc | 2 +- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index c87e1c938..927f0044b 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -34,7 +34,7 @@ int main() { for(const auto& c: configs){ std::tie(ord, AT, BT, M, N, K) = c; std::cout << "// " << c << std::flush; - for(auto perf: bench_dot(stream, FLOAT, AT, BT, M, N, K, ord, ord)) + for(auto perf: bench_dot(stream, HALF, AT, BT, M, N, K, ord, ord)) std::cout << ", " << perf << std::flush; std::cout << std::endl; } diff --git a/tests/common/dot.h b/tests/common/dot.h index ddbb1c77a..3ee1e9f68 100644 --- a/tests/common/dot.h +++ b/tests/common/dot.h @@ -147,9 +147,9 @@ bool triton_dot(drv::stream* stream, bool AT, bool BT, std::vector ha(M*K); std::vector hb(K*N); for(size_t i = 0; i < ha.size(); i++) - ha[i] = static_cast((float)rand()/RAND_MAX); + ha[i] = 1; for(size_t i = 0; i < hb.size(); i++) - hb[i] = static_cast((float)rand()/RAND_MAX); + hb[i] = 1; // copy buffer stream->write(&*da, true, 0, ha); stream->write(&*db, true, 0, hb); diff --git a/tests/common/src/dot.h b/tests/common/src/dot.h index d27d8ba9a..05ed68a7b 100644 --- a/tests/common/src/dot.h +++ b/tests/common/src/dot.h @@ -25,8 +25,10 @@ void dot(TYPE * A, TYPE * B, TYPE * C, c += USEA @ USEB; pa = pa + TK * STRIDE_AK; pb = pb + TK * STRIDE_BK; - a = ((bool[SHAPE_A]) k > TK) ? *pa : 0; - b = ((bool[SHAPE_B]) k > TK) ? *pb : 0; + bool checka[SHAPE_A] = k > TK; + bool checkb[SHAPE_B] = k > TK; + a = checka ? *pa : 0; + b = checkb ? *pb : 0; } // epilogue TYPE* pc[TM, TN] = C + rm[:, newaxis] + rn[newaxis, :] * ldc; diff --git a/tests/unit/dot.cc b/tests/unit/dot.cc index 02777fa4b..dec01dc21 100644 --- a/tests/unit/dot.cc +++ b/tests/unit/dot.cc @@ -16,7 +16,7 @@ int main() { for(int nwarps: std::vector{4}) for(bool AT: std::array{false, true}) for(bool BT: std::array{false, true}){ - configs.push_back(config_t{FLOAT, AT, BT, 128, 128, 128, TM, TN, TK, nwarps}); + configs.push_back(config_t{HALF, AT, BT, 128, 128, 128, TM, TN, TK, nwarps}); } // test dtype_t dtype; From 76adcb755acf789f21ec628719cb803d7824eb4a Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 25 Oct 2019 19:01:21 -0400 Subject: [PATCH 467/494] [PYTHON][EXAMPLES] Tentative support for einsum with transpositions --- include/triton/lang/ast.h | 23 ++++++++++++- include/triton/lang/code_gen.h | 2 ++ include/triton/lang/evaluator.h | 6 ++++ include/triton/lang/visitor.h | 2 ++ lib/lang/ast.cc | 39 +++++++++++++++------- lib/lang/code_gen.cc | 7 +++- lib/lang/parser.cc | 42 +++++++++++++++++++++--- python/triton/ops/einsum.py | 58 +++++++++++++++++++-------------- tests/common/dot.h | 4 +-- 9 files changed, 137 insertions(+), 46 deletions(-) diff --git a/include/triton/lang/ast.h b/include/triton/lang/ast.h index 43cfc485f..f523cb835 100644 --- a/include/triton/lang/ast.h +++ b/include/triton/lang/ast.h @@ -430,7 +430,6 @@ public: void AddrOpTypeChecking(); void DerefOpTypeChecking(); void ReduceOpTypeChecking(); - void TransOpTypeChecking(); void UnaryArithmOpTypeChecking(); void CastOpTypeChecking(); @@ -448,6 +447,28 @@ protected: Expr* operand_; }; +class TransOp: public Expr { + friend class Generator; + +public: + using PermInt = std::vector; + +public: + static TransOp* New(const PermInt& perm, Expr* operand); + const PermInt& getPerm() const { return perm_; } + void Accept(Visitor* v); + bool IsLVal() { 
return false; } + void TypeChecking(); + +protected: + TransOp(const PermInt& perm, Expr* operand) + : Expr(operand->Tok(), nullptr), operand_(operand), perm_(perm) {} + +private: + Expr* operand_; + PermInt perm_; +}; + // cond ? true : false class ConditionalOp : public Expr { diff --git a/include/triton/lang/code_gen.h b/include/triton/lang/code_gen.h index 69a1a7514..96a02ce9a 100644 --- a/include/triton/lang/code_gen.h +++ b/include/triton/lang/code_gen.h @@ -58,6 +58,7 @@ public: // Expression void VisitBinaryOp(BinaryOp* binaryOp); void VisitUnaryOp(UnaryOp* unaryOp); + void VisitTransOp(TransOp* transOp); void VisitConditionalOp(ConditionalOp* condOp); void VisitFuncCall(FuncCall* funcCall); void VisitObject(Object* obj); @@ -130,6 +131,7 @@ public: void VisitConditionalOp(ConditionalOp*) { should_not_happen(); } void VisitFuncCall(FuncCall*) { should_not_happen(); } + void VisitTransOp(TransOp*) { should_not_happen(); } void VisitEnumerator(Enumerator*) { should_not_happen(); } void VisitConstant(Constant*) { should_not_happen(); } void VisitTempVar(TempVar*) { should_not_happen(); } diff --git a/include/triton/lang/evaluator.h b/include/triton/lang/evaluator.h index 589739b45..ac8404550 100644 --- a/include/triton/lang/evaluator.h +++ b/include/triton/lang/evaluator.h @@ -30,6 +30,9 @@ public: virtual void VisitIdentifier(Identifier* ident) { Error(ident, "expect constant expression"); } + virtual void VisitTransOp(TransOp* trans) { + Error(trans, "expect constant expression"); + } virtual void VisitObject(Object* obj) { Error(obj, "expect constant expression"); } @@ -83,6 +86,9 @@ public: virtual void VisitFuncCall(FuncCall* funcCall) { Error(funcCall, "expect constant expression"); } + virtual void VisitTransOp(TransOp* trans) { + Error(trans, "expect constant expression"); + } virtual void VisitEnumerator(Enumerator* enumer) { addr_.offset_ = enumer->Val(); } diff --git a/include/triton/lang/visitor.h b/include/triton/lang/visitor.h index 16398f57b..239071edf 100644 --- a/include/triton/lang/visitor.h +++ b/include/triton/lang/visitor.h @@ -6,6 +6,7 @@ class BinaryOp; class UnaryOp; +class TransOp; class ConditionalOp; class FuncCall; class Identifier; @@ -31,6 +32,7 @@ public: virtual ~Visitor() {} virtual void VisitBinaryOp(BinaryOp* binary) = 0; virtual void VisitUnaryOp(UnaryOp* unary) = 0; + virtual void VisitTransOp(TransOp* trans) = 0; virtual void VisitConditionalOp(ConditionalOp* cond) = 0; virtual void VisitFuncCall(FuncCall* funcCall) = 0; virtual void VisitEnumerator(Enumerator* enumer) = 0; diff --git a/lib/lang/ast.cc b/lib/lang/ast.cc index b6cd99633..1fd8b2dcb 100644 --- a/lib/lang/ast.cc +++ b/lib/lang/ast.cc @@ -7,6 +7,7 @@ static MemPoolImp binaryOpPool; +static MemPoolImp transOpPool; static MemPoolImp conditionalOpPool; static MemPoolImp funcCallPool; static MemPoolImp initializationPool; @@ -78,6 +79,9 @@ void UnaryOp::Accept(Visitor* v) { v->VisitUnaryOp(this); } +void TransOp::Accept(Visitor* v) { + v->VisitTransOp(this); +} void ConditionalOp::Accept(Visitor* v) { v->VisitConditionalOp(this); @@ -645,9 +649,6 @@ void UnaryOp::TypeChecking() { case Token::CAST: return CastOpTypeChecking(); - case '^': - return TransOpTypeChecking(); - case Token::REDUCE: return ReduceOpTypeChecking(); @@ -702,15 +703,6 @@ void UnaryOp::ReduceOpTypeChecking() { type_ = TileType::New(shape, tileType->Derived()); } -void UnaryOp::TransOpTypeChecking() { - auto tileType = operand_->Type()->ToTile(); - if(!tileType) - Error(this, "tile expected for transposition operator 
'^'"); - auto shape = tileType->Shape(); - std::rotate(shape.begin(), shape.begin() + 1, shape.end()); - type_ = TileType::New(shape, tileType->Derived()); -} - void UnaryOp::UnaryArithmOpTypeChecking() { auto scalType = TryExtractScalarType(this, operand_); if (Token::PLUS == op_ || Token::MINUS == op_) { @@ -769,6 +761,29 @@ void UnaryOp::CastOpTypeChecking() { } } +/* + * Transposition Operator + */ +void TransOp::TypeChecking() { + auto tileType = operand_->Type()->ToTile(); + if(!tileType) + Error(this, "tile expected for transposition operator '^'"); + auto opShape = tileType->Shape(); + if(perm_.size() != opShape.size()) + Error(this, "invalid permutations"); + // permutate input shape + TileType::ShapeInt resShape(opShape.size()); + for(int d = 0; d < opShape.size(); d++) + resShape[d] = opShape[perm_[d]]; + type_ = TileType::New(resShape, tileType->Derived()); +} + +TransOp* TransOp::New(const PermInt& perm, Expr* operand) { + auto ret = new (transOpPool.Alloc()) TransOp(perm, operand); + ret->pool_ = &transOpPool; + ret->TypeChecking(); + return ret; +} /* * Conditional Operator diff --git a/lib/lang/code_gen.cc b/lib/lang/code_gen.cc index aee604b4a..fdc754048 100644 --- a/lib/lang/code_gen.cc +++ b/lib/lang/code_gen.cc @@ -185,7 +185,6 @@ void Generator::VisitUnaryOp(UnaryOp* unary) { case '~': return set_ret(bld_->create_neg(arg)); case '!': return set_ret(bld_->create_not(arg)); case Token::CAST: return set_ret(GenCastOp(arg, GenIRType(unary->Type(), *ctx_))); - case '^': return set_ret(bld_->create_trans(arg)); case Token::REDUCE: { int ax, tag; UnaryOp::decodeRed(unary->info_, ax, tag); @@ -198,6 +197,12 @@ void Generator::VisitUnaryOp(UnaryOp* unary) { return error_not_implemented(); } +void Generator::VisitTransOp(TransOp *trans) { + Visit(trans->operand_); + ir::value* arg = ret_; + return set_ret(bld_->create_trans(arg, trans->getPerm())); +} + void Generator::VisitConditionalOp(ConditionalOp* condOp) { // auto &instructions = bld_->get_insert_block()->get_inst_list(); VisitExpr(condOp->cond_); diff --git a/lib/lang/parser.cc b/lib/lang/parser.cc index a30258c3d..960c983cf 100644 --- a/lib/lang/parser.cc +++ b/lib/lang/parser.cc @@ -451,6 +451,7 @@ Expr* Parser::ParseSubScripting(Expr* lhs) { QualType lhsQual = lhsTile->Derived(); // create ret shape TileType::ShapeInt shape; + TileType::ShapeInt axVec; size_t i = 0; const Token* tok; std::vector> redInfo; @@ -469,10 +470,22 @@ Expr* Parser::ParseSubScripting(Expr* lhs) { case Token::SUB: case Token::MAX: case Token::MIN:{ - int info = UnaryOp::encodeRed(i, tok->tag_); - redInfo.push_back({i, info}); - shape.push_back(lhsShape[i++]); - break; + int info = UnaryOp::encodeRed(i, tok->tag_); + redInfo.push_back({i, info}); + shape.push_back(lhsShape[i++]); + break; + } + + case '^':{ + Expr* expr = ParseConditionalExpr(); + EnsureInteger(expr); + int ax = Evaluator().Eval(expr); + axVec.push_back(ax); + if(ax < 0 || ax >= lhsShape.size()) + Error(tok, "unknown axis %d in transposition", ax); + shape.push_back(lhsShape[ax]); + i++; + break; } default: @@ -481,8 +494,19 @@ Expr* Parser::ParseSubScripting(Expr* lhs) { } }while(ts_.Try(',')); ts_.Expect(']'); + + // transposition mode + std::set axSet(axVec.begin(), axVec.end()); + if(!axSet.empty()){ + if(axSet.size()!=lhsShape.size()) + Error(tok, "transposition must address all axes of input array"); + return TransOp::New(axVec, lhs); + } + + // broadcasting mode if(lhsShape.size() > i) Error(tok, "broadcasting not using all operand axes"); + // create ret tile Expr* res = 
lhs; for(auto r: redInfo){ @@ -553,7 +577,15 @@ Expr* Parser::ParseUnaryExpr() { case '-': return ParseUnaryOp(tok, Token::MINUS); case '~': return ParseUnaryOp(tok, '~'); case '!': return ParseUnaryOp(tok, '!'); - case '^': return ParseUnaryOp(tok, Token::XOR); + case '^': { + auto operand = ParseCastExpr(); + TileType::ShapeInt shape = operand->Type()->ToTile()->Shape(); + TransOp::PermInt perm(shape.size()); + for(int d = 0; d < shape.size(); d++) + perm[d] = d; + std::rotate(perm.begin(), perm.begin() + 1, perm.end()); + return TransOp::New(perm, operand); + } default: ts_.PutBack(); return ParsePostfixExpr(); diff --git a/python/triton/ops/einsum.py b/python/triton/ops/einsum.py index 571c8f1ba..43efa7db1 100644 --- a/python/triton/ops/einsum.py +++ b/python/triton/ops/einsum.py @@ -8,41 +8,36 @@ class _einsum(triton.function): int std_A0, int std_B0, int std_C0, int std_A1, int std_B1, int std_C1) { // program id - int pid0 = get_program_id(0); - int pid1 = get_program_id(1); - int pid2 = get_program_id(2); + int pgm = get_program_id(0); + int pgn = get_program_id(1); + int pgb = get_program_id(2); // range - int rma[TM] = pid0 * TM + 0 ... TM; - int rnb[TN] = pid1 * TN + 0 ... TN; - int rka[TK] = 0 ... TK; - int rkb[TK] = 0 ... TK; - int rba[TB] = pid2 * TB + 0 ... TB; - int rbb[TB] = pid2 * TB + 0 ... TB; + int rm[TM] = pgm * TM + 0 ... TM; + int rn[TN] = pgn * TN + 0 ... TN; + int rb[TB] = pgb * TB + 0 ... TB; + int rk[TK] = 0 ... TK; // accumulator TYPE c[TM, TN, TB] = 0; // pointers to a - TYPE *pa[TM, TK, TB] = A + rka[newaxis, :, newaxis] * 1 - + rma[:, newaxis, newaxis] * std_A1 - + rba[newaxis, newaxis, :] * std_A0; + TYPE *pa[SHAPE_A] = A + rk[BROADCAST_AK] * STRIDE_AK + + rm[BROADCAST_AM] * STRIDE_AM + + rb[newaxis, newaxis, :] * std_A0; // pointers to b - TYPE *pb[TK, TN, TB] = B + rkb[:, newaxis, newaxis] * 1 - + rnb[newaxis, :, newaxis] * std_B1 - + rbb[newaxis, newaxis, :] * std_B0; + TYPE *pb[SHAPE_B] = B + rk[BROADCAST_BK] * STRIDE_BK + + rn[BROADCAST_BN] * STRIDE_BN + + rb[newaxis, newaxis, :] * std_B0; // accumulation for(int k = dim_K; k > 0; k -= TK) { - TYPE a[TM, TK, TB] = *pa; - TYPE b[TK, TN, TB] = *pb; + TYPE a[SHAPE_A] = *pa; + TYPE b[SHAPE_B] = *pb; c += a @ b; pa += TK; pb += TK; } // write-back - int rmc[TM] = pid0 * TM + 0 ... TM; - int rnc[TN] = pid1 * TN + 0 ... TN; - int rbc[TB] = pid2 * TB + 0 ... 
TB; - TYPE *pc[TM, TN, TB] = C + rmc[:, newaxis, newaxis] * std_C1 - + rnc[newaxis, :, newaxis] * 1 - + rbc[newaxis, newaxis, :] * std_C0; + TYPE *pc[TM, TN, TB] = C + rm[:, newaxis, newaxis] * std_C1 + + rn[newaxis, :, newaxis] * 1 + + rb[newaxis, newaxis, :] * std_C0; *pc = c; } """ @@ -138,12 +133,25 @@ class _einsum(triton.function): grid = lambda opt: [triton.cdiv(bmnk[1], opt.d('TM')), triton.cdiv(bmnk[2], opt.d('TN')), triton.cdiv(bmnk[0], opt.d('TB'))] - #print(std0, std1) + macros = {# handle A transposition + 'USE_A' : 'a[^1, ^0, ^2]' if trans_a else 'a', + 'STRIDE_AK' : 'std_A1' if trans_a else '1', + 'STRIDE_AM' : '1' if trans_a else 'std_A1', + 'BROADCAST_AK': ':, newaxis, newaxis' if trans_a else 'newaxis, :, newaxis', + 'BROADCAST_AM': 'newaxis, :, newaxis' if trans_a else ':, newaxis, newaxis', + 'SHAPE_A' : 'TK, TM, TB' if trans_a else 'TM, TK, TB', + # handle B transposition + 'USE_B' : 'b[^1, ^0, ^2]' if not trans_b else 'b', + 'STRIDE_BK' : 'std_B1' if not trans_b else '1', + 'STRIDE_BN' : '1' if not trans_b else 'std_B1', + 'BROADCAST_BK': 'newaxis, :, newaxis' if not trans_b else ':, newaxis, newaxis', + 'BROADCAST_BN': ':, newaxis, newaxis' if not trans_b else 'newaxis, :, newaxis', + 'SHAPE_B' : 'TN, TK, TB' if not trans_b else 'TK, TN, TB'} return _einsum.kernel(a, b, c, bmnk[1], bmnk[2], bmnk[3], std0[0], std0[1], std0[2], std1[0], std1[1], std1[2], - grid, + grid, **macros, TYPE='float', TM=32, TN=32, TK=8, TB=1) diff --git a/tests/common/dot.h b/tests/common/dot.h index 3ee1e9f68..23bb46c72 100644 --- a/tests/common/dot.h +++ b/tests/common/dot.h @@ -86,14 +86,14 @@ bool triton_dot(drv::stream* stream, bool AT, bool BT, // macros rt::function::options_space_t opt; // A access patterns - opt.defines.push_back({"USEA", {AT? "^a" : "a" }}); + opt.defines.push_back({"USEA", {AT? "a[^1, ^0]" : "a" }}); opt.defines.push_back({"BROADCAST_AK", {AT? ":, newaxis" : "newaxis, :" }}); opt.defines.push_back({"BROADCAST_AM", {AT? "newaxis, :" : ":, newaxis" }}); opt.defines.push_back({"SHAPE_A", {AT? "TK, TM" : "TM, TK" }}); opt.defines.push_back({"STRIDE_AK", {AT? sa[a_order[0]] : sa[a_order[1]] }}); opt.defines.push_back({"STRIDE_AM", {AT? sa[a_order[1]] : sa[a_order[0]] }}); // B access patterns - opt.defines.push_back({"USEB", {BT? "^b" : "b" }}); + opt.defines.push_back({"USEB", {BT? "b[^1, ^0]" : "b" }}); opt.defines.push_back({"BROADCAST_BK", {BT? "newaxis, :" : ":, newaxis" }}); opt.defines.push_back({"BROADCAST_BN", {BT? ":, newaxis" : "newaxis, :" }}); opt.defines.push_back({"SHAPE_B", {BT? 
"TN, TK" : "TK, TN" }}); From 655f43fb5b7bdacbeda78a8aa18bb3f59cdeb9fc Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 26 Oct 2019 15:10:19 -0400 Subject: [PATCH 468/494] more work --- lib/codegen/selection/generator.cc | 5 ++-- lib/codegen/selection/machine_value.cc | 1 + lib/runtime/function.cc | 4 ++- python/triton/ops/__init__.py | 4 +-- python/triton/ops/einsum.py | 40 ++++++++++++++++++++++++-- 5 files changed, 46 insertions(+), 8 deletions(-) diff --git a/lib/codegen/selection/generator.cc b/lib/codegen/selection/generator.cc index 04bc29412..03f393069 100644 --- a/lib/codegen/selection/generator.cc +++ b/lib/codegen/selection/generator.cc @@ -187,6 +187,7 @@ generator::generator(analysis::axes *a_axes, void generator::visit_value(ir::value* v) { + std::cout << "visiting " << typeid(*v).name() << std::endl; if(!seen_.insert(v).second) return; // create machine tile @@ -559,8 +560,8 @@ void generator::visit_hmma_dot(ir::dot_inst* dot, shared_tile *TA, shared_tile * bool is_a_trans = is_trans(dot->get_operand(0)); bool is_b_trans = is_trans(dot->get_operand(1)); - bool is_a_row = is_a_trans ^ (ord_a[ord_a.size() - 2] == 1); - bool is_b_row = is_b_trans ^ (ord_b[ord_b.size() - 2] == 1); + bool is_a_row = is_a_trans ^ (ord_a[0] == 1); + bool is_b_row = is_b_trans ^ (ord_b[0] == 1); Value *offset_a_i = hmma->offset_a_i_; diff --git a/lib/codegen/selection/machine_value.cc b/lib/codegen/selection/machine_value.cc index bd4237043..a7cd73a8e 100644 --- a/lib/codegen/selection/machine_value.cc +++ b/lib/codegen/selection/machine_value.cc @@ -1,4 +1,5 @@ #include +#include #include "llvm/IR/IRBuilder.h" #include "triton/codegen/selection/machine_value.h" diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index f9eab4ee4..b37dbf332 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -221,6 +221,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::transform::cts cts; codegen::generator isel(&axes, &layouts, &align, &allocation, target.get(), opt.num_warps); // run passes + std::cout << "begin" << std::endl; disassociate.run(module); dce.run(module); peephole.run(module); @@ -244,9 +245,10 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c if(allocation.allocated_size() > context->device()->max_shared_memory()) return std::unique_ptr(); barriers.run(module); + std::cout << "isel" << std::endl; // ir::print(module, std::cout); -// exit(EXIT_FAILURE); isel.visit(module, *llvm); + std::cout << "done" << std::endl; // return binary std::unique_ptr res(driver::module::create(context, std::move(llvm))); // done diff --git a/python/triton/ops/__init__.py b/python/triton/ops/__init__.py index 3edb82406..f409fde46 100644 --- a/python/triton/ops/__init__.py +++ b/python/triton/ops/__init__.py @@ -1,2 +1,2 @@ -from .dot import dot -from .einsum import einsum +from .dot import _dot, dot +from .einsum import _einsum, einsum diff --git a/python/triton/ops/einsum.py b/python/triton/ops/einsum.py index 43efa7db1..d78d9f1a2 100644 --- a/python/triton/ops/einsum.py +++ b/python/triton/ops/einsum.py @@ -3,7 +3,7 @@ import triton class _einsum(triton.function): src = """ - void einsum(TYPE * A, TYPE * B, TYPE * C, + void einsum_(TYPE * A, TYPE * B, TYPE * C, int dim_M, int dim_N, int dim_K, int std_A0, int std_B0, int std_C0, int std_A1, int std_B1, int std_C1) { @@ -30,7 +30,7 @@ class _einsum(triton.function): for(int k = dim_K; k > 0; k -= TK) { TYPE a[SHAPE_A] = *pa; TYPE b[SHAPE_B] = *pb; - c += a @ b; + c += USE_A @ USE_B; pa 
+= TK; pb += TK; } @@ -157,6 +157,7 @@ class _einsum(triton.function): @staticmethod def forward(ctx, subscripts, a, b): + ctx.save_for_backward(a, b) if type(subscripts) is str: einsum_a, einsum_bc = subscripts.split(",") einsum_b, einsum_c = einsum_bc.split("->") @@ -165,8 +166,41 @@ class _einsum(triton.function): shape_c, bmnk, std0, std1, ta, tb = _einsum._parse_einsum( einsum_a, einsum_b, einsum_c, - a.shape, b.shape + a.shape.as_list(), b.shape.as_list() ) + ctx.trans_a = ta + ctx.trans_b = tb + ctx.einsum_a = einsum_a + ctx.einsum_b = einsum_b + ctx.einsum_c = einsum_c return _einsum.call(a, b, ta, tb, shape_c, bmnk, std0, std1, einsum_a, einsum_b, einsum_c) + + @staticmethod + def backward(ctx, dc): + a, b = ctx.saved_tensors + trans_a = ctx.trans_a + trans_b = ctx.trans_b + einsum_a = ctx.einsum_a + einsum_b = ctx.einsum_b + einsum_c = ctx.einsum_c + + if not trans_a and not trans_b: # NN + da = einsum((einsum_c, einsum_b, einsum_a), dc, b) + db = einsum((einsum_a, einsum_c, einsum_b), a, dc) + + elif not trans_a and trans_b: # NT + da = einsum((einsum_c, einsum_b, einsum_a), dc, b) + db = einsum((einsum_c, einsum_a, einsum_b), dc, a) + + elif trans_a and not trans_b: # TN + da = einsum((einsum_b, einsum_c, einsum_a), b, dc) + db = einsum((einsum_a, einsum_c, einsum_b), a, dc) + + elif trans_a and trans_b: # TT (not used) + da = einsum((einsum_b, einsum_c, einsum_a), b, dc) + db = einsum((einsum_c, einsum_a, einsum_b), dc, a) + + return da, db, None, None, None, None, None, None, None, None, None, None + einsum = _einsum.apply \ No newline at end of file From e11557855f00aa3afe2c8e8b6e93ea195cb75c01 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sat, 26 Oct 2019 22:14:50 -0400 Subject: [PATCH 469/494] [PYTHON] [OPS] Added einsum implementation --- lib/codegen/selection/generator.cc | 4 +- lib/codegen/transform/disassociate.cc | 57 ++++++------ lib/runtime/function.cc | 16 +++- python/examples/einsum_test.py | 129 ++++++++++++++++++++++++++ python/triton/ops/einsum.py | 21 +++-- 5 files changed, 183 insertions(+), 44 deletions(-) create mode 100644 python/examples/einsum_test.py diff --git a/lib/codegen/selection/generator.cc b/lib/codegen/selection/generator.cc index 03f393069..1ff4287eb 100644 --- a/lib/codegen/selection/generator.cc +++ b/lib/codegen/selection/generator.cc @@ -187,12 +187,12 @@ generator::generator(analysis::axes *a_axes, void generator::visit_value(ir::value* v) { - std::cout << "visiting " << typeid(*v).name() << std::endl; if(!seen_.insert(v).second) return; // create machine tile - if(v->get_type()->is_tile_ty()) + if(v->get_type()->is_tile_ty()){ tmap_[v] = machine_layouts_.at(layouts_->get(v))->create(v); + } // visit operands BasicBlock *current = builder_->GetInsertBlock(); auto *inst = dynamic_cast(v); diff --git a/lib/codegen/transform/disassociate.cc b/lib/codegen/transform/disassociate.cc index 70134b186..1134463ec 100644 --- a/lib/codegen/transform/disassociate.cc +++ b/lib/codegen/transform/disassociate.cc @@ -10,67 +10,62 @@ namespace codegen{ namespace transform{ void extract_retile_chain(ir::user *root, - const std::vector& current, - std::vector>& result, + std::map>& result, + int depth, std::set& seen) { if(!seen.insert(root).second) return; - if(dynamic_cast(root) || dynamic_cast(root)){ - std::vector next = current; - next.push_back(root); - result.push_back(next); + result[depth].insert(root); + if(dynamic_cast(root) || + dynamic_cast(root)){ return; } for(ir::value *op: root->ops()){ ir::user *u = dynamic_cast(op); if(!u) continue; 
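// Note: the refactor in this hunk replaces the per-chain vectors with a map
// from recursion depth to the set of users discovered at that depth, which is
// what lets run() clone the chains level by level below. Reduced to a generic
// node type, the traversal looks like this (illustrative sketch, not the real
// ir::user interface):
//
//   #include <map>
//   #include <set>
//   #include <vector>
//
//   struct node { std::vector<node*> ops; };
//
//   // bucket every node reachable from `root` by its distance from `root`
//   void collect(node* root, std::map<int, std::set<node*>>& out,
//                int depth, std::set<node*>& seen) {
//     if(!seen.insert(root).second)
//       return;                      // already reached through another path
//     out[depth].insert(root);
//     for(node* op : root->ops)
//       collect(op, out, depth + 1, seen);
//   }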
- std::vector next = current; - next.push_back(u); - extract_retile_chain(u, next, result, seen); + extract_retile_chain(u, result, depth + 1, seen); } } void disassociate::run(ir::module &mod) { ir::builder &bld = mod.get_builder(); - std::map>> clone_info; + std::map>> clone_info; ir::for_each_instruction(mod, [&](ir::instruction *i){ if(dynamic_cast(i)){ - std::vector> chains; + std::map> chains; std::set seen; if(!dynamic_cast(i->get_operand(0))) return; - extract_retile_chain(i, {}, chains, seen); + extract_retile_chain(i, chains, 0, seen); if(chains.size()) clone_info[i] = chains; } }); - - for(auto x: clone_info){ - for(auto chain: x.second){ - for(int i = 0; i < chain.size(); i++) { - ir::instruction *y = (ir::instruction*)chain[i]; + for(const auto& x: clone_info){ + int depth = 1; + std::map clone_map; + while(x.second.find(depth) != x.second.end()){ + // clone all users + const auto& remat = x.second.at(depth); + for(ir::user* u: remat){ + ir::instruction *y = (ir::instruction*)u; ir::instruction *cloned = y->clone(); bld.set_insert_point(y); bld.insert(cloned); - if(i > 0) - chain[i-1]->replace_uses_of_with(y, cloned); - else + clone_map[y] = cloned; + // replace in above level + if(depth > 1){ + for(ir::user* ux: x.second.at(depth - 1)) + clone_map.at((ir::instruction*)ux)->replace_uses_of_with(y, cloned); + } + else{ x.first->replace_uses_of_with(y, cloned); + } } - - -// ir::instruction *y = (ir::instruction*)parent; -// for(ir::user *u: chain){ -// ir::instruction *cloned = y->clone(); -// bld.set_insert_point(y); -// bld.insert(cloned); -// std::cout << typeid(*u).name() << std::endl; -// u->replace_uses_of_with(y, cloned); -// y = (ir::instruction*)u; -// } + depth += 1; } } diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index b37dbf332..bc55d65eb 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -221,9 +221,17 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::transform::cts cts; codegen::generator isel(&axes, &layouts, &align, &allocation, target.get(), opt.num_warps); // run passes - std::cout << "begin" << std::endl; - disassociate.run(module); +// ir::print(module, std::cout); dce.run(module); +// ir::print(module, std::cout); + + disassociate.run(module); + +// ir::print(module, std::cout); + + dce.run(module); +// ir::print(module, std::cout); + peephole.run(module); dce.run(module); align.run(module); @@ -245,10 +253,10 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c if(allocation.allocated_size() > context->device()->max_shared_memory()) return std::unique_ptr(); barriers.run(module); - std::cout << "isel" << std::endl; +// std::cout << "isel" << std::endl; // ir::print(module, std::cout); isel.visit(module, *llvm); - std::cout << "done" << std::endl; +// std::cout << "done" << std::endl; // return binary std::unique_ptr res(driver::module::create(context, std::move(llvm))); // done diff --git a/python/examples/einsum_test.py b/python/examples/einsum_test.py new file mode 100644 index 000000000..799efbf70 --- /dev/null +++ b/python/examples/einsum_test.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf +import triton +import blocksparse as bs +from tensorflow.python.ops import gradient_checker + +one = 0 +out = 0 +bench = 0 +class ProdKeyTest(tf.test.TestCase): + + def testEinsum(self): + # multi-threading screws up benchmarking + 
conf = tf.ConfigProto( + intra_op_parallelism_threads=1, + inter_op_parallelism_threads=1) + + with self.test_session(config=conf) as sess, tf.device("/gpu:0"): + + batch_dim = 4 + ctx_dim = 256 + head_dim = 8 + n_keys = 512 + key_dim = 128 + + # batch_dim = 2 + # ctx_dim = 8 + # head_dim = 2 + # n_keys = 16 + # key_dim = 16 + + for a_shape, b_shape, c_shape, einsum in [ + [ [ 4, 8, 8 ], [ 8, 8 ], [ 4, 8, 8 ], "btc,ck->btk" ], + [ [ 4, 1024, 1024 ], [ 1024, 512 ], [ 4, 1024, 512 ], "btc,ck->btk" ], + [ (batch_dim, ctx_dim, head_dim, 2, key_dim//2),(head_dim, 2, n_keys, key_dim//2), (batch_dim, ctx_dim, head_dim, 2, n_keys), "bchak,hank->bchan" ], + ]: + + if one: + A = np.ones(a_shape, dtype=np.float32) + B = np.ones(b_shape, dtype=np.float32) + E = np.ones(c_shape, dtype=np.float32) + else: + # QK = np.random.normal(loc=0.0, scale=1.0, size=qk_shape).astype(np.float16).astype(np.float32) + # V = np.random.normal(loc=0.0, scale=1.0, size=vw_shape).astype(np.float16).astype(np.float32) + A = np.random.uniform(-1.0, 1.0, a_shape).astype(np.float16).astype(np.float32) + B = np.random.uniform(-1.0, 1.0, b_shape).astype(np.float16).astype(np.float32) + E = np.random.uniform(-1.0, 1.0, c_shape).astype(np.float16).astype(np.float32) + + a = tf.placeholder(tf.float32, a_shape, name="a") + b = tf.placeholder(tf.float32, b_shape, name="b") + e = tf.placeholder(tf.float32, c_shape, name="e") + feed_dict = { a:A, b:B, e:E } + + cc = triton.ops.einsum(einsum, a, b) + + # error = gradient_checker.compute_gradient_error(a, a_shape, c, c_shape, delta=1e-1, extra_feed_dict={ b:B }) # + # print(error) + # error = gradient_checker.compute_gradient_error(b, b_shape, c, c_shape, delta=1e-1, extra_feed_dict={ a:A }) # + # print(error) + # return + + with tf.control_dependencies([cc.op]): + da, db = tf.gradients(cc, [a, b], e) + + # c, = sess.run( [ c, ], feed_dict ) + c, da, db = sess.run( [ cc, da, db ], feed_dict ) + + if bench == 0: + + C = np.einsum(einsum, A, B) + id = cc.op.get_attr('id') + ctx = triton.ops._einsum.contexts[id] + t_a = ctx.trans_a + t_b = ctx.trans_b + e_a = ctx.einsum_a + e_b = ctx.einsum_b + e_c = ctx.einsum_c + + if not t_a and not t_b: # NN + DA = np.einsum(f"{e_c},{e_b}->{e_a}", E, B) + DB = np.einsum(f"{e_a},{e_c}->{e_b}", A, E) + elif not t_a and t_b: # NT + DA = np.einsum(f"{e_c},{e_b}->{e_a}", E, B) + DB = np.einsum(f"{e_c},{e_a}->{e_b}", E, A) + elif t_a and not t_b: # TN + DA = np.einsum(f"{e_b},{e_c}->{e_a}", B, E) + DB = np.einsum(f"{e_a},{e_c}->{e_b}", A, E) + + print("testProdKey", einsum) + if not bench: + for op, dev, cpu in [ + [ "C", c, C ], + [ "DA", da, DA ], + [ "DB", db, DB ], + ]: + self.compare_results(op, dev, cpu) + + def compare_results(self, op, dev, cpu): + dev = dev.astype(np.float64) + cpu = cpu.astype(np.float64) + + # print(dev.reshape(-1)[0:4]) + # print(cpu.reshape(-1)[0:4]) + + dif = np.abs(cpu - dev) + maxval = np.max(abs(cpu)) + avgval = np.average(abs(cpu)) + maxdif = dif.max() + max_err = maxdif if avgval == 0 else maxdif / avgval + l2_err = 0.0 if avgval == 0 else np.sqrt(np.square(dif).sum()) / np.sqrt(np.square(cpu).sum()) + + print("op:%3s, max:%18.12f, avg:%18.12f, dif:%18.12f, err:%18.12f, l2_err:%18.12f shape:%15s" % (op, maxval, avgval, maxdif, max_err, l2_err, str(cpu.shape))) + + if out: + dim = cpu.shape[-1] + np.savetxt("%s_dif.txt" % op, dif.reshape((-1,dim)), fmt='%4.1f') #7.5 5.3 + np.savetxt("%s_cpu.txt" % op, cpu.reshape((-1,dim)), fmt='%4.1f') #7.5 5.3 + np.savetxt("%s_dev.txt" % op, dev.reshape((-1,dim)), fmt='%4.1f') #7.5 5.3 + 
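
The NumPy reference gradients above and the backward pass in triton/ops/einsum.py encode the same rule: the gradient of an einsum is another einsum whose output subscripts are swapped with those of the differentiated input. A minimal NumPy sketch for the non-transposed (NN) case, assuming each index appears in exactly two operands, as in the shapes tested here:

import numpy as np

def einsum_grads_nn(e_a, e_b, e_c, a, b, dc):
    # d/da: contract dc with b, producing a tensor shaped like a
    da = np.einsum(f"{e_c},{e_b}->{e_a}", dc, b)
    # d/db: contract a with dc, producing a tensor shaped like b
    db = np.einsum(f"{e_a},{e_c}->{e_b}", a, dc)
    return da, db
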
exit() + +if __name__ == "__main__": + tf.test.main() + diff --git a/python/triton/ops/einsum.py b/python/triton/ops/einsum.py index d78d9f1a2..7f4457d99 100644 --- a/python/triton/ops/einsum.py +++ b/python/triton/ops/einsum.py @@ -1,3 +1,6 @@ +# Special thanks to Scott Gray from OpenAI for writing the einsum parsing function + + import triton class _einsum(triton.function): @@ -31,14 +34,18 @@ class _einsum(triton.function): TYPE a[SHAPE_A] = *pa; TYPE b[SHAPE_B] = *pb; c += USE_A @ USE_B; - pa += TK; - pb += TK; + pa += TK * STRIDE_AK; + pb += TK * STRIDE_BK; } // write-back TYPE *pc[TM, TN, TB] = C + rm[:, newaxis, newaxis] * std_C1 + rn[newaxis, :, newaxis] * 1 + rb[newaxis, newaxis, :] * std_C0; - *pc = c; + bool checkm[TM] = rm < dim_M; + bool checkn[TN] = rn < dim_N; + bool checkc[TM, TN, TB] = checkm[:, newaxis, newaxis] && + checkn[newaxis, :, newaxis]; + *?(checkc)pc = c; } """ @@ -141,12 +148,12 @@ class _einsum(triton.function): 'BROADCAST_AM': 'newaxis, :, newaxis' if trans_a else ':, newaxis, newaxis', 'SHAPE_A' : 'TK, TM, TB' if trans_a else 'TM, TK, TB', # handle B transposition - 'USE_B' : 'b[^1, ^0, ^2]' if not trans_b else 'b', + 'USE_B' : 'b' if not trans_b else 'b[^1, ^0, ^2]', 'STRIDE_BK' : 'std_B1' if not trans_b else '1', 'STRIDE_BN' : '1' if not trans_b else 'std_B1', - 'BROADCAST_BK': 'newaxis, :, newaxis' if not trans_b else ':, newaxis, newaxis', - 'BROADCAST_BN': ':, newaxis, newaxis' if not trans_b else 'newaxis, :, newaxis', - 'SHAPE_B' : 'TN, TK, TB' if not trans_b else 'TK, TN, TB'} + 'BROADCAST_BK': ':, newaxis, newaxis' if not trans_b else 'newaxis, :, newaxis', + 'BROADCAST_BN': 'newaxis, :, newaxis' if not trans_b else ':, newaxis, newaxis', + 'SHAPE_B' : 'TK, TN, TB' if not trans_b else 'TN, TK, TB'} return _einsum.kernel(a, b, c, bmnk[1], bmnk[2], bmnk[3], std0[0], std0[1], std0[2], From 0ec213547c2593ec868c402193ed49da90a735d5 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 27 Oct 2019 15:32:34 -0400 Subject: [PATCH 470/494] [PYTHON][KERNEL] Added benchmarking functionalities for kernels --- lib/codegen/transform/disassociate.cc | 8 +- python/examples/dot.py | 19 ++-- python/examples/einsum_test.py | 17 ++-- python/src/bindings.cc | 49 +++++++--- python/triton/kernel.py | 52 ++++++++++- python/triton/ops/dot.py | 41 ++++----- python/triton/ops/einsum.py | 128 +++++++++++++++----------- python/triton/utils.py | 3 +- tests/bench/dot.cc | 2 +- 9 files changed, 207 insertions(+), 112 deletions(-) diff --git a/lib/codegen/transform/disassociate.cc b/lib/codegen/transform/disassociate.cc index 1134463ec..2244ebccd 100644 --- a/lib/codegen/transform/disassociate.cc +++ b/lib/codegen/transform/disassociate.cc @@ -56,14 +56,12 @@ void disassociate::run(ir::module &mod) { bld.set_insert_point(y); bld.insert(cloned); clone_map[y] = cloned; - // replace in above level - if(depth > 1){ + // replace operands of parents + if(depth > 1) for(ir::user* ux: x.second.at(depth - 1)) clone_map.at((ir::instruction*)ux)->replace_uses_of_with(y, cloned); - } - else{ + else x.first->replace_uses_of_with(y, cloned); - } } depth += 1; } diff --git a/python/examples/dot.py b/python/examples/dot.py index eaa9c2d68..8fd0b35d9 100644 --- a/python/examples/dot.py +++ b/python/examples/dot.py @@ -2,11 +2,11 @@ import numpy as np import triton def run_tf(): - M, N, K = 128, 128, 128 + M, N, K = 2048, 2048, 2048 a = tf.placeholder(tf.float32, shape=[M, K]) b = tf.placeholder(tf.float32, shape=[N, K]) - tr_c = triton.ops.dot(a, b, transpose_a = False, transpose_b = True) - 
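
The predicated write-back *?(checkc)pc = c added to the einsum kernel above stores only the lanes whose row and column indices are in bounds. It has roughly the following NumPy semantics, sketched here with illustrative names rather than the actual Triton lowering:

import numpy as np

def masked_store(C, c, rm, rn):
    M, N = C.shape
    checkm, checkn = rm < M, rn < N   # per-row / per-column bounds checks
    # the mask is the outer product of the two checks, so the valid
    # region of the tile is a rectangle that can be stored directly
    C[np.ix_(rm[checkm], rn[checkn])] = c[np.ix_(checkm, checkn)]
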
tr_d = triton.ops.dot(tr_c, b, transpose_a = True, transpose_b = False) + tr_c = triton.ops.dot(a, b, transpose_a = False, transpose_b = True, bench=10) + tr_d = triton.ops.dot(tr_c, b, transpose_a = True, transpose_b = False, bench=10) tf_c = tf.matmul(a, b, transpose_a = False, transpose_b = True) tf_d = tf.matmul(tf_c, b, transpose_a = True, transpose_b = False) # Gradient @@ -20,15 +20,20 @@ def run_tf(): sess.run(tf.global_variables_initializer()) result = sess.run([tr_da, tf_da], feed_dict = {a: ha, b: hb}) + # Benchmark + nanosec = triton.bench_registry[tr_d] + print('NANOSEC: ', nanosec) + print('TFLOPS:', 2. * M * N * K / nanosec * 1e-3) # Test print(result[0][0]) print(result[1][0]) dif = np.abs(result[0][0] - result[1][0]) print("dif: %f" % np.max(dif)) + def run_torch(): torch.manual_seed(0) - M, N, K = 128, 128, 128 + M, N, K = 2048, 2048, 2048 a = torch.randn(M, K).cuda() b = torch.randn(K, N).cuda() a.requires_grad_(True) @@ -37,9 +42,8 @@ def run_torch(): torch_d = torch.matmul(torch.t(torch_c), b) torch_y = torch.mean(torch_d) triton_c = triton.ops.dot(a, b, False, True) - triton_d = triton.ops.dot(triton_c, b, True, False) + triton_d = triton.ops.dot(triton_c, b, True, False, 1) triton_y = torch.mean(triton_d) - # torch gradient torch_y.backward() torch_da = a.grad.clone() @@ -51,6 +55,9 @@ def run_torch(): triton_da = a.grad.clone() triton_db = b.grad.clone() + nanosec = triton.bench_registry[triton_d] + print(nanosec) + print('TFLOPS:', 2. * M * N * K / nanosec * 1e-3) print('Diff DA:', (torch_da - triton_da).max()) print('Diff DB:', (torch_db - triton_db).max()) diff --git a/python/examples/einsum_test.py b/python/examples/einsum_test.py index 799efbf70..4a7c2f2c7 100644 --- a/python/examples/einsum_test.py +++ b/python/examples/einsum_test.py @@ -12,7 +12,8 @@ from tensorflow.python.ops import gradient_checker one = 0 out = 0 -bench = 0 +bench = 10 + class ProdKeyTest(tf.test.TestCase): def testEinsum(self): @@ -36,9 +37,9 @@ class ProdKeyTest(tf.test.TestCase): # key_dim = 16 for a_shape, b_shape, c_shape, einsum in [ - [ [ 4, 8, 8 ], [ 8, 8 ], [ 4, 8, 8 ], "btc,ck->btk" ], - [ [ 4, 1024, 1024 ], [ 1024, 512 ], [ 4, 1024, 512 ], "btc,ck->btk" ], - [ (batch_dim, ctx_dim, head_dim, 2, key_dim//2),(head_dim, 2, n_keys, key_dim//2), (batch_dim, ctx_dim, head_dim, 2, n_keys), "bchak,hank->bchan" ], + #[ [ 4, 8, 8 ], [ 8, 8 ], [ 4, 8, 8 ], "btc,ck->btk" ], + [ [4, 2048, 2048 ], [ 2048, 2048 ], [4, 2048, 2048 ], "btc,ck->btk" ], + #[ (batch_dim, ctx_dim, head_dim, 2, key_dim//2),(head_dim, 2, n_keys, key_dim//2), (batch_dim, ctx_dim, head_dim, 2, n_keys), "bchak,hank->bchan" ], ]: if one: @@ -57,7 +58,7 @@ class ProdKeyTest(tf.test.TestCase): e = tf.placeholder(tf.float32, c_shape, name="e") feed_dict = { a:A, b:B, e:E } - cc = triton.ops.einsum(einsum, a, b) + cc = triton.ops.einsum(einsum, a, b, bench=bench) # error = gradient_checker.compute_gradient_error(a, a_shape, c, c_shape, delta=1e-1, extra_feed_dict={ b:B }) # # print(error) @@ -71,8 +72,12 @@ class ProdKeyTest(tf.test.TestCase): # c, = sess.run( [ c, ], feed_dict ) c, da, db = sess.run( [ cc, da, db ], feed_dict ) - if bench == 0: + if bench > 0: + nanosec = triton.bench_registry[cc] + print(A.shape, B.shape) + print(nanosec) + else: C = np.einsum(einsum, A, B) id = cc.op.get_attr('id') ctx = triton.ops._einsum.contexts[id] diff --git a/python/src/bindings.cc b/python/src/bindings.cc index 969f74df4..59b5c54d6 100644 --- a/python/src/bindings.cc +++ b/python/src/bindings.cc @@ -20,13 +20,13 @@ using namespace 
triton; namespace rt = triton::runtime; - -/* TF triton op properties */ - std::map> id_grid_map; std::map> id_fn_map; +std::map fp64scalar_map; std::map i64scalar_map; +/* Grid map */ + void register_grid(size_t id, const rt::function::grid_fn_ty& grid_fn) { id_grid_map[id].reset(new rt::function::grid_fn_ty(grid_fn)); @@ -36,6 +36,8 @@ void delete_grid(size_t id) { id_grid_map.erase(id); } +/* Function map */ + void register_fn(size_t id, const std::string& src, const rt::function::options_space_t& opt) { @@ -56,8 +58,11 @@ size_t make_op_id() { return id_fn_map.size(); } +/* TF scalar wrapper */ size_t make_scalar_id() { - return i64scalar_map.size(); + size_t ret = i64scalar_map.size(); + i64scalar_map[ret] = int64_t(); + return ret; } bool has_scalar(size_t id) { @@ -135,8 +140,9 @@ void gen_make_handles(std::ostream &os, const std::vector& args) } } -void gen_make_launch_function(std::ostream &os, const std::vector& args) { - os << " (*id_fn_map.at(id_))({"; +void gen_make_launch_function(std::ostream &os, int num_outputs, const std::vector& args) { + os << " std::function run = [&](){\n "; + os << " (*id_fn_map.at(id_))({"; for(unsigned i = 0; i < args.size() ; i++){ ir::argument *arg = args[i]; std::string name = arg->get_name(); @@ -146,7 +152,11 @@ void gen_make_launch_function(std::ostream &os, const std::vector os << ", "; os << name; } - os << "}, *id_grid_map.at(id_), stream); \n"; + os << "}, *id_grid_map.at(id_), stream);\n"; + os << " };\n "; + os << " run();"; + os << " if(bench_ > 0)\n "; + os << " i64scalar_map[id_] = triton::tools::bench(run, stream);\n "; } void gen_tf_register_kernel_builder(std::ostream &os, const std::string &name, @@ -186,7 +196,9 @@ void gen_tf_register_op(std::ostream &os, const std::string &name, throw std::runtime_error("unknown output"); os << " .Output(\"out" << i << ": T" << idx << "\")\n"; } - os << " .Attr(\"id: int\")" << std::endl; + os << " .Attr(\"id: int\")\n"; + os << " .Attr(\"bench: int\")\n"; + os << " .Attr(\"bench_id: int\")\n"; os << ";\n"; } @@ -247,6 +259,7 @@ std::tuple> id_grid_map; extern std::map> id_fn_map; - +extern std::map i64scalar_map; class )" << opname << R"(: public OpKernel { public: explicit )" << opname << R"((OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("id", &id_)); + OP_REQUIRES_OK(context, context->GetAttr("bench", &bench_)); + OP_REQUIRES_OK(context, context->GetAttr("bench_id", &bench_id_)); } void Compute(OpKernelContext* context){ @@ -291,12 +306,14 @@ oss << R"( oss << R"( // launch function )"; -gen_make_launch_function(oss, fn->args()); +gen_make_launch_function(oss, outputs.size(), fn->args()); oss << R"( } private: int id_; + int bench_; + int bench_id_; }; // register kernel builder @@ -379,6 +396,7 @@ void gen_torch_signature(std::ostringstream& oss, oss << ret_ty << " " << name << "("; oss << "int64_t id, "; + oss << "int64_t bench, "; for(size_t i = 0; i < args.size(); i++) { ir::argument* arg = args[i]; if(i > 0) @@ -420,7 +438,8 @@ void gen_torch_make_handles(std::ostream &os, } void gen_torch_make_launch_function(std::ostream &os, const std::vector& args) { - os << " (*id_fn_map.at(id))({"; + os << " std::function run = [&](){\n "; + os << " (*id_fn_map.at(id))({"; for(unsigned i = 0; i < args.size() ; i++){ ir::argument *arg = args[i]; std::string name = "arg_" + arg->get_name(); @@ -431,7 +450,11 @@ void gen_torch_make_launch_function(std::ostream &os, const std::vector 0)\n "; + os << " i64scalar_map[id] = triton::tools::bench(run, 
stream);\n "; + } void gen_torch_ret(std::ostream &os, const std::vector& outputs) { if(outputs.size() == 1){ @@ -465,6 +488,7 @@ std::tuple> id_grid_map; extern std::map> id_fn_map; +extern std::map i64scalar_map; )"; diff --git a/python/triton/kernel.py b/python/triton/kernel.py index 50ade154e..3a71d0ecd 100644 --- a/python/triton/kernel.py +++ b/python/triton/kernel.py @@ -5,6 +5,7 @@ import shutil import hashlib import sysconfig import sys +import weakref # import for just-in-time compilation import distutils import setuptools.command.build_ext @@ -176,6 +177,38 @@ def _make_grid(args) : return grid +class bench_dict: + + # Lazy entry for e.g., tensorflow, when value of benchmark is + # not known at graph compile time + class lazy_entry: + def __init__(self, id): + self.id = id + + def get(self): + return libtriton.retrieve_scalar(self.id) + + + def __init__(self): + self.data = dict() + + def __delitem__(self, key): + del self.data[id(key)] + + def __getitem__(self, key): + ret = self.data[id(key)] + if isinstance(ret, bench_dict.lazy_entry): + return ret.get() + return ret + + def __len__(self): + return len(self.data) + + def __setitem__(self, key, value): + self.data[id(key)] = value + +bench_registry = bench_dict() + class kernel: def __init__(self, src, outputs): @@ -200,7 +233,7 @@ class kernel: defines.append((k, values)) opt = libtriton.options_space() opt.defines = defines - opt.num_warps = [4] + opt.num_warps = [2, 4, 8] # create unique id for this op op_id = libtriton.make_op_id() self.fw_id[key] = op_id @@ -209,6 +242,10 @@ class kernel: if self.fw_op is None: self.fw_op = _make_framework_op(self.src, self.outputs, opt) + # benchmarking info + bench = 0 + if 'bench' in kwargs: + bench = kwargs['bench'] # retrieve framework op op_id = self.fw_id[key] # register grid @@ -217,9 +254,16 @@ class kernel: op_args = [x.handle if isinstance(x, triton.utils.scalar) else x for x in args[:-1]] # call framework function if fw.has_tensorflow(): - return self.fw_op(*op_args, id=op_id) + bench_id = libtriton.make_scalar_id() if bench > 0 else 0 + ret = self.fw_op(*op_args, id=op_id, bench=bench, bench_id=bench_id) + if bench > 0: + bench_registry[ret] = bench_dict.lazy_entry(bench_id) + elif fw.has_torch(): args = [x.contiguous() if isinstance(x, fw.torch.Tensor) else x for x in op_args] - return self.fw_op(op_id, *args) + ret = self.fw_op(op_id, bench, *args) + if bench > 0: + bench_registry[ret] = libtriton.retrieve_scalar(op_id) else: - assert False \ No newline at end of file + assert False + return ret \ No newline at end of file diff --git a/python/triton/ops/dot.py b/python/triton/ops/dot.py index b37f2e32a..7a5069701 100644 --- a/python/triton/ops/dot.py +++ b/python/triton/ops/dot.py @@ -11,38 +11,36 @@ void dot(TYPE * A, TYPE * B, TYPE * C, // prologue int ridx = get_program_id(0); int ridy = get_program_id(1); - int rxa[TM] = ridx * TM + 0 ... TM; - int ryb[TN] = ridy * TN + 0 ... TN; - int rka[TK] = 0 ... TK; - int rkb[TK] = 0 ... TK; + int rm[TM] = ridx * TM + 0 ... TM; + int rn[TN] = ridy * TN + 0 ... TN; + int rk[TK] = 0 ... 
TK; float c[TM, TN] = 0; // pointers to operands - TYPE* pa[SHAPE_A] = A + rka[BROADCAST_AK] * STRIDE_AK + rxa[BROADCAST_AM] * STRIDE_AM; - TYPE* pb[SHAPE_B] = B + rkb[BROADCAST_BK] * STRIDE_BK + ryb[BROADCAST_BN] * STRIDE_BN; + TYPE* pa[SHAPE_A] = A + rk[BROADCAST_AK] * STRIDE_AK + rm[BROADCAST_AM] * STRIDE_AM; + TYPE* pb[SHAPE_B] = B + rk[BROADCAST_BK] * STRIDE_BK + rn[BROADCAST_BN] * STRIDE_BN; // prefetches operands - TYPE a[SHAPE_A] = (*pa); - TYPE b[SHAPE_B] = (*pb); + TYPE a[SHAPE_A] = *pa; + TYPE b[SHAPE_B] = *pb; // reduction loop for(int k = K; k > 0; k-= TK){ c += USE_A @ USE_B; pa = pa + TK * STRIDE_AK; pb = pb + TK * STRIDE_BK; - a = *pa; - b = *pb; + bool checka[SHAPE_A] = k > TK; + bool checkb[SHAPE_B] = k > TK; + a = checka ? *pa : 0; + b = checkb ? *pb : 0; } // epilogue - int rxc[TM] = ridx * TM + 0 ... TM; - int ryc[TN] = ridy * TN + 0 ... TN; - TYPE* pc[TM, TN] = C + ryc[newaxis, :] + rxc[:, newaxis] * ldc; - bool checkc[TM, TN] = (rxc < M)[:, newaxis] && (ryc < N)[newaxis, :]; - *?(checkc) pc = c; + TYPE* pc[TM, TN] = C + rm[:, newaxis] * ldc + rn[newaxis, :]; + *pc = c; } """ kernel = triton.kernel(src, ['C']) @staticmethod - def _call(a, b, transpose_a, transpose_b): + def _call(a, b, transpose_a, transpose_b, bench = 0): # extract shapes shape_a = triton.shape(a) shape_b = triton.shape(b) @@ -78,16 +76,17 @@ void dot(TYPE * A, TYPE * B, TYPE * C, 'BROADCAST_BK': 'newaxis, :' if transpose_b else ':, newaxis', 'BROADCAST_BN': ':, newaxis' if transpose_b else 'newaxis, :', 'SHAPE_B' : 'TN, TK' if transpose_b else 'TK, TN'} - return _dot.kernel(a, b, c, M, N, Ka, lda, ldb, ldc, grid, + return _dot.kernel(a, b, c, M, N, Ka, lda, ldb, ldc, + grid, bench=bench, AT = transpose_a, BT = transpose_b, TYPE = dtype, - TM = [128], TN = [128], TK = [8], **macros) + TM = [64, 128], TN = [64, 128], TK = [8], **macros) @staticmethod - def forward(ctx, a, b, transpose_a = False, transpose_b = False): + def forward(ctx, a, b, transpose_a = False, transpose_b = False, bench = 0): ctx.save_for_backward(a, b) ctx.t_a = transpose_a ctx.t_b = transpose_b - return _dot._call(a, b, transpose_a, transpose_b) + return _dot._call(a, b, transpose_a, transpose_b, bench) @staticmethod def backward(ctx, dy): @@ -108,5 +107,5 @@ void dot(TYPE * A, TYPE * B, TYPE * C, else: assert False return da, db, None, None, None, None, None, None, None - + dot = _dot.apply \ No newline at end of file diff --git a/python/triton/ops/einsum.py b/python/triton/ops/einsum.py index 7f4457d99..d6207c194 100644 --- a/python/triton/ops/einsum.py +++ b/python/triton/ops/einsum.py @@ -2,52 +2,58 @@ import triton +import math class _einsum(triton.function): src = """ - void einsum_(TYPE * A, TYPE * B, TYPE * C, - int dim_M, int dim_N, int dim_K, - int std_A0, int std_B0, int std_C0, - int std_A1, int std_B1, int std_C1) { - // program id - int pgm = get_program_id(0); - int pgn = get_program_id(1); - int pgb = get_program_id(2); - // range - int rm[TM] = pgm * TM + 0 ... TM; - int rn[TN] = pgn * TN + 0 ... TN; - int rb[TB] = pgb * TB + 0 ... TB; - int rk[TK] = 0 ... 
TK; - // accumulator - TYPE c[TM, TN, TB] = 0; - // pointers to a - TYPE *pa[SHAPE_A] = A + rk[BROADCAST_AK] * STRIDE_AK - + rm[BROADCAST_AM] * STRIDE_AM - + rb[newaxis, newaxis, :] * std_A0; - // pointers to b - TYPE *pb[SHAPE_B] = B + rk[BROADCAST_BK] * STRIDE_BK - + rn[BROADCAST_BN] * STRIDE_BN - + rb[newaxis, newaxis, :] * std_B0; - // accumulation - for(int k = dim_K; k > 0; k -= TK) { - TYPE a[SHAPE_A] = *pa; - TYPE b[SHAPE_B] = *pb; - c += USE_A @ USE_B; - pa += TK * STRIDE_AK; - pb += TK * STRIDE_BK; - } - // write-back - TYPE *pc[TM, TN, TB] = C + rm[:, newaxis, newaxis] * std_C1 - + rn[newaxis, :, newaxis] * 1 - + rb[newaxis, newaxis, :] * std_C0; - bool checkm[TM] = rm < dim_M; - bool checkn[TN] = rn < dim_N; - bool checkc[TM, TN, TB] = checkm[:, newaxis, newaxis] && - checkn[newaxis, :, newaxis]; - *?(checkc)pc = c; +void einsum_(TYPE * A, TYPE * B, TYPE * C, + int dim_M, int dim_N, int dim_K, + int std_A0, int std_B0, int std_C0, + int std_A1, int std_B1, int std_C1) { + // program id + int pgm = get_program_id(0); + int pgn = get_program_id(1); + int pgb = get_program_id(2); + // range + int rm[TM] = pgm * TM + 0 ... TM; + int rn[TN] = pgn * TN + 0 ... TN; + int rb[TB] = pgb * TB + 0 ... TB; + int rk[TK] = 0 ... TK; + // accumulator + TYPE c[TM, TN, TB] = 0; + // pointers to a + TYPE *pa[SHAPE_A] = A + rk[BROADCAST_AK] * STRIDE_AK + + rm[BROADCAST_AM] * STRIDE_AM + + rb[newaxis, newaxis, :] * std_A0; + // pointers to b + TYPE *pb[SHAPE_B] = B + rk[BROADCAST_BK] * STRIDE_BK + + rn[BROADCAST_BN] * STRIDE_BN + + rb[newaxis, newaxis, :] * std_B0; + // prefetch + TYPE a[SHAPE_A] = *pa; + TYPE b[SHAPE_B] = *pb; + // accumulation + for(int k = dim_K; k > 0; k -= TK) { + c += USE_A @ USE_B; + pa += TK * STRIDE_AK; + pb += TK * STRIDE_BK; + bool checka[SHAPE_A] = k > TK; + bool checkb[SHAPE_B] = k > TK; + a = checka ? *pa : 0; + b = checkb ? 
*pb : 0; } - """ + // write-back + TYPE *pc[TM, TN, TB] = C + rm[:, newaxis, newaxis] * std_C1 + + rn[newaxis, :, newaxis] * 1 + + rb[newaxis, newaxis, :] * std_C0; + bool checkm[TM] = rm < dim_M; + bool checkn[TN] = rn < dim_N; + bool checkc[TM, TN, TB] = checkm[:, newaxis, newaxis] && + checkn[newaxis, :, newaxis]; + *?(checkc)pc = c; +} +""" kernel = triton.kernel(src, ['C']) @@ -134,7 +140,8 @@ class _einsum(triton.function): @staticmethod def call(a, b, trans_a, trans_b, shape_c, bmnk, - std0, std1, einsum_a, einsum_b, einsum_c): + std0, std1, einsum_a, einsum_b, einsum_c, + bench): dtype = a.dtype c = triton.empty(shape_c, dtype) grid = lambda opt: [triton.cdiv(bmnk[1], opt.d('TM')), @@ -154,16 +161,22 @@ class _einsum(triton.function): 'BROADCAST_BK': ':, newaxis, newaxis' if not trans_b else 'newaxis, :, newaxis', 'BROADCAST_BN': 'newaxis, :, newaxis' if not trans_b else ':, newaxis, newaxis', 'SHAPE_B' : 'TK, TN, TB' if not trans_b else 'TN, TK, TB'} - return _einsum.kernel(a, b, c, + TM = [2**i for i in range(5, max(6, min(8, int(math.log2(bmnk[1]) + 1 ))))] + TN = [2**i for i in range(5, max(6, min(8, int(math.log2(bmnk[2]) + 1 ))))] + TB = [2**i for i in range(0, max(1, min(3, int(math.log2(bmnk[0]) + 1 ))))] + print(TM) + print(TN) + return _einsum.kernel(a, b, c, bmnk[1], bmnk[2], bmnk[3], std0[0], std0[1], std0[2], std1[0], std1[1], std1[2], - grid, **macros, - TYPE='float', TM=32, TN=32, TK=8, TB=1) + grid, bench=bench, + **macros, + TYPE='float', TM=TM, TN=TN, TK=8, TB=TB) @staticmethod - def forward(ctx, subscripts, a, b): + def forward(ctx, subscripts, a, b, **kwargs): ctx.save_for_backward(a, b) if type(subscripts) is str: einsum_a, einsum_bc = subscripts.split(",") @@ -173,14 +186,16 @@ class _einsum(triton.function): shape_c, bmnk, std0, std1, ta, tb = _einsum._parse_einsum( einsum_a, einsum_b, einsum_c, - a.shape.as_list(), b.shape.as_list() + triton.shape(a), triton.shape(b) ) + bench = kwargs['bench'] if 'bench' in kwargs else 0 ctx.trans_a = ta ctx.trans_b = tb ctx.einsum_a = einsum_a ctx.einsum_b = einsum_b ctx.einsum_c = einsum_c - return _einsum.call(a, b, ta, tb, shape_c, bmnk, std0, std1, einsum_a, einsum_b, einsum_c) + ctx.bench = bench + return _einsum.call(a, b, ta, tb, shape_c, bmnk, std0, std1, einsum_a, einsum_b, einsum_c, bench) @staticmethod @@ -191,22 +206,23 @@ class _einsum(triton.function): einsum_a = ctx.einsum_a einsum_b = ctx.einsum_b einsum_c = ctx.einsum_c + bench = ctx.bench if not trans_a and not trans_b: # NN - da = einsum((einsum_c, einsum_b, einsum_a), dc, b) - db = einsum((einsum_a, einsum_c, einsum_b), a, dc) + da = einsum((einsum_c, einsum_b, einsum_a), dc, b, bench=bench) + db = einsum((einsum_a, einsum_c, einsum_b), a, dc, bench=bench) elif not trans_a and trans_b: # NT - da = einsum((einsum_c, einsum_b, einsum_a), dc, b) - db = einsum((einsum_c, einsum_a, einsum_b), dc, a) + da = einsum((einsum_c, einsum_b, einsum_a), dc, b, bench=bench) + db = einsum((einsum_c, einsum_a, einsum_b), dc, a, bench=bench) elif trans_a and not trans_b: # TN - da = einsum((einsum_b, einsum_c, einsum_a), b, dc) - db = einsum((einsum_a, einsum_c, einsum_b), a, dc) + da = einsum((einsum_b, einsum_c, einsum_a), b, dc, bench=bench) + db = einsum((einsum_a, einsum_c, einsum_b), a, dc, bench=bench) elif trans_a and trans_b: # TT (not used) - da = einsum((einsum_b, einsum_c, einsum_a), b, dc) - db = einsum((einsum_c, einsum_a, einsum_b), dc, a) + da = einsum((einsum_b, einsum_c, einsum_a), b, dc, bench=bench) + db = einsum((einsum_c, einsum_a, einsum_b), dc, a, 
bench=bench) return da, db, None, None, None, None, None, None, None, None, None, None diff --git a/python/triton/utils.py b/python/triton/utils.py index 127d67364..d5f5f4129 100644 --- a/python/triton/utils.py +++ b/python/triton/utils.py @@ -22,7 +22,8 @@ class lazy_shape: def shape(A) : if fw.has_tensorflow(): - return lazy_shape(fw.tensorflow.shape(A)) + return A.shape.as_list() + #return lazy_shape(fw.tensorflow.shape(A)) elif fw.has_torch(): return A.shape else: diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index 927f0044b..c87e1c938 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -34,7 +34,7 @@ int main() { for(const auto& c: configs){ std::tie(ord, AT, BT, M, N, K) = c; std::cout << "// " << c << std::flush; - for(auto perf: bench_dot(stream, HALF, AT, BT, M, N, K, ord, ord)) + for(auto perf: bench_dot(stream, FLOAT, AT, BT, M, N, K, ord, ord)) std::cout << ", " << perf << std::flush; std::cout << std::endl; } From e9c787ef054c96b5038f5023b51463e93a4b6ef7 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 28 Oct 2019 11:33:18 -0400 Subject: [PATCH 471/494] [PYTHON][EINSUM] Added support for FP16 --- include/triton/tools/bench.hpp | 2 +- lib/codegen/analysis/layout.cc | 4 +-- lib/codegen/selection/generator.cc | 5 ++- python/examples/dot.py | 6 ++-- python/examples/einsum_test.py | 51 ++++++++++++++++-------------- python/setup.py | 1 + python/triton/function.py | 8 ++++- python/triton/kernel.py | 36 ++------------------- python/triton/ops/einsum.py | 20 +++++++----- python/triton/utils.py | 29 +++++++++++++++++ tests/bench/dot.cc | 6 ++-- tests/common/dot.h | 8 ++--- tests/common/src/dot.h | 4 +-- 13 files changed, 97 insertions(+), 83 deletions(-) diff --git a/include/triton/tools/bench.hpp b/include/triton/tools/bench.hpp index 554b3bcc3..48a4ab972 100644 --- a/include/triton/tools/bench.hpp +++ b/include/triton/tools/bench.hpp @@ -38,7 +38,7 @@ inline double bench(std::function const & op, driver::stream * stream) double total_time = 0; op(); stream->synchronize(); - while(total_time*1e-9 < 1e-3){ + while(total_time*1e-9 < 1e-2){ float norm = 1; // normalize clock if possible to reduce noise in auto-tuning if(auto cu_device = dynamic_cast(stream->context()->device())) diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc index 6f717d77c..70ca9e3b2 100644 --- a/lib/codegen/analysis/layout.cc +++ b/lib/codegen/analysis/layout.cc @@ -314,11 +314,11 @@ layout_shared_t::layout_shared_t(const layout_t *arg, // padding pad = 0; if(hmma_dot_a){ - bool row = is_trans(hmma_dot_a) ^ order[0] == 1; + bool row = is_trans(hmma_dot_a) ^ order[0] != 0; pad = 24 - shapes[row ? order[0] : order[1]] % 32; } else if(hmma_dot_b){ - bool row = is_trans(hmma_dot_b) ^ order[0] == 1; + bool row = is_trans(hmma_dot_b) ^ order[0] != 0; pad = 24 - shapes[row ? 
order[1] : order[0]] % 32; } else if(order != arg->order) { diff --git a/lib/codegen/selection/generator.cc b/lib/codegen/selection/generator.cc index 1ff4287eb..2efa834a8 100644 --- a/lib/codegen/selection/generator.cc +++ b/lib/codegen/selection/generator.cc @@ -560,9 +560,8 @@ void generator::visit_hmma_dot(ir::dot_inst* dot, shared_tile *TA, shared_tile * bool is_a_trans = is_trans(dot->get_operand(0)); bool is_b_trans = is_trans(dot->get_operand(1)); - bool is_a_row = is_a_trans ^ (ord_a[0] == 1); - bool is_b_row = is_b_trans ^ (ord_b[0] == 1); - + bool is_a_row = is_a_trans ^ (ord_a[0] != 0); + bool is_b_row = is_b_trans ^ (ord_b[0] != 0); Value *offset_a_i = hmma->offset_a_i_; Value *offset_a_k = hmma->offset_a_k_; diff --git a/python/examples/dot.py b/python/examples/dot.py index 8fd0b35d9..425fed986 100644 --- a/python/examples/dot.py +++ b/python/examples/dot.py @@ -5,8 +5,8 @@ def run_tf(): M, N, K = 2048, 2048, 2048 a = tf.placeholder(tf.float32, shape=[M, K]) b = tf.placeholder(tf.float32, shape=[N, K]) - tr_c = triton.ops.dot(a, b, transpose_a = False, transpose_b = True, bench=10) - tr_d = triton.ops.dot(tr_c, b, transpose_a = True, transpose_b = False, bench=10) + tr_c = triton.ops.dot(a, b, transpose_a = False, transpose_b = True, bench=1) + tr_d = triton.ops.dot(tr_c, b, transpose_a = True, transpose_b = False, bench=1) tf_c = tf.matmul(a, b, transpose_a = False, transpose_b = True) tf_d = tf.matmul(tf_c, b, transpose_a = True, transpose_b = False) # Gradient @@ -23,7 +23,7 @@ def run_tf(): # Benchmark nanosec = triton.bench_registry[tr_d] print('NANOSEC: ', nanosec) - print('TFLOPS:', 2. * M * N * K / nanosec * 1e-3) + #print('TFLOPS:', 2. * M * N * K / nanosec * 1e-3) # Test print(result[0][0]) print(result[1][0]) diff --git a/python/examples/einsum_test.py b/python/examples/einsum_test.py index 4a7c2f2c7..3363a88ea 100644 --- a/python/examples/einsum_test.py +++ b/python/examples/einsum_test.py @@ -12,7 +12,7 @@ from tensorflow.python.ops import gradient_checker one = 0 out = 0 -bench = 10 +bench = 0 class ProdKeyTest(tf.test.TestCase): @@ -37,14 +37,14 @@ class ProdKeyTest(tf.test.TestCase): # key_dim = 16 for a_shape, b_shape, c_shape, einsum in [ - #[ [ 4, 8, 8 ], [ 8, 8 ], [ 4, 8, 8 ], "btc,ck->btk" ], - [ [4, 2048, 2048 ], [ 2048, 2048 ], [4, 2048, 2048 ], "btc,ck->btk" ], - #[ (batch_dim, ctx_dim, head_dim, 2, key_dim//2),(head_dim, 2, n_keys, key_dim//2), (batch_dim, ctx_dim, head_dim, 2, n_keys), "bchak,hank->bchan" ], + [ [ 4, 8, 8 ], [ 8, 8 ], [ 4, 8, 8 ], "btc,ck->btk" ], + [ [4, 1024, 1024], [ 1024, 1024 ], [4, 1024, 1024 ], "btc,ck->btk" ], + [ (batch_dim, ctx_dim, head_dim, 2, key_dim//2),(head_dim, 2, n_keys, key_dim//2), (batch_dim, ctx_dim, head_dim, 2, n_keys), "bchak,hank->bchan" ], ]: if one: - A = np.ones(a_shape, dtype=np.float32) - B = np.ones(b_shape, dtype=np.float32) + A = np.ones(a_shape, dtype=np.float16).astype(np.float32) + B = np.ones(b_shape, dtype=np.float16).astype(np.float32) E = np.ones(c_shape, dtype=np.float32) else: # QK = np.random.normal(loc=0.0, scale=1.0, size=qk_shape).astype(np.float16).astype(np.float32) @@ -53,12 +53,14 @@ class ProdKeyTest(tf.test.TestCase): B = np.random.uniform(-1.0, 1.0, b_shape).astype(np.float16).astype(np.float32) E = np.random.uniform(-1.0, 1.0, c_shape).astype(np.float16).astype(np.float32) - a = tf.placeholder(tf.float32, a_shape, name="a") - b = tf.placeholder(tf.float32, b_shape, name="b") - e = tf.placeholder(tf.float32, c_shape, name="e") - feed_dict = { a:A, b:B, e:E } + a = 
tf.placeholder(tf.float16, a_shape, name="a") + b = tf.placeholder(tf.float16, b_shape, name="b") + e = tf.placeholder(tf.float16, c_shape, name="e") + feed_dict = { a: A.astype(np.float16), + b: B.astype(np.float16), + e: E } - cc = triton.ops.einsum(einsum, a, b, bench=bench) + c = triton.ops.einsum(einsum, a, b, bench=bench) # error = gradient_checker.compute_gradient_error(a, a_shape, c, c_shape, delta=1e-1, extra_feed_dict={ b:B }) # # print(error) @@ -66,21 +68,24 @@ class ProdKeyTest(tf.test.TestCase): # print(error) # return - with tf.control_dependencies([cc.op]): - da, db = tf.gradients(cc, [a, b], e) + with tf.control_dependencies([c.op]): + da, db = tf.gradients(c, [a, b], e) # c, = sess.run( [ c, ], feed_dict ) - c, da, db = sess.run( [ cc, da, db ], feed_dict ) - + rc, rda, rdb = sess.run( [ c, da, db ], feed_dict ) + if bench > 0: - nanosec = triton.bench_registry[cc] - print(A.shape, B.shape) - print(nanosec) + nanosec = triton.bench_registry[c] + ctx = triton.ctx_registry[c] + b, m, n, k = tuple((ctx.bmnk[i] for i in range(0, 4))) + ops = 2. * b * m * n * k + print('C TFLOPS:', ops / triton.bench_registry[c] * 1e-3) + print('DA TFLOPS:', ops / triton.bench_registry[da] * 1e-3) + print('DB TFLOPS:', ops / triton.bench_registry[db] * 1e-3) else: C = np.einsum(einsum, A, B) - id = cc.op.get_attr('id') - ctx = triton.ops._einsum.contexts[id] + ctx = triton.ctx_registry[c] t_a = ctx.trans_a t_b = ctx.trans_b e_a = ctx.einsum_a @@ -100,9 +105,9 @@ class ProdKeyTest(tf.test.TestCase): print("testProdKey", einsum) if not bench: for op, dev, cpu in [ - [ "C", c, C ], - [ "DA", da, DA ], - [ "DB", db, DB ], + [ "C", rc, C ], + [ "DA", rda, DA ], + [ "DB", rdb, DB ], ]: self.compare_results(op, dev, cpu) diff --git a/python/setup.py b/python/setup.py index ea1568b2f..060a1c450 100644 --- a/python/setup.py +++ b/python/setup.py @@ -77,6 +77,7 @@ class CMakeBuild(build_ext): pass cfg = 'Debug' if self.debug else 'Release' + cfg = 'Release' build_args = ['--config', cfg] if platform.system() == "Windows": diff --git a/python/triton/function.py b/python/triton/function.py index 125cad668..79a0e5ec8 100644 --- a/python/triton/function.py +++ b/python/triton/function.py @@ -1,4 +1,5 @@ import triton.frameworks as fw +import triton.utils class OpContext(object): @@ -16,6 +17,8 @@ class function_meta(type): cls.registered = False return super(function_meta, cls).__init__(name, bases, attrs) +ctx_registry = triton.utils.id_dict() + class function(metaclass = function_meta): @staticmethod @@ -31,7 +34,9 @@ class function(metaclass = function_meta): class TorchFunction(fw.torch.autograd.Function): @staticmethod def forward(ctx, *targs, **tkwargs): - return cls.forward(ctx, *targs, **tkwargs) + y = cls.forward(ctx, *targs, **tkwargs) + ctx_registry[y] = ctx + return y @staticmethod def backward(ctx, grad_output): return cls.backward(ctx, grad_output) @@ -43,6 +48,7 @@ class function(metaclass = function_meta): result = cls.forward(ctx, *args, **kwargs) id = result.op.get_attr('id') cls.contexts[id] = ctx + ctx_registry[result] = ctx # register backward name = result.op.op_def.name if not cls.registered: diff --git a/python/triton/kernel.py b/python/triton/kernel.py index 3a71d0ecd..57e0afc13 100644 --- a/python/triton/kernel.py +++ b/python/triton/kernel.py @@ -177,37 +177,7 @@ def _make_grid(args) : return grid -class bench_dict: - - # Lazy entry for e.g., tensorflow, when value of benchmark is - # not known at graph compile time - class lazy_entry: - def __init__(self, id): - self.id = id - - 
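
The TFLOPS figures printed by the tests above all derive from the same conversion: a batched matmul performs 2*b*m*n*k floating-point operations, and one operation per nanosecond equals one GFLOPS. A one-line sketch of that arithmetic, with bmnk and nanosec as reported by ctx_registry and bench_registry:

def tflops(bmnk, nanosec):
    b, m, n, k = bmnk
    ops = 2. * b * m * n * k      # one multiply + one add per inner-product term
    return ops / nanosec * 1e-3   # ops/ns = GFLOPS, so scale down to TFLOPS
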
def get(self): - return libtriton.retrieve_scalar(self.id) - - - def __init__(self): - self.data = dict() - - def __delitem__(self, key): - del self.data[id(key)] - - def __getitem__(self, key): - ret = self.data[id(key)] - if isinstance(ret, bench_dict.lazy_entry): - return ret.get() - return ret - - def __len__(self): - return len(self.data) - - def __setitem__(self, key, value): - self.data[id(key)] = value - -bench_registry = bench_dict() +bench_registry = triton.utils.id_dict() class kernel: @@ -233,7 +203,7 @@ class kernel: defines.append((k, values)) opt = libtriton.options_space() opt.defines = defines - opt.num_warps = [2, 4, 8] + opt.num_warps = [4] # create unique id for this op op_id = libtriton.make_op_id() self.fw_id[key] = op_id @@ -257,7 +227,7 @@ class kernel: bench_id = libtriton.make_scalar_id() if bench > 0 else 0 ret = self.fw_op(*op_args, id=op_id, bench=bench, bench_id=bench_id) if bench > 0: - bench_registry[ret] = bench_dict.lazy_entry(bench_id) + bench_registry[ret] = triton.utils.id_dict.lazy_entry(bench_id) elif fw.has_torch(): args = [x.contiguous() if isinstance(x, fw.torch.Tensor) else x for x in op_args] diff --git a/python/triton/ops/einsum.py b/python/triton/ops/einsum.py index d6207c194..7bbc18f5a 100644 --- a/python/triton/ops/einsum.py +++ b/python/triton/ops/einsum.py @@ -7,10 +7,14 @@ import math class _einsum(triton.function): src = """ -void einsum_(TYPE * A, TYPE * B, TYPE * C, +void einsumk(TYPE * A, TYPE * B, TYPE * C, int dim_M, int dim_N, int dim_K, - int std_A0, int std_B0, int std_C0, - int std_A1, int std_B1, int std_C1) { + int std_A0 __multipleof(8), + int std_B0 __multipleof(8), + int std_C0 __multipleof(8), + int std_A1 __multipleof(8), + int std_B1 __multipleof(8), + int std_C1 __multipleof(8)) { // program id int pgm = get_program_id(0); int pgn = get_program_id(1); @@ -21,7 +25,7 @@ void einsum_(TYPE * A, TYPE * B, TYPE * C, int rb[TB] = pgb * TB + 0 ... TB; int rk[TK] = 0 ... 
TK; // accumulator - TYPE c[TM, TN, TB] = 0; + float c[TM, TN, TB] = 0; // pointers to a TYPE *pa[SHAPE_A] = A + rk[BROADCAST_AK] * STRIDE_AK + rm[BROADCAST_AM] * STRIDE_AM @@ -51,7 +55,7 @@ void einsum_(TYPE * A, TYPE * B, TYPE * C, bool checkn[TN] = rn < dim_N; bool checkc[TM, TN, TB] = checkm[:, newaxis, newaxis] && checkn[newaxis, :, newaxis]; - *?(checkc)pc = c; + *?(checkc)pc = (TYPE[TM, TN, TB])c; } """ @@ -164,15 +168,14 @@ void einsum_(TYPE * A, TYPE * B, TYPE * C, TM = [2**i for i in range(5, max(6, min(8, int(math.log2(bmnk[1]) + 1 ))))] TN = [2**i for i in range(5, max(6, min(8, int(math.log2(bmnk[2]) + 1 ))))] TB = [2**i for i in range(0, max(1, min(3, int(math.log2(bmnk[0]) + 1 ))))] - print(TM) - print(TN) + TK = [bmnk[2]] if bmnk[2] < 16 else [8, 16] return _einsum.kernel(a, b, c, bmnk[1], bmnk[2], bmnk[3], std0[0], std0[1], std0[2], std1[0], std1[1], std1[2], grid, bench=bench, **macros, - TYPE='float', TM=TM, TN=TN, TK=8, TB=TB) + TYPE=dtype, TM=TM, TN=TN, TK=TK, TB=TB) @staticmethod @@ -195,6 +198,7 @@ void einsum_(TYPE * A, TYPE * B, TYPE * C, ctx.einsum_b = einsum_b ctx.einsum_c = einsum_c ctx.bench = bench + ctx.bmnk = bmnk return _einsum.call(a, b, ta, tb, shape_c, bmnk, std0, std1, einsum_a, einsum_b, einsum_c, bench) diff --git a/python/triton/utils.py b/python/triton/utils.py index d5f5f4129..5b832668f 100644 --- a/python/triton/utils.py +++ b/python/triton/utils.py @@ -89,3 +89,32 @@ class scalar: return -self.get_value() +class id_dict: + + # Lazy entry for e.g., tensorflow, when value of benchmark is + # not known at graph compile time + class lazy_entry: + def __init__(self, id): + self.id = id + + def get(self): + return libtriton.retrieve_scalar(self.id) + + + def __init__(self): + self.data = dict() + + def __delitem__(self, key): + del self.data[id(key)] + + def __getitem__(self, key): + ret = self.data[id(key)] + if isinstance(ret, id_dict.lazy_entry): + return ret.get() + return ret + + def __len__(self): + return len(self.data) + + def __setitem__(self, key, value): + self.data[id(key)] = value \ No newline at end of file diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index c87e1c938..876ce0962 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -9,9 +9,9 @@ int main() { // shapes to benchmark typedef std::tuple, bool, bool, int, int, int> config_t; std::vector configs; - for(auto ord: std::vector>{{0, 1}, {1, 0}}) + for(auto ord: std::vector>{{1, 0}}) for(auto x: std::vector>{{false, false}, {false, true}, - {true, false}, {true, true}}){ + {true, false}}){ std::vector tmp = { config_t{ord, x[0], x[1], 2048, 2048, 2048}, // config_t{ord, x[0], x[1], 16, 2048, 2048}, @@ -34,7 +34,7 @@ int main() { for(const auto& c: configs){ std::tie(ord, AT, BT, M, N, K) = c; std::cout << "// " << c << std::flush; - for(auto perf: bench_dot(stream, FLOAT, AT, BT, M, N, K, ord, ord)) + for(auto perf: bench_dot(stream, HALF, AT, BT, M, N, K, ord, ord)) std::cout << ", " << perf << std::flush; std::cout << std::endl; } diff --git a/tests/common/dot.h b/tests/common/dot.h index 23bb46c72..a157d7994 100644 --- a/tests/common/dot.h +++ b/tests/common/dot.h @@ -109,10 +109,10 @@ bool triton_dot(drv::stream* stream, bool AT, bool BT, opt.num_warps = {nwarp}; } if(mode == BENCH) { - opt.defines.push_back({"TM", {"64", "128"}}); - opt.defines.push_back({"TN", {"64", "128"}}); - opt.defines.push_back({"TK", {"8"}}); - opt.num_warps = {2, 4, 8}; + opt.defines.push_back({"TM", {"128"}}); + opt.defines.push_back({"TN", {"128"}}); + opt.defines.push_back({"TK", 
{"16"}}); + opt.num_warps = {4}; } // kernels diff --git a/tests/common/src/dot.h b/tests/common/src/dot.h index 05ed68a7b..7c368e593 100644 --- a/tests/common/src/dot.h +++ b/tests/common/src/dot.h @@ -23,8 +23,8 @@ void dot(TYPE * A, TYPE * B, TYPE * C, // reduction loop for(int k = K; k > 0; k-= TK){ c += USEA @ USEB; - pa = pa + TK * STRIDE_AK; - pb = pb + TK * STRIDE_BK; + pa += TK * STRIDE_AK; + pb += TK * STRIDE_BK; bool checka[SHAPE_A] = k > TK; bool checkb[SHAPE_B] = k > TK; a = checka ? *pa : 0; From 448f4433d942db2a26d0f250f1bd3d85e31577b6 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 28 Oct 2019 17:12:37 -0400 Subject: [PATCH 472/494] [PYTHON][KERNEL] Enforcing shapes to be known at compile-time for TensorFlow Graph Execution --- python/examples/dot.py | 39 +++++++++++++--------------- python/examples/einsum_test.py | 10 +++---- python/src/bindings.cc | 18 ++++++++++--- python/src/tensorflow/alloc_empty.cc | 7 +++++ python/triton/frameworks.py | 3 +++ python/triton/function.py | 14 +++++----- python/triton/kernel.py | 12 ++++----- python/triton/ops/dot.py | 20 +++++++------- python/triton/utils.py | 11 +++++++- 9 files changed, 82 insertions(+), 52 deletions(-) diff --git a/python/examples/dot.py b/python/examples/dot.py index 425fed986..3e061c112 100644 --- a/python/examples/dot.py +++ b/python/examples/dot.py @@ -5,30 +5,28 @@ def run_tf(): M, N, K = 2048, 2048, 2048 a = tf.placeholder(tf.float32, shape=[M, K]) b = tf.placeholder(tf.float32, shape=[N, K]) - tr_c = triton.ops.dot(a, b, transpose_a = False, transpose_b = True, bench=1) - tr_d = triton.ops.dot(tr_c, b, transpose_a = True, transpose_b = False, bench=1) - tf_c = tf.matmul(a, b, transpose_a = False, transpose_b = True) - tf_d = tf.matmul(tf_c, b, transpose_a = True, transpose_b = False) + triton_c = triton.ops.dot(a, b, False, True, 1) + triton_d = triton.ops.dot(triton_c, b, True, False, 1) + triton_y = tf.math.reduce_mean(triton_d) + fw_c = tf.matmul(a, b, False, True) + fw_d = tf.matmul(fw_c, b, True, False) + fw_y = tf.math.reduce_mean(fw_d) # Gradient - tr_da = tf.gradients(tr_d, [a]) - tf_da = tf.gradients(tf_d, [a]) + triton_da, triton_db = tf.gradients(triton_y, [a, b]) + fw_da, fw_db = tf.gradients(fw_y, [a, b]) # Reference - ha = np.random.rand(M, K).astype(np.float32) - hb = np.random.rand(K, N).astype(np.float32) - # Run + feed_dict = {a: np.random.rand(M, K).astype(np.float32), + b: np.random.rand(K, N).astype(np.float32)} sess = tf.InteractiveSession() sess.run(tf.global_variables_initializer()) - result = sess.run([tr_da, tf_da], feed_dict = {a: ha, - b: hb}) + result = sess.run([triton_da, fw_da, triton_db, fw_db, fw_y, triton_y], feed_dict = feed_dict) + triton_da, fw_da = result[0][0], result[1][0] + triton_db, fw_db = result[2][0], result[3][0] # Benchmark - nanosec = triton.bench_registry[tr_d] - print('NANOSEC: ', nanosec) - #print('TFLOPS:', 2. * M * N * K / nanosec * 1e-3) - # Test - print(result[0][0]) - print(result[1][0]) - dif = np.abs(result[0][0] - result[1][0]) - print("dif: %f" % np.max(dif)) + nanosec = triton.bench_registry[triton_d] + print('TFLOPS:', 2. 
* M * N * K / nanosec * 1e-3) + print('Diff DA:', (triton_da - fw_da).max()) + print('Diff DB:', (triton_db - fw_db).max()) def run_torch(): @@ -41,7 +39,7 @@ def run_torch(): torch_c = torch.matmul(a, torch.t(b)) torch_d = torch.matmul(torch.t(torch_c), b) torch_y = torch.mean(torch_d) - triton_c = triton.ops.dot(a, b, False, True) + triton_c = triton.ops.dot(a, b, False, True, 1) triton_d = triton.ops.dot(triton_c, b, True, False, 1) triton_y = torch.mean(triton_d) # torch gradient @@ -56,7 +54,6 @@ def run_torch(): triton_db = b.grad.clone() nanosec = triton.bench_registry[triton_d] - print(nanosec) print('TFLOPS:', 2. * M * N * K / nanosec * 1e-3) print('Diff DA:', (torch_da - triton_da).max()) print('Diff DB:', (torch_db - triton_db).max()) diff --git a/python/examples/einsum_test.py b/python/examples/einsum_test.py index 3363a88ea..b09f46cab 100644 --- a/python/examples/einsum_test.py +++ b/python/examples/einsum_test.py @@ -53,11 +53,11 @@ class ProdKeyTest(tf.test.TestCase): B = np.random.uniform(-1.0, 1.0, b_shape).astype(np.float16).astype(np.float32) E = np.random.uniform(-1.0, 1.0, c_shape).astype(np.float16).astype(np.float32) - a = tf.placeholder(tf.float16, a_shape, name="a") - b = tf.placeholder(tf.float16, b_shape, name="b") - e = tf.placeholder(tf.float16, c_shape, name="e") - feed_dict = { a: A.astype(np.float16), - b: B.astype(np.float16), + a = tf.placeholder(tf.float32, a_shape, name="a") + b = tf.placeholder(tf.float32, b_shape, name="b") + e = tf.placeholder(tf.float32, c_shape, name="e") + feed_dict = { a: A.astype(np.float32), + b: B.astype(np.float32), e: E } c = triton.ops.einsum(einsum, a, b, bench=bench) diff --git a/python/src/bindings.cc b/python/src/bindings.cc index 59b5c54d6..80e2f8ddc 100644 --- a/python/src/bindings.cc +++ b/python/src/bindings.cc @@ -156,7 +156,7 @@ void gen_make_launch_function(std::ostream &os, int num_outputs, const std::vect os << " };\n "; os << " run();"; os << " if(bench_ > 0)\n "; - os << " i64scalar_map[id_] = triton::tools::bench(run, stream);\n "; + os << " i64scalar_map[bench_id_] = triton::tools::bench(run, stream);\n "; } void gen_tf_register_kernel_builder(std::ostream &os, const std::string &name, @@ -186,6 +186,7 @@ void gen_tf_register_op(std::ostream &os, const std::string &name, os << " .Attr(\"T" << i << " : {bool, int8, int16, int32, int64, float16, float32, float64}\")" << std::endl; os << " .Input(\"" << name << ": T" << i << "\")\n"; } + std::vector out_idx; for(size_t i = 0; i < outputs.size(); i++){ std::string name = outputs[i]; size_t idx; @@ -194,11 +195,19 @@ void gen_tf_register_op(std::ostream &os, const std::string &name, break; if(idx == args.size()) throw std::runtime_error("unknown output"); - os << " .Output(\"out" << i << ": T" << idx << "\")\n"; + out_idx.push_back(idx); } + for(size_t i = 0; i < out_idx.size(); i++) + os << " .Output(\"out" << i << ": T" << out_idx[i] << "\")\n"; os << " .Attr(\"id: int\")\n"; os << " .Attr(\"bench: int\")\n"; os << " .Attr(\"bench_id: int\")\n"; + os << " .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {\n"; + for(size_t i = 0; i < out_idx.size(); i++) + os << " c->set_output(" << i << ", c->input(" << out_idx[i] << "));\n"; + os << " return Status::OK();\n"; + os << " })\n"; + os << ";\n"; } @@ -313,7 +322,7 @@ oss << R"( private: int id_; int bench_; - int bench_id_; + int64 bench_id_; }; // register kernel builder @@ -397,6 +406,7 @@ void gen_torch_signature(std::ostringstream& oss, oss << ret_ty << " " << name << "("; oss << "int64_t id, 
"; oss << "int64_t bench, "; + oss << "int64_t bench_id, "; for(size_t i = 0; i < args.size(); i++) { ir::argument* arg = args[i]; if(i > 0) @@ -453,7 +463,7 @@ void gen_torch_make_launch_function(std::ostream &os, const std::vector 0)\n "; - os << " i64scalar_map[id] = triton::tools::bench(run, stream);\n "; + os << " i64scalar_map[bench_id] = triton::tools::bench(run, &stream);\n "; } void gen_torch_ret(std::ostream &os, const std::vector& outputs) { diff --git a/python/src/tensorflow/alloc_empty.cc b/python/src/tensorflow/alloc_empty.cc index 75ab1201d..43f82cbfa 100644 --- a/python/src/tensorflow/alloc_empty.cc +++ b/python/src/tensorflow/alloc_empty.cc @@ -1,4 +1,5 @@ #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" using namespace tensorflow; @@ -28,4 +29,10 @@ REGISTER_OP("AllocEmpty") .Input("x: int32") .Attr("T : {bool, int8, int16, int32, int64, float16, float32, float64}") .Output("y: T") + .SetShapeFn([](shape_inference::InferenceContext* c) { + shape_inference::ShapeHandle handle; + c->MakeShapeFromShapeTensor(0, &handle); + c->set_output(0, handle); + return Status::OK(); + }); ; diff --git a/python/triton/frameworks.py b/python/triton/frameworks.py index 993389a82..f495680f0 100644 --- a/python/triton/frameworks.py +++ b/python/triton/frameworks.py @@ -5,6 +5,7 @@ import triton._C.libtriton as libtriton torch = None tensorflow = None tf_extra_ops = None +gen_resource_variable_ops = None def _import_torch(): global torch @@ -13,8 +14,10 @@ def _import_torch(): def _import_tensorflow(): global tensorflow + global gen_resource_variable_ops if tensorflow is None: import tensorflow + from tensorflow.python.ops import gen_resource_variable_ops def _import_tf_extra_ops(): global tf_extra_ops diff --git a/python/triton/function.py b/python/triton/function.py index 79a0e5ec8..eb52d145e 100644 --- a/python/triton/function.py +++ b/python/triton/function.py @@ -13,7 +13,6 @@ class OpContext(object): class function_meta(type): def __init__(cls, name, bases, attrs): - cls.contexts = dict() cls.registered = False return super(function_meta, cls).__init__(name, bases, attrs) @@ -45,17 +44,20 @@ class function(metaclass = function_meta): @classmethod def apply_tensorflow(cls, *args, **kwargs): ctx = OpContext() - result = cls.forward(ctx, *args, **kwargs) - id = result.op.get_attr('id') - cls.contexts[id] = ctx + # Acquire a mutex here to ensure that calls to alloc_empty() + # are handled properly + mutex = fw.gen_resource_variable_ops.mutex_v2() + lock = fw.gen_resource_variable_ops.mutex_lock(mutex) + with fw.tensorflow.python.ops.control_dependencies([lock]): + result = cls.forward(ctx, *args, **kwargs) ctx_registry[result] = ctx # register backward name = result.op.op_def.name if not cls.registered: @fw.tensorflow.RegisterGradient(name) def gradient(op, dy): - id = op.get_attr('id') - return cls.backward(cls.contexts[id], dy) + with fw.tensorflow.control_dependencies([op]): + return cls.backward(ctx_registry[op.outputs[0]], dy) cls.registered = True # return result tensor return result diff --git a/python/triton/kernel.py b/python/triton/kernel.py index 57e0afc13..769e47a29 100644 --- a/python/triton/kernel.py +++ b/python/triton/kernel.py @@ -220,18 +220,18 @@ class kernel: op_id = self.fw_id[key] # register grid libtriton.register_grid(op_id, _make_grid(args)) + # id for the benchmark result + bench_id = libtriton.make_scalar_id() if bench > 0 else -1 # create operands - op_args = [x.handle if isinstance(x, triton.utils.scalar) 
else x for x in args[:-1]] # call framework function if fw.has_tensorflow(): - bench_id = libtriton.make_scalar_id() if bench > 0 else 0 - ret = self.fw_op(*op_args, id=op_id, bench=bench, bench_id=bench_id) + args = [x for x in args[:-1]] + ret = self.fw_op(*args, id=op_id, bench=bench, bench_id=bench_id) if bench > 0: bench_registry[ret] = triton.utils.id_dict.lazy_entry(bench_id) - elif fw.has_torch(): - args = [x.contiguous() if isinstance(x, fw.torch.Tensor) else x for x in op_args] - ret = self.fw_op(op_id, bench, *args) + args = [x.contiguous() if isinstance(x, fw.torch.Tensor) else x for x in args[:-1]] + ret = self.fw_op(op_id, bench, bench_id, *args) if bench > 0: bench_registry[ret] = libtriton.retrieve_scalar(op_id) else: diff --git a/python/triton/ops/dot.py b/python/triton/ops/dot.py index 7a5069701..140cd82cd 100644 --- a/python/triton/ops/dot.py +++ b/python/triton/ops/dot.py @@ -40,7 +40,7 @@ void dot(TYPE * A, TYPE * B, TYPE * C, kernel = triton.kernel(src, ['C']) @staticmethod - def _call(a, b, transpose_a, transpose_b, bench = 0): + def _call(a, b, transpose_a, transpose_b, bench): # extract shapes shape_a = triton.shape(a) shape_b = triton.shape(b) @@ -86,24 +86,26 @@ void dot(TYPE * A, TYPE * B, TYPE * C, ctx.save_for_backward(a, b) ctx.t_a = transpose_a ctx.t_b = transpose_b + ctx.bench = bench return _dot._call(a, b, transpose_a, transpose_b, bench) @staticmethod def backward(ctx, dy): a, b = ctx.saved_tensors t_a, t_b = ctx.t_a, ctx.t_b + bench = ctx.bench if not t_a and not t_b: - da = _dot._call(dy, b, False, True) - db = _dot._call(a, dy, True, False) + da = _dot._call(dy, b, False, True, bench) + db = _dot._call(a, dy, True, False, bench) elif not t_a and t_b: - da = _dot._call(dy, b, False, False) - db = _dot._call(dy, a, True, False) + da = _dot._call(dy, b, False, False, bench) + db = _dot._call(dy, a, True, False, bench) elif t_a and not t_b: - da = _dot._call(b, dy, False, True) - db = _dot._call(a, dy, False, False) + da = _dot._call(b, dy, False, True, bench) + db = _dot._call(a, dy, False, False, bench) elif t_a and t_b: - da = _dot._call(b, dy, True, True) - db = _dot._call(dy, a, True, True) + da = _dot._call(b, dy, True, True, bench) + db = _dot._call(dy, a, True, True, bench) else: assert False return da, db, None, None, None, None, None, None, None diff --git a/python/triton/utils.py b/python/triton/utils.py index 5b832668f..eca9f665e 100644 --- a/python/triton/utils.py +++ b/python/triton/utils.py @@ -1,13 +1,22 @@ import triton.frameworks as fw import triton._C.libtriton as libtriton +import numpy as np def cdiv(a, b): return -(-a // b) +class tf_empty_proxy: + + def __init__(self, args, dtype): + self.args = args + self.dtype = dtype + def empty(shapes, dtype): if fw.has_tensorflow(): - args = [x.handle if isinstance(x, scalar) else x for x in shapes] + #return fw.tensorflow.Variable(np.empty(shapes),shape=shapes, dtype=dtype) + args = [x.handle if isinstance(x, scalar) else fw.tensorflow.constant(x) for x in shapes] args = fw.tensorflow.stack(args) + #return tf_empty_proxy(args, dtype) return fw.tf_extra_ops.alloc_empty(args, T = dtype) elif fw.has_torch(): return fw.torch.empty(*shapes).cuda() From 76651a065f564b5d2c29a3a6027b2883ec03d9b2 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 29 Oct 2019 12:42:38 -0400 Subject: [PATCH 473/494] [PYTHON][EXAMPLES] Better einsum example --- include/triton/runtime/function.h | 2 + lib/lang/cpp.cc | 2 - lib/runtime/function.cc | 10 ++- python/examples/einsum.py | 116 ++++++++++++++++++------- 
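
Threading benchmarks through both frameworks follows one pattern in kernel.py: each benchmarked launch reserves a scalar slot, the generated C++ op writes its measured nanoseconds into that slot, and Python reads it back through bench_registry, lazily for TensorFlow since the value only exists after the graph runs. A schematic sketch, with make_scalar_id and lazy_entry standing in for the libtriton and triton.utils symbols:

def launch(fw_op, op_id, args, bench):
    # reserve a slot that the generated op will fill with nanoseconds
    bench_id = make_scalar_id() if bench > 0 else -1
    ret = fw_op(*args, id=op_id, bench=bench, bench_id=bench_id)
    if bench > 0:
        # graph-mode result: record a lazy handle, resolved after sess.run
        bench_registry[ret] = lazy_entry(bench_id)
    return ret
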
python/examples/einsum_test.py | 139 ------------------------------ python/triton/ops/einsum.py | 22 +++-- 6 files changed, 103 insertions(+), 188 deletions(-) delete mode 100644 python/examples/einsum_test.py diff --git a/include/triton/runtime/function.h b/include/triton/runtime/function.h index 23ca7d1e0..539de8684 100644 --- a/include/triton/runtime/function.h +++ b/include/triton/runtime/function.h @@ -8,6 +8,7 @@ #include #include #include +#include // codegen #include "triton/ir/context.h" #include "triton/codegen/target.h" @@ -110,6 +111,7 @@ private: std::string src_; options_space_t opt_space_; std::map cache_; + std::mutex src_mutex_; }; } diff --git a/lib/lang/cpp.cc b/lib/lang/cpp.cc index 308eba1e6..2cdfb453a 100644 --- a/lib/lang/cpp.cc +++ b/lib/lang/cpp.cc @@ -9,8 +9,6 @@ #include -extern std::string filename_in; -extern std::string filename_out; using DirectiveMap = std::unordered_map; diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index bc55d65eb..a7072b757 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -29,9 +29,9 @@ #include "triton/ir/print.h" #include "triton/tools/bench.hpp" #include "llvm/IR/Module.h" +#include - - +std::mutex mut; namespace triton{ namespace runtime { @@ -168,7 +168,6 @@ function::caller function::autotune(driver::stream* stream, const grid_fn_ty& gr for(auto it: opt_space_.defines) cpp.AddMacro(it.first, &opt.defines.at(it.first)); cpp.Process(tokens); -// tokens.Print(stdout); // parse Parser parser(tokens); parser.Parse(); @@ -309,7 +308,10 @@ void function::operator()(const std::vector& args, const grid_fn_ty& grid_f } /* re-tune and re-compile */ - cache_.insert({key, autotune(stream, grid_fn, args)}); + { + std::lock_guard lock(mut); + cache_.insert({key, autotune(stream, grid_fn, args)}); + } } void function::operator()(const std::vector& args, const grid_t& grid, driver::stream *stream) { diff --git a/python/examples/einsum.py b/python/examples/einsum.py index 5585cc9b6..a8ec95435 100644 --- a/python/examples/einsum.py +++ b/python/examples/einsum.py @@ -1,38 +1,92 @@ -import numpy as np -import torch +#!/usr/bin/env python + +import numpy as np +from enum import Enum import triton -batch_dim = 16 -ctx_dim = 32 -head_dim = 8 -state_dim = 32 -key_dim = 32 -n_keys = 32 -bs = batch_dim * ctx_dim +class MODE(Enum): + TF = 1 + TORCH = 2 -# shapes -x_shape = (bs, state_dim) -qw_shape = (state_dim, head_dim * key_dim) -kw_shape = (head_dim, 2, n_keys, key_dim // 2) +try: + import tensorflow as tf + mode = MODE.TF +except ModuleNotFoundError: + pass -np.random.seed(0) -x = np.random.uniform(-1.0, 1.0, x_shape).astype(np.float32) # layer input -qw = np.random.uniform(-1.0, 1.0, qw_shape).astype(np.float32) # query weights -kw = np.random.uniform(-1.0, 1.0, kw_shape).astype(np.float32) # key weights -# (bs, head_dim * key_dim) = (bs, state_dim) * (state_dim, head_dim * key_dim) -# (bs, head_dim, 2, key_dim//2) <== (bs, head_dim * key_dim) -q = np.dot(x, qw).reshape(bs, head_dim, 2, key_dim//2) # normal matmul +try: + import torch + mode = MODE.TORCH +except ModuleNotFoundError: + pass -# (bs, head_dim, 2, n_keys) = (bs, head_dim, 2, key_dim//2) * (head_dim, 2, n_keys, key_dim//2) -# outer: bs, n_keys -# inner: key_dim//2 -# batch: head_dim, 2 (key_axis) -qk = np.einsum("bhak,hank->bhan", q, kw) +cases = [] +# Matmul +cases += [[[4, 1024, 1024], [1024, 1024], [4, 1024, 1024], "btc,ck->btk"]] +# Attention +cases += [[[4, 256, 8, 2, 64], [8, 2, 512, 64], [4, 256, 8, 2, 512], "bchak,hank->bchan"]] -tq = 
torch.from_numpy(q).contiguous().cuda() -tkw = torch.from_numpy(kw).contiguous().cuda() -tqk = triton.ops.einsum("bhak,hank->bhan", tq, tkw) -diff = np.abs(qk - tqk.cpu().numpy()) -print(np.max(diff)) -print(np.min(diff)) +if mode == MODE.TF: + sess = tf.InteractiveSession() +for a_shape, b_shape, c_shape, einsum in cases: + + A = np.random.uniform(-1.0, 1.0, a_shape).astype(np.float16).astype(np.float32) + B = np.random.uniform(-1.0, 1.0, b_shape).astype(np.float16).astype(np.float32) + E = np.random.uniform(-1.0, 1.0, c_shape).astype(np.float16).astype(np.float32) + + # Execute (tensorflow) + if mode == MODE.TF: + a = tf.placeholder(tf.float32, a_shape, name="a") + b = tf.placeholder(tf.float32, b_shape, name="b") + e = tf.placeholder(tf.float32, c_shape, name="e") + c = triton.ops.einsum(einsum, a, b, 1) + da, db = tf.gradients(c, [a, b], e) + feed_dict = { a: A.astype(np.float32), + b: B.astype(np.float32), + e: E } + sess.run(tf.global_variables_initializer()) + result = sess.run([c, da, db], feed_dict = feed_dict) + # Execute (torch) + if mode == MODE.TORCH: + a = torch.from_numpy(A).cuda() + b = torch.from_numpy(B).cuda() + e = torch.from_numpy(E).cuda() + a.requires_grad_(True) + b.requires_grad_(True) + c = triton.ops.einsum(einsum, a, b, 1) + torch.autograd.backward(c, e) + da = a.grad + db = b.grad + result = [c.cpu().detach().numpy(), da.cpu().detach().numpy(), db.cpu().detach().numpy()] + + # benchmark + nanosec = triton.bench_registry[c] + ctx = triton.ctx_registry[c] + b, m, n, k = tuple((ctx.bmnk[i] for i in range(0, 4))) + ops = 2.*b*m*n*k + print('C TFLOPS:', ops / triton.bench_registry[c] * 1e-3) + #print('DA TFLOPS:', ops / triton.bench_registry[da] * 1e-3) + #print('DB TFLOPS:', ops / triton.bench_registry[db] * 1e-3) + + # test + ctx = triton.ctx_registry[c] + t_a = ctx.trans_a + t_b = ctx.trans_b + e_a = ctx.einsum_a + e_b = ctx.einsum_b + e_c = ctx.einsum_c + C = np.einsum(einsum, A, B) + if not t_a and not t_b: # NN + DA = np.einsum(f"{e_c},{e_b}->{e_a}", E, B) + DB = np.einsum(f"{e_a},{e_c}->{e_b}", A, E) + elif not t_a and t_b: # NT + DA = np.einsum(f"{e_c},{e_b}->{e_a}", E, B) + DB = np.einsum(f"{e_c},{e_a}->{e_b}", E, A) + elif t_a and not t_b: # TN + DA = np.einsum(f"{e_b},{e_c}->{e_a}", B, E) + DB = np.einsum(f"{e_a},{e_c}->{e_b}", A, E) + c, da, db = result[0], result[1], result[2] + print('C diff:', np.abs((C - c)).max()) + print('DA diff:', np.abs((DA - da)).max()) + print('DB diff:', np.abs((DB - db)).max()) \ No newline at end of file diff --git a/python/examples/einsum_test.py b/python/examples/einsum_test.py deleted file mode 100644 index b09f46cab..000000000 --- a/python/examples/einsum_test.py +++ /dev/null @@ -1,139 +0,0 @@ -#!/usr/bin/env python - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import tensorflow as tf -import triton -import blocksparse as bs -from tensorflow.python.ops import gradient_checker - -one = 0 -out = 0 -bench = 0 - -class ProdKeyTest(tf.test.TestCase): - - def testEinsum(self): - # multi-threading screws up benchmarking - conf = tf.ConfigProto( - intra_op_parallelism_threads=1, - inter_op_parallelism_threads=1) - - with self.test_session(config=conf) as sess, tf.device("/gpu:0"): - - batch_dim = 4 - ctx_dim = 256 - head_dim = 8 - n_keys = 512 - key_dim = 128 - - # batch_dim = 2 - # ctx_dim = 8 - # head_dim = 2 - # n_keys = 16 - # key_dim = 16 - - for a_shape, b_shape, c_shape, einsum in [ - [ [ 4, 8, 8 ], [ 8, 8 ], [ 4, 8, 8 ], 
"btc,ck->btk" ], - [ [4, 1024, 1024], [ 1024, 1024 ], [4, 1024, 1024 ], "btc,ck->btk" ], - [ (batch_dim, ctx_dim, head_dim, 2, key_dim//2),(head_dim, 2, n_keys, key_dim//2), (batch_dim, ctx_dim, head_dim, 2, n_keys), "bchak,hank->bchan" ], - ]: - - if one: - A = np.ones(a_shape, dtype=np.float16).astype(np.float32) - B = np.ones(b_shape, dtype=np.float16).astype(np.float32) - E = np.ones(c_shape, dtype=np.float32) - else: - # QK = np.random.normal(loc=0.0, scale=1.0, size=qk_shape).astype(np.float16).astype(np.float32) - # V = np.random.normal(loc=0.0, scale=1.0, size=vw_shape).astype(np.float16).astype(np.float32) - A = np.random.uniform(-1.0, 1.0, a_shape).astype(np.float16).astype(np.float32) - B = np.random.uniform(-1.0, 1.0, b_shape).astype(np.float16).astype(np.float32) - E = np.random.uniform(-1.0, 1.0, c_shape).astype(np.float16).astype(np.float32) - - a = tf.placeholder(tf.float32, a_shape, name="a") - b = tf.placeholder(tf.float32, b_shape, name="b") - e = tf.placeholder(tf.float32, c_shape, name="e") - feed_dict = { a: A.astype(np.float32), - b: B.astype(np.float32), - e: E } - - c = triton.ops.einsum(einsum, a, b, bench=bench) - - # error = gradient_checker.compute_gradient_error(a, a_shape, c, c_shape, delta=1e-1, extra_feed_dict={ b:B }) # - # print(error) - # error = gradient_checker.compute_gradient_error(b, b_shape, c, c_shape, delta=1e-1, extra_feed_dict={ a:A }) # - # print(error) - # return - - with tf.control_dependencies([c.op]): - da, db = tf.gradients(c, [a, b], e) - - # c, = sess.run( [ c, ], feed_dict ) - rc, rda, rdb = sess.run( [ c, da, db ], feed_dict ) - - if bench > 0: - nanosec = triton.bench_registry[c] - ctx = triton.ctx_registry[c] - b, m, n, k = tuple((ctx.bmnk[i] for i in range(0, 4))) - ops = 2. * b * m * n * k - print('C TFLOPS:', ops / triton.bench_registry[c] * 1e-3) - print('DA TFLOPS:', ops / triton.bench_registry[da] * 1e-3) - print('DB TFLOPS:', ops / triton.bench_registry[db] * 1e-3) - - else: - C = np.einsum(einsum, A, B) - ctx = triton.ctx_registry[c] - t_a = ctx.trans_a - t_b = ctx.trans_b - e_a = ctx.einsum_a - e_b = ctx.einsum_b - e_c = ctx.einsum_c - - if not t_a and not t_b: # NN - DA = np.einsum(f"{e_c},{e_b}->{e_a}", E, B) - DB = np.einsum(f"{e_a},{e_c}->{e_b}", A, E) - elif not t_a and t_b: # NT - DA = np.einsum(f"{e_c},{e_b}->{e_a}", E, B) - DB = np.einsum(f"{e_c},{e_a}->{e_b}", E, A) - elif t_a and not t_b: # TN - DA = np.einsum(f"{e_b},{e_c}->{e_a}", B, E) - DB = np.einsum(f"{e_a},{e_c}->{e_b}", A, E) - - print("testProdKey", einsum) - if not bench: - for op, dev, cpu in [ - [ "C", rc, C ], - [ "DA", rda, DA ], - [ "DB", rdb, DB ], - ]: - self.compare_results(op, dev, cpu) - - def compare_results(self, op, dev, cpu): - dev = dev.astype(np.float64) - cpu = cpu.astype(np.float64) - - # print(dev.reshape(-1)[0:4]) - # print(cpu.reshape(-1)[0:4]) - - dif = np.abs(cpu - dev) - maxval = np.max(abs(cpu)) - avgval = np.average(abs(cpu)) - maxdif = dif.max() - max_err = maxdif if avgval == 0 else maxdif / avgval - l2_err = 0.0 if avgval == 0 else np.sqrt(np.square(dif).sum()) / np.sqrt(np.square(cpu).sum()) - - print("op:%3s, max:%18.12f, avg:%18.12f, dif:%18.12f, err:%18.12f, l2_err:%18.12f shape:%15s" % (op, maxval, avgval, maxdif, max_err, l2_err, str(cpu.shape))) - - if out: - dim = cpu.shape[-1] - np.savetxt("%s_dif.txt" % op, dif.reshape((-1,dim)), fmt='%4.1f') #7.5 5.3 - np.savetxt("%s_cpu.txt" % op, cpu.reshape((-1,dim)), fmt='%4.1f') #7.5 5.3 - np.savetxt("%s_dev.txt" % op, dev.reshape((-1,dim)), fmt='%4.1f') #7.5 5.3 - exit() - 
-if __name__ == "__main__": - tf.test.main() - diff --git a/python/triton/ops/einsum.py b/python/triton/ops/einsum.py index 7bbc18f5a..f91a178ad 100644 --- a/python/triton/ops/einsum.py +++ b/python/triton/ops/einsum.py @@ -179,7 +179,7 @@ void einsumk(TYPE * A, TYPE * B, TYPE * C, @staticmethod - def forward(ctx, subscripts, a, b, **kwargs): + def forward(ctx, subscripts, a, b, bench = 0): ctx.save_for_backward(a, b) if type(subscripts) is str: einsum_a, einsum_bc = subscripts.split(",") @@ -189,9 +189,7 @@ void einsumk(TYPE * A, TYPE * B, TYPE * C, shape_c, bmnk, std0, std1, ta, tb = _einsum._parse_einsum( einsum_a, einsum_b, einsum_c, - triton.shape(a), triton.shape(b) - ) - bench = kwargs['bench'] if 'bench' in kwargs else 0 + triton.shape(a), triton.shape(b)) ctx.trans_a = ta ctx.trans_b = tb ctx.einsum_a = einsum_a @@ -213,20 +211,20 @@ void einsumk(TYPE * A, TYPE * B, TYPE * C, bench = ctx.bench if not trans_a and not trans_b: # NN - da = einsum((einsum_c, einsum_b, einsum_a), dc, b, bench=bench) - db = einsum((einsum_a, einsum_c, einsum_b), a, dc, bench=bench) + da = einsum((einsum_c, einsum_b, einsum_a), dc, b, bench) + db = einsum((einsum_a, einsum_c, einsum_b), a, dc, bench) elif not trans_a and trans_b: # NT - da = einsum((einsum_c, einsum_b, einsum_a), dc, b, bench=bench) - db = einsum((einsum_c, einsum_a, einsum_b), dc, a, bench=bench) + da = einsum((einsum_c, einsum_b, einsum_a), dc, b, bench) + db = einsum((einsum_c, einsum_a, einsum_b), dc, a, bench) elif trans_a and not trans_b: # TN - da = einsum((einsum_b, einsum_c, einsum_a), b, dc, bench=bench) - db = einsum((einsum_a, einsum_c, einsum_b), a, dc, bench=bench) + da = einsum((einsum_b, einsum_c, einsum_a), b, dc, bench) + db = einsum((einsum_a, einsum_c, einsum_b), a, dc, bench) elif trans_a and trans_b: # TT (not used) - da = einsum((einsum_b, einsum_c, einsum_a), b, dc, bench=bench) - db = einsum((einsum_c, einsum_a, einsum_b), dc, a, bench=bench) + da = einsum((einsum_b, einsum_c, einsum_a), b, dc, bench) + db = einsum((einsum_c, einsum_a, einsum_b), dc, a, bench) return da, db, None, None, None, None, None, None, None, None, None, None From d9eacf937c35792fcdc953f64acb4e55f3f13bb6 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 29 Oct 2019 14:09:40 -0400 Subject: [PATCH 474/494] [PYTHON][FUNCTION] Now using common grad output format for both tensorflow and pytorch --- python/triton/function.py | 21 +++++++++++++++++---- python/triton/ops/einsum.py | 2 +- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/python/triton/function.py b/python/triton/function.py index eb52d145e..f40605ea9 100644 --- a/python/triton/function.py +++ b/python/triton/function.py @@ -48,16 +48,29 @@ class function(metaclass = function_meta): # are handled properly mutex = fw.gen_resource_variable_ops.mutex_v2() lock = fw.gen_resource_variable_ops.mutex_lock(mutex) - with fw.tensorflow.python.ops.control_dependencies([lock]): + with fw.tensorflow.control_dependencies([lock]): result = cls.forward(ctx, *args, **kwargs) - ctx_registry[result] = ctx + # Find a mapping between ::forward arguments and tensorflow op arguments + remap = dict() + for i, ix in enumerate(result.op.inputs): + for j, jx in enumerate(args): + if ix is jx: + remap[j] = i # register backward + ctx_registry[result] = ctx name = result.op.op_def.name if not cls.registered: @fw.tensorflow.RegisterGradient(name) def gradient(op, dy): - with fw.tensorflow.control_dependencies([op]): - return cls.backward(ctx_registry[op.outputs[0]], dy) + y = op.outputs[0] + 
grad = cls.backward(ctx_registry[y], dy) + # Remap gradient in the right order + ret = [None] * len(op.inputs) + for i in range(len(grad)): + if i in remap: + ret[remap[i]] = grad[i] + # Return + return ret cls.registered = True # return result tensor return result diff --git a/python/triton/ops/einsum.py b/python/triton/ops/einsum.py index f91a178ad..1467b1173 100644 --- a/python/triton/ops/einsum.py +++ b/python/triton/ops/einsum.py @@ -226,6 +226,6 @@ void einsumk(TYPE * A, TYPE * B, TYPE * C, da = einsum((einsum_b, einsum_c, einsum_a), b, dc, bench) db = einsum((einsum_c, einsum_a, einsum_b), dc, a, bench) - return da, db, None, None, None, None, None, None, None, None, None, None + return None, da, db, None einsum = _einsum.apply \ No newline at end of file From d65a94c76843c53f7722949a493d7f77bfed814a Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 29 Oct 2019 17:29:11 -0400 Subject: [PATCH 475/494] [PYTHON][OPS] Added batch normalization op --- include/triton/runtime/function.h | 2 + lib/codegen/selection/generator.cc | 5 ++ lib/lang/code_gen.cc | 14 ++++-- lib/runtime/function.cc | 7 ++- python/setup.py | 2 +- python/src/bindings.cc | 32 +++---------- python/triton/ops/__init__.py | 1 + python/triton/ops/batchnorm.py | 75 ++++++++++++++++++++++++++++++ python/triton/ops/einsum.py | 4 +- 9 files changed, 108 insertions(+), 34 deletions(-) create mode 100644 python/triton/ops/batchnorm.py diff --git a/include/triton/runtime/function.h b/include/triton/runtime/function.h index 539de8684..a6ab851a9 100644 --- a/include/triton/runtime/function.h +++ b/include/triton/runtime/function.h @@ -99,6 +99,8 @@ private: std::unique_ptr make_bin(ir::module &function, driver::context *context, const options_t &opt); caller autotune(driver::stream *stream, const grid_fn_ty& grid, const std::vector &args); +public: + static std::string preheader(); public: function(const std::string& src, const options_space_t& opt = options_space_t()); diff --git a/lib/codegen/selection/generator.cc b/lib/codegen/selection/generator.cc index 2efa834a8..34bf52b7f 100644 --- a/lib/codegen/selection/generator.cc +++ b/lib/codegen/selection/generator.cc @@ -289,6 +289,11 @@ void generator::visit_uncond_branch_inst(ir::uncond_branch_inst* br) { void generator::visit_unmasked_load_inst(ir::unmasked_load_inst* x) { + if(!x->get_type()->is_tile_ty()){ + Value *ptr = get_value(x->get_pointer_operand(), {}); + set_value(x, {}, builder_->CreateLoad(ptr)); + return; + } // find vector size ir::value *ptr = x->get_pointer_operand(); size_t ld = layouts_->get(ptr)->order[0]; diff --git a/lib/lang/code_gen.cc b/lib/lang/code_gen.cc index fdc754048..d13f68856 100644 --- a/lib/lang/code_gen.cc +++ b/lib/lang/code_gen.cc @@ -229,6 +229,13 @@ void Generator::VisitFuncCall(FuncCall* funcCall) { ir::value* ret = ret_; if(auto axis = dynamic_cast(ret)) return set_ret(bld_->create_get_program_id(axis->get_value())); + else + return should_not_happen(); + } + if(name == "sqrtf"){ + VisitExpr(funcCall->Args()->at(0)); + ir::value* ret = ret_; + return set_ret(bld_->create_sqrt(ret)); } return error_not_implemented(); } @@ -274,10 +281,11 @@ void Generator::VisitDeclaration(Declaration* decl) { // initialize declaration ir::type::id_t id = ty->get_type_id(); if(id == ir::type::StructTyID) - assert(false); + should_not_happen(); if(inits.size() > 1) - assert(false); - val = inits[0]; + should_not_happen(); + if(inits.size() > 0) + val = inits[0]; assert(val->get_type() == ty); // update scope symbols table const std::string &name = 
obj->Name(); diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index a7072b757..d955b69f0 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -113,7 +113,7 @@ void function::caller::operator ()(driver::stream *stream, const grid_t& _grid, arg arg_i = args.at(i); arg_type ty = arg_i.type(); if(ty != param_tys_.at(i)) - throw std::runtime_error("invalid type"); + throw std::runtime_error("invalid type for argument " + std::to_string(i)); if(ty == BUFFER_T) bin_->setArg(i, *((driver::buffer**)arg_i.data())); else @@ -253,16 +253,14 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c return std::unique_ptr(); barriers.run(module); // std::cout << "isel" << std::endl; -// ir::print(module, std::cout); isel.visit(module, *llvm); -// std::cout << "done" << std::endl; // return binary std::unique_ptr res(driver::module::create(context, std::move(llvm))); // done return res; } -std::string preheader() { +std::string function::preheader() { return R"( #define bool _Bool @@ -277,6 +275,7 @@ R"( #define __multipleof(A) __attribute__((multipleof(A))) extern int get_program_id(int); +extern float sqrtf(float); )"; } diff --git a/python/setup.py b/python/setup.py index 060a1c450..4c8d38259 100644 --- a/python/setup.py +++ b/python/setup.py @@ -77,7 +77,7 @@ class CMakeBuild(build_ext): pass cfg = 'Debug' if self.debug else 'Release' - cfg = 'Release' + #cfg = 'Release' build_args = ['--config', cfg] if platform.system() == "Windows": diff --git a/python/src/bindings.cc b/python/src/bindings.cc index 80e2f8ddc..37aa0a2c8 100644 --- a/python/src/bindings.cc +++ b/python/src/bindings.cc @@ -211,27 +211,9 @@ void gen_tf_register_op(std::ostream &os, const std::string &name, os << ";\n"; } -inline std::string preheader() { -return -R"( -#define bool _Bool -#define true 1 -#define false 0 -#define __bool_true_false_are_defined 1 - -#define __readonly __attribute__((readonly)) -#define __writeonly __attribute__((writeonly)) -#define __noalias __attribute__((noalias)) -#define __aligned(A) __attribute__((aligned(A))) -#define __multipleof(A) __attribute__((multipleof(A))) - -extern int get_program_id(int); -)"; -} - void make_module(const std::string& src, ir::module* ir, const runtime::function::options_space_t& opt) { - std::string copy = preheader() + src; + std::string copy = triton::runtime::function::preheader() + src; // pre-process TokenSequence tokens; Preprocessor cpp(©, true); @@ -341,11 +323,11 @@ inline std::string to_torch_ty(ir::type *ty) { if(ty->is_integer_ty()) return "int64_t"; if(ty->is_half_ty()) - return "float16"; + return "double"; if(ty->is_float_ty()) - return "float32"; + return "double"; if(ty->is_double_ty()) - return "float64"; + return "double"; if(ty->is_pointer_ty()) return "torch::Tensor"; throw std::runtime_error("unknown type"); @@ -363,11 +345,11 @@ inline std::string to_c_ty(ir::type *ty) { if(ty->is_integer_ty(64)) return "int64_t"; if(ty->is_half_ty()) - return "float16"; + return "half"; if(ty->is_float_ty()) - return "float32"; + return "float"; if(ty->is_double_ty()) - return "float64"; + return "double"; if(ty->is_pointer_ty()) return "drv::cu_buffer"; throw std::runtime_error("unknown type"); diff --git a/python/triton/ops/__init__.py b/python/triton/ops/__init__.py index f409fde46..ac0c3293d 100644 --- a/python/triton/ops/__init__.py +++ b/python/triton/ops/__init__.py @@ -1,2 +1,3 @@ from .dot import _dot, dot from .einsum import _einsum, einsum +from .batchnorm import _batchnorm, batchnorm \ No newline at end of file 
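As a reference sketch (not part of the patch), the per-channel computation that the new batchnorm kernel below implements can be written in NumPy as follows; the (C, H, W, B) layout and N = H*W*B reduction follow the forward() wrapper in the new file, and the function name is illustrative:

import numpy as np

def batchnorm_forward_ref(x, gamma, beta, eps=1e-5):
    # x has layout (C, H, W, B): statistics are reduced over the
    # N = H*W*B elements that share a channel, as in the kernel below.
    C = x.shape[0]
    xc = x.reshape(C, -1)                       # (C, N) view
    mean = xc.mean(axis=1)                      # per-channel mean, written to M
    var = xc.var(axis=1)                        # per-channel variance, written to V
    rstdg = gamma / np.sqrt(var + eps)          # reciprocal stddev pre-scaled by gamma
    y = (xc - mean[:, None]) * rstdg[:, None] + beta[:, None]
    return y.reshape(x.shape), mean, var

The Triton kernel below computes the same quantities, but streams over each channel in TM-sized tiles and uses the [+] reduction instead of materializing the full (C, N) view.
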
diff --git a/python/triton/ops/batchnorm.py b/python/triton/ops/batchnorm.py new file mode 100644 index 000000000..fb6e375e2 --- /dev/null +++ b/python/triton/ops/batchnorm.py @@ -0,0 +1,75 @@ +import triton +import math + +class _batchnorm(triton.function): + + fwd_src = """ +void batchnormForward(float *Y, float *M, float *V, + float *X, float *G, float *B, + int N, float rcpN, float eps) { + int rx[TM] = 0 ... TM; + float *px[TM]; + float x[TM] = 0; + int c = get_program_id(1); + float g = *(G + c); + float b = *(B + c); + + float mean[TM] = 0; + px = X + rx + c*N; + for(int i = 0; i < N; i = i + TM){ + x = *px; + mean = mean + x; + px = px + TM; + } + float *pm = M + c; + float m = mean[+] * rcpN; + *pm = m; + + float var[TM] = 0; + px = X + rx + c*N; + for(int i = 0; i < N; i = i + TM){ + x = *px; + x = x - m; + var = var + x*x; + px = px + TM; + } + float v = var[+] * rcpN; + float *pv = V + c; + *pv = v; + float rstdg = 1 / sqrtf(v + eps) * g; + + px = X + rx + c*N; + float* py[TM] = Y + rx + c*N; + for(int i = 0; i < N; i = i + TM){ + x = *px; + float y[TM] = (x - m)*rstdg + b; + *py = y; + px = px + TM; + py = py + TM; + } +} +""" + + fwd_kernel = triton.kernel(fwd_src, ['Y', 'M', 'V']) + + @staticmethod + def forward(ctx, x, gamma, beta, eps): + shape = triton.shape(x) + dtype = x.dtype + # allocate outputs + C, H, W, B = shape[0], shape[1], shape[2], shape[3] + y = triton.empty(shape, dtype=dtype) + mean = triton.empty([C], dtype=dtype) + var = triton.empty([C], dtype=dtype) + # execute kernels + N = H*W*B + _batchnorm.fwd_kernel(y, mean, var, x, gamma, beta, N, 1./N, eps, + lambda opt: [1, C], + TM = 128) + # save + ctx.eps = eps + ctx.save_for_backward(x, gamma, beta, mean, var) + return y, mean, var + + +batchnorm = _batchnorm.apply \ No newline at end of file diff --git a/python/triton/ops/einsum.py b/python/triton/ops/einsum.py index 1467b1173..167b2aacd 100644 --- a/python/triton/ops/einsum.py +++ b/python/triton/ops/einsum.py @@ -181,15 +181,16 @@ void einsumk(TYPE * A, TYPE * B, TYPE * C, @staticmethod def forward(ctx, subscripts, a, b, bench = 0): ctx.save_for_backward(a, b) + # parse if type(subscripts) is str: einsum_a, einsum_bc = subscripts.split(",") einsum_b, einsum_c = einsum_bc.split("->") else: einsum_a, einsum_b, einsum_c = subscripts - shape_c, bmnk, std0, std1, ta, tb = _einsum._parse_einsum( einsum_a, einsum_b, einsum_c, triton.shape(a), triton.shape(b)) + # save for backward ctx.trans_a = ta ctx.trans_b = tb ctx.einsum_a = einsum_a @@ -197,6 +198,7 @@ void einsumk(TYPE * A, TYPE * B, TYPE * C, ctx.einsum_c = einsum_c ctx.bench = bench ctx.bmnk = bmnk + # run return _einsum.call(a, b, ta, tb, shape_c, bmnk, std0, std1, einsum_a, einsum_b, einsum_c, bench) From 2b9355c9e4e193bf8dbebcd8e4f1be84367096a8 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 30 Oct 2019 01:38:30 -0400 Subject: [PATCH 476/494] [PYTHON][TENSORFLOW] Got rid of alloc_empty entirely; now doing generating allocation code inside the tensorflow op --- python/src/bindings.cc | 99 +++++++++++++++++++++------- python/src/tensorflow/alloc_empty.cc | 1 + python/triton/function.py | 7 +- python/triton/kernel.py | 12 +++- python/triton/ops/batchnorm.py | 74 ++++++++++++++++++++- python/triton/utils.py | 15 ++--- 6 files changed, 164 insertions(+), 44 deletions(-) diff --git a/python/src/bindings.cc b/python/src/bindings.cc index 37aa0a2c8..b3b74b37b 100644 --- a/python/src/bindings.cc +++ b/python/src/bindings.cc @@ -89,9 +89,9 @@ inline std::string to_tf_ty(ir::type *ty) { 
if(ty->is_half_ty()) return "float16"; if(ty->is_float_ty()) - return "float32"; + return "float"; if(ty->is_double_ty()) - return "float64"; + return "double"; if(ty->is_pointer_ty()) return "Tensor"; throw std::runtime_error("unknown type"); @@ -113,21 +113,50 @@ inline std::string ref_to_tf_ty(ir::type *ty) { } -void gen_extract_inputs(std::ostream &os, const std::vector& args) { +void gen_extract_inputs(std::ostream &os, const std::vector& args, const std::vector& outputs) { for(unsigned i = 0; i < args.size(); i++){ ir::value *arg = args[i]; - std::string suffix = ""; - ir::type *tr_ty = arg->get_type(); - std::string tf_ty = ref_to_tf_ty(tr_ty); - if(!tr_ty->is_pointer_ty()) - suffix = ".scalar<" + tf_ty + ">()()"; - os << " " << tf_ty << " " << arg->get_name() << " = context->input(" << i << ")" << suffix << ";\n "; + const std::string& name = arg->get_name(); + std::string ty = to_tf_ty(arg->get_type()); + if(!arg->get_type()->is_pointer_ty()) + os << " " << ty << " " << name << " = context->input(" << i << ").scalar<" << ty << ">()();\n "; + else if(std::find(outputs.begin(), outputs.end(), arg->get_name()) == outputs.end()) + os << " const Tensor* " << name << " = &context->input(" << i << ");\n "; + else + os << " Tensor* " << name << " = nullptr;\n "; } } -void gen_set_outputs(std::ostream &os, const std::vector& outputs) { +void gen_set_outputs(std::ostream &os, const std::vector& args, const std::vector& outputs) { for(unsigned i = 0; i < outputs.size(); i++) - os << " context->set_output(" << i << ", " << outputs[i] << ");\n "; + os << " TensorShape shape" << i << ";\n "; + // initialize shapes + + std::vector out_idx; + for(size_t i = 0; i < outputs.size(); i++){ + std::string name = outputs[i]; + size_t idx; + for(idx = 0; idx < args.size(); idx++) + if(args[idx]->get_name() == name) + break; + if(idx == args.size()) + throw std::runtime_error("unknown output"); + out_idx.push_back(idx); + } + + for(unsigned i = 0; i < outputs.size(); i++) + os << " const Tensor& " << outputs[i] << "_shape = context->input(" << out_idx[i] << ");\n "; + for(unsigned i = 0; i < outputs.size(); i++) + os << " const int32* " << outputs[i] << "_shape_data = (const int32*)" << outputs[i] << "_shape.tensor_data().data();\n "; + for(unsigned i = 0; i < outputs.size(); i++) + os << " size_t " << outputs[i] << "_rank = " << outputs[i] << "_shape.dim_size(0);\n "; + for(unsigned i = 0; i < outputs.size(); i++) + os << " for(size_t d = 0; d < " << outputs[i] << "_rank ; d++) " + << "shape" << i << ".AddDim(" << outputs[i] << "_shape_data[d]);\n "; + + // allocate + for(unsigned i = 0; i < outputs.size(); i++) + os << " OP_REQUIRES_OK(context, context->allocate_output(" << i << ", shape" << i << ", &" << outputs[i] << "));\n "; } void gen_make_handles(std::ostream &os, const std::vector& args) { @@ -136,7 +165,7 @@ void gen_make_handles(std::ostream &os, const std::vector& args) if(!arg->get_type()->is_pointer_ty()) continue; const std::string& name = arg->get_name(); - os << " drv::cu_buffer cu_" + name + "(ctx, " + name + ".tensor_data().size(), (CUdeviceptr)" + name + ".tensor_data().data(), false);\n "; + os << " drv::cu_buffer cu_" + name + "(ctx, " + name + "->tensor_data().size(), (CUdeviceptr)" + name + "->tensor_data().data(), false);\n "; } } @@ -161,7 +190,8 @@ void gen_make_launch_function(std::ostream &os, int num_outputs, const std::vect void gen_tf_register_kernel_builder(std::ostream &os, const std::string &name, const std::string &opname, - const std::vector& args){ + const 
std::vector& args, + const std::vector& outputs){ os << "REGISTER_KERNEL_BUILDER(Name(\"" + name + "\").Device(DEVICE_GPU)"; for(size_t i = 0; i < args.size(); i++){ ir::argument *arg = args[i]; @@ -171,20 +201,31 @@ void gen_tf_register_kernel_builder(std::ostream &os, const std::string &name, if(!arg->get_type()->is_pointer_ty()) os << ".HostMemory(\"" + name + "\")"; } + for(size_t i = 0; i < outputs.size(); i++){ + std::string name = outputs[i]; + name[0] = std::tolower(name[0]); + os << ".HostMemory(\"" << name << "_shape\")"; + } os << ", " + opname << ");\n"; } void gen_tf_register_op(std::ostream &os, const std::string &name, const std::vector& args, const std::vector& outputs){ + + auto tolower = [](char c) { return std::tolower(c);}; + os << "REGISTER_OP(\"" << name << "\")\n"; + for(size_t i = 0; i < args.size(); i++) + os << " .Attr(\"T" << i << " : {bool, int8, int16, int32, int64, float16, float32, float64}\")" << std::endl; for(size_t i = 0; i < args.size(); i++){ ir::argument *arg = args[i]; std::string name = arg->get_name(); - auto tolower = [](char c) { return std::tolower(c);}; std::transform(name.begin(), name.end(), name.begin(), tolower); - os << " .Attr(\"T" << i << " : {bool, int8, int16, int32, int64, float16, float32, float64}\")" << std::endl; - os << " .Input(\"" << name << ": T" << i << "\")\n"; + if(std::find(outputs.begin(), outputs.end(), arg->get_name()) == outputs.end()) + os << " .Input(\"" << name << ": T" << i << "\")\n"; + else + os << " .Input(\"" << name << "_shape: int32\")\n"; } std::vector out_idx; for(size_t i = 0; i < outputs.size(); i++){ @@ -197,15 +238,22 @@ void gen_tf_register_op(std::ostream &os, const std::string &name, throw std::runtime_error("unknown output"); out_idx.push_back(idx); } - for(size_t i = 0; i < out_idx.size(); i++) - os << " .Output(\"out" << i << ": T" << out_idx[i] << "\")\n"; + for(size_t i = 0; i < out_idx.size(); i++){ + std::string name = outputs[i]; + std::transform(name.begin(), name.end(), name.begin(), tolower); + os << " .Output(\"" << name << ": T" << out_idx[i] << "\")\n"; + } os << " .Attr(\"id: int\")\n"; os << " .Attr(\"bench: int\")\n"; os << " .Attr(\"bench_id: int\")\n"; - os << " .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {\n"; + os << " .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* ctx) {\n"; for(size_t i = 0; i < out_idx.size(); i++) - os << " c->set_output(" << i << ", c->input(" << out_idx[i] << "));\n"; - os << " return Status::OK();\n"; + os << " shape_inference::ShapeHandle handle" << i << ";\n"; + for(size_t i = 0; i < out_idx.size(); i++) + os << " ctx->MakeShapeFromShapeTensor(" << out_idx[i] << ", &handle" << i << ");\n"; + for(size_t i = 0; i < out_idx.size(); i++) + os << " ctx->set_output(" << i << ", handle" << i << ");\n"; + os << " return Status::OK();\n"; os << " })\n"; os << ";\n"; @@ -237,6 +285,7 @@ std::tuple(new ir::module("", ctx)); make_module(src, &*ir, opt); + // function ir::function* fn = ir->get_function_list().front(); std::string name = fn->get_name(); @@ -276,18 +325,20 @@ class )" << opname << R"(: public OpKernel { } void Compute(OpKernelContext* context){ + // get device/stream GPUDevice device = context->eigen_device(); drv::cu_stream sstream(device.stream(), false); drv::context* ctx = sstream.context(); drv::stream* stream = &sstream; + // extract inputs )"; -gen_extract_inputs(oss, fn->args()); +gen_extract_inputs(oss, fn->args(), outputs); oss << R"( // set outputs )"; -gen_set_outputs(oss, outputs); 
+gen_set_outputs(oss, fn->args(), outputs); oss << R"( // wrap tensors )"; @@ -309,7 +360,7 @@ private: // register kernel builder )"; -gen_tf_register_kernel_builder(oss, cc_name, opname, fn->args()); +gen_tf_register_kernel_builder(oss, cc_name, opname, fn->args(), outputs); oss << R"( // register op )"; diff --git a/python/src/tensorflow/alloc_empty.cc b/python/src/tensorflow/alloc_empty.cc index 43f82cbfa..a9c97b1d5 100644 --- a/python/src/tensorflow/alloc_empty.cc +++ b/python/src/tensorflow/alloc_empty.cc @@ -8,6 +8,7 @@ class AllocEmptyOp : public OpKernel { explicit AllocEmptyOp(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { + std::cout << "executing allocempty" << std::endl; // fetch input const Tensor& x = context->input(0); const int32* x_data = (const int32*)x.tensor_data().data(); diff --git a/python/triton/function.py b/python/triton/function.py index f40605ea9..e75512b1b 100644 --- a/python/triton/function.py +++ b/python/triton/function.py @@ -44,12 +44,7 @@ class function(metaclass = function_meta): @classmethod def apply_tensorflow(cls, *args, **kwargs): ctx = OpContext() - # Acquire a mutex here to ensure that calls to alloc_empty() - # are handled properly - mutex = fw.gen_resource_variable_ops.mutex_v2() - lock = fw.gen_resource_variable_ops.mutex_lock(mutex) - with fw.tensorflow.control_dependencies([lock]): - result = cls.forward(ctx, *args, **kwargs) + result = cls.forward(ctx, *args, **kwargs) # Find a mapping between ::forward arguments and tensorflow op arguments remap = dict() for i, ix in enumerate(result.op.inputs): diff --git a/python/triton/kernel.py b/python/triton/kernel.py index 769e47a29..60964abdc 100644 --- a/python/triton/kernel.py +++ b/python/triton/kernel.py @@ -222,11 +222,17 @@ class kernel: libtriton.register_grid(op_id, _make_grid(args)) # id for the benchmark result bench_id = libtriton.make_scalar_id() if bench > 0 else -1 - # create operands # call framework function if fw.has_tensorflow(): - args = [x for x in args[:-1]] - ret = self.fw_op(*args, id=op_id, bench=bench, bench_id=bench_id) + # operands + operands = [x.shape if isinstance(x, triton.utils.tf_empty_proxy) else x for x in args[:-1]] + # output data types + kwargs = {'id': op_id, 'bench': bench, 'bench_id': bench_id} + for i, x in enumerate(args[:-1]): + if isinstance(x, triton.utils.tf_empty_proxy): + kwargs['T' + str(i)] = x.dtype + # launch + ret = self.fw_op(*operands, **kwargs) if bench > 0: bench_registry[ret] = triton.utils.id_dict.lazy_entry(bench_id) elif fw.has_torch(): diff --git a/python/triton/ops/batchnorm.py b/python/triton/ops/batchnorm.py index fb6e375e2..5e352d93a 100644 --- a/python/triton/ops/batchnorm.py +++ b/python/triton/ops/batchnorm.py @@ -4,7 +4,7 @@ import math class _batchnorm(triton.function): fwd_src = """ -void batchnormForward(float *Y, float *M, float *V, +void fwdbatchnorm(float *Y, float *M, float *V, float *X, float *G, float *B, int N, float rcpN, float eps) { int rx[TM] = 0 ... TM; @@ -52,6 +52,58 @@ void batchnormForward(float *Y, float *M, float *V, fwd_kernel = triton.kernel(fwd_src, ['Y', 'M', 'V']) + bwd_src = """ +void batchnormBackward(float *DX, float *DG, float *DB, + float *DY, float *X, float *G, + float *M, float *V, + int DHWN, float rcpDHWN, float epsilon) { + int rx[TM] = 0 ... 
TM; + int c = get_program_id(1); + int offset = c*DHWN; + float g = *(G + c); + float mean = *(M + c); + float var = *(V + c); + float rstd = 1 / sqrtf(var + epsilon); + float* px[TM]; + float* pdx[TM]; + float* pdy[TM]; + px = X + rx + offset; + pdy = DY + rx + offset; + float dg[TM] = 0; + float db[TM] = 0; + for(int i = 0; i < DHWN; i = i + TM){ + float x[TM] = *px; + float dy[TM] = *pdy; + dg = dg + dy*(x - mean)*rstd; + db = db + dy; + px = px + TM; + pdy = pdy + TM; + } + float sdg = dg[+]; + float sdb = db[+]; + float *pdg = DG + c; + float *pdb = DB + c; + *pdg = sdg; + *pdb = sdb; + px = X + rx + offset; + pdy = DY + rx + offset; + pdx = DX + rx + offset; + for(int i = 0; i < DHWN; i = i + TM){ + float x[TM] = *px; + float dy[TM] = *pdy; + float xhat[TM] = (x - mean) * rstd; + float xtmp[TM] = (xhat * dg + db) * rcpDHWN; + float dx[TM] = (dy - xtmp) * rstd * g; + *pdx = dx; + px = px + TM; + pdy = pdy + TM; + pdx = pdx + TM; + } +} +""" + + bwd_kernel = triton.kernel(bwd_src, ['DX', 'DG', 'DB']) + @staticmethod def forward(ctx, x, gamma, beta, eps): shape = triton.shape(x) @@ -63,13 +115,29 @@ void batchnormForward(float *Y, float *M, float *V, var = triton.empty([C], dtype=dtype) # execute kernels N = H*W*B - _batchnorm.fwd_kernel(y, mean, var, x, gamma, beta, N, 1./N, eps, + y, mean, var = _batchnorm.fwd_kernel(y, mean, var, x, gamma, beta, N, 1./N, eps, lambda opt: [1, C], TM = 128) # save ctx.eps = eps ctx.save_for_backward(x, gamma, beta, mean, var) - return y, mean, var + return y + @staticmethod + def backward(ctx, dy): + eps = ctx.eps + x, gamma, beta, mean, var = ctx.saved_tensors + dx = triton.empty(x.shape, dtype=x.dtype) + dgamma = triton.empty(gamma.shape, dtype=gamma.dtype) + dbeta = triton.empty(beta.shape, dtype=beta.dtype) + # launch + C, H, W, B = x.shape + N = H*W*B + _batchnorm.bwd_kernel(dx, dgamma, dbeta, dy, + x, gamma, mean, var, + N, 1./N, eps, + lambda opt: [1, C], + TM = 128) + return dx, dgamma, dbeta, None batchnorm = _batchnorm.apply \ No newline at end of file diff --git a/python/triton/utils.py b/python/triton/utils.py index eca9f665e..e55afd602 100644 --- a/python/triton/utils.py +++ b/python/triton/utils.py @@ -7,17 +7,16 @@ def cdiv(a, b): class tf_empty_proxy: - def __init__(self, args, dtype): - self.args = args + def __init__(self, shape, dtype): + self.shape = shape self.dtype = dtype -def empty(shapes, dtype): +def empty(shape, dtype): if fw.has_tensorflow(): - #return fw.tensorflow.Variable(np.empty(shapes),shape=shapes, dtype=dtype) - args = [x.handle if isinstance(x, scalar) else fw.tensorflow.constant(x) for x in shapes] - args = fw.tensorflow.stack(args) - #return tf_empty_proxy(args, dtype) - return fw.tf_extra_ops.alloc_empty(args, T = dtype) + shape = [x.handle if isinstance(x, scalar) else fw.tensorflow.constant(x) for x in shape] + shape = fw.tensorflow.stack(shape) + return tf_empty_proxy(shape, dtype) + #return fw.tf_extra_ops.alloc_empty(args, T = dtype) elif fw.has_torch(): return fw.torch.empty(*shapes).cuda() From f4fcaf84df9ede3db75ade8fd13aa11d35193aad Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 30 Oct 2019 01:49:14 -0400 Subject: [PATCH 477/494] [PYTHON][EXAMPLES] Added example for batchnorm --- python/examples/batchnorm.py | 55 ++++++++++++++++++++++++++++++++++++ python/triton/utils.py | 2 +- 2 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 python/examples/batchnorm.py diff --git a/python/examples/batchnorm.py b/python/examples/batchnorm.py new file mode 100644 index 000000000..673c8ec2c 
--- /dev/null +++ b/python/examples/batchnorm.py @@ -0,0 +1,55 @@ +import triton +import numpy as np +from enum import Enum + +class MODE(Enum): + TF = 1 + TORCH = 2 + +try: + import tensorflow as tf + mode = MODE.TF +except ModuleNotFoundError: + pass + +try: + import torch + mode = MODE.TORCH +except ModuleNotFoundError: + pass + + +C, H, W, B = 32, 1, 1, 128 + +x = np.random.uniform(-1, 1, (C, H, W, B)).astype(np.float32) +gamma = np.random.uniform(-1, 1, C).astype(np.float32) +beta = np.random.uniform(-1, 1, C).astype(np.float32) +dy = np.random.uniform(-1, 1, (C, H, W, B)).astype(np.float32) + +if mode == MODE.TORCH: + fw_x = torch.from_numpy(x).cuda() + fw_gamma = torch.from_numpy(gamma).cuda() + fw_beta = torch.from_numpy(beta).cuda() + fw_dy = torch.from_numpy(dy).cuda() + # register gradients + fw_x.requires_grad_(True) + fw_gamma.requires_grad_(True) + fw_beta.requires_grad_(True) + # execute + fw_y = triton.ops.batchnorm(fw_x, fw_gamma, fw_beta, 1e-4) + fw_y.backward(fw_dy) + +if mode == MODE.TF: + fw_x = tf.placeholder(shape=x.shape, dtype=x.dtype) + fw_gamma = tf.placeholder(shape=gamma.shape, dtype=gamma.dtype) + fw_beta = tf.placeholder(shape=beta.shape, dtype=beta.dtype) + fw_dy = tf.placeholder(shape=dy.shape, dtype=dy.dtype) + # execute + fw_y = triton.ops.batchnorm(fw_x, fw_gamma, fw_beta, 1e-4) + #fw_dx, fw_dgamma, fw_dbeta = tf.gradients(fw_y, [fw_x, fw_gamma, fw_beta]) + sess = tf.InteractiveSession() + feed_dict = {fw_x: x, fw_gamma: gamma, fw_beta: beta, fw_dy: dy} + sess.run(tf.global_variables_initializer()) + #print(fw_dx, fw_dgamma, fw_dbeta) + result = sess.run([fw_y], feed_dict=feed_dict) + print(result) diff --git a/python/triton/utils.py b/python/triton/utils.py index e55afd602..17534abb7 100644 --- a/python/triton/utils.py +++ b/python/triton/utils.py @@ -18,7 +18,7 @@ def empty(shape, dtype): return tf_empty_proxy(shape, dtype) #return fw.tf_extra_ops.alloc_empty(args, T = dtype) elif fw.has_torch(): - return fw.torch.empty(*shapes).cuda() + return fw.torch.empty(*shape).cuda() class lazy_shape: From bf3dc63858faca9922a48bb65461d2e36d00889d Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 30 Oct 2019 10:37:30 -0400 Subject: [PATCH 478/494] [PYTHON] Removed dead code for alloc_empty and register_scalar --- CMakeLists.txt | 12 ---- python/examples/einsum.py | 2 +- python/src/tensorflow/alloc_empty.cc | 39 ------------- python/src/tensorflow/register_scalar.cc | 37 ------------- python/triton/frameworks.py | 13 ----- python/triton/kernel.py | 2 +- python/triton/utils.py | 70 +----------------------- 7 files changed, 3 insertions(+), 172 deletions(-) delete mode 100644 python/src/tensorflow/alloc_empty.cc delete mode 100644 python/src/tensorflow/register_scalar.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 717bbe144..bdb9e1ce7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -34,18 +34,6 @@ if(BUILD_PYTHON_MODULE) # PyBind11 wrapper source file file(GLOB_RECURSE PYTHON_SRC python/src/bindings.cc) include_directories(python/src/ ${PYTHON_INCLUDE_DIRS}) - if(TF_LIBS) - # extra tensorflow ops (e.g., alloc_empty) - # update directories - link_directories(${TF_LIB_DIRS}) - include_directories(${TF_INCLUDE_DIRS}) - # get sources - file(GLOB_RECURSE EXTRA_TF_OPS_SRC python/src/tensorflow/*.cc) - add_library(extra_tf_ops SHARED ${EXTRA_TF_OPS_SRC}) - # create target - target_link_libraries(extra_tf_ops triton ${TF_LIBS}) - target_compile_definitions(extra_tf_ops PRIVATE "-D_GLIBCXX_USE_CXX11_ABI=${TF_ABI}") - endif() endif() diff --git 
a/python/examples/einsum.py b/python/examples/einsum.py index a8ec95435..8c3327e5a 100644 --- a/python/examples/einsum.py +++ b/python/examples/einsum.py @@ -24,7 +24,7 @@ cases = [] # Matmul cases += [[[4, 1024, 1024], [1024, 1024], [4, 1024, 1024], "btc,ck->btk"]] # Attention -cases += [[[4, 256, 8, 2, 64], [8, 2, 512, 64], [4, 256, 8, 2, 512], "bchak,hank->bchan"]] +# cases += [[[4, 256, 8, 2, 64], [8, 2, 512, 64], [4, 256, 8, 2, 512], "bchak,hank->bchan"]] if mode == MODE.TF: sess = tf.InteractiveSession() diff --git a/python/src/tensorflow/alloc_empty.cc b/python/src/tensorflow/alloc_empty.cc deleted file mode 100644 index a9c97b1d5..000000000 --- a/python/src/tensorflow/alloc_empty.cc +++ /dev/null @@ -1,39 +0,0 @@ -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" - -using namespace tensorflow; - -class AllocEmptyOp : public OpKernel { - public: - explicit AllocEmptyOp(OpKernelConstruction* context) : OpKernel(context) {} - - void Compute(OpKernelContext* context) override { - std::cout << "executing allocempty" << std::endl; - // fetch input - const Tensor& x = context->input(0); - const int32* x_data = (const int32*)x.tensor_data().data(); - // allocate output - Tensor* y = NULL; - int32 x_rank = x.dims(); - OP_REQUIRES(context, x_rank == 1, errors::InvalidArgument("Input tensor must be 1D")); - int32 y_rank = x.dim_size(0); - TensorShape y_shapes; - for(size_t i = 0; i < y_rank; i++) - y_shapes.AddDim(x_data[i]); - OP_REQUIRES_OK(context, context->allocate_output(0, y_shapes, &y)); - } -}; - - -REGISTER_KERNEL_BUILDER(Name("AllocEmpty").HostMemory("x").Device(DEVICE_CPU).Device(DEVICE_GPU), AllocEmptyOp); -REGISTER_OP("AllocEmpty") - .Input("x: int32") - .Attr("T : {bool, int8, int16, int32, int64, float16, float32, float64}") - .Output("y: T") - .SetShapeFn([](shape_inference::InferenceContext* c) { - shape_inference::ShapeHandle handle; - c->MakeShapeFromShapeTensor(0, &handle); - c->set_output(0, handle); - return Status::OK(); - }); -; diff --git a/python/src/tensorflow/register_scalar.cc b/python/src/tensorflow/register_scalar.cc deleted file mode 100644 index 95eb3631f..000000000 --- a/python/src/tensorflow/register_scalar.cc +++ /dev/null @@ -1,37 +0,0 @@ -#include -#include "tensorflow/core/framework/op_kernel.h" - -using namespace tensorflow; - -extern std::map i64scalar_map; - -class RegisterScalarOp : public OpKernel { -public: - explicit RegisterScalarOp(OpKernelConstruction* context) - : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("id", &id_)); - } - - void Compute(OpKernelContext* context) override { - // fetch input - const Tensor& x = context->input(0); - const int32* x_data = (const int32*)x.tensor_data().data(); - const int32 x_rank = x.dims(); - OP_REQUIRES(context, x_rank == 0, errors::InvalidArgument("Input must be a scalar")); - i64scalar_map[id_] = *x_data; - context->set_output(0, x); - } - -private: - int id_; -}; - - -REGISTER_KERNEL_BUILDER(Name("RegisterScalar") - .HostMemory("x") - .Device(DEVICE_CPU), RegisterScalarOp); -REGISTER_OP("RegisterScalar") - .Input("x: int32") - .Output("y: int32") - .Attr("id: int") -; diff --git a/python/triton/frameworks.py b/python/triton/frameworks.py index f495680f0..9385bc212 100644 --- a/python/triton/frameworks.py +++ b/python/triton/frameworks.py @@ -4,8 +4,6 @@ import triton._C.libtriton as libtriton torch = None tensorflow = None -tf_extra_ops = None -gen_resource_variable_ops = None def _import_torch(): global torch @@ -14,24 +12,13 @@ 
def _import_torch(): def _import_tensorflow(): global tensorflow - global gen_resource_variable_ops if tensorflow is None: import tensorflow - from tensorflow.python.ops import gen_resource_variable_ops - -def _import_tf_extra_ops(): - global tf_extra_ops - if tf_extra_ops is None: - path = os.path.dirname(libtriton.__file__) - path = os.path.join(path, 'libextra_tf_ops.so') - _import_tensorflow() - tf_extra_ops = tensorflow.load_op_library(path) def has_tensorflow(): result = 'tensorflow' in sys.modules if result: _import_tensorflow() - _import_tf_extra_ops() return result def has_torch(): diff --git a/python/triton/kernel.py b/python/triton/kernel.py index 60964abdc..aa0246289 100644 --- a/python/triton/kernel.py +++ b/python/triton/kernel.py @@ -219,7 +219,7 @@ class kernel: # retrieve framework op op_id = self.fw_id[key] # register grid - libtriton.register_grid(op_id, _make_grid(args)) + libtriton.register_grid(op_id, args[-1]) # id for the benchmark result bench_id = libtriton.make_scalar_id() if bench > 0 else -1 # call framework function diff --git a/python/triton/utils.py b/python/triton/utils.py index 17534abb7..231a35af4 100644 --- a/python/triton/utils.py +++ b/python/triton/utils.py @@ -13,90 +13,22 @@ class tf_empty_proxy: def empty(shape, dtype): if fw.has_tensorflow(): - shape = [x.handle if isinstance(x, scalar) else fw.tensorflow.constant(x) for x in shape] + shape = [fw.tensorflow.constant(x) for x in shape] shape = fw.tensorflow.stack(shape) return tf_empty_proxy(shape, dtype) #return fw.tf_extra_ops.alloc_empty(args, T = dtype) elif fw.has_torch(): return fw.torch.empty(*shape).cuda() -class lazy_shape: - - def __init__(self, shape): - self.shape = shape - - def __getitem__(self, key): - return scalar(self.shape[key]) - def shape(A) : if fw.has_tensorflow(): return A.shape.as_list() - #return lazy_shape(fw.tensorflow.shape(A)) elif fw.has_torch(): return A.shape else: assert False -class scalar: - - def __init__(self, x): - self.id = libtriton.make_scalar_id() - self.handle = fw.tf_extra_ops.register_scalar(x, id=self.id) - self.assume_initialized = False - - def set_assume_initialized(self): - self.assume_initialized = True - - def unset_assume_initialized(self): - self.assume_initialized = False - - def get_value(self): - if self.assume_initialized: - return libtriton.retrieve_scalar(self.id) - else: - return self.handle - - def __add__(self, other): - return self.get_value() + other - - def __radd__(self, other): - return other + self.get_value() - - def __sub__(self, other): - return self.get_value() - other - - def __rsub(self, other): - return other - self.get_value() - - def __mul__(self, other): - return self.get_value() * other - - def __rmul(self, other): - return other * self.get_value() - - def __floordiv__(self, other): - return self.get_value() // other - - def __rfloordiv__(self, other): - return other // self.get_value() - - def __div__(self, other): - return self.get_value() / other - - def __rdiv__(self, other): - return other / self.get_value() - - def __truediv__(self, other): - self.get_value().__truediv__(other) - - def __rtruediv__(self, other): - other.__truediv__(self.get_value()) - - def __neg__(self): - return -self.get_value() - - class id_dict: # Lazy entry for e.g., tensorflow, when value of benchmark is From 9b0f1a0807b051afdbd525e94e75a5aa21901d33 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 30 Oct 2019 13:44:31 -0400 Subject: [PATCH 479/494] more stuff --- python/examples/batchnorm.py | 10 +- python/src/bindings.cc | 5 +- 
python/triton/function.py | 10 +- python/triton/ops/batchnorm.py | 172 +++++++++++++++------------------ 4 files changed, 91 insertions(+), 106 deletions(-) diff --git a/python/examples/batchnorm.py b/python/examples/batchnorm.py index 673c8ec2c..e28fd5039 100644 --- a/python/examples/batchnorm.py +++ b/python/examples/batchnorm.py @@ -45,11 +45,11 @@ if mode == MODE.TF: fw_beta = tf.placeholder(shape=beta.shape, dtype=beta.dtype) fw_dy = tf.placeholder(shape=dy.shape, dtype=dy.dtype) # execute - fw_y = triton.ops.batchnorm(fw_x, fw_gamma, fw_beta, 1e-4) - #fw_dx, fw_dgamma, fw_dbeta = tf.gradients(fw_y, [fw_x, fw_gamma, fw_beta]) + fw_mean, fw_var = tf.nn.moments(fw_x, [1, 2, 3]) + fw_y = triton.ops.batchnorm(fw_x, fw_mean, fw_var, fw_gamma, fw_beta, 1e-4) + fw_dx, fw_dgamma, fw_dbeta = tf.gradients(fw_y, [fw_x, fw_gamma, fw_beta], fw_dy) + # run sess = tf.InteractiveSession() feed_dict = {fw_x: x, fw_gamma: gamma, fw_beta: beta, fw_dy: dy} sess.run(tf.global_variables_initializer()) - #print(fw_dx, fw_dgamma, fw_dbeta) - result = sess.run([fw_y], feed_dict=feed_dict) - print(result) + result = sess.run([fw_dx, fw_dgamma, fw_dbeta], feed_dict=feed_dict) \ No newline at end of file diff --git a/python/src/bindings.cc b/python/src/bindings.cc index b3b74b37b..15386ecc1 100644 --- a/python/src/bindings.cc +++ b/python/src/bindings.cc @@ -192,18 +192,19 @@ void gen_tf_register_kernel_builder(std::ostream &os, const std::string &name, const std::string &opname, const std::vector& args, const std::vector& outputs){ + + auto tolower = [](char c) { return std::tolower(c);}; os << "REGISTER_KERNEL_BUILDER(Name(\"" + name + "\").Device(DEVICE_GPU)"; for(size_t i = 0; i < args.size(); i++){ ir::argument *arg = args[i]; std::string name = arg->get_name(); - auto tolower = [](char c) { return std::tolower(c);}; std::transform(name.begin(), name.end(), name.begin(), tolower); if(!arg->get_type()->is_pointer_ty()) os << ".HostMemory(\"" + name + "\")"; } for(size_t i = 0; i < outputs.size(); i++){ std::string name = outputs[i]; - name[0] = std::tolower(name[0]); + std::transform(name.begin(), name.end(), name.begin(), tolower); os << ".HostMemory(\"" << name << "_shape\")"; } os << ", " + opname << ");\n"; diff --git a/python/triton/function.py b/python/triton/function.py index e75512b1b..7eba7f9a7 100644 --- a/python/triton/function.py +++ b/python/triton/function.py @@ -45,6 +45,7 @@ class function(metaclass = function_meta): def apply_tensorflow(cls, *args, **kwargs): ctx = OpContext() result = cls.forward(ctx, *args, **kwargs) + op = result[0].op if isinstance(result, tuple) else result.op # Find a mapping between ::forward arguments and tensorflow op arguments remap = dict() for i, ix in enumerate(result.op.inputs): @@ -52,13 +53,12 @@ class function(metaclass = function_meta): if ix is jx: remap[j] = i # register backward - ctx_registry[result] = ctx - name = result.op.op_def.name + ctx_registry[op] = ctx + name = op.op_def.name if not cls.registered: @fw.tensorflow.RegisterGradient(name) - def gradient(op, dy): - y = op.outputs[0] - grad = cls.backward(ctx_registry[y], dy) + def gradient(op, *dys): + grad = cls.backward(ctx_registry[op], dys if len(dys) > 1 else dys[0]) # Remap gradient in the right order ret = [None] * len(op.inputs) for i in range(len(grad)): diff --git a/python/triton/ops/batchnorm.py b/python/triton/ops/batchnorm.py index 5e352d93a..fb6d94017 100644 --- a/python/triton/ops/batchnorm.py +++ b/python/triton/ops/batchnorm.py @@ -6,138 +6,122 @@ class _batchnorm(triton.function): 
fwd_src = """ void fwdbatchnorm(float *Y, float *M, float *V, float *X, float *G, float *B, - int N, float rcpN, float eps) { - int rx[TM] = 0 ... TM; - float *px[TM]; - float x[TM] = 0; + int N, float eps) { + // pointers int c = get_program_id(1); - float g = *(G + c); - float b = *(B + c); + int rm[TM] = 0 ... TM; + float *px[TM] = X + rm + c*N; + float* py[TM] = Y + rm + c*N; - float mean[TM] = 0; - px = X + rx + c*N; + // compute mean + float accm[TM] = 0; + for(int i = 0; i < N; i = i + TM) + accm = accm + *(px + i); + float mean = (float)accm[+] / N; + *(M + c) = mean; + + // compute variance + float accv[TM] = 0; for(int i = 0; i < N; i = i + TM){ - x = *px; - mean = mean + x; - px = px + TM; + float x[TM] = *(px + i); + x = x - mean; + accv = accv + x*x; } - float *pm = M + c; - float m = mean[+] * rcpN; - *pm = m; + float var = (float)accv[+] / N; + *(V + c) = var; - float var[TM] = 0; - px = X + rx + c*N; + // Normalize batch + float gamma = *(G + c); + float beta = *(B + c); + float rstdg = 1 / sqrtf(var + eps) * gamma; for(int i = 0; i < N; i = i + TM){ - x = *px; - x = x - m; - var = var + x*x; - px = px + TM; - } - float v = var[+] * rcpN; - float *pv = V + c; - *pv = v; - float rstdg = 1 / sqrtf(v + eps) * g; - - px = X + rx + c*N; - float* py[TM] = Y + rx + c*N; - for(int i = 0; i < N; i = i + TM){ - x = *px; - float y[TM] = (x - m)*rstdg + b; - *py = y; - px = px + TM; - py = py + TM; + float x[TM] = *(px + i); + float y[TM] = (x - mean)*rstdg + beta; + *(py + i) = y; } } """ - - fwd_kernel = triton.kernel(fwd_src, ['Y', 'M', 'V']) + fwd_kernel = triton.kernel(fwd_src, ['Y']) bwd_src = """ -void batchnormBackward(float *DX, float *DG, float *DB, - float *DY, float *X, float *G, - float *M, float *V, - int DHWN, float rcpDHWN, float epsilon) { - int rx[TM] = 0 ... TM; +void bwdbatchnorm(float *DX, float *DG, float *DB, + float *DY, float *X, float *G, + float *M, float *V, + int N, float epsilon) { + + // pointers int c = get_program_id(1); - int offset = c*DHWN; - float g = *(G + c); + int rx[TM] = 0 ... 
TM; + int offset = c*N; + float* px[TM] = X + rx + offset; + float* pdy[TM] = DY + rx + offset; + float* pdx[TM] = DX + rx + offset; + + // fetch statistics + float gamma = *(G + c); float mean = *(M + c); float var = *(V + c); float rstd = 1 / sqrtf(var + epsilon); - float* px[TM]; - float* pdx[TM]; - float* pdy[TM]; - px = X + rx + offset; - pdy = DY + rx + offset; - float dg[TM] = 0; - float db[TM] = 0; - for(int i = 0; i < DHWN; i = i + TM){ - float x[TM] = *px; - float dy[TM] = *pdy; - dg = dg + dy*(x - mean)*rstd; - db = db + dy; - px = px + TM; - pdy = pdy + TM; + + // compute dgamma and dbeta + float acc_dg[TM] = 0; + float acc_db[TM] = 0; + for(int i = 0; i < N; i = i + TM){ + float x[TM] = *(px + i); + float dy[TM] = *(pdy + i); + acc_dg += dy*(x - mean)*rstd; + acc_db += dy; } - float sdg = dg[+]; - float sdb = db[+]; - float *pdg = DG + c; - float *pdb = DB + c; - *pdg = sdg; - *pdb = sdb; - px = X + rx + offset; - pdy = DY + rx + offset; - pdx = DX + rx + offset; - for(int i = 0; i < DHWN; i = i + TM){ - float x[TM] = *px; - float dy[TM] = *pdy; + float dg = acc_dg[+]; + float db = acc_db[+]; + *(DG + c) = dg; + *(DB + c) = db; + + // compute dx + for(int i = 0; i < N; i = i + TM){ + float x[TM] = *(px + i); + float dy[TM] = *(pdy + i); float xhat[TM] = (x - mean) * rstd; - float xtmp[TM] = (xhat * dg + db) * rcpDHWN; - float dx[TM] = (dy - xtmp) * rstd * g; - *pdx = dx; - px = px + TM; - pdy = pdy + TM; - pdx = pdx + TM; + float xtmp[TM] = (xhat * dg + db) / N; + float dx[TM] = (dy - xtmp) * rstd * gamma; + *(pdx + i) = dx; } } """ - bwd_kernel = triton.kernel(bwd_src, ['DX', 'DG', 'DB']) @staticmethod - def forward(ctx, x, gamma, beta, eps): + def forward(ctx, x, mean, var, gamma, beta, eps): shape = triton.shape(x) dtype = x.dtype # allocate outputs C, H, W, B = shape[0], shape[1], shape[2], shape[3] y = triton.empty(shape, dtype=dtype) - mean = triton.empty([C], dtype=dtype) - var = triton.empty([C], dtype=dtype) # execute kernels - N = H*W*B - y, mean, var = _batchnorm.fwd_kernel(y, mean, var, x, gamma, beta, N, 1./N, eps, - lambda opt: [1, C], - TM = 128) + y = _batchnorm.fwd_kernel(y, mean, var, x, gamma, beta, H*W*B, eps, + lambda opt: [1, C], + TM = 128) # save - ctx.eps = eps ctx.save_for_backward(x, gamma, beta, mean, var) + ctx.eps = eps return y @staticmethod def backward(ctx, dy): - eps = ctx.eps + # retrieve info x, gamma, beta, mean, var = ctx.saved_tensors - dx = triton.empty(x.shape, dtype=x.dtype) - dgamma = triton.empty(gamma.shape, dtype=gamma.dtype) - dbeta = triton.empty(beta.shape, dtype=beta.dtype) - # launch - C, H, W, B = x.shape - N = H*W*B - _batchnorm.bwd_kernel(dx, dgamma, dbeta, dy, + eps = ctx.eps + # allocate result + dx = triton.empty(triton.shape(x), dtype=x.dtype) + dgamma = triton.empty(triton.shape(gamma), dtype=gamma.dtype) + dbeta = triton.empty(triton.shape(beta), dtype=beta.dtype) + # execute + C, H, W, B = triton.shape(x) + dx, dgamma, dbeta = _batchnorm.bwd_kernel(dx, dgamma, dbeta, dy, x, gamma, mean, var, - N, 1./N, eps, + H*W*B, eps, lambda opt: [1, C], TM = 128) - return dx, dgamma, dbeta, None + return dx, None, None, dgamma, dbeta, None batchnorm = _batchnorm.apply \ No newline at end of file From fd09f9c99d2d47fe100463ad1a6ce1d60ae4dbef Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 30 Oct 2019 13:48:55 -0400 Subject: [PATCH 480/494] fixup --- python/triton/ops/batchnorm.py | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/python/triton/ops/batchnorm.py 
b/python/triton/ops/batchnorm.py index fb6d94017..f21faabdb 100644 --- a/python/triton/ops/batchnorm.py +++ b/python/triton/ops/batchnorm.py @@ -13,22 +13,9 @@ void fwdbatchnorm(float *Y, float *M, float *V, float *px[TM] = X + rm + c*N; float* py[TM] = Y + rm + c*N; - // compute mean - float accm[TM] = 0; - for(int i = 0; i < N; i = i + TM) - accm = accm + *(px + i); - float mean = (float)accm[+] / N; - *(M + c) = mean; - - // compute variance - float accv[TM] = 0; - for(int i = 0; i < N; i = i + TM){ - float x[TM] = *(px + i); - x = x - mean; - accv = accv + x*x; - } - float var = (float)accv[+] / N; - *(V + c) = var; + // fetch mean/var + float mean = *(M + c); + float var = *(V + c); // Normalize batch float gamma = *(G + c); From e0fe8d9058fb0dd100da23bf266279d8e73c097b Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 30 Oct 2019 18:39:58 -0400 Subject: [PATCH 481/494] [PYTHON][TENSORFLOW] More work --- python/examples/batchnorm.py | 5 +- python/src/bindings.cc | 181 +++++++++++++++++++-------------- python/triton/function.py | 32 ++++-- python/triton/kernel.py | 25 +++-- python/triton/ops/batchnorm.py | 40 +++++--- python/triton/utils.py | 5 + 6 files changed, 184 insertions(+), 104 deletions(-) diff --git a/python/examples/batchnorm.py b/python/examples/batchnorm.py index e28fd5039..a69d127c4 100644 --- a/python/examples/batchnorm.py +++ b/python/examples/batchnorm.py @@ -45,11 +45,12 @@ if mode == MODE.TF: fw_beta = tf.placeholder(shape=beta.shape, dtype=beta.dtype) fw_dy = tf.placeholder(shape=dy.shape, dtype=dy.dtype) # execute + fw_y = triton.ops.batchnorm(fw_x, fw_gamma, fw_beta, 1e-4) fw_mean, fw_var = tf.nn.moments(fw_x, [1, 2, 3]) - fw_y = triton.ops.batchnorm(fw_x, fw_mean, fw_var, fw_gamma, fw_beta, 1e-4) fw_dx, fw_dgamma, fw_dbeta = tf.gradients(fw_y, [fw_x, fw_gamma, fw_beta], fw_dy) # run sess = tf.InteractiveSession() feed_dict = {fw_x: x, fw_gamma: gamma, fw_beta: beta, fw_dy: dy} sess.run(tf.global_variables_initializer()) - result = sess.run([fw_dx, fw_dgamma, fw_dbeta], feed_dict=feed_dict) \ No newline at end of file + result = sess.run([fw_dx, fw_dgamma, fw_dbeta], feed_dict=feed_dict) + print(result) \ No newline at end of file diff --git a/python/src/bindings.cc b/python/src/bindings.cc index 15386ecc1..7fb4a29f0 100644 --- a/python/src/bindings.cc +++ b/python/src/bindings.cc @@ -112,51 +112,70 @@ inline std::string ref_to_tf_ty(ir::type *ty) { return res; } +std::string tf_normalize(const std::string& name) { + std::string ret = name; + auto tolower = [](char c) { return std::tolower(c);}; + std::transform(ret.begin(), ret.end(), ret.begin(), tolower); + return ret; +} -void gen_extract_inputs(std::ostream &os, const std::vector& args, const std::vector& outputs) { +struct tf_alloc_t{ + enum type_t{ + OUTPUT, + TEMP + }; + + tf_alloc_t(const std::string& _name, type_t _type) + : name(_name), type(_type), tf_name(tf_normalize(_name)){ } + + std::string tf_name; + std::string name; + type_t type; + size_t shape_id; +}; + +typedef std::vector alloc_map_t; + + +void gen_extract_inputs(std::ostream &os, const std::vector& args, const alloc_map_t& allocs) { for(unsigned i = 0; i < args.size(); i++){ ir::value *arg = args[i]; const std::string& name = arg->get_name(); std::string ty = to_tf_ty(arg->get_type()); if(!arg->get_type()->is_pointer_ty()) os << " " << ty << " " << name << " = context->input(" << i << ").scalar<" << ty << ">()();\n "; - else if(std::find(outputs.begin(), outputs.end(), arg->get_name()) == outputs.end()) + else 
if(std::find_if(allocs.begin(), allocs.end(), + [&](tf_alloc_t x) { + return x.name == name; + }) == allocs.end()) os << " const Tensor* " << name << " = &context->input(" << i << ");\n "; else os << " Tensor* " << name << " = nullptr;\n "; } } -void gen_set_outputs(std::ostream &os, const std::vector& args, const std::vector& outputs) { - for(unsigned i = 0; i < outputs.size(); i++) - os << " TensorShape shape" << i << ";\n "; +void gen_set_outputs(std::ostream &os, const std::vector& args, const alloc_map_t& allocs) { // initialize shapes - - std::vector out_idx; - for(size_t i = 0; i < outputs.size(); i++){ - std::string name = outputs[i]; - size_t idx; - for(idx = 0; idx < args.size(); idx++) - if(args[idx]->get_name() == name) - break; - if(idx == args.size()) - throw std::runtime_error("unknown output"); - out_idx.push_back(idx); - } - - for(unsigned i = 0; i < outputs.size(); i++) - os << " const Tensor& " << outputs[i] << "_shape = context->input(" << out_idx[i] << ");\n "; - for(unsigned i = 0; i < outputs.size(); i++) - os << " const int32* " << outputs[i] << "_shape_data = (const int32*)" << outputs[i] << "_shape.tensor_data().data();\n "; - for(unsigned i = 0; i < outputs.size(); i++) - os << " size_t " << outputs[i] << "_rank = " << outputs[i] << "_shape.dim_size(0);\n "; - for(unsigned i = 0; i < outputs.size(); i++) - os << " for(size_t d = 0; d < " << outputs[i] << "_rank ; d++) " - << "shape" << i << ".AddDim(" << outputs[i] << "_shape_data[d]);\n "; + for(const auto& x: allocs) + os << " TensorShape " << x.name << "_shape;\n "; + for(const auto& x: allocs) + os << " const Tensor& " << x.name << "_shape_tensor = context->input(" << x.shape_id << ");\n "; + for(const auto& x: allocs) + os << " const int32* " << x.name << "_shape_data = (const int32*)" << x.name << "_shape_tensor.tensor_data().data();\n "; + for(const auto& x: allocs) + os << " size_t " << x.name << "_rank = " << x.name << "_shape_tensor.dim_size(0);\n "; + for(const auto& x: allocs) + os << " for(size_t d = 0; d < " << x.name << "_rank ; d++) " + << x.name << "_shape.AddDim(" << x.name << "_shape_data[d]);\n "; // allocate - for(unsigned i = 0; i < outputs.size(); i++) - os << " OP_REQUIRES_OK(context, context->allocate_output(" << i << ", shape" << i << ", &" << outputs[i] << "));\n "; + int output = 0; + for(const auto& x: allocs){ + if(x.type == tf_alloc_t::OUTPUT) + os << " OP_REQUIRES_OK(context, context->allocate_output(" << output++ << ", " << x.name << "_shape, &" << x.name << "));\n "; + else + os << " OP_REQUIRES_OK(context, context->allocate_temp(" << x.name << "_type, " << x.name << "_shape, " << x.name << "));\n "; + } } void gen_make_handles(std::ostream &os, const std::vector& args) { @@ -169,7 +188,7 @@ void gen_make_handles(std::ostream &os, const std::vector& args) } } -void gen_make_launch_function(std::ostream &os, int num_outputs, const std::vector& args) { +void gen_make_launch_function(std::ostream &os, const std::vector& args) { os << " std::function run = [&](){\n "; os << " (*id_fn_map.at(id_))({"; for(unsigned i = 0; i < args.size() ; i++){ @@ -181,9 +200,9 @@ void gen_make_launch_function(std::ostream &os, int num_outputs, const std::vect os << ", "; os << name; } - os << "}, *id_grid_map.at(id_), stream);\n"; + os << "}, *id_grid_map.at(id_), stream);\n "; os << " };\n "; - os << " run();"; + os << " run();\n "; os << " if(bench_ > 0)\n "; os << " i64scalar_map[bench_id_] = triton::tools::bench(run, stream);\n "; } @@ -191,69 +210,53 @@ void gen_make_launch_function(std::ostream 
&os, int num_outputs, const std::vect void gen_tf_register_kernel_builder(std::ostream &os, const std::string &name, const std::string &opname, const std::vector& args, - const std::vector& outputs){ + const alloc_map_t& allocs){ - auto tolower = [](char c) { return std::tolower(c);}; os << "REGISTER_KERNEL_BUILDER(Name(\"" + name + "\").Device(DEVICE_GPU)"; for(size_t i = 0; i < args.size(); i++){ ir::argument *arg = args[i]; - std::string name = arg->get_name(); - std::transform(name.begin(), name.end(), name.begin(), tolower); + std::string name = tf_normalize(arg->get_name()); if(!arg->get_type()->is_pointer_ty()) os << ".HostMemory(\"" + name + "\")"; } - for(size_t i = 0; i < outputs.size(); i++){ - std::string name = outputs[i]; - std::transform(name.begin(), name.end(), name.begin(), tolower); - os << ".HostMemory(\"" << name << "_shape\")"; - } + for(const auto& x: allocs) + os << ".HostMemory(\"" << x.tf_name << "_shape\")"; os << ", " + opname << ");\n"; } void gen_tf_register_op(std::ostream &os, const std::string &name, const std::vector& args, - const std::vector& outputs){ + const alloc_map_t& allocs){ - auto tolower = [](char c) { return std::tolower(c);}; os << "REGISTER_OP(\"" << name << "\")\n"; for(size_t i = 0; i < args.size(); i++) os << " .Attr(\"T" << i << " : {bool, int8, int16, int32, int64, float16, float32, float64}\")" << std::endl; for(size_t i = 0; i < args.size(); i++){ ir::argument *arg = args[i]; - std::string name = arg->get_name(); - std::transform(name.begin(), name.end(), name.begin(), tolower); - if(std::find(outputs.begin(), outputs.end(), arg->get_name()) == outputs.end()) + std::string name = tf_normalize(arg->get_name()); + if(std::find_if(allocs.begin(), allocs.end(), + [&](tf_alloc_t x) { + return name == x.tf_name; + }) == allocs.end()) os << " .Input(\"" << name << ": T" << i << "\")\n"; else os << " .Input(\"" << name << "_shape: int32\")\n"; } - std::vector out_idx; - for(size_t i = 0; i < outputs.size(); i++){ - std::string name = outputs[i]; - size_t idx; - for(idx = 0; idx < args.size(); idx++) - if(args[idx]->get_name() == name) - break; - if(idx == args.size()) - throw std::runtime_error("unknown output"); - out_idx.push_back(idx); - } - for(size_t i = 0; i < out_idx.size(); i++){ - std::string name = outputs[i]; - std::transform(name.begin(), name.end(), name.begin(), tolower); - os << " .Output(\"" << name << ": T" << out_idx[i] << "\")\n"; - } + for(const auto& x: allocs) + if(x.type == tf_alloc_t::OUTPUT) + os << " .Output(\"" << x.tf_name << ": T" << x.shape_id << "\")\n"; os << " .Attr(\"id: int\")\n"; os << " .Attr(\"bench: int\")\n"; os << " .Attr(\"bench_id: int\")\n"; os << " .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* ctx) {\n"; - for(size_t i = 0; i < out_idx.size(); i++) - os << " shape_inference::ShapeHandle handle" << i << ";\n"; - for(size_t i = 0; i < out_idx.size(); i++) - os << " ctx->MakeShapeFromShapeTensor(" << out_idx[i] << ", &handle" << i << ");\n"; - for(size_t i = 0; i < out_idx.size(); i++) - os << " ctx->set_output(" << i << ", handle" << i << ");\n"; + size_t current = 0; + for(const auto& x: allocs) + if(x.type == tf_alloc_t::OUTPUT){ + os << " shape_inference::ShapeHandle " << x.tf_name << "_handle;\n"; + os << " ctx->MakeShapeFromShapeTensor(" << x.shape_id << ", &" << x.tf_name << "_handle);\n"; + os << " ctx->set_output(" << current++ << ", " << x.tf_name << "_handle);\n"; + } os << " return Status::OK();\n"; os << " })\n"; @@ -280,6 +283,7 @@ void make_module(const std::string& 
src, ir::module* ir, std::tuple make_tensorflow_src(const std::string& src, const std::vector& outputs, + const std::vector& tmp, const runtime::function::options_space_t& opt) { // triton-ir code-gen @@ -289,10 +293,28 @@ std::tupleget_function_list().front(); + const std::vector& args = fn->args(); std::string name = fn->get_name(); std::string cc_name = name; cc_name[0] = static_cast(std::toupper(cc_name[0])); std::string opname = cc_name + "Op"; + + // allocation info + alloc_map_t allocs; + for(size_t i = 0; i < outputs.size(); i++) + allocs.push_back(tf_alloc_t(outputs[i], tf_alloc_t::OUTPUT)); + for(size_t i = 0; i < tmp.size(); i++) + allocs.push_back(tf_alloc_t(tmp[i], tf_alloc_t::TEMP)); + + for(auto &x: allocs){ + size_t idx; + for(idx = 0; idx < args.size(); idx++) + if(args[idx]->get_name() == x.name) + break; + if(idx == args.size()) + throw std::runtime_error("unknown output"); + x.shape_id = idx; + } std::ostringstream oss; oss << R"( @@ -323,6 +345,11 @@ class )" << opname << R"(: public OpKernel { OP_REQUIRES_OK(context, context->GetAttr("id", &id_)); OP_REQUIRES_OK(context, context->GetAttr("bench", &bench_)); OP_REQUIRES_OK(context, context->GetAttr("bench_id", &bench_id_)); + )"; +for(const auto& alloc: allocs) + oss << " OP_REQUIRES_OK(context, context->GetAttr(\"T" << alloc.shape_id << "\", &" << alloc.name << "_type));\n "; + +oss << R"( } void Compute(OpKernelContext* context){ @@ -335,21 +362,21 @@ class )" << opname << R"(: public OpKernel { // extract inputs )"; -gen_extract_inputs(oss, fn->args(), outputs); +gen_extract_inputs(oss, args, allocs); oss << R"( // set outputs )"; -gen_set_outputs(oss, fn->args(), outputs); +gen_set_outputs(oss, args, allocs); oss << R"( // wrap tensors )"; -gen_make_handles(oss, fn->args()); +gen_make_handles(oss, args); oss << R"( )"; oss << R"( // launch function )"; -gen_make_launch_function(oss, outputs.size(), fn->args()); +gen_make_launch_function(oss, args); oss << R"( } @@ -357,15 +384,20 @@ private: int id_; int bench_; int64 bench_id_; + )"; +for(const auto& alloc: allocs) + oss << "DataType " << alloc.name << "_type;\n "; + +oss << R"( }; // register kernel builder )"; -gen_tf_register_kernel_builder(oss, cc_name, opname, fn->args(), outputs); +gen_tf_register_kernel_builder(oss, cc_name, opname, args, allocs); oss << R"( // register op )"; -gen_tf_register_op(oss, cc_name, fn->args(), outputs); +gen_tf_register_op(oss, cc_name, args, allocs); return {oss.str(), name}; } @@ -517,6 +549,7 @@ void gen_torch_ret(std::ostream &os, const std::vector& outputs) { std::tuple make_torch_src(const std::string& src, const std::vector& outputs, + const std::vector& tmp, const runtime::function::options_space_t& opt) { // triton-ir code-gen ir::context ctx; diff --git a/python/triton/function.py b/python/triton/function.py index 7eba7f9a7..213f9f16e 100644 --- a/python/triton/function.py +++ b/python/triton/function.py @@ -41,24 +41,43 @@ class function(metaclass = function_meta): return cls.backward(ctx, grad_output) return TorchFunction.apply(*args, **kwargs) + @classmethod + def extract_tf_tensors(cls, lst, err): + for x in lst: + if x and not isinstance(x, triton.utils.tf_empty_proxy): + raise ValueError('Results of ' + err + ' must be created using triton.empty()') + if x and x.tensor is None: + raise ValueError('Empty tensor never filled during ' + err) + return [x.tensor if x else None for x in lst] + @classmethod def apply_tensorflow(cls, *args, **kwargs): ctx = OpContext() result = cls.forward(ctx, *args, **kwargs) - op = 
result[0].op if isinstance(result, tuple) else result.op + + # check that all the results stem from triton.empty + # and get the corresponding TF tensors if possible + result = result if isinstance(result, tuple) else (result, ) + result = function.extract_tf_tensors(result, 'forward') + # Find a mapping between ::forward arguments and tensorflow op arguments + op = result[0].op remap = dict() - for i, ix in enumerate(result.op.inputs): + for i, ix in enumerate(op.inputs): for j, jx in enumerate(args): if ix is jx: remap[j] = i - # register backward + + # Register backward pass ctx_registry[op] = ctx name = op.op_def.name if not cls.registered: @fw.tensorflow.RegisterGradient(name) - def gradient(op, *dys): - grad = cls.backward(ctx_registry[op], dys if len(dys) > 1 else dys[0]) + def gradient(op, *dy): + dy = dy if len(dy) > 1 else dy[0] + grad = cls.backward(ctx_registry[op], dy) + grad = function.extract_tf_tensors(grad, 'backward') + # Remap gradient in the right order ret = [None] * len(op.inputs) for i in range(len(grad)): @@ -67,7 +86,8 @@ class function(metaclass = function_meta): # Return return ret cls.registered = True - # return result tensor + + # Return tensor return result @classmethod diff --git a/python/triton/kernel.py b/python/triton/kernel.py index aa0246289..59a797ec8 100644 --- a/python/triton/kernel.py +++ b/python/triton/kernel.py @@ -15,11 +15,11 @@ import triton.frameworks as fw import triton.utils import triton._C.libtriton as libtriton -def _make_framework_src(src, out, grid): +def _make_framework_src(src, out, tmp, grid): if fw.has_tensorflow(): - return libtriton.make_tensorflow_src(src, out, grid) + return libtriton.make_tensorflow_src(src, out, tmp, grid) elif fw.has_torch: - return libtriton.make_torch_src(src, out, grid) + return libtriton.make_torch_src(src, out, tmp, grid) else: assert False @@ -152,8 +152,8 @@ def _cvt_to_def_str(obj): return str(obj) -def _make_framework_op(src, outputs, options): - src, name = _make_framework_src(src, outputs, options) +def _make_framework_op(src, outputs, tmp, options): + src, name = _make_framework_src(src, outputs, tmp, options) cache_path = _make_cache_path(src) cpp, so = _write_bindings(src, cache_path) _build(cpp, cache_path) @@ -181,12 +181,13 @@ bench_registry = triton.utils.id_dict() class kernel: - def __init__(self, src, outputs): + def __init__(self, src, outputs, tmp=[]): self.fw_id = dict() self.fw_grids = dict() self.fw_op = None self.src = src self.outputs = outputs + self.tmp = tmp def __call__(self, *args, **kwargs): # create a new framework op when defines are different @@ -210,7 +211,7 @@ class kernel: # register function libtriton.register_fn(op_id, self.src, opt) if self.fw_op is None: - self.fw_op = _make_framework_op(self.src, self.outputs, opt) + self.fw_op = _make_framework_op(self.src, self.outputs, self.tmp, opt) # benchmarking info bench = 0 @@ -225,6 +226,7 @@ class kernel: # call framework function if fw.has_tensorflow(): # operands + outputs = [x for x in args[:-1] if isinstance(x, triton.utils.tf_empty_proxy)] operands = [x.shape if isinstance(x, triton.utils.tf_empty_proxy) else x for x in args[:-1]] # output data types kwargs = {'id': op_id, 'bench': bench, 'bench_id': bench_id} @@ -233,13 +235,16 @@ class kernel: kwargs['T' + str(i)] = x.dtype # launch ret = self.fw_op(*operands, **kwargs) + assert len(ret) == len(outputs) + # record results + for i in range(len(outputs)): + outputs[i].tensor = ret[i] if bench > 0: bench_registry[ret] = triton.utils.id_dict.lazy_entry(bench_id) elif 
fw.has_torch(): args = [x.contiguous() if isinstance(x, fw.torch.Tensor) else x for x in args[:-1]] - ret = self.fw_op(op_id, bench, bench_id, *args) + self.fw_op(op_id, bench, bench_id, *args) if bench > 0: bench_registry[ret] = libtriton.retrieve_scalar(op_id) else: - assert False - return ret \ No newline at end of file + assert False \ No newline at end of file diff --git a/python/triton/ops/batchnorm.py b/python/triton/ops/batchnorm.py index f21faabdb..82a2b9fe2 100644 --- a/python/triton/ops/batchnorm.py +++ b/python/triton/ops/batchnorm.py @@ -13,9 +13,22 @@ void fwdbatchnorm(float *Y, float *M, float *V, float *px[TM] = X + rm + c*N; float* py[TM] = Y + rm + c*N; - // fetch mean/var - float mean = *(M + c); - float var = *(V + c); + // compute mean + float accm[TM] = 0; + for(int i = 0; i < N; i = i + TM) + accm = accm + *(px + i); + float mean = (float)accm[+] / N; + *(M + c) = mean; + + // compute variance + float accv[TM] = 0; + for(int i = 0; i < N; i = i + TM){ + float x[TM] = *(px + i); + x = x - mean; + accv = accv + x*x; + } + float var = (float)accv[+] / N; + *(V + c) = var; // Normalize batch float gamma = *(G + c); @@ -28,7 +41,7 @@ void fwdbatchnorm(float *Y, float *M, float *V, } } """ - fwd_kernel = triton.kernel(fwd_src, ['Y']) + fwd_kernel = triton.kernel(fwd_src, ['Y', 'M', 'V']) bwd_src = """ void bwdbatchnorm(float *DX, float *DG, float *DB, @@ -78,23 +91,26 @@ void bwdbatchnorm(float *DX, float *DG, float *DB, bwd_kernel = triton.kernel(bwd_src, ['DX', 'DG', 'DB']) @staticmethod - def forward(ctx, x, mean, var, gamma, beta, eps): + def forward(ctx, x, gamma, beta, eps): shape = triton.shape(x) dtype = x.dtype # allocate outputs C, H, W, B = shape[0], shape[1], shape[2], shape[3] y = triton.empty(shape, dtype=dtype) + mean = triton.empty([C], dtype=dtype) + var = triton.empty([C], dtype=dtype) # execute kernels - y = _batchnorm.fwd_kernel(y, mean, var, x, gamma, beta, H*W*B, eps, - lambda opt: [1, C], - TM = 128) + _batchnorm.fwd_kernel(y, mean, var, x, gamma, beta, H*W*B, eps, + lambda opt: [1, C], + TM = 128) # save - ctx.save_for_backward(x, gamma, beta, mean, var) + ctx.save_for_backward(x, gamma, beta, mean.tensor, var.tensor) ctx.eps = eps return y @staticmethod - def backward(ctx, dy): + def backward(ctx, grads): + dy, dmean, dvar = grads # retrieve info x, gamma, beta, mean, var = ctx.saved_tensors eps = ctx.eps @@ -104,11 +120,11 @@ void bwdbatchnorm(float *DX, float *DG, float *DB, dbeta = triton.empty(triton.shape(beta), dtype=beta.dtype) # execute C, H, W, B = triton.shape(x) - dx, dgamma, dbeta = _batchnorm.bwd_kernel(dx, dgamma, dbeta, dy, + _batchnorm.bwd_kernel(dx, dgamma, dbeta, dy, x, gamma, mean, var, H*W*B, eps, lambda opt: [1, C], TM = 128) - return dx, None, None, dgamma, dbeta, None + return dx, dgamma, dbeta, None batchnorm = _batchnorm.apply \ No newline at end of file diff --git a/python/triton/utils.py b/python/triton/utils.py index 231a35af4..ddc050d15 100644 --- a/python/triton/utils.py +++ b/python/triton/utils.py @@ -10,6 +10,11 @@ class tf_empty_proxy: def __init__(self, shape, dtype): self.shape = shape self.dtype = dtype + self.tensor = None + + def to_tensor(self): + assert self.tensor + return self.tensor def empty(shape, dtype): if fw.has_tensorflow(): From 93a86d4fc62ee2957b43f19ca262a3be08a55237 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 30 Oct 2019 20:29:23 -0400 Subject: [PATCH 482/494] [PYTHON][TENSORFLOW] Signature of function.forward() does not have to match signature of kernel anymore --- 
python/triton/function.py | 19 +++++++++++++------
 python/triton/kernel.py | 14 +++++++++-----
 python/triton/ops/batchnorm.py | 3 +--
 3 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/python/triton/function.py b/python/triton/function.py
index 213f9f16e..43c8aa7b9 100644
--- a/python/triton/function.py
+++ b/python/triton/function.py
@@ -62,11 +62,16 @@ class function(metaclass = function_meta):
 
     # Find a mapping between ::forward arguments and tensorflow op arguments
     op = result[0].op
-    remap = dict()
+    remap_in = dict()
     for i, ix in enumerate(op.inputs):
       for j, jx in enumerate(args):
         if ix is jx:
-          remap[j] = i
+          remap_in[j] = i
+    remap_out = []
+    for i, ix in enumerate(result):
+      for j, jx in enumerate(op.outputs):
+        if ix is jx:
+          remap_out.append(j)
 
     # Register backward pass
     ctx_registry[op] = ctx
@@ -75,14 +80,16 @@ class function(metaclass = function_meta):
       @fw.tensorflow.RegisterGradient(name)
       def gradient(op, *dy):
         dy = dy if len(dy) > 1 else dy[0]
-        grad = cls.backward(ctx_registry[op], dy)
+        # Remap gradient inputs in the right order
+        grads = [dy[i] for i in remap_out]
+        # Execute gradient function
+        grad = cls.backward(ctx_registry[op], grads)
         grad = function.extract_tf_tensors(grad, 'backward')
-        # Remap gradient in the right order
         ret = [None] * len(op.inputs)
         for i in range(len(grad)):
-          if i in remap:
-            ret[remap[i]] = grad[i]
+          if i in remap_in:
+            ret[remap_in[i]] = grad[i]
         # Return
         return ret
       cls.registered = True
diff --git a/python/triton/kernel.py b/python/triton/kernel.py
index 59a797ec8..ea90e339b 100644
--- a/python/triton/kernel.py
+++ b/python/triton/kernel.py
@@ -225,8 +225,10 @@ class kernel:
     bench_id = libtriton.make_scalar_id() if bench > 0 else -1
     # call framework function
     if fw.has_tensorflow():
+      empty = [x for x in args[:-1] if isinstance(x, triton.utils.tf_empty_proxy)]
+      if len(empty) != len(self.outputs):
+        raise ValueError('Number of empty arguments does not match number of outputs provided')
       # operands
-      outputs = [x for x in args[:-1] if isinstance(x, triton.utils.tf_empty_proxy)]
       operands = [x.shape if isinstance(x, triton.utils.tf_empty_proxy) else x for x in args[:-1]]
       # output data types
       kwargs = {'id': op_id, 'bench': bench, 'bench_id': bench_id}
@@ -235,10 +237,12 @@ class kernel:
         kwargs['T' + str(i)] = x.dtype
       # launch
       ret = self.fw_op(*operands, **kwargs)
-      assert len(ret) == len(outputs)
-      # record results
-      for i in range(len(outputs)):
-        outputs[i].tensor = ret[i]
+      # fill empty tensors with corresponding values
+      for j, y in enumerate(ret[0].op.op_def.output_arg):
+        for i, x in enumerate(ret[0].op.op_def.input_arg):
+          if y.name + '_shape' == x.name:
+            empty[i].tensor = ret[j]
+      # store timing information
       if bench > 0:
         bench_registry[ret] = triton.utils.id_dict.lazy_entry(bench_id)
     elif fw.has_torch():
diff --git a/python/triton/ops/batchnorm.py b/python/triton/ops/batchnorm.py
index 82a2b9fe2..9bb3450c0 100644
--- a/python/triton/ops/batchnorm.py
+++ b/python/triton/ops/batchnorm.py
@@ -109,8 +109,7 @@ void bwdbatchnorm(float *DX, float *DG, float *DB,
     return y
 
   @staticmethod
-  def backward(ctx, grads):
-    dy, dmean, dvar = grads
+  def backward(ctx, dy):
     # retrieve info
     x, gamma, beta, mean, var = ctx.saved_tensors
     eps = ctx.eps

From 91a2fd463bf92b92b1519b5f78314451566ae28e Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Thu, 31 Oct 2019 01:49:30 -0400
Subject: [PATCH 483/494] [PYTHON][TENSORFLOW] More bugfixes for forward/backward signatures

---
 python/triton/function.py | 73 ++++++++++++++++++++++------------
python/triton/kernel.py | 16 +++++--- python/triton/ops/batchnorm.py | 2 +- python/triton/ops/dot.py | 9 +++-- python/triton/ops/einsum.py | 17 ++++---- python/triton/utils.py | 2 +- 6 files changed, 74 insertions(+), 45 deletions(-) diff --git a/python/triton/function.py b/python/triton/function.py index 43c8aa7b9..9b71106de 100644 --- a/python/triton/function.py +++ b/python/triton/function.py @@ -1,10 +1,14 @@ import triton.frameworks as fw -import triton.utils +import triton.utils as utils class OpContext(object): + def __init__(self): + self.to_save = [] + def save_for_backward(self, *tensors): - self.to_save = tensors + self.to_save = [x.to_tensor() if isinstance(x, utils.tf_empty_proxy) else x + for x in tensors] @property def saved_tensors(self): @@ -16,7 +20,7 @@ class function_meta(type): cls.registered = False return super(function_meta, cls).__init__(name, bases, attrs) -ctx_registry = triton.utils.id_dict() +ctx_registry = utils.id_dict() class function(metaclass = function_meta): @@ -43,13 +47,39 @@ class function(metaclass = function_meta): @classmethod def extract_tf_tensors(cls, lst, err): + ret = [] for x in lst: - if x and not isinstance(x, triton.utils.tf_empty_proxy): - raise ValueError('Results of ' + err + ' must be created using triton.empty()') - if x and x.tensor is None: - raise ValueError('Empty tensor never filled during ' + err) - return [x.tensor if x else None for x in lst] + if x is None: + ret += [None] + elif isinstance(x, fw.tensorflow.Tensor): + ret += [x] + elif isinstance(x, utils.tf_empty_proxy): + if x.tensor is None: + raise ValueError('Empty tensor never filled during ' + err) + else: + ret += [x.tensor] + else: + raise ValueError('Unsupported return type', type(x)) + return ret + @classmethod + def map_in_to_args(cls, op, args): + ret = dict() + for i, ix in enumerate(op.inputs): + for j, jx in enumerate(args): + if ix is jx: + ret[j] = i + return ret + + @classmethod + def map_res_to_out(cls, op, result): + ret = [] + for i, ix in enumerate(result): + for j, jx in enumerate(op.outputs): + if ix is jx: + ret.append(j) + return ret + @classmethod def apply_tensorflow(cls, *args, **kwargs): ctx = OpContext() @@ -60,30 +90,21 @@ class function(metaclass = function_meta): result = result if isinstance(result, tuple) else (result, ) result = function.extract_tf_tensors(result, 'forward') - # Find a mapping between ::forward arguments and tensorflow op arguments - op = result[0].op - remap_in = dict() - for i, ix in enumerate(op.inputs): - for j, jx in enumerate(args): - if ix is jx: - remap_in[j] = i - remap_out = [] - for i, ix in enumerate(result): - for j, jx in enumerate(op.outputs): - if ix is jx: - remap_out.append(j) - # Register backward pass - ctx_registry[op] = ctx + key = result[0] + op = result[0].op + ctx_registry[key] = ctx + remap_in = cls.map_in_to_args(op, args) + remap_out = cls.map_res_to_out(op, result) name = op.op_def.name if not cls.registered: @fw.tensorflow.RegisterGradient(name) def gradient(op, *dy): - dy = dy if len(dy) > 1 else dy[0] # Remap gradient inputs in the right order - grads = [dy[i] for i in remap_out] + dy = [dy[i] for i in remap_out] + dy = dy if len(dy) > 1 else dy[0] # Execute gradient function - grad = cls.backward(ctx_registry[op], grads) + grad = cls.backward(ctx_registry[key], dy) grad = function.extract_tf_tensors(grad, 'backward') # Remap gradient in the right order ret = [None] * len(op.inputs) @@ -95,7 +116,7 @@ class function(metaclass = function_meta): cls.registered = True # Return tensor - return 
result + return result[0] if len(result)==1 else result @classmethod def apply(cls, *args, **kwargs): diff --git a/python/triton/kernel.py b/python/triton/kernel.py index ea90e339b..79b3e59ce 100644 --- a/python/triton/kernel.py +++ b/python/triton/kernel.py @@ -237,14 +237,20 @@ class kernel: kwargs['T' + str(i)] = x.dtype # launch ret = self.fw_op(*operands, **kwargs) - # fill empty tensors with corresponding values - for j, y in enumerate(ret[0].op.op_def.output_arg): - for i, x in enumerate(ret[0].op.op_def.input_arg): + ret = [ret] if isinstance(ret, fw.tensorflow.Tensor) else ret + op_def = ret[0].op.op_def + # fill empty tensors with corresponding values + for j, y in enumerate(op_def.output_arg): + found = False + for i, x in enumerate(op_def.input_arg): if y.name + '_shape' == x.name: - empty[i].tensor = ret[j] + args[i].tensor = ret[j] + found = True + assert found # store timing information if bench > 0: - bench_registry[ret] = triton.utils.id_dict.lazy_entry(bench_id) + for y in ret: + bench_registry[y] = triton.utils.id_dict.lazy_entry(bench_id) elif fw.has_torch(): args = [x.contiguous() if isinstance(x, fw.torch.Tensor) else x for x in args[:-1]] self.fw_op(op_id, bench, bench_id, *args) diff --git a/python/triton/ops/batchnorm.py b/python/triton/ops/batchnorm.py index 9bb3450c0..9409134d9 100644 --- a/python/triton/ops/batchnorm.py +++ b/python/triton/ops/batchnorm.py @@ -104,7 +104,7 @@ void bwdbatchnorm(float *DX, float *DG, float *DB, lambda opt: [1, C], TM = 128) # save - ctx.save_for_backward(x, gamma, beta, mean.tensor, var.tensor) + ctx.save_for_backward(x, gamma, beta, mean, var) ctx.eps = eps return y diff --git a/python/triton/ops/dot.py b/python/triton/ops/dot.py index 140cd82cd..339fba4c6 100644 --- a/python/triton/ops/dot.py +++ b/python/triton/ops/dot.py @@ -76,10 +76,11 @@ void dot(TYPE * A, TYPE * B, TYPE * C, 'BROADCAST_BK': 'newaxis, :' if transpose_b else ':, newaxis', 'BROADCAST_BN': ':, newaxis' if transpose_b else 'newaxis, :', 'SHAPE_B' : 'TN, TK' if transpose_b else 'TK, TN'} - return _dot.kernel(a, b, c, M, N, Ka, lda, ldb, ldc, - grid, bench=bench, - AT = transpose_a, BT = transpose_b, TYPE = dtype, - TM = [64, 128], TN = [64, 128], TK = [8], **macros) + _dot.kernel(a, b, c, M, N, Ka, lda, ldb, ldc, + grid, bench=bench, + AT = transpose_a, BT = transpose_b, TYPE = dtype, + TM = [64, 128], TN = [64, 128], TK = [8], **macros) + return c @staticmethod def forward(ctx, a, b, transpose_a = False, transpose_b = False, bench = 0): diff --git a/python/triton/ops/einsum.py b/python/triton/ops/einsum.py index 167b2aacd..4c3409885 100644 --- a/python/triton/ops/einsum.py +++ b/python/triton/ops/einsum.py @@ -169,14 +169,15 @@ void einsumk(TYPE * A, TYPE * B, TYPE * C, TN = [2**i for i in range(5, max(6, min(8, int(math.log2(bmnk[2]) + 1 ))))] TB = [2**i for i in range(0, max(1, min(3, int(math.log2(bmnk[0]) + 1 ))))] TK = [bmnk[2]] if bmnk[2] < 16 else [8, 16] - return _einsum.kernel(a, b, c, - bmnk[1], bmnk[2], bmnk[3], - std0[0], std0[1], std0[2], - std1[0], std1[1], std1[2], - grid, bench=bench, - **macros, - TYPE=dtype, TM=TM, TN=TN, TK=TK, TB=TB) - + _einsum.kernel(a, b, c, + bmnk[1], bmnk[2], bmnk[3], + std0[0], std0[1], std0[2], + std1[0], std1[1], std1[2], + grid, bench=bench, + **macros, + TYPE=dtype, TM=TM, TN=TN, TK=TK, TB=TB) + return c + @staticmethod def forward(ctx, subscripts, a, b, bench = 0): diff --git a/python/triton/utils.py b/python/triton/utils.py index ddc050d15..a51608508 100644 --- a/python/triton/utils.py +++ 
b/python/triton/utils.py @@ -13,7 +13,7 @@ class tf_empty_proxy: self.tensor = None def to_tensor(self): - assert self.tensor + assert self.tensor is not None return self.tensor def empty(shape, dtype): From 739a8d90619006d7f9158b196a344e20c0c940bf Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 31 Oct 2019 18:08:27 -0400 Subject: [PATCH 484/494] some work on conv --- lib/runtime/function.cc | 4 +- python/examples/conv.py | 11 ++ python/examples/dot.py | 4 +- python/src/bindings.cc | 6 + python/triton/function.py | 17 +-- python/triton/kernel.py | 2 +- python/triton/ops/__init__.py | 3 +- python/triton/ops/conv.py | 234 ++++++++++++++++++++++++++++++++++ python/triton/ops/dot.py | 3 +- python/triton/utils.py | 18 ++- 10 files changed, 278 insertions(+), 24 deletions(-) create mode 100644 python/examples/conv.py create mode 100644 python/triton/ops/conv.py diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index d955b69f0..32b295556 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -168,6 +168,7 @@ function::caller function::autotune(driver::stream* stream, const grid_fn_ty& gr for(auto it: opt_space_.defines) cpp.AddMacro(it.first, &opt.defines.at(it.first)); cpp.Process(tokens); + // parse Parser parser(tokens); parser.Parse(); @@ -220,7 +221,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::transform::cts cts; codegen::generator isel(&axes, &layouts, &align, &allocation, target.get(), opt.num_warps); // run passes -// ir::print(module, std::cout); + ir::print(module, std::cout); dce.run(module); // ir::print(module, std::cout); @@ -266,7 +267,6 @@ R"( #define bool _Bool #define true 1 #define false 0 -#define __bool_true_false_are_defined 1 #define __readonly __attribute__((readonly)) #define __writeonly __attribute__((writeonly)) diff --git a/python/examples/conv.py b/python/examples/conv.py new file mode 100644 index 000000000..64378c67a --- /dev/null +++ b/python/examples/conv.py @@ -0,0 +1,11 @@ +import torch +import triton + +N, C, K = 32, 32, 32 +H, W = 32, 32 +R, S = 3, 3 +a = torch.randn(N, C, H, W).cuda() +b = torch.randn(C, R, S, K).cuda() +#c = torch.nn.functional.conv2d(a, b) +c = triton.ops.conv(a, b) +print(c) \ No newline at end of file diff --git a/python/examples/dot.py b/python/examples/dot.py index 3e061c112..dfc2587f2 100644 --- a/python/examples/dot.py +++ b/python/examples/dot.py @@ -53,8 +53,8 @@ def run_torch(): triton_da = a.grad.clone() triton_db = b.grad.clone() - nanosec = triton.bench_registry[triton_d] - print('TFLOPS:', 2. * M * N * K / nanosec * 1e-3) + #nanosec = triton.bench_registry[triton_d] + #print('TFLOPS:', 2. 
* M * N * K / nanosec * 1e-3) print('Diff DA:', (torch_da - triton_da).max()) print('Diff DB:', (torch_db - triton_db).max()) diff --git a/python/src/bindings.cc b/python/src/bindings.cc index 7fb4a29f0..8c00b423d 100644 --- a/python/src/bindings.cc +++ b/python/src/bindings.cc @@ -514,6 +514,7 @@ void gen_torch_make_handles(std::ostream &os, } void gen_torch_make_launch_function(std::ostream &os, const std::vector& args) { + os << " std::cout << 9 << std::endl;"; os << " std::function run = [&](){\n "; os << " (*id_fn_map.at(id))({"; for(unsigned i = 0; i < args.size() ; i++){ @@ -528,6 +529,7 @@ void gen_torch_make_launch_function(std::ostream &os, const std::vector 0)\n "; os << " i64scalar_map[bench_id] = triton::tools::bench(run, &stream);\n "; } @@ -586,10 +588,14 @@ extern std::map i64scalar_map; gen_torch_signature(oss, fn, outputs, name); oss << " {" << std::endl; + oss << " std::cout << 1 << std::endl;"; gen_torch_init_driver(oss, fn->args()); gen_torch_make_handles(oss, fn->args()); + oss << " std::cout << 2 << std::endl;"; gen_torch_make_launch_function(oss, fn->args()); + oss << " std::cout << 3 << std::endl;"; gen_torch_ret(oss, outputs); + oss << " std::cout << \"done\" << std::endl;\n"; oss << "}" << std::endl; oss << std::endl; diff --git a/python/triton/function.py b/python/triton/function.py index 9b71106de..d343eb03c 100644 --- a/python/triton/function.py +++ b/python/triton/function.py @@ -83,28 +83,25 @@ class function(metaclass = function_meta): @classmethod def apply_tensorflow(cls, *args, **kwargs): ctx = OpContext() - result = cls.forward(ctx, *args, **kwargs) - # check that all the results stem from triton.empty - # and get the corresponding TF tensors if possible + # run forward pass + result = cls.forward(ctx, *args, **kwargs) result = result if isinstance(result, tuple) else (result, ) result = function.extract_tf_tensors(result, 'forward') # Register backward pass - key = result[0] op = result[0].op - ctx_registry[key] = ctx - remap_in = cls.map_in_to_args(op, args) - remap_out = cls.map_res_to_out(op, result) - name = op.op_def.name + ctx_registry[op] = ctx if not cls.registered: - @fw.tensorflow.RegisterGradient(name) + remap_in = cls.map_in_to_args(op, args) + remap_out = cls.map_res_to_out(op, result) + @fw.tensorflow.RegisterGradient(op.op_def.name) def gradient(op, *dy): # Remap gradient inputs in the right order dy = [dy[i] for i in remap_out] dy = dy if len(dy) > 1 else dy[0] # Execute gradient function - grad = cls.backward(ctx_registry[key], dy) + grad = cls.backward(ctx_registry[op], dy) grad = function.extract_tf_tensors(grad, 'backward') # Remap gradient in the right order ret = [None] * len(op.inputs) diff --git a/python/triton/kernel.py b/python/triton/kernel.py index 79b3e59ce..68864fb47 100644 --- a/python/triton/kernel.py +++ b/python/triton/kernel.py @@ -253,7 +253,7 @@ class kernel: bench_registry[y] = triton.utils.id_dict.lazy_entry(bench_id) elif fw.has_torch(): args = [x.contiguous() if isinstance(x, fw.torch.Tensor) else x for x in args[:-1]] - self.fw_op(op_id, bench, bench_id, *args) + ret = self.fw_op(op_id, bench, bench_id, *args) if bench > 0: bench_registry[ret] = libtriton.retrieve_scalar(op_id) else: diff --git a/python/triton/ops/__init__.py b/python/triton/ops/__init__.py index ac0c3293d..6e4ded8c7 100644 --- a/python/triton/ops/__init__.py +++ b/python/triton/ops/__init__.py @@ -1,3 +1,4 @@ from .dot import _dot, dot from .einsum import _einsum, einsum -from .batchnorm import _batchnorm, batchnorm \ No newline at end of file 
+from .batchnorm import _batchnorm, batchnorm +from .conv import _conv, conv \ No newline at end of file diff --git a/python/triton/ops/conv.py b/python/triton/ops/conv.py new file mode 100644 index 000000000..c4c0c01ae --- /dev/null +++ b/python/triton/ops/conv.py @@ -0,0 +1,234 @@ +import triton +import numpy as np + +class _conv(triton.function): + + src = """ +void convnd(A_TYPE *A, + B_TYPE *B, + float *C, + int M, int N, int K, + int AH, int AW, + int BH, int BW, + int CH, int CW, + int NC, + int lda_n, int lda_c, int lda_d, int lda_h, int lda_w, + int ldb_c, int ldb_t, int ldb_r, int ldb_s, int ldb_k, + int ldc_n, int ldc_k, int ldc_m, int ldc_p, int ldc_q, + int pad_h, int pad_w, + int stride_h, int stride_w, + int upsample_h, int upsample_w, + int off_uh, int off_uw, + int off_uah, int off_uaw, + int off_uch, int off_ucw, + int* a_delta){ + + // range of indices along the reduction axis + int rka[TK] = 0 ... TK; + int rkb[TK] = 0 ... TK; + + // initialize accumulator + float c[TM, TN] = 0; + + // pointers for A + int rxa[TM] = get_program_id(0) * TM + 0 ... TM; + int rabh[TM] = rxa / CW; + int raw[TM] = rxa % CW; + int rab[TM] = rabh / CH; + int rah[TM] = rabh % CH; + rah = rah * UPAW - off_uah; + raw = raw * UPAH - off_uaw; + int racr[TK] = rka / BW; + int ras[TK] = rka % BW; + int rac[TK] = racr / BH; + int rar[TK] = racr % BH; + rar = FLIPR rar; + ras = FLIPS ras; + rar = UPAR * rar; + ras = UPAS * ras; + int ra0[TM] = rab*lda_n + rah*lda_h + raw*lda_w; + int ra1[TK] = rac*lda_c + rar*lda_h + ras*lda_w; + A_TYPE* pa[TM, TK] = A + ra0[:, newaxis] + ra1[newaxis, :]; + + // pointers for B + int rb0[TN] = get_program_id(1) * TN + 0 ... TN; +#ifdef B_LUT + int rbcr[TK] = rkb / BW; + int rbs[TK] = rkb % BW; + int rbc[TK] = rbcr / BH; + int rbr[TK] = rbcr % BH; + rbr = rbr * upsample_h + off_uh; + rbs = rbs * upsample_w + off_uw; + int rb1[TK] = rbc*ldb_c + rbr*ldb_r + rbs*ldb_s; +#else + int rb1[TK] = rkb * STRIDE_B0; +#endif + B_TYPE* pb [B_SHAPE] = B + rb1[BROADCAST_B1] * STRIDE_B1 + rb0[BROADCAST_B0] * STRIDE_B0 * ldb_k; + + // pointers for A look-up table + int offda[TK] = rka % LUT_SIZE; + int* pincd[TK] = a_delta + offda; + int* pda[TK] = a_delta + LUT_SIZE + offda + off_uw * LUT_SIZE + off_uh * LUT_SIZE * upsample_w; + int da[TK] = *pda; + int incd[TK] = *pincd; + + // pointers for B look-up table + int offdb[TK] = rkb % LUT_SIZE; +#ifdef B_LUT + int* pdb[TK] = b_delta + offdb + off_uw * LUT_SIZE + off_uh * LUT_SIZE * upsample_w; + int db[TK] = *pdb; +#endif + + // reduction loop + A_TYPE a[TM, TK] = *pa; + B_TYPE b[B_SHAPE] = *pb; + for(int k = K; k > 0; k = k - TK){ + c += a @ USE_B; + pa = pa + da[newaxis, :]; + pb = pb + INC_PB; + // increment A look-up table + pda = pda + incd; + da = *pda; + pincd = pincd + incd; + incd = *pincd; + // increment B look-up table +#ifdef B_LUT + pdb = pdb + INC_PDB; + db = *pdb; +#endif + // pre-fetches + a = *pa; + b = *pb; + } + + // write back + int rxc[TM] = get_program_id(0) * TM + 0 ... TM; + int rc1[TN] = get_program_id(1) * TN + 0 ... 
TN; + int rcn[TM] = rxc / (CH*CW); + int rcpq[TM] = rxc % (CH*CW); + int rcp[TM] = rcpq / CW; + int rcq[TM] = rcpq % CW; + rcp = rcp * upsample_h + off_uch; + rcq = rcq * upsample_w + off_ucw; + int rc0[TM] = rcn * ldc_n + rcp * ldc_p + rcq * ldc_q; + float* pc[TM, TN] = C + rc1[newaxis, :]*ldc_k + rc0[:, newaxis]; + *pc = c; +} +""" + kernel = triton.kernel(src, ['C']) + + @staticmethod + def _unpack(idx, D, H, W): + c = idx // (D*H*W) + dhw = idx % (D*H*W) + dh = dhw // W + w = dhw % W + d = dh // H + h = dh % H + return c, d, h, w + + @staticmethod + def _delta_a(upsample_d, upsample_h, upsample_w, depth, TK, + T, R, S, stride_a): + ud = np.arange(upsample_d)[:, np.newaxis, np.newaxis, np.newaxis] + uh = np.arange(upsample_h)[np.newaxis, :, np.newaxis, np.newaxis] + uw = np.arange(upsample_w)[np.newaxis, np.newaxis, :, np.newaxis] + ctrs = np.arange(depth)[np.newaxis, np.newaxis, np.newaxis, :] + c, t, r, s = _conv._unpack(ctrs, T, R, S) + nextc, nextt, nextr, nexts = _conv._unpack(ctrs + TK, T, R, S) + cdiff = nextc - c + tdiff = nextt - t + rdiff = nextr - r + sdiff = nexts - s + return cdiff*stride_a[1] + tdiff*stride_a[2] + rdiff*stride_a[3] + sdiff*stride_a[4] + + @staticmethod + def _extract_strides(shape): + rank = len(shape) + ret = [1] * rank + for i in range(rank - 1, 0, -1): + ret[i-1] = ret[i] * shape[i] + return ret + + + @staticmethod + def _call(a, b, + upsample_d, upsample_h, upsample_w, + pad_d, pad_h, pad_w, + stride_d, stride_h, stride_w, + mode): + # input shapes + shape_a = list(triton.shape(a)) + shape_b = list(triton.shape(b)) + # add depth + shape_a.insert(2, 1) + shape_b.insert(1, 1) + NB, NC, AD, AH, AW = shape_a + NC, BD, BH, BW, NF = shape_b + # output shape + CD = (AD*upsample_d - BD + 1 + 2*pad_d + stride_d - 1) // stride_d + CH = (AH*upsample_h - BH + 1 + 2*pad_h + stride_h - 1) // stride_h + CW = (AW*upsample_w - BW + 1 + 2*pad_w + stride_w - 1) // stride_w + shape_c = [NB, NF, CD, CH, CW] + # strides + stride_a = _conv._extract_strides(shape_a) + stride_b = _conv._extract_strides(shape_b) + stride_c = _conv._extract_strides(shape_c) + # look-up tables + TK = 8 + FS = BD * BH * BW + depth = (TK + FS - 1)//FS * FS + delta_a = _conv._delta_a(upsample_d, upsample_h, upsample_w, + depth, TK, BD, BH, BW, stride_a) + delta_a = triton.fw.torch.from_numpy(delta_a).cuda() + + trans_b = False + is_wgrad = False + is_blut = False + macros = { + 'B_SHAPE': 'TN, TK' if trans_b else 'TK, TN', + 'BROADCAST_B0': ':, newaxis' if trans_b else 'newaxis, :', + 'BROADCAST_B1': 'newaxis, :' if trans_b else ':, newaxis', + 'STRIDE_B0': 'ldb_s' if trans_b else '1', + 'STRIDE_B1': '1' if trans_b else 'ldb_s', + 'USE_B': '^b' if trans_b else 'b', + 'FLIPR': '' if trans_b else 'BH - 1 -', + 'FLIPS': '' if trans_b else 'BW - 1 -', + 'UPAR': 'stride_h' if is_wgrad else '1', + 'UPAS': 'stride_w' if is_wgrad else '1', + 'UPAH': '' if is_wgrad else 'stride_h', + 'UPAW': '' if is_wgrad else 'stride_w', + 'REDAX0': 'NC' if trans_b else 'BH', + 'REDAX1': 'BH' if trans_b else 'BW', + 'REDAX2': 'BW' if trans_b else 'NC', + 'AX0': 'c' if trans_b else 'r', + 'AX1': 'r' if trans_b else 's', + 'AX2': 's' if trans_b else 'c', + 'INC_PB': 'db[newaxis, :]' if is_blut else 'TK', + 'INC_PDB': 'incd' if trans_b else 'TK', + 'LUT_SIZE': depth, + 'TM': [32], + 'TN': [32], + 'TK': TK, + 'A_TYPE': 'float', + 'B_TYPE': 'float' + } + + shape_c.pop(2) + print(shape_c) + c = triton.empty(shape_c, dtype=a.dtype) + _conv.kernel(a, b, c, CD*CH*CW, NF, NC*BD*BH*BW, AH, AW, BH, BW, CH, CW, NC, + stride_a[0], 
stride_a[1], stride_a[2], stride_a[3], stride_a[4], + stride_b[0], stride_b[1], stride_b[2], stride_b[3], stride_b[4], + stride_c[0], stride_c[1], stride_c[2], stride_c[3], stride_c[4], + pad_h, pad_w, stride_h, stride_w, upsample_h, upsample_w, + 0, 0, 0, 0, 0, 0, + delta_a, + lambda opt: (1, 1, 1), **macros) + return c + + @staticmethod + def forward(ctx, input, weight): + _conv._call(input, weight, 1, 1, 1, 0, 0, 0, 1, 1, 1, '') + +conv = _conv.apply \ No newline at end of file diff --git a/python/triton/ops/dot.py b/python/triton/ops/dot.py index 339fba4c6..ae568c642 100644 --- a/python/triton/ops/dot.py +++ b/python/triton/ops/dot.py @@ -36,7 +36,6 @@ void dot(TYPE * A, TYPE * B, TYPE * C, *pc = c; } """ - kernel = triton.kernel(src, ['C']) @staticmethod @@ -109,6 +108,6 @@ void dot(TYPE * A, TYPE * B, TYPE * C, db = _dot._call(dy, a, True, True, bench) else: assert False - return da, db, None, None, None, None, None, None, None + return da, db, None, None, None dot = _dot.apply \ No newline at end of file diff --git a/python/triton/utils.py b/python/triton/utils.py index a51608508..0b012af3f 100644 --- a/python/triton/utils.py +++ b/python/triton/utils.py @@ -1,6 +1,7 @@ import triton.frameworks as fw import triton._C.libtriton as libtriton import numpy as np +import weakref def cdiv(a, b): return -(-a // b) @@ -23,7 +24,7 @@ def empty(shape, dtype): return tf_empty_proxy(shape, dtype) #return fw.tf_extra_ops.alloc_empty(args, T = dtype) elif fw.has_torch(): - return fw.torch.empty(*shape).cuda() + return fw.torch.empty(shape).cuda() def shape(A) : if fw.has_tensorflow(): @@ -45,15 +46,17 @@ class id_dict: def get(self): return libtriton.retrieve_scalar(self.id) - def __init__(self): - self.data = dict() + self.data = weakref.WeakKeyDictionary() def __delitem__(self, key): - del self.data[id(key)] + del self.data[key] def __getitem__(self, key): - ret = self.data[id(key)] + if fw.has_tensorflow(): + if isinstance(key, fw.tensorflow.Tensor): + key = key.op + ret = self.data[key] if isinstance(ret, id_dict.lazy_entry): return ret.get() return ret @@ -62,4 +65,7 @@ class id_dict: return len(self.data) def __setitem__(self, key, value): - self.data[id(key)] = value \ No newline at end of file + if fw.has_tensorflow(): + if isinstance(key, fw.tensorflow.Tensor): + key = key.op + self.data[key] = value \ No newline at end of file From f4bbbbe5e42754d40365c584e14e485e1468ae1e Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 1 Nov 2019 00:43:02 -0400 Subject: [PATCH 485/494] [PYTHON][OPS] Bugfix in conv fprop --- lib/codegen/selection/generator.cc | 17 ++++- lib/driver/module.cc | 1 + lib/runtime/function.cc | 1 - python/examples/conv.py | 17 +++-- python/src/bindings.cc | 6 -- python/triton/ops/conv.py | 108 +++++++++++------------------ 6 files changed, 65 insertions(+), 85 deletions(-) diff --git a/lib/codegen/selection/generator.cc b/lib/codegen/selection/generator.cc index 34bf52b7f..6585af9be 100644 --- a/lib/codegen/selection/generator.cc +++ b/lib/codegen/selection/generator.cc @@ -297,12 +297,18 @@ void generator::visit_unmasked_load_inst(ir::unmasked_load_inst* x) { // find vector size ir::value *ptr = x->get_pointer_operand(); size_t ld = layouts_->get(ptr)->order[0]; - unsigned alignment = alignment_->get(ptr, ld); + unsigned alignment = std::max(alignment_->get(ptr, ld), 1); + + // vector loads std::map packets; for_each(x, [&](indices_t idx){ distributed_tile* result = (distributed_tile*)tmap_.at(x); - unsigned vector_size = std::min(result->axis(ld).contiguous, 
alignment); + // vector size + unsigned contiguous = 1; + if(ld < x->get_type()->get_tile_rank()) + contiguous = result->axis(ld).contiguous; + unsigned vector_size = std::min(contiguous, alignment); unsigned linear = result->get_linear_index(idx); unsigned id = linear / vector_size; @@ -314,10 +320,15 @@ void generator::visit_unmasked_load_inst(ir::unmasked_load_inst* x) { packets[id] = builder_->CreateLoad(ptr); } }); + // extract result element for_each(x, [&](indices_t idx){ distributed_tile* result = (distributed_tile*)tmap_.at(x); - unsigned vector_size = std::min(result->axis(ld).contiguous, alignment); + // vector size + unsigned contiguous = 1; + if(ld < x->get_type()->get_tile_rank()) + contiguous = result->axis(ld).contiguous; + unsigned vector_size = std::min(contiguous, alignment); unsigned linear = result->get_linear_index(idx); unsigned id = linear / vector_size; set_value(x, idx, builder_->CreateExtractElement(packets.at(id), linear % vector_size)); diff --git a/lib/driver/module.cc b/lib/driver/module.cc index 722b8f6de..ddeb20bfc 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -242,6 +242,7 @@ cu_module::cu_module(driver::context * context, std::unique_ptr ll cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ // exit(EXIT_FAILURE); +// std::cout << source << std::endl; cu_context::context_switcher ctx(*context); // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 32b295556..7c6005a56 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -221,7 +221,6 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::transform::cts cts; codegen::generator isel(&axes, &layouts, &align, &allocation, target.get(), opt.num_warps); // run passes - ir::print(module, std::cout); dce.run(module); // ir::print(module, std::cout); diff --git a/python/examples/conv.py b/python/examples/conv.py index 64378c67a..dff03488a 100644 --- a/python/examples/conv.py +++ b/python/examples/conv.py @@ -1,11 +1,16 @@ import torch import triton -N, C, K = 32, 32, 32 -H, W = 32, 32 +N, C, K = 32, 8, 32 +H, W = 4, 4 R, S = 3, 3 +torch.manual_seed(0) a = torch.randn(N, C, H, W).cuda() -b = torch.randn(C, R, S, K).cuda() -#c = torch.nn.functional.conv2d(a, b) -c = triton.ops.conv(a, b) -print(c) \ No newline at end of file +b = torch.ones(C, R, S, K).cuda() + +rc = torch.nn.functional.conv2d(a, b.permute(3, 0, 1, 2)) +tc = triton.ops.conv(a, b) +print((rc - tc).abs().max()) +print((tc[:,:,0,0] - rc[:,:,0,0]).abs()) +#print((rc[:30,:30,:,:] - tc[:30, :30, :, :]).abs().max()) +#print(tc[31, 31,:,:]) \ No newline at end of file diff --git a/python/src/bindings.cc b/python/src/bindings.cc index 8c00b423d..7fb4a29f0 100644 --- a/python/src/bindings.cc +++ b/python/src/bindings.cc @@ -514,7 +514,6 @@ void gen_torch_make_handles(std::ostream &os, } void gen_torch_make_launch_function(std::ostream &os, const std::vector& args) { - os << " std::cout << 9 << std::endl;"; os << " std::function run = [&](){\n "; os << " (*id_fn_map.at(id))({"; for(unsigned i = 0; i < args.size() ; i++){ @@ -529,7 +528,6 @@ void gen_torch_make_launch_function(std::ostream &os, const std::vector 0)\n "; os << " i64scalar_map[bench_id] = triton::tools::bench(run, &stream);\n "; } @@ -588,14 +586,10 @@ extern std::map i64scalar_map; gen_torch_signature(oss, fn, outputs, name); oss << " {" 
<< std::endl; - oss << " std::cout << 1 << std::endl;"; gen_torch_init_driver(oss, fn->args()); gen_torch_make_handles(oss, fn->args()); - oss << " std::cout << 2 << std::endl;"; gen_torch_make_launch_function(oss, fn->args()); - oss << " std::cout << 3 << std::endl;"; gen_torch_ret(oss, outputs); - oss << " std::cout << \"done\" << std::endl;\n"; oss << "}" << std::endl; oss << std::endl; diff --git a/python/triton/ops/conv.py b/python/triton/ops/conv.py index c4c0c01ae..4bf290258 100644 --- a/python/triton/ops/conv.py +++ b/python/triton/ops/conv.py @@ -21,7 +21,7 @@ void convnd(A_TYPE *A, int off_uh, int off_uw, int off_uah, int off_uaw, int off_uch, int off_ucw, - int* a_delta){ + int* a_delta, int* inc_a){ // range of indices along the reduction axis int rka[TK] = 0 ... TK; @@ -42,8 +42,6 @@ void convnd(A_TYPE *A, int ras[TK] = rka % BW; int rac[TK] = racr / BH; int rar[TK] = racr % BH; - rar = FLIPR rar; - ras = FLIPS ras; rar = UPAR * rar; ras = UPAS * ras; int ra0[TM] = rab*lda_n + rah*lda_h + raw*lda_w; @@ -51,56 +49,36 @@ void convnd(A_TYPE *A, A_TYPE* pa[TM, TK] = A + ra0[:, newaxis] + ra1[newaxis, :]; // pointers for B - int rb0[TN] = get_program_id(1) * TN + 0 ... TN; -#ifdef B_LUT - int rbcr[TK] = rkb / BW; - int rbs[TK] = rkb % BW; - int rbc[TK] = rbcr / BH; - int rbr[TK] = rbcr % BH; - rbr = rbr * upsample_h + off_uh; - rbs = rbs * upsample_w + off_uw; - int rb1[TK] = rbc*ldb_c + rbr*ldb_r + rbs*ldb_s; -#else - int rb1[TK] = rkb * STRIDE_B0; -#endif - B_TYPE* pb [B_SHAPE] = B + rb1[BROADCAST_B1] * STRIDE_B1 + rb0[BROADCAST_B0] * STRIDE_B0 * ldb_k; + int rbn[TN] = get_program_id(1) * TN + 0 ... TN; + B_TYPE* pb[TK, TN] = B + rbn[newaxis, :] * ldb_k + rkb[:, newaxis] * ldb_s; // pointers for A look-up table int offda[TK] = rka % LUT_SIZE; - int* pincd[TK] = a_delta + offda; - int* pda[TK] = a_delta + LUT_SIZE + offda + off_uw * LUT_SIZE + off_uh * LUT_SIZE * upsample_w; + int* pincd[TK] = inc_a + offda; + int* pda[TK] = a_delta + offda + off_uw * LUT_SIZE + off_uh * LUT_SIZE * upsample_w; int da[TK] = *pda; int incd[TK] = *pincd; - // pointers for B look-up table - int offdb[TK] = rkb % LUT_SIZE; -#ifdef B_LUT - int* pdb[TK] = b_delta + offdb + off_uw * LUT_SIZE + off_uh * LUT_SIZE * upsample_w; - int db[TK] = *pdb; -#endif - // reduction loop A_TYPE a[TM, TK] = *pa; - B_TYPE b[B_SHAPE] = *pb; + B_TYPE b[TK, TN] = *pb; for(int k = K; k > 0; k = k - TK){ - c += a @ USE_B; - pa = pa + da[newaxis, :]; - pb = pb + INC_PB; + c += a @ b; + pa += da[newaxis, :]; + pb += TK * ldb_s; // increment A look-up table pda = pda + incd; da = *pda; pincd = pincd + incd; incd = *pincd; - // increment B look-up table -#ifdef B_LUT - pdb = pdb + INC_PDB; - db = *pdb; -#endif // pre-fetches - a = *pa; - b = *pb; + bool checka[TM, TK] = k > TK; + bool checkb[TK, TN] = k > TK; + a = checka ? *pa : 0; + b = checkb ? *pb : 0; } + // write back int rxc[TM] = get_program_id(0) * TM + 0 ... TM; int rc1[TN] = get_program_id(1) * TN + 0 ... 
TN; @@ -112,28 +90,31 @@ void convnd(A_TYPE *A, rcq = rcq * upsample_w + off_ucw; int rc0[TM] = rcn * ldc_n + rcp * ldc_p + rcq * ldc_q; float* pc[TM, TN] = C + rc1[newaxis, :]*ldc_k + rc0[:, newaxis]; - *pc = c; + bool checkc0[TM] = rxc < M; + bool checkc1[TN] = rc1 < N; + bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; + *?(checkc)pc = c; } """ kernel = triton.kernel(src, ['C']) @staticmethod def _unpack(idx, D, H, W): - c = idx // (D*H*W) - dhw = idx % (D*H*W) - dh = dhw // W - w = dhw % W - d = dh // H - h = dh % H + cdh = idx // W + w = idx % W + cd = cdh // H + h = cdh % H + c = cd // D + d = cd % D return c, d, h, w @staticmethod def _delta_a(upsample_d, upsample_h, upsample_w, depth, TK, T, R, S, stride_a): - ud = np.arange(upsample_d)[:, np.newaxis, np.newaxis, np.newaxis] - uh = np.arange(upsample_h)[np.newaxis, :, np.newaxis, np.newaxis] - uw = np.arange(upsample_w)[np.newaxis, np.newaxis, :, np.newaxis] - ctrs = np.arange(depth)[np.newaxis, np.newaxis, np.newaxis, :] + ud = np.arange(upsample_d, dtype=np.int32)[:, np.newaxis, np.newaxis, np.newaxis] + uh = np.arange(upsample_h, dtype=np.int32)[np.newaxis, :, np.newaxis, np.newaxis] + uw = np.arange(upsample_w, dtype=np.int32)[np.newaxis, np.newaxis, :, np.newaxis] + ctrs = np.arange(depth, dtype=np.int32)[np.newaxis, np.newaxis, np.newaxis, :] c, t, r, s = _conv._unpack(ctrs, T, R, S) nextc, nextt, nextr, nexts = _conv._unpack(ctrs + TK, T, R, S) cdiff = nextc - c @@ -181,31 +162,18 @@ void convnd(A_TYPE *A, delta_a = _conv._delta_a(upsample_d, upsample_h, upsample_w, depth, TK, BD, BH, BW, stride_a) delta_a = triton.fw.torch.from_numpy(delta_a).cuda() + inc_a = np.arange(depth, dtype=np.int32) + inc_a = ((inc_a + TK) % depth) - inc_a + inc_a = triton.fw.torch.from_numpy(inc_a).cuda() trans_b = False is_wgrad = False is_blut = False - macros = { - 'B_SHAPE': 'TN, TK' if trans_b else 'TK, TN', - 'BROADCAST_B0': ':, newaxis' if trans_b else 'newaxis, :', - 'BROADCAST_B1': 'newaxis, :' if trans_b else ':, newaxis', - 'STRIDE_B0': 'ldb_s' if trans_b else '1', - 'STRIDE_B1': '1' if trans_b else 'ldb_s', - 'USE_B': '^b' if trans_b else 'b', - 'FLIPR': '' if trans_b else 'BH - 1 -', - 'FLIPS': '' if trans_b else 'BW - 1 -', + macros = { 'UPAR': 'stride_h' if is_wgrad else '1', 'UPAS': 'stride_w' if is_wgrad else '1', 'UPAH': '' if is_wgrad else 'stride_h', 'UPAW': '' if is_wgrad else 'stride_w', - 'REDAX0': 'NC' if trans_b else 'BH', - 'REDAX1': 'BH' if trans_b else 'BW', - 'REDAX2': 'BW' if trans_b else 'NC', - 'AX0': 'c' if trans_b else 'r', - 'AX1': 'r' if trans_b else 's', - 'AX2': 's' if trans_b else 'c', - 'INC_PB': 'db[newaxis, :]' if is_blut else 'TK', - 'INC_PDB': 'incd' if trans_b else 'TK', 'LUT_SIZE': depth, 'TM': [32], 'TN': [32], @@ -215,20 +183,22 @@ void convnd(A_TYPE *A, } shape_c.pop(2) - print(shape_c) c = triton.empty(shape_c, dtype=a.dtype) - _conv.kernel(a, b, c, CD*CH*CW, NF, NC*BD*BH*BW, AH, AW, BH, BW, CH, CW, NC, + grid = lambda opt: [triton.cdiv(NB*CD*CH*CW, opt.d('TM')), triton.cdiv(NF, opt.d('TN'))] + print(stride_c) + print(stride_b) + _conv.kernel(a, b, c, NB*CD*CH*CW, NF, NC*BD*BH*BW, AH, AW, BH, BW, CH, CW, NC, stride_a[0], stride_a[1], stride_a[2], stride_a[3], stride_a[4], stride_b[0], stride_b[1], stride_b[2], stride_b[3], stride_b[4], stride_c[0], stride_c[1], stride_c[2], stride_c[3], stride_c[4], pad_h, pad_w, stride_h, stride_w, upsample_h, upsample_w, 0, 0, 0, 0, 0, 0, - delta_a, - lambda opt: (1, 1, 1), **macros) + delta_a, inc_a, + grid, **macros) return c @staticmethod def 
forward(ctx, input, weight): - _conv._call(input, weight, 1, 1, 1, 0, 0, 0, 1, 1, 1, '') + return _conv._call(input, weight, 1, 1, 1, 0, 0, 0, 1, 1, 1, '') conv = _conv.apply \ No newline at end of file From 50a52df4896fa59b990fdc750f0159281774f2a7 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 1 Nov 2019 11:20:00 -0400 Subject: [PATCH 486/494] [PYTHON][OPS] Convolution: Some cleaning of Triton-C kernel --- python/examples/conv.py | 3 +-- python/triton/ops/conv.py | 44 +++++++++++++++++++-------------------- 2 files changed, 23 insertions(+), 24 deletions(-) diff --git a/python/examples/conv.py b/python/examples/conv.py index dff03488a..43f0f5d91 100644 --- a/python/examples/conv.py +++ b/python/examples/conv.py @@ -2,7 +2,7 @@ import torch import triton N, C, K = 32, 8, 32 -H, W = 4, 4 +H, W = 16, 16 R, S = 3, 3 torch.manual_seed(0) a = torch.randn(N, C, H, W).cuda() @@ -11,6 +11,5 @@ b = torch.ones(C, R, S, K).cuda() rc = torch.nn.functional.conv2d(a, b.permute(3, 0, 1, 2)) tc = triton.ops.conv(a, b) print((rc - tc).abs().max()) -print((tc[:,:,0,0] - rc[:,:,0,0]).abs()) #print((rc[:30,:30,:,:] - tc[:30, :30, :, :]).abs().max()) #print(tc[31, 31,:,:]) \ No newline at end of file diff --git a/python/triton/ops/conv.py b/python/triton/ops/conv.py index 4bf290258..8a2678f2a 100644 --- a/python/triton/ops/conv.py +++ b/python/triton/ops/conv.py @@ -21,56 +21,57 @@ void convnd(A_TYPE *A, int off_uh, int off_uw, int off_uah, int off_uaw, int off_uch, int off_ucw, - int* a_delta, int* inc_a){ + int* ADELTA, int* ADIFF){ // range of indices along the reduction axis - int rka[TK] = 0 ... TK; - int rkb[TK] = 0 ... TK; + int rxa[TM] = get_program_id(0) * TM + 0 ... TM; + int ryb[TN] = get_program_id(1) * TN + 0 ... TN; + int rk[TK] = 0 ... TK; // initialize accumulator float c[TM, TN] = 0; // pointers for A - int rxa[TM] = get_program_id(0) * TM + 0 ... TM; int rabh[TM] = rxa / CW; int raw[TM] = rxa % CW; int rab[TM] = rabh / CH; int rah[TM] = rabh % CH; rah = rah * UPAW - off_uah; raw = raw * UPAH - off_uaw; - int racr[TK] = rka / BW; - int ras[TK] = rka % BW; + int racr[TK] = rk / BW; + int ras[TK] = rk % BW; int rac[TK] = racr / BH; int rar[TK] = racr % BH; rar = UPAR * rar; ras = UPAS * ras; - int ra0[TM] = rab*lda_n + rah*lda_h + raw*lda_w; - int ra1[TK] = rac*lda_c + rar*lda_h + ras*lda_w; - A_TYPE* pa[TM, TK] = A + ra0[:, newaxis] + ra1[newaxis, :]; + int ram[TM] = rab*lda_n + rah*lda_h + raw*lda_w; + int rak[TK] = rac*lda_c + rar*lda_h + ras*lda_w; + A_TYPE* pa[TM, TK] = A + ram[:, newaxis] + rak[newaxis, :]; // pointers for B - int rbn[TN] = get_program_id(1) * TN + 0 ... 
TN; - B_TYPE* pb[TK, TN] = B + rbn[newaxis, :] * ldb_k + rkb[:, newaxis] * ldb_s; + int rbk[TK] = rk; + int rbn[TN] = ryb; + B_TYPE* pb[TK, TN] = B + rbn[newaxis, :] * ldb_k + rbk[:, newaxis] * ldb_s; // pointers for A look-up table - int offda[TK] = rka % LUT_SIZE; - int* pincd[TK] = inc_a + offda; - int* pda[TK] = a_delta + offda + off_uw * LUT_SIZE + off_uh * LUT_SIZE * upsample_w; - int da[TK] = *pda; - int incd[TK] = *pincd; + int rklut[TK] = rk % LUT_SIZE; + int* padiff[TK] = ADIFF + rklut; + int* padelta[TK] = ADELTA + rklut + off_uw * LUT_SIZE + off_uh * LUT_SIZE * upsample_w; + int adiff[TK] = *padiff; + int adelta[TK] = *padelta; // reduction loop A_TYPE a[TM, TK] = *pa; B_TYPE b[TK, TN] = *pb; for(int k = K; k > 0; k = k - TK){ c += a @ b; - pa += da[newaxis, :]; + pa += adelta[newaxis, :]; pb += TK * ldb_s; // increment A look-up table - pda = pda + incd; - da = *pda; - pincd = pincd + incd; - incd = *pincd; + padelta = padelta + adiff; + adelta = *padelta; + padiff = padiff + adiff; + adiff = *padiff; // pre-fetches bool checka[TM, TK] = k > TK; bool checkb[TK, TN] = k > TK; @@ -78,7 +79,6 @@ void convnd(A_TYPE *A, b = checkb ? *pb : 0; } - // write back int rxc[TM] = get_program_id(0) * TM + 0 ... TM; int rc1[TN] = get_program_id(1) * TN + 0 ... TN; From f278d9741a3bee8c59ba3ce03178d28aad03d450 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 16 Jan 2020 12:09:50 -0500 Subject: [PATCH 487/494] [GENERAL] Merged einsum feature branch. Various feature, performance improvements and bugfixes: * Added preliminary support for extended Einstein summation in PyTriton * Significant performance improvement on FP32 kernels containing matrix multiplication * Added re-coalescing pass for FP16 kernels containing matrix multiplication * Various bugfixes --- include/triton/codegen/instructions.h | 83 -- include/triton/codegen/selection/generator.h | 1 + .../triton/codegen/selection/machine_value.h | 8 +- include/triton/driver/module.h | 7 +- include/triton/ir/builder.h | 9 +- include/triton/ir/enums.h | 1 + include/triton/ir/instructions.h | 17 +- include/triton/ir/visitor.h | 2 + include/triton/lang/code_gen.h | 1 + include/triton/lang/token.h | 1 + include/triton/lang/type.h | 6 +- include/triton/runtime/function.h | 8 +- include/triton/tools/bench.hpp | 5 +- lib/codegen/analysis/align.cc | 42 +- lib/codegen/analysis/axes.cc | 1 + lib/codegen/analysis/layout.cc | 88 +- lib/codegen/selection/generator.cc | 247 +++++- lib/codegen/selection/machine_layout.cc | 2 +- lib/codegen/selection/machine_value.cc | 35 +- lib/codegen/transform/coalesce.cc | 45 +- lib/codegen/transform/cts.cc | 46 +- lib/codegen/transform/membar.cc | 1 - lib/driver/module.cc | 31 +- lib/ir/builder.cc | 18 +- lib/ir/instructions.cc | 36 +- lib/lang/code_gen.cc | 97 ++- lib/lang/parser.cc | 6 +- lib/lang/token.cc | 1 + lib/lang/type.cc | 3 +- lib/runtime/function.cc | 33 +- python/examples/blocksparse.py | 157 ---- python/examples/conv.py | 15 - python/examples/dot.py | 71 -- python/examples/einsum.py | 266 ++++-- python/examples/kernels/shift_cuda.cpp | 42 + python/examples/kernels/shift_cuda_kernel.cu | 111 +++ python/setup.py | 2 +- python/src/bindings.cc | 19 +- python/triton/function.py | 8 +- python/triton/kernel.py | 31 +- python/triton/ops/conv.py | 239 +++-- python/triton/ops/dot.py | 71 +- python/triton/ops/einsum.py | 823 +++++++++++++----- python/triton/utils.py | 22 +- tests/bench/dot.cc | 10 +- tests/common/dot.h | 60 +- tests/common/src/dot.h | 83 +- tests/common/util.h | 2 +- tests/unit/dot.cc | 4 
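For orientation, the extended Einstein summation merged here is driven from PyTriton. The sketch below shows the intended call pattern; the exact signature of triton.ops.einsum in this revision may differ, and the shapes and the 'nct,ck->ntk' expression are illustrative assumptions, not taken from this patch:

    # sketch only: exercising the einsum entry point merged by this patch
    import torch
    import triton

    a = torch.randn(32, 256, 128).cuda()   # labeled 'n c t'
    b = torch.randn(256, 512).cuda()       # labeled 'c k'
    # batched contraction over 'c', lowered to a single Triton kernel
    c = triton.ops.einsum('nct,ck->ntk', a, b)
    print(c.shape)                         # (32, 128, 512)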
+- 49 files changed, 1923 insertions(+), 994 deletions(-) delete mode 100644 include/triton/codegen/instructions.h delete mode 100644 python/examples/blocksparse.py delete mode 100644 python/examples/conv.py delete mode 100644 python/examples/dot.py create mode 100644 python/examples/kernels/shift_cuda.cpp create mode 100644 python/examples/kernels/shift_cuda_kernel.cu diff --git a/include/triton/codegen/instructions.h b/include/triton/codegen/instructions.h deleted file mode 100644 index c42abee4a..000000000 --- a/include/triton/codegen/instructions.h +++ /dev/null @@ -1,83 +0,0 @@ -#ifndef _TRITON_CODEGEN_INSTRUCTIONS_H_ -#define _TRITON_CODEGEN_INSTRUCTIONS_H_ - -#include "triton/ir/enums.h" -#include -#include - -namespace triton{ - -namespace ir{ -class instruction; -} - -namespace codegen{ - -enum storage_info_t { - NONE, - ANY, - SHARED, - DISTRIBUTED, - REPLICATED -}; - -typedef std::pair> inst_storage_info_t; -static const std::map storage_info = { - // scalars - { ir::INST_GET_PROGRAM_ID, {REPLICATED, {}}}, - { ir::INST_GET_NUM_PROGRAMS, {REPLICATED, {}}}, - // scalar/array - { ir::INST_PHI, {ANY, {ANY, ANY}}}, - { ir::INST_BINOP, {DISTRIBUTED, {DISTRIBUTED, DISTRIBUTED}}}, - { ir::INST_GETELEMENTPTR, {DISTRIBUTED, {DISTRIBUTED, DISTRIBUTED}}}, - { ir::INST_SELECT, {DISTRIBUTED, {DISTRIBUTED, DISTRIBUTED, DISTRIBUTED}}}, - { ir::INST_SQRT, {DISTRIBUTED, {DISTRIBUTED}}}, - // cmp - { ir::INST_ICMP, {DISTRIBUTED, {DISTRIBUTED, DISTRIBUTED}}}, - { ir::INST_FCMP, {DISTRIBUTED, {DISTRIBUTED, DISTRIBUTED}}}, - // cast - { ir::INST_CAST_TRUNC, {DISTRIBUTED, {DISTRIBUTED}}}, - { ir::INST_CAST_ZEXT, {DISTRIBUTED, {DISTRIBUTED}}}, - { ir::INST_CAST_SEXT, {DISTRIBUTED, {DISTRIBUTED}}}, - { ir::INST_CAST_FP_TRUNC, {DISTRIBUTED, {DISTRIBUTED}}}, - { ir::INST_CAST_FP_EXT, {DISTRIBUTED, {DISTRIBUTED}}}, - { ir::INST_CAST_UI_TO_FP, {DISTRIBUTED, {DISTRIBUTED}}}, - { ir::INST_CAST_SI_TO_FP, {DISTRIBUTED, {DISTRIBUTED}}}, - { ir::INST_CAST_FP_TO_UI, {DISTRIBUTED, {DISTRIBUTED}}}, - { ir::INST_CAST_FP_TO_SI, {DISTRIBUTED, {DISTRIBUTED}}}, - { ir::INST_CAST_PTR_TO_INT, {DISTRIBUTED, {DISTRIBUTED}}}, - { ir::INST_CAST_INT_TO_PTR, {DISTRIBUTED, {DISTRIBUTED}}}, - { ir::INST_CAST_BIT_CAST, {DISTRIBUTED, {DISTRIBUTED}}}, - { ir::INST_CAST_ADDR_SPACE_CAST, {DISTRIBUTED, {DISTRIBUTED}}}, - // io - { ir::INST_UNMASKED_LOAD, {DISTRIBUTED, {DISTRIBUTED}}}, - { ir::INST_MASKED_LOAD, {DISTRIBUTED, {DISTRIBUTED, DISTRIBUTED}}}, - { ir::INST_UNMASKED_STORE, {NONE , {DISTRIBUTED, DISTRIBUTED}}}, - { ir::INST_MASKED_STORE, {NONE , {DISTRIBUTED, DISTRIBUTED, DISTRIBUTED}}}, - // retile - { ir::INST_RESHAPE, {DISTRIBUTED, {DISTRIBUTED}}}, - { ir::INST_SPLAT, {DISTRIBUTED, {REPLICATED}}}, - { ir::INST_BROADCAST, {DISTRIBUTED, {REPLICATED}}}, - { ir::INST_DOWNCAST, {DISTRIBUTED, {REPLICATED}}}, - // array arithmetic - { ir::INST_TRANS, {SHARED, {SHARED}}}, - { ir::INST_REDUCE, {SHARED, {DISTRIBUTED}}}, - { ir::INST_DOT, {DISTRIBUTED, {SHARED, SHARED, DISTRIBUTED}}}, - // terminator - { ir::INST_RETURN, {NONE, {}}}, - { ir::INST_UNCOND_BRANCH, {NONE, {}}}, - { ir::INST_COND_BRANCH, {NONE, {REPLICATED}}}, - // intrinsics - { ir::INST_COPY_TO_SHARED, {SHARED, {DISTRIBUTED}}}, - { ir::INST_COPY_FROM_SHARED, {DISTRIBUTED, {SHARED}}}, - { ir::INST_BARRIER, {NONE, {}}}, - { ir::INST_MAKE_RANGE_DYN, {DISTRIBUTED, {}}}, - { ir::INST_MAKE_RANGE_STA, {DISTRIBUTED, {}}}, - { ir::INST_MAKE_RANGE, {DISTRIBUTED, {}}} -}; - - -} -} - -#endif diff --git a/include/triton/codegen/selection/generator.h 
b/include/triton/codegen/selection/generator.h index 3e6c0bacb..1f18bc6e1 100644 --- a/include/triton/codegen/selection/generator.h +++ b/include/triton/codegen/selection/generator.h @@ -122,6 +122,7 @@ public: void visit_reduce_inst(ir::reduce_inst*); void visit_select_inst(ir::select_inst*); + void visit_recoalesce_inst(ir::recoalesce_inst*); void visit_copy_to_shared_inst(ir::copy_to_shared_inst*); void visit_copy_from_shared_inst(ir::copy_from_shared_inst*); void visit_barrier_inst(ir::barrier_inst*); diff --git a/include/triton/codegen/selection/machine_value.h b/include/triton/codegen/selection/machine_value.h index 508881fd3..917151971 100644 --- a/include/triton/codegen/selection/machine_value.h +++ b/include/triton/codegen/selection/machine_value.h @@ -128,22 +128,22 @@ private: Type *make_vector_ty(Type *ty, size_t vector_size); public: - distributed_tile(Type *ty, const shapes_t& shapes, const std::vector& order, const axes_t &axes, Builder &builder, bool vectorize); + distributed_tile(Type *ty, const shapes_t& shapes, const std::vector& order, const axes_t &axes, Builder &builder); void set_value(indices_t idx, Value *v); Value* get_value(indices_t idx); const std::vector& get_order() { return order_; } unsigned get_linear_index(indices_t idx); indices_t get_ordered_indices(unsigned id); - void for_each(std::function fn); - const distributed_axis &axis(unsigned dim) { return axes_.at(dim); } + void for_each(std::function fn, int start = 0, int end = -1); + void for_each(std::function fn, std::vector start, std::vector size); + const distributed_axis &axis(unsigned dim) { return axes_.at(dim); } private: axes_t axes_; std::vector order_; indices_map_t indices_; values_map_t values_; ordered_indices_vec_t ordered_indices_; - size_t vector_size_; Builder &builder_; }; diff --git a/include/triton/driver/module.h b/include/triton/driver/module.h index 4f277d99b..d62142c7d 100755 --- a/include/triton/driver/module.h +++ b/include/triton/driver/module.h @@ -45,6 +45,8 @@ public: llvm::SmallVectorImpl &buffer, const std::string &features, file_type_t file_type); + virtual std::unique_ptr symbol(const char * name) const = 0; + protected: driver::context* ctx_; @@ -54,13 +56,14 @@ protected: class host_module: public module{ public: host_module(driver::context* context, std::unique_ptr module); + std::unique_ptr symbol(const char * name) const; }; // OpenCL class ocl_module: public module{ - public: ocl_module(driver::context* context, std::unique_ptr module); + std::unique_ptr symbol(const char * name) const; }; // CUDA @@ -70,7 +73,7 @@ class cu_module: public module { public: cu_module(driver::context* context, std::unique_ptr module); cu_module(driver::context* context, const std::string& source); - cu_buffer* symbol(const char * name) const; + std::unique_ptr symbol(const char * name) const; private: std::string source_; diff --git a/include/triton/ir/builder.h b/include/triton/ir/builder.h index 62690b11e..d5782e3a1 100644 --- a/include/triton/ir/builder.h +++ b/include/triton/ir/builder.h @@ -37,6 +37,7 @@ public: // Constants value *get_int32(unsigned val); // Types + type *get_void_ty(); type *get_int1_ty(); type *get_int8_ty(); type *get_int16_ty(); @@ -115,10 +116,10 @@ public: value *create_and(value *lhs, value *rhs, const std::string &name = ""); value *create_xor(value *lhs, value *rhs, const std::string &name = ""); value *create_or(value *lhs, value *rhs, const std::string &name = ""); - // Side effects - value *create_fneg(value *arg, const std::string &name = ""); - 
value *create_neg(value *arg, const std::string &name = ""); - value *create_not(value *arg, const std::string &name = ""); + // Unary +// value *create_fneg(value *arg, const std::string &name = ""); +// value *create_neg(value *arg, const std::string &name = ""); +// value *create_not(value *arg, const std::string &name = ""); // Input/Output value *create_load(value *arg, const std::string &name = ""); value *create_store(value *ptr, value *val, const std::string &name = ""); diff --git a/include/triton/ir/enums.h b/include/triton/ir/enums.h index 94c74c085..491d37edf 100644 --- a/include/triton/ir/enums.h +++ b/include/triton/ir/enums.h @@ -134,6 +134,7 @@ enum value_id_t: unsigned { // intrinsics INST_COPY_TO_SHARED, INST_COPY_FROM_SHARED, + INST_RECOALESCE, INST_BARRIER, INST_MAKE_RANGE_DYN, INST_MAKE_RANGE_STA, diff --git a/include/triton/ir/instructions.h b/include/triton/ir/instructions.h index 7c6b0465d..41eb98eb3 100644 --- a/include/triton/ir/instructions.h +++ b/include/triton/ir/instructions.h @@ -148,9 +148,9 @@ public: // Factory methods static binary_operator *create(binary_op_t op, value *lhs, value *rhs, const std::string &name = "", instruction *next = nullptr); - static binary_operator *create_fneg(value *arg, const std::string &name = "", instruction *next = nullptr); - static binary_operator *create_neg(value *arg, const std::string &name = "", instruction *next = nullptr); - static binary_operator *create_not(value *arg, const std::string &name = "", instruction *next = nullptr); +// static binary_operator *create_fneg(value *arg, const std::string &name = "", instruction *next = nullptr); +// static binary_operator *create_neg(value *arg, const std::string &name = "", instruction *next = nullptr); +// static binary_operator *create_not(value *arg, const std::string &name = "", instruction *next = nullptr); _TRITON_DEFINE_CLONE(binary_operator) _TRITON_DEFINE_ACCEPT(binary_operator) @@ -732,6 +732,17 @@ public: _TRITON_DEFINE_ACCEPT(copy_from_shared_inst) }; +class recoalesce_inst: public unary_inst{ +private: + using unary_inst::unary_inst; + std::string repr_impl() const { return "recoalesce_inst"; } + +public: + static recoalesce_inst* create(value *arg, const std::string &name = "", instruction *next = nullptr); + _TRITON_DEFINE_CLONE(recoalesce_inst) + _TRITON_DEFINE_ACCEPT(recoalesce_inst) +}; + class barrier_inst: public instruction{ private: barrier_inst(context &ctx, const std::string &name, instruction *next); diff --git a/include/triton/ir/visitor.h b/include/triton/ir/visitor.h index 62e63e6c4..b5941b88f 100644 --- a/include/triton/ir/visitor.h +++ b/include/triton/ir/visitor.h @@ -59,6 +59,7 @@ class sqrt_inst; class reduce_inst; class select_inst; +class recoalesce_inst; class copy_to_shared_inst; class copy_from_shared_inst; class barrier_inst; @@ -129,6 +130,7 @@ public: virtual void visit_reduce_inst(reduce_inst*) = 0; virtual void visit_select_inst(select_inst*) = 0; + virtual void visit_recoalesce_inst(recoalesce_inst*) = 0; virtual void visit_copy_to_shared_inst(copy_to_shared_inst*) = 0; virtual void visit_copy_from_shared_inst(copy_from_shared_inst*) = 0; virtual void visit_barrier_inst(barrier_inst*) = 0; diff --git a/include/triton/lang/code_gen.h b/include/triton/lang/code_gen.h index 96a02ce9a..a29cf268b 100644 --- a/include/triton/lang/code_gen.h +++ b/include/triton/lang/code_gen.h @@ -47,6 +47,7 @@ protected: }; void set_ret(ir::value* value); + ir::value *GenUnaryMinus(ir::value* arg); public: Generator(Parser* parser) : parser_(parser) 
{} diff --git a/include/triton/lang/token.h b/include/triton/lang/token.h index f11d08fc8..1b2868849 100644 --- a/include/triton/lang/token.h +++ b/include/triton/lang/token.h @@ -145,6 +145,7 @@ public: THREAD, // _Thread_local AUTO, GLOBAL, + CMEM, // constant memory // STORAGE CLASS SPECIFIER END BREAK, diff --git a/include/triton/lang/type.h b/include/triton/lang/type.h index 0985ba5e1..59ea8eb3f 100644 --- a/include/triton/lang/type.h +++ b/include/triton/lang/type.h @@ -39,7 +39,7 @@ enum { S_EXTERN = 0x02, S_STATIC = 0x04, S_THREAD = 0x08, - S_AUTO = 0x10, + S_CONSTANT = 0x10, S_GLOBAL = 0x20, // Type specifier @@ -73,7 +73,8 @@ struct Qualifier { CONST = 0x01, RESTRICT = 0x02, VOLATILE = 0x04, - MASK = CONST | RESTRICT | VOLATILE + CMEM = 0x08, + MASK = CONST | RESTRICT | VOLATILE | CMEM }; }; @@ -111,6 +112,7 @@ public: bool IsConstQualified() const { return ptr_ & Qualifier::CONST; } bool IsRestrictQualified() const { return ptr_ & Qualifier::RESTRICT; } bool IsVolatileQualified() const { return ptr_ & Qualifier::VOLATILE; } + bool IsConstantQualified() const { return ptr_ & Qualifier::CMEM; } private: intptr_t ptr_; diff --git a/include/triton/runtime/function.h b/include/triton/runtime/function.h index a6ab851a9..26253b7ee 100644 --- a/include/triton/runtime/function.h +++ b/include/triton/runtime/function.h @@ -8,11 +8,9 @@ #include #include #include -#include // codegen #include "triton/ir/context.h" #include "triton/codegen/target.h" -#include "triton/lang/parser.h" #include "triton/runtime/arg.h" namespace llvm { @@ -20,6 +18,8 @@ namespace llvm { class LLVMContext; } +class Parser; + namespace triton { namespace driver{ @@ -106,14 +106,14 @@ public: function(const std::string& src, const options_space_t& opt = options_space_t()); void operator()(const std::vector& args, const grid_t& grid, driver::stream* stream); void operator()(const std::vector& args, const grid_fn_ty& grid, driver::stream *stream); - std::string make_tensorflow_src(const std::vector &outputs, const std::string ¯o); + void set_cst(const std::string& name, void* data, size_t n_bytes); private: ir::context ctx_; std::string src_; options_space_t opt_space_; std::map cache_; - std::mutex src_mutex_; + std::map> cst_; }; } diff --git a/include/triton/tools/bench.hpp b/include/triton/tools/bench.hpp index 48a4ab972..430418b27 100644 --- a/include/triton/tools/bench.hpp +++ b/include/triton/tools/bench.hpp @@ -30,7 +30,7 @@ private: high_resolution_clock::time_point _start; }; -inline double bench(std::function const & op, driver::stream * stream) +inline double bench(std::function const & op, driver::stream * stream, bool normalize = false) { // const driver::device * device = stream->context()->device(); timer tmr; @@ -38,9 +38,10 @@ inline double bench(std::function const & op, driver::stream * stream) double total_time = 0; op(); stream->synchronize(); - while(total_time*1e-9 < 1e-2){ + while(total_time*1e-9 < 1e-1){ float norm = 1; // normalize clock if possible to reduce noise in auto-tuning + if(normalize) if(auto cu_device = dynamic_cast(stream->context()->device())) norm = (float)cu_device->current_sm_clock()/cu_device->max_sm_clock(); tmr.start(); diff --git a/lib/codegen/analysis/align.cc b/lib/codegen/analysis/align.cc index ec692d6f6..8e9ba699c 100644 --- a/lib/codegen/analysis/align.cc +++ b/lib/codegen/analysis/align.cc @@ -5,24 +5,41 @@ #include "triton/ir/basic_block.h" #include "triton/ir/instructions.h" #include "triton/ir/type.h" +#include namespace triton { namespace codegen{ namespace 
analysis{ -inline int gcd(int a, int b) { - if (a == 0) - return b; - if (b == 0) - return a; - if (a == b) - return a; - if (a > b) - return gcd(a - b, b); - return gcd(a, b - a); +// Function for extended Euclidean Algorithm +int gcd_impl(int a, int b, int *x, int *y) +{ + // Base Case + if (a == 0) + { + *x = 0; + *y = 1; + return b; + } + + int x1, y1; // To store results of recursive call + int gcd = gcd_impl(b%a, a, &x1, &y1); + + // Update x and y using results of + // recursive call + *x = y1 - (b/a) * x1; + *y = x1; + + return gcd; } +int gcd(int a, int b) { + int x, y; + return gcd_impl(a, b, &x, &y); +} + + inline int lcm(int a, int b) { return (a * b) / gcd(a, b); } @@ -156,7 +173,7 @@ std::vector align::populate_is_constant(ir::value *v) { if(is_constant_.find(v) != is_constant_.end()) return is_constant_.at(v); if(auto *x = dynamic_cast(v)) - return add_to_cache(v, {cst_info{true, (unsigned)x->get_value()}}, is_constant_); + return add_to_cache(v, {cst_info{true, std::min(x->get_value(), 128)}}, is_constant_); if(dynamic_cast(v)) return add_to_cache(v, {cst_info{true, 0}}, is_constant_); if(auto *x = dynamic_cast(v)) @@ -448,7 +465,7 @@ std::vector align::populate_starting_multiple(ir::value *v){ if(auto *x = dynamic_cast(v)) return populate_starting_multiple_binop(x); if(auto *x = dynamic_cast(v)) - return add_to_cache(x, {(unsigned)x->get_value()}, starting_multiple_); + return add_to_cache(x, {std::min(x->get_value(), 128)}, starting_multiple_); if(auto *x = dynamic_cast(v)) return add_to_cache(x, {(unsigned)x->get_first()->get_value()}, starting_multiple_); if(auto *x = dynamic_cast(v)) @@ -484,6 +501,7 @@ void align::populate(ir::value *v) { populate_is_constant(v); populate_starting_multiple(v); populate_max_contiguous(v); + } void align::run(ir::module &mod) { diff --git a/lib/codegen/analysis/axes.cc b/lib/codegen/analysis/axes.cc index 0e67877b9..a01ef9aa1 100644 --- a/lib/codegen/analysis/axes.cc +++ b/lib/codegen/analysis/axes.cc @@ -113,6 +113,7 @@ void axes::update_graph(ir::instruction *i) { case ir::INST_DOT: return update_graph_dot(i); case ir::INST_COPY_TO_SHARED: return update_graph_no_edge(i);; case ir::INST_COPY_FROM_SHARED: return update_graph_no_edge(i); + case ir::INST_RECOALESCE: return update_graph_no_edge(i); default: return update_graph_elementwise(i); } return; diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc index 70ca9e3b2..6d7c2dc9c 100644 --- a/lib/codegen/analysis/layout.cc +++ b/lib/codegen/analysis/layout.cc @@ -1,9 +1,9 @@ #include #include +#include #include "triton/codegen/analysis/axes.h" #include "triton/codegen/analysis/align.h" #include "triton/codegen/analysis/layout.h" -#include "triton/codegen/instructions.h" #include "triton/ir/function.h" #include "triton/ir/module.h" #include "triton/ir/utils.h" @@ -148,8 +148,11 @@ layout_t::layout_t(layout_type_t _type, extract_io_use(v, ptr); order.resize(axes.size()); std::iota(order.begin(), order.end(), 0); - for(ir::value *v: ptr){ - auto max_contiguous = align->contiguous(v); + auto largest = std::max_element(ptr.begin(), ptr.end(), [&](ir::value *x, ir::value *y){ + return x->get_type()->get_tile_rank() < y->get_type()->get_tile_rank(); + }); + if(*largest){ + auto max_contiguous = align->contiguous(*largest); std::sort(order.begin(), order.end(), [&](unsigned a, unsigned b) { return max_contiguous[a] > max_contiguous[b]; }); @@ -166,9 +169,8 @@ layout_hmma_884_t::layout_hmma_884_t(size_t num_warps, const std::vector& _shapes, const std::vector &values, 
ir::type *_ty, size_t _id, analysis::align* align): layout_t(HMMA_884, _axes, _shapes, values, _ty, _id, align) { - - unsigned shape_0 = shapes[order[0]]; - unsigned shape_1 = shapes[order[1]]; + unsigned shape_0 = shapes[0]; + unsigned shape_1 = shapes[1]; /* fragments per warp */ // try to make things as square as possible to maximize data re-use fpw = {1, 1, 1}; @@ -196,6 +198,7 @@ layout_hmma_884_t::layout_hmma_884_t(size_t num_warps, unsigned effective_num_warps = 1; for(size_t d = 0; d < shapes.size(); d++) effective_num_warps *= wpt[d]; + if(num_warps != effective_num_warps) throw std::runtime_error("cannot create a kernel with this amount of warps"); } @@ -213,20 +216,38 @@ layout_scanline_t::layout_scanline_t(size_t num_warps, unsigned num_threads = num_warps * 32; nts.resize(shapes.size()); mts.resize(shapes.size()); + bool is_dot = std::any_of(values.begin(), values.end(), + [&](ir::value* v) { return dynamic_cast(v); }); + + ir::value *ptr = nullptr; + for(ir::value *v: values) + for(ir::user *usr: v->get_users()) + if(auto *st = dynamic_cast(usr)) + ptr = st->get_pointer_operand(); + unsigned i = order[0]; - nts[i] = clamp(size / num_threads, 1, 4); + int contiguous = 4; + if(ptr) + contiguous = std::min(align->contiguous(ptr)[i], 4); + + nts[i] = clamp(size / num_threads, 1, std::min(contiguous, shapes[i])); mts[i] = clamp(num_threads, 1, shapes[i] / nts[i]); - num_threads = num_threads / mts[i]; + size /= shapes[i]; + num_threads /= mts[i]; + if(is_dot) + nts[order[1]] = clamp(size / num_threads, 1, std::min(4, shapes[order[1]])); for(size_t d = 1; d < shapes.size(); d++){ i = order[d]; - nts[i] = 1; - mts[i] = clamp(num_threads, 1, shapes[i]); + if(d > 1 || !is_dot) + nts[i] = 1; + mts[i] = clamp(num_threads, 1, shapes[i] / nts[i]); num_threads = num_threads / mts[i]; } /* sanity check */ unsigned effective_num_threads = 1; for(size_t d = 0; d < shapes.size(); d++) effective_num_threads *= mts[d]; + if(num_warps * 32 != effective_num_threads) throw std::runtime_error("cannot create a kernel with this amount of warps"); } @@ -259,8 +280,8 @@ void extract_double_bufferable(ir::value *v, std::shared_ptr(value_0); ir::instruction *i_1 = dynamic_cast(value_1); if(!i_0 || !i_1 || - storage_info.at(i_0->get_id()).first != codegen::SHARED || - storage_info.at(i_1->get_id()).first != codegen::SHARED) + !dynamic_cast(i_0) || + !dynamic_cast(i_1) ) return; if(is_latch_1) res.reset(new double_buffer_info_t{value_0, value_1, phi}); @@ -284,10 +305,9 @@ layout_shared_t::layout_shared_t(const layout_t *arg, extract_double_bufferable(v, double_buffer); // order - if(arg->type == SCANLINE) - order = arg->order; - else - order = arg->order; + std::vector arg_order = arg ? arg->order : std::vector{0}; + order = arg_order; + ir::value* dot_a = nullptr; ir::value* dot_b = nullptr; ir::value* hmma_dot_a = nullptr; @@ -304,24 +324,27 @@ layout_shared_t::layout_shared_t(const layout_t *arg, col.push_back(s); row.push_back(s); } + + bool is_nonhmma_dot_a = dot_a && !hmma_dot_a; bool is_nonhmma_dot_b = dot_b && !hmma_dot_b; if(is_nonhmma_dot_a) order = is_trans(dot_a) ? row : col; - if(is_nonhmma_dot_b) + else if(is_nonhmma_dot_b) order = is_trans(dot_b) ? col : row; - +// else +// order = row; // padding pad = 0; if(hmma_dot_a){ bool row = is_trans(hmma_dot_a) ^ order[0] != 0; - pad = 24 - shapes[row ? order[0] : order[1]] % 32; + pad = 24 - shapes[row ? 0 : 1] % 32; } else if(hmma_dot_b){ bool row = is_trans(hmma_dot_b) ^ order[0] != 0; - pad = 24 - shapes[row ? 
order[1] : order[0]] % 32; + pad = 24 - shapes[row ? 1 : 0] % 32; } - else if(order != arg->order) { + else if(order != arg_order) { pad = 4; } shapes[order[0]] += pad; @@ -395,6 +418,29 @@ void layout::run(ir::module &mod) { layouts_[id] = new layout_shared_t(layout, axes_->get(arg), shapes, {red}, red->get_type()->get_scalar_ty(), id, align_); tmp_[red] = id; } + if(auto *recoalasce = dynamic_cast(i)){ + ir::value *val = recoalasce->get_operand(0); + const layout_t* in_layout = get(val); + const layout_t* out_layout = get(i); + if(in_layout->type != HMMA_884) + return; + id++; + ir::type::tile_shapes_t in_shape = val->get_type()->get_tile_shapes(); + ir::type::tile_shapes_t shape(in_shape.size()); + size_t ld = out_layout->order[0]; + shape[ld] = in_shape[ld]; + for(size_t k = 0; k < in_shape.size(); k++) + if(k != ld) + shape[k] = 4*in_layout->fpw[k]*in_layout->wpt[k]; + // create layout + layouts_[id] = new layout_shared_t(out_layout, axes_->get(val), shape, {recoalasce}, val->get_type()->get_scalar_ty(), id, align_); + tmp_[recoalasce] = id; + } + if(auto *atom = dynamic_cast(i)){ + id++; + layouts_[id] = new layout_shared_t(nullptr, {}, {1}, {atom}, atom->get_type()->get_scalar_ty(), id, align_); + tmp_[atom] = id; + } }); } diff --git a/lib/codegen/selection/generator.cc b/lib/codegen/selection/generator.cc index 6585af9be..4d4fe0b11 100644 --- a/lib/codegen/selection/generator.cc +++ b/lib/codegen/selection/generator.cc @@ -7,7 +7,6 @@ #include "triton/codegen/analysis/allocation.h" #include "triton/codegen/analysis/align.h" #include "triton/codegen/transform/coalesce.h" -#include "triton/codegen/instructions.h" #include "triton/ir/context.h" #include "triton/ir/module.h" #include "triton/ir/function.h" @@ -351,10 +350,9 @@ void generator::visit_masked_load_inst(ir::masked_load_inst* x) { unsigned id = linear / vector_size; if(linear % vector_size == 0) { Value *ptr = pointers->get_value(idx); - - ptr = builder_->CreateBitCast(ptr, PointerType::get(VectorType::get(result->get_ty(), vector_size), ptr->getType()->getPointerAddressSpace())); + Value *mask = masks->get_value(idx); BasicBlock *current_bb = builder_->GetInsertBlock(); Function *parent = builder_->GetInsertBlock()->getParent(); @@ -386,9 +384,9 @@ void generator::visit_masked_load_inst(ir::masked_load_inst* x) { // if(cst) // offset = " + " + std::to_string(cst->getValue().getSExtValue()*2*vector_size); // Type *fp16x2_ty = VectorType::get(builder_->getHalfTy(), 2); -// Type *fp16x2_pack4_ty = StructType::get(ctx, {fp16x2_ty, fp16x2_ty, fp16x2_ty, fp16x2_ty}); +// Type *fp16x2_pack4_ty = StructType::get(*ctx_, {fp16x2_ty, fp16x2_ty, fp16x2_ty, fp16x2_ty}); // FunctionType *ty = FunctionType::get(fp16x2_pack4_ty, {mask->getType(), ptr->getType()}, false); -// std::string asm_str = "@$0 ld.global.nc.b32 {$1, $2, $3, $4}, [$5" + offset + "];"; +// std::string asm_str = "@$0 ld.global.nc.v4.b32 {$1, $2, $3, $4}, [$5" + offset + "];"; // if(false_values) // asm_str += "\n\t@!$0 mov.v4.b32 {$1, $2, $3, $4}, {0, 0, 0, 0};"; // InlineAsm *iasm = InlineAsm::get(ty, asm_str, "b,=r,=r,=r,=r,l", true); @@ -420,31 +418,83 @@ void generator::visit_unmasked_store_inst(ir::unmasked_store_inst* st) { void generator::visit_masked_store_inst(ir::masked_store_inst* st) { distributed_tile* ptrs = (distributed_tile*)tmap_.at(st->get_pointer_operand()); - distributed_tile* scalars = (distributed_tile*)tmap_.at(st->get_value_operand()); - ir::value *mask = st->get_mask_operand(); - distributed_tile* preds = (distributed_tile*)tmap_.at(mask); - 
ptrs->for_each([&](indices_t idx){ - Value *scalar = scalars->get_value(idx); - Value *ptr = ptrs->get_value(idx); - Value *pred = preds->get_value(idx); - Function *parent = builder_->GetInsertBlock()->getParent(); - BasicBlock *mask_then_bb = BasicBlock::Create(*ctx_, "mask_then", parent); - BasicBlock *mask_done_bb = BasicBlock::Create(*ctx_, "mask_done", parent); - builder_->CreateCondBr(pred, mask_then_bb, mask_done_bb); - builder_->SetInsertPoint(mask_then_bb); - builder_->CreateStore(scalar, ptr); - builder_->CreateBr(mask_done_bb); - builder_->SetInsertPoint(mask_done_bb); -// std::string offset = ""; -// if(GetElementPtrInst *gep = dyn_cast(ptr)) -// if(gep->getNumIndices() == 1) -// if(ConstantInt *cst = dyn_cast(gep->idx_begin())){ -// offset = " + " + std::to_string(cst->getValue().getSExtValue()*4); -// } -// FunctionType *ty = FunctionType::get(Type::getVoidTy(ctx), {pred->getType(), ptr->getType(), scalar->getType()}, false); -// std::string asm_str = "@$0 st.global.b32 [$1" + offset + "], $2;"; -// InlineAsm *iasm = InlineAsm::get(ty, asm_str, "b,l,f", true); -// builder.CreateCall(iasm, {pred, ptr, scalar}); + distributed_tile* masks = (distributed_tile*)tmap_.at(st->get_mask_operand()); + // vector size + int vector_size = 1; + int ld = ptrs->get_order()[0]; + unsigned alignment = alignment_->get(st->get_pointer_operand(), ld); + vector_size = std::min(ptrs->axis(ld).contiguous, alignment); + // create packets + std::map packets; + ir::value *arg = st->get_value_operand(); + for_each(arg, [&](indices_t idx){ + distributed_tile* in = (distributed_tile*)tmap_.at(arg); + unsigned linear = in->get_linear_index(idx); + unsigned id = linear / vector_size; + Value *in_value = in->get_value(idx); + if(linear % vector_size == 0) + packets[id] = UndefValue::get(VectorType::get(in_value->getType(), vector_size)); + packets[id] = builder_->CreateInsertElement(packets.at(id), in_value, linear % vector_size); + }); + // write-back packets + for_each(arg, [&](indices_t idx){ + distributed_tile* in = (distributed_tile*)tmap_.at(arg); + unsigned linear = in->get_linear_index(idx); + unsigned id = linear / vector_size; + if(linear % vector_size == 0){ + // fetch tile elements + Value *elt = packets[id]; + Value *ptr = ptrs->get_value(idx); + Value *pred = masks->get_value(idx); + // type information + Type *ty = elt->getType(); + unsigned nbits = ty->getScalarSizeInBits(); + unsigned nbytes = nbits / 8; + // extract pointer offset + std::string offset = ""; + if(GetElementPtrInst *gep = dyn_cast(ptr)) + if(gep->getNumIndices() == 1) + if(ConstantInt *cst = dyn_cast(gep->idx_begin())){ + offset = " + " + std::to_string(cst->getValue().getSExtValue()*nbytes); + ptr = gep->getPointerOperand(); + } + ptr = builder_->CreateBitCast(ptr, ty->getPointerTo(1)); + // asm argument type + std::vector arg_ty = {pred->getType(), ptr->getType()}; + for(int v = 0; v < vector_size; v++) + arg_ty.push_back(ty->getScalarType()); + // asm function type + FunctionType *fn_ty = FunctionType::get(builder_->getVoidTy(), arg_ty, false); + // asm string + std::string asm_str; + asm_str += "@$0 st.global"; + if(vector_size > 1) + asm_str += ".v" + std::to_string(vector_size); + asm_str += ".b" + std::to_string(nbits) + " [$1" + offset + "],"; + if(vector_size > 1) + asm_str += "{"; + for(int v = 0; v < vector_size; v++){ + if(v > 0) + asm_str += ", "; + asm_str += "$" + std::to_string(2 + v); + } + if(vector_size > 1) + asm_str += "}"; + asm_str += ";"; + // asm constraint + std::string constraint = "b,l"; + for(int 
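// note: in NVPTX inline-asm constraints, 'b' binds the predicate and 'l' the
// 64-bit address; the loop below then appends one register letter per vector
// element, 'r' for 32-bit values and 'h' for 16-bit values.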
v = 0; v < vector_size; v++){ + constraint += ","; + constraint += (nbits == 32 ? "r" : "h"); + } + // create inline asm + InlineAsm *iasm = InlineAsm::get(fn_ty, asm_str, constraint, true); + // call asm + std::vector args = {pred, ptr}; + for(int v = 0; v < vector_size; v++) + args.push_back(builder_->CreateExtractElement(elt, builder_->getInt32(v))); + builder_->CreateCall(iasm, args); + } }); } @@ -504,23 +554,27 @@ void generator::visit_atomic_cas_inst(ir::atomic_cas_inst* cas) { Value *pred = builder_->CreateICmpEQ(tid, builder_->getInt32(0)); BasicBlock *tid_0_bb = BasicBlock::Create(*ctx_, "tid_0", current->getParent()); BasicBlock *tid_0_done_bb = BasicBlock::Create(*ctx_, "tid_0_done", current->getParent()); - Value *ptr = builder_->CreateGEP(sh_mem_ptr_, builder_->getInt32(alloc_->offset(layouts_->get(cas)))); - ptr = builder_->CreateBitCast(ptr, PointerType::get(builder_->getInt32Ty(), ptr->getType()->getPointerAddressSpace())); - tgt_->add_memfence(module, *builder_); tgt_->add_barrier(module, *builder_); + tgt_->add_memfence(module, *builder_); builder_->CreateCondBr(pred, tid_0_bb, tid_0_done_bb); builder_->SetInsertPoint(tid_0_bb); Value *cas_ptr = vmap_.at(cas->get_operand(0)); Value *cas_cmp = vmap_.at(cas->get_operand(1)); Value *cas_val = vmap_.at(cas->get_operand(2)); - Value *old = builder_->CreateAtomicCmpXchg(cas_ptr, cas_cmp, cas_val, AtomicOrdering::Monotonic, AtomicOrdering::Monotonic); + Value *old = builder_->CreateAtomicCmpXchg(cas_ptr, cas_cmp, cas_val, + AtomicOrdering::Monotonic, + AtomicOrdering::Monotonic); old = builder_->CreateExtractValue(old, {0}); - builder_->CreateStore(old, ptr); + Value *atom_ptr; + atom_ptr = builder_->CreateGEP(sh_mem_ptr_, builder_->getInt32(alloc_->offset(layouts_->get(layouts_->tmp(cas))))); + atom_ptr = builder_->CreateBitCast(atom_ptr, PointerType::get(old->getType(), 3)); + + builder_->CreateStore(old, atom_ptr); builder_->CreateBr(tid_0_done_bb); builder_->SetInsertPoint(tid_0_done_bb); tgt_->add_memfence(module, *builder_); tgt_->add_barrier(module, *builder_); - vmap_[cas] = builder_->CreateLoad(ptr); + vmap_[cas] = builder_->CreateLoad(atom_ptr); } void generator::visit_atomic_exch_inst(ir::atomic_exch_inst* xchg) { @@ -533,14 +587,14 @@ void generator::visit_atomic_exch_inst(ir::atomic_exch_inst* xchg) { BasicBlock *tid_0_bb = BasicBlock::Create(*ctx_, "tid_0", current->getParent()); BasicBlock *tid_0_done_bb = BasicBlock::Create(*ctx_, "tid_0_done", current->getParent()); tgt_->add_memfence(module, *builder_); - tgt_->add_barrier(module, *builder_); builder_->CreateCondBr(pred, tid_0_bb, tid_0_done_bb); builder_->SetInsertPoint(tid_0_bb); - vmap_[xchg] = builder_->CreateAtomicRMW(AtomicRMWInst::Xchg, rmw_ptr, rmw_val, AtomicOrdering::Monotonic, SyncScope::System); + builder_->CreateAtomicRMW(AtomicRMWInst::Xchg, rmw_ptr, rmw_val, + AtomicOrdering::Monotonic, + SyncScope::System); builder_->CreateBr(tid_0_done_bb); builder_->SetInsertPoint(tid_0_done_bb); tgt_->add_memfence(module, *builder_); - tgt_->add_barrier(module, *builder_); } void generator::visit_atomic_add_inst(ir::atomic_add_inst*) { @@ -861,6 +915,115 @@ void generator::visit_select_inst(ir::select_inst* select) { } +void generator::visit_recoalesce_inst(ir::recoalesce_inst* rc) { + ir::value *op = rc->get_operand(0); + ir::tile_type::tile_shapes_t shape = rc->get_type()->get_tile_shapes(); + size_t rank = shape.size(); + // temporary layout + shared_tile *tmp = (shared_tile*)machine_layouts_.at(layouts_->get(layouts_->tmp(rc))) + ->create(rc); + // 
pointer to temporary shared memory + Type *ty = llvm_type(rc->get_type()->get_scalar_ty(), *ctx_); + // layouts + const analysis::layout_t* in_layout = layouts_->get(op); + const analysis::layout_t* out_layout = layouts_->get(rc); + // machine tiles + distributed_tile *in_dt = (distributed_tile*)(tmap_.at(op)); + distributed_tile *out_dt = (distributed_tile*)(tmap_.at(rc)); + // WMMA configuration + long wmma_pt[3] = { 2, 4, 1}; + long wmma[3] = { 8*in_layout->wpt[0]*in_layout->fpw[0], + 8*in_layout->wpt[1]*in_layout->fpw[1], + 1}; + // Work per thread for input layout + long in_pt[3] = { shape[0] / wmma[0], + shape[1] / wmma[1], + 1 }; + // Work per thread for output layout + long out_pt[3] = { shape[0] / out_layout->mts[0], + shape[1] / out_layout->mts[1], + 1 }; + if(rank > 2){ + wmma[2] = in_layout->wpt[2]*in_layout->fpw[2]; + in_pt[2] = shape[2] / wmma[2]; + out_pt[2] = shape[2] / out_layout->mts[2]; + } + // Orders + auto ord = out_layout->order; + if(ord.size() < 3) + ord.push_back(2); + // pointer lanes + std::vector> ptrs; + for(int in_zz = 0; in_zz < wmma_pt[ord[2]]; in_zz++) { + std::vector current; + for(int in_cc = 0; in_cc < wmma_pt[ord[1]]; in_cc++) { + Value *base; + base = builder_->CreateGEP(sh_mem_ptr_, builder_->getInt32(alloc_->offset(layouts_->get(layouts_->tmp(rc))))); + base = builder_->CreateBitCast(base, PointerType::get(ty, 3)); + + // shared memory stride + Value *stride_0 = builder_->getInt32(tmp->get_shapes()[ord[0]]); + // indices + Value *idx_cc = axes_.at(a_axes_->get(op, ord[1])).values[in_cc]; + // offset + Value *off = builder_->CreateMul(stride_0, idx_cc); + if(rank > 2){ + Value *stride_1 = builder_->CreateMul(stride_0, + builder_->getInt32(tmp->get_shapes()[ord[1]])); + Value *idx_zz = axes_.at(a_axes_->get(op, ord[2])).values[in_zz]; + off = builder_->CreateAdd(off, builder_->CreateMul(stride_1, idx_zz)); + } + current.push_back(builder_->CreateGEP(base, off)); + } + ptrs.push_back(current); + } + // Re-coalesce loops + for(int in_z = 0; in_z < in_pt[ord[2]]; in_z++) + for(int in_c = 0; in_c < in_pt[ord[1]]; in_c++){ + // write to shared + tgt_->add_barrier(mod_, *builder_); + for(int in_zz = 0; in_zz < wmma_pt[ord[2]]; in_zz++) + for(int in_cc = 0; in_cc < wmma_pt[ord[1]]; in_cc++){ + std::vector starts(rank), len(rank); + starts[ord[0]] = 0; + starts[ord[1]] = in_c*wmma_pt[ord[1]] + in_cc; + len[ord[0]] = wmma_pt[ord[0]]*in_pt[ord[0]]; + len[ord[1]] = 1; + if(rank > 2){ + starts[ord[2]] = in_z*wmma_pt[ord[2]] + in_zz; + len[ord[2]] = 1; + } + in_dt->for_each([&](indices_t idx){ + Value *write_ptr = builder_->CreateGEP(ptrs[in_zz][in_cc], idx[ord[0]]); + builder_->CreateStore(in_dt->get_value(idx), write_ptr); + }, starts, len); + } + tgt_->add_barrier(mod_, *builder_); + // load from shared + for(int out_zz = 0; out_zz < out_pt[ord[2]] / in_pt[ord[2]]; out_zz++) + for(int out_cc = 0; out_cc < out_pt[ord[1]] / in_pt[ord[1]]; out_cc++){ + std::vector starts(rank), len(rank); + starts[ord[0]] = 0; + starts[ord[1]] = in_c*(out_pt[ord[1]] / in_pt[ord[1]]) + out_cc; + len[ord[0]] = out_pt[ord[0]]; + len[ord[1]] = 1; + if(rank > 2){ + starts[ord[2]] = in_z*(out_pt[ord[2]] / in_pt[ord[2]]) + out_zz; + len[ord[2]] = 1; + } + out_dt->for_each([&](indices_t idx){ + indices_t read_idx(rank); + read_idx[ord[0]] = idx[ord[0]]; + read_idx[ord[1]] = axes_.at(a_axes_->get(rc, ord[1])).values[out_cc]; + if(rank > 2) + read_idx[ord[2]] = axes_.at(a_axes_->get(rc, ord[2])).values[out_zz]; + out_dt->set_value(idx, tmp->get_value(read_idx)); + }, starts, len); + } + } + 
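// (sketch of intent) the loops above stage HMMA fragments through the shared
// scratch tile and read them back in the coalesced output layout; this final
// barrier keeps any thread from racing ahead while others still read the tile.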
tgt_->add_barrier(mod_, *builder_); +} + void generator::visit_copy_to_shared_inst(ir::copy_to_shared_inst* cts) { unsigned vector_size = 1; auto x_order = layouts_->get(cts)->order; @@ -1126,16 +1289,14 @@ void generator::visit(ir::module &src, llvm::Module &dst) { if(tgt_->is_gpu()) if(unsigned alloc_size = alloc_->allocated_size()){ Type *int_8_ty = Type::getInt8Ty(*ctx_); - ArrayType *array_ty = ArrayType::get(int_8_ty, alloc_size); + Type *int_32_ty = Type::getInt32Ty(*ctx_); + ArrayType *array_ty = ArrayType::get(int_32_ty, alloc_size/4); Type *ptr_ty = PointerType::get(int_8_ty, 3); GlobalVariable *sh_mem_array = new GlobalVariable(*mod_, array_ty, false, GlobalVariable::ExternalLinkage, nullptr, "__shared_ptr", nullptr, GlobalVariable::NotThreadLocal, 3); sh_mem_ptr_ = builder_->CreateBitCast(sh_mem_array, ptr_ty); } - // allocate constant memory - for(ir::alloc_const *x: src.allocs()) - visit_alloc_const(x); // visit functions for(ir::function *fn: src.get_function_list()) visit_function(fn); diff --git a/lib/codegen/selection/machine_layout.cc b/lib/codegen/selection/machine_layout.cc index 1c026bfc8..2d02e7b1f 100644 --- a/lib/codegen/selection/machine_layout.cc +++ b/lib/codegen/selection/machine_layout.cc @@ -143,7 +143,7 @@ tile *machine_layout_distributed_t::create(ir::value *v) { axes[d].values = {builder_->getInt32(0)}; } } - return new distributed_tile(ty, shapes, layout_->order, axes, *builder_, false); + return new distributed_tile(ty, shapes, layout_->order, axes, *builder_); } machine_layout_hmma_884_t::machine_layout_hmma_884_t(Module *mod, Builder *builder, diff --git a/lib/codegen/selection/machine_value.cc b/lib/codegen/selection/machine_value.cc index a7cd73a8e..72aace4b2 100644 --- a/lib/codegen/selection/machine_value.cc +++ b/lib/codegen/selection/machine_value.cc @@ -45,9 +45,8 @@ llvm::Type *distributed_tile::make_vector_ty(llvm::Type *ty, size_t vector_size) return VectorType::get(ty, vector_size); } -distributed_tile::distributed_tile(Type *ty, const shapes_t &shapes, const std::vector& order, const axes_t &axes, llvm::IRBuilder<> &builder, bool vectorize) - : tile(make_vector_ty(ty, vectorize?axes[0].contiguous:1), shapes), axes_(axes), order_(order), builder_(builder) { - vector_size_ = vectorize?ty_->getVectorNumElements():1; +distributed_tile::distributed_tile(Type *ty, const shapes_t &shapes, const std::vector& order, const axes_t &axes, llvm::IRBuilder<> &builder) + : tile(ty, shapes), axes_(axes), order_(order), builder_(builder) { init_indices(); } @@ -73,13 +72,31 @@ indices_t distributed_tile::get_ordered_indices(unsigned id) { } -void distributed_tile::for_each(std::function fn) { - for(unsigned i = 0; i < ordered_indices_.size(); i++){ - if(i % vector_size_ == 0) - fn(ordered_indices_[i]); +void distributed_tile::for_each(std::function fn, int start, int end) { + if(end < 0) + end = ordered_indices_.size() + end + 1; + for(unsigned i = start; i < end; i++) + fn(ordered_indices_[i]); +} + +void distributed_tile::for_each(std::function fn, std::vector starts, std::vector sizes){ + int rank = sizes.size(); + int len = 1; + for(int s: sizes) + len *= s; + + for(int i = 0; i < len; i++){ + indices_t idx(rank); + int current = i; + for(int k = 0; k < rank; k++){ + idx[k] = axes_[k].values.at(starts[k] + current % sizes[k]); + current = current / sizes[k]; + } + fn(idx); } } + /* Shared Tile */ void shared_tile::extract_constant(Value *arg, Value *&non_cst, Value *&cst) { BinaryOperator *bin_op = dyn_cast(arg); @@ -126,7 +143,9 @@ void 
shared_tile::extract_constant(const indices_t &arg_idx, indices_t &non_cst_ } -Value* shared_tile::shared_offset(llvm::IRBuilder<> &builder, const shapes_t& shapes, const std::vector& perm, const std::vector& order, indices_t idx) { +Value* shared_tile::shared_offset(llvm::IRBuilder<> &builder, const shapes_t& shapes, + const std::vector& perm, const std::vector& order, + indices_t idx) { // strides std::vector strides(order.size()); strides[order[0]] = builder.getInt32(1); diff --git a/lib/codegen/transform/coalesce.cc b/lib/codegen/transform/coalesce.cc index 764e2138a..78c03396f 100644 --- a/lib/codegen/transform/coalesce.cc +++ b/lib/codegen/transform/coalesce.cc @@ -1,6 +1,8 @@ #include +#include #include "triton/ir/utils.h" #include "triton/ir/instructions.h" +#include "triton/ir/function.h" #include "triton/ir/module.h" #include "triton/codegen/transform/coalesce.h" #include "triton/codegen/analysis/align.h" @@ -60,8 +62,43 @@ ir::value* coalesce::rematerialize(ir::value *x, ir::builder &builder, } void coalesce::run(ir::module &mod) { - // find values to rematerialize size_t num_groups = layout_->num_layouts(); + + for(size_t id = 0; id < num_groups; id++) { + if(layout_->get(id)->type != analysis::HMMA_884) + continue; + // extract memory stores + const auto& values = layout_->values_of(id); + ir::value* dot = nullptr; + for(ir::value *v: values) + if(auto x = dynamic_cast(v)) + dot = x; + + ir::builder& builder = mod.get_builder(); + std::vector worklist = {dot}; + std::set seen; + while(!worklist.empty()) { + ir::value *current = worklist.back(); + seen.insert(current); + worklist.pop_back(); + // stop if trunc + if(auto x = dynamic_cast(current)){ + builder.set_insert_point_after(x); + ir::recoalesce_inst* rc = ir::recoalesce_inst::create(x); + builder.insert(rc); + x->replace_all_uses_with(rc); + rc->replace_uses_of_with(rc, x); + break; + } + // recurse + for(ir::user *u: current->get_users()) + if(seen.find(u) == seen.end()) + worklist.push_back(u); + } + } + + + // find values to rematerialize std::vector remat; for(size_t id = 0; id < num_groups; id++) { const auto& values = layout_->values_of(id); @@ -71,8 +108,10 @@ void coalesce::run(ir::module &mod) { extract_io_use(v, io); // extract leading axes std::map> axes; - for(ir::io_inst *i: io) - extract_ld(i, axes); + for(ir::io_inst *i: io){ + if(i->get_pointer_operand()->get_type()->get_tile_ranks1() == layout_->get(id)->axes.size()) + extract_ld(i, axes); + } // update list of values to rematerialize if(axes.empty()) continue; diff --git a/lib/codegen/transform/cts.cc b/lib/codegen/transform/cts.cc index 47a1e13a8..f98b685e1 100644 --- a/lib/codegen/transform/cts.cc +++ b/lib/codegen/transform/cts.cc @@ -1,21 +1,37 @@ #include "triton/codegen/transform/cts.h" -#include "triton/codegen/instructions.h" #include "triton/ir/module.h" #include "triton/ir/function.h" #include "triton/ir/basic_block.h" #include "triton/ir/instructions.h" +#include namespace triton { namespace codegen{ namespace transform{ -inline bool is_shared(ir::value *v) { - auto *i = dynamic_cast(v); + +inline bool is_shmem_op(ir::instruction* i, int op) { + if(i->get_id() == ir::INST_DOT) + return op==0 || op==1; + if(i->get_id() == ir::INST_COPY_FROM_SHARED) + return op==0; + return false; +} + +inline bool is_shmem_res(ir::value* v){ + ir::instruction* i = dynamic_cast(v); if(!i) return false; - return storage_info.at(i->get_id()).first == codegen::SHARED; + if(i->get_id() == ir::INST_TRANS) + return true; + if(i->get_id() == ir::INST_REDUCE) + return 
true; + if(i->get_id() == ir::INST_COPY_TO_SHARED) + return true; + return false; } + // run pass on module void add_copy(ir::instruction *parent, ir::value *x, ir::builder &builder, bool to_shared) { auto *i = dynamic_cast(x); @@ -36,9 +52,8 @@ void add_copy(ir::instruction *parent, ir::value *x, ir::builder &builder, bool add_copy(phi, phi->get_incoming_value(i), builder, to_shared); return; } - ir::value_id_t id = i->get_id(); // already in shared memory - if(to_shared && storage_info.at(id).first == SHARED) + if(to_shared && is_shmem_res(i)) return; // copy builder.set_insert_point_after(i); @@ -53,18 +68,19 @@ void add_copy(ir::instruction *parent, ir::value *x, ir::builder &builder, bool void cts::run(ir::module &mod) { // Add shared copies ir::builder &builder = mod.get_builder(); - for(ir::function *fn: mod.get_function_list()){ - for(ir::basic_block *block: fn->blocks()) - for(ir::instruction *i: block->get_inst_list()){ - auto storage = storage_info.at(i->get_id()); + for(ir::function* fn: mod.get_function_list()){ + for(ir::basic_block* block: fn->blocks()) + for(ir::instruction* i: block->get_inst_list()){ + size_t num_op = i->get_num_operands(); // copy to shared operands - for(size_t k = 0; k < storage.second.size(); k++) - if(storage.second[k] == SHARED) + for(size_t k = 0; k < num_op; k++) + if(is_shmem_op(i, k)) add_copy(i, i->get_operand(k), builder, true); // copy from shared operands - for(size_t k = 0; k < storage.second.size(); k++) - if(storage.second[k] == DISTRIBUTED && - is_shared(i->get_operand(k))){ + for(size_t k = 0; k < num_op; k++) + if(!dynamic_cast(i) && + !is_shmem_op(i,k) && + is_shmem_res(i->get_operand(k))){ add_copy(i, i->get_operand(k), builder, false); } } diff --git a/lib/codegen/transform/membar.cc b/lib/codegen/transform/membar.cc index 44316d504..8cb48f7df 100644 --- a/lib/codegen/transform/membar.cc +++ b/lib/codegen/transform/membar.cc @@ -3,7 +3,6 @@ #include #include "triton/codegen/analysis/layout.h" #include "triton/codegen/analysis/allocation.h" -#include "triton/codegen/instructions.h" #include "triton/codegen/transform/membar.h" #include "triton/ir/module.h" #include "triton/ir/function.h" diff --git a/lib/driver/module.cc b/lib/driver/module.cc index ddeb20bfc..28940f563 100755 --- a/lib/driver/module.cc +++ b/lib/driver/module.cc @@ -180,6 +180,11 @@ host_module::host_module(driver::context * context, std::unique_ptrengine = builder.create(); } +std::unique_ptr host_module::symbol(const char *name) const { + throw std::runtime_error("not implemented"); +} + + /* ------------------------ */ // OpenCL // /* ------------------------ */ @@ -211,10 +216,21 @@ ocl_module::ocl_module(driver::context * context, std::unique_ptr // } } +std::unique_ptr ocl_module::symbol(const char *name) const { + throw std::runtime_error("not implemented"); +} /* ------------------------ */ // CUDA // /* ------------------------ */ +static bool find_and_replace(std::string& str, const std::string& begin, const std::string& end, const std::string& target){ + size_t start_replace = str.find(begin); + size_t end_replace = str.find(end, start_replace); + if(start_replace == std::string::npos) + return false; + str.replace(start_replace, end_replace + 1 - start_replace, target); + return true; +} std::string cu_module::compile_llvm_module(std::unique_ptr module, driver::device* device) { // options @@ -231,19 +247,17 @@ std::string cu_module::compile_llvm_module(std::unique_ptr module, llvm::SmallVector buffer; module::compile_llvm_module(std::move(module), 
"nvptx64-nvidia-cuda", sm, "", buffer, "ptx63", Assembly); std::string result(buffer.begin(), buffer.end()); - size_t start_replace = result.find(".version"); - size_t end_replace = result.find('\n', start_replace); - assert(start_replace != std::string::npos); - result.replace(start_replace, end_replace - start_replace, ".version 6.4"); + find_and_replace(result, ".version", "\n", ".version 6.4\n"); + while(find_and_replace(result, "\t// begin inline asm", "\n", "")); + while(find_and_replace(result, "\t// end inline asm", "\n", "")); return result; } cu_module::cu_module(driver::context * context, std::unique_ptr ll_module): cu_module(context, compile_llvm_module(std::move(ll_module), context->device())) { } cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){ -// exit(EXIT_FAILURE); -// std::cout << source << std::endl; cu_context::context_switcher ctx(*context); +// std::cout << source << std::endl; // JIT compile source-code CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; unsigned int errbufsize = 8096; @@ -260,11 +274,12 @@ cu_module::cu_module(driver::context * context, std::string const & source) : mo } } -cu_buffer* cu_module::symbol(const char *name) const{ +std::unique_ptr cu_module::symbol(const char *name) const{ CUdeviceptr handle; size_t size; dispatch::cuModuleGetGlobal_v2(&handle, &size, *cu_, name); - return new cu_buffer(ctx_, size, handle, false); + std::unique_ptr res(new cu_buffer(ctx_, size, handle, false)); + return std::move(res); } diff --git a/lib/ir/builder.cc b/lib/ir/builder.cc index a2ea9d30b..b1e417e5f 100644 --- a/lib/ir/builder.cc +++ b/lib/ir/builder.cc @@ -48,6 +48,9 @@ value *builder::get_int32(unsigned val) { return constant_int::get(type::get_int32_ty(ctx_), val); } +type *builder::get_void_ty() +{ return type::get_void_ty(ctx_); } + type *builder::get_int1_ty() { return type::get_int1_ty(ctx_); } @@ -132,19 +135,12 @@ phi_node* builder::create_phi(type *ty, unsigned num_reserved, const std::string return insert(binary_operator::create(OPCODE, lhs, rhs), name);\ } -#define DEFINE_UNARY_FLOAT(SUFFIX)\ - value *builder::create_ ## SUFFIX(value *arg, const std::string &name){\ - return insert(binary_operator::create_ ## SUFFIX(arg), name);\ - } - // Binary DEFINE_BINARY_FLOAT(fmul, binary_op_t::FMul) DEFINE_BINARY_FLOAT(fdiv, binary_op_t::FDiv) DEFINE_BINARY_FLOAT(frem, binary_op_t::FRem) DEFINE_BINARY_FLOAT(fadd, binary_op_t::FAdd) DEFINE_BINARY_FLOAT(fsub, binary_op_t::FSub) -// Unary -DEFINE_UNARY_FLOAT(fneg) //===----------------------------------------------------------------------===// @@ -171,10 +167,7 @@ value* builder::create_insert_nuwnswb_binop(binary_op_t op, value *lhs, return create_insert_nuwnswb_binop(OPCODE, lhs, rhs, name, false, false);\ } -#define DEFINE_UNARY_INT(SUFFIX)\ - value *builder::create_ ## SUFFIX(value *arg, const std::string &name){\ - return insert(binary_operator::create_ ## SUFFIX(arg), name);\ - } + // Binary DEFINE_NOWRAP_BINARY(mul, binary_op_t::Mul) @@ -190,9 +183,6 @@ DEFINE_BINARY_INT(urem, binary_op_t::URem) DEFINE_BINARY_INT(and, binary_op_t::And) DEFINE_BINARY_INT(or, binary_op_t::Or) DEFINE_BINARY_INT(xor, binary_op_t::Xor) -// Unary -DEFINE_UNARY_INT(neg) -DEFINE_UNARY_INT(not) //===----------------------------------------------------------------------===// diff --git a/lib/ir/instructions.cc b/lib/ir/instructions.cc index 0be815a51..930c4a116 100644 --- a/lib/ir/instructions.cc +++ 
b/lib/ir/instructions.cc @@ -138,23 +138,23 @@ binary_operator *binary_operator::create(binary_op_t op, value *lhs, value *rhs, return new binary_operator(op, lhs, rhs, lhs->get_type(), name, next); } -binary_operator *binary_operator::create_fneg(value *arg, const std::string &name, instruction *next){ - assert(arg->get_type()->get_scalar_ty()->is_floating_point_ty()); - value *zero = constant_fp::get_zero_value_for_negation(arg->get_type()); - return binary_operator::create(binary_op_t::FSub, zero, arg, name, next); -} +//binary_operator *binary_operator::create_fneg(value *arg, const std::string &name, instruction *next){ +// assert(arg->get_type()->get_scalar_ty()->is_floating_point_ty()); +// value *zero = constant_fp::get_zero_value_for_negation(arg->get_type()); +// return binary_operator::create(binary_op_t::FSub, zero, arg, name, next); +//} -binary_operator *binary_operator::create_neg(value *arg, const std::string &name, instruction *next){ - assert(arg->get_type()->get_scalar_ty()->is_integer_ty()); - value *zero = constant_fp::get_zero_value_for_negation(arg->get_type()); - return binary_operator::create(binary_op_t::Sub, zero, arg, name, next); -} +//binary_operator *binary_operator::create_neg(value *arg, const std::string &name, instruction *next){ +// assert(arg->get_type()->get_scalar_ty()->is_integer_ty()); +// value *zero = constant_fp::get_zero_value_for_negation(arg->get_type()->get_scalar_ty()); +// return binary_operator::create(binary_op_t::Sub, zero, arg, name, next); +//} -binary_operator *binary_operator::create_not(value *arg, const std::string &name, instruction *next){ - assert(arg->get_type()->is_integer_ty()); - constant *mask = constant::get_all_ones_value(arg->get_type()); - return binary_operator::create(binary_op_t::Xor, arg, mask, name, next); -} +//binary_operator *binary_operator::create_not(value *arg, const std::string &name, instruction *next){ +// assert(arg->get_type()->is_integer_ty()); +// constant *mask = constant::get_all_ones_value(arg->get_type()); +// return binary_operator::create(binary_op_t::Xor, arg, mask, name, next); +//} //===----------------------------------------------------------------------===// // cmp_inst classes @@ -762,6 +762,12 @@ copy_from_shared_inst* copy_from_shared_inst::create(value *arg, const std::stri return new copy_from_shared_inst(arg->get_type(), INST_COPY_FROM_SHARED, arg, name, next); } +// recoalesce +recoalesce_inst* recoalesce_inst::create(value *arg, const std::string &name, instruction *next) { + return new recoalesce_inst(arg->get_type(), INST_RECOALESCE, arg, name, next); +} + + // barrier barrier_inst::barrier_inst(context &ctx, const std::string &name, diff --git a/lib/lang/code_gen.cc b/lib/lang/code_gen.cc index d13f68856..8bbf39081 100644 --- a/lib/lang/code_gen.cc +++ b/lib/lang/code_gen.cc @@ -57,7 +57,10 @@ void Generator::VisitBinaryOp(BinaryOp* binary) { } case Token::MASKED_DEREF: { ir::type* ret_ty = GenIRType(binary->Type(), *ctx_); - return set_ret(bld_->create_masked_load(rhs, lhs, ir::undef_value::get(ret_ty))); + ir::value* false_value = ir::undef_value::get(ret_ty->get_scalar_ty()); + if(ret_ty->is_tile_ty()) + false_value = bld_->create_splat(false_value, ret_ty->get_tile_shapes()); + return set_ret(bld_->create_masked_load(rhs, lhs, false_value)); } case Token::ELLIPSIS: { auto clhs = dynamic_cast(lhs); @@ -76,7 +79,7 @@ void Generator::VisitBinaryOp(BinaryOp* binary) { return set_ret(bld_->create_add(lhs, rhs)); case '-': if(binary->lhs_->Type()->ToPointer()) - return 
set_ret(bld_->create_gep(lhs, {bld_->create_neg(rhs)})); + return set_ret(bld_->create_gep(lhs, {GenUnaryMinus(rhs)})); else if(flt) return set_ret(bld_->create_fsub(lhs, rhs)); else @@ -147,7 +150,7 @@ void Generator::VisitBinaryOp(BinaryOp* binary) { if(flt) return set_ret(bld_->create_fcmpONE(lhs, rhs)); else - return set_ret(bld_->create_icmpEQ(lhs, rhs)); + return set_ret(bld_->create_icmpNE(lhs, rhs)); default: error_not_implemented(); } @@ -166,6 +169,16 @@ ir::reduce_inst::op_t reduce_op(int tag, bool is_float) { should_not_happen(); return reduce_inst::op_t(); } + +ir::value* Generator::GenUnaryMinus(ir::value* arg) { + ir::type *ty = arg->get_type(); + ir::type *sca_ty = ty->get_scalar_ty(); + ir::value *_0 = ir::constant_fp::get_zero_value_for_negation(sca_ty); + if(ty->is_tile_ty()) + _0 = bld_->create_splat(_0, ty->get_tile_shapes()); + return bld_->create_sub(_0, arg); +} + void Generator::VisitUnaryOp(UnaryOp* unary) { // recursion Visit(unary->operand_); @@ -174,17 +187,17 @@ void Generator::VisitUnaryOp(UnaryOp* unary) { ir::type *arg_scal_ty = arg_ty->get_scalar_ty(); // return switch (unary->op_) { - case Token::PREFIX_INC: return error_not_implemented(); - case Token::PREFIX_DEC: return error_not_implemented(); + case Token::PREFIX_INC: return error_not_implemented(); + case Token::PREFIX_DEC: return error_not_implemented(); case Token::POSTFIX_INC: return error_not_implemented(); case Token::POSTFIX_DEC: return error_not_implemented(); - case Token::ADDR: return error_not_implemented(); - case Token::DEREF: return set_ret(bld_->create_load(arg)); - case Token::PLUS: return error_not_implemented(); - case Token::MINUS: return error_not_implemented(); - case '~': return set_ret(bld_->create_neg(arg)); - case '!': return set_ret(bld_->create_not(arg)); - case Token::CAST: return set_ret(GenCastOp(arg, GenIRType(unary->Type(), *ctx_))); + case Token::ADDR: return error_not_implemented(); + case Token::DEREF: return set_ret(bld_->create_load(arg)); + case Token::PLUS: return error_not_implemented(); + case Token::MINUS: return set_ret(GenUnaryMinus(arg)); + case '~': return error_not_implemented(); + case '!': return error_not_implemented(); + case Token::CAST: return set_ret(GenCastOp(arg, GenIRType(unary->Type(), *ctx_))); case Token::REDUCE: { int ax, tag; UnaryOp::decodeRed(unary->info_, ax, tag); @@ -232,11 +245,54 @@ void Generator::VisitFuncCall(FuncCall* funcCall) { else return should_not_happen(); } + if(name == "get_num_programs"){ + VisitExpr(funcCall->Args()->at(0)); + ir::value* ret = ret_; + if(auto axis = dynamic_cast(ret)) + return set_ret(bld_->create_get_num_program(axis->get_value())); + else + return should_not_happen(); + } + if(name == "atomic_cas"){ + VisitExpr(funcCall->Args()->at(0)); + ir::value* ptr = ret_; + VisitExpr(funcCall->Args()->at(1)); + ir::value* cmp = ret_; + VisitExpr(funcCall->Args()->at(2)); + ir::value* val = ret_; + return set_ret(bld_->create_atomic_cas(ptr, cmp, val)); + } + if(name == "atomic_xchg"){ + VisitExpr(funcCall->Args()->at(0)); + ir::value* ptr = ret_; + VisitExpr(funcCall->Args()->at(1)); + ir::value* val = ret_; + return set_ret(bld_->create_atomic_exch(ptr, val)); + } if(name == "sqrtf"){ VisitExpr(funcCall->Args()->at(0)); ir::value* ret = ret_; return set_ret(bld_->create_sqrt(ret)); } + if(name == "calloc"){ + VisitExpr(funcCall->Args()->at(0)); + ir::value* ret = ret_; + ir::constant_int *size = dynamic_cast(ret); + assert(size); + ir::alloc_const* alloc = new ir::alloc_const(bld_->get_int8_ty(), size); + 
mod_->add_alloc(alloc); + return set_ret(alloc); + } + //TODO: integrate this into conditionalop + if(name == "select"){ + VisitExpr(funcCall->Args()->at(0)); + ir::value* cond = ret_; + VisitExpr(funcCall->Args()->at(1)); + ir::value* true_val = ret_; + VisitExpr(funcCall->Args()->at(2)); + ir::value* false_val = ret_; + return set_ret(bld_->create_select(cond, true_val, false_val)); + } return error_not_implemented(); } @@ -350,12 +406,15 @@ void Generator::VisitForStmt(ForStmt *forStmt) { ir::value *cond = ret_; return bld_->create_cond_br(cond, loop_bb, next_bb); }); - VisitStmt(init_); - VisitExpr(cond_); - ir::value *cond = ret_; - bld_->create_cond_br(cond, loop_bb, next_bb); + if(init_) + VisitStmt(init_); +// VisitExpr(cond_); +// ir::value *cond = ret_; +// bld_->create_cond_br(cond, loop_bb, next_bb); + bld_->create_br(loop_bb); bld_->set_insert_point(loop_bb); - VisitStmt(body_); + if(body_) + VisitStmt(body_); if(!is_terminator(ret_)) mod_->get_continue_fn()(); ir::basic_block *stop_bb = bld_->get_insert_block(); @@ -512,6 +571,8 @@ ir::value* Generator::GenNumcastOp(ir::value*src, ir::type* dst_ty) { else if(src_scalar_ty->is_integer_ty() && dst_scalar_ty->is_integer_ty() && src_scalar_ty->get_integer_bitwidth()) return bld_->create_int_cast(src, dst_ty, dst_signed); + else if(src_scalar_ty->is_pointer_ty() && dst_scalar_ty->is_pointer_ty()) + return bld_->create_cast(ir::BitCast, src, dst_ty); else{ should_not_happen(); return nullptr; @@ -611,6 +672,8 @@ ir::type* Generator::GenIRFuncType(FuncType* type, ir::context& ctx) { ir::type* Generator::GenIRPointerType(PointerType* type, ir::context& ctx) { ir::type* ele_ty = GenIRType(type->Derived().GetPtr(), ctx); unsigned addr_space = 1; + if(type->Derived().IsConstantQualified()) + addr_space = 4; return ir::pointer_type::get(ele_ty, addr_space); } diff --git a/lib/lang/parser.cc b/lib/lang/parser.cc index 960c983cf..ae37d9567 100644 --- a/lib/lang/parser.cc +++ b/lib/lang/parser.cc @@ -1083,14 +1083,12 @@ QualType Parser::ParseDeclSpec(int* storageSpec, int* funcSpec, int* alignSpec) *storageSpec |= S_THREAD; break; - case Token::AUTO: - EnsureAndSetStorageSpec(tok, storageSpec, S_AUTO); - break; // Type qualifier case Token::CONST: qualSpec |= Qualifier::CONST; break; case Token::RESTRICT: qualSpec |= Qualifier::RESTRICT; break; case Token::VOLATILE: qualSpec |= Qualifier::VOLATILE; break; + case Token::CMEM: qualSpec |= Qualifier::CMEM; break; // Type specifier case Token::SIGNED: @@ -1551,6 +1549,7 @@ int Parser::ParseQual() { case Token::CONST: qualSpec |= Qualifier::CONST; break; case Token::RESTRICT: qualSpec |= Qualifier::RESTRICT; break; case Token::VOLATILE: qualSpec |= Qualifier::VOLATILE; break; + case Token::CMEM: qualSpec |= Qualifier::CMEM; break; case Token::ATOMIC: Error(tok, "do not support 'atomic'"); break; default: ts_.PutBack(); return qualSpec; } @@ -1769,6 +1768,7 @@ QualType Parser::ParseArrayFuncDeclarator(const Token* ident, QualType base) { if (!base->Complete()) { Error(ident, "'%s' has incomplete element type", ident->str_.c_str()); } + // return a pointer for tiles in constant memory: return TileType::New(shape, base); } else if (ts_.Try('(')) { // Function declaration diff --git a/lib/lang/token.cc b/lib/lang/token.cc index 8b61aa098..c4a95c0c4 100644 --- a/lib/lang/token.cc +++ b/lib/lang/token.cc @@ -7,6 +7,7 @@ static MemPoolImp tokenPool; const std::unordered_map Token::kwTypeMap_ { + { "__constant__", Token::CMEM }, { "__global__", Token::GLOBAL }, { "auto", Token::AUTO }, { "break", 
Token::BREAK }, diff --git a/lib/lang/type.cc b/lib/lang/type.cc index a1564ad97..13d09cf89 100644 --- a/lib/lang/type.cc +++ b/lib/lang/type.cc @@ -294,7 +294,8 @@ std::string ArithmType::Str() const { bool PointerType::Compatible(const Type& other) const { // C11 6.7.6.1 [2]: pointer compatibility auto otherPointer = other.ToPointer(); - return otherPointer && derived_->Compatible(*otherPointer->derived_); + return otherPointer && + derived_->Compatible(*otherPointer->derived_); // FIXME(wgtdkp): cannot loose compatible constraints //return other.IsInteger() || diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 7c6005a56..fe1d77b66 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -184,10 +184,20 @@ function::caller function::autotune(driver::stream* stream, const grid_fn_ty& gr // kernel uses too much resources if(!bin) return; + // copy constants + std::unique_ptr buffer; + for(ir::alloc_const* alloc: ir->allocs()){ + std::string name = alloc->get_name(); + auto it = cst_.find(name); + if(it == cst_.end()) + throw std::runtime_error("constant not set before execution"); + buffer = bin->symbol(name.c_str()); + stream->write(&*buffer, true, 0, it->second); + } // benchmark ir::function *tmp = ir->get_function_list()[0]; caller call(tmp, std::move(bin), opt); - double ts = tools::bench([&]() { call(stream, grid_fn(opt), args); }, stream); + double ts = tools::bench([&]() { call(stream, grid_fn(opt), args); }, stream, true); // save best if(ts < best_ts) { best_ts = ts; @@ -222,20 +232,14 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::generator isel(&axes, &layouts, &align, &allocation, target.get(), opt.num_warps); // run passes dce.run(module); -// ir::print(module, std::cout); - disassociate.run(module); - -// ir::print(module, std::cout); - dce.run(module); -// ir::print(module, std::cout); - peephole.run(module); dce.run(module); align.run(module); cts.run(module); axes.run(module); +// ir::print(module, std::cout); layouts.run(module); coalesce.run(module); dce.run(module); @@ -246,17 +250,19 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c dce.run(module); align.run(module); axes.run(module); +// ir::print(module, std::cout); layouts.run(module); liveness.run(module); allocation.run(module); if(allocation.allocated_size() > context->device()->max_shared_memory()) return std::unique_ptr(); barriers.run(module); -// std::cout << "isel" << std::endl; +// ir::print(module, std::cout); isel.visit(module, *llvm); // return binary std::unique_ptr res(driver::module::create(context, std::move(llvm))); // done +// exit(EXIT_FAILURE); return res; } @@ -273,8 +279,13 @@ R"( #define __aligned(A) __attribute__((aligned(A))) #define __multipleof(A) __attribute__((multipleof(A))) +extern int atomic_cas(int*, int, int); +extern int atomic_xchg(int*, int); extern int get_program_id(int); +extern int get_num_programs(int); extern float sqrtf(float); +extern int select(bool, int, int); +extern char __constant__ * calloc(int); )"; } @@ -316,5 +327,9 @@ void function::operator()(const std::vector& args, const grid_t& grid, driv return this->operator()(args, [&grid](const options_t&){ return grid; }, stream); } +void function::set_cst(const std::string& name, void* data, size_t n_bytes) { + cst_[name] = std::vector((char*)data, (char*)data + n_bytes); +} + } } diff --git a/python/examples/blocksparse.py b/python/examples/blocksparse.py deleted file mode 100644 index 7d15fc4f4..000000000 --- 
a/python/examples/blocksparse.py +++ /dev/null @@ -1,157 +0,0 @@ -import tensorflow as tf -import triton -import numpy as np - -src = ''' - #if AT == 1 - #define USE_A ^a - #define STRIDE_AK lda - #define STRIDE_AM 1 - #define BROADCAST_AK :, newaxis - #define BROADCAST_AM newaxis, : - #define SHAPE_A TK, TM - #else - #define USE_A a - #define STRIDE_AK 1 - #define STRIDE_AM lda - #define BROADCAST_AK newaxis, : - #define BROADCAST_AM :, newaxis - #define SHAPE_A TM, TK - #endif - - #if BT == 1 - #define USE_B ^b - #define STRIDE_BK 1 - #define STRIDE_BM ldb - #define BROADCAST_BN newaxis, : - #define BROADCAST_BK :, newaxis - #define SHAPE_B TN, TK - #else - #define USE_B b - #define STRIDE_BK ldb - #define STRIDE_BM 1 - #define BROADCAST_BN :, newaxis - #define BROADCAST_BK newaxis, : - #define SHAPE_B TK, TN - #endif - - void dot (TYPE* A __readonly __noalias __align(16), - TYPE* B __readonly __noalias __align(16), - TYPE* C __writeonly __noalias __align(16), - int lda, int ldb, int ldc, - int N, int* lut, - int* locks, int nlocks) { - int ridx = get_program_id(0); - float c[TM, TN] = 0; - int rka[TK] = 0 ... TK; - int rkb[TK] = 0 ... TK; - // load LUT header - int *header = lut + get_program_id(1) * 4; - int offset = *(header + 0); - int K = *(header + 1); - int column = *(header + 2); - int lockid = *(header + 3); - int *plut = lut + offset * 2; - int offx = ridx; - int offy = 0; - // compute x, y offsets - int rxa[TM] = offx * TM + (0 ... TM); - int ryb[TN] = offy * TN + (0 ... TN); - // bounds checking - bool checka[SHAPE_A] = (rxa < N)[:, newaxis]; - bool checkb[SHAPE_B] = 1; - // base offset - int offa[SHAPE_A] = rxa[BROADCAST_AM] * STRIDE_AM + rka[BROADCAST_AK] * STRIDE_AK; - int offb[SHAPE_B] = ryb[BROADCAST_BN] * STRIDE_BN + rkb[BROADCAST_BK] * STRIDE_BK; - for(int k = K; k > 0; k -= 1) { - // fetch block indices - int ak = *(plut + 0); - int bk = *(plut + 1); - lut += 2; - // compute pointers to blocks - TYPE* pa[SHAPE_A] = A + offa + ak * TK * lda; - TYPE* pb[SHAPE_B] = B + offb + bk * TK * TN; - // load blocks - TYPE a[SHAPE_A] = checka ? *pa : 0; - TYPE b[SHAPE_B] = *pb; - // multiply blocks - c += USE_A @ USE_B; - } - int rxc[TM] = ridx * TM + (0 ... TM); - int ryc[TN] = column * TN + (0 ... TN); - TYPE* pc[TM, TN] = C + rxc[:, newaxis] + ryc[newaxis, :]*ldc; - bool checkc[TM, TN] = (rxc < N)[:, newaxis]; - if(lockid == 0) { - *?(checkc) pc = c; - } - else { - int *plock = locks + ridx*nlocks + lockid - 1; - int *pcount = plock + get_num_program(0)*nlocks; - while(atomic_cas(plock, 0, 1)); - int count = *pcount; - if(count == 0) - *?(checkc) pc = c; - else - *?(checkc) pc = c + *pc; - atomic_exch(pcount, 1); - atomic_exch(plock, 0); - } - } -''' - -# std::string dot::triton_c_src_dw() const { -# bool AT = (op_ == WGRAD); -# bool BT = (op_ == FPROP); -# std::string usea = AT ? "trans(a)" : "a"; -# std::string useb = BT ? "trans(b)" : "b"; -# std::string sizea = AT ? "TK, TM" : "TM, TK"; -# std::string sizeb = BT ? "TN, TK" : "TK, TN"; -# std::string bca0 = AT ? "newaxis, :" : ":, newaxis"; -# std::string bca1 = AT ? ":, newaxis" : "newaxis, :"; -# std::string bcb0 = BT ? ":, newaxis" : "newaxis, :"; -# std::string bcb1 = BT ? "newaxis, :" : ":, newaxis"; -# std::string lda0 = AT ? "*lda" : ""; -# std::string lda1 = AT ? "" : "*lda"; -# std::string ldb0 = BT ? "" : "*ldb"; -# std::string ldb1 = BT ? 
"*ldb" : "" ; -# std::string result = -# R"( -# const tunable int TM = {)" + std::to_string(BS_) + R"(}; -# const tunable int TN = {)" + std::to_string(BS_) + R"(}; -# const tunable int TK = {32}; -# void bsdot(restrict read_only align(16) )" + ab_ty_ + R"( *A, -# restrict read_only align(16) )" + ab_ty_ + R"( *B, -# )" + c_ty_ + R"(* C, -# int lda, int ldb, int ldc, -# int N, int* lut, -# int* locks, int nlocks) { -# int ridx = get_range_id(0); -# float acc[TM, TN] = 0; -# int rka[TK] = 0 ... TK; -# int rkb[TK] = 0 ... TK; -# int *header = lut + ridx * 2; -# int offx = *(header + 0); -# int offy = *(header + 1); -# int rxa[TM] = offx*TM + (0 ... TM); -# int ryb[TN] = offy*TN + (0 ... TN); -# bool checka[TK, TM] = (rka < N)[:, newaxis]; -# bool checkb[TK, TN] = (rkb < N)[:, newaxis]; -# int offa[)" + sizea + "] = rxa[" + bca0 + "]" + lda0 + " + rka[" + bca1 + "]" + lda1 + R"(; -# int offb[)" + sizeb + "] = ryb[" + bcb0 + "]" + ldb0 + " + rkb[" + bcb1 + "]" + ldb1 + R"(; -# )" + ab_ty_ + " * pa[" + sizea + R"(] = A + offa; -# )" + ab_ty_ + " * pb[" + sizeb + R"(] = B + offb; -# )" + ab_ty_ + " a[" + sizea + R"(] = checka ? *pa : 0; -# )" + ab_ty_ + " b[" + sizeb + R"(] = checkb ? *pb : 0; -# for(int k = N; k > 0; k = k - TK) { -# acc = dot()" + usea + ", " + useb + R"(, acc); -# pa = pa + TK)" + lda1 + R"(; -# pb = pb + TK)" + ldb1 + R"(; -# a = checka ? *pa : 0; -# b = checkb ? *pb : 0; -# } -# int rxc[TM] = (0 ... TM); -# int ryc[TN] = (0 ... TN); -# )" + c_ty_ + R"( c[TM, TN] = acc; -# )" + c_ty_ + R"(* pc[TM, TN] = C + rxc[:, newaxis]*TM + ryc[newaxis, :] + ridx*TM*TN; -# *pc = c; -# })"; \ No newline at end of file diff --git a/python/examples/conv.py b/python/examples/conv.py deleted file mode 100644 index 43f0f5d91..000000000 --- a/python/examples/conv.py +++ /dev/null @@ -1,15 +0,0 @@ -import torch -import triton - -N, C, K = 32, 8, 32 -H, W = 16, 16 -R, S = 3, 3 -torch.manual_seed(0) -a = torch.randn(N, C, H, W).cuda() -b = torch.ones(C, R, S, K).cuda() - -rc = torch.nn.functional.conv2d(a, b.permute(3, 0, 1, 2)) -tc = triton.ops.conv(a, b) -print((rc - tc).abs().max()) -#print((rc[:30,:30,:,:] - tc[:30, :30, :, :]).abs().max()) -#print(tc[31, 31,:,:]) \ No newline at end of file diff --git a/python/examples/dot.py b/python/examples/dot.py deleted file mode 100644 index dfc2587f2..000000000 --- a/python/examples/dot.py +++ /dev/null @@ -1,71 +0,0 @@ -import numpy as np -import triton - -def run_tf(): - M, N, K = 2048, 2048, 2048 - a = tf.placeholder(tf.float32, shape=[M, K]) - b = tf.placeholder(tf.float32, shape=[N, K]) - triton_c = triton.ops.dot(a, b, False, True, 1) - triton_d = triton.ops.dot(triton_c, b, True, False, 1) - triton_y = tf.math.reduce_mean(triton_d) - fw_c = tf.matmul(a, b, False, True) - fw_d = tf.matmul(fw_c, b, True, False) - fw_y = tf.math.reduce_mean(fw_d) - # Gradient - triton_da, triton_db = tf.gradients(triton_y, [a, b]) - fw_da, fw_db = tf.gradients(fw_y, [a, b]) - # Reference - feed_dict = {a: np.random.rand(M, K).astype(np.float32), - b: np.random.rand(K, N).astype(np.float32)} - sess = tf.InteractiveSession() - sess.run(tf.global_variables_initializer()) - result = sess.run([triton_da, fw_da, triton_db, fw_db, fw_y, triton_y], feed_dict = feed_dict) - triton_da, fw_da = result[0][0], result[1][0] - triton_db, fw_db = result[2][0], result[3][0] - # Benchmark - nanosec = triton.bench_registry[triton_d] - print('TFLOPS:', 2. 
* M * N * K / nanosec * 1e-3) - print('Diff DA:', (triton_da - fw_da).max()) - print('Diff DB:', (triton_db - fw_db).max()) - - -def run_torch(): - torch.manual_seed(0) - M, N, K = 2048, 2048, 2048 - a = torch.randn(M, K).cuda() - b = torch.randn(K, N).cuda() - a.requires_grad_(True) - b.requires_grad_(True) - torch_c = torch.matmul(a, torch.t(b)) - torch_d = torch.matmul(torch.t(torch_c), b) - torch_y = torch.mean(torch_d) - triton_c = triton.ops.dot(a, b, False, True, 1) - triton_d = triton.ops.dot(triton_c, b, True, False, 1) - triton_y = torch.mean(triton_d) - # torch gradient - torch_y.backward() - torch_da = a.grad.clone() - torch_db = b.grad.clone() - # triton gradient - a.grad.zero_() - b.grad.zero_() - triton_y.backward() - triton_da = a.grad.clone() - triton_db = b.grad.clone() - - #nanosec = triton.bench_registry[triton_d] - #print('TFLOPS:', 2. * M * N * K / nanosec * 1e-3) - print('Diff DA:', (torch_da - triton_da).max()) - print('Diff DB:', (torch_db - triton_db).max()) - -try: - import tensorflow as tf - run_tf() -except ModuleNotFoundError: - pass - -try: - import torch - run_torch() -except ModuleNotFoundError: - pass diff --git a/python/examples/einsum.py b/python/examples/einsum.py index 8c3327e5a..2cbf2ca10 100644 --- a/python/examples/einsum.py +++ b/python/examples/einsum.py @@ -1,92 +1,194 @@ -#!/usr/bin/env python - -import numpy as np -from enum import Enum import triton +import torch +from torch.utils.cpp_extension import load +import numpy as np +#import utils +from time import time -class MODE(Enum): - TF = 1 - TORCH = 2 +#torch.backends.cudnn.benchmark = True -try: - import tensorflow as tf - mode = MODE.TF -except ModuleNotFoundError: - pass +configs = [] -try: - import torch - mode = MODE.TORCH -except ModuleNotFoundError: - pass +# Matrix multiplication +MNK = [ + (512, 512 ,512), + (2048, 2048, 2048), + (8192, 8192, 8192), + + (64, 64, 64000), + (64, 64, 128000), + (256, 256, 64000), + (256, 256, 128000), -cases = [] -# Matmul -cases += [[[4, 1024, 1024], [1024, 1024], [4, 1024, 1024], "btc,ck->btk"]] -# Attention -# cases += [[[4, 256, 8, 2, 64], [8, 2, 512, 64], [4, 256, 8, 2, 512], "bchak,hank->bchan"]] + (1536, 16, 1536), + (1536, 32, 1536), + (1536, 64, 1536), + (1536, 128, 1536), + (4096, 16, 4096), + (4096, 32, 4096), + (4096, 64, 4096), + (4096, 128, 4096), + + #(127008, 768, 576) + ] +for M, N, K in MNK: + matmul = lambda a, b: torch.matmul(a, b) + configs += [([M, K], [K, N], [M, N], matmul, 'mk,kn->mn', dict())] +for M, N, K in MNK: + matmul = lambda a, b: torch.matmul(a.t(), b) + configs += [([M, K], [M, N], [K, N], None, 'mk,mn->kn', dict())] +for M, N, K in MNK: + matmul = lambda a, b: torch.matmul(a, b.t()) + configs += [([M, N], [K, N], [M, K], None, 'mn,kn->mk', dict())] -if mode == MODE.TF: - sess = tf.InteractiveSession() +# Relative attention +NTHSE = [ + #(16, 512, 1, 64, 64), + # (16, 512, 1, 128, 128), + # (16, 512, 1, 256, 256), + # (16, 512, 1, 256, 512), + #(16, 512, 8, 64, 64), + # (16, 512, 8, 128, 128), + # (16, 512, 8, 256, 256), + # (16, 512, 8, 256, 512), -for a_shape, b_shape, c_shape, einsum in cases: + # (64, 1024, 1, 64, 64), + #(64, 1024, 1, 128, 128), + # (64, 1024, 1, 256, 256), + # (64, 1024, 1, 256, 512), + # (64, 1024, 8, 64, 64), + #(64, 1024, 8, 128, 128), + # (64, 1024, 8, 256, 256), + # (64, 1024, 8, 256, 512), - A = np.random.uniform(-1.0, 1.0, a_shape).astype(np.float16).astype(np.float32) - B = np.random.uniform(-1.0, 1.0, b_shape).astype(np.float16).astype(np.float32) - E = np.random.uniform(-1.0, 1.0, 
c_shape).astype(np.float16).astype(np.float32) + # (128, 1024, 1, 64, 64), + # (128, 1024, 1, 128, 128), + # (128, 1024, 1, 256, 256), + #(128, 1024, 1, 256, 512), + # (128, 1024, 8, 64, 64), + # (128, 1024, 8, 128, 128), + # (128, 1024, 8, 256, 256), + #(128, 1024, 8, 256, 512) + ] +for N, T, H, S, E in NTHSE: + configs += [([N, T, H, S], [H, E, S], [N, H, T, E], None, 'nths,hes->nhte', dict())] +for N, T, H, S, E in NTHSE: + configs += [([N, H, T, E], [N, T, H, S], [H, E, S], None, 'nhte,nths->hes', dict())] +for N, T, H, S, E in NTHSE: + configs += [([N, H, T, E], [H, E, S], [N, T, H, S], None, 'nhte,hes->nths', dict())] - # Execute (tensorflow) - if mode == MODE.TF: - a = tf.placeholder(tf.float32, a_shape, name="a") - b = tf.placeholder(tf.float32, b_shape, name="b") - e = tf.placeholder(tf.float32, c_shape, name="e") - c = triton.ops.einsum(einsum, a, b, 1) - da, db = tf.gradients(c, [a, b], e) - feed_dict = { a: A.astype(np.float32), - b: B.astype(np.float32), - e: E } - sess.run(tf.global_variables_initializer()) - result = sess.run([c, da, db], feed_dict = feed_dict) - # Execute (torch) - if mode == MODE.TORCH: - a = torch.from_numpy(A).cuda() - b = torch.from_numpy(B).cuda() - e = torch.from_numpy(E).cuda() - a.requires_grad_(True) - b.requires_grad_(True) - c = triton.ops.einsum(einsum, a, b, 1) - torch.autograd.backward(c, e) - da = a.grad - db = b.grad - result = [c.cpu().detach().numpy(), da.cpu().detach().numpy(), db.cpu().detach().numpy()] - - # benchmark - nanosec = triton.bench_registry[c] - ctx = triton.ctx_registry[c] - b, m, n, k = tuple((ctx.bmnk[i] for i in range(0, 4))) - ops = 2.*b*m*n*k - print('C TFLOPS:', ops / triton.bench_registry[c] * 1e-3) - #print('DA TFLOPS:', ops / triton.bench_registry[da] * 1e-3) - #print('DB TFLOPS:', ops / triton.bench_registry[db] * 1e-3) +# 1D Dense convolution +NCHKR = [ + # (1, 1152, 12602, 512, 3) + ] +for N, C, H, K, R in NCHKR: + torch_fn = lambda a, b: torch.nn.functional.conv1d(a, b.permute(2, 0, 1)) + configs += [([N, C, H], + [C, R, K], + [N, K, H - R + 1], + torch_fn, + 'nc(h+r),crk->nkh', + dict())] - # test - ctx = triton.ctx_registry[c] - t_a = ctx.trans_a - t_b = ctx.trans_b - e_a = ctx.einsum_a - e_b = ctx.einsum_b - e_c = ctx.einsum_c - C = np.einsum(einsum, A, B) - if not t_a and not t_b: # NN - DA = np.einsum(f"{e_c},{e_b}->{e_a}", E, B) - DB = np.einsum(f"{e_a},{e_c}->{e_b}", A, E) - elif not t_a and t_b: # NT - DA = np.einsum(f"{e_c},{e_b}->{e_a}", E, B) - DB = np.einsum(f"{e_c},{e_a}->{e_b}", E, A) - elif t_a and not t_b: # TN - DA = np.einsum(f"{e_b},{e_c}->{e_a}", B, E) - DB = np.einsum(f"{e_a},{e_c}->{e_b}", A, E) - c, da, db = result[0], result[1], result[2] - print('C diff:', np.abs((C - c)).max()) - print('DA diff:', np.abs((DA - da)).max()) - print('DB diff:', np.abs((DB - db)).max()) \ No newline at end of file +# 2D Dense convolution +NCHWKRS = [ + #(8, 64, 128, 128, 768, 3, 3), + #(8, 128, 64, 64, 256, 3, 3), + #(8, 256, 32, 32, 512, 3, 3), + #(8, 512, 32, 32, 1024, 3, 3) + ] +for N, C, H, W, K, R, S in NCHWKRS: + torch_fn = lambda a, b: torch.nn.functional.conv2d(a, b.permute(3, 0, 1, 2)) + configs += [([N, C, H, W], + [C, R, S, K], + [N, K, H - R + 1, W - R + 1], + torch_fn, + 'nc(h+r)(w+s),crsk->nkhw', + dict())] + +# 3D Dense Convolution +NCDHWKTRS = [ + #(8, 32, 27, 100, 100, 64, 3, 3, 3), + #(8, 64, 23, 48, 48, 256, 3, 3, 3), + #(8, 256, 19, 22, 22, 640, 3, 3, 3), + #(8, 640, 15, 36, 36, 384, 3, 3, 3) + ] +for N, C, D, H, W, K, T, R, S in NCDHWKTRS: + torch_fn = lambda a, b: 
torch.nn.functional.conv3d(a, b.permute(4, 0, 1, 2, 3)) + configs += [([N, C, D, H, W], + [C, T, R, S, K], + [N, K, D - T + 1, H - R + 1, W - R + 1], + torch_fn, + 'nc(d+t)(h+r)(w+s),ctrsk->nkdhw', + dict())] + + +# Shift convolution +shift_cuda = torch.utils.cpp_extension.load( + 'shift_cuda', ['kernels/shift_cuda.cpp', + 'kernels/shift_cuda_kernel.cu'], + extra_cflags=['-O3']) +class shift(torch.autograd.Function): + @staticmethod + def forward(ctx, x, shift): + ctx.save_for_backward(shift) + return shift_cuda.forward(x, shift) + + @staticmethod + def backward(ctx, grad_output): + shift, = ctx.saved_tensors + grad_output = shift_cuda.backward(grad_output, shift) + + return grad_output, None + +NCHWKRS = [ + #(8, 64, 128, 128, 128, 3, 3), + #(8, 128, 64, 64, 256, 3, 3), + #(8, 256, 32, 32, 512, 3, 3), + #(8, 512, 32, 32, 1024, 3, 3) + ] +for N, C, H, W, K, R, S in NCHWKRS: + shift_h = np.random.randint(R, size=C, dtype=np.int32) - R//2 + shift_w = np.random.randint(S, size=C, dtype=np.int32) - S//2 + def shift_conv(a, b, **kwargs): + shift_h, shift_w = kwargs['sh'], kwargs['sw'] + shift_torch = np.column_stack((shift_w*-1, shift_h*-1)) + shift_torch = torch.from_numpy(shift_torch).cuda() + a = shift.apply(a, shift_torch) + b = b.permute(1, 0) + b = b.reshape(b.shape[0], b.shape[1], 1, 1) + return torch.nn.functional.conv2d(a, b) + configs += [([N, C, H, W], + [C, K], + [N, K, H, W], + shift_conv, + 'nc(h + sh[c])(w + sw[c]),ck->nkhw', + {'sh': shift_h, 'sw': shift_w})] + +# Benchmark +torch.set_num_threads(1) +for a_shape, b_shape, c_shape, torch_fn, expr, arrays in configs: + dtype = torch.cuda.HalfTensor + # initialize input tensors + a = torch.rand(*a_shape).type(dtype).cuda() + b = torch.rand(*b_shape).type(dtype).cuda() + # triton output + #ta = triton.ops._einsum.pad(a, [4,4,4,4]) + tc = triton.ops.einsum(expr, a, b, c_shape, arrays = arrays, bench = True) + # reference output + if torch_fn: + rc = torch_fn(a, b, **arrays) + else: + rc = torch.einsum(expr, a, b) + # performance relative to equivalent matrix multiplication + ctx = triton.ctx_registry[tc] + B, M, N, K = ctx.matmul_B, ctx.matmul_M, ctx.matmul_N, ctx.matmul_K + # a = torch.rand(B, M, K).type(dtype).cuda() + # b = torch.rand(B, K, N).type(dtype).cuda() + # tmmc = triton.ops.einsum('bmk,bkn->bmn', a, b, [B, M, N], bench = True) + # ratio = triton.bench_registry[tmmc] / triton.bench_registry[tc] + ratio = 0 + # test and benchmark + bench = 2. * B * M * N * K / triton.bench_registry[tc] * 1e-3 + diff = (tc - rc).abs().max() / rc.abs().max() + print(f'{expr:>15}; {str(a_shape):>20}; {str(b_shape):>20}; {bench:4.2f} ({ratio:4.2f}); {diff:4.2f}') diff --git a/python/examples/kernels/shift_cuda.cpp b/python/examples/kernels/shift_cuda.cpp new file mode 100644 index 000000000..b7a769feb --- /dev/null +++ b/python/examples/kernels/shift_cuda.cpp @@ -0,0 +1,42 @@ +#include + +#include + +// CUDA forward declarations + +at::Tensor shift_cuda_forward( + const at::Tensor input, + const at::Tensor shift); + +at::Tensor shift_cuda_backward( + const at::Tensor grad_input, + const at::Tensor shift); + +// C++ interface + +// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4. 
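+// Note: the three macros below compose; CHECK_INPUT(x) verifies that a tensor is
+// both CUDA-resident and contiguous before its raw data pointer is handed to the
+// kernels. A hypothetical caller (not part of this extension) would therefore pass
+// e.g. x.contiguous().cuda() rather than an arbitrary strided CPU tensor.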
+#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + +at::Tensor shift_forward( + const at::Tensor input, + const at::Tensor shift) { + CHECK_INPUT(input); + CHECK_INPUT(shift); + + return shift_cuda_forward(input, shift); +} + +at::Tensor shift_backward( + const at::Tensor grad_input, + const at::Tensor shift) { + CHECK_INPUT(grad_input); + CHECK_INPUT(shift); + return shift_cuda_backward(grad_input, shift); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("forward", &shift_forward, "Shift forward (CUDA)"); + m.def("backward", &shift_backward, "Shift backward (CUDA)"); +} diff --git a/python/examples/kernels/shift_cuda_kernel.cu b/python/examples/kernels/shift_cuda_kernel.cu new file mode 100644 index 000000000..ca56b6b0f --- /dev/null +++ b/python/examples/kernels/shift_cuda_kernel.cu @@ -0,0 +1,111 @@ +#include + +#include +#include + +#include + +namespace { +template +__global__ void shift_cuda_forward_kernel( + const scalar_t* __restrict__ input, + const int32_t* __restrict__ shift, + scalar_t* __restrict__ output, + const int32_t B, + const int32_t C, + const int32_t H, + const int32_t W) { + const int32_t idx = blockIdx.x * blockDim.x + threadIdx.x; + const int32_t size = B*C*H*W; + + const int32_t CHW = C*H*W; + const int32_t HW = H*W; + const int32_t b = idx / CHW; + const int32_t c = (idx - b*CHW) / HW; + const int32_t h = (idx - b*CHW - c*HW) / W; + const int32_t w = idx - b*CHW - c*HW - h*W; + const int32_t target_w = w + shift[2*c]; + const int32_t target_h = h + shift[2*c + 1]; + const int32_t target_idx = b*CHW + c*HW + target_h*W + target_w; + if (idx < size && target_w >= 0 && target_w < W && target_h >= 0 && target_h < H) { + output[target_idx] = input[idx]; + } +} + +template +__global__ void shift_cuda_backward_kernel( + const scalar_t* __restrict__ grad_input, + scalar_t* __restrict__ grad_output, + const int32_t* __restrict__ shift, + const int32_t B, + const int32_t C, + const int32_t W, + const int32_t H) { + const int32_t idx = blockIdx.x * blockDim.x + threadIdx.x; + const int32_t size = B*C*W*H; + const int32_t CWH = C*W*H; + const int32_t WH = W*H; + const int32_t b = idx / CWH; + const int32_t c = (idx - b*CWH) / WH; + const int32_t w = (idx - b*CWH - c*WH) / W; + const int32_t h = idx - b*CWH - c*WH - w*H; + const int32_t target_w = w - shift[2*c]; + const int32_t target_h = h - shift[2*c + 1]; + const int32_t target_idx = b*CWH + c*WH + target_w*W + target_h; + if (idx < size && target_w >= 0 && target_w < W && target_h >= 0 && target_h < H) { + grad_output[target_idx] = grad_input[idx]; + } +} +} // namespace + +at::Tensor shift_cuda_forward( + const at::Tensor input, + const at::Tensor shift) { + const auto B = input.size(0); + const auto C = input.size(1); + const auto H = input.size(2); + const auto W = input.size(3); + const auto size = B*C*W*H; + const int threads = 1024; + const int blocks = (size + threads - 1) / threads; + auto output = at::zeros_like(input); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "shift_forward_cuda", ([&] { + shift_cuda_forward_kernel<<>>( + input.data(), + shift.data(), + output.data(), + B, + C, + H, + W); + })); + + return output; +} + +at::Tensor shift_cuda_backward( + const at::Tensor grad_input, + const at::Tensor shift) { + const auto B = grad_input.size(0); + const auto C = grad_input.size(1); + const auto H = 
grad_input.size(2); + const auto W = grad_input.size(3); + const auto size = B*C*W*H; + const int threads = 1024; + const int blocks = (size + threads - 1) / threads; + auto grad_output = at::zeros_like(grad_input); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad_input.type(), "shift_backward_cuda", ([&] { + shift_cuda_backward_kernel<<>>( + grad_input.data(), + grad_output.data(), + shift.data(), + B, + C, + H, + W); + })); + + return grad_output; +} diff --git a/python/setup.py b/python/setup.py index 4c8d38259..060a1c450 100644 --- a/python/setup.py +++ b/python/setup.py @@ -77,7 +77,7 @@ class CMakeBuild(build_ext): pass cfg = 'Debug' if self.debug else 'Release' - #cfg = 'Release' + cfg = 'Release' build_args = ['--config', cfg] if platform.system() == "Windows": diff --git a/python/src/bindings.cc b/python/src/bindings.cc index 7fb4a29f0..8b3ee2971 100644 --- a/python/src/bindings.cc +++ b/python/src/bindings.cc @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -48,6 +49,11 @@ void delete_fn(size_t id) { id_fn_map.erase(id); } +void register_cst(size_t id, const std::string& name, pybind11::buffer& data) { + pybind11::buffer_info info = data.request(); + id_fn_map[id]->set_cst(name, info.ptr, info.size*info.itemsize); +} + void cleanup() { id_grid_map.clear(); id_fn_map.clear(); @@ -508,7 +514,8 @@ void gen_torch_make_handles(std::ostream &os, os << " " << to_c_ty(ty) << " arg_" << name << " = " << name << ";" << std::endl; else{ os << " CHECK_INPUT(" << name << ");" << std::endl; - os << " drv::cu_buffer arg_" + name + "(ctx, " + name + ".storage().size(), (CUdeviceptr)" + name + ".storage().data(), false);" << std::endl; + os << " drv::cu_buffer arg_" + name + "(ctx, " + name + ".storage().size(), " + " (CUdeviceptr)((char*)" + name + ".storage().data() + " + name + ".storage_offset() * " + name + ".itemsize()), false);" << std::endl; } } } @@ -526,8 +533,8 @@ void gen_torch_make_launch_function(std::ostream &os, const std::vector 0)\n "; os << " i64scalar_map[bench_id] = triton::tools::bench(run, &stream);\n "; } @@ -562,18 +569,15 @@ std::tuple 0: - bench_registry[ret] = libtriton.retrieve_scalar(op_id) + bench_registry[ret] = libtriton.retrieve_scalar(bench_id) else: assert False \ No newline at end of file diff --git a/python/triton/ops/conv.py b/python/triton/ops/conv.py index 8a2678f2a..8bd0acbd3 100644 --- a/python/triton/ops/conv.py +++ b/python/triton/ops/conv.py @@ -38,25 +38,19 @@ void convnd(A_TYPE *A, int rah[TM] = rabh % CH; rah = rah * UPAW - off_uah; raw = raw * UPAH - off_uaw; - int racr[TK] = rk / BW; - int ras[TK] = rk % BW; - int rac[TK] = racr / BH; - int rar[TK] = racr % BH; - rar = UPAR * rar; - ras = UPAS * ras; int ram[TM] = rab*lda_n + rah*lda_h + raw*lda_w; - int rak[TK] = rac*lda_c + rar*lda_h + ras*lda_w; + int rak[TK] = *(ADELTA + rk); A_TYPE* pa[TM, TK] = A + ram[:, newaxis] + rak[newaxis, :]; // pointers for B int rbk[TK] = rk; int rbn[TN] = ryb; - B_TYPE* pb[TK, TN] = B + rbn[newaxis, :] * ldb_k + rbk[:, newaxis] * ldb_s; + B_TYPE* pb[TK, TN] = B + rbn[newaxis, :] * ldb_k + rbk[:, newaxis] * ldb_c; // pointers for A look-up table int rklut[TK] = rk % LUT_SIZE; int* padiff[TK] = ADIFF + rklut; - int* padelta[TK] = ADELTA + rklut + off_uw * LUT_SIZE + off_uh * LUT_SIZE * upsample_w; + int* padelta[TK] = ADELTA + TK + rklut + off_uw * LUT_SIZE + off_uh * LUT_SIZE * upsample_w; int adiff[TK] = *padiff; int adelta[TK] = *padelta; @@ -66,7 +60,7 @@ void convnd(A_TYPE *A, for(int k = K; k > 0; k = k - TK){ c += a @ b; pa += adelta[newaxis, :]; 
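 // `adelta` above holds the per-iteration pointer increments for A, prefetched from
 // the ADELTA look-up table (initial offsets first, then deltas), so the loop can
 // advance `pa` directly instead of re-deriving the c/r/s unpacking at every step.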
- pb += TK * ldb_s; + pb += TK * ldb_c; // increment A look-up table padelta = padelta + adiff; adelta = *padelta; @@ -99,29 +93,54 @@ void convnd(A_TYPE *A, kernel = triton.kernel(src, ['C']) @staticmethod - def _unpack(idx, D, H, W): - cdh = idx // W - w = idx % W - cd = cdh // H - h = cdh % H - c = cd // D - d = cd % D - return c, d, h, w + def _unpack(idx, order, shape_b): + _123 = idx // shape_b[order[0]] + _0 = idx % shape_b[order[0]] + _23 = _123 // shape_b[order[1]] + _1 = _123 % shape_b[order[1]] + _3 = _23 // shape_b[order[2]] + _2 = _23 % shape_b[order[2]] + return _0, _1, _2, _3 @staticmethod - def _delta_a(upsample_d, upsample_h, upsample_w, depth, TK, - T, R, S, stride_a): + def _roundup(x, div): + return (x + div - 1) // div * div + + @staticmethod + def _delta_a(upsample_d, upsample_h, upsample_w, + bc, bd, bh, bw, + ac, ad, ah, aw, + stride_a, shape_b, + TK): + # Parse the axes so that the reduction is done + # from the innermost dimension outward + order = sorted([bc, bd, bh, bw], reverse = True) + c, d, h, w = [order.index(x) for x in [bc, bd, bh, bw]] + # Size of the lookup table is the product of the 3 innermost dimensions + K = _conv._roundup(TK, shape_b[order[0]] * shape_b[order[1]] * shape_b[order[2]]) + # Allocate temporary arrays ud = np.arange(upsample_d, dtype=np.int32)[:, np.newaxis, np.newaxis, np.newaxis] uh = np.arange(upsample_h, dtype=np.int32)[np.newaxis, :, np.newaxis, np.newaxis] uw = np.arange(upsample_w, dtype=np.int32)[np.newaxis, np.newaxis, :, np.newaxis] - ctrs = np.arange(depth, dtype=np.int32)[np.newaxis, np.newaxis, np.newaxis, :] - c, t, r, s = _conv._unpack(ctrs, T, R, S) - nextc, nextt, nextr, nexts = _conv._unpack(ctrs + TK, T, R, S) - cdiff = nextc - c - tdiff = nextt - t - rdiff = nextr - r - sdiff = nexts - s - return cdiff*stride_a[1] + tdiff*stride_a[2] + rdiff*stride_a[3] + sdiff*stride_a[4] + k = np.arange(K , dtype=np.int32)[np.newaxis, np.newaxis, np.newaxis, :] + # Find reduction indices at the current and next reduction indices + currentk = _conv._unpack(k , order, shape_b) + nextk = _conv._unpack(k + TK, order, shape_b) + # Compute memory stride + result = 0 + result += (nextk[c] - currentk[c]) * stride_a[ac] + result += (nextk[d] - currentk[d]) * stride_a[ad] + result += (nextk[h] - currentk[h]) * stride_a[ah] + result += (nextk[w] - currentk[w]) * stride_a[aw] + # Initial k + ki = np.arange(TK , dtype=np.int32)[np.newaxis, np.newaxis, np.newaxis, :] + currentk = _conv._unpack(ki, order, shape_b) + resulti = 0 + resulti += currentk[c] * stride_a[ac] + resulti += currentk[d] * stride_a[ad] + resulti += currentk[h] * stride_a[ah] + resulti += currentk[w] * stride_a[aw] + return np.concatenate((resulti, result), axis=-1) @staticmethod def _extract_strides(shape): @@ -134,38 +153,56 @@ void convnd(A_TYPE *A, @staticmethod def _call(a, b, - upsample_d, upsample_h, upsample_w, pad_d, pad_h, pad_w, - stride_d, stride_h, stride_w, - mode): + stride_d, stride_h, stride_w, + upsample_d, upsample_h, upsample_w, + a_layout, b_layout, c_layout): # input shapes shape_a = list(triton.shape(a)) shape_b = list(triton.shape(b)) - # add depth - shape_a.insert(2, 1) - shape_b.insert(1, 1) - NB, NC, AD, AH, AW = shape_a - NC, BD, BH, BW, NF = shape_b + dim = len(shape_a) - 2 + # indices + an, ac, ad, ah, aw = [a_layout.find(x) for x in 'ncdhw'] + bk, bc, bd, bh, bw = [b_layout.find(x) for x in 'kctrs'] + cn, ck, cd, ch, cw = [c_layout.find(x) for x in 'nkdhw'] + # extract shapes + if dim == 2: + shape_a.insert(ad, 1) + if dim == 2: + 
shape_b.insert(bd, 1) # output shape - CD = (AD*upsample_d - BD + 1 + 2*pad_d + stride_d - 1) // stride_d - CH = (AH*upsample_h - BH + 1 + 2*pad_h + stride_h - 1) // stride_h - CW = (AW*upsample_w - BW + 1 + 2*pad_w + stride_w - 1) // stride_w - shape_c = [NB, NF, CD, CH, CW] + shape_c = [0] * 5 + shape_c[cn] = shape_a[an] + shape_c[ck] = shape_b[bk] + shape_c[cd] = (shape_a[ad]*upsample_d - shape_b[bd] + 1 + 2*pad_d + stride_d - 1) // stride_d + shape_c[ch] = (shape_a[ah]*upsample_h - shape_b[bh] + 1 + 2*pad_h + stride_h - 1) // stride_h + shape_c[cw] = (shape_a[aw]*upsample_w - shape_b[bw] + 1 + 2*pad_w + stride_w - 1) // stride_w # strides stride_a = _conv._extract_strides(shape_a) stride_b = _conv._extract_strides(shape_b) stride_c = _conv._extract_strides(shape_c) - # look-up tables + # tiling parameters + TM = [32] + TN = [32] TK = 8 - FS = BD * BH * BW - depth = (TK + FS - 1)//FS * FS + # pointer deltas for a delta_a = _conv._delta_a(upsample_d, upsample_h, upsample_w, - depth, TK, BD, BH, BW, stride_a) + bc, bd, bh, bw, + ac, ad, ah, aw, + stride_a, shape_b, + TK) delta_a = triton.fw.torch.from_numpy(delta_a).cuda() - inc_a = np.arange(depth, dtype=np.int32) - inc_a = ((inc_a + TK) % depth) - inc_a + # delta increments for a + inc_a = np.arange(delta_a.shape[-1] - TK, dtype=np.int32) + inc_a = ((inc_a + TK) % inc_a.size) - inc_a inc_a = triton.fw.torch.from_numpy(inc_a).cuda() - + # allocate output + if dim == 2: + shape_c.pop(cd) + c = triton.empty(shape_c, dtype=a.dtype) + if dim == 2: + shape_c.insert(cd, 1) + # execute kernel trans_b = False is_wgrad = False is_blut = False @@ -174,31 +211,99 @@ void convnd(A_TYPE *A, 'UPAS': 'stride_w' if is_wgrad else '1', 'UPAH': '' if is_wgrad else 'stride_h', 'UPAW': '' if is_wgrad else 'stride_w', - 'LUT_SIZE': depth, - 'TM': [32], - 'TN': [32], - 'TK': TK, - 'A_TYPE': 'float', - 'B_TYPE': 'float' + 'LUT_SIZE': delta_a.shape[-1], + 'TM': TM, 'TN': TN, 'TK': TK, + 'A_TYPE': 'float', 'B_TYPE': 'float' } - - shape_c.pop(2) - c = triton.empty(shape_c, dtype=a.dtype) - grid = lambda opt: [triton.cdiv(NB*CD*CH*CW, opt.d('TM')), triton.cdiv(NF, opt.d('TN'))] - print(stride_c) - print(stride_b) - _conv.kernel(a, b, c, NB*CD*CH*CW, NF, NC*BD*BH*BW, AH, AW, BH, BW, CH, CW, NC, - stride_a[0], stride_a[1], stride_a[2], stride_a[3], stride_a[4], - stride_b[0], stride_b[1], stride_b[2], stride_b[3], stride_b[4], - stride_c[0], stride_c[1], stride_c[2], stride_c[3], stride_c[4], - pad_h, pad_w, stride_h, stride_w, upsample_h, upsample_w, + MATMUL_M = shape_c[cn] * shape_c[cd] * shape_c[ch] * shape_c[cw] + MATMUL_N = shape_c[ck] + MATMUL_K = shape_b[bc] * shape_b[bd] * shape_b[bh] * shape_b[bw] + _conv.kernel(a, b, c, + # matrix multiplication shapes + MATMUL_M, MATMUL_N, MATMUL_K, + # shapes for a + shape_a[ah], shape_a[aw], + # shapes for b + shape_b[bh], shape_b[bw], + # chapes for c + shape_c[ch], shape_c[cw], shape_c[cn], + # strides for a + stride_a[an], stride_a[ac], stride_a[ad + 0], stride_a[ad + 1], stride_a[ad + 2], + # strides for b + stride_b[bc], stride_b[bd + 0], stride_b[bd + 1], stride_b[bd + 2], stride_b[bk], + # strides for c + stride_c[cn], stride_c[ck], stride_c[cd], stride_c[cd + 1], stride_c[cd + 2], + # padding + pad_h, pad_w, + # striding + stride_h, stride_w, + # upsampling + upsample_h, upsample_w, 0, 0, 0, 0, 0, 0, + # look-up table delta_a, inc_a, - grid, **macros) + lambda opt: [triton.cdiv(MATMUL_M, opt.d('TM')), triton.cdiv(MATMUL_N, opt.d('TN'))], + **macros) return c @staticmethod - def forward(ctx, input, weight): - 
return _conv._call(input, weight, 1, 1, 1, 0, 0, 0, 1, 1, 1, '') + def forward(ctx, x, w, + pad_d = 0, pad_h = 0, pad_w = 0, + stride_d = 1, stride_h = 1, stride_w = 1, + upsample_d = 1, upsample_h = 1, upsample_w = 1, + layout_a = 'ncdhw', layout_b = 'ktrsc', layout_c = 'nkdhw'): + # save for backward + ctx.save_for_backward(x, w) + ctx.pad_d = pad_d + ctx.pad_h = pad_h + ctx.pad_w = pad_w + ctx.stride_d = stride_d + ctx.stride_h = stride_h + ctx.stride_w = stride_w + ctx.upsample_d = upsample_d + ctx.upsample_h = upsample_h + ctx.upsample_w = upsample_w + ctx.layout_a = layout_a + ctx.layout_b = layout_b + ctx.layout_c = layout_c + # return + return _conv._call(x, w, + pad_d, pad_h, pad_w, + stride_d, stride_h, stride_w, + upsample_d, upsample_h, upsample_w, + layout_a, layout_b, layout_c) + + @staticmethod + def backward(ctx, dy): + x, w = ctx.saved_tensors + pad_d = ctx.pad_d + pad_h = ctx.pad_h + pad_w = ctx.pad_w + stride_d = ctx.stride_d + stride_h = ctx.stride_h + stride_w = ctx.stride_w + upsample_d = ctx.upsample_d + upsample_h = ctx.upsample_h + upsample_w = ctx.upsample_w + layout_a = ctx.layout_a + layout_b = ctx.layout_b + layout_c = ctx.layout_c + + # TODO: Deal with this + dx_pad_d = 1 + dx_pad_h = 1 + dx_pad_w = 1 + dx = _conv.call(dy, w, + dw_pad_d, dw_pad_h, dw_pad_w, + upsample_w, upsample_h, upsample_w, + stride_d, stride_h, stride_w, + 'ncdhw', 'cktrs', 'nkdhw') + + + + ret = [None] * 14 + ret[0] = None + ret[1] = dw + return None, conv = _conv.apply \ No newline at end of file diff --git a/python/triton/ops/dot.py b/python/triton/ops/dot.py index ae568c642..89b28d20e 100644 --- a/python/triton/ops/dot.py +++ b/python/triton/ops/dot.py @@ -3,37 +3,50 @@ import triton class _dot(triton.function): src = """ -void dot(TYPE * A, TYPE * B, TYPE * C, +void dot(TYPE * A __noalias __readonly __aligned(16), + TYPE * B __noalias __readonly __aligned(16), + TYPE * C, + float alpha, int M, int N, int K, int lda __multipleof(8), int ldb __multipleof(8), int ldc) { - // prologue - int ridx = get_program_id(0); - int ridy = get_program_id(1); - int rm[TM] = ridx * TM + 0 ... TM; - int rn[TN] = ridy * TN + 0 ... TN; - int rk[TK] = 0 ... TK; - float c[TM, TN] = 0; - // pointers to operands - TYPE* pa[SHAPE_A] = A + rk[BROADCAST_AK] * STRIDE_AK + rm[BROADCAST_AM] * STRIDE_AM; - TYPE* pb[SHAPE_B] = B + rk[BROADCAST_BK] * STRIDE_BK + rn[BROADCAST_BN] * STRIDE_BN; - // prefetches operands - TYPE a[SHAPE_A] = *pa; - TYPE b[SHAPE_B] = *pb; - // reduction loop - for(int k = K; k > 0; k-= TK){ - c += USE_A @ USE_B; - pa = pa + TK * STRIDE_AK; - pb = pb + TK * STRIDE_BK; - bool checka[SHAPE_A] = k > TK; - bool checkb[SHAPE_B] = k > TK; - a = checka ? *pa : 0; - b = checkb ? *pb : 0; - } - // epilogue - TYPE* pc[TM, TN] = C + rm[:, newaxis] * ldc + rn[newaxis, :]; - *pc = c; + // prologue + int ridx = get_program_id(0); + int ridy = get_program_id(1); + int rm[TM] = ridx * TM + 0 ... TM; + int rn[TN] = ridy * TN + 0 ... TN; + int rk[TK] = 0 ... TK; + + // pointers to operands + TYPE* pa[SHAPE_A] = A + rk[BROADCAST_AK] * STRIDE_AK + rm[BROADCAST_AM] * STRIDE_AM; + TYPE* pb[SHAPE_B] = B + rk[BROADCAST_BK] * STRIDE_BK + rn[BROADCAST_BN] * STRIDE_BN; + + // prefetches operands + bool checka[SHAPE_A] = rk[BROADCAST_AK] < K; + bool checkb[SHAPE_B] = rk[BROADCAST_BK] < K; + TYPE a[SHAPE_A] = checka ? *pa : 0; + TYPE b[SHAPE_B] = checkb ? 
*pb : 0; + + // reduction loop + float c[TM, TN] = 0; + for(int k = K; k > 0; k -= TK){ + c += USE_A @ USE_B; + bool checka[SHAPE_A] = k > TK; + bool checkb[SHAPE_B] = k > TK; + pa += TK * STRIDE_AK; + pb += TK * STRIDE_BK; + a = *?(checka)pa; + b = *?(checkb)pb; + } + //c = c * alpha; + + // epilogue + int rxm[TM] = get_program_id(0) * TM + 0 ... TM; + int rxn[TN] = get_program_id(1) * TN + 0 ... TN; + TYPE* pc[TM, TN] = C + rxm[:, newaxis] * ldc + rxn[newaxis, :]; + bool checkc[TM, TN] = (rxm[:, newaxis] < M) && (rxn[newaxis, :] < N); + *?(checkc)pc = (TYPE[TM, TN])c; } """ kernel = triton.kernel(src, ['C']) @@ -75,10 +88,10 @@ void dot(TYPE * A, TYPE * B, TYPE * C, 'BROADCAST_BK': 'newaxis, :' if transpose_b else ':, newaxis', 'BROADCAST_BN': ':, newaxis' if transpose_b else 'newaxis, :', 'SHAPE_B' : 'TN, TK' if transpose_b else 'TK, TN'} - _dot.kernel(a, b, c, M, N, Ka, lda, ldb, ldc, + _dot.kernel(a, b, c, 1., M, N, Ka, lda, ldb, ldc, grid, bench=bench, AT = transpose_a, BT = transpose_b, TYPE = dtype, - TM = [64, 128], TN = [64, 128], TK = [8], **macros) + TM = [64], TN = [128], TK = [8], **macros) return c @staticmethod diff --git a/python/triton/ops/einsum.py b/python/triton/ops/einsum.py index 4c3409885..ff29432e5 100644 --- a/python/triton/ops/einsum.py +++ b/python/triton/ops/einsum.py @@ -1,234 +1,651 @@ -# Special thanks to Scott Gray from OpenAI for writing the einsum parsing function - - +import numpy as np +import torch +from math import ceil, log2 +from enum import IntEnum import triton -import math +from functools import reduce +from operator import mul +from sympy.parsing.sympy_parser import parse_expr +import sympy as sp +from collections import OrderedDict +from collections import namedtuple +import re +from sympy.printing.ccode import C89CodePrinter + class _einsum(triton.function): - src = """ -void einsumk(TYPE * A, TYPE * B, TYPE * C, - int dim_M, int dim_N, int dim_K, - int std_A0 __multipleof(8), - int std_B0 __multipleof(8), - int std_C0 __multipleof(8), - int std_A1 __multipleof(8), - int std_B1 __multipleof(8), - int std_C1 __multipleof(8)) { - // program id - int pgm = get_program_id(0); - int pgn = get_program_id(1); - int pgb = get_program_id(2); - // range - int rm[TM] = pgm * TM + 0 ... TM; - int rn[TN] = pgn * TN + 0 ... TN; - int rb[TB] = pgb * TB + 0 ... TB; - int rk[TK] = 0 ... TK; - // accumulator - float c[TM, TN, TB] = 0; - // pointers to a - TYPE *pa[SHAPE_A] = A + rk[BROADCAST_AK] * STRIDE_AK - + rm[BROADCAST_AM] * STRIDE_AM - + rb[newaxis, newaxis, :] * std_A0; - // pointers to b - TYPE *pb[SHAPE_B] = B + rk[BROADCAST_BK] * STRIDE_BK - + rn[BROADCAST_BN] * STRIDE_BN - + rb[newaxis, newaxis, :] * std_B0; - // prefetch - TYPE a[SHAPE_A] = *pa; - TYPE b[SHAPE_B] = *pb; - // accumulation - for(int k = dim_K; k > 0; k -= TK) { - c += USE_A @ USE_B; - pa += TK * STRIDE_AK; - pb += TK * STRIDE_BK; - bool checka[SHAPE_A] = k > TK; - bool checkb[SHAPE_B] = k > TK; - a = checka ? *pa : 0; - b = checkb ? 
*pb : 0; - } - // write-back - TYPE *pc[TM, TN, TB] = C + rm[:, newaxis, newaxis] * std_C1 - + rn[newaxis, :, newaxis] * 1 - + rb[newaxis, newaxis, :] * std_C0; - bool checkm[TM] = rm < dim_M; - bool checkn[TN] = rn < dim_N; + + ############################# + ## Triton-C code generation + ############################# + def print_cc(expr, axes_0, axes_1, axes_2): + + class TritonCodePrinter(C89CodePrinter): + + def __init__(self, axes_0, axes_1, axes_2): + super(TritonCodePrinter, self).__init__() + self.axes_0 = axes_0 + self.axes_1 = axes_1 + self.axes_2 = axes_2 + + def _print_Symbol(self, expr): + name = super(C89CodePrinter, self)._print_Symbol(expr) + if expr in self.axes_0: + return f'r{name}[:, newaxis, newaxis]' + if expr in self.axes_1: + return f'r{name}[newaxis, :, newaxis]' + if expr in self.axes_2: + return f'r{name}[newaxis, newaxis, :]' + return name + + def _print_Indexed(self, expr): + assert len(expr.indices) == 1 + return "*(%s + %s)" % (self._print(expr.base.label), + self._print(expr.indices[0])) + + return TritonCodePrinter(axes_0, axes_1, axes_2).doprint(expr) + + + def unpack_cc(tile, axes, prefix, remat): + ret = '' + axes = list(map(str, axes)) + for i, d in enumerate(reversed(axes)): + if i == len(axes) - 1: + break + currs = ''.join(axes[: len(axes) - i]) + nexts = ''.join(axes[: len(axes) - (i + 1)]) + ty = '' if remat else 'int ' + sz = '' if remat else f'[{tile}]' + ret += f' {ty}{prefix}{nexts}{sz} = r{currs} / dim_{d};\n' + ret += f' {ty}{prefix}{d}{sz} = r{currs} % dim_{d};\n' + return ret + + def strides_cc(name, expr): + ret = [f'stride_{name}_{d}' for d in expr[:-1]] + ['1'] + ret = dict(zip(expr, ret)) + return ret + + def make_kernel(name, + expr_a, expr_b, expr_c, + axes_m, axes_n, axes_k, axes_b, + multipleof_a, multipleof_b, multipleof_c, + lut_mode_a, lut_mode_b, + delta_a, delta_b, + subscripted): + + use_lut_a = True + use_lut_b = True + + src = "" + + if use_lut_a and lut_mode_a == _einsum.LUT_MODE.CONSTANT: + src += f""" +char __constant__* AD = calloc({4*len(delta_a)});""" + if use_lut_b and lut_mode_b == _einsum.LUT_MODE.CONSTANT: + src += f""" +char __constant__* BD = calloc({4*len(delta_b)});""" + + + src += f""" +__global__ void {name}( + TYPE * A __noalias __readonly __aligned(16) + , TYPE * B __noalias __readonly __aligned(16) + , TYPE * C + , int * locks + , float alpha + , int matmul_m, int matmul_n, int matmul_k __multipleof(16) + , int div_m + """ + for dim in [axes_m, axes_n, axes_k, axes_b]: + for d in dim: + src += f", int dim_{d}" + src += "\n " + for dim, name, mult in zip([expr_a, expr_b, expr_c], + ['a', 'b', 'c'], + [multipleof_a, multipleof_b, multipleof_c]): + for d in range(len(dim) - 1): + attr = f'__multipleof({mult})' + src += f", int stride_{name}_{d} {attr}" + src += "\n " + if lut_mode_a == _einsum.LUT_MODE.SCALAR: + src += f", int stride_a_inner __multipleof({multipleof_a})" + elif lut_mode_a == _einsum.LUT_MODE.DRAM: + src += ", int* AD __noalias __readonly __aligned(16)" + src += "\n " + if lut_mode_b == _einsum.LUT_MODE.SCALAR: + src += f", int stride_b_inner __multipleof({multipleof_b})" + elif lut_mode_b == _einsum.LUT_MODE.DRAM: + src += ", int* BD" + for ptr in subscripted: + src += f", int* {ptr}" + src += """) { + + // re-order outer program ids + int grid_m = (matmul_m + TM - 1) / TM; + int grid_n = (matmul_n + TN - 1) / TN; + int pid_mn = get_program_id(0) / div_m; + int pid_n = pid_mn % grid_n; + int pid_m = (pid_mn / grid_n)*div_m + (get_program_id(0) % div_m); + + // get batch program id + int pid_b 
= get_program_id(1); + +#if TZ == 1 + int off_k = 0; +#else + // get reduction sub-group program id + int pid_z = get_program_id(2); + int grid_z = get_num_programs(2); + int div_z = matmul_k / TZ; + int rem_z = matmul_k % TZ; + int off_k = pid_z * div_z; + matmul_k = select(pid_z < rem_z, div_z, div_z + rem_z); +#endif + + // create ranges +""" + rk = 'r{}'.format(''.join(map(str,axes_k))) + for axes, tile, off in zip([axes_m, axes_n, axes_b, axes_k], + ['TM', 'TN', 'TB', 'TK'], + ['pid_m*TM', 'pid_n*TN', 'pid_b*TB', 'off_k']): + currs = ''.join(map(str,axes)) + if axes: + src += f" int r{currs}[{tile}] = {off} + 0 ... {tile};\n" + src += _einsum.unpack_cc(tile, axes, 'r', False) + + src += """ + // initialize pointers to A + int offa[TM, TK, TB] = """ + for i, sym in enumerate(expr_a): + ccode = _einsum.print_cc(sym, axes_m, axes_k, axes_b) + stride = f'stride_a_{i}' if i < len(expr_a) - 1 else '1' + if i > 0: + src += ' + ' + src += f"({ccode}) * {stride}\n " + src += ';' + + src += """ + TYPE *pa[TM, TK, TB] = A + offa;""" + + if use_lut_a and not lut_mode_a == _einsum.LUT_MODE.SCALAR: + spec = '__constant__' if lut_mode_a == _einsum.LUT_MODE.CONSTANT else '' + cast = '(int __constant__*)' if lut_mode_a == _einsum.LUT_MODE.CONSTANT else '' + src += f""" + // initialize pointers to A look-up table + int offadelta[TK] = off_k + 0 ... TK; + int {spec} *padelta[TK] = {cast}AD + offadelta; + int incda[TM, TK, TB] = (*padelta)[newaxis, :, newaxis];""" + + src += """ + + // initialize pointers to B + int offb[TK, TN, TB] = """ + for i, sym in enumerate(expr_b): + ccode = _einsum.print_cc(sym, axes_k, axes_n, axes_b) + stride = f'stride_b_{i}' if i < len(expr_b) - 1 else '1' + if i > 0: + src += ' + ' + src += f"({ccode}) * {stride}\n " + src += ';' + + src += """ + TYPE *pb[TK, TN, TB] = B + offb;""" + + + if use_lut_b and not lut_mode_b == _einsum.LUT_MODE.SCALAR: + spec = '__constant__' if lut_mode_b == _einsum.LUT_MODE.CONSTANT else '' + cast = '(int __constant__*)' if lut_mode_b == _einsum.LUT_MODE.CONSTANT else '' + src += f""" + // initialize pointers to B look-up table + int offbdelta[TK] = off_k + 0 ... TK; + int *pbdelta[TK] = BD + offbdelta;""" + + src += f""" + + // prefetch + bool checkm[TM] = r""" + ''.join(map(str,axes_m)) + f""" < matmul_m; + bool checkn[TN] = r""" + ''.join(map(str,axes_n)) + f""" < matmul_n; + bool checkk[TK] = {rk} < matmul_k + off_k; + bool checka[TM, TK, TB] = checkm[:, newaxis, newaxis] && checkk[newaxis, :, newaxis]; + bool checkb[TK, TN, TB] = checkk[:, newaxis, newaxis] && checkn[newaxis, :, newaxis]; + TYPE a[TM, TK, TB] = checka ? *pa : 0; + TYPE b[TK, TN, TB] = checkb ? 
*pb : 0; + // accumulate + float acc[TM, TN, TB] = 0; + for(int k = matmul_k; k > 0; k -= TK) {{ + acc += a @ b;""" + + if not use_lut_a or not use_lut_b: + src += f""" + {rk} += TK; +""" + src += _einsum.unpack_cc(tile, axes_k, 'r', True) + + + if use_lut_a: + if lut_mode_a == _einsum.LUT_MODE.SCALAR: + src += """ + pa += stride_a_inner;""" + else: + src += """ + pa += incda; + padelta += TK; + incda = (*padelta)[newaxis, :, newaxis];""" + else: + src += """ + offa = """ + for i, sym in enumerate(expr_a): + ccode = _einsum.print_cc(sym, axes_m, axes_k, axes_b) + stride = f'stride_a_{i}' if i < len(expr_a) - 1 else '1' + if i > 0: + src += ' + ' + src += f"({ccode}) * {stride}\n " + src += """; + TYPE *pa[TM, TK, TB] = A + offa;""" + + + + if lut_mode_b == _einsum.LUT_MODE.SCALAR: + src += """ + pb += stride_b_inner;""" + else: + src += """ + pb += (*pbdelta)[:, newaxis, newaxis]; + pbdelta += TK;""" + + src += f""" + checkk = k > TK; + checka = checkm[:, newaxis, newaxis] && checkk[newaxis, :, newaxis]; + checkb = checkk[:, newaxis, newaxis] && checkn[newaxis, :, newaxis]; + a = *?(checka)pa; + b = *?(checkb)pb; + }} + TYPE c[TM, TN, TB] = acc; + + // re-materialize ranges +""" + for axes, tile, off in zip([axes_m, axes_n, axes_b], + ['TM', 'TN', 'TB'], + ['pid_m*TM', 'pid_n*TN', 'pid_b*TB']): + currs = ''.join(map(str,axes)) + if axes: + src += f" r{currs} = {off} + 0 ... {tile};\n" + src += _einsum.unpack_cc(tile, axes, 'r', True) + + src += """ + // initialize pointers to C + int offc[TM, TN, TB] = """ + for i, sym in enumerate(expr_c): + stride = f'stride_c_{i}' if i < len(expr_c) - 1 else '1' + ccode = _einsum.print_cc(sym, axes_m, axes_n, axes_b) + if i > 0: + src += ' + ' + src += f"({ccode}) * {stride}\n " + src += ';' + + src += """ + TYPE *pc[TM, TN, TB] = C + offc; + + // bounds-checking + checkm = r""" + ''.join(map(str,axes_m)) + """ < matmul_m; + checkn = r""" + ''.join(map(str,axes_n)) + """ < matmul_n; bool checkc[TM, TN, TB] = checkm[:, newaxis, newaxis] && - checkn[newaxis, :, newaxis]; - *?(checkc)pc = (TYPE[TM, TN, TB])c; + checkn[newaxis, :, newaxis]; + + // write back +#if TZ == 1 + *?(checkc)pc = c; +#else + int *plock = locks + pid_mn + pid_b * get_num_programs(0); + int *pcount = plock + 1024*1024; + // spin + for(int repeat = 1; repeat == 1; repeat = atomic_cas(plock, 0, 1)); + int count = *pcount; + if(count == 0) + *?(checkc)pc = c; + else + *?(checkc)pc = c + *?(checkc)pc; + atomic_xchg(pcount, (count + 1) % (grid_z)); + atomic_xchg(plock, 0); +#endif } """ - kernel = triton.kernel(src, ['C']) + #print(src) + ret = triton.kernel(src, ['C']) + if use_lut_a and lut_mode_a == _einsum.LUT_MODE.CONSTANT: + ret.set_constant('AD', delta_a) + if use_lut_b and lut_mode_b == _einsum.LUT_MODE.CONSTANT: + ret.set_constant('BD', delta_b) + return ret + + ############################ + ## Look-up Table + ############################ + + class LUT_MODE(IntEnum): + SCALAR = 1 + CONSTANT = 2 + DRAM = 3 + + def lut_mode(delta): + if delta.size == 0 or np.min(delta) == np.max(delta): + return _einsum.LUT_MODE.SCALAR + #if delta.size < 4096: + # return _einsum.LUT_MODE.CONSTANT + return _einsum.LUT_MODE.DRAM + + def symbolic_delta(symbols, axes): + rank = len(symbols) + strides = [sp.symbols(f'stride{d}') for d in range(rank)] + nexts = {s: sp.symbols(f'next{s}') for s in axes} + delta = 0 + for i in range(rank): + delta += strides[i] * (symbols[i].subs(nexts) - symbols[i]) + return delta + + def unpack_offset(k, axes, dims): + ret = dict() + for d in reversed(axes): + ret[d] = k 
% dims[d] + k = k // dims[d] + return ret + + def make_delta(axes, step, stride, dims, symbols, arrays): + # symbolic pointer increments + delta = _einsum.symbolic_delta(symbols, axes) + args = [f'stride{d}' for d in range(len(stride))] + args += [f'{sk}' for sk in axes] + args += [f'next{sk}' for sk in axes] + args += [f'{sk}' for sk, _ in arrays] + fn = sp.lambdify(args, delta, 'numpy') + # inner axes values + inner = [dims[d] for d in axes] + k = np.arange(np.prod(inner), dtype=np.int32) + off = _einsum.unpack_offset(k, axes, dims) + nextoff = _einsum.unpack_offset(k + step, axes, dims) + # evaluate deltas + args = [s for s in stride] + args += [off[sk] for sk in axes] + args += [nextoff[sk] for sk in axes] + args += [x for _, x in arrays] + delta = fn(*args) + return delta, _einsum.lut_mode(delta[:-step]) + + ############################ + ## Einsum parsing + ############################ + + def uniq(seq): + seen = set() + seen_add = seen.add + return [x for x in seq if not (x in seen or seen_add(x))] + + def parse_axes(expr_a, expr_b, expr_c, subscripted): + is_index = lambda x: type(x) == sp.indexed.Indexed or str(x) in subscripted + sym_a = [x for s in expr_a for x in s.free_symbols if not is_index(x)] + sym_b = [x for s in expr_b for x in s.free_symbols if not is_index(x)] + sym_c = [x for s in expr_c for x in s.free_symbols] + batch = [d for d in sym_a if d in sym_b and d in sym_c] + outer = [d for d in sym_a if d not in sym_b and d in sym_c] + inner = [d for d in sym_a if d in sym_b and d not in sym_c] + illegal = [d for d in sym_a if d not in sym_b and d not in sym_c] + if illegal: + raise ValueError(f"einsum labels {illegal} ({expr_a}) "\ + f"not present in {expr_b} or {expr_c}") + return _einsum.uniq(batch), _einsum.uniq(outer), _einsum.uniq(inner) + + + def replace_subscript(expr, arrays): + # replace array indexing by Indexed() + indexed = re.findall('([_a-zA-Z][_a-zA-Z0-9]*)\[([_a-z]*)\]', expr) + for x in indexed: + arrays.append(x[0]) + expr = expr.replace(f'{x[0]}[{x[1]}]', f'Indexed({x[0]},{x[1]})') + return expr + + + def parse_expr(expr, arrays): + # extract symbols + sym = [] + i = 0 + while i < len(expr): + d = expr[i] + if d == '(': + size = expr[i:].find(')') + d = expr[i : i + size + 1] + d = _einsum.replace_subscript(d, arrays) + sym.append(parse_expr(d)) + i += size + 1 + else: + sym.append(parse_expr(d)) + i += 1 + return sym - @staticmethod - def _append_dim(dim_data, dim_type, idx, label, dim, stride): - if dim_type in dim_data: - data = dim_data[dim_type] - if idx != data["idx"] + 1: - raise ValueError("aggregate inner, outer and batch dims must be adjacent to each other.") - data["dim"] *= dim - data["lab"] = label + data["lab"] - else: - dim_data[dim_type] = dict(idx=idx, lab=label, dim=dim, std=stride) - return dim_type + ############################ + ## Preprocessing + ############################ @staticmethod - def _parse_abc(labels_a, labels_b, labels_c, shape_a, is_a=False): + def pad(tensor, pad): + pad = pad + [0] * (2*len(tensor.shape) - len(pad)) + begin = [ x if x > 0 else None for x in pad[-1::-2]] + end = [-x if x > 0 else None for x in pad[-2::-2]] + slices = [slice(b, e) for b, e in zip(begin, end)] + tensor = torch.nn.functional.pad(tensor, pad, 'constant', 0) + tensor = tensor[slices] + return tensor - if len(labels_a) != len(shape_a): - raise ValueError(f"einsum notation dims do not match shape: {labels_a} {shape_a}") - trans = False - stride = 1 - std1 = None - data = dict() - for idx, (lab, dim) in 
enumerate(reversed(list(zip(labels_a, shape_a)))): - #print(idx, lab, dim) - if dim is None: - raise ValueError("einsum doens't currently work on shapes with placeholder dims.") - if idx == 0 and dim % 8 != 0: - raise ValueError("contiguous dim must be multiple of 8") + ############################ + ## Compilation + ############################ - if lab in labels_c: - # batch dim - if lab in labels_b: - _einsum._append_dim(data, "B", idx, lab, dim, stride) - if idx == 0: - raise ValueError(f"batch dim can not be contiguous dim: {lab} {labels_a} {shape_a}") - # outer dim - else: - std1 = _einsum._append_dim(data, "O", idx, lab, dim, stride) - if idx == 0: - trans = is_a - # inner dim - elif lab in labels_b: - std1 = _einsum._append_dim(data, "I", idx, lab, dim, stride) - if idx == 0: - trans = not is_a - else: - raise ValueError(f"einsum def for output: {lab} ({labels_a}), not present in either other def") + class instance: - stride *= dim + locks = None + kernel_cache = dict() - if "B" not in data: - data["B"] = dict(dim=1, std=1) + def __init__(self, einsum, dtype, stride_a, stride_b, stride_c, shape_a, shape_b, shape_c, arrays): + # parse symbols + expr_a, expr_bc = einsum.split(",") + expr_b, expr_c = expr_bc.split("->") + subscripted = [] + sym_a = _einsum.parse_expr(expr_a, subscripted) + sym_b = _einsum.parse_expr(expr_b, subscripted) + sym_c = _einsum.parse_expr(expr_c, subscripted) + # parse axes + axes_b, axes_m, axes_k = _einsum.parse_axes(sym_a, sym_b, sym_c, subscripted) + _, axes_n, _ = _einsum.parse_axes(sym_b, sym_a, sym_c, subscripted) + axes = axes_b + axes_m + axes_n + axes_k + # check dimensions + dims_a = dict(zip(sym_a, shape_a)) + dims_b = dict(zip(sym_b, shape_b)) + dims_c = dict(zip(sym_c, shape_c)) + for axes in [axes_b, axes_k]: + for d in axes: + dim_a = dims_a[d] if d in sym_a else None + dim_b = dims_b[d] if d in sym_b else None + if dim_a and dim_b and dim_a != dim_b: + raise ValueError(f'incompatible dimension {d}' + f' (a: {dim_a}; b: {dim_b})') + dims = dict() + dims.update(dims_a) + dims.update(dims_b) + dims.update(dims_c) + # look-up tables + TK = 16 if dtype == triton.fw.torch.float16 else 8 + arrays = [(x, arrays[x]) for x in subscripted] + delta_a, lut_mode_a = _einsum.make_delta(axes_k, TK, stride_a, dims, sym_a, arrays) + delta_b, lut_mode_b = _einsum.make_delta(axes_k, TK, stride_b, dims, sym_b, arrays) + # hash for recompilation + stride_a_multiple = max([x for x in [1, 2, 4, 8] if shape_a[-1] % x == 0]) + stride_b_multiple = max([x for x in [1, 2, 4, 8] if shape_b[-1] % x == 0]) + stride_c_multiple = max([x for x in [1, 2, 4, 8] if shape_c[-1] % x == 0]) + name = f'{expr_a}_{expr_b}_{expr_c}_{lut_mode_a}_{lut_mode_b}'\ + f'_{stride_a_multiple}_{stride_b_multiple}_{stride_c_multiple}' + # recompile if necessary + cache = _einsum.instance.kernel_cache + if name not in cache: + cachesize = len(cache) + cache[name] = _einsum.make_kernel(f'__einsum{cachesize}', + sym_a, sym_b, sym_c, + axes_m, axes_n, axes_k, axes_b, + stride_a_multiple, stride_b_multiple, stride_c_multiple, + lut_mode_a, lut_mode_b, + delta_a, delta_b, + subscripted) + self.kernel = cache[name] + # Initialize locks + if _einsum.instance.locks is None: + _einsum.instance.locks = torch.zeros(2*1024*1024, dtype=torch.int32).cuda() + # Kernel arguments + dim_m = [dims[d] for d in axes_m] + dim_n = [dims[d] for d in axes_n] + dim_k = [dims[d] for d in axes_k] + dim_b = [dims[d] for d in axes_b] + M = reduce(mul, dim_m, 1) + N = reduce(mul, dim_n, 1) + K = reduce(mul, dim_k, 1) + B = 
reduce(mul, dim_b, 1) + stride_a = list(stride_a[:-1]) + stride_b = list(stride_b[:-1]) + stride_c = list(stride_c[:-1]) + arrays = [torch.from_numpy(x).cuda() for _, x in arrays] + alpha = 1. + div_m = 1 + self.args = [None, None, None, + _einsum.instance.locks, + alpha, M, N, K, div_m] +\ + dim_m + dim_n + dim_k + dim_b +\ + stride_a + stride_b + stride_c + if lut_mode_a != _einsum.LUT_MODE.CONSTANT: + delta_a = delta_a[0] if lut_mode_a == _einsum.LUT_MODE.SCALAR else torch.from_numpy(delta_a).cuda() + self.args += [delta_a] + if lut_mode_b != _einsum.LUT_MODE.CONSTANT: + delta_b = delta_b[0] if lut_mode_b == _einsum.LUT_MODE.SCALAR else torch.from_numpy(delta_b).cuda() + self.args += [delta_b] + self.args += arrays + self.args += [lambda opt: [triton.cdiv(M, opt.d('TM')) * + triton.cdiv(N, opt.d('TN')), + triton.cdiv(B, opt.d('TB')), + opt.d('TZ')]] + # position of dynamic arguments + self.pos_a = 0 + self.pos_b = 1 + self.pos_c = 2 + # pre-processor macros + TM = [x for x in [16, 32, 64, 128] if x <= M] + TN = [x for x in [16, 32, 64, 128] if x <= N] + TB = [x for x in [1, 2, 4] if x <= B] + MAX_GZ = K // 2048 + MIN_GM = M // max(TM) + MIN_GN = N // max(TN) + MIN_GB = B // max(TB) + TZ = [x for x in [1, 2, 4, 8, 16, 32] \ + if x < MAX_GZ and x*MIN_GM*MIN_GN*MIN_GB < 256] + TZ = [1] if not TZ else [TZ[-1], TZ[-1]*2] + #TB, TZ = [1], [1] + #TM, TN, TB, TZ = [128], [128], [1], [1] + self.macros = { 'TM': TM, 'TN': TN, 'TB': TB, 'TK': TK, 'TZ': TZ, 'TYPE': dtype } + self.dtype = dtype + self.flops = 2 * B * M * N * K + self.sym_a = sym_a + self.sym_b = sym_b + self.sym_c = sym_c + # save equivalent mat-mul dimensions + self.matmul_B = B + self.matmul_M = M + self.matmul_N = N + self.matmul_K = K + + def run(self, a, b, c, bench): + self.args[self.pos_a] = a + self.args[self.pos_b] = b + self.args[self.pos_c] = c + self.kernel(*self.args, bench=bench, **self.macros) - # batch, outer, inner, std0, std1, trans - return data["B"]["dim"], data["O"]["dim"], data["I"]["dim"], data["B"]["std"], data[std1]["std"], trans + + + + ############################ + ## Forward + ############################ + + instance_cache = dict() @staticmethod - def _parse_einsum(labels_a, labels_b, labels_c, shape_a, shape_b): - - dims_a = dict(zip(labels_a, shape_a)) - dims_b = dict(zip(labels_b, shape_b)) - shape_c = list() - for lab in labels_c: - if lab in dims_a: - shape_c.append(dims_a[lab]) - elif lab in dims_b: - shape_c.append(dims_b[lab]) - else: - raise ValueError(f"einsum def for output: {lab} ({labels_c}), not present in either input def ({labels_a}, {labels_b})") - - BA, M, KA, std_a0, std_a1, ta = _einsum._parse_abc(labels_a, labels_b, labels_c, shape_a, True) - BB, N, KB, std_b0, std_b1, tb = _einsum._parse_abc(labels_b, labels_a, labels_c, shape_b, False) - BC, _, _, std_c0, std_c1, _ = _einsum._parse_abc(labels_c, labels_b, labels_a, shape_c) - - if not (BA == BB == BC): - raise ValueError("mismatched batch dims") - if KA != KB: - raise ValueError("mismatched reduction dims") - - return shape_c, (BA, M, N, KA), (std_a0, std_b0, std_c0), (std_a1, std_b1, std_c1), ta, tb - - @staticmethod - def call(a, b, trans_a, trans_b, shape_c, bmnk, - std0, std1, einsum_a, einsum_b, einsum_c, - bench): + def forward(ctx, einsum, a, b, shape_c, **kwargs): + bench = kwargs['bench'] if 'bench' in kwargs else False + arrays = kwargs['arrays'] if 'arrays' in kwargs else dict() + # allocate output dtype = a.dtype - c = triton.empty(shape_c, dtype) - grid = lambda opt: [triton.cdiv(bmnk[1], opt.d('TM')), - 
triton.cdiv(bmnk[2], opt.d('TN')), - triton.cdiv(bmnk[0], opt.d('TB'))] - macros = {# handle A transposition - 'USE_A' : 'a[^1, ^0, ^2]' if trans_a else 'a', - 'STRIDE_AK' : 'std_A1' if trans_a else '1', - 'STRIDE_AM' : '1' if trans_a else 'std_A1', - 'BROADCAST_AK': ':, newaxis, newaxis' if trans_a else 'newaxis, :, newaxis', - 'BROADCAST_AM': 'newaxis, :, newaxis' if trans_a else ':, newaxis, newaxis', - 'SHAPE_A' : 'TK, TM, TB' if trans_a else 'TM, TK, TB', - # handle B transposition - 'USE_B' : 'b' if not trans_b else 'b[^1, ^0, ^2]', - 'STRIDE_BK' : 'std_B1' if not trans_b else '1', - 'STRIDE_BN' : '1' if not trans_b else 'std_B1', - 'BROADCAST_BK': ':, newaxis, newaxis' if not trans_b else 'newaxis, :, newaxis', - 'BROADCAST_BN': 'newaxis, :, newaxis' if not trans_b else ':, newaxis, newaxis', - 'SHAPE_B' : 'TK, TN, TB' if not trans_b else 'TN, TK, TB'} - TM = [2**i for i in range(5, max(6, min(8, int(math.log2(bmnk[1]) + 1 ))))] - TN = [2**i for i in range(5, max(6, min(8, int(math.log2(bmnk[2]) + 1 ))))] - TB = [2**i for i in range(0, max(1, min(3, int(math.log2(bmnk[0]) + 1 ))))] - TK = [bmnk[2]] if bmnk[2] < 16 else [8, 16] - _einsum.kernel(a, b, c, - bmnk[1], bmnk[2], bmnk[3], - std0[0], std0[1], std0[2], - std1[0], std1[1], std1[2], - grid, bench=bench, - **macros, - TYPE=dtype, TM=TM, TN=TN, TK=TK, TB=TB) + c = triton.empty(shape_c, dtype=dtype) + key = (einsum, dtype, + a.stride(), b.stride(), c.stride(), + a.shape, b.shape, c.shape) + # compile einsum instance + cache = _einsum.instance_cache + #if key not in cache: + cache[key] = _einsum.instance(einsum, dtype, + a.stride(), b.stride(), c.stride(), + a.shape, b.shape, c.shape, arrays) + instance = cache[key] + instance.run(a, b, c, bench) + # save information in context + ctx.flops = instance.flops + ctx.sym_a = instance.sym_a + ctx.sym_b = instance.sym_b + ctx.sym_c = instance.sym_c + ctx.matmul_B = instance.matmul_B + ctx.matmul_M = instance.matmul_M + ctx.matmul_N = instance.matmul_N + ctx.matmul_K = instance.matmul_K + ctx.bench = bench + ctx.save_for_backward(a, b) return c + ############################ + ## Backward + ############################ @staticmethod - def forward(ctx, subscripts, a, b, bench = 0): - ctx.save_for_backward(a, b) - # parse - if type(subscripts) is str: - einsum_a, einsum_bc = subscripts.split(",") - einsum_b, einsum_c = einsum_bc.split("->") - else: - einsum_a, einsum_b, einsum_c = subscripts - shape_c, bmnk, std0, std1, ta, tb = _einsum._parse_einsum( - einsum_a, einsum_b, einsum_c, - triton.shape(a), triton.shape(b)) - # save for backward - ctx.trans_a = ta - ctx.trans_b = tb - ctx.einsum_a = einsum_a - ctx.einsum_b = einsum_b - ctx.einsum_c = einsum_c - ctx.bench = bench - ctx.bmnk = bmnk - # run - return _einsum.call(a, b, ta, tb, shape_c, bmnk, std0, std1, einsum_a, einsum_b, einsum_c, bench) - + def sym_invert(sym_c, sym_x, prefix, renamed, inverse): + for i, expr in enumerate(sym_x): + if expr.is_symbol: + continue + sc = [x for x in expr.free_symbols if x in sym_c][0] + sx = sp.symbols(f'{prefix}{i}') + renamed[expr] = sx + inverse[sc] = sp.solve(sp.Eq(expr, sx), sc)[0] @staticmethod - def backward(ctx, dc): + def sym_to_expr(sym): + res = [f'({x})' for x in sym] + res = ''.join(res) + return res + + @staticmethod + def backward(ctx, dy): a, b = ctx.saved_tensors - trans_a = ctx.trans_a - trans_b = ctx.trans_b - einsum_a = ctx.einsum_a - einsum_b = ctx.einsum_b - einsum_c = ctx.einsum_c - bench = ctx.bench + sym_a = ctx.sym_a + sym_b = ctx.sym_b + sym_c = ctx.sym_c + inverse = dict() 
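        # Note on the new backward pass: instead of dispatching on transposition
        # flags, the gradient einsum is now derived symbolically. sym_invert
        # (defined above) renames each composite index expression of an input to
        # a fresh symbol and solves for the output symbol it contains -- e.g.
        # for a hypothetical index expression 2*h + r renamed to a0,
        # sp.solve(sp.Eq(2*h + r, a0), h)[0] yields (a0 - r)/2 -- so that da
        # below reduces to a single einsum, f'{expr_c},{expr_b}->{expr_a}'.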
+ renamed = dict() + _einsum.sym_invert(sym_c, sym_a, 'a', renamed, inverse) + _einsum.sym_invert(sym_c, sym_b, 'b', renamed, inverse) + sym_a = [renamed[x] if x in renamed else x for x in sym_a] + sym_b = [renamed[x] if x in renamed else x for x in sym_b] + sym_c = [inverse[x] if x in inverse else x for x in sym_c] + expr_a = _einsum.sym_to_expr(sym_a) + expr_b = _einsum.sym_to_expr(sym_b) + expr_c = _einsum.sym_to_expr(sym_c) + expr = f'{expr_c},{expr_b}->{expr_a}' + da = einsum(expr, dy, b, a.shape, False) + return None, da, None, None, None - if not trans_a and not trans_b: # NN - da = einsum((einsum_c, einsum_b, einsum_a), dc, b, bench) - db = einsum((einsum_a, einsum_c, einsum_b), a, dc, bench) - elif not trans_a and trans_b: # NT - da = einsum((einsum_c, einsum_b, einsum_a), dc, b, bench) - db = einsum((einsum_c, einsum_a, einsum_b), dc, a, bench) - - elif trans_a and not trans_b: # TN - da = einsum((einsum_b, einsum_c, einsum_a), b, dc, bench) - db = einsum((einsum_a, einsum_c, einsum_b), a, dc, bench) - - elif trans_a and trans_b: # TT (not used) - da = einsum((einsum_b, einsum_c, einsum_a), b, dc, bench) - db = einsum((einsum_c, einsum_a, einsum_b), dc, a, bench) - - return None, da, db, None einsum = _einsum.apply \ No newline at end of file diff --git a/python/triton/utils.py b/python/triton/utils.py index 0b012af3f..117f69136 100644 --- a/python/triton/utils.py +++ b/python/triton/utils.py @@ -24,7 +24,7 @@ def empty(shape, dtype): return tf_empty_proxy(shape, dtype) #return fw.tf_extra_ops.alloc_empty(args, T = dtype) elif fw.has_torch(): - return fw.torch.empty(shape).cuda() + return fw.torch.empty(shape, dtype=dtype).cuda() def shape(A) : if fw.has_tensorflow(): @@ -47,16 +47,23 @@ class id_dict: return libtriton.retrieve_scalar(self.id) def __init__(self): - self.data = weakref.WeakKeyDictionary() + self.data = dict() def __delitem__(self, key): del self.data[key] - def __getitem__(self, key): + @staticmethod + def _get_key(key): if fw.has_tensorflow(): if isinstance(key, fw.tensorflow.Tensor): - key = key.op - ret = self.data[key] + key = id(key.op) + if fw.has_torch(): + if isinstance(key, fw.torch.Tensor): + key = id(key) + return key + + def __getitem__(self, key): + ret = self.data[id_dict._get_key(key)] if isinstance(ret, id_dict.lazy_entry): return ret.get() return ret @@ -65,7 +72,4 @@ class id_dict: return len(self.data) def __setitem__(self, key, value): - if fw.has_tensorflow(): - if isinstance(key, fw.tensorflow.Tensor): - key = key.op - self.data[key] = value \ No newline at end of file + self.data[id_dict._get_key(key)] = value \ No newline at end of file diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index 876ce0962..79718a232 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -10,10 +10,12 @@ int main() { typedef std::tuple, bool, bool, int, int, int> config_t; std::vector configs; for(auto ord: std::vector>{{1, 0}}) - for(auto x: std::vector>{{false, false}, {false, true}, - {true, false}}){ + for(auto x: std::vector>{{false, false}}){ std::vector tmp = { - config_t{ord, x[0], x[1], 2048, 2048, 2048}, +// config_t{ord, x[0], x[1], 512, 512, 512}, +// config_t{ord, x[0], x[1], 1024, 1024, 1024}, + config_t{ord, x[0], x[1], 127008, 768, 576}, +// config_t{ord, x[0], x[1], 8192, 8192, 8192} // config_t{ord, x[0], x[1], 16, 2048, 2048}, // config_t{ord, x[0], x[1], 32, 2048, 2048}, // config_t{ord, x[0], x[1], 64, 2048, 2048}, @@ -33,7 +35,7 @@ int main() { int32_t M, N, K; for(const auto& c: configs){ std::tie(ord, AT, BT, M, N, K) = c; - 
std::cout << "// " << c << std::flush; + std::cout << "// " << c ; for(auto perf: bench_dot(stream, HALF, AT, BT, M, N, K, ord, ord)) std::cout << ", " << perf << std::flush; std::cout << std::endl; diff --git a/tests/common/dot.h b/tests/common/dot.h index a157d7994..427e7ca04 100644 --- a/tests/common/dot.h +++ b/tests/common/dot.h @@ -20,7 +20,7 @@ static void cc_dot(std::vector &c, const std::vector &a, const std::vector float acc = 0; for(size_t k = 0; k < K; k++) acc = acc + (!AT ? a[k*M + m] : a[m*K + k]) * (!BT ? b[n*K + k] : b[k*N + n]); - c[m + n*M] = static_cast(acc); + c[m*N + n] = static_cast(acc); } } @@ -72,9 +72,9 @@ bool triton_dot(drv::stream* stream, bool AT, bool BT, std::string ty = to_string::value; size_t dt_nbytes = sizeof(T); drv::context* context = stream->context(); - int32_t lda = AT ? K : M; - int32_t ldb = BT ? N : K; - int32_t ldc = M; + int32_t lda = (AT ^ a_order[0]==1) ? K : M; + int32_t ldb = (BT ^ b_order[0]==1) ? N : K; + int32_t ldc = N; std::vector sa = { "1", "lda" }; std::vector sb = { "1", "ldb" }; @@ -86,17 +86,17 @@ bool triton_dot(drv::stream* stream, bool AT, bool BT, // macros rt::function::options_space_t opt; // A access patterns - opt.defines.push_back({"USEA", {AT? "a[^1, ^0]" : "a" }}); - opt.defines.push_back({"BROADCAST_AK", {AT? ":, newaxis" : "newaxis, :" }}); - opt.defines.push_back({"BROADCAST_AM", {AT? "newaxis, :" : ":, newaxis" }}); - opt.defines.push_back({"SHAPE_A", {AT? "TK, TM" : "TM, TK" }}); + opt.defines.push_back({"USEA", {AT? "a" : "a" }}); + opt.defines.push_back({"BROADCAST_AK", {AT? "newaxis, :" : "newaxis, :" }}); + opt.defines.push_back({"BROADCAST_AM", {AT? ":, newaxis" : ":, newaxis" }}); + opt.defines.push_back({"SHAPE_A", {AT? "TM, TK" : "TM, TK" }}); opt.defines.push_back({"STRIDE_AK", {AT? sa[a_order[0]] : sa[a_order[1]] }}); opt.defines.push_back({"STRIDE_AM", {AT? sa[a_order[1]] : sa[a_order[0]] }}); // B access patterns - opt.defines.push_back({"USEB", {BT? "b[^1, ^0]" : "b" }}); - opt.defines.push_back({"BROADCAST_BK", {BT? "newaxis, :" : ":, newaxis" }}); - opt.defines.push_back({"BROADCAST_BN", {BT? ":, newaxis" : "newaxis, :" }}); - opt.defines.push_back({"SHAPE_B", {BT? "TN, TK" : "TK, TN" }}); + opt.defines.push_back({"USEB", {BT? "b" : "b" }}); + opt.defines.push_back({"BROADCAST_BK", {BT? ":, newaxis" : ":, newaxis" }}); + opt.defines.push_back({"BROADCAST_BN", {BT? "newaxis, :" : "newaxis, :" }}); + opt.defines.push_back({"SHAPE_B", {BT? "TK, TN" : "TK, TN" }}); opt.defines.push_back({"STRIDE_BK", {BT? sb[b_order[1]] : sb[b_order[0]] }}); opt.defines.push_back({"STRIDE_BN", {BT? sb[b_order[0]] : sb[b_order[1]] }}); // data-type @@ -109,15 +109,15 @@ bool triton_dot(drv::stream* stream, bool AT, bool BT, opt.num_warps = {nwarp}; } if(mode == BENCH) { - opt.defines.push_back({"TM", {"128"}}); - opt.defines.push_back({"TN", {"128"}}); - opt.defines.push_back({"TK", {"16"}}); - opt.num_warps = {4}; + opt.defines.push_back({"TM", {"32", "64", "128"}}); + opt.defines.push_back({"TN", {"32", "64", "128"}}); + opt.defines.push_back({"TK", {to_string::value == "half" ? 
"16" : "8"}}); + opt.num_warps = {2, 4, 8}; } // kernels rt::function function(src::dot, opt); - std::vector args = {&*da, &*db, &*dc, M, N, K, lda, ldb, ldc}; + std::vector args = {&*da, &*db, &*dc, (float)1, M, N, K, lda, ldb, ldc}; auto grid = grid2d(M, N); // metrics @@ -126,17 +126,17 @@ bool triton_dot(drv::stream* stream, bool AT, bool BT, double triton_ns = triton::tools::bench([&]() { function(args, grid, stream);}, stream); bench.push_back(tflops(triton_ns)); - // // cublas - // if(cublas::cublasinit()){ - // NumericT alpha(static_cast(1)); - // NumericT beta(static_cast(0)); - // cublasGemmAlgo_t fastest; - // cublasGemm(cuty, stream, AT, BT, M, N, K, &alpha, &*da, lda, &*db, ldb, &beta, &*dc, ldc, &fastest); - // double cublas_ms = triton::tools::bench([&]() { cublasGemm(cuty, stream, AT, BT, M, N, K, - // &alpha, &*da, lda, &*db, ldb, &beta, &*dc, - // ldc, nullptr, fastest); }, stream); - // result.push_back(tflops(cublas_ms)); - // } +// // cublas +// if(cublas::cublasinit()){ +// T alpha(static_cast(1)); +// T beta(static_cast(0)); +// cublasGemmAlgo_t fastest; +// cublasGemm(CUDA_R_32F, stream, AT, BT, M, N, K, &alpha, &*da, lda, &*db, ldb, &beta, &*dc, ldc, &fastest); +// double cublas_ms = triton::tools::bench([&]() { cublasGemm(CUDA_R_16F, stream, AT, BT, M, N, K, +// &alpha, &*da, lda, &*db, ldb, &beta, &*dc, +// ldc, nullptr, fastest); }, stream); +// bench.push_back(tflops(cublas_ms)); +// } } // test triton @@ -147,9 +147,9 @@ bool triton_dot(drv::stream* stream, bool AT, bool BT, std::vector ha(M*K); std::vector hb(K*N); for(size_t i = 0; i < ha.size(); i++) - ha[i] = 1; + ha[i] = (float)rand()/RAND_MAX; for(size_t i = 0; i < hb.size(); i++) - hb[i] = 1; + hb[i] = (float)rand()/RAND_MAX; // copy buffer stream->write(&*da, true, 0, ha); stream->write(&*db, true, 0, hb); diff --git a/tests/common/src/dot.h b/tests/common/src/dot.h index 7c368e593..4dcab1efc 100644 --- a/tests/common/src/dot.h +++ b/tests/common/src/dot.h @@ -2,37 +2,58 @@ namespace src { const char *dot = R"( -void dot(TYPE * A, TYPE * B, TYPE * C, - int M, int N, int K, - int lda __multipleof(8), - int ldb __multipleof(8), - int ldc) { - // prologue - int ridx = get_program_id(0); - int ridy = get_program_id(1); - int rm[TM] = ridx * TM + 0 ... TM; - int rn[TN] = ridy * TN + 0 ... TN; - int rk[TK] = 0 ... TK; - float c[TM, TN] = 0; - // pointers to operands - TYPE* pa[SHAPE_A] = A + rk[BROADCAST_AK] * STRIDE_AK + rm[BROADCAST_AM] * STRIDE_AM; - TYPE* pb[SHAPE_B] = B + rk[BROADCAST_BK] * STRIDE_BK + rn[BROADCAST_BN] * STRIDE_BN; - // prefetches operands - TYPE a[SHAPE_A] = *pa; - TYPE b[SHAPE_B] = *pb; - // reduction loop - for(int k = K; k > 0; k-= TK){ - c += USEA @ USEB; - pa += TK * STRIDE_AK; - pb += TK * STRIDE_BK; - bool checka[SHAPE_A] = k > TK; - bool checkb[SHAPE_B] = k > TK; - a = checka ? *pa : 0; - b = checkb ? *pb : 0; - } - // epilogue - TYPE* pc[TM, TN] = C + rm[:, newaxis] + rn[newaxis, :] * ldc; - *pc = c; +__global__ void dot(TYPE * A __noalias __readonly __aligned(16), + TYPE * B __noalias __readonly __aligned(16), + TYPE * C __noalias __aligned(16), + float alpha, + int M, int N, int K, + int lda __multipleof(8), + int ldb __multipleof(8), + int ldc __multipleof(8)) { + // prologue + int ridx = get_program_id(0); + int ridy = get_program_id(1); + int gridx = M / TM; + int gridy = N / TN; + int rid = ridx + ridy * gridx; + ridx = rid / gridy; + ridy = rid % gridy; + int rm[TM] = ridx * TM + 0 ... TM; + int rn[TN] = ridy * TN + 0 ... TN; + int rk[TK] = 0 ... 
TK; + + // pointers to operands + int offa[SHAPE_A] = rk[BROADCAST_AK] * STRIDE_AK + rm[BROADCAST_AM] * STRIDE_AM; + int offb[SHAPE_B] = rk[BROADCAST_BK] * STRIDE_BK + rn[BROADCAST_BN] * STRIDE_BN; + TYPE* pa[SHAPE_A] = A + offa; + TYPE* pb[SHAPE_B] = B + offb; + + // prefetches operands + bool checka[SHAPE_A] = rk[BROADCAST_AK] < K; + bool checkb[SHAPE_B] = rk[BROADCAST_BK] < K; + TYPE a[SHAPE_A] = checka ? *pa : 0; + TYPE b[SHAPE_B] = checkb ? *pb : 0; + + // reduction loop + float c[TM, TN] = 0; + for(int k = K; k > 0; k -= TK){ + c += USEA @ USEB; + bool checka[SHAPE_A] = k > TK; + bool checkb[SHAPE_B] = k > TK; + pa += TK * STRIDE_AK; + pb += TK * STRIDE_BK; + a = *?(checka)pa; + b = *?(checkb)pb; + } + //c = c * alpha; + + // epilogue + int rxm[TM] = get_program_id(0) * TM + 0 ... TM; + int rxn[TN] = get_program_id(1) * TN + 0 ... TN; + int offc[TM, TN] = rxm[:, newaxis] * ldc + rxn[newaxis, :]; + TYPE* pc[TM, TN] = C + offc; + bool checkc[TM, TN] = (rxm[:, newaxis] < M) && (rxn[newaxis, :] < N); + *?(checkc)pc = (TYPE[TM, TN])c; } )"; diff --git a/tests/common/util.h b/tests/common/util.h index 0a7788195..89489f889 100644 --- a/tests/common/util.h +++ b/tests/common/util.h @@ -159,7 +159,7 @@ bool diff(const std::vector& hc, const std::vector& rc) { if(hc.size() != rc.size()) return false; for(size_t i = 0; i < hc.size(); i++) - if(std::isinf(hc[i]) || std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-4){ + if(std::isinf(hc[i]) || std::isnan(hc[i]) || std::abs(hc[i] - rc[i])/std::max(hc[i], rc[i]) > 1e-2){ std::cout << i << " " << hc[i] << " " << rc[i] << std::endl; return false; } diff --git a/tests/unit/dot.cc b/tests/unit/dot.cc index dec01dc21..283951377 100644 --- a/tests/unit/dot.cc +++ b/tests/unit/dot.cc @@ -10,8 +10,8 @@ int main() { // shapes to test typedef std::tuple config_t; std::vector configs; - for(int TM: std::vector{32, 64}) - for(int TN: std::vector{32, 64}) + for(int TM: std::vector{32, 64, 128}) + for(int TN: std::vector{32, 64, 128}) for(int TK: std::vector{16}) for(int nwarps: std::vector{4}) for(bool AT: std::array{false, true}) From fbf2a3f56fd9d0dd48519c8b01e484e77a7aeb43 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 19 Jan 2020 19:58:40 -0500 Subject: [PATCH 488/494] [CODEGEN][TRANSFORM] some bug-fixes for FP32 einsum --- include/triton/codegen/selection/machine_value.h | 1 - lib/codegen/selection/machine_value.cc | 7 +------ lib/codegen/transform/cts.cc | 2 ++ python/examples/einsum.py | 2 +- 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/include/triton/codegen/selection/machine_value.h b/include/triton/codegen/selection/machine_value.h index 917151971..aab1f023a 100644 --- a/include/triton/codegen/selection/machine_value.h +++ b/include/triton/codegen/selection/machine_value.h @@ -125,7 +125,6 @@ class distributed_tile: public tile{ private: void init_indices(); - Type *make_vector_ty(Type *ty, size_t vector_size); public: distributed_tile(Type *ty, const shapes_t& shapes, const std::vector& order, const axes_t &axes, Builder &builder); diff --git a/lib/codegen/selection/machine_value.cc b/lib/codegen/selection/machine_value.cc index 72aace4b2..a94661b90 100644 --- a/lib/codegen/selection/machine_value.cc +++ b/lib/codegen/selection/machine_value.cc @@ -15,7 +15,7 @@ void distributed_tile::init_indices() { std::vector order(id.size()); std::iota(order.begin(), order.end(), 0); auto cmp = [&](int x, int y) { - return axes_[x].contiguous > axes_[y].contiguous; + return order_[x] < order_[y]; }; 
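// Note on the comparator change above: as part of the FP32 einsum bug-fixes,
// the iteration order of a distributed tile now comes straight from the
// layout's order_ permutation instead of being re-derived from the per-axis
// `contiguous` hints. Reading the new comparator: with, say, order_ = {1, 0},
// axis 1 sorts before axis 0, and the index walk below then advances that
// axis fastest.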
std::sort(order.begin(), order.end(), cmp); // build @@ -39,11 +39,6 @@ void distributed_tile::init_indices() { } } -llvm::Type *distributed_tile::make_vector_ty(llvm::Type *ty, size_t vector_size) { - if(vector_size == 1) - return ty; - return VectorType::get(ty, vector_size); -} distributed_tile::distributed_tile(Type *ty, const shapes_t &shapes, const std::vector& order, const axes_t &axes, llvm::IRBuilder<> &builder) : tile(ty, shapes), axes_(axes), order_(order), builder_(builder) { diff --git a/lib/codegen/transform/cts.cc b/lib/codegen/transform/cts.cc index f98b685e1..ae2791cc8 100644 --- a/lib/codegen/transform/cts.cc +++ b/lib/codegen/transform/cts.cc @@ -15,6 +15,8 @@ inline bool is_shmem_op(ir::instruction* i, int op) { return op==0 || op==1; if(i->get_id() == ir::INST_COPY_FROM_SHARED) return op==0; + if(i->get_id() == ir::INST_TRANS) + return op==0; return false; } diff --git a/python/examples/einsum.py b/python/examples/einsum.py index 2cbf2ca10..a3fdba5e0 100644 --- a/python/examples/einsum.py +++ b/python/examples/einsum.py @@ -168,7 +168,7 @@ for N, C, H, W, K, R, S in NCHWKRS: # Benchmark torch.set_num_threads(1) for a_shape, b_shape, c_shape, torch_fn, expr, arrays in configs: - dtype = torch.cuda.HalfTensor + dtype = torch.cuda.FloatTensor # initialize input tensors a = torch.rand(*a_shape).type(dtype).cuda() b = torch.rand(*b_shape).type(dtype).cuda() From 382ca2c74576881a15d60d70cf6c45e4b9ef5564 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 20 Jan 2020 11:55:34 -0500 Subject: [PATCH 489/494] [CODEGEN][ANALYSIS] cleaning: moving towards better polymorphism for tile layouts --- include/triton/codegen/analysis/layout.h | 48 ++++++++--------- include/triton/codegen/analysis/liveness.h | 4 +- .../triton/codegen/selection/machine_layout.h | 5 +- lib/codegen/analysis/allocation.cc | 34 ++++++------ lib/codegen/analysis/layout.cc | 54 +++++++++++-------- lib/codegen/analysis/liveness.cc | 4 +- lib/codegen/selection/generator.cc | 25 ++++----- lib/codegen/selection/machine_layout.cc | 24 +++++++-- lib/codegen/selection/machine_value.cc | 15 ++---- lib/codegen/transform/membar.cc | 16 +++--- tests/bench/dot.cc | 10 ++-- 11 files changed, 130 insertions(+), 109 deletions(-) diff --git a/include/triton/codegen/analysis/layout.h b/include/triton/codegen/analysis/layout.h index e0eee3a38..074bfb27c 100644 --- a/include/triton/codegen/analysis/layout.h +++ b/include/triton/codegen/analysis/layout.h @@ -50,31 +50,31 @@ public: virtual void visit_layout_shared(layout_shared_t*) = 0; }; +class layout_hmma_884_t; +class layout_scanline_t; +class layout_shared_t; + struct layout_t { layout_t(layout_type_t _type, const std::vector& _axes, const std::vector &_shapes, const std::vector &_values, ir::type *_ty, - size_t _id, analysis::align* align); - + // visitor virtual void accept(layout_visitor* vst) = 0; + // downcast + layout_hmma_884_t* to_hmma884(); + layout_scanline_t* to_scanline(); + layout_shared_t* to_shared(); + layout_type_t type; std::vector axes; std::vector shapes; std::vector values; std::vector order; - size_t id; - size_t size; - std::shared_ptr double_buffer; ir::type *ty; - size_t pad; - std::vector mts; - std::vector nts; - std::vector fpw; - std::vector wpt; }; struct layout_hmma_884_t: public layout_t { @@ -83,9 +83,11 @@ struct layout_hmma_884_t: public layout_t { const std::vector& _shapes, const std::vector &_values, ir::type *_ty, - size_t _id, analysis::align* align); void accept(layout_visitor* vst) { vst->visit_layout_hmma_884(this); } + + 
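// The members below illustrate the point of this refactor: fpw (fragments
// per warp, per the comment in layout.cc) and wpt are only meaningful for
// HMMA tiles, so they move out of the old layout_t base into this subclass,
// just as mts/nts move into layout_scanline_t and size/double_buffer into
// layout_shared_t; callers reach them through the to_hmma884()/to_scanline()/
// to_shared() downcast helpers declared above.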
std::vector fpw; + std::vector wpt; }; struct layout_scanline_t: public layout_t { @@ -94,9 +96,11 @@ struct layout_scanline_t: public layout_t { const std::vector& _shapes, const std::vector &values, ir::type *_ty, - size_t _id, analysis::align* align); void accept(layout_visitor* vst) { vst->visit_layout_scanline(this); } + + std::vector mts; + std::vector nts; }; struct layout_shared_t: public layout_t { @@ -105,9 +109,11 @@ struct layout_shared_t: public layout_t { const std::vector& _shapes, const std::vector &values, ir::type *ty, - size_t _id, analysis::align* align); void accept(layout_visitor* vst) { vst->visit_layout_shared(this); } + + std::shared_ptr double_buffer; + size_t size; }; @@ -126,18 +132,6 @@ private: void create(size_t id, const std::vector& values); -// size_t shared_tmp_req(ir::instruction* i) { -// switch(i->get_id()) { -// case ir::INST_REDUCE: { -// ir::reduce_inst *red = (ir::reduce_inst*)i; -// ir::type *ty = red->get_type(); - - -// } -// default: return 0; -// } -// } - public: // constructor layout(analysis::axes *axes, analysis::align *align, size_t num_warps); @@ -146,8 +140,8 @@ public: unsigned layout_of(ir::value *value) const; const std::vector& values_of(unsigned id) const; size_t num_layouts() const; - const layout_t* get(size_t id) const; - const layout_t* get(ir::value *v) const; + layout_t* get(size_t id); + layout_t* get(ir::value *v); std::map &get_all(); size_t tmp(ir::instruction* i); diff --git a/include/triton/codegen/analysis/liveness.h b/include/triton/codegen/analysis/liveness.h index 4d5fa3e91..e0158dc8a 100644 --- a/include/triton/codegen/analysis/liveness.h +++ b/include/triton/codegen/analysis/liveness.h @@ -42,14 +42,14 @@ struct segment { class liveness { private: - typedef std::map intervals_map_t; + typedef std::map intervals_map_t; public: // constructor liveness(layout *l): layouts_(l){ } // accessors const intervals_map_t& get() const { return intervals_; } - segment get(layout_t* v) const { return intervals_.at(v); } + segment get(layout_shared_t* v) const { return intervals_.at(v); } // run void run(ir::module &mod); diff --git a/include/triton/codegen/selection/machine_layout.h b/include/triton/codegen/selection/machine_layout.h index a3b453995..5ea34f3f3 100644 --- a/include/triton/codegen/selection/machine_layout.h +++ b/include/triton/codegen/selection/machine_layout.h @@ -71,7 +71,8 @@ public: class machine_layout_shared_t: public machine_layout_t { public: - machine_layout_shared_t(Module *mod, Builder *builder, target *tgt, analysis::allocation* alloc, Value *&sh_mem_ptr, analysis::layout_t* layout, + machine_layout_shared_t(Module *mod, Builder *builder, target *tgt, analysis::allocation* alloc, Value *&sh_mem_ptr, + analysis::layout_shared_t* layout, std::map& vmap, std::map& tmap); @@ -82,7 +83,7 @@ public: target *tgt_; analysis::allocation* alloc_; Value *&sh_mem_ptr_; - analysis::layout_t* layout_; + analysis::layout_shared_t* layout_; std::map& vmap_; std::map& tmap_; diff --git a/lib/codegen/analysis/allocation.cc b/lib/codegen/analysis/allocation.cc index b92b5bd44..0cff27640 100644 --- a/lib/codegen/analysis/allocation.cc +++ b/lib/codegen/analysis/allocation.cc @@ -15,22 +15,22 @@ void allocation::run(ir::module &mod) { using std::min; typedef std::multimap triples_map_type; - std::vector I; + std::vector I; for(auto x: liveness_->get()) I.push_back(x.first); - std::vector J = I; + std::vector J = I; triples_map_type H; H.insert({0, segment{0, INT_MAX}}); - std::vector V; - std::map starts; + std::vector 
V; + std::map starts; while(!J.empty()){ auto h_it = H.begin(); unsigned w = h_it->first; segment xh = h_it->second; H.erase(h_it); - auto j_it = std::find_if(J.begin(), J.end(), [&](layout_t* JJ){ + auto j_it = std::find_if(J.begin(), J.end(), [&](layout_shared_t* JJ){ segment xj = liveness_->get(JJ); bool res = xj.intersect(xh); for(auto val: H) @@ -52,10 +52,10 @@ void allocation::run(ir::module &mod) { } // Build interference graph - std::map> interferences; - for(layout_t* x: V) - for(layout_t* y: V){ - if(x->id == y->id) + std::map> interferences; + for(layout_shared_t* x: V) + for(layout_shared_t* y: V){ + if(x == y) continue; unsigned X0 = starts[x], Y0 = starts[y]; unsigned NX = x->size; @@ -68,17 +68,17 @@ void allocation::run(ir::module &mod) { } // Initialize colors - std::map colors; - for(layout_t* X: V) - colors[X] = (X->id==V[0]->id)?0:-1; + std::map colors; + for(layout_shared_t* X: V) + colors[X] = (X==V[0])?0:-1; // First-fit graph coloring std::vector available(V.size()); - for(layout_t* x: V){ + for(layout_shared_t* x: V){ // Non-neighboring colors are available std::fill(available.begin(), available.end(), true); - for(layout_t* Y: interferences[x]){ + for(layout_shared_t* Y: interferences[x]){ int color = colors[Y]; if(color >= 0) available[color] = false; @@ -89,16 +89,16 @@ void allocation::run(ir::module &mod) { } // Finalize allocation - for(layout_t* x: V){ + for(layout_shared_t* x: V){ unsigned Adj = 0; - for(layout_t* y: interferences[x]) + for(layout_shared_t* y: interferences[x]) Adj = std::max(Adj, starts[y] + y->size); offsets_[x] = starts[x] + colors[x] * Adj; } // Save maximum size of induced memory space allocated_size_ = 0; - for(layout_t* x: V) + for(layout_shared_t* x: V) allocated_size_ = std::max(allocated_size_, starts[x] + x->size); } diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc index 6d7c2dc9c..2136d4162 100644 --- a/lib/codegen/analysis/layout.cc +++ b/lib/codegen/analysis/layout.cc @@ -75,11 +75,11 @@ bool is_hmma_c(ir::value *v){ return result; } -const layout_t* layout::get(size_t id) const { +layout_t* layout::get(size_t id) { return layouts_.at(id); } -const layout_t* layout::get(ir::value *v) const { +layout_t* layout::get(ir::value *v) { return layouts_.at(groups_.at(v)); } @@ -140,8 +140,7 @@ layout_t::layout_t(layout_type_t _type, const std::vector &_axes, const std::vector &_shapes, const std::vector &_values, ir::type *_ty, - size_t _id, - analysis::align* align): type(_type), axes(_axes), shapes(_shapes), values(_values), id(_id), ty(_ty) { + analysis::align* align): type(_type), axes(_axes), shapes(_shapes), values(_values), ty(_ty) { // io pointer std::set ptr; for(ir::value* v: values) @@ -159,6 +158,21 @@ layout_t::layout_t(layout_type_t _type, } } +// downcast +layout_hmma_884_t* layout_t::to_hmma884() { + assert(type == HMMA_884); + return static_cast(this); +} + +layout_scanline_t* layout_t::to_scanline() { + assert(type == SCANLINE); + return static_cast(this); +} + +layout_shared_t* layout_t::to_shared() { + assert(type == SHARED); + return static_cast(this); +} inline unsigned clamp(unsigned x, unsigned lo, unsigned hi) { return std::min(std::max(x, lo), hi); @@ -167,8 +181,8 @@ inline unsigned clamp(unsigned x, unsigned lo, unsigned hi) { layout_hmma_884_t::layout_hmma_884_t(size_t num_warps, const std::vector& _axes, const std::vector& _shapes, - const std::vector &values, ir::type *_ty, size_t _id, - analysis::align* align): layout_t(HMMA_884, _axes, _shapes, values, _ty, _id, align) { + 
const std::vector &values, ir::type *_ty, + analysis::align* align): layout_t(HMMA_884, _axes, _shapes, values, _ty, align) { unsigned shape_0 = shapes[0]; unsigned shape_1 = shapes[1]; /* fragments per warp */ @@ -210,8 +224,7 @@ layout_scanline_t::layout_scanline_t(size_t num_warps, const std::vector& _axes, const std::vector& _shapes, const std::vector &values, ir::type *_ty, - size_t _id, - analysis::align* align): layout_t(SCANLINE, _axes, _shapes, values, _ty, _id, align){ + analysis::align* align): layout_t(SCANLINE, _axes, _shapes, values, _ty, align){ unsigned size = std::accumulate(shapes.begin(), shapes.end(), 1, std::multiplies()); unsigned num_threads = num_warps * 32; nts.resize(shapes.size()); @@ -295,8 +308,7 @@ layout_shared_t::layout_shared_t(const layout_t *arg, const std::vector& _shapes, const std::vector &values, ir::type *ty, - size_t _id, - analysis::align* align): layout_t(SHARED, _axes, _shapes, values, ty, _id, align) { + analysis::align* align): layout_t(SHARED, _axes, _shapes, values, ty, align) { size = 0; @@ -335,7 +347,7 @@ layout_shared_t::layout_shared_t(const layout_t *arg, // else // order = row; // padding - pad = 0; + size_t pad = 0; if(hmma_dot_a){ bool row = is_trans(hmma_dot_a) ^ order[0] != 0; pad = 24 - shapes[row ? 0 : 1] % 32; @@ -375,15 +387,15 @@ void layout::create(size_t id, const std::vector& values) { }); // type if(it_hmma_c != values.end()) - layouts_[id] = new layout_hmma_884_t(num_warps_, axes, shapes, values, largest->get_type()->get_scalar_ty(), id, align_); + layouts_[id] = new layout_hmma_884_t(num_warps_, axes, shapes, values, largest->get_type()->get_scalar_ty(), align_); else if(it_cts != values.end()){ ir::copy_to_shared_inst *cts = (ir::copy_to_shared_inst*)*it_cts; ir::value *arg = cts->get_operand(0); create(groups_.at(arg), values_.at(groups_.at(arg))); - layouts_[id] = new layout_shared_t(get(arg), axes, shapes, values, largest->get_type()->get_scalar_ty(), id, align_); + layouts_[id] = new layout_shared_t(get(arg), axes, shapes, values, largest->get_type()->get_scalar_ty(), align_); } else - layouts_[id] = new layout_scanline_t(num_warps_, axes, shapes, values, largest->get_type()->get_scalar_ty(), id, align_); + layouts_[id] = new layout_scanline_t(num_warps_, axes, shapes, values, largest->get_type()->get_scalar_ty(), align_); } void layout::run(ir::module &mod) { @@ -410,18 +422,18 @@ void layout::run(ir::module &mod) { // shape auto shapes = arg->get_type()->get_tile_shapes(); unsigned shape_ax = shapes[axis]; - const layout_t *layout = get(arg); + layout_scanline_t *layout = get(arg)->to_scanline(); unsigned per_thread = layout->nts[axis]; unsigned depth = shape_ax / per_thread; shapes[axis] = depth; // create layout - layouts_[id] = new layout_shared_t(layout, axes_->get(arg), shapes, {red}, red->get_type()->get_scalar_ty(), id, align_); + layouts_[id] = new layout_shared_t(layout, axes_->get(arg), shapes, {red}, red->get_type()->get_scalar_ty(), align_); tmp_[red] = id; } if(auto *recoalasce = dynamic_cast(i)){ ir::value *val = recoalasce->get_operand(0); - const layout_t* in_layout = get(val); - const layout_t* out_layout = get(i); + layout_t* in_layout = get(val); + layout_t* out_layout = get(i); if(in_layout->type != HMMA_884) return; id++; @@ -431,14 +443,14 @@ void layout::run(ir::module &mod) { shape[ld] = in_shape[ld]; for(size_t k = 0; k < in_shape.size(); k++) if(k != ld) - shape[k] = 4*in_layout->fpw[k]*in_layout->wpt[k]; + shape[k] = 4*in_layout->to_hmma884()->fpw[k]*in_layout->to_hmma884()->wpt[k]; // 
create layout - layouts_[id] = new layout_shared_t(out_layout, axes_->get(val), shape, {recoalasce}, val->get_type()->get_scalar_ty(), id, align_); + layouts_[id] = new layout_shared_t(out_layout, axes_->get(val), shape, {recoalasce}, val->get_type()->get_scalar_ty(), align_); tmp_[recoalasce] = id; } if(auto *atom = dynamic_cast(i)){ id++; - layouts_[id] = new layout_shared_t(nullptr, {}, {1}, {atom}, atom->get_type()->get_scalar_ty(), id, align_); + layouts_[id] = new layout_shared_t(nullptr, {}, {1}, {atom}, atom->get_type()->get_scalar_ty(), align_); tmp_[atom] = id; } }); diff --git a/lib/codegen/analysis/liveness.cc b/lib/codegen/analysis/liveness.cc index 382f8ef6c..a4bb41f5e 100644 --- a/lib/codegen/analysis/liveness.cc +++ b/lib/codegen/analysis/liveness.cc @@ -27,9 +27,9 @@ void liveness::run(ir::module &mod) { // create live intervals for(auto &x: layouts_->get_all()) { - layout_t* layout = x.second; - if(layout->type != SHARED) + if(x.second->type != SHARED) continue; + layout_shared_t* layout = x.second->to_shared(); // users std::set users; for(ir::value *v: layout->values){ diff --git a/lib/codegen/selection/generator.cc b/lib/codegen/selection/generator.cc index 4d4fe0b11..5cf964915 100644 --- a/lib/codegen/selection/generator.cc +++ b/lib/codegen/selection/generator.cc @@ -655,13 +655,14 @@ void generator::visit_hmma_dot(ir::dot_inst* dot, shared_tile *TA, shared_tile * "{$8, $9}, " "{$10, $11}, " "{$0, $1, $2, $3, $4, $5, $6, $7};", "=f,=f,=f,=f,=f,=f,=f,=f,r,r,r,r,0,1,2,3,4,5,6,7", false); + analysis::layout_hmma_884_t* layout = layouts_->get(dot)->to_hmma884(); - unsigned fpw_0 = layouts_->get(dot)->fpw.at(0); - unsigned fpw_1 = layouts_->get(dot)->fpw.at(1); + unsigned fpw_0 = layout->fpw.at(0); + unsigned fpw_1 = layout->fpw.at(1); unsigned wts_0 = fpw_0 * 8; unsigned wts_1 = fpw_1 * 8; - unsigned wpt_0 = layouts_->get(dot)->wpt.at(0); - unsigned wpt_1 = layouts_->get(dot)->wpt.at(1); + unsigned wpt_0 = layout->wpt.at(0); + unsigned wpt_1 = layout->wpt.at(1); unsigned stride_rep_i = wpt_0 * wts_0; unsigned stride_rep_j = wpt_1 * wts_1; unsigned num_rep_i = shapes[0] / stride_rep_i; @@ -925,8 +926,8 @@ void generator::visit_recoalesce_inst(ir::recoalesce_inst* rc) { // pointer to temporary shared memory Type *ty = llvm_type(rc->get_type()->get_scalar_ty(), *ctx_); // layouts - const analysis::layout_t* in_layout = layouts_->get(op); - const analysis::layout_t* out_layout = layouts_->get(rc); + analysis::layout_hmma_884_t* in_layout = layouts_->get(op)->to_hmma884(); + analysis::layout_scanline_t* out_layout = layouts_->get(rc)->to_scanline(); // machine tiles distributed_tile *in_dt = (distributed_tile*)(tmap_.at(op)); distributed_tile *out_dt = (distributed_tile*)(tmap_.at(rc)); @@ -1026,14 +1027,14 @@ void generator::visit_recoalesce_inst(ir::recoalesce_inst* rc) { void generator::visit_copy_to_shared_inst(ir::copy_to_shared_inst* cts) { unsigned vector_size = 1; - auto x_order = layouts_->get(cts)->order; ir::value *arg = cts->get_operand(0); - auto arg_order = layouts_->get(arg)->order; + analysis::layout_shared_t* out_layout = layouts_->get(cts)->to_shared(); + analysis::layout_scanline_t* in_layout = layouts_->get(arg)->to_scanline(); + auto out_order = out_layout->order; + auto in_order = in_layout->order; // tiles - if(x_order == arg_order){ - size_t ld = arg_order[0]; - vector_size = layouts_->get(arg)->nts.at(ld); - } + if(out_order == in_order) + vector_size = in_layout->nts.at(in_order[0]); std::map packets; for_each(arg, [&](indices_t idx){ diff --git 
a/lib/codegen/selection/machine_layout.cc b/lib/codegen/selection/machine_layout.cc index 2d02e7b1f..d1ea9fa0f 100644 --- a/lib/codegen/selection/machine_layout.cc +++ b/lib/codegen/selection/machine_layout.cc @@ -72,7 +72,7 @@ inline int32_t ceil(int32_t num, int32_t div){ machine_layout_shared_t::machine_layout_shared_t(Module *mod, Builder *builder, target *tgt, analysis::allocation* alloc, - Value *&sh_mem_ptr, analysis::layout_t *layout, + Value *&sh_mem_ptr, analysis::layout_shared_t *layout, std::map& vmap, std::map& tmap) : mod_(mod), builder_(builder), tgt_(tgt), alloc_(alloc), sh_mem_ptr_(sh_mem_ptr), layout_(layout), vmap_(vmap), tmap_(tmap) { @@ -132,7 +132,10 @@ machine_layout_distributed_t::machine_layout_distributed_t(Module *mod, Builder tile *machine_layout_distributed_t::create(ir::value *v) { Type *ty = llvm_type(v->get_type()->get_scalar_ty(), builder_->getContext()); const auto &shapes = v->get_type()->get_tile_shapes(); - std::vector axes(shapes.size()); + size_t rank = shapes.size(); + std::vector axes(rank); + std::vector order(rank); + // compute axes for(size_t d = 0; d < shapes.size(); d++){ if(shapes[d] > 1){ unsigned x = a_axes_->get(v, d); @@ -143,7 +146,22 @@ tile *machine_layout_distributed_t::create(ir::value *v) { axes[d].values = {builder_->getInt32(0)}; } } - return new distributed_tile(ty, shapes, layout_->order, axes, *builder_); + // compute order + std::iota(order.begin(), order.end(), 0); + auto cmp = [&](int x, int y) { + unsigned axx = a_axes_->get(v, x); + unsigned axy = a_axes_->get(v, y); + auto itx = std::find(layout_->axes.begin(), layout_->axes.end(), axx); + auto ity = std::find(layout_->axes.begin(), layout_->axes.end(), axy); + size_t posx = std::distance(layout_->axes.begin(), itx); + size_t posy = std::distance(layout_->axes.begin(), ity); + if(posx < rank && posy < rank) + return layout_->order[posx] < layout_->order[posy]; + return false; + }; + std::sort(order.begin(), order.end(), cmp); + + return new distributed_tile(ty, shapes, order, axes, *builder_); } machine_layout_hmma_884_t::machine_layout_hmma_884_t(Module *mod, Builder *builder, diff --git a/lib/codegen/selection/machine_value.cc b/lib/codegen/selection/machine_value.cc index a94661b90..c70ba85b0 100644 --- a/lib/codegen/selection/machine_value.cc +++ b/lib/codegen/selection/machine_value.cc @@ -11,13 +11,6 @@ using namespace llvm; /* Distributed Tile */ void distributed_tile::init_indices() { std::vector id(axes_.size(), 0); - // create iteration order - std::vector order(id.size()); - std::iota(order.begin(), order.end(), 0); - auto cmp = [&](int x, int y) { - return order_[x] < order_[y]; - }; - std::sort(order.begin(), order.end(), cmp); // build size_t k = 0; while(true) { @@ -28,12 +21,12 @@ void distributed_tile::init_indices() { indices_[current] = sz; values_[current] = nullptr; ordered_indices_.push_back(current); - id[order[0]]++; - while(id[order[k]] == axes_[order[k]].values.size()){ + id[order_[0]]++; + while(id[order_[k]] == axes_[order_[k]].values.size()){ if(k == id.size() - 1) return; - id[order[k++]] = 0; - id[order[k]]++; + id[order_[k++]] = 0; + id[order_[k]]++; } k = 0; } diff --git a/lib/codegen/transform/membar.cc b/lib/codegen/transform/membar.cc index 8cb48f7df..1d9aef055 100644 --- a/lib/codegen/transform/membar.cc +++ b/lib/codegen/transform/membar.cc @@ -37,7 +37,7 @@ void membar::add_reference(ir::value *v, interval_vec_t &res){ return; if(alloc_->has_offset(layouts_->get(v))){ unsigned offset = alloc_->offset(layouts_->get(v)); - unsigned 
size = layouts_->get(v)->size; + unsigned size = layouts_->get(v)->to_shared()->size; res.push_back(interval_t(offset, offset + size)); } } @@ -119,12 +119,14 @@ void membar::run(ir::module &mod) { // without needing synchronization std::set safe_war; for(const auto& x: layouts_->get_all()){ - if(x.second->double_buffer){ - auto info = *x.second->double_buffer; - for(ir::value *v: x.second->values) - if(v != info.phi) - safe_war.insert(v); - } + if(x.second->type != analysis::SHARED) + continue; + analysis::layout_shared_t* layout = x.second->to_shared(); + if(!layout->double_buffer) + continue; + for(ir::value *v: layout->values) + if(v != layout->double_buffer->phi) + safe_war.insert(v); } diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index 79718a232..d118b95be 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -10,11 +10,11 @@ int main() { typedef std::tuple, bool, bool, int, int, int> config_t; std::vector configs; for(auto ord: std::vector>{{1, 0}}) - for(auto x: std::vector>{{false, false}}){ + for(auto x: std::vector>{{false, false}, {true, false}}){ std::vector tmp = { -// config_t{ord, x[0], x[1], 512, 512, 512}, -// config_t{ord, x[0], x[1], 1024, 1024, 1024}, - config_t{ord, x[0], x[1], 127008, 768, 576}, + config_t{ord, x[0], x[1], 512, 512, 512}, + config_t{ord, x[0], x[1], 2048, 2048, 2048}, +// config_t{ord, x[0], x[1], 127008, 768, 576}, // config_t{ord, x[0], x[1], 8192, 8192, 8192} // config_t{ord, x[0], x[1], 16, 2048, 2048}, // config_t{ord, x[0], x[1], 32, 2048, 2048}, @@ -36,7 +36,7 @@ int main() { for(const auto& c: configs){ std::tie(ord, AT, BT, M, N, K) = c; std::cout << "// " << c ; - for(auto perf: bench_dot(stream, HALF, AT, BT, M, N, K, ord, ord)) + for(auto perf: bench_dot(stream, FLOAT, AT, BT, M, N, K, ord, ord)) std::cout << ", " << perf << std::flush; std::cout << std::endl; } From 78b98fb7cf75fe4b42c20290d78f6ee147426508 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Mon, 20 Jan 2020 15:15:32 -0500 Subject: [PATCH 490/494] [GENERAL] Cleaned polymorphic structure of layouts analysis pass --- include/triton/codegen/analysis/allocation.h | 6 +- include/triton/codegen/analysis/layout.h | 220 ++++++----- include/triton/codegen/analysis/liveness.h | 12 +- include/triton/codegen/selection/generator.h | 18 +- .../triton/codegen/selection/machine_layout.h | 45 ++- .../triton/codegen/selection/machine_value.h | 4 +- include/triton/codegen/transform/coalesce.h | 6 +- include/triton/codegen/transform/membar.h | 6 +- lib/codegen/analysis/allocation.cc | 40 +- lib/codegen/analysis/layout.cc | 351 +++++++++--------- lib/codegen/analysis/liveness.cc | 8 +- lib/codegen/selection/generator.cc | 78 ++-- lib/codegen/selection/machine_layout.cc | 133 +++---- lib/codegen/transform/coalesce.cc | 8 +- lib/codegen/transform/membar.cc | 19 +- lib/runtime/function.cc | 6 +- python/examples/einsum.py | 20 +- 17 files changed, 500 insertions(+), 480 deletions(-) diff --git a/include/triton/codegen/analysis/allocation.h b/include/triton/codegen/analysis/allocation.h index 49f378886..e49f5c591 100644 --- a/include/triton/codegen/analysis/allocation.h +++ b/include/triton/codegen/analysis/allocation.h @@ -27,14 +27,14 @@ public: allocation(liveness *live) : liveness_(live) { } // accessors - bool has_offset(const layout_t *x) const { return offsets_.find(x) != offsets_.end(); } - unsigned offset(const layout_t *x) const { return offsets_.at(x); } + bool has_offset(const data_layout *x) const { return offsets_.find(x) != offsets_.end(); } + unsigned offset(const 
data_layout *x) const { return offsets_.at(x); } unsigned allocated_size() const { return allocated_size_; } // run void run(ir::module& mod); private: - std::map offsets_; + std::map offsets_; size_t allocated_size_; // dependences liveness *liveness_; diff --git a/include/triton/codegen/analysis/layout.h b/include/triton/codegen/analysis/layout.h index 074bfb27c..13ddfafb4 100644 --- a/include/triton/codegen/analysis/layout.h +++ b/include/triton/codegen/analysis/layout.h @@ -22,11 +22,106 @@ namespace analysis{ class axes; class align; +class layout_visitor; +class data_layout; +class mma884_layout; +class scanline_layout; +class shared_layout; -enum layout_type_t { - HMMA_884, - SCANLINE, - SHARED + +class layout_visitor { +public: + virtual void visit_layout(data_layout *); + virtual void visit_layout_hmma_884(mma884_layout*) = 0; + virtual void visit_layout_scanline(scanline_layout*) = 0; + virtual void visit_layout_shared(shared_layout*) = 0; +}; + +class data_layout { +protected: + enum id_t { + HMMA_884, + SCANLINE, + SHARED + }; + + typedef std::vector axes_t; + typedef std::vector shape_t; + typedef std::vector order_t; + typedef std::vector values_t; + +private: + template + T* downcast(id_t id) { + if(id_ == id) + return static_cast(this); + return nullptr; + } + +public: + data_layout(id_t id, + const std::vector& axes, + const std::vector &shape, + const std::vector &values, + analysis::align* align); + // visitor + virtual void accept(layout_visitor* vst) = 0; + // downcast + mma884_layout* to_mma884() { return downcast(HMMA_884); } + scanline_layout* to_scanline() { return downcast(SCANLINE); } + shared_layout* to_shared() { return downcast(SHARED); } + // accessors + size_t get_rank() { return shape_.size(); } + const shape_t& get_shape() const { return shape_; } + const order_t& get_order() const { return order_; } + const values_t& get_values() const { return values_;} + int get_axis(size_t k) const { return axes_.at(k); } + const int get_order(size_t k) const { return order_.at(k); } + // find the position of given axis + size_t find_axis(int to_find) const; + + +private: + id_t id_; + axes_t axes_; + values_t values_; + +protected: + order_t order_; + shape_t shape_; +}; + +class mma884_layout: public data_layout { +public: + mma884_layout(size_t num_warps, + const std::vector& axes, + const std::vector& shapes, + const std::vector &values, + analysis::align* align); + void accept(layout_visitor* vst) { vst->visit_layout_hmma_884(this); } + // accessor + int fpw(size_t k) { return fpw_.at(k); } + int wpt(size_t k) { return wpt_.at(k); } + +private: + std::vector fpw_; + std::vector wpt_; +}; + +struct scanline_layout: public data_layout { + scanline_layout(size_t num_warps, + const std::vector& axes, + const std::vector& shape, + const std::vector &values, + analysis::align* align); + void accept(layout_visitor* vst) { vst->visit_layout_scanline(this); } + // accessor + int mts(size_t k) { return mts_.at(k); } + int nts(size_t k) { return nts_.at(k); } + +private: + std::vector mts_; + std::vector nts_; }; struct double_buffer_info_t { @@ -35,90 +130,33 @@ struct double_buffer_info_t { ir::phi_node* phi; }; -class layout_visitor; -class layout_t; -class layout_hmma_884_t; -class layout_scanline_t; -class layout_shared_t; +class shared_layout: public data_layout { +private: + static bool is_loop_latch(ir::phi_node *phi, ir::instruction *terminator); + static void extract_double_bufferable(ir::value *v, std::shared_ptr& res); - -class layout_visitor { public: - virtual 
void visit_layout(layout_t *); - virtual void visit_layout_hmma_884(layout_hmma_884_t*) = 0; - virtual void visit_layout_scanline(layout_scanline_t*) = 0; - virtual void visit_layout_shared(layout_shared_t*) = 0; -}; - -class layout_hmma_884_t; -class layout_scanline_t; -class layout_shared_t; - -struct layout_t { - layout_t(layout_type_t _type, - const std::vector& _axes, - const std::vector &_shapes, - const std::vector &_values, - ir::type *_ty, - analysis::align* align); - // visitor - virtual void accept(layout_visitor* vst) = 0; - // downcast - layout_hmma_884_t* to_hmma884(); - layout_scanline_t* to_scanline(); - layout_shared_t* to_shared(); - - - layout_type_t type; - std::vector axes; - std::vector shapes; - std::vector values; - std::vector order; - ir::type *ty; -}; - -struct layout_hmma_884_t: public layout_t { - layout_hmma_884_t(size_t num_warps, - const std::vector& _axes, - const std::vector& _shapes, - const std::vector &_values, - ir::type *_ty, - analysis::align* align); - void accept(layout_visitor* vst) { vst->visit_layout_hmma_884(this); } - - std::vector fpw; - std::vector wpt; -}; - -struct layout_scanline_t: public layout_t { - layout_scanline_t(size_t num_warps, - const std::vector& _axes, - const std::vector& _shapes, - const std::vector &values, - ir::type *_ty, - analysis::align* align); - void accept(layout_visitor* vst) { vst->visit_layout_scanline(this); } - - std::vector mts; - std::vector nts; -}; - -struct layout_shared_t: public layout_t { - layout_shared_t(const layout_t *arg, - const std::vector& _axes, - const std::vector& _shapes, - const std::vector &values, - ir::type *ty, - analysis::align* align); + shared_layout(const data_layout *arg, + const std::vector& axes, + const std::vector& shapes, + const std::vector &values_, + ir::type *ty, + analysis::align* align); void accept(layout_visitor* vst) { vst->visit_layout_shared(this); } + // accessors + size_t get_size() { return size_; } + ir::type* get_type() { return ty_; } + double_buffer_info_t* get_double_buffer() { return double_buffer_.get(); } - std::shared_ptr double_buffer; - size_t size; +private: + size_t size_; + ir::type *ty_; + std::shared_ptr double_buffer_; }; -class layout { +class layouts { typedef ir::value* node_t; typedef std::map > graph_t; @@ -127,23 +165,23 @@ private: void connect(ir::value *x, ir::value *y); void make_graph(ir::instruction *i); - void init_hmma_tile(layout_t& layout); - void init_scanline_tile(layout_t &layout); + void init_hmma_tile(data_layout& layouts); + void init_scanline_tile(data_layout &layouts); void create(size_t id, const std::vector& values); public: // constructor - layout(analysis::axes *axes, analysis::align *align, size_t num_warps); + layouts(analysis::axes *axes, analysis::align *align, size_t num_warps); // accessors - unsigned layout_of(ir::value *value) const; - const std::vector& values_of(unsigned id) const; - size_t num_layouts() const; - layout_t* get(size_t id); - layout_t* get(ir::value *v); - std::map &get_all(); - size_t tmp(ir::instruction* i); + unsigned layout_of(ir::value *value) const { return groups_.at(value); } + const std::vector& values_of(unsigned id) const { return values_.at(id); } + size_t num_layouts() const { return values_.size();} + data_layout* get(size_t id) { return layouts_.at(id); } + data_layout* get(ir::value *v) { return get(layout_of(v));} + std::map &get_all() { return layouts_; } + size_t tmp(ir::instruction* i) { return tmp_.at((ir::value*)i);} // execution void run(ir::module &mod); @@ -155,7 
+193,7 @@ private: tools::graph graph_; std::map groups_; std::map> values_; - std::map layouts_; + std::map layouts_; std::map tmp_; }; diff --git a/include/triton/codegen/analysis/liveness.h b/include/triton/codegen/analysis/liveness.h index e0158dc8a..a95d62a06 100644 --- a/include/triton/codegen/analysis/liveness.h +++ b/include/triton/codegen/analysis/liveness.h @@ -23,8 +23,8 @@ namespace analysis{ typedef unsigned slot_index; class tiles; -class layout; -class layout_t; +class layouts; +class data_layout; struct segment { slot_index start; @@ -42,20 +42,20 @@ struct segment { class liveness { private: - typedef std::map intervals_map_t; + typedef std::map intervals_map_t; public: // constructor - liveness(layout *l): layouts_(l){ } + liveness(layouts *l): layouts_(l){ } // accessors const intervals_map_t& get() const { return intervals_; } - segment get(layout_shared_t* v) const { return intervals_.at(v); } + segment get(shared_layout* v) const { return intervals_.at(v); } // run void run(ir::module &mod); private: // analysis - layout *layouts_; + layouts *layouts_; intervals_map_t intervals_; }; diff --git a/include/triton/codegen/selection/generator.h b/include/triton/codegen/selection/generator.h index 1f18bc6e1..8b8c5bf64 100644 --- a/include/triton/codegen/selection/generator.h +++ b/include/triton/codegen/selection/generator.h @@ -35,7 +35,7 @@ class align; class allocation; class cts; class axes; -class layout; +class layouts; } // typedef typedef llvm::IRBuilder indices_t; // forward -class machine_layout_t; +class machine_data_layout; class tile; class shared_tile; class distributed_tile; @@ -74,13 +74,13 @@ private: void visit_outer_dot(ir::dot_inst*, distributed_tile *TA, distributed_tile *TB, distributed_tile *TD, unsigned NK, Type *c_ty, Function *f_mul_add); - void finalize_shared_layout(analysis::layout_shared_t*); + void finalize_shared_layout(analysis::shared_layout*); void finalize_function(ir::function*); void finalize_phi_node(ir::phi_node*); public: generator(analysis::axes *a_axes, - analysis::layout *layouts, + analysis::layouts *layouts, analysis::align *alignment, analysis::allocation *alloc, target *tgt, @@ -139,9 +139,9 @@ public: void visit_basic_block(ir::basic_block*); void visit_argument(ir::argument*); - void visit_layout_hmma_884(analysis::layout_hmma_884_t*); - void visit_layout_scanline(analysis::layout_scanline_t*); - void visit_layout_shared(analysis::layout_shared_t*); + void visit_layout_hmma_884(analysis::mma884_layout*); + void visit_layout_scanline(analysis::scanline_layout*); + void visit_layout_shared(analysis::shared_layout*); void visit(ir::module &, llvm::Module &); @@ -150,13 +150,13 @@ private: Builder* builder_; Module *mod_; - std::map machine_layouts_; + std::map machine_layouts_; analysis::axes *a_axes_; std::map axes_; std::map vmap_; std::map tmap_; target *tgt_; - analysis::layout *layouts_; + analysis::layouts *layouts_; analysis::align *alignment_; analysis::allocation *alloc_; Value *sh_mem_ptr_; diff --git a/include/triton/codegen/selection/machine_layout.h b/include/triton/codegen/selection/machine_layout.h index 5ea34f3f3..5458f15d3 100644 --- a/include/triton/codegen/selection/machine_layout.h +++ b/include/triton/codegen/selection/machine_layout.h @@ -36,7 +36,7 @@ class align; class allocation; class cts; class axes; -class layout; +class layouts; } typedef llvm::IRBuilder& vmap, std::map& tmap); @@ -83,7 +83,7 @@ public: target *tgt_; analysis::allocation* alloc_; Value *&sh_mem_ptr_; - analysis::layout_shared_t* 
layout_; + analysis::shared_layout* layout_; std::map& vmap_; std::map& tmap_; @@ -94,29 +94,28 @@ public: }; -class machine_layout_distributed_t: public machine_layout_t { +class machine_distributed_layout: public machine_data_layout { public: - machine_layout_distributed_t(Module *mod, Builder *builder, target *tgt, Type *ty, - analysis::axes *a_axes, std::map& axes, - analysis::layout_t* layout); + machine_distributed_layout(Module *mod, Builder *builder, target *tgt, + analysis::axes *a_axes, std::map& axes, + analysis::data_layout* layout); tile* create(ir::value *v); Module *mod_; Builder *builder_; target *tgt_; - Type *ty_; analysis::axes *a_axes_; std::map& axes_; - analysis::layout_t* layout_; + analysis::data_layout* layout_; }; -class machine_layout_hmma_884_t: public machine_layout_distributed_t { +class machine_mma884_layout: public machine_distributed_layout { public: - machine_layout_hmma_884_t(Module *mod, Builder *builder, - target *tgt, Type *ty, - analysis::axes *a_axes, std::map& axes, - analysis::layout_hmma_884_t* layout); + machine_mma884_layout(Module *mod, Builder *builder, + target *tgt, + analysis::axes *a_axes, std::map& axes, + analysis::mma884_layout* layout); Value *offset_a_i_, *offset_a_k_; Value *offset_b_j_, *offset_b_k_; unsigned pack_size_0_; @@ -125,12 +124,12 @@ public: unsigned num_packs_1_; }; -class machine_layout_scanline_t: public machine_layout_distributed_t { +class machine_scanline_layout: public machine_distributed_layout { public: - machine_layout_scanline_t(Module *mod, Builder *builder, - target *tgt, Type *ty, - analysis::axes *a_axes, std::map& axes, - analysis::layout_scanline_t* layout); + machine_scanline_layout(Module *mod, Builder *builder, + target *tgt, + analysis::axes *a_axes, std::map& axes, + analysis::scanline_layout* layout); }; } diff --git a/include/triton/codegen/selection/machine_value.h b/include/triton/codegen/selection/machine_value.h index aab1f023a..67c2ed394 100644 --- a/include/triton/codegen/selection/machine_value.h +++ b/include/triton/codegen/selection/machine_value.h @@ -47,11 +47,11 @@ class align; class allocation; class cts; class axes; -class layout; +class layouts; } class distributed_axis; -class machine_layout_t; +class machine_data_layout; class tile; class shared_tile; class distributed_tile; diff --git a/include/triton/codegen/transform/coalesce.h b/include/triton/codegen/transform/coalesce.h index e0ea0ea97..1b15306f1 100644 --- a/include/triton/codegen/transform/coalesce.h +++ b/include/triton/codegen/transform/coalesce.h @@ -19,7 +19,7 @@ namespace codegen{ namespace analysis{ class align; - class layout; + class layouts; class cts; } @@ -32,12 +32,12 @@ private: ir::value* rematerialize(ir::value *v, ir::builder& builder, std::map& seen); public: - coalesce(analysis::align* align, triton::codegen::analysis::layout *layouts); + coalesce(analysis::align* align, triton::codegen::analysis::layouts *layouts); void run(ir::module &mod); private: analysis::align* align_; - analysis::layout* layout_; + analysis::layouts* layout_; }; } diff --git a/include/triton/codegen/transform/membar.h b/include/triton/codegen/transform/membar.h index 820992da7..015f44f3d 100644 --- a/include/triton/codegen/transform/membar.h +++ b/include/triton/codegen/transform/membar.h @@ -17,7 +17,7 @@ namespace analysis{ class allocation; class liveness; -class layout; +class layouts; class cts; } @@ -41,13 +41,13 @@ private: std::set &insert_loc, std::set &safe_war); public: - membar(analysis::liveness *liveness, 
analysis::layout *layouts, analysis::allocation *alloc): + membar(analysis::liveness *liveness, analysis::layouts *layouts, analysis::allocation *alloc): liveness_(liveness), layouts_(layouts), alloc_(alloc) {} void run(ir::module &mod); private: analysis::liveness *liveness_; - analysis::layout *layouts_; + analysis::layouts *layouts_; analysis::allocation *alloc_; }; diff --git a/lib/codegen/analysis/allocation.cc b/lib/codegen/analysis/allocation.cc index 0cff27640..d0a66543a 100644 --- a/lib/codegen/analysis/allocation.cc +++ b/lib/codegen/analysis/allocation.cc @@ -15,22 +15,22 @@ void allocation::run(ir::module &mod) { using std::min; typedef std::multimap triples_map_type; - std::vector I; + std::vector I; for(auto x: liveness_->get()) I.push_back(x.first); - std::vector J = I; + std::vector J = I; triples_map_type H; H.insert({0, segment{0, INT_MAX}}); - std::vector V; - std::map starts; + std::vector V; + std::map starts; while(!J.empty()){ auto h_it = H.begin(); unsigned w = h_it->first; segment xh = h_it->second; H.erase(h_it); - auto j_it = std::find_if(J.begin(), J.end(), [&](layout_shared_t* JJ){ + auto j_it = std::find_if(J.begin(), J.end(), [&](shared_layout* JJ){ segment xj = liveness_->get(JJ); bool res = xj.intersect(xh); for(auto val: H) @@ -38,7 +38,7 @@ void allocation::run(ir::module &mod) { return res; }); if(j_it != J.end()){ - unsigned size = (*j_it)->size; + unsigned size = (*j_it)->get_size(); segment xj = liveness_->get(*j_it); starts[*j_it] = w; H.insert({w + size, segment{max(xh.start, xj.start), min(xh.end, xj.end)}}); @@ -52,14 +52,14 @@ void allocation::run(ir::module &mod) { } // Build interference graph - std::map> interferences; - for(layout_shared_t* x: V) - for(layout_shared_t* y: V){ + std::map> interferences; + for(shared_layout* x: V) + for(shared_layout* y: V){ if(x == y) continue; unsigned X0 = starts[x], Y0 = starts[y]; - unsigned NX = x->size; - unsigned NY = y->size; + unsigned NX = x->get_size(); + unsigned NY = y->get_size(); segment XS = {X0, X0 + NX}; segment YS = {Y0, Y0 + NY}; if(liveness_->get(x).intersect(liveness_->get(y)) @@ -68,17 +68,17 @@ void allocation::run(ir::module &mod) { } // Initialize colors - std::map colors; - for(layout_shared_t* X: V) + std::map colors; + for(shared_layout* X: V) colors[X] = (X==V[0])?0:-1; // First-fit graph coloring std::vector available(V.size()); - for(layout_shared_t* x: V){ + for(shared_layout* x: V){ // Non-neighboring colors are available std::fill(available.begin(), available.end(), true); - for(layout_shared_t* Y: interferences[x]){ + for(shared_layout* Y: interferences[x]){ int color = colors[Y]; if(color >= 0) available[color] = false; @@ -89,17 +89,17 @@ void allocation::run(ir::module &mod) { } // Finalize allocation - for(layout_shared_t* x: V){ + for(shared_layout* x: V){ unsigned Adj = 0; - for(layout_shared_t* y: interferences[x]) - Adj = std::max(Adj, starts[y] + y->size); + for(shared_layout* y: interferences[x]) + Adj = std::max(Adj, starts[y] + y->get_size()); offsets_[x] = starts[x] + colors[x] * Adj; } // Save maximum size of induced memory space allocated_size_ = 0; - for(layout_shared_t* x: V) - allocated_size_ = std::max(allocated_size_, starts[x] + x->size); + for(shared_layout* x: V) + allocated_size_ = std::max(allocated_size_, starts[x] + x->get_size()); } } diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc index 2136d4162..8b4a3242a 100644 --- a/lib/codegen/analysis/layout.cc +++ b/lib/codegen/analysis/layout.cc @@ -12,57 +12,15 @@ namespace 
triton{ namespace codegen{ namespace analysis{ +/* -------------------------------- * + * Helper Functions * + * -------------------------------- */ -// constructor -layout::layout(analysis::axes *axes, analysis::align *align, size_t num_warps) - : axes_(axes), align_(align), num_warps_(num_warps) { } - -// get group id -unsigned layout::layout_of(ir::value *value) const -{ return groups_.at(value); } - -// get values -const std::vector& layout::values_of(unsigned id) const -{ return values_.at(id); } - -// get number of groups -size_t layout::num_layouts() const -{ return values_.size(); } - -// connect two values -void layout::connect(ir::value *x, ir::value *y) { - if(x == y) - return; - if(!x->get_type()->is_tile_ty()) - return; - if(!y->get_type()->is_tile_ty()) - return; - std::vector x_axes = axes_->get(x); - std::vector y_axes = axes_->get(y); - std::set sx_axes(x_axes.begin(), x_axes.end()); - std::set sy_axes(y_axes.begin(), y_axes.end()); - std::set common; - std::set_intersection(sx_axes.begin(), sx_axes.end(), - sy_axes.begin(), sy_axes.end(), - std::inserter(common, common.begin())); - graph_.add_edge(x, x); - graph_.add_edge(y, y); - if(!common.empty()) - graph_.add_edge(x, y); +inline unsigned clamp(unsigned x, unsigned lo, unsigned hi) { + return std::min(std::max(x, lo), hi); } -// make graph -void layout::make_graph(ir::instruction *i) { - for(ir::value* opx: i->ops()) - for(ir::value* opy: i->ops()){ - connect(i, opx); - connect(opx, opy); - } -} - - -// hmma -bool is_hmma_c(ir::value *v){ +inline bool is_hmma_c(ir::value *v){ bool result = false; if(auto *x = dynamic_cast(v)){ ir::value *a = x->get_operand(0); @@ -75,23 +33,7 @@ bool is_hmma_c(ir::value *v){ return result; } -layout_t* layout::get(size_t id) { - return layouts_.at(id); -} - -layout_t* layout::get(ir::value *v) { - return layouts_.at(groups_.at(v)); -} - -std::map& layout::get_all() { - return layouts_; -} - -size_t layout::tmp(ir::instruction* i) { - return tmp_.at(i); -} - -void extract_io_use(ir::value *v, std::set& result) { +inline void extract_io_use(ir::value *v, std::set& result) { for(ir::user* u: v->get_users()){ auto i = dynamic_cast(u); if(i && i->get_pointer_operand() == v) @@ -99,7 +41,7 @@ void extract_io_use(ir::value *v, std::set& result) { } } -void extract_dot_use(ir::value *v, ir::value*& result, size_t n) { +inline void extract_dot_use(ir::value *v, ir::value*& result, size_t n) { for(ir::user* u: v->get_users()){ auto i = dynamic_cast(u); if(i && i->get_operand(n) == v) @@ -107,7 +49,7 @@ void extract_dot_use(ir::value *v, ir::value*& result, size_t n) { } } -void extract_hmma_dot_use(ir::value *v, ir::value*& result, size_t n) { +inline void extract_hmma_dot_use(ir::value *v, ir::value*& result, size_t n) { for(ir::user* u: v->get_users()){ auto i = dynamic_cast(u); if(i && is_hmma_c(i) && i->get_operand(n) == v) @@ -116,7 +58,6 @@ void extract_hmma_dot_use(ir::value *v, ir::value*& result, size_t n) { } - inline bool is_trans(ir::value *v) { if(dynamic_cast(v)) { return true; @@ -131,104 +72,103 @@ inline bool is_trans(ir::value *v) { } -void layout_visitor::visit_layout(layout_t *layout) { +/* -------------------------------- * + * Layout Visitor * + * -------------------------------- */ + +void layout_visitor::visit_layout(data_layout *layout) { layout->accept(this); } -layout_t::layout_t(layout_type_t _type, - const std::vector &_axes, - const std::vector &_shapes, - const std::vector &_values, ir::type *_ty, - analysis::align* align): type(_type), axes(_axes), 
shapes(_shapes), values(_values), ty(_ty) { +/* -------------------------------- * + * Base Data Layout * + * -------------------------------- */ + +data_layout::data_layout(id_t id, + const std::vector &axes, + const std::vector &shape, + const std::vector &values, + analysis::align* align): id_(id), axes_(axes), shape_(shape), values_(values) { // io pointer std::set ptr; - for(ir::value* v: values) + for(ir::value* v: values_) extract_io_use(v, ptr); - order.resize(axes.size()); - std::iota(order.begin(), order.end(), 0); + order_.resize(axes_.size()); + std::iota(order_.begin(), order_.end(), 0); auto largest = std::max_element(ptr.begin(), ptr.end(), [&](ir::value *x, ir::value *y){ return x->get_type()->get_tile_rank() < y->get_type()->get_tile_rank(); }); if(*largest){ auto max_contiguous = align->contiguous(*largest); - std::sort(order.begin(), order.end(), [&](unsigned a, unsigned b) { + std::sort(order_.begin(), order_.end(), [&](unsigned a, unsigned b) { return max_contiguous[a] > max_contiguous[b]; }); } } -// downcast -layout_hmma_884_t* layout_t::to_hmma884() { - assert(type == HMMA_884); - return static_cast(this); +size_t data_layout::find_axis(int to_find) const { + auto it = std::find(axes_.begin(), axes_.end(), to_find); + return std::distance(axes_.begin(), it); } -layout_scanline_t* layout_t::to_scanline() { - assert(type == SCANLINE); - return static_cast(this); -} -layout_shared_t* layout_t::to_shared() { - assert(type == SHARED); - return static_cast(this); -} +/* -------------------------------- * + * MMA Layout * + * -------------------------------- */ -inline unsigned clamp(unsigned x, unsigned lo, unsigned hi) { - return std::min(std::max(x, lo), hi); -} - -layout_hmma_884_t::layout_hmma_884_t(size_t num_warps, - const std::vector& _axes, - const std::vector& _shapes, - const std::vector &values, ir::type *_ty, - analysis::align* align): layout_t(HMMA_884, _axes, _shapes, values, _ty, align) { - unsigned shape_0 = shapes[0]; - unsigned shape_1 = shapes[1]; +mma884_layout::mma884_layout(size_t num_warps, + const std::vector& axes, + const std::vector& shape, + const std::vector &values, + analysis::align* align): data_layout(HMMA_884, axes, shape, values, align) { /* fragments per warp */ // try to make things as square as possible to maximize data re-use - fpw = {1, 1, 1}; + fpw_ = {1, 1, 1}; std::vector fpw_nm1; - unsigned num_fragments = std::min((shape_0/8)*(shape_1/8), 4); + unsigned num_fragments = std::min((shape_[0]/8)*(shape_[1]/8), 4); do { - fpw_nm1 = fpw; - if(fpw[0]*fpw[1] < num_fragments) - fpw[0] = clamp(fpw[0]*2, 1, shape_0 / 8); - if(fpw[0]*fpw[1] < num_fragments) - fpw[1] = clamp(fpw[1]*2, 1, shape_1 / 8); - }while(fpw_nm1 != fpw); + fpw_nm1 = fpw_; + if(fpw_[0]*fpw_[1] < num_fragments) + fpw_[0] = clamp(fpw_[0]*2, 1, shape_[0] / 8); + if(fpw_[0]*fpw_[1] < num_fragments) + fpw_[1] = clamp(fpw_[1]*2, 1, shape_[1] / 8); + }while(fpw_nm1 != fpw_); + /* warps per tile */ // try to make things as square as possible to maximize data re-use - wpt = {1, 1, 1}; + wpt_ = {1, 1, 1}; std::vector wpt_nm1; do{ - wpt_nm1 = wpt; - if(wpt[0] * wpt[1] * wpt[2] < num_warps) - wpt[0] = clamp(wpt[0]*2, 1, shape_0 / (fpw[0]*8)); - if(wpt[0] * wpt[1] * wpt[2] < num_warps) - wpt[1] = clamp(wpt[1]*2, 1, shape_1 / (fpw[1]*8)); - }while(wpt_nm1 != wpt); + wpt_nm1 = wpt_; + if(wpt_[0] * wpt_[1] * wpt_[2] < num_warps) + wpt_[0] = clamp(wpt_[0]*2, 1, shape_[0] / (fpw_[0]*8)); + if(wpt_[0] * wpt_[1] * wpt_[2] < num_warps) + wpt_[1] = clamp(wpt_[1]*2, 1, shape_[1] / 
(fpw_[1]*8)); + }while(wpt_nm1 != wpt_); + /* sanity check */ unsigned effective_num_warps = 1; - for(size_t d = 0; d < shapes.size(); d++) - effective_num_warps *= wpt[d]; - + for(size_t d = 0; d < shape.size(); d++) + effective_num_warps *= wpt_[d]; if(num_warps != effective_num_warps) throw std::runtime_error("cannot create a kernel with this amount of warps"); } +/* -------------------------------- * + * Scanline Layout * + * -------------------------------- */ - -layout_scanline_t::layout_scanline_t(size_t num_warps, - const std::vector& _axes, - const std::vector& _shapes, - const std::vector &values, ir::type *_ty, - analysis::align* align): layout_t(SCANLINE, _axes, _shapes, values, _ty, align){ - unsigned size = std::accumulate(shapes.begin(), shapes.end(), 1, std::multiplies()); +scanline_layout::scanline_layout(size_t num_warps, + const std::vector& axes, + const std::vector& shape, + const std::vector &values, + analysis::align* align): data_layout(SCANLINE, axes, shape, values, align){ + unsigned size = std::accumulate(shape_.begin(), shape_.end(), 1, std::multiplies()); unsigned num_threads = num_warps * 32; - nts.resize(shapes.size()); - mts.resize(shapes.size()); + nts_.resize(shape_.size()); + mts_.resize(shape_.size()); bool is_dot = std::any_of(values.begin(), values.end(), [&](ir::value* v) { return dynamic_cast(v); }); @@ -238,34 +178,39 @@ layout_scanline_t::layout_scanline_t(size_t num_warps, if(auto *st = dynamic_cast(usr)) ptr = st->get_pointer_operand(); - unsigned i = order[0]; + unsigned i = order_[0]; int contiguous = 4; if(ptr) contiguous = std::min(align->contiguous(ptr)[i], 4); - nts[i] = clamp(size / num_threads, 1, std::min(contiguous, shapes[i])); - mts[i] = clamp(num_threads, 1, shapes[i] / nts[i]); - size /= shapes[i]; - num_threads /= mts[i]; + nts_[i] = clamp(size / num_threads, 1, std::min(contiguous, shape_[i])); + mts_[i] = clamp(num_threads, 1, shape_[i] / nts_[i]); + size /= shape_[i]; + num_threads /= mts_[i]; if(is_dot) - nts[order[1]] = clamp(size / num_threads, 1, std::min(4, shapes[order[1]])); - for(size_t d = 1; d < shapes.size(); d++){ - i = order[d]; + nts_[order_[1]] = clamp(size / num_threads, 1, std::min(4, shape_[order_[1]])); + for(size_t d = 1; d < shape_.size(); d++){ + i = order_[d]; if(d > 1 || !is_dot) - nts[i] = 1; - mts[i] = clamp(num_threads, 1, shapes[i] / nts[i]); - num_threads = num_threads / mts[i]; + nts_[i] = 1; + mts_[i] = clamp(num_threads, 1, shape_[i] / nts_[i]); + num_threads = num_threads / mts_[i]; } /* sanity check */ unsigned effective_num_threads = 1; - for(size_t d = 0; d < shapes.size(); d++) - effective_num_threads *= mts[d]; + for(size_t d = 0; d < shape_.size(); d++) + effective_num_threads *= mts_[d]; if(num_warps * 32 != effective_num_threads) throw std::runtime_error("cannot create a kernel with this amount of warps"); } -inline bool is_loop_latch(ir::phi_node *phi, ir::instruction *terminator){ + +/* -------------------------------- * + * Shared Layout * + * -------------------------------- */ + +bool shared_layout::is_loop_latch(ir::phi_node *phi, ir::instruction *terminator){ if(phi->get_parent() != terminator->get_parent()) return false; if(auto *br = dynamic_cast(terminator)) @@ -278,7 +223,7 @@ inline bool is_loop_latch(ir::phi_node *phi, ir::instruction *terminator){ } -void extract_double_bufferable(ir::value *v, std::shared_ptr& res) { +void shared_layout::extract_double_bufferable(ir::value *v, std::shared_ptr& res) { auto* phi = dynamic_cast(v); if(!phi || phi->get_num_incoming() != 2) 
return; @@ -303,22 +248,22 @@ void extract_double_bufferable(ir::value *v, std::shared_ptr& _axes, - const std::vector& _shapes, +shared_layout::shared_layout(const data_layout *arg, + const std::vector& axes, + const std::vector& shape, const std::vector &values, ir::type *ty, - analysis::align* align): layout_t(SHARED, _axes, _shapes, values, ty, align) { + analysis::align* align): data_layout(SHARED, axes, shape, values, align), ty_(ty) { - size = 0; + size_ = 0; // double-buffering for(ir::value *v: values) - extract_double_bufferable(v, double_buffer); + extract_double_bufferable(v, double_buffer_); // order - std::vector arg_order = arg ? arg->order : std::vector{0}; - order = arg_order; + std::vector arg_order = arg ? arg->get_order() : std::vector{0}; + order_ = arg_order; ir::value* dot_a = nullptr; ir::value* dot_b = nullptr; @@ -330,48 +275,84 @@ layout_shared_t::layout_shared_t(const layout_t *arg, extract_hmma_dot_use(v, hmma_dot_a, 0); extract_hmma_dot_use(v, hmma_dot_b, 1); } + + + // non-mma ordering std::vector col = {0, 1}; std::vector row = {1, 0}; - for(size_t s = 2; s < shapes.size(); s++){ + for(size_t s = 2; s < get_rank(); s++){ col.push_back(s); row.push_back(s); } - - bool is_nonhmma_dot_a = dot_a && !hmma_dot_a; bool is_nonhmma_dot_b = dot_b && !hmma_dot_b; if(is_nonhmma_dot_a) - order = is_trans(dot_a) ? row : col; + order_ = is_trans(dot_a) ? row : col; else if(is_nonhmma_dot_b) - order = is_trans(dot_b) ? col : row; -// else -// order = row; + order_ = is_trans(dot_b) ? col : row; + // padding size_t pad = 0; if(hmma_dot_a){ - bool row = is_trans(hmma_dot_a) ^ order[0] != 0; - pad = 24 - shapes[row ? 0 : 1] % 32; + bool row = is_trans(hmma_dot_a) ^ order_[0] != 0; + pad = 24 - shape_[row ? 0 : 1] % 32; } else if(hmma_dot_b){ - bool row = is_trans(hmma_dot_b) ^ order[0] != 0; - pad = 24 - shapes[row ? 1 : 0] % 32; + bool row = is_trans(hmma_dot_b) ^ order_[0] != 0; + pad = 24 - shape_[row ? 
1 : 0] % 32; } - else if(order != arg_order) { + else if(order_ != arg_order) { pad = 4; } - shapes[order[0]] += pad; + shape_[order_[0]] += pad; // size - size = ty->get_primitive_size_in_bits() / 8; - for(auto s: shapes) - size *= s; - if(double_buffer) - size *= 2; + size_ = ty_->get_primitive_size_in_bits() / 8; + for(auto s: shape_) + size_ *= s; + if(double_buffer_) + size_ *= 2; } -// layout factory method -void layout::create(size_t id, const std::vector& values) { +/* -------------------------------- * + * ---- Layouts Inference Pass ---- * + * -------------------------------- */ + +layouts::layouts(analysis::axes *axes, analysis::align *align, size_t num_warps) + : axes_(axes), align_(align), num_warps_(num_warps) { } + + +void layouts::connect(ir::value *x, ir::value *y) { + if(x == y) + return; + if(!x->get_type()->is_tile_ty()) + return; + if(!y->get_type()->is_tile_ty()) + return; + std::vector x_axes = axes_->get(x); + std::vector y_axes = axes_->get(y); + std::set sx_axes(x_axes.begin(), x_axes.end()); + std::set sy_axes(y_axes.begin(), y_axes.end()); + std::set common; + std::set_intersection(sx_axes.begin(), sx_axes.end(), + sy_axes.begin(), sy_axes.end(), + std::inserter(common, common.begin())); + graph_.add_edge(x, x); + graph_.add_edge(y, y); + if(!common.empty()) + graph_.add_edge(x, y); +} + +void layouts::make_graph(ir::instruction *i) { + for(ir::value* opx: i->ops()) + for(ir::value* opy: i->ops()){ + connect(i, opx); + connect(opx, opy); + } +} + +void layouts::create(size_t id, const std::vector& values) { auto it_hmma_c = std::find_if(values.begin(), values.end(), &is_hmma_c); auto cmp = [](ir::value* x, ir::value *y) { return x->get_type()->get_tile_ranks1() < @@ -387,18 +368,18 @@ void layout::create(size_t id, const std::vector& values) { }); // type if(it_hmma_c != values.end()) - layouts_[id] = new layout_hmma_884_t(num_warps_, axes, shapes, values, largest->get_type()->get_scalar_ty(), align_); + layouts_[id] = new mma884_layout(num_warps_, axes, shapes, values, align_); else if(it_cts != values.end()){ ir::copy_to_shared_inst *cts = (ir::copy_to_shared_inst*)*it_cts; ir::value *arg = cts->get_operand(0); create(groups_.at(arg), values_.at(groups_.at(arg))); - layouts_[id] = new layout_shared_t(get(arg), axes, shapes, values, largest->get_type()->get_scalar_ty(), align_); + layouts_[id] = new shared_layout(get(arg), axes, shapes, values, largest->get_type()->get_scalar_ty(), align_); } else - layouts_[id] = new layout_scanline_t(num_warps_, axes, shapes, values, largest->get_type()->get_scalar_ty(), align_); + layouts_[id] = new scanline_layout(num_warps_, axes, shapes, values, align_); } -void layout::run(ir::module &mod) { +void layouts::run(ir::module &mod) { // make graph graph_.clear(); ir::for_each_instruction(mod, [this](ir::instruction* i) { @@ -422,35 +403,35 @@ void layout::run(ir::module &mod) { // shape auto shapes = arg->get_type()->get_tile_shapes(); unsigned shape_ax = shapes[axis]; - layout_scanline_t *layout = get(arg)->to_scanline(); - unsigned per_thread = layout->nts[axis]; + scanline_layout *layout = get(arg)->to_scanline(); + unsigned per_thread = layout->nts(axis); unsigned depth = shape_ax / per_thread; shapes[axis] = depth; // create layout - layouts_[id] = new layout_shared_t(layout, axes_->get(arg), shapes, {red}, red->get_type()->get_scalar_ty(), align_); + layouts_[id] = new shared_layout(layout, axes_->get(arg), shapes, {red}, red->get_type()->get_scalar_ty(), align_); tmp_[red] = id; } if(auto *recoalasce = 
dynamic_cast(i)){ ir::value *val = recoalasce->get_operand(0); - layout_t* in_layout = get(val); - layout_t* out_layout = get(i); - if(in_layout->type != HMMA_884) + mma884_layout* in_layout = get(val)->to_mma884(); + scanline_layout* out_layout = get(i)->to_scanline(); + if(!in_layout || !out_layout) return; id++; ir::type::tile_shapes_t in_shape = val->get_type()->get_tile_shapes(); ir::type::tile_shapes_t shape(in_shape.size()); - size_t ld = out_layout->order[0]; + size_t ld = out_layout->get_order(0); shape[ld] = in_shape[ld]; for(size_t k = 0; k < in_shape.size(); k++) if(k != ld) - shape[k] = 4*in_layout->to_hmma884()->fpw[k]*in_layout->to_hmma884()->wpt[k]; + shape[k] = 4*in_layout->to_mma884()->fpw(k)*in_layout->to_mma884()->wpt(k); // create layout - layouts_[id] = new layout_shared_t(out_layout, axes_->get(val), shape, {recoalasce}, val->get_type()->get_scalar_ty(), align_); + layouts_[id] = new shared_layout(out_layout, axes_->get(val), shape, {recoalasce}, val->get_type()->get_scalar_ty(), align_); tmp_[recoalasce] = id; } if(auto *atom = dynamic_cast(i)){ id++; - layouts_[id] = new layout_shared_t(nullptr, {}, {1}, {atom}, atom->get_type()->get_scalar_ty(), align_); + layouts_[id] = new shared_layout(nullptr, {}, {1}, {atom}, atom->get_type()->get_scalar_ty(), align_); tmp_[atom] = id; } }); diff --git a/lib/codegen/analysis/liveness.cc b/lib/codegen/analysis/liveness.cc index a4bb41f5e..224a93fc9 100644 --- a/lib/codegen/analysis/liveness.cc +++ b/lib/codegen/analysis/liveness.cc @@ -27,18 +27,18 @@ void liveness::run(ir::module &mod) { // create live intervals for(auto &x: layouts_->get_all()) { - if(x.second->type != SHARED) + shared_layout* layout = x.second->to_shared(); + if(!layout) continue; - layout_shared_t* layout = x.second->to_shared(); // users std::set users; - for(ir::value *v: layout->values){ + for(ir::value *v: layout->get_values()){ for(ir::user *u: v->get_users()) users.insert(u); } // compute intervals unsigned start = INT32_MAX; - for(ir::value *v: layout->values) + for(ir::value *v: layout->get_values()) if(indices.find(v) != indices.end()) start = std::min(start, indices.at(v)); unsigned end = 0; diff --git a/lib/codegen/selection/generator.cc b/lib/codegen/selection/generator.cc index 5cf964915..ae7d0e876 100644 --- a/lib/codegen/selection/generator.cc +++ b/lib/codegen/selection/generator.cc @@ -174,7 +174,7 @@ inline bool is_trans(ir::value *v) { generator::generator(analysis::axes *a_axes, - analysis::layout *layouts, + analysis::layouts *layouts, analysis::align *alignment, analysis::allocation *alloc, target *tgt, @@ -295,7 +295,7 @@ void generator::visit_unmasked_load_inst(ir::unmasked_load_inst* x) { } // find vector size ir::value *ptr = x->get_pointer_operand(); - size_t ld = layouts_->get(ptr)->order[0]; + size_t ld = layouts_->get(ptr)->get_order(0); unsigned alignment = std::max(alignment_->get(ptr, ld), 1); @@ -337,7 +337,7 @@ void generator::visit_unmasked_load_inst(ir::unmasked_load_inst* x) { void generator::visit_masked_load_inst(ir::masked_load_inst* x) { // find vector size ir::value *ptr = x->get_pointer_operand(); - size_t ld = layouts_->get(ptr)->order[0]; + size_t ld = layouts_->get(ptr)->get_order(0); unsigned alignment = alignment_->get(ptr, ld); distributed_tile *pointers = (distributed_tile*)tmap_.at(ptr); distributed_tile *masks = (distributed_tile*)tmap_.at(x->get_mask_operand()); @@ -603,7 +603,7 @@ void generator::visit_atomic_add_inst(ir::atomic_add_inst*) { void generator::visit_hmma_dot(ir::dot_inst* dot, shared_tile 
*TA, shared_tile *TB, distributed_tile *TD, unsigned NK) { const auto& shapes = dot->get_type()->get_tile_shapes(); - machine_layout_hmma_884_t* hmma = (machine_layout_hmma_884_t*)machine_layouts_.at(layouts_->get(dot)); + machine_mma884_layout* hmma = (machine_mma884_layout*)machine_layouts_.at(layouts_->get(dot)); TA->set_vector_size(4*hmma->pack_size_0_); TB->set_vector_size(4*hmma->pack_size_1_); TA->set_return_mode(true); @@ -625,8 +625,8 @@ void generator::visit_hmma_dot(ir::dot_inst* dot, shared_tile *TA, shared_tile * Value* u_thread_id = tgt_->get_local_id(builder_->GetInsertBlock()->getModule(), *builder_, 0); - auto ord_a = layouts_->get(dot->get_operand(0))->order; - auto ord_b = layouts_->get(dot->get_operand(1))->order; + auto ord_a = layouts_->get(dot->get_operand(0))->get_order(); + auto ord_b = layouts_->get(dot->get_operand(1))->get_order(); bool is_a_trans = is_trans(dot->get_operand(0)); bool is_b_trans = is_trans(dot->get_operand(1)); @@ -655,14 +655,14 @@ void generator::visit_hmma_dot(ir::dot_inst* dot, shared_tile *TA, shared_tile * "{$8, $9}, " "{$10, $11}, " "{$0, $1, $2, $3, $4, $5, $6, $7};", "=f,=f,=f,=f,=f,=f,=f,=f,r,r,r,r,0,1,2,3,4,5,6,7", false); - analysis::layout_hmma_884_t* layout = layouts_->get(dot)->to_hmma884(); + analysis::mma884_layout* layout = layouts_->get(dot)->to_mma884(); - unsigned fpw_0 = layout->fpw.at(0); - unsigned fpw_1 = layout->fpw.at(1); + unsigned fpw_0 = layout->fpw(0); + unsigned fpw_1 = layout->fpw(1); unsigned wts_0 = fpw_0 * 8; unsigned wts_1 = fpw_1 * 8; - unsigned wpt_0 = layout->wpt.at(0); - unsigned wpt_1 = layout->wpt.at(1); + unsigned wpt_0 = layout->wpt(0); + unsigned wpt_1 = layout->wpt(1); unsigned stride_rep_i = wpt_0 * wts_0; unsigned stride_rep_j = wpt_1 * wts_1; unsigned num_rep_i = shapes[0] / stride_rep_i; @@ -792,7 +792,7 @@ void generator::visit_dot_inst(ir::dot_inst* dot) { if(NK != 1) { shared_tile *TA = (shared_tile*)tmap_.at(A); shared_tile *TB = (shared_tile*)tmap_.at(B); - if(layouts_->get(dot)->type == analysis::HMMA_884) + if(layouts_->get(dot)->to_mma884()) visit_hmma_dot(dot, TA, TB, TD, NK); else visit_scanline_dot(dot, TA, TB, TD, NK, c_ty, f_mul_add); @@ -856,7 +856,7 @@ void generator::visit_reduce_inst(ir::reduce_inst* x) { }); // reduce within blocks - machine_layout_t *slayout = machine_layouts_.at(layouts_->get(layouts_->tmp(x))); + machine_data_layout *slayout = machine_layouts_.at(layouts_->get(layouts_->tmp(x))); shared_tile *stile = (shared_tile*)slayout->create(x); unsigned depth = stile->get_shapes()[axis]; @@ -926,31 +926,31 @@ void generator::visit_recoalesce_inst(ir::recoalesce_inst* rc) { // pointer to temporary shared memory Type *ty = llvm_type(rc->get_type()->get_scalar_ty(), *ctx_); // layouts - analysis::layout_hmma_884_t* in_layout = layouts_->get(op)->to_hmma884(); - analysis::layout_scanline_t* out_layout = layouts_->get(rc)->to_scanline(); + analysis::mma884_layout* in_layout = layouts_->get(op)->to_mma884(); + analysis::scanline_layout* out_layout = layouts_->get(rc)->to_scanline(); // machine tiles distributed_tile *in_dt = (distributed_tile*)(tmap_.at(op)); distributed_tile *out_dt = (distributed_tile*)(tmap_.at(rc)); // WMMA configuration long wmma_pt[3] = { 2, 4, 1}; - long wmma[3] = { 8*in_layout->wpt[0]*in_layout->fpw[0], - 8*in_layout->wpt[1]*in_layout->fpw[1], + long wmma[3] = { 8*in_layout->wpt(0)*in_layout->fpw(0), + 8*in_layout->wpt(1)*in_layout->fpw(1), 1}; // Work per thread for input layout long in_pt[3] = { shape[0] / wmma[0], shape[1] / wmma[1], 1 }; // Work 
per thread for output layout - long out_pt[3] = { shape[0] / out_layout->mts[0], - shape[1] / out_layout->mts[1], + long out_pt[3] = { shape[0] / out_layout->mts(0), + shape[1] / out_layout->mts(1), 1 }; if(rank > 2){ - wmma[2] = in_layout->wpt[2]*in_layout->fpw[2]; + wmma[2] = in_layout->wpt(2)*in_layout->fpw(2); in_pt[2] = shape[2] / wmma[2]; - out_pt[2] = shape[2] / out_layout->mts[2]; + out_pt[2] = shape[2] / out_layout->mts(2); } // Orders - auto ord = out_layout->order; + auto ord = out_layout->get_order(); if(ord.size() < 3) ord.push_back(2); // pointer lanes @@ -1028,13 +1028,13 @@ void generator::visit_recoalesce_inst(ir::recoalesce_inst* rc) { void generator::visit_copy_to_shared_inst(ir::copy_to_shared_inst* cts) { unsigned vector_size = 1; ir::value *arg = cts->get_operand(0); - analysis::layout_shared_t* out_layout = layouts_->get(cts)->to_shared(); - analysis::layout_scanline_t* in_layout = layouts_->get(arg)->to_scanline(); - auto out_order = out_layout->order; - auto in_order = in_layout->order; + analysis::shared_layout* out_layout = layouts_->get(cts)->to_shared(); + analysis::scanline_layout* in_layout = layouts_->get(arg)->to_scanline(); + auto out_order = out_layout->get_order(); + auto in_order = in_layout->get_order(); // tiles if(out_order == in_order) - vector_size = in_layout->nts.at(in_order[0]); + vector_size = in_layout->nts(in_order[0]); std::map packets; for_each(arg, [&](indices_t idx){ @@ -1180,17 +1180,17 @@ void generator::visit_function(ir::function* fn) { -void generator::visit_layout_hmma_884(analysis::layout_hmma_884_t* layout) { - machine_layouts_[layout] = new machine_layout_hmma_884_t(mod_, &*builder_, tgt_, llvm_type(layout->ty->get_scalar_ty(), *ctx_), a_axes_, axes_, layout); +void generator::visit_layout_hmma_884(analysis::mma884_layout* layout) { + machine_layouts_[layout] = new machine_mma884_layout(mod_, &*builder_, tgt_, a_axes_, axes_, layout); } -void generator::visit_layout_scanline(analysis::layout_scanline_t* layout) { - machine_layouts_[layout] = new machine_layout_scanline_t(mod_, &*builder_, tgt_, llvm_type(layout->ty->get_scalar_ty(), *ctx_), a_axes_, axes_, layout); +void generator::visit_layout_scanline(analysis::scanline_layout* layout) { + machine_layouts_[layout] = new machine_scanline_layout(mod_, &*builder_, tgt_, a_axes_, axes_, layout); } -void generator::visit_layout_shared(analysis::layout_shared_t* layout) { +void generator::visit_layout_shared(analysis::shared_layout* layout) { - machine_layouts_[layout] = new machine_layout_shared_t(mod_, &*builder_, tgt_, alloc_, sh_mem_ptr_, layout, vmap_, tmap_); + machine_layouts_[layout] = new machine_shared_layout(mod_, &*builder_, tgt_, alloc_, sh_mem_ptr_, layout, vmap_, tmap_); } void generator::visit_basic_block(ir::basic_block * block) { @@ -1230,9 +1230,9 @@ void generator::set_value(ir::value *x, const indices_t& idx, Value* v) { } -void generator::finalize_shared_layout(analysis::layout_shared_t *shared) { - if(shared->double_buffer) { - auto info = *shared->double_buffer; +void generator::finalize_shared_layout(analysis::shared_layout *shared) { + if(shared->get_double_buffer()) { + auto info = *shared->get_double_buffer(); ir::phi_node *phi = info.phi; PHINode *ptr = (PHINode*)((shared_tile*)tmap_.at(phi))->get_pointer(); PHINode *offset = (PHINode*)((shared_tile*)tmap_.at(phi))->get_offset(); @@ -1247,8 +1247,8 @@ void generator::finalize_shared_layout(analysis::layout_shared_t *shared) { offset->addIncoming(next_offset, llvm_inc_block); } else { - unsigned num_bytes 
= shared->ty->get_primitive_size_in_bits() / 8; - offset->addIncoming(builder_->getInt32(shared->size / (2*num_bytes)), llvm_inc_block); + unsigned num_bytes = shared->get_type()->get_primitive_size_in_bits() / 8; + offset->addIncoming(builder_->getInt32(shared->get_size() / (2*num_bytes)), llvm_inc_block); } ptr->addIncoming(inc_shared->get_pointer(), llvm_inc_block); } @@ -1258,7 +1258,7 @@ void generator::finalize_shared_layout(analysis::layout_shared_t *shared) { void generator::finalize_function(ir::function *fn) { // finalize double-buffering for(const auto& x: layouts_->get_all()) - if(auto *shared = dynamic_cast(x.second)) + if(auto *shared = dynamic_cast(x.second)) finalize_shared_layout(shared); // finalize phi for(ir::basic_block *block: fn->blocks()) diff --git a/lib/codegen/selection/machine_layout.cc b/lib/codegen/selection/machine_layout.cc index d1ea9fa0f..e1fcc8fe6 100644 --- a/lib/codegen/selection/machine_layout.cc +++ b/lib/codegen/selection/machine_layout.cc @@ -71,18 +71,18 @@ inline int32_t ceil(int32_t num, int32_t div){ -machine_layout_shared_t::machine_layout_shared_t(Module *mod, Builder *builder, target *tgt, analysis::allocation* alloc, - Value *&sh_mem_ptr, analysis::layout_shared_t *layout, +machine_shared_layout::machine_shared_layout(Module *mod, Builder *builder, target *tgt, analysis::allocation* alloc, + Value *&sh_mem_ptr, analysis::shared_layout *layout, std::map& vmap, std::map& tmap) : mod_(mod), builder_(builder), tgt_(tgt), alloc_(alloc), sh_mem_ptr_(sh_mem_ptr), layout_(layout), vmap_(vmap), tmap_(tmap) { - Type* ty = llvm_type(layout_->ty, builder_->getContext()); + Type* ty = llvm_type(layout_->get_type(), builder_->getContext()); PointerType *ptr_ty = ty->getPointerTo(sh_mem_ptr_->getType()->getPointerAddressSpace()); // double-buffered - if(layout_->double_buffer) { + if(layout_->get_double_buffer()) { BasicBlock *current = builder_->GetInsertBlock(); - auto info = *layout_->double_buffer; + auto info = *layout_->get_double_buffer(); ir::phi_node *phi = info.phi; BasicBlock *parent = (BasicBlock*)vmap_.at((ir::value*)(phi->get_parent())); if(parent->empty()) @@ -105,31 +105,31 @@ machine_layout_shared_t::machine_layout_shared_t(Module *mod, Builder *builder, } -tile* machine_layout_shared_t::create(ir::value *v) { - auto order = layout_->order; - auto shapes = layout_->shapes; - Type* ty = llvm_type(layout_->ty, builder_->getContext()); - // double-buffered - if(layout_->double_buffer) { - if(v == layout_->double_buffer->phi) - return new shared_tile(ty, shapes, order, ptr_, *builder_, offset_); - if(v == layout_->double_buffer->latch) - return new shared_tile(ty, shapes, order, next_ptr_, *builder_); - return new shared_tile(ty, shapes, order, pre_ptr_, *builder_); - } - else { - return new shared_tile(ty, shapes, order, ptr_, *builder_); - } +tile* machine_shared_layout::create(ir::value *v) { + Type* ty = llvm_type(layout_->get_type(), builder_->getContext()); + auto double_buffer = layout_->get_double_buffer(); + // offset + Value *offset = nullptr; + if(double_buffer && v == double_buffer->phi) + offset = offset_; + // base pointer + Value *ptr = ptr_; + if(double_buffer && v == double_buffer->latch) + ptr = next_ptr_; + else if(double_buffer && v == double_buffer->first) + ptr = pre_ptr_; + // create tile + return new shared_tile(ty, layout_->get_shape(), layout_->get_order(), ptr, *builder_, offset); } -machine_layout_distributed_t::machine_layout_distributed_t(Module *mod, Builder *builder, target *tgt, Type *ty, 
+machine_distributed_layout::machine_distributed_layout(Module *mod, Builder *builder, target *tgt, analysis::axes *a_axes, std::map& axes, - analysis::layout_t *layout) - : mod_(mod), builder_(builder), tgt_(tgt), ty_(ty), a_axes_(a_axes), axes_(axes), layout_(layout) { + analysis::data_layout *layout) + : mod_(mod), builder_(builder), tgt_(tgt), a_axes_(a_axes), axes_(axes), layout_(layout) { } -tile *machine_layout_distributed_t::create(ir::value *v) { +tile *machine_distributed_layout::create(ir::value *v) { Type *ty = llvm_type(v->get_type()->get_scalar_ty(), builder_->getContext()); const auto &shapes = v->get_type()->get_tile_shapes(); size_t rank = shapes.size(); @@ -151,12 +151,10 @@ tile *machine_layout_distributed_t::create(ir::value *v) { auto cmp = [&](int x, int y) { unsigned axx = a_axes_->get(v, x); unsigned axy = a_axes_->get(v, y); - auto itx = std::find(layout_->axes.begin(), layout_->axes.end(), axx); - auto ity = std::find(layout_->axes.begin(), layout_->axes.end(), axy); - size_t posx = std::distance(layout_->axes.begin(), itx); - size_t posy = std::distance(layout_->axes.begin(), ity); + size_t posx = layout_->find_axis(axx); + size_t posy = layout_->find_axis(axy); if(posx < rank && posy < rank) - return layout_->order[posx] < layout_->order[posy]; + return layout_->get_order(posx) < layout_->get_order(posy); return false; }; std::sort(order.begin(), order.end(), cmp); @@ -164,22 +162,21 @@ tile *machine_layout_distributed_t::create(ir::value *v) { return new distributed_tile(ty, shapes, order, axes, *builder_); } -machine_layout_hmma_884_t::machine_layout_hmma_884_t(Module *mod, Builder *builder, - target *tgt, Type *ty, analysis::axes *a_axes, +machine_mma884_layout::machine_mma884_layout(Module *mod, Builder *builder, + target *tgt, analysis::axes *a_axes, std::map& axes, - analysis::layout_hmma_884_t* layout) - : machine_layout_distributed_t(mod, builder, tgt, ty, a_axes, axes, layout) { + analysis::mma884_layout* layout) + : machine_distributed_layout(mod, builder, tgt, a_axes, axes, layout) { Value *warp_size = builder_->getInt32(32); Value* u_thread_id_0 = tgt_->get_local_id(mod_, *builder_, 0); Value *u_thread_id = builder_->CreateURem(u_thread_id_0, warp_size); Value *u_warp_id = builder_->CreateUDiv(u_thread_id_0, warp_size); - const auto& shapes = layout->shapes; - if(shapes.size() > 3) + const auto& shape = layout->get_shape(); + if(shape.size() > 3) throw std::runtime_error("unsupported"); - - bool is_batched = shapes.size() >= 3; + bool is_batched = shape.size() >= 3; Value *_1 = builder_->getInt32(1); Value *_2 = builder_->getInt32(2); @@ -188,13 +185,13 @@ machine_layout_hmma_884_t::machine_layout_hmma_884_t(Module *mod, Builder *build Value *_16 = builder_->getInt32(16); // fragments per warp - unsigned fpw_0 = layout->fpw.at(0); - unsigned fpw_1 = layout->fpw.at(1); - unsigned fpw_2 = is_batched ? layout->fpw.at(2) : 1; + unsigned fpw_0 = layout->fpw(0); + unsigned fpw_1 = layout->fpw(1); + unsigned fpw_2 = is_batched ? layout->fpw(2) : 1; // warps per tile - unsigned wpt_0 = layout->wpt.at(0); - unsigned wpt_1 = layout->wpt.at(1); - unsigned wpt_2 = is_batched ? layout->wpt.at(2) : 1; + unsigned wpt_0 = layout->wpt(0); + unsigned wpt_1 = layout->wpt(1); + unsigned wpt_2 = is_batched ? 
layout->wpt(2) : 1; // hmma warp tile size unsigned hmma_wts_0 = fpw_0 * 8; unsigned hmma_wts_1 = fpw_1 * 8; @@ -204,9 +201,9 @@ machine_layout_hmma_884_t::machine_layout_hmma_884_t(Module *mod, Builder *build unsigned hmma_bts_1 = hmma_wts_1 * wpt_1; unsigned hmma_bts_2 = is_batched ? hmma_wts_2 * wpt_2 : 1; // number of repetition - unsigned num_rep_0 = shapes[0] / hmma_bts_0; - unsigned num_rep_1 = shapes[1] / hmma_bts_1; - unsigned num_rep_2 = is_batched ? shapes[2] / hmma_bts_2 : 1; + unsigned num_rep_0 = shape[0] / hmma_bts_0; + unsigned num_rep_1 = shape[1] / hmma_bts_1; + unsigned num_rep_2 = is_batched ? shape[2] / hmma_bts_2 : 1; // size of each pack (interleaving) pack_size_0_ = std::min(num_rep_0, 1); pack_size_1_ = std::min(num_rep_1, 1); @@ -275,44 +272,52 @@ machine_layout_hmma_884_t::machine_layout_hmma_884_t(Module *mod, Builder *build /* axes */ - axes_[layout->axes[0]] = distributed_axis{1, idx_i, warp_id_0}; - axes_[layout->axes[1]] = distributed_axis{1, idx_j, warp_id_1}; + axes_[layout->get_axis(0)] = distributed_axis{1, idx_i, warp_id_0}; + axes_[layout->get_axis(1)] = distributed_axis{1, idx_j, warp_id_1}; if(is_batched) - axes_[layout->axes[2]] = distributed_axis{1, idx_z, warp_id_2}; + axes_[layout->get_axis(2)] = distributed_axis{1, idx_z, warp_id_2}; } -machine_layout_scanline_t::machine_layout_scanline_t(Module *mod, Builder *builder, - target *tgt, Type *ty, +machine_scanline_layout::machine_scanline_layout(Module *mod, Builder *builder, + target *tgt, analysis::axes *a_axes, std::map &axes, - analysis::layout_scanline_t* layout) - : machine_layout_distributed_t(mod, builder, tgt, ty, a_axes, axes, layout) { + analysis::scanline_layout* layout) + : machine_distributed_layout(mod, builder, tgt, a_axes, axes, layout) { Value *warp_size = builder_->getInt32(32); Value* u_thread_id_0 = tgt_->get_local_id(mod_, *builder_, 0); Value *u_thread_id = builder_->CreateURem(u_thread_id_0, warp_size); Value *u_warp_id = builder_->CreateUDiv(u_thread_id_0, warp_size); - auto order = layout->order; - const auto& shapes = layout->shapes; - size_t dim = shapes.size(); - std::vector nts = layout->nts; - std::vector mts = layout->mts; + auto order = layout->get_order(); + const auto& shape = layout->get_shape(); Value* full_thread_id = builder_->CreateAdd(builder_->CreateMul(u_warp_id, builder_->getInt32(32)), u_thread_id); - std::vector thread_id = delinearize(full_thread_id, order, mts, *builder_); + // Delinearize + size_t dim = shape.size(); + std::vector thread_id(dim); + for(unsigned k = 0; k < dim - 1; k++){ + Constant *dim_k = builder_->getInt32(layout->mts(order[k])); + Value *rem = builder_->CreateURem(full_thread_id, dim_k); + full_thread_id = builder_->CreateUDiv(full_thread_id, dim_k); + thread_id[order[k]] = rem; + } + thread_id[order[dim - 1]] = full_thread_id; // Create axes for(unsigned k = 0; k < dim; k++) { + int nts = layout->nts(k); + int mts = layout->mts(k); std::string str_k = std::to_string(k); - Value *contiguous_k = builder_->getInt32(nts[k]); + Value *contiguous_k = builder_->getInt32(nts); Value *scaled_thread_id = builder_->CreateMul(thread_id[k], contiguous_k); - unsigned per_block = nts[k] * mts[k]; - unsigned per_thread = nts[k] * shapes[k] / per_block; + unsigned per_block = nts * mts; + unsigned per_thread = nts * shape[k] / per_block; std::vector idx_list(per_thread); for(unsigned n = 0 ; n < per_thread; n++){ - unsigned offset = n / nts[k] * per_block + n % nts[k]; + unsigned offset = n / nts * per_block + n % nts; idx_list[n] = 
builder_->CreateAdd(scaled_thread_id, builder_->getInt32(offset), "idx_" + str_k + "_" + std::to_string(n)); } - axes_[layout->axes[k]] = distributed_axis{nts[k], idx_list, thread_id[k]}; + axes_[layout->get_axis(k)] = distributed_axis{nts, idx_list, thread_id[k]}; } } diff --git a/lib/codegen/transform/coalesce.cc b/lib/codegen/transform/coalesce.cc index 78c03396f..14a295b00 100644 --- a/lib/codegen/transform/coalesce.cc +++ b/lib/codegen/transform/coalesce.cc @@ -12,7 +12,7 @@ namespace triton { namespace codegen{ namespace transform{ -coalesce::coalesce(analysis::align* align, analysis::layout *layouts) +coalesce::coalesce(analysis::align* align, analysis::layouts *layouts) : align_(align), layout_(layouts) { } // Find all values that are used as pointer operands in LD/ST @@ -64,8 +64,9 @@ ir::value* coalesce::rematerialize(ir::value *x, ir::builder &builder, void coalesce::run(ir::module &mod) { size_t num_groups = layout_->num_layouts(); + for(size_t id = 0; id < num_groups; id++) { - if(layout_->get(id)->type != analysis::HMMA_884) + if(!layout_->get(id)->to_mma884()) continue; // extract memory stores const auto& values = layout_->values_of(id); @@ -97,7 +98,6 @@ void coalesce::run(ir::module &mod) { } } - // find values to rematerialize std::vector remat; for(size_t id = 0; id < num_groups; id++) { @@ -109,7 +109,7 @@ void coalesce::run(ir::module &mod) { // extract leading axes std::map> axes; for(ir::io_inst *i: io){ - if(i->get_pointer_operand()->get_type()->get_tile_ranks1() == layout_->get(id)->axes.size()) + if(i->get_pointer_operand()->get_type()->get_tile_ranks1() == layout_->get(id)->get_rank()) extract_ld(i, axes); } // update list of values to rematerialize diff --git a/lib/codegen/transform/membar.cc b/lib/codegen/transform/membar.cc index 1d9aef055..0a9b0235b 100644 --- a/lib/codegen/transform/membar.cc +++ b/lib/codegen/transform/membar.cc @@ -35,10 +35,11 @@ void membar::add_reference(ir::value *v, interval_vec_t &res){ return; if(!i->get_type()->is_tile_ty()) return; - if(alloc_->has_offset(layouts_->get(v))){ - unsigned offset = alloc_->offset(layouts_->get(v)); - unsigned size = layouts_->get(v)->to_shared()->size; - res.push_back(interval_t(offset, offset + size)); + analysis::shared_layout* layout = layouts_->get(v)->to_shared(); + assert(layout); + if(alloc_->has_offset(layout)){ + unsigned offset = alloc_->offset(layout); + res.push_back(interval_t(offset, offset + layout->get_size())); } } @@ -119,13 +120,11 @@ void membar::run(ir::module &mod) { // without needing synchronization std::set safe_war; for(const auto& x: layouts_->get_all()){ - if(x.second->type != analysis::SHARED) + analysis::shared_layout* layout = x.second->to_shared(); + if(!layout || !layout->get_double_buffer()) continue; - analysis::layout_shared_t* layout = x.second->to_shared(); - if(!layout->double_buffer) - continue; - for(ir::value *v: layout->values) - if(v != layout->double_buffer->phi) + for(ir::value *v: layout->get_values()) + if(v != layout->get_double_buffer()->phi) safe_war.insert(v); } diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index fe1d77b66..bdd695298 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -220,7 +220,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c codegen::analysis::align align; codegen::analysis::axes axes; codegen::transform::disassociate disassociate; - codegen::analysis::layout layouts(&axes, &align, opt.num_warps); + codegen::analysis::layouts layouts(&axes, &align, opt.num_warps); 
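+  // note: these analysis passes are chained by construction -- `liveness` consumes the
+  // value groups computed by `layouts`, `allocation` packs the live intervals reported
+  // by `liveness`, and `membar` uses liveness, layouts and allocation to place barriers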
codegen::analysis::liveness liveness(&layouts); codegen::analysis::allocation allocation(&liveness); codegen::transform::membar barriers(&liveness, &layouts, &allocation); @@ -239,7 +239,6 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c align.run(module); cts.run(module); axes.run(module); -// ir::print(module, std::cout); layouts.run(module); coalesce.run(module); dce.run(module); @@ -250,15 +249,14 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c dce.run(module); align.run(module); axes.run(module); -// ir::print(module, std::cout); layouts.run(module); liveness.run(module); allocation.run(module); if(allocation.allocated_size() > context->device()->max_shared_memory()) return std::unique_ptr(); barriers.run(module); -// ir::print(module, std::cout); isel.visit(module, *llvm); + // return binary std::unique_ptr res(driver::module::create(context, std::move(llvm))); // done diff --git a/python/examples/einsum.py b/python/examples/einsum.py index a3fdba5e0..f61347d0c 100644 --- a/python/examples/einsum.py +++ b/python/examples/einsum.py @@ -79,7 +79,7 @@ for N, T, H, S, E in NTHSE: # 1D Dense convolution NCHKR = [ - # (1, 1152, 12602, 512, 3) + (1, 1152, 12602, 512, 3) ] for N, C, H, K, R in NCHKR: torch_fn = lambda a, b: torch.nn.functional.conv1d(a, b.permute(2, 0, 1)) @@ -92,10 +92,10 @@ for N, C, H, K, R in NCHKR: # 2D Dense convolution NCHWKRS = [ - #(8, 64, 128, 128, 768, 3, 3), - #(8, 128, 64, 64, 256, 3, 3), - #(8, 256, 32, 32, 512, 3, 3), - #(8, 512, 32, 32, 1024, 3, 3) + (8, 64, 128, 128, 768, 3, 3), + (8, 128, 64, 64, 256, 3, 3), + (8, 256, 32, 32, 512, 3, 3), + (8, 512, 32, 32, 1024, 3, 3) ] for N, C, H, W, K, R, S in NCHWKRS: torch_fn = lambda a, b: torch.nn.functional.conv2d(a, b.permute(3, 0, 1, 2)) @@ -108,10 +108,10 @@ for N, C, H, W, K, R, S in NCHWKRS: # 3D Dense Convolution NCDHWKTRS = [ - #(8, 32, 27, 100, 100, 64, 3, 3, 3), - #(8, 64, 23, 48, 48, 256, 3, 3, 3), - #(8, 256, 19, 22, 22, 640, 3, 3, 3), - #(8, 640, 15, 36, 36, 384, 3, 3, 3) + (8, 32, 27, 100, 100, 64, 3, 3, 3), + (8, 64, 23, 48, 48, 256, 3, 3, 3), + (8, 256, 19, 22, 22, 640, 3, 3, 3), + (8, 640, 15, 36, 36, 384, 3, 3, 3) ] for N, C, D, H, W, K, T, R, S in NCDHWKTRS: torch_fn = lambda a, b: torch.nn.functional.conv3d(a, b.permute(4, 0, 1, 2, 3)) @@ -168,7 +168,7 @@ for N, C, H, W, K, R, S in NCHWKRS: # Benchmark torch.set_num_threads(1) for a_shape, b_shape, c_shape, torch_fn, expr, arrays in configs: - dtype = torch.cuda.FloatTensor + dtype = torch.cuda.HalfTensor # initialize input tensors a = torch.rand(*a_shape).type(dtype).cuda() b = torch.rand(*b_shape).type(dtype).cuda() From ce7a00674a1cba0389ca4d3d56e684ea4980e2e7 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Tue, 21 Jan 2020 16:45:04 -0500 Subject: [PATCH 491/494] [PYTHON][EXAMPLES] Added self-attention example using triton.ops.einsum --- lib/runtime/function.cc | 1 - python/examples/attention/bench.py | 47 +++++++++++++++++ python/examples/attention/optimized.py | 50 ++++++++++++++++++ python/examples/attention/reference.py | 72 ++++++++++++++++++++++++++ python/examples/einsum.py | 63 +++++++++++----------- python/triton/kernel.py | 23 +++++--- python/triton/ops/einsum.py | 16 +++--- python/triton/utils.py | 2 +- 8 files changed, 227 insertions(+), 47 deletions(-) create mode 100644 python/examples/attention/bench.py create mode 100644 python/examples/attention/optimized.py create mode 100644 python/examples/attention/reference.py diff --git a/lib/runtime/function.cc b/lib/runtime/function.cc index 
bdd695298..3cc5ea87b 100644 --- a/lib/runtime/function.cc +++ b/lib/runtime/function.cc @@ -256,7 +256,6 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c return std::unique_ptr(); barriers.run(module); isel.visit(module, *llvm); - // return binary std::unique_ptr res(driver::module::create(context, std::move(llvm))); // done diff --git a/python/examples/attention/bench.py b/python/examples/attention/bench.py new file mode 100644 index 000000000..99c1722eb --- /dev/null +++ b/python/examples/attention/bench.py @@ -0,0 +1,47 @@ +import torch +import numpy as np +import reference +import optimized +from time import time + +use_half = False +def cast(x): + if use_half: + return x.half() + else: + return x + +# GPU device +device = torch.device("cuda:0") +# shapes +batch, nhead = 16, 8 +dm, dk, dv = 512, 512, 512 +lq, lk, lv = 256, 256, 256 +# initialize tensors +torch.manual_seed(0) +np.random.seed(0) +query = cast(torch.randn(batch, lq, dm)).cuda() +key = cast(torch.randn(batch, lk, dm)).cuda() +value = cast(torch.randn(batch, lv, dm)).cuda() +# initialize layers +torch.manual_seed(0) +np.random.seed(0) +rattn = cast(reference.MultiHeadAttention(nhead, dm, dk, dv).to(device)) +torch.manual_seed(0) +np.random.seed(0) +tattn = cast(optimized.MultiHeadAttention(nhead, dm, dk, dv).to(device)) +# test +routput, _ = rattn(query, key, value) +toutput, _ = tattn(query, key, value) +diff = torch.max(torch.abs(routput - toutput)) +assert diff < 1e-2 +# benchmark +start = time() +routput, _ = rattn(query, key, value) +end = time() +rtime = end - start +start = time() +toutput, _ = tattn(query, key, value) +end = time() +ttime = end - start +print(rtime, ttime) \ No newline at end of file diff --git a/python/examples/attention/optimized.py b/python/examples/attention/optimized.py new file mode 100644 index 000000000..96cc14262 --- /dev/null +++ b/python/examples/attention/optimized.py @@ -0,0 +1,50 @@ +import numpy as np +import torch +import torch.nn as nn +import triton + +class MultiHeadAttention(nn.Module): + ''' Multi-Head Attention module ''' + + def __init__(self, n_head, d_model, d_k, d_v): + super().__init__() + self.n_head = n_head + self.d_k = d_k + self.d_v = d_v + # linear layers + self.w_qs = nn.Linear(d_model, n_head * d_k) + self.w_ks = nn.Linear(d_model, n_head * d_k) + self.w_vs = nn.Linear(d_model, n_head * d_v) + self.fc = nn.Linear(n_head * d_v, d_model) + # initialize weights + nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k))) + nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k))) + nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v))) + nn.init.xavier_normal_(self.fc.weight) + # layer normalization + self.layer_norm = nn.LayerNorm(d_model) + + + def forward(self, q, k, v, mask=None): + # dimensions + d_k, d_v, n_head = self.d_k, self.d_v, self.n_head + sz_b, len_q, _ = q.size() + sz_b, len_k, _ = k.size() + sz_b, len_v, _ = v.size() + # linear transformations + residual = q + q = self.w_qs(q).view(sz_b, len_q, n_head, d_k) + k = self.w_ks(k).view(sz_b, len_k, n_head, d_k) + v = self.w_vs(v).view(sz_b, len_v, n_head, d_v) + # scaled dot-product attention + attn = triton.ops.einsum('blhk,bthk->hblt', q, k, [n_head, sz_b, len_q, len_k]) + attn = attn / np.sqrt(d_k) + if mask is not None: + attn = attn.masked_fill(mask[None], -np.inf) + attn = torch.softmax(attn, dim=3) + output = triton.ops.einsum('hblt,bthv->blhv', attn, v, [sz_b, len_q, n_head, d_v]) + output = output.view(sz_b, len_q, 
+        output = self.fc(output)
+        # epilogue
+        output = self.layer_norm(output + residual)
+        return output, attn
\ No newline at end of file
diff --git a/python/examples/attention/reference.py b/python/examples/attention/reference.py
new file mode 100644
index 000000000..e60f474f6
--- /dev/null
+++ b/python/examples/attention/reference.py
@@ -0,0 +1,72 @@
+import numpy as np
+import torch
+import torch.nn as nn
+
+class ScaledDotProductAttention(nn.Module):
+    ''' Scaled Dot-Product Attention '''
+
+    def __init__(self, temperature, attn_dropout=0.1):
+        super().__init__()
+        self.temperature = temperature
+        self.softmax = nn.Softmax(dim=2)
+
+    def forward(self, q, k, v, mask=None):
+        attn = torch.bmm(q, k.transpose(1, 2))
+        attn = attn / self.temperature
+        if mask is not None:
+            attn = attn.masked_fill(mask, -np.inf)
+        attn = self.softmax(attn)
+        output = torch.bmm(attn, v)
+        return output, attn
+
+
+
+class MultiHeadAttention(nn.Module):
+    ''' Multi-Head Attention module '''
+
+    def __init__(self, n_head, d_model, d_k, d_v):
+        super().__init__()
+        self.n_head = n_head
+        self.d_k = d_k
+        self.d_v = d_v
+        # linear layers
+        self.w_qs = nn.Linear(d_model, n_head * d_k)
+        self.w_ks = nn.Linear(d_model, n_head * d_k)
+        self.w_vs = nn.Linear(d_model, n_head * d_v)
+        self.fc = nn.Linear(n_head * d_v, d_model)
+        # initialize weights
+        nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
+        nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
+        nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v)))
+        nn.init.xavier_normal_(self.fc.weight)
+        # normalization
+        self.layer_norm = nn.LayerNorm(d_model)
+        # scaled dot-product
+        self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5))
+
+
+    def forward(self, q, k, v, mask=None):
+        # dimensions
+        d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
+        sz_b, len_q, _ = q.size()
+        sz_b, len_k, _ = k.size()
+        sz_b, len_v, _ = v.size()
+        # linear transformations
+        residual = q
+        q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
+        k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
+        v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)
+        # scaled dot-product attention
+        q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k) # (n*b) x lq x dk
+        k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k) # (n*b) x lk x dk
+        v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v) # (n*b) x lv x dv
+        if mask is not None:
+            mask = mask.repeat(n_head, 1, 1) # (n*b) x .. x ..
+ output, attn = self.attention(q, k, v, mask=mask) + # linear transformation + output = output.view(n_head, sz_b, len_q, d_v) + output = output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1) # b x lq x (n*dv) + output = self.fc(output) + # normalization + output = self.layer_norm(output + residual) + return output, attn \ No newline at end of file diff --git a/python/examples/einsum.py b/python/examples/einsum.py index f61347d0c..e571ec955 100644 --- a/python/examples/einsum.py +++ b/python/examples/einsum.py @@ -11,25 +11,25 @@ configs = [] # Matrix multiplication MNK = [ - (512, 512 ,512), - (2048, 2048, 2048), - (8192, 8192, 8192), + # (512, 512 ,512), + # (2048, 2048, 2048), + # (8192, 8192, 8192), - (64, 64, 64000), - (64, 64, 128000), - (256, 256, 64000), - (256, 256, 128000), + # (64, 64, 64000), + # (64, 64, 128000), + # (256, 256, 64000), + # (256, 256, 128000), - (1536, 16, 1536), - (1536, 32, 1536), - (1536, 64, 1536), - (1536, 128, 1536), - (4096, 16, 4096), - (4096, 32, 4096), - (4096, 64, 4096), - (4096, 128, 4096), + # (1536, 16, 1536), + # (1536, 32, 1536), + # (1536, 64, 1536), + # (1536, 128, 1536), + # (4096, 16, 4096), + # (4096, 32, 4096), + # (4096, 64, 4096), + # (4096, 128, 4096), - #(127008, 768, 576) + # (127008, 768, 576) ] for M, N, K in MNK: matmul = lambda a, b: torch.matmul(a, b) @@ -43,32 +43,32 @@ for M, N, K in MNK: # Relative attention NTHSE = [ - #(16, 512, 1, 64, 64), + (16, 512, 1, 64, 64), # (16, 512, 1, 128, 128), # (16, 512, 1, 256, 256), # (16, 512, 1, 256, 512), - #(16, 512, 8, 64, 64), + (16, 512, 8, 64, 64), # (16, 512, 8, 128, 128), # (16, 512, 8, 256, 256), # (16, 512, 8, 256, 512), # (64, 1024, 1, 64, 64), - #(64, 1024, 1, 128, 128), + (64, 1024, 1, 128, 128), # (64, 1024, 1, 256, 256), # (64, 1024, 1, 256, 512), # (64, 1024, 8, 64, 64), - #(64, 1024, 8, 128, 128), + (64, 1024, 8, 128, 128), # (64, 1024, 8, 256, 256), # (64, 1024, 8, 256, 512), # (128, 1024, 1, 64, 64), # (128, 1024, 1, 128, 128), # (128, 1024, 1, 256, 256), - #(128, 1024, 1, 256, 512), + (128, 1024, 1, 256, 512), # (128, 1024, 8, 64, 64), # (128, 1024, 8, 128, 128), # (128, 1024, 8, 256, 256), - #(128, 1024, 8, 256, 512) + (128, 1024, 8, 256, 512) ] for N, T, H, S, E in NTHSE: configs += [([N, T, H, S], [H, E, S], [N, H, T, E], None, 'nths,hes->nhte', dict())] @@ -168,12 +168,11 @@ for N, C, H, W, K, R, S in NCHWKRS: # Benchmark torch.set_num_threads(1) for a_shape, b_shape, c_shape, torch_fn, expr, arrays in configs: - dtype = torch.cuda.HalfTensor + dtype = torch.cuda.FloatTensor # initialize input tensors a = torch.rand(*a_shape).type(dtype).cuda() b = torch.rand(*b_shape).type(dtype).cuda() # triton output - #ta = triton.ops._einsum.pad(a, [4,4,4,4]) tc = triton.ops.einsum(expr, a, b, c_shape, arrays = arrays, bench = True) # reference output if torch_fn: @@ -183,12 +182,16 @@ for a_shape, b_shape, c_shape, torch_fn, expr, arrays in configs: # performance relative to equivalent matrix multiplication ctx = triton.ctx_registry[tc] B, M, N, K = ctx.matmul_B, ctx.matmul_M, ctx.matmul_N, ctx.matmul_K - # a = torch.rand(B, M, K).type(dtype).cuda() - # b = torch.rand(B, K, N).type(dtype).cuda() - # tmmc = triton.ops.einsum('bmk,bkn->bmn', a, b, [B, M, N], bench = True) - # ratio = triton.bench_registry[tmmc] / triton.bench_registry[tc] - ratio = 0 + cmp_eqbmm = True + if cmp_eqbmm: + a = torch.rand(B, M, K).type(dtype).cuda() + b = torch.rand(B, K, N).type(dtype).cuda() + tmmc = triton.ops.einsum('bmk,bkn->bmn', a, b, [B, M, N], bench = True) + ratio = 
triton.bench_registry[tmmc] / triton.bench_registry[tc] + cmp_str = f'({ratio:4.2f})' + else: + cmp_str = '' # test and benchmark bench = 2. * B * M * N * K / triton.bench_registry[tc] * 1e-3 diff = (tc - rc).abs().max() / rc.abs().max() - print(f'{expr:>15}; {str(a_shape):>20}; {str(b_shape):>20}; {bench:4.2f} ({ratio:4.2f}); {diff:4.2f}') + print(f'{expr:>15}; {str(a_shape):>20}; {str(b_shape):>20}; {bench:4.2f} {cmp_str}; {diff:4.2f}') diff --git a/python/triton/kernel.py b/python/triton/kernel.py index 77177d740..85195790d 100644 --- a/python/triton/kernel.py +++ b/python/triton/kernel.py @@ -206,6 +206,9 @@ class kernel: self.cst[name] = value def __call__(self, *args, **kwargs): + ######################### + # cache + ######################## # create a new framework op when defines are different key = '-'.join(['{key}-{val}'.format(key=key, val=val) for key, val in kwargs.items()]) if key not in self.fw_id.keys(): @@ -230,17 +233,18 @@ class kernel: libtriton.register_cst(op_id, name, value) if self.fw_op is None: self.fw_op = _make_framework_op(self.src, self.outputs, self.tmp, opt) - # benchmarking info - bench = 0 - if 'bench' in kwargs: - bench = kwargs['bench'] - # retrieve framework op + + ######################## + # initialize + ######################## op_id = self.fw_id[key] - # register grid libtriton.register_grid(op_id, args[-1]) - # id for the benchmark result + bench = kwargs['bench'] if 'bench' in kwargs else 0 bench_id = libtriton.make_scalar_id() if bench > 0 else -1 + + ######################### # call framework function + ######################### if fw.has_tensorflow(): empty = [x for x in args[:-1] if isinstance(x, triton.utils.tf_empty_proxy)] if len(empty) != len(self.outputs): @@ -268,10 +272,15 @@ class kernel: if bench > 0: for y in ret: bench_registry[y] = triton.utils.id_dict.lazy_entry(bench_id) + + ############################ + # call torch function + ############################ elif fw.has_torch(): args = [x if isinstance(x, fw.torch.Tensor) else x for x in args[:-1]] ret = self.fw_op(op_id, bench, bench_id, *args) if bench > 0: bench_registry[ret] = libtriton.retrieve_scalar(bench_id) + else: assert False \ No newline at end of file diff --git a/python/triton/ops/einsum.py b/python/triton/ops/einsum.py index ff29432e5..dbb236b18 100644 --- a/python/triton/ops/einsum.py +++ b/python/triton/ops/einsum.py @@ -536,8 +536,8 @@ __global__ void {name}( self.pos_b = 1 self.pos_c = 2 # pre-processor macros - TM = [x for x in [16, 32, 64, 128] if x <= M] - TN = [x for x in [16, 32, 64, 128] if x <= N] + TM = [16] + [x for x in [32, 64, 128] if x <= M] + TN = [16] + [x for x in [32, 64, 128] if x <= N] TB = [x for x in [1, 2, 4] if x <= B] MAX_GZ = K // 2048 MIN_GM = M // max(TM) @@ -546,8 +546,8 @@ __global__ void {name}( TZ = [x for x in [1, 2, 4, 8, 16, 32] \ if x < MAX_GZ and x*MIN_GM*MIN_GN*MIN_GB < 256] TZ = [1] if not TZ else [TZ[-1], TZ[-1]*2] - #TB, TZ = [1], [1] - #TM, TN, TB, TZ = [128], [128], [1], [1] + TM, TN, TB = [128], [64], [1] + #print(TM, TN, TB) self.macros = { 'TM': TM, 'TN': TN, 'TB': TB, 'TK': TK, 'TZ': TZ, 'TYPE': dtype } self.dtype = dtype self.flops = 2 * B * M * N * K @@ -582,13 +582,13 @@ __global__ void {name}( # allocate output dtype = a.dtype c = triton.empty(shape_c, dtype=dtype) + # compile einsum instance + cache = _einsum.instance_cache key = (einsum, dtype, a.stride(), b.stride(), c.stride(), a.shape, b.shape, c.shape) - # compile einsum instance - cache = _einsum.instance_cache - #if key not in cache: - cache[key] = 
_einsum.instance(einsum, dtype, + if key not in cache: + cache[key] = _einsum.instance(einsum, dtype, a.stride(), b.stride(), c.stride(), a.shape, b.shape, c.shape, arrays) instance = cache[key] diff --git a/python/triton/utils.py b/python/triton/utils.py index 117f69136..da8a1e8f9 100644 --- a/python/triton/utils.py +++ b/python/triton/utils.py @@ -24,7 +24,7 @@ def empty(shape, dtype): return tf_empty_proxy(shape, dtype) #return fw.tf_extra_ops.alloc_empty(args, T = dtype) elif fw.has_torch(): - return fw.torch.empty(shape, dtype=dtype).cuda() + return fw.torch.empty(shape, dtype=dtype, device='cuda:0') def shape(A) : if fw.has_tensorflow(): From db941161ed3c8d8737290fbeab02901107be01f5 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 22 Jan 2020 18:09:00 -0500 Subject: [PATCH 492/494] [PYTHON][EXAMPLES] Cleaned self-attention benchmarks --- python/examples/attention/bench.py | 11 ++++++----- python/triton/ops/einsum.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/python/examples/attention/bench.py b/python/examples/attention/bench.py index 99c1722eb..abd4ed24c 100644 --- a/python/examples/attention/bench.py +++ b/python/examples/attention/bench.py @@ -4,7 +4,7 @@ import reference import optimized from time import time -use_half = False +use_half = True def cast(x): if use_half: return x.half() @@ -14,9 +14,9 @@ def cast(x): # GPU device device = torch.device("cuda:0") # shapes -batch, nhead = 16, 8 -dm, dk, dv = 512, 512, 512 -lq, lk, lv = 256, 256, 256 +batch, nhead = 8, 28 +dm, dk, dv = 1024, 1024, 1024 +lq, lk, lv = 1024, 1024, 1024 # initialize tensors torch.manual_seed(0) np.random.seed(0) @@ -44,4 +44,5 @@ start = time() toutput, _ = tattn(query, key, value) end = time() ttime = end - start -print(rtime, ttime) \ No newline at end of file +print(f'Torch: {rtime} s') +print(f'Triton: {ttime} s') \ No newline at end of file diff --git a/python/triton/ops/einsum.py b/python/triton/ops/einsum.py index dbb236b18..7af856be3 100644 --- a/python/triton/ops/einsum.py +++ b/python/triton/ops/einsum.py @@ -546,7 +546,7 @@ __global__ void {name}( TZ = [x for x in [1, 2, 4, 8, 16, 32] \ if x < MAX_GZ and x*MIN_GM*MIN_GN*MIN_GB < 256] TZ = [1] if not TZ else [TZ[-1], TZ[-1]*2] - TM, TN, TB = [128], [64], [1] + #TM, TN, TB = [128], [64], [1] #print(TM, TN, TB) self.macros = { 'TM': TM, 'TN': TN, 'TB': TB, 'TK': TK, 'TZ': TZ, 'TYPE': dtype } self.dtype = dtype From 2fcf5cec5b471de66dcc1e50c6dad6ed9e4a3acf Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 24 Jan 2020 15:25:00 -0500 Subject: [PATCH 493/494] [TRITON][CODEGEN] Fixed flawed assert() --- include/triton/tools/bench.hpp | 2 +- lib/codegen/selection/machine_layout.cc | 4 ++-- lib/codegen/transform/membar.cc | 3 ++- python/triton/kernel.py | 2 +- tests/bench/dot.cc | 6 +++--- tests/unit/dot.cc | 2 +- 6 files changed, 10 insertions(+), 9 deletions(-) diff --git a/include/triton/tools/bench.hpp b/include/triton/tools/bench.hpp index 430418b27..e1807f25e 100644 --- a/include/triton/tools/bench.hpp +++ b/include/triton/tools/bench.hpp @@ -38,7 +38,7 @@ inline double bench(std::function const & op, driver::stream * stream, b double total_time = 0; op(); stream->synchronize(); - while(total_time*1e-9 < 1e-1){ + while(total_time*1e-9 < 1e-2){ float norm = 1; // normalize clock if possible to reduce noise in auto-tuning if(normalize) diff --git a/lib/codegen/selection/machine_layout.cc b/lib/codegen/selection/machine_layout.cc index e1fcc8fe6..7ed00c586 100644 --- a/lib/codegen/selection/machine_layout.cc 
+++ b/lib/codegen/selection/machine_layout.cc @@ -192,11 +192,11 @@ machine_mma884_layout::machine_mma884_layout(Module *mod, Builder *builder, unsigned wpt_0 = layout->wpt(0); unsigned wpt_1 = layout->wpt(1); unsigned wpt_2 = is_batched ? layout->wpt(2) : 1; - // hmma warp tile size + // mma warp tile size unsigned hmma_wts_0 = fpw_0 * 8; unsigned hmma_wts_1 = fpw_1 * 8; unsigned hmma_wts_2 = is_batched ? fpw_2 : 1; - // hmma block tile size + // mma block tile size unsigned hmma_bts_0 = hmma_wts_0 * wpt_0; unsigned hmma_bts_1 = hmma_wts_1 * wpt_1; unsigned hmma_bts_2 = is_batched ? hmma_wts_2 * wpt_2 : 1; diff --git a/lib/codegen/transform/membar.cc b/lib/codegen/transform/membar.cc index 0a9b0235b..450e98315 100644 --- a/lib/codegen/transform/membar.cc +++ b/lib/codegen/transform/membar.cc @@ -36,7 +36,8 @@ void membar::add_reference(ir::value *v, interval_vec_t &res){ if(!i->get_type()->is_tile_ty()) return; analysis::shared_layout* layout = layouts_->get(v)->to_shared(); - assert(layout); + if(!layout) + return; if(alloc_->has_offset(layout)){ unsigned offset = alloc_->offset(layout); res.push_back(interval_t(offset, offset + layout->get_size())); diff --git a/python/triton/kernel.py b/python/triton/kernel.py index 85195790d..f0aef5614 100644 --- a/python/triton/kernel.py +++ b/python/triton/kernel.py @@ -223,7 +223,7 @@ class kernel: defines.append((k, values)) opt = libtriton.options_space() opt.defines = defines - opt.num_warps = [2, 4] + opt.num_warps = [4] # create unique id for this op op_id = libtriton.make_op_id() self.fw_id[key] = op_id diff --git a/tests/bench/dot.cc b/tests/bench/dot.cc index d118b95be..131083e72 100644 --- a/tests/bench/dot.cc +++ b/tests/bench/dot.cc @@ -12,8 +12,8 @@ int main() { for(auto ord: std::vector>{{1, 0}}) for(auto x: std::vector>{{false, false}, {true, false}}){ std::vector tmp = { - config_t{ord, x[0], x[1], 512, 512, 512}, - config_t{ord, x[0], x[1], 2048, 2048, 2048}, +// config_t{ord, x[0], x[1], 512, 512, 512}, + config_t{ord, x[0], x[1], 8192, 8192, 8192}, // config_t{ord, x[0], x[1], 127008, 768, 576}, // config_t{ord, x[0], x[1], 8192, 8192, 8192} // config_t{ord, x[0], x[1], 16, 2048, 2048}, @@ -36,7 +36,7 @@ int main() { for(const auto& c: configs){ std::tie(ord, AT, BT, M, N, K) = c; std::cout << "// " << c ; - for(auto perf: bench_dot(stream, FLOAT, AT, BT, M, N, K, ord, ord)) + for(auto perf: bench_dot(stream, HALF, AT, BT, M, N, K, ord, ord)) std::cout << ", " << perf << std::flush; std::cout << std::endl; } diff --git a/tests/unit/dot.cc b/tests/unit/dot.cc index 283951377..6c24386ea 100644 --- a/tests/unit/dot.cc +++ b/tests/unit/dot.cc @@ -16,7 +16,7 @@ int main() { for(int nwarps: std::vector{4}) for(bool AT: std::array{false, true}) for(bool BT: std::array{false, true}){ - configs.push_back(config_t{HALF, AT, BT, 128, 128, 128, TM, TN, TK, nwarps}); + configs.push_back(config_t{HALF, AT, BT, TM, TN, TK, TM, TN, TK, nwarps}); } // test dtype_t dtype; From 3e92901bd561aadb8b19a1f2ad7454c4431be52c Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Fri, 31 Jan 2020 14:04:42 -0500 Subject: [PATCH 494/494] [TRITON][PYTHON] Cleaned up API --- python/examples/einsum.py | 6 +- python/triton/kernel.py | 42 +++-- python/triton/ops/__init__.py | 4 +- python/triton/ops/batchnorm.py | 8 +- python/triton/ops/conv.py | 309 --------------------------------- python/triton/ops/dot.py | 126 -------------- python/triton/ops/einsum.py | 13 +- 7 files changed, 41 insertions(+), 467 deletions(-) delete mode 100644 python/triton/ops/conv.py delete 
mode 100644 python/triton/ops/dot.py diff --git a/python/examples/einsum.py b/python/examples/einsum.py index e571ec955..4ec2dea36 100644 --- a/python/examples/einsum.py +++ b/python/examples/einsum.py @@ -11,9 +11,9 @@ configs = [] # Matrix multiplication MNK = [ - # (512, 512 ,512), - # (2048, 2048, 2048), - # (8192, 8192, 8192), + (512, 512 ,512), + (2048, 2048, 2048), + (8192, 8192, 8192), # (64, 64, 64000), # (64, 64, 128000), diff --git a/python/triton/kernel.py b/python/triton/kernel.py index f0aef5614..71b79bb99 100644 --- a/python/triton/kernel.py +++ b/python/triton/kernel.py @@ -177,12 +177,12 @@ def _make_framework_op(src, outputs, tmp, options): else: assert False -def _make_grid(args) : - scalars = [x for x in args[:-1] if isinstance(x, triton.utils.scalar)] +def _make_grid(grid, args) : + scalars = [x for x in args if isinstance(x, triton.utils.scalar)] def grid(opt): for x in scalars: x.set_assume_initialized() - result = args[-1](opt) + result = grid(opt) for x in scalars: x.unset_assume_initialized() return result @@ -206,24 +206,37 @@ class kernel: self.cst[name] = value def __call__(self, *args, **kwargs): + + ######################## + # keyword arguments + ######################## + num_warps = kwargs['num_warps'] if 'num_warps' in kwargs else [2, 4, 8] + defines = kwargs['defines'] if 'defines' in kwargs else dict() + bench = kwargs['bench'] if 'bench' in kwargs else 0 + if 'grid' not in kwargs: + raise RuntimeError('Must provide grid for kernel launch') + grid = kwargs['grid'] + + ######################### # cache ######################## + # create a new framework op when defines are different - key = '-'.join(['{key}-{val}'.format(key=key, val=val) for key, val in kwargs.items()]) + key = '-'.join(['{key}-{val}'.format(key=key, val=val) for key, val in defines.items()]) if key not in self.fw_id.keys(): # code generation options - defines = [] - for k, v in kwargs.items(): + macros = [] + for k, v in defines.items(): cvt = lambda x: _cvt_to_def_str(x) if(isinstance(v, list)): values = list(map(cvt, v)) else: values = [cvt(v)] - defines.append((k, values)) + macros.append((k, values)) opt = libtriton.options_space() - opt.defines = defines - opt.num_warps = [4] + opt.defines = macros + opt.num_warps = [2, 4, 8] # create unique id for this op op_id = libtriton.make_op_id() self.fw_id[key] = op_id @@ -238,22 +251,21 @@ class kernel: # initialize ######################## op_id = self.fw_id[key] - libtriton.register_grid(op_id, args[-1]) - bench = kwargs['bench'] if 'bench' in kwargs else 0 + libtriton.register_grid(op_id, grid) bench_id = libtriton.make_scalar_id() if bench > 0 else -1 ######################### # call framework function ######################### if fw.has_tensorflow(): - empty = [x for x in args[:-1] if isinstance(x, triton.utils.tf_empty_proxy)] + empty = [x for x in args if isinstance(x, triton.utils.tf_empty_proxy)] if len(empty) != len(self.outputs): raise ValueError('Number of empty arguments does not much number of outputs provided') # operands - operands = [x.shape if isinstance(x, triton.utils.tf_empty_proxy) else x for x in args[:-1]] + operands = [x.shape if isinstance(x, triton.utils.tf_empty_proxy) else x for x in args] # output data types kwargs = {'id': op_id, 'bench': bench, 'bench_id': bench_id} - for i, x in enumerate(args[:-1]): + for i, x in enumerate(args): if isinstance(x, triton.utils.tf_empty_proxy): kwargs['T' + str(i)] = x.dtype # launch @@ -277,7 +289,7 @@ class kernel: # call torch function ############################ elif 
fw.has_torch(): - args = [x if isinstance(x, fw.torch.Tensor) else x for x in args[:-1]] + args = [x if isinstance(x, fw.torch.Tensor) else x for x in args] ret = self.fw_op(op_id, bench, bench_id, *args) if bench > 0: bench_registry[ret] = libtriton.retrieve_scalar(bench_id) diff --git a/python/triton/ops/__init__.py b/python/triton/ops/__init__.py index 6e4ded8c7..ea638f76c 100644 --- a/python/triton/ops/__init__.py +++ b/python/triton/ops/__init__.py @@ -1,4 +1,2 @@ -from .dot import _dot, dot from .einsum import _einsum, einsum -from .batchnorm import _batchnorm, batchnorm -from .conv import _conv, conv \ No newline at end of file +from .batchnorm import _batchnorm, batchnorm \ No newline at end of file diff --git a/python/triton/ops/batchnorm.py b/python/triton/ops/batchnorm.py index 9409134d9..117cca3b1 100644 --- a/python/triton/ops/batchnorm.py +++ b/python/triton/ops/batchnorm.py @@ -101,8 +101,8 @@ void bwdbatchnorm(float *DX, float *DG, float *DB, var = triton.empty([C], dtype=dtype) # execute kernels _batchnorm.fwd_kernel(y, mean, var, x, gamma, beta, H*W*B, eps, - lambda opt: [1, C], - TM = 128) + grid = lambda opt: [1, C], + defines = {'TM': 128}) # save ctx.save_for_backward(x, gamma, beta, mean, var) ctx.eps = eps @@ -122,8 +122,8 @@ void bwdbatchnorm(float *DX, float *DG, float *DB, _batchnorm.bwd_kernel(dx, dgamma, dbeta, dy, x, gamma, mean, var, H*W*B, eps, - lambda opt: [1, C], - TM = 128) + grid = lambda opt: [1, C], + defines = {'TM': 128}) return dx, dgamma, dbeta, None batchnorm = _batchnorm.apply \ No newline at end of file diff --git a/python/triton/ops/conv.py b/python/triton/ops/conv.py deleted file mode 100644 index 8bd0acbd3..000000000 --- a/python/triton/ops/conv.py +++ /dev/null @@ -1,309 +0,0 @@ -import triton -import numpy as np - -class _conv(triton.function): - - src = """ -void convnd(A_TYPE *A, - B_TYPE *B, - float *C, - int M, int N, int K, - int AH, int AW, - int BH, int BW, - int CH, int CW, - int NC, - int lda_n, int lda_c, int lda_d, int lda_h, int lda_w, - int ldb_c, int ldb_t, int ldb_r, int ldb_s, int ldb_k, - int ldc_n, int ldc_k, int ldc_m, int ldc_p, int ldc_q, - int pad_h, int pad_w, - int stride_h, int stride_w, - int upsample_h, int upsample_w, - int off_uh, int off_uw, - int off_uah, int off_uaw, - int off_uch, int off_ucw, - int* ADELTA, int* ADIFF){ - - // range of indices along the reduction axis - int rxa[TM] = get_program_id(0) * TM + 0 ... TM; - int ryb[TN] = get_program_id(1) * TN + 0 ... TN; - int rk[TK] = 0 ... 
TK; - - // initialize accumulator - float c[TM, TN] = 0; - - // pointers for A - int rabh[TM] = rxa / CW; - int raw[TM] = rxa % CW; - int rab[TM] = rabh / CH; - int rah[TM] = rabh % CH; - rah = rah * UPAW - off_uah; - raw = raw * UPAH - off_uaw; - int ram[TM] = rab*lda_n + rah*lda_h + raw*lda_w; - int rak[TK] = *(ADELTA + rk); - A_TYPE* pa[TM, TK] = A + ram[:, newaxis] + rak[newaxis, :]; - - // pointers for B - int rbk[TK] = rk; - int rbn[TN] = ryb; - B_TYPE* pb[TK, TN] = B + rbn[newaxis, :] * ldb_k + rbk[:, newaxis] * ldb_c; - - // pointers for A look-up table - int rklut[TK] = rk % LUT_SIZE; - int* padiff[TK] = ADIFF + rklut; - int* padelta[TK] = ADELTA + TK + rklut + off_uw * LUT_SIZE + off_uh * LUT_SIZE * upsample_w; - int adiff[TK] = *padiff; - int adelta[TK] = *padelta; - - // reduction loop - A_TYPE a[TM, TK] = *pa; - B_TYPE b[TK, TN] = *pb; - for(int k = K; k > 0; k = k - TK){ - c += a @ b; - pa += adelta[newaxis, :]; - pb += TK * ldb_c; - // increment A look-up table - padelta = padelta + adiff; - adelta = *padelta; - padiff = padiff + adiff; - adiff = *padiff; - // pre-fetches - bool checka[TM, TK] = k > TK; - bool checkb[TK, TN] = k > TK; - a = checka ? *pa : 0; - b = checkb ? *pb : 0; - } - - // write back - int rxc[TM] = get_program_id(0) * TM + 0 ... TM; - int rc1[TN] = get_program_id(1) * TN + 0 ... TN; - int rcn[TM] = rxc / (CH*CW); - int rcpq[TM] = rxc % (CH*CW); - int rcp[TM] = rcpq / CW; - int rcq[TM] = rcpq % CW; - rcp = rcp * upsample_h + off_uch; - rcq = rcq * upsample_w + off_ucw; - int rc0[TM] = rcn * ldc_n + rcp * ldc_p + rcq * ldc_q; - float* pc[TM, TN] = C + rc1[newaxis, :]*ldc_k + rc0[:, newaxis]; - bool checkc0[TM] = rxc < M; - bool checkc1[TN] = rc1 < N; - bool checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :]; - *?(checkc)pc = c; -} -""" - kernel = triton.kernel(src, ['C']) - - @staticmethod - def _unpack(idx, order, shape_b): - _123 = idx // shape_b[order[0]] - _0 = idx % shape_b[order[0]] - _23 = _123 // shape_b[order[1]] - _1 = _123 % shape_b[order[1]] - _3 = _23 // shape_b[order[2]] - _2 = _23 % shape_b[order[2]] - return _0, _1, _2, _3 - - @staticmethod - def _roundup(x, div): - return (x + div - 1) // div * div - - @staticmethod - def _delta_a(upsample_d, upsample_h, upsample_w, - bc, bd, bh, bw, - ac, ad, ah, aw, - stride_a, shape_b, - TK): - # Parse the axes so that the reduction is done - # from the innermost dimension outward - order = sorted([bc, bd, bh, bw], reverse = True) - c, d, h, w = [order.index(x) for x in [bc, bd, bh, bw]] - # Size of the lookup table is the product of the 3 innermost dimensions - K = _conv._roundup(TK, shape_b[order[0]] * shape_b[order[1]] * shape_b[order[2]]) - # Allocate temporary arrays - ud = np.arange(upsample_d, dtype=np.int32)[:, np.newaxis, np.newaxis, np.newaxis] - uh = np.arange(upsample_h, dtype=np.int32)[np.newaxis, :, np.newaxis, np.newaxis] - uw = np.arange(upsample_w, dtype=np.int32)[np.newaxis, np.newaxis, :, np.newaxis] - k = np.arange(K , dtype=np.int32)[np.newaxis, np.newaxis, np.newaxis, :] - # Find reduction indices at the current and next reduction indices - currentk = _conv._unpack(k , order, shape_b) - nextk = _conv._unpack(k + TK, order, shape_b) - # Compute memory stride - result = 0 - result += (nextk[c] - currentk[c]) * stride_a[ac] - result += (nextk[d] - currentk[d]) * stride_a[ad] - result += (nextk[h] - currentk[h]) * stride_a[ah] - result += (nextk[w] - currentk[w]) * stride_a[aw] - # Initial k - ki = np.arange(TK , dtype=np.int32)[np.newaxis, np.newaxis, np.newaxis, :] - 
currentk = _conv._unpack(ki, order, shape_b) - resulti = 0 - resulti += currentk[c] * stride_a[ac] - resulti += currentk[d] * stride_a[ad] - resulti += currentk[h] * stride_a[ah] - resulti += currentk[w] * stride_a[aw] - return np.concatenate((resulti, result), axis=-1) - - @staticmethod - def _extract_strides(shape): - rank = len(shape) - ret = [1] * rank - for i in range(rank - 1, 0, -1): - ret[i-1] = ret[i] * shape[i] - return ret - - - @staticmethod - def _call(a, b, - pad_d, pad_h, pad_w, - stride_d, stride_h, stride_w, - upsample_d, upsample_h, upsample_w, - a_layout, b_layout, c_layout): - # input shapes - shape_a = list(triton.shape(a)) - shape_b = list(triton.shape(b)) - dim = len(shape_a) - 2 - # indices - an, ac, ad, ah, aw = [a_layout.find(x) for x in 'ncdhw'] - bk, bc, bd, bh, bw = [b_layout.find(x) for x in 'kctrs'] - cn, ck, cd, ch, cw = [c_layout.find(x) for x in 'nkdhw'] - # extract shapes - if dim == 2: - shape_a.insert(ad, 1) - if dim == 2: - shape_b.insert(bd, 1) - # output shape - shape_c = [0] * 5 - shape_c[cn] = shape_a[an] - shape_c[ck] = shape_b[bk] - shape_c[cd] = (shape_a[ad]*upsample_d - shape_b[bd] + 1 + 2*pad_d + stride_d - 1) // stride_d - shape_c[ch] = (shape_a[ah]*upsample_h - shape_b[bh] + 1 + 2*pad_h + stride_h - 1) // stride_h - shape_c[cw] = (shape_a[aw]*upsample_w - shape_b[bw] + 1 + 2*pad_w + stride_w - 1) // stride_w - # strides - stride_a = _conv._extract_strides(shape_a) - stride_b = _conv._extract_strides(shape_b) - stride_c = _conv._extract_strides(shape_c) - # tiling parameters - TM = [32] - TN = [32] - TK = 8 - # pointer deltas for a - delta_a = _conv._delta_a(upsample_d, upsample_h, upsample_w, - bc, bd, bh, bw, - ac, ad, ah, aw, - stride_a, shape_b, - TK) - delta_a = triton.fw.torch.from_numpy(delta_a).cuda() - # delta increments for a - inc_a = np.arange(delta_a.shape[-1] - TK, dtype=np.int32) - inc_a = ((inc_a + TK) % inc_a.size) - inc_a - inc_a = triton.fw.torch.from_numpy(inc_a).cuda() - # allocate output - if dim == 2: - shape_c.pop(cd) - c = triton.empty(shape_c, dtype=a.dtype) - if dim == 2: - shape_c.insert(cd, 1) - # execute kernel - trans_b = False - is_wgrad = False - is_blut = False - macros = { - 'UPAR': 'stride_h' if is_wgrad else '1', - 'UPAS': 'stride_w' if is_wgrad else '1', - 'UPAH': '' if is_wgrad else 'stride_h', - 'UPAW': '' if is_wgrad else 'stride_w', - 'LUT_SIZE': delta_a.shape[-1], - 'TM': TM, 'TN': TN, 'TK': TK, - 'A_TYPE': 'float', 'B_TYPE': 'float' - } - MATMUL_M = shape_c[cn] * shape_c[cd] * shape_c[ch] * shape_c[cw] - MATMUL_N = shape_c[ck] - MATMUL_K = shape_b[bc] * shape_b[bd] * shape_b[bh] * shape_b[bw] - _conv.kernel(a, b, c, - # matrix multiplication shapes - MATMUL_M, MATMUL_N, MATMUL_K, - # shapes for a - shape_a[ah], shape_a[aw], - # shapes for b - shape_b[bh], shape_b[bw], - # chapes for c - shape_c[ch], shape_c[cw], shape_c[cn], - # strides for a - stride_a[an], stride_a[ac], stride_a[ad + 0], stride_a[ad + 1], stride_a[ad + 2], - # strides for b - stride_b[bc], stride_b[bd + 0], stride_b[bd + 1], stride_b[bd + 2], stride_b[bk], - # strides for c - stride_c[cn], stride_c[ck], stride_c[cd], stride_c[cd + 1], stride_c[cd + 2], - # padding - pad_h, pad_w, - # striding - stride_h, stride_w, - # upsampling - upsample_h, upsample_w, - 0, 0, 0, 0, 0, 0, - # look-up table - delta_a, inc_a, - lambda opt: [triton.cdiv(MATMUL_M, opt.d('TM')), triton.cdiv(MATMUL_N, opt.d('TN'))], - **macros) - return c - - @staticmethod - def forward(ctx, x, w, - pad_d = 0, pad_h = 0, pad_w = 0, - stride_d = 1, stride_h = 1, 
stride_w = 1, - upsample_d = 1, upsample_h = 1, upsample_w = 1, - layout_a = 'ncdhw', layout_b = 'ktrsc', layout_c = 'nkdhw'): - # save for backward - ctx.save_for_backward(x, w) - ctx.pad_d = pad_d - ctx.pad_h = pad_h - ctx.pad_w = pad_w - ctx.stride_d = stride_d - ctx.stride_h = stride_h - ctx.stride_w = stride_w - ctx.upsample_d = upsample_d - ctx.upsample_h = upsample_h - ctx.upsample_w = upsample_w - ctx.layout_a = layout_a - ctx.layout_b = layout_b - ctx.layout_c = layout_c - # return - return _conv._call(x, w, - pad_d, pad_h, pad_w, - stride_d, stride_h, stride_w, - upsample_d, upsample_h, upsample_w, - layout_a, layout_b, layout_c) - - @staticmethod - def backward(ctx, dy): - x, w = ctx.saved_tensors - pad_d = ctx.pad_d - pad_h = ctx.pad_h - pad_w = ctx.pad_w - stride_d = ctx.stride_d - stride_h = ctx.stride_h - stride_w = ctx.stride_w - upsample_d = ctx.upsample_d - upsample_h = ctx.upsample_h - upsample_w = ctx.upsample_w - layout_a = ctx.layout_a - layout_b = ctx.layout_b - layout_c = ctx.layout_c - - # TODO: Deal with this - dx_pad_d = 1 - dx_pad_h = 1 - dx_pad_w = 1 - dx = _conv.call(dy, w, - dw_pad_d, dw_pad_h, dw_pad_w, - upsample_w, upsample_h, upsample_w, - stride_d, stride_h, stride_w, - 'ncdhw', 'cktrs', 'nkdhw') - - - - ret = [None] * 14 - ret[0] = None - ret[1] = dw - return None, - -conv = _conv.apply \ No newline at end of file diff --git a/python/triton/ops/dot.py b/python/triton/ops/dot.py deleted file mode 100644 index 89b28d20e..000000000 --- a/python/triton/ops/dot.py +++ /dev/null @@ -1,126 +0,0 @@ -import triton - -class _dot(triton.function): - - src = """ -void dot(TYPE * A __noalias __readonly __aligned(16), - TYPE * B __noalias __readonly __aligned(16), - TYPE * C, - float alpha, - int M, int N, int K, - int lda __multipleof(8), - int ldb __multipleof(8), - int ldc) { - // prologue - int ridx = get_program_id(0); - int ridy = get_program_id(1); - int rm[TM] = ridx * TM + 0 ... TM; - int rn[TN] = ridy * TN + 0 ... TN; - int rk[TK] = 0 ... TK; - - // pointers to operands - TYPE* pa[SHAPE_A] = A + rk[BROADCAST_AK] * STRIDE_AK + rm[BROADCAST_AM] * STRIDE_AM; - TYPE* pb[SHAPE_B] = B + rk[BROADCAST_BK] * STRIDE_BK + rn[BROADCAST_BN] * STRIDE_BN; - - // prefetches operands - bool checka[SHAPE_A] = rk[BROADCAST_AK] < K; - bool checkb[SHAPE_B] = rk[BROADCAST_BK] < K; - TYPE a[SHAPE_A] = checka ? *pa : 0; - TYPE b[SHAPE_B] = checkb ? *pb : 0; - - // reduction loop - float c[TM, TN] = 0; - for(int k = K; k > 0; k -= TK){ - c += USE_A @ USE_B; - bool checka[SHAPE_A] = k > TK; - bool checkb[SHAPE_B] = k > TK; - pa += TK * STRIDE_AK; - pb += TK * STRIDE_BK; - a = *?(checka)pa; - b = *?(checkb)pb; - } - //c = c * alpha; - - // epilogue - int rxm[TM] = get_program_id(0) * TM + 0 ... TM; - int rxn[TN] = get_program_id(1) * TN + 0 ... 
TN; - TYPE* pc[TM, TN] = C + rxm[:, newaxis] * ldc + rxn[newaxis, :]; - bool checkc[TM, TN] = (rxm[:, newaxis] < M) && (rxn[newaxis, :] < N); - *?(checkc)pc = (TYPE[TM, TN])c; -} -""" - kernel = triton.kernel(src, ['C']) - - @staticmethod - def _call(a, b, transpose_a, transpose_b, bench): - # extract shapes - shape_a = triton.shape(a) - shape_b = triton.shape(b) - M, Ka = shape_a[0], shape_a[1] - Kb, N = shape_b[0], shape_b[1] - # transpose shapes - if transpose_a: - M, Ka = Ka, M - if transpose_b: - Kb, N = N, Kb - # contiguous dimensions - lda = M if transpose_a else Ka - ldb = Kb if transpose_b else N - ldc = N - # data-type - dtype = a.dtype - # allocate output - c = triton.empty([M, N], dtype = dtype) - # compute - grid = lambda opt: [triton.cdiv(M, opt.d('TM')), triton.cdiv(N, opt.d('TN'))] - # macros -- not necessary but makes kernel source-code simpler - macros = {# handle A transposition - 'USE_A' : '^a' if transpose_a else 'a', - 'STRIDE_AK' : 'lda' if transpose_a else '1', - 'STRIDE_AM' : '1' if transpose_a else 'lda', - 'BROADCAST_AK': ':, newaxis' if transpose_a else 'newaxis, :', - 'BROADCAST_AM': 'newaxis, :' if transpose_a else ':, newaxis', - 'SHAPE_A' : 'TK, TM' if transpose_a else 'TM, TK', - # handle B transposition - 'USE_B' : '^b' if transpose_b else 'b', - 'STRIDE_BK' : '1' if transpose_b else 'ldb', - 'STRIDE_BN' : 'ldb' if transpose_b else '1', - 'BROADCAST_BK': 'newaxis, :' if transpose_b else ':, newaxis', - 'BROADCAST_BN': ':, newaxis' if transpose_b else 'newaxis, :', - 'SHAPE_B' : 'TN, TK' if transpose_b else 'TK, TN'} - _dot.kernel(a, b, c, 1., M, N, Ka, lda, ldb, ldc, - grid, bench=bench, - AT = transpose_a, BT = transpose_b, TYPE = dtype, - TM = [64], TN = [128], TK = [8], **macros) - return c - - @staticmethod - def forward(ctx, a, b, transpose_a = False, transpose_b = False, bench = 0): - ctx.save_for_backward(a, b) - ctx.t_a = transpose_a - ctx.t_b = transpose_b - ctx.bench = bench - return _dot._call(a, b, transpose_a, transpose_b, bench) - - @staticmethod - def backward(ctx, dy): - a, b = ctx.saved_tensors - t_a, t_b = ctx.t_a, ctx.t_b - bench = ctx.bench - if not t_a and not t_b: - da = _dot._call(dy, b, False, True, bench) - db = _dot._call(a, dy, True, False, bench) - elif not t_a and t_b: - da = _dot._call(dy, b, False, False, bench) - db = _dot._call(dy, a, True, False, bench) - elif t_a and not t_b: - da = _dot._call(b, dy, False, True, bench) - db = _dot._call(a, dy, False, False, bench) - elif t_a and t_b: - da = _dot._call(b, dy, True, True, bench) - db = _dot._call(dy, a, True, True, bench) - else: - assert False - return da, db, None, None, None - -dot = _dot.apply \ No newline at end of file diff --git a/python/triton/ops/einsum.py b/python/triton/ops/einsum.py index 7af856be3..936a5fced 100644 --- a/python/triton/ops/einsum.py +++ b/python/triton/ops/einsum.py @@ -527,10 +527,10 @@ __global__ void {name}( delta_b = delta_b[0] if lut_mode_b == _einsum.LUT_MODE.SCALAR else torch.from_numpy(delta_b).cuda() self.args += [delta_b] self.args += arrays - self.args += [lambda opt: [triton.cdiv(M, opt.d('TM')) * - triton.cdiv(N, opt.d('TN')), - triton.cdiv(B, opt.d('TB')), - opt.d('TZ')]] + self.grid = lambda opt: [triton.cdiv(M, opt.d('TM')) * + triton.cdiv(N, opt.d('TN')), + triton.cdiv(B, opt.d('TB')), + opt.d('TZ')] # position of dynamic arguments self.pos_a = 0 self.pos_b = 1 @@ -546,9 +546,8 @@ __global__ void {name}( TZ = [x for x in [1, 2, 4, 8, 16, 32] \ if x < MAX_GZ and x*MIN_GM*MIN_GN*MIN_GB < 256] TZ = [1] if not TZ else [TZ[-1], 
TZ[-1]*2] - #TM, TN, TB = [128], [64], [1] - #print(TM, TN, TB) self.macros = { 'TM': TM, 'TN': TN, 'TB': TB, 'TK': TK, 'TZ': TZ, 'TYPE': dtype } + # information on compute self.dtype = dtype self.flops = 2 * B * M * N * K self.sym_a = sym_a @@ -564,7 +563,7 @@ __global__ void {name}( self.args[self.pos_a] = a self.args[self.pos_b] = b self.args[self.pos_c] = c - self.kernel(*self.args, bench=bench, **self.macros) + self.kernel(*self.args, grid=self.grid, bench=bench, defines=self.macros)
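
A note on the einsum formulation introduced by patch 491: optimized.py collapses the permute/view/bmm pipeline of reference.py into two tensor contractions, 'blhk,bthk->hblt' for the attention scores and 'hblt,bthv->blhv' for the weighted sum. The sketch below checks this equivalence on CPU, with torch.einsum standing in for triton.ops.einsum (whose extra list argument is the output shape); the tiny sizes are stand-ins, not the benchmark shapes.

    import math
    import torch

    # tiny stand-in sizes (the benchmark itself uses batch=16, nhead=8, d=512)
    b, l, t, h, dk, dv = 2, 5, 7, 3, 4, 6
    q = torch.randn(b, l, h, dk)   # (batch, len_q, n_head, d_k)
    k = torch.randn(b, t, h, dk)   # (batch, len_k, n_head, d_k)
    v = torch.randn(b, t, h, dv)   # (batch, len_v, n_head, d_v)

    # 'blhk,bthk->hblt': one attention-score matrix per (head, batch) pair
    attn = torch.einsum('blhk,bthk->hblt', q, k) / math.sqrt(dk)
    attn = torch.softmax(attn, dim=3)
    # 'hblt,bthv->blhv': weighted sum over key positions, back to batch-major
    out = torch.einsum('hblt,bthv->blhv', attn, v).reshape(b, l, h * dv)

    # the permute/bmm formulation of reference.py gives the same result
    q2 = q.permute(2, 0, 1, 3).reshape(-1, l, dk)
    k2 = k.permute(2, 0, 1, 3).reshape(-1, t, dk)
    v2 = v.permute(2, 0, 1, 3).reshape(-1, t, dv)
    ref = torch.softmax(torch.bmm(q2, k2.transpose(1, 2)) / math.sqrt(dk), dim=2)
    ref = torch.bmm(ref, v2).view(h, b, l, dv).permute(1, 2, 0, 3).reshape(b, l, h * dv)
    assert torch.allclose(out, ref, atol=1e-5)

The head-major 'hblt' layout is what lets the optimized version skip the contiguous() copies entirely; the contraction subsumes the permutes.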
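Patch 491's bench.py times a single forward pass with wall-clock time() and no device synchronization; since CUDA launches are asynchronous, such a measurement can return before the GPU has finished and is dominated by first-call compilation and autotuning. A more defensive harness, as a sketch (timed is a hypothetical helper, not part of the patch):

    import torch
    from time import time

    def timed(fn, reps=20):
        # hypothetical helper, not part of the patch
        fn()                      # warm-up; first call pays compilation/autotuning
        torch.cuda.synchronize()  # drain pending GPU work before starting the clock
        start = time()
        for _ in range(reps):
            fn()
        torch.cuda.synchronize()  # wait until the timed kernels actually finish
        return (time() - start) / reps

    # usage with the objects defined in bench.py
    rtime = timed(lambda: rattn(query, key, value))
    ttime = timed(lambda: tattn(query, key, value))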
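The TM/TN change in patch 491's einsum.py is easy to miss: prepending 16 unconditionally keeps the tile-size search space non-empty when a problem dimension is smaller than every candidate. A two-line illustration:

    M = 8
    TM_old = [x for x in [16, 32, 64, 128] if x <= M]     # [] -- empty search space
    TM_new = [16] + [x for x in [32, 64, 128] if x <= M]  # [16] -- always one candidate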
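Patch 494 moves kernel launch parameters out of the positional argument list: grid becomes a required keyword (a RuntimeError is raised when it is missing), tuning macros move into defines, and num_warps/bench are optional keywords. The batchnorm diff above shows the migration pattern; schematically, for the forward kernel:

    # before (through patch 493): grid closure as trailing positional, macros via **kwargs
    _batchnorm.fwd_kernel(y, mean, var, x, gamma, beta, H*W*B, eps,
                          lambda opt: [1, C], TM = 128)

    # after (patch 494): explicit keywords
    _batchnorm.fwd_kernel(y, mean, var, x, gamma, beta, H*W*B, eps,
                          grid = lambda opt: [1, C], defines = {'TM': 128})

One thing a reviewer might flag in this patch: the new _make_grid(grid, args) defines an inner closure also named grid, so the parameter is shadowed and result = grid(opt) would call the closure itself rather than the user-supplied function; renaming either the parameter or the closure avoids the recursion.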